From 424fd79612ae96e2b389b015c3283b8394201559 Mon Sep 17 00:00:00 2001 From: "kernel-team@fedoraproject.org" Date: Fri, 10 Feb 2012 14:56:13 -0500 Subject: [PATCH 001/737] scsi: sd_revalidate_disk prevent NULL ptr deref Bugzilla: 754518 Upstream-status: Fedora mustard (might be worth dropping...) (cherry picked from commit 375799a9c882e8dcf2bd663102305315eda18ddb) Signed-off-by: Munehisa Kamata Reviewed-by: Cristian Gafton Reviewed-by: Guru Anbalagane Reviewed-by: Cristian Gafton Reviewed-by: Frederick Lefebvre Reviewed-by: Eduardo Valentin Signed-off-by: Vallish Vaidyeshwara --- drivers/scsi/sd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 58f66176bcb28..f0468990cefe9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3189,15 +3189,22 @@ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); - struct scsi_device *sdp = sdkp->device; - struct request_queue *q = sdkp->disk->queue; - sector_t old_capacity = sdkp->capacity; + struct scsi_device *sdp; + struct request_queue *q; + sector_t old_capacity; unsigned char *buffer; unsigned int dev_max, rw_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); + if (WARN_ONCE((!sdkp), "Invalid scsi_disk from %p\n", disk)) + goto out; + + sdp = sdkp->device; + q = sdkp->disk->queue; + old_capacity = sdkp->capacity; + /* * If the device is offline, don't try and read capacity or any * of the other niceties. From 3d4535af0ad21f177898ed54faf76f99508fbff4 Mon Sep 17 00:00:00 2001 From: Alakesh Haloi Date: Fri, 27 Oct 2017 00:36:15 +0000 Subject: [PATCH 002/737] not-for-upstream: testmgr config changes to enable FIPS boot The Federal Information Processing Standard (FIPS) Publication 140-2 is a computer security standard developed by a U.S. Government and industry working group to validate the quality of cryptographic modules. Enabling FIPS mode involves the following steps: a. Prelinking needs to be disabled: PRELINKING=no in /etc/sysconfig/prelink b. Install the dracut-fips package: # yum install dracut-fips. Installing dracut-fips enables module signing by default and also enables scripts that do FIPS integrity verification, regardless of whether FIPS mode is on. If FIPS mode is on and a verification failure is detected, then the system will panic. c. Recreate the initramfs: # dracut -v -f d. Modify the kernel command line to include the option fips=1. For grub2 based systems, add fips=1 to the end of the CMDLINE in /etc/default/grub and then run the following command: # grub2-mkconfig -o /boot/grub2/grub.cfg e. Reboot the system. In FIPS mode, some self tests are run by the dracut-fips package, which is otherwise not the case for kernels not running in FIPS mode. The changes to the tests mentioned in this CR are only relevant for kernels running in FIPS mode. In this changeset, we enable/disable cryptographic algorithms in FIPS mode to make sure that we enable the tests that are supported and disable the tests that are not supported in our kernel. Among the tests that are not supported are the SHA3 family of tests and their hmac versions. Also gcm(aesni) is disabled as the support is currently missing in the kernel. Also, we should remember that this change is not an effort to make the kernel FIPS compliant. FIPS compliance needs to be done by a certified authority. This change is about adding support for FIPS mode.
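For illustration only, a minimal standalone C sketch of the gating behaviour described above (this is not the kernel's actual testmgr code; the names run_self_test and the sample table entries below are hypothetical): when the global FIPS flag is set, self-tests whose descriptors are not tagged fips_allowed are skipped.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the alg_test_desc entries in crypto/testmgr.c. */
struct alg_test_desc {
	const char *alg;
	bool fips_allowed;
};

static const struct alg_test_desc alg_test_descs[] = {
	{ .alg = "sha256",   .fips_allowed = true  },
	{ .alg = "sha3-256", .fips_allowed = false }, /* not supported in this kernel's FIPS mode */
	{ .alg = "zlib",     .fips_allowed = true  },
};

static bool fips_enabled = true; /* models booting with fips=1 */

/* Returns 1 if the self-test for "alg" would run, 0 if it would be skipped. */
static int run_self_test(const char *alg)
{
	size_t i;

	for (i = 0; i < sizeof(alg_test_descs) / sizeof(alg_test_descs[0]); i++) {
		if (strcmp(alg_test_descs[i].alg, alg) != 0)
			continue;
		if (fips_enabled && !alg_test_descs[i].fips_allowed)
			return 0; /* ignored in FIPS mode */
		return 1;
	}
	return 0; /* unknown algorithm: nothing to run */
}

int main(void)
{
	printf("sha3-256: %d\n", run_self_test("sha3-256")); /* 0: skipped */
	printf("sha256:   %d\n", run_self_test("sha256"));   /* 1: runs */
	return 0;
}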
Running official FIPS compliance may necessitate support for additional cryptographic algorithms, or removal of the fips_allowed flag in the tests for a few algorithms, as the need may arise. FIPS mode for a test is disabled by removing fips_allowed = 1 from the test description in testmgr.c. Adding support is more involved. The test needs to be implemented and pointed to in the structure used to describe the test. In FIPS mode, only the tests that are tagged with fips_allowed = 1 are run and the rest of the tests are ignored. So if you are not sure about an algorithm that needs to be enabled in FIPS mode, it should be disabled in testmgr.c. NU: because FIPS enablement is distro specific. Signed-off-by: Alakesh Haloi Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Reviewed-by: Cristian Gafton Reviewed-by: Frederick Lefebvre Reviewed-by: Eduardo Valentin Signed-off-by: Vallish Vaidyeshwara --- crypto/testmgr.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a64a639eddfa4..51c99630c61ae 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5003,28 +5003,24 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha3-224)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_224_tv_template) } }, { .alg = "hmac(sha3-256)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_256_tv_template) } }, { .alg = "hmac(sha3-384)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_384_tv_template) } }, { .alg = "hmac(sha3-512)", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(hmac_sha3_512_tv_template) } @@ -5238,7 +5234,6 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "rfc4106(gcm(aes))", .generic_driver = "rfc4106(gcm_base(ctr(aes-generic),ghash-generic))", .test = alg_test_aead, - .fips_allowed = 1, .suite = { .aead = { ____VECS(aes_gcm_rfc4106_tv_template), @@ -5346,28 +5341,24 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha3-224", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_224_tv_template) } }, { .alg = "sha3-256", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_256_tv_template) } }, { .alg = "sha3-384", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_384_tv_template) } }, { .alg = "sha3-512", .test = alg_test_hash, - .fips_allowed = 1, .suite = { .hash = __VECS(sha3_512_tv_template) } @@ -5536,6 +5527,10 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(xxhash64_tv_template) } + }, { + .alg = "zlib", + .test = alg_test_null, + .fips_allowed = 1, }, { .alg = "zlib-deflate", .test = alg_test_comp, From 831f2d8b9f59c6b32d27358e41d3edcf647edc19 Mon Sep 17 00:00:00 2001 From: Vallish Vaidyeshwara Date: Mon, 12 Feb 2018 22:29:56 +0000 Subject: [PATCH 003/737] drivers: introduce AMAZON_DRIVER_UPDATES This provides a central place to maintain out-of-tree drivers. Renamed from VENDOR_AMAZON because the name was no longer appropriate.
Signed-off-by: Munehisa Kamata Reviewed-by: Cristian Gafton Reviewed-by: Guru Anbalagane Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Signed-off-by: Vallish Vaidyeshwara --- drivers/Kconfig | 2 ++ drivers/Makefile | 1 + drivers/amazon/Kconfig | 15 +++++++++++++++ drivers/amazon/Makefile | 3 +++ 4 files changed, 21 insertions(+) create mode 100644 drivers/amazon/Kconfig create mode 100644 drivers/amazon/Makefile diff --git a/drivers/Kconfig b/drivers/Kconfig index dcecc9f6e33f7..fff51c18a5896 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -235,4 +235,6 @@ source "drivers/interconnect/Kconfig" source "drivers/counter/Kconfig" source "drivers/most/Kconfig" + +source "drivers/amazon/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 5762280377186..4b9dfb802c301 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -189,3 +189,4 @@ obj-$(CONFIG_GNSS) += gnss/ obj-$(CONFIG_INTERCONNECT) += interconnect/ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += amazon/ diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig new file mode 100644 index 0000000000000..7cc44c84699e8 --- /dev/null +++ b/drivers/amazon/Kconfig @@ -0,0 +1,15 @@ +# +# Amazon driver updates configuration +# + +config AMAZON_DRIVER_UPDATES + bool "Amazon Driver Updates" + default y + depends on PCI || EXPERIMENTAL + ---help--- + Amazon driver updates include out-of-tree drivers and/or modified + versions of the drivers present in the stable kernel tree. + +if AMAZON_DRIVER_UPDATES + +endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile new file mode 100644 index 0000000000000..6b4996dcbe52f --- /dev/null +++ b/drivers/amazon/Makefile @@ -0,0 +1,3 @@ +# +# Amazon Driver Updates +# From 2408edcefa4957083e7e00af146010d41d8ad506 Mon Sep 17 00:00:00 2001 From: Vallish Vaidyeshwara Date: Mon, 12 Feb 2018 22:38:52 +0000 Subject: [PATCH 004/737] drivers/amazon: add network device drivers support This is a placeholder for network device driver updates. Signed-off-by: Munehisa Kamata Reviewed-by: Cristian Gafton Reviewed-by: Guru Anbalagane Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Signed-off-by: Vallish Vaidyeshwara --- drivers/amazon/Makefile | 1 + drivers/amazon/net/Makefile | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 drivers/amazon/net/Makefile diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile index 6b4996dcbe52f..fc5f70dd7487d 100644 --- a/drivers/amazon/Makefile +++ b/drivers/amazon/Makefile @@ -1,3 +1,4 @@ # # Amazon Driver Updates # +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += net/ diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile new file mode 100644 index 0000000000000..6b4996dcbe52f --- /dev/null +++ b/drivers/amazon/net/Makefile @@ -0,0 +1,3 @@ +# +# Amazon Driver Updates +# From a27f029071e4dc73e85259fdea9e4183127eff2a Mon Sep 17 00:00:00 2001 From: Vallish Vaidyeshwara Date: Mon, 12 Feb 2018 22:51:01 +0000 Subject: [PATCH 005/737] drivers/amazon: introduce AMAZON_ENA_ETHERNET This option is for the out-of-tree ENA driver.
Signed-off-by: Munehisa Kamata Reviewed-by: Cristian Gafton Reviewed-by: Guru Anbalagane Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Signed-off-by: Vallish Vaidyeshwara --- drivers/amazon/Kconfig | 9 +++++++++ drivers/amazon/net/Makefile | 1 + 2 files changed, 10 insertions(+) diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig index 7cc44c84699e8..eb0f5450bb1d8 100644 --- a/drivers/amazon/Kconfig +++ b/drivers/amazon/Kconfig @@ -12,4 +12,13 @@ config AMAZON_DRIVER_UPDATES if AMAZON_DRIVER_UPDATES +config AMAZON_ENA_ETHERNET + tristate "Elastic Network Adapter (ENA) support" + depends on PCI_MSI && !ENA_ETHERNET + ---help--- + This driver supports Elastic Network Adapter (ENA) + + To compile this driver as a module, choose M here. + The module will be called ena. + endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile index 6b4996dcbe52f..d59ce86b1311d 100644 --- a/drivers/amazon/net/Makefile +++ b/drivers/amazon/net/Makefile @@ -1,3 +1,4 @@ # # Amazon Driver Updates # +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena/ From f8f8417a781875b261ea2538d9ce5197512d655c Mon Sep 17 00:00:00 2001 From: Vallish Vaidyeshwara Date: Tue, 27 Feb 2018 06:11:30 +0000 Subject: [PATCH 006/737] Importing Amazon ENA driver 1.5.0 into amazon-4.14.y/master. Reviewed-by: Eduardo Valentin Reviewed-by: Anchal Agarwal Signed-off-by: Vallish Vaidyeshwara --- drivers/amazon/net/ena/Makefile | 11 + drivers/amazon/net/ena/ena_admin_defs.h | 1018 ++++++ drivers/amazon/net/ena/ena_com.c | 2714 +++++++++++++++ drivers/amazon/net/ena/ena_com.h | 1051 ++++++ drivers/amazon/net/ena/ena_common_defs.h | 48 + drivers/amazon/net/ena/ena_eth_com.c | 518 +++ drivers/amazon/net/ena/ena_eth_com.h | 162 + drivers/amazon/net/ena/ena_eth_io_defs.h | 416 +++ drivers/amazon/net/ena/ena_ethtool.c | 1015 ++++++ drivers/amazon/net/ena/ena_netdev.c | 3968 ++++++++++++++++++++++ drivers/amazon/net/ena/ena_netdev.h | 484 +++ drivers/amazon/net/ena/ena_pci_id_tbl.h | 67 + drivers/amazon/net/ena/ena_regs_defs.h | 169 + drivers/amazon/net/ena/ena_sysfs.c | 268 ++ drivers/amazon/net/ena/ena_sysfs.h | 55 + drivers/amazon/net/ena/kcompat.h | 570 ++++ 16 files changed, 12534 insertions(+) create mode 100644 drivers/amazon/net/ena/Makefile create mode 100644 drivers/amazon/net/ena/ena_admin_defs.h create mode 100644 drivers/amazon/net/ena/ena_com.c create mode 100644 drivers/amazon/net/ena/ena_com.h create mode 100644 drivers/amazon/net/ena/ena_common_defs.h create mode 100644 drivers/amazon/net/ena/ena_eth_com.c create mode 100644 drivers/amazon/net/ena/ena_eth_com.h create mode 100644 drivers/amazon/net/ena/ena_eth_io_defs.h create mode 100644 drivers/amazon/net/ena/ena_ethtool.c create mode 100644 drivers/amazon/net/ena/ena_netdev.c create mode 100644 drivers/amazon/net/ena/ena_netdev.h create mode 100644 drivers/amazon/net/ena/ena_pci_id_tbl.h create mode 100644 drivers/amazon/net/ena/ena_regs_defs.h create mode 100644 drivers/amazon/net/ena/ena_sysfs.c create mode 100644 drivers/amazon/net/ena/ena_sysfs.h create mode 100644 drivers/amazon/net/ena/kcompat.h diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile new file mode 100644 index 0000000000000..0e671d0b389d4 --- /dev/null +++ b/drivers/amazon/net/ena/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Elastic Network Adapter (ENA) device drivers. +# ENA Source is: https://github.com/amzn/amzn-drivers. +# Current ENA source is based on ena_linux_1.5.0 tag. 
+# + +obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o + +ena-y := ena_netdev.o ena_com.o ena_eth_com.o ena_ethtool.o + +ena-$(CONFIG_SYSFS) += ena_sysfs.o diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h new file mode 100644 index 0000000000000..4532e574ebcdc --- /dev/null +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -0,0 +1,1018 @@ +/* + * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _ENA_ADMIN_H_ +#define _ENA_ADMIN_H_ + +enum ena_admin_aq_opcode { + ENA_ADMIN_CREATE_SQ = 1, + + ENA_ADMIN_DESTROY_SQ = 2, + + ENA_ADMIN_CREATE_CQ = 3, + + ENA_ADMIN_DESTROY_CQ = 4, + + ENA_ADMIN_GET_FEATURE = 8, + + ENA_ADMIN_SET_FEATURE = 9, + + ENA_ADMIN_GET_STATS = 11, +}; + +enum ena_admin_aq_completion_status { + ENA_ADMIN_SUCCESS = 0, + + ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + + ENA_ADMIN_BAD_OPCODE = 2, + + ENA_ADMIN_UNSUPPORTED_OPCODE = 3, + + ENA_ADMIN_MALFORMED_REQUEST = 4, + + /* Additional status is provided in ACQ entry extended_status */ + ENA_ADMIN_ILLEGAL_PARAMETER = 5, + + ENA_ADMIN_UNKNOWN_ERROR = 6, +}; + +enum ena_admin_aq_feature_id { + ENA_ADMIN_DEVICE_ATTRIBUTES = 1, + + ENA_ADMIN_MAX_QUEUES_NUM = 2, + + ENA_ADMIN_HW_HINTS = 3, + + ENA_ADMIN_RSS_HASH_FUNCTION = 10, + + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11, + + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG = 12, + + ENA_ADMIN_MTU = 14, + + ENA_ADMIN_RSS_HASH_INPUT = 18, + + ENA_ADMIN_INTERRUPT_MODERATION = 20, + + ENA_ADMIN_AENQ_CONFIG = 26, + + ENA_ADMIN_LINK_CONFIG = 27, + + ENA_ADMIN_HOST_ATTR_CONFIG = 28, + + ENA_ADMIN_FEATURES_OPCODE_NUM = 32, +}; + +enum ena_admin_placement_policy_type { + /* descriptors and headers are in host memory */ + ENA_ADMIN_PLACEMENT_POLICY_HOST = 1, + + /* descriptors and headers are in device memory (a.k.a Low Latency + * Queue) + */ + ENA_ADMIN_PLACEMENT_POLICY_DEV = 3, +}; + +enum ena_admin_link_types { + ENA_ADMIN_LINK_SPEED_1G = 0x1, + + ENA_ADMIN_LINK_SPEED_2_HALF_G = 0x2, + + ENA_ADMIN_LINK_SPEED_5G = 0x4, + + ENA_ADMIN_LINK_SPEED_10G = 0x8, + + ENA_ADMIN_LINK_SPEED_25G = 0x10, + + ENA_ADMIN_LINK_SPEED_40G = 0x20, + + ENA_ADMIN_LINK_SPEED_50G = 0x40, + + ENA_ADMIN_LINK_SPEED_100G = 0x80, + + ENA_ADMIN_LINK_SPEED_200G = 0x100, + + 
ENA_ADMIN_LINK_SPEED_400G = 0x200, +}; + +enum ena_admin_completion_policy_type { + /* completion queue entry for each sq descriptor */ + ENA_ADMIN_COMPLETION_POLICY_DESC = 0, + + /* completion queue entry upon request in sq descriptor */ + ENA_ADMIN_COMPLETION_POLICY_DESC_ON_DEMAND = 1, + + /* current queue head pointer is updated in OS memory upon sq + * descriptor request + */ + ENA_ADMIN_COMPLETION_POLICY_HEAD_ON_DEMAND = 2, + + /* current queue head pointer is updated in OS memory for each sq + * descriptor + */ + ENA_ADMIN_COMPLETION_POLICY_HEAD = 3, +}; + +/* basic stats return ena_admin_basic_stats while extanded stats return a + * buffer (string format) with additional statistics per queue and per + * device id + */ +enum ena_admin_get_stats_type { + ENA_ADMIN_GET_STATS_TYPE_BASIC = 0, + + ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, +}; + +enum ena_admin_get_stats_scope { + ENA_ADMIN_SPECIFIC_QUEUE = 0, + + ENA_ADMIN_ETH_TRAFFIC = 1, +}; + +struct ena_admin_aq_common_desc { + /* 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in ena_admin_aq_opcode */ + u8 opcode; + + /* 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* used in ena_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. + */ +struct ena_admin_ctrl_buff_info { + u32 length; + + struct ena_common_mem_addr address; +}; + +struct ena_admin_sq { + u16 sq_idx; + + /* 4:0 : reserved + * 7:5 : sq_direction - 0x1 - Tx; 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved1; +}; + +struct ena_admin_aq_entry { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct ena_admin_acq_common_desc { + /* command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* serves as a hint what AQ entries can be revoked */ + u16 sq_head_indx; +}; + +struct ena_admin_acq_entry { + struct ena_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct ena_admin_aq_create_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved0_w1 + * 7:5 : sq_direction - 0x1 - Tx, 0x2 - Rx + */ + u8 sq_identity; + + u8 reserved8_w1; + + /* 3:0 : placement_policy - Describing where the SQ + * descriptor ring and the SQ packet headers reside: + * 0x1 - descriptors and headers are in OS memory, + * 0x3 - descriptors and headers in device memory + * (a.k.a Low Latency Queue) + * 6:4 : completion_policy - Describing what policy + * to use for generation completion entry (cqe) in + * the CQ associated with this SQ: 0x0 - cqe for each + * sq descriptor, 0x1 - cqe upon request in sq + * descriptor, 0x2 - current queue head pointer is + * updated in OS memory upon sq descriptor request + * 0x3 - current queue head pointer is updated in OS + * memory for each sq descriptor + * 7 : reserved15_w1 + */ + u8 sq_caps_2; + + /* 0 : is_physically_contiguous - Described if the + * queue ring memory is allocated in physical + * contiguous pages or split. 
+ * 7:1 : reserved17_w1 + */ + u8 sq_caps_3; + + /* associated completion queue id. This CQ must be created prior to + * SQ creation + */ + u16 cq_idx; + + /* submission queue depth in entries */ + u16 sq_depth; + + /* SQ physical base address in OS memory. This field should not be + * used for Low Latency queues. Has to be page aligned. + */ + struct ena_common_mem_addr sq_ba; + + /* specifies queue head writeback location in OS memory. Valid if + * completion_policy is set to completion_policy_head_on_demand or + * completion_policy_head. Has to be cache aligned + */ + struct ena_common_mem_addr sq_head_writeback; + + u32 reserved0_w7; + + u32 reserved0_w8; +}; + +enum ena_admin_sq_direction { + ENA_ADMIN_SQ_DIRECTION_TX = 1, + + ENA_ADMIN_SQ_DIRECTION_RX = 2, +}; + +struct ena_admin_acq_create_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 sq_idx; + + u16 reserved; + + /* queue doorbell address as an offset to PCIe MMIO REG BAR */ + u32 sq_doorbell_offset; + + /* low latency queue ring base address as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_descriptors_offset; + + /* low latency queue headers' memory as an offset to PCIe MMIO + * LLQ_MEM BAR + */ + u32 llq_headers_offset; +}; + +struct ena_admin_aq_destroy_sq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_sq sq; +}; + +struct ena_admin_acq_destroy_sq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +struct ena_admin_aq_create_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + /* 4:0 : reserved5 + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode, otherwise - polling + * 7:6 : reserved6 + */ + u8 cq_caps_1; + + /* 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 7:5 : reserved7 + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* msix vector assigned to this cq */ + u32 msix_vector; + + /* cq physical base address in OS memory. CQ must be physically + * contiguous + */ + struct ena_common_mem_addr cq_ba; +}; + +struct ena_admin_acq_create_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; + + u32 numa_node_register_offset; + + u32 cq_head_db_register_offset; + + u32 cq_interrupt_unmask_register_offset; +}; + +struct ena_admin_aq_destroy_cq_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + u16 cq_idx; + + u16 reserved1; +}; + +struct ena_admin_acq_destroy_cq_resp_desc { + struct ena_admin_acq_common_desc acq_common_desc; +}; + +/* ENA AQ Get Statistics command. Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct ena_admin_aq_get_stats_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct ena_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum ena_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum ena_admin_get_stats_scope */ + u8 scope; + + u16 reserved3; + + /* queue id. used when scope is specific_queue */ + u16 queue_idx; + + /* device id, value 0xFFFF means mine. only privileged device can get + * stats of other device + */ + u16 device_id; +}; + +/* Basic Statistics Command. 
*/ +struct ena_admin_basic_stats { + u32 tx_bytes_low; + + u32 tx_bytes_high; + + u32 tx_pkts_low; + + u32 tx_pkts_high; + + u32 rx_bytes_low; + + u32 rx_bytes_high; + + u32 rx_pkts_low; + + u32 rx_pkts_high; + + u32 rx_drops_low; + + u32 rx_drops_high; +}; + +struct ena_admin_acq_get_stats_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + struct ena_admin_basic_stats basic_stats; +}; + +struct ena_admin_get_set_feature_common_desc { + /* 1:0 : select - 0x1 - current value; 0x3 - default + * value + * 7:3 : reserved3 + */ + u8 flags; + + /* as appears in ena_admin_aq_feature_id */ + u8 feature_id; + + u16 reserved16; +}; + +struct ena_admin_device_attr_feature_desc { + u32 impl_id; + + u32 device_version; + + /* bitmap of ena_admin_aq_feature_id */ + u32 supported_features; + + u32 reserved3; + + /* Indicates how many bits are used physical address access. */ + u32 phys_addr_width; + + /* Indicates how many bits are used virtual address access. */ + u32 virt_addr_width; + + /* unicast MAC address (in Network byte order) */ + u8 mac_addr[6]; + + u8 reserved7[2]; + + u32 max_mtu; +}; + +struct ena_admin_queue_feature_desc { + /* including LLQs */ + u32 max_sq_num; + + u32 max_sq_depth; + + u32 max_cq_num; + + u32 max_cq_depth; + + u32 max_llq_num; + + u32 max_llq_depth; + + u32 max_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for + * a single Tx packet + */ + u16 max_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_packet_rx_descs; +}; + +struct ena_admin_set_feature_mtu_desc { + /* exclude L2 */ + u32 mtu; +}; + +struct ena_admin_set_feature_host_attr_desc { + /* host OS info base address in OS memory. host info is 4KB of + * physically contiguous + */ + struct ena_common_mem_addr os_info_ba; + + /* host debug area base address in OS memory. 
debug area must be + * physically contiguous + */ + struct ena_common_mem_addr debug_ba; + + /* debug area size */ + u32 debug_area_size; +}; + +struct ena_admin_feature_intr_moder_desc { + /* interrupt delay granularity in usec */ + u16 intr_delay_resolution; + + u16 reserved; +}; + +struct ena_admin_get_feature_link_desc { + /* Link speed in Mb */ + u32 speed; + + /* bit field of enum ena_admin_link types */ + u32 supported; + + /* 0 : autoneg + * 1 : duplex - Full Duplex + * 31:2 : reserved2 + */ + u32 flags; +}; + +struct ena_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct ena_admin_feature_offload_desc { + /* 0 : TX_L3_csum_ipv4 + * 1 : TX_L4_ipv4_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 2 : TX_L4_ipv4_csum_full + * 3 : TX_L4_ipv6_csum_part - The checksum field + * should be initialized with pseudo header checksum + * 4 : TX_L4_ipv6_csum_full + * 5 : tso_ipv4 + * 6 : tso_ipv6 + * 7 : tso_ecn + */ + u32 tx; + + /* Receive side supported stateless offload + * 0 : RX_L3_csum_ipv4 - IPv4 checksum + * 1 : RX_L4_ipv4_csum - TCP/UDP/IPv4 checksum + * 2 : RX_L4_ipv6_csum - TCP/UDP/IPv6 checksum + * 3 : RX_hash - Hash calculation + */ + u32 rx_supported; + + u32 rx_enabled; +}; + +enum ena_admin_hash_functions { + ENA_ADMIN_TOEPLITZ = 1, + + ENA_ADMIN_CRC32 = 2, +}; + +struct ena_admin_feature_rss_flow_hash_control { + u32 keys_num; + + u32 reserved; + + u32 key[10]; +}; + +struct ena_admin_feature_rss_flow_hash_function { + /* 7:0 : funcs - bitmask of ena_admin_hash_functions */ + u32 supported_func; + + /* 7:0 : selected_func - bitmask of + * ena_admin_hash_functions + */ + u32 selected_func; + + /* initial value */ + u32 init_val; +}; + +/* RSS flow hash protocols */ +enum ena_admin_flow_hash_proto { + ENA_ADMIN_RSS_TCP4 = 0, + + ENA_ADMIN_RSS_UDP4 = 1, + + ENA_ADMIN_RSS_TCP6 = 2, + + ENA_ADMIN_RSS_UDP6 = 3, + + ENA_ADMIN_RSS_IP4 = 4, + + ENA_ADMIN_RSS_IP6 = 5, + + ENA_ADMIN_RSS_IP4_FRAG = 6, + + ENA_ADMIN_RSS_NOT_IP = 7, + + /* TCPv6 with extension header */ + ENA_ADMIN_RSS_TCP6_EX = 8, + + /* IPv6 with extension header */ + ENA_ADMIN_RSS_IP6_EX = 9, + + ENA_ADMIN_RSS_PROTO_NUM = 16, +}; + +/* RSS flow hash fields */ +enum ena_admin_flow_hash_fields { + /* Ethernet Dest Addr */ + ENA_ADMIN_RSS_L2_DA = BIT(0), + + /* Ethernet Src Addr */ + ENA_ADMIN_RSS_L2_SA = BIT(1), + + /* ipv4/6 Dest Addr */ + ENA_ADMIN_RSS_L3_DA = BIT(2), + + /* ipv4/6 Src Addr */ + ENA_ADMIN_RSS_L3_SA = BIT(3), + + /* tcp/udp Dest Port */ + ENA_ADMIN_RSS_L4_DP = BIT(4), + + /* tcp/udp Src Port */ + ENA_ADMIN_RSS_L4_SP = BIT(5), +}; + +struct ena_admin_proto_input { + /* flow hash fields (bitwise according to ena_admin_flow_hash_fields) */ + u16 fields; + + u16 reserved2; +}; + +struct ena_admin_feature_rss_hash_control { + struct ena_admin_proto_input supported_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input selected_fields[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved2[ENA_ADMIN_RSS_PROTO_NUM]; + + struct ena_admin_proto_input reserved3[ENA_ADMIN_RSS_PROTO_NUM]; +}; + +struct ena_admin_feature_rss_flow_hash_input { + /* supported hash input sorting + * 1 : L3_sort - support swap L3 addresses if DA is + * smaller than SA + * 2 : L4_sort - support swap L4 ports if DP smaller + * SP + */ + u16 supported_input_sort; + + /* enabled hash input sorting + * 1 : enable_L3_sort - enable swap L3 addresses if 
+ * DA smaller than SA + * 2 : enable_L4_sort - enable swap L4 ports if DP + * smaller than SP + */ + u16 enabled_input_sort; +}; + +enum ena_admin_os_type { + ENA_ADMIN_OS_LINUX = 1, + + ENA_ADMIN_OS_WIN = 2, + + ENA_ADMIN_OS_DPDK = 3, + + ENA_ADMIN_OS_FREEBSD = 4, + + ENA_ADMIN_OS_IPXE = 5, +}; + +struct ena_admin_host_info { + /* defined in enum ena_admin_os_type */ + u32 os_type; + + /* os distribution string format */ + u8 os_dist_str[128]; + + /* OS distribution numeric format */ + u32 os_dist; + + /* kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* 7:0 : major + * 15:8 : minor + * 23:16 : sub_minor + */ + u32 driver_version; + + /* features bitmap */ + u32 supported_network_features[4]; +}; + +struct ena_admin_rss_ind_table_entry { + u16 cq_idx; + + u16 reserved; +}; + +struct ena_admin_feature_rss_ind_table { + /* min supported table size (2^min_size) */ + u16 min_size; + + /* max supported table size (2^max_size) */ + u16 max_size; + + /* table size (2^size) */ + u16 size; + + u16 reserved; + + /* index of the inline entry. 0xFFFFFFFF means invalid */ + u32 inline_index; + + /* used for updating single entry, ignored when setting the entire + * table through the control buffer. + */ + struct ena_admin_rss_ind_table_entry inline_entry; +}; + +/* When hint value is 0, driver should use it's own predefined value */ +struct ena_admin_ena_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* Per packet tx completion timeout. value in ms */ + u16 missing_tx_completion_timeout; + + u16 missed_tx_completion_count_threshold_to_reset; + + /* value in ms */ + u16 admin_completion_tx_timeout; + + u16 netdev_wd_timeout; + + u16 max_tx_sgl_size; + + u16 max_rx_sgl_size; + + u16 reserved[8]; +}; + +struct ena_admin_get_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + u32 raw[11]; +}; + +struct ena_admin_get_feat_resp { + struct ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct ena_admin_device_attr_feature_desc dev_attr; + + struct ena_admin_queue_feature_desc max_queue; + + struct ena_admin_feature_aenq_desc aenq; + + struct ena_admin_get_feature_link_desc link; + + struct ena_admin_feature_offload_desc offload; + + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + struct ena_admin_feature_rss_ind_table ind_table; + + struct ena_admin_feature_intr_moder_desc intr_moderation; + + struct ena_admin_ena_hw_hints hw_hints; + } u; +}; + +struct ena_admin_set_feat_cmd { + struct ena_admin_aq_common_desc aq_common_descriptor; + + struct ena_admin_ctrl_buff_info control_buffer; + + struct ena_admin_get_set_feature_common_desc feat_common; + + union { + u32 raw[11]; + + /* mtu size */ + struct ena_admin_set_feature_mtu_desc mtu; + + /* host attributes */ + struct ena_admin_set_feature_host_attr_desc host_attr; + + /* AENQ configuration */ + struct ena_admin_feature_aenq_desc aenq; + + /* rss flow hash function */ + struct ena_admin_feature_rss_flow_hash_function flow_hash_func; + + /* rss flow hash input */ + struct ena_admin_feature_rss_flow_hash_input flow_hash_input; + + /* rss indirection table */ + struct ena_admin_feature_rss_ind_table ind_table; + } u; +}; + +struct ena_admin_set_feat_resp { + struct 
ena_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct ena_admin_aenq_common_desc { + u16 group; + + u16 syndrom; + + /* 0 : phase */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +/* asynchronous event notification groups */ +enum ena_admin_aenq_group { + ENA_ADMIN_LINK_CHANGE = 0, + + ENA_ADMIN_FATAL_ERROR = 1, + + ENA_ADMIN_WARNING = 2, + + ENA_ADMIN_NOTIFICATION = 3, + + ENA_ADMIN_KEEP_ALIVE = 4, + + ENA_ADMIN_AENQ_GROUPS_NUM = 5, +}; + +enum ena_admin_aenq_notification_syndrom { + ENA_ADMIN_SUSPEND = 0, + + ENA_ADMIN_RESUME = 1, + + ENA_ADMIN_UPDATE_HINTS = 2, +}; + +struct ena_admin_aenq_entry { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +struct ena_admin_aenq_link_change_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + /* 0 : link_status */ + u32 flags; +}; + +struct ena_admin_aenq_keep_alive_desc { + struct ena_admin_aenq_common_desc aenq_common_desc; + + u32 rx_drops_low; + + u32 rx_drops_high; +}; + +struct ena_admin_ena_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +/* aq_common_desc */ +#define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT 1 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT 2 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* sq */ +#define ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_SQ_SQ_DIRECTION_MASK GENMASK(7, 5) + +/* acq_common_desc */ +#define ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aq_create_sq_cmd */ +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK GENMASK(7, 5) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK GENMASK(3, 0) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT 4 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK GENMASK(6, 4) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK BIT(0) + +/* aq_create_cq_cmd */ +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) + +/* get_set_feature_common_desc */ +#define ENA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) + +/* get_feature_link_desc */ +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK BIT(0) +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_SHIFT 1 +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK BIT(1) + +/* feature_offload_desc */ +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK BIT(3) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_SHIFT 4 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_MASK BIT(4) +#define 
ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_SHIFT 5 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK BIT(5) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_SHIFT 6 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK BIT(6) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_SHIFT 7 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK BIT(7) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L3_CSUM_IPV4_MASK BIT(0) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_SHIFT 1 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK BIT(1) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_SHIFT 2 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK BIT(2) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_MASK BIT(3) + +/* feature_rss_flow_hash_function */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_FUNCS_MASK GENMASK(7, 0) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_SELECTED_FUNC_MASK GENMASK(7, 0) + +/* feature_rss_flow_hash_input */ +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK BIT(2) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_SHIFT 1 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_SHIFT 2 +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_MASK BIT(2) + +/* host_info */ +#define ENA_ADMIN_HOST_INFO_MAJOR_MASK GENMASK(7, 0) +#define ENA_ADMIN_HOST_INFO_MINOR_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_MINOR_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT 16 +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) + +/* aenq_common_desc */ +#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_link_change_desc */ +#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) + +#endif /*_ENA_ADMIN_H_ */ diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c new file mode 100644 index 0000000000000..2480863044a88 --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.c @@ -0,0 +1,2714 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ena_com.h" + +/*****************************************************************************/ +/*****************************************************************************/ + +/* Timeout in micro-sec */ +#define ADMIN_CMD_TIMEOUT_US (3000000) + +#define ENA_ASYNC_QUEUE_DEPTH 16 +#define ENA_ADMIN_QUEUE_DEPTH 32 + +#define MIN_ENA_VER (((ENA_COMMON_SPEC_VERSION_MAJOR) << \ + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT) \ + | (ENA_COMMON_SPEC_VERSION_MINOR)) + +#define ENA_CTRL_MAJOR 0 +#define ENA_CTRL_MINOR 0 +#define ENA_CTRL_SUB_MINOR 1 + +#define MIN_ENA_CTRL_VER \ + (((ENA_CTRL_MAJOR) << \ + (ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \ + ((ENA_CTRL_MINOR) << \ + (ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \ + (ENA_CTRL_SUB_MINOR)) + +#define ENA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x))) +#define ENA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32)) + +#define ENA_MMIO_READ_TIMEOUT 0xFFFFFFFF + +#define ENA_REGS_ADMIN_INTR_MASK 1 + +#define ENA_POLL_MS 5 + +/*****************************************************************************/ +/*****************************************************************************/ +/*****************************************************************************/ + +enum ena_cmd_status { + ENA_CMD_SUBMITTED, + ENA_CMD_COMPLETED, + /* Abort - canceled by the driver */ + ENA_CMD_ABORTED, +}; + +struct ena_comp_ctx { + struct completion wait_event; + struct ena_admin_acq_entry *user_cqe; + u32 comp_size; + enum ena_cmd_status status; + /* status from the device */ + u8 comp_status; + u8 cmd_opcode; + bool occupied; +}; + +struct ena_com_stats_ctx { + struct ena_admin_aq_get_stats_cmd get_cmd; + struct ena_admin_acq_get_stats_resp get_resp; +}; + +static inline int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, + struct ena_common_mem_addr *ena_addr, + dma_addr_t addr) +{ + if ((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr) { + pr_err("dma address has more bits that the device supports\n"); + return -EINVAL; + } + + ena_addr->mem_addr_low = lower_32_bits(addr); + ena_addr->mem_addr_high = (u16)upper_32_bits(addr); + + return 0; +} + +static int ena_com_admin_init_sq(struct ena_com_admin_queue *queue) +{ + struct ena_com_admin_sq *sq = &queue->sq; + u16 size = ADMIN_SQ_SIZE(queue->q_depth); + + sq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &sq->dma_addr, + GFP_KERNEL); + + if (!sq->entries) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + + sq->head = 0; + sq->tail = 0; + sq->phase = 1; + + sq->db_addr = NULL; + + return 0; +} + +static int ena_com_admin_init_cq(struct ena_com_admin_queue *queue) +{ + struct ena_com_admin_cq *cq = &queue->cq; + u16 size = ADMIN_CQ_SIZE(queue->q_depth); + + cq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &cq->dma_addr, + GFP_KERNEL); + + if (!cq->entries) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + + cq->head = 0; + cq->phase = 1; + + return 0; +} + +static int ena_com_admin_init_aenq(struct ena_com_dev *dev, + struct ena_aenq_handlers *aenq_handlers) +{ + struct ena_com_aenq *aenq = &dev->aenq; + u32 addr_low, addr_high, aenq_caps; + u16 size; + + dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; + size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); + 
aenq->entries = dma_zalloc_coherent(dev->dmadev, size, &aenq->dma_addr, + GFP_KERNEL); + + if (!aenq->entries) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + + aenq->head = aenq->q_depth; + aenq->phase = 1; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); + + writel(addr_low, dev->reg_bar + ENA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, dev->reg_bar + ENA_REGS_AENQ_BASE_HI_OFF); + + aenq_caps = 0; + aenq_caps |= dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; + aenq_caps |= (sizeof(struct ena_admin_aenq_entry) + << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; + writel(aenq_caps, dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); + + if (unlikely(!aenq_handlers)) { + pr_err("aenq handlers pointer is NULL\n"); + return -EINVAL; + } + + aenq->aenq_handlers = aenq_handlers; + + return 0; +} + +static inline void comp_ctxt_release(struct ena_com_admin_queue *queue, + struct ena_comp_ctx *comp_ctx) +{ + comp_ctx->occupied = false; + atomic_dec(&queue->outstanding_cmds); +} + +static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue, + u16 command_id, bool capture) +{ + if (unlikely(command_id >= queue->q_depth)) { + pr_err("command id is larger than the queue size. cmd_id: %u queue size %d\n", + command_id, queue->q_depth); + return NULL; + } + + if (unlikely(queue->comp_ctx[command_id].occupied && capture)) { + pr_err("Completion context is occupied\n"); + return NULL; + } + + if (capture) { + atomic_inc(&queue->outstanding_cmds); + queue->comp_ctx[command_id].occupied = true; + } + + return &queue->comp_ctx[command_id]; +} + +static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct ena_comp_ctx *comp_ctx; + u16 tail_masked, cmd_id; + u16 queue_size_mask; + u16 cnt; + + queue_size_mask = admin_queue->q_depth - 1; + + tail_masked = admin_queue->sq.tail & queue_size_mask; + + /* In case of queue FULL */ + cnt = atomic_read(&admin_queue->outstanding_cmds); + if (cnt >= admin_queue->q_depth) { + pr_debug("admin queue is full.\n"); + admin_queue->stats.out_of_space++; + return ERR_PTR(-ENOSPC); + } + + cmd_id = admin_queue->curr_cmd_id; + + cmd->aq_common_descriptor.flags |= admin_queue->sq.phase & + ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK; + + cmd->aq_common_descriptor.command_id |= cmd_id & + ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, true); + if (unlikely(!comp_ctx)) + return ERR_PTR(-EINVAL); + + comp_ctx->status = ENA_CMD_SUBMITTED; + comp_ctx->comp_size = (u32)comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + memcpy(&admin_queue->sq.entries[tail_masked], cmd, cmd_size_in_bytes); + + admin_queue->curr_cmd_id = (admin_queue->curr_cmd_id + 1) & + queue_size_mask; + + admin_queue->sq.tail++; + admin_queue->stats.submitted_cmd++; + + if (unlikely((admin_queue->sq.tail & queue_size_mask) == 0)) + admin_queue->sq.phase = !admin_queue->sq.phase; + + writel(admin_queue->sq.tail, admin_queue->sq.db_addr); + + return comp_ctx; +} + +static inline int ena_com_init_comp_ctxt(struct ena_com_admin_queue *queue) +{ + size_t size = queue->q_depth * sizeof(struct ena_comp_ctx); + struct ena_comp_ctx *comp_ctx; + u16 i; + + queue->comp_ctx = 
devm_kzalloc(queue->q_dmadev, size, GFP_KERNEL); + if (unlikely(!queue->comp_ctx)) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + + for (i = 0; i < queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(queue, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + } + + return 0; +} + +static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct ena_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + unsigned long flags; + struct ena_comp_ctx *comp_ctx; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + if (unlikely(!admin_queue->running_state)) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + return ERR_PTR(-ENODEV); + } + comp_ctx = __ena_com_submit_admin_cmd(admin_queue, cmd, + cmd_size_in_bytes, + comp, + comp_size_in_bytes); + if (unlikely(IS_ERR(comp_ctx))) + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + return comp_ctx; +} + +static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_sq *io_sq) +{ + size_t size; + int dev_node = 0; + + memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + + io_sq->desc_entry_size = + (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? + sizeof(struct ena_eth_io_tx_desc) : + sizeof(struct ena_eth_io_rx_desc); + + size = io_sq->desc_entry_size * io_sq->q_depth; + io_sq->bus = ena_dev->bus; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->desc_addr.virt_addr) { + io_sq->desc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_sq->desc_addr.phys_addr, + GFP_KERNEL); + } + } else { + dev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_sq->desc_addr.virt_addr = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, dev_node); + if (!io_sq->desc_addr.virt_addr) { + io_sq->desc_addr.virt_addr = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + } + } + + if (!io_sq->desc_addr.virt_addr) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + + io_sq->tail = 0; + io_sq->next_to_comp = 0; + io_sq->phase = 1; + + return 0; +} + +static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx, + struct ena_com_io_cq *io_cq) +{ + size_t size; + int prev_node = 0; + + memset(&io_cq->cdesc_addr, 0x0, sizeof(io_cq->cdesc_addr)); + + /* Use the basic completion descriptor for Rx */ + io_cq->cdesc_entry_size_in_bytes = + (io_cq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
+ sizeof(struct ena_eth_io_tx_cdesc) : + sizeof(struct ena_eth_io_rx_cdesc_base); + + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + io_cq->bus = ena_dev->bus; + + prev_node = dev_to_node(ena_dev->dmadev); + set_dev_node(ena_dev->dmadev, ctx->numa_node); + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); + set_dev_node(ena_dev->dmadev, prev_node); + if (!io_cq->cdesc_addr.virt_addr) { + io_cq->cdesc_addr.virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, size, + &io_cq->cdesc_addr.phys_addr, + GFP_KERNEL); + } + + if (!io_cq->cdesc_addr.virt_addr) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + + io_cq->phase = 1; + io_cq->head = 0; + + return 0; +} + +static void ena_com_handle_single_admin_completion(struct ena_com_admin_queue *admin_queue, + struct ena_admin_acq_entry *cqe) +{ + struct ena_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = cqe->acq_common_descriptor.command & + ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; + + comp_ctx = get_comp_ctxt(admin_queue, cmd_id, false); + if (unlikely(!comp_ctx)) { + pr_err("comp_ctx is NULL. Changing the admin queue running state\n"); + admin_queue->running_state = false; + return; + } + + comp_ctx->status = ENA_CMD_COMPLETED; + comp_ctx->comp_status = cqe->acq_common_descriptor.status; + + if (comp_ctx->user_cqe) + memcpy(comp_ctx->user_cqe, (void *)cqe, comp_ctx->comp_size); + + if (!admin_queue->polling) + complete(&comp_ctx->wait_event); +} + +static void ena_com_handle_admin_completion(struct ena_com_admin_queue *admin_queue) +{ + struct ena_admin_acq_entry *cqe = NULL; + u16 comp_num = 0; + u16 head_masked; + u8 phase; + + head_masked = admin_queue->cq.head & (admin_queue->q_depth - 1); + phase = admin_queue->cq.phase; + + cqe = &admin_queue->cq.entries[head_masked]; + + /* Go over all the completions */ + while ((cqe->acq_common_descriptor.flags & + ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* Do not read the rest of the completion entry before the + * phase bit was validated + */ + rmb(); + ena_com_handle_single_admin_completion(admin_queue, cqe); + + head_masked++; + comp_num++; + if (unlikely(head_masked == admin_queue->q_depth)) { + head_masked = 0; + phase = !phase; + } + + cqe = &admin_queue->cq.entries[head_masked]; + } + + admin_queue->cq.head += comp_num; + admin_queue->cq.phase = phase; + admin_queue->sq.head += comp_num; + admin_queue->stats.completed_cmd += comp_num; +} + +static int ena_com_comp_status_to_errno(u8 comp_status) +{ + if (unlikely(comp_status != 0)) + pr_err("admin command failed[%u]\n", comp_status); + + if (unlikely(comp_status > ENA_ADMIN_UNKNOWN_ERROR)) + return -EINVAL; + + switch (comp_status) { + case ENA_ADMIN_SUCCESS: + return 0; + case ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case ENA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case ENA_ADMIN_BAD_OPCODE: + case ENA_ADMIN_MALFORMED_REQUEST: + case ENA_ADMIN_ILLEGAL_PARAMETER: + case ENA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + } + + return 0; +} + +static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + unsigned long flags, timeout; + int ret; + + timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); + + while (1) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status != ENA_CMD_SUBMITTED) + break; + + if 
(time_is_before_jiffies(timeout)) { + pr_err("Wait for completion (polling) timeout\n"); + /* ENA didn't have any completion */ + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.no_completion++; + admin_queue->running_state = false; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + ret = -ETIME; + goto err; + } + + msleep(ENA_POLL_MS); + } + + if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { + pr_err("Command was aborted\n"); + spin_lock_irqsave(&admin_queue->q_lock, flags); + admin_queue->stats.aborted_cmd++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + ret = -ENODEV; + goto err; + } + + WARN(comp_ctx->status != ENA_CMD_COMPLETED, "Invalid comp status %d\n", + comp_ctx->status); + + ret = ena_com_comp_status_to_errno(comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + unsigned long flags; + int ret; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies( + admin_queue->completion_timeout)); + + /* In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (unlikely(comp_ctx->status == ENA_CMD_SUBMITTED)) { + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_com_handle_admin_completion(admin_queue); + admin_queue->stats.no_completion++; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + + if (comp_ctx->status == ENA_CMD_COMPLETED) + pr_err("The ena device have completion but the driver didn't receive any MSI-X interrupt (cmd %d)\n", + comp_ctx->cmd_opcode); + else + pr_err("The ena device doesn't send any completion for the admin cmd %d status %d\n", + comp_ctx->cmd_opcode, comp_ctx->status); + + admin_queue->running_state = false; + ret = -ETIME; + goto err; + } + + ret = ena_com_comp_status_to_errno(comp_ctx->comp_status); +err: + comp_ctxt_release(admin_queue, comp_ctx); + return ret; +} + +/* This method read the hardware device register through posting writes + * and waiting for response + * On timeout the function will return ENA_MMIO_READ_TIMEOUT + */ +static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp = + mmio_read->read_resp; + u32 mmio_read_reg, ret, i; + unsigned long flags; + u32 timeout = mmio_read->reg_read_to; + + might_sleep(); + + if (timeout == 0) + timeout = ENA_REG_READ_TIMEOUT; + + /* If readless is disabled, perform regular read */ + if (!mmio_read->readless_supported) + return readl(ena_dev->reg_bar + offset); + + spin_lock_irqsave(&mmio_read->lock, flags); + mmio_read->seq_num++; + + read_resp->req_id = mmio_read->seq_num + 0xDEAD; + mmio_read_reg = (offset << ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) & + ENA_REGS_MMIO_REG_READ_REG_OFF_MASK; + mmio_read_reg |= mmio_read->seq_num & + ENA_REGS_MMIO_REG_READ_REQ_ID_MASK; + + /* make sure read_resp->req_id get updated before the hw can write + * there + */ + wmb(); + + writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF); + + for (i = 0; i < timeout; i++) { + if (read_resp->req_id == mmio_read->seq_num) + break; + + udelay(1); + } + + if (unlikely(i == timeout)) { + pr_err("reading reg failed for timeout. 
expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); + ret = ENA_MMIO_READ_TIMEOUT; + goto err; + } + + if (read_resp->reg_off != offset) { + pr_err("Read failure: wrong offset provided"); + ret = ENA_MMIO_READ_TIMEOUT; + } else { + ret = read_resp->reg_val; + } +err: + spin_unlock_irqrestore(&mmio_read->lock, flags); + + return ret; +} + +/* There are two types to wait for completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expired). + * It is expected that the IRQ called ena_com_handle_admin_completion + * to mark the completions. + */ +static int ena_com_wait_and_process_admin_cq(struct ena_comp_ctx *comp_ctx, + struct ena_com_admin_queue *admin_queue) +{ + if (admin_queue->polling) + return ena_com_wait_and_process_admin_cq_polling(comp_ctx, + admin_queue); + + return ena_com_wait_and_process_admin_cq_interrupts(comp_ctx, + admin_queue); +} + +static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_sq_cmd destroy_cmd; + struct ena_admin_acq_destroy_sq_resp_desc destroy_resp; + u8 direction; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + destroy_cmd.sq.sq_identity |= (direction << + ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_SQ_SQ_DIRECTION_MASK; + + destroy_cmd.sq.sq_idx = io_sq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_SQ; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + pr_err("failed to destroy io sq error: %d\n", ret); + + return ret; +} + +static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, + struct ena_com_io_cq *io_cq) +{ + size_t size; + + if (io_cq->cdesc_addr.virt_addr) { + size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; + + dma_free_coherent(ena_dev->dmadev, size, + io_cq->cdesc_addr.virt_addr, + io_cq->cdesc_addr.phys_addr); + + io_cq->cdesc_addr.virt_addr = NULL; + } + + if (io_sq->desc_addr.virt_addr) { + size = io_sq->desc_entry_size * io_sq->q_depth; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + dma_free_coherent(ena_dev->dmadev, size, + io_sq->desc_addr.virt_addr, + io_sq->desc_addr.phys_addr); + else + devm_kfree(ena_dev->dmadev, io_sq->desc_addr.virt_addr); + + io_sq->desc_addr.virt_addr = NULL; + } +} + +static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, + u16 exp_state) +{ + u32 val, i; + + /* Convert timeout from resolution of 100ms to ENA_POLL_MS */ + timeout = (timeout * 100) / ENA_POLL_MS; + + for (i = 0; i < timeout; i++) { + val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { + pr_err("Reg read timeout occurred\n"); + return -ETIME; + } + + if ((val & ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) == + exp_state) + return 0; + + msleep(ENA_POLL_MS); + } + + return -ETIME; +} + +static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev, + enum ena_admin_aq_feature_id feature_id) +{ + u32 
feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if ((feature_id != ENA_ADMIN_DEVICE_ATTRIBUTES) && + !(ena_dev->supported_features & feature_mask)) + return false; + + return true; +} + +static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_get_feat_cmd get_cmd; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) { + pr_debug("Feature %d isn't supported\n", feature_id); + return -EOPNOTSUPP; + } + + memset(&get_cmd, 0x0, sizeof(get_cmd)); + admin_queue = &ena_dev->admin_queue; + + get_cmd.aq_common_descriptor.opcode = ENA_ADMIN_GET_FEATURE; + + if (control_buff_size) + get_cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + else + get_cmd.aq_common_descriptor.flags = 0; + + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd.control_buffer.address, + control_buf_dma_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + + get_cmd.control_buffer.length = control_buff_size; + + get_cmd.feat_common.feature_id = feature_id; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct ena_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + pr_err("Failed to submit get_feature command %d error: %d\n", + feature_id, ret); + + return ret; +} + +static int ena_com_get_feature(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *get_resp, + enum ena_admin_aq_feature_id feature_id) +{ + return ena_com_get_feature_ex(ena_dev, + get_resp, + feature_id, + 0, + 0); +} + +static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + rss->hash_key = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + &rss->hash_key_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_key)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_key_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_key) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + rss->hash_key, rss->hash_key_dma_addr); + rss->hash_key = NULL; +} + +static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + rss->hash_ctrl = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + &rss->hash_ctrl_dma_addr, GFP_KERNEL); + + if (unlikely(!rss->hash_ctrl)) + return -ENOMEM; + + return 0; +} + +static void ena_com_hash_ctrl_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (rss->hash_ctrl) + dma_free_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + rss->hash_ctrl, rss->hash_ctrl_dma_addr); + rss->hash_ctrl = NULL; +} + +static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, + u16 log_size) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + size_t tbl_size; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG); + if (unlikely(ret)) + return ret; + + if ((get_resp.u.ind_table.min_size > log_size) || + (get_resp.u.ind_table.max_size < log_size)) { + pr_err("indirect table size doesn't fit. 
requested size: %d while min is:%d and max %d\n", + 1 << log_size, 1 << get_resp.u.ind_table.min_size, + 1 << get_resp.u.ind_table.max_size); + return -EINVAL; + } + + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rss->rss_ind_tbl = + dma_zalloc_coherent(ena_dev->dmadev, tbl_size, + &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); + if (unlikely(!rss->rss_ind_tbl)) + goto mem_err1; + + tbl_size = (1ULL << log_size) * sizeof(u16); + rss->host_rss_ind_tbl = + devm_kzalloc(ena_dev->dmadev, tbl_size, GFP_KERNEL); + if (unlikely(!rss->host_rss_ind_tbl)) + goto mem_err2; + + rss->tbl_log_size = log_size; + + return 0; + +mem_err2: + tbl_size = (1ULL << log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; +mem_err1: + rss->tbl_log_size = 0; + return -ENOMEM; +} + +static void ena_com_indirect_table_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + size_t tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + if (rss->rss_ind_tbl) + dma_free_coherent(ena_dev->dmadev, tbl_size, rss->rss_ind_tbl, + rss->rss_ind_tbl_dma_addr); + rss->rss_ind_tbl = NULL; + + if (rss->host_rss_ind_tbl) + devm_kfree(ena_dev->dmadev, rss->host_rss_ind_tbl); + rss->host_rss_ind_tbl = NULL; +} + +static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, + struct ena_com_io_sq *io_sq, u16 cq_idx) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_sq_cmd create_cmd; + struct ena_admin_acq_create_sq_resp_desc cmd_completion; + u8 direction; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_SQ; + + if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + direction = ENA_ADMIN_SQ_DIRECTION_TX; + else + direction = ENA_ADMIN_SQ_DIRECTION_RX; + + create_cmd.sq_identity |= (direction << + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK; + + create_cmd.sq_caps_2 |= io_sq->mem_queue_type & + ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK; + + create_cmd.sq_caps_2 |= (ENA_ADMIN_COMPLETION_POLICY_DESC << + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT) & + ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK; + + create_cmd.sq_caps_3 |= + ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK; + + create_cmd.cq_idx = cq_idx; + create_cmd.sq_depth = io_sq->q_depth; + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.sq_ba, + io_sq->desc_addr.phys_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + pr_err("Failed to create IO SQ. 
error: %d\n", ret); + return ret; + } + + io_sq->idx = cmd_completion.sq_idx; + + io_sq->db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + (uintptr_t)cmd_completion.sq_doorbell_offset); + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + io_sq->header_addr = (u8 __iomem *)((uintptr_t)ena_dev->mem_bar + + cmd_completion.llq_headers_offset); + + io_sq->desc_addr.pbuf_dev_addr = + (u8 __iomem *)((uintptr_t)ena_dev->mem_bar + + cmd_completion.llq_descriptors_offset); + } + + pr_debug("created sq[%u], depth[%u]\n", io_sq->idx, io_sq->q_depth); + + return ret; +} + +static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_com_io_sq *io_sq; + u16 qid; + int i; + + for (i = 0; i < 1 << rss->tbl_log_size; i++) { + qid = rss->host_rss_ind_tbl[i]; + if (qid >= ENA_TOTAL_NUM_QUEUES) + return -EINVAL; + + io_sq = &ena_dev->io_sq_queues[qid]; + + if (io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX) + return -EINVAL; + + rss->rss_ind_tbl[i].cq_idx = io_sq->idx; + } + + return 0; +} + +static int ena_com_ind_tbl_convert_from_device(struct ena_com_dev *ena_dev) +{ + u16 dev_idx_to_host_tbl[ENA_TOTAL_NUM_QUEUES] = { (u16)-1 }; + struct ena_rss *rss = &ena_dev->rss; + u8 idx; + u16 i; + + for (i = 0; i < ENA_TOTAL_NUM_QUEUES; i++) + dev_idx_to_host_tbl[ena_dev->io_sq_queues[i].idx] = i; + + for (i = 0; i < 1 << rss->tbl_log_size; i++) { + if (rss->rss_ind_tbl[i].cq_idx > ENA_TOTAL_NUM_QUEUES) + return -EINVAL; + idx = (u8)rss->rss_ind_tbl[i].cq_idx; + + if (dev_idx_to_host_tbl[idx] > ENA_TOTAL_NUM_QUEUES) + return -EINVAL; + + rss->host_rss_ind_tbl[i] = dev_idx_to_host_tbl[idx]; + } + + return 0; +} + +static int ena_com_init_interrupt_moderation_table(struct ena_com_dev *ena_dev) +{ + size_t size; + + size = sizeof(struct ena_intr_moder_entry) * ENA_INTR_MAX_NUM_OF_LEVELS; + + ena_dev->intr_moder_tbl = + devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + if (!ena_dev->intr_moder_tbl) + return -ENOMEM; + + ena_com_config_default_interrupt_moderation_table(ena_dev); + + return 0; +} + +static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, + u16 intr_delay_resolution) +{ + struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; + unsigned int i; + + if (!intr_delay_resolution) { + pr_err("Illegal intr_delay_resolution provided. 
Going to use default 1 usec resolution\n"); + intr_delay_resolution = 1; + } + ena_dev->intr_delay_resolution = intr_delay_resolution; + + /* update Rx */ + for (i = 0; i < ENA_INTR_MAX_NUM_OF_LEVELS; i++) + intr_moder_tbl[i].intr_moder_interval /= intr_delay_resolution; + + /* update Tx */ + ena_dev->intr_moder_tx_interval /= intr_delay_resolution; +} + +/*****************************************************************************/ +/******************************* API ******************************/ +/*****************************************************************************/ + +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *comp, + size_t comp_size) +{ + struct ena_comp_ctx *comp_ctx; + int ret; + + comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, + comp, comp_size); + if (unlikely(IS_ERR(comp_ctx))) { + if (comp_ctx == ERR_PTR(-ENODEV)) + pr_debug("Failed to submit command [%ld]\n", + PTR_ERR(comp_ctx)); + else + pr_err("Failed to submit command [%ld]\n", + PTR_ERR(comp_ctx)); + + return PTR_ERR(comp_ctx); + } + + ret = ena_com_wait_and_process_admin_cq(comp_ctx, admin_queue); + if (unlikely(ret)) { + if (admin_queue->running_state) + pr_err("Failed to process command. ret = %d\n", ret); + else + pr_debug("Failed to process command. ret = %d\n", ret); + } + return ret; +} + +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_create_cq_cmd create_cmd; + struct ena_admin_acq_create_cq_resp_desc cmd_completion; + int ret; + + memset(&create_cmd, 0x0, sizeof(create_cmd)); + + create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_CQ; + + create_cmd.cq_caps_2 |= (io_cq->cdesc_entry_size_in_bytes / 4) & + ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK; + create_cmd.cq_caps_1 |= + ENA_ADMIN_AQ_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK; + + create_cmd.msix_vector = io_cq->msix_vector; + create_cmd.cq_depth = io_cq->q_depth; + + ret = ena_com_mem_addr_set(ena_dev, + &create_cmd.cq_ba, + io_cq->cdesc_addr.phys_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct ena_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (unlikely(ret)) { + pr_err("Failed to create IO CQ. 
error: %d\n", ret); + return ret; + } + + io_cq->idx = cmd_completion.cq_idx; + + io_cq->unmask_reg = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.cq_interrupt_unmask_register_offset); + + if (cmd_completion.cq_head_db_register_offset) + io_cq->cq_head_db_reg = + (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.cq_head_db_register_offset); + + if (cmd_completion.numa_node_register_offset) + io_cq->numa_node_cfg_reg = + (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + cmd_completion.numa_node_register_offset); + + pr_debug("created cq[%u], depth[%u]\n", io_cq->idx, io_cq->q_depth); + + return ret; +} + +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq) +{ + if (qid >= ENA_TOTAL_NUM_QUEUES) { + pr_err("Invalid queue number %d but the max is %d\n", qid, + ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + *io_sq = &ena_dev->io_sq_queues[qid]; + *io_cq = &ena_dev->io_cq_queues[qid]; + + return 0; +} + +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_comp_ctx *comp_ctx; + u16 i; + + if (!admin_queue->comp_ctx) + return; + + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); + if (unlikely(!comp_ctx)) + break; + + comp_ctx->status = ENA_CMD_ABORTED; + + complete(&comp_ctx->wait_event); + } +} + +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + while (atomic_read(&admin_queue->outstanding_cmds) != 0) { + spin_unlock_irqrestore(&admin_queue->q_lock, flags); + msleep(ENA_POLL_MS); + spin_lock_irqsave(&admin_queue->q_lock, flags); + } + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_admin_aq_destroy_cq_cmd destroy_cmd; + struct ena_admin_acq_destroy_cq_resp_desc destroy_resp; + int ret; + + memset(&destroy_cmd, 0x0, sizeof(destroy_cmd)); + + destroy_cmd.cq_idx = io_cq->idx; + destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_CQ; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct ena_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (unlikely(ret && (ret != -ENODEV))) + pr_err("Failed to destroy IO CQ. 
error: %d\n", ret); + + return ret; +} + +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev) +{ + return ena_dev->admin_queue.running_state; +} + +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + unsigned long flags; + + spin_lock_irqsave(&admin_queue->q_lock, flags); + ena_dev->admin_queue.running_state = state; + spin_unlock_irqrestore(&admin_queue->q_lock, flags); +} + +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev) +{ + u16 depth = ena_dev->aenq.q_depth; + + WARN(ena_dev->aenq.head != depth, "Invalid AENQ state\n"); + + /* Init head_db to mark that all entries in the queue + * are initially available + */ + writel(depth, ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +} + +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG); + if (ret) { + pr_info("Can't get aenq configuration\n"); + return ret; + } + + if ((get_resp.u.aenq.supported_groups & groups_flag) != groups_flag) { + pr_warn("Trying to set unsupported aenq events. supported flag: %x asked flag: %x\n", + get_resp.u.aenq.supported_groups, groups_flag); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_AENQ_CONFIG; + cmd.u.aenq.enabled_groups = groups_flag; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + pr_err("Failed to config AENQ ret: %d\n", ret); + + return ret; +} + +int ena_com_get_dma_width(struct ena_com_dev *ena_dev) +{ + u32 caps = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + int width; + + if (unlikely(caps == ENA_MMIO_READ_TIMEOUT)) { + pr_err("Reg read timeout occurred\n"); + return -ETIME; + } + + width = (caps & ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >> + ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT; + + pr_debug("ENA dma width: %d\n", width); + + if ((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS) { + pr_err("DMA width illegal value: %d\n", width); + return -EINVAL; + } + + ena_dev->dma_addr_bits = width; + + return width; +} + +int ena_com_validate_version(struct ena_com_dev *ena_dev) +{ + u32 ver; + u32 ctrl_ver; + u32 ctrl_ver_masked; + + /* Make sure the ENA version and the controller version are at least + * as the driver expects + */ + ver = ena_com_reg_bar_read32(ena_dev, ENA_REGS_VERSION_OFF); + ctrl_ver = ena_com_reg_bar_read32(ena_dev, + ENA_REGS_CONTROLLER_VERSION_OFF); + + if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || + (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { + pr_err("Reg read timeout occurred\n"); + return -ETIME; + } + + pr_info("ena device version: %d.%d\n", + (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, + ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); + + if (ver < MIN_ENA_VER) { + pr_err("ENA version is lower than the minimal version the driver supports\n"); + return -1; + } + + pr_info("ena controller version: %d.%d.%d implementation version %d\n", + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> + 
ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> + ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); + + ctrl_ver_masked = + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) | + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK); + + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < MIN_ENA_CTRL_VER) { + pr_err("ENA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -1; + } + + return 0; +} + +void ena_com_admin_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_com_admin_cq *cq = &admin_queue->cq; + struct ena_com_admin_sq *sq = &admin_queue->sq; + struct ena_com_aenq *aenq = &ena_dev->aenq; + u16 size; + + if (admin_queue->comp_ctx) + devm_kfree(ena_dev->dmadev, admin_queue->comp_ctx); + admin_queue->comp_ctx = NULL; + size = ADMIN_SQ_SIZE(admin_queue->q_depth); + if (sq->entries) + dma_free_coherent(ena_dev->dmadev, size, sq->entries, + sq->dma_addr); + sq->entries = NULL; + + size = ADMIN_CQ_SIZE(admin_queue->q_depth); + if (cq->entries) + dma_free_coherent(ena_dev->dmadev, size, cq->entries, + cq->dma_addr); + cq->entries = NULL; + + size = ADMIN_AENQ_SIZE(aenq->q_depth); + if (ena_dev->aenq.entries) + dma_free_coherent(ena_dev->dmadev, size, aenq->entries, + aenq->dma_addr); + aenq->entries = NULL; +} + +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + mask_value = ENA_REGS_ADMIN_INTR_MASK; + + writel(mask_value, ena_dev->reg_bar + ENA_REGS_INTR_MASK_OFF); + ena_dev->admin_queue.polling = polling; +} + +int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = + dma_zalloc_coherent(ena_dev->dmadev, + sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (unlikely(!mmio_read->read_resp)) + return -ENOMEM; + + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + mmio_read->read_resp->req_id = 0x0; + mmio_read->seq_num = 0x0; + mmio_read->readless_supported = true; + + return 0; +} + +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, bool readless_supported) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + mmio_read->readless_supported = readless_supported; +} + +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + + writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + writel(0x0, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); + + dma_free_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), + mmio_read->read_resp, mmio_read->read_resp_dma_addr); + + mmio_read->read_resp = NULL; +} + +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev) +{ + struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; + u32 addr_low, addr_high; + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(mmio_read->read_resp_dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(mmio_read->read_resp_dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_LO_OFF); + 
writel(addr_high, ena_dev->reg_bar + ENA_REGS_MMIO_RESP_HI_OFF); +} + +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers, + bool init_spinlock) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + u32 aq_caps, acq_caps, dev_sts, addr_low, addr_high; + int ret; + + dev_sts = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + + if (unlikely(dev_sts == ENA_MMIO_READ_TIMEOUT)) { + pr_err("Reg read timeout occurred\n"); + return -ETIME; + } + + if (!(dev_sts & ENA_REGS_DEV_STS_READY_MASK)) { + pr_err("Device isn't ready, abort com init\n"); + return -ENODEV; + } + + admin_queue->q_depth = ENA_ADMIN_QUEUE_DEPTH; + + admin_queue->bus = ena_dev->bus; + admin_queue->q_dmadev = ena_dev->dmadev; + admin_queue->polling = false; + admin_queue->curr_cmd_id = 0; + + atomic_set(&admin_queue->outstanding_cmds, 0); + + if (init_spinlock) + spin_lock_init(&admin_queue->q_lock); + + ret = ena_com_init_comp_ctxt(admin_queue); + if (ret) + goto error; + + ret = ena_com_admin_init_sq(admin_queue); + if (ret) + goto error; + + ret = ena_com_admin_init_cq(admin_queue); + if (ret) + goto error; + + admin_queue->sq.db_addr = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + + ENA_REGS_AQ_DB_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->sq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->sq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AQ_BASE_HI_OFF); + + addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(admin_queue->cq.dma_addr); + addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(admin_queue->cq.dma_addr); + + writel(addr_low, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_ACQ_BASE_HI_OFF); + + aq_caps = 0; + aq_caps |= admin_queue->q_depth & ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK; + aq_caps |= (sizeof(struct ena_admin_aq_entry) << + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK; + + acq_caps = 0; + acq_caps |= admin_queue->q_depth & ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK; + acq_caps |= (sizeof(struct ena_admin_acq_entry) << + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) & + ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK; + + writel(aq_caps, ena_dev->reg_bar + ENA_REGS_AQ_CAPS_OFF); + writel(acq_caps, ena_dev->reg_bar + ENA_REGS_ACQ_CAPS_OFF); + ret = ena_com_admin_init_aenq(ena_dev, aenq_handlers); + if (ret) + goto error; + + admin_queue->running_state = true; + + return 0; +error: + ena_com_admin_destroy(ena_dev); + + return ret; +} + +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + int ret; + + if (ctx->qid >= ENA_TOTAL_NUM_QUEUES) { + pr_err("Qid (%d) is bigger than max num of queues (%d)\n", + ctx->qid, ENA_TOTAL_NUM_QUEUES); + return -EINVAL; + } + + io_sq = &ena_dev->io_sq_queues[ctx->qid]; + io_cq = &ena_dev->io_cq_queues[ctx->qid]; + + memset(io_sq, 0x0, sizeof(*io_sq)); + memset(io_cq, 0x0, sizeof(*io_cq)); + + /* Init CQ */ + io_cq->q_depth = ctx->queue_size; + io_cq->direction = ctx->direction; + io_cq->qid = ctx->qid; + + io_cq->msix_vector = ctx->msix_vector; + + io_sq->q_depth = ctx->queue_size; + io_sq->direction = ctx->direction; + io_sq->qid = ctx->qid; + + io_sq->mem_queue_type = ctx->mem_queue_type; + + if (ctx->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) + /* header length is limited to 8 bits */ + io_sq->tx_max_header_size = + min_t(u32, ena_dev->tx_max_header_size, SZ_256); + + 
ret = ena_com_init_io_sq(ena_dev, ctx, io_sq); + if (ret) + goto error; + ret = ena_com_init_io_cq(ena_dev, ctx, io_cq); + if (ret) + goto error; + + ret = ena_com_create_io_cq(ena_dev, io_cq); + if (ret) + goto error; + + ret = ena_com_create_io_sq(ena_dev, io_sq, io_cq->idx); + if (ret) + goto destroy_io_cq; + + return 0; + +destroy_io_cq: + ena_com_destroy_io_cq(ena_dev, io_cq); +error: + ena_com_io_queue_free(ena_dev, io_sq, io_cq); + return ret; +} + +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) +{ + struct ena_com_io_sq *io_sq; + struct ena_com_io_cq *io_cq; + + if (qid >= ENA_TOTAL_NUM_QUEUES) { + pr_err("Qid (%d) is bigger than max num of queues (%d)\n", qid, + ENA_TOTAL_NUM_QUEUES); + return; + } + + io_sq = &ena_dev->io_sq_queues[qid]; + io_cq = &ena_dev->io_cq_queues[qid]; + + ena_com_destroy_io_sq(ena_dev, io_sq); + ena_com_destroy_io_cq(ena_dev, io_cq); + + ena_com_io_queue_free(ena_dev, io_sq, io_cq); +} + +int ena_com_get_link_params(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp) +{ + return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG); +} + +int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_DEVICE_ATTRIBUTES); + if (rc) + return rc; + + memcpy(&get_feat_ctx->dev_attr, &get_resp.u.dev_attr, + sizeof(get_resp.u.dev_attr)); + ena_dev->supported_features = get_resp.u.dev_attr.supported_features; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_NUM); + if (rc) + return rc; + + memcpy(&get_feat_ctx->max_queues, &get_resp.u.max_queue, + sizeof(get_resp.u.max_queue)); + ena_dev->tx_max_header_size = get_resp.u.max_queue.max_header_size; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_AENQ_CONFIG); + if (rc) + return rc; + + memcpy(&get_feat_ctx->aenq, &get_resp.u.aenq, + sizeof(get_resp.u.aenq)); + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG); + if (rc) + return rc; + + memcpy(&get_feat_ctx->offload, &get_resp.u.offload, + sizeof(get_resp.u.offload)); + + /* Driver hints isn't mandatory admin command. So in case the + * command isn't supported set driver hints to 0 + */ + rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS); + + if (!rc) + memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints, + sizeof(get_resp.u.hw_hints)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->hw_hints, 0x0, + sizeof(get_feat_ctx->hw_hints)); + else + return rc; + + return 0; +} + +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev) +{ + ena_com_handle_admin_completion(&ena_dev->admin_queue); +} + +/* ena_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *dev, + u16 group) +{ + struct ena_aenq_handlers *aenq_handlers = dev->aenq.aenq_handlers; + + if ((group < ENA_MAX_HANDLERS) && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/* ena_aenq_intr_handler: + * handles the aenq incoming events. 
+ * pop events from the queue and apply the specific handler + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) +{ + struct ena_admin_aenq_entry *aenq_e; + struct ena_admin_aenq_common_desc *aenq_common; + struct ena_com_aenq *aenq = &dev->aenq; + ena_aenq_handler handler_cb; + u16 masked_head, processed = 0; + u8 phase; + + masked_head = aenq->head & (aenq->q_depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[masked_head]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((aenq_common->flags & ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == + phase) { + pr_debug("AENQ! Group[%x] Syndrom[%x] timestamp: [%llus]\n", + aenq_common->group, aenq_common->syndrom, + (u64)aenq_common->timestamp_low + + ((u64)aenq_common->timestamp_high << 32)); + + /* Handle specific event*/ + handler_cb = ena_com_get_specific_aenq_cb(dev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + masked_head++; + processed++; + + if (unlikely(masked_head == aenq->q_depth)) { + masked_head = 0; + phase = !phase; + } + aenq_e = &aenq->entries[masked_head]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->head += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* write the aenq doorbell after all AENQ descriptors were read */ + mb(); + writel((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +} + +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap, reset_val; + int rc; + + stat = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); + cap = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); + + if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || + (cap == ENA_MMIO_READ_TIMEOUT))) { + pr_err("Reg read32 timeout occurred\n"); + return -ETIME; + } + + if ((stat & ENA_REGS_DEV_STS_READY_MASK) == 0) { + pr_err("Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = (cap & ENA_REGS_CAPS_RESET_TIMEOUT_MASK) >> + ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT; + if (timeout == 0) { + pr_err("Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + reset_val = ENA_REGS_DEV_CTL_DEV_RESET_MASK; + reset_val |= (reset_reason << ENA_REGS_DEV_CTL_RESET_REASON_SHIFT) & + ENA_REGS_DEV_CTL_RESET_REASON_MASK; + writel(reset_val, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + + /* Write again the MMIO read request address */ + ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); + + rc = wait_for_reset_state(ena_dev, timeout, + ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); + if (rc != 0) { + pr_err("Reset indication didn't turn on\n"); + return rc; + } + + /* reset done */ + writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); + rc = wait_for_reset_state(ena_dev, timeout, 0); + if (rc != 0) { + pr_err("Reset indication didn't turn off\n"); + return rc; + } + + timeout = (cap & ENA_REGS_CAPS_ADMIN_CMD_TO_MASK) >> + ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT; + if (timeout) + /* the resolution of timeout reg is 100ms */ + ena_dev->admin_queue.completion_timeout = timeout * 100000; + else + ena_dev->admin_queue.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} + +static int ena_get_dev_stats(struct ena_com_dev *ena_dev, + struct ena_com_stats_ctx *ctx, + enum ena_admin_get_stats_type type) +{ + struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; + struct ena_admin_acq_get_stats_resp 
*get_resp = &ctx->get_resp; + struct ena_com_admin_queue *admin_queue; + int ret; + + admin_queue = &ena_dev->admin_queue; + + get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; + get_cmd->aq_common_descriptor.flags = 0; + get_cmd->type = type; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)get_cmd, + sizeof(*get_cmd), + (struct ena_admin_acq_entry *)get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + pr_err("Failed to get stats. error: %d\n", ret); + + return ret; +} + +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_BASIC); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.basic_stats, + sizeof(ctx.get_resp.basic_stats)); + + return ret; +} + +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) { + pr_debug("Feature %d isn't supported\n", ENA_ADMIN_MTU); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = 0; + cmd.feat_common.feature_id = ENA_ADMIN_MTU; + cmd.u.mtu.mtu = mtu; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + pr_err("Failed to set mtu %d. error: %d\n", mtu, ret); + + return ret; +} + +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload) +{ + int ret; + struct ena_admin_get_feat_resp resp; + + ret = ena_com_get_feature(ena_dev, &resp, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG); + if (unlikely(ret)) { + pr_err("Failed to get offload capabilities %d\n", ret); + return ret; + } + + memcpy(offload, &resp.u.offload, sizeof(resp.u.offload)); + + return 0; +} + +int ena_com_set_hash_function(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_admin_get_feat_resp get_resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_FUNCTION)) { + pr_debug("Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_FUNCTION); + return -EOPNOTSUPP; + } + + /* Validate hash function is supported */ + ret = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION); + if (unlikely(ret)) + return ret; + + if (get_resp.u.flow_hash_func.supported_func & (1 << rss->hash_func)) { + pr_err("Func hash %d isn't supported by device, abort\n", + rss->hash_func); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_FUNCTION; + cmd.u.flow_hash_func.init_val = rss->hash_init_val; + cmd.u.flow_hash_func.selected_func = 1 << rss->hash_func; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_key_dma_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return 
ret; + } + + cmd.control_buffer.length = sizeof(*rss->hash_key); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) { + pr_err("Failed to set hash function %d. error: %d\n", + rss->hash_func, ret); + return -EINVAL; + } + + return 0; +} + +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + struct ena_admin_feature_rss_flow_hash_control *hash_key = + rss->hash_key; + int rc; + + /* Make sure size is a mult of DWs */ + if (unlikely(key_len & 0x3)) + return -EINVAL; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key)); + if (unlikely(rc)) + return rc; + + if (!((1 << func) & get_resp.u.flow_hash_func.supported_func)) { + pr_err("Flow hash function %d isn't supported\n", func); + return -EOPNOTSUPP; + } + + switch (func) { + case ENA_ADMIN_TOEPLITZ: + if (key_len > sizeof(hash_key->key)) { + pr_err("key len (%hu) is bigger than the max supported (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; + } + + memcpy(hash_key->key, key, key_len); + rss->hash_init_val = init_val; + hash_key->keys_num = key_len >> 2; + break; + case ENA_ADMIN_CRC32: + rss->hash_init_val = init_val; + break; + default: + pr_err("Invalid hash function (%d)\n", func); + return -EINVAL; + } + + rc = ena_com_set_hash_function(ena_dev); + + /* Restore the old function */ + if (unlikely(rc)) + ena_com_get_hash_function(ena_dev, NULL, NULL); + + return rc; +} + +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func, + u8 *key) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + struct ena_admin_feature_rss_flow_hash_control *hash_key = + rss->hash_key; + int rc; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key)); + if (unlikely(rc)) + return rc; + + rss->hash_func = get_resp.u.flow_hash_func.selected_func; + if (func) + *func = rss->hash_func; + + if (key) + memcpy(key, hash_key->key, (size_t)(hash_key->keys_num) << 2); + + return 0; +} + +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + int rc; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_INPUT, + rss->hash_ctrl_dma_addr, + sizeof(*rss->hash_ctrl)); + if (unlikely(rc)) + return rc; + + if (fields) + *fields = rss->hash_ctrl->selected_fields[proto].fields; + + return 0; +} + +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_INPUT)) { + pr_debug("Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_INPUT); + return -EOPNOTSUPP; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + 
ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_HASH_INPUT; + cmd.u.flow_hash_input.enabled_input_sort = + ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK | + ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->hash_ctrl_dma_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + cmd.control_buffer.length = sizeof(*hash_ctrl); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + if (unlikely(ret)) + pr_err("Failed to set hash input. error: %d\n", ret); + + return ret; +} + +int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = + rss->hash_ctrl; + u16 available_fields = 0; + int rc, i; + + /* Get the supported hash input */ + rc = ena_com_get_hash_ctrl(ena_dev, 0, NULL); + if (unlikely(rc)) + return rc; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_TCP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_UDP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA | + ENA_ADMIN_RSS_L4_DP | ENA_ADMIN_RSS_L4_SP; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP6].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_IP4_FRAG].fields = + ENA_ADMIN_RSS_L3_SA | ENA_ADMIN_RSS_L3_DA; + + hash_ctrl->selected_fields[ENA_ADMIN_RSS_NOT_IP].fields = + ENA_ADMIN_RSS_L2_DA | ENA_ADMIN_RSS_L2_SA; + + for (i = 0; i < ENA_ADMIN_RSS_PROTO_NUM; i++) { + available_fields = hash_ctrl->selected_fields[i].fields & + hash_ctrl->supported_fields[i].fields; + if (available_fields != hash_ctrl->selected_fields[i].fields) { + pr_err("hash control doesn't support all the desire configuration. proto %x supported %x selected %x\n", + i, hash_ctrl->supported_fields[i].fields, + hash_ctrl->selected_fields[i].fields); + return -EOPNOTSUPP; + } + } + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return rc; +} + +int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 hash_fields) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_hash_control *hash_ctrl = rss->hash_ctrl; + u16 supported_fields; + int rc; + + if (proto >= ENA_ADMIN_RSS_PROTO_NUM) { + pr_err("Invalid proto num (%u)\n", proto); + return -EINVAL; + } + + /* Get the ctrl table */ + rc = ena_com_get_hash_ctrl(ena_dev, proto, NULL); + if (unlikely(rc)) + return rc; + + /* Make sure all the fields are supported */ + supported_fields = hash_ctrl->supported_fields[proto].fields; + if ((hash_fields & supported_fields) != hash_fields) { + pr_err("proto %d doesn't support the required fields %x. 
supports only: %x\n", + proto, hash_fields, supported_fields); + } + + hash_ctrl->selected_fields[proto].fields = hash_fields; + + rc = ena_com_set_hash_ctrl(ena_dev); + + /* In case of failure, restore the old hash ctrl */ + if (unlikely(rc)) + ena_com_get_hash_ctrl(ena_dev, 0, NULL); + + return 0; +} + +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value) +{ + struct ena_rss *rss = &ena_dev->rss; + + if (unlikely(entry_idx >= (1 << rss->tbl_log_size))) + return -EINVAL; + + if (unlikely((entry_value > ENA_TOTAL_NUM_QUEUES))) + return -EINVAL; + + rss->host_rss_ind_tbl[entry_idx] = entry_value; + + return 0; +} + +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + int ret; + + if (!ena_com_check_supported_feature_id( + ena_dev, ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG)) { + pr_debug("Feature %d isn't supported\n", + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG); + return -EOPNOTSUPP; + } + + ret = ena_com_ind_tbl_convert_to_device(ena_dev); + if (ret) { + pr_err("Failed to convert host indirection table to device table\n"); + return ret; + } + + memset(&cmd, 0x0, sizeof(cmd)); + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.aq_common_descriptor.flags = + ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG; + cmd.u.ind_table.size = rss->tbl_log_size; + cmd.u.ind_table.inline_index = 0xFFFFFFFF; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.control_buffer.address, + rss->rss_ind_tbl_dma_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + + cmd.control_buffer.length = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + pr_err("Failed to set indirect table. 
error: %d\n", ret); + + return ret; +} + +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) +{ + struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_get_feat_resp get_resp; + u32 tbl_size; + int i, rc; + + tbl_size = (1ULL << rss->tbl_log_size) * + sizeof(struct ena_admin_rss_ind_table_entry); + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, + rss->rss_ind_tbl_dma_addr, + tbl_size); + if (unlikely(rc)) + return rc; + + if (!ind_tbl) + return 0; + + rc = ena_com_ind_tbl_convert_from_device(ena_dev); + if (unlikely(rc)) + return rc; + + for (i = 0; i < (1 << rss->tbl_log_size); i++) + ind_tbl[i] = rss->host_rss_ind_tbl[i]; + + return 0; +} + +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) +{ + int rc; + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); + + rc = ena_com_indirect_table_allocate(ena_dev, indr_tbl_log_size); + if (unlikely(rc)) + goto err_indr_tbl; + + rc = ena_com_hash_key_allocate(ena_dev); + if (unlikely(rc)) + goto err_hash_key; + + rc = ena_com_hash_ctrl_init(ena_dev); + if (unlikely(rc)) + goto err_hash_ctrl; + + return 0; + +err_hash_ctrl: + ena_com_hash_key_destroy(ena_dev); +err_hash_key: + ena_com_indirect_table_destroy(ena_dev); +err_indr_tbl: + + return rc; +} + +void ena_com_rss_destroy(struct ena_com_dev *ena_dev) +{ + ena_com_indirect_table_destroy(ena_dev); + ena_com_hash_key_destroy(ena_dev); + ena_com_hash_ctrl_destroy(ena_dev); + + memset(&ena_dev->rss, 0x0, sizeof(ena_dev->rss)); +} + +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->host_info = + dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, + &host_attr->host_info_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->host_info)) + return -ENOMEM; + + return 0; +} + +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + host_attr->debug_area_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, debug_area_size, + &host_attr->debug_area_dma_addr, GFP_KERNEL); + if (unlikely(!host_attr->debug_area_virt_addr)) { + host_attr->debug_area_size = 0; + return -ENOMEM; + } + + host_attr->debug_area_size = debug_area_size; + + return 0; +} + +void ena_com_delete_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->host_info) { + dma_free_coherent(ena_dev->dmadev, SZ_4K, host_attr->host_info, + host_attr->host_info_dma_addr); + host_attr->host_info = NULL; + } +} + +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + + if (host_attr->debug_area_virt_addr) { + dma_free_coherent(ena_dev->dmadev, host_attr->debug_area_size, + host_attr->debug_area_virt_addr, + host_attr->debug_area_dma_addr); + host_attr->debug_area_virt_addr = NULL; + } +} + +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) +{ + struct ena_host_attribute *host_attr = &ena_dev->host_attr; + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + + int ret; + + /* Host attribute config is called before ena_com_get_dev_attr_feat + * so ena_com can't check if the feature is supported. 
+ */ + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_HOST_ATTR_CONFIG; + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.debug_ba, + host_attr->debug_area_dma_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + + ret = ena_com_mem_addr_set(ena_dev, + &cmd.u.host_attr.os_info_ba, + host_attr->host_info_dma_addr); + if (unlikely(ret)) { + pr_err("memory address set failed\n"); + return ret; + } + + cmd.u.host_attr.debug_area_size = host_attr->debug_area_size; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + pr_err("Failed to set host attributes: %d\n", ret); + + return ret; +} + +/* Interrupt moderation */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_INTERRUPT_MODERATION); +} + +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs) +{ + if (!ena_dev->intr_delay_resolution) { + pr_err("Illegal interrupt delay granularity value\n"); + return -EFAULT; + } + + ena_dev->intr_moder_tx_interval = tx_coalesce_usecs / + ena_dev->intr_delay_resolution; + + return 0; +} + +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs) +{ + if (!ena_dev->intr_delay_resolution) { + pr_err("Illegal interrupt delay granularity value\n"); + return -EFAULT; + } + + /* We use LOWEST entry of moderation table for storing + * nonadaptive interrupt coalescing values + */ + ena_dev->intr_moder_tbl[ENA_INTR_MODER_LOWEST].intr_moder_interval = + rx_coalesce_usecs / ena_dev->intr_delay_resolution; + + return 0; +} + +void ena_com_destroy_interrupt_moderation(struct ena_com_dev *ena_dev) +{ + if (ena_dev->intr_moder_tbl) + devm_kfree(ena_dev->dmadev, ena_dev->intr_moder_tbl); + ena_dev->intr_moder_tbl = NULL; +} + +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) +{ + struct ena_admin_get_feat_resp get_resp; + u16 delay_resolution; + int rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_INTERRUPT_MODERATION); + + if (rc) { + if (rc == -EOPNOTSUPP) { + pr_debug("Feature %d isn't supported\n", + ENA_ADMIN_INTERRUPT_MODERATION); + rc = 0; + } else { + pr_err("Failed to get interrupt moderation admin cmd. 
rc: %d\n", + rc); + } + + /* no moderation supported, disable adaptive support */ + ena_com_disable_adaptive_moderation(ena_dev); + return rc; + } + + rc = ena_com_init_interrupt_moderation_table(ena_dev); + if (rc) + goto err; + + /* if moderation is supported by device we set adaptive moderation */ + delay_resolution = get_resp.u.intr_moderation.intr_delay_resolution; + ena_com_update_intr_delay_resolution(ena_dev, delay_resolution); + ena_com_enable_adaptive_moderation(ena_dev); + + return 0; +err: + ena_com_destroy_interrupt_moderation(ena_dev); + return rc; +} + +void ena_com_config_default_interrupt_moderation_table(struct ena_com_dev *ena_dev) +{ + struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; + + if (!intr_moder_tbl) + return; + + intr_moder_tbl[ENA_INTR_MODER_LOWEST].intr_moder_interval = + ENA_INTR_LOWEST_USECS; + intr_moder_tbl[ENA_INTR_MODER_LOWEST].pkts_per_interval = + ENA_INTR_LOWEST_PKTS; + intr_moder_tbl[ENA_INTR_MODER_LOWEST].bytes_per_interval = + ENA_INTR_LOWEST_BYTES; + + intr_moder_tbl[ENA_INTR_MODER_LOW].intr_moder_interval = + ENA_INTR_LOW_USECS; + intr_moder_tbl[ENA_INTR_MODER_LOW].pkts_per_interval = + ENA_INTR_LOW_PKTS; + intr_moder_tbl[ENA_INTR_MODER_LOW].bytes_per_interval = + ENA_INTR_LOW_BYTES; + + intr_moder_tbl[ENA_INTR_MODER_MID].intr_moder_interval = + ENA_INTR_MID_USECS; + intr_moder_tbl[ENA_INTR_MODER_MID].pkts_per_interval = + ENA_INTR_MID_PKTS; + intr_moder_tbl[ENA_INTR_MODER_MID].bytes_per_interval = + ENA_INTR_MID_BYTES; + + intr_moder_tbl[ENA_INTR_MODER_HIGH].intr_moder_interval = + ENA_INTR_HIGH_USECS; + intr_moder_tbl[ENA_INTR_MODER_HIGH].pkts_per_interval = + ENA_INTR_HIGH_PKTS; + intr_moder_tbl[ENA_INTR_MODER_HIGH].bytes_per_interval = + ENA_INTR_HIGH_BYTES; + + intr_moder_tbl[ENA_INTR_MODER_HIGHEST].intr_moder_interval = + ENA_INTR_HIGHEST_USECS; + intr_moder_tbl[ENA_INTR_MODER_HIGHEST].pkts_per_interval = + ENA_INTR_HIGHEST_PKTS; + intr_moder_tbl[ENA_INTR_MODER_HIGHEST].bytes_per_interval = + ENA_INTR_HIGHEST_BYTES; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev) +{ + return ena_dev->intr_moder_tx_interval; +} + +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev) +{ + struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; + + if (intr_moder_tbl) + return intr_moder_tbl[ENA_INTR_MODER_LOWEST].intr_moder_interval; + + return 0; +} + +void ena_com_init_intr_moderation_entry(struct ena_com_dev *ena_dev, + enum ena_intr_moder_level level, + struct ena_intr_moder_entry *entry) +{ + struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; + + if (level >= ENA_INTR_MAX_NUM_OF_LEVELS) + return; + + intr_moder_tbl[level].intr_moder_interval = entry->intr_moder_interval; + if (ena_dev->intr_delay_resolution) + intr_moder_tbl[level].intr_moder_interval /= + ena_dev->intr_delay_resolution; + intr_moder_tbl[level].pkts_per_interval = entry->pkts_per_interval; + + /* use hardcoded value until ethtool supports bytecount parameter */ + if (entry->bytes_per_interval != ENA_INTR_BYTE_COUNT_NOT_SUPPORTED) + intr_moder_tbl[level].bytes_per_interval = entry->bytes_per_interval; +} + +void ena_com_get_intr_moderation_entry(struct ena_com_dev *ena_dev, + enum ena_intr_moder_level level, + struct ena_intr_moder_entry *entry) +{ + struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; + + if (level >= ENA_INTR_MAX_NUM_OF_LEVELS) + return; + + entry->intr_moder_interval = intr_moder_tbl[level].intr_moder_interval; + 
if (ena_dev->intr_delay_resolution) + entry->intr_moder_interval *= ena_dev->intr_delay_resolution; + entry->pkts_per_interval = + intr_moder_tbl[level].pkts_per_interval; + entry->bytes_per_interval = intr_moder_tbl[level].bytes_per_interval; +} diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h new file mode 100644 index 0000000000000..bd9c00110f87d --- /dev/null +++ b/drivers/amazon/net/ena/ena_com.h @@ -0,0 +1,1051 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ENA_COM +#define ENA_COM + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "ena_common_defs.h" +#include "ena_admin_defs.h" +#include "ena_eth_io_defs.h" +#include "ena_regs_defs.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define ENA_MAX_NUM_IO_QUEUES 128U +/* We need to queues for each IO (on for Tx and one for Rx) */ +#define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES)) + +#define ENA_MAX_HANDLERS 256 + +#define ENA_MAX_PHYS_ADDR_SIZE_BITS 48 + +/* Unit in usec */ +#define ENA_REG_READ_TIMEOUT 200000 + +#define ADMIN_SQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aq_entry)) +#define ADMIN_CQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_acq_entry)) +#define ADMIN_AENQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aenq_entry)) + +/*****************************************************************************/ +/*****************************************************************************/ +/* ENA adaptive interrupt moderation settings */ + +#define ENA_INTR_LOWEST_USECS (0) +#define ENA_INTR_LOWEST_PKTS (3) +#define ENA_INTR_LOWEST_BYTES (2 * 1524) + +#define ENA_INTR_LOW_USECS (32) +#define ENA_INTR_LOW_PKTS (12) +#define ENA_INTR_LOW_BYTES (16 * 1024) + +#define ENA_INTR_MID_USECS (80) +#define ENA_INTR_MID_PKTS (48) +#define ENA_INTR_MID_BYTES (64 * 1024) + +#define ENA_INTR_HIGH_USECS (128) +#define ENA_INTR_HIGH_PKTS (96) +#define ENA_INTR_HIGH_BYTES (128 * 1024) + +#define ENA_INTR_HIGHEST_USECS (192) +#define ENA_INTR_HIGHEST_PKTS (128) +#define ENA_INTR_HIGHEST_BYTES (192 * 1024) + +#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 196 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 4 +#define ENA_INTR_DELAY_OLD_VALUE_WEIGHT 6 +#define ENA_INTR_DELAY_NEW_VALUE_WEIGHT 4 +#define ENA_INTR_MODER_LEVEL_STRIDE 1 +#define ENA_INTR_BYTE_COUNT_NOT_SUPPORTED 0xFFFFFF + +#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF + +enum ena_intr_moder_level { + ENA_INTR_MODER_LOWEST = 0, + ENA_INTR_MODER_LOW, + ENA_INTR_MODER_MID, + ENA_INTR_MODER_HIGH, + ENA_INTR_MODER_HIGHEST, + ENA_INTR_MAX_NUM_OF_LEVELS, +}; + +struct ena_intr_moder_entry { + unsigned int intr_moder_interval; + unsigned int pkts_per_interval; + unsigned int bytes_per_interval; +}; + +enum queue_direction { + ENA_COM_IO_QUEUE_DIRECTION_TX, + ENA_COM_IO_QUEUE_DIRECTION_RX +}; + +struct ena_com_buf { + dma_addr_t paddr; /**< Buffer physical address */ + u16 len; /**< Buffer length in bytes */ +}; + +struct ena_com_rx_buf_info { + u16 len; + u16 req_id; +}; + +struct ena_com_io_desc_addr { + u8 __iomem *pbuf_dev_addr; /* LLQ address */ + u8 *virt_addr; + dma_addr_t phys_addr; +}; + +struct ena_com_tx_meta { + u16 mss; + u16 l3_hdr_len; + u16 l3_hdr_offset; + u16 l4_hdr_len; /* In words */ +}; + +struct ena_com_io_cq { + struct ena_com_io_desc_addr cdesc_addr; + void *bus; + + /* Interrupt unmask register */ + u32 __iomem *unmask_reg; + + /* The completion queue head doorbell register */ + u32 __iomem *cq_head_db_reg; + + /* numa configuration register (for TPH) */ + u32 __iomem *numa_node_cfg_reg; + + /* The value to write to the above register to unmask + * the interrupt of this queue + */ + u32 msix_vector; + + enum queue_direction direction; + + /* holds the number of cdesc of the current packet */ + u16 cur_rx_pkt_cdesc_count; + /* save the firt cdesc idx of the current packet */ + u16 cur_rx_pkt_cdesc_start_idx; + + u16 q_depth; + /* Caller qid */ + u16 qid; + + /* Device queue index */ + u16 idx; + u16 head; + u16 last_head_update; + 
u8 phase; + u8 cdesc_entry_size_in_bytes; + +} ____cacheline_aligned; + +struct ena_com_io_sq { + struct ena_com_io_desc_addr desc_addr; + void *bus; + + u32 __iomem *db_addr; + u8 __iomem *header_addr; + + enum queue_direction direction; + enum ena_admin_placement_policy_type mem_queue_type; + + u32 msix_vector; + struct ena_com_tx_meta cached_tx_meta; + + u16 q_depth; + u16 qid; + + u16 idx; + u16 tail; + u16 next_to_comp; + u32 tx_max_header_size; + u8 phase; + u8 desc_entry_size; + u8 dma_addr_bits; +} ____cacheline_aligned; + +struct ena_com_admin_cq { + struct ena_admin_acq_entry *entries; + dma_addr_t dma_addr; + + u16 head; + u8 phase; +}; + +struct ena_com_admin_sq { + struct ena_admin_aq_entry *entries; + dma_addr_t dma_addr; + + u32 __iomem *db_addr; + + u16 head; + u16 tail; + u8 phase; + +}; + +struct ena_com_stats_admin { + u32 aborted_cmd; + u32 submitted_cmd; + u32 completed_cmd; + u32 out_of_space; + u32 no_completion; +}; + +struct ena_com_admin_queue { + void *q_dmadev; + void *bus; + spinlock_t q_lock; /* spinlock for the admin queue */ + + struct ena_comp_ctx *comp_ctx; + u32 completion_timeout; + u16 q_depth; + struct ena_com_admin_cq cq; + struct ena_com_admin_sq sq; + + /* Indicate if the admin queue should poll for completion */ + bool polling; + + u16 curr_cmd_id; + + /* Indicate that the ena was initialized and can + * process new admin commands + */ + bool running_state; + + /* Count the number of outstanding admin commands */ + atomic_t outstanding_cmds; + + struct ena_com_stats_admin stats; +}; + +struct ena_aenq_handlers; + +struct ena_com_aenq { + u16 head; + u8 phase; + struct ena_admin_aenq_entry *entries; + dma_addr_t dma_addr; + u16 q_depth; + struct ena_aenq_handlers *aenq_handlers; +}; + +struct ena_com_mmio_read { + struct ena_admin_ena_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u32 reg_read_to; /* in us */ + u16 seq_num; + bool readless_supported; + /* spin lock to ensure a single outstanding read */ + spinlock_t lock; +}; + +struct ena_rss { + /* Indirect table */ + u16 *host_rss_ind_tbl; + struct ena_admin_rss_ind_table_entry *rss_ind_tbl; + dma_addr_t rss_ind_tbl_dma_addr; + u16 tbl_log_size; + + /* Hash key */ + enum ena_admin_hash_functions hash_func; + struct ena_admin_feature_rss_flow_hash_control *hash_key; + dma_addr_t hash_key_dma_addr; + u32 hash_init_val; + + /* Flow Control */ + struct ena_admin_feature_rss_hash_control *hash_ctrl; + dma_addr_t hash_ctrl_dma_addr; + +}; + +struct ena_host_attribute { + /* Debug area */ + u8 *debug_area_virt_addr; + dma_addr_t debug_area_dma_addr; + u32 debug_area_size; + + /* Host information */ + struct ena_admin_host_info *host_info; + dma_addr_t host_info_dma_addr; +}; + +/* Each ena_dev is a PCI function. 
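It owns one admin queue, one AENQ and
+ * up to ENA_TOTAL_NUM_QUEUES IO queues (Tx and Rx are counted separately).
+ 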
*/ +struct ena_com_dev { + struct ena_com_admin_queue admin_queue; + struct ena_com_aenq aenq; + struct ena_com_io_cq io_cq_queues[ENA_TOTAL_NUM_QUEUES]; + struct ena_com_io_sq io_sq_queues[ENA_TOTAL_NUM_QUEUES]; + u8 __iomem *reg_bar; + void __iomem *mem_bar; + void *dmadev; + void *bus; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + u32 tx_max_header_size; + u16 stats_func; /* Selected function for extended statistic dump */ + u16 stats_queue; /* Selected queue for extended statistic dump */ + + struct ena_com_mmio_read mmio_read; + + struct ena_rss rss; + u32 supported_features; + u32 dma_addr_bits; + + struct ena_host_attribute host_attr; + bool adaptive_coalescing; + u16 intr_delay_resolution; + u32 intr_moder_tx_interval; + struct ena_intr_moder_entry *intr_moder_tbl; +}; + +struct ena_com_dev_get_features_ctx { + struct ena_admin_queue_feature_desc max_queues; + struct ena_admin_device_attr_feature_desc dev_attr; + struct ena_admin_feature_aenq_desc aenq; + struct ena_admin_feature_offload_desc offload; + struct ena_admin_ena_hw_hints hw_hints; +}; + +struct ena_com_create_io_ctx { + enum ena_admin_placement_policy_type mem_queue_type; + enum queue_direction direction; + int numa_node; + u32 msix_vector; + u16 queue_size; + u16 qid; +}; + +typedef void (*ena_aenq_handler)(void *data, + struct ena_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. Indexed by AENQ event group */ +struct ena_aenq_handlers { + ena_aenq_handler handlers[ENA_MAX_HANDLERS]; + ena_aenq_handler unimplemented_handler; +}; + +/*****************************************************************************/ +/*****************************************************************************/ + +/* ena_com_mmio_reg_read_request_init - Init the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * + * Initialize the register read mechanism. + * + * @note: This method must be the first stage in the initialization sequence. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); + +/* ena_com_set_mmio_read_mode - Enable/disable the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + * @readless_supported: readless mode (enable/disable) + */ +void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, + bool readless_supported); + +/* ena_com_mmio_reg_read_request_write_dev_addr - Write the mmio reg read return + * value physical address. + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev); + +/* ena_com_mmio_reg_read_request_destroy - Destroy the mmio reg read mechanism + * @ena_dev: ENA communication layer struct + */ +void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_admin_init - Init the admin and the async queues + * @ena_dev: ENA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * @init_spinlock: Indicate if this method should init the admin spinlock or + * the spinlock was init before (for example, in a case of FLR). + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_admin_init(struct ena_com_dev *ena_dev, + struct ena_aenq_handlers *aenq_handlers, + bool init_spinlock); + +/* ena_com_admin_destroy - Destroy the admin and the async events queues. 
+ * @ena_dev: ENA communication layer struct + * + * @note: Before calling this method, the caller must validate that the device + * won't send any additional admin completions/aenq. + * To achieve that, a FLR is recommended. + */ +void ena_com_admin_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_dev_reset - Perform device FLR to the device. + * @ena_dev: ENA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_dev_reset(struct ena_com_dev *ena_dev, + enum ena_regs_reset_reason_types reset_reason); + +/* ena_com_create_io_queue - Create io queue. + * @ena_dev: ENA communication layer struct + * @ctx - create context structure + * + * Create the submission and the completion queues. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_create_io_queue(struct ena_com_dev *ena_dev, + struct ena_com_create_io_ctx *ctx); + +/* ena_com_destroy_io_queue - Destroy IO queue with the queue id - qid. + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + */ +void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid); + +/* ena_com_get_io_handlers - Return the io queue handlers + * @ena_dev: ENA communication layer struct + * @qid - the caller virtual queue id. + * @io_sq - IO submission queue handler + * @io_cq - IO completion queue handler. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, + struct ena_com_io_sq **io_sq, + struct ena_com_io_cq **io_cq); + +/* ena_com_admin_aenq_enable - ENAble asynchronous event notifications + * @ena_dev: ENA communication layer struct + * + * After this method, aenq event can be received via AENQ. + */ +void ena_com_admin_aenq_enable(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_running_state - Set the state of the admin queue + * @ena_dev: ENA communication layer struct + * + * Change the state of the admin queue (enable/disable) + */ +void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state); + +/* ena_com_get_admin_running_state - Get the admin queue state + * @ena_dev: ENA communication layer struct + * + * Retrieve the state of the admin queue (enable/disable) + * + * @return - current polling mode (enable/disable) + */ +bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * @polling: ENAble/Disable polling mode + * + * Set the admin completion mode. + */ +void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling); + +/* ena_com_set_admin_polling_mode - Get the admin completion queue polling mode + * @ena_dev: ENA communication layer struct + * + * Get the admin completion mode. + * If polling mode is on, ena_com_execute_admin_command will perform a + * polling on the admin completion queue for the commands completion, + * otherwise it will wait on wait event. + * + * @return state + */ +bool ena_com_get_ena_admin_polling_mode(struct ena_com_dev *ena_dev); + +/* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method go over the admin completion queue and wake up all the pending + * threads that wait on the commands wait event. + * + * @note: Should be called after MSI-X interrupt. 
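+ *
+ * Usage sketch (illustrative only; the interrupt wiring and the use of the
+ * cookie are hypothetical, not mandated by this API):
+ *
+ *	static irqreturn_t mgmnt_intr_handler(int irq, void *data)
+ *	{
+ *		struct ena_com_dev *ena_dev = data;
+ *
+ *		ena_com_admin_q_comp_intr_handler(ena_dev);
+ *		ena_com_aenq_intr_handler(ena_dev, data);
+ *
+ *		return IRQ_HANDLED;
+ *	}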
+ */ +void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); + +/* ena_com_aenq_intr_handler - AENQ interrupt handler + * @ena_dev: ENA communication layer struct + * + * This method go over the async event notification queue and call the proper + * aenq handler. + */ +void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data); + +/* ena_com_abort_admin_commands - Abort all the outstanding admin commands. + * @ena_dev: ENA communication layer struct + * + * This method aborts all the outstanding admin commands. + * The caller should then call ena_com_wait_for_abort_completion to make sure + * all the commands were completed. + */ +void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev); + +/* ena_com_wait_for_abort_completion - Wait for admin commands abort. + * @ena_dev: ENA communication layer struct + * + * This method wait until all the outstanding admin commands will be completed. + */ +void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev); + +/* ena_com_validate_version - Validate the device parameters + * @ena_dev: ENA communication layer struct + * + * This method validate the device parameters are the same as the saved + * parameters in ena_dev. + * This method is useful after device reset, to validate the device mac address + * and the device offloads are the same as before the reset. + * + * @return - 0 on success negative value otherwise. + */ +int ena_com_validate_version(struct ena_com_dev *ena_dev); + +/* ena_com_get_link_params - Retrieve physical link parameters. + * @ena_dev: ENA communication layer struct + * @resp: Link parameters + * + * Retrieve the physical link parameters, + * like speed, auto-negotiation and full duplex support. + * + * @return - 0 on Success negative value otherwise. + */ +int ena_com_get_link_params(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp); + +/* ena_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @ena_dev: ENA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. + */ +int ena_com_get_dma_width(struct ena_com_dev *ena_dev); + +/* ena_com_set_aenq_config - Set aenq groups configurations + * @ena_dev: ENA communication layer struct + * @groups flag: bit fields flags of enum ena_admin_aenq_group. + * + * Configure which aenq event group the driver would like to receive. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag); + +/* ena_com_get_dev_attr_feat - Get device features + * @ena_dev: ENA communication layer struct + * @get_feat_ctx: returned context that contain the get features. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx); + +/* ena_com_get_dev_basic_stats - Get device basic statistics + * @ena_dev: ENA communication layer struct + * @stats: stats return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, + struct ena_admin_basic_stats *stats); + +/* ena_com_set_dev_mtu - Configure the device mtu. + * @ena_dev: ENA communication layer struct + * @mtu: mtu value + * + * @return: 0 on Success and negative value otherwise. 
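+ *
+ * Illustrative call (new_mtu is a hypothetical, driver-provided value):
+ *
+ *	rc = ena_com_set_dev_mtu(ena_dev, new_mtu);
+ *	if (unlikely(rc))
+ *		pr_err("Failed to set MTU to %d\n", new_mtu);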
+ */ +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu); + +/* ena_com_get_offload_settings - Retrieve the device offloads capabilities + * @ena_dev: ENA communication layer struct + * @offlad: offload return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, + struct ena_admin_feature_offload_desc *offload); + +/* ena_com_rss_init - Init RSS + * @ena_dev: ENA communication layer struct + * @log_size: indirection log size + * + * Allocate RSS/RFS resources. + * The caller then can configure rss using ena_com_set_hash_function, + * ena_com_set_hash_ctrl and ena_com_indirect_table_set. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 log_size); + +/* ena_com_rss_destroy - Destroy rss + * @ena_dev: ENA communication layer struct + * + * Free all the RSS/RFS resources. + */ +void ena_com_rss_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_fill_hash_function - Fill RSS hash function + * @ena_dev: ENA communication layer struct + * @func: The hash function (Toeplitz or crc) + * @key: Hash key (for toeplitz hash) + * @key_len: key length (max length 10 DW) + * @init_val: initial value for the hash function + * + * Fill the ena_dev resources with the desire hash function, hash key, key_len + * and key initial value (if needed by the hash function). + * To flush the key into the device the caller should call + * ena_com_set_hash_function. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions func, + const u8 *key, u16 key_len, u32 init_val); + +/* ena_com_set_hash_function - Flush the hash function and it dependencies to + * the device. + * @ena_dev: ENA communication layer struct + * + * Flush the hash function and it dependencies (key, key length and + * initial value) if needed. + * + * @note: Prior to this method the caller should call ena_com_fill_hash_function + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_hash_function(struct ena_com_dev *ena_dev); + +/* ena_com_get_hash_function - Retrieve the hash function and the hash key + * from the device. + * @ena_dev: ENA communication layer struct + * @func: hash function + * @key: hash key + * + * Retrieve the hash function and the hash key from the device. + * + * @note: If the caller called ena_com_fill_hash_function but didn't flash + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_function(struct ena_com_dev *ena_dev, + enum ena_admin_hash_functions *func, + u8 *key); + +/* ena_com_fill_hash_ctrl - Fill RSS hash control + * @ena_dev: ENA communication layer struct. + * @proto: The protocol to configure. + * @hash_fields: bit mask of ena_admin_flow_hash_fields + * + * Fill the ena_dev resources with the desire hash control (the ethernet + * fields that take part of the hash) for a specific protocol. + * To flush the hash control to the device, the caller should call + * ena_com_set_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 hash_fields); + +/* ena_com_set_hash_ctrl - Flush the hash control resources to the device. 
+ * @ena_dev: ENA communication layer struct + * + * Flush the hash control (the ethernet fields that take part of the hash) + * + * @note: Prior to this method the caller should call ena_com_fill_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev); + +/* ena_com_get_hash_ctrl - Retrieve the hash control from the device. + * @ena_dev: ENA communication layer struct + * @proto: The protocol to retrieve. + * @fields: bit mask of ena_admin_flow_hash_fields. + * + * Retrieve the hash control from the device. + * + * @note, If the caller called ena_com_fill_hash_ctrl but didn't flash + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, + enum ena_admin_flow_hash_proto proto, + u16 *fields); + +/* ena_com_set_default_hash_ctrl - Set the hash control to a default + * configuration. + * @ena_dev: ENA communication layer struct + * + * Fill the ena_dev resources with the default hash control configuration. + * To flush the hash control to the device, the caller should call + * ena_com_set_hash_ctrl. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev); + +/* ena_com_indirect_table_fill_entry - Fill a single entry in the RSS + * indirection table + * @ena_dev: ENA communication layer struct. + * @entry_idx - indirection table entry. + * @entry_value - redirection value + * + * Fill a single entry of the RSS indirection table in the ena_dev resources. + * To flush the indirection table to the device, the called should call + * ena_com_indirect_table_set. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_fill_entry(struct ena_com_dev *ena_dev, + u16 entry_idx, u16 entry_value); + +/* ena_com_indirect_table_set - Flush the indirection table to the device. + * @ena_dev: ENA communication layer struct + * + * Flush the indirection hash control to the device. + * Prior to this method the caller should call ena_com_indirect_table_fill_entry + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_set(struct ena_com_dev *ena_dev); + +/* ena_com_indirect_table_get - Retrieve the indirection table from the device. + * @ena_dev: ENA communication layer struct + * @ind_tbl: indirection table + * + * Retrieve the RSS indirection table from the device. + * + * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flash + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl); + +/* ena_com_allocate_host_info - Allocate host info resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_host_info(struct ena_com_dev *ena_dev); + +/* ena_com_allocate_debug_area - Allocate debug area. + * @ena_dev: ENA communication layer struct + * @debug_area_size - debug area size. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, + u32 debug_area_size); + +/* ena_com_delete_debug_area - Free the debug area resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocate debug area. 
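+ * Counterpart of ena_com_allocate_debug_area(); typically called during
+ * driver teardown, together with ena_com_delete_host_info().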
+ */ +void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); + +/* ena_com_delete_host_info - Free the host info resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocate host info. + */ +void ena_com_delete_host_info(struct ena_com_dev *ena_dev); + +/* ena_com_set_host_attributes - Update the device with the host + * attributes (debug area and host info) base address. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_set_host_attributes(struct ena_com_dev *ena_dev); + +/* ena_com_create_io_cq - Create io completion queue. + * @ena_dev: ENA communication layer struct + * @io_cq - io completion queue handler + + * Create IO completion queue. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_create_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq); + +/* ena_com_destroy_io_cq - Destroy io completion queue. + * @ena_dev: ENA communication layer struct + * @io_cq - io completion queue handler + + * Destroy IO completion queue. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, + struct ena_com_io_cq *io_cq); + +/* ena_com_execute_admin_command - Execute admin command + * @admin_queue: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @cmd_completion: command completion return value. + * @cmd_comp_size: command completion size. + + * Submit an admin command and then wait until the device will return a + * completion. + * The completion will be copyed into cmd_comp. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, + struct ena_admin_aq_entry *cmd, + size_t cmd_size, + struct ena_admin_acq_entry *cmd_comp, + size_t cmd_comp_size); + +/* ena_com_init_interrupt_moderation - Init interrupt moderation + * @ena_dev: ENA communication layer struct + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev); + +/* ena_com_destroy_interrupt_moderation - Destroy interrupt moderation resources + * @ena_dev: ENA communication layer struct + */ +void ena_com_destroy_interrupt_moderation(struct ena_com_dev *ena_dev); + +/* ena_com_interrupt_moderation_supported - Return if interrupt moderation + * capability is supported by the device. + * + * @return - supported or not. + */ +bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev); + +/* ena_com_config_default_interrupt_moderation_table - Restore the interrupt + * moderation table back to the default parameters. + * @ena_dev: ENA communication layer struct + */ +void ena_com_config_default_interrupt_moderation_table(struct ena_com_dev *ena_dev); + +/* ena_com_update_nonadaptive_moderation_interval_tx - Update the + * non-adaptive interval in Tx direction. + * @ena_dev: ENA communication layer struct + * @tx_coalesce_usecs: Interval in usec. + * + * @return - 0 on success, negative value on failure. + */ +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs); + +/* ena_com_update_nonadaptive_moderation_interval_rx - Update the + * non-adaptive interval in Rx direction. + * @ena_dev: ENA communication layer struct + * @rx_coalesce_usecs: Interval in usec. + * + * @return - 0 on success, negative value on failure. 
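+ *
+ * Illustrative call (e.g. from an ethtool set_coalesce handler; coal is a
+ * hypothetical struct ethtool_coalesce argument):
+ *
+ *	rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev,
+ *						coal->rx_coalesce_usecs);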
+ */ +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs); + +/* ena_com_get_nonadaptive_moderation_interval_tx - Retrieve the + * non-adaptive interval in Tx direction. + * @ena_dev: ENA communication layer struct + * + * @return - interval in usec + */ +unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev); + +/* ena_com_get_nonadaptive_moderation_interval_rx - Retrieve the + * non-adaptive interval in Rx direction. + * @ena_dev: ENA communication layer struct + * + * @return - interval in usec + */ +unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev); + +/* ena_com_init_intr_moderation_entry - Update a single entry in the interrupt + * moderation table. + * @ena_dev: ENA communication layer struct + * @level: Interrupt moderation table level + * @entry: Entry value + * + * Update a single entry in the interrupt moderation table. + */ +void ena_com_init_intr_moderation_entry(struct ena_com_dev *ena_dev, + enum ena_intr_moder_level level, + struct ena_intr_moder_entry *entry); + +/* ena_com_get_intr_moderation_entry - Init ena_intr_moder_entry. + * @ena_dev: ENA communication layer struct + * @level: Interrupt moderation table level + * @entry: Entry to fill. + * + * Initialize the entry according to the adaptive interrupt moderation table. + */ +void ena_com_get_intr_moderation_entry(struct ena_com_dev *ena_dev, + enum ena_intr_moder_level level, + struct ena_intr_moder_entry *entry); + +static inline bool ena_com_get_adaptive_moderation_enabled(struct ena_com_dev *ena_dev) +{ + return ena_dev->adaptive_coalescing; +} + +static inline void ena_com_enable_adaptive_moderation(struct ena_com_dev *ena_dev) +{ + ena_dev->adaptive_coalescing = true; +} + +static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_dev) +{ + ena_dev->adaptive_coalescing = false; +} + +/* ena_com_calculate_interrupt_delay - Calculate new interrupt delay + * @ena_dev: ENA communication layer struct + * @pkts: Number of packets since the last update + * @bytes: Number of bytes received since the last update. + * @smoothed_interval: Returned interval + * @moder_tbl_idx: Current table level as input update new level as return + * value. + */ +static inline void ena_com_calculate_interrupt_delay(struct ena_com_dev *ena_dev, + unsigned int pkts, + unsigned int bytes, + unsigned int *smoothed_interval, + unsigned int *moder_tbl_idx) +{ + enum ena_intr_moder_level curr_moder_idx, new_moder_idx; + struct ena_intr_moder_entry *curr_moder_entry; + struct ena_intr_moder_entry *pred_moder_entry; + struct ena_intr_moder_entry *new_moder_entry; + struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; + unsigned int interval; + + /* We apply adaptive moderation on Rx path only. + * Tx uses static interrupt moderation. 
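+	 *
+	 * The moderation table is walked one level at a time: traffic above
+	 * the current level's packet/byte thresholds moves one level up,
+	 * traffic below the previous level's thresholds moves one level down.
+	 * The chosen interval is then smoothed against the previous value
+	 * using ENA_INTR_DELAY_NEW_VALUE_WEIGHT and
+	 * ENA_INTR_DELAY_OLD_VALUE_WEIGHT (4 and 6, with rounding), and is
+	 * typically written back to the device via ena_com_update_intr_reg()
+	 * and ena_com_unmask_intr().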
+ */ + if (!pkts || !bytes) + /* Tx interrupt, or spurious interrupt, + * in both cases we just use same delay values + */ + return; + + curr_moder_idx = (enum ena_intr_moder_level)(*moder_tbl_idx); + if (unlikely(curr_moder_idx >= ENA_INTR_MAX_NUM_OF_LEVELS)) { + pr_err("Wrong moderation index %u\n", curr_moder_idx); + return; + } + + curr_moder_entry = &intr_moder_tbl[curr_moder_idx]; + new_moder_idx = curr_moder_idx; + + if (curr_moder_idx == ENA_INTR_MODER_LOWEST) { + if ((pkts > curr_moder_entry->pkts_per_interval) || + (bytes > curr_moder_entry->bytes_per_interval)) + new_moder_idx = + (enum ena_intr_moder_level)(curr_moder_idx + ENA_INTR_MODER_LEVEL_STRIDE); + } else { + pred_moder_entry = &intr_moder_tbl[curr_moder_idx - ENA_INTR_MODER_LEVEL_STRIDE]; + + if ((pkts <= pred_moder_entry->pkts_per_interval) || + (bytes <= pred_moder_entry->bytes_per_interval)) + new_moder_idx = + (enum ena_intr_moder_level)(curr_moder_idx - ENA_INTR_MODER_LEVEL_STRIDE); + else if ((pkts > curr_moder_entry->pkts_per_interval) || + (bytes > curr_moder_entry->bytes_per_interval)) { + if (curr_moder_idx != ENA_INTR_MODER_HIGHEST) + new_moder_idx = + (enum ena_intr_moder_level)(curr_moder_idx + ENA_INTR_MODER_LEVEL_STRIDE); + } + } + new_moder_entry = &intr_moder_tbl[new_moder_idx]; + + interval = new_moder_entry->intr_moder_interval; + *smoothed_interval = ( + (interval * ENA_INTR_DELAY_NEW_VALUE_WEIGHT + + ENA_INTR_DELAY_OLD_VALUE_WEIGHT * (*smoothed_interval)) + 5) / + 10; + + *moder_tbl_idx = new_moder_idx; +} + +/* ena_com_update_intr_reg - Prepare interrupt register + * @intr_reg: interrupt register to update. + * @rx_delay_interval: Rx interval in usecs + * @tx_delay_interval: Tx interval in usecs + * @unmask: unask enable/disable + * + * Prepare interrupt update register with the supplied parameters. + */ +static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, + u32 rx_delay_interval, + u32 tx_delay_interval, + bool unmask) +{ + intr_reg->intr_control = 0; + intr_reg->intr_control |= rx_delay_interval & + ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK; + + intr_reg->intr_control |= + (tx_delay_interval << ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT) + & ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK; + + if (unmask) + intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; +} + +#endif /* !(ENA_COM) */ diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h new file mode 100644 index 0000000000000..bb8d73676eab6 --- /dev/null +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -0,0 +1,48 @@ +/* + * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _ENA_COMMON_H_ +#define _ENA_COMMON_H_ + +#define ENA_COMMON_SPEC_VERSION_MAJOR 0 /* */ +#define ENA_COMMON_SPEC_VERSION_MINOR 10 /* */ + +/* ENA operates with 48-bit memory addresses. ena_mem_addr_t */ +struct ena_common_mem_addr { + u32 mem_addr_low; + + u16 mem_addr_high; + + /* MBZ */ + u16 reserved16; +}; + +#endif /*_ENA_COMMON_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c new file mode 100644 index 0000000000000..582ea54e25b25 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -0,0 +1,518 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ena_eth_com.h" + +static inline struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( + struct ena_com_io_cq *io_cq) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + u16 expected_phase, head_masked; + u16 desc_phase; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_rx_cdesc_base *)(io_cq->cdesc_addr.virt_addr + + (head_masked * io_cq->cdesc_entry_size_in_bytes)); + + desc_phase = (READ_ONCE(cdesc->status) & ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; + + if (desc_phase != expected_phase) + return NULL; + + return cdesc; +} + +static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq) +{ + io_cq->head++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_cq->head & (io_cq->q_depth - 1)) == 0)) + io_cq->phase ^= 1; +} + +static inline void *get_sq_desc(struct ena_com_io_sq *io_sq) +{ + u16 tail_masked; + u32 offset; + + tail_masked = io_sq->tail & (io_sq->q_depth - 1); + + offset = tail_masked * io_sq->desc_entry_size; + + return (void *)((uintptr_t)io_sq->desc_addr.virt_addr + offset); +} + +static inline void ena_com_copy_curr_sq_desc_to_dev(struct ena_com_io_sq *io_sq) +{ + u16 tail_masked = io_sq->tail & (io_sq->q_depth - 1); + u32 offset = tail_masked * io_sq->desc_entry_size; + + /* In case this queue isn't a LLQ */ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return; + + memcpy_toio(io_sq->desc_addr.pbuf_dev_addr + offset, + io_sq->desc_addr.virt_addr + offset, + io_sq->desc_entry_size); +} + +static inline void ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +{ + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; +} + +static inline int ena_com_write_header(struct ena_com_io_sq *io_sq, + u8 *head_src, u16 header_len) +{ + u16 tail_masked = io_sq->tail & (io_sq->q_depth - 1); + u8 __iomem *dev_head_addr = + io_sq->header_addr + (tail_masked * io_sq->tx_max_header_size); + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return 0; + + if (unlikely(!io_sq->header_addr)) { + pr_err("Push buffer header ptr is NULL\n"); + return -EINVAL; + } + + memcpy_toio(dev_head_addr, head_src, header_len); + + return 0; +} + +static inline struct ena_eth_io_rx_cdesc_base * + ena_com_rx_cdesc_idx_to_ptr(struct ena_com_io_cq *io_cq, u16 idx) +{ + idx &= (io_cq->q_depth - 1); + return (struct ena_eth_io_rx_cdesc_base *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + idx * io_cq->cdesc_entry_size_in_bytes); +} + +static inline u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, + u16 *first_cdesc_idx) +{ + struct ena_eth_io_rx_cdesc_base *cdesc; + u16 count = 0, head_masked; + u32 last = 0; + + do { + cdesc = ena_com_get_next_rx_cdesc(io_cq); + if (!cdesc) + break; + + ena_com_cq_inc_head(io_cq); + count++; + last = (READ_ONCE(cdesc->status) & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + } while (!last); + + if (last) { + *first_cdesc_idx = io_cq->cur_rx_pkt_cdesc_start_idx; + count += io_cq->cur_rx_pkt_cdesc_count; + + head_masked = io_cq->head & (io_cq->q_depth - 1); + + io_cq->cur_rx_pkt_cdesc_count = 0; + io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; + + pr_debug("ena q_id: %d packets were completed. 
first desc idx %u descs# %d\n", + io_cq->qid, *first_cdesc_idx, count); + } else { + io_cq->cur_rx_pkt_cdesc_count += count; + count = 0; + } + + return count; +} + +static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + int rc; + + if (ena_tx_ctx->meta_valid) { + rc = memcmp(&io_sq->cached_tx_meta, + &ena_tx_ctx->ena_meta, + sizeof(struct ena_com_tx_meta)); + + if (unlikely(rc != 0)) + return true; + } + + return false; +} + +static inline void ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_eth_io_tx_meta_desc *meta_desc = NULL; + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + + meta_desc = get_sq_desc(io_sq); + memset(meta_desc, 0x0, sizeof(struct ena_eth_io_tx_meta_desc)); + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_DESC_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK; + + /* bits 0-9 of the mss */ + meta_desc->word2 |= (ena_meta->mss << + ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK; + /* bits 10-13 of the mss */ + meta_desc->len_ctrl |= ((ena_meta->mss >> 10) << + ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT) & + ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK; + + /* Extended meta desc */ + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK; + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + meta_desc->len_ctrl |= (io_sq->phase << + ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_META_DESC_PHASE_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_FIRST_MASK; + meta_desc->word2 |= ena_meta->l3_hdr_len & + ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK; + meta_desc->word2 |= (ena_meta->l3_hdr_offset << + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK; + + meta_desc->word2 |= (ena_meta->l4_hdr_len << + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT) & + ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK; + + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + + /* Cached the meta desc */ + memcpy(&io_sq->cached_tx_meta, ena_meta, + sizeof(struct ena_com_tx_meta)); + + ena_com_copy_curr_sq_desc_to_dev(io_sq); + ena_com_sq_update_tail(io_sq); +} + +static inline void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, + struct ena_eth_io_rx_cdesc_base *cdesc) +{ + ena_rx_ctx->l3_proto = cdesc->status & + ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK; + ena_rx_ctx->l4_proto = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT; + ena_rx_ctx->l3_csum_err = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT; + ena_rx_ctx->l4_csum_err = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT; + ena_rx_ctx->hash = cdesc->hash; + ena_rx_ctx->frag = + (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT; + + pr_debug("ena_rx_ctx->l3_proto %d ena_rx_ctx->l4_proto %d\nena_rx_ctx->l3_csum_err %d ena_rx_ctx->l4_csum_err %d\nhash frag %d frag: %d cdesc_status: %x\n", + ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, + ena_rx_ctx->l3_csum_err, ena_rx_ctx->l4_csum_err, + ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); +} + +/*****************************************************************************/ +/***************************** API **********************************/ 
+/*****************************************************************************/ + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc) +{ + struct ena_eth_io_tx_desc *desc = NULL; + struct ena_com_buf *ena_bufs = ena_tx_ctx->ena_bufs; + void *push_header = ena_tx_ctx->push_header; + u16 header_len = ena_tx_ctx->header_len; + u16 num_bufs = ena_tx_ctx->num_bufs; + int total_desc, i, rc; + bool have_meta; + u64 addr_hi; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_TX, "wrong Q type"); + + /* num_bufs +1 for potential meta desc */ + if (ena_com_sq_empty_space(io_sq) < (num_bufs + 1)) { + pr_err("Not enough space in the tx queue\n"); + return -ENOMEM; + } + + if (unlikely(header_len > io_sq->tx_max_header_size)) { + pr_err("header size is too large %d max header: %d\n", + header_len, io_sq->tx_max_header_size); + return -EINVAL; + } + + /* start with pushing the header (if needed) */ + rc = ena_com_write_header(io_sq, push_header, header_len); + if (unlikely(rc)) + return rc; + + have_meta = ena_tx_ctx->meta_valid && ena_com_meta_desc_changed(io_sq, + ena_tx_ctx); + if (have_meta) + ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx); + + /* If the caller doesn't want send packets */ + if (unlikely(!num_bufs && !header_len)) { + *nb_hw_desc = have_meta ? 0 : 1; + return 0; + } + + desc = get_sq_desc(io_sq); + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + /* Set first desc when we don't have meta descriptor */ + if (!have_meta) + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_FIRST_MASK; + + desc->buff_addr_hi_hdr_sz |= (header_len << + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT) & + ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK; + desc->len_ctrl |= (io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_COMP_REQ_MASK; + + /* Bits 0-9 */ + desc->meta_ctrl |= (ena_tx_ctx->req_id << + ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK; + + desc->meta_ctrl |= (ena_tx_ctx->df << + ENA_ETH_IO_TX_DESC_DF_SHIFT) & + ENA_ETH_IO_TX_DESC_DF_MASK; + + /* Bits 10-15 */ + desc->len_ctrl |= ((ena_tx_ctx->req_id >> 10) << + ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT) & + ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK; + + if (ena_tx_ctx->meta_valid) { + desc->meta_ctrl |= (ena_tx_ctx->tso_enable << + ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_TSO_EN_MASK; + desc->meta_ctrl |= ena_tx_ctx->l3_proto & + ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_proto << + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l3_csum_enable << + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_enable << + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK; + desc->meta_ctrl |= (ena_tx_ctx->l4_csum_partial << + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT) & + ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK; + } + + for (i = 0; i < num_bufs; i++) { + /* The first desc share the same desc as the header */ + if (likely(i != 0)) { + ena_com_copy_curr_sq_desc_to_dev(io_sq); + ena_com_sq_update_tail(io_sq); + + desc = get_sq_desc(io_sq); + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); + + desc->len_ctrl |= (io_sq->phase << + ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + ENA_ETH_IO_TX_DESC_PHASE_MASK; + } + + desc->len_ctrl |= ena_bufs->len & + ENA_ETH_IO_TX_DESC_LENGTH_MASK; + + addr_hi = ((ena_bufs->paddr & + 
GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + desc->buff_addr_lo = (u32)ena_bufs->paddr; + desc->buff_addr_hi_hdr_sz |= addr_hi & + ENA_ETH_IO_TX_DESC_ADDR_HI_MASK; + ena_bufs++; + } + + /* set the last desc indicator */ + desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; + + ena_com_copy_curr_sq_desc_to_dev(io_sq); + + ena_com_sq_update_tail(io_sq); + + total_desc = max_t(u16, num_bufs, 1); + total_desc += have_meta ? 1 : 0; + + *nb_hw_desc = total_desc; + return 0; +} + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx) +{ + struct ena_com_rx_buf_info *ena_buf = &ena_rx_ctx->ena_bufs[0]; + struct ena_eth_io_rx_cdesc_base *cdesc = NULL; + u16 cdesc_idx = 0; + u16 nb_hw_desc; + u16 i; + + WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + nb_hw_desc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx); + if (nb_hw_desc == 0) { + ena_rx_ctx->descs = nb_hw_desc; + return 0; + } + + pr_debug("fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, + nb_hw_desc); + + if (unlikely(nb_hw_desc > ena_rx_ctx->max_bufs)) { + pr_err("Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, + ena_rx_ctx->max_bufs); + return -ENOSPC; + } + + for (i = 0; i < nb_hw_desc; i++) { + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); + + ena_buf->len = cdesc->length; + ena_buf->req_id = cdesc->req_id; + ena_buf++; + } + + /* Update SQ head ptr */ + io_sq->next_to_comp += nb_hw_desc; + + pr_debug("[%s][QID#%d] Updating SQ head to: %d\n", __func__, io_sq->qid, + io_sq->next_to_comp); + + /* Get rx flags from the last pkt */ + ena_com_rx_set_flags(ena_rx_ctx, cdesc); + + ena_rx_ctx->descs = nb_hw_desc; + return 0; +} + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id) +{ + struct ena_eth_io_rx_desc *desc; + + WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); + + if (unlikely(ena_com_sq_empty_space(io_sq) == 0)) + return -ENOSPC; + + desc = get_sq_desc(io_sq); + memset(desc, 0x0, sizeof(struct ena_eth_io_rx_desc)); + + desc->length = ena_buf->len; + + desc->ctrl |= ENA_ETH_IO_RX_DESC_FIRST_MASK; + desc->ctrl |= ENA_ETH_IO_RX_DESC_LAST_MASK; + desc->ctrl |= io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK; + desc->ctrl |= ENA_ETH_IO_RX_DESC_COMP_REQ_MASK; + + desc->req_id = req_id; + + desc->buff_addr_lo = (u32)ena_buf->paddr; + desc->buff_addr_hi = + ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); + + ena_com_sq_update_tail(io_sq); + + return 0; +} + +int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id) +{ + u8 expected_phase, cdesc_phase; + struct ena_eth_io_tx_cdesc *cdesc; + u16 masked_head; + + masked_head = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_tx_cdesc *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + (masked_head * io_cq->cdesc_entry_size_in_bytes)); + + /* When the current completion descriptor phase isn't the same as the + * expected, it mean that the device still didn't update + * this completion. 
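+	 * Returning -EAGAIN below is therefore not an error; it only tells the
+	 * caller that no new Tx completion is available yet.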
+ */
+	cdesc_phase = READ_ONCE(cdesc->flags) & ENA_ETH_IO_TX_CDESC_PHASE_MASK;
+	if (cdesc_phase != expected_phase)
+		return -EAGAIN;
+
+	if (unlikely(cdesc->req_id >= io_cq->q_depth)) {
+		pr_err("Invalid req id %d\n", cdesc->req_id);
+		return -EINVAL;
+	}
+
+	ena_com_cq_inc_head(io_cq);
+
+	*req_id = READ_ONCE(cdesc->req_id);
+
+	return 0;
+}
+
+bool ena_com_cq_empty(struct ena_com_io_cq *io_cq)
+{
+	struct ena_eth_io_rx_cdesc_base *cdesc;
+
+	cdesc = ena_com_get_next_rx_cdesc(io_cq);
+
+	return !cdesc;
+}
+
diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h
new file mode 100644
index 0000000000000..2f7657227cfe9
--- /dev/null
+++ b/drivers/amazon/net/ena/ena_eth_com.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2015 Amazon.com, Inc. or its affiliates.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#ifndef ENA_ETH_COM_H_ +#define ENA_ETH_COM_H_ + +#include "ena_com.h" + +/* head update threshold in units of (queue size / ENA_COMP_HEAD_THRESH) */ +#define ENA_COMP_HEAD_THRESH 4 + +struct ena_com_tx_ctx { + struct ena_com_tx_meta ena_meta; + struct ena_com_buf *ena_bufs; + /* For LLQ, header buffer - pushed to the device mem space */ + void *push_header; + + enum ena_eth_io_l3_proto_index l3_proto; + enum ena_eth_io_l4_proto_index l4_proto; + u16 num_bufs; + u16 req_id; + /* For regular queue, indicate the size of the header + * For LLQ, indicate the size of the pushed buffer + */ + u16 header_len; + + u8 meta_valid; + u8 tso_enable; + u8 l3_csum_enable; + u8 l4_csum_enable; + u8 l4_csum_partial; + u8 df; /* Don't fragment */ +}; + +struct ena_com_rx_ctx { + struct ena_com_rx_buf_info *ena_bufs; + enum ena_eth_io_l3_proto_index l3_proto; + enum ena_eth_io_l4_proto_index l4_proto; + bool l3_csum_err; + bool l4_csum_err; + /* fragmented packet */ + bool frag; + u32 hash; + u16 descs; + int max_bufs; +}; + +int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + int *nb_hw_desc); + +int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, + struct ena_com_io_sq *io_sq, + struct ena_com_rx_ctx *ena_rx_ctx); + +int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, + struct ena_com_buf *ena_buf, + u16 req_id); + +int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id); + +bool ena_com_cq_empty(struct ena_com_io_cq *io_cq); + +static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, + struct ena_eth_io_intr_reg *intr_reg) +{ + writel(intr_reg->intr_control, io_cq->unmask_reg); +} + +static inline int ena_com_sq_empty_space(struct ena_com_io_sq *io_sq) +{ + u16 tail, next_to_comp, cnt; + + next_to_comp = io_sq->next_to_comp; + tail = io_sq->tail; + cnt = tail - next_to_comp; + + return io_sq->q_depth - 1 - cnt; +} + +static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) +{ + u16 tail; + + tail = io_sq->tail; + + pr_debug("write submission queue doorbell for queue: %d tail: %d\n", + io_sq->qid, tail); + + writel(tail, io_sq->db_addr); + + return 0; +} + +static inline int ena_com_update_dev_comp_head(struct ena_com_io_cq *io_cq) +{ + u16 unreported_comp, head; + bool need_update; + + head = io_cq->head; + unreported_comp = head - io_cq->last_head_update; + need_update = unreported_comp > (io_cq->q_depth / ENA_COMP_HEAD_THRESH); + + if (io_cq->cq_head_db_reg && need_update) { + pr_debug("Write completion queue doorbell for queue %d: head: %d\n", + io_cq->qid, head); + writel(head, io_cq->cq_head_db_reg); + io_cq->last_head_update = head; + } + + return 0; +} + +static inline void ena_com_update_numa_node(struct ena_com_io_cq *io_cq, + u8 numa_node) +{ + struct ena_eth_io_numa_node_cfg_reg numa_cfg; + + if (!io_cq->numa_node_cfg_reg) + return; + + numa_cfg.numa_cfg = (numa_node & ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK) + | ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK; + + writel(numa_cfg.numa_cfg, io_cq->numa_node_cfg_reg); +} + +static inline void ena_com_comp_ack(struct ena_com_io_sq *io_sq, u16 elem) +{ + io_sq->next_to_comp += elem; +} + +#endif /* ENA_ETH_COM_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h new file mode 100644 index 0000000000000..f320c58793a52 --- /dev/null +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -0,0 +1,416 @@ +/* + * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _ENA_ETH_IO_H_ +#define _ENA_ETH_IO_H_ + +enum ena_eth_io_l3_proto_index { + ENA_ETH_IO_L3_PROTO_UNKNOWN = 0, + + ENA_ETH_IO_L3_PROTO_IPV4 = 8, + + ENA_ETH_IO_L3_PROTO_IPV6 = 11, + + ENA_ETH_IO_L3_PROTO_FCOE = 21, + + ENA_ETH_IO_L3_PROTO_ROCE = 22, +}; + +enum ena_eth_io_l4_proto_index { + ENA_ETH_IO_L4_PROTO_UNKNOWN = 0, + + ENA_ETH_IO_L4_PROTO_TCP = 12, + + ENA_ETH_IO_L4_PROTO_UDP = 13, + + ENA_ETH_IO_L4_PROTO_ROUTEABLE_ROCE = 23, +}; + +struct ena_eth_io_tx_desc { + /* 15:0 : length - Buffer length in bytes, must + * include any packet trailers that the ENA supposed + * to update like End-to-End CRC, Authentication GMAC + * etc. This length must not include the + * 'Push_Buffer' length. This length must not include + * the 4-byte added in the end for 802.3 Ethernet FCS + * 21:16 : req_id_hi - Request ID[15:10] + * 22 : reserved22 - MBZ + * 23 : meta_desc - MBZ + * 24 : phase + * 25 : reserved1 - MBZ + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 28 : comp_req - Indicates whether completion + * should be posted, after packet is transmitted. + * Valid only for first descriptor + * 30:29 : reserved29 - MBZ + * 31 : reserved31 - MBZ + */ + u32 len_ctrl; + + /* 3:0 : l3_proto_idx - L3 protocol. This field + * required when l3_csum_en,l3_csum or tso_en are set. + * 4 : DF - IPv4 DF, must be 0 if packet is IPv4 and + * DF flags of the IPv4 header is 0. Otherwise must + * be set to 1 + * 6:5 : reserved5 + * 7 : tso_en - Enable TSO, For TCP only. + * 12:8 : l4_proto_idx - L4 protocol. This field need + * to be set when l4_csum_en or tso_en are set. + * 13 : l3_csum_en - enable IPv4 header checksum. + * 14 : l4_csum_en - enable TCP/UDP checksum. + * 15 : ethernet_fcs_dis - when set, the controller + * will not append the 802.3 Ethernet Frame Check + * Sequence to the packet + * 16 : reserved16 + * 17 : l4_csum_partial - L4 partial checksum. when + * set to 0, the ENA calculates the L4 checksum, + * where the Destination Address required for the + * TCP/UDP pseudo-header is taken from the actual + * packet L3 header. 
when set to 1, the ENA doesn't + * calculate the sum of the pseudo-header, instead, + * the checksum field of the L4 is used instead. When + * TSO enabled, the checksum of the pseudo-header + * must not include the tcp length field. L4 partial + * checksum should be used for IPv6 packet that + * contains Routing Headers. + * 20:18 : reserved18 - MBZ + * 21 : reserved21 - MBZ + * 31:22 : req_id_lo - Request ID[9:0] + */ + u32 meta_ctrl; + + u32 buff_addr_lo; + + /* address high and header size + * 15:0 : addr_hi - Buffer Pointer[47:32] + * 23:16 : reserved16_w2 + * 31:24 : header_length - Header length. For Low + * Latency Queues, this fields indicates the number + * of bytes written to the headers' memory. For + * normal queues, if packet is TCP or UDP, and longer + * than max_header_size, then this field should be + * set to the sum of L4 header offset and L4 header + * size(without options), otherwise, this field + * should be set to 0. For both modes, this field + * must not exceed the max_header_size. + * max_header_size value is reported by the Max + * Queues Feature descriptor + */ + u32 buff_addr_hi_hdr_sz; +}; + +struct ena_eth_io_tx_meta_desc { + /* 9:0 : req_id_lo - Request ID[9:0] + * 11:10 : reserved10 - MBZ + * 12 : reserved12 - MBZ + * 13 : reserved13 - MBZ + * 14 : ext_valid - if set, offset fields in Word2 + * are valid Also MSS High in Word 0 and bits [31:24] + * in Word 3 + * 15 : reserved15 + * 19:16 : mss_hi + * 20 : eth_meta_type - 0: Tx Metadata Descriptor, 1: + * Extended Metadata Descriptor + * 21 : meta_store - Store extended metadata in queue + * cache + * 22 : reserved22 - MBZ + * 23 : meta_desc - MBO + * 24 : phase + * 25 : reserved25 - MBZ + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 28 : comp_req - Indicates whether completion + * should be posted, after packet is transmitted. + * Valid only for first descriptor + * 30:29 : reserved29 - MBZ + * 31 : reserved31 - MBZ + */ + u32 len_ctrl; + + /* 5:0 : req_id_hi + * 31:6 : reserved6 - MBZ + */ + u32 word1; + + /* 7:0 : l3_hdr_len + * 15:8 : l3_hdr_off + * 21:16 : l4_hdr_len_in_words - counts the L4 header + * length in words. there is an explicit assumption + * that L4 header appears right after L3 header and + * L4 offset is based on l3_hdr_off+l3_hdr_len + * 31:22 : mss_lo + */ + u32 word2; + + u32 reserved; +}; + +struct ena_eth_io_tx_cdesc { + /* Request ID[15:0] */ + u16 req_id; + + u8 status; + + /* flags + * 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 sub_qid; + + u16 sq_head_idx; +}; + +struct ena_eth_io_rx_desc { + /* In bytes. 0 means 64KB */ + u16 length; + + /* MBZ */ + u8 reserved2; + + /* 0 : phase + * 1 : reserved1 - MBZ + * 2 : first - Indicates first descriptor in + * transaction + * 3 : last - Indicates last descriptor in transaction + * 4 : comp_req + * 5 : reserved5 - MBO + * 7:6 : reserved6 - MBZ + */ + u8 ctrl; + + u16 req_id; + + /* MBZ */ + u16 reserved6; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + /* MBZ */ + u16 reserved16_w3; +}; + +/* 4-word format Note: all ethernet parsing information are valid only when + * last=1 + */ +struct ena_eth_io_rx_cdesc_base { + /* 4:0 : l3_proto_idx + * 6:5 : src_vlan_cnt + * 7 : reserved7 - MBZ + * 12:8 : l4_proto_idx + * 13 : l3_csum_err - when set, either the L3 + * checksum error detected, or, the controller didn't + * validate the checksum. 
This bit is valid only when + * l3_proto_idx indicates IPv4 packet + * 14 : l4_csum_err - when set, either the L4 + * checksum error detected, or, the controller didn't + * validate the checksum. This bit is valid only when + * l4_proto_idx indicates TCP/UDP packet, and, + * ipv4_frag is not set + * 15 : ipv4_frag - Indicates IPv4 fragmented packet + * 23:16 : reserved16 + * 24 : phase + * 25 : l3_csum2 - second checksum engine result + * 26 : first - Indicates first descriptor in + * transaction + * 27 : last - Indicates last descriptor in + * transaction + * 29:28 : reserved28 + * 30 : buffer - 0: Metadata descriptor. 1: Buffer + * Descriptor was used + * 31 : reserved31 + */ + u32 status; + + u16 length; + + u16 req_id; + + /* 32-bit hash result */ + u32 hash; + + u16 sub_qid; + + u16 reserved; +}; + +/* 8-word format */ +struct ena_eth_io_rx_cdesc_ext { + struct ena_eth_io_rx_cdesc_base base; + + u32 buff_addr_lo; + + u16 buff_addr_hi; + + u16 reserved16; + + u32 reserved_w6; + + u32 reserved_w7; +}; + +struct ena_eth_io_intr_reg { + /* 14:0 : rx_intr_delay + * 29:15 : tx_intr_delay + * 30 : intr_unmask + * 31 : reserved + */ + u32 intr_control; +}; + +struct ena_eth_io_numa_node_cfg_reg { + /* 7:0 : numa + * 30:8 : reserved + * 31 : enabled + */ + u32 numa_cfg; +}; + +/* tx_desc */ +#define ENA_ETH_IO_TX_DESC_LENGTH_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT 16 +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) +#define ENA_ETH_IO_TX_DESC_DF_SHIFT 4 +#define ENA_ETH_IO_TX_DESC_DF_MASK BIT(4) +#define ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT 7 +#define ENA_ETH_IO_TX_DESC_TSO_EN_MASK BIT(7) +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT 13 +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK BIT(13) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT 14 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK BIT(14) +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT 22 +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) +#define ENA_ETH_IO_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) + +/* tx_meta_desc */ +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_LO_MASK GENMASK(9, 0) +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_SHIFT 14 +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK BIT(14) +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK GENMASK(19, 16) +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_SHIFT 20 +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK BIT(20) +#define ENA_ETH_IO_TX_META_DESC_META_STORE_SHIFT 21 +#define ENA_ETH_IO_TX_META_DESC_META_STORE_MASK BIT(21) +#define ENA_ETH_IO_TX_META_DESC_META_DESC_SHIFT 23 
+#define ENA_ETH_IO_TX_META_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_META_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_META_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_META_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_META_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_META_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_HI_MASK GENMASK(5, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK GENMASK(7, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT 8 +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK GENMASK(15, 8) +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT 22 +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK GENMASK(31, 22) + +/* tx_cdesc */ +#define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) + +/* rx_desc */ +#define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_RX_DESC_FIRST_SHIFT 2 +#define ENA_ETH_IO_RX_DESC_FIRST_MASK BIT(2) +#define ENA_ETH_IO_RX_DESC_LAST_SHIFT 3 +#define ENA_ETH_IO_RX_DESC_LAST_MASK BIT(3) +#define ENA_ETH_IO_RX_DESC_COMP_REQ_SHIFT 4 +#define ENA_ETH_IO_RX_DESC_COMP_REQ_MASK BIT(4) + +/* rx_cdesc_base */ +#define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK BIT(13) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT 14 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK BIT(14) +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT 15 +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 24 +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_MASK BIT(25) +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT 26 +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK BIT(26) +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT 27 +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK BIT(27) +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_SHIFT 30 +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_MASK BIT(30) + +/* intr_reg */ +#define ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK GENMASK(14, 0) +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT 15 +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) + +/* numa_node_cfg_reg */ +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) + +#endif /*_ENA_ETH_IO_H_ */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c new file mode 100644 index 0000000000000..fcd002f0a7fbd --- /dev/null +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -0,0 +1,1015 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ena_netdev.h" + +struct ena_stats { + char name[ETH_GSTRING_LEN]; + int stat_offset; +}; + +#define ENA_STAT_ENA_COM_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_com_stats_admin, stat) \ +} + +#define ENA_STAT_ENTRY(stat, stat_type) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_stats_##stat_type, stat) \ +} + +#define ENA_STAT_RX_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, rx) + +#define ENA_STAT_TX_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, tx) + +#define ENA_STAT_GLOBAL_ENTRY(stat) \ + ENA_STAT_ENTRY(stat, dev) + +static const struct ena_stats ena_stats_global_strings[] = { + ENA_STAT_GLOBAL_ENTRY(tx_timeout), + ENA_STAT_GLOBAL_ENTRY(suspend), + ENA_STAT_GLOBAL_ENTRY(resume), + ENA_STAT_GLOBAL_ENTRY(wd_expired), + ENA_STAT_GLOBAL_ENTRY(interface_up), + ENA_STAT_GLOBAL_ENTRY(interface_down), + ENA_STAT_GLOBAL_ENTRY(admin_q_pause), +}; + +static const struct ena_stats ena_stats_tx_strings[] = { + ENA_STAT_TX_ENTRY(cnt), + ENA_STAT_TX_ENTRY(bytes), + ENA_STAT_TX_ENTRY(queue_stop), + ENA_STAT_TX_ENTRY(queue_wakeup), + ENA_STAT_TX_ENTRY(dma_mapping_err), + ENA_STAT_TX_ENTRY(linearize), + ENA_STAT_TX_ENTRY(linearize_failed), + ENA_STAT_TX_ENTRY(napi_comp), + ENA_STAT_TX_ENTRY(tx_poll), + ENA_STAT_TX_ENTRY(doorbells), + ENA_STAT_TX_ENTRY(prepare_ctx_err), + ENA_STAT_TX_ENTRY(bad_req_id), + ENA_STAT_TX_ENTRY(missed_tx), +}; + +static const struct ena_stats ena_stats_rx_strings[] = { + ENA_STAT_RX_ENTRY(cnt), + ENA_STAT_RX_ENTRY(bytes), + ENA_STAT_RX_ENTRY(refil_partial), + ENA_STAT_RX_ENTRY(bad_csum), + ENA_STAT_RX_ENTRY(page_alloc_fail), + ENA_STAT_RX_ENTRY(skb_alloc_fail), + ENA_STAT_RX_ENTRY(dma_mapping_err), + ENA_STAT_RX_ENTRY(bad_desc_num), + ENA_STAT_RX_ENTRY(rx_copybreak_pkt), +#if ENA_BUSY_POLL_SUPPORT + ENA_STAT_RX_ENTRY(bp_yield), + ENA_STAT_RX_ENTRY(bp_missed), + ENA_STAT_RX_ENTRY(bp_cleaned), +#endif + ENA_STAT_RX_ENTRY(bad_req_id), + ENA_STAT_RX_ENTRY(empty_rx_ring), +}; + +static const struct ena_stats ena_stats_ena_com_strings[] = { + ENA_STAT_ENA_COM_ENTRY(aborted_cmd), + ENA_STAT_ENA_COM_ENTRY(submitted_cmd), + ENA_STAT_ENA_COM_ENTRY(completed_cmd), + ENA_STAT_ENA_COM_ENTRY(out_of_space), + ENA_STAT_ENA_COM_ENTRY(no_completion), +}; + +#define ENA_STATS_ARRAY_GLOBAL ARRAY_SIZE(ena_stats_global_strings) +#define 
ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) +#define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) +#define ENA_STATS_ARRAY_ENA_COM ARRAY_SIZE(ena_stats_ena_com_strings) + +static void ena_safe_update_stat(u64 *src, u64 *dst, + struct u64_stats_sync *syncp) +{ + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(syncp); + *(dst) = *src; + } while (u64_stats_fetch_retry_irq(syncp, start)); +} + +static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + struct ena_ring *ring; + + u64 *ptr; + int i, j; + + for (i = 0; i < adapter->num_queues; i++) { + /* Tx stats */ + ring = &adapter->tx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + ptr = (u64 *)((uintptr_t)&ring->tx_stats + + (uintptr_t)ena_stats->stat_offset); + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + + /* Rx stats */ + ring = &adapter->rx_ring[i]; + + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ptr = (u64 *)((uintptr_t)&ring->rx_stats + + (uintptr_t)ena_stats->stat_offset); + + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } + } +} + +static void ena_dev_admin_queue_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u32 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM; i++) { + ena_stats = &ena_stats_ena_com_strings[i]; + + ptr = (u32 *)((uintptr_t)&adapter->ena_dev->admin_queue.stats + + (uintptr_t)ena_stats->stat_offset); + + *(*data)++ = *ptr; + } +} + +static void ena_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, + u64 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { + ena_stats = &ena_stats_global_strings[i]; + + ptr = (u64 *)((uintptr_t)&adapter->dev_stats + + (uintptr_t)ena_stats->stat_offset); + + ena_safe_update_stat(ptr, data++, &adapter->syncp); + } + + ena_queue_stats(adapter, &data); + ena_dev_admin_queue_stats(adapter, &data); +} + +int ena_get_sset_count(struct net_device *netdev, int sset) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + if (sset != ETH_SS_STATS) + return -EOPNOTSUPP; + + return adapter->num_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; +} + +static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) +{ + const struct ena_stats *ena_stats; + int i, j; + + for (i = 0; i < adapter->num_queues; i++) { + /* Tx stats */ + for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { + ena_stats = &ena_stats_tx_strings[j]; + + snprintf(*data, ETH_GSTRING_LEN, + "queue_%u_tx_%s", i, ena_stats->name); + (*data) += ETH_GSTRING_LEN; + } + /* Rx stats */ + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + snprintf(*data, ETH_GSTRING_LEN, + "queue_%u_rx_%s", i, ena_stats->name); + (*data) += ETH_GSTRING_LEN; + } + } +} + +static void ena_com_dev_strings(u8 **data) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM; i++) { + ena_stats = &ena_stats_ena_com_strings[i]; + + snprintf(*data, ETH_GSTRING_LEN, + "ena_admin_q_%s", ena_stats->name); + (*data) += ETH_GSTRING_LEN; + } +} + +static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + const struct ena_stats *ena_stats; + int i; + + if (sset != ETH_SS_STATS) 
+ return; + + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { + ena_stats = &ena_stats_global_strings[i]; + + memcpy(data, ena_stats->name, ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + + ena_queue_strings(adapter, &data); + ena_com_dev_strings(&data); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) +static int ena_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + link_ksettings->base.speed = link->speed; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + } + + link_ksettings->base.autoneg = + (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) ? + AUTONEG_ENABLE : AUTONEG_DISABLE; + + link_ksettings->base.duplex = DUPLEX_FULL; + + return 0; +} + +#else +static int ena_get_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feature_link_desc *link; + struct ena_admin_get_feat_resp feat_resp; + int rc; + + rc = ena_com_get_link_params(ena_dev, &feat_resp); + if (rc) + return rc; + + link = &feat_resp.u.link; + + ethtool_cmd_speed_set(ecmd, link->speed); + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK) + ecmd->duplex = DUPLEX_FULL; + else + ecmd->duplex = DUPLEX_HALF; + + if (link->flags & ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK) + ecmd->autoneg = AUTONEG_ENABLE; + else + ecmd->autoneg = AUTONEG_DISABLE; + + return 0; +} + +#endif +static int ena_get_coalesce(struct net_device *net_dev, + struct ethtool_coalesce *coalesce) +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) { + /* the devie doesn't support interrupt moderation */ + return -EOPNOTSUPP; + } + coalesce->tx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) / + ena_dev->intr_delay_resolution; + if (!ena_com_get_adaptive_moderation_enabled(ena_dev)) { + coalesce->rx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev) + / ena_dev->intr_delay_resolution; + } + coalesce->use_adaptive_rx_coalesce = + ena_com_get_adaptive_moderation_enabled(ena_dev); + + return 0; +} + +static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); + + for (i = 0; i < adapter->num_queues; i++) + adapter->tx_ring[i].smoothed_interval = val; +} + +static int ena_set_coalesce(struct net_device *net_dev, + struct ethtool_coalesce *coalesce) +{ + struct ena_adapter *adapter = netdev_priv(net_dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc; + + if (!ena_com_interrupt_moderation_supported(ena_dev)) { + /* the devie doesn't support interrupt moderation */ + return -EOPNOTSUPP; + } + + if (coalesce->rx_coalesce_usecs_irq || + coalesce->rx_max_coalesced_frames_irq || + coalesce->tx_coalesce_usecs_irq || + coalesce->tx_max_coalesced_frames || + 
coalesce->tx_max_coalesced_frames_irq || + coalesce->stats_block_coalesce_usecs || + coalesce->use_adaptive_tx_coalesce || + coalesce->pkt_rate_low || + coalesce->tx_coalesce_usecs_low || + coalesce->tx_max_coalesced_frames_low || + coalesce->pkt_rate_high || + coalesce->tx_coalesce_usecs_high || + coalesce->tx_max_coalesced_frames_high || + coalesce->rate_sample_interval) + return -EINVAL; + + /* Note, adaptive coalescing settings are updated through sysfs */ + if (coalesce->rx_max_coalesced_frames || + coalesce->rx_coalesce_usecs_low || + coalesce->rx_max_coalesced_frames_low || + coalesce->rx_coalesce_usecs_high || + coalesce->rx_max_coalesced_frames_high) + return -EINVAL; + rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, + coalesce->tx_coalesce_usecs); + if (rc) + return rc; + + ena_update_tx_rings_intr_moderation(adapter); + + if (ena_com_get_adaptive_moderation_enabled(ena_dev)) { + if (!coalesce->use_adaptive_rx_coalesce) { + ena_com_disable_adaptive_moderation(ena_dev); + rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, + coalesce->rx_coalesce_usecs); + return rc; + } else { + /* was in adaptive mode and remains in it, + * allow to update only tx_usecs, rx is managed through sysfs + */ + if (coalesce->rx_coalesce_usecs) + return -EINVAL; + } + } else { /* was in non-adaptive mode */ + if (coalesce->use_adaptive_rx_coalesce) { + ena_com_enable_adaptive_moderation(ena_dev); + } else { + rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, + coalesce->rx_coalesce_usecs); + return rc; + } + } + + return 0; +} + +static u32 ena_get_msglevel(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return adapter->msg_enable; +} + +static void ena_set_msglevel(struct net_device *netdev, u32 value) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + adapter->msg_enable = value; +} + +static void ena_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct ena_adapter *adapter = netdev_priv(dev); + + strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); +} + +static void ena_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *tx_ring = &adapter->tx_ring[0]; + struct ena_ring *rx_ring = &adapter->rx_ring[0]; + + ring->rx_max_pending = rx_ring->ring_size; + ring->tx_max_pending = tx_ring->ring_size; + ring->rx_pending = rx_ring->ring_size; + ring->tx_pending = tx_ring->ring_size; +} + +#ifdef ETHTOOL_GRXRINGS +static u32 ena_flow_hash_to_flow_type(u16 hash_fields) +{ + u32 data = 0; + + if (hash_fields & ENA_ADMIN_RSS_L2_DA) + data |= RXH_L2DA; + + if (hash_fields & ENA_ADMIN_RSS_L3_DA) + data |= RXH_IP_DST; + + if (hash_fields & ENA_ADMIN_RSS_L3_SA) + data |= RXH_IP_SRC; + + if (hash_fields & ENA_ADMIN_RSS_L4_DP) + data |= RXH_L4_B_2_3; + + if (hash_fields & ENA_ADMIN_RSS_L4_SP) + data |= RXH_L4_B_0_1; + + return data; +} + +static u16 ena_flow_data_to_flow_hash(u32 hash_fields) +{ + u16 data = 0; + + if (hash_fields & RXH_L2DA) + data |= ENA_ADMIN_RSS_L2_DA; + + if (hash_fields & RXH_IP_DST) + data |= ENA_ADMIN_RSS_L3_DA; + + if (hash_fields & RXH_IP_SRC) + data |= ENA_ADMIN_RSS_L3_SA; + + if (hash_fields & RXH_L4_B_2_3) + data |= ENA_ADMIN_RSS_L4_DP; + + if (hash_fields & RXH_L4_B_0_1) + data |= ENA_ADMIN_RSS_L4_SP; + + return data; +} + 
+static int ena_get_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + int rc; + + cmd->data = 0; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + rc = ena_com_get_hash_ctrl(ena_dev, proto, &hash_fields); + if (rc) + return rc; + + cmd->data = ena_flow_hash_to_flow_type(hash_fields); + + return 0; +} + +static int ena_set_rss_hash(struct ena_com_dev *ena_dev, + struct ethtool_rxnfc *cmd) +{ + enum ena_admin_flow_hash_proto proto; + u16 hash_fields; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + proto = ENA_ADMIN_RSS_TCP4; + break; + case UDP_V4_FLOW: + proto = ENA_ADMIN_RSS_UDP4; + break; + case TCP_V6_FLOW: + proto = ENA_ADMIN_RSS_TCP6; + break; + case UDP_V6_FLOW: + proto = ENA_ADMIN_RSS_UDP6; + break; + case IPV4_FLOW: + proto = ENA_ADMIN_RSS_IP4; + break; + case IPV6_FLOW: + proto = ENA_ADMIN_RSS_IP6; + break; + case ETHER_FLOW: + proto = ENA_ADMIN_RSS_NOT_IP; + break; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + hash_fields = ena_flow_data_to_flow_hash(cmd->data); + + return ena_com_fill_hash_ctrl(ena_dev, proto, hash_fields); +} + +static int ena_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_SRXFH: + rc = ena_set_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(3, 2, 0) +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + void *rules) +#else +static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, + u32 *rules) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_GRXRINGS: + info->data = adapter->num_queues; + rc = 0; + break; + case ETHTOOL_GRXFH: + rc = ena_get_rss_hash(adapter->ena_dev, info); + break; + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + default: + netif_err(adapter, drv, netdev, + "Command parameter %d is not supported\n", info->cmd); + rc = -EOPNOTSUPP; + } + + return rc; +} +#endif /* ETHTOOL_GRXRINGS */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +static u32 ena_get_rxfh_indir_size(struct net_device *netdev) +{ + return ENA_RX_RSS_TABLE_SIZE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static u32 ena_get_rxfh_key_size(struct net_device *netdev) +{ + return ENA_HASH_KEY_SIZE; +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +{ + struct ena_adapter 
*adapter = netdev_priv(netdev); + enum ena_admin_hash_functions ena_func; + u8 func; + int rc; + + rc = ena_com_indirect_table_get(adapter->ena_dev, indir); + if (rc) + return rc; + + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func, key); + if (rc) + return rc; + + switch (ena_func) { + case ENA_ADMIN_TOEPLITZ: + func = ETH_RSS_HASH_TOP; + break; + case ENA_ADMIN_CRC32: + func = ETH_RSS_HASH_XOR; + break; + default: + netif_err(adapter, drv, netdev, + "Command parameter is not supported\n"); + return -EOPNOTSUPP; + } + + if (hfunc) + *hfunc = func; + + return rc; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc; + + rc = ena_com_indirect_table_get(adapter->ena_dev, indir); + if (rc) + return rc; + + rc = ena_com_get_hash_function(adapter->ena_dev, NULL, key); + if (rc) { + return rc; + } + + return rc; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)/* >= 3.16.0 */ +static int ena_get_rxfh(struct net_device *netdev, u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + return ena_com_indirect_table_get(adapter->ena_dev, indir); +} +#endif /* >= 3.8.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key) +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + enum ena_admin_hash_functions func; + int rc, i; + + if (indir) { + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + rc = ena_com_indirect_table_fill_entry(ena_dev, + ENA_IO_RXQ_IDX(indir[i]), + i); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, + "Cannot fill indirect table (index is too large)\n"); + return rc; + } + } + + rc = ena_com_indirect_table_set(ena_dev); + if (rc) { + netif_err(adapter, drv, netdev, + "Cannot set indirect table\n"); + return rc == -EPERM ? -EOPNOTSUPP : rc; + } + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) + switch (hfunc) { + case ETH_RSS_HASH_TOP: + func = ENA_ADMIN_TOEPLITZ; + break; + case ETH_RSS_HASH_XOR: + func = ENA_ADMIN_CRC32; + break; + default: + netif_err(adapter, drv, netdev, "Unsupported hfunc %d\n", + hfunc); + return -EOPNOTSUPP; + } +#else /* Kernel 3.19 */ + func = ENA_ADMIN_TOEPLITZ; +#endif + + if (key) { + rc = ena_com_fill_hash_function(ena_dev, func, key, + ENA_HASH_KEY_SIZE, + 0xFFFFFFFF); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, "Cannot fill key\n"); + return rc == -EPERM ? -EOPNOTSUPP : rc; + } + } + + return 0; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) /* Kernel > 3.16 */ +static int ena_set_rxfh(struct net_device *netdev, const u32 *indir) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + if (!indir) + return 0; + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + rc = ena_com_indirect_table_fill_entry(ena_dev, i, + ENA_IO_RXQ_IDX(indir[i])); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, + "Cannot fill indirect table (index is too large)\n"); + return rc; + } + } + + rc = ena_com_indirect_table_set(ena_dev); + if (unlikely(rc)) { + netif_err(adapter, drv, netdev, "Cannot set indirect table\n"); + return rc == -EPERM ? 
-EOPNOTSUPP : rc; + } + + return 0; +} +#endif /* Kernel > 3.16 */ +#endif /* ETHTOOL_GRXFH */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT + +#ifdef ETHTOOL_SCHANNELS +static void ena_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + channels->max_rx = adapter->num_queues; + channels->max_tx = adapter->num_queues; + channels->max_other = 0; + channels->max_combined = 0; + channels->rx_count = adapter->num_queues; + channels->tx_count = adapter->num_queues; + channels->other_count = 0; + channels->combined_count = 0; +} +#endif /* ETHTOOL_SCHANNELS */ + +#endif /* HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +static int ena_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = adapter->rx_copybreak; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int ena_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int ret = 0; + u32 len; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + len = *(u32 *)data; + if (len > adapter->netdev->mtu) { + ret = -EINVAL; + break; + } + adapter->rx_copybreak = len; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} +#endif /* 3.18.0 */ + +static const struct ethtool_ops ena_ethtool_ops = { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) + .get_link_ksettings = ena_get_link_ksettings, +#else + .get_settings = ena_get_settings, +#endif + .get_drvinfo = ena_get_drvinfo, + .get_msglevel = ena_get_msglevel, + .set_msglevel = ena_set_msglevel, + .get_link = ethtool_op_get_link, + .get_coalesce = ena_get_coalesce, + .set_coalesce = ena_set_coalesce, + .get_ringparam = ena_get_ringparam, + .get_sset_count = ena_get_sset_count, + .get_strings = ena_get_strings, + .get_ethtool_stats = ena_get_ethtool_stats, +#ifdef ETHTOOL_GRXRINGS + .get_rxnfc = ena_get_rxnfc, + .set_rxnfc = ena_set_rxnfc, +#endif /* ETHTOOL_GRXRINGS */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir_size = ena_get_rxfh_indir_size, +#endif /* >= 3.8.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + .get_rxfh_key_size = ena_get_rxfh_key_size, + .get_rxfh = ena_get_rxfh, + .set_rxfh = ena_set_rxfh, +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) + .get_rxfh_indir = ena_get_rxfh, + .set_rxfh_indir = ena_set_rxfh, +#endif /* >= 3.8.0 */ +#ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#ifdef ETHTOOL_SCHANNELS + .get_channels = ena_get_channels, +#endif /* ETHTOOL_SCHANNELS */ +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + .get_tunable = ena_get_tunable, + .set_tunable = ena_set_tunable, +#endif +}; + +void ena_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ena_ethtool_ops; +} + +static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) +{ + struct net_device *netdev = adapter->netdev; + u8 *strings_buf; + u64 *data_buf; + int strings_num; + int i, rc; + + strings_num = ena_get_sset_count(netdev, ETH_SS_STATS); + if (strings_num <= 0) { + netif_err(adapter, drv, netdev, "Can't get stats num\n"); + return; + } + + strings_buf = devm_kzalloc(&adapter->pdev->dev, + strings_num * ETH_GSTRING_LEN, + GFP_ATOMIC); + if (!strings_buf) { + netif_err(adapter, drv, netdev, + "failed to 
alloc strings_buf\n"); + return; + } + + data_buf = devm_kzalloc(&adapter->pdev->dev, + strings_num * sizeof(u64), + GFP_ATOMIC); + if (!data_buf) { + netif_err(adapter, drv, netdev, + "failed to allocate data buf\n"); + devm_kfree(&adapter->pdev->dev, strings_buf); + return; + } + + ena_get_strings(netdev, ETH_SS_STATS, strings_buf); + ena_get_ethtool_stats(netdev, NULL, data_buf); + + /* If there is a buffer, dump stats, otherwise print them to dmesg */ + if (buf) + for (i = 0; i < strings_num; i++) { + rc = snprintf(buf, ETH_GSTRING_LEN + sizeof(u64), + "%s %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + buf += rc; + } + else + for (i = 0; i < strings_num; i++) + netif_err(adapter, drv, netdev, "%s: %llu\n", + strings_buf + i * ETH_GSTRING_LEN, + data_buf[i]); + + devm_kfree(&adapter->pdev->dev, strings_buf); + devm_kfree(&adapter->pdev->dev, data_buf); +} + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf) +{ + if (!buf) + return; + + ena_dump_stats_ex(adapter, buf); +} + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter) +{ + ena_dump_stats_ex(adapter, NULL); +} diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c new file mode 100644 index 0000000000000..4f82a3df99f70 --- /dev/null +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -0,0 +1,3968 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#ifdef CONFIG_RFS_ACCEL +#include +#endif /* CONFIG_RFS_ACCEL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_NET_RX_BUSY_POLL) && (LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)) +#include +#endif +#include + +#include "ena_netdev.h" +#include "ena_pci_id_tbl.h" +#include "ena_sysfs.h" + +static char version[] = DEVICE_NAME " v" DRV_MODULE_VERSION "\n"; + +MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +/* Time in jiffies before concluding the transmitter is hung. 
*/ +#define TX_TIMEOUT (5 * HZ) + +#define ENA_NAPI_BUDGET 64 + +#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ + NETIF_MSG_TX_DONE | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) +static int debug = -1; +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +static struct ena_aenq_handlers aenq_handlers; + +static struct workqueue_struct *ena_wq; + +MODULE_DEVICE_TABLE(pci, ena_pci_tbl); + +static int ena_rss_init_default(struct ena_adapter *adapter); +static void check_for_admin_com_state(struct ena_adapter *adapter); +static void ena_destroy_device(struct ena_adapter *adapter); +static int ena_restore_device(struct ena_adapter *adapter); + +static void ena_tx_timeout(struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ + + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + adapter->reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.tx_timeout++; + u64_stats_update_end(&adapter->syncp); + + netif_err(adapter, tx_err, dev, "Transmit time out\n"); +} + +static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu) +{ + int i; + + for (i = 0; i < adapter->num_queues; i++) + adapter->rx_ring[i].mtu = mtu; +} + +static int ena_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ena_adapter *adapter = netdev_priv(dev); + int ret; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) + if ((new_mtu > adapter->max_mtu) || (new_mtu < ENA_MIN_MTU)) { + netif_err(adapter, drv, dev, + "Invalid MTU setting. new_mtu: %d max mtu: %d min mtu: %d\n", + new_mtu, adapter->max_mtu, ENA_MIN_MTU); + return -EINVAL; + } +#endif + ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); + if (!ret) { + netif_dbg(adapter, drv, dev, "set MTU to %d\n", new_mtu); + update_rx_ring_mtu(adapter, new_mtu); + dev->mtu = new_mtu; + } else { + netif_err(adapter, drv, dev, "Failed to set MTU to %d\n", + new_mtu); + } + + return ret; +} + +static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) +{ +#ifdef CONFIG_RFS_ACCEL + u32 i; + int rc; + + adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_queues); + if (!adapter->netdev->rx_cpu_rmap) + return -ENOMEM; + for (i = 0; i < adapter->num_queues; i++) { + int irq_idx = ENA_IO_IRQ_IDX(i); + + rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector); +#else + pci_irq_vector(adapter->pdev, irq_idx)); +#endif + if (rc) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + return rc; + } + } +#endif /* CONFIG_RFS_ACCEL */ + return 0; +} + +static void ena_init_io_rings_common(struct ena_adapter *adapter, + struct ena_ring *ring, u16 qid) +{ + ring->qid = qid; + ring->pdev = adapter->pdev; + ring->dev = &adapter->pdev->dev; + ring->netdev = adapter->netdev; + ring->napi = &adapter->ena_napi[qid].napi; + ring->adapter = adapter; + ring->ena_dev = adapter->ena_dev; + ring->per_napi_packets = 0; + ring->per_napi_bytes = 0; + ring->cpu = 0; + ring->first_interrupt = false; + ring->no_interrupt_event_cnt = 0; + u64_stats_init(&ring->syncp); +} + +static void ena_init_io_rings(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev; + struct ena_ring *txr, *rxr; + int i; + + ena_dev = adapter->ena_dev; + + for (i = 0; i < adapter->num_queues; 
i++) { + txr = &adapter->tx_ring[i]; + rxr = &adapter->rx_ring[i]; + + /* TX/RX common ring state */ + ena_init_io_rings_common(adapter, txr, i); + ena_init_io_rings_common(adapter, rxr, i); + + /* TX specific ring state */ + txr->ring_size = adapter->tx_ring_size; + txr->tx_max_header_size = ena_dev->tx_max_header_size; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->sgl_size = adapter->max_tx_sgl_size; + txr->smoothed_interval = + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + + /* RX specific ring state */ + rxr->ring_size = adapter->rx_ring_size; + rxr->rx_copybreak = adapter->rx_copybreak; + rxr->sgl_size = adapter->max_rx_sgl_size; + rxr->smoothed_interval = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + rxr->empty_rx_queue = 0; + } +} + +/* ena_setup_tx_resources - allocate I/O Tx resources (Descriptors) + * @adapter: network interface device structure + * @qid: queue index + * + * Return 0 on success, negative on failure + */ +static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, i, node; + + if (tx_ring->tx_buffer_info) { + netif_err(adapter, ifup, + adapter->netdev, "tx_buffer_info info is not NULL"); + return -EEXIST; + } + + size = sizeof(struct ena_tx_buffer) * tx_ring->ring_size; + node = cpu_to_node(ena_irq->cpu); + + tx_ring->tx_buffer_info = vzalloc_node(size, node); + if (!tx_ring->tx_buffer_info) { + tx_ring->tx_buffer_info = vzalloc(size); + if (!tx_ring->tx_buffer_info) + return -ENOMEM; + } + + size = sizeof(u16) * tx_ring->ring_size; + tx_ring->free_tx_ids = vzalloc_node(size, node); + if (!tx_ring->free_tx_ids) { + tx_ring->free_tx_ids = vzalloc(size); + if (!tx_ring->free_tx_ids) { + vfree(tx_ring->tx_buffer_info); + return -ENOMEM; + } + } + + /* Req id ring for TX out of order completions */ + for (i = 0; i < tx_ring->ring_size; i++) + tx_ring->free_tx_ids[i] = i; + + /* Reset tx statistics */ + memset(&tx_ring->tx_stats, 0x0, sizeof(tx_ring->tx_stats)); + + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; + tx_ring->cpu = ena_irq->cpu; + return 0; +} + +/* ena_free_tx_resources - Free I/O Tx Resources per Queue + * @adapter: network interface device structure + * @qid: queue index + * + * Free all transmit software resources + */ +static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *tx_ring = &adapter->tx_ring[qid]; + + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; + + vfree(tx_ring->free_tx_ids); + tx_ring->free_tx_ids = NULL; +} + +/* ena_setup_all_tx_resources - allocate I/O Tx queues resources for All queues + * @adapter: private structure + * + * Return 0 on success, negative on failure + */ +static int ena_setup_all_tx_resources(struct ena_adapter *adapter) +{ + int i, rc = 0; + + for (i = 0; i < adapter->num_queues; i++) { + rc = ena_setup_tx_resources(adapter, i); + if (rc) + goto err_setup_tx; + } + + return 0; + +err_setup_tx: + + netif_err(adapter, ifup, adapter->netdev, + "Tx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (i--) + ena_free_tx_resources(adapter, i); + return rc; +} + +/* ena_free_all_io_tx_resources - Free I/O Tx Resources for All Queues + * @adapter: board private structure + * + * Free all transmit software resources + */ +static void ena_free_all_io_tx_resources(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < 
adapter->num_queues; i++) + ena_free_tx_resources(adapter, i); +} + +static inline int validate_rx_req_id(struct ena_ring *rx_ring, u16 req_id) +{ + if (likely(req_id < rx_ring->ring_size)) + return 0; + + netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, + "Invalid rx req_id: %hu\n", req_id); + + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bad_req_id++; + u64_stats_update_end(&rx_ring->syncp); + + /* Trigger device reset */ + rx_ring->adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; + set_bit(ENA_FLAG_TRIGGER_RESET, &rx_ring->adapter->flags); + return -EFAULT; +} + +/* ena_setup_rx_resources - allocate I/O Rx resources (Descriptors) + * @adapter: network interface device structure + * @qid: queue index + * + * Returns 0 on success, negative on failure + */ +static int ena_setup_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_irq *ena_irq = &adapter->irq_tbl[ENA_IO_IRQ_IDX(qid)]; + int size, node, i; + + if (rx_ring->rx_buffer_info) { + netif_err(adapter, ifup, adapter->netdev, + "rx_buffer_info is not NULL"); + return -EEXIST; + } + + /* alloc extra element so in rx path + * we can always prefetch rx_info + 1 + */ + size = sizeof(struct ena_rx_buffer) * (rx_ring->ring_size + 1); + node = cpu_to_node(ena_irq->cpu); + + rx_ring->rx_buffer_info = vzalloc_node(size, node); + if (!rx_ring->rx_buffer_info) { + rx_ring->rx_buffer_info = vzalloc(size); + if (!rx_ring->rx_buffer_info) + return -ENOMEM; + } + + size = sizeof(u16) * rx_ring->ring_size; + rx_ring->free_rx_ids = vzalloc_node(size, node); + if (!rx_ring->free_rx_ids) { + rx_ring->free_rx_ids = vzalloc(size); + if (!rx_ring->free_rx_ids) { + vfree(rx_ring->rx_buffer_info); + return -ENOMEM; + } + } + + /* Req id ring for receiving RX pkts out of order */ + for (i = 0; i < rx_ring->ring_size; i++) + rx_ring->free_rx_ids[i] = i; + + /* Reset rx statistics */ + memset(&rx_ring->rx_stats, 0x0, sizeof(rx_ring->rx_stats)); + +#if ENA_BUSY_POLL_SUPPORT + ena_bp_init_lock(rx_ring); +#endif + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; + rx_ring->cpu = ena_irq->cpu; + + return 0; +} + +/* ena_free_rx_resources - Free I/O Rx Resources + * @adapter: network interface device structure + * @qid: queue index + * + * Free all receive software resources + */ +static void ena_free_rx_resources(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + + vfree(rx_ring->free_rx_ids); + rx_ring->free_rx_ids = NULL; +} + +/* ena_setup_all_rx_resources - allocate I/O Rx queues resources for all queues + * @adapter: board private structure + * + * Return 0 on success, negative on failure + */ +static int ena_setup_all_rx_resources(struct ena_adapter *adapter) +{ + int i, rc = 0; + + for (i = 0; i < adapter->num_queues; i++) { + rc = ena_setup_rx_resources(adapter, i); + if (rc) + goto err_setup_rx; + } + + return 0; + +err_setup_rx: + + netif_err(adapter, ifup, adapter->netdev, + "Rx queue %d: allocation failed\n", i); + + /* rewind the index freeing the rings as we go */ + while (i--) + ena_free_rx_resources(adapter, i); + return rc; +} + +/* ena_free_all_io_rx_resources - Free I/O Rx Resources for All Queues + * @adapter: board private structure + * + * Free all receive software resources + */ +static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_queues; i++) + 
ena_free_rx_resources(adapter, i); +} + +static inline int ena_alloc_rx_page(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info, gfp_t gfp) +{ + struct ena_com_buf *ena_buf; + struct page *page; + dma_addr_t dma; + + /* if previous allocated page is not used */ + if (unlikely(rx_info->page)) + return 0; + + page = alloc_page(gfp); + if (unlikely(!page)) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.page_alloc_fail++; + u64_stats_update_end(&rx_ring->syncp); + return -ENOMEM; + } + + dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(rx_ring->dev, dma))) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.dma_mapping_err++; + u64_stats_update_end(&rx_ring->syncp); + + __free_page(page); + return -EIO; + } + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "alloc page %p, rx_info %p\n", page, rx_info); + + rx_info->page = page; + rx_info->page_offset = 0; + ena_buf = &rx_info->ena_buf; + ena_buf->paddr = dma; + ena_buf->len = PAGE_SIZE; + + return 0; +} + +static void ena_free_rx_page(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + struct page *page = rx_info->page; + struct ena_com_buf *ena_buf = &rx_info->ena_buf; + + if (unlikely(!page)) { + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Trying to free unallocated buffer\n"); + return; + } + + dma_unmap_page(rx_ring->dev, ena_buf->paddr, PAGE_SIZE, + DMA_FROM_DEVICE); + + __free_page(page); + rx_info->page = NULL; +} + +static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) +{ + u16 next_to_use, req_id; + u32 i; + int rc; + + next_to_use = rx_ring->next_to_use; + + for (i = 0; i < num; i++) { + struct ena_rx_buffer *rx_info; + + req_id = rx_ring->free_rx_ids[next_to_use]; + rc = validate_rx_req_id(rx_ring, req_id); + if (unlikely(rc < 0)) + break; + + rx_info = &rx_ring->rx_buffer_info[req_id]; + + + rc = ena_alloc_rx_page(rx_ring, rx_info, + __GFP_COLD | GFP_ATOMIC | __GFP_COMP); + if (unlikely(rc < 0)) { + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "failed to alloc buffer for rx queue %d\n", + rx_ring->qid); + break; + } + rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq, + &rx_info->ena_buf, + req_id); + if (unlikely(rc)) { + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "failed to add buffer for rx queue %d\n", + rx_ring->qid); + break; + } + next_to_use = ENA_RX_RING_IDX_NEXT(next_to_use, + rx_ring->ring_size); + } + + if (unlikely(i < num)) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.refil_partial++; + u64_stats_update_end(&rx_ring->syncp); + netdev_warn(rx_ring->netdev, + "refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); + } + + if (likely(i)) { + /* Add memory barrier to make sure the desc were written before + * issue a doorbell + */ + wmb(); + ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq); + } + + rx_ring->next_to_use = next_to_use; + + return i; +} + +static void ena_free_rx_bufs(struct ena_adapter *adapter, + u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + u32 i; + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->page) + ena_free_rx_page(rx_ring, rx_info); + } +} + +/* ena_refill_all_rx_bufs - allocate all queues Rx buffers + * @adapter: board private structure + * + */ +static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, rc, bufs_num; + + for (i = 0; i < 
adapter->num_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + bufs_num = rx_ring->ring_size - 1; + rc = ena_refill_rx_bufs(rx_ring, bufs_num); + + if (unlikely(rc != bufs_num)) + netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, + "refilling Queue %d failed. allocated %d buffers from: %d\n", + i, rc, bufs_num); + } +} + +static void ena_free_all_rx_bufs(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_queues; i++) + ena_free_rx_bufs(adapter, i); +} + +/* ena_free_tx_bufs - Free Tx Buffers per Queue + * @tx_ring: TX ring for which buffers be freed + */ +static void ena_free_tx_bufs(struct ena_ring *tx_ring) +{ + bool print_once = true; + u32 i; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + struct ena_com_buf *ena_buf; + int nr_frags; + int j; + + if (!tx_info->skb) + continue; + + if (print_once) { + netdev_notice(tx_ring->netdev, + "free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); + print_once = false; + } else { + netdev_dbg(tx_ring->netdev, + "free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); + } + + ena_buf = tx_info->bufs; + dma_unmap_single(tx_ring->dev, + ena_buf->paddr, + ena_buf->len, + DMA_TO_DEVICE); + + /* unmap remaining mapped pages */ + nr_frags = tx_info->num_of_bufs - 1; + for (j = 0; j < nr_frags; j++) { + ena_buf++; + dma_unmap_page(tx_ring->dev, + ena_buf->paddr, + ena_buf->len, + DMA_TO_DEVICE); + } + + dev_kfree_skb_any(tx_info->skb); + } + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); +} + +static void ena_free_all_tx_bufs(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + tx_ring = &adapter->tx_ring[i]; + ena_free_tx_bufs(tx_ring); + } +} + +static void ena_destroy_all_tx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + ena_qid = ENA_IO_TXQ_IDX(i); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_rx_queues(struct ena_adapter *adapter) +{ + u16 ena_qid; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + ena_qid = ENA_IO_RXQ_IDX(i); + ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); + } +} + +static void ena_destroy_all_io_queues(struct ena_adapter *adapter) +{ + ena_destroy_all_tx_queues(adapter); + ena_destroy_all_rx_queues(adapter); +} + +static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info = NULL; + + if (likely(req_id < tx_ring->ring_size)) { + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->skb)) + return 0; + } + + if (tx_info) + netif_err(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_info doesn't have valid skb\n"); + else + netif_err(tx_ring->adapter, tx_done, tx_ring->netdev, + "Invalid req_id: %hu\n", req_id); + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.bad_req_id++; + u64_stats_update_end(&tx_ring->syncp); + + /* Trigger device reset */ + tx_ring->adapter->reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID; + set_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags); + return -EFAULT; +} + +static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) +{ + struct netdev_queue *txq; + bool above_thresh; + u32 tx_bytes = 0; + u32 total_done = 0; + u16 next_to_clean; + u16 req_id; + int tx_pkts = 0; + int rc; + + next_to_clean = tx_ring->next_to_clean; + txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid); + + while (tx_pkts 
< budget) { + struct ena_tx_buffer *tx_info; + struct sk_buff *skb; + struct ena_com_buf *ena_buf; + int i, nr_frags; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) + break; + + rc = validate_tx_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + skb = tx_info->skb; + + /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */ + prefetch(&skb->end); + + tx_info->skb = NULL; + tx_info->last_jiffies = 0; + + if (likely(tx_info->num_of_bufs != 0)) { + ena_buf = tx_info->bufs; + + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), + DMA_TO_DEVICE); + + /* unmap remaining mapped pages */ + nr_frags = tx_info->num_of_bufs - 1; + for (i = 0; i < nr_frags; i++) { + ena_buf++; + dma_unmap_page(tx_ring->dev, + dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), + DMA_TO_DEVICE); + } + } + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d skb %p completed\n", tx_ring->qid, + skb); + + tx_bytes += skb->len; + dev_kfree_skb(skb); + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_ring->free_tx_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + ena_com_update_dev_comp_head(tx_ring->ena_com_io_cq); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. total pkts: %d\n", + tx_ring->qid, tx_pkts); + + /* need to make the rings circular update visible to + * ena_start_xmit() before checking for netif_queue_stopped(). + */ + smp_mb(); + + above_thresh = ena_com_sq_empty_space(tx_ring->ena_com_io_sq) > + ENA_TX_WAKEUP_THRESH; + if (unlikely(netif_tx_queue_stopped(txq) && above_thresh)) { + __netif_tx_lock(txq, smp_processor_id()); + above_thresh = ena_com_sq_empty_space(tx_ring->ena_com_io_sq) > + ENA_TX_WAKEUP_THRESH; + if (netif_tx_queue_stopped(txq) && above_thresh) { + netif_tx_wake_queue(txq); + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.queue_wakeup++; + u64_stats_update_end(&tx_ring->syncp); + } + __netif_tx_unlock(txq); + } + + tx_ring->per_napi_bytes += tx_bytes; + tx_ring->per_napi_packets += tx_pkts; + + return tx_pkts; +} + +static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, bool frags) +{ + struct sk_buff *skb; + + if (frags) + skb = napi_get_frags(rx_ring->napi); + else + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, + rx_ring->rx_copybreak); + + if (unlikely(!skb)) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.skb_alloc_fail++; + u64_stats_update_end(&rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb. 
frags: %d\n", frags); + return NULL; + } + + return skb; +} + +static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, + struct ena_com_rx_buf_info *ena_bufs, + u32 descs, + u16 *next_to_clean) +{ + struct sk_buff *skb; + struct ena_rx_buffer *rx_info; + u16 len, req_id, buf = 0; +#if ENA_BUSY_POLL_SUPPORT + bool polling; +#endif + void *va; + + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + rx_info = &rx_ring->rx_buffer_info[req_id]; + + if (unlikely(!rx_info->page)) { + netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, + "Page is NULL\n"); + return NULL; + } + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_info %p page %p\n", + rx_info, rx_info->page); + + /* save virt address of first buffer */ + va = page_address(rx_info->page) + rx_info->page_offset; + prefetch(va + NET_IP_ALIGN); + + if (len <= rx_ring->rx_copybreak) { + skb = ena_alloc_skb(rx_ring, false); + if (unlikely(!skb)) + return NULL; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx allocated small packet. len %d. data_len %d\n", + skb->len, skb->data_len); + + /* sync this buffer for CPU use */ + dma_sync_single_for_cpu(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr), + len, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, va, len); + dma_sync_single_for_device(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr), + len, + DMA_FROM_DEVICE); + + skb_put(skb, len); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) + skb_mark_napi_id(skb, rx_ring->napi); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + rx_ring->free_rx_ids[*next_to_clean] = req_id; + *next_to_clean = ENA_RX_RING_IDX_ADD(*next_to_clean, descs, + rx_ring->ring_size); + return skb; + } + +#if ENA_BUSY_POLL_SUPPORT + polling = ena_bp_busy_polling(rx_ring); + /* For busy poll don't allocate frag */ + skb = ena_alloc_skb(rx_ring, !polling); +#else + skb = ena_alloc_skb(rx_ring, true); +#endif + if (unlikely(!skb)) + return NULL; + + do { + dma_unmap_page(rx_ring->dev, + dma_unmap_addr(&rx_info->ena_buf, paddr), + PAGE_SIZE, DMA_FROM_DEVICE); + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, + rx_info->page_offset, len, PAGE_SIZE); + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx skb updated. len %d. 
data_len %d\n", + skb->len, skb->data_len); + + rx_info->page = NULL; + + rx_ring->free_rx_ids[*next_to_clean] = req_id; + *next_to_clean = + ENA_RX_RING_IDX_NEXT(*next_to_clean, + rx_ring->ring_size); + if (likely(--descs == 0)) + break; + + buf++; + len = ena_bufs[buf].len; + req_id = ena_bufs[buf].req_id; + rx_info = &rx_ring->rx_buffer_info[req_id]; + } while (1); + +#if ENA_BUSY_POLL_SUPPORT + if (polling) { + int hlen; + + /* copy header into the skb linear data */ + hlen = rx_ring->rx_copybreak; + skb_copy_to_linear_data(skb, va, hlen); + + /* adjust the first segment and skb len */ + skb_shinfo(skb)->frags[0].page_offset += hlen; + skb_shinfo(skb)->frags[0].size -= hlen; + skb->data_len -= hlen; + skb->tail += hlen; + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + skb_mark_napi_id(skb, rx_ring->napi); + } +#endif + return skb; +} + +/* ena_rx_checksum - indicate in skb if hw indicated a good cksum + * @adapter: structure containing adapter specific data + * @ena_rx_ctx: received packet context/metadata + * @skb: skb currently being received and modified + */ +static inline void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ + /* Rx csum disabled */ + if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* For fragmented packets the checksum isn't valid */ + if (ena_rx_ctx->frag) { + skb->ip_summed = CHECKSUM_NONE; + return; + } + + /* if IP and error */ + if (unlikely((ena_rx_ctx->l3_proto == ENA_ETH_IO_L3_PROTO_IPV4) && + (ena_rx_ctx->l3_csum_err))) { + /* ipv4 checksum error */ + skb->ip_summed = CHECKSUM_NONE; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bad_csum++; + u64_stats_update_end(&rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX IPv4 header checksum error\n"); + return; + } + + /* if TCP/UDP */ + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { + if (unlikely(ena_rx_ctx->l4_csum_err)) { + /* TCP/UDP checksum error */ + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bad_csum++; + u64_stats_update_end(&rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, + "RX L4 checksum error\n"); + skb->ip_summed = CHECKSUM_NONE; + return; + } + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} + +static void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) +{ +#ifdef NETIF_F_RXHASH + enum pkt_hash_types hash_type; + + if (likely(rx_ring->netdev->features & NETIF_F_RXHASH)) { + if (likely((ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_TCP) || + (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) + + hash_type = PKT_HASH_TYPE_L4; + else + hash_type = PKT_HASH_TYPE_NONE; + + /* Override hash type if the packet is fragmented */ + if (ena_rx_ctx->frag) + hash_type = PKT_HASH_TYPE_NONE; + + skb_set_hash(skb, ena_rx_ctx->hash, hash_type); + } +#endif /* NETIF_F_RXHASH */ +} + +/* ena_clean_rx_irq - Cleanup RX irq + * @rx_ring: RX ring to clean + * @napi: napi handler + * @budget: how many packets driver is allowed to clean + * + * Returns the number of cleaned buffers. 
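ena_rx_checksum above boils down to a short decision table: trust the device only for unfragmented TCP/UDP with no reported L3/L4 errors, otherwise report CHECKSUM_NONE and let the stack verify. A self-contained sketch of that table follows; the field names are placeholders, not the real ena_com_rx_ctx layout.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the Rx checksum decision: the device reports what it
 * verified, the host only trusts it for unfragmented TCP/UDP with no errors.
 */
enum csum_result { CSUM_NONE, CSUM_UNNECESSARY };

struct rx_meta {
    bool rxcsum_enabled; /* NETIF_F_RXCSUM equivalent */
    bool fragmented;
    bool is_ipv4;
    bool l3_csum_err;
    bool is_tcp_or_udp;
    bool l4_csum_err;
};

static enum csum_result classify_rx_csum(const struct rx_meta *m)
{
    if (!m->rxcsum_enabled || m->fragmented)
        return CSUM_NONE;
    if (m->is_ipv4 && m->l3_csum_err)
        return CSUM_NONE;             /* bad IPv4 header checksum */
    if (m->is_tcp_or_udp && m->l4_csum_err)
        return CSUM_NONE;             /* bad TCP/UDP checksum */
    if (m->is_tcp_or_udp)
        return CSUM_UNNECESSARY;      /* device already verified it */
    return CSUM_NONE;                 /* other protocols: stack verifies */
}

int main(void)
{
    struct rx_meta ok = { .rxcsum_enabled = true, .is_ipv4 = true,
                          .is_tcp_or_udp = true };
    struct rx_meta bad = { .rxcsum_enabled = true, .is_ipv4 = true,
                           .is_tcp_or_udp = true, .l4_csum_err = true };

    printf("good TCP frame: %d\n", classify_rx_csum(&ok));  /* 1 */
    printf("bad L4 csum:    %d\n", classify_rx_csum(&bad)); /* 0 */
    return 0;
}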
+ */ +static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, + u32 budget) +{ + u16 next_to_clean = rx_ring->next_to_clean; + u32 res_budget, work_done; + + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_adapter *adapter; + struct sk_buff *skb; + int refill_required; + int refill_threshold; + int rc = 0; + int total_len = 0; + int rx_copybreak_pkt = 0; + int i; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + res_budget = budget; + + do { + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + ena_rx_ctx.descs = 0; + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + goto error; + + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + + /* allocate skb and fill it */ + skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, + &next_to_clean); + + /* exit if we failed to retrieve a buffer */ + if (unlikely(!skb)) { + for (i = 0; i < ena_rx_ctx.descs; i++) { + rx_ring->free_tx_ids[next_to_clean] = + rx_ring->ena_bufs[i].req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + } + break; + } + + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + + skb_record_rx_queue(skb, rx_ring->qid); + + if (rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) { + total_len += rx_ring->ena_bufs[0].len; + rx_copybreak_pkt++; +#if ENA_BUSY_POLL_SUPPORT + if (ena_bp_busy_polling(rx_ring)) + netif_receive_skb(skb); + else + napi_gro_receive(napi, skb); +#else + napi_gro_receive(napi, skb); +#endif + } else { + total_len += skb->len; +#if ENA_BUSY_POLL_SUPPORT + if (ena_bp_busy_polling(rx_ring)) + netif_receive_skb(skb); + else + napi_gro_frags(napi); +#else + napi_gro_frags(napi); +#endif + } + + res_budget--; + } while (likely(res_budget)); + + work_done = budget - res_budget; + rx_ring->per_napi_bytes += total_len; + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.rx_copybreak_pkt += rx_copybreak_pkt; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + refill_required = ena_com_sq_empty_space(rx_ring->ena_com_io_sq); + refill_threshold = rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER; + + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) { + ena_com_update_dev_comp_head(rx_ring->ena_com_io_cq); + ena_refill_rx_bufs(rx_ring, refill_required); + } + + return work_done; + +error: + adapter = netdev_priv(rx_ring->netdev); + + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bad_desc_num++; + u64_stats_update_end(&rx_ring->syncp); + + /* Too many desc from the device. Trigger reset */ + adapter->reset_reason = ENA_REGS_RESET_TOO_MANY_RX_DESCS; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + + return 0; +} + +inline void ena_adjust_intr_moderation(struct ena_ring *rx_ring, + struct ena_ring *tx_ring) +{ + /* We apply adaptive moderation on Rx path only. + * Tx uses static interrupt moderation. 
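Note the refill batching at the end of ena_clean_rx_irq: buffers are reposted only once the number of empty slots exceeds ring_size / ENA_RX_REFILL_THRESH_DIVIDER, so the refill cost is amortized over many packets. A tiny sketch of that rule, with an assumed ring size and divider, purely for illustration.

#include <stdio.h>

/* Illustrative refill batching: instead of reposting one buffer per packet,
 * wait until the number of empty ring slots crosses a threshold so the
 * refill work is amortized over many packets.
 */
#define RING_SIZE              1024
#define REFILL_THRESH_DIVIDER  8     /* refill when > 1/8 of the ring is empty */

static int maybe_refill(int empty_slots)
{
    int threshold = RING_SIZE / REFILL_THRESH_DIVIDER;

    if (empty_slots > threshold)
        return empty_slots;  /* repost this many buffers in one batch */
    return 0;                /* not worth a refill yet */
}

int main(void)
{
    printf("64 empty  -> refill %d\n", maybe_refill(64));   /* 0 */
    printf("200 empty -> refill %d\n", maybe_refill(200));  /* 200 */
    return 0;
}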
+ */ + ena_com_calculate_interrupt_delay(rx_ring->ena_dev, + rx_ring->per_napi_packets, + rx_ring->per_napi_bytes, + &rx_ring->smoothed_interval, + &rx_ring->moder_tbl_idx); + + /* Reset per napi packets/bytes */ + tx_ring->per_napi_packets = 0; + tx_ring->per_napi_bytes = 0; + rx_ring->per_napi_packets = 0; + rx_ring->per_napi_bytes = 0; +} + +static inline void ena_unmask_interrupt(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + struct ena_eth_io_intr_reg intr_reg; + + /* Update intr register: rx intr delay, + * tx intr delay and interrupt unmask + */ + ena_com_update_intr_reg(&intr_reg, + rx_ring->smoothed_interval, + tx_ring->smoothed_interval, + true); + + /* It is a shared MSI-X. + * Tx and Rx CQ have pointer to it. + * So we use one of them to reach the intr reg + */ + ena_com_unmask_intr(rx_ring->ena_com_io_cq, &intr_reg); +} + +static inline void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) +{ + int cpu = get_cpu(); + int numa_node; + + /* Check only one ring since the 2 rings are running on the same cpu */ + if (likely(tx_ring->cpu == cpu)) + goto out; + + numa_node = cpu_to_node(cpu); + put_cpu(); + + if (numa_node != NUMA_NO_NODE) { + ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); + ena_com_update_numa_node(rx_ring->ena_com_io_cq, numa_node); + } + + tx_ring->cpu = cpu; + rx_ring->cpu = cpu; + + return; +out: + put_cpu(); +} + +static int ena_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *tx_ring, *rx_ring; + + u32 tx_work_done; + u32 rx_work_done; + int tx_budget; + int napi_comp_call = 0; + int ret; + + tx_ring = ena_napi->tx_ring; + rx_ring = ena_napi->rx_ring; + + tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } +#if ENA_BUSY_POLL_SUPPORT + if (!ena_bp_lock_napi(rx_ring)) + return budget; +#endif + + tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); + rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget); + + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + + } else if ((budget > rx_work_done) && (tx_budget > tx_work_done)) { + napi_comp_call = 1; + + /* Update numa and unmask the interrupt only when schedule + * from the interrupt context (vs from sk_busy_loop) + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + if (napi_complete_done(napi, rx_work_done)) { +#else + napi_complete_done(napi, rx_work_done); + if (atomic_cmpxchg(&ena_napi->unmask_interrupt, 1, 0)) { +#endif + /* Tx and Rx share the same interrupt vector */ + if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) + ena_adjust_intr_moderation(rx_ring, tx_ring); + + ena_unmask_interrupt(tx_ring, rx_ring); + } + + ena_update_ring_numa_node(tx_ring, rx_ring); + + ret = rx_work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.napi_comp += napi_comp_call; + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + +#if ENA_BUSY_POLL_SUPPORT + ena_bp_unlock_napi(rx_ring); +#endif + return ret; +} + +static irqreturn_t ena_intr_msix_mgmnt(int irq, void 
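ena_com_calculate_interrupt_delay (in the ena_com layer, not shown in this hunk) is table driven: the busier the last interval, the longer the next interrupt delay. The sketch below only illustrates the shape of such a lookup; the thresholds and delays are invented for the example and are not the device's real moderation table.

#include <stdio.h>

/* Hypothetical moderation table: heavier traffic over the last interval
 * selects a longer interrupt delay, light traffic a shorter one.
 */
struct moder_entry {
    unsigned int pkts_per_interval;  /* lower bound for this level */
    unsigned int intr_delay_usecs;
};

static const struct moder_entry moder_tbl[] = {
    {     0,   0 },   /* lowest: fire immediately */
    {   100,  32 },
    {  1000,  64 },
    { 10000, 128 },   /* highest: heavy load, batch aggressively */
};

static unsigned int pick_interrupt_delay(unsigned int pkts)
{
    unsigned int delay = moder_tbl[0].intr_delay_usecs;
    int i;

    for (i = 0; i < (int)(sizeof(moder_tbl) / sizeof(moder_tbl[0])); i++)
        if (pkts >= moder_tbl[i].pkts_per_interval)
            delay = moder_tbl[i].intr_delay_usecs;
    return delay;
}

int main(void)
{
    printf("50 pkts    -> %u us\n", pick_interrupt_delay(50));     /* 0 */
    printf("5000 pkts  -> %u us\n", pick_interrupt_delay(5000));   /* 64 */
    printf("20000 pkts -> %u us\n", pick_interrupt_delay(20000));  /* 128 */
    return 0;
}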
*data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + ena_com_admin_q_comp_intr_handler(adapter->ena_dev); + + /* Don't call the aenq handler before probe is done */ + if (likely(test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))) + ena_com_aenq_intr_handler(adapter->ena_dev, data); + + return IRQ_HANDLED; +} + +/* ena_intr_msix_io - MSI-X Interrupt Handler for Tx/Rx + * @irq: interrupt number + * @data: pointer to a network interface private napi device structure + */ +static irqreturn_t ena_intr_msix_io(int irq, void *data) +{ + struct ena_napi *ena_napi = data; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + napi_schedule_irqoff(&ena_napi->napi); +#else + atomic_set(&ena_napi->unmask_interrupt, 1); + napi_schedule_irqoff(&ena_napi->napi); +#endif + + ena_napi->tx_ring->first_interrupt = true; + ena_napi->rx_ring->first_interrupt = true; + + return IRQ_HANDLED; +} + +/* Reserve a single MSI-X vector for management (admin + aenq). + * plus reserve one vector for each potential io queue. + * the number of potential io queues is the minimum of what the device + * supports and the number of vCPUs. + */ +static int ena_enable_msix(struct ena_adapter *adapter, int num_queues) +{ + int msix_vecs, irq_cnt; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + int i; +#endif + + if (test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, probe, adapter->netdev, + "Error, MSI-X is already enabled\n"); + return -EPERM; + } + + /* Reserved the max msix vectors we might need */ + msix_vecs = ENA_MAX_MSIX_VEC(num_queues); + + netif_dbg(adapter, probe, adapter->netdev, + "trying to enable MSI-X, vectors %d\n", msix_vecs); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries = vzalloc(msix_vecs * sizeof(struct msix_entry)); + + if (!adapter->msix_entries) + return -ENOMEM; + + for (i = 0; i < msix_vecs; i++) + adapter->msix_entries[i].entry = i; + + irq_cnt = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + ENA_MIN_MSIX_VEC, msix_vecs); +#else + irq_cnt = pci_alloc_irq_vectors(adapter->pdev, ENA_MIN_MSIX_VEC, + msix_vecs, PCI_IRQ_MSIX); +#endif + + if (irq_cnt < 0) { + netif_err(adapter, probe, adapter->netdev, + "Failed to enable MSI-X. 
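The vector budget in ena_enable_msix is one admin/AENQ vector plus one per IO queue, and the queue count shrinks if the PCI core grants fewer vectors than requested. A small arithmetic sketch of that adjustment; the helper name and counts are illustrative only.

#include <stdio.h>

/* Vector budgeting sketch: one admin/AENQ vector plus one vector per IO
 * queue pair, clamped by what the PCI layer actually granted.
 */
#define ADMIN_MSIX_VEC 1

static int queues_from_grant(int wanted_queues, int granted_vectors)
{
    int wanted_vectors = wanted_queues + ADMIN_MSIX_VEC;

    if (granted_vectors >= wanted_vectors)
        return wanted_queues;
    /* fewer vectors than asked for: shrink the queue count to match */
    return granted_vectors - ADMIN_MSIX_VEC;
}

int main(void)
{
    printf("want 8, got 9 -> %d queues\n", queues_from_grant(8, 9)); /* 8 */
    printf("want 8, got 5 -> %d queues\n", queues_from_grant(8, 5)); /* 4 */
    return 0;
}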
irq_cnt %d\n", irq_cnt); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + vfree(adapter->msix_entries); +#endif + return -ENOSPC; + } + + if (irq_cnt != msix_vecs) { + netif_notice(adapter, probe, adapter->netdev, + "enable only %d MSI-X (out of %d), reduce the number of queues\n", + irq_cnt, msix_vecs); + adapter->num_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; + } + + if (ena_init_rx_cpu_rmap(adapter)) + netif_warn(adapter, probe, adapter->netdev, + "Failed to map IRQs to CPUs\n"); + + adapter->msix_vecs = irq_cnt; + set_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags); + + return 0; +} + +static void ena_setup_mgmnt_intr(struct ena_adapter *adapter) +{ + u32 cpu; + + snprintf(adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].name, + ENA_IRQNAME_SIZE, "ena-mgmnt@pci:%s", + pci_name(adapter->pdev)); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].handler = + ena_intr_msix_mgmnt; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].data = adapter; + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[ENA_MGMNT_IRQ_IDX].vector; +#else + pci_irq_vector(adapter->pdev, ENA_MGMNT_IRQ_IDX); +#endif + cpu = cpumask_first(cpu_online_mask); + adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].cpu = cpu; + cpumask_set_cpu(cpu, + &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].affinity_hint_mask); +} + +static void ena_setup_io_intr(struct ena_adapter *adapter) +{ + struct net_device *netdev; + int irq_idx, i, cpu; + + netdev = adapter->netdev; + + for (i = 0; i < adapter->num_queues; i++) { + irq_idx = ENA_IO_IRQ_IDX(i); + cpu = i % num_online_cpus(); + + snprintf(adapter->irq_tbl[irq_idx].name, ENA_IRQNAME_SIZE, + "%s-Tx-Rx-%d", netdev->name, i); + adapter->irq_tbl[irq_idx].handler = ena_intr_msix_io; + adapter->irq_tbl[irq_idx].data = &adapter->ena_napi[i]; + adapter->irq_tbl[irq_idx].vector = +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + adapter->msix_entries[irq_idx].vector; +#else + pci_irq_vector(adapter->pdev, irq_idx); +#endif + adapter->irq_tbl[irq_idx].cpu = cpu; + + cpumask_set_cpu(cpu, + &adapter->irq_tbl[irq_idx].affinity_hint_mask); + } +} + +static int ena_request_mgmnt_irq(struct ena_adapter *adapter) +{ + unsigned long flags = 0; + struct ena_irq *irq; + int rc; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, probe, adapter->netdev, + "failed to request admin irq\n"); + return rc; + } + + netif_dbg(adapter, probe, adapter->netdev, + "set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", + irq->affinity_hint_mask.bits[0], irq->vector); + + return rc; +} + +static int ena_request_io_irq(struct ena_adapter *adapter) +{ + unsigned long flags = 0; + struct ena_irq *irq; + int rc = 0, i, k; + + if (!test_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ: MSI-X is not enabled\n"); + return -EINVAL; + } + + for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) { + irq = &adapter->irq_tbl[i]; + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + irq->data); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to request I/O IRQ. index %d rc %d\n", + i, rc); + goto err; + } + + netif_dbg(adapter, ifup, adapter->netdev, + "set affinity hint of irq. 
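ena_setup_io_intr pins queue i to CPU i % num_online_cpus() and gives each vector a readable name. The standalone sketch below reproduces just that mapping and the "%s-Tx-Rx-%d" naming; the interface name and CPU count are assumptions for the example.

#include <stdio.h>

/* Affinity sketch: IO queue i is assigned CPU (i % online_cpus) and its
 * vector gets a descriptive name, mirroring how the irq table is filled.
 */
#define IRQNAME_SIZE 32

int main(void)
{
    const char *ifname = "eth0";   /* placeholder interface name */
    int online_cpus = 4;           /* assumed for the example */
    char name[IRQNAME_SIZE];
    int i;

    for (i = 0; i < 6; i++) {
        snprintf(name, sizeof(name), "%s-Tx-Rx-%d", ifname, i);
        printf("%-12s -> cpu %d\n", name, i % online_cpus);
    }
    return 0;
}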
index %d to 0x%lx (irq vector: %d)\n", + i, irq->affinity_hint_mask.bits[0], irq->vector); + } + + return rc; + +err: + for (k = ENA_IO_IRQ_FIRST_IDX; k < i; k++) { + irq = &adapter->irq_tbl[k]; + free_irq(irq->vector, irq->data); + } + + return rc; +} + +static void ena_free_mgmnt_irq(struct ena_adapter *adapter) +{ + struct ena_irq *irq; + + irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX]; + synchronize_irq(irq->vector); + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); +} + +static void ena_free_io_irq(struct ena_adapter *adapter) +{ + struct ena_irq *irq; + int i; + +#ifdef CONFIG_RFS_ACCEL + if (adapter->msix_vecs >= 1) { + free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); + adapter->netdev->rx_cpu_rmap = NULL; + } +#endif /* CONFIG_RFS_ACCEL */ + + for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) { + irq = &adapter->irq_tbl[i]; + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); + } +} + +static void ena_disable_msix(struct ena_adapter *adapter) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_disable_msix(adapter->pdev); + + if (adapter->msix_entries) + vfree(adapter->msix_entries); + adapter->msix_entries = NULL; +#else + if (test_and_clear_bit(ENA_FLAG_MSIX_ENABLED, &adapter->flags)) + pci_free_irq_vectors(adapter->pdev); +#endif +} + +static void ena_disable_io_intr_sync(struct ena_adapter *adapter) +{ + int i; + + if (!netif_running(adapter->netdev)) + return; + + for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) + synchronize_irq(adapter->irq_tbl[i].vector); +} + +static void ena_del_napi(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_queues; i++) { + napi_hash_del(&adapter->ena_napi[i].napi); + netif_napi_del(&adapter->ena_napi[i].napi); + } +} + +static void ena_init_napi(struct ena_adapter *adapter) +{ + struct ena_napi *napi; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + napi = &adapter->ena_napi[i]; + + netif_napi_add(adapter->netdev, + &adapter->ena_napi[i].napi, + ena_io_poll, + ENA_NAPI_BUDGET); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) + napi_hash_add(&adapter->ena_napi[i].napi); +#endif + napi->rx_ring = &adapter->rx_ring[i]; + napi->tx_ring = &adapter->tx_ring[i]; + napi->qid = i; + } +} + +#if ENA_BUSY_POLL_SUPPORT +static void ena_napi_disable_all(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, timeout; + + for (i = 0; i < adapter->num_queues; i++) { + napi_disable(&adapter->ena_napi[i].napi); + + rx_ring = &adapter->rx_ring[i]; + timeout = 100; + while (!ena_bp_disable(rx_ring)) { + netif_info(adapter, ifdown, adapter->netdev, + "Rx queue %d locked\n", i); + usleep_range(1000, 2000); + timeout--; + + if (!timeout) { + netif_err(adapter, ifdown, adapter->netdev, + "Tx queue is stuck\n"); + continue; + } + } + } +} +#else +static void ena_napi_disable_all(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_queues; i++) + napi_disable(&adapter->ena_napi[i].napi); +} +#endif + +static void ena_napi_enable_all(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_queues; i++) + napi_enable(&adapter->ena_napi[i].napi); +} + +static void ena_restore_ethtool_params(struct ena_adapter *adapter) +{ + adapter->tx_usecs = 0; + adapter->rx_usecs = 0; + adapter->tx_frames = 1; + adapter->rx_frames = 1; +} + +/* Configure the Rx forwarding */ +static int ena_rss_configure(struct ena_adapter *adapter) +{ + struct ena_com_dev 
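The err: label in ena_request_io_irq is the usual acquire-or-unwind idiom: on failure, free only the vectors that were already requested (indices below i). A generic sketch of the pattern, with acquire()/release() standing in for request_irq()/free_irq().

#include <stdio.h>

/* Unwind idiom from the IRQ request loop: on failure, release only the
 * resources acquired so far (indices [first, i)), then report the error.
 */
static int acquire(int idx) { return idx == 3 ? -1 : 0; } /* fail at idx 3 */
static void release(int idx) { printf("released %d\n", idx); }

static int acquire_all(int first, int count)
{
    int i, k, rc = 0;

    for (i = first; i < first + count; i++) {
        rc = acquire(i);
        if (rc)
            goto err;
        printf("acquired %d\n", i);
    }
    return 0;

err:
    for (k = first; k < i; k++)
        release(k);
    return rc;
}

int main(void)
{
    return acquire_all(1, 5) ? 1 : 0; /* acquires 1,2 then unwinds them */
}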
*ena_dev = adapter->ena_dev; + int rc; + + /* In case the RSS table wasn't initialized by probe */ + if (!ena_dev->rss.tbl_log_size) { + rc = ena_rss_init_default(adapter); + if (rc && (rc != -EOPNOTSUPP)) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to init RSS rc: %d\n", rc); + return rc; + } + } + + /* Set indirect table */ + rc = ena_com_indirect_table_set(ena_dev); + if (unlikely(rc && rc != -EOPNOTSUPP)) + return rc; + + /* Configure hash function (if supported) */ + rc = ena_com_set_hash_function(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + /* Configure hash inputs (if supported) */ + rc = ena_com_set_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) + return rc; + + return 0; +} + +static int ena_up_complete(struct ena_adapter *adapter) +{ + int rc; + + rc = ena_rss_configure(adapter); + if (rc) + return rc; + + ena_init_napi(adapter); + + ena_change_mtu(adapter->netdev, adapter->netdev->mtu); + + ena_refill_all_rx_bufs(adapter); + + /* enable transmits */ + netif_tx_start_all_queues(adapter->netdev); + + ena_restore_ethtool_params(adapter); + + ena_napi_enable_all(adapter); + + return 0; +} + +static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_create_io_ctx ctx = { 0 }; + struct ena_com_dev *ena_dev; + struct ena_ring *tx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + tx_ring = &adapter->tx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_TXQ_IDX(qid); + + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_TX; + ctx.qid = ena_qid; + ctx.mem_queue_type = ena_dev->tx_mem_queue_type; + ctx.msix_vector = msix_vector; + ctx.queue_size = adapter->tx_ring_size; + ctx.numa_node = cpu_to_node(tx_ring->cpu); + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O TX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &tx_ring->ena_com_io_sq, + &tx_ring->ena_com_io_cq); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get TX queue handlers. 
TX queue num %d rc: %d\n", + qid, rc); + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; + } + + ena_com_update_numa_node(tx_ring->ena_com_io_cq, ctx.numa_node); + return rc; +} + +static int ena_create_all_io_tx_queues(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = 0; i < adapter->num_queues; i++) { + rc = ena_create_io_tx_queue(adapter, i); + if (rc) + goto create_err; + } + + return 0; + +create_err: + while (i--) + ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i)); + + return rc; +} + +static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) +{ + struct ena_com_dev *ena_dev; + struct ena_com_create_io_ctx ctx = { 0 }; + struct ena_ring *rx_ring; + u32 msix_vector; + u16 ena_qid; + int rc; + + ena_dev = adapter->ena_dev; + + rx_ring = &adapter->rx_ring[qid]; + msix_vector = ENA_IO_IRQ_IDX(qid); + ena_qid = ENA_IO_RXQ_IDX(qid); + + ctx.qid = ena_qid; + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX; + ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + ctx.msix_vector = msix_vector; + ctx.queue_size = adapter->rx_ring_size; + ctx.numa_node = cpu_to_node(rx_ring->cpu); + + rc = ena_com_create_io_queue(ena_dev, &ctx); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to create I/O RX queue num %d rc: %d\n", + qid, rc); + return rc; + } + + rc = ena_com_get_io_handlers(ena_dev, ena_qid, + &rx_ring->ena_com_io_sq, + &rx_ring->ena_com_io_cq); + if (rc) { + netif_err(adapter, ifup, adapter->netdev, + "Failed to get RX queue handlers. RX queue num %d rc: %d\n", + qid, rc); + ena_com_destroy_io_queue(ena_dev, ena_qid); + return rc; + } + + ena_com_update_numa_node(rx_ring->ena_com_io_cq, ctx.numa_node); + + return rc; +} + +static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int rc, i; + + for (i = 0; i < adapter->num_queues; i++) { + rc = ena_create_io_rx_queue(adapter, i); + if (rc) + goto create_err; + } + + return 0; + +create_err: + while (i--) + ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i)); + + return rc; +} + +static int ena_up(struct ena_adapter *adapter) +{ + int rc, i; + + netdev_dbg(adapter->netdev, "%s\n", __func__); + + ena_setup_io_intr(adapter); + + rc = ena_request_io_irq(adapter); + if (rc) + goto err_req_irq; + + /* allocate transmit descriptors */ + rc = ena_setup_all_tx_resources(adapter); + if (rc) + goto err_setup_tx; + + /* allocate receive descriptors */ + rc = ena_setup_all_rx_resources(adapter); + if (rc) + goto err_setup_rx; + + /* Create TX queues */ + rc = ena_create_all_io_tx_queues(adapter); + if (rc) + goto err_create_tx_queues; + + /* Create RX queues */ + rc = ena_create_all_io_rx_queues(adapter); + if (rc) + goto err_create_rx_queues; + + rc = ena_up_complete(adapter); + if (rc) + goto err_up; + + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.interface_up++; + u64_stats_update_end(&adapter->syncp); + + set_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + /* Enable completion queues interrupt */ + for (i = 0; i < adapter->num_queues; i++) + ena_unmask_interrupt(&adapter->tx_ring[i], + &adapter->rx_ring[i]); + + /* schedule napi in case we had pending packets + * from the last time we disable napi + */ + for (i = 0; i < adapter->num_queues; i++) + napi_schedule(&adapter->ena_napi[i].napi); + + return rc; + +err_up: + ena_destroy_all_rx_queues(adapter); +err_create_rx_queues: + 
ena_destroy_all_tx_queues(adapter); +err_create_tx_queues: + ena_free_all_io_rx_resources(adapter); +err_setup_rx: + ena_free_all_io_tx_resources(adapter); +err_setup_tx: + ena_free_io_irq(adapter); +err_req_irq: + + return rc; +} + +static void ena_down(struct ena_adapter *adapter) +{ + netif_info(adapter, ifdown, adapter->netdev, "%s\n", __func__); + + clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.interface_down++; + u64_stats_update_end(&adapter->syncp); + + netif_carrier_off(adapter->netdev); + netif_tx_disable(adapter->netdev); + + /* After this point the napi handler won't enable the tx queue */ + ena_napi_disable_all(adapter); + + /* After destroy the queue there won't be any new interrupts */ + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { + int rc; + + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + if (rc) + dev_err(&adapter->pdev->dev, "Device reset failed\n"); + } + + ena_destroy_all_io_queues(adapter); + + ena_disable_io_intr_sync(adapter); + ena_free_io_irq(adapter); + ena_del_napi(adapter); + + ena_free_all_tx_bufs(adapter); + ena_free_all_rx_bufs(adapter); + ena_free_all_io_tx_resources(adapter); + ena_free_all_io_rx_resources(adapter); +} + +/* ena_open - Called when a network interface is made active + * @netdev: network interface device structure + * + * Returns 0 on success, negative value on failure + * + * The open entry point is called when a network interface is made + * active by the system (IFF_UP). At this point all resources needed + * for transmit and receive operations are allocated, the interrupt + * handler is registered with the OS, the watchdog timer is started, + * and the stack is notified that the interface is ready. + */ +static int ena_open(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int rc; + + /* Notify the stack of the actual queue counts. */ + rc = netif_set_real_num_tx_queues(netdev, adapter->num_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num tx queues\n"); + return rc; + } + + rc = netif_set_real_num_rx_queues(netdev, adapter->num_queues); + if (rc) { + netif_err(adapter, ifup, netdev, "Can't set num rx queues\n"); + return rc; + } + + rc = ena_up(adapter); + if (rc) + return rc; + + return rc; +} + +/* ena_close - Disables a network interface + * @netdev: network interface device structure + * + * Returns 0, this is not allowed to fail + * + * The close entry point is called when an interface is de-activated + * by the OS. The hardware is still under the drivers control, but + * needs to be disabled. A global MAC reset is issued to stop the + * hardware, and all transmit and receive resources are freed. 
+ */ +static int ena_close(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + netif_dbg(adapter, ifdown, netdev, "%s\n", __func__); + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + ena_down(adapter); + + /* Check for device status and issue reset if needed*/ + check_for_admin_com_state(adapter); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + netif_err(adapter, ifdown, adapter->netdev, + "Destroy failure, restarting device\n"); + ena_dump_stats_to_dmesg(adapter); + /* rtnl lock already obtained in dev_ioctl() layer */ + ena_destroy_device(adapter); + ena_restore_device(adapter); + } + + return 0; +} + +static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, struct sk_buff *skb) +{ + u32 mss = skb_shinfo(skb)->gso_size; + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; + u8 l4_protocol = 0; + + if ((skb->ip_summed == CHECKSUM_PARTIAL) || mss) { + ena_tx_ctx->l4_csum_enable = 1; + if (mss) { + ena_tx_ctx->tso_enable = 1; + ena_meta->l4_hdr_len = tcp_hdr(skb)->doff; + ena_tx_ctx->l4_csum_partial = 0; + } else { + ena_tx_ctx->tso_enable = 0; + ena_meta->l4_hdr_len = 0; + ena_tx_ctx->l4_csum_partial = 1; + } + + switch (ip_hdr(skb)->version) { + case IPVERSION: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV4; + if (ip_hdr(skb)->frag_off & htons(IP_DF)) + ena_tx_ctx->df = 1; + if (mss) + ena_tx_ctx->l3_csum_enable = 1; + l4_protocol = ip_hdr(skb)->protocol; + break; + case 6: + ena_tx_ctx->l3_proto = ENA_ETH_IO_L3_PROTO_IPV6; + l4_protocol = ipv6_hdr(skb)->nexthdr; + break; + default: + break; + } + + if (l4_protocol == IPPROTO_TCP) + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_TCP; + else + ena_tx_ctx->l4_proto = ENA_ETH_IO_L4_PROTO_UDP; + + ena_meta->mss = mss; + ena_meta->l3_hdr_len = skb_network_header_len(skb); + ena_meta->l3_hdr_offset = skb_network_offset(skb); + ena_tx_ctx->meta_valid = 1; + + } else { + ena_tx_ctx->meta_valid = 0; + } +} + +static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, + struct sk_buff *skb) +{ + int num_frags, header_len, rc; + + num_frags = skb_shinfo(skb)->nr_frags; + header_len = skb_headlen(skb); + + if (num_frags < tx_ring->sgl_size) + return 0; + + if ((num_frags == tx_ring->sgl_size) && + (header_len < tx_ring->tx_max_header_size)) + return 0; + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.linearize++; + u64_stats_update_end(&tx_ring->syncp); + + rc = skb_linearize(skb); + if (unlikely(rc)) { + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.linearize_failed++; + u64_stats_update_end(&tx_ring->syncp); + } + + return rc; +} + +/* Called with netif_tx_lock. 
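ena_check_and_linearize_skb linearizes only when the fragment list cannot fit one submission descriptor list, tolerating exactly sgl_size fragments when the linear header is small enough to travel via the push/header buffer. A boolean sketch of that rule follows; the SGL and header limits in main() are assumed example values, not the device's.

#include <stdbool.h>
#include <stdio.h>

/* Decision sketch mirroring the linearize check: an skb must be linearized
 * when its fragments cannot all fit in one descriptor list.  One extra
 * fragment is tolerated when the header is small enough to be carried
 * separately instead of consuming a data descriptor.
 */
static bool needs_linearize(int num_frags, int header_len,
                            int sgl_size, int tx_max_header_size)
{
    if (num_frags < sgl_size)
        return false;
    if (num_frags == sgl_size && header_len < tx_max_header_size)
        return false;
    return true;
}

int main(void)
{
    /* assumed example limits: 17-entry SGL, 96-byte header budget */
    printf("%d\n", needs_linearize(10, 54, 17, 96));  /* 0: fits easily */
    printf("%d\n", needs_linearize(17, 54, 17, 96));  /* 0: small header */
    printf("%d\n", needs_linearize(17, 128, 17, 96)); /* 1: must linearize */
    printf("%d\n", needs_linearize(20, 54, 17, 96));  /* 1: too many frags */
    return 0;
}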
*/ +static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_tx_buffer *tx_info; + struct ena_com_tx_ctx ena_tx_ctx; + struct ena_ring *tx_ring; + struct netdev_queue *txq; + struct ena_com_buf *ena_buf; + void *push_hdr; + u32 len, pkt_len, last_frag; + u16 next_to_use; + u16 req_id; + u16 push_len; + u16 header_len; + dma_addr_t dma; + int qid, rc, nb_hw_desc; + int i = -1; + + netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); + /* Determine which tx ring we will be placed on */ + qid = skb_get_queue_mapping(skb); + tx_ring = &adapter->tx_ring[qid]; + txq = netdev_get_tx_queue(dev, qid); + + rc = ena_check_and_linearize_skb(tx_ring, skb); + if (unlikely(rc)) + goto error_drop_packet; + + skb_tx_timestamp(skb); + len = skb_headlen(skb); + pkt_len = skb->len; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_tx_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + WARN(tx_info->skb, "SKB isn't NULL req_id %d\n", req_id); + ena_buf = tx_info->bufs; + tx_info->skb = skb; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* prepared the push buffer */ + push_len = min_t(u32, pkt_len, tx_ring->tx_max_header_size); + header_len = push_len; + push_hdr = skb->data; + } else { + push_hdr = NULL; + push_len = 0; + header_len = min_t(u32, len, tx_ring->tx_max_header_size); + } + + netif_dbg(adapter, tx_queued, dev, + "skb: %p header_buf->vaddr: %p push_len: %d\n", skb, + push_hdr, push_len); + + if (len > push_len) { + dma = dma_map_single(tx_ring->dev, skb->data + push_len, + len - push_len, DMA_TO_DEVICE); + if (dma_mapping_error(tx_ring->dev, dma)) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = len - push_len; + + ena_buf++; + tx_info->num_of_bufs++; + } + + last_frag = skb_shinfo(skb)->nr_frags; + + for (i = 0; i < last_frag; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + len = skb_frag_size(frag); + dma = skb_frag_dma_map(tx_ring->dev, frag, 0, len, + DMA_TO_DEVICE); + if (dma_mapping_error(tx_ring->dev, dma)) + goto error_report_dma_error; + + ena_buf->paddr = dma; + ena_buf->len = len; + ena_buf++; + } + + tx_info->num_of_bufs += last_frag; + + memset(&ena_tx_ctx, 0x0, sizeof(struct ena_com_tx_ctx)); + ena_tx_ctx.ena_bufs = tx_info->bufs; + ena_tx_ctx.push_header = push_hdr; + ena_tx_ctx.num_bufs = tx_info->num_of_bufs; + ena_tx_ctx.req_id = req_id; + ena_tx_ctx.header_len = header_len; + + /* set flags and meta data */ + ena_tx_csum(&ena_tx_ctx, skb); + + /* prepare the packet's descriptors to dma engine */ + rc = ena_com_prepare_tx(tx_ring->ena_com_io_sq, &ena_tx_ctx, + &nb_hw_desc); + + if (unlikely(rc)) { + netif_err(adapter, tx_queued, dev, + "failed to prepare tx bufs\n"); + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.queue_stop++; + tx_ring->tx_stats.prepare_ctx_err++; + u64_stats_update_end(&tx_ring->syncp); + netif_tx_stop_queue(txq); + goto error_unmap_dma; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.cnt++; + tx_ring->tx_stats.bytes += skb->len; + u64_stats_update_end(&tx_ring->syncp); + + tx_info->tx_descs = nb_hw_desc; + tx_info->last_jiffies = jiffies; + tx_info->print_once = 0; + + tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, + tx_ring->ring_size); + + /* This WMB is aimed to: + * 1 - perform smp barrier before reading next_to_completion + * 2 - make sure the desc were written before trigger DB + */ 
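In the device-placement (LLQ) branch of ena_start_xmit, min(pkt_len, tx_max_header_size) bytes of the head are pushed inline and only the remainder of the linear data is DMA-mapped. A short sketch of that split, with an assumed 96-byte push limit for the example.

#include <stdio.h>

/* Header split sketch for the push path: up to tx_max_header_size bytes of
 * the linear head travel inline with the descriptors, and only the rest of
 * the head (if any) is DMA-mapped as a regular buffer.
 */
static void split_head(unsigned int headlen, unsigned int pkt_len,
                       unsigned int tx_max_header_size)
{
    unsigned int push_len = pkt_len < tx_max_header_size ?
                            pkt_len : tx_max_header_size;
    unsigned int mapped = headlen > push_len ? headlen - push_len : 0;

    printf("head %u, pkt %u -> push %u bytes, map %u bytes\n",
           headlen, pkt_len, push_len, mapped);
}

int main(void)
{
    split_head(54, 54, 96);     /* tiny frame: fully pushed, nothing mapped */
    split_head(200, 1500, 96);  /* larger head: 96 pushed, 104 mapped */
    return 0;
}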
+ wmb(); + + /* stop the queue when no more space available, the packet can have up + * to sgl_size + 2. one for the meta descriptor and one for header + * (if the header is larger than tx_max_header_size). + */ + if (unlikely(ena_com_sq_empty_space(tx_ring->ena_com_io_sq) < + (tx_ring->sgl_size + 2))) { + netif_dbg(adapter, tx_queued, dev, "%s stop queue %d\n", + __func__, qid); + + netif_tx_stop_queue(txq); + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.queue_stop++; + u64_stats_update_end(&tx_ring->syncp); + + /* There is a rare condition where this function decide to + * stop the queue but meanwhile clean_tx_irq updates + * next_to_completion and terminates. + * The queue will remain stopped forever. + * To solve this issue this function perform rmb, check + * the wakeup condition and wake up the queue if needed. + */ + smp_rmb(); + + if (ena_com_sq_empty_space(tx_ring->ena_com_io_sq) + > ENA_TX_WAKEUP_THRESH) { + netif_tx_wake_queue(txq); + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.queue_wakeup++; + u64_stats_update_end(&tx_ring->syncp); + } + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + if (netif_xmit_stopped(txq) || !skb->xmit_more) { +#endif + /* trigger the dma engine */ + ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.doorbells++; + u64_stats_update_end(&tx_ring->syncp); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + } +#endif + + return NETDEV_TX_OK; + +error_report_dma_error: + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.dma_mapping_err++; + u64_stats_update_end(&tx_ring->syncp); + netdev_warn(adapter->netdev, "failed to map skb\n"); + + tx_info->skb = NULL; + +error_unmap_dma: + if (i >= 0) { + /* save value of frag that failed */ + last_frag = i; + + /* start back at beginning and unmap skb */ + tx_info->skb = NULL; + ena_buf = tx_info->bufs; + dma_unmap_single(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); + + /* unmap remaining mapped pages */ + for (i = 0; i < last_frag; i++) { + ena_buf++; + dma_unmap_page(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); + } + } + +error_drop_packet: + + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void ena_netpoll(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + int i; + + /* Dont schedule NAPI if the driver is in the middle of reset + * or netdev is down. + */ + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + for (i = 0; i < adapter->num_queues; i++) + napi_schedule(&adapter->ena_napi[i].napi); +} +#endif /* CONFIG_NET_POLL_CONTROLLER */ + +#ifdef HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL +/* Return subqueue id on this core (one per core). 
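The doorbell above is written only when the stack signals that no further packets are queued (xmit_more is false) or the queue had to stop, so a burst of packets can share a single MMIO write. A counting sketch of that batching effect; the model below is not the driver's data structure.

#include <stdbool.h>
#include <stdio.h>

/* Doorbell batching sketch: the device is only "kicked" when the stack says
 * no more packets are coming or the queue had to stop, so a burst of N
 * packets can cost a single doorbell write.
 */
struct txq_model {
    int pending;     /* descriptors written but not yet doorbelled */
    int doorbells;   /* MMIO writes issued */
};

static void xmit(struct txq_model *q, bool xmit_more, bool queue_stopped)
{
    q->pending++;
    if (queue_stopped || !xmit_more) {
        q->doorbells++;   /* the ena_com_write_sq_doorbell() moment */
        q->pending = 0;
    }
}

int main(void)
{
    struct txq_model q = { 0, 0 };
    int i;

    for (i = 0; i < 7; i++)           /* a burst: only the last one kicks */
        xmit(&q, i < 6, false);
    printf("7 packets, %d doorbell(s)\n", q.doorbells); /* 1 */
    return 0;
}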
*/ +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) +#else +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) +#endif +{ + u16 qid; + /* we suspect that this is good for in--kernel network services that + * want to loop incoming skb rx to tx in normal user generated traffic, + * most probably we will not get to this + */ + if (skb_rx_queue_recorded(skb)) + qid = skb_get_rx_queue(skb); + else +#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK) + qid = fallback(dev, skb); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) + qid = __netdev_pick_tx(dev, skb); +#else + qid = skb_tx_hash(dev, skb); +#endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK */ + + return qid; +} +#ifdef HAVE_SET_RX_MODE + +/* Unicast, Multicast and Promiscuous mode set + * @netdev: network interface device structure + * + * The set_rx_mode entry point is called whenever the unicast or multicast + * address lists or the network interface flags are updated. This routine is + * responsible for configuring the hardware for proper unicast, multicast, + * promiscuous mode, and all-multi behavior. + */ +static void ena_set_rx_mode(struct net_device *netdev) +{ +/* struct ena_adapter *adapter = netdev_priv(netdev); */ + /* TODO set Rx mode */ + + if (netdev->flags & IFF_PROMISC) { + } else if (netdev->flags & IFF_ALLMULTI) { + } else if (netdev_mc_empty(netdev)) { + } else { + } +} +#endif /* HAVE_SET_RX_MODE */ + +static void ena_config_host_info(struct ena_com_dev *ena_dev) +{ + struct ena_admin_host_info *host_info; + int rc; + + /* Allocate only the host info */ + rc = ena_com_allocate_host_info(ena_dev); + if (rc) { + pr_err("Cannot allocate host info\n"); + return; + } + + host_info = ena_dev->host_attr.host_info; + + host_info->os_type = ENA_ADMIN_OS_LINUX; + host_info->kernel_ver = LINUX_VERSION_CODE; + strncpy(host_info->kernel_ver_str, utsname()->version, + sizeof(host_info->kernel_ver_str) - 1); + host_info->os_dist = 0; + strncpy(host_info->os_dist_str, utsname()->release, + sizeof(host_info->os_dist_str) - 1); + host_info->driver_version = + (DRV_MODULE_VER_MAJOR) | + (DRV_MODULE_VER_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | + (DRV_MODULE_VER_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT); + + rc = ena_com_set_host_attributes(ena_dev); + if (rc) { + if (rc == -EOPNOTSUPP) + pr_warn("Cannot set host attributes\n"); + else + pr_err("Cannot set host attributes\n"); + + goto err; + } + + return; + +err: + ena_com_delete_host_info(ena_dev); +} + +static void ena_config_debug_area(struct ena_adapter *adapter) +{ + u32 debug_area_size; + int rc, ss_count; + + ss_count = ena_get_sset_count(adapter->netdev, ETH_SS_STATS); + if (ss_count <= 0) { + netif_err(adapter, drv, adapter->netdev, + "SS count is negative\n"); + return; + } + + /* allocate 32 bytes for each string and 64bit for the value */ + debug_area_size = ss_count * ETH_GSTRING_LEN + sizeof(u64) * ss_count; + + rc = ena_com_allocate_debug_area(adapter->ena_dev, debug_area_size); + if (rc) { + pr_err("Cannot allocate debug area\n"); + return; + } + + rc = ena_com_set_host_attributes(adapter->ena_dev); + if (rc) { + if (rc == -EOPNOTSUPP) + netif_warn(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + else + netif_err(adapter, drv, adapter->netdev, + "Cannot set host attributes\n"); + goto err; + } + + return; +err: + ena_com_delete_debug_area(adapter->ena_dev); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0)) +static void 
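driver_version in ena_config_host_info is three small numbers packed into one 32-bit word using the ENA_ADMIN_HOST_INFO_* shifts. The sketch below assumes 8-bit fields purely for illustration; the actual shift values come from the admin ABI headers.

#include <stdio.h>

/* Version packing sketch: three small numbers folded into one word the way
 * driver_version is built.  The 8-bit field widths are an assumption.
 */
#define MINOR_SHIFT      8
#define SUB_MINOR_SHIFT 16

static unsigned int pack_version(unsigned int major, unsigned int minor,
                                 unsigned int sub)
{
    return major | (minor << MINOR_SHIFT) | (sub << SUB_MINOR_SHIFT);
}

int main(void)
{
    unsigned int v = pack_version(1, 5, 0);

    printf("packed 0x%06x -> %u.%u.%u\n", v,
           v & 0xff, (v >> MINOR_SHIFT) & 0xff, (v >> SUB_MINOR_SHIFT) & 0xff);
    return 0;
}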
ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#else +static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +#endif +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + unsigned int start; + u64 rx_drops; + int i; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0)) + return; +#else + return NULL; +#endif + + for (i = 0; i < adapter->num_queues; i++) { + u64 bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + packets = tx_ring->tx_stats.cnt; + bytes = tx_ring->tx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = u64_stats_fetch_begin_irq(&rx_ring->syncp); + packets = rx_ring->rx_stats.cnt; + bytes = rx_ring->rx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = u64_stats_fetch_begin_irq(&adapter->syncp); + rx_drops = adapter->dev_stats.rx_drops; + } while (u64_stats_fetch_retry_irq(&adapter->syncp, start)); + + stats->rx_dropped = rx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + + stats->rx_errors = 0; + stats->tx_errors = 0; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)) + return stats; +#endif +} +#else /* kernel > 2.6.36 */ +static struct net_device_stats *ena_get_stats(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *rx_ring, *tx_ring; + unsigned long rx_drops; + struct net_device_stats *stats = &netdev->stats; + unsigned int start; + int i; + + memset(stats, 0, sizeof(*stats)); + for (i = 0; i < adapter->num_queues; i++) { + unsigned long bytes, packets; + + tx_ring = &adapter->tx_ring[i]; + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + packets = (unsigned long)tx_ring->tx_stats.cnt; + bytes = (unsigned long)tx_ring->tx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; + + rx_ring = &adapter->rx_ring[i]; + + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + packets = (unsigned long)rx_ring->rx_stats.cnt; + bytes = (unsigned long)rx_ring->rx_stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + } + + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + rx_drops = (unsigned long)adapter->dev_stats.rx_drops; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + + stats->rx_dropped = rx_drops; + + stats->multicast = 0; + stats->collisions = 0; + + stats->rx_length_errors = 0; + stats->rx_crc_errors = 0; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = 0; + stats->rx_missed_errors = 0; + stats->tx_window_errors = 0; + + stats->rx_errors = 0; + stats->tx_errors = 0; + + return stats; +} +#endif +#if ENA_BUSY_POLL_SUPPORT + +#define ENA_BP_NAPI_BUDGET 8 +static int ena_busy_poll(struct napi_struct *napi) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *rx_ring = ena_napi->rx_ring; + 
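The stats64 path snapshots each ring's counters inside a u64_stats_fetch_begin_irq / retry loop so a reader never sees a half-updated packets/bytes pair. Below is a simplified userspace analogue of that sequence-counter retry pattern using C11 atomics; it is a sketch of the idea, not a full-strength seqlock.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the fetch/retry pattern: the writer bumps a sequence counter
 * around its update; a reader retries whenever the sequence was odd (write
 * in flight) or changed while it copied the counters.
 */
struct stats {
    atomic_uint seq;
    uint64_t packets;
    uint64_t bytes;
};

static void writer_update(struct stats *s, uint64_t pkts, uint64_t bytes)
{
    atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* odd */
    s->packets += pkts;
    s->bytes += bytes;
    atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even */
}

static void reader_snapshot(const struct stats *s, uint64_t *pkts,
                            uint64_t *bytes)
{
    unsigned int start;

    do {
        start = atomic_load_explicit(&s->seq, memory_order_acquire);
        *pkts = s->packets;
        *bytes = s->bytes;
    } while ((start & 1) ||
             start != atomic_load_explicit(&s->seq, memory_order_acquire));
}

int main(void)
{
    struct stats s = { 0, 0, 0 };
    uint64_t p, b;

    writer_update(&s, 10, 15000);
    reader_snapshot(&s, &p, &b);
    printf("%llu packets, %llu bytes\n",
           (unsigned long long)p, (unsigned long long)b);
    return 0;
}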
struct ena_adapter *adapter= rx_ring->adapter; + int done; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return LL_FLUSH_FAILED; + + if (!ena_bp_lock_poll(rx_ring)) + return LL_FLUSH_BUSY; + + done = ena_clean_rx_irq(rx_ring, napi, ENA_BP_NAPI_BUDGET); + if (likely(done)) + rx_ring->rx_stats.bp_cleaned += done; + else + rx_ring->rx_stats.bp_missed++; + + ena_bp_unlock_poll(rx_ring); + + return done; +} +#endif + +static const struct net_device_ops ena_netdev_ops = { + .ndo_open = ena_open, + .ndo_stop = ena_close, + .ndo_start_xmit = ena_start_xmit, + .ndo_select_queue = ena_select_queue, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) + .ndo_get_stats64 = ena_get_stats64, +#else + .ndo_get_stats = ena_get_stats, +#endif + .ndo_tx_timeout = ena_tx_timeout, + .ndo_change_mtu = ena_change_mtu, + .ndo_set_mac_address = NULL, +#ifdef HAVE_SET_RX_MODE + .ndo_set_rx_mode = ena_set_rx_mode, +#endif + .ndo_validate_addr = eth_validate_addr, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = ena_netpoll, +#if ENA_BUSY_POLL_SUPPORT + .ndo_busy_poll = ena_busy_poll, +#endif +#endif /* CONFIG_NET_POLL_CONTROLLER */ +}; + +static int ena_device_validate_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct net_device *netdev = adapter->netdev; + int rc; + + rc = ether_addr_equal(get_feat_ctx->dev_attr.mac_addr, + adapter->mac_addr); + if (!rc) { + netif_err(adapter, drv, netdev, + "Error, mac address are different\n"); + return -EINVAL; + } + + if ((get_feat_ctx->max_queues.max_cq_num < adapter->num_queues) || + (get_feat_ctx->max_queues.max_sq_num < adapter->num_queues)) { + netif_err(adapter, drv, netdev, + "Error, device doesn't support enough queues\n"); + return -EINVAL; + } + + if (get_feat_ctx->dev_attr.max_mtu < netdev->mtu) { + netif_err(adapter, drv, netdev, + "Error, device max mtu is smaller than netdev MTU\n"); + return -EINVAL; + } + + return 0; +} + +static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, + struct ena_com_dev_get_features_ctx *get_feat_ctx, + bool *wd_state) +{ + struct device *dev = &pdev->dev; + bool readless_supported; + u32 aenq_groups; + int dma_width; + int rc; + + rc = ena_com_mmio_reg_read_request_init(ena_dev); + if (rc) { + dev_err(dev, "failed to init mmio read less\n"); + return rc; + } + + /* The PCIe configuration space revision id indicate if mmio reg + * read is disabled + */ + readless_supported = !(pdev->revision & ENA_MMIO_DISABLE_REG_READ); + ena_com_set_mmio_read_mode(ena_dev, readless_supported); + + rc = ena_com_dev_reset(ena_dev, ENA_REGS_RESET_NORMAL); + if (rc) { + dev_err(dev, "Can not reset device\n"); + goto err_mmio_read_less; + } + + rc = ena_com_validate_version(ena_dev); + if (rc) { + dev_err(dev, "device version is too low\n"); + goto err_mmio_read_less; + } + + dma_width = ena_com_get_dma_width(ena_dev); + if (dma_width < 0) { + dev_err(dev, "Invalid dma width value %d", dma_width); + rc = dma_width; + goto err_mmio_read_less; + } + + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "pci_set_dma_mask failed 0x%x\n", rc); + goto err_mmio_read_less; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "err_pci_set_consistent_dma_mask failed 0x%x\n", + rc); + goto err_mmio_read_less; + } + + /* ENA admin level init */ + rc = ena_com_admin_init(ena_dev, &aenq_handlers, true); + if (rc) { + dev_err(dev, + "Can not initialize ena admin queue with device\n"); + goto 
err_mmio_read_less; + } + + /* To enable the msix interrupts the driver needs to know the number + * of queues. So the driver uses polling mode to retrieve this + * information + */ + ena_com_set_admin_polling_mode(ena_dev, true); + + ena_config_host_info(ena_dev); + + /* Get Device Attributes*/ + rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx); + if (rc) { + dev_err(dev, "Cannot get attribute for ena device rc=%d\n", rc); + goto err_admin_init; + } + + /* Try to turn all the available aenq groups */ + aenq_groups = BIT(ENA_ADMIN_LINK_CHANGE) | + BIT(ENA_ADMIN_FATAL_ERROR) | + BIT(ENA_ADMIN_WARNING) | + BIT(ENA_ADMIN_NOTIFICATION) | + BIT(ENA_ADMIN_KEEP_ALIVE); + + aenq_groups &= get_feat_ctx->aenq.supported_groups; + + rc = ena_com_set_aenq_config(ena_dev, aenq_groups); + if (rc) { + dev_err(dev, "Cannot configure aenq groups rc= %d\n", rc); + goto err_admin_init; + } + + *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); + + return 0; + +err_admin_init: + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_mmio_read_less: + ena_com_mmio_reg_read_request_destroy(ena_dev); + + return rc; +} + +static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter, + int io_vectors) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc; + + rc = ena_enable_msix(adapter, io_vectors); + if (rc) { + dev_err(dev, "Can not reserve msix vectors\n"); + return rc; + } + + ena_setup_mgmnt_intr(adapter); + + rc = ena_request_mgmnt_irq(adapter); + if (rc) { + dev_err(dev, "Can not setup management interrupts\n"); + goto err_disable_msix; + } + + ena_com_set_admin_polling_mode(ena_dev, false); + + ena_com_admin_aenq_enable(ena_dev); + + return 0; + +err_disable_msix: + ena_disable_msix(adapter); + + return rc; +} + +static void ena_destroy_device(struct ena_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct ena_com_dev *ena_dev = adapter->ena_dev; + bool dev_up; + + netif_carrier_off(netdev); + + del_timer_sync(&adapter->timer_service); + + dev_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + adapter->dev_up_before_reset = dev_up; + + ena_sysfs_terminate(&adapter->pdev->dev); + ena_com_set_admin_running_state(ena_dev, false); + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + ena_down(adapter); + + /* Before releasing the ENA resources, a device reset is required. + * (to prevent the device from accessing them). + * In case the reset flag is set and the device is up, ena_down() + * already perform the reset, so it can be skipped. 
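The AENQ setup in ena_device_init is a plain capability negotiation: request every async event group the driver can handle, AND it with what the device advertises, and derive wd_state from whether keep-alive survived the intersection. A sketch with made-up bit positions; the real values are ENA_ADMIN_* enums.

#include <stdio.h>

/* Capability negotiation sketch: the driver asks for every event group it
 * knows how to handle, then keeps only the ones the device advertises,
 * exactly like "aenq_groups &= supported_groups".
 */
#define GROUP_LINK_CHANGE  (1u << 0)
#define GROUP_FATAL_ERROR  (1u << 1)
#define GROUP_WARNING      (1u << 2)
#define GROUP_NOTIFICATION (1u << 3)
#define GROUP_KEEP_ALIVE   (1u << 4)

int main(void)
{
    unsigned int wanted = GROUP_LINK_CHANGE | GROUP_FATAL_ERROR |
                          GROUP_WARNING | GROUP_NOTIFICATION |
                          GROUP_KEEP_ALIVE;
    unsigned int supported = GROUP_LINK_CHANGE | GROUP_KEEP_ALIVE;
    unsigned int negotiated = wanted & supported;
    int wd_enabled = !!(negotiated & GROUP_KEEP_ALIVE);

    printf("negotiated mask 0x%02x, keep-alive watchdog %s\n",
           negotiated, wd_enabled ? "armed" : "off");
    return 0;
}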
+ */ + if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) + ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + + ena_free_mgmnt_irq(adapter); + + ena_disable_msix(adapter); + + ena_com_abort_admin_commands(ena_dev); + + ena_com_wait_for_abort_completion(ena_dev); + + ena_com_admin_destroy(ena_dev); + + ena_com_mmio_reg_read_request_destroy(ena_dev); + + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); +} + +static int ena_restore_device(struct ena_adapter *adapter) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct pci_dev *pdev = adapter->pdev; + bool wd_state; + int rc; + + set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + rc = ena_device_init(ena_dev, adapter->pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "Can not initialize device\n"); + goto err; + } + adapter->wd_state = wd_state; + + rc = ena_device_validate_params(adapter, &get_feat_ctx); + if (rc) { + dev_err(&pdev->dev, "Validation of device parameters failed\n"); + goto err_device_destroy; + } + + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + /* Make sure we don't have a race with AENQ Links state handler */ + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + + rc = ena_enable_msix_and_set_admin_interrupts(adapter, + adapter->num_queues); + if (rc) { + dev_err(&pdev->dev, "Enable MSI-X failed\n"); + goto err_device_destroy; + } + rc = ena_sysfs_init(&pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot initialize sysfs\n"); + goto err_disable_msix; + } + /* If the interface was up before the reset bring it up */ + if (adapter->dev_up_before_reset) { + rc = ena_up(adapter); + if (rc) { + dev_err(&pdev->dev, "Failed to create I/O queues\n"); + goto err_sysfs_terminate; + } + } + + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + dev_err(&pdev->dev, "Device reset completed successfully\n"); + + return rc; +err_sysfs_terminate: + ena_sysfs_terminate(&pdev->dev); +err_disable_msix: + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_device_destroy: + ena_com_admin_destroy(ena_dev); +err: + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + dev_err(&pdev->dev, + "Reset attempt failed. Can not reset the device\n"); + + return rc; +} + +static void ena_fw_reset_device(struct work_struct *work) +{ + struct ena_adapter *adapter = + container_of(work, struct ena_adapter, reset_task); + struct pci_dev *pdev = adapter->pdev; + + if (unlikely(!test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + dev_err(&pdev->dev, + "device reset schedule while reset bit is off\n"); + return; + } + rtnl_lock(); + ena_destroy_device(adapter); + ena_restore_device(adapter); + rtnl_unlock(); +} + +static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + if (likely(rx_ring->first_interrupt)) + return 0; + + if (ena_com_cq_empty(rx_ring->ena_com_io_cq)) + return 0; + + rx_ring->no_interrupt_event_cnt++; + + if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) { + netif_err(adapter, rx_err, adapter->netdev, + "Potential MSIX issue on Rx side Queue = %d. 
Reset the device\n", + rx_ring->qid); + adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + return -EIO; + } + + return 0; +} + +static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + struct ena_tx_buffer *tx_buf; + unsigned long last_jiffies; + u32 missed_tx = 0; + int i, rc = 0; + + for (i = 0; i < tx_ring->ring_size; i++) { + tx_buf = &tx_ring->tx_buffer_info[i]; + last_jiffies = tx_buf->last_jiffies; + + if (last_jiffies == 0) + /* no pending Tx at this location */ + continue; + + if (unlikely(!tx_ring->first_interrupt && time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to))) { + /* If after graceful period interrupt is still not received, we schedule a reset*/ + netif_err(adapter, tx_err, adapter->netdev, + "Potential MSIX issue on Tx side Queue = %d. Reset the device\n", + tx_ring->qid); + adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + return -EIO; + } + + if (unlikely(time_is_before_jiffies(last_jiffies + + adapter->missing_tx_completion_to))) { + if (!tx_buf->print_once) + netif_notice(adapter, tx_err, adapter->netdev, + "Found a Tx that wasn't completed on time, qid %d, index %d.\n", + tx_ring->qid, i); + + tx_buf->print_once = 1; + missed_tx++; + } + } + + if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) { + netif_err(adapter, tx_err, adapter->netdev, + "The number of lost tx completions is above the threshold (%d > %d). Reset the device\n", + missed_tx, + adapter->missing_tx_completion_threshold); + adapter->reset_reason = + ENA_REGS_RESET_MISS_TX_CMPL; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + rc = -EIO; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.missed_tx = missed_tx; + u64_stats_update_end(&tx_ring->syncp); + + return rc; +} + +static void check_for_missing_completions(struct ena_adapter *adapter) +{ + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; + int i, budget, rc; + + /* Make sure the driver doesn't turn the device in other process */ + smp_rmb(); + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT) + return; + + budget = ENA_MONITORED_TX_QUEUES; + + for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) { + tx_ring = &adapter->tx_ring[i]; + rx_ring = &adapter->rx_ring[i]; + + rc = check_missing_comp_in_tx_queue(adapter, tx_ring); + if (unlikely(rc)) + return; + + rc = check_for_rx_interrupt_queue(adapter, rx_ring); + if (unlikely(rc)) + return; + + budget--; + if (!budget) + break; + } + + adapter->last_monitored_tx_qid = i % adapter->num_queues; +} + +/* trigger napi schedule after 2 consecutive detections */ +#define EMPTY_RX_REFILL 2 +/* For the rare case where the device runs out of Rx descriptors and the + * napi handler failed to refill new Rx descriptors (due to a lack of memory + * for example). + * This case will lead to a deadlock: + * The device won't send interrupts since all the new Rx packets will be dropped + * The napi handler won't allocate new Rx descriptors so the device will be + * able to send new packets. + * + * This scenario can happen when the kernel's vm.min_free_kbytes is too small. + * It is recommended to have at least 512MB, with a minimum of 128MB for + * constrained environment). 
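check_missing_comp_in_tx_queue walks the ring, counts descriptors whose last_jiffies timestamp is older than the completion timeout, and schedules a reset once a threshold is crossed. A userspace sketch of the same bookkeeping with wall-clock time; the timeout and threshold values are assumptions for the example.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Missed-completion sketch: each in-flight packet records when it was
 * queued; a watchdog pass counts how many have waited longer than the
 * completion timeout and escalates once a threshold is crossed.
 */
#define COMPLETION_TIMEOUT_SEC 5
#define MISSED_TX_THRESHOLD    3

struct inflight { bool pending; time_t queued_at; };

static bool scan_for_missed_tx(const struct inflight *ring, int ring_size,
                               time_t now)
{
    int i, missed = 0;

    for (i = 0; i < ring_size; i++) {
        if (!ring[i].pending)
            continue;                    /* nothing queued at this slot */
        if (difftime(now, ring[i].queued_at) > COMPLETION_TIMEOUT_SEC)
            missed++;
    }
    return missed > MISSED_TX_THRESHOLD; /* true -> schedule a device reset */
}

int main(void)
{
    struct inflight ring[8] = { { false, 0 } };
    time_t now = time(NULL);
    int i;

    for (i = 0; i < 5; i++) {
        ring[i].pending = true;
        ring[i].queued_at = now - 10;    /* pretend these are long overdue */
    }
    printf("reset needed: %s\n",
           scan_for_missed_tx(ring, 8, now) ? "yes" : "no");
    return 0;
}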
+ * + * When such a situation is detected - Reschedule napi + */ +static void check_for_empty_rx_ring(struct ena_adapter *adapter) +{ + struct ena_ring *rx_ring; + int i, refill_required; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return; + + if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + for (i = 0; i < adapter->num_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + + refill_required = + ena_com_sq_empty_space(rx_ring->ena_com_io_sq); + if (unlikely(refill_required == (rx_ring->ring_size - 1))) { + rx_ring->empty_rx_queue++; + + if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.empty_rx_ring++; + u64_stats_update_end(&rx_ring->syncp); + + netif_err(adapter, drv, adapter->netdev, + "trigger refill for ring %d\n", i); + + napi_schedule(rx_ring->napi); + rx_ring->empty_rx_queue = 0; + } + } else { + rx_ring->empty_rx_queue = 0; + } + } +} + +/* Check for keep alive expiration */ +static void check_for_missing_keep_alive(struct ena_adapter *adapter) +{ + unsigned long keep_alive_expired; + + if (!adapter->wd_state) + return; + + if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT) + return; + + keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies + + adapter->keep_alive_timeout); + if (unlikely(time_is_before_jiffies(keep_alive_expired))) { + netif_err(adapter, drv, adapter->netdev, + "Keep alive watchdog timeout.\n"); + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.wd_expired++; + u64_stats_update_end(&adapter->syncp); + adapter->reset_reason = ENA_REGS_RESET_KEEP_ALIVE_TO; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } +} + +static void check_for_admin_com_state(struct ena_adapter *adapter) +{ + if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { + netif_err(adapter, drv, adapter->netdev, + "ENA admin queue is not in running state!\n"); + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.admin_q_pause++; + u64_stats_update_end(&adapter->syncp); + adapter->reset_reason = ENA_REGS_RESET_ADMIN_TO; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } +} + +static void ena_update_hints(struct ena_adapter *adapter, + struct ena_admin_ena_hw_hints *hints) +{ + struct net_device *netdev = adapter->netdev; + + if (hints->admin_completion_tx_timeout) + adapter->ena_dev->admin_queue.completion_timeout = + hints->admin_completion_tx_timeout * 1000; + + if (hints->mmio_read_timeout) + /* convert to usec */ + adapter->ena_dev->mmio_read.reg_read_to = + hints->mmio_read_timeout * 1000; + + if (hints->missed_tx_completion_count_threshold_to_reset) + adapter->missing_tx_completion_threshold = + hints->missed_tx_completion_count_threshold_to_reset; + + if (hints->missing_tx_completion_timeout) { + if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->missing_tx_completion_to = + msecs_to_jiffies(hints->missing_tx_completion_timeout); + } + + if (hints->netdev_wd_timeout) + netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout); + + if (hints->driver_watchdog_timeout) { + if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT) + adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT; + else + adapter->keep_alive_timeout = + msecs_to_jiffies(hints->driver_watchdog_timeout); + } +} + +static void ena_update_host_info(struct ena_admin_host_info *host_info, + struct net_device *netdev) +{ + 
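+ /* Report the 64-bit netdev feature mask to the device's host info as
+ * two 32-bit halves: bits 31:0 in word 0 and bits 63:32 in word 1.
+ */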
host_info->supported_network_features[0] = + netdev->features & GENMASK_ULL(31, 0); + host_info->supported_network_features[1] = + (netdev->features & GENMASK_ULL(63, 32)) >> 32; +} + +static void ena_timer_service(unsigned long data) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; + struct ena_admin_host_info *host_info = + adapter->ena_dev->host_attr.host_info; + + check_for_missing_keep_alive(adapter); + + check_for_admin_com_state(adapter); + + check_for_missing_completions(adapter); + + check_for_empty_rx_ring(adapter); + + if (debug_area) + ena_dump_stats_to_buf(adapter, debug_area); + + if (host_info) + ena_update_host_info(host_info, adapter->netdev); + + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + netif_err(adapter, drv, adapter->netdev, + "Trigger reset is on\n"); + ena_dump_stats_to_dmesg(adapter); + queue_work(ena_wq, &adapter->reset_task); + return; + } + + /* Reset the timer */ + mod_timer(&adapter->timer_service, jiffies + HZ); +} + +static int ena_calc_io_queue_num(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + int io_sq_num, io_queue_num; + + /* In case of LLQ use the llq number in the get feature cmd */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + io_sq_num = get_feat_ctx->max_queues.max_llq_num; + + if (io_sq_num == 0) { + dev_err(&pdev->dev, + "Trying to use LLQ but llq_num is 0. Fall back into regular queues\n"); + + ena_dev->tx_mem_queue_type = + ENA_ADMIN_PLACEMENT_POLICY_HOST; + io_sq_num = get_feat_ctx->max_queues.max_sq_num; + } + } else { + io_sq_num = get_feat_ctx->max_queues.max_sq_num; + } + + io_queue_num = min_t(int, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); + io_queue_num = min_t(int, io_queue_num, io_sq_num); + io_queue_num = min_t(int, io_queue_num, + get_feat_ctx->max_queues.max_cq_num); + /* 1 IRQ for for mgmnt and 1 IRQs for each IO direction */ + io_queue_num = min_t(int, io_queue_num, pci_msix_vec_count(pdev) - 1); + if (unlikely(!io_queue_num)) { + dev_err(&pdev->dev, "The device doesn't have io queues\n"); + return -EFAULT; + } + + return io_queue_num; +} + +static void ena_set_push_mode(struct pci_dev *pdev, struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + bool has_mem_bar; + + has_mem_bar = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(ENA_MEM_BAR); + + /* Enable push mode if device supports LLQ */ + if (has_mem_bar && (get_feat_ctx->max_queues.max_llq_num > 0)) + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_DEV; + else + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; +} + +static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev) +{ + netdev_features_t dev_features = 0; + + /* Set offload features */ + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK) + dev_features |= NETIF_F_IP_CSUM; + +#ifdef NETIF_F_IPV6_CSUM + if (feat->offload.tx & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK) + dev_features |= NETIF_F_IPV6_CSUM; +#endif /* NETIF_F_IPV6_CSUM */ + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK) + dev_features |= NETIF_F_TSO; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK) + dev_features |= NETIF_F_TSO6; + + if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK) + dev_features |= NETIF_F_TSO_ECN; + + if (feat->offload.rx_supported 
& + ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK) + dev_features |= NETIF_F_RXCSUM; + + if (feat->offload.rx_supported & + ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK) + dev_features |= NETIF_F_RXCSUM; + + netdev->features = + dev_features | + NETIF_F_SG | +#ifdef NETIF_F_RXHASH + NETIF_F_RXHASH | +#endif /* NETIF_F_RXHASH */ + NETIF_F_HIGHDMA; + +#ifdef HAVE_RHEL6_NET_DEVICE_OPS_EXT + do { + u32 hw_features = get_netdev_hw_features(netdev); + hw_features |= netdev->features; + set_netdev_hw_features(netdev, hw_features); + } while (0); +#else + netdev->hw_features |= netdev->features; +#endif + netdev->vlan_features |= netdev->features; +} + +static void ena_set_conf_feat_params(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *feat) +{ + struct net_device *netdev = adapter->netdev; + + /* Copy mac address */ + if (!is_valid_ether_addr(feat->dev_attr.mac_addr)) { + eth_hw_addr_random(netdev); + ether_addr_copy(adapter->mac_addr, netdev->dev_addr); + } else { + ether_addr_copy(adapter->mac_addr, feat->dev_attr.mac_addr); + ether_addr_copy(netdev->dev_addr, adapter->mac_addr); + } + + /* Set offload features */ + ena_set_dev_offloads(feat, netdev); + + adapter->max_mtu = feat->dev_attr.max_mtu; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + netdev->max_mtu = adapter->max_mtu; + netdev->min_mtu = ENA_MIN_MTU; +#endif +} + +static int ena_rss_init_default(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct device *dev = &adapter->pdev->dev; + int rc, i; + u32 val; + + rc = ena_com_rss_init(ena_dev, ENA_RX_RSS_TABLE_LOG_SIZE); + if (unlikely(rc)) { + dev_err(dev, "Cannot init indirect table\n"); + goto err_rss_init; + } + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + val = ethtool_rxfh_indir_default(i, adapter->num_queues); + rc = ena_com_indirect_table_fill_entry(ena_dev, i, + ENA_IO_RXQ_IDX(val)); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill indirect table\n"); + goto err_fill_indir; + } + } + + rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_CRC32, NULL, + ENA_HASH_KEY_SIZE, 0xFFFFFFFF); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill hash function\n"); + goto err_fill_indir; + } + + rc = ena_com_set_default_hash_ctrl(ena_dev); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(dev, "Cannot fill hash control\n"); + goto err_fill_indir; + } + + return 0; + +err_fill_indir: + ena_com_rss_destroy(ena_dev); +err_rss_init: + + return rc; +} + +static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) +{ + int release_bars; + + if (ena_dev->mem_bar) + devm_iounmap(&pdev->dev, ena_dev->mem_bar); + + if (ena_dev->reg_bar) + devm_iounmap(&pdev->dev, ena_dev->reg_bar); + + release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + pci_release_selected_regions(pdev, release_bars); +} + +static int ena_calc_queue_size(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + u16 *max_tx_sgl_size, + u16 *max_rx_sgl_size, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + u32 queue_size = ENA_DEFAULT_RING_SIZE; + + queue_size = min_t(u32, queue_size, + get_feat_ctx->max_queues.max_cq_depth); + queue_size = min_t(u32, queue_size, + get_feat_ctx->max_queues.max_sq_depth); + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + queue_size = min_t(u32, queue_size, + get_feat_ctx->max_queues.max_llq_depth); + + queue_size = rounddown_pow_of_two(queue_size); + + if (unlikely(!queue_size)) { + dev_err(&pdev->dev, 
"Invalid queue size\n"); + return -EFAULT; + } + + *max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + get_feat_ctx->max_queues.max_packet_tx_descs); + *max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + get_feat_ctx->max_queues.max_packet_rx_descs); + + return queue_size; +} + +/* ena_probe - Device Initialization Routine + * @pdev: PCI device information struct + * @ent: entry in ena_pci_tbl + * + * Returns 0 on success, negative on failure + * + * ena_probe initializes an adapter identified by a pci_dev structure. + * The OS initialization, configuring of the adapter private structure, + * and a hardware reset occur. + */ +static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct ena_com_dev_get_features_ctx get_feat_ctx; + static int version_printed; + struct net_device *netdev; + struct ena_adapter *adapter; + struct ena_com_dev *ena_dev = NULL; + static int adapters_found; + int io_queue_num, bars, rc; + int queue_size; + u16 tx_sgl_size = 0; + u16 rx_sgl_size = 0; + bool wd_state; + + dev_dbg(&pdev->dev, "%s\n", __func__); + + if (version_printed++ == 0) + dev_info(&pdev->dev, "%s", version); + + rc = pci_enable_device_mem(pdev); + if (rc) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return rc; + } + + pci_set_master(pdev); + + ena_dev = vzalloc(sizeof(*ena_dev)); + if (!ena_dev) { + rc = -ENOMEM; + goto err_disable_device; + } + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; + rc = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (rc) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + rc); + goto err_free_ena_dev; + } + + ena_dev->reg_bar = devm_ioremap(&pdev->dev, + pci_resource_start(pdev, ENA_REG_BAR), + pci_resource_len(pdev, ENA_REG_BAR)); + if (!ena_dev->reg_bar) { + dev_err(&pdev->dev, "failed to remap regs bar\n"); + rc = -EFAULT; + goto err_free_region; + } + + ena_dev->dmadev = &pdev->dev; + + rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state); + if (rc) { + dev_err(&pdev->dev, "ena device init failed\n"); + if (rc == -ETIME) + rc = -EPROBE_DEFER; + goto err_free_region; + } + + ena_set_push_mode(pdev, ena_dev, &get_feat_ctx); + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, + pci_resource_start(pdev, ENA_MEM_BAR), + pci_resource_len(pdev, ENA_MEM_BAR)); + if (!ena_dev->mem_bar) { + rc = -EFAULT; + goto err_device_destroy; + } + } + + /* initial Tx interrupt delay, Assumes 1 usec granularity. + * Updated during device initialization with the real granularity + */ + ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; + io_queue_num = ena_calc_io_queue_num(pdev, ena_dev, &get_feat_ctx); + queue_size = ena_calc_queue_size(pdev, ena_dev, &tx_sgl_size, + &rx_sgl_size, &get_feat_ctx); + if ((queue_size <= 0) || (io_queue_num <= 0)) { + rc = -EFAULT; + goto err_device_destroy; + } + + dev_info(&pdev->dev, "creating %d io queues. 
queue size: %d\n", + io_queue_num, queue_size); + + /* dev zeroed in init_etherdev */ + netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), io_queue_num); + if (!netdev) { + dev_err(&pdev->dev, "alloc_etherdev_mq failed\n"); + rc = -ENOMEM; + goto err_device_destroy; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter = netdev_priv(netdev); + pci_set_drvdata(pdev, adapter); + + adapter->ena_dev = ena_dev; + adapter->netdev = netdev; + adapter->pdev = pdev; + + ena_set_conf_feat_params(adapter, &get_feat_ctx); + + adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + adapter->reset_reason = ENA_REGS_RESET_NORMAL; + + adapter->tx_ring_size = queue_size; + adapter->rx_ring_size = queue_size; + + adapter->max_tx_sgl_size = tx_sgl_size; + adapter->max_rx_sgl_size = rx_sgl_size; + + adapter->num_queues = io_queue_num; + adapter->last_monitored_tx_qid = 0; + + adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK; + adapter->wd_state = wd_state; + + snprintf(adapter->name, ENA_NAME_MAX_LEN, "ena_%d", adapters_found); + + rc = ena_com_init_interrupt_moderation(adapter->ena_dev); + if (rc) { + dev_err(&pdev->dev, + "Failed to query interrupt moderation feature\n"); + goto err_netdev_destroy; + } + ena_init_io_rings(adapter); + + netdev->netdev_ops = &ena_netdev_ops; + netdev->watchdog_timeo = TX_TIMEOUT; + ena_set_ethtool_ops(netdev); + +#if defined(NETIF_F_MQ_TX_LOCK_OPT) + netdev->features &= ~NETIF_F_MQ_TX_LOCK_OPT; +#endif /* defined(NETIF_F_MQ_TX_LOCK_OPT) */ +#ifdef IFF_UNICAST_FLT + netdev->priv_flags |= IFF_UNICAST_FLT; +#endif /* IFF_UNICAST_FLT */ + + u64_stats_init(&adapter->syncp); + + rc = ena_enable_msix_and_set_admin_interrupts(adapter, io_queue_num); + if (rc) { + dev_err(&pdev->dev, + "Failed to enable and set the admin interrupts\n"); + goto err_worker_destroy; + } + rc = ena_sysfs_init(&adapter->pdev->dev); + if (rc) { + dev_err(&pdev->dev, "Cannot init sysfs\n"); + goto err_free_msix; + } + rc = ena_rss_init_default(adapter); + if (rc && (rc != -EOPNOTSUPP)) { + dev_err(&pdev->dev, "Cannot init RSS rc: %d\n", rc); + goto err_terminate_sysfs; + } + + ena_config_debug_area(adapter); + + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); + + rc = register_netdev(netdev); + if (rc) { + dev_err(&pdev->dev, "Cannot register net device\n"); + goto err_rss; + } + + netif_carrier_off(netdev); + + INIT_WORK(&adapter->reset_task, ena_fw_reset_device); + + adapter->last_keep_alive_jiffies = jiffies; + adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT; + adapter->missing_tx_completion_to = TX_TIMEOUT; + adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS; + + ena_update_hints(adapter, &get_feat_ctx.hw_hints); + + setup_timer(&adapter->timer_service, ena_timer_service, + (unsigned long)adapter); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + + dev_info(&pdev->dev, "%s found at mem %lx, mac addr %pM Queues %d\n", + DEVICE_NAME, (long)pci_resource_start(pdev, 0), + netdev->dev_addr, io_queue_num); + + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + adapters_found++; + + return 0; + +err_rss: + ena_com_delete_debug_area(ena_dev); + ena_com_rss_destroy(ena_dev); +err_terminate_sysfs: + ena_sysfs_terminate(&pdev->dev); +err_free_msix: + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_INIT_ERR); + ena_free_mgmnt_irq(adapter); + ena_disable_msix(adapter); +err_worker_destroy: + ena_com_destroy_interrupt_moderation(ena_dev); + del_timer(&adapter->timer_service); +err_netdev_destroy: + free_netdev(netdev); 
+err_device_destroy: + ena_com_delete_host_info(ena_dev); + ena_com_admin_destroy(ena_dev); +err_free_region: + ena_release_bars(ena_dev, pdev); +err_free_ena_dev: + vfree(ena_dev); +err_disable_device: + pci_disable_device(pdev); + return rc; +} + +/*****************************************************************************/ +#ifdef HAVE_SRIOV_CONFIGURE +static int ena_sriov_configure(struct pci_dev *dev, int numvfs) +{ + int rc; + + if (numvfs > 0) { + rc = pci_enable_sriov(dev, numvfs); + if (rc != 0) { + dev_err(&dev->dev, + "pci_enable_sriov failed to enable: %d vfs with the error: %d\n", + numvfs, rc); + return rc; + } + + return numvfs; + } + + if (numvfs == 0) { + pci_disable_sriov(dev); + return 0; + } + + return -EINVAL; +} +#endif /* HAVE_SRIOV_CONFIGURE */ + +/*****************************************************************************/ +/*****************************************************************************/ + +/* ena_remove - Device Removal Routine + * @pdev: PCI device information struct + * + * ena_remove is called by the PCI subsystem to alert the driver + * that it should release a PCI device. + */ +static void ena_remove(struct pci_dev *pdev) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + struct ena_com_dev *ena_dev; + struct net_device *netdev; + + ena_dev = adapter->ena_dev; + netdev = adapter->netdev; + +#ifdef CONFIG_RFS_ACCEL + if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { + free_irq_cpu_rmap(netdev->rx_cpu_rmap); + netdev->rx_cpu_rmap = NULL; + } +#endif /* CONFIG_RFS_ACCEL */ + + unregister_netdev(netdev); + ena_sysfs_terminate(&pdev->dev); + del_timer_sync(&adapter->timer_service); + + cancel_work_sync(&adapter->reset_task); + + /* Reset the device only if the device is running. */ + if (test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + ena_com_dev_reset(ena_dev, adapter->reset_reason); + + ena_free_mgmnt_irq(adapter); + + ena_disable_msix(adapter); + + free_netdev(netdev); + + ena_com_mmio_reg_read_request_destroy(ena_dev); + + ena_com_abort_admin_commands(ena_dev); + + ena_com_wait_for_abort_completion(ena_dev); + + ena_com_admin_destroy(ena_dev); + + ena_com_rss_destroy(ena_dev); + + ena_com_delete_debug_area(ena_dev); + + ena_com_delete_host_info(ena_dev); + + ena_release_bars(ena_dev, pdev); + + pci_disable_device(pdev); + + ena_com_destroy_interrupt_moderation(ena_dev); + + vfree(ena_dev); +} + +#ifdef CONFIG_PM +/* ena_suspend - PM suspend callback + * @pdev: PCI device information struct + * @state:power state + */ +static int ena_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.suspend++; + u64_stats_update_end(&adapter->syncp); + + rtnl_lock(); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + dev_err(&pdev->dev, + "ignoring device reset request as the device is being suspended\n"); + clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } + ena_destroy_device(adapter); + rtnl_unlock(); + return 0; +} + +/* ena_resume - PM resume callback + * @pdev: PCI device information struct + * + */ +static int ena_resume(struct pci_dev *pdev) +{ + struct ena_adapter *adapter = pci_get_drvdata(pdev); + int rc; + + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.resume++; + u64_stats_update_end(&adapter->syncp); + + rtnl_lock(); + rc = ena_restore_device(adapter); + rtnl_unlock(); + return rc; +} +#endif + +static struct pci_driver ena_pci_driver = { + .name = 
DRV_MODULE_NAME, + .id_table = ena_pci_tbl, + .probe = ena_probe, + .remove = ena_remove, +#ifdef CONFIG_PM + .suspend = ena_suspend, + .resume = ena_resume, +#endif +#ifdef HAVE_SRIOV_CONFIGURE + .sriov_configure = ena_sriov_configure, +#endif /* HAVE_SRIOV_CONFIGURE */ +}; + +static int __init ena_init(void) +{ + pr_info("%s", version); + + ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); + if (!ena_wq) { + pr_err("Failed to create workqueue\n"); + return -ENOMEM; + } + + return pci_register_driver(&ena_pci_driver); +} + +static void __exit ena_cleanup(void) +{ + pci_unregister_driver(&ena_pci_driver); + + if (ena_wq) { + destroy_workqueue(ena_wq); + ena_wq = NULL; + } +} + +/****************************************************************************** + ******************************** AENQ Handlers ******************************* + *****************************************************************************/ +/* ena_update_on_link_change: + * Notify the network interface about the change in link status + */ +static void ena_update_on_link_change(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_link_change_desc *aenq_desc = + (struct ena_admin_aenq_link_change_desc *)aenq_e; + int status = aenq_desc->flags & + ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK; + + if (status) { + netdev_dbg(adapter->netdev, "%s\n", __func__); + set_bit(ENA_FLAG_LINK_UP, &adapter->flags); + if (!test_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags)) + netif_carrier_on(adapter->netdev); + } else { + clear_bit(ENA_FLAG_LINK_UP, &adapter->flags); + netif_carrier_off(adapter->netdev); + } +} + +static void ena_keep_alive_wd(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_aenq_keep_alive_desc *desc; + u64 rx_drops; + + desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e; + adapter->last_keep_alive_jiffies = jiffies; + + rx_drops = ((u64)desc->rx_drops_high << 32) | desc->rx_drops_low; + + u64_stats_update_begin(&adapter->syncp); + adapter->dev_stats.rx_drops = rx_drops; + u64_stats_update_end(&adapter->syncp); +} + +static void ena_notification(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + struct ena_admin_ena_hw_hints *hints; + + WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION, + "Invalid group(%x) expected %x\n", + aenq_e->aenq_common_desc.group, + ENA_ADMIN_NOTIFICATION); + + switch (aenq_e->aenq_common_desc.syndrom) { + case ENA_ADMIN_UPDATE_HINTS: + hints = (struct ena_admin_ena_hw_hints *) + (&aenq_e->inline_data_w4); + ena_update_hints(adapter, hints); + break; + default: + netif_err(adapter, drv, adapter->netdev, + "Invalid aenq notification link state %d\n", + aenq_e->aenq_common_desc.syndrom); + } +} + +/* This handler will called for unknown event group or unimplemented handlers*/ +static void unimplemented_aenq_handler(void *data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)data; + + netif_err(adapter, drv, adapter->netdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static struct ena_aenq_handlers aenq_handlers = { + .handlers = { + [ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change, + [ENA_ADMIN_NOTIFICATION] = ena_notification, + [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd, + }, + .unimplemented_handler 
= unimplemented_aenq_handler +}; + +module_init(ena_init); +module_exit(ena_cleanup); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h new file mode 100644 index 0000000000000..e806e05580df0 --- /dev/null +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -0,0 +1,484 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ENA_H +#define ENA_H + +#include +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "ena_com.h" +#include "ena_eth_com.h" + +#define DRV_MODULE_VER_MAJOR 1 +#define DRV_MODULE_VER_MINOR 5 +#define DRV_MODULE_VER_SUBMINOR 0 + +#define DRV_MODULE_NAME "ena" +#ifndef DRV_MODULE_VERSION +#define DRV_MODULE_VERSION \ + __stringify(DRV_MODULE_VER_MAJOR) "." \ + __stringify(DRV_MODULE_VER_MINOR) "." \ + __stringify(DRV_MODULE_VER_SUBMINOR) "g" +#endif + +#define DEVICE_NAME "Elastic Network Adapter (ENA)" + +/* 1 for AENQ + ADMIN */ +#define ENA_ADMIN_MSIX_VEC 1 +#define ENA_MAX_MSIX_VEC(io_queues) (ENA_ADMIN_MSIX_VEC + (io_queues)) + +#define ENA_MIN_MSIX_VEC 2 + +#define ENA_REG_BAR 0 +#define ENA_MEM_BAR 2 +#define ENA_BAR_MASK (BIT(ENA_REG_BAR) | BIT(ENA_MEM_BAR)) + +#define ENA_DEFAULT_RING_SIZE (1024) + +#define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) +#define ENA_DEFAULT_RX_COPYBREAK (128 - NET_IP_ALIGN) + +/* limit the buffer size to 600 bytes to handle MTU changes from very + * small to very large, in which case the number of buffers per packet + * could exceed ENA_PKT_MAX_BUFS + */ +#define ENA_DEFAULT_MIN_RX_BUFF_ALLOC_SIZE 600 + +#define ENA_MIN_MTU 128 + +#define ENA_NAME_MAX_LEN 20 +#define ENA_IRQNAME_SIZE 40 + +#define ENA_PKT_MAX_BUFS 19 + +#define ENA_RX_RSS_TABLE_LOG_SIZE 7 +#define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE) + +#define ENA_HASH_KEY_SIZE 40 + +/* The number of tx packet completions that will be handled each NAPI poll + * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER. 
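+ * With the default ring size of 1024 and a divider of 4 this comes to
+ * 256 completions per poll cycle.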
+ */ +#define ENA_TX_POLL_BUDGET_DIVIDER 4 + +/* Refill Rx queue when number of available descriptors is below + * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER + */ +#define ENA_RX_REFILL_THRESH_DIVIDER 8 + +/* Number of queues to check for missing queues per timer service */ +#define ENA_MONITORED_TX_QUEUES 4 +/* Max timeout packets before device reset */ +#define MAX_NUM_OF_TIMEOUTED_PACKETS 128 + +#define ENA_TX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) + +#define ENA_RX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1)) +#define ENA_RX_RING_IDX_ADD(idx, n, ring_size) \ + (((idx) + (n)) & ((ring_size) - 1)) + +#define ENA_IO_TXQ_IDX(q) (2 * (q)) +#define ENA_IO_RXQ_IDX(q) (2 * (q) + 1) + +#define ENA_MGMNT_IRQ_IDX 0 +#define ENA_IO_IRQ_FIRST_IDX 1 +#define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) + +/* ENA device should send keep alive msg every 1 sec. + * We wait for 6 sec just to be on the safe side. + */ +#define ENA_DEVICE_KALIVE_TIMEOUT (6 * HZ) +#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3 + +#define ENA_MMIO_DISABLE_REG_READ BIT(0) + +struct ena_irq { + irq_handler_t handler; + void *data; + int cpu; + u32 vector; + cpumask_t affinity_hint_mask; + char name[ENA_IRQNAME_SIZE]; +}; + +struct ena_napi { + struct napi_struct napi ____cacheline_aligned; + struct ena_ring *tx_ring; + struct ena_ring *rx_ring; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) + atomic_t unmask_interrupt; +#endif + u32 qid; +}; + +struct ena_tx_buffer { + struct sk_buff *skb; + /* num of ena desc for this specific skb + * (includes data desc and metadata desc) + */ + u32 tx_descs; + /* num of buffers used by this skb */ + u32 num_of_bufs; + + /* Used for detect missing tx packets to limit the number of prints */ + u32 print_once; + /* Save the last jiffies to detect missing tx packets + * + * sets to non zero value on ena_start_xmit and set to zero on + * napi and timer_Service_routine. + * + * while this value is not protected by lock, + * a given packet is not expected to be handled by ena_start_xmit + * and by napi/timer_service at the same time. 
+ */ + unsigned long last_jiffies; + struct ena_com_buf bufs[ENA_PKT_MAX_BUFS]; +} ____cacheline_aligned; + +struct ena_rx_buffer { + struct sk_buff *skb; + struct page *page; + u32 page_offset; + struct ena_com_buf ena_buf; +} ____cacheline_aligned; + +struct ena_stats_tx { + u64 cnt; + u64 bytes; + u64 queue_stop; + u64 prepare_ctx_err; + u64 queue_wakeup; + u64 dma_mapping_err; + u64 linearize; + u64 linearize_failed; + u64 napi_comp; + u64 tx_poll; + u64 doorbells; + u64 bad_req_id; + u64 missed_tx; +}; + +struct ena_stats_rx { + u64 cnt; + u64 bytes; + u64 refil_partial; + u64 bad_csum; + u64 page_alloc_fail; + u64 skb_alloc_fail; + u64 dma_mapping_err; + u64 bad_desc_num; + u64 rx_copybreak_pkt; +#if ENA_BUSY_POLL_SUPPORT + u64 bp_yield; + u64 bp_missed; + u64 bp_cleaned; +#endif + u64 bad_req_id; + u64 empty_rx_ring; +}; + +struct ena_ring { + union { + /* Holds the empty requests for TX/RX + * out of order completions + */ + u16 *free_tx_ids; + u16 *free_rx_ids; + }; + + union { + struct ena_tx_buffer *tx_buffer_info; + struct ena_rx_buffer *rx_buffer_info; + }; + + /* cache ptr to avoid using the adapter */ + struct device *dev; + struct pci_dev *pdev; + struct napi_struct *napi; + struct net_device *netdev; + struct ena_com_dev *ena_dev; + struct ena_adapter *adapter; + struct ena_com_io_cq *ena_com_io_cq; + struct ena_com_io_sq *ena_com_io_sq; + + u16 next_to_use; + u16 next_to_clean; + u16 rx_copybreak; + u16 qid; + u16 mtu; + u16 sgl_size; + + /* The maximum header length the device can handle */ + u8 tx_max_header_size; + + bool first_interrupt; + u16 no_interrupt_event_cnt; + + /* cpu for TPH */ + int cpu; + /* number of tx/rx_buffer_info's entries */ + int ring_size; + + enum ena_admin_placement_policy_type tx_mem_queue_type; + + struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; + u32 smoothed_interval; + u32 per_napi_packets; + u32 per_napi_bytes; + enum ena_intr_moder_level moder_tbl_idx; + struct u64_stats_sync syncp; + union { + struct ena_stats_tx tx_stats; + struct ena_stats_rx rx_stats; + }; + int empty_rx_queue; +#if ENA_BUSY_POLL_SUPPORT + atomic_t bp_state; +#endif +} ____cacheline_aligned; + +#if ENA_BUSY_POLL_SUPPORT +enum ena_busy_poll_state_t { + ENA_BP_STATE_IDLE = 0, + ENA_BP_STATE_NAPI, + ENA_BP_STATE_POLL, + ENA_BP_STATE_DISABLE +}; +#endif +struct ena_stats_dev { + u64 tx_timeout; + u64 suspend; + u64 resume; + u64 wd_expired; + u64 interface_up; + u64 interface_down; + u64 admin_q_pause; + u64 rx_drops; +}; + +enum ena_flags_t { + ENA_FLAG_DEVICE_RUNNING, + ENA_FLAG_DEV_UP, + ENA_FLAG_LINK_UP, + ENA_FLAG_MSIX_ENABLED, + ENA_FLAG_TRIGGER_RESET, + ENA_FLAG_ONGOING_RESET +}; + +/* adapter specific private data structure */ +struct ena_adapter { + struct ena_com_dev *ena_dev; + /* OS defined structs */ + struct net_device *netdev; + struct pci_dev *pdev; + + /* rx packets that shorter that this len will be copied to the skb + * header + */ + u32 rx_copybreak; + u32 max_mtu; + + int num_queues; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) + struct msix_entry *msix_entries; +#endif + int msix_vecs; + + u32 missing_tx_completion_threshold; + + u32 tx_usecs, rx_usecs; /* interrupt moderation */ + u32 tx_frames, rx_frames; /* interrupt moderation */ + + u32 tx_ring_size; + u32 rx_ring_size; + + u32 msg_enable; + + u16 max_tx_sgl_size; + u16 max_rx_sgl_size; + + u8 mac_addr[ETH_ALEN]; + + unsigned long keep_alive_timeout; + unsigned long missing_tx_completion_to; + + char name[ENA_NAME_MAX_LEN]; + + unsigned long flags; + /* TX */ + struct ena_ring 
tx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + /* RX */ + struct ena_ring rx_ring[ENA_MAX_NUM_IO_QUEUES] + ____cacheline_aligned_in_smp; + + struct ena_napi ena_napi[ENA_MAX_NUM_IO_QUEUES]; + + struct ena_irq irq_tbl[ENA_MAX_MSIX_VEC(ENA_MAX_NUM_IO_QUEUES)]; + + /* timer service */ + struct work_struct reset_task; + struct timer_list timer_service; + + bool wd_state; + bool dev_up_before_reset; + unsigned long last_keep_alive_jiffies; + + struct u64_stats_sync syncp; + struct ena_stats_dev dev_stats; + + /* last queue index that was checked for uncompleted tx packets */ + u32 last_monitored_tx_qid; + + enum ena_regs_reset_reason_types reset_reason; +}; + +void ena_set_ethtool_ops(struct net_device *netdev); + +void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); + +void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); + +int ena_get_sset_count(struct net_device *netdev, int sset); + +#if ENA_BUSY_POLL_SUPPORT +static inline void ena_bp_init_lock(struct ena_ring *rx_ring) +{ + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from the napi routine to get ownership of the ring */ +static inline bool ena_bp_lock_napi(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_NAPI); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_napi(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_NAPI); + + /* flush any outstanding Rx frames */ + if (rx_ring->napi->gro_list) + napi_gro_flush(rx_ring->napi, false); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* called from ena_ll_busy_poll() */ +static inline bool ena_bp_lock_poll(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_POLL); + if (rc != ENA_BP_STATE_IDLE) { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bp_yield++; + u64_stats_update_end(&rx_ring->syncp); + } + + return rc == ENA_BP_STATE_IDLE; +} + +static inline void ena_bp_unlock_poll(struct ena_ring *rx_ring) +{ + WARN_ON(atomic_read(&rx_ring->bp_state) != ENA_BP_STATE_POLL); + + /* reset state to idle */ + atomic_set(&rx_ring->bp_state, ENA_BP_STATE_IDLE); +} + +/* true if a socket is polling, even if it did not get the lock */ +static inline bool ena_bp_busy_polling(struct ena_ring *rx_ring) +{ + return atomic_read(&rx_ring->bp_state) == ENA_BP_STATE_POLL; +} + +static inline bool ena_bp_disable(struct ena_ring *rx_ring) +{ + int rc = atomic_cmpxchg(&rx_ring->bp_state, ENA_BP_STATE_IDLE, + ENA_BP_STATE_DISABLE); + + return rc == ENA_BP_STATE_IDLE; +} +#else +static inline void ena_bp_init_lock(struct ena_ring *rx_ring) +{ +} + +static inline bool ena_bp_lock_napi(struct ena_ring *rx_ring) +{ + return true; +} + +static inline void ena_bp_unlock_napi(struct ena_ring *rx_ring) +{ +} + +static inline bool ena_bp_lock_poll(struct ena_ring *rx_ring) +{ + return false; +} + +static inline void ena_bp_unlock_poll(struct ena_ring *rx_ring) +{ +} + +static inline bool ena_bp_busy_polling(struct ena_ring *rx_ring) +{ + return false; +} + +static inline bool ena_bp_disable(struct ena_ring *rx_ring) +{ + return true; +} +#endif /* ENA_BUSY_POLL_SUPPORT */ + +#endif /* !(ENA_H) */ diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h 
b/drivers/amazon/net/ena/ena_pci_id_tbl.h new file mode 100644 index 0000000000000..f80d2a47fa94a --- /dev/null +++ b/drivers/amazon/net/ena/ena_pci_id_tbl.h @@ -0,0 +1,67 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ENA_PCI_ID_TBL_H_ +#define ENA_PCI_ID_TBL_H_ + +#ifndef PCI_VENDOR_ID_AMAZON +#define PCI_VENDOR_ID_AMAZON 0x1d0f +#endif + +#ifndef PCI_DEV_ID_ENA_PF +#define PCI_DEV_ID_ENA_PF 0x0ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_PF +#define PCI_DEV_ID_ENA_LLQ_PF 0x1ec2 +#endif + +#ifndef PCI_DEV_ID_ENA_VF +#define PCI_DEV_ID_ENA_VF 0xec20 +#endif + +#ifndef PCI_DEV_ID_ENA_LLQ_VF +#define PCI_DEV_ID_ENA_LLQ_VF 0xec21 +#endif + +#define ENA_PCI_ID_TABLE_ENTRY(devid) \ + {PCI_DEVICE(PCI_VENDOR_ID_AMAZON, devid)}, + +static const struct pci_device_id ena_pci_tbl[] = { + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_PF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_VF) + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_VF) + { } +}; + +#endif /* ENA_PCI_ID_TBL_H_ */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h new file mode 100644 index 0000000000000..48ca97fbe7bc6 --- /dev/null +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -0,0 +1,169 @@ +/* + * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _ENA_REGS_H_ +#define _ENA_REGS_H_ + +enum ena_regs_reset_reason_types { + ENA_REGS_RESET_NORMAL = 0, + + ENA_REGS_RESET_KEEP_ALIVE_TO = 1, + + ENA_REGS_RESET_ADMIN_TO = 2, + + ENA_REGS_RESET_MISS_TX_CMPL = 3, + + ENA_REGS_RESET_INV_RX_REQ_ID = 4, + + ENA_REGS_RESET_INV_TX_REQ_ID = 5, + + ENA_REGS_RESET_TOO_MANY_RX_DESCS = 6, + + ENA_REGS_RESET_INIT_ERR = 7, + + ENA_REGS_RESET_DRIVER_INVALID_STATE = 8, + + ENA_REGS_RESET_OS_TRIGGER = 9, + + ENA_REGS_RESET_OS_NETDEV_WD = 10, + + ENA_REGS_RESET_SHUTDOWN = 11, + + ENA_REGS_RESET_USER_TRIGGER = 12, + + ENA_REGS_RESET_GENERIC = 13, + + ENA_REGS_RESET_MISS_INTERRUPT = 14, +}; + +/* ena_registers offsets */ +#define ENA_REGS_VERSION_OFF 0x0 +#define ENA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define ENA_REGS_CAPS_OFF 0x8 +#define ENA_REGS_CAPS_EXT_OFF 0xc +#define ENA_REGS_AQ_BASE_LO_OFF 0x10 +#define ENA_REGS_AQ_BASE_HI_OFF 0x14 +#define ENA_REGS_AQ_CAPS_OFF 0x18 +#define ENA_REGS_ACQ_BASE_LO_OFF 0x20 +#define ENA_REGS_ACQ_BASE_HI_OFF 0x24 +#define ENA_REGS_ACQ_CAPS_OFF 0x28 +#define ENA_REGS_AQ_DB_OFF 0x2c +#define ENA_REGS_ACQ_TAIL_OFF 0x30 +#define ENA_REGS_AENQ_CAPS_OFF 0x34 +#define ENA_REGS_AENQ_BASE_LO_OFF 0x38 +#define ENA_REGS_AENQ_BASE_HI_OFF 0x3c +#define ENA_REGS_AENQ_HEAD_DB_OFF 0x40 +#define ENA_REGS_AENQ_TAIL_OFF 0x44 +#define ENA_REGS_INTR_MASK_OFF 0x4c +#define ENA_REGS_DEV_CTL_OFF 0x54 +#define ENA_REGS_DEV_STS_OFF 0x58 +#define ENA_REGS_MMIO_REG_READ_OFF 0x5c +#define ENA_REGS_MMIO_RESP_LO_OFF 0x60 +#define ENA_REGS_MMIO_RESP_HI_OFF 0x64 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 + +/* version register */ +#define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 +#define ENA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT 8 +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT 16 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT 24 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define ENA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT 1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT 8 +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT 16 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xffff0000 + +/* aenq_caps register */ +#define ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define 
ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xffff0000 + +/* dev_ctl register */ +#define ENA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_SHIFT 1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define ENA_REGS_DEV_CTL_QUIESCENT_SHIFT 2 +#define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 +#define ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 +#define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 +#define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define ENA_REGS_DEV_STS_READY_MASK 0x1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define ENA_REGS_DEV_STS_RESET_FINISHED_SHIFT 4 +#define ENA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define ENA_REGS_DEV_STS_FATAL_ERROR_SHIFT 5 +#define ENA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 + +/* mmio_reg_read register */ +#define ENA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT 16 +#define ENA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +/* rss_ind_entry_update register */ +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_INDEX_MASK 0xffff +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 + +#endif /*_ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c new file mode 100644 index 0000000000000..b8aa5387cb715 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -0,0 +1,268 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "ena_com.h" +#include "ena_netdev.h" +#include "ena_sysfs.h" + +struct dev_ext_ena_attribute { + struct device_attribute attr; + void *var; +}; + +#define to_ext_attr(x) container_of(x, struct dev_ext_ena_attribute, attr) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +static ssize_t ena_store_rx_copybreak(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + unsigned long rx_copybreak; + struct ena_ring *rx_ring; + int err, i; + + err = kstrtoul(buf, 10, &rx_copybreak); + if (err < 0) + return err; + + if (len > adapter->netdev->mtu) + return -EINVAL; + + rtnl_lock(); + adapter->rx_copybreak = rx_copybreak; + + for (i = 0; i < adapter->num_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + rx_ring->rx_copybreak = rx_copybreak; + } + rtnl_unlock(); + + return len; +} + +static ssize_t ena_show_rx_copybreak(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", adapter->rx_copybreak); +} + +static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, + ena_store_rx_copybreak); +#endif /* kernel version < 3.18 */ + + +/* adaptive interrupt moderation */ +static ssize_t ena_show_intr_moderation(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ena_intr_moder_entry entry; + struct dev_ext_ena_attribute *ea = to_ext_attr(attr); + enum ena_intr_moder_level level = (enum ena_intr_moder_level)ea->var; + struct ena_adapter *adapter = dev_get_drvdata(dev); + ssize_t rc = 0; + + ena_com_get_intr_moderation_entry(adapter->ena_dev, level, &entry); + + rc = sprintf(buf, "%u %u %u\n", + entry.intr_moder_interval, + entry.pkts_per_interval, + entry.bytes_per_interval); + + return rc; +} + +static ssize_t ena_store_intr_moderation(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ena_intr_moder_entry entry; + struct dev_ext_ena_attribute *ea = to_ext_attr(attr); + struct ena_adapter *adapter = dev_get_drvdata(dev); + enum ena_intr_moder_level level = (enum ena_intr_moder_level)ea->var; + int cnt; + + cnt = sscanf(buf, "%u %u %u", + &entry.intr_moder_interval, + &entry.pkts_per_interval, + &entry.bytes_per_interval); + + if (cnt != 3) + return -EINVAL; + + ena_com_init_intr_moderation_entry(adapter->ena_dev, level, &entry); + + return count; +} + +static ssize_t ena_store_intr_moderation_restore_default(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + unsigned long restore_default; + int err; + + err = kstrtoul(buf, 10, &restore_default); + if (err < 0) + return err; + + if (ena_com_interrupt_moderation_supported(ena_dev) && restore_default) { + ena_com_config_default_interrupt_moderation_table(ena_dev); + ena_com_enable_adaptive_moderation(ena_dev); + } + + return len; +} + +static ssize_t ena_store_enable_adaptive_intr_moderation(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + unsigned long enable_moderation; + int err; + + err = kstrtoul(buf, 10, &enable_moderation); + if (err < 0) + return err; + + if (enable_moderation == 0) + ena_com_disable_adaptive_moderation(adapter->ena_dev); + else + 
ena_com_enable_adaptive_moderation(adapter->ena_dev); + + return len; +} + +static ssize_t ena_show_enable_adaptive_intr_moderation(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", + ena_com_get_adaptive_moderation_enabled(adapter->ena_dev)); +} + +static DEVICE_ATTR(enable_adaptive_intr_moderation, S_IRUGO | S_IWUSR, + ena_show_enable_adaptive_intr_moderation, + ena_store_enable_adaptive_intr_moderation); + +static DEVICE_ATTR(intr_moderation_restore_default, S_IWUSR | S_IWGRP, + NULL, ena_store_intr_moderation_restore_default); + +#define INTR_MODERATION_PREPARE_ATTR(_name, _type) { \ + __ATTR(intr_moderation_##_name, (S_IRUGO | S_IWUSR | S_IWGRP), \ + ena_show_intr_moderation, ena_store_intr_moderation), \ + (void *)_type } + +/* Device attrs - intr moderation */ +static struct dev_ext_ena_attribute dev_attr_intr_moderation[] = { + INTR_MODERATION_PREPARE_ATTR(lowest, ENA_INTR_MODER_LOWEST), + INTR_MODERATION_PREPARE_ATTR(low, ENA_INTR_MODER_LOW), + INTR_MODERATION_PREPARE_ATTR(mid, ENA_INTR_MODER_MID), + INTR_MODERATION_PREPARE_ATTR(high, ENA_INTR_MODER_HIGH), + INTR_MODERATION_PREPARE_ATTR(highest, ENA_INTR_MODER_HIGHEST), +}; + +/****************************************************************************** + *****************************************************************************/ +int ena_sysfs_init(struct device *dev) +{ + int i, rc; + struct ena_adapter *adapter = dev_get_drvdata(dev); + + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) + if (device_create_file(dev, &dev_attr_rx_copybreak)) + dev_err(dev, "failed to create rx_copybreak sysfs entry"); +#endif + + if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) { + if (device_create_file(dev, + &dev_attr_intr_moderation_restore_default)) + dev_err(dev, + "failed to create intr_moderation_restore_default"); + + if (device_create_file(dev, + &dev_attr_enable_adaptive_intr_moderation)) + dev_err(dev, + "failed to create adaptive_intr_moderation_enable"); + + for (i = 0; i < ARRAY_SIZE(dev_attr_intr_moderation); i++) { + rc = sysfs_create_file(&dev->kobj, + &dev_attr_intr_moderation[i].attr.attr); + if (rc) { + dev_err(dev, + "%s: sysfs_create_file(intr_moderation %d) failed\n", + __func__, i); + return rc; + } + } + } + + return 0; +} + +/****************************************************************************** + *****************************************************************************/ +void ena_sysfs_terminate(struct device *dev) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev); + int i; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) + device_remove_file(dev, &dev_attr_rx_copybreak); +#endif + if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) { + for (i = 0; i < ARRAY_SIZE(dev_attr_intr_moderation); i++) + sysfs_remove_file(&dev->kobj, + &dev_attr_intr_moderation[i].attr.attr); + device_remove_file(dev, + &dev_attr_enable_adaptive_intr_moderation); + device_remove_file(dev, + &dev_attr_intr_moderation_restore_default); + } +} diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h new file mode 100644 index 0000000000000..dc0d4c90cd327 --- /dev/null +++ b/drivers/amazon/net/ena/ena_sysfs.h @@ -0,0 +1,55 @@ +/* + * Copyright 2015 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ENA_SYSFS_H__ +#define __ENA_SYSFS_H__ + +#ifdef CONFIG_SYSFS + +int ena_sysfs_init(struct device *dev); + +void ena_sysfs_terminate(struct device *dev); + +#else /* CONFIG_SYSFS */ + +static inline int ena_sysfs_init(struct device *dev) +{ + return 0; +} + +static inline void ena_sysfs_terminate(struct device *dev) +{ +} + +#endif /* CONFIG_SYSFS */ + +#endif /* __ENA_SYSFS_H__ */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h new file mode 100644 index 0000000000000..a945574f75805 --- /dev/null +++ b/drivers/amazon/net/ena/kcompat.h @@ -0,0 +1,570 @@ +/******************************************************************************* +Modified by Amazon 2015-2016. +Copyright 2015-2016, Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Modifications subject to the terms and conditions of the GNU General +Public License, version 2. +*******************************************************************************/ + +/******************************************************************************* + +Intel 10 Gigabit PCI Express Linux driver +Copyright(c) 1999 - 2013 Intel Corporation. + +This program is free software; you can redistribute it and/or modify it +under the terms and conditions of the GNU General Public License, +version 2, as published by the Free Software Foundation. + +This program is distributed in the hope it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + +The full GNU General Public License is included in this distribution in +the file called "COPYING". + +Contact Information: +e1000-devel Mailing List +Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + +*******************************************************************************/ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#ifndef LINUX_VERSION_CODE +#include +#else +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif + +#include + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) +#include +#endif + +#ifndef SZ_4K +#define SZ_4K 0x00001000 +#endif + +#ifndef SZ_256 +#define SZ_256 0x0000100 +#endif + +#ifdef HAVE_POLL_CONTROLLER +#define CONFIG_NET_POLL_CONTROLLER +#endif + +#define ENA_BUSY_POLL_SUPPORT defined(CONFIG_NET_RX_BUSY_POLL) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + +/******************************************************************************/ +/************************** Ubuntu macros *************************************/ +/******************************************************************************/ + +/* Ubuntu Release ABI is the 4th digit of their kernel version. You can find + * it in /usr/src/linux/$(uname -r)/include/generated/utsrelease.h for new + * enough versions of Ubuntu. Otherwise you can simply see it in the output of + * uname as the 4th digit of the kernel. The UTS_UBUNTU_RELEASE_ABI is not in + * the linux-source package, but in the linux-headers package. It begins to + * appear in later releases of 14.04 and 14.10. + * + * Ex: + * + * $uname -r + * 3.13.0-45-generic + * ABI is 45 + * + * + * $uname -r + * 3.16.0-23-generic + * ABI is 23 + */ +#ifndef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#define UBUNTU_VERSION_CODE 0 +#else + +#if UTS_UBUNTU_RELEASE_ABI > 255 +#undef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#endif /* UTS_UBUNTU_RELEASE_ABI > 255 */ + +/* Ubuntu does not provide actual release version macro, so we use the kernel + * version plus the ABI to generate a unique version code specific to Ubuntu. + * In addition, we mask the lower 8 bits of LINUX_VERSION_CODE in order to + * ignore differences in sublevel which are not important since we have the + * ABI value. Otherwise, it becomes impossible to correlate ABI to version for + * ordering checks. + */ +#define UBUNTU_VERSION_CODE (((LINUX_VERSION_CODE & ~0xFF) << 8) + (UTS_UBUNTU_RELEASE_ABI)) + +#endif /* UTS_UBUNTU_RELEASE_ABI */ + +/* Note that the 3rd digit is always zero, and will be ignored. This is + * because Ubuntu kernels are based on x.y.0-ABI values, and while their linux + * version codes are 3 digit, this 3rd digit is superseded by the ABI value. 
+ */ +#define UBUNTU_VERSION(a,b,c,d) ((KERNEL_VERSION(a,b,0) << 8) + (d)) + +/******************************************************************************/ +/**************************** SuSE macros *************************************/ +/******************************************************************************/ + +/* SuSE version macro is the same as Linux kernel version */ +#ifndef SLE_VERSION +#define SLE_VERSION(a,b,c) KERNEL_VERSION(a,b,c) +#endif +#ifdef CONFIG_SUSE_KERNEL +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,28) ) +/* SLES12 is at least 3.12.28+ based */ +#define SLE_VERSION_CODE SLE_VERSION(12,0,0) +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ + + +/******************************************************************************/ +/**************************** RHEL macros *************************************/ +/******************************************************************************/ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif +#ifndef AX_RELEASE_VERSION +#define AX_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif + +#ifndef AX_RELEASE_CODE +#define AX_RELEASE_CODE 0 +#endif + +#ifndef RHEL_RELEASE_CODE +#define RHEL_RELEASE_CODE 0 +#endif + +#if (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,0)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,0) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,1)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,1) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,2)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,3) +#endif + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,6)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_NET_DEVICE_OPS_EXT +#endif + +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,4)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT +#endif /* RHEL >= 6.4 && RHEL < 7.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) +#include +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ) +#ifndef netif_set_real_num_tx_queues +static inline int _kc_netif_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) +{ + netif_set_real_num_tx_queues(dev, txq); + return 0; +} +#define netif_set_real_num_tx_queues(dev, txq) \ + _kc_netif_set_real_num_tx_queues(dev, txq) +#endif +#ifndef netif_set_real_num_rx_queues +static inline int __kc_netif_set_real_num_rx_queues(struct net_device __always_unused *dev, + unsigned int __always_unused rxq) +{ + return 0; +} +#define netif_set_real_num_rx_queues(dev, rxq) \ + __kc_netif_set_real_num_rx_queues((dev), (rxq)) +#endif +#endif /* < 2.6.37 */ + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ) +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) +typedef u32 netdev_features_t; +#endif +#undef PCI_EXP_TYPE_RC_EC +#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ +#ifndef CONFIG_BQL +#define netdev_tx_completed_queue(_q, _p, _b) do {} while (0) +#define netdev_completed_queue(_n, _p, _b) do {} while (0) +#define netdev_tx_sent_queue(_q, _b) do {} while (0) 
+#define netdev_sent_queue(_n, _b) do {} while (0) +#define netdev_tx_reset_queue(_q) do {} while (0) +#define netdev_reset_queue(_n) do {} while (0) +#endif + +#endif /* < 3.3.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) ) +#ifndef skb_add_rx_frag +#define skb_add_rx_frag _kc_skb_add_rx_frag +static inline void _kc_skb_add_rx_frag(struct sk_buff *skb, int i, + struct page *page, int off, int size, + unsigned int truesize) +{ + skb_fill_page_desc(skb, i, page, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +#endif +#ifdef NET_ADDR_RANDOM +#define eth_hw_addr_random(N) do { \ + eth_random_addr(N->dev_addr); \ + N->addr_assign_type |= NET_ADDR_RANDOM; \ + } while (0) +#else /* NET_ADDR_RANDOM */ +#define eth_hw_addr_random(N) eth_random_addr(N->dev_addr) +#endif /* NET_ADDR_RANDOM */ +#if !(RHEL_RELEASE_CODE) +/* If probe retry doesn't define, return no device */ +#define EPROBE_DEFER ENODEV +#endif +#endif /* >= 3.4.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ) +#if !(RHEL_RELEASE_CODE) +static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) +{ + const u16 *a = (const u16 *)addr1; + const u16 *b = (const u16 *)addr2; + + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; +} +#endif +#endif /* >= 3.5.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ) +#ifndef eth_random_addr +#define eth_random_addr _kc_eth_random_addr +static inline void _kc_eth_random_addr(u8 *addr) +{ + get_random_bytes(addr, ETH_ALEN); + addr[0] &= 0xfe; /* clear multicast */ + addr[0] |= 0x02; /* set local assignment */ +} +#endif +#endif /* < 3.6.0 */ + +/******************************************************************************/ +#ifndef CONFIG_NET_RX_BUSY_POLL +static inline void skb_mark_napi_id(struct sk_buff *skb, + struct napi_struct *napi) +{ + +} + +static inline void napi_hash_del(struct napi_struct *napi) +{ + +} + +static inline void napi_hash_add(struct napi_struct *napi) +{ + +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) ) +/* cpu_rmap is buggy on older version and causes dead lock */ +#ifdef CONFIG_RFS_ACCEL +#undef CONFIG_RFS_ACCEL +#endif + +#if !(RHEL_RELEASE_CODE) +static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) +{ + return index % n_rx_rings; +} +#endif +#else /* >= 3.8.0 */ +#ifndef HAVE_SRIOV_CONFIGURE +#define HAVE_SRIOV_CONFIGURE +#endif +#endif /* >= 3.8.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) ) +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) ) +#if ( SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif +#endif /* >= 3.12.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) ) +#if (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= 
UBUNTU_VERSION(3,13,0,24)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#else +#define HAVE_NDO_SELECT_QUEUE_ACCEL +#endif +#else + +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +# define u64_stats_init(syncp) seqcount_init(syncp.seq) +#else +# define u64_stats_init(syncp) do { } while (0) +#endif + +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} +#endif /* SLE 12 */ + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,0)) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) +static inline int pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, + int minvec, + int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(dev, entries, nvec); + if (rc < 0) { + return rc; + } else if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} +#endif + +#endif /* >= 3.13.0 */ + +/*****************************************************************************/ +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,8) ) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) +enum pkt_hash_types { + PKT_HASH_TYPE_NONE, /* Undefined type */ + PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ + PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ + PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ +}; + +static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, + enum pkt_hash_types type) +{ + skb->l4_rxhash = (type == PKT_HASH_TYPE_L4); + skb->rxhash = hash; +} +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ) +/* for ndo_dfwd_ ops add_station, del_station and _start_xmit */ +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#else +#if !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4) \ + && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)) \ + || RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) && \ + !(UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) +static inline int pci_msix_vec_count(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return -EINVAL; + + pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); + return (control & 0x7FF) + 1; +} +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,0)) +static inline void ether_addr_copy(u8 *dst, const u8 *src) +{ + memcpy(dst, src, 6); +} +#endif /* SLE 12 */ +#endif /* RHEL 7 */ +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,8))) +#define napi_gro_flush(napi, flush_old) napi_gro_flush(napi) +#endif + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) || \ + (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE > UBUNTU_VERSION(3,13,0,24))) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ + (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4) \ + && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ + || RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#else 
+static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return u64_stats_fetch_retry(syncp, start); +} + +static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +{ + return u64_stats_fetch_begin(syncp); +} + +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +#undef GENMASK +#define GENMASK(h, l) (((U32_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#undef GENMASK_ULL +#define GENMASK_ULL(h, l) (((U64_C(1) << ((h) - (l) + 1)) - 1) << (l)) +#endif +/*****************************************************************************/ + +#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ) \ + || (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) \ + || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4) \ + && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)) \ + || RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#else +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + get_random_bytes(buffer, len); +} +#endif + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) + +static inline void napi_schedule_irqoff(struct napi_struct *n) +{ + napi_schedule(n); +} + +static inline void __napi_schedule_irqoff(struct napi_struct *n) +{ + __napi_schedule(n); +} + +#ifndef READ_ONCE +#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var)))) +#endif +#endif /* Kernel 3.19 */ + +/*****************************************************************************/ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) \ + || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ + || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ + || (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,19,0,51)) +#else +static inline void napi_complete_done(struct napi_struct *n, int work_done) +{ + napi_complete(n); +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) \ + || (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,126)) && \ + (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0)) + +#else + +static inline void ioremap_release(struct device *dev, void *res) +{ + iounmap(*(void __iomem **)res); +} + + +static inline void __iomem *devm_ioremap_wc(struct device *dev, + resource_size_t offset, + resource_size_t size) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_wc(offset, size); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +#endif + +#endif /* _KCOMPAT_H_ */ From 412c32a25854c01611dfeac3b6f2707e31a47299 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Wed, 12 Jul 2017 23:35:17 +0000 Subject: [PATCH 007/737] xen/manage: keep track of the on-going suspend mode To differentiate between Xen suspend, PM suspend and PM hibernation, keep track of the on-going suspend mode by mainly using a new PM notifier. Since Xen suspend doesn't have corresponding PM event, its main logic is modfied to acquire pm_mutex and set the current mode. Note that we may see deadlock if PM suspend/hibernation is interrupted by Xen suspend. 
PM suspend/hibernation depends on the xenwatch thread to process xenbus state transactions, but the thread will sleep waiting for pm_mutex, which is already held by the PM suspend/hibernation context in this scenario. Still, acquiring pm_mutex is the right thing to do, and we would need to modify the Xen shutdown code to avoid the issue. This will be fixed by a separate patch. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Sebastian Biemueller Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- drivers/xen/manage.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index cd046684e0d1b..3660b16552d44 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,16 @@ enum shutdown_state { /* Ignore multiple shutdown requests. */ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; +enum suspend_modes { + NO_SUSPEND = 0, + XEN_SUSPEND, + PM_SUSPEND, + PM_HIBERNATION, +}; + +/* Protected by pm_mutex */ +static enum suspend_modes suspend_mode = NO_SUSPEND; + struct suspend_info { int cancelled; }; @@ -99,6 +110,10 @@ static void do_suspend(void) int err; struct suspend_info si; + lock_system_sleep(); + + suspend_mode = XEN_SUSPEND; + shutting_down = SHUTDOWN_SUSPEND; err = freeze_processes(); @@ -162,6 +177,10 @@ static void do_suspend(void) thaw_processes(); out: shutting_down = SHUTDOWN_INVALID; + + suspend_mode = NO_SUSPEND; + + unlock_system_sleep(); } #endif /* CONFIG_HIBERNATE_CALLBACKS */ @@ -387,3 +406,42 @@ int xen_setup_shutdown_event(void) EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); subsys_initcall(xen_setup_shutdown_event); + +static int xen_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + suspend_mode = PM_SUSPEND; + break; + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + suspend_mode = PM_HIBERNATION; + break; + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + case PM_POST_HIBERNATION: + /* Set back to the default */ + suspend_mode = NO_SUSPEND; + break; + default: + pr_warn("Receive unknown PM event 0x%lx\n", pm_event); + return -EINVAL; + } + + return 0; +}; + +static struct notifier_block xen_pm_notifier_block = { + .notifier_call = xen_pm_notifier +}; + +static int xen_setup_pm_notifier(void) +{ + if (!xen_hvm_domain()) + return -ENODEV; + + return register_pm_notifier(&xen_pm_notifier_block); +} + +subsys_initcall(xen_setup_pm_notifier); From b0ed8d4d6c39a57575a49c5a0e2067a77dc16c55 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 13 Jul 2017 00:12:32 +0000 Subject: [PATCH 008/737] xen/manage: introduce helper function to know the on-going suspend mode Introduce simple helper functions that report the on-going suspend mode so that other Xen-related code can behave differently according to the current suspend mode.
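As an illustration only (not part of this patch; later patches in this series use the helpers this way), code that must stay out of the way while Xen suspend handles things itself can simply branch on the new predicates:

	/* Illustrative sketch: skip PM-specific handling during Xen suspend */
	if (xen_suspend_mode_is_xen_suspend())
		return 0;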
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Alakesh Haloi Reviewed-by: Sebastian Biemueller Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- drivers/xen/manage.c | 15 +++++++++++++++ include/xen/xen-ops.h | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 3660b16552d44..042fc68dc7a36 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -51,6 +51,21 @@ enum suspend_modes { /* Protected by pm_mutex */ static enum suspend_modes suspend_mode = NO_SUSPEND; +bool xen_suspend_mode_is_xen_suspend(void) +{ + return suspend_mode == XEN_SUSPEND; +} + +bool xen_suspend_mode_is_pm_suspend(void) +{ + return suspend_mode == PM_SUSPEND; +} + +bool xen_suspend_mode_is_pm_hibernation(void) +{ + return suspend_mode == PM_HIBERNATION; +} + struct suspend_info { int cancelled; }; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 39a5580f8feb0..5d7e168952acf 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -40,6 +40,10 @@ u64 xen_steal_clock(int cpu); int xen_setup_shutdown_event(void); +bool xen_suspend_mode_is_xen_suspend(void); +bool xen_suspend_mode_is_pm_suspend(void); +bool xen_suspend_mode_is_pm_hibernation(void); + extern unsigned long *xen_contiguous_bitmap; #if defined(CONFIG_XEN_PV) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) From 9fc350892abdccca1d3dc09407c137f897a532cd Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 13 Jul 2017 02:00:31 +0000 Subject: [PATCH 009/737] xenbus: add freeze/thaw/restore callbacks support Since commit b3e96c0c7562 ("xen: use freeze/restore/thaw PM events for suspend/resume/chkpt"), xenbus uses PMSG_FREEZE, PMSG_THAW and PMSG_RESTORE events for Xen suspend. However, they're actually assigned to xenbus_dev_suspend(), xenbus_dev_cancel() and xenbus_dev_resume() respectively, and only suspend and resume callbacks are supported at the driver level. To support PM suspend and PM hibernation, modify the bus-level PM callbacks to invoke not only the device driver's suspend/resume but also freeze/thaw/restore. Note that we'll use freeze/restore callbacks even for PM suspend, whereas suspend/resume callbacks are normally used in that case, because the existing xenbus device drivers already have suspend/resume callbacks specifically designed for Xen suspend. So we can allow the device drivers to keep the existing callbacks without modification.
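For illustration, a frontend driver that wants dedicated PM handling would extend its xenbus_driver roughly as in the sketch below; the example_* names are placeholders and not part of this patch. Drivers that only provide suspend/resume keep their current Xen suspend behaviour.

	static struct xenbus_driver example_front_driver = {
		.probe   = example_probe,
		.suspend = example_suspend,	/* Xen suspend */
		.resume  = example_resume,	/* Xen resume */
		.freeze  = example_freeze,	/* PM suspend/hibernation */
		.thaw    = example_thaw,
		.restore = example_restore,
	};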
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- drivers/xen/xenbus/xenbus_probe.c | 99 ++++++++++++++++++++++++++----- include/xen/xenbus.h | 3 + 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 98d870672dc5e..8e9198c904e37 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -600,26 +601,47 @@ int xenbus_dev_suspend(struct device *dev) struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); if (dev->driver == NULL) return 0; drv = to_xenbus_driver(dev->driver); - if (drv->suspend) - err = drv->suspend(xdev); - if (err) - dev_warn(dev, "suspend failed: %i\n", err); + + if (xen_suspend) + cb = drv->suspend; + else + cb = drv->freeze; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", xen_suspend ? + "suspend" : "freeze", err); + return err; + } + + if (!xen_suspend) { + /* Forget otherend since this can become stale after restore */ + free_otherend_watch(xdev); + free_otherend_details(xdev); + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); int xenbus_dev_resume(struct device *dev) { - int err; + int err = 0; struct xenbus_driver *drv; struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + int (*cb)(struct xenbus_device *) = NULL; + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); DPRINTK("%s", xdev->nodename); @@ -628,23 +650,32 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - dev_warn(dev, "resume (talk_to_otherend) failed: %i\n", err); + dev_warn(dev, "%s (talk_to_otherend) failed: %i\n", + xen_suspend ? "resume" : "restore", err); return err; } - xdev->state = XenbusStateInitialising; + if (xen_suspend) + xdev->state = XenbusStateInitialising; - if (drv->resume) { - err = drv->resume(xdev); - if (err) { - dev_warn(dev, "resume failed: %i\n", err); - return err; - } + if (xen_suspend) + cb = drv->resume; + else + cb = drv->restore; + + if (cb) + err = cb(xdev); + + if (err) { + dev_warn(dev, "%s failed: %i\n", + xen_suspend ? "resume" : "restore", err); + return err; } err = watch_otherend(xdev); if (err) { - dev_warn(dev, "resume (watch_otherend) failed: %d\n", err); + dev_warn(dev, "%s (watch_otherend) failed: %d.\n", + xen_suspend ? 
"resume" : "restore", err); return err; } @@ -654,8 +685,44 @@ EXPORT_SYMBOL_GPL(xenbus_dev_resume); int xenbus_dev_cancel(struct device *dev) { - /* Do nothing */ - DPRINTK("cancel"); + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); + bool xen_suspend = xen_suspend_mode_is_xen_suspend(); + + if (xen_suspend) { + /* Do nothing */ + DPRINTK("cancel"); + return 0; + } + + DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + + err = talk_to_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (talk_to_otherend) failed: %d.\n", err); + return err; + } + + if (drv->thaw) { + err = drv->thaw(xdev); + if (err) { + dev_warn(dev, "thaw failed: %i\n", err); + return err; + } + } + + err = watch_otherend(xdev); + if (err) { + dev_warn(dev, "thaw (watch_otherend) failed: %d.\n", err); + return err; + } + return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_cancel); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index bf3cfc7c35d0b..58190b842089d 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -113,6 +113,9 @@ struct xenbus_driver { int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); + int (*freeze)(struct xenbus_device *dev); + int (*thaw)(struct xenbus_device *dev); + int (*restore)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); struct device_driver driver; int (*read_otherend_details)(struct xenbus_device *dev); From c15b81021286a712cbb02b36d823db6dc365bf39 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Thu, 22 Feb 2018 21:52:42 +0000 Subject: [PATCH 010/737] x86/xen: Introduce new function to map HYPERVISOR_shared_info on Resume Introduce a small function which re-uses shared page's PA allocated during guest initialization time in reserve_shared_info() and not allocate new page during resume flow. It also does the mapping of shared_info_page by calling xen_hvm_init_shared_info() to use the function. Backport Notes: We don't need this commit 8d5ce0dad4ab2a4c8c8a3c36f6fb8c46b695b053 ("x86/xen: decouple shared_info mapping from xen_hvm_init_shared_info()") here since xen_hvm_init_shared_info changed in 4.14 kernel just to do the mapping and allocation of shared page is done in a separate function. 
We don't need to decouple this kernel API anymore Signed-off-by: Anchal Agarwal Reviewed-by: Sebastian Biemueller Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- arch/x86/xen/enlighten_hvm.c | 7 +++++++ arch/x86/xen/xen-ops.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ec50b7423a4c8..fa58de5f971a9 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -29,6 +29,13 @@ static unsigned long shared_info_pfn; +void xen_hvm_map_shared_info(void) +{ + xen_hvm_init_shared_info(); + if(shared_info_pfn) + HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); +} + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8695809b88f08..75cca4fc20473 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -55,6 +55,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); +void xen_hvm_map_shared_info(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); From 7d2df096153c8c97b23658c8ae5e82b5e44d41d7 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Sat, 11 Feb 2017 00:53:56 +0000 Subject: [PATCH 011/737] x86/xen: add system core suspend and resume callbacks Add Xen PVHVM specific system core callbacks for PM suspend and hibernation support. The callbacks suspend and resume Xen primitives, like shared_info, pvclock and grant table. Note that Xen suspend can handle them in a different manner, but system core callbacks are called from the context. So if the callbacks are called from Xen suspend context, return immediately. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- arch/x86/xen/enlighten_hvm.c | 1 + arch/x86/xen/suspend.c | 53 ++++++++++++++++++++++++++++++++++++ include/xen/xen-ops.h | 2 ++ 3 files changed, 56 insertions(+) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index fa58de5f971a9..3970821ec2928 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -219,6 +219,7 @@ static void __init xen_hvm_guest_init(void) if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; + xen_setup_syscore_ops(); xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); xen_unplug_emulated_devices(); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761bc..784c4484100bb 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -2,17 +2,22 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -82,3 +87,51 @@ void xen_arch_suspend(void) on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } + +static int xen_syscore_suspend(void) +{ + struct xen_remove_from_physmap xrfp; + int ret; + + /* Xen suspend does similar stuffs in its own logic */ + if (xen_suspend_mode_is_xen_suspend()) + return 0; + + xrfp.domid = DOMID_SELF; + xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; + + ret = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp); + if (!ret) + HYPERVISOR_shared_info = &xen_dummy_shared_info; + + return ret; +} + +static void xen_syscore_resume(void) +{ + /* Xen suspend does similar stuffs in its own logic */ + if 
(xen_suspend_mode_is_xen_suspend()) + return; + + /* No need to setup vcpu_info as it's already moved off */ + xen_hvm_map_shared_info(); + + pvclock_resume(); + + gnttab_resume(); +} + +/* + * These callbacks will be called with interrupts disabled and when having only + * one CPU online. + */ +static struct syscore_ops xen_hvm_syscore_ops = { + .suspend = xen_syscore_suspend, + .resume = xen_syscore_resume +}; + +void __init xen_setup_syscore_ops(void) +{ + if (xen_hvm_domain()) + register_syscore_ops(&xen_hvm_syscore_ops); +} diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 5d7e168952acf..d47efcaacddb4 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -44,6 +44,8 @@ bool xen_suspend_mode_is_xen_suspend(void); bool xen_suspend_mode_is_pm_suspend(void); bool xen_suspend_mode_is_pm_hibernation(void); +void xen_setup_syscore_ops(void); + extern unsigned long *xen_contiguous_bitmap; #if defined(CONFIG_XEN_PV) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) From da61f3472f26698fd067ca4086e4d0c0574ab16f Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 8 Jun 2017 19:15:55 +0000 Subject: [PATCH 012/737] xen-blkfront: add callbacks for PM suspend and hibernation Add freeze and restore callbacks for PM suspend and hibernation support. The freeze handler stops the block-layer queue and disconnects the frontend from the backend while freeing ring_info and associated resources. The restore handler re-allocates ring_info and re-connects to the backend, so the rest of the kernel can continue to use the block device transparently. Also, the handlers are used for both PM suspend and hibernation so that we can keep the existing suspend/resume callbacks for Xen suspend without modification. If a backend doesn't have commit 12ea729645ac ("xen/blkback: unmap all persistent grants when frontend gets disconnected"), the frontend may see a massive amount of grant table warnings when freeing resources. [ 36.852659] deferring g.e. 0xf9 (pfn 0xffffffffffffffff) [ 36.855089] xen:grant_table: WARNING: g.e. 0x112 still in use! In this case, persistent grants would need to be disabled. Ensure no reqs/rsps in rings before disconnecting. When disconnecting the frontend from the backend in blkfront_freeze(), there may still be unconsumed requests or responses in the rings, especially when the backend is backed by a network-based device. If the frontend gets disconnected with such reqs/rsps remaining there, it can cause grant warnings and/or losing reqs/rsps by freeing pages afterward. This can lead the resumed kernel into an unrecoverable state like unexpected freeing of a grant page and/or a hung task due to the lost reqs or rsps. Therefore we have to ensure that there are no unconsumed requests or responses before disconnecting. Actually, the frontend just needs to wait for some amount of time so that the backend can process the requests, put responses and notify the frontend back. The timeout used here is based on a heuristic. If we somehow hit the timeout, it would mean something serious has happened in the backend; the frontend will just return an error to the PM core and PM suspend/hibernation will be aborted. This may be something that should be fixed on the backend side, but a frontend side fix is probably still worth doing to work with broader backends. Backport Note: Unlike the 4.9 kernel, blk-mq is the default for the 4.14 kernel and request-based mode code is not included in this frontend driver.
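To give a rough sense of the heuristic (assuming a typical single-page ring of 32 request slots): the drain loop polls each ring every 25 ms for at most 25 ms * 32 = 800 ms, and the subsequent wait for the backend to finish disconnecting is bounded by an additional 5 s, the same timeout used in xenbus_dev_shutdown().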
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- drivers/block/xen-blkfront.c | 166 +++++++++++++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 9 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d68a8ca2161fb..d00968cc4cfb0 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include @@ -81,6 +83,8 @@ enum blkif_state { BLKIF_STATE_CONNECTED, BLKIF_STATE_SUSPENDED, BLKIF_STATE_ERROR, + BLKIF_STATE_FREEZING, + BLKIF_STATE_FROZEN, }; struct grant { @@ -229,6 +233,7 @@ struct blkfront_info struct list_head requests; struct bio_list bio_list; struct list_head info_list; + struct completion wait_backend_disconnected; }; static unsigned int nr_minors; @@ -270,6 +275,16 @@ static DEFINE_SPINLOCK(minor_lock); static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static void blkfront_gather_backend_features(struct blkfront_info *info); static int negotiate_mq(struct blkfront_info *info); +static void __blkif_free(struct blkfront_info *info); + +static inline bool blkfront_ring_is_busy(struct blkif_front_ring *ring) +{ + if (RING_SIZE(ring) > RING_FREE_REQUESTS(ring) || + RING_HAS_UNCONSUMED_RESPONSES(ring)) + return true; + else + return false; +} #define for_each_rinfo(info, ptr, idx) \ for ((ptr) = (info)->rinfo, (idx) = 0; \ @@ -1026,6 +1041,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, info->sector_size = sector_size; info->physical_sector_size = physical_sector_size; blkif_set_queue_limits(info); + init_completion(&info->wait_backend_disconnected); return 0; } @@ -1249,6 +1265,8 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* Already hold rinfo->ring_lock. */ static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { + if (unlikely(rinfo->dev_info->connected == BLKIF_STATE_FREEZING)) + return; if (!RING_FULL(&rinfo->ring)) blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } @@ -1373,9 +1391,6 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) static void blkif_free(struct blkfront_info *info, int suspend) { - unsigned int i; - struct blkfront_ring_info *rinfo; - /* Prevent new requests being issued until we fix things up. */ info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; @@ -1383,6 +1398,14 @@ static void blkif_free(struct blkfront_info *info, int suspend) if (info->rq) blk_mq_stop_hw_queues(info->rq); + __blkif_free(info); +} + +static void __blkif_free(struct blkfront_info *info) +{ + unsigned int i; + struct blkfront_ring_info *rinfo; + for_each_rinfo(info, rinfo, i) blkif_free_ring(rinfo); @@ -1594,8 +1617,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); - return IRQ_HANDLED; + if (info->connected != BLKIF_STATE_FREEZING) { + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + return IRQ_HANDLED; + } } spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2113,6 +2138,7 @@ static int blkif_recover(struct blkfront_info *info) unsigned int segs; struct blkfront_ring_info *rinfo; + bool frozen = info->connected == BLKIF_STATE_FROZEN; blkfront_gather_backend_features(info); /* Reset limits changed by blk_mq_update_nr_hw_queues(). 
*/ blkif_set_queue_limits(info); @@ -2134,6 +2160,9 @@ static int blkif_recover(struct blkfront_info *info) kick_pending_request_queues(rinfo); } + if (frozen) + return 0; + list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); @@ -2447,6 +2476,7 @@ static void blkfront_connect(struct blkfront_info *info) return; case BLKIF_STATE_SUSPENDED: + case BLKIF_STATE_FROZEN: /* * If we are recovering from suspension, we need to wait * for the backend to announce it's features before @@ -2564,13 +2594,38 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + if (info->connected == BLKIF_STATE_FREEZING) { + __blkif_free(info); + info->connected = BLKIF_STATE_FROZEN; + complete(&info->wait_backend_disconnected); + break; + } + + break; + } + + /* + * We may somehow receive backend's Closed again while thawing + * or restoring and it causes thawing or restoring to fail. + * Ignore such unexpected state anyway. + */ + if (info->connected == BLKIF_STATE_FROZEN && + dev->state == XenbusStateInitialised) { + dev_dbg(&dev->dev, + "ignore the backend's Closed state: %s", + dev->nodename); break; + } fallthrough; case XenbusStateClosing: - if (info) - blkfront_closing(info); - break; + if (info) { + if (info->connected == BLKIF_STATE_FREEZING) + xenbus_frontend_closed(dev); + else + blkfront_closing(info); + } + break; } } @@ -2713,6 +2768,96 @@ static void blkif_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&blkfront_mutex); } +static int blkfront_freeze(struct xenbus_device *dev) +{ + unsigned int i; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + struct blkfront_ring_info *rinfo; + struct blkif_front_ring *ring; + /* This would be reasonable timeout as used in xenbus_dev_shutdown() */ + unsigned int timeout = 5 * HZ; + int err = 0; + + info->connected = BLKIF_STATE_FREEZING; + + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) { + rinfo = &info->rinfo[i]; + + gnttab_cancel_free_callback(&rinfo->callback); + flush_work(&rinfo->work); + } + + for (i = 0; i < info->nr_rings; i++) { + spinlock_t *lock; + bool busy; + unsigned long req_timeout_ms = 25; + unsigned long ring_timeout; + + rinfo = &info->rinfo[i]; + ring = &rinfo->ring; + + lock = &rinfo->ring_lock; + + ring_timeout = jiffies + + msecs_to_jiffies(req_timeout_ms * RING_SIZE(ring)); + + do { + spin_lock_irq(lock); + busy = blkfront_ring_is_busy(ring); + spin_unlock_irq(lock); + + if (busy) + msleep(req_timeout_ms); + else + break; + } while (time_is_after_jiffies(ring_timeout)); + + /* Timed out */ + if (busy) { + xenbus_dev_error(dev, err, "the ring is still busy"); + info->connected = BLKIF_STATE_CONNECTED; + return -EBUSY; + } + } + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* + * We don't want to move forward before the frontend is diconnected + * from the backend cleanly. 
+ */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out;" + "the device may become inconsistent state"); + } + + return err; +} + +static int blkfront_restore(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + int err = 0; + + err = negotiate_mq(info); + if (err) + goto out; + + err = talk_to_blkback(dev, info); + if (err) + goto out; + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); + +out: + return err; +} + static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, @@ -2736,6 +2881,9 @@ static struct xenbus_driver blkfront_driver = { .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, + .freeze = blkfront_freeze, + .thaw = blkfront_restore, + .restore = blkfront_restore }; static void purge_persistent_grants(struct blkfront_info *info) From ca3562e12886ac0d6afc640e3cf04e585d87ee96 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Mon, 9 Jan 2017 23:36:52 +0000 Subject: [PATCH 013/737] xen-netfront: add callbacks for PM suspend and hibernation support Add freeze and restore callbacks for PM suspend and hibernation support. The freeze handler simply disconnects the frontend from the backend and frees resources associated with queues after disabling the net_device from the system. The restore handler just changes the frontend state and lets the xenbus handler re-allocate the resources and re-connect to the backend. This can be performed transparently to the rest of the system. The handlers are used for both PM suspend and hibernation so that we can keep the existing suspend/resume callbacks for Xen suspend without modification. Freezing netfront devices is normally expected to finish within a few hundred milliseconds, but it can rarely take more than 5 seconds and hit the hard-coded timeout; this depends on the backend state, which may be congested and/or have complex configuration. While it's a rare case, a longer default timeout seems a bit more reasonable here to avoid hitting the timeout. Also, make it configurable via a module parameter so that we can cover broader setups than what we know currently.
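As a usage note (illustrative values), the timeout can be changed at runtime via sysfs, since the parameter is created with 0644 permissions, or at load time if the driver is built as a module:

	# echo 30 > /sys/module/xen_netfront/parameters/freeze_timeout_secs
	# modprobe xen-netfront freeze_timeout_secs=30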
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Eduardo Valentin Reviewed-by: Munehisa Kamata --- drivers/net/xen-netfront.c | 97 +++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 3d149890fa36e..945d8dd5aaf26 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,12 @@ #include #include +enum netif_freeze_state { + NETIF_FREEZE_STATE_UNFROZEN, + NETIF_FREEZE_STATE_FREEZING, + NETIF_FREEZE_STATE_FROZEN, +}; + /* Module parameters */ #define MAX_QUEUES_DEFAULT 8 static unsigned int xennet_max_queues; @@ -72,6 +79,12 @@ MODULE_PARM_DESC(trusted, "Is the backend trusted"); #define XENNET_TIMEOUT (5 * HZ) +static unsigned int netfront_freeze_timeout_secs = 10; +module_param_named(freeze_timeout_secs, + netfront_freeze_timeout_secs, uint, 0644); +MODULE_PARM_DESC(freeze_timeout_secs, + "timeout when freezing netfront device in seconds"); + static const struct ethtool_ops xennet_ethtool_ops; struct netfront_cb { @@ -183,6 +196,10 @@ struct netfront_info { bool bounce; atomic_t rx_gso_checksum_fixup; + + int freeze_state; + + struct completion wait_backend_disconnected; }; struct netfront_rx_info { @@ -913,6 +930,21 @@ static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) spin_unlock_irqrestore(&queue->rx_cons_lock, flags); } +static int xennet_disable_interrupts(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + unsigned int num_queues = dev->real_num_tx_queues; + unsigned int i; + struct netfront_queue *queue; + + for (i = 0; i < num_queues; ++i) { + queue = &np->queues[i]; + disable_irq(queue->tx_irq); + disable_irq(queue->rx_irq); + } + return 0; +} + static void xennet_move_rx_slot(struct netfront_queue *queue, struct sk_buff *skb, grant_ref_t ref) { @@ -1724,6 +1756,8 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) np->queues = NULL; + init_completion(&np->wait_backend_disconnected); + err = -ENOMEM; np->rx_stats = netdev_alloc_pcpu_stats(struct netfront_stats); if (np->rx_stats == NULL) @@ -2280,6 +2314,50 @@ static int xennet_create_queues(struct netfront_info *info, return 0; } +static int netfront_freeze(struct xenbus_device *dev) +{ + struct netfront_info *info = dev_get_drvdata(&dev->dev); + unsigned long timeout = netfront_freeze_timeout_secs * HZ; + int err = 0; + + xennet_disable_interrupts(info->netdev); + + netif_device_detach(info->netdev); + + info->freeze_state = NETIF_FREEZE_STATE_FREEZING; + + /* Kick the backend to disconnect */ + xenbus_switch_state(dev, XenbusStateClosing); + + /* We don't want to move forward before the frontend is diconnected + * from the backend cleanly. + */ + timeout = wait_for_completion_timeout(&info->wait_backend_disconnected, + timeout); + if (!timeout) { + err = -EBUSY; + xenbus_dev_error(dev, err, "Freezing timed out;" + "the device may become inconsistent state"); + return err; + } + + /* Tear down queues */ + xennet_disconnect_backend(info); + xennet_destroy_queues(info); + + info->freeze_state = NETIF_FREEZE_STATE_FROZEN; + + return err; +} + +static int netfront_restore(struct xenbus_device *dev) +{ + /* Kick the backend to re-connect */ + xenbus_switch_state(dev, XenbusStateInitialising); + + return 0; +} + /* Common code used when first setting up, and when resuming. 
*/ static int talk_to_netback(struct xenbus_device *dev, struct netfront_info *info) @@ -2512,6 +2590,8 @@ static int xennet_connect(struct net_device *dev) spin_unlock_bh(&queue->rx_lock); } + np->freeze_state = NETIF_FREEZE_STATE_UNFROZEN; + return 0; } @@ -2549,10 +2629,22 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + /* dpm context is waiting for the backend */ + if (np->freeze_state == NETIF_FREEZE_STATE_FREEZING) + complete(&np->wait_backend_disconnected); break; + } fallthrough; /* Missed the backend's CLOSING state */ case XenbusStateClosing: + /* We may see unexpected Closed or Closing from the backend. + * Just ignore it not to prevent the frontend from being + * re-connected in the case of PM suspend or hibernation. + */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN && + dev->state == XenbusStateInitialising) { + break; + } xenbus_frontend_closed(dev); break; } @@ -2715,6 +2807,9 @@ static struct xenbus_driver netfront_driver = { .probe = netfront_probe, .remove = xennet_remove, .resume = netfront_resume, + .freeze = netfront_freeze, + .thaw = netfront_restore, + .restore = netfront_restore, .otherend_changed = netback_changed, }; From f35fa341d4a86764bd510010734db0c0facfc566 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 13 Jul 2017 07:22:39 +0000 Subject: [PATCH 014/737] xen/time: introduce xen_{save,restore}_steal_clock Currently, the steal time accounting code in the scheduler expects the steal clock callback to provide a monotonically increasing value. If the accounting code receives a smaller value than the previous one, it uses a negative value to calculate steal time and results in incorrectly updated idle and steal time accounting. This breaks userspace tools which read /proc/stat. top - 08:05:35 up 2:12, 3 users, load average: 0.00, 0.07, 0.23 Tasks: 80 total, 1 running, 79 sleeping, 0 stopped, 0 zombie Cpu(s): 0.0%us, 0.0%sy, 0.0%ni,30100.0%id, 0.0%wa, 0.0%hi, 0.0%si,-1253874204672.0%st This can actually happen when a Xen PVHVM guest gets restored from hibernation, because such a restored guest is just a fresh domain from Xen's perspective and the time information in runstate info starts over from scratch. This patch introduces xen_save_steal_clock(), which saves the current values in runstate info into per-cpu variables. Its counterpart, xen_restore_steal_clock(), sets an offset if it finds that the current values in runstate info are smaller than the previous ones. xen_steal_clock() is also modified to use the offset to ensure that the scheduler only sees a monotonically increasing number.
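For illustration, the intended call pattern looks roughly like the sketch below; the actual x86 wiring is added by a follow-up patch in this series:

	/* while suspending: runstate info of all present CPUs is still readable */
	for_each_present_cpu(cpu)
		xen_save_steal_clock(cpu);

	/* after resume, once a CPU's runstate info is registered again */
	xen_restore_steal_clock(cpu);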
Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- drivers/xen/time.c | 29 ++++++++++++++++++++++++++++- include/xen/xen-ops.h | 2 ++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 108edbcbc040f..87a1fd88989dc 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -23,6 +23,9 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); static DEFINE_PER_CPU(u64[4], old_runstate_time); +static DEFINE_PER_CPU(u64, xen_prev_steal_clock); +static DEFINE_PER_CPU(u64, xen_steal_clock_offset); + /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) { @@ -149,7 +152,7 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -u64 xen_steal_clock(int cpu) +static u64 __xen_steal_clock(int cpu) { struct vcpu_runstate_info state; @@ -157,6 +160,30 @@ u64 xen_steal_clock(int cpu) return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; } +u64 xen_steal_clock(int cpu) +{ + return __xen_steal_clock(cpu) + per_cpu(xen_steal_clock_offset, cpu); +} + +void xen_save_steal_clock(int cpu) +{ + per_cpu(xen_prev_steal_clock, cpu) = xen_steal_clock(cpu); +} + +void xen_restore_steal_clock(int cpu) +{ + u64 steal_clock = __xen_steal_clock(cpu); + + if (per_cpu(xen_prev_steal_clock, cpu) > steal_clock) { + /* Need to update the offset */ + per_cpu(xen_steal_clock_offset, cpu) = + per_cpu(xen_prev_steal_clock, cpu) - steal_clock; + } else { + /* Avoid unnecessary steal clock warp */ + per_cpu(xen_steal_clock_offset, cpu) = 0; + } +} + void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index d47efcaacddb4..bd1d993676a97 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -37,6 +37,8 @@ void xen_time_setup_guest(void); void xen_manage_runstate_time(int action); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); u64 xen_steal_clock(int cpu); +void xen_save_steal_clock(int cpu); +void xen_restore_steal_clock(int cpu); int xen_setup_shutdown_event(void); From 3197ae2413edeb1dd241ee10876b5bed89c7f436 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Fri, 21 Jul 2017 06:06:12 +0000 Subject: [PATCH 015/737] x86/xen: save and restore steal clock Save steal clock values of all present CPUs in the system core ops suspend callbacks. Also, restore a boot CPU's steal clock in the system core resume callback. For non-boot CPUs, restore after they're brought up, because runstate info for non-boot CPUs are not active until then. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- arch/x86/xen/suspend.c | 13 ++++++++++++- arch/x86/xen/time.c | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 784c4484100bb..dae0f74f5390d 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -91,12 +91,20 @@ void xen_arch_suspend(void) static int xen_syscore_suspend(void) { struct xen_remove_from_physmap xrfp; - int ret; + int cpu, ret; /* Xen suspend does similar stuffs in its own logic */ if (xen_suspend_mode_is_xen_suspend()) return 0; + for_each_present_cpu(cpu) { + /* + * Nonboot CPUs are already offline, but the last copy of + * runstate info is still accessible. 
+ */ + xen_save_steal_clock(cpu); + } + xrfp.domid = DOMID_SELF; xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; @@ -118,6 +126,9 @@ static void xen_syscore_resume(void) pvclock_resume(); + /* Nonboot CPUs will be resumed when they're brought up */ + xen_restore_steal_clock(smp_processor_id()); + gnttab_resume(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 8183d17e1cf17..4cb4491ba4e26 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -546,6 +546,9 @@ static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); xen_setup_runstate_info(cpu); + if (cpu) + xen_restore_steal_clock(cpu); + /* * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence * doing it xen_hvm_cpu_notify (which gets called by smp_init during From ca89ff3382c0a648a0cf2462062a53b60b091de3 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 24 Aug 2017 22:54:14 +0000 Subject: [PATCH 016/737] xen/events: add xen_shutdown_pirqs helper function Add a simple helper function to "shutdown" active PIRQs, which actually closes event channels but keeps related IRQ structures intact. PM suspend/hibernation code will rely on this. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- drivers/xen/events/events_base.c | 12 ++++++++++++ include/xen/events.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index fba78daee449a..970d37c1e3a78 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2044,6 +2044,18 @@ void xen_irq_resume(void) restore_pirqs(); } +void xen_shutdown_pirqs(void) +{ + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ || !VALID_EVTCHN(info->evtchn)) + continue; + + shutdown_pirq(irq_get_irq_data(info->irq)); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", diff --git a/include/xen/events.h b/include/xen/events.h index 8ec418e30c7fb..64046a929ae79 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -84,6 +84,7 @@ static inline void notify_remote_via_evtchn(evtchn_port_t port) void notify_remote_via_irq(int irq); void xen_irq_resume(void); +void xen_shutdown_pirqs(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); From 052cc6fa4eb22b428923be88a52b6eed4839b286 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 24 Aug 2017 22:56:36 +0000 Subject: [PATCH 017/737] x86/xen: close event channels for PIRQs in system core suspend callback Close event channels allocated for devices which are backed by PIRQ and still active when suspending the system core. Normally, the devices are emulated legacy devices, e.g. PS/2 keyboard, floppy controller and etc. Without this, in PM hibernation, information about the event channel remains in hibernation image, but there is no guarantee that the same event channel numbers are assigned to the devices when restoring the system. This may cause conflict like the following and prevent some devices from being restored correctly. [ 102.330821] ------------[ cut here ]------------ [ 102.333264] WARNING: CPU: 0 PID: 2324 at drivers/xen/events/events_base.c:878 bind_evtchn_to_irq+0x88/0xf0 ... 
[ 102.348057] Call Trace: [ 102.348057] [] dump_stack+0x63/0x84 [ 102.348057] [] __warn+0xd1/0xf0 [ 102.348057] [] warn_slowpath_null+0x1d/0x20 [ 102.348057] [] bind_evtchn_to_irq+0x88/0xf0 [ 102.348057] [] ? blkif_copy_from_grant+0xb0/0xb0 [xen_blkfront] [ 102.348057] [] bind_evtchn_to_irqhandler+0x27/0x80 [ 102.348057] [] talk_to_blkback+0x425/0xcd0 [xen_blkfront] [ 102.348057] [] ? __kmalloc+0x1ea/0x200 [ 102.348057] [] blkfront_restore+0x2d/0x60 [xen_blkfront] [ 102.348057] [] xenbus_dev_restore+0x58/0x100 [ 102.348057] [] ? xenbus_frontend_delayed_resume+0x20/0x20 [ 102.348057] [] xenbus_dev_cond_restore+0x1e/0x30 [ 102.348057] [] dpm_run_callback+0x4e/0x130 [ 102.348057] [] device_resume+0xe7/0x210 [ 102.348057] [] ? pm_dev_dbg+0x80/0x80 [ 102.348057] [] dpm_resume+0x114/0x2f0 [ 102.348057] [] hibernation_snapshot+0x15f/0x380 [ 102.348057] [] hibernate+0x183/0x290 [ 102.348057] [] state_store+0xcf/0xe0 [ 102.348057] [] kobj_attr_store+0xf/0x20 [ 102.348057] [] sysfs_kf_write+0x3a/0x50 [ 102.348057] [] kernfs_fop_write+0x10b/0x190 [ 102.348057] [] __vfs_write+0x28/0x120 [ 102.348057] [] ? rw_verify_area+0x49/0xb0 [ 102.348057] [] vfs_write+0xb2/0x1b0 [ 102.348057] [] SyS_write+0x46/0xa0 [ 102.348057] [] entry_SYSCALL_64_fastpath+0x1a/0xa9 [ 102.423005] ---[ end trace b8d6718e22e2b107 ]--- [ 102.425031] genirq: Flags mismatch irq 6. 00000000 (blkif) vs. 00000000 (floppy) Note that we don't explicitly re-allocate event channels for such devices in the resume callback. Re-allocation will occur when PM core re-enable IRQs for the devices at later point. Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- arch/x86/xen/suspend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index dae0f74f5390d..affa63d4b6bdc 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -105,6 +105,8 @@ static int xen_syscore_suspend(void) xen_save_steal_clock(cpu); } + xen_shutdown_pirqs(); + xrfp.domid = DOMID_SELF; xrfp.gpfn = __pa(HYPERVISOR_shared_info) >> PAGE_SHIFT; From 22751dc50176ec8573b2e3a7b43af52d5d94c723 Mon Sep 17 00:00:00 2001 From: Aleksei Besogonov Date: Fri, 27 Oct 2017 17:59:18 +0000 Subject: [PATCH 018/737] PM / hibernate: update the resume offset on SNAPSHOT_SET_SWAP_AREA The SNAPSHOT_SET_SWAP_AREA is supposed to be used to set the hibernation offset on a running kernel to enable hibernating to a swap file. However, it doesn't actually update the swsusp_resume_block variable. As a result, the hibernation fails at the last step (after all the data is written out) in the validation of the swap signature in mark_swapfiles(). Before this patch, the command line processing was the only place where swsusp_resume_block was set. Signed-off-by: Aleksei Besogonov Signed-off-by: Munehisa Kamata Signed-off-by: Anchal Agarwal Reviewed-by: Munehisa Kamata Reviewed-by: Eduardo Valentin --- kernel/power/user.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/power/user.c b/kernel/power/user.c index 13cca2e2c2bc6..2b2535dc2ea26 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -239,6 +239,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, if (data->swap < 0) return swdev ? 
-ENODEV : -EINVAL; data->dev = swdev; + + swsusp_resume_device = swdev; + swsusp_resume_block = offset; + return 0; } From b164d2bed8d21bf2ed41fce459435764063fb290 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Tue, 27 Mar 2018 17:23:50 +0000 Subject: [PATCH 019/737] Revert "xen: dont fiddle with event channel masking in suspend/resume" This reverts commit e91b2b1194335ca83d8a40fa4e0efd480bf2babe. Event channels are supposed to be masked during resume; however, they are not, which causes special interrupts like PV spinlock to hit a kernel BUG() as it expects the IRQ to be masked. This causes instances that were live migrated successfully to crash after a few minutes. Signed-off-by: Anchal Agarwal Signed-off-by: Eduardo Valentin Reviewed-by: Frank van der Linden Reviewed-by: Alakesh Haloi Reviewed-by: Vallish Vaidyeshwara --- drivers/xen/events/events_base.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 970d37c1e3a78..69ce4b3d2105a 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -493,6 +493,14 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu) info->cpu = cpu; } +static void xen_evtchn_mask_all(void) +{ + evtchn_port_t evtchn; + + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -2028,6 +2036,7 @@ void xen_irq_resume(void) struct irq_info *info; /* New event-channel space is not 'live' yet. */ + xen_evtchn_mask_all(); xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ @@ -2173,7 +2182,6 @@ static int xen_evtchn_cpu_dead(unsigned int cpu) void __init xen_init_IRQ(void) { int ret = -EINVAL; - evtchn_port_t evtchn; if (xen_fifo_events) ret = xen_evtchn_fifo_init(); @@ -2193,8 +2201,7 @@ void __init xen_init_IRQ(void) BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; From f41df834f507f3fb11644a04625126cf693519ab Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Tue, 5 Jun 2018 20:51:31 +0000 Subject: [PATCH 020/737] xen-blkfront: Fixed blkfront_restore to remove a call to negotiate_mq The code for the talk_to_blkback API changed in kernel 4.14.45 to include a call to negotiate_mq. Subsequent calls cause a kernel panic: [ 84.440105] Call Trace: [ 84.443707] talk_to_blkback+0x6d/0x8b0 [xen_blkfront] [ 84.449147] blkfront_restore+0x33/0x60 [xen_blkfront] [ 84.453336] ? xenbus_read_otherend_details+0x50/0xb0 [ 84.457804] xenbus_dev_cancel+0x5f/0x160 [ 84.463286] ? xenbus_dev_resume+0x170/0x170 [ 84.466891] dpm_run_callback+0x3b/0x100 [ 84.470516] device_resume+0x10d/0x420 [ 84.473844] dpm_resume+0xfd/0x2f0 [ 84.476984] hibernation_snapshot+0x218/0x410 [ 84.480794] hibernate+0x14b/0x270 [ 84.484030] state_store+0x50/0x60 [ 84.487443] kernfs_fop_write+0x105/0x180 [ 84.492695] __vfs_write+0x36/0x160 [ 84.496672] ? 
__audit_syscall_entry+0xbc/0x110 [ 84.502123] vfs_write+0xad/0x1a0 [ 84.506857] SyS_write+0x52/0xc0 [ 84.511420] do_syscall_64+0x67/0x100 [ 84.516365] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 84.522571] RIP: 0033:0x7f44a03407e4 [ 84.526210] RSP: 002b:00007ffd5e0ec3c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 84.534041] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f44a03407e4 [ 84.542571] RDX: 0000000000000004 RSI: 0000000001e94990 RDI: 0000000000000001 [ 84.549142] RBP: 0000000001e94990 R08: 00007f44a060c8c0 R09: 00007f44a0c57740 [ 84.554658] R10: 00007f44a03cd320 R11: 0000000000000246 R12: 0000000000000004 [ 84.560411] R13: 0000000000000001 R14: 00007f44a060b760 R15: 0000000000000004 [ 84.565744] Code: 39 ab e8 00 00 00 77 8a 31 c0 5b 5d c3 44 8b 05 50 57 00 00 45 85 c0 0f 84 2f ff ff ff 89 c0 48 69 f8 e0 40 01 00 e9 30 ff ff ff <0f> 0b 48 8b 7b 28 48 c7 c2 78 58 16 a0 be f4 ff ff ff e8 7e 37 [ 84.580594] RIP: negotiate_mq+0x12b/0x150 [xen_blkfront] RSP: ffffc90000ebbc70 Signed-off-by: Anchal Agarwal Reviewed-by: Frank van der Linden Reviewed-by: Vallish Vaidyeshwara --- drivers/block/xen-blkfront.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d00968cc4cfb0..3107f05524572 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2844,11 +2844,6 @@ static int blkfront_restore(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; - - err = negotiate_mq(info); - if (err) - goto out; - err = talk_to_blkback(dev, info); if (err) goto out; From 7b96826f26017243545eca4fdf59693cc7c2b47e Mon Sep 17 00:00:00 2001 From: Eduardo Valentin Date: Mon, 9 Apr 2018 20:50:29 +0000 Subject: [PATCH 021/737] x86: tsc: avoid system instability in hibernation System instability is seen during resume from hibernation when the system is under heavy CPU load. This is due to the sched clock data not being updated; the scheduler then thinks that heavy CPU hog tasks need more time on the CPU, causing the system to freeze during the unfreezing of tasks. For example, threaded IRQs and kernel processes servicing network interfaces may be delayed for several tens of seconds, causing the system to be unreachable. Situations like this can be reported by lockup detectors such as the workqueue lockup detector: [root@ip-172-31-67-114 ec2-user]# echo disk > /sys/power/state Message from syslogd@ip-172-31-67-114 at May 7 18:23:21 ... kernel:BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 57s! Message from syslogd@ip-172-31-67-114 at May 7 18:23:21 ... kernel:BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 57s! Message from syslogd@ip-172-31-67-114 at May 7 18:23:21 ... kernel:BUG: workqueue lockup - pool cpus=3 node=0 flags=0x1 nice=0 stuck for 57s! Message from syslogd@ip-172-31-67-114 at May 7 18:29:06 ... kernel:BUG: workqueue lockup - pool cpus=3 node=0 flags=0x1 nice=0 stuck for 403s! The fix for this situation is to mark the sched clock as unstable as early as possible in the resume path, leaving it unstable for the duration of the resume process. This will force the scheduler to attempt to align the sched clock across CPUs using the delta with time of day, updating the sched clock data. On the post-hibernation event, we can then mark the sched clock as stable again, avoiding unnecessary syncs with time of day on systems in which the TSC is reliable. 
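For context, the change relies on the standard PM notifier chain rather than hooks inside the hibernation core. The sketch below is a simplified illustration of that pattern; it mirrors the hunk added to arch/x86/kernel/tsc.c further down, with the return value and comments adjusted for clarity. set_sched_clock_stable() is the helper this patch itself exposes, while register_pm_notifier(), clear_sched_clock_stable() and check_tsc_unstable() are existing kernel APIs.

/* Illustrative sketch of the PM-notifier pattern used by this fix. */
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/sched/clock.h>
#include <asm/tsc.h>

static int tsc_pm_notifier(struct notifier_block *nb,
			   unsigned long pm_event, void *unused)
{
	switch (pm_event) {
	case PM_HIBERNATION_PREPARE:
		/* Marked before the image is created, so the resumed kernel
		 * comes back with the sched clock already flagged unstable
		 * and keeps re-syncing it against time of day.
		 */
		clear_sched_clock_stable();
		break;
	case PM_POST_HIBERNATION:
		/* Hibernation cycle is over (including after resume from the
		 * image): restore stability only if the TSC itself was never
		 * marked unstable.
		 */
		if (!check_tsc_unstable())
			set_sched_clock_stable();
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block tsc_pm_notifier_block = {
	.notifier_call = tsc_pm_notifier,
};

static int __init tsc_setup_pm_notifier(void)
{
	return register_pm_notifier(&tsc_pm_notifier_block);
}
subsys_initcall(tsc_setup_pm_notifier);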
Reviewed-by: Erik Quanstrom Reviewed-by: Frank van der Linden Reviewed-by: Balbir Singh Reviewed-by: Munehisa Kamata Tested-by: Anchal Agarwal Signed-off-by: Eduardo Valentin --- arch/x86/kernel/tsc.c | 29 +++++++++++++++++++++++++++++ include/linux/sched/clock.h | 5 +++++ kernel/sched/clock.c | 4 ++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 13d1a0ac8916a..fdf97fe5b08db 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -1574,3 +1575,31 @@ unsigned long calibrate_delay_is_known(void) return 0; } #endif + +static int tsc_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_HIBERNATION_PREPARE: + clear_sched_clock_stable(); + break; + case PM_POST_HIBERNATION: + /* Set back to the default */ + if (!check_tsc_unstable()) + set_sched_clock_stable(); + break; + } + + return 0; +}; + +static struct notifier_block tsc_pm_notifier_block = { + .notifier_call = tsc_pm_notifier, +}; + +static int tsc_setup_pm_notifier(void) +{ + return register_pm_notifier(&tsc_pm_notifier_block); +} + +subsys_initcall(tsc_setup_pm_notifier); diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e03..902654ac5f7e7 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -32,6 +32,10 @@ static inline void clear_sched_clock_stable(void) { } +static inline void set_sched_clock_stable(void) +{ +} + static inline void sched_clock_idle_sleep_event(void) { } @@ -51,6 +55,7 @@ static inline u64 local_clock(void) } #else extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); /* diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 12bca64dff731..fc7bf3ef711e6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -116,7 +116,7 @@ static void __scd_stamp(struct sched_clock_data *scd) scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +void set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -236,7 +236,7 @@ static int __init sched_clock_init_late(void) smp_mb(); /* matches {set,clear}_sched_clock_stable() */ if (__sched_clock_stable_early) - __set_sched_clock_stable(); + set_sched_clock_stable(); return 0; } From 95b57ba045366253aa44b26486be41b08d216c72 Mon Sep 17 00:00:00 2001 From: Eduardo Valentin Date: Thu, 18 Oct 2018 22:13:48 +0000 Subject: [PATCH 022/737] block: xen-blkfront: consider new dom0 features on restore On a regular start, the instance performs a regular boot, in which the rootfs is mounted according to the xen-blkback features (in particular feature-barrier and feature-flush-cache). That will set up the journal according to the features provided on the superblock. On a start from hibernation, the instance boots, detects that a hibernation image is present, pushes the image to memory and jumps back to where it was. There is no regular mount of the rootfs; it uses the data structures already present in the previously saved memory image. Now, when the instance hibernates, it may move from its original dom0 to a new dom0 when it is restarted. So, given the above, if the xen-blkback features change, then the guest can be in trouble. It seems the original assumption was that the dom0 environment would be preserved. 
I did a couple of experiments, and I confirm that these particular features change quite a lot across hibernation attempts: [ 2343.157903] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 2444.712339] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 2537.105884] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 2636.641298] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 2729.868349] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 2827.118979] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 2924.812599] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3018.063399] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3116.685040] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3209.164475] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 3317.981362] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 3415.939725] blkfront: xvda: flush diskcache: enabled; persistent grants: disabled; indirect descriptors: enabled; [ 3514.202478] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; [ 3619.355791] blkfront: xvda: barrier or flush: disabled; persistent grants: disabled; indirect descriptors: enabled; Now, considering the above, this patch fixes the following scenario: a. The instance boots and sets up its bio queue on a dom0 A with softbarrier supported. b. It hibernates. c. When asked to restore, the instance is back on a dom0 B where softbarrier is unsupported. d. Restoration goes well until the next journal commit is issued. Remember that it is still using the rootfs data structures from the previous image, and therefore it is going to request a softbarrier. e. The bio will error out with an "operation not supported" message and cause the journal to fail, and the filesystem will decide to remount the rootfs as RO. [ 1138.909290] print_req_error: operation not supported error, dev xvda, sector 4470400, flags 6008 [ 1139.025685] Aborting journal on device xvda1-8. [ 1139.029758] print_req_error: operation not supported error, dev xvda, sector 4460544, flags 26008 [ 1139.326119] Buffer I/O error on dev xvda1, logical block 0, lost sync page write [ 1139.331398] EXT4-fs error (device xvda1): ext4_journal_check_start:61: Detected aborted journal [ 1139.337296] EXT4-fs (xvda1): Remounting filesystem read-only [ 1139.341006] EXT4-fs (xvda1): previous I/O error to superblock detected [ 1139.345704] print_req_error: operation not supported error, dev xvda, sector 4096, flags 26008 The fix is essentially to read xenbus to query the new xen-blkback capabilities and update the request queue accordingly. 
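To make the intent concrete, a hedged sketch of the idea follows; it is not the driver's actual code, which achieves the same effect via blkfront_gather_backend_features() and xlvbd_flush() in the hunk below. The sketch re-reads the backend's feature nodes from xenstore on restore and reconfigures the queue's write-cache setting so the journal only issues operations the new backend supports. xenbus_read_unsigned() and blk_queue_write_cache() are existing kernel APIs; the use of info->xbdev and info->rq is a simplification of the blkfront private state, and FUA handling is omitted.

/* Simplified illustration only; the real patch uses
 * blkfront_gather_backend_features() and xlvbd_flush().
 */
#include <linux/blkdev.h>
#include <xen/xenbus.h>

static void blkfront_requery_backend_features(struct blkfront_info *info)
{
	unsigned int flush, barrier;

	/* Backend capabilities are published in xenstore under the other
	 * end's path; they may differ on the new dom0 after restore.
	 */
	flush = xenbus_read_unsigned(info->xbdev->otherend,
				     "feature-flush-cache", 0);
	barrier = xenbus_read_unsigned(info->xbdev->otherend,
				       "feature-barrier", 0);

	/* Reconfigure the queue so upper layers (e.g. the ext4 journal)
	 * only send flushes the new backend actually supports.
	 */
	blk_queue_write_cache(info->rq, flush || barrier, false);
}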
Reviewed-by: Balbir Singh Reviewed-by: Vallish Vaidyeshwara Signed-off-by: Eduardo Valentin --- drivers/block/xen-blkfront.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 3107f05524572..e9dbdf68b99b3 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2844,6 +2844,9 @@ static int blkfront_restore(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; + + blkfront_gather_backend_features(info); + xlvbd_flush(info); err = talk_to_blkback(dev, info); if (err) goto out; From 696eecc4b71825b35467f67d1b329142ce528263 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 26 Oct 2018 21:27:54 +0000 Subject: [PATCH 023/737] xen: restore pirqs on resume from hibernation. The hibernation code unlinks event channels from these (legacy) IRQs, so they must be reinitialized on wakeup, much like in the Xen suspend/resume case. Signed-off-by: Frank van der Linden Reviewed-by: Cristian Gafton Reviewed-by: Anchal Agarwal Reviewed-by: Alakesh Haloi --- arch/x86/xen/suspend.c | 2 ++ drivers/xen/events/events_base.c | 5 +++++ include/xen/events.h | 1 + 3 files changed, 8 insertions(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index affa63d4b6bdc..39644923b623e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -132,6 +132,8 @@ static void xen_syscore_resume(void) xen_restore_steal_clock(smp_processor_id()); gnttab_resume(); + + xen_restore_pirqs(); } /* diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 69ce4b3d2105a..b9568d5c477c0 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2065,6 +2065,11 @@ void xen_shutdown_pirqs(void) } } +void xen_restore_pirqs(void) +{ + restore_pirqs(); +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", diff --git a/include/xen/events.h b/include/xen/events.h index 64046a929ae79..4c174577ef512 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -85,6 +85,7 @@ void notify_remote_via_irq(int irq); void xen_irq_resume(void); void xen_shutdown_pirqs(void); +void xen_restore_pirqs(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); From 0381352fd39b2e9b9e48826e41056fe60ef15856 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sat, 10 Nov 2018 00:18:32 +0000 Subject: [PATCH 024/737] xen: Only restore the ACPI SCI interrupt in xen_restore_pirqs. Restoring all PIRQs, which is the right thing to do, was causing problems on larger instances. This is a horrible workaround until this issue is fully understood. Signed-off-by: Frank van der Linden Reviewed-by: Alakesh Haloi Reviewed-by: Anchal Agarwal Reviewed-by: Qian Lu --- drivers/xen/events/events_base.c | 42 +++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b9568d5c477c0..064999298c393 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -65,6 +65,10 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif + #include "events_internal.h" #undef MODULE_PARAM_PREFIX @@ -2065,9 +2069,45 @@ void xen_shutdown_pirqs(void) } } +/* + * For now, only restore the ACPI SCI pirq. 
+ */ void xen_restore_pirqs(void) { - restore_pirqs(); +#ifdef CONFIG_ACPI + int pirq, rc, irq, gsi; + struct physdev_map_pirq map_irq; + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + + pirq = info->u.pirq.pirq; + gsi = info->u.pirq.gsi; + irq = info->irq; + + if (gsi != acpi_gbl_FADT.sci_interrupt) + continue; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = pirq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + pr_warn("xen: ACPI SCI interrupt map failed, rc=%d\n", + rc); + xen_free_irq(irq); + continue; + } + + printk(KERN_DEBUG "xen: restored ACPI SCI interrupt\n"); + + __startup_pirq(irq); + } +#endif } static struct irq_chip xen_dynamic_chip __read_mostly = { From 4c28cd1053083e3b77e0ea4936ce33b5b927a2d2 Mon Sep 17 00:00:00 2001 From: Alakesh Haloi Date: Sat, 10 Nov 2018 00:53:11 +0000 Subject: [PATCH 025/737] net: ena: Import the ENA v2 driver (2.0.2g) Signed-off-by: Alakesh Haloi Reviewed-by: Frank van der Linden Reviewed-by: Anchal Agarwal --- drivers/amazon/net/ena/ena_admin_defs.h | 515 ++++++++----- drivers/amazon/net/ena/ena_com.c | 470 ++++++++++-- drivers/amazon/net/ena/ena_com.h | 110 ++- drivers/amazon/net/ena/ena_common_defs.h | 7 +- drivers/amazon/net/ena/ena_eth_com.c | 336 ++++++--- drivers/amazon/net/ena/ena_eth_com.h | 125 ++- drivers/amazon/net/ena/ena_eth_io_defs.h | 232 +++--- drivers/amazon/net/ena/ena_ethtool.c | 87 ++- drivers/amazon/net/ena/ena_netdev.c | 917 ++++++++++++++--------- drivers/amazon/net/ena/ena_netdev.h | 43 +- drivers/amazon/net/ena/ena_pci_id_tbl.h | 0 drivers/amazon/net/ena/ena_regs_defs.h | 209 +++--- drivers/amazon/net/ena/ena_sysfs.c | 6 - drivers/amazon/net/ena/ena_sysfs.h | 0 drivers/amazon/net/ena/kcompat.h | 126 ++-- 15 files changed, 2121 insertions(+), 1062 deletions(-) mode change 100644 => 100755 drivers/amazon/net/ena/ena_admin_defs.h mode change 100644 => 100755 drivers/amazon/net/ena/ena_common_defs.h mode change 100644 => 100755 drivers/amazon/net/ena/ena_eth_io_defs.h mode change 100644 => 100755 drivers/amazon/net/ena/ena_ethtool.c mode change 100644 => 100755 drivers/amazon/net/ena/ena_netdev.c mode change 100644 => 100755 drivers/amazon/net/ena/ena_netdev.h mode change 100644 => 100755 drivers/amazon/net/ena/ena_pci_id_tbl.h mode change 100644 => 100755 drivers/amazon/net/ena/ena_regs_defs.h mode change 100644 => 100755 drivers/amazon/net/ena/ena_sysfs.c mode change 100644 => 100755 drivers/amazon/net/ena/ena_sysfs.h mode change 100644 => 100755 drivers/amazon/net/ena/kcompat.h diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h old mode 100644 new mode 100755 index 4532e574ebcdc..8da5f41d28fdb --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -32,115 +33,86 @@ #ifndef _ENA_ADMIN_H_ #define _ENA_ADMIN_H_ -enum ena_admin_aq_opcode { - ENA_ADMIN_CREATE_SQ = 1, - - ENA_ADMIN_DESTROY_SQ = 2, - - ENA_ADMIN_CREATE_CQ = 3, - - ENA_ADMIN_DESTROY_CQ = 4, - - ENA_ADMIN_GET_FEATURE = 8, +#define ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN 32 +#define ENA_ADMIN_EXTRA_PROPERTIES_COUNT 32 - ENA_ADMIN_SET_FEATURE = 9, - - ENA_ADMIN_GET_STATS = 11, +enum ena_admin_aq_opcode { + ENA_ADMIN_CREATE_SQ = 1, + ENA_ADMIN_DESTROY_SQ = 2, + ENA_ADMIN_CREATE_CQ = 3, + ENA_ADMIN_DESTROY_CQ = 4, + ENA_ADMIN_GET_FEATURE = 8, + ENA_ADMIN_SET_FEATURE = 9, + ENA_ADMIN_GET_STATS = 11, }; enum ena_admin_aq_completion_status { - ENA_ADMIN_SUCCESS = 0, - - ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, - - ENA_ADMIN_BAD_OPCODE = 2, - - ENA_ADMIN_UNSUPPORTED_OPCODE = 3, - - ENA_ADMIN_MALFORMED_REQUEST = 4, - + ENA_ADMIN_SUCCESS = 0, + ENA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + ENA_ADMIN_BAD_OPCODE = 2, + ENA_ADMIN_UNSUPPORTED_OPCODE = 3, + ENA_ADMIN_MALFORMED_REQUEST = 4, /* Additional status is provided in ACQ entry extended_status */ - ENA_ADMIN_ILLEGAL_PARAMETER = 5, - - ENA_ADMIN_UNKNOWN_ERROR = 6, + ENA_ADMIN_ILLEGAL_PARAMETER = 5, + ENA_ADMIN_UNKNOWN_ERROR = 6, + ENA_ADMIN_RESOURCE_BUSY = 7, }; enum ena_admin_aq_feature_id { - ENA_ADMIN_DEVICE_ATTRIBUTES = 1, - - ENA_ADMIN_MAX_QUEUES_NUM = 2, - - ENA_ADMIN_HW_HINTS = 3, - - ENA_ADMIN_RSS_HASH_FUNCTION = 10, - - ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11, - - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG = 12, - - ENA_ADMIN_MTU = 14, - - ENA_ADMIN_RSS_HASH_INPUT = 18, - - ENA_ADMIN_INTERRUPT_MODERATION = 20, - - ENA_ADMIN_AENQ_CONFIG = 26, - - ENA_ADMIN_LINK_CONFIG = 27, - - ENA_ADMIN_HOST_ATTR_CONFIG = 28, - - ENA_ADMIN_FEATURES_OPCODE_NUM = 32, + ENA_ADMIN_DEVICE_ATTRIBUTES = 1, + ENA_ADMIN_MAX_QUEUES_NUM = 2, + ENA_ADMIN_HW_HINTS = 3, + ENA_ADMIN_LLQ = 4, + ENA_ADMIN_EXTRA_PROPERTIES_STRINGS = 5, + ENA_ADMIN_EXTRA_PROPERTIES_FLAGS = 6, + ENA_ADMIN_MAX_QUEUES_EXT = 7, + ENA_ADMIN_RSS_HASH_FUNCTION = 10, + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11, + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG = 12, + ENA_ADMIN_MTU = 14, + ENA_ADMIN_RSS_HASH_INPUT = 18, + ENA_ADMIN_INTERRUPT_MODERATION = 20, + ENA_ADMIN_AENQ_CONFIG = 26, + ENA_ADMIN_LINK_CONFIG = 27, + ENA_ADMIN_HOST_ATTR_CONFIG = 28, + ENA_ADMIN_FEATURES_OPCODE_NUM = 32, }; enum ena_admin_placement_policy_type { /* descriptors and headers are in host memory */ - ENA_ADMIN_PLACEMENT_POLICY_HOST = 1, - + ENA_ADMIN_PLACEMENT_POLICY_HOST = 1, /* descriptors and headers are in device memory (a.k.a Low Latency * Queue) */ - ENA_ADMIN_PLACEMENT_POLICY_DEV = 3, + ENA_ADMIN_PLACEMENT_POLICY_DEV = 3, }; enum ena_admin_link_types { - ENA_ADMIN_LINK_SPEED_1G = 0x1, - - ENA_ADMIN_LINK_SPEED_2_HALF_G = 0x2, - - ENA_ADMIN_LINK_SPEED_5G = 0x4, - - ENA_ADMIN_LINK_SPEED_10G = 0x8, - - ENA_ADMIN_LINK_SPEED_25G = 0x10, - - ENA_ADMIN_LINK_SPEED_40G = 0x20, - - ENA_ADMIN_LINK_SPEED_50G = 0x40, - - ENA_ADMIN_LINK_SPEED_100G = 0x80, - - ENA_ADMIN_LINK_SPEED_200G = 0x100, - - ENA_ADMIN_LINK_SPEED_400G = 0x200, + ENA_ADMIN_LINK_SPEED_1G = 0x1, + ENA_ADMIN_LINK_SPEED_2_HALF_G = 0x2, + ENA_ADMIN_LINK_SPEED_5G = 0x4, + ENA_ADMIN_LINK_SPEED_10G = 0x8, + ENA_ADMIN_LINK_SPEED_25G = 0x10, + ENA_ADMIN_LINK_SPEED_40G = 0x20, + ENA_ADMIN_LINK_SPEED_50G = 0x40, + ENA_ADMIN_LINK_SPEED_100G = 0x80, + ENA_ADMIN_LINK_SPEED_200G = 0x100, + ENA_ADMIN_LINK_SPEED_400G = 0x200, }; enum ena_admin_completion_policy_type { /* completion queue entry for each sq descriptor 
*/ - ENA_ADMIN_COMPLETION_POLICY_DESC = 0, - + ENA_ADMIN_COMPLETION_POLICY_DESC = 0, /* completion queue entry upon request in sq descriptor */ - ENA_ADMIN_COMPLETION_POLICY_DESC_ON_DEMAND = 1, - + ENA_ADMIN_COMPLETION_POLICY_DESC_ON_DEMAND = 1, /* current queue head pointer is updated in OS memory upon sq * descriptor request */ - ENA_ADMIN_COMPLETION_POLICY_HEAD_ON_DEMAND = 2, - + ENA_ADMIN_COMPLETION_POLICY_HEAD_ON_DEMAND = 2, /* current queue head pointer is updated in OS memory for each sq * descriptor */ - ENA_ADMIN_COMPLETION_POLICY_HEAD = 3, + ENA_ADMIN_COMPLETION_POLICY_HEAD = 3, }; /* basic stats return ena_admin_basic_stats while extanded stats return a @@ -148,15 +120,13 @@ enum ena_admin_completion_policy_type { * device id */ enum ena_admin_get_stats_type { - ENA_ADMIN_GET_STATS_TYPE_BASIC = 0, - - ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, + ENA_ADMIN_GET_STATS_TYPE_BASIC = 0, + ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, }; enum ena_admin_get_stats_scope { - ENA_ADMIN_SPECIFIC_QUEUE = 0, - - ENA_ADMIN_ETH_TRAFFIC = 1, + ENA_ADMIN_SPECIFIC_QUEUE = 0, + ENA_ADMIN_ETH_TRAFFIC = 1, }; struct ena_admin_aq_common_desc { @@ -227,7 +197,9 @@ struct ena_admin_acq_common_desc { u16 extended_status; - /* serves as a hint what AQ entries can be revoked */ + /* indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ u16 sq_head_indx; }; @@ -296,9 +268,8 @@ struct ena_admin_aq_create_sq_cmd { }; enum ena_admin_sq_direction { - ENA_ADMIN_SQ_DIRECTION_TX = 1, - - ENA_ADMIN_SQ_DIRECTION_RX = 2, + ENA_ADMIN_SQ_DIRECTION_TX = 1, + ENA_ADMIN_SQ_DIRECTION_RX = 2, }; struct ena_admin_acq_create_sq_resp_desc { @@ -456,7 +427,13 @@ struct ena_admin_get_set_feature_common_desc { /* as appears in ena_admin_aq_feature_id */ u8 feature_id; - u16 reserved16; + /* The driver specifies the max feature version it supports and the + * device responds with the currently supported feature version. The + * field is zero based + */ + u8 feature_version; + + u8 reserved8; }; struct ena_admin_device_attr_feature_desc { @@ -483,8 +460,118 @@ struct ena_admin_device_attr_feature_desc { u32 max_mtu; }; +enum ena_admin_llq_header_location { + /* header is in descriptor list */ + ENA_ADMIN_INLINE_HEADER = 1, + /* header in a separate ring, implies 16B descriptor list entry */ + ENA_ADMIN_HEADER_RING = 2, +}; + +enum ena_admin_llq_ring_entry_size { + ENA_ADMIN_LIST_ENTRY_SIZE_128B = 1, + ENA_ADMIN_LIST_ENTRY_SIZE_192B = 2, + ENA_ADMIN_LIST_ENTRY_SIZE_256B = 4, +}; + +enum ena_admin_llq_num_descs_before_header { + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_0 = 0, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1 = 1, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2 = 2, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4 = 4, + ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8 = 8, +}; + +/* packet descriptor list entry always starts with one or more descriptors, + * followed by a header. The rest of the descriptors are located in the + * beginning of the subsequent entry. Stride refers to how the rest of the + * descriptors are placed. This field is relevant only for inline header + * mode + */ +enum ena_admin_llq_stride_ctrl { + ENA_ADMIN_SINGLE_DESC_PER_ENTRY = 1, + ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY = 2, +}; + +struct ena_admin_feature_llq_desc { + u32 max_llq_num; + + u32 max_llq_depth; + + /* specify the header locations the device supports. bitfield of + * enum ena_admin_llq_header_location. + */ + u16 header_location_ctrl_supported; + + /* the header location the driver selected to use. 
*/ + u16 header_location_ctrl_enabled; + + /* if inline header is specified - this is the size of descriptor + * list entry. If header in a separate ring is specified - this is + * the size of header ring entry. bitfield of enum + * ena_admin_llq_ring_entry_size. specify the entry sizes the device + * supports + */ + u16 entry_size_ctrl_supported; + + /* the entry size the driver selected to use. */ + u16 entry_size_ctrl_enabled; + + /* valid only if inline header is specified. First entry associated + * with the packet includes descriptors and header. Rest of the + * entries occupied by descriptors. This parameter defines the max + * number of descriptors precedding the header in the first entry. + * The field is bitfield of enum + * ena_admin_llq_num_descs_before_header and specify the values the + * device supports + */ + u16 desc_num_before_header_supported; + + /* the desire field the driver selected to use */ + u16 desc_num_before_header_enabled; + + /* valid only if inline was chosen. bitfield of enum + * ena_admin_llq_stride_ctrl + */ + u16 descriptors_stride_ctrl_supported; + + /* the stride control the driver selected to use */ + u16 descriptors_stride_ctrl_enabled; + + /* Maximum size in bytes taken by llq entries in a single tx burst. + * Set to 0 when there is no such limit. + */ + u32 max_tx_burst_size; +}; + +struct ena_admin_queue_ext_feature_fields { + u32 max_tx_sq_num; + + u32 max_tx_cq_num; + + u32 max_rx_sq_num; + + u32 max_rx_cq_num; + + u32 max_tx_sq_depth; + + u32 max_tx_cq_depth; + + u32 max_rx_sq_depth; + + u32 max_rx_cq_depth; + + u32 max_tx_header_size; + + /* Maximum Descriptors number, including meta descriptor, allowed for + * a single Tx packet + */ + u16 max_per_packet_tx_descs; + + /* Maximum Descriptors number allowed for a single Rx packet */ + u16 max_per_packet_rx_descs; +}; + struct ena_admin_queue_feature_desc { - /* including LLQs */ u32 max_sq_num; u32 max_sq_depth; @@ -493,9 +580,9 @@ struct ena_admin_queue_feature_desc { u32 max_cq_depth; - u32 max_llq_num; + u32 max_legacy_llq_num; - u32 max_llq_depth; + u32 max_legacy_llq_depth; u32 max_header_size; @@ -513,6 +600,14 @@ struct ena_admin_set_feature_mtu_desc { u32 mtu; }; +struct ena_admin_get_extra_properties_strings_desc { + u32 count; +}; + +struct ena_admin_get_extra_properties_flags_desc { + u32 flags; +}; + struct ena_admin_set_feature_host_attr_desc { /* host OS info base address in OS memory. 
host info is 4KB of * physically contiguous @@ -583,9 +678,8 @@ struct ena_admin_feature_offload_desc { }; enum ena_admin_hash_functions { - ENA_ADMIN_TOEPLITZ = 1, - - ENA_ADMIN_CRC32 = 2, + ENA_ADMIN_TOEPLITZ = 1, + ENA_ADMIN_CRC32 = 2, }; struct ena_admin_feature_rss_flow_hash_control { @@ -611,50 +705,35 @@ struct ena_admin_feature_rss_flow_hash_function { /* RSS flow hash protocols */ enum ena_admin_flow_hash_proto { - ENA_ADMIN_RSS_TCP4 = 0, - - ENA_ADMIN_RSS_UDP4 = 1, - - ENA_ADMIN_RSS_TCP6 = 2, - - ENA_ADMIN_RSS_UDP6 = 3, - - ENA_ADMIN_RSS_IP4 = 4, - - ENA_ADMIN_RSS_IP6 = 5, - - ENA_ADMIN_RSS_IP4_FRAG = 6, - - ENA_ADMIN_RSS_NOT_IP = 7, - + ENA_ADMIN_RSS_TCP4 = 0, + ENA_ADMIN_RSS_UDP4 = 1, + ENA_ADMIN_RSS_TCP6 = 2, + ENA_ADMIN_RSS_UDP6 = 3, + ENA_ADMIN_RSS_IP4 = 4, + ENA_ADMIN_RSS_IP6 = 5, + ENA_ADMIN_RSS_IP4_FRAG = 6, + ENA_ADMIN_RSS_NOT_IP = 7, /* TCPv6 with extension header */ - ENA_ADMIN_RSS_TCP6_EX = 8, - + ENA_ADMIN_RSS_TCP6_EX = 8, /* IPv6 with extension header */ - ENA_ADMIN_RSS_IP6_EX = 9, - - ENA_ADMIN_RSS_PROTO_NUM = 16, + ENA_ADMIN_RSS_IP6_EX = 9, + ENA_ADMIN_RSS_PROTO_NUM = 16, }; /* RSS flow hash fields */ enum ena_admin_flow_hash_fields { /* Ethernet Dest Addr */ - ENA_ADMIN_RSS_L2_DA = BIT(0), - + ENA_ADMIN_RSS_L2_DA = BIT(0), /* Ethernet Src Addr */ - ENA_ADMIN_RSS_L2_SA = BIT(1), - + ENA_ADMIN_RSS_L2_SA = BIT(1), /* ipv4/6 Dest Addr */ - ENA_ADMIN_RSS_L3_DA = BIT(2), - + ENA_ADMIN_RSS_L3_DA = BIT(2), /* ipv4/6 Src Addr */ - ENA_ADMIN_RSS_L3_SA = BIT(3), - + ENA_ADMIN_RSS_L3_SA = BIT(3), /* tcp/udp Dest Port */ - ENA_ADMIN_RSS_L4_DP = BIT(4), - + ENA_ADMIN_RSS_L4_DP = BIT(4), /* tcp/udp Src Port */ - ENA_ADMIN_RSS_L4_SP = BIT(5), + ENA_ADMIN_RSS_L4_SP = BIT(5), }; struct ena_admin_proto_input { @@ -693,15 +772,13 @@ struct ena_admin_feature_rss_flow_hash_input { }; enum ena_admin_os_type { - ENA_ADMIN_OS_LINUX = 1, - - ENA_ADMIN_OS_WIN = 2, - - ENA_ADMIN_OS_DPDK = 3, - - ENA_ADMIN_OS_FREEBSD = 4, - - ENA_ADMIN_OS_IPXE = 5, + ENA_ADMIN_OS_LINUX = 1, + ENA_ADMIN_OS_WIN = 2, + ENA_ADMIN_OS_DPDK = 3, + ENA_ADMIN_OS_FREEBSD = 4, + ENA_ADMIN_OS_IPXE = 5, + ENA_ADMIN_OS_ESXI = 6, + ENA_ADMIN_OS_GROUPS_NUM = 6, }; struct ena_admin_host_info { @@ -723,11 +800,27 @@ struct ena_admin_host_info { /* 7:0 : major * 15:8 : minor * 23:16 : sub_minor + * 31:24 : module_type */ u32 driver_version; /* features bitmap */ - u32 supported_network_features[4]; + u32 supported_network_features[2]; + + /* ENA spec version of driver */ + u16 ena_spec_version; + + /* ENA device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* Number of CPUs */ + u16 num_cpus; + + u16 reserved; }; struct ena_admin_rss_ind_table_entry { @@ -746,7 +839,12 @@ struct ena_admin_feature_rss_ind_table { /* table size (2^size) */ u16 size; - u16 reserved; + /* 0 : one_entry_update - The ENA device supports + * setting a single RSS table entry + */ + u8 flags; + + u8 reserved; /* index of the inline entry. 
0xFFFFFFFF means invalid */ u32 inline_index; @@ -792,6 +890,19 @@ struct ena_admin_get_feat_cmd { u32 raw[11]; }; +struct ena_admin_queue_ext_feature_desc { + /* version */ + u8 version; + + u8 reserved1[3]; + + union { + struct ena_admin_queue_ext_feature_fields max_queue_ext; + + u32 raw[10]; + } ; +}; + struct ena_admin_get_feat_resp { struct ena_admin_acq_common_desc acq_common_desc; @@ -800,8 +911,12 @@ struct ena_admin_get_feat_resp { struct ena_admin_device_attr_feature_desc dev_attr; + struct ena_admin_feature_llq_desc llq; + struct ena_admin_queue_feature_desc max_queue; + struct ena_admin_queue_ext_feature_desc max_queue_ext; + struct ena_admin_feature_aenq_desc aenq; struct ena_admin_get_feature_link_desc link; @@ -817,6 +932,10 @@ struct ena_admin_get_feat_resp { struct ena_admin_feature_intr_moder_desc intr_moderation; struct ena_admin_ena_hw_hints hw_hints; + + struct ena_admin_get_extra_properties_strings_desc extra_properties_strings; + + struct ena_admin_get_extra_properties_flags_desc extra_properties_flags; } u; }; @@ -847,6 +966,9 @@ struct ena_admin_set_feat_cmd { /* rss indirection table */ struct ena_admin_feature_rss_ind_table ind_table; + + /* LLQ configuration */ + struct ena_admin_feature_llq_desc llq; } u; }; @@ -863,7 +985,9 @@ struct ena_admin_aenq_common_desc { u16 syndrom; - /* 0 : phase */ + /* 0 : phase + * 7:1 : reserved - MBZ + */ u8 flags; u8 reserved1[3]; @@ -875,25 +999,18 @@ struct ena_admin_aenq_common_desc { /* asynchronous event notification groups */ enum ena_admin_aenq_group { - ENA_ADMIN_LINK_CHANGE = 0, - - ENA_ADMIN_FATAL_ERROR = 1, - - ENA_ADMIN_WARNING = 2, - - ENA_ADMIN_NOTIFICATION = 3, - - ENA_ADMIN_KEEP_ALIVE = 4, - - ENA_ADMIN_AENQ_GROUPS_NUM = 5, + ENA_ADMIN_LINK_CHANGE = 0, + ENA_ADMIN_FATAL_ERROR = 1, + ENA_ADMIN_WARNING = 2, + ENA_ADMIN_NOTIFICATION = 3, + ENA_ADMIN_KEEP_ALIVE = 4, + ENA_ADMIN_AENQ_GROUPS_NUM = 5, }; enum ena_admin_aenq_notification_syndrom { - ENA_ADMIN_SUSPEND = 0, - - ENA_ADMIN_RESUME = 1, - - ENA_ADMIN_UPDATE_HINTS = 2, + ENA_ADMIN_SUSPEND = 0, + ENA_ADMIN_RESUME = 1, + ENA_ADMIN_UPDATE_HINTS = 2, }; struct ena_admin_aenq_entry { @@ -928,27 +1045,27 @@ struct ena_admin_ena_mmio_req_read_less_resp { }; /* aq_common_desc */ -#define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) -#define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) -#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT 1 -#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) -#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT 2 -#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) +#define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT 1 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT 2 +#define ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) /* sq */ -#define ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT 5 -#define ENA_ADMIN_SQ_SQ_DIRECTION_MASK GENMASK(7, 5) +#define ENA_ADMIN_SQ_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_SQ_SQ_DIRECTION_MASK GENMASK(7, 5) /* acq_common_desc */ -#define ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) -#define ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) +#define ENA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) /* aq_create_sq_cmd */ -#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT 5 -#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK GENMASK(7, 5) 
-#define ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK GENMASK(3, 0) -#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT 4 -#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK GENMASK(6, 4) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_SHIFT 5 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_SQ_DIRECTION_MASK GENMASK(7, 5) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_PLACEMENT_POLICY_MASK GENMASK(3, 0) +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_SHIFT 4 +#define ENA_ADMIN_AQ_CREATE_SQ_CMD_COMPLETION_POLICY_MASK GENMASK(6, 4) #define ENA_ADMIN_AQ_CREATE_SQ_CMD_IS_PHYSICALLY_CONTIGUOUS_MASK BIT(0) /* aq_create_cq_cmd */ @@ -957,12 +1074,12 @@ struct ena_admin_ena_mmio_req_read_less_resp { #define ENA_ADMIN_AQ_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) /* get_set_feature_common_desc */ -#define ENA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) +#define ENA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) /* get_feature_link_desc */ -#define ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK BIT(0) -#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_SHIFT 1 -#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK BIT(1) +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_AUTONEG_MASK BIT(0) +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_SHIFT 1 +#define ENA_ADMIN_GET_FEATURE_LINK_DESC_DUPLEX_MASK BIT(1) /* feature_offload_desc */ #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK BIT(0) @@ -974,19 +1091,19 @@ struct ena_admin_ena_mmio_req_read_less_resp { #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK BIT(3) #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_SHIFT 4 #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_MASK BIT(4) -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_SHIFT 5 -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK BIT(5) -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_SHIFT 6 -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK BIT(6) -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_SHIFT 7 -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK BIT(7) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_SHIFT 5 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK BIT(5) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_SHIFT 6 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK BIT(6) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_SHIFT 7 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_ECN_MASK BIT(7) #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L3_CSUM_IPV4_MASK BIT(0) #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_SHIFT 1 #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK BIT(1) #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_SHIFT 2 #define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK BIT(2) -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_SHIFT 3 -#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_MASK BIT(3) +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_SHIFT 3 +#define ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_HASH_MASK BIT(3) /* feature_rss_flow_hash_function */ #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_FUNCTION_FUNCS_MASK GENMASK(7, 0) @@ -994,25 +1111,35 @@ struct ena_admin_ena_mmio_req_read_less_resp { /* feature_rss_flow_hash_input */ #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_SHIFT 1 -#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK BIT(1) +#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L3_SORT_MASK BIT(1) #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_SHIFT 2 -#define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK BIT(2) +#define 
ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_L4_SORT_MASK BIT(2) #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_SHIFT 1 #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L3_SORT_MASK BIT(1) #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_SHIFT 2 #define ENA_ADMIN_FEATURE_RSS_FLOW_HASH_INPUT_ENABLE_L4_SORT_MASK BIT(2) /* host_info */ -#define ENA_ADMIN_HOST_INFO_MAJOR_MASK GENMASK(7, 0) -#define ENA_ADMIN_HOST_INFO_MINOR_SHIFT 8 -#define ENA_ADMIN_HOST_INFO_MINOR_MASK GENMASK(15, 8) -#define ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT 16 -#define ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) +#define ENA_ADMIN_HOST_INFO_MAJOR_MASK GENMASK(7, 0) +#define ENA_ADMIN_HOST_INFO_MINOR_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_MINOR_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT 16 +#define ENA_ADMIN_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT 24 +#define ENA_ADMIN_HOST_INFO_MODULE_TYPE_MASK GENMASK(31, 24) +#define ENA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define ENA_ADMIN_HOST_INFO_DEVICE_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 +#define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) + +/* feature_rss_ind_table */ +#define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) /* aenq_common_desc */ -#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) +#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) /* aenq_link_change_desc */ -#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) +#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) #endif /*_ENA_ADMIN_H_ */ diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 2480863044a88..51847d2797061 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -41,9 +41,6 @@ #define ENA_ASYNC_QUEUE_DEPTH 16 #define ENA_ADMIN_QUEUE_DEPTH 32 -#define MIN_ENA_VER (((ENA_COMMON_SPEC_VERSION_MAJOR) << \ - ENA_REGS_VERSION_MAJOR_VERSION_SHIFT) \ - | (ENA_COMMON_SPEC_VERSION_MINOR)) #define ENA_CTRL_MAJOR 0 #define ENA_CTRL_MINOR 0 @@ -61,6 +58,8 @@ #define ENA_MMIO_READ_TIMEOUT 0xFFFFFFFF +#define ENA_COM_BOUNCE_BUFFER_CNTRL_CNT 4 + #define ENA_REGS_ADMIN_INTR_MASK 1 #define ENA_POLL_MS 5 @@ -236,7 +235,7 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu tail_masked = admin_queue->sq.tail & queue_size_mask; /* In case of queue FULL */ - cnt = atomic_read(&admin_queue->outstanding_cmds); + cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); if (cnt >= admin_queue->q_depth) { pr_debug("admin queue is full.\n"); admin_queue->stats.out_of_space++; @@ -305,7 +304,7 @@ static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue struct ena_admin_acq_entry *comp, size_t comp_size_in_bytes) { - unsigned long flags; + unsigned long flags = 0; struct ena_comp_ctx *comp_ctx; spin_lock_irqsave(&admin_queue->q_lock, flags); @@ -317,7 +316,7 @@ static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue cmd_size_in_bytes, comp, comp_size_in_bytes); - if (unlikely(IS_ERR(comp_ctx))) + if (IS_ERR(comp_ctx)) admin_queue->running_state = false; spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -333,6 +332,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + io_sq->dma_addr_bits = (u8)ena_dev->dma_addr_bits; io_sq->desc_entry_size = (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? 
sizeof(struct ena_eth_io_tx_desc) : @@ -355,21 +355,52 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, &io_sq->desc_addr.phys_addr, GFP_KERNEL); } - } else { + + if (!io_sq->desc_addr.virt_addr) { + pr_err("memory allocation failed"); + return -ENOMEM; + } + } + + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Allocate bounce buffers */ + io_sq->bounce_buf_ctrl.buffer_size = + ena_dev->llq_info.desc_list_entry_size; + io_sq->bounce_buf_ctrl.buffers_num = + ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; + io_sq->bounce_buf_ctrl.next_to_use = 0; + + size = io_sq->bounce_buf_ctrl.buffer_size * + io_sq->bounce_buf_ctrl.buffers_num; + dev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); - io_sq->desc_addr.virt_addr = + io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); set_dev_node(ena_dev->dmadev, dev_node); - if (!io_sq->desc_addr.virt_addr) { - io_sq->desc_addr.virt_addr = + if (!io_sq->bounce_buf_ctrl.base_buffer) + io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); + + if (!io_sq->bounce_buf_ctrl.base_buffer) { + pr_err("bounce buffer memory allocation failed"); + return -ENOMEM; } - } - if (!io_sq->desc_addr.virt_addr) { - pr_err("memory allocation failed"); - return -ENOMEM; + memcpy(&io_sq->llq_info, &ena_dev->llq_info, + sizeof(io_sq->llq_info)); + + /* Initiate the first bounce buffer */ + io_sq->llq_buf_ctrl.curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, io_sq->llq_info.desc_list_entry_size); + io_sq->llq_buf_ctrl.descs_left_in_line = + io_sq->llq_info.descs_num_before_header; + + if (io_sq->llq_info.max_entries_in_tx_burst > 0) + io_sq->entries_in_tx_burst_left = + io_sq->llq_info.max_entries_in_tx_burst; } io_sq->tail = 0; @@ -460,12 +491,12 @@ static void ena_com_handle_admin_completion(struct ena_com_admin_queue *admin_qu cqe = &admin_queue->cq.entries[head_masked]; /* Go over all the completions */ - while ((cqe->acq_common_descriptor.flags & - ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + ENA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { /* Do not read the rest of the completion entry before the * phase bit was validated */ - rmb(); + dma_rmb(); ena_com_handle_single_admin_completion(admin_queue, cqe); head_masked++; @@ -512,7 +543,8 @@ static int ena_com_comp_status_to_errno(u8 comp_status) static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, struct ena_com_admin_queue *admin_queue) { - unsigned long flags, timeout; + unsigned long flags = 0; + unsigned long timeout; int ret; timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); @@ -558,10 +590,162 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c return ret; } +/** + * Set the LLQ configurations of the firmware + * + * The driver provides only the enabled feature values to the device, + * which in turn, checks if they are supported. 
+ */ +static int ena_com_set_llq(struct ena_com_dev *ena_dev) +{ + struct ena_com_admin_queue *admin_queue; + struct ena_admin_set_feat_cmd cmd; + struct ena_admin_set_feat_resp resp; + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + int ret; + + memset(&cmd, 0x0, sizeof(cmd)); + admin_queue = &ena_dev->admin_queue; + + cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + cmd.feat_common.feature_id = ENA_ADMIN_LLQ; + + cmd.u.llq.header_location_ctrl_enabled = llq_info->header_location_ctrl; + cmd.u.llq.entry_size_ctrl_enabled = llq_info->desc_list_entry_size_ctrl; + cmd.u.llq.desc_num_before_header_enabled = llq_info->descs_num_before_header; + cmd.u.llq.descriptors_stride_ctrl_enabled = llq_info->desc_stride_ctrl; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct ena_admin_acq_entry *)&resp, + sizeof(resp)); + + if (unlikely(ret)) + pr_err("Failed to set LLQ configurations: %d\n", ret); + + return ret; +} + +static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + u16 supported_feat; + int rc; + + memset(llq_info, 0, sizeof(*llq_info)); + + supported_feat = llq_features->header_location_ctrl_supported; + + if (likely(supported_feat & llq_default_cfg->llq_header_location)) { + llq_info->header_location_ctrl = + llq_default_cfg->llq_header_location; + } else { + pr_err("Invalid header location control, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + if (likely(llq_info->header_location_ctrl == ENA_ADMIN_INLINE_HEADER)) { + supported_feat = llq_features->descriptors_stride_ctrl_supported; + if (likely(supported_feat & llq_default_cfg->llq_stride_ctrl)) { + llq_info->desc_stride_ctrl = llq_default_cfg->llq_stride_ctrl; + } else { + if (supported_feat & ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) { + llq_info->desc_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + } else if (supported_feat & ENA_ADMIN_SINGLE_DESC_PER_ENTRY) { + llq_info->desc_stride_ctrl = ENA_ADMIN_SINGLE_DESC_PER_ENTRY; + } else { + pr_err("Invalid desc_stride_ctrl, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + pr_err("Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_stride_ctrl, supported_feat, + llq_info->desc_stride_ctrl); + } + } else { + llq_info->desc_stride_ctrl = 0; + } + + supported_feat = llq_features->entry_size_ctrl_supported; + if (likely(supported_feat & llq_default_cfg->llq_ring_entry_size)) { + llq_info->desc_list_entry_size_ctrl = llq_default_cfg->llq_ring_entry_size; + llq_info->desc_list_entry_size = llq_default_cfg->llq_ring_entry_size_value; + } else { + if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_128B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_info->desc_list_entry_size = 128; + } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_192B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_192B; + llq_info->desc_list_entry_size = 192; + } else if (supported_feat & ENA_ADMIN_LIST_ENTRY_SIZE_256B) { + llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_info->desc_list_entry_size = 256; + } else { + pr_err("Invalid entry_size_ctrl, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + pr_err("Default llq ring entry size is not supported, performing fallback, 
default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_ring_entry_size, supported_feat, + llq_info->desc_list_entry_size); + } + if (unlikely(llq_info->desc_list_entry_size & 0x7)) { + /* The desc list entry size should be whole multiply of 8 + * This requirement comes from __iowrite64_copy() + */ + pr_err("illegal entry size %d\n", llq_info->desc_list_entry_size); + return -EINVAL; + } + + if (llq_info->desc_stride_ctrl == ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY) + llq_info->descs_per_entry = llq_info->desc_list_entry_size / + sizeof(struct ena_eth_io_tx_desc); + else + llq_info->descs_per_entry = 1; + + supported_feat = llq_features->desc_num_before_header_supported; + if (likely(supported_feat & llq_default_cfg->llq_num_decs_before_header)) { + llq_info->descs_num_before_header = llq_default_cfg->llq_num_decs_before_header; + } else { + if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_1; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_4; + } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8) { + llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8; + } else { + pr_err("Invalid descs_num_before_header, supported: 0x%x\n", + supported_feat); + return -EINVAL; + } + + pr_err("Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_num_decs_before_header, + supported_feat, llq_info->descs_num_before_header); + } + + llq_info->max_entries_in_tx_burst = + (u16)(llq_features->max_tx_burst_size / llq_default_cfg->llq_ring_entry_size_value); + + rc = ena_com_set_llq(ena_dev); + if (rc) + pr_err("Cannot set LLQ configuration: %d\n", rc); + + return 0; +} + static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *comp_ctx, struct ena_com_admin_queue *admin_queue) { - unsigned long flags; + unsigned long flags = 0; int ret; wait_for_completion_timeout(&comp_ctx->wait_event, @@ -607,7 +791,7 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp = mmio_read->read_resp; u32 mmio_read_reg, ret, i; - unsigned long flags; + unsigned long flags = 0; u32 timeout = mmio_read->reg_read_to; might_sleep(); @@ -628,15 +812,10 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) mmio_read_reg |= mmio_read->seq_num & ENA_REGS_MMIO_REG_READ_REQ_ID_MASK; - /* make sure read_resp->req_id get updated before the hw can write - * there - */ - wmb(); - writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF); for (i = 0; i < timeout; i++) { - if (read_resp->req_id == mmio_read->seq_num) + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) break; udelay(1); @@ -734,15 +913,17 @@ static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, if (io_sq->desc_addr.virt_addr) { size = io_sq->desc_entry_size * io_sq->q_depth; - if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - dma_free_coherent(ena_dev->dmadev, size, - io_sq->desc_addr.virt_addr, - io_sq->desc_addr.phys_addr); - else - devm_kfree(ena_dev->dmadev, io_sq->desc_addr.virt_addr); + dma_free_coherent(ena_dev->dmadev, size, + 
io_sq->desc_addr.virt_addr, + io_sq->desc_addr.phys_addr); io_sq->desc_addr.virt_addr = NULL; } + + if (io_sq->bounce_buf_ctrl.base_buffer) { + devm_kfree(ena_dev->dmadev, io_sq->bounce_buf_ctrl.base_buffer); + io_sq->bounce_buf_ctrl.base_buffer = NULL; + } } static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, @@ -788,7 +969,8 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp *get_resp, enum ena_admin_aq_feature_id feature_id, dma_addr_t control_buf_dma_addr, - u32 control_buff_size) + u32 control_buff_size, + u8 feature_ver) { struct ena_com_admin_queue *admin_queue; struct ena_admin_get_feat_cmd get_cmd; @@ -819,7 +1001,7 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, } get_cmd.control_buffer.length = control_buff_size; - + get_cmd.feat_common.feature_version = feature_ver; get_cmd.feat_common.feature_id = feature_id; ret = ena_com_execute_admin_command(admin_queue, @@ -839,13 +1021,15 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, static int ena_com_get_feature(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp *get_resp, - enum ena_admin_aq_feature_id feature_id) + enum ena_admin_aq_feature_id feature_id, + u8 feature_ver) { return ena_com_get_feature_ex(ena_dev, get_resp, feature_id, 0, - 0); + 0, + feature_ver); } static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) @@ -905,7 +1089,7 @@ static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, int ret; ret = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG); + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, 0); if (unlikely(ret)) return ret; @@ -1136,7 +1320,7 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, comp, comp_size); - if (unlikely(IS_ERR(comp_ctx))) { + if (IS_ERR(comp_ctx)) { if (comp_ctx == ERR_PTR(-ENODEV)) pr_debug("Failed to submit command [%ld]\n", PTR_ERR(comp_ctx)); @@ -1254,7 +1438,7 @@ void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev) void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) { struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; - unsigned long flags; + unsigned long flags = 0; spin_lock_irqsave(&admin_queue->q_lock, flags); while (atomic_read(&admin_queue->outstanding_cmds) != 0) { @@ -1298,7 +1482,7 @@ bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev) void ena_com_set_admin_running_state(struct ena_com_dev *ena_dev, bool state) { struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; - unsigned long flags; + unsigned long flags = 0; spin_lock_irqsave(&admin_queue->q_lock, flags); ena_dev->admin_queue.running_state = state; @@ -1325,14 +1509,14 @@ int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) struct ena_admin_get_feat_resp get_resp; int ret; - ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG); + ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG, 0); if (ret) { pr_info("Can't get aenq configuration\n"); return ret; } if ((get_resp.u.aenq.supported_groups & groups_flag) != groups_flag) { - pr_warn("Trying to set unsupported aenq events. supported flag: %x asked flag: %x\n", + pr_warn("Trying to set unsupported aenq events. 
supported flag: 0x%x asked flag: 0x%x\n", get_resp.u.aenq.supported_groups, groups_flag); return -EOPNOTSUPP; } @@ -1406,11 +1590,6 @@ int ena_com_validate_version(struct ena_com_dev *ena_dev) ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); - if (ver < MIN_ENA_VER) { - pr_err("ENA version is lower than the minimal version the driver supports\n"); - return -1; - } - pr_info("ena controller version: %d.%d.%d implementation version %d\n", (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, @@ -1485,7 +1664,7 @@ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) sizeof(*mmio_read->read_resp), &mmio_read->read_resp_dma_addr, GFP_KERNEL); if (unlikely(!mmio_read->read_resp)) - return -ENOMEM; + goto err; ena_com_mmio_reg_read_request_write_dev_addr(ena_dev); @@ -1494,6 +1673,10 @@ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) mmio_read->readless_supported = true; return 0; + +err: + + return -ENOMEM; } void ena_com_set_mmio_read_mode(struct ena_com_dev *ena_dev, bool readless_supported) @@ -1529,8 +1712,7 @@ void ena_com_mmio_reg_read_request_write_dev_addr(struct ena_com_dev *ena_dev) } int ena_com_admin_init(struct ena_com_dev *ena_dev, - struct ena_aenq_handlers *aenq_handlers, - bool init_spinlock) + struct ena_aenq_handlers *aenq_handlers) { struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; u32 aq_caps, acq_caps, dev_sts, addr_low, addr_high; @@ -1557,8 +1739,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, atomic_set(&admin_queue->outstanding_cmds, 0); - if (init_spinlock) - spin_lock_init(&admin_queue->q_lock); + spin_lock_init(&admin_queue->q_lock); ret = ena_com_init_comp_ctxt(admin_queue); if (ret) @@ -1698,7 +1879,62 @@ void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) int ena_com_get_link_params(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp *resp) { - return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG); + return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0); +} + +int ena_com_extra_properties_strings_init(struct ena_com_dev *ena_dev) +{ + struct ena_admin_get_feat_resp resp; + struct ena_extra_properties_strings *extra_properties_strings = + &ena_dev->extra_properties_strings; + u32 rc; + extra_properties_strings->size = ENA_ADMIN_EXTRA_PROPERTIES_COUNT * + ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN; + + extra_properties_strings->virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, + extra_properties_strings->size, + &extra_properties_strings->dma_addr, + GFP_KERNEL); + if (unlikely(!extra_properties_strings->virt_addr)) { + pr_err("Failed to allocate extra properties strings\n"); + return 0; + } + + rc = ena_com_get_feature_ex(ena_dev, &resp, + ENA_ADMIN_EXTRA_PROPERTIES_STRINGS, + extra_properties_strings->dma_addr, + extra_properties_strings->size, 0); + if (rc) { + pr_debug("Failed to get extra properties strings\n"); + goto err; + } + + return resp.u.extra_properties_strings.count; +err: + ena_com_delete_extra_properties_strings(ena_dev); + return 0; +} + +void ena_com_delete_extra_properties_strings(struct ena_com_dev *ena_dev) +{ + struct ena_extra_properties_strings *extra_properties_strings = + &ena_dev->extra_properties_strings; + + if (extra_properties_strings->virt_addr) { + dma_free_coherent(ena_dev->dmadev, + extra_properties_strings->size, + extra_properties_strings->virt_addr, + extra_properties_strings->dma_addr); + extra_properties_strings->virt_addr = NULL; + 
} +} + +int ena_com_get_extra_properties_flags(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp) +{ + return ena_com_get_feature(ena_dev, resp, + ENA_ADMIN_EXTRA_PROPERTIES_FLAGS, 0); } int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, @@ -1708,7 +1944,7 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, int rc; rc = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_DEVICE_ATTRIBUTES); + ENA_ADMIN_DEVICE_ATTRIBUTES, 0); if (rc) return rc; @@ -1716,17 +1952,34 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, sizeof(get_resp.u.dev_attr)); ena_dev->supported_features = get_resp.u.dev_attr.supported_features; - rc = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_MAX_QUEUES_NUM); - if (rc) - return rc; + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_EXT, + ENA_FEATURE_MAX_QUEUE_EXT_VER); + if (rc) + return rc; + + if (get_resp.u.max_queue_ext.version != ENA_FEATURE_MAX_QUEUE_EXT_VER) + return -EINVAL; - memcpy(&get_feat_ctx->max_queues, &get_resp.u.max_queue, - sizeof(get_resp.u.max_queue)); - ena_dev->tx_max_header_size = get_resp.u.max_queue.max_header_size; + memcpy(&get_feat_ctx->max_queue_ext, &get_resp.u.max_queue_ext, + sizeof(get_resp.u.max_queue_ext)); + ena_dev->tx_max_header_size = + get_resp.u.max_queue_ext.max_queue_ext.max_tx_header_size; + } else { + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_MAX_QUEUES_NUM, 0); + memcpy(&get_feat_ctx->max_queues, &get_resp.u.max_queue, + sizeof(get_resp.u.max_queue)); + ena_dev->tx_max_header_size = + get_resp.u.max_queue.max_header_size; + + if (rc) + return rc; + } rc = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_AENQ_CONFIG); + ENA_ADMIN_AENQ_CONFIG, 0); if (rc) return rc; @@ -1734,7 +1987,7 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, sizeof(get_resp.u.aenq)); rc = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_STATELESS_OFFLOAD_CONFIG); + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); if (rc) return rc; @@ -1744,7 +1997,7 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, /* Driver hints isn't mandatory admin command. 
So in case the * command isn't supported set driver hints to 0 */ - rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS); + rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS, 0); if (!rc) memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints, @@ -1755,6 +2008,26 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, else return rc; + rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_LLQ, 0); + if (!rc) + memcpy(&get_feat_ctx->llq, &get_resp.u.llq, + sizeof(get_resp.u.llq)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->llq, 0x0, sizeof(get_feat_ctx->llq)); + else + return rc; + + rc = ena_com_get_feature(ena_dev, &get_resp, + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, 0); + if (!rc) + memcpy(&get_feat_ctx->ind_table, &get_resp.u.ind_table, + sizeof(get_resp.u.ind_table)); + else if (rc == -EOPNOTSUPP) + memset(&get_feat_ctx->ind_table, 0x0, + sizeof(get_feat_ctx->ind_table)); + else + return rc; + return 0; } @@ -1786,6 +2059,7 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) struct ena_admin_aenq_entry *aenq_e; struct ena_admin_aenq_common_desc *aenq_common; struct ena_com_aenq *aenq = &dev->aenq; + unsigned long long timestamp; ena_aenq_handler handler_cb; u16 masked_head, processed = 0; u8 phase; @@ -1796,12 +2070,18 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) aenq_common = &aenq_e->aenq_common_desc; /* Go over all the events */ - while ((aenq_common->flags & ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == - phase) { + while ((READ_ONCE(aenq_common->flags) & + ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* Make sure the phase bit (ownership) is as expected before + * reading the rest of the descriptor. + */ + dma_rmb(); + + timestamp = + (unsigned long long)aenq_common->timestamp_low | + ((unsigned long long)aenq_common->timestamp_high << 32); pr_debug("AENQ! 
Group[%x] Syndrom[%x] timestamp: [%llus]\n", - aenq_common->group, aenq_common->syndrom, - (u64)aenq_common->timestamp_low + - ((u64)aenq_common->timestamp_high << 32)); + aenq_common->group, aenq_common->syndrom, timestamp); /* Handle specific event*/ handler_cb = ena_com_get_specific_aenq_cb(dev, @@ -1829,7 +2109,8 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) /* write the aenq doorbell after all AENQ descriptors were read */ mb(); - writel((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); + writel_relaxed((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); + mmiowb(); } int ena_com_dev_reset(struct ena_com_dev *ena_dev, @@ -1975,7 +2256,7 @@ int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp resp; ret = ena_com_get_feature(ena_dev, &resp, - ENA_ADMIN_STATELESS_OFFLOAD_CONFIG); + ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); if (unlikely(ret)) { pr_err("Failed to get offload capabilities %d\n", ret); return ret; @@ -2004,7 +2285,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) /* Validate hash function is supported */ ret = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_RSS_HASH_FUNCTION); + ENA_ADMIN_RSS_HASH_FUNCTION, 0); if (unlikely(ret)) return ret; @@ -2064,7 +2345,7 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, rc = ena_com_get_feature_ex(ena_dev, &get_resp, ENA_ADMIN_RSS_HASH_FUNCTION, rss->hash_key_dma_addr, - sizeof(*rss->hash_key)); + sizeof(*rss->hash_key), 0); if (unlikely(rc)) return rc; @@ -2115,7 +2396,7 @@ int ena_com_get_hash_function(struct ena_com_dev *ena_dev, rc = ena_com_get_feature_ex(ena_dev, &get_resp, ENA_ADMIN_RSS_HASH_FUNCTION, rss->hash_key_dma_addr, - sizeof(*rss->hash_key)); + sizeof(*rss->hash_key), 0); if (unlikely(rc)) return rc; @@ -2140,7 +2421,7 @@ int ena_com_get_hash_ctrl(struct ena_com_dev *ena_dev, rc = ena_com_get_feature_ex(ena_dev, &get_resp, ENA_ADMIN_RSS_HASH_INPUT, rss->hash_ctrl_dma_addr, - sizeof(*rss->hash_ctrl)); + sizeof(*rss->hash_ctrl), 0); if (unlikely(rc)) return rc; @@ -2376,7 +2657,7 @@ int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) rc = ena_com_get_feature_ex(ena_dev, &get_resp, ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, rss->rss_ind_tbl_dma_addr, - tbl_size); + tbl_size, 0); if (unlikely(rc)) return rc; @@ -2441,6 +2722,10 @@ int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) if (unlikely(!host_attr->host_info)) return -ENOMEM; + host_attr->host_info->ena_spec_version = ((ENA_COMMON_SPEC_VERSION_MAJOR << + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT) | + (ENA_COMMON_SPEC_VERSION_MINOR)); + return 0; } @@ -2586,7 +2871,7 @@ int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) int rc; rc = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_INTERRUPT_MODERATION); + ENA_ADMIN_INTERRUPT_MODERATION, 0); if (rc) { if (rc == -EOPNOTSUPP) { @@ -2712,3 +2997,34 @@ void ena_com_get_intr_moderation_entry(struct ena_com_dev *ena_dev, intr_moder_tbl[level].pkts_per_interval; entry->bytes_per_interval = intr_moder_tbl[level].bytes_per_interval; } + +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_cfg) +{ + int rc; + int size; + + if (!llq_features->max_llq_num) { + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + rc = ena_com_config_llq_info(ena_dev, llq_features, llq_default_cfg); + if (rc) + return rc; + + /* Validate the descriptor is not too big */ 
+ size = ena_dev->tx_max_header_size; + size += ena_dev->llq_info.descs_num_before_header * + sizeof(struct ena_eth_io_tx_desc); + + if (unlikely(ena_dev->llq_info.desc_list_entry_size < size)) { + pr_err("the size of the LLQ entry is smaller than needed\n"); + return -EINVAL; + } + + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_DEV; + + return 0; +} diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index bd9c00110f87d..27a85750309f7 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include #include @@ -99,6 +101,8 @@ #define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF +#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 + enum ena_intr_moder_level { ENA_INTR_MODER_LOWEST = 0, ENA_INTR_MODER_LOW, @@ -108,6 +112,14 @@ enum ena_intr_moder_level { ENA_INTR_MAX_NUM_OF_LEVELS, }; +struct ena_llq_configurations { + enum ena_admin_llq_header_location llq_header_location; + enum ena_admin_llq_ring_entry_size llq_ring_entry_size; + enum ena_admin_llq_stride_ctrl llq_stride_ctrl; + enum ena_admin_llq_num_descs_before_header llq_num_decs_before_header; + u16 llq_ring_entry_size_value; +}; + struct ena_intr_moder_entry { unsigned int intr_moder_interval; unsigned int pkts_per_interval; @@ -142,6 +154,16 @@ struct ena_com_tx_meta { u16 l4_hdr_len; /* In words */ }; +struct ena_com_llq_info { + u16 header_location_ctrl; + u16 desc_stride_ctrl; + u16 desc_list_entry_size_ctrl; + u16 desc_list_entry_size; + u16 descs_num_before_header; + u16 descs_per_entry; + u16 max_entries_in_tx_burst; +}; + struct ena_com_io_cq { struct ena_com_io_desc_addr cdesc_addr; void *bus; @@ -180,6 +202,20 @@ struct ena_com_io_cq { } ____cacheline_aligned; +struct ena_com_io_bounce_buffer_control { + u8 *base_buffer; + u16 next_to_use; + u16 buffer_size; + u16 buffers_num; /* Must be a power of 2 */ +}; + +/* This struct is to keep tracking the current location of the next llq entry */ +struct ena_com_llq_pkt_ctrl { + u8 *curr_bounce_buf; + u16 idx; + u16 descs_left_in_line; +}; + struct ena_com_io_sq { struct ena_com_io_desc_addr desc_addr; void *bus; @@ -192,6 +228,9 @@ struct ena_com_io_sq { u32 msix_vector; struct ena_com_tx_meta cached_tx_meta; + struct ena_com_llq_info llq_info; + struct ena_com_llq_pkt_ctrl llq_buf_ctrl; + struct ena_com_io_bounce_buffer_control bounce_buf_ctrl; u16 q_depth; u16 qid; @@ -199,10 +238,12 @@ struct ena_com_io_sq { u16 idx; u16 tail; u16 next_to_comp; + u16 llq_last_copy_tail; u32 tx_max_header_size; u8 phase; u8 desc_entry_size; u8 dma_addr_bits; + u16 entries_in_tx_burst_left; } ____cacheline_aligned; struct ena_com_admin_cq { @@ -311,6 +352,12 @@ struct ena_host_attribute { dma_addr_t host_info_dma_addr; }; +struct ena_extra_properties_strings { + u8 *virt_addr; + dma_addr_t dma_addr; + u32 size; +}; + /* Each ena_dev is a PCI function. 
*/ struct ena_com_dev { struct ena_com_admin_queue admin_queue; @@ -338,14 +385,20 @@ struct ena_com_dev { u16 intr_delay_resolution; u32 intr_moder_tx_interval; struct ena_intr_moder_entry *intr_moder_tbl; + + struct ena_com_llq_info llq_info; + struct ena_extra_properties_strings extra_properties_strings; }; struct ena_com_dev_get_features_ctx { struct ena_admin_queue_feature_desc max_queues; + struct ena_admin_queue_ext_feature_desc max_queue_ext; struct ena_admin_device_attr_feature_desc dev_attr; struct ena_admin_feature_aenq_desc aenq; struct ena_admin_feature_offload_desc offload; struct ena_admin_ena_hw_hints hw_hints; + struct ena_admin_feature_llq_desc llq; + struct ena_admin_feature_rss_ind_table ind_table; }; struct ena_com_create_io_ctx { @@ -401,8 +454,6 @@ void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev); /* ena_com_admin_init - Init the admin and the async queues * @ena_dev: ENA communication layer struct * @aenq_handlers: Those handlers to be called upon event. - * @init_spinlock: Indicate if this method should init the admin spinlock or - * the spinlock was init before (for example, in a case of FLR). * * Initialize the admin submission and completion queues. * Initialize the asynchronous events notification queues. @@ -410,8 +461,7 @@ void ena_com_mmio_reg_read_request_destroy(struct ena_com_dev *ena_dev); * @return - 0 on success, negative value on failure. */ int ena_com_admin_init(struct ena_com_dev *ena_dev, - struct ena_aenq_handlers *aenq_handlers, - bool init_spinlock); + struct ena_aenq_handlers *aenq_handlers); /* ena_com_admin_destroy - Destroy the admin and the async events queues. * @ena_dev: ENA communication layer struct @@ -561,6 +611,31 @@ int ena_com_validate_version(struct ena_com_dev *ena_dev); int ena_com_get_link_params(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp *resp); +/* ena_com_extra_properties_strings_init - Initialize the extra properties strings buffer. + * @ena_dev: ENA communication layer struct + * + * Initialize the extra properties strings buffer. + */ +int ena_com_extra_properties_strings_init(struct ena_com_dev *ena_dev); + +/* ena_com_delete_extra_properties_strings - Free the extra properties strings buffer. + * @ena_dev: ENA communication layer struct + * + * Free the allocated extra properties strings buffer. + */ +void ena_com_delete_extra_properties_strings(struct ena_com_dev *ena_dev); + +/* ena_com_get_extra_properties_flags - Retrieve extra properties flags. + * @ena_dev: ENA communication layer struct + * @resp: Extra properties flags. + * + * Retrieve the extra properties flags. + * + * @return - 0 on Success negative value otherwise. + */ +int ena_com_get_extra_properties_flags(struct ena_com_dev *ena_dev, + struct ena_admin_get_feat_resp *resp); + /* ena_com_get_dma_width - Retrieve physical dma address width the device * supports. * @ena_dev: ENA communication layer struct @@ -939,6 +1014,16 @@ void ena_com_get_intr_moderation_entry(struct ena_com_dev *ena_dev, enum ena_intr_moder_level level, struct ena_intr_moder_entry *entry); +/* ena_com_config_dev_mode - Configure the placement policy of the device. + * @ena_dev: ENA communication layer struct + * @llq_features: LLQ feature descriptor, retrieve via + * ena_com_get_dev_attr_feat. 
+ * @ena_llq_config: The default driver LLQ parameters configurations + */ +int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq_features, + struct ena_llq_configurations *llq_default_config); + static inline bool ena_com_get_adaptive_moderation_enabled(struct ena_com_dev *ena_dev) { return ena_dev->adaptive_coalescing; @@ -1048,4 +1133,21 @@ static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; } +static inline u8 *ena_com_get_next_bounce_buffer(struct ena_com_io_bounce_buffer_control *bounce_buf_ctrl) +{ + u16 size, buffers_num; + u8 *buf; + + size = bounce_buf_ctrl->buffer_size; + buffers_num = bounce_buf_ctrl->buffers_num; + + buf = bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use++ & (buffers_num - 1)) * size; + + prefetchw(bounce_buf_ctrl->base_buffer + + (bounce_buf_ctrl->next_to_use & (buffers_num - 1)) * size); + + return buf; +} + #endif /* !(ENA_COM) */ diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h old mode 100644 new mode 100755 index bb8d73676eab6..450824ae7d895 --- a/drivers/amazon/net/ena/ena_common_defs.h +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -32,8 +33,8 @@ #ifndef _ENA_COMMON_H_ #define _ENA_COMMON_H_ -#define ENA_COMMON_SPEC_VERSION_MAJOR 0 /* */ -#define ENA_COMMON_SPEC_VERSION_MINOR 10 /* */ +#define ENA_COMMON_SPEC_VERSION_MAJOR 2 +#define ENA_COMMON_SPEC_VERSION_MINOR 0 /* ENA operates with 48-bit memory addresses. 
ena_mem_addr_t */ struct ena_common_mem_addr { diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index 582ea54e25b25..a2410241033fc 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -45,25 +45,22 @@ static inline struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( cdesc = (struct ena_eth_io_rx_cdesc_base *)(io_cq->cdesc_addr.virt_addr + (head_masked * io_cq->cdesc_entry_size_in_bytes)); - desc_phase = (READ_ONCE(cdesc->status) & ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; + desc_phase = (READ_ONCE(cdesc->status) & + ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT; if (desc_phase != expected_phase) return NULL; - return cdesc; -} - -static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq) -{ - io_cq->head++; + /* Make sure we read the rest of the descriptor after the phase bit + * has been read + */ + dma_rmb(); - /* Switch phase bit in case of wrap around */ - if (unlikely((io_cq->head & (io_cq->q_depth - 1)) == 0)) - io_cq->phase ^= 1; + return cdesc; } -static inline void *get_sq_desc(struct ena_com_io_sq *io_sq) +static inline void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) { u16 tail_masked; u32 offset; @@ -75,45 +72,170 @@ static inline void *get_sq_desc(struct ena_com_io_sq *io_sq) return (void *)((uintptr_t)io_sq->desc_addr.virt_addr + offset); } -static inline void ena_com_copy_curr_sq_desc_to_dev(struct ena_com_io_sq *io_sq) +static inline int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, + u8 *bounce_buffer) { - u16 tail_masked = io_sq->tail & (io_sq->q_depth - 1); - u32 offset = tail_masked * io_sq->desc_entry_size; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; - /* In case this queue isn't a LLQ */ - if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - return; + u16 dst_tail_mask; + u32 dst_offset; - memcpy_toio(io_sq->desc_addr.pbuf_dev_addr + offset, - io_sq->desc_addr.virt_addr + offset, - io_sq->desc_entry_size); -} + dst_tail_mask = io_sq->tail & (io_sq->q_depth - 1); + dst_offset = dst_tail_mask * llq_info->desc_list_entry_size; + + if (is_llq_max_tx_burst_exists(io_sq)) { + if (unlikely(!io_sq->entries_in_tx_burst_left)) { + pr_err("Error: trying to send more packets than tx burst allows\n"); + return -ENOSPC; + } + + io_sq->entries_in_tx_burst_left--; + pr_debug("decreasing entries_in_tx_burst_left of queue %d to %d\n", + io_sq->qid, io_sq->entries_in_tx_burst_left); + } + + /* Make sure everything was written into the bounce buffer before + * writing the bounce buffer to the device + */ + wmb(); + + /* The line is completed. 
Copy it to dev */ + __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, + bounce_buffer, (llq_info->desc_list_entry_size) / 8); -static inline void ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) -{ io_sq->tail++; /* Switch phase bit in case of wrap around */ if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) io_sq->phase ^= 1; + + return 0; } -static inline int ena_com_write_header(struct ena_com_io_sq *io_sq, - u8 *head_src, u16 header_len) +static inline int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, + u8 *header_src, + u16 header_len) { - u16 tail_masked = io_sq->tail & (io_sq->q_depth - 1); - u8 __iomem *dev_head_addr = - io_sq->header_addr + (tail_masked * io_sq->tx_max_header_size); + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + u8 *bounce_buffer = pkt_ctrl->curr_bounce_buf; + u16 header_offset; - if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) return 0; - if (unlikely(!io_sq->header_addr)) { - pr_err("Push buffer header ptr is NULL\n"); - return -EINVAL; + header_offset = + llq_info->descs_num_before_header * io_sq->desc_entry_size; + + if (unlikely((header_offset + header_len) > + llq_info->desc_list_entry_size)) { + pr_err("trying to write header larger than llq entry can accommodate\n"); + return -EFAULT; } - memcpy_toio(dev_head_addr, head_src, header_len); + if (unlikely(!bounce_buffer)) { + pr_err("bounce buffer is NULL\n"); + return -EFAULT; + } + + memcpy(bounce_buffer + header_offset, header_src, header_len); + + return 0; +} + +static inline void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + u8 *bounce_buffer; + void *sq_desc; + + bounce_buffer = pkt_ctrl->curr_bounce_buf; + + if (unlikely(!bounce_buffer)) { + pr_err("bounce buffer is NULL\n"); + return NULL; + } + + sq_desc = bounce_buffer + pkt_ctrl->idx * io_sq->desc_entry_size; + pkt_ctrl->idx++; + pkt_ctrl->descs_left_in_line--; + + return sq_desc; +} + +static inline int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)) + return 0; + + /* bounce buffer was used, so write it and get a new one */ + if (pkt_ctrl->idx) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) + return rc; + + pkt_ctrl->curr_bounce_buf = + ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + } + + pkt_ctrl->idx = 0; + pkt_ctrl->descs_left_in_line = llq_info->descs_num_before_header; + return 0; +} + +static inline void *get_sq_desc(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return get_sq_desc_llq(io_sq); + + return get_sq_desc_regular_queue(io_sq); +} + +static inline int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) +{ + struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; + struct ena_com_llq_info *llq_info = &io_sq->llq_info; + int rc; + + if (!pkt_ctrl->descs_left_in_line) { + rc = ena_com_write_bounce_buffer_to_dev(io_sq, + pkt_ctrl->curr_bounce_buf); + if (unlikely(rc)) + return rc; + + pkt_ctrl->curr_bounce_buf = + 
ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); + + pkt_ctrl->idx = 0; + if (unlikely(llq_info->desc_stride_ctrl == ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) + pkt_ctrl->descs_left_in_line = 1; + else + pkt_ctrl->descs_left_in_line = + llq_info->desc_list_entry_size / io_sq->desc_entry_size; + } + + return 0; +} + +static inline int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +{ + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + return ena_com_sq_update_llq_tail(io_sq); + + io_sq->tail++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_sq->tail & (io_sq->q_depth - 1)) == 0)) + io_sq->phase ^= 1; return 0; } @@ -141,8 +263,9 @@ static inline u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, ena_com_cq_inc_head(io_cq); count++; - last = (READ_ONCE(cdesc->status) & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + last = (READ_ONCE(cdesc->status) & + ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; } while (!last); if (last) { @@ -164,25 +287,8 @@ static inline u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, return count; } -static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, - struct ena_com_tx_ctx *ena_tx_ctx) -{ - int rc; - - if (ena_tx_ctx->meta_valid) { - rc = memcmp(&io_sq->cached_tx_meta, - &ena_tx_ctx->ena_meta, - sizeof(struct ena_com_tx_meta)); - - if (unlikely(rc != 0)) - return true; - } - - return false; -} - -static inline void ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, - struct ena_com_tx_ctx *ena_tx_ctx) +static inline int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) { struct ena_eth_io_tx_meta_desc *meta_desc = NULL; struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; @@ -227,8 +333,7 @@ static inline void ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *i memcpy(&io_sq->cached_tx_meta, ena_meta, sizeof(struct ena_com_tx_meta)); - ena_com_copy_curr_sq_desc_to_dev(io_sq); - ena_com_sq_update_tail(io_sq); + return ena_com_sq_update_tail(io_sq); } static inline void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, @@ -240,11 +345,14 @@ static inline void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK) >> ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT; ena_rx_ctx->l3_csum_err = - (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT; + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT); ena_rx_ctx->l4_csum_err = - (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT; + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT); + ena_rx_ctx->l4_csum_checked = + !!((cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT); ena_rx_ctx->hash = cdesc->hash; ena_rx_ctx->frag = (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK) >> @@ -266,18 +374,19 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, { struct ena_eth_io_tx_desc *desc = NULL; struct ena_com_buf *ena_bufs = ena_tx_ctx->ena_bufs; - void *push_header = ena_tx_ctx->push_header; + void *buffer_to_push = ena_tx_ctx->push_header; u16 header_len = 
ena_tx_ctx->header_len; u16 num_bufs = ena_tx_ctx->num_bufs; - int total_desc, i, rc; + u16 start_tail = io_sq->tail; + int i, rc; bool have_meta; u64 addr_hi; WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_TX, "wrong Q type"); /* num_bufs +1 for potential meta desc */ - if (ena_com_sq_empty_space(io_sq) < (num_bufs + 1)) { - pr_err("Not enough space in the tx queue\n"); + if (unlikely(!ena_com_sq_have_enough_space(io_sq, num_bufs + 1))) { + pr_debug("Not enough space in the tx queue\n"); return -ENOMEM; } @@ -287,23 +396,32 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, return -EINVAL; } - /* start with pushing the header (if needed) */ - rc = ena_com_write_header(io_sq, push_header, header_len); + if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && + !buffer_to_push)) + return -EINVAL; + + rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); if (unlikely(rc)) return rc; have_meta = ena_tx_ctx->meta_valid && ena_com_meta_desc_changed(io_sq, ena_tx_ctx); - if (have_meta) - ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx); + if (have_meta) { + rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx); + if (unlikely(rc)) + return rc; + } - /* If the caller doesn't want send packets */ + /* If the caller doesn't want to send packets */ if (unlikely(!num_bufs && !header_len)) { - *nb_hw_desc = have_meta ? 0 : 1; - return 0; + rc = ena_com_close_bounce_buffer(io_sq); + *nb_hw_desc = io_sq->tail - start_tail; + return rc; } desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); /* Set first desc when we don't have meta descriptor */ @@ -355,10 +473,14 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, for (i = 0; i < num_bufs; i++) { /* The first desc share the same desc as the header */ if (likely(i != 0)) { - ena_com_copy_curr_sq_desc_to_dev(io_sq); - ena_com_sq_update_tail(io_sq); + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) + return rc; desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); desc->len_ctrl |= (io_sq->phase << @@ -381,15 +503,14 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* set the last desc indicator */ desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; - ena_com_copy_curr_sq_desc_to_dev(io_sq); - - ena_com_sq_update_tail(io_sq); + rc = ena_com_sq_update_tail(io_sq); + if (unlikely(rc)) + return rc; - total_desc = max_t(u16, num_bufs, 1); - total_desc += have_meta ? 
1 : 0; + rc = ena_com_close_bounce_buffer(io_sq); - *nb_hw_desc = total_desc; - return 0; + *nb_hw_desc = io_sq->tail - start_tail; + return rc; } int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, @@ -448,18 +569,21 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, WARN(io_sq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); - if (unlikely(ena_com_sq_empty_space(io_sq) == 0)) + if (unlikely(!ena_com_sq_have_enough_space(io_sq, 1))) return -ENOSPC; desc = get_sq_desc(io_sq); + if (unlikely(!desc)) + return -EFAULT; + memset(desc, 0x0, sizeof(struct ena_eth_io_rx_desc)); desc->length = ena_buf->len; - desc->ctrl |= ENA_ETH_IO_RX_DESC_FIRST_MASK; - desc->ctrl |= ENA_ETH_IO_RX_DESC_LAST_MASK; - desc->ctrl |= io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK; - desc->ctrl |= ENA_ETH_IO_RX_DESC_COMP_REQ_MASK; + desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK | + ENA_ETH_IO_RX_DESC_LAST_MASK | + (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK) | + ENA_ETH_IO_RX_DESC_COMP_REQ_MASK; desc->req_id = req_id; @@ -467,42 +591,7 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, desc->buff_addr_hi = ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); - ena_com_sq_update_tail(io_sq); - - return 0; -} - -int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id) -{ - u8 expected_phase, cdesc_phase; - struct ena_eth_io_tx_cdesc *cdesc; - u16 masked_head; - - masked_head = io_cq->head & (io_cq->q_depth - 1); - expected_phase = io_cq->phase; - - cdesc = (struct ena_eth_io_tx_cdesc *) - ((uintptr_t)io_cq->cdesc_addr.virt_addr + - (masked_head * io_cq->cdesc_entry_size_in_bytes)); - - /* When the current completion descriptor phase isn't the same as the - * expected, it mean that the device still didn't update - * this completion. 
- */ - cdesc_phase = READ_ONCE(cdesc->flags) & ENA_ETH_IO_TX_CDESC_PHASE_MASK; - if (cdesc_phase != expected_phase) - return -EAGAIN; - - if (unlikely(cdesc->req_id >= io_cq->q_depth)) { - pr_err("Invalid req id %d\n", cdesc->req_id); - return -EINVAL; - } - - ena_com_cq_inc_head(io_cq); - - *req_id = READ_ONCE(cdesc->req_id); - - return 0; + return ena_com_sq_update_tail(io_sq); } bool ena_com_cq_empty(struct ena_com_io_cq *io_cq) @@ -510,9 +599,8 @@ bool ena_com_cq_empty(struct ena_com_io_cq *io_cq) struct ena_eth_io_rx_cdesc_base *cdesc; cdesc = ena_com_get_next_rx_cdesc(io_cq); - if(cdesc) + if (cdesc) return false; else return true; } - diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index 2f7657227cfe9..2a37463bc9569 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -67,6 +67,7 @@ struct ena_com_rx_ctx { enum ena_eth_io_l4_proto_index l4_proto; bool l3_csum_err; bool l4_csum_err; + u8 l4_csum_checked; /* fragmented packet */ bool frag; u32 hash; @@ -86,8 +87,6 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, struct ena_com_buf *ena_buf, u16 req_id); -int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id); - bool ena_com_cq_empty(struct ena_com_io_cq *io_cq); static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, @@ -96,7 +95,7 @@ static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, writel(intr_reg->intr_control, io_cq->unmask_reg); } -static inline int ena_com_sq_empty_space(struct ena_com_io_sq *io_sq) +static inline int ena_com_free_desc(struct ena_com_io_sq *io_sq) { u16 tail, next_to_comp, cnt; @@ -107,17 +106,87 @@ static inline int ena_com_sq_empty_space(struct ena_com_io_sq *io_sq) return io_sq->q_depth - 1 - cnt; } -static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) +/* Check if the submission queue has enough space to hold required_buffers */ +static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, + u16 required_buffers) { - u16 tail; + int temp; - tail = io_sq->tail; + if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return ena_com_free_desc(io_sq) >= required_buffers; + + /* This calculation doesn't need to be 100% accurate. So to reduce + * the calculation overhead just Subtract 2 lines from the free descs + * (one for the header line and one to compensate the devision + * down calculation. 
+ */ + temp = required_buffers / io_sq->llq_info.descs_per_entry + 2; + + return ena_com_free_desc(io_sq) > temp; +} + +static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + if (!ena_tx_ctx->meta_valid) + return false; + + return !!memcmp(&io_sq->cached_tx_meta, + &ena_tx_ctx->ena_meta, + sizeof(struct ena_com_tx_meta)); +} + +static inline bool is_llq_max_tx_burst_exists(struct ena_com_io_sq *io_sq) +{ + return (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) && + io_sq->llq_info.max_entries_in_tx_burst > 0; +} + +static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_com_llq_info *llq_info; + int descs_after_first_entry; + int num_entries_needed = 1; + u16 num_descs; + + if (!is_llq_max_tx_burst_exists(io_sq)) + return false; + + llq_info = &io_sq->llq_info; + num_descs = ena_tx_ctx->num_bufs; + + if (unlikely(ena_com_meta_desc_changed(io_sq, ena_tx_ctx))) + ++num_descs; + + if (num_descs > llq_info->descs_num_before_header) { + descs_after_first_entry = num_descs - llq_info->descs_num_before_header; + num_entries_needed += DIV_ROUND_UP(descs_after_first_entry, + llq_info->descs_per_entry); + } + + pr_debug("queue: %d num_descs: %d num_entries_needed: %d\n", io_sq->qid, + num_descs, num_entries_needed); + + return num_entries_needed > io_sq->entries_in_tx_burst_left; +} + +static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) +{ + u16 tail = io_sq->tail; + u16 max_entries_in_tx_burst = io_sq->llq_info.max_entries_in_tx_burst; pr_debug("write submission queue doorbell for queue: %d tail: %d\n", io_sq->qid, tail); writel(tail, io_sq->db_addr); + if (is_llq_max_tx_burst_exists(io_sq)) { + pr_debug("reset available entries in tx burst for queue %d to %d\n", + io_sq->qid, max_entries_in_tx_burst); + io_sq->entries_in_tx_burst_left = max_entries_in_tx_burst; + } + return 0; } @@ -159,4 +228,48 @@ static inline void ena_com_comp_ack(struct ena_com_io_sq *io_sq, u16 elem) io_sq->next_to_comp += elem; } +static inline void ena_com_cq_inc_head(struct ena_com_io_cq *io_cq) +{ + io_cq->head++; + + /* Switch phase bit in case of wrap around */ + if (unlikely((io_cq->head & (io_cq->q_depth - 1)) == 0)) + io_cq->phase ^= 1; +} + +static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, + u16 *req_id) +{ + u8 expected_phase, cdesc_phase; + struct ena_eth_io_tx_cdesc *cdesc; + u16 masked_head; + + masked_head = io_cq->head & (io_cq->q_depth - 1); + expected_phase = io_cq->phase; + + cdesc = (struct ena_eth_io_tx_cdesc *) + ((uintptr_t)io_cq->cdesc_addr.virt_addr + + (masked_head * io_cq->cdesc_entry_size_in_bytes)); + + /* When the current completion descriptor phase isn't the same as the + * expected, it mean that the device still didn't update + * this completion. 
+ */ + cdesc_phase = READ_ONCE(cdesc->flags) & ENA_ETH_IO_TX_CDESC_PHASE_MASK; + if (cdesc_phase != expected_phase) + return -EAGAIN; + + dma_rmb(); + + *req_id = READ_ONCE(cdesc->req_id); + if (unlikely(*req_id >= io_cq->q_depth)) { + pr_err("Invalid req id %d\n", cdesc->req_id); + return -EINVAL; + } + + ena_com_cq_inc_head(io_cq); + + return 0; +} + #endif /* ENA_ETH_COM_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h old mode 100644 new mode 100755 index f320c58793a52..bab1591c8b9cd --- a/drivers/amazon/net/ena/ena_eth_io_defs.h +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,25 +34,18 @@ #define _ENA_ETH_IO_H_ enum ena_eth_io_l3_proto_index { - ENA_ETH_IO_L3_PROTO_UNKNOWN = 0, - - ENA_ETH_IO_L3_PROTO_IPV4 = 8, - - ENA_ETH_IO_L3_PROTO_IPV6 = 11, - - ENA_ETH_IO_L3_PROTO_FCOE = 21, - - ENA_ETH_IO_L3_PROTO_ROCE = 22, + ENA_ETH_IO_L3_PROTO_UNKNOWN = 0, + ENA_ETH_IO_L3_PROTO_IPV4 = 8, + ENA_ETH_IO_L3_PROTO_IPV6 = 11, + ENA_ETH_IO_L3_PROTO_FCOE = 21, + ENA_ETH_IO_L3_PROTO_ROCE = 22, }; enum ena_eth_io_l4_proto_index { - ENA_ETH_IO_L4_PROTO_UNKNOWN = 0, - - ENA_ETH_IO_L4_PROTO_TCP = 12, - - ENA_ETH_IO_L4_PROTO_UDP = 13, - - ENA_ETH_IO_L4_PROTO_ROUTEABLE_ROCE = 23, + ENA_ETH_IO_L4_PROTO_UNKNOWN = 0, + ENA_ETH_IO_L4_PROTO_TCP = 12, + ENA_ETH_IO_L4_PROTO_UDP = 13, + ENA_ETH_IO_L4_PROTO_ROUTEABLE_ROCE = 23, }; struct ena_eth_io_tx_desc { @@ -242,9 +236,13 @@ struct ena_eth_io_rx_cdesc_base { * checksum error detected, or, the controller didn't * validate the checksum. This bit is valid only when * l4_proto_idx indicates TCP/UDP packet, and, - * ipv4_frag is not set + * ipv4_frag is not set. This bit is valid only when + * l4_csum_checked below is set. 
* 15 : ipv4_frag - Indicates IPv4 fragmented packet - * 23:16 : reserved16 + * 16 : l4_csum_checked - L4 checksum was verified + * (could be OK or error), when cleared the status of + * checksum is unknown + * 23:17 : reserved17 - MBZ * 24 : phase * 25 : l3_csum2 - second checksum engine result * 26 : first - Indicates first descriptor in @@ -303,114 +301,116 @@ struct ena_eth_io_numa_node_cfg_reg { }; /* tx_desc */ -#define ENA_ETH_IO_TX_DESC_LENGTH_MASK GENMASK(15, 0) -#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT 16 -#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) -#define ENA_ETH_IO_TX_DESC_META_DESC_SHIFT 23 -#define ENA_ETH_IO_TX_DESC_META_DESC_MASK BIT(23) -#define ENA_ETH_IO_TX_DESC_PHASE_SHIFT 24 -#define ENA_ETH_IO_TX_DESC_PHASE_MASK BIT(24) -#define ENA_ETH_IO_TX_DESC_FIRST_SHIFT 26 -#define ENA_ETH_IO_TX_DESC_FIRST_MASK BIT(26) -#define ENA_ETH_IO_TX_DESC_LAST_SHIFT 27 -#define ENA_ETH_IO_TX_DESC_LAST_MASK BIT(27) -#define ENA_ETH_IO_TX_DESC_COMP_REQ_SHIFT 28 -#define ENA_ETH_IO_TX_DESC_COMP_REQ_MASK BIT(28) -#define ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) -#define ENA_ETH_IO_TX_DESC_DF_SHIFT 4 -#define ENA_ETH_IO_TX_DESC_DF_MASK BIT(4) -#define ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT 7 -#define ENA_ETH_IO_TX_DESC_TSO_EN_MASK BIT(7) -#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT 8 -#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) -#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT 13 -#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK BIT(13) -#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT 14 -#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK BIT(14) -#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 -#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) -#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 -#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) -#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT 22 -#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) -#define ENA_ETH_IO_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) -#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT 24 -#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) +#define ENA_ETH_IO_TX_DESC_LENGTH_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_SHIFT 16 +#define ENA_ETH_IO_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) +#define ENA_ETH_IO_TX_DESC_DF_SHIFT 4 +#define ENA_ETH_IO_TX_DESC_DF_MASK BIT(4) +#define ENA_ETH_IO_TX_DESC_TSO_EN_SHIFT 7 +#define ENA_ETH_IO_TX_DESC_TSO_EN_MASK BIT(7) +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_SHIFT 13 +#define ENA_ETH_IO_TX_DESC_L3_CSUM_EN_MASK BIT(13) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_SHIFT 14 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_EN_MASK BIT(14) +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 +#define ENA_ETH_IO_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 +#define ENA_ETH_IO_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) +#define ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT 22 +#define 
ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) +#define ENA_ETH_IO_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT 24 +#define ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) /* tx_meta_desc */ -#define ENA_ETH_IO_TX_META_DESC_REQ_ID_LO_MASK GENMASK(9, 0) -#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_SHIFT 14 -#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK BIT(14) -#define ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT 16 -#define ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK GENMASK(19, 16) -#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_SHIFT 20 -#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK BIT(20) -#define ENA_ETH_IO_TX_META_DESC_META_STORE_SHIFT 21 -#define ENA_ETH_IO_TX_META_DESC_META_STORE_MASK BIT(21) -#define ENA_ETH_IO_TX_META_DESC_META_DESC_SHIFT 23 -#define ENA_ETH_IO_TX_META_DESC_META_DESC_MASK BIT(23) -#define ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT 24 -#define ENA_ETH_IO_TX_META_DESC_PHASE_MASK BIT(24) -#define ENA_ETH_IO_TX_META_DESC_FIRST_SHIFT 26 -#define ENA_ETH_IO_TX_META_DESC_FIRST_MASK BIT(26) -#define ENA_ETH_IO_TX_META_DESC_LAST_SHIFT 27 -#define ENA_ETH_IO_TX_META_DESC_LAST_MASK BIT(27) -#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_SHIFT 28 -#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_MASK BIT(28) -#define ENA_ETH_IO_TX_META_DESC_REQ_ID_HI_MASK GENMASK(5, 0) -#define ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK GENMASK(7, 0) -#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT 8 -#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK GENMASK(15, 8) -#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT 16 -#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK GENMASK(21, 16) -#define ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT 22 -#define ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK GENMASK(31, 22) +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_LO_MASK GENMASK(9, 0) +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_SHIFT 14 +#define ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK BIT(14) +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_MSS_HI_MASK GENMASK(19, 16) +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_SHIFT 20 +#define ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK BIT(20) +#define ENA_ETH_IO_TX_META_DESC_META_STORE_SHIFT 21 +#define ENA_ETH_IO_TX_META_DESC_META_STORE_MASK BIT(21) +#define ENA_ETH_IO_TX_META_DESC_META_DESC_SHIFT 23 +#define ENA_ETH_IO_TX_META_DESC_META_DESC_MASK BIT(23) +#define ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT 24 +#define ENA_ETH_IO_TX_META_DESC_PHASE_MASK BIT(24) +#define ENA_ETH_IO_TX_META_DESC_FIRST_SHIFT 26 +#define ENA_ETH_IO_TX_META_DESC_FIRST_MASK BIT(26) +#define ENA_ETH_IO_TX_META_DESC_LAST_SHIFT 27 +#define ENA_ETH_IO_TX_META_DESC_LAST_MASK BIT(27) +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_SHIFT 28 +#define ENA_ETH_IO_TX_META_DESC_COMP_REQ_MASK BIT(28) +#define ENA_ETH_IO_TX_META_DESC_REQ_ID_HI_MASK GENMASK(5, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK GENMASK(7, 0) +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT 8 +#define ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK GENMASK(15, 8) +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT 16 +#define ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK GENMASK(21, 16) +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT 22 +#define ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK GENMASK(31, 22) /* tx_cdesc */ -#define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_TX_CDESC_PHASE_MASK BIT(0) /* rx_desc */ -#define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) -#define ENA_ETH_IO_RX_DESC_FIRST_SHIFT 2 -#define ENA_ETH_IO_RX_DESC_FIRST_MASK BIT(2) -#define ENA_ETH_IO_RX_DESC_LAST_SHIFT 3 
-#define ENA_ETH_IO_RX_DESC_LAST_MASK BIT(3) -#define ENA_ETH_IO_RX_DESC_COMP_REQ_SHIFT 4 -#define ENA_ETH_IO_RX_DESC_COMP_REQ_MASK BIT(4) +#define ENA_ETH_IO_RX_DESC_PHASE_MASK BIT(0) +#define ENA_ETH_IO_RX_DESC_FIRST_SHIFT 2 +#define ENA_ETH_IO_RX_DESC_FIRST_MASK BIT(2) +#define ENA_ETH_IO_RX_DESC_LAST_SHIFT 3 +#define ENA_ETH_IO_RX_DESC_LAST_MASK BIT(3) +#define ENA_ETH_IO_RX_DESC_COMP_REQ_SHIFT 4 +#define ENA_ETH_IO_RX_DESC_COMP_REQ_MASK BIT(4) /* rx_cdesc_base */ -#define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) -#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 -#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) -#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 -#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) -#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 -#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK BIT(13) -#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT 14 -#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK BIT(14) -#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT 15 -#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) -#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 24 -#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) -#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 -#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_MASK BIT(25) -#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT 26 -#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK BIT(26) -#define ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT 27 -#define ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK BIT(27) -#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_SHIFT 30 -#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_MASK BIT(30) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK GENMASK(4, 0) +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_SHIFT 5 +#define ENA_ETH_IO_RX_CDESC_BASE_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_SHIFT 8 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_SHIFT 13 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM_ERR_MASK BIT(13) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_SHIFT 14 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_ERR_MASK BIT(14) +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT 15 +#define ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK BIT(15) +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_SHIFT 16 +#define ENA_ETH_IO_RX_CDESC_BASE_L4_CSUM_CHECKED_MASK BIT(16) +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_SHIFT 24 +#define ENA_ETH_IO_RX_CDESC_BASE_PHASE_MASK BIT(24) +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_SHIFT 25 +#define ENA_ETH_IO_RX_CDESC_BASE_L3_CSUM2_MASK BIT(25) +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT 26 +#define ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK BIT(26) +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT 27 +#define ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK BIT(27) +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_SHIFT 30 +#define ENA_ETH_IO_RX_CDESC_BASE_BUFFER_MASK BIT(30) /* intr_reg */ -#define ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK GENMASK(14, 0) -#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT 15 -#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) -#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 -#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) +#define ENA_ETH_IO_INTR_REG_RX_INTR_DELAY_MASK GENMASK(14, 0) +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_SHIFT 15 +#define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 +#define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) /* numa_node_cfg_reg */ -#define 
ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) -#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 -#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 +#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) #endif /*_ENA_ETH_IO_H_ */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c old mode 100644 new mode 100755 index fcd002f0a7fbd..557fb2259278c --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -81,6 +81,7 @@ static const struct ena_stats ena_stats_tx_strings[] = { ENA_STAT_TX_ENTRY(doorbells), ENA_STAT_TX_ENTRY(prepare_ctx_err), ENA_STAT_TX_ENTRY(bad_req_id), + ENA_STAT_TX_ENTRY(llq_buffer_copy), ENA_STAT_TX_ENTRY(missed_tx), }; @@ -101,6 +102,7 @@ static const struct ena_stats ena_stats_rx_strings[] = { #endif ENA_STAT_RX_ENTRY(bad_req_id), ENA_STAT_RX_ENTRY(empty_rx_ring), + ENA_STAT_RX_ENTRY(csum_unchecked), }; static const struct ena_stats ena_stats_ena_com_strings[] = { @@ -200,15 +202,24 @@ static void ena_get_ethtool_stats(struct net_device *netdev, ena_dev_admin_queue_stats(adapter, &data); } +static int get_stats_sset_count(struct ena_adapter *adapter) +{ + return adapter->num_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; +} + int ena_get_sset_count(struct net_device *netdev, int sset) { struct ena_adapter *adapter = netdev_priv(netdev); - if (sset != ETH_SS_STATS) + switch (sset) { + case ETH_SS_STATS: + return get_stats_sset_count(adapter); + case ETH_SS_PRIV_FLAGS: + return adapter->ena_extra_properties_count; + default: return -EOPNOTSUPP; - - return adapter->num_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) - + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; + } } static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) @@ -250,26 +261,56 @@ static void ena_com_dev_strings(u8 **data) } } -static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) +static void get_stats_strings(struct ena_adapter *adapter, u8 *data) { - struct ena_adapter *adapter = netdev_priv(netdev); const struct ena_stats *ena_stats; int i; - if (sset != ETH_SS_STATS) - return; - for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; - memcpy(data, ena_stats->name, ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } - ena_queue_strings(adapter, &data); ena_com_dev_strings(&data); } +static void get_private_flags_strings(struct ena_adapter *adapter, u8 *data) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + u8 *strings = ena_dev->extra_properties_strings.virt_addr; + int i; + + if (unlikely(!strings)) { + adapter->ena_extra_properties_count = 0; + netif_err(adapter, drv, adapter->netdev, + "Failed to allocate extra properties strings\n"); + return; + } + + for (i = 0; i < adapter->ena_extra_properties_count; i++) { + snprintf(data, ETH_GSTRING_LEN, "%s", + strings + ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN * i); + data += ETH_GSTRING_LEN; + } +} + +static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + switch (sset) { + case ETH_SS_STATS: + get_stats_strings(adapter, data); + break; + case ETH_SS_PRIV_FLAGS: + get_private_flags_strings(adapter, data); + break; + default: + break; + } +} + #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) static int ena_get_link_ksettings(struct net_device 
*netdev, struct ethtool_link_ksettings *link_ksettings) @@ -461,6 +502,7 @@ static void ena_get_drvinfo(struct net_device *dev, strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); strlcpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); + info->n_priv_flags = adapter->ena_extra_properties_count; } static void ena_get_ringparam(struct net_device *netdev, @@ -899,6 +941,20 @@ static int ena_set_tunable(struct net_device *netdev, } #endif /* 3.18.0 */ +static u32 ena_get_priv_flags(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct ena_admin_get_feat_resp get_resp; + u32 rc; + + rc = ena_com_get_extra_properties_flags(ena_dev, &get_resp); + if (!rc) + return get_resp.u.extra_properties_flags.flags; + + return 0; +} + static const struct ethtool_ops ena_ethtool_ops = { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) .get_link_ksettings = ena_get_link_ksettings, @@ -939,6 +995,7 @@ static const struct ethtool_ops ena_ethtool_ops = { .get_tunable = ena_get_tunable, .set_tunable = ena_set_tunable, #endif + .get_priv_flags = ena_get_priv_flags, }; void ena_set_ethtool_ops(struct net_device *netdev) @@ -960,8 +1017,8 @@ static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) return; } - strings_buf = devm_kzalloc(&adapter->pdev->dev, - strings_num * ETH_GSTRING_LEN, + strings_buf = devm_kcalloc(&adapter->pdev->dev, + ETH_GSTRING_LEN, strings_num, GFP_ATOMIC); if (!strings_buf) { netif_err(adapter, drv, netdev, @@ -969,8 +1026,8 @@ static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) return; } - data_buf = devm_kzalloc(&adapter->pdev->dev, - strings_num * sizeof(u64), + data_buf = devm_kcalloc(&adapter->pdev->dev, + strings_num, sizeof(u64), GFP_ATOMIC); if (!data_buf) { netif_err(adapter, drv, netdev, diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c old mode 100644 new mode 100755 index 4f82a3df99f70..4e3a5cc368c2e --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -72,6 +72,10 @@ static int debug = -1; module_param(debug, int, 0); MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); +static int rx_queue_size = ENA_DEFAULT_RING_SIZE; +module_param(rx_queue_size, int, S_IRUGO); +MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. 
Max value is 8K\n"); + static struct ena_aenq_handlers aenq_handlers; static struct workqueue_struct *ena_wq; @@ -80,8 +84,12 @@ MODULE_DEVICE_TABLE(pci, ena_pci_tbl); static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); -static void ena_destroy_device(struct ena_adapter *adapter); +static void ena_destroy_device(struct ena_adapter *adapter, bool graceful); static int ena_restore_device(struct ena_adapter *adapter); +static int ena_calc_io_queue_num(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx); +static int ena_calc_queue_size(struct ena_calc_queue_size_ctx *ctx); static void ena_tx_timeout(struct net_device *dev) { @@ -254,6 +262,17 @@ static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) } } + size = tx_ring->tx_max_header_size; + tx_ring->push_buf_intermediate_buf = vzalloc_node(size, node); + if (!tx_ring->push_buf_intermediate_buf) { + tx_ring->push_buf_intermediate_buf = vzalloc(size); + if (!tx_ring->push_buf_intermediate_buf) { + vfree(tx_ring->tx_buffer_info); + vfree(tx_ring->free_tx_ids); + return -ENOMEM; + } + } + /* Req id ring for TX out of order completions */ for (i = 0; i < tx_ring->ring_size; i++) tx_ring->free_tx_ids[i] = i; @@ -282,6 +301,9 @@ static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) vfree(tx_ring->free_tx_ids); tx_ring->free_tx_ids = NULL; + + vfree(tx_ring->push_buf_intermediate_buf); + tx_ring->push_buf_intermediate_buf = NULL; } /* ena_setup_all_tx_resources - allocate I/O Tx queues resources for All queues @@ -480,7 +502,7 @@ static inline int ena_alloc_rx_page(struct ena_ring *rx_ring, return -ENOMEM; } - dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, + dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, DMA_FROM_DEVICE); if (unlikely(dma_mapping_error(rx_ring->dev, dma))) { u64_stats_update_begin(&rx_ring->syncp); @@ -497,7 +519,7 @@ static inline int ena_alloc_rx_page(struct ena_ring *rx_ring, rx_info->page_offset = 0; ena_buf = &rx_info->ena_buf; ena_buf->paddr = dma; - ena_buf->len = PAGE_SIZE; + ena_buf->len = ENA_PAGE_SIZE; return 0; } @@ -514,7 +536,7 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, return; } - dma_unmap_page(rx_ring->dev, ena_buf->paddr, PAGE_SIZE, + dma_unmap_page(rx_ring->dev, ena_buf->paddr, ENA_PAGE_SIZE, DMA_FROM_DEVICE); __free_page(page); @@ -541,7 +563,11 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_page(rx_ring, rx_info, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) + GFP_ATOMIC | __GFP_COMP); +#else __GFP_COLD | GFP_ATOMIC | __GFP_COMP); +#endif if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "failed to alloc buffer for rx queue %d\n", @@ -570,13 +596,9 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rx_ring->qid, i, num); } - if (likely(i)) { - /* Add memory barrier to make sure the desc were written before - * issue a doorbell - */ - wmb(); + /* ena_com_write_sq_doorbell issues a wmb() */ + if (likely(i)) ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq); - } rx_ring->next_to_use = next_to_use; @@ -626,6 +648,36 @@ static void ena_free_all_rx_bufs(struct ena_adapter *adapter) ena_free_rx_bufs(adapter, i); } +static inline void ena_unmap_tx_skb(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info) +{ + struct ena_com_buf *ena_buf; + u32 cnt; + int i; + + ena_buf = tx_info->bufs; + cnt = tx_info->num_of_bufs; + + if 
(unlikely(!cnt)) + return; + + if (tx_info->map_linear_data) { + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), + DMA_TO_DEVICE); + ena_buf++; + cnt--; + } + + /* unmap remaining mapped pages */ + for (i = 0; i < cnt; i++) { + dma_unmap_page(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), + dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); + ena_buf++; + } +} + /* ena_free_tx_bufs - Free Tx Buffers per Queue * @tx_ring: TX ring for which buffers be freed */ @@ -636,9 +688,6 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) for (i = 0; i < tx_ring->ring_size; i++) { struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; - struct ena_com_buf *ena_buf; - int nr_frags; - int j; if (!tx_info->skb) continue; @@ -654,21 +703,7 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) tx_ring->qid, i); } - ena_buf = tx_info->bufs; - dma_unmap_single(tx_ring->dev, - ena_buf->paddr, - ena_buf->len, - DMA_TO_DEVICE); - - /* unmap remaining mapped pages */ - nr_frags = tx_info->num_of_bufs - 1; - for (j = 0; j < nr_frags; j++) { - ena_buf++; - dma_unmap_page(tx_ring->dev, - ena_buf->paddr, - ena_buf->len, - DMA_TO_DEVICE); - } + ena_unmap_tx_skb(tx_ring, tx_info); dev_kfree_skb_any(tx_info->skb); } @@ -759,8 +794,6 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) while (tx_pkts < budget) { struct ena_tx_buffer *tx_info; struct sk_buff *skb; - struct ena_com_buf *ena_buf; - int i, nr_frags; rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id); @@ -780,24 +813,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) tx_info->skb = NULL; tx_info->last_jiffies = 0; - if (likely(tx_info->num_of_bufs != 0)) { - ena_buf = tx_info->bufs; - - dma_unmap_single(tx_ring->dev, - dma_unmap_addr(ena_buf, paddr), - dma_unmap_len(ena_buf, len), - DMA_TO_DEVICE); - - /* unmap remaining mapped pages */ - nr_frags = tx_info->num_of_bufs - 1; - for (i = 0; i < nr_frags; i++) { - ena_buf++; - dma_unmap_page(tx_ring->dev, - dma_unmap_addr(ena_buf, paddr), - dma_unmap_len(ena_buf, len), - DMA_TO_DEVICE); - } - } + ena_unmap_tx_skb(tx_ring, tx_info); netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, "tx_poll: q %d skb %p completed\n", tx_ring->qid, @@ -826,12 +842,13 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) */ smp_mb(); - above_thresh = ena_com_sq_empty_space(tx_ring->ena_com_io_sq) > - ENA_TX_WAKEUP_THRESH; + above_thresh = ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); if (unlikely(netif_tx_queue_stopped(txq) && above_thresh)) { __netif_tx_lock(txq, smp_processor_id()); - above_thresh = ena_com_sq_empty_space(tx_ring->ena_com_io_sq) > - ENA_TX_WAKEUP_THRESH; + above_thresh = + ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH); if (netif_tx_queue_stopped(txq) && above_thresh) { netif_tx_wake_queue(txq); u64_stats_update_begin(&tx_ring->syncp); @@ -944,10 +961,10 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, do { dma_unmap_page(rx_ring->dev, dma_unmap_addr(&rx_info->ena_buf, paddr), - PAGE_SIZE, DMA_FROM_DEVICE); + ENA_PAGE_SIZE, DMA_FROM_DEVICE); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, - rx_info->page_offset, len, PAGE_SIZE); + rx_info->page_offset, len, ENA_PAGE_SIZE); netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx skb updated. len %d. 
data_len %d\n", @@ -1036,8 +1053,19 @@ static inline void ena_rx_checksum(struct ena_ring *rx_ring, return; } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (likely(ena_rx_ctx->l4_csum_checked)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.csum_unchecked++; + u64_stats_update_end(&rx_ring->syncp); + skb->ip_summed = CHECKSUM_NONE; + } + } else { + skb->ip_summed = CHECKSUM_NONE; + return; } + } static void ena_set_rx_hash(struct ena_ring *rx_ring, @@ -1168,8 +1196,10 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, rx_ring->next_to_clean = next_to_clean; - refill_required = ena_com_sq_empty_space(rx_ring->ena_com_io_sq); - refill_threshold = rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER; + refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); /* Optimization, try to batch new rx buffers */ if (refill_required > refill_threshold) { @@ -1353,16 +1383,17 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data) { struct ena_napi *ena_napi = data; + ena_napi->tx_ring->first_interrupt = true; + ena_napi->rx_ring->first_interrupt = true; + #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) napi_schedule_irqoff(&ena_napi->napi); #else + smp_mb__before_atomic(); atomic_set(&ena_napi->unmask_interrupt, 1); napi_schedule_irqoff(&ena_napi->napi); #endif - ena_napi->tx_ring->first_interrupt = true; - ena_napi->rx_ring->first_interrupt = true; - return IRQ_HANDLED; } @@ -1386,7 +1417,6 @@ static int ena_enable_msix(struct ena_adapter *adapter, int num_queues) /* Reserved the max msix vectors we might need */ msix_vecs = ENA_MAX_MSIX_VEC(num_queues); - netif_dbg(adapter, probe, adapter->netdev, "trying to enable MSI-X, vectors %d\n", msix_vecs); @@ -1722,8 +1752,6 @@ static int ena_up_complete(struct ena_adapter *adapter) if (rc) return rc; - ena_init_napi(adapter); - ena_change_mtu(adapter->netdev, adapter->netdev->mtu); ena_refill_all_rx_bufs(adapter); @@ -1740,7 +1768,7 @@ static int ena_up_complete(struct ena_adapter *adapter) static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) { - struct ena_com_create_io_ctx ctx = { 0 }; + struct ena_com_create_io_ctx ctx; struct ena_com_dev *ena_dev; struct ena_ring *tx_ring; u32 msix_vector; @@ -1753,6 +1781,8 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) msix_vector = ENA_IO_IRQ_IDX(qid); ena_qid = ENA_IO_TXQ_IDX(qid); + memset(&ctx, 0x0, sizeof(ctx)); + ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_TX; ctx.qid = ena_qid; ctx.mem_queue_type = ena_dev->tx_mem_queue_type; @@ -1806,7 +1836,7 @@ static int ena_create_all_io_tx_queues(struct ena_adapter *adapter) static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) { struct ena_com_dev *ena_dev; - struct ena_com_create_io_ctx ctx = { 0 }; + struct ena_com_create_io_ctx ctx; struct ena_ring *rx_ring; u32 msix_vector; u16 ena_qid; @@ -1818,6 +1848,8 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) msix_vector = ENA_IO_IRQ_IDX(qid); ena_qid = ENA_IO_RXQ_IDX(qid); + memset(&ctx, 0x0, sizeof(ctx)); + ctx.qid = ena_qid; ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX; ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; @@ -1877,6 +1909,13 @@ static int ena_up(struct ena_adapter *adapter) ena_setup_io_intr(adapter); + /* napi poll functions should be initialized before running + * request_irq(), to 
handle a rare condition where there is a pending + * interrupt, causing the ISR to fire immediately while the poll + * function wasn't set yet, causing a null dereference + */ + ena_init_napi(adapter); + rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq; @@ -1966,6 +2005,8 @@ static void ena_down(struct ena_adapter *adapter) rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); if (rc) dev_err(&adapter->pdev->dev, "Device reset failed\n"); + /* stop submitting admin commands on a device that was reset */ + ena_com_set_admin_running_state(adapter->ena_dev, false); } ena_destroy_all_io_queues(adapter); @@ -2032,6 +2073,9 @@ static int ena_close(struct net_device *netdev) netif_dbg(adapter, ifdown, netdev, "%s\n", __func__); + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return 0; + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) ena_down(adapter); @@ -2042,7 +2086,7 @@ static int ena_close(struct net_device *netdev) "Destroy failure, restarting device\n"); ena_dump_stats_to_dmesg(adapter); /* rtnl lock already obtained in dev_ioctl() layer */ - ena_destroy_device(adapter); + ena_destroy_device(adapter, false); ena_restore_device(adapter); } @@ -2128,74 +2172,70 @@ static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, return rc; } -/* Called with netif_tx_lock. */ -static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int ena_tx_map_skb(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct sk_buff *skb, + void **push_hdr, + u16 *header_len) { - struct ena_adapter *adapter = netdev_priv(dev); - struct ena_tx_buffer *tx_info; - struct ena_com_tx_ctx ena_tx_ctx; - struct ena_ring *tx_ring; - struct netdev_queue *txq; + struct ena_adapter *adapter = tx_ring->adapter; struct ena_com_buf *ena_buf; - void *push_hdr; - u32 len, pkt_len, last_frag; - u16 next_to_use; - u16 req_id; - u16 push_len; - u16 header_len; dma_addr_t dma; - int qid, rc, nb_hw_desc; - int i = -1; - - netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); - /* Determine which tx ring we will be placed on */ - qid = skb_get_queue_mapping(skb); - tx_ring = &adapter->tx_ring[qid]; - txq = netdev_get_tx_queue(dev, qid); + u32 skb_head_len, frag_len, last_frag; + u16 push_len = 0; + u16 delta = 0; + int i = 0; - rc = ena_check_and_linearize_skb(tx_ring, skb); - if (unlikely(rc)) - goto error_drop_packet; - - skb_tx_timestamp(skb); - len = skb_headlen(skb); - pkt_len = skb->len; - - next_to_use = tx_ring->next_to_use; - req_id = tx_ring->free_tx_ids[next_to_use]; - tx_info = &tx_ring->tx_buffer_info[req_id]; - tx_info->num_of_bufs = 0; - - WARN(tx_info->skb, "SKB isn't NULL req_id %d\n", req_id); - ena_buf = tx_info->bufs; + skb_head_len = skb_headlen(skb); tx_info->skb = skb; + ena_buf = tx_info->bufs; if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - /* prepared the push buffer */ - push_len = min_t(u32, pkt_len, tx_ring->tx_max_header_size); - header_len = push_len; - push_hdr = skb->data; + /* When the device is LLQ mode, the driver will copy + * the header into the device memory space. + * the ena_com layer assume the header is in a linear + * memory space. + * This assumption might be wrong since part of the header + * can be in the fragmented buffers. + * Use skb_header_pointer to make sure the header is in a + * linear memory space. 
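+		 * If the copy happened (skb->data != *push_hdr below), the
+		 * llq_buffer_copy counter is bumped and 'delta' records how many
+		 * fragment bytes were already consumed by the pushed header, so
+		 * the fragment-mapping loop further down skips them.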
+ */ + + push_len = min_t(u32, skb->len, tx_ring->tx_max_header_size); + *push_hdr = skb_header_pointer(skb, 0, push_len, + tx_ring->push_buf_intermediate_buf); + *header_len = push_len; + if (unlikely(skb->data != *push_hdr)) { + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.llq_buffer_copy++; + u64_stats_update_end(&tx_ring->syncp); + + delta = push_len - skb_head_len; + } } else { - push_hdr = NULL; - push_len = 0; - header_len = min_t(u32, len, tx_ring->tx_max_header_size); + *push_hdr = NULL; + *header_len = min_t(u32, skb_head_len, + tx_ring->tx_max_header_size); } - netif_dbg(adapter, tx_queued, dev, + netif_dbg(adapter, tx_queued, adapter->netdev, "skb: %p header_buf->vaddr: %p push_len: %d\n", skb, - push_hdr, push_len); + *push_hdr, push_len); - if (len > push_len) { + if (skb_head_len > push_len) { dma = dma_map_single(tx_ring->dev, skb->data + push_len, - len - push_len, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) + skb_head_len - push_len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) goto error_report_dma_error; ena_buf->paddr = dma; - ena_buf->len = len - push_len; + ena_buf->len = skb_head_len - push_len; ena_buf++; tx_info->num_of_bufs++; + tx_info->map_linear_data = 1; + } else { + tx_info->map_linear_data = 0; } last_frag = skb_shinfo(skb)->nr_frags; @@ -2203,18 +2243,76 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) for (i = 0; i < last_frag; i++) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - len = skb_frag_size(frag); - dma = skb_frag_dma_map(tx_ring->dev, frag, 0, len, - DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) + frag_len = skb_frag_size(frag); + + if (unlikely(delta >= frag_len)) { + delta -= frag_len; + continue; + } + + dma = skb_frag_dma_map(tx_ring->dev, frag, delta, + frag_len - delta, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) goto error_report_dma_error; ena_buf->paddr = dma; - ena_buf->len = len; + ena_buf->len = frag_len - delta; ena_buf++; + tx_info->num_of_bufs++; + delta = 0; } - tx_info->num_of_bufs += last_frag; + return 0; + +error_report_dma_error: + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.dma_mapping_err++; + u64_stats_update_end(&tx_ring->syncp); + netdev_warn(adapter->netdev, "failed to map skb\n"); + + tx_info->skb = NULL; + + tx_info->num_of_bufs += i; + ena_unmap_tx_skb(tx_ring, tx_info); + + return -EINVAL; +} + + +/* Called with netif_tx_lock. 
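+ * The stack serializes transmissions per TX queue, so this handler is
+ * not re-entered concurrently for the same queue.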
*/ +static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_tx_buffer *tx_info; + struct ena_com_tx_ctx ena_tx_ctx; + struct ena_ring *tx_ring; + struct netdev_queue *txq; + void *push_hdr; + u16 next_to_use, req_id, header_len; + int qid, rc, nb_hw_desc; + + netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); + /* Determine which tx ring we will be placed on */ + qid = skb_get_queue_mapping(skb); + tx_ring = &adapter->tx_ring[qid]; + txq = netdev_get_tx_queue(dev, qid); + + rc = ena_check_and_linearize_skb(tx_ring, skb); + if (unlikely(rc)) + goto error_drop_packet; + + skb_tx_timestamp(skb); + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_tx_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + WARN(tx_info->skb, "SKB isn't NULL req_id %d\n", req_id); + + rc = ena_tx_map_skb(tx_ring, tx_info, skb, &push_hdr, &header_len); + if (unlikely(rc)) + goto error_drop_packet; memset(&ena_tx_ctx, 0x0, sizeof(struct ena_com_tx_ctx)); ena_tx_ctx.ena_bufs = tx_info->bufs; @@ -2226,18 +2324,33 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) /* set flags and meta data */ ena_tx_csum(&ena_tx_ctx, skb); + if (unlikely(ena_com_is_doorbell_needed(tx_ring->ena_com_io_sq, &ena_tx_ctx))) { + netif_dbg(adapter, tx_queued, dev, + "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", + qid); + ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); + } + /* prepare the packet's descriptors to dma engine */ rc = ena_com_prepare_tx(tx_ring->ena_com_io_sq, &ena_tx_ctx, &nb_hw_desc); + /* ena_com_prepare_tx() can't fail due to overflow of tx queue, + * since the number of free descriptors in the queue is checked + * after sending the previous packet. In case there isn't enough + * space in the queue for the next packet, it is stopped + * until there is again enough available space in the queue. + * All other failure reasons of ena_com_prepare_tx() are fatal + * and therefore require a device reset. + */ if (unlikely(rc)) { netif_err(adapter, tx_queued, dev, "failed to prepare tx bufs\n"); u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.queue_stop++; tx_ring->tx_stats.prepare_ctx_err++; u64_stats_update_end(&tx_ring->syncp); - netif_tx_stop_queue(txq); + adapter->reset_reason = ENA_REGS_RESET_DRIVER_INVALID_STATE; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); goto error_unmap_dma; } @@ -2253,18 +2366,12 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, tx_ring->ring_size); - /* This WMB is aimed to: - * 1 - perform smp barrier before reading next_to_completion - * 2 - make sure the desc were written before trigger DB - */ - wmb(); - /* stop the queue when no more space available, the packet can have up * to sgl_size + 2. one for the meta descriptor and one for header * (if the header is larger than tx_max_header_size). */ - if (unlikely(ena_com_sq_empty_space(tx_ring->ena_com_io_sq) < - (tx_ring->sgl_size + 2))) { + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + tx_ring->sgl_size + 2))) { netif_dbg(adapter, tx_queued, dev, "%s stop queue %d\n", __func__, qid); @@ -2277,13 +2384,14 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) * stop the queue but meanwhile clean_tx_irq updates * next_to_completion and terminates. 
* The queue will remain stopped forever. - * To solve this issue this function perform rmb, check - * the wakeup condition and wake up the queue if needed. + * To solve this issue add a mb() to make sure that + * netif_tx_stop_queue() write is vissible before checking if + * there is additional space in the queue. */ - smp_rmb(); + smp_mb(); - if (ena_com_sq_empty_space(tx_ring->ena_com_io_sq) - > ENA_TX_WAKEUP_THRESH) { + if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, + ENA_TX_WAKEUP_THRESH)) { netif_tx_wake_queue(txq); u64_stats_update_begin(&tx_ring->syncp); tx_ring->tx_stats.queue_wakeup++; @@ -2294,7 +2402,9 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) if (netif_xmit_stopped(txq) || !skb->xmit_more) { #endif - /* trigger the dma engine */ + /* trigger the dma engine. ena_com_write_sq_doorbell() + * has a mb + */ ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); u64_stats_update_begin(&tx_ring->syncp); tx_ring->tx_stats.doorbells++; @@ -2305,61 +2415,23 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; -error_report_dma_error: - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.dma_mapping_err++; - u64_stats_update_end(&tx_ring->syncp); - netdev_warn(adapter->netdev, "failed to map skb\n"); - - tx_info->skb = NULL; - error_unmap_dma: - if (i >= 0) { - /* save value of frag that failed */ - last_frag = i; - - /* start back at beginning and unmap skb */ - tx_info->skb = NULL; - ena_buf = tx_info->bufs; - dma_unmap_single(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), - dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); - - /* unmap remaining mapped pages */ - for (i = 0; i < last_frag; i++) { - ena_buf++; - dma_unmap_page(tx_ring->dev, dma_unmap_addr(ena_buf, paddr), - dma_unmap_len(ena_buf, len), DMA_TO_DEVICE); - } - } + ena_unmap_tx_skb(tx_ring, tx_info); + tx_info->skb = NULL; error_drop_packet: - dev_kfree_skb(skb); return NETDEV_TX_OK; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void ena_netpoll(struct net_device *netdev) -{ - struct ena_adapter *adapter = netdev_priv(netdev); - int i; - - /* Dont schedule NAPI if the driver is in the middle of reset - * or netdev is down. - */ - - if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags) || - test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) - return; - - for (i = 0; i < adapter->num_queues; i++) - napi_schedule(&adapter->ena_napi[i].napi); -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - -#ifdef HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#ifdef HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + void *accel_priv, + select_queue_fallback_t fallback) #elif defined HAVE_NDO_SELECT_QUEUE_ACCEL /* Return subqueue id on this core (one per core). 
*/ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, @@ -2376,13 +2448,15 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) if (skb_rx_queue_recorded(skb)) qid = skb_get_rx_queue(skb); else -#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK) +#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2) + qid = fallback(dev, skb, NULL); +#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1) qid = fallback(dev, skb); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) qid = __netdev_pick_tx(dev, skb); #else qid = skb_tx_hash(dev, skb); -#endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK */ +#endif /* HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 */ return qid; } @@ -2409,7 +2483,7 @@ static void ena_set_rx_mode(struct net_device *netdev) } #endif /* HAVE_SET_RX_MODE */ -static void ena_config_host_info(struct ena_com_dev *ena_dev) +static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev) { struct ena_admin_host_info *host_info; int rc; @@ -2423,6 +2497,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev) host_info = ena_dev->host_attr.host_info; + host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; host_info->os_type = ENA_ADMIN_OS_LINUX; host_info->kernel_ver = LINUX_VERSION_CODE; strncpy(host_info->kernel_ver_str, utsname()->version, @@ -2433,7 +2508,9 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev) host_info->driver_version = (DRV_MODULE_VER_MAJOR) | (DRV_MODULE_VER_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | - (DRV_MODULE_VER_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT); + (DRV_MODULE_VER_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) | + ("g"[0] << ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT); + host_info->num_cpus = num_online_cpus(); rc = ena_com_set_host_attributes(ena_dev); if (rc) { @@ -2488,8 +2565,16 @@ static void ena_config_debug_area(struct ena_adapter *adapter) ena_com_delete_debug_area(adapter->ena_dev); } +static void ena_extra_properties_strings_destroy(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ena_com_delete_extra_properties_strings(adapter->ena_dev); + adapter->ena_extra_properties_count = 0; +} + #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0)) +#ifdef NDO_GET_STATS_64_V2 static void ena_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) #else @@ -2504,7 +2589,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, int i; if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0)) +#ifdef NDO_GET_STATS_64_V2 return; #else return NULL; @@ -2555,7 +2640,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, stats->rx_errors = 0; stats->tx_errors = 0; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)) +#ifndef NDO_GET_STATS_64_V2 return stats; #endif } @@ -2663,12 +2748,9 @@ static const struct net_device_ops ena_netdev_ops = { .ndo_set_rx_mode = ena_set_rx_mode, #endif .ndo_validate_addr = eth_validate_addr, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ena_netpoll, #if ENA_BUSY_POLL_SUPPORT .ndo_busy_poll = ena_busy_poll, #endif -#endif /* CONFIG_NET_POLL_CONTROLLER */ }; static int ena_device_validate_params(struct ena_adapter *adapter, @@ -2685,13 +2767,6 @@ static int ena_device_validate_params(struct ena_adapter *adapter, return -EINVAL; } - if ((get_feat_ctx->max_queues.max_cq_num < adapter->num_queues) || - (get_feat_ctx->max_queues.max_sq_num < 
adapter->num_queues)) { - netif_err(adapter, drv, netdev, - "Error, device doesn't support enough queues\n"); - return -EINVAL; - } - if (get_feat_ctx->dev_attr.max_mtu < netdev->mtu) { netif_err(adapter, drv, netdev, "Error, device max mtu is smaller than netdev MTU\n"); @@ -2756,7 +2831,7 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, } /* ENA admin level init */ - rc = ena_com_admin_init(ena_dev, &aenq_handlers, true); + rc = ena_com_admin_init(ena_dev, &aenq_handlers); if (rc) { dev_err(dev, "Can not initialize ena admin queue with device\n"); @@ -2769,7 +2844,7 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, */ ena_com_set_admin_polling_mode(ena_dev, true); - ena_config_host_info(ena_dev); + ena_config_host_info(ena_dev, pdev); /* Get Device Attributes*/ rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx); @@ -2839,29 +2914,30 @@ static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter, return rc; } -static void ena_destroy_device(struct ena_adapter *adapter) +static void ena_destroy_device(struct ena_adapter *adapter, bool graceful) { struct net_device *netdev = adapter->netdev; struct ena_com_dev *ena_dev = adapter->ena_dev; bool dev_up; + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + return; + netif_carrier_off(netdev); del_timer_sync(&adapter->timer_service); dev_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); adapter->dev_up_before_reset = dev_up; - ena_sysfs_terminate(&adapter->pdev->dev); - ena_com_set_admin_running_state(ena_dev, false); + if (!graceful) + ena_com_set_admin_running_state(ena_dev, false); if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) ena_down(adapter); - /* Before releasing the ENA resources, a device reset is required. - * (to prevent the device from accessing them). - * In case the reset flag is set and the device is up, ena_down() - * already perform the reset, so it can be skipped. + /* Stop the device from sending AENQ events (in case reset flag is set + * and device is up, ena_down() already reset the device. 
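+	 * When that reset has not happened, reset here so the device stops
+	 * accessing the resources that are about to be freed.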
*/ if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); @@ -2881,12 +2957,67 @@ static void ena_destroy_device(struct ena_adapter *adapter) adapter->reset_reason = ENA_REGS_RESET_NORMAL; clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); +} + +static int ena_handle_updated_queues(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct pci_dev *pdev = adapter->pdev; + struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 }; + bool are_queues_changed = false; + int io_queue_num, rc; + + calc_queue_ctx.ena_dev = ena_dev; + calc_queue_ctx.get_feat_ctx = get_feat_ctx; + calc_queue_ctx.pdev = pdev; + + io_queue_num = ena_calc_io_queue_num(pdev, ena_dev, get_feat_ctx); + rc = ena_calc_queue_size(&calc_queue_ctx); + if (unlikely(rc || io_queue_num <= 0)) + return -EFAULT; + + if (unlikely(adapter->tx_ring_size > calc_queue_ctx.tx_queue_size || + adapter->rx_ring_size > calc_queue_ctx.rx_queue_size)) { + dev_err(&pdev->dev, + "Not enough resources to allocate requested queue sizes (TX,RX)=(%d,%d), falling back to queue sizes (TX,RX)=(%d,%d)\n", + adapter->tx_ring_size, + adapter->rx_ring_size, + calc_queue_ctx.tx_queue_size, + calc_queue_ctx.rx_queue_size); + adapter->tx_ring_size = calc_queue_ctx.tx_queue_size; + adapter->rx_ring_size = calc_queue_ctx.rx_queue_size; + adapter->max_tx_sgl_size = calc_queue_ctx.max_tx_sgl_size; + adapter->max_rx_sgl_size = calc_queue_ctx.max_rx_sgl_size; + are_queues_changed = true; + } + + if (unlikely(adapter->num_queues > io_queue_num)) { + dev_err(&pdev->dev, + "Not enough resources to allocate %d queues, falling back to %d queues\n", + adapter->num_queues, io_queue_num); + adapter->num_queues = io_queue_num; + ena_com_rss_destroy(ena_dev); + rc = ena_rss_init_default(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + dev_err(&pdev->dev, "Cannot init RSS rc: %d\n", rc); + return rc; + } + are_queues_changed = true; + } + + if (unlikely(are_queues_changed)) + ena_init_io_rings(adapter); + + return 0; } static int ena_restore_device(struct ena_adapter *adapter) { struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; bool wd_state; int rc; @@ -2905,10 +3036,14 @@ static int ena_restore_device(struct ena_adapter *adapter) goto err_device_destroy; } + rc = ena_handle_updated_queues(adapter, &get_feat_ctx); + if (rc) + goto err_device_destroy; + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); /* Make sure we don't have a race with AENQ Links state handler */ if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) - netif_carrier_on(adapter->netdev); + netif_carrier_on(netdev); rc = ena_enable_msix_and_set_admin_interrupts(adapter, adapter->num_queues); @@ -2923,15 +3058,18 @@ static int ena_restore_device(struct ena_adapter *adapter) } /* If the interface was up before the reset bring it up */ if (adapter->dev_up_before_reset) { - rc = ena_up(adapter); + rc = ena_open(netdev); if (rc) { dev_err(&pdev->dev, "Failed to create I/O queues\n"); goto err_sysfs_terminate; } } + set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); - dev_err(&pdev->dev, "Device reset completed successfully\n"); + dev_err(&pdev->dev, + "Device reset completed successfully, Driver info: 
%s\n", + version); return rc; err_sysfs_terminate: @@ -2940,7 +3078,11 @@ static int ena_restore_device(struct ena_adapter *adapter) ena_free_mgmnt_irq(adapter); ena_disable_msix(adapter); err_device_destroy: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); ena_com_admin_destroy(ena_dev); + ena_com_dev_reset(ena_dev, ENA_REGS_RESET_DRIVER_INVALID_STATE); + ena_com_mmio_reg_read_request_destroy(ena_dev); err: clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); @@ -2962,7 +3104,7 @@ static void ena_fw_reset_device(struct work_struct *work) return; } rtnl_lock(); - ena_destroy_device(adapter); + ena_destroy_device(adapter, false); ena_restore_device(adapter); rtnl_unlock(); } @@ -2979,19 +3121,20 @@ static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, rx_ring->no_interrupt_event_cnt++; if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) { - netif_err(adapter, rx_err, adapter->netdev, - "Potential MSIX issue on Rx side Queue = %d. Reset the device\n", - rx_ring->qid); - adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); - return -EIO; + netif_err(adapter, rx_err, adapter->netdev, + "Potential MSIX issue on Rx side Queue = %d. Reset the device\n", + rx_ring->qid); + adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; + smp_mb__before_atomic(); + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + return -EIO; } return 0; } static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, - struct ena_ring *tx_ring) + struct ena_ring *tx_ring) { struct ena_tx_buffer *tx_buf; unsigned long last_jiffies; @@ -3007,14 +3150,17 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, continue; if (unlikely(!tx_ring->first_interrupt && time_is_before_jiffies(last_jiffies + - 2 * adapter->missing_tx_completion_to))) { - /* If after graceful period interrupt is still not received, we schedule a reset*/ - netif_err(adapter, tx_err, adapter->netdev, - "Potential MSIX issue on Tx side Queue = %d. Reset the device\n", - tx_ring->qid); - adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); - return -EIO; + 2 * adapter->missing_tx_completion_to))) { + /* If after graceful period interrupt is still not + * received, we schedule a reset + */ + netif_err(adapter, tx_err, adapter->netdev, + "Potential MSIX issue on Tx side Queue = %d. 
Reset the device\n", + tx_ring->qid); + adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; + smp_mb__before_atomic(); + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + return -EIO; } if (unlikely(time_is_before_jiffies(last_jiffies + @@ -3117,8 +3263,7 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) for (i = 0; i < adapter->num_queues; i++) { rx_ring = &adapter->rx_ring[i]; - refill_required = - ena_com_sq_empty_space(rx_ring->ena_com_io_sq); + refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq); if (unlikely(refill_required == (rx_ring->ring_size - 1))) { rx_ring->empty_rx_queue++; @@ -3223,9 +3368,15 @@ static void ena_update_host_info(struct ena_admin_host_info *host_info, (netdev->features & GENMASK_ULL(63, 32)) >> 32; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +static void ena_timer_service(struct timer_list *t) +{ + struct ena_adapter *adapter = from_timer(adapter, t, timer_service); +#else static void ena_timer_service(unsigned long data) { struct ena_adapter *adapter = (struct ena_adapter *)data; +#endif u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr; struct ena_admin_host_info *host_info = adapter->ena_dev->host_attr.host_info; @@ -3260,28 +3411,32 @@ static int ena_calc_io_queue_num(struct pci_dev *pdev, struct ena_com_dev *ena_dev, struct ena_com_dev_get_features_ctx *get_feat_ctx) { - int io_sq_num, io_queue_num; - - /* In case of LLQ use the llq number in the get feature cmd */ - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - io_sq_num = get_feat_ctx->max_queues.max_llq_num; + int io_tx_sq_num, io_tx_cq_num, io_rx_num, io_queue_num; - if (io_sq_num == 0) { - dev_err(&pdev->dev, - "Trying to use LLQ but llq_num is 0. Fall back into regular queues\n"); + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &get_feat_ctx->max_queue_ext.max_queue_ext; + io_rx_num = min_t(int, max_queue_ext->max_rx_sq_num, + max_queue_ext->max_rx_cq_num); - ena_dev->tx_mem_queue_type = - ENA_ADMIN_PLACEMENT_POLICY_HOST; - io_sq_num = get_feat_ctx->max_queues.max_sq_num; - } + io_tx_sq_num = max_queue_ext->max_tx_sq_num; + io_tx_cq_num = max_queue_ext->max_tx_cq_num; } else { - io_sq_num = get_feat_ctx->max_queues.max_sq_num; + struct ena_admin_queue_feature_desc *max_queues = + &get_feat_ctx->max_queues; + io_tx_sq_num = max_queues->max_sq_num; + io_tx_cq_num = max_queues->max_cq_num; + io_rx_num = min_t(int, io_tx_sq_num, io_tx_cq_num); } + /* In case of LLQ use the llq fields for the tx SQ/CQ */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + io_tx_sq_num = get_feat_ctx->llq.max_llq_num; + io_queue_num = min_t(int, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); - io_queue_num = min_t(int, io_queue_num, io_sq_num); - io_queue_num = min_t(int, io_queue_num, - get_feat_ctx->max_queues.max_cq_num); + io_queue_num = min_t(int, io_queue_num, io_rx_num); + io_queue_num = min_t(int, io_queue_num, io_tx_sq_num); + io_queue_num = min_t(int, io_queue_num, io_tx_cq_num); /* 1 IRQ for for mgmnt and 1 IRQs for each IO direction */ io_queue_num = min_t(int, io_queue_num, pci_msix_vec_count(pdev) - 1); if (unlikely(!io_queue_num)) { @@ -3292,18 +3447,52 @@ static int ena_calc_io_queue_num(struct pci_dev *pdev, return io_queue_num; } -static void ena_set_push_mode(struct pci_dev *pdev, struct ena_com_dev *ena_dev, - struct ena_com_dev_get_features_ctx *get_feat_ctx) +static int ena_set_queues_placement_policy(struct pci_dev *pdev, + 
struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq, + struct ena_llq_configurations *llq_default_configurations) { bool has_mem_bar; + int rc; + u32 llq_feature_mask; + + llq_feature_mask = 1 << ENA_ADMIN_LLQ; + if (!(ena_dev->supported_features & llq_feature_mask)) { + dev_err(&pdev->dev, + "LLQ is not supported Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } has_mem_bar = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(ENA_MEM_BAR); - /* Enable push mode if device supports LLQ */ - if (has_mem_bar && (get_feat_ctx->max_queues.max_llq_num > 0)) - ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_DEV; - else + rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); + if (unlikely(rc)) { + dev_err(&pdev->dev, + "Failed to configure the device mode. Fallback to host mode policy.\n"); ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + /* Nothing to config, exit */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return 0; + + if (!has_mem_bar) { + dev_err(&pdev->dev, + "ENA device does not expose LLQ bar. Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, + pci_resource_start(pdev, ENA_MEM_BAR), + pci_resource_len(pdev, ENA_MEM_BAR)); + + if (!ena_dev->mem_bar) + return -EFAULT; + + return 0; } static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, @@ -3430,48 +3619,82 @@ static int ena_rss_init_default(struct ena_adapter *adapter) static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) { - int release_bars; - - if (ena_dev->mem_bar) - devm_iounmap(&pdev->dev, ena_dev->mem_bar); - - if (ena_dev->reg_bar) - devm_iounmap(&pdev->dev, ena_dev->reg_bar); + int release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; - release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK; pci_release_selected_regions(pdev, release_bars); } -static int ena_calc_queue_size(struct pci_dev *pdev, - struct ena_com_dev *ena_dev, - u16 *max_tx_sgl_size, - u16 *max_rx_sgl_size, - struct ena_com_dev_get_features_ctx *get_feat_ctx) +static inline void set_default_llq_configurations(struct ena_llq_configurations *llq_config) { - u32 queue_size = ENA_DEFAULT_RING_SIZE; + llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + llq_config->llq_ring_entry_size_value = 128; +} - queue_size = min_t(u32, queue_size, - get_feat_ctx->max_queues.max_cq_depth); - queue_size = min_t(u32, queue_size, - get_feat_ctx->max_queues.max_sq_depth); +static int ena_calc_queue_size(struct ena_calc_queue_size_ctx *ctx) +{ + struct ena_admin_feature_llq_desc *llq = &ctx->get_feat_ctx->llq; + struct ena_com_dev *ena_dev = ctx->ena_dev; + u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; + + if (ctx->ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + struct ena_admin_queue_ext_feature_fields *max_queue_ext = + &ctx->get_feat_ctx->max_queue_ext.max_queue_ext; + rx_queue_size = min_t(u32, rx_queue_size, + max_queue_ext->max_rx_cq_depth); + rx_queue_size = min_t(u32, rx_queue_size, + max_queue_ext->max_rx_sq_depth); + tx_queue_size = min_t(u32, tx_queue_size, + 
max_queue_ext->max_tx_cq_depth); + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + tx_queue_size = min_t(u32, tx_queue_size, + llq->max_llq_depth); + else + tx_queue_size = min_t(u32, tx_queue_size, + max_queue_ext->max_tx_sq_depth); - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) - queue_size = min_t(u32, queue_size, - get_feat_ctx->max_queues.max_llq_depth); + ctx->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_rx_descs); + ctx->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_tx_descs); + } else { + struct ena_admin_queue_feature_desc *max_queues = + &ctx->get_feat_ctx->max_queues; + rx_queue_size = min_t(u32, rx_queue_size, + max_queues->max_cq_depth); + rx_queue_size = min_t(u32, rx_queue_size, + max_queues->max_sq_depth); + tx_queue_size = min_t(u32, tx_queue_size, + max_queues->max_cq_depth); + + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + tx_queue_size = min_t(u32, tx_queue_size, + llq->max_llq_depth); + else + tx_queue_size = min_t(u32, tx_queue_size, + max_queues->max_sq_depth); + + ctx->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_tx_descs); + ctx->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queues->max_packet_rx_descs); + } - queue_size = rounddown_pow_of_two(queue_size); + tx_queue_size = rounddown_pow_of_two(tx_queue_size); + rx_queue_size = rounddown_pow_of_two(rx_queue_size); - if (unlikely(!queue_size)) { - dev_err(&pdev->dev, "Invalid queue size\n"); + if (unlikely(!rx_queue_size || !tx_queue_size)) { + dev_err(&ctx->pdev->dev, "Invalid queue size\n"); return -EFAULT; } - *max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - get_feat_ctx->max_queues.max_packet_tx_descs); - *max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - get_feat_ctx->max_queues.max_packet_rx_descs); + ctx->rx_queue_size = rx_queue_size; + ctx->tx_queue_size = tx_queue_size; - return queue_size; + return 0; } /* ena_probe - Device Initialization Routine @@ -3487,15 +3710,15 @@ static int ena_calc_queue_size(struct pci_dev *pdev, static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct ena_com_dev_get_features_ctx get_feat_ctx; + struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 }; static int version_printed; struct net_device *netdev; struct ena_adapter *adapter; + struct ena_llq_configurations llq_config; struct ena_com_dev *ena_dev = NULL; + char *queue_type_str; static int adapters_found; int io_queue_num, bars, rc; - int queue_size; - u16 tx_sgl_size = 0; - u16 rx_sgl_size = 0; bool wd_state; dev_dbg(&pdev->dev, "%s\n", __func__); @@ -3544,32 +3767,36 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free_region; } - ena_set_push_mode(pdev, ena_dev, &get_feat_ctx); + set_default_llq_configurations(&llq_config); - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, - pci_resource_start(pdev, ENA_MEM_BAR), - pci_resource_len(pdev, ENA_MEM_BAR)); - if (!ena_dev->mem_bar) { - rc = -EFAULT; - goto err_device_destroy; - } + rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx.llq, + &llq_config); + if (rc) { + dev_err(&pdev->dev, "ena device init failed\n"); + goto err_device_destroy; } + calc_queue_ctx.ena_dev = ena_dev; + calc_queue_ctx.get_feat_ctx = &get_feat_ctx; + calc_queue_ctx.pdev = pdev; + /* initial Tx interrupt delay, Assumes 1 usec granularity. 
* Updated during device initialization with the real granularity */ ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; io_queue_num = ena_calc_io_queue_num(pdev, ena_dev, &get_feat_ctx); - queue_size = ena_calc_queue_size(pdev, ena_dev, &tx_sgl_size, - &rx_sgl_size, &get_feat_ctx); - if ((queue_size <= 0) || (io_queue_num <= 0)) { + rc = ena_calc_queue_size(&calc_queue_ctx); + if (rc || io_queue_num <= 0) { rc = -EFAULT; goto err_device_destroy; } - dev_info(&pdev->dev, "creating %d io queues. queue size: %d\n", - io_queue_num, queue_size); + dev_info(&pdev->dev, "creating %d io queues. rx queue size: %d tx queue size. %d LLQ is %s\n", + io_queue_num, + calc_queue_ctx.rx_queue_size, + calc_queue_ctx.tx_queue_size, + (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) ? + "ENABLED" : "DISABLED"); /* dev zeroed in init_etherdev */ netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), io_queue_num); @@ -3593,11 +3820,10 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); adapter->reset_reason = ENA_REGS_RESET_NORMAL; - adapter->tx_ring_size = queue_size; - adapter->rx_ring_size = queue_size; - - adapter->max_tx_sgl_size = tx_sgl_size; - adapter->max_rx_sgl_size = rx_sgl_size; + adapter->tx_ring_size = calc_queue_ctx.tx_queue_size; + adapter->rx_ring_size = calc_queue_ctx.rx_queue_size; + adapter->max_tx_sgl_size = calc_queue_ctx.max_tx_sgl_size; + adapter->max_rx_sgl_size = calc_queue_ctx.max_rx_sgl_size; adapter->num_queues = io_queue_num; adapter->last_monitored_tx_qid = 0; @@ -3647,16 +3873,19 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_config_debug_area(adapter); + adapter->ena_extra_properties_count = + ena_com_extra_properties_strings_init(ena_dev); + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); + netif_carrier_off(netdev); + rc = register_netdev(netdev); if (rc) { dev_err(&pdev->dev, "Cannot register net device\n"); goto err_rss; } - netif_carrier_off(netdev); - INIT_WORK(&adapter->reset_task, ena_fw_reset_device); adapter->last_keep_alive_jiffies = jiffies; @@ -3666,13 +3895,23 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_update_hints(adapter, &get_feat_ctx.hw_hints); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + timer_setup(&adapter->timer_service, ena_timer_service, 0); +#else setup_timer(&adapter->timer_service, ena_timer_service, (unsigned long)adapter); +#endif mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); - dev_info(&pdev->dev, "%s found at mem %lx, mac addr %pM Queues %d\n", + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + queue_type_str = "Regular"; + else + queue_type_str = "Low Latency"; + + dev_info(&pdev->dev, + "%s found at mem %lx, mac addr %pM Queues %d, Placement policy: %s\n", DEVICE_NAME, (long)pci_resource_start(pdev, 0), - netdev->dev_addr, io_queue_num); + netdev->dev_addr, io_queue_num, queue_type_str); set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); @@ -3681,12 +3920,15 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_rss: + ena_extra_properties_strings_destroy(netdev); ena_com_delete_debug_area(ena_dev); ena_com_rss_destroy(ena_dev); err_terminate_sysfs: ena_sysfs_terminate(&pdev->dev); err_free_msix: ena_com_dev_reset(ena_dev, ENA_REGS_RESET_INIT_ERR); + /* stop submitting admin commands on a device that was reset */ + 
ena_com_set_admin_running_state(ena_dev, false); ena_free_mgmnt_irq(adapter); ena_disable_msix(adapter); err_worker_destroy: @@ -3706,34 +3948,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return rc; } -/*****************************************************************************/ -#ifdef HAVE_SRIOV_CONFIGURE -static int ena_sriov_configure(struct pci_dev *dev, int numvfs) -{ - int rc; - - if (numvfs > 0) { - rc = pci_enable_sriov(dev, numvfs); - if (rc != 0) { - dev_err(&dev->dev, - "pci_enable_sriov failed to enable: %d vfs with the error: %d\n", - numvfs, rc); - return rc; - } - - return numvfs; - } - - if (numvfs == 0) { - pci_disable_sriov(dev); - return 0; - } - - return -EINVAL; -} -#endif /* HAVE_SRIOV_CONFIGURE */ - -/*****************************************************************************/ /*****************************************************************************/ /* ena_remove - Device Removal Routine @@ -3757,37 +3971,26 @@ static void ena_remove(struct pci_dev *pdev) netdev->rx_cpu_rmap = NULL; } #endif /* CONFIG_RFS_ACCEL */ - - unregister_netdev(netdev); - ena_sysfs_terminate(&pdev->dev); del_timer_sync(&adapter->timer_service); cancel_work_sync(&adapter->reset_task); - /* Reset the device only if the device is running. */ - if (test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) - ena_com_dev_reset(ena_dev, adapter->reset_reason); - - ena_free_mgmnt_irq(adapter); + rtnl_lock(); + ena_destroy_device(adapter, true); + rtnl_unlock(); - ena_disable_msix(adapter); + unregister_netdev(netdev); free_netdev(netdev); - ena_com_mmio_reg_read_request_destroy(ena_dev); - - ena_com_abort_admin_commands(ena_dev); - - ena_com_wait_for_abort_completion(ena_dev); - - ena_com_admin_destroy(ena_dev); - ena_com_rss_destroy(ena_dev); ena_com_delete_debug_area(ena_dev); ena_com_delete_host_info(ena_dev); + ena_extra_properties_strings_destroy(netdev); + ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); @@ -3816,7 +4019,7 @@ static int ena_suspend(struct pci_dev *pdev, pm_message_t state) "ignoring device reset request as the device is being suspended\n"); clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } - ena_destroy_device(adapter); + ena_destroy_device(adapter, true); rtnl_unlock(); return 0; } @@ -3850,9 +4053,9 @@ static struct pci_driver ena_pci_driver = { .suspend = ena_suspend, .resume = ena_resume, #endif -#ifdef HAVE_SRIOV_CONFIGURE - .sriov_configure = ena_sriov_configure, -#endif /* HAVE_SRIOV_CONFIGURE */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + .sriov_configure = pci_sriov_configure_simple, +#endif }; static int __init ena_init(void) diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h old mode 100644 new mode 100755 index e806e05580df0..077f73455bfa9 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -45,9 +45,9 @@ #include "ena_com.h" #include "ena_eth_com.h" -#define DRV_MODULE_VER_MAJOR 1 -#define DRV_MODULE_VER_MINOR 5 -#define DRV_MODULE_VER_SUBMINOR 0 +#define DRV_MODULE_VER_MAJOR 2 +#define DRV_MODULE_VER_MINOR 0 +#define DRV_MODULE_VER_SUBMINOR 2 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_VERSION @@ -63,6 +63,17 @@ #define ENA_ADMIN_MSIX_VEC 1 #define ENA_MAX_MSIX_VEC(io_queues) (ENA_ADMIN_MSIX_VEC + (io_queues)) +/* The ENA buffer length fields is 16 bit long. So when PAGE_SIZE == 64kB the + * driver passes 0. + * Since the max packet size the ENA handles is ~9kB limit the buffer length to + * 16kB. 
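+ * (a 64kB page is 0x10000 bytes, which wraps to 0 in a 16 bit length
+ * field, hence the cap)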
+ */ +#if PAGE_SIZE > SZ_16K +#define ENA_PAGE_SIZE SZ_16K +#else +#define ENA_PAGE_SIZE PAGE_SIZE +#endif + #define ENA_MIN_MSIX_VEC 2 #define ENA_REG_BAR 0 @@ -72,7 +83,7 @@ #define ENA_DEFAULT_RING_SIZE (1024) #define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) -#define ENA_DEFAULT_RX_COPYBREAK (128 - NET_IP_ALIGN) +#define ENA_DEFAULT_RX_COPYBREAK (256 - NET_IP_ALIGN) /* limit the buffer size to 600 bytes to handle MTU changes from very * small to very large, in which case the number of buffers per packet @@ -97,10 +108,11 @@ */ #define ENA_TX_POLL_BUDGET_DIVIDER 4 -/* Refill Rx queue when number of available descriptors is below - * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER +/* Refill Rx queue when number of required descriptors is above + * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET */ #define ENA_RX_REFILL_THRESH_DIVIDER 8 +#define ENA_RX_REFILL_THRESH_PACKET 256 /* Number of queues to check for missing queues per timer service */ #define ENA_MONITORED_TX_QUEUES 4 @@ -147,6 +159,16 @@ struct ena_napi { u32 qid; }; +struct ena_calc_queue_size_ctx { + struct ena_com_dev_get_features_ctx *get_feat_ctx; + struct ena_com_dev *ena_dev; + struct pci_dev *pdev; + u16 rx_queue_size; + u16 tx_queue_size; + u16 max_tx_sgl_size; + u16 max_rx_sgl_size; +}; + struct ena_tx_buffer { struct sk_buff *skb; /* num of ena desc for this specific skb @@ -156,6 +178,9 @@ struct ena_tx_buffer { /* num of buffers used by this skb */ u32 num_of_bufs; + /* Indicate if bufs[0] map the linear data of the skb. */ + u8 map_linear_data; + /* Used for detect missing tx packets to limit the number of prints */ u32 print_once; /* Save the last jiffies to detect missing tx packets @@ -191,6 +216,7 @@ struct ena_stats_tx { u64 tx_poll; u64 doorbells; u64 bad_req_id; + u64 llq_buffer_copy; u64 missed_tx; }; @@ -211,6 +237,7 @@ struct ena_stats_rx { #endif u64 bad_req_id; u64 empty_rx_ring; + u64 csum_unchecked; }; struct ena_ring { @@ -267,6 +294,8 @@ struct ena_ring { struct ena_stats_tx tx_stats; struct ena_stats_rx rx_stats; }; + + u8 *push_buf_intermediate_buf; int empty_rx_queue; #if ENA_BUSY_POLL_SUPPORT atomic_t bp_state; @@ -369,6 +398,8 @@ struct ena_adapter { u32 last_monitored_tx_qid; enum ena_regs_reset_reason_types reset_reason; + + u8 ena_extra_properties_count; }; void ena_set_ethtool_ops(struct net_device *netdev); diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h old mode 100644 new mode 100755 diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h old mode 100644 new mode 100755 index 48ca97fbe7bc6..59bd75534a627 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2015 - 2016 Amazon.com, Inc. or its affiliates. + * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,137 +34,125 @@ #define _ENA_REGS_H_ enum ena_regs_reset_reason_types { - ENA_REGS_RESET_NORMAL = 0, - - ENA_REGS_RESET_KEEP_ALIVE_TO = 1, - - ENA_REGS_RESET_ADMIN_TO = 2, - - ENA_REGS_RESET_MISS_TX_CMPL = 3, - - ENA_REGS_RESET_INV_RX_REQ_ID = 4, - - ENA_REGS_RESET_INV_TX_REQ_ID = 5, - - ENA_REGS_RESET_TOO_MANY_RX_DESCS = 6, - - ENA_REGS_RESET_INIT_ERR = 7, - - ENA_REGS_RESET_DRIVER_INVALID_STATE = 8, - - ENA_REGS_RESET_OS_TRIGGER = 9, - - ENA_REGS_RESET_OS_NETDEV_WD = 10, - - ENA_REGS_RESET_SHUTDOWN = 11, - - ENA_REGS_RESET_USER_TRIGGER = 12, - - ENA_REGS_RESET_GENERIC = 13, - - ENA_REGS_RESET_MISS_INTERRUPT = 14, + ENA_REGS_RESET_NORMAL = 0, + ENA_REGS_RESET_KEEP_ALIVE_TO = 1, + ENA_REGS_RESET_ADMIN_TO = 2, + ENA_REGS_RESET_MISS_TX_CMPL = 3, + ENA_REGS_RESET_INV_RX_REQ_ID = 4, + ENA_REGS_RESET_INV_TX_REQ_ID = 5, + ENA_REGS_RESET_TOO_MANY_RX_DESCS = 6, + ENA_REGS_RESET_INIT_ERR = 7, + ENA_REGS_RESET_DRIVER_INVALID_STATE = 8, + ENA_REGS_RESET_OS_TRIGGER = 9, + ENA_REGS_RESET_OS_NETDEV_WD = 10, + ENA_REGS_RESET_SHUTDOWN = 11, + ENA_REGS_RESET_USER_TRIGGER = 12, + ENA_REGS_RESET_GENERIC = 13, + ENA_REGS_RESET_MISS_INTERRUPT = 14, }; /* ena_registers offsets */ -#define ENA_REGS_VERSION_OFF 0x0 -#define ENA_REGS_CONTROLLER_VERSION_OFF 0x4 -#define ENA_REGS_CAPS_OFF 0x8 -#define ENA_REGS_CAPS_EXT_OFF 0xc -#define ENA_REGS_AQ_BASE_LO_OFF 0x10 -#define ENA_REGS_AQ_BASE_HI_OFF 0x14 -#define ENA_REGS_AQ_CAPS_OFF 0x18 -#define ENA_REGS_ACQ_BASE_LO_OFF 0x20 -#define ENA_REGS_ACQ_BASE_HI_OFF 0x24 -#define ENA_REGS_ACQ_CAPS_OFF 0x28 -#define ENA_REGS_AQ_DB_OFF 0x2c -#define ENA_REGS_ACQ_TAIL_OFF 0x30 -#define ENA_REGS_AENQ_CAPS_OFF 0x34 -#define ENA_REGS_AENQ_BASE_LO_OFF 0x38 -#define ENA_REGS_AENQ_BASE_HI_OFF 0x3c -#define ENA_REGS_AENQ_HEAD_DB_OFF 0x40 -#define ENA_REGS_AENQ_TAIL_OFF 0x44 -#define ENA_REGS_INTR_MASK_OFF 0x4c -#define ENA_REGS_DEV_CTL_OFF 0x54 -#define ENA_REGS_DEV_STS_OFF 0x58 -#define ENA_REGS_MMIO_REG_READ_OFF 0x5c -#define ENA_REGS_MMIO_RESP_LO_OFF 0x60 -#define ENA_REGS_MMIO_RESP_HI_OFF 0x64 -#define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 + +/* 0 base */ +#define ENA_REGS_VERSION_OFF 0x0 +#define ENA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define ENA_REGS_CAPS_OFF 0x8 +#define ENA_REGS_CAPS_EXT_OFF 0xc +#define ENA_REGS_AQ_BASE_LO_OFF 0x10 +#define ENA_REGS_AQ_BASE_HI_OFF 0x14 +#define ENA_REGS_AQ_CAPS_OFF 0x18 +#define ENA_REGS_ACQ_BASE_LO_OFF 0x20 +#define ENA_REGS_ACQ_BASE_HI_OFF 0x24 +#define ENA_REGS_ACQ_CAPS_OFF 0x28 +#define ENA_REGS_AQ_DB_OFF 0x2c +#define ENA_REGS_ACQ_TAIL_OFF 0x30 +#define ENA_REGS_AENQ_CAPS_OFF 0x34 +#define ENA_REGS_AENQ_BASE_LO_OFF 0x38 +#define ENA_REGS_AENQ_BASE_HI_OFF 0x3c +#define ENA_REGS_AENQ_HEAD_DB_OFF 0x40 +#define ENA_REGS_AENQ_TAIL_OFF 0x44 +#define ENA_REGS_INTR_MASK_OFF 0x4c +#define ENA_REGS_DEV_CTL_OFF 0x54 +#define ENA_REGS_DEV_STS_OFF 0x58 +#define ENA_REGS_MMIO_REG_READ_OFF 0x5c +#define ENA_REGS_MMIO_RESP_LO_OFF 0x60 +#define ENA_REGS_MMIO_RESP_HI_OFF 0x64 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 /* version register */ -#define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff -#define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 -#define ENA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 +#define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 +#define ENA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 /* controller_version register */ -#define ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff -#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT 8 
-#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 -#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT 16 -#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 -#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT 24 -#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 +#define ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT 8 +#define ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT 16 +#define ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT 24 +#define ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 /* caps register */ -#define ENA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 -#define ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT 1 -#define ENA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e -#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT 8 -#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 -#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT 16 -#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 +#define ENA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT 1 +#define ENA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT 8 +#define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT 16 +#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 /* aq_caps register */ -#define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff -#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT 16 -#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 +#define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 /* acq_caps register */ -#define ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff -#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT 16 -#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xffff0000 +#define ENA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xffff0000 /* aenq_caps register */ -#define ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff -#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT 16 -#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xffff0000 +#define ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT 16 +#define ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xffff0000 /* dev_ctl register */ -#define ENA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 -#define ENA_REGS_DEV_CTL_AQ_RESTART_SHIFT 1 -#define ENA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 -#define ENA_REGS_DEV_CTL_QUIESCENT_SHIFT 2 -#define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 -#define ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 -#define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 -#define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 -#define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 +#define ENA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_SHIFT 1 +#define ENA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define ENA_REGS_DEV_CTL_QUIESCENT_SHIFT 2 +#define ENA_REGS_DEV_CTL_QUIESCENT_MASK 0x4 +#define ENA_REGS_DEV_CTL_IO_RESUME_SHIFT 3 +#define ENA_REGS_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENA_REGS_DEV_CTL_RESET_REASON_SHIFT 28 +#define ENA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 /* dev_sts register */ -#define ENA_REGS_DEV_STS_READY_MASK 0x1 -#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 -#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 -#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 -#define 
ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 -#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 -#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 -#define ENA_REGS_DEV_STS_RESET_FINISHED_SHIFT 4 -#define ENA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 -#define ENA_REGS_DEV_STS_FATAL_ERROR_SHIFT 5 -#define ENA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 -#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 -#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 -#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 -#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 +#define ENA_REGS_DEV_STS_READY_MASK 0x1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 +#define ENA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 +#define ENA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 +#define ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define ENA_REGS_DEV_STS_RESET_FINISHED_SHIFT 4 +#define ENA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define ENA_REGS_DEV_STS_FATAL_ERROR_SHIFT 5 +#define ENA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 +#define ENA_REGS_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 /* mmio_reg_read register */ -#define ENA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff -#define ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT 16 -#define ENA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 +#define ENA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define ENA_REGS_MMIO_REG_READ_REG_OFF_SHIFT 16 +#define ENA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 /* rss_ind_entry_update register */ -#define ENA_REGS_RSS_IND_ENTRY_UPDATE_INDEX_MASK 0xffff -#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 -#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_INDEX_MASK 0xffff +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 +#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 #endif /*_ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c old mode 100644 new mode 100755 index b8aa5387cb715..bea56370af77e --- a/drivers/amazon/net/ena/ena_sysfs.c +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -46,7 +46,6 @@ struct dev_ext_ena_attribute { #define to_ext_attr(x) container_of(x, struct dev_ext_ena_attribute, attr) -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) static ssize_t ena_store_rx_copybreak(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -85,7 +84,6 @@ static ssize_t ena_show_rx_copybreak(struct device *dev, static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, ena_store_rx_copybreak); -#endif /* kernel version < 3.18 */ /* adaptive interrupt moderation */ @@ -215,10 +213,8 @@ int ena_sysfs_init(struct device *dev) struct ena_adapter *adapter = dev_get_drvdata(dev); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) if (device_create_file(dev, &dev_attr_rx_copybreak)) dev_err(dev, "failed to create rx_copybreak sysfs entry"); -#endif if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) { if (device_create_file(dev, @@ -253,9 +249,7 @@ void ena_sysfs_terminate(struct device *dev) struct ena_adapter *adapter = dev_get_drvdata(dev); int i; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) device_remove_file(dev, &dev_attr_rx_copybreak); -#endif if 
(ena_com_interrupt_moderation_supported(adapter->ena_dev)) { for (i = 0; i < ARRAY_SIZE(dev_attr_intr_moderation); i++) sysfs_remove_file(&dev->kobj, diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h old mode 100644 new mode 100755 diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h old mode 100644 new mode 100755 index a945574f75805..6bdbceb809b26 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -76,18 +76,26 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #include #endif +#ifndef SZ_256 +#define SZ_256 0x0000100 +#endif + #ifndef SZ_4K #define SZ_4K 0x00001000 #endif -#ifndef SZ_256 -#define SZ_256 0x0000100 +#ifndef SZ_16K +#define SZ_16K 0x00004000 #endif #ifdef HAVE_POLL_CONTROLLER #define CONFIG_NET_POLL_CONTROLLER #endif +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + #define ENA_BUSY_POLL_SUPPORT defined(CONFIG_NET_RX_BUSY_POLL) && \ LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) @@ -200,6 +208,12 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #define HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT #endif /* RHEL >= 6.4 && RHEL < 7.0 */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) || \ + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define NDO_GET_STATS_64_V2 +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) || \ (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,5)) #include @@ -331,35 +345,30 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) return index % n_rx_rings; } #endif -#else /* >= 3.8.0 */ -#ifndef HAVE_SRIOV_CONFIGURE -#define HAVE_SRIOV_CONFIGURE -#endif #endif /* >= 3.8.0 */ -/*****************************************************************************/ -#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) ) -#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK -#endif -#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,19,0) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 +#else -/*****************************************************************************/ -#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) ) -#if ( SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) && \ + RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0) && \ + SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0))) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 #endif -#endif /* >= 3.12.0 */ -/*****************************************************************************/ -#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) ) -#if (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24)) -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) +#if UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 #else #define HAVE_NDO_SELECT_QUEUE_ACCEL #endif -#else +#endif /* >= 3.13 */ +#endif /* < 4.19 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) #if BITS_PER_LONG == 32 && defined(CONFIG_SMP) # define u64_stats_init(syncp) seqcount_init(syncp.seq) #else @@ -369,15 +378,22 @@ static inline u32 ethtool_rxfh_indir_default(u32 
index, u32 n_rx_rings) #if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ - || (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) + || (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) && \ + !defined(UEK3_RELEASE) static inline void reinit_completion(struct completion *x) { x->done = 0; } #endif /* SLE 12 */ -#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,0)) && \ - !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) +#endif /* < 3.13.0 */ + +#if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ) && \ + (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0))) \ + && !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))&& \ + !defined(UEK3_RELEASE))) || \ + (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) static inline int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, @@ -404,12 +420,20 @@ static inline int pci_enable_msix_range(struct pci_dev *dev, } #endif -#endif /* >= 3.13.0 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +static inline void *devm_kcalloc(struct device *dev, + size_t n, size_t size, gfp_t flags) +{ + return devm_kzalloc(dev, n * size, flags | __GFP_ZERO); +} +#endif /*****************************************************************************/ #if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,8) ) && \ - !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4)) && \ - !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) + !RHEL_RELEASE_CODE && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) || \ + (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ @@ -426,14 +450,10 @@ static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, #endif /*****************************************************************************/ -#if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ) -/* for ndo_dfwd_ ops add_station, del_station and _start_xmit */ -#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK -#else -#if !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4) \ - && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)) \ - || RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) && \ - !(UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(6,6)) \ + && !(UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) static inline int pci_msix_vec_count(struct pci_dev *dev) { int pos; @@ -461,11 +481,10 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) #endif #if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) || \ - (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE > UBUNTU_VERSION(3,13,0,24))) || \ + (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) || \ (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ - (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4) \ - && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ - || RHEL_RELEASE_CODE < 
RHEL_RELEASE_VERSION(7,0))) + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) \ + && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,1)) #else static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) @@ -480,6 +499,13 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync #endif +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) + +#define smp_mb__before_atomic() smp_mb() + +#endif + /*****************************************************************************/ #if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) #undef GENMASK @@ -489,11 +515,17 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync #endif /*****************************************************************************/ +#ifndef dma_rmb +#define dma_rmb rmb +#endif + +#ifndef writel_relaxed +#define writel_relaxed writel +#endif + #if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ) \ || (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) \ - || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4) \ - && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)) \ - || RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0)) #else static inline void netdev_rss_key_fill(void *buffer, size_t len) { @@ -536,7 +568,8 @@ static inline void napi_complete_done(struct napi_struct *n, int work_done) #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) \ || (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,126)) && \ - (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0)) + (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0)) \ + || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) #else @@ -567,4 +600,9 @@ static inline void __iomem *devm_ioremap_wc(struct device *dev, } #endif +#if RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5) +#define ndo_change_mtu ndo_change_mtu_rh74 +#endif + #endif /* _KCOMPAT_H_ */ From fa0482dba677a4c5ae3f6ab1c1cdf8f4b9c33399 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 31 Jan 2019 21:50:37 +0000 Subject: [PATCH 026/737] xen-netfront: call netif_device_attach on resume When xennet_connect is called in the resume path, it needs to re-attach the netif, otherwise it will no longer be found by various operations (such as ethtool ioctls, etc). Signed-off-by: Frank van der Linden Reviewed-by: Alakesh Haloi Reviewed-by: Vallish Vaidyeshwara --- drivers/net/xen-netfront.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 945d8dd5aaf26..5b001a8ef7534 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2557,6 +2557,13 @@ static int xennet_connect(struct net_device *dev) device_unregister(&np->xbdev->dev); return err; } + } else { + /* + * In the resume / thaw case, the netif needs to be + * reattached, as it was detached in netfront_freeze(). 
+ */ + if (np->freeze_state == NETIF_FREEZE_STATE_FROZEN) + netif_device_attach(dev); } rtnl_lock(); From a350559dda46ec571e994333065a257a8365b6e7 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 12 Feb 2019 18:22:43 +0000 Subject: [PATCH 027/737] net: ena: replace dma_zalloc_coherent with dma_alloc_coherent Commit dfd32cad146e3624970eee9329e99d2c6ef751b3 removed dma_zalloc_coherent, instead making sure that dma_alloc_coherent returns zeroed memory. So, replace all calls to dma_zalloc_coherent in the ena driver with dma_alloc_coherent. --- drivers/amazon/net/ena/ena_com.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 51847d2797061..0c6baa03cfba2 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -111,7 +111,7 @@ static int ena_com_admin_init_sq(struct ena_com_admin_queue *queue) struct ena_com_admin_sq *sq = &queue->sq; u16 size = ADMIN_SQ_SIZE(queue->q_depth); - sq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &sq->dma_addr, + sq->entries = dma_alloc_coherent(queue->q_dmadev, size, &sq->dma_addr, GFP_KERNEL); if (!sq->entries) { @@ -133,7 +133,7 @@ static int ena_com_admin_init_cq(struct ena_com_admin_queue *queue) struct ena_com_admin_cq *cq = &queue->cq; u16 size = ADMIN_CQ_SIZE(queue->q_depth); - cq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &cq->dma_addr, + cq->entries = dma_alloc_coherent(queue->q_dmadev, size, &cq->dma_addr, GFP_KERNEL); if (!cq->entries) { @@ -156,7 +156,7 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *dev, dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); - aenq->entries = dma_zalloc_coherent(dev->dmadev, size, &aenq->dma_addr, + aenq->entries = dma_alloc_coherent(dev->dmadev, size, &aenq->dma_addr, GFP_KERNEL); if (!aenq->entries) { @@ -345,13 +345,13 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, dev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->desc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, + dma_alloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->desc_addr.virt_addr) { io_sq->desc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, + dma_alloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); } @@ -431,12 +431,12 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, prev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); io_cq->cdesc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, + dma_alloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); set_dev_node(ena_dev->dmadev, prev_node); if (!io_cq->cdesc_addr.virt_addr) { io_cq->cdesc_addr.virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, size, + dma_alloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); } @@ -1037,7 +1037,7 @@ static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) struct ena_rss *rss = &ena_dev->rss; rss->hash_key = - dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + dma_alloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), &rss->hash_key_dma_addr, GFP_KERNEL); if (unlikely(!rss->hash_key)) @@ -1061,7 +1061,7 @@ static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) struct ena_rss *rss = &ena_dev->rss; rss->hash_ctrl = - 
dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + dma_alloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), &rss->hash_ctrl_dma_addr, GFP_KERNEL); if (unlikely(!rss->hash_ctrl)) @@ -1105,7 +1105,7 @@ static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, sizeof(struct ena_admin_rss_ind_table_entry); rss->rss_ind_tbl = - dma_zalloc_coherent(ena_dev->dmadev, tbl_size, + dma_alloc_coherent(ena_dev->dmadev, tbl_size, &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); if (unlikely(!rss->rss_ind_tbl)) goto mem_err1; @@ -1660,7 +1660,7 @@ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) spin_lock_init(&mmio_read->lock); mmio_read->read_resp = - dma_zalloc_coherent(ena_dev->dmadev, + dma_alloc_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), &mmio_read->read_resp_dma_addr, GFP_KERNEL); if (unlikely(!mmio_read->read_resp)) @@ -1892,7 +1892,7 @@ int ena_com_extra_properties_strings_init(struct ena_com_dev *ena_dev) ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN; extra_properties_strings->virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, + dma_alloc_coherent(ena_dev->dmadev, extra_properties_strings->size, &extra_properties_strings->dma_addr, GFP_KERNEL); @@ -2717,7 +2717,7 @@ int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) struct ena_host_attribute *host_attr = &ena_dev->host_attr; host_attr->host_info = - dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, + dma_alloc_coherent(ena_dev->dmadev, SZ_4K, &host_attr->host_info_dma_addr, GFP_KERNEL); if (unlikely(!host_attr->host_info)) return -ENOMEM; @@ -2735,7 +2735,7 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, struct ena_host_attribute *host_attr = &ena_dev->host_attr; host_attr->debug_area_virt_addr = - dma_zalloc_coherent(ena_dev->dmadev, debug_area_size, + dma_alloc_coherent(ena_dev->dmadev, debug_area_size, &host_attr->debug_area_dma_addr, GFP_KERNEL); if (unlikely(!host_attr->debug_area_virt_addr)) { host_attr->debug_area_size = 0; From 98f993ae09bd5390a45ce1b30a69f2324bad344c Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 6 Mar 2019 17:48:18 +0000 Subject: [PATCH 028/737] iov_iter: fix iov_for_each after accessor function introduction The switch to iovec accessors broke the iov_for_each macro, because of a missing indirection when passing the iov_iter to the new accessor functions. This went unnoticed, as the macro is not currently used in the kernel itself. Pass a struct iov_iter pointer, as expected. 
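As an illustration only (not part of the change itself), a caller could walk an ITER_IOVEC iterator with the restored macro roughly as sketched below; the helper name count_iovec_bytes is hypothetical and exists only to show the iteration pattern the macro is meant to support:

	/* Hypothetical sketch, not code from this patch: walk the segments of
	 * an ITER_IOVEC iterator with the restored iov_for_each() macro. The
	 * macro copies the iterator into 'iter', so 'src' is left untouched.
	 */
	#include <linux/uio.h>

	static size_t count_iovec_bytes(struct iov_iter *src)
	{
		struct iov_iter iter;	/* working copy advanced by the macro */
		struct iovec iov;	/* current segment */
		size_t total = 0;

		iov_for_each(iov, iter, *src)
			total += iov.iov_len;

		return total;
	}
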
Fixes: 00e23707442a ("iov_iter: Use accessor function") Signed-off-by: Frank van der Linden --- include/linux/uio.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/uio.h b/include/linux/uio.h index cedb68e49e4f9..06ddb12f3b649 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -125,6 +125,14 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) }; } +#define iov_for_each(iov, iter, start) \ + if (iov_iter_type(&(start)) == ITER_IOVEC || \ + iov_iter_type(&(start)) == ITER_KVEC) \ + for (iter = (start); \ + (iter).count && \ + ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); From 375450f4b8784c6b91f341064e24f6caa21d8ffa Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 1 Mar 2019 17:48:29 +0000 Subject: [PATCH 029/737] Import lustre client 2.10.5 --- drivers/staging/lustrefsx/config.h | 992 +++ .../lustrefsx/libcfs/include/libcfs/bitmap.h | 117 + .../lustrefsx/libcfs/include/libcfs/curproc.h | 81 + .../lustrefsx/libcfs/include/libcfs/libcfs.h | 137 + .../libcfs/include/libcfs/libcfs_cpu.h | 352 ++ .../libcfs/include/libcfs/libcfs_crypto.h | 213 + .../libcfs/include/libcfs/libcfs_debug.h | 384 ++ .../libcfs/include/libcfs/libcfs_fail.h | 178 + .../libcfs/include/libcfs/libcfs_hash.h | 857 +++ .../libcfs/include/libcfs/libcfs_heap.h | 203 + .../libcfs/include/libcfs/libcfs_ioctl.h | 155 + .../libcfs/include/libcfs/libcfs_prim.h | 84 + .../libcfs/include/libcfs/libcfs_private.h | 416 ++ .../libcfs/include/libcfs/libcfs_ptask.h | 112 + .../libcfs/include/libcfs/libcfs_string.h | 91 + .../libcfs/include/libcfs/libcfs_time.h | 81 + .../libcfs/include/libcfs/libcfs_workitem.h | 107 + .../libcfs/include/libcfs/linux/libcfs.h | 150 + .../libcfs/include/libcfs/linux/linux-cpu.h | 101 + .../include/libcfs/linux/linux-crypto.h | 55 + .../libcfs/include/libcfs/linux/linux-fs.h | 102 + .../libcfs/include/libcfs/linux/linux-list.h | 52 + .../libcfs/include/libcfs/linux/linux-mem.h | 132 + .../libcfs/include/libcfs/linux/linux-misc.h | 128 + .../libcfs/include/libcfs/linux/linux-time.h | 284 + .../libcfs/include/libcfs/util/ioctl.h | 68 + .../libcfs/include/libcfs/util/list.h | 499 ++ .../libcfs/include/libcfs/util/param.h | 40 + .../libcfs/include/libcfs/util/parser.h | 115 + .../libcfs/include/libcfs/util/string.h | 97 + .../staging/lustrefsx/libcfs/libcfs/debug.c | 353 ++ .../staging/lustrefsx/libcfs/libcfs/fail.c | 137 + .../staging/lustrefsx/libcfs/libcfs/hash.c | 2123 +++++++ .../staging/lustrefsx/libcfs/libcfs/heap.c | 499 ++ .../lustrefsx/libcfs/libcfs/libcfs_cpu.c | 234 + .../lustrefsx/libcfs/libcfs/libcfs_lock.c | 157 + .../lustrefsx/libcfs/libcfs/libcfs_mem.c | 172 + .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 483 ++ .../lustrefsx/libcfs/libcfs/libcfs_string.c | 596 ++ .../libcfs/libcfs/linux/crc32-pclmul_asm.S | 243 + .../libcfs/linux/crc32c-pcl-intel-asm_64.S | 466 ++ .../lustrefsx/libcfs/libcfs/linux/inst.h | 310 + .../lustrefsx/libcfs/libcfs/linux/linux-cpu.c | 1178 ++++ .../libcfs/libcfs/linux/linux-crypto-adler.c | 135 + .../libcfs/libcfs/linux/linux-crypto-crc32.c | 148 + .../libcfs/linux/linux-crypto-crc32c-pclmul.c | 159 + .../libcfs/linux/linux-crypto-crc32pclmul.c | 195 + .../libcfs/libcfs/linux/linux-crypto.c | 510 ++ .../libcfs/libcfs/linux/linux-curproc.c | 297 + .../libcfs/libcfs/linux/linux-debug.c | 283 + 
.../libcfs/libcfs/linux/linux-module.c | 173 + .../libcfs/libcfs/linux/linux-prim.c | 183 + .../libcfs/libcfs/linux/linux-tracefile.c | 272 + .../staging/lustrefsx/libcfs/libcfs/module.c | 827 +++ .../staging/lustrefsx/libcfs/libcfs/prng.c | 136 + .../lustrefsx/libcfs/libcfs/tracefile.c | 1157 ++++ .../lustrefsx/libcfs/libcfs/tracefile.h | 320 + .../lustrefsx/libcfs/libcfs/util/l_ioctl.c | 186 + .../lustrefsx/libcfs/libcfs/util/nidstrings.c | 1305 ++++ .../lustrefsx/libcfs/libcfs/util/param.c | 155 + .../lustrefsx/libcfs/libcfs/util/parser.c | 845 +++ .../lustrefsx/libcfs/libcfs/util/string.c | 482 ++ .../lustrefsx/libcfs/libcfs/watchdog.c | 501 ++ .../lustrefsx/libcfs/libcfs/workitem.c | 469 ++ .../staging/lustrefsx/lnet/include/cyaml.h | 257 + .../staging/lustrefsx/lnet/include/lnet/api.h | 217 + .../lustrefsx/lnet/include/lnet/lib-dlc.h | 243 + .../lustrefsx/lnet/include/lnet/lib-lnet.h | 895 +++ .../lustrefsx/lnet/include/lnet/lib-types.h | 846 +++ .../lustrefsx/lnet/include/lnet/lnet.h | 46 + .../lustrefsx/lnet/include/lnet/lnetctl.h | 130 + .../lustrefsx/lnet/include/lnet/lnetst.h | 515 ++ .../lustrefsx/lnet/include/lnet/nidstr.h | 112 + .../lustrefsx/lnet/include/lnet/socklnd.h | 90 + .../lustrefsx/lnet/include/lnet/types.h | 671 +++ .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 3386 +++++++++++ .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 1242 ++++ .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 3766 ++++++++++++ .../lnet/klnds/o2iblnd/o2iblnd_modparams.c | 315 + .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 2940 +++++++++ .../lustrefsx/lnet/klnds/socklnd/socklnd.h | 692 +++ .../lustrefsx/lnet/klnds/socklnd/socklnd_cb.c | 2658 +++++++++ .../lnet/klnds/socklnd/socklnd_lib.c | 752 +++ .../lnet/klnds/socklnd/socklnd_modparams.c | 215 + .../lnet/klnds/socklnd/socklnd_proto.c | 801 +++ .../staging/lustrefsx/lnet/lnet/acceptor.c | 522 ++ drivers/staging/lustrefsx/lnet/lnet/api-ni.c | 3182 ++++++++++ drivers/staging/lustrefsx/lnet/lnet/config.c | 1709 ++++++ drivers/staging/lustrefsx/lnet/lnet/lib-eq.c | 423 ++ drivers/staging/lustrefsx/lnet/lnet/lib-md.c | 557 ++ drivers/staging/lustrefsx/lnet/lnet/lib-me.c | 291 + .../staging/lustrefsx/lnet/lnet/lib-move.c | 3143 ++++++++++ drivers/staging/lustrefsx/lnet/lnet/lib-msg.c | 641 ++ drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c | 983 +++ .../staging/lustrefsx/lnet/lnet/lib-socket.c | 649 ++ drivers/staging/lustrefsx/lnet/lnet/lo.c | 114 + drivers/staging/lustrefsx/lnet/lnet/module.c | 261 + .../staging/lustrefsx/lnet/lnet/net_fault.c | 1040 ++++ .../staging/lustrefsx/lnet/lnet/nidstrings.c | 1200 ++++ drivers/staging/lustrefsx/lnet/lnet/peer.c | 1224 ++++ drivers/staging/lustrefsx/lnet/lnet/router.c | 1849 ++++++ .../staging/lustrefsx/lnet/lnet/router_proc.c | 988 +++ .../lustrefsx/lnet/selftest/brw_test.c | 526 ++ .../staging/lustrefsx/lnet/selftest/conctl.c | 924 +++ .../staging/lustrefsx/lnet/selftest/conrpc.c | 1402 +++++ .../staging/lustrefsx/lnet/selftest/conrpc.h | 146 + .../staging/lustrefsx/lnet/selftest/console.c | 2111 +++++++ .../staging/lustrefsx/lnet/selftest/console.h | 257 + .../lustrefsx/lnet/selftest/framework.c | 1809 ++++++ .../staging/lustrefsx/lnet/selftest/module.c | 165 + .../lustrefsx/lnet/selftest/ping_test.c | 226 + drivers/staging/lustrefsx/lnet/selftest/rpc.c | 1668 ++++++ drivers/staging/lustrefsx/lnet/selftest/rpc.h | 297 + .../lustrefsx/lnet/selftest/selftest.h | 614 ++ .../staging/lustrefsx/lnet/selftest/timer.c | 246 + .../staging/lustrefsx/lnet/selftest/timer.h | 49 + .../lustrefsx/lustre/fid/fid_handler.c | 
655 ++ .../lustrefsx/lustre/fid/fid_internal.h | 99 + .../staging/lustrefsx/lustre/fid/fid_lib.c | 100 + .../lustrefsx/lustre/fid/fid_request.c | 629 ++ .../staging/lustrefsx/lustre/fid/fid_store.c | 248 + .../staging/lustrefsx/lustre/fid/lproc_fid.c | 651 ++ .../staging/lustrefsx/lustre/fld/fld_cache.c | 554 ++ .../lustrefsx/lustre/fld/fld_handler.c | 504 ++ .../staging/lustrefsx/lustre/fld/fld_index.c | 511 ++ .../lustrefsx/lustre/fld/fld_internal.h | 236 + .../lustrefsx/lustre/fld/fld_request.c | 570 ++ .../staging/lustrefsx/lustre/fld/lproc_fld.c | 371 ++ .../lustrefsx/lustre/include/cl_object.h | 2493 ++++++++ .../lustrefsx/lustre/include/dt_object.h | 2830 +++++++++ .../lustrefsx/lustre/include/interval_tree.h | 131 + .../lustrefsx/lustre/include/llog_swab.h | 68 + .../lustrefsx/lustre/include/lprocfs_status.h | 1006 ++++ .../lustrefsx/lustre/include/lu_object.h | 1400 +++++ .../staging/lustrefsx/lustre/include/lu_ref.h | 259 + .../lustrefsx/lustre/include/lu_target.h | 680 +++ .../lustrefsx/lustre/include/lustre/libiam.h | 141 + .../lustre/include/lustre/liblustreapi.h | 39 + .../lustre/include/lustre/ll_fiemap.h | 75 + .../include/lustre/lustre_barrier_user.h | 73 + .../lustre/include/lustre/lustre_errno.h | 218 + .../lustre/include/lustre/lustre_idl.h | 3486 +++++++++++ .../lustre/include/lustre/lustre_lfsck_user.h | 236 + .../lustre/include/lustre/lustre_user.h | 1625 +++++ .../lustre/include/lustre/lustreapi.h | 822 +++ .../lustrefsx/lustre/include/lustre_acl.h | 52 + .../lustrefsx/lustre/include/lustre_barrier.h | 44 + .../lustrefsx/lustre/include/lustre_compat.h | 667 +++ .../lustrefsx/lustre/include/lustre_debug.h | 74 + .../lustrefsx/lustre/include/lustre_disk.h | 361 ++ .../lustrefsx/lustre/include/lustre_dlm.h | 1672 ++++++ .../lustre/include/lustre_dlm_flags.h | 407 ++ .../lustrefsx/lustre/include/lustre_eacl.h | 94 + .../lustrefsx/lustre/include/lustre_export.h | 435 ++ .../lustrefsx/lustre/include/lustre_fid.h | 952 +++ .../lustrefsx/lustre/include/lustre_fld.h | 196 + .../lustrefsx/lustre/include/lustre_ha.h | 60 + .../lustrefsx/lustre/include/lustre_handles.h | 88 + .../lustrefsx/lustre/include/lustre_idmap.h | 69 + .../lustrefsx/lustre/include/lustre_import.h | 390 ++ .../lustrefsx/lustre/include/lustre_intent.h | 68 + .../lustre/include/lustre_kernelcomm.h | 57 + .../lustrefsx/lustre/include/lustre_lfsck.h | 130 + .../lustrefsx/lustre/include/lustre_lib.h | 405 ++ .../lustrefsx/lustre/include/lustre_linkea.h | 95 + .../lustrefsx/lustre/include/lustre_lmv.h | 179 + .../lustrefsx/lustre/include/lustre_log.h | 560 ++ .../lustre/include/lustre_log_user.h | 79 + .../lustrefsx/lustre/include/lustre_mdc.h | 239 + .../lustrefsx/lustre/include/lustre_mds.h | 74 + .../lustrefsx/lustre/include/lustre_net.h | 2716 +++++++++ .../lustrefsx/lustre/include/lustre_nodemap.h | 214 + .../lustrefsx/lustre/include/lustre_nrs.h | 738 +++ .../lustrefsx/lustre/include/lustre_nrs_crr.h | 126 + .../lustre/include/lustre_nrs_delay.h | 87 + .../lustre/include/lustre_nrs_fifo.h | 70 + .../lustrefsx/lustre/include/lustre_nrs_orr.h | 225 + .../lustrefsx/lustre/include/lustre_nrs_tbf.h | 343 ++ .../lustrefsx/lustre/include/lustre_obdo.h | 53 + .../lustre/include/lustre_patchless_compat.h | 138 + .../lustrefsx/lustre/include/lustre_quota.h | 244 + .../lustre/include/lustre_req_layout.h | 342 ++ .../lustrefsx/lustre/include/lustre_sec.h | 1202 ++++ .../lustrefsx/lustre/include/lustre_swab.h | 134 + .../lustrefsx/lustre/include/lustre_update.h | 706 +++ .../lustrefsx/lustre/include/lustre_ver.h | 37 
+ .../staging/lustrefsx/lustre/include/lvfs.h | 102 + .../lustrefsx/lustre/include/md_object.h | 680 +++ .../staging/lustrefsx/lustre/include/obd.h | 1227 ++++ .../lustrefsx/lustre/include/obd_cache.h | 35 + .../lustrefsx/lustre/include/obd_cksum.h | 175 + .../lustrefsx/lustre/include/obd_class.h | 1744 ++++++ .../lustrefsx/lustre/include/obd_support.h | 911 +++ .../lustrefsx/lustre/include/obd_target.h | 73 + .../lustrefsx/lustre/include/obj_update.h | 115 + .../lustrefsx/lustre/include/seq_range.h | 192 + .../lustre/include/uapi/linux/lustre_cfg.h | 294 + .../lustre/include/uapi/linux/lustre_disk.h | 229 + .../lustre/include/uapi/linux/lustre_fid.h | 363 ++ .../lustre/include/uapi/linux/lustre_ioctl.h | 244 + .../lustre/include/uapi/linux/lustre_ostid.h | 243 + .../lustre/include/uapi/linux/lustre_param.h | 94 + .../lustre/include/uapi_kernelcomm.h | 89 + .../lustrefsx/lustre/include/upcall_cache.h | 154 + .../lustrefsx/lustre/ldlm/interval_tree.c | 765 +++ .../staging/lustrefsx/lustre/ldlm/l_lock.c | 72 + .../lustrefsx/lustre/ldlm/ldlm_extent.c | 1108 ++++ .../lustrefsx/lustre/ldlm/ldlm_flock.c | 950 +++ .../lustrefsx/lustre/ldlm/ldlm_inodebits.c | 250 + .../lustrefsx/lustre/ldlm/ldlm_internal.h | 415 ++ .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 3272 ++++++++++ .../staging/lustrefsx/lustre/ldlm/ldlm_lock.c | 2866 +++++++++ .../lustrefsx/lustre/ldlm/ldlm_lockd.c | 3258 ++++++++++ .../lustrefsx/lustre/ldlm/ldlm_plain.c | 189 + .../staging/lustrefsx/lustre/ldlm/ldlm_pool.c | 1621 +++++ .../lustrefsx/lustre/ldlm/ldlm_reclaim.c | 411 ++ .../lustrefsx/lustre/ldlm/ldlm_request.c | 2415 ++++++++ .../lustrefsx/lustre/ldlm/ldlm_resource.c | 1708 ++++++ .../staging/lustrefsx/lustre/llite/dcache.c | 383 ++ drivers/staging/lustrefsx/lustre/llite/dir.c | 1846 ++++++ drivers/staging/lustrefsx/lustre/llite/file.c | 4550 ++++++++++++++ .../staging/lustrefsx/lustre/llite/glimpse.c | 208 + .../lustrefsx/lustre/llite/lcommon_cl.c | 280 + .../lustrefsx/lustre/llite/lcommon_misc.c | 185 + .../lustrefsx/lustre/llite/llite_internal.h | 1440 +++++ .../lustrefsx/lustre/llite/llite_lib.c | 2841 +++++++++ .../lustrefsx/lustre/llite/llite_mmap.c | 511 ++ .../lustrefsx/lustre/llite/llite_nfs.c | 377 ++ .../lustrefsx/lustre/llite/lproc_llite.c | 1832 ++++++ .../staging/lustrefsx/lustre/llite/namei.c | 1536 +++++ .../lustrefsx/lustre/llite/range_lock.c | 244 + .../lustrefsx/lustre/llite/range_lock.h | 87 + drivers/staging/lustrefsx/lustre/llite/rw.c | 1251 ++++ drivers/staging/lustrefsx/lustre/llite/rw26.c | 810 +++ .../lustrefsx/lustre/llite/statahead.c | 1664 ++++++ .../staging/lustrefsx/lustre/llite/super25.c | 212 + .../staging/lustrefsx/lustre/llite/symlink.c | 242 + .../staging/lustrefsx/lustre/llite/vvp_dev.c | 655 ++ .../lustrefsx/lustre/llite/vvp_internal.h | 333 ++ .../staging/lustrefsx/lustre/llite/vvp_io.c | 1478 +++++ .../staging/lustrefsx/lustre/llite/vvp_lock.c | 86 + .../lustrefsx/lustre/llite/vvp_object.c | 315 + .../staging/lustrefsx/lustre/llite/vvp_page.c | 544 ++ .../staging/lustrefsx/lustre/llite/xattr.c | 819 +++ .../staging/lustrefsx/lustre/llite/xattr26.c | 603 ++ .../lustrefsx/lustre/llite/xattr_cache.c | 553 ++ .../lustrefsx/lustre/llite/xattr_security.c | 189 + .../staging/lustrefsx/lustre/lmv/lmv_fld.c | 84 + .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 516 ++ .../lustrefsx/lustre/lmv/lmv_internal.h | 161 + .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 3193 ++++++++++ .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 172 + .../lustrefsx/lustre/lov/lov_cl_internal.h | 661 ++ 
.../staging/lustrefsx/lustre/lov/lov_dev.c | 392 ++ drivers/staging/lustrefsx/lustre/lov/lov_ea.c | 546 ++ .../lustrefsx/lustre/lov/lov_internal.h | 364 ++ drivers/staging/lustrefsx/lustre/lov/lov_io.c | 1224 ++++ .../staging/lustrefsx/lustre/lov/lov_lock.c | 377 ++ .../staging/lustrefsx/lustre/lov/lov_merge.c | 109 + .../staging/lustrefsx/lustre/lov/lov_obd.c | 1474 +++++ .../staging/lustrefsx/lustre/lov/lov_object.c | 1778 ++++++ .../staging/lustrefsx/lustre/lov/lov_offset.c | 288 + .../staging/lustrefsx/lustre/lov/lov_pack.c | 461 ++ .../staging/lustrefsx/lustre/lov/lov_page.c | 149 + .../staging/lustrefsx/lustre/lov/lov_pool.c | 619 ++ .../lustrefsx/lustre/lov/lov_request.c | 370 ++ .../staging/lustrefsx/lustre/lov/lovsub_dev.c | 149 + .../lustrefsx/lustre/lov/lovsub_lock.c | 82 + .../lustrefsx/lustre/lov/lovsub_object.c | 194 + .../lustrefsx/lustre/lov/lovsub_page.c | 70 + .../staging/lustrefsx/lustre/lov/lproc_lov.c | 332 ++ .../staging/lustrefsx/lustre/mdc/lproc_mdc.c | 231 + .../lustrefsx/lustre/mdc/mdc_changelog.c | 724 +++ .../lustrefsx/lustre/mdc/mdc_internal.h | 166 + .../staging/lustrefsx/lustre/mdc/mdc_lib.c | 551 ++ .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 1282 ++++ .../staging/lustrefsx/lustre/mdc/mdc_reint.c | 434 ++ .../lustrefsx/lustre/mdc/mdc_request.c | 2647 ++++++++ .../staging/lustrefsx/lustre/mgc/lproc_mgc.c | 77 + .../lustrefsx/lustre/mgc/mgc_internal.h | 70 + .../lustrefsx/lustre/mgc/mgc_request.c | 2310 +++++++ .../staging/lustrefsx/lustre/obdclass/acl.c | 282 + .../lustrefsx/lustre/obdclass/cl_internal.h | 53 + .../staging/lustrefsx/lustre/obdclass/cl_io.c | 1362 +++++ .../lustrefsx/lustre/obdclass/cl_lock.c | 288 + .../lustrefsx/lustre/obdclass/cl_object.c | 1107 ++++ .../lustrefsx/lustre/obdclass/cl_page.c | 1141 ++++ .../lustrefsx/lustre/obdclass/class_obd.c | 713 +++ .../staging/lustrefsx/lustre/obdclass/debug.c | 106 + .../lustrefsx/lustre/obdclass/dt_object.c | 1097 ++++ .../lustrefsx/lustre/obdclass/genops.c | 2520 ++++++++ .../staging/lustrefsx/lustre/obdclass/idmap.c | 171 + .../lustrefsx/lustre/obdclass/kernelcomm.c | 261 + .../lustrefsx/lustre/obdclass/linkea.c | 307 + .../lustre/obdclass/linux/linux-module.c | 582 ++ .../lustre/obdclass/linux/linux-obdo.c | 157 + .../lustre/obdclass/linux/linux-sysctl.c | 190 + .../staging/lustrefsx/lustre/obdclass/llog.c | 1359 +++++ .../lustrefsx/lustre/obdclass/llog_cat.c | 1157 ++++ .../lustrefsx/lustre/obdclass/llog_internal.h | 95 + .../lustrefsx/lustre/obdclass/llog_ioctl.c | 496 ++ .../lustrefsx/lustre/obdclass/llog_obd.c | 266 + .../lustrefsx/lustre/obdclass/llog_osd.c | 2143 +++++++ .../lustrefsx/lustre/obdclass/llog_swab.c | 472 ++ .../lustrefsx/lustre/obdclass/llog_test.c | 2151 +++++++ .../lustrefsx/lustre/obdclass/local_storage.c | 973 +++ .../lustrefsx/lustre/obdclass/local_storage.h | 102 + .../lustre/obdclass/lprocfs_counters.c | 137 + .../lustre/obdclass/lprocfs_jobstats.c | 670 +++ .../lustre/obdclass/lprocfs_status.c | 2479 ++++++++ .../lustre/obdclass/lprocfs_status_server.c | 803 +++ .../lustrefsx/lustre/obdclass/lu_object.c | 2371 ++++++++ .../lustrefsx/lustre/obdclass/lu_ref.c | 444 ++ .../lustrefsx/lustre/obdclass/lu_ucred.c | 103 + .../lustre/obdclass/lustre_handles.c | 261 + .../lustrefsx/lustre/obdclass/lustre_peer.c | 202 + .../lustrefsx/lustre/obdclass/md_attrs.c | 186 + .../lustrefsx/lustre/obdclass/obd_config.c | 2236 +++++++ .../lustrefsx/lustre/obdclass/obd_mount.c | 1664 ++++++ .../lustre/obdclass/obd_mount_server.c | 1963 ++++++ .../staging/lustrefsx/lustre/obdclass/obdo.c | 
217 + .../lustrefsx/lustre/obdclass/statfs_pack.c | 73 + .../lustrefsx/lustre/obdclass/upcall_cache.c | 449 ++ .../staging/lustrefsx/lustre/obdclass/uuid.c | 78 + .../staging/lustrefsx/lustre/obdecho/echo.c | 674 +++ .../lustrefsx/lustre/obdecho/echo_client.c | 3120 ++++++++++ .../lustrefsx/lustre/obdecho/echo_internal.h | 52 + .../staging/lustrefsx/lustre/osc/lproc_osc.c | 845 +++ .../staging/lustrefsx/lustre/osc/osc_cache.c | 3323 +++++++++++ .../lustrefsx/lustre/osc/osc_cl_internal.h | 679 +++ .../staging/lustrefsx/lustre/osc/osc_dev.c | 246 + .../lustrefsx/lustre/osc/osc_internal.h | 248 + drivers/staging/lustrefsx/lustre/osc/osc_io.c | 1001 ++++ .../staging/lustrefsx/lustre/osc/osc_lock.c | 1249 ++++ .../staging/lustrefsx/lustre/osc/osc_object.c | 482 ++ .../staging/lustrefsx/lustre/osc/osc_page.c | 1107 ++++ .../staging/lustrefsx/lustre/osc/osc_quota.c | 303 + .../lustrefsx/lustre/osc/osc_request.c | 3117 ++++++++++ .../staging/lustrefsx/lustre/ptlrpc/client.c | 3478 +++++++++++ .../lustrefsx/lustre/ptlrpc/connection.c | 240 + .../staging/lustrefsx/lustre/ptlrpc/errno.c | 384 ++ .../staging/lustrefsx/lustre/ptlrpc/events.c | 640 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_api.h | 179 + .../lustrefsx/lustre/ptlrpc/gss/gss_asn1.h | 84 + .../lustrefsx/lustre/ptlrpc/gss/gss_bulk.c | 522 ++ .../lustre/ptlrpc/gss/gss_cli_upcall.c | 443 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.c | 491 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.h | 35 + .../lustrefsx/lustre/ptlrpc/gss/gss_err.h | 193 + .../lustre/ptlrpc/gss/gss_generic_token.c | 285 + .../lustre/ptlrpc/gss/gss_internal.h | 557 ++ .../lustrefsx/lustre/ptlrpc/gss/gss_keyring.c | 1614 +++++ .../lustrefsx/lustre/ptlrpc/gss/gss_krb5.h | 160 + .../lustre/ptlrpc/gss/gss_krb5_mech.c | 1554 +++++ .../lustre/ptlrpc/gss/gss_mech_switch.c | 359 ++ .../lustre/ptlrpc/gss/gss_null_mech.c | 219 + .../lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c | 1254 ++++ .../lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c | 238 + .../lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c | 969 +++ .../lustre/ptlrpc/gss/gss_svc_upcall.c | 1199 ++++ .../lustrefsx/lustre/ptlrpc/gss/lproc_gss.c | 226 + .../lustrefsx/lustre/ptlrpc/gss/sec_gss.c | 2926 +++++++++ .../staging/lustrefsx/lustre/ptlrpc/import.c | 1784 ++++++ .../staging/lustrefsx/lustre/ptlrpc/layout.c | 2541 ++++++++ .../lustrefsx/lustre/ptlrpc/llog_client.c | 374 ++ .../lustrefsx/lustre/ptlrpc/llog_net.c | 67 + .../lustrefsx/lustre/ptlrpc/llog_server.c | 338 ++ .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 1364 +++++ .../staging/lustrefsx/lustre/ptlrpc/niobuf.c | 992 +++ .../lustre/ptlrpc/nodemap_internal.h | 206 + drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c | 1854 ++++++ .../staging/lustrefsx/lustre/ptlrpc/nrs_crr.c | 883 +++ .../lustrefsx/lustre/ptlrpc/nrs_delay.c | 842 +++ .../lustrefsx/lustre/ptlrpc/nrs_fifo.c | 271 + .../staging/lustrefsx/lustre/ptlrpc/nrs_orr.c | 2000 +++++++ .../staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c | 3088 ++++++++++ .../lustrefsx/lustre/ptlrpc/pack_generic.c | 2821 +++++++++ .../staging/lustrefsx/lustre/ptlrpc/pers.c | 74 + .../staging/lustrefsx/lustre/ptlrpc/pinger.c | 709 +++ .../lustrefsx/lustre/ptlrpc/ptlrpc_internal.h | 414 ++ .../lustrefsx/lustre/ptlrpc/ptlrpc_module.c | 155 + .../staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c | 965 +++ .../staging/lustrefsx/lustre/ptlrpc/recover.c | 386 ++ drivers/staging/lustrefsx/lustre/ptlrpc/sec.c | 2580 ++++++++ .../lustrefsx/lustre/ptlrpc/sec_bulk.c | 960 +++ .../lustrefsx/lustre/ptlrpc/sec_config.c | 967 +++ .../staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 99 + 
.../staging/lustrefsx/lustre/ptlrpc/sec_gc.c | 252 + .../lustrefsx/lustre/ptlrpc/sec_lproc.c | 204 + .../lustrefsx/lustre/ptlrpc/sec_null.c | 456 ++ .../lustrefsx/lustre/ptlrpc/sec_plain.c | 1035 ++++ .../staging/lustrefsx/lustre/ptlrpc/service.c | 3309 ++++++++++ .../lustrefsx/lustre/ptlrpc/wiretest.c | 5300 +++++++++++++++++ .../staging/lustrefsx/lustre/target/barrier.c | 416 ++ .../lustrefsx/lustre/target/out_handler.c | 1186 ++++ .../staging/lustrefsx/lustre/target/out_lib.c | 1267 ++++ .../lustrefsx/lustre/target/tgt_grant.c | 1507 +++++ .../lustrefsx/lustre/target/tgt_handler.c | 2388 ++++++++ .../lustrefsx/lustre/target/tgt_internal.h | 291 + .../lustrefsx/lustre/target/tgt_lastrcvd.c | 2117 +++++++ .../lustrefsx/lustre/target/tgt_main.c | 431 ++ .../lustrefsx/lustre/target/update_records.c | 1233 ++++ .../lustrefsx/lustre/target/update_recovery.c | 1447 +++++ .../lustrefsx/lustre/target/update_trans.c | 1752 ++++++ drivers/staging/lustrefsx/undef.h | 990 +++ 405 files changed, 303101 insertions(+) create mode 100644 drivers/staging/lustrefsx/config.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/debug.c create mode 100644 
drivers/staging/lustrefsx/libcfs/libcfs/fail.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/hash.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/heap.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/module.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/prng.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/param.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/util/string.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/workitem.c create mode 100644 drivers/staging/lustrefsx/lnet/include/cyaml.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/api.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lnet.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h create mode 100644 drivers/staging/lustrefsx/lnet/include/lnet/types.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c create mode 100644 
drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/acceptor.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/api-ni.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/config.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-eq.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-md.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-me.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-move.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-msg.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lib-socket.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/lo.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/module.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/net_fault.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/nidstrings.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/peer.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/router.c create mode 100644 drivers/staging/lustrefsx/lnet/lnet/router_proc.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/brw_test.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/conctl.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/conrpc.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/conrpc.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/console.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/console.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/framework.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/module.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/ping_test.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/rpc.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/rpc.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/selftest.h create mode 100644 drivers/staging/lustrefsx/lnet/selftest/timer.c create mode 100644 drivers/staging/lustrefsx/lnet/selftest/timer.h create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_request.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/fid_store.c create mode 100644 drivers/staging/lustrefsx/lustre/fid/lproc_fid.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_index.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/fld/fld_request.c create mode 100644 drivers/staging/lustrefsx/lustre/fld/lproc_fld.c create mode 100644 drivers/staging/lustrefsx/lustre/include/cl_object.h create mode 100644 
drivers/staging/lustrefsx/lustre/include/dt_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/interval_tree.h create mode 100644 drivers/staging/lustrefsx/lustre/include/llog_swab.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lprocfs_status.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lu_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lu_ref.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lu_target.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/libiam.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_acl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_barrier.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_compat.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_debug.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_disk.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_dlm.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_eacl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_export.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_fid.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_fld.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_ha.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_handles.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_idmap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_import.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_intent.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_lib.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_linkea.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_lmv.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_log.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_log_user.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_mdc.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_mds.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_net.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h create mode 100644 
drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_obdo.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_quota.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_sec.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_swab.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_update.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_ver.h create mode 100644 drivers/staging/lustrefsx/lustre/include/lvfs.h create mode 100644 drivers/staging/lustrefsx/lustre/include/md_object.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_cache.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_cksum.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_class.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_support.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obd_target.h create mode 100644 drivers/staging/lustrefsx/lustre/include/obj_update.h create mode 100644 drivers/staging/lustrefsx/lustre/include/seq_range.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h create mode 100644 drivers/staging/lustrefsx/lustre/include/upcall_cache.h create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/l_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c create mode 100644 drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/dcache.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/dir.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/file.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/glimpse.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c create mode 100644 
drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_mmap.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/llite_nfs.c create mode 100755 drivers/staging/lustrefsx/lustre/llite/lproc_llite.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/namei.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/range_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/range_lock.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/rw.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/rw26.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/statahead.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/super25.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/symlink.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_io.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_object.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_page.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr26.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/xattr_security.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_ea.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_io.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_merge.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_object.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_offset.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_pack.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_page.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_pool.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lov_request.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_object.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_page.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lproc_lov.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h create mode 100644 
drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_request.c create mode 100644 drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c create mode 100644 drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/mgc/mgc_request.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/acl.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_io.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_object.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/cl_page.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/class_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/debug.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/dt_object.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/genops.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/idmap.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linkea.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/llog_test.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/local_storage.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/local_storage.h create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_object.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_config.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obdo.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c create mode 100644 
drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/uuid.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/echo.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/echo_client.c create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/osc/lproc_osc.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_cache.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_io.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_object.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_page.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_quota.c create mode 100644 drivers/staging/lustrefsx/lustre/osc/osc_request.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/client.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/connection.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/errno.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/events.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/import.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/layout.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c create mode 
100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pers.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/recover.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/service.c create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c create mode 100644 drivers/staging/lustrefsx/lustre/target/barrier.c create mode 100644 drivers/staging/lustrefsx/lustre/target/out_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/target/out_lib.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_grant.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_handler.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_internal.h create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_main.c create mode 100644 drivers/staging/lustrefsx/lustre/target/update_records.c create mode 100644 drivers/staging/lustrefsx/lustre/target/update_recovery.c create mode 100644 drivers/staging/lustrefsx/lustre/target/update_trans.c create mode 100644 drivers/staging/lustrefsx/undef.h diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h new file mode 100644 index 0000000000000..a0c7aaa6a0472 --- /dev/null +++ b/drivers/staging/lustrefsx/config.h @@ -0,0 +1,992 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. 
*/ + +/* enable libcfs CDEBUG, CWARN */ +#define CDEBUG_ENABLED 1 + +/* enable libcfs ENTRY/EXIT */ +#define CDEBUG_ENTRY_EXIT 1 + +/* enable page state tracking code */ +/* #undef CONFIG_DEBUG_PAGESTATE_TRACKING */ + +/* enable encryption for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_ENCRYPTION */ + +/* posix acls for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_POSIX_ACL */ + +/* enable rw access for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_RW */ + +/* fs security for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_SECURITY */ + +/* extened attributes for ldiskfs */ +/* #undef CONFIG_LDISKFS_FS_XATTR */ + +/* Max LNET payload */ +#define CONFIG_LNET_MAX_PAYLOAD LNET_MTU + +/* enable invariant checking */ +/* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +/* IOCTL Buffer Size */ +#define CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER 8192 + +/* kernel has cpu affinity support */ +/* #undef CPU_AFFINITY */ + +/* both i_dentry/d_alias uses list */ +/* #undef DATA_FOR_LLITE_IS_LIST */ + +/* name of ldiskfs debug program */ +#define DEBUGFS "debugfs" + +/* name of ldiskfs dump program */ +#define DUMPE2FS "dumpe2fs" + +/* name of ldiskfs fsck program */ +#define E2FSCK "e2fsck" + +/* name of ldiskfs e2fsprogs package */ +#define E2FSPROGS "e2fsprogs" + +/* name of ldiskfs label program */ +#define E2LABEL "e2label" + +/* do data checksums */ +#define ENABLE_CHECKSUM 1 + +/* Use the Pinger */ +#define ENABLE_PINGER 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASM_TYPES_H 1 + +/* backing_dev_info exist */ +/* #undef HAVE_BACKING_DEV_INFO */ + +/* BDI_CAP_MAP_COPY exist */ +/* #undef HAVE_BDI_CAP_MAP_COPY */ + +/* bio_endio takes only one argument */ +#define HAVE_BIO_ENDIO_USES_ONE_ARG 1 + +/* bio_end_sector is defined */ +#define HAVE_BIO_END_SECTOR 1 + +/* 'bio_integrity_enabled' is available */ +/* #undef HAVE_BIO_INTEGRITY_ENABLED */ + +/* 'bi_bdev' is available */ +/* #undef HAVE_BI_BDEV */ + +/* struct bio has bi_cnt */ +/* #undef HAVE_BI_CNT */ + +/* struct bio has bi_rw */ +/* #undef HAVE_BI_RW */ + +/* 'bi_status' is available */ +#define HAVE_BI_STATUS 1 + +/* blkdev_get_by_dev is exported by the kernel */ +#define HAVE_BLKDEV_GET_BY_DEV 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_BLKID_BLKID_H */ + +/* blk_plug struct exists */ +#define HAVE_BLK_PLUG 1 + +/* blk_queue_max_segments is defined */ +#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1 + +/* kernel has struct bvec_iter */ +#define HAVE_BVEC_ITER 1 + +/* cache_head has hlist cache_list */ +#define HAVE_CACHE_HEAD_HLIST 1 + +/* have cache_register */ +/* #undef HAVE_CACHE_REGISTER */ + +/* cancel_dirty_page is still available */ +/* #undef HAVE_CANCEL_DIRTY_PAGE */ + +/* kernel has clean_bdev_aliases */ +#define HAVE_CLEAN_BDEV_ALIASES 1 + +/* have clear_inode */ +#define HAVE_CLEAR_INODE 1 + +/* compat rdma found */ +/* #undef HAVE_COMPAT_RDMA */ + +/* cpumap_print_to_pagebuf is available */ +#define HAVE_CPUMASK_PRINT_TO_PAGEBUF 1 + +/* kernel compiled with CRC32 functions */ +#define HAVE_CRC32 1 + +/* struct cred has member tgcred */ +/* #undef HAVE_CRED_TGCRED */ + +/* crypto hash helper functions are available */ +#define HAVE_CRYPTO_HASH_HELPERS 1 + +/* current_time() has replaced CURRENT_TIME */ +#define HAVE_CURRENT_TIME 1 + +/* dcache_lock is exist */ +/* #undef HAVE_DCACHE_LOCK */ + +/* kernel export delete_from_page_cache */ +#define HAVE_DELETE_FROM_PAGE_CACHE 1 + +/* dentry.d_child exist */ +#define HAVE_DENTRY_D_CHILD 1 + +/* hlist dentry.d_u.d_alias exist */ +#define HAVE_DENTRY_D_U_D_ALIAS 1 + +/* dentry_open uses struct path as first argument */ +#define HAVE_DENTRY_OPEN_USE_PATH 1 + +/* direct_IO need 2 arguments */ +#define HAVE_DIRECTIO_2ARGS 1 + +/* direct IO uses iov_iter */ +/* #undef HAVE_DIRECTIO_ITER */ + +/* dirty_inode super_operation takes flag */ +#define HAVE_DIRTY_INODE_HAS_FLAG 1 + +/* dir_context exist */ +#define HAVE_DIR_CONTEXT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Have dmu_object_alloc_dnsize in ZFS */ +/* #undef HAVE_DMU_OBJECT_ALLOC_DNSIZE */ + +/* Have dmu_objset_disown() with 3 args */ +/* #undef HAVE_DMU_OBJSET_DISOWN_3ARG */ + +/* Have dmu_objset_own() with 6 args */ +/* #undef HAVE_DMU_OBJSET_OWN_6ARG */ + +/* Have 6 argument dmu_pretch in ZFS */ +/* #undef HAVE_DMU_PREFETCH_6ARG */ + +/* Have dmu_read_by_dnode() in ZFS */ +/* #undef HAVE_DMU_READ_BY_DNODE */ + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +/* #undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE */ + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +/* #undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE */ + +/* Have dmu_tx_mark_netfree */ +/* #undef HAVE_DMU_TX_MARK_NETFREE */ + +/* Have native dnode accounting in ZFS */ +/* #undef HAVE_DMU_USEROBJ_ACCOUNTING */ + +/* Have dmu_write_by_dnode() in ZFS */ +/* #undef HAVE_DMU_WRITE_BY_DNODE */ + +/* quotactl_ops.set_dqblk takes struct fs_disk_quota */ +/* #undef HAVE_DQUOT_FS_DISK_QUOTA */ + +/* quotactl_ops.set_dqblk takes struct kqid */ +#define HAVE_DQUOT_KQID 1 + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#define HAVE_DQUOT_QC_DQBLK 1 + +/* dquot_suspend is defined */ +#define HAVE_DQUOT_SUSPEND 1 + +/* Have dsl_pool_config_enter/exit in ZFS */ +/* #undef HAVE_DSL_POOL_CONFIG */ + +/* Have dsl_sync_task_do_nowait in ZFS */ +/* #undef HAVE_DSL_SYNC_TASK_DO_NOWAIT */ + +/* dump_trace want address argument */ +/* #undef HAVE_DUMP_TRACE_ADDRESS */ + +/* d_compare need 4 arguments */ +#define HAVE_D_COMPARE_4ARGS 1 + +/* d_compare need 5 arguments */ +/* #undef HAVE_D_COMPARE_5ARGS */ + +/* d_compare need 7 arguments */ +/* #undef HAVE_D_COMPARE_7ARGS */ + +/* d_count exist */ +#define HAVE_D_COUNT 1 + +/* d_delete first parameter declared is not const */ +#define HAVE_D_DELETE_CONST const + +/* have d_make_root */ 
+#define HAVE_D_MAKE_ROOT 1 + +/* have parent inode as parameter */ +#define HAVE_ENCODE_FH_PARENT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ENDIAN_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_EXT2FS_EXT2FS_H */ + +/* ext4_bread takes 4 arguments */ +/* #undef HAVE_EXT4_BREAD_4ARGS */ + +/* i_dquot is in ext4_inode_info */ +/* #undef HAVE_EXT4_INFO_DQUOT */ + +/* ext4_free_blocks do not require struct buffer_head */ +/* #undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD */ + +/* Linux kernel has ext_pblock */ +/* #undef HAVE_EXT_PBLOCK */ + +/* file handle and related syscalls are supported */ +#define HAVE_FHANDLE_GLIBC_SUPPORT 1 + +/* kernel supports fhandles and related syscalls */ +#define HAVE_FHANDLE_SYSCALLS 1 + +/* kernel has file_dentry */ +#define HAVE_FILE_DENTRY 1 + +/* file_operations.fsync takes 2 arguments */ +/* #undef HAVE_FILE_FSYNC_2ARGS */ + +/* file_operations.fsync takes 4 arguments */ +#define HAVE_FILE_FSYNC_4ARGS 1 + +/* struct file has member f_inode */ +#define HAVE_FILE_F_INODE 1 + +/* file_inode() has been defined */ +#define HAVE_FILE_INODE 1 + +/* generic_file_llseek_size is exported by the kernel */ +#define HAVE_FILE_LLSEEK_SIZE 1 + +/* kernel has generic_file_llseek_size with 5 args */ +#define HAVE_FILE_LLSEEK_SIZE_5ARGS 1 + +/* file_operations.[read|write]_iter functions exist */ +#define HAVE_FILE_OPERATIONS_READ_WRITE_ITER 1 + +/* filldir_t needs struct dir_context as argument */ +#define HAVE_FILLDIR_USE_CTX 1 + +/* fpu/api.h is present */ +#define HAVE_FPU_API_HEADER 1 + +/* struct file_system_type has mount field */ +#define HAVE_FSTYPE_MOUNT 1 + +/* fs_struct.lock use rwlock */ +/* #undef HAVE_FS_STRUCT_RWLOCK */ + +/* fs_struct use seqcount */ +#define HAVE_FS_STRUCT_SEQCOUNT 1 + +/* full_name_hash need 3 arguments */ +#define HAVE_FULL_NAME_HASH_3ARGS 1 + +/* generic_permission taken 2 arguments */ +#define HAVE_GENERIC_PERMISSION_2ARGS 1 + +/* generic_permission taken 4 arguments */ +/* #undef HAVE_GENERIC_PERMISSION_4ARGS */ + +/* generic_write_sync need 2 arguments */ +#define HAVE_GENERIC_WRITE_SYNC_2ARGS 1 + +/* Define to 1 if you have the `gethostbyname' function. 
*/ +#define HAVE_GETHOSTBYNAME 1 + +/* get_user_pages takes 6 arguments */ +/* #undef HAVE_GET_USER_PAGES_6ARG */ + +/* get_user_pages takes gup_flags in arguments */ +#define HAVE_GET_USER_PAGES_GUP_FLAGS 1 + +/* struct group_info has member gid */ +#define HAVE_GROUP_INFO_GID 1 + +/* Define this is if you enable gss */ +/* #undef HAVE_GSS */ + +/* Define this if you enable gss keyring backend */ +/* #undef HAVE_GSS_KEYRING */ + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +/* #undef HAVE_GSS_KRB5_CCACHE_NAME */ + +/* Define this if you have Heimdal Kerberos libraries */ +/* #undef HAVE_HEIMDAL */ + +/* hlist_add_after is available */ +/* #undef HAVE_HLIST_ADD_AFTER */ + +/* hlist_for_each_entry has 3 args */ +#define HAVE_HLIST_FOR_EACH_3ARG 1 + +/* hotplug state machine is supported */ +#define HAVE_HOTPLUG_STATE_MACHINE 1 + +/* ib_alloc_fast_reg_mr is defined */ +/* #undef HAVE_IB_ALLOC_FAST_REG_MR */ + +/* ib_alloc_pd has 2 arguments */ +#define HAVE_IB_ALLOC_PD_2ARGS 1 + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#define HAVE_IB_CQ_INIT_ATTR 1 + +/* struct ib_device.attrs is defined */ +#define HAVE_IB_DEVICE_ATTRS 1 + +/* ib_get_dma_mr is defined */ +/* #undef HAVE_IB_GET_DMA_MR */ + +/* function ib_inc_rkey exist */ +#define HAVE_IB_INC_RKEY 1 + +/* ib_map_mr_sg exists */ +#define HAVE_IB_MAP_MR_SG 1 + +/* ib_map_mr_sg has 5 arguments */ +#define HAVE_IB_MAP_MR_SG_5ARGS 1 + +/* struct ib_rdma_wr is defined */ +#define HAVE_IB_RDMA_WR 1 + +/* inode_operations .getattr member function can gather advance stats */ +#define HAVE_INODEOPS_ENHANCED_GETATTR 1 + +/* inode_operations has .truncate member function */ +/* #undef HAVE_INODEOPS_TRUNCATE */ + +/* inode_operations use umode_t as parameter */ +#define HAVE_INODEOPS_USE_UMODE_T 1 + +/* inode->i_alloc_sem is killed and use inode_dio_wait */ +#define HAVE_INODE_DIO_WAIT 1 + +/* inode.i_rcu exists */ +#define HAVE_INODE_I_RCU 1 + +/* inode_lock is defined */ +#define HAVE_INODE_LOCK 1 + +/* inode_owner_or_capable exist */ +#define HAVE_INODE_OWNER_OR_CAPABLE 1 + +/* inode_operations->permission has two args */ +#define HAVE_INODE_PERMISION_2ARGS 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_INTTYPES_H 1 + +/* address_space_operations.invalidatepage needs 3 arguments */ +#define HAVE_INVALIDATE_RANGE 1 + +/* have in_compat_syscall */ +#define HAVE_IN_COMPAT_SYSCALL 1 + +/* inode_operations->rename need flags as argument */ +#define HAVE_IOPS_RENAME_WITH_FLAGS 1 + +/* have iop atomic_open */ +#define HAVE_IOP_ATOMIC_OPEN 1 + +/* generic_readlink has been removed */ +/* #undef HAVE_IOP_GENERIC_READLINK */ + +/* inode_operations has .get_acl member function */ +#define HAVE_IOP_GET_ACL 1 + +/* have iop get_link */ +#define HAVE_IOP_GET_LINK 1 + +/* inode_operations has .set_acl member function */ +#define HAVE_IOP_SET_ACL 1 + +/* inode_operations has {get,set,remove}xattr members */ +/* #undef HAVE_IOP_XATTR */ + +/* iov_iter_init handles directional tag */ +#define HAVE_IOV_ITER_INIT_DIRECTION 1 + +/* iov_iter_rw exist */ +#define HAVE_IOV_ITER_RW 1 + +/* iov_iter_truncate exists */ +#define HAVE_IOV_ITER_TRUNCATE 1 + +/* is_sxid is defined */ +#define HAVE_IS_SXID 1 + +/* i_uid_read is present */ +#define HAVE_I_UID_READ 1 + +/* kernel_locked is defined */ +/* #undef HAVE_KERNEL_LOCKED */ + +/* 'struct sock' accept function requires bool argument */ +#define HAVE_KERN_SOCK_ACCEPT_FLAG_ARG 1 + +/* struct key_match_data exist */ +#define HAVE_KEY_MATCH_DATA 1 + +/* payload.data is an array */ +#define HAVE_KEY_PAYLOAD_DATA_ARRAY 1 + +/* key_type->instantiate has two args */ +/* #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS */ + +/* ki_left exist */ +/* #undef HAVE_KIOCB_KI_LEFT */ + +/* ki_nbytes field exist */ +/* #undef HAVE_KI_NBYTES */ + +/* have kmap_atomic has only 1 argument */ +#define HAVE_KMAP_ATOMIC_HAS_1ARG 1 + +/* kmap_to_page is exported by the kernel */ +/* #undef HAVE_KMAP_TO_PAGE */ + +/* Define this if you have MIT Kerberos libraries */ +/* #undef HAVE_KRB5 */ + +/* Define this if the function krb5int_derive_key is available */ +/* #undef HAVE_KRB5INT_DERIVE_KEY */ + +/* Define this if the function krb5_derive_key is available */ +/* #undef HAVE_KRB5_DERIVE_KEY */ + +/* Define this if the function krb5_get_error_message is available */ +/* #undef HAVE_KRB5_GET_ERROR_MESSAGE */ + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +/* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ + +/* kernel has kstrtoul */ +#define HAVE_KSTRTOUL 1 + +/* kthread_worker found */ +/* #undef HAVE_KTHREAD_WORK */ + +/* ktime_add is available */ +#define HAVE_KTIME_ADD 1 + +/* ktime_after is available */ +#define HAVE_KTIME_AFTER 1 + +/* ktime_before is available */ +#define HAVE_KTIME_BEFORE 1 + +/* ktime_compare is available */ +#define HAVE_KTIME_COMPARE 1 + +/* 'ktime_get_real_seconds' is available */ +#define HAVE_KTIME_GET_REAL_SECONDS 1 + +/* 'ktime_get_real_ts64' is available */ +#define HAVE_KTIME_GET_REAL_TS64 1 + +/* 'ktime_get_seconds' is available */ +#define HAVE_KTIME_GET_SECONDS 1 + +/* 'ktime_get_ts64' is available */ +#define HAVE_KTIME_GET_TS64 1 + +/* 'ktime_to_timespec64' is available */ +#define HAVE_KTIME_TO_TIMESPEC64 1 + +/* enable use of ldiskfsprogs package */ +/* #undef HAVE_LDISKFSPROGS */ + +/* kernel has ext4_map_blocks */ +/* #undef HAVE_LDISKFS_MAP_BLOCKS */ + +/* Enable ldiskfs osd */ +/* #undef HAVE_LDISKFS_OSD */ + +/* libefence support is requested */ +/* #undef HAVE_LIBEFENCE */ + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). 
*/ +/* #undef HAVE_LIBKEYUTILS */ + +/* build with libmount */ +/* #undef HAVE_LIBMOUNT */ + +/* use libpthread for libcfs library */ +#define HAVE_LIBPTHREAD 1 + +/* readline library is available */ +/* #undef HAVE_LIBREADLINE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_RANDOM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_UNISTD_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_VERSION_H 1 + +/* lock-manager ops renamed to lm_xxx */ +#define HAVE_LM_XXX_LOCK_MANAGER_OPS 1 + +/* kernel has locks_lock_file_wait */ +#define HAVE_LOCKS_LOCK_FILE_WAIT 1 + +/* kernel has LOOP_CTL_GET_FREE */ +#define HAVE_LOOP_CTL_GET_FREE 1 + +/* Enable lru resize support */ +#define HAVE_LRU_RESIZE_SUPPORT 1 + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +/* #undef HAVE_LUCID_CONTEXT_SUPPORT */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* address_space_operations.migratepage has 4 args */ +#define HAVE_MIGRATEPAGE_4ARGS 1 + +/* kernel has include/linux/migrate.h */ +#define HAVE_MIGRATE_H 1 + +/* kernel has include/linux/migrate_mode.h */ +/* #undef HAVE_MIGRATE_MODE_H */ + +/* kernel module loading is possible */ +#define HAVE_MODULE_LOADING_SUPPORT 1 + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#define HAVE_NAME_TO_HANDLE_AT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H 1 + +/* cancel_dirty_page with one arguement is available */ +#define HAVE_NEW_CANCEL_DIRTY_PAGE 1 + +/* 'kernel_write' aligns with read/write helpers */ +#define HAVE_NEW_KERNEL_WRITE 1 + +/* with oldsize */ +/* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ + +/* OpenSSL HMAC functions needed for SSK */ +/* #undef HAVE_OPENSSL_SSK */ + +/* 'pagevec_init' takes one parameter */ +/* #undef HAVE_PAGEVEC_INIT_ONE_PARAM */ + +/* have PCLMULQDQ instruction */ +#define HAVE_PCLMULQDQ 1 + +/* percpu_counter_init uses GFP_* flag */ +#define HAVE_PERCPU_COUNTER_INIT_GFP_FLAG 1 + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#define HAVE_PID_NS_FOR_CHILDREN 1 + +/* posix_acl_to_xattr takes struct user_namespace */ +#define HAVE_POSIXACL_USER_NS 1 + +/* 'posix_acl_update_mode' is available */ +#define HAVE_POSIX_ACL_UPDATE_MODE 1 + +/* posix_acl_valid takes struct user_namespace */ +#define HAVE_POSIX_ACL_VALID_USER_NS 1 + +/* proc_remove is defined */ +#define HAVE_PROC_REMOVE 1 + +/* get_projid function exists */ +#define HAVE_PROJECT_QUOTA 1 + +/* inode->i_nlink is protected from direct modification */ +#define HAVE_PROTECT_I_NLINK 1 + +/* have quota64 */ +/* #undef HAVE_QUOTA64 */ + +/* radix_tree_exceptional_entry exist */ +#define HAVE_RADIX_EXCEPTION_ENTRY 1 + +/* rdma_create_id wants 4 args */ +/* #undef HAVE_RDMA_CREATE_ID_4ARG */ + +/* rdma_create_id wants 5 args */ +#define HAVE_RDMA_CREATE_ID_5ARG 1 + +/* reinit_completion is exist */ +#define HAVE_REINIT_COMPLETION 1 + +/* kernel export remove_from_page_cache */ +/* #undef HAVE_REMOVE_FROM_PAGE_CACHE */ + +/* remove_proc_subtree is defined */ +#define HAVE_REMOVE_PROC_SUBTREE 1 + +/* Have sa_spill_alloc in ZFS */ +/* #undef HAVE_SA_SPILL_ALLOC */ + +/* super_operations.evict_inode() is exist in kernel */ +#define HAVE_SBOPS_EVICT_INODE 1 + +/* kernel supports wrapped FS freeze functions */ +#define HAVE_SB_START_WRITE 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SCHED_H 1 + +/* linux/sched header directory exist */ +#define HAVE_SCHED_HEADERS 1 + +/* security_dentry_init_security' is defined */ +#define HAVE_SECURITY_DENTRY_INIT_SECURITY 1 + +/* security_inode_init_security takes a callback to set xattrs */ +#define HAVE_SECURITY_IINITSEC_CALLBACK 1 + +/* security_inode_init_security takes a 'struct qstr' parameter */ +/* #undef HAVE_SECURITY_IINITSEC_QSTR */ + +/* support for selinux */ +#define HAVE_SELINUX 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SELINUX_SELINUX_H 1 + +/* support server */ +/* #undef HAVE_SERVER_SUPPORT */ + +/* Define to 1 if you have the `setns' function. */ +#define HAVE_SETNS 1 + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +/* #undef HAVE_SET_ALLOWABLE_ENCTYPES */ + +/* shrinker has count_objects member */ +#define HAVE_SHRINKER_COUNT 1 + +/* shrinker want self pointer in handler */ +/* #undef HAVE_SHRINKER_WANT_SHRINK_PTR */ + +/* shrink_control is present */ +#define HAVE_SHRINK_CONTROL 1 + +/* simple_setattr is exported by the kernel */ +#define HAVE_SIMPLE_SETATTR 1 + +/* sk_data_ready uses only one argument */ +#define HAVE_SK_DATA_READY_ONE_ARG 1 + +/* kernel has sk_sleep */ +#define HAVE_SK_SLEEP 1 + +/* sock_alloc_file is exported */ +/* #undef HAVE_SOCK_ALLOC_FILE */ + +/* sock_alloc_file takes 3 arguments */ +#define HAVE_SOCK_ALLOC_FILE_3ARGS 1 + +/* sock_create_kern use net as first parameter */ +#define HAVE_SOCK_CREATE_KERN_USE_NET 1 + +/* Have spa_maxblocksize in ZFS */ +/* #undef HAVE_SPA_MAXBLOCKSIZE */ + +/* spinlock_t is defined */ +/* #undef HAVE_SPINLOCK_T */ + +/* struct stacktrace_ops exists */ +/* #undef HAVE_STACKTRACE_OPS */ + +/* stacktrace_ops.warning is exist */ +/* #undef HAVE_STACKTRACE_WARNING */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlcat' function. */ +/* #undef HAVE_STRLCAT */ + +/* Define to 1 if you have the `strlcpy' function. */ +/* #undef HAVE_STRLCPY */ + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* struct posix_acl_xattr_{header,entry} defined */ +#define HAVE_STRUCT_POSIX_ACL_XATTR 1 + +/* submit_bio takes two arguments */ +/* #undef HAVE_SUBMIT_BIO_2ARGS */ + +/* sunrpc_cache_pipe_upcall takes 3 args */ +/* #undef HAVE_SUNRPC_UPCALL_HAS_3ARGS */ + +/* super_operations use dentry as parameter */ +#define HAVE_SUPEROPS_USE_DENTRY 1 + +/* 'super_setup_bdi_name' is available */ +#define HAVE_SUPER_SETUP_BDI_NAME 1 + +/* symlink inode operations need struct nameidata argument */ +/* #undef HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +/* new_sync_[read|write] is exported by the kernel */ +/* #undef HAVE_SYNC_READ_WRITE */ + +/* ctl_table has ctl_name field */ +/* #undef HAVE_SYSCTL_CTLNAME */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have . */ +#define HAVE_SYS_QUOTA_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_TYPES_H 1 + +/* tcp_sendpage use socket as first parameter */ +/* #undef HAVE_TCP_SENDPAGE_USE_SOCKET */ + +/* 'struct timespec64' is available */ +#define HAVE_TIMESPEC64 1 + +/* 'timespec64_sub' is available */ +#define HAVE_TIMESPEC64_SUB 1 + +/* 'timespec64_to_ktime' is available */ +#define HAVE_TIMESPEC64_TO_KTIME 1 + +/* topology_sibling_cpumask is available */ +#define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 + +/* kernel export truncate_complete_page */ +/* #undef HAVE_TRUNCATE_COMPLETE_PAGE */ + +/* kernel has truncate_inode_pages_final */ +#define HAVE_TRUNCATE_INODE_PAGES_FINAL 1 + +/* uidgid.h is present */ +#define HAVE_UIDGID_HEADER 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* kernel has vfs_rename with 5 args */ +/* #undef HAVE_VFS_RENAME_5ARGS */ + +/* kernel has vfs_rename with 6 args */ +#define HAVE_VFS_RENAME_6ARGS 1 + +/* '__vfs_setxattr is available */ +#define HAVE_VFS_SETXATTR 1 + +/* kernel has vfs_unlink with 3 args */ +#define HAVE_VFS_UNLINK_3ARGS 1 + +/* virtual_address has been replaced by address field */ +#define HAVE_VM_FAULT_ADDRESS 1 + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 + +/* 'wait_queue_entry_t' is available */ +#define HAVE_WAIT_QUEUE_ENTRY 1 + +/* flags field exist */ +#define HAVE_XATTR_HANDLER_FLAGS 1 + +/* needs inode parameter */ +#define HAVE_XATTR_HANDLER_INODE_PARAM 1 + +/* handler pointer is parameter */ +/* #undef HAVE_XATTR_HANDLER_SIMPLIFIED */ + +/* Have zap_add_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_ADD_BY_DNODE */ + +/* Have zap_lookup_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_LOOKUP_BY_DNODE */ + +/* Have zap_remove_by_dnode() in ZFS */ +/* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ + +/* Enable zfs osd */ +/* #undef HAVE_ZFS_OSD */ + +/* __add_wait_queue_exclusive exists */ +/* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ + +/* ext4_journal_start takes 3 arguments */ +/* #undef JOURNAL_START_HAS_3ARGS */ + +/* Define this as the Kerberos version number */ +/* #undef KRB5_VERSION */ + +/* enable libcfs LASSERT, LASSERTF */ +#define LIBCFS_DEBUG 1 + +/* use dumplog on panic */ +/* #undef LNET_DUMP_ON_PANIC */ + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* Fourth number in the Lustre version */ +#define LUSTRE_FIX 0 + +/* First number in the Lustre version */ +#define LUSTRE_MAJOR 2 + +/* Second number in the Lustre version */ +#define LUSTRE_MINOR 10 + +/* Third number in the Lustre version */ +#define LUSTRE_PATCH 5 + +/* A copy of PACKAGE_VERSION */ +#define LUSTRE_VERSION_STRING "2.10.5" + +/* maximum number of MDS threads */ +/* #undef MDS_MAX_THREADS */ + +/* Report minimum OST free space */ +/* #undef MIN_DF */ + +/* name of ldiskfs mkfs program */ +#define MKE2FS "mke2fs" + +/* need pclmulqdq based crc32c */ +/* #undef NEED_CRC32C_ACCEL */ + +/* need pclmulqdq based crc32 */ +/* #undef NEED_CRC32_ACCEL */ + +/* 'ktime_get_real_ns' is not available */ +/* #undef NEED_KTIME_GET_REAL_NS */ + +/* enable nodemap proc debug support */ +/* #undef NODEMAP_PROC_DEBUG */ + +/* Name of package */ +#define PACKAGE "lustre" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "https://jira.hpdd.intel.com/" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Lustre" + +/* Define to the full name and version of this package. 
*/ +#define PACKAGE_STRING "Lustre 2.10.5" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "lustre" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.10.5" + +/* name of parallel fsck program */ +#define PFSCK "fsck" + +/* enable randomly alloc failure */ +#define RANDOM_FAIL_ALLOC 1 + +/* The size of `unsigned long long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG_LONG 8 + +/* use tunable backoff TCP */ +/* #undef SOCKNAL_BACKOFF */ + +/* tunable backoff TCP in ms */ +/* #undef SOCKNAL_BACKOFF_MS */ + +/* 'struct stacktrace_ops' address function returns an int */ +/* #undef STACKTRACE_OPS_ADDRESS_RETURN_INT */ + +/* 'struct stacktrace_ops' has 'walk_stack' field */ +/* #undef STACKTRACE_OPS_HAVE_WALK_STACK */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* name of ldiskfs tune program */ +#define TUNE2FS "tune2fs" + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +/* #undef USE_GSS_KRB5_CCACHE_NAME */ + +/* Write when Checking Health */ +/* #undef USE_HEALTH_CHECK_WRITE */ + +/* enable lu_ref reference tracking code */ +/* #undef USE_LU_REF */ + +/* Version number of package */ +#define VERSION "2.10.5" + +/* zfs fix version */ +/* #undef ZFS_FIX */ + +/* zfs major version */ +/* #undef ZFS_MAJOR */ + +/* zfs minor version */ +/* #undef ZFS_MINOR */ + +/* zfs patch version */ +/* #undef ZFS_PATCH */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h new file mode 100644 index 0000000000000..28472601ed4df --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -0,0 +1,117 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
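
The generated config.h above only records the results of configure-time feature probes; the Lustre tree consumes the HAVE_* macros through small compatibility wrappers. As a hedged illustration (the wrapper name lfsx_inode_lock is invented here, not taken from the patch), a kernel on which the probe defined HAVE_INODE_LOCK compiles the first branch:

#include <linux/fs.h>

/* Hypothetical compat helper: pick the inode-locking primitive that the
 * configure probe found on this kernel (HAVE_INODE_LOCK comes from config.h). */
static inline void lfsx_inode_lock(struct inode *inode)
{
#ifdef HAVE_INODE_LOCK
	inode_lock(inode);		/* provided by kernels >= 4.5 */
#else
	mutex_lock(&inode->i_mutex);	/* older kernels expose i_mutex directly */
#endif
}
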
+ */ +#ifndef _LIBCFS_BITMAP_H_ +#define _LIBCFS_BITMAP_H_ + +struct cfs_bitmap { + unsigned int size; + unsigned long data[0]; +}; + +#define CFS_BITMAP_SIZE(nbits) \ + (((nbits / BITS_PER_LONG) + 1) * sizeof(long) + \ + sizeof(struct cfs_bitmap)) + +static inline +struct cfs_bitmap *CFS_ALLOCATE_BITMAP(int size) +{ + struct cfs_bitmap *ptr; + + LIBCFS_ALLOC(ptr, CFS_BITMAP_SIZE(size)); + if (ptr == NULL) + RETURN(ptr); + + ptr->size = size; + + RETURN(ptr); +} + +static inline void CFS_RESET_BITMAP(struct cfs_bitmap *bitmap) +{ + if (bitmap->size > 0) { + int nbits = bitmap->size; + + memset(bitmap, 0, CFS_BITMAP_SIZE(nbits)); + bitmap->size = nbits; + } +} + +#define CFS_FREE_BITMAP(ptr) LIBCFS_FREE(ptr, CFS_BITMAP_SIZE(ptr->size)) + +static inline +void cfs_bitmap_set(struct cfs_bitmap *bitmap, int nbit) +{ + set_bit(nbit, bitmap->data); +} + +static inline +void cfs_bitmap_clear(struct cfs_bitmap *bitmap, int nbit) +{ + test_and_clear_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_check(struct cfs_bitmap *bitmap, int nbit) +{ + return test_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_test_and_clear(struct cfs_bitmap *bitmap, int nbit) +{ + return test_and_clear_bit(nbit, bitmap->data); +} + +/* return 0 is bitmap has none set bits */ +static inline +int cfs_bitmap_check_empty(struct cfs_bitmap *bitmap) +{ + return find_first_bit(bitmap->data, bitmap->size) == bitmap->size; +} + +static inline +void cfs_bitmap_copy(struct cfs_bitmap *new, struct cfs_bitmap *old) +{ + size_t newsize; + + LASSERT(new->size >= old->size); + newsize = new->size; + memcpy(new, old, CFS_BITMAP_SIZE(old->size)); + new->size = newsize; +} + +#define cfs_foreach_bit(bitmap, pos) \ + for ((pos) = find_first_bit((bitmap)->data, bitmap->size); \ + (pos) < (bitmap)->size; \ + (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1)) + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h new file mode 100644 index 0000000000000..e9e0cc2109034 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
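
The cfs_bitmap helpers quoted above are self-contained, so a minimal usage sketch is possible; the 128-bit size, the bit numbers and the function name below are illustrative only, and LIBCFS_ALLOC/LIBCFS_FREE are assumed to come from the usual libcfs_private.h include:

static int lfsx_bitmap_demo(void)
{
	struct cfs_bitmap *bm;
	int bit;

	bm = CFS_ALLOCATE_BITMAP(128);		/* zeroed by LIBCFS_ALLOC */
	if (bm == NULL)
		return -ENOMEM;

	cfs_bitmap_set(bm, 3);
	cfs_bitmap_set(bm, 64);

	cfs_foreach_bit(bm, bit)		/* visits bits 3 and 64 */
		pr_info("bit %d is set\n", bit);

	if (!cfs_bitmap_check_empty(bm))	/* not empty, so clear one bit */
		cfs_bitmap_clear(bm, 3);

	CFS_FREE_BITMAP(bm);
	return 0;
}
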
+ * + * libcfs/include/libcfs/curproc.h + * + * Lustre curproc API declaration + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_CURPROC_H__ +#define __LIBCFS_CURPROC_H__ + +typedef __u32 cfs_cap_t; + +#define CFS_CAP_CHOWN 0 +#define CFS_CAP_DAC_OVERRIDE 1 +#define CFS_CAP_DAC_READ_SEARCH 2 +#define CFS_CAP_FOWNER 3 +#define CFS_CAP_FSETID 4 +#define CFS_CAP_LINUX_IMMUTABLE 9 +#define CFS_CAP_SYS_ADMIN 21 +#define CFS_CAP_SYS_BOOT 23 +#define CFS_CAP_SYS_RESOURCE 24 + +#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) | \ + (1 << CFS_CAP_DAC_OVERRIDE) | \ + (1 << CFS_CAP_DAC_READ_SEARCH) | \ + (1 << CFS_CAP_FOWNER) | \ + (1 << CFS_CAP_FSETID ) | \ + (1 << CFS_CAP_LINUX_IMMUTABLE) | \ + (1 << CFS_CAP_SYS_ADMIN) | \ + (1 << CFS_CAP_SYS_BOOT) | \ + (1 << CFS_CAP_SYS_RESOURCE)) + +void cfs_cap_raise(cfs_cap_t cap); +void cfs_cap_lower(cfs_cap_t cap); +int cfs_cap_raised(cfs_cap_t cap); +cfs_cap_t cfs_curproc_cap_pack(void); +void cfs_curproc_cap_unpack(cfs_cap_t cap); +int cfs_capable(cfs_cap_t cap); + +/* __LIBCFS_CURPROC_H__ */ +#endif +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h new file mode 100644 index 0000000000000..23f29d53224ee --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LIBCFS_H__ +#define __LIBCFS_LIBCFS_H__ + +#ifdef __KERNEL__ +# include +# include "curproc.h" + +#define LIBCFS_VERSION "0.5.0" + +#define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2))) +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +/* Sparse annotations */ +#if !defined(__must_hold) +# ifdef __CHECKER__ +# define __must_hold(x) __attribute__((context(x, 1, 1))) +# else /* __CHECKER__ */ +# define __must_hold(x) +# endif /* !__CHECKER__ */ +#endif /* !__must_hold */ + +/* libcfs watchdogs */ +struct lc_watchdog; + +/* Add a watchdog which fires after "time" milliseconds of delay. You have to + * touch it once to enable it. */ +struct lc_watchdog *lc_watchdog_add(int time, + void (*cb)(pid_t pid, void *), + void *data); + +/* Enables a watchdog and resets its timer. */ +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout); +#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout, \ + AT_OFF ? 
0 : at_get(&svc->srv_at_estimate)) * \ + svc->srv_watchdog_factor) + +/* Disable a watchdog; touch it to restart it. */ +void lc_watchdog_disable(struct lc_watchdog *lcw); + +/* Clean up the watchdog */ +void lc_watchdog_delete(struct lc_watchdog *lcw); + +/* need both kernel and user-land acceptor */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +/* + * Drop into debugger, if possible. Implementation is provided by platform. + */ + +void cfs_enter_debugger(void); + +/* + * Defined by platform + */ +int unshare_fs_struct(void); +sigset_t cfs_block_allsigs(void); +sigset_t cfs_block_sigs(unsigned long sigs); +sigset_t cfs_block_sigsinv(unsigned long sigs); +void cfs_restore_sigs(sigset_t); +void cfs_clear_sigpending(void); + +/* + * Random number handling + */ + +/* returns a random 32-bit integer */ +unsigned int cfs_rand(void); +/* seed the generator */ +void cfs_srand(unsigned int, unsigned int); +void cfs_get_random_bytes(void *buf, int size); +#endif /* __KERNEL__ */ + +#include +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); +int libcfs_ioctl(unsigned long cmd, void __user *uparam); + +/* container_of depends on "likely" which is defined in libcfs_private.h */ +static inline void *__container_of(const void *ptr, unsigned long shift) +{ + if (unlikely(IS_ERR(ptr) || ptr == NULL)) + return ERR_CAST(ptr); + else + return (char *)ptr - shift; +} + +#define container_of0(ptr, type, member) \ + ((type *)__container_of((ptr), offsetof(type, member))) + +#endif /* __KERNEL__ */ + +#endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h new file mode 100644 index 0000000000000..9fd28ce749cfe --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h @@ -0,0 +1,352 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . 
User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . User can also specify CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first character "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_table, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_table + * + * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier versions + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +#ifndef HAVE_LIBCFS_CPT + +struct cfs_cpt_table { + /* # of CPU partitions */ + int ctb_nparts; + /* cpu mask */ + cpumask_t ctb_mask; + /* node mask */ + nodemask_t ctb_nodemask; + /* version */ + __u64 ctb_version; +}; + +#endif /* !HAVE_LIBCFS_CPT */ + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +extern struct cfs_cpt_table *cfs_cpt_table; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * print distance information of cpt-table + */ +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * return total number of CPU partitions in \a cptab + */ +int cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hypter-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * shadow HW node ID \a NODE to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node); +/** + * NUMA distance between \a cpt1 and \a cpt2 in \a cptab + */ +unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partion @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ 
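
The partition-table API declared to this point already supports the common query-and-bind pattern; a minimal sketch (the function name is invented, and cfs_cpt_table is the global table declared above) might look like:

/* Illustrative only: report each partition, then pin the calling thread to
 * the partition that owns the CPU it is currently running on. */
static void lfsx_cpt_demo(void)
{
	struct cfs_cpt_table *cptab = cfs_cpt_table;
	int ncpt = cfs_cpt_number(cptab);
	int cpt;

	for (cpt = 0; cpt < ncpt; cpt++)
		pr_info("cpt %d: weight=%d online=%d\n", cpt,
			cfs_cpt_weight(cptab, cpt),
			cfs_cpt_online(cptab, cpt));

	cpt = cfs_cpt_current(cptab, 1);	/* 1 = remap if this CPU is unmapped */
	cfs_cpt_bind(cptab, cpt);		/* bind this thread to that partition */
}
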
+void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ + +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); + +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask); +/** + * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. + */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +/* + * allocate per-cpu-partition data, returned value is an array of pointers, + * variable can be indexed by CPU ID. + * cptab != NULL: size of array is number of CPU partitions + * cptab == NULL: size of array is number of HW cores + */ +void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); +/* + * destory per-cpu-partition variable + */ +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); + +#define cfs_percpt_for_each(var, i, vars) \ + for (i = 0; i < cfs_percpt_number(vars) && \ + ((var) = (vars)[i]) != NULL; i++) + +/* + * percpu partition lock + * + * There are some use-cases like this in Lustre: + * . each CPU partition has it's own private data which is frequently changed, + * and mostly by the local CPU partition. + * . all CPU partitions share some global data, these data are rarely changed. + * + * LNet is typical example. + * CPU partition lock is designed for this kind of use-cases: + * . each CPU partition has it's own private lock + * . change on private data just needs to take the private lock + * . read on shared data just needs to take _any_ of private locks + * . change on shared data needs to take _all_ private locks, + * which is slow and should be really rare. 
+ */ +enum { + CFS_PERCPT_LOCK_EX = -1, /* negative */ +}; + +struct cfs_percpt_lock { + /* cpu-partition-table for this lock */ + struct cfs_cpt_table *pcl_cptab; + /* exclusively locked */ + unsigned int pcl_locked; + /* private lock table */ + spinlock_t **pcl_locks; +}; + +/* return number of private locks */ +#define cfs_percpt_lock_num(pcl) cfs_cpt_number(pcl->pcl_cptab) + +/* + * create a cpu-partition lock based on CPU partition table \a cptab, + * each private lock has extra \a psize bytes padding data + */ +struct cfs_percpt_lock *cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys); +/* destroy a cpu-partition lock */ +void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); + +/* lock private lock \a index of \a pcl */ +void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); +/* unlock private lock \a index of \a pcl */ +void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); + +#define CFS_PERCPT_LOCK_KEYS 256 + +/* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ +#define cfs_percpt_lock_alloc(cptab) \ +({ \ + static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ + struct cfs_percpt_lock *___lk; \ + \ + if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ + ___lk = cfs_percpt_lock_create(cptab, NULL); \ + else \ + ___lk = cfs_percpt_lock_create(cptab, ___keys); \ + ___lk; \ +}) + +/** + * allocate \a nr_bytes of physical memory from a contiguous region with the + * properties of \a flags which are bound to the partition id \a cpt. This + * function should only be used for the case when only a few pages of memory + * are need. + */ +static inline void * +cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes, + gfp_t flags) +{ + return kmalloc_node(nr_bytes, flags, + cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * allocate \a nr_bytes of virtually contiguous memory that is bound to the + * partition id \a cpt. + */ +static inline void * +cfs_cpt_vzalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes) +{ + /* vzalloc_node() sets __GFP_FS by default but no current Kernel + * exported entry-point allows for both a NUMA node specification + * and a custom allocation flags mask. This may be an issue since + * __GFP_FS usage can cause some deadlock situations in our code, + * like when memory reclaim started, within the same context of a + * thread doing FS operations, that can also attempt conflicting FS + * operations, ... + */ + return vzalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * allocate a single page of memory with the properties of \a flags were + * that page is bound to the partition id \a cpt. + */ +static inline struct page * +cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, gfp_t flags) +{ + return alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0); +} + +/** + * allocate a chunck of memory from a memory pool that is bound to the + * partition id \a cpt with the properites of \a flags. 
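+ *
+ * For example (illustrative only; "obj_cachep" is a hypothetical kmem_cache
+ * created by the caller):
+ *
+ *   obj = cfs_mem_cache_cpt_alloc(obj_cachep, cptab, cpt, GFP_NOFS);
+ *
+ * The object is released with the regular kmem_cache_free().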
+ */ +static inline void * +cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, + int cpt, gfp_t flags) +{ + return kmem_cache_alloc_node(cachep, flags, + cfs_cpt_spread_node(cptab, cpt)); +} + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +#ifndef __read_mostly +# define __read_mostly +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned +#endif + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h new file mode 100644 index 0000000000000..ea9234abc7f76 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h @@ -0,0 +1,213 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2014, Intel Corporation. 
+ */ + +#ifndef _LIBCFS_CRYPTO_H +#define _LIBCFS_CRYPTO_H + +struct cfs_crypto_hash_type { + char *cht_name; /**< hash algorithm name, equal to + * format name for crypto api */ + unsigned int cht_key; /**< init key by default (vaild for + * 4 bytes context like crc32, adler */ + unsigned int cht_size; /**< hash digest size */ +}; + +enum cfs_crypto_hash_alg { + CFS_HASH_ALG_NULL = 0, + CFS_HASH_ALG_ADLER32, + CFS_HASH_ALG_CRC32, + CFS_HASH_ALG_CRC32C, + /* hashes before here will be speed-tested at module load */ + CFS_HASH_ALG_MD5, + CFS_HASH_ALG_SHA1, + CFS_HASH_ALG_SHA256, + CFS_HASH_ALG_SHA384, + CFS_HASH_ALG_SHA512, + CFS_HASH_ALG_MAX, + CFS_HASH_ALG_SPEED_MAX = CFS_HASH_ALG_MD5, + CFS_HASH_ALG_UNKNOWN = 0xff +}; + +static struct cfs_crypto_hash_type hash_types[] = { + [CFS_HASH_ALG_NULL] = { + .cht_name = "null", + .cht_key = 0, + .cht_size = 0 + }, + [CFS_HASH_ALG_ADLER32] = { + .cht_name = "adler32", + .cht_key = 1, + .cht_size = 4 + }, + [CFS_HASH_ALG_CRC32] = { + .cht_name = "crc32", + .cht_key = ~0, + .cht_size = 4 + }, + [CFS_HASH_ALG_CRC32C] = { + .cht_name = "crc32c", + .cht_key = ~0, + .cht_size = 4 + }, + [CFS_HASH_ALG_MD5] = { + .cht_name = "md5", + .cht_key = 0, + .cht_size = 16 + }, + [CFS_HASH_ALG_SHA1] = { + .cht_name = "sha1", + .cht_key = 0, + .cht_size = 20 + }, + [CFS_HASH_ALG_SHA256] = { + .cht_name = "sha256", + .cht_key = 0, + .cht_size = 32 + }, + [CFS_HASH_ALG_SHA384] = { + .cht_name = "sha384", + .cht_key = 0, + .cht_size = 48 + }, + [CFS_HASH_ALG_SHA512] = { + .cht_name = "sha512", + .cht_key = 0, + .cht_size = 64 + }, + [CFS_HASH_ALG_MAX] = { + .cht_name = NULL, + .cht_key = 0, + .cht_size = 64 + } +}; + +/* Maximum size of hash_types[].cht_size */ +#define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 + +/** + * Return hash algorithm information for the specified algorithm identifier + * + * Hash information includes algorithm name, initial seed, hash size. 
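+ * For example, cfs_crypto_hash_type(CFS_HASH_ALG_CRC32C) returns the
+ * hash_types[] entry { "crc32c", ~0, 4 } defined above.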
+ * + * \retval cfs_crypto_hash_type for valid ID (CFS_HASH_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_hash_type *cfs_crypto_hash_type(enum cfs_crypto_hash_alg hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name != NULL) + return ht; + } + return NULL; +} + +/** + * Return hash name for hash algorithm identifier + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval string name of known hash algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_hash_name(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + + return "unknown"; +} + +/** + * Return digest size for hash algorithm type + * + * \param[in] hash_alg hash alrgorithm id (CFS_HASH_ALG_*) + * + * \retval hash algorithm digest size in bytes + * \retval 0 if hash algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_hash_digestsize(enum cfs_crypto_hash_alg hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht != NULL) + return ht->cht_size; + + return 0; +} + +/** + * Find hash algorithm ID for the specified algorithm name + * + * \retval hash algorithm ID for valid ID (CFS_HASH_ALG_*) + * \retval CFS_HASH_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 0; hash_alg < CFS_HASH_ALG_MAX; hash_alg++) + if (strcmp(hash_types[hash_alg].cht_name, algname) == 0) + return hash_alg; + + return CFS_HASH_ALG_UNKNOWN; +} + +int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len); + +/* cfs crypto hash descriptor */ +struct cfs_crypto_hash_desc; +struct page; + +struct cfs_crypto_hash_desc * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len); +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, + struct page *page, unsigned int offset, + unsigned int len); +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, + unsigned int buf_len); +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, + unsigned char *hash, unsigned int *hash_len); +int cfs_crypto_register(void); +void cfs_crypto_unregister(void); +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h new file mode 100644 index 0000000000000..2eb6b7aa57d9c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h @@ -0,0 +1,384 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char libcfs_debug_file_path_arr[PATH_MAX]; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +/* unused */ +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +#define S_LFSCK 0x00100000 +#define S_SNAPSHOT 0x00200000 +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "", \ + "echo", "ldlm", "lov", "lquota", "osd", "lfsck", "snapshot", "",\ + "lmv", "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define 
D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +#define D_HSM 0x20000000 +#define D_SNAPSHOT 0x40000000 /* snapshot */ +#define D_LAYOUT 0x80000000 + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +struct cfs_debug_limit_state { + unsigned long cdls_next; + unsigned int cdls_delay; + int cdls_count; +}; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + struct cfs_debug_limit_state *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls) \ +do { \ + (data)->msg_subsys = DEBUG_SUBSYSTEM; \ + (data)->msg_file = __FILE__; \ + (data)->msg_fn = __FUNCTION__; \ + (data)->msg_line = __LINE__; \ + (data)->msg_cdls = (cdls); \ + (data)->msg_mask = (mask); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls) \ + static struct libcfs_debug_msg_data dataname = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = __FILE__, \ + .msg_fn = __FUNCTION__, \ + .msg_line = __LINE__, \ + .msg_cdls = (cdls) }; \ + dataname.msg_mask = (mask); + +#ifdef __KERNEL__ + +# ifdef CDEBUG_ENABLED + +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +# define __CDEBUG(cdls, mask, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + CFS_CHECK_STACK(&msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +# define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__) + +# define CDEBUG_LIMIT(mask, format, ...) 
\ +do { \ + static struct cfs_debug_limit_state cdls; \ + \ + __CDEBUG(&cdls, mask, format, ## __VA_ARGS__); \ +} while (0) + +# else /* !CDEBUG_ENABLED */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return 0; +} +# define CDEBUG(mask, format, ...) (void)(0) +# define CDEBUG_LIMIT(mask, format, ...) (void)(0) +# warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" +# endif /* CDEBUG_ENABLED */ + +#else /* !__KERNEL__ */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return 0; +} +# define CDEBUG(mask, format, ...) \ +do { \ + if (((mask) & D_CANTMASK) != 0) \ + fprintf(stderr, "(%s:%d:%s()) " format, \ + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__);\ +} while (0) + +# define CDEBUG_LIMIT CDEBUG + +#endif /* __KERNEL__ */ + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. + */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + +#if defined(CDEBUG_ENTRY_EXIT) && defined(__KERNEL__) + +void libcfs_log_goto(struct libcfs_debug_msg_data *goto_data, + const char *label, long rc); + +# define GOTO(label, rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(_goto_data, D_TRACE, NULL); \ + libcfs_log_goto(&_goto_data, #label, (long)(rc)); \ + } else { \ + (void)(rc); \ + } \ + \ + goto label; \ +} while (0) + + +long libcfs_log_return(struct libcfs_debug_msg_data *, long rc); +# if BITS_PER_LONG > 32 +# define RETURN(rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + return (typeof(rc))libcfs_log_return(&msgdata, \ + (long)(rc)); \ + } \ + \ + return (rc); \ +} while (0) +# else /* BITS_PER_LONG == 32 */ +/* We need an on-stack variable, because we cannot case a 32-bit pointer + * directly to (long long) without generating a complier warning/error, yet + * casting directly to (long) will truncate 64-bit return values. The log + * values will print as 32-bit values, but they always have been. 
LU-1436 + */ +# define RETURN(rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + typeof(rc) __rc = (rc); \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + libcfs_log_return(&msgdata, (long)__rc); \ + return __rc; \ + } \ + \ + return (rc); \ +} while (0) + +# endif /* BITS_PER_LONG > 32 */ + +# define ENTRY CDEBUG(D_TRACE, "Process entered\n") +# define EXIT CDEBUG(D_TRACE, "Process leaving\n") + +#else /* !CDEBUG_ENTRY_EXIT || !__KERNEL__ */ + +# define GOTO(label, rc) \ + do { \ + ((void)(rc)); \ + goto label; \ + } while (0) + +# define RETURN(rc) return (rc) +# define ENTRY do { } while (0) +# define EXIT do { } while (0) + +#endif /* CDEBUG_ENTRY_EXIT && __KERNEL__ */ + +#define RETURN_EXIT \ +do { \ + EXIT; \ + return; \ +} while (0) + +extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) + __attribute__ ((format (printf, 2, 3))); + +extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, + va_list args, const char *format2, ...) + __attribute__ ((format (printf, 4, 5))); + +#ifdef __KERNEL__ +/* other external symbols that tracefile provides: */ +extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, + int usr_buffer_nob); +extern int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append); +#endif /* __KERNEL__ */ + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h new file mode 100644 index 0000000000000..2af5149be8f69 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h @@ -0,0 +1,178 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
+ */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; +extern int cfs_fail_err; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED (1 << CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE (1 << CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +/* CFS_FAULT may be combined with any one of the above flags. */ +#define CFS_FAULT 0x02000000 /* match any CFS_FAULT_CHECK */ + +static inline bool CFS_FAIL_PRECHECK(__u32 id) +{ + return cfs_fail_loc != 0 && + ((cfs_fail_loc & CFS_FAIL_MASK_LOC) == (id & CFS_FAIL_MASK_LOC) || + (cfs_fail_loc & id & CFS_FAULT)); +} + +static inline int cfs_fail_check_set(__u32 id, __u32 value, + int set, int quiet) +{ + int ret = 0; + + if (unlikely(CFS_FAIL_PRECHECK(id) && + (ret = __cfs_fail_check_set(id, value, set)))) { + if (quiet) { + CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } else { + LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } + } + + return ret; +} + +/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ +#define CFS_FAIL_CHECK(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) +#define CFS_FAIL_CHECK_QUIET(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) + +/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_VALUE(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) +#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_ORSET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) +#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_RESET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) +#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) + +static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + if (unlikely(CFS_FAIL_PRECHECK(id))) + return __cfs_fail_timeout_set(id, value, ms, set); + else + return 0; +} + +/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT(id, secs) \ + cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET) + +#define CFS_FAIL_TIMEOUT_MS(id, ms) \ + cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and + * sleep seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ + 
cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET) + +#define CFS_FAIL_TIMEOUT_RESET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_RESET) + +#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) + +#define CFS_FAULT_CHECK(id) \ + CFS_FAIL_CHECK(CFS_FAULT | (id)) + +/* The idea here is to synchronise two threads to force a race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. */ +static inline void cfs_race(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); + } else { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE(id) cfs_race(id) + +#endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h new file mode 100644 index 0000000000000..0c385a337ce26 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_hash.h @@ -0,0 +1,857 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_hash.h + * + * Hashing routines + * + */ + +#ifndef __LIBCFS_HASH_H__ +#define __LIBCFS_HASH_H__ + +#include + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. 
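+ *
+ * For example, cfs_hash_u32_hash() below simply computes
+ * (key * CFS_GOLDEN_RATIO_PRIME_32) & mask, where mask is expected to be
+ * a power-of-two table size minus one.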
+ */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL + +/** disable debug */ +#define CFS_HASH_DEBUG_NONE 0 +/** record hash depth and output to console when it's too deep, + * computing overhead is low but consume more memory */ +#define CFS_HASH_DEBUG_1 1 +/** expensive, check key validation */ +#define CFS_HASH_DEBUG_2 2 + +#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE + +struct cfs_hash_ops; +struct cfs_hash_lock_ops; +struct cfs_hash_hlist_ops; + +union cfs_hash_lock { + rwlock_t rw; /**< rwlock */ + spinlock_t spin; /**< spinlock */ +}; + +/** + * cfs_hash_bucket is a container of: + * - lock, counter ... + * - array of hash-head starting from hsb_head[0], hash-head can be one of + * . struct cfs_hash_head + * . struct cfs_hash_head_dep + * . struct cfs_hash_dhead + * . struct cfs_hash_dhead_dep + * which depends on requirement of user + * - some extra bytes (caller can require it while creating hash) + */ +struct cfs_hash_bucket { + union cfs_hash_lock hsb_lock; /**< bucket lock */ + __u32 hsb_count; /**< current entries */ + __u32 hsb_version; /**< change version */ + unsigned int hsb_index; /**< index of bucket */ + int hsb_depmax; /**< max depth on bucket */ + long hsb_head[0]; /**< hash-head array */ +}; + +/** + * cfs_hash bucket descriptor, it's normally in stack of caller + */ +struct cfs_hash_bd { + /**< address of bucket */ + struct cfs_hash_bucket *bd_bucket; + /**< offset in bucket */ + unsigned int bd_offset; +}; + +#define CFS_HASH_NAME_LEN 16 /**< default name length */ +#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ + +#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ +#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ +#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS + +/** + * common hash attributes. + */ +enum cfs_hash_tag { + /** + * don't need any lock, caller will protect operations with it's + * own lock. With this flag: + * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK + * will be ignored. + * . 
Some functions will be disabled with this flag, i.e: + * cfs_hash_for_each_empty, cfs_hash_rehash + */ + CFS_HASH_NO_LOCK = 1 << 0, + /** no bucket lock, use one spinlock to protect the whole hash */ + CFS_HASH_NO_BKTLOCK = 1 << 1, + /** rwlock to protect bucket */ + CFS_HASH_RW_BKTLOCK = 1 << 2, + /** spinlock to protect bucket */ + CFS_HASH_SPIN_BKTLOCK = 1 << 3, + /** always add new item to tail */ + CFS_HASH_ADD_TAIL = 1 << 4, + /** hash-table doesn't have refcount on item */ + CFS_HASH_NO_ITEMREF = 1 << 5, + /** big name for param-tree */ + CFS_HASH_BIGNAME = 1 << 6, + /** track global count */ + CFS_HASH_COUNTER = 1 << 7, + /** rehash item by new key */ + CFS_HASH_REHASH_KEY = 1 << 8, + /** Enable dynamic hash resizing */ + CFS_HASH_REHASH = 1 << 9, + /** can shrink hash-size */ + CFS_HASH_SHRINK = 1 << 10, + /** assert hash is empty on exit */ + CFS_HASH_ASSERT_EMPTY = 1 << 11, + /** record hlist depth */ + CFS_HASH_DEPTH = 1 << 12, + /** + * rehash is always scheduled in a different thread, so current + * change on hash table is non-blocking + */ + CFS_HASH_NBLK_CHANGE = 1 << 13, + /** NB, we typed hs_flags as __u16, please change it + * if you need to extend >=16 flags */ +}; + +/** most used attributes */ +#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ + CFS_HASH_COUNTER | CFS_HASH_REHASH) + +/** + * cfs_hash is a hash-table implementation for general purpose, it can support: + * . two refcount modes + * hash-table with & without refcount + * . four lock modes + * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock + * . general operations + * lookup, add(add_tail or add_head), delete + * . rehash + * grows or shrink + * . iteration + * locked iteration and unlocked iteration + * . bigname + * support long name hash + * . debug + * trace max searching depth + * + * Rehash: + * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) + * is spawned to handle the rehash in the background, it's possible that other + * processes can concurrently perform additions, deletions, and lookups + * without being blocked on rehash completion, because rehash will release + * the global wrlock for each bucket. + * + * rehash and iteration can't run at the same time because it's too tricky + * to keep both of them safe and correct. + * As they are relatively rare operations, so: + * . if iteration is in progress while we try to launch rehash, then + * it just giveup, iterator will launch rehash at the end. + * . if rehash is in progress while we try to iterate the hash table, + * then we just wait (shouldn't be very long time), anyway, nobody + * should expect iteration of whole hash-table to be non-blocking. + * + * During rehashing, a (key,object) pair may be in one of two buckets, + * depending on whether the worker task has yet to transfer the object + * to its new location in the table. Lookups and deletions need to search both + * locations; additions must take care to only insert into the new bucket. 
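+ *
+ * A minimal usage sketch (illustrative only; "my_ops" and the embedded
+ * "o_hnode" member are hypothetical caller-side names):
+ *
+ *   hs = cfs_hash_create("my-hash", 10, 14, CFS_HASH_BKT_BITS, 0,
+ *                        CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ *                        &my_ops, CFS_HASH_DEFAULT);
+ *   cfs_hash_add(hs, key, &obj->o_hnode);
+ *   obj = cfs_hash_lookup(hs, key);
+ *   cfs_hash_putref(hs);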
+ */ + +struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + union cfs_hash_lock hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + struct cfs_hash_bucket **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + struct cfs_workitem hs_rehash_wi; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + struct cfs_hash_bucket **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + struct cfs_workitem hs_dep_wi; +#endif + /** name of htable */ + char hs_name[0]; +}; + +struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); + /** lock the hash bucket */ + void (*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); +}; + +struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, struct cfs_hash_bd *bd); + /** return hash-head size */ + int (*hop_hhead_size)(struct cfs_hash *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +}; + +struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(struct cfs_hash *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item 
*/ + void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(struct cfs_hash *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); +}; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(struct cfs_hash *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(struct cfs_hash *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(struct cfs_hash *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(struct cfs_hash *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(struct cfs_hash *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(struct cfs_hash *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(struct cfs_hash *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(struct cfs_hash *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(struct cfs_hash *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} + +static inline int +cfs_hash_bkt_size(struct cfs_hash *hs) +{ + return offsetof(struct cfs_hash_bucket, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * 
CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +static inline unsigned +cfs_hash_id(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return hs->hs_ops->hs_hash(hs, key, mask); +} + +static inline void * +cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_key(hnode); +} + +static inline void +cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) +{ + if (hs->hs_ops->hs_keycpy != NULL) + hs->hs_ops->hs_keycpy(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_keycmp(key, hnode); +} + +static inline void * +cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_object(hnode); +} + +static inline void +cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_get(hs, hnode); +} + +static inline void +cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put_locked(hs, hnode); +} + +static inline void +cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + return hs->hs_ops->hs_put(hs, hnode); +} + +static inline void +cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + if (hs->hs_ops->hs_exit) + hs->hs_ops->hs_exit(hs, hnode); +} + +static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, + struct cfs_hash_bd *bd, int excl) +{ + hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd); + +static inline void +cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned +cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void +cfs_hash_bd_index_set(struct cfs_hash *hs, unsigned index, + struct cfs_hash_bd *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(struct cfs_hash_bd *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int 
+cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, + struct hlist_node *hnode); + +static inline int +cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head * +cfs_hash_bd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key); +struct hlist_node * +cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds); +void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); +void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + int excl); + +static inline void +cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key); +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode, + int insist_add); +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags); + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); +void cfs_hash_putref(struct cfs_hash *hs); + +/* Hash addition functions */ +void cfs_hash_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_del_key(struct cfs_hash *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(struct 
cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *node, + void *data); +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key); +void +cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +void +cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data, int start); +int +cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t, + void *data); +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t, void *data); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(struct cfs_hash *hs); +__u64 cfs_hash_size_get(struct cfs_hash *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash function. + */ +void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); +void cfs_hash_rehash_cancel(struct cfs_hash *hs); +int cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(struct cfs_hash *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void +__cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +struct seq_file; +void cfs_hash_debug_header(struct seq_file *m); +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); + +/* + * Generic djb2 hash algorithm for character arrays. 
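+ * Starting from 5381, each byte b of the key folds in as
+ * hash = hash * 33 + b, and the result is truncated with the caller's mask;
+ * e.g. cfs_hash_djb2_hash("abc", 3, 0xff) yields 0x8b.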
+ */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. + */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h new file mode 100644 index 0000000000000..239e9e0547214 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_heap.h @@ -0,0 +1,203 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/include/libcfs/heap.h + * + * Author: Eric Barton + * Liang Zhen + */ + +#ifndef __LIBCFS_HEAP_H__ +#define __LIBCFS_HEAP_H__ + +/** \defgroup heap Binary heap + * + * The binary heap is a scalable data structure created using a binary tree. It + * is capable of maintaining large sets of elements sorted usually by one or + * more element properties, but really based on anything that can be used as a + * binary predicate in order to determine the relevant ordering of any two nodes + * that belong to the set. There is no search operation, rather the intention is + * for the element of the lowest priority which will always be at the root of + * the tree (as this is an implementation of a min-heap) to be removed by users + * for consumption. 
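+ *
+ * A minimal consumer sketch (illustrative only; "my_heap_ops" and the
+ * embedded "rq_node" member are hypothetical, and what the user must
+ * provide is spelled out below):
+ *
+ *   h = cfs_binheap_create(&my_heap_ops, CBH_FLAG_ATOMIC_GROW, 0, NULL,
+ *                          cptab, cpt);
+ *   cfs_binheap_insert(h, &req->rq_node);
+ *   node = cfs_binheap_remove_root(h);
+ *   cfs_binheap_destroy(h);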
+ * + * Users of the heap should embed a \e struct cfs_binheap_node object instance + * on every object of the set that they wish the binary heap instance to handle, + * and (at a minimum) provide a struct cfs_binheap_ops::hop_compare() + * implementation which is used by the heap as the binary predicate during its + * internal sorting operations. + * + * The current implementation enforces no locking scheme, and so assumes the + * user caters for locking between calls to insert, delete and lookup + * operations. Since the only consumer for the data structure at this point + * are NRS policies, and these operate on a per-CPT basis, binary heap instances + * are tied to a specific CPT. + * @{ + */ + +/** + * Binary heap node. + * + * Objects of this type are embedded into objects of the ordered set that is to + * be maintained by a \e struct cfs_binheap instance. + */ +struct cfs_binheap_node { + /** Index into the binary tree */ + unsigned int chn_index; +}; + +#define CBH_SHIFT 9 +#define CBH_SIZE (1 << CBH_SHIFT) /* # ptrs per level */ +#define CBH_MASK (CBH_SIZE - 1) +#define CBH_NOB (CBH_SIZE * sizeof(struct cfs_binheap_node *)) + +#define CBH_POISON 0xdeadbeef + +/** + * Binary heap flags. + */ +enum { + CBH_FLAG_ATOMIC_GROW = 1, +}; + +struct cfs_binheap; + +/** + * Binary heap operations. + */ +struct cfs_binheap_ops { + /** + * Called right before inserting a node into the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 success + * \retval != 0 error + */ + int (*hop_enter)(struct cfs_binheap *h, + struct cfs_binheap_node *e); + /** + * Called right after removing a node from the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + */ + void (*hop_exit)(struct cfs_binheap *h, + struct cfs_binheap_node *e); + /** + * A binary predicate which is called during internal heap sorting + * operations, and used in order to determine the relevant ordering of + * two heap nodes. + * + * Implementing this operation is mandatory. + * + * \param[in] a The first heap node + * \param[in] b The second heap node + * + * \retval 0 Node a > node b + * \retval 1 Node a < node b + * + * \see cfs_binheap_bubble() + * \see cfs_biheap_sink() + */ + int (*hop_compare)(struct cfs_binheap_node *a, + struct cfs_binheap_node *b); +}; + +/** + * Binary heap object. 
+ * + * Sorts elements of type \e struct cfs_binheap_node + */ +struct cfs_binheap { + /** Triple indirect */ + struct cfs_binheap_node ****cbh_elements3; + /** double indirect */ + struct cfs_binheap_node ***cbh_elements2; + /** single indirect */ + struct cfs_binheap_node **cbh_elements1; + /** # elements referenced */ + unsigned int cbh_nelements; + /** high water mark */ + unsigned int cbh_hwm; + /** user flags */ + unsigned int cbh_flags; + /** operations table */ + struct cfs_binheap_ops *cbh_ops; + /** private data */ + void *cbh_private; + /** associated CPT table */ + struct cfs_cpt_table *cbh_cptab; + /** associated CPT id of this struct cfs_binheap::cbh_cptab */ + int cbh_cptid; +}; + +void cfs_binheap_destroy(struct cfs_binheap *h); +struct cfs_binheap * +cfs_binheap_create(struct cfs_binheap_ops *ops, unsigned int flags, + unsigned count, void *arg, struct cfs_cpt_table *cptab, + int cptid); +struct cfs_binheap_node * +cfs_binheap_find(struct cfs_binheap *h, unsigned int idx); +int cfs_binheap_insert(struct cfs_binheap *h, struct cfs_binheap_node *e); +void cfs_binheap_remove(struct cfs_binheap *h, struct cfs_binheap_node *e); +void cfs_binheap_relocate(struct cfs_binheap *h, struct cfs_binheap_node *e); + +static inline int +cfs_binheap_size(struct cfs_binheap *h) +{ + return h->cbh_nelements; +} + +static inline int +cfs_binheap_is_empty(struct cfs_binheap *h) +{ + return h->cbh_nelements == 0; +} + +static inline struct cfs_binheap_node * +cfs_binheap_root(struct cfs_binheap *h) +{ + return cfs_binheap_find(h, 0); +} + +static inline struct cfs_binheap_node * +cfs_binheap_remove_root(struct cfs_binheap *h) +{ + struct cfs_binheap_node *e = cfs_binheap_find(h, 0); + + if (e != NULL) + cfs_binheap_remove(h, e); + return e; +} + +/** @} heap */ + +#endif /* __LIBCFS_HEAP_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h new file mode 100644 index 0000000000000..6b79096f761a0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_ioctl.h + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfs/util/ioctl.h. 
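The heap stores only the embedded struct cfs_binheap_node, so callers recover their own object with container_of() and supply hop_compare() as the min-heap predicate (return 1 when the first node should sort before the second). A minimal usage sketch, separate from the diff hunks: the element type my_request, its deadline field and the helpers are illustrative assumptions; only the cfs_binheap types and calls come from the header above. A heap instance would be obtained with something like cfs_binheap_create(&my_req_heap_ops, CBH_FLAG_ATOMIC_GROW, count, NULL, cptab, cpt) for the caller's CPT, and the caller provides all locking.

struct my_request {
	__u64			 mr_deadline;
	struct cfs_binheap_node	 mr_node;	/* embedded heap node */
};

/* Min-heap predicate: return 1 when @a should sort before @b */
static int my_req_compare(struct cfs_binheap_node *a,
			  struct cfs_binheap_node *b)
{
	struct my_request *ra = container_of(a, struct my_request, mr_node);
	struct my_request *rb = container_of(b, struct my_request, mr_node);

	return ra->mr_deadline < rb->mr_deadline;
}

static struct cfs_binheap_ops my_req_heap_ops = {
	.hop_enter	= NULL,			/* optional */
	.hop_exit	= NULL,			/* optional */
	.hop_compare	= my_req_compare,	/* mandatory */
};

/* Pop the root element (earliest deadline), if any */
static struct my_request *my_req_pop_earliest(struct cfs_binheap *heap)
{
	struct cfs_binheap_node *node = cfs_binheap_remove_root(heap);

	return node != NULL ?
	       container_of(node, struct my_request, mr_node) : NULL;
}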
+ * + */ + +#ifndef __LIBCFS_IOCTL_H__ +#define __LIBCFS_IOCTL_H__ + +#include +#include + +#define LIBCFS_IOCTL_VERSION 0x0001000a +#define LIBCFS_IOCTL_VERSION2 0x0001000b + +struct libcfs_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +/** max size to copy from userspace */ +#define LIBCFS_IOC_DATA_MAX (128 * 1024) + +struct libcfs_ioctl_data { + struct libcfs_ioctl_hdr ioc_hdr; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + void __user *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + void __user *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +struct libcfs_debug_ioctl_data +{ + struct libcfs_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +/* 'f' ioctls are defined in lustre_ioctl.h and lustre_user.h except for: */ +#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) +#define IOCTL_LIBCFS_TYPE long + +#define IOC_LIBCFS_TYPE ('e') +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +/* IOC_LIBCFS_PANIC obsolete in 2.8.0, was _IOWR('e', 30, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_MEMHOG obsolete in 2.8.0, was _IOWR('e', 36, IOCTL_LIBCFS_TYPE) */ +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) +/* ioctl 77 is free for use */ +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) + + +/* + * DLC Specific IOCTL numbers. + * In order to maintain backward compatibility with any possible external + * tools which might be accessing the IOCTL numbers, a new group of IOCTL + * number have been allocated. 
+ */ +#define IOCTL_CONFIG_SIZE struct lnet_ioctl_config_data +#define IOC_LIBCFS_ADD_ROUTE _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_ROUTE _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_ROUTE _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_NET _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_NET _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NET _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_CONFIG_RTR _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_BUF _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_BUF _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_INFO _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 92, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 93, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_PEER_NI _IOWR(IOC_LIBCFS_TYPE, 94, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_ADD_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_DEL_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_MAX_NR 99 + +extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); + +#endif /* __LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h new file mode 100644 index 0000000000000..c5923984d0dd0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -0,0 +1,84 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_prim.h + * + * General primitives. 
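Every payload begins with struct libcfs_ioctl_hdr, so a user-space caller fills ioc_len and ioc_version before issuing any of the ioctls above. A minimal sketch under stated assumptions: the definitions from this header are visible to user space, the libcfs misc device is reachable at /dev/lnet (the node path is not defined here), and IOC_LIBCFS_GET_NI is shown reporting a NID in ioc_nid purely for illustration.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
/* struct libcfs_ioctl_data, LIBCFS_IOCTL_VERSION and IOC_LIBCFS_* as above */

int main(void)
{
	struct libcfs_ioctl_data data;
	int fd, rc;

	memset(&data, 0, sizeof(data));
	data.ioc_hdr.ioc_len	 = sizeof(data);	  /* total payload length */
	data.ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION;  /* must match the kernel */

	fd = open("/dev/lnet", O_RDWR);			  /* assumed device node */
	if (fd < 0)
		return 1;

	rc = ioctl(fd, IOC_LIBCFS_GET_NI, &data);
	if (rc == 0)
		printf("ioc_nid: %#llx\n", (unsigned long long)data.ioc_nid);

	close(fd);
	return rc ? 1 : 0;
}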
+ * + */ + +#ifndef __LIBCFS_PRIM_H__ +#define __LIBCFS_PRIM_H__ + +#include + +/* + * Memory + */ +#if BITS_PER_LONG == 32 +/* limit to lowmem on 32-bit systems */ +# define NUM_CACHEPAGES \ + min(totalram_pages, 1UL << (30 - PAGE_SHIFT) * 3 / 4) +#else +# define NUM_CACHEPAGES totalram_pages +#endif + +static inline unsigned int memory_pressure_get(void) +{ + return current->flags & PF_MEMALLOC; +} + +static inline void memory_pressure_set(void) +{ + current->flags |= PF_MEMALLOC; +} + +static inline void memory_pressure_clr(void) +{ + current->flags &= ~PF_MEMALLOC; +} + +static inline int cfs_memory_pressure_get_and_set(void) +{ + int old = memory_pressure_get(); + + if (!old) + memory_pressure_set(); + return old; +} + +static inline void cfs_memory_pressure_restore(int old) +{ + if (old) + memory_pressure_set(); + else + memory_pressure_clr(); + return; +} +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h new file mode 100644 index 0000000000000..bcd7d56b65a94 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -0,0 +1,416 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#ifdef LIBCFS_DEBUG + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + +#if LASSERT_CHECKED +/* + * Assertion. + * + * Strange construction with empty "then" clause is used to trigger compiler + * warnings on the assertions of the form LASSERT(a = b); + * + * "warning: suggest parentheses around assignment used as truth value" + * + * requires -Wall. Unfortunately this rules out use of likely/unlikely. + */ +#define LASSERTF(cond, fmt, ...) 
\ +do { \ + if (cond) \ + ; \ + else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +#else /* !LASSERT_CHECKED */ + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") +#endif /* !LASSERT_CHECKED */ +#else /* !LIBCFS_DEBUG */ +/* sizeof is to use expression without evaluating it. */ +# define LASSERT(e) ((void)sizeof!!(e)) +# define LASSERTF(cond, ...) ((void)sizeof!!(cond)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/** + * This is for more expensive checks that one doesn't want to be enabled all + * the time. LINVRNT() has to be explicitly enabled by --enable-invariants + * configure option. + */ +# define LINVRNT(exp) LASSERT(exp) +#else +# define LINVRNT(exp) ((void)sizeof!!(exp)) +#endif + +#define KLASSERT(e) LASSERT(e) + +void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn)); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while(0) + +/* + * Memory + */ +#ifdef LIBCFS_DEBUG + +extern atomic_t libcfs_kmemory; + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic_sub(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_read() \ + atomic_read(&libcfs_kmemory) + +#else +# define libcfs_kmem_inc(ptr, size) do {} while (0) +# define libcfs_kmem_dec(ptr, size) do {} while (0) +# define libcfs_kmem_read() (0) +#endif /* LIBCFS_DEBUG */ + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & GFP_ATOMIC)) != 0); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %d total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + * The allocated memory is zeroed-out. + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kzalloc((size), (mask)) : vzalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_NOFS) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + * The allocated memory is zeroed-out. 
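The plain allocators above are statement macros rather than expressions: they choose kzalloc() or vzalloc() around the LIBCFS_VMALLOC_SIZE boundary, zero the memory, assign into the pointer argument, and under LIBCFS_DEBUG account the bytes in libcfs_kmemory. A minimal sketch of the usual pairing with LIBCFS_FREE(), which is defined a few lines further down (struct my_ctx is illustrative):

struct my_ctx {
	int	mc_refs;
	char	mc_name[64];
};

static struct my_ctx *my_ctx_create(void)
{
	struct my_ctx *ctx;

	LIBCFS_ALLOC(ctx, sizeof(*ctx));	/* NULL on failure, zeroed on success */
	if (ctx == NULL)
		return NULL;

	ctx->mc_refs = 1;			/* mc_name[] is already zero-filled */
	return ctx;
}

static void my_ctx_destroy(struct my_ctx *ctx)
{
	/* the size must match the allocation so the kmalloc/vmalloc split
	 * and the LIBCFS_DEBUG byte accounting stay consistent */
	LIBCFS_FREE(ctx, sizeof(*ctx));
}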
+ */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + cfs_cpt_malloc((cptab), (cpt), (size), (mask) | __GFP_ZERO) : \ + cfs_cpt_vzalloc((cptab), (cpt), (size)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + vfree(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +struct task_struct; + +void libcfs_debug_dumpstack(struct task_struct *tsk); +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by count. + */ +void *cfs_array_alloc(int count, unsigned int size); +void cfs_array_free(void *vars); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ +do { \ + LASSERTF(atomic_read(a) == v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ +do { \ + LASSERTF(atomic_read(a) != v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ +do { \ + LASSERTF(atomic_read(a) < v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ +do { \ + LASSERTF(atomic_read(a) <= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ +do { \ + LASSERTF(atomic_read(a) > v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ +do { \ + LASSERTF(atomic_read(a) >= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define 
LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof (*(ptr))); +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof (*(ptr))); + +/** Compile-time assertion. + + * Check an invariant described by a constant expression at compile time by + * forcing a compiler error if it does not hold. \a cond must be a constant + * expression as defined by the ISO C Standard: + * + * 6.8.4.2 The switch statement + * .... + * [#3] The expression of each case label shall be an integer + * constant expression and no two of the case constant + * expressions in the same switch statement shall have the same + * value after conversion... + * + */ +#define CLASSERT(cond) do {switch (1) {case (cond): case 0: break; } } while (0) + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +/* what used to be in portals_lib.h */ +#ifndef MIN +# define MIN(a,b) (((a)<(b)) ? (a): (b)) +#endif +#ifndef MAX +# define MAX(a,b) (((a)>(b)) ? (a): (b)) +#endif + +#define MKSTR(ptr) ((ptr))? (ptr) : "" + +static inline size_t cfs_size_round4(size_t val) +{ + return (val + 3) & (~0x3); +} + +#ifndef HAVE_CFS_SIZE_ROUND +static inline size_t cfs_size_round(size_t val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +static inline size_t cfs_size_round16(size_t val) +{ + return (val + 0xf) & (~0xf); +} + +static inline size_t cfs_size_round32(size_t val) +{ + return (val + 0x1f) & (~0x1f); +} + +static inline size_t cfs_size_round0(size_t val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t cfs_round_strlen(char *fset) +{ + return cfs_size_round(strlen(fset) + 1); +} + +extern struct cfs_psdev_ops libcfs_psdev_ops; +extern struct miscdevice libcfs_dev; +extern struct cfs_wi_sched *cfs_sched_rehash; + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h new file mode 100644 index 0000000000000..be78b503d651e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h @@ -0,0 +1,112 @@ +#ifndef __LIBCFS_PTASK_H__ +#define __LIBCFS_PTASK_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_PADATA +#include +#else +struct padata_priv {}; +struct padata_instance {}; +#endif + +#define PTF_COMPLETE BIT(0) +#define PTF_AUTOFREE BIT(1) +#define PTF_ORDERED BIT(2) +#define PTF_USER_MM BIT(3) +#define PTF_ATOMIC BIT(4) +#define PTF_RETRY BIT(5) + +struct cfs_ptask_engine { + struct padata_instance *pte_pinst; + struct workqueue_struct *pte_wq; + struct notifier_block pte_notifier; + int pte_weight; +}; + +struct cfs_ptask; +typedef int (*cfs_ptask_cb_t)(struct cfs_ptask *); + +struct cfs_ptask { + struct padata_priv pt_padata; + struct completion pt_completion; + mm_segment_t pt_fs; + struct 
mm_struct *pt_mm; + unsigned int pt_flags; + int pt_cbcpu; + cfs_ptask_cb_t pt_cbfunc; + void *pt_cbdata; + int pt_result; +}; + +static inline +struct padata_priv *cfs_ptask2padata(struct cfs_ptask *ptask) +{ + return &ptask->pt_padata; +} + +static inline +struct cfs_ptask *cfs_padata2ptask(struct padata_priv *padata) +{ + return container_of(padata, struct cfs_ptask, pt_padata); +} + +static inline +bool cfs_ptask_need_complete(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_COMPLETE; +} + +static inline +bool cfs_ptask_is_autofree(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_AUTOFREE; +} + +static inline +bool cfs_ptask_is_ordered(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_ORDERED; +} + +static inline +bool cfs_ptask_use_user_mm(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_USER_MM; +} + +static inline +bool cfs_ptask_is_atomic(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_ATOMIC; +} + +static inline +bool cfs_ptask_is_retry(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_RETRY; +} + +static inline +int cfs_ptask_result(struct cfs_ptask *ptask) +{ + return ptask->pt_result; +} + +struct cfs_ptask_engine *cfs_ptengine_init(const char *, const struct cpumask *); +void cfs_ptengine_fini(struct cfs_ptask_engine *); +int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *, const struct cpumask *); +int cfs_ptengine_weight(struct cfs_ptask_engine *); + +int cfs_ptask_submit(struct cfs_ptask *, struct cfs_ptask_engine *); +int cfs_ptask_wait_for(struct cfs_ptask *); +int cfs_ptask_init(struct cfs_ptask *, cfs_ptask_cb_t, void *, + unsigned int, int); + +#endif /* __LIBCFS_PTASK_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h new file mode 100644 index 0000000000000..3c34071d35774 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -0,0 +1,91 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. 
+ * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +char *cfs_strrstr(const char *haystack, const char *needle); +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask); +/* trim leading and trailing space characters */ +char *cfs_firststr(char *str, size_t size); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. + */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +char *cfs_trimwhite(char *str); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +void cfs_expr_list_values_free(__u32 *values, int num); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h new file mode 100644 index 0000000000000..68947c9792296 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_time.h + * + * Time functions. + * + */ + +#ifndef __LIBCFS_TIME_H__ +#define __LIBCFS_TIME_H__ + +/* + * generic time manipulation functions. 
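A typical use of the range-expression API above is to parse a user-supplied string into a struct cfs_expr_list once and then test individual values against it. A minimal sketch, assuming values bounded by [0, UINT_MAX] and that cfs_expr_list_match() returns non-zero on a match; the helper name and the syntax shown in the comment are illustrative, the exact expression grammar being defined by the parser rather than this header.

/* e.g. expr could be a range list along the lines of "[0-7/2,9]" */
static int my_value_selected(char *expr, __u32 value)
{
	struct cfs_expr_list *el = NULL;
	int rc;

	rc = cfs_expr_list_parse(expr, strlen(expr), 0, UINT_MAX, &el);
	if (rc != 0)
		return rc;			/* malformed expression */

	rc = cfs_expr_list_match(value, el);	/* assumed non-zero on match */
	cfs_expr_list_free(el);

	return rc;
}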
+ */ + +static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) +{ + return (cfs_time_t)(t + d); +} + +static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) +{ + return (cfs_time_t)(t1 - t2); +} + +static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2) +{ + return cfs_time_before(t2, t1); +} + +static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2) +{ + return cfs_time_beforeq(t2, t1); +} + +static inline cfs_time_t cfs_time_shift(int seconds) +{ + return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); +} + +#define CFS_TICK 1 + +/* + * return valid time-out based on user supplied one. Currently we only check + * that time-out is not shorted than allowed. + */ +static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout) +{ + if (timeout < CFS_TICK) + timeout = CFS_TICK; + return timeout; +} + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h new file mode 100644 index 0000000000000..84da4d98591ee --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_workitem.h @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. 
+ */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); + +struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** arg for working function */ + void *wi_data; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +}; + +static inline void +cfs_wi_init(struct cfs_workitem *wi, void *data, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_data = data; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h new file mode 100644 index 0000000000000..0f67a87096c0a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h @@ -0,0 +1,150 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_LIBCFS_H__ +#define __LIBCFS_LINUX_LIBCFS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
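Putting the semantics above together: the owner embeds a struct cfs_workitem in its own object, points wi_data back at that object, and lets a scheduler created with cfs_wi_sched_create() run wi_action in thread context. A minimal sketch; the payload type, scheduler name and the cpt/nthrs values are illustrative assumptions, and a real caller passes its own CPT table.

struct my_work {
	struct cfs_workitem	mw_wi;
	int			mw_counter;
};

static int my_wi_action(struct cfs_workitem *wi)
{
	struct my_work *w = wi->wi_data;

	w->mw_counter++;	/* thread context, serialized against itself */

	return 0;		/* 0: the scheduler may run/reschedule it again */
}

static int my_work_start(struct cfs_cpt_table *cptab, struct my_work *w,
			 struct cfs_wi_sched **schedp)
{
	int rc;

	rc = cfs_wi_sched_create("my_wi_sched", cptab, 0 /* cpt */,
				 1 /* nthrs */, schedp);
	if (rc != 0)
		return rc;

	cfs_wi_init(&w->mw_wi, w, my_wi_action);
	cfs_wi_schedule(*schedp, &w->mw_wi);
	return 0;
}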
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ + +/** + * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) + * + * Implementation is in linux-curproc.c + */ +#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm) + +/* helper for sysctl handlers */ +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void __user *buffer, int len)); + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + +/* + * Macros to access common characteristics of "current" UNIX process. + */ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + +/* check if task is running in compat mode.*/ +int current_is_32bit(void); + +#endif /* _LINUX_LIBCFS_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h new file mode 100644 index 0000000000000..a46e252466026 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -0,0 +1,101 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2013, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. +#endif + +#include +#include +#include +#include + +#ifdef CONFIG_SMP + +#define HAVE_LIBCFS_CPT + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned *cpt_distance; + /* spread rotor for NUMA allocator */ + int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* spread rotor for NUMA allocator */ + int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned ctb_distance; + /* # of CPU partitions */ + int ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +void cfs_cpu_core_siblings(int cpu, cpumask_t *mask); + +#endif /* CONFIG_SMP */ + +#ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK +# define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) +#endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ + +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h new file mode 100644 index 0000000000000..6346c59e516e7 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h @@ -0,0 +1,55 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/** + * Linux crypto hash specific functions. + */ + +/** + * Functions for start/stop shash CRC32 algorithm. + */ +int cfs_crypto_crc32_register(void); +void cfs_crypto_crc32_unregister(void); + +/** + * Functions for start/stop shash adler32 algorithm. 
+ */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); + +/** + * Functions for start/stop shash crc32 pclmulqdq + */ +int cfs_crypto_crc32_pclmul_register(void); +void cfs_crypto_crc32_pclmul_unregister(void); + +/** + * Functions for start/stop shash crc32c pclmulqdq + */ +int cfs_crypto_crc32c_pclmul_register(void); +void cfs_crypto_crc32c_pclmul_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h new file mode 100644 index 0000000000000..59d9874bbf978 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-fs.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_FS_H__ +#define __LIBCFS_LINUX_CFS_FS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. +#endif + +#include +#include +#include +#include +#include + +#ifndef HAVE_FILE_DENTRY +static inline struct dentry *file_dentry(const struct file *file) +{ + return file->f_path.dentry; +} +#endif + +#if defined(HAVE_FILE_FSYNC_4ARGS) || defined(HAVE_FILE_FSYNC_2ARGS) +#define ll_vfs_fsync_range(fp, start, end, datasync) \ + vfs_fsync_range(fp, start, end, datasync) +#else +#define ll_vfs_fsync_range(fp, start, end, datasync) \ + vfs_fsync_range(fp, file_dentry(fp), start, end, datasync) +#endif + +#define flock_type(fl) ((fl)->fl_type) +#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0) +#define flock_pid(fl) ((fl)->fl_pid) +#define flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while (0) +#define flock_start(fl) ((fl)->fl_start) +#define flock_set_start(fl, st) do { (fl)->fl_start = (st); } while (0) +#define flock_end(fl) ((fl)->fl_end) +#define flock_set_end(fl, end) do { (fl)->fl_end = (end); } while (0) + +#ifndef IFSHIFT +#define IFSHIFT 12 +#endif + +#ifndef IFTODT +#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << IFSHIFT) +#endif + +#ifndef HAVE_POSIXACL_USER_NS +/* + * Mask out &init_user_ns so we don't jump + * through hoops to define it somehow only + * to have it ignored anyway. 
+ */ +#define posix_acl_from_xattr(a,b,c) posix_acl_from_xattr(b,c) +#define posix_acl_to_xattr(a,b,c,d) posix_acl_to_xattr(b,c,d) +#endif + +#ifndef HAVE_POSIX_ACL_VALID_USER_NS +#define posix_acl_valid(a,b) posix_acl_valid(b) +#endif +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h new file mode 100644 index 0000000000000..e4a8e8d92c325 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-list.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_LIST_H__ +#define __LIBCFS_LINUX_LIST_H__ + +#include + +#ifdef HAVE_HLIST_FOR_EACH_3ARG +#define cfs_hlist_for_each_entry(tpos, pos, head, member) \ + hlist_for_each_entry(tpos, head, member) +#define cfs_hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + hlist_for_each_entry_safe(tpos, n, head, member) +#define cfs_hlist_for_each_entry_continue(tpos, pos, member) \ + hlist_for_each_entry_continue(tpos, member) +#define cfs_hlist_for_each_entry_from(tpos, pos, member) \ + hlist_for_each_entry_from(tpos, member) +#else +#define cfs_hlist_for_each_entry(tpos, pos, head, member) \ + hlist_for_each_entry(tpos, pos, head, member) +#define cfs_hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + hlist_for_each_entry_safe(tpos, pos, n, head, member) +#define cfs_hlist_for_each_entry_continue(tpos, pos, member) \ + hlist_for_each_entry_continue(tpos, pos, member) +#define cfs_hlist_for_each_entry_from(tpos, pos, member) \ + hlist_for_each_entry_from(tpos, pos, member) +#endif + +#ifdef HAVE_HLIST_ADD_AFTER +#define hlist_add_behind(hnode, tail) hlist_add_after(tail, hnode) +#endif /* HAVE_HLIST_ADD_AFTER */ + +#endif /* __LIBCFS_LINUX_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h new file mode 100644 index 0000000000000..086d16baeaf13 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -0,0 +1,132 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_MEM_H__ +#define __LIBCFS_LINUX_CFS_MEM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. +#endif + +#include +#include +#include +#include +#ifdef HAVE_MM_INLINE +# include +#endif + +/* + * Shrinker + */ +#ifdef HAVE_SHRINK_CONTROL +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + struct shrinker *shrinker, \ + struct shrink_control *sc +# define shrink_param(sc, var) ((sc)->var) +#else +struct shrink_control { + gfp_t gfp_mask; + unsigned long nr_to_scan; +}; +# ifdef HAVE_SHRINKER_WANT_SHRINK_PTR +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + struct shrinker *shrinker, \ + int nr_to_scan, gfp_t gfp_mask +# else +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + int nr_to_scan, gfp_t gfp_mask +# endif + /* avoid conflict with spl mm_compat.h */ +# define HAVE_SHRINK_CONTROL_STRUCT 1 +# define shrink_param(sc, var) (var) +#endif + +#ifdef HAVE_SHRINKER_COUNT +struct shrinker_var { + unsigned long (*count)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan)(struct shrinker *, + struct shrink_control *sc); +}; +# define DEF_SHRINKER_VAR(name, shrink, count_obj, scan_obj) \ + struct shrinker_var name = { .count = count_obj, .scan = scan_obj } +#else +struct shrinker_var { + int (*shrink)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)); +}; +# define DEF_SHRINKER_VAR(name, shrinker, count, scan) \ + struct shrinker_var name = { .shrink = shrinker } +# define SHRINK_STOP (~0UL) +#endif + +static inline +struct shrinker *set_shrinker(int seek, struct shrinker_var *var) +{ + struct shrinker *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return (NULL); + +#ifdef HAVE_SHRINKER_COUNT + s->count_objects = var->count; + s->scan_objects = var->scan; +#else + s->shrink = var->shrink; +#endif + s->seeks = seek; + + register_shrinker(s); + + return s; +} + +static inline +void remove_shrinker(struct shrinker *shrinker) +{ + if (shrinker == NULL) + return; + + unregister_shrinker(shrinker); + kfree(shrinker); +} + +#endif /* __LINUX_CFS_MEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h new file mode 100644 index 0000000000000..0ad585f913c94 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -0,0 +1,128 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
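The wrappers above hide the difference between the old single-callback shrinker API and the newer count/scan pair behind DEF_SHRINKER_VAR() plus set_shrinker()/remove_shrinker(). A minimal registration sketch for a count/scan-capable kernel (HAVE_SHRINKER_COUNT); the cache being shrunk and its counter are hypothetical, and DEFAULT_SEEKS is the stock kernel seek weight.

static atomic_t my_cache_nr = ATOMIC_INIT(0);

static unsigned long my_cache_count(struct shrinker *s,
				    struct shrink_control *sc)
{
	return atomic_read(&my_cache_nr);	/* objects that could be freed */
}

static unsigned long my_cache_scan(struct shrinker *s,
				   struct shrink_control *sc)
{
	unsigned long freed = 0;

	/* walk the (hypothetical) cache here, freeing up to sc->nr_to_scan
	 * objects under sc->gfp_mask, and return how many were freed */
	return freed;
}

static struct shrinker *my_shrinker;

static int my_cache_register(void)
{
	DEF_SHRINKER_VAR(var, NULL, my_cache_count, my_cache_scan);

	my_shrinker = set_shrinker(DEFAULT_SEEKS, &var);
	return my_shrinker != NULL ? 0 : -ENOMEM;
}

static void my_cache_unregister(void)
{
	remove_shrinker(my_shrinker);	/* NULL is tolerated */
}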
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_MISC_H__ +#define __LIBCFS_LINUX_MISC_H__ + +#include +#ifdef HAVE_SYSCTL_CTLNAME +#define INIT_CTL_NAME .ctl_name = CTL_UNNUMBERED, +#define INIT_STRATEGY .strategy = &sysctl_intvec, +#else +#define INIT_CTL_NAME +#define INIT_STRATEGY +#endif + +#ifndef HAVE_UIDGID_HEADER + +#ifndef _LINUX_UIDGID_H +#define _LINUX_UIDGID_H + +typedef uid_t kuid_t; +typedef gid_t kgid_t; + +#define INVALID_UID -1 +#define INVALID_GID -1 + +#define GLOBAL_ROOT_UID 0 +#define GLOBAL_ROOT_GID 0 + +static inline uid_t __kuid_val(kuid_t uid) +{ + return uid; +} + +static inline gid_t __kgid_val(kgid_t gid) +{ + return gid; +} + +static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) +{ + return uid; +} + +static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) +{ + return gid; +} + +static inline uid_t from_kuid(struct user_namespace *to, kuid_t uid) +{ + return uid; +} + +static inline gid_t from_kgid(struct user_namespace *to, kgid_t gid) +{ + return gid; +} + +static inline bool uid_eq(kuid_t left, kuid_t right) +{ + return left == right; +} + +static inline bool uid_valid(kuid_t uid) +{ + return uid != (typeof(uid))INVALID_UID; +} + +static inline bool gid_valid(kgid_t gid) +{ + return gid != (typeof(gid))INVALID_GID; +} +#endif /* _LINUX_UIDGID_H */ + +#endif + +int cfs_get_environ(const char *key, char *value, int *val_len); + +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t +#endif + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos); + +#ifndef HAVE_KSTRTOUL +static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) +{ + char *end = (char *)s; + + *res = simple_strtoul(s, &end, base); + if (end - s == 0) + return -EINVAL; + return 0; +} +#endif /* !HAVE_KSTRTOUL */ + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h new file mode 100644 index 0000000000000..3095626dea428 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -0,0 +1,284 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_LINUX_LINUX_TIME_H__ +#define __LIBCFS_LINUX_LINUX_TIME_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. +#endif + +/* Portable time API */ + +/* + * Platform provides three opaque data-types: + * + * cfs_time_t represents point in time. This is internal kernel + * time rather than "wall clock". This time bears no + * relation to gettimeofday(). + * + * cfs_duration_t represents time interval with resolution of internal + * platform clock + * + * cfs_time_t cfs_time_current(void); + * cfs_time_t cfs_time_add (cfs_time_t, cfs_duration_t); + * cfs_duration_t cfs_time_sub (cfs_time_t, cfs_time_t); + * int cfs_impl_time_before (cfs_time_t, cfs_time_t); + * int cfs_impl_time_before_eq(cfs_time_t, cfs_time_t); + * + * cfs_duration_t cfs_duration_build(int64_t); + * + * time_t cfs_duration_sec (cfs_duration_t); + * void cfs_duration_usec(cfs_duration_t, struct timeval *); + * void cfs_duration_nsec(cfs_duration_t, struct timespec *); + * + * CFS_TIME_FORMAT + * CFS_DURATION_FORMAT + * + */ + +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION 1000000 + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
+#endif + +#include +#include +#include +#include +#include +#include +#include + +/* + * Generic kernel stuff + */ + +typedef unsigned long cfs_time_t; /* jiffies */ +typedef long cfs_duration_t; + +#ifndef HAVE_TIMESPEC64 + +typedef __s64 time64_t; + +#if __BITS_PER_LONG == 64 + +# define timespec64 timespec + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} + +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts) +{ + return ts; +} + +#else +struct timespec64 { + time64_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + struct timespec64 ret; + + ret.tv_sec = ts.tv_sec; + ret.tv_nsec = ts.tv_nsec; + return ret; +} + +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + struct timespec ret; + + ret.tv_sec = (time_t)ts64.tv_sec; + ret.tv_nsec = ts64.tv_nsec; + return ret; +} +#endif /* __BITS_PER_LONG != 64 */ + +#endif /* HAVE_TIMESPEC64 */ + +#ifndef HAVE_KTIME_ADD +# define ktime_add(lhs, rhs) ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) +#endif /* !HAVE_KTIME_ADD */ + +#ifndef HAVE_KTIME_AFTER +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 > cmp2.tv64; +} +#endif /* !HAVE_KTIME_AFTER */ + +#ifndef HAVE_KTIME_BEFORE +static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 < cmp2.tv64; +} +#endif /* !HAVE_KTIME_BEFORE */ + +#ifndef HAVE_KTIME_COMPARE +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif /* !HAVE_KTIME_COMPARE */ + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts); +#endif /* HAVE_KTIME_GET_TS */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts); +#endif /* HAVE_KTIME_GET_REAL_TS */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +time64_t ktime_get_real_seconds(void); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +time64_t ktime_get_seconds(void); +#endif /* HAVE_KTIME_GET_SECONDS */ + +#ifdef NEED_KTIME_GET_REAL_NS +static inline u64 ktime_get_real_ns(void) +{ + return ktime_to_ns(ktime_get_real()); +} +#endif /* NEED_KTIME_GET_REAL_NS */ + +#ifndef HAVE_KTIME_TO_TIMESPEC64 +static inline struct timespec64 ktime_to_timespec64(ktime_t kt) +{ + struct timespec ts = ns_to_timespec((kt).tv64); + + return timespec_to_timespec64(ts); +} +#endif /* HAVE_KTIME_TO_TIMESPEC64 */ + +#ifndef HAVE_TIMESPEC64_SUB +static inline struct timespec64 +timespec64_sub(struct timespec64 later, struct timespec64 earlier) +{ + struct timespec diff; + + diff = timespec_sub(timespec64_to_timespec(later), + timespec64_to_timespec(earlier)); + return timespec_to_timespec64(diff); +} +#endif + +#ifndef HAVE_TIMESPEC64_TO_KTIME +static inline ktime_t timespec64_to_ktime(struct timespec64 ts) +{ + return ktime_set(ts.tv_sec, ts.tv_nsec); +} +#endif + +static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) +{ + return time_before(t1, t2); +} + +static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) +{ + return time_before_eq(t1, t2); +} + +static inline cfs_time_t cfs_time_current(void) +{ + return jiffies; +} + +static inline time_t cfs_time_current_sec(void) +{ + return get_seconds(); +} + +static inline cfs_duration_t cfs_time_seconds(int seconds) +{ + return 
((cfs_duration_t)seconds) * msecs_to_jiffies(MSEC_PER_SEC); +} + +static inline time_t cfs_duration_sec(cfs_duration_t d) +{ + return d / msecs_to_jiffies(MSEC_PER_SEC); +} + +#define cfs_time_current_64 get_jiffies_64 + +static inline __u64 cfs_time_add_64(__u64 t, __u64 d) +{ + return t + d; +} + +static inline __u64 cfs_time_shift_64(int seconds) +{ + return cfs_time_add_64(cfs_time_current_64(), + cfs_time_seconds(seconds)); +} + +static inline int cfs_time_before_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 > 0; +} + +static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 >= 0; +} + +/* + * One jiffy + */ +#define CFS_DURATION_T "%ld" + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h new file mode 100644 index 0000000000000..600bf27b607b4 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/util/ioctl.h + * + * Utility functions for calling ioctls. + * + */ +#ifndef _LIBCFS_IOCTL_H_ +#define _LIBCFS_IOCTL_H_ + +#include +#include + +/* Sparse annotation. */ +#define __user + +#include + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; \ + (data).ioc_hdr.ioc_len = sizeof(data); \ +} while (0) + +#define LIBCFS_IOC_INIT_V2(data, hdr) \ +do { \ + memset(&(data), 0, sizeof(data)); \ + (data).hdr.ioc_version = LIBCFS_IOCTL_VERSION2; \ + (data).hdr.ioc_len = sizeof(data); \ +} while (0) + +/* FIXME - rename these to libcfs_ */ +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max); +void libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf); +int register_ioc_dev(int dev_id, const char *dev_name); +void unregister_ioc_dev(int dev_id); +int l_ioctl(int dev_id, unsigned int opc, void *buf); +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h new file mode 100644 index 0000000000000..ef69efed6cf1e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/list.h @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_UTIL_LIST_H__ +#define __LIBCFS_UTIL_LIST_H__ + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +#define prefetch(a) ((void)a) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/** + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * Insert an entry at the start of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * Insert an entry at the end of a list. + * \param new new entry to be inserted + * \param head list to add it to + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, + struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * Remove an entry from the list it is currently in. + * \param entry the entry to remove + * Note: list_empty(entry) does not return true after this, the entry is in an + * undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * Remove an entry from the list it is currently in and reinitialize it. + * \param entry the entry to remove. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * Remove an entry from the list it is currently in and insert it at the start + * of another list. 
+ * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * Remove an entry from the list it is currently in and insert it at the end of + * another list. + * \param list the entry to move + * \param head the list to move it to + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * Test whether a list is empty + * \param head the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +/** + * Test whether a list is empty and not being modified + * \param head the list to test + * + * Tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * Join two lists + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is in an + * undefined state on return. + */ +static inline void list_splice(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +static inline void list_splice_tail(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev); +} + +/** + * Join two lists and reinitialise the emptied list. + * \param list the new list to add. + * \param head the place to add it in the first list. + * + * The contents of \a list are added at the start of \a head. \a list is empty + * on return. + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * Get the container of a list + * \param ptr the embedded list. + * \param type the type of the struct this is embedded in. + * \param member the member name of the list within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +/** + * Iterate over a list + * \param pos the iterator + * \param head the list to iterate over + * + * Behaviour is undefined if \a pos is removed from the list in the body of the + * loop. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * Iterate over a list safely + * \param pos the iterator + * \param n temporary storage + * \param head the list to iterate over + * + * This is safe to use if \a pos could be removed from the list in the body of + * the loop. 
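+ *
+ * A minimal usage sketch (illustrative only; \a some_list is an assumed
+ * caller-owned list head, not something defined in this header):
+ *
+ *	struct list_head *pos, *n;
+ *
+ *	list_for_each_safe(pos, n, &some_list)
+ *		list_del(pos);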
+ */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * Iterate over a list continuing after existing point + * \param pos the type * to use as a loop counter + * \param head the list head + * \param member the name of the list_struct within the struct + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * \defgroup hlist Hash List + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is too + * wasteful. You lose the ability to access the tail in O(1). + * @{ + */ + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +/* @} */ + +/* + * "NULL" might not be defined at this point + */ +#ifdef NULL +#define NULL_P NULL +#else +#define NULL_P ((void *)0) +#endif + +/** + * \addtogroup hlist + * @{ + */ + +#define HLIST_HEAD_INIT { NULL_P } +#define HLIST_HEAD(name) struct hlist_head name = { NULL_P } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL_P) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL_P, (ptr)->pprev = NULL_P) + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if(next->next) + next->next->pprev = &next->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && (prefetch(pos->next), 1); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && (n = pos->next, 1); \ + pos = n) + +/** + * Iterate over an hlist of given type + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing after existing point + * \param tpos the type * to use as a loop counter. 
+ * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist continuing from an existing point + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * Iterate over an hlist of given type safe against removal of list entry + * \param tpos the type * to use as a loop counter. + * \param pos the &struct hlist_node to use as a loop counter. + * \param n another &struct hlist_node to use as temporary storage + * \param head the head for your list. + * \param member the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +/* @} */ + +/** + * Iterate over a list in reverse order + * \param pos the &struct list_head to use as a loop counter. + * \param head the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * Iterate over a list of given type + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) + +/** + * Iterate backwards over a list of given type. + * \param pos the type * to use as a loop counter. + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * Iterate over a list of given type safe against removal of list entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * Iterate backwards over a list of given type safely against removal of entry + * \param pos the type * to use as a loop counter. + * \param n another type * to use as temporary storage + * \param head the head for your list. + * \param member the name of the list_struct within the struct. 
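+ *
+ * A minimal sketch (illustrative only; struct foo and foo_list are assumed
+ * caller-side definitions, not part of this header):
+ *
+ *	struct foo { int f_val; struct list_head f_link; };
+ *	struct foo *cur, *tmp;
+ *
+ *	list_for_each_entry_safe_reverse(cur, tmp, &foo_list, f_link)
+ *		list_del(&cur->f_link);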
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#endif /* __LIBCFS_UTIL_LIST_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h new file mode 100644 index 0000000000000..2fd1e36b07354 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/param.h @@ -0,0 +1,40 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of the + * License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * LGPL HEADER END + * + * Copyright (c) 2015, James Simmons + * + * Author: + * James Simmons + */ +#ifndef _LIBCFS_UTIL_PARAM_H_ +#define _LIBCFS_UTIL_PARAM_H_ + +#include +#include + +static inline void cfs_free_param_data(glob_t *paths) +{ + globfree(paths); +} + +int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) + __attribute__((__format__(__printf__, 2, 3))); + +#endif /* _LIBCFS_UTIL_PARAM_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h new file mode 100644 index 0000000000000..2fb2db7c651dd --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -0,0 +1,115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/util/parser.h + * + * A command line parser. 
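+ *
+ * A minimal usage sketch (illustrative only; cmdlist, jt_noop() and the
+ * prompt string are assumptions of this example, not part of the API):
+ *
+ *	static int jt_noop(int argc, char **argv) { return 0; }
+ *
+ *	static command_t cmdlist[] = {
+ *		{ "noop", jt_noop, NULL, "noop: do nothing" },
+ *		{ "quit", Parser_quit, NULL, "quit: exit" },
+ *		{ NULL, NULL, NULL, NULL }
+ *	};
+ *
+ *	Parser_init("example > ", cmdlist);
+ *	Parser_commands();
+ *
+ * Parser_init() sets the prompt and loads the command table, and
+ * Parser_commands() then starts the command loop.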
+ * + */ + +#ifndef _PARSER_H_ +#define _PARSER_H_ + +#define HISTORY 100 /* Don't let history grow unbounded */ +#define MAXARGS 512 + +#define CMD_COMPLETE 0 +#define CMD_INCOMPLETE 1 +#define CMD_NONE 2 +#define CMD_AMBIG 3 +#define CMD_HELP 4 + +typedef struct parser_cmd { + char *pc_name; + int (* pc_func)(int, char **); + struct parser_cmd * pc_sub_cmd; + char *pc_help; +} command_t; + +typedef struct argcmd { + char *ac_name; + int (*ac_func)(int, char **); + char *ac_help; +} argcmd_t; + +typedef struct network { + char *type; + char *server; + int port; +} network_t; + +int Parser_quit(int argc, char **argv); +int Parser_version(int argc, char **argv); +void Parser_init(char *, command_t *); /* Set prompt and load command list */ +int Parser_commands(void); /* Start the command parser */ +void Parser_qhelp(int, char **); /* Quick help routine */ +int Parser_help(int, char **); /* Detailed help routine */ +void Parser_ignore_errors(int ignore); /* Set the ignore errors flag */ +void Parser_printhelp(char *); /* Detailed help routine */ +void Parser_exit(int, char **); /* Shuts down command parser */ +int Parser_execarg(int argc, char **argv, command_t cmds[]); +int execute_line(char * line); +int Parser_list_commands(const command_t *cmdlist, char *buffer, + size_t buf_size, const char *parent_cmd, + int col_start, int col_num); + +/* Converts a string to an integer */ +int Parser_int(char *, int *); + +/* Prompts for a string, with default values and a maximum length */ +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len); + +/* Prompts for an integer, with minimum, maximum and default values and base */ +int Parser_getint(const char *prompt, long min, long max, long deft, + int base); + +/* Prompts for a yes/no, with default */ +int Parser_getbool(const char *prompt, int deft); + +/* Extracts an integer from a string, or prompts if it cannot get one */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base); + +/* Extracts a word from the input, or propmts if it cannot get one */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len); + +/* Extracts an integer from a string with a base */ +int Parser_arg2int(const char *inp, long *result, int base); + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(int *sizep, char *str); + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h new file mode 100644 index 0000000000000..72414f0c8003a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h @@ -0,0 +1,97 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_UTIL_STRING_H__ +#define __LIBCFS_UTIL_STRING_H__ + +#include + +#include +#include + +#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcpy(char *tgt, const char *src, size_t tgt_len); +#endif + +#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcat(char *tgt, const char *src, size_t tgt_len); +#endif + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. + */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +int cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_print(char *buffer, int count, + struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_range_gen(__u32 *ip_list, int count, + struct list_head *ip_addr_expr); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c new file mode 100644 index 0000000000000..a4aede1e3be08 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -0,0 +1,353 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/debug.c + * + * Author: Phil Schwan + * + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "tracefile.h" + +static char debug_file_name[1024]; + +unsigned int libcfs_subsystem_debug = ~0; +module_param(libcfs_subsystem_debug, int, 0644); +MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask"); +EXPORT_SYMBOL(libcfs_subsystem_debug); + +unsigned int libcfs_debug = (D_CANTMASK | + D_NETERROR | D_HA | D_CONFIG | D_IOCTL | D_LFSCK); +module_param(libcfs_debug, int, 0644); +MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); +EXPORT_SYMBOL(libcfs_debug); + +static unsigned int libcfs_debug_mb; +module_param(libcfs_debug_mb, uint, 0644); +MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); + +unsigned int libcfs_printk = D_CANTMASK; +module_param(libcfs_printk, uint, 0644); +MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask"); + +unsigned int libcfs_console_ratelimit = 1; +module_param(libcfs_console_ratelimit, uint, 0644); +MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); + +unsigned int libcfs_console_max_delay; +module_param(libcfs_console_max_delay, uint, 0644); +MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); + +unsigned int libcfs_console_min_delay; +module_param(libcfs_console_min_delay, uint, 0644); +MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); + +unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; +module_param(libcfs_console_backoff, uint, 0644); +MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); + +unsigned int libcfs_debug_binary = 1; + +unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; +EXPORT_SYMBOL(libcfs_stack); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +unsigned int libcfs_watchdog_ratelimit = 300; + +unsigned int libcfs_panic_on_lbug = 1; +module_param(libcfs_panic_on_lbug, uint, 0644); +MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG"); + +atomic_t libcfs_kmemory = ATOMIC_INIT(0); +EXPORT_SYMBOL(libcfs_kmemory); + +static wait_queue_head_t debug_ctlwq; + +char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; +EXPORT_SYMBOL(libcfs_debug_file_path_arr); + +/* We need to pass a pointer here, but elsewhere this must be a const */ +static char *libcfs_debug_file_path; +module_param(libcfs_debug_file_path, charp, 0644); +MODULE_PARM_DESC(libcfs_debug_file_path, + "Path for dumping debug logs, set 'NONE' to prevent log dumping"); + +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +static const char *libcfs_debug_subsys2str(int subsys) +{ + static const char *libcfs_debug_subsystems[] = LIBCFS_DEBUG_SUBSYS_NAMES; + + if (subsys >= ARRAY_SIZE(libcfs_debug_subsystems)) + return NULL; + + return libcfs_debug_subsystems[subsys]; +} + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +static const char *libcfs_debug_dbg2str(int debug) +{ + static const char *libcfs_debug_masks[] = LIBCFS_DEBUG_MASKS_NAMES; + + if (debug >= ARRAY_SIZE(libcfs_debug_masks)) + return NULL; + + return libcfs_debug_masks[debug]; +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = 
is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + + if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 && + matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the " + "mask - this will be deprecated in a future " + "release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, + 0xffffffff); +} + +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ +void libcfs_debug_dumplog_internal(void *arg) +{ + static time64_t last_dump_time; + time64_t current_time; + void *journal_info; + + journal_info = current->journal_info; + current->journal_info = NULL; + current_time = ktime_get_real_seconds(); + + if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 && + current_time > last_dump_time) { + last_dump_time = current_time; + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%lld.%ld", libcfs_debug_file_path_arr, + (s64)current_time, (uintptr_t)arg); + printk(KERN_ALERT "LustreError: dumping log to %s\n", + debug_file_name); + cfs_tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); + } + current->journal_info = journal_info; +} + +static int libcfs_debug_dumplog_thread(void *arg) +{ + libcfs_debug_dumplog_internal(arg); + wake_up(&debug_ctlwq); + return 0; +} + +void libcfs_debug_dumplog(void) +{ + wait_queue_entry_t wait; + struct task_struct *dumper; + ENTRY; + + /* we're being careful to ensure that the kernel thread is + * able to set our state to running as it exits before we + * get to schedule() */ + init_waitqueue_entry(&wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&debug_ctlwq, &wait); + + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current_pid(), + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + printk(KERN_ERR "LustreError: cannot start log dump thread:" + " %ld\n", PTR_ERR(dumper)); + else + schedule(); + + /* be sure to teardown if cfs_create_thread() failed */ + remove_wait_queue(&debug_ctlwq, &wait); + set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +int libcfs_debug_init(unsigned long bufsize) +{ + int rc = 0; + unsigned int max = libcfs_debug_mb; + + init_waitqueue_head(&debug_ctlwq); + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + 
libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + if (libcfs_debug_file_path != NULL) { + strlcpy(libcfs_debug_file_path_arr, + libcfs_debug_file_path, + sizeof(libcfs_debug_file_path_arr)); + } + + /* If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ + if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = (max / num_possible_cpus()); + max = (max << (20 - PAGE_SHIFT)); + } + rc = cfs_tracefile_init(max); + + if (rc == 0) + libcfs_register_panic_notifier(); + + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + cfs_tracefile_exit(); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* Debug markers, although printed by S_LNET + * should not be be marked as such. */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE,"***************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE,"***************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET + +long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) +{ + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; +} +EXPORT_SYMBOL(libcfs_log_return); + +void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label, + long rc) +{ + libcfs_debug_msg(msgdata, "Process leaving via %s (rc=%lu : %ld" + " : %#lx)\n", label, rc, rc, rc); +} +EXPORT_SYMBOL(libcfs_log_goto); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/fail.c b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c new file mode 100644 index 0000000000000..12addb20803f3 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. 
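+ *
+ * Fault injection helpers: test code sets cfs_fail_loc (and, for the
+ * CFS_FAIL_RAND/CFS_FAIL_SKIP/CFS_FAIL_SOME modes, cfs_fail_val), and the
+ * __cfs_fail_check_set()/__cfs_fail_timeout_set() helpers defined below
+ * decide whether a given call site should fail or stall.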
+ */ + +#include + +unsigned long cfs_fail_loc = 0; +unsigned int cfs_fail_val = 0; +int cfs_fail_err; +DECLARE_WAIT_QUEUE_HEAD(cfs_race_waitq); +int cfs_race_state; + +EXPORT_SYMBOL(cfs_fail_loc); +EXPORT_SYMBOL(cfs_fail_val); +EXPORT_SYMBOL(cfs_fail_err); +EXPORT_SYMBOL(cfs_race_waitq); +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(__u32 id, __u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... */ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + /* Take into account the current call for FAIL_ONCE for ORSET only, + * as RESET is a new fail_loc, it does not change the current call */ + if ((set == CFS_FAIL_LOC_ORSET) && (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + atomic_set(&cfs_fail_count, 0); + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + int ret = 0; + + ret = __cfs_fail_check_set(id, value, set); + if (ret && likely(ms > 0)) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", + id, ms); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ms) / 1000); + set_current_state(TASK_RUNNING); + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c new file mode 100644 index 0000000000000..228cf0b022a58 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/hash.c @@ -0,0 +1,2123 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. + * + * Author: YuZhangyong + * + * 2008-08-15: Brian Behlendorf + * - Simplified API and improved documentation + * - Added per-hash feature flags: + * * CFS_HASH_DEBUG additional validation + * * CFS_HASH_REHASH dynamic rehashing + * - Added per-hash statistics + * - General performance enhancements + * + * 2009-07-31: Liang Zhen + * - move all stuff to libcfs + * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH + * - ignore hs_rwlock if without CFS_HASH_REHASH setting + * - buckets are allocated one by one(instead of contiguous memory), + * to avoid unnecessary cacheline conflict + * + * 2010-03-01: Liang Zhen + * - "bucket" is a group of hlist_head now, user can specify bucket size + * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share + * one lock for reducing memory overhead. + * + * - support lockless hash, caller will take care of locks: + * avoid lock overhead for hash tables that are already protected + * by locking in the caller for another reason + * + * - support both spin_lock/rwlock for bucket: + * overhead of spinlock contention is lower than read/write + * contention of rwlock, so using spinlock to serialize operations on + * bucket is more reasonable for those frequently changed hash tables + * + * - support one-single lock mode: + * one lock to protect all hash operations to avoid overhead of + * multiple locks if hash table is always small + * + * - removed a lot of unnecessary addref & decref on hash element: + * addref & decref are atomic operations in many use-cases which + * are expensive. + * + * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): + * some lustre use-cases require these functions to be strictly + * non-blocking, we need to schedule required rehash on a different + * thread on those cases. + * + * - safer rehash on large hash table + * In old implementation, rehash function will exclusively lock the + * hash table and finish rehash in one batch, it's dangerous on SMP + * system because rehash millions of elements could take long time. + * New implemented rehash can release lock and relax CPU in middle + * of rehash, it's safe for another thread to search/change on the + * hash table even it's in rehasing. + * + * - support two different refcount modes + * . hash table has refcount on element + * . hash table doesn't change refcount on adding/removing element + * + * - support long name hash table (for param-tree) + * + * - fix a bug for cfs_hash_rehash_key: + * in old implementation, cfs_hash_rehash_key could screw up the + * hash-table because @key is overwritten without any protection. 
+ * Now we need user to define hs_keycpy for those rehash enabled + * hash tables, cfs_hash_rehash_key will overwrite hash-key + * inside lock by calling hs_keycpy. + * + * - better hash iteration: + * Now we support both locked iteration & lockless iteration of hash + * table. Also, user can break the iteration by return 1 in callback. + */ +#include +#include + +#include +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +module_param(warn_on_depth, uint, 0644); +MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); +#endif + +struct cfs_wi_sched *cfs_sched_rehash; + +static inline void +cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->spin) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->spin) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) + __acquires(&lock->rw) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) + __releases(&lock->rw) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +/** No lock hash */ +static struct cfs_hash_lock_ops cfs_hash_nl_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static struct cfs_hash_lock_ops cfs_hash_nbl_lops = { + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_spin_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static struct cfs_hash_lock_ops cfs_hash_bkt_rw_lops = { + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_spin_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_lops = { + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +static void +cfs_hash_lock_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = 
&cfs_hash_nr_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head { + struct hlist_head hh_head; /**< entries list */ +}; + +static int +cfs_hash_hh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head); +} + +static struct hlist_head * +cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head *head; + + head = (struct cfs_hash_head *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +struct cfs_hash_head_dep { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +}; + +static int +cfs_hash_hd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_head_dep); +} + +static struct hlist_head * +cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_head_dep *head; + + head = (struct cfs_hash_head_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_head_dep *hh; + + hh = container_of(cfs_hash_hd_hhead(hs, bd), + struct cfs_hash_head_dep, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +}; + +static int +cfs_hash_dh_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead); +} + +static struct hlist_head * +cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead *head; + + head = (struct cfs_hash_dhead *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dh_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead *dh; + + dh = container_of(cfs_hash_dh_hhead(hs, bd), + struct cfs_hash_dhead, dh_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? 
NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +struct cfs_hash_dhead_dep { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +}; + +static int +cfs_hash_dd_hhead_size(struct cfs_hash *hs) +{ + return sizeof(struct cfs_hash_dhead_dep); +} + +static struct hlist_head * +cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) +{ + struct cfs_hash_dhead_dep *head; + + head = (struct cfs_hash_dhead_dep *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_behind(hnode, dh->dd_tail); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnd) +{ + struct cfs_hash_dhead_dep *dh; + + dh = container_of(cfs_hash_dd_hhead(hs, bd), + struct cfs_hash_dhead_dep, dd_head); + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static struct cfs_hash_hlist_ops cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static struct cfs_hash_hlist_ops cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(struct cfs_hash *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? 
+ &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, + unsigned int bits, const void *key, struct cfs_hash_bd *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi); +# endif +} + +void +cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, + struct cfs_hash_bd *bd_new, struct hlist_node *hnode) +{ + struct cfs_hash_bucket *obkt = bd_old->bd_bucket; + struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} + +enum { + /** always set, for sanity (avoid ZERO intent) */ + CFS_HS_LOOKUP_MASK_FIND = 1 << 0, + /** return entry 
with a ref */ + CFS_HS_LOOKUP_MASK_REF = 1 << 1, + /** add entry if not existing */ + CFS_HS_LOOKUP_MASK_ADD = 1 << 2, + /** delete entry, ignore other masks */ + CFS_HS_LOOKUP_MASK_DEL = 1 << 3, +}; + +enum cfs_hash_lookup_intent { + /** return item w/o refcount */ + CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, + /** return item with refcount */ + CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_REF), + /** return item w/o refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** return item with refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** delete if existed */ + CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_DEL) +}; + +static struct hlist_node * +cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key, struct hlist_node *hnode, + enum cfs_hash_lookup_intent intent) + +{ + struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); + struct hlist_node *ehnode; + struct hlist_node *match; + int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0; + + /* with this function, we can avoid a lot of useless refcount ops, + * which are expensive atomic operations most time. */ + match = intent_add ? NULL : hnode; + hlist_for_each(ehnode, hhead) { + if (!cfs_hash_keycmp(hs, key, ehnode)) + continue; + + if (match != NULL && match != ehnode) /* can't match */ + continue; + + /* match and ... */ + if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) { + cfs_hash_bd_del_locked(hs, bd, ehnode); + return ehnode; + } + + /* caller wants refcount? */ + if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0) + cfs_hash_get(hs, ehnode); + return ehnode; + } + /* no match item */ + if (!intent_add) + return NULL; + + LASSERT(hnode != NULL); + cfs_hash_bd_add_locked(hs, bd, hnode); + return hnode; +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_FIND); +} +EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); + +struct hlist_node * +cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_PEEK); +} +EXPORT_SYMBOL(cfs_hash_bd_peek_locked); + +static void +cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + /** + * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. + * NB: it's possible that several bds point to the same bucket but + * have different bd::bd_offset, so need take care of deadlock. 
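+ *
+ * A minimal caller-side sketch of the pattern this ordering enables
+ * (illustrative only; the dual-bucket helpers further below hand back the
+ * two descriptors already sorted, so the ascending-index rule holds):
+ *
+ *	struct cfs_hash_bd bds[2];
+ *
+ *	cfs_hash_dual_bd_get(hs, key, bds);      - ordered by hsb_index
+ *	cfs_hash_dual_bd_lock(hs, bds, 1);       - locks taken in that order
+ *	cfs_hash_dual_bd_findadd_locked(hs, bds, key, hnode, 1);
+ *	cfs_hash_dual_bd_unlock(hs, bds, 1);
+ *
+ * (If CFS_HASH_REHASH is set, cfs_hash_lock() must already be held.)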
+ */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, int excl) +{ + struct cfs_hash_bucket *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + struct cfs_hash_bd mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { + swap(*bd1, *bd2); /* swab bd1 and bd2 */ + } +} + +void +cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, + struct cfs_hash_bd *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} + +void +cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} + +void +cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key) +{ + return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct 
hlist_node *hnode, + int noref) +{ + return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, + hnode, noref); +} + +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, + const void *key, struct hlist_node *hnode) +{ + return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode); +} + +static void +cfs_hash_buckets_free(struct cfs_hash_bucket **buckets, + int bkt_size, int prev_size, int size) +{ + int i; + + for (i = prev_size; i < size; i++) { + if (buckets[i] != NULL) + LIBCFS_FREE(buckets[i], bkt_size); + } + + LIBCFS_FREE(buckets, sizeof(buckets[0]) * size); +} + +/* + * Create or grow bucket memory. Return old_buckets if no allocation was + * needed, the newly allocated buckets if allocation was needed and + * successful, and NULL on error. + */ +static struct cfs_hash_bucket ** +cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts, + unsigned int old_size, unsigned int new_size) +{ + struct cfs_hash_bucket **new_bkts; + int i; + + LASSERT(old_size == 0 || old_bkts != NULL); + + if (old_bkts != NULL && old_size == new_size) + return old_bkts; + + LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size); + if (new_bkts == NULL) + return NULL; + + if (old_bkts != NULL) { + memcpy(new_bkts, old_bkts, + min(old_size, new_size) * sizeof(*old_bkts)); + } + + for (i = old_size; i < new_size; i++) { + struct hlist_head *hhead; + struct cfs_hash_bd bd; + + LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs)); + if (new_bkts[i] == NULL) { + cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs), + old_size, new_size); + return NULL; + } + + new_bkts[i]->hsb_index = i; + new_bkts[i]->hsb_version = 1; /* shouldn't be zero */ + new_bkts[i]->hsb_depmax = -1; /* unknown */ + bd.bd_bucket = new_bkts[i]; + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) + INIT_HLIST_HEAD(hhead); + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_no_bktlock(hs)) + continue; + + if (cfs_hash_with_rw_bktlock(hs)) + rwlock_init(&new_bkts[i]->hsb_lock.rw); + else if (cfs_hash_with_spin_bktlock(hs)) + spin_lock_init(&new_bkts[i]->hsb_lock.spin); + else + LBUG(); /* invalid use-case */ + } + return new_bkts; +} + +/** + * Initialize new libcfs hash, where: + * @name - Descriptive hash name + * @cur_bits - Initial hash table size, in bits + * @max_bits - Maximum allowed hash table resize, in bits + * @ops - Registered hash table operations + * @flags - CFS_HASH_REHASH enable synamic hash resizing + * - CFS_HASH_SORT enable chained hash sort + */ +static int cfs_hash_rehash_worker(struct cfs_workitem *wi); + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static int cfs_hash_dep_print(struct cfs_workitem *wi) +{ + struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_dep_wi); + int dep; + int bkt; + int off; + int bits; + + spin_lock(&hs->hs_dep_lock); + dep = hs->hs_dep_max; + bkt = hs->hs_dep_bkt; + off = hs->hs_dep_off; + bits = hs->hs_dep_bits; + spin_unlock(&hs->hs_dep_lock); + + LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n", + hs->hs_name, bits, dep, bkt, off); + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_bits = 0; /* mark as workitem done */ + spin_unlock(&hs->hs_dep_lock); + return 0; +} + +static void cfs_hash_depth_wi_init(struct cfs_hash *hs) +{ + spin_lock_init(&hs->hs_dep_lock); + cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print); +} + +static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) +{ + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi)) + return; + + spin_lock(&hs->hs_dep_lock); + while 
(hs->hs_dep_bits != 0) { + spin_unlock(&hs->hs_dep_lock); + cond_resched(); + spin_lock(&hs->hs_dep_lock); + } + spin_unlock(&hs->hs_dep_lock); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {} +static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +struct cfs_hash * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + struct cfs_hash_ops *ops, unsigned flags) +{ + struct cfs_hash *hs; + int len; + + ENTRY; + + CLASSERT(CFS_HASH_THETA_BITS < 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put != NULL || ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len])); + if (hs == NULL) + RETURN(NULL); + + strlcpy(hs->hs_name, name, len); + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len])); + RETURN(NULL); +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(struct cfs_hash *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + int i; + ENTRY; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. 
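+ *
+ * A lifetime sketch from the caller's side, with shared_hs standing in for
+ * a hypothetical cached table pointer: cfs_hash_create() hands the table
+ * back with one reference, other users pin it with cfs_hash_getref(), and
+ * the final cfs_hash_putref() is what ends up here:
+ *
+ *	hs = cfs_hash_getref(shared_hs);  - NULL once the table is dying
+ *	if (hs != NULL) {
+ *		... lookups, adds, deletes ...
+ *		cfs_hash_putref(hs);      - last put calls cfs_hash_destroy()
+ *	}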
+ */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not " + " empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i])); + + EXIT; +} + +struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(struct cfs_hash *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(struct cfs_hash *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(struct cfs_hash *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. 
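+ *
+ * A minimal usage sketch, assuming a hypothetical object type that embeds
+ * the struct hlist_node used for linkage (the key layout is whatever the
+ * registered ops expect):
+ *
+ *	struct example_obj {
+ *		__u64			eo_key;    - the key ops->hs_key returns
+ *		struct hlist_node	eo_hnode;  - linkage owned by the hash
+ *	};
+ *
+ *	cfs_hash_add(hs, &obj->eo_key, &obj->eo_hnode);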
+ */ +void +cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + struct cfs_hash_bd bds[2]; + int bits = 0; + + LASSERTF(hlist_unhashed(hnode), "hnode = %p\n", hnode); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. 
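+ *
+ * A sketch in terms of the hypothetical example_obj above; either form
+ * hands back the object that was unhashed (or NULL), with ops->hs_put
+ * already called on it:
+ *
+ *	removed = cfs_hash_del_key(hs, &key);    - first item matching @key
+ *
+ * or, when the caller already holds the object,
+ *
+ *	cfs_hash_del(hs, &key, &obj->eo_hnode);  - unhash this exact item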
+ */ +void * +cfs_hash_del_key(struct cfs_hash *hs, const void *key) +{ + return cfs_hash_del(hs, key, NULL); +} +EXPORT_SYMBOL(cfs_hash_del_key); + +/** + * Lookup an item using @key in the libcfs hash @hs and return it. + * If the @key is found in the hash hs->hs_get() is called and the + * matching objects is returned. It is the callers responsibility + * to call the counterpart ops->hs_put using the cfs_hash_put() macro + * when when finished with the object. If the @key was not found + * in the hash @hs NULL is returned. + */ +void * +cfs_hash_lookup(struct cfs_hash *hs, const void *key) +{ + void *obj = NULL; + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key); + if (hnode != NULL) + obj = cfs_hash_object(hs, hnode); + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_lookup); + +static void +cfs_hash_for_each_enter(struct cfs_hash *hs) +{ + LASSERT(!cfs_hash_is_exiting(hs)); + + if (!cfs_hash_with_rehash(hs)) + return; + /* + * NB: it's race on cfs_has_t::hs_iterating, but doesn't matter + * because it's just an unreliable signal to rehash-thread, + * rehash-thread will try to finish rehash ASAP when seeing this. + */ + hs->hs_iterating = 1; + + cfs_hash_lock(hs, 1); + hs->hs_iterators++; + + /* NB: iteration is mostly called by service thread, + * we tend to cancel pending rehash-request, instead of + * blocking service thread, we will relaunch rehash request + * after iteration */ + if (cfs_hash_is_rehashing(hs)) + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} + +static void +cfs_hash_for_each_exit(struct cfs_hash *hs) +{ + int remained; + int bits; + + if (!cfs_hash_with_rehash(hs)) + return; + cfs_hash_lock(hs, 1); + remained = --hs->hs_iterators; + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 1); + /* NB: it's race on cfs_has_t::hs_iterating, see above */ + if (remained == 0) + hs->hs_iterating = 0; + if (bits > 0) { + cfs_hash_rehash(hs, atomic_read(&hs->hs_count) < + CFS_HASH_LOOP_HOG); + } +} + +/** + * For each item in the libcfs hash @hs call the passed callback @func + * and pass to it as an argument each hash item and the private @data. + * + * a) the function may sleep! + * b) during the callback: + * . the bucket lock is held so the callback must never sleep. + * . 
if @removal_safe is true, use can remove current item by + * cfs_hash_bd_del_locked + */ +static __u64 +cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int remove_safe) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + struct cfs_hash_bd bd; + __u64 count = 0; + int excl = !!remove_safe; + int loop = 0; + int i; + ENTRY; + + cfs_hash_for_each_enter(hs); + + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, excl); + if (func == NULL) { /* only glimpse size */ + count += bd.bd_bucket->hsb_count; + cfs_hash_bd_unlock(hs, &bd, excl); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + cfs_hash_bucket_validate(hs, &bd, hnode); + count++; + loop++; + if (func(hs, &bd, hnode, data)) { + cfs_hash_bd_unlock(hs, &bd, excl); + goto out; + } + } + } + cfs_hash_bd_unlock(hs, &bd, excl); + if (loop < CFS_HASH_LOOP_HOG) + continue; + loop = 0; + cfs_hash_unlock(hs, 0); + cond_resched(); + cfs_hash_lock(hs, 0); + } + out: + cfs_hash_unlock(hs, 0); + + cfs_hash_for_each_exit(hs); + RETURN(count); +} + +struct cfs_hash_cond_arg { + cfs_hash_cond_opt_cb_t func; + void *arg; +}; + +static int +cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct cfs_hash_cond_arg *cond = data; + + if (cond->func(cfs_hash_object(hs, hnode), cond->arg)) + cfs_hash_bd_del_locked(hs, bd, hnode); + return 0; +} + +/** + * Delete item from the libcfs hash @hs when @func return true. + * The write lock being hold during loop for each bucket to avoid + * any object be reference. + */ +void +cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data) +{ + struct cfs_hash_cond_arg arg = { + .func = func, + .arg = data, + }; + + cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); +} +EXPORT_SYMBOL(cfs_hash_cond_del); + +void +cfs_hash_for_each(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each); + +void +cfs_hash_for_each_safe(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 1); +} +EXPORT_SYMBOL(cfs_hash_for_each_safe); + +static int +cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + *(int *)data = 0; + return 1; /* return 1 to break the loop */ +} + +int +cfs_hash_is_empty(struct cfs_hash *hs) +{ + int empty = 1; + + cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); + return empty; +} +EXPORT_SYMBOL(cfs_hash_is_empty); + +__u64 +cfs_hash_size_get(struct cfs_hash *hs) +{ + return cfs_hash_with_counter(hs) ? + atomic_read(&hs->hs_count) : + cfs_hash_for_each_tight(hs, NULL, NULL, 0); +} +EXPORT_SYMBOL(cfs_hash_size_get); + +/* + * cfs_hash_for_each_relax: + * Iterate the hash table and call @func on each item without + * any lock. This function can't guarantee to finish iteration + * if these features are enabled: + * + * a. if rehash_key is enabled, an item can be moved from + * one bucket to another bucket + * b. user can remove non-zero-ref item from hash-table, + * so the item can be removed from hash-table, even worse, + * it's possible that user changed key and insert to another + * hash bucket. 
+ * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, + void *data, int start) +{ + struct hlist_node *hnode; + struct hlist_node *next = NULL; + struct cfs_hash_bd bd; + __u32 version; + int count = 0; + int stop_on_change; + int has_put_locked; + int rc = 0; + int i, end = -1; + ENTRY; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs); + has_put_locked = hs->hs_ops->hs_put_locked != NULL; + cfs_hash_lock(hs, 0); +again: + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + if (i < start) + continue; + else if (end > 0 && i >= end) + break; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hnode = hhead->first; + if (hnode == NULL) + continue; + cfs_hash_get(hs, hnode); + for (; hnode != NULL; hnode = next) { + cfs_hash_bucket_validate(hs, &bd, hnode); + next = hnode->next; + if (next != NULL) + cfs_hash_get(hs, next); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change || !has_put_locked) + cfs_hash_put(hs, hnode); + + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (stop_on_change) { + if (version != + cfs_hash_bd_version_get(&bd)) + rc = -EINTR; + } else if (has_put_locked) { + cfs_hash_put_locked(hs, hnode); + } + if (rc) /* callback wants to break iteration */ + break; + } + if (next != NULL) { + if (has_put_locked) { + cfs_hash_put_locked(hs, next); + next = NULL; + } + break; + } else if (rc != 0) { + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + if (next != NULL && !has_put_locked) { + cfs_hash_put(hs, next); + next = NULL; + } + if (rc) /* callback wants to break iteration */ + break; + } + + if (start > 0 && rc == 0) { + end = start; + start = 0; + goto again; + } + + cfs_hash_unlock(hs, 0); + return count; +} + +int +cfs_hash_for_each_nolock(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data, int start) +{ + ENTRY; + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + RETURN(-EOPNOTSUPP); + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + RETURN(-EOPNOTSUPP); + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data, start); + cfs_hash_for_each_exit(hs); + + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. 
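+ *
+ * A sketch of a draining callback in terms of the hypothetical example_obj
+ * above; the callback itself takes each item out of the table, here via
+ * cfs_hash_del():
+ *
+ *	static int example_drain_cb(struct cfs_hash *hs,
+ *				    struct cfs_hash_bd *bd,
+ *				    struct hlist_node *hnode, void *data)
+ *	{
+ *		struct example_obj *obj;
+ *
+ *		obj = container_of(hnode, struct example_obj, eo_hnode);
+ *		cfs_hash_del(hs, &obj->eo_key, hnode);
+ *		return 0;	- keep going; non-zero stops the walk
+ *	}
+ *
+ *	cfs_hash_for_each_empty(hs, example_drain_cb, NULL);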
+ */ +int +cfs_hash_for_each_empty(struct cfs_hash *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + unsigned i = 0; + ENTRY; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (hs->hs_ops->hs_get == NULL || + (hs->hs_ops->hs_put == NULL && + hs->hs_ops->hs_put_locked == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data, 0)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + struct cfs_hash_bd bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); +out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_node *hnode; + struct cfs_hash_bd bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). + */ +void +cfs_hash_rehash_cancel_locked(struct cfs_hash *hs) +{ + int i; + + /* need hold cfs_hash_lock(hs, 1) */ + LASSERT(cfs_hash_with_rehash(hs) && + !cfs_hash_with_no_lock(hs)); + + if (!cfs_hash_is_rehashing(hs)) + return; + + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) { + hs->hs_rehash_bits = 0; + return; + } + + for (i = 2; cfs_hash_is_rehashing(hs); i++) { + cfs_hash_unlock(hs, 1); + /* raise console warning while waiting too long */ + CDEBUG(is_power_of_2(i >> 3) ? 
D_WARNING : D_INFO, + "hash %s is still rehashing, rescheded %d\n", + hs->hs_name, i - 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } +} + +void +cfs_hash_rehash_cancel(struct cfs_hash *hs) +{ + cfs_hash_lock(hs, 1); + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} + +int +cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return rc; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi); + cfs_hash_unlock(hs, 1); + return 0; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + return cfs_hash_rehash_worker(&hs->hs_rehash_wi); +} + +static int +cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) +{ + struct cfs_hash_bd new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + return c; +} + +static int +cfs_hash_rehash_worker(struct cfs_workitem *wi) +{ + struct cfs_hash *hs = + container_of(wi, struct cfs_hash, hs_rehash_wi); + struct cfs_hash_bucket **bkts; + struct cfs_hash_bd bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT(hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. 
+ */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; + out: + hs->hs_rehash_bits = 0; + if (rc == -ESRCH) /* never be scheduled again */ + cfs_wi_exit(cfs_sched_rehash, wi); + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); + /* return 1 only if cfs_wi_exit is called */ + return rc == -ESRCH; +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. 
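+ *
+ * A sketch using the hypothetical example_obj above, assuming the table
+ * was created with CFS_HASH_REHASH_KEY (so ops->hs_keycpy exists and is
+ * called under the bucket locks to store the new key):
+ *
+ *	__u64 old_key = obj->eo_key;
+ *	__u64 new_key = ...;
+ *
+ *	cfs_hash_rehash_key(hs, &old_key, &new_key, &obj->eo_hnode);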
+ */ +void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, + void *new_key, struct hlist_node *hnode) +{ + struct cfs_hash_bd bds[3]; + struct cfs_hash_bd old_bds[2]; + struct cfs_hash_bd new_bd; + + LASSERT(!hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get(hs, old_key, old_bds); + cfs_hash_bd_get(hs, new_key, &new_bd); + + bds[0] = old_bds[0]; + bds[1] = old_bds[1]; + bds[2] = new_bd; + + /* NB: bds[0] and bds[1] are ordered already */ + cfs_hash_bd_order(&bds[1], &bds[2]); + cfs_hash_bd_order(&bds[0], &bds[1]); + + cfs_hash_multi_bd_lock(hs, bds, 3, 1); + if (likely(old_bds[1].bd_bucket == NULL)) { + cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode); + } else { + cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode); + cfs_hash_bd_add_locked(hs, &new_bd, hnode); + } + /* overwrite key inside locks, otherwise may screw up with + * other operations, i.e: rehash */ + cfs_hash_keycpy(hs, hnode, new_key); + + cfs_hash_multi_bd_unlock(hs, bds, 3, 1); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_rehash_key); + +void cfs_hash_debug_header(struct seq_file *m) +{ + seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n", + CFS_HASH_BIGNAME_LEN, "name"); +} +EXPORT_SYMBOL(cfs_hash_debug_header); + +static struct cfs_hash_bucket ** +cfs_hash_full_bkts(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return hs->hs_buckets; + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + hs->hs_rehash_buckets : hs->hs_buckets; +} + +static unsigned int +cfs_hash_full_nbkt(struct cfs_hash *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return CFS_HASH_NBKT(hs); + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs); +} + +void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m) +{ + int dist[8] = { 0, }; + int maxdep = -1; + int maxdepb = -1; + int total = 0; + int theta; + int i; + + cfs_hash_lock(hs, 0); + theta = __cfs_hash_theta(hs); + + seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ", + CFS_HASH_BIGNAME_LEN, hs->hs_name, + 1 << hs->hs_cur_bits, 1 << hs->hs_min_bits, + 1 << hs->hs_max_bits, + __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta), + __cfs_hash_theta_int(hs->hs_min_theta), + __cfs_hash_theta_frac(hs->hs_min_theta), + __cfs_hash_theta_int(hs->hs_max_theta), + __cfs_hash_theta_frac(hs->hs_max_theta), + hs->hs_flags, hs->hs_rehash_count); + + /* + * The distribution is a summary of the chained hash depth in + * each of the libcfs hash buckets. Each buckets hsb_count is + * divided by the hash theta value and used to generate a + * histogram of the hash distribution. A uniform hash will + * result in all hash buckets being close to the average thus + * only the first few entries in the histogram will be non-zero. + * If you hash function results in a non-uniform hash the will + * be observable by outlier bucks in the distribution histogram. 
+ * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + struct cfs_hash_bd bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); + for (i = 0; i < 8; i++) + seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/heap.c b/drivers/staging/lustrefsx/libcfs/libcfs/heap.c new file mode 100644 index 0000000000000..4efc4eba743b3 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/heap.c @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/libcfs/heap.c + * + * Author: Eric Barton + * Liang Zhen + */ +/** \addtogroup heap + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#define CBH_ALLOC(ptr, h) \ +do { \ + if (h->cbh_cptab) { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab, \ + h->cbh_cptid, CBH_NOB, \ + GFP_ATOMIC); \ + else \ + LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab, \ + h->cbh_cptid, CBH_NOB); \ + } else { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_ALLOC_ATOMIC((ptr), CBH_NOB); \ + else \ + LIBCFS_ALLOC((ptr), CBH_NOB); \ + } \ +} while (0) + +#define CBH_FREE(ptr) LIBCFS_FREE(ptr, CBH_NOB) + +/** + * Grows the capacity of a binary heap so that it can handle a larger number of + * \e struct cfs_binheap_node objects. 
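+ *
+ * For orientation, a sketch of how an element index maps onto the
+ * indirection levels this function extends, with the index taken relative
+ * to the start of each level exactly as in cfs_binheap_pointer() below
+ * (the arithmetic implies CBH_SIZE == 1 << CBH_SHIFT):
+ *
+ *	idx < CBH_SIZE:             cbh_elements1[idx]
+ *	idx < CBH_SIZE * CBH_SIZE:  cbh_elements2[idx >> CBH_SHIFT]
+ *	                                         [idx & CBH_MASK]
+ *	otherwise:                  cbh_elements3[idx >> (2 * CBH_SHIFT)]
+ *	                                         [(idx >> CBH_SHIFT) & CBH_MASK]
+ *	                                         [idx & CBH_MASK]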
+ * + * \param[in] h The binary heap + * + * \retval 0 Successfully grew the heap + * \retval -ENOMEM OOM error + */ +static int +cfs_binheap_grow(struct cfs_binheap *h) +{ + struct cfs_binheap_node ***frag1 = NULL; + struct cfs_binheap_node **frag2; + int hwm = h->cbh_hwm; + + /* need a whole new chunk of pointers */ + LASSERT((h->cbh_hwm & CBH_MASK) == 0); + + if (hwm == 0) { + /* first use of single indirect */ + CBH_ALLOC(h->cbh_elements1, h); + if (h->cbh_elements1 == NULL) + return -ENOMEM; + + goto out; + } + + hwm -= CBH_SIZE; + if (hwm < CBH_SIZE * CBH_SIZE) { + /* not filled double indirect */ + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (hwm == 0) { + /* first use of double indirect */ + CBH_ALLOC(h->cbh_elements2, h); + if (h->cbh_elements2 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + h->cbh_elements2[hwm >> CBH_SHIFT] = frag2; + goto out; + } + + hwm -= CBH_SIZE * CBH_SIZE; +#if (CBH_SHIFT * 3 < 32) + if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) { + /* filled triple indirect */ + return -ENOMEM; + } +#endif + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) { + /* first use of this 2nd level index */ + CBH_ALLOC(frag1, h); + if (frag1 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + if (hwm == 0) { + /* first use of triple indirect */ + CBH_ALLOC(h->cbh_elements3, h); + if (h->cbh_elements3 == NULL) { + CBH_FREE(frag2); + CBH_FREE(frag1); + return -ENOMEM; + } + } + + if (frag1 != NULL) { + LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL); + h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1; + } else { + frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)]; + LASSERT(frag1 != NULL); + } + + frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2; + + out: + h->cbh_hwm += CBH_SIZE; + return 0; +} + +/** + * Creates and initializes a binary heap instance. + * + * \param[in] ops The operations to be used + * \param[in] flags The heap flags + * \parm[in] count The initial heap capacity in # of elements + * \param[in] arg An optional private argument + * \param[in] cptab The CPT table this heap instance will operate over + * \param[in] cptid The CPT id of \a cptab this heap instance will operate over + * + * \retval valid-pointer A newly-created and initialized binary heap object + * \retval NULL error + */ +struct cfs_binheap * +cfs_binheap_create(struct cfs_binheap_ops *ops, unsigned int flags, + unsigned count, void *arg, struct cfs_cpt_table *cptab, + int cptid) +{ + struct cfs_binheap *h; + + LASSERT(ops != NULL); + LASSERT(ops->hop_compare != NULL); + if (cptab) { + LASSERT(cptid == CFS_CPT_ANY || + (cptid >= 0 && cptid < cptab->ctb_nparts)); + LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h)); + } else { + LIBCFS_ALLOC(h, sizeof(*h)); + } + if (!h) + return NULL; + + h->cbh_ops = ops; + h->cbh_nelements = 0; + h->cbh_hwm = 0; + h->cbh_private = arg; + h->cbh_flags = flags & (~CBH_FLAG_ATOMIC_GROW); + h->cbh_cptab = cptab; + h->cbh_cptid = cptid; + + while (h->cbh_hwm < count) { /* preallocate */ + if (cfs_binheap_grow(h) != 0) { + cfs_binheap_destroy(h); + return NULL; + } + } + + h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW; + + return h; +} +EXPORT_SYMBOL(cfs_binheap_create); + +/** + * Releases all resources associated with a binary heap instance. + * + * Deallocates memory for all indirection levels and the binary heap object + * itself. 
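+ *
+ * The nodes themselves are owned by the caller; only the pointer arrays
+ * are freed here. One teardown sketch is to pop everything first, so any
+ * per-node cleanup (hop_exit) runs, and only then drop the heap:
+ *
+ *	struct cfs_binheap_node *e;
+ *
+ *	while ((e = cfs_binheap_find(h, 0)) != NULL)
+ *		cfs_binheap_remove(h, e);   - caller frees its container
+ *	cfs_binheap_destroy(h);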
+ * + * \param[in] h The binary heap object + */ +void +cfs_binheap_destroy(struct cfs_binheap *h) +{ + int idx0; + int idx1; + int n; + + LASSERT(h != NULL); + + n = h->cbh_hwm; + + if (n > 0) { + CBH_FREE(h->cbh_elements1); + n -= CBH_SIZE; + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + CBH_FREE(h->cbh_elements2[idx0]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements2); + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + + for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) { + CBH_FREE(h->cbh_elements3[idx0][idx1]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements3[idx0]); + } + + CBH_FREE(h->cbh_elements3); + } + + LIBCFS_FREE(h, sizeof(*h)); +} +EXPORT_SYMBOL(cfs_binheap_destroy); + +/** + * Obtains a double pointer to a heap element, given its index into the binary + * tree. + * + * \param[in] h The binary heap instance + * \param[in] idx The requested node's index + * + * \retval valid-pointer A double pointer to a heap pointer entry + */ +static struct cfs_binheap_node ** +cfs_binheap_pointer(struct cfs_binheap *h, unsigned int idx) +{ + if (idx < CBH_SIZE) + return &(h->cbh_elements1[idx]); + + idx -= CBH_SIZE; + if (idx < CBH_SIZE * CBH_SIZE) + return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]); + + idx -= CBH_SIZE * CBH_SIZE; + return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)]\ + [(idx >> CBH_SHIFT) & CBH_MASK]\ + [idx & CBH_MASK]); +} + +/** + * Obtains a pointer to a heap element, given its index into the binary tree. + * + * \param[in] h The binary heap + * \param[in] idx The requested node's index + * + * \retval valid-pointer The requested heap node + * \retval NULL Supplied index is out of bounds + */ +struct cfs_binheap_node * +cfs_binheap_find(struct cfs_binheap *h, unsigned int idx) +{ + if (idx >= h->cbh_nelements) + return NULL; + + return *cfs_binheap_pointer(h, idx); +} +EXPORT_SYMBOL(cfs_binheap_find); + +/** + * Moves a node upwards, towards the root of the binary tree. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +cfs_binheap_bubble(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + unsigned int cur_idx = e->chn_index; + struct cfs_binheap_node **cur_ptr; + unsigned int parent_idx; + struct cfs_binheap_node **parent_ptr; + int did_sth = 0; + + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx > 0) { + parent_idx = (cur_idx - 1) >> 1; + + parent_ptr = cfs_binheap_pointer(h, parent_idx); + LASSERT((*parent_ptr)->chn_index == parent_idx); + + if (h->cbh_ops->hop_compare(*parent_ptr, e)) + break; + + (*parent_ptr)->chn_index = cur_idx; + *cur_ptr = *parent_ptr; + cur_ptr = parent_ptr; + cur_idx = parent_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Moves a node downwards, towards the last level of the binary tree. 
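+ *
+ * The common way this path is exercised is root removal: the former tail
+ * node is placed at index 0 and then sunk back into place, roughly
+ *
+ *	struct cfs_binheap_node *e = cfs_binheap_find(h, 0);  - current root
+ *
+ *	if (e != NULL)
+ *		cfs_binheap_remove(h, e);   - the tail replaces it and sinks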
+ * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +cfs_binheap_sink(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int child_idx; + struct cfs_binheap_node **child_ptr; + struct cfs_binheap_node *child; + unsigned int child2_idx; + struct cfs_binheap_node **child2_ptr; + struct cfs_binheap_node *child2; + unsigned int cur_idx; + struct cfs_binheap_node **cur_ptr; + int did_sth = 0; + + cur_idx = e->chn_index; + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx < n) { + child_idx = (cur_idx << 1) + 1; + if (child_idx >= n) + break; + + child_ptr = cfs_binheap_pointer(h, child_idx); + child = *child_ptr; + + child2_idx = child_idx + 1; + if (child2_idx < n) { + child2_ptr = cfs_binheap_pointer(h, child2_idx); + child2 = *child2_ptr; + + if (h->cbh_ops->hop_compare(child2, child)) { + child_idx = child2_idx; + child_ptr = child2_ptr; + child = child2; + } + } + + LASSERT(child->chn_index == child_idx); + + if (h->cbh_ops->hop_compare(e, child)) + break; + + child->chn_index = cur_idx; + *cur_ptr = child; + cur_ptr = child_ptr; + cur_idx = child_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Sort-inserts a node into the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 Element inserted successfully + * \retval != 0 error + */ +int +cfs_binheap_insert(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + struct cfs_binheap_node **new_ptr; + unsigned int new_idx = h->cbh_nelements; + int rc; + + if (new_idx == h->cbh_hwm) { + rc = cfs_binheap_grow(h); + if (rc != 0) + return rc; + } + + if (h->cbh_ops->hop_enter) { + rc = h->cbh_ops->hop_enter(h, e); + if (rc != 0) + return rc; + } + + e->chn_index = new_idx; + new_ptr = cfs_binheap_pointer(h, new_idx); + h->cbh_nelements++; + *new_ptr = e; + + cfs_binheap_bubble(h, e); + + return 0; +} +EXPORT_SYMBOL(cfs_binheap_insert); + +/** + * Removes a node from the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +cfs_binheap_remove(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int cur_idx = e->chn_index; + struct cfs_binheap_node **cur_ptr; + struct cfs_binheap_node *last; + + LASSERT(cur_idx != CBH_POISON); + LASSERT(cur_idx < n); + + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + n--; + last = *cfs_binheap_pointer(h, n); + h->cbh_nelements = n; + if (last == e) + return; + + last->chn_index = cur_idx; + *cur_ptr = last; + cfs_binheap_relocate(h, *cur_ptr); + + e->chn_index = CBH_POISON; + if (h->cbh_ops->hop_exit) + h->cbh_ops->hop_exit(h, e); +} +EXPORT_SYMBOL(cfs_binheap_remove); + +/** + * Relocate a node in the binary heap. + * Should be called whenever a node's values + * which affects its ranking are changed. 
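+ *
+ * A minimal sketch with a hypothetical payload ranked by en_prio; the
+ * comparator returns non-zero when its first argument may stay above the
+ * second, which makes this a min-heap:
+ *
+ *	struct example_node {
+ *		struct cfs_binheap_node	en_node;
+ *		__u64			en_prio;
+ *	};
+ *
+ *	static int example_compare(struct cfs_binheap_node *a,
+ *				   struct cfs_binheap_node *b)
+ *	{
+ *		return container_of(a, struct example_node, en_node)->en_prio <=
+ *		       container_of(b, struct example_node, en_node)->en_prio;
+ *	}
+ *
+ *	node->en_prio = new_prio;                 - re-rank the node
+ *	cfs_binheap_relocate(h, &node->en_node);  - bubble up or sink down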
+ * + * \param[in] h The heap + * \param[in] e The node + */ +void +cfs_binheap_relocate(struct cfs_binheap *h, struct cfs_binheap_node *e) +{ + if (!cfs_binheap_bubble(h, e)) + cfs_binheap_sink(h, e); +} +EXPORT_SYMBOL(cfs_binheap_relocate); +/** @} heap */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c new file mode 100644 index 0000000000000..209333edf6b5b --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -0,0 +1,234 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_table __read_mostly = NULL; +EXPORT_SYMBOL(cfs_cpt_table); + +#ifndef HAVE_LIBCFS_CPT + +#define CFS_CPU_VERSION_MAGIC 0xbabecafe + +#define CFS_CPT_DISTANCE 1 /* Arbitrary positive value */ + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab != NULL) { + cptab->ctb_version = CFS_CPU_VERSION_MAGIC; + cpu_set(0, cptab->ctb_cpumask); + node_set(0, cptab->ctb_nodemask); + cptab->ctb_nparts = ncpt; + } + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc = 0; + + rc = snprintf(buf, len, "%d\t: %d\n", 0, 0); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc = 0; + + rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, CFS_CPT_DISTANCE); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + +int cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} 
+EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + return &cptab->ctb_mask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + return &cptab->ctb_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +{ + return CFS_CPT_DISTANCE; +} +EXPORT_SYMBOL(cfs_cpt_distance); + +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_of_node); + +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +void cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) { + cfs_cpt_table_free(cfs_cpt_table); + cfs_cpt_table = NULL; + } +} + +int cfs_cpu_init(void) +{ + cfs_cpt_table = cfs_cpt_table_alloc(1); + + return cfs_cpt_table != NULL ? 0 : -1; +} + +#endif /* HAVE_LIBCFS_CPT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c new file mode 100644 index 0000000000000..c6ba9e728b688 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_lock.c @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. + * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. + */ +struct cfs_percpt_lock * +cfs_percpt_lock_create(struct cfs_cpt_table *cptab, + struct lock_class_key *keys) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + if (keys == NULL) { + CWARN("Cannot setup class key for percpt lock, you may see " + "recursive locking warnings which are actually fake.\n"); + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) { + spin_lock_init(lock); + if (keys != NULL) + lockdep_set_class(lock, &keys[i]); + } + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_create); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +__acquires(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) +__releases(pcl->pcl_locks) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c new file mode 100644 index 0000000000000..3e83f50579913 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -0,0 +1,172 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
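Returning to the per-CPU-partition lock defined just above, a short sketch under stated assumptions of how a caller takes either the private lock of one partition or the exclusive lock over all of them; "pcl" is assumed to come from cfs_percpt_lock_create() and the function name is invented.

static void demo_percpt_locking(struct cfs_percpt_lock *pcl, int cpt)
{
	cfs_percpt_lock(pcl, cpt);		/* private lock of one partition */
	/* ... update per-partition state ... */
	cfs_percpt_unlock(pcl, cpt);

	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);	/* quiesce every partition */
	/* ... update global state ... */
	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
}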
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. 
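As a usage illustration under stated assumptions (the demo_ names below are invented; cfs_cpt_table is the global partition table exported by libcfs_cpu.c earlier in this patch):

struct demo_counter {
	__u64	dc_hits;
};

static struct demo_counter **demo_counters;

static int demo_counters_setup(void)
{
	demo_counters = cfs_percpt_alloc(cfs_cpt_table, sizeof(**demo_counters));
	return demo_counters == NULL ? -ENOMEM : 0;
}

static void demo_counters_hit(void)
{
	int cpt = cfs_cpt_current(cfs_cpt_table, 1);

	demo_counters[cpt]->dc_hits++;	/* index by the caller's CPU partition */
}

static void demo_counters_cleanup(void)
{
	if (demo_counters != NULL)
		cfs_percpt_free(demo_counters);
}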
+ */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + +/* + * free variable array, see more detail in cfs_array_alloc + */ +void +cfs_array_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] == NULL) + continue; + + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_array_free); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by @count, @size is size of each + * memory block in array. + */ +void * +cfs_array_alloc(int count, unsigned int size) +{ + struct cfs_var_array *arr; + int i; + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_count = count; + arr->va_size = size; + + for (i = 0; i < count; i++) { + LIBCFS_ALLOC(arr->va_ptrs[i], size); + + if (arr->va_ptrs[i] == NULL) { + cfs_array_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_array_alloc); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c new file mode 100644 index 0000000000000..275c01b74ad4e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c @@ -0,0 +1,483 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_UNDEFINED + +#include +#include + +/** + * This API based on Linux kernel padada API which is used to perform + * encryption and decryption on large numbers of packets without + * reordering those packets. + * + * It was adopted for general use in Lustre for parallelization of + * various functionality. + * + * The first step in using it is to set up a cfs_ptask structure to + * control of how this task are to be run: + * + * #include + * + * int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, + * void *cbdata, unsigned int flags, int cpu); + * + * The cbfunc function with cbdata argument will be called in the process + * of getting the task done. The cpu specifies which CPU will be used for + * the final callback when the task is done. + * + * The submission of task is done with: + * + * int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine); + * + * The task is submitted to the engine for execution. 
+ * + * In order to wait for result of task execution you should call: + * + * int cfs_ptask_wait_for(struct cfs_ptask *ptask); + * + * The tasks with flag PTF_ORDERED are executed in parallel but complete + * into submission order. So, waiting for last ordered task you can be sure + * that all previous tasks were done before this task complete. + */ + +#ifndef HAVE_REINIT_COMPLETION +/** + * reinit_completion - reinitialize a completion structure + * @x: pointer to completion structure that is to be reinitialized + * + * This inline function should be used to reinitialize a completion + * structure so it can be reused. This is especially important after + * complete_all() is used. + */ +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} +#endif + +#ifndef HAVE_CPUMASK_PRINT_TO_PAGEBUF +static inline void cpumap_print_to_pagebuf(bool unused, char *buf, + const struct cpumask *mask) +{ + cpulist_scnprintf(buf, PAGE_SIZE, mask); +} +#endif + +#ifdef CONFIG_PADATA +static void cfs_ptask_complete(struct padata_priv *padata) +{ + struct cfs_ptask *ptask = cfs_padata2ptask(padata); + + if (cfs_ptask_need_complete(ptask)) { + if (cfs_ptask_is_ordered(ptask)) + complete(&ptask->pt_completion); + } else if (cfs_ptask_is_autofree(ptask)) { + kfree(ptask); + } +} + +static void cfs_ptask_execute(struct padata_priv *padata) +{ + struct cfs_ptask *ptask = cfs_padata2ptask(padata); + mm_segment_t old_fs = get_fs(); + bool bh_enabled = false; + + if (!cfs_ptask_is_atomic(ptask)) { + local_bh_enable(); + bh_enabled = true; + } + + if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { + use_mm(ptask->pt_mm); + set_fs(ptask->pt_fs); + } + + if (ptask->pt_cbfunc != NULL) + ptask->pt_result = ptask->pt_cbfunc(ptask); + else + ptask->pt_result = -ENOSYS; + + if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { + set_fs(old_fs); + unuse_mm(ptask->pt_mm); + mmput(ptask->pt_mm); + ptask->pt_mm = NULL; + } + + if (cfs_ptask_need_complete(ptask) && !cfs_ptask_is_ordered(ptask)) + complete(&ptask->pt_completion); + + if (bh_enabled) + local_bh_disable(); + + padata_do_serial(padata); +} + +static int cfs_do_parallel(struct cfs_ptask_engine *engine, + struct padata_priv *padata) +{ + struct cfs_ptask *ptask = cfs_padata2ptask(padata); + int rc; + + if (cfs_ptask_need_complete(ptask)) + reinit_completion(&ptask->pt_completion); + + if (cfs_ptask_use_user_mm(ptask)) { + ptask->pt_mm = get_task_mm(current); + ptask->pt_fs = get_fs(); + } + ptask->pt_result = -EINPROGRESS; + +retry: + rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); + if (rc == -EBUSY && cfs_ptask_is_retry(ptask)) { + /* too many tasks already in queue */ + schedule_timeout_uninterruptible(1); + goto retry; + } + + if (rc) { + if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { + mmput(ptask->pt_mm); + ptask->pt_mm = NULL; + } + ptask->pt_result = rc; + } + + return rc; +} + +/** + * This function submit initialized task for async execution + * in engine with specified id. + */ +int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) +{ + struct padata_priv *padata = cfs_ptask2padata(ptask); + + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + memset(padata, 0, sizeof(*padata)); + + padata->parallel = cfs_ptask_execute; + padata->serial = cfs_ptask_complete; + + return cfs_do_parallel(engine, padata); +} + +#else /* !CONFIG_PADATA */ + +/** + * If CONFIG_PADATA is not defined this function just execute + * the initialized task in current thread. 
(emulate async execution) + */ +int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) +{ + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + if (ptask->pt_cbfunc != NULL) + ptask->pt_result = ptask->pt_cbfunc(ptask); + else + ptask->pt_result = -ENOSYS; + + if (cfs_ptask_need_complete(ptask)) + complete(&ptask->pt_completion); + else if (cfs_ptask_is_autofree(ptask)) + kfree(ptask); + + return 0; +} +#endif /* CONFIG_PADATA */ + +EXPORT_SYMBOL(cfs_ptask_submit); + +/** + * This function waits when task complete async execution. + * The tasks with flag PTF_ORDERED are executed in parallel but completes + * into submission order. So, waiting for last ordered task you can be sure + * that all previous tasks were done before this task complete. + */ +int cfs_ptask_wait_for(struct cfs_ptask *ptask) +{ + if (!cfs_ptask_need_complete(ptask)) + return -EINVAL; + + wait_for_completion(&ptask->pt_completion); + + return 0; +} +EXPORT_SYMBOL(cfs_ptask_wait_for); + +/** + * This function initialize internal members of task and prepare it for + * async execution. + */ +int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, void *cbdata, + unsigned int flags, int cpu) +{ + memset(ptask, 0, sizeof(*ptask)); + + ptask->pt_flags = flags; + ptask->pt_cbcpu = cpu; + ptask->pt_mm = NULL; /* will be set in cfs_do_parallel() */ + ptask->pt_fs = get_fs(); + ptask->pt_cbfunc = cbfunc; + ptask->pt_cbdata = cbdata; + ptask->pt_result = -EAGAIN; + + if (cfs_ptask_need_complete(ptask)) { + if (cfs_ptask_is_autofree(ptask)) + return -EINVAL; + + init_completion(&ptask->pt_completion); + } + + if (cfs_ptask_is_atomic(ptask) && cfs_ptask_use_user_mm(ptask)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(cfs_ptask_init); + +/** + * This function set the mask of allowed CPUs for parallel execution + * for engine with specified id. + */ +int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *engine, + const struct cpumask *cpumask) +{ + int rc = 0; + +#ifdef CONFIG_PADATA + cpumask_var_t serial_mask; + cpumask_var_t parallel_mask; + + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + if (!alloc_cpumask_var(&serial_mask, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(¶llel_mask, GFP_KERNEL)) { + free_cpumask_var(serial_mask); + return -ENOMEM; + } + + cpumask_copy(parallel_mask, cpumask); + cpumask_copy(serial_mask, cpu_online_mask); + + rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_PARALLEL, + parallel_mask); + free_cpumask_var(parallel_mask); + if (rc) + goto out_failed_mask; + + rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_SERIAL, + serial_mask); +out_failed_mask: + free_cpumask_var(serial_mask); +#endif /* CONFIG_PADATA */ + + return rc; +} +EXPORT_SYMBOL(cfs_ptengine_set_cpumask); + +/** + * This function returns the count of allowed CPUs for parallel execution + * for engine with specified id. 
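Tying the pieces above together, a hedged sketch of a caller: cfs_ptask_init(), cfs_ptask_submit() and cfs_ptask_wait_for() are the entry points documented at the top of this file. PTF_ORDERED is the only flag named in this file; PTF_COMPLETE is assumed here to be the completion flag the header defines, which cfs_ptask_wait_for() requires.

static int demo_ptask_cb(struct cfs_ptask *ptask)
{
	/* runs in parallel context; the return value lands in pt_result */
	return 0;
}

static int demo_run_one(struct cfs_ptask_engine *engine)
{
	struct cfs_ptask task;
	int rc;

	rc = cfs_ptask_init(&task, demo_ptask_cb, NULL,
			    PTF_COMPLETE | PTF_ORDERED,	/* PTF_COMPLETE assumed */
			    raw_smp_processor_id());
	if (rc != 0)
		return rc;

	rc = cfs_ptask_submit(&task, engine);
	if (rc != 0)
		return rc;

	cfs_ptask_wait_for(&task);
	return task.pt_result;
}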
+ */ +int cfs_ptengine_weight(struct cfs_ptask_engine *engine) +{ + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + return engine->pte_weight; +} +EXPORT_SYMBOL(cfs_ptengine_weight); + +#ifdef CONFIG_PADATA +static int cfs_ptask_cpumask_change_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct padata_cpumask *padata_cpumask = data; + struct cfs_ptask_engine *engine; + + engine = container_of(self, struct cfs_ptask_engine, pte_notifier); + + if (val & PADATA_CPU_PARALLEL) + engine->pte_weight = cpumask_weight(padata_cpumask->pcpu); + + return 0; +} + +static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, + const char *name, + const struct cpumask *cpumask) +{ + cpumask_var_t all_mask; + cpumask_var_t par_mask; + unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE; + int rc; + + get_online_cpus(); + + engine->pte_wq = alloc_workqueue(name, wq_flags, 1); + if (engine->pte_wq == NULL) + GOTO(err, rc = -ENOMEM); + + if (!alloc_cpumask_var(&all_mask, GFP_KERNEL)) + GOTO(err_destroy_workqueue, rc = -ENOMEM); + + if (!alloc_cpumask_var(&par_mask, GFP_KERNEL)) + GOTO(err_free_all_mask, rc = -ENOMEM); + + cpumask_copy(par_mask, cpumask); + if (cpumask_empty(par_mask) || + cpumask_equal(par_mask, cpu_online_mask)) { + cpumask_copy(all_mask, cpu_online_mask); + cpumask_clear(par_mask); + while (!cpumask_empty(all_mask)) { + int cpu = cpumask_first(all_mask); + + cpumask_set_cpu(cpu, par_mask); + cpumask_andnot(all_mask, all_mask, + topology_sibling_cpumask(cpu)); + } + } + + cpumask_copy(all_mask, cpu_online_mask); + + { + char *pa_mask_buff, *cb_mask_buff; + + pa_mask_buff = (char *)__get_free_page(GFP_KERNEL); + if (pa_mask_buff == NULL) + GOTO(err_free_par_mask, rc = -ENOMEM); + + cb_mask_buff = (char *)__get_free_page(GFP_KERNEL); + if (cb_mask_buff == NULL) { + free_page((unsigned long)pa_mask_buff); + GOTO(err_free_par_mask, rc = -ENOMEM); + } + + cpumap_print_to_pagebuf(true, pa_mask_buff, par_mask); + pa_mask_buff[PAGE_SIZE - 1] = '\0'; + cpumap_print_to_pagebuf(true, cb_mask_buff, all_mask); + cb_mask_buff[PAGE_SIZE - 1] = '\0'; + + CDEBUG(D_INFO, "%s weight=%u plist='%s' cblist='%s'\n", + name, cpumask_weight(par_mask), + pa_mask_buff, cb_mask_buff); + + free_page((unsigned long)cb_mask_buff); + free_page((unsigned long)pa_mask_buff); + } + + engine->pte_weight = cpumask_weight(par_mask); + engine->pte_pinst = padata_alloc_possible(engine->pte_wq); + if (engine->pte_pinst == NULL) + GOTO(err_free_par_mask, rc = -ENOMEM); + + engine->pte_notifier.notifier_call = cfs_ptask_cpumask_change_notify; + rc = padata_register_cpumask_notifier(engine->pte_pinst, + &engine->pte_notifier); + if (rc) + GOTO(err_free_padata, rc); + + rc = cfs_ptengine_set_cpumask(engine, par_mask); + if (rc) + GOTO(err_unregister, rc); + + rc = padata_start(engine->pte_pinst); + if (rc) + GOTO(err_unregister, rc); + + free_cpumask_var(par_mask); + free_cpumask_var(all_mask); + + put_online_cpus(); + return 0; + +err_unregister: + padata_unregister_cpumask_notifier(engine->pte_pinst, + &engine->pte_notifier); +err_free_padata: + padata_free(engine->pte_pinst); +err_free_par_mask: + free_cpumask_var(par_mask); +err_free_all_mask: + free_cpumask_var(all_mask); +err_destroy_workqueue: + destroy_workqueue(engine->pte_wq); +err: + put_online_cpus(); + return rc; +} + +static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) +{ + padata_stop(engine->pte_pinst); + padata_unregister_cpumask_notifier(engine->pte_pinst, + &engine->pte_notifier); + 
padata_free(engine->pte_pinst); + destroy_workqueue(engine->pte_wq); +} + +#else /* !CONFIG_PADATA */ + +static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, + const char *name, + const struct cpumask *cpumask) +{ + engine->pte_weight = 1; + + return 0; +} + +static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) +{ +} +#endif /* CONFIG_PADATA */ + +struct cfs_ptask_engine *cfs_ptengine_init(const char *name, + const struct cpumask *cpumask) +{ + struct cfs_ptask_engine *engine; + int rc; + + engine = kzalloc(sizeof(*engine), GFP_KERNEL); + if (engine == NULL) + GOTO(err, rc = -ENOMEM); + + rc = cfs_ptengine_padata_init(engine, name, cpumask); + if (rc) + GOTO(err_free_engine, rc); + + return engine; + +err_free_engine: + kfree(engine); +err: + return ERR_PTR(rc); +} +EXPORT_SYMBOL(cfs_ptengine_init); + +void cfs_ptengine_fini(struct cfs_ptask_engine *engine) +{ + if (IS_ERR_OR_NULL(engine)) + return; + + cfs_ptengine_padata_fini(engine); + kfree(engine); +} +EXPORT_SYMBOL(cfs_ptengine_fini); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c new file mode 100644 index 0000000000000..04e1dd56dd430 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c @@ -0,0 +1,596 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. + * + * libcfs/libcfs/libcfs_string.c + * + * Author: Nathan Rutman + */ + +#include + +char *cfs_strrstr(const char *haystack, const char *needle) +{ + char *ptr; + + if (unlikely(haystack == NULL || needle == NULL)) + return NULL; + + if (strlen(needle) == 1) + return strrchr(haystack, needle[0]); + + ptr = strstr(haystack, needle); + if (ptr != NULL) { + while (1) { + char *tmp; + + tmp = strstr(&ptr[1], needle); + if (tmp == NULL) + return ptr; + + ptr = tmp; + } + } + + return NULL; +} +EXPORT_SYMBOL(cfs_strrstr); + +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask) +{ + const char *debugstr; + char op = 0; + int newmask = minmask, i, len, found = 0; + ENTRY; + + /* must be a list of tokens separated by whitespace + * and optionally an operator ('+' or '-'). If an operator + * appears first in , '*oldmask' is used as the starting point + * (relative), otherwise minmask is used (absolute). 
An operator + * applies to all following tokens up to the next operator. */ + while (*str != 0) { + while (isspace(*str)) + str++; + if (*str == 0) + break; + if (*str == '+' || *str == '-') { + op = *str++; + if (!found) + /* only if first token is relative */ + newmask = *oldmask; + while (isspace(*str)) + str++; + if (*str == 0) /* trailing op */ + return -EINVAL; + } + + /* find token length */ + for (len = 0; str[len] != 0 && !isspace(str[len]) && + str[len] != '+' && str[len] != '-'; len++); + + /* match token */ + found = 0; + for (i = 0; i < 32; i++) { + debugstr = bit2str(i); + if (debugstr != NULL && + strlen(debugstr) == len && + strncasecmp(str, debugstr, len) == 0) { + if (op == '-') + newmask &= ~(1 << i); + else + newmask |= (1 << i); + found = 1; + break; + } + } + if (!found && len == 3 && + (strncasecmp(str, "ALL", len) == 0)) { + if (op == '-') + newmask = minmask; + else + newmask = allmask; + found = 1; + } + if (!found) { + CWARN("unknown mask '%.*s'.\n" + "mask usage: [+|-] ...\n", len, str); + return -EINVAL; + } + str += len; + } + + *oldmask = newmask; + return 0; +} +EXPORT_SYMBOL(cfs_str2mask); + +/* get the first string out of @str */ +char *cfs_firststr(char *str, size_t size) +{ + size_t i = 0; + char *end; + + /* trim leading spaces */ + while (i < size && *str && isspace(*str)) { + ++i; + ++str; + } + + /* string with all spaces */ + if (*str == '\0') + goto out; + + end = str; + while (i < size && *end != '\0' && !isspace(*end)) { + ++i; + ++end; + } + + *end= '\0'; +out: + return str; +} +EXPORT_SYMBOL(cfs_firststr); + +char * +cfs_trimwhite(char *str) +{ + char *end; + + while (isspace(*str)) + str++; + + end = str + strlen(str); + while (end > str) { + if (!isspace(end[-1])) + break; + end--; + } + + *end = 0; + return str; +} +EXPORT_SYMBOL(cfs_trimwhite); + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. + * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} +EXPORT_SYMBOL(cfs_gettok); + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. 
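A small sketch of the bit2str() contract that cfs_str2mask() above relies on; the token names and mask width are invented for illustration.

static const char *demo_bit2str(int bit)
{
	static const char *names[] = { "reada", "writea", "cache" };

	if (bit < 0 || bit >= (int)ARRAY_SIZE(names))
		return NULL;
	return names[bit];
}

static int demo_parse_mask(const char *str, int *mask)
{
	/* "+cache -reada" adjusts *mask relative to its old value,
	 * "cache writea" replaces it, and "ALL" selects the full mask */
	return cfs_str2mask(str, demo_bit2str, mask, 0, 0x7);
}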
+ * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + *num = simple_strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} +EXPORT_SYMBOL(cfs_str2num_check); + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + `* src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + LIBCFS_ALLOC(re, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + LIBCFS_FREE(re, sizeof(*re)); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. + * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = scnprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = scnprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = scnprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. 
+ * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += scnprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += scnprintf(buffer + i, count - i, "]"); + + return i; +} +EXPORT_SYMBOL(cfs_expr_list_print); + +/** + * Matches value (\a value) against ranges expression list \a expr_list. + * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} +EXPORT_SYMBOL(cfs_expr_list_match); + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) { + CERROR("Number of values %d exceeds max allowed %d\n", + max, count); + return -EINVAL; + } + + LIBCFS_ALLOC(val, sizeof(val[0]) * count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} +EXPORT_SYMBOL(cfs_expr_list_values); + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + LIBCFS_FREE(values, num * sizeof(values[0])); +} +EXPORT_SYMBOL(cfs_expr_list_values_free); + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + LIBCFS_FREE(expr, sizeof(*expr)); + } + + LIBCFS_FREE(expr_list, sizeof(*expr_list)); +} +EXPORT_SYMBOL(cfs_expr_list_free); + +/** + * Parses \ token of the syntax. 
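A hedged end-to-end sketch using the expression-list helpers in this file: parse a bracketed range with a stride, test membership, then free the list. The value range and the demo_ name are illustrative only.

static int demo_expr_list(void)
{
	struct cfs_expr_list *el = NULL;
	char buf[] = "[0-6/2]";
	int rc;

	rc = cfs_expr_list_parse(buf, strlen(buf), 0, 255, &el);
	if (rc != 0)
		return rc;

	rc = cfs_expr_list_match(4, el);	/* list holds 0, 2, 4, 6 */
	cfs_expr_list_free(el);

	return rc == 1 ? 0 : -EINVAL;
}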
+ * + * \retval 0 if \a str parses to \ | \ + * \retval -errno otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + LIBCFS_ALLOC(expr_list, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} +EXPORT_SYMBOL(cfs_expr_list_parse); + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. + * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} +EXPORT_SYMBOL(cfs_expr_list_free_list); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S new file mode 100644 index 0000000000000..ede54c7084d4d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32-pclmul_asm.S @@ -0,0 +1,243 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. 
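For context, a C-side wrapper sketched under explicit assumptions: the routine in this file requires a 16-byte-aligned buffer, a 16-byte-multiple length greater than 63, and ownership of the XMM registers via kernel_fpu_begin()/kernel_fpu_end(); crc32_le() is the stock kernel fallback, the prototype mirrors the usage note further down, and the wrapper name is invented.

u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);

static u32 demo_crc32_le(u32 crc, const u8 *buf, size_t len)
{
	if (len >= 64 && ((unsigned long)buf & 15) == 0 && (len & 15) == 0 &&
	    irq_fpu_usable()) {
		kernel_fpu_begin();
		crc = crc32_pclmul_le_16(buf, len, crc);
		kernel_fpu_end();
		return crc;
	}

	return crc32_le(crc, buf, len);	/* generic path for small or unaligned input */
}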
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas + * Alexander Boyko + */ + +#define __ASSEMBLY__ 1 +#include "inst.h" + +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define BUF %rdi +#define LEN %rsi +#define CRC %edx +#else +#define BUF %eax +#define LEN %edx +#define CRC %ecx +#endif + +.text +/** + * Calculate crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pclmul_le_16(unsigned char const *buffer, + * size_t len, uint crc32) + */ +.globl crc32_pclmul_le_16 +.align 4, 0x90 +crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF +#ifndef __x86_64__ + /* This is for position independed code(-fPIC) support for 32bit */ + call delta +delta: + pop %ecx +#endif + cmp $0x40, LEN + jb less_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT +#endif + +loop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + PCLMULQDQ 00, CONSTANT, %xmm1 + PCLMULQDQ 00, CONSTANT, %xmm2 + PCLMULQDQ 00, CONSTANT, %xmm3 +#ifdef __x86_64__ + PCLMULQDQ 00, CONSTANT, %xmm4 +#endif + PCLMULQDQ 0x11, CONSTANT, %xmm5 + PCLMULQDQ 0x11, CONSTANT, %xmm6 + PCLMULQDQ 0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + PCLMULQDQ 0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + PCLMULQDQ 00, CONSTANT, %xmm4 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge loop_64 +less_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + 
PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb fold_64 +loop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5 - delta(%ecx), CONSTANT + movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + PEXTRD 0x01, %xmm1, %eax + + ret diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S new file mode 100644 index 0000000000000..5c896b95024ea --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S @@ -0,0 +1,466 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + * Wajdi Feghali + * James Guilford + * David Cote + * Tim Chen + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "inst.h" + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j + jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. + +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +ENTRY(crc_pcl) +#define bufp %rdi +#define bufp_dw %edi +#define bufp_w %di +#define bufp_b %dil +#define bufptmp %rcx +#define block_0 %rcx +#define block_1 %rdx +#define block_2 %r11 +#define len %rsi +#define len_dw %esi +#define len_w %si +#define len_b %sil +#define crc_init_arg %rdx +#define tmp %rbx +#define crc_init %r8 +#define crc_init_dw %r8d +#define crc1 %r9 +#define crc2 %r10 + + pushq %rbx + pushq %rdi + pushq %rsi + + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + + ################################################################ + ## 1) ALIGN: + ################################################################ + + mov bufp, bufptmp # rdi = *buf + neg bufp + and $7, bufp # calculate the unalignment amount of + # the address + je proc_block # Skip if aligned + + ## If len is less than 8 and we're unaligned, we need to jump + ## to special code to avoid reading beyond the end of the buffer + cmp $8, len + jae do_align + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $32-3+1, len_dw + jmp less_than_8_post_shl1 + +do_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer + add bufp, bufptmp # align buffer pointer for quadword + # processing + sub bufp, len # update buffer length +align_loop: + crc32b %bl, crc_init_dw # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec bufp + jne align_loop + +proc_block: + + ################################################################ + ## 2) PROCESS BLOCKS: + ################################################################ + + ## compute num of bytes to be processed + movq len, tmp # save num bytes in tmp + + cmpq $128*24, len + jae full_block + +continue_block: + cmpq $SMALL_SIZE, len + jb small + + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do + + ## process rax 24-byte chunks (128 >= rax >= 0) + + ## compute end address of each block + ## block 0 (base addr + RAX * 8) + ## block 1 (base addr + RAX * 16) + ## block 2 (base addr + RAX * 24) + lea (bufptmp, %rax, 8), block_0 + lea (block_0, %rax, 8), block_1 + lea (block_1, %rax, 8), block_2 + + xor crc1, crc1 + xor crc2, crc2 + + ## branch into array + lea jump_table(%rip), bufp + movzxw (bufp, %rax, 2), len + offset=crc_array-jump_table + lea offset(bufp, len, 1), bufp + jmp *bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: + 
################################################################ +full_block: + movq $128,%rax + lea 128*8*2(block_0), block_1 + lea 128*8*3(block_0), block_2 + add $128*8*1, block_0 + + xor crc1,crc1 + xor crc2,crc2 + + # Fall thruogh into top of crc array (crc_128) + + ################################################################ + ## 3) CRC Array: + ################################################################ + +crc_array: + i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 +# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + + mov block_2, block_0 + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-16)(%rip), bufp # first entry is for idx 1 + shlq $3, %rax # rax *= 8 + subq %rax, tmp # tmp -= rax*8 + shlq $1, %rax + subq %rax, tmp # tmp -= rax*16 + # (total tmp -= rax*24) + addq %rax, bufp + + movdqa (bufp), %xmm0 # 2 consts: K1:K2 + + movq crc_init, %xmm1 # CRC for block 1 + PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax + mov crc2, crc_init + crc32 %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae full_block + cmp $24, tmp + jae continue_block + +less_than_24: + shl $32-4, len_dw # less_than_16 expects length + # in upper 4 bits of len_dw + jnc less_than_16 + crc32q (bufptmp), crc_init + crc32q 8(bufptmp), crc_init + jz do_return + add $16, bufptmp + # len is less than 8 if we got here + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $2, len_dw + jmp less_than_8_post_shl1 + + ####################################################################### + ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ####################################################################### +small: + shl $32-8, len_dw # Prepare len_dw for less_than_256 + j=256 +.rept 5 # j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j # less_than_j: Length should be in + # upper lg(j) bits of len_dw + j=(j/2) + shl $1, len_dw # Get next MSB + JNC_LESS_THAN %j +.noaltmacro + i=0 +.rept (j/8) + crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data + i=i+8 +.endr + jz do_return # Return if remaining length is zero + add $j, bufptmp # Advance buf +.endr + +less_than_8: # Length should be stored in + # upper 3 bits of len_dw + shl $1, len_dw +less_than_8_post_shl1: + jnc less_than_4 + crc32l (bufptmp), crc_init_dw # CRC of 4 bytes + jz do_return # return if remaining data is zero + add $4, bufptmp +less_than_4: # Length should be stored in + # upper 2 bits of len_dw + shl $1, len_dw + jnc less_than_2 + crc32w (bufptmp), crc_init_dw # CRC of 2 bytes + jz do_return # return if remaining data is zero + add $2, bufptmp +less_than_2: # Length should be stored in the MSB + # of len_dw + shl $1, len_dw + jnc less_than_1 + crc32b (bufptmp), crc_init_dw # CRC of 1 byte +less_than_1: # Length should be zero +do_return: + movq 
crc_init, %rax + popq %rsi + popq %rdi + popq %rbx + ret + + ################################################################ + ## jump table Table is 129 entries x 2 bytes each + ################################################################ +.align 4 +jump_table: + i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro + i=i+1 +.endr + +ENDPROC(crc_pcl) + + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 quad words each + ################################################################ +.data +.align 64 +K_table: + .quad 0x14cd00bd6,0x105ec76f0 + .quad 0x0ba4fc28e,0x14cd00bd6 + .quad 0x1d82c63da,0x0f20c0dfe + .quad 0x09e4addf8,0x0ba4fc28e + .quad 0x039d3b296,0x1384aa63a + .quad 0x102f9b8a2,0x1d82c63da + .quad 0x14237f5e6,0x01c291d04 + .quad 0x00d3b6092,0x09e4addf8 + .quad 0x0c96cfdc0,0x0740eef02 + .quad 0x18266e456,0x039d3b296 + .quad 0x0daece73e,0x0083a6eec + .quad 0x0ab7aff2a,0x102f9b8a2 + .quad 0x1248ea574,0x1c1733996 + .quad 0x083348832,0x14237f5e6 + .quad 0x12c743124,0x02ad91c30 + .quad 0x0b9e02b86,0x00d3b6092 + .quad 0x018b33a4e,0x06992cea2 + .quad 0x1b331e26a,0x0c96cfdc0 + .quad 0x17d35ba46,0x07e908048 + .quad 0x1bf2e8b8a,0x18266e456 + .quad 0x1a3e0968a,0x11ed1f9d8 + .quad 0x0ce7f39f4,0x0daece73e + .quad 0x061d82e56,0x0f1d0f55e + .quad 0x0d270f1a2,0x0ab7aff2a + .quad 0x1c3f5f66c,0x0a87ab8a8 + .quad 0x12ed0daac,0x1248ea574 + .quad 0x065863b64,0x08462d800 + .quad 0x11eef4f8e,0x083348832 + .quad 0x1ee54f54c,0x071d111a8 + .quad 0x0b3e32c28,0x12c743124 + .quad 0x0064f7f26,0x0ffd852c6 + .quad 0x0dd7e3b0c,0x0b9e02b86 + .quad 0x0f285651c,0x0dcb17aa4 + .quad 0x010746f3c,0x018b33a4e + .quad 0x1c24afea4,0x0f37c5aee + .quad 0x0271d9844,0x1b331e26a + .quad 0x08e766a0c,0x06051d5a2 + .quad 0x093a5f730,0x17d35ba46 + .quad 0x06cb08e5c,0x11d5ca20e + .quad 0x06b749fb2,0x1bf2e8b8a + .quad 0x1167f94f2,0x021f3d99c + .quad 0x0cec3662e,0x1a3e0968a + .quad 0x19329634a,0x08f158014 + .quad 0x0e6fc4e6a,0x0ce7f39f4 + .quad 0x08227bb8a,0x1a5e82106 + .quad 0x0b0cd4768,0x061d82e56 + .quad 0x13c2b89c4,0x188815ab2 + .quad 0x0d7a4825c,0x0d270f1a2 + .quad 0x10f5ff2ba,0x105405f3e + .quad 0x00167d312,0x1c3f5f66c + .quad 0x0f6076544,0x0e9adf796 + .quad 0x026f6a60a,0x12ed0daac + .quad 0x1a2adb74e,0x096638b34 + .quad 0x19d34af3a,0x065863b64 + .quad 0x049c3cc9c,0x1e50585a0 + .quad 0x068bce87a,0x11eef4f8e + .quad 0x1524fa6c6,0x19f1c69dc + .quad 0x16cba8aca,0x1ee54f54c + .quad 0x042d98888,0x12913343e + .quad 0x1329d9f7e,0x0b3e32c28 + .quad 0x1b1c69528,0x088f25a3a + .quad 0x02178513a,0x0064f7f26 + .quad 0x0e0ac139e,0x04e36f0b0 + .quad 0x0170076fa,0x0dd7e3b0c + .quad 0x141a1a2e2,0x0bd6f81f8 + .quad 0x16ad828b4,0x0f285651c + .quad 0x041d17b64,0x19425cbba + .quad 0x1fae1cc66,0x010746f3c + .quad 0x1a75b4b00,0x18db37e8a + .quad 0x0f872e54c,0x1c24afea4 + .quad 0x01e41e9fc,0x04c144932 + .quad 0x086d8e4d2,0x0271d9844 + .quad 0x160f7af7a,0x052148f02 + .quad 0x05bb8f1bc,0x08e766a0c + .quad 0x0a90fd27a,0x0a3c6f37a + .quad 0x0b3af077a,0x093a5f730 + .quad 0x04984d782,0x1d22c238e + .quad 0x0ca6ef3ac,0x06cb08e5c + .quad 0x0234e0b26,0x063ded06a + .quad 0x1d88abd4a,0x06b749fb2 + .quad 0x04597456a,0x04d56973c + .quad 0x0e9e28eb4,0x1167f94f2 + .quad 0x07b3ff57a,0x19385bf2e + .quad 0x0c9c8b782,0x0cec3662e + .quad 0x13a9cba9e,0x0e417f38a + .quad 0x093e106a4,0x19329634a + .quad 0x167001a9c,0x14e727980 + .quad 0x1ddffc5d4,0x0e6fc4e6a + .quad 0x00df04680,0x0d104b8fc + .quad 0x02342001e,0x08227bb8a + .quad 0x00a2a8d7e,0x05b397730 + .quad 0x168763fa6,0x0b0cd4768 + .quad 
0x1ed5a407a,0x0e78eb416 + .quad 0x0d2c3ed1a,0x13c2b89c4 + .quad 0x0995a5724,0x1641378f0 + .quad 0x19b1afbc4,0x0d7a4825c + .quad 0x109ffedc0,0x08d96551c + .quad 0x0f2271e60,0x10f5ff2ba + .quad 0x00b0bf8ca,0x00bf80dd2 + .quad 0x123888b7a,0x00167d312 + .quad 0x1e888f7dc,0x18dcddd1c + .quad 0x002ee03b2,0x0f6076544 + .quad 0x183e8d8fe,0x06a45d2b2 + .quad 0x133d7a042,0x026f6a60a + .quad 0x116b0f50c,0x1dd3e10e8 + .quad 0x05fabe670,0x1a2adb74e + .quad 0x130004488,0x0de87806c + .quad 0x000bcf5f6,0x19d34af3a + .quad 0x18f0c7078,0x014338754 + .quad 0x017f27698,0x049c3cc9c + .quad 0x058ca5f00,0x15e3e77ee + .quad 0x1af900c24,0x068bce87a + .quad 0x0b5cfca28,0x0dd07448e + .quad 0x0ded288f8,0x1524fa6c6 + .quad 0x059f229bc,0x1d8048348 + .quad 0x06d390dec,0x16cba8aca + .quad 0x037170390,0x0a3e3e02c + .quad 0x06353c1cc,0x042d98888 + .quad 0x0c4584f5c,0x0d73c7bea + .quad 0x1f16a3418,0x1329d9f7e + .quad 0x0531377e2,0x185137662 + .quad 0x1d8d9ca7c,0x1b1c69528 + .quad 0x0b25b29f2,0x18a08b5bc + .quad 0x19fb2a8b0,0x02178513a + .quad 0x1a08fe6ac,0x1da758ae0 + .quad 0x045cddf4e,0x0e0ac139e + .quad 0x1a91647f2,0x169cf9eb0 + .quad 0x1a0f717c4,0x0170076fa diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h b/drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h new file mode 100644 index 0000000000000..3e115273ed885 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/inst.h @@ -0,0 +1,310 @@ +/* + * Generate .byte code for some instructions not supported by old + * binutils. + */ +#ifndef X86_ASM_INST_H +#define X86_ASM_INST_H + +#ifdef __ASSEMBLY__ + +#define REG_NUM_INVALID 100 + +#define REG_TYPE_R32 0 +#define REG_TYPE_R64 1 +#define REG_TYPE_XMM 2 +#define REG_TYPE_INVALID 100 + + .macro R32_NUM opd r32 + \opd = REG_NUM_INVALID + .ifc \r32,%eax + \opd = 0 + .endif + .ifc \r32,%ecx + \opd = 1 + .endif + .ifc \r32,%edx + \opd = 2 + .endif + .ifc \r32,%ebx + \opd = 3 + .endif + .ifc \r32,%esp + \opd = 4 + .endif + .ifc \r32,%ebp + \opd = 5 + .endif + .ifc \r32,%esi + \opd = 6 + .endif + .ifc \r32,%edi + \opd = 7 + .endif +#ifdef CONFIG_X86_64 + .ifc \r32,%r8d + \opd = 8 + .endif + .ifc \r32,%r9d + \opd = 9 + .endif + .ifc \r32,%r10d + \opd = 10 + .endif + .ifc \r32,%r11d + \opd = 11 + .endif + .ifc \r32,%r12d + \opd = 12 + .endif + .ifc \r32,%r13d + \opd = 13 + .endif + .ifc \r32,%r14d + \opd = 14 + .endif + .ifc \r32,%r15d + \opd = 15 + .endif +#endif + .endm + + .macro R64_NUM opd r64 + \opd = REG_NUM_INVALID +#ifdef CONFIG_X86_64 + .ifc \r64,%rax + \opd = 0 + .endif + .ifc \r64,%rcx + \opd = 1 + .endif + .ifc \r64,%rdx + \opd = 2 + .endif + .ifc \r64,%rbx + \opd = 3 + .endif + .ifc \r64,%rsp + \opd = 4 + .endif + .ifc \r64,%rbp + \opd = 5 + .endif + .ifc \r64,%rsi + \opd = 6 + .endif + .ifc \r64,%rdi + \opd = 7 + .endif + .ifc \r64,%r8 + \opd = 8 + .endif + .ifc \r64,%r9 + \opd = 9 + .endif + .ifc \r64,%r10 + \opd = 10 + .endif + .ifc \r64,%r11 + \opd = 11 + .endif + .ifc \r64,%r12 + \opd = 12 + .endif + .ifc \r64,%r13 + \opd = 13 + .endif + .ifc \r64,%r14 + \opd = 14 + .endif + .ifc \r64,%r15 + \opd = 15 + .endif +#endif + .endm + + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + 
.endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro REG_TYPE type reg + R32_NUM reg_type_r32 \reg + R64_NUM reg_type_r64 \reg + XMM_NUM reg_type_xmm \reg + .if reg_type_r64 <> REG_NUM_INVALID + \type = REG_TYPE_R64 + .elseif reg_type_r32 <> REG_NUM_INVALID + \type = REG_TYPE_R32 + .elseif reg_type_xmm <> REG_NUM_INVALID + \type = REG_TYPE_XMM + .else + \type = REG_TYPE_INVALID + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PSHUFB_XMM xmm1 xmm2 + XMM_NUM pshufb_opd1 \xmm1 + XMM_NUM pshufb_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX pshufb_opd1 pshufb_opd2 + .byte 0x0f, 0x38, 0x00 + MODRM 0xc0 pshufb_opd1 pshufb_opd2 + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro PEXTRD imm8 xmm gpr + R32_NUM extrd_opd1 \gpr + XMM_NUM extrd_opd2 \xmm + PFX_OPD_SIZE + PFX_REX extrd_opd1 extrd_opd2 + .byte 0x0f, 0x3a, 0x16 + MODRM 0xc0 extrd_opd1 extrd_opd2 + .byte \imm8 + .endm + + .macro AESKEYGENASSIST rcon xmm1 xmm2 + XMM_NUM aeskeygen_opd1 \xmm1 + XMM_NUM aeskeygen_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aeskeygen_opd1 aeskeygen_opd2 + .byte 0x0f, 0x3a, 0xdf + MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 + .byte \rcon + .endm + + .macro AESIMC xmm1 xmm2 + XMM_NUM aesimc_opd1 \xmm1 + XMM_NUM aesimc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesimc_opd1 aesimc_opd2 + .byte 0x0f, 0x38, 0xdb + MODRM 0xc0 aesimc_opd1 aesimc_opd2 + .endm + + .macro AESENC xmm1 xmm2 + XMM_NUM aesenc_opd1 \xmm1 + XMM_NUM aesenc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenc_opd1 aesenc_opd2 + .byte 0x0f, 0x38, 0xdc + MODRM 0xc0 aesenc_opd1 aesenc_opd2 + .endm + + .macro AESENCLAST xmm1 xmm2 + XMM_NUM aesenclast_opd1 \xmm1 + XMM_NUM aesenclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenclast_opd1 aesenclast_opd2 + .byte 0x0f, 0x38, 0xdd + MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 + .endm + + .macro AESDEC xmm1 xmm2 + XMM_NUM aesdec_opd1 \xmm1 + XMM_NUM aesdec_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdec_opd1 aesdec_opd2 + .byte 0x0f, 0x38, 0xde + MODRM 0xc0 aesdec_opd1 aesdec_opd2 + .endm + + .macro AESDECLAST xmm1 xmm2 + XMM_NUM aesdeclast_opd1 \xmm1 + XMM_NUM aesdeclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdeclast_opd1 aesdeclast_opd2 + .byte 0x0f, 0x38, 0xdf + MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 + .endm + + .macro MOVQ_R64_XMM opd1 opd2 + REG_TYPE movq_r64_xmm_opd1_type \opd1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + XMM_NUM movq_r64_xmm_opd1 \opd1 + R64_NUM movq_r64_xmm_opd2 \opd2 + .else + R64_NUM movq_r64_xmm_opd1 \opd1 + XMM_NUM movq_r64_xmm_opd2 \opd2 + .endif + PFX_OPD_SIZE + PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + .byte 0x0f, 0x7e + .else + .byte 0x0f, 0x6e + .endif + MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 + .endm +#endif + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c new file mode 100644 index 0000000000000..b7d6193425b41 --- /dev/null +++ 
b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c @@ -0,0 +1,1178 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +#ifdef CONFIG_SMP + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. 
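+ *
+ * As a worked example (illustrative only, for a hypothetical two-node
+ * machine with CPUs 0-3 on node 0 and CPUs 4-7 on node 1), a modprobe
+ * option such as
+ *
+ *     options libcfs cpu_pattern="N 0[0] 1[1]"
+ *
+ * creates one CPU partition per NUMA node and, on that topology, is the
+ * same as spelling out processor IDs with "0[0,1,2,3] 1[4,5,6,7]"
+ * (the "options libcfs ..." form assumes the parameter is loaded with
+ * the libcfs module, as in a stock build).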
+ * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = "N"; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + +void cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt != NULL) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } + + if (cptab->ctb_node2cpt != NULL) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } + + for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask != NULL) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask != NULL) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts != NULL) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask != NULL) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask != NULL) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab == NULL) + return NULL; + + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + + if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) + goto failed; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + if (cptab->ctb_cpu2cpt == NULL) + goto failed; + + memset(cptab->ctb_cpu2cpt, -1, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + if (cptab->ctb_node2cpt == NULL) + goto failed; + + memset(cptab->ctb_node2cpt, -1, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (cptab->ctb_parts == NULL) + goto failed; + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + if (!part->cpt_cpumask) + goto failed; + + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (!part->cpt_nodemask) + goto failed; + + LIBCFS_ALLOC(part->cpt_distance, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + if (!part->cpt_distance) + goto failed; + } + + return cptab; + +failed: + cfs_cpt_table_free(cptab); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, " %d", j); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; + +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int 
cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; + + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for (j = 0; j < cptab->ctb_nparts; j++) { + rc = snprintf(tmp, len, " %d:%d", + j, cptab->ctb_parts[i].cpt_distance[j]); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; + +err: + return -E2BIG; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + +int cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +{ + LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); + LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); + + if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) + return cptab->ctb_distance; + + return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; +} +EXPORT_SYMBOL(cfs_cpt_distance); + +/* + * Calculate the maximum NUMA distance between all nodes in the + * from_mask and all nodes in the to_mask. 
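+ *
+ * For example, with a typical two-socket SLIT (node_distance() returning
+ * 10 for the local node and 21 for the remote node; the actual values
+ * are firmware-provided), the distance from {0} to {0, 1} is
+ * max(10, 21) = 21, while the distance from {0} to {0} is 10.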
+ */ +static unsigned cfs_cpt_distance_calculate(nodemask_t *from_mask, + nodemask_t *to_mask) +{ + unsigned maximum; + unsigned distance; + int to; + int from; + + maximum = 0; + for_each_node_mask(from, *from_mask) { + for_each_node_mask(to, *to_mask) { + distance = node_distance(from, to); + if (maximum < distance) + maximum = distance; + } + } + return maximum; +} + +static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cptab->ctb_cpu2cpt[cpu] = cpt; + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); +} + +static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + + cptab->ctb_cpu2cpt[cpu] = -1; +} + +static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + int cpt2; + struct cfs_cpu_partition *part; + struct cfs_cpu_partition *part2; + + if (!node_isset(node, *cptab->ctb_nodemask)) { + /* first time node is added to the CPT table */ + node_set(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = cpt; + cptab->ctb_distance = cfs_cpt_distance_calculate( + cptab->ctb_nodemask, + cptab->ctb_nodemask); + } + + part = &cptab->ctb_parts[cpt]; + if (!node_isset(node, *part->cpt_nodemask)) { + /* first time node is added to this CPT */ + node_set(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + part2 = &cptab->ctb_parts[cpt2]; + part->cpt_distance[cpt2] = cfs_cpt_distance_calculate( + part->cpt_nodemask, + part2->cpt_nodemask); + part2->cpt_distance[cpt] = cfs_cpt_distance_calculate( + part2->cpt_nodemask, + part->cpt_nodemask); + } + } +} + +static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + int cpu; + int cpt2; + struct cfs_cpu_partition *part; + struct cfs_cpu_partition *part2; + + part = &cptab->ctb_parts[cpt]; + + for_each_cpu(cpu, part->cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { + /* No more CPUs in the node for this CPT. */ + node_clear(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + part2 = &cptab->ctb_parts[cpt2]; + if (node_isset(node, *part2->cpt_nodemask)) + cptab->ctb_node2cpt[node] = cpt2; + part->cpt_distance[cpt2] = cfs_cpt_distance_calculate( + part->cpt_nodemask, + part2->cpt_nodemask); + part2->cpt_distance[cpt] = cfs_cpt_distance_calculate( + part2->cpt_nodemask, + part->cpt_nodemask); + } + } + + for_each_cpu(cpu, cptab->ctb_cpumask) { + /* this CPT-table has other CPUs belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { + /* No more CPUs in the table for this node. 
*/ + node_clear(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = -1; + cptab->ctb_distance = + cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + } +} + +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, "Try to unset cpu %d which is " + "not in CPT-table %p\n", cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, "CPU %d is not in CPU partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + if (cpumask_weight(mask) == 0 || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, "No online CPU is found in the CPU mask " + "for CPU partition %d\n", cpt); + return 0; + } + + for_each_cpu(cpu, mask) { + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, + const cpumask_t *mask) +{ + int cpu; + + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_add_cpu(cptab, cpt, cpu); + + cfs_cpt_add_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); + + cfs_cpt_del_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, + const nodemask_t *mask) +{ + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + int rotor; + int node = 0; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; + } + + weight = nodes_weight(*mask); + if (weight > 0) { + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (rotor-- == 0) + return node; + } + } + + return node; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu = smp_processor_id(); + int cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0) { + if (!remap) + return cpt; + + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID */ + cpt = cpu % cptab->ctb_nparts; + } + + return cpt; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) +{ + if (node < 0 || node > nr_node_ids) + return CFS_CPT_ANY; + + return cptab->ctb_node2cpt[node]; +} +EXPORT_SYMBOL(cfs_cpt_of_node); + +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (!cpumask_intersects(cpumask, cpu_online_mask)) { + CDEBUG(D_INFO, "No online CPU found in CPU partition %d, did " + "someone do CPU hotplug on system? You might need to " + "reload Lustre modules to keep system working well.\n", + cpt); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (rc == 0) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. 
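+ *
+ * Concretely, the node_mask is consumed socket by socket and, within a
+ * socket, core by core: all hyper-thread siblings of a core are claimed
+ * before the walk moves on.  A request for two CPUs on an HT-enabled
+ * node is therefore satisfied with both siblings of one core rather
+ * than one thread from each of two cores, which keeps the partition
+ * cache-friendly.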
+ */ +static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node_mask, int number) +{ + cpumask_t *socket_mask = NULL; + cpumask_t *core_mask = NULL; + int rc = 0; + int cpu; + int i; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket_mask, cpumask_size()); + LIBCFS_ALLOC(core_mask, cpumask_size()); + if (socket_mask == NULL || core_mask == NULL) { + rc = -ENOMEM; + goto out; + } + + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + + /* get cpumask for cores in the same socket */ + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { + /* get cpumask for hts in the same core */ + cpumask_and(core_mask, + topology_sibling_cpumask(cpu), node_mask); + + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); + + if (!cpu_online(i)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (--number == 0) + goto out; + } + cpu = cpumask_first(socket_mask); + } + } + +out: + if (core_mask != NULL) + LIBCFS_FREE(core_mask, cpumask_size()); + if (socket_mask != NULL) + LIBCFS_FREE(socket_mask, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4 + +static int cfs_cpt_num_estimate(void) +{ + int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + int ncpu = num_online_cpus(); + int ncpt = 1; + + if (ncpu > CPT_WEIGHT_MIN) + for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++); + /* nothing */ + +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory */ + ncpt = min(2, ncpt); +#endif + while (ncpu % ncpt != 0) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *node_mask = NULL; + int cpt = 0; + int node; + int num; + int rem; + int rc = 0; + + num = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = num; + + if (ncpt > num_online_cpus() || ncpt > 4 * num) { + CWARN("CPU partition number %d is larger than suggested " + "value (%d), your system may have performance " + "issue or run out of memory while under pressure\n", + ncpt, num); + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + rc = -ENOMEM; + goto failed; + } + + LIBCFS_ALLOC(node_mask, cpumask_size()); + if (node_mask == NULL) { + CERROR("Failed to allocate scratch cpumask\n"); + rc = -ENOMEM; + goto failed; + } + + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); + + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + num - ncpu); + if (rc < 0) { + rc = -EINVAL; + goto failed; + } + + ncpu = cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { + cpt++; + rem--; + } + } + } + + LIBCFS_FREE(node_mask, cpumask_size()); + return cptab; + +failed: + CERROR("Failed (rc=%d) to setup CPU partition table with %d " + 
"partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); + + if (node_mask != NULL) + LIBCFS_FREE(node_mask, cpumask_size()); + + if (cptab != NULL) + cfs_cpt_table_free(cptab); + + return ERR_PTR(rc); +} + +static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) +{ + struct cfs_cpt_table *cptab; + char *pattern_dup; + char *bracket; + char *str; + int node = 0; + int ncpt = 0; + int cpt = 0; + int high; + int rc; + int c; + int i; + + pattern_dup = kstrdup(pattern, GFP_KERNEL); + if (pattern_dup == NULL) { + CERROR("Failed to duplicate pattern '%s'\n", pattern); + return ERR_PTR(-ENOMEM); + } + + str = cfs_trimwhite(pattern_dup); + if (*str == 'n' || *str == 'N') { + str++; /* skip 'N' char */ + node = 1; /* NUMA pattern */ + if (*str == '\0') { + node = -1; + for_each_online_node(i) { + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } + if (ncpt == 1) { /* single NUMA node */ + kfree(pattern_dup); + return cfs_cpt_table_create(cpu_npartitions); + } + } + } + + if (ncpt == 0) { /* scanning bracket which is mark of partition */ + bracket = str; + while ((bracket = strchr(bracket, '['))) { + bracket++; + ncpt++; + } + } + + if (ncpt == 0 || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate CPU partition table\n"); + rc = -ENOMEM; + goto err_free_str; + } + + if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ + for_each_online_node(i) { + if (cpumask_empty(cpumask_of_node(i))) + continue; + + rc = cfs_cpt_set_node(cptab, cpt++, i); + if (!rc) { + rc = -EINVAL; + goto err_free_table; + } + } + kfree(pattern_dup); + return cptab; + } + + high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; + + for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + int n; + + bracket = strchr(str, '['); + if (bracket == NULL) { + if (*str != 0) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } else if (c != ncpt) { + CERROR("Expect %d partitions but found %d\n", + ncpt, c); + rc = -EINVAL; + goto err_free_table; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid CPU pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + rc = -EINVAL; + goto err_free_table; + } + + if (cfs_cpt_weight(cptab, cpt) != 0) { + CERROR("Partition %d has already been set.\n", cpt); + rc = -EPERM; + goto err_free_table; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + bracket = strchr(str, ']'); + if (bracket == NULL) { + CERROR("Missing right bracket for partition " + "%d in '%s'\n", cpt, str); + rc = -EINVAL; + goto err_free_table; + } + + rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, + &el); + if (rc) { + CERROR("Can't parse number range in '%s'\n", str); + rc = -ERANGE; + goto err_free_table; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride != 0) + continue; + + rc = node ? 
cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + rc = -EINVAL; + goto err_free_table; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + rc = -ENODEV; + goto err_free_table; + } + + str = cfs_trimwhite(bracket + 1); + } + + kfree(pattern_dup); + return cptab; + +err_free_table: + cfs_cpt_table_free(cptab); +err_free_str: + kfree(pattern_dup); + return ERR_PTR(rc); +} + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE +static enum cpuhp_state lustre_cpu_online; + +static int cfs_cpu_online(unsigned int cpu) +{ + return 0; +} +#endif + +static int cfs_cpu_dead(unsigned int cpu) +{ + bool warn; + + /* if all HTs in a core are offline, it may break affinity */ + warn = cpumask_any_and(topology_sibling_cpumask(cpu), + cpu_online_mask) >= nr_cpu_ids; + CDEBUG(warn ? D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", + cpu); + return 0; +} + +#ifndef HAVE_HOTPLUG_STATE_MACHINE +static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + cfs_cpu_dead(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + +void cfs_cpu_fini(void) +{ + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ +} + +int cfs_cpu_init(void) +{ + int ret = -EINVAL; + + LASSERT(!cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, + "fs/lustre/cfe:dead", NULL, + cfs_cpu_dead); + if (ret < 0) + goto failed; + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/lustre/cfe:online", + cfs_cpu_online, NULL); + if (ret < 0) + goto failed; + lustre_cpu_online = ret; +#else + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + ret = -EINVAL; + + get_online_cpus(); + if (*cpu_pattern != 0) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab from pattern '%s'\n", + cpu_pattern); + ret = PTR_ERR(cfs_cpt_table); + goto failed; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab with npartitions %d\n", + cpu_npartitions); + ret = PTR_ERR(cfs_cpt_table); + goto failed; + } + } + put_online_cpus(); + + LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", + num_online_nodes(), num_online_cpus(), + cfs_cpt_number(cfs_cpt_table)); + return 0; + +failed: + put_online_cpus(); + cfs_cpu_fini(); + return ret; +} + +#endif diff --git 
a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c new file mode 100644 index 0000000000000..90f502f35e580 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -0,0 +1,135 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to zlib_adler32. + */ + +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = zlib_adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = zlib_adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = adler32_cra_init, + } +}; + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git 
a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c new file mode 100644 index 0000000000000..58e4691cfb3de --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -0,0 +1,148 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to crc32_le. + */ + +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static u32 __crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le(crc, p, len); +} + +/** No default init with ~0 */ +static int crc32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. 
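+ *
+ * For example (an illustrative sketch, not part of this driver), a user
+ * who wants the conventional CRC-32 (initial value ~0, final XOR ~0),
+ * neither of which this transform applies by default, can seed it with
+ * all ones and invert the little-endian digest in the caller:
+ *
+ *     u32 seed = ~0;
+ *     u32 crc;
+ *     __le32 out;
+ *
+ *     crypto_shash_setkey(tfm, (u8 *)&seed, sizeof(seed));
+ *     crypto_shash_digest(desc, data, len, (u8 *)&out);
+ *     crc = ~le32_to_cpu(out);
+ *
+ * The all-ones seed is byte-order neutral, and tfm/desc are assumed to
+ * have been set up from crypto_alloc_shash("crc32", 0, 0) in the usual
+ * way.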
+ */ +static int crc32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = __crc32_le(*crcp, data, len); + return 0; +} +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(__crc32_le(*crcp, data, len)); + return 0; +} + +static int crc32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update, + .final = crc32_final, + .finup = crc32_finup, + .digest = crc32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-table", + .cra_priority = 100, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32_cra_init, + } +}; + +int cfs_crypto_crc32_register(void) +{ + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c new file mode 100644 index 0000000000000..fc55ad7969fab --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -0,0 +1,159 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Wrappers for kernel crypto shash api to pclmulqdq crc32c imlementation. 
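+ *
+ * The heavy lifting is done by crc_pcl(), the SSE4.2/PCLMULQDQ assembly
+ * routine carried in this patch set: it walks the buffer as three
+ * interleaved streams of up to 128 quadwords each so the crc32q
+ * dependency chains overlap, then folds the three partial CRCs together
+ * with PCLMULQDQ using the K_table constants.  Since it touches XMM
+ * state, every call must be bracketed by kernel_fpu_begin() and
+ * kernel_fpu_end(), as crc32c_pclmul_update() and
+ * __crc32c_pclmul_finup() below do.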
+ * + * Author: James Simmons + */ +#include +#include +#include +#include +#ifdef HAVE_FPU_API_HEADER +#include +#else +#include +#endif +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, + unsigned int crc_init); + +static int crc32c_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + return 0; +} + +static int crc32c_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + kernel_fpu_begin(); + *crcp = crc_pcl(data, len, *crcp); + kernel_fpu_end(); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32c_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + kernel_fpu_begin(); + *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + kernel_fpu_end(); + return 0; +} + +static int crc32c_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static int crc32c_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = ~cpu_to_le32p(crcp); + return 0; +} + +static struct shash_alg alg = { + .setkey = crc32c_pclmul_setkey, + .init = crc32c_pclmul_init, + .update = crc32c_pclmul_update, + .final = crc32c_pclmul_final, + .finup = crc32c_pclmul_finup, + .digest = crc32c_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-pclmul", + .cra_priority = 150, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_pclmul_cra_init, + } +}; + +#ifndef X86_FEATURE_XMM4_2 +#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ +#endif + +int cfs_crypto_crc32c_pclmul_register(void) +{ + if (!boot_cpu_has(X86_FEATURE_XMM4_2)) { + CDEBUG(D_INFO, "CRC32 instruction is not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32c_pclmul_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c new file mode 100644 index 0000000000000..a238e4e39fce0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -0,0 +1,195 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * + * Author: Alexander Boyko + */ +#include +#include +#include +#include +#ifdef HAVE_FPU_API_HEADER +#include +#else +#include +#endif +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define PCLMUL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pclmul_le_16 */ +#define SCALE_F 16L /* size of xmm register */ +#define SCALE_F_MASK (SCALE_F - 1) + +u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); + +static u32 __attribute__((pure)) + crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient; + unsigned int iremainder; + unsigned int prealign; + + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK) + return crc32_le(crc, p, len); + + if ((long)p & SCALE_F_MASK) { + /* align p to 16 byte */ + prealign = SCALE_F - ((long)p & SCALE_F_MASK); + + crc = crc32_le(crc, p, prealign); + len -= prealign; + p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & + ~SCALE_F_MASK); + } + iquotient = len & (~SCALE_F_MASK); + iremainder = len & SCALE_F_MASK; + + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(p, iquotient, crc); + kernel_fpu_end(); + + if (iremainder) + crc = crc32_le(crc, p + iquotient, iremainder); + + return crc; +} + +static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. 
+ */ +static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_pclmul_le(*crcp, data, len); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); + return 0; +} + +static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static struct shash_alg alg = { + .setkey = crc32_pclmul_setkey, + .init = crc32_pclmul_init, + .update = crc32_pclmul_update, + .final = crc32_pclmul_final, + .finup = crc32_pclmul_finup, + .digest = crc32_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-pclmul", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32_pclmul_cra_init, + } +}; + +#ifndef X86_FEATURE_PCLMULQDQ +#define X86_FEATURE_PCLMULQDQ (4*32+1) /* PCLMULQDQ instruction */ +#endif + +int cfs_crypto_crc32_pclmul_register(void) +{ + if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + CDEBUG(D_INFO, "PCLMULQDQ-NI instructions are not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32_pclmul_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c new file mode 100644 index 0000000000000..1991a86a49598 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c @@ -0,0 +1,510 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ + +#include +#include +#include +#include +#include + +#ifndef HAVE_CRYPTO_HASH_HELPERS +static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); +} + +static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); +} +#endif + +/** + * Array of hash algorithm speed in MByte per second + */ +static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + +/** + * Initialize the state descriptor for the specified hash algorithm. + * + * An internal routine to allocate the hash-specific state in \a hdesc for + * use with cfs_crypto_hash_digest() to compute the hash of a single message, + * though possibly in multiple chunks. The descriptor internal state should + * be freed with cfs_crypto_hash_final(). + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[out] type pointer to the hash description in hash_types[] array + * \param[in,out] req ahash request to be initialized + * \param[in] key initial hash value/state, NULL to use default value + * \param[in] key_len length of \a key + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, + const struct cfs_crypto_hash_type **type, + struct ahash_request **req, + unsigned char *key, + unsigned int key_len) +{ + struct crypto_ahash *tfm; + int err = 0; + + *type = cfs_crypto_hash_type(hash_alg); + + if (*type == NULL) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + hash_alg, CFS_HASH_ALG_MAX); + return -EINVAL; + } + tfm = crypto_alloc_ahash((*type)->cht_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", + (*type)->cht_name); + return PTR_ERR(tfm); + } + + *req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!*req) { + CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", + (*type)->cht_name); + crypto_free_ahash(tfm); + return -ENOMEM; + } + + ahash_request_set_callback(*req, 0, NULL, NULL); + + if (key) + err = crypto_ahash_setkey(tfm, key, key_len); + else if ((*type)->cht_key != 0) + err = crypto_ahash_setkey(tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + + if (err != 0) { + ahash_request_free(*req); + crypto_free_ahash(tfm); + return err; + } + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), + cfs_crypto_hash_speeds[hash_alg]); + + err = crypto_ahash_init(*req); + if (err) { + ahash_request_free(*req); + crypto_free_ahash(tfm); + } + return err; +} + +/** + * Calculate hash digest for the passed buffer. + * + * This should be used when computing the hash on a single contiguous buffer. + * It combines the hash initialization, computation, and cleanup. 
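+ *
+ * A minimal caller (an illustrative sketch; CFS_HASH_ALG_CRC32C stands
+ * in for whichever algorithm id is wanted) looks like:
+ *
+ *     unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX];
+ *     unsigned int hash_len = sizeof(hash);
+ *     int rc;
+ *
+ *     rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32C, buf, buf_len,
+ *                                 NULL, 0, hash, &hash_len);
+ *
+ * Passing \a key = NULL selects the algorithm's default initial value.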
+ * + * \param[in] hash_alg id of hash algorithm (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute hash + * \param[in] buf_len length of \a buf in bytes + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * \param[out] hash pointer to computed hash value, if \a hash = NULL then + * \a hash_len is to digest size in bytes, retval -ENOSPC + * \param[in,out] hash_len size of \a hash buffer + * + * \retval -EINVAL \a buf, \a buf_len, \a hash_len, \a hash_alg invalid + * \retval -ENOENT \a hash_alg is unsupported + * \retval -ENOSPC \a hash is NULL, or \a hash_len less than digest size + * \retval 0 for success + * \retval negative errno for other errors from lower layers. + */ +int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len) +{ + struct scatterlist sl; + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + if (!buf || buf_len == 0 || !hash_len) + return -EINVAL; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err != 0) + return err; + + if (!hash || *hash_len < type->cht_size) { + *hash_len = type->cht_size; + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + return -ENOSPC; + } + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, hash, sl.length); + err = crypto_ahash_digest(req); + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_digest); + +/** + * Allocate and initialize desriptor for hash algorithm. + * + * This should be used to initialize a hash descriptor for multiple calls + * to a single hash function when computing the hash across multiple + * separate buffers or pages using cfs_crypto_hash_update{,_page}(). + * + * The hash descriptor should be freed with cfs_crypto_hash_final(). 
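+ *
+ * A typical piece-wise sequence (an illustrative sketch; hdr, page,
+ * hash and the length variables are whatever the caller is hashing) is:
+ *
+ *     hdesc = cfs_crypto_hash_init(hash_alg, NULL, 0);
+ *     if (IS_ERR(hdesc))
+ *             return PTR_ERR(hdesc);
+ *     rc = cfs_crypto_hash_update(hdesc, hdr, hdr_len);
+ *     if (!rc)
+ *             rc = cfs_crypto_hash_update_page(hdesc, page, 0, PAGE_SIZE);
+ *     if (!rc)
+ *             rc = cfs_crypto_hash_final(hdesc, hash, &hash_len);
+ *
+ * If an update fails, the descriptor must still be released, e.g. by
+ * calling cfs_crypto_hash_final(hdesc, NULL, NULL), which only frees it.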
+ * + * \param[in] hash_alg algorithm id (CFS_HASH_ALG_*) + * \param[in] key initial value/state for algorithm, if \a key = NULL + * use default initial value + * \param[in] key_len length of \a key in bytes + * + * \retval pointer to descriptor of hash instance + * \retval ERR_PTR(errno) in case of error + */ +struct cfs_crypto_hash_desc * + cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, + unsigned char *key, unsigned int key_len) +{ + struct ahash_request *req; + int err; + const struct cfs_crypto_hash_type *type; + + err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); + if (err) + return ERR_PTR(err); + return (struct cfs_crypto_hash_desc *)req; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +/** + * Update hash digest computed on data within the given \a page + * + * \param[in] hdesc hash state descriptor + * \param[in] page data page on which to compute the hash + * \param[in] offset offset within \a page at which to start hash + * \param[in] len length of data on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct ahash_request *req = (void *)hdesc; + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~PAGE_MASK); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +/** + * Update hash digest computed on the specified data + * + * \param[in] hdesc hash state descriptor + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + * + * \retval 0 for success + * \retval negative errno on failure + */ +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, + const void *buf, unsigned int buf_len) +{ + struct ahash_request *req = (void *)hdesc; + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + ahash_request_set_crypt(req, &sl, NULL, sl.length); + return crypto_ahash_update(req); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/** + * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor + * + * \param[in] hdesc hash descriptor + * \param[out] hash pointer to hash buffer to store hash digest + * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL + * or hash_len == NULL only free \a hdesc instead + * of computing the hash + * + * \retval 0 for success + * \retval -EOVERFLOW if hash_len is too small for the hash digest + * \retval negative errno for other errors from lower layers + */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, + unsigned char *hash, unsigned int *hash_len) +{ + struct ahash_request *req = (void *)hdesc; + int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); + int err; + + if (!hash || !hash_len) { + err = 0; + goto free; + } + if (*hash_len < size) { + err = -EOVERFLOW; + goto free; + } + + ahash_request_set_crypt(req, NULL, hash, 0); + err = crypto_ahash_final(req); + if (err == 0) + *hash_len = size; +free: + crypto_free_ahash(crypto_ahash_reqtfm(req)); + ahash_request_free(req); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +/** + * Compute the speed of specified hash function + * + * Run a speed test on the given hash algorithm on buffer using a 1MB buffer + * size. 
This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and + * is available through the cfs_crypto_hash_speed() function. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * \param[in] buf data buffer on which to compute the hash + * \param[in] buf_len length of \buf on which to compute hash + */ +static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) +{ + int buf_len = max(PAGE_SIZE, 1048576UL); + void *buf; + unsigned long start, end; + int err = 0; + unsigned long bcount; + struct page *page; + unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + unsigned int hash_len = sizeof(hash); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + err = -ENOMEM; + goto out_err; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), + bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { + struct cfs_crypto_hash_desc *hdesc; + int i; + + hdesc = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(hdesc)) { + err = PTR_ERR(hdesc); + break; + } + + for (i = 0; i < buf_len / PAGE_SIZE; i++) { + err = cfs_crypto_hash_update_page(hdesc, page, 0, + PAGE_SIZE); + if (err != 0) + break; + } + + err = cfs_crypto_hash_final(hdesc, hash, &hash_len); + if (err != 0) + break; + } + end = jiffies; + __free_page(page); +out_err: + if (err != 0) { + cfs_crypto_hash_speeds[hash_alg] = err; + CDEBUG(D_INFO, "Crypto hash algorithm %s test error: rc = %d\n", + cfs_crypto_hash_name(hash_alg), err); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[hash_alg] = (int)tmp; + CDEBUG(D_CONFIG, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(hash_alg), + cfs_crypto_hash_speeds[hash_alg]); + } +} + +/** + * hash speed in Mbytes per second for valid hash algorithm + * + * Return the performance of the specified \a hash_alg that was + * computed using cfs_crypto_performance_test(). If the performance + * has not yet been computed, do that when it is first requested. + * That avoids computing the speed when it is not actually needed. + * To avoid competing threads computing the checksum speed at the + * same time, only compute a single checksum speed at one time. + * + * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) + * + * \retval positive speed of the hash function in MB/s + * \retval -ENOENT if \a hash_alg is unsupported + * \retval negative errno if \a hash_alg speed is unavailable + */ +int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) { + if (unlikely(cfs_crypto_hash_speeds[hash_alg] == 0)) { + static DEFINE_MUTEX(crypto_hash_speed_mutex); + + mutex_lock(&crypto_hash_speed_mutex); + if (cfs_crypto_hash_speeds[hash_alg] == 0) + cfs_crypto_performance_test(hash_alg); + mutex_unlock(&crypto_hash_speed_mutex); + } + return cfs_crypto_hash_speeds[hash_alg]; + } + + return -ENOENT; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Run the performance test for all hash algorithms. + * + * Run the cfs_crypto_performance_test() benchmark for some of the available + * hash functions at module load time. This can't be reliably done at runtime + * since the CPUs may be under load from thousands of connecting clients when + * the first client connects and the checksum speeds are needed. 
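+ *
+ * Consumers later retrieve a measurement with a single call, for example
+ * (the algorithm id is an illustrative assumption):
+ *
+ *    int speed = cfs_crypto_hash_speed(CFS_HASH_ALG_CRC32C);
+ *
+ *    if (speed > 0)
+ *            CDEBUG(D_INFO, "crc32c hashes at %d MB/s\n", speed);
+ *
+ * where a negative value means the speed is unavailable for that algorithm.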
+ * + * Since the setup cost and computation speed of various hash algorithms is + * a function of the buffer size (and possibly internal contention of offload + * engines), this speed only represents an estimate of the actual speed under + * actual usage, but is reasonable for comparing available algorithms. + * + * The actual speeds are available via cfs_crypto_hash_speed() for later + * comparison. + * + * \retval 0 on success + * \retval -ENOMEM if no memory is available for test buffer + */ +static int cfs_crypto_test_hashes(void) +{ + enum cfs_crypto_hash_alg hash_alg; + + for (hash_alg = 1; hash_alg < CFS_HASH_ALG_SPEED_MAX; hash_alg++) + cfs_crypto_performance_test(hash_alg); + + return 0; +} + +static int adler32; + +#ifdef HAVE_CRC32 +static int crc32; +#endif +#ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL +static int crc32_pclmul; +#endif +#ifdef NEED_CRC32C_ACCEL +static int crc32c_pclmul; +#endif +#endif /* HAVE_PCLMULQDQ */ + +/** + * Register available hash functions + * + * \retval 0 + */ +int cfs_crypto_register(void) +{ + request_module("crc32c"); + + adler32 = cfs_crypto_adler32_register(); + +#ifdef HAVE_CRC32 + crc32 = cfs_crypto_crc32_register(); +#endif +#ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL + crc32_pclmul = cfs_crypto_crc32_pclmul_register(); +#endif +#ifdef NEED_CRC32C_ACCEL + crc32c_pclmul = cfs_crypto_crc32c_pclmul_register(); +#endif +#endif /* HAVE_PCLMULQDQ */ + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + + return 0; +} + +/** + * Unregister previously registered hash functions + */ +void cfs_crypto_unregister(void) +{ + if (adler32 == 0) + cfs_crypto_adler32_unregister(); + +#ifdef HAVE_CRC32 + if (crc32 == 0) + cfs_crypto_crc32_unregister(); +#endif +#ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL + if (crc32_pclmul == 0) + cfs_crypto_crc32_pclmul_unregister(); +#endif +#ifdef NEED_CRC32C_ACCEL + if (crc32c_pclmul == 0) + cfs_crypto_crc32c_pclmul_unregister(); +#endif +#endif /* HAVE_PCLMULQDQ */ +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c new file mode 100644 index 0000000000000..7b2e46e61b1bf --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -0,0 +1,297 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/libcfs/linux/linux-curproc.c + * + * Lustre curproc API implementation for Linux kernel + * + * Author: Nikita Danilov + */ + +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* + * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * for Linux kernel. + */ + +/* Currently all the CFS_CAP_* defines match CAP_* ones. */ +#define cfs_cap_pack(cap) (cap) +#define cfs_cap_unpack(cap) (cap) + +void cfs_cap_raise(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cap_raise(cred->cap_effective, cfs_cap_unpack(cap)); + commit_creds(cred); + } +} + +void cfs_cap_lower(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cap_lower(cred->cap_effective, cfs_cap_unpack(cap)); + commit_creds(cred); + } +} + +int cfs_cap_raised(cfs_cap_t cap) +{ + return cap_raised(current_cap(), cfs_cap_unpack(cap)); +} + +static void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap) +{ +#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330 + *cap = cfs_cap_pack(kcap); +#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026 + *cap = cfs_cap_pack(kcap[0]); +#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522 + /* XXX lost high byte */ + *cap = cfs_cap_pack(kcap.cap[0]); +#else + #error "need correct _KERNEL_CAPABILITY_VERSION " +#endif +} + +static void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap) +{ +#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330 + *kcap = cfs_cap_unpack(cap); +#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026 + (*kcap)[0] = cfs_cap_unpack(cap); +#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522 + kcap->cap[0] = cfs_cap_unpack(cap); +#else + #error "need correct _KERNEL_CAPABILITY_VERSION " +#endif +} + +cfs_cap_t cfs_curproc_cap_pack(void) +{ + cfs_cap_t cap; + cfs_kernel_cap_pack(current_cap(), &cap); + return cap; +} + +void cfs_curproc_cap_unpack(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cfs_kernel_cap_unpack(&cred->cap_effective, cap); + commit_creds(cred); + } +} + +int cfs_capable(cfs_cap_t cap) +{ + return capable(cfs_cap_unpack(cap)); +} + +static int cfs_access_process_vm(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long addr, + void *buf, int len, int write) +{ + /* Just copied from kernel for the kernels which doesn't + * have access_process_vm() exported */ + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(), + * which is already holding mmap_sem for writes. If some other + * thread gets the write lock in the meantime, this thread will + * block, but at least it won't deadlock on itself. LU-1735 */ + if (down_read_trylock(&mm->mmap_sem) == 0) + return -EDEADLK; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, rc, offset; + void *maddr; + +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) + rc = get_user_pages(addr, 1, write ? 
FOLL_WRITE : 0, &page, &vma); +#elif defined(HAVE_GET_USER_PAGES_6ARG) + rc = get_user_pages(addr, 1, write, 1, &page, &vma); +#else + rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); +#endif + if (rc <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + put_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + + return buf - old_buf; +} + +/* Read the environment variable of current process specified by @key. */ +int cfs_get_environ(const char *key, char *value, int *val_len) +{ + struct mm_struct *mm; + char *buffer; + int buf_len = PAGE_SIZE; + int key_len = strlen(key); + unsigned long addr; + int rc; + bool skip = false; + ENTRY; + + buffer = kmalloc(buf_len, GFP_USER); + if (!buffer) + RETURN(-ENOMEM); + + mm = get_task_mm(current); + if (!mm) { + kfree(buffer); + RETURN(-EINVAL); + } + + addr = mm->env_start; + while (addr < mm->env_end) { + int this_len, retval, scan_len; + char *env_start, *env_end; + + memset(buffer, 0, buf_len); + + this_len = min_t(int, mm->env_end - addr, buf_len); + retval = cfs_access_process_vm(current, mm, addr, buffer, + this_len, 0); + if (retval < 0) + GOTO(out, rc = retval); + else if (retval != this_len) + break; + + addr += retval; + + /* Parse the buffer to find out the specified key/value pair. + * The "key=value" entries are separated by '\0'. */ + env_start = buffer; + scan_len = this_len; + while (scan_len) { + char *entry; + int entry_len; + + env_end = memscan(env_start, '\0', scan_len); + LASSERT(env_end >= env_start && + env_end <= env_start + scan_len); + + /* The last entry of this buffer cross the buffer + * boundary, reread it in next cycle. */ + if (unlikely(env_end - env_start == scan_len)) { + /* Just skip the entry larger than page size, + * it can't be jobID env variable. 
*/ + if (unlikely(scan_len == this_len)) + skip = true; + else + addr -= scan_len; + break; + } else if (unlikely(skip)) { + skip = false; + goto skip; + } + + entry = env_start; + entry_len = env_end - env_start; + + /* Key length + length of '=' */ + if (entry_len > key_len + 1 && + !memcmp(entry, key, key_len)) { + entry += key_len + 1; + entry_len -= key_len + 1; + /* The 'value' buffer passed in is too small.*/ + if (entry_len >= *val_len) + GOTO(out, rc = -EOVERFLOW); + + memcpy(value, entry, entry_len); + *val_len = entry_len; + GOTO(out, rc = 0); + } +skip: + scan_len -= (env_end - env_start + 1); + env_start = env_end + 1; + } + } + GOTO(out, rc = -ENOENT); + +out: + mmput(mm); + kfree((void *)buffer); + return rc; +} +EXPORT_SYMBOL(cfs_get_environ); + +EXPORT_SYMBOL(cfs_cap_raise); +EXPORT_SYMBOL(cfs_cap_lower); +EXPORT_SYMBOL(cfs_cap_raised); +EXPORT_SYMBOL(cfs_curproc_cap_pack); +EXPORT_SYMBOL(cfs_capable); + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c new file mode 100644 index 0000000000000..edd19396dd69f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -0,0 +1,283 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-debug.c + * + * Author: Phil Schwan + */ + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_KERNEL_LOCKED +#include +#endif +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include + +#include "tracefile.h" + +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/** + * Upcall function once a Lustre log has been dumped. 
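+ *
+ * The configured helper (by default /usr/lib/lustre/lnet_debug_log_upcall,
+ * tunable through /proc/sys/lnet/debug_log_upcall) is invoked roughly as
+ *
+ *    /usr/lib/lustre/lnet_debug_log_upcall /tmp/lustre-log.1234
+ *
+ * with only HOME and PATH in its environment; the dump file name above is
+ * an illustrative placeholder.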
+ * + * \param file path of the dumped log + */ +void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file != NULL, "called on a null filename\n"); + argv[1] = file; //only need to pass the path of the file + + argv[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp, 1); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; " + "check /proc/sys/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } + + EXIT; +} + +/* coverity[+kill] */ +void lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + libcfs_debug_dumpstack(NULL); + if (libcfs_panic_on_lbug) + panic("LBUG"); + else + libcfs_debug_dumplog(); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} +EXPORT_SYMBOL(lbug_with_loc); + +#ifdef CONFIG_STACKTRACE + +#define MAX_ST_ENTRIES 100 +static DEFINE_SPINLOCK(st_lock); + +static void libcfs_call_trace(struct task_struct *tsk) +{ + struct stack_trace trace; + static unsigned long entries[MAX_ST_ENTRIES]; + + trace.nr_entries = 0; + trace.max_entries = MAX_ST_ENTRIES; + trace.entries = entries; + trace.skip = 0; + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace:\n"); + save_stack_trace_tsk(tsk, &trace); + print_stack_trace(&trace, 0); + spin_unlock(&st_lock); +} + +#else /* !CONFIG_STACKTRACE */ + +#ifdef CONFIG_X86 +#include +#include + +#ifdef HAVE_STACKTRACE_OPS +#ifdef HAVE_STACKTRACE_WARNING +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk("%s", (char *)data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} +#endif + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT +static int +#else +static void +#endif +print_trace_address(void *data, unsigned long addr, int reliable) +{ + char fmt[32]; + + touch_nmi_watchdog(); + sprintf(fmt, " [<%016lx>] %s%%s\n", addr, reliable ? "": "? 
"); + __print_symbol(fmt, addr); +#ifdef STACKTRACE_OPS_ADDRESS_RETURN_INT + return 0; +#endif +} + +static const struct stacktrace_ops print_trace_ops = { +#ifdef HAVE_STACKTRACE_WARNING + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, +#endif + .stack = print_trace_stack, + .address = print_trace_address, +#ifdef STACKTRACE_OPS_HAVE_WALK_STACK + .walk_stack = print_context_stack, +#endif +}; +#endif /* HAVE_STACKTRACE_OPS */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ +#ifdef HAVE_STACKTRACE_OPS + printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm); + printk("\nCall Trace:\n"); + dump_trace(tsk, NULL, NULL, +#ifdef HAVE_DUMP_TRACE_ADDRESS + 0, +#endif /* HAVE_DUMP_TRACE_ADDRESS */ + &print_trace_ops, NULL); + printk("\n"); +#else /* !HAVE_STACKTRACE_OPS */ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +#endif /* HAVE_STACKTRACE_OPS */ +} + +#else /* !CONFIG_X86 */ + +static void libcfs_call_trace(struct task_struct *tsk) +{ + if (tsk == current) + dump_stack(); + else + CWARN("can't show stack: kernel doesn't export show_task\n"); +} + +#endif /* CONFIG_X86 */ + +#endif /* CONFIG_STACKTRACE */ + +void libcfs_debug_dumpstack(struct task_struct *tsk) +{ + libcfs_call_trace(tsk ?: current); +} +EXPORT_SYMBOL(libcfs_debug_dumpstack); + +struct task_struct *libcfs_current(void) +{ + CWARN("current task struct is %p\n", current); + return current; +} + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + +#ifdef LNET_DUMP_ON_PANIC + /* This is currently disabled because it spews far too much to the + * console on the rare cases it is ever triggered. */ + + if (in_interrupt()) { + cfs_trace_debug_print(); + } else { +#ifdef HAVE_KERNEL_LOCKED + while (kernel_locked()) + unlock_kernel(); +#endif + libcfs_debug_dumplog_internal((void *)(long)current_pid()); + } +#endif + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + .notifier_call = panic_notifier, + .next = NULL, + .priority = 10000 +}; + +void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +} + +void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c new file mode 100644 index 0000000000000..839f9324ac5ca --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c @@ -0,0 +1,173 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +static bool libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + if (data->ioc_hdr.ioc_len > BIT(30)) + return true; + + if (data->ioc_inllen1 > BIT(30)) + return true; + + if (data->ioc_inllen2 > BIT(30)) + return true; + + if (data->ioc_inlbuf1 && !data->ioc_inllen1) + return true; + + if (data->ioc_inlbuf2 && !data->ioc_inllen2) + return true; + + if (data->ioc_pbuf1 && !data->ioc_plen1) + return true; + + if (data->ioc_pbuf2 && !data->ioc_plen2) + return true; + + if (data->ioc_plen1 && !data->ioc_pbuf1) + return true; + + if (data->ioc_plen2 && !data->ioc_pbuf2) + return true; + + if (libcfs_ioctl_packlen(data) != data->ioc_hdr.ioc_len) + return true; + + if (data->ioc_inllen1 && + data->ioc_bulk[((data->ioc_inllen1 + 7) & ~7) + + data->ioc_inllen2 - 1] != '\0') + return true; + + return false; +} + +int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data) +{ + ENTRY; + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("libcfs ioctl: parameter not correctly formatted\n"); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1 != 0) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2 != 0) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + RETURN(0); +} + +int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uhdr) +{ + struct libcfs_ioctl_hdr hdr; + int err = 0; + ENTRY; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != LIBCFS_IOCTL_VERSION && + hdr.ioc_version != LIBCFS_IOCTL_VERSION2) { + CERROR("libcfs ioctl: version mismatch expected %#x, got %#x\n", + LIBCFS_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct libcfs_ioctl_hdr)) { + CERROR("libcfs ioctl: user buffer too small for ioctl\n"); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > LIBCFS_IOC_DATA_MAX) { + CERROR("libcfs ioctl: user buffer is too large %d/%d\n", + hdr.ioc_len, LIBCFS_IOC_DATA_MAX); + RETURN(-EINVAL); + } + + LIBCFS_ALLOC(*hdr_pp, hdr.ioc_len); + if (*hdr_pp == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*hdr_pp, uhdr, hdr.ioc_len)) + GOTO(failed, err = -EFAULT); + + RETURN(0); +failed: + LIBCFS_FREE(*hdr_pp, hdr.ioc_len); + RETURN(err); +} + +static long +libcfs_psdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return -EINVAL; + } + + return libcfs_ioctl(cmd, (void __user *)arg); +} + +static struct file_operations libcfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = 
libcfs_psdev_ioctl, +}; + +struct miscdevice libcfs_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lnet", + .fops = &libcfs_fops +}; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c new file mode 100644 index 0000000000000..a7d5679412f6c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -0,0 +1,183 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_KGDB) +#include +#endif + +#ifndef HAVE_KTIME_GET_TS64 +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timespec now; + + ktime_get_ts(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_ts64); +#endif /* HAVE_KTIME_GET_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_TS64 +void ktime_get_real_ts64(struct timespec64 *ts) +{ + struct timespec now; + + getnstimeofday(&now); + *ts = timespec_to_timespec64(now); +} +EXPORT_SYMBOL(ktime_get_real_ts64); +#endif /* HAVE_KTIME_GET_REAL_TS64 */ + +#ifndef HAVE_KTIME_GET_REAL_SECONDS +/* + * Get the seconds portion of CLOCK_REALTIME (wall clock). + * This is the clock that can be altered by NTP and is + * independent of a reboot. + */ +time64_t ktime_get_real_seconds(void) +{ + return (time64_t)get_seconds(); +} +EXPORT_SYMBOL(ktime_get_real_seconds); +#endif /* HAVE_KTIME_GET_REAL_SECONDS */ + +#ifndef HAVE_KTIME_GET_SECONDS +/* + * Get the seconds portion of CLOCK_MONOTONIC + * This clock is immutable and is reset across + * reboots. For older platforms this is a + * wrapper around get_seconds which is valid + * until 2038. By that time this will be gone + * one would hope. 
+ */ +time64_t ktime_get_seconds(void) +{ + struct timespec64 now; + + ktime_get_ts64(&now); + return now.tv_sec; +} +EXPORT_SYMBOL(ktime_get_seconds); +#endif /* HAVE_KTIME_GET_SECONDS */ + +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos) +{ +#ifdef HAVE_NEW_KERNEL_WRITE + return kernel_write(filp, buf, count, pos); +#else + mm_segment_t __old_fs = get_fs(); + int rc; + + set_fs(get_ds()); + rc = vfs_write(filp, (__force const char __user *)buf, count, pos); + set_fs(__old_fs); + + return rc; +#endif +} +EXPORT_SYMBOL(cfs_kernel_write); + +sigset_t +cfs_block_allsigs(void) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigfillset(¤t->blocked); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_allsigs); + +sigset_t cfs_block_sigs(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_sigs); + +/* Block all signals except for the @sigs */ +sigset_t cfs_block_sigsinv(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, ~sigs); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return old; +} +EXPORT_SYMBOL(cfs_block_sigsinv); + +void +cfs_restore_sigs(sigset_t old) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_restore_sigs); + +void +cfs_clear_sigpending(void) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + clear_tsk_thread_flag(current, TIF_SIGPENDING); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} +EXPORT_SYMBOL(cfs_clear_sigpending); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c new file mode 100644 index 0000000000000..6e08612044541 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -0,0 +1,272 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE + +#include +#include "tracefile.h" + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; + +static DECLARE_RWSEM(cfs_tracefile_sem); + +int cfs_tracefile_init_arch() +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_MAX; i++) { + cfs_trace_data[i] = + kmalloc(sizeof(union cfs_trace_data_union) * + num_possible_cpus(), GFP_KERNEL); + if (cfs_trace_data[i] == NULL) + goto out; + + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = pages_factor[i]; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + } + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) { + cfs_trace_console_buffers[i][j] = + kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE, + GFP_KERNEL); + + if (cfs_trace_console_buffers[i][j] == NULL) + goto out; + } + + return 0; + +out: + cfs_tracefile_fini_arch(); + printk(KERN_ERR "lnet: Not enough memory\n"); + return -ENOMEM; +} + +void cfs_tracefile_fini_arch() +{ + int i; + int j; + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) + if (cfs_trace_console_buffers[i][j] != NULL) { + kfree(cfs_trace_console_buffers[i][j]); + cfs_trace_console_buffers[i][j] = NULL; + } + + for (i = 0; cfs_trace_data[i] != NULL; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } +} + +void cfs_tracefile_read_lock() +{ + down_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_read_unlock() +{ + up_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_lock() +{ + down_write(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_unlock() +{ + up_write(&cfs_tracefile_sem); +} + +enum cfs_trace_buf_type cfs_trace_buf_idx_get() +{ + if (in_irq()) + return CFS_TCD_TYPE_IRQ; + else if (in_softirq()) + return CFS_TCD_TYPE_SOFTIRQ; + else + return CFS_TCD_TYPE_PROC; +} + +/* + * The walking argument indicates the locking comes from all tcd types + * iterator and we must lock it and dissable local irqs to avoid deadlocks + * with other interrupt locks that might be happening. See LU-1311 + * for details. 
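+ * Concretely: IRQ-context tcds are taken with spin_lock_irqsave(),
+ * softirq tcds with spin_lock_bh(), and process-context tcds with a
+ * plain spin_lock(), unless the caller is walking all types, in which
+ * case interrupts are disabled via spin_lock_irq().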
+ */ +int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) +__acquires(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_lock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_lock_irq(&tcd->tcd_lock); + else + spin_lock(&tcd->tcd_lock); + return 1; +} + +void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) +__releases(&tcd->tcd_lock) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_unlock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_unlock_irq(&tcd->tcd_lock); + else + spin_unlock(&tcd->tcd_lock); +} + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage) +{ + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return tcd->tcd_cpu == tage->cpu; +} + +void +cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + /* y2038 safe since all user space treats this as unsigned, but + * will overflow in 2106 + */ + header->ph_sec = (u32)ts.tv_sec; + header->ph_usec = ts.tv_nsec / NSEC_PER_USEC; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; + return; +} + +static char * +dbghdr_to_err_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNetError"; + default: + return "LustreError"; + } +} + +static char * +dbghdr_to_info_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNet"; + default: + return "Lustre"; + } +} + +void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn) +{ + char *prefix = "Lustre", *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_WARNING; + } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_INFO; + } + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, + fn, len, buf); + } + return; +} + +int cfs_trace_max_debug_mb(void) +{ + int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); + + return MAX(512, (total_mb * 80)/100); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c new file mode 100644 index 0000000000000..910a44bc97f48 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -0,0 +1,827 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include "tracefile.h" + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *lnet_table_header; +#endif + +static DECLARE_RWSEM(ioctl_list_sem); +static LIST_HEAD(ioctl_list); + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (!list_empty(&hand->item)) + rc = -EBUSY; + else + list_add_tail(&hand->item, &ioctl_list); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_register_ioctl); + +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (list_empty(&hand->item)) + rc = -ENOENT; + else + list_del_init(&hand->item); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_deregister_ioctl); + +int libcfs_ioctl(unsigned long cmd, void __user *uparam) +{ + struct libcfs_ioctl_data *data = NULL; + struct libcfs_ioctl_hdr *hdr; + int err; + ENTRY; + + /* 'cmd' and permissions get checked in our arch-specific caller */ + err = libcfs_ioctl_getdata(&hdr, uparam); + if (err != 0) { + CDEBUG_LIMIT(D_ERROR, + "libcfs ioctl: data header error %d\n", err); + RETURN(err); + } + + if (hdr->ioc_version == LIBCFS_IOCTL_VERSION) { + /* The libcfs_ioctl_data_adjust() function performs adjustment + * operations on the libcfs_ioctl_data structure to make + * it usable by the code. This doesn't need to be called + * for new data structures added. 
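+ * Concretely, the adjustment re-points the non-empty ioc_inlbuf1 and
+ * ioc_inlbuf2 buffers into the copied-in ioc_bulk[] payload, since the
+ * pointer values received from user space are not usable in the kernel.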
*/ + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + err = libcfs_ioctl_data_adjust(data); + if (err != 0) + GOTO(out, err); + } + + CDEBUG(D_IOCTL, "libcfs ioctl cmd %lu\n", cmd); + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + break; + case IOC_LIBCFS_MARK_DEBUG: + if (data == NULL || + data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + GOTO(out, err = -EINVAL); + + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + break; + + default: { + struct libcfs_ioctl_handler *hand; + + err = -EINVAL; + down_read(&ioctl_list_sem); + list_for_each_entry(hand, &ioctl_list, item) { + err = hand->handle_ioctl(cmd, hdr); + if (err == -EINVAL) + continue; + + if (err == 0) { + if (copy_to_user(uparam, hdr, hdr->ioc_len)) + err = -EFAULT; + } + break; + } + up_read(&ioctl_list_sem); + break; } + } +out: + LIBCFS_FREE(hdr, hdr->ioc_len); + RETURN(err); +} + +int +lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_call_handler); + +static int __proc_dobitmasks(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int tmpstrlen = 512; + char *tmpstr; + int rc; + unsigned int *mask = data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 1 : 0; + + rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen); + if (rc < 0) + return rc; + + if (!write) { + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob); + if (rc < 0) { + kfree(tmpstr); + return rc; + } + + rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + kfree(tmpstr); + return rc; +} + +static int +proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dobitmasks); +} + +static int min_watchdog_ratelimit; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int __proc_dump_kernel(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +static int +proc_dump_kernel(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_dump_kernel); +} + +static int __proc_daemon_file(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +static int +proc_daemon_file(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + 
__proc_daemon_file); +} + +static int __proc_debug_mb(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + if (!write) { + char tmpstr[32]; + int len = snprintf(tmpstr, sizeof(tmpstr), "%d", + cfs_trace_get_debug_mb()); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, + "\n"); + } + + return cfs_trace_set_debug_mb_usrstr(buffer, nob); +} + +static int +proc_debug_mb(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_debug_mb); +} + +static int +proc_console_max_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc, max_delay_cs; + struct ctl_table dummy = *table; + cfs_duration_t d; + + dummy.data = &max_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + max_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (max_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(max_delay_cs) / 100; + if (d == 0 || d < libcfs_console_min_delay) + return -EINVAL; + libcfs_console_max_delay = d; + + return rc; +} + +static int +proc_console_min_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc, min_delay_cs; + struct ctl_table dummy = *table; + cfs_duration_t d; + + dummy.data = &min_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + min_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (min_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(min_delay_cs) / 100; + if (d == 0 || d > libcfs_console_max_delay) + return -EINVAL; + libcfs_console_min_delay = d; + + return rc; +} + +static int +proc_console_backoff(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc, backoff; + struct ctl_table dummy = *table; + + dummy.data = &backoff; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + backoff = libcfs_console_backoff; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; + } + + /* write */ + backoff = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + + if (backoff <= 0) + return -EINVAL; + + libcfs_console_backoff = backoff; + + return rc; +} + +static int +libcfs_force_lbug(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int +proc_fail_loc(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (old_fail_loc != cfs_fail_loc) + wake_up(&cfs_race_waitq); + return rc; +} + +static int __proc_cpt_table(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = 
cfs_cpt_table_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); +out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int +proc_cpt_table(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_cpt_table); +} + +static int __proc_cpt_distance(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_distance_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} + +static int +proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_cpt_distance); +} + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME + .procname = "console_ratelimit", + .data = &libcfs_console_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME + .procname = "console_max_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_max_delay_cs + }, + { + INIT_CTL_NAME + .procname = "console_min_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_min_delay_cs + }, + { + INIT_CTL_NAME + .procname = "console_backoff", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_backoff + }, + { + INIT_CTL_NAME + .procname = "debug_path", + .data = libcfs_debug_file_path_arr, + .maxlen = sizeof(libcfs_debug_file_path_arr), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + INIT_CTL_NAME + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + { + INIT_CTL_NAME + .procname = "cpu_partition_distance", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_distance, + }, + { + INIT_CTL_NAME + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + INIT_CTL_NAME + .procname = "lnet_memused", + .data = (int *)&libcfs_kmemory.counter, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + INIT_CTL_NAME + .procname = "catastrophe", + 
.data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + INIT_CTL_NAME + .procname = "panic_on_lbug", + .data = &libcfs_panic_on_lbug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + INIT_CTL_NAME + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + INIT_CTL_NAME + .procname = "daemon_file", + .mode = 0644, + .maxlen = 256, + .proc_handler = &proc_daemon_file, + }, + { + INIT_CTL_NAME + .procname = "debug_mb", + .mode = 0644, + .proc_handler = &proc_debug_mb, + }, + { + INIT_CTL_NAME + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { + INIT_CTL_NAME + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + INIT_CTL_NAME + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + INIT_CTL_NAME + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME + .procname = "fail_err", + .data = &cfs_fail_err, + .maxlen = sizeof(cfs_fail_err), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + } +}; + +#ifdef CONFIG_SYSCTL +static struct ctl_table top_table[] = { + { + INIT_CTL_NAME + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { .procname = NULL } +}; +#endif + +static int insert_proc(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); +#endif + return 0; +} + +static void remove_proc(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +#endif +} + +static int __init libcfs_init(void) +{ + int rc; + + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); + return (rc); + } + + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + + rc = misc_register(&libcfs_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_cpu; + } + + rc = cfs_wi_startup(); + if (rc) { + CERROR("initialize workitem: error %d\n", rc); + goto cleanup_deregister; + } + + /* max to 4 threads, should be enough for rehash */ + rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4); + rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY, + rc, &cfs_sched_rehash); + if (rc != 0) { + CERROR("Startup workitem scheduler: error: %d\n", rc); + goto cleanup_deregister; + } + + rc = cfs_crypto_register(); + if (rc) { + CERROR("cfs_crypto_regster: error %d\n", rc); + goto cleanup_wi; + } + + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_crypto; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return 0; +cleanup_crypto: + cfs_crypto_unregister(); +cleanup_wi: + cfs_wi_shutdown(); +cleanup_deregister: + misc_deregister(&libcfs_dev); +cleanup_cpu: + cfs_cpu_fini(); +cleanup_debug: + libcfs_debug_cleanup(); + return rc; +} + +static void __exit libcfs_exit(void) +{ + int rc; + + remove_proc(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (cfs_sched_rehash != 
NULL) { + cfs_wi_sched_destroy(cfs_sched_rehash); + cfs_sched_rehash = NULL; + } + + cfs_crypto_unregister(); + cfs_wi_shutdown(); + + misc_deregister(&libcfs_dev); + + cfs_cpu_fini(); + + if (atomic_read(&libcfs_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&libcfs_kmemory)); + + rc = libcfs_debug_cleanup(); + if (rc) + printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", + rc); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre helper library"); +MODULE_VERSION(LIBCFS_VERSION); +MODULE_LICENSE("GPL"); + +module_init(libcfs_init); +module_exit(libcfs_exit); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/prng.c b/drivers/staging/lustrefsx/libcfs/libcfs/prng.c new file mode 100644 index 0000000000000..03931745c9003 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/prng.c @@ -0,0 +1,136 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/prng.c + * + * concatenation of following two 16-bit multiply with carry generators + * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, + * number and carry packed within the same 32 bit integer. + * algorithm recommended by Marsaglia +*/ + +#include +#include + +/* +From: George Marsaglia +Newsgroups: sci.math +Subject: Re: A RANDOM NUMBER GENERATOR FOR C +Date: Tue, 30 Sep 1997 05:29:35 -0700 + + * You may replace the two constants 36969 and 18000 by any + * pair of distinct constants from this list: + * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584 + * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243 + * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974 + * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114 + * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088 + * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834 + * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013 + * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083 + * (or any other 16-bit constants k for which both k*2^16-1 + * and k*2^15-1 are prime) */ + +#define RANDOM_CONST_A 18030 +#define RANDOM_CONST_B 29013 + +static unsigned int seed_x = 521288629; +static unsigned int seed_y = 362436069; + +/** + * cfs_rand - creates new seeds + * + * First it creates new seeds from the previous seeds. Then it generates a + * new psuedo random number for use. 
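+ *
+ * Note that this is a plain statistical PRNG (two 16-bit multiply-with-carry
+ * generators, see the file header above); it is not cryptographically
+ * secure.  Callers that need unpredictable bytes should use
+ * cfs_get_random_bytes() below, which mixes this stream with the kernel's
+ * get_random_bytes().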
+ * + * Returns a pseudo-random 32-bit integer + */ +unsigned int cfs_rand(void) +{ + seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16); + seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16); + + return ((seed_x << 16) + (seed_y & 65535)); +} +EXPORT_SYMBOL(cfs_rand); + +/** + * cfs_srand - sets the inital seed + * @seed1 : (seed_x) should have the most entropy in the low bits of the word + * @seed2 : (seed_y) should have the most entropy in the high bits of the word + * + * Replaces the original seeds with new values. Used to generate a new pseudo + * random numbers. + */ +void cfs_srand(unsigned int seed1, unsigned int seed2) +{ + if (seed1) + seed_x = seed1; /* use default seeds if parameter is 0 */ + if (seed2) + seed_y = seed2; +} +EXPORT_SYMBOL(cfs_srand); + +/** + * cfs_get_random_bytes - generate a bunch of random numbers + * @buf : buffer to fill with random numbers + * @size: size of passed in buffer + * + * Fills a buffer with random bytes + */ +void cfs_get_random_bytes(void *buf, int size) +{ + int *p = buf; + int rem, tmp; + + LASSERT(size >= 0); + + rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size); + if (rem) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, rem); + p = buf + rem; + size -= rem; + } + + while (size >= sizeof(int)) { + get_random_bytes(&tmp, sizeof(tmp)); + *p = cfs_rand() ^ tmp; + size -= sizeof(int); + p++; + } + buf = p; + if (size) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, size); + } +} +EXPORT_SYMBOL(cfs_get_random_bytes); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c new file mode 100644 index 0000000000000..798471bb694d9 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -0,0 +1,1157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/libcfs/tracefile.c + * + * Author: Zach Brown + * Author: Phil Schwan + */ + + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE +#include "tracefile.h" + +#include +#include +#include + +/* XXX move things up to the top, comment */ +union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned; + +char cfs_tracefile[TRACEFILE_NAME_SIZE]; +long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; +static struct tracefiled_ctl trace_tctl; +static DEFINE_MUTEX(cfs_trace_thread_mutex); +static int thread_running = 0; + +static atomic_t cfs_tage_allocated = ATOMIC_INIT(0); + +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd); + +static inline struct cfs_trace_page * +cfs_tage_from_list(struct list_head *list) +{ + return list_entry(list, struct cfs_trace_page, linkage); +} + +static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp) +{ + struct page *page; + struct cfs_trace_page *tage; + + /* My caller is trying to free memory */ + if (!in_interrupt() && memory_pressure_get()) + return NULL; + + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. + */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock) +{ + int i; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) { + struct cfs_trace_page *tage; + + tage = cfs_tage_alloc(gfp); + if (tage == NULL) + break; + list_add_tail(&tage->linkage, stock); + } + return i; +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!memory_pressure_get() || + in_interrupt()) && printk_ratelimit()) + printk(KERN_WARNING + "cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + if (tcd->tcd_cur_pages > 8 && thread_running) { + struct tracefiled_ctl *tctl = &trace_tctl; + /* + * wake up tracefiled to process some pages. 
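+ * tracefiled writes them out to cfs_tracefile (when a dump file has been
+ * configured) and then parks them on the per-CPU daemon list, so the
+ * in-memory trace buffer stays bounded.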
+ */ + wake_up(&tctl->tctl_waitq); + } + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + printk(KERN_WARNING "debug daemon buffer overflowed; " + "discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_move_tail(&tage->linkage, &pc.pc_pages); + tcd->tcd_cur_pages--; + } + put_pages_on_tcd_daemon_list(&pc, tcd); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (len > PAGE_SIZE) { + printk(KERN_ERR + "cowardly refusing to write %lu bytes in a page\n", len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (thread_running) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + va_list args; + int rc; + + va_start(args, format); + rc = libcfs_debug_vmsg2(msgdata, format, args, NULL); + va_end(args); + + return rc; +} +EXPORT_SYMBOL(libcfs_debug_msg); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, va_list args, + const char *format2, ...) +{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* average message length */ + int max_nob; + va_list ap; + int i; + int remain; + int mask = msgdata->msg_mask; + char *file = (char *)msgdata->msg_file; + struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; + + if (strchr(file, '/')) + file = strrchr(file, '/') + 1; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (tcd == NULL) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + known_size = strlen(file) + 1; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /*/ + * '2' used because vsnprintf return real size required for output + * _without_ terminating NULL. + * if needed is to small for this format. 
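+ * The loop below therefore runs at most twice: the first pass may find that
+ * the initial 85-byte estimate in 'needed' was too small, in which case
+ * vsnprintf() has already reported the real length and the second pass
+ * requests a trace page large enough for it.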
+ */ + for (i = 0; i < 2; i++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (tage == NULL) { + if (needed + known_size > PAGE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + printk(KERN_EMERG "negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + needed = 0; + if (format1) { + va_copy(ap, args); + needed = vsnprintf(string_buf, max_nob, format1, ap); + va_end(ap); + } + + if (format2) { + remain = max_nob - needed; + if (remain < 0) + remain = 0; + + va_start(ap, format2); + needed += vsnprintf(string_buf + needed, remain, + format2, ap); + va_end(ap); + } + + if (needed < max_nob) /* well. printing ok.. */ + break; + } + + if (*(string_buf+needed-1) != '\n') + printk(KERN_INFO "format at %s:%d:%s doesn't end in " + "newline\n", file, msgdata->msg_line, msgdata->msg_fn); + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + strcpy(debug_buf, file); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strcpy(debug_buf, msgdata->msg_fn); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT(tage->used <= PAGE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cfs_time_after(cfs_time_current(), cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } + + if (tcd != NULL) { + cfs_print_to_console(&header, mask, string_buf, needed, file, + msgdata->msg_fn); + cfs_trace_put_tcd(tcd); + } else { + string_buf = cfs_trace_get_console_buffer(); + + needed = 0; + if (format1 != NULL) { + va_copy(ap, args); + needed = vsnprintf(string_buf, + CFS_TRACE_CONSOLE_BUFFER_SIZE, + format1, ap); + va_end(ap); + } + if (format2 != NULL) { + remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed; + if (remain > 0) { + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, + format2, ap); + va_end(ap); + } + } + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + put_cpu(); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + string_buf = cfs_trace_get_console_buffer(); + + needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, + "Skipped %d previous 
similar message%s\n", + cdls->cdls_count, + (cdls->cdls_count > 1) ? "s" : ""); + + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + put_cpu(); + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_vmsg2); + +void +cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *msgdata) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + smp_mb(); + + cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); + + cfs_print_to_console(&hdr, D_EMERG, str, strlen(str), + msgdata->msg_file, msgdata->msg_fn); + + panic("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + INIT_LIST_HEAD(&pc->pc_pages); + + cfs_tcd_for_each(tcd, i, j) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } +} + +static void collect_pages_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } + } +} + +static void collect_pages(struct page_collection *pc) +{ + INIT_LIST_HEAD(&pc->pc_pages); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + collect_pages_on_all_cpus(pc); +} + +static void put_pages_back_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + struct list_head *cur_head; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + cur_head = tcd->tcd_pages.next; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, + linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != cpu || tage->type != i) + continue; + + cfs_tage_to_tail(tage, cur_head); + tcd->tcd_cur_pages++; + } + } + } +} + +static void put_pages_back(struct page_collection *pc) +{ + if (!libcfs_panic_in_progress) + put_pages_back_on_all_cpus(pc); +} + +/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that + * we have a good amount of data at all times for dumping during an LBUG, even + * if we have been steadily writing (and otherwise discarding) pages via the + * debug daemon. 
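+ * Once tcd_cur_daemon_pages exceeds tcd_max_pages the oldest page on the
+ * list is freed, so the ringbuffer is effectively LRU-bounded.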
*/ +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd) +{ + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type) + continue; + + cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages); + tcd->tcd_cur_daemon_pages++; + + if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { + struct cfs_trace_page *victim; + + __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); + victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next); + + __LASSERT_TAGE_INVARIANT(victim); + + list_del(&victim->linkage); + cfs_tage_free(victim); + tcd->tcd_cur_daemon_pages--; + } + } +} + +static void put_pages_on_daemon_list(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) + put_pages_on_tcd_daemon_list(pc, tcd); + } +} + +void cfs_trace_debug_print(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + char *p, *file, *fn; + struct page *page; + + __LASSERT_TAGE_INVARIANT(tage); + + page = tage->page; + p = page_address(page); + while (p < ((char *)page_address(page) + tage->used)) { + struct ptldebug_header *hdr; + int len; + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, p, len, file, fn); + + p += len; + } + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + char *buf; + int rc; + + cfs_tracefile_write_lock(); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", + filename, rc); + goto out; + } + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. 
in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, &filp->f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote " + "%d\n", tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + printk(KERN_ERR "sync returns %d\n", rc); +close: + filp_close(filp, NULL); +out: + cfs_tracefile_write_unlock(); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob) +{ + int nob; + + if (usr_buffer_nob > knl_buffer_nob) + return -EOVERFLOW; + + if (copy_from_user(knl_buffer, usr_buffer, usr_buffer_nob)) + return -EFAULT; + + nob = strnlen(knl_buffer, usr_buffer_nob); + while (nob-- >= 0) /* strip trailing whitespace */ + if (!isspace(knl_buffer[nob])) + break; + + if (nob < 0) /* empty string */ + return -EINVAL; + + if (nob == knl_buffer_nob) /* no space to terminate */ + return -EOVERFLOW; + + knl_buffer[nob + 1] = 0; /* terminate */ + return 0; +} +EXPORT_SYMBOL(cfs_trace_copyin_string); + +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. 
a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_allocate_string_buffer(char **str, int nob) +{ + if (nob > 2 * PAGE_SIZE) /* string must be "sensible" */ + return -EINVAL; + + *str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO); + if (*str == NULL) + return -ENOMEM; + + return 0; +} + +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc != 0) + goto out; + + if (str[0] != '/') { + rc = -EINVAL; + goto out; + } + rc = cfs_tracefile_dump_all_pages(str); +out: + kfree(str); + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + cfs_tracefile_write_lock(); + + if (strcmp(str, "stop") == 0) { + cfs_tracefile_write_unlock(); + cfs_trace_stop_thread(); + cfs_tracefile_write_lock(); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); + if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size <<= 20; + + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + printk(KERN_INFO + "Lustre: debug daemon will attempt to start writing " + "to %s (%lukB max)\n", cfs_tracefile, + (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + cfs_tracefile_write_unlock(); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc == 0) + rc = cfs_trace_daemon_command(str); + + kfree(str); + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + int pages; + int limit = cfs_trace_max_debug_mb(); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + printk(KERN_WARNING + "Lustre: %d MB is too small for debug buffer size, " + "setting it to %d MB.\n", mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + printk(KERN_WARNING + "Lustre: %d MB is too large for debug buffer size, " + "setting it to %d MB.\n", mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_SHIFT); + + cfs_tracefile_write_lock(); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + cfs_tracefile_write_unlock(); + + return 0; +} + +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob) +{ + char str[32]; + int rc; + + rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); + if (rc < 0) + return rc; + + return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); +} + +int cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + cfs_tracefile_read_lock(); + + 
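+	/* sum the per-context, per-CPU page quotas; the total is converted
+	 * back from pages to megabytes below */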
cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + cfs_tracefile_read_unlock(); + + return (total_pages >> (20 - PAGE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct tracefiled_ctl *tctl = arg; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct file *filp; + char *buf; + int last_loop = 0; + int rc; + + /* we're started late enough that we pick up init's fs context */ + /* this is so broken in uml? what on earth is going on? */ + + complete(&tctl->tctl_start); + + while (1) { + wait_queue_entry_t __wait; + + pc.pc_want_daemon_pages = 0; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + goto end_loop; + + filp = NULL; + cfs_tracefile_read_lock(); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_WARNING "couldn't open %s: " + "%d\n", cfs_tracefile, rc); + } + } + cfs_tracefile_read_unlock(); + if (filp == NULL) { + put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + goto end_loop; + } + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + struct dentry *de = file_dentry(filp); + static loff_t f_pos; + + __LASSERT_TAGE_INVARIANT(tage); + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > i_size_read(de->d_inode)) + f_pos = i_size_read(de->d_inode); + + buf = kmap(tage->page); + rc = cfs_kernel_write(filp, buf, tage->used, &f_pos); + kunmap(tage->page); + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u " + "but wrote %d\n", tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + } + + filp_close(filp, NULL); + put_pages_on_daemon_list(&pc); + if (!list_empty(&pc.pc_pages)) { + int i; + + printk(KERN_ALERT "Lustre: trace pages aren't " + " empty\n"); + printk(KERN_ERR "total cpus(%d): ", + num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + printk(KERN_ERR "%d(on) ", i); + else + printk(KERN_ERR "%d(off) ", i); + printk(KERN_ERR "\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + printk(KERN_ERR "page %d belongs to cpu " + "%d\n", ++i, tage->cpu); + printk(KERN_ERR "There are %d pages unwritten\n", + i); + } + __LASSERT(list_empty(&pc.pc_pages)); +end_loop: + if (atomic_read(&tctl->tctl_shutdown)) { + if (last_loop == 0) { + last_loop = 1; + continue; + } else { + break; + } + } + init_waitqueue_entry(&__wait, current); + add_wait_queue(&tctl->tctl_waitq, &__wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + remove_wait_queue(&tctl->tctl_waitq, &__wait); + } + complete(&tctl->tctl_stop); + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + int rc = 0; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) + goto out; + + init_completion(&tctl->tctl_start); + init_completion(&tctl->tctl_stop); + init_waitqueue_head(&tctl->tctl_waitq); + atomic_set(&tctl->tctl_shutdown, 0); + + if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) { + rc = -ECHILD; + goto out; + } + + wait_for_completion(&tctl->tctl_start); + thread_running = 1; +out: + mutex_unlock(&cfs_trace_thread_mutex); + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) { + printk(KERN_INFO + "Lustre: shutting down debug daemon 
thread...\n"); + atomic_set(&tctl->tctl_shutdown, 1); + wait_for_completion(&tctl->tctl_stop); + thread_running = 0; + } + mutex_unlock(&cfs_trace_thread_mutex); +} + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + int rc; + int factor; + + rc = cfs_tracefile_init_arch(); + if (rc != 0) + return rc; + + cfs_tcd_for_each(tcd, i, j) { + /* tcd_pages_factor is initialized int tracefile_init_arch. */ + factor = tcd->tcd_pages_factor; + INIT_LIST_HEAD(&tcd->tcd_pages); + INIT_LIST_HEAD(&tcd->tcd_stock_pages); + INIT_LIST_HEAD(&tcd->tcd_daemon_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_cur_daemon_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + return 0; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + tcd->tcd_shutting_down = 1; + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + + INIT_LIST_HEAD(&pc.pc_pages); + + trace_cleanup_on_all_cpus(); + + cfs_tracefile_fini_arch(); +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_cleanup(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h new file mode 100644 index 0000000000000..2f5dc4f272783 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -0,0 +1,320 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include + +enum cfs_trace_buf_type { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_MAX +}; + +/* trace file lock routines */ + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +/** + * The path of debug log dump upcall script. 
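+ * (consumed by libcfs_run_debug_log_upcall(), declared below, which is
+ * invoked with a log file name)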
+ */ +extern char lnet_debug_log_upcall[1024]; + +extern void libcfs_run_debug_log_upcall(char *file); + +int cfs_tracefile_init_arch(void); +void cfs_tracefile_fini_arch(void); + +void cfs_tracefile_read_lock(void); +void cfs_tracefile_read_unlock(void); +void cfs_tracefile_write_lock(void); +void cfs_tracefile_write_unlock(void); + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char __user *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_allocate_string_buffer(char **str, int nob); +int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob); +int cfs_trace_get_debug_mb(void); + +extern void libcfs_debug_dumplog_internal(void *arg); +extern void libcfs_register_panic_notifier(void); +extern void libcfs_unregister_panic_notifier(void); +extern int libcfs_panic_in_progress; +extern int cfs_trace_max_debug_mb(void); + +#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +#ifdef LUSTRE_TRACEFILE_PRIVATE + +/* + * Private declare for tracefile + */ +#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) + +#define CFS_TRACEFILE_SIZE (500 << 20) + +/* Size of a buffer for sprinting console messages if we can't get a page + * from system */ +#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024 + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * pages with trace records already processed by + * tracefiled. These pages are kept in memory, so that some + * portion of log can be written in the event of LBUG. This + * list is maintained in LRU order. + * + * Pages are moved to ->tcd_daemon_pages by tracefiled() + * (put_pages_on_daemon_list()). LRU pages from this list are + * discarded when list grows too large. + */ + struct list_head tcd_daemon_pages; + /* number of pages on ->tcd_daemon_pages */ + unsigned long tcd_cur_daemon_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages and + * ->tcd_daemon_pages each. + * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). 
+ * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. + * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +#define TCD_MAX_TYPES 8 +extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS]; + +#define cfs_tcd_for_each(tcd, i, j) \ + for (i = 0; cfs_trace_data[i] != NULL; i++) \ + for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ + j < num_possible_cpus(); \ + j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) + +#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ + for (i = 0; cfs_trace_data[i] && \ + (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ + cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; + /* + * if this flag is set, collect_pages() will spill both + * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, + * only ->tcd_pages are spilled. + */ + int pc_want_daemon_pages; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct tracefiled_ctl { + struct completion tctl_start; + struct completion tctl_stop; + wait_queue_head_t tctl_waitq; + pid_t tctl_pid; + atomic_t tctl_shutdown; +}; + +/* + * small data-structure for each page owned by tracefiled. 
+ */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +extern void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *m, + unsigned long stack); +extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn); + +extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking); +extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking); + +extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; +extern enum cfs_trace_buf_type cfs_trace_buf_idx_get(void); + +static inline char *cfs_trace_get_console_buffer(void) +{ + unsigned int i = get_cpu(); + unsigned int j = cfs_trace_buf_idx_get(); + + return cfs_trace_console_buffers[i][j]; +} + +static inline struct cfs_trace_cpu_data *cfs_trace_get_tcd(void) +{ + struct cfs_trace_cpu_data *tcd = + &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; + + cfs_trace_lock_tcd(tcd, 0); + + return tcd; +} + +static inline void cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd) +{ + cfs_trace_unlock_tcd(tcd, 0); + put_cpu(); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, + struct list_head *stock); + + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* LUSTRE_TRACEFILE_PRIVATE */ + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c new file mode 100644 index 0000000000000..ceec8703a829a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * Copyright (c) 2014, Intel Corporation. + * + * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define __USE_FILE_OFFSET64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char *dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +static int +open_ioc_dev(int dev_id) +{ + const char * dev_name; + + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; + + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + + +int l_ioctl(int dev_id, unsigned int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + + return rc; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char *dev_name) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return; + + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + size_t len = sizeof(*data); + + len += (data->ioc_inllen1 + 7) & ~7; + len += (data->ioc_inllen2 + 7) & ~7; + return len; +} + +int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, + int max) +{ + char *ptr; + struct libcfs_ioctl_data *overlay; + data->ioc_hdr.ioc_len = libcfs_ioctl_packlen(data); + data->ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION; + + if (*pbuf != NULL && libcfs_ioctl_packlen(data) > max) + return 1; + if (*pbuf == NULL) + *pbuf = malloc(data->ioc_hdr.ioc_len); + if (*pbuf == NULL) + return 1; + overlay = (struct libcfs_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1 != NULL) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf1, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2 != NULL) { + memcpy((char *)ptr, (const char *)data->ioc_inlbuf2, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } + + return 0; +} + +void +libcfs_ioctl_unpack(struct libcfs_ioctl_data *data, char *pbuf) +{ + struct libcfs_ioctl_data *overlay = (struct libcfs_ioctl_data *)pbuf; + char *ptr; + + /* Preserve the caller's buffer pointers */ + overlay->ioc_inlbuf1 = data->ioc_inlbuf1; + overlay->ioc_inlbuf2 = data->ioc_inlbuf2; + + memcpy(data, pbuf, sizeof(*data)); + ptr = &overlay->ioc_bulk[0]; + + if (data->ioc_inlbuf1 != NULL) { + memcpy((char *)data->ioc_inlbuf1, (const char *)ptr, + data->ioc_inllen1); + ptr += ((data->ioc_inllen1 + 7) & ~7); + } + if (data->ioc_inlbuf2 != NULL) { + memcpy((char *)data->ioc_inlbuf2, (const char *)ptr, + data->ioc_inllen2); + ptr += ((data->ioc_inllen2 + 7) & ~7); + } +} diff --git 
a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c new file mode 100644 index 0000000000000..04a33bdef4c4c --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -0,0 +1,1305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/util/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_NETDB_H +# include +#endif + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +char * +libcfs_next_nidstring(void) +{ + char *str; + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == + sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0])) + libcfs_nidstring_idx = 0; + + return str; +} + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? 
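+ * e.g. "192.168.0.1" is accepted below, while "192.168.0.1junk" is not:
+ * the %n conversion (see the CAVEAT above) leaves n short of nob when
+ * trailing characters remain.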
*/ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + +#ifdef HAVE_GETHOSTBYNAME + /* known hostname? */ + if (('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) { + char *tmp; + + tmp = calloc(1, nob + 1); + if (tmp != NULL) { + struct hostent *he; + + memcpy(tmp, str, nob); + tmp[nob] = 0; + + he = gethostbyname(tmp); + + free(tmp); + + if (he != NULL) { + __u32 ip = *(__u32 *)he->h_addr; + + *addr = ntohl(ip); + return 1; + } + } + } +#endif + return 0; +} + +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; +out: + cfs_expr_list_free_list(list); + + return rc; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 4); + if (i != 0) + i += snprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +static int +cfs_ip_addr_range_gen_recurse(__u32 *ip_list, int *count, int shift, + __u32 result, struct list_head *head_el, + struct cfs_expr_list *octet_el) +{ + __u32 value = 0; + int i; + struct cfs_expr_list *next_octet_el; + struct cfs_range_expr *octet_expr; + + /* + * each octet can have multiple expressions so we need to traverse + * all of the expressions + */ + list_for_each_entry(octet_expr, &octet_el->el_exprs, re_link) { + for (i = octet_expr->re_lo; i <= octet_expr->re_hi; i++) { + if (((i - octet_expr->re_lo) % octet_expr->re_stride) == 0) { + /* + * we have a hit calculate the result and + * pass it forward to the next iteration + * of the recursion. + */ + next_octet_el = + list_entry(octet_el->el_link.next, + typeof(*next_octet_el), + el_link); + value = result | (i << (shift * 8)); + if (next_octet_el->el_link.next != head_el) { + /* + * We still have more octets in + * the IP address so traverse + * that. We're doing a depth first + * recursion here. + */ + if (cfs_ip_addr_range_gen_recurse(ip_list, count, + shift - 1, value, + head_el, + next_octet_el) == -1) + return -1; + } else { + /* + * We have hit a leaf so store the + * calculated IP address in the + * list. If we have run out of + * space stop the recursion. + */ + if (*count == -1) + return -1; + /* add ip to the list */ + ip_list[*count] = value; + (*count)--; + } + } + } + } + return 0; +} + +/* + * only generate maximum of count ip addresses from the given expression + */ +int +cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr) +{ + struct cfs_expr_list *octet_el; + int idx = count - 1; + + octet_el = list_entry(ip_addr_expr->next, typeof(*octet_el), el_link); + + (void) cfs_ip_addr_range_gen_recurse(ip_list, &idx, 3, 0, &octet_el->el_link, octet_el); + + return idx; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. 
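+ * For example, with a range such as "192.168.[1-3].*" the list holds one
+ * cfs_expr_list per octet; the address is checked low octet first (the
+ * list is walked in reverse), so 192.168.2.17 matches while 192.169.2.17
+ * does not.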
+ * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0, j = 0; + + list_for_each_entry(el, list, el_link) { + assert(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + assert(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min, __u32 *max); + +static struct netstrfns libcfs_netstrfns[] = { + { + .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max + }, + { + .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max + }, + { + .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max + }, + { + .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max + }, + { + .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = 
cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max + }, + { + .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max + } +}; + +static const size_t libcfs_nnetstrfns = + sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? NULL : nf->nf_modname; +} + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if 
(nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + assert(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + addrrange = calloc(1, sizeof(struct addrrange)); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. 
+ * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. "elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + nr = calloc(1, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + fprintf(stderr, "can't parse nidrange: \"%.*s\"\n", + tmp.ls_len, tmp.ls_str); + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + free(ar); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + free(nr); + } +} + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. 
+ * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return snprintf(buffer, count, "@%s", nf->nf_name); + else + return snprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += snprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. 
+ * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += snprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + assert(list_empty(&nr->nr_addrranges)); + i += snprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + unsigned int min_ip[4] = {0}; + unsigned int max_ip[4] = {0}; + int cur_octet = 0; + bool expect_full_octet = false; + + list_for_each_entry(expr_list, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + /* XXX: add support for multiple & non-contig. re's */ + if (re_count > 0) + return -EINVAL; + + /* if a previous octet was ranged, then all remaining + * octets must be full for contiguous range */ + if (expect_full_octet && (range->re_lo != 0 || + range->re_hi != 255)) + return -ERANGE; + + if (range->re_stride != 1) + return -ERANGE; + + if (range->re_lo > range->re_hi) + return -EINVAL; + + if (range->re_lo != range->re_hi) + expect_full_octet = true; + + min_ip[cur_octet] = range->re_lo; + max_ip[cur_octet] = range->re_hi; + + re_count++; + } + + cur_octet++; + } + + if (min_nid != NULL) + *min_nid = ((min_ip[0] << 24) | (min_ip[1] << 16) | + (min_ip[2] << 8) | min_ip[3]); + + if (max_nid != NULL) + *max_nid = ((max_ip[0] << 24) | (max_ip[1] << 16) | + (max_ip[2] << 8) | max_ip[3]); + + return 0; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + */ +static int cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *el; + struct cfs_range_expr *re; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + + list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(re, &el->el_exprs, re_link) { + if (re_count > 0) + return -EINVAL; + if (re->re_lo > re->re_hi) + return -EINVAL; + + if (re->re_lo < min_addr || min_addr == 0) + min_addr = re->re_lo; + if (re->re_hi > max_addr) + max_addr = re->re_hi; + + re_count++; + } + } + + if (min_nid != NULL) + *min_nid = min_addr; + if (max_nid != NULL) + *max_nid = max_addr; + + return 0; +} + +/** + * Takes a linked list of nidrange expressions, determines the minimum + * and maximum nid and creates appropriate nid structures + * + * \param *nidlist + * \param[out] *min_nid string representation of min NID + * \param[out] *max_nid string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char 
*max_nid, size_t nidstr_length) +{ + struct nidrange *first_nidrange; + int netnum; + struct netstrfns *nf; + char *lndname; + __u32 min_addr; + __u32 max_addr; + char min_addr_str[IPSTRING_LENGTH]; + char max_addr_str[IPSTRING_LENGTH]; + int rc; + + first_nidrange = list_entry(nidlist->next, struct nidrange, nr_link); + + netnum = first_nidrange->nr_netnum; + nf = first_nidrange->nr_netstrfns; + lndname = nf->nf_name; + + rc = nf->nf_min_max(nidlist, &min_addr, &max_addr); + if (rc < 0) + return rc; + + nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str)); + nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str)); + + snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname, + netnum); + snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname, + netnum); + + return 0; +} + +/** + * Determines the min and max NID values for num LNDs + * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + unsigned int tmp_min_addr = 0; + unsigned int tmp_max_addr = 0; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_num_ar_min_max(ar, &tmp_min_addr, + &tmp_max_addr); + if (rc < 0) + return rc; + + if (tmp_min_addr < min_addr || min_addr == 0) + min_addr = tmp_min_addr; + if (tmp_max_addr > max_addr) + max_addr = tmp_max_addr; + } + } + if (max_nid != NULL) + *max_nid = max_addr; + if (min_nid != NULL) + *min_nid = min_addr; + + return 0; +} + +/** + * Takes an nidlist and determines the minimum and maximum + * ip addresses. 
+ * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + __u32 tmp_min_ip_addr = 0; + __u32 tmp_max_ip_addr = 0; + __u32 min_ip_addr = 0; + __u32 max_ip_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + if (nr->nr_all) { + min_ip_addr = 0; + max_ip_addr = 0xffffffff; + break; + } + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_ip_ar_min_max(ar, &tmp_min_ip_addr, + &tmp_max_ip_addr); + if (rc < 0) + return rc; + + if (tmp_min_ip_addr < min_ip_addr || min_ip_addr == 0) + min_ip_addr = tmp_min_ip_addr; + if (tmp_max_ip_addr > max_ip_addr) + max_ip_addr = tmp_max_ip_addr; + } + + nidlist_count++; + } + + if (max_nid != NULL) + *max_nid = max_ip_addr; + if (min_nid != NULL) + *min_nid = min_ip_addr; + + return 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c new file mode 100644 index 0000000000000..9facce6bfa975 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c @@ -0,0 +1,155 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the GNU Lesser General Public License + * (LGPL) version 2.1 or (at your discretion) any later version. + * (LGPL) version 2.1 accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * LGPL HEADER END + */ +/* + * libcfs/libcfs/utils/param.c + * + * This code handles user interaction with the configuration interface + * to the Lustre file system to fine tune it. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Get parameter path matching the pattern + * + * \param[out] paths glob_t structure used to hold the final result + * \param[in] pattern the pattern containing sprintf format specifiers + * which will be used to create the path to match + * + * The \param pattern is appended to the default path glob to complete the + * absolute path to the file the caller is requesting. If the results point + * to one or more files that exist those results are stored in the \param + * paths glob_t structure that is passed by the caller. + * + * Lustre tunables traditionally were in /proc/{sys,fs}/{lnet,lustre} + * but in upstream kernels starting with Linux 4.2 these parameters + * have been moved to /sys/fs/lustre and /sys/kernel/debug/{lnet,lustre} + * so the user tools need to check both locations. + * + * \retval 0 for success, with results stored in \param paths. + * \retval -1 for failure with errno set to report the reason. + */ +int +cfs_get_param_paths(glob_t *paths, const char *pattern, ...) 
+{ + char path[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}/," + "/proc/{fs,sys}/{lnet,lustre}/}"; + static bool test_mounted = false; + size_t len = strlen(path); + char buf[PATH_MAX]; + struct statfs statfsbuf; + va_list args; + int rc; + + + if (test_mounted) + goto skip_mounting; + test_mounted = true; + + rc = statfs("/sys/kernel/debug/", &statfsbuf); + if (rc == 0 && statfsbuf.f_type == DEBUGFS_MAGIC) + goto skip_mounting; + + if (mount("none", "/sys/kernel/debug", "debugfs", 0, "") == -1) { + /* Already mounted or don't have permission to mount is okay */ + if (errno != EPERM && errno != EBUSY) + fprintf(stderr, "Warning: failed to mount debug: %s\n", + strerror(errno)); + } else { + struct stat mtab; + + /* This is all for RHEL6 which is old school. Can be removed + * later when RHEL6 client support is dropped. */ + rc = lstat(_PATH_MOUNTED, &mtab); + if (!rc && !S_ISLNK(mtab.st_mode)) { + FILE *fp = setmntent(_PATH_MOUNTED, "r+"); + + if (fp != NULL) { + const struct mntent fs = { + .mnt_fsname = "debugfs", + .mnt_dir = "/sys/kernel/debug", + .mnt_type = "debugfs", + .mnt_opts = "rw,relatime", + }; + + rc = addmntent(fp, &fs); + if (rc) { + fprintf(stderr, + "failed to add debugfs to %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + endmntent(fp); + } else { + fprintf(stderr, "could not open %s: %s\n", + _PATH_MOUNTED, strerror(errno)); + } + } + } +skip_mounting: + va_start(args, pattern); + rc = vsnprintf(buf, sizeof(buf), pattern, args); + va_end(args); + if (rc < 0) { + return rc; + } else if (rc >= sizeof(buf)) { + errno = EINVAL; + return -1; + } + len += rc; + + if (strlcat(path, buf, sizeof(path)) != len) { + errno = E2BIG; + return -1; + } + + rc = glob(path, GLOB_BRACE, NULL, paths); + if (rc != 0) { + switch (rc) { + case GLOB_NOSPACE: + errno = ENOMEM; + break; + case GLOB_ABORTED: + errno = ENODEV; + break; + case GLOB_NOMATCH: + default: + errno = ENOENT; + break; + } + rc = -1; + } + + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c new file mode 100644 index 0000000000000..9afdaa07f8883 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -0,0 +1,845 @@ +/* + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * Copyright (c) 2014, Intel Corporation. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_LIBREADLINE +# include +# include +#endif /* HAVE_LIBREADLINE */ +#include +#include + +#include +#include + +static command_t * top_level; /* Top level of commands, initialized by + * InitParser */ +static char * parser_prompt = NULL;/* Parser prompt, set by InitParser */ +static int done; /* Set to 1 if user types exit or quit */ +static int ignore_errors; /* Normally, the parser will quit when + an error occurs in non-interacive + mode. Setting this to non-zero will + force it to keep buggering on. */ + + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); + +static char * skipwhitespace(char * s) +{ + char * t; + int len; + + len = (int)strlen(s); + for (t = s; t <= s + len && isspace(*t); t++); + return(t); +} + + +static char * skiptowhitespace(char * s) +{ + char * t; + + for (t = s; *t && !isspace(*t); t++); + return(t); +} + +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if (arg == NULL || maxargs < 1) + return 0; + + argv[i++] = arg; + while ((arg = strtok(NULL, " \t")) != NULL && i < maxargs) + argv[i++] = arg; + return i; +} + +/* find a command -- return it if unique otherwise print alternatives */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +void Parser_ignore_errors(int ignore) +{ + ignore_errors = ignore; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd = Parser_findargcmd(argv[0], cmds); + if (cmd != NULL && cmd->pc_func != NULL) { + int rc = (cmd->pc_func)(argc, argv); + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + return rc; + } else { + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\"\n", cmd->pc_name); + printf("as argument.\n"); + } + return -1; +} + +/* returns the command_t * (NULL if not found) corresponding to a + _partial_ match with the first token in name. It sets *next to + point to the following token. Does not modify *name. */ +static command_t * find_cmd(char * name, command_t cmds[], char ** next) +{ + int i, len; + + if (!cmds || !name ) + return NULL; + + /* This sets name to point to the first non-white space character, + and next to the first whitespace after name, len to the length: do + this with strtok*/ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = (int)(*next - name); + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return(&cmds[i]); + } + } + return NULL; +} + +/* Recursively process a command line string s and find the command + corresponding to it. This can be ambiguous, full, incomplete, + non-existent. */ +static int process(char *s, char ** next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if (!*result) + return CMD_NONE; + + /* found entry: is it ambigous, i.e. 
not exact command name and + more than one command in the list matches. Note that find_cmd + points to the first ambiguous entry */ + if (strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name))) { + char *another_next; + command_t *another_result = find_cmd(s, (*result) + 1, + &another_next); + int found_another = 0; + + while (another_result) { + if (strncasecmp(s, another_result->pc_name, + strlen(another_result->pc_name)) == 0){ + *result = another_result; + *next = another_next; + goto got_it; + } + another_result = find_cmd(s, another_result + 1, + &another_next); + found_another = 1; + } + if (found_another) + return CMD_AMBIG; + } + +got_it: + /* found a unique command: component or full? */ + if ((*result)->pc_func != NULL) { + return CMD_COMPLETE; + } else { + if (**next == '\0') { + return CMD_INCOMPLETE; + } else { + return process(*next, next, (*result)->pc_sub_cmd, + result, prev); + } + } +} + +#ifdef HAVE_LIBREADLINE +static command_t * match_tbl; /* Command completion against this table */ +static char * command_generator(const char * text, int state) +{ + static int index, + len; + char *name; + + /* Do we have a match table? */ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ( (name = (match_tbl + index)->pc_name) ) { + index++; + + if (strncasecmp(name, text, len) == 0) { + return(strdup(name)); + } + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(const char *text, int start, int end) +{ + command_t * table; + char * pos; + + match_tbl = top_level; + + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; table = find_cmd(pos, match_tbl, &pos)) + { + + if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; + } + + return rl_completion_matches(text, command_generator); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char * line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch (process(line, &next, top_level, &cmd, &prev)) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while ((ambig = find_cmd(prev, cmd, &tmp))) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, "'%s' incomplete command. Use '%s x' where " + "x is one of:\n", line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + optind = 0; + i = line2args(line, argv, MAXARGS); + rc = (cmd->pc_func)(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +#ifdef HAVE_LIBREADLINE +static void noop_int_fn(int unused) { } +static void noop_void_fn(void) { } +#endif + +/* just in case you're ever in an airplane and discover you + * forgot to install readline-dev. 
:) */ +static int init_input(void) +{ + int interactive = isatty(fileno(stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) { + rl_prep_term_function = noop_int_fn; + rl_deprep_term_function = noop_void_fn; + } + + rl_attempted_completion_function = command_completion; + rl_completion_entry_function = command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char * readline(char * prompt) +{ + int size = 2048; + char *line = malloc(size); + char *ptr = line; + int c; + int eof = 0; + + if (line == NULL) + return NULL; + if (prompt) + printf ("%s", prompt); + + while (1) { + if ((c = fgetc(stdin)) != EOF) { + if (c == '\n') + goto out; + *ptr++ = (char)c; + + if (ptr - line >= size - 1) { + char *tmp; + + size *= 2; + tmp = malloc(size); + if (tmp == NULL) + goto outfree; + memcpy(tmp, line, ptr - line); + ptr = tmp + (ptr - line); + free(line); + line = tmp; + } + } else { + eof = 1; + if (ferror(stdin) || feof(stdin)) + goto outfree; + goto out; + } + } +out: + *ptr = 0; + if (eof && (strlen(line) == 0)) { + free(line); + line = NULL; + } + return line; +outfree: + free(line); + return NULL; +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0, save_error = 0; + int interactive; + + interactive = init_input(); + + while(!done) { + line = readline(interactive ? parser_prompt : NULL); + + if (!line) break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + /* stop on error if not-interactive */ + if (rc != 0 && !interactive) { + if (save_error == 0) + save_error = rc; + if (!ignore_errors) + done = 1; + } + + free(line); + } + if (save_error) + rc = save_error; + return rc; +} + + +/* sets the parser prompt */ +void Parser_init(char * prompt, command_t * cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') + ret = sscanf(s, "%d", val); + else if (*(s+1) != 'x') + ret = sscanf(s, "%o", val); + else { + s++; + ret = sscanf(++s, "%x", val); + } + + return(ret); +} + + +void Parser_qhelp(int argc, char *argv[]) { + + printf("usage: %s [COMMAND] [OPTIONS]... [ARGS]\n", + program_invocation_short_name); + printf("Without any parameters, interactive mode is invoked\n"); + + printf("Try '%s help ' or '%s --list-commands' for more information\n", + program_invocation_short_name, program_invocation_short_name); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if ( argc == 1 ) { + Parser_qhelp(argc, argv); + return 0; + } + + /* Joining command line arguments without space is not critical here + * because of this string is used for search a help topic and assume + * that only one argument will be (the name of topic). For example: + * lst > help ping run + * pingrun: Unknown command. */ + line[0] = '\0'; + for (i = 1; i < argc; i++) { + if (strlen(argv[i]) >= sizeof(line) - strlen(line)) + return -E2BIG; + /* The function strlcat() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. 
*/ + strncat(line, argv[i], sizeof(line) - strlen(line)); + } + + switch ( process(line, &next, top_level, &result, &prev) ) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n",line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, result, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + Parser_help(2, argv); +} + + +/************************************************************************* + * COMMANDS * + *************************************************************************/ + +/** + * Parser_list_commands() - Output a list of the supported commands. + * @cmdlist: Array of structures describing the commands. + * @buffer: String buffer used to temporarily store the output text. + * @buf_size: Length of the string buffer. + * @parent_cmd: When called recursively, contains the name of the parent cmd. + * @col_start: Column where printing should begin. + * @col_num: The number of commands printed in a single row. + * + * The commands and subcommands supported by the utility are printed, arranged + * into several columns for readability. If a command supports subcommands, the + * function is called recursively, and the name of the parent command is + * supplied so that it can be prepended to the names of the subcommands. + * + * Return: The number of items that were printed. 
+ */ +int Parser_list_commands(const command_t *cmdlist, char *buffer, + size_t buf_size, const char *parent_cmd, + int col_start, int col_num) +{ + int col = col_start; + int char_max; + int len; + int count = 0; + int rc; + + if (col_start >= col_num) + return 0; + + char_max = (buf_size - 1) / col_num; /* Reserve 1 char for NUL */ + + for (; cmdlist->pc_name != NULL; cmdlist++) { + if (cmdlist->pc_func == NULL && cmdlist->pc_sub_cmd == NULL) + break; + count++; + if (parent_cmd != NULL) + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s %s", parent_cmd, + cmdlist->pc_name); + else + len = snprintf(&buffer[col * char_max], + char_max + 1, "%s", cmdlist->pc_name); + + /* Add trailing spaces to pad the entry to the column size */ + if (len < char_max) { + snprintf(&buffer[col * char_max] + len, + char_max - len + 1, "%*s", char_max - len, + " "); + } else { + buffer[(col + 1) * char_max - 1] = ' '; + } + + col++; + if (col >= col_num) { + fprintf(stdout, "%s\n", buffer); + col = 0; + buffer[0] = '\0'; + } + + if (cmdlist->pc_sub_cmd != NULL) { + rc = Parser_list_commands(cmdlist->pc_sub_cmd, buffer, + buf_size, cmdlist->pc_name, + col, col_num); + col = (col + rc) % col_num; + count += rc; + } + } + if (parent_cmd == NULL && col != 0) + fprintf(stdout, "%s\n", buffer); + return count; +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + theprompt = malloc(size); + assert(theprompt); + + sprintf(theprompt, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + /* The function strlcpy() cannot be used here because of + * this function is used in LNet utils that is not linked + * with libcfs.a. */ + if (line == NULL || *line == '\0') + strncpy(res, deft, len); + else + strncpy(res, line, len); + res[len - 1] = '\0'; + + if (line != NULL) { + free(line); + return res; + } + return NULL; +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + assert(theprompt); + sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); + + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if ( !line ) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if ( *line == '\0' ) { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if ( rc != 0 ) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if ( result > max || result < min ) { + fprintf(stdout, "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while ( 1 ) ; + + if (theprompt) + free(theprompt); + return result; + +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + assert(theprompt); + + fflush(stdout); + + if ( deft != 0 && deft != 1 ) { + fprintf(stderr, "Error: Parser_getbool given bad default %d\n", + deft); + assert ( 0 ); + } + sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? 
"N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if ( line == NULL ) { + result = deft; + break; + } + if ( *line == '\0' ) { + result = deft; + break; + } + if ( *line == 'y' || *line == 'Y' ) { + result = 1; + break; + } + if ( *line == 'n' || *line == 'N' ) { + result = 0; + break; + } + if ( line ) + free(line); + fprintf(stdout, "Invalid string. Must start with yY or nN\n"); + fflush(stdout); + } while ( 1 ); + + if ( line ) + free(line); + if ( theprompt ) + free(theprompt); + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + rc = Parser_arg2int(inp, &result, base); + + if ( rc == 0 ) { + return result; + } else { + return Parser_getint(prompt, deft, min, max, base); + } +} + +/* parse int out of a string or prompt for it */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if ( inp == NULL || *inp == '\0' ) { + return Parser_getstr(prompt, deft, answer, len); + } else + return inp; +} + +/* change a string into a number: return 0 on success. No invalid characters + allowed. The processing of base and validity follows strtol(3)*/ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ( (base !=0) && (base < 2 || base > 36) ) + return 1; + + *result = strtol(inp, &endptr, base); + + if ( *inp != '\0' && *endptr == '\0' ) + return 0; + else + return 1; +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size (int *sizep, char *str) { + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool (int *b, char *str) { + if (!strcasecmp (str, "no") || + !strcasecmp (str, "n") || + !strcasecmp (str, "off") || + !strcasecmp (str, "down") || + !strcasecmp (str, "disable")) + { + *b = 0; + return (0); + } + + if (!strcasecmp (str, "yes") || + !strcasecmp (str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "up") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} + +int Parser_version(int argc, char **argv) +{ + fprintf(stdout, "%s %s\n", program_invocation_short_name, + LUSTRE_VERSION_STRING); + return 0; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c new file mode 100644 index 0000000000000..9078500020bb9 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -0,0 +1,482 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. + * + * libcfs/libcfs/util/string.c + * + * Author: Nathan Rutman + */ +#include +#include +#include +#include +#include +#include +#include + +/* + * According manual of strlcpy() and strlcat() the functions should return + * the total length of the string they tried to create. For strlcpy() that + * means the length of src. For strlcat() that means the initial length of + * dst plus the length of src. So, the function strnlen() cannot be used + * otherwise the return value will be wrong. + */ +#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcpy(char *dst, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dst, src, len); + dst[len] = '\0'; + } + return ret; +} +#endif + +#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcat(char *dst, const char *src, size_t size) +{ + size_t dsize = strlen(dst); + size_t len = strlen(src); + size_t ret = dsize + len; + + dst += dsize; + size -= dsize; + if (len >= size) + len = size-1; + memcpy(dst, src, len); + dst[len] = '\0'; + return ret; +} +#endif + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. + * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!isspace(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!isspace(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. 
+ * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + * src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +static int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + re = calloc(1, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + free(re); + return -EINVAL; +} + +/** + * Print the range expression \a re into specified \a buffer. + * If \a bracketed is true, expression does not need additional + * brackets. + * + * \retval number of characters written + */ +static int +cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, + bool bracketed) +{ + int i; + char s[] = "["; + char e[] = "]"; + + if (bracketed) + s[0] = e[0] = '\0'; + + if (expr->re_lo == expr->re_hi) + i = snprintf(buffer, count, "%u", expr->re_lo); + else if (expr->re_stride == 1) + i = snprintf(buffer, count, "%s%u-%u%s", + s, expr->re_lo, expr->re_hi, e); + else + i = snprintf(buffer, count, "%s%u-%u/%u%s", + s, expr->re_lo, expr->re_hi, + expr->re_stride, e); + return i; +} + +/** + * Print a list of range expressions (\a expr_list) into specified \a buffer. + * If the list contains several expressions, separate them with comma + * and surround the list with brackets. 
+ * + * \retval number of characters written + */ +int +cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + int i = 0, j = 0; + int numexprs = 0; + + if (count <= 0) + return 0; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) + numexprs++; + + if (numexprs > 1) + i += snprintf(buffer + i, count - i, "["); + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (j++ != 0) + i += snprintf(buffer + i, count - i, ","); + i += cfs_range_expr_print(buffer + i, count - i, expr, + numexprs > 1); + } + + if (numexprs > 1) + i += snprintf(buffer + i, count - i, "]"); + + return i; +} + +/** + * Matches value (\a value) against ranges expression list \a expr_list. + * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) + return -EINVAL; + + val = calloc(sizeof(val[0]), count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} + +void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + free(values); +} + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link); + list_del(&expr->re_link); + free(expr); + } + + free(expr_list); +} + +/** + * Parses \ token of the syntax. 
+ * + * \retval 0 if \a str parses to \ | \ + * \retval -errno otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + expr_list = calloc(1, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. + * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c new file mode 100644 index 0000000000000..359ca943e95d8 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -0,0 +1,501 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/libcfs/watchdog.c + * + * Author: Jacob Berkman + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "tracefile.h" + +struct lc_watchdog { + spinlock_t lcw_lock; /* check or change lcw_list */ + int lcw_refcount; /* must hold lcw_pending_timers_lock */ + struct timer_list lcw_timer; /* kernel timer */ + struct list_head lcw_list; /* chain on pending list */ + ktime_t lcw_last_touched;/* last touched stamp */ + struct task_struct *lcw_task; /* owner task */ + void (*lcw_callback)(pid_t, void *); + void *lcw_data; + + pid_t lcw_pid; + + enum { + LC_WATCHDOG_DISABLED, + LC_WATCHDOG_ENABLED, + LC_WATCHDOG_EXPIRED + } lcw_state; +}; + +#ifdef WITH_WATCHDOG +/* + * The dispatcher will complete lcw_start_completion when it starts, + * and lcw_stop_completion when it exits. + * Wake lcw_event_waitq to signal timer callback dispatches. + */ +static struct completion lcw_start_completion; +static struct completion lcw_stop_completion; +static wait_queue_head_t lcw_event_waitq; + +/* + * Set this and wake lcw_event_waitq to stop the dispatcher. + */ +enum { + LCW_FLAG_STOP = 0 +}; +static unsigned long lcw_flags = 0; + +/* + * Number of outstanding watchdogs. + * When it hits 1, we start the dispatcher. + * When it hits 0, we stop the dispatcher. + */ +static __u32 lcw_refcount = 0; +static DEFINE_MUTEX(lcw_refcount_mutex); + +/* + * List of timers that have fired that need their callbacks run by the + * dispatcher. + */ +/* BH lock! */ +static DEFINE_SPINLOCK(lcw_pending_timers_lock); +static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers); + +/* Last time a watchdog expired */ +static time64_t lcw_last_watchdog_time; +static int lcw_recent_watchdog_count; + +static void +lcw_dump(struct lc_watchdog *lcw) +{ + ENTRY; + rcu_read_lock(); + if (lcw->lcw_task == NULL) { + LCONSOLE_WARN("Process %d was not found in the task " + "list; watchdog callback may be incomplete\n", + (int)lcw->lcw_pid); + } else { + libcfs_debug_dumpstack(lcw->lcw_task); + } + + rcu_read_unlock(); + EXIT; +} + +static void lcw_cb(uintptr_t data) +{ + struct lc_watchdog *lcw = (struct lc_watchdog *)data; + ENTRY; + + if (lcw->lcw_state != LC_WATCHDOG_ENABLED) { + EXIT; + return; + } + + lcw->lcw_state = LC_WATCHDOG_EXPIRED; + + spin_lock_bh(&lcw->lcw_lock); + LASSERT(list_empty(&lcw->lcw_list)); + + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount++; /* +1 for pending list */ + list_add(&lcw->lcw_list, &lcw_pending_timers); + wake_up(&lcw_event_waitq); + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + EXIT; +} + +static int is_watchdog_fired(void) +{ + int rc; + + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) + return 1; + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + return rc; +} + +static void lcw_dump_stack(struct lc_watchdog *lcw) +{ + time64_t current_time = ktime_get_seconds(); + struct timespec64 timediff; + time64_t delta_time; + + timediff = ktime_to_timespec64(ktime_sub(ktime_get(), + lcw->lcw_last_touched)); + + /* LU-9235: Don't dump stack if the thread is just touched. */ + if (timediff.tv_sec == 0) + return; + + /* + * Check to see if we should throttle the watchdog timer to avoid + * too many dumps going to the console thus triggering an NMI. 
+ */ + delta_time = current_time - lcw_last_watchdog_time; + if (delta_time < libcfs_watchdog_ratelimit && + lcw_recent_watchdog_count > 3) { + LCONSOLE_WARN("Service thread pid %u was inactive for %lu.%.02lus. Watchdog stack traces are limited to 3 per %d seconds, skipping this one.\n", + (int)lcw->lcw_pid, + timediff.tv_sec, + timediff.tv_nsec / (NSEC_PER_SEC / 100), + libcfs_watchdog_ratelimit); + } else { + if (delta_time < libcfs_watchdog_ratelimit) { + lcw_recent_watchdog_count++; + } else { + memcpy(&lcw_last_watchdog_time, ¤t_time, + sizeof(current_time)); + lcw_recent_watchdog_count = 0; + } + + LCONSOLE_WARN("Service thread pid %u was inactive for %lu.%.02lus. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n", + (int)lcw->lcw_pid, + timediff.tv_sec, + timediff.tv_nsec / (NSEC_PER_SEC / 100)); + lcw_dump(lcw); + } +} + +/* + * Provided watchdog handlers + */ + +static void lc_watchdog_dumplog(pid_t pid, void *data) +{ + libcfs_debug_dumplog_internal((void *)((uintptr_t)pid)); +} + +static int lcw_dispatch_main(void *data) +{ + int rc = 0; + struct lc_watchdog *lcw; + struct list_head zombies = LIST_HEAD_INIT(zombies); + + ENTRY; + + complete(&lcw_start_completion); + + while (1) { + int dumplog = 1; + + rc = wait_event_interruptible(lcw_event_waitq, + is_watchdog_fired()); + CDEBUG(D_INFO, "Watchdog got woken up...\n"); + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { + CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n"); + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + if (rc) { + CERROR("pending timers list was not empty at " + "time of watchdog dispatch shutdown\n"); + } + break; + } + + spin_lock_bh(&lcw_pending_timers_lock); + while (!list_empty(&lcw_pending_timers)) { + int is_dumplog; + + lcw = list_entry(lcw_pending_timers.next, + struct lc_watchdog, lcw_list); + /* +1 ref for callback to make sure lwc wouldn't be + * deleted after releasing lcw_pending_timers_lock */ + lcw->lcw_refcount++; + spin_unlock_bh(&lcw_pending_timers_lock); + + /* lock ordering */ + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + + if (list_empty(&lcw->lcw_list)) { + /* already removed from pending list */ + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + list_add(&lcw->lcw_list, &zombies); + spin_unlock_bh(&lcw->lcw_lock); + /* still hold lcw_pending_timers_lock */ + continue; + } + + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + + CDEBUG(D_INFO, "found lcw for pid %d\n", + lcw->lcw_pid); + lcw_dump_stack(lcw); + + is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog; + if (lcw->lcw_state != LC_WATCHDOG_DISABLED && + (dumplog || !is_dumplog)) { + lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data); + if (dumplog && is_dumplog) + dumplog = 0; + } + + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + list_add(&lcw->lcw_list, &zombies); + } + spin_unlock_bh(&lcw_pending_timers_lock); + + while (!list_empty(&zombies)) { + lcw = list_entry(zombies.next, + struct lc_watchdog, lcw_list); + list_del_init(&lcw->lcw_list); + LIBCFS_FREE(lcw, sizeof(*lcw)); + } + } + + complete(&lcw_stop_completion); + + RETURN(rc); +} + +static void lcw_dispatch_start(void) +{ + struct task_struct *task; + + ENTRY; + 
LASSERT(lcw_refcount == 1); + + init_completion(&lcw_stop_completion); + init_completion(&lcw_start_completion); + init_waitqueue_head(&lcw_event_waitq); + + CDEBUG(D_INFO, "starting dispatch thread\n"); + task = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd"); + if (IS_ERR(task)) { + CERROR("error spawning watchdog dispatch thread: %ld\n", + PTR_ERR(task)); + EXIT; + return; + } + wait_for_completion(&lcw_start_completion); + CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n"); + + EXIT; +} + +static void lcw_dispatch_stop(void) +{ + ENTRY; + LASSERT(lcw_refcount == 0); + + CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); + + set_bit(LCW_FLAG_STOP, &lcw_flags); + wake_up(&lcw_event_waitq); + + wait_for_completion(&lcw_stop_completion); + + CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); + + EXIT; +} + +struct lc_watchdog *lc_watchdog_add(int timeout, + void (*callback)(pid_t, void *), + void *data) +{ + struct lc_watchdog *lcw = NULL; + ENTRY; + + LIBCFS_ALLOC(lcw, sizeof(*lcw)); + if (lcw == NULL) { + CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + spin_lock_init(&lcw->lcw_lock); + lcw->lcw_refcount = 1; /* refcount for owner */ + lcw->lcw_task = current; + lcw->lcw_pid = current_pid(); + lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog; + lcw->lcw_data = data; + lcw->lcw_state = LC_WATCHDOG_DISABLED; + + INIT_LIST_HEAD(&lcw->lcw_list); + setup_timer(&lcw->lcw_timer, lcw_cb, (unsigned long)lcw); + + mutex_lock(&lcw_refcount_mutex); + if (++lcw_refcount == 1) + lcw_dispatch_start(); + mutex_unlock(&lcw_refcount_mutex); + + /* Keep this working in case we enable them by default */ + if (lcw->lcw_state == LC_WATCHDOG_ENABLED) { + lcw->lcw_last_touched = ktime_get(); + mod_timer(&lcw->lcw_timer, cfs_time_seconds(timeout) + + jiffies); + } + + RETURN(lcw); +} +EXPORT_SYMBOL(lc_watchdog_add); + +static void lcw_update_time(struct lc_watchdog *lcw, const char *message) +{ + ktime_t newtime = ktime_get(); + + if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) { + ktime_t lapse = ktime_sub(newtime, lcw->lcw_last_touched); + struct timespec64 timediff; + + timediff = ktime_to_timespec64(lapse); + LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. 
This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).\n", + lcw->lcw_pid, message, + timediff.tv_sec, + timediff.tv_nsec / (NSEC_PER_SEC / 100)); + } + lcw->lcw_last_touched = newtime; +} + +static void lc_watchdog_del_pending(struct lc_watchdog *lcw) +{ + spin_lock_bh(&lcw->lcw_lock); + if (unlikely(!list_empty(&lcw->lcw_list))) { + spin_lock_bh(&lcw_pending_timers_lock); + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + spin_unlock_bh(&lcw_pending_timers_lock); + } + + spin_unlock_bh(&lcw->lcw_lock); +} + +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) +{ + ENTRY; + LASSERT(lcw != NULL); + + lc_watchdog_del_pending(lcw); + + lcw_update_time(lcw, "resumed"); + + mod_timer(&lcw->lcw_timer, jiffies + cfs_time_seconds(timeout)); + lcw->lcw_state = LC_WATCHDOG_ENABLED; + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ + ENTRY; + LASSERT(lcw != NULL); + + lc_watchdog_del_pending(lcw); + + lcw_update_time(lcw, "completed"); + lcw->lcw_state = LC_WATCHDOG_DISABLED; + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ + int dead; + + ENTRY; + LASSERT(lcw != NULL); + + del_timer(&lcw->lcw_timer); + + lcw_update_time(lcw, "stopped"); + + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + if (unlikely(!list_empty(&lcw->lcw_list))) { + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + } + + lcw->lcw_refcount--; /* -1 ref for owner */ + dead = lcw->lcw_refcount == 0; + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + + if (dead) + LIBCFS_FREE(lcw, sizeof(*lcw)); + + mutex_lock(&lcw_refcount_mutex); + if (--lcw_refcount == 0) + lcw_dispatch_stop(); + mutex_unlock(&lcw_refcount_mutex); + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_delete); + +#else /* !defined(WITH_WATCHDOG) */ + +struct lc_watchdog *lc_watchdog_add(int timeout, + void (*callback)(pid_t pid, void *), + void *data) +{ + static struct lc_watchdog watchdog; + return &watchdog; +} +EXPORT_SYMBOL(lc_watchdog_add); + +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) +{ +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_delete); + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c new file mode 100644 index 0000000000000..fb4fd643ee0c0 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -0,0 +1,469 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
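Editorial illustration, not part of the patch: the four exported calls above give a service thread a simple arm/park lifecycle. A minimal sketch of that pattern follows; the thread function my_service_thread() and the 30-second budget are invented, and passing a NULL callback selects the lc_watchdog_dumplog default shown earlier.

static int my_service_thread(void *arg)
{
	struct lc_watchdog *watchdog;
	const int timeout = 30;		/* seconds of allowed inactivity */

	watchdog = lc_watchdog_add(timeout, NULL, NULL);
	if (IS_ERR(watchdog))
		return PTR_ERR(watchdog);

	while (!kthread_should_stop()) {
		/* (re)arm the timer before handling the next request */
		lc_watchdog_touch(watchdog, timeout);

		/* ... service one request; taking longer than "timeout"
		 * seconds makes the dispatcher dump this task's stack ... */

		/* park the watchdog while idle so it cannot fire */
		lc_watchdog_disable(watchdog);
	}

	lc_watchdog_delete(watchdog);
	return 0;
}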
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define CFS_WS_NAME_LEN 16 + +struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +}; + +static struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline int +cfs_wi_sched_cansleep(struct cfs_wi_sched *sched) +{ + spin_lock(&sched->ws_lock); + if (sched->ws_stopping) { + spin_unlock(&sched->ws_lock); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + return 0; + } + spin_unlock(&sched->ws_lock); + return 1; +} + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. + */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + LASSERT(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + spin_unlock(&sched->ws_lock); + + return; +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. 
+ */ + spin_lock(&sched->ws_lock); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + spin_unlock(&sched->ws_lock); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + spin_lock(&sched->ws_lock); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + spin_unlock(&sched->ws_lock); + return; +} +EXPORT_SYMBOL(cfs_wi_schedule); + +static int +cfs_wi_scheduler(void *arg) +{ + struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg; + + cfs_block_allsigs(); + + /* CPT affinity scheduler? */ + if (sched->ws_cptab != NULL) + if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0) + CWARN("Unable to bind %s on CPU partition %d\n", + sched->ws_name, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + spin_lock(&sched->ws_lock); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + struct cfs_workitem *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + struct cfs_workitem, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + spin_unlock(&sched->ws_lock); + nloops++; + + rc = (*wi->wi_action) (wi); + + spin_lock(&sched->ws_lock); + if (rc != 0) /* WI should be dead, even be freed! 
*/ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + spin_unlock(&sched->ws_lock); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + spin_lock(&sched->ws_lock); + continue; + } + + spin_unlock(&sched->ws_lock); + rc = wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched)); + spin_lock(&sched->ws_lock); + } + + spin_unlock(&sched->ws_lock); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + { + int i = 2; + + while (sched->ws_nthreads > 0) { + CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET, + "waiting for %d threads of WI sched[%s] to " + "terminate\n", sched->ws_nthreads, + sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + if (strlen(name) > sizeof(sched->ws_name)-1) { + LIBCFS_FREE(sched, sizeof(*sched)); + return -E2BIG; + } + strlcpy(sched->ws_name, name, sizeof(sched->ws_name)); + + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + for (; nthrs > 0; nthrs--) { + char name[16]; + struct task_struct *task; + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02d", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02d", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, name); + if (IS_ERR(task)) { + int rc = PTR_ERR(task); + + CERROR("Failed to create thread for " + "WI scheduler %s: %d\n", name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, 
&cfs_wi_data.wi_scheds); + sched->ws_starting--; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + } + + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown (void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/drivers/staging/lustrefsx/lnet/include/cyaml.h b/drivers/staging/lustrefsx/lnet/include/cyaml.h new file mode 100644 index 0000000000000..c9c21c750a45d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/cyaml.h @@ -0,0 +1,257 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of the + * License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * LGPL HEADER END + * + * Copyright (c) 2014, Intel Corporation. + * + * Author: + * Amir Shehata + */ + +#ifndef CYAML_H +#define CYAML_H + +#include + +enum cYAML_object_type { + CYAML_TYPE_FALSE = 0, + CYAML_TYPE_TRUE, + CYAML_TYPE_NULL, + CYAML_TYPE_NUMBER, + CYAML_TYPE_STRING, + CYAML_TYPE_ARRAY, + CYAML_TYPE_OBJECT +}; + +struct cYAML { + /* next/prev allow you to walk array/object chains. */ + struct cYAML *cy_next, *cy_prev; + /* An array or object item will have a child pointer pointing + to a chain of the items in the array/object. */ + struct cYAML *cy_child; + /* The type of the item, as above. */ + enum cYAML_object_type cy_type; + + /* The item's string, if type==CYAML_TYPE_STRING */ + char *cy_valuestring; + /* The item's number, if type==CYAML_TYPE_NUMBER */ + int cy_valueint; + /* The item's number, if type==CYAML_TYPE_NUMBER */ + double cy_valuedouble; + /* The item's name string, if this item is the child of, + or is in the list of subitems of an object. 
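Editorial illustration, not part of the patch: with the workitem scheduler complete above, a client creates a scheduler, hands it items whose wi_action() does the work, and tears it down again. The sketch below makes the following assumptions: struct my_flusher, my_flush_action() and the "my_flush" name are invented, and the workitem is initialised field-by-field with exactly the fields the run loop above touches (the libcfs headers provide an init helper for this, not shown in this hunk).

struct my_flusher {			/* invented embedding object */
	struct cfs_workitem	mf_wi;
	/* ... private state would live here ... */
};

static int my_flush_action(struct cfs_workitem *wi)
{
	struct my_flusher *mf = container_of(wi, struct my_flusher, mf_wi);

	/* ... do one batch of work for mf ... */
	(void)mf;

	/* 0 keeps the item alive; non-zero tells the scheduler loop above
	 * that the item may already have been freed */
	return 0;
}

static int my_flusher_start(struct my_flusher *mf, struct cfs_wi_sched **schedp)
{
	int rc;

	/* one scheduler thread, no CPT affinity */
	rc = cfs_wi_sched_create("my_flush", NULL, CFS_CPT_ANY, 1, schedp);
	if (rc != 0)
		return rc;

	INIT_LIST_HEAD(&mf->mf_wi.wi_list);
	mf->mf_wi.wi_action    = my_flush_action;
	mf->mf_wi.wi_running   = 0;
	mf->mf_wi.wi_scheduled = 0;

	cfs_wi_schedule(*schedp, &mf->mf_wi);
	return 0;
}

static void my_flusher_stop(struct my_flusher *mf, struct cfs_wi_sched *sched)
{
	/* cancel a pending (not yet running) schedule, then tear down the
	 * scheduler; cfs_wi_sched_destroy() waits for its threads to exit */
	cfs_wi_deschedule(sched, &mf->mf_wi);
	cfs_wi_sched_destroy(sched);
}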
*/ + char *cy_string; + /* user data which might need to be tracked per object */ + void *cy_user_data; +}; + +typedef void (*cYAML_user_data_free_cb)(void *); + +/* + * cYAML_walk_cb + * Callback called when recursing through the tree + * + * cYAML* - pointer to the node currently being visitied + * void* - user data passed to the callback. + * void** - output value from the callback + * + * Returns true to continue recursing. false to stop recursing + */ +typedef bool (*cYAML_walk_cb)(struct cYAML *, void *, void**); + +/* + * cYAML_build_tree + * Build a tree representation of the YAML formatted text passed in. + * + * yaml_file - YAML file to parse and build tree representation + * yaml_blk - blk of YAML. yaml_file takes precedence if both + * are defined. + * yaml_blk_size - length of the yaml block (obtained via strlen) + */ +struct cYAML *cYAML_build_tree(char *yaml_file, const char *yaml_blk, + size_t yaml_blk_size, + struct cYAML **err_str, bool debug); + +/* + * cYAML_print_tree + * Print the textual representation of a YAML tree to stderr + * + * node - Node where you want to start printing + */ +void cYAML_print_tree(struct cYAML *node); + +/* + * cYAML_print_tree2file + * Print the textual representation of a YAML tree to file + * + * f - file to print to + * node - Node where you want to start printing + */ +void cYAML_print_tree2file(FILE *f, struct cYAML *node); + +/* + * cYAML_free_tree + * Free the cYAML tree returned as part of the cYAML_build_tree + * + * node - root of the tree to be freed + */ +void cYAML_free_tree(struct cYAML *node); + +/* + * cYAML_get_object_item + * Returns the cYAML object which key correspods to the name passed in + * This function searches only through the current level. + * + * parent - is the parent object on which you want to conduct the search + * name - key name of the object you want to find. + */ +struct cYAML *cYAML_get_object_item(struct cYAML *parent, + const char *name); + +/* + * cYAML_get_next_seq_item + * Returns the next item in the YAML sequence. This function uses the + * itm parameter to keep track of its position in the sequence. If the + * itm parameter is reset to NULL between calls that resets and returns + * the first item in the sequence. + * This function returns NULL when there are no more items in the + * sequence. + * + * seq - is the head node of the YAML sequence + * itm - [OUT] next sequence item to continue looking from next time. + * + */ +struct cYAML *cYAML_get_next_seq_item(struct cYAML *seq, + struct cYAML **itm); + +/* + * cYAML_is_seq + * Returns 1 if the node provided is an ARRAY 0 otherwise + * + * node - the node to examine + * + */ +bool cYAML_is_sequence(struct cYAML *node); + +/* + * cYAML_find_object + * Returns the cYAML object which key correspods to the name passed in + * this function searches the entire tree. + * + * root - is the root of the tree on which you want to conduct the search + * name - key name of the object you want to find. + */ +struct cYAML *cYAML_find_object(struct cYAML *root, const char *key); + +/* + * cYAML_clean_usr_data + * walks the tree and for each node with some user data it calls the + * free_cb with the user data as a parameter. 
+ * + * node: node to start the walk from + * free_cb: cb to call to cleanup the user data + */ +void cYAML_clean_usr_data(struct cYAML *node, + cYAML_user_data_free_cb free_cb); + +/* + * cYAML_create_object + * Creates a CYAML of type OBJECT + * + * parent - parent node + * key - node key + */ +struct cYAML *cYAML_create_object(struct cYAML *parent, char *key); + +/* + * cYAML_create_seq + * Creates a CYAML of type ARRAY + * Once this is created, more sequence items can be added. + * + * parent - parent node + * key - node key + */ +struct cYAML *cYAML_create_seq(struct cYAML *parent, char *key); + +/* + * cYAML_create_object + * Create a sequence item, which can have more entites added underneath + * it + * + * parent - parent node + */ +struct cYAML *cYAML_create_seq_item(struct cYAML *seq); + +/* + * cYAML_create_string + * Creates a cYAML node of type STRING + * + * parent - parent node + * key - node key + * value - value of node + */ +struct cYAML *cYAML_create_string(struct cYAML *parent, char *key, + char *value); + +/* + * cYAML_create_string + * Creates a cYAML node of type STRING + * + * parent - parent node + * key - node key + * value - value of node + */ +struct cYAML *cYAML_create_number(struct cYAML *parent, char *key, + double value); + +/* + * cYAML_insert_sibling + * inserts one cYAML object as a sibling to another + * + * root - root node to have a sibling added to + * sibling - sibling to be added + */ +void cYAML_insert_sibling(struct cYAML *root, struct cYAML *sibling); + +/* + * cYAML_insert_child + * inserts one cYAML object as a child to another + * + * parent - parent node to have a child added to + * child - child to be added + */ +void cYAML_insert_child(struct cYAML *parent, struct cYAML *node); + +/* + * cYAML_build_error + * Build a YAML error message given: + * + * rc - return code to add in the error + * seq_no - a sequence number to add in the error + * cmd - the command that failed. + * entity - command entity that failed. + * err_str - error string to add in the error + * root - the root to which to add the YAML error + */ +void cYAML_build_error(int rc, int seq_no, char *cmd, + char *entity, char *err_str, + struct cYAML **root); + + +#endif /* CYAML_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h new file mode 100644 index 0000000000000..84c6bd0039632 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. 
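Editorial illustration, not part of the patch: the header above is the whole public surface of the cYAML helper, so a user-space caller only needs the parse/lookup/free cycle sketched below. The "net" key and show_net_field() are invented, and the error handling assumes the tree returned through err_str can be printed and freed like any other node, which the comments imply but do not state.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
/* #include "cyaml.h" -- wherever the lnet userland build installs it */

static int show_net_field(const char *yaml_blk)
{
	struct cYAML *err = NULL;
	struct cYAML *tree, *node;

	/* parse from a memory block; pass a file name instead to read a file */
	tree = cYAML_build_tree(NULL, yaml_blk, strlen(yaml_blk), &err, false);
	if (tree == NULL) {
		if (err != NULL) {
			cYAML_print_tree2file(stderr, err);
			cYAML_free_tree(err);
		}
		return -1;
	}

	/* searches the current level only; cYAML_find_object() would
	 * search the whole tree */
	node = cYAML_get_object_item(tree, "net");
	if (node != NULL && node->cy_type == CYAML_TYPE_STRING)
		printf("net = %s\n", node->cy_valuestring);

	cYAML_free_tree(tree);
	return 0;
}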
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * @{ + */ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. + * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. + * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, struct lnet_process_id *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +lnet_nid_t LNetPrimaryNID(lnet_nid_t nid); +bool LNetIsPeerLocal(lnet_nid_t nid); + +/** @} lnet_addr */ + + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach() + * and LNetMEInsert(), and removed from its list by LNetMEUnlink(). + * @{ */ +int LNetMEAttach(unsigned int portal, + struct lnet_process_id match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + enum lnet_unlink unlink_in, + enum lnet_ins_pos pos_in, + struct lnet_handle_me *handle_out); + +int LNetMEInsert(struct lnet_handle_me current_in, + struct lnet_process_id match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + enum lnet_unlink unlink_in, + enum lnet_ins_pos position_in, + struct lnet_handle_me *handle_out); + +int LNetMEUnlink(struct lnet_handle_me current_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. 
Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(struct lnet_handle_me current_in, + struct lnet_md md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int LNetMDBind(struct lnet_md md_in, + enum lnet_unlink unlink_in, + struct lnet_handle_md *md_handle_out); + +int LNetMDUnlink(struct lnet_handle_md md_in); +/** @} lnet_md */ + +/** \defgroup lnet_eq Events and event queues + * + * Event queues (abbreviated as EQ) are used to log operations performed on + * local MDs. In particular, they signal the completion of a data transmission + * into or out of a MD. They can also be used to hold acknowledgments for + * completed PUT operations and indicate when a MD has been unlinked. Multiple + * MDs can share a single EQ. An EQ may have an optional event handler + * associated with it. If an event handler exists, it will be run for each + * event that is deposited into the EQ. + * + * In addition to the struct lnet_handle_eq, the LNet API defines two types + * associated with events: The ::lnet_event_kind defines the kinds of events + * that can be stored in an EQ. The struct lnet_event defines a structure that + * holds the information about with an event. + * + * There are five functions for dealing with EQs: LNetEQAlloc() is used to + * create an EQ and allocate the resources needed, while LNetEQFree() + * releases these resources and free the EQ. LNetEQGet() retrieves the next + * event from an EQ, and LNetEQWait() can be used to block a process until + * an EQ has at least one event. LNetEQPoll() can be used to test or wait + * on multiple EQs. + * @{ */ +int LNetEQAlloc(unsigned int count_in, + lnet_eq_handler_t handler, + struct lnet_handle_eq *handle_out); + +int LNetEQFree(struct lnet_handle_eq eventq_in); + +int LNetEQGet(struct lnet_handle_eq eventq_in, + struct lnet_event *event_out); + +int LNetEQWait(struct lnet_handle_eq eventq_in, + struct lnet_event *event_out); + +int LNetEQPoll(struct lnet_handle_eq *eventqs_in, + int neq_in, + signed long timeout, + struct lnet_event *event_out, + int *which_eq_out); +/** @} lnet_eq */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). + * @{ */ +int LNetPut(lnet_nid_t self, + struct lnet_handle_md md_in, + enum lnet_ack_req ack_req_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + struct lnet_handle_md md_in, + struct lnet_process_id target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in); +/** @} lnet_data */ + + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. 
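Editorial illustration, not part of the patch: the PUT path ties the object families above together, with an MD describing the buffer, an EQ (set up beforehand via LNetEQAlloc()) collecting the completion event, and LNetPut() moving the data. The sender-side sketch below assumes the struct lnet_md field names and the LNET_UNLINK / LNET_NOACK_REQ / LNET_NID_ANY constants from lnet/types.h, which is not part of this hunk.

static int send_blob(struct lnet_process_id peer, void *buf, unsigned int len,
		     struct lnet_handle_eq eqh, __u64 match_bits)
{
	struct lnet_md md;
	struct lnet_handle_md mdh;
	int rc;

	memset(&md, 0, sizeof(md));
	md.start     = buf;
	md.length    = len;
	md.threshold = 1;		/* one send event, then auto-unlink */
	md.options   = 0;
	md.user_ptr  = NULL;
	md.eq_handle = eqh;		/* from an earlier LNetEQAlloc() */

	rc = LNetMDBind(md, LNET_UNLINK, &mdh);
	if (rc != 0)
		return rc;

	/* self == LNET_NID_ANY lets LNet pick the outgoing NI */
	rc = LNetPut(LNET_NID_ANY, mdh, LNET_NOACK_REQ, peer,
		     10 /* portal */, match_bits, 0 /* offset */,
		     0 /* hdr data */);
	if (rc != 0)
		LNetMDUnlink(mdh);

	return rc;
}

A receiver would pair this with LNetMEAttach() plus LNetMDAttach() on the same portal and match bits, as the match-entry section above describes.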
+ * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +void LNetDebugPeer(struct lnet_process_id id); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h new file mode 100644 index 0000000000000..4141f7c492c22 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h @@ -0,0 +1,243 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + * + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * Author: Amir Shehata + */ + +#ifndef LNET_DLC_H +#define LNET_DLC_H + +#include +#include + +#define MAX_NUM_SHOW_ENTRIES 32 +#define LNET_MAX_STR_LEN 128 +#define LNET_MAX_SHOW_NUM_CPT 128 +#define LNET_UNDEFINED_HOPS ((__u32) -1) + +/* + * To allow for future enhancements to extend the tunables + * add a hdr to this structure, so that the version can be set + * and checked for backwards compatibility. Newer versions of LNet + * can still work with older versions of lnetctl. The restriction is + * that the structure can be added to and not removed from in order + * to not invalidate older lnetctl utilities. Moreover, the order of + * fields must remain the same, and new fields appended to the structure + * + * That said all existing LND tunables will be added in this structure + * to avoid future changes. 
+ */ +struct lnet_ioctl_config_lnd_cmn_tunables { + __u32 lct_version; + __s32 lct_peer_timeout; + __s32 lct_peer_tx_credits; + __s32 lct_peer_rtr_credits; + __s32 lct_max_tx_credits; +}; + +struct lnet_ioctl_config_o2iblnd_tunables { + __u32 lnd_version; + __u32 lnd_peercredits_hiw; + __u32 lnd_map_on_demand; + __u32 lnd_concurrent_sends; + __u32 lnd_fmr_pool_size; + __u32 lnd_fmr_flush_trigger; + __u32 lnd_fmr_cache; + __u16 lnd_conns_per_peer; + __u16 lnd_ntx; +}; + +struct lnet_lnd_tunables { + union { + struct lnet_ioctl_config_o2iblnd_tunables lnd_o2ib; + } lnd_tun_u; +}; + +struct lnet_ioctl_config_lnd_tunables { + struct lnet_ioctl_config_lnd_cmn_tunables lt_cmn; + struct lnet_lnd_tunables lt_tun; +}; + +struct lnet_ioctl_net_config { + char ni_interfaces[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; + __u32 ni_status; + __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; + char cfg_bulk[0]; +}; + +#define LNET_TINY_BUF_IDX 0 +#define LNET_SMALL_BUF_IDX 1 +#define LNET_LARGE_BUF_IDX 2 + +/* # different router buffer pools */ +#define LNET_NRBPOOLS (LNET_LARGE_BUF_IDX + 1) + +struct lnet_ioctl_pool_cfg { + struct { + __u32 pl_npages; + __u32 pl_nbuffers; + __u32 pl_credits; + __u32 pl_mincredits; + } pl_pools[LNET_NRBPOOLS]; + __u32 pl_routing; +}; + +struct lnet_ioctl_ping_data { + struct libcfs_ioctl_hdr ping_hdr; + + __u32 op_param; + __u32 ping_count; + __u32 ping_flags; + bool mr_info; + lnet_process_id_t ping_id; + lnet_process_id_t __user *ping_buf; +}; + +struct lnet_ioctl_config_data { + struct libcfs_ioctl_hdr cfg_hdr; + + __u32 cfg_net; + __u32 cfg_count; + __u64 cfg_nid; + __u32 cfg_ncpts; + + union { + struct { + __u32 rtr_hop; + __u32 rtr_priority; + __u32 rtr_flags; + } cfg_route; + struct { + char net_intf[LNET_MAX_STR_LEN]; + __s32 net_peer_timeout; + __s32 net_peer_tx_credits; + __s32 net_peer_rtr_credits; + __s32 net_max_tx_credits; + __u32 net_cksum_algo; + __u32 net_interface_count; + } cfg_net; + struct { + __u32 buf_enable; + __s32 buf_tiny; + __s32 buf_small; + __s32 buf_large; + } cfg_buffers; + } cfg_config_u; + + char cfg_bulk[0]; +}; + +struct lnet_ioctl_comm_count { + __u32 ico_get_count; + __u32 ico_put_count; + __u32 ico_reply_count; + __u32 ico_ack_count; + __u32 ico_hello_count; +}; + +struct lnet_ioctl_element_stats { + __u32 iel_send_count; + __u32 iel_recv_count; + __u32 iel_drop_count; +}; + +struct lnet_ioctl_element_msg_stats { + struct libcfs_ioctl_hdr im_hdr; + __u32 im_idx; + struct lnet_ioctl_comm_count im_send_stats; + struct lnet_ioctl_comm_count im_recv_stats; + struct lnet_ioctl_comm_count im_drop_stats; +}; + +/* + * lnet_ioctl_config_ni + * This structure describes an NI configuration. 
There are multiple components + * when configuring an NI: Net, Interfaces, CPT list and LND tunables + * A network is passed as a string to the DLC and translated using + * libcfs_str2net() + * An interface is the name of the system configured interface + * (ex eth0, ib1) + * CPT is the list of CPTS LND tunables are passed in the lic_bulk area + */ +struct lnet_ioctl_config_ni { + struct libcfs_ioctl_hdr lic_cfg_hdr; + lnet_nid_t lic_nid; + char lic_ni_intf[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; + char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; + __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; + __u32 lic_ncpts; + __u32 lic_status; + __u32 lic_tcp_bonding; + __u32 lic_idx; + __s32 lic_dev_cpt; + char pad[4]; + char lic_bulk[0]; +}; + +struct lnet_peer_ni_credit_info { + char cr_aliveness[LNET_MAX_STR_LEN]; + __u32 cr_refcount; + __s32 cr_ni_peer_tx_credits; + __s32 cr_peer_tx_credits; + __s32 cr_peer_min_tx_credits; + __u32 cr_peer_tx_qnob; + __s32 cr_peer_rtr_credits; + __s32 cr_peer_min_rtr_credits; + __u32 cr_ncpt; +}; + +struct lnet_ioctl_peer { + struct libcfs_ioctl_hdr pr_hdr; + __u32 pr_count; + __u32 pr_pad; + lnet_nid_t pr_nid; + + union { + struct lnet_peer_ni_credit_info pr_peer_credits; + } pr_lnd_u; +}; + +struct lnet_ioctl_peer_cfg { + struct libcfs_ioctl_hdr prcfg_hdr; + lnet_nid_t prcfg_prim_nid; + lnet_nid_t prcfg_cfg_nid; + __u32 prcfg_count; + bool prcfg_mr; + __u32 prcfg_state; + __u32 prcfg_size; + void __user *prcfg_bulk; +}; + +struct lnet_ioctl_numa_range { + struct libcfs_ioctl_hdr nr_hdr; + __u32 nr_range; +}; + +struct lnet_ioctl_lnet_stats { + struct libcfs_ioctl_hdr st_hdr; + struct lnet_counters st_cntrs; +}; + +#endif /* LNET_DLC_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h new file mode 100644 index 0000000000000..48db6dd08a2a3 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -0,0 +1,895 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +#ifndef __KERNEL__ +# error This include is only for kernel use. 
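Editorial illustration, not part of the patch: as the comment above explains, the tunables block is append-only and versioned so that newer LNet kernels keep working with older lnetctl binaries. A sketch of filling it for an o2iblnd network follows; the numeric values are placeholders, not defaults taken from the source.

static void fill_o2ib_tunables(struct lnet_ioctl_config_lnd_tunables *t)
{
	memset(t, 0, sizeof(*t));

	/* common part, understood by every version that knows this header */
	t->lt_cmn.lct_version          = 0;
	t->lt_cmn.lct_peer_timeout     = 180;
	t->lt_cmn.lct_peer_tx_credits  = 8;
	t->lt_cmn.lct_peer_rtr_credits = 0;
	t->lt_cmn.lct_max_tx_credits   = 256;

	/* LND-specific part lives in the union, keyed by the network type */
	t->lt_tun.lnd_tun_u.lnd_o2ib.lnd_map_on_demand    = 256;
	t->lt_tun.lnd_tun_u.lnd_o2ib.lnd_conns_per_peer   = 1;
	t->lt_tun.lnd_tun_u.lnd_o2ib.lnd_concurrent_sends = 0;
}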
+#endif + +#include +#include +#include +#include +#include + +extern struct lnet the_lnet; /* THE network */ + +#if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +#else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. + */ +# define LNET_CPT_MAX_BITS 8 +#endif /* BITS_PER_LONG == 32 */ + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +static inline int lnet_is_route_alive(struct lnet_route *route) +{ + if (!route->lr_gateway->lpni_alive) + return 0; /* gateway is down */ + if ((route->lr_gateway->lpni_ping_feats & + LNET_PING_FEAT_NI_STATUS) == 0) + return 1; /* no NI status, assume it's alive */ + /* has NI status, check # down NIs */ + return route->lr_downis == 0; +} + +static inline int lnet_is_wire_handle_none(struct lnet_handle_wire *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted(struct lnet_libmd *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable(struct lnet_libmd *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink, + * in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be power2, which means we can + * get illegal cpt from it's invalid cookie */ + return cpt < LNET_CPT_NUMBER ? 
cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_eq_wait_lock() spin_lock(&the_lnet.ln_eq_wait_lock) +#define lnet_eq_wait_unlock() spin_unlock(&the_lnet.ln_eq_wait_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) + +#define MAX_PORTALS 64 + +#define LNET_SMALL_MD_SIZE offsetof(struct lnet_libmd, md_iov.iov[1]) +extern struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +extern struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ + +static inline struct lnet_eq * +lnet_eq_alloc (void) +{ + struct lnet_eq *eq; + + LIBCFS_ALLOC(eq, sizeof(*eq)); + return (eq); +} + +static inline void +lnet_eq_free(struct lnet_eq *eq) +{ + LIBCFS_FREE(eq, sizeof(*eq)); +} + +static inline struct lnet_libmd * +lnet_md_alloc(struct lnet_md *umd) +{ + struct lnet_libmd *md; + unsigned int size; + unsigned int niov; + + if ((umd->options & LNET_MD_KIOV) != 0) { + niov = umd->length; + size = offsetof(struct lnet_libmd, md_iov.kiov[niov]); + } else { + niov = ((umd->options & LNET_MD_IOVEC) != 0) ? 
+ umd->length : 1; + size = offsetof(struct lnet_libmd, md_iov.iov[niov]); + } + + if (size <= LNET_SMALL_MD_SIZE) { + md = kmem_cache_alloc(lnet_small_mds_cachep, + GFP_NOFS | __GFP_ZERO); + if (md) { + CDEBUG(D_MALLOC, "slab-alloced 'md' of size %u at " + "%p.\n", size, md); + } else { + CDEBUG(D_MALLOC, "failed to allocate 'md' of size %u\n", + size); + return NULL; + } + } else { + LIBCFS_ALLOC(md, size); + } + + if (md != NULL) { + /* Set here in case of early free */ + md->md_options = umd->options; + md->md_niov = niov; + INIT_LIST_HEAD(&md->md_list); + } + + return md; +} + +static inline void +lnet_md_free(struct lnet_libmd *md) +{ + unsigned int size; + + if ((md->md_options & LNET_MD_KIOV) != 0) + size = offsetof(struct lnet_libmd, md_iov.kiov[md->md_niov]); + else + size = offsetof(struct lnet_libmd, md_iov.iov[md->md_niov]); + + if (size <= LNET_SMALL_MD_SIZE) { + CDEBUG(D_MALLOC, "slab-freed 'md' at %p.\n", md); + kmem_cache_free(lnet_small_mds_cachep, md); + } else { + LIBCFS_FREE(md, size); + } +} + +static inline struct lnet_me * +lnet_me_alloc (void) +{ + struct lnet_me *me; + + me = kmem_cache_alloc(lnet_mes_cachep, GFP_NOFS | __GFP_ZERO); + + if (me) + CDEBUG(D_MALLOC, "slab-alloced 'me' at %p.\n", me); + else + CDEBUG(D_MALLOC, "failed to allocate 'me'\n"); + + return me; +} + +static inline void +lnet_me_free(struct lnet_me *me) +{ + CDEBUG(D_MALLOC, "slab-freed 'me' at %p.\n", me); + kmem_cache_free(lnet_mes_cachep, me); +} + +struct lnet_libhandle *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh); +static inline void +lnet_res_lh_invalidate(struct lnet_libhandle *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_eq2handle(struct lnet_handle_eq *handle, struct lnet_eq *eq) +{ + if (eq == NULL) { + LNetInvalidateEQHandle(handle); + return; + } + + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline struct lnet_eq * +lnet_handle2eq(struct lnet_handle_eq *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + + lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_eq, eq_lh); +} + +static inline void +lnet_md2handle(struct lnet_handle_md *handle, struct lnet_libmd *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline struct lnet_libmd * +lnet_handle2md(struct lnet_handle_md *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline struct lnet_libmd * +lnet_wire_handle2md(struct lnet_handle_wire *wh) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_libmd, md_lh); +} + +static inline void +lnet_me2handle(struct lnet_handle_me *handle, struct lnet_me *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline struct lnet_me * 
+lnet_handle2me(struct lnet_handle_me *handle) +{ + /* ALWAYS called with resource lock held */ + struct lnet_libhandle *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, struct lnet_me, me_lh); +} + +static inline void +lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) +{ + LASSERT (atomic_read(&lp->lpni_refcount) > 0); + atomic_inc(&lp->lpni_refcount); +} + +extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp); + +static inline void +lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) +{ + LASSERT (atomic_read(&lp->lpni_refcount) > 0); + atomic_dec(&lp->lpni_refcount); + if (atomic_read(&lp->lpni_refcount) == 0) + lnet_destroy_peer_ni_locked(lp); +} + +static inline int +lnet_isrouter(struct lnet_peer_ni *lp) +{ + return lp->lpni_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(struct lnet_ni *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(struct lnet_ni *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(struct lnet_ni *ni) +{ + lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline struct lnet_msg * +lnet_msg_alloc(void) +{ + struct lnet_msg *msg; + + LIBCFS_ALLOC(msg, sizeof(*msg)); + + /* no need to zero, LIBCFS_ALLOC does for us */ + return (msg); +} + +static inline void +lnet_msg_free(struct lnet_msg *msg) +{ + LASSERT(!msg->msg_onactivelist); + LIBCFS_FREE(msg, sizeof(*msg)); +} + +void lnet_ni_free(struct lnet_ni *ni); +void lnet_net_free(struct lnet_net *net); + +struct lnet_net * +lnet_net_alloc(__u32 net_type, struct list_head *netlist); + +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, + char *iface); +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface); + +static inline int +lnet_nid2peerhash(lnet_nid_t nid) +{ + return hash_long(nid, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * +lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +extern struct lnet_lnd the_lolnd; +extern int avoid_asym_router_failure; + +extern unsigned int lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number); +extern int lnet_cpt_of_nid_locked(lnet_nid_t nid, struct lnet_ni *ni); +extern int lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni); +extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); +extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); +extern struct lnet_ni *lnet_net2ni_addref(__u32 net); +bool lnet_is_ni_healthy_locked(struct lnet_ni *ni); +struct lnet_net *lnet_get_net_locked(__u32 net_id); + +int lnet_lib_init(void); +void lnet_lib_exit(void); + +extern unsigned int lnet_numa_range; +extern int portal_rotor; + +int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive, + cfs_time_t when); +void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, + cfs_time_t when); +int 
lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, + unsigned int priority); +int lnet_check_routes(void); +int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority); +int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg); +struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, + struct lnet_ni *prev); +struct lnet_ni *lnet_get_ni_idx_locked(int idx); + +struct libcfs_ioctl_handler { + struct list_head item; + int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); +}; + +#define DECLARE_IOCTL_HANDLER(ident, func) \ + static struct libcfs_ioctl_handler ident = { \ + .item = LIST_HEAD_INIT(ident.item), \ + .handle_ioctl = func \ + } + +extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); +extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); +extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, + struct libcfs_ioctl_hdr __user *uparam); + +void lnet_proc_init(void); +void lnet_proc_fini(void); +int lnet_rtrpools_alloc(int im_a_router); +void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); +int lnet_rtrpools_adjust(int tiny, int small, int large); +int lnet_rtrpools_enable(void); +void lnet_rtrpools_disable(void); +void lnet_rtrpools_free(int keep_pools); +struct lnet_remotenet *lnet_find_rnet_locked(__u32 net); +int lnet_dyn_add_net(struct lnet_ioctl_config_data *conf); +int lnet_dyn_del_net(__u32 net); +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf); +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf); +int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason); +struct lnet_net *lnet_get_net_locked(__u32 net_id); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net); + +void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen); +void lnet_msg_detach_md(struct lnet_msg *msg, int status); +void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); +void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); +void lnet_msg_commit(struct lnet_msg *msg, int cpt); +void lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status); + +void lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev); +void lnet_prep_send(struct lnet_msg *msg, int type, + struct lnet_process_id target, unsigned int offset, + unsigned int len); +int lnet_send(lnet_nid_t nid, struct lnet_msg *msg, lnet_nid_t rtr_nid); +void lnet_return_tx_credits_locked(struct lnet_msg *msg); +void lnet_return_rx_credits_locked(struct lnet_msg *msg); +void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); +void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); + +/* portals functions */ +/* portals attributes */ +static inline int +lnet_ptl_is_lazy(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_LAZY); +} + +static inline int +lnet_ptl_is_unique(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); +} + +static inline int +lnet_ptl_is_wildcard(struct lnet_portal *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); +} + +static inline void +lnet_ptl_setopt(struct lnet_portal *ptl, int opt) +{ + ptl->ptl_options |= opt; +} + +static inline void +lnet_ptl_unsetopt(struct lnet_portal *ptl, int opt) +{ + ptl->ptl_options &= ~opt; +} + +/* match-table functions */ +struct list_head 
*lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_process_id id, __u64 mbits); +struct lnet_match_table *lnet_mt_of_attach(unsigned int index, + struct lnet_process_id id, + __u64 mbits, __u64 ignore_bits, + enum lnet_ins_pos pos); +int lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg); + +/* portals match/attach functions */ +void lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops); +void lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md); +int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); + +/* initialized and finalize portals */ +int lnet_portals_create(void); +void lnet_portals_destroy(void); + +/* message functions */ +int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, + lnet_nid_t fromnid, void *private, int rdma_req); +int lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg); +int lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg); + +void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen); +void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, + unsigned int mlen, unsigned int rlen); + +struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, + struct lnet_msg *get_msg); +void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, + unsigned int len); + +void lnet_finalize(struct lnet_msg *msg, int rc); + +void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, + unsigned int nob); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_msgtyp2str(int type); +void lnet_print_hdr(struct lnet_hdr *hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +/** \addtogroup lnet_fault_simulation @{ */ + +int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); +int lnet_fault_init(void); +void lnet_fault_fini(void); + +bool lnet_drop_rule_match(struct lnet_hdr *hdr); + +int lnet_delay_rule_add(struct lnet_fault_attr *attr); +int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); +int lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat); +void lnet_delay_rule_reset(void); +void lnet_delay_rule_check(void); +bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); + +/** @} lnet_fault_simulation */ + +void lnet_counters_get(struct lnet_counters *counters); +void lnet_counters_reset(void); + +unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); +int lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, + unsigned int doffset, + unsigned int nsiov, struct kvec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, + unsigned 
int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov(unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); + +static inline void +lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + struct kvec diov = { .iov_base = dest, .iov_len = dlen }; + + lnet_copy_iov2iov(1, &diov, doffset, + nsiov, siov, soffset, nob); +} + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob) +{ + struct kvec diov = { .iov_base = dest, .iov_len = dlen }; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + int slen, void *src, unsigned int soffset, + unsigned int nob) +{ + struct kvec siov = { .iov_base = src, .iov_len = slen }; + lnet_copy_iov2iov(ndiov, diov, doffset, + 1, &siov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, + unsigned int doffset, int slen, void *src, + unsigned int soffset, unsigned int nob) +{ + struct kvec siov = { .iov_base = src, .iov_len = slen }; + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(struct lnet_me *me); + +void lnet_md_unlink(struct lnet_libmd *md); +void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); +struct page *lnet_kvaddr_to_page(unsigned long vaddr); +int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); + +void lnet_register_lnd(struct lnet_lnd *lnd); +void lnet_unregister_lnd(struct lnet_lnd *lnd); + +int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nets(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +int lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); +int lnet_ipif_enumerate(char ***names); +void lnet_ipif_free_enumeration(char **names, int n); +int lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +int lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +int lnet_sock_getaddr(struct socket *socket, bool remote, __u32 *ip, int *port); +int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); +int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); + +int lnet_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog); +int lnet_sock_accept(struct socket **newsockp, struct socket *sock); +int lnet_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port); + +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(struct lnet_net *net); + +int lnet_router_checker_start(void); +void lnet_router_checker_stop(void); +void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net); +void lnet_swap_pinginfo(struct lnet_ping_info 
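/*
 * Editorial sketch (not part of this patch): the lnet_copy_*2flat() and
 * lnet_copy_flat2*() inlines above simply wrap the flat buffer in a
 * one-entry kvec and delegate to the generic iov-to-iov copy.  Toy userspace
 * version of that wrapping idiom with a much-simplified copy routine (no
 * start offsets); all toy_* names are assumptions.
 */
#include <stdio.h>
#include <string.h>

struct toy_kvec { void *iov_base; size_t iov_len; };

/* copy 'nob' bytes from a source iovec into a destination iovec */
static void toy_copy_iov2iov(unsigned ndiov, struct toy_kvec *diov,
			     unsigned nsiov, struct toy_kvec *siov, size_t nob)
{
	size_t d = 0, s = 0, doff = 0, soff = 0;

	while (nob > 0 && d < ndiov && s < nsiov) {
		size_t frag = diov[d].iov_len - doff;

		if (frag > siov[s].iov_len - soff)
			frag = siov[s].iov_len - soff;
		if (frag > nob)
			frag = nob;
		memcpy((char *)diov[d].iov_base + doff,
		       (char *)siov[s].iov_base + soff, frag);
		nob -= frag; doff += frag; soff += frag;
		if (doff == diov[d].iov_len) { d++; doff = 0; }
		if (soff == siov[s].iov_len) { s++; soff = 0; }
	}
}

static void toy_copy_flat2iov(unsigned ndiov, struct toy_kvec *diov,
			      void *src, size_t slen, size_t nob)
{
	struct toy_kvec siov = { .iov_base = src, .iov_len = slen };

	toy_copy_iov2iov(ndiov, diov, 1, &siov, nob);
}

int main(void)
{
	char a[4] = "", b[6] = "", src[] = "hello wor";
	struct toy_kvec dst[2] = { { a, 3 }, { b, 5 } };

	toy_copy_flat2iov(2, dst, src, sizeof(src), 8);
	printf("%.3s|%.5s\n", a, b);	/* hel|lo wo */
	return 0;
}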
*info); + +int lnet_parse_ip2nets(char **networksp, char *ip2nets); +int lnet_parse_routes(char *route_str, int *im_a_router); +int lnet_parse_networks(struct list_head *nilist, char *networks, + bool use_tcp_bonding); +bool lnet_net_unique(__u32 net_id, struct list_head *nilist, + struct lnet_net **net); +bool lnet_ni_unique_net(struct list_head *nilist, char *iface); +void lnet_incr_dlc_seq(void); +__u32 lnet_get_dlc_seq_locked(void); + +struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev); +struct lnet_peer *lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt); +struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); +struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); +struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); +void lnet_peer_net_added(struct lnet_net *net); +lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid); +void lnet_peer_tables_cleanup(struct lnet_net *net); +void lnet_peer_uninit(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); +struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, + __u32 net_id); +bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, + struct lnet_ni *ni); +int lnet_add_peer_ni_to_peer(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); +int lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, + bool *mr, + struct lnet_peer_ni_credit_info __user *peer_ni_info, + struct lnet_ioctl_element_stats __user *peer_ni_stats); +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char alivness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, + __u32 *peer_tx_qnob); + + +static inline __u32 +lnet_get_num_peer_nis(struct lnet_peer *peer) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + __u32 count = 0; + + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_on_peer_list) + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_on_peer_net_list) + count++; + + return count; +} + +static inline bool +lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni) +{ + return lpni->lpni_healthy; +} + +static inline void +lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health) +{ + lpni->lpni_healthy = health; +} + +static inline bool +lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net) +{ + struct lnet_peer_ni *lpni; + + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_on_peer_net_list) { + if (lnet_is_peer_ni_healthy_locked(lpni)) + return true; + } + + return false; +} + +static inline bool +lnet_is_peer_healthy_locked(struct lnet_peer *peer) +{ + struct lnet_peer_net *peer_net; + + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (lnet_is_peer_net_healthy_locked(peer_net)) + return true; + } + + return false; +} + +static inline void +lnet_peer_set_alive(struct lnet_peer_ni *lp) +{ + lp->lpni_last_alive = lp->lpni_last_query = cfs_time_current(); + if (!lp->lpni_alive) + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); +} + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h new file mode 100644 index 0000000000000..9b8af0e45a4c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -0,0 +1,846 @@ +/* + 
* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#ifndef __KERNEL__ +# error This include is only for kernel use. +#endif + +#include +#include +#include + +#include + +/* Max payload size */ +#ifndef CONFIG_LNET_MAX_PAYLOAD +# error "CONFIG_LNET_MAX_PAYLOAD must be defined in config.h" +#endif + +#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD +#if (LNET_MAX_PAYLOAD < LNET_MTU) +# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" +#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) +# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" +#endif + +/* forward refs */ +struct lnet_libmd; + +typedef struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + struct lnet_process_id msg_target; + /* Primary NID of the source. 
*/ + lnet_nid_t msg_initiator; + /* where is it from, it's only for building event */ + lnet_nid_t msg_from; + __u32 msg_type; + + /* committed for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* committed for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a globel router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + unsigned int msg_rdma_get:1; + + struct lnet_peer_ni *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer_ni *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + /* the NI the message was sent or received over */ + struct lnet_ni *msg_txni; + struct lnet_ni *msg_rxni; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct kvec *msg_iov; + lnet_kiov_t *msg_kiov; + + struct lnet_event msg_ev; + struct lnet_hdr msg_hdr; +} lnet_msg_t; + +typedef struct lnet_libhandle { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lnet_libhandle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +typedef struct lnet_eq { + struct list_head eq_list; + struct lnet_libhandle eq_lh; + unsigned long eq_enq_seq; + unsigned long eq_deq_seq; + unsigned int eq_size; + lnet_eq_handler_t eq_callback; + struct lnet_event *eq_events; + int **eq_refs; /* percpt refcount for EQ */ +} lnet_eq_t; + +typedef struct lnet_me { + struct list_head me_list; + struct lnet_libhandle me_lh; + struct lnet_process_id me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + enum lnet_unlink me_unlink; + struct lnet_libmd *me_md; +} lnet_me_t; + +typedef struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_eq *md_eq; + struct lnet_handle_md md_bulk_handle; + union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; + } md_iov; +} lnet_libmd_t; + +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define LNET_MD_FLAG_ABORTED (1 << 2) + +typedef struct lnet_test_peer { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ 
+} lnet_test_peer_t; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) + +struct lnet_ni; /* forward ref */ +struct socket; + +typedef struct lnet_lnd { + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialized by the LND */ + __u32 lnd_type; + + int (*lnd_startup)(struct lnet_ni *ni); + void (*lnd_shutdown)(struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct kvec *iov != NULL) + * OR + * in pages (kernel only: plt_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' and may specify a byte offset within the set of + * fragments to start from + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immedaite failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called. 
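/*
 * Editorial sketch (not part of this patch): the LNET_COOKIE_TYPE_* values
 * above can be kept in the low LNET_COOKIE_TYPE_BITS of a handle cookie and
 * recovered with LNET_COOKIE_MASK.  The toy cookie generator below is a
 * hypothetical simplification of what the resource containers do, just to
 * show the bit layout.
 */
#include <stdio.h>

#define TOY_COOKIE_TYPE_MD	1
#define TOY_COOKIE_TYPE_ME	2
#define TOY_COOKIE_TYPE_EQ	3
#define TOY_COOKIE_TYPE_BITS	2
#define TOY_COOKIE_MASK		((1ULL << TOY_COOKIE_TYPE_BITS) - 1ULL)

static unsigned long long toy_next_cookie(unsigned long long *gen, int type)
{
	/* type lives in the low bits, a per-container counter above them */
	unsigned long long cookie = (*gen << TOY_COOKIE_TYPE_BITS) | (unsigned)type;

	(*gen)++;
	return cookie;
}

int main(void)
{
	unsigned long long gen = 1;
	unsigned long long c = toy_next_cookie(&gen, TOY_COOKIE_TYPE_ME);

	printf("cookie=%llu type=%llu\n", c, c & TOY_COOKIE_MASK);	/* type=2 */
	return 0;
}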
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, + struct lnet_msg *msg, void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); +} lnd_t; + +typedef struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +enum lnet_net_state { + /* set when net block is allocated */ + LNET_NET_STATE_INIT = 0, + /* set when NIs in net are started successfully */ + LNET_NET_STATE_ACTIVE, + /* set if all NIs in net are in FAILED state */ + LNET_NET_STATE_INACTIVE, + /* set when shutting down a NET */ + LNET_NET_STATE_DELETING +}; + +enum lnet_ni_state { + /* set when NI block is allocated */ + LNET_NI_STATE_INIT = 0, + /* set when NI is started successfully */ + LNET_NI_STATE_ACTIVE, + /* set when LND notifies NI failed */ + LNET_NI_STATE_FAILED, + /* set when LND notifies NI degraded */ + LNET_NI_STATE_DEGRADED, + /* set when shuttding down NI */ + LNET_NI_STATE_DELETING +}; + +struct lnet_element_stats { + atomic_t send_count; + atomic_t recv_count; + atomic_t drop_count; +}; + +struct lnet_net { + /* chain on the ln_nets */ + struct list_head net_list; + + /* net ID, which is composed of + * (net_type << 16) | net_num. + * net_type can be one of the enumerated types defined in + * lnet/include/lnet/nidstr.h */ + __u32 net_id; + + /* priority of the network */ + __u32 net_prio; + + /* total number of CPTs in the array */ + __u32 net_ncpts; + + /* cumulative CPTs of all NIs in this net */ + __u32 *net_cpts; + + /* network tunables */ + struct lnet_ioctl_config_lnd_cmn_tunables net_tunables; + + /* + * boolean to indicate that the tunables have been set and + * shouldn't be reset + */ + bool net_tunables_set; + + /* procedural interface */ + struct lnet_lnd *net_lnd; + + /* list of NIs on this net */ + struct list_head net_ni_list; + + /* list of NIs being added, but not started yet */ + struct list_head net_ni_added; + + /* dying LND instances */ + struct list_head net_ni_zombie; + + /* network state */ + enum lnet_net_state net_state; +}; + +typedef struct lnet_ni { + /* chain on the lnet_net structure */ + struct list_head ni_netlist; + + /* chain on net_ni_cpt */ + struct list_head ni_cptlist; + + spinlock_t ni_lock; + + /* number of CPTs */ + int ni_ncpts; + + /* bond NI on some CPTs */ + __u32 *ni_cpts; + + /* interface's NID */ + lnet_nid_t ni_nid; + + /* instance-specific data */ + void *ni_data; + + /* per ni credits */ + atomic_t ni_tx_credits; + + /* percpt TX queues */ + struct lnet_tx_queue **ni_tx_queues; + + /* percpt reference count */ + int **ni_refs; + + /* when I was last alive */ + long ni_last_alive; + + /* pointer to parent network */ + struct lnet_net *ni_net; + + /* my health status */ + struct lnet_ni_status *ni_status; + + /* NI FSM */ + enum lnet_ni_state ni_state; + + /* per NI LND tunables */ + struct lnet_lnd_tunables ni_lnd_tunables; + + /* lnd tunables set explicitly */ + bool ni_lnd_tunables_set; + + /* NI statistics */ + struct lnet_element_stats ni_stats; + + /* physical device CPT */ + int ni_dev_cpt; + + /* sequence 
number used to round robin over nis within a net */ + __u32 ni_seq; + + /* + * equivalent interfaces to use + * This is an array because socklnd bonding can still be configured + */ + char *ni_interfaces[LNET_NUM_INTERFACES]; + struct net *ni_net_ns; /* original net namespace */ +} lnet_ni_t; + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ + +#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS) + +typedef struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* router checker data, per router */ +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(struct lnet_ping_info, pi_ni[LNET_MAX_RTR_NIS]) +typedef struct lnet_rc_data { + /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ + struct list_head rcd_list; + struct lnet_handle_md rcd_mdh; /* ping buffer MD */ + struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ + struct lnet_ping_info *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; + +struct lnet_peer_ni { + /* chain on peer_net */ + struct list_head lpni_on_peer_net_list; + /* chain on remote peer list */ + struct list_head lpni_on_remote_peer_ni_list; + /* chain on peer hash */ + struct list_head lpni_hashlist; + /* messages blocking for tx credits */ + struct list_head lpni_txq; + /* messages blocking for router credits */ + struct list_head lpni_rtrq; + /* chain on router list */ + struct list_head lpni_rtr_list; + /* pointer to peer net I'm part of */ + struct lnet_peer_net *lpni_peer_net; + /* statistics kept on each peer NI */ + struct lnet_element_stats lpni_stats; + /* spin lock protecting credits and lpni_txq / lpni_rtrq */ + spinlock_t lpni_lock; + /* # tx credits available */ + int lpni_txcredits; + /* low water mark */ + int lpni_mintxcredits; + /* # router credits */ + int lpni_rtrcredits; + /* low water mark */ + int lpni_minrtrcredits; + /* bytes queued for sending */ + long lpni_txqnob; + /* alive/dead? */ + bool lpni_alive; + /* notification outstanding? */ + bool lpni_notify; + /* outstanding notification for LND? */ + bool lpni_notifylnd; + /* some thread is handling notification */ + bool lpni_notifying; + /* SEND event outstanding from ping */ + bool lpni_ping_notsent; + /* # times router went dead<->alive. 
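/*
 * Editorial sketch (not part of this patch): LNET_PINGINFO_SIZE above sizes
 * a ping buffer with offsetof() so that the trailing pi_ni[0] array can hold
 * LNET_MAX_RTR_NIS entries.  Toy userspace illustration of that sizing idiom
 * using a C99 flexible array member; the toy_* types are assumptions.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>

struct toy_ni_status { uint64_t ns_nid; uint32_t ns_status, ns_unused; };

struct toy_ping_info {
	uint32_t pi_magic, pi_features, pi_pid, pi_nnis;
	struct toy_ni_status pi_ni[];	/* flexible array member */
};

int main(void)
{
	unsigned nnis = 16;	/* LNET_MAX_RTR_NIS in the real code */
	size_t size = offsetof(struct toy_ping_info, pi_ni) +
		      nnis * sizeof(struct toy_ni_status);
	struct toy_ping_info *pi = calloc(1, size);

	if (pi == NULL)
		return 1;
	pi->pi_nnis = nnis;
	printf("ping buffer: %zu bytes for %u NIs\n", size, (unsigned)pi->pi_nnis);
	free(pi);
	return 0;
}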
Protected with lpni_lock */ + int lpni_alive_count; + /* time of last aliveness news */ + cfs_time_t lpni_timestamp; + /* time of last ping attempt */ + cfs_time_t lpni_ping_timestamp; + /* != 0 if ping reply expected */ + cfs_time_t lpni_ping_deadline; + /* when I was last alive */ + cfs_time_t lpni_last_alive; + /* when lpni_ni was queried last time */ + cfs_time_t lpni_last_query; + /* network peer is on */ + struct lnet_net *lpni_net; + /* peer's NID */ + lnet_nid_t lpni_nid; + /* # refs */ + atomic_t lpni_refcount; + /* CPT this peer attached on */ + int lpni_cpt; + /* # refs from lnet_route_t::lr_gateway */ + int lpni_rtr_refcount; + /* sequence number used to round robin over peer nis within a net */ + __u32 lpni_seq; + /* sequence number used to round robin over gateways */ + __u32 lpni_gw_seq; + /* health flag */ + bool lpni_healthy; + /* returned RC ping features. Protected with lpni_lock */ + unsigned int lpni_ping_feats; + /* routes on this peer */ + struct list_head lpni_routes; + /* array of preferred local nids */ + lnet_nid_t *lpni_pref_nids; + /* number of preferred NIDs in lnpi_pref_nids */ + __u32 lpni_pref_nnids; + /* router checker state */ + struct lnet_rc_data *lpni_rcd; +}; + +struct lnet_peer { + /* chain on global peer list */ + struct list_head lp_on_lnet_peer_list; + + /* list of peer nets */ + struct list_head lp_peer_nets; + + /* primary NID of the peer */ + lnet_nid_t lp_primary_nid; + + /* peer is Multi-Rail enabled peer */ + bool lp_multi_rail; +}; + +struct lnet_peer_net { + /* chain on peer block */ + struct list_head lpn_on_peer_list; + + /* list of peer_nis on this network */ + struct list_head lpn_peer_nis; + + /* pointer to the peer I'm part of */ + struct lnet_peer *lpn_peer; + + /* Net ID */ + __u32 lpn_net_id; +}; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* peer hash table */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + atomic_t pt_number; /* # peers extant */ + struct list_head *pt_hash; /* NID->peer hash */ + struct list_head pt_zombie_list; /* zombie peers */ + int pt_zombies; /* # zombie peers */ + spinlock_t pt_zombie_lock; /* protect list and count */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * struct lnet_ni::ni_peertimeout has been set to a positive value + */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + ((lp)->lpni_net) && \ + (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) + +typedef struct lnet_route { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + struct lnet_peer_ni *lr_gateway; /* router node */ + __u32 lr_net; /* remote network number */ + int lr_seq; /* sequence for round-robin */ + unsigned int lr_downis; /* number of down NIs */ + __u32 lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ +} lnet_route_t; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +typedef struct lnet_remotenet { + /* chain on ln_remote_nets_hash */ + struct list_head lrn_list; + /* routes to me */ + struct list_head lrn_routes; + /* my net number */ + __u32 lrn_net; +} lnet_remotenet_t; + +/** lnet message has credit and can be submitted to lnd for send/receive */ +#define LNET_CREDIT_OK 0 +/** lnet message is waiting for credit */ +#define LNET_CREDIT_WAIT 1 + +typedef struct 
lnet_rtrbufpool { + /* my free buffer pool */ + struct list_head rbp_bufs; + /* messages blocking for a buffer */ + struct list_head rbp_msgs; + /* # pages in each buffer */ + int rbp_npages; + /* requested number of buffers */ + int rbp_req_nbuffers; + /* # buffers actually allocated */ + int rbp_nbuffers; + /* # free buffers / blocked messages */ + int rbp_credits; + /* low water mark */ + int rbp_mincredits; +} lnet_rtrbufpool_t; + +typedef struct lnet_rtrbuf { + struct list_head rb_list; /* chain on rbp_bufs */ + struct lnet_rtrbufpool *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +} lnet_rtrbuf_t; + +#define LNET_PEER_HASHSIZE 503 /* prime! */ + +enum lnet_match_flags { + /* Didn't match anything */ + LNET_MATCHMD_NONE = (1 << 0), + /* Matched OK */ + LNET_MATCHMD_OK = (1 << 1), + /* Must be discarded */ + LNET_MATCHMD_DROP = (1 << 2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = (1 << 3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for struct lnet_portal::ptl_options */ +#define LNET_PTL_LAZY (1 << 0) +#define LNET_PTL_MATCH_UNIQUE (1 << 1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD (1 << 2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + struct lnet_process_id mi_id; + unsigned int mi_cpt; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valid for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +typedef struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... 
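/*
 * Editorial sketch (not part of this patch): the LNET_MT_EXHAUSTED_*
 * arithmetic above sizes a bitmap with one bit per match-table hash head
 * (2^8 = 256 bits = 4 __u64 words) plus one extra word for the ignore-bits
 * list.  Toy set/test helpers below verify the numbers; toy_* names are
 * assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_MT_HASH_BITS	8
#define TOY_MT_HASH_SIZE	(1 << TOY_MT_HASH_BITS)
#define TOY_MT_BITS_U64		6
#define TOY_MT_EXHAUSTED_BITS	(TOY_MT_HASH_BITS - TOY_MT_BITS_U64)
#define TOY_MT_EXHAUSTED_BMAP	((1 << TOY_MT_EXHAUSTED_BITS) + 1)

static void toy_set_exhausted(uint64_t *bmap, unsigned pos)
{
	bmap[pos >> TOY_MT_BITS_U64] |= 1ULL << (pos & 63);
}

static int toy_is_exhausted(const uint64_t *bmap, unsigned pos)
{
	return !!(bmap[pos >> TOY_MT_BITS_U64] & (1ULL << (pos & 63)));
}

int main(void)
{
	uint64_t bmap[TOY_MT_EXHAUSTED_BMAP] = { 0 };

	printf("hash heads=%d, bitmap words=%d\n",
	       TOY_MT_HASH_SIZE, TOY_MT_EXHAUSTED_BMAP);	/* 256, 5 */
	toy_set_exhausted(bmap, 200);
	printf("hash head 200 exhausted: %d\n", toy_is_exhausted(bmap, 200));
	return 0;
}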
*/ + unsigned int ptl_options; + /* list of messages which are stealing buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + unsigned int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id */ + int ptl_mt_maps[0]; +} lnet_portal_t; + +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; +}; + +/* Router Checker states */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ + +/* LNet states */ +#define LNET_STATE_SHUTDOWN 0 /* not started */ +#define LNET_STATE_RUNNING 1 /* started up OK */ +#define LNET_STATE_STOPPING 2 /* telling thread to stop */ + +typedef struct lnet { + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + struct lnet_portal **ln_portals; + /* percpt ME containers */ + struct lnet_res_container **ln_me_containers; + /* percpt MD container */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct lnet_res_container ln_eq_container; + wait_queue_head_t ln_eq_waitq; + spinlock_t ln_eq_wait_lock; + + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... 
*/ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed message */ + struct lnet_msg_container **ln_msg_containers; + struct lnet_counters **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* list of configured or discovered peers */ + struct list_head ln_peers; + /* list of peer nis not on a local network */ + struct list_head ln_remote_peer_ni_list; + /* failure simulation */ + struct list_head ln_test_peers; + struct list_head ln_drop_rules; + struct list_head ln_delay_rules; + /* LND instances */ + struct list_head ln_nets; + /* the loopback NI */ + struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; + + /* remote networks with routes to them */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + struct lnet_rtrbufpool **ln_rtrpools; + + struct lnet_handle_md ln_ping_target_md; + struct lnet_handle_eq ln_ping_target_eq; + struct lnet_ping_info *ln_ping_info; + + /* router checker startup/shutdown state */ + int ln_rc_state; + /* router checker's event queue */ + struct lnet_handle_eq ln_rc_eqh; + /* rcd still pending on net */ + struct list_head ln_rcd_deathrow; + /* rcd ready for free */ + struct list_head ln_rcd_zombie; + /* serialise startup/shutdown */ + struct semaphore ln_rc_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* SHUTDOWN/RUNNING/STOPPING */ + int ln_state; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + struct list_head ln_lnds; + + /* test protocol compatibility flags */ + int ln_testprotocompat; + + /* 0 - load the NIs from the mod params + * 1 - do not load the NIs from the mod params + * Reverse logic to ensure that other calls to LNetNIInit + * need no change + */ + bool ln_nis_from_mod_params; + + /* waitq for router checker. As long as there are no routes in + * the list, the router checker will sleep on this queue. when + * routes are added the thread will wake up */ + wait_queue_head_t ln_rc_waitq; +} lnet_t; + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h new file mode 100644 index 0000000000000..54061f593496e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h @@ -0,0 +1,46 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_H__ +#define __LNET_H__ + +/* + * lnet.h + * + * User application interface file + */ + +#include +#include +#include + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h new file mode 100644 index 0000000000000..bdd0cb4f84083 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h @@ -0,0 +1,130 @@ +/* + * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for lnet ioctl + */ +/* + * Copyright (c) 2014, Intel Corporation. + */ +#ifndef _LNETCTL_H_ +#define _LNETCTL_H_ + +#include + +/** \addtogroup lnet_fault_simulation + * @{ */ + +enum { + LNET_CTL_DROP_ADD, + LNET_CTL_DROP_DEL, + LNET_CTL_DROP_RESET, + LNET_CTL_DROP_LIST, + LNET_CTL_DELAY_ADD, + LNET_CTL_DELAY_DEL, + LNET_CTL_DELAY_RESET, + LNET_CTL_DELAY_LIST, +}; + +#define LNET_ACK_BIT (1 << 0) +#define LNET_PUT_BIT (1 << 1) +#define LNET_GET_BIT (1 << 2) +#define LNET_REPLY_BIT (1 << 3) + +/** ioctl parameter for LNet fault simulation */ +struct lnet_fault_attr { + /** + * source NID of drop rule + * LNET_NID_ANY is wildcard for all sources + * 255.255.255.255@net is wildcard for all addresses from @net + */ + lnet_nid_t fa_src; + /** destination NID of drop rule, see \a dr_src for details */ + lnet_nid_t fa_dst; + /** + * Portal mask to drop, -1 means all portals, for example: + * fa_ptl_mask = (1 << _LDLM_CB_REQUEST_PORTAL ) | + * (1 << LDLM_CANCEL_REQUEST_PORTAL) + * + * If it is non-zero then only PUT and GET will be filtered, otherwise + * there is no portal filter, all matched messages will be checked. + */ + __u64 fa_ptl_mask; + /** + * message types to drop, for example: + * dra_type = LNET_DROP_ACK_BIT | LNET_DROP_PUT_BIT + * + * If it is non-zero then only specified message types are filtered, + * otherwise all message types will be checked. 
+ */ + __u32 fa_msg_mask; + union { + /** message drop simulation */ + struct { + /** drop rate of this rule */ + __u32 da_rate; + /** + * time interval of message drop, it is exclusive + * with da_rate + */ + __u32 da_interval; + } drop; + /** message latency simulation */ + struct { + __u32 la_rate; + /** + * time interval of message delay, it is exclusive + * with la_rate + */ + __u32 la_interval; + /** latency to delay */ + __u32 la_latency; + } delay; + __u64 space[8]; + } u; + +}; + +/** fault simluation stats */ +struct lnet_fault_stat { + /** total # matched messages */ + __u64 fs_count; + /** # dropped LNET_MSG_PUT by this rule */ + __u64 fs_put; + /** # dropped LNET_MSG_ACK by this rule */ + __u64 fs_ack; + /** # dropped LNET_MSG_GET by this rule */ + __u64 fs_get; + /** # dropped LNET_MSG_REPLY by this rule */ + __u64 fs_reply; + union { + struct { + /** total # dropped messages */ + __u64 ds_dropped; + } drop; + struct { + /** total # delayed messages */ + __u64 ls_delayed; + } delay; + __u64 space[8]; + } u; +}; + +/** @} lnet_fault_simulation */ + +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h new file mode 100644 index 0000000000000..a43978ff592f4 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h @@ -0,0 +1,515 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
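/*
 * Editorial sketch (not part of this patch): one plausible reading of the
 * mutually exclusive da_rate / da_interval fields of struct lnet_fault_attr
 * above -- either drop roughly one message in every da_rate, or drop during
 * a recurring time window of da_interval seconds.  Purely an illustrative
 * assumption; the in-kernel rule matching is more involved.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct toy_drop_rule { unsigned rate; unsigned interval; };

static int toy_should_drop(const struct toy_drop_rule *r, time_t now)
{
	if (r->rate != 0)			/* 1-in-rate dropping */
		return (rand() % r->rate) == 0;
	if (r->interval != 0)			/* time-based dropping */
		return (now % r->interval) == 0;
	return 0;
}

int main(void)
{
	struct toy_drop_rule rule = { .rate = 4, .interval = 0 };
	int i, dropped = 0;

	srand((unsigned)time(NULL));
	for (i = 0; i < 1000; i++)
		dropped += toy_should_drop(&rule, time(NULL));
	printf("dropped %d of 1000 (~250 expected)\n", dropped);
	return 0;
}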
+ * + * lnet/include/lnet/lnetst.h + * + * Author: Liang Zhen + */ + +#ifndef __LNET_ST_H__ +#define __LNET_ST_H__ + +#define LST_FEAT_NONE (0) +#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ + +#define LST_FEATS_EMPTY (LST_FEAT_NONE) +#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) + +#define LST_NAME_SIZE 32 /* max name buffer length */ + +#define LSTIO_DEBUG 0xC00 /* debug */ +#define LSTIO_SESSION_NEW 0xC01 /* create session */ +#define LSTIO_SESSION_END 0xC02 /* end session */ +#define LSTIO_SESSION_INFO 0xC03 /* query session */ +#define LSTIO_GROUP_ADD 0xC10 /* add group */ +#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ +#define LSTIO_GROUP_INFO 0xC12 /* query defailt infomation of specified group */ +#define LSTIO_GROUP_DEL 0xC13 /* delete group */ +#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ +#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ +#define LSTIO_BATCH_ADD 0xC20 /* add batch */ +#define LSTIO_BATCH_START 0xC21 /* start batch */ +#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ +#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ +#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ +#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ +#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ +#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ +#define LSTIO_STAT_QUERY 0xC30 /* get stats */ + +struct lst_sid { + lnet_nid_t ses_nid; /* nid of console node */ + __u64 ses_stamp; /* time stamp */ +}; /*** session id */ + +extern struct lst_sid LST_INVALID_SID; + +struct lst_bid { + __u64 bat_id; /* unique id in session */ +}; + +/* Status of test node */ +#define LST_NODE_ACTIVE 0x1 /* node in this session */ +#define LST_NODE_BUSY 0x2 /* node is taken by other session */ +#define LST_NODE_DOWN 0x4 /* node is down */ +#define LST_NODE_UNKNOWN 0x8 /* node not in session */ + +struct lstcon_node_ent { + struct lnet_process_id nde_id; /* id of node */ + int nde_state; /* state of node */ +}; /*** node entry, for list_group command */ + +struct lstcon_ndlist_ent { + int nle_nnode; /* # of nodes */ + int nle_nactive; /* # of active nodes */ + int nle_nbusy; /* # of busy nodes */ + int nle_ndown; /* # of down nodes */ + int nle_nunknown; /* # of unknown nodes */ +}; /*** node_list entry, for list_batch command */ + +struct lstcon_test_ent { + int tse_type; /* test type */ + int tse_loop; /* loop count */ + int tse_concur; /* concurrency of test */ +}; /*** test summary entry, for list_batch command */ + +struct lstcon_batch_ent { + int bae_state; /* batch status */ + int bae_timeout; /* batch timeout */ + int bae_ntest; /* # of tests in the batch */ +}; /*** batch summary entry, for list_batch command */ + +struct lstcon_test_batch_ent { + struct lstcon_ndlist_ent tbe_cli_nle; /* client (group) node_list entry */ + struct lstcon_ndlist_ent tbe_srv_nle; /* server (group) node_list entry */ + union { + struct lstcon_test_ent tbe_test; /* test entry */ + struct lstcon_batch_ent tbe_batch; /* batch entry */ + } u; +}; /*** test/batch verbose information entry, + *** for list_batch command */ + +struct lstcon_rpc_ent { + struct list_head rpe_link; /* link chain */ + struct lnet_process_id rpe_peer; /* peer's id */ + struct timeval rpe_stamp; /* time stamp of RPC */ + int rpe_state; /* peer's state */ + int rpe_rpc_errno; /* RPC errno */ + + struct lst_sid rpe_sid; /* peer's session id */ + int rpe_fwk_errno; /* framework errno */ + int rpe_priv[4]; /* private data */ + char 
rpe_payload[0]; /* private reply payload */ +}; + +struct lstcon_trans_stat { + int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 4: reserved */ + int trs_rpc_errno; /* RPC errno */ + int trs_fwk_stat[8]; /* framework stat */ + int trs_fwk_errno; /* errno of the first remote error */ + void *trs_fwk_private; /* private framework stat */ +}; + +static inline int +lstcon_rpc_stat_total(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; +} + +static inline int +lstcon_rpc_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; +} + +static inline int +lstcon_rpc_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; +} + +static inline int +lstcon_sesop_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesop_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_active(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesqry_stat_busy(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_unknown(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_tsbop_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbop_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_idle(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbqry_stat_run(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_statqry_stat_success(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_statqry_stat_failure(struct lstcon_trans_stat *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +/* create a session */ +struct lstio_session_new_args { + int lstio_ses_key; /* IN: local key */ + int lstio_ses_timeout; /* IN: session timeout */ + int lstio_ses_force; /* IN: force create ? 
*/ + /** IN: session features */ + unsigned lstio_ses_feats; + struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ + int lstio_ses_nmlen; /* IN: name length */ + char __user *lstio_ses_namep; /* IN: session name */ +}; + +/* query current session */ +struct lstio_session_info_args { + struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ + int __user *lstio_ses_keyp; /* OUT: local key */ + /** OUT: session features */ + unsigned __user *lstio_ses_featp; + struct lstcon_ndlist_ent __user *lstio_ses_ndinfo; /* OUT: */ + int lstio_ses_nmlen; /* IN: name length */ + char __user *lstio_ses_namep; /* OUT: session name */ +}; + +/* delete a session */ +struct lstio_session_end_args { + int lstio_ses_key; /* IN: session key */ +}; + +#define LST_OPC_SESSION 1 +#define LST_OPC_GROUP 2 +#define LST_OPC_NODES 3 +#define LST_OPC_BATCHCLI 4 +#define LST_OPC_BATCHSRV 5 + +struct lstio_debug_args { + int lstio_dbg_key; /* IN: session key */ + int lstio_dbg_type; /* IN: debug sessin|batch|group|nodes list */ + int lstio_dbg_flags; /* IN: reserved debug flags */ + int lstio_dbg_timeout; /* IN: timeout of debug */ + + int lstio_dbg_nmlen; /* IN: len of name */ + char __user *lstio_dbg_namep; /* IN: name of group|batch */ + int lstio_dbg_count; /* IN: # of test nodes to debug */ + struct lnet_process_id __user *lstio_dbg_idsp; /* IN: id of test nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_dbg_resultp; +}; + +struct lstio_group_add_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ +}; + +struct lstio_group_del_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ +}; + +#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ +#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */ +#define LST_GROUP_RMND 3 /* delete nodes from the group */ + +struct lstio_group_update_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_opc; /* IN: OPC */ + int lstio_grp_args; /* IN: arguments */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes id */ + struct lnet_process_id __user *lstio_grp_idsp; /* IN: array of nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_grp_resultp; +}; + +struct lstio_group_nodes_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char __user *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes */ + /** OUT: session features */ + unsigned __user *lstio_grp_featp; + struct lnet_process_id __user *lstio_grp_idsp; /* IN: nodes */ + /* OUT: list head of result buffer */ + struct list_head __user *lstio_grp_resultp; +}; + +struct lstio_group_list_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_idx; /* IN: group idx */ + int lstio_grp_nmlen; /* IN: name len */ + char __user *lstio_grp_namep; /* OUT: name */ +}; + +struct lstio_group_info_args { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name len */ + char __user *lstio_grp_namep; /* IN: name */ + struct lstcon_ndlist_ent __user *lstio_grp_entp;/* OUT: description of group */ + + int __user *lstio_grp_idxp; /* IN/OUT: node index */ + int __user *lstio_grp_ndentp; /* IN/OUT: # of nodent */ + struct lstcon_node_ent __user *lstio_grp_dentsp;/* 
OUT: nodent array */ +}; + +#define LST_DEFAULT_BATCH "batch" /* default batch name */ + +struct lstio_batch_add_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_del_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_run_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: timeout for the batch */ + int lstio_bat_timeout; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_stop_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: abort unfinished test RPC */ + int lstio_bat_force; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_query_args { + /* IN: session key */ + int lstio_bat_key; + /* IN: test index */ + int lstio_bat_testidx; + /* IN: is test client? */ + int lstio_bat_client; + /* IN: timeout for waiting */ + int lstio_bat_timeout; + /* IN: name length */ + int lstio_bat_nmlen; + /* IN: batch name */ + char __user *lstio_bat_namep; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_bat_resultp; +}; + +struct lstio_batch_list_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_idx; /* IN: index */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: batch name */ +}; + +struct lstio_batch_info_args { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char __user *lstio_bat_namep; /* IN: name */ + int lstio_bat_server; /* IN: query server or not */ + int lstio_bat_testidx; /* IN: test index */ + struct lstcon_test_batch_ent __user *lstio_bat_entp;/* OUT: batch ent */ + + int __user *lstio_bat_idxp; /* IN/OUT: index of node */ + int __user *lstio_bat_ndentp; /* IN/OUT: # of nodent */ + struct lstcon_node_ent __user *lstio_bat_dentsp;/* array of nodent */ +}; + +/* add stat in session */ +struct lstio_stat_args { + /* IN: session key */ + int lstio_sta_key; + /* IN: timeout for stat requst */ + int lstio_sta_timeout; + /* IN: group name length */ + int lstio_sta_nmlen; + /* IN: group name */ + char __user *lstio_sta_namep; + /* IN: # of pid */ + int lstio_sta_count; + /* IN: pid */ + struct lnet_process_id __user *lstio_sta_idsp; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_sta_resultp; +}; + +enum lst_test_type { + LST_TEST_BULK = 1, + LST_TEST_PING = 2 +}; + +/* create a test in a batch */ +#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ + +struct lstio_test_args { + int lstio_tes_key; /* IN: session key */ + int lstio_tes_bat_nmlen; /* IN: batch name len */ + char __user *lstio_tes_bat_name; /* IN: batch name */ + int lstio_tes_type; /* IN: test type */ + int lstio_tes_oneside; /* IN: one sided test */ + int lstio_tes_loop; /* IN: loop count */ + int lstio_tes_concur; /* IN: concurrency */ + + int lstio_tes_dist; /* IN: node distribution in destination groups */ + int lstio_tes_span; /* IN: node span in destination groups */ + int lstio_tes_sgrp_nmlen; /* IN: source group name length */ + char __user *lstio_tes_sgrp_name; /* IN: group name */ + int 
lstio_tes_dgrp_nmlen; /* IN: destination group name length */ + char __user *lstio_tes_dgrp_name; /* IN: group name */ + + /* IN: param buffer len */ + int lstio_tes_param_len; + /* IN: parameter for specified test: + lstio_bulk_param_t, + lstio_ping_param_t, + ... more */ + void __user *lstio_tes_param; + /* OUT: private returned value */ + int __user *lstio_tes_retp; + /* OUT: list head of result buffer */ + struct list_head __user *lstio_tes_resultp; +}; + +enum lst_brw_type { + LST_BRW_READ = 1, + LST_BRW_WRITE = 2 +}; + +enum lst_brw_flags { + LST_BRW_CHECK_NONE = 1, + LST_BRW_CHECK_SIMPLE = 2, + LST_BRW_CHECK_FULL = 3 +}; + +struct lst_test_bulk_param { + int blk_opc; /* bulk operation code */ + int blk_size; /* size (bytes) */ + int blk_time; /* time of running the test*/ + int blk_flags; /* reserved flags */ + int blk_cli_off; /* bulk offset on client */ + int blk_srv_off; /* reserved: bulk offset on server */ +}; + +struct lst_test_ping_param { + int png_size; /* size of ping message */ + int png_time; /* time */ + int png_loop; /* loop */ + int png_flags; /* reserved flags */ +}; + +struct srpc_counters { + __u32 errors; + __u32 rpcs_sent; + __u32 rpcs_rcvd; + __u32 rpcs_dropped; + __u32 rpcs_expired; + __u64 bulk_get; + __u64 bulk_put; +} WIRE_ATTR; + +struct sfw_counters { + /** milliseconds since current session started */ + __u32 running_ms; + __u32 active_batches; + __u32 zombie_sessions; + __u32 brw_errors; + __u32 ping_errors; +} WIRE_ATTR; + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h new file mode 100644 index 0000000000000..be14a1dfcf71d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h @@ -0,0 +1,112 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +#ifndef _LNET_NIDSTRINGS_H +#define _LNET_NIDSTRINGS_H +#include + +/** + * Lustre Network Driver types. + */ +enum { + /* Only add to these values (i.e. don't ever change or redefine them): + * network addresses depend on them... 
*/ + /*QSWLND = 1, removed v2_7_50 */ + SOCKLND = 2, + /*GMLND = 3, removed v2_0_0-rc1a-16-gc660aac */ + /*PTLLND = 4, removed v2_7_50 */ + O2IBLND = 5, + /*CIBLND = 6, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*OPENIBLND = 7, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*IIBLND = 8, removed v2_0_0-rc1a-175-gd2b8a0e */ + LOLND = 9, + /*RALND = 10, removed v2_7_50_0-34-g8be9e41 */ + /*VIBLND = 11, removed v2_0_0-rc1a-175-gd2b8a0e */ + /*MXLND = 12, removed v2_7_50_0-34-g8be9e41 */ + GNILND = 13, + GNIIPLND = 14, + PTL4LND = 15, +}; + +struct list_head; + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ + +/* support decl needed by both kernel and user space */ +char *libcfs_next_nidstring(void); +int libcfs_isknown_lnd(__u32 lnd); +char *libcfs_lnd2modname(__u32 lnd); +char *libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size); +static inline char *libcfs_lnd2str(__u32 lnd) +{ + return libcfs_lnd2str_r(lnd, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +int libcfs_str2lnd(const char *str); +char *libcfs_net2str_r(__u32 net, char *buf, size_t buf_size); +static inline char *libcfs_net2str(__u32 net) +{ + return libcfs_net2str_r(net, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +char *libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size); +static inline char *libcfs_nid2str(lnet_nid_t nid) +{ + return libcfs_nid2str_r(nid, libcfs_next_nidstring(), + LNET_NIDSTR_SIZE); +} +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +char *libcfs_id2str(struct lnet_process_id id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_print_nidlist(char *buffer, int count, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); + +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, + char *max_nid, size_t nidstr_length); + +struct netstrfns { + __u32 nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str, size_t size); + int (*nf_str2addr)(const char *str, int nob, __u32 *addr); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_print_addrlist)(char *buffer, int count, + struct list_head *list); + int (*nf_match_addr)(__u32 addr, struct list_head *list); + int (*nf_min_max)(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid); +}; + +#endif /* _LNET_NIDSTRINGS_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h new file mode 100644 index 0000000000000..843d35c06105a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -0,0 +1,90 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/socklnd.h + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +struct ksock_hello_msg { + __u32 kshm_magic; /* magic number of socklnd message */ + __u32 kshm_version; /* version of socklnd message */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* connection type */ + __u32 kshm_nips; /* # IP addrs */ + __u32 kshm_ips[0]; /* IP addrs */ +} WIRE_ATTR; + +struct ksock_lnet_msg { + struct lnet_hdr ksnm_hdr; /* lnet hdr */ + + /* + * ksnm_payload is removed because of winnt compiler's limitation: + * zero-sized array can only be placed at the tail of [nested] + * structure definitions. lnet payload will be stored just after + * the body of structure struct ksock_lnet_msg + */ +} WIRE_ATTR; + +struct ksock_msg { + __u32 ksm_type; /* type of socklnd message */ + __u32 ksm_csum; /* checksum if != 0 */ + __u64 ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */ + union { + struct ksock_lnet_msg lnetmsg; /* lnet message, it's empty if it's NOOP */ + } WIRE_ATTR ksm_u; +} WIRE_ATTR; + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 + +#endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/types.h b/drivers/staging/lustrefsx/lnet/include/lnet/types.h new file mode 100644 index 0000000000000..e4bfe3d4951dd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/lnet/types.h @@ -0,0 +1,671 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_TYPES_H__ +#define __LNET_TYPES_H__ + +/** \addtogroup lnet + * @{ */ + +#include +/** \addtogroup lnet_addr + * @{ */ + +#define LNET_VERSION "0.7.0" + +/** Portal reserved for LNet's own use. + * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** + * Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; + +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ +#define LNET_PID_LUSTRE 12345 + +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ + +static inline __u32 LNET_NIDADDR(lnet_nid_t nid) +{ + return nid & 0xffffffff; +} + +static inline __u32 LNET_NIDNET(lnet_nid_t nid) +{ + return (nid >> 32) & 0xffffffff; +} + +static inline lnet_nid_t LNET_MKNID(__u32 net, __u32 addr) +{ + return (((__u64)net) << 32) | addr; +} + +static inline __u32 LNET_NETNUM(__u32 net) +{ + return net & 0xffff; +} + +static inline __u32 LNET_NETTYP(__u32 net) +{ + return (net >> 16) & 0xffff; +} + +static inline __u32 LNET_MKNET(__u32 type, __u32 num) +{ + return (type << 16) | num; +} + +#define WIRE_ATTR __attribute__((packed)) + +/* Packed version of struct lnet_process_id to transfer via network */ +typedef struct lnet_process_id_packed { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} WIRE_ATTR lnet_process_id_packed; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct lnet_handle_wire { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR lnet_handle_wire_t; + +typedef enum lnet_msg_type { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +} lnet_msg_type_t; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. 
*/ +struct lnet_ack { + struct lnet_handle_wire dst_wmd; + __u64 match_bits; + __u32 mlength; +} WIRE_ATTR; + +struct lnet_put { + struct lnet_handle_wire ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} WIRE_ATTR; + +struct lnet_get { + struct lnet_handle_wire return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} WIRE_ATTR; + +struct lnet_reply { + struct lnet_handle_wire dst_wmd; +} WIRE_ATTR; + +struct lnet_hello { + __u64 incarnation; + __u32 type; +} WIRE_ATTR; + +typedef struct lnet_hdr { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* enum lnet_msg_type */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union { + struct lnet_ack ack; + struct lnet_put put; + struct lnet_get get; + struct lnet_reply reply; + struct lnet_hello hello; + } msg; +} WIRE_ATTR lnet_hdr_t; + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr::msg. + */ +typedef struct lnet_magicversion { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR lnet_magic_version_t; + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond + * with a "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! */ + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +typedef struct lnet_acceptor_connreq { + __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} WIRE_ATTR lnet_acceptor_connreq_t; + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +typedef struct lnet_counters { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; + __u64 route_length; + __u64 drop_length; +} WIRE_ATTR lnet_counters_t; + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 + +/* + * This is a hard-coded limit on the number of interfaces supported by + * the interface bonding implemented by the ksocknal LND. It must be + * defined here because it is used in LNet data structures that are + * common to all LNDs. + */ +#define LNET_NUM_INTERFACES 16 + +/** + * Objects maintained by the LNet are accessed through handles. 
Handle types + * have names of the form lnet_handle_xx, where xx is one of the two letter + * object type codes ('eq' for event queue, 'md' for memory descriptor, and + * 'me' for match entry). Each type of object is given a unique handle type + * to enhance type checking. + */ +#define LNET_WIRE_HANDLE_COOKIE_NONE (-1) + +typedef struct lnet_handle_eq { + __u64 cookie; +} lnet_handle_eq_t; + +/** + * Invalidate eq handle \a h. + */ +static inline void LNetInvalidateEQHandle(struct lnet_handle_eq *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Check whether eq handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetEQHandleIsInvalid(struct lnet_handle_eq h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +typedef struct lnet_handle_md { + __u64 cookie; +} lnet_handle_md_t; + +/** + * Invalidate md handle \a h. + */ +static inline void LNetInvalidateMDHandle(struct lnet_handle_md *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Check whether md handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +typedef struct lnet_handle_me { + __u64 cookie; +} lnet_handle_me_t; + +/** + * Global process ID. + */ +typedef struct lnet_process_id { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +} lnet_process_id_t; +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +typedef enum lnet_unlink { + LNET_RETAIN = 0, + LNET_UNLINK +} lnet_unlink_t; + +/** + * Values of the type enum lnet_ins_pos are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +typedef enum lnet_ins_pos { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +} lnet_ins_pos; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +/** + * Defines the visible parts of a memory descriptor. Values of this type + * are used to initialize memory descriptors. + */ +typedef struct lnet_md { + /** + * Specify the memory region associated with the memory descriptor. + * If the options field has: + * - LNET_MD_KIOV bit set: The start field points to the starting + * address of an array of lnet_kiov_t and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based + * fragments that are not necessarily mapped in virtual memory. + * - LNET_MD_IOVEC bit set: The start field points to the starting + * address of an array of struct kvec and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The struct kvec is used to describe fragments + * that have virtual addresses. + * - Otherwise: The memory region is contiguous. The start field + * specifies the starting address for the memory region and the + * length field specifies its length. 
+ * + * When the memory region is fragmented, all fragments but the first + * one must start on page boundary, and all but the last must end on + * page boundary. + */ + void *start; + unsigned int length; + /** + * Specifies the maximum number of operations that can be performed + * on the memory descriptor. An operation is any action that could + * possibly generate an event. In the usual case, the threshold value + * is decremented for each operation on the MD. When the threshold + * drops to zero, the MD becomes inactive and does not respond to + * operations. A threshold value of LNET_MD_THRESH_INF indicates that + * there is no bound on the number of operations that may be applied + * to a MD. + */ + int threshold; + /** + * Specifies the largest incoming request that the memory descriptor + * should respond to. When the unused portion of a MD (length - + * local offset) falls below this value, the MD becomes inactive and + * does not respond to further operations. This value is only used + * if the LNET_MD_MAX_SIZE option is set. + */ + int max_size; + /** + * Specifies the behavior of the memory descriptor. A bitwise OR + * of the following values can be used: + * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. + * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. + * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory + * region is provided by the incoming request. By default, the + * offset is maintained locally. When maintained locally, the + * offset is incremented by the length of the request so that + * the next operation (PUT or GET) will access the next part of + * the memory region. Note that only one offset variable exists + * per memory descriptor. If both PUT and GET operations are + * performed on a memory descriptor, the offset is updated each time. + * - LNET_MD_TRUNCATE: The length provided in the incoming request can + * be reduced to match the memory available in the region (determined + * by subtracting the offset from the length of the memory region). + * By default, if the length in the incoming operation is greater + * than the amount of memory available, the operation is rejected. + * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for + * incoming PUT operations, even if requested. By default, + * acknowledgments are sent for PUT operations that request an + * acknowledgment. Acknowledgments are never sent for GET operations. + * The data sent in the REPLY serves as an implicit acknowledgment. + * - LNET_MD_KIOV: The start and length fields specify an array of + * lnet_kiov_t. + * - LNET_MD_IOVEC: The start and length fields specify an array of + * struct iovec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * - LNET_MD_BULK_HANDLE: The bulk_handle field is valid. + * + * Note: + * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather + * capability for memory descriptors. They can't be both set. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * A handle for the event queue used to log the operations performed on + * the memory region. If this argument is a NULL handle (i.e. 
nullified + * by LNetInvalidateHandle()), operations performed on this memory + * descriptor are not logged. + */ + struct lnet_handle_eq eq_handle; + /** + * The bulk MD handle which was registered to describe the buffers + * either to be used to transfer data to the peer or receive data + * from the peer. This allows LNet to properly determine the NUMA + * node on which the memory was allocated and use that to select the + * nearest local network interface. This value is only used + * if the LNET_MD_BULK_HANDLE option is set. + */ + struct lnet_handle_md bulk_handle; +} lnet_md_t; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/** + * Options for the MD structure. See struct lnet_md::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See struct lnet_md::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See struct lnet_md::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See struct lnet_md::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See struct lnet_md::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See struct lnet_md::options. */ +#define LNET_MD_IOVEC (1 << 6) +/** See struct lnet_md::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See struct lnet_md::options. */ +#define LNET_MD_KIOV (1 << 8) +/** See struct lnet_md::options. */ +#define LNET_MD_BULK_HANDLE (1 << 9) + +/* For compatibility with Cray Portals */ +#define LNET_MD_PHYS 0 + +/** Infinite threshold on MD operations. See struct lnet_md::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/** + * A page-based fragment of a MD. + */ +typedef struct { + /** Pointer to the page where the fragment resides */ + struct page *kiov_page; + /** Length in bytes of the fragment */ + unsigned int kiov_len; + /** + * Starting offset of the fragment within the page. Note that the + * end of the fragment must not pass the end of the page; i.e., + * kiov_len + kiov_offset <= PAGE_SIZE. + */ + unsigned int kiov_offset; +} lnet_kiov_t; +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +typedef enum lnet_event_kind { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. 
+ */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +} lnet_event_kind_t; + +#define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) + +/** + * Information about an event on a MD. + */ +typedef struct lnet_event { + /** The identifier (nid, pid) of the target. */ + struct lnet_process_id target; + /** The identifier (nid, pid) of the initiator. */ + struct lnet_process_id initiator; + /** The source NID on the initiator. */ + struct lnet_process_id source; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the source. + */ + lnet_nid_t sender; + /** Indicates the type of the event. */ + enum lnet_event_kind type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see struct lnet_md). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + struct lnet_handle_md md_handle; + /** + * A snapshot of the state of the MD immediately after the event has + * been processed. In particular, the threshold field in md will + * reflect the value of the threshold after the operation occurred. + */ + struct lnet_md md; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see struct lnet_md::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. + */ + volatile unsigned long sequence; +} lnet_event_t; + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. + * + * The handler must not block, must be reentrant, and must not call any LNet + * API functions. It should return as quickly as possible. + */ +typedef void (*lnet_eq_handler_t)(struct lnet_event *event); +#define LNET_EQ_HANDLER_NONE NULL +/** @} lnet_eq */ + +/** \addtogroup lnet_data + * @{ */ + +/** + * Specify whether an acknowledgment should be sent by target when the PUT + * operation completes (i.e., when the data has been written to a MD of the + * target process). 
+ * + * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE + * by which acknowledgments can be disabled for a MD. + */ +typedef enum lnet_ack_req { + /** Request an acknowledgment */ + LNET_ACK_REQ, + /** Request that no acknowledgment should be generated. */ + LNET_NOACK_REQ +} lnet_ack_req_t; +/** @} lnet_data */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 0000000000000..110b6e699f095 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3386 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include +#include "o2iblnd.h" + +static struct lnet_lnd the_o2iblnd; + +kib_data_t kiblnd_data; + +static __u32 +kiblnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +static char * +kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int +kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(kib_connparams_t); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(kib_putreq_msg_t); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(kib_putack_msg_t); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(kib_get_msg_t); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(kib_completion_msg_t); + default: + return -1; + } +} + +static int +kiblnd_unpack_rd(kib_msg_t *msg, int flip) +{ + kib_rdma_desc_t *rd; + int nob; + int n; + int i; + + LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? + &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof (kib_msg_t, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void +kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + kib_net_t *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = ni->ni_nid; + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int +kiblnd_unpack_msg(kib_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + int flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? 
__swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer_ni endianness */ + msg->ibm_version = version; + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int +kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) +{ + kib_peer_ni_t *peer_ni; + kib_net_t *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (peer_ni == NULL) { + CERROR("Cannot allocate peer_ni\n"); + return -ENOMEM; + } + + peer_ni->ibp_ni = ni; + peer_ni->ibp_nid = nid; + peer_ni->ibp_error = 0; + peer_ni->ibp_last_alive = 0; + peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni); + peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; + atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD(&peer_ni->ibp_list); /* not in the peer_ni table yet */ + INIT_LIST_HEAD(&peer_ni->ibp_conns); + INIT_LIST_HEAD(&peer_ni->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer_ni; + return 0; +} + +void +kiblnd_destroy_peer (kib_peer_ni_t *peer_ni) +{ + kib_net_t *net = peer_ni->ibp_ni->ni_data; + + LASSERT(net != NULL); + 
LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0); + LASSERT(!kiblnd_peer_active(peer_ni)); + LASSERT(kiblnd_peer_idle(peer_ni)); + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections keep a reference on their peer_ni until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer_ni has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&net->ibn_npeers); +} + +kib_peer_ni_t * +kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates */ + struct list_head *peer_list = kiblnd_nid2peerlist(nid); + struct list_head *tmp; + kib_peer_ni_t *peer_ni; + + list_for_each(tmp, peer_list) { + + peer_ni = list_entry(tmp, kib_peer_ni_t, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + /* + * Match a peer if its NID and the NID of the local NI it + * communicates over are the same. Otherwise don't match + * the peer, which will result in a new lnd peer being + * created. + */ + if (peer_ni->ibp_nid != nid || + peer_ni->ibp_ni->ni_nid != ni->ni_nid) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n", + peer_ni, libcfs_nid2str(nid), + atomic_read(&peer_ni->ibp_refcount), + peer_ni->ibp_version); + return peer_ni; + } + return NULL; +} + +void +kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni) +{ + LASSERT(list_empty(&peer_ni->ibp_conns)); + + LASSERT (kiblnd_peer_active(peer_ni)); + list_del_init(&peer_ni->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer_ni); +} + +static int +kiblnd_get_peer_info(struct lnet_ni *ni, int index, + lnet_nid_t *nidp, int *count) +{ + kib_peer_ni_t *peer_ni; + struct list_head *ptmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer_ni->ibp_nid; + *count = atomic_read(&peer_ni->ibp_refcount); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +static void +kiblnd_del_peer_locked (kib_peer_ni_t *peer_ni) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (list_empty(&peer_ni->ibp_conns)) { + kiblnd_unlink_peer_locked(peer_ni); + } else { + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kiblnd_close_conn_locked(conn, 0); + } + /* NB closing peer_ni's last conn unlinked it. */ + } + /* NB peer_ni now unlinked; might even be freed if the peer_ni table had the + * last ref on it. 
*/ +} + +static int +kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_ni_t *peer_ni; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + } else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer_ni->ibp_nid == nid)) + continue; + + if (!list_empty(&peer_ni->ibp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ibp_conns)); + + list_splice_init(&peer_ni->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer_ni); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&zombies, -EIO); + + return rc; +} + +static kib_conn_t * +kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + kib_peer_ni_t *peer_ni; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { + + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + list_for_each(ctmp, &peer_ni->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, kib_conn_t, ibc_list); + kiblnd_conn_addref(conn); + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +static void +kiblnd_debug_rx (kib_rx_t *rx) +{ + CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", + rx, rx->rx_status, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits); +} + +static void +kiblnd_debug_tx (kib_tx_t *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + "cookie %#llx msg %s%s type %x cred %d\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? 
"-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); +} + +void +kiblnd_debug_conn (kib_conn_t *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s:\n", + atomic_read(&conn->ibc_refcount), conn, + conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d " + " r_cred %d\n", conn->ibc_state, conn->ibc_noops_posted, + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + + CDEBUG(D_CONSOLE, " tx_noops:\n"); + list_for_each(tmp, &conn->ibc_tx_noops) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBLND_RX_MSGS(conn); i++) + kiblnd_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +int +kiblnd_translate_mtu(int value) +{ + switch (value) { + default: + return -1; + case 0: + return 0; + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + } +} + +static void +kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + int mtu; + + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); + LASSERT (mtu >= 0); + if (mtu != 0) + cmid->route.path_rec->mtu = mtu; +} + +static int +kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +{ + cpumask_t *mask; + int vectors; + int off; + int i; + lnet_nid_t ibp_nid; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + + /* hash NID to CPU id in this partition... */ + ibp_nid = conn->ibc_peer->ibp_nid; + off = do_div(ibp_nid, cpumask_weight(mask)); + for_each_cpu(i, mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +/* + * Get the scheduler bound to this CPT. If the scheduler has no + * threads, which means that the CPT has no CPUs, then grab the + * next scheduler that we can use. + * + * This case would be triggered if a NUMA node is configured with + * no associated CPUs. + */ +static struct kib_sched_info * +kiblnd_get_scheduler(int cpt) +{ + struct kib_sched_info *sched; + int i; + + sched = kiblnd_data.kib_scheds[cpt]; + + if (sched->ibs_nthreads > 0) + return sched; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + if (sched->ibs_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. 
selected scheduler[%d]\n", + cpt, sched->ibs_cpt); + return sched; + } + } + + return NULL; +} + +kib_conn_t * +kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer_ni'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. On failure, the caller's ref on 'peer_ni' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_net_t *net = peer_ni->ibp_ni->ni_data; + kib_dev_t *dev; + struct ib_qp_init_attr *init_qp_attr; + struct kib_sched_info *sched; +#ifdef HAVE_IB_CQ_INIT_ATTR + struct ib_cq_init_attr cq_attr = {}; +#endif + kib_conn_t *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni); + sched = kiblnd_get_scheduler(cpt); + + if (sched == NULL) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_0; + } + + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. + */ + cpt = sched->ibs_cpt; + + LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, + sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + goto failed_0; + } + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + goto failed_1; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer_ni; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + conn->ibc_max_frags = peer_ni->ibp_max_frags; + conn->ibc_queue_depth = peer_ni->ibp_queue_depth; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX 
buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(conn)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + +#ifdef HAVE_IB_CQ_INIT_ATTR + cq_attr.cqe = IBLND_CQ_ENTRIES(conn); + cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + &cq_attr); +#else + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(conn), + kiblnd_get_completion_vector(conn, cpt)); +#endif + if (IS_ERR(cq)) { + CERROR("Failed to create CQ with %d CQEs: %ld\n", + IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notification: %d\n", rc); + goto failed_2; + } + + init_qp_attr->event_handler = kiblnd_qp_event; + init_qp_attr->qp_context = conn; + init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); + init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; + init_qp_attr->cap.max_recv_sge = 1; + init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr->qp_type = IB_QPT_RC; + init_qp_attr->send_cq = cq; + init_qp_attr->recv_cq = cq; + + conn->ibc_sched = sched; + + do { + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + if (!rc || init_qp_attr->cap.max_send_wr < 16) + break; + + init_qp_attr->cap.max_send_wr -= init_qp_attr->cap.max_send_wr / 4; + } while (rc); + + if (rc) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " + "send_sge: %d, recv_sge: %d\n", + rc, init_qp_attr->cap.max_send_wr, + init_qp_attr->cap.max_recv_wr, + init_qp_attr->cap.max_send_sge, + init_qp_attr->cap.max_recv_sge); + goto failed_2; + } + + if (init_qp_attr->cap.max_send_wr != IBLND_SEND_WRS(conn)) + CDEBUG(D_NET, "original send wr %d, created with %d\n", + IBLND_SEND_WRS(conn), init_qp_attr->cap.max_send_wr); + + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); + conn->ibc_nrx = IBLND_RX_MSGS(conn); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(conn)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! 
*/ + LASSERT (state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn, true); + failed_1: + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + failed_0: + return NULL; +} + +void +kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + kib_peer_ni_t *peer_ni = conn->ibc_peer; + int rc; + + LASSERT (!in_interrupt()); + LASSERT (atomic_read(&conn->ibc_refcount) == 0); + LASSERT(list_empty(&conn->ibc_early_rxs)); + LASSERT(list_empty(&conn->ibc_tx_noops)); + LASSERT(list_empty(&conn->ibc_tx_queue)); + LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT(list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_noops_posted == 0); + LASSERT (conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq != NULL) { + rc = ib_destroy_cq(conn->ibc_cq); + if (rc != 0) + CWARN("Error destroying CQ: %d\n", rc); + } + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) { + LIBCFS_FREE(conn->ibc_rxs, + IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); + } + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + kib_net_t *net = peer_ni->ibp_ni->ni_data; + + kiblnd_peer_decref(peer_ni); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } + + if (free_conn) + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + CDEBUG(D_NET, "Closing conn -> %s, " + "version: %x, reason: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int +kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, + int version, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn -> %s version: %x, " + "incarnation:%#llx(%x, %#llx)\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +static int +kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) +{ + kib_peer_ni_t *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, 
flags); + + if (nid != LNET_NID_ANY) + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); + + if (peer_ni->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer_ni->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer_ni, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? -ENOENT : 0; +} + +static int +kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +static void +kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) +{ + cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_ni_t *peer_ni; + unsigned long flags; + + read_lock_irqsave(glock, flags); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) + last_alive = peer_ni->ibp_last_alive; + + read_unlock_irqrestore(glock, flags); + + if (last_alive != 0) + *when = last_alive; + + /* peer_ni is not persistent in hash, trigger peer_ni creation + * and connection establishment with a NULL tx */ + if (peer_ni == NULL) + kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago\n", + libcfs_nid2str(nid), peer_ni, + last_alive ? 
cfs_duration_sec(now - last_alive) : -1); + return; +} + +static void +kiblnd_free_pages(kib_pages_t *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) +{ + kib_pages_t *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + GFP_NOFS); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void +kiblnd_unmap_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + int i; + + LASSERT (conn->ibc_rxs != NULL); + LASSERT (conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT(rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void +kiblnd_map_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = + kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", + i, rx->rx_msg, rx->rx_msgaddr, + (__u64)(page_to_phys(pg) + pg_off)); + + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); + } + } +} + +static void +kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) +{ + kib_hca_dev_t *hdev = tpo->tpo_hdev; + kib_tx_t *tx; + int i; + + LASSERT (tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static kib_hca_dev_t * +kiblnd_current_hdev(kib_dev_t *dev) +{ + kib_hca_dev_t *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return hdev; +} + +static void +kiblnd_map_tx_pool(kib_tx_pool_t 
*tpo) +{ + kib_pages_t *txpgs = tpo->tpo_tx_pages; + kib_pool_t *pool = &tpo->tpo_pool; + kib_net_t *net = pool->po_owner->ps_net; + kib_dev_t *dev; + struct page *page; + kib_tx_t *tx; + int page_offset; + int ipage; + int i; + + LASSERT (net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msg, + IBLND_MSG_SIZE, + DMA_TO_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT(page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT(ipage <= txpgs->ibp_npages); + } + } +} + +#ifdef HAVE_IB_GET_DMA_MR +struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, + int negotiated_nfrags) +{ + kib_net_t *net = ni->ni_data; + kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int mod; + __u16 nfrags; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + mod = tunables->lnd_map_on_demand; + nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod; + + LASSERT(hdev->ibh_mrs != NULL); + + if (mod > 0 && nfrags <= rd->rd_nfrags) + return NULL; + + return hdev->ibh_mrs; +} +#endif + +static void +kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) +{ + LASSERT(fpo->fpo_map_count == 0); + + if (fpo->fpo_is_fmr) { + if (fpo->fmr.fpo_fmr_pool) + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + } else { + struct kib_fast_reg_descriptor *frd, *tmp; + int i = 0; + + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + i++; + } + if (i < fpo->fast_reg.fpo_pool_size) + CERROR("FastReg pool still has %d regions registered\n", + fpo->fast_reg.fpo_pool_size - i); + } + + if (fpo->fpo_hdev) + kiblnd_hdev_decref(fpo->fpo_hdev); + + LIBCFS_FREE(fpo, sizeof(*fpo)); +} + +static void +kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + kib_fmr_pool_t *fpo, *tmp; + + list_for_each_entry_safe(fpo, tmp, head, fpo_list) { + list_del(&fpo->fpo_list); + kiblnd_destroy_fmr_pool(fpo); + } +} + +static int +kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int +kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, + int ncpts) +{ + int size = tunables->lnd_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) +{ + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = 
fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!fps->fps_cache }; + int rc = 0; + + fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, + ¶m); + if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); + if (rc != -ENOSYS) + CERROR("Failed to create FMR pool: %d\n", rc); + else + CERROR("FMRs are not supported\n"); + } + + return rc; +} + +static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) +{ + struct kib_fast_reg_descriptor *frd, *tmp; + int i, rc; + + INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size = 0; + for (i = 0; i < fps->fps_pool_size; i++) { + LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt, + sizeof(*frd)); + if (!frd) { + CERROR("Failed to allocate a new fast_reg descriptor\n"); + rc = -ENOMEM; + goto out; + } + frd->frd_mr = NULL; + +#ifndef HAVE_IB_MAP_MR_SG + frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, + LNET_MAX_PAYLOAD/PAGE_SIZE); + if (IS_ERR(frd->frd_frpl)) { + rc = PTR_ERR(frd->frd_frpl); + CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", + rc); + frd->frd_frpl = NULL; + goto out_middle; + } +#endif + +#ifdef HAVE_IB_ALLOC_FAST_REG_MR + frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, + LNET_MAX_PAYLOAD/PAGE_SIZE); +#else + frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, + IB_MR_TYPE_MEM_REG, + LNET_MAX_PAYLOAD/PAGE_SIZE); +#endif + if (IS_ERR(frd->frd_mr)) { + rc = PTR_ERR(frd->frd_mr); + CERROR("Failed to allocate ib_fast_reg_mr: %d\n", rc); + frd->frd_mr = NULL; + goto out_middle; + } + + /* There appears to be a bug in MLX5 code where you must + * invalidate the rkey of a new FastReg pool before first + * using it. Thus, I am marking the FRD invalid here. */ + frd->frd_valid = false; + + list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + fpo->fast_reg.fpo_pool_size++; + } + + return 0; + +out_middle: + if (frd->frd_mr) + ib_dereg_mr(frd->frd_mr); +#ifndef HAVE_IB_MAP_MR_SG + if (frd->frd_frpl) + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + LIBCFS_FREE(frd, sizeof(*frd)); + +out: + list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, + frd_list) { + list_del(&frd->frd_list); +#ifndef HAVE_IB_MAP_MR_SG + ib_free_fast_reg_page_list(frd->frd_frpl); +#endif + ib_dereg_mr(frd->frd_mr); + LIBCFS_FREE(frd, sizeof(*frd)); + } + + return rc; +} + +static int +kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) +{ + struct ib_device_attr *dev_attr; + kib_dev_t *dev = fps->fps_net->ibn_dev; + kib_fmr_pool_t *fpo; + int rc; + +#ifndef HAVE_IB_DEVICE_ATTRS + dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL); + if (!dev_attr) + return -ENOMEM; +#endif + + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (!fpo) { + rc = -ENOMEM; + goto out_dev_attr; + } + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + +#ifdef HAVE_IB_DEVICE_ATTRS + dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; +#else + rc = ib_query_device(fpo->fpo_hdev->ibh_ibdev, dev_attr); + if (rc) { + CERROR("Query device failed for %s: %d\n", + fpo->fpo_hdev->ibh_ibdev->name, rc); + goto out_dev_attr; + } +#endif + + /* Check for FMR or FastReg support */ + fpo->fpo_is_fmr = 0; + if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && + fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && + fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && + fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { + LCONSOLE_INFO("Using FMR for registration\n"); + fpo->fpo_is_fmr = 1; + } else if (dev_attr->device_cap_flags & 
IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + } else { + rc = -ENOSYS; + LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); + goto out_dev_attr; + } + + if (fpo->fpo_is_fmr) + rc = kiblnd_alloc_fmr_pool(fps, fpo); + else + rc = kiblnd_alloc_freg_pool(fps, fpo); + if (rc) + goto out_fpo; + +#ifndef HAVE_IB_DEVICE_ATTRS + kfree(dev_attr); +#endif + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_owner = fps; + *pp_fpo = fpo; + + return 0; + +out_fpo: + kiblnd_hdev_decref(fpo->fpo_hdev); + LIBCFS_FREE(fpo, sizeof(*fpo)); + +out_dev_attr: +#ifndef HAVE_IB_DEVICE_ATTRS + kfree(dev_attr); +#endif + + return rc; +} + +static void +kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) +{ + if (fps->fps_net == NULL) /* intialized? */ + return; + + spin_lock(&fps->fps_lock); + + while (!list_empty(&fps->fps_pool_list)) { + kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, + kib_fmr_pool_t, fpo_list); + fpo->fpo_failed = 1; + list_del(&fpo->fpo_list); + if (fpo->fpo_map_count == 0) + list_add(&fpo->fpo_list, zombies); + else + list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); + } + + spin_unlock(&fps->fps_lock); +} + +static void +kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) +{ + if (fps->fps_net != NULL) { /* initialized? */ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int +kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, + kib_net_t *net, + struct lnet_ioctl_config_o2iblnd_tunables *tunables) +{ + kib_fmr_pool_t *fpo; + int rc; + + memset(fps, 0, sizeof(kib_fmr_poolset_t)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + + fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); + fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); + fps->fps_cache = tunables->lnd_fmr_cache; + + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int +kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return cfs_time_aftereq(now, fpo->fpo_deadline); +} + +static int +kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) +{ + kib_hca_dev_t *hdev; + __u64 *pages = tx->tx_pages; + int npages; + int size; + int i; + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + return npages; +} + +void +kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + kib_fmr_pool_t *fpo = fmr->fmr_pool; + kib_fmr_poolset_t *fps; + cfs_time_t now = cfs_time_current(); + kib_fmr_pool_t *tmp; + int rc; + + if (!fpo) + return; + + fps = fpo->fpo_owner; + if (fpo->fpo_is_fmr) { + if (fmr->fmr_pfmr) { + rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); + LASSERT(!rc); + fmr->fmr_pfmr = NULL; + } + + if (status) { + rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); + LASSERT(!rc); + } + } else { + struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; + + if (frd) { + frd->frd_valid = false; + spin_lock(&fps->fps_lock); + 
list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); + spin_unlock(&fps->fps_lock); + fmr->fmr_frd = NULL; + } + } + fmr->fmr_pool = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int +kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, + __u32 nob, __u64 iov, kib_fmr_t *fmr, bool *is_fastreg) +{ + kib_fmr_pool_t *fpo; + __u64 *pages = tx->tx_pages; + __u64 version; + bool is_rx = (rd != tx->tx_rd); + bool tx_pages_mapped = 0; + int npages = 0; + int rc; + +again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_map_count++; + + if (fpo->fpo_is_fmr) { + struct ib_pool_fmr *pfmr; + + *is_fastreg = 0; + spin_unlock(&fps->fps_lock); + + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = 1; + } + + pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_key = is_rx ? pfmr->fmr->rkey + : pfmr->fmr->lkey; + fmr->fmr_frd = NULL; + fmr->fmr_pfmr = pfmr; + fmr->fmr_pool = fpo; + return 0; + } + rc = PTR_ERR(pfmr); + } else { + *is_fastreg = 1; + if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { + struct kib_fast_reg_descriptor *frd; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr *wr; + int n; +#else + struct ib_rdma_wr *wr; + struct ib_fast_reg_page_list *frpl; +#endif + struct ib_mr *mr; + + frd = list_first_entry(&fpo->fast_reg.fpo_pool_list, + struct kib_fast_reg_descriptor, + frd_list); + list_del(&frd->frd_list); + spin_unlock(&fps->fps_lock); + +#ifndef HAVE_IB_MAP_MR_SG + frpl = frd->frd_frpl; +#endif + mr = frd->frd_mr; + + if (!frd->frd_valid) { + struct ib_rdma_wr *inv_wr; + __u32 key = is_rx ? mr->rkey : mr->lkey; + + inv_wr = &frd->frd_inv_wr; + memset(inv_wr, 0, sizeof(*inv_wr)); + + inv_wr->wr.opcode = IB_WR_LOCAL_INV; + inv_wr->wr.wr_id = IBLND_WID_MR; + inv_wr->wr.ex.invalidate_rkey = key; + + /* Bump the key */ + key = ib_inc_rkey(key); + ib_update_fast_reg_key(mr, key); + } + +#ifdef HAVE_IB_MAP_MR_SG +#ifdef HAVE_IB_MAP_MR_SG_5ARGS + n = ib_map_mr_sg(mr, tx->tx_frags, + tx->tx_nfrags, NULL, PAGE_SIZE); +#else + n = ib_map_mr_sg(mr, tx->tx_frags, + tx->tx_nfrags, PAGE_SIZE); +#endif + if (unlikely(n != tx->tx_nfrags)) { + CERROR("Failed to map mr %d/%d " + "elements\n", n, tx->tx_nfrags); + return n < 0 ? n : -EINVAL; + } + + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + wr->wr.num_sge = 0; + wr->wr.send_flags = 0; + wr->mr = mr; + wr->key = is_rx ? 
mr->rkey : mr->lkey; + wr->access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#else + if (!tx_pages_mapped) { + npages = kiblnd_map_tx_pages(tx, rd); + tx_pages_mapped = 1; + } + + LASSERT(npages <= frpl->max_page_list_len); + memcpy(frpl->page_list, pages, + sizeof(*pages) * npages); + + /* Prepare FastReg WR */ + wr = &frd->frd_fastreg_wr; + memset(wr, 0, sizeof(*wr)); + + wr->wr.opcode = IB_WR_FAST_REG_MR; + wr->wr.wr_id = IBLND_WID_MR; + + wr->wr.wr.fast_reg.iova_start = iov; + wr->wr.wr.fast_reg.page_list = frpl; + wr->wr.wr.fast_reg.page_list_len = npages; + wr->wr.wr.fast_reg.page_shift = PAGE_SHIFT; + wr->wr.wr.fast_reg.length = nob; + wr->wr.wr.fast_reg.rkey = + is_rx ? mr->rkey : mr->lkey; + wr->wr.wr.fast_reg.access_flags = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#endif + + fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; + fmr->fmr_frd = frd; + fmr->fmr_pfmr = NULL; + fmr->fmr_pool = fpo; + return 0; + } + spin_unlock(&fps->fps_lock); + rc = -EAGAIN; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (rc != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return rc; + } + + /* EAGAIN and ... */ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "FMR pool, waiting for her to complete\n"); + schedule(); + goto again; + + } + + if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + if (rc == 0) { + fps->fps_version++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void +kiblnd_fini_pool(kib_pool_t *pool) +{ + LASSERT(list_empty(&pool->po_free_list)); + LASSERT(pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void +kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(kib_pool_t)); + INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_owner = ps; + pool->po_size = size; +} + +static void +kiblnd_destroy_pool_list(struct list_head *head) +{ + kib_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_pool_t, po_list); + list_del(&pool->po_list); + + LASSERT(pool->po_owner != NULL); + pool->po_owner->ps_pool_destroy(pool); + } +} + +static void +kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) +{ + if (ps->ps_net == NULL) /* intialized? */ + return; + + spin_lock(&ps->ps_lock); + while (!list_empty(&ps->ps_pool_list)) { + kib_pool_t *po = list_entry(ps->ps_pool_list.next, + kib_pool_t, po_list); + po->po_failed = 1; + list_del(&po->po_list); + if (po->po_allocated == 0) + list_add(&po->po_list, zombies); + else + list_add(&po->po_list, &ps->ps_failed_pool_list); + } + spin_unlock(&ps->ps_lock); +} + +static void +kiblnd_fini_poolset(kib_poolset_t *ps) +{ + if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int +kiblnd_init_poolset(kib_poolset_t *ps, int cpt, + kib_net_t *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + kib_pool_t *pool; + int rc; + + memset(ps, 0, sizeof(kib_poolset_t)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int +kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return cfs_time_aftereq(now, pool->po_deadline); +} + +void +kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + kib_poolset_t *ps = pool->po_owner; + kib_pool_t *tmp; + cfs_time_t now = cfs_time_current(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT(pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated--; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head * +kiblnd_pool_alloc_node(kib_poolset_t *ps) +{ + struct list_head *node; + kib_pool_t *pool; + int rc; + unsigned int interval = 1; + cfs_time_t time_before; + unsigned int trips = 0; + +again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated++; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... */ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + trips++; + CDEBUG(D_NET, "Another thread is allocating new " + "%s pool, waiting %d HZs for her to complete." 
+ "trips = %d\n", + ps->ps_name, interval, trips); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(interval); + if (interval < cfs_time_seconds(1)) + interval *= 2; + + goto again; + } + + if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + time_before = cfs_time_current(); + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete", + cfs_time_current() - time_before); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +static void +kiblnd_destroy_tx_pool(kib_pool_t *pool) +{ + kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); + int i; + + LASSERT (pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, + LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_frags)); + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge * + sizeof(*tx->tx_sge)); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + LIBCFS_FREE(tpo->tpo_tx_descs, + pool->po_size * sizeof(kib_tx_t)); +out: + kiblnd_fini_pool(pool); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); +} + +static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int ntx; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + ntx = tunables->lnd_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int +kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) +{ + int i; + int npg; + kib_pool_t *pool; + kib_tx_pool_t *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(kib_tx_t)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + + for (i = 0; i < size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + int wrq_sge = *kiblnd_tunables.kib_wrq_sge; + + tx->tx_pool = tpo; + 
if (ps->ps_net->ibn_fmr_ps != NULL) { + LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void +kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +{ + kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, + tps_poolset); + kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie++; +} + +static void +kiblnd_net_fini_pools(kib_net_t *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + kib_tx_poolset_t *tps; + kib_fmr_poolset_t *fps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } +} + +static int +kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, + int ncpts) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; +#ifdef HAVE_IB_GET_DMA_MR + unsigned long flags; +#endif + int cpt; + int rc; + int i; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + +#ifdef HAVE_IB_GET_DMA_MR + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (tunables->lnd_map_on_demand == 0) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +#endif + + if (tunables->lnd_fmr_pool_size < tunables->lnd_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + tunables->lnd_fmr_pool_size, + tunables->lnd_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_fmr_poolset_t)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, + net, tunables); + if (rc != 0) { + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) + LASSERT(i == ncpts); + +#ifdef HAVE_IB_GET_DMA_MR + create_tx_pool: +#endif + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_tx_poolset_t)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ni, ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int +kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) +{ +#ifndef HAVE_IB_DEVICE_ATTRS + struct ib_device_attr *attr; + int rc; +#endif + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + +#ifdef HAVE_IB_DEVICE_ATTRS + hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; +#else + LIBCFS_ALLOC(attr, sizeof(*attr)); + if (attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, attr); + if (rc == 0) + hdev->ibh_mr_size = attr->max_mr_size; + + LIBCFS_FREE(attr, sizeof(*attr)); + + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + return rc; + } +#endif + + if (hdev->ibh_mr_size == ~0ULL) { + hdev->ibh_mr_shift = 64; + return 0; + } + + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return -EINVAL; +} + +#ifdef HAVE_IB_GET_DMA_MR +static void +kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) +{ + if (hdev->ibh_mrs == NULL) + return; + + ib_dereg_mr(hdev->ibh_mrs); + + hdev->ibh_mrs = NULL; +} +#endif + +void +kiblnd_hdev_destroy(kib_hca_dev_t *hdev) +{ +#ifdef HAVE_IB_GET_DMA_MR + kiblnd_hdev_cleanup_mrs(hdev); +#endif + + if (hdev->ibh_pd != NULL) + ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +#ifdef HAVE_IB_GET_DMA_MR +static int +kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) +{ + struct ib_mr *mr; + int rc; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) + return rc; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs = mr; + + return 0; +} +#endif + +static int +kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ /* DUMMY */ + return 0; +} + +static int +kiblnd_dev_need_failover(kib_dev_t *dev) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding HCA failover because: + * + * a. no reliable CM event for HCA failover... + * b. 
no OFED API to get ib_device for current net_device... + * + * We have only two choices at this point: + * + * a. rdma_bind_addr(), it will conflict with listener cmid + * b. rdma_resolve_addr() to zero addr */ + cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + return rc; + } + + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, 1); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + return rc; + } + + rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */ + rdma_destroy_id(cmid); + return rc; +} + +int +kiblnd_dev_failover(kib_dev_t *dev) +{ + struct list_head zombie_tpo = LIST_HEAD_INIT(zombie_tpo); + struct list_head zombie_ppo = LIST_HEAD_INIT(zombie_ppo); + struct list_head zombie_fpo = LIST_HEAD_INIT(zombie_fpo); + struct rdma_cm_id *cmid = NULL; + kib_hca_dev_t *hdev = NULL; + kib_hca_dev_t *old; + struct ib_pd *pd; + kib_net_t *net; + struct sockaddr_in addr; + unsigned long flags; + int rc = 0; + int i; + + LASSERT (*kiblnd_tunables.kib_dev_failover > 1 || + dev->ibd_can_failover || + dev->ibd_hdev == NULL); + + rc = kiblnd_dev_need_failover(dev); + if (rc <= 0) + goto out; + + if (dev->ibd_hdev != NULL && + dev->ibd_hdev->ibh_cmid != NULL) { + /* XXX it's not good to close old listener at here, + * because we can fail to create new listener. + * But we have to close it now, otherwise rdma_bind_addr + * will return EADDRINUSE... How crap! 
*/ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", + dev->ibd_ifname, &dev->ibd_ifip, + cmid->device, rc); + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + +#ifdef HAVE_IB_ALLOC_PD_2ARGS + pd = ib_alloc_pd(cmid->device, 0); +#else + pd = ib_alloc_pd(cmid->device); +#endif + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + +#ifdef HAVE_IB_GET_DMA_MR + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } +#else + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) { + CERROR("Can't get device attributes: %d\n", rc); + goto out; + } +#endif + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) + dev->ibd_failed_failover++; + else + dev->ibd_failed_failover = 0; + + return rc; +} + +void +kiblnd_destroy_dev (kib_dev_t *dev) +{ + LASSERT (dev->ibd_nnets == 0); + LASSERT(list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +static kib_dev_t * +kiblnd_create_dev(char *ifname) +{ + struct net_device *netdev; + kib_dev_t *dev; + __u32 netmask; + __u32 ip; + int up; + int rc; + + rc = lnet_ipif_query(ifname, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", + ifname, rc); + return NULL; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ifname); + return NULL; + } + + LIBCFS_ALLOC(dev, sizeof(*dev)); + if (dev == NULL) + return NULL; + + netdev = 
dev_get_by_name(&init_net, ifname); + if (netdev == NULL) { + dev->ibd_can_failover = 0; + } else { + dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); + dev_put(netdev); + } + + INIT_LIST_HEAD(&dev->ibd_nets); + INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&dev->ibd_fail_list); + dev->ibd_ifip = ip; + strcpy(&dev->ibd_ifname[0], ifname); + + /* initialize the device */ + rc = kiblnd_dev_failover(dev); + if (rc != 0) { + CERROR("Can't initialize device: %d\n", rc); + LIBCFS_FREE(dev, sizeof(*dev)); + return NULL; + } + + list_add_tail(&dev->ibd_list, + &kiblnd_data.kib_devs); + return dev; +} + +static void +kiblnd_base_shutdown(void) +{ + struct kib_sched_info *sched; + int i; + + LASSERT(list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + LASSERT (kiblnd_data.kib_peers != NULL); + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + LASSERT(list_empty(&kiblnd_data.kib_peers[i])); + } + LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); + LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up_all(&kiblnd_data.kib_connd_waitq); + wake_up_all(&kiblnd_data.kib_failover_waitq); + + i = 2; + while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { + i++; + /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + /* fall through */ + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_peers != NULL) { + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +static void +kiblnd_shutdown(struct lnet_ni *ni) +{ + kib_net_t *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + int i; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer_ni state to clean up */ + i = 2; + while (atomic_read(&net->ibn_npeers) != 0) { + i++; + /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? 
D_WARNING : D_NET, + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + /* fall through */ + + case IBLND_INIT_NOTHING: + LASSERT (atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); + return; +} + +static int +kiblnd_base_startup(void) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + try_module_get(THIS_MODULE); + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */ + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; + LIBCFS_ALLOC(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers == NULL) + goto failed; + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); + + spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); + INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); + + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +static int +kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + 
int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +static int +kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +static kib_dev_t * +kiblnd_dev_search(char *ifname) +{ + kib_dev_t *alias = NULL; + kib_dev_t *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +static int +kiblnd_startup(struct lnet_ni *ni) +{ + char *ifname; + kib_dev_t *ibdev = NULL; + kib_net_t *net; + unsigned long flags; + int rc; + int newdev; + int node_id; + + LASSERT (ni->ni_net->net_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) + goto failed; + + net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; + + kiblnd_tunables_setup(ni); + + if (ni->ni_interfaces[0] != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + + CLASSERT(LNET_NUM_INTERFACES > 1); + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + goto failed; + } + + ifname = ni->ni_interfaces[0]; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) + ibdev = kiblnd_create_dev(ifname); + + if (ibdev == NULL) + goto failed; + + node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device); + ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + + net->ibn_dev = ibdev; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), 
ibdev->ibd_ifip); + + rc = kiblnd_dev_start_threads(ibdev, newdev, + ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "kiblnd_startup failed\n"); + return -ENETDOWN; +} + +static struct lnet_lnd the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_query = kiblnd_query, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, +}; + +static void __exit ko2iblnd_exit(void) +{ + lnet_unregister_lnd(&the_o2iblnd); +} + +static int __init ko2iblnd_init(void) +{ + int rc; + + CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, + ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= + IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, + ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ko2iblnd_init); +module_exit(ko2iblnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 0000000000000..d8ad1421092d6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd.h + * + * Author: Eric Barton + */ + +#ifdef HAVE_COMPAT_RDMA +#include + +#ifdef LINUX_3_17_COMPAT_H +#undef NEED_KTIME_GET_REAL_NS +#endif + +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,32) +#include +#endif + +#include +#include + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include + +#define IBLND_PEER_HASH_SIZE 101 /* # peer_ni lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 + +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 + +typedef struct +{ + int *kib_dev_failover; /* HCA failover */ + unsigned int *kib_service; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_ib_mtu; /* IB MTU */ + int *kib_require_priv_port;/* accept only privileged ports */ + int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; + int *kib_wrq_sge; /* # sg elements per wrq */ +} kib_tunables_t; + +extern kib_tunables_t kiblnd_tunables; + +#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ + +#define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ +#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ + +/* when eagerly to return credits */ +#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_CREDIT_HIGHWATER_V1 : \ + t->lnd_peercredits_hiw) + +#ifdef HAVE_RDMA_CREATE_ID_5ARG +# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \ + cb, dev, \ + ps, qpt) +#else +# ifdef HAVE_RDMA_CREATE_ID_4ARG +# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, \ + ps, qpt) +# else +# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps) +# endif +#endif + +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ + +/************************/ +/* derived constants... 
*/ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(c) \ + ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) +#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(c) \ + ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) +#define IBLND_SEND_WRS(c) \ + ((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \ + c->ibc_peer->ibp_ni)) +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +typedef struct +{ + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + cfs_time_t ibd_next_failover; + /* # failover failures */ + int ibd_failed_failover; + /* failover in progress */ + unsigned int ibd_failover; + /* IPoIB interface is a bonding master */ + unsigned int ibd_can_failover; + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; +} kib_dev_t; + +typedef struct kib_hca_dev +{ + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + int ibh_mr_shift; /* bits shift of max MR size */ + __u64 ibh_mr_size; /* size of MR */ +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *ibh_mrs; /* global MR */ +#endif + struct ib_pd *ibh_pd; /* PD */ + kib_dev_t *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +} kib_hca_dev_t; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +typedef struct +{ + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +} kib_pages_t; + +struct kib_pool; +struct kib_poolset; + +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +typedef struct kib_poolset +{ + /* serialize */ + spinlock_t ps_lock; + /* network it belongs to */ + struct kib_net *ps_net; + /* pool set name */ + char ps_name[IBLND_POOL_NAME_LEN]; + /* list of pools */ + struct list_head ps_pool_list; + /* failed pool list */ + struct list_head ps_failed_pool_list; + /* time stamp for retry if failed to allocate */ + cfs_time_t ps_next_retry; + /* is allocating new pool */ + int ps_increasing; + /* new pool size */ + int ps_pool_size; + /* CPT id */ + int ps_cpt; + + /* create a new pool */ + kib_ps_pool_create_t ps_pool_create; + /* destroy a pool */ + kib_ps_pool_destroy_t ps_pool_destroy; + /* initialize new allocated node */ + kib_ps_node_init_t ps_node_init; + /* finalize node */ + kib_ps_node_fini_t 
ps_node_fini; +} kib_poolset_t; + +typedef struct kib_pool +{ + /* chain on pool list */ + struct list_head po_list; + /* pre-allocated node */ + struct list_head po_free_list; + /* pool_set of this pool */ + kib_poolset_t *po_owner; + /* deadline of this pool */ + cfs_time_t po_deadline; + /* # of elements in use */ + int po_allocated; + /* pool is created on failed HCA */ + int po_failed; + /* # of pre-allocated elements */ + int po_size; +} kib_pool_t; + +typedef struct { + kib_poolset_t tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +} kib_tx_poolset_t; + +typedef struct { + kib_pool_t tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ +} kib_tx_pool_t; + +typedef struct +{ + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + int fps_cache; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + cfs_time_t fps_next_retry; +} kib_fmr_poolset_t; + +#ifndef HAVE_IB_RDMA_WR +struct ib_rdma_wr { + struct ib_send_wr wr; +}; +#endif + +struct kib_fast_reg_descriptor { /* For fast registration */ + struct list_head frd_list; + struct ib_rdma_wr frd_inv_wr; +#ifdef HAVE_IB_MAP_MR_SG + struct ib_reg_wr frd_fastreg_wr; +#else + struct ib_rdma_wr frd_fastreg_wr; + struct ib_fast_reg_page_list *frd_frpl; +#endif + struct ib_mr *frd_mr; + bool frd_valid; +}; + +typedef struct +{ + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + union { + struct { + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + } fmr; + struct { /* For fast registration */ + struct list_head fpo_pool_list; + int fpo_pool_size; + } fast_reg; + }; + cfs_time_t fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ + int fpo_is_fmr; +} kib_fmr_pool_t; + +typedef struct { + kib_fmr_pool_t *fmr_pool; /* pool of FMR */ + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ + struct kib_fast_reg_descriptor *fmr_frd; + u32 fmr_key; +} kib_fmr_t; + +typedef struct kib_net +{ + /* chain on kib_dev_t::ibd_nets */ + struct list_head ibn_list; + __u64 ibn_incarnation;/* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? 
*/ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + + kib_dev_t *ibn_dev; /* underlying IB device */ +} kib_net_t; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +typedef struct +{ + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer_ni/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + struct list_head *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connections to reconnect */ + struct list_head kib_reconn_list; + /* peers wait for reconnection */ + struct list_head kib_reconn_wait; + /* + * The second that peers are pulled out from \a kib_reconn_wait + * for reconnection. + */ + unsigned int kib_reconn_sec; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +} kib_data_t; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +typedef struct kib_connparams +{ + __u16 ibcp_queue_depth; + __u16 ibcp_max_frags; + __u32 ibcp_max_msg_size; +} WIRE_ATTR kib_connparams_t; + +typedef struct +{ + struct lnet_hdr ibim_hdr; /* portals header */ + char ibim_payload[0];/* piggy-backed payload */ +} WIRE_ATTR kib_immediate_msg_t; + +typedef struct +{ + __u32 rf_nob; /* # bytes this frag */ + __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
*/ +} WIRE_ATTR kib_rdma_frag_t; + +typedef struct +{ + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; + +typedef struct +{ + struct lnet_hdr ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kib_putreq_msg_t; + +typedef struct +{ + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; + +typedef struct +{ + struct lnet_hdr ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR kib_get_msg_t; + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ +} WIRE_ATTR kib_completion_msg_t; + +typedef struct +{ + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an ibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + + union { + kib_connparams_t connparams; + kib_immediate_msg_t immediate; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; + kib_completion_msg_t completion; + } WIRE_ATTR ibm_u; +} WIRE_ATTR kib_msg_t; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +typedef struct { + __u32 ibr_magic; /* sender's magic */ + __u16 ibr_version; /* sender's version */ + __u8 ibr_why; /* reject reason */ + __u8 ibr_padding; /* padding */ + __u64 ibr_incarnation; /* incarnation of peer_ni */ + kib_connparams_t ibr_cp; /* connection parameters */ +} WIRE_ATTR kib_rej_t; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer_ni */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer_ni */ + +/* peer_ni's rdma frags doesn't match mine */ +#define IBLND_REJECT_RDMA_FRAGS 6 +/* peer_ni's msg queue size doesn't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 +#define IBLND_REJECT_INVALID_SRV_ID 8 + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + /* queue for attention */ + struct list_head rx_list; + /* owning conn */ + 
struct kib_conn *rx_conn; + /* # bytes received (-1 while posted) */ + int rx_nob; + /* completion status */ + enum ib_wc_status rx_status; + /* message buffer (host vaddr) */ + kib_msg_t *rx_msg; + /* message buffer (I/O addr) */ + __u64 rx_msgaddr; + /* for dma_unmap_single() */ + DECLARE_PCI_UNMAP_ADDR(rx_msgunmap); + /* receive work item... */ + struct ib_recv_wr rx_wrq; + /* ...and its memory */ + struct ib_sge rx_sge; +} kib_rx_t; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +typedef struct kib_tx /* transmit message */ +{ + /* queue on idle_txs ibc_tx_queue etc. */ + struct list_head tx_list; + /* pool I'm from */ + kib_tx_pool_t *tx_pool; + /* owning conn */ + struct kib_conn *tx_conn; + /* # tx callbacks outstanding */ + short tx_sending; + /* queued for sending */ + short tx_queued; + /* waiting for peer_ni */ + short tx_waiting; + /* LNET completion status */ + int tx_status; + /* completion deadline */ + unsigned long tx_deadline; + /* completion cookie */ + __u64 tx_cookie; + /* lnet msgs to finalize on completion */ + struct lnet_msg *tx_lntmsg[2]; + /* message buffer (host vaddr) */ + kib_msg_t *tx_msg; + /* message buffer (I/O addr) */ + __u64 tx_msgaddr; + /* for dma_unmap_single() */ + DECLARE_PCI_UNMAP_ADDR(tx_msgunmap); + /** sge for tx_msgaddr */ + struct ib_sge tx_msgsge; + /* # send work items */ + int tx_nwrq; + /* # used scatter/gather elements */ + int tx_nsge; + /* send work items... */ + struct ib_rdma_wr *tx_wrq; + /* ...and their memory */ + struct ib_sge *tx_sge; + /* rdma descriptor */ + kib_rdma_desc_t *tx_rd; + /* # entries in... 
*/ + int tx_nfrags; + /* dma_map_sg descriptor */ + struct scatterlist *tx_frags; + /* rdma phys page addrs */ + __u64 *tx_pages; + /* FMR */ + kib_fmr_t fmr; + /* dma direction */ + int tx_dmadir; +} kib_tx_t; + +typedef struct kib_connvars +{ + /* connection-in-progress variables */ + kib_msg_t cv_msg; +} kib_connvars_t; + +typedef struct kib_conn +{ + /* scheduler information */ + struct kib_sched_info *ibc_sched; + /* owning peer_ni */ + struct kib_peer *ibc_peer; + /* HCA bound on */ + kib_hca_dev_t *ibc_hdev; + /* stash on peer_ni's conn list */ + struct list_head ibc_list; + /* schedule for attention */ + struct list_head ibc_sched_list; + /* version of connection */ + __u16 ibc_version; + /* reconnect later */ + __u16 ibc_reconnect:1; + /* which instance of the peer */ + __u64 ibc_incarnation; + /* # users */ + atomic_t ibc_refcount; + /* what's happening */ + int ibc_state; + /* # uncompleted sends */ + int ibc_nsends_posted; + /* # uncompleted NOOPs */ + int ibc_noops_posted; + /* # credits I have */ + int ibc_credits; + /* # credits to return */ + int ibc_outstanding_credits; + /* # ACK/DONE msg credits */ + int ibc_reserved_credits; + /* set on comms error */ + int ibc_comms_error; + /* connections queue depth */ + __u16 ibc_queue_depth; + /* connections max frags */ + __u16 ibc_max_frags; + /* receive buffers owned */ + unsigned int ibc_nrx:16; + /* scheduled for attention */ + unsigned int ibc_scheduled:1; + /* CQ callback fired */ + unsigned int ibc_ready:1; + /* time of last send */ + unsigned long ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + /* sends that need a credit */ + struct list_head ibc_tx_queue; + /* sends that don't need a credit */ + struct list_head ibc_tx_queue_nocred; + /* sends that need to reserve an ACK/DONE msg */ + struct list_head ibc_tx_queue_rsrvd; + /* active tx awaiting completion */ + struct list_head ibc_active_txs; + /* serialise */ + spinlock_t ibc_lock; + /* the rx descs */ + kib_rx_t *ibc_rxs; + /* premapped rx msg pages */ + kib_pages_t *ibc_rx_pages; + + /* CM id */ + struct rdma_cm_id *ibc_cmid; + /* completion queue */ + struct ib_cq *ibc_cq; + + /* in-progress connection state */ + kib_connvars_t *ibc_connvars; +} kib_conn_t; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +typedef struct kib_peer +{ + /* stash on global peer_ni list */ + struct list_head ibp_list; + /* who's on the other end(s) */ + lnet_nid_t ibp_nid; + /* LNet interface */ + struct lnet_ni *ibp_ni; + /* all active connections */ + struct list_head ibp_conns; + /* next connection to send on for round robin */ + struct kib_conn *ibp_next_conn; + /* msgs waiting for a conn */ + struct list_head ibp_tx_queue; + /* incarnation of peer_ni */ + __u64 ibp_incarnation; + /* when (in jiffies) I was last alive */ + cfs_time_t ibp_last_alive; + /* # users */ + atomic_t ibp_refcount; + /* version of peer_ni */ + __u16 ibp_version; + /* current passive connection attempts */ + unsigned short ibp_accepting; + /* current active connection attempts */ + unsigned short 
ibp_connecting; + /* reconnect this peer_ni later */ + unsigned char ibp_reconnecting; + /* counter of how many times we triggered a conn race */ + unsigned char ibp_races; + /* # consecutive reconnection attempts to this peer */ + unsigned int ibp_reconnected; + /* errno on closing this peer_ni */ + int ibp_error; + /* max map_on_demand */ + __u16 ibp_max_frags; + /* max_peer_credits */ + __u16 ibp_queue_depth; +} kib_peer_ni_t; + +#ifndef HAVE_IB_INC_RKEY +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} +#endif + +extern kib_data_t kiblnd_data; + +extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); + +int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); + +/* max # of fragments configured by user */ +static inline int +kiblnd_cfg_rdma_frags(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int mod; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + mod = tunables->lnd_map_on_demand; + return mod != 0 ? mod : IBLND_MAX_RDMA_FRAGS; +} + +static inline int +kiblnd_rdma_frags(int version, struct lnet_ni *ni) +{ + return version == IBLND_MSG_VERSION_1 ? + IBLND_MAX_RDMA_FRAGS : + kiblnd_cfg_rdma_frags(ni); +} + +static inline int +kiblnd_concurrent_sends(int version, struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int concurrent_sends; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + concurrent_sends = tunables->lnd_concurrent_sends; + + if (version == IBLND_MSG_VERSION_1) { + if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + } + + return concurrent_sends; +} + +static inline void +kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(kib_hca_dev_t *hdev) +{ + LASSERT(atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(kib_dev_t *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +#define kiblnd_peer_addref(peer_ni) \ +do { \ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n", \ + (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid), \ + atomic_read (&(peer_ni)->ibp_refcount)); \ + 
atomic_inc(&(peer_ni)->ibp_refcount); \ +} while (0) + +#define kiblnd_peer_decref(peer_ni) \ +do { \ + CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n", \ + (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid), \ + atomic_read (&(peer_ni)->ibp_refcount)); \ + LASSERT_ATOMIC_POS(&(peer_ni)->ibp_refcount); \ + if (atomic_dec_and_test(&(peer_ni)->ibp_refcount)) \ + kiblnd_destroy_peer(peer_ni); \ +} while (0) + +static inline bool +kiblnd_peer_connecting(kib_peer_ni_t *peer_ni) +{ + return peer_ni->ibp_connecting != 0 || + peer_ni->ibp_reconnecting != 0 || + peer_ni->ibp_accepting != 0; +} + +static inline bool +kiblnd_peer_idle(kib_peer_ni_t *peer_ni) +{ + return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); +} + +static inline struct list_head * +kiblnd_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = + ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; + + return &kiblnd_data.kib_peers[hash]; +} + +static inline int +kiblnd_peer_active (kib_peer_ni_t *peer_ni) +{ + /* Am I in the peer_ni hash table? */ + return !list_empty(&peer_ni->ibp_list); +} + +static inline struct kib_conn * +kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) +{ + struct list_head *next; + + LASSERT(!list_empty(&peer_ni->ibp_conns)); + + /* Advance to next connection, be sure to skip the head node */ + if (!peer_ni->ibp_next_conn || + peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns) + next = peer_ni->ibp_conns.next; + else + next = peer_ni->ibp_next_conn->ibc_list.next; + peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); + + return peer_ni->ibp_next_conn; +} + +static inline int +kiblnd_send_keepalive(kib_conn_t *conn) +{ + return (*kiblnd_tunables.kib_keepalive > 0) && + cfs_time_after(jiffies, conn->ibc_last_send + + msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * + MSEC_PER_SEC)); +} + +static inline int +kiblnd_need_noop(kib_conn_t *conn) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(kib_conn_t *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str(kib_conn_t *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* 
CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. */ + +#define IBLND_WID_INVAL 0 +#define IBLND_WID_TX 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_RDMA 3 +#define IBLND_WID_MR 4 +#define IBLND_WID_MASK 7UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state (kib_conn_t *conn, int state) +{ + conn->ibc_state = state; + smp_mb(); +} + +static inline void +kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size (kib_rdma_desc_t *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline __u32 +kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? 
+ offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : + offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); +} + +static inline __u64 +kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return ib_dma_mapping_error(dev, dma_addr); +} + +static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, + void *msg, size_t size, + enum dma_data_direction direction) +{ + return ib_dma_map_single(dev, msg, size, direction); +} + +static inline void kiblnd_dma_unmap_single(struct ib_device *dev, + __u64 addr, size_t size, + enum dma_data_direction direction) +{ + ib_dma_unmap_single(dev, addr, size, direction); +} + +#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) +#define KIBLND_UNMAP_ADDR(p, m, a) (a) + +static inline int kiblnd_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + return ib_dma_map_sg(dev, sg, nents, direction); +} + +static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + ib_dma_unmap_sg(dev, sg, nents, direction); +} + +static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_address(dev, sg); +} + +static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_len(dev, sg); +} + +/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly + * right because OFED1.2 defines it as const, to use it we have to add + * (void *) cast to overcome "const" */ + +#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) +#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) + +#ifdef HAVE_IB_GET_DMA_MR +struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, + int negotiated_nfrags); +#endif +void kiblnd_map_rx_descs(kib_conn_t *conn); +void kiblnd_unmap_rx_descs(kib_conn_t *conn); +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, + kib_rdma_desc_t *rd, __u32 nob, __u64 iov, + kib_fmr_t *fmr, bool *is_fastreg); +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); + +int kiblnd_tunables_setup(struct lnet_ni *ni); +int kiblnd_tunables_init(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(kib_dev_t *dev); +int kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, + lnet_nid_t nid); +void kiblnd_destroy_peer (kib_peer_ni_t *peer); +bool kiblnd_reconnect_peer(kib_peer_ni_t *peer); +void kiblnd_destroy_dev (kib_dev_t *dev); +void kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni); +kib_peer_ni_t *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked (kib_peer_ni_t *peer_ni, + int version, __u64 incarnation); +int kiblnd_close_peer_conns_locked (kib_peer_ni_t *peer_ni, int why); + +kib_conn_t *kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn); +void kiblnd_close_conn (kib_conn_t *conn, int error); +void kiblnd_close_conn_locked 
(kib_conn_t *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(kib_msg_t *msg, int nob); +int kiblnd_post_rx (kib_rx_t *rx, int credit); + +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, struct kvec *iov, + lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen); + diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 0000000000000..42147c7b01e68 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,3766 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define MAX_CONN_RACES_BEFORE_ABORT 20 + +static void kiblnd_peer_alive(kib_peer_ni_t *peer_ni); +static void kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error); +static void kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, + int type, int body_nob); +static int kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie); +static void kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn); +static void kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn); + +static void kiblnd_unmap_tx(kib_tx_t *tx); +static void kiblnd_check_sends_locked(kib_conn_t *conn); + +void +kiblnd_tx_done(kib_tx_t *tx) +{ + struct lnet_msg *lntmsg[2]; + int rc; + int i; + + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer_ni response */ + LASSERT (tx->tx_pool != NULL); + + kiblnd_unmap_tx(tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = tx->tx_nsge = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize(lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done(struct list_head *txlist, int status) +{ + kib_tx_t *tx; + + while (!list_empty(txlist)) { + tx = list_entry(txlist->next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kiblnd_tx_done(tx); + } +} + +static kib_tx_t * +kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) +{ + kib_net_t *net = (kib_net_t *)ni->ni_data; + struct list_head *node; + kib_tx_t *tx; + kib_tx_poolset_t *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, kib_tx_t, tx_list); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + LASSERT (tx->tx_nfrags == 0); + + return tx; +} + +static void +kiblnd_drop_rx(kib_rx_t *rx) +{ + kib_conn_t *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx (kib_rx_t *rx, int credit) +{ + kib_conn_t *conn = rx->rx_conn; + kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; +#endif + int rc; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); +#ifdef HAVE_IB_GET_DMA_MR + LASSERT(mr != NULL); + + rx->rx_sge.lkey = mr->lkey; 
+#else + rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey; +#endif + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT (conn->ibc_state >= IBLND_CONN_INIT); + LASSERT (rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + /* NB: need an extra reference after ib_post_recv because we don't + * own this rx (and rx::rx_conn) anymore, LU-5678. + */ + kiblnd_conn_addref(conn); + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); + if (unlikely(rc != 0)) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + goto out; + + if (unlikely(rc != 0)) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + goto out; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + goto out; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + +out: + kiblnd_conn_decref(conn); + return rc; +} + +static kib_tx_t * +kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +{ + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +static void +kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie %#llx from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + tx->tx_status = status; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + +static void +kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) +{ + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + + kiblnd_queue_tx(tx, conn); +} + +static void +kiblnd_handle_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + kib_tx_t *tx; + int rc = 0; + int rc2; + int post_credit; + + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? */ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + conn->ibc_queue_depth) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + conn->ibc_queue_depth); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN ("PUT_NACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx 
!= NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer_ni has received it! + * (b) tx_waiting set tells tx_complete() it's not done. */ + + tx->tx_nwrq = tx->tx_nsge = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +static void +kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + kib_net_t *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT (net != NULL); + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT (nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! */ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + write_lock_irqsave(g_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(g_lock, flags); + return; + } + write_unlock_irqrestore(g_lock, flags); + } + kiblnd_handle_rx(rx); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); + ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. 
*/ +} + +static int +kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob) +{ + kib_hca_dev_t *hdev; + kib_fmr_poolset_t *fps; + int cpt; + int rc; + bool is_fastreg = 0; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr, &is_fastreg); + if (rc != 0) { + CERROR("Can't map %u pages: %d\n", nob, rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer_ni, who will need + * the rkey */ + rd->rd_key = tx->fmr.fmr_key; + if (!is_fastreg) + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + + return 0; +} + +static void +kiblnd_unmap_tx(kib_tx_t *tx) +{ + if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +static int +kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) +{ + kib_net_t *net = ni->ni_data; + kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = NULL; +#endif + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer_ni and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags, + tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + +#ifdef HAVE_IB_GET_DMA_MR + mr = kiblnd_find_rd_dma_mr(ni, rd, + (tx->tx_conn != NULL) ? + tx->tx_conn->ibc_max_frags : -1); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? 
mr->rkey : mr->lkey; + return 0; + } +#endif + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + + +static int +kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct kvec *iov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct page *page; + struct scatterlist *sg; + unsigned long vaddr; + int fragnob; + int page_offset; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (net != NULL); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT(niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = lnet_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + sg_set_page(sg, page, fragnob, page_offset); + sg = sg_next(sg); + if (!sg) { + CERROR("lacking enough sg entries to map tx\n"); + return -EFAULT; + } + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; + } + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT (nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); + + sg_set_page(sg, kiov->kiov_page, fragnob, + kiov->kiov_offset + offset); + sg = sg_next(sg); + if (!sg) { + CERROR("lacking enough sg entries to map tx\n"); + return -EFAULT; + } + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +static int +kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit) +__must_hold(&conn->ibc_lock) +{ + kib_msg_t *msg = tx->tx_msg; + kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + int ver = conn->ibc_version; + int rc; + int done; + + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0); + LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); + + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); + + if (conn->ibc_nsends_posted == + kiblnd_concurrent_sends(ver, ni)) { + /* tx completions outstanding... 
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends_locked will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + kiblnd_tx_done(tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer_ni->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer_ni->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! 
*/ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd; + struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; + struct ib_send_wr *wr = &tx->tx_wrq[0].wr; + + if (frd != NULL) { + if (!frd->frd_valid) { + wr = &frd->frd_inv_wr.wr; + wr->next = &frd->frd_fastreg_wr.wr; + } else { + wr = &frd->frd_fastreg_wr.wr; + } + frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr; + } + + LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + "bad wr_id %#llx, opc %d, flags %d, peer_ni: %s\n", + bad->wr_id, bad->opcode, bad->send_flags, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + bad = NULL; + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); + } + + conn->ibc_last_send = jiffies; + + if (rc == 0) + return 0; + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer_ni->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +static void +kiblnd_check_sends_locked(kib_conn_t *conn) +{ + int ver = conn->ibc_version; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + LASSERT(conn->ibc_nsends_posted <= + kiblnd_concurrent_sends(ver, ni)); + LASSERT (!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_entry(conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT (!IBLND_OOB_CAPABLE(ver)); + credit = 1; + tx = list_entry(conn->ibc_tx_noops.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + credit = 1; + tx = list_entry(conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } +} + +static void +kiblnd_tx_complete (kib_tx_t *tx, int status) +{ + int failed = (status != 
IB_WC_SUCCESS); + kib_conn_t *conn = tx->tx_conn; + int idle; + + LASSERT (tx->tx_sending > 0); + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie %#llx" + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_waiting = 0; /* don't wait for peer_ni */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer_ni */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(tx); +} + +static void +kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + struct ib_sge *sge = &tx->tx_msgsge; + struct ib_rdma_wr *wrq; + int nob = offsetof(kib_msg_t, ibm_u) + body_nob; +#ifdef HAVE_IB_GET_DMA_MR + struct ib_mr *mr = hdev->ibh_mrs; +#endif + + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); +#ifdef HAVE_IB_GET_DMA_MR + LASSERT(mr != NULL); +#endif + + kiblnd_init_msg(tx->tx_msg, type, body_nob); + +#ifdef HAVE_IB_GET_DMA_MR + sge->lkey = mr->lkey; +#else + sge->lkey = hdev->ibh_pd->local_dma_lkey; +#endif + sge->addr = tx->tx_msgaddr; + sge->length = nob; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + memset(wrq, 0, sizeof(*wrq)); + + wrq->wr.next = NULL; + wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); + wrq->wr.sg_list = sge; + wrq->wr.num_sge = 1; + wrq->wr.opcode = IB_WR_SEND; + wrq->wr.send_flags = IB_SEND_SIGNALED; + + tx->tx_nwrq++; +} + +static int +kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) +{ + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + struct ib_rdma_wr *wrq = NULL; + struct ib_sge *sge; + int rc = resid; + int srcidx; + int dstidx; + int sge_nob; + int wrq_sge; + + LASSERT(!in_interrupt()); + LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0); + LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE); + + for (srcidx = dstidx = wrq_sge = sge_nob = 0; + resid > 0; resid -= sge_nob) { + int prev = dstidx; + + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx >= dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq >= conn->ibc_max_frags) { + CERROR("RDMA has too many fragments for peer_ni %s (%d), " + "src idx/frags: %d/%d dst idx/frags: %d/%d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags, + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + sge_nob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx)), resid); + + sge = &tx->tx_sge[tx->tx_nsge]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = sge_nob; + + if (wrq_sge == 
0) { + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->wr.next = &(wrq + 1)->wr; + wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->wr.sg_list = sge; + wrq->wr.opcode = IB_WR_RDMA_WRITE; + wrq->wr.send_flags = 0; + +#ifdef HAVE_IB_RDMA_WR + wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#else + wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, + dstidx); + wrq->wr.wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, + dstidx); +#endif + } + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob); + + wrq_sge++; + if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) { + tx->tx_nwrq++; + wrq->wr.num_sge = wrq_sge; + wrq_sge = 0; + } + tx->tx_nsge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = tx->tx_nsge = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof (kib_completion_msg_t)); + + return rc; +} + +static void +kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) +{ + struct list_head *q; + + LASSERT(tx->tx_nwrq > 0); /* work items set up */ + LASSERT(!tx->tx_queued); /* not queued for sending already */ + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + tx->tx_queued = 1; + tx->tx_deadline = jiffies + + msecs_to_jiffies(*kiblnd_tunables.kib_timeout * + MSEC_PER_SEC); + + if (tx->tx_conn == NULL) { + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +static void +kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); +} + +static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("Failed to bind to a free privileged port\n"); + return rc; +} + +static void +kiblnd_connect_peer (kib_peer_ni_t *peer_ni) +{ + struct rdma_cm_id *cmid; + kib_dev_t *dev; + kib_net_t *net = peer_ni->ibp_ni->ni_data; + 
struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT (net != NULL); + LASSERT (peer_ni->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer_ni, RDMA_PS_TCP, + IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + libcfs_nid2str(peer_ni->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer_ni->ibp_nid)); + + kiblnd_peer_addref(peer_ni); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } + + return; + + failed2: + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); /* cmid's ref */ + rdma_destroy_id(cmid); + return; + failed: + kiblnd_peer_connect_failed(peer_ni, 1, rc); +} + +bool +kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + char *reason = NULL; + struct list_head txs; + unsigned long flags; + + INIT_LIST_HEAD(&txs); + + write_lock_irqsave(glock, flags); + if (peer_ni->ibp_reconnecting == 0) { + if (peer_ni->ibp_accepting) + reason = "accepting"; + else if (peer_ni->ibp_connecting) + reason = "connecting"; + else if (!list_empty(&peer_ni->ibp_conns)) + reason = "connected"; + else /* connected then closed */ + reason = "closed"; + + goto no_reconnect; + } + + if (peer_ni->ibp_accepting) + CNETERR("Detecting race between accepting and reconnecting\n"); + peer_ni->ibp_reconnecting--; + + if (!kiblnd_peer_active(peer_ni)) { + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + reason = "unlinked"; + goto no_reconnect; + } + + peer_ni->ibp_connecting++; + peer_ni->ibp_reconnected++; + + write_unlock_irqrestore(glock, flags); + + kiblnd_connect_peer(peer_ni); + return true; + + no_reconnect: + write_unlock_irqrestore(glock, flags); + + CWARN("Abort reconnection of %s: %s\n", + libcfs_nid2str(peer_ni->ibp_nid), reason); + kiblnd_txlist_done(&txs, -ECONNABORTED); + return false; +} + +void +kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) +{ + kib_peer_ni_t *peer_ni; + kib_peer_ni_t *peer2; + kib_conn_t *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + int i; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer_ni + * connected */ + read_lock_irqsave(g_lock, flags); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL && !list_empty(&peer_ni->ibp_conns)) { + /* Found a peer_ni with an established connection */ + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 
ref for me... */ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) { + if (list_empty(&peer_ni->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer_ni)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer_ni->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer_ni ready to add to the peer_ni table and retry */ + rc = kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer_ni, but it's still connecting... */ + LASSERT(kiblnd_peer_connecting(peer2)); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer_ni); + return; + } + + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_connecting == 0); + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); + + kiblnd_peer_addref(peer_ni); + list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + + for (i = 0; i < tunables->lnd_conns_per_peer; i++) + kiblnd_connect_peer(peer_ni); + kiblnd_peer_decref(peer_ni); +} + +int +kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + struct lnet_hdr *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + struct lnet_process_id target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_rdma_desc_t *rd; + kib_tx_t *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... 
*/ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (-EIO); + + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + rd = &ibmsg->ibm_u.get.ibgm_rd; + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kiblnd_setup_rd_iov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); + else + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(tx); + return -EIO; + } + + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[rd->rd_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + ibmsg->ibm_u.get.ibgm_hdr = *hdr; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); + kiblnd_tx_done(tx); + return -EIO; + } + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? 
"PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(tx); + return -EIO; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + } + + /* send IMMEDIATE */ + + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; +} + +static void +kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) +{ + struct lnet_process_id target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct kvec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + niov, iov, offset, nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(rx->rx_conn, tx, + IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! 
*/ + lnet_finalize(lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + failed_1: + kiblnd_tx_done(tx); + failed_0: + lnet_finalize(lntmsg, -EIO); +} + +int +kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; + __u64 ibprm_cookie; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize(lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: { + kib_msg_t *txmsg; + kib_rdma_desc_t *rd; + ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + + if (mlen == 0) { + lnet_finalize(lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + 0, ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + rd = &txmsg->ibm_u.putack.ibpam_rd; + if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, rd, + niov, iov, offset, mlen); + else + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_tx_done(tx); + /* tell peer_ni it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, + rc, ibprm_cookie); + break; + } + + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[rd->rd_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +int +kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + atomic_inc(&kiblnd_data.kib_nthreads); + return 0; +} + +static void 
+kiblnd_thread_fini (void) +{ + atomic_dec (&kiblnd_data.kib_nthreads); +} + +static void +kiblnd_peer_alive (kib_peer_ni_t *peer_ni) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer_ni->ibp_last_alive = cfs_time_current(); + smp_mb(); +} + +static void +kiblnd_peer_notify (kib_peer_ni_t *peer_ni) +{ + int error = 0; + cfs_time_t last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) { + error = peer_ni->ibp_error; + peer_ni->ibp_error = 0; + + last_alive = peer_ni->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer_ni->ibp_ni, + peer_ni->ibp_nid, 0, last_alive); +} + +void +kiblnd_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_ni_t *peer_ni = conn->ibc_peer; + kib_dev_t *dev; + unsigned long flags; + + LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer_ni->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer_ni->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? + "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? + "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev; + if (peer_ni->ibp_next_conn == conn) + /* clear next_conn so it won't be used */ + peer_ni->ibp_next_conn = NULL; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty(&peer_ni->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer_ni)) { /* still in peer_ni table */ + kiblnd_unlink_peer_locked(peer_ni); + + /* set/clear error on last conn */ + peer_ni->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_handle_early_rxs(kib_conn_t *conn) +{ + unsigned long flags; + kib_rx_t *rx; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +static void +kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_safe(tmp, nxt, txs) { + tx = list_entry(tmp, kib_tx_t, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || + tx->tx_sending != 0); + } else { + LASSERT(tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_del(&tx->tx_list); + list_add(&tx->tx_list, &zombies); + } + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_txlist_done(&zombies, -ECONNABORTED); +} + +static void +kiblnd_finalise_conn (kib_conn_t *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state > IBLND_CONN_INIT); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + /* Complete all tx descs not waiting for sends to complete. 
+ * NB we should be safe from RDMA now that the QP has changed state */ + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +static void +kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + unsigned long flags; + + LASSERT (error != 0); + LASSERT (!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT(peer_ni->ibp_connecting > 0); + peer_ni->ibp_connecting--; + } else { + LASSERT (peer_ni->ibp_accepting > 0); + peer_ni->ibp_accepting--; + } + + if (kiblnd_peer_connecting(peer_ni)) { + /* another connection attempt under way... */ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + peer_ni->ibp_reconnected = 0; + if (list_empty(&peer_ni->ibp_conns)) { + /* Take peer_ni's blocked transmits to complete with error */ + list_add(&zombies, &peer_ni->ibp_tx_queue); + list_del_init(&peer_ni->ibp_tx_queue); + + if (kiblnd_peer_active(peer_ni)) + kiblnd_unlink_peer_locked(peer_ni); + + peer_ni->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT(list_empty(&peer_ni->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer_ni); + + if (list_empty(&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer_ni->ibp_nid)); + + kiblnd_txlist_done(&zombies, -EHOSTUNREACH); +} + +static void +kiblnd_connreq_done(kib_conn_t *conn, int status) +{ + kib_peer_ni_t *peer_ni = conn->ibc_peer; + kib_tx_t *tx; + struct list_head txs; + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer_ni->ibp_nid), active, + conn->ibc_version, status); + + LASSERT (!in_interrupt()); + LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer_ni->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer_ni->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = jiffies; + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer_ni); + + /* Add conn to peer_ni's list and nuke any dangling conns from a different + * peer_ni instance... 
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer_ni->ibp_conns); + peer_ni->ibp_reconnected = 0; + if (active) + peer_ni->ibp_connecting--; + else + peer_ni->ibp_accepting--; + + if (peer_ni->ibp_version == 0) { + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer_ni->ibp_version != conn->ibc_version || + peer_ni->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer_ni, conn->ibc_version, + conn->ibc_incarnation); + peer_ni->ibp_version = conn->ibc_version; + peer_ni->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + INIT_LIST_HEAD(&txs); + list_splice_init(&peer_ni->ibp_tx_queue, &txs); + + if (!kiblnd_peer_active(peer_ni) || /* peer_ni has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(&txs, -ECONNABORTED); + + return; + } + + /* +1 ref for myself, this connection is visible to other threads + * now, refcount of peer:ibp_conns can be released by connection + * close from either a different thread, or the calling of + * kiblnd_check_sends_locked() below. See bz21911 for details. + */ + kiblnd_conn_addref(conn); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs + * Note: if we are running with conns_per_peer > 1, these blocked + * txs will all get scheduled to the first connection which gets + * scheduled. We won't be using round robin on this first batch. + */ + spin_lock(&conn->ibc_lock); + while (!list_empty(&txs)) { + tx = list_entry(txs.next, kib_tx_t, tx_list); + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); + kiblnd_conn_decref(conn); +} + +static void +kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) +{ + int rc; + + rc = rdma_reject(cmid, rej, sizeof(*rej)); + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +static int +kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + kib_msg_t *reqmsg = priv; + kib_msg_t *ackmsg; + kib_dev_t *ibdev; + kib_peer_ni_t *peer_ni; + kib_peer_ni_t *peer2; + kib_conn_t *conn; + struct lnet_ni *ni = NULL; + kib_net_t *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + kib_rej_t rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + LASSERT (!in_interrupt()); + + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = (kib_dev_t *)cmid->context; + LASSERT (ibdev != NULL); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("peer_ni's port (%pI4h:%hu) is not privileged\n", + &ip, ntohs(peer_addr->sin_port)); + goto failed; + } + + if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! 
If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer_ni which protocol I + * speak. */ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); + + if (ni != NULL) { + net = (kib_net_t *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): " + "bad dst nid %s\n", libcfs_nid2str(nid), + ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer_ni's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth > + kiblnd_msg_queue_size(version, ni)) { + CERROR("Can't accept conn from %s, queue depth too large: " + " %d (<=%d wanted)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_queue_depth, + kiblnd_msg_queue_size(version, ni)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags > + kiblnd_rdma_frags(version, ni)) { + CWARN("Can't accept conn from %s (version %x): " + "max_frags %d too large (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + kiblnd_rdma_frags(version, ni)); + + if (version >= IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < + kiblnd_rdma_frags(version, ni) && + net->ibn_fmr_ps == NULL) { + CWARN("Can't accept conn from %s (version %x): " + "max_frags %d incompatible without FMR pool " + "(%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + kiblnd_rdma_frags(version, ni)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 'nid' is a new peer_ni; create */ + rc = kiblnd_create_peer(ni, &peer_ni, nid); + if (rc != 0) { + CERROR("Can't create peer_ni for %s\n", 
libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* We have validated the peer's parameters so use those */ + peer_ni->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags; + peer_ni->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + + if (kiblnd_peer_active(peer2)) { + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + peer2->ibp_version = version; + } + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n", + libcfs_nid2str(nid), peer2->ibp_version, version, + peer2->ibp_incarnation, reqmsg->ibm_srcstamp); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* Tie-break connection race in favour of the higher NID. + * If we keep running into a race condition multiple times, + * we have to assume that the connection attempt with the + * higher NID is stuck in a connecting state and will never + * recover. As such, we pass through this if-block and let + * the lower NID connection win so we can move forward. + */ + if (peer2->ibp_connecting != 0 && + nid < ni->ni_nid && peer2->ibp_races < + MAX_CONN_RACES_BEFORE_ABORT) { + peer2->ibp_races++; + write_unlock_irqrestore(g_lock, flags); + + CDEBUG(D_NET, "Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT) + CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n", + libcfs_nid2str(peer2->ibp_nid), + MAX_CONN_RACES_BEFORE_ABORT); + /* + * passive connection is allowed even this peer_ni is waiting for + * reconnection. + */ + peer2->ibp_reconnecting = 0; + peer2->ibp_races = 0; + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + /* Race with kiblnd_launch_tx (active connect) to create peer_ni + * so copy validated parameters since we now know what the + * peer_ni's limits are */ + peer2->ibp_max_frags = peer_ni->ibp_max_frags; + peer2->ibp_queue_depth = peer_ni->ibp_queue_depth; + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* Brand new peer_ni */ + LASSERT (peer_ni->ibp_accepting == 0); + LASSERT (peer_ni->ibp_version == 0 && + peer_ni->ibp_incarnation == 0); + + peer_ni->ibp_accepting = 1; + peer_ni->ibp_version = version; + peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT (net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer_ni); + list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM); + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
*/ + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = conn->ibc_queue_depth; + conn->ibc_reserved_credits = conn->ibc_queue_depth; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) { + rej.ibr_cp.ibcp_queue_depth = + kiblnd_msg_queue_size(version, ni); + rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); + lnet_ni_decref(ni); + } + + rej.ibr_version = version; + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +static void +kiblnd_check_reconnect(kib_conn_t *conn, int version, + __u64 incarnation, int why, kib_connparams_t *cp) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_ni_t *peer_ni = conn->ibc_peer; + char *reason; + int msg_size = IBLND_MSG_SIZE; + int frag_num = -1; + int queue_dep = -1; + bool reconnect; + unsigned long flags; + + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT(peer_ni->ibp_connecting > 0); /* 'conn' at least */ + + if (cp) { + msg_size = cp->ibcp_max_msg_size; + frag_num = cp->ibcp_max_frags; + queue_dep = cp->ibcp_queue_depth; + } + + write_lock_irqsave(glock, flags); + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * initiated by kiblnd_query() */ + reconnect = (!list_empty(&peer_ni->ibp_tx_queue) || + peer_ni->ibp_version != version) && + peer_ni->ibp_connecting && + peer_ni->ibp_accepting == 0; + if (!reconnect) { + reason = "no need"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; + + case IBLND_REJECT_RDMA_FRAGS: { + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + if (!cp) { + reason = "can't negotiate max frags"; + goto out; + } + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + if (!tunables->lnd_map_on_demand) { + reason = "map_on_demand must be enabled"; + goto out; + } + if (conn->ibc_max_frags <= frag_num) { + reason = "unsupported max frags"; + goto out; + } + + peer_ni->ibp_max_frags = frag_num; + reason = "rdma fragments"; + break; + } + case IBLND_REJECT_MSG_QUEUE_SIZE: + if (!cp) { + reason = "can't negotiate queue depth"; + goto out; + } + if (conn->ibc_queue_depth <= queue_dep) { + reason = "unsupported queue depth"; + goto out; + } + + 
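+		/*
+		 * Worked example of the queue-depth negotiation (numbers are
+		 * illustrative only): if this side connected asking for a
+		 * depth of 128 and the peer's reject advertised 63, the test
+		 * above passes (63 < 128), ibp_queue_depth drops to 63 just
+		 * below, and the reconnect retries with parameters the peer
+		 * can accept.  An advertised depth equal to or larger than
+		 * ours cannot explain the reject, so the reconnect is
+		 * abandoned ("unsupported queue depth").
+		 */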
peer_ni->ibp_queue_depth = queue_dep; + reason = "queue depth"; + break; + + case IBLND_REJECT_CONN_STALE: + reason = "stale"; + break; + + case IBLND_REJECT_CONN_RACE: + reason = "conn race"; + break; + + case IBLND_REJECT_CONN_UNCOMPAT: + reason = "version negotiation"; + break; + + case IBLND_REJECT_INVALID_SRV_ID: + reason = "invalid service id"; + break; + } + + conn->ibc_reconnect = 1; + peer_ni->ibp_reconnecting++; + peer_ni->ibp_version = version; + if (incarnation != 0) + peer_ni->ibp_incarnation = incarnation; + out: + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n", + libcfs_nid2str(peer_ni->ibp_nid), + reconnect ? "reconnect" : "don't reconnect", + reason, IBLND_MSG_VERSION, version, msg_size, + conn->ibc_queue_depth, queue_dep, + conn->ibc_max_frags, frag_num); + /* + * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer_ni + * while destroying the zombie + */ +} + +static void +kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) +{ + kib_peer_ni_t *peer_ni = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_CONN_STALE, NULL); + break; + + case IB_CM_REJ_INVALID_SERVICE_ID: + kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_INVALID_SRV_ID, NULL); + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { + kib_rej_t *rej = priv; + kib_connparams_t *cp = NULL; + int flip = 0; + __u64 incarnation = -1; + + /* NB. default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwrote. + * + * if I try to connect to a V1 peer_ni with V2 protocol, + * it rejected me then upgrade to V2, I have no idea + * about the upgrading and try to reconnect with V1, + * in this case upgraded V2 can find out I'm trying to + * talk to the old guy and reject me(incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = 1; + } + + if (priv_nob >= sizeof(kib_rej_t) && + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. 
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n", + libcfs_nid2str(peer_ni->ibp_nid), rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + case IBLND_REJECT_MSG_QUEUE_SIZE: + case IBLND_REJECT_RDMA_FRAGS: + kiblnd_check_reconnect(conn, rej->ibr_version, + incarnation, rej->ibr_why, cp); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_why); + break; + } + break; + } + /* fall through */ + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, -ECONNREFUSED); +} + +static void +kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) +{ + kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + kib_net_t *net = ni->ni_data; + kib_msg_t *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT (net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer_ni->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different with " + "requested version %x\n", + libcfs_nid2str(peer_ni->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth > + conn->ibc_queue_depth) { + CERROR("%s has incompatible queue depth %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + conn->ibc_queue_depth); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags > + conn->ibc_max_frags) { + CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + conn->ibc_max_frags); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer_ni->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + 
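+	/*
+	 * Reading of the checks around this point: the CONNACK parameters
+	 * validated above may only shrink what this side requested, never
+	 * grow it.  The block below then confirms, under the global lock,
+	 * that the reply is addressed to this NI (ibm_dstnid) and to its
+	 * current incarnation (ibm_dststamp); a reply aimed at an earlier
+	 * incarnation, e.g. one generated before LNet was restarted here,
+	 * is recorded as a comms error (-ESTALE) so the connection is torn
+	 * down immediately via the 'failed' path below.
+	 */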
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == ni->ni_nid && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, " + "version: %x max_frags: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT (rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +static int +kiblnd_active_connect (struct rdma_cm_id *cmid) +{ + kib_peer_ni_t *peer_ni = (kib_peer_ni_t *)cmid->context; + kib_conn_t *conn; + kib_msg_t *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer_ni->ibp_incarnation; + version = (peer_ni->ibp_version == 0) ? IBLND_MSG_VERSION : + peer_ni->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_ACTIVE_CONNECT, + version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer_ni, 1, -ENOMEM); + kiblnd_peer_decref(peer_ni); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref + * on peer_ni */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer_ni->ibp_ni, msg, version, + 0, peer_ni->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + + rc = rdma_connect(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + kib_peer_ni_t *peer_ni; + kib_conn_t *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer_ni = (kib_peer_ni_t *)cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer_ni = (kib_peer_ni_t *)cmid->context; + + CDEBUG(D_NET,"%s Addr resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, *kiblnd_tunables.kib_timeout * 1000); + if (rc == 0) { + kib_net_t *net = peer_ni->ibp_ni->ni_data; + kib_dev_t *dev = net->ibn_dev; + + CDEBUG(D_NET, "%s: connection bound to "\ + "%s:%pI4h:%s\n", + libcfs_nid2str(peer_ni->ibp_nid), + dev->ibd_ifname, + &dev->ibd_ifip, cmid->device->name); + + return 0; + } + + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer_ni, 1, rc); + kiblnd_peer_decref(peer_ni); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer_ni = (kib_peer_ni_t *)cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer_ni); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer_ni = (kib_peer_ni_t *)cmid->context; + CDEBUG(D_NET,"%s Route resolved: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer_ni, 1, event->status); + 
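+		/*
+		 * Refcount note (as set up in kiblnd_connect_peer()): the
+		 * peer_ni reference dropped just below is the one taken on
+		 * behalf of this cmid before rdma_resolve_addr(); returning
+		 * a non-zero status from this callback tells the RDMA CM to
+		 * destroy the cmid, so that reference must be released here.
+		 */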
kiblnd_peer_decref(peer_ni); + return event->status; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_UNREACHABLE: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR ("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! 
*/ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + + case RDMA_CM_EVENT_DISCONNECTED: + conn = (kib_conn_t *)cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) +{ + kib_tx_t *tx; + struct list_head *ttmp; + + list_for_each(ttmp, txs) { + tx = list_entry(ttmp, kib_tx_t, tx_list); + + if (txs != &conn->ibc_active_txs) { + LASSERT(tx->tx_queued); + } else { + LASSERT(!tx->tx_queued); + LASSERT(tx->tx_waiting || tx->tx_sending != 0); + } + + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CERROR("Timed out tx: %s, %lu seconds\n", + kiblnd_queue2str(conn, txs), + cfs_duration_sec(jiffies - tx->tx_deadline)); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(kib_conn_t *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +static void +kiblnd_check_conns (int idx) +{ + struct list_head closes = LIST_HEAD_INIT(closes); + struct list_head checksends = LIST_HEAD_INIT(checksends); + struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs); + struct list_head *peers = &kiblnd_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_ni_t *peer_ni; + kib_conn_t *conn; + kib_tx_t *tx, *tx_tmp; + struct list_head *ctmp; + unsigned long flags; + + /* NB. We expect to have a look at all the peers and not find any + * RDMAs to time out, so we just use a shared lock while we + * take a look... 
*/ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + list_for_each(ptmp, peers) { + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + + /* Check tx_deadline */ + list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CWARN("Timed out tx for %s: %lu seconds\n", + libcfs_nid2str(peer_ni->ibp_nid), + cfs_duration_sec(jiffies - tx->tx_deadline)); + list_move(&tx->tx_list, &timedout_txs); + } + } + + list_for_each(ctmp, &peer_ni->ibp_conns) { + int timedout; + int sendnoop; + + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); + + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + spin_unlock(&conn->ibc_lock); + continue; + } + + if (timedout) { + CERROR("Timed out RDMA with %s (%lu): " + "c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer_ni->ibp_nid), + cfs_duration_sec(cfs_time_current() - + peer_ni->ibp_last_alive), + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + list_add(&conn->ibc_connd_list, &closes); + } else { + list_add(&conn->ibc_connd_list, &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); + + spin_unlock(&conn->ibc_lock); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!list_empty(&timedout_txs)) + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. */ + while (!list_empty(&closes)) { + conn = list_entry(closes.next, + kib_conn_t, ibc_connd_list); + list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + while (!list_empty(&checksends)) { + conn = list_entry(checksends.next, + kib_conn_t, ibc_connd_list); + list_del(&conn->ibc_connd_list); + + spin_lock(&conn->ibc_lock); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_conn_decref(conn); + } +} + +static void +kiblnd_disconnect_conn (kib_conn_t *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (current == kiblnd_data.kib_connd); + LASSERT (conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +/* + * High-water for reconnection to the same peer_ni, reconnection attempt should + * be delayed after trying more than KIB_RECONN_HIGH_RACE. + */ +#define KIB_RECONN_HIGH_RACE 10 +/* + * Allow connd to take a break and handle other things after consecutive + * reconnection attemps. 
+ */ +#define KIB_RECONN_BREAK 100 + +int +kiblnd_connd (void *arg) +{ + spinlock_t *lock= &kiblnd_data.kib_connd_lock; + wait_queue_entry_t wait; + unsigned long flags; + kib_conn_t *conn; + int timeout; + int i; + int dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(lock, flags); + + while (!kiblnd_data.kib_shutdown) { + int reconn = 0; + + dropped_lock = 0; + + if (!list_empty(&kiblnd_data.kib_connd_zombies)) { + kib_peer_ni_t *peer_ni = NULL; + + conn = list_entry(kiblnd_data.kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + if (conn->ibc_reconnect) { + peer_ni = conn->ibc_peer; + kiblnd_peer_addref(peer_ni); + } + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + kiblnd_destroy_conn(conn, !peer_ni); + + spin_lock_irqsave(lock, flags); + if (!peer_ni) + continue; + + conn->ibc_peer = peer_ni; + if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_list); + else + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_reconn_wait); + } + + if (!list_empty(&kiblnd_data.kib_connd_conns)) { + conn = list_entry(kiblnd_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + kiblnd_disconnect_conn(conn); + kiblnd_conn_decref(conn); + + spin_lock_irqsave(lock, flags); + } + + while (reconn < KIB_RECONN_BREAK) { + if (kiblnd_data.kib_reconn_sec != get_seconds()) { + kiblnd_data.kib_reconn_sec = get_seconds(); + list_splice_init(&kiblnd_data.kib_reconn_wait, + &kiblnd_data.kib_reconn_list); + } + + if (list_empty(&kiblnd_data.kib_reconn_list)) + break; + + conn = list_entry(kiblnd_data.kib_reconn_list.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + reconn += kiblnd_reconnect_peer(conn->ibc_peer); + kiblnd_peer_decref(conn->ibc_peer); + LIBCFS_FREE(conn, sizeof(*conn)); + + spin_lock_irqsave(lock, flags); + } + + /* careful with the jiffy wrap... */ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kiblnd_data.kib_peer_hash_size; + + spin_unlock_irqrestore(lock, flags); + dropped_lock = 1; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer_ni table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
*/ + + if (*kiblnd_tunables.kib_timeout > n * p) + chunk = (chunk * n * p) / + *kiblnd_tunables.kib_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + kiblnd_data.kib_peer_hash_size; + } + + deadline += msecs_to_jiffies(p * MSEC_PER_SEC); + spin_lock_irqsave(lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(lock, flags); + + schedule_timeout(timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(lock, flags); + } + + spin_unlock_irqrestore(lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* We received a packet but connection isn't established + * probably handshake packet was lost, so free to + * force make connection established */ + rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +static void +kiblnd_complete (struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_MR: + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + CNETERR("FastReg failed: %d\n", wc->status); + return; + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
*/ + kib_conn_t *conn = (kib_conn_t *)arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + kib_conn_t *conn; + wait_queue_entry_t wait; + unsigned long flags; + struct ib_wc wc; + int did_something; + int busy_loops = 0; + int rc; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Unable to bind on CPU partition %d, please verify " + "whether all CPUs are healthy and reload modules if " + "necessary, otherwise your system might under risk of " + "low performance\n", sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = 0; + + if (!list_empty(&sched->ibs_conns)) { + conn = list_entry(sched->ibs_conns.next, + kib_conn_t, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + wc.wr_id = IBLND_WID_INVAL; + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) { + LCONSOLE_ERROR( + "ib_poll_cq (rc: %d) returned invalid " + "wr_id, opcode %d, status: %d, " + "vendor_err: %d, conn: %s status: %d\n" + "please upgrade firmware and OFED or " + "contact vendor.\n", rc, + wc.opcode, wc.status, wc.vendor_err, + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state); + rc = -EINVAL; + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ...drop my ref from above */ + did_something = 1; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + schedule(); + busy_loops = 0; + + remove_wait_queue(&sched->ibs_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_dev_t *dev; + wait_queue_entry_t wait; + unsigned long flags; + int rc; + + LASSERT(*kiblnd_tunables.kib_dev_failover != 0); + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + int do_failover = 0; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (cfs_time_before(cfs_time_current(), + dev->ibd_next_failover)) + continue; + do_failover = 1; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev); + + write_lock_irqsave(glock, flags); + + LASSERT (dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = cfs_time_shift(3); + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = + cfs_time_shift(min(dev->ibd_failed_failover, 10)); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? 
cfs_time_seconds(10) : + cfs_time_seconds(1)); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 0000000000000..72cb50ecd14f5 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,315 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +#define CURRENT_LND_VERSION 1 + +static int service = 987; +module_param(service, int, 0444); +MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); + +static int cksum = 0; +module_param(cksum, int, 0644); +MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +module_param(timeout, int, 0644); +MODULE_PARM_DESC(timeout, "timeout (seconds)"); + +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. 
*/ +static int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); + +static unsigned int conns_per_peer = 1; +module_param(conns_per_peer, uint, 0444); +MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +module_param(ntx, int, 0444); +MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_credits_hiw = 0; +module_param(peer_credits_hiw, int, 0444); +MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); + +static int peer_buffer_credits = 0; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +module_param(ipif_name, charp, 0444); +MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); + +static int retry_count = 5; +module_param(retry_count, int, 0644); +MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); + +static int rnr_retry_count = 6; +module_param(rnr_retry_count, int, 0644); +MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); + +static int keepalive = 100; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); + +static int ib_mtu; +module_param(ib_mtu, int, 0444); +MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096"); + +static int concurrent_sends; +module_param(concurrent_sends, int, 0444); +MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); + +#ifdef HAVE_IB_GET_DMA_MR +#define IBLND_DEFAULT_MAP_ON_DEMAND 0 +#define IBLND_MIN_MAP_ON_DEMAND 0 +#else +#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS +#define IBLND_MIN_MAP_ON_DEMAND 1 +#endif +static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; +module_param(map_on_demand, int, 0444); +MODULE_PARM_DESC(map_on_demand, "map on demand"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +module_param(fmr_flush_trigger, int, 0444); +MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +module_param(fmr_cache, int, 0444); +MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover = 0; +module_param(dev_failover, int, 0444); +MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); + +static int require_privileged_port; +module_param(require_privileged_port, int, 0644); +MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); + +static int use_privileged_port = 1; 
+module_param(use_privileged_port, int, 0644); +MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); + +static unsigned int wrq_sge = 2; +module_param(wrq_sge, uint, 0444); +MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); + +kib_tunables_t kiblnd_tunables = { + .kib_dev_failover = &dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_ib_mtu = &ib_mtu, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds, + .kib_wrq_sge = &wrq_sge, +}; + +static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; + +/* # messages/RDMAs in-flight */ +int +kiblnd_msg_queue_size(int version, struct lnet_ni *ni) +{ + if (version == IBLND_MSG_VERSION_1) + return IBLND_MSG_QUEUE_SIZE_V1; + else if (ni) + return ni->ni_net->net_tunables.lct_peer_tx_credits; + else + return peer_credits; +} + +int +kiblnd_tunables_setup(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + + /* + * if there was no tunables specified, setup the tunables to be + * defaulted + */ + if (!ni->ni_lnd_tunables_set) + memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib, + &default_tunables, sizeof(*tunables)); + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* Current API version */ + tunables->lnd_version = CURRENT_LND_VERSION; + + if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = peer_timeout; + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = credits; + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = peer_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = peer_buffer_credits; + + if (net_tunables->lct_peer_tx_credits < IBLND_CREDITS_DEFAULT) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_DEFAULT; + + if (net_tunables->lct_peer_tx_credits > IBLND_CREDITS_MAX) + net_tunables->lct_peer_tx_credits = IBLND_CREDITS_MAX; + + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + + if (!tunables->lnd_peercredits_hiw) + tunables->lnd_peercredits_hiw = peer_credits_hiw; + + if (tunables->lnd_peercredits_hiw < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) + tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; + + if (tunables->lnd_map_on_demand < IBLND_MIN_MAP_ON_DEMAND || + tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { + /* Use the default */ + CWARN("Invalid map_on_demand (%d), expects %d - %d. 
Using default of %d\n", + tunables->lnd_map_on_demand, IBLND_MIN_MAP_ON_DEMAND, + IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND); + tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; + } + + if (tunables->lnd_map_on_demand == 1) { + /* don't make sense to create map if only one fragment */ + tunables->lnd_map_on_demand = 2; + } + + if (tunables->lnd_concurrent_sends == 0) { + if (tunables->lnd_map_on_demand > 0 && + tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { + tunables->lnd_concurrent_sends = + net_tunables->lct_peer_tx_credits * 2; + } else { + tunables->lnd_concurrent_sends = + net_tunables->lct_peer_tx_credits; + } + } + + if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits / 2) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits / 2; + + if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits) { + CWARN("Concurrent sends %d is lower than message " + "queue size: %d, performance may drop slightly.\n", + tunables->lnd_concurrent_sends, + net_tunables->lct_peer_tx_credits); + } + + if (!tunables->lnd_fmr_pool_size) + tunables->lnd_fmr_pool_size = fmr_pool_size; + if (!tunables->lnd_fmr_flush_trigger) + tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; + if (!tunables->lnd_fmr_cache) + tunables->lnd_fmr_cache = fmr_cache; + if (!tunables->lnd_ntx) + tunables->lnd_ntx = ntx; + if (!tunables->lnd_conns_per_peer) { + tunables->lnd_conns_per_peer = (conns_per_peer) ? + conns_per_peer : 1; + } + + return 0; +} + +int +kiblnd_tunables_init(void) +{ + default_tunables.lnd_version = CURRENT_LND_VERSION; + default_tunables.lnd_peercredits_hiw = peer_credits_hiw, + default_tunables.lnd_map_on_demand = map_on_demand; + default_tunables.lnd_concurrent_sends = concurrent_sends; + default_tunables.lnd_fmr_pool_size = fmr_pool_size; + default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger; + default_tunables.lnd_fmr_cache = fmr_cache; + default_tunables.lnd_ntx = ntx; + default_tunables.lnd_conns_per_peer = conns_per_peer; + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 0000000000000..c4700c0713948 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2940 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include +#include "socklnd.h" + +static struct lnet_lnd the_ksocklnd; +ksock_nal_data_t ksocknal_data; + +static ksock_interface_t * +ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) +{ + ksock_net_t *net = ni->ni_data; + int i; + ksock_interface_t *iface; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_NUM_INTERFACES); + iface = &net->ksnn_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return iface; + } + + return NULL; +} + +static ksock_route_t * +ksocknal_create_route (__u32 ipaddr, int port) +{ + ksock_route_t *route; + + LIBCFS_ALLOC (route, sizeof (*route)); + if (route == NULL) + return (NULL); + + atomic_set (&route->ksnr_refcount, 1); + route->ksnr_peer = NULL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + route->ksnr_connected = 0; + route->ksnr_deleted = 0; + route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; + + return (route); +} + +void +ksocknal_destroy_route (ksock_route_t *route) +{ + LASSERT (atomic_read(&route->ksnr_refcount) == 0); + + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); + + LIBCFS_FREE (route, sizeof (*route)); +} + +static int +ksocknal_create_peer(ksock_peer_ni_t **peerp, struct lnet_ni *ni, + struct lnet_process_id id) +{ + int cpt = lnet_cpt_of_nid(id.nid, ni); + ksock_net_t *net = ni->ni_data; + ksock_peer_ni_t *peer_ni; + + LASSERT(id.nid != LNET_NID_ANY); + LASSERT(id.pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni)); + if (peer_ni == NULL) + return -ENOMEM; + + peer_ni->ksnp_ni = ni; + peer_ni->ksnp_id = id; + atomic_set(&peer_ni->ksnp_refcount, 1); /* 1 ref for caller */ + peer_ni->ksnp_closing = 0; + peer_ni->ksnp_accepting = 0; + peer_ni->ksnp_proto = NULL; + peer_ni->ksnp_last_alive = 0; + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + INIT_LIST_HEAD(&peer_ni->ksnp_conns); + INIT_LIST_HEAD(&peer_ni->ksnp_routes); + INIT_LIST_HEAD(&peer_ni->ksnp_tx_queue); + INIT_LIST_HEAD(&peer_ni->ksnp_zc_req_list); + spin_lock_init(&peer_ni->ksnp_lock); + + spin_lock_bh(&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh(&net->ksnn_lock); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + CERROR("Can't create peer_ni: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh(&net->ksnn_lock); + + *peerp = peer_ni; + return 0; +} + +void +ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni) +{ + ksock_net_t *net = peer_ni->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer_ni %s %p deleted\n", + libcfs_id2str(peer_ni->ksnp_id), peer_ni); + + LASSERT(atomic_read(&peer_ni->ksnp_refcount) == 0); + LASSERT(peer_ni->ksnp_accepting == 0); + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(list_empty(&peer_ni->ksnp_routes)); + LASSERT(list_empty(&peer_ni->ksnp_tx_queue)); + LASSERT(list_empty(&peer_ni->ksnp_zc_req_list)); + + LIBCFS_FREE(peer_ni, sizeof(*peer_ni)); + + /* NB a peer_ni's connections and routes keep a reference on their peer_ni + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer_ni has been cleaned up when its refcount drops to + * zero. 
*/ + spin_lock_bh(&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh(&net->ksnn_lock); +} + +ksock_peer_ni_t * +ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) +{ + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); + struct list_head *tmp; + ksock_peer_ni_t *peer_ni; + + list_for_each(tmp, peer_list) { + + peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); + + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (peer_ni->ksnp_id.nid != id.nid || + peer_ni->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d)\n", + peer_ni, libcfs_id2str(id), + atomic_read(&peer_ni->ksnp_refcount)); + return peer_ni; + } + return NULL; +} + +ksock_peer_ni_t * +ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) +{ + ksock_peer_ni_t *peer_ni; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (peer_ni); +} + +static void +ksocknal_unlink_peer_locked(ksock_peer_ni_t *peer_ni) +{ + int i; + __u32 ip; + ksock_interface_t *iface; + + for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { + LASSERT(i < LNET_NUM_INTERFACES); + ip = peer_ni->ksnp_passive_ips[i]; + + iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); + /* + * All IPs in peer_ni->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. + */ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer_ni=%p iface=%p ksni_nroutes=%d\n", + peer_ni, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(list_empty(&peer_ni->ksnp_routes)); + LASSERT(!peer_ni->ksnp_closing); + peer_ni->ksnp_closing = 1; + list_del(&peer_ni->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer_ni); +} + +static int +ksocknal_get_peer_info(struct lnet_ni *ni, int index, + struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + ksock_peer_ni_t *peer_ni; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (peer_ni->ksnp_n_passive_ips == 0 && + list_empty(&peer_ni->ksnp_routes)) { + if (index-- > 0) + continue; + + *id = peer_ni->ksnp_id; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *id = peer_ni->ksnp_id; + *myip = peer_ni->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + list_for_each(rtmp, &peer_ni->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + *id = peer_ni->ksnp_id; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; + } + } + } +out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static void +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t 
*conn) +{ + ksock_peer_ni_t *peer_ni = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; + + conn->ksnc_route = route; + ksocknal_route_addref(route); + + if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", + libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_myipaddr); + } else { + CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h " + "to %pI4h\n", libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr, + &route->ksnr_myipaddr, + &conn->ksnc_myipaddr); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1<<type); + route->ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_route_t *route2; + + LASSERT(!peer_ni->ksnp_closing); + LASSERT(route->ksnr_peer == NULL); + LASSERT(!route->ksnr_scheduled); + LASSERT(!route->ksnr_connecting); + LASSERT(route->ksnr_connected == 0); + + /* LASSERT(unique) */ + list_for_each(tmp, &peer_ni->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { + CERROR("Duplicate route %s %pI4h\n", + libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr); + LBUG(); + } + } + + route->ksnr_peer = peer_ni; + ksocknal_peer_addref(peer_ni); + /* peer_ni's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer_ni->ksnp_routes); + + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +static void +ksocknal_del_route_locked (ksock_route_t *route) +{ + ksock_peer_ni_t *peer_ni = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT(!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del(&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer_ni's ref */ + + if (list_empty(&peer_ni->ksnp_routes) && + list_empty(&peer_ni->ksnp_conns)) { + /* I've just removed the last route to a peer_ni with no active + * connections */ + ksocknal_unlink_peer_locked(peer_ni); + } +} + +int +ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, + int port) +{ + struct list_head *tmp; + ksock_peer_ni_t *peer_ni; + ksock_peer_ni_t *peer2; + ksock_route_t *route; + ksock_route_t *route2; + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) + return (-EINVAL); + + /* Have a brand new peer_ni ready... 
*/ + rc = ksocknal_create_peer(&peer_ni, ni, id); + if (rc != 0) + return rc; + + route = ksocknal_create_route (ipaddr, port); + if (route == NULL) { + ksocknal_peer_decref(peer_ni); + return (-ENOMEM); + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* peer_ni table takes my ref on peer_ni */ + list_add_tail(&peer_ni->ksnp_list, + ksocknal_nid2peerlist(id.nid)); + } + + route2 = NULL; + list_for_each(tmp, &peer_ni->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + if (route2 == NULL) { + ksocknal_add_route_locked(peer_ni, route); + route->ksnr_share_count++; + } else { + ksocknal_route_decref(route); + route2->ksnr_share_count++; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) +{ + ksock_conn_t *conn; + ksock_route_t *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; + + LASSERT(!peer_ni->ksnp_closing); + + /* Extra ref prevents peer_ni disappearing until I'm done with it */ + ksocknal_peer_addref(peer_ni); + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) + continue; + + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked(route); + } + + nshared = 0; + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked(route); + } + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } + } + + ksocknal_peer_decref(peer_ni); + /* NB peer_ni unlinks itself when last conn/route is removed */ +} + +static int +ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_ni_t *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) { + hi = (int)(ksocknal_nid2peerlist(id.nid) - + ksocknal_data.ksnd_peers); + lo = hi; + } else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + + if (peer_ni->ksnp_ni != ni) + continue; + + if (!((id.nid == LNET_NID_ANY || + peer_ni->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || + peer_ni->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer_ni); /* a ref for me... 
*/ + + ksocknal_del_peer_locked(peer_ni, ip); + + if (peer_ni->ksnp_closing && + !list_empty(&peer_ni->ksnp_tx_queue)) { + LASSERT(list_empty(&peer_ni->ksnp_conns)); + LASSERT(list_empty(&peer_ni->ksnp_routes)); + + list_splice_init(&peer_ni->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer_ni); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, -ENETDOWN); + + return rc; +} + +static ksock_conn_t * +ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) +{ + ksock_peer_ni_t *peer_ni; + struct list_head *ptmp; + ksock_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + + LASSERT(!peer_ni->ksnp_closing); + + if (peer_ni->ksnp_ni != ni) + continue; + + list_for_each(ctmp, &peer_ni->ksnp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data. \ + ksnd_global_lock); + return conn; + } + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static ksock_sched_t * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; + ksock_sched_t *sched; + int i; + + if (info->ksi_nthreads == 0) { + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_nthreads > 0) { + CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n", + cpt, info->ksi_cpt); + goto select_sched; + } + } + return NULL; + } + +select_sched: + sched = &info->ksi_scheds[0]; + /* + * NB: it's safe so far, but info->ksi_nthreads could be changed + * at runtime when we have dynamic LNet configuration, then we + * need to take care of this. + */ + for (i = 1; i < info->ksi_nthreads; i++) { + if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) + sched = &info->ksi_scheds[i]; + } + + return sched; +} + +static int +ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) +{ + ksock_net_t *net = ni->ni_data; + int i; + int nip; + + read_lock(&ksocknal_data.ksnd_global_lock); + + nip = net->ksnn_ninterfaces; + LASSERT(nip <= LNET_NUM_INTERFACES); + + /* + * Only offer interfaces for additional connections if I have + * more than one. + */ + if (nip < 2) { + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; + LASSERT(ipaddrs[i] != 0); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return nip; +} + +static int +ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = (ips[i] ^ iface->ksni_ipaddr); + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 
1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT (best >= 0); + return (best); +} + +static int +ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer_ni->ksnp_ni->ni_data; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_bh(global_lock); + + LASSERT(n_peerips <= LNET_NUM_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + MIN(n_peerips, net->ksnn_ninterfaces); + + for (i = 0; peer_ni->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer_ni IPs that match old interfaces, then choose new + * interfaces to match the remaining peer_ni IPS. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer_ni->ksnp_n_passive_ips) { + /* Old interface. */ + ip = peer_ni->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); + + /* peer_ni passive ips are kept up to date */ + LASSERT(best_iface != NULL); + } else { + /* choose a new interface */ + LASSERT (i == peer_ni->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer_ni->ksnp_n_passive_ips; k++) + if (peer_ni->ksnp_passive_ips[k] == ip) + break; + + if (k < peer_ni->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = (ip ^ peerips[k]); + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 
1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + LASSERT(best_iface != NULL); + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer_ni->ksnp_passive_ips[i] = ip; + peer_ni->ksnp_n_passive_ips = i+1; + } + + /* mark the best matching peer_ni IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer_ni IP addresses */ + memcpy(peerips, peer_ni->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_bh(global_lock); + + return (n_ips); +} + +static void +ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + ksock_route_t *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct lnet_ni *ni = peer_ni->ksnp_ni; + ksock_net_t *net = ni->ni_data; + struct list_head *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_bh(global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh(global_lock); + return; + } + + LASSERT(npeer_ipaddrs <= LNET_NUM_INTERFACES); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_bh(global_lock); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_bh(global_lock); + } + + if (peer_ni->ksnp_closing) { + /* peer_ni got closed under me */ + break; + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer_ni->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); + + /* Select interface to connect from */ + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer_ni->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 
1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer_ni, newroute); + newroute = NULL; + } + + write_unlock_bh(global_lock); + if (newroute != NULL) + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept(struct lnet_ni *ni, struct socket *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from " + "%pI4h: memory exhausted\n", &peer_ip); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr) +{ + ksock_route_t *route; + + list_for_each_entry(route, &peer_ni->ksnp_routes, ksnr_list) { + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; +} + +int +ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct lnet_process_id peerid; + struct list_head *tmp; + __u64 incarnation; + ksock_conn_t *conn; + ksock_conn_t *conn2; + ksock_peer_ni_t *peer_ni = NULL; + ksock_peer_ni_t *peer2; + ksock_sched_t *sched; + struct ksock_hello_msg *hello; + int cpt; + ksock_tx_t *tx; + ksock_tx_t *txtmp; + int rc; + int rc2; + int active; + char *warn = NULL; + + active = (route != NULL); + + LASSERT (active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set (&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set (&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_NUM_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs (conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer_ni's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. 
+ * Passive connections use the listener timeout since the peer_ni sends + * eagerly */ + + if (active) { + peer_ni = route->ksnr_peer; + LASSERT(ni == peer_ni->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer_ni->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer_ni->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello (ni, conn, peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer_ni */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT (rc == 0 || active); + LASSERT (conn->ksnc_proto != NULL); + LASSERT (peerid.nid != LNET_NID_ANY); + + cpt = lnet_cpt_of_nid(peerid.nid, ni); + + if (active) { + ksocknal_peer_addref(peer_ni); + write_lock_bh(global_lock); + } else { + rc = ksocknal_create_peer(&peer_ni, ni, peerid); + if (rc != 0) + goto failed_1; + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer_ni in the peer_ni + * table (which takes my ref) */ + list_add_tail(&peer_ni->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer_ni); + peer_ni = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer_ni); + peer_ni->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... */ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer_ni, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer_ni->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer_ni/route got closed under me */ + rc = -ESTALE; + warn = "peer_ni/route removed"; + goto failed_2; + } + + if (peer_ni->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer_ni + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + + peer_ni->ksnp_proto = conn->ksnc_proto; + peer_ni->ksnp_incarnation = incarnation; + } + + if (peer_ni->ksnp_proto != conn->ksnc_proto || + peer_ni->ksnp_incarnation != incarnation) { + /* peer_ni rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer_ni, 0, 0); + + peer_ni->ksnp_proto = NULL; + rc = ESTALE; + warn = peer_ni->ksnp_incarnation != incarnation ? 
+ "peer_ni rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type) + continue; + + /* Reply on a passive connection attempt so the peer_ni + * realises we're connected. */ + LASSERT (rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (active && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route %s %pI4h connected to %pI4h\n", + libcfs_id2str(peer_ni->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_ipaddr); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer_ni to match my own route entries so I don't + * continually create duplicate routes. */ + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + + conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ + peer_ni->ksnp_last_alive = cfs_time_current(); + peer_ni->ksnp_send_keepalive = 0; + peer_ni->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + if (!sched) { + CERROR("no schedulers available. node is unhealthy\n"); + goto failed_2; + } + /* + * The cpt might have changed if we ended up selecting a non cpt + * native scheduler. So use the scheduler's cpt instead. + */ + cpt = sched->kss_info->ksi_cpt; + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = cfs_time_current(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + smp_mb(); /* order with adding to peer_ni's conn list */ + + list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. */ + list_for_each_entry_safe(tx, txtmp, &peer_ni->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == + SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. 
+ */ + + CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d" + " incarnation:%lld sched[%d:%d]\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, + conn->ksnc_port, incarnation, cpt, + (int)(sched - &sched->kss_info->ksi_scheds[0])); + + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer_ni, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer_ni, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_NUM_INTERFACES])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer_ni->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. */ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + +failed_2: + if (!peer_ni->ksnp_closing && + list_empty(&peer_ni->ksnp_conns) && + list_empty(&peer_ni->ksnp_routes)) { + list_add(&zombies, &peer_ni->ksnp_tx_queue); + list_del_init(&peer_ni->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer_ni); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer_ni->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + /* + * If we get here without an error code, just use -EALREADY. + * Depending on how we got here, the error may be positive + * or negative. Normalize the value for ksocknal_txlist_done(). + */ + rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? -rc : rc)); + ksocknal_txlist_done(ni, &zombies, rc2); + ksocknal_peer_decref(peer_ni); + +failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, + kshm_ips[LNET_NUM_INTERFACES])); + + LIBCFS_FREE(conn, sizeof(*conn)); + +failed_0: + sock_release(sock); + return rc; +} + +void +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and queues the + * connection for the reaper to terminate. 
+	 * Caller holds ksnd_global_lock exclusively in irq context */
+	ksock_peer_ni_t *peer_ni = conn->ksnc_peer;
+	ksock_route_t *route;
+	ksock_conn_t *conn2;
+	struct list_head *tmp;
+
+	LASSERT(peer_ni->ksnp_error == 0);
+	LASSERT(!conn->ksnc_closing);
+	conn->ksnc_closing = 1;
+
+	/* ksnd_deathrow_conns takes over peer_ni's ref */
+	list_del(&conn->ksnc_list);
+
+	route = conn->ksnc_route;
+	if (route != NULL) {
+		/* dissociate conn from route... */
+		LASSERT(!route->ksnr_deleted);
+		LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+		conn2 = NULL;
+		list_for_each(tmp, &peer_ni->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_route == route &&
+			    conn2->ksnc_type == conn->ksnc_type)
+				break;
+
+			conn2 = NULL;
+		}
+		if (conn2 == NULL)
+			route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+		conn->ksnc_route = NULL;
+
+		ksocknal_route_decref(route); /* drop conn's ref on route */
+	}
+
+	if (list_empty(&peer_ni->ksnp_conns)) {
+		/* No more connections to this peer_ni */
+
+		if (!list_empty(&peer_ni->ksnp_tx_queue)) {
+			ksock_tx_t *tx;
+
+			LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+			/* throw them to the last connection...,
+			 * these TXs will be sent to /dev/null by scheduler */
+			list_for_each_entry(tx, &peer_ni->ksnp_tx_queue,
+					    tx_list)
+				ksocknal_tx_prep(conn, tx);
+
+			spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+			list_splice_init(&peer_ni->ksnp_tx_queue,
+					 &conn->ksnc_tx_queue);
+			spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+		}
+
+		/* renegotiate protocol version */
+		peer_ni->ksnp_proto = NULL;
+		/* stash last conn close reason */
+		peer_ni->ksnp_error = error;
+
+		if (list_empty(&peer_ni->ksnp_routes)) {
+			/* I've just closed last conn belonging to a
+			 * peer_ni with no routes to it */
+			ksocknal_unlink_peer_locked(peer_ni);
+		}
+	}
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list,
+		      &ksocknal_data.ksnd_deathrow_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_ni_t *peer_ni)
+{
+	int notify = 0;
+	cfs_time_t last_alive = 0;
+
+	/* There has been a connection failure or comms error; but I'll only
+	 * tell LNET I think the peer_ni is dead if it's to another kernel and
+	 * there are no connections or connection attempts in existence.
*/ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer_ni->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer_ni->ksnp_conns) && + peer_ni->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer_ni) == NULL) { + notify = 1; + last_alive = peer_ni->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer_ni->ksnp_ni, peer_ni->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(ksock_conn_t *conn) +{ + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer_ni->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer_ni->ksnp_error != 0) { + /* peer_ni's last conn closed in error */ + LASSERT(list_empty(&peer_ni->ksnp_conns)); + failed = 1; + peer_ni->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer_ni); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. 
*/ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn (ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn (ksock_conn_t *conn) +{ + cfs_time_t last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0); + LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0); + LASSERT (conn->ksnc_sock == NULL); + LASSERT (conn->ksnc_route == NULL); + LASSERT (!conn->ksnc_tx_scheduled); + LASSERT (!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d], " + "ip %pI4h:%d, with error, wanted: %d, left: %d, " + "last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize(conn->ksnc_cookie, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, " + "ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s, " + "ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, " + "ip %pI4h:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + break; + default: + LBUG (); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE (conn, sizeof (*conn)); +} + +int +ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int why) +{ + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked (conn, why); + } + } + + return (count); +} + +int +ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) +{ + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked (peer_ni, ipaddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (count); +} + +int +ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) +{ + ksock_peer_ni_t *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = 
(int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+	else {
+		lo = 0;
+		hi = ksocknal_data.ksnd_peer_hash_size - 1;
+	}
+
+	for (i = lo; i <= hi; i++) {
+		list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
+
+			peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list);
+
+			if (!((id.nid == LNET_NID_ANY || id.nid == peer_ni->ksnp_id.nid) &&
+			      (id.pid == LNET_PID_ANY || id.pid == peer_ni->ksnp_id.pid)))
+				continue;
+
+			count += ksocknal_close_peer_conns_locked (peer_ni, ipaddr, 0);
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* wildcards always succeed */
+	if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+		return (0);
+
+	return (count == 0 ? -ENOENT : 0);
+}
+
+void
+ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive)
+{
+	/* The router is telling me she's been notified of a change in
+	 * gateway state....
+	 */
+	struct lnet_process_id id = {
+		.nid = gw_nid,
+		.pid = LNET_PID_ANY,
+	};
+
+	CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+		alive ? "up" : "down");
+
+	if (!alive) {
+		/* If the gateway crashed, close all open connections... */
+		ksocknal_close_matching_conns (id, 0);
+		return;
+	}
+
+	/* ...otherwise do nothing. We can only establish new connections
+	 * if we have autoroutes, and these connect on demand. */
+}
+
+void
+ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+	int connect = 1;
+	cfs_time_t last_alive = 0;
+	cfs_time_t now = cfs_time_current();
+	ksock_peer_ni_t *peer_ni = NULL;
+	rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
+	struct lnet_process_id id = {
+		.nid = nid,
+		.pid = LNET_PID_LUSTRE,
+	};
+
+	read_lock(glock);
+
+	peer_ni = ksocknal_find_peer_locked(ni, id);
+	if (peer_ni != NULL) {
+		struct list_head *tmp;
+		ksock_conn_t *conn;
+		int bufnob;
+
+		list_for_each(tmp, &peer_ni->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+			bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+
+			if (bufnob < conn->ksnc_tx_bufnob) {
+				/* something got ACKed */
+				conn->ksnc_tx_deadline =
+					cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+				peer_ni->ksnp_last_alive = now;
+				conn->ksnc_tx_bufnob = bufnob;
+			}
+		}
+
+		last_alive = peer_ni->ksnp_last_alive;
+		if (ksocknal_find_connectable_route_locked(peer_ni) == NULL)
+			connect = 0;
+	}
+
+	read_unlock(glock);
+
+	if (last_alive != 0)
+		*when = last_alive;
+
+	CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago, connect %d\n",
+	       libcfs_nid2str(nid), peer_ni,
+	       last_alive ?
cfs_duration_sec(now - last_alive) : -1, + connect); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) + ksocknal_launch_all_connections_locked(peer_ni); + + write_unlock_bh(glock); + return; +} + +static void +ksocknal_push_peer (ksock_peer_ni_t *peer_ni) +{ + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each(tmp, &peer_ni->ksnp_conns) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (conn == NULL) + break; + + ksocknal_lib_push_conn (conn); + ksocknal_conn_decref(conn); + } +} + +static int +ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) +{ + struct list_head *start; + struct list_head *end; + struct list_head *tmp; + int rc = -ENOENT; + unsigned int hsize = ksocknal_data.ksnd_peer_hash_size; + + if (id.nid == LNET_NID_ANY) { + start = &ksocknal_data.ksnd_peers[0]; + end = &ksocknal_data.ksnd_peers[hsize - 1]; + } else { + start = end = ksocknal_nid2peerlist(id.nid); + } + + for (tmp = start; tmp <= end; tmp++) { + int peer_off; /* searching offset in peer_ni hash table */ + + for (peer_off = 0; ; peer_off++) { + ksock_peer_ni_t *peer_ni; + int i = 0; + + read_lock(&ksocknal_data.ksnd_global_lock); + list_for_each_entry(peer_ni, tmp, ksnp_list) { + if (!((id.nid == LNET_NID_ANY || + id.nid == peer_ni->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer_ni->ksnp_id.pid))) + continue; + + if (i++ == peer_off) { + ksocknal_peer_addref(peer_ni); + break; + } + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (i == 0) /* no match */ + break; + + rc = 0; + ksocknal_push_peer(peer_ni); + ksocknal_peer_decref(peer_ni); + } + } + return rc; +} + +static int +ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) +{ + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + ksock_peer_ni_t *peer_ni; + struct list_head *rtmp; + ksock_route_t *route; + + if (ipaddress == 0 || + netmask == 0) + return -EINVAL; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + iface = ksocknal_ip2iface(ni, ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (net->ksnn_ninterfaces == LNET_NUM_INTERFACES) { + rc = -ENOSPC; + } else { + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(ptmp, ksock_peer_ni_t, + ksnp_list); + + for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) + if (peer_ni->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer_ni->ksnp_routes) { + route = list_entry(rtmp, + ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! 
*/ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static void +ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; + + for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) + if (peer_ni->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer_ni->ksnp_n_passive_ips; j++) + peer_ni->ksnp_passive_ips[j-1] = + peer_ni->ksnp_passive_ips[j]; + peer_ni->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked (conn, 0); + } +} + +static int +ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) +{ + ksock_net_t *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_ni_t *peer_ni; + __u32 this_ip; + int i; + int j; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer_ni = list_entry(tmp, ksock_peer_ni_t, + ksnp_list); + + if (peer_ni->ksnp_ni != ni) + continue; + + ksocknal_peer_del_interface_locked(peer_ni, this_ip); + } + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +int +ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) +{ + struct lnet_process_id id = {0}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch(cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interfaces[data->ioc_count]; + + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ + + case IOC_LIBCFS_DEL_INTERFACE: + return ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: + id.nid = 
data->ioc_nid; + id.pid = LNET_PID_LUSTRE; + return ksocknal_add_peer (ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer (ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns (id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); + + default: + return -EINVAL; + } + /* not reached */ +} + +static void +ksocknal_free_buffers (void) +{ + LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_sched_info != NULL) { + struct ksock_sched_info *info; + int i; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds != NULL) { + LIBCFS_FREE(info->ksi_scheds, + info->ksi_nthreads_max * + sizeof(info->ksi_scheds[0])); + } + } + cfs_percpt_free(ksocknal_data.ksnd_sched_info); + } + + LIBCFS_FREE (ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + ksock_tx_t *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + int i; + int j; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT (ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); + } + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + 
ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + + sched = &info->ksi_scheds[j]; + LASSERT(list_empty(&sched->\ + kss_tx_conns)); + LASSERT(list_empty(&sched->\ + kss_rx_conns)); + LASSERT(list_empty(&sched-> \ + kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up_all(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + sched = &info->ksi_scheds[j]; + wake_up_all(&sched->kss_waitq); + } + } + } + + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + read_lock(&ksocknal_data.ksnd_global_lock); + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + + module_put(THIS_MODULE); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched_info *info; + int rc; + int i; + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + LIBCFS_ALLOC(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + return -ENOMEM; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + try_module_get(THIS_MODULE); + + ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*info)); + if (ksocknal_data.ksnd_sched_info == NULL) + goto failed; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + ksock_sched_t *sched; + int nthrs; + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* max to half of CPUs, assume another half should be + * reserved for upper layer modules */ + nthrs = 
min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + info->ksi_nthreads_max = nthrs; + info->ksi_cpt = i; + + if (nthrs != 0) { + LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, + info->ksi_nthreads_max * + sizeof(*sched)); + if (info->ksi_scheds == NULL) + goto failed; + + for (; nthrs > 0; nthrs--) { + sched = &info->ksi_scheds[nthrs - 1]; + + sched->kss_info = info; + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + } + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; + ksocknal_data.ksnd_connd_starting_stamp = cfs_time_current_sec(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + char name[16]; + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + + snprintf(name, sizeof(name), "socknal_cd%02d", i); + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((uintptr_t)i), name); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR ("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +static void +ksocknal_debug_peerhash(struct lnet_ni *ni) +{ + ksock_peer_ni_t *peer_ni = NULL; + struct list_head *tmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); + + if (peer_ni->ksnp_ni == ni) break; + + peer_ni = NULL; + } + } + + if (peer_ni != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN ("Active peer_ni on shutdown: %s, ref %d, scnt %d, " + "closing %d, accepting %d, err %d, zcookie %llu, " + "txq %d, zc_req %d\n", libcfs_id2str(peer_ni->ksnp_id), + atomic_read(&peer_ni->ksnp_refcount), + peer_ni->ksnp_sharecount, peer_ni->ksnp_closing, + peer_ni->ksnp_accepting, peer_ni->ksnp_error, + peer_ni->ksnp_zc_next_cookie, + !list_empty(&peer_ni->ksnp_tx_queue), + !list_empty(&peer_ni->ksnp_zc_req_list)); + + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " + "del %d\n", atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each(tmp, &peer_ni->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + 
read_unlock(&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown(struct lnet_ni *ni) +{ + ksock_net_t *net = ni->ni_data; + struct lnet_process_id anyid = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY, + }; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + spin_lock_bh(&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh(&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer_ni state to clean up */ + i = 2; + spin_lock_bh(&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + spin_unlock_bh(&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + spin_lock_bh(&net->ksnn_lock); + } + spin_unlock_bh(&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = lnet_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = lnet_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_NUM_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + strlcpy(net->ksnn_interfaces[j].ksni_name, + names[i], sizeof(net->ksnn_interfaces[j].ksni_name)); + j++; + } + + lnet_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +static int +ksocknal_search_new_ipif(ksock_net_t *net) +{ + int new_ipif = 0; + int i; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + ksock_net_t *tmp; + int j; + + if (colon != NULL) /* ignore alias device */ + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, + ksnn_list) { + for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { + char *ifnam2 = &tmp->ksnn_interfaces[j].\ + ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + if (found) + break; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + } + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched_info *info) +{ + int nthrs; + int rc = 0; + int i; + + if (info->ksi_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = info->ksi_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + info->ksi_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); 
+ nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, info->ksi_nthreads_max); + } else { + LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + ksock_sched_t *sched; + id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + snprintf(name, sizeof(name), "socknal_sd%02d_%02d", + info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + + rc = ksocknal_thread_start(ksocknal_scheduler, + (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + info->ksi_cpt, info->ksi_nthreads + i, rc); + break; + } + + info->ksi_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) + return -EINVAL; + + for (i = 0; i < ncpts; i++) { + struct ksock_sched_info *info; + int cpt = (cpts == NULL) ? i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + info = ksocknal_data.ksnd_sched_info[cpt]; + + if (!newif && info->ksi_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(info); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup(struct lnet_ni *ni) +{ + ksock_net_t *net; + int rc; + int i; + struct net_device *net_dev; + int node_id; + + LASSERT (ni->ni_net->net_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ktime_get_real_ns(); + ni->ni_data = net; + if (!ni->ni_net->net_tunables_set) { + ni->ni_net->net_tunables.lct_peer_timeout = + *ksocknal_tunables.ksnd_peertimeout; + ni->ni_net->net_tunables.lct_max_tx_credits = + *ksocknal_tunables.ksnd_credits; + ni->ni_net->net_tunables.lct_peer_tx_credits = + *ksocknal_tunables.ksnd_peertxcredits; + ni->ni_net->net_tunables.lct_peer_rtr_credits = + *ksocknal_tunables.ksnd_peerrtrcredits; + ni->ni_net->net_tunables_set = true; + } + + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; + } else { + for (i = 0; i < LNET_NUM_INTERFACES; i++) { + int up; + + if (ni->ni_interfaces[i] == NULL) + break; + + rc = lnet_ipif_query(ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } + + strlcpy(net->ksnn_interfaces[i].ksni_name, + ni->ni_interfaces[i], + sizeof(net->ksnn_interfaces[i].ksni_name)); + + } + net->ksnn_ninterfaces = i; + } + + net_dev = dev_get_by_name(&init_net, + net->ksnn_interfaces[0].ksni_name); + if (net_dev != NULL) { + node_id = dev_to_node(&net_dev->dev); + ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + dev_put(net_dev); + } else { + ni->ni_dev_cpt = CFS_CPT_ANY; + } + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + ni->ni_nid 
= LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + + ksocknal_data.ksnd_nnets++; + + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + + +static void __exit ksocklnd_exit(void) +{ + lnet_unregister_lnd(&the_ksocklnd); +} + +static int __init ksocklnd_init(void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + CLASSERT(SOCKLND_CONN_NTYPES <= 4); + CLASSERT(SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("TCP Socket LNet Network Driver"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(ksocklnd_init); +module_exit(ksocklnd_exit); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 0000000000000..4668fc162ba34 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2016, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#ifndef _SOCKLND_SOCKLND_H_ +#define _SOCKLND_SOCKLND_H_ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef HAVE_TCP_SENDPAGE_USE_SOCKET +# define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage((sk)->sk_socket, page, offset, size, flags) +#else /* !HAVE_TCP_SENDPAGE_USE_SOCKET */ +# define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage(sk, page, offset, size, flags) +#endif /* HAVE_TCP_SENDPAGE_USE_SOCKET */ + +#ifndef NETIF_F_CSUM_MASK +# define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM +#endif + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer_ni lists */ +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. */ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +struct ksock_sched_info; + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + /* conn waiting to be written */ + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + /* # connections assigned to this scheduler */ + int kss_nconns; + struct ksock_sched_info *kss_info; /* owner of it */ +#if !SOCKNAL_SINGLE_FRAG_RX + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; +#endif +#if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX + struct kvec kss_scratch_iov[LNET_MAX_IOV]; +#endif +} ksock_sched_t; + +struct ksock_sched_info { + int ksi_nthreads_max; /* max allowed threads */ + int ksi_nthreads; /* number of threads */ + int ksi_cpt; /* CPT id */ + ksock_sched_t *ksi_scheds; /* array of schedulers */ +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +typedef struct /* in-use interface */ +{ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +} ksock_interface_t; + +typedef struct +{ + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... 
*/ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer_ni */ + int *ksnd_peerrtrcredits; /* # per-peer_ni router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer_ni dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ +#ifdef CPU_AFFINITY + int *ksnd_irq_affinity; /* enable IRQ affinity? */ +#endif +#ifdef SOCKNAL_BACKOFF + int *ksnd_backoff_init; /* initial TCP backoff */ + int *ksnd_backoff_max; /* maximum TCP backoff */ +#endif +#if SOCKNAL_VERSION_DEBUG + int *ksnd_protocol; /* protocol version */ +#endif +} ksock_tunables_t; + +typedef struct +{ + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + struct list_head ksnn_list; /* chain on global list */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? 
*/ + int ksnn_ninterfaces; /* IP interfaces */ + ksock_interface_t ksnn_interfaces[LNET_NUM_INTERFACES]; +} ksock_net_t; + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +typedef struct +{ + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer_ni/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + struct list_head *ksnd_peers; + int ksnd_peer_hash_size; /* size of ksnd_peers */ + + int ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched_info **ksnd_sched_info; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + /* conns to close: reaper_lock*/ + struct list_head ksnd_deathrow_conns; + /* conns to free: reaper_lock */ + struct list_head ksnd_zombie_conns; + /* conns to retry: reaper_lock*/ + struct list_head ksnd_enomem_conns; + /* reaper sleeps here */ + wait_queue_head_t ksnd_reaper_waitq; + /* when reaper will wake */ + cfs_time_t ksnd_reaper_waketime; + /* serialise */ + spinlock_t ksnd_reaper_lock; + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + /* incoming connection requests */ + struct list_head ksnd_connd_connreqs; + /* routes waiting to be connected */ + struct list_head ksnd_connd_routes; + /* connds sleep here */ + wait_queue_head_t ksnd_connd_waitq; + /* # connds connecting */ + int ksnd_connd_connecting; + /** time stamp of the last failed connecting attempt */ + long ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + long ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + /* serialise */ + spinlock_t ksnd_connd_lock; + + /* list head for freed noop tx */ + struct list_head ksnd_idle_noop_txs; + /* serialise, g_lock unsafe */ + spinlock_t ksnd_tx_lock; + +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 or more + * struct kvec fragments (the first frag contains the portals header), + * followed by 0 or more lnet_kiov_t fragments. + * + * On the receive side, initially 1 struct kvec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into either struct kvec or lnet_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ + +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ +struct ksock_proto; /* forward ref */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ + atomic_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet kvec frags */ + struct kvec *tx_iov; /* packet kvec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? 
*/ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + lnet_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ + cfs_time_t tx_deadline; /* when (in jiffies) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + union { + struct { + struct kvec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct kvec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; +} ksock_tx_t; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) + +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +typedef union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +typedef struct ksock_conn +{ + struct ksock_peer *ksnc_peer; /* owning peer_ni */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer_ni's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ + __u32 ksnc_ipaddr; /* peer_ni's IP */ + int ksnc_port; /* peer_ni's port */ + signed int ksnc_type:3; /* type of connection, + * should be signed value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* READER */ + + /* where I enq waiting input or a forwarding descriptor */ + struct list_head ksnc_rx_list; + cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # kvec frags */ + struct kvec *ksnc_rx_iov; /* the kvec frags */ + int ksnc_rx_nkiov; /* # page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + struct ksock_msg ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * struct lnet_hdr, it's stored + * in ksnc_msg.ksm_u.lnetmsg + */ + /* -- WRITER -- */ + /* where I enq waiting for output space */ + struct list_head 
ksnc_tx_list; + /* packets waiting to be sent */ + struct list_head ksnc_tx_queue; + /* next TX that can carry a LNet message or ZC-ACK */ + ksock_tx_t *ksnc_tx_carrier; + /* when (in jiffies) tx times out */ + cfs_time_t ksnc_tx_deadline; + /* send buffer marker */ + int ksnc_tx_bufnob; + /* # bytes queued */ + atomic_t ksnc_tx_nob; + /* write space */ + int ksnc_tx_ready; + /* being progressed */ + int ksnc_tx_scheduled; + /* time stamp of the last posted TX */ + cfs_time_t ksnc_tx_last_post; +} ksock_conn_t; + +typedef struct ksock_route +{ + struct list_head ksnr_list; /* chain on peer_ni route list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ + struct ksock_peer *ksnr_peer; /* owning peer_ni */ + atomic_t ksnr_refcount; /* # users */ + cfs_time_t ksnr_timeout; /* when (in jiffies) reconnection can happen next */ + cfs_duration_t ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ + int ksnr_port; /* port to connect to */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection establishment in progress */ + unsigned int ksnr_connected:4; /* connections established by type */ + unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ + unsigned int ksnr_share_count; /* created explicitly? */ + int ksnr_conn_count; /* # conns established by this route */ +} ksock_route_t; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +typedef struct ksock_peer +{ + struct list_head ksnp_list; /* stash on global peer_ni list */ + cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ + struct lnet_process_id ksnp_id; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ + int ksnp_closing; /* being closed */ + int ksnp_accepting;/* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer_ni incarnation */ + struct ksock_proto *ksnp_proto; /* latest known peer_ni protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + /* zero copy requests wait for ACK */ + struct list_head ksnp_zc_req_list; + cfs_time_t ksnp_send_keepalive; /* time to send keepalive */ + struct lnet_ni *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... 
*/ + __u32 ksnp_passive_ips[LNET_NUM_INTERFACES]; /* preferred local interfaces */ +} ksock_peer_ni_t; + +typedef struct ksock_connreq +{ + /* stash on ksnd_connd_connreqs */ + struct list_head ksncr_list; + /* chosen NI */ + struct lnet_ni *ksncr_ni; + /* accepted socket */ + struct socket *ksncr_sock; +} ksock_connreq_t; + +extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +typedef struct ksock_proto +{ + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(ksock_conn_t *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ + void (*pro_unpack)(struct ksock_msg *); /* message unpack */ + ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +} ksock_proto_t; + +extern ksock_proto_t ksocknal_protocol_v1x; +extern ksock_proto_t ksocknal_protocol_v2x; +extern ksock_proto_t ksocknal_protocol_v3x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + + return crc; +#endif +} + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + +static inline struct list_head * +ksocknal_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return (&ksocknal_data.ksnd_peers [hash]); +} + +static inline void +ksocknal_conn_addref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn); +extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); + +static inline void +ksocknal_conn_decref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref (ksock_conn_t *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +static inline void 
+ksocknal_connsock_decref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT (conn->ksnc_closing); + sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } +} + +static inline void +ksocknal_tx_addref (ksock_tx_t *tx) +{ + LASSERT (atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int error); + +static inline void +ksocknal_tx_decref (ksock_tx_t *tx) +{ + LASSERT (atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx, 0); +} + +static inline void +ksocknal_route_addref (ksock_route_t *route) +{ + LASSERT (atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route (ksock_route_t *route); + +static inline void +ksocknal_route_decref (ksock_route_t *route) +{ + LASSERT (atomic_read (&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route (route); +} + +static inline void +ksocknal_peer_addref (ksock_peer_ni_t *peer_ni) +{ + LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); + atomic_inc(&peer_ni->ksnp_refcount); +} + +extern void ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni); + +static inline void +ksocknal_peer_decref (ksock_peer_ni_t *peer_ni) +{ + LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer_ni->ksnp_refcount)) + ksocknal_destroy_peer (peer_ni); +} + +int ksocknal_startup(struct lnet_ni *ni); +void ksocknal_shutdown(struct lnet_ni *ni); +int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg); +int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); + +int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, + int port); +ksock_peer_ni_t *ksocknal_find_peer_locked(struct lnet_ni *ni, + struct lnet_process_id id); +ksock_peer_ni_t *ksocknal_find_peer(struct lnet_ni *ni, + struct lnet_process_id id); +extern void ksocknal_peer_failed (ksock_peer_ni_t *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); +extern void ksocknal_terminate_conn (ksock_conn_t *conn); +extern void ksocknal_destroy_conn (ksock_conn_t *conn); +extern int ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); +int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); +extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, + ksock_tx_t *tx, int nonblk); + +extern int ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, + struct lnet_process_id id); +extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx (ksock_tx_t *tx); +extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); +extern void ksocknal_queue_tx_locked 
(ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, + int error); +extern void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); +extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); +extern void ksocknal_thread_fini (void); +extern void ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni); +extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni); +extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_connd (void *arg); +extern int ksocknal_reaper (void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, struct ksock_hello_msg *hello); +int ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, + struct ksock_hello_msg *hello, + struct lnet_process_id *id, + __u64 *incarnation); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); + +extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, + ksock_conn_t *conn); +extern void ksocknal_lib_push_conn(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); +extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); +extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); +extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); +extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); + +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); + +extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_bind_thread_to_cpu(int id); + +#endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 0000000000000..8892aad0403dd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2658 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +ksock_tx_t * +ksocknal_alloc_tx(int type, int size) +{ + ksock_tx_t *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ + next, ksock_tx_t, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +ksock_tx_t * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + ksock_tx_t *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_NOOP; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx (ksock_tx_t *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct kvec *iov = tx->tx_iov; + int nob; + int rc; + + LASSERT (tx->tx_niov > 0); + + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return (rc); + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + do { + LASSERT (tx->tx_niov > 0); + + if (nob < (int) iov->iov_len) { + iov->iov_base += nob; + iov->iov_len -= nob; + return (rc); + } + + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); + + return (rc); +} + +static int +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT (tx->tx_niov == 0); + LASSERT (tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx); + + if (rc <= 0) /* sent nothing? 
*/ + return (rc); + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } + + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return (rc); +} + +static int +ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + } + + LASSERT(tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return (-ESHUTDOWN); + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov (conn, tx); + } else { + rc = ksocknal_send_kiov (conn, tx); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? */ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + smp_mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + + if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return (rc); +} + +static int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT (conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn); + + if (rc <= 0) + return (rc); + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return (-EAGAIN); + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return (rc); +} + +static int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT (conn->ksnc_rx_nkiov > 0); + + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn); + + if (rc <= 0) + return (rc); + + /* received something... 
*/ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + smp_mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } + + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive (ksock_conn_t *conn) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; + + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + } + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return (-ESHUTDOWN); + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN (rc); +} + +void +ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) +{ + struct lnet_msg *lnetmsg = tx->tx_lnetmsg; + ENTRY; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) + rc = -EIO; + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + ksocknal_free_tx(tx); + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + lnet_finalize(lnetmsg, rc); + + EXIT; +} + +void +ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) +{ + ksock_tx_t *tx; + + while (!list_empty(txlist)) { + tx = list_entry(txlist->next, ksock_tx_t, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del(&tx->tx_list); + + LASSERT(atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done(ni, tx, error); + } +} + +static void +ksocknal_check_zc_req(ksock_tx_t *tx) +{ + ksock_conn_t *conn = tx->tx_conn; + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some fragment of this message should be sent + * zero-copy. Our peer_ni will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. 
See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer_ni->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer_ni */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer_ni->ksnp_zc_next_cookie++; + + if (peer_ni->ksnp_zc_next_cookie == 0) + peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer_ni->ksnp_zc_req_list); + + spin_unlock(&peer_ni->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(ksock_tx_t *tx) +{ + ksock_peer_ni_t *peer_ni = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer_ni->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer_ni->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer_ni->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit (conn, tx); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT (rc == 0); + + return (0); + } + + if (rc == -EAGAIN) + return (rc); + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... */ + LASSERT (conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), + SOCKNAL_ENOMEM_RETRY), + ksocknal_data.ksnd_reaper_waketime)) + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + return (rc); + } + + /* Actual error */ + LASSERT (rc < 0); + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pI4h reset our connection " + "while we were sending data; it may have " + "rebooted.\n", + &conn->ksnc_ipaddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error " + "while writing to %pI4h: %d.\n", + &conn->ksnc_ipaddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", + conn, rc, libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 
0 : rc); + + return (rc); +} + +static void +ksocknal_launch_connection_locked (ksock_route_t *route) +{ + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni) +{ + ksock_route_t *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer_ni); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + +ksock_conn_t * +ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_conn_t *typed = NULL; + ksock_conn_t *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each(tmp, &peer_ni->ksnp_conns) { + ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = cfs_time_current(); + + return conn; +} + +void +ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +{ + ksock_sched_t *sched = conn->ksnc_scheduler; + struct ksock_msg *msg = &tx->tx_msg; + ksock_tx_t *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. 
+ * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. */ + LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) + + lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == + (unsigned int)tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_resid == tx->tx_nob); + + CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type: + KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + spin_lock_bh(&sched->kss_lock); + + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = 0; + smp_mb(); /* order with adding to tx_queue */ + } + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT (msg->ksm_zc_cookies[1] != 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) + ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? */ + LASSERT (msg->ksm_zc_cookies[1] == 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL); + + ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); + /* ztx will be released later */ + } + + if (ztx != NULL) { + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + } + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + ksocknal_conn_addref(conn); + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + + +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) +{ + cfs_time_t now = cfs_time_current(); + struct list_head *tmp; + ksock_route_t *route; + + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) /* connections being established */ + continue; + + /* all route types connected ? 
*/ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) + continue; + + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %pI4h " + "(cnted %d, interval %ld, %ld secs later)\n", + &route->ksnr_ipaddr, + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); + continue; + } + + return (route); + } + + return (NULL); +} + +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each(tmp, &peer_ni->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) + return (route); + } + + return (NULL); +} + +int +ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, + struct lnet_process_id id) +{ + ksock_peer_ni_t *peer_ni; + ksock_conn_t *conn; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + if (ksocknal_find_connectable_route_locked(peer_ni) == NULL) { + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock(g_lock); + return (0); + } + } + } + + /* I'll need a write lock... */ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to " + "userspace process %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer_ni %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer_ni %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer_ni); + + conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return (0); + } + + if (peer_ni->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer_ni) != NULL) { + /* the message is going to be pinned to the peer_ni */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + /* Queue the message until a connection is established */ + list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + return (-EHOSTUNREACH); +} + +int +ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + int mpflag = 1; + int type = lntmsg->msg_type; + struct lnet_process_id target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + ksock_tx_t *tx; + int 
desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... */ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt ()); + + if (payload_iov != NULL) + desc_size = offsetof(ksock_tx_t, + tx_frags.virt.iov[1 + payload_niov]); + else + desc_size = offsetof(ksock_tx_t, + tx_frags.paged.kiov[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return (-ENOMEM); + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + if (payload_iov != NULL) { + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); + } else { + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + } + + tx->tx_msg.ksm_csum = 0; + tx->tx_msg.ksm_type = KSOCK_MSG_LNET; + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_msg.ksm_zc_cookies[1] = 0; + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (!mpflag) + cfs_memory_pressure_restore(mpflag); + + if (rc == 0) + return (0); + + ksocknal_free_tx(tx); + return (-EIO); +} + +int +ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads++; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_thread_fini (void) +{ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads--; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... 
*/ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + smp_mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = offsetof(struct ksock_msg, ksm_u); + conn->ksnc_rx_nob_left = offsetof(struct ksock_msg, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare struct lnet_hdr */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct lnet_hdr); + conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(struct lnet_hdr); + break; + + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return (1); + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct kvec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +static int +ksocknal_process_receive (ksock_conn_t *conn) +{ + struct lnet_hdr *lhdr; + struct lnet_process_id *id; + int rc; + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn); + + if (rc <= 0) { + struct lnet_process_id ksnp_id; + + ksnp_id = conn->ksnc_peer->ksnp_id; + + LASSERT(rc != -EAGAIN); + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s " + "ip %pI4h:%d\n", conn, + libcfs_id2str(ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s " + "ip %pI4h:%d\n", conn, rc, + libcfs_id2str(ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return (-EAGAIN); + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { + CERROR("%s: Unknown message type: %x\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EPROTO); + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EIO); + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (rc); + } + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(struct ksock_lnet_msg); + conn->ksnc_rx_nob_left = sizeof(struct ksock_lnet_msg); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(struct ksock_lnet_msg); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer_ni */ + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); + return (-EPROTO); + } + + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 
0; + + if (conn->ksnc_rx_nob_left == 0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq(conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); + } + + lnet_finalize(conn->ksnc_cookie, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return (-EINVAL); /* keep gcc happy */ +} + +int +ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int niov, struct kvec *iov, + lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up(&sched->kss_waitq); + LASSERT(conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(ksock_sched_t *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = (!ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns)); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + int nloops = 0; + long id = (long)arg; + + info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + + cfs_block_allsigs(); + + rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + if (rc != 0) { + CWARN("Can't set CPU partition affinity to %d: %d\n", + info->ksi_cpt, rc); + } + + 
spin_lock_bh(&sched->kss_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty(&sched->kss_rx_conns)) { + conn = list_entry(sched->kss_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + + if (!list_empty(&sched->kss_tx_conns)) { + struct list_head zlist = LIST_HEAD_INIT(zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, + &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + + conn = list_entry(sched->kss_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del(&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_entry(conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && + !list_empty(&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_bh(&sched->kss_lock); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + ENTRY; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); + + EXIT; +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback(ksock_conn_t *conn) +{ + ksock_sched_t *sched; + ENTRY; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + EXIT; +} + +static ksock_proto_t * +ksocknal_parse_proto_version (struct ksock_hello_msg *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version != 0) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + struct lnet_magicversion *hmv; + + CLASSERT(sizeof(struct lnet_magicversion) == + offsetof(struct ksock_hello_msg, kshm_src_nid)); + + hmv = (struct lnet_magicversion *)hello; + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, struct ksock_hello_msg *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + ksock_net_t *net = (ksock_net_t *)ni->ni_data; + + LASSERT(hello->kshm_nips <= LNET_NUM_INTERFACES); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT(conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +static int +ksocknal_invert_type(int type) +{ + switch (type) + { + case SOCKLND_CONN_ANY: + case SOCKLND_CONN_CONTROL: + return (type); + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; 
+ case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + return (SOCKLND_CONN_NONE); + } +} + +int +ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, + struct ksock_hello_msg *hello, + struct lnet_process_id *peerid, + __u64 *incarnation) +{ + /* Return < 0 fatal error + * 0 success + * EALREADY lost connection race + * EPROTO protocol version mismatch + */ + struct socket *sock = conn->ksnc_sock; + int active = (conn->ksnc_proto != NULL); + int timeout; + int proto_match; + int rc; + ksock_proto_t *proto; + struct lnet_process_id recv_id; + + /* socket type set on active connections - not set on passive */ + LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); + + timeout = active ? *ksocknal_tunables.ksnd_timeout : + lnet_acceptor_timeout(); + + rc = lnet_sock_read(sock, &hello->kshm_magic, + sizeof(hello->kshm_magic), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + CERROR ("Bad magic(1) %#08x (%#08x expected) from " + "%pI4h\n", __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, &conn->ksnc_ipaddr); + return -EPROTO; + } + + rc = lnet_sock_read(sock, &hello->kshm_version, + sizeof(hello->kshm_version), timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0); + return rc; + } + + proto = ksocknal_parse_proto_version(hello); + if (proto == NULL) { + if (!active) { + /* unknown protocol from peer_ni, tell peer_ni my protocol */ + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, ni->ni_nid, hello); + } + + CERROR("Unknown protocol version (%d.x expected) from %pI4h\n", + conn->ksnc_proto->pro_version, &conn->ksnc_ipaddr); + + return -EPROTO; + } + + proto_match = (conn->ksnc_proto == proto); + conn->ksnc_proto = proto; + + /* receive the rest of hello message anyway */ + rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); + if (rc != 0) { + CERROR("Error %d reading or checking hello from from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT (rc < 0); + return rc; + } + + *incarnation = hello->kshm_src_incarnation; + + if (hello->kshm_src_nid == LNET_NID_ANY) { + CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY" + "from %pI4h\n", &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (!active && + conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + /* Userspace NAL assigns peer_ni process ID from socket */ + recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG; + recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr); + } else { + recv_id.nid = hello->kshm_src_nid; + recv_id.pid = hello->kshm_src_pid; + } + + if (!active) { + *peerid = recv_id; + + /* peer_ni determines type */ + conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); + if (conn->ksnc_type == SOCKLND_CONN_NONE) { + CERROR("Unexpected type %d from %s ip %pI4h\n", + hello->kshm_ctype, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr); + return -EPROTO; + } + return 0; + } + + if (peerid->pid != recv_id.pid || + peerid->nid != recv_id.nid) { + 
LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host" + " %pI4h, but they claimed they were " + "%s; please check your Lustre " + "configuration.\n", + libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + libcfs_id2str(recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", + conn->ksnc_type, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + hello->kshm_ctype); + return -EPROTO; + } + return 0; +} + +static int +ksocknal_connect (ksock_route_t *route) +{ + struct list_head zombies = LIST_HEAD_INIT(zombies); + ksock_peer_ni_t *peer_ni = route->ksnr_peer; + int type; + int wanted; + struct socket *sock; + cfs_time_t deadline; + int retry_later = 0; + int rc = 0; + + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_route_mask() & ~route->ksnr_connected; + + /* stop connecting if peer_ni/route got closed under me, or + * route got connected while queued */ + if (peer_ni->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer_ni is connecting to me */ + if (peer_ni->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer_ni %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer_ni->ksnp_id.nid), peer_ni->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (cfs_time_aftereq(cfs_time_current(), deadline)) { + rc = -ETIMEDOUT; + lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port); + if (rc != 0) + goto failed; + + rc = ksocknal_create_conn(peer_ni->ksnp_ni, route, sock, type); + if (rc < 0) { + lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n", + libcfs_nid2str(peer_ni->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer_ni's incoming connection request */ + + if (rc == EALREADY || + (rc == 0 && peer_ni->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, + * but the race is resolved quickly usually, + * so min_reconnectms should be good heuristic */ + route->ksnr_retry_interval = + 
cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + } + + ksocknal_launch_connection_locked(route); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return retry_later; + + failed: + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + /* This is a retry rather than a new connection */ + route->ksnr_retry_interval *= 2; + route->ksnr_retry_interval = + MAX(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + route->ksnr_retry_interval = + MIN(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + + if (!list_empty(&peer_ni->ksnp_tx_queue) && + peer_ni->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer_ni) == NULL) { + ksock_conn_t *conn; + + /* ksnp_tx_queue is queued on a conn on successful + * connection for V1.x and V2.x */ + if (!list_empty(&peer_ni->ksnp_conns)) { + conn = list_entry(peer_ni->ksnp_conns.next, + ksock_conn_t, ksnc_list); + LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); + } + + /* take all the blocked packets while I've got the lock and + * complete below... */ + list_splice_init(&peer_ni->ksnp_tx_queue, &zombies); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_peer_failed(peer_ni); + ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc); + return 0; +} + +/* + * check whether we need to create more connds. + * It will try to create new thread if it's necessary, @timeout can + * be updated if failed to create, so caller wouldn't keep try while + * running out of resource. + */ +static int +ksocknal_connd_check_start(long sec, long *timeout) +{ + char name[16]; + int rc; + int total = ksocknal_data.ksnd_connd_starting + + ksocknal_data.ksnd_connd_running; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (total >= *ksocknal_tunables.ksnd_nconnds_max || + total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { + /* can't create more connd, or still have enough + * threads to handle more connecting */ + return 0; + } + + if (list_empty(&ksocknal_data.ksnd_connd_routes)) { + /* no pending connecting request */ + return 0; + } + + if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { + /* may run out of resource, retry later */ + *timeout = cfs_time_seconds(1); + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* serialize starting to avoid flood */ + return 0; + } + + ksocknal_data.ksnd_connd_starting_stamp = sec; + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + /* NB: total is the next id */ + snprintf(name, sizeof(name), "socknal_cd%02d", total); + rc = ksocknal_thread_start(ksocknal_connd, NULL, name); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + if (rc == 0) + return 1; + + /* we tried ... */ + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec(); + + return 1; +} + +/* + * check whether current thread can exit, it will return 1 if there are too + * many threads and no creating in past 120 seconds. 
+ * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. + */ +static int +ksocknal_connd_check_stop(long sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_routes queue looking for a route that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static ksock_route_t * +ksocknal_connd_get_route_locked(signed long *timeout_p) +{ + ksock_route_t *route; + cfs_time_t now; + + now = cfs_time_current(); + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + if (route->ksnr_retry_interval == 0 || + cfs_time_aftereq(now, route->ksnr_timeout)) + return route; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd (void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + ksock_connreq_t *cr; + wait_queue_entry_t wait; + int nloops = 0; + int cons_retry = 0; + + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + ksock_route_t *route = NULL; + long sec = cfs_time_current_sec(); + long timeout = MAX_SCHEDULE_TIMEOUT; + int dropped_lock = 0; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = 1; + } + + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + /* Connection accepted by the listener */ + cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\ + next, ksock_connreq_t, ksncr_list); + + list_del(&cr->ksncr_list); + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + ksocknal_create_conn(cr->ksncr_ni, NULL, + cr->ksncr_sock, SOCKLND_CONN_NONE); + lnet_ni_decref(cr->ksncr_ni); + LIBCFS_FREE(cr, sizeof(*cr)); + + spin_lock_bh(connd_lock); + } + + /* Only handle an outgoing connection request if there + * is a thread left to handle incoming connections and + * create new connd */ + if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < + ksocknal_data.ksnd_connd_running) { + route = ksocknal_connd_get_route_locked(&timeout); + } + if (route != NULL) { + list_del(&route->ksnr_connd_list); + ksocknal_data.ksnd_connd_connecting++; + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + if (ksocknal_connect(route)) { + /* consecutive retry */ + if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { + CWARN("massive consecutive " + "re-connecting to %pI4h\n", + &route->ksnr_ipaddr); + cons_retry = 0; + } + } else { + cons_retry = 0; + } + + ksocknal_route_decref(route); + + spin_lock_bh(connd_lock); + ksocknal_data.ksnd_connd_connecting--; + } + + if (dropped_lock) { + if (++nloops < SOCKNAL_RESCHED) + continue; + spin_unlock_bh(connd_lock); + nloops = 0; + cond_resched(); + spin_lock_bh(connd_lock); + continue; + } + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_unlock_bh(connd_lock); + + nloops = 0; + schedule_timeout(timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_lock_bh(connd_lock); + } + ksocknal_data.ksnd_connd_running--; + spin_unlock_bh(connd_lock); + + ksocknal_thread_fini(); + return 0; +} + +static ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) +{ + /* We're called with a shared lock on ksnd_global_lock */ + ksock_conn_t *conn; + struct list_head *ctmp; + + list_for_each(ctmp, &peer_ni->ksnp_conns) { + int error; + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + /* Don't need the {get,put}connsock dance to deref ksnc_sock */ + LASSERT (!conn->ksnc_closing); + + error = conn->ksnc_sock->sk->sk_err; + if (error != 0) { + ksocknal_conn_addref(conn); + + switch (error) { + case ECONNRESET: + CNETERR("A connection with %s " + "(%pI4h:%d) was reset; " + "it may have rebooted.\n", + libcfs_id2str(peer_ni->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + case ETIMEDOUT: + CNETERR("A connection with %s " + "(%pI4h:%d) timed out; the " + "network or node may be down.\n", + libcfs_id2str(peer_ni->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + default: + CNETERR("An unexpected network error %d " + "occurred with %s " + "(%pI4h:%d\n", error, + libcfs_id2str(peer_ni->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + break; + } + + return (conn); + } + + if (conn->ksnc_rx_started && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + ksocknal_conn_addref(conn); + CNETERR("Timeout receiving from %s (%pI4h:%d), " + "state %d wanted %d left %d\n", + libcfs_id2str(peer_ni->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port, + conn->ksnc_rx_state, + conn->ksnc_rx_nob_wanted, + conn->ksnc_rx_nob_left); + return (conn); + } + + if ((!list_empty(&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->sk_wmem_queued != 0) && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending or + * buffered in the socket's 
send buffer */ + ksocknal_conn_addref(conn); + CNETERR("Timeout sending data to %s (%pI4h:%d) " + "the network or that node may be down.\n", + libcfs_id2str(peer_ni->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + return (conn); + } + } + + return (NULL); +} + +static inline void +ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) +{ + ksock_tx_t *tx; + struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + while (!list_empty(&peer_ni->ksnp_tx_queue)) { + tx = list_entry(peer_ni->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &stale_txs); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT); +} + +static int +ksocknal_send_keepalive_locked(ksock_peer_ni_t *peer_ni) +__must_hold(&ksocknal_data.ksnd_global_lock) +{ + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + + /* last_alive will be updated by create_conn */ + if (list_empty(&peer_ni->ksnp_conns)) + return 0; + + if (peer_ni->ksnp_proto != &ksocknal_protocol_v3x) + return 0; + + if (*ksocknal_tunables.ksnd_keepalive <= 0 || + cfs_time_before(cfs_time_current(), + cfs_time_add(peer_ni->ksnp_last_alive, + cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) + return 0; + + if (cfs_time_before(cfs_time_current(), + peer_ni->ksnp_send_keepalive)) + return 0; + + /* retry 10 secs later, so we wouldn't put pressure + * on this peer_ni if we failed to send keepalive this time */ + peer_ni->ksnp_send_keepalive = cfs_time_shift(10); + + conn = ksocknal_find_conn_locked(peer_ni, NULL, 1); + if (conn != NULL) { + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + if (!list_empty(&conn->ksnc_tx_queue)) { + spin_unlock_bh(&sched->kss_lock); + /* there is an queued ACK, don't need keepalive */ + return 0; + } + + spin_unlock_bh(&sched->kss_lock); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* cookie = 1 is reserved for keepalive PING */ + tx = ksocknal_alloc_tx_noop(1, 1); + if (tx == NULL) { + read_lock(&ksocknal_data.ksnd_global_lock); + return -ENOMEM; + } + + if (ksocknal_launch_packet(peer_ni->ksnp_ni, tx, peer_ni->ksnp_id) == 0) { + read_lock(&ksocknal_data.ksnd_global_lock); + return 1; + } + + ksocknal_free_tx(tx); + read_lock(&ksocknal_data.ksnd_global_lock); + + return -EIO; +} + + +static void +ksocknal_check_peer_timeouts (int idx) +{ + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + ksock_peer_ni_t *peer_ni; + ksock_conn_t *conn; + ksock_tx_t *tx; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock(&ksocknal_data.ksnd_global_lock); + + list_for_each_entry(peer_ni, peers, ksnp_list) { + ksock_tx_t *tx_stale; + cfs_time_t deadline = 0; + int resid = 0; + int n = 0; + + if (ksocknal_send_keepalive_locked(peer_ni) != 0) { + read_unlock(&ksocknal_data.ksnd_global_lock); + goto again; + } + + conn = ksocknal_find_timed_out_conn (peer_ni); + + if (conn != NULL) { + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + + /* NB we won't find this one again, but we can't + * just proceed with the next peer_ni, since we dropped + * ksnd_global_lock and it might be dead already! 
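+ * (The ref taken in ksocknal_find_timed_out_conn() keeps this conn
+ * pinned until the decref below.)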
*/ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock */ + if (!list_empty(&peer_ni->ksnp_tx_queue)) { + ksock_tx_t *tx = + list_entry(peer_ni->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { + + ksocknal_peer_addref(peer_ni); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer_ni); + + ksocknal_peer_decref(peer_ni); + goto again; + } + } + + if (list_empty(&peer_ni->ksnp_zc_req_list)) + continue; + + tx_stale = NULL; + spin_lock(&peer_ni->ksnp_lock); + list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + if (tx_stale == NULL) + tx_stale = tx; + } + + if (tx_stale == NULL) { + spin_unlock(&peer_ni->ksnp_lock); + continue; + } + + deadline = tx_stale->tx_deadline; + resid = tx_stale->tx_resid; + conn = tx_stale->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer_ni->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " + "oldest(%p) timed out %ld secs ago, " + "resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer_ni->ksnp_id.nid), tx_stale, + cfs_duration_sec(cfs_time_current() - deadline), + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int ksocknal_reaper(void *arg) +{ + wait_queue_entry_t wait; + ksock_conn_t *conn; + ksock_sched_t *sched; + struct list_head enomem_conns; + int nenomem_conns; + cfs_duration_t timeout; + int i; + int peer_index = 0; + cfs_time_t deadline = cfs_time_current(); + + cfs_block_allsigs (); + + INIT_LIST_HEAD(&enomem_conns); + init_waitqueue_entry(&wait, current); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry(ksocknal_data. \ + ksnd_deathrow_conns.next, + ksock_conn_t, ksnc_list); + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry(ksocknal_data.ksnd_zombie_conns.\ + next, ksock_conn_t, ksnc_list); + list_del(&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) { + list_add(&enomem_conns, + &ksocknal_data.ksnd_enomem_conns); + list_del_init(&ksocknal_data.ksnd_enomem_conns); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... 
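+ * by putting each one back on its scheduler's kss_tx_conns queue and
+ * waking the scheduler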
*/ + nenomem_conns = 0; + while (!list_empty(&enomem_conns)) { + conn = list_entry(enomem_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del(&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... */ + while ((timeout = cfs_time_sub(deadline, + cfs_time_current())) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer_ni + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (*ksocknal_tunables.ksnd_timeout > n * p) + chunk = (chunk * n * p) / + *ksocknal_tunables.ksnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } + + deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = + cfs_time_add(cfs_time_current(), timeout); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty(&ksocknal_data.ksnd_deathrow_conns) && + list_empty(&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c new file mode 100644 index 0000000000000..1215488b89d62 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -0,0 +1,752 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) +{ + int rc = lnet_sock_getaddr(conn->ksnc_sock, true, + &conn->ksnc_ipaddr, + &conn->ksnc_port); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); + + if (rc != 0) { + CERROR ("Error %d getting sock peer_ni IP\n", rc); + return rc; + } + + rc = lnet_sock_getaddr(conn->ksnc_sock, false, + &conn->ksnc_myipaddr, NULL); + if (rc != 0) { + CERROR ("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +int +ksocknal_lib_zc_capable(ksock_conn_t *conn) +{ + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; + + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_CSUM_MASK) != 0); +} + +int +ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + int nob; + int rc; + + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* frist sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + + { +#if SOCKNAL_SINGLE_FRAG_TX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = tx->tx_iov[i]; + nob += scratchiov[i].iov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + } + return rc; +} + +int +ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + + /* Not NOOP message */ + LASSERT (tx->tx_lnetmsg != NULL); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { +#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
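+/* NB: kmap() of highmem pages draws on a small global pool of mapping
+ * slots; threads that each hold several mappings while waiting for more
+ * can deadlock against one another, hence the warning above. */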
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack (ksock_conn_t *conn) +{ + int opt = 1; + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer_ni. */ + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} + +int +ksocknal_lib_recv_iov (ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT (niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob, + MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != + PAGE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov (ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
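+/* NB: when ZC receive applies, ksocknal_lib_kiov_vmap() below maps all
+ * fragment pages into one contiguous kvec with vmap(); otherwise each
+ * page is kmap()ed individually into the scratch iov. */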
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + n = niov; + } + + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, n, nob, + MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return (rc); +} + +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return (-ESHUTDOWN); + } + + rc = lnet_sock_getbuf(sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); + } + + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return (rc); +} + +int +ksocknal_lib_setup_sock (struct socket *sock) +{ + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct linger linger; + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when we close + * it. 
*/ + + linger.l_onoff = 0; + linger.l_linger = 0; + + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (rc != 0) { + CERROR ("Can't set SO_LINGER: %d\n", rc); + return (rc); + } + + option = -1; + rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't set SO_LINGER2: %d\n", rc); + return (rc); + } + + if (!*ksocknal_tunables.ksnd_nagle) { + option = 1; + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't disable nagle: %d\n", rc); + return (rc); + } + } + + rc = lnet_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ +#ifdef SOCKNAL_BACKOFF + if (*ksocknal_tunables.ksnd_backoff_init > 0) { + option = *ksocknal_tunables.ksnd_backoff_init; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_INIT, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't set initial tcp backoff %d: %d\n", + option, rc); + return (rc); + } + } + + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + option = *ksocknal_tunables.ksnd_backoff_max; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_MAX, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't set maximum tcp backoff %d: %d\n", + option, rc); + return (rc); + } + } +#endif + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + + option = (do_keepalive ? 
1 : 0); + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } + + if (!do_keepalive) + return (0); + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keep_idle, sizeof(keep_idle)); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); + return (rc); + } + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keep_intvl, sizeof(keep_intvl)); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keep_count, sizeof(keep_count)); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPCNT: %d\n", rc); + return (rc); + } + + return (0); +} + +void +ksocknal_lib_push_conn (ksock_conn_t *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int val = 1; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock (sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock (sk); + + rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + LASSERT (rc == 0); + + lock_sock (sk); + tp->nonagle = nonagle; + release_sock (sk); + + ksocknal_connsock_decref(conn); +} + +extern void ksocknal_read_callback (ksock_conn_t *conn); +extern void ksocknal_write_callback (ksock_conn_t *conn); +/* + * socket call back in Linux + */ +static void +#ifdef HAVE_SK_DATA_READY_ONE_ARG +ksocknal_data_ready(struct sock *sk) +#else +ksocknal_data_ready(struct sock *sk, int n) +#endif +{ + ksock_conn_t *conn; + ENTRY; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT(sk->sk_data_ready != &ksocknal_data_ready); +#ifdef HAVE_SK_DATA_READY_ONE_ARG + sk->sk_data_ready(sk); +#else + sk->sk_data_ready(sk, n); +#endif + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); + + EXIT; +} + +static void +ksocknal_write_space (struct sock *sk) +{ + ksock_conn_t *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = sk_stream_wspace(sk); + min_wpace = sk_stream_min_wspace(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT (sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space (sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). 
*/ + + clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 0000000000000..6495703626094 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socklnd.h" + +static int sock_timeout = 50; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. */ +static unsigned int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +module_param(nconnds, int, 0444); +MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); + +static int nconnds_max = 64; +module_param(nconnds_max, int, 0444); +MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); + +static int min_reconnectms = 1000; +module_param(min_reconnectms, int, 0644); +MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +module_param(max_reconnectms, int, 0644); +MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); + +static int eager_ack; +module_param(eager_ack, int, 0644); +MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); + +static int typed_conns = 1; +module_param(typed_conns, int, 0444); +MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); + +static int min_bulk = (1<<10); +module_param(min_bulk, int, 0644); +MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(tx_buffer_size, int, 0644); +MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(rx_buffer_size, int, 0644); +MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); + +static int nagle = 0; +module_param(nagle, int, 0644); +MODULE_PARM_DESC(nagle, "enable NAGLE?"); + +static int round_robin = 1; +module_param(round_robin, int, 0644); +MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); + +static int keepalive = 30; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); + +static int keepalive_idle = 30; +module_param(keepalive_idle, int, 0644); +MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +module_param(keepalive_count, int, 0644); +MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); + +static int keepalive_intvl = 5; +module_param(keepalive_intvl, int, 0644); +MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); + +static int enable_csum = 0; +module_param(enable_csum, int, 0644); +MODULE_PARM_DESC(enable_csum, "enable check sum"); + +static int inject_csum_error = 0; +module_param(inject_csum_error, int, 0644); +MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); + +#ifdef CPU_AFFINITY +static int 
enable_irq_affinity = 0; +module_param(enable_irq_affinity, int, 0644); +MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity"); +#endif + +static int nonblk_zcack = 1; +module_param(nonblk_zcack, int, 0644); +MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = (16 << 10); +module_param(zc_min_payload, int, 0644); +MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); + +static unsigned int zc_recv = 0; +module_param(zc_recv, int, 0644); +MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +module_param(zc_recv_min_nfrags, int, 0644); +MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); + +#ifdef SOCKNAL_BACKOFF +static int backoff_init = 3; +module_param(backoff_init, int, 0644); +MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff"); + +static int backoff_max = 3; +module_param(backoff_max, int, 0644); +MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff"); +#endif + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +ksock_tunables_t ksocknal_tunables; + +int ksocknal_tunables_init(void) +{ + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + +#ifdef CPU_AFFINITY + if (enable_irq_affinity) { + CWARN("irq_affinity is removed from socklnd because modern " + "computer always has fast CPUs and more cores than " + "# NICs, although you still can set irq_affinity by " + "another way, please check manual for details.\n"); + } + ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity; +#endif + +#ifdef SOCKNAL_BACKOFF + ksocknal_tunables.ksnd_backoff_init = &backoff_init; + ksocknal_tunables.ksnd_backoff_max = &backoff_max; +#endif + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10); + + return 0; +}; diff --git 
a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 0000000000000..98109ec2ff7bc --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,801 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +/* + * Protocol entries : + * pro_send_hello : send hello message + * pro_recv_hello : receive hello message + * pro_pack : pack message header + * pro_unpack : unpack message header + * pro_queue_tx_zcack() : Called holding BH lock: kss_lock + * return 1 if ACK is piggybacked, otherwise return 0 + * pro_queue_tx_msg() : Called holding BH lock: kss_lock + * return the ACK that piggybacked by my message, or NULL + * pro_handle_zcreq() : handler of incoming ZC-REQ + * pro_handle_zcack() : handler of incoming ZC-ACK + * pro_match_tx() : Called holding glock + */ + +static ksock_tx_t * +ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + /* V1.x, just enqueue it */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; +} + +void +ksocknal_next_tx_carrier(ksock_conn_t *conn) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + LASSERT(tx != NULL); + + /* Next TX that can carry ZC-ACK or LNet message */ + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_carrier = NULL; + } else { + conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, + ksock_tx_t, tx_list); + LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == + tx->tx_msg.ksm_type); + } +} + +static int +ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* + * Enqueue or piggyback tx_ack / cookie + * . no tx can piggyback cookie of tx_ack (or cookie), just + * enqueue the tx_ack (if tx_ack != NUL) and return NULL. + * . There is tx can piggyback cookie of tx_ack (or cookie), + * piggyback the cookie and return the tx. 
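+ * (here the return value is 1 if the cookie was piggybacked and 0 if
+ * it was not).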
+ */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static ksock_tx_t * +ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. + */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_add(&tx_msg->tx_list, &tx->tx_list); + list_del(&tx->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + if ((tx = conn->ksnc_tx_carrier) == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { + 
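+ /* e.g. the carrier already holds (a+2, a); if the new cookie is a+1,
+ * the three collapse into the contiguous range a..a+2, stored below as
+ * ksm_zc_cookies[0] = a, ksm_zc_cookies[1] = a + 2. */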
__u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = offsetof(struct ksock_msg, ksm_u); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? 
+ sizeof(struct lnet_hdr) : sizeof(struct ksock_msg)); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = offsetof(struct ksock_msg, ksm_u); + else + nob = tx->tx_lnetmsg->msg_len + sizeof(struct ksock_msg); + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +{ + ksock_peer_ni_t *peer_ni = c->ksnc_peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); + if (conn != NULL) { + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + return -ENOMEM; + + if ((rc = ksocknal_launch_packet(peer_ni->ksnp_ni, tx, peer_ni->ksnp_id)) == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +{ + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + conn->ksnc_proto == &ksocknal_protocol_v3x) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 
0 : -EPROTO; + } + + spin_lock(&peer_ni->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, + &peer_ni->ksnp_zc_req_list, tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer_ni->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1 (ksock_conn_t *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + struct lnet_hdr *hdr; + struct lnet_magicversion *hmv; + int rc; + int i; + + CLASSERT(sizeof(struct lnet_magicversion) == + offsetof(struct lnet_hdr, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate struct lnet_hdr\n"); + return -ENOMEM; + } + + hmv = (struct lnet_magicversion *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (struct lnet_hdr) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + } + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d)" + " to %pI4h/%d\n", rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2 (ksock_conn_t *conn, struct ksock_hello_msg *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! 
*/ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); + } + + rc = lnet_sock_write(sock, hello, offsetof(struct ksock_hello_msg, kshm_ips), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d)" + " to %pI4h/%d\n", rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + return rc; +} + +static int +ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int timeout) +{ + struct socket *sock = conn->ksnc_sock; + struct lnet_hdr *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate struct lnet_hdr\n"); + return -ENOMEM; + } + + rc = lnet_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(struct lnet_hdr, src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR ("Expecting a HELLO hdr," + " but got type %d from %pI4h\n", + le32_to_cpu (hdr->type), + &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + hello->kshm_src_nid = le64_to_cpu (hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu (hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu (hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu (hdr->payload_length) / + sizeof (__u32); + + if (hello->kshm_nips > LNET_NUM_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + rc = -EPROTO; + break; + } + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = lnet_sock_read(sock, &hello->kshm_src_nid, + offsetof(struct ksock_hello_msg, kshm_ips) - + offsetof(struct ksock_hello_msg, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); + } + + if (hello->kshm_nips > LNET_NUM_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + 
return -EPROTO; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = lnet_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + return -EPROTO; + } + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(ksock_tx_t *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(struct lnet_hdr); + + tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr); + tx->tx_resid = tx->tx_nob; +} + +static void +ksocknal_pack_msg_v2(ksock_tx_t *tx) +{ + tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(struct ksock_msg); + tx->tx_resid = tx->tx_nob = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); + } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(struct ksock_msg *msg) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +static void +ksocknal_unpack_msg_v2(struct ksock_msg *msg) +{ + return; /* Do nothing */ +} + +ksock_proto_t ksocknal_protocol_v1x = +{ + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v2x = +{ + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v3x = +{ + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; + diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c new file mode 100644 index 0000000000000..885cd85a8c20f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -0,0 +1,522 @@ +/* + * GPL HEADER 
START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +static struct { + int pta_shutdown; + struct socket *pta_sock; + struct completion pta_signal; +} lnet_acceptor_state = { + .pta_shutdown = 1 +}; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + +EXPORT_SYMBOL(lnet_acceptor_port); + +static char *accept = "secure"; + +module_param(accept, charp, 0444); +MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)"); +module_param(accept_port, int, 0444); +MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)"); +module_param(accept_backlog, int, 0444); +MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog"); +module_param(accept_timeout, int, 0644); +MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)"); + +static char *accept_type = NULL; + +static int +lnet_acceptor_get_tunables(void) +{ + /* Userland acceptor uses 'accept_type' instead of 'accept', due to + * conflict with 'accept(2)', but kernel acceptor still uses 'accept' + * for compatibility. Hence the trick. 
*/ + accept_type = accept; + return 0; +} + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error (int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %pI4h on port %d was " + "refused: check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), &peer_ip, peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %pI4h " + "was unreachable: the network or that node may " + "be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), &peer_ip); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %pI4h on " + "port %d took too long: that node may be hung " + "or experiencing high load.\n", + libcfs_nid2str(peer_nid), &peer_ip, peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %pI4h" + " on port %d was reset: " + "is it running a compatible version of " + "Lustre and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), &peer_ip, + peer_port, libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at " + "host %pI4h on port %d: is it running " + "a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), &peer_ip, + peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to " + "connect to %s at host %pI4h on port " + "%d\n", libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s" + " at host %pI4h on port %d\n", rc, + libcfs_nid2str(peer_nid), + &peer_ip, peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port) +{ + struct lnet_acceptor_connreq cr; + struct socket *sock; + int rc; + int port; + int fatal; + + CLASSERT(sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. 
*/ + + rc = lnet_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT(LNET_PROTO_ACCEPTOR_VERSION == 1); + + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + lnet_net_lock(LNET_LOCK_EX); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + +failed_sock: + sock_release(sock); +failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + +static int +lnet_accept(struct socket *sock, __u32 magic) +{ + struct lnet_acceptor_connreq cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + struct lnet_ni *ni; + char *str; + + LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ + + rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset(&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = lnet_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to LNET magic from %pI4h: %d\n", + &peer_ip, rc); + return -EPROTO; + } + + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pI4h" + " magic %08x: %s acceptor protocol\n", + &peer_ip, magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = lnet_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from " + "%pI4h\n", rc, &peer_ip); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". 
*/
+		int peer_version = cr.acr_version;
+
+		memset(&cr, 0, sizeof(cr));
+		cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+		rc = lnet_sock_write(sock, &cr, sizeof(cr),
+				     accept_timeout);
+
+		if (rc != 0)
+			CERROR("Error sending magic+version in response "
+			       "to version %d from %pI4h: %d\n",
+			       peer_version, &peer_ip, rc);
+		return -EPROTO;
+	}
+
+	rc = lnet_sock_read(sock, &cr.acr_nid,
+			    sizeof(cr) -
+			    offsetof(struct lnet_acceptor_connreq, acr_nid),
+			    accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request from "
+		       "%pI4h\n", rc, &peer_ip);
+		return -EIO;
+	}
+
+	if (flip)
+		__swab64s(&cr.acr_nid);
+
+	ni = lnet_nid2ni_addref(cr.acr_nid);
+	if (ni == NULL ||		/* no matching net */
+	    ni->ni_nid != cr.acr_nid) {	/* right NET, wrong NID! */
+		if (ni != NULL)
+			lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %pI4h "
+				   "for %s: No matching NI\n",
+				   &peer_ip, libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	if (ni->ni_net->net_lnd->lnd_accept == NULL) {
+		/* This catches a request for the loopback LND */
+		lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h "
+				   "for %s: NI does not accept IP connections\n",
+				   &peer_ip, libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	CDEBUG(D_NET, "Accept %s from %pI4h\n",
+	       libcfs_nid2str(cr.acr_nid), &peer_ip);
+
+	rc = ni->ni_net->net_lnd->lnd_accept(ni, sock);
+
+	lnet_ni_decref(ni);
+	return rc;
+}
+
+static int
+lnet_acceptor(void *arg)
+{
+	struct socket *newsock;
+	int rc;
+	__u32 magic;
+	__u32 peer_ip;
+	int peer_port;
+	int secure = (int)((uintptr_t)arg);
+
+	LASSERT(lnet_acceptor_state.pta_sock == NULL);
+
+	cfs_block_allsigs();
+
+	rc = lnet_sock_listen(&lnet_acceptor_state.pta_sock,
+			      0, accept_port, accept_backlog);
+	if (rc != 0) {
+		if (rc == -EADDRINUSE)
+			LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+					   " %d: port already in use\n",
+					   accept_port);
+		else
+			LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+					   "%d: unexpected error %d\n",
+					   accept_port, rc);
+
+		lnet_acceptor_state.pta_sock = NULL;
+	} else {
+		LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+	}
+
+	/* set init status and unblock parent */
+	lnet_acceptor_state.pta_shutdown = rc;
+	complete(&lnet_acceptor_state.pta_signal);
+
+	if (rc != 0)
+		return rc;
+
+	while (!lnet_acceptor_state.pta_shutdown) {
+
+		rc = lnet_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+		if (rc != 0) {
+			if (rc != -EAGAIN) {
+				CWARN("Accept error %d: pausing...\n", rc);
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(cfs_time_seconds(1));
+			}
+			continue;
+		}
+
+		/* maybe we're woken up with lnet_sock_abort_accept() */
+		if (lnet_acceptor_state.pta_shutdown) {
+			sock_release(newsock);
+			break;
+		}
+
+		rc = lnet_sock_getaddr(newsock, true, &peer_ip, &peer_port);
+		if (rc != 0) {
+			CERROR("Can't determine new connection's address\n");
+			goto failed;
+		}
+
+		if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+			CERROR("Refusing connection from %pI4h: "
+			       "insecure port %d\n", &peer_ip, peer_port);
+			goto failed;
+		}
+
+		rc = lnet_sock_read(newsock, &magic, sizeof(magic),
+				    accept_timeout);
+		if (rc != 0) {
+			CERROR("Error %d reading connection request from "
+			       "%pI4h\n", rc, &peer_ip);
+			goto failed;
+		}
+
+		rc = lnet_accept(newsock, magic);
+		if (rc != 0)
+			goto failed;
+
+		continue;
+
+failed:
+		sock_release(newsock);
+	}
+
+	sock_release(lnet_acceptor_state.pta_sock);
+	lnet_acceptor_state.pta_sock = NULL;
+
+
CDEBUG(D_NET, "Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } else { + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; + } +} + +int +lnet_acceptor_start(void) +{ + struct task_struct *task; + int rc; + long rc2; + long secure; + + /* if acceptor is already running return immediately */ + if (!lnet_acceptor_state.pta_shutdown) + return 0; + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + rc = lnet_acceptor_get_tunables(); + if (rc != 0) + return rc; + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) + return rc; + + if (lnet_count_acceptor_nets() == 0) /* not required */ + return 0; + + task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, + "acceptor_%03ld", secure); + if (IS_ERR(task)) { + rc2 = PTR_ERR(task); + CERROR("Can't start acceptor thread: %ld\n", rc2); + + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + struct sock *sk; + + if (lnet_acceptor_state.pta_shutdown) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + + sk = lnet_acceptor_state.pta_sock->sk; + + /* awake any sleepers using safe method */ + sk->sk_state_change(sk); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c new file mode 100644 index 0000000000000..c70e26680b447 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c @@ -0,0 +1,3182 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +#include + +#define D_LNI D_CONSOLE + +struct lnet the_lnet; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + +static char *ip2nets = ""; +module_param(ip2nets, charp, 0444); +MODULE_PARM_DESC(ip2nets, "LNET network <- IP table"); + +static char *networks = ""; +module_param(networks, charp, 0444); +MODULE_PARM_DESC(networks, "local networks"); + +static char *routes = ""; +module_param(routes, charp, 0444); +MODULE_PARM_DESC(routes, "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +module_param(rnet_htable_size, int, 0444); +MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); + +static int use_tcp_bonding = false; +module_param(use_tcp_bonding, int, 0444); +MODULE_PARM_DESC(use_tcp_bonding, + "Set to 1 to use socklnd bonding. 0 to use Multi-Rail"); + +unsigned int lnet_numa_range = 0; +module_param(lnet_numa_range, uint, 0444); +MODULE_PARM_DESC(lnet_numa_range, + "NUMA range to consider during Multi-Rail selection"); + +/* + * This sequence number keeps track of how many times DLC was used to + * update the local NIs. It is incremented when a NI is added or + * removed and checked when sending a message to determine if there is + * a need to re-run the selection algorithm. See lnet_select_pathway() + * for more details on its usage. + */ +static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); + +static int lnet_ping(struct lnet_process_id id, signed long timeout, + struct lnet_process_id __user *ids, int n_ids); + +static char * +lnet_get_routes(void) +{ + return routes; +} + +static char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or " + "'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +static void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + init_waitqueue_head(&the_lnet.ln_eq_waitq); + init_waitqueue_head(&the_lnet.ln_rc_waitq); + mutex_init(&the_lnet.ln_lnd_mutex); + mutex_init(&the_lnet.ln_api_mutex); +} + +static void +lnet_fini_locks(void) +{ +} + +struct kmem_cache *lnet_mes_cachep; /* MEs kmem_cache */ +struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes + * MDs kmem_cache */ + +static int +lnet_descriptor_setup(void) +{ + /* create specific kmem_cache for MEs and small MDs (i.e., originally + * allocated in kmem_cache). 
+ */ + lnet_mes_cachep = kmem_cache_create("lnet_MEs", sizeof(struct lnet_me), + 0, 0, NULL); + if (!lnet_mes_cachep) + return -ENOMEM; + + lnet_small_mds_cachep = kmem_cache_create("lnet_small_MDs", + LNET_SMALL_MD_SIZE, 0, 0, + NULL); + if (!lnet_small_mds_cachep) + return -ENOMEM; + + return 0; +} + +static void +lnet_descriptor_cleanup(void) +{ + + if (lnet_small_mds_cachep) { + kmem_cache_destroy(lnet_small_mds_cachep); + lnet_small_mds_cachep = NULL; + } + + if (lnet_mes_cachep) { + kmem_cache_destroy(lnet_mes_cachep); + lnet_mes_cachep = NULL; + } +} + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + LIBCFS_FREE(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE * + sizeof(the_lnet.ln_remote_nets_hash[0])); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } + + lnet_fini_locks(); +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +static void lnet_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + + /* Constants... 
*/ + CLASSERT(LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT(LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT(LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT(LNET_MSG_ACK == 0); + CLASSERT(LNET_MSG_PUT == 1); + CLASSERT(LNET_MSG_GET == 2); + CLASSERT(LNET_MSG_REPLY == 3); + CLASSERT(LNET_MSG_HELLO == 4); + + /* Checks for struct lnet_handle_wire */ + CLASSERT((int)sizeof(struct lnet_handle_wire) == 16); + CLASSERT((int)offsetof(struct lnet_handle_wire, wh_interface_cookie) == 0); + CLASSERT((int)sizeof(((struct lnet_handle_wire *)0)->wh_interface_cookie) == 8); + CLASSERT((int)offsetof(struct lnet_handle_wire, wh_object_cookie) == 8); + CLASSERT((int)sizeof(((struct lnet_handle_wire *)0)->wh_object_cookie) == 8); + + /* Checks for struct struct lnet_magicversion */ + CLASSERT((int)sizeof(struct lnet_magicversion) == 8); + CLASSERT((int)offsetof(struct lnet_magicversion, magic) == 0); + CLASSERT((int)sizeof(((struct lnet_magicversion *)0)->magic) == 4); + CLASSERT((int)offsetof(struct lnet_magicversion, version_major) == 4); + CLASSERT((int)sizeof(((struct lnet_magicversion *)0)->version_major) == 2); + CLASSERT((int)offsetof(struct lnet_magicversion, version_minor) == 6); + CLASSERT((int)sizeof(((struct lnet_magicversion *)0)->version_minor) == 2); + + /* Checks for struct struct lnet_hdr */ + CLASSERT((int)sizeof(struct lnet_hdr) == 72); + CLASSERT((int)offsetof(struct lnet_hdr, dest_nid) == 0); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->dest_nid) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, src_nid) == 8); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->src_nid) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, dest_pid) == 16); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->dest_pid) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, src_pid) == 20); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->src_pid) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, type) == 24); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->type) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, payload_length) == 28); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->payload_length) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg) == 40); + + /* Ack */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.ack.dst_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.ack.dst_wmd) == 16); + CLASSERT((int)offsetof(struct lnet_hdr, msg.ack.match_bits) == 48); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.ack.match_bits) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.ack.mlength) == 56); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.ack_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.ack_wmd) == 16); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.match_bits) == 48); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.match_bits) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.hdr_data) == 56); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.hdr_data) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.ptl_index) == 64); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.ptl_index) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg.put.offset) == 68); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.return_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.return_wmd) == 16); + 
CLASSERT((int)offsetof(struct lnet_hdr, msg.get.match_bits) == 48); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.match_bits) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.ptl_index) == 56); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.ptl_index) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.src_offset) == 60); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.src_offset) == 4); + CLASSERT((int)offsetof(struct lnet_hdr, msg.get.sink_length) == 64); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.get.sink_length) == 4); + + /* Reply */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.reply.dst_wmd) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.reply.dst_wmd) == 16); + + /* Hello */ + CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.incarnation) == 32); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.incarnation) == 8); + CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.type) == 40); + CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.type) == 4); +} + +static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) +{ + struct lnet_lnd *lnd; + struct list_head *tmp; + + /* holding lnd mutex */ + list_for_each(tmp, &the_lnet.ln_lnds) { + lnd = list_entry(tmp, struct lnet_lnd, lnd_list); + + if (lnd->lnd_type == type) + return lnd; + } + return NULL; +} + +void +lnet_register_lnd(struct lnet_lnd *lnd) +{ + mutex_lock(&the_lnet.ln_lnd_mutex); + + LASSERT(libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds); + lnd->lnd_refcount = 0; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + mutex_unlock(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd(struct lnet_lnd *lnd) +{ + mutex_lock(&the_lnet.ln_lnd_mutex); + + LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + LASSERT(lnd->lnd_refcount == 0); + + list_del(&lnd->lnd_list); + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + mutex_unlock(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +void +lnet_counters_get(struct lnet_counters *counters) +{ + struct lnet_counters *ctr; + int i; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + counters->msgs_max += ctr->msgs_max; + counters->msgs_alloc += ctr->msgs_alloc; + counters->errors += ctr->errors; + counters->send_count += ctr->send_count; + counters->recv_count += ctr->recv_count; + counters->route_count += ctr->route_count; + counters->drop_count += ctr->drop_count; + counters->send_length += ctr->send_length; + counters->recv_length += ctr->recv_length; + counters->route_length += ctr->route_length; + counters->drop_length += ctr->drop_length; + + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + struct lnet_counters *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(struct lnet_counters)); + + lnet_net_unlock(LNET_LOCK_EX); +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +static void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's 
uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { + lnet_eq_free(list_entry(e, struct lnet_eq, eq_list)); + + } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, struct lnet_libmd, md_list)); + + } else { /* NB: Active MEs should be attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + + if (rec->rec_lh_hash != NULL) { + LIBCFS_FREE(rec->rec_lh_hash, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +static int +lnet_res_container_setup(struct lnet_res_container *rec, int cpt, int type) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } + } + + return recs; +} + +struct lnet_libhandle * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + struct lnet_libhandle *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, + struct lnet_libhandle *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + +static int lnet_unprepare(void); + +static int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + 
int rc = 0; + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + return -ENETDOWN; + } + + LASSERT(the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_peers); + INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); + INIT_LIST_HEAD(&the_lnet.ln_nets); + INIT_LIST_HEAD(&the_lnet.ln_routers); + INIT_LIST_HEAD(&the_lnet.ln_drop_rules); + INIT_LIST_HEAD(&the_lnet.ln_delay_rules); + + rc = lnet_descriptor_setup(); + if (rc != 0) + goto failed; + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + /* + * NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. + */ + the_lnet.ln_interface_cookie = ktime_get_real_ns(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_counters)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_me_containers = recs; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD); + if (recs == NULL) { + rc = -ENOMEM; + goto failed; + } + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +static int +lnet_unprepare (void) +{ + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nets)); + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + if (the_lnet.ln_me_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_me_containers); + the_lnet.ln_me_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_uninit(); + lnet_rtrpools_free(0); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + lnet_descriptor_cleanup(); + + return 0; +} + +struct lnet_ni * +lnet_net2ni_locked(__u32 net_id, int cpt) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) { + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_net2ni_addref(__u32 net) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni_addref); + +struct lnet_net * +lnet_get_net_locked(__u32 net_id) +{ + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + if (net->net_id == net_id) + return net; + } + + return NULL; +} + +unsigned int +lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + unsigned int val; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + val = hash_long(key, LNET_CPT_BITS); + /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ + if (val < number) + return val; + + return (unsigned int)(key + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(lnet_nid_t nid, struct lnet_ni *ni) +{ + struct lnet_net *net; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* + * If NI is provided then use the CPT identified in the NI cpt + * list if one exists. If one doesn't exist, then that NI is + * associated with all CPTs and it follows that the net it belongs + * to is implicitly associated with all CPTs, so just hash the nid + * and return that. 
+ */ + if (ni != NULL) { + if (ni->ni_cpts != NULL) + return ni->ni_cpts[lnet_nid_cpt_hash(nid, + ni->ni_ncpts)]; + else + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + } + + /* no NI provided so look at the net */ + net = lnet_get_net_locked(LNET_NIDNET(nid)); + + if (net != NULL && net->net_cpts != NULL) { + return net->net_cpts[lnet_nid_cpt_hash(nid, net->net_ncpts)]; + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + cpt = lnet_net_lock_current(); + + cpt2 = lnet_cpt_of_nid_locked(nid, ni); + + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet(__u32 net_id) +{ + struct lnet_net *net; + int cpt; + bool local; + + cpt = lnet_net_lock_current(); + + net = lnet_get_net_locked(net_id); + + local = net != NULL; + + lnet_net_unlock(cpt); + + return local; +} + +bool +lnet_is_ni_healthy_locked(struct lnet_ni *ni) +{ + if (ni->ni_state == LNET_NI_STATE_ACTIVE || + ni->ni_state == LNET_NI_STATE_DEGRADED) + return true; + + return false; +} + +struct lnet_ni * +lnet_nid2ni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid) + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_nid2ni_addref(lnet_nid_t nid) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (ni) + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_nid2ni_addref); + +int +lnet_islocalnid(lnet_nid_t nid) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nets(void) +{ + /* Return the # of NIs that need the acceptor. 
*/ + int count = 0; + struct lnet_net *net; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + /* all socklnd type networks should have the acceptor + * thread started */ + if (net->net_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +static struct lnet_ping_info * +lnet_ping_info_create(int num_ni) +{ + struct lnet_ping_info *ping_info; + unsigned int infosz; + + infosz = offsetof(struct lnet_ping_info, pi_ni[num_ni]); + LIBCFS_ALLOC(ping_info, infosz); + if (ping_info == NULL) { + CERROR("Can't allocate ping info[%d]\n", num_ni); + return NULL; + } + + ping_info->pi_nnis = num_ni; + ping_info->pi_pid = the_lnet.ln_pid; + ping_info->pi_magic = LNET_PROTO_PING_MAGIC; + ping_info->pi_features = LNET_PING_FEAT_NI_STATUS; + + return ping_info; +} + +static inline int +lnet_get_net_ni_count_locked(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_net_ni_count_pre(struct lnet_net *net) +{ + struct lnet_ni *ni; + int count = 0; + + list_for_each_entry(ni, &net->net_ni_added, ni_netlist) + count++; + + return count; +} + +static inline int +lnet_get_ni_count(void) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int count = 0; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) + count++; + } + + lnet_net_unlock(0); + + return count; +} + +static inline void +lnet_ping_info_free(struct lnet_ping_info *pinfo) +{ + LIBCFS_FREE(pinfo, + offsetof(struct lnet_ping_info, + pi_ni[pinfo->pi_nnis])); +} + +static void +lnet_ping_info_destroy(void) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + } + + lnet_ping_info_free(the_lnet.ln_ping_info); + the_lnet.ln_ping_info = NULL; + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_ping_event_handler(struct lnet_event *event) +{ + struct lnet_ping_info *pinfo = event->md.user_ptr; + + if (event->unlinked) + pinfo->pi_features = LNET_PING_FEAT_INVAL; +} + +static int +lnet_ping_info_setup(struct lnet_ping_info **ppinfo, + struct lnet_handle_md *md_handle, + int ni_count, bool set_eq) +{ + struct lnet_process_id id = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY + }; + struct lnet_handle_me me_handle; + struct lnet_md md = { NULL }; + int rc, rc2; + + if (set_eq) { + rc = LNetEQAlloc(0, lnet_ping_event_handler, + &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping EQ: %d\n", rc); + return rc; + } + } + + *ppinfo = lnet_ping_info_create(ni_count); + if (*ppinfo == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &me_handle); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; + } + + /* initialize md content */ + md.start = *ppinfo; + md.length = offsetof(struct lnet_ping_info, + pi_ni[(*ppinfo)->pi_nnis]); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = NULL; + md.eq_handle = the_lnet.ln_ping_target_eq; + md.user_ptr = *ppinfo; + + rc = LNetMDAttach(me_handle, md, 
LNET_RETAIN, md_handle); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; + } + + return 0; + +failed_2: + rc2 = LNetMEUnlink(me_handle); + LASSERT(rc2 == 0); +failed_1: + lnet_ping_info_free(*ppinfo); + *ppinfo = NULL; +failed_0: + if (set_eq) + LNetEQFree(the_lnet.ln_ping_target_eq); + return rc; +} + +static void +lnet_ping_md_unlink(struct lnet_ping_info *pinfo, struct lnet_handle_md *md_handle) +{ + sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(*md_handle); + LNetInvalidateMDHandle(md_handle); + + /* NB md could be busy; this just starts the unlink */ + while (pinfo->pi_features != LNET_PING_FEAT_INVAL) { + CDEBUG(D_NET, "Still waiting for ping MD to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + + cfs_restore_sigs(blocked); +} + +static void +lnet_ping_info_install_locked(struct lnet_ping_info *ping_info) +{ + int i; + struct lnet_ni *ni; + struct lnet_net *net; + struct lnet_ni_status *ns; + + i = 0; + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + LASSERT(i < ping_info->pi_nnis); + + ns = &ping_info->pi_ni[i]; + + ns->ns_nid = ni->ni_nid; + + lnet_ni_lock(ni); + ns->ns_status = (ni->ni_status != NULL) ? + ni->ni_status->ns_status : + LNET_NI_STATUS_UP; + ni->ni_status = ns; + lnet_ni_unlock(ni); + + i++; + } + + } +} + +static void +lnet_ping_target_update(struct lnet_ping_info *pinfo, + struct lnet_handle_md md_handle) +{ + struct lnet_ping_info *old_pinfo = NULL; + struct lnet_handle_md old_md; + + /* switch the NIs to point to the new ping info created */ + lnet_net_lock(LNET_LOCK_EX); + + if (!the_lnet.ln_routing) + pinfo->pi_features |= LNET_PING_FEAT_RTE_DISABLED; + lnet_ping_info_install_locked(pinfo); + + if (the_lnet.ln_ping_info != NULL) { + old_pinfo = the_lnet.ln_ping_info; + old_md = the_lnet.ln_ping_target_md; + } + the_lnet.ln_ping_target_md = md_handle; + the_lnet.ln_ping_info = pinfo; + + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pinfo != NULL) { + /* unlink the old ping info */ + lnet_ping_md_unlink(old_pinfo, &old_md); + lnet_ping_info_free(old_pinfo); + } +} + +static void +lnet_ping_target_fini(void) +{ + int rc; + + lnet_ping_md_unlink(the_lnet.ln_ping_info, + &the_lnet.ln_ping_target_md); + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); + + lnet_ping_info_destroy(); +} + +static int +lnet_ni_tq_credits(struct lnet_ni *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_net->net_tunables.lct_max_tx_credits; + + credits = ni->ni_net->net_tunables.lct_max_tx_credits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_net->net_tunables.lct_peer_tx_credits); + credits = min(credits, ni->ni_net->net_tunables.lct_max_tx_credits); + + return credits; +} + +static void +lnet_ni_unlink_locked(struct lnet_ni *ni) +{ + if (!list_empty(&ni->ni_cptlist)) { + list_del_init(&ni->ni_cptlist); + lnet_ni_decref_locked(ni, 0); + } + + /* move it to zombie list and nobody can find it anymore */ + LASSERT(!list_empty(&ni->ni_netlist)); + list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); + lnet_ni_decref_locked(ni, 0); +} + +static void +lnet_clear_zombies_nis_locked(struct lnet_net *net) +{ + int i; + int islo; + struct lnet_ni *ni; + struct list_head *zombie_list = &net->net_ni_zombie; + + /* + * Now wait for the NIs I just nuked to show up on the zombie + * list and shut them down in guaranteed thread context + */ + i = 2; + while 
(!list_empty(zombie_list)) { + int *ref; + int j; + + ni = list_entry(zombie_list->next, + struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + /* the ni should be in deleting state. If it's not it's + * a bug */ + LASSERT(ni->ni_state == LNET_NI_STATE_DELETING); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_netlist, zombie_list); + break; + } + + if (!list_empty(&ni->ni_netlist)) { + lnet_net_unlock(LNET_LOCK_EX); + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, + "Waiting for zombie LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_net->net_lnd->lnd_type == LOLND; + + LASSERT(!in_interrupt()); + (net->net_lnd->lnd_shutdown)(ni); + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_free(ni); + i = 2; + lnet_net_lock(LNET_LOCK_EX); + } +} + +/* shutdown down the NI and release refcount */ +static void +lnet_shutdown_lndni(struct lnet_ni *ni) +{ + int i; + struct lnet_net *net = ni->ni_net; + + lnet_net_lock(LNET_LOCK_EX); + ni->ni_state = LNET_NI_STATE_DELETING; + lnet_ni_unlink_locked(ni); + lnet_incr_dlc_seq(); + lnet_net_unlock(LNET_LOCK_EX); + + /* clear messages for this NI on the lazy portal */ + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_clear_lazy_portal(ni, i, "Shutting down NI"); + + lnet_net_lock(LNET_LOCK_EX); + lnet_clear_zombies_nis_locked(net); + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_shutdown_lndnet(struct lnet_net *net) +{ + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + + net->net_state = LNET_NET_STATE_DELETING; + + list_del_init(&net->net_list); + + while (!list_empty(&net->net_ni_list)) { + ni = list_entry(net->net_ni_list.next, + struct lnet_ni, ni_netlist); + lnet_net_unlock(LNET_LOCK_EX); + lnet_shutdown_lndni(ni); + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Do peer table cleanup for this net */ + lnet_peer_tables_cleanup(net); + + lnet_net_lock(LNET_LOCK_EX); + /* + * decrement ref count on lnd only when the entire network goes + * away + */ + net->net_lnd->lnd_refcount--; + + lnet_net_unlock(LNET_LOCK_EX); + + lnet_net_free(net); +} + +static void +lnet_shutdown_lndnets(void) +{ + struct lnet_net *net; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + LASSERT(the_lnet.ln_refcount == 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + + while (!list_empty(&the_lnet.ln_nets)) { + /* + * move the nets to the zombie list to avoid them being + * picked up for new work. LONET is also included in the + * Nets that will be moved to the zombie list + */ + net = list_entry(the_lnet.ln_nets.next, + struct lnet_net, net_list); + list_move(&net->net_list, &the_lnet.ln_net_zombie); + } + + /* Drop the cached loopback Net. 
*/ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through the net zombie list and delete each net */ + while (!list_empty(&the_lnet.ln_net_zombie)) { + net = list_entry(the_lnet.ln_net_zombie.next, + struct lnet_net, net_list); + lnet_shutdown_lndnet(net); + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) +{ + int rc = -EINVAL; + struct lnet_tx_queue *tq; + int i; + struct lnet_net *net = ni->ni_net; + + mutex_lock(&the_lnet.ln_lnd_mutex); + + if (tun) { + memcpy(&ni->ni_lnd_tunables, tun, sizeof(*tun)); + ni->ni_lnd_tunables_set = true; + } + + rc = (net->net_lnd->lnd_startup)(ni); + + mutex_unlock(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(net->net_lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + net->net_lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed0; + } + + ni->ni_state = LNET_NI_STATE_ACTIVE; + + /* We keep a reference on the loopback net through the loopback NI */ + if (net->net_lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + ni->ni_net->net_tunables.lct_peer_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_rtr_credits = 0; + ni->ni_net->net_tunables.lct_max_tx_credits = 0; + ni->ni_net->net_tunables.lct_peer_timeout = 0; + return 0; + } + + if (ni->ni_net->net_tunables.lct_peer_tx_credits == 0 || + ni->ni_net->net_tunables.lct_max_tx_credits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(net->net_lnd->lnd_type), + ni->ni_net->net_tunables.lct_peer_tx_credits == 0 ? + "" : "per-peer "); + /* shutdown the NI since if we get here then it must've already + * been started + */ + lnet_shutdown_lndni(ni); + return -EINVAL; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + atomic_set(&ni->ni_tx_credits, + lnet_ni_tq_credits(ni) * ni->ni_ncpts); + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), + ni->ni_net->net_tunables.lct_peer_tx_credits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + ni->ni_net->net_tunables.lct_peer_timeout); + + return 0; +failed0: + lnet_ni_free(ni); + return rc; +} + +static int +lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) +{ + struct lnet_ni *ni; + struct lnet_net *net_l = NULL; + struct list_head local_ni_list; + int rc; + int ni_count = 0; + __u32 lnd_type; + struct lnet_lnd *lnd; + int peer_timeout = + net->net_tunables.lct_peer_timeout; + int maxtxcredits = + net->net_tunables.lct_max_tx_credits; + int peerrtrcredits = + net->net_tunables.lct_peer_rtr_credits; + + INIT_LIST_HEAD(&local_ni_list); + + /* + * make sure that this net is unique. If it isn't then + * we are adding interfaces to an already existing network, and + * 'net' is just a convenient way to pass in the list. + * if it is unique we need to find the LND and load it if + * necessary. 
*/
+	if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) {
+		lnd_type = LNET_NETTYP(net->net_id);
+
+		LASSERT(libcfs_isknown_lnd(lnd_type));
+
+		mutex_lock(&the_lnet.ln_lnd_mutex);
+		lnd = lnet_find_lnd_by_type(lnd_type);
+
+		if (lnd == NULL) {
+			mutex_unlock(&the_lnet.ln_lnd_mutex);
+			rc = request_module("%s", libcfs_lnd2modname(lnd_type));
+			mutex_lock(&the_lnet.ln_lnd_mutex);
+
+			lnd = lnet_find_lnd_by_type(lnd_type);
+			if (lnd == NULL) {
+				mutex_unlock(&the_lnet.ln_lnd_mutex);
+				CERROR("Can't load LND %s, module %s, rc=%d\n",
+				       libcfs_lnd2str(lnd_type),
+				       libcfs_lnd2modname(lnd_type), rc);
+#ifndef HAVE_MODULE_LOADING_SUPPORT
+				LCONSOLE_ERROR_MSG(0x104, "Your kernel must be "
+						   "compiled with kernel module "
+						   "loading support.");
+#endif
+				rc = -EINVAL;
+				goto failed0;
+			}
+		}
+
+		lnet_net_lock(LNET_LOCK_EX);
+		lnd->lnd_refcount++;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		net->net_lnd = lnd;
+
+		mutex_unlock(&the_lnet.ln_lnd_mutex);
+
+		net_l = net;
+	}
+
+	/*
+	 * net_l: if the network being added is unique then net_l
+	 * will point to that network
+	 * if the network being added is not unique then
+	 * net_l points to the existing network.
+	 *
+	 * When we enter the loop below, we'll pick NIs off the
+	 * network being added and start them up, then add them to
+	 * a local ni list. Once we've successfully started all
+	 * the NIs then we join the local NI list (of started up
+	 * networks) with the net_l->net_ni_list, which should
+	 * point to the correct network to add the new ni list to
+	 *
+	 * If any of the new NIs fail to start up, then we want to
+	 * iterate through the local ni list, which should include
+	 * any NIs which were successfully started up, and shut
+	 * them down.
+	 *
+	 * After that we want to delete the network being added,
+	 * to avoid a memory leak.
+	 */
+
+	/*
+	 * When a network uses TCP bonding then all its interfaces
+	 * must be specified when the network is first defined: the
+	 * TCP bonding code doesn't allow for interfaces to be added
+	 * or removed.
+	 */
+	if (net_l != net && net_l != NULL && use_tcp_bonding &&
+	    LNET_NETTYP(net_l->net_id) == SOCKLND) {
+		rc = -EINVAL;
+		goto failed0;
+	}
+
+	while (!list_empty(&net->net_ni_added)) {
+		ni = list_entry(net->net_ni_added.next, struct lnet_ni,
+				ni_netlist);
+		list_del_init(&ni->ni_netlist);
+
+		/* make sure that the NI we're about to start
+		 * up is actually unique. If it's not, fail. */
+		if (!lnet_ni_unique_net(&net_l->net_ni_list,
+					ni->ni_interfaces[0])) {
+			rc = -EINVAL;
+			goto failed1;
+		}
+
+		/* adjust the pointer to the parent network, just in case
+		 * the net is a duplicate */
+		ni->ni_net = net_l;
+
+		rc = lnet_startup_lndni(ni, tun);
+
+		LASSERT(ni->ni_net->net_tunables.lct_peer_timeout <= 0 ||
+			ni->ni_net->net_lnd->lnd_query != NULL);
+
+		if (rc < 0)
+			goto failed1;
+
+		lnet_ni_addref(ni);
+		list_add_tail(&ni->ni_netlist, &local_ni_list);
+
+		ni_count++;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_splice_tail(&local_ni_list, &net_l->net_ni_list);
+	lnet_incr_dlc_seq();
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* if the network is not unique then we don't want to keep
+	 * it around after we're done. Free it. Otherwise add that
+	 * net to the global the_lnet.ln_nets */
+	if (net_l != net && net_l != NULL) {
+		/*
+		 * TODO - note.
currently the tunables can not be updated + * once added + */ + lnet_net_free(net); + } else { + net->net_state = LNET_NET_STATE_ACTIVE; + /* + * restore tunables after it has been overwitten by the + * lnd + */ + if (peer_timeout != -1) + net->net_tunables.lct_peer_timeout = peer_timeout; + if (maxtxcredits != -1) + net->net_tunables.lct_max_tx_credits = maxtxcredits; + if (peerrtrcredits != -1) + net->net_tunables.lct_peer_rtr_credits = peerrtrcredits; + + lnet_net_lock(LNET_LOCK_EX); + list_add_tail(&net->net_list, &the_lnet.ln_nets); + lnet_net_unlock(LNET_LOCK_EX); + } + + return ni_count; + +failed1: + /* + * shutdown the new NIs that are being started up + * free the NET being started + */ + while (!list_empty(&local_ni_list)) { + ni = list_entry(local_ni_list.next, struct lnet_ni, + ni_netlist); + + lnet_shutdown_lndni(ni); + } + +failed0: + lnet_net_free(net); + + return rc; +} + +static int +lnet_startup_lndnets(struct list_head *netlist) +{ + struct lnet_net *net; + int rc; + int ni_count = 0; + + /* + * Change to running state before bringing up the LNDs. This + * allows lnet_shutdown_lndnets() to assert that we've passed + * through here. + */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(netlist)) { + net = list_entry(netlist->next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + rc = lnet_startup_lndnet(net, NULL); + + if (rc < 0) + goto failed; + + ni_count += rc; + } + + return ni_count; +failed: + lnet_shutdown_lndnets(); + + return rc; +} + +/** + * Initialize LNet library. + * + * Automatically called at module loading time. Caller has to call + * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the + * latter returned 0. It must be called exactly once. + * + * \retval 0 on success + * \retval -ve on failures. + */ +int lnet_lib_init(void) +{ + int rc; + + lnet_assert_wire_constants(); + + memset(&the_lnet, 0, sizeof(the_lnet)); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_table; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), " + "please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -E2BIG; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return rc; + } + + the_lnet.ln_refcount = 0; + LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); + INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_net_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. 
*/ + lnet_register_lnd(&the_lolnd); + return 0; +} + +/** + * Finalize LNet library. + * + * \pre lnet_lib_init() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + */ +void lnet_lib_exit(void) +{ + LASSERT(the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + struct lnet_lnd, lnd_list)); + lnet_destroy_locks(); +} + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. + * + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. + */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + int ni_count; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; + struct list_head net_head; + struct lnet_net *net; + + INIT_LIST_HEAD(&net_head); + + mutex_lock(&the_lnet.ln_api_mutex); + + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + /* create a network for Loopback network */ + net = lnet_net_alloc(LNET_MKNET(LOLND, 0), &net_head); + if (net == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + /* Add in the loopback NI */ + if (lnet_ni_alloc(net, NULL, NULL) == NULL) { + rc = -ENOMEM; + goto err_empty_list; + } + + /* If LNet is being initialized via DLC it is possible + * that the user requests not to load module parameters (ones which + * are supported by DLC) on initialization. Therefore, make sure not + * to load networks, routes and forwarding from module parameters + * in this case. On cleanup in case of failure only clean up + * routes if it has been loaded */ + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_networks(&net_head, lnet_get_networks(), + use_tcp_bonding); + if (rc < 0) + goto err_empty_list; + } + + ni_count = lnet_startup_lndnets(&net_head); + if (ni_count < 0) { + rc = ni_count; + goto err_empty_list; + } + + if (!the_lnet.ln_nis_from_mod_params) { + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto err_shutdown_lndnis; + + rc = lnet_check_routes(); + if (rc != 0) + goto err_destroy_routes; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto err_destroy_routes; + } + + rc = lnet_acceptor_start(); + if (rc != 0) + goto err_destroy_routes; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... 
*/ + + rc = lnet_ping_info_setup(&pinfo, &md_handle, ni_count, true); + if (rc != 0) + goto err_acceptor_stop; + + lnet_ping_target_update(pinfo, md_handle); + + rc = lnet_router_checker_start(); + if (rc != 0) + goto err_stop_ping; + + lnet_fault_init(); + lnet_proc_init(); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; + +err_stop_ping: + lnet_ping_target_fini(); +err_acceptor_stop: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); +err_destroy_routes: + if (!the_lnet.ln_nis_from_mod_params) + lnet_destroy_routes(); +err_shutdown_lndnis: + lnet_shutdown_lndnets(); +err_empty_list: + lnet_unprepare(); + LASSERT(rc < 0); + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + struct lnet_net *net; + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. + * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini() +{ + mutex_lock(&the_lnet.ln_api_mutex); + + LASSERT(the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT(!the_lnet.ln_niinit_self); + + lnet_fault_fini(); + + lnet_proc_fini(); + lnet_router_checker_stop(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnets(); + lnet_unprepare(); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] cfg_ni NI config information + * \param[out] tun network and LND tunables + */ +static void +lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + size_t min_size = 0; + int i; + + if (!ni || !cfg_ni || !tun) + return; + + if (ni->ni_interfaces[0] != NULL) { + for (i = 0; i < ARRAY_SIZE(ni->ni_interfaces); i++) { + if (ni->ni_interfaces[i] != NULL) { + strncpy(cfg_ni->lic_ni_intf[i], + ni->ni_interfaces[i], + sizeof(cfg_ni->lic_ni_intf[i])); + } + } + } + + cfg_ni->lic_nid = ni->ni_nid; + cfg_ni->lic_status = ni->ni_status->ns_status; + cfg_ni->lic_tcp_bonding = use_tcp_bonding; + cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; + + memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); + + if (stats) { + stats->iel_send_count = atomic_read(&ni->ni_stats.send_count); + stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count); + } + + /* + * tun->lt_tun will always be present, but in order to be + * backwards compatible, we need to deal with the cases when + * tun->lt_tun is smaller than what the kernel has, because it + * comes from an older version of a userspace program, then we'll + * need to copy as much information as we have available space. 
+ */ + min_size = tun_size - sizeof(tun->lt_cmn); + memcpy(&tun->lt_tun, &ni->ni_lnd_tunables, min_size); + + /* copy over the cpts */ + if (ni->ni_ncpts == LNET_CPT_NUMBER && + ni->ni_cpts == NULL) { + for (i = 0; i < ni->ni_ncpts; i++) + cfg_ni->lic_cpts[i] = i; + } else { + for (i = 0; + ni->ni_cpts != NULL && i < ni->ni_ncpts && + i < LNET_MAX_SHOW_NUM_CPT; + i++) + cfg_ni->lic_cpts[i] = ni->ni_cpts[i]; + } + cfg_ni->lic_ncpts = ni->ni_ncpts; +} + +/** + * NOTE: This is a legacy function left in the code to be backwards + * compatible with older userspace programs. It should eventually be + * removed. + * + * Grabs the ni data from the ni structure and fills the out + * parameters + * + * \param[in] ni network interface structure + * \param[out] config config information + */ +static void +lnet_fill_ni_info_legacy(struct lnet_ni *ni, + struct lnet_ioctl_config_data *config) +{ + struct lnet_ioctl_net_config *net_config; + struct lnet_ioctl_config_lnd_tunables *lnd_cfg = NULL; + size_t min_size, tunable_size = 0; + int i; + + if (!ni || !config) + return; + + net_config = (struct lnet_ioctl_net_config *) config->cfg_bulk; + if (!net_config) + return; + + BUILD_BUG_ON(ARRAY_SIZE(ni->ni_interfaces) != + ARRAY_SIZE(net_config->ni_interfaces)); + + for (i = 0; i < ARRAY_SIZE(ni->ni_interfaces); i++) { + if (!ni->ni_interfaces[i]) + break; + + strncpy(net_config->ni_interfaces[i], + ni->ni_interfaces[i], + sizeof(net_config->ni_interfaces[i])); + } + + config->cfg_nid = ni->ni_nid; + config->cfg_config_u.cfg_net.net_peer_timeout = + ni->ni_net->net_tunables.lct_peer_timeout; + config->cfg_config_u.cfg_net.net_max_tx_credits = + ni->ni_net->net_tunables.lct_max_tx_credits; + config->cfg_config_u.cfg_net.net_peer_tx_credits = + ni->ni_net->net_tunables.lct_peer_tx_credits; + config->cfg_config_u.cfg_net.net_peer_rtr_credits = + ni->ni_net->net_tunables.lct_peer_rtr_credits; + + net_config->ni_status = ni->ni_status->ns_status; + + if (ni->ni_cpts) { + int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); + + for (i = 0; i < num_cpts; i++) + net_config->ni_cpts[i] = ni->ni_cpts[i]; + + config->cfg_ncpts = num_cpts; + } + + /* + * See if user land tools sent in a newer and larger version + * of struct lnet_tunables than what the kernel uses. 
+ */ + min_size = sizeof(*config) + sizeof(*net_config); + + if (config->cfg_hdr.ioc_len > min_size) + tunable_size = config->cfg_hdr.ioc_len - min_size; + + /* Don't copy too much data to user space */ + min_size = min(tunable_size, sizeof(ni->ni_lnd_tunables)); + lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk; + + if (lnd_cfg && min_size) { + memcpy(&lnd_cfg->lt_tun, &ni->ni_lnd_tunables, min_size); + config->cfg_config_u.cfg_net.net_interface_count = 1; + + /* Tell user land that kernel side has less data */ + if (tunable_size > sizeof(ni->ni_lnd_tunables)) { + min_size = tunable_size - sizeof(ni->ni_lnd_tunables); + config->cfg_hdr.ioc_len -= min_size; + } + } +} + +struct lnet_ni * +lnet_get_ni_idx_locked(int idx) +{ + struct lnet_ni *ni; + struct lnet_net *net; + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (idx-- == 0) + return ni; + } + } + + return NULL; +} + +struct lnet_ni * +lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) +{ + struct lnet_ni *ni; + struct lnet_net *net = mynet; + + if (prev == NULL) { + if (net == NULL) + net = list_entry(the_lnet.ln_nets.next, struct lnet_net, + net_list); + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + + return ni; + } + + if (prev->ni_netlist.next == &prev->ni_net->net_ni_list) { + /* if you reached the end of the ni list and the net is + * specified, then there are no more nis in that net */ + if (net != NULL) + return NULL; + + /* we reached the end of this net ni list. move to the + * next net */ + if (prev->ni_net->net_list.next == &the_lnet.ln_nets) + /* no more nets and no more NIs. */ + return NULL; + + /* get the next net */ + net = list_entry(prev->ni_net->net_list.next, struct lnet_net, + net_list); + /* get the ni on it */ + ni = list_entry(net->net_ni_list.next, struct lnet_ni, + ni_netlist); + + return ni; + } + + /* there are more nis left */ + ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist); + + return ni; +} + +int +lnet_get_net_config(struct lnet_ioctl_config_data *config) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + int idx = config->cfg_count; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(idx); + + if (ni != NULL) { + rc = 0; + lnet_ni_lock(ni); + lnet_fill_ni_info_legacy(ni, config); + lnet_ni_unlock(ni); + } + + lnet_net_unlock(cpt); + return rc; +} + +int +lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni, + struct lnet_ioctl_config_lnd_tunables *tun, + struct lnet_ioctl_element_stats *stats, + __u32 tun_size) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!cfg_ni || !tun || !stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(cfg_ni->lic_idx); + + if (ni) { + rc = 0; + lnet_ni_lock(ni); + lnet_fill_ni_info(ni, cfg_ni, tun, stats, tun_size); + lnet_ni_unlock(ni); + } + + lnet_net_unlock(cpt); + return rc; +} + +static int lnet_add_net_common(struct lnet_net *net, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + __u32 net_id; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; + int rc; + struct lnet_remotenet *rnet; + int net_ni_count; + int num_acceptor_nets; + + lnet_net_lock(LNET_LOCK_EX); + rnet = lnet_find_rnet_locked(net->net_id); + lnet_net_unlock(LNET_LOCK_EX); + /* + * make sure that the net added doesn't invalidate the current + * configuration LNet is keeping + */ + if (rnet) { + CERROR("Adding net %s will invalidate routing 
configuration\n", + libcfs_net2str(net->net_id)); + lnet_net_free(net); + return -EUSERS; + } + + /* + * make sure you calculate the correct number of slots in the ping + * info. Since the ping info is a flattened list of all the NIs, + * we should allocate enough slots to accomodate the number of NIs + * which will be added. + * + * since ni hasn't been configured yet, use + * lnet_get_net_ni_count_pre() which checks the net_ni_added list + */ + net_ni_count = lnet_get_net_ni_count_pre(net); + + rc = lnet_ping_info_setup(&pinfo, &md_handle, + net_ni_count + lnet_get_ni_count(), + false); + if (rc < 0) { + lnet_net_free(net); + return rc; + } + + if (tun) + memcpy(&net->net_tunables, + &tun->lt_cmn, sizeof(net->net_tunables)); + else + memset(&net->net_tunables, -1, sizeof(net->net_tunables)); + + /* + * before starting this network get a count of the current TCP + * networks which require the acceptor thread running. If that + * count is == 0 before we start up this network, then we'd want to + * start up the acceptor thread after starting up this network + */ + num_acceptor_nets = lnet_count_acceptor_nets(); + + net_id = net->net_id; + + rc = lnet_startup_lndnet(net, + (tun) ? &tun->lt_tun : NULL); + if (rc < 0) + goto failed; + + lnet_net_lock(LNET_LOCK_EX); + net = lnet_get_net_locked(net_id); + lnet_net_unlock(LNET_LOCK_EX); + + LASSERT(net); + + /* + * Start the acceptor thread if this is the first network + * being added that requires the thread. + */ + if (net->net_lnd->lnd_accept && num_acceptor_nets == 0) { + rc = lnet_acceptor_start(); + if (rc < 0) { + /* shutdown the net that we just started */ + CERROR("Failed to start up acceptor thread\n"); + lnet_shutdown_lndnet(net); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_net_added(net); + lnet_net_unlock(LNET_LOCK_EX); + + lnet_ping_target_update(pinfo, md_handle); + + return 0; + +failed: + lnet_ping_md_unlink(pinfo, &md_handle); + lnet_ping_info_free(pinfo); + return rc; +} + +static int lnet_handle_legacy_ip2nets(char *ip2nets, + struct lnet_ioctl_config_lnd_tunables *tun) +{ + struct lnet_net *net; + char *nets; + int rc; + struct list_head net_head; + + INIT_LIST_HEAD(&net_head); + + rc = lnet_parse_ip2nets(&nets, ip2nets); + if (rc < 0) + return rc; + + rc = lnet_parse_networks(&net_head, nets, use_tcp_bonding); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + rc = lnet_add_net_common(net, tun); + if (rc < 0) + goto out; + } + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + while (!list_empty(&net_head)) { + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + int rc, i; + __u32 net_id; + + /* get the tunables if they are available */ + if (conf->lic_cfg_hdr.ioc_len >= + sizeof(*conf) + sizeof(*tun)) + tun = (struct lnet_ioctl_config_lnd_tunables *) + conf->lic_bulk; + + /* handle legacy ip2nets from DLC */ + if (conf->lic_legacy_ip2nets[0] != '\0') + return lnet_handle_legacy_ip2nets(conf->lic_legacy_ip2nets, + tun); + + net_id = LNET_NIDNET(conf->lic_nid); + + net = lnet_net_alloc(net_id, NULL); + if (!net) + return -ENOMEM; + + for (i = 0; i < conf->lic_ncpts; i++) { + if (conf->lic_cpts[i] >= LNET_CPT_NUMBER) + 
return -EINVAL; + } + + ni = lnet_ni_alloc_w_cpt_array(net, conf->lic_cpts, conf->lic_ncpts, + conf->lic_ni_intf[0]); + if (!ni) + return -ENOMEM; + + mutex_lock(&the_lnet.ln_api_mutex); + + rc = lnet_add_net_common(net, tun); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) +{ + struct lnet_net *net; + struct lnet_ni *ni; + __u32 net_id = LNET_NIDNET(conf->lic_nid); + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; + int rc; + int net_count; + __u32 addr; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (!net) { + CERROR("net %s not found\n", + libcfs_net2str(net_id)); + rc = -ENOENT; + goto net_unlock; + } + + addr = LNET_NIDADDR(conf->lic_nid); + if (addr == 0) { + /* remove the entire net */ + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_info_setup(&pinfo, &md_handle, + lnet_get_ni_count() - net_count, + false); + if (rc != 0) + goto out; + + lnet_shutdown_lndnet(net); + + if (lnet_count_acceptor_nets() == 0) + lnet_acceptor_stop(); + + lnet_ping_target_update(pinfo, md_handle); + + goto out; + } + + ni = lnet_nid2ni_locked(conf->lic_nid, 0); + if (!ni) { + CERROR("nid %s not found \n", + libcfs_nid2str(conf->lic_nid)); + rc = -ENOENT; + goto net_unlock; + } + + net_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_info_setup(&pinfo, &md_handle, + lnet_get_ni_count() - 1, false); + if (rc != 0) + goto out; + + lnet_shutdown_lndni(ni); + + if (lnet_count_acceptor_nets() == 0) + lnet_acceptor_stop(); + + lnet_ping_target_update(pinfo, md_handle); + + /* check if the net is empty and remove it if it is */ + if (net_count == 1) + lnet_shutdown_lndnet(net); + + goto out; + +net_unlock: + lnet_net_unlock(0); +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +/* + * lnet_dyn_add_net and lnet_dyn_del_net are now deprecated. + * They are only expected to be called for unique networks. + * That can be as a result of older DLC library + * calls. Multi-Rail DLC and beyond no longer uses these APIs. + */ +int +lnet_dyn_add_net(struct lnet_ioctl_config_data *conf) +{ + struct lnet_net *net; + struct list_head net_head; + int rc; + struct lnet_ioctl_config_lnd_tunables tun; + char *nets = conf->cfg_config_u.cfg_net.net_intf; + + INIT_LIST_HEAD(&net_head); + + /* Create a net/ni structures for the network string */ + rc = lnet_parse_networks(&net_head, nets, use_tcp_bonding); + if (rc <= 0) + return rc == 0 ? 
-EINVAL : rc; + + mutex_lock(&the_lnet.ln_api_mutex); + + if (rc > 1) { + rc = -EINVAL; /* only add one network per call */ + goto out_unlock_clean; + } + + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + + LASSERT(lnet_net_unique(net->net_id, &the_lnet.ln_nets, NULL)); + + memset(&tun, 0, sizeof(tun)); + + tun.lt_cmn.lct_peer_timeout = + conf->cfg_config_u.cfg_net.net_peer_timeout; + tun.lt_cmn.lct_peer_tx_credits = + conf->cfg_config_u.cfg_net.net_peer_tx_credits; + tun.lt_cmn.lct_peer_rtr_credits = + conf->cfg_config_u.cfg_net.net_peer_rtr_credits; + tun.lt_cmn.lct_max_tx_credits = + conf->cfg_config_u.cfg_net.net_max_tx_credits; + + rc = lnet_add_net_common(net, &tun); + +out_unlock_clean: + mutex_unlock(&the_lnet.ln_api_mutex); + while (!list_empty(&net_head)) { + /* net_head list is empty in success case */ + net = list_entry(net_head.next, struct lnet_net, net_list); + list_del_init(&net->net_list); + lnet_net_free(net); + } + return rc; +} + +int +lnet_dyn_del_net(__u32 net_id) +{ + struct lnet_net *net; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; + int rc; + int net_ni_count; + + /* don't allow userspace to shutdown the LOLND */ + if (LNET_NETTYP(net_id) == LOLND) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(0); + + net = lnet_get_net_locked(net_id); + if (net == NULL) { + rc = -EINVAL; + goto out; + } + + net_ni_count = lnet_get_net_ni_count_locked(net); + + lnet_net_unlock(0); + + /* create and link a new ping info, before removing the old one */ + rc = lnet_ping_info_setup(&pinfo, &md_handle, + lnet_get_ni_count() - net_ni_count, false); + if (rc != 0) + goto out; + + lnet_shutdown_lndnet(net); + + if (lnet_count_acceptor_nets() == 0) + lnet_acceptor_stop(); + + lnet_ping_target_update(pinfo, md_handle); + +out: + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; +} + +void lnet_incr_dlc_seq(void) +{ + atomic_inc(&lnet_dlc_seq_no); +} + +__u32 lnet_get_dlc_seq_locked(void) +{ + return atomic_read(&lnet_dlc_seq_no); +} + +/** + * LNet ioctl handler. + * + */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + struct lnet_ioctl_config_data *config; + struct lnet_process_id id = {0}; + struct lnet_ni *ni; + int rc; + + BUILD_BUG_ON(sizeof(struct lnet_ioctl_net_config) + + sizeof(struct lnet_ioctl_config_data) > LIBCFS_IOC_DATA_MAX); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_route(config->cfg_net, + config->cfg_config_u.cfg_route.rtr_hop, + config->cfg_nid, + config->cfg_config_u.cfg_route. 
+ rtr_priority); + if (rc == 0) { + rc = lnet_check_routes(); + if (rc != 0) + lnet_del_route(config->cfg_net, + config->cfg_nid); + } + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_DEL_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_route(config->cfg_net, config->cfg_nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_ROUTE: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_route(config->cfg_count, + &config->cfg_net, + &config->cfg_config_u.cfg_route.rtr_hop, + &config->cfg_nid, + &config->cfg_config_u.cfg_route.rtr_flags, + &config->cfg_config_u.cfg_route. + rtr_priority); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_GET_LOCAL_NI: { + struct lnet_ioctl_config_ni *cfg_ni; + struct lnet_ioctl_config_lnd_tunables *tun = NULL; + struct lnet_ioctl_element_stats *stats; + __u32 tun_size; + + cfg_ni = arg; + /* get the tunables if they are available */ + if (cfg_ni->lic_cfg_hdr.ioc_len < + sizeof(*cfg_ni) + sizeof(*stats)+ sizeof(*tun)) + return -EINVAL; + + stats = (struct lnet_ioctl_element_stats *) + cfg_ni->lic_bulk; + tun = (struct lnet_ioctl_config_lnd_tunables *) + (cfg_ni->lic_bulk + sizeof(*stats)); + + tun_size = cfg_ni->lic_cfg_hdr.ioc_len - sizeof(*cfg_ni) - + sizeof(*stats); + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_config(cfg_ni, tun, stats, tun_size); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_NET: { + size_t total = sizeof(*config) + + sizeof(struct lnet_ioctl_net_config); + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_net_config(config); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_LNET_STATS: + { + struct lnet_ioctl_lnet_stats *lnet_stats = arg; + + if (lnet_stats->st_hdr.ioc_len < sizeof(*lnet_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + lnet_counters_get(&lnet_stats->st_cntrs); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_CONFIG_RTR: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (config->cfg_config_u.cfg_buffers.buf_enable) { + rc = lnet_rtrpools_enable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + lnet_rtrpools_disable(); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + + case IOC_LIBCFS_ADD_BUF: + config = arg; + + if (config->cfg_hdr.ioc_len < sizeof(*config)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_rtrpools_adjust(config->cfg_config_u.cfg_buffers. + buf_tiny, + config->cfg_config_u.cfg_buffers. + buf_small, + config->cfg_config_u.cfg_buffers. 
+ buf_large); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + + case IOC_LIBCFS_SET_NUMA_RANGE: { + struct lnet_ioctl_numa_range *numa; + numa = arg; + if (numa->nr_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + mutex_lock(&the_lnet.ln_api_mutex); + lnet_numa_range = numa->nr_range; + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_GET_NUMA_RANGE: { + struct lnet_ioctl_numa_range *numa; + numa = arg; + if (numa->nr_hdr.ioc_len != sizeof(*numa)) + return -EINVAL; + numa->nr_range = lnet_numa_range; + return 0; + } + + case IOC_LIBCFS_GET_BUF: { + struct lnet_ioctl_pool_cfg *pool_cfg; + size_t total = sizeof(*config) + sizeof(*pool_cfg); + + config = arg; + + if (config->cfg_hdr.ioc_len < total) + return -EINVAL; + + pool_cfg = (struct lnet_ioctl_pool_cfg *)config->cfg_bulk; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_ADD_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_add_peer_ni_to_peer(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_DEL_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_peer_ni_from_peer(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_INFO: { + struct lnet_ioctl_peer *peer_info = arg; + + if (peer_info->pr_hdr.ioc_len < sizeof(*peer_info)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_ni_info( + peer_info->pr_count, + &peer_info->pr_nid, + peer_info->pr_lnd_u.pr_peer_credits.cr_aliveness, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ncpt, + &peer_info->pr_lnd_u.pr_peer_credits.cr_refcount, + &peer_info->pr_lnd_u.pr_peer_credits.cr_ni_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_rtr_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_tx_credits, + &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_qnob); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_NI: { + struct lnet_ioctl_peer_cfg *cfg = arg; + struct lnet_peer_ni_credit_info __user *lpni_cri; + struct lnet_ioctl_element_stats __user *lpni_stats; + size_t usr_size = sizeof(*lpni_cri) + sizeof(*lpni_stats); + + if ((cfg->prcfg_hdr.ioc_len != sizeof(*cfg)) || + (cfg->prcfg_size != usr_size)) + return -EINVAL; + + lpni_cri = cfg->prcfg_bulk; + lpni_stats = cfg->prcfg_bulk + sizeof(*lpni_cri); + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_info(cfg->prcfg_count, &cfg->prcfg_prim_nid, + &cfg->prcfg_cfg_nid, &cfg->prcfg_mr, + lpni_cri, lpni_stats); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_NOTIFY_ROUTER: + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + cfs_time_current() - + cfs_time_seconds(cfs_time_current_sec() - + (time_t)data->ioc_u64[0])); + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_testprotocompat = data->ioc_flags; 
+ lnet_net_unlock(LNET_LOCK_EX); + return 0; + + case IOC_LIBCFS_LNET_FAULT: + return lnet_fault_ctl(data->ioc_flags, data); + + case IOC_LIBCFS_PING: { + signed long timeout; + + id.nid = data->ioc_nid; + id.pid = data->ioc_u32[0]; + + /* Don't block longer than 2 minutes */ + if (data->ioc_u32[1] > 120 * MSEC_PER_SEC) + return -EINVAL; + + /* If timestamp is negative then disable timeout */ + if ((s32)data->ioc_u32[1] < 0) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = msecs_to_jiffies(data->ioc_u32[1]); + + rc = lnet_ping(id, timeout, data->ioc_pbuf1, + data->ioc_plen1 / sizeof(struct lnet_process_id)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; + } + + default: + ni = lnet_net2ni_addref(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_net->net_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_net->net_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +void LNetDebugPeer(struct lnet_process_id id) +{ + lnet_debug_peer(id.nid); +} +EXPORT_SYMBOL(LNetDebugPeer); + +/** + * Determine if the specified peer \a nid is on the local node. + * + * \param nid peer nid to check + * + * \retval true If peer NID is on the local node. + * \retval false If peer NID is not on the local node. + */ +bool LNetIsPeerLocal(lnet_nid_t nid) +{ + struct lnet_net *net; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid) { + lnet_net_unlock(cpt); + return true; + } + } + } + lnet_net_unlock(cpt); + + return false; +} +EXPORT_SYMBOL(LNetIsPeerLocal); + +/** + * Retrieve the struct lnet_process_id ID of LNet interface at \a index. + * Note that all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * struct lnet_process_id ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. 
+ */ +int +LNetGetId(unsigned int index, struct lnet_process_id *id) +{ + struct lnet_ni *ni; + struct lnet_net *net; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (index-- != 0) + continue; + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +static int lnet_ping(struct lnet_process_id id, signed long timeout, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_handle_eq eqh; + struct lnet_handle_md mdh; + struct lnet_event event; + struct lnet_md md = { NULL }; + int which; + int unlinked = 0; + int replied = 0; + const signed long a_long_time = msecs_to_jiffies(60 * MSEC_PER_SEC); + int infosz; + struct lnet_ping_info *info; + struct lnet_process_id tmpid; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + infosz = offsetof(struct lnet_ping_info, pi_ni[n_ids]); + + /* n_ids limit is arbitrary */ + if (n_ids <= 0 || n_ids > 20 || id.nid == LNET_NID_ANY) + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + LIBCFS_ALLOC(info, infosz); + if (info == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + /* initialize md content */ + md.start = info; + md.length = infosz; + md.threshold = 2; /*GET/REPLY*/ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = NULL; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT(rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + unlinked = 1; + timeout = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT(rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto out_1; + } + + nob = rc; + LASSERT(nob >= 0 && nob <= infosz); + + rc = -EPROTO; /* if I can't parse... 
*/ + + if (nob < 8) { + /* can't check magic/version */ + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto out_1; + } + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), info->pi_magic); + goto out_1; + } + + if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), info->pi_features); + goto out_1; + } + + if (nob < offsetof(struct lnet_ping_info, pi_ni[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(struct lnet_ping_info, pi_ni[0])); + goto out_1; + } + + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; + + if (nob < offsetof(struct lnet_ping_info, pi_ni[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(struct lnet_ping_info, pi_ni[n_ids])); + goto out_1; + } + + rc = -EFAULT; /* If I SEGV... */ + + memset(&tmpid, 0, sizeof(tmpid)); + for (i = 0; i < n_ids; i++) { + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto out_1; + } + rc = info->pi_nnis; + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT(rc2 == 0); + + out_0: + LIBCFS_FREE(info, infosz); + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c new file mode 100644 index 0000000000000..2c15e1f5f79a2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -0,0 +1,1709 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include + +/* tmp struct for parsing routes */ +struct lnet_text_buf { + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +}; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +#define SPACESTR " \t\v\r\n" +#define DELIMITERS ":()[]" + +static void +lnet_syntax(const char *name, const char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +static int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +bool +lnet_net_unique(__u32 net_id, struct list_head *netlist, + struct lnet_net **net) +{ + struct lnet_net *net_l; + + if (!netlist) + return true; + + list_for_each_entry(net_l, netlist, net_list) { + if (net_l->net_id == net_id) { + if (net != NULL) + *net = net_l; + return false; + } + } + + return true; +} + +/* check that the NI is unique within the list of NIs already added to + * a network */ +bool +lnet_ni_unique_net(struct list_head *nilist, char *iface) +{ + struct list_head *tmp; + struct lnet_ni *ni; + + list_for_each(tmp, nilist) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + + if (ni->ni_interfaces[0] != NULL && + strncmp(ni->ni_interfaces[0], iface, strlen(iface)) == 0) + return false; + } + + return true; +} + +/* check that the NI is unique to the interfaces with in the same NI. 
+ * This is only a consideration if use_tcp_bonding is set */ +static bool +lnet_ni_unique_ni(char *iface_list[LNET_NUM_INTERFACES], char *iface) +{ + int i; + for (i = 0; i < LNET_NUM_INTERFACES; i++) { + if (iface_list[i] != NULL && + strncmp(iface_list[i], iface, strlen(iface)) == 0) + return false; + } + + return true; +} + +static bool +in_array(__u32 *array, __u32 size, __u32 value) +{ + int i; + + for (i = 0; i < size; i++) { + if (array[i] == value) + return false; + } + + return true; +} + +static int +lnet_net_append_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + __u32 *added_cpts = NULL; + int i, j = 0, rc = 0; + + /* + * no need to go further since a subset of the NIs already exist on + * all CPTs + */ + if (net->net_ncpts == LNET_CPT_NUMBER) + return 0; + + if (cpts == NULL) { + /* there is an NI which will exist on all CPTs */ + if (net->net_cpts != NULL) + LIBCFS_FREE(net->net_cpts, sizeof(*net->net_cpts) * + net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + return 0; + } + + if (net->net_cpts == NULL) { + LIBCFS_ALLOC(net->net_cpts, sizeof(*net->net_cpts) * ncpts); + if (net->net_cpts == NULL) + return -ENOMEM; + memcpy(net->net_cpts, cpts, ncpts); + net->net_ncpts = ncpts; + return 0; + } + + LIBCFS_ALLOC(added_cpts, sizeof(*added_cpts) * LNET_CPT_NUMBER); + if (added_cpts == NULL) + return -ENOMEM; + + for (i = 0; i < ncpts; i++) { + if (!in_array(net->net_cpts, net->net_ncpts, cpts[i])) { + added_cpts[j] = cpts[i]; + j++; + } + } + + /* append the new cpts if any to the list of cpts in the net */ + if (j > 0) { + __u32 *array = NULL, *loc; + __u32 total_entries = j + net->net_ncpts; + + LIBCFS_ALLOC(array, sizeof(*net->net_cpts) * total_entries); + if (array == NULL) { + rc = -ENOMEM; + goto failed; + } + + memcpy(array, net->net_cpts, net->net_ncpts); + loc = array + net->net_ncpts; + memcpy(loc, added_cpts, j); + + LIBCFS_FREE(net->net_cpts, sizeof(*net->net_cpts) * + net->net_ncpts); + net->net_ncpts = total_entries; + net->net_cpts = array; + } + +failed: + LIBCFS_FREE(added_cpts, sizeof(*added_cpts) * LNET_CPT_NUMBER); + + return rc; +} + +static void +lnet_net_remove_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) +{ + struct lnet_ni *ni; + int rc; + + /* + * Operation Assumption: + * This function is called after an NI has been removed from + * its parent net. + * + * if we're removing an NI which exists on all CPTs then + * we have to check if any of the other NIs on this net also + * exists on all CPTs. If none, then we need to build our Net CPT + * list based on the remaining NIs. + * + * If the NI being removed exists on a subset of the CPTs then we + * also rebuild the Net CPT list based on the remaining NIs, which + * should result in the expected Net CPT list. + */ + + /* + * sometimes this function can be called due to some failure + * creating an NI, before any of the cpts are allocated, so check + * for that case and don't do anything + */ + if (ncpts == 0) + return; + + if (ncpts == LNET_CPT_NUMBER) { + /* + * first iteration through the NI list in the net to see + * if any of the NIs exist on all the CPTs. If one is + * found then our job is done. + */ + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_ncpts == LNET_CPT_NUMBER) + return; + } + } + + /* + * Rebuild the Net CPT list again, including only the + * CPTs which the remaining NIs are associated with. 
*/ + if (net->net_cpts != NULL) { + LIBCFS_FREE(net->net_cpts, + sizeof(*net->net_cpts) * net->net_ncpts); + net->net_cpts = NULL; + } + + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, + net); + if (rc != 0) { + CERROR("Out of Memory\n"); + /* + * do our best to keep on going. Delete + * the net cpts and set it to NULL. This + * way we can keep on going but less + * efficiently, since memory accesses might be + * across CPT lines. + */ + if (net->net_cpts != NULL) { + LIBCFS_FREE(net->net_cpts, + sizeof(*net->net_cpts) * + net->net_ncpts); + net->net_cpts = NULL; + net->net_ncpts = LNET_CPT_NUMBER; + } + return; + } + } +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + int i; + + lnet_net_remove_cpts(ni->ni_cpts, ni->ni_ncpts, ni->ni_net); + + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + for (i = 0; i < LNET_NUM_INTERFACES && + ni->ni_interfaces[i] != NULL; i++) { + LIBCFS_FREE(ni->ni_interfaces[i], + strlen(ni->ni_interfaces[i]) + 1); + } + + /* release reference to net namespace */ + if (ni->ni_net_ns != NULL) + put_net(ni->ni_net_ns); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +void +lnet_net_free(struct lnet_net *net) +{ + struct list_head *tmp, *tmp2; + struct lnet_ni *ni; + + LASSERT(list_empty(&net->net_ni_zombie)); + + /* + * delete any nis that haven't been added yet. This could happen + * if there is a failure on net startup + */ + list_for_each_safe(tmp, tmp2, &net->net_ni_added) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + lnet_ni_free(ni); + } + + /* delete any nis which have been started. */ + list_for_each_safe(tmp, tmp2, &net->net_ni_list) { + ni = list_entry(tmp, struct lnet_ni, ni_netlist); + list_del_init(&ni->ni_netlist); + lnet_ni_free(ni); + } + + if (net->net_cpts != NULL) + LIBCFS_FREE(net->net_cpts, + sizeof(*net->net_cpts) * net->net_ncpts); + + LIBCFS_FREE(net, sizeof(*net)); +} + +struct lnet_net * +lnet_net_alloc(__u32 net_id, struct list_head *net_list) +{ + struct lnet_net *net; + + if (!lnet_net_unique(net_id, net_list, NULL)) { + CERROR("Duplicate net %s. Ignore\n", + libcfs_net2str(net_id)); + return NULL; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net_id)); + return NULL; + } + + INIT_LIST_HEAD(&net->net_list); + INIT_LIST_HEAD(&net->net_ni_list); + INIT_LIST_HEAD(&net->net_ni_added); + INIT_LIST_HEAD(&net->net_ni_zombie); + + net->net_id = net_id; + net->net_state = LNET_NET_STATE_INIT; + + /* initialize global parameters to undefined */ + net->net_tunables.lct_peer_timeout = -1; + net->net_tunables.lct_max_tx_credits = -1; + net->net_tunables.lct_peer_tx_credits = -1; + net->net_tunables.lct_peer_rtr_credits = -1; + + if (net_list) + list_add_tail(&net->net_list, net_list); + + return net; +} + +static int +lnet_ni_add_interface(struct lnet_ni *ni, char *iface) +{ + int niface = 0; + + if (ni == NULL) + return -ENOMEM; + + if (!lnet_ni_unique_ni(ni->ni_interfaces, iface)) + return -EINVAL; + + /* Allocate a separate piece of memory and copy + * into it the string, so we don't have + * a dependency on the tokens string. This way we + * can free the tokens at the end of the function. 
+ * The newly allocated ni_interfaces[] can be + * freed when freeing the NI */ + while (niface < LNET_NUM_INTERFACES && + ni->ni_interfaces[niface] != NULL) + niface++; + + if (niface >= LNET_NUM_INTERFACES) { + LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " + "for net %s\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + return -EINVAL; + } + + LIBCFS_ALLOC(ni->ni_interfaces[niface], + strlen(iface) + 1); + + if (ni->ni_interfaces[niface] == NULL) { + CERROR("Can't allocate net interface name\n"); + return -ENOMEM; + } + + strncpy(ni->ni_interfaces[niface], iface, + strlen(iface) + 1); + + return 0; +} + +static struct lnet_ni * +lnet_ni_alloc_common(struct lnet_net *net, char *iface) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int i; + + if (iface != NULL) + /* make sure that this NI is unique in the net it's + * being added to */ + if (!lnet_ni_unique_net(&net->net_ni_added, iface)) + return NULL; + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network interface %s%s\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? iface : ""); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_cptlist); + INIT_LIST_HEAD(&ni->ni_netlist); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + ni->ni_net = net; + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net->net_id, 0); + + /* Store net namespace in which current ni is being created */ + if (current->nsproxy->net_ns != NULL) + ni->ni_net_ns = get_net(current->nsproxy->net_ns); + else + ni->ni_net_ns = NULL; + + ni->ni_last_alive = cfs_time_current_sec(); + ni->ni_state = LNET_NI_STATE_INIT; + list_add_tail(&ni->ni_netlist, &net->net_ni_added); + + /* + * if an interface name is provided then make sure to add in that + * interface name in NI + */ + if (iface) + if (lnet_ni_add_interface(ni, iface) != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* allocate and add to the provided network */ +struct lnet_ni * +lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (!el) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set CPTs for NI %s(%s): %d\n", + libcfs_net2str(net->net_id), + (iface != NULL) ? 
iface : "", rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0])); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +struct lnet_ni * +lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts, + char *iface) +{ + struct lnet_ni *ni; + int rc; + + ni = lnet_ni_alloc_common(net, iface); + if (!ni) + return NULL; + + if (ncpts == 0) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + size_t array_size = ncpts * sizeof(ni->ni_cpts[0]); + LIBCFS_ALLOC(ni->ni_cpts, array_size); + if (ni->ni_cpts == NULL) + goto failed; + memcpy(ni->ni_cpts, cpts, array_size); + ni->ni_ncpts = ncpts; + } + + rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net); + if (rc != 0) + goto failed; + + return ni; +failed: + lnet_ni_free(ni); + return NULL; +} + +/* + * Parse the networks string and create the matching set of NIs on the + * nilist. + */ +int +lnet_parse_networks(struct list_head *netlist, char *networks, + bool use_tcp_bonding) +{ + struct cfs_expr_list *net_el = NULL; + struct cfs_expr_list *ni_el = NULL; + int tokensize; + char *tokens; + char *str; + struct lnet_net *net; + struct lnet_ni *ni = NULL; + __u32 net_id; + int nnets = 0; + + if (networks == NULL) { + CERROR("networks string is undefined\n"); + return -EINVAL; + } + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too " + "long\n"); + return -EINVAL; + } + + tokensize = strlen(networks) + 1; + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + memcpy(tokens, networks, tokensize); + str = tokens; + + /* + * Main parser loop. + * + * NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) + */ + do { + char *nistr; + char *elstr; + char *name; + int rc; + + /* + * Parse a network string into its components. + * + * {"("...")"}{"[""]"} + */ + + /* Network name (mandatory) */ + while (isspace(*str)) + *str++ = '\0'; + if (!*str) + break; + name = str; + str += strcspn(str, SPACESTR ":()[],"); + while (isspace(*str)) + *str++ = '\0'; + + /* Interface list (optional) */ + if (*str == '(') { + *str++ = '\0'; + nistr = str; + str += strcspn(str, ")"); + if (*str != ')') { + str = nistr; + goto failed_syntax; + } + do { + *str++ = '\0'; + } while (isspace(*str)); + } else { + nistr = NULL; + } + + /* CPT expression (optional) */ + if (*str == '[') { + elstr = str; + str += strcspn(str, "]"); + if (*str != ']') { + str = elstr; + goto failed_syntax; + } + rc = cfs_expr_list_parse(elstr, str - elstr + 1, + 0, LNET_CPT_NUMBER - 1, + &net_el); + if (rc != 0) { + str = elstr; + goto failed_syntax; + } + *elstr = '\0'; + do { + *str++ = '\0'; + } while (isspace(*str)); + } + + /* Bad delimiters */ + if (*str && (strchr(DELIMITERS, *str) != NULL)) + goto failed_syntax; + + /* go to the next net if it exits */ + str += strcspn(str, ","); + if (*str == ',') + *str++ = '\0'; + + /* + * At this point the name is properly terminated. 
+ */ + net_id = libcfs_str2net(name); + if (net_id == LNET_NIDNET(LNET_NID_ANY)) { + LCONSOLE_ERROR_MSG(0x113, + "Unrecognised network type\n"); + str = name; + goto failed_syntax; + } + + if (LNET_NETTYP(net_id) == LOLND) { + /* Loopback is implicit, and there can be only one. */ + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + /* Should we error out instead? */ + continue; + } + + /* + * All network paramaters are now known. + */ + nnets++; + + /* always allocate a net, since we will eventually add an + * interface to it, or we will fail, in which case we'll + * just delete it */ + net = lnet_net_alloc(net_id, netlist); + if (IS_ERR_OR_NULL(net)) + goto failed; + + if (!nistr || + (use_tcp_bonding && LNET_NETTYP(net_id) == SOCKLND)) { + /* + * No interface list was specified, allocate a + * ni using the defaults. + */ + ni = lnet_ni_alloc(net, net_el, NULL); + if (IS_ERR_OR_NULL(ni)) + goto failed; + + if (!nistr) { + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + continue; + } + } + + do { + elstr = NULL; + + /* Interface name (mandatory) */ + while (isspace(*nistr)) + *nistr++ = '\0'; + name = nistr; + nistr += strcspn(nistr, SPACESTR "[],"); + while (isspace(*nistr)) + *nistr++ = '\0'; + + /* CPT expression (optional) */ + if (*nistr == '[') { + elstr = nistr; + nistr += strcspn(nistr, "]"); + if (*nistr != ']') { + str = elstr; + goto failed_syntax; + } + rc = cfs_expr_list_parse(elstr, + nistr - elstr + 1, + 0, LNET_CPT_NUMBER - 1, + &ni_el); + if (rc != 0) { + str = elstr; + goto failed_syntax; + } + *elstr = '\0'; + do { + *nistr++ = '\0'; + } while (isspace(*nistr)); + } else { + ni_el = net_el; + } + + /* + * End of single interface specificaton, + * advance to the start of the next one, if + * any. + */ + if (*nistr == ',') { + do { + *nistr++ = '\0'; + } while (isspace(*nistr)); + if (!*nistr) { + str = nistr; + goto failed_syntax; + } + } else if (*nistr) { + str = nistr; + goto failed_syntax; + } + + /* + * At this point the name is properly terminated. + */ + if (!*name) { + str = name; + goto failed_syntax; + } + + if (use_tcp_bonding && + LNET_NETTYP(net->net_id) == SOCKLND) { + rc = lnet_ni_add_interface(ni, name); + if (rc != 0) + goto failed; + } else { + ni = lnet_ni_alloc(net, ni_el, name); + if (IS_ERR_OR_NULL(ni)) + goto failed; + } + + if (ni_el) { + if (ni_el != net_el) { + cfs_expr_list_free(ni_el); + ni_el = NULL; + } + } + } while (*nistr); + + if (net_el) { + cfs_expr_list_free(net_el); + net_el = NULL; + } + } while (*str); + + LIBCFS_FREE(tokens, tokensize); + return nnets; + + failed_syntax: + lnet_syntax("networks", networks, (int)(str - tokens), strlen(str)); + failed: + /* free the net list and all the nis on each net */ + while (!list_empty(netlist)) { + net = list_entry(netlist->next, struct lnet_net, net_list); + + list_del_init(&net->net_list); + lnet_net_free(net); + } + + if (ni_el && ni_el != net_el) + cfs_expr_list_free(ni_el); + if (net_el) + cfs_expr_list_free(net_el); + + LIBCFS_FREE(tokens, tokensize); + + return -EINVAL; +} + +static struct lnet_text_buf *lnet_new_text_buf(int str_len) +{ + struct lnet_text_buf *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(struct lnet_text_buf, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." 
*/ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +static void +lnet_free_text_buf(struct lnet_text_buf *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +static void +lnet_free_text_bufs(struct list_head *tbs) +{ + struct lnet_text_buf *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +void +lnet_print_text_bufs(struct list_head *tbs) +{ + struct list_head *tmp; + struct lnet_text_buf *ltb; + + list_for_each(tmp, tbs) { + ltb = list_entry(tmp, struct lnet_text_buf, ltb_list); + + CDEBUG(D_WARNING, "%s\n", ltb->ltb_text); + } + + CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob); +} + +static int +lnet_str2tbs_sep(struct list_head *tbs, char *str) +{ + struct list_head pending; + char *sep; + int nob; + int i; + struct lnet_text_buf *ltb; + + INIT_LIST_HEAD(&pending); + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (isspace(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -ENOMEM; + } + + for (i = 0; i < nob; i++) + if (isspace(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +static int +lnet_expand1tb(struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + struct lnet_text_buf *ltb; + + LASSERT (*sep1 == '['); + LASSERT (*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +static int +lnet_str2tbs_expand(struct list_head *tbs, char *str) +{ + char num[16]; + struct list_head pending; + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + INIT_LIST_HEAD(&pending); + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb(&pending, str, sep, sep2, + parsed, (int)(enditem - parsed)) != 0) + goto failed; + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if 
(enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -EINVAL; +} + +static int +lnet_parse_hops (char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + +#define LNET_PRIORITY_SEPARATOR (':') + +static int +lnet_parse_priority(char *str, unsigned int *priority, char **token) +{ + int nob; + char *sep; + int len; + + sep = strchr(str, LNET_PRIORITY_SEPARATOR); + if (sep == NULL) { + *priority = 0; + return 0; + } + len = strlen(sep + 1); + + if ((sscanf((sep+1), "%u%n", priority, &nob) < 1) || (len != nob)) { + /* Update the caller's token pointer so it treats the found + priority as the token to report in the error message. */ + *token += sep - str + 1; + return -EINVAL; + } + + CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob); + + /* + * Change priority separator to \0 to be able to parse NID + */ + *sep = '\0'; + return 0; +} + +static int +lnet_parse_route (char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + __u32 hops; + int got_hops = 0; + unsigned int priority = 0; + + INIT_LIST_HEAD(&gateways); + INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd)); + cmd[sizeof(cmd) - 1] = '\0'; + + sep = str; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! 
*/ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + rc = lnet_parse_priority(ltb->ltb_text, + &priority, &token); + if (rc < 0) + goto token_error; + + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + goto token_error; + } + } + } + + /* if there are no hops set then we want to flag this value as + * unset since hops is an optional parameter */ + if (!got_hops) + hops = LNET_UNDEFINED_HOPS; + + LASSERT(!list_empty(&nets)); + LASSERT(!list_empty(&gateways)); + + list_for_each(tmp1, &nets) { + ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT (net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(tmp2, &gateways) { + ltb = list_entry(tmp2, struct lnet_text_buf, ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT(nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route(net, hops, nid, priority); + if (rc != 0 && rc != -EEXIST && rc != -EHOSTUNREACH) { + CERROR("Can't create route " + "to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + +token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); +out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +static int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + struct lnet_text_buf *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes (char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT (lnet_tbnob == 0); + return rc; +} + +static int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + struct list_head list = LIST_HEAD_INIT(list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_expr_list_free_list(&list); + + return rc; +} + +static int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT(strlen(net_entry) < sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (isspace(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !isspace(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if 
(!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +static __u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +static int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + struct lnet_text_buf *tb; + struct lnet_text_buf *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT(!list_empty(nets)); + LASSERT(nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, struct lnet_text_buf, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, struct lnet_text_buf, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + len = strlen(sep); + tb2 = lnet_new_text_buf(len); + if (tb2 == NULL) + return -ENOMEM; + + strncpy(tb2->ltb_text, sep, len); + tb2->ltb_text[len] = '\0'; + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +static int +lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head raw_entries; + struct list_head matched_nets; + struct list_head current_nets; + struct list_head *t; + struct list_head *t2; + struct lnet_text_buf *tb; + struct lnet_text_buf *tb2; + __u32 net1; + __u32 net2; + int len; + int count; + int dup; + int rc; + + INIT_LIST_HEAD(&raw_entries); + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT(lnet_tbnob == 0); + return -EINVAL; + } + + INIT_LIST_HEAD(&matched_nets); + INIT_LIST_HEAD(¤t_nets); + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, struct lnet_text_buf, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)); + source[sizeof(source) - 1] = '\0'; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + dup = 0; + list_for_each(t, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf, ltb_list); + net1 = lnet_netspec2net(tb->ltb_text); + LASSERT(net1 != 
LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(t2, &matched_nets) { + tb2 = list_entry(t2, struct lnet_text_buf, + ltb_list); + net2 = lnet_netspec2net(tb2->ltb_text); + LASSERT(net2 != LNET_NIDNET(LNET_NID_ANY)); + + if (net1 == net2) { + dup = 1; + break; + } + } + + if (dup) + break; + } + + if (dup) { + lnet_free_text_bufs(¤t_nets); + continue; + } + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, struct lnet_text_buf, ltb_list); + + list_del(&tb->ltb_list); + list_add_tail(&tb->ltb_list, &matched_nets); + + len += snprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? "" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT(lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +static void +lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) +{ + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +} + +static int +lnet_ipaddr_enumerate (__u32 **ipaddrsp) +{ + int up; + __u32 netmask; + __u32 *ipaddrs; + __u32 *ipaddrs2; + int nip; + char **ifnames; + int nif = lnet_ipif_enumerate(&ifnames); + int i; + int rc; + + if (nif <= 0) + return nif; + + LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); + if (ipaddrs == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nif); + lnet_ipif_free_enumeration(ifnames, nif); + return -ENOMEM; + } + + for (i = nip = 0; i < nif; i++) { + if (!strcmp(ifnames[i], "lo")) + continue; + + rc = lnet_ipif_query(ifnames[i], &up, + &ipaddrs[nip], &netmask); + if (rc != 0) { + CWARN("Can't query interface %s: %d\n", + ifnames[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s: it's down\n", + ifnames[i]); + continue; + } + + nip++; + } + + lnet_ipif_free_enumeration(ifnames, nif); + + if (nip == nif) { + *ipaddrsp = ipaddrs; + } else { + if (nip > 0) { + LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); + if (ipaddrs2 == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nip); + nip = -ENOMEM; + } else { + memcpy(ipaddrs2, ipaddrs, + nip * sizeof(*ipaddrs)); + *ipaddrsp = ipaddrs2; + rc = nip; + } + } + lnet_ipaddr_free_enumeration(ipaddrs, nif); + } + return nip; +} + +int +lnet_parse_ip2nets (char **networksp, char *ip2nets) +{ + __u32 *ipaddrs = NULL; + int nip = lnet_ipaddr_enumerate(&ipaddrs); + int rc; + + if (nip < 0) { + LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP " + "interfaces for ip2nets to match\n", nip); + return nip; + } + + if (nip == 0) { + LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces " + "for ip2nets to match\n"); + return -ENOENT; + } + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + lnet_ipaddr_free_enumeration(ipaddrs, nip); + + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + return rc; + } + + if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " + "any local IP interfaces\n"); + return -ENOENT; + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c new file mode 100644 index 0000000000000..3bca6b77539a6 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c @@ -0,0 +1,423 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-eq.c + * + * Library level Event queue management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +/** + * Create an event queue that has room for \a count number of events. + * + * The event queue is circular and older events will be overwritten by new + * ones if they are not removed in time by the user using the functions + * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to + * determine the appropriate size of the event queue to prevent this loss + * of events. Note that when EQ handler is specified in \a callback, no + * event loss can happen, since the handler is run for each event deposited + * into the EQ. + * + * \param count The number of events to be stored in the event queue. It + * will be rounded up to the next power of two. + * \param callback A handler function that runs when an event is deposited + * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to + * indicate that no event handler is desired. + * \param handle On successful return, this location will hold a handle for + * the newly created EQ. + * + * \retval 0 On success. + * \retval -EINVAL If an parameter is not valid. + * \retval -ENOMEM If memory for the EQ can't be allocated. + * + * \see lnet_eq_handler_t for the discussion on EQ handler semantics. + */ +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + struct lnet_handle_eq *handle) +{ + struct lnet_eq *eq; + + LASSERT(the_lnet.ln_refcount > 0); + + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparent capacity at all times */ + + if (count) + count = roundup_pow_of_two(count); + + if (callback != LNET_EQ_HANDLER_NONE && count != 0) { + CWARN("EQ callback is guaranteed to get every event, " + "do you still want to set eqcount %d for polling " + "event which will have locking overhead? 
" + "Please contact with developer to confirm\n", count); + } + + /* count can be 0 if only need callback, we can eliminate + * overhead of enqueue event */ + if (count == 0 && callback == LNET_EQ_HANDLER_NONE) + return -EINVAL; + + eq = lnet_eq_alloc(); + if (eq == NULL) + return -ENOMEM; + + if (count != 0) { + LIBCFS_ALLOC(eq->eq_events, count * sizeof(struct lnet_event)); + if (eq->eq_events == NULL) + goto failed; + /* NB allocator has set all event sequence numbers to 0, + * so all them should be earlier than eq_deq_seq */ + } + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; + eq->eq_callback = callback; + + eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*eq->eq_refs[0])); + if (eq->eq_refs == NULL) + goto failed; + + /* MUST hold both exclusive lnet_res_lock */ + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); + list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); + + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_eq2handle(handle, eq); + return 0; + +failed: + if (eq->eq_events != NULL) + LIBCFS_FREE(eq->eq_events, count * sizeof(struct lnet_event)); + + if (eq->eq_refs != NULL) + cfs_percpt_free(eq->eq_refs); + + lnet_eq_free(eq); + return -ENOMEM; +} +EXPORT_SYMBOL(LNetEQAlloc); + +/** + * Release the resources associated with an event queue if it's idle; + * otherwise do nothing and it's up to the user to try again. + * + * \param eqh A handle for the event queue to be released. + * + * \retval 0 If the EQ is not in use and freed. + * \retval -ENOENT If \a eqh does not point to a valid EQ. + * \retval -EBUSY If the EQ is still in use by some MDs. 
+ */ +int +LNetEQFree(struct lnet_handle_eq eqh) +{ + struct lnet_eq *eq; + struct lnet_event *events = NULL; + int **refs = NULL; + int *ref; + int rc = 0; + int size = 0; + int i; + + LASSERT(the_lnet.ln_refcount > 0); + + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + eq = lnet_handle2eq(&eqh); + if (eq == NULL) { + rc = -ENOENT; + goto out; + } + + cfs_percpt_for_each(ref, i, eq->eq_refs) { + LASSERT(*ref >= 0); + if (*ref == 0) + continue; + + CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", + i, *ref); + rc = -EBUSY; + goto out; + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + refs = eq->eq_refs; + + lnet_res_lh_invalidate(&eq->eq_lh); + list_del(&eq->eq_list); + lnet_eq_free(eq); + out: + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + if (events != NULL) + LIBCFS_FREE(events, size * sizeof(struct lnet_event)); + if (refs != NULL) + cfs_percpt_free(refs); + + return rc; +} +EXPORT_SYMBOL(LNetEQFree); + +void +lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) +{ + /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ + int index; + + if (eq->eq_size == 0) { + LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); + eq->eq_callback(ev); + return; + } + + lnet_eq_wait_lock(); + ev->sequence = eq->eq_enq_seq++; + + LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); + index = ev->sequence & (eq->eq_size - 1); + + eq->eq_events[index] = *ev; + + if (eq->eq_callback != LNET_EQ_HANDLER_NONE) + eq->eq_callback(ev); + + /* Wake anyone waiting in LNetEQPoll() */ + if (waitqueue_active(&the_lnet.ln_eq_waitq)) + wake_up_all(&the_lnet.ln_eq_waitq); + lnet_eq_wait_unlock(); +} + +static int +lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + struct lnet_event *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + /* must called with lnet_eq_wait_lock hold */ + if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) + RETURN(0); + + /* We've got a new event... */ + *ev = *new_event; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + /* ...but did it overwrite an event we've not seen yet? */ + if (eq->eq_deq_seq == new_event->sequence) { + rc = 1; + } else { + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = -EOVERFLOW; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + +/** + * A nonblocking function that can be used to get the next event in an EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. The event is removed from the queue. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 0 No pending event in the EQ. + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. 
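+ *
+ * Illustrative drain loop (editor's sketch; 'eqh' is assumed to come
+ * from a successful LNetEQAlloc(), and handle_event() is a made-up
+ * consumer):
+ *
+ *     struct lnet_event ev;
+ *     int rc;
+ *
+ *     while ((rc = LNetEQGet(eqh, &ev)) == 1 || rc == -EOVERFLOW)
+ *             handle_event(&ev);
+ *
+ * rc == 0 on exit simply means the queue is currently empty.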
+ */ +int +LNetEQGet(struct lnet_handle_eq eventq, struct lnet_event *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} +EXPORT_SYMBOL(LNetEQGet); + +/** + * Block the calling process until there is an event in the EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. This function returns the next event + * in the EQ and removes it from the EQ. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQWait(struct lnet_handle_eq eventq, struct lnet_event *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, MAX_SCHEDULE_TIMEOUT, + event, &which); +} +EXPORT_SYMBOL(LNetEQWait); + +static int +lnet_eq_wait_locked(signed long *timeout) +__must_hold(&the_lnet.ln_eq_wait_lock) +{ + signed long tms = *timeout; + wait_queue_entry_t wl; + int wait; + + if (tms == 0) + return -ENXIO; /* don't want to wait and no new event */ + + init_waitqueue_entry(&wl, current); + add_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + lnet_eq_wait_unlock(); + + tms = schedule_timeout_interruptible(tms); + wait = tms != 0; /* might need to call here again */ + *timeout = tms; + + lnet_eq_wait_lock(); + remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + return wait; +} + +/** + * Block the calling process until there's an event from a set of EQs or + * timeout happens. + * + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully, in which case the corresponding event + * is consumed. + * + * LNetEQPoll() provides a timeout to allow applications to poll, block for a + * fixed period, or block indefinitely. + * + * \param eventqs,neq An array of EQ handles, and size of the array. + * \param timeout Time in jiffies to wait for an event to occur on + * one of the EQs. The constant MAX_SCHEDULE_TIMEOUT can be used to indicate an + * infinite timeout. + * \param event,which On successful return (1 or -EOVERFLOW), \a event will + * hold the next event in the EQs, and \a which will contain the index of the + * EQ from which the event was taken. + * + * \retval 0 No pending event in the EQs after timeout. + * \retval 1 Indicates success. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ indicated by \a which has been dropped due to limited space in the EQ. + * \retval -ENOENT If there's an invalid handle in \a eventqs. 
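+ *
+ * Illustrative call (editor's sketch; 'eqh' again comes from
+ * LNetEQAlloc() and the 100ms timeout is arbitrary):
+ *
+ *     struct lnet_event ev;
+ *     int which;
+ *     int rc = LNetEQPoll(&eqh, 1, msecs_to_jiffies(100), &ev, &which);
+ *
+ * rc == 0 means the timeout expired with no event; rc == 1 or
+ * rc == -EOVERFLOW means 'ev' holds a valid event taken from EQ 'which'.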
+ */ +int +LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, signed long timeout, + struct lnet_event *event, int *which) +{ + int wait = 1; + int rc; + int i; + ENTRY; + + LASSERT(the_lnet.ln_refcount > 0); + + if (neq < 1) + RETURN(-ENOENT); + + lnet_eq_wait_lock(); + + for (;;) { + for (i = 0; i < neq; i++) { + struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + lnet_eq_wait_unlock(); + RETURN(-ENOENT); + } + + rc = lnet_eq_dequeue_event(eq, event); + if (rc != 0) { + lnet_eq_wait_unlock(); + *which = i; + RETURN(rc); + } + } + + if (wait == 0) + break; + + /* + * return value of lnet_eq_wait_locked: + * -1 : did nothing and it's sure no new event + * 1 : sleep inside and wait until new event + * 0 : don't want to wait anymore, but might have new event + * so need to call dequeue again + */ + wait = lnet_eq_wait_locked(&timeout); + if (wait < 0) /* no new event */ + break; + } + + lnet_eq_wait_unlock(); + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c new file mode 100644 index 0000000000000..a3d0487063cbd --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -0,0 +1,557 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(struct lnet_libmd *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... 
*/ + struct lnet_me *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if (md->md_eq != NULL) { + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + + LASSERT(*md->md_eq->eq_refs[cpt] > 0); + (*md->md_eq->eq_refs[cpt])--; + } + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + lnet_md_free(md); +} + +struct page * +lnet_kvaddr_to_page(unsigned long vaddr) +{ + if (is_vmalloc_addr((void *)vaddr)) + return vmalloc_to_page((void *)vaddr); + +#ifdef CONFIG_HIGHMEM + +#ifdef HAVE_KMAP_TO_PAGE + /* + * This ifdef is added to handle the kernel versions + * which have kmap_to_page() function exported. If so, + * we should use it. Otherwise, remain with the legacy check. + */ + return kmap_to_page((void *)vaddr); +#else + + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } + return virt_to_page(vaddr); +#endif /* HAVE_KMAP_TO_PAGE */ +#else + + return virt_to_page(vaddr); +#endif /* CONFIG_HIGHMEM */ +} +EXPORT_SYMBOL(lnet_kvaddr_to_page); + +int +lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset) +{ + int cpt = CFS_CPT_ANY; + unsigned int niov; + + /* + * if the md_options has a bulk handle then we want to look at the + * bulk md because that's the data which we will be DMAing + */ + if (md && (md->md_options & LNET_MD_BULK_HANDLE) != 0 && + !LNetMDHandleIsInvalid(md->md_bulk_handle)) + md = lnet_handle2md(&md->md_bulk_handle); + + if (!md || md->md_niov == 0) + return CFS_CPT_ANY; + + niov = md->md_niov; + + /* + * There are three cases to handle: + * 1. The MD is using lnet_kiov_t + * 2. The MD is using struct kvec + * 3. Contiguous buffer allocated via vmalloc + * + * in case 2 we can use virt_to_page() macro to get the page + * address of the memory kvec describes. + * + * in case 3 use is_vmalloc_addr() and vmalloc_to_page() + * + * The offset provided can be within the first iov/kiov entry or + * it could go beyond it. In that case we need to make sure to + * look at the page which actually contains the data that will be + * DMAed. 
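+ *
+ * For example (editor's illustration): with two kiov entries of
+ * kiov_len 4096 each and offset == 5000, the loop below skips the
+ * first entry and ends up at offset 904 within the second page, so
+ * the CPT reported is that of the second page's NUMA node.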
+ */ + if ((md->md_options & LNET_MD_KIOV) != 0) { + lnet_kiov_t *kiov = md->md_iov.kiov; + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + niov--; + kiov++; + if (niov == 0) { + CERROR("offset %d goes beyond kiov\n", offset); + goto out; + } + } + + cpt = cfs_cpt_of_node(lnet_cpt_table(), + page_to_nid(kiov->kiov_page)); + } else { + struct kvec *iov = md->md_iov.iov; + unsigned long vaddr; + struct page *page; + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + if (niov == 0) { + CERROR("offset %d goes beyond iov\n", offset); + goto out; + } + } + + vaddr = ((unsigned long)iov->iov_base) + offset; + page = lnet_kvaddr_to_page(vaddr); + if (!page) { + CERROR("Couldn't resolve vaddr 0x%lx to page\n", vaddr); + goto out; + } + cpt = cfs_cpt_of_node(lnet_cpt_table(), page_to_nid(page)); + } + +out: + return cpt; +} + +static int +lnet_md_build(struct lnet_libmd *lmd, struct lnet_md *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + lmd->md_bulk_handle = umd->bulk_handle; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof(lmd->md_iov.iov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the base address on trust */ + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return -EINVAL; + + total_length += lmd->md_iov.iov[i].iov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + + } else if ((umd->options & LNET_MD_KIOV) != 0) { + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof(lmd->md_iov.kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE) + return -EINVAL; /* invalid length */ + + total_length += lmd->md_iov.kiov[i].kiov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + } else { /* contiguous */ + lmd->md_length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) // illegal max_size + return -EINVAL; + } + + return 0; +} + +/* must be called with resource lock held */ +static int +lnet_md_link(struct lnet_libmd *md, struct lnet_handle_eq eq_handle, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. 
Best to LASSERT our caller is compliant so + * we find out quickly... */ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (eq == NULL); + */ + if (!LNetEQHandleIsInvalid(eq_handle)) { + md->md_eq = lnet_handle2eq(&eq_handle); + + if (md->md_eq == NULL) + return -ENOENT; + + (*md->md_eq->eq_refs[cpt])++; + } + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); + + return 0; +} + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); +} + +static int +lnet_md_validate(struct lnet_md *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param meh A handle for a ME to associate the new MD with. + * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a + * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by + * calling LNetInvalidateHandle() on it. + * \retval -EBUSY If the ME pointed to by \a meh is already associated with + * a MD. 
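+ *
+ * Rough usage sketch (editor's illustration; 'buf', 'buflen', 'eqh' and
+ * 'meh' are assumed to exist, and the field names follow the struct
+ * lnet_md usage in this file):
+ *
+ *     struct lnet_md umd = {
+ *             .start     = buf,
+ *             .length    = buflen,
+ *             .threshold = 1,
+ *             .options   = LNET_MD_OP_PUT,
+ *             .eq_handle = eqh,
+ *     };
+ *     struct lnet_handle_md mdh;
+ *
+ *     rc = LNetMDAttach(meh, umd, LNET_UNLINK, &mdh);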
+ */ +int +LNetMDAttach(struct lnet_handle_me meh, struct lnet_md umd, + enum lnet_unlink unlink, struct lnet_handle_md *handle) +{ + struct list_head matches = LIST_HEAD_INIT(matches); + struct list_head drops = LIST_HEAD_INIT(drops); + struct lnet_me *me; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + if (rc != 0) + goto out_free; + + cpt = lnet_cpt_of_cookie(meh.cookie); + + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) + rc = -ENOENT; + else if (me->me_md != NULL) + rc = -EBUSY; + else + rc = lnet_md_link(md, umd.eq_handle, cpt); + + if (rc != 0) + goto out_unlock; + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; + +out_unlock: + lnet_res_unlock(cpt); +out_free: + lnet_md_free(md); + return rc; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that + * it's OK to supply a NULL \a umd.eq_handle by calling + * LNetInvalidateHandle() on it. + */ +int +LNetMDBind(struct lnet_md umd, enum lnet_unlink unlink, + struct lnet_handle_md *handle) +{ + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + if (rc != 0) + goto out_free; + + cpt = lnet_res_lock_current(); + + rc = lnet_md_link(md, umd.eq_handle, cpt); + if (rc != 0) + goto out_unlock; + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + out_unlock: + lnet_res_unlock(cpt); + + out_free: + lnet_md_free(md); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. + * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). 
+ * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. + * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +LNetMDUnlink(struct lnet_handle_md mdh) +{ + struct lnet_event ev; + struct lnet_libmd *md; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-me.c b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c new file mode 100644 index 0000000000000..1a1d9b1bdb671 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-me.c @@ -0,0 +1,291 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. + * + * \param portal The portal table index where the ME should be attached. 
+ * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the struct lnet_process_id + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * \param handle On successful returns, a handle to the newly created ME + * object is saved here. This handle can be used later in LNetMEInsert(), + * LNetMEUnlink(), or LNetMDAttach() functions. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is invalid. + * \retval -ENOMEM If new ME object cannot be allocated. + */ +int +LNetMEAttach(unsigned int portal, + struct lnet_process_id match_id, + __u64 match_bits, __u64 ignore_bits, + enum lnet_unlink unlink, enum lnet_ins_pos pos, + struct lnet_handle_me *handle) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return -EINVAL; + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return -EPERM; + + me = lnet_me_alloc(); + if (me == NULL) + return -ENOMEM; + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], + &me->me_lh); + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_me2handle(handle, me); + + lnet_res_unlock(mtable->mt_cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEAttach); + +/** + * Create and a match entry and insert it before or after the ME pointed to by + * \a current_meh. The new ME is empty, i.e. not associated with a memory + * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. + * + * This function is identical to LNetMEAttach() except for the position + * where the new ME is inserted. + * + * \param current_meh A handle for a ME. The new ME will be inserted + * immediately before or immediately after this ME. + * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion + * for LNetMEAttach(). + * + * \retval 0 On success. + * \retval -ENOMEM If new ME object cannot be allocated. + * \retval -ENOENT If \a current_meh does not point to a valid match entry. 
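+ *
+ * Rough sketch of building a small match list (editor's illustration;
+ * the portal index and match bits are made-up values, and the
+ * struct lnet_process_id initializer is an assumption):
+ *
+ *     struct lnet_process_id id = {
+ *             .nid = LNET_NID_ANY,
+ *             .pid = LNET_PID_ANY,
+ *     };
+ *     struct lnet_handle_me me1, me2;
+ *
+ *     rc = LNetMEAttach(4, id, 0x11, 0, LNET_UNLINK, LNET_INS_AFTER, &me1);
+ *     if (rc == 0)
+ *             rc = LNetMEInsert(me1, id, 0x22, 0, LNET_UNLINK,
+ *                               LNET_INS_BEFORE, &me2);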
+ */ +int +LNetMEInsert(struct lnet_handle_me current_meh, + struct lnet_process_id match_id, + __u64 match_bits, __u64 ignore_bits, + enum lnet_unlink unlink, enum lnet_ins_pos pos, + struct lnet_handle_me *handle) +{ + struct lnet_me *current_me; + struct lnet_me *new_me; + struct lnet_portal *ptl; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + if (pos == LNET_INS_LOCAL) + return -EPERM; + + new_me = lnet_me_alloc(); + if (new_me == NULL) + return -ENOMEM; + + cpt = lnet_cpt_of_cookie(current_meh.cookie); + + lnet_res_lock(cpt); + + current_me = lnet_handle2me(¤t_meh); + if (current_me == NULL) { + lnet_me_free(new_me); + + lnet_res_unlock(cpt); + return -ENOENT; + } + + LASSERT(current_me->me_portal < the_lnet.ln_nportals); + + ptl = the_lnet.ln_portals[current_me->me_portal]; + if (lnet_ptl_is_unique(ptl)) { + /* nosense to insertion on unique portal */ + lnet_me_free(new_me); + lnet_res_unlock(cpt); + return -EPERM; + } + + new_me->me_pos = current_me->me_pos; + new_me->me_portal = current_me->me_portal; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); + + if (pos == LNET_INS_AFTER) + list_add(&new_me->me_list, ¤t_me->me_list); + else + list_add_tail(&new_me->me_list, ¤t_me->me_list); + + lnet_me2handle(handle, new_me); + + lnet_res_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(LNetMEInsert); + +/** + * Unlink a match entry from its match list. + * + * This operation also releases any resources associated with the ME. If a + * memory descriptor is attached to the ME, then it will be unlinked as well + * and an unlink event will be generated. It is an error to use the ME handle + * after calling LNetMEUnlink(). + * + * \param meh A handle for the ME to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a meh does not point to a valid ME. + * \see LNetMDUnlink() for the discussion on delivering unlink event. 
+ */ +int +LNetMEUnlink(struct lnet_handle_me meh) +{ + struct lnet_me *me; + struct lnet_libmd *md; + struct lnet_event ev; + int cpt; + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(meh.cookie); + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md = me->me_md; + if (md != NULL) { + md->md_flags |= LNET_MD_FLAG_ABORTED; + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + } + + lnet_me_unlink(me); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEUnlink); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(struct lnet_me *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + struct lnet_libmd *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + lnet_res_lh_invalidate(&me->me_lh); + lnet_me_free(me); +} + +#if 0 +static void +lib_me_dump(struct lnet_me *me) +{ + CWARN("Match Entry %p (%#llx)\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, struct lnet_me, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, struct lnet_me, me_list)); +} +#endif diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c new file mode 100644 index 0000000000000..b60106f949b69 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -0,0 +1,3143 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +static int local_nid_dist_zero = 1; +module_param(local_nid_dist_zero, int, 0444); +MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); + +int +lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + /* removing entries */ + INIT_LIST_HEAD(&cull); + + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + nid == LNET_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) { /* matched this one */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + + list_del(&tp->tp_list); + LIBCFS_FREE(tp, sizeof(*tp)); + } + return 0; +} + +static int +fail_peer (lnet_nid_t nid, int outgoing) +{ + struct lnet_test_peer *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD(&cull); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe(el, next, &the_lnet.ln_test_peers) { + tp = list_entry(el, struct lnet_test_peer, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. 
*/ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del(&tp->tp_list); + list_add(&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty(&cull)) { + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); + list_del(&tp->tp_list); + + LIBCFS_FREE(tp, sizeof(*tp)); + } + + return fail; +} + +unsigned int +lnet_iov_nob(unsigned int niov, struct kvec *iov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || iov != NULL); + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, + unsigned int nsiov, struct kvec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = MIN(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = MIN(this_nob, nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +int +lnet_extract_iov(int dst_niov, struct kvec *dst, + int src_niov, struct kvec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. 
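+ * As an illustrative worked example (values are hypothetical): with
+ * src = { { .iov_len = 100 }, { .iov_len = 200 } }, offset = 150 and
+ * len = 100, the first source fragment is skipped entirely,
+ * dst[0].iov_base ends up pointing 50 bytes into the second source
+ * fragment, dst[0].iov_len is set to 100 and the function returns 1.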
+ * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_iov); + + +unsigned int +lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +{ + unsigned int nob = 0; + + LASSERT(niov == 0 || kiov != NULL); + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = MIN(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = MIN(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->kiov_page); + if (saddr != NULL) + kunmap(siov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = MIN(iov->iov_len - iovoffset, + kiov->kiov_len - kiovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int niov, struct kvec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = MIN(kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, 
unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + return niov; + } + + dst->kiov_len = frag_len; + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +void +lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen) +{ + unsigned int niov = 0; + struct kvec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, + rlen); + if (rc < 0) + lnet_finalize(msg, rc); +} + +static void +lnet_setpayloadbuffer(struct lnet_msg *msg) +{ + struct lnet_libmd *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} + +void +lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} + +static void +lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) + lnet_finalize(msg, rc); +} + +static int +lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_net->net_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = 
(ni->ni_net->net_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " + "eager_recv failed %d\n", + libcfs_nid2str(msg->msg_rxpeer->lpni_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +/* + * This function can be called from two paths: + * 1. when sending a message + * 2. when decommiting a message (lnet_msg_decommit_tx()) + * In both these cases the peer_ni should have it's reference count + * acquired by the caller and therefore it is safe to drop the spin + * lock before calling lnd_query() + */ +static void +lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) +{ + cfs_time_t last_alive = 0; + int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); + + LASSERT(lnet_peer_aliveness_enabled(lp)); + LASSERT(ni->ni_net->net_lnd->lnd_query != NULL); + + lnet_net_unlock(cpt); + (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive); + lnet_net_lock(cpt); + + lp->lpni_last_query = cfs_time_current(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lpni_last_alive = last_alive; +} + +/* NB: always called with lnet_net_lock held */ +static inline int +lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) +{ + int alive; + cfs_time_t deadline; + + LASSERT (lnet_peer_aliveness_enabled(lp)); + + /* + * Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + spin_lock(&lp->lpni_lock); + if (!lp->lpni_alive && lp->lpni_alive_count > 0 && + cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) { + spin_unlock(&lp->lpni_lock); + return 0; + } + + deadline = + cfs_time_add(lp->lpni_last_alive, + cfs_time_seconds(lp->lpni_net->net_tunables. + lct_peer_timeout)); + alive = cfs_time_after(deadline, now); + + /* + * Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lpni_last_alive at peer creation is assumed. + */ + if (alive && !lp->lpni_alive && + !(lnet_isrouter(lp) && lp->lpni_alive_count == 0)) { + spin_unlock(&lp->lpni_lock); + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); + } else { + spin_unlock(&lp->lpni_lock); + } + + return alive; +} + + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +static int +lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) +{ + cfs_time_t now = cfs_time_current(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* + * Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). + */ + if (lp->lpni_last_query != 0) { + static const int lnet_queryinterval = 1; + + cfs_time_t next_query = + cfs_time_add(lp->lpni_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (cfs_time_before(now, next_query)) { + if (lp->lpni_alive) + CWARN("Unexpected aliveness of peer %s: " + "%d < %d (%d/%d)\n", + libcfs_nid2str(lp->lpni_nid), + (int)now, (int)next_query, + lnet_queryinterval, + lp->lpni_net->net_tunables.lct_peer_timeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lpni_last_alive); + return 0; +} + +/** + * \param msg The message to be sent. 
+ * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval LNET_CREDIT_OK If \a msg sent or OK to send. + * \retval LNET_CREDIT_WAIT If \a msg blocked for credit. + * \retval -EHOSTUNREACH If the next hop of the message appears dead. + * \retval -ECANCELED If the MD of the message has been unlinked. + */ +static int +lnet_post_send_locked(struct lnet_msg *msg, int do_send) +{ + struct lnet_peer_ni *lp = msg->msg_txpeer; + struct lnet_ni *ni = msg->msg_txni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(ni, lp) == 0) { + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_net_unlock(cpt); + if (msg->msg_txpeer) + atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count); + if (msg->msg_txni) + atomic_inc(&msg->msg_txni->ni_stats.drop_count); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return -EHOSTUNREACH; + } + + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " + "called on the MD/ME.\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(msg, -ECANCELED); + + lnet_net_lock(cpt); + return -ECANCELED; + } + + if (!msg->msg_peertxcredit) { + spin_lock(&lp->lpni_lock); + LASSERT((lp->lpni_txcredits < 0) == + !list_empty(&lp->lpni_txq)); + + msg->msg_peertxcredit = 1; + lp->lpni_txqnob += msg->msg_len + sizeof(struct lnet_hdr); + lp->lpni_txcredits--; + + if (lp->lpni_txcredits < lp->lpni_mintxcredits) + lp->lpni_mintxcredits = lp->lpni_txcredits; + + if (lp->lpni_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lpni_txq); + spin_unlock(&lp->lpni_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lp->lpni_lock); + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + atomic_dec(&ni->ni_tx_credits); + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return LNET_CREDIT_WAIT; + } + } + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + + +static struct lnet_rtrbufpool * +lnet_msg2bufpool(struct lnet_msg *msg) +{ + struct lnet_rtrbufpool *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_SIZE) { + rbp++; + LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +static int +lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. 
+ * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if + * received or OK to receive */ + struct lnet_peer_ni *lp = msg->msg_rxpeer; + struct lnet_rtrbufpool *rbp; + struct lnet_rtrbuf *rb; + + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + spin_lock(&lp->lpni_lock); + LASSERT((lp->lpni_rtrcredits < 0) == + !list_empty(&lp->lpni_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lpni_rtrcredits--; + if (lp->lpni_rtrcredits < lp->lpni_minrtrcredits) + lp->lpni_minrtrcredits = lp->lpni_rtrcredits; + + if (lp->lpni_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lpni_rtrq); + spin_unlock(&lp->lpni_lock); + return LNET_CREDIT_WAIT; + } + spin_unlock(&lp->lpni_lock); + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return LNET_CREDIT_WAIT; + } + } + + LASSERT(!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(msg->msg_rxni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return LNET_CREDIT_OK; +} + +void +lnet_return_tx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *txpeer = msg->msg_txpeer; + struct lnet_ni *txni = msg->msg_txni; + struct lnet_msg *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = msg->msg_txni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + atomic_inc(&ni->ni_tx_credits); + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txni == ni); + LASSERT(msg2->msg_tx_delayed); + LASSERT(msg2->msg_tx_cpt == msg->msg_tx_cpt); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + spin_lock(&txpeer->lpni_lock); + LASSERT((txpeer->lpni_txcredits < 0) == + !list_empty(&txpeer->lpni_txq)); + + txpeer->lpni_txqnob -= msg->msg_len + sizeof(struct lnet_hdr); + LASSERT(txpeer->lpni_txqnob >= 0); + + txpeer->lpni_txcredits++; + if (txpeer->lpni_txcredits <= 0) { + int msg2_cpt; + + msg2 = list_entry(txpeer->lpni_txq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + spin_unlock(&txpeer->lpni_lock); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + msg2_cpt = msg2->msg_tx_cpt; + + /* + * The msg_cpt can be different from the msg2_cpt + * so we need to make sure we lock the correct cpt + * for msg2. 
+ * Once we call lnet_post_send_locked() it is no + * longer safe to access msg2, since it could've + * been freed by lnet_finalize(), but we still + * need to relock the correct cpt, so we cache the + * msg2_cpt for the purpose of the check that + * follows the call to lnet_pose_send_locked(). + */ + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg->msg_tx_cpt); + lnet_net_lock(msg2_cpt); + } + (void) lnet_post_send_locked(msg2, 1); + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg2_cpt); + lnet_net_lock(msg->msg_tx_cpt); + } + } else { + spin_unlock(&txpeer->lpni_lock); + } + } + + if (txni != NULL) { + msg->msg_txni = NULL; + lnet_ni_decref_locked(txni, msg->msg_tx_cpt); + } + + if (txpeer != NULL) { + /* + * TODO: + * Once the patch for the health comes in we need to set + * the health of the peer ni to bad when we fail to send + * a message. + * int status = msg->msg_ev.status; + * if (status != 0) + * lnet_set_peer_ni_health_locked(txpeer, false) + */ + msg->msg_txpeer = NULL; + lnet_peer_ni_decref_locked(txpeer); + } +} + +void +lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp) +{ + struct lnet_msg *msg; + + if (list_empty(&rbp->rbp_msgs)) + return; + msg = list_entry(rbp->rbp_msgs.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + (void)lnet_post_routed_recv_locked(msg, 1); +} + +void +lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + + lnet_net_unlock(cpt); + + list_for_each_entry_safe(msg, tmp, list, msg_list) { + lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, + 0, 0, 0, msg->msg_hdr.payload_length); + list_del_init(&msg->msg_list); + lnet_finalize(msg, -ECANCELED); + } + + lnet_net_lock(cpt); +} + +void +lnet_return_rx_credits_locked(struct lnet_msg *msg) +{ + struct lnet_peer_ni *rxpeer = msg->msg_rxpeer; + struct lnet_ni *rxni = msg->msg_rxni; + struct lnet_msg *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + struct lnet_rtrbuf *rb; + struct lnet_rtrbufpool *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT(msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, struct lnet_rtrbuf, rb_kiov[0]); + rbp = rb->rb_pool; + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT(rbp == lnet_msg2bufpool(msg)); + + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + /* If routing is now turned off, we just drop this buffer and + * don't bother trying to return credits. */ + if (!the_lnet.ln_routing) { + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + goto routing_off; + } + + /* It is possible that a user has lowered the desired number of + * buffers in this pool. Make sure we never put back + * more buffers than the stated number. */ + if (unlikely(rbp->rbp_credits >= rbp->rbp_req_nbuffers)) { + /* Discard this buffer so we don't have too + * many. */ + lnet_destroy_rtrbuf(rb, rbp->rbp_npages); + rbp->rbp_nbuffers--; + } else { + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) + lnet_schedule_blocked_locked(rbp); + } + } + +routing_off: + if (msg->msg_peerrtrcredit) { + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + spin_lock(&rxpeer->lpni_lock); + LASSERT((rxpeer->lpni_rtrcredits < 0) == + !list_empty(&rxpeer->lpni_rtrq)); + + rxpeer->lpni_rtrcredits++; + + /* drop all messages which are queued to be routed on that + * peer. 
*/ + if (!the_lnet.ln_routing) { + struct list_head drop; + INIT_LIST_HEAD(&drop); + list_splice_init(&rxpeer->lpni_rtrq, &drop); + spin_unlock(&rxpeer->lpni_lock); + lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); + } else if (rxpeer->lpni_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lpni_rtrq.next, + struct lnet_msg, msg_list); + list_del(&msg2->msg_list); + spin_unlock(&rxpeer->lpni_lock); + (void) lnet_post_routed_recv_locked(msg2, 1); + } else { + spin_unlock(&rxpeer->lpni_lock); + } + } + if (rxni != NULL) { + msg->msg_rxni = NULL; + lnet_ni_decref_locked(rxni, msg->msg_rx_cpt); + } + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_ni_decref_locked(rxpeer); + } +} + +static int +lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) +{ + if (p1->lpni_txqnob < p2->lpni_txqnob) + return 1; + + if (p1->lpni_txqnob > p2->lpni_txqnob) + return -1; + + if (p1->lpni_txcredits > p2->lpni_txcredits) + return 1; + + if (p1->lpni_txcredits < p2->lpni_txcredits) + return -1; + + return 0; +} + +static int +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) +{ + struct lnet_peer_ni *p1 = r1->lr_gateway; + struct lnet_peer_ni *p2 = r2->lr_gateway; + int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; + int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; + int rc; + + if (r1->lr_priority < r2->lr_priority) + return 1; + + if (r1->lr_priority > r2->lr_priority) + return -1; + + if (r1_hops < r2_hops) + return 1; + + if (r1_hops > r2_hops) + return -1; + + rc = lnet_compare_peers(p1, p2); + if (rc) + return rc; + + if (r1->lr_seq - r2->lr_seq <= 0) + return 1; + + return -1; +} + +static struct lnet_peer_ni * +lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, + lnet_nid_t rtr_nid) +{ + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct lnet_route *best_route; + struct lnet_route *last_route; + struct lnet_peer_ni *lpni_best; + struct lnet_peer_ni *lp; + int rc; + + /* If @rtr_nid is not LNET_NID_ANY, return the gateway with + * rtr_nid nid, otherwise find the best gateway I can use */ + + rnet = lnet_find_rnet_locked(LNET_NIDNET(target)); + if (rnet == NULL) + return NULL; + + lpni_best = NULL; + best_route = last_route = NULL; + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + lp = route->lr_gateway; + + if (!lnet_is_route_alive(route)) + continue; + + if (net != NULL && lp->lpni_net != net) + continue; + + if (lp->lpni_nid == rtr_nid) /* it's pre-determined router */ + return lp; + + if (lpni_best == NULL) { + best_route = last_route = route; + lpni_best = lp; + continue; + } + + /* no protection on below fields, but it's harmless */ + if (last_route->lr_seq - route->lr_seq < 0) + last_route = route; + + rc = lnet_compare_routes(route, best_route); + if (rc < 0) + continue; + + best_route = route; + lpni_best = lp; + } + + /* set sequence number on the best router to the latest sequence + 1 + * so we can round-robin all routers, it's race and inaccurate but + * harmless and functional */ + if (best_route != NULL) + best_route->lr_seq = last_route->lr_seq + 1; + return lpni_best; +} + +static struct lnet_ni * +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, + int md_cpt) +{ + struct lnet_ni *ni = NULL, *best_ni = cur_ni; + unsigned int shortest_distance; + int best_credits; + + if (best_ni == NULL) { + shortest_distance = UINT_MAX; + best_credits = INT_MIN; + } else { + shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, + best_ni->ni_dev_cpt); + 
best_credits = atomic_read(&best_ni->ni_tx_credits); + } + + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + unsigned int distance; + int ni_credits; + + if (!lnet_is_ni_healthy_locked(ni)) + continue; + + ni_credits = atomic_read(&ni->ni_tx_credits); + + /* + * calculate the distance from the CPT on which + * the message memory is allocated to the CPT of + * the NI's physical device + */ + distance = cfs_cpt_distance(lnet_cpt_table(), + md_cpt, + ni->ni_dev_cpt); + + /* + * All distances smaller than the NUMA range + * are treated equally. + */ + if (distance < lnet_numa_range) + distance = lnet_numa_range; + + /* + * Select on shorter distance, then available + * credits, then round-robin. + */ + if (distance > shortest_distance) { + continue; + } else if (distance < shortest_distance) { + shortest_distance = distance; + } else if (ni_credits < best_credits) { + continue; + } else if (ni_credits == best_credits) { + if (best_ni && (best_ni)->ni_seq <= ni->ni_seq) + continue; + } + best_ni = ni; + best_credits = ni_credits; + } + + return best_ni; +} + +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + struct lnet_ni *best_ni; + struct lnet_peer_ni *best_lpni; + struct lnet_peer_ni *best_gw; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *final_dst; + struct lnet_peer *peer; + struct lnet_peer_net *peer_net; + struct lnet_net *local_net; + __u32 seq; + int cpt, cpt2, rc; + bool routing; + bool routing2; + bool ni_is_pref; + bool preferred; + bool local_found; + int best_lpni_credits; + int md_cpt; + + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. If none, + * then we proceed, if there is, then we restart the operation. + */ + cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + +again: + best_ni = NULL; + best_lpni = NULL; + best_gw = NULL; + final_dst = NULL; + local_net = NULL; + routing = false; + routing2 = false; + local_found = false; + + seq = lnet_get_dlc_seq_locked(); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + peer = lnet_find_or_create_peer_locked(dst_nid, cpt); + if (IS_ERR(peer)) { + lnet_net_unlock(cpt); + return PTR_ERR(peer); + } + + /* If peer is not healthy then can not send anything to it */ + if (!lnet_is_peer_healthy_locked(peer)) { + lnet_net_unlock(cpt); + return -EHOSTUNREACH; + } + + if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) { + lnet_net_unlock(cpt); + CERROR("peer %s is declared to be non MR capable, " + "yet configured with more than one NID\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } + + /* + * STEP 1: first jab at determining best_ni + * if src_nid is explicitly specified, then best_ni is already + * pre-determiend for us. 
Otherwise we need to select the best + * one to use later on + */ + if (src_nid != LNET_NID_ANY) { + best_ni = lnet_nid2ni_locked(src_nid, cpt); + if (!best_ni) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + } + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK || + !peer->lp_multi_rail || + best_ni) { + /* + * for replies we want to respond on the same peer_ni we + * received the message on if possible. If not, then pick + * a peer_ni to send to + * + * if the peer is non-multi-rail then you want to send to + * the dst_nid provided as well. + * + * If the best_ni has already been determined, IE the + * src_nid has been specified, then use the + * destination_nid provided as well, since we're + * continuing a series of related messages for the same + * RPC. + * + * It is expected to find the lpni using dst_nid, since we + * created it earlier. + */ + best_lpni = lnet_find_peer_ni_locked(dst_nid); + if (best_lpni) + lnet_peer_ni_decref_locked(best_lpni); + + if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) { + /* + * this lpni is not on a local network so we need + * to route this reply. + */ + best_gw = lnet_find_route_locked(NULL, + best_lpni->lpni_nid, + rtr_nid); + if (best_gw) { + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; + + /* + * if the router is not multi-rail then use the best_gw + * found to send the message to + */ + if (!peer->lp_multi_rail) + best_lpni = best_gw; + else + best_lpni = NULL; + + routing = true; + } else { + best_lpni = NULL; + } + } else if (!best_lpni) { + lnet_net_unlock(cpt); + CERROR("unable to send msg_type %d to " + "originating %s. Destination NID not in DB\n", + msg->msg_type, libcfs_nid2str(dst_nid)); + return -EINVAL; + } + } + + /* + * if the peer is not MR capable, then we should always send to it + * using the first NI in the NET we determined. + */ + if (!peer->lp_multi_rail) { + if (!best_lpni) { + lnet_net_unlock(cpt); + CERROR("no route to %s\n", + libcfs_nid2str(dst_nid)); + return -EHOSTUNREACH; + } + + /* best ni could be set because src_nid was provided */ + if (!best_ni) { + best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt); + if (!best_ni) { + lnet_net_unlock(cpt); + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + } + } + + /* + * if we already found a best_ni because src_nid is specified and + * best_lpni because we are replying to a message then just send + * the message + */ + if (best_ni && best_lpni) + goto send; + + /* + * If we already found a best_ni because src_nid is specified then + * pick the peer then send the message + */ + if (best_ni) + goto pick_peer; + + /* + * pick the best_ni by going through all the possible networks of + * that peer and see which local NI is best suited to talk to that + * peer. + * + * Locally connected networks will always be preferred over + * a routed network. If there are only routed paths to the peer, + * then the best route is chosen. If all routes are equal then + * they are used in round robin. 
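+ * As an illustrative example of that ordering (following
+ * lnet_compare_routes() above): given two routes to the same remote
+ * net, the one with the lower lr_priority wins; on a priority tie the
+ * route with fewer hops wins; if hops also tie, lnet_compare_peers()
+ * prefers the gateway with the smaller queued byte count
+ * (lpni_txqnob) and then the larger lpni_txcredits, and any remaining
+ * tie is broken round-robin via lr_seq.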
+ */ + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (!lnet_is_peer_net_healthy_locked(peer_net)) + continue; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net && !routing && !local_found) { + struct lnet_peer_ni *net_gw; + + lpni = list_entry(peer_net->lpn_peer_nis.next, + struct lnet_peer_ni, + lpni_on_peer_net_list); + + net_gw = lnet_find_route_locked(NULL, + lpni->lpni_nid, + rtr_nid); + if (!net_gw) + continue; + + if (best_gw) { + /* + * lnet_find_route_locked() call + * will return the best_Gw on the + * lpni->lpni_nid network. + * However, best_gw and net_gw can + * be on different networks. + * Therefore need to compare them + * to pick the better of either. + */ + if (lnet_compare_peers(best_gw, net_gw) > 0) + continue; + if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq) + continue; + } + best_gw = net_gw; + final_dst = lpni; + + routing2 = true; + } else { + best_gw = NULL; + final_dst = NULL; + routing2 = false; + local_found = true; + } + + /* + * a gw on this network is found, but there could be + * other better gateways on other networks. So don't pick + * the best_ni until we determine the best_gw. + */ + if (best_gw) + continue; + + /* if no local_net found continue */ + if (!local_net) + continue; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. Round Robin + */ + best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt); + } + + if (!best_ni && !best_gw) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No local ni found to send from to %s\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } + + if (!best_ni) { + best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt); + LASSERT(best_gw && best_ni); + + /* + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + best_gw->lpni_gw_seq++; + peer = best_gw->lpni_peer_net->lpn_peer; + } + + /* + * Now that we selected the NI to use increment its sequence + * number so the Round Robin algorithm will detect that it has + * been used and pick the next NI. + */ + best_ni->ni_seq++; + +pick_peer: + /* + * At this point the best_ni is on a local network on which + * the peer has a peer_ni as well + */ + peer_net = lnet_peer_get_net_locked(peer, + best_ni->ni_net->net_id); + /* + * peer_net is not available or the src_nid is explicitly defined + * and the peer_net for that src_nid is unhealthy. find a route to + * the destination nid. + */ + if (!peer_net || + (src_nid != LNET_NID_ANY && + !lnet_is_peer_net_healthy_locked(peer_net))) { + best_gw = lnet_find_route_locked(best_ni->ni_net, + dst_nid, + rtr_nid); + /* + * if no route is found for that network then + * move onto the next peer_ni in the peer + */ + if (!best_gw) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to peer from %s\n", + libcfs_nid2str(best_ni->ni_nid)); + return -EHOSTUNREACH; + } + + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(best_gw->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); + + routing2 = true; + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. 
+ */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; + } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { + /* + * this peer_net is unhealthy but we still have an opportunity + * to find another peer_net that we can use + */ + __u32 net_id = peer_net->lpn_net_id; + LCONSOLE_WARN("peer net %s unhealthy\n", + libcfs_net2str(net_id)); + goto again; + } + + /* + * Look at the peer NIs for the destination peer that connect + * to the chosen net. If a peer_ni is preferred when using the + * best_ni to communicate, we use that one. If there is no + * preferred peer_ni, or there are multiple preferred peer_ni, + * the available transmit credits are used. If the transmit + * credits are equal, we round-robin over the peer_ni. + */ + lpni = NULL; + best_lpni_credits = INT_MIN; + preferred = false; + best_lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { + /* + * if this peer ni is not healthy just skip it, no point in + * examining it further + */ + if (!lnet_is_peer_ni_healthy_locked(lpni)) + continue; + ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + + /* if this is a preferred peer use it */ + if (!preferred && ni_is_pref) { + preferred = true; + } else if (preferred && !ni_is_pref) { + /* + * this is not the preferred peer so let's ignore + * it. + */ + continue; + } else if (lpni->lpni_txcredits < best_lpni_credits) { + /* + * We already have a peer that has more credits + * available than this one. No need to consider + * this peer further. + */ + continue; + } else if (lpni->lpni_txcredits == best_lpni_credits) { + /* + * The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round + * Robin + */ + if (best_lpni) { + if (best_lpni->lpni_seq <= lpni->lpni_seq) + continue; + } + } + + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; + } + + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = (peer_net) ? peer_net->lpn_net_id : + LNET_NIDNET(dst_nid); + lnet_net_unlock(cpt); + LCONSOLE_WARN("no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + return -EHOSTUNREACH; + } + + +send: + /* Shortcut for loopback. */ + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(best_ni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = best_ni; + lnet_net_unlock(cpt); + + return LNET_CREDIT_OK; + } + + routing = routing || routing2; + + /* + * Increment sequence number of the peer selected so that we + * pick the next one in Round Robin. + */ + best_lpni->lpni_seq++; + + /* + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. + */ + lnet_peer_ni_addref_locked(best_lpni); + + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. 
+ */ + cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + if (seq != lnet_get_dlc_seq_locked()) { + lnet_peer_ni_decref_locked(best_lpni); + goto again; + } + } + + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; + + /* + * grab a reference for the best_ni since now it's in use in this + * send. the reference will need to be dropped when the message is + * finished in lnet_finalize() + */ + lnet_ni_addref_locked(msg->msg_txni, cpt); + + /* + * Always set the target.nid to the best peer picked. Either the + * nid will be one of the preconfigured NIDs, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, cpt); + + /* + * If we are routing the message then we don't need to overwrite + * the src_nid since it would've been set at the origin. Otherwise + * we are the originator so we need to set it. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; + /* + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + */ + msg->msg_hdr.dest_nid = + cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid); + } else { + /* + * if we're not routing set the dest_nid to the best peer + * ni that we picked earlier in the algorithm. 
+ */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + } + + rc = lnet_post_send_locked(msg, 0); + + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + libcfs_nid2str(dst_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); + + lnet_net_unlock(cpt); + + return rc; +} + +int +lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; + + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) + return rc; + + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ + return 0; +} + +void +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob) +{ + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + bool ready_delay; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + /* Primary peer NID. 
*/ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); + + msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL; + ready_delay = msg->msg_rx_ready_delay; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (ready_delay) + /* no eager_recv or has already called it, should + * have been attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) { + ready_delay = true; + goto again; + } + /* fall through */ + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match %llu" + " offset %d length %d: %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return -ENOENT; /* -ve: OK but no match */ + } +} + +static int +lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) +{ + struct lnet_match_info info; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id source_id; + struct lnet_handle_wire reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + source_id.nid = hdr->src_nid; + source_id.pid = hdr->src_pid; + /* Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match %llu" + " offset %d length %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, source_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), + libcfs_id2str(info.mi_id), rc); + + lnet_finalize(msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) +{ + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id src = {0}; + struct lnet_libmd *md; + int rlength; + int mlength; + int cpt; + + cpt = 
lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + LASSERT(md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = MIN(rlength, (int)md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD %#llx would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return -ENOENT; /* -ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id src = {0}; + struct lnet_libmd *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return -ENOENT; /* -ve! 
*/ + } + + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +/** + * \retval LNET_CREDIT_OK If \a msg is forwarded + * \retval LNET_CREDIT_WAIT If \a msg is blocked because w/o buffer + * \retval -ve error code + */ +int +lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc = 0; + + if (!the_lnet.ln_routing) + return -ECANCELED; + + if (msg->msg_rxpeer->lpni_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +int +lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg) +{ + int rc; + + switch (msg->msg_type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, msg->msg_rdma_get); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: /* prevent an unused label if !kernel */ + LASSERT(0); + return -EPROTO; + } + + LASSERT(rc == 0 || rc == -ENOENT); + return rc; +} + +char * +lnet_msgtyp2str (int type) +{ + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} + +void +lnet_print_hdr(struct lnet_hdr *hdr) +{ + struct lnet_process_id src = { + .nid = hdr->src_nid, + .pid = hdr->src_pid, + }; + struct lnet_process_id dst = { + .nid = hdr->dest_nid, + .pid = hdr->dest_pid, + }; + char *type_str = lnet_msgtyp2str(hdr->type); + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md %#llx.%#llx, " + "match bits %llu\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data %#llx\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md %#llx.%#llx, " + "match bits %llu\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md %#llx.%#llx, " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md %#llx.%#llx, " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } + +} + +int +lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) +{ + int rc = 0; + int cpt; + int 
for_me; + struct lnet_msg *msg; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + struct lnet_peer_ni *lpni; + __u32 payload_length; + __u32 type; + + LASSERT (!in_interrupt ()); + + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = (ni->ni_nid == dest_nid); + cpt = lnet_cpt_of_nid(from_nid, ni); + + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s\n", + libcfs_nid2str(dest_nid), + libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > + (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + if (the_lnet.ln_routing && + ni->ni_last_alive != cfs_time_current_sec()) { + /* NB: so far here is the only place to set NI status to "up */ + lnet_ni_lock(ni); + ni->ni_last_alive = cfs_time_current_sec(); + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + lnet_ni_unlock(ni); + } + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(src_nid, 0)) { /* shall we now? 
*/ + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + if (!list_empty(&the_lnet.ln_drop_rules) && + lnet_drop_rule_match(hdr)) { + CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate " + "silent message loss\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid), lnet_msgtyp2str(type)); + goto drop; + } + + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, + * pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_rdma_get = rdma_req; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + + } else { + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = dest_pid; + msg->msg_hdr.payload_length = payload_length; + } + + lnet_net_lock(cpt); + lpni = lnet_nid2peerni_locked(from_nid, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + CERROR("%s, src %s: Dropping %s " + "(error %ld looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), PTR_ERR(lpni)); + lnet_msg_free(msg); + if (PTR_ERR(lpni) == -ESHUTDOWN) + /* We are shutting down. Don't do anything more */ + return 0; + goto drop; + } + msg->msg_rxpeer = lpni; + msg->msg_rxni = ni; + lnet_ni_addref_locked(ni, cpt); + /* Multi-Rail: Primary NID of source. */ + msg->msg_initiator = lnet_peer_primary_nid_locked(src_nid); + + if (lnet_isrouter(msg->msg_rxpeer)) { + lnet_peer_set_alive(msg->msg_rxpeer); + if (avoid_asym_router_failure && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + /* received a remote message from router, update + * remote NI status on this router. + * NB: multi-hop routed message will be ignored. 
+ */ + lnet_router_ni_update_locked(msg->msg_rxpeer, + LNET_NIDNET(src_nid)); + } + } + + lnet_msg_commit(msg, cpt); + + /* message delay simulation */ + if (unlikely(!list_empty(&the_lnet.ln_delay_rules) && + lnet_delay_rule_match_locked(hdr, msg))) { + lnet_net_unlock(cpt); + return 0; + } + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + + if (rc == LNET_CREDIT_OK) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + rc = lnet_parse_local(ni, msg); + if (rc != 0) + goto free_drop; + return 0; + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + struct lnet_process_id id = {0}; + struct lnet_msg *msg; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match %llu" + " offset %d length %d: %s\n", + libcfs_id2str(id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, + msg->msg_private, msg->msg_len); + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct lnet_msg *msg; + struct lnet_process_id id; + + msg = list_entry(head->next, struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxni, msg); + } +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. 
In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see struct lnet_event::hdr_data and lnet_event_kind_t. + */ +int +LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, + struct lnet_process_id target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) { /* shall we now? */ + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target)); + return -ENOMEM; + } + msg->msg_vmflush = !!memory_pressure_get(); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? 
-1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +/* + * The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. + * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first + */ +struct lnet_msg * +lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) +{ + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + struct lnet_process_id peer_id = getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + if (msg == NULL) { + CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + goto drop; + } + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT(getmd->md_refcount > 0); + + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_initiator = getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid; + msg->msg_from = peer_id.nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id.nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_cpt_of_nid(peer_id.nid, ni); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_cpt_of_nid(peer_id.nid, ni); + + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} 
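To make the calling convention above concrete, the following is a minimal, hypothetical caller of LNetPut(); it is an illustrative sketch rather than part of this hunk. It assumes an MD handle that was already bound as a "free floating" MD (see LNetMDBind(), not shown here); the portal index, match bits and include path are made-up placeholders.

#define DEBUG_SUBSYSTEM S_LNET			/* required by the libcfs CERROR() macro */

#include <lnet/lib-lnet.h>			/* assumed header; the hunk's own #include targets were lost */

#define EXAMPLE_PORTAL		17		/* made-up portal index */
#define EXAMPLE_MATCH_BITS	0x1234ULL	/* made-up match bits */

static int example_put(struct lnet_handle_md mdh,
		       struct lnet_process_id target)
{
	int rc;

	rc = LNetPut(LNET_NID_ANY,		/* let LNet pick the local NI */
		     mdh,			/* MD describing the outgoing buffer */
		     LNET_ACK_REQ,		/* ask the target for an ACK */
		     target,			/* destination process id */
		     EXAMPLE_PORTAL,		/* portal index at the target */
		     EXAMPLE_MATCH_BITS,	/* match bits for MD selection */
		     0,				/* offset into the target MD */
		     0);			/* opaque hdr_data */
	if (rc != 0)
		/* -EIO, -ENOMEM or -ENOENT: no events will be generated */
		CERROR("PUT to %s failed: %d\n",
		       libcfs_id2str(target), rc);

	/* On success, LNET_EVENT_SEND and (because LNET_ACK_REQ was passed)
	 * LNET_EVENT_ACK are delivered through the EQ attached to the MD. */
	return rc;
}

The same pattern applies to LNetGet() further below, with LNET_EVENT_REPLY delivered to the initiator in place of the ACK.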
+EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *reply, + unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ +int +LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, + struct lnet_process_id target, unsigned int portal, + __u64 match_bits, unsigned int offset) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT(the_lnet.ln_refcount > 0); + + if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer(target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? 
-1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. + */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni = NULL; + struct lnet_remotenet *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 0 : 1; + } + + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + /* Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. 
*/ + if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; + + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return 1; + } + + order++; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, struct lnet_remotenet, lrn_list); + + if (rnet->lrn_net == dstnet) { + struct lnet_route *route; + struct lnet_route *shortest = NULL; + __u32 shortest_hops = LNET_UNDEFINED_HOPS; + __u32 route_hops; + + LASSERT(!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + route_hops = route->lr_hops; + if (route_hops == LNET_UNDEFINED_HOPS) + route_hops = 1; + if (shortest == NULL || + route_hops < shortest_hops) { + shortest = route; + shortest_hops = route_hops; + } + } + + LASSERT(shortest != NULL); + hops = shortest_hops; + if (srcnidp != NULL) { + ni = lnet_get_next_ni_locked( + shortest->lr_gateway->lpni_net, + NULL); + *srcnidp = ni->ni_nid; + } + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c new file mode 100644 index 0000000000000..1b90855375a20 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -0,0 +1,641 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +void +lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev) +{ + ENTRY; + + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); + EXIT; +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) +{ + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = le64_to_cpu(hdr->dest_nid); + ev->target.pid = le32_to_cpu(hdr->dest_pid); + ev->initiator.nid = LNET_NID_ANY; + ev->initiator.pid = the_lnet.ln_pid; + ev->source.nid = LNET_NID_ANY; + ev->source.pid = the_lnet.ln_pid; + ev->sender = LNET_NID_ANY; + + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + /* Multi-Rail: resolve src_nid to "primary" peer NID */ + ev->initiator.nid = msg->msg_initiator; + /* Multi-Rail: track source NID. */ + ev->source.pid = hdr->src_pid; + ev->source.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(struct lnet_msg *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + struct lnet_counters *counters = the_lnet.ln_counters[cpt]; + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add(&msg->msg_activelist, &container->msc_active); + + 
counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; +} + +static void +lnet_msg_decommit_tx(struct lnet_msg *msg, int status) +{ + struct lnet_counters *counters; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + counters->route_length += msg->msg_len; + counters->route_count++; + goto incr_stats; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + counters->send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + counters->send_count++; + +incr_stats: + if (msg->msg_txpeer) + atomic_inc(&msg->msg_txpeer->lpni_stats.send_count); + if (msg->msg_txni) + atomic_inc(&msg->msg_txni->ni_stats.send_count); + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(struct lnet_msg *msg, int status) +{ + struct lnet_counters *counters; + struct lnet_event *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto incr_stats; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + counters->send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + counters->recv_count++; + +incr_stats: + if (msg->msg_rxpeer) + atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count); + if (msg->msg_rxni) + atomic_inc(&msg->msg_rxni->ni_stats.recv_count); + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + counters->recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving 
and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. */ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* committed for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev.md); +} + +void +lnet_msg_detach_md(struct lnet_msg *msg, int status) +{ + struct lnet_libmd *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static int +lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) +{ + struct lnet_handle_wire ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT(msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.source, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + /* NB: we probably want to use NID of msg::msg_from as 3rd + * parameter (router NID) if it's routed message */ + rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. 
+ * + * Also, there is possibility that message is committed for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. + */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free(msg); + return 0; +} + +void +lnet_finalize(struct lnet_msg *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + int i; + + LASSERT(!in_interrupt()); + + if (msg == NULL) + return; + + msg->msg_ev.status = status; + + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, status); + lnet_res_unlock(cpt); + } + + again: + rc = 0; + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not committed to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be committed for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + list_add_tail(&msg->msg_list, &container->msc_finalizing); + + /* Recursion breaker. Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + + my_slot = -1; + for (i = 0; i < container->msc_nfinalizers; i++) { + if (container->msc_finalizers[i] == current) + break; + + if (my_slot < 0 && container->msc_finalizers[i] == NULL) + my_slot = i; + } + + if (i < container->msc_nfinalizers || my_slot < 0) { + lnet_net_unlock(cpt); + return; + } + + container->msc_finalizers[my_slot] = current; + + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + struct lnet_msg, msg_list); + + list_del(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + if (unlikely(!list_empty(&the_lnet.ln_delay_rules))) { + lnet_net_unlock(cpt); + lnet_delay_rule_check(); + lnet_net_lock(cpt); + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + struct lnet_msg *msg; + + msg = list_entry(container->msc_active.next, + struct lnet_msg, msg_activelist); + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + LIBCFS_FREE(container->msc_finalizers, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + container->msc_finalizers = NULL; + } + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc = 0; + + container->msc_init = 1; + + 
INIT_LIST_HEAD(&container->msc_active); + INIT_LIST_HEAD(&container->msc_finalizing); + + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + if (container->msc_nfinalizers == 0) + container->msc_nfinalizers = 1; + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + return rc; + } + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c new file mode 100644 index 0000000000000..3773ed9e2436c --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -0,0 +1,983 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +module_param(portal_rotor, int, 0644); +MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, struct lnet_process_id match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = ignore_bits == 0 && + match_id.nid != LNET_NID_ANY && + match_id.pid != LNET_PID_ANY; + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if (LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(struct lnet_libmd *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + struct lnet_me *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != info->mi_id.nid) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... 
*/ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match %llu" + " length %d too big: %d left, %d allowed\n", + libcfs_id2str(info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md %#llx [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_id2str(info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, struct lnet_process_id id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? + ptl->ptl_mtables[lnet_cpt_of_nid(id.nid, NULL)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, struct lnet_process_id id, + __u64 mbits, __u64 ignore_bits, enum lnet_ins_pos pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + unsigned int nmaps; + unsigned int rotor; + unsigned int cpt; + bool routed; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NIDNET(msg->msg_hdr.src_nid) != + LNET_NIDNET(msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = 
lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = info->mi_cpt; + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? */ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + struct lnet_process_id id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + id.nid + id.pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash & LNET_MT_HASH_MASK]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + struct lnet_me *me; + struct lnet_me *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? 
*/ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* + * Steal buffer from other CPTs, and delay msg if nothing to + * steal. This function is more expensive than a regular + * match, but we don't expect it can happen a lot. The return + * code contains one of LNET_MATCHMD_OK, LNET_MATCHMD_DROP, or + * LNET_MATCHMD_NONE. + */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { + /* The first try, add to stealing list. */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { + /* On stealing list. */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* Match found, remove from stealing list. 
*/ + list_del_init(&msg->msg_list); + } else if (i == LNET_CPT_NUMBER - 1 || /* (1) */ + ptl->ptl_mt_nmaps == 0 || /* (2) */ + (ptl->ptl_mt_nmaps == 1 && /* (3) */ + ptl->ptl_mt_maps[0] == cpt)) { + /* + * No match found, and this is either + * (1) the last cpt to check, or + * (2) there is no active cpt, or + * (3) this is the only active cpt. + * There is nothing to steal: delay or + * drop the message. + */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } else { + /* Do another iteration. */ + rc = 0; + } + } else { + /* + * No longer on stealing list: another thread + * matched the message in lnet_ptl_attach_md(). + * We are now expected to handle the message. + */ + rc = msg->msg_md == NULL ? + LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + /* + * Note that test (1) above ensures that we always + * exit the loop through this break statement. + * + * LNET_MATCHMD_NONE means msg was added to the + * delayed queue, and we may no longer reference it + * after lnet_ptl_unlock() and lnet_res_unlock(). + */ + if (rc & (LNET_MATCHMD_FINISH | LNET_MATCHMD_NONE)) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, "Request from %s of length %d into portal %d " + "MB=%#llx\n", libcfs_id2str(info->mi_id), + info->mi_rlength, info->mi_portal, info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + rc = LNET_MATCHMD_NONE; + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + /* LNET_MATCHMD_NONE means msg was added to the delay queue */ + if (rc & LNET_MATCHMD_NONE) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", + libcfs_id2str(info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + struct lnet_msg *tmp; + struct lnet_msg *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + struct lnet_hdr *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + /* Multi-Rail: Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! 
This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match %llu offset %d length %d.\n", + libcfs_id2str(info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +static void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + struct lnet_me *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + struct lnet_me, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + lnet_me_free(me); + } + } + /* the extra entry is for MEs with ignore bits */ + LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +static int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + + cfs_array_free(the_lnet.ln_portals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int size; + int i; + + size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); + + the_lnet.ln_nportals = MAX_PORTALS; + the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals 
table\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * It would prevent dropped requests, however it should be regarded as the + * last line of defense - i.e. users must keep a close watch on active + * buffers on a lazy portal and once it becomes too low post more buffers as + * soon as possible. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +int +lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason) +{ + struct lnet_portal *ptl; + struct list_head zombies = LIST_HEAD_INIT(zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (ni != NULL) { + struct lnet_msg *msg, *tmp; + + /* grab all messages which are on the NI passed in */ + list_for_each_entry_safe(msg, tmp, &ptl->ptl_msg_delayed, + msg_list) { + if (msg->msg_txni == ni || msg->msg_rxni == ni) + list_move(&msg->msg_list, &zombies); + } + } else { + if (the_lnet.ln_state != LNET_STATE_RUNNING) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, reason); + + return 0; +} + +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. 
+ */ +int +LNetClearLazyPortal(int portal) +{ + return lnet_clear_lazy_portal(NULL, portal, + "Clearing lazy portal attr"); +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c new file mode 100644 index 0000000000000..a0fcec9d8a444 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -0,0 +1,649 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include +/* For sys_open & sys_close */ +#include +#include + +#include +#include + +static int +kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) +{ + mm_segment_t oldfs = get_fs(); + int err; + + set_fs(KERNEL_DS); + err = filp->f_op->unlocked_ioctl(filp, cmd, arg); + set_fs(oldfs); + + return err; +} + +static int +lnet_sock_ioctl(int cmd, unsigned long arg) +{ + struct file *sock_filp; + struct socket *sock; + int fd = -1; + int rc; + +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); +#endif + if (rc != 0) { + CERROR("Can't create socket: %d\n", rc); + return rc; + } + +#if !defined(HAVE_SOCK_ALLOC_FILE) && !defined(HAVE_SOCK_ALLOC_FILE_3ARGS) + fd = sock_map_fd(sock, 0); + if (fd < 0) { + rc = fd; + sock_release(sock); + goto out; + } + sock_filp = fget(fd); +#else +# ifdef HAVE_SOCK_ALLOC_FILE_3ARGS + sock_filp = sock_alloc_file(sock, 0, NULL); +# else + sock_filp = sock_alloc_file(sock, 0); +# endif +#endif + if (IS_ERR(sock_filp)) { + rc = PTR_ERR(sock_filp); + sock_release(sock); + goto out; + } + + rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg); + + fput(sock_filp); +out: + if (fd >= 0) + sys_close(fd); + return rc; +} + +int +lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct ifreq ifr; + int nob; + int rc; + __u32 val; + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + return -EINVAL; + } + + CLASSERT(sizeof(ifr.ifr_name) >= IFNAMSIZ); + + if (strlen(name) > sizeof(ifr.ifr_name)-1) + return -E2BIG; + strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + + rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + return rc; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, 
"Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + return 0; + } + *up = 1; + + if (strlen(name) > sizeof(ifr.ifr_name)-1) + return -E2BIG; + strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + + ifr.ifr_addr.sa_family = AF_INET; + rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + if (strlen(name) > sizeof(ifr.ifr_name)-1) + return -E2BIG; + strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + + ifr.ifr_addr.sa_family = AF_INET; + rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; + *mask = ntohl(val); + + return 0; +} +EXPORT_SYMBOL(lnet_ipif_query); + +void +lnet_ipif_free_enumeration(char **names, int n) +{ + int i; + + LASSERT(n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + LIBCFS_FREE(names[i], IFNAMSIZ); + + LIBCFS_FREE(names, n * sizeof(*names)); +} +EXPORT_SYMBOL(lnet_ipif_free_enumeration); + +int +lnet_ipif_enumerate(char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > PAGE_SIZE) { + toobig = 1; + nalloc = PAGE_SIZE / sizeof(*ifr); + CWARN("Too many interfaces: only enumerating " + "first %d\n", nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR("ENOMEM enumerating up to %d interfaces\n", + nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); + if (rc < 0) { + CERROR("Error %d enumerating interfaces\n", rc); + goto out1; + } + + LASSERT(rc == 0); + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT(nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + + if (nfound == 0) + goto out1; + + LIBCFS_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + + for (i = 0; i < nfound; i++) { + nob = strnlen(ifr[i].ifr_name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + LIBCFS_ALLOC(names[i], IFNAMSIZ); + if (names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + + out2: + if (rc < 0) + lnet_ipif_free_enumeration(names, nfound); + out1: + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + out0: + return rc; +} +EXPORT_SYMBOL(lnet_ipif_enumerate); + +int +lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); + unsigned long then; + struct timeval tv; + + LASSERT(nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + /* Set send timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = jiffies_left / + msecs_to_jiffies(MSEC_PER_SEC), + .tv_usec = ((jiffies_left % + msecs_to_jiffies(MSEC_PER_SEC)) * + USEC_PER_SEC) / + msecs_to_jiffies(MSEC_PER_SEC) + }; + + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof(tv)); + if (rc != 0) { + CERROR("Can't set socket send timeout " + "%ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + } + + then = jiffies; + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); + jiffies_left -= jiffies - then; + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR("Unexpected zero rc\n"); + return -ECONNABORTED; + } + + if (jiffies_left <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_write); + +int +lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); + unsigned long then; + struct timeval tv; + + LASSERT(nob > 0); + LASSERT(jiffies_left > 0); + + for (;;) { + struct kvec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_flags = 0 + }; + + /* Set receive timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = jiffies_left / msecs_to_jiffies(MSEC_PER_SEC), + .tv_usec = ((jiffies_left % + msecs_to_jiffies(MSEC_PER_SEC)) * + USEC_PER_SEC) / + msecs_to_jiffies(MSEC_PER_SEC) + }; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); + if (rc != 0) { + CERROR("Can't set socket recv timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + + then = jiffies; + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); + jiffies_left -= jiffies - then; + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (jiffies_left <= 0) + return -ETIMEDOUT; + } +} +EXPORT_SYMBOL(lnet_sock_read); + +static int +lnet_sock_create(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + int option; + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); +#endif + *sockp = sock; + if (rc != 0) { + CERROR("Can't create socket: %d\n", rc); + return rc; + } + + option = 1; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? 
+ INADDR_ANY : htonl(local_ip); + + rc = kernel_bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + return 0; + +failed: + sock_release(sock); + return rc; +} + +int +lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize) +{ + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set send buffer %d: %d\n", + option, rc); + return rc; + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof(option)); + if (rc != 0) { + CERROR("Can't set receive buffer %d: %d\n", + option, rc); + return rc; + } + } + return 0; +} +EXPORT_SYMBOL(lnet_sock_setbuf); + +int +lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int len = sizeof(sin); + int rc; + + if (remote) + rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &len); + else + rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &len); + if (rc != 0) { + CERROR("Error %d getting sock %s IP/port\n", + rc, remote ? "peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl(sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs(sin.sin_port); + + return 0; +} +EXPORT_SYMBOL(lnet_sock_getaddr); + +int +lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) +{ + if (txbufsize != NULL) + *txbufsize = sock->sk->sk_sndbuf; + + if (rxbufsize != NULL) + *rxbufsize = sock->sk->sk_rcvbuf; + + return 0; +} +EXPORT_SYMBOL(lnet_sock_getbuf); + +int +lnet_sock_listen(struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + + rc = lnet_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = kernel_listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +#ifndef HAVE_SK_SLEEP +static inline wait_queue_head_t *sk_sleep(struct sock *sk) +{ + return sk->sk_sleep; +} +#endif + +int +lnet_sock_accept(struct socket **newsockp, struct socket *sock) +{ + wait_queue_entry_t wait; + struct socket *newsock; + int rc; + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); + if (rc) { + CERROR("Can't allocate socket\n"); + return rc; + } + + newsock->ops = sock->ops; + +#ifdef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG + rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); +#else + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); +#endif + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + init_waitqueue_entry(&wait, current); + add_wait_queue(sk_sleep(sock->sk), &wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + remove_wait_queue(sk_sleep(sock->sk), &wait); +#ifdef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG + rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); +#else + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); +#endif + } + + if (rc != 0) + goto failed; + + *newsockp = newsock; + return 0; + +failed: + sock_release(newsock); + return rc; +} + 
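The listen and accept helpers above are meant to be used together. The following is an illustrative sketch, not part of the patch, of how a caller might accept a single connection with lnet_sock_listen() and lnet_sock_accept(); the function name example_accept_one() and the backlog of 1 are hypothetical choices for the example:

	/* Illustrative sketch: accept one connection, then release both sockets. */
	static int example_accept_one(__u32 local_ip, int local_port)
	{
		struct socket *listener;
		struct socket *peer;
		int rc;

		/* bind and listen on local_ip:local_port; backlog of 1 is arbitrary here */
		rc = lnet_sock_listen(&listener, local_ip, local_port, 1);
		if (rc != 0)
			return rc;

		/* may sleep until a peer connects or the wait is interrupted */
		rc = lnet_sock_accept(&peer, listener);
		if (rc == 0)
			sock_release(peer);

		sock_release(listener);
		return rc;
	}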
+int +lnet_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = lnet_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = kernel_connect(*sockp, (struct sockaddr *)&srvaddr, + sizeof(srvaddr), 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc, + &local_ip, local_port, &peer_ip, peer_port); + + sock_release(*sockp); + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c new file mode 100644 index 0000000000000..eaa06fb41631d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -0,0 +1,114 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +static int +lolnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) +{ + LASSERT(!lntmsg->msg_routing); + LASSERT(!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +static int +lolnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + struct lnet_msg *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(lntmsg, 0); + } + + lnet_finalize(sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +static void +lolnd_shutdown(struct lnet_ni *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT(lolnd_instanced); + + lolnd_instanced = 0; +} + +static int +lolnd_startup(struct lnet_ni *ni) +{ + LASSERT (ni->ni_net->net_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +struct lnet_lnd the_lolnd = { + .lnd_list = { + .next = &the_lolnd.lnd_list, + .prev = &the_lolnd.lnd_list + }, + .lnd_type = LOLND, + .lnd_startup = lolnd_startup, + .lnd_shutdown = lolnd_shutdown, + .lnd_send = lolnd_send, + .lnd_recv = lolnd_recv +}; diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c new file mode 100644 index 0000000000000..a7190dd79d002 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -0,0 +1,261 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +static int config_on_load = 0; +module_param(config_on_load, int, 0444); +MODULE_PARM_DESC(config_on_load, "configure network at module load"); + +static struct mutex lnet_config_mutex; + +static int +lnet_configure(void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + mutex_lock(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = try_module_get(THIS_MODULE); + + if (rc != 1) + goto out; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } else { + module_put(THIS_MODULE); + } + } + +out: + mutex_unlock(&lnet_config_mutex); + return rc; +} + +static int +lnet_unconfigure (void) +{ + int refcount; + + mutex_lock(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + module_put(THIS_MODULE); + } + + mutex_lock(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + mutex_unlock(&the_lnet.ln_api_mutex); + + mutex_unlock(&lnet_config_mutex); + + return (refcount == 0) ? 0 : -EBUSY; +} + +static int +lnet_dyn_configure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *)hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_net(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_net(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_data *conf = + (struct lnet_ioctl_config_data *) hdr; + int rc; + + if (conf->cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_net(conf->cfg_net); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_configure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *)hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_add_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) +{ + struct lnet_ioctl_config_ni *conf = + (struct lnet_ioctl_config_ni *) hdr; + int rc; + + if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf)) + return -EINVAL; + + mutex_lock(&lnet_config_mutex); + if (the_lnet.ln_niinit_self) + rc = lnet_dyn_del_ni(conf); + else + rc = -EINVAL; + mutex_unlock(&lnet_config_mutex); + + return rc; +} + +static int +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) +{ + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: { + struct libcfs_ioctl_data *data = + (struct libcfs_ioctl_data *)hdr; + + if (data->ioc_hdr.ioc_len < sizeof(*data)) + return -EINVAL; + + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + return lnet_configure(NULL); + } + + case IOC_LIBCFS_UNCONFIGURE: + return lnet_unconfigure(); + + case IOC_LIBCFS_ADD_NET: + return lnet_dyn_configure_net(hdr); + + case IOC_LIBCFS_DEL_NET: + return lnet_dyn_unconfigure_net(hdr); + + case IOC_LIBCFS_ADD_LOCAL_NI: + return lnet_dyn_configure_ni(hdr); + + case IOC_LIBCFS_DEL_LOCAL_NI: + return lnet_dyn_unconfigure_ni(hdr); + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net 
can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, hdr); + LNetNIFini(); + } + return rc; + } +} + +DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); + +static int __init lnet_init(void) +{ + int rc; + ENTRY; + + mutex_init(&lnet_config_mutex); + + rc = lnet_lib_init(); + if (rc != 0) { + CERROR("lnet_lib_init: error %d\n", rc); + RETURN(rc); + } + + rc = libcfs_register_ioctl(&lnet_ioctl_handler); + LASSERT(rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void)kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + RETURN(0); +} + +static void __exit lnet_exit(void) +{ + int rc; + + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + LASSERT(rc == 0); + + lnet_lib_exit(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Networking layer"); +MODULE_VERSION(LNET_VERSION); +MODULE_LICENSE("GPL"); + +module_init(lnet_init); +module_exit(lnet_exit); diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c new file mode 100644 index 0000000000000..c43f8fe2c176e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -0,0 +1,1040 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/net_fault.c + * + * Lustre network fault simulation + * + * Author: liang.zhen@intel.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#define LNET_MSG_MASK (LNET_PUT_BIT | LNET_ACK_BIT | \ + LNET_GET_BIT | LNET_REPLY_BIT) + +struct lnet_drop_rule { + /** link chain on the_lnet.ln_drop_rules */ + struct list_head dr_link; + /** attributes of this rule */ + struct lnet_fault_attr dr_attr; + /** lock to protect \a dr_drop_at and \a dr_stat */ + spinlock_t dr_lock; + /** + * the message sequence to drop, which means message is dropped when + * dr_stat.drs_count == dr_drop_at + */ + unsigned long dr_drop_at; + /** + * seconds to drop the next message, it's exclusive with dr_drop_at + */ + cfs_time_t dr_drop_time; + /** baseline to caculate dr_drop_time */ + cfs_time_t dr_time_base; + /** statistic of dropped messages */ + struct lnet_fault_stat dr_stat; +}; + +static bool +lnet_fault_nid_match(lnet_nid_t nid, lnet_nid_t msg_nid) +{ + if (nid == msg_nid || nid == LNET_NID_ANY) + return true; + + if (LNET_NIDNET(nid) != LNET_NIDNET(msg_nid)) + return false; + + /* 255.255.255.255@net is wildcard for all addresses in a network */ + return LNET_NIDADDR(nid) == LNET_NIDADDR(LNET_NID_ANY); +} + +static bool +lnet_fault_attr_match(struct lnet_fault_attr *attr, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal) +{ + if (!lnet_fault_nid_match(attr->fa_src, src) || + !lnet_fault_nid_match(attr->fa_dst, dst)) + return false; + + if (!(attr->fa_msg_mask & (1 << type))) + return false; + + /* NB: ACK and REPLY have no portal, but they should have been + * rejected by message mask */ + if (attr->fa_ptl_mask != 0 && /* has portal filter */ + !(attr->fa_ptl_mask & (1ULL << portal))) + return false; + + return true; +} + +static int +lnet_fault_attr_validate(struct lnet_fault_attr *attr) +{ + if (attr->fa_msg_mask == 0) + attr->fa_msg_mask = LNET_MSG_MASK; /* all message types */ + + if (attr->fa_ptl_mask == 0) /* no portal filter */ + return 0; + + /* NB: only PUT and GET can be filtered if portal filter has been set */ + attr->fa_msg_mask &= LNET_GET_BIT | LNET_PUT_BIT; + if (attr->fa_msg_mask == 0) { + CDEBUG(D_NET, "can't find valid message type bits %x\n", + attr->fa_msg_mask); + return -EINVAL; + } + return 0; +} + +static void +lnet_fault_stat_inc(struct lnet_fault_stat *stat, unsigned int type) +{ + /* NB: fs_counter is NOT updated by this function */ + switch (type) { + case LNET_MSG_PUT: + stat->fs_put++; + return; + case LNET_MSG_ACK: + stat->fs_ack++; + return; + case LNET_MSG_GET: + stat->fs_get++; + return; + case LNET_MSG_REPLY: + stat->fs_reply++; + return; + } +} + +/** + * LNet message drop simulation + */ + +/** + * Add a new drop rule to LNet + * There is no check for duplicated drop rule, all rules will be checked for + * incoming message. 
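+ *
+ * The caller must set exactly one of u.drop.da_rate (drop roughly one of
+ * every da_rate matching messages) or u.drop.da_interval (drop a matching
+ * message roughly every da_interval seconds); setting both, or neither,
+ * is rejected with -EINVAL, as checked below.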
+ */ +static int +lnet_drop_rule_add(struct lnet_fault_attr *attr) +{ + struct lnet_drop_rule *rule; + ENTRY; + + if (!((attr->u.drop.da_rate == 0) ^ (attr->u.drop.da_interval == 0))) { + CDEBUG(D_NET, + "please provide either drop rate or drop interval, " + "but not both at the same time %d/%d\n", + attr->u.drop.da_rate, attr->u.drop.da_interval); + RETURN(-EINVAL); + } + + if (lnet_fault_attr_validate(attr) != 0) + RETURN(-EINVAL); + + CFS_ALLOC_PTR(rule); + if (rule == NULL) + RETURN(-ENOMEM); + + spin_lock_init(&rule->dr_lock); + + rule->dr_attr = *attr; + if (attr->u.drop.da_interval != 0) { + rule->dr_time_base = cfs_time_shift(attr->u.drop.da_interval); + rule->dr_drop_time = cfs_time_shift(cfs_rand() % + attr->u.drop.da_interval); + } else { + rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; + } + + lnet_net_lock(LNET_LOCK_EX); + list_add(&rule->dr_link, &the_lnet.ln_drop_rules); + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "Added drop rule: src %s, dst %s, rate %d, interval %d\n", + libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_src), + attr->u.drop.da_rate, attr->u.drop.da_interval); + RETURN(0); +} + +/** + * Remove matched drop rules from lnet, all rules that can match \a src and + * \a dst will be removed. + * If \a src is zero, then all rules have \a dst as destination will be remove + * If \a dst is zero, then all rules have \a src as source will be removed + * If both of them are zero, all rules will be removed + */ +static int +lnet_drop_rule_del(lnet_nid_t src, lnet_nid_t dst) +{ + struct lnet_drop_rule *rule; + struct lnet_drop_rule *tmp; + struct list_head zombies; + int n = 0; + ENTRY; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry_safe(rule, tmp, &the_lnet.ln_drop_rules, dr_link) { + if (rule->dr_attr.fa_src != src && src != 0) + continue; + + if (rule->dr_attr.fa_dst != dst && dst != 0) + continue; + + list_move(&rule->dr_link, &zombies); + } + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &zombies, dr_link) { + CDEBUG(D_NET, "Remove drop rule: src %s->dst: %s (1/%d, %d)\n", + libcfs_nid2str(rule->dr_attr.fa_src), + libcfs_nid2str(rule->dr_attr.fa_dst), + rule->dr_attr.u.drop.da_rate, + rule->dr_attr.u.drop.da_interval); + + list_del(&rule->dr_link); + CFS_FREE_PTR(rule); + n++; + } + + RETURN(n); +} + +/** + * List drop rule at position of \a pos + */ +static int +lnet_drop_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat) +{ + struct lnet_drop_rule *rule; + int cpt; + int i = 0; + int rc = -ENOENT; + ENTRY; + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dr_lock); + *attr = rule->dr_attr; + *stat = rule->dr_stat; + spin_unlock(&rule->dr_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all drop rules + */ +static void +lnet_drop_rule_reset(void) +{ + struct lnet_drop_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + struct lnet_fault_attr *attr = &rule->dr_attr; + + spin_lock(&rule->dr_lock); + + memset(&rule->dr_stat, 0, sizeof(rule->dr_stat)); + if (attr->u.drop.da_rate != 0) { + rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; + } else { + rule->dr_drop_time = cfs_time_shift(cfs_rand() % + attr->u.drop.da_interval); + rule->dr_time_base = cfs_time_shift(attr->u.drop. 
+ da_interval); + } + spin_unlock(&rule->dr_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +/** + * check source/destination NID, portal, message type and drop rate, + * decide whether should drop this message or not + */ +static bool +drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal) +{ + struct lnet_fault_attr *attr = &rule->dr_attr; + bool drop; + + if (!lnet_fault_attr_match(attr, src, dst, type, portal)) + return false; + + /* match this rule, check drop rate now */ + spin_lock(&rule->dr_lock); + if (rule->dr_drop_time != 0) { /* time based drop */ + cfs_time_t now = cfs_time_current(); + + rule->dr_stat.fs_count++; + drop = cfs_time_aftereq(now, rule->dr_drop_time); + if (drop) { + if (cfs_time_after(now, rule->dr_time_base)) + rule->dr_time_base = now; + + rule->dr_drop_time = rule->dr_time_base + + cfs_time_seconds(cfs_rand() % + attr->u.drop.da_interval); + rule->dr_time_base += cfs_time_seconds(attr->u.drop. + da_interval); + + CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %ld\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), + rule->dr_drop_time); + } + + } else { /* rate based drop */ + __u64 count; + + drop = rule->dr_stat.fs_count++ == rule->dr_drop_at; + count = rule->dr_stat.fs_count; + if (do_div(count, attr->u.drop.da_rate) == 0) { + rule->dr_drop_at = rule->dr_stat.fs_count + + cfs_rand() % attr->u.drop.da_rate; + CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), rule->dr_drop_at); + } + } + + if (drop) { /* drop this message, update counters */ + lnet_fault_stat_inc(&rule->dr_stat, type); + rule->dr_stat.u.drop.ds_dropped++; + } + + spin_unlock(&rule->dr_lock); + return drop; +} + +/** + * Check if message from \a src to \a dst can match any existed drop rule + */ +bool +lnet_drop_rule_match(struct lnet_hdr *hdr) +{ + struct lnet_drop_rule *rule; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + unsigned int ptl = -1; + bool drop = false; + int cpt; + + /* NB: if Portal is specified, then only PUT and GET will be + * filtered by drop rule */ + if (typ == LNET_MSG_PUT) + ptl = le32_to_cpu(hdr->msg.put.ptl_index); + else if (typ == LNET_MSG_GET) + ptl = le32_to_cpu(hdr->msg.get.ptl_index); + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { + drop = drop_rule_match(rule, src, dst, typ, ptl); + if (drop) + break; + } + + lnet_net_unlock(cpt); + return drop; +} + +/** + * LNet Delay Simulation + */ +/** timestamp (second) to send delayed message */ +#define msg_delay_send msg_ev.hdr_data + +struct lnet_delay_rule { + /** link chain on the_lnet.ln_delay_rules */ + struct list_head dl_link; + /** link chain on delay_dd.dd_sched_rules */ + struct list_head dl_sched_link; + /** attributes of this rule */ + struct lnet_fault_attr dl_attr; + /** lock to protect \a below members */ + spinlock_t dl_lock; + /** refcount of delay rule */ + atomic_t dl_refcount; + /** + * the message sequence to delay, which means message is delayed when + * dl_stat.fs_count == dl_delay_at + */ + unsigned long dl_delay_at; + /** + * seconds to delay the next message, it's exclusive with dl_delay_at + */ + cfs_time_t dl_delay_time; + /** baseline to caculate dl_delay_time */ + cfs_time_t dl_time_base; + /** jiffies to send the next delayed message */ + unsigned long dl_msg_send; + /** delayed message list */ + 
struct list_head dl_msg_list; + /** statistic of delayed messages */ + struct lnet_fault_stat dl_stat; + /** timer to wakeup delay_daemon */ + struct timer_list dl_timer; +}; + +struct delay_daemon_data { + /** serialise rule add/remove */ + struct mutex dd_mutex; + /** protect rules on \a dd_sched_rules */ + spinlock_t dd_lock; + /** scheduled delay rules (by timer) */ + struct list_head dd_sched_rules; + /** deamon thread sleeps at here */ + wait_queue_head_t dd_waitq; + /** controler (lctl command) wait at here */ + wait_queue_head_t dd_ctl_waitq; + /** deamon is running */ + unsigned int dd_running; + /** deamon stopped */ + unsigned int dd_stopped; +}; + +static struct delay_daemon_data delay_dd; + +static cfs_time_t +round_timeout(cfs_time_t timeout) +{ + return cfs_time_seconds((unsigned int) + cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + +static void +delay_rule_decref(struct lnet_delay_rule *rule) +{ + if (atomic_dec_and_test(&rule->dl_refcount)) { + LASSERT(list_empty(&rule->dl_sched_link)); + LASSERT(list_empty(&rule->dl_msg_list)); + LASSERT(list_empty(&rule->dl_link)); + + CFS_FREE_PTR(rule); + } +} + +/** + * check source/destination NID, portal, message type and delay rate, + * decide whether should delay this message or not + */ +static bool +delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, + lnet_nid_t dst, unsigned int type, unsigned int portal, + struct lnet_msg *msg) +{ + struct lnet_fault_attr *attr = &rule->dl_attr; + bool delay; + + if (!lnet_fault_attr_match(attr, src, dst, type, portal)) + return false; + + /* match this rule, check delay rate now */ + spin_lock(&rule->dl_lock); + if (rule->dl_delay_time != 0) { /* time based delay */ + cfs_time_t now = cfs_time_current(); + + rule->dl_stat.fs_count++; + delay = cfs_time_aftereq(now, rule->dl_delay_time); + if (delay) { + if (cfs_time_after(now, rule->dl_time_base)) + rule->dl_time_base = now; + + rule->dl_delay_time = rule->dl_time_base + + cfs_time_seconds(cfs_rand() % + attr->u.delay.la_interval); + rule->dl_time_base += cfs_time_seconds(attr->u.delay. + la_interval); + + CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %ld\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), + rule->dl_delay_time); + } + + } else { /* rate based delay */ + __u64 count; + + delay = rule->dl_stat.fs_count++ == rule->dl_delay_at; + /* generate the next random rate sequence */ + count = rule->dl_stat.fs_count; + if (do_div(count, attr->u.delay.la_rate) == 0) { + rule->dl_delay_at = rule->dl_stat.fs_count + + cfs_rand() % attr->u.delay.la_rate; + CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n", + libcfs_nid2str(attr->fa_src), + libcfs_nid2str(attr->fa_dst), rule->dl_delay_at); + } + } + + if (!delay) { + spin_unlock(&rule->dl_lock); + return false; + } + + /* delay this message, update counters */ + lnet_fault_stat_inc(&rule->dl_stat, type); + rule->dl_stat.u.delay.ls_delayed++; + + list_add_tail(&msg->msg_list, &rule->dl_msg_list); + msg->msg_delay_send = round_timeout( + cfs_time_shift(attr->u.delay.la_latency)); + if (rule->dl_msg_send == -1) { + rule->dl_msg_send = msg->msg_delay_send; + mod_timer(&rule->dl_timer, rule->dl_msg_send); + } + + spin_unlock(&rule->dl_lock); + return true; +} + +/** + * check if \a msg can match any Delay Rule, receiving of this message + * will be delayed if there is a match. 
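+ *
+ * NB: the caller must hold lnet_net_lock.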
+ */ +bool +lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg) +{ + struct lnet_delay_rule *rule; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + unsigned int ptl = -1; + + /* NB: called with hold of lnet_net_lock */ + + /* NB: if Portal is specified, then only PUT and GET will be + * filtered by delay rule */ + if (typ == LNET_MSG_PUT) + ptl = le32_to_cpu(hdr->msg.put.ptl_index); + else if (typ == LNET_MSG_GET) + ptl = le32_to_cpu(hdr->msg.get.ptl_index); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (delay_rule_match(rule, src, dst, typ, ptl, msg)) + return true; + } + + return false; +} + +/** check out delayed messages for send */ +static void +delayed_msg_check(struct lnet_delay_rule *rule, bool all, + struct list_head *msg_list) +{ + struct lnet_msg *msg; + struct lnet_msg *tmp; + unsigned long now = cfs_time_current(); + + if (!all && rule->dl_msg_send > now) + return; + + spin_lock(&rule->dl_lock); + list_for_each_entry_safe(msg, tmp, &rule->dl_msg_list, msg_list) { + if (!all && msg->msg_delay_send > now) + break; + + msg->msg_delay_send = 0; + list_move_tail(&msg->msg_list, msg_list); + } + + if (list_empty(&rule->dl_msg_list)) { + del_timer(&rule->dl_timer); + rule->dl_msg_send = -1; + + } else if (!list_empty(msg_list)) { + /* dequeued some timedout messages, update timer for the + * next delayed message on rule */ + msg = list_entry(rule->dl_msg_list.next, + struct lnet_msg, msg_list); + rule->dl_msg_send = msg->msg_delay_send; + mod_timer(&rule->dl_timer, rule->dl_msg_send); + } + spin_unlock(&rule->dl_lock); +} + +static void +delayed_msg_process(struct list_head *msg_list, bool drop) +{ + struct lnet_msg *msg; + + while (!list_empty(msg_list)) { + struct lnet_ni *ni; + int cpt; + int rc; + + msg = list_entry(msg_list->next, struct lnet_msg, msg_list); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); + + ni = msg->msg_rxni; + cpt = msg->msg_rx_cpt; + + list_del_init(&msg->msg_list); + if (drop) { + rc = -ECANCELED; + + } else if (!msg->msg_routing) { + rc = lnet_parse_local(ni, msg); + if (rc == 0) + continue; + + } else { + lnet_net_lock(cpt); + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + switch (rc) { + case LNET_CREDIT_OK: + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, msg->msg_len, msg->msg_len); + case LNET_CREDIT_WAIT: + continue; + default: /* failures */ + break; + } + } + + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len); + lnet_finalize(msg, rc); + } +} + +/** + * Process delayed messages for scheduled rules + * This function can either be called by delay_rule_daemon, or by lnet_finalise + */ +void +lnet_delay_rule_check(void) +{ + struct lnet_delay_rule *rule; + struct list_head msgs; + + INIT_LIST_HEAD(&msgs); + while (1) { + if (list_empty(&delay_dd.dd_sched_rules)) + break; + + spin_lock_bh(&delay_dd.dd_lock); + if (list_empty(&delay_dd.dd_sched_rules)) { + spin_unlock_bh(&delay_dd.dd_lock); + break; + } + + rule = list_entry(delay_dd.dd_sched_rules.next, + struct lnet_delay_rule, dl_sched_link); + list_del_init(&rule->dl_sched_link); + spin_unlock_bh(&delay_dd.dd_lock); + + delayed_msg_check(rule, false, &msgs); + delay_rule_decref(rule); /* -1 for delay_dd.dd_sched_rules */ + } + + if (!list_empty(&msgs)) + delayed_msg_process(&msgs, false); +} + +/** deamon thread to handle delayed messages */ +static int +lnet_delay_rule_daemon(void *arg) +{ + 
delay_dd.dd_running = 1; + wake_up(&delay_dd.dd_ctl_waitq); + + while (delay_dd.dd_running) { + wait_event_interruptible(delay_dd.dd_waitq, + !delay_dd.dd_running || + !list_empty(&delay_dd.dd_sched_rules)); + lnet_delay_rule_check(); + } + + /* in case more rules have been enqueued after my last check */ + lnet_delay_rule_check(); + delay_dd.dd_stopped = 1; + wake_up(&delay_dd.dd_ctl_waitq); + + return 0; +} + +static void +delay_timer_cb(unsigned long arg) +{ + struct lnet_delay_rule *rule = (struct lnet_delay_rule *)arg; + + spin_lock_bh(&delay_dd.dd_lock); + if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) { + atomic_inc(&rule->dl_refcount); + list_add_tail(&rule->dl_sched_link, &delay_dd.dd_sched_rules); + wake_up(&delay_dd.dd_waitq); + } + spin_unlock_bh(&delay_dd.dd_lock); +} + +/** + * Add a new delay rule to LNet + * There is no check for duplicated delay rule, all rules will be checked for + * incoming message. + */ +int +lnet_delay_rule_add(struct lnet_fault_attr *attr) +{ + struct lnet_delay_rule *rule; + int rc = 0; + ENTRY; + + if (!((attr->u.delay.la_rate == 0) ^ + (attr->u.delay.la_interval == 0))) { + CDEBUG(D_NET, + "please provide either delay rate or delay interval, " + "but not both at the same time %d/%d\n", + attr->u.delay.la_rate, attr->u.delay.la_interval); + RETURN(-EINVAL); + } + + if (attr->u.delay.la_latency == 0) { + CDEBUG(D_NET, "delay latency cannot be zero\n"); + RETURN(-EINVAL); + } + + if (lnet_fault_attr_validate(attr) != 0) + RETURN(-EINVAL); + + CFS_ALLOC_PTR(rule); + if (rule == NULL) + RETURN(-ENOMEM); + + mutex_lock(&delay_dd.dd_mutex); + if (!delay_dd.dd_running) { + struct task_struct *task; + + /* NB: although LND threads will process delayed message + * in lnet_finalize, but there is no guarantee that LND + * threads will be waken up if no other message needs to + * be handled. + * Only one daemon thread, performance is not the concern + * of this simualation module. + */ + task = kthread_run(lnet_delay_rule_daemon, NULL, "lnet_dd"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + GOTO(failed, rc); + } + wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running); + } + + init_timer(&rule->dl_timer); + rule->dl_timer.function = delay_timer_cb; + rule->dl_timer.data = (unsigned long)rule; + + spin_lock_init(&rule->dl_lock); + INIT_LIST_HEAD(&rule->dl_msg_list); + INIT_LIST_HEAD(&rule->dl_sched_link); + + rule->dl_attr = *attr; + if (attr->u.delay.la_interval != 0) { + rule->dl_time_base = cfs_time_shift(attr->u.delay.la_interval); + rule->dl_delay_time = cfs_time_shift(cfs_rand() % + attr->u.delay.la_interval); + } else { + rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; + } + + rule->dl_msg_send = -1; + + lnet_net_lock(LNET_LOCK_EX); + atomic_set(&rule->dl_refcount, 1); + list_add(&rule->dl_link, &the_lnet.ln_delay_rules); + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "Added delay rule: src %s, dst %s, rate %d\n", + libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_src), + attr->u.delay.la_rate); + + mutex_unlock(&delay_dd.dd_mutex); + RETURN(0); + failed: + mutex_unlock(&delay_dd.dd_mutex); + CFS_FREE_PTR(rule); + return rc; +} + +/** + * Remove matched Delay Rules from lnet, if \a shutdown is true or both \a src + * and \a dst are zero, all rules will be removed, otherwise only matched rules + * will be removed. 
+ * If \a src is zero, then all rules have \a dst as destination will be remove + * If \a dst is zero, then all rules have \a src as source will be removed + * + * When a delay rule is removed, all delayed messages of this rule will be + * processed immediately. + */ +int +lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown) +{ + struct lnet_delay_rule *rule; + struct lnet_delay_rule *tmp; + struct list_head rule_list; + struct list_head msg_list; + int n = 0; + bool cleanup; + ENTRY; + + INIT_LIST_HEAD(&rule_list); + INIT_LIST_HEAD(&msg_list); + + if (shutdown) + src = dst = 0; + + mutex_lock(&delay_dd.dd_mutex); + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &the_lnet.ln_delay_rules, dl_link) { + if (rule->dl_attr.fa_src != src && src != 0) + continue; + + if (rule->dl_attr.fa_dst != dst && dst != 0) + continue; + + CDEBUG(D_NET, "Remove delay rule: src %s->dst: %s (1/%d, %d)\n", + libcfs_nid2str(rule->dl_attr.fa_src), + libcfs_nid2str(rule->dl_attr.fa_dst), + rule->dl_attr.u.delay.la_rate, + rule->dl_attr.u.delay.la_interval); + /* refcount is taken over by rule_list */ + list_move(&rule->dl_link, &rule_list); + } + + /* check if we need to shutdown delay_daemon */ + cleanup = list_empty(&the_lnet.ln_delay_rules) && + !list_empty(&rule_list); + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(rule, tmp, &rule_list, dl_link) { + list_del_init(&rule->dl_link); + + del_timer_sync(&rule->dl_timer); + delayed_msg_check(rule, true, &msg_list); + delay_rule_decref(rule); /* -1 for the_lnet.ln_delay_rules */ + n++; + } + + if (cleanup) { /* no more delay rule, shutdown delay_daemon */ + LASSERT(delay_dd.dd_running); + delay_dd.dd_running = 0; + wake_up(&delay_dd.dd_waitq); + + while (!delay_dd.dd_stopped) + wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_stopped); + } + mutex_unlock(&delay_dd.dd_mutex); + + if (!list_empty(&msg_list)) + delayed_msg_process(&msg_list, shutdown); + + RETURN(n); +} + +/** + * List Delay Rule at position of \a pos + */ +int +lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, + struct lnet_fault_stat *stat) +{ + struct lnet_delay_rule *rule; + int cpt; + int i = 0; + int rc = -ENOENT; + ENTRY; + + cpt = lnet_net_lock_current(); + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + if (i++ < pos) + continue; + + spin_lock(&rule->dl_lock); + *attr = rule->dl_attr; + *stat = rule->dl_stat; + spin_unlock(&rule->dl_lock); + rc = 0; + break; + } + + lnet_net_unlock(cpt); + RETURN(rc); +} + +/** + * reset counters for all Delay Rules + */ +void +lnet_delay_rule_reset(void) +{ + struct lnet_delay_rule *rule; + int cpt; + ENTRY; + + cpt = lnet_net_lock_current(); + + list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { + struct lnet_fault_attr *attr = &rule->dl_attr; + + spin_lock(&rule->dl_lock); + + memset(&rule->dl_stat, 0, sizeof(rule->dl_stat)); + if (attr->u.delay.la_rate != 0) { + rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; + } else { + rule->dl_delay_time = cfs_time_shift(cfs_rand() % + attr->u.delay.la_interval); + rule->dl_time_base = cfs_time_shift(attr->u.delay. 
+ la_interval); + } + spin_unlock(&rule->dl_lock); + } + + lnet_net_unlock(cpt); + EXIT; +} + +int +lnet_fault_ctl(int opc, struct libcfs_ioctl_data *data) +{ + struct lnet_fault_attr *attr; + struct lnet_fault_stat *stat; + + attr = (struct lnet_fault_attr *)data->ioc_inlbuf1; + + switch (opc) { + default: + return -EINVAL; + + case LNET_CTL_DROP_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_drop_rule_add(attr); + + case LNET_CTL_DROP_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_drop_rule_del(attr->fa_src, + attr->fa_dst); + return 0; + + case LNET_CTL_DROP_RESET: + lnet_drop_rule_reset(); + return 0; + + case LNET_CTL_DROP_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_drop_rule_list(data->ioc_count, attr, stat); + + case LNET_CTL_DELAY_ADD: + if (attr == NULL) + return -EINVAL; + + return lnet_delay_rule_add(attr); + + case LNET_CTL_DELAY_DEL: + if (attr == NULL) + return -EINVAL; + + data->ioc_count = lnet_delay_rule_del(attr->fa_src, + attr->fa_dst, false); + return 0; + + case LNET_CTL_DELAY_RESET: + lnet_delay_rule_reset(); + return 0; + + case LNET_CTL_DELAY_LIST: + stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; + if (attr == NULL || stat == NULL) + return -EINVAL; + + return lnet_delay_rule_list(data->ioc_count, attr, stat); + } +} + +int +lnet_fault_init(void) +{ + CLASSERT(LNET_PUT_BIT == 1 << LNET_MSG_PUT); + CLASSERT(LNET_ACK_BIT == 1 << LNET_MSG_ACK); + CLASSERT(LNET_GET_BIT == 1 << LNET_MSG_GET); + CLASSERT(LNET_REPLY_BIT == 1 << LNET_MSG_REPLY); + + mutex_init(&delay_dd.dd_mutex); + spin_lock_init(&delay_dd.dd_lock); + init_waitqueue_head(&delay_dd.dd_waitq); + init_waitqueue_head(&delay_dd.dd_ctl_waitq); + INIT_LIST_HEAD(&delay_dd.dd_sched_rules); + + return 0; +} + +void +lnet_fault_fini(void) +{ + lnet_drop_rule_del(0, 0); + lnet_delay_rule_del(0, 0, true); + + LASSERT(list_empty(&the_lnet.ln_drop_rules)); + LASSERT(list_empty(&the_lnet.ln_delay_rules)); + LASSERT(list_empty(&delay_dd.dd_sched_rules)); +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c new file mode 100644 index 0000000000000..5122a2e6b5d81 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -0,0 +1,1200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +#define IPSTRING_LENGTH 16 + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx; + +static DEFINE_SPINLOCK(libcfs_nidstring_lock); + +static struct netstrfns *libcfs_namenum2netstrfns(const char *name); + +char * +libcfs_next_nidstring(void) +{ + char *str; + unsigned long flags; + + spin_lock_irqsave(&libcfs_nidstring_lock, flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings)) + libcfs_nidstring_idx = 0; + + spin_unlock_irqrestore(&libcfs_nidstring_lock, flags); + return str; +} +EXPORT_SYMBOL(libcfs_next_nidstring); + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 0 if \a src parses to '*' | \ | \ + * \retval -errno otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 0; + } + + LIBCFS_ALLOC(addrrange, sizeof(struct addrrange)); + if (addrrange == NULL) + return -ENOMEM; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. 
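+ * For example, a \a src of "tcp5" selects the "tcp" netstrfns entry with
+ * net number 5, while a bare "tcp" is treated as net number 0 (the
+ * "network name only" case handled below).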
+ * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. "elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + LIBCFS_ALLOC(nr, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct nidrange *nr; + + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; +failed: + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + LIBCFS_FREE(ar, sizeof(struct addrrange)); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + LIBCFS_FREE(nr, sizeof(struct nidrange)); + } +} +EXPORT_SYMBOL(cfs_free_nidlist); + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. 
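+ *
+ * Illustrative sketch of typical usage, assuming a caller-supplied
+ * lnet_nid_t \a nid and an example NID string:
+ *
+ *	LIST_HEAD(nl);
+ *	char buf[] = "192.168.0.[2,10]@tcp 10.10.1.1@o2ib1";
+ *	int matched = 0;
+ *
+ *	if (cfs_parse_nidlist(buf, strlen(buf), &nl)) {
+ *		matched = cfs_match_nid(nid, &nl);
+ *		cfs_free_nidlist(&nl);
+ *	}
+ *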
+ * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + return 0; + } + } + return 1; +} +EXPORT_SYMBOL(cfs_parse_nidlist); + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + return 1; + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(cfs_match_nid); + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_network(char *buffer, int count, struct nidrange *nr) +{ + struct netstrfns *nf = nr->nr_netstrfns; + + if (nr->nr_netnum == 0) + return scnprintf(buffer, count, "@%s", nf->nf_name); + else + return scnprintf(buffer, count, "@%s%u", + nf->nf_name, nr->nr_netnum); +} + +/** + * Print a list of addrrange (\a addrranges) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * + * \retval number of characters written + */ +static int +cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges, + struct nidrange *nr) +{ + int i = 0; + struct addrrange *ar; + struct netstrfns *nf = nr->nr_netstrfns; + + list_for_each_entry(ar, addrranges, ar_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + i += nf->nf_print_addrlist(buffer + i, count - i, + &ar->ar_numaddr_ranges); + i += cfs_print_network(buffer + i, count - i, nr); + } + return i; +} + +/** + * Print a list of nidranges (\a nidlist) into the specified \a buffer. + * At max \a count characters can be printed into \a buffer. + * Nidranges are separated by a space character. 
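+ * The output follows the same syntax accepted by cfs_parse_nidlist(), so a
+ * printed list can normally be parsed back into an equivalent nidlist.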
+ * + * \retval number of characters written + */ +int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) +{ + int i = 0; + struct nidrange *nr; + + if (count <= 0) + return 0; + + list_for_each_entry(nr, nidlist, nr_link) { + if (i != 0) + i += scnprintf(buffer + i, count - i, " "); + + if (nr->nr_all != 0) { + LASSERT(list_empty(&nr->nr_addrranges)); + i += scnprintf(buffer + i, count - i, "*"); + i += cfs_print_network(buffer + i, count - i, nr); + } else { + i += cfs_print_addrranges(buffer + i, count - i, + &nr->nr_addrranges, nr); + } + } + return i; +} +EXPORT_SYMBOL(cfs_print_nidlist); + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *range; + unsigned int min_ip[4] = {0}; + unsigned int max_ip[4] = {0}; + int cur_octet = 0; + bool expect_full_octet = false; + + list_for_each_entry(expr_list, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(range, &expr_list->el_exprs, re_link) { + /* XXX: add support for multiple & non-contig. re's */ + if (re_count > 0) + return -EINVAL; + + /* if a previous octet was ranged, then all remaining + * octets must be full for contiguous range */ + if (expect_full_octet && (range->re_lo != 0 || + range->re_hi != 255)) + return -ERANGE; + + if (range->re_stride != 1) + return -ERANGE; + + if (range->re_lo > range->re_hi) + return -EINVAL; + + if (range->re_lo != range->re_hi) + expect_full_octet = true; + + min_ip[cur_octet] = range->re_lo; + max_ip[cur_octet] = range->re_hi; + + re_count++; + } + + cur_octet++; + } + + if (min_nid != NULL) + *min_nid = ((min_ip[0] << 24) | (min_ip[1] << 16) | + (min_ip[2] << 8) | min_ip[3]); + + if (max_nid != NULL) + *max_nid = ((max_ip[0] << 24) | (max_ip[1] << 16) | + (max_ip[2] << 8) | max_ip[3]); + + return 0; +} + +/** + * Determines minimum and maximum addresses for a single + * numeric address range + * + * \param ar + * \param[out] *min_nid __u32 representation of min NID + * \param[out] *max_nid __u32 representation of max NID + * \retval -EINVAL unsupported LNET range + */ +static int cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, + __u32 *max_nid) +{ + struct cfs_expr_list *el; + struct cfs_range_expr *re; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + + list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { + int re_count = 0; + + list_for_each_entry(re, &el->el_exprs, re_link) { + if (re_count > 0) + return -EINVAL; + if (re->re_lo > re->re_hi) + return -EINVAL; + + if (re->re_lo < min_addr || min_addr == 0) + min_addr = re->re_lo; + if (re->re_hi > max_addr) + max_addr = re->re_hi; + + re_count++; + } + } + + if (min_nid != NULL) + *min_nid = min_addr; + if (max_nid != NULL) + *max_nid = max_addr; + + return 0; +} + +/** + * Takes a linked list of nidrange expressions, determines the minimum + * and maximum nid and creates appropriate nid structures + * + * \param[out] *min_nid string representation of min NID + * \param[out] *max_nid string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +int cfs_nidrange_find_min_max(struct list_head *nidlist, char 
*min_nid, + char *max_nid, size_t nidstr_length) +{ + struct nidrange *first_nidrange; + int netnum; + struct netstrfns *nf; + char *lndname; + __u32 min_addr; + __u32 max_addr; + char min_addr_str[IPSTRING_LENGTH]; + char max_addr_str[IPSTRING_LENGTH]; + int rc; + + first_nidrange = list_entry(nidlist->next, struct nidrange, nr_link); + + netnum = first_nidrange->nr_netnum; + nf = first_nidrange->nr_netstrfns; + lndname = nf->nf_name; + + rc = nf->nf_min_max(nidlist, &min_addr, &max_addr); + if (rc < 0) + return rc; + + nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str)); + nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str)); + + snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname, + netnum); + snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname, + netnum); + + return 0; +} +EXPORT_SYMBOL(cfs_nidrange_find_min_max); + +/** + * Determines the min and max NID values for num LNDs + * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + unsigned int tmp_min_addr = 0; + unsigned int tmp_max_addr = 0; + unsigned int min_addr = 0; + unsigned int max_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_num_ar_min_max(ar, &tmp_min_addr, + &tmp_max_addr); + if (rc < 0) + return rc; + + if (tmp_min_addr < min_addr || min_addr == 0) + min_addr = tmp_min_addr; + if (tmp_max_addr > max_addr) + max_addr = tmp_min_addr; + } + } + if (max_nid != NULL) + *max_nid = max_addr; + if (min_nid != NULL) + *min_nid = min_addr; + + return 0; +} + +/** + * Takes an nidlist and determines the minimum and maximum + * ip addresses. 
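+ * For example, the range "10.0.[2-5].[0-255]" yields a minimum of 10.0.2.0
+ * and a maximum of 10.0.5.255; once an octet is ranged, any non-full
+ * trailing octet makes cfs_ip_ar_min_max() return -ERANGE.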
+ * + * \param *nidlist + * \param[out] *min_nid if provided, returns string representation of min NID + * \param[out] *max_nid if provided, returns string representation of max NID + * \retval -EINVAL unsupported LNET range + * \retval -ERANGE non-contiguous LNET range + */ +static int cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid, + __u32 *max_nid) +{ + struct nidrange *nr; + struct addrrange *ar; + __u32 tmp_min_ip_addr = 0; + __u32 tmp_max_ip_addr = 0; + __u32 min_ip_addr = 0; + __u32 max_ip_addr = 0; + int nidlist_count = 0; + int rc; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nidlist_count > 0) + return -EINVAL; + + if (nr->nr_all) { + min_ip_addr = 0; + max_ip_addr = 0xffffffff; + break; + } + + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { + rc = cfs_ip_ar_min_max(ar, &tmp_min_ip_addr, + &tmp_max_ip_addr); + if (rc < 0) + return rc; + + if (tmp_min_ip_addr < min_ip_addr || min_ip_addr == 0) + min_ip_addr = tmp_min_ip_addr; + if (tmp_max_ip_addr > max_ip_addr) + max_ip_addr = tmp_max_ip_addr; + } + + nidlist_count++; + } + + if (max_nid != NULL) + *max_nid = max_ip_addr; + if (min_nid != NULL) + *min_nid = min_ip_addr; + + return 0; +} + +static int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +static void +libcfs_ip_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ +static int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + return 0; +} + +/* Used by lnet/config.c so it can't be static */ +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; +out: + cfs_expr_list_free_list(list); + + return rc; +} + +static int +libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 4); + if (i != 0) + i += scnprintf(buffer + i, count - i, "."); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/** + * Matches address (\a addr) against address set encoded in \a list. 
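+ * \a list is expected to hold one cfs_expr_list per octet, as built by
+ * cfs_ip_addr_parse(); the address matches only if all four octets match
+ * their corresponding expressions.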
+ * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} + +/** + * Print the network part of the nidrange \a nr into the specified \a buffer. + * + * \retval number of characters written + */ +static void +libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) +{ + snprintf(str, size, "%u", addr); +} + +static int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +static int +libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) +{ + int i = 0, j = 0; + struct cfs_expr_list *el; + + list_for_each_entry(el, list, el_link) { + LASSERT(j++ < 1); + i += cfs_expr_list_print(buffer + i, count - i, el); + } + return i; +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +static struct netstrfns libcfs_netstrfns[] = { + { .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max }, + { .nf_type = SOCKLND, + .nf_name = "tcp", + .nf_modname = "ksocklnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max }, + { .nf_type = O2IBLND, + .nf_name = "o2ib", + .nf_modname = "ko2iblnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max }, + { .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max }, + { .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = 
libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match, + .nf_min_max = cfs_ip_min_max }, + { .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match, + .nf_min_max = cfs_num_min_max}, +}; + +static const size_t libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); + +static struct netstrfns * +libcfs_lnd2netstrfns(__u32 lnd) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +static struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +static struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (!strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(__u32 lnd) +{ + return libcfs_lnd2netstrfns(lnd) != NULL; +} +EXPORT_SYMBOL(libcfs_isknown_lnd); + +char * +libcfs_lnd2modname(__u32 lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? NULL : nf->nf_modname; +} +EXPORT_SYMBOL(libcfs_lnd2modname); + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -ENXIO; +} +EXPORT_SYMBOL(libcfs_str2lnd); + +char * +libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) +{ + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "?%u?", lnd); + else + snprintf(buf, buf_size, "%s", nf->nf_name); + + return buf; +} +EXPORT_SYMBOL(libcfs_lnd2str_r); + +char * +libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) +{ + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) + snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); + else if (nnum == 0) + snprintf(buf, buf_size, "%s", nf->nf_name); + else + snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); + + return buf; +} +EXPORT_SYMBOL(libcfs_net2str_r); + +char * +libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + __u32 nnum = LNET_NETNUM(net); + __u32 lnd = LNET_NETTYP(net); + struct netstrfns *nf; + + if (nid == LNET_NID_ANY) { + strncpy(buf, "", buf_size); + buf[buf_size - 1] = '\0'; + return buf; + } + + nf = libcfs_lnd2netstrfns(lnd); + if (nf == NULL) { + snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); + } else { + size_t addr_len; + + nf->nf_addr2str(addr, buf, buf_size); + addr_len = strlen(buf); + if (nnum == 0) + snprintf(buf + addr_len, buf_size - addr_len, "@%s", + nf->nf_name); + else + snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", + nf->nf_name, nnum); + } + + return buf; +} +EXPORT_SYMBOL(libcfs_nid2str_r); + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf = NULL; + int nob; + unsigned int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == 
libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} +EXPORT_SYMBOL(libcfs_str2net); + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT(nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} +EXPORT_SYMBOL(libcfs_str2nid); + +char * +libcfs_id2str(struct lnet_process_id id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} +EXPORT_SYMBOL(libcfs_id2str); + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} +EXPORT_SYMBOL(libcfs_str2anynid); diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c new file mode 100644 index 0000000000000..612af87d47692 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -0,0 +1,1224 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +static void +lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) +{ + if (!list_empty(&lpni->lpni_on_remote_peer_ni_list)) { + list_del_init(&lpni->lpni_on_remote_peer_ni_list); + lnet_peer_ni_decref_locked(lpni); + } +} + +void +lnet_peer_net_added(struct lnet_net *net) +{ + struct lnet_peer_ni *lpni, *tmp; + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) { + + if (LNET_NIDNET(lpni->lpni_nid) == net->net_id) { + lpni->lpni_net = net; + + spin_lock(&lpni->lpni_lock); + lpni->lpni_txcredits = + lpni->lpni_net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = + lnet_peer_buffer_credits(lpni->lpni_net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + spin_unlock(&lpni->lpni_lock); + + lnet_peer_remove_from_remote_list(lpni); + } + } +} + +static void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (!the_lnet.ln_peer_tables) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (!hash) /* not intialized */ + break; + + LASSERT(list_empty(&ptable->pt_zombie_list)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + spin_lock_init(&ptable->pt_zombie_lock); + INIT_LIST_HEAD(&ptable->pt_zombie_list); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +static struct lnet_peer_ni * +lnet_peer_ni_alloc(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_net *net; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + LIBCFS_CPT_ALLOC(lpni, lnet_cpt_table(), cpt, sizeof(*lpni)); + if (!lpni) + return NULL; + + INIT_LIST_HEAD(&lpni->lpni_txq); + INIT_LIST_HEAD(&lpni->lpni_rtrq); + INIT_LIST_HEAD(&lpni->lpni_routes); + INIT_LIST_HEAD(&lpni->lpni_hashlist); + INIT_LIST_HEAD(&lpni->lpni_on_peer_net_list); + INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + + spin_lock_init(&lpni->lpni_lock); + + lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ + lpni->lpni_last_alive = cfs_time_current(); /* assumes alive */ + lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; + lpni->lpni_nid = nid; + lpni->lpni_cpt = cpt; + lnet_set_peer_ni_health_locked(lpni, true); + + net = lnet_get_net_locked(LNET_NIDNET(nid)); + lpni->lpni_net = net; + if (net) { + lpni->lpni_txcredits = net->net_tunables.lct_peer_tx_credits; + lpni->lpni_mintxcredits = lpni->lpni_txcredits; + lpni->lpni_rtrcredits = lnet_peer_buffer_credits(net); + lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; + } else { + /* + * This peer_ni is not on a local network, so we + * cannot add the credits here. In case the net is + * added later, add the peer_ni to the remote peer ni + * list so it can be easily found and revisited. + */ + /* FIXME: per-net implementation instead? */ + atomic_inc(&lpni->lpni_refcount); + list_add_tail(&lpni->lpni_on_remote_peer_ni_list, + &the_lnet.ln_remote_peer_ni_list); + } + + /* TODO: update flags */ + + return lpni; +} + +static struct lnet_peer_net * +lnet_peer_net_alloc(__u32 net_id) +{ + struct lnet_peer_net *lpn; + + LIBCFS_CPT_ALLOC(lpn, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lpn)); + if (!lpn) + return NULL; + + INIT_LIST_HEAD(&lpn->lpn_on_peer_list); + INIT_LIST_HEAD(&lpn->lpn_peer_nis); + lpn->lpn_net_id = net_id; + + return lpn; +} + +static struct lnet_peer * +lnet_peer_alloc(lnet_nid_t nid) +{ + struct lnet_peer *lp; + + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lp)); + if (!lp) + return NULL; + + INIT_LIST_HEAD(&lp->lp_on_lnet_peer_list); + INIT_LIST_HEAD(&lp->lp_peer_nets); + lp->lp_primary_nid = nid; + + /* TODO: update flags */ + + return lp; +} + + +static void +lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_net *peer_net; + struct lnet_peer *peer; + + /* TODO: could the below situation happen? accessing an already + * destroyed peer? */ + if (lpni->lpni_peer_net == NULL || + lpni->lpni_peer_net->lpn_peer == NULL) + return; + + peer_net = lpni->lpni_peer_net; + peer = lpni->lpni_peer_net->lpn_peer; + + list_del_init(&lpni->lpni_on_peer_net_list); + lpni->lpni_peer_net = NULL; + + /* if peer_net is empty, then remove it from the peer */ + if (list_empty(&peer_net->lpn_peer_nis)) { + list_del_init(&peer_net->lpn_on_peer_list); + peer_net->lpn_peer = NULL; + LIBCFS_FREE(peer_net, sizeof(*peer_net)); + + /* if the peer is empty then remove it from the + * the_lnet.ln_peers */ + if (list_empty(&peer->lp_peer_nets)) { + list_del_init(&peer->lp_on_lnet_peer_list); + LIBCFS_FREE(peer, sizeof(*peer)); + } + } +} + +/* called with lnet_net_lock LNET_LOCK_EX held */ +static int +lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable = NULL; + + /* don't remove a peer_ni if it's also a gateway */ + if (lpni->lpni_rtr_refcount > 0) { + CERROR("Peer NI %s is a gateway. Can not delete it\n", + libcfs_nid2str(lpni->lpni_nid)); + return -EBUSY; + } + + lnet_peer_remove_from_remote_list(lpni); + + /* remove peer ni from the hash list. */ + list_del_init(&lpni->lpni_hashlist); + + /* decrement the ref count on the peer table */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + LASSERT(atomic_read(&ptable->pt_number) > 0); + atomic_dec(&ptable->pt_number); + + /* + * The peer_ni can no longer be found with a lookup. But there + * can be current users, so keep track of it on the zombie + * list until the reference count has gone to zero. 
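+ * lnet_destroy_peer_ni_locked() takes the entry back off the zombie list
+ * once the last reference is dropped, and lnet_peer_ni_finalize_wait()
+ * waits for pt_zombies to drain during lnet_peer_tables_cleanup().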
+ * + * The last reference may be lost in a place where the + * lnet_net_lock locks only a single cpt, and that cpt may not + * be lpni->lpni_cpt. So the zombie list of this peer_table + * has its own lock. + */ + spin_lock(&ptable->pt_zombie_lock); + list_add(&lpni->lpni_hashlist, &ptable->pt_zombie_list); + ptable->pt_zombies++; + spin_unlock(&ptable->pt_zombie_lock); + + /* no need to keep this peer on the hierarchy anymore */ + lnet_try_destroy_peer_hierarchy_locked(lpni); + + /* decrement reference on peer */ + lnet_peer_ni_decref_locked(lpni); + + return 0; +} + +void lnet_peer_uninit(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + /* remove all peer_nis from the remote peer and the hash list */ + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, + lpni_on_remote_peer_ni_list) + lnet_peer_ni_del_locked(lpni); + + lnet_peer_tables_destroy(); + + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_peer_del_locked(struct lnet_peer *peer) +{ + struct lnet_peer_ni *lpni = NULL, *lpni2; + int rc = 0, rc2 = 0; + + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + while (lpni != NULL) { + lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); + rc = lnet_peer_ni_del_locked(lpni); + if (rc != 0) + rc2 = rc; + lpni = lpni2; + } + + return rc2; +} + +static void +lnet_peer_table_cleanup_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + int i; + struct lnet_peer_ni *next; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lpni, next, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != NULL && net != lpni->lpni_net) + continue; + + peer = lpni->lpni_peer_net->lpn_peer; + if (peer->lp_primary_nid != lpni->lpni_nid) { + lnet_peer_ni_del_locked(lpni); + continue; + } + /* + * Removing the primary NID implies removing + * the entire peer. Advance next beyond any + * peer_ni that belongs to the same peer. + */ + list_for_each_entry_from(next, &ptable->pt_hash[i], + lpni_hashlist) { + if (next->lpni_peer_net->lpn_peer != peer) + break; + } + lnet_peer_del_locked(peer); + } + } +} + +static void +lnet_peer_ni_finalize_wait(struct lnet_peer_table *ptable) +{ + int i = 3; + + spin_lock(&ptable->pt_zombie_lock); + while (ptable->pt_zombies) { + spin_unlock(&ptable->pt_zombie_lock); + + if (is_power_of_2(i)) { + CDEBUG(D_WARNING, + "Waiting for %d zombies on peer table\n", + ptable->pt_zombies); + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) >> 1); + spin_lock(&ptable->pt_zombie_lock); + } + spin_unlock(&ptable->pt_zombie_lock); +} + +static void +lnet_peer_table_del_rtrs_locked(struct lnet_net *net, + struct lnet_peer_table *ptable) +{ + struct lnet_peer_ni *lp; + struct lnet_peer_ni *tmp; + lnet_nid_t lpni_nid; + int i; + + for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { + list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i], + lpni_hashlist) { + if (net != lp->lpni_net) + continue; + + if (lp->lpni_rtr_refcount == 0) + continue; + + lpni_nid = lp->lpni_nid; + + lnet_net_unlock(LNET_LOCK_EX); + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), lpni_nid); + lnet_net_lock(LNET_LOCK_EX); + } + } +} + +void +lnet_peer_tables_cleanup(struct lnet_net *net) +{ + int i; + struct lnet_peer_table *ptable; + + LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); + /* If just deleting the peers for a NI, get rid of any routes these + * peers are gateways for. 
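+	 * Routes must go first: lnet_peer_ni_del_locked() refuses to delete
+	 * a peer_ni that still has a router reference (lpni_rtr_refcount > 0).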
*/ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_del_rtrs_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + /* Start the cleanup process */ + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_table_cleanup_locked(net, ptable); + lnet_net_unlock(LNET_LOCK_EX); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) + lnet_peer_ni_finalize_wait(ptable); +} + +static struct lnet_peer_ni * +lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) +{ + struct list_head *peers; + struct lnet_peer_ni *lp; + + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lpni_hashlist) { + if (lp->lpni_nid == nid) { + lnet_peer_ni_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_find_peer_ni_locked(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_table *ptable; + int cpt; + + cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + ptable = the_lnet.ln_peer_tables[cpt]; + lpni = lnet_get_peer_ni_locked(ptable, nid); + + return lpni; +} + +struct lnet_peer * +lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + + lpni = lnet_find_peer_ni_locked(dst_nid); + if (!lpni) { + lpni = lnet_nid2peerni_locked(dst_nid, cpt); + if (IS_ERR(lpni)) + return ERR_CAST(lpni); + } + + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(lpni); + + return lp; +} + +struct lnet_peer_ni * +lnet_get_peer_ni_idx_locked(int idx, struct lnet_peer_net **lpn, + struct lnet_peer **lp) +{ + struct lnet_peer_ni *lpni; + + list_for_each_entry((*lp), &the_lnet.ln_peers, lp_on_lnet_peer_list) { + list_for_each_entry((*lpn), &((*lp)->lp_peer_nets), lpn_on_peer_list) { + list_for_each_entry(lpni, &((*lpn)->lpn_peer_nis), + lpni_on_peer_net_list) + if (idx-- == 0) + return lpni; + } + } + + return NULL; +} + +struct lnet_peer_ni * +lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_net *net = peer_net; + + if (!prev) { + if (!net) + net = list_entry(peer->lp_peer_nets.next, + struct lnet_peer_net, + lpn_on_peer_list); + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_on_peer_net_list); + + return lpni; + } + + if (prev->lpni_on_peer_net_list.next == + &prev->lpni_peer_net->lpn_peer_nis) { + /* + * if you reached the end of the peer ni list and the peer + * net is specified then there are no more peer nis in that + * net. + */ + if (net) + return NULL; + + /* + * we reached the end of this net ni list. move to the + * next net + */ + if (prev->lpni_peer_net->lpn_on_peer_list.next == + &peer->lp_peer_nets) + /* no more nets and no more NIs. 
*/ + return NULL; + + /* get the next net */ + net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next, + struct lnet_peer_net, + lpn_on_peer_list); + /* get the ni on it */ + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_on_peer_net_list); + + return lpni; + } + + /* there are more nis left */ + lpni = list_entry(prev->lpni_on_peer_net_list.next, + struct lnet_peer_ni, lpni_on_peer_net_list); + + return lpni; +} + +bool +lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) +{ + int i; + + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref_nids[i] == ni->ni_nid) + return true; + } + return false; +} + +lnet_nid_t +lnet_peer_primary_nid_locked(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); + } + + return primary_nid; +} + +lnet_nid_t +LNetPrimaryNID(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + int rc = 0; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(nid, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out_unlock; + } + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; +} +EXPORT_SYMBOL(LNetPrimaryNID); + +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + +static int +lnet_peer_setup_hierarchy(struct lnet_peer *lp, struct lnet_peer_ni *lpni, + lnet_nid_t nid) +{ + struct lnet_peer_net *lpn = NULL; + struct lnet_peer_table *ptable; + __u32 net_id = LNET_NIDNET(nid); + + /* + * Create the peer_ni, peer_net, and peer if they don't exist + * yet. + */ + if (lp) { + lpn = lnet_peer_get_net_locked(lp, net_id); + } else { + lp = lnet_peer_alloc(nid); + if (!lp) + goto out_enomem; + } + + if (!lpn) { + lpn = lnet_peer_net_alloc(net_id); + if (!lpn) + goto out_maybe_free_lp; + } + + if (!lpni) { + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_maybe_free_lpn; + } + + /* Install the new peer_ni */ + lnet_net_lock(LNET_LOCK_EX); + /* Add peer_ni to global peer table hash, if necessary. */ + if (list_empty(&lpni->lpni_hashlist)) { + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, + &ptable->pt_hash[lnet_nid2peerhash(nid)]); + ptable->pt_version++; + atomic_inc(&ptable->pt_number); + atomic_inc(&lpni->lpni_refcount); + } + + /* Detach the peer_ni from an existing peer, if necessary. 
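+	 * lnet_try_destroy_peer_hierarchy_locked() also frees the old
+	 * peer_net and peer if the detach leaves them empty.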
*/ + if (lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer != lp) + lnet_try_destroy_peer_hierarchy_locked(lpni); + + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + list_add_tail(&lpni->lpni_on_peer_net_list, &lpn->lpn_peer_nis); + + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + lpn->lpn_peer = lp; + list_add_tail(&lpn->lpn_on_peer_list, &lp->lp_peer_nets); + } + + /* Add peer to global peer list */ + if (list_empty(&lp->lp_on_lnet_peer_list)) + list_add_tail(&lp->lp_on_lnet_peer_list, &the_lnet.ln_peers); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; + +out_maybe_free_lpn: + if (list_empty(&lpn->lpn_on_peer_list)) + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_maybe_free_lp: + if (list_empty(&lp->lp_on_lnet_peer_list)) + LIBCFS_FREE(lp, sizeof(*lp)); +out_enomem: + return -ENOMEM; +} + +static int +lnet_add_prim_lpni(lnet_nid_t nid) +{ + int rc = 0; + struct lnet_peer *peer; + struct lnet_peer_ni *lpni; + + LASSERT(nid != LNET_NID_ANY); + + /* + * lookup the NID and its peer + * if the peer doesn't exist, create it. + * if this is a non-MR peer then change its state to MR and exit. + * if this is an MR peer and it's a primary NI: NO-OP. + * if this is an MR peer and it's not a primary NI. Operation not + * allowed. + * + * The adding and deleting of peer nis is being serialized through + * the api_mutex. So we can look up peers with the mutex locked + * safely. Only when we need to change the ptable, do we need to + * exclusively lock the lnet_net_lock() + */ + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); + if (rc != 0) + return rc; + lpni = lnet_find_peer_ni_locked(nid); + } + + LASSERT(lpni); + + lnet_peer_ni_decref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + + /* + * If we found a lpni with the same nid as the NID we're trying to + * create, then we're trying to create an already existing lpni + * that belongs to a different peer + */ + if (peer->lp_primary_nid != nid) + return -EEXIST; + + /* + * if we found an lpni that is not a multi-rail, which could occur + * if lpni is already created as a non-mr lpni or we just created + * it, then make sure you indicate that this lpni is a primary mr + * capable peer. + * + * TODO: update flags if necessary + */ + if (!peer->lp_multi_rail && peer->lp_primary_nid == nid) + peer->lp_multi_rail = true; + + return rc; +} + +static int +lnet_add_peer_ni_to_prim_lpni(lnet_nid_t prim_nid, lnet_nid_t nid) +{ + struct lnet_peer *peer, *primary_peer; + struct lnet_peer_ni *lpni = NULL, *klpni = NULL; + + LASSERT(prim_nid != LNET_NID_ANY && nid != LNET_NID_ANY); + + /* + * key nid must be created by this point. If not then this + * operation is not permitted + */ + klpni = lnet_find_peer_ni_locked(prim_nid); + if (!klpni) + return -ENOENT; + + lnet_peer_ni_decref_locked(klpni); + + primary_peer = klpni->lpni_peer_net->lpn_peer; + + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lnet_peer_ni_decref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + /* + * lpni already exists in the system but it belongs to + * a different peer. We can't re-added it + */ + if (peer->lp_primary_nid != prim_nid && peer->lp_multi_rail) { + CERROR("Cannot add NID %s owned by peer %s to peer %s\n", + libcfs_nid2str(lpni->lpni_nid), + libcfs_nid2str(peer->lp_primary_nid), + libcfs_nid2str(prim_nid)); + return -EEXIST; + } else if (peer->lp_primary_nid == prim_nid) { + /* + * found a peer_ni that is already part of the + * peer. This is a no-op operation. 
+ */ + return 0; + } + + /* + * TODO: else if (peer->lp_primary_nid != prim_nid && + * !peer->lp_multi_rail) + * peer is not an MR peer and it will be moved in the next + * step to klpni, so update its flags accordingly. + * lnet_move_peer_ni() + */ + + /* + * TODO: call lnet_update_peer() from here to update the + * flags. This is the case when the lpni you're trying to + * add is already part of the peer. This could've been + * added by the DD previously, so go ahead and do any + * updates to the state if necessary + */ + + } + + /* + * When we get here we either have found an existing lpni, which + * we can switch to the new peer. Or we need to create one and + * add it to the new peer + */ + return lnet_peer_setup_hierarchy(primary_peer, lpni, nid); +} + +/* + * lpni creation initiated due to traffic either sending or receiving. + */ +static int +lnet_peer_ni_traffic_add(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + int rc = 0; + + if (nid == LNET_NID_ANY) + return -EINVAL; + + /* lnet_net_lock is not needed here because ln_api_lock is held */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + /* + * TODO: lnet_update_primary_nid() but not all of it + * only indicate if we're converting this to MR capable + * Can happen due to DD + */ + lnet_peer_ni_decref_locked(lpni); + } else { + rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); + } + + return rc; + +} + +static int +lnet_peer_ni_add_non_mr(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + CERROR("Cannot add %s as non-mr when it already exists\n", + libcfs_nid2str(nid)); + lnet_peer_ni_decref_locked(lpni); + return -EEXIST; + } + + return lnet_peer_setup_hierarchy(NULL, NULL, nid); +} + +/* + * This API handles the following combinations: + * Create a primary NI if only the prim_nid is provided + * Create or add an lpni to a primary NI. Primary NI must've already + * been created + * Create a non-MR peer. + */ +int +lnet_add_peer_ni_to_peer(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) +{ + /* + * Caller trying to setup an MR like peer hierarchy but + * specifying it to be non-MR. This is not allowed. + */ + if (prim_nid != LNET_NID_ANY && + nid != LNET_NID_ANY && !mr) + return -EPERM; + + /* Add the primary NID of a peer */ + if (prim_nid != LNET_NID_ANY && + nid == LNET_NID_ANY && mr) + return lnet_add_prim_lpni(prim_nid); + + /* Add a NID to an existing peer */ + if (prim_nid != LNET_NID_ANY && + nid != LNET_NID_ANY && mr) + return lnet_add_peer_ni_to_prim_lpni(prim_nid, nid); + + /* Add a non-MR peer NI */ + if (((prim_nid != LNET_NID_ANY && + nid == LNET_NID_ANY) || + (prim_nid == LNET_NID_ANY && + nid != LNET_NID_ANY)) && !mr) + return lnet_peer_ni_add_non_mr(prim_nid != LNET_NID_ANY ? + prim_nid : nid); + + return 0; +} + +int +lnet_del_peer_ni_from_peer(lnet_nid_t prim_nid, lnet_nid_t nid) +{ + lnet_nid_t local_nid; + struct lnet_peer *peer; + struct lnet_peer_ni *lpni; + int rc; + + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + local_nid = (nid != LNET_NID_ANY) ? 
nid : prim_nid; + + lpni = lnet_find_peer_ni_locked(local_nid); + if (!lpni) + return -EINVAL; + lnet_peer_ni_decref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; + LASSERT(peer != NULL); + + if (peer->lp_primary_nid == lpni->lpni_nid) { + /* + * deleting the primary ni is equivalent to deleting the + * entire peer + */ + lnet_net_lock(LNET_LOCK_EX); + rc = lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); + + return rc; + } + + lnet_net_lock(LNET_LOCK_EX); + rc = lnet_peer_ni_del_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) +{ + struct lnet_peer_table *ptable; + + LASSERT(atomic_read(&lpni->lpni_refcount) == 0); + LASSERT(lpni->lpni_rtr_refcount == 0); + LASSERT(list_empty(&lpni->lpni_txq)); + LASSERT(lpni->lpni_txqnob == 0); + + lpni->lpni_net = NULL; + + /* remove the peer ni from the zombie list */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + spin_lock(&ptable->pt_zombie_lock); + list_del_init(&lpni->lpni_hashlist); + ptable->pt_zombies--; + spin_unlock(&ptable->pt_zombie_lock); + + LIBCFS_FREE(lpni, sizeof(*lpni)); +} + +struct lnet_peer_ni * +lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) +{ + struct lnet_peer_ni *lpni = NULL; + int rc; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return ERR_PTR(-ESHUTDOWN); + + /* + * find if a peer_ni already exists. + * If so then just return that. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) + return lpni; + + lnet_net_unlock(cpt); + + rc = lnet_peer_ni_traffic_add(nid); + if (rc) { + lpni = ERR_PTR(rc); + goto out_net_relock; + } + + lpni = lnet_find_peer_ni_locked(nid); + LASSERT(lpni); + +out_net_relock: + lnet_net_lock(cpt); + + return lpni; +} + +struct lnet_peer_ni * +lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_peer_ni *lpni = NULL; + int rc; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return ERR_PTR(-ESHUTDOWN); + + /* + * find if a peer_ni already exists. + * If so then just return that. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) + return lpni; + + /* + * Slow path: + * use the lnet_api_mutex to serialize the creation of the peer_ni + * and the creation/deletion of the local ni/net. When a local ni is + * created, if there exists a set of peer_nis on that network, + * they need to be traversed and updated. When a local NI is + * deleted, which could result in a network being deleted, then + * all peer nis on that network need to be removed as well. + * + * Creation through traffic should also be serialized with + * creation through DLC. + */ + lnet_net_unlock(cpt); + mutex_lock(&the_lnet.ln_api_mutex); + /* + * Shutdown is only set under the ln_api_lock, so a single + * check here is sufficent. 
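+	 * The state is re-checked because the net lock was dropped before
+	 * taking the mutex, so shutdown may have started in the meantime.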
+ */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lpni = ERR_PTR(-ESHUTDOWN); + goto out_mutex_unlock; + } + + rc = lnet_peer_ni_traffic_add(nid); + if (rc) { + lpni = ERR_PTR(rc); + goto out_mutex_unlock; + } + + lpni = lnet_find_peer_ni_locked(nid); + LASSERT(lpni); + +out_mutex_unlock: + mutex_unlock(&the_lnet.ln_api_mutex); + lnet_net_lock(cpt); + + return lpni; +} + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; + + cpt = lnet_cpt_of_nid(nid, NULL); + lnet_net_lock(cpt); + + lp = lnet_nid2peerni_locked(nid, cpt); + if (IS_ERR(lp)) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = lp->lpni_alive ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lpni_nid), atomic_read(&lp->lpni_refcount), + aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits, + lp->lpni_rtrcredits, lp->lpni_minrtrcredits, + lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob); + + lnet_peer_ni_decref_locked(lp); + + lnet_net_unlock(cpt); +} + +int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char aliveness[LNET_MAX_STR_LEN], + __u32 *cpt_iter, __u32 *refcount, + __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, + __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits, + __u32 *peer_tx_qnob) +{ + struct lnet_peer_table *peer_table; + struct lnet_peer_ni *lp; + int j; + int lncpt; + bool found = false; + + /* get the number of CPTs */ + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* if the cpt number to be examined is >= the number of cpts in + * the system then indicate that there are no more cpts to examin + */ + if (*cpt_iter >= lncpt) + return -ENOENT; + + /* get the current table */ + peer_table = the_lnet.ln_peer_tables[*cpt_iter]; + /* if the ptable is NULL then there are no more cpts to examine */ + if (peer_table == NULL) + return -ENOENT; + + lnet_net_lock(*cpt_iter); + + for (j = 0; j < LNET_PEER_HASH_SIZE && !found; j++) { + struct list_head *peers = &peer_table->pt_hash[j]; + + list_for_each_entry(lp, peers, lpni_hashlist) { + if (peer_index-- > 0) + continue; + + snprintf(aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lp) || + lnet_peer_aliveness_enabled(lp)) + snprintf(aliveness, LNET_MAX_STR_LEN, + lp->lpni_alive ? "up" : "down"); + + *nid = lp->lpni_nid; + *refcount = atomic_read(&lp->lpni_refcount); + *ni_peer_tx_credits = + lp->lpni_net->net_tunables.lct_peer_tx_credits; + *peer_tx_credits = lp->lpni_txcredits; + *peer_rtr_credits = lp->lpni_rtrcredits; + *peer_min_rtr_credits = lp->lpni_mintxcredits; + *peer_tx_qnob = lp->lpni_txqnob; + + found = true; + } + + } + lnet_net_unlock(*cpt_iter); + + *cpt_iter = lncpt; + + return found ? 
0 : -ENOENT; +} + +int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, + bool *mr, + struct lnet_peer_ni_credit_info __user *peer_ni_info, + struct lnet_ioctl_element_stats __user *peer_ni_stats) +{ + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_net *lpn = NULL; + struct lnet_peer *lp = NULL; + struct lnet_peer_ni_credit_info ni_info; + struct lnet_ioctl_element_stats ni_stats; + int rc; + + lpni = lnet_get_peer_ni_idx_locked(idx, &lpn, &lp); + + if (!lpni) + return -ENOENT; + + *primary_nid = lp->lp_primary_nid; + *mr = lp->lp_multi_rail; + *nid = lpni->lpni_nid; + snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, + lpni->lpni_alive ? "up" : "down"); + + ni_info.cr_refcount = atomic_read(&lpni->lpni_refcount); + ni_info.cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + ni_info.cr_peer_tx_credits = lpni->lpni_txcredits; + ni_info.cr_peer_rtr_credits = lpni->lpni_rtrcredits; + ni_info.cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + ni_info.cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + ni_info.cr_peer_tx_qnob = lpni->lpni_txqnob; + ni_info.cr_ncpt = lpni->lpni_cpt; + + ni_stats.iel_send_count = atomic_read(&lpni->lpni_stats.send_count); + ni_stats.iel_recv_count = atomic_read(&lpni->lpni_stats.recv_count); + ni_stats.iel_drop_count = atomic_read(&lpni->lpni_stats.drop_count); + + /* If copy_to_user fails */ + rc = -EFAULT; + if (copy_to_user(peer_ni_info, &ni_info, sizeof(ni_info))) + goto copy_failed; + + if (copy_to_user(peer_ni_stats, &ni_stats, sizeof(ni_stats))) + goto copy_failed; + + rc = 0; + +copy_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c new file mode 100644 index 0000000000000..f35b67e2d7bba --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -0,0 +1,1849 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2016, Intel Corporation. + * + * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_SMALL_PAGES 1 +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) +#define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ + PAGE_SHIFT) + +static char *forwarding = ""; +module_param(forwarding, charp, 0444); +MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +module_param(tiny_router_buffers, int, 0444); +MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +module_param(small_router_buffers, int, 0444); +MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +module_param(large_router_buffers, int, 0444); +MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router"); +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer"); + +static int auto_down = 1; +module_param(auto_down, int, 0444); +MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(struct lnet_net *net) +{ + /* NI option overrides LNet default */ + if (net->net_tunables.lct_peer_rtr_credits > 0) + return net->net_tunables.lct_peer_rtr_credits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return net->net_tunables.lct_peer_tx_credits; +} + +/* forward ref's */ +static int lnet_router_checker(void *); + +static int check_routers_before_use; +module_param(check_routers_before_use, int, 0444); +MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); + +int avoid_asym_router_failure = 1; +module_param(avoid_asym_router_failure, int, 0644); +MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)"); + +static int dead_router_check_interval = 60; +module_param(dead_router_check_interval, int, 0644); +MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 60; +module_param(live_router_check_interval, int, 0644); +MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +module_param(router_ping_timeout, int, 0644); +MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, + cfs_time_t when) +{ + if (cfs_time_before(when, lp->lpni_timestamp)) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + /* + * This function can be called with different cpt locks being + * held. lpni_alive_count modification needs to be properly protected. 
+ * Significant reads to lpni_alive_count are also protected with + * the same lock + */ + spin_lock(&lp->lpni_lock); + + lp->lpni_timestamp = when; /* update timestamp */ + lp->lpni_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lpni_alive_count != 0 && /* got old news */ + (!lp->lpni_alive) == (!alive)) { /* new date for old news */ + spin_unlock(&lp->lpni_lock); + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + lp->lpni_alive_count++; + lp->lpni_alive = (alive) ? 1 : 0; + lp->lpni_notify = 1; + lp->lpni_notifylnd = notifylnd; + if (lp->lpni_alive) + lp->lpni_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ + + spin_unlock(&lp->lpni_lock); + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lpni_nid), alive); +} + +/* + * This function will always be called with lp->lpni_cpt lock held. + */ +static void +lnet_ni_notify_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) +{ + int alive; + int notifylnd; + + /* Notify only in 1 thread at any time to ensure ordered notification. + * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + spin_lock(&lp->lpni_lock); + + if (lp->lpni_notifying || ni == NULL) { + spin_unlock(&lp->lpni_lock); + return; + } + + lp->lpni_notifying = 1; + + /* + * lp->lpni_notify needs to be protected because it can be set in + * lnet_notify_locked(). + */ + while (lp->lpni_notify) { + alive = lp->lpni_alive; + notifylnd = lp->lpni_notifylnd; + + lp->lpni_notifylnd = 0; + lp->lpni_notify = 0; + + if (notifylnd && ni->ni_net->net_lnd->lnd_notify != NULL) { + spin_unlock(&lp->lpni_lock); + lnet_net_unlock(lp->lpni_cpt); + + /* A new notification could happen now; I'll handle it + * when control returns to me */ + + (ni->ni_net->net_lnd->lnd_notify)(ni, lp->lpni_nid, + alive); + + lnet_net_lock(lp->lpni_cpt); + spin_lock(&lp->lpni_lock); + } + } + + lp->lpni_notifying = 0; + spin_unlock(&lp->lpni_lock); +} + +static void +lnet_rtr_addref_locked(struct lnet_peer_ni *lp) +{ + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + LASSERT(lp->lpni_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lpni_rtr_refcount++; + if (lp->lpni_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + struct lnet_peer_ni *rtr; + + rtr = list_entry(pos, struct lnet_peer_ni, + lpni_rtr_list); + if (rtr->lpni_nid < lp->lpni_nid) + break; + } + + list_add(&lp->lpni_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_ni_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(struct lnet_peer_ni *lp) +{ + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + LASSERT(lp->lpni_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lpni_rtr_refcount--; + if (lp->lpni_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lpni_routes)); + + if (lp->lpni_rcd != NULL) { + list_add(&lp->lpni_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lpni_rcd = NULL; + } + + list_del(&lp->lpni_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_ni_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +struct lnet_remotenet * +lnet_find_rnet_locked(__u32 net) +{ + struct lnet_remotenet *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, struct lnet_remotenet, lrn_list); + 
+ if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded; + __u32 lnd_type; + __u32 seed[2]; + struct timespec64 ts; + struct lnet_ni *ni = NULL; + + if (seeded) + return; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + if (lnd_type != LOLND) + seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type); + } + + ktime_get_ts64(&ts); + cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); + seeded = 1; + return; +} + +/* NB expects LNET_LOCK held */ +static void +lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + + lnet_shuffle_seed(); + + list_for_each(e, &rnet->lrn_routes) { + len++; + } + + /* len+1 positions to add a new entry, also prevents division by 0 */ + offset = cfs_rand() % (len + 1); + list_for_each(e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + list_add(&route->lr_gwlist, &route->lr_gateway->lpni_routes); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, + unsigned int priority) +{ + struct list_head *e; + struct lnet_remotenet *rnet; + struct lnet_remotenet *rnet2; + struct lnet_route *route; + struct lnet_ni *ni; + struct lnet_peer_ni *lpni; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: net %s hops %d priority %u gw %s\n", + libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND || + LNET_NIDNET(gateway) == net || + (hops != LNET_UNDEFINED_HOPS && (hops < 1 || hops > 255))) + return -EINVAL; + + if (lnet_islocalnet(net)) /* it's a local network */ + return -EEXIST; + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + route->lr_hops = hops; + route->lr_net = net; + route->lr_priority = priority; + + lnet_net_lock(LNET_LOCK_EX); + + lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX); + if (IS_ERR(lpni)) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = PTR_ERR(lpni); + if (rc == -EHOSTUNREACH) /* gateway is not on a local net. 
*/ + return rc; /* ignore the route entry */ + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nid2str(gateway)); + return rc; + } + route->lr_gateway = lpni; + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + + rnet2 = lnet_find_rnet_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each(e, &rnet2->lrn_routes) { + struct lnet_route *route2; + + route2 = list_entry(e, struct lnet_route, lr_list); + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT(route2->lr_gateway->lpni_nid != gateway); + } + + if (add_route) { + lnet_peer_ni_addref_locked(route->lr_gateway); /* +1 for notify */ + lnet_add_route_to_rnet(rnet2, route); + + ni = lnet_get_next_ni_locked(route->lr_gateway->lpni_net, NULL); + lnet_net_unlock(LNET_LOCK_EX); + + /* XXX Assume alive */ + if (ni->ni_net->net_lnd->lnd_notify != NULL) + (ni->ni_net->net_lnd->lnd_notify)(ni, gateway, 1); + + lnet_net_lock(LNET_LOCK_EX); + } + + /* -1 for notify or !add_route */ + lnet_peer_ni_decref_locked(route->lr_gateway); + lnet_net_unlock(LNET_LOCK_EX); + + rc = 0; + + if (!add_route) { + rc = -EEXIST; + LIBCFS_FREE(route, sizeof(*route)); + } + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + /* indicate to startup the router checker if configured */ + wake_up(&the_lnet.ln_rc_waitq); + + return rc; +} + +int +lnet_check_routes(void) +{ + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct lnet_route *route2; + struct list_head *e1; + struct list_head *e2; + int cpt; + struct list_head *rn_list; + int i; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + route2 = NULL; + list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; + + route = list_entry(e2, struct lnet_route, + lr_list); + + if (route2 == NULL) { + route2 = route; + continue; + } + + if (route->lr_gateway->lpni_net == + route2->lr_gateway->lpni_net) + continue; + + nid1 = route->lr_gateway->lpni_nid; + nid2 = route2->lr_gateway->lpni_nid; + net = rnet->lrn_net; + + lnet_net_unlock(cpt); + + CERROR("Routes to %s via %s and %s not " + "supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } + + lnet_net_unlock(cpt); + return 0; +} + +int +lnet_del_route(__u32 net, lnet_nid_t gw_nid) +{ + struct lnet_peer_ni *gateway; + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + struct list_head *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); + +again: + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, struct 
lnet_route, lr_list); + + gateway = route->lr_gateway; + if (!(gw_nid == LNET_NID_ANY || + gw_nid == gateway->lpni_nid)) + continue; + + list_del(&route->lr_list); + list_del(&route->lr_gwlist); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(gateway); + lnet_peer_ni_decref_locked(gateway); + + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_routes (void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg) +{ + int i, rc = -ENOENT, j; + + if (the_lnet.ln_rtrpools == NULL) + return rc; + + for (i = 0; i < LNET_NRBPOOLS; i++) { + struct lnet_rtrbufpool *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, j, the_lnet.ln_rtrpools) { + if (i++ != idx) + continue; + + pool_cfg->pl_pools[i].pl_npages = rbp[i].rbp_npages; + pool_cfg->pl_pools[i].pl_nbuffers = rbp[i].rbp_nbuffers; + pool_cfg->pl_pools[i].pl_credits = rbp[i].rbp_credits; + pool_cfg->pl_pools[i].pl_mincredits = rbp[i].rbp_mincredits; + rc = 0; + break; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + lnet_net_lock(LNET_LOCK_EX); + pool_cfg->pl_routing = the_lnet.ln_routing; + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive, __u32 *priority) +{ + struct list_head *e1; + struct list_head *e2; + struct lnet_remotenet *rnet; + struct lnet_route *route; + int cpt; + int i; + struct list_head *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, struct lnet_remotenet, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, struct lnet_route, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *priority = route->lr_priority; + *gateway = route->lr_gateway->lpni_nid; + *alive = lnet_is_route_alive(route); + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +void +lnet_swap_pinginfo(struct lnet_ping_info *info) +{ + int i; + struct lnet_ni_status *stat; + + __swab32s(&info->pi_magic); + __swab32s(&info->pi_features); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } + return; +} + +/** + * parse router-checker pinginfo, record number of down NIs for remote + * networks on that router. + */ +static void +lnet_parse_rc_info(struct lnet_rc_data *rcd) +{ + struct lnet_ping_info *info = rcd->rcd_pinginfo; + struct lnet_peer_ni *gw = rcd->rcd_gateway; + struct lnet_route *rte; + + if (!gw->lpni_alive) + return; + + /* + * Protect gw->lpni_ping_feats. This can be set from + * lnet_notify_locked with different locks being held + */ + spin_lock(&gw->lpni_lock); + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(info); + + /* NB always racing with network! 
*/ + if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NET, "%s: Unexpected magic %08x\n", + libcfs_nid2str(gw->lpni_nid), info->pi_magic); + gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; + spin_unlock(&gw->lpni_lock); + return; + } + + gw->lpni_ping_feats = info->pi_features; + if ((gw->lpni_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lpni_nid), gw->lpni_ping_feats); + spin_unlock(&gw->lpni_lock); + return; /* nothing I can understand */ + } + + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) { + spin_unlock(&gw->lpni_lock); + return; /* can't carry NI status info */ + } + + list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { + int down = 0; + int up = 0; + int i; + + if ((gw->lpni_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) { + rte->lr_downis = 1; + continue; + } + + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + struct lnet_ni_status *stat = &info->pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(gw->lpni_nid)); + gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; + spin_unlock(&gw->lpni_lock); + return; + } + + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + down++; + continue; + } + + if (stat->ns_status == LNET_NI_STATUS_UP) { + if (LNET_NIDNET(nid) == rte->lr_net) { + up = 1; + break; + } + continue; + } + + CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(gw->lpni_nid), stat->ns_status); + gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; + spin_unlock(&gw->lpni_lock); + return; + } + + if (up) { /* ignore downed NIs if NI for dest network is up */ + rte->lr_downis = 0; + continue; + } + /* if @down is zero and this route is single-hop, it means + * we can't find NI for target network */ + if (down == 0 && rte->lr_hops == 1) + down = 1; + + rte->lr_downis = down; + } + + spin_unlock(&gw->lpni_lock); +} + +static void +lnet_router_checker_event(struct lnet_event *event) +{ + struct lnet_rc_data *rcd = event->md.user_ptr; + struct lnet_peer_ni *lp; + + LASSERT(rcd != NULL); + + if (event->unlinked) { + LNetInvalidateMDHandle(&rcd->rcd_mdh); + return; + } + + LASSERT(event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + lp = rcd->rcd_gateway; + LASSERT(lp != NULL); + + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lpni_cpt); + if (!lnet_isrouter(lp) || lp->lpni_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } + + if (event->type == LNET_EVENT_SEND) { + lp->lpni_ping_notsent = 0; + if (event->status == 0) + goto out; + } + + /* LNET_EVENT_REPLY */ + /* A successful REPLY means the router is up. If _any_ comms + * to the router fail I assume it's down (this will happen if + * we ping alive routers to try to detect router death before + * apps get burned). */ + + lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + /* The router checker will wake up very shortly and do the + * actual notification. + * XXX If 'lp' stops being a router before then, it will still + * have the notification pending!!! 
*/ + + if (avoid_asym_router_failure && event->status == 0) + lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lpni_cpt); +} + +static void +lnet_wait_known_routerstate(void) +{ + struct lnet_peer_ni *rtr; + struct list_head *entry; + int all_known; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); + + spin_lock(&rtr->lpni_lock); + + if (rtr->lpni_alive_count == 0) { + all_known = 0; + spin_unlock(&rtr->lpni_lock); + break; + } + spin_unlock(&rtr->lpni_lock); + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } +} + +void +lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net) +{ + struct lnet_route *rte; + + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { + if (rte->lr_net == net) { + rte->lr_downis = 0; + break; + } + } + } +} + +static void +lnet_update_ni_status_locked(void) +{ + struct lnet_ni *ni = NULL; + long now; + int timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + + MAX(live_router_check_interval, dead_router_check_interval); + + now = cfs_time_current_sec(); + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + if (ni->ni_net->net_lnd->lnd_type == LOLND) + continue; + + if (now < ni->ni_last_alive + timeout) + continue; + + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } + + LASSERT(ni->ni_status != NULL); + + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } +} + +static void +lnet_destroy_rc_data(struct lnet_rc_data *rcd) +{ + LASSERT(list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT(LNetMDHandleIsInvalid(rcd->rcd_mdh)); + + if (rcd->rcd_gateway != NULL) { + int cpt = rcd->rcd_gateway->lpni_cpt; + + lnet_net_lock(cpt); + lnet_peer_ni_decref_locked(rcd->rcd_gateway); + lnet_net_unlock(cpt); + } + + if (rcd->rcd_pinginfo != NULL) + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + + LIBCFS_FREE(rcd, sizeof(*rcd)); +} + +static struct lnet_rc_data * +lnet_create_rc_data_locked(struct lnet_peer_ni *gateway) +{ + struct lnet_rc_data *rcd = NULL; + struct lnet_ping_info *pi; + int rc; + int i; + + lnet_net_unlock(gateway->lpni_cpt); + + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; + + LNetInvalidateMDHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) + goto out; + + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pinginfo = pi; + + LASSERT(!LNetEQHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((struct lnet_md){.start = pi, + .user_ptr = rcd, + .length = LNET_PINGINFO_SIZE, + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out; + } + LASSERT(rc == 0); + + lnet_net_lock(gateway->lpni_cpt); + /* router table 
changed or someone has created rcd for this gateway */ + if (!lnet_isrouter(gateway) || gateway->lpni_rcd != NULL) { + lnet_net_unlock(gateway->lpni_cpt); + goto out; + } + + lnet_peer_ni_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lpni_rcd = rcd; + gateway->lpni_ping_notsent = 0; + + return rcd; + +out: + if (rcd != NULL) { + if (!LNetMDHandleIsInvalid(rcd->rcd_mdh)) { + rc = LNetMDUnlink(rcd->rcd_mdh); + LASSERT(rc == 0); + } + lnet_destroy_rc_data(rcd); + } + + lnet_net_lock(gateway->lpni_cpt); + return gateway->lpni_rcd; +} + +static int +lnet_router_check_interval(struct lnet_peer_ni *rtr) +{ + int secs; + + secs = rtr->lpni_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; + + return secs; +} + +static void +lnet_ping_router_locked(struct lnet_peer_ni *rtr) +{ + struct lnet_rc_data *rcd = NULL; + cfs_time_t now = cfs_time_current(); + int secs; + struct lnet_ni *ni; + + lnet_peer_ni_addref_locked(rtr); + + if (rtr->lpni_ping_deadline != 0 && /* ping timed out? */ + cfs_time_after(now, rtr->lpni_ping_deadline)) + lnet_notify_locked(rtr, 1, 0, now); + + /* Run any outstanding notifications */ + ni = lnet_get_next_ni_locked(rtr->lpni_net, NULL); + lnet_ni_notify_locked(ni, rtr); + + if (!lnet_isrouter(rtr) || + the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router table changed or router checker is shutting down */ + lnet_peer_ni_decref_locked(rtr); + return; + } + + rcd = rtr->lpni_rcd != NULL ? + rtr->lpni_rcd : lnet_create_rc_data_locked(rtr); + + if (rcd == NULL) + return; + + secs = lnet_router_check_interval(rtr); + + CDEBUG(D_NET, + "rtr %s %d: deadline %lu ping_notsent %d alive %d " + "alive_count %d lpni_ping_timestamp %lu\n", + libcfs_nid2str(rtr->lpni_nid), secs, + rtr->lpni_ping_deadline, rtr->lpni_ping_notsent, + rtr->lpni_alive, rtr->lpni_alive_count, rtr->lpni_ping_timestamp); + + if (secs != 0 && !rtr->lpni_ping_notsent && + cfs_time_after(now, cfs_time_add(rtr->lpni_ping_timestamp, + cfs_time_seconds(secs)))) { + int rc; + struct lnet_process_id id; + struct lnet_handle_md mdh; + + id.nid = rtr->lpni_nid; + id.pid = LNET_PID_LUSTRE; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); + + rtr->lpni_ping_notsent = 1; + rtr->lpni_ping_timestamp = now; + + mdh = rcd->rcd_mdh; + + if (rtr->lpni_ping_deadline == 0) { + rtr->lpni_ping_deadline = + cfs_time_shift(router_ping_timeout); + } + + lnet_net_unlock(rtr->lpni_cpt); + + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + lnet_net_lock(rtr->lpni_cpt); + if (rc != 0) + rtr->lpni_ping_notsent = 0; /* no event pending */ + } + + lnet_peer_ni_decref_locked(rtr); + return; +} + +int +lnet_router_checker_start(void) +{ + int rc; + int eqsz = 0; + struct task_struct *task; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be" + " set if 'check_routers_before_use' is set" + "\n"); + return -EINVAL; + } + + sema_init(&the_lnet.ln_rc_signal, 0); + + rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + return -ENOMEM; + } + + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; + task = kthread_run(lnet_router_checker, NULL, "router_checker"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start router checker thread: %d\n", rc); + /* block until event callback signals exit */ + 
down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return -ENOMEM; + } + + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. */ + lnet_wait_known_routerstate(); + } + + return 0; +} + +void +lnet_router_checker_stop (void) +{ + int rc; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + /* wakeup the RC thread if it's sleeping */ + wake_up(&the_lnet.ln_rc_waitq); + + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + return; +} + +static void +lnet_prune_rc_data(int wait_unlink) +{ + struct lnet_rc_data *rcd; + struct lnet_rc_data *tmp; + struct lnet_peer_ni *lp; + struct list_head head; + int i = 2; + + if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + list_empty(&the_lnet.ln_rcd_deathrow) && + list_empty(&the_lnet.ln_rcd_zombie))) + return; + + INIT_LIST_HEAD(&head); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router checker is stopping, prune all */ + list_for_each_entry(lp, &the_lnet.ln_routers, + lpni_rtr_list) { + if (lp->lpni_rcd == NULL) + continue; + + LASSERT(list_empty(&lp->lpni_rcd->rcd_list)); + list_add(&lp->lpni_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lpni_rcd = NULL; + } + } + + /* unlink all RCDs on deathrow list */ + list_splice_init(&the_lnet.ln_rcd_deathrow, &head); + + if (!list_empty(&head)) { + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry(rcd, &head, rcd_list) + LNetMDUnlink(rcd->rcd_mdh); + + lnet_net_lock(LNET_LOCK_EX); + } + + list_splice_init(&head, &the_lnet.ln_rcd_zombie); + + /* release all zombie RCDs */ + while (!list_empty(&the_lnet.ln_rcd_zombie)) { + list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, + rcd_list) { + if (LNetMDHandleIsInvalid(rcd->rcd_mdh)) + list_move(&rcd->rcd_list, &head); + } + + wait_unlink = wait_unlink && + !list_empty(&the_lnet.ln_rcd_zombie); + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&head)) { + rcd = list_entry(head.next, + struct lnet_rc_data, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + + if (!wait_unlink) + return; + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 4); + + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * This function is called to check if the RC should block indefinitely. + * It's called from lnet_router_checker() as well as being passed to + * wait_event_interruptible() to avoid the lost wake_up problem. 
+ * + * When it's called from wait_event_interruptible() it is necessary to + * also not sleep if the rc state is not running to avoid a deadlock + * when the system is shutting down + */ +static inline bool +lnet_router_checker_active(void) +{ + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) + return true; + + /* Router Checker thread needs to run when routing is enabled in + * order to call lnet_update_ni_status_locked() */ + if (the_lnet.ln_routing) + return true; + + return !list_empty(&the_lnet.ln_routers) && + (live_router_check_interval > 0 || + dead_router_check_interval > 0); +} + +static int +lnet_router_checker(void *arg) +{ + struct lnet_peer_ni *rtr; + struct list_head *entry; + + cfs_block_allsigs(); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); + + cpt2 = rtr->lpni_cpt; + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ + + /* Call schedule_timeout() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + /* if there are any routes then wakeup every second. If + * there are no routes then sleep indefinitely until woken + * up by a user adding a route */ + if (!lnet_router_checker_active()) + wait_event_interruptible(the_lnet.ln_rc_waitq, + lnet_router_checker_active()); + else + wait_event_interruptible_timeout(the_lnet.ln_rc_waitq, + false, + cfs_time_seconds(1)); + } + + lnet_prune_rc_data(1); /* wait for UNLINK */ + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; +} + +void +lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages) +{ + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +static struct lnet_rtrbuf * +lnet_new_rtrbuf(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); + struct page *page; + struct lnet_rtrbuf *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + GFP_KERNEL | __GFP_ZERO); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +static void +lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + struct lnet_rtrbuf *rb; + struct list_head tmp; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + INIT_LIST_HEAD(&tmp); + + lnet_net_lock(cpt); + list_splice_init(&rbp->rbp_msgs, 
&tmp); + lnet_drop_routed_msgs_locked(&tmp, cpt); + list_splice_init(&rbp->rbp_bufs, &tmp); + rbp->rbp_req_nbuffers = 0; + rbp->rbp_nbuffers = rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; + lnet_net_unlock(cpt); + + /* Free buffers on the free list. */ + while (!list_empty(&tmp)) { + rb = list_entry(tmp.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } +} + +static int +lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) +{ + struct list_head rb_list; + struct lnet_rtrbuf *rb; + int num_rb; + int num_buffers = 0; + int old_req_nbufs; + int npages = rbp->rbp_npages; + + lnet_net_lock(cpt); + /* If we are called for less buffers than already in the pool, we + * just lower the req_nbuffers number and excess buffers will be + * thrown away as they are returned to the free list. Credits + * then get adjusted as well. + * If we already have enough buffers allocated to serve the + * increase requested, then we can treat that the same way as we + * do the decrease. */ + num_rb = nbufs - rbp->rbp_nbuffers; + if (nbufs <= rbp->rbp_req_nbuffers || num_rb <= 0) { + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + return 0; + } + /* store the older value of rbp_req_nbuffers and then set it to + * the new request to prevent lnet_return_rx_credits_locked() from + * freeing buffers that we need to keep around */ + old_req_nbufs = rbp->rbp_req_nbuffers; + rbp->rbp_req_nbuffers = nbufs; + lnet_net_unlock(cpt); + + INIT_LIST_HEAD(&rb_list); + + /* allocate the buffers on a local list first. If all buffers are + * allocated successfully then join this list to the rbp buffer + * list. If not then free all allocated buffers. */ + while (num_rb-- > 0) { + rb = lnet_new_rtrbuf(rbp, cpt); + if (rb == NULL) { + CERROR("Failed to allocate %d route bufs of %d pages\n", + nbufs, npages); + + lnet_net_lock(cpt); + rbp->rbp_req_nbuffers = old_req_nbufs; + lnet_net_unlock(cpt); + + goto failed; + } + + list_add(&rb->rb_list, &rb_list); + num_buffers++; + } + + lnet_net_lock(cpt); + + list_splice_tail(&rb_list, &rbp->rbp_bufs); + rbp->rbp_nbuffers += num_buffers; + rbp->rbp_credits += num_buffers; + rbp->rbp_mincredits = rbp->rbp_credits; + /* We need to schedule blocked msg using the newly + * added buffers. 
*/ + while (!list_empty(&rbp->rbp_bufs) && + !list_empty(&rbp->rbp_msgs)) + lnet_schedule_blocked_locked(rbp); + + lnet_net_unlock(cpt); + + return 0; + +failed: + while (!list_empty(&rb_list)) { + rb = list_entry(rb_list.next, struct lnet_rtrbuf, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + } + + return -ENOMEM; +} + +static void +lnet_rtrpool_init(struct lnet_rtrbufpool *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(int keep_pools) +{ + struct lnet_rtrbufpool *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[LNET_TINY_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_SMALL_BUF_IDX], i); + lnet_rtrpool_free_bufs(&rtrp[LNET_LARGE_BUF_IDX], i); + } + + if (!keep_pools) { + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; + } +} + +static int +lnet_nrb_tiny_calculate(void) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -EINVAL; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(void) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -EINVAL; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(void) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -EINVAL; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + struct lnet_rtrbufpool *rtrp; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either " + "'enabled' or 'disabled'\n"); + return -EINVAL; + } + + nrb_tiny = lnet_nrb_tiny_calculate(); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(struct lnet_rtrbufpool)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[LNET_TINY_BUF_IDX], 0); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[LNET_SMALL_BUF_IDX], + LNET_NRB_SMALL_PAGES); + rc = 
lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], + nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[LNET_LARGE_BUF_IDX], + LNET_NRB_LARGE_PAGES); + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], + nrb_large, i); + if (rc != 0) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + failed: + lnet_rtrpools_free(0); + return rc; +} + +static int +lnet_rtrpools_adjust_helper(int tiny, int small, int large) +{ + int nrb = 0; + int rc = 0; + int i; + struct lnet_rtrbufpool *rtrp; + + /* If the provided values for each buffer pool are different than the + * configured values, we need to take action. */ + if (tiny >= 0) { + tiny_router_buffers = tiny; + nrb = lnet_nrb_tiny_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + if (small >= 0) { + small_router_buffers = small; + nrb = lnet_nrb_small_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + if (large >= 0) { + large_router_buffers = large; + nrb = lnet_nrb_large_calculate(); + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], + nrb, i); + if (rc != 0) + return rc; + } + } + + return 0; +} + +int +lnet_rtrpools_adjust(int tiny, int small, int large) +{ + /* this function doesn't revert the changes if adding new buffers + * failed. It's up to the user space caller to revert the + * changes. */ + + if (!the_lnet.ln_routing) + return 0; + + return lnet_rtrpools_adjust_helper(tiny, small, large); +} + +int +lnet_rtrpools_enable(void) +{ + int rc = 0; + + if (the_lnet.ln_routing) + return 0; + + if (the_lnet.ln_rtrpools == NULL) + /* If routing is turned off, and we have never + * initialized the pools before, just call the + * standard buffer pool allocation routine as + * if we are just configuring this for the first + * time. */ + rc = lnet_rtrpools_alloc(1); + else + rc = lnet_rtrpools_adjust_helper(0, 0, 0); + if (rc != 0) + return rc; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + + the_lnet.ln_ping_info->pi_features &= ~LNET_PING_FEAT_RTE_DISABLED; + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_rtrpools_disable(void) +{ + if (!the_lnet.ln_routing) + return; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 0; + the_lnet.ln_ping_info->pi_features |= LNET_PING_FEAT_RTE_DISABLED; + + tiny_router_buffers = 0; + small_router_buffers = 0; + large_router_buffers = 0; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rtrpools_free(1); +} + +int +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) +{ + struct lnet_peer_ni *lp = NULL; + cfs_time_t now = cfs_time_current(); + int cpt = lnet_cpt_of_nid(nid, ni); + + LASSERT (!in_interrupt ()); + + CDEBUG (D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... 
*/ + if (cfs_time_after(when, now)) { + CWARN("Ignoring prediction from %s of %s %s " + "%ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + cfs_duration_sec(cfs_time_sub(when, now))); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + lnet_net_lock(cpt); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_ni_locked(nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* + * It is possible for this function to be called for the same peer + * but with different NIs. We want to synchronize the notification + * between the different calls. So we will use the lpni_cpt to + * grab the net lock. + */ + if (lp->lpni_cpt != cpt) { + lnet_net_unlock(cpt); + cpt = lp->lpni_cpt; + lnet_net_lock(cpt); + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lpni_last_alive) + when = lp->lpni_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + if (ni != NULL) + lnet_ni_notify_locked(ni, lp); + + lnet_peer_ni_decref_locked(lp); + + lnet_net_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(lnet_notify); diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c new file mode 100644 index 0000000000000..2f6b0c76d7b70 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -0,0 +1,988 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. 
*/ + +static struct ctl_table_header *lnet_table_header = NULL; + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int __proc_lnet_stats(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + int rc; + struct lnet_counters *ctrs; + int len; + char *tmpstr; + const int tmpsiz = 256; /* 7 %u and 4 __u64 */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) { + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return -ENOMEM; + } + + lnet_counters_get(ctrs); + + len = snprintf(tmpstr, tmpsiz, + "%u %u %u %u %u %u %u %llu %llu " + "%llu %llu", + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + + LIBCFS_FREE(tmpstr, tmpsiz); + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int +proc_lnet_stats(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_stats); +} + +static int +proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + CLASSERT(sizeof(loff_t) >= 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + 
return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT(tmpstr + tmpsiz - s > 0); + + s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", + "net", "hops", "priority", "state", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + struct lnet_route *route = NULL; + struct lnet_remotenet *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, struct lnet_remotenet, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + struct lnet_route *re = + list_entry(r, struct lnet_route, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + __u32 hops = route->lr_hops; + unsigned int priority = route->lr_priority; + lnet_nid_t nid = route->lr_gateway->lpni_nid; + int alive = lnet_is_route_alive(route); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-8s %4d %8u %7s %s\n", + libcfs_net2str(net), hops, + priority, + alive ? "up" : "down", + libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int +proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT(!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", + "last_ping", "ping_sent", "deadline", + "down_ni", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer_ni *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + struct lnet_peer_ni *lp = + list_entry(r, struct lnet_peer_ni, + lpni_rtr_list); + + if (skip == 0) { + peer = lp; + break; + 
} + + skip--; + r = r->next; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lpni_nid; + cfs_time_t now = cfs_time_current(); + cfs_time_t deadline = peer->lpni_ping_deadline; + int nrefs = atomic_read(&peer->lpni_refcount); + int nrtrrefs = peer->lpni_rtr_refcount; + int alive_cnt = peer->lpni_alive_count; + int alive = peer->lpni_alive; + int pingsent = !peer->lpni_ping_notsent; + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lpni_ping_timestamp)); + int down_ni = 0; + struct lnet_route *rtr; + + if ((peer->lpni_ping_feats & + LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rtr, &peer->lpni_routes, + lr_gwlist) { + /* downis on any route should be the + * number of downis on the gateway */ + if (rtr->lr_downis != 0) { + down_ni = rtr->lr_downis; + break; + } + } + } + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, + cfs_duration_sec(cfs_time_sub(deadline, now)), + down_ni, libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +/* TODO: there should be no direct access to ptable. 
We should add a set + * of APIs that give access to the ptable and its members */ +static int +proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr = NULL; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + if (write) { + int i; + struct lnet_peer_ni *peer; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + for (hash = 0; hash < LNET_PEER_HASH_SIZE; hash++) { + list_for_each_entry(peer, + &ptable->pt_hash[hash], + lpni_hashlist) { + peer->lpni_mintxcredits = + peer->lpni_txcredits; + peer->lpni_minrtrcredits = + peer->lpni_rtrcredits; + } + } + lnet_net_unlock(i); + } + *ppos += *lenp; + return 0; + } + + if (*lenp == 0) + return 0; + + CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS); + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT(tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer_ni *peer; + struct list_head *p; + int skip; + + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + struct lnet_peer_ni *lp = + list_entry(p, struct lnet_peer_ni, + lpni_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lpni_hashlist */ + if (lp->lpni_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lpni_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lpni_nid; + int nrefs = atomic_read(&peer->lpni_refcount); + int lastalive = -1; + char *aliveness = "NA"; + int maxcr = (peer->lpni_net) ? + peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; + int txcr = peer->lpni_txcredits; + int mintxcr = peer->lpni_mintxcredits; + int rtrcr = peer->lpni_rtrcredits; + int minrtrcr = peer->lpni_minrtrcredits; + int txqnob = peer->lpni_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = peer->lpni_alive ? 
"up" : "down"; + + if (lnet_peer_aliveness_enabled(peer)) { + cfs_time_t now = cfs_time_current(); + cfs_duration_t delta; + + delta = cfs_time_sub(now, peer->lpni_last_alive); + lastalive = cfs_duration_sec(delta); + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + + lnet_net_unlock(cpt); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT(tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int __proc_lnet_buffers(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += snprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT(tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + struct lnet_rtrbufpool *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +static int +proc_lnet_buffers(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_buffers); +} + +static int +proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + if (*lenp == 0) + return 0; + + if (write) { + /* Just reset the min stat. 
*/ + struct lnet_ni *ni; + struct lnet_net *net; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + struct lnet_tx_queue *tq; + int i; + int j; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + tq->tq_credits_min = tq->tq_credits; + if (i != 0) + lnet_net_unlock(i); + } + } + } + lnet_net_unlock(0); + *ppos += *lenp; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + } else { + struct lnet_ni *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + ni = lnet_get_ni_idx_locked(skip); + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + long now = cfs_time_current_sec(); + int last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_last_alive; + + /* @lo forever alive */ + if (ni->ni_net->net_lnd->lnd_type == LOLND) + last_alive = 0; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_net->net_tunables.lct_peer_tx_credits, + ni->ni_net->net_tunables.lct_peer_rtr_credits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source " + "NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +static int __proc_lnet_portal_rotor(void *data, int write, + loff_t pos, void __user *buffer, int nob) +{ + const int buf_len = 128; + 
char *buf; + char *tmp; + int rc; + int i; + + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + if (!write) { + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = snprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + goto out; + } + + rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); + if (rc < 0) + goto out; + + tmp = cfs_trimwhite(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); +out: + LIBCFS_FREE(buf, buf_len); + return rc; +} + +static int +proc_lnet_portal_rotor(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_lnet_portal_rotor); +} + + +static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + INIT_CTL_NAME + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + INIT_CTL_NAME + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + INIT_CTL_NAME + .procname = "peers", + .mode = 0644, + .proc_handler = &proc_lnet_peers, + }, + { + INIT_CTL_NAME + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + INIT_CTL_NAME + .procname = "nis", + .mode = 0644, + .proc_handler = &proc_lnet_nis, + }, + { + INIT_CTL_NAME + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { .procname = NULL } +}; + +static struct ctl_table top_table[] = { + { + INIT_CTL_NAME + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { .procname = NULL } +}; + +void +lnet_proc_init(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); +#endif +} + +void +lnet_proc_fini(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +#endif +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c new file mode 100644 index 0000000000000..512dbb5b8a2f1 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -0,0 +1,526 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/brw_test.c + * + * Author: Isaac Huang + */ + +#include "selftest.h" + +static int brw_srv_workitems = SFW_TEST_WI_MAX; +module_param(brw_srv_workitems, int, 0644); +MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems"); + +static int brw_inject_errors; +module_param(brw_inject_errors, int, 0644); +MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default"); + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +static void +brw_client_fini (sfw_test_instance_t *tsi) +{ + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(tsi->tsi_is_client); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = tsu->tsu_private; + if (bulk == NULL) + continue; + + srpc_free_bulk(bulk); + tsu->tsu_private = NULL; + } +} + +static int +brw_client_init (sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int flags; + int off; + int npg; + int len; + int opc; + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + /* NB: this is not going to work for variable page size, + * but we have to keep it for compatibility */ + len = npg * PAGE_SIZE; + off = 0; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + off = breq->blk_offset & ~PAGE_MASK; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + if (off % BRW_MSIZE != 0) + return -EINVAL; + + if (npg > LNET_MAX_IOV || npg <= 0) + return -EINVAL; + + if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) + return -EINVAL; + + if (flags != LST_BRW_CHECK_NONE && + flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) + return -EINVAL; + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL), + off, npg, len, opc == LST_BRW_READ); + if (bulk == NULL) { + brw_client_fini(tsi); + return -ENOMEM; + } + + tsu->tsu_private = bulk; + } + + return 0; +} + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +int brw_inject_one_error(void) +{ + struct timespec64 ts; + + if (brw_inject_errors <= 0) return 0; + + ktime_get_ts64(&ts); + + if (((ts.tv_nsec / NSEC_PER_USEC) & 1) == 0) + return 0; + + return brw_inject_errors--; +} + +static void +brw_fill_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + int i; + + LASSERT(addr 
!= NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + } + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) + memcpy(addr + i, &magic, BRW_MSIZE); + return; + } + LBUG(); +} + +static int +brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) +{ + char *addr = page_address(pg) + off; + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT(addr != NULL); + LASSERT(off % BRW_MSIZE == 0 && len % BRW_MSIZE == 0); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + + if (len > BRW_MSIZE) { + addr += len - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) + goto bad_data; + } + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < len; i += BRW_MSIZE) { + data = *(__u64 *)(addr + i); + if (data != magic) + goto bad_data; + } + return 0; + } + + LBUG(); + +bad_data: + CERROR ("Bad data in page %p: %#llx, %#llx expected\n", + pg, data, magic); + return 1; +} + +static void +brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].kiov_page; + off = bk->bk_iovs[i].kiov_offset; + len = bk->bk_iovs[i].kiov_len; + brw_fill_page(pg, off, len, pattern, magic); + } +} + +static int +brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + int off; + int len; + + pg = bk->bk_iovs[i].kiov_page; + off = bk->bk_iovs[i].kiov_offset; + len = bk->bk_iovs[i].kiov_len; + if (brw_check_page(pg, off, len, pattern, magic) != 0) { + CERROR("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc(sfw_test_unit_t *tsu, + struct lnet_process_id dest, srpc_client_rpc_t **rpcpp) +{ + srpc_bulk_t *bulk = tsu->tsu_private; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_client_rpc_t *rpc; + srpc_brw_reqst_t *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + int off; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + off = breq->blk_offset; + npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + req->brw_flags = flags; + req->brw_rw = opc; 
+ req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + __u64 magic = BRW_MAGIC; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_msg_t *msg = &rpc->crpc_replymsg; + srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + return; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG(reply->brw_status ? D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + return; + } + + if (reqst->brw_rw == LST_BRW_WRITE) + return; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } + + return; +} + +static void +brw_server_rpc_done(srpc_server_rpc_t *rpc) +{ + srpc_bulk_t *blk = rpc->srpc_bulk; + + if (blk == NULL) + return; + + if (rpc->srpc_status != 0) + CERROR("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +static int +brw_bulk_ready(srpc_server_rpc_t *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + srpc_brw_reqst_t *reqst; + srpc_msg_t *reqstmsg; + + LASSERT (rpc->srpc_bulk != NULL); + LASSERT (rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR ("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? 
"READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR ("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +static int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT (sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +sfw_test_client_ops_t brw_test_client; +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +srpc_service_t brw_test_service; +void brw_init_test_service(void) +{ + + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c new file mode 100644 index 0000000000000..5476097fbc1ba --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -0,0 +1,924 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * IOC handle in kernel + * + * Author: Liang Zhen + */ + +#include +#include +#include +#include "console.h" + +static int +lst_session_new_ioctl(struct lstio_session_new_args *args) +{ + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_ses_namep, + args->lstio_ses_nmlen)) { + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_ses_nmlen] = 0; + + rc = lstcon_session_new(name, + args->lstio_ses_key, + args->lstio_ses_feats, + args->lstio_ses_timeout, + args->lstio_ses_force, + args->lstio_ses_idp); + + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return rc; +} + +static int +lst_session_end_ioctl(struct lstio_session_end_args *args) +{ + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; + + return lstcon_session_end(); +} + +static int +lst_session_info_ioctl(struct lstio_session_info_args *args) +{ + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, + args->lstio_ses_featp, + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); +} + +static int +lst_debug_ioctl(struct lstio_debug_args *args) +{ + char *name = NULL; + int client = 1; + int rc; + + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_dbg_namep, + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_dbg_nmlen] = 0; + } + + rc = -EINVAL; + + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = 
lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; + + case LST_OPC_BATCHSRV: + client = 0; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; + + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; + + case LST_OPC_GROUP: + if (name == NULL) + goto out; + + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; + + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; + + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; + + default: + break; + } + +out: + if (name != NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return rc; +} + +static int +lst_group_add_ioctl(struct lstio_group_add_args *args) +{ + char *name; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL|| + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_add(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_del_ioctl(struct lstio_group_del_args *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_del(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_group_update_ioctl(struct lstio_group_update_args *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +static int +lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) +{ + unsigned feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if 
(args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +static int +lst_group_list_ioctl(struct lstio_group_list_args *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +static int +lst_group_info_ioctl(struct lstio_group_info_args *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + return -EFAULT; + + return 0; +} + +static int +lst_batch_add_ioctl(struct lstio_batch_add_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_run_ioctl(struct lstio_batch_run_args *args) +{ + 
int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_query_ioctl(struct lstio_batch_query_args *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +static int +lst_batch_list_ioctl(struct lstio_batch_list_args *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +static int +lst_batch_info_ioctl(struct lstio_batch_info_args *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + 
return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return rc; +} + +static int +lst_stat_query_ioctl(struct lstio_stat_args *args) +{ + int rc; + char *name = NULL; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL) { + if (args->lstio_sta_count <= 0) + return -EINVAL; + + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else if (args->lstio_sta_namep != NULL) { + if (args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + rc = copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen); + if (rc == 0) + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + else + rc = -EFAULT; + + } else { + rc = -EINVAL; + } + + if (name != NULL) + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return rc; +} + +static int lst_test_add_ioctl(struct lstio_test_args *args) +{ + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > + PAGE_SIZE - sizeof(lstcon_test_t))) + return -EINVAL; + + LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); + if (batch_name == NULL) + return rc; + + LIBCFS_ALLOC(src_name, args->lstio_tes_sgrp_nmlen + 1); + if (src_name == NULL) + goto out; + + LIBCFS_ALLOC(dst_name, args->lstio_tes_dgrp_nmlen + 1); + if (dst_name == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == NULL) + goto out; + if (copy_from_user(param, args->lstio_tes_param, + args->lstio_tes_param_len)) { + rc = -EFAULT; + goto out; + } + } + + rc = -EFAULT; + if (copy_from_user(batch_name, args->lstio_tes_bat_name, + 
args->lstio_tes_bat_nmlen) || + copy_from_user(src_name, args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dst_name, args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen)) + goto out; + + rc = lstcon_test_add(batch_name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? -EFAULT : 0; +out: + if (batch_name != NULL) + LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); + + if (src_name != NULL) + LIBCFS_FREE(src_name, args->lstio_tes_sgrp_nmlen + 1); + + if (dst_name != NULL) + LIBCFS_FREE(dst_name, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) +{ + char *buf; + struct libcfs_ioctl_data *data; + int opc; + int rc; + + if (cmd != IOC_LIBCFS_LNETST) + return -EINVAL; + + data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); + + opc = data->ioc_u32[0]; + + if (data->ioc_plen1 > PAGE_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) + return -ENOMEM; + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + LIBCFS_FREE(buf, data->ioc_plen1); + return -EFAULT; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = cfs_time_current_sec(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, sizeof(struct lstcon_trans_stat)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((struct lstio_session_new_args *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((struct lstio_session_end_args *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((struct lstio_session_info_args *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((struct lstio_debug_args *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((struct lstio_group_add_args *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((struct lstio_group_del_args *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((struct lstio_group_update_args *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((struct lstio_group_nodes_args *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((struct lstio_group_list_args *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((struct lstio_group_info_args *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((struct lstio_batch_add_args *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((struct lstio_batch_run_args *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((struct lstio_batch_stop_args *)buf); + break; + case LSTIO_BATCH_QUERY: + rc = lst_batch_query_ioctl((struct lstio_batch_query_args *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((struct lstio_batch_list_args *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((struct 
lstio_batch_info_args *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((struct lstio_test_args *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((struct lstio_stat_args *)buf); + break; + default: + rc = -EINVAL; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(struct lstcon_trans_stat))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); + + LIBCFS_FREE(buf, data->ioc_plen1); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c new file mode 100644 index 0000000000000..f9f6c71db2557 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -0,0 +1,1402 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/conctl.c + * + * Console framework rpcs + * + * Author: Liang Zhen + */ + + +#include +#include +#include "timer.h" +#include "conrpc.h" +#include "console.h" + +void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, + lstcon_node_t *, struct lstcon_trans_stat *); + +static void +lstcon_rpc_done(srpc_client_rpc_t *rpc) +{ + lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; + + LASSERT(crpc != NULL && rpc == crpc->crp_rpc); + LASSERT(crpc->crp_posted && !crpc->crp_finished); + + spin_lock(&rpc->crpc_lock); + + if (crpc->crp_trans == NULL) { + /* Orphan RPC is not in any transaction, + * I'm just a poor body and nobody loves me */ + spin_unlock(&rpc->crpc_lock); + + /* release it */ + lstcon_rpc_put(crpc); + return; + } + + /* not an orphan RPC */ + crpc->crp_finished = 1; + + if (crpc->crp_stamp == 0) { + /* not aborted */ + LASSERT (crpc->crp_status == 0); + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = rpc->crpc_status; + } + + /* wakeup (transaction)thread if I'm the last RPC in the transaction */ + if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) + wake_up(&crpc->crp_trans->tas_waitq); + + spin_unlock(&rpc->crpc_lock); +} + +static int +lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) +{ + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, + feats, bulk_npg, bulk_len, + lstcon_rpc_done, (void *)crpc); + if (crpc->crp_rpc == NULL) + return -ENOMEM; + + crpc->crp_trans = NULL; + crpc->crp_node = nd; + crpc->crp_posted = 0; + crpc->crp_finished = 0; + crpc->crp_unpacked = 0; + crpc->crp_status = 0; + crpc->crp_stamp = 0; + crpc->crp_embedded = embedded; + INIT_LIST_HEAD(&crpc->crp_link); + + atomic_inc(&console_session.ses_rpc_counter); + + return 0; +} + +static int +lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) +{ + lstcon_rpc_t *crpc = NULL; + int rc; + + spin_lock(&console_session.ses_rpc_lock); + + if (!list_empty(&console_session.ses_rpc_freelist)) { + crpc = list_entry(console_session.ses_rpc_freelist.next, + lstcon_rpc_t, crp_link); + list_del_init(&crpc->crp_link); + } + + spin_unlock(&console_session.ses_rpc_lock); + + if (crpc == NULL) { + LIBCFS_ALLOC(crpc, sizeof(*crpc)); + if (crpc == NULL) + return -ENOMEM; + } + + rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); + if (rc == 0) { + *crpcpp = crpc; + return 0; + } + + LIBCFS_FREE(crpc, sizeof(*crpc)); + + return rc; +} + +void +lstcon_rpc_put(lstcon_rpc_t *crpc) +{ + srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; + int i; + + LASSERT(list_empty(&crpc->crp_link)); + + for (i = 0; i < bulk->bk_niov; i++) { + if (bulk->bk_iovs[i].kiov_page == NULL) + continue; + + __free_page(bulk->bk_iovs[i].kiov_page); + } + + srpc_client_rpc_decref(crpc->crp_rpc); + + if (crpc->crp_embedded) { + /* embedded RPC, don't recycle it */ + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_embedded = 1; + + } else { + spin_lock(&console_session.ses_rpc_lock); + + list_add(&crpc->crp_link, + &console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + } + + /* RPC is not alive now */ + atomic_dec(&console_session.ses_rpc_counter); +} + +static void +lstcon_rpc_post(lstcon_rpc_t *crpc) +{ + lstcon_rpc_trans_t *trans = crpc->crp_trans; + + LASSERT (trans != NULL); + + atomic_inc(&trans->tas_remaining); + crpc->crp_posted = 1; + + sfw_post_rpc(crpc->crp_rpc); +} + +static char * +lstcon_rpc_trans_name(int 
transop) +{ + if (transop == LST_TRANS_SESNEW) + return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, int transop, + lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_node_t *nd; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ + crpc->crp_stamp != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp == 0) { + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != -ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + continue; + + nd->nd_stamp = crpc->crp_stamp; + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 
1: 0; +} + +int +lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) +{ + lstcon_rpc_t *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT(!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + + rc = (rc > 0)? 0: ((rc < 0)? -EINTR: -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +static int +lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) +{ + lstcon_node_t *nd = crpc->crp_node; + srpc_client_rpc_t *rpc = crpc->crp_rpc; + srpc_generic_reply_t *rep; + + LASSERT (nd != NULL && rpc != NULL); + LASSERT (crpc->crp_stamp != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + return 0; + + nd->nd_stamp = crpc->crp_stamp; + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) +{ + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + int error; + + LASSERT(stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT(crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, " + "RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); + + return; +} + +int +lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + srpc_generic_reply_t *rep; + lstcon_rpc_t *crpc; + srpc_msg_t *msg; + lstcon_node_t *nd; + cfs_duration_t dur; + struct timeval tv; + int error; + + LASSERT(head_up != NULL); + + next = head_up; + + 
list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); + + LASSERT(crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, + (cfs_time_t)console_session.ses_id.ses_stamp); + jiffies_to_timeval(dur, &tv); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(struct lnet_process_id)) || + copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (srpc_generic_reply_t *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(rep->sid)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + error = readent(trans->tas_opc, msg, ent); + if (error != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_rpc_t *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is + * called) because huge timeout for inaccessible network, + * don't make user wait for them, just abandon them, they + * will be recycled in callback */ + + LASSERT(crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT(atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); + + return; +} + +int +lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, + unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_mksn_reqst_t *msrq; + srpc_rmsn_reqst_t *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strlcpy(msrq->mksn_name, console_session.ses_name, + sizeof(msrq->mksn_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_debug_reqst_t *drq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = 
&(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) +{ + lstcon_batch_t *batch; + srpc_batch_reqst_t *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP: + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT (tsb->tsb_index == 0); + + batch = (lstcon_batch_t *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_stat_reqst_t *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +static struct lnet_process_id_packed * +lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) +{ + struct lnet_process_id_packed *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT (i < nkiov); + + pid = (struct lnet_process_id_packed *)page_address(kiov[i].kiov_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +static int +lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, + int dist, int span, int nkiov, lnet_kiov_t *kiov) +{ + struct lnet_process_id_packed *pid; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int start; + int end; + int i = 0; + + LASSERT (dist >= 1); + LASSERT (span >= 1); + LASSERT (grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i++; + continue; + } + + if (i > (end >= start ? 
end : grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +static int +lstcon_pingrpc_prep(struct lst_test_ping_param *param, srpc_test_reqst_t *req) +{ + test_ping_req_t *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +static int +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, srpc_test_reqst_t *req) +{ + test_bulk_req_t *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / + PAGE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +static int +lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, + srpc_test_reqst_t *req) +{ + test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = is_client ? param->blk_cli_off : param->blk_srv_off; + + return 0; +} + +int +lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_test_t *test, lstcon_rpc_t **crpc) +{ + lstcon_group_t *sgrp = test->tes_src_grp; + lstcon_group_t *dgrp = test->tes_dst_grp; + srpc_test_reqst_t *trq; + srpc_bulk_t *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_SIZE : + sizeof(struct lnet_process_id_packed) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_SIZE : min_t(int, nob, PAGE_SIZE); + nob -= len; + + bulk->bk_iovs[i].kiov_offset = 0; + bulk->bk_iovs[i].kiov_len = len; + bulk->bk_iovs[i].kiov_page = + alloc_page(GFP_KERNEL); + + if (bulk->bk_iovs[i].kiov_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT (transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((struct lst_test_ping_param *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((struct lst_test_bulk_param *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((struct lst_test_bulk_param *) + &test->tes_param[0], + trq->tsr_is_client, trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +static int +lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, + lstcon_node_t *nd, srpc_msg_t *reply) +{ + srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + spin_lock(&console_session.ses_rpc_lock); + if (!trans->tas_feats_updated) { /* recheck with lock */ + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + spin_unlock(&console_session.ses_rpc_lock); + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with " + "features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, + lstcon_node_t *nd, struct lstcon_trans_stat *stat) +{ + srpc_rmsn_reply_t *rmsn_rep; + srpc_debug_reply_t *dbg_rep; + srpc_batch_reply_t *bat_rep; + srpc_test_reply_t *test_rep; + srpc_stat_reply_t *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_active != 0) + lstcon_tsbqry_stat_run(stat, 1); + else + 
lstcon_tsbqry_stat_idle(stat, 1); + + if (bat_rep->bar_status == 0) + return; + + lstcon_tsbqry_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + test_rep = &msg->msg_body.tes_reply; + + if (test_rep->tsr_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = test_rep->tsr_status; + break; + + case LST_TRANS_STATQRY: + stat_rep = &msg->msg_body.stat_reply; + + if (stat_rep->str_status == 0) { + lstcon_statqry_stat_success(stat, 1); + return; + } + + lstcon_statqry_stat_failure(stat, 1); + rc = stat_rep->str_status; + break; + + default: + LBUG(); + } + + if (stat->trs_fwk_errno == 0) + stat->trs_fwk_errno = rc; + + return; +} + +int +lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + lstcon_rpc_t *rpc; + unsigned feats; + int rc; + + /* Creating session RPG for list of nodes */ + + rc = lstcon_rpc_trans_prep(translist, transop, &trans); + if (rc != 0) { + CERROR("Can't create transaction %d: %d\n", transop, rc); + return rc; + } + + feats = trans->tas_features; + list_for_each_entry(ndl, ndlist, ndl_link) { + rc = condition == NULL ? 1 : + condition(transop, ndl->ndl_node, arg); + + if (rc == 0) + continue; + + if (rc < 0) { + CDEBUG(D_NET, "Condition error while creating RPC " + " for transaction %d: %d\n", transop, rc); + break; + } + + nd = ndl->ndl_node; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); + break; + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + rc = lstcon_dbgrpc_prep(nd, feats, &rpc); + break; + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + rc = lstcon_testrpc_prep(nd, transop, feats, + (lstcon_test_t *)arg, &rpc); + break; + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + rc = lstcon_batrpc_prep(nd, transop, feats, + (lstcon_tsb_hdr_t *)arg, &rpc); + break; + case LST_TRANS_STATQRY: + rc = lstcon_statrpc_prep(nd, feats, &rpc); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) { + CERROR("Failed to create RPC for transaction %s: %d\n", + lstcon_rpc_trans_name(transop), rc); + break; + } + + lstcon_rpc_trans_addreq(trans, rpc); + } + + if (rc == 0) { + *transpp = trans; + return 0; + } + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static void +lstcon_rpc_pinger(void *arg) +{ + stt_timer_t *ptimer = (stt_timer_t *)arg; + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + srpc_debug_reqst_t *drq; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int intv; + int count = 0; + int rc; + + /* RPC pinger is a special case of transaction, + * it's called by timer at 8 seconds interval. 
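+ * It marks the console session expired when ses_laststamp is older than ses_timeout, ends the session on all active nodes in that case, and otherwise re-pings active nodes that have been idle for more than half of their own timeout.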
+ */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + cfs_time_current_sec() - console_session.ses_laststamp > + (time_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT(trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT(crpc->crp_trans == trans); + LASSERT(!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = cfs_duration_sec(jiffies - nd->nd_stamp); + if (intv < nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + stt_timer_t *ptimer; + int rc; + + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT (console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(struct lstcon_trans_stat)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + struct list_head *pacer; + struct list_head zlist; + + /* Called with hold of global mutex */ + + LASSERT(console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, lstcon_rpc_trans_t, + tas_link); + + CDEBUG(D_NET, 
"Session closed, wakeup transaction %s\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + wake_up(&trans->tas_waitq); + } + + mutex_unlock(&console_session.ses_mutex); + + CWARN("Session is shutting down, " + "waiting for termination of transactions\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&console_session.ses_mutex); + } + + spin_lock(&console_session.ses_rpc_lock); + + lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0), + console_session.ses_rpc_lock, + "Network is not accessable or target is down, " + "waiting for %d console RPCs to being recycled\n", + atomic_read(&console_session.ses_rpc_counter)); + + list_add(&zlist, &console_session.ses_rpc_freelist); + list_del_init(&console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + + while (!list_empty(&zlist)) { + crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); + + list_del(&crpc->crp_link); + LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); + } +} + +int +lstcon_rpc_module_init(void) +{ + INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); + console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; + console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; + + console_session.ses_ping = NULL; + + spin_lock_init(&console_session.ses_rpc_lock); + atomic_set(&console_session.ses_rpc_counter, 0); + INIT_LIST_HEAD(&console_session.ses_rpc_freelist); + + return 0; +} + +void +lstcon_rpc_module_fini(void) +{ + LASSERT(list_empty(&console_session.ses_rpc_freelist)); + LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h new file mode 100644 index 0000000000000..3ac70050b29a0 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -0,0 +1,146 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include +#include +#include +#include +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +typedef struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + srpc_client_rpc_t *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + cfs_time_t crp_stamp; /* replied time stamp */ +} lstcon_rpc_t; + +typedef struct lstcon_rpc_trans { + /* link chain on owner list */ + struct list_head tas_olink; + /* link chain on global list */ + struct list_head tas_link; + /* operation code of transaction */ + int tas_opc; + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +} lstcon_rpc_trans_t; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, + struct lstcon_rpc_ent __user *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, lstcon_rpc_t **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + lstcon_rpc_t **crpc); +void lstcon_rpc_put(lstcon_rpc_t *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp); +void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, + struct lstcon_trans_stat *stat); +int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head __user *head_up, + lstcon_rpc_readent_func_t readent); +void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); +void 
lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); +void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); +int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); +int lstcon_rpc_pinger_start(void); +void lstcon_rpc_pinger_stop(void); +void lstcon_rpc_cleanup_wait(void); +int lstcon_rpc_module_init(void); +void lstcon_rpc_module_fini(void); + + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c new file mode 100644 index 0000000000000..a9fe8a85a2dd1 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.c @@ -0,0 +1,2111 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Infrastructure of LST console + * + * Author: Liang Zhen + */ + + +#include +#include +#include "console.h" +#include "conrpc.h" + +#define LST_NODE_STATE_COUNTER(nd, p) \ +do { \ + if ((nd)->nd_state == LST_NODE_ACTIVE) \ + (p)->nle_nactive ++; \ + else if ((nd)->nd_state == LST_NODE_BUSY) \ + (p)->nle_nbusy ++; \ + else if ((nd)->nd_state == LST_NODE_DOWN) \ + (p)->nle_ndown ++; \ + else \ + (p)->nle_nunknown ++; \ + (p)->nle_nnode ++; \ +} while (0) + +lstcon_session_t console_session; + +static void +lstcon_node_get(lstcon_node_t *nd) +{ + LASSERT (nd->nd_ref >= 1); + + nd->nd_ref++; +} + +static int +lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) +{ + lstcon_ndlink_t *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + + LASSERT(id.nid != LNET_NID_ANY); + + list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], + ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + lstcon_node_get(ndl->ndl_node); + *ndpp = ndl->ndl_node; + return 0; + } + + if (!create) + return -ENOENT; + + LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + if (*ndpp == NULL) + return -ENOMEM; + + ndl = (lstcon_ndlink_t *)(*ndpp + 1); + + ndl->ndl_node = *ndpp; + + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = cfs_time_current(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); + + /* queued in global hash & list, no refcount is taken by + * global hash & list, if caller release his refcount, + * node will be released */ + list_add_tail(&ndl->ndl_hlink, 
&console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +static void +lstcon_node_put(lstcon_node_t *nd) +{ + lstcon_ndlink_t *ndl; + + LASSERT(nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (lstcon_ndlink_t *)(nd + 1); + + LASSERT(!list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, + lstcon_ndlink_t **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(lstcon_ndlink_t *ndl) +{ + LASSERT(list_empty(&ndl->ndl_link)); + LASSERT(!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + grp->grp_ref = 1; + if (name != NULL) { + if (strlen(name) > sizeof(grp->grp_name)-1) { + LIBCFS_FREE(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); + return -E2BIG; + } + strncpy(grp->grp_name, name, sizeof(grp->grp_name)); + } + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(lstcon_group_t *grp) +{ + grp->grp_ref++; +} + +static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); + +static void +lstcon_group_drain(lstcon_group_t *grp, int keep) +{ + lstcon_ndlink_t *ndl; + lstcon_ndlink_t *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(lstcon_group_t *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&grp->grp_ndl_hash[i])); + + LIBCFS_FREE(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(const char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for 
caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static int +lstcon_group_ndlink_find(lstcon_group_t *grp, struct lnet_process_id id, + lstcon_ndlink_t **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode++; + + return 0; +} + +static void +lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode--; +} + +static void +lstcon_group_ndlink_move(lstcon_group_t *old, + lstcon_group_t *new, lstcon_ndlink_t *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + list_del(&ndl->ndl_hlink); + list_del(&ndl->ndl_link); + old->grp_nnode--; + + list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode++; + + return; +} + +static void +lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) +{ + lstcon_ndlink_t *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + lstcon_ndlink_t, ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +static int +lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_group_t *grp = (lstcon_group_t *)arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +static int +lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + srpc_debug_reply_t *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(lstcon_group_t *grp, + int count, struct lnet_process_id __user *ids_up, + unsigned *featp, struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(tmp); + return rc; + } + + /* post all RPCs */ + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + 
lstcon_sesrpc_readent); + *featp = trans->tas_features; + + /* destroy all RPGs */ + lstcon_rpc_trans_destroy(trans); + + lstcon_group_move(tmp, grp); + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_group_nodes_remove(lstcon_group_t *grp, + int count, struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + struct lnet_process_id id; + int rc; + int i; + + /* End session and remove node from the group */ + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + goto error; + } + + /* move node to tmp group */ + if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0) + lstcon_group_ndlink_move(grp, tmp, ndl); + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESEND, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + goto error; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* release nodes anyway, because we can't rollback status */ + lstcon_group_decref(tmp); + + return rc; +error: + lstcon_group_move(tmp, grp); + lstcon_group_decref(tmp); + + return rc; +} + +int +lstcon_group_add(char *name) +{ + lstcon_group_t *grp; + int rc; + + rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0; + if (rc != 0) { + /* find a group with same name */ + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_group_alloc(name, &grp); + if (rc != 0) { + CERROR("Can't allocate descriptor for group %s\n", name); + return -ENOMEM; + } + + list_add_tail(&grp->grp_link, &console_session.ses_grp_list); + + return rc; +} + +int +lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, + unsigned *featp, struct list_head __user *result_up) +{ + lstcon_group_t *grp; + int rc; + + LASSERT (count > 0); + LASSERT (ids_up != NULL); + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + + return -EBUSY; + } + + rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_del(char *name) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by others threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESEND, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_decref(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + + lstcon_group_decref(grp); + /* -ref for session, it's destroyed, + * status can't be rolled back, destroy group anway */ + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_clean(char *name, int args) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = 
lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + args = (LST_NODE_ACTIVE | LST_NODE_BUSY | + LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; + + lstcon_group_drain(grp, args); + + lstcon_group_decref(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_decref(grp); + + return 0; +} + +int +lstcon_nodes_remove(char *name, int count, + struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); + + lstcon_group_decref(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_refresh(char *name, struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_decref(grp); + return -EBUSY; + } + + /* re-invite all inactive nodes int the group */ + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESNEW, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + /* local error, return */ + CDEBUG(D_NET, "Can't create transaction: %d\n", rc); + lstcon_group_decref(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* -ref for me */ + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_group_list(int index, int len, char __user *name_up) +{ + lstcon_group_t *grp; + + LASSERT(index >= 0); + LASSERT(name_up != NULL); + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (index-- == 0) { + return copy_to_user(name_up, grp->grp_name, len) ? 
+ -EFAULT : 0; + } + } + + return -ENOENT; +} + +static int +lstcon_nodes_getent(struct list_head *head, int *index_p, + int *count_p, struct lstcon_node_ent __user *dents_up) +{ + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int count = 0; + int index = 0; + + LASSERT(index_p != NULL && count_p != NULL); + LASSERT(dents_up != NULL); + LASSERT(*index_p >= 0); + LASSERT(*count_p > 0); + + list_for_each_entry(ndl, head, ndl_link) { + if (index++ < *index_p) + continue; + + if (count >= *count_p) + break; + + nd = ndl->ndl_node; + if (copy_to_user(&dents_up[count].nde_id, + &nd->nd_id, sizeof(nd->nd_id)) || + copy_to_user(&dents_up[count].nde_state, + &nd->nd_state, sizeof(nd->nd_state))) + return -EFAULT; + + count ++; + } + + if (index <= *index_p) + return -ENOENT; + + *count_p = count; + *index_p = index; + + return 0; +} + +int +lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, + int *index_p, int *count_p, + struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_ndlist_ent *gentp; + lstcon_group_t *grp; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (dents_up != NULL) { + /* verbose query */ + rc = lstcon_nodes_getent(&grp->grp_ndl_list, + index_p, count_p, dents_up); + lstcon_group_decref(grp); + + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(gentp, sizeof(struct lstcon_ndlist_ent)); + if (gentp == NULL) { + CERROR("Can't allocate ndlist_ent\n"); + lstcon_group_decref(grp); + + return -ENOMEM; + } + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); + + rc = copy_to_user(gents_p, gentp, + sizeof(struct lstcon_ndlist_ent)) ? -EFAULT : 0; + + LIBCFS_FREE(gentp, sizeof(struct lstcon_ndlist_ent)); + + lstcon_group_decref(grp); + + return 0; +} + +static int +lstcon_batch_find(const char *name, lstcon_batch_t **batpp) +{ + lstcon_batch_t *bat; + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { + *batpp = bat; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_add(char *name) +{ + lstcon_batch_t *bat; + int i; + int rc; + + rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; + if (rc != 0) { + CDEBUG(D_NET, "Batch %s already exists\n", name); + return rc; + } + + LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); + if (bat == NULL) { + CERROR("Can't allocate descriptor for batch %s\n", name); + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_srv_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + if (strlen(name) > sizeof(bat->bat_name)-1) { + LIBCFS_FREE(bat->bat_srv_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + return -E2BIG; + } + strncpy(bat->bat_name, name, sizeof(bat->bat_name)); + bat->bat_hdr.tsb_index = 0; + bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; + + bat->bat_ntest = 0; + bat->bat_state = LST_BATCH_IDLE; + + INIT_LIST_HEAD(&bat->bat_cli_list); + INIT_LIST_HEAD(&bat->bat_srv_list); + INIT_LIST_HEAD(&bat->bat_test_list); + INIT_LIST_HEAD(&bat->bat_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + INIT_LIST_HEAD(&bat->bat_cli_hash[i]); + INIT_LIST_HEAD(&bat->bat_srv_hash[i]); + } + + list_add_tail(&bat->bat_link, &console_session.ses_bat_list); + + return rc; +} + +int +lstcon_batch_list(int index, int len, char __user *name_up) +{ + lstcon_batch_t *bat; + + LASSERT(name_up != NULL); + LASSERT(index >= 0); + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (index-- == 0) { + return copy_to_user(name_up, bat->bat_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, int *ndent_p, + struct lstcon_node_ent __user *dents_up) +{ + struct lstcon_test_batch_ent *entp; + struct list_head *clilst; + struct list_head *srvlst; + lstcon_test_t *test = NULL; + lstcon_batch_t *bat; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + if (testidx > 0) { + /* query test, test index start from 1 */ + list_for_each_entry(test, &bat->bat_test_list, tes_link) { + if (testidx-- == 1) + break; + } + + if (testidx > 0) { + CDEBUG(D_NET, "Can't find specified test in batch\n"); + return -ENOENT; + } + } + + clilst = (test == NULL) ? &bat->bat_cli_list : + &test->tes_src_grp->grp_ndl_list; + srvlst = (test == NULL) ? &bat->bat_srv_list : + &test->tes_dst_grp->grp_ndl_list; + + if (dents_up != NULL) { + rc = lstcon_nodes_getent((server ? 
srvlst: clilst), + index_p, ndent_p, dents_up); + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(entp, sizeof(struct lstcon_test_batch_ent)); + if (entp == NULL) + return -ENOMEM; + + if (test == NULL) { + entp->u.tbe_batch.bae_ntest = bat->bat_ntest; + entp->u.tbe_batch.bae_state = bat->bat_state; + + } else { + + entp->u.tbe_test.tse_type = test->tes_type; + entp->u.tbe_test.tse_loop = test->tes_loop; + entp->u.tbe_test.tse_concur = test->tes_concur; + } + + list_for_each_entry(ndl, clilst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); + + list_for_each_entry(ndl, srvlst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); + + rc = copy_to_user(ent_up, entp, + sizeof(struct lstcon_test_batch_ent)) ? -EFAULT : 0; + + LIBCFS_FREE(entp, sizeof(struct lstcon_test_batch_ent)); + + return rc; +} + +static int +lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + switch (transop) { + case LST_TRANS_TSBRUN: + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + break; + + case LST_TRANS_TSBSTOP: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + break; + } + + return 1; +} + +static int +lstcon_batch_op(lstcon_batch_t *bat, int transop, + struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, + &bat->bat_trans_list, transop, + bat, lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = timeout; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); + + /* mark batch as running if it's started in any node */ + if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0) + bat->bat_state = LST_BATCH_RUNNING; + + return rc; +} + +int +lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = force; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); + + /* mark batch as stopped if all RPCs finished */ + if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0) + bat->bat_state = LST_BATCH_IDLE; + + return rc; +} + +static void +lstcon_batch_destroy(lstcon_batch_t *bat) +{ + lstcon_ndlink_t *ndl; + lstcon_test_t *test; + int i; + + list_del(&bat->bat_link); + + while (!list_empty(&bat->bat_test_list)) { + test = list_entry(bat->bat_test_list.next, + lstcon_test_t, tes_link); + LASSERT(list_empty(&test->tes_trans_list)); + + list_del(&test->tes_link); + + lstcon_group_decref(test->tes_src_grp); + lstcon_group_decref(test->tes_dst_grp); + + LIBCFS_FREE(test, offsetof(lstcon_test_t, + tes_param[test->tes_paramlen])); + } + + LASSERT(list_empty(&bat->bat_trans_list)); + + while (!list_empty(&bat->bat_cli_list)) { + ndl = list_entry(bat->bat_cli_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + while (!list_empty(&bat->bat_srv_list)) { 
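+ /* drain the server-side node links the same way as the client list above */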
+ ndl = list_entry(bat->bat_srv_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT(list_empty(&bat->bat_cli_hash[i])); + LASSERT(list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); +} + +static int +lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_test_t *test; + lstcon_batch_t *batch; + lstcon_ndlink_t *ndl; + struct list_head *hash; + struct list_head *head; + + test = (lstcon_test_t *)arg; + LASSERT(test != NULL); + + batch = test->tes_batch; + LASSERT(batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT (transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT (nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(lstcon_test_t *test, struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int transop; + int rc; + + LASSERT (test->tes_src_grp != NULL); + LASSERT (test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, " + "RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? 
"client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +static int +lstcon_verify_batch(const char *name, lstcon_batch_t **batch) +{ + int rc; + + rc = lstcon_batch_find(name, batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if ((*batch)->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return -EINVAL; + } + + return 0; +} + +static int +lstcon_verify_group(const char *name, lstcon_group_t **grp) +{ + int rc; + lstcon_ndlink_t *ndl; + + rc = lstcon_group_find(name, grp); + if (rc != 0) { + CDEBUG(D_NET, "can't find group %s\n", name); + return rc; + } + + list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { + if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) { + return 0; + } + } + + CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); + + return -EINVAL; +} + +int +lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up) +{ + lstcon_test_t *test = NULL; + int rc; + lstcon_group_t *src_grp = NULL; + lstcon_group_t *dst_grp = NULL; + lstcon_batch_t *batch = NULL; + + /* + * verify that a batch of the given name exists, and the groups + * that will be part of the batch exist and have at least one + * active node + */ + rc = lstcon_verify_batch(batch_name, &batch); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(src_name, &src_grp); + if (rc != 0) + goto out; + + rc = lstcon_verify_group(dst_name, &dst_grp); + if (rc != 0) + goto out; + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, + batch_name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_decref(dst_grp); + + if (src_grp != NULL) + lstcon_group_decref(src_grp); + + return rc; +} + +static int +lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) +{ + lstcon_test_t *test; + + list_for_each_entry(test, &batch->bat_test_list, 
tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +static int +lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + LASSERT (transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + struct list_head *translist; + struct list_head *ndlist; + lstcon_tsb_hdr_t *hdr; + lstcon_batch_t *batch; + lstcon_test_t *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +static int +lstcon_statrpc_readent(int transop, srpc_msg_t *msg, + struct lstcon_rpc_ent __user *ent_up) +{ + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; + struct srpc_counters __user *srpc_stat; + struct lnet_counters __user *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; + srpc_stat = (struct srpc_counters __user *) + ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (struct lnet_counters __user *) + ((char __user *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +static int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head __user *result_up) +{ + struct list_head head; + lstcon_rpc_trans_t *trans; + int rc; + + INIT_LIST_HEAD(&head); + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, + struct list_head 
__user *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up) +{ + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + struct lnet_process_id id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_decref(tmp); + + return rc; +} + +static int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head __user *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head __user *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up) +{ + lstcon_batch_t *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, + int count, struct lnet_process_id __user *ids_up, + struct list_head __user *result_up) +{ + struct lnet_process_id id; + lstcon_ndlink_t *ndl; + lstcon_group_t *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_decref(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_decref(grp); + + return rc; +} + +int +lstcon_session_match(struct lst_sid sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 
1: 0; +} + +static void +lstcon_new_session_id(struct lst_sid *sid) +{ + struct lnet_process_id id; + + LASSERT (console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = cfs_time_current(); +} + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, struct lst_sid __user *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? + LST_CONSOLE_TIMEOUT : timeout; + + if (strlen(name) > sizeof(console_session.ses_name)-1) + return -E2BIG; + strlcpy(console_session.ses_name, name, + sizeof(console_session.ses_name)); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + lstcon_batch_t *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, + unsigned __user *featp, + struct lstcon_ndlist_ent __user *ndinfo_up, + char __user *name_up, int len) +{ + struct lstcon_ndlist_ent *entp; + lstcon_ndlink_t *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(struct lst_sid)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end() +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + lstcon_batch_t *bat; + int rc = 0; + + LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + 
console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while (!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + lstcon_batch_t, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + lstcon_group_t, grp_link); + LASSERT(grp->grp_ref == 1); + + lstcon_group_decref(grp); + } + + /* all nodes should be released */ + LASSERT(list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + spin_unlock(&console_session.ses_rpc_lock); + + if (rc != 0) { + CERROR("remote features %x do not match with " + "session features %x of console\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle (srpc_server_rpc_t *rpc) +{ + srpc_msg_t *rep = &rpc->srpc_replymsg; + srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; + srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; + srpc_join_reply_t *jrep = &rep->msg_body.join_reply; + lstcon_group_t *grp = NULL; + lstcon_ndlink_t *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* Group in using */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strlcpy(jrep->join_session, console_session.ses_name, + sizeof(jrep->join_session)); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_decref(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +static srpc_service_t lstcon_acceptor_service; +static void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + 
lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); + +DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + memset(&console_session, 0, sizeof(lstcon_session_t)); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = cfs_time_current_sec(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + LIBCFS_ALLOC(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); + + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT(rc != -EBUSY); + if (rc != 0) { + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = libcfs_register_ioctl(&lstcon_ioctl_handler); + + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + libcfs_deregister_ioctl(&lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT(list_empty(&console_session.ses_ndl_list)); + LASSERT(list_empty(&console_session.ses_grp_list)); + LASSERT(list_empty(&console_session.ses_bat_list)); + LASSERT(list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} + diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h new file mode 100644 index 0000000000000..0d597c45cb469 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -0,0 +1,257 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + + +#include +#include +#include +#include +#include "selftest.h" +#include "conrpc.h" + +typedef struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + cfs_time_t nd_stamp; /* timestamp of last replied RPC */ + struct lstcon_rpc nd_ping; /* ping rpc */ +} lstcon_node_t; /*** node descriptor */ + +typedef struct { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + lstcon_node_t *ndl_node; /* pointer to node */ +} lstcon_ndlink_t; /*** node link descriptor */ + +typedef struct { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +} lstcon_group_t; /*** (alias of nodes) group descriptor */ + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +typedef struct lstcon_tsb_hdr { + struct lst_bid tsb_id; /* batch ID */ + int tsb_index; /* test index */ +} lstcon_tsb_hdr_t; + +typedef struct { + /* test_batch header */ + lstcon_tsb_hdr_t bat_hdr; + /* chain on session's batches list */ + struct list_head bat_link; + /* # of test */ + int bat_ntest; + /* state of the batch */ + int bat_state; + /* parameter for run|stop, timeout for run, force for stop */ + int bat_arg; + /* name of batch */ + char bat_name[LST_NAME_SIZE]; + + /* list head of tests (lstcon_test_t) */ + struct list_head bat_test_list; + /* list head of transaction */ + struct list_head bat_trans_list; + /* list head of client nodes (lstcon_node_t) */ + struct list_head bat_cli_list; + /* hash table of client nodes */ + struct list_head *bat_cli_hash; + /* list head of server nodes */ + struct list_head bat_srv_list; + /* hash table of server nodes */ + struct list_head *bat_srv_hash; +} lstcon_batch_t; /*** (tests ) batch descritptor */ + +typedef struct lstcon_test { + /* test batch header */ + lstcon_tsb_hdr_t tes_hdr; + /* chain on batch's tests list */ + struct list_head tes_link; + /* pointer 
to batch */ + lstcon_batch_t *tes_batch; + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + lstcon_group_t *tes_src_grp; /* group run the test */ + lstcon_group_t *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +} lstcon_test_t; /*** a single test descriptor */ + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +typedef struct { + struct mutex ses_mutex; /* only 1 thread in session */ + struct lst_sid ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + lstcon_rpc_trans_t *ses_ping; /* session pinger */ + stt_timer_t ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat; /* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist;/* idle console rpc */ +} lstcon_session_t; /*** session descriptor */ + +extern lstcon_session_t console_session; + +static inline struct lstcon_trans_stat * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash(struct lnet_process_id id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +extern int lstcon_session_match(struct lst_sid sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, struct lst_sid __user *sid_up); +extern int lstcon_session_info(struct lst_sid __user *sid_up, int __user *key, + unsigned __user *verp, + struct lstcon_ndlist_ent __user *entp, + char __user *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, + struct list_head __user *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int timeout, char *name, + int client, struct list_head __user *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head __user *result_up); 
+extern int lstcon_nodes_debug(int timeout, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head __user *result_up); +extern int lstcon_nodes_add(char *name, int nnd, + struct lnet_process_id __user *nds_up, + unsigned *featp, + struct list_head __user *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, + struct lnet_process_id __user *nds_up, + struct list_head __user *result_up); +extern int lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gent_up, + int *index_p, int *ndent_p, + struct lstcon_node_ent __user *ndents_up); +extern int lstcon_group_list(int idx, int len, char __user *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_stop(char *name, int force, + struct list_head __user *result_up); +extern int lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head __user *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char __user *name_up); +extern int lstcon_batch_info(char *name, + struct lstcon_test_batch_ent __user *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, + struct lstcon_node_ent __user *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head __user *result_up); +extern int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, + int timeout, struct list_head __user *result_up); +extern int lstcon_test_add(char *batch_name, int type, int loop, + int concur, int dist, int span, + char *src_name, char *dst_name, + void *param, int paramlen, int *retp, + struct list_head __user *result_up); + +int lstcon_console_init(void); +int lstcon_console_fini(void); + +#endif diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c new file mode 100644 index 0000000000000..b5d430dde00d1 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -0,0 +1,1809 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +struct lst_sid LST_INVALID_SID = { .ses_nid = LNET_NID_ANY, .ses_stamp = -1}; + +static int session_timeout = 100; +module_param(session_timeout, int, 0444); +MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +module_param(rpc_timeout, int, 0644); +MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).errors); \ + __swab32s(&(lc).msgs_max); \ + __swab32s(&(lc).msgs_alloc); \ + __swab32s(&(lc).send_count); \ + __swab32s(&(lc).recv_count); \ + __swab32s(&(lc).drop_count); \ + __swab32s(&(lc).route_count); \ + __swab64s(&(lc).send_length); \ + __swab64s(&(lc).recv_length); \ + __swab64s(&(lc).drop_length); \ + __swab64s(&(lc).route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +static struct smoketest_framework { + /* RPCs to be recycled */ + struct list_head fw_zombie_rpcs; + /* stopping sessions */ + struct list_head fw_zombie_sessions; + /* registered test cases */ + struct list_head fw_tests; + /* # zombie sessions */ + atomic_t fw_nzombies; + /* serialise */ + spinlock_t fw_lock; + /* _the_ session */ + sfw_session_t *fw_session; + /* shutdown in progress */ + int fw_shuttingdown; + /* running RPC */ + srpc_server_rpc_t *fw_active_srpc; +} sfw_data; + +/* forward ref's */ +int sfw_stop_batch (sfw_batch_t *tsb, int force); +void sfw_destroy_session (sfw_session_t *sn); + +static inline sfw_test_case_t * +sfw_find_test_case(int id) +{ + sfw_test_case_t *tsc; + + LASSERT(id <= SRPC_SERVICE_MAX_ID); + LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) +{ + sfw_test_case_t *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR ("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); + if (tsc == NULL) + return -ENOMEM; + + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +static void +sfw_add_session_timer (void) +{ + sfw_session_t *sn = sfw_data.fw_session; + stt_timer_t *timer = &sn->sn_timer; + + LASSERT (!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT 
(!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = cfs_time_add(sn->sn_timeout, + cfs_time_current_sec()); + stt_add_timer(timer); + return; +} + +static int +sfw_del_session_timer (void) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT (sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +/* called with sfw_data.fw_lock held */ +static void +sfw_deactivate_session (void) +__must_hold(&sfw_data.fw_lock) +{ + sfw_session_t *sn = sfw_data.fw_session; + int nactive = 0; + sfw_batch_t *tsb; + sfw_test_case_t *tsc; + + if (sn == NULL) return; + + LASSERT(!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +static void +sfw_session_expired (void *data) +{ + sfw_session_t *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (sn->sn_timer_active); + LASSERT (sn == sfw_data.fw_session); + + CWARN ("Session expired! sid: %s-%llu, name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(sfw_session_t *sn, struct lst_sid sid, + unsigned features, const char *name) +{ + stt_timer_t *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(sfw_session_t)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = cfs_time_current(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +static void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG (D_NET, + "Incoming framework RPC done: " + "service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); + return; +} + +static void +sfw_client_rpc_fini (srpc_client_rpc_t *rpc) +{ + LASSERT(rpc->crpc_bulk.bk_niov == 0); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG(D_NET, "Outgoing framework RPC done: " + "service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before 
shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + +static sfw_batch_t * +sfw_find_batch(struct lst_bid bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT(sn != NULL); + + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +static sfw_batch_t * +sfw_bid2batch(struct lst_bid bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT (sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +static int +sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + struct sfw_counters *cnt = &reply->str_fw; + sfw_batch_t *bat; + + reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cnt->running_ms = jiffies_to_msecs(jiffies - sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry(bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_msg_t *msg = container_of(request, srpc_msg_t, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. 
*/ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); + if (sn == NULL) { + CERROR("dropping RPC mksn under memory pressure\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +static int +sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +static int +sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +static void +sfw_test_rpc_fini (srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT(list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(sfw_test_instance_t *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + LASSERT(svc != NULL); + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +static int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name); + return 0; +} + +static void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... */ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); + return; +} + +static void +sfw_destroy_test_instance (sfw_test_instance_t *tsi) +{ + srpc_client_rpc_t *rpc; + sfw_test_unit_t *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT(!tsi->tsi_stopping); + LASSERT(list_empty(&tsi->tsi_active_rpcs)); + LASSERT(!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + sfw_test_unit_t, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); + return; +} + +static void +sfw_destroy_batch (sfw_batch_t *tsb) +{ + sfw_test_instance_t *tsi; + + LASSERT(!sfw_batch_active(tsb)); + LASSERT(list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + sfw_test_instance_t, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); + return; +} + +void +sfw_destroy_session (sfw_session_t *sn) +{ + sfw_batch_t *batch; + + LASSERT(list_empty(&sn->sn_list)); + LASSERT(sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + sfw_batch_t, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); + return; +} + +static void +sfw_unpack_addtest_req(srpc_msg_t *msg) +{ + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT (req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + test_ping_req_t *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG(); + return; +} + +static int +sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) +{ + srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + srpc_bulk_t *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + int i; + int rc; + + LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR ("Can't allocate test instance for batch: %llu\n", + 
tsb->bat_id.bat_id); + return -ENOMEM; + } + + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT (!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT (bk != NULL); + LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(struct lnet_process_id_packed) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + struct lnet_process_id_packed *dests; + struct lnet_process_id_packed id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page); + LASSERT (dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR ("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT(rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done (sfw_test_unit_t *tsu) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_batch_t *tsb = tsi->tsi_batch; + sfw_session_t *sn = tsb->bat_session; + + LASSERT (sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
*/ + + list_for_each_entry(tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + return; +} + +static void +sfw_test_rpc_done (srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT(sfw_test_active(tsi)); + LASSERT(!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); + return; +} + +int +sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, + unsigned features, int nblk, int blklen, + srpc_client_rpc_t **rpcpp) +{ + srpc_client_rpc_t *rpc = NULL; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT (sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + LASSERT(nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +static int +sfw_run_test (swi_workitem_t *wi) +{ + sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; + sfw_test_instance_t *tsi = tsu->tsu_instance; + srpc_client_rpc_t *rpc = NULL; + + LASSERT (wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT (rpc == NULL); + goto test_done; + } + + LASSERT (rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + spin_lock(&rpc->crpc_lock); + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +static int +sfw_run_batch (sfw_batch_t *tsb) +{ + swi_workitem_t *wi; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: %llu (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT(!tsi->tsi_stopping); + LASSERT(!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, tsu, sfw_run_test, + lst_sched_test[\ + lnet_cpt_of_nid(tsu->tsu_dest.nid, + NULL)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +int +sfw_stop_batch (sfw_batch_t *tsb, int force) +{ + sfw_test_instance_t *tsi; + srpc_client_rpc_t *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +static int +sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) +{ + sfw_test_instance_t *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages (srpc_server_rpc_t *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, 0, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +static int +sfw_add_test (srpc_server_rpc_t *rpc) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + srpc_test_reqst_t *request; + int rc; + sfw_batch_t *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR("dropping RPC %s from %s under memory pressure\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_SIZE; + + } else { + len = sizeof(struct lnet_process_id_packed) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG (rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +static int +sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + int rc = 0; + sfw_batch_t *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? 
-rc : rc; + return 0; +} + +static int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reply = &rpc->srpc_replymsg; + srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + sfw_session_t *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match " + "features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch(sv->sv_id) { + default: + LBUG (); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +static int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: " + "service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + if 
(sfw_del_session_timer() != 0) { + CERROR("dropping RPC %s from %s: racing with expiry timer\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +srpc_client_rpc_t * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (!sfw_data.fw_shuttingdown); + LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message (srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + 
__swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + srpc_test_reply_t *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + srpc_join_reqst_t *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + srpc_join_reply_t *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG (); + return; +} + +void +sfw_abort_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); + return; +} + +void +sfw_post_rpc (srpc_client_rpc_t *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT(!rpc->crpc_closed); + LASSERT(!rpc->crpc_aborted); + LASSERT(list_empty(&rpc->crpc_list)); + LASSERT(!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); + return; +} + +static srpc_service_t sfw_services[] = +{ + { + /* sv_id */ SRPC_SERVICE_DEBUG, + /* sv_name */ "debug", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_QUERY_STAT, + /* sv_name */ "query stats", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_MAKE_SESSION, + /* sv_name */ "make session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, + /* sv_name */ "remove session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_BATCH, + /* sv_name */ "batch service", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_TEST, + /* sv_name */ "test service", + 0 + }, + { + /* sv_id */ 0, + /* sv_name */ NULL, + 0 + } +}; + +int +sfw_startup (void) +{ + int i; + int rc; + int error; + srpc_service_t *sv; + sfw_test_case_t *tsc; + + + if (session_timeout < 0) { + CERROR ("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR ("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN ("Zero session_timeout specified " + "- test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN ("Zero rpc_timeout specified " + "- test RPC never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT (rc == 0); + + ping_init_test_client(); + 
ping_init_test_service(); + rc = sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT (rc == 0); + + error = 0; + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT(rc != -EBUSY); + if (rc != 0) { + CWARN("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT (rc != -EBUSY); + if (rc != 0) { + CWARN ("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown (void) +{ + srpc_service_t *sv; + sfw_test_case_t *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + srpc_client_rpc_t *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + sfw_test_case_t, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } + + return; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c new file mode 100644 index 0000000000000..2f87742142d4a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -0,0 +1,165 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" +#include "console.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +static void +lnet_selftest_exit(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + case LST_INIT_FW: + sfw_shutdown(); + case LST_INIT_RPC: + srpc_shutdown(); + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + case LST_INIT_NONE: + break; + default: + LBUG(); + } + return; +} + +void +lnet_selftest_structure_assertion(void) +{ + CLASSERT(sizeof(srpc_msg_t) == 160); + CLASSERT(sizeof(srpc_test_reqst_t) == 70); + CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72); + CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78); + CLASSERT(sizeof(srpc_stat_reply_t) == 136); + CLASSERT(sizeof(srpc_stat_reqst_t) == 28); +} + +static int __init +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds); + if (lst_sched_test == NULL) + goto error; + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPU partition affinity WI " + "scheduler %d for LST\n", i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + return 0; +error: + lnet_selftest_exit(); + return rc; +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_VERSION("2.8.0"); +MODULE_LICENSE("GPL"); + +module_init(lnet_selftest_init); +module_exit(lnet_selftest_exit); diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c new file mode 100644 index 0000000000000..ea2076103c756 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +static int ping_srv_workitems = SFW_TEST_WI_MAX; +module_param(ping_srv_workitems, int, 0644); +MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); + +typedef struct { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +} lst_ping_data_t; + +static lst_ping_data_t lst_ping_data; + +static int +ping_client_init(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini (sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT (sn != NULL); + LASSERT (tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN ("%d pings have failed.\n", errors); + else + CDEBUG (D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(sfw_test_unit_t *tsu, + struct lnet_process_id dest, srpc_client_rpc_t **rpc) +{ + srpc_ping_reqst_t *req; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct timespec64 ts; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + ktime_get_real_ts64(&ts); + req->pnr_time_sec = ts.tv_sec; + req->pnr_time_nsec = ts.tv_nsec; + + return rc; +} + +static void +ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) 
+{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timespec64 ts; + + LASSERT(sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR ("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + ktime_get_real_ts64(&ts); + CDEBUG(D_NET, "%d reply in %llu nsec\n", reply->pnr_seq, + (u64)((ts.tv_sec - reqst->pnr_time_sec) * NSEC_PER_SEC + + (ts.tv_nsec - reqst->pnr_time_nsec))); + return; +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; + srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT (sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_nsec); + } + LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR ("Unexpect magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Get ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +sfw_test_client_ops_t ping_test_client; +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +srpc_service_t ping_test_service; +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c new file mode 100644 index 0000000000000..abed28104aa69 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -0,0 +1,1668 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +typedef enum { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +} srpc_state_t; + +static struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ + srpc_state_t rpc_state; + struct srpc_counters rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? 
+ SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +int srpc_handle_rpc(swi_workitem_t *wi); + +void srpc_get_counters(struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters(const struct srpc_counters *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +static int +srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int off, int nob) +{ + LASSERT(off < PAGE_SIZE); + LASSERT(nob > 0 && nob <= PAGE_SIZE); + + bk->bk_iovs[i].kiov_offset = off; + bk->bk_iovs[i].kiov_page = pg; + bk->bk_iovs[i].kiov_len = nob; + return nob; +} + +void +srpc_free_bulk (srpc_bulk_t *bk) +{ + int i; + struct page *pg; + + LASSERT (bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); + return; +} + +srpc_bulk_t * +srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, + unsigned bulk_len, int sink) +{ + srpc_bulk_t *bk; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = min_t(unsigned, bulk_off + bulk_len, PAGE_SIZE) - + bulk_off; + + srpc_add_bulk_page(bk, pg, i, bulk_off, nob); + bulk_len -= nob; + bulk_off = 0; + } + + return bk; +} + +static inline __u64 +srpc_next_id (void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +static void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? 
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateMDHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
+ 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, scd, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + + failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service (srpc_service_t *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +static int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, struct lnet_process_id peer, + struct lnet_handle_md *mdh, srpc_event_t *ev) +{ + int rc; + struct lnet_md md; + struct lnet_handle_me meh; + + rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); + if (rc != 0) { + CERROR ("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR ("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT (rc == 0); + return -ENOMEM; + } + + CDEBUG (D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; +} + +static int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, struct lnet_process_id peer, lnet_nid_t self, + struct lnet_handle_md *mdh, srpc_event_t *ev) +{ + int rc; + struct lnet_md md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 
2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT ((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); + } + + if (rc != 0) { + CERROR ("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. + */ + rc = LNetMDUnlink(*mdh); + LASSERT (rc == 0); + } else { + CDEBUG (D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; +} + +static int +srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, + struct lnet_handle_md *mdh, srpc_event_t *ev) +{ + struct lnet_process_id any = {0}; + + any.nid = LNET_NID_ANY; + any.pid = LNET_PID_ANY; + + return srpc_post_passive_rdma(srpc_serv_portal(service), + local, service, buf, len, + LNET_MD_OP_PUT, any, mdh, ev); +} + +static int +srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) +__must_hold(&scd->scd_lock) +{ + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; + + LNetInvalidateMDHandle(&buf->buf_mdh); + list_add(&buf->buf_list, &scd->scd_buf_posted); + scd->scd_buf_nposted++; + spin_unlock(&scd->scd_lock); + + rc = srpc_post_passive_rqtbuf(sv->sv_id, + !srpc_serv_is_framework(sv), + msg, sizeof(*msg), &buf->buf_mdh, + &scd->scd_ev); + + /* At this point, a RPC (new or delayed) may have arrived in + * msg and its event handler has been called. 
So we must add + * buf to scd_buf_posted _before_ dropping scd_lock */ + + spin_lock(&scd->scd_lock); + + if (rc == 0) { + if (!sv->sv_shuttingdown) + return 0; + + spin_unlock(&scd->scd_lock); + /* srpc_shutdown_service might have tried to unlink me + * when my buf_mdh was still invalid */ + LNetMDUnlink(buf->buf_mdh); + spin_lock(&scd->scd_lock); + return 0; + } + + scd->scd_buf_nposted--; + if (sv->sv_shuttingdown) + return rc; /* don't allow to change scd_buf_posted */ + + list_del(&buf->buf_list); + spin_unlock(&scd->scd_lock); + + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + return rc; +} + +int +srpc_add_buffer(struct swi_workitem *wi) +{ + struct srpc_service_cd *scd = wi->swi_workitem.wi_data; + struct srpc_buffer *buf; + int rc = 0; + + /* it's called by workitem scheduler threads, these threads + * should have been set CPT affinity, so buffers will be posted + * on CPT local list of Portal */ + spin_lock(&scd->scd_lock); + + while (scd->scd_buf_adjust > 0 && + !scd->scd_svc->sv_shuttingdown) { + scd->scd_buf_adjust--; /* consume it */ + scd->scd_buf_posting++; + + spin_unlock(&scd->scd_lock); + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Failed to add new buf to service: %s\n", + scd->scd_svc->sv_name); + spin_lock(&scd->scd_lock); + rc = -ENOMEM; + break; + } + + spin_lock(&scd->scd_lock); + if (scd->scd_svc->sv_shuttingdown) { + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + rc = -ESHUTDOWN; + break; + } + + rc = srpc_service_post_buffer(scd, buf); + if (rc != 0) + break; /* buf has been freed inside */ + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + scd->scd_buf_total++; + scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4); + } + + if (rc != 0) { + scd->scd_buf_err_stamp = cfs_time_current_sec(); + scd->scd_buf_err = rc; + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + } + + spin_unlock(&scd->scd_lock); + return 0; +} + +int +srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int rc = 0; + int i; + + LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + scd->scd_buf_err = 0; + scd->scd_buf_err_stamp = 0; + scd->scd_buf_posting = 0; + scd->scd_buf_adjust = nbuffer; + /* start to post buffers */ + swi_schedule_workitem(&scd->scd_buf_wi); + spin_unlock(&scd->scd_lock); + + /* framework service only post buffer for one partition */ + if (srpc_serv_is_framework(sv)) + break; + } + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + /* + * NB: srpc_service_add_buffers() can be called inside + * thread context of lst_sched_serial, and we don't normally + * allow to sleep inside thread context of WI scheduler + * because it will block current scheduler thread from doing + * anything else, even worse, it could deadlock if it's + * waiting on result from another WI of the same scheduler. + * However, it's safe at here because scd_buf_wi is scheduled + * by thread in a different WI scheduler (lst_sched_test), + * so we don't have any risk of deadlock, though this could + * block all WIs pending on lst_sched_serial for a moment + * which is not good but not fatal. 
+ */ + lst_wait_until(scd->scd_buf_err != 0 || + (scd->scd_buf_adjust == 0 && + scd->scd_buf_posting == 0), + scd->scd_lock, "waiting for adding buffer\n"); + + if (scd->scd_buf_err != 0 && rc == 0) + rc = scd->scd_buf_err; + + spin_unlock(&scd->scd_lock); + } + + return rc; +} + +void +srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int num; + int i; + + LASSERT(!sv->sv_shuttingdown); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + num = scd->scd_buf_total + scd->scd_buf_posting; + scd->scd_buf_adjust -= min(nbuffer, num); + + spin_unlock(&scd->scd_lock); + } +} + +/* returns 1 if sv has finished, otherwise 0 */ +int +srpc_finish_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + if (!swi_deschedule_workitem(&scd->scd_buf_wi)) { + spin_unlock(&scd->scd_lock); + return 0; + } + + if (scd->scd_buf_nposted > 0) { + CDEBUG(D_NET, "waiting for %d posted buffers to " + "unlink\n", scd->scd_buf_nposted); + spin_unlock(&scd->scd_lock); + return 0; + } + + if (list_empty(&scd->scd_rpc_active)) { + spin_unlock(&scd->scd_lock); + continue; + } + + rpc = list_entry(scd->scd_rpc_active.next, + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, " + "wi %s scheduled %d running %d, " + "ev fired %d type %d status %d lnet %d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + rpc->srpc_wi.swi_workitem.wi_scheduled, + rpc->srpc_wi.swi_workitem.wi_running, + rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, + rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); + spin_unlock(&scd->scd_lock); + return 0; + } + + /* no lock needed from now on */ + srpc_service_fini(sv); + return 1; +} + +/* called with sv->sv_lock held */ +static void +srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) +__must_hold(&scd->scd_lock) +{ + if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { + if (srpc_service_post_buffer(scd, buf) != 0) { + CWARN("Failed to post %s buffer\n", + scd->scd_svc->sv_name); + } + return; + } + + /* service is shutting down, or we want to recycle some buffers */ + scd->scd_buf_total--; + + if (scd->scd_buf_adjust < 0) { + scd->scd_buf_adjust++; + if (scd->scd_buf_adjust < 0 && + scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) { + CDEBUG(D_INFO, + "Try to recyle %d buffers but nothing left\n", + scd->scd_buf_adjust); + scd->scd_buf_adjust = 0; + } + } + + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + spin_lock(&scd->scd_lock); +} + +void +srpc_abort_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + CDEBUG(D_NET, "Aborting service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the abort, NB: + * racing with incoming RPCs; complete fix should make test + * RPCs carry session ID in its headers */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { + rpc->srpc_aborted = 1; + swi_schedule_workitem(&rpc->srpc_wi); + } + + spin_unlock(&scd->scd_lock); + } +} + +void +srpc_shutdown_service(srpc_service_t *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + srpc_buffer_t *buf; + 
int i; + + CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +static int +srpc_send_request (srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), + rpc->crpc_service, &rpc->crpc_reqstmsg, + sizeof(srpc_msg_t), LNET_MD_OP_PUT, + rpc->crpc_dest, LNET_NID_ANY, + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_reply (srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_replyev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, sizeof(srpc_msg_t), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_prepare_bulk (srpc_client_rpc_t *rpc) +{ + srpc_bulk_t *bk = &rpc->crpc_bulk; + srpc_event_t *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT (bk->bk_niov <= LNET_MAX_IOV); + + if (bk->bk_niov == 0) return 0; /* nothing to do */ + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +static int +srpc_do_bulk (srpc_server_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + srpc_bulk_t *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT (bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? 
SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* only called from srpc_handle_rpc */ +static void +srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) +{ + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_buffer_t *buffer; + + LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + + rpc->srpc_status = status; + + CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); + + if (status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_dropped++; + spin_unlock(&srpc_data.rpc_glock); + } + + if (rpc->srpc_done != NULL) + (*rpc->srpc_done) (rpc); + LASSERT(rpc->srpc_bulk == NULL); + + spin_lock(&scd->scd_lock); + + if (rpc->srpc_reqstbuf != NULL) { + /* NB might drop sv_lock in srpc_service_recycle_buffer, but + * sv won't go away for scd_rpc_active must not be empty */ + srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); + rpc->srpc_reqstbuf = NULL; + } + + list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ + + /* + * No one can schedule me now since: + * - I'm not on scd_rpc_active. + * - all LNet events have been fired. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(rpc->srpc_ev.ev_fired); + swi_exit_workitem(&rpc->srpc_wi); + + if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { + buffer = list_entry(scd->scd_buf_blocked.next, + srpc_buffer_t, buf_list); + list_del(&buffer->buf_list); + + srpc_init_server_rpc(rpc, scd, buffer); + list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); + swi_schedule_workitem(&rpc->srpc_wi); + } else { + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + + spin_unlock(&scd->scd_lock); + return; +} + +/* handles an incoming RPC */ +int +srpc_handle_rpc(swi_workitem_t *wi) +{ + struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_event_t *ev = &rpc->srpc_ev; + int rc = 0; + + LASSERT(wi == &rpc->srpc_wi); + + spin_lock(&scd->scd_lock); + + if (sv->sv_shuttingdown || rpc->srpc_aborted) { + spin_unlock(&scd->scd_lock); + + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); + + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } + + spin_unlock(&scd->scd_lock); + + switch (wi->swi_state) { + default: + LBUG (); + case SWI_STATE_NEWBORN: { + srpc_msg_t *msg; + srpc_generic_reply_t *reply; + + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; + + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ + srpc_server_rpc_done(rpc, EBADMSG); + return 1; + } + + srpc_unpack_msg_hdr(msg); + if (msg->msg_version != SRPC_MSG_VERSION) { + CWARN("Version mismatch: %u, %u expected, from %s\n", + msg->msg_version, SRPC_MSG_VERSION, + libcfs_id2str(rpc->srpc_peer)); + reply->status = EPROTO; + /* drop through and send reply */ + } else { + reply->status = 0; + rc = (*sv->sv_handler)(rpc); + LASSERT(reply->status == 0 || !rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + 
return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT (ev->ev_fired); + ev->ev_status = rc; + } + } + case SWI_STATE_BULK_STARTED: + LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT (ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +static void +srpc_client_rpc_expired (void *data) +{ + srpc_client_rpc_t *rpc = data; + + CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +static void +srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) +{ + stt_timer_t *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) + return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = cfs_time_add(rpc->crpc_timeout, + cfs_time_current_sec()); + stt_add_timer(timer); + return; +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. */ +static void +srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer successfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +static void +srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) +{ + swi_workitem_t *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. 
+ * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT (!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); + return; +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc (swi_workitem_t *wi) +{ + int rc = 0; + srpc_client_rpc_t *rpc; + srpc_msg_t *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = wi->swi_workitem.wi_data; + + LASSERT (rpc != NULL); + LASSERT (wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG (); + case SWI_STATE_NEWBORN: + LASSERT (!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. */ + if (!rpc->crpc_reqstev.ev_fired) break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + /* perhaps more events, fall thru */ + case SWI_STATE_REQUEST_SENT: { + srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN ("Bad message from %s: type %u (%d expected)," + " magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN ("Remote error %d at %s, unlink bulk buffer in " + "case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
*/ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +srpc_client_rpc_t * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc; + + LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc (srpc_client_rpc_t *rpc, int why) +{ + LASSERT (why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG (D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT (!rpc->crpc_aborted); + LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +static void +srpc_lnet_ev_handler(struct lnet_event *ev) +{ + struct srpc_service_cd *scd; + srpc_event_t *rpcev = ev->md.user_ptr; + srpc_client_rpc_t *crpc; + srpc_server_rpc_t *srpc; + srpc_buffer_t *buffer; + srpc_service_t *sv; + srpc_msg_t *msg; + srpc_msg_type_t type; + + LASSERT 
(!in_interrupt()); + + if (ev->status != 0) { + __u32 errors; + + spin_lock(&srpc_data.rpc_glock); + if (ev->status != -ECANCELED) /* cancellation is not error */ + srpc_data.rpc_counters.errors++; + errors = srpc_data.rpc_counters.errors; + spin_unlock(&srpc_data.rpc_glock); + + CNETERR("LNet event status %d type %d, RPC errors %u\n", + ev->status, ev->type, errors); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG (); + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG (); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT (ev->unlinked); + LASSERT (ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); + buffer->buf_peer = ev->source; + buffer->buf_self = ev->target.nid; + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < cfs_time_current_sec()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR ("Dropping RPC (%s) from %s: " + "status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup (void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LNET_PID_LUSTRE); + if (rc < 0) { + CERROR ("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + LNetInvalidateEQHandle(&srpc_data.rpc_lnet_eq); + rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + +bail: + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown (void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG (); + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + srpc_service_t *sv = srpc_data.rpc_services[i]; + + LASSERTF (sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT (rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT (rc == 0); /* the EQ should have no user by now */ + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } + + return; +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h b/drivers/staging/lustrefsx/lnet/selftest/rpc.h new file mode 100644 index 0000000000000..aab2629e7ba1d --- /dev/null 
+++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -0,0 +1,297 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +typedef enum { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +} srpc_msg_type_t; + +/* CAVEAT EMPTOR: + * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field + * session id if needed. 
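+ *
+ * For example, srpc_generic_reqst_t/srpc_generic_reply_t below follow
+ * exactly this layout, so the reply/bulk matchbits (or the status and
+ * session id) of any concrete message can be read through the generic
+ * members of the msg_body union without knowing the message type.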
+ */ +typedef struct { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} WIRE_ATTR srpc_generic_reqst_t; + +typedef struct { + __u32 status; + struct lst_sid sid; +} WIRE_ATTR srpc_generic_reply_t; + +/* FRAMEWORK RPCs */ +typedef struct { + __u64 mksn_rpyid; /* reply buffer matchbits */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ + +typedef struct { + __u32 mksn_status; /* session status */ + struct lst_sid mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ + +typedef struct { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ + +typedef struct { + __u32 rmsn_status; + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ + +typedef struct { + __u64 join_rpyid; /* reply buffer matchbits */ + struct lst_sid join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} WIRE_ATTR srpc_join_reqst_t; + +typedef struct { + __u32 join_status; /* returned status */ + struct lst_sid join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_join_reply_t; + +typedef struct { + __u64 dbg_rpyid; /* reply buffer matchbits */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} WIRE_ATTR srpc_debug_reqst_t; + +typedef struct { + __u32 dbg_status; /* returned code */ + struct lst_sid dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_debug_reply_t; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +typedef struct { + __u64 bar_rpyid; /* reply buffer matchbits */ + struct lst_sid bar_sid; /* session id */ + struct lst_bid bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} WIRE_ATTR srpc_batch_reqst_t; + +typedef struct { + __u32 bar_status; /* status of request */ + struct lst_sid bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} WIRE_ATTR srpc_batch_reply_t; + +typedef struct { + __u64 str_rpyid; /* reply buffer matchbits */ + struct lst_sid str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} WIRE_ATTR srpc_stat_reqst_t; + +typedef struct { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters str_lnet; +} WIRE_ATTR srpc_stat_reply_t; + +typedef struct { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} WIRE_ATTR test_bulk_req_t; + +typedef struct { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** bulk offset */ + __u32 blk_offset; +} WIRE_ATTR test_bulk_req_v1_t; + +typedef struct { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} WIRE_ATTR test_ping_req_t; + +typedef struct { + __u64 
tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer matchbits */ + struct lst_sid tsr_sid; /* session id */ + struct lst_bid tsr_bid; /* batch id */ + __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + test_ping_req_t ping; + test_bulk_req_t bulk_v0; + test_bulk_req_v1_t bulk_v1; + } tsr_u; +} WIRE_ATTR srpc_test_reqst_t; + +typedef struct { + __u32 tsr_status; /* returned code */ + struct lst_sid tsr_sid; +} WIRE_ATTR srpc_test_reply_t; + +/* TEST RPCs */ +typedef struct { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_nsec; +} WIRE_ATTR srpc_ping_reqst_t; + +typedef struct { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} WIRE_ATTR srpc_ping_reply_t; + +typedef struct { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ + +typedef struct { + __u32 brw_status; +} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +typedef struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: srpc_msg_type_t */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + srpc_generic_reqst_t reqst; + srpc_generic_reply_t reply; + + srpc_mksn_reqst_t mksn_reqst; + srpc_mksn_reply_t mksn_reply; + srpc_rmsn_reqst_t rmsn_reqst; + srpc_rmsn_reply_t rmsn_reply; + srpc_debug_reqst_t dbg_reqst; + srpc_debug_reply_t dbg_reply; + srpc_batch_reqst_t bat_reqst; + srpc_batch_reply_t bat_reply; + srpc_stat_reqst_t stat_reqst; + srpc_stat_reply_t stat_reply; + srpc_test_reqst_t tes_reqst; + srpc_test_reply_t tes_reply; + srpc_join_reqst_t join_reqst; + srpc_join_reply_t join_reply; + + srpc_ping_reqst_t ping_reqst; + srpc_ping_reply_t ping_reply; + srpc_brw_reqst_t brw_reqst; + srpc_brw_reply_t brw_reply; + } msg_body; +} WIRE_ATTR srpc_msg_t; + +static inline void +srpc_unpack_msg_hdr(srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h new file mode 100644 index 0000000000000..2a29161cd4802 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -0,0 +1,614 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include +#include +#include +#include +#include + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. + */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline srpc_msg_type_t +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline srpc_msg_type_t +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +typedef enum { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +} srpc_event_type_t; + +/* RPC event */ +typedef struct 
{ + srpc_event_type_t ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? */ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +} srpc_event_t; + +typedef struct { + int bk_len; /* len of bulk data */ + struct lnet_handle_md bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + lnet_kiov_t bk_iovs[0]; +} srpc_bulk_t; /* bulk descriptor */ + +/* message buffer descriptor */ +typedef struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + srpc_msg_t buf_msg; + struct lnet_handle_md buf_mdh; + lnet_nid_t buf_self; + struct lnet_process_id buf_peer; +} srpc_buffer_t; + +struct swi_workitem; +typedef int (*swi_action_t) (struct swi_workitem *); + +typedef struct swi_workitem { + struct cfs_wi_sched *swi_sched; + struct cfs_workitem swi_workitem; + swi_action_t swi_action; + int swi_state; +} swi_workitem_t; + +/* server-side state of a RPC */ +typedef struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + swi_workitem_t srpc_wi; + srpc_event_t srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + struct lnet_process_id srpc_peer; + srpc_msg_t srpc_replymsg; + struct lnet_handle_md srpc_replymdh; + srpc_buffer_t *srpc_reqstbuf; + srpc_bulk_t *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +} srpc_server_rpc_t; + +/* client-side state of a RPC */ +typedef struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + /* # seconds to wait for reply */ + int crpc_timeout; + stt_timer_t crpc_timer; + swi_workitem_t crpc_wi; + struct lnet_process_id crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + srpc_event_t crpc_bulkev; /* bulk event */ + srpc_event_t crpc_reqstev; /* request event */ + srpc_event_t crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + srpc_msg_t crpc_reqstmsg; + srpc_msg_t crpc_replymsg; + struct lnet_handle_md crpc_reqstmdh; + struct lnet_handle_md crpc_replymdh; + srpc_bulk_t crpc_bulk; +} srpc_client_rpc_t; + +#define srpc_client_rpc_size(rpc) \ +offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + 
/** serialize */ + spinlock_t scd_lock; + /** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + srpc_event_t scd_ev; + /** free RPC descriptors */ + struct list_head scd_rpc_free; + /** in-flight RPCs */ + struct list_head scd_rpc_active; + /** workitem for posting buffer */ + swi_workitem_t scd_buf_wi; + /** CPT id */ + int scd_cpt; + /** error code for scd_buf_wi */ + int scd_buf_err; + /** timestamp for scd_buf_err */ + unsigned long scd_buf_err_stamp; + /** total # request buffers */ + int scd_buf_total; + /** # posted request buffers */ + int scd_buf_nposted; + /** in progress of buffer posting */ + int scd_buf_posting; + /** allocate more buffers if scd_buf_nposted < scd_buf_low */ + int scd_buf_low; + /** increase/decrease some buffers */ + int scd_buf_adjust; + /** posted message buffers */ + struct list_head scd_buf_posted; + /** blocked for RPC descriptor */ + struct list_head scd_buf_blocked; +}; + +/* number of server workitems (mini-thread) for testing service */ +#define SFW_TEST_WI_MIN 256 +#define SFW_TEST_WI_MAX 2048 +/* extra buffers for tolerating buggy peers, or unbalanced number + * of peers between partitions */ +#define SFW_TEST_WI_EXTRA 64 + +/* number of server workitems (mini-thread) for framework service */ +#define SFW_FRWK_WI_MIN 16 +#define SFW_FRWK_WI_MAX 256 + +typedef struct srpc_service { + int sv_id; /* service id */ + const char *sv_name; /* human readable name */ + int sv_wi_total; /* total server workitems */ + int sv_shuttingdown; + int sv_ncpts; + /* percpt data for srpc_service */ + struct srpc_service_cd **sv_cpt_data; + /* Service callbacks: + * - sv_handler: process incoming RPC request + * - sv_bulk_ready: notify bulk data + */ + int (*sv_handler) (srpc_server_rpc_t *); + int (*sv_bulk_ready) (srpc_server_rpc_t *, int); +} srpc_service_t; + +typedef struct { + /* chain on fw_zombie_sessions */ + struct list_head sn_list; + struct lst_sid sn_id; /* unique identifier */ + /* # seconds' inactivity to expire */ + unsigned int sn_timeout; + int sn_timer_active; + unsigned int sn_features; + stt_timer_t sn_timer; + struct list_head sn_batches; /* list of batches */ + char sn_name[LST_NAME_SIZE]; + atomic_t sn_refcount; + atomic_t sn_brw_errors; + atomic_t sn_ping_errors; + cfs_time_t sn_started; +} sfw_session_t; + +#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ + (sid0).ses_stamp == (sid1).ses_stamp) + +typedef struct { + struct list_head bat_list; /* chain on sn_batches */ + struct lst_bid bat_id; /* batch id */ + int bat_error; /* error code of batch */ + sfw_session_t *bat_session; /* batch's session */ + atomic_t bat_nactive; /* # of active tests */ + struct list_head bat_tests; /* test instances */ +} sfw_batch_t; + +typedef struct { + int (*tso_init)(struct sfw_test_instance *tsi); /* intialize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + struct lnet_process_id dest, + srpc_client_rpc_t **rpc); /* prep a tests rpc */ + void (*tso_done_rpc)(struct sfw_test_unit *tsu, + srpc_client_rpc_t *rpc); /* done a test rpc */ +} sfw_test_client_ops_t; + +typedef struct sfw_test_instance { + struct list_head tsi_list; /* chain on batch */ + int tsi_service; /* test type */ + sfw_batch_t *tsi_batch; /* batch */ + sfw_test_client_ops_t *tsi_ops; /* test client operations */ + + /* public parameter for all test units */ + unsigned int tsi_is_client:1; /* is test client */ + unsigned int tsi_stoptsu_onerr:1; /* 
stop tsu on error */ + int tsi_concur; /* concurrency */ + int tsi_loop; /* loop count */ + + /* status of test instance */ + spinlock_t tsi_lock; /* serialize */ + unsigned int tsi_stopping:1; /* test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs;/* active rpcs */ + + union { + test_ping_req_t ping; /* ping parameter */ + test_bulk_req_t bulk_v0; /* bulk parameter */ + test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ + } tsi_u; +} sfw_test_instance_t; + +/* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_SIZE / sizeof(struct lnet_process_id_packed)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +typedef struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + struct lnet_process_id tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + sfw_test_instance_t *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + swi_workitem_t tsu_worker; /* workitem of the test unit */ +} sfw_test_unit_t; + +typedef struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + srpc_service_t *tsc_srv_service; /* test service */ + sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ +} sfw_test_case_t; + +srpc_client_rpc_t * +sfw_create_rpc(struct lnet_process_id peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done) (srpc_client_rpc_t *), void *priv); +int sfw_create_test_rpc(sfw_test_unit_t *tsu, + struct lnet_process_id peer, unsigned int features, + int nblk, int blklen, srpc_client_rpc_t **rpc); +void sfw_abort_rpc(srpc_client_rpc_t *rpc); +void sfw_post_rpc(srpc_client_rpc_t *rpc); +void sfw_client_rpc_done(srpc_client_rpc_t *rpc); +void sfw_unpack_message(srpc_msg_t *msg); +void sfw_free_pages(srpc_server_rpc_t *rpc); +void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); +int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); + +srpc_client_rpc_t * +srpc_create_client_rpc(struct lnet_process_id peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv); +void srpc_post_rpc(srpc_client_rpc_t *rpc); +void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); +void srpc_free_bulk(srpc_bulk_t *bk); +srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned off, unsigned bulk_npg, + unsigned bulk_len, int sink); +int srpc_send_rpc(swi_workitem_t *wi); +int srpc_send_reply(srpc_server_rpc_t *rpc); +int srpc_add_service(srpc_service_t *sv); +int srpc_remove_service(srpc_service_t *sv); +void srpc_shutdown_service(srpc_service_t *sv); +void srpc_abort_service(srpc_service_t *sv); +int srpc_finish_service(srpc_service_t *sv); +int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); +void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); +void srpc_get_counters(struct srpc_counters *cnt); +void srpc_set_counters(const struct srpc_counters *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static inline int +srpc_serv_is_framework(struct srpc_service *svc) 
+{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static inline int +swi_wi_action(struct cfs_workitem *wi) +{ + swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); + + return swi->swi_action(swi); +} + +static inline void +swi_init_workitem(swi_workitem_t *swi, void *data, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, data, swi_wi_action); +} + +static inline void +swi_schedule_workitem(swi_workitem_t *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(swi_workitem_t *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(swi_workitem_t *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } + + return; +} + +static inline void +srpc_init_client_rpc(srpc_client_rpc_t *rpc, struct lnet_process_id peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + LASSERT(nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid, NULL)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateMDHandle(&rpc->crpc_reqstmdh); + LNetInvalidateMDHandle(&rpc->crpc_replymdh); + LNetInvalidateMDHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); + return; +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(is_power_of_2(++__I) ? 
D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + schedule_timeout(cfs_time_seconds(1) / 10); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(srpc_service_t *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG(((i & -i) == i) ? D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 10); + } +} + +extern sfw_test_client_ops_t ping_test_client; +extern srpc_service_t ping_test_service; +void ping_init_test_client(void); +void ping_init_test_service(void); + +extern sfw_test_client_ops_t brw_test_client; +extern srpc_service_t brw_test_service; +void brw_init_test_client(void); +void brw_init_test_service(void); + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c new file mode 100644 index 0000000000000..7e09e6672b3ef --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -0,0 +1,246 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. 
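+ *
+ * For example, with STTIMER_MINPOLL = 3 a timer expiring at t = 1000s
+ * hashes to slot (1000 >> 3) & 127 == 125; every expiry time in the same
+ * 8-second window [1000, 1007] lands in that slot, and stt_add_timer()
+ * keeps each slot's list sorted by expiry with a simple insertion sort.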
+ */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +static struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + cfs_time_t stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(stt_timer_t *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + LASSERT(timer->stt_func != NULL); + LASSERT(list_empty(&timer->stt_list)); + LASSERT(cfs_time_after(timer->stt_expires, cfs_time_current_sec())); + + /* a simple insertion sort */ + list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { + stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); + + if (timer->stt_expires >= old->stt_expires) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer(stt_timer_t *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT(stt_data.stt_nthreads > 0); + LASSERT(!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +static int +stt_expire_list(struct list_head *slot, cfs_time_t now) +{ + int expired = 0; + stt_timer_t *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, stt_timer_t, stt_list); + + if (timer->stt_expires > now) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +static int +stt_check_timers(cfs_time_t *last) +{ + int expired = 0; + cfs_time_t now; + cfs_time_t this_slot; + + now = cfs_time_current_sec(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (cfs_time_aftereq(this_slot, *last)) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +static int +stt_timer_main (void *arg) +{ + int rc = 0; + + cfs_block_allsigs(); + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + rc = wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return rc; +} + +static int +stt_start_timer_thread (void) +{ + struct task_struct *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int +stt_startup (void) 
+{ + int rc = 0; + int i; + + stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR ("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown(void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT(list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h new file mode 100644 index 0000000000000..71c3de2736b15 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -0,0 +1,49 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +typedef struct { + struct list_head stt_list; + cfs_time_t stt_expires; + void (*stt_func)(void *); + void *stt_data; +} stt_timer_t; + +void stt_add_timer(stt_timer_t *timer); +int stt_del_timer(stt_timer_t *timer); +int stt_startup(void); +void stt_shutdown(void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c new file mode 100644 index 0000000000000..ef61772f0dcb2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -0,0 +1,655 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_handler.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +static void seq_server_proc_fini(struct lu_server_seq *seq); + +/* Assigns client to sequence controller node. */ +int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_client_seq *cli) +{ + int rc = 0; + ENTRY; + + /* + * Ask client for new range, assign that range to ->seq_space and write + * seq state to backing store should be atomic. + */ + mutex_lock(&seq->lss_mutex); + + if (cli == NULL) { + CDEBUG(D_INFO, "%s: Detached sequence client\n", seq->lss_name); + seq->lss_cli = NULL; + GOTO(out_up, rc = 0); + } + + if (seq->lss_cli != NULL) { + CDEBUG(D_HA, "%s: Sequence controller is already " + "assigned\n", seq->lss_name); + GOTO(out_up, rc = -EEXIST); + } + + CDEBUG(D_INFO, "%s: Attached sequence controller %s\n", + seq->lss_name, cli->lcs_name); + + seq->lss_cli = cli; + cli->lcs_space.lsr_index = seq->lss_site->ss_node_id; + EXIT; +out_up: + mutex_unlock(&seq->lss_mutex); + return rc; +} +EXPORT_SYMBOL(seq_server_set_cli); +/* + * allocate \a w units of sequence from range \a from. + */ +static inline void range_alloc(struct lu_seq_range *to, + struct lu_seq_range *from, + __u64 width) +{ + width = min(lu_seq_range_space(from), width); + to->lsr_start = from->lsr_start; + to->lsr_end = from->lsr_start + width; + from->lsr_start += width; +} + +/** + * On controller node, allocate new super sequence for regular sequence server. + * As this super sequence controller, this node suppose to maintain fld + * and update index. + * \a out range always has currect mds node number of requester. 
+ */ + +static int __seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + if (lu_seq_range_is_exhausted(space)) { + CERROR("%s: Sequences space is exhausted\n", + seq->lss_name); + RETURN(-ENOSPC); + } else { + range_alloc(out, space, seq->lss_width); + } + + rc = seq_store_update(env, seq, out, 1 /* sync */); + + LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n", + seq->lss_name, rc, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_super(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +int seq_server_alloc_spec(struct lu_server_seq *seq, + struct lu_seq_range *spec, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = -ENOSPC; + ENTRY; + + /* + * In some cases (like recovery after a disaster) + * we may need to allocate sequences manually + * Notice some sequences can be lost if requested + * range doesn't start at the beginning of current + * free space. Also notice it's not possible now + * to allocate sequences out of natural order. + */ + if (spec->lsr_start >= spec->lsr_end) + RETURN(-EINVAL); + if (spec->lsr_flags != LU_SEQ_RANGE_MDT && + spec->lsr_flags != LU_SEQ_RANGE_OST) + RETURN(-EINVAL); + + mutex_lock(&seq->lss_mutex); + if (spec->lsr_start >= space->lsr_start) { + space->lsr_start = spec->lsr_end; + rc = seq_store_update(env, seq, spec, 1 /* sync */); + + LCONSOLE_INFO("%s: "DRANGE" sequences allocated: rc = %d \n", + seq->lss_name, PRANGE(spec), rc); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +static int __seq_set_init(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + + range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width); + range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width); + + rc = seq_store_update(env, seq, NULL, 1); + + return rc; +} + +/* + * This function implements new seq allocation algorithm using async + * updates to seq file on disk. ref bug 18857 for details. + * there are four variable to keep track of this process + * + * lss_space; - available lss_space + * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use + * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be + * not yet committed + * + * when lss_lowater_set reaches the end it is replaced with hiwater one and + * a write operation is initiated to allocate new hiwater range. + * if last seq write opearion is still not committed, current operation is + * flaged as sync write op. + */ +static int range_alloc_set(const struct lu_env *env, + struct lu_seq_range *out, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + struct lu_seq_range *loset = &seq->lss_lowater_set; + struct lu_seq_range *hiset = &seq->lss_hiwater_set; + int rc = 0; + + if (lu_seq_range_is_zero(loset)) + __seq_set_init(env, seq); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */ + loset->lsr_start = loset->lsr_end; + + if (lu_seq_range_is_exhausted(loset)) { + /* reached high water mark. 
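The low-water set is used up:
+ * the code below promotes the high-water set to low-water and carves
+ * a new high-water range, sized from the number of client exports and
+ * the per-client sequence width (see set_sz below).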
*/ + struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev; + int obd_num_clients = dev->ld_obd->obd_num_exports; + __u64 set_sz; + + /* calculate new seq width based on number of clients */ + set_sz = max(seq->lss_set_width, + obd_num_clients * seq->lss_width); + set_sz = min(lu_seq_range_space(space), set_sz); + + /* Switch to hiwater range now */ + *loset = *hiset; + /* allocate new hiwater range */ + range_alloc(hiset, space, set_sz); + + /* update ondisk seq with new *space */ + rc = seq_store_update(env, seq, NULL, seq->lss_need_sync); + } + + LASSERTF(!lu_seq_range_is_exhausted(loset) || + lu_seq_range_is_sane(loset), + DRANGE"\n", PRANGE(loset)); + + if (rc == 0) + range_alloc(out, loset, seq->lss_width); + + RETURN(rc); +} + +/** + * Check if the sequence server has sequence avaible + * + * Check if the sequence server has sequence avaible, if not, then + * allocating super sequence from sequence manager (MDT0). + * + * \param[in] env execution environment + * \param[in] seq server sequence + * + * \retval negative errno if allocating new sequence fails + * \retval 0 if there is enough sequence or allocating + * new sequence succeeds + */ +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + /* Check if available space ends and allocate new super seq */ + if (lu_seq_range_is_exhausted(space)) { + if (!seq->lss_cli) { + CERROR("%s: No sequence controller is attached.\n", + seq->lss_name); + RETURN(-ENODEV); + } + + rc = seq_client_alloc_super(seq->lss_cli, env); + if (rc) { + CDEBUG(D_HA, "%s: Can't allocate super-sequence:" + " rc %d\n", seq->lss_name, rc); + RETURN(rc); + } + + /* Saving new range to allocation space. */ + *space = seq->lss_cli->lcs_space; + LASSERT(lu_seq_range_is_sane(space)); + if (seq->lss_cli->lcs_srv == NULL) { + struct lu_server_fld *fld; + + /* Insert it to the local FLDB */ + fld = seq->lss_site->ss_server_fld; + mutex_lock(&fld->lsf_lock); + rc = fld_insert_entry(env, fld, space); + mutex_unlock(&fld->lsf_lock); + } + } + + if (lu_seq_range_is_zero(&seq->lss_lowater_set)) + __seq_set_init(env, seq); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_check_and_alloc_super); + +static int __seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + LASSERT(lu_seq_range_is_sane(space)); + + rc = seq_server_check_and_alloc_super(env, seq); + if (rc < 0) { + if (rc == -EINPROGRESS) { + static int printed; + + if (printed++ % 8 == 0) + LCONSOLE_INFO("%s: Waiting to contact MDT0000 " + "to allocate super-sequence\n", + seq->lss_name); + } else { + CERROR("%s: Allocated super-sequence failed: rc = %d\n", + seq->lss_name, rc); + } + RETURN(rc); + } + + rc = range_alloc_set(env, out, seq); + if (rc != 0) { + CERROR("%s: Allocated meta-sequence failed: rc = %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n", + seq->lss_name, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_meta(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_alloc_meta); + +static int seq_server_handle(struct lu_site *site, + const struct lu_env *env, + __u32 opc, struct lu_seq_range 
*out) +{ + int rc; + struct seq_server_site *ss_site; + struct dt_device *dev; + ENTRY; + + ss_site = lu_site2seq(site); + + switch (opc) { + case SEQ_ALLOC_META: + if (!ss_site->ss_server_seq) { + CERROR("Sequence server is not " + "initialized\n"); + RETURN(-EINVAL); + } + + dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env); + break; + case SEQ_ALLOC_SUPER: + if (!ss_site->ss_control_seq) { + CERROR("Sequence controller is not " + "initialized\n"); + RETURN(-EINVAL); + } + + dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev); + if (dev->dd_rdonly) + RETURN(-EROFS); + + rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env); + break; + default: + rc = -EINVAL; + break; + } + + RETURN(rc); +} + +static int seq_handler(struct tgt_session_info *tsi) +{ + struct lu_seq_range *out, *tmp; + struct lu_site *site; + int rc; + __u32 *opc; + + ENTRY; + + LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY)); + site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site; + LASSERT(site != NULL); + + opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC); + if (opc != NULL) { + out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + if (out == NULL) + RETURN(err_serious(-EPROTO)); + + tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE); + + /* seq client passed mdt id, we need to pass that using out + * range parameter */ + + out->lsr_index = tmp->lsr_index; + out->lsr_flags = tmp->lsr_flags; + rc = seq_server_handle(site, tsi->tsi_env, *opc, out); + } else { + rc = err_serious(-EPROTO); + } + + RETURN(rc); +} + +struct tgt_handler seq_handlers[] = { +TGT_SEQ_HDL(HABEO_REFERO, SEQ_QUERY, seq_handler), +}; +EXPORT_SYMBOL(seq_handlers); + +/* context key constructor/destructor: seq_key_init, seq_key_fini */ +LU_KEY_INIT_FINI(seq, struct seq_thread_info); + +/* context key: seq_thread_key */ +LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); + +extern const struct file_operations seq_fld_proc_seq_fops; + +static int seq_server_proc_init(struct lu_server_seq *seq) +{ +#ifdef CONFIG_PROC_FS + int rc; + ENTRY; + + seq->lss_proc_dir = lprocfs_register(seq->lss_name, + seq_type_proc_dir, + NULL, NULL); + if (IS_ERR(seq->lss_proc_dir)) { + rc = PTR_ERR(seq->lss_proc_dir); + RETURN(rc); + } + + rc = lprocfs_add_vars(seq->lss_proc_dir, seq_server_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager " + "proc, rc %d\n", seq->lss_name, rc); + GOTO(out_cleanup, rc); + } + + if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) { + rc = lprocfs_seq_create(seq->lss_proc_dir, "fldb", 0644, + &seq_fld_proc_seq_fops, seq); + if (rc) { + CERROR("%s: Can't create fldb for sequence manager " + "proc: rc = %d\n", seq->lss_name, rc); + GOTO(out_cleanup, rc); + } + } + + RETURN(0); + +out_cleanup: + seq_server_proc_fini(seq); + return rc; +#else /* !CONFIG_PROC_FS */ + return 0; +#endif /* CONFIG_PROC_FS */ +} + +static void seq_server_proc_fini(struct lu_server_seq *seq) +{ +#ifdef CONFIG_PROC_FS + ENTRY; + if (seq->lss_proc_dir != NULL) { + if (!IS_ERR(seq->lss_proc_dir)) + lprocfs_remove(&seq->lss_proc_dir); + seq->lss_proc_dir = NULL; + } + EXIT; +#endif /* CONFIG_PROC_FS */ +} + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss) +{ + int rc, is_srv = (type == LUSTRE_SEQ_SERVER); + ENTRY; + + LASSERT(dev != NULL); 
+ LASSERT(prefix != NULL); + LASSERT(ss != NULL); + LASSERT(ss->ss_lu != NULL); + + /* A compile-time check for FIDs that used to be in lustre_idl.h + * but is moved here to remove CLASSERT/LASSERT in that header. + * Check all lu_fid fields are converted in fid_cpu_to_le() and friends + * and that there is no padding added by compiler to the struct. */ + { + struct lu_fid tst; + + CLASSERT(sizeof(tst) == sizeof(tst.f_seq) + + sizeof(tst.f_oid) + sizeof(tst.f_ver)); + } + + seq->lss_cli = NULL; + seq->lss_type = type; + seq->lss_site = ss; + lu_seq_range_init(&seq->lss_space); + + lu_seq_range_init(&seq->lss_lowater_set); + lu_seq_range_init(&seq->lss_hiwater_set); + seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH; + + mutex_init(&seq->lss_mutex); + + seq->lss_width = is_srv ? + LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH; + + snprintf(seq->lss_name, sizeof(seq->lss_name), + "%s-%s", (is_srv ? "srv" : "ctl"), prefix); + + rc = seq_store_init(seq, env, dev); + if (rc) + GOTO(out, rc); + /* Request backing store for saved sequence info. */ + rc = seq_store_read(seq, env); + if (rc == -ENODATA) { + + /* Nothing is read, init by default value. */ + seq->lss_space = is_srv ? + LUSTRE_SEQ_ZERO_RANGE: + LUSTRE_SEQ_SPACE_RANGE; + + seq->lss_space.lsr_index = ss->ss_node_id; + LCONSOLE_INFO("%s: No data found " + "on store. Initialize space\n", + seq->lss_name); + + rc = seq_store_update(env, seq, NULL, 0); + if (rc) { + CERROR("%s: Can't write space data, " + "rc %d\n", seq->lss_name, rc); + } + } else if (rc) { + CERROR("%s: Can't read space data, rc %d\n", + seq->lss_name, rc); + GOTO(out, rc); + } + + if (is_srv) { + LASSERT(lu_seq_range_is_sane(&seq->lss_space)); + } else { + LASSERT(!lu_seq_range_is_zero(&seq->lss_space) && + lu_seq_range_is_sane(&seq->lss_space)); + } + + rc = seq_server_proc_init(seq); + if (rc) + GOTO(out, rc); + + EXIT; +out: + if (rc) + seq_server_fini(seq, env); + return rc; +} +EXPORT_SYMBOL(seq_server_init); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env) +{ + ENTRY; + + seq_server_proc_fini(seq); + seq_store_fini(seq, env); + + EXIT; +} +EXPORT_SYMBOL(seq_server_fini); + +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss) +{ + if (ss == NULL) + RETURN(0); + + if (ss->ss_server_seq) { + seq_server_fini(ss->ss_server_seq, env); + OBD_FREE_PTR(ss->ss_server_seq); + ss->ss_server_seq = NULL; + } + + if (ss->ss_control_seq) { + seq_server_fini(ss->ss_control_seq, env); + OBD_FREE_PTR(ss->ss_control_seq); + ss->ss_control_seq = NULL; + } + + if (ss->ss_client_seq) { + seq_client_fini(ss->ss_client_seq); + OBD_FREE_PTR(ss->ss_client_seq); + ss->ss_client_seq = NULL; + } + + RETURN(0); +} +EXPORT_SYMBOL(seq_site_fini); + +int fid_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&seq_thread_key); + return lu_context_key_register(&seq_thread_key); +} + +void fid_server_mod_exit(void) +{ + lu_context_key_degister(&seq_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h new file mode 100644 index 0000000000000..9ad1420e1812e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include +#include + +#ifdef HAVE_SERVER_SUPPORT +# define HAVE_SEQ_SERVER + +struct req_capsule; + +struct seq_thread_info { + struct req_capsule *sti_pill; + struct lu_seq_range sti_space; + struct lu_buf sti_buf; +}; + +enum { + SEQ_TXN_STORE_CREDITS = 20 +}; + +extern struct lu_context_key seq_thread_key; + +# ifdef CONFIG_PROC_FS +extern struct lprocfs_vars seq_server_proc_list[]; +# endif + +/* Store API functions. */ +struct dt_device; + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt); + +void seq_store_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync); + +int seq_server_alloc_spec(struct lu_server_seq *seq, + struct lu_seq_range *spec, + const struct lu_env *env); + +int fid_server_mod_init(void); + +void fid_server_mod_exit(void); + +# endif /* HAVE_SERVER_SUPPORT */ + +/* Functions used internally in module. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env); + +# ifdef CONFIG_PROC_FS +extern struct lprocfs_vars seq_client_proc_list[]; +# endif + +extern struct proc_dir_entry *seq_type_proc_dir; + +#endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c new file mode 100644 index 0000000000000..7c5477c044351 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c @@ -0,0 +1,100 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_lib.c + * + * Miscellaneous fid functions. + * + * Author: Nikita Danilov + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include + +/** + * A cluster-wide range from which fid-sequences are granted to servers and + * then clients. + * + * Fid namespace: + *
+ * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      :        0:32, ino:32              gen:32          0:32
+ * IDIF      :        0:31, 1:1, ost-index:16,  objd:48         0:32
+ * 
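+ * For illustration, a "normal" fid is simply a struct lu_fid whose f_seq
+ * falls in the range above; a hypothetical first fid granted out of the
+ * LUSTRE_SEQ_SPACE_RANGE defined below would look like:
+ *
+ *	struct lu_fid fid = {
+ *		.f_seq	= FID_SEQ_NORMAL,
+ *		.f_oid	= 1,
+ *		.f_ver	= 0,
+ *	};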
+ * + * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects + */ +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { + .lsr_start = FID_SEQ_NORMAL, + .lsr_end = (__u64)~0ULL, +}; + +/* Zero range, used for init and other purposes. */ +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { + .lsr_start = 0, +}; + +/* Lustre Big Fs Lock fid. */ +const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, + .f_oid = FID_OID_SPECIAL_BFL, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_OBF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); + +/** Special fid for "lost+found" special object in .lustre */ +const struct lu_fid LU_LPF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_LPF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_LPF_FID); + +/** "/lost+found" - special FID for ldiskfs backend, invislbe to client. */ +const struct lu_fid LU_BACKEND_LPF_FID = { .f_seq = FID_SEQ_LOCAL_FILE, + .f_oid = OSD_LPF_OID, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_BACKEND_LPF_FID); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c new file mode 100644 index 0000000000000..ab1cca59bc916 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c @@ -0,0 +1,629 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +/* mdc RPC locks */ +#include +#include "fid_internal.h" + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + ENTRY; + + LASSERT(exp != NULL && !IS_ERR(exp)); + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (req == NULL) + RETURN(-ENOMEM); + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + lu_seq_range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) + req->rq_no_delay = req->rq_no_resend = 1; + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) { + req->rq_reply_portal = MDC_REPLY_PORTAL; + req->rq_request_portal = SEQ_METADATA_PORTAL; + } else { + req->rq_reply_portal = OSC_REPLY_PORTAL; + req->rq_request_portal = SEQ_DATA_PORTAL; + } + + debug_mask = D_INFO; + } + + /* Allow seq client RPC during recovery time. */ + req->rq_allow_replay = 1; + + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + + if (rc) + GOTO(out_req, rc); + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!lu_seq_range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + if (lu_seq_range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + + EXIT; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. 
*/ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env != NULL); + rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space, + env); +#else + rc = 0; +#endif + } else { + /* Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) */ + if (seq->lcs_exp == NULL) { + mutex_unlock(&seq->lcs_mutex); + RETURN(-EINPROGRESS); + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); +} + +/* Request sequence-controller node to allocate new meta-sequence. */ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + ENTRY; + + if (seq->lcs_srv) { +#ifdef HAVE_SEQ_SERVER + LASSERT(env != NULL); + rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env); +#else + rc = 0; +#endif + } else { + do { + /* If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + if (rc == -EINPROGRESS || rc == -EAGAIN) { + wait_queue_head_t waitq; + struct l_wait_info lwi; + + /* MDT0 is not ready, let's wait for 2 + * seconds and retry. */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, + NULL); + l_wait_event(waitq, 0, &lwi); + } + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + + RETURN(rc); +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + int rc; + ENTRY; + + LASSERT(lu_seq_range_is_sane(&seq->lcs_space)); + + if (lu_seq_range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Can't allocate new meta-sequence," + "rc = %d\n", seq->lcs_name, rc); + RETURN(rc); + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!lu_seq_range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, + *seqnr); + + RETURN(rc); +} + +static int seq_fid_alloc_prep(struct lu_client_seq *seq, + wait_queue_entry_t *link) +{ + if (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_RUNNING); + return -EAGAIN; + } + + ++seq->lcs_update; + mutex_unlock(&seq->lcs_mutex); + + return 0; +} + +static void seq_fid_alloc_fini(struct lu_client_seq *seq, __u64 seqnr, + bool whole) +{ + LASSERT(seq->lcs_update == 1); + + mutex_lock(&seq->lcs_mutex); + if (seqnr != 0) { + CDEBUG(D_INFO, "%s: New sequence [0x%16.16llx]\n", + seq->lcs_name, seqnr); + + seq->lcs_fid.f_seq = seqnr; + if (whole) { + /* Since the caller require the whole seq, + * so marked this seq to be used */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = + LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + } else { + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + } + seq->lcs_fid.f_ver = 0; + } + + --seq->lcs_update; + wake_up_all(&seq->lcs_waitq); +} + +/** + 
* Allocate the whole non-used seq to the caller. + * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] seqnr to hold the new allocated sequence + * + * \retval 0 for new sequence allocated. + * \retval Negative error number on failure. + */ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, u64 *seqnr) +{ + wait_queue_entry_t link; + int rc; + + LASSERT(seqnr != NULL); + + mutex_lock(&seq->lcs_mutex); + init_waitqueue_entry(&link, current); + + /* To guarantee that we can get a whole non-used sequence. */ + while (seq_fid_alloc_prep(seq, &link) != 0); + + rc = seq_client_alloc_seq(env, seq, seqnr); + seq_fid_alloc_fini(seq, rc ? 0 : *seqnr, true); + if (rc) + CERROR("%s: Can't allocate new sequence: rc = %d\n", + seq->lcs_name, rc); + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/** + * Allocate new fid on passed client @seq and save it to @fid. + * + * \param[in] env pointer to the thread context + * \param[in,out] seq pointer to the client sequence manager + * \param[out] fid to hold the new allocated fid + * + * \retval 1 for notify the caller that sequence switch + * is performed to allow it to setup FLD for it. + * \retval 0 for new FID allocated in current sequence. + * \retval Negative error number on failure. + */ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + wait_queue_entry_t link; + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + while (1) { + u64 seqnr; + + if (unlikely(!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width)) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid++; + rc = 0; + break; + } + + /* Release seq::lcs_mutex via seq_fid_alloc_prep() to avoid + * deadlock during seq_client_alloc_seq(). */ + rc = seq_fid_alloc_prep(seq, &link); + if (rc) + continue; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + /* Re-take seq::lcs_mutex via seq_fid_alloc_fini(). */ + seq_fid_alloc_fini(seq, rc ? 0 : seqnr, false); + if (rc) { + if (rc != -EINPROGRESS) + CERROR("%s: Can't allocate new sequence: " + "rc = %d\n", seq->lcs_name, rc); + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); + } + + rc = 1; + break; + } + + *fid = seq->lcs_fid; + mutex_unlock(&seq->lcs_mutex); + + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid)); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + wait_queue_entry_t link; + + LASSERT(seq != NULL); + init_waitqueue_entry(&link, current); + mutex_lock(&seq->lcs_mutex); + + while (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + schedule(); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_RUNNING); + } + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. 
+ */ + + seq->lcs_space.lsr_index = -1; + + lu_seq_range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_proc_fini(struct lu_client_seq *seq) +{ +#ifdef CONFIG_PROC_FS + ENTRY; + if (seq->lcs_proc_dir) { + if (!IS_ERR(seq->lcs_proc_dir)) + lprocfs_remove(&seq->lcs_proc_dir); + seq->lcs_proc_dir = NULL; + } + EXIT; +#endif /* CONFIG_PROC_FS */ +} + +static int seq_client_proc_init(struct lu_client_seq *seq) +{ +#ifdef CONFIG_PROC_FS + int rc; + ENTRY; + + seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, seq_type_proc_dir, + NULL, NULL); + if (IS_ERR(seq->lcs_proc_dir)) { + CERROR("%s: LProcFS failed in seq-init\n", + seq->lcs_name); + rc = PTR_ERR(seq->lcs_proc_dir); + RETURN(rc); + } + + rc = lprocfs_add_vars(seq->lcs_proc_dir, seq_client_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager " + "proc, rc %d\n", seq->lcs_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + seq_client_proc_fini(seq); + return rc; + +#else /* !CONFIG_PROC_FS */ + return 0; +#endif /* CONFIG_PROC_FS */ +} + +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + init_waitqueue_head(&seq->lcs_waitq); + /* Make sure that things are clear before work is started. */ + seq_client_flush(seq); + + if (exp != NULL) + seq->lcs_exp = class_export_get(exp); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + rc = seq_client_proc_init(seq); + if (rc) + seq_client_fini(seq); + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_init); + +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_proc_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; +} +EXPORT_SYMBOL(seq_client_fini); + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + OBD_ALLOC_PTR(cli->cl_seq); + if (!cli->cl_seq) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (!prefix) + GOTO(out, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + + GOTO(out, rc); + +out: + if (rc && cli->cl_seq) { + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + return rc; +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + down_write(&cli->cl_seq_rwsem); + if (cli->cl_seq) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + up_write(&cli->cl_seq_rwsem); + + RETURN(0); +} +EXPORT_SYMBOL(client_fid_fini); + +struct proc_dir_entry *seq_type_proc_dir; + +static int __init fid_init(void) +{ + seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(seq_type_proc_dir)) + return PTR_ERR(seq_type_proc_dir); + +# ifdef 
HAVE_SERVER_SUPPORT + fid_server_mod_init(); +# endif + + return 0; +} + +static void __exit fid_exit(void) +{ +# ifdef HAVE_SERVER_SUPPORT + fid_server_mod_exit(); +# endif + + if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { + lprocfs_remove(&seq_type_proc_dir); + seq_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre File IDentifier"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fid_init); +module_exit(fid_exit); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c new file mode 100644 index 0000000000000..225ddfad6f634 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_store.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include "fid_internal.h" + +static struct lu_buf *seq_store_buf(struct seq_thread_info *info) +{ + struct lu_buf *buf; + + buf = &info->sti_buf; + buf->lb_buf = &info->sti_space; + buf->lb_len = sizeof(info->sti_space); + return buf; +} + +struct seq_update_callback { + struct dt_txn_commit_cb suc_cb; + struct lu_server_seq *suc_seq; +}; + +void seq_update_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct seq_update_callback *ccb; + + ccb = container_of0(cb, struct seq_update_callback, suc_cb); + + LASSERT(ccb->suc_seq != NULL); + + ccb->suc_seq->lss_need_sync = 0; + OBD_FREE_PTR(ccb); +} + +int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq) +{ + struct seq_update_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->suc_seq = seq; + seq->lss_need_sync = 1; + + dcb = &ccb->suc_cb; + dcb->dcb_func = seq_update_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "seq_update_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) + OBD_FREE_PTR(ccb); + return rc; +} + +/* This function implies that caller takes care about locking. 
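+ *
+ * A hypothetical caller (a sketch only, assuming the convention used
+ * elsewhere in these files that seq->lss_mutex serialises updates of
+ * lss_space) performing a synchronous update with no new range to
+ * register might look like:
+ *
+ *	mutex_lock(&seq->lss_mutex);
+ *	rc = seq_store_update(env, seq, NULL, 1);
+ *	mutex_unlock(&seq->lss_mutex);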
*/ +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync) +{ + struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + struct seq_thread_info *info; + struct thandle *th; + loff_t pos = 0; + int rc; + + if (dt_dev->dd_rdonly) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + th = dt_trans_create(env, dt_dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + /* Store ranges in le format. */ + range_cpu_to_le(&info->sti_space, &seq->lss_space); + + rc = dt_declare_record_write(env, seq->lss_obj, + seq_store_buf(info), 0, th); + if (rc) + GOTO(exit, rc); + + if (out != NULL) { + rc = fld_declare_server_create(env, + seq->lss_site->ss_server_fld, + out, th); + if (rc) + GOTO(exit, rc); + } + + rc = dt_trans_start_local(env, dt_dev, th); + if (rc) + GOTO(exit, rc); + + rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th); + if (rc) { + CERROR("%s: Can't write space data, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } else if (out != NULL) { + rc = fld_server_create(env, seq->lss_site->ss_server_fld, out, + th); + if (rc) { + CERROR("%s: Can't Update fld database, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } + } + /* next sequence update will need sync until this update is committed + * in case of sync operation this is not needed obviously */ + if (!sync) + /* if callback can't be added then sync always */ + sync = !!seq_update_cb_add(th, seq); + + th->th_sync |= sync; +exit: + dt_trans_stop(env, dt_dev, th); + return rc; +} + +/* + * This function implies that caller takes care about locking or locking is not + * needed (init time). + */ +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env) +{ + struct seq_thread_info *info; + loff_t pos = 0; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + rc = dt_read(env, seq->lss_obj, seq_store_buf(info), &pos); + + if (rc == sizeof(info->sti_space)) { + range_le_to_cpu(&seq->lss_space, &info->sti_space); + CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + rc = 0; + } else if (rc == 0) { + rc = -ENODATA; + } else if (rc > 0) { + CERROR("%s: Read only %d bytes of %d\n", seq->lss_name, + rc, (int)sizeof(info->sti_space)); + rc = -EIO; + } + + RETURN(rc); +} + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt) +{ + struct dt_object *dt_obj; + struct lu_fid fid; + struct lu_attr attr; + struct dt_object_format dof; + const char *name; + int rc; + ENTRY; + + name = seq->lss_type == LUSTRE_SEQ_SERVER ? 
+ LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME; + + if (seq->lss_type == LUSTRE_SEQ_SERVER) + lu_local_obj_fid(&fid, FID_SEQ_SRV_OID); + else + lu_local_obj_fid(&fid, FID_SEQ_CTL_OID); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | 0666; + dof.dof_type = DFT_REGULAR; + + dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr); + if (!IS_ERR(dt_obj)) { + seq->lss_obj = dt_obj; + rc = 0; + } else { + CERROR("%s: Can't find \"%s\" obj %d\n", + seq->lss_name, name, (int)PTR_ERR(dt_obj)); + rc = PTR_ERR(dt_obj); + } + + RETURN(rc); +} + +void seq_store_fini(struct lu_server_seq *seq, const struct lu_env *env) +{ + ENTRY; + + if (seq->lss_obj != NULL) { + if (!IS_ERR(seq->lss_obj)) + dt_object_put(env, seq->lss_obj); + seq->lss_obj = NULL; + } + + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c new file mode 100644 index 0000000000000..11ae1b6996532 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -0,0 +1,651 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +#ifdef CONFIG_PROC_FS + +/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ +#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) +/** + * Reduce the SEQ range allocated to a node to a strict subset of the range + * currently-allocated SEQ range. If the specified range is "clear", then + * drop all allocated sequences and request a new one from the master. + * + * Note: this function should only be used for testing, it is not necessarily + * safe for production use. 
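+ *
+ * Accepted input, matching the parser below, is either the literal string
+ * "clear" or a bracketed sub-range such as
+ *
+ *	[0x0000000240000400 - 0x0000000240000800]
+ *
+ * written to the corresponding "space" proc file; the new range must lie
+ * within the currently allocated one, otherwise -EINVAL is returned.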
+ */ +static int +lprocfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) +{ + struct lu_seq_range tmp = { + .lsr_start = 0, + }; + char kernbuf[MAX_FID_RANGE_STRLEN]; + ENTRY; + + LASSERT(range != NULL); + + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + + kernbuf[count] = 0; + + if (count == 5 && strcmp(kernbuf, "clear") == 0) { + memset(range, 0, sizeof(*range)); + RETURN(0); + } + + /* of the form "[0x0000000240000400 - 0x000000028000400]" */ + sscanf(kernbuf, "[%llx - %llx]\n", + (long long unsigned *)&tmp.lsr_start, + (long long unsigned *)&tmp.lsr_end); + if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || + tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) + RETURN(-EINVAL); + *range = tmp; + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server side procfs stuff. + */ +static ssize_t +lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + rc = lprocfs_fid_write_common(buffer, count, &seq->lss_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + } + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +lprocfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +static int +lprocfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + struct client_obd *cli; + ENTRY; + + LASSERT(seq != NULL); + + if (seq->lss_cli) { + if (seq->lss_cli->lcs_exp != NULL) { + cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lss_cli->lcs_srv->lss_name); + } + } else { + seq_puts(m, "\n"); + } + + RETURN(0); +} + +static ssize_t +lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; + int rc; + __s64 val; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) { + CERROR("%s: invalid FID sequence width: rc = %d\n", + seq->lss_name, rc); + GOTO(out_unlock, count = rc); + } + + if (val < 0) { + CERROR("%s: invalid FID sequence width: rc = %d\n", + seq->lss_name, -ERANGE); + GOTO(out_unlock, count = -ERANGE); + } + + seq->lss_width = val; + + CDEBUG(D_INFO, "%s: Width: %llu\n", + seq->lss_name, seq->lss_width); +out_unlock: + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +lprocfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)m->private; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + seq_printf(m, "%llu\n", seq->lss_width); + mutex_unlock(&seq->lss_mutex); + + RETURN(0); +} + +LPROC_SEQ_FOPS(lprocfs_server_fid_space); +LPROC_SEQ_FOPS(lprocfs_server_fid_width); +LPROC_SEQ_FOPS_RO(lprocfs_server_fid_server); + +struct lprocfs_vars 
seq_server_proc_list[] = { + { .name = "space", + .fops = &lprocfs_server_fid_space_fops }, + { .name = "width", + .fops = &lprocfs_server_fid_width_fops }, + { .name = "server", + .fops = &lprocfs_server_fid_server_fops }, + { NULL } +}; + +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + struct lu_server_seq *fsp_seq; + unsigned int fsp_stop:1; +}; + +/* + * XXX: below is a copy of the functions in lustre/fld/lproc_fld.c. + * we want to avoid this duplication either by exporting the + * functions or merging fid and fld into a single module. + */ +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct lu_seq_range fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)&fld_rec, 0); + if (rc != 0) { + CERROR("%s: read record error: rc = %d\n", + fld->lsf_name, rc); + } else if (fld_rec.lsr_start != 0) { + range_be_to_cpu(&fld_rec, &fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(&fld_rec)); + } + + return rc; +} + +struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_seq *ss = (struct lu_server_seq *) PDE_DATA(inode); + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + fld = ss->lss_site->ss_server_fld; + LASSERT(fld != NULL); + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &fldb_sops); + if (rc) + return rc; + + 
obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_seq = ss; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + lprocfs_seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + lprocfs_seq_release(inode, file); + + return 0; +} + +static ssize_t fldb_seq_write(struct file *file, const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_seq_range range; + int rc = 0; + char _buffer[MAX_FID_RANGE_STRLEN]; + char *buffer = _buffer; + ENTRY; + + param = seq->private; + if (param == NULL) + RETURN(-EINVAL); + + if (len >= sizeof(_buffer)) + RETURN(-EINVAL); + + if (copy_from_user(buffer, buf, len)) + GOTO(out, rc = -EFAULT); + buffer[len] = 0; + + /* + * format - [0x0000000200000007-0x0000000200000008):0:mdt + */ + if (*buffer != '[') + GOTO(out, rc = -EINVAL); + buffer++; + + range.lsr_start = simple_strtoull(buffer, &buffer, 0); + if (*buffer != '-') + GOTO(out, rc = -EINVAL); + buffer++; + + range.lsr_end = simple_strtoull(buffer, &buffer, 0); + if (*buffer != ')') + GOTO(out, rc = -EINVAL); + buffer++; + if (*buffer != ':') + GOTO(out, rc = -EINVAL); + buffer++; + + range.lsr_index = simple_strtoul(buffer, &buffer, 0); + if (*buffer != ':') + GOTO(out, rc = -EINVAL); + buffer++; + + if (strncmp(buffer, "mdt", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_MDT; + else if (strncmp(buffer, "ost", 3) == 0) + range.lsr_flags = LU_SEQ_RANGE_OST; + else + GOTO(out, rc = -EINVAL); + + rc = seq_server_alloc_spec(param->fsp_seq->lss_site->ss_control_seq, + &range, ¶m->fsp_env); + +out: + RETURN(rc < 0 ? 
rc : len); +} + +const struct file_operations seq_fld_proc_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .write = fldb_seq_write, + .release = fldb_seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +/* Client side procfs stuff */ +static ssize_t +lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + RETURN(count); +} + +static int +lprocfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", + PRANGE(&seq->lcs_space)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static ssize_t +lprocfs_client_fid_width_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; + __u64 max; + int rc; + __s64 val; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) { + GOTO(out_unlock, count = rc); + } + + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max && val > 0) { + seq->lcs_width = val; + + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", + seq->lcs_name, seq->lcs_width); + } else { + GOTO(out_unlock, count = -ERANGE); + } + +out_unlock: + mutex_unlock(&seq->lcs_mutex); + RETURN(count); +} + +static int +lprocfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, "%llu\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +lprocfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(0); +} + +static int +lprocfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)m->private; + struct client_obd *cli; + ENTRY; + + LASSERT(seq != NULL); + + if (seq->lcs_exp != NULL) { + cli = &seq->lcs_exp->exp_obd->u.cli; + seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); + } else { + seq_printf(m, "%s\n", seq->lcs_srv->lss_name); + } + RETURN(0); +} + +LPROC_SEQ_FOPS(lprocfs_client_fid_space); +LPROC_SEQ_FOPS(lprocfs_client_fid_width); +LPROC_SEQ_FOPS_RO(lprocfs_client_fid_server); +LPROC_SEQ_FOPS_RO(lprocfs_client_fid_fid); + +struct lprocfs_vars seq_client_proc_list[] = { + { .name = "space", + .fops = &lprocfs_client_fid_space_fops }, + { .name = "width", + .fops = &lprocfs_client_fid_width_fops }, + { .name = "server", + .fops = &lprocfs_client_fid_server_fops }, + { .name = "fid", + .fops = &lprocfs_client_fid_fid_fops }, + { NULL } +}; + +#endif /* CONFIG_PROC_FS */ diff --git 
a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c new file mode 100644 index 0000000000000..9b46feed04e72 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -0,0 +1,554 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold) +{ + struct fld_cache *cache; + ENTRY; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, + sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. */ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + RETURN(cache); +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + __u64 pct; + ENTRY; + + LASSERT(cache != NULL); + fld_cache_flush(cache); + + if (cache->fci_stat.fst_count > 0) { + pct = cache->fci_stat.fst_cache * 100; + do_div(pct, cache->fci_stat.fst_count); + } else { + pct = 0; + } + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); + + OBD_FREE_PTR(cache); + + EXIT; +} + +/** + * delete given node from list. + */ +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. 
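+ *
+ * For example (hypothetical values), two adjacent entries for the same
+ * target, [0x100 - 0x200) and [0x200 - 0x300) with identical lsr_index and
+ * lsr_flags, are collapsed below into a single [0x100 - 0x300) entry, and
+ * fully identical ranges are reduced to one.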
+ */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + ENTRY; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(lu_seq_range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } + + EXIT; +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. + */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + struct fld_cache_entry *flde; + struct list_head *curr; + int num = 0; + ENTRY; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + RETURN(0); + + curr = cache->fci_lru.prev; + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && curr != &cache->fci_lru) { + + flde = list_entry(curr, struct fld_cache_entry, fce_lru); + curr = curr->prev; + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by " + "%d entries\n", cache->fci_name, num); + + RETURN(0); +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + ENTRY; + + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); + + EXIT; +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. 
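+ *
+ * For example (hypothetical values), if the cache holds [0x100 - 0x500)
+ * for lsr_index 0 and a new entry [0x200 - 0x300) for lsr_index 1 arrives,
+ * the code below leaves three entries:
+ *
+ *	[0x100 - 0x200) index 0
+ *	[0x200 - 0x300) index 1
+ *	[0x300 - 0x500) index 0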
+ */ + +static void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + ENTRY; + OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + EXIT; + /* overlap is not allowed, so dont mess up list. */ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* f_new = *range */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ + EXIT; +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const u64 new_start = range->lsr_start; + const u64 new_end = range->lsr_end; + const u32 mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlaping with next range. */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. */ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range),PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(lu_seq_range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + RETURN(ERR_PTR(-ENOMEM)); + + f_new->fce_range = *range; + RETURN(f_new); +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. 
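+ *
+ * A hypothetical usage sketch (values are illustrative; most callers go
+ * through the locking wrapper fld_cache_insert() defined below):
+ *
+ *	struct lu_seq_range range = {
+ *		.lsr_start	= 0x200000400ULL,
+ *		.lsr_end	= 0x200000800ULL,
+ *		.lsr_index	= 0,
+ *		.lsr_flags	= LU_SEQ_RANGE_MDT,
+ *	};
+ *
+ *	rc = fld_cache_insert(cache, &range);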
+ */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const u64 new_start = f_new->fce_range.lsr_start; + const u64 new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + ENTRY; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. + */ + + if (!cache->fci_no_shrink) + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + RETURN(0); +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + RETURN(PTR_ERR(flde)); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + RETURN(rc); +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * Delete FLD entry in FLD cache. + * + */ +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + write_lock(&cache->fci_lock); + fld_cache_delete_nolock(cache, range); + write_unlock(&cache->fci_lock); +} + +struct fld_cache_entry * +fld_cache_entry_lookup_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *got = NULL; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry(flde, head, fce_list) { + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + got = flde; + break; + } + } + + RETURN(got); +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +struct fld_cache_entry * +fld_cache_entry_lookup(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *got = NULL; + ENTRY; + + read_lock(&cache->fci_lock); + got = fld_cache_entry_lookup_nolock(cache, range); + read_unlock(&cache->fci_lock); + + RETURN(got); +} + +/** + * lookup \a seq sequence for range in fld cache. 
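+ *
+ * On a hit the covering range is copied to \a range and 0 is returned,
+ * otherwise -ENOENT. A hypothetical caller (mdt_index is a placeholder):
+ *
+ *	struct lu_seq_range range;
+ *
+ *	if (fld_cache_lookup(cache, fid_seq(fid), &range) == 0)
+ *		mdt_index = range.lsr_index;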
+ */ +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + ENTRY; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (lu_seq_range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + RETURN(0); + } + } + read_unlock(&cache->fci_lock); + RETURN(-ENOENT); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c new file mode 100644 index 0000000000000..375070464cd85 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -0,0 +1,504 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_handler.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + * Author: WangDi + * Author: Pravin Shelar + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/* context key constructor/destructor: fld_key_init, fld_key_fini */ +LU_KEY_INIT_FINI(fld, struct fld_thread_info); + +/* context key: fld_thread_key */ +/* MGS thread may create llog file causing FLD lookup */ +LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD); + +int fld_server_mod_init(void) +{ + LU_CONTEXT_KEY_INIT(&fld_thread_key); + return lu_context_key_register(&fld_thread_key); +} + +void fld_server_mod_exit(void) +{ + lu_context_key_degister(&fld_thread_key); +} + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th) +{ + int rc; + + rc = fld_declare_index_create(env, fld, range, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_declare_server_create); + +/** + * Insert FLD index entry and update FLD cache. + * + * This function is called from the sequence allocator when a super-sequence + * is granted to a server. 
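+ *
+ * The expected calling pattern mirrors seq_store_update() in fid_store.c
+ * above: the index insertion is declared before the transaction is started
+ * and executed once it is running, e.g.
+ *
+ *	rc = fld_declare_server_create(env, fld, range, th);
+ *	...
+ *	rc = fld_server_create(env, fld, range, th);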
+ */ +int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *range, struct thandle *th) +{ + int rc; + + mutex_lock(&fld->lsf_lock); + rc = fld_index_create(env, fld, range, th); + mutex_unlock(&fld->lsf_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_create); + +/** + * Extract index information from fld name like srv-fsname-MDT0000 + **/ +int fld_name_to_index(const char *name, __u32 *index) +{ + char *dash; + int rc; + ENTRY; + + CDEBUG(D_INFO, "get index from %s\n", name); + dash = strrchr(name, '-'); + if (dash == NULL) + RETURN(-EINVAL); + dash++; + rc = target_name2index(dash, index, NULL); + RETURN(rc); +} + +/** + * Retrieve fldb entry from MDT0 and add to local FLDB and cache. + **/ +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld) +{ + struct fld_thread_info *info; + struct lu_seq_range *range; + struct lu_seq_range_array *lsra; + __u32 index; + struct ptlrpc_request *req; + int rc; + int i; + ENTRY; + + /* Update only happens during initalization, i.e. local FLDB + * does not exist yet */ + if (!fld->lsf_new) + RETURN(0); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + /* No need update fldb for MDT0 */ + if (index == 0) + RETURN(0); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + range->lsr_index = index; + fld_range_set_mdt(range); + + do { + rc = fld_client_rpc(fld->lsf_control_exp, range, FLD_READ, + &req); + if (rc != 0 && rc != -EAGAIN) + GOTO(out, rc); + + LASSERT(req != NULL); + lsra = (struct lu_seq_range_array *)req_capsule_server_get( + &req->rq_pill, &RMF_GENERIC_DATA); + if (lsra == NULL) + GOTO(out, rc = -EPROTO); + + range_array_le_to_cpu(lsra, lsra); + for (i = 0; i < lsra->lsra_count; i++) { + int rc1; + + if (lsra->lsra_lsr[i].lsr_flags != LU_SEQ_RANGE_MDT) + GOTO(out, rc = -EINVAL); + + if (lsra->lsra_lsr[i].lsr_index != index) + GOTO(out, rc = -EINVAL); + + mutex_lock(&fld->lsf_lock); + rc1 = fld_insert_entry(env, fld, &lsra->lsra_lsr[i]); + mutex_unlock(&fld->lsf_lock); + + if (rc1 != 0) + GOTO(out, rc = rc1); + } + if (rc == -EAGAIN) + *range = lsra->lsra_lsr[lsra->lsra_count - 1]; + } while (rc == -EAGAIN); + + fld->lsf_new = 1; +out: + if (req != NULL) + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_update_from_controller); + +/** + * Lookup sequece in local cache/fldb. + **/ +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *erange; + struct fld_thread_info *info; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + erange = &info->fti_lrange; + + /* Lookup it in the cache. */ + rc = fld_cache_lookup(fld->lsf_cache, seq, erange); + if (rc == 0) { + if (unlikely(fld_range_type(erange) != fld_range_type(range) && + !fld_range_is_any(range))) { + CERROR("%s: FLD cache range "DRANGE" does not match" + "requested flag %x: rc = %d\n", fld->lsf_name, + PRANGE(erange), range->lsr_flags, -EIO); + RETURN(-EIO); + } + *range = *erange; + RETURN(0); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_local_lookup); + +/** + * Lookup MDT/OST by seq, returns a range for given seq. + * + * If that entry is not cached in fld cache, request is sent to super + * sequence controller node (MDT0). All other MDT[1...N] and client + * cache fld entries, but this cache is not persistent. 
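+ *
+ * A hypothetical caller resolving the sequence of a fid to an MDT index
+ * (mdt_index is a placeholder variable):
+ *
+ *	struct lu_seq_range range = { 0 };
+ *
+ *	fld_range_set_mdt(&range);
+ *	rc = fld_server_lookup(env, fld, fid_seq(fid), &range);
+ *	if (rc == 0)
+ *		mdt_index = range.lsr_index;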
+ */ +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + __u32 index; + int rc; + ENTRY; + + rc = fld_local_lookup(env, fld, seq, range); + if (likely(rc == 0)) + RETURN(rc); + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + RETURN(rc); + + if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { + /* On server side, all entries should be in cache. + * If we can not find it in cache, just return error */ + CERROR("%s: Cannot find sequence %#llx: rc = %d\n", + fld->lsf_name, seq, -ENOENT); + RETURN(-ENOENT); + } else { + if (fld->lsf_control_exp == NULL) { + CERROR("%s: lookup %#llx, but not connects to MDT0" + "yet: rc = %d.\n", fld->lsf_name, seq, -EIO); + RETURN(-EIO); + } + /* send request to mdt0 i.e. super seq. controller. + * This is temporary solution, long term solution is fld + * replication on all mdt servers. + */ + range->lsr_start = seq; + rc = fld_client_rpc(fld->lsf_control_exp, + range, FLD_QUERY, NULL); + if (rc == 0) + fld_cache_insert(fld->lsf_cache, range); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_lookup); + +/** + * All MDT server handle fld lookup operation. But only MDT0 has fld index. + * if entry is not found in cache we need to forward lookup request to MDT0 + */ +static int fld_handle_lookup(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; + + ENTRY; + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (in == NULL) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (out == NULL) + RETURN(err_serious(-EPROTO)); + *out = *in; + + fld = lu_site2seq(site)->ss_server_fld; + + rc = fld_server_lookup(tsi->tsi_env, fld, in->lsr_start, out); + + CDEBUG(D_INFO, "%s: FLD req handle: error %d (range: "DRANGE")\n", + fld->lsf_name, rc, PRANGE(out)); + + RETURN(rc); +} + +static int fld_handle_read(struct tgt_session_info *tsi) +{ + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); + + in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); + if (in == NULL) + RETURN(err_serious(-EPROTO)); + + req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, + PAGE_SIZE); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (unlikely(rc != 0)) + RETURN(err_serious(rc)); + + data = req_capsule_server_get(tsi->tsi_pill, &RMF_GENERIC_DATA); + + rc = fld_server_read(tsi->tsi_env, lu_site2seq(site)->ss_server_fld, + in, data, PAGE_SIZE); + RETURN(rc); +} + +static int fld_handle_query(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + req_capsule_set(tsi->tsi_pill, &RQF_FLD_QUERY); + + rc = fld_handle_lookup(tsi); + + RETURN(rc); +} + +/* + * Returns true, if fid is local to this server node. + * + * WARNING: this function is *not* guaranteed to return false if fid is + * remote: it makes an educated conservative guess only. + * + * fid_is_local() is supposed to be used in assertion checks only. 
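+ *
+ * i.e. callers should only ever assert with it, along the lines of
+ * LASSERT(fid_is_local(env, site, fid)), and never branch on the result.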
+ */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid) +{ + int result; + struct seq_server_site *ss_site; + struct lu_seq_range *range; + struct fld_thread_info *info; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + + result = 1; /* conservatively assume fid is local */ + ss_site = lu_site2seq(site); + if (ss_site->ss_client_fld != NULL) { + int rc; + + rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, + fid_seq(fid), range); + if (rc == 0) + result = (range->lsr_index == ss_site->ss_node_id); + } + return result; +} + +static void fld_server_proc_fini(struct lu_server_fld *fld); + +#ifdef CONFIG_PROC_FS +static int fld_server_proc_init(struct lu_server_fld *fld) +{ + int rc = 0; + ENTRY; + + fld->lsf_proc_dir = lprocfs_register(fld->lsf_name, fld_type_proc_dir, + fld_server_proc_list, fld); + if (IS_ERR(fld->lsf_proc_dir)) { + rc = PTR_ERR(fld->lsf_proc_dir); + RETURN(rc); + } + + rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444, + &fld_proc_seq_fops, fld); + if (rc) { + lprocfs_remove(&fld->lsf_proc_dir); + fld->lsf_proc_dir = NULL; + } + + RETURN(rc); +} + +static void fld_server_proc_fini(struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_proc_dir != NULL) { + if (!IS_ERR(fld->lsf_proc_dir)) + lprocfs_remove(&fld->lsf_proc_dir); + fld->lsf_proc_dir = NULL; + } + EXIT; +} +#else +static int fld_server_proc_init(struct lu_server_fld *fld) +{ + return 0; +} + +static void fld_server_proc_fini(struct lu_server_fld *fld) +{ + return; +} +#endif + +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type) +{ + int cache_size, cache_threshold; + int rc; + + ENTRY; + + snprintf(fld->lsf_name, sizeof(fld->lsf_name), "srv-%s", prefix); + + cache_size = FLD_SERVER_CACHE_SIZE / sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100; + + mutex_init(&fld->lsf_lock); + fld->lsf_cache = fld_cache_init(fld->lsf_name, cache_size, + cache_threshold); + if (IS_ERR(fld->lsf_cache)) { + rc = PTR_ERR(fld->lsf_cache); + fld->lsf_cache = NULL; + RETURN(rc); + } + + rc = fld_index_init(env, fld, dt, type); + if (rc) + GOTO(out_cache, rc); + + rc = fld_server_proc_init(fld); + if (rc) + GOTO(out_index, rc); + + fld->lsf_control_exp = NULL; + fld->lsf_seq_lookup = fld_server_lookup; + + fld->lsf_seq_lookup = fld_server_lookup; + RETURN(0); +out_index: + fld_index_fini(env, fld); +out_cache: + fld_cache_fini(fld->lsf_cache); + return rc; +} +EXPORT_SYMBOL(fld_server_init); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + + fld_server_proc_fini(fld); + fld_index_fini(env, fld); + + if (fld->lsf_cache != NULL) { + if (!IS_ERR(fld->lsf_cache)) + fld_cache_fini(fld->lsf_cache); + fld->lsf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_server_fini); + +struct tgt_handler fld_handlers[] = { +TGT_FLD_HDL_VAR(0, FLD_QUERY, fld_handle_query), +TGT_FLD_HDL_VAR(0, FLD_READ, fld_handle_read), +}; +EXPORT_SYMBOL(fld_handlers); diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c new file mode 100644 index 0000000000000..fa9ca9427f22f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c @@ -0,0 +1,511 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_index.c + * + * Author: WangDi + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +static const char fld_index_name[] = "fld"; + +static const struct lu_seq_range IGIF_FLD_RANGE = { + .lsr_start = FID_SEQ_IGIF, + .lsr_end = FID_SEQ_IGIF_MAX + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = { + .lsr_start = FID_SEQ_DOT_LUSTRE, + .lsr_end = FID_SEQ_DOT_LUSTRE + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range ROOT_FLD_RANGE = { + .lsr_start = FID_SEQ_ROOT, + .lsr_end = FID_SEQ_ROOT + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct dt_index_features fld_index_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(u64), + .dif_keysize_max = sizeof(u64), + .dif_recsize_min = sizeof(struct lu_seq_range), + .dif_recsize_max = sizeof(struct lu_seq_range), + .dif_ptrsize = 4 +}; + +extern struct lu_context_key fld_thread_key; + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th) +{ + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + tmp = &info->fti_irange; + memset(range, 0, sizeof(*range)); + + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc == 0) { + /* In case of duplicate entry, the location must be same */ + LASSERT((lu_seq_range_compare_loc(new_range, range) == 0)); + GOTO(out, rc = -EEXIST); + } + + if (rc != -ENOENT) { + CERROR("%s: lookup range "DRANGE" error: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + + /* Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. 
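+	 * For example, if this index already owns [0x400, 0x500) and the
+	 * new range is [0x500, 0x600), the old record is declared for
+	 * deletion and a single widened record [0x400, 0x600) is declared
+	 * for insertion below.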
*/ + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + lu_seq_range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_declare_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc) { + CERROR("%s: declare record "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + *tmp = *new_range; + tmp->lsr_start = range->lsr_start; + } else { + *tmp = *new_range; + } + + range_cpu_to_be(tmp, tmp); + rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th); +out: + RETURN(rc); +} + +/** + * insert range in fld store. + * + * \param range range to be inserted + * \param th transaction for this operation as it could compound + * transaction. + * + * \retval 0 success + * \retval -ve error + * + * The whole fld index insertion is protected by seq->lss_mutex (see + * seq_server_alloc_super), i.e. only one thread will access fldb each + * time, so we do not need worry the fld file and cache will being + * changed between declare and create. + * Because the fld entry can only be increamental, so we will only check + * whether it can be merged from the left. + * + * Caller must hold fld->lsf_lock + **/ +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th) +{ + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + tmp = &info->fti_irange; + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc != -ENOENT) { + rc = rc == 0 ? -EEXIST : rc; + GOTO(out, rc); + } + + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + lu_seq_range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc != 0) + GOTO(out, rc); + *tmp = *new_range; + tmp->lsr_start = range->lsr_start; + deleted = 1; + } else { + *tmp = *new_range; + } + + range_cpu_to_be(tmp, tmp); + rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th, 1); + if (rc != 0) { + CERROR("%s: insert range "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(new_range), rc); + GOTO(out, rc); + } + + flde = fld_cache_entry_create(new_range); + if (IS_ERR(flde)) + GOTO(out, rc = PTR_ERR(flde)); + + write_lock(&fld->lsf_cache->fci_lock); + if (deleted) + fld_cache_delete_nolock(fld->lsf_cache, new_range); + rc = fld_cache_insert_nolock(fld->lsf_cache, flde); + write_unlock(&fld->lsf_cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); +out: + RETURN(rc); +} + +/** + * lookup range for a seq passed. note here we only care about the start/end, + * caller should handle the attached location data (flags, index). + * + * \param seq seq for lookup. + * \param range result of lookup. 
+ * + * \retval 0 found, \a range is the matched range; + * \retval -ENOENT not found, \a range is the left-side range; + * \retval -ve other error; + */ +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range) +{ + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + fld_rec = &info->fti_rec; + + rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec); + if (rc == 0) { + *range = *fld_rec; + if (lu_seq_range_within(range, seq)) + rc = 0; + else + rc = -ENOENT; + } + + CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n", + fld->lsf_name, seq, PRANGE(range), rc); + + RETURN(rc); +} + +/** + * insert entry in fld store. + * + * \param env relevant lu_env + * \param fld fld store + * \param range range to be inserted + * + * \retval 0 success + * \retval -ve error + * + * Caller must hold fld->lsf_lock + **/ + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range) +{ + struct thandle *th; + struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); + int rc; + ENTRY; + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + if (dt->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = fld_declare_index_create(env, fld, range, th); + if (rc != 0) { + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out, rc); + + rc = fld_index_create(env, fld, range, th); + if (rc == -EEXIST) + rc = 0; +out: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_insert_entry); + +static int fld_insert_special_entries(const struct lu_env *env, + struct lu_server_fld *fld) +{ + int rc; + + rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE); + + RETURN(rc); +} + +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type) +{ + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + __u32 index; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + + lu_local_obj_fid(&fid, FLD_INDEX_OID); + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | 0666; + dof.dof_type = DFT_INDEX; + dof.u.dof_idx.di_feat = &fld_index_features; + + dt_obj = dt_locate(env, dt, &fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out, rc); + } + + LASSERT(dt_obj != NULL); + if (!dt_object_exists(dt_obj)) { + dt_object_put(env, dt_obj); + dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr); + fld->lsf_new = 1; + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name, + fld_index_name, rc); + dt_obj = NULL; + GOTO(out, rc); + } + } + + fld->lsf_obj = dt_obj; + rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features); + if (rc != 0) { + CERROR("%s: File \"%s\" is not an index: rc = %d!\n", + fld->lsf_name, fld_index_name, rc); + GOTO(out, rc); + } + + range = &info->fti_rec; + 
/* Load fld entry to cache */ + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + GOTO(out, rc = PTR_ERR(it)); + + rc = iops->load(env, it, 0); + if (rc < 0) + GOTO(out_it_fini, rc); + + if (rc > 0) { + /* Load FLD entry into server cache */ + do { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + LASSERT(range != NULL); + range_be_to_cpu(range, range); + rc = fld_cache_insert(fld->lsf_cache, range); + if (rc != 0) + GOTO(out_it_put, rc); + rc = iops->next(env, it); + } while (rc == 0); + } else { + fld->lsf_new = 1; + } + + rc = fld_name_to_index(fld->lsf_name, &index); + if (rc < 0) + GOTO(out_it_put, rc); + else + rc = 0; + + if (index == 0 && type == LU_SEQ_RANGE_MDT) { + /* Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB */ + mutex_lock(&fld->lsf_lock); + rc = fld_insert_special_entries(env, fld); + mutex_unlock(&fld->lsf_lock); + if (rc != 0) { + CERROR("%s: insert special entries failed!: rc = %d\n", + fld->lsf_name, rc); + GOTO(out_it_put, rc); + } + } +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); +out: + if (attr != NULL) + OBD_FREE_PTR(attr); + + if (rc < 0) { + if (dt_obj != NULL) + dt_object_put(env, dt_obj); + fld->lsf_obj = NULL; + } + RETURN(rc); +} + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_obj != NULL) { + if (!IS_ERR(fld->lsf_obj)) + dt_object_put(env, fld->lsf_obj); + fld->lsf_obj = NULL; + } + EXIT; +} + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len) +{ + struct lu_seq_range_array *lsra = data; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + + ENTRY; + + lsra->lsra_count = 0; + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, range->lsr_end); + if (rc <= 0) + GOTO(out_it_fini, rc); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + entry = &info->fti_rec; + do { + rc = iops->rec(env, it, (struct dt_rec *)entry, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + if (offsetof(typeof(*lsra), lsra_lsr[lsra->lsra_count + 1]) > + data_len) + GOTO(out, rc = -EAGAIN); + + range_be_to_cpu(entry, entry); + if (entry->lsr_index == range->lsr_index && + entry->lsr_flags == range->lsr_flags && + entry->lsr_start > range->lsr_start) { + lsra->lsra_lsr[lsra->lsra_count] = *entry; + lsra->lsra_count++; + } + + rc = iops->next(env, it); + } while (rc == 0); + if (rc > 0) + rc = 0; +out: + range_array_cpu_to_le(lsra, lsra); +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h new file mode 100644 index 0000000000000..dcb24a3c2f22a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -0,0 +1,236 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_internal.h + * + * Subsystem Description: + * FLD is FID Location Database, which stores where (IE, on which MDT) + * FIDs are located. + * The database is basically a record file, each record consists of a FID + * sequence range, MDT/OST index, and flags. The FLD for the whole FS + * is only stored on the sequence controller(MDT0) right now, but each target + * also has its local FLD, which only stores the local sequence. + * + * The FLD subsystem usually has two tasks: + * 1. maintain the database, i.e. when the sequence controller allocates + * new sequence ranges to some nodes, it will call the FLD API to insert the + * location information in FLDB. + * + * 2. Handle requests from other nodes, i.e. if client needs to know where + * the FID is located, if it can not find the information in the local cache, + * it will send a FLD lookup RPC to the FLD service, and the FLD service will + * look up the FLDB entry and return the location information to client. + * + * Author: Yury Umanets + * Author: Tom WangDi + */ +#ifndef __FLD_INTERNAL_H +#define __FLD_INTERNAL_H + +#include +#include +#include +#include + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; +}; + +typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64); + +typedef struct lu_fld_target * +(*fld_scan_func_t) (struct lu_client_fld *, __u64); + +struct lu_fld_hash { + const char *fh_name; + fld_hash_func_t fh_hash_func; + fld_scan_func_t fh_scan_func; +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + rwlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Prefered number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by \a fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. */ + char fci_name[80]; + unsigned int fci_no_shrink:1; +}; + +enum { + /* 4M of FLD cache will not hurt client a lot. */ + FLD_SERVER_CACHE_SIZE = (4 * 0x100000), + + /* 1M of FLD cache will not hurt client a lot. */ + FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) +}; + +enum { + /* Cache threshold is 10 percent of size. */ + FLD_SERVER_CACHE_THRESHOLD = 10, + + /* Cache threshold is 10 percent of size. 
*/ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + + +#ifdef CONFIG_PROC_FS +extern struct proc_dir_entry *fld_type_proc_dir; +extern struct lprocfs_vars fld_client_proc_list[]; +#endif + +# ifdef HAVE_SERVER_SUPPORT +struct fld_thread_info { + struct lu_seq_range fti_rec; + struct lu_seq_range fti_lrange; + struct lu_seq_range fti_irange; +}; + +extern struct lu_context_key fld_thread_key; + +struct dt_device; +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, int type); + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th); + +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th); + +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_name_to_index(const char *name, __u32 *index); +int fld_server_mod_init(void); + +void fld_server_mod_exit(void); + +int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, void *data, int data_len); +#ifdef CONFIG_PROC_FS +extern const struct file_operations fld_proc_seq_fops; +extern struct lprocfs_vars fld_server_proc_list[]; +#endif + +# endif /* HAVE_SERVER_SUPPORT */ + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op, + struct ptlrpc_request **reqp); + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const u64 seq, struct lu_seq_range *range); + +struct fld_cache_entry * +fld_cache_entry_lookup(struct fld_cache *cache, + const struct lu_seq_range *range); + +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node); + +struct fld_cache_entry * +fld_cache_entry_lookup_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); + +static inline const char * +fld_target_name(const struct lu_fld_target *tar) +{ +#ifdef HAVE_SERVER_SUPPORT + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; +#endif + + return tar->ft_exp->exp_obd->obd_name; +} + +#endif /* __FLD_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c new file mode 100644 index 0000000000000..19b5789c19851 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -0,0 +1,570 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_request.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) +{ + LASSERT(fld->lcf_count > 0); + return do_div(seq, fld->lcf_count); +} + +static struct lu_fld_target * +fld_rrb_scan(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + int hash; + ENTRY; + + /* Because almost all of special sequence located in MDT0, + * it should go to index 0 directly, instead of calculating + * hash again, and also if other MDTs is not being connected, + * the fld lookup requests(for seq on MDT0) should not be + * blocked because of other MDTs */ + if (fid_seq_is_norm(seq)) + hash = fld_rrb_hash(fld, seq); + else + hash = 0; + +again: + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == hash) + RETURN(target); + } + + if (hash != 0) { + /* It is possible the remote target(MDT) are not connected to + * with client yet, so we will refer this to MDT0, which should + * be connected during mount */ + hash = 0; + goto again; + } + + CERROR("%s: Can't find target by hash %d (seq %#llx). " + "Targets (%d):\n", fld->lcf_name, hash, seq, + fld->lcf_count); + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; + + CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. + */ + LBUG(); + RETURN(NULL); +} + +struct lu_fld_hash fld_hash[] = { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { + NULL, + } +}; + +static struct lu_fld_target * +fld_client_get_target(struct lu_client_fld *fld, u64 seq) +{ + struct lu_fld_target *target; + ENTRY; + + LASSERT(fld->lcf_hash != NULL); + + spin_lock(&fld->lcf_lock); + target = fld->lcf_hash->fh_scan_func(fld, seq); + spin_unlock(&fld->lcf_lock); + + if (target != NULL) { + CDEBUG(D_INFO, "%s: Found target (idx %llu" + ") by seq %#llx\n", fld->lcf_name, + target->ft_idx, seq); + } + + RETURN(target); +} + +/* + * Add export to FLD. This is usually done by CMM and LMV as they are main users + * of FLD module. 
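+ *
+ * A rough sketch of the client-side wiring (identifiers illustrative,
+ * error handling trimmed): one lu_fld_target is filled per metadata
+ * target (ft_exp is the export to that MDT, ft_idx its index, and the
+ * hash argument 0 picks the RRB entry of fld_hash[]); fld_client_lookup()
+ * then maps a sequence to an MDT index:
+ *
+ *	struct lu_fld_target tgt = {
+ *		.ft_exp	= exp,
+ *		.ft_srv	= NULL,
+ *		.ft_idx	= mdt_index,
+ *	};
+ *
+ *	rc = fld_client_init(fld, fsname, 0);
+ *	if (rc == 0)
+ *		rc = fld_client_add_target(fld, &tgt);
+ *	...
+ *	rc = fld_client_lookup(fld, seq, &mds, LU_SEQ_RANGE_MDT, env);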
+ */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + ENTRY; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, + name, tar->ft_idx); + + OBD_ALLOC_PTR(target); + if (target == NULL) + RETURN(-ENOMEM); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#%llu\n", + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp != NULL) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) +{ + struct lu_fld_target *target, *tmp; + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + RETURN(0); + } + } + spin_unlock(&fld->lcf_lock); + RETURN(-ENOENT); +} + +#ifdef CONFIG_PROC_FS +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + int rc; + ENTRY; + + fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, fld_type_proc_dir, + NULL, NULL); + if (IS_ERR(fld->lcf_proc_dir)) { + CERROR("%s: LProcFS failed in fld-init\n", + fld->lcf_name); + rc = PTR_ERR(fld->lcf_proc_dir); + RETURN(rc); + } + + rc = lprocfs_add_vars(fld->lcf_proc_dir, fld_client_proc_list, fld); + if (rc) { + CERROR("%s: Can't init FLD proc, rc %d\n", + fld->lcf_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + fld_client_proc_fini(fld); + return rc; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + ENTRY; + if (fld->lcf_proc_dir) { + if (!IS_ERR(fld->lcf_proc_dir)) + lprocfs_remove(&fld->lcf_proc_dir); + fld->lcf_proc_dir = NULL; + } + EXIT; +} +#else /* !CONFIG_PROC_FS */ +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + return 0; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +EXPORT_SYMBOL(fld_client_proc_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc; + ENTRY; + + LASSERT(fld != NULL); + + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = 
PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } + + rc = fld_client_proc_init(fld); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if (fld->lcf_cache != NULL) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_client_fini); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op, + struct ptlrpc_request **reqp) +{ + struct ptlrpc_request *req = NULL; + struct lu_seq_range *prange; + __u32 *op; + int rc = 0; + struct obd_import *imp; + ENTRY; + + LASSERT(exp != NULL); + +again: + imp = class_exp2cliimp(exp); + switch (fld_op) { + case FLD_QUERY: + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, + LUSTRE_MDS_VERSION, FLD_QUERY); + if (req == NULL) + RETURN(-ENOMEM); + + /* XXX: only needed when talking to old server(< 2.6), it should + * be removed when < 2.6 server is not supported */ + op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); + *op = FLD_LOOKUP; + + /* For MDS_MDS seq lookup, it will always use LWP connection, + * but LWP will be evicted after restart, so cause the error. + * so we will set no_delay for seq lookup request, once the + * request fails because of the eviction. always retry here */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + req->rq_allow_replay = 1; + req->rq_no_delay = 1; + } + break; + case FLD_READ: + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, + LUSTRE_MDS_VERSION, FLD_READ); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, + RCL_SERVER, PAGE_SIZE); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) + RETURN(rc); + + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; + ptlrpc_request_set_replen(req); + req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + obd_get_request_slot(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(&exp->exp_obd->u.cli); + + if (rc == -ENOENT) { + /* Don't loop forever on non-existing FID sequences. */ + GOTO(out_req, rc); + } + + if (rc != 0) { + if (imp->imp_state != LUSTRE_IMP_CLOSED && + !imp->imp_deactive && + imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && + OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && + rc != -ENOTSUPP) { + /* Since LWP is not replayable, so it will keep + * trying unless umount happens or the remote + * target does not support the operation, otherwise + * it would cause unecessary failure of the + * application. 
*/ + ptlrpc_req_finished(req); + rc = 0; + goto again; + } + GOTO(out_req, rc); + } + + if (fld_op == FLD_QUERY) { + prange = req_capsule_server_get(&req->rq_pill, + &RMF_FLD_MDFLD); + if (prange == NULL) + GOTO(out_req, rc = -EFAULT); + *range = *prange; + } + + EXIT; +out_req: + if (rc != 0 || reqp == NULL) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (reqp != NULL) + *reqp = req; + + return rc; +} + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env) +{ + struct lu_seq_range res = { 0 }; + struct lu_fld_target *target; + struct lu_fld_target *origin; + int rc; + ENTRY; + + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_index; + RETURN(0); + } + + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); + origin = target; +again: + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on " + "target %s (idx %llu)\n", fld->lcf_name, seq, + fld_target_name(target), target->ft_idx); + + res.lsr_start = seq; + fld_range_set_type(&res, flags); + +#ifdef HAVE_SERVER_SUPPORT + if (target->ft_srv != NULL) { + LASSERT(env != NULL); + rc = fld_server_lookup(env, target->ft_srv, seq, &res); + } else +#endif /* HAVE_SERVER_SUPPORT */ + { + rc = fld_client_rpc(target->ft_exp, &res, FLD_QUERY, NULL); + } + + if (rc == -ESHUTDOWN) { + /* If fld lookup failed because the target has been shutdown, + * then try next target in the list, until trying all targets + * or fld lookup succeeds */ + spin_lock(&fld->lcf_lock); + + /* If the next entry in the list is the head of the list, + * move to the next entry after the head and retrieve + * the target. Else retreive the next target entry. */ + + if (target->ft_chain.next == &fld->lcf_targets) + target = list_entry(target->ft_chain.next->next, + struct lu_fld_target, ft_chain); + else + target = list_entry(target->ft_chain.next, + struct lu_fld_target, + ft_chain); + spin_unlock(&fld->lcf_lock); + if (target != origin) + goto again; + } + if (rc == 0) { + *mds = res.lsr_index; + fld_cache_insert(fld->lcf_cache, &res); + } + + RETURN(rc); +} +EXPORT_SYMBOL(fld_client_lookup); + +void fld_client_flush(struct lu_client_fld *fld) +{ + fld_cache_flush(fld->lcf_cache); +} + + +struct proc_dir_entry *fld_type_proc_dir; + +static int __init fld_init(void) +{ + fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(fld_type_proc_dir)) + return PTR_ERR(fld_type_proc_dir); + +#ifdef HAVE_SERVER_SUPPORT + fld_server_mod_init(); +#endif /* HAVE_SERVER_SUPPORT */ + + return 0; +} + +static void __exit fld_exit(void) +{ +#ifdef HAVE_SERVER_SUPPORT + fld_server_mod_exit(); +#endif /* HAVE_SERVER_SUPPORT */ + + if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { + lprocfs_remove(&fld_type_proc_dir); + fld_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre FID Location Database"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(fld_init); +module_exit(fld_exit); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c new file mode 100644 index 0000000000000..c7be5bf6ea97f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -0,0 +1,371 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +#ifdef CONFIG_PROC_FS +static int +fld_proc_targets_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + struct lu_fld_target *target; + ENTRY; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, &fld->lcf_targets, ft_chain) + seq_printf(m, "%s\n", fld_target_name(target)); + spin_unlock(&fld->lcf_lock); + RETURN(0); +} + +static int +fld_proc_hash_seq_show(struct seq_file *m, void *unused) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)m->private; + ENTRY; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + seq_printf(m, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} + +static ssize_t +fld_proc_hash_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct lu_client_fld *fld; + struct lu_fld_hash *hash = NULL; + char fh_name[8]; + int i; + + if (count > sizeof(fh_name)) + return -ENAMETOOLONG; + + if (copy_from_user(fh_name, buffer, count) != 0) + return -EFAULT; + + fld = ((struct seq_file *)file->private_data)->private; + LASSERT(fld != NULL); + + for (i = 0; fld_hash[i].fh_name != NULL; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash != NULL) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + return count; +} + +static ssize_t +lprocfs_cache_flush_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) +{ + struct lu_client_fld *fld = ((struct seq_file *)file->private_data)->private; + ENTRY; + + LASSERT(fld != NULL); + + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); + + RETURN(count); +} + +LPROC_SEQ_FOPS_RO(fld_proc_targets); +LPROC_SEQ_FOPS(fld_proc_hash); +LPROC_SEQ_FOPS_WO_TYPE(fld, cache_flush); + +struct lprocfs_vars fld_client_proc_list[] = { + { .name = "targets", + .fops = &fld_proc_targets_fops }, + { .name = "hash", + .fops = &fld_proc_hash_fops }, + { .name = "cache_flush", + .fops = &fld_cache_flush_fops }, + { NULL } 
+}; + +#ifdef HAVE_SERVER_SUPPORT +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + unsigned int fsp_stop:1; +}; + +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct dt_key *key; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->load(¶m->fsp_env, param->fsp_it, *pos); + if (rc <= 0) + return NULL; + + key = iops->key(¶m->fsp_env, param->fsp_it); + if (IS_ERR(key)) + return NULL; + + *pos = be64_to_cpu(*(__u64 *)key); + + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_thread_info *info; + struct lu_seq_range *fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + info = lu_context_key_get(¶m->fsp_env.le_ctx, + &fld_thread_key); + fld_rec = &info->fti_rec; + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)fld_rec, 0); + if (rc != 0) { + CERROR("%s:read record error: rc %d\n", + fld->lsf_name, rc); + } else if (fld_rec->lsr_start != 0) { + range_be_to_cpu(fld_rec, fld_rec); + seq_printf(p, DRANGE"\n", PRANGE(fld_rec)); + } + + return rc; +} + +struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct lu_server_fld *fld = (struct lu_server_fld *)PDE_DATA(inode); + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &fldb_sops); + if (rc) + GOTO(out, rc); + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0); + if (IS_ERR(param->fsp_it)) + 
GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + lprocfs_seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + lprocfs_seq_release(inode, file); + + return 0; +} + +const struct file_operations fld_proc_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .release = fldb_seq_release, +}; + +struct lprocfs_vars fld_server_proc_list[] = { + { NULL } +}; + +# endif /* HAVE_SERVER_SUPPORT */ + +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h new file mode 100644 index 0000000000000..78d09269a33c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -0,0 +1,2493 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). 
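+ *
+ * As a rough example, a single read(2) system call is represented by one
+ * cl_io against the top-level cl_object of the file; that io takes
+ * ownership of the cl_pages it covers and relies on cl_lock extents for
+ * the range being read before any transfer is issued.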
+ * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct obd_info; +struct inode; + +struct cl_device; + +struct cl_object; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req_attr; + +extern struct cfs_ptask_engine *cl_io_engine; + +/** + * Device in the client stack. + * + * \see vvp_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time64_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time64_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time64_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; + + /* nlink of the directory */ + __u64 cat_nlink; + + /* Project identifier for quota purpose. */ + __u32 cat_projid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8, + CAT_PROJID = 1 << 9 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. 
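+ *
+ * For example, the size and block count reported through cl_attr on the
+ * top-object of a striped file are merged from the per-stripe attributes
+ * of its sub-objects (see cl_object_header::coh_attr_guard below).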
+ * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see vvp_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lu_buf coc_layout; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +enum { + CL_LAYOUT_GEN_NONE = (u32)-2, /* layout lock was cancelled */ + CL_LAYOUT_GEN_EMPTY = (u32)-1, /* for empty layout */ +}; + +struct cl_layout { + /** the buffer to return the layout in lov_mds_md format. */ + struct lu_buf cl_buf; + /** size of layout in lov_mds_md format. */ + size_t cl_size; + /** Layout generation. */ + u32 cl_layout_gen; + /** whether layout is a composite one */ + bool cl_is_composite; +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). 
Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see vvp_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); + /** + * Object prune method. Called when the layout is going to change on + * this object, therefore each layer has to clean up their cache, + * mainly pages and locks. + */ + int (*coo_prune)(const struct lu_env *env, struct cl_object *obj); + /** + * Object getstripe method. + */ + int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); + /** + * Get FIEMAP mapping from the object. + */ + int (*coo_fiemap)(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen); + /** + * Get layout and generation of the object. + */ + int (*coo_layout_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *layout); + /** + * Get maximum size of the object. + */ + loff_t (*coo_maxbytes)(struct cl_object *obj); + /** + * Set request attributes. + */ + void (*coo_req_attr_set)(const struct lu_env *env, + struct cl_object *obj, + struct cl_req_attr *attr); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + + /** + * Parent object. 
It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. + */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers,\ + co_lu.lo_linkage) + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. + */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers,\ + co_lu.lo_linkage) + +/** @} cl_object */ + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. 
For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as neither references to it can be acquired at that point, nor + * ones exist. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page (which is a typedef for + * struct page) is implemented in the vvp layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). + * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. + * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). 
Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into an RPC being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page form the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED, + /** + * Page is exclusively owned by some cl_io. Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read request from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. 
+ * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO, lockless IO and liblustre. */ + CPT_TRANSIENT, +}; + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** An object this page is a part of. Immutable after creation. */ + struct cl_object *cp_obj; + /** vmpage */ + struct page *cp_vmpage; + /** Linkage of pages within group. Pages must be owned */ + struct list_head cp_batch; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + const enum cl_page_state cp_state; + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type; + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link cp_queue_ref; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. + * + * \see vvp_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + pgoff_t cpl_index; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; + /** Linkage into cl_page::cp_layers. Immutable after creation. */ + struct list_head cpl_linkage; +}; + +/** + * Lock mode. For the client extent locks. + * + * \ingroup cl_lock + */ +enum cl_lock_mode { + CLM_READ, + CLM_WRITE, + CLM_GROUP, + CLM_MAX, +}; + +/** + * Requested transfer type. + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. 
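+ *
+ * For orientation only, a rough sketch of the two ownership paths as
+ * seen from the top level (the \a env, \a io and \a pg names here are
+ * hypothetical, error handling is elided, and the usual 0-on-success
+ * convention is assumed):
+ *
+ *     if (cl_page_own(env, io, pg) == 0) {
+ *             ... this io now has exclusive access to pg ...
+ *             cl_page_disown(env, io, pg);
+ *     }
+ *
+ * or, when the VM has already locked the vmpage:
+ *
+ *     cl_page_assume(env, io, pg);
+ *     ...
+ *     cl_page_unassume(env, io, pg);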
+ * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. + * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. */ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice); + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. 
This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). + * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * ptlrpc request as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * \pre the page was queued for transferring. + * \post page is removed from client's pending list, or -EBUSY + * is returned if it has already been in transferring. + * + * This is one of seldom page operation which is: + * 0. called from top level; + * 1. don't have vmpage locked; + * 2. every layer should synchronize execution of its ->cpo_cancel() + * with completion handlers. Osc uses client obd lock for this + * purpose. Based on there is no vvp_page_cancel and + * lov_page_cancel(), cpo_cancel is defacto protected by client lock. + * + * \see osc_page_cancel(). + */ + int (*cpo_cancel)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline struct page *cl_page_vmpage(const struct cl_page *page) +{ + LASSERT(page->cp_vmpage != NULL); + return page->cp_vmpage; +} + +/** + * Check if a cl_page is in use. + * + * Client cache holds a refcount, this refcount will be dropped when + * the page is taken out of cache, see vvp_page_delete(). + */ +static inline bool __page_in_use(const struct cl_page *page, int refc) +{ + return (atomic_read(&page->cp_ref) > refc + 1); +} + +/** + * Caller itself holds a refcount of cl_page. 
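+ * As noted above, the cache itself pins one reference; together with the
+ * caller's own reference that makes two, so cl_page_in_use(pg) reports the
+ * page as busy only when cp_ref exceeds 2, i.e. some other user still
+ * holds the page.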
+ */ +#define cl_page_in_use(pg) __page_in_use(pg, 1) +/** + * Caller doesn't hold a refcount. + */ +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. + * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * Typical cl_lock consists of the two layers: + * + * - vvp_lock (vvp specific data), and + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - lovsub_lock, and + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated to), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is a cacheless data container for the requirements of locks to + * complete the IO. cl_lock is created before I/O starts and destroyed when the + * I/O is complete. + * + * cl_lock depends on LDLM lock to fulfill lock semantics. LDLM lock is attached + * to cl_lock at OSC layer. LDLM lock is still cacheable. + * + * INTERFACE AND USAGE + * + * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A + * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue() + * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock + * consists of multiple sub cl_locks, each sub locks will be enqueued + * correspondingly. At OSC layer, the lock enqueue request will tend to reuse + * cached LDLM lock; otherwise a new LDLM lock will have to be requested from + * OST side. + * + * cl_lock_cancel() must be called to release a cl_lock after use. clo_cancel() + * method will be called for each layer to release the resource held by this + * lock. At OSC layer, the reference count of LDLM lock, which is held at + * clo_enqueue time, is released. + * + * LDLM lock can only be canceled if there is no cl_lock using it. + * + * Overall process of the locking during IO operation is as following: + * + * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released after IO is complete. + * + * Striping introduces major additional complexity into locking. The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. 
+ * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divived, without + * sacrificing correctness. This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. + * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags. + */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]:%x" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Layered client lock. + */ +struct cl_lock { + /** List of slices. Immutable after creation. */ + struct list_head cll_layers; + /** lock attribute, extent, cl_object, etc. */ + struct cl_lock_descr cll_descr; +}; + +/** + * Per-layer part of cl_lock + * + * \see vvp_lock, lov_lock, lovsub_lock, osc_lock + */ +struct cl_lock_slice { + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. */ + struct cl_object *cls_obj; + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * + * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. + * + * \retval 0 this layer has enqueued the lock successfully + * \retval >0 this layer has enqueued the lock, but need to wait on + * @anchor for resources + * \retval -ve failure + * + * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), + * \see osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor); + /** + * Cancel a lock, release its DLM lock ref, while does not cancel the + * DLM lock + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} */ + /** + * Destructor. Frees resources and the slice. 
+ * + * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), + * \see osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages. + * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + unsigned pl_nr; + struct list_head pl_pages; + struct task_struct *pl_owner; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + struct cl_page_list c2_qin; + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released. + * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. 
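+ *
+ * As an illustration only, a caller typically drives this plan through the
+ * generic entry points declared later in this header (the local names below
+ * are hypothetical and error handling is elided):
+ *
+ *     rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
+ *     if (rc == 0) {
+ *             ... fill io->u.ci_rw for this request ...
+ *             rc = cl_io_loop(env, io);   (this runs steps (1)-(6) above)
+ *     }
+ *     cl_io_fini(env, io);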
+ * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. + */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ = 1, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** get data version */ + CIT_DATA_VERSION, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - glimpse. An io context to acquire glimpse lock. + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. + */ + CIT_MISC, + /** + * ladvise handling + * To give advice about access of a file + */ + CIT_LADVISE, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + +typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *, + struct cl_page *); + +struct cl_read_ahead { + /* Maximum page index the readahead window will end. + * This is determined DLM lock coverage, RPC and stripe boundary. + * cra_end is included. */ + pgoff_t cra_end; + /* optimal RPC size for this read, by pages */ + unsigned long cra_rpc_size; + /* Release callback. If readahead holds resources underneath, this + * function should be called to release it. */ + void (*cra_release)(const struct lu_env *env, void *cbdata); + /* Callback data for cra_release routine */ + void *cra_cbdata; +}; + +static inline void cl_read_ahead_release(const struct lu_env *env, + struct cl_read_ahead *ra) +{ + if (ra->cra_release != NULL) + ra->cra_release(env, ra->cra_cbdata); + memset(ra, 0, sizeof(*ra)); +} + + +/** + * Per-layer io operations. 
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. + * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. + * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. + */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. Return + * non-zero if failed to submit even the single page. If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + /** + * Queue async page for write. + * The difference between cio_submit and cio_queue is that + * cio_submit is for urgent request. 
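+ * ("cio_queue" above presumably refers to this method, cio_commit_async,
+ * which batches pages for later write-out and invokes \a cb on each page
+ * as it is committed, rather than starting the transfer immediately.)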
+ */ + int (*cio_commit_async)(const struct lu_env *env, + const struct cl_io_slice *slice, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); + /** + * Decide maximum read ahead extent + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_ahead)(const struct lu_env *env, + const struct cl_io_slice *slice, + pgoff_t start, struct cl_read_ahead *ra); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EWOULDBLOCK is returned immediately. + */ + CEF_NONBLOCK = 0x00000001, + /** + * take lock asynchronously (out of order), as it cannot + * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. + */ + CEF_ASYNC = 0x00000002, + /** + * tell the server to instruct (though a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must be never converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. + * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * for async glimpse lock. + */ + CEF_AGL = 0x00000020, + /** + * enqueue a lock to test DLM lock existence. + */ + CEF_PEEK = 0x00000040, + /** + * Lock match only. Used by group lock in I/O as group lock + * is known to exist. + */ + CEF_LOCK_MATCH = 0x00000080, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x000000ff, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; +#define cill_descr cill_lock.cll_descr + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. 
Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_range { + loff_t cir_pos; + size_t cir_count; +}; + +struct cl_io_pt { + struct cl_io_pt *cip_next; + struct cfs_ptask cip_task; + struct kiocb cip_iocb; + struct iov_iter cip_iter; + struct file *cip_file; + enum cl_io_type cip_iot; + loff_t cip_pos; + size_t cip_count; + ssize_t cip_result; +}; + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO. It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part of. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. 
*/ + enum cl_io_lock_dmd ci_lockreq; + union { + struct cl_rw_io { + struct iov_iter rw_iter; + struct kiocb rw_iocb; + struct cl_io_range rw_range; + struct file *rw_file; + unsigned int rw_nonblock:1, + rw_append:1, + rw_sync:1; + int (*rw_ptask)(struct cfs_ptask *ptask); + } ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_attr_flags; + unsigned int sa_valid; + int sa_stripe_index; + struct ost_layout sa_layout; + const struct lu_fid *sa_parent_fid; + } ci_setattr; + struct cl_data_version_io { + u64 dv_data_version; + int dv_flags; + } ci_data_version; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** bytes valid byte on a faulted page. */ + size_t ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + struct cl_ladvise_io { + __u64 li_start; + __u64 li_end; + /** file system level fid */ + struct lu_fid *li_fid; + enum lu_ladvise_type li_advice; + __u64 li_flags; + } ci_ladvise; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO need to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all page to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Need MDS intervention to complete a write. This usually means the + * corresponding component is not initialized for the writing extent. + */ + ci_need_write_intent:1, + /** + * Check if layout changed after the IO finishes. Mainly for HSM + * requirement. If IO occurs to openning files, it doesn't need to + * verify layout because HSM won't release openning files. + * Right now, only two opertaions need to verify layout: glimpse + * and setattr. + */ + ci_verify_layout:1, + /** + * file is released, restore has to to be triggered by vvp layer + */ + ci_restore_needed:1, + /** + * O_NOATIME + */ + ci_noatime:1, + /** Set to 1 if parallel execution is allowed for current I/O? */ + ci_pio:1; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; +}; + +/** @} cl_io */ + +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + enum cl_req_type cra_type; + u64 cra_flags; + struct cl_page *cra_page; + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Jobid */ + char cra_jobid[LUSTRE_JOBID_SIZE]; +}; + +enum cache_stats_item { + /** how many cache lookups were performed */ + CS_lookup = 0, + /** how many times cache lookup resulted in a hit */ + CS_hit, + /** how many entities are in the cache right now */ + CS_total, + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + CS_busy, + /** how many entities were created at all */ + CS_create, + CS_NR +}; + +#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. 
caches). + */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + atomic_t cs_pages_state[CPS_NR]; +}; + +int cl_site_init(struct cl_site *s, struct cl_device *top); +void cl_site_fini(struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, pgoff_t index, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +int cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size); +int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, struct fiemap *fiemap, + size_t *buflen); +int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *cl); +loff_t cl_object_maxbytes(struct cl_object *obj); + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. + */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size); + WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** + * Return refcount of cl_object. 
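+ * The value is read from the shared lu_object_header (loh_ref), so it
+ * accounts for the object as a whole rather than for any single layer;
+ * it is typically of interest for diagnostics and assertions only.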
+ */ +static inline int cl_object_refc(struct cl_object *clob) +{ + struct lu_object_header *header = clob->co_lu.lo_header; + return atomic_read(&header->loh_ref); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +enum { + CLP_GANG_OKAY = 0, + CLP_GANG_RESCHED, + CLP_GANG_AGAIN, + CLP_GANG_ABORT +}; +/* callback of cl_page_gang_lookup() */ + +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_alloc (const struct lu_env *env, + struct cl_object *o, pgoff_t ind, + struct page *vmpage, + enum cl_page_type type); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. + */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. 
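+ * This group also declares cl_offset(), cl_index() and cl_page_size(),
+ * which (presumably at the object's page-size granularity) convert
+ * between byte offsets and page indices within \a obj.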
+ */ +/** @{ */ +void cl_page_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete(const struct lu_env *env, struct cl_page *pg); +int cl_page_is_vmlocked(const struct lu_env *env, + const struct cl_page *pg); +void cl_page_export(const struct lu_env *env, + struct cl_page *pg, int uptodate); +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index(const struct cl_object *obj, loff_t offset); +size_t cl_page_size(const struct cl_object *obj); + +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** + * Data structure managing a client's cached pages. A count of + * "unstable" pages is maintained, and an LRU of clean pages is + * maintained. "unstable" pages are pages pinned by the ptlrpc + * layer for recovery purposes. + */ +struct cl_client_cache { + /** + * # of client cache refcount + * # of users (OSCs) + 2 (held by llite and lov) + */ + atomic_t ccc_users; + /** + * # of threads are doing shrinking + */ + unsigned int ccc_lru_shrinkers; + /** + * # of LRU entries available + */ + atomic_long_t ccc_lru_left; + /** + * List of entities(OSCs) for this LRU cache + */ + struct list_head ccc_lru; + /** + * Max # of LRU entries + */ + unsigned long ccc_lru_max; + /** + * Lock to protect ccc_lru list + */ + spinlock_t ccc_lru_lock; + /** + * Set if unstable check is enabled + */ + unsigned int ccc_unstable_check:1; + /** + * # of unstable pages for this mount point + */ + atomic_long_t ccc_unstable_nr; + /** + * Waitq for awaiting unstable pages to reach zero. + * Used at umounting time and signaled on BRW commit + */ + wait_queue_head_t ccc_unstable_waitq; +}; +/** + * cl_cache functions + */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max); +void cl_cache_incref(struct cl_client_cache *cache); +void cl_cache_decref(struct cl_client_cache *cache); + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock); +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io); +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock); +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock); + +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor); +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start 
(const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +int cl_io_commit_async (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb); +int cl_io_read_ahead (const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). + */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_valid & ATTR_SIZE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + (sizeof *__foo_io) - sizeof __foo_io->base); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.next, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. 
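+ *
+ * Editor's illustrative sketch (not part of the original header), showing
+ * why the _safe variant is needed when entries are removed during the walk:
+ *
+ *	struct cl_page *page;
+ *	struct cl_page *temp;
+ *
+ *	cl_page_list_for_each_safe(page, temp, plist)
+ *		cl_page_list_del(env, plist, page);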
+ */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. This is allocated on a stack by thread + * doing synchronous transfer, and a pointer to this structure is set up in + * every page submitted for transfer. Transfer completion routine updates + * anchor and wakes up waiting thread when transfer is complete. + */ +struct cl_sync_io { + /** number of pages yet to be transferred. */ + atomic_t csi_sync_nr; + /** error code. */ + int csi_sync_rc; + /** barrier of destroy this structure */ + atomic_t csi_barrier; + /** completion to be signaled when transfer is complete. */ + wait_queue_head_t csi_waitq; + /** callback to invoke when this IO is finished */ + void (*csi_end_io)(const struct lu_env *, + struct cl_sync_io *); +}; + +void cl_sync_io_init(struct cl_sync_io *anchor, int nr, + void (*end)(const struct lu_env *, struct cl_sync_io *)); +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout); +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret); +void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor); + +/** @} cl_sync_io */ + +/** \defgroup cl_env cl_env + * + * lu_env handling for a client. + * + * lu_env is an environment within which lustre code executes. Its major part + * is lu_context---a fast memory allocation mechanism that is used to conserve + * precious kernel stack space. Originally lu_env was designed for a server, + * where + * + * - there is a (mostly) fixed number of threads, and + * + * - call chains have no non-lustre portions inserted between lustre code. 
+ * + * On a client both these assumtpion fails, because every user thread can + * potentially execute lustre code as part of a system call, and lustre calls + * into VFS or MM that call back into lustre. + * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environment is amortized by caching no + * longer used environments instead of destroying them; + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct lu_env *cl_env_get(__u16 *refcheck); +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags); +void cl_env_put(struct lu_env *env, __u16 *refcheck); +unsigned cl_env_cache_purge(unsigned nr); +struct lu_env *cl_env_percpu_get(void); +void cl_env_percpu_put(struct lu_env *env); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h new file mode 100644 index 0000000000000..7d8e702d76d73 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -0,0 +1,2830 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + +#include +/* + * super-class definitions. 
+ */ +#include + +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_symlink_max; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + unsigned ddp_mount_type; + unsigned long long ddp_maxbytes; + /* per-inode space consumption */ + short ddp_inodespace; + /* maximum number of blocks in an extent */ + unsigned ddp_max_extent_blks; + /* per-extent insertion overhead to be used by client for grant + * calculation */ + unsigned int ddp_extent_tax; + unsigned int ddp_brw_size; /* optimal RPC size */ +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just commit callback + * is needed and per-device callback are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +#define DCB_TRANS_STOP 0x1 +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + void *dcb_data; + __u32 dcb_magic; + __u32 dcb_flags; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + * + * Return device-wide stats including block size, total and + * free blocks, total and free objects, etc. See struct obd_statfs + * for the details. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] osfs stats information + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, + struct obd_statfs *osfs); + + /** + * Create transaction. + * + * Create in-memory structure representing the transaction for the + * caller. The structure returned will be used by the calling thread + * to specify the transaction the updates belong to. Once created + * successfully ->dt_trans_stop() must be called in any case (with + * ->dt_trans_start() and updates or not) so that the transaction + * handle and other resources can be released by the layers below. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval pointer to handle if creation succeeds + * \retval ERR_PTR(errno) if creation fails + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Start transaction. + * + * Start the transaction. The transaction described by \a th can be + * started only once. Another start is considered as an error. + * A thread is not supposed to start a transaction while another + * transaction isn't closed by the thread (though multiple handles + * can be created). The caller should start the transaction once + * all possible updates are declared (see the ->do_declare_* methods + * below) and all the needed resources are reserved. 
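+	 *
+	 * Editor's sketch of the expected calling sequence (illustrative
+	 * only, not part of the original header; "ops" stands for the
+	 * device's dt_device_operations vector and is an assumption of
+	 * the example):
+	 *
+	 *	th = ops->dt_trans_create(env, dev);
+	 *	if (IS_ERR(th))
+	 *		return PTR_ERR(th);
+	 *	... ->do_declare_*() calls against th ...
+	 *	rc = ops->dt_trans_start(env, dev, th);
+	 *	... apply the declared updates ...
+	 *	rc = ops->dt_trans_stop(env, dev, th);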
+ * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, + struct thandle *th); + + /** + * Stop transaction. + * + * Once stopped the transaction described by \a th is complete (all + * the needed updates are applied) and further processing such as + * flushing to disk, sending to another target, etc, is handled by + * lower layers. The caller can't access this transaction by the + * handle anymore (except from the commit callbacks, see below). + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct dt_device *dev, + struct thandle *th); + + /** + * Add commit callback to the transaction. + * + * Add a commit callback to the given transaction handle. The callback + * will be called when the associated transaction is stored. I.e. the + * transaction will survive an event like power off if the callback did + * run. The number of callbacks isn't limited, but you should note that + * some disk filesystems do handle the commit callbacks in the thread + * handling commit/flush of all the transactions, meaning that new + * transactions are blocked from commit and flush until all the + * callbacks are done. Also, note multiple callbacks can be running + * concurrently using multiple CPU cores. The callbacks will be running + * in a special environment which can not be used to pass data around. + * + * \param[in] th transaction handle + * \param[in] dcb commit callback description + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + + /** + * Return FID of root index object. + * + * Return the FID of the root object in the filesystem. This object + * is usually provided as a bootstrap point by a disk filesystem. + * This is up to the implementation which FID to use, though + * [FID_SEQ_ROOT:1:0] is reserved for this purpose. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] fid FID of the root object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, + struct lu_fid *f); + + /** + * Return device configuration data. + * + * Return device (disk fs, actually) specific configuration. + * The configuration isn't subject to change at runtime. + * See struct dt_device_param for the details. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * \param[out] param configuration parameters + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + + /** + * Sync the device. + * + * Sync all the cached state (dirty buffers, pages, etc) to the + * persistent storage. The method returns control once the sync is + * complete. This operation may incur significant I/O to disk and + * should be reserved for cases where a global sync is strictly + * necessary. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_sync)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Make device read-only. + * + * Prevent new modifications to the device. This is a very specific + * state where all the changes are accepted successfully and the + * commit callbacks are called, but persistent state never changes. + * Used only in the tests to simulate power-off scenario. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_ro)(const struct lu_env *env, + struct dt_device *dev); + + /** + * Start transaction commit asynchronously. + * + + * Provide a hint to the underlying filesystem that it should start + * committing soon. The control returns immediately. It's up to the + * layer implementing the method how soon to start committing. Usually + * this should be throttled to some extent, otherwise the number of + * aggregated transaction goes too high causing performance drop. + * + * \param[in] env execution environment for this thread + * \param[in] dev dt device + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = 1 << 0, + /** index supports variable sized records */ + DT_IND_VARREC = 1 << 1, + /** index can be modified */ + DT_IND_UPDATE = 1 << 2, + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = 1 << 3, + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = 1 << 4, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to fids). + */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_layout_orphan_features; +extern const struct dt_index_features dt_lfsck_layout_dangling_features; +extern const struct dt_index_features dt_lfsck_namespace_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/* index features supported by the nodemap index */ +extern const struct dt_index_features dt_nodemap_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. 
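+ *
+ * Editor's illustrative sketch (not part of the original header; "obj_ops"
+ * stands for the child object's dt_object_operations vector and is an
+ * assumption of the example):
+ *
+ *	struct dt_allocation_hint ah = { 0 };
+ *
+ *	obj_ops->do_ah_init(env, &ah, parent, child, mode);
+ *	rc = obj_ops->do_declare_create(env, child, attr, &ah, dof, th);
+ *	... start the transaction ...
+ *	rc = obj_ops->do_create(env, child, attr, &ah, dof, th);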
+ */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + const void *dah_eadata; + int dah_eadata_len; + __u32 dah_mode; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +union ldlm_policy_data; + +/** + * A dt_object provides common operations to create and destroy + * objects and to manage regular and extended attributes. + */ +struct dt_object_operations { + /** + * Get read lock on object. + * + * Read lock is compatible with other read locks, so it's shared. + * Read lock is not compatible with write lock which is exclusive. + * The lock is blocking and can't be used from an interrupt context. + * + * \param[in] env execution environment for this thread + * \param[in] dt object to lock for reading + * \param[in] role a hint to debug locks (see kernel's mutexes) + */ + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, + unsigned role); + + /* + * Get write lock on object. + * + * Write lock is exclusive and cannot be shared. The lock is blocking + * and can't be used from an interrupt context. + * + * \param[in] env execution environment for this thread + * \param[in] dt object to lock for writing + * \param[in] role a hint to debug locks (see kernel's mutexes) + * + */ + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, + unsigned role); + + /** + * Release read lock. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + */ + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Release write lock. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + */ + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Check whether write lock is held. + * + * The caller can learn whether write lock is held on the object + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * + * \retval 0 no write lock + * \retval 1 write lock is held + */ + int (*do_write_locked)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Declare intention to request reqular attributes. + * + * Notity the underlying filesystem that the caller may request regular + * attributes with ->do_attr_get() soon. This allows OSD to implement + * prefetching logic in an object-oriented manner. The implementation + * can be noop. This method should avoid expensive delays such as + * waiting on disk I/O, otherwise the goal of enabling a performance + * optimization would be defeated. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_attr_get)(const struct lu_env *env, + struct dt_object *dt); + + /** + * Return regular attributes. + * + * The object must exist. 
Currently all the attributes should be + * returned, but in the future this can be improved so that only + * a selected set is returned. This can improve performance as in + * some cases attributes are stored in different places and + * getting them all can be an iterative and expensive process. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] attr attributes to fill + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_attr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr); + + /** + * Declare intention to change regular object's attributes. + * + * Notify the underlying filesystem that the regular attributes may + * change in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. Note that + * the la_valid field of \a attr specifies which attributes will change. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes to change specified in attr.la_valid + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *th); + + /** + * Change regular attributes. + * + * Change regular attributes in the given transaction. Note only + * attributes flagged by attr.la_valid change. The object must + * exist. If the layer implementing this method is responsible for + * quota, then the method should maintain object accounting for the + * given credentials when la_uid/la_gid changes. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr new attributes to apply + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *th); + + /** + * Declare intention to request extented attribute. + * + * Notify the underlying filesystem that the caller may request extended + * attribute with ->do_xattr_get() soon. This allows OSD to implement + * prefetching logic in an object-oriented manner. The implementation + * can be noop. This method should avoid expensive delays such as + * waiting on disk I/O, otherwise the goal of enabling a performance + * optimization would be defeated. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf unused, may be removed in the future + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Return a value of an extended attribute. + * + * The object must exist. If the buffer is NULL, then the method + * must return the size of the value. 
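+	 *
+	 * Editor's illustrative sketch (not part of the original header;
+	 * the lu_buf member names and the "ops" vector pointer are
+	 * assumptions of the example):
+	 *
+	 *	struct lu_buf buf = { .lb_buf = NULL, .lb_len = 0 };
+	 *	int size = ops->do_xattr_get(env, dt, &buf, name);
+	 *
+	 *	if (size > 0) {
+	 *		buf.lb_buf = kmalloc(size, GFP_NOFS);
+	 *		buf.lb_len = size;
+	 *		size = ops->do_xattr_get(env, dt, &buf, name);
+	 *	}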
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer in which to store the value + * \param[in] name name of the extended attribute + * + * \retval 0 on success + * \retval -ERANGE if \a buf is too small + * \retval negative negated errno on error + * \retval positive value's size if \a buf is NULL or has zero size + */ + int (*do_xattr_get)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name); + + /** + * Declare intention to change an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * change in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Set an extended attribute. + * + * Change or replace the specified extended attribute (EA). + * The flags passed in \a fl dictate whether the EA is to be + * created or replaced, as follows. + * LU_XATTR_CREATE - fail if EA exists + * LU_XATTR_REPLACE - fail if EA doesn't exist + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer storing new value of the attribute + * \param[in] name name of the attribute + * \param[in] fl flags indicating EA creation or replacement + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, + int fl, + struct thandle *th); + + /** + * Declare intention to delete an extended attribute. + * + * Notify the underlying filesystem that the extended attribute may + * be deleted in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Delete an extended attribute. + * + * This method deletes the specified extended attribute. The object + * must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] name name of the attribute + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th); + + /** + * Return a list of the extended attributes. 
+ * + * Fills the passed buffer with a list of the extended attributes + * found in the object. The names are separated with '\0'. + * The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer to put the list in + * + * \retval positive bytes used/required in the buffer + * \retval negative negated errno on error + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf); + + /** + * Prepare allocation hint for a new object. + * + * This method is used by the caller to inform OSD of the parent-child + * relationship between two objects and enable efficient object + * allocation. Filled allocation hint will be passed to ->do_create() + * later. + * + * \param[in] env execution environment for this thread + * \param[out] ah allocation hint + * \param[in] parent parent object (can be NULL) + * \param[in] child child object + * \param[in] _mode type of the child object + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t mode); + + /** + * Declare intention to create a new object. + * + * Notify the underlying filesystem that the object may be created + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should reserve an object for the given credentials + * and return an error if quota is over. If object creation later + * fails for some reason, then the reservation should be released + * properly (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Create new object. + * + * The method creates the object passed with the specified attributes + * and object format. Object allocation procedure can use information + * stored in the allocation hint. Different object formats are supported + * (see enum dt_format_type and struct dt_object_format) depending on + * the device. If creation succeeds, then LOHA_EXISTS flag must be set + * in the LU-object header attributes. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain object accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr attributes of the new object + * \param[in] hint allocation hint + * \param[in] dof object format + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Declare intention to destroy an object. 
+ * + * Notify the underlying filesystem that the object may be destroyed + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. The object + * need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Destroy an object. + * + * This method destroys the object and all the resources associated + * with the object (data, key/value pairs, extended attributes, etc). + * The object must exist. If destroy is successful, then flag + * LU_OBJECT_HEARD_BANSHEE should be set to forbid access to this + * instance of in-core object. Any subsequent access to the same FID + * should get another instance with no LOHA_EXIST flag set. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain object accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Try object as an index. + * + * Announce that this object is going to be used as an index. This + * operation checks that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. + * Also probes for features. Operation is successful if all required + * features are supported. It's not possible to access the object + * with index methods before ->do_index_try() returns success. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] feat index features + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + + /** + * Declare intention to increment nlink count. + * + * Notify the underlying filesystem that the nlink regular attribute + * be changed in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Increment nlink. + * + * Increment nlink (from the regular attributes set) in the given + * transaction. Note the absolute limit for nlink should be learnt + * from struct dt_device_param::ddp_max_nlink. The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + /** + * Declare intention to decrement nlink count. 
+ * + * Notify the underlying filesystem that the nlink regular attribute + * be changed in this transaction. This enables the layer below to + * prepare resources (e.g. journal credits in ext4). This method + * should be called between creating the transaction and starting it. + * The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Decrement nlink. + * + * Decrement nlink (from the regular attributes set) in the given + * transaction. The object must exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + + /** + * Sync obect. + * + * The method is called to sync specified range of the object to a + * persistent storage. The control is returned once the operation is + * complete. The difference from ->do_sync() is that the object can + * be in-sync with the persistent storage (nothing to flush), then + * the method returns quickly with no I/O overhead. So, this method + * should be preferred over ->do_sync() where possible. Also note that + * if the object isn't clean, then some disk filesystems will call + * ->do_sync() to maintain overall consistency, in which case it's + * still very expensive. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start start of the range to sync + * \param[in] end end of the range to sync + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj, + __u64 start, __u64 end); + + /** + * Lock object. + * + * Lock object(s) using Distributed Lock Manager (LDLM). + * + * Get LDLM locks for the object. Currently used to lock "remote" + * objects in DNE configuration - a service running on MDTx needs + * to lock an object on MDTy. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] lh lock handle, sometimes used, sometimes not + * \param[in] einfo ldlm callbacks, locking type and mode + * \param[out] einfo private data to be passed to unlock later + * \param[in] policy inodebits data + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + /** + * Unlock object. + * + * Release LDLM lock(s) granted with ->do_object_lock(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] einfo lock handles, from ->do_object_lock() + * \param[in] policy inodebits data + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_object_unlock)(const struct lu_env *env, + struct dt_object *dt, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + /** + * Invalidate attribute cache. + * + * This method invalidate attribute cache of the object, which is on OSP + * only. 
+ * + * \param[in] env execution envionment for this thread + * \param[in] dt object + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*do_invalidate)(const struct lu_env *env, struct dt_object *dt); + + /** + * Declare intention to instaintiate extended layout component. + * + * \param[in] env execution environment + * \param[in] dt DT object + * \param[in] layout data structure to describe the changes to + * the DT object's layout + * \param[in] buf buffer containing client's lovea or empty + * + * \retval 0 success + * \retval -ne error code + */ + int (*do_declare_layout_change)(const struct lu_env *env, + struct dt_object *dt, + struct layout_intent *layout, + const struct lu_buf *buf, + struct thandle *th); + + /** + * Client is trying to write to un-instantiated layout component. + * + * \param[in] env execution environment + * \param[in] dt DT object + * \param[in] layout data structure to describe the changes to + * the DT object's layout + * \param[in] buf buffer containing client's lovea or empty + * + * \retval 0 success + * \retval -ne error code + */ + int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt, + struct layout_intent *layout, + const struct lu_buf *buf, struct thandle *th); +}; + +enum dt_bufs_type { + DT_BUFS_TYPE_READ = 0x0000, + DT_BUFS_TYPE_WRITE = 0x0001, + DT_BUFS_TYPE_READAHEAD = 0x0002, + DT_BUFS_TYPE_LOCAL = 0x0004, +}; + +/** + * Per-dt-object operations on "file body" - unstructure raw data. + */ +struct dt_body_operations { + /** + * Read data. + * + * Read unstructured data from an existing regular object. + * Only data before attr.la_size is returned. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] buf buffer (including size) to copy data in + * \param[in] pos position in the object to start + * \param[out] pos original value of \a pos + bytes returned + * + * \retval positive bytes read on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_read)(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + loff_t *pos); + + /** + * Declare intention to write data to object. + * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. The object need + * not exist. If the layer implementing this method is responsible for + * quota, then the method should reserve space for the given credentials + * and return an error if quota is over. If the write later fails + * for some reason, then the reserve should be released properly + * (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + struct thandle *th); + + /** + * Write unstructured data to regular existing object. + * + * The method allocates space and puts data in. Also, the method should + * maintain attr.la_size properly. Partial writes are possible. 
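+	 *
+	 * Editor's illustrative pairing of the declare and write steps
+	 * (not part of the original header; "ops" is an assumed pointer
+	 * to the object's dt_body_operations vector):
+	 *
+	 *	rc = ops->dbo_declare_write(env, dt, buf, pos, th);
+	 *	... start the transaction ...
+	 *	rc = ops->dbo_write(env, dt, buf, &pos, th, 0);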
+ * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] buf buffer (including size) to copy data from + * \param[in] pos position in the object to start + * \param[out] pos \a pos + bytes written + * \param[in] th transaction handle + * \param[in] ignore unused (was used to request quota ignorance) + * + * \retval positive bytes written on success + * \retval negative negated errno on error + */ + ssize_t (*dbo_write)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t *pos, + struct thandle *th, + int ignore); + + /** + * Return buffers for data. + * + * This method is used to access data with no copying. It's so-called + * zero-copy I/O. The method returns the descriptors for the internal + * buffers where data are managed by the disk filesystem. For example, + * pagecache in case of ext4 or ARC with ZFS. Then other components + * (e.g. networking) can transfer data from or to the buffers with no + * additional copying. + * + * The method should fill an array of struct niobuf_local, where + * each element describes a full or partial page for data at specific + * offset. The caller should use page/lnb_page_offset/len to find data + * at object's offset lnb_file_offset. + * + * The memory referenced by the descriptors can't change its purpose + * until the complementary ->dbo_bufs_put() is called. The caller should + * specify if the buffers are used to read or modify data so that OSD + * can decide how to initialize the buffers: bring all the data for + * reads or just bring partial buffers for write. Note: the method does + * not check whether output array is large enough. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] pos position in the object to start + * \param[in] len size of region in bytes + * \param[out] lb array of descriptors to fill + * \param[in] rw 0 if used to read, 1 if used for write + * + * \retval positive number of descriptors on success + * \retval negative negated errno on error + */ + int (*dbo_bufs_get)(const struct lu_env *env, + struct dt_object *dt, + loff_t pos, + ssize_t len, + struct niobuf_local *lb, + enum dt_bufs_type rw); + + /** + * Release reference granted by ->dbo_bufs_get(). + * + * Release the reference granted by the previous ->dbo_bufs_get(). + * Note the references are counted. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] lb array of descriptors to fill + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_bufs_put)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr); + + /** + * Prepare buffers for reading. + * + * The method is called on the given buffers to fill them with data + * if that wasn't done in ->dbo_bufs_get(). The idea is that the + * caller should be able to get few buffers for discontiguous regions + * using few calls to ->dbo_bufs_get() and then request them all for + * the preparation with a single call, so that OSD can fire many I/Os + * to run concurrently. It's up to the specific OSD whether to implement + * this logic in ->dbo_read_prep() or just use ->dbo_bufs_get() to + * prepare data for every requested region individually. 
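+	 *
+	 * Editor's sketch of the zero-copy read pipeline (illustrative
+	 * only, not part of the original header; "ops" and the "lnb"
+	 * descriptor array are assumptions of the example):
+	 *
+	 *	nr = ops->dbo_bufs_get(env, dt, pos, len, lnb,
+	 *			       DT_BUFS_TYPE_READ);
+	 *	if (nr > 0) {
+	 *		rc = ops->dbo_read_prep(env, dt, lnb, nr);
+	 *		... copy data out of lnb[0 .. nr - 1] ...
+	 *		ops->dbo_bufs_put(env, dt, lnb, nr);
+	 *	}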
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lnb array of buffer descriptors + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_read_prep)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lnb, + int nr); + + /** + * Prepare buffers for write. + * + * This method is called on the given buffers to ensure the partial + * buffers contain correct data. The underlying idea is the same as + * in ->db_read_prep(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of buffer descriptors + * \param[in] nr size of the array + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_write_prep)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr); + + /** + * Declare intention to write data stored in the buffers. + * + * Notify the underlying filesystem that data may be written in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. + * + * If the layer implementing this method is responsible for quota, + * then the method should be reserving a space for the given + * credentials and return an error if quota is exceeded. If the write + * later fails for some reason, then the reserve should be released + * properly (usually in ->dt_trans_stop()). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of descriptors + * \param[in] nr size of the array + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr, + struct thandle *th); + + /** + * Write to existing object. + * + * This method is used to write data to a persistent storage using + * the buffers returned by ->dbo_bufs_get(). The caller puts new + * data into the buffers using own mechanisms (e.g. direct transfer + * from a NIC). The method should maintain attr.la_size. Also, + * attr.la_blocks should be maintained but this can be done in lazy + * manner, when actual allocation happens. + * + * If the layer implementing this method is responsible for quota, + * then the method should maintain space accounting for the given + * credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] lb array of descriptors for the buffers + * \param[in] nr size of the array + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *lb, + int nr, + struct thandle *th); + + /** + * Return logical to physical block mapping for a given extent + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] fm describe the region to map and the output buffer + * see the details in include/linux/fiemap.h + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_fiemap_get)(const struct lu_env *env, + struct dt_object *dt, + struct fiemap *fm); + + /** + * Declare intention to deallocate space from an object. 
+ * + * Notify the underlying filesystem that space may be deallocated in + * this transactions. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called between + * creating the transaction and starting it. The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to deallocate + * \param[in] end the end of the region to deallocate + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_declare_punch)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + struct thandle *th); + + /** + * Deallocate specified region in an object. + * + * This method is used to deallocate (release) space possibly consumed + * by the given region of the object. If the layer implementing this + * method is responsible for quota, then the method should maintain + * space accounting for the given credentials. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region to deallocate + * \param[in] end the end of the region to deallocate + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_punch)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + struct thandle *th); + /** + * Give advices on specified region in an object. + * + * This method is used to give advices about access pattern on an + * given region of the object. The disk filesystem understands + * the advices and tunes cache/read-ahead policies. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] start the start of the region affected + * \param[in] end the end of the region affected + * \param[in] advice advice type + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dbo_ladvise)(const struct lu_env *env, + struct dt_object *dt, + __u64 start, + __u64 end, + enum lu_ladvise_type advice); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on object as index. Index is a set of key/value + * pairs abstracted from an on-disk representation. An index supports the + * number of operations including lookup by key, insert and delete. Also, + * an index can be iterated to find the pairs one by one, from a beginning + * or specified point. + */ +struct dt_index_operations { + /** + * Lookup in an index by key. + * + * The method returns a value for the given key. Key/value format + * and size should have been negotiated with ->do_index_try() before. + * Thus it's the caller's responsibility to provide the method with + * proper key and big enough buffer. No external locking is required, + * all the internal consistency should be implemented by the method + * or lower layers. The object should should have been created with + * type DFT_INDEX or DFT_DIR. 
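+	 *
+	 * Editor's illustrative sketch (not part of the original header;
+	 * the "obj_ops"/"idx_ops" vector pointers and "rec_buf" are
+	 * assumptions of the example):
+	 *
+	 *	rc = obj_ops->do_index_try(env, dt, &dt_directory_features);
+	 *	if (rc == 0)
+	 *		rc = idx_ops->dio_lookup(env, dt,
+	 *					 (struct dt_rec *)rec_buf,
+	 *					 (struct dt_key *)name);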
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[out] rec buffer where value will be stored + * \param[in] key key + * + * \retval 0 on success + * \retval -ENOENT if key isn't found + * \retval negative negated errno on error + */ + int (*dio_lookup)(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key); + + /** + * Declare intention to insert a key/value into an index. + * + * Notify the underlying filesystem that new key/value may be inserted + * in this transaction. This enables the layer below to prepare + * resources (e.g. journal credits in ext4). This method should be + * called between creating the transaction and starting it. key/value + * format and size is subject to ->do_index_try(). + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] rec buffer storing value + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_declare_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th); + + /** + * Insert a new key/value pair into an index. + * + * The method inserts specified key/value pair into the given index + * object. The internal consistency is maintained by the method or + * the functionality below. The format and size of key/value should + * have been negotiated before using ->do_index_try(), no additional + * information can be specified to the method. The keys are unique + * in a given index. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] rec buffer storing value + * \param[in] key key + * \param[in] th transaction handle + * \param[in] ignore unused (was used to request quota ignorance) + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + int ignore); + + /** + * Declare intention to delete a key/value from an index. + * + * Notify the underlying filesystem that key/value may be deleted in + * this transaction. This enables the layer below to prepare resources + * (e.g. journal credits in ext4). This method should be called + * between creating the transaction and starting it. Key/value format + * and size is subject to ->do_index_try(). The object need not exist. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th); + + /** + * Delete key/value pair from an index. + * + * The method deletes specified key and corresponding value from the + * given index object. The internal consistency is maintained by the + * method or the functionality below. The format and size of the key + * should have been negotiated before using ->do_index_try(), no + * additional information can be specified to the method. 
+ * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] key key + * \param[in] th transaction handle + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*dio_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th); + + /** + * Iterator interface. + * + * Methods to iterate over an existing index, list the keys stored and + * associated values, get key/value size, etc. + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. + * + * The iterator is a handler to be used in the subsequent + * methods to access index's content. Note the position is + * not defined at this point and should be initialized with + * ->get() or ->load() method. + * + * \param[in] env execution environment for this thread + * \param[in] dt object + * \param[in] attr ask the iterator to return part of + the records, see LUDA_* for details + * + * \retval pointer iterator pointer on success + * \retval ERR_PTR(errno) on error + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr); + + /** + * Release iterator. + * + * Release the specified iterator and all the resources + * associated (e.g. the object, index cache, etc). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator to release + */ + void (*fini)(const struct lu_env *env, + struct dt_it *di); + + /** + * Move position of iterator. + * + * Move the position of the specified iterator to the specified + * key. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] key key to position to + * + * \retval 0 if exact key is found + * \retval 1 if at the record with least key + * not larger than the key + * \retval negative negated errno on error + */ + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + + /** + * Release position + * + * Complimentary method for dt_it_ops::get() above. Some + * implementation can increase a reference on the iterator in + * dt_it_ops::get(). So the caller should be able to release + * with dt_it_ops::put(). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + */ + void (*put)(const struct lu_env *env, + struct dt_it *di); + + /** + * Move to next record. + * + * Moves the position of the iterator to a next record + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval 1 if no more records + * \retval 0 on success, the next record is found + * \retval negative negated errno on error + */ + int (*next)(const struct lu_env *env, + struct dt_it *di); + + /** + * Return key. + * + * Returns a pointer to a buffer containing the key of the + * record at the current position. The pointer is valid and + * retains data until ->get(), ->load() and ->fini() methods + * are called. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval pointer to key on success + * \retval ERR_PTR(errno) on error + */ + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Return key size. + * + * Returns size of the key at the current position. 
+ * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval key's size on success + * \retval negative negated errno on error + */ + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Return record. + * + * Stores the value of the record at the current position. The + * buffer must be big enough (as negotiated with + * ->do_index_try() or ->rec_size()). The caller can specify + * she is interested only in part of the record, using attr + * argument (see LUDA_* definitions for the details). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[out] rec buffer to store value in + * \param[in] attr specify part of the value to copy + * + * \retval 0 on success + * \retval negative negated errno on error + */ + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + + /** + * Return record size. + * + * Returns size of the record at the current position. The + * \a attr can be used to specify only the parts of the record + * needed to be returned. (see LUDA_* definitions for the + * details). + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] attr part of the record to return + * + * \retval record's size on success + * \retval negative negated errno on error + */ + int (*rec_size)(const struct lu_env *env, + const struct dt_it *di, + __u32 attr); + + /** + * Return a cookie (hash). + * + * Returns the cookie (usually hash) of the key at the current + * position. This allows the caller to resume iteration at this + * position later. The exact value is specific to implementation + * and should not be interpreted by the caller. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * + * \retval cookie/hash of the key + */ + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + + /** + * Initialize position using cookie/hash. + * + * Initializes the current position of the iterator to one + * described by the cookie/hash as returned by ->store() + * previously. + * + * \param[in] env execution environment for this thread + * \param[in] di iterator + * \param[in] hash cookie/hash value + * + * \retval positive if current position points to + * record with least cookie not larger + * than cookie + * \retval 0 if current position matches cookie + * \retval negative negated errno on error + */ + int (*load)(const struct lu_env *env, + const struct dt_it *di, + __u64 hash); + + /** + * Not used + */ + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, + void *key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, + DOIV_DRYRUN = 0x0002, +}; + +enum dt_otable_it_flags { + /* Exit when fail. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is up layer component uses the iteration. */ + DOIF_OUTUSED = 0x0004, + + /* Check only without repairing. */ + DOIF_DRYRUN = 0x0008, +}; + +/* otable based iteration needs to use the common DT iteration APIs. + * To initialize the iteration, it needs call dio_it::init() firstly. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init() + * is composed of two parts: + * low 16-bits is for valid bits, high 16-bits is for flags bits. 
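+ *
+ * For example (an illustrative combination only, not mandated by this
+ * header), a dry-run scan that restarts from the device beginning could
+ * compose the attribute roughly as:
+ *
+ *	attr = DOIV_DRYRUN |
+ *	       ((DOIF_RESET | DOIF_DRYRUN) << DT_OTABLE_IT_FLAGS_SHIFT);
+ *
+ * with the actual valid/flags bits chosen by the caller.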
*/ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. + */ + struct list_head dd_txn_callbacks; + unsigned int dd_record_fid_accessed:1, + dd_rdonly:1; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device * lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of0(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct lu_device *dt2lu_dev(struct dt_device *d) +{ + return &d->dd_lu_dev; +} + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of0(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +static inline struct dt_object *dt_object_child(struct dt_object *o) +{ + return container_of0(lu_object_next(&(o)->do_lu), + struct dt_object, do_lu); +} + +/** + * This is the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. So, every thread should only + * have one active transaction, and a transaction only belongs to one + * thread. Due to this, transaction handle need no reference count. + * 3. Transaction & dt_object locking + * dt_object locks should be taken inside transaction. + * 4. Transaction & RPC + * No RPC request should be issued inside transaction. 
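+ *
+ * As an illustrative sketch only (using the helpers declared later in this
+ * header; the local variable names are arbitrary and error handling is
+ * omitted), a typical declare/execute cycle looks like:
+ *
+ *	th = dt_trans_create(env, dev);
+ *	rc = dt_declare_record_write(env, obj, buf, pos, th);
+ *	rc = dt_trans_start(env, dev, th);
+ *	rc = dt_record_write(env, obj, buf, &pos, th);
+ *	rc = dt_trans_stop(env, dev, th);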
+ */
+struct thandle {
+	/** the dt device on which the transactions are executed */
+	struct dt_device *th_dev;
+
+	/* pointer to the top thandle, XXX this is a bit hacky right now,
+	 * but normal device trans callback triggered by the bottom
+	 * device (OSP/OSD == sub thandle layer) needs to get the
+	 * top_thandle (see dt_txn_hook_start/stop()), so we put the
+	 * top thandle here for now, will fix it when we have a better
+	 * callback mechanism */
+	struct thandle *th_top;
+
+	/** the last operation result in this transaction.
+	 * This value is used in recovery */
+	__s32 th_result;
+
+	/** whether we need sync commit */
+	unsigned int th_sync:1,
+	/* local transaction, no need to inform other layers */
+		     th_local:1,
+	/* whether we need to wait for the transaction to be submitted
+	 * (sent to the remote target) */
+		     th_wait_submit:1,
+	/* complex transaction which will track updates on all targets,
+	 * including OSTs */
+		     th_complex:1;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by the OSD (or the underlying transaction engine) when
+ * the transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and
+ * to perform some actions for each transaction state transition. A typical
+ * example is the MDT registering a call-back to write into the last-received
+ * file before each transaction commit.
+ */
+struct dt_txn_callback {
+	int (*dtc_txn_start)(const struct lu_env *env,
+			     struct thandle *txn, void *cookie);
+	int (*dtc_txn_stop)(const struct lu_env *env,
+			    struct thandle *txn, void *cookie);
+	void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+	void			*dtc_cookie;
+	__u32			dtc_tag;
+	struct list_head	dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf); + +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, + dev->dd_lu_dev.ld_site->ls_top_dev, NULL); +} + +static inline struct dt_object * +dt_object_locate(struct dt_object *dto, struct dt_device *dt_dev) +{ + struct lu_object *lo; + + list_for_each_entry(lo, &dto->do_lu.lo_header->loh_layers, lo_linkage) { + if (lo->lo_dev == &dt_dev->dd_lu_dev) + return container_of(lo, struct dt_object, do_lu); + } + return NULL; +} + +static inline void dt_object_put(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put(env, &dto->do_lu); +} + +static inline void dt_object_put_nocache(const struct lu_env *env, + struct dt_object *dto) +{ + lu_object_put_nocache(env, &dto->do_lu); +} + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, 
+ struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_lock != NULL); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +static inline int dt_object_unlock(const struct lu_env *env, + struct dt_object *o, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(o != NULL); + LASSERT(o->do_ops != NULL); + LASSERT(o->do_ops->do_object_unlock != NULL); + return o->do_ops->do_object_unlock(env, o, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o, + __u64 start, __u64 end) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o, start, end); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, size_t nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, d, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + loff_t pos, + struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); + 
return rc; +} + +static inline int dt_declare_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_CREATE)) + return cfs_fail_err; + + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_CREATE)) + return cfs_fail_err; + + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DESTROY)) + return cfs_fail_err; + + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DESTROY)) + return cfs_fail_err; + + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline int dt_declare_attr_get(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_get(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_get(env, dt, la); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_attr_set(env, dt, la, 
th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_ATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_attr_set(env, dt, la, th); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_ADD)) + return cfs_fail_err; + + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_REF_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, enum dt_bufs_type rw) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, + rnb->rnb_len, lnb, rw); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_write(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, loff_t pos, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + 
LASSERT(dt->do_body_ops->dbo_declare_write); + return dt->do_body_ops->dbo_declare_write(env, dt, buf, pos, th); +} + +static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *th, int rq) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th, rq); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th); +} + +static inline int dt_ladvise(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, int advice) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_ladvise); + return dt->do_body_ops->dbo_ladvise(env, dt, start, end, advice); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + int noquota) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) + return cfs_fail_err; + + return dt->do_index_ops->dio_insert(env, dt, rec, key, th, noquota); +} + +static inline int dt_declare_xattr_del(const 
struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_DEL)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_del(env, dt, name, th); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_SET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_declare_xattr_get(const struct lu_env *env, + struct dt_object *dt, + struct lu_buf *buf, + const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_declare_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_GET)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_get(env, dt, buf, name); +} + +static inline int dt_xattr_list(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_XATTR_LIST)) + return cfs_fail_err; + + return dt->do_ops->do_xattr_list(env, dt, buf); +} + +static inline int dt_invalidate(const struct lu_env *env, struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_invalidate); + + return dt->do_ops->do_invalidate(env, dt); +} + +static inline int dt_declare_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DECLARE_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_DELETE)) + return cfs_fail_err; + + return dt->do_index_ops->dio_delete(env, dt, key, th); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + 
LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + if (CFS_FAULT_CHECK(OBD_FAIL_DT_LOOKUP)) + return cfs_fail_err; + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +static inline int dt_declare_layout_change(const struct lu_env *env, + struct dt_object *o, + struct layout_intent *layout, + const struct lu_buf *buf, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_declare_layout_change); + return o->do_ops->do_declare_layout_change(env, o, layout, buf, th); +} + +static inline int dt_layout_change(const struct lu_env *env, + struct dt_object *o, + struct layout_intent *layout, + const struct lu_buf *buf, + struct thandle *th) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_layout_change); + return o->do_ops->do_layout_change(env, o, layout, buf, th); +} + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_insert_rec { + union { + const struct lu_fid *rec_fid; + void *rec_data; + }; + union { + struct { + __u32 rec_type; + __u32 rec_padding; + }; + __u64 rec_misc; + }; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + struct lu_object_conf dti_conf; + loff_t dti_off; + struct dt_insert_rec dti_dt_rec; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); + +# ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v); +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v); +# endif /* CONFIG_PROC_FS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/interval_tree.h b/drivers/staging/lustrefsx/lustre/include/interval_tree.h new file mode 100644 index 0000000000000..1598119aba5b5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/interval_tree.h @@ -0,0 +1,131 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/interval_tree.h + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include +#include +#include + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline int interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + if (start > end) + return -ERANGE; + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; + return 0; +} + +static inline void interval_init(struct interval_node *node) +{ + memset(node, 0, sizeof(*node)); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2); + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. 
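+ *
+ * As a purely illustrative sketch (not part of the original API comment),
+ * a callback that counts the visited nodes could look like:
+ *
+ *	static enum interval_iter count_cb(struct interval_node *node,
+ *					   void *args)
+ *	{
+ *		(*(unsigned int *)args)++;
+ *		return INTERVAL_ITER_CONT;
+ *	}
+ *
+ * and would be passed as @func to interval_iterate() together with a
+ * pointer to the counter as @data.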
*/ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func,void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h new file mode 100644 index 0000000000000..a0b8d022c1a5b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/pack_generic.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
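+ *
+ * As an illustrative note (the endianness check below is hypothetical; only
+ * the swabber itself is declared in this header), a receiver that detects a
+ * foreign-endian message via its magic field would do something like:
+ *
+ *	if (msg_needs_swabbing)
+ *		lustre_swab_llog_hdr(hdr);
+ *
+ * to convert the record header in place before using it.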
+ */ + +#ifndef _LLOG_SWAB_H_ +#define _LLOG_SWAB_H_ + +#include +struct lustre_cfg; + +void lustre_swab_lu_fid(struct lu_fid *fid); +void lustre_swab_ost_id(struct ost_id *oid); +void lustre_swab_ll_fid(struct ll_fid *fid); +void lustre_swab_llogd_body(struct llogd_body *d); +void lustre_swab_llog_hdr(struct llog_log_hdr *h); +void lustre_swab_llogd_conn_body(struct llogd_conn_body *d); +void lustre_swab_llog_rec(struct llog_rec_hdr *rec); +void lustre_swab_llog_id(struct llog_logid *lid); +void lustre_swab_lu_seq_range(struct lu_seq_range *range); +void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count); +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); +void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h new file mode 100644 index 0000000000000..646679d9aa45e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -0,0 +1,1006 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lprocfs_status.h + * + * Top level header file for LProc + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LPROCFS_STATUS_H +#define _LPROCFS_STATUS_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct lprocfs_vars { + const char *name; + const struct file_operations *fops; + void *data; + /** + * /proc file mode. + */ + mode_t proc_mode; +}; + +/* if we find more consumers this could be generalized */ +#define OBD_HIST_MAX 32 +struct obd_histogram { + spinlock_t oh_lock; + unsigned long oh_buckets[OBD_HIST_MAX]; +}; + +enum { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_LAST, +}; + +struct brw_stats { + struct obd_histogram hist[BRW_LAST]; +}; + +enum { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST, +}; + +struct rename_stats { + struct obd_histogram hist[RENAME_LAST]; +}; + +/* An lprocfs counter can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. 
If not specified, + * lprocfs an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter samples, + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. + */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REGS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_CYCLE = 0x0800, +}; + +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { + struct lprocfs_counter lp_cntr[0]; +}; + +enum lprocfs_stats_lock_ops { + LPROCFS_GET_NUM_CPU = 0x0001, /* number allocated per-CPU stats */ + LPROCFS_GET_SMP_ID = 0x0002, /* current stat to be updated */ +}; + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - 
MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OUT_UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - OUT_UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LFSCK_LAST_OPC) { + /* LFSCK opcode */ + return (opc - LFSCK_FIRST_OPC + + OPC_RANGE(OUT_UPDATE) + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else { + /* Unknown Opcode */ + return -1; + } +} + + +#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD) + \ + OPC_RANGE(OUT_UPDATE) + \ + OPC_RANGE(LFSCK)) + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum lprocfs_extra_opc { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern struct proc_dir_entry *proc_lustre_root; +extern struct dentry *debugfs_lustre_root; +extern struct kobject *lustre_kobj; + +struct obd_device; +struct obd_histogram; + +#define JOBSTATS_JOBID_VAR_MAX_LEN 20 +#define JOBSTATS_DISABLE "disable" +#define JOBSTATS_PROCNAME_UID "procname_uid" +#define JOBSTATS_NODELOCAL "nodelocal" + +typedef void (*cntr_init_callback)(struct lprocfs_stats *stats); + +struct obd_job_stats { + struct cfs_hash *ojs_hash; /* hash of jobids */ + struct list_head ojs_list; /* list of job_stat structs 
*/
+	rwlock_t	ojs_lock;		/* protect ojs_list/js_list */
+	unsigned int	ojs_cleanup_interval;	/* seconds before expiry */
+	time_t		ojs_last_cleanup;	/* previous cleanup time */
+	cntr_init_callback ojs_cntr_init_fn;	/* lprocfs_stats initializer */
+	unsigned short	ojs_cntr_num;		/* number of stats in struct */
+	bool		ojs_cleaning;		/* currently expiring stats */
+};
+
+#ifdef CONFIG_PROC_FS
+
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+			    unsigned int cpuid);
+int lprocfs_stats_lock(struct lprocfs_stats *stats,
+		       enum lprocfs_stats_lock_ops opc,
+		       unsigned long *flags);
+void lprocfs_stats_unlock(struct lprocfs_stats *stats,
+			  enum lprocfs_stats_lock_ops opc,
+			  unsigned long *flags);
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+	unsigned int percpusize;
+
+	percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+	/* irq safe stats need lc_array_sum[1] */
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+		percpusize += stats->ls_num * sizeof(__s64);
+
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+		percpusize = L1_CACHE_ALIGN(percpusize);
+
+	return percpusize;
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+			  int index)
+{
+	struct lprocfs_counter *cntr;
+
+	cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+		cntr = (void *)cntr + index * sizeof(__s64);
+
+	return cntr;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ * lprocfs_counter_incr(stats, idx)	    - optimized for by-one counters
+ * lprocfs_counter_add(stats, idx, amount)  - use for multi-valued counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
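+ *
+ * Illustrative usage only (the stats handle and the index are placeholders,
+ * not names defined in this header):
+ *	lprocfs_counter_incr(obd_stats, idx)		- count one event
+ *	lprocfs_counter_add(obd_stats, idx, nbytes)	- accumulate a value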
+ */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field); + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obddev); +extern void lprocfs_free_md_stats(struct obd_device *obddev); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device * obd, + struct proc_dir_entry *entry); +#ifdef HAVE_SERVER_SUPPORT +extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +#else +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +#endif +extern struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct file_operations *fops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); +#endif +extern int ldebugfs_register_stats(struct dentry *parent, const char *name, + struct lprocfs_stats *stats); +extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats); + +/* lprocfs_status.c */ +extern int ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *var, + void *data); +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data); + +extern struct dentry *ldebugfs_register(const char *name, + struct dentry *parent, + struct lprocfs_vars *list, + void *data); +extern struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data); + +extern void ldebugfs_remove(struct dentry **entryp); +extern void lprocfs_remove(struct proc_dir_entry **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); +#ifndef HAVE_REMOVE_PROC_SUBTREE +extern int remove_proc_subtree(const char *name, + struct 
proc_dir_entry *parent); +#define PDE_DATA(inode) (PDE(inode)->data) + +static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) +{ + struct proc_dir_entry *dp = PDE(inode); + int deleted = 0; + + spin_lock(&(dp)->pde_unload_lock); + if (dp->proc_fops == NULL) + deleted = 1; + spin_unlock(&(dp)->pde_unload_lock); + if (deleted) + return -ENODEV; + return 0; +} +#else +static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) +{ return 0; } +#endif +extern int lprocfs_obd_setup(struct obd_device *dev); +extern int lprocfs_obd_cleanup(struct obd_device *obd); +#ifdef HAVE_SERVER_SUPPORT +extern const struct file_operations lprocfs_evict_client_fops; +#endif + +extern int ldebugfs_seq_create(struct dentry *parent, const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data); +extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data); +extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data); + +/* Generic callbacks */ +extern int lprocfs_u64_seq_show(struct seq_file *m, void *data); +extern int lprocfs_atomic_seq_show(struct seq_file *m, void *data); +extern ssize_t lprocfs_atomic_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_uint_seq_show(struct seq_file *m, void *data); +extern ssize_t lprocfs_uint_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data); +extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_name_seq_show(struct seq_file *m, void *data); +extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_import_seq_show(struct seq_file *m, void *data); +extern int lprocfs_state_seq_show(struct seq_file *m, void *data); +extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); +#ifdef HAVE_SERVER_SUPPORT +extern int lprocfs_num_exports_seq_show(struct seq_file *m, void *data); +#endif +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at); +extern int lprocfs_timeouts_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#ifdef HAVE_SERVER_SUPPORT +extern ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#endif +extern ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); +extern ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +/* Statfs helpers */ +extern int lprocfs_blksize_seq_show(struct seq_file *m, void *data); +extern int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data); +extern int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data); +extern int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data); +extern int lprocfs_filestotal_seq_show(struct 
seq_file *m, void *data); +extern int lprocfs_filesfree_seq_show(struct seq_file *m, void *data); + +extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); +extern int lprocfs_str_to_s64(const char __user *buffer, unsigned long count, + __s64 *val); +extern int lprocfs_str_with_units_to_s64(const char __user *buffer, + unsigned long count, __s64 *val, + char defunit); + +char *lprocfs_strnstr(const char *s1, const char *s2, size_t len); +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_status.c: recovery status */ +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: hash statistics */ +int lprocfs_hash_seq_show(struct seq_file *m, void *data); + +/* lprocfs_status.c: IR factor */ +int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +#endif + +/* lprocfs_status.c: dump pages on cksum error */ +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); + +extern int lprocfs_single_release(struct inode *, struct file *); +extern int lprocfs_seq_release(struct inode *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry */ +#define LPROCFS_CLIMP_CHECK(obd) do { \ + typecheck(struct obd_device *, obd); \ + down_read(&(obd)->u.cli.cl_sem); \ + if ((obd)->u.cli.cl_import == NULL) { \ + up_read(&(obd)->u.cli.cl_sem); \ + return -ENODEV; \ + } \ +} while(0) +#define LPROCFS_CLIMP_EXIT(obd) \ + up_read(&(obd)->u.cli.cl_sem); + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + proc entries; otherwise, you will define name##_seq_write function also for + a read-write proc entry, and then call LPROC_SEQ_SEQ instead. Finally, + call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + int rc; \ + \ + rc = LPROCFS_ENTRY_CHECK(inode); \ + if (rc < 0) \ + return rc; \ + \ + return single_open(file, name##_seq_show, \ + inode->i_private ? 
: PDE_DATA(inode)); \ +} \ +static const struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LPROC_SEQ_FOPS_RO(name##_##type) + +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return lprocfs_##type##_seq_write(file, buffer, \ + count, seq->private); \ + } \ + LPROC_SEQ_FOPS(name##_##type); + +#define LPROC_SEQ_FOPS_WO_TYPE(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return lprocfs_##type##_seq_write(file, buffer, count, off);\ + } \ + static int name##_##type##_open(struct inode *inode, struct file *file)\ + { \ + return single_open(file, NULL, \ + inode->i_private ? : PDE_DATA(inode));\ + } \ + static const struct file_operations name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = lprocfs_single_release, \ + }; + +struct lustre_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); +}; + +#define LUSTRE_ATTR(name, mode, show, store) \ +static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) + +#define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) +#define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) + +ssize_t lustre_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + +extern const struct sysfs_ops lustre_sysfs_ops; + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +#ifdef HAVE_SERVER_SUPPORT +/* lprocfs_jobstats.c */ +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount); +void lprocfs_job_stats_fini(struct obd_device *obd); +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn); +int lprocfs_job_interval_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +/* lproc_status.c */ +int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_recovery_time_soft_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_recovery_time_hard_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +int lprocfs_target_instance_seq_show(struct seq_file *m, void *data); +#endif +int 
lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); + +struct root_squash_info; +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name); + +#else /* !CONFIG_PROC_FS */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char *units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(struct proc_dir_entry *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obddev) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obddev) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +#ifdef HAVE_SERVER_SUPPORT +static inline +ssize_t lprocfs_nid_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{return 0;} +static inline +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{return 0;} +static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid) +{ return 0; } +#endif +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) 
+{return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, void *data) +{ return 0; } +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline void lprocfs_remove(struct proc_dir_entry **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_name_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +{ return 0; } +#endif +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_timeouts_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#ifdef HAVE_SERVER_SUPPORT +static inline ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +#endif +static inline ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int +lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } 
+static inline +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) +#define LPROC_SEQ_FOPS_WO_TYPE(name, type) + +/* lprocfs_jobstats.c */ +static inline +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event, + long amount) +{ return 0; } +static inline +void lprocfs_job_stats_fini(struct obd_device *obd) +{ return; } +static inline +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn) +{ return 0; } + + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* LPROCFS_STATUS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h new file mode 100644 index 0000000000000..ae5bb3dde4c82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -0,0 +1,1400 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#include +#include +#include +#include +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. + * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. Objects are indexed + * by their fids (hash table is used for index). + * + * -# caching and life-cycle management. 
+ * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). + * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. this method called after layer has + * been initialized (after LCFG_SETUP stage) and before it starts serving + * user requests. + */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exists. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. On + * server this is not used, as server objects are full identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. 
+ * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Operations specific for particular lu_object. + */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It's responsibility of this method to insert lower-layer + * object(s) it create into appropriate places of object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff method is internally + * consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. + */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. */ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = (1 << 0), + /** this is data device */ + LU_DEVICE_DT = (1 << 1), + /** data device in the client stack */ + LU_DEVICE_CL = (1 << 2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. 
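+ *
+ * A meta-data device type sets LU_DEVICE_MD, a data device type sets
+ * LU_DEVICE_DT, and devices in the client stack set LU_DEVICE_CL; the
+ * helpers lu_device_is_md() and lu_device_is_cl() below test these bits.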
+ */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary pointer to associated obd_type. + */ + struct obd_type *ldt_obd_type; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + atomic_t ldt_device_nr; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Common object attributes. + */ +struct lu_attr { + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + s64 la_mtime; + /** access time in seconds since Epoch */ + s64 la_atime; + /** change time in seconds since Epoch */ + s64 la_ctime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** project id */ + __u32 la_projid; +}; + +/** Bit-mask of valid attributes */ +enum la_valid { + LA_ATIME = 1 << 0, + LA_MTIME = 1 << 1, + LA_CTIME = 1 << 2, + LA_SIZE = 1 << 3, + LA_MODE = 1 << 4, + LA_UID = 1 << 5, + LA_GID = 1 << 6, + LA_BLOCKS = 1 << 7, + LA_TYPE = 1 << 8, + LA_FLAGS = 1 << 9, + LA_NLINK = 1 << 10, + LA_RDEV = 1 << 11, + LA_BLKSIZE = 1 << 12, + LA_KILL_SUID = 1 << 13, + LA_KILL_SGID = 1 << 14, + LA_PROJID = 1 << 15, +}; + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. 
+ */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. + */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Mark this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1, +}; + +enum lu_object_header_attr { + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique with given lu_site. + * + * Note, that object does *not* necessary correspond to the real object in the + * persistent storage: object is an anchor for locking and method calling, so + * it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + */ +struct lu_object_header { + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. Protected by lu_site::ls_guard. + */ + struct hlist_node loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. Never modified once set (except lately + * during object destruction). No locking is necessary. + */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; +}; + +struct fld; + +struct lu_site_bkt_data { + /** + * number of object in this bucket on the lsb_lru list. + */ + long lsb_lru_len; + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()). It is used by lu_object_find() to + * wait before re-trying when object in the process of destruction is + * found in the hash table. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_marche_funebre; +}; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. 
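+ *
+ * A layer normally reaches its site through its device, e.g. (sketch):
+ *
+ * \code
+ * struct lu_site *s = obj->lo_dev->ld_site;
+ * \endcode
+ *
+ * Unused cached objects of a site can be reclaimed with lu_site_purge(),
+ * declared below.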
+ */ +struct lu_site { + /** + * objects hash table + */ + struct cfs_hash *ls_obj_hash; + /** + * index of bucket on hash table while purging + */ + unsigned int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + /** + * Lock to serialize site purge. + */ + struct mutex ls_purge_mutex; + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; + /** + * Pointer to the lu_target for this site. + */ + struct lu_target *ls_tgt; + + /** + * Number of objects in lsb_lru_lists - used for shrinking + */ + struct percpu_counter ls_lru_len_counter; +}; + +static inline struct lu_site_bkt_data * +lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); +} + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. + */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. + * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true if object will not be cached after last reference to it is + * released. 
+ */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, int nr, + int canblock); + +static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s, + int nr) +{ + return lu_site_purge_objects(env, s, nr, 1); +} + +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format "\n", ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. + */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) \ +do { \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. 
+ */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. + * Note: LOHA_EXISTS will be set once some one created the object, + * and it does not needs to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether object on the remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attr of this object. + */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + return o->lo_header->loh_attr; +} + +static inline void lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_add_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ + lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = (1 << 0), + LU_XATTR_CREATE = (1 << 1) +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. + * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). + * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). + * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, set of tags in introduced. Contexts and keys are + * marked with tags. Key value are created only for context whose set + * of tags has non-empty intersection with one for key. Tags are taken + * from enum lu_context_tag. 
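+ *
+ * For example (illustrative tag choice): a key registered with
+ * lct_tags = LCT_MD_THREAD | LCT_DT_THREAD obtains a value in a context
+ * initialized as lu_context_init(ctx, LCT_MD_THREAD), but not in a
+ * client context initialized with LCT_CL_THREAD only.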
+ */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. + */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = 1 << 0, + /** + * Thread on dt server + */ + LCT_DT_THREAD = 1 << 1, + /** + * Thread on client + */ + LCT_CL_THREAD = 1 << 3, + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = 1 << 4, + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = 1 << 5, + /** + * MGS device thread + */ + LCT_MG_THREAD = 1 << 6, + /** + * Context for local operations + */ + LCT_LOCAL = 1 << 7, + /** + * session for server thread + **/ + LCT_SERVER_SESSION = 1 << 8, + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = 1 << 28, + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = 1 << 29, + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = 1 << 30, + /** + * Context should be remembered. + */ + LCT_REMEMBER = 1 << 31, + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. 
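+ *
+ * A typical key is declared with the helper macros defined below, e.g.
+ * (a sketch; "foo" and struct foo_thread_info are illustrative names):
+ *
+ * \code
+ * struct foo_thread_info {
+ *         int fti_counter;
+ * };
+ *
+ * LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ * LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *
+ * // registered once at module load; per-context data is then fetched with:
+ * // lu_context_key_register(&foo_thread_key);
+ * // struct foo_thread_info *info = lu_context_key_get(ctx, &foo_thread_key);
+ * \endcode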
+ * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. Returns pointer to new value of error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when context with previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. + */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + struct module *lct_owner; + /** + * References to this key. For debugging. + */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void* mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + CLASSERT(PAGE_SIZE >= sizeof(*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init {;} /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void* data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod,type); \ + LU_KEY_FINI(mod,type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce (struct lu_context_key *key); +void lu_context_key_revive (struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. + */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) 
\ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) \ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop {;} + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). + */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many (struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +/** + * Validate names (path components) + * + * To be valid \a name must be non-empty, '\0' terminated of length \a + * name_len, and not contain '/'. The maximum length of a name (before + * say -ENAMETOOLONG will be returned) is really controlled by llite + * and the server. We only check for something insane coming from bad + * integer handling here. + */ +static inline bool lu_name_is_valid_2(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + name[name_len] == '\0' && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + +static inline bool lu_name_is_valid(const struct lu_name *ln) +{ + return lu_name_is_valid_2(ln->ln_name, ln->ln_namelen); +} + +#define DNAME "%.*s" +#define PNAME(ln) \ + (lu_name_is_valid(ln) ? 
(ln)->ln_namelen : 0), \ + (lu_name_is_valid(ln) ? (ln)->ln_name : "") + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + size_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len + +/* read buffer params, should be filled out by out */ +struct lu_rdbuf { + /** number of buffers */ + unsigned int rb_nbufs; + /** pointers to buffers */ + struct lu_buf rb_bufs[]; +}; + +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, size_t size); +void lu_buf_realloc(struct lu_buf *buf, size_t size); + +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len); + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; + +static inline bool lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline bool lu_object_is_cl(const struct lu_object *o) +{ + return lu_device_is_cl(o->lo_dev); +} + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_ref.h b/drivers/staging/lustrefsx/lustre/include/lu_ref.h new file mode 100644 index 0000000000000..c7366c0481320 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_ref.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... 
+ * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __FUNCTION__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __FUNCTION__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + +#ifdef USE_LU_REF + +/** + * Data-structure to keep track of references to a given object. This is used + * for debugging. + * + * lu_ref is embedded into an object which other entities (objects, threads, + * etc.) refer to. + */ +struct lu_ref { + /** + * Spin-lock protecting lu_ref::lf_list. + */ + spinlock_t lf_guard; + /** + * List of all outstanding references (each represented by struct + * lu_ref_link), pointing to this object. + */ + struct list_head lf_list; + /** + * # of links. + */ + short lf_refs; + /** + * Flag set when lu_ref_add() failed to allocate lu_ref_link. It is + * used to mask spurious failure of the following lu_ref_del(). + */ + short lf_failed; + /** + * flags - attribute for the lu_ref, for pad and future use. + */ + short lf_flags; + /** + * Where was I initialized? + */ + short lf_line; + const char *lf_func; + /** + * Linkage into a global list of all lu_ref's (lu_ref_refs). 
+ */ + struct list_head lf_linkage; +}; + +struct lu_ref_link { + struct lu_ref *ll_ref; + struct list_head ll_linkage; + const char *ll_scope; + const void *ll_source; +}; + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line); +void lu_ref_fini (struct lu_ref *ref); +#define lu_ref_init(ref) lu_ref_init_loc(ref, __FUNCTION__, __LINE__) + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, const void *source1); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source); + +void lu_ref_print(const struct lu_ref *ref); + +void lu_ref_print_all(void); + +int lu_ref_global_init(void); + +void lu_ref_global_fini(void); + +#else /* !USE_LU_REF */ + +struct lu_ref { +}; + +struct lu_ref_link { +}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline void lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_add_at(struct lu_ref *ref, + struct lu_ref_link *link, + const char *scope, + const void *source) +{ +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} +#endif /* USE_LU_REF */ + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h new file mode 100644 index 0000000000000..0d3ef968923ad --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -0,0 +1,680 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_LU_TARGET_H +#define _LUSTRE_LU_TARGET_H + +#include +#include +#include +#include +#include + +/* Each one represents a distribute transaction replay + * operation, and updates on each MDTs are linked to + * dtr_sub_list */ +struct distribute_txn_replay_req { + /* update record, may be vmalloc'd */ + struct llog_update_record *dtrq_lur; + int dtrq_lur_size; + + /* linked to the distribute transaction replay + * list (tdtd_replay_list) */ + struct list_head dtrq_list; + __u64 dtrq_master_transno; + __u64 dtrq_batchid; + __u64 dtrq_xid; + + /* all of sub updates are linked here */ + struct list_head dtrq_sub_list; + spinlock_t dtrq_sub_list_lock; + + /* If the local update has been executed during replay */ + __u32 dtrq_local_update_executed:1; +}; + +/* Each one represents a sub replay item under a distribute + * transaction. A distribute transaction will be operated in + * two or more MDTs, and updates on each MDT will be represented + * by this structure */ +struct distribute_txn_replay_req_sub { + __u32 dtrqs_mdt_index; + + /* All of cookies for the update will be linked here */ + spinlock_t dtrqs_cookie_list_lock; + struct list_head dtrqs_cookie_list; + struct list_head dtrqs_list; +}; + +struct target_distribute_txn_data; +typedef int (*distribute_txn_replay_handler_t)(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +typedef char *(*target_show_update_logs_retrievers_t)(void *data, int *size, + int *count); +struct target_distribute_txn_data { + /* Distribution ID is used to identify updates log on different + * MDTs for one operation */ + spinlock_t tdtd_batchid_lock; + __u64 tdtd_batchid; + struct lu_target *tdtd_lut; + struct dt_object *tdtd_batchid_obj; + struct dt_device *tdtd_dt; + + /* Committed batchid for distribute transaction */ + __u64 tdtd_committed_batchid; + + /* List for distribute transaction */ + struct list_head tdtd_list; + + /* Threads to manage distribute transaction */ + wait_queue_head_t tdtd_commit_thread_waitq; + atomic_t tdtd_refcount; + + /* recovery update */ + distribute_txn_replay_handler_t tdtd_replay_handler; + struct list_head tdtd_replay_list; + struct list_head tdtd_replay_finish_list; + spinlock_t tdtd_replay_list_lock; + /* last replay update transno */ + __u32 tdtd_replay_ready:1; + + /* Manage the llog recovery threads */ + atomic_t tdtd_recovery_threads_count; + wait_queue_head_t tdtd_recovery_threads_waitq; + target_show_update_logs_retrievers_t + tdtd_show_update_logs_retrievers; + void *tdtd_show_retrievers_cbdata; +}; + +struct tg_grants_data { + /* grants: all values in bytes */ + /* grant lock to protect all grant counters */ + spinlock_t tgd_grant_lock; + /* total amount of dirty data reported by clients in incoming obdo */ + u64 tgd_tot_dirty; + /* sum of filesystem space granted to clients for async writes */ + u64 tgd_tot_granted; + /* grant used by I/Os in progress (between prepare and commit) */ + u64 tgd_tot_pending; + /* number of clients using grants */ + int tgd_tot_granted_clients; + /* shall we grant space to clients not + * supporting OBD_CONNECT_GRANT_PARAM? 
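+	 * A non-zero value in the field below presumably means such
+	 * clients are granted nothing.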
*/ + int tgd_grant_compat_disable; + /* protect all statfs-related counters */ + spinlock_t tgd_osfs_lock; + __u64 tgd_osfs_age; + int tgd_blockbits; + /* counters used during statfs update, protected by ofd_osfs_lock. + * record when some statfs refresh are in progress */ + int tgd_statfs_inflight; + /* writes between prep & commit which might be accounted twice in + * ofd_osfs.os_bavail */ + u64 tgd_osfs_unstable; + /* track writes completed while statfs refresh is underway. + * tracking is only effective when ofd_statfs_inflight > 1 */ + u64 tgd_osfs_inflight; + /* statfs optimization: we cache a bit */ + struct obd_statfs tgd_osfs; +}; + +struct lu_target { + struct obd_device *lut_obd; + struct dt_device *lut_bottom; + struct dt_device_param lut_dt_conf; + + struct target_distribute_txn_data *lut_tdtd; + struct ptlrpc_thread lut_tdtd_commit_thread; + + /* supported opcodes and handlers for this target */ + struct tgt_opc_slice *lut_slice; + __u32 lut_reply_fail_id; + __u32 lut_request_fail_id; + + /* sptlrpc rules */ + rwlock_t lut_sptlrpc_lock; + struct sptlrpc_rule_set lut_sptlrpc_rset; + spinlock_t lut_flags_lock; + unsigned int lut_syncjournal:1, + lut_sync_lock_cancel:2, + /* e.g. OST node */ + lut_no_reconstruct:1; + /** last_rcvd file */ + struct dt_object *lut_last_rcvd; + /* transaction callbacks */ + struct dt_txn_callback lut_txn_cb; + /** server data in last_rcvd file */ + struct lr_server_data lut_lsd; + /** Server last transaction number */ + __u64 lut_last_transno; + /** Lock protecting last transaction number */ + spinlock_t lut_translock; + /** Lock protecting client bitmap */ + spinlock_t lut_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *lut_client_bitmap; + /* Number of clients supporting multiple modify RPCs + * recorded in the bitmap */ + atomic_t lut_num_clients; + /* Client generation to identify client slot reuse */ + atomic_t lut_client_generation; + /** reply_data file */ + struct dt_object *lut_reply_data; + /** Bitmap of used slots in the reply data file */ + unsigned long **lut_reply_bitmap; + /** target sync count, used for debug & test */ + atomic_t lut_sync_count; + + /** cross MDT locks which should trigger Sync-on-Lock-Cancel */ + spinlock_t lut_slc_locks_guard; + struct list_head lut_slc_locks; + + /* target grants fields */ + struct tg_grants_data lut_tgd; +}; + +/* number of slots in reply bitmap */ +#define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) +#define LUT_REPLY_SLOTS_MAX_CHUNKS 16 + +/** + * Target reply data + */ +struct tg_reply_data { + /** chain of reply data anchored in tg_export_data */ + struct list_head trd_list; + /** copy of on-disk reply data */ + struct lsd_reply_data trd_reply; + /** versions for Version Based Recovery */ + __u64 trd_pre_versions[4]; + /** slot index in reply_data file */ + int trd_index; + /** tag the client used */ + __u16 trd_tag; +}; + +extern struct lu_context_key tgt_session_key; + +struct tgt_session_info { + /* + * The following members will be filled explicitly + * with specific data in tgt_ses_init(). + */ + struct req_capsule *tsi_pill; + + /* + * Lock request for "habeo clavis" operations. + */ + struct ldlm_request *tsi_dlm_req; + + /* although we have export in req, there are cases when it is not + * available, e.g. 
closing files upon export destroy */ + struct obd_export *tsi_exp; + const struct lu_env *tsi_env; + struct lu_target *tsi_tgt; + + const struct mdt_body *tsi_mdt_body; + struct ost_body *tsi_ost_body; + struct lu_object *tsi_corpus; + + struct lu_fid tsi_fid; + struct ldlm_res_id tsi_resid; + + /* object affected by VBR, for last_rcvd_update */ + struct dt_object *tsi_vbr_obj; + /* opdata for mdt_reint_open(), has the same value as + * ldlm_reply:lock_policy_res1. The tgt_update_last_rcvd() stores + * this value onto disk for recovery when tgt_txn_stop_cb() is called. + */ + __u64 tsi_opdata; + + /* + * Additional fail id that can be set by handler. + */ + int tsi_reply_fail_id; + bool tsi_preprocessed; + /* request JobID */ + char *tsi_jobid; + + /* update replay */ + __u64 tsi_xid; + __u32 tsi_result; + __u32 tsi_client_gen; +}; + +static inline struct tgt_session_info *tgt_ses_info(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + + LASSERT(env->le_ses != NULL); + tsi = lu_context_key_get(env->le_ses, &tgt_session_key); + LASSERT(tsi); + return tsi; +} + +static inline void tgt_vbr_obj_set(const struct lu_env *env, + struct dt_object *obj) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_vbr_obj = obj; + } +} + +static inline void tgt_opdata_set(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata |= flags; + } +} + +static inline void tgt_opdata_clear(const struct lu_env *env, __u64 flags) +{ + struct tgt_session_info *tsi; + + if (env->le_ses != NULL) { + tsi = tgt_ses_info(env); + tsi->tsi_opdata &= ~flags; + } +} + +/* + * Generic unified target support. + */ +enum tgt_handler_flags { + /* + * struct *_body is passed in the incoming message, and object + * identified by this fid exists on disk. + * * + * "habeo corpus" == "I have a body" + */ + HABEO_CORPUS = (1 << 0), + /* + * struct ldlm_request is passed in the incoming message. + * + * "habeo clavis" == "I have a key" + * */ + HABEO_CLAVIS = (1 << 1), + /* + * this request has fixed reply format, so that reply message can be + * packed by generic code. + * + * "habeo refero" == "I have a reply" + */ + HABEO_REFERO = (1 << 2), + /* + * this request will modify something, so check whether the file system + * is readonly or not, then return -EROFS to client asap if necessary. + * + * "mutabor" == "I shall modify" + */ + MUTABOR = (1 << 3) +}; + +struct tgt_handler { + /* The name of this handler. */ + const char *th_name; + /* Fail id, check at the beginning */ + int th_fail_id; + /* Operation code */ + __u32 th_opc; + /* Flags in enum tgt_handler_flags */ + __u32 th_flags; + /* Request version for this opcode */ + int th_version; + /* Handler function */ + int (*th_act)(struct tgt_session_info *tsi); + /* Handler function for high priority requests */ + void (*th_hp)(struct tgt_session_info *tsi); + /* Request format for this request */ + const struct req_format *th_fmt; +}; + +struct tgt_opc_slice { + __u32 tos_opc_start; /* First op code */ + __u32 tos_opc_end; /* Last op code */ + struct tgt_handler *tos_hs; /* Registered handler */ +}; + +static inline struct ptlrpc_request *tgt_ses_req(struct tgt_session_info *tsi) +{ + return tsi->tsi_pill ? 
tsi->tsi_pill->rc_req : NULL; +} + +static inline __u64 tgt_conn_flags(struct tgt_session_info *tsi) +{ + LASSERT(tsi->tsi_exp); + return exp_connect_flags(tsi->tsi_exp); +} + +static inline int req_is_replay(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + return !!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); +} + +static inline bool tgt_is_multimodrpcs_client(struct obd_export *exp) +{ + return exp_connect_flags(exp) & OBD_CONNECT_MULTIMODRPCS; +} + + +/* target/tgt_handler.c */ +int tgt_request_handle(struct ptlrpc_request *req); +char *tgt_name(struct lu_target *tgt); +void tgt_counter_incr(struct obd_export *exp, int opcode); +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, + struct obd_export *exp); +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt); +int tgt_connect(struct tgt_session_info *tsi); +int tgt_disconnect(struct tgt_session_info *uti); +int tgt_obd_ping(struct tgt_session_info *tsi); +int tgt_enqueue(struct tgt_session_info *tsi); +int tgt_convert(struct tgt_session_info *tsi); +int tgt_bl_callback(struct tgt_session_info *tsi); +int tgt_cp_callback(struct tgt_session_info *tsi); +int tgt_llog_open(struct tgt_session_info *tsi); +int tgt_llog_close(struct tgt_session_info *tsi); +int tgt_llog_destroy(struct tgt_session_info *tsi); +int tgt_llog_read_header(struct tgt_session_info *tsi); +int tgt_llog_next_block(struct tgt_session_info *tsi); +int tgt_llog_prev_block(struct tgt_session_info *tsi); +int tgt_sec_ctx_init(struct tgt_session_info *tsi); +int tgt_sec_ctx_init_cont(struct tgt_session_info *tsi); +int tgt_sec_ctx_fini(struct tgt_session_info *tsi); +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob); +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf); +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa); +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end); + +int tgt_io_thread_init(struct ptlrpc_thread *thread); +void tgt_io_thread_done(struct ptlrpc_thread *thread); + +int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + __u64 start, __u64 end, struct lustre_handle *lh, + int mode, __u64 *flags); +void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct obd_ioobj *obj, struct niobuf_remote *nb, + struct lustre_handle *lh, enum ldlm_mode mode); +void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_brw_read(struct tgt_session_info *tsi); +int tgt_brw_write(struct tgt_session_info *tsi); +int tgt_hpreq_handler(struct ptlrpc_request *req); +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)); +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)); +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)); +bool req_can_reconstruct(struct ptlrpc_request *req, struct tg_reply_data *trd); + +extern struct tgt_handler tgt_sec_ctx_handlers[]; +extern struct tgt_handler tgt_lfsck_handlers[]; +extern struct tgt_handler tgt_obd_handlers[]; +extern struct tgt_handler tgt_dlm_handlers[]; +extern struct tgt_handler tgt_llog_handlers[]; +extern struct tgt_handler 
tgt_out_handlers[]; +extern struct tgt_handler fld_handlers[]; +extern struct tgt_handler seq_handlers[]; + +typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno, + void *data, int err); +struct tgt_commit_cb { + tgt_cb_t tgt_cb_func; + void *tgt_cb_data; +}; + +int tgt_hpreq_handler(struct ptlrpc_request *req); + +/* target/tgt_main.c */ +void tgt_boot_epoch_update(struct lu_target *lut); +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno); +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock); +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, + int request_fail_id, int reply_fail_id); +void tgt_fini(const struct lu_env *env, struct lu_target *lut); +int tgt_client_alloc(struct obd_export *exp); +void tgt_client_free(struct obd_export *exp); +int tgt_client_del(const struct lu_env *env, struct obd_export *exp); +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int); +int tgt_client_new(const struct lu_env *env, struct obd_export *exp); +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, + int sync); +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt); +bool tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd); +int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct tg_reply_data *trd, + struct thandle *th, bool update_lrd_file); +struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, + __u64 xid); + +/* target/tgt_grant.c */ +static inline int exp_grant_param_supp(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_GRANT_PARAM); +} + +/* Blocksize used for client not supporting OBD_CONNECT_GRANT_PARAM. + * That's 4KB=2^12 which is the biggest block size known to work whatever + * the client's page size is. 
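+ *
+ * A minimal sketch of how such a compat block size is typically applied
+ * (hypothetical helper, not declared in this header):
+ * \code
+ *	// Round a byte count up to a whole number of 4KB compat blocks.
+ *	static inline __u64 grant_compat_round_up(__u64 bytes)
+ *	{
+ *		__u64 bsize = 1ULL << COMPAT_BSIZE_SHIFT;
+ *
+ *		return (bytes + bsize - 1) & ~(bsize - 1);
+ *	}
+ * \endcode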
*/ +#define COMPAT_BSIZE_SHIFT 12 + +void tgt_grant_sanity_check(struct obd_device *obd, const char *func); +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn); +void tgt_grant_discard(struct obd_export *exp); +void tgt_grant_prepare_read(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); +void tgt_grant_prepare_write(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount); +void tgt_grant_commit(struct obd_export *exp, unsigned long grant_used, int rc); +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long grant); +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, + s64 *nr); +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, __u64 max_age, + int *from_cache); + +/* target/update_trans.c */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index); +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd); + +/* target/update_recovery.c */ +int insert_update_records_to_replay_list(struct target_distribute_txn_data *, + struct llog_update_record *, + struct llog_cookie *, __u32); +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, + unsigned int mask); +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd); +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq); +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd); +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd); +void dtrq_destroy(struct distribute_txn_replay_req *dtrq); +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index); +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno); +bool is_req_replayed_by_update(struct ptlrpc_request *req); +enum { + ESERIOUS = 0x0001000 +}; + +static inline int err_serious(int rc) +{ + LASSERT(rc < 0); + return -(-rc | ESERIOUS); +} + +static inline int clear_serious(int rc) +{ + if (rc < 0) + rc = -(-rc & ~ESERIOUS); + return rc; +} + +static inline int is_serious(int rc) +{ + return (rc < 0 && -rc & ESERIOUS); +} + +/* + * Unified target generic handers macros and generic functions. 
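+ *
+ * The intended pattern is roughly the following (an illustrative sketch:
+ * the table names are hypothetical, and the flags, opcode and handler are
+ * only examples of how the macros below compose):
+ * \code
+ *	static struct tgt_handler foo_obd_handlers[] = {
+ *		TGT_OBD_HDL(0, OBD_PING, tgt_obd_ping),
+ *	};
+ *
+ *	static struct tgt_opc_slice foo_common_slice[] = {
+ *		{
+ *			.tos_opc_start	= OBD_FIRST_OPC,
+ *			.tos_opc_end	= OBD_LAST_OPC,
+ *			.tos_hs		= foo_obd_handlers,
+ *		},
+ *		{ 0 }
+ *	};
+ * \endcode
+ * A slice array like this is what a target hands to tgt_init() through its
+ * \a slice argument.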
+ */ +#define TGT_RPC_HANDLER_HP(base, flags, opc, fn, hp, fmt, version) \ +[opc - base] = { \ + .th_name = #opc, \ + .th_fail_id = OBD_FAIL_ ## opc ## _NET, \ + .th_opc = opc, \ + .th_flags = flags, \ + .th_act = fn, \ + .th_fmt = fmt, \ + .th_version = version, \ + .th_hp = hp, \ +} +#define TGT_RPC_HANDLER(base, flags, opc, fn, fmt, version) \ + TGT_RPC_HANDLER_HP(base, flags, opc, fn, NULL, fmt, version) + +/* MDT Request with a format known in advance */ +#define TGT_MDT_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) +/* Request with a format we do not yet know */ +#define TGT_MDT_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(MDS_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_MDS_VERSION) + +/* OST Request with a format known in advance */ +#define TGT_OST_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OST_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_OST_VERSION) +#define TGT_OST_HDL_HP(flags, name, fn, hp) \ + TGT_RPC_HANDLER_HP(OST_FIRST_OPC, flags, name, fn, hp, \ + &RQF_ ## name, LUSTRE_OST_VERSION) + +/* MGS request with a format known in advance */ +#define TGT_MGS_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MGS_VERSION) +#define TGT_MGS_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(MGS_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_MGS_VERSION) + +/* + * OBD handler macros and generic functions. + */ +#define TGT_OBD_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_OBD_VERSION) +#define TGT_OBD_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(OBD_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_OBD_VERSION) + +/* + * DLM handler macros and generic functions. + */ +#define TGT_DLM_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_DLM_VERSION) +#define TGT_DLM_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LDLM_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_DLM_VERSION) + +/* + * LLOG handler macros and generic functions. + */ +#define TGT_LLOG_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_LOG_VERSION) +#define TGT_LLOG_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LLOG_FIRST_OPC, flags, name, fn, &RQF_ ## name, \ + LUSTRE_LOG_VERSION) + +/* + * Sec context handler macros and generic functions. 
+ */ +#define TGT_SEC_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(SEC_FIRST_OPC, flags, name, fn, NULL, \ + LUSTRE_OBD_VERSION) + +#define TGT_QUOTA_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(QUOTA_DQACQ, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +/* Sequence service handlers */ +#define TGT_SEQ_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(SEQ_QUERY, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +/* FID Location Database handlers */ +#define TGT_FLD_HDL_VAR(flags, name, fn) \ + TGT_RPC_HANDLER(FLD_QUERY, flags, name, fn, NULL, \ + LUSTRE_MDS_VERSION) + +/* LFSCK handlers */ +#define TGT_LFSCK_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(LFSCK_FIRST_OPC, flags, name, fn, \ + &RQF_ ## name, LUSTRE_OBD_VERSION) + +/* Request with a format known in advance */ +#define TGT_UPDATE_HDL(flags, name, fn) \ + TGT_RPC_HANDLER(OUT_UPDATE, flags, name, fn, &RQF_ ## name, \ + LUSTRE_MDS_VERSION) + +#endif /* __LUSTRE_LU_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h new file mode 100644 index 0000000000000..9b73278254206 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/libiam.h @@ -0,0 +1,141 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di + * Author: Nikita Danilov + * Author: Fan Yong + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Creat an iam file, but do NOT open it. + * Return 0 if success, else -1. + */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT creat it if the file doesn't exist. + * Please use iam_creat for creating the file before use iam_open. + * Return file id (fd) if success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. 
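+ *
+ * A typical call sequence looks roughly like this (illustrative only; the
+ * path, geometry values and key/record buffers are made up):
+ * \code
+ *	struct iam_uapi_info ua;
+ *	char keybuf[8] = "key", recbuf[8] = "rec";
+ *	int fd;
+ *
+ *	iam_creat("/mnt/lustre/iam_obj", FMT_LFIX, 4096, 8, 8, 4);
+ *	fd = iam_open("/mnt/lustre/iam_obj", &ua);
+ *	iam_insert(fd, &ua, 0, keybuf, 0, recbuf);
+ *	iam_close(fd);
+ * \endcode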
+ */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h new file mode 100644 index 0000000000000..ec64bb610b825 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/liblustreapi.h @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * NOTE: This file is DEPRECATED! Please include lustreapi.h directly + * instead of this file. This file will be removed from a future version + * of lustre! + */ + +#ifndef _LIBLUSTREAPI_H_ +#define _LIBLUSTREAPI_H_ + +#include +#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly." + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 0000000000000..cee135bf3c74f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#ifndef __KERNEL__ +#include +#include +#endif + +/* XXX: We use fiemap_extent::fe_reserved[0] */ +#define fe_device fe_reserved[0] + +static inline size_t fiemap_count_to_size(size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned fiemap_size_to_count(size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..e69bdc2795e56 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. 
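+ *
+ * A minimal sketch of how the control structure below is filled in
+ * (illustrative; the ioctl plumbing that carries it to the MDT is omitted):
+ * \code
+ *	struct barrier_ctl bc = {
+ *		.bc_version = BARRIER_VERSION_V1,
+ *		.bc_cmd     = BC_FREEZE,
+ *		.bc_timeout = BARRIER_TIMEOUT_DEFAULT,
+ *	};
+ * \endcode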
+ * + * Author: Fan, Yong + */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H + +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; + +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; + +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h new file mode 100644 index 0000000000000..fe9ccd2e07a82 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#ifndef LUSTRE_ERRNO_H +#define LUSTRE_ERRNO_H + +/* + * Only "network" errnos, which are defined below, are allowed on wire (or on + * disk). Generic routines exist to help translate between these and a subset + * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally + * left out. See also the comment on lustre_errno_hton_mapping[]. + * + * To maintain compatibility with existing x86 clients and servers, each of + * these network errnos has the same numerical value as its corresponding host + * errno on x86. 
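+ *
+ * Translation is a plain two-way mapping, e.g. (illustrative; EPROTO is
+ * just an arbitrary host errno picked for the example):
+ * \code
+ *	__u32 wire = lustre_errno_hton(EPROTO);		// host value -> wire value
+ *	unsigned int host = lustre_errno_ntoh(wire);	// and back to host
+ * \endcode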
+ */ +#define LUSTRE_EPERM 1 /* Operation not permitted */ +#define LUSTRE_ENOENT 2 /* No such file or directory */ +#define LUSTRE_ESRCH 3 /* No such process */ +#define LUSTRE_EINTR 4 /* Interrupted system call */ +#define LUSTRE_EIO 5 /* I/O error */ +#define LUSTRE_ENXIO 6 /* No such device or address */ +#define LUSTRE_E2BIG 7 /* Argument list too long */ +#define LUSTRE_ENOEXEC 8 /* Exec format error */ +#define LUSTRE_EBADF 9 /* Bad file number */ +#define LUSTRE_ECHILD 10 /* No child processes */ +#define LUSTRE_EAGAIN 11 /* Try again */ +#define LUSTRE_ENOMEM 12 /* Out of memory */ +#define LUSTRE_EACCES 13 /* Permission denied */ +#define LUSTRE_EFAULT 14 /* Bad address */ +#define LUSTRE_ENOTBLK 15 /* Block device required */ +#define LUSTRE_EBUSY 16 /* Device or resource busy */ +#define LUSTRE_EEXIST 17 /* File exists */ +#define LUSTRE_EXDEV 18 /* Cross-device link */ +#define LUSTRE_ENODEV 19 /* No such device */ +#define LUSTRE_ENOTDIR 20 /* Not a directory */ +#define LUSTRE_EISDIR 21 /* Is a directory */ +#define LUSTRE_EINVAL 22 /* Invalid argument */ +#define LUSTRE_ENFILE 23 /* File table overflow */ +#define LUSTRE_EMFILE 24 /* Too many open files */ +#define LUSTRE_ENOTTY 25 /* Not a typewriter */ +#define LUSTRE_ETXTBSY 26 /* Text file busy */ +#define LUSTRE_EFBIG 27 /* File too large */ +#define LUSTRE_ENOSPC 28 /* No space left on device */ +#define LUSTRE_ESPIPE 29 /* Illegal seek */ +#define LUSTRE_EROFS 30 /* Read-only file system */ +#define LUSTRE_EMLINK 31 /* Too many links */ +#define LUSTRE_EPIPE 32 /* Broken pipe */ +#define LUSTRE_EDOM 33 /* Math argument out of domain of + func */ +#define LUSTRE_ERANGE 34 /* Math result not representable */ +#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ +#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ +#define LUSTRE_ENOLCK 37 /* No record locks available */ +#define LUSTRE_ENOSYS 38 /* Function not implemented */ +#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ +#define LUSTRE_ELOOP 40 /* Too many symbolic links + encountered */ +#define LUSTRE_ENOMSG 42 /* No message of desired type */ +#define LUSTRE_EIDRM 43 /* Identifier removed */ +#define LUSTRE_ECHRNG 44 /* Channel number out of range */ +#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ +#define LUSTRE_EL3HLT 46 /* Level 3 halted */ +#define LUSTRE_EL3RST 47 /* Level 3 reset */ +#define LUSTRE_ELNRNG 48 /* Link number out of range */ +#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ +#define LUSTRE_ENOCSI 50 /* No CSI structure available */ +#define LUSTRE_EL2HLT 51 /* Level 2 halted */ +#define LUSTRE_EBADE 52 /* Invalid exchange */ +#define LUSTRE_EBADR 53 /* Invalid request descriptor */ +#define LUSTRE_EXFULL 54 /* Exchange full */ +#define LUSTRE_ENOANO 55 /* No anode */ +#define LUSTRE_EBADRQC 56 /* Invalid request code */ +#define LUSTRE_EBADSLT 57 /* Invalid slot */ +#define LUSTRE_EBFONT 59 /* Bad font file format */ +#define LUSTRE_ENOSTR 60 /* Device not a stream */ +#define LUSTRE_ENODATA 61 /* No data available */ +#define LUSTRE_ETIME 62 /* Timer expired */ +#define LUSTRE_ENOSR 63 /* Out of streams resources */ +#define LUSTRE_ENONET 64 /* Machine is not on the network */ +#define LUSTRE_ENOPKG 65 /* Package not installed */ +#define LUSTRE_EREMOTE 66 /* Object is remote */ +#define LUSTRE_ENOLINK 67 /* Link has been severed */ +#define LUSTRE_EADV 68 /* Advertise error */ +#define LUSTRE_ESRMNT 69 /* Srmount error */ +#define LUSTRE_ECOMM 70 /* Communication error on send */ +#define 
LUSTRE_EPROTO 71 /* Protocol error */ +#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ +#define LUSTRE_EDOTDOT 73 /* RFS specific error */ +#define LUSTRE_EBADMSG 74 /* Not a data message */ +#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data + type */ +#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ +#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ +#define LUSTRE_EREMCHG 78 /* Remote address changed */ +#define LUSTRE_ELIBACC 79 /* Can not access a needed shared + library */ +#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared + library */ +#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ +#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared + libraries */ +#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library + directly */ +#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ +#define LUSTRE_ERESTART 85 /* Interrupted system call should be + restarted */ +#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ +#define LUSTRE_EUSERS 87 /* Too many users */ +#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ +#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ +#define LUSTRE_EMSGSIZE 90 /* Message too long */ +#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ +#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ +#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport + endpoint */ +#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ +#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by + protocol */ +#define LUSTRE_EADDRINUSE 98 /* Address already in use */ +#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define LUSTRE_ENETDOWN 100 /* Network is down */ +#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ +#define LUSTRE_ENETRESET 102 /* Network dropped connection because of + reset */ +#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ +#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ +#define LUSTRE_ENOBUFS 105 /* No buffer space available */ +#define LUSTRE_EISCONN 106 /* Transport endpoint is already + connected */ +#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not + connected */ +#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint + shutdown */ +#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ +#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ +#define LUSTRE_EHOSTDOWN 112 /* Host is down */ +#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ +#define LUSTRE_EALREADY 114 /* Operation already in progress */ +#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ +#define LUSTRE_ESTALE 116 /* Stale NFS file handle */ +#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ +#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ +#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ +#define LUSTRE_EISNAM 120 /* Is a named type file */ +#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ +#define LUSTRE_EDQUOT 122 /* Quota exceeded */ +#define LUSTRE_ENOMEDIUM 123 /* No medium found */ +#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ +#define LUSTRE_ECANCELED 125 /* Operation Canceled */ +#define LUSTRE_ENOKEY 126 /* Required key not available */ +#define LUSTRE_EKEYEXPIRED 127 /* 
Key has expired */ +#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ +#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ +#define LUSTRE_EOWNERDEAD 130 /* Owner died */ +#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ +#define LUSTRE_ERESTARTSYS 512 +#define LUSTRE_ERESTARTNOINTR 513 +#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ +#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ +#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling + sys_restart_syscall */ +#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ +#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ +#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ +#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ +#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ +#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ +#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ +#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not + complete before timeout */ +#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion + event */ + +/* + * Translations are optimized away on x86. Host errnos that shouldn't be put + * on wire could leak through as a result. Do not count on this side effect. + */ +#if !defined(__x86_64__) && !defined(__i386__) +#define LUSTRE_TRANSLATE_ERRNOS +#endif + +#ifdef LUSTRE_TRANSLATE_ERRNOS +unsigned int lustre_errno_hton(unsigned int h); +unsigned int lustre_errno_ntoh(unsigned int n); +#else +#define lustre_errno_hton(h) (h) +#define lustre_errno_ntoh(n) (n) +#endif + +#endif /* LUSTRE_ERRNO_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h new file mode 100644 index 0000000000000..e40b90ec65a20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h @@ -0,0 +1,3486 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_idl.h + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. Structs + * that are used in interfaces with userspace should go in lustre_user.h. 
+ * + * All structs being declared here should be built from simple fixed-size + * types (__u8, __u16, __u32, __u64) or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures. Nothing that + * depends on external functions or definitions should be in here. + * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#include +#include + +#include +#include /* Defn's shared with user-space. */ +#include + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +//#define OSC_REQUEST_PORTAL 3 +#define OSC_REPLY_PORTAL 4 +//#define OSC_BULK_PORTAL 5 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +//#define MDC_REQUEST_PORTAL 9 +#define MDC_REPLY_PORTAL 10 +//#define MDC_BULK_PORTAL 11 +#define MDS_REQUEST_PORTAL 12 +//#define MDS_REPLY_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 +#define MDS_SETATTR_PORTAL 22 +#define MDS_READPAGE_PORTAL 23 +#define OUT_PORTAL 24 +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 + +/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ + +/* packet types */ +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + +/* DON'T use swabbed values of MAGIC as magic! */ +#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 +#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B + +#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 + +#define PTLRPC_MSG_VERSION 0x00000003 +#define LUSTRE_VERSION_MASK 0xffff0000 +#define LUSTRE_OBD_VERSION 0x00010000 +#define LUSTRE_MDS_VERSION 0x00020000 +#define LUSTRE_OST_VERSION 0x00030000 +#define LUSTRE_DLM_VERSION 0x00040000 +#define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. 
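+ * For example, a range with lsr_start = N and lsr_end = N + 1 contains
+ * exactly one sequence, N.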
+ * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +struct lu_seq_range_array { + __u32 lsra_count; + __u32 lsra_padding; + struct lu_seq_range lsra_lsr[0]; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +/** \defgroup lu_fid lu_fid + * @{ */ + +extern void lustre_lma_swab(struct lustre_mdt_attrs *lma); +extern void lustre_lma_init(struct lustre_mdt_attrs *lma, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); +extern void lustre_loa_swab(struct lustre_ost_attrs *loa, + bool to_cpu); +extern void lustre_loa_init(struct lustre_ost_attrs *loa, + const struct lu_fid *fid, + __u32 compat, __u32 incompat); + +/* copytool uses a 32b bitmask field to encode archive-Ids during register + * with MDT thru kuc. + * archive num = 0 => all + * archive num from 1 to 32 + */ +#define LL_HSM_MAX_ARCHIVE (sizeof(__u32) * 8) + +/** + * HSM on-disk attributes stored in a separate xattr. + */ +struct hsm_attrs { + /** Bitfield for supported data in this structure. For future use. */ + __u32 hsm_compat; + + /** HSM flags, see hsm_flags enum below */ + __u32 hsm_flags; + /** backend archive id associated with the file */ + __u64 hsm_arch_id; + /** version associated with the last archiving, if any */ + __u64 hsm_arch_ver; +}; +extern void lustre_hsm_swab(struct hsm_attrs *attrs); + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + * + * FID: + * File IDentifier generated by client from range allocated by the seq service. + * First 0x400 sequences [2^33, 2^33 + 0x400] are reserved for system use. Note + * that on ldiskfs MDTs that IGIF FIDs can use inode numbers starting at 12, + * but this is in the IGIF SEQ rangeand does not conflict with assigned FIDs. + * + * IGIF: + * Inode and Generation In FID, a surrogate FID used to globally identify an + * existing object on OLD formatted MDT file system. This would only be used on + * MDT0 in a DNE filesystem, because there are not expected to be any OLD + * formatted DNE filesystems. Belongs to a sequence in [12, 2^32 - 1] range, + * where sequence number is inode number, and inode generation is used as OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. NOTE: This assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF: + * Object ID in FID, a surrogate FID used to globally identify an existing + * object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object ID. + * The generation of unique SEQ values per OST allows the IDIF FIDs to be + * identified in the FLD correctly. The OID field is calculated as: + * objid & 0xffffffff + * that is, it consists of lower 32 bits of object ID. NOTE This assumes that + * no more than 2^48-1 objects have ever been created on an OST, and that no + * more than 65535 OSTs are in use. 
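+ * (Worked example with made-up numbers: for ost_index = 2 and
+ * objid = 0x123456789ULL this gives SEQ = 0x100000000 | (2 << 16) | 0x1 =
+ * 0x100020001 and OID = 0x23456789.)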
Both are very reasonable assumptions (can + * uniquely map all objects on an OST that created 1M objects per second for 9 + * years, or combinations thereof). + * + * OST_MDT0: + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved sequence 0, and is used internally prior + * to the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG: + * For Lustre Log objects the object sequence 1 is used. This is compatible with + * both OLD and NEW.1 namespaces, as this SEQ number is in the ext3/ldiskfs + * reserved inode range and does not conflict with IGIF sequence numbers. + * + * ECHO: + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW.1 namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF sequence + * numbers. + * + * OST_MDT1 .. OST_MAX: + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total of 8 + * MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any production + * DNE release, as the objects in this range conflict across all OSTs, as the + * OST index is not part of the FID. + * + * + * For compatibility with existing OLD OST network protocol structures, the FID + * must map onto the o_id and o_gr in a manner that ensures existing objects are + * identified consistently for IO, as well as onto the lock namespace to ensure + * both IDIFs map onto the same objects for IO as well as resources in the DLM. + * + * DLM OLD OBIF/IDIF: + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * DLM NEW.1 FID (this is the same for both the MDT and OST): + * resource[] = {SEQ, OID, VER, HASH}; + * + * Note that for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, in + * all production releases the OLD o_seq field is always zero, and all valid FID + * OID values are non-zero, so the lock resources will not collide. + * + * For objects within the IDIF range, group extraction (non-CMD) will be: + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + */ + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_UNUSED_START = 3, + FID_SEQ_UNUSED_END = 9, + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 
1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. */ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_LAYOUT_RBTREE = 0x200000008ULL, + /* sequence is used for update logs of cross-MDT operation */ + FID_SEQ_UPDATE_LOG = 0x200000009ULL, + /* Sequence is used for the directory under which update logs + * are created. */ + FID_SEQ_UPDATE_LOG_DIR = 0x20000000aULL, + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, + FID_OID_DOT_LUSTRE_LPF = 3UL, +}; + +/** OID for FID_SEQ_ROOT */ +enum root_oid { + FID_OID_ROOT = 1UL, + FID_OID_ECHO_ROOT = 2UL, +}; + +struct lu_orphan_rec { + /* The MDT-object's FID referenced by the orphan OST-object */ + struct lu_fid lor_fid; + __u32 lor_uid; + __u32 lor_gid; +}; + +struct lu_orphan_ent { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec loe_rec; +}; + +struct lu_orphan_rec_v2 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + __u32 lor_padding; +}; + +struct lu_orphan_ent_v2 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v2 loe_rec; +}; + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Something in the record is unknown, to be verified in further. */ + LUDA_UNKNOWN = 0x0400, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, +}; + +#define LU_DIRENT_ATTRS_MASK 0xff00 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. */ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. 
*/ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. + * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (__le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (__le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + __le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +{ + size_t size; + + if (attr & LUDA_TYPE) { + const size_t align = sizeof(struct luda_type) - 1; + size = (sizeof(struct lu_dirent) + namelen + align) & ~align; + size += sizeof(struct luda_type); + } else + size = sizeof(struct lu_dirent) + namelen; + + return (size + 7) & ~7; +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_SIZE differ. 
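+ * As a concrete illustration (PAGE_SHIFT = 16, i.e. 64KiB client pages, is
+ * just an example value): LU_PAGE_SIZE is fixed at 1 << 12 = 4096 bytes, so
+ * LU_PAGE_COUNT = 1 << (16 - 12) = 16 lu_dirpages fit in each client VM page.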
+ */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline bool lustre_handle_is_used(const struct lustre_handle *lh) +{ + return lh->cookie != 0; +} + +static inline bool lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + const struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +struct lustre_handle_array { + unsigned int count; + struct lustre_handle handles[0]; +}; + +/* flags for lm_flags */ +#define MSGHDR_AT_SUPPORT 0x1 +#define MSGHDR_CKSUM_INCOMPAT18 0x2 + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; + __u32 lm_secflvr; + __u32 lm_magic; + __u32 lm_repsize; + __u32 lm_cksum; + __u32 lm_flags; + __u32 lm_padding_2; + __u32 lm_padding_3; + __u32 lm_buflens[0]; +}; + +/* without gss, ptlrpc_body is put at the first buffer. */ +#define PTLRPC_NUM_VERSIONS 4 +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ + __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< match bits for bulk request */ + /* padding for future needs */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; + char pb_jobid[LUSTRE_JOBID_SIZE]; +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ + __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u16 pb_padding0; + __u32 pb_padding1; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + __u64 pb_mbits; /**< unused in V2 */ + /* padding for future needs */ + __u64 pb_padding64_0; + __u64 pb_padding64_1; + __u64 pb_padding64_2; +}; + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 
/* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Flags that are operation-specific go in the top 16 bits. */ +#define MSG_OP_FLAG_MASK 0xffff0000 +#define MSG_OP_FLAG_SHIFT 16 + +/* Flags that apply to all requests are in the bottom 16 bits */ +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. */ +#define MSG_DELAY_REPLAY 0x0010 +#define MSG_VERSION_REPLAY 0x0020 +#define MSG_REQ_REPLAY_DONE 0x0040 +#define MSG_LOCK_REPLAY_DONE 0x0080 + +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 +//#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_LARGE_ACL 0x200ULL /* more than 32 ACL entries */ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/ +#define OBD_CONNECT_BARRIER 0x2000ULL /* write barrier */ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /* Remote client, never used + * in production. Removed in + * 2.9. Keep this flag to + * avoid reusing. + */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /* Remote client by force, + * never used in production. + * Removed in 2.9. Keep this + * flag to avoid reusing. + */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. 
*/ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /* obsolete since 2.8 */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ +#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* improved flock deadlock detection */ +#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/ +#define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack + name in request */ +#define OBD_CONNECT_LFSCK 0x40000000000000ULL/* support online LFSCK */ +#define OBD_CONNECT_UNLINK_CLOSE 0x100000000000000ULL/* close file in unlink */ +#define OBD_CONNECT_MULTIMODRPCS 0x200000000000000ULL /* support multiple modify + RPCs in parallel */ +#define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ +#define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ +#define OBD_CONNECT_LOCK_AHEAD 0x1000000000000000ULL /* lock ahead */ +/** bulk matchbits is sent within ptlrpc_body */ +#define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL +#define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ +#define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ +/* ocd_connect_flags2 flags */ +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ + +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. Please clear any such + * changes with senior engineers before starting to use a new flag. 
Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the + * flag to check_obd_connect_data(), and updates wiretests accordingly, so it + * can be approved and landed easily to reserve the flag for future use. */ + +/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS + * connection. It is a temporary bug fix for Imperative Recovery interop + * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for + * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. */ +#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#ifdef HAVE_LRU_RESIZE_SUPPORT +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE +#else +#define LRU_RESIZE_CONNECT_FLAG 0 +#endif + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | \ + OBD_CONNECT_ATTRFID | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | OBD_CONNECT_BRW_SIZE | \ + OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_VBR | \ + OBD_CONNECT_LOV_V3 | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\ + OBD_CONNECT_FLOCK_DEAD | \ + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_OPEN_BY_FID | \ + OBD_CONNECT_DIR_STRIPE | \ + OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_MULTIMODRPCS | \ + OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \ + OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX + +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | \ + OBD_CONNECT_AT | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_CKSUM | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 |\ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ + OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_GRANT_PARAM) +#define OST_CONNECT_SUPPORTED2 0 + +#define ECHO_CONNECT_SUPPORTED 0 +#define ECHO_CONNECT_SUPPORTED2 0 + +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS |\ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER) + +#define MGS_CONNECT_SUPPORTED2 0 + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ + OBD_CONNECT_FULL20) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. 
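+ *
+ * As an illustrative usage example of the rule noted inside the struct below
+ * (fields past ocd_maxbytes are only valid when their connect flag is set):
+ * a receiver would check e.g. OCD_HAS_FLAG(ocd, MULTIMODRPCS) before reading
+ * ocd_maxmodrpcs, and OCD_HAS_FLAG(ocd, FLAGS2) before reading
+ * ocd_connect_flags2.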
*/ +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_grant_blkbits; /* log2 of the backend filesystem blocksize */ + __u8 ocd_grant_inobits; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_tax_kb; /* extent insertion overhead, in 1K blocks */ + __u32 ocd_grant_max_blks;/* maximum number of blocks per extent */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u16 ocd_maxmodrpcs; /* Maximum modify RPCs in parallel */ + __u16 padding0; /* added 2.1.0. also fix lustre_swab_connect */ + __u32 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 ocd_connect_flags2; + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. + * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags. + */ +typedef enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C= 0x00000004, +} cksum_type_t; + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +typedef enum { + OST_REPLY = 0, /* reply ? 
*/ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, /* not used since 2.4 */ + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LADVISE = 21, + OST_LAST_OPC /* must be < 33 to avoid MDS_GETATTR */ +} ost_cmd_t; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_PRJQUOTA = 0x00000080, /* the object's project is over + * quota */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */ + OBD_FL_SHORT_IO = 0x00400000, /* short io request */ + /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ + + /* Note that while these checksum values are currently separate bits, + * in 2.x we can actually allow all values from 1-31 if we wanted. */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C, +}; + +/* + * All LOV EA magics should have the same postfix, if some new version + * Lustre instroduces new LOV EA magic, then when down-grade to an old + * Lustre, even though the old version system does not recognizes such + * new magic, it still can distinguish the corrupted cases by checking + * the magic's postfix. + */ +#define LOV_MAGIC_MAGIC 0x0BD0 +#define LOV_MAGIC_MASK 0xFFFF + +#define LOV_MAGIC_V1 (0x0BD10000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_JOIN_V1 (0x0BD20000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_V3 (0x0BD30000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_MIGRATE (0x0BD40000 | LOV_MAGIC_MAGIC) +/* reserved for specifying OSTs */ +#define LOV_MAGIC_SPECIFIC (0x0BD50000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_COMP_V1 (0x0BD60000 | LOV_MAGIC_MAGIC) + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). 
at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. + * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + * + * those *_DEF magics are only used on server side internally, they + * won't be put on wire or disk. + */ +#define LOV_MAGIC_DEF 0x10000000 +#define LOV_MAGIC_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V1) +#define LOV_MAGIC_V3_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V3) +#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) + +#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) +#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) +#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +/* This is the default MDT reply size allocated, should the striping be bigger, + * it will be reallocated in mdt_fix_reply. + * 100 stripes is a bit less than 2.5k of data */ +#define DEF_REP_MD_SIZE (sizeof(struct lov_mds_md) + \ + 100 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." + +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_DEFAULT_LMV "trusted.dmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_BITMAP "trusted.lfsck_bitmap" +#define XATTR_NAME_DUMMY "trusted.dummy" + +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_ns" +#define XATTR_NAME_MAX_LEN 32 /* increase this, if there is longer name. 
*/ + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +static inline __u32 +lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) +{ + switch (lmm_magic) { + case LOV_MAGIC_V1: { + struct lov_mds_md_v1 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + case LOV_MAGIC_V3: { + struct lov_mds_md_v3 lmm; + + if (buf_size < sizeof(lmm)) + return 0; + + return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); + } + default: + return 0; + } +} + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ +/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. 
used until 1.6.5 */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +#define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ +#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ +#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +/* OBD_MD_FLRMTPERM (0x0000010000000000ULL) remote perm, obsolete */ +#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock; for xattr + * requests means the + * client holds the lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ +#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent + executed */ + +#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ +#define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ +#define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ + +#define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ + OBD_MD_FLGRPQUOTA | \ + OBD_MD_FLPRJQUOTA) + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLPROJID) + +#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) + +/* don't forget obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 
+#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. */ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ +#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server + * that the client is running low on + * space for unstable pages; asking + * it to sync quickly */ +#define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */ + +#define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \ + OBD_BRW_OVER_GRPQUOTA | \ + OBD_BRW_OVER_PRJQUOTA) + +#define OBD_BRW_LOCAL1 0x80000000UL /* + * osd-ldiskfs internal, + * page mapped to real block + */ + +#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1) + +#define OBD_OBJECT_EOF LUSTRE_EOF + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +/* NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in + * ioo_max_brw, NOT the maximum number of bits in PTLRPC_BULK_OPS_BITS. + * That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits. */ +#define IOOBJ_MAX_BRW_BITS 16 +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 rnb_offset; + __u32 rnb_len; + __u32 rnb_flags; +}; + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
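+ * Worked example (the error value is arbitrary): for rc = -5, blocks =
+ * OST_LVB_ERR_INIT + (-5) = 0xffbadbad7ffffffbULL; this still matches
+ * OST_LVB_ERR_MASK, so OST_LVB_IS_ERR() is true, and OST_LVB_GET_ERR()
+ * returns -5.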
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; +}; + +struct ost_lvb { + __u64 lvb_size; + __s64 lvb_mtime; + __s64 lvb_atime; + __s64 lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +/* + * lquota data structures + */ + +#ifndef QUOTABLOCK_BITS +# define QUOTABLOCK_BITS LUSTRE_QUOTABLOCK_BITS +#endif + +#ifndef QUOTABLOCK_SIZE +# define QUOTABLOCK_SIZE LUSTRE_QUOTABLOCK_SIZE +#endif + +#ifndef toqb +# define toqb lustre_stoqb +#endif + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ + __u64 qid_projid; /* project identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; +}; + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_PRJ = 0x02, /* maps to PRJQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +typedef enum { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +} quota_cmd_t; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +typedef enum { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GET_ROOT = 40, + MDS_STATFS = 41, + MDS_PIN = 42, /* obsolete, never used in a release */ + MDS_UNPIN = 43, /* obsolete, never used in a release */ + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, 
/* obsolete since 2.8.0 */ + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, /* not used since 2.4 */ + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, /* obsolete, never used in a release */ + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_LAST_OPC +} mds_cmd_t; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +typedef enum { + OUT_UPDATE = 1000, + OUT_UPDATE_LAST_OPC +} update_cmd_t; + +#define OUT_UPDATE_FIRST_OPC OUT_UPDATE + +/* + * Do not exceed 63 + */ + +typedef enum { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, + REINT_MIGRATE = 9, + REINT_MAX +} mds_reint_t, mdt_reint_t; + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */ +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 +#define DISP_OPEN_LEASE 0x04000000 +#define DISP_OPEN_STRIPE 0x08000000 +#define DISP_OPEN_DENY 0x10000000 + +/* INODE LOCK PARTS */ +#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also + * was used to protect permission (mode, + * owner, group etc) before 2.4. */ +#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ +#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ +#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ + +/* The PERM bit is added int 2.4, and it is used to protect permission(mode, + * owner, group, acl etc), so to separate the permission from LOOKUP lock. + * Because for remote directories(in DNE), these locks will be granted by + * different MDTs(different ldlm namespace). + * + * For local directory, MDT will always grant UPDATE_LOCK|PERM_LOCK together. + * For Remote directory, the master MDT, where the remote directory is, will + * grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where the name entry is, + * will grant LOOKUP_LOCK. */ +#define MDS_INODELOCK_PERM 0x000010 +#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */ + +#define MDS_INODELOCK_MAXSHIFT 5 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). 
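+ * As an illustrative sketch of the resulting layout (derived from the note
+ * above, not an additional rule): name[LUSTRE_RES_ID_SEQ_OFF] holds the FID
+ * sequence, name[LUSTRE_RES_ID_VER_OID_OFF] holds the OID together with the
+ * version, and the name[2]/name[3] slots stay free for the quota identifier
+ * FID.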
*/ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +enum { + /* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ + LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ + LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ + LUSTRE_APPEND_FL = 0x00000020, /* writes to file may only append */ + LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ + LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ + LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ + LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ + LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ + LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ + LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ + LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ + + /* These flags will not be identical to any EXT4_*_FL counterparts, + * and only reserved for lustre purpose. Note: these flags might + * be conflict with some of EXT4 flags, so + * 1. these conflict flags needs to be removed when the flag is + * wired by la_flags see osd_attr_get(). + * 2. If these flags needs to be stored into inode, they will be + * stored in LMA. see LMAI_XXXX */ + LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ + + LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, +}; + +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#endif + +#ifdef __KERNEL__ +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#endif + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0) | + ((flags & LUSTRE_PROJINHERIT_FL) ? FS_XFLAG_PROJINHERIT : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#endif + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0) | + ((iflags & FS_XFLAG_PROJINHERIT) ? 
LUSTRE_PROJINHERIT_FL : 0)); +} +#endif + +/* 64 possible states */ +enum md_transient_state { + MS_RESTORE = (1 << 0), /* restore is running */ +}; + +struct mdt_body { + struct lu_fid mbo_fid1; + struct lu_fid mbo_fid2; + struct lustre_handle mbo_handle; + __u64 mbo_valid; + __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ + __s64 mbo_mtime; + __s64 mbo_atime; + __s64 mbo_ctime; + __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ + __u64 mbo_ioepoch; + __u64 mbo_t_state; /* transient file state defined in + * enum md_transient_state + * was "ino" until 2.4.0 */ + __u32 mbo_fsuid; + __u32 mbo_fsgid; + __u32 mbo_capability; + __u32 mbo_mode; + __u32 mbo_uid; + __u32 mbo_gid; + __u32 mbo_flags; /* LUSTRE_*_FL file attributes */ + __u32 mbo_rdev; + __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 mbo_unused2; /* was "generation" until 2.4.0 */ + __u32 mbo_suppgid; + __u32 mbo_eadatasize; + __u32 mbo_aclsize; + __u32 mbo_max_mdsize; + __u32 mbo_unused3; /* was max_cookiesize until 2.8 */ + __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ + __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ + __u32 mbo_projid; + __u64 mbo_padding_6; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_7; + __u64 mbo_padding_8; + __u64 mbo_padding_9; + __u64 mbo_padding_10; +}; /* 216 */ + +struct mdt_ioepoch { + struct lustre_handle mio_handle; + __u64 mio_unused1; /* was ioepoch */ + __u32 mio_unused2; /* was flags */ + __u32 mio_padding; +}; + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, +}; + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __s64 sa_mtime; + __s64 sa_atime; + __s64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_projid; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. 
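+ *
+ * As an illustrative sketch (not an exhaustive list): a plain chmod would
+ * carry sa_valid = MDS_ATTR_MODE, while a truncate coming from the open
+ * path (O_TRUNC) would carry MDS_ATTR_SIZE | MDS_ATTR_FROM_OPEN.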
+ */ +#define MDS_ATTR_MODE 0x1ULL /* = 1 */ +#define MDS_ATTR_UID 0x2ULL /* = 2 */ +#define MDS_ATTR_GID 0x4ULL /* = 4 */ +#define MDS_ATTR_SIZE 0x8ULL /* = 8 */ +#define MDS_ATTR_ATIME 0x10ULL /* = 16 */ +#define MDS_ATTR_MTIME 0x20ULL /* = 32 */ +#define MDS_ATTR_CTIME 0x40ULL /* = 64 */ +#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */ +#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */ +#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but a change it */ +#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */ +#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */ +#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ +#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ +#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ +#define MDS_ATTR_PROJID 0x10000ULL /* = 65536 */ + +#ifndef FMODE_READ +#define FMODE_READ 00000001 +#define FMODE_WRITE 00000002 +#endif + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ +/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ +/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ + +#define MDS_OPEN_CREATED 00000010 +#define MDS_OPEN_CROSS 00000020 + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE) + +enum mds_op_bias { + MDS_CHECK_SPLIT = 1 << 0, + MDS_CROSS_REF = 1 << 1, + MDS_VTX_BYPASS = 1 << 2, + MDS_PERM_BYPASS = 1 << 3, +/* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ + MDS_QUOTA_IGNORE = 1 << 5, + /* Was MDS_CLOSE_CLEANUP (1 << 6), No more used */ + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, + MDS_HSM_RELEASE = 1 << 12, + MDS_RENAME_MIGRATE = 1 << 13, + MDS_CLOSE_LAYOUT_SWAP = 1 << 14, +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_old_handle; /* handle in case of open replay */ + __s64 cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __s64 lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __s64 ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + __u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __s64 rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 
rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __s64 sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structres and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. + */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + __s64 rr_mtime; + __s64 rr_atime; + __s64 rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +/* lmv structures */ +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default hash pattern */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +/* LMV layout EA, and it will be stored both in master and slave object */ +struct lmv_mds_md_v1 { + __u32 lmv_magic; + __u32 lmv_stripe_count; + __u32 lmv_master_mdt_index; /* On master object, it is master + * MDT index, on slave object, it + * is stripe index of the slave obj */ + __u32 lmv_hash_type; /* dir stripe policy, i.e. indicate + * which hash function to be used, + * Note: only lower 16 bits is being + * used for now. Higher 16 bits will + * be used to mark the object status, + * for example migrating or dead. 
*/ + __u32 lmv_layout_version; /* Used for directory restriping */ + __u32 lmv_padding1; + __u64 lmv_padding2; + __u64 lmv_padding3; + char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ +}; + +#define LMV_MAGIC_V1 0x0CD20CD0 /* normal stripe lmv magic */ +#define LMV_MAGIC LMV_MAGIC_V1 + +/* #define LMV_USER_MAGIC 0x0CD30CD0 */ +#define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ + +/* Right now only the lower part(0-16bits) of lmv_hash_type is being used, + * and the higher part will be the flag to indicate the status of object, + * for example the object is being migrated. And the hash function + * might be interpreted differently with different flags. */ +#define LMV_HASH_TYPE_MASK 0x0000ffff + +#define LMV_HASH_FLAG_MIGRATION 0x80000000 + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 53, 0) +/* Since lustre 2.8, this flag will not be needed, instead this DEAD + * and orphan flags will be stored in LMA (see LMAI_ORPHAN) + * Keep this flag just for LFSCK, because it still might meet such + * flag when it checks the old FS */ +#define LMV_HASH_FLAG_DEAD 0x40000000 +#endif +#define LMV_HASH_FLAG_BAD_TYPE 0x20000000 + +/* The striped directory has ever lost its master LMV EA, then LFSCK + * re-generated it. This flag is used to indicate such case. It is an + * on-disk flag. */ +#define LMV_HASH_FLAG_LOST_LMV 0x10000000 + +/** + * The FNV-1a hash algorithm is as follows: + * hash = FNV_offset_basis + * for each octet_of_data to be hashed + * hash = hash XOR octet_of_data + * hash = hash × FNV_prime + * return hash + * http://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function#FNV-1a_hash + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source + * FNV_prime is 2^40 + 2^8 + 0xb3 = 0x100000001b3ULL + **/ +#define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL +#define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, size_t size) +{ + __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; + const unsigned char *p = buf; + size_t i; + + for (i = 0; i < size; i++) { + hash ^= p[i]; + hash *= LUSTRE_FNV_1A_64_PRIME; + } + + return hash; +} + +union lmv_mds_md { + __u32 lmv_magic; + struct lmv_mds_md_v1 lmv_md_v1; + struct lmv_user_md lmv_user_md; +}; + +static inline int lmv_mds_md_size(int stripe_count, unsigned int lmm_magic) +{ + switch (lmm_magic) { + case LMV_MAGIC_V1:{ + struct lmv_mds_md_v1 *lmm1; + + return sizeof(*lmm1) + stripe_count * + sizeof(lmm1->lmv_stripe_fids[0]); + } + default: + return -EINVAL; + } +} + +static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_stripe_count); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_stripe_count); + default: + return -EINVAL; + } +} + +static inline int lmv_mds_md_hash_type_get(const union lmv_mds_md *lmm) +{ + switch (__le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return __le32_to_cpu(lmm->lmv_md_v1.lmv_hash_type); + case LMV_USER_MAGIC: + return __le32_to_cpu(lmm->lmv_user_md.lum_hash_type); + default: + return -EINVAL; + } +} + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_READ = 901, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 
1, + FLD_LOOKUP = 2, +}; + +/* LFSCK opcodes */ +typedef enum { + LFSCK_NOTIFY = 1101, + LFSCK_QUERY = 1102, + LFSCK_LAST_OPC, + LFSCK_FIRST_OPC = LFSCK_NOTIFY +} lfsck_cmd_t; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C +#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ +#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __s32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __s64 ld_default_stripe_offset; /* starting OST index */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +typedef enum { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +} ldlm_cmd_t; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +#define DLDLMRES "[%#llx:%#llx:%#llx].%#llx" +#define PLDLMRES(res) (unsigned long long)(res)->lr_name.name[0], \ + (unsigned long long)(res)->lr_name.name[1], \ + (unsigned long long)(res)->lr_name.name[2], \ + (unsigned long long)(res)->lr_name.name[3] + +/* lock types */ +typedef enum ldlm_mode { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +} ldlm_mode_t; + +#define LCK_MODE_NUM 8 + +typedef enum ldlm_type { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +} ldlm_type_t; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +struct ldlm_inodebits { + __u64 bits; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. 
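+ *
+ * Concretely, with the structures defined above, the three leading __u64
+ * fields line up:
+ *   ldlm_extent.start <-> ldlm_flock_wire.lfw_start
+ *   ldlm_extent.end   <-> ldlm_flock_wire.lfw_end
+ *   ldlm_extent.gid   <-> ldlm_flock_wire.lfw_owner
+ * so a single swab of three __u64s covers either member of the union.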
*/ + +typedef union ldlm_wire_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_wire_policy_data_t; + +struct barrier_lvb { + __u32 lvb_status; + __u32 lvb_index; + __u64 lvb_padding; +}; + +struct ldlm_gl_barrier_desc { + __u32 lgbd_status; + __u32 lgbd_timeout; + __u64 lgbd_padding; +}; + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; + struct ldlm_gl_barrier_desc barrier_desc; +}; + +enum ldlm_intent_flags { + IT_OPEN = 0x00000001, + IT_CREAT = 0x00000002, + IT_OPEN_CREAT = 0x00000003, + IT_READDIR = 0x00000004, + IT_GETATTR = 0x00000008, + IT_LOOKUP = 0x00000010, + IT_UNLINK = 0x00000020, + IT_TRUNC = 0x00000040, + IT_GETXATTR = 0x00000080, + IT_EXEC = 0x00000100, + IT_PIN = 0x00000200, + IT_LAYOUT = 0x00000400, + IT_QUOTA_DQACQ = 0x00000800, + IT_QUOTA_CONN = 0x00001000, + IT_SETXATTR = 0x00002000, +}; + +struct ldlm_intent { + __u64 opc; +}; + +struct ldlm_resource_desc { + enum ldlm_type lr_type; + __u32 lr_pad; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + enum ldlm_mode l_req_mode; + enum ldlm_mode l_granted_mode; + union ldlm_wire_policy_data l_policy_data; +}; + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; + __u32 lock_count; + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_CONFIG_READ, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; +#endif + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ + char mti_params[MTI_PARAM_MAXLEN]; +}; + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). for ipv6. */ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. 
*/ + } u; +}; + +enum { + CONFIG_T_CONFIG = 0, + CONFIG_T_SPTLRPC = 1, + CONFIG_T_RECOVER = 2, + CONFIG_T_PARAMS = 3, + CONFIG_T_NODEMAP = 4, + CONFIG_T_BARRIER = 5, + CONFIG_T_MAX +}; + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ + __u8 mcb_nm_cur_pass; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + union { + __u64 mcr_size; /* size of the log */ + __u64 mcr_nm_cur_pass; /* current nodemap config pass */ + }; +}; + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + __s64 cm_createtime; /*when this record was first created */ + __s64 cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +/* + * Opcodes for multiple servers. + */ + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, /* not used since 2.4 */ + OBD_IDX_READ, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING + +/** + * llog contexts indices. + * + * There is compatibility problem with indexes below, they are not + * continuous and must keep their numbers for compatibility needs. + * See LU-5218 for details. + */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT = 1, + LLOG_MDS_OST_ORIG_CTXT = 2, + LLOG_MDS_OST_REPL_CTXT = 3, /* kept just to avoid re-assignment */ + LLOG_SIZE_ORIG_CTXT = 4, + LLOG_SIZE_REPL_CTXT = 5, + LLOG_TEST_ORIG_CTXT = 8, + LLOG_TEST_REPL_CTXT = 9, /* kept just to avoid re-assignment */ + LLOG_CHANGELOG_ORIG_CTXT = 12, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT = 13, /**< changelog access on clients */ + /* for multiple changelog consumers */ + LLOG_CHANGELOG_USER_ORIG_CTXT = 14, + LLOG_AGENT_ORIG_CTXT = 15, /**< agent requests generation on cdt */ + LLOG_UPDATELOG_ORIG_CTXT = 16, /* update log */ + LLOG_UPDATELOG_REPL_CTXT = 17, /* update log */ + LLOG_MAX_CTXTS +}; + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) 
+ */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +typedef enum { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, + UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +} llog_op_type; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. + * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +}; + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +}; + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + __u64 lur_oid; + __u32 lur_oseq; + __u32 lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + __u32 lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +/* Extended to support project quota */ +struct llog_setattr64_rec_v2 { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_valid; + __u32 lsr_projid; + __u32 lsr_padding1; + __u64 lsr_padding2; + __u64 lsr_padding3; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK (1 << CL_MARK) +/** bits covering all \a changelog_rec_type's */ +#define CHANGELOG_ALLMASK 0XFFFFFFFF +/** default \a changelog_rec_type mask. 
Allow all of them, except + * CL_ATIME since it can really be time consuming, and not necessary + * under normal use. */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME)) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; /**< Variable length field */ + struct llog_rec_tail cr_do_not_use; /**< for_sizeof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + __u32 cur_padding; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +enum agent_req_status { + ARS_WAITING, + ARS_STARTED, + ARS_FAILED, + ARS_CANCELED, + ARS_SUCCEED, +}; + +static inline const char *agent_req_status2name(enum agent_req_status ars) +{ + switch (ars) { + case ARS_WAITING: + return "WAITING"; + case ARS_STARTED: + return "STARTED"; + case ARS_FAILED: + return "FAILED"; + case ARS_CANCELED: + return "CANCELED"; + case ARS_SUCCEED: + return "SUCCEED"; + default: + return "UNKNOWN"; + } +} + +struct llog_agent_req_rec { + struct llog_rec_hdr arr_hdr; /**< record header */ + __u32 arr_status; /**< status of the request */ + /* must match enum + * agent_req_status */ + __u32 arr_archive_id; /**< backend archive number */ + __u64 arr_flags; /**< req flags */ + __u64 arr_compound_id; /**< compound cookie */ + __u64 arr_req_create; /**< req. creation time */ + __u64 arr_req_change; /**< req. status change time */ + struct hsm_action_item arr_hai; /**< req. to the agent */ + struct llog_rec_tail arr_tail; /**< record tail for_sizezof_only */ +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, + LLOG_F_EXT_JOBID = 0x8, + LLOG_F_IS_FIXSIZE = 0x10, + + /* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from + * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here, + * because the catlog record is usually fixed size, but its plain + * log record can be variable */ + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID, +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_MIN_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) /* sizeof (llog_log_hdr) + sizeof(llh_tail) + * - sizeof(llh_bitmap) */ +#define LLOG_BITMAP_BYTES (LLOG_MIN_CHUNK_SIZE - LLOG_HEADER_SIZE) +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + __s64 llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + /* for a catalog the first/oldest and still in-use plain slot is just + * next to it. It will serve as the upper limit after Catalog has + * wrapped around */ + __u32 llh_cat_idx; + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32)-23]; + /* These fields must always be at the end of the llog_log_hdr. 
+ * Note: llh_bitmap size is variable because llog chunk size could be
+ * bigger than LLOG_MIN_CHUNK_SIZE, i.e. sizeof(llog_log_hdr) > 8192
+ * bytes, and the real size is stored in llh_hdr.lrh_len, which means
+ * llh_tail should only be referred to via LLOG_HDR_TAIL().
+ * But this structure is also used by the client/server llog interface
+ * (see llog_client.c), so it is kept in its original form to avoid
+ * compatibility issues. */
+	__u32			llh_bitmap[LLOG_BITMAP_BYTES / sizeof(__u32)];
+	struct llog_rec_tail	llh_tail;
+} __attribute__((packed));
+#undef LLOG_HEADER_SIZE
+#undef LLOG_BITMAP_BYTES
+
+#define LLOG_HDR_BITMAP_SIZE(llh)	(__u32)((llh->llh_hdr.lrh_len -	\
+					 llh->llh_bitmap_offset -	\
+					 sizeof(llh->llh_tail)) * 8)
+#define LLOG_HDR_BITMAP(llh)	(__u32 *)((char *)(llh) +		\
+					  (llh)->llh_bitmap_offset)
+#define LLOG_HDR_TAIL(llh)	((struct llog_rec_tail *)((char *)llh +	\
+					 llh->llh_hdr.lrh_len -		\
+					 sizeof(llh->llh_tail)))
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+	struct llog_logid	lgc_lgl;
+	__u32			lgc_subsys;
+	__u32			lgc_index;
+	__u32			lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+	LLOG_ORIGIN_HANDLE_CREATE	= 501,
+	LLOG_ORIGIN_HANDLE_NEXT_BLOCK	= 502,
+	LLOG_ORIGIN_HANDLE_READ_HEADER	= 503,
+	LLOG_ORIGIN_HANDLE_WRITE_REC	= 504,
+	LLOG_ORIGIN_HANDLE_CLOSE	= 505,
+	LLOG_ORIGIN_CONNECT		= 506,
+	LLOG_CATINFO			= 507,	/* deprecated */
+	LLOG_ORIGIN_HANDLE_PREV_BLOCK	= 508,
+	LLOG_ORIGIN_HANDLE_DESTROY	= 509,	/* for destroy llog object*/
+	LLOG_LAST_OPC,
+	LLOG_FIRST_OPC			= LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+	struct llog_logid	lgd_logid;
+	__u32			lgd_ctxt_idx;
+	__u32			lgd_llh_flags;
+	__u32			lgd_index;
+	__u32			lgd_saved_index;
+	__u32			lgd_len;
+	__u64			lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+	struct llog_gen		lgdc_gen;
+	struct llog_logid	lgdc_logid;
+	__u32			lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+	__u64			o_valid;	/* hot fields in this obdo */
+	struct ost_id		o_oi;
+	__u64			o_parent_seq;
+	__u64			o_size;		/* o_size-o_blocks == ost_lvb */
+	__s64			o_mtime;
+	__s64			o_atime;
+	__s64			o_ctime;
+	__u64			o_blocks;	/* brw: cli sent cached bytes */
+	__u64			o_grant;
+
+	/* 32-bit fields start here: keep an even number of them via padding */
+	__u32			o_blksize;	/* optimal IO blocksize */
+	__u32			o_mode;		/* brw: cli sent cache remain */
+	__u32			o_uid;
+	__u32			o_gid;
+	__u32			o_flags;
+	__u32			o_nlink;	/* brw: checksum */
+	__u32			o_parent_oid;
+	__u32			o_misc;		/* brw: o_dropped */
+
+	__u64			o_ioepoch;	/* epoch in ost writes */
+	__u32			o_stripe_idx;	/* holds stripe idx */
+	__u32			o_parent_ver;
+	struct lustre_handle	o_handle;	/* brw: lock handle to prolong
+						 * locks */
+	/* Originally this field was a llog_cookie used for destroy, carrying
+	 * the unlink cookie from the MDS; it has been obsolete since 2.8. It
+	 * is now reused by the client to transfer layout and PFL information
+	 * in IO and setattr RPCs. Since llog_cookie is no longer used on the
+	 * wire, it was removed from the obdo, so this area can be enlarged
+	 * freely in the future without affecting related RPCs.
+	 *
+	 * sizeof(ost_layout) + sizeof(__u32) == sizeof(llog_cookie). */
+	struct ost_layout	o_layout;
+	__u32			o_padding_3;
+	__u32			o_uid_h;
+	__u32			o_gid_h;
+
+	__u64			o_data_version;	/* getattr: sum of iversion for
+						 * each stripe.
+ * brw: grant space consumed on + * the client for the write */ + __u32 o_projid; + __u32 o_padding_4; /* also fix + * lustre_swab_obdo() */ + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version + +struct lfsck_request { + __u32 lr_event; + __u32 lr_index; + __u32 lr_flags; + __u32 lr_valid; + union { + __u32 lr_speed; + __u32 lr_status; + }; + __u16 lr_version; + __u16 lr_active; + __u16 lr_param; + __u16 lr_async_windows; + __u32 lr_flags2; + struct lu_fid lr_fid; + struct lu_fid lr_fid2; + __u32 lr_comp_id; + __u32 lr_padding_0; + __u64 lr_padding_1; + __u64 lr_padding_2; + __u64 lr_padding_3; +}; + +struct lfsck_reply { + __u32 lr_status; + __u32 lr_padding_1; + __u64 lr_repaired; +}; + +enum lfsck_events { + LE_LASTID_REBUILDING = 1, + LE_LASTID_REBUILT = 2, + LE_PHASE1_DONE = 3, + LE_PHASE2_DONE = 4, + LE_START = 5, + LE_STOP = 6, + LE_QUERY = 7, + /* LE_FID_ACCESSED = 8, moved to lfsck_events_local */ + LE_PEER_EXIT = 9, + LE_CONDITIONAL_DESTROY = 10, + LE_PAIRS_VERIFY = 11, + LE_SET_LMV_MASTER = 15, + LE_SET_LMV_SLAVE = 16, +}; + +enum lfsck_event_flags { + LEF_TO_OST = 0x00000001, + LEF_FROM_OST = 0x00000002, + LEF_SET_LMV_HASH = 0x00000004, + LEF_SET_LMV_ALL = 0x00000008, + LEF_RECHECK_NAME_HASH = 0x00000010, + LEF_QUERY_ALL = 0x00000020, +}; + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char lfik_name[8]; + struct obdo lfik_oa; + struct fiemap lfik_fiemap; +}; + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. 
+ * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ + II_FL_NOKEY = 1 << 4, /* client doesn't care about key */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. */ + char lip_entries[0]; +}; + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +typedef enum { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +} sec_cmd_t; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ +} __attribute__((packed)); + +/** lustre_capa::lc_opc */ +enum { + CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ + CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ +}; + +#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | 
CAPA_OPC_OSS_WRITE) +#define CAPA_OPC_MDS_ONLY \ + (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ + CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) +#define CAPA_OPC_OSS_ONLY \ + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) +#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY +#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) + +/* lustre_capa::lc_hmac_alg */ +enum { + CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */ + CAPA_HMAC_ALG_MAX, +}; + +#define CAPA_FL_MASK 0x00ffffff +#define CAPA_HMAC_ALG_MASK 0xff000000 + +struct lustre_capa_key { + __u64 lk_seq; /**< mds# */ + __u32 lk_keyid; /**< key# */ + __u32 lk_padding; + __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ +} __attribute__((packed)); + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + __u32 leh_overflow_time; + __u32 leh_padding; +}; + +/** Hardlink data is name and parent fid. + * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +}__attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + union { + char gf_path[0]; + struct lu_fid gf_root_fid[0]; + } gf_u; +} __attribute__((packed)); + +/** path2parent request/reply structures */ +struct getparent { + struct lu_fid gp_fid; /**< parent FID */ + __u32 gp_linkno; /**< hardlink number */ + __u32 gp_name_size; /**< size of the name field */ + char gp_name[0]; /**< zero-terminated link name */ +} __attribute__((packed)); + +enum { + LAYOUT_INTENT_ACCESS = 0, /** generic access */ + LAYOUT_INTENT_READ = 1, /** not used */ + LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */ + LAYOUT_INTENT_GLIMPSE = 3, /** not used */ + LAYOUT_INTENT_TRUNC = 4, /** truncate file, for comp layout */ + LAYOUT_INTENT_RELEASE = 5, /** reserved for HSM release */ + LAYOUT_INTENT_RESTORE = 6, /** reserved for HSM restore */ +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + __u64 li_start; + __u64 li_end; +} __attribute__((packed)); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. + */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + lustre_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +/** + * OUT_UPDATE RPC Format + * + * During the cross-ref operation, the Master MDT, which the client send the + * request to, will disassembly the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * An UPDATE_OBJ RPC does a list of updates. Each update belongs to an + * operation and does a type of modification to an object. + * + * Request Format + * + * update_buf + * update (1st) + * update (2nd) + * ... + * update (ub_count-th) + * + * ub_count must be less than or equal to UPDATE_PER_RPC_MAX. 
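+ *
+ * As an illustrative sketch (an assumed decomposition, not normative), a
+ * cross-MDT directory creation could be disassembled into updates along
+ * the lines of:
+ *
+ * update_buf
+ *	update[0]: OUT_CREATE		(create an object)
+ *	update[1]: OUT_INDEX_INSERT	(insert a name entry)
+ *	update[2]: OUT_REF_ADD		(adjust a link count)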
+ * + * Reply Format + * + * update_reply + * rc [+ buffers] (1st) + * rc [+ buffers] (2st) + * ... + * rc [+ buffers] (nr_count-th) + * + * ur_count must be less than or equal to UPDATE_PER_RPC_MAX and should usually + * be equal to ub_count. + */ + +/** + * Type of each update, if adding/deleting update, please also update + * update_opcode in lustre/target/out_lib.c. + */ +enum update_type { + OUT_START = 0, + OUT_CREATE = 1, + OUT_DESTROY = 2, + OUT_REF_ADD = 3, + OUT_REF_DEL = 4, + OUT_ATTR_SET = 5, + OUT_ATTR_GET = 6, + OUT_XATTR_SET = 7, + OUT_XATTR_GET = 8, + OUT_INDEX_LOOKUP = 9, + OUT_INDEX_INSERT = 10, + OUT_INDEX_DELETE = 11, + OUT_WRITE = 12, + OUT_XATTR_DEL = 13, + OUT_PUNCH = 14, + OUT_READ = 15, + OUT_NOOP = 16, + OUT_LAST +}; + +enum update_flag { + UPDATE_FL_OST = 0x00000001, /* op from OST (not MDT) */ + UPDATE_FL_SYNC = 0x00000002, /* commit before replying */ + UPDATE_FL_COMMITTED = 0x00000004, /* op committed globally */ + UPDATE_FL_NOLOG = 0x00000008 /* for idempotent updates */ +}; + +struct object_update_param { + __u16 oup_len; /* length of this parameter */ + __u16 oup_padding; + __u32 oup_padding2; + char oup_buf[0]; +}; + +/* object update */ +struct object_update { + __u16 ou_type; /* enum update_type */ + __u16 ou_params_count; /* update parameters count */ + __u32 ou_result_size; /* how many bytes can return */ + __u32 ou_flags; /* enum update_flag */ + __u32 ou_padding1; /* padding 1 */ + __u64 ou_batchid; /* op transno on master */ + struct lu_fid ou_fid; /* object to be updated */ + struct object_update_param ou_params[0]; /* update params */ +}; + +#define UPDATE_REQUEST_MAGIC_V1 0xBDDE0001 +#define UPDATE_REQUEST_MAGIC_V2 0xBDDE0002 +#define UPDATE_REQUEST_MAGIC UPDATE_REQUEST_MAGIC_V2 +/* Hold object_updates sending to the remote OUT in single RPC */ +struct object_update_request { + __u32 ourq_magic; + __u16 ourq_count; /* number of ourq_updates[] */ + __u16 ourq_padding; + struct object_update ourq_updates[0]; +}; + +#define OUT_UPDATE_HEADER_MAGIC 0xBDDF0001 +#define OUT_UPDATE_MAX_INLINE_SIZE 4096 +/* Header for updates request between MDTs */ +struct out_update_header { + __u32 ouh_magic; + __u32 ouh_count; + __u32 ouh_inline_length; + __u32 ouh_reply_size; + __u32 ouh_inline_data[0]; +}; + +struct out_update_buffer { + __u32 oub_size; + __u32 oub_padding; +}; + +/* the result of object update */ +struct object_update_result { + __u32 our_rc; + __u16 our_datalen; + __u16 our_padding; + __u32 our_data[0]; +}; + +#define UPDATE_REPLY_MAGIC_V1 0x00BD0001 +#define UPDATE_REPLY_MAGIC_V2 0x00BD0002 +#define UPDATE_REPLY_MAGIC UPDATE_REPLY_MAGIC_V2 +/* Hold object_update_results being replied from the remote OUT. 
*/ +struct object_update_reply { + __u32 ourp_magic; + __u16 ourp_count; + __u16 ourp_padding; + __u16 ourp_lens[0]; +}; + +/* read update result */ +struct out_read_reply { + __u32 orr_size; + __u32 orr_padding; + __u64 orr_offset; + char orr_data[0]; +}; + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __attribute__((packed)); + +struct close_data { + struct lustre_handle cd_handle; + struct lu_fid cd_fid; + __u64 cd_data_version; + __u64 cd_reserved[8]; +}; + +/* Update llog format */ +struct update_op { + struct lu_fid uop_fid; + __u16 uop_type; + __u16 uop_param_count; + __u16 uop_params_off[0]; +}; + +struct update_ops { + struct update_op uops_op[0]; +}; + +struct update_params { + struct object_update_param up_params[0]; +}; + +enum update_records_flag { + UPDATE_RECORD_CONTINUE = 1 >> 0, +}; +/* + * This is the update record format used to store the updates in + * disk. All updates of the operation will be stored in ur_ops. + * All of parameters for updates of the operation will be stored + * in ur_params. + * To save the space of the record, parameters in ur_ops will only + * remember their offset in ur_params, so to avoid storing duplicate + * parameters in ur_params, which can help us save a lot space for + * operation like creating striped directory. + */ +struct update_records { + __u64 ur_master_transno; + __u64 ur_batchid; + __u32 ur_flags; + /* If the operation includes multiple updates, then ur_index + * means the index of the update inside the whole updates. */ + __u32 ur_index; + __u32 ur_update_count; + __u32 ur_param_count; + struct update_ops ur_ops; + /* Note ur_ops has a variable size, so comment out + * the following ur_params, in case some use it directly + * update_records->ur_params + * + * struct update_params ur_params; + */ +}; + +struct llog_update_record { + struct llog_rec_hdr lur_hdr; + struct update_records lur_update_rec; + /* Note ur_update_rec has a variable size, so comment out + * the following ur_tail, in case someone use it directly + * + * struct llog_rec_tail lur_tail; + */ +}; + +/* nodemap records, uses 32 byte record length */ +#define LUSTRE_NODEMAP_NAME_LENGTH 16 +struct nodemap_cluster_rec { + char ncr_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + __u8 ncr_flags; + __u16 ncr_padding1; + __u32 ncr_padding2; + __u32 ncr_squash_uid; + __u32 ncr_squash_gid; +}; + +/* lnet_nid_t is 8 bytes */ +struct nodemap_range_rec { + lnet_nid_t nrr_start_nid; + lnet_nid_t nrr_end_nid; + __u64 nrr_padding1; + __u64 nrr_padding2; +}; + +struct nodemap_id_rec { + __u32 nir_id_fs; + __u32 nir_padding1; + __u64 nir_padding2; + __u64 nir_padding3; + __u64 nir_padding4; +}; + +struct nodemap_global_rec { + __u8 ngr_is_active; + __u8 ngr_padding1; + __u16 ngr_padding2; + __u32 ngr_padding3; + __u64 ngr_padding4; + __u64 ngr_padding5; + __u64 ngr_padding6; +}; + +union nodemap_rec { + struct nodemap_cluster_rec ncr; + struct nodemap_range_rec nrr; + struct nodemap_id_rec nir; + struct nodemap_global_rec ngr; +}; + +/* This is the lu_ladvise struct which goes out on the wire. + * Corresponds to the userspace arg llapi_lu_ladvise. 
+ * value[1-4] are unspecified fields, used differently by different advices */ +struct lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +/* This is the ladvise_hdr which goes on the wire, corresponds to the userspace + * arg llapi_ladvise_hdr. + * value[1-3] are unspecified fields, used differently by different advices */ +struct ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#endif +/** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..a02f65fa08aef --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,236 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H +# include + +/** + * state machine: + * + * LS_INIT + * | + * (lfsck|start) + * | + * v + * LS_SCANNING_PHASE1 + * | ^ + * | : + * | (lfsck:restart) + * | : + * v : + * ----------------------------------------------------------------- + * | |^ |^ |^ |^ |^ + * | |: |: |: |: |: + * v v: v: v: v: v: + * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL + * (CO_) (CO_) (CO_) + * | ^ ^: ^: ^: ^: ^: + * | : |: |: |: |: |: + * | (lfsck:restart) |: |: |: |: |: + * v : |v |v |v |v |v + * ----------------------------------------------------------------- + * | + * v + * LS_COMPLETED + */ +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. The checked items during the phase1 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. The checked items during the phase2 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. 
*/ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, + + /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ + LS_PARTIAL = 8, + + /* The LFSCK is failed because its controller is failed. */ + LS_CO_FAILED = 9, + + /* The LFSCK is stopped because its controller is stopped. */ + LS_CO_STOPPED = 10, + + /* The LFSCK is paused because its controller is paused. */ + LS_CO_PAUSED = 11, + + LS_MAX +}; + +static inline const char *lfsck_status2name(int status) +{ + static const char * const lfsck_status_names[] = { + [LS_INIT] = "init", + [LS_SCANNING_PHASE1] = "scanning-phase1", + [LS_SCANNING_PHASE2] = "scanning-phase2", + [LS_COMPLETED] = "completed", + [LS_FAILED] = "failed", + [LS_STOPPED] = "stopped", + [LS_PAUSED] = "paused", + [LS_CRASHED] = "crashed", + [LS_PARTIAL] = "partial", + [LS_CO_FAILED] = "co-failed", + [LS_CO_STOPPED] = "co-stopped", + [LS_CO_PAUSED] = "co-paused" + }; + + if (status < 0 || status >= LS_MAX) + return "unknown"; + + return lfsck_status_names[status]; +} + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, + + /* LFSCK runs on all targets. */ + LPF_ALL_TGT = 0x0008, + + /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ + LPF_BROADCAST = 0x0010, + + /* Handle orphan OST-objects. */ + LPF_OST_ORPHAN = 0x0020, + + /* Create OST-object for dangling LOV EA. */ + LPF_CREATE_OSTOBJ = 0x0040, + + /* Create MDT-object for dangling name entry. */ + LPF_CREATE_MDTOBJ = 0x0080, + + /* Do not return until the LFSCK not running. */ + LPF_WAIT = 0x0100, + + /* Delay to create OST-object for dangling LOV EA. */ + LPF_DELAY_CREATE_OSTOBJ = 0x0200, +}; + +enum lfsck_type { + /* For MDT and OST internal OSD consistency check/repair. */ + LFSCK_TYPE_SCRUB = 0x0000, + + /* For MDT-OST (layout, object) consistency check/repair. */ + LFSCK_TYPE_LAYOUT = 0x0001, + + /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ + LFSCK_TYPE_NAMESPACE = 0x0004, + LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | + LFSCK_TYPE_NAMESPACE), + LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, + LFSCK_TYPES_ALL = ((__u16)(~0)) +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT +#define LFSCK_ASYNC_WIN_DEFAULT 1024 +#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) +#define LFSCK_TYPE_BITS 16 + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, + LSV_ASYNC_WINDOWS = 0x00000008, + LSV_CREATE_OSTOBJ = 0x00000010, + LSV_CREATE_MDTOBJ = 0x00000020, + LSV_DELAY_CREATE_OSTOBJ = 0x00000040, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. 
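+	 *
+	 * Illustrative example (an assumption, not derived from this
+	 * header's callers): a dry-run namespace check across all targets
+	 * could set ls_flags = LPF_DRYRUN | LPF_ALL_TGT, with the matching
+	 * LSV_DRYRUN bit set in ls_valid.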
*/ + __u16 ls_flags; + + /* The windows size for async requests pipeline. */ + __u16 ls_async_windows; +}; + +struct lfsck_stop { + __u32 ls_status; + __u16 ls_flags; + __u16 ls_padding_1; /* For 64-bits aligned. */ + __u64 ls_padding_2; +}; + +struct lfsck_query { + __u16 lu_types; + __u16 lu_flags; + __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u64 lu_repaired[LFSCK_TYPE_BITS]; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h new file mode 100644 index 0000000000000..c014ed714919e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -0,0 +1,1625 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include + +#ifdef __KERNEL__ +# include +# include +# include /* snprintf() */ +# include +#else /* !__KERNEL__ */ +# define NEED_QUOTA_DEFS +# include +# include +# include /* snprintf() */ +# include +# include +# include +#endif /* __KERNEL__ */ +#include + +/* + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. 
+ */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#define PRJQUOTA 2 + +#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ + defined(__craynv) || defined(__mips64__) || defined(__powerpc64__) || \ + defined(__aarch64__) +typedef struct stat lstat_t; +# define lstat_f lstat +# define fstat_f fstat +# define fstatat_f fstatat +# define HAVE_LOV_USER_MDS_DATA +#elif defined(__USE_LARGEFILE64) || defined(__KERNEL__) +typedef struct stat64 lstat_t; +# define lstat_f lstat64 +# define fstat_f fstat64 +# define fstatat_f fstatat64 +# define HAVE_LOV_USER_MDS_DATA +#endif + +#define LUSTRE_EOF 0xffffffffffffffffULL + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#ifndef FSFILT_IOC_GETFLAGS +#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) +#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) +#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) +#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) +#endif + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ + OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ +}; + +struct obd_statfs { + __u64 os_type; + __u64 os_blocks; + __u64 os_bfree; + __u64 os_bavail; + __u64 os_files; + __u64 os_ffree; + __u8 os_fsid[40]; + __u32 os_bsize; + __u32 os_namelen; + __u64 os_maxbytes; + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_spare2; + __u32 os_spare3; + __u32 os_spare4; + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +}; + +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return fid->f_seq == 0 && fid->f_oid == 0; +} + +/* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. 
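+ * Illustrative note (editor-added, not from the original header): user-space
+ * code reading an OST object's PFID EA can therefore recover the stripe index
+ * simply as
+ *   __u32 stripe = ff->ff_parent.f_ver;  /* i.e. f_stripe_idx below */
+ * where 'ff' is a hypothetical struct filter_fid pointer.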
*/ +#define f_stripe_idx f_ver + +struct ost_layout { + __u32 ol_stripe_size; + __u32 ol_stripe_count; + __u64 ol_comp_start; + __u64 ol_comp_end; + __u32 ol_comp_id; +} __attribute__((packed)); + +/* keep this one for compatibility */ +struct filter_fid_old { + struct lu_fid ff_parent; + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid { + struct lu_fid ff_parent; + struct ost_layout ff_layout; +} __attribute__((packed)); + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +typedef struct lu_fid lustre_fid; + +enum lma_compat { + LMAC_HSM = 0x00000001, +/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ + LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ + LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. + */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ + LMAI_STRIPED = 0x00000008, /* striped directory inode */ + LMAI_ORPHAN = 0x00000010, /* inode is orphan */ + LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ + LMAI_STRIPED | LMAI_ORPHAN) +}; + + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +struct lustre_ost_attrs { + /* Use lustre_mdt_attrs directly for now, need a common header + * structure if want to change lustre_mdt_attrs in future. */ + struct lustre_mdt_attrs loa_lma; + + /* Below five elements are for OST-object's PFID EA, the + * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) + * and the stripe_index (low 16 bits), the size should not exceed + * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag + * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size + * are valid; if the flag LMAC_COMP_INFO is set, then the next three + * loa_comp_* elements are valid. */ + struct lu_fid loa_parent_fid; + __u32 loa_stripe_size; + __u32 loa_comp_id; + __u64 loa_comp_start; + __u64 loa_comp_end; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +/** + * OST object IDentifier. 
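+ *
+ * Illustrative note (editor-added, not from the original header): the union
+ * below lets the same identifier be read either as the legacy (oi_id, oi_seq)
+ * pair or as a struct lu_fid; the DOSTID/POSTID print helpers that follow
+ * format it through ostid_seq()/ostid_id().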
+ */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +}; + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) + +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_SET_LEASE _IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define 
LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR +#define LL_PROJINHERIT_FL 0x20000000 + + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ +enum ll_lease_type { + LL_LEASE_RDLCK = 0x1, + LL_LEASE_WRLCK = 0x2, + LL_LEASE_UNLCK = 0x4, +}; + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. 
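+ * Illustrative usage sketch (editor-added, not from the original header):
+ * the usual user-space pattern is
+ *   fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
+ *   ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
+ * where 'lum' is a hypothetical struct lov_user_md, so that the striping is
+ * chosen before any OST objects are allocated.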
*/ +#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ +#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) +#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ + O_LOV_DELAY_CREATE_MASK) + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 +/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ +#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ +#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 + +#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ +#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic */ + +#define LOV_PATTERN_NONE 0x000 +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 +#define LOV_PATTERN_CMOBD 0x200 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ +#define LOV_PATTERN_DEFAULT 0xffffffff + +static inline bool lov_pattern_supported(__u32 pattern) +{ + return pattern == LOV_PATTERN_RAID0 || + pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED); +} + +#define LOV_MAXPOOLNAME 15 +#define LOV_POOLNAMEF "%.15s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. + * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define XATTR_LUSTRE_PREFIX "lustre." 
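+/*
+ * Illustrative sketch (editor-added, not from the original header): the
+ * budget quoted above for LOV_MAX_STRIPE_COUNT works out to
+ * (12 * 4096 - 256) / 24 = 2037 stripes, which the constant rounds down to
+ * 2000. A build-time check of that bound could look like the following; it
+ * is left disabled because it is only an illustration.
+ */
+#if 0
+_Static_assert(LOV_MAX_STRIPE_COUNT <= (12 * 4096 - 256) / 24,
+	       "LOV_MAX_STRIPE_COUNT exceeds the reply buffer budget");
+#endif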
+#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + +struct lu_extent { + __u64 e_start; + __u64 e_end; +}; + +#define DEXT "[ %#llx , %#llx )" +#define PEXT(ext) (ext)->e_start, (ext)->e_end + +static inline bool lu_extent_is_overlapped(struct lu_extent *e1, + struct lu_extent *e2) +{ + return e1->e_start < e2->e_end && e2->e_start < e1->e_end; +} + +enum lov_comp_md_entry_flags { + LCME_FL_PRIMARY = 0x00000001, /* Not used */ + LCME_FL_STALE = 0x00000002, /* Not used */ + LCME_FL_OFFLINE = 0x00000004, /* Not used */ + LCME_FL_PREFERRED = 0x00000008, /* Not used */ + LCME_FL_INIT = 0x00000010, /* instantiated */ + LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, + won't be stored on disk */ +}; + +#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT) + +/* lcme_id can be specified as certain flags, and the the first + * bit of lcme_id is used to indicate that the ID is representing + * certain LCME_FL_* but not a real ID. Which implies we can have + * at most 31 flags (see LCME_FL_XXX). 
*/ +enum lcme_id { + LCME_ID_INVAL = 0x0, + LCME_ID_MAX = 0x7FFFFFFF, + LCME_ID_ALL = 0xFFFFFFFF, + LCME_ID_NOT_ID = LCME_FL_NEG +}; + +#define LCME_ID_MASK LCME_ID_MAX + +struct lov_comp_md_entry_v1 { + __u32 lcme_id; /* unique id of component */ + __u32 lcme_flags; /* LCME_FL_XXX */ + struct lu_extent lcme_extent; /* file extent for component */ + __u32 lcme_offset; /* offset of component blob, + start from lov_comp_md_v1 */ + __u32 lcme_size; /* size of component blob */ + __u64 lcme_padding[2]; +} __attribute__((packed)); + +enum lov_comp_md_flags; + +struct lov_comp_md_v1 { + __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ + __u32 lcm_size; /* overall size including this struct */ + __u32 lcm_layout_gen; + __u16 lcm_flags; + __u16 lcm_entry_count; + __u64 lcm_padding1; + __u64 lcm_padding2; + struct lov_comp_md_entry_v1 lcm_entries[0]; +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_USER_MAGIC_V1) + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . */ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v1 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ +} __attribute__((packed)); +#endif + +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +}; + +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, + LMV_HASH_TYPE_MAX, +}; + +#define LMV_HASH_NAME_ALL_CHARS "all_char" +#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" + +extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; + +/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_user_mds_data lum_objects[0]; +} __attribute__((packed)); + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + return sizeof(struct lmv_user_md) + + stripes * sizeof(struct lmv_user_mds_data); +} + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. 
*/ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid == NULL) + return NULL; + + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +#define LUSTRE_MAXFSNAME 8 + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p != NULL) + *p = '\0'; +} + +/* printf display format for Lustre FIDs + * usage: printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver + +/* scanf input parse format for fids in DFID_NOBRACE format + * Need to strip '[' from DFID format first or use "["SFID"]" at caller. 
+ * usage: sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) &((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) + +/********* Quotas **********/ + +#define LUSTRE_QUOTABLOCK_BITS 10 +#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) + +static inline __u64 lustre_stoqb(size_t space) +{ + return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; +} + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ + +#define ALLQUOTA 255 /* set all quota */ +static inline char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: .^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define 
SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_XATTR = 15, + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT" + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* per-record flags */ +#define CLF_FLAGSHIFT 12 +#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) +#define CLF_VERMASK (~CLF_FLAGMASK) +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
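+ * Worked example (editor-added, not from the original header): extracting the
+ * HSM event with _h = CLF_HSM_EVENT_H = 9 and _l = CLF_HSM_EVENT_L = 7 from
+ * _b = 0x0380 gives ((0x0380 << 6) & 0xFFFF) >> 13 = 0x7, i.e. bits 7..9.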
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, + CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(int *flags, enum hsm_event he) +{ + *flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(int *flags, int bits) +{ + *flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(int *flags, int error) +{ + *flags |= (error << CLF_HSM_ERR_L); +} + +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = 0x01, + /* Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. */ + CHANGELOG_FLAG_BLOCK = 0x02, + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = 0x04, +}; + +#define CR_MAXSIZE cfs_size_round(2 * NAME_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED)) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 + +/* This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags. + * + * Extensions are packed in the same order as their corresponding flags. + */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< parent fid */ +}; + +/* Changelog extension for RENAME. */ +struct changelog_ext_rename { + lustre_fid cr_sfid; /**< source fid, or zero */ + lustre_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. 
*/ +}; + + +static inline size_t changelog_rec_offset(enum changelog_rec_flags crf) +{ + size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + return size; +} + +static inline size_t changelog_rec_size(const struct changelog_rec *rec) +{ + return changelog_rec_offset(rec->cr_flags); +} + +static inline size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME); + + return (struct changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf)); +} + +/* The name follows the rename and jobid extensions, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + return (char *)rec + changelog_rec_offset(rec->cr_flags & + CLF_SUPPORTED); +} + +static inline size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + char *cr_name = changelog_rec_name(rec); + + return cr_name + strlen(cr_name) + 1; +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. 
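+ *
+ * Illustrative usage (editor-added, not from the original header): a reader
+ * that always wants both extensions can call
+ *   changelog_remap_rec(rec, CLF_VERSION | CLF_RENAME | CLF_JOBID);
+ * after llapi_changelog_recv() and then access changelog_rec_rename(rec) and
+ * changelog_rec_jobid(rec) unconditionally; fields that were absent come back
+ * zeroed.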
+ */ +static inline void changelog_remap_rec(struct changelog_rec *rec, + enum changelog_rec_flags crf_wanted) +{ + char *jid_mov; + char *rnm_mov; + + crf_wanted &= CLF_SUPPORTED; + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) + return; + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of jobid and rename extensions in the remapped record */ + jid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~CLF_JOBID); + rnm_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~(CLF_JOBID | CLF_RENAME)); + + /* Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; +} + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u64 idv_flags; /* See LL_DV_xxx */ +}; +#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */ +#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */ + +#ifndef offsetof +#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_NONE = 0x00000000, + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; +#define HPS_NONE 0 + +static inline const char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. 
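+	 * Illustrative note (editor-added, not from the original header): a
+	 * file whose data has been copied to the archive and then released
+	 * from the OSTs would typically report
+	 * (HS_EXISTS | HS_ARCHIVED | HS_RELEASED).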
*/ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. + * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + +static inline const char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + lustre_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
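+ *
+ * Worked example (editor-added, not from the original header): with
+ * hr_itemcount = 2 and hr_data_len = 0 this returns
+ * sizeof(struct hsm_request) + 2 * sizeof(struct hsm_user_item), i.e. the
+ * fixed header plus two FID/extent items.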
+ */ +static inline ssize_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if (size != (ssize_t)size) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + +static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + lustre_fid hai_fid; /* Lustre FID to operate on */ + lustre_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/** + * helper function which print in hexa the first bytes of + * hai opaque field + * + * \param hai [IN] record to print + * \param buffer [IN,OUT] buffer to write the hex string to + * \param len [IN] max buffer length + * + * \retval buffer + */ +static inline char *hai_dump_data_field(const struct hsm_action_item *hai, + char *buffer, size_t len) +{ + int i; + int data_len; + char *ptr; + + ptr = buffer; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0; (i < data_len) && (len > 2); i++) { + snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); + ptr += 2; + len -= 2; + } + + *ptr = '\0'; + + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
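+   Editor-added iteration sketch (not from the original header):
+     struct hsm_action_item *hai = hai_first(hal);
+     for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai))
+             process(hai);   /* 'process' is a hypothetical handler */
+   using the hai_first()/hai_next() helpers declared after this structure.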
See hai_zero */ +} __attribute__((packed)); + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + return (struct hsm_action_item *)(hal->hal_fsname + + cfs_size_round(strlen(hal-> \ + hal_fsname) + + 1)); +} +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + return (struct hsm_action_item *)((char *)hai + + cfs_size_round(hai->hai_len)); +} + +/* Return size of an hsm_action_list */ +static inline size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += cfs_size_round(hai->hai_len); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + lustre_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/* JSON objects */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. */ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +#define LF_MASK LF_ASYNC + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. It is defined separately as we may need info which is + * only used locally. 
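+ * Illustrative usage (editor-added, not from the original header): an
+ * LL_IOC_LADVISE request carries one llapi_ladvise_hdr followed by lah_count
+ * llapi_lu_ladvise entries in lah_advise[], e.g. a single LU_LADVISE_WILLREAD
+ * advice covering the byte range [lla_start, lla_end].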
*/ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, + SK_CRYPT_MAX = 2, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, + SK_HMAC_MAX = 3, +}; + +struct sk_crypt_type { + char *sct_name; + size_t sct_bytes; +}; + +struct sk_hmac_type { + char *sht_name; + size_t sht_bytes; +}; + +/** @} lustreuser */ +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h new file mode 100644 index 0000000000000..67df286a5c358 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -0,0 +1,822 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LUSTREAPI_H_ +#define _LUSTREAPI_H_ + +/** \defgroup llapi llapi + * + * @{ + */ + +#include +#include +#include + +#ifndef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) +#endif + +extern bool liblustreapi_initialized; + + +typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, + void *args); + +/* lustreapi message severity level */ +enum llapi_message_level { + LLAPI_MSG_OFF = 0, + LLAPI_MSG_FATAL = 1, + LLAPI_MSG_ERROR = 2, + LLAPI_MSG_WARN = 3, + LLAPI_MSG_NORMAL = 4, + LLAPI_MSG_INFO = 5, + LLAPI_MSG_DEBUG = 6, + LLAPI_MSG_MAX +}; + +typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err, + const char *fmt, va_list ap); + + +/* the bottom three bits reserved for llapi_message_level */ +#define LLAPI_MSG_MASK 0x00000007 +#define LLAPI_MSG_NO_ERRNO 0x00000010 + +static inline const char *llapi_msg_level2str(enum llapi_message_level level) +{ + static const char *levels[LLAPI_MSG_MAX] = {"OFF", "FATAL", "ERROR", + "WARNING", "NORMAL", + "INFO", "DEBUG"}; + + if (level >= LLAPI_MSG_MAX) + return NULL; + + return levels[level]; +} +extern void llapi_msg_set_level(int level); +int llapi_msg_get_level(void); +extern llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); +extern llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); + +void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...) + __attribute__((__format__(__printf__, 3, 4))); +#define llapi_err_noerrno(level, fmt, a...) \ + llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a) +void llapi_printf(enum llapi_message_level level, const char *fmt, ...) + __attribute__((__format__(__printf__, 2, 3))); + +struct llapi_stripe_param { + unsigned long long lsp_stripe_size; + char *lsp_pool; + int lsp_stripe_offset; + int lsp_stripe_pattern; + /* Number of stripes. 
Size of lsp_osts[] if lsp_specific is true.*/ + int lsp_stripe_count; + bool lsp_is_specific; + __u32 lsp_osts[0]; +}; + +extern int llapi_file_open_param(const char *name, int flags, mode_t mode, + const struct llapi_stripe_param *param); +extern int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern); +extern int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +extern int llapi_file_create_pool(const char *name, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_poollist(const char *name); +extern int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +extern int llapi_get_poolmembers(const char *poolname, char **members, + int list_size, char *buffer, int buffer_size); +extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +extern int llapi_file_lookup(int dirfd, const char *name); + +#define VERBOSE_COUNT 0x1 +#define VERBOSE_SIZE 0x2 +#define VERBOSE_OFFSET 0x4 +#define VERBOSE_POOL 0x8 +#define VERBOSE_DETAIL 0x10 +#define VERBOSE_OBJID 0x20 +#define VERBOSE_GENERATION 0x40 +#define VERBOSE_MDTINDEX 0x80 +#define VERBOSE_LAYOUT 0x100 +#define VERBOSE_COMP_COUNT 0x200 +#define VERBOSE_COMP_FLAGS 0x400 +#define VERBOSE_COMP_START 0x800 +#define VERBOSE_COMP_END 0x1000 +#define VERBOSE_COMP_ID 0x2000 +#define VERBOSE_DFID 0x4000 +#define VERBOSE_HASH_TYPE 0x8000 +#define VERBOSE_DEFAULT (VERBOSE_COUNT | VERBOSE_SIZE | \ + VERBOSE_OFFSET | VERBOSE_POOL | \ + VERBOSE_OBJID | VERBOSE_GENERATION | \ + VERBOSE_LAYOUT | VERBOSE_HASH_TYPE | \ + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | \ + VERBOSE_COMP_START | VERBOSE_COMP_END | \ + VERBOSE_COMP_ID) + +struct find_param { + unsigned int fp_max_depth; + dev_t fp_dev; + mode_t fp_type; /* S_IFIFO,... */ + uid_t fp_uid; + gid_t fp_gid; + time_t fp_atime; + time_t fp_mtime; + time_t fp_ctime; + /* {a,m,c}sign cannot be bitfields due to using pointers to + * access them during argument parsing. 
*/ + int fp_asign; + int fp_msign; + int fp_csign; + /* these need to be signed values */ + int fp_size_sign:2, + fp_stripe_size_sign:2, + fp_stripe_count_sign:2, + fp_comp_start_sign:2, + fp_comp_end_sign:2, + fp_comp_count_sign:2, + fp_mdt_count_sign:2; + unsigned long long fp_size; + unsigned long long fp_size_units; + + unsigned long long fp_zero_end:1, + fp_recursive:1, + fp_exclude_pattern:1, + fp_exclude_type:1, + fp_exclude_obd:1, + fp_exclude_mdt:1, + fp_exclude_gid:1, + fp_exclude_uid:1, + fp_check_gid:1, + fp_check_uid:1, + fp_check_pool:1, /* LOV pool name */ + fp_check_size:1, /* file size */ + fp_exclude_pool:1, + fp_exclude_size:1, + fp_exclude_atime:1, + fp_exclude_mtime:1, + fp_exclude_ctime:1, + fp_get_lmv:1, /* get MDT list from LMV */ + fp_raw:1, /* do not fill in defaults */ + fp_check_stripe_size:1, /* LOV stripe size */ + fp_exclude_stripe_size:1, + fp_check_stripe_count:1, /* LOV stripe count */ + fp_exclude_stripe_count:1, + fp_check_layout:1, + fp_exclude_layout:1, + fp_get_default_lmv:1, /* Get default LMV */ + fp_migrate:1, + fp_check_projid:1, + fp_exclude_projid:1, + fp_check_comp_count:1, + fp_exclude_comp_count:1, + fp_check_comp_flags:1, + fp_exclude_comp_flags:1, + fp_check_comp_start:1, + fp_exclude_comp_start:1, + fp_check_comp_end:1, + fp_exclude_comp_end:1, + fp_check_comp_id:1, + fp_exclude_comp_id:1, + fp_check_mdt_count:1, + fp_exclude_mdt_count:1, + fp_check_hash_type:1, + fp_exclude_hash_type:1, + fp_yaml:1; /* output layout in YAML */ + + int fp_verbose; + int fp_quiet; + + /* regular expression */ + char *fp_pattern; + + struct obd_uuid *fp_obd_uuid; + int fp_num_obds; + int fp_num_alloc_obds; + int fp_obd_index; + int *fp_obd_indexes; + + struct obd_uuid *fp_mdt_uuid; + int fp_num_mdts; + int fp_num_alloc_mdts; + int fp_mdt_index; + int *fp_mdt_indexes; + int fp_file_mdt_index; + + size_t fp_lum_size; + struct lov_user_mds_data *fp_lmd; + + char fp_poolname[LOV_MAXPOOLNAME + 1]; + + __u32 fp_lmv_stripe_count; + struct lmv_user_md *fp_lmv_md; + + unsigned long long fp_stripe_size; + unsigned long long fp_stripe_size_units; + unsigned long long fp_stripe_count; + __u32 fp_layout; + + __u32 fp_comp_count; + __u32 fp_comp_flags; + __u32 fp_comp_id; + unsigned long long fp_comp_start; + unsigned long long fp_comp_start_units; + unsigned long long fp_comp_end; + unsigned long long fp_comp_end_units; + unsigned long long fp_mdt_count; + unsigned fp_projid; + + /* In-process parameters. 
*/ + unsigned long fp_got_uuids:1, + fp_obds_printed:1; + unsigned int fp_depth; + unsigned int fp_hash_type; +}; + +extern int llapi_ostlist(char *path, struct find_param *param); +extern int llapi_uuid_match(char *real_uuid, char *search_uuid); +extern int llapi_getstripe(char *path, struct find_param *param); +extern int llapi_find(char *path, struct find_param *param); + +extern int llapi_file_fget_mdtidx(int fd, int *mdtidx); +extern int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); +int llapi_direntry_remove(char *dname); + +int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, + struct obd_uuid *uuid_buf); +extern int llapi_ping(char *obd_type, char *obd_name); +extern int llapi_target_check(int num_types, char **obd_types, char *dir); +extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +extern int llapi_is_lustre_mnttype(const char *type); +extern int llapi_search_ost(char *fsname, char *poolname, char *ostname); +extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +extern int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +extern int llapi_search_mounts(const char *pathname, int index, + char *mntdir, char *fsname); +extern int llapi_search_fsname(const char *pathname, char *fsname); +extern int llapi_getname(const char *path, char *buf, size_t size); +extern int llapi_search_fileset(const char *pathname, char *fileset); + +extern int llapi_search_rootpath(char *pathname, const char *fsname); +extern int llapi_nodemap_exists(const char *name); +extern int llapi_migrate_mdt(char *path, struct find_param *param); +extern int llapi_mv(char *path, struct find_param *param); + +struct mntent; +#define HAVE_LLAPI_IS_LUSTRE_MNT +extern int llapi_is_lustre_mnt(struct mntent *mnt); +extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +extern int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +extern int llapi_get_connect_flags(const char *mnt, __u64 *flags); +extern int llapi_cp(int argc, char *argv[]); +extern int llapi_ls(int argc, char *argv[]); +extern int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +extern int llapi_path2fid(const char *path, lustre_fid *fid); +extern int llapi_get_mdt_index_by_fid(int fd, const lustre_fid *fid, + int *mdt_index); +extern int llapi_fd2fid(const int fd, lustre_fid *fid); +/* get FID of parent dir + the related name of entry in this parent dir */ +extern int llapi_path2parent(const char *path, unsigned int linkno, + lustre_fid *parent_fid, char *name, + size_t name_size); +extern int llapi_fd2parent(int fd, unsigned int linkno, + lustre_fid *parent_fid, char *name, + size_t name_size); +extern int llapi_chomp_string(char *buf); 
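+
+/*
+ * Illustrative sketch only (not part of the upstream header): a typical
+ * path -> FID -> path round trip using the helpers declared above.  The
+ * mount point "/mnt/lustre" and the DFID/PFID formatting macros are
+ * assumptions made for this example; error handling is omitted.
+ *
+ *      lustre_fid fid;
+ *      char fidstr[64], path[PATH_MAX];
+ *      long long recno = -1;
+ *      int linkno = 0;
+ *
+ *      if (llapi_path2fid("/mnt/lustre/dir/file", &fid) == 0) {
+ *              snprintf(fidstr, sizeof(fidstr), DFID, PFID(&fid));
+ *              llapi_fid2path("/mnt/lustre", fidstr, path, sizeof(path),
+ *                             &recno, &linkno);
+ *      }
+ */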
+extern int llapi_open_by_fid(const char *dir, const lustre_fid *fid, + int open_flags); + +extern int llapi_get_version_string(char *version, unsigned int version_size); +/* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ +extern int llapi_get_version(char *buffer, int buffer_size, char **version) + __attribute__((deprecated)); +extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +extern int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +extern int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +extern int llapi_hsm_register_event_fifo(const char *path); +extern int llapi_hsm_unregister_event_fifo(const char *path); +extern void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +extern int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +extern int llapi_create_volatile_idx(char *directory, int idx, int mode); +static inline int llapi_create_volatile(char *directory, int mode) +{ + return llapi_create_volatile_idx(directory, -1, mode); +} + + +extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, + __u64 flags); +extern int llapi_swap_layouts(const char *path1, const char *path2, + __u64 dv1, __u64 dv2, __u64 flags); + +/* Changelog interface. priv is private state, managed internally by these + * functions */ + +/* Records received are in extended format now, though most of them are still + * written in disk in changelog_rec format (to save space and time), it's + * converted to extended format in the lustre api to ease changelog analysis. */ +#define HAVE_CHANGELOG_EXTEND_REC 1 + +extern int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +extern int llapi_changelog_fini(void **priv); +extern int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +extern int llapi_changelog_free(struct changelog_rec **rech); +extern int llapi_changelog_get_fd(void *priv); +/* Allow records up to endrec to be destroyed; requires registered id. */ +extern int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); + +/* HSM copytool interface. 
+ * priv is private state, managed internally by these functions + */ +struct hsm_copytool_private; +struct hsm_copyaction_private; + +extern int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +extern int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +extern int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +extern int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +extern int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const struct hsm_extent *he, + int hp_flags, int errval); +extern int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +extern int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + lustre_fid *fid); +extern int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +extern int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, + char *pool_name, lustre_fid *newfid); + +/* HSM user interface */ +extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +extern int llapi_hsm_request(const char *path, + const struct hsm_user_request *request); +extern int llapi_hsm_current_action(const char *path, + struct hsm_current_action *hca); + +/* JSON handling */ +extern int llapi_json_init_list(struct llapi_json_item_list **item_list); +extern int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +extern int llapi_json_add_item(struct llapi_json_item_list **item_list, + char *key, __u32 type, void *val); +extern int llapi_json_write_list(struct llapi_json_item_list **item_list, + FILE *fp); + +/* File lease */ +extern int llapi_lease_get(int fd, int mode); +extern int llapi_lease_check(int fd); +extern int llapi_lease_put(int fd); + +/* Group lock */ +int llapi_group_lock(int fd, int gid); +int llapi_group_unlock(int fd, int gid); + +/* Ladvise */ +int llapi_ladvise(int fd, unsigned long long flags, int num_advise, + struct llapi_lu_ladvise *ladvise); +/** @} llapi */ + +/* llapi_layout user interface */ + +/** Opaque data type abstracting the layout of a Lustre file. */ +struct llapi_layout; + +/* + * Flags to control how layouts are retrieved. + */ + +/* Replace non-specified values with expected inherited values. */ +#define LAYOUT_GET_EXPECTED 0x1 + +/** + * Return a pointer to a newly-allocated opaque data structure containing + * the layout for the file at \a path. The pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is indicated + * by a NULL return value and an appropriate error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_path(const char *path, uint32_t flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file referenced by open file descriptor \a fd. The + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is indicated by a NULL return value and an + * appropriate error code stored in errno. 
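+ *
+ * A minimal usage sketch (illustrative only; error handling is omitted
+ * and \a fd is assumed to refer to a file on a Lustre filesystem):
+ *
+ *      struct llapi_layout *layout = llapi_layout_get_by_fd(fd, 0);
+ *      uint64_t count = 0, size = 0;
+ *
+ *      if (layout != NULL) {
+ *              llapi_layout_stripe_count_get(layout, &count);
+ *              llapi_layout_stripe_size_get(layout, &size);
+ *              llapi_layout_free(layout);
+ *      }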
+ */ +struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with Lustre file identifier string + * \a fidstr. The string \a path must name a path within the + * filesystem that contains the file being looked up, such as the + * filesystem root. The returned pointer should be freed with + * llapi_layout_free() when it is no longer needed. Failure is + * indicated with a NULL return value and an appropriate error code + * stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_fid(const char *path, + const lustre_fid *fid, + uint32_t flags); + +/** + * Allocate a new layout. Use this when creating a new file with + * llapi_layout_file_create(). + */ +struct llapi_layout *llapi_layout_alloc(void); + +/** + * Free memory allocated for \a layout. + */ +void llapi_layout_free(struct llapi_layout *layout); + +/** Not a valid stripe size, offset, or RAID pattern. */ +#define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL + +/** + * When specified or returned as the value for stripe count, + * stripe size, offset, or RAID pattern, the filesystem-wide + * default behavior will apply. + */ +#define LLAPI_LAYOUT_DEFAULT (LLAPI_LAYOUT_INVALID + 1) + +/** + * When specified or returned as the value for stripe count, all + * available OSTs will be used. + */ +#define LLAPI_LAYOUT_WIDE (LLAPI_LAYOUT_INVALID + 2) + +/** + * When specified as the value for layout pattern, file objects will be + * stored using RAID0. That is, data will be split evenly and without + * redundancy across all OSTs in the layout. + */ +#define LLAPI_LAYOUT_RAID0 0 + +/** +* The layout includes a specific set of OSTs on which to allocate. +*/ +#define LLAPI_LAYOUT_SPECIFIC 0x2000000000000000ULL + +/** + * A valid ost index should be less than maximum valid OST index (UINT_MAX). + */ +#define LLAPI_LAYOUT_IDX_MAX 0x00000000FFFFFFFFULL + +/** + * Flags to modify how layouts are retrieved. + */ +/******************** Stripe Count ********************/ + +/** + * Store the stripe count of \a layout in \a count. + * + * \retval 0 Success + * \retval -1 Error with status code in errno. + */ +int llapi_layout_stripe_count_get(const struct llapi_layout *layout, + uint64_t *count); + +/** + * Set the stripe count of \a layout to \a count. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_count_set(struct llapi_layout *layout, uint64_t count); + +/******************** Stripe Size ********************/ + +/** + * Store the stripe size of \a layout in \a size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_get(const struct llapi_layout *layout, + uint64_t *size); + +/** + * Set the stripe size of \a layout to \a stripe_size. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. + */ +int llapi_layout_stripe_size_set(struct llapi_layout *layout, uint64_t size); + +/******************** Stripe Pattern ********************/ + +/** + * Store the stripe pattern of \a layout in \a pattern. + * + * \retval 0 Success. + * \retval -1 Error with status code in errno. + */ +int llapi_layout_pattern_get(const struct llapi_layout *layout, + uint64_t *pattern); + +/** + * Set the stripe pattern of \a layout to \a pattern. + * + * \retval 0 Success. + * \retval -1 Invalid argument, errno set to EINVAL. 
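+ *
+ * For example (an illustrative sketch only), a freshly allocated layout
+ * can be pinned to the plain RAID0 pattern before a file is created
+ * with it:
+ *
+ *      struct llapi_layout *layout = llapi_layout_alloc();
+ *
+ *      if (layout != NULL)
+ *              llapi_layout_pattern_set(layout, LLAPI_LAYOUT_RAID0);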
+ */
+int llapi_layout_pattern_set(struct llapi_layout *layout, uint64_t pattern);
+
+/******************** OST Index ********************/
+
+/**
+ * Store the index of the OST where stripe number \a stripe_number is stored
+ * in \a index.
+ *
+ * An error return value will result from a NULL layout, if \a
+ * stripe_number is out of range, or if \a layout was not initialized
+ * with llapi_layout_lookup_by{path,fd,fid}().
+ *
+ * \retval 0 Success
+ * \retval -1 Invalid argument, errno set to EINVAL.
+ */
+int llapi_layout_ost_index_get(const struct llapi_layout *layout,
+                               uint64_t stripe_number, uint64_t *index);
+
+/**
+ * Set the OST index associated with stripe number \a stripe_number to
+ * \a ost_index.
+ * NB: This is currently supported only for \a stripe_number = 0 and
+ * other usage will return ENOTSUPP in errno. A NULL \a layout or
+ * out-of-range \a stripe_number will return EINVAL in errno.
+ *
+ * \retval 0 Success.
+ * \retval -1 Error with errno set to non-zero value.
+ */
+int llapi_layout_ost_index_set(struct llapi_layout *layout, int stripe_number,
+                               uint64_t index);
+
+/******************** Pool Name ********************/
+
+/**
+ * Store up to \a pool_name_len characters of the name of the pool of
+ * OSTs associated with \a layout into the buffer pointed to by
+ * \a pool_name.
+ *
+ * The correct calling form is:
+ *
+ *   llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name));
+ *
+ * A pool defines a set of OSTs from which file objects may be
+ * allocated for a file using \a layout.
+ *
+ * On success, the number of bytes stored is returned, excluding the
+ * terminating '\0' character (zero indicates that \a layout does not
+ * have an associated OST pool). On error, -1 is returned and errno is
+ * set appropriately. Possible sources of error include a NULL pointer
+ * argument or insufficient space in \a pool_name to store the pool name,
+ * in which cases errno will be set to EINVAL.
+ *
+ * \retval 0+ The number of bytes stored in \a pool_name.
+ * \retval -1 Invalid argument, errno set to EINVAL.
+ */
+int llapi_layout_pool_name_get(const struct llapi_layout *layout,
+                               char *pool_name, size_t pool_name_len);
+
+/**
+ * Set the name of the pool of OSTs from which file objects will be
+ * allocated to \a pool_name.
+ *
+ * If the pool name uses "fsname.pool" notation to qualify the pool name
+ * with a filesystem name, the "fsname." portion will be silently
+ * discarded before storing the value. No validation that \a pool_name
+ * is an existing non-empty pool in filesystem \a fsname will be
+ * performed. Such validation can be performed by the application if
+ * desired using the llapi_search_ost() function. The maximum length of
+ * the stored value is defined by the constant LOV_MAXPOOLNAME.
+ *
+ * \retval 0 Success.
+ * \retval -1 Invalid argument, errno set to EINVAL.
+ */
+int llapi_layout_pool_name_set(struct llapi_layout *layout,
+                               const char *pool_name);
+
+/******************** File Creation ********************/
+
+/**
+ * Open an existing file at \a path, or create it with the specified
+ * \a layout and \a mode.
+ *
+ * One access mode and zero or more file creation flags and file status
+ * flags may be bitwise-or'd in \a open_flags (see open(2)). Return an
+ * open file descriptor for the file. If \a layout is non-NULL and
+ * \a path is not on a Lustre filesystem this function will fail and set
+ * errno to ENOTTY.
+ *
+ * An already existing file may be opened with this function, but
+ * \a layout and \a mode will not be applied to it. Callers requiring a
+ * guarantee that the opened file is created with the specified
+ * \a layout and \a mode should use llapi_layout_file_create().
+ *
+ * A NULL \a layout may be specified, in which case the standard Lustre
+ * behavior for assigning layouts to newly-created files will apply.
+ *
+ * \retval 0+ An open file descriptor.
+ * \retval -1 Error with status code in errno.
+ */
+int llapi_layout_file_open(const char *path, int open_flags, mode_t mode,
+                           const struct llapi_layout *layout);
+
+/**
+ * Create a new file at \a path with the specified \a layout and \a mode.
+ *
+ * One access mode and zero or more file creation flags and file status
+ * flags may be bitwise-or'd in \a open_flags (see open(2)). Return an
+ * open file descriptor for the file. If \a layout is non-NULL and
+ * \a path is not on a Lustre filesystem this function will fail and set
+ * errno to ENOTTY.
+ *
+ * The function call
+ *
+ *   llapi_layout_file_create(path, open_flags, mode, layout)
+ *
+ * shall be equivalent to:
+ *
+ *   llapi_layout_file_open(path, open_flags|O_CREAT|O_EXCL, mode, layout)
+ *
+ * It is an error if \a path specifies an existing file.
+ *
+ * A NULL \a layout may be specified, in which case the standard Lustre
+ * behavior for assigning layouts to newly-created files will apply.
+ *
+ * \retval 0+ An open file descriptor.
+ * \retval -1 Error with status code in errno.
+ */
+int llapi_layout_file_create(const char *path, int open_flags, int mode,
+                             const struct llapi_layout *layout);
+
+/**
+ * Fetch the start and end offset of the current layout component.
+ */
+int llapi_layout_comp_extent_get(const struct llapi_layout *layout,
+                                 uint64_t *start, uint64_t *end);
+/**
+ * Set the extent of the current layout component.
+ */
+int llapi_layout_comp_extent_set(struct llapi_layout *layout,
+                                 uint64_t start, uint64_t end);
+
+/* PFL component flags table */
+static const struct comp_flag_name {
+        enum lov_comp_md_entry_flags cfn_flag;
+        const char *cfn_name;
+} comp_flags_table[] = {
+        { LCME_FL_INIT, "init" },
+        /* For now, only "init" is supported
+        { LCME_FL_PRIMARY, "primary" },
+        { LCME_FL_STALE, "stale" },
+        { LCME_FL_OFFLINE, "offline" },
+        { LCME_FL_PREFERRED, "preferred" }
+        */
+};
+
+/**
+ * Gets the attribute flags of the current component.
+ */
+int llapi_layout_comp_flags_get(const struct llapi_layout *layout,
+                                uint32_t *flags);
+/**
+ * Sets the specified flags of the current component, leaving other flags as-is.
+ */
+int llapi_layout_comp_flags_set(struct llapi_layout *layout, uint32_t flags);
+/**
+ * Clears the specified flags of the current component, leaving other flags as-is.
+ */
+int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags);
+/**
+ * Fetches the file-unique component ID of the current layout component.
+ */
+int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id);
+/**
+ * Adds one component to the existing composite or plain layout.
+ */
+int llapi_layout_comp_add(struct llapi_layout *layout);
+/**
+ * Deletes the current layout component from the composite layout.
+ */
+int llapi_layout_comp_del(struct llapi_layout *layout);
+
+enum llapi_layout_comp_use {
+        LLAPI_LAYOUT_COMP_USE_FIRST = 1,
+        LLAPI_LAYOUT_COMP_USE_LAST = 2,
+        LLAPI_LAYOUT_COMP_USE_NEXT = 3,
+        LLAPI_LAYOUT_COMP_USE_PREV = 4,
+};
+
+/**
+ * Set the currently active component to the specified component ID.
+ */
+int llapi_layout_comp_use_id(struct llapi_layout *layout, uint32_t id);
+/**
+ * Select the currently active component at the specified position.
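+ *
+ * Illustrative sketch of walking the components of a composite layout.
+ * It assumes (as an assumption of this example, not a guarantee of the
+ * interface) that a positive return value means no further component is
+ * available; error handling is omitted:
+ *
+ *      uint64_t start, end;
+ *      int rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+ *
+ *      while (rc == 0) {
+ *              llapi_layout_comp_extent_get(layout, &start, &end);
+ *              rc = llapi_layout_comp_use(layout,
+ *                                         LLAPI_LAYOUT_COMP_USE_NEXT);
+ *      }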
+ */ +int llapi_layout_comp_use(struct llapi_layout *layout, uint32_t pos); +/** + * Add layout components to an existing file. + */ +int llapi_layout_file_comp_add(const char *path, + const struct llapi_layout *layout); +/** + * Delete component(s) by the specified component id or flags. + */ +int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); +/** + * Change flags or other parameters of the component(s) by component ID of an + * existing file. The component to be modified is specified by the + * comp->lcme_id value, which must be an unique component ID. The new + * attributes are passed in by @comp and @valid is used to specify which + * attributes in the component are going to be changed. + */ +int llapi_layout_file_comp_set(const char *path, + const struct llapi_layout *comp, + uint32_t valid); + +/** @} llapi */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h new file mode 100644 index 0000000000000..beab4a225119f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include +#include +#ifdef CONFIG_FS_POSIX_ACL +# include +# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) +#endif /* CONFIG_FS_POSIX_ACL */ + +#ifndef LUSTRE_POSIX_ACL_MAX_SIZE_OLD +# define LUSTRE_POSIX_ACL_MAX_SIZE_OLD 0 +#endif /* LUSTRE_POSIX_ACL_MAX_SIZE */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h new file mode 100644 index 0000000000000..231eae97972ee --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * + * lustre/include/lustre_barrier.h + * + * Lustre write barrier (on MDT) exported functions. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_BARRIER_H +# define _LUSTRE_BARRIER_H + +#include +#include + +bool barrier_entry(struct dt_device *key); +void barrier_exit(struct dt_device *key); +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req); +int barrier_register(struct dt_device *key, struct dt_device *next); +void barrier_deregister(struct dt_device *key); + +#endif /* _LUSTRE_BARRIER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h new file mode 100644 index 0000000000000..ef9ec2af53905 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -0,0 +1,667 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LUSTRE_COMPAT_H +#define _LUSTRE_COMPAT_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef HAVE_FS_STRUCT_RWLOCK +# define LOCK_FS_STRUCT(fs) write_lock(&(fs)->lock) +# define UNLOCK_FS_STRUCT(fs) write_unlock(&(fs)->lock) +#else +# define LOCK_FS_STRUCT(fs) spin_lock(&(fs)->lock) +# define UNLOCK_FS_STRUCT(fs) spin_unlock(&(fs)->lock) +#endif + +#ifdef HAVE_FS_STRUCT_SEQCOUNT +# define WRITE_FS_SEQ_BEGIN(fs) write_seqcount_begin(&(fs)->seq) +# define WRITE_FS_SEQ_END(fs) write_seqcount_end(&(fs)->seq) +#else +# define WRITE_FS_SEQ_BEGIN(fs) +# define WRITE_FS_SEQ_END(fs) +#endif +static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct path path; + struct path old_pwd; + + path.mnt = mnt; + path.dentry = dentry; + path_get(&path); + LOCK_FS_STRUCT(fs); + WRITE_FS_SEQ_BEGIN(fs); + old_pwd = fs->pwd; + fs->pwd = path; + WRITE_FS_SEQ_END(fs); + UNLOCK_FS_STRUCT(fs); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +/* + * set ATTR_BLOCKS to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_BLOCKS (1 << 27) + +#define current_ngroups current_cred()->group_info->ngroups +#define current_groups current_cred()->group_info->small_block + +/* + * OBD need working random driver, thus all our + * initialization routines must be called after device + * driver initialization + */ +#ifndef MODULE +#undef module_init +#define module_init(a) late_initcall(a) +#endif + +#ifndef MODULE_ALIAS_FS +#define MODULE_ALIAS_FS(name) +#endif + +#define LTIME_S(time) (time.tv_sec) + +#ifdef HAVE_GENERIC_PERMISSION_2ARGS +# define ll_generic_permission(inode, mask, flags, check_acl) \ + generic_permission(inode, mask) +#elif defined HAVE_GENERIC_PERMISSION_4ARGS +# define ll_generic_permission(inode, mask, flags, check_acl) \ + generic_permission(inode, mask, flags, check_acl) +#else +# define ll_generic_permission(inode, mask, flags, check_acl) \ + generic_permission(inode, mask, check_acl) +#endif + +#ifdef HAVE_4ARGS_VFS_SYMLINK +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path, mode) +#else +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path) +#endif + +#if !defined(HAVE_FILE_LLSEEK_SIZE) || defined(HAVE_FILE_LLSEEK_SIZE_5ARGS) +#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \ + generic_file_llseek_size(file, offset, origin, maxbytes, eof); +#else +#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \ + generic_file_llseek_size(file, offset, origin, maxbytes); +#endif + +#ifdef HAVE_INODE_DIO_WAIT +/* inode_dio_wait(i) use as-is for write lock */ +# define inode_dio_write_done(i) do {} while (0) /* for write unlock */ +#else +# define inode_dio_wait(i) down_write(&(i)->i_alloc_sem) +# define inode_dio_write_done(i) up_write(&(i)->i_alloc_sem) +#endif + +#ifndef FS_HAS_FIEMAP +#define FS_HAS_FIEMAP (0) +#endif + +#ifndef HAVE_SIMPLE_SETATTR +#define simple_setattr(dentry, ops) inode_setattr((dentry)->d_inode, ops) +#endif + +#ifndef SLAB_DESTROY_BY_RCU +#define SLAB_DESTROY_BY_RCU 0 +#endif + +#ifndef HAVE_DQUOT_SUSPEND +# define ll_vfs_dq_init vfs_dq_init +# define ll_vfs_dq_drop vfs_dq_drop +# define ll_vfs_dq_transfer vfs_dq_transfer +# define ll_vfs_dq_off(sb, remount) vfs_dq_off(sb, remount) +#else +# define ll_vfs_dq_init dquot_initialize +# define ll_vfs_dq_drop dquot_drop +# define ll_vfs_dq_transfer 
dquot_transfer +# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1) +#endif + +#ifndef HAVE_BLKDEV_GET_BY_DEV +# define blkdev_get_by_dev(dev, mode, holder) open_by_devnum(dev, mode) +#endif + +#ifdef HAVE_BVEC_ITER +#define bio_idx(bio) (bio->bi_iter.bi_idx) +#define bio_set_sector(bio, sector) (bio->bi_iter.bi_sector = sector) +#define bvl_to_page(bvl) (bvl->bv_page) +#else +#define bio_idx(bio) (bio->bi_idx) +#define bio_set_sector(bio, sector) (bio->bi_sector = sector) +#define bio_sectors(bio) ((bio)->bi_size >> 9) +#ifndef HAVE_BIO_END_SECTOR +#define bio_end_sector(bio) (bio->bi_sector + bio_sectors(bio)) +#endif +#define bvl_to_page(bvl) (bvl->bv_page) +#endif + +#ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS +#define blk_queue_max_segments(rq, seg) \ + do { blk_queue_max_phys_segments(rq, seg); \ + blk_queue_max_hw_segments(rq, seg); } while (0) +#else +#define queue_max_phys_segments(rq) queue_max_segments(rq) +#define queue_max_hw_segments(rq) queue_max_segments(rq) +#endif + +#ifdef HAVE_BLK_PLUG +#define DECLARE_PLUG(plug) struct blk_plug plug +#else /* !HAVE_BLK_PLUG */ +#define DECLARE_PLUG(name) +#define blk_start_plug(plug) do {} while (0) +#define blk_finish_plug(plug) do {} while (0) +#endif + +#ifdef HAVE_KMAP_ATOMIC_HAS_1ARG +#define ll_kmap_atomic(a, b) kmap_atomic(a) +#define ll_kunmap_atomic(a, b) kunmap_atomic(a) +#else +#define ll_kmap_atomic(a, b) kmap_atomic(a, b) +#define ll_kunmap_atomic(a, b) kunmap_atomic(a, b) +#endif + +#ifndef HAVE_CLEAR_INODE +#define clear_inode(i) end_writeback(i) +#endif + +#ifndef HAVE_DENTRY_D_CHILD +#define d_child d_u.d_child +#endif + +#ifdef HAVE_DENTRY_D_U_D_ALIAS +#define d_alias d_u.d_alias +#endif + +#ifndef DATA_FOR_LLITE_IS_LIST +#define ll_d_hlist_node hlist_node +#define ll_d_hlist_empty(list) hlist_empty(list) +#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry) +# ifdef HAVE_HLIST_FOR_EACH_3ARG +# define ll_d_hlist_for_each_entry(dentry, p, i_dentry) \ + p = NULL; hlist_for_each_entry(dentry, i_dentry, d_alias) +# else +# define ll_d_hlist_for_each_entry(dentry, p, i_dentry) \ + hlist_for_each_entry(dentry, p, i_dentry, d_alias) +# endif +#define DECLARE_LL_D_HLIST_NODE_PTR(name) struct ll_d_hlist_node *name +#else +#define ll_d_hlist_node list_head +#define ll_d_hlist_empty(list) list_empty(list) +#define ll_d_hlist_entry(ptr, type, name) list_entry(ptr.next, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) list_for_each(tmp, i_dentry) +#define ll_d_hlist_for_each_entry(dentry, p, i_dentry) \ + list_for_each_entry(dentry, i_dentry, d_alias) +#define DECLARE_LL_D_HLIST_NODE_PTR(name) /* nothing */ +#endif /* !DATA_FOR_LLITE_IS_LIST */ + +#ifndef QUOTA_OK +# define QUOTA_OK 0 +#endif +#ifndef NO_QUOTA +# define NO_QUOTA (-EDQUOT) +#endif + +#ifndef SEEK_DATA +#define SEEK_DATA 3 /* seek to the next data */ +#endif +#ifndef SEEK_HOLE +#define SEEK_HOLE 4 /* seek to the next hole */ +#endif + +#ifndef FMODE_UNSIGNED_OFFSET +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#endif + +#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) +# define ext2_set_bit __test_and_set_bit_le +# define ext2_clear_bit __test_and_clear_bit_le +# define ext2_test_bit test_bit_le +# define ext2_find_first_zero_bit find_first_zero_bit_le +# define ext2_find_next_zero_bit find_next_zero_bit_le +#endif + +#ifdef ATTR_TIMES_SET +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +#else 
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET) +#endif + +#ifndef XATTR_NAME_POSIX_ACL_ACCESS +# define XATTR_NAME_POSIX_ACL_ACCESS POSIX_ACL_XATTR_ACCESS +#endif + +#ifndef XATTR_NAME_POSIX_ACL_DEFAULT +# define XATTR_NAME_POSIX_ACL_DEFAULT POSIX_ACL_XATTR_DEFAULT +#endif + +#ifndef HAVE_LM_XXX_LOCK_MANAGER_OPS +# define lm_compare_owner fl_compare_owner +#endif + +/* + * After 3.1, kernel's nameidata.intent.open.flags is different + * with lustre's lookup_intent.it_flags, as lustre's it_flags' + * lower bits equal to FMODE_xxx while kernel doesn't transliterate + * lower bits of nameidata.intent.open.flags to FMODE_xxx. + * */ +#include +static inline int ll_namei_to_lookup_intent_flag(int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0) + flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag); +#endif + return flag; +} + +#include +#ifndef HAVE_PROTECT_I_NLINK +static inline void set_nlink(struct inode *inode, unsigned int nlink) +{ + inode->i_nlink = nlink; +} +#endif + +#ifdef HAVE_INODEOPS_USE_UMODE_T +# define ll_umode_t umode_t +#else +# define ll_umode_t int +#endif + +#include +#ifndef HAVE_D_MAKE_ROOT +static inline struct dentry *d_make_root(struct inode *root) +{ + struct dentry *res = d_alloc_root(root); + + if (res == NULL && root) + iput(root); + + return res; +} +#endif + +#ifdef HAVE_DIRTY_INODE_HAS_FLAG +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) +#else +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode)) +#endif + +#ifdef HAVE_FILE_F_INODE +# define set_file_inode(file, inode) (file)->f_inode = inode +#else +# define set_file_inode(file, inode) +#endif + +#ifndef HAVE_FILE_INODE +static inline struct inode *file_inode(const struct file *file) +{ + return file->f_path.dentry->d_inode; +} +#endif + +#ifdef HAVE_OLDSIZE_TRUNCATE_PAGECACHE +#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, 0, size) +#else +#define ll_truncate_pagecache(inode, size) truncate_pagecache(inode, size) +#endif + +#ifdef HAVE_VFS_RENAME_5ARGS +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL) +#elif defined HAVE_VFS_RENAME_6ARGS +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d, NULL, 0) +#else +#define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d) +#endif + +#ifdef HAVE_VFS_UNLINK_3ARGS +#define ll_vfs_unlink(a, b) vfs_unlink(a, b, NULL) +#else +#define ll_vfs_unlink(a, b) vfs_unlink(a, b) +#endif + +#ifndef HAVE_INODE_LOCK +# define inode_lock(inode) mutex_lock(&(inode)->i_mutex) +# define inode_unlock(inode) mutex_unlock(&(inode)->i_mutex) +# define inode_trylock(inode) mutex_trylock(&(inode)->i_mutex) +#endif + +#ifndef HAVE_RADIX_EXCEPTION_ENTRY +static inline int radix_tree_exceptional_entry(void *arg) +{ + return 0; +} +#endif + +#ifndef HAVE_TRUNCATE_INODE_PAGES_FINAL +static inline void truncate_inode_pages_final(struct address_space *map) +{ + truncate_inode_pages(map, 0); + /* Workaround for LU-118 */ + if (map->nrpages) { + spin_lock_irq(&map->tree_lock); + spin_unlock_irq(&map->tree_lock); + } /* Workaround end */ +} +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#ifdef HAVE_SECURITY_IINITSEC_CALLBACK +# define ll_security_inode_init_security(inode, dir, name, value, len, \ + initxattrs, dentry) \ + security_inode_init_security(inode, dir, &((dentry)->d_name), \ + initxattrs, dentry) +#elif defined HAVE_SECURITY_IINITSEC_QSTR +# define ll_security_inode_init_security(inode, dir, name, value, len, \ + initxattrs, dentry) \ + 
security_inode_init_security(inode, dir, &((dentry)->d_name), \ + name, value, len) +#else /* !HAVE_SECURITY_IINITSEC_CALLBACK && !HAVE_SECURITY_IINITSEC_QSTR */ +# define ll_security_inode_init_security(inode, dir, name, value, len, \ + initxattrs, dentry) \ + security_inode_init_security(inode, dir, name, value, len) +#endif + +#ifndef bio_for_each_segment_all /* since kernel version 3.9 */ +#ifdef HAVE_BVEC_ITER +#define bio_for_each_segment_all(bv, bio, it) \ + for (it = 0, bv = (bio)->bi_io_vec; it < (bio)->bi_vcnt; it++, bv++) +#else +#define bio_for_each_segment_all(bv, bio, it) bio_for_each_segment(bv, bio, it) +#endif +#endif + +#ifdef HAVE_PID_NS_FOR_CHILDREN +# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns_for_children) +#else +# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns) +#endif + +#ifdef HAVE_FULL_NAME_HASH_3ARGS +# define ll_full_name_hash(salt, name, len) full_name_hash(salt, name, len) +#else +# define ll_full_name_hash(salt, name, len) full_name_hash(name, len) +#endif + +#ifdef HAVE_STRUCT_POSIX_ACL_XATTR +# define posix_acl_xattr_header struct posix_acl_xattr_header +# define posix_acl_xattr_entry struct posix_acl_xattr_entry +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((void *)((head) + 1)) +#else +# define GET_POSIX_ACL_XATTR_ENTRY(head) ((head)->a_entries) +#endif + +#ifdef HAVE_IOP_XATTR +#ifdef HAVE_XATTR_HANDLER_FLAGS +#define ll_setxattr generic_setxattr +#define ll_getxattr generic_getxattr +#define ll_removexattr generic_removexattr +#else +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buf, size_t buf_size); +int ll_removexattr(struct dentry *dentry, const char *name); +#endif /* ! 
HAVE_XATTR_HANDLER_FLAGS */ +#endif /* HAVE_IOP_XATTR */ + +#ifndef HAVE_VFS_SETXATTR +const struct xattr_handler *get_xattr_type(const char *name); + +#ifdef HAVE_XATTR_HANDLER_FLAGS +static inline int +__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + const struct xattr_handler *handler; + int rc; + + handler = get_xattr_type(name); + if (!handler) + return -ENXIO; + +#if defined(HAVE_XATTR_HANDLER_INODE_PARAM) + rc = handler->set(handler, dentry, inode, name, value, size, + XATTR_CREATE); +#elif defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + rc = handler->set(handler, dentry, name, value, size, XATTR_CREATE); +#else + rc = handler->set(dentry, name, value, size, XATTR_CREATE, + handler->flags); +#endif /* !HAVE_XATTR_HANDLER_INODE_PARAM */ + return rc; +} +#else /* !HAVE_XATTR_HANDLER_FLAGS */ +static inline int +__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return ll_setxattr(dentry, name, value, size, flags); +} +#endif /* HAVE_XATTR_HANDLER_FLAGS */ +#endif /* HAVE_VFS_SETXATTR */ + +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_FS_POSIX_ACL +#ifndef HAVE_POSIX_ACL_UPDATE_MODE +static inline int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +#endif /* HAVE_POSIX_ACL_UPDATE_MODE */ +#endif +#endif + +#ifndef HAVE_IOV_ITER_TRUNCATE +static inline void iov_iter_truncate(struct iov_iter *i, u64 count) +{ + if (i->count > count) + i->count = count; +} +#endif + +#ifndef HAVE_IS_SXID +static inline bool is_sxid(umode_t mode) +{ + return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); +} +#endif + +#ifndef IS_NOSEC +#define IS_NOSEC(inode) (!is_sxid(inode->i_mode)) +#endif + +#ifndef MS_NOSEC +static inline void inode_has_no_xattr(struct inode *inode) +{ + return; +} +#endif + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) +{ + i->count = count; +} + +static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) +{ + return (struct iovec) { + .iov_base = iter->iov->iov_base + iter->iov_offset, + .iov_len = min(iter->count, + iter->iov->iov_len - iter->iov_offset), + }; +} + +#define iov_for_each(iov, iter, start) \ + for (iter = (start); \ + (iter).count && ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + +static inline ssize_t +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = generic_file_aio_read(iocb, &iov, 1, iocb->ki_pos); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +} + +static inline ssize_t +__generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct iovec iov; + struct iov_iter i; + ssize_t bytes = 0; + + /* Since LLITE updates file size at the end of I/O in + * vvp_io_commit_write(), append write has to be done in atomic when + * there are multiple segments because otherwise each 
iteration to + * __generic_file_aio_write() will see original file size */ + if (unlikely(iocb->ki_filp->f_flags & O_APPEND && iter->nr_segs > 1)) { + struct iovec *iov_copy; + int count = 0; + + OBD_ALLOC(iov_copy, sizeof(*iov_copy) * iter->nr_segs); + if (!iov_copy) + return -ENOMEM; + + iov_for_each(iov, i, *iter) + iov_copy[count++] = iov; + + bytes = __generic_file_aio_write(iocb, iov_copy, count, + &iocb->ki_pos); + OBD_FREE(iov_copy, sizeof(*iov_copy) * iter->nr_segs); + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; + } + + iov_for_each(iov, i, *iter) { + ssize_t res; + + res = __generic_file_aio_write(iocb, &iov, 1, &iocb->ki_pos); + if (res <= 0) { + if (bytes == 0) + bytes = res; + break; + } + + bytes += res; + if (res < iov.iov_len) + break; + } + + if (bytes > 0) + iov_iter_advance(iter, bytes); + return bytes; +} +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +static inline void __user *get_vmf_address(struct vm_fault *vmf) +{ +#ifdef HAVE_VM_FAULT_ADDRESS + return (void __user *)vmf->address; +#else + return vmf->virtual_address; +#endif +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +# define ll_filemap_fault(vma, vmf) filemap_fault(vmf) +#else +# define ll_filemap_fault(vma, vmf) filemap_fault(vma, vmf) +#endif + +#ifndef HAVE_CURRENT_TIME +static inline struct timespec current_time(struct inode *inode) +{ + return CURRENT_TIME; +} +#endif + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_debug.h b/drivers/staging/lustrefsx/lustre/include/lustre_debug.h new file mode 100644 index 0000000000000..bf67e6816a77e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_debug.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_DEBUG_H +#define _LUSTRE_DEBUG_H + +/** \defgroup debug debug + * + * @{ + */ + +#include +#include + +#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) 
\ + CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: " \ + fmt, page, page->mapping, page->index, (long)page->flags, \ + page_count(page), page_private(page), ## arg) + +#define ASSERT_MAX_SIZE_MB 60000ULL +#define ASSERT_PAGE_INDEX(index, OP) \ +do { if (index > ASSERT_MAX_SIZE_MB << (20 - PAGE_SHIFT)) { \ + CERROR("bad page index %lu > %llu\n", index, \ + ASSERT_MAX_SIZE_MB << (20 - PAGE_SHIFT)); \ + libcfs_debug = ~0UL; \ + OP; \ +}} while(0) + +#define ASSERT_FILE_OFFSET(offset, OP) \ +do { if (offset > ASSERT_MAX_SIZE_MB << 20) { \ + CERROR("bad file offset %llu > %llu\n", offset, \ + ASSERT_MAX_SIZE_MB << 20); \ + libcfs_debug = ~0UL; \ + OP; \ +}} while(0) + +/* lib/debug.c */ +void dump_lniobuf(struct niobuf_local *lnb); +int dump_req(struct ptlrpc_request *req); +int block_debug_setup(void *addr, int len, __u64 off, __u64 id); +int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id); + +/** @} debug */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h new file mode 100644 index 0000000000000..9b20b7ba8f09e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -0,0 +1,361 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. 
+ * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include +#include +#include +#include +#include +#include +#include + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + * everything as string options + */ +#define LMD_MAGIC 0xbdacbd03 +#define LMD_PARAMS_MAXLEN 4096 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + u32 lmd_magic; + u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_fileset; /* mount fileset */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + * device_ mount options) */ + char *lmd_params; /* lustre params */ + u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ + char *lmd_nidnet; /* network to restrict this client to */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_SKIP_LFSCK 0x0004 /* NOT auto resume LFSCK when mount */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ +#define LMD_FLG_HSM 0x4000 /* Start coordinator */ +#define LMD_FLG_DEV_RDONLY 0x8000 /* discard modification quitely */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + +/****************** superblock additional info *********************/ +struct ll_sb_info; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + atomic_t lsi_mounts; /* references to the srv_mnt */ + char lsi_svname[MTI_NAME_MAXLEN]; + /* lsi_osd_obdname format = 'lsi->ls_svname'-osd */ + char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; + /* lsi_osd_uuid format = 'lsi->ls_osd_obdname'_UUID */ + char lsi_osd_uuid[MTI_NAME_MAXLEN + 9]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ + struct list_head lsi_lwp_list; + spinlock_t lsi_lwp_lock; + 
unsigned long lsi_lwp_started:1; +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_fileset(sb) (s2lsi(sb)->lsi_lmd->lmd_fileset) + +# ifdef HAVE_SERVER_SUPPORT +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_MASK 0xFFFF + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 +/** store OST index in the IDIF */ +#define OBD_ROCOMPAT_IDX_IN_IDIF 0x00000002 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from u32 to u16 and the remaining 16 + * bits are now used to store a generation. 
Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 +/** multiple RPCs in flight */ +#define OBD_INCOMPAT_MULTI_RPCS 0x00000400 + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + 
lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_generation = le32_to_cpu(buf->lcd_generation); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_generation = cpu_to_le32(lcd->lcd_generation); +} + +static inline u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? 
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +/* obd_mount.c */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +#endif /* HAVE_SERVER_SUPPORT */ + +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); +void obdname2fsname(const char *tgt, char *fsname, size_t buflen); + +#ifdef HAVE_SERVER_SUPPORT +int server_name_is_ost(const char *svname); +int target_name2index(const char *svname, u32 *idx, const char **endptr); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +int lustre_start_mgc(struct super_block *sb); +#endif /* HAVE_SERVER_SUPPORT */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)); +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); + +# ifdef HAVE_SERVER_SUPPORT +/* obd_mount_server.c */ +int server_fill_super(struct super_block *sb); +struct lustre_mount_info *server_get_mount(const char *name); +int server_put_mount(const char *name, bool dereg_mnt); +struct mgs_target_info; +int server_mti_print(const char *title, struct mgs_target_info *mti); +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd); +# endif + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); +int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, int type); + +/** @} disk */ + +#endif /* _LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h new file mode 100644 index 0000000000000..d1cb7c20cf82c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -0,0 +1,1672 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. 
+ * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. + * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include +#include +#include +#include +#include /* for interval_node{}, ldlm_extent */ +#include + +#include "lustre_dlm_flags.h" + +struct obd_ops; +struct obd_device; + +extern struct kset *ldlm_ns_kset; +extern struct kset *ldlm_svc_kset; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */ +#define LDLM_CTIME_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 + +/** + * LDLM non-error return states + */ +enum ldlm_error { + ELDLM_OK = 0, + ELDLM_LOCK_MATCHED = 1, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401, +}; + +/** + * LDLM namespace type. + * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +enum ldlm_side { + LDLM_NAMESPACE_SERVER = 0x01, + LDLM_NAMESPACE_CLIENT = 0x02 +}; + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. + * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants + * an inodebit lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + *
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
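 *
 * Editorial sketch, not part of the original patch: the matrix above is what
 * lockmode_compat() below evaluates, assuming lck_compat_array is populated
 * with the LCK_COMPAT_* masks defined just after this comment (the LDLM core
 * is assumed to do that, e.g. in ldlm_lock.c):
 *
 *	lockmode_compat(LCK_PR, LCK_PR)   returns non-zero: two readers coexist
 *	lockmode_compat(LCK_PW, LCK_PR)   returns 0: a writer conflicts with readers
 *	lockmode_compat(LCK_NL, LCK_EX)   returns non-zero: NL is compatible with all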
+ */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern enum ldlm_mode lck_compat_array[]; + +static inline void lockmode_verify(enum ldlm_mode mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(enum ldlm_mode exist_mode, + enum ldlm_mode new_mode) +{ + return lck_compat_array[exist_mode] & new_mode; +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? + - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool proc directory. */ + struct proc_dir_entry *pl_proc_dir; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in in pool, both, client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in */ + atomic_t pl_granted; + /** Grant rate per T. */ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor. SLV on client is calculated as following: + * server_slv * lock_volume_factor. */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time64_t pl_recalc_time; + /** Recalculation period for pool. */ + time64_t pl_recalc_period; + /** Recalculation and shrink operations. 
*/ + struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; + + /* sysfs object */ + struct kobject pl_kobj; + struct completion pl_kobj_unregister; +}; + +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, + void *req_cookie, enum ldlm_mode mode, + __u64 flags, void *data); + +typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + * + * To ensure delayed LVB initialization, it is highly recommended to use the set + * of ldlm_[res_]lvbo_[init,update,fill]() functions. + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, + struct ptlrpc_request *r, + int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +enum ldlm_appetite { + LDLM_NAMESPACE_GREEDY = 1 << 0, + LDLM_NAMESPACE_MODEST = 1 << 1 +}; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; + /** + * Which res in the bucket should we start with the reclaim. + */ + int nsb_reclaim_start; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +enum ldlm_ns_type { + LDLM_NS_TYPE_UNKNOWN = 0, /**< invalid type */ + LDLM_NS_TYPE_MDC, /**< MDC namespace */ + LDLM_NS_TYPE_MDT, /**< MDT namespace */ + LDLM_NS_TYPE_OSC, /**< OSC namespace */ + LDLM_NS_TYPE_OST, /**< OST namespace */ + LDLM_NS_TYPE_MGC, /**< MGC namespace */ + LDLM_NS_TYPE_MGT, /**< MGT namespace */ +}; + +/** + * LDLM Namespace. + * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client. 
The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. + */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + enum ldlm_side ns_client; + + /** Resource hash table for namespace. */ + struct cfs_hash *ns_rs_hash; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /* namespace proc dir entry */ + struct proc_dir_entry *ns_proc_dir_entry; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. + */ + unsigned int ns_max_unused; + + /** Maximum allowed age (last used time) for locks in the LRU */ + ktime_t ns_max_age; + + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which the + * MDT will return an UPDATE lock along with a LOOKUP lock. + * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + unsigned int ns_ctime_age_limit; + + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + cfs_time_t ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. + * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + enum ldlm_appetite ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. 
Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + unsigned ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. + * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. + */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** + * Callback to check if a lock is good to be canceled by ELC or + * during recovery. + */ + ldlm_cancel_cbt ns_cancel; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1; + + /** + * Which bucket should we start with the lock reclaim. + */ + int ns_reclaim_start; + + struct kobject ns_kobj; /* sysfs object */ + struct completion ns_kobj_unregister; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. + */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_cbt arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); + +/** Work list for sending GL ASTs to multiple locks. 
*/ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; +}; + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ + ptlrpc_interpterer_t gl_interpret_reply; + void *gl_interpret_data; +}; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ +#define LDLM_GL_WORK_NOFREE 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. + */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + enum ldlm_mode lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +enum ldlm_cancel_flags { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel LDLM_FL_BL_AST locks in the same RPC */ +}; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + atomic_t blocking_refs; + __u32 pid; +}; + +union ldlm_policy_data { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +}; + +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM_GID_ANY is used to match any group id in ldlm_lock_match(). + */ +#define LDLM_GID_ANY ((__u64)-1) + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. 
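 *
 * Editorial sketch, not part of the original patch: a typical consumer
 * re-resolves the handle and drops the temporary reference when done, using
 * ldlm_handle2lock() and LDLM_LOCK_PUT() declared later in this header
 * (lockh here is a hypothetical struct lustre_handle held by the caller):
 *
 *	struct ldlm_lock *lock = ldlm_handle2lock(&lockh);
 *
 *	if (lock != NULL) {
 *		... use the lock ...
 *		LDLM_LOCK_PUT(lock);
 *	}
 *
 * ldlm_handle2lock() returns NULL when the lock no longer exists.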
+ */ + struct portals_handle l_handle; + /** + * Lock reference count. + * This is how many users have pointers to actual structure, so that + * we do not accidentally free lock structure that is in use. + */ + atomic_t l_refc; + /** + * Internal spinlock protects l_resource. We should hold this lock + * first before taking res_lock. + */ + spinlock_t l_lock; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted, waiting or converting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Tree node for ldlm_extent. + */ + struct ldlm_interval *l_tree_node; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + enum ldlm_mode l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + enum ldlm_mode l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. + * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. + */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + union ldlm_policy_data l_policy_data; + + /** + * Lock state flags. Protected by lr_lock. + * \see lustre_dlm_flags.h where the bits are defined. + */ + __u64 l_flags; + + /** + * Lock r/w usage counters. + * Protected by lr_lock. + */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Seconds. It will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send blocking AST. + */ + time64_t l_last_activity; + + /** + * Time, in nanoseconds, last used by e.g. being matched by lock match. + */ + ktime_t l_last_used; + + /** Originally requested extent for the extent lock. 
*/ + struct ldlm_extent l_req_extent; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + * May be vmalloc'd, so needs to be freed with OBD_FREE_LARGE(). + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_list for further processing. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + cfs_time_t l_callback_timeout; + + /** Local PID of process which created this lock. */ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. \see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. + * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +/** For uncommitted cross-MDT lock, store transno this lock belongs to */ +#define l_transno l_client_cookie + +/** For uncommitted cross-MDT lock, which is client lock, share with l_rk_ast + * which is for server. */ +#define l_slc_link l_rk_ast + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. 
+ * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. + * protected by ns_lock + */ + struct hlist_node lr_hash; + + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** Spinlock to protect locks under this resource. */ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** List of locks waiting to change their granted mode (converted) */ + struct list_head lr_converting; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + + /** + * Interval trees (only for extent locks) for all modes of this resource + */ + struct ldlm_interval_tree *lr_itree; + + union { + /** + * When the resource was considered as contended, + * used only on server side. */ + cfs_time_t lr_contention_time; + /** + * Associated inode, used only on client side. + */ + struct inode *lr_lvb_inode; + }; + + /** Type of locks this resource can hold. Only one type per resource. */ + enum ldlm_type lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + int lr_lvb_len; + struct mutex lr_lvb_mutex; + /** protected by lr_lock */ + void *lr_lvb_data; + /** is lvb initialized ? */ + bool lr_lvb_initialized; + + /** List of references to this resource. For debugging. */ + struct lu_ref lr_reference; +}; + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_rs_hash->hs_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + int rc = 0; + + if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_init == NULL || + res->lr_lvb_initialized) + return 0; + + mutex_lock(&res->lr_lvb_mutex); + /* Did we lose the race? 
*/ + if (res->lr_lvb_initialized) { + mutex_unlock(&res->lr_lvb_mutex); + return 0; + } + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CDEBUG(D_DLMTRACE, "lvbo_init failed for resource : rc = %d\n", + rc); + if (res->lr_lvb_data != NULL) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + } else { + res->lr_lvb_initialized = true; + } + mutex_unlock(&res->lr_lvb_mutex); + return rc; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + /* init lvb now if not already */ + rc = ldlm_lvbo_init(lock->l_resource); + if (rc < 0) { + CERROR("lock %p: delayed lvb init failed (rc %d)", + lock, rc); + return rc; + } + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + enum ldlm_type ei_type; /** Type of the lock being enqueued. */ + enum ldlm_mode ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_local_bl; /** blocking local lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ + void *ei_namespace; /** lock namespace **/ + unsigned int ei_enq_slave:1, /** whether enqueue slave stripes */ + ei_nonblock:1; /** non block enqueue */ +}; + +#define ei_res_id ei_cb_gl + +extern struct obd_ops ldlm_obd_ops; + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern const char *ldlm_it2str(enum ldlm_intent_flags it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#ifdef LIBCFS_DEBUG +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while(0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static struct cfs_debug_limit_state _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) 
LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) +#else /* !LIBCFS_DEBUG */ +# define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) ((void)0) +# define LDLM_DEBUG(lock, fmt, a...) ((void)0) +# define LDLM_ERROR(lock, fmt, a...) ((void)0) +#endif + +/* + * Three intentions can be used for the policy functions in + * ldlm_processing_policy. + * + * LDLM_PROCESS_RESCAN: + * + * It's used when policy functions are called from ldlm_reprocess_queue() to + * reprocess the wait & convert list and try to grant locks, blocking ASTs + * have already been sent in this situation, completion ASTs need be sent for + * the locks being granted. + * + * LDLM_PROCESS_ENQUEUE: + * + * It's used when policy functions are called from ldlm_lock_enqueue() to + * process the wait & convert list for handling an enqueue request, blocking + * ASTs have not been sent yet, so list of conflicting locks would be + * collected and ASTs sent. + * + * LDLM_PROCESS_RECOVERY: + * + * It's used when policy functions are called from ldlm_reprocess_queue() to + * reprocess the wait & convert list when recovery done. In case of blocking + * ASTs are lost before recovery, it needs not only to grant locks if + * available, but also send blocking ASTs to the locks doesn't have AST sent + * flag. Completion ASTs need be sent for the locks being granted. + */ +enum ldlm_process_intention { + LDLM_PROCESS_RESCAN = 0, + LDLM_PROCESS_ENQUEUE = 1, + LDLM_PROCESS_RECOVERY = 2, +}; + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. 
+ * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_prolong_args { + struct obd_export *lpa_export; + struct ldlm_res_id lpa_resid; + struct ldlm_extent lpa_extent; + enum ldlm_mode lpa_mode; + int lpa_timeout; + int lpa_locks_cnt; + int lpa_blocks_cnt; +}; +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg); +void ldlm_resource_prolong(struct ldlm_prolong_args *arg); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; +}; + +/* ldlm_lockd.c */ +#ifdef HAVE_SERVER_SUPPORT +/** \defgroup ldlm_srv_ast Server AST handlers + * These are AST handlers used by server code. + * Their property is that they are just preparing RPCs to be sent to clients. + * @{ + */ +int ldlm_server_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data); +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list); +/** @} ldlm_srv_ast */ + +/** \defgroup ldlm_handlers Server LDLM handlers + * These are handler functions that should be called by "frontends" such as + * MDT or OST to pass through LDLM requests to LDLM for handling + * @{ + */ +int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback, + ldlm_blocking_callback, ldlm_glimpse_callback); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_handle_convert(struct ptlrpc_request *req); +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req); +int ldlm_handle_cancel(struct ptlrpc_request *req); +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags); +/** @} ldlm_handlers */ + +void ldlm_revoke_export_locks(struct obd_export *exp); +unsigned int ldlm_bl_timeout(struct ldlm_lock *lock); +#endif +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +#ifdef HAVE_SERVER_SUPPORT +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); +#endif +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(const struct lustre_handle *lockh, 
void *data); + +/** + * Obtain a lock reference by its handle. + */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", current) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from request \a r + */ +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, int increase) +{ + int rc; + + /* delayed lvb init may be required */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + CERROR("delayed lvb init failed (rc %d)\n", rc); + return rc; + } + + if (ldlm_res_to_ns(res)->ns_lvbo && + ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { + return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req, + increase); + } + return 0; +} + +int ldlm_error2errno(enum ldlm_error error); +enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). + */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode); +int ldlm_lock_addref_try(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode); +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); +enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *, enum ldlm_type type, + union ldlm_policy_data *, enum ldlm_mode mode, + struct lustre_handle *, int unref); +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits); +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, + enum ldlm_mode new_mode, __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void 
ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, + enum ldlm_side client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); +int ldlm_proc_setup(void); +#ifdef CONFIG_PROC_FS +void ldlm_proc_cleanup(void); +#else +static inline void ldlm_proc_cleanup(void) {} +#endif + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + enum ldlm_type type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(enum ldlm_side client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +/* ldlm_request.c */ +int ldlm_expired_completion_wait(void *data); +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. 
+ * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + enum ldlm_type type, __u8 with_policy, + enum ldlm_mode mode, __u64 *flags, void *lvb, + __u32 lvb_len, + const struct lustre_handle *lockh, int rc); +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, + __u32 *flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, enum ldlm_cancel_flags flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags); +/** @} ldlm_cli_api */ + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. */ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. 
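 *
 * Editorial sketch, not part of the original patch: the lock_res()/unlock_res()
 * helpers just above are used around any walk of the resource lock queues,
 * which are protected by lr_lock, for example:
 *
 *	lock_res(res);
 *	check_res_locked(res);    (asserts lr_lock is held)
 *	... inspect res->lr_granted / res->lr_waiting ...
 *	unlock_res(res);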
*/ +static inline void check_res_locked(struct ldlm_resource *res) +{ + assert_spin_locked(&res->lr_lock); +} + +struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. + * @{ + */ +int ldlm_pools_recalc(enum ldlm_side client); +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +int ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start <= ex2->end && ex2->start <= ex1->end; +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start <= ex2->start && ex1->end >= ex2->end; +} + +#endif +/** @} LDLM */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h new file mode 100644 index 0000000000000..179cb71de3758 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -0,0 +1,407 @@ +/* -*- buffer-read-only: t -*- vi: set ro: + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ +/** + * \file lustre_dlm_flags.h + * The flags and collections of flags (masks) for \see struct ldlm_lock. + * + * \addtogroup LDLM Lustre Distributed Lock Manager + * @{ + * + * \name flags + * The flags and collections of flags (masks) for \see struct ldlm_lock. + * @{ + */ +#ifndef LDLM_ALL_FLAGS_MASK + +/** l_flags bits marked as "all_flags" bits */ +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL + +/** extent, mode, or resource changed */ +#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 +#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0) +#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0) +#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0) + +/** + * Server placed lock on granted list, or a recovering client wants the + * lock added to the granted list, no questions asked. 
*/ +#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1 +#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1) +#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1) +#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1) + +/** + * Server placed lock on conv list, or a recovering client wants the lock + * added to the conv list, no questions asked. */ +#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 +#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) +#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) +#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2) + +/** + * Server placed lock on wait list, or a recovering client wants the lock + * added to the wait list, no questions asked. */ +#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3 +#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3) +#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) +#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) + +/** blocking or cancel packet was queued for sending. */ +#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 +#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) +#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5) +#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5) + +/** + * Lock is being replayed. This could probably be implied by the fact that + * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */ +#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8 +#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8) +#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8) +#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8) + +/** Don't grant lock, just do intent. */ +#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9 +#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9) +#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9) +#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9) + +/** lock request has intent */ +#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12 +#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12) +#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12) +#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12) + +/** flock deadlock detected */ +#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL // bit 15 +#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 15) +#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG(( _l), 1ULL << 15) +#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15) + +/** discard (no writeback (PW locks) or page retention (PR locks)) on cancel */ +#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16 +#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16) +#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16) +#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16) + +/** Blocked by group lock - wait indefinitely */ +#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17 +#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17) +#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17) +#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17) + +/** + * Server told not to wait if blocked. For AGL, OST will not send glimpse + * callback. 
*/ +#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18 +#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18) +#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18) +#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18) + +/** return blocking lock */ +#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19 +#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19) +#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19) +#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19) + +/** match lock only */ +#define LDLM_FL_MATCH_LOCK 0x0000000000100000ULL // bit 20 + +/** + * Immediatelly cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This + * is for clients (like liblustre) that cannot be expected to reliably + * response to blocking AST. */ +#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23 +#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23) +#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23) +#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23) + +/** Flag whether a lock is enqueued from a distributed transaction, and the + * requesting lock mode is PW/EX, if so, it will check compatibility with COS + * locks, and different from original COS semantic, transactions from the same + * client is also treated as lock conflict. */ +#define LDLM_FL_COS_INCOMPAT 0x0000000001000000ULL /* bit 24 */ +#define ldlm_is_cos_incompat(_l) LDLM_TEST_FLAG((_l), 1ULL << 24) +#define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24) +#define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24) + +/** + * measure lock contention and return -EUSERS if locking contention is high */ +#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 +#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30) +#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30) +#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30) + +/** + * These are flags that are mapped into the flags and ASTs of blocking + * locks Add FL_DISCARD to blocking ASTs */ +#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31 +#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31) +#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31) +#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31) + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep emulation + * + race with upcoming bl_ast. */ +#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32 +#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32) +#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32) +#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32) + +/** + * Used while processing the unused list to know that we have already + * handled this lock and decided to skip it. 
*/ +#define LDLM_FL_SKIPPED 0x0000000200000000ULL // bit 33 +#define ldlm_is_skipped(_l) LDLM_TEST_FLAG(( _l), 1ULL << 33) +#define ldlm_set_skipped(_l) LDLM_SET_FLAG(( _l), 1ULL << 33) +#define ldlm_clear_skipped(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 33) + +/** this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34 +#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34) +#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34) +#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34) + +/** not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35 +#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35) +#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35) +#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35) + +/** cancellation callback already run */ +#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36 +#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36) +#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36) +#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36) + +/** whatever it might mean -- never transmitted? */ +#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37 +#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37) +#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37) +#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37) + +/** don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38 +#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38) +#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38) +#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38) + +/** lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39 +#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39) +#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 1ULL << 39) +#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39) + +/** local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40 +#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40) +#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40) +#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40) + +/** + * XXX FIXME: This is being added to b_size as a low-risk fix to the + * fact that the LVB filling happens _after_ the lock has been granted, + * so another thread can match it before the LVB has been updated. As a + * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and + * callers must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, + * which can be replaced with a LVB-aware wrapping function for OSC locks. + * That change is pretty high-risk, though, and would need a lot more + * testing. */ +#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41 +#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41) +#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41) +#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41) + +/** + * A lock contributes to the known minimum size (KMS) calculation until it + * has finished the part of its cancelation that performs write back on its + * dirty pages. It can remain on the granted list during this whole time. 
+ * Threads racing to update the KMS after performing their writeback need + * to know to exclude each other's locks from the calculation as they walk + * the granted list. */ +#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42 +#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42) +#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42) +#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42) + +/** completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43 +#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43) +#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43) +#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43) + +/** cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44 +#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44) +#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44) +#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44) + +/** + * optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45 +#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45) +#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45) +#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45) + +/** + * It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting locks + * to this client for the first operation, whereas the second operation + * has canceled this lock and is waiting for rpc_lock which is taken by + * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in + * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it. */ +#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46 +#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46) +#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46) +#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46) + +/** + * Set by ldlm_cancel_callback() when lock cache is dropped to let + * ldlm_callback_handler() return EINVAL to the server. It is used when + * ELC RPC is already prepared and is waiting for rpc_lock, too late to + * send a separate CANCEL RPC. */ +#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47 +#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47) +#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47) +#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47) + +/** + * Don't put lock into the LRU list, so that it is not canceled due + * to aging. Used by MGC locks, they are cancelled only at unmount or + * by callback. */ +#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48 +#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48) +#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48) +#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48) + +/** + * Set for locks that failed and where the server has been notified. + * + * Protected by lock and resource locks. 
*/ +#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49 +#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49) +#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49) +#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49) + +/** + * Set for locks that were removed from class hash table and will + * be destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. */ +#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50 +#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50) +#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50) +#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50) + +/** flag whether this is a server namespace lock */ +#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51 +#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51) +#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51) +#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51) + +/** + * It's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. */ +#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52 +#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52) +#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52) +#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52) + +/** + * It's set once we call ldlm_add_waiting_lock_res_locked() to start the + * lock-timeout timer and it will never be reset. + * + * Protected by lock and resource locks. */ +#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53 +#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53) +#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53) +#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53) + +/** Flag whether this is a server namespace lock. */ +#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54 +#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54) +#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54) +#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54) + +/** Flag whether this lock can be reused. Used by exclusive open. */ +#define LDLM_FL_EXCL 0x0080000000000000ULL // bit 55 +#define ldlm_is_excl(_l) LDLM_TEST_FLAG(( _l), 1ULL << 55) +#define ldlm_set_excl(_l) LDLM_SET_FLAG(( _l), 1ULL << 55) +#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) + +/** Flag whether a lock is found on server for re-sent RPC. */ +#define LDLM_FL_RESENT 0x0100000000000000ULL // bit 56 + +/** Flag whether Commit-on-Sharing is enabled, if LDLM_FL_COS_INCOMPAT is set + * this flag may not be set because once the former is set this flag won't be + * checked, and for cross-MDT lock COS_INCOMPAT is always set but ast handle is + * in ldlm context which doesn't know whether COS is enabled or not. 
*/ +#define LDLM_FL_COS_ENABLED 0x0200000000000000ULL /* bit 57 */ +#define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57) +#define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57) + +/** l_flags bits marked as "ast" bits */ +#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ + LDLM_FL_AST_DISCARD_DATA) + +/** l_flags bits marked as "blocked" bits */ +#define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ + LDLM_FL_BLOCK_CONV |\ + LDLM_FL_BLOCK_WAIT) + +/** l_flags bits marked as "gone" bits */ +#define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ + LDLM_FL_FAILED) + +/** l_flags bits marked as "inherit" bits */ +/* Flags inherited from wire on enqueue/reply between client/server. */ +/* NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. */ +/* TEST_LOCK flag to not let TEST lock to be granted. */ +#define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ + LDLM_FL_NO_TIMEOUT |\ + LDLM_FL_TEST_LOCK) + +/** flags returned in @flags parameter on ldlm_lock_enqueue, + * to be re-constructed on re-send */ +#define LDLM_FL_SRV_ENQ_MASK (LDLM_FL_LOCK_CHANGED |\ + LDLM_FL_BLOCKED_MASK |\ + LDLM_FL_NO_TIMEOUT) + +/** test for ldlm_lock flag bit set */ +#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) + +/** multi-bit test: are any of mask bits set? */ +#define LDLM_HAVE_MASK(_l, _m) (((_l)->l_flags & LDLM_FL_##_m##_MASK) != 0) + +/** set a ldlm_lock flag bit */ +#define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b)) + +/** clear a ldlm_lock flag bit */ +#define LDLM_CLEAR_FLAG(_l, _b) ((_l)->l_flags &= ~(_b)) + +/** @} subgroup */ +/** @} group */ +#endif /* LDLM_ALL_FLAGS_MASK */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h new file mode 100644 index 0000000000000..3061be1bc6124 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
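To make the flag machinery in the lustre_dlm_flags.h hunk above concrete: every ldlm_is_*/ldlm_set_*/ldlm_clear_* wrapper expands to LDLM_TEST_FLAG/LDLM_SET_FLAG/LDLM_CLEAR_FLAG on lock->l_flags, and LDLM_HAVE_MASK tests a whole named group at once. A minimal caller sketch, assuming only what those macros define (the function itself is hypothetical, not part of the patch):

	static inline bool demo_lock_is_unusable(struct ldlm_lock *lock)
	{
		/* any of LDLM_FL_BLOCK_{GRANTED,CONV,WAIT} set? */
		if (LDLM_HAVE_MASK(lock, BLOCKED))
			return true;

		/* LDLM_FL_GONE_MASK covers DESTROYED and FAILED locks */
		return LDLM_HAVE_MASK(lock, GONE);
	}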
+ */ + +#ifndef _LUSTRE_EACL_H +#define _LUSTRE_EACL_H + +/** \defgroup eacl eacl + * + * @{ + */ + +#ifdef CONFIG_FS_POSIX_ACL +# include +# include + +typedef struct { + __u16 e_tag; + __u16 e_perm; + __u32 e_id; + __u32 e_stat; +} ext_acl_xattr_entry; + +typedef struct { + __u32 a_count; + ext_acl_xattr_entry a_entries[0]; +} ext_acl_xattr_header; + +#define CFS_ACL_XATTR_SIZE(count, prefix) \ + (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry)) + +#define CFS_ACL_XATTR_COUNT(size, prefix) \ + (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry)) + +#ifdef HAVE_SERVER_SUPPORT +struct lu_ucred; +struct lu_attr; +struct lustre_idmap_table; + +#ifdef HAVE_STRUCT_POSIX_ACL_XATTR +# define posix_acl_xattr_header struct posix_acl_xattr_header +# define posix_acl_xattr_entry struct posix_acl_xattr_entry +#endif + +extern int lustre_posix_acl_permission(struct lu_ucred *mu, + const struct lu_attr *la, int want, + posix_acl_xattr_entry *entry, + int count); +extern int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, + __u32 mode, int count); +extern int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, + __u32 *pmode, int count); +extern int lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, + int count); +#endif /* HAVE_SERVER_SUPPORT */ +#endif /* CONFIG_FS_POSIX_ACL */ + +/** @} eacl */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h new file mode 100644 index 0000000000000..8552d3d1c00a7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -0,0 +1,435 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
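The two sizing macros in the lustre_eacl.h hunk above are plain arithmetic over the typedefs next to them; for example, sizing a buffer for three extended-ACL entries works out as follows (a worked expansion, not code from the patch):

	/* CFS_ACL_XATTR_SIZE(3, ext_acl_xattr) expands to: */
	sizeof(ext_acl_xattr_header) + 3 * sizeof(ext_acl_xattr_entry)

	/* and CFS_ACL_XATTR_COUNT() inverts it, recovering the entry
	 * count of 3 from that byte count */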
+ */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include +#include +#include + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects ted_lcd, ted_reply_* and + * ted_release_* fields below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; + + /** + * ted_nodemap_lock is used to ensure that the nodemap is not destroyed + * between the time that ted_nodemap is checked for NULL, and a + * reference is taken. Modifications to ted_nodemap require that the + * active_config_lock and the nodemap(s)'s nm_member_list_lock be + * taken, as well as ted_nodemap_lock, so the export can be properly + * added to or removed from the nodemap's member list. When an export + * is added to a nodemap, a reference on that nodemap must be taken. + * That reference can be put only after ted_nodemap no longer refers to + * it. + */ + spinlock_t ted_nodemap_lock; + struct lu_nodemap *ted_nodemap; + struct list_head ted_nodemap_member; + + /** last version of nodemap config sent to client */ + __u64 ted_nodemap_version; + + /* Every reply data fields below are + * protected by ted_lcd_lock */ + /** List of reply data */ + struct list_head ted_reply_list; + int ted_reply_cnt; + /** Reply data with highest transno is retained */ + struct tg_reply_data *ted_reply_last; + /* Statistics */ + int ted_reply_max; /* high water mark */ + int ted_release_xid; + int ted_release_tag; + /* grants */ + long ted_dirty; /* in bytes */ + long ted_grant; /* in bytes */ + long ted_pending; /* bytes just being written */ + __u8 ted_pagebits; /* log2 of client page size */ +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + spinlock_t fed_lock; /**< protects fed_mod_list */ + __u64 fed_lastid_gen; + struct list_head fed_mod_list; /* files being modified */ + /* count of SOFT_SYNC RPCs, which will be reset after + * ofd_soft_sync_limit number of RPCs, and trigger a sync. */ + atomic_t fed_soft_sync_count; + int fed_mod_count;/* items in fed_writing list */ + __u32 fed_group; +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. 
+ * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + lnet_nid_t nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while(0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while(0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + atomic_t exp_refcount; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. + * The sum of them should be less than exp_refcount by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processes */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + /* Unlinked export list */ + struct list_head exp_stale_list; + struct hlist_node exp_uuid_hash; /** uuid-export hash*/ + struct hlist_node exp_nid_hash; /** nid-export hash */ + struct hlist_node exp_gen_hash; /** last_rcvd clt gen hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protect its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + /** Active connetion */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last successful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + struct cfs_hash *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. 
+ */ + struct cfs_hash *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + cfs_time_t exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + exp_libclient:1, /* liblustre client? */ + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. */ + exp_replay_done:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + struct list_head exp_reg_rpcs; /* RPC being handled */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; + + struct adaptive_timeout exp_bl_lock_at; + + /** highest XID received by export client that has no + * unreceived lower-numbered XID + */ + __u64 exp_last_xid; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_expired(struct obd_export *exp, cfs_duration_t age) +{ + LASSERT(exp->exp_delayed); + return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age), + cfs_time_current_sec()); +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != 
NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_disp_stripe(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; +} + +static inline __u64 exp_connect_ibits(struct obd_export *exp) +{ + struct obd_connect_data *ocd; + + ocd = &exp->exp_connect_data; + return ocd->ocd_ibits_known; +} + +static inline int exp_connect_large_acl(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); +extern struct obd_device *class_conn2obd(struct lustre_handle *conn); + +#define KKUC_CT_DATA_MAGIC 0x092013cea +struct kkuc_ct_data { + __u32 kcd_magic; + __u32 kcd_archive; +}; + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h new file mode 100644 index 0000000000000..8759b31f91674 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -0,0 +1,952 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LUSTRE_FID_H +#define __LUSTRE_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. + * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. 
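The IDIF arithmetic described above can be written out literally. The sketch below transcribes the two formulas from this comment into code; the function name is hypothetical, and Lustre's own helpers for this purpose are fid_idif_seq() and related functions used later in this header:

	static inline void demo_idif_pack(struct lu_fid *fid,
					  __u32 ost_index, __u64 objid)
	{
		/* SEQ = 2^32 | (ost_index << 16) | (top 16 bits of objid) */
		fid->f_seq = (1ULL << 32) | ((__u64)ost_index << 16) |
			     ((objid >> 32) & 0xffff);
		/* OID = low 32 bits of objid */
		fid->f_oid = objid & 0xffffffff;
		fid->f_ver = 0;
	}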
Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. + * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure IDIFs there is only a single resource name for any + * object in the DLM. The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. + */ + +#include +#include +#include +#include + +struct lu_env; +struct lu_site; +struct lu_context; +struct obd_device; +struct obd_export; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_LPF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; +extern const struct lu_fid LU_BACKEND_LPF_FID; + +enum { + /* + * This is how may metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. 
+ */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +enum { + /** 2^6 FIDs for OI containers */ + OSD_OI_FID_OID_BITS = 6, + /** reserve enough FIDs in case we want more in the future */ + OSD_OI_FID_OID_BITS_MAX = 10, +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + OSD_LPF_OID = 19UL, + REPLY_DATA_OID = 21UL, + ACCT_PROJECT_OID = 22UL, + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, + /* This definition is obsolete + * SLAVE_LLOG_CATALOGS_OID = 4124UL, + */ + BATCHID_COMMITTED_OID = 4125UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == FID_OID_ROOT)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_oid_is_quota(const struct lu_fid *fid) +{ + switch (fid_oid(fid)) { + case ACCT_USER_OID: + case ACCT_GROUP_OID: + case ACCT_PROJECT_OID: + return 1; + default: + return 0; + } +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid_is_quota(fid); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_name_llog(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LLOG_NAME; +} + +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. 
*/ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_seq_is_dot(seq); +} + +static inline int fid_seq_in_fldb(__u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +static inline void ost_layout_cpu_to_le(struct ost_layout *dst, + const struct ost_layout *src) +{ + dst->ol_stripe_size = __cpu_to_le32(src->ol_stripe_size); + dst->ol_stripe_count = __cpu_to_le32(src->ol_stripe_count); + dst->ol_comp_start = __cpu_to_le64(src->ol_comp_start); + dst->ol_comp_end = __cpu_to_le64(src->ol_comp_end); + dst->ol_comp_id = __cpu_to_le32(src->ol_comp_id); +} + +static inline void ost_layout_le_to_cpu(struct ost_layout *dst, + const struct ost_layout *src) +{ + dst->ol_stripe_size = __le32_to_cpu(src->ol_stripe_size); + dst->ol_stripe_count = __le32_to_cpu(src->ol_stripe_count); + dst->ol_comp_start = __le64_to_cpu(src->ol_comp_start); + dst->ol_comp_end = __le64_to_cpu(src->ol_comp_end); + dst->ol_comp_id = __le32_to_cpu(src->ol_comp_id); +} + +/* Both filter_fid_*cpu* functions not currently used */ +static inline void filter_fid_cpu_to_le(struct filter_fid *dst, + const struct filter_fid *src, int size) +{ + fid_cpu_to_le(&dst->ff_parent, &src->ff_parent); + + if (size < sizeof(struct filter_fid)) + memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); + else + ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout); + + /* XXX: Add more if filter_fid is enlarged in the future. */ +} + +static inline void filter_fid_le_to_cpu(struct filter_fid *dst, + const struct filter_fid *src, int size) +{ + fid_le_to_cpu(&dst->ff_parent, &src->ff_parent); + + if (size < sizeof(struct filter_fid)) + memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); + else + ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout); + + /* XXX: Add more if filter_fid is enlarged in the future. */ +} + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq, __u32 ost_idx) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, ost_idx); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), "%#llx\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +static inline bool fid_is_md_operative(const struct lu_fid *fid) +{ + return fid_is_mdt0(fid) || fid_is_igif(fid) || + fid_is_norm(fid) || fid_is_root(fid); +} + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of allowed for allocation sequeces. When using lu_client_seq on + * clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. + */ + struct lu_seq_range lcs_space; + + /* Seq related proc */ + struct proc_dir_entry *lcs_proc_dir; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lcs_name[80]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH. 
+ */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; + + /* wait queue for fid allocation and update indicator */ + wait_queue_head_t lcs_waitq; + int lcs_update; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related proc */ + struct proc_dir_entry *lss_proc_dir; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lss_name[80]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. + */ + struct seq_server_site *lss_site; +}; + +struct seq_server_site { + struct lu_site *ss_lu; + /** + * mds number of this site. + */ + u32 ss_node_id; + /** + * Fid location database + */ + struct lu_server_fld *ss_server_fld; + struct lu_client_fld *ss_client_fld; + + /** + * Server Seq Manager + */ + struct lu_server_seq *ss_server_seq; + + /** + * Controller Seq Manager + */ + struct lu_server_seq *ss_control_seq; + struct obd_export *ss_control_exp; + + /** + * Client Seq Manager + */ + struct lu_client_seq *ss_client_seq; +}; + +/* Server methods */ + +int seq_server_init(const struct lu_env *env, + struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(const struct lu_env *env, + struct lu_server_seq *seq, + struct lu_client_seq *cli); + +int seq_server_check_and_alloc_super(const struct lu_env *env, + struct lu_server_seq *seq); +/* Client methods */ +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + u64 *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +enum lu_cli_type; +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); 
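The client-side sequence interface above is what ultimately hands out new FIDs. A minimal sketch of the expected call pattern, assuming the lu_env, export and lu_client_seq have already been set up elsewhere (the wrapper name is hypothetical, not part of the patch):

	static int demo_alloc_new_fid(const struct lu_env *env,
				      struct lu_client_seq *seq,
				      struct lu_fid *fid)
	{
		int rc;

		/* hands out the next OID from the current sequence held in
		 * seq->lcs_fid; when lcs_width is exhausted, a new sequence
		 * is requested from the server through seq->lcs_exp */
		rc = seq_client_alloc_fid(env, seq, fid);
		if (rc < 0)
			return rc;

		return 0;
	}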
+int client_fid_fini(struct obd_device *obd); + +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. + * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline void +fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) +{ + memset(res, 0, sizeof(*res)); + res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); + res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); +} + +/* + * Return true if resource is for object identified by FID. + */ +static inline int fid_res_name_eq(const struct lu_fid *fid, + const struct ldlm_res_id *res) +{ + return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && + res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); +} + +/* + * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). + */ +static inline void +fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) +{ + fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); + fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + LASSERT(fid_res_name_eq(fid, res)); +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. + */ +static inline void +fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, + union lquota_id *qid, + const struct ldlm_res_id *res) +{ + fid_extract_from_res_name(glb_fid, res); + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +static inline void +fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(fid, res); + res->name[LUSTRE_RES_ID_HSH_OFF] = hash; +} + +/** + * Build DLM resource name from object id & seq, which will be removed + * finally, when we replace ost_id with FID in data stack. + * + * Currently, resid from the old client, whose res[0] = object_id, + * res[1] = object_seq, is just oposite with Metatdata + * resid, where, res[0] = fid->f_seq, res[1] = fid->f_oid. + * To unifiy the resid identification, we will reverse the data + * resid to keep it same with Metadata resid, i.e. + * + * For resid from the old client, + * res[0] = objid, res[1] = 0, still keep the original order, + * for compatiblity. + * + * For new resid + * res will be built from normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. + */ +static inline void ostid_build_res_name(const struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name(&oi->oi_fid, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group. 
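fid_build_reg_res_name() above packs the FID into the first two 64-bit slots of the LDLM resource name, with the version sharing a slot with the OID (the extraction helper above shows the layout). A worked illustration with assumed example values, editorial rather than from the patch:

	struct lu_fid fid = { .f_seq = 0x200000401ULL, .f_oid = 5, .f_ver = 0 };
	struct ldlm_res_id res;

	fid_build_reg_res_name(&fid, &res);
	/* res.name[LUSTRE_RES_ID_SEQ_OFF]     == 0x200000401  (f_seq)
	 * res.name[LUSTRE_RES_ID_VER_OID_OFF] == 5             (f_ver << 32 | f_oid)
	 * the remaining slots are zeroed by the memset() in the helper */
	LASSERT(fid_res_name_eq(&fid, &res));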
+ */ +static inline bool ostid_res_name_eq(const struct ost_id *oi, + const struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort, probably the + * correct way would be turn them into the FID and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/** + * Note: we need check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id. + */ +static inline int ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(oi->oi.oi_seq)) { + if (oid >= IDIF_MAX_OID) + return -E2BIG; + oi->oi.oi_id = oid; + } else if (fid_is_idif(&oi->oi_fid)) { + if (oid >= IDIF_MAX_OID) + return -E2BIG; + oi->oi_fid.f_seq = fid_idif_seq(oid, + fid_idif_ost_idx(&oi->oi_fid)); + oi->oi_fid.f_oid = oid; + oi->oi_fid.f_ver = oid >> 48; + } else { + if (oid >= OBIF_MAX_OID) + return -E2BIG; + oi->oi_fid.f_oid = oid; + } + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + int rc = 0; + + if (fid_seq_is_igif(fid->f_seq)) + return -EBADF; + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + rc = ostid_set_id(ostid, fid_idif_id(fid_seq(fid), + fid_oid(fid), fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return rc; +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ + if (fid_to_ostid(fid, &oi) != 0) + return; + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name, + int ost_idx) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + + memset(&oi, 0, sizeof(oi)); + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + if (ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF])) { + CERROR("Bad %llu to set " DOSTID "\n", + name->name[LUSTRE_RES_ID_SEQ_OFF], POSTID(&oi)); + } + ostid_to_fid(fid, &oi, ost_idx); + } else { + /* new resid */ + fid_extract_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Suppose LUSTRE_SEQ_MAX_WIDTH less than (1 << 24) which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. 
+ */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + return ino ?: fid_oid(fid); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with same id and different versions will belong to same + * collisions list. */ + return hash_long(fid_flatten(f), bits); +} + +/** + * map fid to 32 bit value for ino on 32bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + return ino; + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + return ino ?: fid_oid(fid); +} + +static inline int +lu_fid_diff(const struct lu_fid *fid1, const struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +static inline int fid_set_id(struct lu_fid *fid, u64 oid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + if (oid >= IDIF_MAX_OID) { + CERROR("Too large OID %#llx to set IDIF "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_seq = fid_idif_seq(oid, fid_idif_ost_idx(fid)); + fid->f_oid = oid; + fid->f_ver = oid >> 48; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Too large OID %#llx to set REG "DFID"\n", + (unsigned long long)oid, PFID(fid)); + return -EBADF; + } + fid->f_oid = oid; + } + return 0; +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void +range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void +range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void +range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void +range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = 
be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +static inline void range_array_cpu_to_le(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + for (i = 0; i < src->lsra_count; i++) + range_cpu_to_le(&dst->lsra_lsr[i], &src->lsra_lsr[i]); + + dst->lsra_count = cpu_to_le32(src->lsra_count); +} + +static inline void range_array_le_to_cpu(struct lu_seq_range_array *dst, + const struct lu_seq_range_array *src) +{ + __u32 i; + + dst->lsra_count = le32_to_cpu(src->lsra_count); + for (i = 0; i < dst->lsra_count; i++) + range_le_to_cpu(&dst->lsra_lsr[i], &src->lsra_lsr[i]); +} + +/** @} fid */ + +#endif /* __LUSTRE_FID_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h new file mode 100644 index 0000000000000..2f39962f8fb5e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -0,0 +1,196 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include +#include +#include + +struct lu_env; +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; +struct thandle; +struct dt_device; +struct dt_object; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir proc entry. */ + struct proc_dir_entry *lsf_proc_dir; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ + char lsf_name[80]; + + int (*lsf_seq_lookup)(const struct lu_env *env, + struct lu_server_fld *fld, u64 seq, + struct lu_seq_range *range); + + /** + * Just reformatted or upgraded, and this flag is being + * used to check whether the local FLDB is needs to be + * synced with global FLDB(in MDT0), and it is only needed + * if the MDT is upgraded from < 2.6 to 2.6, i.e. 
when the + * local FLDB is being invited */ + unsigned int lsf_new:1; + +}; + +struct lu_client_fld { + /** + * Client side proc entry. */ + struct proc_dir_entry *lcf_proc_dir; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld proc entry name. */ + char lcf_name[80]; +}; + +/* Server methods */ +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int type); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range, + struct thandle *th); + +int fld_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *add_range, + struct thandle *th); + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range); + +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, + u64 seq, struct lu_seq_range *range); + +int fld_update_from_controller(const struct lu_env *env, + struct lu_server_fld *fld); + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, u64 seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_proc_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h new file mode 100644 index 0000000000000..7c22d985af5a4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -0,0 +1,60 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_pinger_force(struct obd_import *imp); +/** @} ha */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_handles.h b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h new file mode 100644 index 0000000000000..16917caccdb7c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_handles.h @@ -0,0 +1,88 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include +#include +#include + +struct portals_handle_ops { + void (*hop_addref)(void *object); + void (*hop_free)(void *object, int size); +}; + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. */ +struct portals_handle { + struct list_head h_link; + __u64 h_cookie; + const void *h_owner; + struct portals_handle_ops *h_ops; + + /* newly added fields to handle the RCU issue. 
-jxiong */ + struct rcu_head h_rcu; + spinlock_t h_lock; + unsigned int h_size:31; + unsigned int h_in:1; +}; + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, + struct portals_handle_ops *ops); +void class_handle_unhash(struct portals_handle *); +void class_handle_hash_back(struct portals_handle *); +void *class_handle2object(__u64 cookie, const void *owner); +void class_handle_free_cb(struct rcu_head *rcu); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h new file mode 100644 index 0000000000000..70d647d8a15f3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -0,0 +1,69 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_IDMAP_H +#define _LUSTRE_IDMAP_H + +/** \defgroup idmap idmap + * + * @{ + */ + +#include + +#ifdef HAVE_GROUP_INFO_GID + +#define CFS_GROUP_AT(gi, i) ((gi)->gid[(i)]) + +#else /* !HAVE_GROUP_INFO_GID */ + +#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) + +#define CFS_GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK]) + +#endif /* HAVE_GROUP_INFO_GID */ + +struct lu_ucred; + +extern void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist); +extern void lustre_groups_sort(struct group_info *group_info); +extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp); + +/** @} idmap */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h new file mode 100644 index 0000000000000..1b44d32393139 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -0,0 +1,390 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. + * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include +#include + + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time64_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time64_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + +enum lustre_at_flags { + LATF_SKIP = 0x0, + LATF_STATS = 0x1, +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time64_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, +}; + +/** Returns test string representation of numeric import state \a state */ +static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static char* import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", + }; + + LASSERT (state <= LUSTRE_IMP_EVICTED); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct 
obd_uuid oic_uuid;
+ /**
+ * Time (64 bit jiffies) of last connection attempt on this connection
+ */
+ __u64 oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+ enum lustre_imp_state ish_state;
+ time64_t ish_time;
+};
+
+/**
+ * Definition of PortalRPC import structure.
+ * Imports represent the client-side view of a remote target.
+ */
+struct obd_import {
+ /** Local handle (== id) for this import. */
+ struct portals_handle imp_handle;
+ /** Reference counter */
+ atomic_t imp_refcount;
+ struct lustre_handle imp_dlm_handle; /* client's ldlm export */
+ /** Currently active connection */
+ struct ptlrpc_connection *imp_connection;
+ /** PortalRPC client structure for this import */
+ struct ptlrpc_client *imp_client;
+ /** List element for linking into pinger chain */
+ struct list_head imp_pinger_chain;
+ /** List element for linking into chain for destruction */
+ struct list_head imp_zombie_chain;
+
+ /**
+ * Lists of requests that are retained for replay, waiting for a reply,
+ * or waiting for recovery to complete, respectively.
+ * @{
+ */
+ struct list_head imp_replay_list;
+ struct list_head imp_sending_list;
+ struct list_head imp_delayed_list;
+ /** @} */
+
+ /**
+ * List of requests that are retained for committed open replay. Once
+ * open is committed, open replay request will be moved from the
+ * imp_replay_list into the imp_committed_list.
+ * The imp_replay_cursor is for accelerating searching during replay.
+ * @{
+ */
+ struct list_head imp_committed_list;
+ struct list_head *imp_replay_cursor;
+ /** @} */
+
+ /** List of not replied requests */
+ struct list_head imp_unreplied_list;
+ /** Known maximal replied XID */
+ __u64 imp_known_replied_xid;
+
+ /** obd device for this import */
+ struct obd_device *imp_obd;
+
+ /**
+ * some security-related fields
+ * @{
+ */
+ struct ptlrpc_sec *imp_sec;
+ struct mutex imp_sec_mutex;
+ time64_t imp_sec_expire;
+ pid_t imp_sec_refpid;
+ /** @} */
+
+ /** Wait queue for those who need to wait for recovery completion */
+ wait_queue_head_t imp_recovery_waitq;
+
+ /** Number of requests currently in-flight */
+ atomic_t imp_inflight;
+ /** Number of requests currently unregistering */
+ atomic_t imp_unregistering;
+ /** Number of replay requests inflight */
+ atomic_t imp_replay_inflight;
+ /** Number of currently happening import invalidations */
+ atomic_t imp_inval_count;
+ /** Number of request timeouts */
+ atomic_t imp_timeouts;
+ /** Current import state */
+ enum lustre_imp_state imp_state;
+ /** Last replay state */
+ enum lustre_imp_state imp_replay_state;
+ /** History of import states */
+ struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
+ int imp_state_hist_idx;
+ /** Current import generation.
Incremented on every reconnect */
+ int imp_generation;
+ /** Incremented every time we send reconnection request */
+ __u32 imp_conn_cnt;
+ /**
+ * \see ptlrpc_free_committed remembers imp_generation value here
+ * after a check to save on unnecessary replay list iterations
+ */
+ int imp_last_generation_checked;
+ /** Last transno we replayed */
+ __u64 imp_last_replay_transno;
+ /** Last transno committed on remote side */
+ __u64 imp_peer_committed_transno;
+ /**
+ * \see ptlrpc_free_committed remembers last_transno since its last
+ * check here and if last_transno did not change since last run of
+ * ptlrpc_free_committed and import generation is the same, we can
+ * skip looking for requests to remove from replay list as optimisation
+ */
+ __u64 imp_last_transno_checked;
+ /**
+ * Remote export handle. This is how remote side knows what export
+ * we are talking to. Filled from response to connect request
+ */
+ struct lustre_handle imp_remote_handle;
+ /** When to perform next ping. time in jiffies. */
+ cfs_time_t imp_next_ping;
+ /** When we last successfully connected. time in 64bit jiffies */
+ __u64 imp_last_success_conn;
+
+ /** List of all possible connections for import. */
+ struct list_head imp_conn_list;
+ /**
+ * Current connection. \a imp_connection is imp_conn_current->oic_conn
+ */
+ struct obd_import_conn *imp_conn_current;
+
+ /** Protects flags, level, generation, conn_cnt, *_list */
+ spinlock_t imp_lock;
+
+ /* flags */
+ unsigned long imp_no_timeout:1, /* timeouts are disabled */
+ imp_invalid:1, /* evicted */
+ /* administratively disabled */
+ imp_deactive:1,
+ /* try to recover the import */
+ imp_replayable:1,
+ /* don't run recovery (timeout instead) */
+ imp_dlm_fake:1,
+ /* use 1/2 timeout on MDS' OSCs */
+ imp_server_timeout:1,
+ /* VBR: imp in delayed recovery */
+ imp_delayed_recovery:1,
+ /* VBR: if gap was found then no lock replays
+ */
+ imp_no_lock_replay:1,
+ /* recovery by versions failed */
+ imp_vbr_failed:1,
+ /* force an immediate ping */
+ imp_force_verify:1,
+ /* force a scheduled ping */
+ imp_force_next_verify:1,
+ /* pingable */
+ imp_pingable:1,
+ /* resend for replay */
+ imp_resend_replay:1,
+ /* disable normal recovery, for test only.
*/ + imp_no_pinger_recover:1, +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* need IR MNE swab */ + imp_need_mne_swab:1, +#endif + /* import must be reconnected instead of + * chouse new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1, + /* connected but not FULL yet */ + imp_connected:1; + __u32 imp_connect_op; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; + + __u32 imp_msg_magic; + /* adjusted based on server capability */ + __u32 imp_msghdr_flags; + + /* adaptive timeout data */ + struct imp_at imp_at; + time64_t imp_last_reply_time; /* for health check */ +}; + +/* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return (val + (val >> 2) + 5); +} + +static inline unsigned int at_timeout2est(unsigned int val) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(val); + return (max((val << 2) / 5, 5U) - 4); +} + +static inline void at_reset_nolock(struct adaptive_timeout *at, int val) +{ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = ktime_get_real_seconds(); +} + +static inline void at_reset(struct adaptive_timeout *at, int val) +{ + spin_lock(&at->at_lock); + at_reset_nolock(at, val); + spin_unlock(&at->at_lock); +} + +static inline void at_init(struct adaptive_timeout *at, int val, int flags) { + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); +} + +static inline void at_reinit(struct adaptive_timeout *at, int val, int flags) +{ + spin_lock(&at->at_lock); + at->at_binstart = 0; + memset(at->at_hist, 0, sizeof(at->at_hist)); + at->at_flags = flags; + at_reset_nolock(at, val); + spin_unlock(&at->at_lock); +} + +extern unsigned int at_min; +static inline int at_get(struct adaptive_timeout *at) { + return (at->at_current > at_min) ? at->at_current : at_min; +} +int at_measured(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); +extern struct obd_import *class_conn2cliimp(struct lustre_handle *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_intent.h b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h new file mode 100644 index 0000000000000..76dcd8878985a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_intent.h @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ + +struct lookup_intent { + int it_op; + int it_create_mode; + __u64 it_flags; + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + struct ptlrpc_request *it_request; + unsigned int it_lock_set:1; +}; + +static inline int it_disposition(const struct lookup_intent *it, int flag) +{ + return it->it_disposition & flag; +} + +static inline void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition |= flag; +} + +static inline void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition &= ~flag; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h new file mode 100644 index 0000000000000..4fc76566501ba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. 
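+ *
+ * A minimal usage sketch (illustrative only, using the prototypes declared
+ * below): a kernel-side user registers a reader's file handle with a
+ * broadcast group via libcfs_kkuc_group_add(fp, uuid, uid, group, data,
+ * data_len), later pushes a payload to every registered reader with
+ * libcfs_kkuc_group_put(uuid, group, payload), and finally drops the
+ * registration with libcfs_kkuc_group_rem(uuid, uid, group).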
+ */ + +#ifndef __LUSTRE_KERNELCOMM_H__ +#define __LUSTRE_KERNELCOMM_H__ + +/* For declarations shared with userspace */ +#include + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); + +/* Kernel methods */ +void libcfs_kkuc_init(void); +int libcfs_kkuc_msg_put(struct file *fp, void *payload); +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *data); +int libcfs_kkuc_group_add(struct file *fp, const struct obd_uuid *uuid, int uid, + int group, void *data, size_t data_len); +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group); +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg); + +#endif /* __LUSTRE_KERNELCOMM_H__ */ + diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h new file mode 100644 index 0000000000000..37f6ee1de49eb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -0,0 +1,130 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. + */ +/* + * lustre/include/lustre_lfsck.h + * + * Lustre LFSCK exported functions. 
+ * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_H +# define _LUSTRE_LFSCK_H + +#include +#include +#include +#include + +struct lfsck_start_param { + struct lfsck_start *lsp_start; + __u32 lsp_index; + unsigned int lsp_index_valid:1; +}; + +/* For LE_PAIRS_VERIFY returned status */ +enum lfsck_pv_status { + LPVS_INIT = 0, + LPVS_INCONSISTENT = 1, + LPVS_INCONSISTENT_TOFIX = 2, +}; + +enum lfsck_events_local { + LEL_FID_ACCESSED = 1, + LEL_PAIRS_VERIFY_LOCAL = 2, +}; + +struct lfsck_req_local { + __u32 lrl_event; + __u32 lrl_status; + __u16 lrl_active; + __u16 lrl_padding0; + __u32 lrl_padding1; + struct lu_fid lrl_fid; + struct filter_fid lrl_ff_client; + struct filter_fid lrl_ff_local; +}; + +struct lfsck_layout_dangling_key { + struct lu_fid lldk_fid; + __u32 lldk_comp_id; + __u32 lldk_ea_off; +}; + +typedef int (*lfsck_out_notify)(const struct lu_env *env, void *data, + enum lfsck_events event); + +int lfsck_register_namespace(const struct lu_env *env, struct dt_device *key, + struct ldlm_namespace *ns); +int lfsck_register(const struct lu_env *env, struct dt_device *key, + struct dt_device *next, struct obd_device *obd, + lfsck_out_notify notify, void *notify_data, bool master); +void lfsck_degister(const struct lu_env *env, struct dt_device *key); + +int lfsck_add_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, struct obd_export *exp, + __u32 index, bool for_ost); +void lfsck_del_target(const struct lu_env *env, struct dt_device *key, + struct dt_device *tgt, __u32 index, bool for_ost); + +int lfsck_start(const struct lu_env *env, struct dt_device *key, + struct lfsck_start_param *lsp); +int lfsck_stop(const struct lu_env *env, struct dt_device *key, + struct lfsck_stop *stop); +int lfsck_in_notify_local(const struct lu_env *env, struct dt_device *key, + struct lfsck_req_local *lrl, struct thandle *th); +int lfsck_in_notify(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *lr); +int lfsck_query(const struct lu_env *env, struct dt_device *key, + struct lfsck_request *req, struct lfsck_reply *rep, + struct lfsck_query *que); + +int lfsck_get_speed(struct seq_file *m, struct dt_device *key); +int lfsck_set_speed(struct dt_device *key, __u32 val); +int lfsck_get_windows(struct seq_file *m, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, int val); + +int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); + +static inline void lfsck_pack_rfa(struct lfsck_req_local *lrl, + const struct lu_fid *fid, + enum lfsck_events_local event, __u16 com) +{ + memset(lrl, 0, sizeof(*lrl)); + lrl->lrl_fid = *fid; + lrl->lrl_event = event; + lrl->lrl_active = com; +} + +static inline bool lovea_slot_is_dummy(const struct lov_ost_data_v1 *obj) +{ + /* zero area does not care about the bytes-order. */ + if (obj->l_ost_oi.oi.oi_id == 0 && obj->l_ost_oi.oi.oi_seq == 0 && + obj->l_ost_idx == 0 && obj->l_ost_gen == 0) + return true; + + return false; +} +#endif /* _LUSTRE_LFSCK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h new file mode 100644 index 0000000000000..df1ca627aa4d0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -0,0 +1,405 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#include +#include +#include +#include +#include + +/* target.c */ +struct ptlrpc_request; +struct obd_export; +struct lu_target; +struct l_wait_info; +#include +#include + +#define LI_POISON 0x5a5a5a5a +#if BITS_PER_LONG > 32 +# define LL_POISON 0x5a5a5a5a5a5a5a5aL +#else +# define LL_POISON 0x5a5a5a5aL +#endif +#define LP_POISON ((void *)LL_POISON) + +#ifdef HAVE_SERVER_SUPPORT +int rev_import_init(struct obd_export *exp); +int target_handle_connect(struct ptlrpc_request *req); +int target_handle_disconnect(struct ptlrpc_request *req); +void target_destroy_export(struct obd_export *exp); +int target_handle_ping(struct ptlrpc_request *req); +void target_committed_to_req(struct ptlrpc_request *req); +void target_cancel_recovery_timer(struct obd_device *obd); +void target_stop_recovery_thread(struct obd_device *obd); +void target_cleanup_recovery(struct obd_device *obd); +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd); +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, + struct l_wait_info *lwi); +#endif + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set); + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +/* + * l_wait_event is a flexible sleeping function, permitting simple caller + * configuration of interrupt and timeout sensitivity along with actions to + * be performed in the event of either exception. + * + * The first form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, + * intr_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. + * + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. 
+ * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. + * + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. + * + * Another form of usage is: + * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval, + * timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * This is the same as previous case, but condition is checked once every + * 'interval' jiffies (if non-zero). + * + * Subtle synchronization point: this macro does *not* necessary takes + * wait-queue spin-lock before returning, and, hence, following idiom is safe + * ONLY when caller provides some external locking: + * + * Thread1 Thread2 + * + * l_wait_event(&obj->wq, ....); (1) + * + * wake_up(&obj->wq): (2) + * spin_lock(&q->lock); (2.1) + * __wake_up_common(q, ...); (2.2) + * spin_unlock(&q->lock, flags); (2.3) + * + * OBD_FREE_PTR(obj); (3) + * + * As l_wait_event() may "short-cut" execution and return without taking + * wait-queue spin-lock, some additional synchronization is necessary to + * guarantee that step (3) can begin only after (2.3) finishes. + * + * XXX nikita: some ptlrpc daemon threads have races of that sort. + * + */ +static inline int back_to_sleep(void *arg) +{ + return 0; +} + +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + +struct l_wait_info { + cfs_duration_t lwi_timeout; + cfs_duration_t lwi_interval; + int lwi_allow_intr; + int (*lwi_on_timeout)(void *); + void (*lwi_on_signal)(void *); + void *lwi_cb_data; +}; + +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = interval, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 1 \ +}) + +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +/* + * Wait Queue + */ +#if !defined(HAVE___ADD_WAIT_QUEUE_EXCLUSIVE) && !defined(HAVE_WAIT_QUEUE_ENTRY) +static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); +} +#endif /* HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ + +/** + * 
wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * waiting threads, which is not always desirable because all threads will + * be waken up again and again, even user only needs a few of them to be + * active most time. This is not good for performance because cache can + * be polluted by different threads. + * + * LIFO list can resolve this problem because we always wakeup the most + * recent active thread by default. + * + * NB: please don't call non-exclusive & exclusive wait on the same + * waitq if add_wait_queue_exclusive_head is used. + */ +#define add_wait_queue_exclusive_head(waitq, link) \ +{ \ + unsigned long flags; \ + \ + spin_lock_irqsave(&((waitq)->lock), flags); \ + __add_wait_queue_exclusive(waitq, link); \ + spin_unlock_irqrestore(&((waitq)->lock), flags); \ +} + +/* + * wait for @condition to become true, but no longer than timeout, specified + * by @info. + */ +#define __l_wait_event(wq, condition, info, ret, l_add_wait) \ +do { \ + wait_queue_entry_t __wait; \ + cfs_duration_t __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ + int __allow_intr = info->lwi_allow_intr; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + init_waitqueue_entry(&__wait, current); \ + l_add_wait(&wq, &__wait); \ + \ + /* Block all signals (just the non-fatal ones if no timeout). */ \ + if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \ + __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \ + else \ + __blocked = cfs_block_sigsinv(0); \ + \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + /* To guarantee that the condition check will be done */ \ + /* after setting the thread state as TASK_INTERRUPTIBLE. */ \ + /* Otherwise, out-of-order execution may cause some race. */ \ + /* Consider the following real execution order: */ \ + \ + /* 1. Thread1 checks condition on CPU1, gets false. */ \ + /* 2. Thread2 sets condition on CPU2. */ \ + /* 3. Thread2 calls wake_up() on CPU2 to wake the threads */ \ + /* with state TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE. */ \ + /* But the Thread1's state is TASK_RUNNING at that time. */ \ + /* 4. Thread1 sets its state as TASK_INTERRUPTIBLE on CPU1, */ \ + /* then schedule. */ \ + \ + /* If the '__timeout' variable is zero, the Thread1 will */ \ + /* have no chance to check the condition again. */ \ + \ + /* Generally, the interval between out-of-ordered step1 and */ \ + /* step4 is very tiny, as to above step2 and step3 cannot */ \ + /* happen. On some degree, it can explain why we seldom hit */ \ + /* related trouble. But such race really exists, especially */ \ + /* consider that the step1 and step4 can be interruptible. */ \ + /* So add barrier to avoid Thread1 out-of-order execution. */ \ + smp_mb(); \ + \ + if (condition) \ + break; \ + \ + if (__timeout == 0) { \ + schedule(); \ + } else { \ + cfs_duration_t interval = info->lwi_interval? \ + min_t(cfs_duration_t, \ + info->lwi_interval,__timeout):\ + __timeout; \ + cfs_duration_t remaining = schedule_timeout(interval); \ + __timeout = cfs_time_sub(__timeout, \ + cfs_time_sub(interval, remaining));\ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. 
*/ \ + if (info->lwi_on_signal != NULL) \ + (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\ + } \ + } \ + \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + if (info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr)) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + cfs_clear_sigpending(); \ + } \ + } \ + \ + cfs_restore_sigs(__blocked); \ + \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + + +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue); \ + __ret; \ +}) + +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive); \ + __ret; \ +}) + +#define l_wait_event_exclusive_head(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive_head); \ + __ret; \ +}) + +#define l_wait_condition(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive_head(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive_head(wq, condition, &lwi); \ +}) + +/** @} lib */ + +#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h new file mode 100644 index 0000000000000..89a040f735d5d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2014, Intel Corporation. + * Use is subject to license terms. + * + * Author: di wang + */ + +/* There are several reasons to restrict the linkEA size: + * + * 1. Under DNE mode, if we do not restrict the linkEA size, and if there + * are too many cross-MDTs hard links to the same object, then it will + * casue the llog overflow. 
+ * + * 2. Some backend has limited size for EA. For example, if without large + * EA enabled, the ldiskfs will make all EAs to share one (4K) EA block. + * + * 3. Too many entries in linkEA will seriously affect linkEA performance + * because we only support to locate linkEA entry consecutively. */ +#define MAX_LINKEA_SIZE 4096 + +struct linkea_data { + /** + * Buffer to keep link EA body. + */ + struct lu_buf *ld_buf; + /** + * The matched header, entry and its lenght in the EA + */ + struct link_ea_header *ld_leh; + struct link_ea_entry *ld_lee; + int ld_reclen; +}; + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); +int linkea_init(struct linkea_data *ldata); +int linkea_init_with_rec(struct linkea_data *ldata); +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid); +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid); +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname); +int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf, + const struct lu_name *cname, const struct lu_fid *pfid); +int linkea_overflow_shrink(struct linkea_data *ldata); +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); + +static inline void linkea_first_entry(struct linkea_data *ldata) +{ + LASSERT(ldata != NULL); + LASSERT(ldata->ld_leh != NULL); + + if (ldata->ld_leh->leh_reccount == 0) + ldata->ld_lee = NULL; + else + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); +} + +static inline void linkea_next_entry(struct linkea_data *ldata) +{ + LASSERT(ldata != NULL); + LASSERT(ldata->ld_leh != NULL); + + if (ldata->ld_lee != NULL) { + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; + } +} diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h new file mode 100644 index 0000000000000..f936973801012 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -0,0 +1,179 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * lustre/include/lustre_lmv.h + * + * Lustre LMV structures and functions. 
+ * + * Author: Di Wang + */ + +#ifndef _LUSTRE_LMV_H +#define _LUSTRE_LMV_H +#include + +struct lmv_oinfo { + struct lu_fid lmo_fid; + u32 lmo_mds; + struct inode *lmo_root; +}; + +struct lmv_stripe_md { + __u32 lsm_md_magic; + __u32 lsm_md_stripe_count; + __u32 lsm_md_master_mdt_index; + __u32 lsm_md_hash_type; + __u32 lsm_md_layout_version; + __u32 lsm_md_default_count; + __u32 lsm_md_default_index; + char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_oinfo lsm_md_oinfo[0]; +}; + +static inline bool +lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) +{ + __u32 idx; + + if (lsm1->lsm_md_magic != lsm2->lsm_md_magic || + lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count || + lsm1->lsm_md_master_mdt_index != + lsm2->lsm_md_master_mdt_index || + lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || + lsm1->lsm_md_layout_version != + lsm2->lsm_md_layout_version || + strcmp(lsm1->lsm_md_pool_name, + lsm2->lsm_md_pool_name) != 0) + return false; + + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, + &lsm2->lsm_md_oinfo[idx].lmo_fid)) + return false; + } + + return true; +} +union lmv_mds_md; + +void lmv_free_memmd(struct lmv_stripe_md *lsm); + +int lmvea_load_shards(const struct lu_env *env, struct dt_object *obj, + struct lu_dirent *ent, struct lu_buf *buf, + bool resize); + +static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, + const struct lmv_mds_md_v1 *lmv_src) +{ + __u32 i; + + lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic); + lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count); + lmv_dst->lmv_master_mdt_index = + le32_to_cpu(lmv_src->lmv_master_mdt_index); + lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); + lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); + for (i = 0; i < lmv_src->lmv_stripe_count; i++) + fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], + &lmv_src->lmv_stripe_fids[i]); +} + +static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst, + const union lmv_mds_md *lmv_src) +{ + switch (le32_to_cpu(lmv_src->lmv_magic)) { + case LMV_MAGIC_V1: + lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1); + break; + default: + break; + } +} + +/* This hash is only for testing purpose */ +static inline unsigned int +lmv_hash_all_chars(unsigned int count, const char *name, int namelen) +{ + unsigned int c = 0; + const unsigned char *p = (const unsigned char *)name; + + while (--namelen >= 0) + c += p[namelen]; + + c = c % count; + + return c; +} + +static inline unsigned int +lmv_hash_fnv1a(unsigned int count, const char *name, int namelen) +{ + __u64 hash; + + hash = lustre_hash_fnv_1a_64(name, namelen); + + return do_div(hash, count); +} + +static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, + unsigned int stripe_count, + const char *name, int namelen) +{ + int idx; + __u32 hash_type = lmv_hash_type & LMV_HASH_TYPE_MASK; + + LASSERT(namelen > 0); + if (stripe_count <= 1) + return 0; + + /* for migrating object, always start from 0 stripe */ + if (lmv_hash_type & LMV_HASH_FLAG_MIGRATION) + return 0; + + switch (hash_type) { + case LMV_HASH_TYPE_ALL_CHARS: + idx = lmv_hash_all_chars(stripe_count, name, namelen); + break; + case LMV_HASH_TYPE_FNV_1A_64: + idx = lmv_hash_fnv1a(stripe_count, name, namelen); + break; + default: + idx = -EBADFD; + break; + } + + CDEBUG(D_INFO, "name %.*s hash_type %d idx %d\n", namelen, name, + hash_type, idx); + + return idx; +} + +static inline bool lmv_is_known_hash_type(__u32 
type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h new file mode 100644 index 0000000000000..237da21bf4210 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -0,0 +1,560 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. + * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include +#include +#include +#include + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + /* cookie of this log in its cat */ + struct llog_cookie phd_cookie; +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log;/* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param 
open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name); +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt, + char *name, char *backup); +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid); +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. + */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. + */ + int lpcd_last_idx; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); + +/* llog_ioctl.c */ +struct obd_ioctl_data; +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data); +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_declare_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle, struct thandle *th); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void 
*buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. + */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. + */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, + int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + struct thandle *th); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + struct mutex lgh_hdr_mutex; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; /* may be vmalloc'd */ + size_t lgh_hdr_size; + struct dt_object *lgh_obj; + /* For a Catalog, is the last/newest used index for a plain slot. 
+ * Used in conjunction with llh_cat_idx to handle Catalog wrap-around + * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx + * will become its upper limit */ + int lgh_last_idx; + int lgh_cur_idx; /* used during llog_process */ + __u64 lgh_cur_offset; /* used during llog_process */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + struct llog_operations *lgh_logops; + atomic_t lgh_refcount; + + int lgh_max_size; + __u32 lgh_stale:1; +}; + +/* llog_osd.c */ +extern struct llog_operations llog_osd_ops; +extern struct llog_operations llog_common_cat_ops; +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid); + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +/* Indicate the llog objects under this context are normal FID objects, + * instead of objects with local FID. */ +#define LLOG_CTXT_FLAG_NORMAL_FID 0x00000004 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + pointing import */ + struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; + struct local_oid_storage *loc_los_nameless; + struct local_oid_storage *loc_los_named; + /* llog chunk size, and llog record size can not be bigger than + * loc_chunk_size */ + __u32 loc_chunk_size; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 +#define LLOG_DEL_PLAIN 0x0003 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); + mutex_init(&olg->olg_cat_processing); +} + +static inline int 
llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return (llog_group_ctxt_null(&obd->obd_olg, index)); +} + +static inline int llog_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_next_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + RETURN(rc); +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_prev_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + RETURN(rc); +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + RETURN(rc); + if (lop->lop_connect == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + RETURN(rc); +} + +static inline int llog_is_full(struct llog_handle *llh) +{ + return llh->lgh_last_idx >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; +} + +struct llog_cfg_rec { + struct llog_rec_hdr lcr_hdr; + struct lustre_cfg lcr_cfg; + struct llog_rec_tail lcr_tail; +}; + +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs); +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr); + +enum { + LLOG_NEXT_IDX = -1, + LLOG_HEADER_IDX = 0, +}; + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_destroy(const struct lu_env *env, struct llog_handle *handle); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr 
*rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx); + +/** @} log */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h new file mode 100644 index 0000000000000..ee5f0f7385fa0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h @@ -0,0 +1,79 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/include/lustre_log_user.h + * + * Userspace-usable portion of Generic infrastructure for managing + * a collection of logs. + * See lustre_log.h for more details. + */ + +#ifndef _LUSTRE_LOG_USER_H +#define _LUSTRE_LOG_USER_H + +#include + +/* Lustre logs use FIDs constructed from oi_id and oi_seq directly, + * without attempting to use the IGIF and IDIF ranges as is done + * elsewhere, because of compatibility concerns (see lu-2888). 
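+ *
+ * Concretely (see logid_to_fid() below): a logid with lgl_ogen == 0 copies
+ * oi_seq/oi_id straight into fid->f_seq/f_oid, while a non-zero ogen marks
+ * a pre-OSD logid that is converted into an IGIF via lu_igif_build().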
+ */ + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +#endif /* ifndef _LUSTRE_LOG_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h new file mode 100644 index 0000000000000..be0eb7742e644 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -0,0 +1,239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +#include +#include +#ifdef CONFIG_FS_POSIX_ACL +# include +#endif /* CONFIG_FS_POSIX_ACL */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +/** + * Serializes in-flight MDT-modifying RPC requests to preserve idempotency. + * + * This mutex is used to implement execute-once semantics on the MDT. + * The MDT stores the last transaction ID and result for every client in + * its last_rcvd file. If the client doesn't get a reply, it can safely + * resend the request and the MDT will reconstruct the reply being aware + * that the request has already been executed. Without this lock, + * execution status of concurrent in-flight requests would be + * overwritten. + * + * This design limits the extent to which we can keep a full pipeline of + * in-flight requests from a single client. This limitation could be + * overcome by allowing multiple slots per client in the last_rcvd file. 
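+ *
+ * A simplified usage sketch (the real callers are the MDC request paths;
+ * the middle step that actually sends the modifying RPC is only a
+ * placeholder here):
+ *
+ *	mdc_get_rpc_lock(rpc_lock, it);
+ *	rc = ptlrpc_queue_wait(req);
+ *	mdc_put_rpc_lock(rpc_lock, it);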
+ */ +struct mdc_rpc_lock { + /** Lock protecting in-flight RPC concurrency. */ + struct mutex rpcl_mutex; + /** Intent associated with currently executing request. */ + struct lookup_intent *rpcl_it; + /** Used for MDS/RPC load testing purposes. */ + int rpcl_fakes; +}; + +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ + mutex_init(&lck->rpcl_mutex); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + ENTRY; + + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) + return; + + /* This would normally block until the existing request finishes. + * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. */ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) + goto out; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); + out: + EXIT; +} + +static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req, + struct lookup_intent *it) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc; + __u16 tag; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + tag = obd_get_mod_rpc_slot(cli, opc, it); + lustre_msg_set_tag(req->rq_reqmsg, tag); +} + +static inline void mdc_put_mod_rpc_slot(struct ptlrpc_request *req, + struct lookup_intent *it) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + __u32 opc; + __u16 tag; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + tag = lustre_msg_get_tag(req->rq_reqmsg); + obd_put_mod_rpc_slot(cli, opc, it, tag); +} + + +/** + * Update the maximum possible easize. + * + * This value is learned from ptlrpc replies sent by the MDT. The + * default easize is initialized to the minimum value but allowed to + * grow up to a single page in size if required to handle the common + * case. 
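+ * For example, if the MDT reports an mbo_max_mdsize larger than
+ * OBD_MAX_DEFAULT_EA_SIZE, cl_max_mds_easize grows to the reported value
+ * while cl_default_mds_easize is clamped to OBD_MAX_DEFAULT_EA_SIZE.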
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] exp export for MDC device + * \param[in] body body of ptlrpc reply from MDT + * + */ +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->mbo_valid & OBD_MD_FLMODEASIZE) { + struct client_obd *cli = &exp->exp_obd->u.cli; + __u32 def_easize; + + if (cli->cl_max_mds_easize < body->mbo_max_mdsize) + cli->cl_max_mds_easize = body->mbo_max_mdsize; + + def_easize = min_t(__u32, body->mbo_max_mdsize, + OBD_MAX_DEFAULT_EA_SIZE); + cli->cl_default_mds_easize = def_easize; + } +} + + +/* mdc/mdc_locks.c */ +int it_open_error(int phase, struct lookup_intent *it); + +static inline bool cl_is_lov_delay_create(unsigned int flags) +{ + return (flags & O_LOV_DELAY_CREATE_1_8) != 0 || + (flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK; +} + +static inline void cl_lov_delay_create_clear(unsigned int *flags) +{ + if ((*flags & O_LOV_DELAY_CREATE_1_8) != 0) + *flags &= ~O_LOV_DELAY_CREATE_1_8; + if ((*flags & O_LOV_DELAY_CREATE_MASK) == O_LOV_DELAY_CREATE_MASK) + *flags &= ~O_LOV_DELAY_CREATE_MASK; +} + +/** @} mdc */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h new file mode 100644 index 0000000000000..c254c7f730f10 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. 
+ */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include +#include +#include +#include +#include +#include + +struct mds_group_info { + struct obd_uuid *uuid; + int group; +}; + +struct mds_capa_info { + struct obd_uuid *uuid; + struct lustre_capa_key *capa; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(__u64 flags) +{ + return !(flags & MDS_OPEN_DELAY_CREATE) && (flags & FMODE_WRITE) && + !(flags & MDS_OPEN_LEASE); +} + +/** @} mds */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h new file mode 100644 index 0000000000000..f6d67c832ed64 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -0,0 +1,2716 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup PtlRPC Portal RPC and networking module. + * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * log2 max # of bulk operations in one request: 2=4MB/RPC, 5=32MB/RPC, ... + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. + * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ +#define PTLRPC_BULK_OPS_BITS 4 +#if PTLRPC_BULK_OPS_BITS > 16 +#error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." +#endif +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. 
Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1U << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_SHIFT) + +#define ONE_MB_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_DEF_BRW_SIZE (4 * ONE_MB_BRW_SIZE) +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT) +#define OFD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +#endif +#if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_SIZE" +#endif +#if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +#endif +#if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +#endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. + * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. + * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. 
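+ *
+ * To a first approximation, the per-partition thread limit works out to
+ * min(?_NTHRS_MAX / nr_partitions, ?_NTHRS_BASE + cores_per_partition *
+ * ?_THR_FACTOR), but never less than ?_NTHRS_BASE; the examples below
+ * walk through this.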
+ * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) have a thousand of cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. + * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 
64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + + /* + * MDS threads constants: + * + * Please see examples in "Thread Constants", MDS threads number will be at + * the comparable level of old versions, unless the server has many cores. + */ +#ifndef MDS_MAX_THREADS +#define MDS_MAX_THREADS 1024 +#define MDS_MAX_OTHR_THREADS 256 + +#else /* MDS_MAX_THREADS */ +#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT +#undef MDS_MAX_THREADS +#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT +#endif +#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2) +#endif + +/* default service */ +#define MDS_THR_FACTOR 8 +#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_NTHRS_MAX MDS_MAX_THREADS +#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX) + +/* read-page service */ +#define MDS_RDPG_THR_FACTOR 4 +#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX) + +/* these should be removed when we remove setattr service in the future */ +#define MDS_SETA_THR_FACTOR 4 +#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) + +/* non-affinity threads */ +#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS + +#define MDS_NBUFS 64 + +/** + * Assume file name length = FNAME_MAX = 256 (true for ext3). + * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) + * symlink: FNAME_MAX + PATH_MAX <- largest + * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) + * rename: FNAME_MAX + FNAME_MAX + * open: FNAME_MAX + EA_MAX + * + * MDS_MAXREQSIZE ~= 4736 bytes = + * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX + * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header + * + * Realistic size is about 512 bytes (20 character name + 128 char symlink), + * except in the open case where there are a large number of OSTs in a LOV. + */ +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ + +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: max reply size Lustre 2.4+ client can get from old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but 2.4 or later MDS will never send reply with llog_cookie to any + * version client. This macro is defined for server side reply buffer size. + */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** + * This is the size of a maximum REINT_SETXATTR request: + * + * lustre_msg 56 (32 + 4 x 5 + 4) + * ptlrpc_body 184 + * mdt_rec_setxattr 136 + * lustre_capa 120 + * name 256 (XATTR_NAME_MAX) + * value 65536 (XATTR_SIZE_MAX) + */ +#define MDS_EA_MAXREQSIZE 66288 + +/** + * These are the maximum request and reply sizes (rounded up to 1 KB + * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL. 
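+ *
+ * Assuming LOV_MAX_STRIPE_COUNT is 2000 (consistent with the 24 * 2000 EA
+ * estimate above), MDS_LOV_MAXREQSIZE is 362 + 2000 * 24 = 48362, so the
+ * larger of the two is MDS_EA_MAXREQSIZE (66288), which rounds up to
+ * MDS_REG_MAXREQSIZE = 66560 bytes (65 KB).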
+ */ +#define MDS_REG_MAXREQSIZE (((max(MDS_EA_MAXREQSIZE, \ + MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10) +#define MDS_REG_MAXREPSIZE MDS_REG_MAXREQSIZE + +/** + * The update request includes all of updates from the create, which might + * include linkea (4K maxim), together with other updates, we set it to 1000K: + * lustre_msg + ptlrpc_body + OUT_UPDATE_BUFFER_SIZE_MAX + */ +#define OUT_MAXREQSIZE (1000 * 1024) +#define OUT_MAXREPSIZE MDS_MAXREPSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 8 * 1024) + +/** + * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD. + * However, we need to allocate a much larger buffer for it because LNet + * requires each MD(rqbd) has at least MDS_REQ_MAXREQSIZE bytes left to avoid + * dropping of maximum-sized incoming request. So if MDS_REG_BUFSIZE is only a + * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request + * even there are about MDS_REG_MAX_REQSIZE bytes left in a rqbd, and memory + * utilization is very low. + * + * In the meanwhile, size of rqbd can't be too large, because rqbd can't be + * reused until all requests fit in it have been processed and released, + * which means one long blocked request can prevent the rqbd be reused. + * Now we set request buffer size to 160 KB, so even each rqbd is unlinked + * from LNet with unused 65 KB, buffer utilization will be about 59%. + * Please check LU-2432 for details. + */ +#define MDS_REG_BUFSIZE max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 160 * 1024) + +/** + * OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is + * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some + * extra bytes to each request buffer to improve buffer utilization rate. + */ +#define OUT_BUFSIZE max(OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 24 * 1024) + +/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ +#define FLD_MAXREQSIZE (160) + +/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ +#define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) + +/** + * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + + * __u32 padding */ +#define SEQ_MAXREQSIZE (160) + +/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ +#define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) + +/** MGS threads must be >= 3, see bug 22458 comment #28 */ +#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define MGS_NTHRS_MAX 32 + +#define MGS_NBUFS 64 +#define MGS_BUFSIZE (8 * 1024) +#define MGS_MAXREQSIZE (7 * 1024) +#define MGS_MAXREPSIZE (9 * 1024) + + /* + * OSS threads constants: + * + * Given 8 as factor and 64 as base threads number + * + * example 1): + * On 8-core server configured to 2 partitions, we will have + * 64 + 8 * 4 = 96 threads for each partition, 192 total threads. + * + * example 2): + * On 32-core machine configured to 4 partitions, we will have + * 64 + 8 * 8 = 112 threads for each partition, so total threads number + * will be 112 * 4 = 448. + * + * example 3): + * On 64-core machine configured to 4 partitions, we will have + * 64 + 16 * 8 = 192 threads for each partition, so total threads number + * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we + * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads + * for each partition. 
+ * + * So we can see that with these constants, threads number wil be at the + * similar level of old versions, unless the server has many cores. + */ + /* depress threads factor for VM with small memory size */ +#define OSS_THR_FACTOR min_t(int, 8, \ + NUM_CACHEPAGES >> (28 - PAGE_SHIFT)) +#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define OSS_NTHRS_BASE 64 + +/* threads for handling "create" request */ +#define OSS_CR_THR_FACTOR 1 +#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define OSS_CR_NTHRS_BASE 8 +#define OSS_CR_NTHRS_MAX 64 + +/** + * OST_IO_MAXREQSIZE ~= + * lustre_msg + ptlrpc_body + obdo + obd_ioobj + + * DT_MAX_BRW_PAGES * niobuf_remote + * + * - single object with 16 pages is 512 bytes + * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover + * - Must be a multiple of 1024 + * - actual size is about 18K + */ +#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES) +/** + * FIEMAP request can be 4K+ for now + */ +#define OST_MAXREQSIZE (16 * 1024) +#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \ + (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1)) + +#define OST_MAXREPSIZE (9 * 1024) +#define OST_IO_MAXREPSIZE OST_MAXREPSIZE + +#define OST_NBUFS 64 +/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */ +#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024) +/** + * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization + * rate of request buffer, please check comment of MDS_LOV_BUFSIZE for details. + */ +#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + +/* Macro to hide a typecast. */ +#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) + +struct ptlrpc_replay_async_args { + int praa_old_state; + int praa_old_status; +}; + +/** + * Structure to single define portal connection. + */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct hlist_node c_hash; + /** Our own lnet nid for this connection */ + lnet_nid_t c_self; + /** Remote side nid for this connection */ + struct lnet_process_id c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and CLASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. 
+ * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. + * Provides a way to call "completion callbacks" when all requests in the set + * returned. + */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + wait_queue_head_t *set_wakeup_ptr; + /** List of requests in the set */ + struct list_head set_requests; + /** + * List of completion callbacks to be called when the set is completed + * This is only used if \a set_interpret is NULL. + * Links struct ptlrpc_set_cbdata. + */ + struct list_head set_cblist; + /** Completion callback, if only one. */ + set_interpreter_func set_interpret; + /** opaq argument passed to completion \a set_interpret callback. */ + void *set_arg; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now. */ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaq argument passed to the producer callback */ + void *set_producer_arg; + unsigned int set_allow_intr:1; +}; + +/** + * Description of a single ptrlrpc_set callback + */ +struct ptlrpc_set_cbdata { + /** List linkage item */ + struct list_head psc_item; + /** Pointer to interpreting function */ + set_interpreter_func psc_interpret; + /** Opaq argument to pass to the callback */ + void *psc_data; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(struct lnet_event *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees. 
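+ *
+ * Such "difficult" replies stay linked on the per-export and per-obd lists
+ * (rs_exp_list/rs_obd_list below) until the client ACK arrives or the
+ * transaction commits and the state is dispatched by ptlrpc_commit_replies()
+ * (see rs_committed).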
+ */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_on_net:1; /* reply_out_callback pending? */ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + unsigned long rs_convert_lock:1; /* need to convert saved + * locks to COS mode */ + atomic_t rs_refcount; /* number of users */ + /** Number of locks awaiting client ACK */ + int rs_nlocks; + + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + struct lnet_handle_md rs_md_h; + + /** Context for the sevice thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrupted (if needed) to + * produce reply buffer for actual sending. In simple case + * of no network encryption we jus set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + enum ldlm_mode rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREG_RPC = 0xebc0de05, + RQ_PHASE_UNREG_BULK = 0xebc0de06, + RQ_PHASE_UNDEFINED = 0xebc0de07 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); +/** Type of request resend call-back */ +typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, + void *arg); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). 
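+ *
+ * A minimal consumer sketch (illustrative only, assuming pooled requests
+ * are linked into prp_req_list via their rq_list member; the real pool
+ * helpers live elsewhere in ptlrpc):
+ *
+ *	spin_lock(&pool->prp_lock);
+ *	if (!list_empty(&pool->prp_req_list)) {
+ *		req = list_entry(pool->prp_req_list.next,
+ *				 struct ptlrpc_request, rq_list);
+ *		list_del_init(&req->rq_list);
+ *	}
+ *	spin_unlock(&pool->prp_lock);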
+ */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a rquest from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + int (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +#include + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. + * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. + * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +struct ptlrpc_cli_req { + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *cr_bulk; + /** optional time limit for send attempts */ + cfs_duration_t cr_delay_limit; + /** time request was first queued */ + cfs_time_t cr_queued_time; + /** request sent in nanoseconds */ + ktime_t cr_sent_ns; + /** time for request really sent out */ + time64_t cr_sent_out; + /** when req reply unlink must finish. */ + time64_t cr_reply_deadline; + /** when req bulk unlink must finish. */ + time64_t cr_bulk_deadline; + /** when req unlink must finish. */ + time64_t cr_req_deadline; + /** Portal to which this request would be sent */ + short cr_req_ptl; + /** Portal where to wait for reply and where reply would be sent */ + short cr_rep_ptl; + /** request resending number */ + unsigned int cr_resend_nr; + /** What was import generation when this request was sent */ + int cr_imp_gen; + enum lustre_imp_state cr_send_state; + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t cr_set_waitq; + /** Link item for request set lists */ + struct list_head cr_set_chain; + /** link to waited ctx */ + struct list_head cr_ctx_chain; + + /** client's half ctx */ + struct ptlrpc_cli_ctx *cr_cli_ctx; + /** Link back to the request set */ + struct ptlrpc_request_set *cr_set; + /** outgoing request MD handle */ + struct lnet_handle_md cr_req_md_h; + /** request-out callback parameter */ + struct ptlrpc_cb_id cr_req_cbid; + /** incoming reply MD handle */ + struct lnet_handle_md cr_reply_md_h; + wait_queue_head_t cr_reply_waitq; + /** reply callback parameter */ + struct ptlrpc_cb_id cr_reply_cbid; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t cr_reply_interp; + /** Resend handler, called when request is resend to update RPC data */ + ptlrpc_resend_cb_t cr_resend_cb; + /** Async completion context */ + union ptlrpc_async_args cr_async_args; + /** Opaq data for replay and commit callbacks. */ + void *cr_cb_data; + /** Link to the imp->imp_unreplied_list */ + struct list_head cr_unreplied_list; + /** + * Commit callback, called when request is committed and about to be + * freed. 
+ */ + void (*cr_commit_cb)(struct ptlrpc_request *); + /** Replay callback, called after request is replayed at recovery */ + void (*cr_replay_cb)(struct ptlrpc_request *); +}; + +/** client request member alias */ +/* NB: these alias should NOT be used by any new code, instead they should + * be removed step by step to avoid potential abuse */ +#define rq_bulk rq_cli.cr_bulk +#define rq_delay_limit rq_cli.cr_delay_limit +#define rq_queued_time rq_cli.cr_queued_time +#define rq_sent_ns rq_cli.cr_sent_ns +#define rq_real_sent rq_cli.cr_sent_out +#define rq_reply_deadline rq_cli.cr_reply_deadline +#define rq_bulk_deadline rq_cli.cr_bulk_deadline +#define rq_req_deadline rq_cli.cr_req_deadline +#define rq_nr_resend rq_cli.cr_resend_nr +#define rq_request_portal rq_cli.cr_req_ptl +#define rq_reply_portal rq_cli.cr_rep_ptl +#define rq_import_generation rq_cli.cr_imp_gen +#define rq_send_state rq_cli.cr_send_state +#define rq_set_chain rq_cli.cr_set_chain +#define rq_ctx_chain rq_cli.cr_ctx_chain +#define rq_set rq_cli.cr_set +#define rq_set_waitq rq_cli.cr_set_waitq +#define rq_cli_ctx rq_cli.cr_cli_ctx +#define rq_req_md_h rq_cli.cr_req_md_h +#define rq_req_cbid rq_cli.cr_req_cbid +#define rq_reply_md_h rq_cli.cr_reply_md_h +#define rq_reply_waitq rq_cli.cr_reply_waitq +#define rq_reply_cbid rq_cli.cr_reply_cbid +#define rq_interpret_reply rq_cli.cr_reply_interp +#define rq_resend_cb rq_cli.cr_resend_cb +#define rq_async_args rq_cli.cr_async_args +#define rq_cb_data rq_cli.cr_cb_data +#define rq_unreplied_list rq_cli.cr_unreplied_list +#define rq_commit_cb rq_cli.cr_commit_cb +#define rq_replay_cb rq_cli.cr_replay_cb + +struct ptlrpc_srv_req { + /** initial thread servicing this request */ + struct ptlrpc_thread *sr_svc_thread; + /** + * Server side list of incoming unserved requests sorted by arrival + * time. Traversed from time to time to notice about to expire + * requests and sent back "early replies" to clients to let them + * know server is alive and well, just very busy to service their + * requests in time + */ + struct list_head sr_timed_list; + /** server-side per-export list */ + struct list_head sr_exp_list; + /** server-side history, used for debuging purposes. 
*/ + struct list_head sr_hist_list; + /** history sequence # */ + __u64 sr_hist_seq; + /** the index of service's srv_at_array into which request is linked */ + __u32 sr_at_index; + /** authed uid */ + uid_t sr_auth_uid; + /** authed uid mapped to */ + uid_t sr_auth_mapped_uid; + /** RPC is generated from what part of Lustre */ + enum lustre_sec_part sr_sp_from; + /** request session context */ + struct lu_context sr_ses; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request sr_nrq; + /** @} nrs */ + /** request arrival time */ + struct timespec64 sr_arrival_time; + /** server's half ctx */ + struct ptlrpc_svc_ctx *sr_svc_ctx; + /** (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *sr_user_desc; + /** separated reply state, may be vmalloc'd */ + struct ptlrpc_reply_state *sr_reply_state; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *sr_ops; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *sr_rqbd; +}; + +/** server request member alias */ +/* NB: these alias should NOT be used by any new code, instead they should + * be removed step by step to avoid potential abuse */ +#define rq_svc_thread rq_srv.sr_svc_thread +#define rq_timed_list rq_srv.sr_timed_list +#define rq_exp_list rq_srv.sr_exp_list +#define rq_history_list rq_srv.sr_hist_list +#define rq_history_seq rq_srv.sr_hist_seq +#define rq_at_index rq_srv.sr_at_index +#define rq_auth_uid rq_srv.sr_auth_uid +#define rq_auth_mapped_uid rq_srv.sr_auth_mapped_uid +#define rq_sp_from rq_srv.sr_sp_from +#define rq_session rq_srv.sr_ses +#define rq_nrq rq_srv.sr_nrq +#define rq_arrival_time rq_srv.sr_arrival_time +#define rq_reply_state rq_srv.sr_reply_state +#define rq_svc_ctx rq_srv.sr_svc_ctx +#define rq_user_desc rq_srv.sr_user_desc +#define rq_ops rq_srv.sr_ops +#define rq_rqbd rq_srv.sr_rqbd + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + spinlock_t rq_early_free_lock; + /** client-side flags are serialized by rq_lock @{ */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. 
+ */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, + rq_req_unlinked:1, /* unlinked request buffer from lnet */ + rq_reply_unlinked:1, /* unlinked reply buffer from lnet */ + rq_memalloc:1, /* req originated from "kswapd" */ + rq_committed:1, + rq_reply_truncated:1, + /** whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /** do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1, + /* bulk request, sent to server, but uncommitted */ + rq_unstable:1, + rq_early_free_repbuf:1, /* free reply buffer in advance */ + rq_allow_intr:1; + /** @} */ + + /** server-side flags @{ */ + unsigned int + rq_hp:1, /**< high priority RPC */ + rq_at_linked:1, /**< link into service's srv_at_array */ + rq_packed_final:1; /**< packed final reply */ + /** @} */ + + /** one of RQ_PHASE_* */ + enum rq_phase rq_phase; + /** one of RQ_PHASE_* to be used next */ + enum rq_phase rq_next_phase; + /** + * client-side refcount for SENT race, server-side refcounf + * for multiple replies + */ + atomic_t rq_refcount; + /** + * client-side: + * !rq_truncate : # reply bytes actually received, + * rq_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rq_repmsg; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** bulk match bits */ + __u64 rq_mbits; + /** + * List item to for replay list. Not yet committed requests get linked + * there. + * Also see \a rq_replay comment above. 
+ * It's also link chain on obd_export::exp_req_replay_queue + */ + struct list_head rq_replay_list; + /** non-shared members for client & server request*/ + union { + struct ptlrpc_cli_req rq_cli; + struct ptlrpc_srv_req rq_srv; + }; + /** + * security and encryption data + * @{ */ + /** description of flavors for client & server */ + struct sptlrpc_flavor rq_flvr; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1, /* pill initialized */ + rq_srv_req:1; /* server request */ + + + /** various buffer pointers */ + struct lustre_msg *rq_reqbuf; /**< req wrapper, vmalloc*/ + char *rq_repbuf; /**< rep buffer, vmalloc */ + struct lustre_msg *rq_repdata; /**< rep wrapper msg */ + /** only in priv mode */ + struct lustre_msg *rq_clrbuf; + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + /** @} */ + + /** Fields that help to see if request and reply were swabbed or not */ + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; + + /** how many early replies (for stats) */ + int rq_early_count; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** import where request is being sent */ + struct obd_import *rq_import; + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + struct lnet_process_id rq_peer; + /** Descriptor for the NID from which the peer sent the request. */ + struct lnet_process_id rq_source; + /** + * service time estimate (secs) + * If the request is not served by this time, it is marked as timed out. + */ + int rq_timeout; + /** + * when request/reply sent (secs), or time when request should be sent + */ + time64_t rq_sent; + /** when request must finish. */ + time64_t rq_deadline; + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return it's status or original + * rc if there was no handler defined for this request. + */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + return rc; +} + +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? 
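The call site in ptlrpc_req_interpret() above implies the shape of a reply interpreter: it receives the environment, the request, a pointer into rq_async_args and the transport result. A minimal sketch, assuming only the declarations shown in this header (the exact ptlrpc_interpterer_t typedef lives elsewhere in the file, and ptlrpc_status_ntoh()/lustre_msg_get_status() are declared further down):

static int demo_reply_interpret(const struct lu_env *env,
                                struct ptlrpc_request *req,
                                void *args, int rc)
{
        if (rc != 0)
                return rc;      /* send failed or the server errored out */

        /* On success the unpacked reply is reachable via rq_repmsg. */
        return ptlrpc_status_ntoh(lustre_msg_get_status(req->rq_repmsg));
}

A caller would set req->rq_interpret_reply = demo_reply_interpret before handing the request to a set or to ptlrpcd; ptlrpc_req_interpret() then stores the returned value in rq_status.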
+ * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Returns 1 if request buffer at offset \a index was already swabbed + */ +static inline int lustre_req_swabbed(struct ptlrpc_request *req, size_t index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); +} + +/** + * Returns 1 if request reply buffer at offset \a index was already swabbed + */ +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); +} + +/** + * Returns 1 if request needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +{ + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns 1 if request reply needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +{ + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, + size_t index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); + req->rq_req_swab_mask |= 1 << index; +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, + size_t index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); + req->rq_rep_swab_mask |= 1 << index; +} + +/** + * Convert numerical request phase value \a phase into text string description + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREG_RPC: + return "UnregRPC"; + case RQ_PHASE_UNREG_BULK: + return "UnregBULK"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase of the request \a req into text stringi + * description + */ +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? 
str : "") + +/** Convert bit flags into a string */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), FLAG(req->rq_net_err, "e"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), \ + FLAG(req->rq_waiting, "W"), \ + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ + FLAG(req->rq_committed, "M") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Helper that decides if we need to print request accordig to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while(0) + +/** + * This is the debug print function you need to use to print request sturucture + * content into lustre debug log. + * for most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static struct cfs_debug_limit_state cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +/** + * Structure that defines a single page of a bulk transfer + */ +struct ptlrpc_bulk_page { + /** Linkage to list of pages in a bulk */ + struct list_head bp_link; + /** + * Number of bytes in a page to transfer starting from \a bp_pageoffset + */ + int bp_buflen; + /** offset within a page */ + int bp_pageoffset; + /** The page itself */ + struct page *bp_page; +}; + +enum ptlrpc_bulk_op_type { + PTLRPC_BULK_OP_ACTIVE = 0x00000001, + PTLRPC_BULK_OP_PASSIVE = 0x00000002, + PTLRPC_BULK_OP_PUT = 0x00000004, + PTLRPC_BULK_OP_GET = 0x00000008, + PTLRPC_BULK_BUF_KVEC = 0x00000010, + PTLRPC_BULK_BUF_KIOV = 0x00000020, + PTLRPC_BULK_GET_SOURCE = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_GET, + PTLRPC_BULK_PUT_SINK = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_PUT, + PTLRPC_BULK_GET_SINK = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_GET, + PTLRPC_BULK_PUT_SOURCE = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_PUT, +}; + +static inline bool ptlrpc_is_bulk_op_get(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_OP_GET) == PTLRPC_BULK_OP_GET; +} + +static inline bool ptlrpc_is_bulk_get_source(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_GET_SOURCE) == PTLRPC_BULK_GET_SOURCE; +} + +static inline bool ptlrpc_is_bulk_put_sink(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_PUT_SINK) == PTLRPC_BULK_PUT_SINK; +} + +static inline bool ptlrpc_is_bulk_get_sink(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_GET_SINK) == PTLRPC_BULK_GET_SINK; +} + +static inline bool ptlrpc_is_bulk_put_source(enum ptlrpc_bulk_op_type type) +{ + return (type & PTLRPC_BULK_PUT_SOURCE) == PTLRPC_BULK_PUT_SOURCE; +} + +static inline bool ptlrpc_is_bulk_desc_kvec(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_BUF_KVEC) | (type & 
PTLRPC_BULK_BUF_KIOV)) + == PTLRPC_BULK_BUF_KVEC; +} + +static inline bool ptlrpc_is_bulk_desc_kiov(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_BUF_KVEC) | (type & PTLRPC_BULK_BUF_KIOV)) + == PTLRPC_BULK_BUF_KIOV; +} + +static inline bool ptlrpc_is_bulk_op_active(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_OP_ACTIVE) | + (type & PTLRPC_BULK_OP_PASSIVE)) + == PTLRPC_BULK_OP_ACTIVE; +} + +static inline bool ptlrpc_is_bulk_op_passive(enum ptlrpc_bulk_op_type type) +{ + return ((type & PTLRPC_BULK_OP_ACTIVE) | + (type & PTLRPC_BULK_OP_PASSIVE)) + == PTLRPC_BULK_OP_PASSIVE; +} + +struct ptlrpc_bulk_frag_ops { + /** + * Add a page \a page to the bulk descriptor \a desc + * Data to transfer in the page starts at offset \a pageoffset and + * amount of data to transfer from the page is \a len + */ + void (*add_kiov_frag)(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len); + + /* + * Add a \a fragment to the bulk descriptor \a desc. + * Data to transfer in the fragment is pointed to by \a frag + * The size of the fragment is \a len + */ + int (*add_iov_frag)(struct ptlrpc_bulk_desc *desc, void *frag, int len); + + /** + * Uninitialize and free bulk descriptor \a desc. + * Works on bulk descriptors both from server and client side. + */ + void (*release_frags)(struct ptlrpc_bulk_desc *desc); +}; + +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops; +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops; +extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops; + +/* + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed bt a transfer (o receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulks transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. + */ +struct ptlrpc_bulk_desc { + /** completed with failure */ + unsigned long bd_failure:1; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** {put,get}{source,sink}{kvec,kiov} */ + enum ptlrpc_bulk_op_type bd_type; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + struct ptlrpc_bulk_frag_ops *bd_frag_ops; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + + __u64 bd_last_mbits; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of associated MDs */ + struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; + + union { + struct { + /* + * encrypt iov, size is either 0 or bd_iov_count. 
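The predicates above are pure bit tests on bd_type, so their behaviour can be illustrated without building a descriptor. A small sketch; reading a bulk write as a passive "GET source" on the client side is the usual convention and is stated here as an assumption:

static void demo_bulk_type_checks(void)
{
        /* Client side of a bulk write: the peer GETs data from us, so we
         * are passive; the pages are described by a kiov, not a kvec. */
        enum ptlrpc_bulk_op_type type = PTLRPC_BULK_GET_SOURCE |
                                        PTLRPC_BULK_BUF_KIOV;

        LASSERT(ptlrpc_is_bulk_op_get(type));
        LASSERT(ptlrpc_is_bulk_op_passive(type));
        LASSERT(ptlrpc_is_bulk_get_source(type));
        LASSERT(ptlrpc_is_bulk_desc_kiov(type));
        LASSERT(!ptlrpc_is_bulk_desc_kvec(type));
}

Note that the kvec/kiov predicates OR both buffer bits before comparing, so a type with neither (or both) buffer kinds set fails both checks.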
+ */ + lnet_kiov_t *bd_enc_vec; + lnet_kiov_t *bd_vec; + } bd_kiov; + + struct { + struct kvec *bd_enc_kvec; + struct kvec *bd_kvec; + } bd_kvec; + } bd_u; + +}; + +#define GET_KIOV(desc) ((desc)->bd_u.bd_kiov.bd_vec) +#define BD_GET_KIOV(desc, i) ((desc)->bd_u.bd_kiov.bd_vec[i]) +#define GET_ENC_KIOV(desc) ((desc)->bd_u.bd_kiov.bd_enc_vec) +#define BD_GET_ENC_KIOV(desc, i) ((desc)->bd_u.bd_kiov.bd_enc_vec[i]) +#define GET_KVEC(desc) ((desc)->bd_u.bd_kvec.bd_kvec) +#define BD_GET_KVEC(desc, i) ((desc)->bd_u.bd_kvec.bd_kvec[i]) +#define GET_ENC_KVEC(desc) ((desc)->bd_u.bd_kvec.bd_enc_kvec) +#define BD_GET_ENC_KVEC(desc, i) ((desc)->bd_u.bd_kvec.bd_enc_kvec[i]) + +enum { + SVC_INIT = 0, + SVC_STOPPED = 1 << 0, + SVC_STOPPING = 1 << 1, + SVC_STARTING = 1 << 2, + SVC_RUNNING = 1 << 3, + SVC_EVENT = 1 << 4, + SVC_SIGNAL = 1 << 5, +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated vmalloc'd memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread pid + */ + pid_t t_pid; + /** + * put watchdog in the structure per thread b=14840 + */ + struct lc_watchdog *t_watchdog; + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline int thread_is_event(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_EVENT); +} + +static inline int thread_is_signal(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_SIGNAL); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + +/** + * Request buffer descriptor structure. + * This is a structure that contains one posted request buffer for service. + * Once data land into a buffer, event callback creates actual request and + * notifies wakes one of the service threads to process new incoming request. + * More than one request can fit into the buffer. 
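The SVC_* flags and the thread_* helpers above are what a service thread's main loop uses to publish its state. A hypothetical fragment, with the actual request handling and sleeping elided:

static void demo_thread_body(struct ptlrpc_thread *thread)
{
        thread_add_flags(thread, SVC_RUNNING);
        wake_up(&thread->t_ctl_waitq);          /* the starter waits for this */

        while (!thread_is_stopping(thread)) {
                /* sleep on the service partition waitq, handle requests */
                if (thread_test_and_clear_flags(thread, SVC_EVENT)) {
                        /* react to whatever event was signalled */
                }
        }

        thread_clear_flags(thread, SVC_RUNNING);
        thread_add_flags(thread, SVC_STOPPED);
        wake_up(&thread->t_ctl_waitq);          /* the stopper waits for this */
}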
+ */ +struct ptlrpc_request_buffer_desc { + /** Link item for rqbds on a service */ + struct list_head rqbd_list; + /** History of requests for this buffer */ + struct list_head rqbd_reqs; + /** Back pointer to service for which this buffer is registered */ + struct ptlrpc_service_part *rqbd_svcpt; + /** LNet descriptor */ + struct lnet_handle_md rqbd_md_h; + int rqbd_refcount; + /** The buffer itself */ + char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + /** + * This "embedded" request structure is only used for the + * last request to fit into the buffer + */ + struct ptlrpc_request rqbd_req; +}; + +typedef int (*svc_handler_t)(struct ptlrpc_request *req); + +struct ptlrpc_service_ops { + /** + * if non-NULL called during thread creation (ptlrpc_start_thread()) + * to initialize service specific per-thread state. + */ + int (*so_thr_init)(struct ptlrpc_thread *thr); + /** + * if non-NULL called during thread shutdown (ptlrpc_main()) to + * destruct state created by ->srv_init(). + */ + void (*so_thr_done)(struct ptlrpc_thread *thr); + /** + * Handler function for incoming requests for this service + */ + int (*so_req_handler)(struct ptlrpc_request *req); + /** + * function to determine priority of the request, it's called + * on every new request + */ + int (*so_hpreq_handler)(struct ptlrpc_request *); + /** + * service-specific print fn + */ + void (*so_req_printer)(void *, struct ptlrpc_request *); +}; + +#ifndef __cfs_cacheline_aligned +/* NB: put it here for reducing patche dependence */ +# define __cfs_cacheline_aligned +#endif + +/** + * How many high priority requests to serve before serving one normal + * priority request + */ +#define PTLRPC_SVC_HP_RATIO 10 + +/** + * Definition of PortalRPC service. + * The service is listening on a particular portal (like tcp port) + * and perform actions for a specific server like IO service for OST + * or general metadata service for MDS. + */ +struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; + /** most often accessed fields */ + /** chain thru all services */ + struct list_head srv_list; + /** service operations table */ + struct ptlrpc_service_ops srv_ops; + /** only statically allocated strings here; we don't clean them */ + char *srv_name; + /** only statically allocated strings here; we don't clean them */ + char *srv_thread_name; + /** service thread list */ + struct list_head srv_threads; + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; + /** Root of /proc dir tree for this service */ + struct proc_dir_entry *srv_procroot; + /** Pointer to statistic data for this service */ + struct lprocfs_stats *srv_stats; + /** # hp per lp reqs to handle */ + int srv_hpreq_ratio; + /** biggest request to receive */ + int srv_max_req_size; + /** biggest reply to send */ + int srv_max_reply_size; + /** size of individual buffers */ + int srv_buf_size; + /** # buffers to allocate in 1 group */ + int srv_nbuf_per_group; + /** Local portal on which to receive requests */ + __u32 srv_req_portal; + /** Portal on the client to send replies to */ + __u32 srv_rep_portal; + /** + * Tags for lu_context associated with this thread, see struct + * lu_context. 
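A service wires its behaviour through ptlrpc_service_ops above; in practice only so_req_handler is required, the other callbacks may be left NULL. A hypothetical handler and ops table (the demo_* names are illustrative; ptlrpc_hpreq_handler is the stock high-priority callback declared later in this header):

static int demo_req_handler(struct ptlrpc_request *req)
{
        /* unpack the request from rq_reqmsg, do the work, pack a reply */
        return 0;
}

static struct ptlrpc_service_ops demo_svc_ops = {
        .so_req_handler         = demo_req_handler,
        .so_hpreq_handler       = ptlrpc_hpreq_handler,
};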
+ */ + __u32 srv_ctx_tags; + /** soft watchdog timeout multiplier */ + int srv_watchdog_factor; + /** under unregister_service */ + unsigned srv_is_stopping:1; + + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service bound on */ + int srv_ncpts; + /** CPTs array this service bound on */ + __u32 *srv_cpts; + /** 2^srv_cptab_bits >= cfs_cpt_numbert(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; + +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, but we + * will have multiple instances very soon (instance per CPT). + * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue. + */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in tick */ + cfs_duration_t scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. 
+ */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + struct timer_list scp_at_timer; + /** debug */ + cfs_time_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in kthread_run() + */ + char pc_name[16]; + /** + * CPT the thread is bound on. + */ + int pc_cpt; + /** + * Index of ptlrpcd thread in the array. + */ + int pc_index; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; + /** + * Error code if the thread failed to fully start. + */ + int pc_error; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = 1 << 0, + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = 1 << 1, + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. 
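ptlrpc_service_for_each_part above is the usual way to walk every CPT partition of a service. A small sketch that counts the requests currently being served, taking scp_req_lock as the partition's own comments require:

static int demo_count_active(struct ptlrpc_service *svc)
{
        struct ptlrpc_service_part *svcpt;
        int total = 0;
        int i;

        ptlrpc_service_for_each_part(svcpt, i, svc) {
                spin_lock(&svcpt->scp_req_lock);
                total += svcpt->scp_nreqs_active;
                spin_unlock(&svcpt->scp_req_lock);
        }
        return total;
}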
+ */ + LIOD_RECOVERY = 1 << 3, +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern struct lnet_handle_eq ptlrpc_eq_h; +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(struct lnet_event *ev); +extern void reply_in_callback(struct lnet_event *ev); +extern void client_bulk_callback(struct lnet_event *ev); +extern void request_in_callback(struct lnet_event *ev); +extern void reply_out_callback(struct lnet_event *ev); +#ifdef HAVE_SERVER_SUPPORT +extern void server_bulk_callback(struct lnet_event *ev); +#endif +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(struct lnet_process_id peer, + lnet_nid_t self, + struct obd_uuid *uuid); +int ptlrpc_connection_put(struct ptlrpc_connection *c); +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* + * Check if the peer connection is on the local node. We need to use GFP_NOFS + * for requests from a local client to avoid recursing into the filesystem + * as we might end up waiting on a page sent in the request we're serving. + * + * Use __GFP_HIGHMEM so that the pages can use all of the available memory + * on 32-bit machines. Use more aggressive GFP_HIGHUSER flags from non-local + * clients to be able to generate more memory pressure on the OSS and allow + * inactive pages to be reclaimed, since it doesn't have any other processes + * or allocations that generate memory reclaim pressure. + * + * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. 
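The allocation policy described in the comment above amounts to a single branch. A sketch of that choice, with ptlrpc_connection_is_local() being the helper defined immediately below:

static gfp_t demo_bulk_gfp(struct ptlrpc_connection *conn)
{
        /* local client: avoid filesystem recursion, still allow highmem */
        if (ptlrpc_connection_is_local(conn))
                return GFP_NOFS | __GFP_HIGHMEM;
        /* remote client: let the allocation apply full reclaim pressure */
        return GFP_HIGHUSER;
}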
+ */ +static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn) +{ + if (!conn) + return false; + + if (conn->c_peer.nid == conn->c_self) + return true; + + RETURN(LNetIsPeerLocal(conn->c_peer.nid)); +} + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ +#ifdef HAVE_SERVER_SUPPORT +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); + +static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc != NULL); + + spin_lock(&desc->bd_lock); + rc = desc->bd_md_count; + spin_unlock(&desc->bd_lock); + return rc; +} +#endif + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (req->rq_bulk_deadline > ktime_get_real_seconds()) + return 1; + + if (!desc) + return 0; + + spin_lock(&desc->bd_lock); + rc = desc->bd_md_count; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
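On the server side the niobuf entry points above are normally reached through a small amount of glue at the end of request handling. A hedged sketch of that glue:

static int demo_finish_request(struct ptlrpc_request *req, int rc)
{
        if (rc == 0)
                return ptlrpc_reply(req);       /* reply already packed */

        req->rq_status = rc;
        return ptlrpc_error(req);               /* send an error reply */
}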
+ * @{ + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force); + +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(struct ptlrpc_request_set *); +void ptlrpc_mark_interrupted(struct ptlrpc_request *req); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + int (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); + +int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, + void *frag, int len); +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin); +static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); + +static inline void ptlrpc_release_bulk_page_pin(struct ptlrpc_bulk_desc *desc) +{ + int i; + + for (i = 0; i < desc->bd_iov_count ; i++) + put_page(BD_GET_KIOV(desc, i).kiov_page); +} + +static inline void ptlrpc_release_bulk_noop(struct ptlrpc_bulk_desc *desc) +{ +} + +void 
ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to post */ + unsigned int bc_buf_size; + /* portal to listed for requests on */ + unsigned int bc_req_portal; + /* portal of where to send replies to */ + unsigned int bc_rep_portal; + /* maximum request size to be accepted for this service */ + unsigned int bc_req_max_size; + /* maximum reply size this service can ever send */ + unsigned int bc_rep_max_size; +}; + +struct ptlrpc_service_thr_conf { + /* threadname should be 8 characters or less - 6 will be added on */ + char *tc_thr_name; + /* threads increasing factor for each CPU */ + unsigned int tc_thr_factor; + /* service threads # to start on each partition while initializing */ + unsigned int tc_nthrs_init; + /* + * low water of threads # upper-limit on each partition while running, + * service availability may be impacted if threads number is lower + * than this value. It can be ZERO if the service doesn't require + * CPU affinity or there is only one partition. + */ + unsigned int tc_nthrs_base; + /* "soft" limit for total threads number */ + unsigned int tc_nthrs_max; + /* user specified threads number, it will be validated due to + * other members of this structure. */ + unsigned int tc_nthrs_user; + /* set NUMA node affinity for service threads */ + unsigned int tc_cpu_affinity; + /* Tags for lu_context associated with service thread */ + __u32 tc_ctx_tags; +}; + +struct ptlrpc_service_cpt_conf { + struct cfs_cpt_table *cc_cptable; + /* string pattern to describe CPTs for a service */ + char *cc_pattern; +}; + +struct ptlrpc_service_conf { + /* service name */ + char *psc_name; + /* soft watchdog timeout multiplifier to print stuck service traces */ + unsigned int psc_watchdog_factor; + /* buffer information */ + struct ptlrpc_service_buf_conf psc_buf; + /* thread information */ + struct ptlrpc_service_thr_conf psc_thr; + /* CPU partition information */ + struct ptlrpc_service_cpt_conf psc_cpt; + /* function table */ + struct ptlrpc_service_ops psc_ops; +}; + +/* ptlrpc/service.c */ +/** + * Server-side services API. 
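Putting the configuration structures above together, a hypothetical registration might look as follows. All names, sizes and portal numbers are made up; demo_req_handler is the handler sketched earlier, and ptlrpc_register_service() is declared just below.

static struct ptlrpc_service *demo_service;

static int demo_register_service(struct proc_dir_entry *proc_entry)
{
        struct ptlrpc_service_conf conf = {
                .psc_name               = "demo",
                .psc_watchdog_factor    = 15,
                .psc_buf = {
                        .bc_nbufs               = 64,
                        .bc_buf_size            = 4096,
                        .bc_req_max_size        = 2048,
                        .bc_rep_max_size        = 2048,
                        .bc_req_portal          = 30,   /* made-up portals */
                        .bc_rep_portal          = 31,
                },
                .psc_thr = {
                        .tc_thr_name            = "demo",
                        .tc_nthrs_init          = 2,
                        .tc_nthrs_max           = 8,
                },
                .psc_ops = {
                        .so_req_handler         = demo_req_handler,
                },
        };

        demo_service = ptlrpc_register_service(&conf, proc_entry);
        if (IS_ERR(demo_service))
                return PTR_ERR(demo_service);
        return 0;
}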
Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry); +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); + +int ptlrpc_start_threads(struct ptlrpc_service *svc); +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services(void *arg); +void ptlrpc_daemonize(char *name); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); +void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); +void ptlrpc_import_enter_resend(struct obd_import *imp); +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index); +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + __u32 index); +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count); +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lengths); +__u32 lustre_msg_size_v2(int count, __u32 *lengths); +__u32 lustre_packed_msg_size(struct lustre_msg *msg); +__u32 lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size); +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 minlen); +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n); +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len); +__u32 lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg 
*m, __u32 n, __u32 max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_set_flags(struct lustre_msg *msg, __u32 flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +__u32 lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u16 lustre_msg_get_tag(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u64 lustre_msg_get_mbits(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); +void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag); +void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} + +#ifdef LUSTRE_TRANSLATE_ERRNOS + +static inline int ptlrpc_status_hton(int h) +{ + /* + * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, + * ELDLM_LOCK_ABORTED, etc. 
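The accessor family above is how handlers and interpreters read and write lustre_msg fields without touching the wire format directly. A short sketch; CDEBUG/D_RPCTRACE are the usual libcfs debugging macros and are assumed available, and ptlrpc_status_hton() is the helper whose definition starts just below:

static void demo_msg_accessors(struct ptlrpc_request *req, int rc)
{
        __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);

        CDEBUG(D_RPCTRACE, "handled opc %u: rc = %d\n", opc, rc);

        /* store the (possibly translated) status in the reply message */
        lustre_msg_set_status(req->rq_repmsg, ptlrpc_status_hton(rc));
}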
+ */ + if (h < 0) + return -lustre_errno_hton(-h); + else + return h; +} + +static inline int ptlrpc_status_ntoh(int n) +{ + /* + * See the comment in ptlrpc_status_hton(). + */ + if (n < 0) + return -lustre_errno_ntoh(-n); + else + return n; +} + +#else + +#define ptlrpc_status_hton(h) (h) +#define ptlrpc_status_ntoh(n) (n) + +#endif +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREG_RPC || + new_phase == RQ_PHASE_UNREG_BULK) { + /* No embedded unregistering phases */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) + return; + + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (req->rq_reply_deadline > ktime_get_real_seconds()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (req->rq_reply_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + if (req->rq_req_deadline > ktime_get_real_seconds()) { + spin_unlock(&req->rq_lock); + return 1; + } + + rc = !req->rq_req_unlinked || !req->rq_reply_unlinked || + req->rq_receiving_reply; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + smp_mb(); + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + +static inline __u32 lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + 
req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + cfs_time_before(cfs_time_add(req->rq_queued_time, + cfs_time_seconds(req->rq_delay_limit)), + cfs_time_current())) { + return 1; + } + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obddev); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp); +#endif + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list); +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event); +struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +void ping_evictor_start(void); +void ping_evictor_stop(void); +void ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char* ll_opcode2str(__u32 opcode); +const int ll_str2opcode(const char *ops); +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void 
ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_server.c */ +int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_destroy(struct ptlrpc_request *req); +int llog_origin_handle_prev_block(struct ptlrpc_request *req); +int llog_origin_handle_next_block(struct ptlrpc_request *req); +int llog_origin_handle_read_header(struct ptlrpc_request *req); +int llog_origin_handle_close(struct ptlrpc_request *req); + +/* ptlrpc/llog_client.c */ +extern struct llog_operations llog_client_ops; +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h new file mode 100644 index 0000000000000..7cabc6f2424d7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * Author: Joshua Walgenbach + */ + +#ifndef _LUSTRE_NODEMAP_H +#define _LUSTRE_NODEMAP_H + +#include + +#define LUSTRE_NODEMAP_NAME "nodemap" + +#define LUSTRE_NODEMAP_DEFAULT_ID 0 + +/** enums containing the types of ids contained in a nodemap + * kept so other modules (mgs, mdt, etc) can define the type + * of search easily + */ + +enum nodemap_id_type { + NODEMAP_UID, + NODEMAP_GID, +}; + +enum nodemap_tree_type { + NODEMAP_FS_TO_CLIENT, + NODEMAP_CLIENT_TO_FS, +}; + +enum nodemap_mapping_modes { + NODEMAP_MAP_BOTH, + NODEMAP_MAP_UID_ONLY, + NODEMAP_MAP_GID_ONLY, +}; + +struct nodemap_pde { + char npe_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + struct proc_dir_entry *npe_proc_entry; + struct list_head npe_list_member; +}; + +/** The nodemap id 0 will be the default nodemap. 
It will have a configuration + * set by the MGS, but no ranges will be allowed as all NIDs that do not map + * will be added to the default nodemap + */ + +struct lu_nodemap { + /* human readable ID */ + char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + /* flags to govern nodemap behavior */ + bool nmf_trust_client_ids:1, + nmf_deny_unknown:1, + nmf_allow_root_access:1, + nmf_map_uid_only:1, + nmf_map_gid_only:1; + /* unique ID set by MGS */ + unsigned int nm_id; + /* nodemap ref counter */ + atomic_t nm_refcount; + /* UID to squash unmapped UIDs */ + uid_t nm_squash_uid; + /* GID to squash unmapped GIDs */ + gid_t nm_squash_gid; + /* NID range list */ + struct list_head nm_ranges; + /* lock for idmap red/black trees */ + struct rw_semaphore nm_idmap_lock; + /* UID map keyed by local UID */ + struct rb_root nm_fs_to_client_uidmap; + /* UID map keyed by remote UID */ + struct rb_root nm_client_to_fs_uidmap; + /* GID map keyed by local UID */ + struct rb_root nm_fs_to_client_gidmap; + /* GID map keyed by remote UID */ + struct rb_root nm_client_to_fs_gidmap; + /* attached client members of this nodemap */ + struct mutex nm_member_list_lock; + struct list_head nm_member_list; + /* access by nodemap name */ + struct hlist_node nm_hash; + struct nodemap_pde *nm_pde_data; + /* fileset the nodes of this nodemap are restricted to */ + char nm_fileset[PATH_MAX+1]; + + /* used when loading/unloading nodemaps */ + struct list_head nm_list; +}; + +/* Store handles to local MGC storage to save config locally. In future + * versions of nodemap, mgc will receive the config directly and so this might + * not be needed. + */ +struct nm_config_file { + struct local_oid_storage *ncf_los; + struct dt_object *ncf_obj; + struct list_head ncf_list; +}; + +void nodemap_activate(const bool value); +int nodemap_add(const char *nodemap_name); +int nodemap_del(const char *nodemap_name); +int nodemap_add_member(lnet_nid_t nid, struct obd_export *exp); +void nodemap_del_member(struct obd_export *exp); +int nodemap_parse_range(const char *range_string, lnet_nid_t range[2]); +int nodemap_parse_idmap(char *idmap_string, __u32 idmap[2]); +int nodemap_add_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_del_range(const char *name, const lnet_nid_t nid[2]); +int nodemap_set_allow_root(const char *name, bool allow_root); +int nodemap_set_trust_client_ids(const char *name, bool trust_client_ids); +int nodemap_set_deny_unknown(const char *name, bool deny_unknown); +int nodemap_set_mapping_mode(const char *name, enum nodemap_mapping_modes mode); +int nodemap_set_squash_uid(const char *name, uid_t uid); +int nodemap_set_squash_gid(const char *name, gid_t gid); +bool nodemap_can_setquota(const struct lu_nodemap *nodemap); +int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_set_fileset(const char *name, const char *fileset); +char *nodemap_get_fileset(const struct lu_nodemap *nodemap); +__u32 nodemap_map_id(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + enum nodemap_tree_type tree_type, __u32 id); +ssize_t nodemap_map_acl(struct lu_nodemap *nodemap, void *buf, size_t size, + enum nodemap_tree_type tree_type); +#ifdef HAVE_SERVER_SUPPORT +void nodemap_test_nid(lnet_nid_t nid, char *name_buf, size_t name_len); +#else +#define nodemap_test_nid(nid, name_buf, name_len) do {} while(0) +#endif +int nodemap_test_id(lnet_nid_t nid, enum nodemap_id_type idtype, + 
__u32 client_id, __u32 *fs_id); + +struct nm_config_file *nm_config_file_register_mgs(const struct lu_env *env, + struct dt_object *obj, + struct local_oid_storage *los); +struct dt_device; +struct nm_config_file *nm_config_file_register_tgt(const struct lu_env *env, + struct dt_device *dev, + struct local_oid_storage *los); +void nm_config_file_deregister_mgs(const struct lu_env *env, + struct nm_config_file *ncf); +void nm_config_file_deregister_tgt(const struct lu_env *env, + struct nm_config_file *ncf); +struct lu_nodemap *nodemap_get_from_exp(struct obd_export *exp); +void nodemap_putref(struct lu_nodemap *nodemap); + +#ifdef HAVE_SERVER_SUPPORT +struct nodemap_range_tree { + struct interval_node *nmrt_range_interval_root; + unsigned int nmrt_range_highest_id; +}; + +struct nodemap_config { + /* Highest numerical lu_nodemap.nm_id defined */ + unsigned int nmc_nodemap_highest_id; + + /* Simple flag to determine if nodemaps are active */ + bool nmc_nodemap_is_active; + + /* Pointer to default nodemap as it is needed more often */ + struct lu_nodemap *nmc_default_nodemap; + + /** + * Lock required to access the range tree. + */ + struct rw_semaphore nmc_range_tree_lock; + struct nodemap_range_tree nmc_range_tree; + + /** + * Hash keyed on nodemap name containing all + * nodemaps + */ + struct cfs_hash *nmc_nodemap_hash; +}; + +struct nodemap_config *nodemap_config_alloc(void); +void nodemap_config_dealloc(struct nodemap_config *config); +void nodemap_config_set_active_mgc(struct nodemap_config *config); + +int nodemap_process_idx_pages(struct nodemap_config *config, union lu_page *lip, + struct lu_nodemap **recent_nodemap); + +#else /* disable nodemap processing in MGC of non-servers */ +static inline int nodemap_process_idx_pages(void *config, + union lu_page *lip, + struct lu_nodemap **recent_nodemap) +{ return 0; } +#endif /* HAVE_SERVER_SUPPORT */ + +int nodemap_get_config_req(struct obd_device *mgs_obd, + struct ptlrpc_request *req); +#endif /* _LUSTRE_NODEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h new file mode 100644 index 0000000000000..6397cf2f0d377 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs.h @@ -0,0 +1,738 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) + * + */ + +#ifndef _LUSTRE_NRS_H +#define _LUSTRE_NRS_H + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. 
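nodemap_map_id() above translates a client UID or GID into a filesystem ID according to the nodemap flags: trusted nodemaps pass IDs through, otherwise an idmap lookup is performed and IDs without a mapping are squashed to nm_squash_uid/nm_squash_gid. A minimal, self-contained sketch of that decision follows; every demo_* name is invented for illustration, and the real code also handles mapping direction, root access and the UID-only/GID-only modes.

#include <stdbool.h>
#include <stdint.h>

struct demo_nodemap {
        bool     trust_client_ids;      /* pass client IDs through unchanged */
        uint32_t squash_id;             /* used for IDs that have no mapping */
};

/* Stand-in for the per-nodemap idmap trees (nm_client_to_fs_uidmap etc.);
 * returns true and fills *fs_id when an explicit mapping exists. */
static bool demo_idmap_lookup(const struct demo_nodemap *nm,
                              uint32_t client_id, uint32_t *fs_id)
{
        static const uint32_t map[][2] = { { 1000, 5000 }, { 1001, 5001 } };
        unsigned int i;

        (void)nm;
        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                if (map[i][0] == client_id) {
                        *fs_id = map[i][1];
                        return true;
                }
        }
        return false;
}

static uint32_t demo_map_id(const struct demo_nodemap *nm, uint32_t client_id)
{
        uint32_t fs_id;

        if (nm->trust_client_ids)
                return client_id;       /* trusted: no translation */
        if (demo_idmap_lookup(nm, client_id, &fs_id))
                return fs_id;           /* explicit idmap entry */
        return nm->squash_id;           /* unmapped IDs are squashed */
}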
+ */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in,out] policy The policy being started + * \param[in,out] arg A generic char buffer + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy, + char *arg); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). 
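The ptlrpc_nrs_pol_ops table documented here is a callback table that NRS core drives on policy and request events, with op_req_enqueue, op_req_dequeue and op_req_get being the mandatory queueing hooks. The stand-alone sketch below shows the shape of such an ops table for a trivial FIFO-like policy; all demo_* names are invented and this is not the Lustre interface itself.

#include <stddef.h>

struct demo_req {
        struct demo_req *next;
};

struct demo_policy;

/* Reduced "ops" table: only the mandatory queueing hooks are shown. */
struct demo_pol_ops {
        int  (*op_req_enqueue)(struct demo_policy *pol, struct demo_req *req);
        void (*op_req_dequeue)(struct demo_policy *pol, struct demo_req *req);
        struct demo_req *(*op_req_get)(struct demo_policy *pol, int peek);
};

struct demo_policy {
        const struct demo_pol_ops *ops;
        struct demo_req *head, *tail;   /* FIFO backing store */
};

static int demo_enqueue(struct demo_policy *pol, struct demo_req *req)
{
        req->next = NULL;
        if (pol->tail)
                pol->tail->next = req;
        else
                pol->head = req;
        pol->tail = req;
        return 0;
}

static struct demo_req *demo_get(struct demo_policy *pol, int peek)
{
        (void)peek;             /* peeking never removes the request */
        return pol->head;
}

static void demo_dequeue(struct demo_policy *pol, struct demo_req *req)
{
        if (pol->head == req) { /* toy FIFO: only the head is handled next */
                pol->head = req->next;
                if (pol->head == NULL)
                        pol->tail = NULL;
        }
}

static const struct demo_pol_ops demo_fifo_ops = {
        .op_req_enqueue = demo_enqueue,
        .op_req_dequeue = demo_dequeue,
        .op_req_get     = demo_get,
};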
+ * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. + * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. + * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request being carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre assert_spin_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unegisters the policy's lprocfs interface with a PTLRPC service. 
+ * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc). + */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = (1 << 0), + PTLRPC_NRS_QUEUE_HP = (1 << 1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. 
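The description of primary and fallback policies boils down to a simple selection rule at enqueue time; a compressed, hypothetical rendering of it (invented demo_* names, not the actual nrs_request_enqueue() code) is:

#include <stdbool.h>

struct demo_req;

struct demo_policy {
        bool started;           /* corresponds to NRS_POL_STATE_STARTED */
        /* returns 0 when the policy accepts the request */
        int (*enqueue)(struct demo_policy *pol, struct demo_req *req);
};

struct demo_nrs_head {
        struct demo_policy *primary;    /* may be NULL or not yet started */
        struct demo_policy *fallback;   /* always present and started */
};

static int demo_nrs_enqueue(struct demo_nrs_head *nrs, struct demo_req *req)
{
        struct demo_policy *pol = nrs->primary;

        /* Offer the request to the primary policy first ... */
        if (pol && pol->started && pol->enqueue(pol, req) == 0)
                return 0;

        /* ... and fall back when it is absent, stopped or refuses. */
        return nrs->fallback->enqueue(nrs->fallback, req);
}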
+ */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; + /** + * NRS policy is throttling reqeust + */ + unsigned nrs_throttling:1; +}; + +#define NRS_POL_NAME_MAX 16 +#define NRS_POL_ARG_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. + * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. + */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + struct module *nc_owner; + /** + * Policy registration flags; a bitmast of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. 
+ * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. + * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during + * unregistration time, then unregistration and lprocfs operations + * will be properly serialized. + */ + struct module *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Policy argument + */ + char pi_arg[NRS_POL_ARG_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? 
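The policy states form a small lifecycle driven by lprocfs start/stop requests. The sketch below encodes the transitions the comments imply; the edge from STARTING back to STOPPED (a failed start) is an assumption, and the demo_* names are invented.

enum demo_pol_state {
        DEMO_POL_STOPPED,
        DEMO_POL_STOPPING,
        DEMO_POL_STARTING,
        DEMO_POL_STARTED,
};

/* Returns nonzero when the transition matches the documented lifecycle. */
static int demo_pol_transition_ok(enum demo_pol_state from,
                                  enum demo_pol_state to)
{
        switch (from) {
        case DEMO_POL_STOPPED:
                return to == DEMO_POL_STARTING;
        case DEMO_POL_STARTING:        /* failed start is assumed to stop */
                return to == DEMO_POL_STARTED || to == DEMO_POL_STOPPED;
        case DEMO_POL_STARTED:
                return to == DEMO_POL_STOPPING;
        case DEMO_POL_STOPPING:
                return to == DEMO_POL_STOPPED;
        }
        return 0;
}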
+ */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. + */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * Human-readable policy argument + */ + char pol_arg[NRS_POL_ARG_MAX]; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. + */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +#include +#include +#include +#include +#include + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. 
+ * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + struct cfs_binheap_node nr_node; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + /** + * CRR-N request defintion + */ + struct nrs_crrn_req crr; + /** ORR and TRR share the same request definition */ + struct nrs_orr_req orr; + /** + * TBF request definition + */ + struct nrs_tbf_req tbf; + /** + * Fields for the delay policy + */ + struct nrs_delay_req delay; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h new file mode 100644 index 0000000000000..f057ec72d9289 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_crr.h @@ -0,0 +1,126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Client Round Robin over NIDs (CRR-N) policy + * + */ + +#ifndef _LUSTRE_NRS_CRR_H +#define _LUSTRE_NRS_CRR_H + +/** + * \name CRR-N + * + * CRR-N, Client Round Robin over NIDs + * @{ + */ + +/** + * private data structure for CRR-N NRS + */ +struct nrs_crrn_net { + struct ptlrpc_nrs_resource cn_res; + struct cfs_binheap *cn_binheap; + struct cfs_hash *cn_cli_hash; + /** + * Used when a new scheduling round commences, in order to synchronize + * all clients with the new round number. + */ + __u64 cn_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 cn_sequence; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each client can have in a scheduling round. + */ + __u16 cn_quantum; +}; + +/** + * Object representing a client in CRR-N, as identified by its NID + */ +struct nrs_crrn_client { + struct ptlrpc_nrs_resource cc_res; + struct hlist_node cc_hnode; + lnet_nid_t cc_nid; + /** + * The round number against which this client is currently scheduling + * requests. + */ + __u64 cc_round; + /** + * The sequence number used for requests scheduled by this client during + * the current round number. + */ + __u64 cc_sequence; + atomic_t cc_ref; + /** + * Round Robin quantum; the maximum number of RPCs the client is allowed + * to schedule in a single batch of each round. 
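The cc_round, cc_sequence and cc_quantum fields above implement batched round robin: a client may place up to cc_quantum requests into its current batch, further requests spill into the next round, and batches are served in (round, sequence) order. A simplified, self-contained sketch of that keying (invented demo_* names; a freshly added client is assumed to start synchronized with the scheduler's current round):

#include <stdint.h>

struct demo_crr_net {           /* per-policy state, like nrs_crrn_net */
        uint64_t round;
        uint64_t sequence;
};

struct demo_crr_client {        /* per-NID state, like nrs_crrn_client */
        uint64_t round;
        uint64_t sequence;
        uint16_t quantum;       /* max requests per batch */
        uint16_t in_batch;      /* requests already keyed in this round */
};

struct demo_crr_key {
        uint64_t round;
        uint64_t sequence;
};

static struct demo_crr_key
demo_crr_key_assign(struct demo_crr_net *net, struct demo_crr_client *cli)
{
        struct demo_crr_key key;

        /* A client that fell behind rejoins the scheduler's current round. */
        if (cli->round < net->round) {
                cli->round = net->round;
                cli->in_batch = 0;
                cli->sequence = net->sequence++;
        }
        /* A full batch pushes further requests into the next round. */
        if (cli->in_batch == cli->quantum) {
                cli->round++;
                cli->in_batch = 0;
                cli->sequence = net->sequence++;
        }
        cli->in_batch++;

        key.round = cli->round;
        key.sequence = cli->sequence;
        return key;
}

/* Binary-heap ordering: earlier rounds first, then earlier batches. */
static int demo_crr_key_before(struct demo_crr_key a, struct demo_crr_key b)
{
        return a.round < b.round ||
               (a.round == b.round && a.sequence < b.sequence);
}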
+ */ + __u16 cc_quantum; + /** + * # of pending requests for this client, on all existing rounds + */ + __u16 cc_active; +}; + +/** + * CRR-N NRS request definition + */ +struct nrs_crrn_req { + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 cr_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 cr_sequence; +}; + +/** + * CRR-N policy operations. + */ +enum nrs_ctl_crr { + /** + * Read the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_WR_QUANTUM, +}; + +/** @} CRR-N */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h new file mode 100644 index 0000000000000..01605a7f4129e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_delay.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Delay policy + * + */ + +#ifndef _LUSTRE_NRS_DELAY_H +#define _LUSTRE_NRS_DELAY_H + +/* \name delay + * + * Delay policy + * @{ + */ + +/** + * Private data structure for the delay policy + */ +struct nrs_delay_data { + struct ptlrpc_nrs_resource delay_res; + + /** + * Delayed requests are stored in this binheap until they are + * removed for handling. + */ + struct cfs_binheap *delay_binheap; + + /** + * Minimum service time + */ + __u32 min_delay; + + /** + * Maximum service time + */ + __u32 max_delay; + + /** + * We'll delay this percent of requests + */ + __u32 delay_pct; +}; + +struct nrs_delay_req { + /** + * This is the time at which a request becomes eligible for handling + */ + time64_t req_start_time; +}; + +enum nrs_ctl_delay { + NRS_CTL_DELAY_RD_MIN = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_DELAY_WR_MIN, + NRS_CTL_DELAY_RD_MAX, + NRS_CTL_DELAY_WR_MAX, + NRS_CTL_DELAY_RD_PCT, + NRS_CTL_DELAY_WR_PCT, +}; + +/** @} delay */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h new file mode 100644 index 0000000000000..3b5418eac6c44 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_fifo.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
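For the delay policy declared above (nrs_delay_data/nrs_delay_req), delay_pct selects a share of requests and each selected request only becomes eligible at a start time between min_delay and max_delay seconds in the future. A stand-alone sketch of that selection, assuming a uniform random draw and max_delay >= min_delay (both assumptions, and all demo_* names invented):

#include <stdint.h>
#include <stdlib.h>
#include <time.h>

/* Returns the time at which the request becomes eligible for handling;
 * requests that are not selected for delaying are eligible immediately. */
static time_t demo_delay_start_time(uint32_t min_delay, uint32_t max_delay,
                                    uint32_t delay_pct)
{
        time_t now = time(NULL);

        if ((uint32_t)(rand() % 100) >= delay_pct)
                return now;                     /* not delayed */

        /* Pick a service delay somewhere in [min_delay, max_delay]. */
        return now + (time_t)min_delay +
               (time_t)(rand() % (max_delay - min_delay + 1));
}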
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) First-in First-out (FIFO) policy + * + */ + +#ifndef _LUSTRE_NRS_FIFO_H +#define _LUSTRE_NRS_FIFO_H + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h new file mode 100644 index 0000000000000..d9789b26286aa --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_orr.h @@ -0,0 +1,225 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * + * Network Request Scheduler (NRS) Object-based Round Robin and Target-based + * Round Robin (ORR and TRR) policies + * + */ + +#ifndef _LUSTRE_NRS_ORR_H +#define _LUSTRE_NRS_ORR_H + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * \name ORR/TRR + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * @{ + */ + +/** + * Lower and upper byte offsets of a brw RPC + */ +struct nrs_orr_req_range { + __u64 or_start; + __u64 or_end; +}; + +/** + * RPC types supported by the ORR/TRR policies + */ +enum nrs_orr_supp { + NOS_OST_READ = (1 << 0), + NOS_OST_WRITE = (1 << 1), + NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE), + /** + * Default value for policies. 
+ */ + NOS_DFLT = NOS_OST_READ +}; + +/** + * As unique keys for grouping RPCs together, we use the object's OST FID for + * the ORR policy, and the OST index for the TRR policy. + * + * XXX: We waste some space for TRR policy instances by using a union, but it + * allows to consolidate some of the code between ORR and TRR, and these + * policies will probably eventually merge into one anyway. + */ +struct nrs_orr_key { + union { + /** object FID for ORR */ + struct lu_fid ok_fid; + /** OST index for TRR */ + __u32 ok_idx; + }; +}; + +/** + * The largest base string for unique hash/slab object names is + * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT + * id number, so this _should_ be more than enough for the maximum number of + * CPTs on any system. If it does happen that this statement is incorrect, + * nrs_orr_genobjname() will inevitably yield a non-unique name and cause + * kmem_cache_create() to complain (on Linux), so the erroneous situation + * will hopefully not go unnoticed. + */ +#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3) + +/** + * private data structure for ORR and TRR NRS + */ +struct nrs_orr_data { + struct ptlrpc_nrs_resource od_res; + struct cfs_binheap *od_binheap; + struct cfs_hash *od_obj_hash; + struct kmem_cache *od_cache; + /** + * Used when a new scheduling round commences, in order to synchronize + * all object or OST batches with the new round number. + */ + __u64 od_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 od_sequence; + /** + * RPC types that are currently supported. + */ + enum nrs_orr_supp od_supp; + /** + * Round Robin quantum; the maxium number of RPCs that each request + * batch for each object or OST can have in a scheduling round. + */ + __u16 od_quantum; + /** + * Whether to use physical disk offsets or logical file offsets. + */ + bool od_physical; + /** + * XXX: We need to provide a persistently allocated string to hold + * unique object names for this policy, since in currently supported + * versions of Linux by Lustre, kmem_cache_create() just sets a pointer + * to the name string provided. kstrdup() is used in the version of + * kmeme_cache_create() in current Linux mainline, so we may be able to + * remove this in the future. + */ + char od_objname[NRS_ORR_OBJ_NAME_MAX]; +}; + +/** + * Represents a backend-fs object or OST in the ORR and TRR policies + * respectively + */ +struct nrs_orr_object { + struct ptlrpc_nrs_resource oo_res; + struct hlist_node oo_hnode; + /** + * The round number against which requests are being scheduled for this + * object or OST + */ + __u64 oo_round; + /** + * The sequence number used for requests scheduled for this object or + * OST during the current round number. + */ + __u64 oo_sequence; + /** + * The key of the object or OST for which this structure instance is + * scheduling RPCs + */ + struct nrs_orr_key oo_key; + long oo_ref; + /** + * Round Robin quantum; the maximum number of RPCs that are allowed to + * be scheduled for the object or OST in a single batch of each round. + */ + __u16 oo_quantum; + /** + * # of pending requests for this object or OST, on all existing rounds + */ + __u16 oo_active; +}; + +/** + * ORR/TRR NRS request definition + */ +struct nrs_orr_req { + /** + * The offset range this request covers + */ + struct nrs_orr_req_range or_range; + /** + * Round number for this request; shared with all other requests in the + * same batch. 
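The ORR/TRR structures above group brw RPCs by backend object FID (ORR) or OST index (TRR) and, within a batch, favour ascending offsets so the backend sees a roughly sequential stream. A reduced comparator illustrating that ordering (invented demo_* names; offsets correspond to the request's or_range):

#include <stdint.h>

struct demo_orr_req {
        uint64_t round;         /* scheduling round of the batch */
        uint64_t sequence;      /* batch order within the round */
        uint64_t start;         /* lower byte offset of the brw request */
        uint64_t end;           /* upper byte offset of the brw request */
};

/* Heap ordering: earlier rounds, then earlier batches, then ascending
 * start offset so requests sweep each object or OST front to back. */
static int demo_orr_req_before(const struct demo_orr_req *a,
                               const struct demo_orr_req *b)
{
        if (a->round != b->round)
                return a->round < b->round;
        if (a->sequence != b->sequence)
                return a->sequence < b->sequence;
        return a->start < b->start;
}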
+ */ + __u64 or_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 or_sequence; + /** + * For debugging purposes. + */ + struct nrs_orr_key or_key; + /** + * An ORR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_orr_set:1; + /** + * A TRR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_trr_set:1; + /** + * Request offset ranges have been filled in with logical offset + * values. + */ + unsigned int or_logical_set:1; + /** + * Request offset ranges have been filled in with physical offset + * values. + */ + unsigned int or_physical_set:1; +}; + +/** @} ORR/TRR */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h new file mode 100644 index 0000000000000..6e0c736ab8d87 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h @@ -0,0 +1,343 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * + * Network Request Scheduler (NRS) Token Bucket Filter(TBF) policy + * + */ + +#ifndef _LUSTRE_NRS_TBF_H +#define _LUSTRE_NRS_TBF_H + +/* \name tbf + * + * TBF policy + * + * @{ + */ + +struct nrs_tbf_head; +struct nrs_tbf_cmd; + +#define NRS_TBF_MATCH_FULL 0x0000001 +#define NRS_TBF_MATCH_WILDCARD 0x0000002 + +struct nrs_tbf_jobid { + char *tj_id; + __u32 tj_match_flag; + struct list_head tj_linkage; +}; + +#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + 3 + 2) +struct nrs_tbf_client { + /** Resource object for policy instance. */ + struct ptlrpc_nrs_resource tc_res; + /** Node in the hash table. */ + struct hlist_node tc_hnode; + /** NID of the client. */ + lnet_nid_t tc_nid; + /** Jobid of the client. */ + char tc_jobid[LUSTRE_JOBID_SIZE]; + /** opcode of the client. */ + __u32 tc_opcode; + /** Hash key of the client. */ + char tc_key[NRS_TBF_KEY_LEN]; + /** Reference number of the client. */ + atomic_t tc_ref; + /** Lock to protect rule and linkage. */ + spinlock_t tc_rule_lock; + /** Linkage to rule. */ + struct list_head tc_linkage; + /** Pointer to rule. */ + struct nrs_tbf_rule *tc_rule; + /** Generation of the rule matched. */ + __u64 tc_rule_generation; + /** Limit of RPC rate. */ + __u64 tc_rpc_rate; + /** Time to wait for next token. */ + __u64 tc_nsecs; + /** RPC token number. */ + __u64 tc_ntoken; + /** Token bucket depth. */ + __u64 tc_depth; + /** Time check-point. */ + __u64 tc_check_time; + /** List of queued requests. */ + struct list_head tc_list; + /** Node in binary heap. 
*/ + struct cfs_binheap_node tc_node; + /** Whether the client is in heap. */ + bool tc_in_heap; + /** Sequence of the newest rule. */ + __u32 tc_rule_sequence; + /** + * Linkage into LRU list. Protected bucket lock of + * nrs_tbf_head::th_cli_hash. + */ + struct list_head tc_lru; +}; + +#define MAX_TBF_NAME (16) + +#define NTRS_STOPPING 0x0000001 +#define NTRS_DEFAULT 0x0000002 + +struct nrs_tbf_rule { + /** Name of the rule. */ + char tr_name[MAX_TBF_NAME]; + /** Head belongs to. */ + struct nrs_tbf_head *tr_head; + /** Likage to head. */ + struct list_head tr_linkage; + /** Nid list of the rule. */ + struct list_head tr_nids; + /** Nid list string of the rule.*/ + char *tr_nids_str; + /** Jobid list of the rule. */ + struct list_head tr_jobids; + /** Jobid list string of the rule.*/ + char *tr_jobids_str; + /** Opcode bitmap of the rule. */ + struct cfs_bitmap *tr_opcodes; + /** Opcode list string of the rule.*/ + char *tr_opcodes_str; + /** Condition list of the rule.*/ + struct list_head tr_conds; + /** Generic condition string of the rule. */ + char *tr_conds_str; + /** RPC/s limit. */ + __u64 tr_rpc_rate; + /** Time to wait for next token. */ + __u64 tr_nsecs; + /** Token bucket depth. */ + __u64 tr_depth; + /** Lock to protect the list of clients. */ + spinlock_t tr_rule_lock; + /** List of client. */ + struct list_head tr_cli_list; + /** Flags of the rule. */ + __u32 tr_flags; + /** Usage Reference count taken on the rule. */ + atomic_t tr_ref; + /** Generation of the rule. */ + __u64 tr_generation; +}; + +struct nrs_tbf_ops { + char *o_name; + int (*o_startup)(struct ptlrpc_nrs_policy *, struct nrs_tbf_head *); + struct nrs_tbf_client *(*o_cli_find)(struct nrs_tbf_head *, + struct ptlrpc_request *); + struct nrs_tbf_client *(*o_cli_findadd)(struct nrs_tbf_head *, + struct nrs_tbf_client *); + void (*o_cli_put)(struct nrs_tbf_head *, struct nrs_tbf_client *); + void (*o_cli_init)(struct nrs_tbf_client *, struct ptlrpc_request *); + int (*o_rule_init)(struct ptlrpc_nrs_policy *, + struct nrs_tbf_rule *, + struct nrs_tbf_cmd *); + int (*o_rule_dump)(struct nrs_tbf_rule *, struct seq_file *); + int (*o_rule_match)(struct nrs_tbf_rule *, + struct nrs_tbf_client *); + void (*o_rule_fini)(struct nrs_tbf_rule *); +}; + +#define NRS_TBF_TYPE_JOBID "jobid" +#define NRS_TBF_TYPE_NID "nid" +#define NRS_TBF_TYPE_OPCODE "opcode" +#define NRS_TBF_TYPE_GENERIC "generic" +#define NRS_TBF_TYPE_MAX_LEN 20 + +enum nrs_tbf_flag { + NRS_TBF_FLAG_INVALID = 0x0000000, + NRS_TBF_FLAG_JOBID = 0x0000001, + NRS_TBF_FLAG_NID = 0x0000002, + NRS_TBF_FLAG_OPCODE = 0x0000004, + NRS_TBF_FLAG_GENERIC = 0x0000008, +}; + +struct nrs_tbf_type { + const char *ntt_name; + enum nrs_tbf_flag ntt_flag; + struct nrs_tbf_ops *ntt_ops; +}; + +struct nrs_tbf_bucket { + /** + * LRU list, updated on each access to client. Protected by + * bucket lock of nrs_tbf_head::th_cli_hash. + */ + struct list_head ntb_lru; +}; + +/** + * Private data structure for the TBF policy + */ +struct nrs_tbf_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource th_res; + /** + * List of rules. + */ + struct list_head th_list; + /** + * Lock to protect the list of rules. + */ + spinlock_t th_rule_lock; + /** + * Generation of rules. + */ + atomic_t th_rule_sequence; + /** + * Default rule. + */ + struct nrs_tbf_rule *th_rule; + /** + * Timer for next token. + */ + struct hrtimer th_timer; + /** + * Deadline of the timer. + */ + __u64 th_deadline; + /** + * Sequence of requests. 
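The tc_*/tr_* rate, depth, token and check-time fields above describe a conventional token bucket: tokens accrue at the rule's RPC rate up to the bucket depth, one token is spent per dispatched RPC, and an empty bucket yields the deadline for th_timer. A stand-alone sketch of that arithmetic (demo_* names invented; the nanosecond bookkeeping is simplified relative to the real policy):

#include <stdint.h>

#define DEMO_NSEC_PER_SEC 1000000000ULL

struct demo_tbf_bucket {
        uint64_t rpc_rate;      /* allowed RPCs per second */
        uint64_t nsecs;         /* nanoseconds needed to earn one token */
        uint64_t depth;         /* maximum number of stored tokens */
        uint64_t ntoken;        /* tokens currently in the bucket */
        uint64_t check_time;    /* time of the last refill, in nanoseconds */
};

/* Refill the bucket for the elapsed time, then try to spend one token.
 * Returns 1 when the RPC may be dispatched now; otherwise stores the
 * earliest dispatch time in *deadline. */
static int demo_tbf_consume(struct demo_tbf_bucket *b, uint64_t now,
                            uint64_t *deadline)
{
        uint64_t earned = (now - b->check_time) / b->nsecs;

        b->ntoken += earned;
        if (b->ntoken > b->depth)
                b->ntoken = b->depth;
        /* Only advance by time actually converted into tokens. */
        b->check_time += earned * b->nsecs;

        if (b->ntoken > 0) {
                b->ntoken--;
                return 1;
        }

        *deadline = b->check_time + b->nsecs;
        return 0;
}

/* b->nsecs would be initialised as DEMO_NSEC_PER_SEC / b->rpc_rate. */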
+ */ + __u64 th_sequence; + /** + * Heap of queues. + */ + struct cfs_binheap *th_binheap; + /** + * Hash of clients. + */ + struct cfs_hash *th_cli_hash; + /** + * Type of TBF policy. + */ + char th_type[NRS_TBF_TYPE_MAX_LEN + 1]; + /** + * Rule operations. + */ + struct nrs_tbf_ops *th_ops; + /** + * Flag of type. + */ + __u32 th_type_flag; + /** + * Index of bucket on hash table while purging. + */ + int th_purge_start; +}; + +enum nrs_tbf_cmd_type { + NRS_CTL_TBF_START_RULE = 0, + NRS_CTL_TBF_STOP_RULE, + NRS_CTL_TBF_CHANGE_RULE, +}; + +struct nrs_tbf_cmd { + enum nrs_tbf_cmd_type tc_cmd; + char *tc_name; + union { + struct nrs_tbf_cmd_start { + __u64 ts_rpc_rate; + struct list_head ts_nids; + char *ts_nids_str; + struct list_head ts_jobids; + char *ts_jobids_str; + struct cfs_bitmap *ts_opcodes; + char *ts_opcodes_str; + struct list_head ts_conds; + char *ts_conds_str; + __u32 ts_valid_type; + __u32 ts_rule_flags; + char *ts_next_name; + } tc_start; + struct nrs_tbf_cmd_change { + __u64 tc_rpc_rate; + char *tc_next_name; + } tc_change; + } u; +}; + +enum nrs_tbf_field { + NRS_TBF_FIELD_NID, + NRS_TBF_FIELD_JOBID, + NRS_TBF_FIELD_OPCODE, + NRS_TBF_FIELD_MAX +}; + +struct nrs_tbf_expression { + enum nrs_tbf_field te_field; + struct list_head te_cond; + struct cfs_bitmap *te_opcodes; + struct list_head te_linkage; +}; + +struct nrs_tbf_conjunction { + /** + * link to disjunction. + */ + struct list_head tc_linkage; + /** + * list of logical conjunction + */ + struct list_head tc_expressions; +}; + +struct nrs_tbf_req { + /** + * Linkage to queue. + */ + struct list_head tr_list; + /** + * Sequence of the request. + */ + __u64 tr_sequence; +}; + +/** + * TBF policy operations. + */ +enum nrs_ctl_tbf { + /** + * Read the the data of a TBF policy. + */ + NRS_CTL_TBF_RD_RULE = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the the data of a TBF policy. + */ + NRS_CTL_TBF_WR_RULE, + /** + * Read the TBF policy type preset by proc entry "nrs_policies". + */ + NRS_CTL_TBF_RD_TYPE_FLAG, +}; + +/** @} tbf */ +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h new file mode 100644 index 0000000000000..d3afac961b043 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Define obdo associated functions + * obdo: OBject Device o... 
+ */ + +#ifndef _LUSTRE_OBDO_H_ +#define _LUSTRE_OBDO_H_ + +#include + +/** + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo); + +/** + * Create a local obdo from a wire-based obdo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo); +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h new file mode 100644 index 0000000000000..2ad8bce19ac53 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h @@ -0,0 +1,138 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LUSTRE_PATCHLESS_COMPAT_H +#define LUSTRE_PATCHLESS_COMPAT_H + +#include +#include +#ifndef HAVE_TRUNCATE_COMPLETE_PAGE +#include +#include + +#ifndef HAVE_DELETE_FROM_PAGE_CACHE /* 2.6.39 */ +#ifndef HAVE_REMOVE_FROM_PAGE_CACHE /* 2.6.35 - 2.6.38 */ + +/* XXX copy & paste from 2.6.15 kernel */ +static inline void ll_remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + + spin_unlock_irq(&mapping->tree_lock); +} +#else /* HAVE_REMOVE_FROM_PAGE_CACHE */ +#define ll_remove_from_page_cache(page) remove_from_page_cache(page) +#endif /* !HAVE_REMOVE_FROM_PAGE_CACHE */ + +static inline void ll_delete_from_page_cache(struct page *page) +{ + ll_remove_from_page_cache(page); + put_page(page); +} +#else /* HAVE_DELETE_FROM_PAGE_CACHE */ +#define ll_delete_from_page_cache(page) delete_from_page_cache(page) +#endif /* !HAVE_DELETE_FROM_PAGE_CACHE */ + +static inline void +ll_cancel_dirty_page(struct address_space *mapping, struct page *page) +{ +#ifdef HAVE_NEW_CANCEL_DIRTY_PAGE + cancel_dirty_page(page); +#elif defined(HAVE_CANCEL_DIRTY_PAGE) + cancel_dirty_page(page, PAGE_SIZE); +#else + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); +#endif /* HAVE_NEW_CANCEL_DIRTY_PAGE */ +} + +static inline void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) +#ifdef HAVE_INVALIDATE_RANGE + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); +#else + page->mapping->a_ops->invalidatepage(page, 0); +#endif + + ll_cancel_dirty_page(mapping, page); + ClearPageMappedToDisk(page); + ll_delete_from_page_cache(page); +} +#endif /* !HAVE_TRUNCATE_COMPLETE_PAGE */ + +#ifdef HAVE_DCACHE_LOCK +# define dget_dlock(d) dget_locked(d) +# define ll_d_count(d) atomic_read(&(d)->d_count) +#elif defined(HAVE_D_COUNT) +# define ll_d_count(d) d_count(d) +#else +# define ll_d_count(d) ((d)->d_count) +#endif /* HAVE_DCACHE_LOCK */ + +#ifdef ATTR_OPEN +# define ATTR_FROM_OPEN ATTR_OPEN +#else +# ifndef ATTR_FROM_OPEN +# define ATTR_FROM_OPEN 0 +# endif +#endif /* ATTR_OPEN */ + +#ifndef ATTR_RAW +#define ATTR_RAW 0 +#endif + +#ifndef ATTR_CTIME_SET +/* + * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_CTIME_SET (1 << 28) +#endif + +#ifndef HAVE_IN_COMPAT_SYSCALL +#define in_compat_syscall is_compat_task +#endif + +#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h new file mode 100644 index 0000000000000..8cb25d2374322 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2014, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include +#include +#include +#include +#include +#include + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record type in an union that can be used to read any records + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. */ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* flags for inode/block quota accounting */ +enum osd_qid_declare_flags { + OSD_QID_INODE = 1 << 0, + OSD_QID_BLK = 1 << 1, + OSD_QID_FORCE = 1 << 2, +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. 
*/ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented under the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration prodecure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota, it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by OSD layer to manage quota + * enforcement. Arguments are documented where each function is defined. 
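+ *
+ * A rough usage sketch, for illustration only (the names env, dev, proc,
+ * qsd, trans, qi and flags below are hypothetical placeholders, not part
+ * of this API):
+ *
+ *	qsd = qsd_init(env, "svname", dev, proc);
+ *	qsd_prepare(env, qsd);			(from ->ldo_prepare)
+ *	qsd_start(env, qsd);			(once recovery completes)
+ *	...
+ *	qsd_op_begin(env, qsd, &trans, &qi, &flags);	(declare phase)
+ *	... execute and stop the transaction ...
+ *	qsd_op_end(env, qsd, &trans);
+ *	...
+ *	qsd_fini(env, qsd);			(at shutdown)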
*/ + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + struct proc_dir_entry *); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, int *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* Since we enforce only inode quota in meta pool (MDTs), and block quota in + * data pool (OSTs), there are at most 4 quota ids being enforced in a single + * transaction, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. */ +#define QUOTA_MAX_TRANSIDS 4 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +/* flags for quota local enforcement */ +#define QUOTA_FL_OVER_USRQUOTA 0x01 +#define QUOTA_FL_OVER_GRPQUOTA 0x02 +#define QUOTA_FL_SYNC 0x04 +#define QUOTA_FL_OVER_PRJQUOTA 0x08 + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h new file mode 100644 index 0000000000000..46e6fa862f48e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -0,0 +1,342 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +#include + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. */ +#define REQ_MAX_FIELD_NR 10 + +struct req_capsule { + struct ptlrpc_request *rc_req; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, __u32 size); +__u32 req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen); +int req_layout_init(void); +void req_layout_fini(void); + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct 
req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +extern struct req_format RQF_FLD_READ; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_GET_ROOT; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_OUT_UPDATE; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. + */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_INTENT_CLOSE; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +extern struct req_format RQF_MDS_REINT_MIGRATE; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; +extern struct req_format RQF_OST_LADVISE; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_UNLINK; +extern struct req_format RQF_LDLM_INTENT_GETXATTR; 
+extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +/* LOG req_format */ +extern struct req_format RQF_LOG_CANCEL; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; +extern struct req_format RQF_LLOG_ORIGIN_CONNECT; + +extern struct req_format RQF_CONNECT; + +/* LFSCK req_format */ +extern struct req_format RQF_LFSCK_NOTIFY; +extern struct req_format RQF_LFSCK_QUERY; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; +extern struct req_msg_field RMF_CLOSE_DATA; +extern struct req_msg_field RMF_FILE_SECCTX_NAME; +extern struct req_msg_field RMF_FILE_SECCTX; + +/* + * connection handle received in MDS_CONNECT request. + */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EAVALS; +extern struct req_msg_field RMF_EAVALS_LENS; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern 
struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_OUT_UPDATE; +extern struct req_msg_field RMF_OUT_UPDATE_REPLY; +extern struct req_msg_field RMF_OUT_UPDATE_HEADER; +extern struct req_msg_field RMF_OUT_UPDATE_BUF; + +/* LFSCK format */ +extern struct req_msg_field RMF_LFSCK_REQUEST; +extern struct req_msg_field RMF_LFSCK_REPLY; + +extern struct req_msg_field RMF_OST_LADVISE_HDR; +extern struct req_msg_field RMF_OST_LADVISE; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h new file mode 100644 index 0000000000000..7e6f490854911 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -0,0 +1,1202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +struct lu_env; +/* Linux specific */ +struct key; +struct seq_file; +struct lustre_cfg; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; +struct req_msg_field; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
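+ *
+ * For example, the krb5i flavor defined below is composed from these
+ * fields (values taken from MAKE_FLVR() and the enums that follow):
+ *
+ *	SPTLRPC_FLVR_KRB5I = MAKE_FLVR(SPTLRPC_POLICY_GSS,	0x2 <<  0
+ *				       SPTLRPC_MECH_GSS_KRB5,	0x1 <<  4
+ *				       SPTLRPC_SVC_INTG,	0x2 <<  8
+ *				       SPTLRPC_BULK_DEFAULT,	0x0 << 12
+ *				       SPTLRPC_BULK_SVC_INTG)	0x2 << 16
+ *			   = 0x00020212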
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_SK = 2, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_GSSNULL \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_NULL, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) +#define SPTLRPC_SUBFLVR_SKN \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_SKA \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_SKI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_SKPI \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_SK, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + 
SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_GSSNULL \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) +#define SPTLRPC_FLVR_SKN \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKA \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_SKI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_SKPI \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_SK, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. 
+ */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. + */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. 
+ * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. + * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + cfs_time_t cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. 
+ */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). + */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts has been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead is always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called then the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintainance mechamism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given an context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). 
+ */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). + */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). 
+ */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + struct module *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + time64_t ps_gc_interval; /* in seconds */ + time64_t ps_gc_next; /* in seconds */ +}; + +static inline int flvr_is_rootonly(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int flvr_allows_user_desc(__u32 flavor) +{ + return (SPTLRPC_FLVR_POLICY(flavor) == SPTLRPC_POLICY_GSS && + (SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_NULL || + SPTLRPC_FLVR_MECH(flavor) == SPTLRPC_MECH_GSS_SK)); +} + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char * sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 
bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + + +/* + * lprocfs + */ +struct proc_dir_entry; +extern struct proc_dir_entry *sptlrpc_proc_root; + +/* + * round size up to next power of 2, for slab allocation. + * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline struct ptlrpc_sec_policy * +sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline void +sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy impelentation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); 
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); + +/* + * exported higher interface of import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char * sec2target_str(struct ptlrpc_sec *sec); +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); +int get_free_pages_in_pool(void); +int pool_is_at_full_capacity(void); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#ifdef HAVE_SERVER_SUPPORT +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +#endif + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + 
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + + +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h new file mode 100644 index 0000000000000..8f8b375e64c25 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -0,0 +1,134 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines + * are implemented in ptlrpc/lustre_swab.c. These 'swabbers' convert the + * type from "other" endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
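+ *
+ * For example, struct obd_statfs is a fixed-size wire type: once the caller
+ * has verified the buffer holds at least sizeof(struct obd_statfs), it is
+ * converted in place by lustre_swab_obd_statfs(), declared below. For a
+ * variable-length type such as lov_user_md, the per-stripe array is swabbed
+ * separately by lustre_swab_lov_user_md_objects(), which takes the
+ * stripe_count explicitly.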
+ */ + +#ifndef _LUSTRE_SWAB_H_ +#define _LUSTRE_SWAB_H_ + +#include + +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); +void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); +void lustre_swab_connect(struct obd_connect_data *ocd); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_state_set(struct hsm_state_set *hss); +void lustre_swab_obd_statfs(struct obd_statfs *os); +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo); +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr); +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); +void lustre_swab_ost_lvb(struct ost_lvb *lvb); +void lustre_swab_obd_quotactl(struct obd_quotactl *q); +void lustre_swab_quota_body(struct quota_body *b); +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb); +void lustre_swab_generic_32s(__u32 *val); +void lustre_swab_mdt_body(struct mdt_body *b); +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b); +void lustre_swab_mdt_rec_setattr(struct mdt_rec_setattr *sa); +void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); +void lustre_swab_lmv_desc(struct lmv_desc *ld); +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm); +void lustre_swab_lov_desc(struct lov_desc *ld); +void lustre_swab_ldlm_res_id(struct ldlm_res_id *id); +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d); +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *); +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *); +void lustre_swab_ldlm_intent(struct ldlm_intent *i); +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r); +void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l); +void lustre_swab_ldlm_request(struct ldlm_request *rq); +void lustre_swab_ldlm_reply(struct ldlm_reply *r); +void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); +void lustre_swab_mgs_config_body(struct mgs_config_body *body); +void lustre_swab_mgs_config_res(struct mgs_config_res *body); +void lustre_swab_lfsck_request(struct lfsck_request *lr); +void lustre_swab_lfsck_reply(struct lfsck_reply *lr); +void lustre_swab_obdo(struct obdo *o); +void lustre_swab_ost_body(struct ost_body *b); +void lustre_swab_ost_last_id(__u64 *id); +void lustre_swab_fiemap(struct fiemap *fiemap); +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); +void lustre_swab_idx_info(struct idx_info *ii); +void lustre_swab_lip_header(struct lu_idxpage *lip); +void lustre_swab_lustre_capa(struct lustre_capa *c); +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); +void lustre_swab_fid2path(struct getinfo_fid2path *gf); +void lustre_swab_layout_intent(struct layout_intent *li); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_current_action(struct hsm_current_action *action); +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +void lustre_swab_hsm_request(struct hsm_request *hr); +void lustre_swab_object_update(struct object_update *ou); +void 
lustre_swab_object_update_request(struct object_update_request *our); +void lustre_swab_out_update_header(struct out_update_header *ouh); +void lustre_swab_out_update_buffer(struct out_update_buffer *oub); +void lustre_swab_object_update_result(struct object_update_result *our); +void lustre_swab_object_update_reply(struct object_update_reply *our); +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); +void lustre_swab_close_data(struct close_data *data); +void lustre_swab_lmv_user_md(struct lmv_user_md *lum); +void lustre_swab_ladvise(struct lu_ladvise *ladvise); +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +void lustre_print_user_md(unsigned int level, struct lov_user_md *lum, + const char *msg); + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h new file mode 100644 index 0000000000000..968cc51028d86 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -0,0 +1,706 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.htm + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. 
+ */ +/* + * lustre/include/lustre_update.h + * + * Author: Di Wang + */ + +#ifndef _LUSTRE_UPDATE_H +#define _LUSTRE_UPDATE_H +#include +#include +#include + +#define OUT_UPDATE_REPLY_SIZE 4096 +#define OUT_BULK_BUFFER_SIZE 4096 + +struct dt_key; +struct dt_rec; +struct object_update_param; +struct llog_update_record; + +static inline size_t update_params_size(const struct update_params *params, + unsigned int param_count) +{ + struct object_update_param *param; + size_t total_size = sizeof(*params); + unsigned int i; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < param_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline struct object_update_param * +update_params_get_param(const struct update_params *params, + unsigned int index, unsigned int param_count) +{ + struct object_update_param *param; + unsigned int i; + + if (index > param_count) + return NULL; + + param = (struct object_update_param *)¶ms->up_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + return param; +} + +static inline void* +update_params_get_param_buf(const struct update_params *params, __u16 index, + unsigned int param_count, __u16 *size) +{ + struct object_update_param *param; + + param = update_params_get_param(params, (unsigned int)index, + param_count); + if (param == NULL) + return NULL; + + if (size != NULL) + *size = param->oup_len; + + return param->oup_buf; +} + +static inline size_t +update_op_size(unsigned int param_count) +{ + return offsetof(struct update_op, uop_params_off[param_count]); +} + +static inline struct update_op * +update_op_next_op(const struct update_op *uop) +{ + return (struct update_op *)((char *)uop + + update_op_size(uop->uop_param_count)); +} + +static inline size_t update_ops_size(const struct update_ops *ops, + unsigned int update_count) +{ + struct update_op *op; + size_t total_size = sizeof(*ops); + unsigned int i; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < update_count; i++, op = update_op_next_op(op)) + total_size += update_op_size(op->uop_param_count); + + return total_size; +} + +static inline struct update_params * +update_records_get_params(const struct update_records *record) +{ + return (struct update_params *)((char *)record + + offsetof(struct update_records, ur_ops) + + update_ops_size(&record->ur_ops, record->ur_update_count)); +} + +static inline struct update_param * +update_param_next_param(const struct update_param *param) +{ + return (struct update_param *)((char *)param + + object_update_param_size( + (struct object_update_param *)param)); +} + +static inline size_t +__update_records_size(size_t raw_size) +{ + return cfs_size_round(offsetof(struct update_records, ur_ops) + + raw_size); +} + +static inline size_t +update_records_size(const struct update_records *record) +{ + size_t op_size = 0; + size_t param_size = 0; + + if (record->ur_update_count > 0) + op_size = update_ops_size(&record->ur_ops, + record->ur_update_count); + if (record->ur_param_count > 0) { + struct update_params *params; + + params = update_records_get_params(record); + param_size = update_params_size(params, record->ur_param_count); + } + + return __update_records_size(op_size + param_size); +} + +static inline size_t +__llog_update_record_size(size_t records_size) +{ + return cfs_size_round(sizeof(struct 
llog_rec_hdr) + records_size + + sizeof(struct llog_rec_tail)); +} + +static inline size_t +llog_update_record_size(const struct llog_update_record *lur) +{ + return __llog_update_record_size( + update_records_size(&lur->lur_update_rec)); +} + +static inline struct update_op * +update_ops_get_op(const struct update_ops *ops, unsigned int index, + unsigned int update_count) +{ + struct update_op *op; + unsigned int i; + + if (index > update_count) + return NULL; + + op = (struct update_op *)&ops->uops_op[0]; + for (i = 0; i < index; i++) + op = update_op_next_op(op); + + return op; +} + +static inline void +*object_update_param_get(const struct object_update *update, size_t index, + size_t *size) +{ + const struct object_update_param *param; + size_t i; + + if (index >= update->ou_params_count) + return ERR_PTR(-EINVAL); + + param = &update->ou_params[0]; + for (i = 0; i < index; i++) + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + + if (size != NULL) + *size = param->oup_len; + + if (param->oup_len == 0) + return ERR_PTR(-ENODATA); + + return (void *)¶m->oup_buf[0]; +} + +static inline unsigned long +object_update_request_size(const struct object_update_request *our) +{ + unsigned long size; + size_t i = 0; + + size = offsetof(struct object_update_request, ourq_updates[0]); + for (i = 0; i < our->ourq_count; i++) { + struct object_update *update; + + update = (struct object_update *)((char *)our + size); + size += object_update_size(update); + } + return size; +} + +static inline void +object_update_result_insert(struct object_update_reply *reply, + void *data, size_t data_len, size_t index, + int rc) +{ + struct object_update_result *update_result; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result); + + update_result->our_rc = ptlrpc_status_hton(rc); + if (rc >= 0) { + if (data_len > 0 && data) + memcpy(update_result->our_data, data, data_len); + update_result->our_datalen = data_len; + } + + reply->ourp_lens[index] = cfs_size_round(data_len + + sizeof(struct object_update_result)); +} + +static inline int +object_update_result_data_get(const struct object_update_reply *reply, + struct lu_buf *lbuf, size_t index) +{ + struct object_update_result *update_result; + size_t size = 0; + int result; + + LASSERT(lbuf != NULL); + update_result = object_update_result_get(reply, index, &size); + if (update_result == NULL || + size < cfs_size_round(sizeof(struct object_update_reply)) || + update_result->our_datalen > size) + RETURN(-EFAULT); + + result = ptlrpc_status_ntoh(update_result->our_rc); + if (result < 0) + return result; + + lbuf->lb_buf = update_result->our_data; + lbuf->lb_len = update_result->our_datalen; + + return result; +} + +/** + * Attached in the thandle to record the updates for distribute + * distribution. + */ +struct thandle_update_records { + /* All of updates for the cross-MDT operation, vmalloc'd. */ + struct llog_update_record *tur_update_records; + size_t tur_update_records_buf_size; + + /* All of parameters for the cross-MDT operation, vmalloc'd */ + struct update_params *tur_update_params; + unsigned int tur_update_param_count; + size_t tur_update_params_buf_size; +}; + +#define TOP_THANDLE_MAGIC 0x20140917 +struct top_multiple_thandle { + struct dt_device *tmt_master_sub_dt; + atomic_t tmt_refcount; + /* Other sub transactions will be listed here. 
*/ + struct list_head tmt_sub_thandle_list; + spinlock_t tmt_sub_lock; + + struct list_head tmt_commit_list; + /* All of update records will packed here */ + struct thandle_update_records *tmt_update_records; + + wait_queue_head_t tmt_stop_waitq; + __u64 tmt_batchid; + int tmt_result; + __u32 tmt_magic; + size_t tmt_record_size; + __u32 tmt_committed:1; +}; + +/* {top,sub}_thandle are used to manage distributed transactions which + * include updates on several nodes. A top_handle represents the + * whole operation, and sub_thandle represents updates on each node. */ +struct top_thandle { + struct thandle tt_super; + /* The master sub transaction. */ + struct thandle *tt_master_sub_thandle; + + struct top_multiple_thandle *tt_multiple_thandle; +}; + +struct sub_thandle_cookie { + struct llog_cookie stc_cookie; + struct list_head stc_list; +}; + +/* Sub thandle is used to track multiple sub thandles under one parent + * thandle */ +struct sub_thandle { + struct thandle *st_sub_th; + struct dt_device *st_dt; + struct list_head st_cookie_list; + struct dt_txn_commit_cb st_commit_dcb; + struct dt_txn_commit_cb st_stop_dcb; + int st_result; + + /* linked to top_thandle */ + struct list_head st_sub_list; + + /* If this sub thandle is committed */ + bool st_committed:1, + st_stopped:1, + st_started:1; +}; + +struct tx_arg; +typedef int (*tx_exec_func_t)(const struct lu_env *env, struct thandle *th, + struct tx_arg *ta); + +/* Structure for holding one update execution */ +struct tx_arg { + tx_exec_func_t exec_fn; + tx_exec_func_t undo_fn; + struct dt_object *object; + const char *file; + struct object_update_reply *reply; + int line; + int index; + union { + struct { + struct dt_insert_rec rec; + const struct dt_key *key; + } insert; + struct { + } ref; + struct { + struct lu_attr attr; + } attr_set; + struct { + struct lu_buf buf; + const char *name; + int flags; + __u32 csum; + } xattr_set; + struct { + struct lu_attr attr; + struct dt_allocation_hint hint; + struct dt_object_format dof; + struct lu_fid fid; + } create; + struct { + struct lu_buf buf; + loff_t pos; + } write; + struct { + struct ost_body *body; + } destroy; + } u; +}; + +/* Structure for holding all update executations of one transaction */ +struct thandle_exec_args { + struct thandle *ta_handle; + int ta_argno; /* used args */ + int ta_alloc_args; /* allocated args count */ + struct tx_arg **ta_args; +}; + +/* target/out_lib.c */ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int params_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size); +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid); +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_key *key); +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf 
*buf, + const char *name, __u32 flag); +int out_xattr_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name); +int out_attr_set_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_attr *attr); +int out_ref_add_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_ref_del_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_write_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const struct lu_buf *buf, + __u64 pos); +int out_attr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid); +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, struct dt_rec *rec, + const struct dt_key *key); +int out_xattr_get_pack(const struct lu_env *env, + struct object_update *update, size_t *max_update_size, + const struct lu_fid *fid, const char *name, + const int bufsize); +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_length, const struct lu_fid *fid, + size_t size, loff_t pos); + +const char *update_op_str(__u16 opcode); + +/* target/update_trans.c */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt); + +static inline struct thandle * +thandle_get_sub(const struct lu_env *env, struct thandle *th, + const struct dt_object *sub_obj) +{ + return thandle_get_sub_by_dt(env, th, lu2dt_dev(sub_obj->do_lu.lo_dev)); +} + +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev); +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th); +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt); + +static inline void top_multiple_thandle_get(struct top_multiple_thandle *tmt) +{ + atomic_inc(&tmt->tmt_refcount); +} + +static inline void top_multiple_thandle_put(struct top_multiple_thandle *tmt) +{ + if (atomic_dec_and_test(&tmt->tmt_refcount)) + top_multiple_thandle_destroy(tmt); +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); + +/* update_records.c */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr); +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid); +size_t update_records_index_insert_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +size_t update_records_index_delete_size(const struct lu_env 
*env, + const struct lu_fid *fid, + const struct dt_key *key); +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, + __u32 flag); +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name); +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos); +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end); + +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof); +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr); +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key); +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key); +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag); +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name); +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos); +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t 
*max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end); +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid); + +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size); +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size); + +#define update_record_pack(name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + struct thandle_update_records *tur; \ + struct llog_update_record *lur; \ + size_t avail_param_size; \ + size_t avail_op_size; \ + int ret; \ + \ + while (1) { \ + top_th = container_of(th, struct top_thandle, tt_super);\ + tmt = top_th->tt_multiple_thandle; \ + tur = tmt->tmt_update_records; \ + lur = tur->tur_update_records; \ + avail_param_size = tur->tur_update_params_buf_size - \ + update_params_size(tur->tur_update_params, \ + tur->tur_update_param_count); \ + avail_op_size = tur->tur_update_records_buf_size - \ + llog_update_record_size(lur); \ + ret = update_records_##name##_pack(env, \ + &lur->lur_update_rec.ur_ops, \ + &lur->lur_update_rec.ur_update_count, \ + &avail_op_size, \ + tur->tur_update_params, \ + &tur->tur_update_param_count, \ + &avail_param_size, __VA_ARGS__); \ + if (ret == -E2BIG) { \ + ret = tur_update_extend(tur, avail_op_size, \ + avail_param_size); \ + if (ret != 0) \ + break; \ + continue; \ + } else { \ + break; \ + } \ + } \ + ret; \ +}) + +#define update_record_size(env, name, th, ...) \ +({ \ + struct top_thandle *top_th; \ + struct top_multiple_thandle *tmt; \ + \ + top_th = container_of(th, struct top_thandle, tt_super); \ + \ + LASSERT(top_th->tt_multiple_thandle != NULL); \ + tmt = top_th->tt_multiple_thandle; \ + tmt->tmt_record_size += \ + update_records_##name##_size(env, __VA_ARGS__); \ +}) +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/lustre_ver.h new file mode 100644 index 0000000000000..0557c2dd554e5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ver.h @@ -0,0 +1,37 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ + +/* + * LUSTRE_VERSION_STRING + * + * Note that some files may seem to include this header unnecessarily. + * If the file uses LUSTRE_VERSION_STRING, it is likely doing the include + * for compatibility with the Lustre code in the Linux kernel. + * In the Linux kernel, they are likely hard coding LUSTRE_VERSION_STRING + * right here in this file. The out-of-kernel Lustre code generates + * LUSTRE_VERSION_STRING in autoconf with AC_DEFINE. 
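+ *
+ * The OBD_OCD_VERSION() macro below packs one version component per byte;
+ * for example, OBD_OCD_VERSION(2, 10, 8, 0) evaluates to 0x020a0800, and
+ * OBD_OCD_VERSION_MINOR(0x020a0800) gives back 10.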
+ */ + +#define OBD_OCD_VERSION(major, minor, patch, fix) \ + (((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix)) + +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version) >> 24) & 255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version) >> 16) & 255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version) >> 8) & 255) +#define OBD_OCD_VERSION_FIX(version) ((int)((version) >> 0) & 255) + +#define LUSTRE_VERSION_CODE \ + OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) + +/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches + * by this amount (set in lustre/autoconf/lustre-version.ac). */ +#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) + +#ifdef __KERNEL__ +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 50, 0) +#endif + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/lvfs.h b/drivers/staging/lustrefsx/lustre/include/lvfs.h new file mode 100644 index 0000000000000..856ee1972aa06 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lvfs.h @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LVFS_H__ +#define __LVFS_H__ + +#include +#include +#include +#include +#include +#include + +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ + +struct dt_device; + +struct lvfs_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + mm_segment_t fs; + int umask; + struct dt_device *dt; +#ifdef OBD_CTXT_DEBUG + unsigned int magic; +#endif +}; + +static inline void OBD_SET_CTXT_MAGIC(struct lvfs_run_ctxt *ctxt) +{ +#ifdef OBD_CTXT_DEBUG + ctxt->magic = OBD_RUN_CTXT_MAGIC; +#endif +} + +/* ptlrpc_sec_ctx.c */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx); +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx); + +/* We need to hold the inode semaphore over the dcache lookup itself, or we + * run the risk of entering the filesystem lookup path concurrently on SMP + * systems, and instantiating two inodes for the same entry. We still + * protect against concurrent addition/removal races with the DLM locking. 
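+ *
+ * A caller of ll_lookup_one_len() below (sketched with illustrative names
+ * only) handles the result much like lookup_one_len() itself: check
+ * IS_ERR(), remember that a negative dentry (d_inode == NULL) is returned
+ * as-is, and dput() the dentry when done, e.g.:
+ *
+ *	dchild = ll_lookup_one_len(name, dparent, strlen(name));
+ *	if (IS_ERR(dchild))
+ *		return PTR_ERR(dchild);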
+ */ +static inline struct dentry * +ll_lookup_one_len(const char *fid_name, struct dentry *dparent, + int fid_namelen) +{ + struct dentry *dchild; + + inode_lock(dparent->d_inode); + dchild = lookup_one_len(fid_name, dparent, fid_namelen); + inode_unlock(dparent->d_inode); + + if (IS_ERR(dchild) || dchild->d_inode == NULL) + return dchild; + + if (is_bad_inode(dchild->d_inode)) { + CERROR("bad inode returned %lu/%u\n", + dchild->d_inode->i_ino, dchild->d_inode->i_generation); + dput(dchild); + dchild = ERR_PTR(-ENOENT); + } + + return dchild; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h new file mode 100644 index 0000000000000..d64d243ff8988 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -0,0 +1,680 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/md_object.h + * + * Extention of lu_object.h for metadata objects + */ + +#ifndef _LUSTRE_MD_OBJECT_H +#define _LUSTRE_MD_OBJECT_H + +#ifndef HAVE_SERVER_SUPPORT +# error "client code should not depend on md_object.h" +#endif /* !HAVE_SERVER_SUPPORT */ + +/** \defgroup md md + * Sub-class of lu_object with methods common for "meta-data" objects in MDT + * stack. + * + * Meta-data objects implement namespace operations: you can link, unlink + * them, and treat them as directories. + * + * Examples: mdt, cmm, and mdt are implementations of md interface. + * @{ + */ + + +/* + * super-class definitions. 
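+ *
+ * (Concretely: md_device embeds lu_device and md_object embeds lu_object,
+ * as the structure definitions further down show, and the lu2md_dev(),
+ * md2lu_dev() and lu2md() helpers convert between the two layers.)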
+ */ +#include + +struct md_device; +struct md_device_operations; +struct md_object; +struct obd_export; + +/** metadata attributes */ +enum ma_valid { + MA_INODE = 1 << 0, + MA_LOV = 1 << 1, + MA_FLAGS = 1 << 2, + MA_LMV = 1 << 3, + MA_ACL_DEF = 1 << 4, + MA_LOV_DEF = 1 << 5, + MA_HSM = 1 << 6, + MA_PFID = 1 << 7, + MA_LMV_DEF = 1 << 8, +}; + +typedef enum { + MDL_MINMODE = 0, + MDL_EX = 1, + MDL_PW = 2, + MDL_PR = 4, + MDL_CW = 8, + MDL_CR = 16, + MDL_NL = 32, + MDL_GROUP = 64, + MDL_MAXMODE +} mdl_mode_t; + +typedef enum { + MDT_NUL_LOCK = 0, + MDT_REG_LOCK = (1 << 0), + MDT_PDO_LOCK = (1 << 1) +} mdl_type_t; + +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL (1 << 14) + +/* memory structure for hsm attributes + * for fields description see the on disk structure hsm_attrs + * which is defined in lustre_idl.h + */ +struct md_hsm { + __u32 mh_compat; + __u32 mh_flags; + __u64 mh_arch_id; + __u64 mh_arch_ver; +}; + +struct md_attr { + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_acl_size; +}; + +/** Additional parameters for create */ +struct md_op_spec { + union { + /** symlink target */ + const char *sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + const void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ + __u64 sp_cr_flags; + + /* File security context for creates. */ + const char *sp_cr_file_secctx_name; /* (security) xattr name */ + void *sp_cr_file_secctx; /* xattr value */ + size_t sp_cr_file_secctx_size; /* xattr value size */ + + /** don't create lov objects or llog cookie - this replay */ + unsigned int no_create:1, + sp_cr_lookup:1, /* do lookup sanity check or not. */ + sp_rm_entry:1, /* only remove name entry */ + sp_permitted:1, /* do not check permission */ + sp_migrate_close:1; /* close the file during migrate */ + /** Current lock mode for parent dir where create is performing. */ + mdl_mode_t sp_cr_mode; + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +union ldlm_policy_data; +/** + * Operations implemented for each md object (both directory and leaf). 
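+ *
+ * These methods are normally reached through the mo_*() inline wrappers
+ * defined later in this header, which LASSERT() that the method is
+ * implemented before dispatching to it, e.g. (illustrative call only):
+ *
+ *	rc = mo_attr_get(env, mdo, ma);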
+ */ +struct md_object_operations { + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); + + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); + + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); + + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); + + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); + + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); + + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + int (*moo_changelog)(const struct lu_env *env, + enum changelog_rec_type type, int flags, + struct md_device *m, const struct lu_fid *fid); + + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, int flag); + + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, int mode); + + int (*moo_object_sync)(const struct lu_env *, struct md_object *); + + int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + int (*moo_object_unlock)(const struct lu_env *env, + struct md_object *obj, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy); + + int (*moo_invalidate)(const struct lu_env *env, struct md_object *obj); + /** + * Trying to write to un-instantiated layout component. + * + * The caller should have held layout lock. + * + * \param[in] env execution environment + * \param[in] obj MD object + * \param[in] layout data structure to describe the changes to + * the MD object's layout + * \param[in] buf buffer containing the client's lovea + * + * \retval 0 success + * \retval -ne error code + */ + int (*moo_layout_change)(const struct lu_env *env, + struct md_object *obj, + struct layout_intent *layout, + const struct lu_buf *buf); +}; + +/** + * Operations implemented for each directory object. 
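+ *
+ * As with the object methods above, callers use the mdo_*() wrappers from
+ * later in this header; note that ->mdo_lock_mode is optional, and
+ * mdo_lock_mode() falls back to MDL_MINMODE when it is not set.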
+ */ +struct md_dir_operations { + int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid, struct lu_fid *sfid); + + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); + + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); + + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); + + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); + + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); + + int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma, int no_name); + + int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, + struct md_object *sobj, const struct lu_name *lname, + struct md_object *tobj, struct md_attr *ma); +}; + +struct md_device_operations { + /** meta-data device related handlers. */ + int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, + struct lu_fid *f); + + int (*mdo_maxeasize_get)(const struct lu_env *env, struct md_device *m, + int *easize); + + int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, + struct obd_statfs *sfs); + + int (*mdo_llog_ctxt_get)(const struct lu_env *env, + struct md_device *m, int idx, void **h); + + int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, + unsigned int cmd, int len, void *data); +}; + +struct md_device { + struct lu_device md_lu_dev; + const struct md_device_operations *md_ops; +}; + +struct md_object { + struct lu_object mo_lu; + const struct md_object_operations *mo_ops; + const struct md_dir_operations *mo_dir_ops; +}; + +static inline struct md_device *lu2md_dev(const struct lu_device *d) +{ + LASSERT(IS_ERR(d) || lu_device_is_md(d)); + return container_of0(d, struct md_device, md_lu_dev); +} + +static inline struct lu_device *md2lu_dev(struct md_device *d) +{ + return &d->md_lu_dev; +} + +static inline struct md_object *lu2md(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev)); + return container_of0(o, struct md_object, mo_lu); +} + +static inline int md_device_init(struct md_device *md, struct lu_device_type *t) +{ + return lu_device_init(&md->md_lu_dev, t); +} + +static inline void md_device_fini(struct md_device *md) +{ + lu_device_fini(&md->md_lu_dev); +} + +static inline struct md_object *md_object_find_slice(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *f) +{ + return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL)); +} + + +/** md operations */ +static inline int mo_permission(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + struct md_attr *at, + int mask) +{ + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, mask); +} + +static inline int 
mo_attr_get(const struct lu_env *env, + struct md_object *m, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); +} + +static inline int mo_readlink(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_readlink); + return m->mo_ops->moo_readlink(env, m, buf); +} + +static inline int mo_changelog(const struct lu_env *env, + enum changelog_rec_type type, + int flags, struct md_device *m, + const struct lu_fid *fid) +{ + struct lu_fid rootfid; + struct md_object *root; + int rc; + + rc = m->md_ops->mdo_root_get(env, m, &rootfid); + if (rc) + return rc; + + root = md_object_find_slice(env, m, &rootfid); + if (IS_ERR(root)) + RETURN(PTR_ERR(root)); + + LASSERT(root->mo_ops->moo_changelog); + rc = root->mo_ops->moo_changelog(env, type, flags, m, fid); + + lu_object_put(env, &root->mo_lu); + + return rc; +} + +static inline int mo_attr_set(const struct lu_env *env, + struct md_object *m, + const struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_set); + return m->mo_ops->moo_attr_set(env, m, at); +} + +static inline int mo_xattr_get(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_get); + return m->mo_ops->moo_xattr_get(env, m, buf, name); +} + +static inline int mo_xattr_del(const struct lu_env *env, + struct md_object *m, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_del); + return m->mo_ops->moo_xattr_del(env, m, name); +} + +static inline int mo_xattr_set(const struct lu_env *env, + struct md_object *m, + const struct lu_buf *buf, + const char *name, + int flags) +{ + LASSERT(m->mo_ops->moo_xattr_set); + return m->mo_ops->moo_xattr_set(env, m, buf, name, flags); +} + +static inline int mo_xattr_list(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_xattr_list); + return m->mo_ops->moo_xattr_list(env, m, buf); +} + +static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_invalidate); + return m->mo_ops->moo_invalidate(env, m); +} + +static inline int mo_layout_change(const struct lu_env *env, + struct md_object *m, + struct layout_intent *layout, + const struct lu_buf *buf) +{ + /* need instantiate objects which in the access range */ + LASSERT(m->mo_ops->moo_layout_change); + return m->mo_ops->moo_layout_change(env, m, layout, buf); +} + +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + +static inline int mo_open(const struct lu_env *env, + struct md_object *m, + int flags) +{ + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, flags); +} + +static inline int mo_close(const struct lu_env *env, + struct md_object *m, + struct md_attr *ma, + int mode) +{ + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, mode); +} + +static inline int mo_readpage(const struct lu_env *env, + struct md_object *m, + const struct lu_rdpg *rdpg) +{ + LASSERT(m->mo_ops->moo_readpage); + return m->mo_ops->moo_readpage(env, m, rdpg); +} + +static inline int mo_object_sync(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_object_sync); + return 
m->mo_ops->moo_object_sync(env, m); +} + +static inline int mo_object_lock(const struct lu_env *env, + struct md_object *m, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_lock); + return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy); +} + +static inline int mo_object_unlock(const struct lu_env *env, + struct md_object *m, + struct ldlm_enqueue_info *einfo, + union ldlm_policy_data *policy) +{ + LASSERT(m->mo_ops->moo_object_unlock); + return m->mo_ops->moo_object_unlock(env, m, einfo, policy); +} + +static inline int mdo_lookup(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + struct lu_fid *f, + struct md_op_spec *spec) +{ + LASSERT(p->mo_dir_ops->mdo_lookup); + return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec); +} + +static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env, + struct md_object *mo, + mdl_mode_t lm) +{ + if (mo->mo_dir_ops->mdo_lock_mode == NULL) + return MDL_MINMODE; + return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm); +} + +static inline int mdo_create(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lchild_name, + struct md_object *c, + struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(p->mo_dir_ops->mdo_create); + return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at); +} + +static inline int mdo_create_data(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(c->mo_dir_ops->mdo_create_data); + return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma); +} + +static inline int mdo_rename(const struct lu_env *env, + struct md_object *sp, + struct md_object *tp, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *t, + const struct lu_name *ltname, + struct md_attr *ma) +{ + LASSERT(tp->mo_dir_ops->mdo_rename); + return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname, + ma); +} + +static inline int mdo_migrate(const struct lu_env *env, + struct md_object *pobj, + struct md_object *sobj, + const struct lu_name *lname, + struct md_object *tobj, + struct md_attr *ma) +{ + LASSERT(pobj->mo_dir_ops->mdo_migrate); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, ma); +} + +static inline int mdo_is_subdir(const struct lu_env *env, + struct md_object *mo, + const struct lu_fid *fid, + struct lu_fid *sfid) +{ + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid); +} + +static inline int mdo_link(const struct lu_env *env, + struct md_object *p, + struct md_object *s, + const struct lu_name *lname, + struct md_attr *ma) +{ + LASSERT(s->mo_dir_ops->mdo_link); + return s->mo_dir_ops->mdo_link(env, p, s, lname, ma); +} + +static inline int mdo_unlink(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct lu_name *lname, + struct md_attr *ma, int no_name) +{ + LASSERT(p->mo_dir_ops->mdo_unlink); + return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); +} + +/** + * Used in MDD/OUT layer for object lock rule + **/ +enum mdd_object_role { + MOR_SRC_PARENT, + MOR_SRC_CHILD, + MOR_TGT_PARENT, + MOR_TGT_CHILD, + MOR_TGT_ORPHAN +}; + +struct dt_device; + +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); +void lustre_hsm2buf(void *buf, const struct md_hsm *mh); + +enum { + UCRED_INVALID = -1, + UCRED_INIT = 0, + UCRED_OLD = 1, + UCRED_NEW = 2, +}; + +struct lu_ucred { + 
__u32 uc_valid; + __u32 uc_o_uid; + __u32 uc_o_gid; + __u32 uc_o_fsuid; + __u32 uc_o_fsgid; + __u32 uc_uid; + __u32 uc_gid; + __u32 uc_fsuid; + __u32 uc_fsgid; + __u32 uc_suppgids[2]; + cfs_cap_t uc_cap; + __u32 uc_umask; + struct group_info *uc_ginfo; + struct md_identity *uc_identity; + char uc_jobid[LUSTRE_JOBID_SIZE]; +}; + +struct lu_ucred *lu_ucred(const struct lu_env *env); + +struct lu_ucred *lu_ucred_check(const struct lu_env *env); + +struct lu_ucred *lu_ucred_assert(const struct lu_env *env); + +int lu_ucred_global_init(void); + +void lu_ucred_global_fini(void); + +#define md_cap_t(x) (x) + +#define MD_CAP_TO_MASK(x) (1 << (x)) + +#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag)) + +/* capable() is copied from linux kernel! */ +static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap) +{ + if (md_cap_raised(uc->uc_cap, cap)) + return 1; + return 0; +} + +/** @} md */ +#endif /* _LINUX_MD_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h new file mode 100644 index 0000000000000..ba31e450be2e0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -0,0 +1,1227 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __OBD_H +#define __OBD_H + +#include + +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +struct lov_stripe_md; +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* OBD_STATFS_* flags */ + __u64 oi_flags; + /* statfs data specific for every OSC, if needed at all. 
*/ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every received + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; +}; + +struct obd_type { + struct list_head typ_chain; + struct obd_ops *typ_dt_ops; + struct md_ops *typ_md_ops; + struct proc_dir_entry *typ_procroot; + struct proc_dir_entry *typ_procsym; + __u32 typ_sym_filter; + char *typ_name; + int typ_refcnt; + struct lu_device_type *typ_lu; + spinlock_t obd_type_lock; +}; + +struct brw_page { + u64 off; + struct page *pg; + u32 count; + u32 flag; +}; + +struct timeout_item { + enum timeout_event ti_event; + cfs_time_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OBD_MAX_RIF_DEFAULT 8 +#define OBD_MAX_RIF_MAX 512 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for fo_sync_lock_cancel */ +enum { + NEVER_SYNC_ON_CANCEL = 0, + BLOCKING_SYNC_ON_CANCEL = 1, + ALWAYS_SYNC_ON_CANCEL = 2, + NUM_SYNC_ON_CANCEL_STATES +}; + +/* + * Limit reply buffer size for striping data to one x86_64 page. This + * value is chosen to fit the striping data for common use cases while + * staying well below the limit at which the buffer must be backed by + * vmalloc(). Excessive use of vmalloc() may cause spinlock contention + * on the MDS. + */ +#define OBD_MAX_DEFAULT_EA_SIZE 4096 + +enum obd_cl_sem_lock_class { + OBD_CLI_SEM_NORMAL, + OBD_CLI_SEM_MGC, + OBD_CLI_SEM_MDCOSC, +}; + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + size_t cl_conn_count; + + /* Cache maximum and default values for easize. This is + * strictly a performance optimization to minimize calls to + * obd_size_diskmd(). The default values are used to calculate the + * initial size of a request buffer. The ptlrpc layer will resize the + * buffer as needed to accommodate a larger reply from the + * server. The default values should be small enough to avoid wasted + * memory and excessive use of vmalloc(), yet large enough to avoid + * reallocating the buffer in the common use case. */ + + /* Default EA size for striping attributes. It is initialized at + * mount-time based on the default stripe width of the filesystem, + * then it tracks the largest observed EA size advertised by + * the MDT, up to a maximum value of OBD_MAX_DEFAULT_EA_SIZE. */ + __u32 cl_default_mds_easize; + + /* Maximum possible EA size computed at mount-time based on + * the number of OSTs in the filesystem. May be increased at + * run-time if a larger observed size is advertised by the MDT. 
*/ + __u32 cl_max_mds_easize; + + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + unsigned long cl_dirty_pages; /* all _dirty_ in pages */ + unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ + unsigned long cl_dirty_transit; /* dirty synchronous */ + unsigned long cl_avail_grant; /* bytes of credit for ost */ + unsigned long cl_lost_grant; /* lost credits (trunc) */ + /* grant consumed for dirty pages */ + unsigned long cl_dirty_grant; + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. */ + long cl_reserved_grant; + struct list_head cl_cache_waiters; /* waiting for cache/grant */ + cfs_time_t cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + int cl_grant_shrink_interval; /* seconds */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ + int cl_chunkbits; + /* extent insertion metadata overhead to be accounted in grant, + * in bytes */ + unsigned int cl_grant_extent_tax; + /* maximum extent size, in number of pages */ + unsigned int cl_max_extent_pages; + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. + */ + spinlock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + __u32 cl_r_in_flight; + __u32 cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + __u32 cl_max_pages_per_rpc; + __u32 cl_max_rpcs_in_flight; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /** LRU for osc caching pages */ + struct cl_client_cache *cl_cache; + /** member of cl_cache->ccc_lru */ + struct list_head cl_lru_osc; + /** # of available LRU slots left in the per-OSC cache. + * Available LRU slots are shared by all OSCs of the same file system, + * therefore this is a pointer to cl_client_cache::ccc_lru_left. */ + atomic_long_t *cl_lru_left; + /** # of busy LRU pages. A page is considered busy if it's in writeback + * queue, or in transfer. Busy pages can't be discarded so they are not + * in LRU cache. 
*/ + atomic_long_t cl_lru_busy; + /** # of LRU pages in the cache for this client_obd */ + atomic_long_t cl_lru_in_list; + /** # of threads are shrinking LRU cache. To avoid contention, it's not + * allowed to have multiple threads shrinking LRU cache. */ + atomic_t cl_lru_shrinkers; + /** The time when this LRU cache was last used. */ + time64_t cl_lru_last_used; + /** stats: how many reclaims have happened for this client_obd. + * reclaim and shrink - shrink is async, voluntarily rebalancing; + * reclaim is sync, initiated by IO thread when the LRU slots are + * in shortage. */ + __u64 cl_lru_reclaim; + /** List of LRU pages for this client_obd */ + struct list_head cl_lru_list; + /** Lock for LRU page list */ + spinlock_t cl_lru_list_lock; + /** # of unstable pages in this client_obd. + * An unstable page is a page state that WRITE RPC has finished but + * the transaction has NOT yet committed. */ + atomic_long_t cl_unstable_count; + /** Link to osc_shrinker_list */ + struct list_head cl_shrink_list; + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + struct mdc_rpc_lock *cl_rpc_lock; + + /* modify rpcs in flight + * currently used for metadata only */ + spinlock_t cl_mod_rpcs_lock; + __u16 cl_max_mod_rpcs_in_flight; + __u16 cl_mod_rpcs_in_flight; + __u16 cl_close_rpcs_in_flight; + wait_queue_head_t cl_mod_rpcs_waitq; + unsigned long *cl_mod_tag_bitmap; + struct obd_histogram cl_mod_rpcs_hist; + + /* mgc datastruct */ + struct mutex cl_mgc_mutex; + struct local_oid_storage *cl_mgc_los; + struct dt_object *cl_mgc_configs_dir; + atomic_t cl_mgc_refcount; + struct obd_export *cl_mgc_mgsexp; + + /* checksumming for data sent over the network */ + unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ + cl_checksum_dump:1; /* same */ + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + cksum_type_t cl_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* sequence manager */ + struct lu_client_seq *cl_seq; + struct rw_semaphore cl_seq_rwsem; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + void *cl_lru_work; + /* hash tables for osc_quota_info */ + struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* Links to the global list of registered changelog devices */ + struct list_head cl_chg_dev_linkage; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + u32 idx; + u64 *data; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + __u64 ec_unique; +}; + +/* Generic subset of OSTs */ +struct ost_pool { + __u32 *op_array; /* array of index of + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of lp_array */ + struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 + +struct lov_tgt_desc { + struct list_head ltd_kill; + struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; + struct obd_export *ltd_exp; + __u32 ltd_gen; + __u32 ltd_index; /* index in lov_obd->tgts */ + unsigned long ltd_active:1,/* is this target up for requests */ + 
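The cl_max_mod_rpcs_in_flight/cl_mod_tag_bitmap fields in struct client_obd above cap how many metadata-modifying RPCs a client keeps outstanding, with each in-flight RPC holding one bit of the bitmap as its tag. Below is a minimal user-space sketch of that tag-bitmap idea only; the fixed size, helper names and return conventions are illustrative assumptions, not the kernel implementation.

/* Minimal user-space sketch of the tag-bitmap idea behind
 * cl_mod_tag_bitmap: each modify RPC in flight occupies one bit, and a
 * free bit index (plus 1) is used as the RPC tag.  Names and sizes here
 * are illustrative, not the kernel implementation. */
#include <stdio.h>
#include <stdint.h>

#define MOD_TAGS 8                      /* assumed cl_max_mod_rpcs_in_flight */

static uint32_t mod_tag_bitmap;         /* bit i set => tag i+1 in use */

static int mod_tag_get(void)
{
        for (int i = 0; i < MOD_TAGS; i++) {
                if (!(mod_tag_bitmap & (1u << i))) {
                        mod_tag_bitmap |= 1u << i;
                        return i + 1;   /* tag 0 is reserved/unused */
                }
        }
        return 0;                       /* no slot free: caller must wait */
}

static void mod_tag_put(int tag)
{
        if (tag > 0 && tag <= MOD_TAGS)
                mod_tag_bitmap &= ~(1u << (tag - 1));
}

int main(void)
{
        int t1 = mod_tag_get();
        int t2 = mod_tag_get();

        printf("got tags %d and %d\n", t1, t2);
        mod_tag_put(t1);
        mod_tag_put(t2);
        return 0;
}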
ltd_activate:1,/* should target be activated */ + ltd_reap:1; /* should this target be deleted */ +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + struct proc_dir_entry *targets_proc_entry; + atomic_t lov_refcount; + __u32 lov_death_row; /* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + struct cfs_hash *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + struct proc_dir_entry *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU and unstable data from upper layer */ + struct cl_client_cache *lov_cache; + + struct rw_semaphore lov_notify_lock; +}; + +struct lmv_tgt_desc { + struct obd_uuid ltd_uuid; + struct obd_export *ltd_exp; + __u32 ltd_idx; + struct mutex ltd_fid_mutex; + unsigned long ltd_active:1; /* target up for requests */ +}; + +struct lmv_obd { + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + struct lmv_desc desc; + struct proc_dir_entry *targets_proc_entry; + + struct mutex lmv_init_mutex; + int connected; + int max_easize; + int max_def_easize; + + __u32 tgts_size; /* size of tgts array */ + struct lmv_tgt_desc **tgts; + + struct obd_connect_data conn_data; +}; + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 lnb_len; + __u32 lnb_flags; + int lnb_rc; + struct page *lnb_page; + void *lnb_data; +}; + +struct tgt_thread_big_cache { + struct niobuf_local local[PTLRPC_MAX_BRW_PAGES]; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +static inline int is_lwp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on MDT is fsname-MDTxxxx-lwp-MDTxxxx */ + + if (strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +static inline int is_lwp_on_ost(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* LWP name on OST is fsname-MDTxxxx-lwp-OSTxxxx */ + + if (strncmp(ptr + 1, "OST", 3) != 
0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_LWP_NAME, strlen(LUSTRE_LWP_NAME)) != 0) + return 0; + + return 1; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * and liblustre being main examples). + */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; + struct mutex olg_cat_processing; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; /* OBD_DEVICE_MAGIC */ + int obd_minor; /* device number: lctl dl */ + struct lu_device *obd_lu_dev; + + /* common and UUID name of this device */ + struct obd_uuid obd_uuid; + char obd_name[MAX_OBD_NAME]; + + /* bitfield modification is protected by obd_dev_lock */ + unsigned long + obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1, /* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery enabled; inform clients */ + obd_no_transno:1, /* no committed-transno notification */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
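The is_lwp_on_mdt()/is_lwp_on_ost() helpers above key off the LWP device naming convention ("fsname-MDTxxxx-lwp-MDTxxxx" on an MDT, "fsname-MDTxxxx-lwp-OSTxxxx" on an OST). A simplified user-space check of that convention is sketched below; the helper name and the relaxed matching are assumptions for illustration, not the kernel helpers themselves.

/* Stand-alone sketch of the naming convention the is_lwp_on_mdt()/
 * is_lwp_on_ost() helpers rely on.  Simplified: it only checks the last
 * component's prefix and the presence of a "-lwp-" component. */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool looks_like_lwp(const char *name, const char *tgt_prefix)
{
        const char *last = strrchr(name, '-');

        if (last == NULL || strncmp(last + 1, tgt_prefix, 3) != 0)
                return false;

        /* the component before the target must be "lwp" */
        return strstr(name, "-lwp-") != NULL;
}

int main(void)
{
        printf("%d\n", looks_like_lwp("lustre-MDT0000-lwp-OST0001", "OST")); /* 1 */
        printf("%d\n", looks_like_lwp("lustre-MDT0000-mdc-ffff", "OST"));    /* 0 */
        return 0;
}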
*/ + obd_process_conf:1, /* device is processing mgs config */ + obd_uses_nid_stats:1, /* maintain per-client OBD stats */ + obd_checksum_dump:1; /* dump pages upon cksum error */ + + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + struct cfs_hash *obd_uuid_hash; + /* nid-export hash body */ + struct cfs_hash *obd_nid_hash; + /* nid stats body */ + struct cfs_hash *obd_nid_stats_hash; + /* client_generation-export hash body */ + struct cfs_hash *obd_gen_hash; + struct list_head obd_nid_stats; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + struct list_head obd_lwp_list; + atomic_t obd_refcount; + int obd_num_exports; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + __u64 obd_osfs_age; + __u64 obd_last_committed; + struct mutex obd_dev_mutex; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + struct obd_export *obd_lwp_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time_t obd_eviction_timer; /* for ping evictor */ + + int obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + struct timer_list obd_recovery_timer; + /* seconds */ + time64_t obd_recovery_start; + /* seconds, for lprocfs_status */ + time64_t obd_recovery_end; + time64_t obd_recovery_time_hard; + time64_t obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + struct target_recovery_data obd_recovery_data; + + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + + union { +#ifdef HAVE_SERVER_SUPPORT + struct obd_device_target obt; + struct filter_obd filter; + struct ost_obd ost; + struct echo_obd echo; +#endif + struct client_obd cli; + struct echo_client_obd echo_client; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + + /* Fields used by LProcFS */ + struct lprocfs_stats *obd_stats; + unsigned int obd_cntr_base; + + unsigned int obd_md_cntr_base; + struct lprocfs_stats *obd_md_stats; + + struct proc_dir_entry *obd_proc_entry; + struct proc_dir_entry *obd_proc_exports_entry; + struct proc_dir_entry *obd_svc_procroot; + struct lprocfs_stats *obd_svc_stats; + struct lprocfs_vars *obd_vars; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** 
+ * LDLM pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + __u64 obd_pool_slv; + int obd_pool_limit; + + int obd_conn_inprogress; + + /** + * List of outstanding class_incref()'s fo this OBD. For debugging. */ + struct lu_ref obd_reference; +}; + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_DEFAULT_EASIZE "default_easize" +#define KEY_MGSSEC "mgssec" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" + +#define KEY_CACHE_SET "cache_set" +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_OSP_CONNECTED "osp_connected" + +struct lu_context; + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | + IT_LAYOUT)) + return LCK_CR; + else if (it->it_op & IT_READDIR) + return LCK_PR; + else if (it->it_op & IT_GETXATTR) + return LCK_PR; + else if (it->it_op & IT_SETXATTR) + return LCK_PW; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +enum md_op_flags { + MF_MDC_CANCEL_FID1 = 1 << 0, + MF_MDC_CANCEL_FID2 = 1 << 1, + MF_MDC_CANCEL_FID3 = 1 << 2, + MF_MDC_CANCEL_FID4 = 1 << 3, + MF_GET_MDT_IDX = 1 << 4, +}; + +enum md_cli_flags { + CLI_SET_MEA = 1 << 0, + CLI_RM_ENTRY = 1 << 1, + CLI_HASH64 = 1 << 2, + CLI_API32 = 1 << 3, + CLI_MIGRATE = 1 << 4, +}; + +/** + * GETXATTR is not included as only a couple of fields in the reply body + * is filled, but not FID which is needed for common intent handling in + * mdc_finish_intent_lock() + */ +static inline bool it_has_reply_body(const struct lookup_intent *it) +{ + return it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR); +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usualy parent) */ + struct lu_fid op_fid2; /* operation fid2 (usualy child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ + u32 op_mds; /* what mds server open will go to */ + __u32 op_mode; + struct lustre_handle op_handle; + s64 op_mod_time; + const char *op_name; + size_t op_namelen; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; + size_t op_data_size; + + /* iattr fields and blocks. */ + struct iattr op_attr; + loff_t op_attr_blocks; + __u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ + + enum md_op_flags op_flags; + + /* Various operation flags. 
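it_to_lock_mode() above maps a lookup intent to a DLM lock mode: creation and setxattr intents get a write-capable mode, while lookup/getattr/readdir style intents only need a read mode. The toy model below shows the same shape of mapping; the enum values are local stand-ins, not the real IT_ and LCK_ constants.

/* Toy model of the it_to_lock_mode() mapping: write-ish mode for create,
 * read modes for lookup-style intents.  Local stand-in enums only. */
#include <stdio.h>

enum toy_intent { TI_CREAT = 1, TI_OPEN = 2, TI_GETATTR = 4, TI_READDIR = 8 };
enum toy_mode   { TM_CW, TM_CR, TM_PR, TM_INVAL };

static enum toy_mode toy_intent_to_mode(int it_op)
{
        if (it_op & TI_CREAT)
                return TM_CW;           /* concurrent write for create */
        if (it_op & (TI_OPEN | TI_GETATTR))
                return TM_CR;           /* concurrent read */
        if (it_op & TI_READDIR)
                return TM_PR;           /* protected read */
        return TM_INVAL;
}

int main(void)
{
        printf("%d %d %d\n",
               toy_intent_to_mode(TI_CREAT | TI_OPEN),  /* 0: TM_CW wins */
               toy_intent_to_mode(TI_GETATTR),          /* 1: TM_CR */
               toy_intent_to_mode(TI_READDIR));         /* 2: TM_PR */
        return 0;
}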
*/ + enum mds_op_bias op_bias; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + enum md_cli_flags op_cli_flags; + + /* File object data version for HSM release, on client */ + __u64 op_data_version; + struct lustre_handle op_lease_handle; + + /* File security context, for creates. */ + const char *op_file_secctx_name; + void *op_file_secctx; + __u32 op_file_secctx_size; + + /* default stripe offset */ + __u32 op_default_stripe_offset; + + __u32 op_projid; + + /* Used by readdir */ + unsigned int op_max_pages; + +}; + +struct md_callback { + int (*md_blocking_ast)(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ +typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, + int rc); + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + struct ldlm_enqueue_info mi_einfo; + md_enqueue_cb_t mi_cb; + void *mi_cbdata; +}; + +struct obd_ops { + struct module *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev); + int (*o_cleanup)(struct obd_device *dev); + int (*o_process_config)(struct obd_device *dev, size_t len, void *data); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. 
+ */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *set); + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obdo *oa); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, int rc); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + + /* quota methods */ + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + int (*o_ping)(const struct lu_env *, struct obd_export *exp); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); + /* + * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line + * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. + * Also, add a wrapper function in include/linux/obd_class.h. */ +}; + +/* lmv structures */ +struct lustre_md { + struct mdt_body *body; + struct lu_buf layout; + struct lmv_stripe_md *lmv; +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif +}; + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; + bool mod_is_create; +}; + +struct obd_client_handle { + struct lustre_handle och_fh; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + int och_flags; +}; + +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +struct lookup_intent; +struct cl_attr; + +struct md_ops { + /* Every operation from MD_STATS_FIRST_OP up to and including + * MD_STATS_LAST_OP will be counted by EXP_MD_OP_INCREMENT() + * and will appear in /proc/fs/lustre/{lmv,mdc}/.../md_stats. + * Operations after MD_STATS_LAST_OP are excluded from stats. + * There are a few reasons for doing this: we prune the 17 + * counters which will be of minimal use in understanding + * metadata utilization, we save memory by allocating 15 + * instead of 32 counters, we save cycles by not counting. + * + * MD_STATS_FIRST_OP must be the first member of md_ops. 
+ */ +#define MD_STATS_FIRST_OP m_close + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, size_t, umode_t, uid_t, gid_t, + cfs_cap_t, __u64, struct ptlrpc_request **); + + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + const union ldlm_policy_data *, struct md_op_data *, + struct lustre_handle *, __u64); + + int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + struct lookup_intent *, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, size_t, const char *, size_t, + struct ptlrpc_request **); + + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + size_t , struct ptlrpc_request **); + + int (*m_fsync)(struct obd_export *, const struct lu_fid *, + struct ptlrpc_request **); + + int (*m_read_page)(struct obd_export *, struct md_op_data *, + struct md_callback *cb_op, __u64 hash_offset, + struct page **ppage); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, const char *, int, int, int, u32, + struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + u64, const char *, const char *, int, int, int, + struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + +#define MD_STATS_LAST_OP m_revalidate_lock + + int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, __u32, __u32); + + int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_merge_attr)(struct obd_export *, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, ldlm_blocking_callback); + + int (*m_set_open_replay_data)(struct obd_export *, + struct obd_client_handle *, + struct lookup_intent *); + + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + + int (*m_set_lock_data)(struct obd_export *, + const struct lustre_handle *, void *, __u64 *); + + enum ldlm_mode (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, enum ldlm_type, + union ldlm_policy_data *, enum ldlm_mode, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + union ldlm_policy_data *, enum ldlm_mode, + enum ldlm_cancel_flags flags, void *opaque); + + int (*m_get_fid_from_lsm)(struct obd_export *, + const struct lmv_stripe_md *, + const char *name, int namelen, + struct lu_fid *fid); + int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmv, size_t lmv_size); +}; + +static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + OBD_ALLOC_PTR(mod); + 
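obd_mod_alloc() here, together with the obd_mod_get()/obd_mod_put() helpers that follow, manages struct md_open_data by reference count: allocate at one, take a reference per user, free on the last put. A stand-alone sketch of that pattern is shown below, with plain malloc/free standing in for the kernel allocators and a simplified stand-in type.

/* Sketch of the md_open_data reference-counting pattern.  Not the kernel
 * code: toy_mod stands in for md_open_data, malloc/free for OBD_ALLOC/FREE. */
#include <stdio.h>
#include <stdlib.h>

struct toy_mod {
        int refcount;
};

static struct toy_mod *toy_mod_alloc(void)
{
        struct toy_mod *mod = calloc(1, sizeof(*mod));

        if (mod != NULL)
                mod->refcount = 1;
        return mod;
}

static void toy_mod_get(struct toy_mod *mod)
{
        mod->refcount++;
}

static void toy_mod_put(struct toy_mod *mod)
{
        if (--mod->refcount == 0)
                free(mod);              /* last reference dropped */
}

int main(void)
{
        struct toy_mod *mod = toy_mod_alloc();

        toy_mod_get(mod);               /* e.g. an open request holds a ref */
        toy_mod_put(mod);
        toy_mod_put(mod);               /* frees */
        printf("done\n");
        return 0;
}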
if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); +void obdo_set_o_projid(struct obdo *dst, u32 projid); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, size_t namelen, + int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = simple_strtoul(start, &end, 16); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) && (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; +} + +/* when RPC size or the max RPCs in flight is increased, the max dirty pages + * of the client should be increased accordingly to avoid sending fragmented + * RPCs over the network when the client runs out of the maximum dirty space + * when so many RPCs are being generated. 
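The comment above motivates growing the client's dirty-page limit with the RPC size and concurrency; client_adjust_max_dirty() below implements it, capping the result at one eighth of RAM. The following is a user-space model of that calculation with made-up numbers, not the kernel function.

/* User-space model of the client_adjust_max_dirty() policy: allow roughly
 * max_rpcs_in_flight full-size RPCs worth of dirty pages, but never more
 * than 1/8 of RAM.  All inputs here are invented for illustration. */
#include <stdio.h>

static unsigned long adjust_max_dirty(unsigned long cur_max_pages,
                                      unsigned long max_rpcs_in_flight,
                                      unsigned long max_pages_per_rpc,
                                      unsigned long totalram_pages)
{
        unsigned long dirty_max = max_rpcs_in_flight * max_pages_per_rpc;

        if (dirty_max > cur_max_pages)
                cur_max_pages = dirty_max;
        if (cur_max_pages > totalram_pages / 8)
                cur_max_pages = totalram_pages / 8;
        return cur_max_pages;
}

int main(void)
{
        /* 8 RPCs in flight x 1024 pages/RPC on a 1M-page machine */
        printf("%lu pages\n", adjust_max_dirty(2048, 8, 1024, 1 << 20));
        return 0;
}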
+ */ +static inline void client_adjust_max_dirty(struct client_obd *cli) +{ + /* initializing */ + if (cli->cl_dirty_max_pages <= 0) + cli->cl_dirty_max_pages = (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) + >> PAGE_SHIFT; + else { + unsigned long dirty_max = cli->cl_max_rpcs_in_flight * + cli->cl_max_pages_per_rpc; + + if (dirty_max > cli->cl_dirty_max_pages) + cli->cl_dirty_max_pages = dirty_max; + } + + if (cli->cl_dirty_max_pages > totalram_pages / 8) + cli->cl_dirty_max_pages = totalram_pages / 8; +} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cache.h b/drivers/staging/lustrefsx/lustre/include/obd_cache.h new file mode 100644 index 0000000000000..3378e5fc93375 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_cache.h @@ -0,0 +1,35 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h new file mode 100644 index 0000000000000..6a0cfe8d72fc0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h @@ -0,0 +1,175 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include +#include +#include + +static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. */ +static inline u32 cksum_type_pack(cksum_type_t cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | + OBD_CKSUM_CRC32 | + OBD_CKSUM_ADLER)))) + CWARN("unknown cksum type %x\n", cksum_type); + + return flag; +} + +static inline cksum_type_t cksum_type_unpack(u32 o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. 
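cksum_type_pack() above chooses among the checksum flags by comparing the locally benchmarked speed of each algorithm and falling back to Adler-32, which every peer since 1.8 understands. Below is a self-contained sketch of that selection policy; the speed table, enum values and helper name are invented for illustration.

/* Sketch of the "pick the fastest mutually supported checksum" policy.
 * The speed numbers are made up; this is not the kernel code. */
#include <stdio.h>

enum { CK_ADLER = 1, CK_CRC32 = 2, CK_CRC32C = 4 };

struct ck_speed { int type; int mbps; };

static int pick_fastest(int supported, const struct ck_speed *tbl, int n)
{
        int best = CK_ADLER;            /* baseline every peer understands */
        int best_mbps = 0;

        for (int i = 0; i < n; i++) {
                if ((supported & tbl[i].type) && tbl[i].mbps > best_mbps) {
                        best = tbl[i].type;
                        best_mbps = tbl[i].mbps;
                }
        }
        return best;
}

int main(void)
{
        const struct ck_speed speeds[] = {
                { CK_ADLER,  1500 },
                { CK_CRC32,  1800 },
                { CK_CRC32C, 9000 },    /* hardware-accelerated, say */
        };

        printf("picked %d\n",
               pick_fastest(CK_ADLER | CK_CRC32C, speeds, 3)); /* 4 = CRC32C */
        return 0;
}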
+ * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline cksum_type_t cksum_types_supported_client(void) +{ + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + +/* Server uses algos that perform at 50% or better of the Adler */ +static inline enum cksum_types cksum_types_supported_server(void) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. + * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +{ + return cksum_type_unpack(cksum_type_pack(cksum_types)); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. */ +#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h new file mode 100644 index 0000000000000..729d34ad91fe2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -0,0 +1,1744 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#ifndef __CLASS_OBD_H +#define __CLASS_OBD_H + + +#include +#include +#include +#include +#include +#include +#include + +#define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay + * and resends for avoid deadlocks */ +#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update + * obd_osfs_age */ +#define OBD_STATFS_FOR_MDT0 0x0004 /* The statfs is only for retrieving + * information from MDT0. */ + +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_conn2obd(struct lustre_handle *); +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +extern int lustre_get_jobid(char *jobid); + +struct lu_device_type; + +/* genops.c */ +struct obd_export *class_conn2export(struct lustre_handle *); +int class_register_type(struct obd_ops *, struct md_ops *, bool enable_proc, + struct lprocfs_vars *module_vars, + const char *nm, struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid); +int class_register_device(struct obd_device *obd); +void class_unregister_device(struct obd_device *obd); +void class_free_dev(struct obd_device *obd); + +struct obd_device *class_dev_by_str(const char *str); +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid); +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device * class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +char *obd_export_nid2str(struct obd_export *exp); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr * kuc_ptr(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); +int obd_get_request_slot(struct client_obd *cli); +void obd_put_request_slot(struct client_obd *cli); +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max); +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max); +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq); + +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it); +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it, __u16 tag); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); + +struct obd_export *obd_stale_export_get(void); +void obd_stale_export_put(struct obd_export *exp); +void obd_stale_export_adjust(struct obd_export *exp); 
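obd_get_request_slot()/obd_put_request_slot() and the max_rpcs_in_flight setters declared above throttle how many RPCs a client_obd keeps outstanding at once. A minimal single-threaded model of that slot accounting is sketched below; real callers sleep on a wait queue instead of failing, and the types here are simplified stand-ins.

/* Single-threaded model of request-slot throttling: at most
 * max_in_flight requests may hold a slot, the rest would queue. */
#include <stdio.h>
#include <stdbool.h>

struct slot_pool {
        unsigned int in_flight;
        unsigned int max_in_flight;     /* cl_max_rpcs_in_flight analogue */
};

static bool slot_get(struct slot_pool *p)
{
        if (p->in_flight >= p->max_in_flight)
                return false;           /* caller would sleep on a waitq */
        p->in_flight++;
        return true;
}

static void slot_put(struct slot_pool *p)
{
        if (p->in_flight > 0)
                p->in_flight--;
}

int main(void)
{
        struct slot_pool pool = { .in_flight = 0, .max_in_flight = 2 };
        int a = slot_get(&pool);
        int b = slot_get(&pool);
        int c = slot_get(&pool);

        printf("%d %d %d\n", a, b, c);  /* 1 1 0: third request must wait */
        slot_put(&pool);
        return 0;
}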
+ +/* obd_config.c */ +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index); +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +void print_lustre_cfg(struct lustre_cfg *lcfg); +int class_process_config(struct lustre_cfg *lcfg); +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); + +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, const char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, u32 net); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks, int debug_level); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ +#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + char *cfg_obdname; + void *cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; + __u32 cfg_lwp_idx; + __u32 cfg_sub_clds; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +#define CONFIG_SUB_SPTLRPC 0x01 +#define CONFIG_SUB_RECOVER 0x02 +#define CONFIG_SUB_PARAMS 0x04 +#define CONFIG_SUB_NODEMAP 0x08 +#define CONFIG_SUB_BARRIER 0x10 + +/* Sub clds should be attached to the config_llog_data when processing + * config log for client or server target. 
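class_find_param()/class_match_param() above are the string helpers used while processing configuration records of the form key=value. The sketch below is an illustrative user-space equivalent of the matching step only; the helper name and semantics are simplified assumptions, not the kernel functions.

/* Simplified "key=value" matcher: return a pointer to the value part
 * when the buffer starts with the given key. */
#include <stdio.h>
#include <string.h>

static const char *match_param(const char *buf, const char *key)
{
        size_t klen = strlen(key);

        if (strncmp(buf, key, klen) != 0)
                return NULL;
        return buf + klen;              /* points at the value */
}

int main(void)
{
        const char *val = match_param("osc.max_dirty_mb=512",
                                      "osc.max_dirty_mb=");

        printf("%s\n", val ? val : "no match");         /* 512 */
        return 0;
}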
*/ +#define CONFIG_SUB_CLIENT (CONFIG_SUB_SPTLRPC | CONFIG_SUB_RECOVER | \ + CONFIG_SUB_PARAMS) +#define CONFIG_SUB_SERVER (CONFIG_SUB_CLIENT | CONFIG_SUB_NODEMAP | \ + CONFIG_SUB_BARRIER) + +#define PARAMS_FILENAME "params" +#define BARRIER_FILENAME "barrier" +#define LCTL_UPCALL "lctl" + +static inline bool logname_is_barrier(const char *logname) +{ + char *ptr; + + /* logname for barrier is "fsname-barrier" */ + ptr = strstr(logname, BARRIER_FILENAME); + if (ptr && (ptr - logname) >= 2 && + *(ptr - 1) == '-' && *(ptr + 7) == '\0') + return true; + + return false; +} + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain; + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_params; /* common parameters log */ + struct config_llog_data *cld_recover;/* imperative recover log */ + struct config_llog_data *cld_nodemap;/* nodemap log */ + struct config_llog_data *cld_barrier;/* barrier log (for MDT only) */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + int cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; + int lp_refs; + bool lp_list_deleted; +}; + +struct lustre_profile *class_get_profile(const char * prof); +void class_del_profile(const char *prof); +void class_put_profile(struct lustre_profile *lprof); +void class_del_profiles(void); + + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while(0) +#define __class_export_del_lock_ref(exp, lock) do {} while(0) + +#endif + +#define class_export_rpc_inc(exp) \ +({ \ + atomic_inc(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_rpc_dec(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \ + atomic_dec(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + 
(exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obddev, + struct obd_uuid *cluuid); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obddev); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) | + 0); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct lu_target *class_exp2tgt(struct obd_export *exp) +{ + LASSERT(exp->exp_obd); + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC) + return NULL; + return exp->exp_obd->u.obt.obt_lut; +} + +static inline struct lr_server_data *class_server_data(struct obd_device *obd) +{ + LASSERT(obd->u.obt.obt_lut); + return &obd->u.obt.obt_lut->lut_lsd; +} +#endif + +/* obdo.c */ +struct lu_attr; +struct inode; + +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid); +void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid); +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); + +#define OBT(dev) (dev)->obd_type +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op +#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op + +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +#define OBD_CHECK_DEV(obd) \ +do { \ + if (!(obd)) { \ + CERROR("NULL device\n"); \ + RETURN(-ENODEV); \ + } \ +} while (0) + +/* ensure obd_setup and !obd_stopping */ +#define OBD_CHECK_DEV_ACTIVE(obd) \ +do { \ + OBD_CHECK_DEV(obd); \ + if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ + CERROR("Device %d not setup\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ + } \ +} while (0) + + +#ifdef CONFIG_PROC_FS +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)NULL)->o_iocontrol)) + +/* The '- 1' below is for o_owner. 
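OBD_COUNTER_OFFSET() above (and NUM_OBD_STATS just after) derive a per-operation statistics index from the byte offset of the method pointer inside struct obd_ops, which works because the ops table is effectively an array of equally sized function pointers. The stand-alone demonstration below uses a simplified stand-in ops struct, not the real obd_ops.

/* Demonstration of the offsetof()-based counter-index trick with a toy
 * ops table: offset of a member divided by the pointer size gives a
 * stable per-op index. */
#include <stdio.h>
#include <stddef.h>

struct toy_ops {
        void *o_owner;                  /* not counted, like o_owner */
        int (*o_iocontrol)(void);
        int (*o_get_info)(void);
        int (*o_connect)(void);
};

#define TOY_OP_OFFSET(op) \
        ((offsetof(struct toy_ops, o_ ## op) - \
          offsetof(struct toy_ops, o_iocontrol)) / \
         sizeof(((struct toy_ops *)NULL)->o_iocontrol))

int main(void)
{
        printf("iocontrol=%zu get_info=%zu connect=%zu\n",
               TOY_OP_OFFSET(iocontrol), TOY_OP_OFFSET(get_info),
               TOY_OP_OFFSET(connect));                 /* 0 1 2 */
        return 0;
}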
*/ +#define NUM_OBD_STATS \ + (sizeof(struct obd_ops) / \ + sizeof(((struct obd_ops *)NULL)->o_iocontrol) - 1) + +#define OBD_COUNTER_INCREMENT(obd, op) \ + lprocfs_counter_incr((obd)->obd_stats, \ + (obd)->obd_cntr_base + OBD_COUNTER_OFFSET(op)) + +#define EXP_COUNTER_INCREMENT(exp, op) \ + do { \ + unsigned int _off; \ + _off = (exp)->exp_obd->obd_cntr_base + OBD_COUNTER_OFFSET(op); \ + lprocfs_counter_incr((exp)->exp_obd->obd_stats, _off); \ + if ((exp)->exp_obd->obd_uses_nid_stats && \ + (exp)->exp_nid_stats != NULL) \ + lprocfs_counter_incr((exp)->exp_nid_stats->nid_stats, \ + _off); \ + } while (0) + +#define _MD_COUNTER_OFFSET(m_op) \ + ((offsetof(struct md_ops, m_op) - \ + offsetof(struct md_ops, MD_STATS_FIRST_OP)) / \ + sizeof(((struct md_ops *)NULL)->MD_STATS_FIRST_OP)) + +#define MD_COUNTER_OFFSET(op) _MD_COUNTER_OFFSET(m_ ## op) + +#define NUM_MD_STATS \ + (_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) - \ + _MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) + 1) + +/* Note that we only increment md counters for ops whose offset is less + * than NUM_MD_STATS. This is explained in a comment in the definition + * of struct md_ops. */ +#define EXP_MD_COUNTER_INCREMENT(exp, op) \ + do { \ + if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) \ + lprocfs_counter_incr((exp)->exp_obd->obd_md_stats, \ + (exp)->exp_obd->obd_md_cntr_base + \ + MD_COUNTER_OFFSET(op)); \ + } while (0) + +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#define EXP_COUNTER_INCREMENT(exp, op) +#define EXP_MD_COUNTER_INCREMENT(exp, op) +#endif + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = + lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC, + LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +#define EXP_CHECK_MD_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ + CERROR("%s: obd_" #op ": dev %d no operation\n", \ + (exp)->exp_obd->obd_name, \ + (exp)->exp_obd->obd_minor); \ + RETURN(-EOPNOTSUPP); \ + } \ +} while (0) + + +#define OBD_CHECK_DT_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !OBP((obd), op)) { \ + if (err) \ + CERROR("%s: no obd_" #op " operation\n", \ + obd->obd_name); \ + RETURN(err); \ + } \ +} while (0) + +#define EXP_CHECK_DT_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + OBD_CHECK_DT_OP((exp)->exp_obd, op, -EOPNOTSUPP); \ +} while (0) + +#define CTXT_CHECK_OP(ctxt, op, err) \ +do { \ + if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ + if (err) \ + CERROR("%s: no lop_" #op "operation\n", \ + ctxt->loc_obd->obd_name); \ + RETURN(err); \ + } \ +} while (0) + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 *vallen, void *val) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, get_info); + EXP_COUNTER_INCREMENT(exp, get_info); + + rc = 
OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); + RETURN(rc); +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, set_info_async); + EXP_COUNTER_INCREMENT(exp, set_info_async); + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + RETURN(rc); +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. + * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. + */ + +#define DECLARE_LU_VARS(ldt, d) \ + struct lu_device_type *ldt; \ + struct lu_device *d + +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + ldt = obd->obd_type->typ_lu; + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + env.le_ses = &session_ctx; + d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(d)) { + obd->obd_lu_dev = d; + d->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(d); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { + OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, setup); + rc = OBP(obd, setup)(obd, cfg); + } + RETURN(rc); +} + +static inline int obd_precleanup(struct obd_device *obd) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_fini(&env, d); + lu_env_fini(&env); + } + } + OBD_CHECK_DT_OP(obd, precleanup, 0); + OBD_COUNTER_INCREMENT(obd, precleanup); + + rc = OBP(obd, precleanup)(obd); + RETURN(rc); +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + OBD_CHECK_DT_OP(obd, cleanup, 0); + OBD_COUNTER_INCREMENT(obd, cleanup); + + rc = OBP(obd, cleanup)(obd); + RETURN(rc); +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + ENTRY; + + /* If we set up but never connected, the + client import will not have been cleaned. 
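The obd-lu integration comment and obd_setup()/obd_precleanup()/obd_cleanup() above branch between the new lu_device path and the classic o_setup()/o_cleanup() methods, calling one or the other. The toy dispatcher below illustrates just that branch; all types and the error value are simplified stand-ins, not the Lustre structures.

/* Toy version of the setup dispatch: prefer the lu_device path when the
 * type provides one, otherwise fall back to the classic method table. */
#include <stdio.h>

struct toy_obd;

struct toy_lu_type { int (*alloc)(struct toy_obd *obd); };
struct toy_obd_ops { int (*o_setup)(struct toy_obd *obd); };

struct toy_obd {
        const struct toy_lu_type *lu_type;      /* may be NULL */
        const struct toy_obd_ops *ops;
};

static int toy_setup(struct toy_obd *obd)
{
        if (obd->lu_type != NULL)
                return obd->lu_type->alloc(obd);        /* lu path */
        if (obd->ops != NULL && obd->ops->o_setup != NULL)
                return obd->ops->o_setup(obd);          /* classic path */
        return -95;                                     /* like -EOPNOTSUPP */
}

static int classic_setup(struct toy_obd *obd) { (void)obd; return 0; }

int main(void)
{
        static const struct toy_obd_ops ops = { .o_setup = classic_setup };
        struct toy_obd obd = { .lu_type = NULL, .ops = &ops };

        printf("rc=%d\n", toy_setup(&obd));     /* classic path, rc=0 */
        return 0;
}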
*/ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); + + EXIT; +} + +static inline int +obd_process_config(struct obd_device *obd, int datalen, void *data) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + + obd->obd_process_conf = 1; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + rc = OBP(obd, process_config)(obd, datalen, data); + } + OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; + + RETURN(rc); +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, create); + EXP_COUNTER_INCREMENT(exp, create); + + rc = OBP(exp->exp_obd, create)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, destroy); + EXP_COUNTER_INCREMENT(exp, destroy); + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); + RETURN(rc); +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + EXP_CHECK_DT_OP(exp, getattr); + EXP_COUNTER_INCREMENT(exp, getattr); + rc = OBP(exp->exp_obd, getattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + int rc; + + ENTRY; + EXP_CHECK_DT_OP(exp, setattr); + EXP_COUNTER_INCREMENT(exp, setattr); + rc = OBP(exp->exp_obd, setattr)(env, exp, oa); + + RETURN(rc); +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, add_conn); + + rc = OBP(obd, add_conn)(imp, uuid, priority); + RETURN(rc); +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, del_conn); + + rc = OBP(obd, del_conn)(imp, uuid); + RETURN(rc); +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); + EXP_COUNTER_INCREMENT(exp, get_uuid); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + RETURN(uuid); +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp,struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? 
data->ocd_connect_flags : 0; /* for post-condition + * check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, connect); + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition + * check */ + + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, disconnect); + EXP_COUNTER_INCREMENT(exp, disconnect); + + rc = OBP(exp->exp_obd, disconnect)(exp); + RETURN(rc); +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, fid_init, 0); + OBD_COUNTER_INCREMENT(obd, fid_init); + + rc = OBP(obd, fid_init)(obd, exp, type); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, fid_fini, 0); + OBD_COUNTER_INCREMENT(obd, fid_fini); + + rc = OBP(obd, fid_fini)(obd); + RETURN(rc); +} + +static inline int obd_fid_alloc(const struct lu_env *env, + struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, fid_alloc); + EXP_COUNTER_INCREMENT(exp, fid_alloc); + + rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); + RETURN(rc); +} + +static inline int obd_ping(const struct lu_env *env, struct obd_export *exp) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, ping, 0); + EXP_COUNTER_INCREMENT(exp, ping); + + rc = OBP(exp->exp_obd, ping)(env, exp); + RETURN(rc); +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); + + rc = OBP(obd, pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline void obd_getref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, getref)) { + OBD_COUNTER_INCREMENT(obd, getref); + OBP(obd, getref)(obd); + } + EXIT; +} + +static inline void obd_putref(struct obd_device 
*obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, putref)) { + OBD_COUNTER_INCREMENT(obd, putref); + OBP(obd, putref)(obd); + } + EXIT; +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + int rc = 0; + struct obd_device *obd; + ENTRY; + + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", + obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + RETURN(rc); +} + +static inline int obd_statfs_rqset(struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { + .oi_osfs = osfs, + .oi_flags = flags, + }; + int rc = 0; + + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = obd_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + + ptlrpc_set_destroy(set); + + RETURN(rc); +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. 
*/ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + int rc = 0; + struct obd_device *obd = exp->exp_obd; + ENTRY; + + if (obd == NULL) + RETURN(-EINVAL); + + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_CHECK_DEV_ACTIVE(obd); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", + obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + if (rc == 0) { + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + } + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu" + " objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + RETURN(rc); +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local) +{ + int rc; + + ENTRY; + EXP_CHECK_DT_OP(exp, preprw); + EXP_COUNTER_INCREMENT(exp, preprw); + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local); + + RETURN(rc); +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, int rc) +{ + ENTRY; + + EXP_CHECK_DT_OP(exp, commitrw); + EXP_COUNTER_INCREMENT(exp, commitrw); + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, rc); + + RETURN(rc); +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, iocontrol); + EXP_COUNTER_INCREMENT(exp, iocontrol); + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + RETURN(rc); +} + +static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + ENTRY; + if (!obd) { + CERROR("NULL device\n"); + EXIT; + return; + } + if (obd->obd_set_up && OBP(obd, import_event)) { + OBD_COUNTER_INCREMENT(obd, import_event); + OBP(obd, import_event)(obd, imp, event); + } + EXIT; +} + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc; + ENTRY; + OBD_CHECK_DEV(obd); + + if (!obd->obd_set_up) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + RETURN(-ENOSYS); + } + + OBD_COUNTER_INCREMENT(obd, notify); + rc = OBP(obd, notify)(obd, watched, ev); + + RETURN(rc); +} + +static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev) +{ + int rc = 0; + int rc2 = 0; + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc = obd_notify(observer->obd_observer, observed, ev); + + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, 
onu->onu_owner); + + return rc ? rc : rc2; +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, quotactl); + EXP_COUNTER_INCREMENT(exp, quotactl); + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + ENTRY; + + /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + ENTRY; + OBD_CHECK_DEV(obd); + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + RETURN(-EALREADY); + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + RETURN(0); +} + +/* metadata helpers */ +static inline int md_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + int rc; + + ENTRY; + EXP_CHECK_MD_OP(exp, get_root); + EXP_MD_COUNTER_INCREMENT(exp, get_root); + rc = MDP(exp->exp_obd, get_root)(exp, fileset, fid); + + RETURN(rc); +} + +static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr); + EXP_MD_COUNTER_INCREMENT(exp, getattr); + rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, null_inode); + EXP_MD_COUNTER_INCREMENT(exp, null_inode); + rc = MDP(exp->exp_obd, null_inode)(exp, fid); + RETURN(rc); +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, close); + EXP_MD_COUNTER_INCREMENT(exp, close); + rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); + RETURN(rc); +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, + __u64 rdev, struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, create); + EXP_MD_COUNTER_INCREMENT(exp, create); + rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, request); + RETURN(rc); +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, enqueue); + EXP_MD_COUNTER_INCREMENT(exp, enqueue); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); + RETURN(rc); +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr_name); + 
EXP_MD_COUNTER_INCREMENT(exp, getattr_name); + rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, intent_lock); + EXP_MD_COUNTER_INCREMENT(exp, intent_lock); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + RETURN(rc); +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, link); + EXP_MD_COUNTER_INCREMENT(exp, link); + rc = MDP(exp->exp_obd, link)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, + size_t newlen, struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, rename); + EXP_MD_COUNTER_INCREMENT(exp, rename); + rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, + newlen, request); + RETURN(rc); +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, setattr); + EXP_MD_COUNTER_INCREMENT(exp, setattr); + rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); + RETURN(rc); +} + +static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + int rc; + + ENTRY; + EXP_CHECK_MD_OP(exp, fsync); + EXP_MD_COUNTER_INCREMENT(exp, fsync); + rc = MDP(exp->exp_obd, fsync)(exp, fid, request); + + RETURN(rc); +} + +static inline int md_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 hash_offset, + struct page **ppage) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, read_page); + EXP_MD_COUNTER_INCREMENT(exp, read_page); + rc = MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, + ppage); + RETURN(rc); +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unlink); + EXP_MD_COUNTER_INCREMENT(exp, unlink); + rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, get_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); + RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md)); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, free_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); + RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md)); +} + +static inline int md_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, merge_attr); + EXP_MD_COUNTER_INCREMENT(exp, merge_attr); + RETURN(MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb)); +} + +static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, 
__u32 suppgid, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, setxattr); + EXP_MD_COUNTER_INCREMENT(exp, setxattr); + RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, valid, name, input, + input_size, output_size, flags, + suppgid, request)); +} + +static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, getxattr); + EXP_MD_COUNTER_INCREMENT(exp, getxattr); + RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, valid, name, input, + input_size, output_size, flags, + request)); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, set_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); + RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, it)); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, clear_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); + RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och)); +} + +static inline int md_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, set_lock_data); + EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); + RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits)); +} + +static inline +int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags cancel_flags, void *opaque) +{ + int rc; + ENTRY; + + EXP_CHECK_MD_OP(exp, cancel_unused); + EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + + rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); + RETURN(rc); +} + +static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, lock_match); + EXP_MD_COUNTER_INCREMENT(exp, lock_match); + RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh)); +} + +static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, + __u32 def_ea_size) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, init_ea_size); + EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); + RETURN(MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size)); +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, intent_getattr_async); + EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); + rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); + RETURN(rc); +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, revalidate_lock); + EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); + rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + RETURN(rc); +} + +static inline int md_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, get_fid_from_lsm); + EXP_MD_COUNTER_INCREMENT(exp, 
get_fid_from_lsm); + rc = MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, fid); + RETURN(rc); +} + + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If *plsm != NULL and lmm == NULL then *lsm will be freed. + * If *plsm == NULL then it will be allocated. + */ +static inline int md_unpackmd(struct obd_export *exp, + struct lmv_stripe_md **plsm, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unpackmd); + EXP_MD_COUNTER_INCREMENT(exp, unpackmd); + rc = MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); + RETURN(rc); +} + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +/* support routines */ +extern struct kmem_cache *obdo_cachep; + +#define OBDO_ALLOC(ptr) \ +do { \ + OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \ +} while(0) + +#define OBDO_FREE(ptr) \ +do { \ + OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ +} while(0) + + +typedef int (*register_lwp_cb)(void *data); + +struct lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + atomic_t lri_ref; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* I'm as embarrassed about this as you are. + * + * // XXX do not look into _superhack with remaining eye + * // XXX if this were any uglier, I'd get my own show on MTV */ +extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); + +/* obd_mount.c */ +#ifdef HAVE_SERVER_SUPPORT +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data); +void lustre_deregister_lwp_item(struct obd_export **exp); +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx); +void lustre_notify_lwp_list(struct obd_export *exp); +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx); +#endif /* HAVE_SERVER_SUPPORT */ +int lustre_register_fs(void); +int lustre_unregister_fs(void); +int lustre_check_exclusion(struct super_block *sb, char *svname); + +/* sysctl.c */ +extern int obd_sysctl_init(void); +extern void obd_sysctl_clean(void); + +/* uuid.c */ +typedef __u8 class_uuid_t[16]; +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); + +/* class_obd.c */ +extern char obd_jobid_node[]; + +/* prng.c */ +#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) + +/* statfs_pack.c */ +struct kstatfs; +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* root squash info */ +struct rw_semaphore; +struct root_squash_info { + uid_t rsi_uid; + gid_t rsi_gid; + struct list_head rsi_nosquash_nids; + struct rw_semaphore rsi_sem; +}; + +int server_name2index(const char *svname, __u32 *idx, const char **endptr); + +/* linux-module.c */ +extern struct miscdevice obd_psdev; +int obd_ioctl_getdata(char **buf, int *len, void __user *arg); +int class_procfs_init(void); +int class_procfs_clean(void); + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h new file mode 100644 
index 0000000000000..19c179dfb1507 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -0,0 +1,911 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#ifndef __KERNEL__ +# error Userspace should not include obd_support.h. +#endif /* !__KERNEL__ */ + +#include +#include +#include +#include + +#include +#include +#include + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int bulk_timeout; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned long obd_max_dirty_pages; +extern atomic_long_t obd_dirty_pages; +extern atomic_long_t obd_dirty_transit_pages; +extern char obd_jobid_var[]; + +/* Some hash init argument constants */ +#define HASH_POOLS_BKT_BITS 3 +#define HASH_POOLS_CUR_BITS 3 +#define HASH_POOLS_MAX_BITS 7 +#define HASH_UUID_BKT_BITS 5 +#define HASH_UUID_CUR_BITS 7 +#define HASH_UUID_MAX_BITS 12 +#define HASH_NID_BKT_BITS 5 +#define HASH_NID_CUR_BITS 7 +#define HASH_NID_MAX_BITS 12 +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_GEN_BKT_BITS 5 +#define HASH_GEN_CUR_BITS 7 +#define HASH_GEN_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_CONN_BKT_BITS 5 +#define HASH_CONN_CUR_BITS 5 +#define HASH_CONN_MAX_BITS 15 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_CL_ENV_BKT_BITS 5 +#define HASH_CL_ENV_BITS 10 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define 
OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define LONG_UNLINK 300 /* Unlink should happen before now */ + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GET_ROOT_NET 0x11b +#define 
OBD_FAIL_MDS_GET_ROOT_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +/* OBD_FAIL_MDS_DONE_WRITING_NET 0x126 obsolete since 2.8.0 */ +/* OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 obsolete since 2.8.0 */ +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +/* OBD_FAIL_MDS_QUOTACHECK_NET 0x12d obsolete since 2.4 */ +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f +#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 +#define OBD_FAIL_MDS_RENAME 0x153 +#define OBD_FAIL_MDS_RENAME2 0x154 +#define OBD_FAIL_MDS_RENAME3 0x155 +#define OBD_FAIL_MDS_RENAME4 0x156 +#define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157 +#define OBD_FAIL_MDS_STALE_DIR_LAYOUT 0x158 +#define OBD_FAIL_MDS_REINT_MULTI_NET 0x159 +#define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b +#define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_INTENT_DELAY 0x160 +#define OBD_FAIL_MDS_XATTR_REP 0x161 +#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 +#define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 +#define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 
+#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 +#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 +#define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 +#define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 +#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_SRV_ENOENT 0x217 +/* OBD_FAIL_OST_QUOTACHECK_NET 0x218 obsolete since 2.4 */ +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 +#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 +#define OBD_FAIL_OST_SET_INFO_NET 0x232 +#define OBD_FAIL_OST_NODESTROY 0x233 +#define OBD_FAIL_OST_READ_SIZE 0x234 +#define OBD_FAIL_OST_LADVISE_NET 0x235 +#define OBD_FAIL_OST_PAUSE_PUNCH 0x236 +#define OBD_FAIL_OST_LADVISE_PAUSE 0x237 +#define OBD_FAIL_OST_FAKE_RW 0x238 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b 
+#define OBD_FAIL_LDLM_OST_LVB 0x31c +#define OBD_FAIL_LDLM_ENQUEUE_HANG 0x31d +#define OBD_FAIL_LDLM_BL_EVICT 0x31e +#define OBD_FAIL_LDLM_PAUSE_CANCEL2 0x31f +#define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 +#define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 +#define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 +#define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 +#define OBD_FAIL_LDLM_SRV_BL_AST 0x324 +#define OBD_FAIL_LDLM_SRV_CP_AST 0x325 +#define OBD_FAIL_LDLM_SRV_GL_AST 0x326 +#define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 +#define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 + +#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a +#define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +/* #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b Obsolete since 2.9 */ +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 +#define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 +#define OBD_FAIL_OSC_DELAY_IO 0x414 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 +#define OBD_FAIL_PTLRPC_DROP_BULK 0x51a +#define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b +#define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 + +#define OBD_FAIL_OBD_PING_NET 0x600 +#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +#define OBD_FAIL_OBD_LOGD_NET 0x602 +/* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 +#define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define 
OBD_FAIL_TGT_REPLAY_DELAY 0x709 +/* #define OBD_FAIL_TGT_LAST_REPLAY 0x710 (obsoleted) */ +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 +#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 +#define OBD_FAIL_TGT_REPLAY_DELAY2 0x714 +#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 +#define OBD_FAIL_TGT_MOUNT_RACE 0x716 +#define OBD_FAIL_TGT_REPLAY_TIMEOUT 0x717 +#define OBD_FAIL_TGT_CLIENT_DEL 0x718 +#define OBD_FAIL_TGT_SLUGGISH_NET 0x719 +#define OBD_FAIL_TGT_RCVD_EIO 0x720 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 +#define OBD_FAIL_MDC_CLOSE 0x806 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 +#define OBD_FAIL_MGS_CONNECT_NET 0x906 +#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 +#define OBD_FAIL_MGS_SET_INFO_NET 0x908 +#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 +#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a +#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b +#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c +#define OBD_FAIL_MGS_LDLM_REPLY_NET 0x90d +#define OBD_FAIL_MGS_WRITE_TARGET_DELAY 0x90e + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 +#define OBD_FAIL_FLD_READ_NET 0x1102 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 0x1300 +#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 +#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 +#define OBD_FAIL_CAT_RECORDS 0x1312 + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 +#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 +#define OBD_FAIL_MAKE_LOVEA_HOLE 0x1406 +#define OBD_FAIL_LLITE_LOST_LAYOUT 0x1407 +#define OBD_FAIL_LLITE_NO_CHECK_DEAD 0x1408 +#define OBD_FAIL_GETATTR_DELAY 0x1409 +#define OBD_FAIL_LLITE_CREATE_FILE_PAUSE 0x1409 +#define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a +#define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b +#define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c +#define OBD_FAIL_LLITE_PTASK_IO_FAIL 0x140d +#define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e +#define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 
0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 +#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c +#define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d +#define OBD_FAIL_LFSCK_DELAY4 0x160e +#define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f +#define OBD_FAIL_LFSCK_DANGLING 0x1610 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611 +#define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612 +#define OBD_FAIL_LFSCK_BAD_OWNER 0x1613 +#define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614 +#define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615 +#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 +#define OBD_FAIL_LFSCK_NOPFID 0x1617 +#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 +#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 +#define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a +#define OBD_FAIL_LFSCK_DELAY5 0x161b +#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c +#define OBD_FAIL_LFSCK_NO_LINKEA 0x161d +#define OBD_FAIL_LFSCK_BAD_PARENT 0x161e +#define OBD_FAIL_LFSCK_DANGLING2 0x1620 +#define OBD_FAIL_LFSCK_DANGLING3 0x1621 +#define OBD_FAIL_LFSCK_MUL_REF 0x1622 +#define OBD_FAIL_LFSCK_BAD_TYPE 0x1623 +#define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624 +#define OBD_FAIL_LFSCK_LESS_NLINK 0x1626 +#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 +#define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629 +#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a +#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b +#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c +#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d +#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e +#define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f + +#define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 +#define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 + +/* UPDATE */ +#define OBD_FAIL_OUT_UPDATE_NET 0x1700 +#define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701 +#define OBD_FAIL_SPLIT_UPDATE_REC 0x1702 +#define OBD_FAIL_LARGE_STRIPE 0x1703 +#define OBD_FAIL_OUT_ENOSPC 0x1704 +#define OBD_FAIL_INVALIDATE_UPDATE 0x1705 + +/* MIGRATE */ +#define OBD_FAIL_MIGRATE_NET_REP 0x1800 +#define OBD_FAIL_MIGRATE_ENTRIES 0x1801 +#define OBD_FAIL_MIGRATE_LINKEA 0x1802 +#define OBD_FAIL_MIGRATE_DELAY 0x1803 + +/* LMV */ +#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 + +/* DT */ +#define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 +#define OBD_FAIL_DT_ATTR_GET 0x2001 +#define OBD_FAIL_DT_DECLARE_ATTR_SET 0x2002 +#define OBD_FAIL_DT_ATTR_SET 0x2003 +#define OBD_FAIL_DT_DECLARE_XATTR_GET 0x2004 +#define OBD_FAIL_DT_XATTR_GET 0x2005 +#define OBD_FAIL_DT_DECLARE_XATTR_SET 0x2006 +#define OBD_FAIL_DT_XATTR_SET 0x2007 +#define OBD_FAIL_DT_DECLARE_XATTR_DEL 0x2008 +#define OBD_FAIL_DT_XATTR_DEL 0x2009 +#define OBD_FAIL_DT_XATTR_LIST 0x200a +#define OBD_FAIL_DT_DECLARE_CREATE 0x200b +#define OBD_FAIL_DT_CREATE 0x200c +#define OBD_FAIL_DT_DECLARE_DESTROY 0x200d +#define OBD_FAIL_DT_DESTROY 0x200e +#define OBD_FAIL_DT_INDEX_TRY 0x200f +#define OBD_FAIL_DT_DECLARE_REF_ADD 0x2010 +#define OBD_FAIL_DT_REF_ADD 0x2011 +#define OBD_FAIL_DT_DECLARE_REF_DEL 0x2012 +#define OBD_FAIL_DT_REF_DEL 0x2013 +#define OBD_FAIL_DT_DECLARE_INSERT 0x2014 +#define OBD_FAIL_DT_INSERT 0x2015 +#define OBD_FAIL_DT_DECLARE_DELETE 0x2016 +#define OBD_FAIL_DT_DELETE 0x2017 +#define OBD_FAIL_DT_LOOKUP 0x2018 +#define OBD_FAIL_DT_TXN_STOP 0x2019 + +#define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 
+#define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 +#define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 + + /* barrier */ +#define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 +#define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 + +#define OBD_FAIL_BARRIER_DELAY 0x2202 +#define OBD_FAIL_BARRIER_FAILURE 0x2203 + +/* Assign references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +#define LUT_FAIL_CLASS(fail_id) (((fail_id) >> 8) << 16) +#define LUT_FAIL_MGT LUT_FAIL_CLASS(OBD_FAIL_MGS) +#define LUT_FAIL_MDT LUT_FAIL_CLASS(OBD_FAIL_MDS) +#define LUT_FAIL_OST LUT_FAIL_CLASS(OBD_FAIL_OST) + +extern atomic_t libcfs_kmemory; + +#ifdef CONFIG_PROC_FS +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern void obd_update_maxusage(void); +extern __u64 obd_memory_max(void); + +#else /* CONFIG_PROC_FS */ + +extern __u64 obd_alloc; + +extern __u64 obd_max_alloc; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +#define obd_memory_sum() (obd_alloc) + +#define obd_memory_max() (obd_max_alloc) + +#endif /* !CONFIG_PROC_FS */ + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); \ + POISON(ptr, 0x5a, size) + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? 
\ + kmalloc(size, (flags) | __GFP_ZERO) : \ + cfs_cpt_malloc(cptab, cpt, size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST(ptr, size, "kmalloced"); \ +} while (0) + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr)) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr)) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr)) + +/* Direct use of __vmalloc() allows for protection flag specification + * (and particularly to not set __GFP_FS, which is likely to cause some + * deadlock situations in our code). + */ +#define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? \ + __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, \ + PAGE_KERNEL) : \ + cfs_cpt_vzalloc(cptab, cpt, size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR("%llu total bytes allocated by Lustre, %d by LNET\n", \ + obd_memory_sum(), atomic_read(&libcfs_kmemory)); \ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while(0) + +#define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size) +#define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + /* LU-8196 - force large allocations to use vmalloc, not kmalloc */ \ + if ((size) > KMALLOC_MAX_SIZE) \ + ptr = NULL; \ + else \ + OBD_ALLOC_GFP(ptr, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_VMALLOC(ptr, size); \ +} while (0) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS | __GFP_NOWARN); \ + if (ptr == NULL) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (is_vmalloc_addr(ptr)) { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + vfree(ptr); \ + POISON_PTR(ptr); \ + } else { \ + OBD_FREE(ptr, size); \ + } \ +} while (0) + +#define OBD_FREE_RCU(ptr, size, handle) \ +do { \ + struct portals_handle *__h = (handle); \ + \ + LASSERT(handle != NULL); \ + __h->h_cookie = (unsigned long)(ptr); \ + __h->h_size = (size); \ + call_rcu(&__h->h_rcu, class_handle_free_cb); \ + POISON_PTR(ptr); \ +} while(0) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. 
*/ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? \ + kmem_cache_alloc(slab, (type) | __GFP_ZERO) : \ + cfs_mem_cache_cpt_alloc(slab, cptab, cpt, (type) | __GFP_ZERO); \ + if (likely((ptr))) \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ +} while(0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr)) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while(0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr)) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr)) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr)) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +/* LUSTRE_LMA_FL_MASKS defines which flags will be stored in LMA */ + +static inline int lma_to_lustre_flags(__u32 lma_flags) +{ + return (lma_flags & LMAI_ORPHAN) ? LUSTRE_ORPHAN_FL : 0; +} + +static inline int lustre_to_lma_flags(__u32 la_flags) +{ + return (la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0; +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_target.h b/drivers/staging/lustrefsx/lustre/include/obd_target.h new file mode 100644 index 0000000000000..79f29dd374d86 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obd_target.h @@ -0,0 +1,73 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_TARGET_H +#define __OBD_TARGET_H + +/* server-side individual type definitions */ + +#define OBT_MAGIC 0xBDDECEAE +/* hold common fields for "target" device */ +struct obd_device_target { + __u32 obt_magic; + __u32 obt_instance; + struct lu_target *obt_lut; + __u64 obt_mount_count; + struct obd_job_stats obt_jobstats; + struct nm_config_file *obt_nodemap_config_file; +}; + +#define OBJ_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +struct filter_obd { + /* NB this field MUST be first */ + struct obd_device_target fo_obt; +}; + +struct echo_obd { + struct obd_device_target eo_obt; + struct obdo eo_oa; + spinlock_t eo_lock; + u64 eo_lastino; + struct lustre_handle eo_nl_lock; + atomic_t eo_prep; +}; + +struct ost_obd { + struct ptlrpc_service *ost_service; + struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; + struct ptlrpc_service *ost_seq_service; + struct ptlrpc_service *ost_out_service; + struct mutex ost_health_mutex; +}; + +#endif /* __OBD_TARGET_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h new file mode 100644 index 0000000000000..c381f77f0045e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -0,0 +1,115 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. 
+ * + * Selection of object_update and object_update_param handling functions + */ + +#ifndef _OBJ_UPDATE_H_ +#define _OBJ_UPDATE_H_ + +#include + +static inline size_t +object_update_param_size(const struct object_update_param *param) +{ + return cfs_size_round(sizeof(*param) + param->oup_len); +} + +static inline size_t +object_update_params_size(const struct object_update *update) +{ + const struct object_update_param *param; + size_t total_size = 0; + unsigned int i; + + param = &update->ou_params[0]; + for (i = 0; i < update->ou_params_count; i++) { + size_t size = object_update_param_size(param); + + param = (struct object_update_param *)((char *)param + size); + total_size += size; + } + + return total_size; +} + +static inline size_t +object_update_size(const struct object_update *update) +{ + return offsetof(struct object_update, ou_params[0]) + + object_update_params_size(update); +} + +static inline struct object_update * +object_update_request_get(const struct object_update_request *our, + unsigned int index, size_t *size) +{ + void *ptr; + unsigned int i; + + if (index >= our->ourq_count) + return NULL; + + ptr = (void *)&our->ourq_updates[0]; + for (i = 0; i < index; i++) + ptr += object_update_size(ptr); + + if (size != NULL) + *size = object_update_size(ptr); + + return ptr; +} + + + +static inline struct object_update_result * +object_update_result_get(const struct object_update_reply *reply, + unsigned int index, size_t *size) +{ + __u16 count = reply->ourp_count; + unsigned int i; + void *ptr; + + if (index >= count) + return NULL; + + ptr = (char *)reply + + cfs_size_round(offsetof(struct object_update_reply, + ourp_lens[count])); + for (i = 0; i < index; i++) { + if (reply->ourp_lens[i] == 0) + return NULL; + + ptr += cfs_size_round(reply->ourp_lens[i]); + } + + if (size != NULL) + *size = reply->ourp_lens[index]; + + return ptr; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h new file mode 100644 index 0000000000000..616ee3a78e68b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h @@ -0,0 +1,192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. 
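Because each object_update record is variable length, callers index into a request through the accessors above rather than doing their own pointer arithmetic. A sketch of walking a request (dump_update_request() is a hypothetical name and the CDEBUG line is illustrative only):

static void dump_update_request(const struct object_update_request *our)
{
        unsigned int i;

        for (i = 0; i < our->ourq_count; i++) {
                size_t size = 0;
                struct object_update *update;

                update = object_update_request_get(our, i, &size);
                if (update == NULL)
                        break;          /* index out of range */

                /* size covers the update header plus all packed parameters */
                CDEBUG(D_INFO, "update %u: %u params, %zu bytes\n",
                       i, update->ou_params_count, size);
        }
}

Note that object_update_request_get() rescans the buffer from the start on every call, so code on a hot path would normally keep the running pointer itself; the loop above favors clarity over speed.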
+ * + * Define lu_seq_range associated functions + */ + +#ifndef _SEQ_RANGE_H_ +#define _SEQ_RANGE_H_ + +#include + +/** + * computes the sequence range type \a range + */ + +static inline unsigned fld_range_type(const struct lu_seq_range *range) +{ + return range->lsr_flags & LU_SEQ_RANGE_MASK; +} + +/** + * Is this sequence range an OST? \a range + */ + +static inline bool fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +/** + * Is this sequence range an MDT? \a range + */ + +static inline bool fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * ANY range is only used when the fld client sends a fld query request, + * but it does not know whether the seq is an MDT or OST, so it will send the + * request with ANY type, which means any seq type from the lookup can be + * expected. /a range + */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +/** + * Apply flags to range \a range \a flags + */ + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + range->lsr_flags |= flags; +} + +/** + * Add MDT to range type \a range + */ + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +/** + * Add OST to range type \a range + */ + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +/** + * Add ANY to range type \a range + */ + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * computes width of given sequence range \a range + */ + +static inline __u64 lu_seq_range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero \a range + */ + +static inline void lu_seq_range_init(struct lu_seq_range *range) +{ + memset(range, 0, sizeof(*range)); +} + +/** + * check if given seq id \a s is within given range \a range + */ + +static inline bool lu_seq_range_within(const struct lu_seq_range *range, + __u64 seq) +{ + return seq >= range->lsr_start && seq < range->lsr_end; +} + +/** + * Is the range sane? Is the end after the beginning? \a range + */ + +static inline bool lu_seq_range_is_sane(const struct lu_seq_range *range) +{ + return range->lsr_end >= range->lsr_start; +} + +/** + * Is the range 0? \a range + */ + +static inline bool lu_seq_range_is_zero(const struct lu_seq_range *range) +{ + return range->lsr_start == 0 && range->lsr_end == 0; +} + +/** + * Is the range out of space? \a range + */ + +static inline bool lu_seq_range_is_exhausted(const struct lu_seq_range *range) +{ + return lu_seq_range_space(range) == 0; +} + +/** + * return 0 if two ranges have the same location, nonzero if they are + * different \a r1 \a r2 + */ + +static inline int lu_seq_range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +/** + * printf string and argument list for sequence range + */ +#define DRANGE "[%#16.16llx-%#16.16llx]:%x:%s" + +#define PRANGE(range) \ + (unsigned long long)(range)->lsr_start, \ + (unsigned long long)(range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? 
"mdt" : "ost" + +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h new file mode 100644 index 0000000000000..b1f68d50b0242 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h @@ -0,0 +1,294 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _UAPI_LUSTRE_CFG_H +#define _UAPI_LUSTRE_CFG_H + +#include +#include + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#endif + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + __ALIGN_KERNEL(offsetof(struct lustre_cfg, lcfg_buflens[(count)]), 8) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. 
+ */ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup + */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from + * a niduuid + */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile + * (mdc, osc) + */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to + * an obd + */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next + * cfg rec + */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a + * cfg log + */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, + * inactive + */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup + */ + LCFG_SET_PARAM = 0x00ce032, /**< use set_param syntax to set + * a proc parameters + */ + LCFG_NODEMAP_ADD = 0x00ce040, /**< create a cluster */ + LCFG_NODEMAP_DEL = 0x00ce041, /**< destroy a cluster */ + LCFG_NODEMAP_ADD_RANGE = 0x00ce042, /**< add a nid range */ + LCFG_NODEMAP_DEL_RANGE = 0x00ce043, /**< delete an nid range */ + LCFG_NODEMAP_ADD_UIDMAP = 0x00ce044, /**< add a uidmap */ + LCFG_NODEMAP_DEL_UIDMAP = 0x00ce045, /**< delete a uidmap */ + LCFG_NODEMAP_ADD_GIDMAP = 0x00ce046, /**< add a gidmap */ + LCFG_NODEMAP_DEL_GIDMAP = 0x00ce047, /**< delete a gidmap */ + LCFG_NODEMAP_ACTIVATE = 0x00ce048, /**< activate cluster + * id mapping + */ + LCFG_NODEMAP_ADMIN = 0x00ce049, /**< allow cluster to use id 0 */ + LCFG_NODEMAP_TRUSTED = 0x00ce050, /**< trust a clusters ids */ + LCFG_NODEMAP_SQUASH_UID = 0x00ce051, /**< default map uid */ + LCFG_NODEMAP_SQUASH_GID = 0x00ce052, /**< default map gid */ + LCFG_NODEMAP_ADD_SHKEY = 0x00ce053, /**< add shared key to cluster */ + LCFG_NODEMAP_DEL_SHKEY = 0x00ce054, /**< delete shared key from + * cluster + */ + LCFG_NODEMAP_TEST_NID = 0x00ce055, /**< test for nodemap + * membership + */ + LCFG_NODEMAP_TEST_ID = 0x00ce056, /**< test uid/gid mapping */ + LCFG_NODEMAP_SET_FILESET = 0x00ce057, /**< set fileset */ + LCFG_NODEMAP_DENY_UNKNOWN = 0x00ce058, /**< deny squashed nodemap + * users + */ + LCFG_NODEMAP_MAP_MODE = 0x00ce059, /**< set the mapping mode */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used 
any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) ? 0 : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, void *buf, __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + + if (!bufs) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, + char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) +{ + __u32 i; + size_t offset; + __u32 bufcount; + + if (!lcfg) + return NULL; + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += __ALIGN_KERNEL(lcfg->lcfg_buflens[i], 8); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + __u32 i; + + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline __u32 lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + __u32 i; + __u32 len; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += __ALIGN_KERNEL(buflens[i], 8); + + return __ALIGN_KERNEL(len, 8); +} + +static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, + struct lustre_cfg_bufs *bufs) +{ + char *ptr; + __u32 i; + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + if (bufs->lcfg_buf[i]) { + memcpy(ptr, bufs->lcfg_buf[i], bufs->lcfg_buflen[i]); + ptr += __ALIGN_KERNEL(bufs->lcfg_buflen[i], 8); + } + } +} + +static inline int lustre_cfg_sanity_check(void *buf, size_t len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + + if (!lcfg) + return -EINVAL; + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + return -EINVAL; + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + return -EINVAL; + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + return -EINVAL; + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + return -EINVAL; + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + return -EINVAL; + + return 0; +} + +/** @} cfg */ + +#endif /* _UAPI_LUSTRE_CFG_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h new file mode 100644 index 0000000000000..8887c82d3b8b9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h @@ -0,0 +1,229 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE 
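Taken together, the helpers above give the usual build sequence for a configuration record: reset the buffer table with the target name in slot 0, add the remaining strings, size the record with lustre_cfg_len(), then copy everything in with lustre_cfg_init(). A sketch under those assumptions (build_param_cfg(), the device name and the parameter string are made up for illustration):

static struct lustre_cfg *build_param_cfg(void)
{
        char device[] = "testfs-MDT0000";
        char param[]  = "mdt.identity_upcall=NONE";
        struct lustre_cfg_bufs bufs;
        struct lustre_cfg *lcfg;

        lustre_cfg_bufs_reset(&bufs, device);           /* buffer 0: target name */
        lustre_cfg_bufs_set_string(&bufs, 1, param);    /* buffer 1: the parameter */

        OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen));
        if (lcfg == NULL)
                return NULL;

        lustre_cfg_init(lcfg, LCFG_PARAM, &bufs);       /* copies the buffers in */
        return lcfg;
}

The caller frees the record with OBD_FREE() using the same lustre_cfg_len() value; on the receiving side, lustre_cfg_sanity_check() is the matching validation step before any of the lcfg_buflens are trusted.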
COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * uapi/linux/lustre_disk.h + * + * Lustre disk format definitions. + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_DISK_H +#define _UAPI_LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ +#include + +/****************** on-disk files ********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define REPLY_DATA "reply_data" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" +#define HSM_ACTIONS "hsm_actions" +#define LFSCK_DIR "LFSCK" +#define LFSCK_BOOKMARK "lfsck_bookmark" +#define LFSCK_LAYOUT "lfsck_layout" +#define LFSCK_NAMESPACE "lfsck_namespace" + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +#define LDD_F_UPGRADE14 0x0200 +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to register the target. */ +#define LDD_F_ERROR 0x4000 +/** process at lctl conf_param */ +#define LDD_F_PARAM2 0x8000 + +#define LDD_MAGIC 0x1dd00001 + +#define XATTR_TARGET_RENAME "trusted.rename_tgt" + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +/* On-disk configuration file. In host-endian order. 
*/ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat; /* read-only compatible feature flags */ + __u32 ldd_feature_incompat; /* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + * svname + */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + * MTI_NAME_MAXLEN + */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + + char ldd_userdata[1024 - 200]; /* arbitrary user string '200' */ + __u8 ldd_padding[4096 - 1024]; /* 1024 */ + char ldd_mount_opts[4096]; /* target fs mount opts '4096' */ + char ldd_params[4096]; /* key=value pairs '8192' */ +}; + +/****************** last_rcvd file *********************/ + +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * Data stored per server at the head of the last_rcvd file. In le32 order. + */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for + * open &c.) 
+ */ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /* generation counter of client slot in last_rcvd */ + __u32 lcd_generation; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* Data stored in each slot of the reply_data file. + * + * The lrd_client_gen field is assigned with lcd_generation value + * to allow identify which client the reply data belongs to. + */ +struct lsd_reply_data { + __u64 lrd_transno; /* transaction number */ + __u64 lrd_xid; /* transmission id */ + __u64 lrd_data; /* per-operation data */ + __u32 lrd_result; /* request result */ + __u32 lrd_client_gen; /* client generation */ +}; + +/* Header of the reply_data file */ +#define LRH_MAGIC 0xbdabda01 +struct lsd_reply_header { + __u32 lrh_magic; + __u32 lrh_header_size; + __u32 lrh_reply_size; + __u8 lrh_pad[sizeof(struct lsd_reply_data) - 12]; +}; + +/** @} disk */ + +#endif /* _UAPI_LUSTRE_DISK_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h new file mode 100644 index 0000000000000..3e58dd5329c3f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2016 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * all fid manipulation functions go here + * + * FIDS are globally unique within a Lustre filessytem, and are made up + * of three parts: sequence, Object ID, and version. 
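Since these records are fixed size, file offsets inside last_rcvd reduce to simple arithmetic. The sketch below assumes the conventional layout implied by LR_CLIENT_START/LR_CLIENT_SIZE (one slot per client after the server header area); last_rcvd_client_offset() is a hypothetical helper, not part of the patch:

static inline loff_t last_rcvd_client_offset(__u32 client_idx)
{
        /* the paddings above are sized so the structs match the constants */
        BUILD_BUG_ON(sizeof(struct lr_server_data) != LR_SERVER_SIZE);
        BUILD_BUG_ON(sizeof(struct lsd_client_data) != LR_CLIENT_SIZE);

        return (loff_t)LR_CLIENT_START + (loff_t)client_idx * LR_CLIENT_SIZE;
}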
+ * + */ +#ifndef _UAPI_LUSTRE_FID_H_ +#define _UAPI_LUSTRE_FID_H_ + +#include + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline __u64 fid_ver_oid(const struct lu_fid *fid) +{ + return (__u64)fid_ver(fid) << 32 | fid_oid(fid); +} + +static inline bool fid_seq_is_mdt0(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0; +} + +static inline bool fid_seq_is_mdt(__u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline bool fid_seq_is_echo(__u64 seq) +{ + return seq == FID_SEQ_ECHO; +} + +static inline bool fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline bool fid_seq_is_llog(__u64 seq) +{ + return seq == FID_SEQ_LLOG; +} + +static inline bool fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 0 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; +} + +static inline bool fid_seq_is_rsvd(__u64 seq) +{ + return seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD; +}; + +static inline bool fid_seq_is_special(__u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline bool fid_seq_is_local_file(__u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline bool fid_seq_is_root(__u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline bool fid_seq_is_dot(__u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline bool fid_seq_is_default(__u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline bool fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ROOT; + fid->f_ver = 0; +} + +static inline void lu_echo_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = FID_OID_ECHO_ROOT; + fid->f_ver = 0; +} + +static inline void lu_update_log_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG; + fid->f_oid = index; + fid->f_ver = 0; +} + +static inline void lu_update_log_dir_fid(struct lu_fid *fid, __u32 index) +{ + fid->f_seq = FID_SEQ_UPDATE_LOG_DIR; + fid->f_oid = index; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is an igif; otherwise false. + */ +static inline bool fid_seq_is_igif(__u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline bool fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is an idif; otherwise false. 
+ */ +static inline bool fid_seq_is_idif(__u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline bool fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline bool fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline bool fid_seq_is_norm(__u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline bool fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +static inline int fid_is_layout_rbtree(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LAYOUT_RBTREE; +} + +static inline bool fid_seq_is_update_log(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG; +} + +static inline bool fid_is_update_log(const struct lu_fid *fid) +{ + return fid_seq_is_update_log(fid_seq(fid)); +} + +static inline bool fid_seq_is_update_log_dir(__u64 seq) +{ + return seq == FID_SEQ_UPDATE_LOG_DIR; +} + +static inline bool fid_is_update_log_dir(const struct lu_fid *fid) +{ + return fid_seq_is_update_log_dir(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +static inline __u32 idif_ost_idx(__u64 seq) +{ + return (seq >> 16) & 0xffff; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + return idif_ost_idx(fid_seq(fid)); +} + +/* Check whether the fid is for LAST_ID */ +static inline bool fid_is_last_id(const struct lu_fid *fid) +{ + if (fid_oid(fid) != 0) + return false; + + if (fid_is_idif(fid) && ((fid_seq(fid) & 0xFFFF) != 0)) + return false; + + if (fid_seq(fid) == FID_SEQ_UPDATE_LOG || + fid_seq(fid) == FID_SEQ_UPDATE_LOG_DIR || + fid_seq_is_igif(fid_seq(fid))) + return false; + + return true; +} + +/** + * Get inode number from an igif. + * \param fid an igif to get inode number from. + * \return inode number for the igif. + */ +static inline ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +/** + * Get inode generation from an igif. + * \param fid an igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. 
+ */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_le64(fid_seq(src)); + dst->f_oid = __cpu_to_le32(fid_oid(src)); + dst->f_ver = __cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __le64_to_cpu(fid_seq(src)); + dst->f_oid = __le32_to_cpu(fid_oid(src)); + dst->f_ver = __le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __cpu_to_be64(fid_seq(src)); + dst->f_oid = __cpu_to_be32(fid_oid(src)); + dst->f_ver = __cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + dst->f_seq = __be64_to_cpu(fid_seq(src)); + dst->f_oid = __be32_to_cpu(fid_oid(src)); + dst->f_ver = __be32_to_cpu(fid_ver(src)); +} + +static inline bool fid_is_sane(const struct lu_fid *fid) +{ + return fid && ((fid_seq(fid) >= FID_SEQ_START && !fid_ver(fid)) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline bool lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + return !memcmp(f0, f1, sizeof(*f0)); +} + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + if (fid_seq(f0) != fid_seq(f1)) + return fid_seq(f0) > fid_seq(f1) ? 1 : -1; + + if (fid_oid(f0) != fid_oid(f1)) + return fid_oid(f0) > fid_oid(f1) ? 1 : -1; + + if (fid_ver(f0) != fid_ver(f1)) + return fid_ver(f0) > fid_ver(f1) ? 1 : -1; + + return 0; +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h new file mode 100644 index 0000000000000..cb4ec46373759 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2016, Intel Corporation. + */ +#ifndef _UAPI_LUSTRE_IOCTL_H +#define _UAPI_LUSTRE_IOCTL_H + +#include +#include +#include +#include + +#ifndef __KERNEL__ +# define __user +#endif + +#if !defined(__KERNEL__) && !defined(LUSTRE_UTILS) +# error This file is for Lustre internal use only. 
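As a quick illustration of the helpers above, the sketch below builds an IGIF for a legacy inode, checks it with the predicates, and converts it for storage; per the comment above, the on-disk byte order is big-endian. igif_example() and the inode number/generation are made up:

static void igif_example(void)
{
        struct lu_fid fid, disk_fid;

        lu_igif_build(&fid, 12345, 2);          /* inode number, generation */

        LASSERT(fid_is_igif(&fid) && fid_is_sane(&fid));
        LASSERT(lu_igif_ino(&fid) == 12345 && lu_igif_gen(&fid) == 2);

        fid_cpu_to_be(&disk_fid, &fid);         /* big-endian copy for disk */
        /* ... write disk_fid, or order in-memory FIDs with lu_fid_cmp() ... */
}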
+#endif + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#endif + +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME + +#define OBD_IOCTL_VERSION 0x00010004 +#define OBD_DEV_BY_DEVNAME 0xffffd0de +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + __u64 ioc_count; + __u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char __user *ioc_pbuf1; + __u32 ioc_plen2; + char __user *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + __u32 len = __ALIGN_KERNEL(sizeof(*data), 8); + + len += __ALIGN_KERNEL(data->ioc_inllen1, 8); + len += __ALIGN_KERNEL(data->ioc_inllen2, 8); + len += __ALIGN_KERNEL(data->ioc_inllen3, 8); + len += __ALIGN_KERNEL(data->ioc_inllen4, 8); + + return len; +} + +/* + * OBD_IOC_DATA_TYPE is only for compatibility reasons with older + * Linux Lustre user tools. New ioctls should NOT use this macro as + * the ioctl "size". Instead the ioctl should get a "size" argument + * which is the actual data type used by the ioctl, to ensure the + * ioctl interface is versioned correctly. 
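The ioc_len field is expected to carry the value computed by obd_ioctl_packlen(), i.e. the 8-byte-aligned struct plus the aligned inline buffer lengths. A sketch of the length bookkeeping only (pack_ioctl_example() is hypothetical; a real pack step also copies the inline buffers into ioc_bulk):

static int pack_ioctl_example(struct obd_ioctl_data *data, char *name)
{
        memset(data, 0, sizeof(*data));
        data->ioc_version = OBD_IOCTL_VERSION;
        data->ioc_inllen1 = strlen(name) + 1;   /* include the NUL */
        data->ioc_inlbuf1 = name;
        data->ioc_len = obd_ioctl_packlen(data);

        /* the total, after alignment, must fit the ioctl buffer cap */
        return data->ioc_len <= OBD_MAX_IOCTL_BUFFER ? 0 : -EINVAL;
}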
+ */ +#define OBD_IOC_DATA_TYPE long + +/* IOC_LDLM_TEST _IOWR('f', 40, long) */ +/* IOC_LDLM_DUMP _IOWR('f', 41, long) */ +/* IOC_LDLM_REGRESS_START _IOWR('f', 42, long) */ +/* IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) */ + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) +/* OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) +/* OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) */ + +/* OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW('f', 136, OBD_IOC_DATA_TYPE) + +/* OBD_IOC_DEC_FS_USE_COUNT _IO('f', 139) */ +#define OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) +/* OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) */ +#define OBD_GET_VERSION _IOWR('f', 144, OBD_IOC_DATA_TYPE) +/* OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLOSE_UUID _IOWR('f', 147, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_GETDEVICE _IOWR('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR('f', 150, OBD_IOC_DATA_TYPE) +/* lustre/lustre_user.h 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE 154 LL_IOC_LOV_SETSTRIPE */ +/* OBD_IOC_LOV_GETSTRIPE 155 LL_IOC_LOV_GETSTRIPE */ +/* OBD_IOC_LOV_SETEA 156 LL_IOC_LOV_SETEA */ +/* lustre/lustre_user.h 157-159 */ +/* OBD_IOC_QUOTACHECK _IOW('f', 160, int) */ +/* OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* lustre/lustre_user.h 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) +/* OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +/* OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PARAM _IOW('f', 187, 
OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) + +/* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_LCFG_FORK _IOWR('f', 208, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* lustre/lustre_user.h 212-217 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +/* lustre/lustre_user.h 240-249 */ +/* LIBCFS_IOC_DEBUG_MASK 250 */ + +#define OBD_IOC_BARRIER _IOWR('f', 261, OBD_IOC_DATA_TYPE) + +#define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) + +#endif /* _UAPI_LUSTRE_IOCTL_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h new file mode 100644 index 0000000000000..c0e662ae7b84f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h @@ -0,0 +1,243 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2015 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Define ost_id associated functions + */ + +#ifndef _UAPI_LUSTRE_OSTID_H_ +#define _UAPI_LUSTRE_OSTID_H_ + +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. 
Upstream will just have linux/lustre_fid.h + */ +#ifdef __KERNEL__ +#include +#else +#include +#endif + +static inline __u64 lmm_oi_id(const struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(const struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline void lmm_oi_set_id(struct ost_id *oi, __u64 oid) +{ + oi->oi.oi_id = oid; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + const struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + const struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline __u64 ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return ostid->oi.oi_id; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* + * Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) + */ + if (!oi->oi_fid.f_oid && !oi->oi_fid.f_ver) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +static inline void ostid_cpu_to_le(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { + dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(const struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { + dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. 
so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. + */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +/** + * Unpack an OST object id/seq (group) into a FID. This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. + */ +static inline int ostid_to_fid(struct lu_fid *fid, const struct ost_id *ostid, + __u32 ost_idx) +{ + __u64 seq = ostid_seq(ostid); + + if (ost_idx > 0xffff) + return -EBADF; + + if (fid_seq_is_mdt0(seq)) { + __u64 oid = ostid_id(ostid); + + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. + */ + if (oid >= IDIF_MAX_OID) + return -EBADF; + + fid->f_seq = fid_idif_seq(oid, ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = oid; + /* in theory, not currently used */ + fid->f_ver = oid >> 48; + } else if (!fid_seq_is_default(seq)) { + /* This is either an IDIF object, which identifies objects + * across all OSTs, or a regular FID. The IDIF namespace + * maps legacy OST objects into the FID namespace. In both + * cases, we just pass the FID through, no conversion needed. + */ + if (ostid->oi_fid.f_ver) + return -EBADF; + + *fid = ostid->oi_fid; + } + + return 0; +} +#endif /* _UAPI_LUSTRE_OSTID_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h new file mode 100644 index 0000000000000..022d253bbc353 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. 
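A short usage sketch of ostid_to_fid() for the legacy case discussed above; the object id and OST index are made up, and ostid_example() is a hypothetical name:

static int ostid_example(struct lu_fid *fid)
{
        struct ost_id oi;

        memset(&oi, 0, sizeof(oi));
        ostid_set_seq_mdt0(&oi);        /* legacy id/seq ("group 0") namespace */
        oi.oi.oi_id = 0x12345;          /* object id, to live on OST index 3 */

        /* packs into IDIF: f_seq = FID_SEQ_IDIF | ost_idx | high id bits */
        return ostid_to_fid(fid, &oi, 3);
}

An OST index above 0xffff or an object id at or above IDIF_MAX_OID makes the conversion fail with -EBADF, which is why callers check the return value.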
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _UAPI_LUSTRE_PARAM_H +#define _UAPI_LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/****************** User-settable parameter keys *********************/ +/* e.g. + * tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + * lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + * ... testfs-MDT0000.lov.stripesize=4M + * ... testfs-OST0000.ost.client_cache_seconds=15 + * ... testfs.sys.timeout= + * ... testfs.llite.max_read_ahead_mb=16 + */ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSD "osd." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_HSM "mdt.hsm." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _UAPI_LUSTRE_PARAM_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h new file mode 100644 index 0000000000000..e8119f5278c23 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h @@ -0,0 +1,89 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + * + * LGPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + */ + +#ifndef __UAPI_KERNELCOMM_H__ +#define __UAPI_KERNELCOMM_H__ + +#include + +/* KUC message header. + * All current and future KUC messages should use this header. + * To avoid having to include Lustre headers from libcfs, define this here. + */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __attribute__((aligned(sizeof(__u64)))); + + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +#define LK_FLG_STOP 0x01 +#define LK_NOFD -1U + +/* kernelcomm control structure, passed from userspace to kernel */ +struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data; + __u32 lk_flags; +} __attribute__((packed)); + +#endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h new file mode 100644 index 0000000000000..accc4495d156e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -0,0 +1,154 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _UPCALL_CACHE_H +#define _UPCALL_CACHE_H + +#include +#include + +/** \defgroup ucache ucache + * + * @{ + */ + +#define UC_CACHE_NEW 0x01 +#define UC_CACHE_ACQUIRING 0x02 +#define UC_CACHE_INVALID 0x04 +#define UC_CACHE_EXPIRED 0x08 + +#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW) +#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID) +#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING) +#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED) +#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0) + +#define UC_CACHE_SET_NEW(i) ((i)->ue_flags |= UC_CACHE_NEW) +#define UC_CACHE_SET_INVALID(i) ((i)->ue_flags |= UC_CACHE_INVALID) +#define UC_CACHE_SET_ACQUIRING(i) ((i)->ue_flags |= UC_CACHE_ACQUIRING) +#define UC_CACHE_SET_EXPIRED(i) ((i)->ue_flags |= UC_CACHE_EXPIRED) +#define UC_CACHE_SET_VALID(i) ((i)->ue_flags = 0) + +#define UC_CACHE_CLEAR_NEW(i) ((i)->ue_flags &= ~UC_CACHE_NEW) +#define UC_CACHE_CLEAR_ACQUIRING(i) ((i)->ue_flags &= ~UC_CACHE_ACQUIRING) +#define UC_CACHE_CLEAR_INVALID(i) ((i)->ue_flags &= ~UC_CACHE_INVALID) +#define UC_CACHE_CLEAR_EXPIRED(i) ((i)->ue_flags &= ~UC_CACHE_EXPIRED) + +struct upcall_cache_entry; + +struct md_perm { + lnet_nid_t mp_nid; + uint32_t mp_perm; +}; + +struct md_identity { + struct upcall_cache_entry *mi_uc_entry; + uid_t mi_uid; + gid_t mi_gid; + struct group_info *mi_ginfo; + int mi_nperms; + struct md_perm *mi_perms; +}; + +struct upcall_cache_entry { + struct list_head ue_hash; + uint64_t ue_key; + atomic_t ue_refcount; + int ue_flags; + wait_queue_head_t ue_waitq; + cfs_time_t ue_acquire_expire; + cfs_time_t ue_expire; + union { + struct md_identity identity; + } u; +}; + +#define UC_CACHE_HASH_SIZE (128) +#define UC_CACHE_HASH_INDEX(id) ((id) & (UC_CACHE_HASH_SIZE - 1)) +#define UC_CACHE_UPCALL_MAXPATH (1024UL) + +struct upcall_cache; + +struct upcall_cache_ops { + void (*init_entry)(struct upcall_cache_entry *, void *args); + void (*free_entry)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*upcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*downcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*do_upcall)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*parse_downcall)(struct upcall_cache *, + struct upcall_cache_entry *, void *); +}; + +struct upcall_cache { + struct list_head uc_hashtable[UC_CACHE_HASH_SIZE]; + spinlock_t uc_lock; + struct rw_semaphore uc_upcall_rwsem; + + char uc_name[40]; /* for upcall */ + char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; + int uc_acquire_expire; /* seconds */ + int uc_entry_expire; /* seconds */ + struct upcall_cache_ops *uc_ops; +}; + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args); +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args); +void upcall_cache_flush(struct upcall_cache *cache, int force); + +static inline void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 0); +} + +static inline void upcall_cache_flush_all(struct upcall_cache *cache) +{ + upcall_cache_flush(cache, 1); +} + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args); +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops); +void 
upcall_cache_cleanup(struct upcall_cache *cache); + +/** @} ucache */ + +#endif /* _UPCALL_CACHE_H */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c new file mode 100644 index 0000000000000..7dd0c65332649 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c @@ -0,0 +1,765 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif +#include + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +int node_equal(struct interval_node *n1, struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +static inline __u64 max_u64(__u64 x, __u64 y) +{ + return x > y ? x : y; +} + +static inline __u64 min_u64(__u64 x, __u64 y) +{ + return x < y ? 
x : y; +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_left) + node = node->in_left; + RETURN(node); +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_right) + node = node->in_right; + RETURN(node); +} + +static struct interval_node *interval_next(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + if (node->in_right) + RETURN(interval_first(node->in_right)); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + RETURN(node->in_parent); +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + + if (node->in_left) + RETURN(interval_last(node->in_left)); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + RETURN(node->in_parent); +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL*/ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + ENTRY; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + RETURN(walk); +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max,right_max)); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. 
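 *
 * Pictorially (A, B and C stand for arbitrary subtrees):
 *
 *          node                      right
 *         /    \                    /     \
 *        A     right     ==>     node      C
 *              /   \            /    \
 *             B     C          A      B
 *
 * Only node and right can change their in_max_high here, which is why
 * __rotate_change_maxhigh() recomputes the augmented value for exactly
 * that pair.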
*/ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. 
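 *
 * Concretely, the red-black invariants re-established below are: the root
 * is black, a red node never has a red child, and every root-to-leaf path
 * carries the same number of black nodes. In addition the interval-tree
 * augmentation must keep holding: in_max_high of a node is the largest
 * high value anywhere in its subtree, which the rotation helpers above
 * maintain.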
+ */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + ENTRY; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; + EXIT; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) + +{ + struct interval_node **p, *parent = NULL; + ENTRY; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + RETURN(parent); + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + RETURN(NULL); +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + ENTRY; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + if ((o_left = tmp->in_left)) + o_left->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + 
tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + if ((o_right = tmp->in_right)) + o_right->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; + EXIT; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + ENTRY; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } + EXIT; +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + ENTRY; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? : parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); + EXIT; +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. 
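 *
 * The iterative walk below is equivalent to the following recursive
 * sketch, kept here only to make the traversal order easier to follow: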
+ * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + ENTRY; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->endin_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand to low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. + * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max_u64(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max_u64(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easy to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only concern the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node. 
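	 *
	 * A worked example (values are illustrative only): with granted
	 * extents [0, 99] and [200, 299] in the tree and no limiter, a
	 * request of ext = [120, 130] keeps its start (downward expansion
	 * is stubbed out in interval_expand_low()) and grows its end to
	 * 199, i.e. up to the next granted interval.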
*/ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c new file mode 100644 index 0000000000000..a4f7c85a42efb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/l_lock.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include + +#include +#include + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on lock->l_resource = new_res + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + if (!ldlm_is_ns_srv(lock)) + spin_lock(&lock->l_lock); + + lock_res(lock->l_resource); + + ldlm_set_res_locked(lock); + return lock->l_resource; +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + ldlm_clear_res_locked(lock); + + unlock_res(lock->l_resource); + if (!ldlm_is_ns_srv(lock)) + spin_unlock(&lock->l_lock); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c new file mode 100644 index 0000000000000..1088d583145e7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,1108 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +# define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1) + +/** + * Fix up the ldlm_extent after expanding it. + * + * After expansion has been done, we might still want to do certain adjusting + * based on overall contention of the resource and the like to avoid granting + * overly wide locks. + */ +static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req, + struct ldlm_extent *new_ex, + int conflicting) +{ + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + __u64 req_align, mask; + + if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { + if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) + new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, + new_ex->end); + } + + if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) { + EXIT; + return; + } + + /* we need to ensure that the lock extent is properly aligned to what + * the client requested. Also we need to make sure it's also server + * page size aligned otherwise a server page can be covered by two + * write locks. */ + mask = PAGE_SIZE; + req_align = (req_end + 1) | req_start; + if (req_align != 0 && (req_align & (mask - 1)) == 0) { + while ((req_align & mask) == 0) + mask <<= 1; + } + mask -= 1; + /* We can only shrink the lock, not grow it. + * This should never cause lock to be smaller than requested, + * since requested lock was already aligned on these boundaries. */ + new_ex->start = ((new_ex->start - 1) | mask) + 1; + new_ex->end = ((new_ex->end + 1) & ~mask) - 1; + LASSERTF(new_ex->start <= req_start, + "mask %#llx grant start %llu req start %llu\n", + mask, new_ex->start, req_start); + LASSERTF(new_ex->end >= req_end, + "mask %#llx grant end %llu req end %llu\n", + mask, new_ex->end, req_end); +} + +/** + * Return the maximum extent that: + * - contains the requested extent + * - does not overlap existing conflicting extents outside the requested one + * + * This allows clients to request a small required extent range, but if there + * is no contention on the lock the full lock can be granted to the client. + * This avoids the need for many smaller lock requests to be granted in the + * common (uncontended) case. 
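 *
 * For example (illustrative numbers): a client writing the first 4 KiB of
 * an otherwise uncontended object asks for [0, 4095], but the server can
 * grant [0, OBD_OBJECT_EOF] instead, so later writes by that client need
 * no additional lock requests.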
+ * + * Use interval tree to expand the lock extent for granted lock. + */ +static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_interval_tree *tree; + struct interval_node_extent limiter = { + .start = new_ex->start, + .end = new_ex->end, + }; + int conflicting = 0; + int idx; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree to handle the LDLM extent granted locks. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct interval_node_extent ext = { + .start = req_start, + .end = req_end, + }; + + tree = &res->lr_itree[idx]; + if (lockmode_compat(tree->lit_mode, req_mode)) + continue; + + conflicting += tree->lit_size; + if (conflicting > 4) + limiter.start = req_start; + + if (interval_is_overlapped(tree->lit_root, &ext)) + CDEBUG(D_INFO, + "req_mode = %d, tree->lit_mode = %d, " + "tree->lit_size = %d\n", + req_mode, tree->lit_mode, tree->lit_size); + interval_expand(tree->lit_root, &ext, &limiter); + limiter.start = max(limiter.start, ext.start); + limiter.end = min(limiter.end, ext.end); + if (limiter.start == req_start && limiter.end == req_end) + break; + } + + new_ex->start = limiter.start; + new_ex->end = limiter.end; + LASSERT(new_ex->start <= req_start); + LASSERT(new_ex->end >= req_end); + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + +/* The purpose of this function is to return: + * - the maximum extent + * - containing the requested extent + * - and not overlapping existing conflicting extents outside the requested one + */ +static void +ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int conflicting = 0; + ENTRY; + + lockmode_verify(req_mode); + + /* for waiting locks */ + list_for_each_entry(lock, &res->lr_waiting, l_res_link) { + struct ldlm_extent *l_extent = &lock->l_policy_data.l_extent; + + /* We already hit the minimum requested size, search no more */ + if (new_ex->start == req_start && new_ex->end == req_end) { + EXIT; + return; + } + + /* Don't conflict with ourselves */ + if (req == lock) + continue; + + /* Locks are compatible, overlap doesn't matter */ + /* Until bug 20 is fixed, try to avoid granting overlapping + * locks on one client (they take a long time to cancel) */ + if (lockmode_compat(lock->l_req_mode, req_mode) && + lock->l_export != req->l_export) + continue; + + /* If this is a high-traffic lock, don't grow downwards at all + * or grow upwards too much */ + ++conflicting; + if (conflicting > 4) + new_ex->start = req_start; + + /* If lock doesn't overlap new_ex, skip it. */ + if (!ldlm_extent_overlap(l_extent, new_ex)) + continue; + + /* Locks conflicting in requested extents and we can't satisfy + * both locks, so ignore it. Either we will ping-pong this + * extent (we would regardless of what extent we granted) or + * lock is unused and it shouldn't limit our extent growth. 
*/ + if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent)) + continue; + + /* We grow extents downwards only as far as they don't overlap + * with already-granted locks, on the assumption that clients + * will be writing beyond the initial requested end and would + * then need to enqueue a new lock beyond previous request. + * l_req_extent->end strictly < req_start, checked above. */ + if (l_extent->start < req_start && new_ex->start != req_start) { + if (l_extent->end >= req_start) + new_ex->start = req_start; + else + new_ex->start = min(l_extent->end+1, req_start); + } + + /* If we need to cancel this lock anyways because our request + * overlaps the granted lock, we grow up to its requested + * extent start instead of limiting this extent, assuming that + * clients are writing forwards and the lock had over grown + * its extent downwards before we enqueued our request. */ + if (l_extent->end > req_end) { + if (l_extent->start <= req_end) + new_ex->end = max(lock->l_req_extent.start - 1, + req_end); + else + new_ex->end = max(l_extent->start - 1, req_end); + } + } + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + + +/* In order to determine the largest possible extent we can grant, we need + * to scan all of the queues. */ +static void ldlm_extent_policy(struct ldlm_resource *res, + struct ldlm_lock *lock, __u64 *flags) +{ + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. + */ + return; + + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; + + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); + + if (new_ex.start != lock->l_policy_data.l_extent.start || + new_ex.end != lock->l_policy_data.l_extent.end) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } +} + +static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) +{ + struct ldlm_resource *res = lock->l_resource; + cfs_time_t now = cfs_time_current(); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) + return 1; + + CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); + if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) + res->lr_contention_time = now; + return cfs_time_before(now, cfs_time_add(res->lr_contention_time, + cfs_time_seconds(ldlm_res_to_ns(res)->ns_contention_time))); +} + +struct ldlm_extent_compat_args { + struct list_head *work_list; + struct ldlm_lock *lock; + enum ldlm_mode mode; + int *locks; + int *compat; +}; + +static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n, + void *data) +{ + struct ldlm_extent_compat_args *priv = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_extent *extent; + struct list_head *work_list = priv->work_list; + struct ldlm_lock *lock, *enq = priv->lock; + enum ldlm_mode mode = priv->mode; + int count = 0; + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + /* interval tree is for granted lock */ + LASSERTF(mode == lock->l_granted_mode, + 
"mode = %s, lock->l_granted_mode = %s\n", + ldlm_lockname[mode], + ldlm_lockname[lock->l_granted_mode]); + count++; + if (lock->l_blocking_ast && + lock->l_granted_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, enq, work_list); + } + + /* don't count conflicting glimpse locks */ + extent = ldlm_interval_extent(node); + if (!(mode == LCK_PR && + extent->start == 0 && extent->end == OBD_OBJECT_EOF)) + *priv->locks += count; + + if (priv->compat) + *priv->compat = 0; + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. + * + * \retval 0 if the lock is not compatible + * \retval 1 if the lock is compatible + * \retval 2 if \a req is a group lock and it is compatible and requires + * no further checking + * \retval negative error, such as EWOULDBLOCK for group locks + */ +static int +ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, + __u64 *flags, enum ldlm_error *err, + struct list_head *work_list, int *contended_locks) +{ + struct ldlm_resource *res = req->l_resource; + enum ldlm_mode req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_lock *lock; + int check_contention; + int compat = 1; + int scan = 0; + ENTRY; + + lockmode_verify(req_mode); + + /* Using interval tree for granted lock */ + if (queue == &res->lr_granted) { + struct ldlm_interval_tree *tree; + struct ldlm_extent_compat_args data = {.work_list = work_list, + .lock = req, + .locks = contended_locks, + .compat = &compat }; + struct interval_node_extent ex = { .start = req_start, + .end = req_end }; + int idx, rc; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + data.mode = tree->lit_mode; + if (lockmode_compat(req_mode, tree->lit_mode)) { + struct ldlm_interval *node; + struct ldlm_extent *extent; + + if (req_mode != LCK_GROUP) + continue; + + /* group lock, grant it immediately if + * compatible */ + node = to_ldlm_interval(tree->lit_root); + extent = ldlm_interval_extent(node); + if (req->l_policy_data.l_extent.gid == + extent->gid) + RETURN(2); + } + + if (tree->lit_mode == LCK_GROUP) { + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + compat = -EWOULDBLOCK; + goto destroylock; + } + + *flags |= LDLM_FL_NO_TIMEOUT; + if (!work_list) + RETURN(0); + + /* if work list is not NULL,add all + locks in the tree to work list */ + compat = 0; + interval_iterate(tree->lit_root, + ldlm_extent_compat_cb, &data); + continue; + } + + if (!work_list) { + rc = interval_is_overlapped(tree->lit_root,&ex); + if (rc) + RETURN(0); + } else { + interval_search(tree->lit_root, &ex, + ldlm_extent_compat_cb, &data); + if (!list_empty(work_list) && compat) + compat = 0; + } + } + } else { /* for waiting queue */ + list_for_each_entry(lock, queue, l_res_link) { + check_contention = 1; + + /* We stop walking the queue if we hit ourselves so + * we don't take conflicting locks enqueued after us + * into account, or we'd wait forever. */ + if (req == lock) + break; + + if (unlikely(scan)) { + /* We only get here if we are queuing GROUP lock + and met some incompatible one. 
The main idea of this + code is to insert GROUP lock past compatible GROUP + lock in the waiting queue or if there is not any, + then in front of first non-GROUP lock */ + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should + * be no more GROUP locks later on, queue in + * front of first non-GROUP lock */ + + ldlm_resource_insert_lock_after(lock, req); + list_del_init(&lock->l_res_link); + ldlm_resource_insert_lock_after(req, lock); + compat = 0; + break; + } + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* found it */ + ldlm_resource_insert_lock_after(lock, req); + compat = 0; + break; + } + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_req_mode, req_mode)) { + if (req_mode == LCK_PR && + ((lock->l_policy_data.l_extent.start <= + req->l_policy_data.l_extent.start) && + (lock->l_policy_data.l_extent.end >= + req->l_policy_data.l_extent.end))) { + /* If we met a PR lock just like us or + wider, and nobody down the list + conflicted with it, that means we + can skip processing of the rest of + the list and safely place ourselves + at the end of the list, or grant + (dependent if we met an conflicting + locks before in the list). In case + of 1st enqueue only we continue + traversing if there is something + conflicting down the list because + we need to make sure that something + is marked as AST_SENT as well, in + cse of empy worklist we would exit + on first conflict met. */ + /* There IS a case where such flag is + not set for a lock, yet it blocks + something. Luckily for us this is + only during destroy, so lock is + exclusive. So here we are safe */ + if (!ldlm_is_ast_sent(lock)) + RETURN(compat); + } + + /* non-group locks are compatible, overlap doesn't + matter */ + if (likely(req_mode != LCK_GROUP)) + continue; + + /* If we are trying to get a GROUP lock and there is + another one of this kind, we need to compare gid */ + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* If existing lock with matched gid is granted, + we grant new one too. */ + if (lock->l_req_mode == lock->l_granted_mode) + RETURN(2); + + /* Otherwise we are scanning queue of waiting + * locks and it means current request would + * block along with existing lock (that is + * already blocked. + * If we are in nonblocking mode - return + * immediately */ + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + compat = -EWOULDBLOCK; + goto destroylock; + } + /* If this group lock is compatible with another + * group lock on the waiting list, they must be + * together in the list, so they can be granted + * at the same time. Otherwise the later lock + * can get stuck behind another, incompatible, + * lock. */ + ldlm_resource_insert_lock_after(lock, req); + /* Because 'lock' is not granted, we can stop + * processing this queue and return immediately. + * There is no need to check the rest of the + * list. 
*/ + RETURN(0); + } + } + + if (unlikely(req_mode == LCK_GROUP && + (lock->l_req_mode != lock->l_granted_mode))) { + scan = 1; + compat = 0; + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should be no + more GROUP locks later on, queue in front of + first non-GROUP lock */ + + ldlm_resource_insert_lock_after(lock, req); + list_del_init(&lock->l_res_link); + ldlm_resource_insert_lock_after(req, lock); + break; + } + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* found it */ + ldlm_resource_insert_lock_after(lock, req); + break; + } + continue; + } + + if (unlikely(lock->l_req_mode == LCK_GROUP)) { + /* If compared lock is GROUP, then requested is PR/PW/ + * so this is not compatible; extent range does not + * matter */ + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + compat = -EWOULDBLOCK; + goto destroylock; + } else { + *flags |= LDLM_FL_NO_TIMEOUT; + } + } else if (lock->l_policy_data.l_extent.end < req_start || + lock->l_policy_data.l_extent.start > req_end) { + /* if a non group lock doesn't overlap skip it */ + continue; + } else if (lock->l_req_extent.end < req_start || + lock->l_req_extent.start > req_end) { + /* false contention, the requests doesn't really overlap */ + check_contention = 0; + } + + if (!work_list) + RETURN(0); + + /* don't count conflicting glimpse locks */ + if (lock->l_req_mode == LCK_PR && + lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + check_contention = 0; + + *contended_locks += check_contention; + + compat = 0; + if (lock->l_blocking_ast && + lock->l_req_mode != LCK_GROUP) + ldlm_add_ast_work_item(lock, req, work_list); + } + } + + if (ldlm_check_contention(req, *contended_locks) && + compat == 0 && + (*flags & LDLM_FL_DENY_ON_CONTENTION) && + req->l_req_mode != LCK_GROUP && + req_end - req_start <= + ldlm_res_to_ns(req->l_resource)->ns_max_nolock_size) + GOTO(destroylock, compat = -EUSERS); + + RETURN(compat); +destroylock: + list_del_init(&req->l_res_link); + ldlm_lock_destroy_nolock(req); + *err = compat; + RETURN(compat); +} + +/** + * This function refresh eviction timer for cancelled lock. + * \param[in] lock ldlm lock for refresh + * \param[in] arg ldlm prolong arguments, timeout, export, extent + * and counter are used + */ +void ldlm_lock_prolong_one(struct ldlm_lock *lock, + struct ldlm_prolong_args *arg) +{ + int timeout; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); + + if (arg->lpa_export != lock->l_export || + lock->l_flags & LDLM_FL_DESTROYED) + /* ignore unrelated locks */ + return; + + arg->lpa_locks_cnt++; + + if (!(lock->l_flags & LDLM_FL_AST_SENT)) + /* ignore locks not being cancelled */ + return; + + /* We are in the middle of the process - BL AST is sent, CANCEL + * is ahead. Take half of BL AT + IO AT process time. + */ + timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); + + LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout); + + arg->lpa_blocks_cnt++; + + /* OK. this is a possible lock the user holds doing I/O + * let's refresh eviction timer for it. 
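	 *
	 * For example (numbers for illustration only): with
	 * arg->lpa_timeout = 30s and ldlm_bl_timeout(lock) = 20s, the
	 * computation above gives a refreshed timeout of 30 + 20 / 2 = 40
	 * seconds.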
+ */ + ldlm_refresh_waiting_lock(lock, timeout); +} +EXPORT_SYMBOL(ldlm_lock_prolong_one); + +static enum interval_iter ldlm_resource_prolong_cb(struct interval_node *n, + void *data) +{ + struct ldlm_prolong_args *arg = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *lock; + + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + ldlm_lock_prolong_one(lock, arg); + } + + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Walk through granted tree and prolong locks if they overlaps extent. + * + * \param[in] arg prolong args + */ +void ldlm_resource_prolong(struct ldlm_prolong_args *arg) +{ + struct ldlm_interval_tree *tree; + struct ldlm_resource *res; + struct interval_node_extent ex = { .start = arg->lpa_extent.start, + .end = arg->lpa_extent.end }; + int idx; + + ENTRY; + + res = ldlm_resource_get(arg->lpa_export->exp_obd->obd_namespace, NULL, + &arg->lpa_resid, LDLM_EXTENT, 0); + if (IS_ERR(res)) { + CDEBUG(D_DLMTRACE, "Failed to get resource for resid %llu/%llu\n", + arg->lpa_resid.name[0], arg->lpa_resid.name[1]); + RETURN_EXIT; + } + + lock_res(res); + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + /* There is no possibility to check for the groupID + * so all the group locks are considered as valid + * here, especially because the client is supposed + * to check it has such a lock before sending an RPC. + */ + if (!(tree->lit_mode & arg->lpa_mode)) + continue; + + interval_search(tree->lit_root, &ex, + ldlm_resource_prolong_cb, arg); + } + + unlock_res(res); + ldlm_resource_putref(res); + + EXIT; +} +EXPORT_SYMBOL(ldlm_resource_prolong); + +/** + * Process a granting attempt for extent lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head rpc_list; + int rc, rc2; + int contended_locks = 0; + ENTRY; + + LASSERT(lock->l_granted_mode != lock->l_req_mode); + LASSERT(list_empty(&res->lr_converting)); + LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || + !ldlm_is_ast_discard_data(lock)); + INIT_LIST_HEAD(&rpc_list); + check_res_locked(res); + *err = ELDLM_OK; + + if (intention == LDLM_PROCESS_RESCAN) { + /* Careful observers will note that we don't handle -EWOULDBLOCK + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT). + * flags should always be zero here, and if that ever stops + * being true, we want to find out. 
*/ + LASSERT(*flags == 0); + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, + err, NULL, &contended_locks); + if (rc == 1) { + rc = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, err, NULL, + &contended_locks); + } + if (rc == 0) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) + ldlm_extent_policy(res, lock, flags); + ldlm_grant_lock(lock, work_list); + RETURN(LDLM_ITER_CONTINUE); + } + + LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || + (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); + restart: + contended_locks = 0; + rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, + &rpc_list, &contended_locks); + if (rc < 0) + GOTO(out_rpc_list, rc); + + rc2 = 0; + if (rc != 2) { + rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, + flags, err, &rpc_list, + &contended_locks); + if (rc2 < 0) + GOTO(out_rpc_list, rc = rc2); + } + + if (rc + rc2 != 2) { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to force + * client to wait for the lock endlessly once the lock is + * enqueued -bzzz */ + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, + LDLM_FL_NO_TIMEOUT); + if (rc == -ERESTART) + GOTO(restart, rc); + *err = rc; + } else { + ldlm_extent_policy(res, lock, flags); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, work_list); + rc = 0; + } + +out_rpc_list: + if (!list_empty(&rpc_list)) { + LASSERT(!ldlm_is_ast_discard_data(lock)); + ldlm_discard_bl_list(&rpc_list); + } + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +struct ldlm_kms_shift_args { + __u64 old_kms; + __u64 kms; + bool complete; +}; + +/* Callback for interval_iterate functions, used by ldlm_extent_shift_Kms */ +static enum interval_iter ldlm_kms_shift_cb(struct interval_node *n, + void *args) +{ + struct ldlm_kms_shift_args *arg = args; + struct ldlm_interval *node = to_ldlm_interval(n); + struct ldlm_lock *tmplock; + struct ldlm_lock *lock = NULL; + + ENTRY; + + /* Since all locks in an interval have the same extent, we can just + * use the first lock without kms_ignore set. */ + list_for_each_entry(tmplock, &node->li_group, l_sl_policy) { + if (ldlm_is_kms_ignore(tmplock)) + continue; + + lock = tmplock; + + break; + } + + /* No locks in this interval without kms_ignore set */ + if (!lock) + RETURN(INTERVAL_ITER_CONT); + + /* If we find a lock with a greater or equal kms, we are not the + * highest lock (or we share that distinction with another lock), and + * don't need to update KMS. Return old_kms and stop looking. */ + if (lock->l_policy_data.l_extent.end >= arg->old_kms) { + arg->kms = arg->old_kms; + arg->complete = true; + RETURN(INTERVAL_ITER_STOP); + } + + if (lock->l_policy_data.l_extent.end + 1 > arg->kms) + arg->kms = lock->l_policy_data.l_extent.end + 1; + + /* Since interval_iterate_reverse starts with the highest lock and + * works down, for PW locks, we only need to check if we should update + * the kms, then stop walking the tree. PR locks are not exclusive, so + * the highest start does not imply the highest end and we must + * continue. (Only one group lock is allowed per resource, so this is + * irrelevant for group locks.)*/ + if (lock->l_granted_mode == LCK_PW) + RETURN(INTERVAL_ITER_STOP); + else + RETURN(INTERVAL_ITER_CONT); +} + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value, updating + * it only if we were the highest lock. 
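 *
 * A worked example (illustrative): with granted locks covering [0, 4095],
 * [0, 8191] and [0, 16383], the KMS is 16384. Cancelling the [0, 16383]
 * lock leaves 8191 as the highest remaining end, so the KMS drops to
 * 8192; cancelling either of the smaller locks instead leaves it at
 * 16384.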
+ * + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval_tree *tree; + struct ldlm_kms_shift_args args; + int idx = 0; + + ENTRY; + + args.old_kms = old_kms; + args.kms = 0; + args.complete = false; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + ldlm_set_kms_ignore(lock); + + /* We iterate over the lock trees, looking for the largest kms smaller + * than the current one. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + + /* If our already known kms is >= than the highest 'end' in + * this tree, we don't need to check this tree, because + * the kms from a tree can be lower than in_max_high (due to + * kms_ignore), but it can never be higher. */ + if (!tree->lit_root || args.kms >= tree->lit_root->in_max_high) + continue; + + interval_iterate_reverse(tree->lit_root, ldlm_kms_shift_cb, + &args); + + /* this tells us we're not the highest lock, so we don't need + * to check the remaining trees */ + if (args.complete) + break; + } + + LASSERTF(args.kms <= args.old_kms, "kms %llu old_kms %llu\n", args.kms, + args.old_kms); + + RETURN(args.kms); +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + ENTRY; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) + RETURN(NULL); + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + RETURN(node); +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return list_empty(&n->li_group) ? n : NULL; +} + +static inline int ldlm_mode_to_index(enum ldlm_mode mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(is_power_of_2(mode)); + for (index = -1; mode != 0; index++, mode >>= 1) + /* do nothing */; + LASSERT(index < LCK_MODE_NUM); + return index; +} + +/** Add newly granted lock into interval tree for the resource. 
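 *
 * Each resource keeps one interval tree per lock mode;
 * ldlm_mode_to_index() turns the power-of-two granted mode into its bit
 * position (e.g. a mode of 0x4 lands in lr_itree[2]), so compatibility
 * checks can skip whole trees whose mode does not conflict with the
 * request.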
*/ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx, rc; + + LASSERT(lock->l_granted_mode == lock->l_req_mode); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + + rc = interval_set(&node->li_node, extent->start, extent->end); + LASSERT(!rc); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) { + struct ldlm_lock *lck; + + list_for_each_entry_reverse(lck, &res->lr_granted, + l_res_link) { + if (lck == lock) + continue; + if (lockmode_compat(lck->l_granted_mode, + lock->l_granted_mode)) + continue; + if (ldlm_extent_overlap(&lck->l_req_extent, + &lock->l_req_extent)) { + CDEBUG(D_ERROR, "granting conflicting lock %p " + "%p\n", lck, lock); + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + } + } +} + +/** Remove cancelled lock from resource interval tree. */ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = ldlm_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} + diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c new file mode 100644 index 0000000000000..b3d669799ceba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,950 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. + * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. 
+ */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline void ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + atomic_set(&req->l_policy_data.l_flock.blocking_refs, 0); + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode mode, __u64 flags) +{ + ENTRY; + + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: %#llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + need call the nolock version of ldlm_lock_decref_internal*/ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); + EXIT; +} + +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). + */ + +struct ldlm_flock_lookup_cb_data { + __u64 *bl_owner; + struct ldlm_lock *lock; + struct obd_export *exp; +}; + +static int ldlm_flock_lookup_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct ldlm_flock_lookup_cb_data *cb_data = data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct ldlm_lock *lock; + + lock = cfs_hash_lookup(exp->exp_flock_hash, cb_data->bl_owner); + if (lock == NULL) + return 0; + + /* Stop on first found lock. 
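A minimal usage sketch for the list_for_remaining_safe() helper defined above, the pattern ldlm_process_flock_lock() relies on further down: the cursor already points into the list, and the saved cursor 'n' keeps the walk valid while the current entry is unlinked. The payload struct and the deletion criterion are hypothetical; a Linux-style list.h is assumed.

#include <linux/list.h>

struct node {
	struct list_head link;
	int val;
};

/* Continue from 'pos' (already positioned inside 'head' by an earlier
 * scan) and unlink matching entries; deleting 'pos' is safe because the
 * macro captured its successor in 'n' before the body ran. */
static void drop_from(struct list_head *pos, struct list_head *head, int val)
{
	struct list_head *n;

	list_for_remaining_safe(pos, n, head) {
		struct node *e = list_entry(pos, struct node, link);

		if (e->val == val)
			list_del(&e->link);
	}
}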
Same process can't sleep twice */ + cb_data->lock = lock; + cb_data->exp = class_export_get(exp); + + return 1; +} + +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct ldlm_flock_lookup_cb_data cb_data = { + .bl_owner = &bl_owner, + .lock = NULL, + .exp = NULL }; + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + if (bl_exp->exp_flock_hash != NULL) { + cfs_hash_for_each_key(bl_exp->exp_obd->obd_nid_hash, + &bl_exp->exp_connection->c_peer.nid, + ldlm_flock_lookup_cb, &cb_data); + lock = cb_data.lock; + } + if (lock == NULL) + break; + + class_export_put(bl_exp); + bl_exp = cb_data.exp; + + LASSERT(req != lock); + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_exp->exp_failed) + break; + + if (bl_owner == req_owner && + (bl_exp->exp_connection->c_peer.nid == + req_exp->exp_connection->c_peer.nid)) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock, + struct list_head *work_list) +{ + CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock); + + if ((exp_connect_flags(lock->l_export) & + OBD_CONNECT_FLOCK_DEAD) == 0) { + CERROR("deadlock found, but client doesn't " + "support flock canceliation\n"); + } else { + LASSERT(lock->l_completion_ast); + LASSERT(!ldlm_is_ast_sent(lock)); + lock->l_flags |= LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK | + LDLM_FL_FLOCK_DEADLOCK; + ldlm_flock_blocking_unlink(lock); + ldlm_resource_unlink_lock(lock); + ldlm_add_ast_work_item(lock, NULL, work_list); + } +} + +/** + * Process a granting attempt for flock lock. + * Must be called under ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + enum ldlm_mode mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; + ENTRY; + + CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " + "%llu end %llu\n", *flags, + new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. 
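A simplified, self-contained model of the chain walk that ldlm_flock_deadlock() performs above: starting from the owner that blocks the new request, keep following "who is that owner itself waiting for" links; if the chain comes back to the requester, granting would close a cycle. The table lookup stands in for the per-export flock hash, and owners are plain integers here.

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct waiter {
	uint64_t owner;			/* owner of a blocked request */
	uint64_t blocking_owner;	/* owner it is waiting on */
};

static const struct waiter *find_waiter(const struct waiter *tbl, size_t n,
					uint64_t owner)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].owner == owner)
			return &tbl[i];
	return NULL;
}

/* Like the loop above, this assumes any pre-existing cycle among other
 * owners was already rejected when those requests were enqueued. */
static bool flock_deadlock(const struct waiter *tbl, size_t n,
			   uint64_t req_owner, uint64_t bl_owner)
{
	for (;;) {
		const struct waiter *w = find_waiter(tbl, n, bl_owner);

		if (!w)
			return false;	/* chain ended without reaching us */
		bl_owner = w->blocking_owner;
		if (bl_owner == req_owner)
			return true;	/* the requester closes the cycle */
	}
}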
*/ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } else { + int reprocess_failed = 0; + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (intention != LDLM_PROCESS_ENQUEUE) { + reprocess_failed = 1; + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_cancel_on_deadlock(req, + work_list); + RETURN(LDLM_ITER_CONTINUE); + } + continue; + } + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + RETURN(LDLM_ITER_STOP); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* add lock to blocking list before deadlock + * check to prevent race */ + ldlm_flock_blocking_link(req, lock); + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_blocking_unlink(req); + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + RETURN(LDLM_ITER_STOP); + } + + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(LDLM_ITER_STOP); + } + if (reprocess_failed) + RETURN(LDLM_ITER_CONTINUE); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. */ + ldlm_flock_blocking_unlink(req); + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. */ + + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. 
*/ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. 
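The overlap-or-adjoin test applied above is easy to get wrong at the ends of the offset space, so here is a standalone restatement with the same overflow/underflow guards (EOF_OFFSET is an illustrative stand-in for OBD_OBJECT_EOF):

#include <stdbool.h>
#include <stdint.h>

#define EOF_OFFSET UINT64_MAX	/* stand-in for OBD_OBJECT_EOF */

/* Two same-mode POSIX ranges can be coalesced when they overlap or touch.
 * The extra checks keep "end + 1" from wrapping when the existing range
 * runs to EOF and "start - 1" from wrapping when it starts at 0. */
static bool ranges_mergeable(uint64_t new_start, uint64_t new_end,
			     uint64_t old_start, uint64_t old_end)
{
	if (new_start > old_end + 1 && old_end != EOF_OFFSET)
		return false;	/* new begins well past the old range */
	if (new_end < old_start - 1 && old_start != 0)
		return false;	/* new ends well before the old range */
	return true;
}

With this rule, [0, 99] and [100, 199] coalesce into [0, 199], while [0, 99] and [101, 199] stay separate.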
*/ + if (new2 == NULL) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (IS_ERR(new2)) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = PTR_ERR(new2); + RETURN(LDLM_ITER_STOP); + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { +#ifdef HAVE_SERVER_SUPPORT + if (intention == LDLM_PROCESS_ENQUEUE) { + /* If this is an unlock, reprocess the waitq and + * send completions ASTs for locks that can now be + * granted. The only problem with doing this + * reprocessing here is that the completion ASTs for + * newly granted locks will be sent before the unlock + * completion is sent. It shouldn't be an issue. Also + * note that ldlm_process_flock_lock() will recurse, + * but only once because 'intention' won't be + * LDLM_PROCESS_ENQUEUE from ldlm_reprocess_queue. */ + if ((mode == LCK_NL) && overlaps) { + struct list_head rpc_list; + int rc; + + INIT_LIST_HEAD(&rpc_list); +restart: + ldlm_reprocess_queue(res, &res->lr_waiting, + &rpc_list, + LDLM_PROCESS_RESCAN); + + unlock_res_and_lock(req); + rc = ldlm_run_ast_work(ns, &rpc_list, + LDLM_WORK_CP_AST); + lock_res_and_lock(req); + if (rc == -ERESTART) + GOTO(restart, rc); + } + } else { + LASSERT(req->l_completion_ast); + ldlm_add_ast_work_item(req, NULL, work_list); + } +#else /* !HAVE_SERVER_SUPPORT */ + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); +#endif /* HAVE_SERVER_SUPPORT */ + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. 
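A small worked version of the split performed above when a request lands strictly inside an existing range: the new lock new2 keeps the low part and the existing lock is trimmed to the high part. Types are simplified, and the caller is assumed to have verified containment, as the surrounding checks do.

#include <stdint.h>

struct range { uint64_t start, end; };

/* Returns the low fragment and trims 'old' to the high fragment,
 * mirroring the new2/lock assignments above. */
static struct range split_low_part(struct range *old, const struct range *mid)
{
	struct range lo = { old->start, mid->start - 1 };

	old->start = mid->end + 1;
	return lo;
}

For instance, an existing lock on [100, 499] split around [200, 299] yields [100, 199] (the new2 side) and leaves the original lock covering [300, 499].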
*/ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + RETURN(LDLM_ITER_CONTINUE); +} + +struct ldlm_flock_wait_data { + struct ldlm_lock *fwd_lock; + int fwd_generation; +}; + +static void +ldlm_flock_interrupted_wait(void *data) +{ + struct ldlm_lock *lock; + ENTRY; + + lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock; + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being put on LRU list */ + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + + EXIT; +} + +/** + * Flock completion callback function. + * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct ldlm_flock_wait_data fwd; + struct l_wait_info lwi; + enum ldlm_error err; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAIL_LOC; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT3, 4); + } + CDEBUG(D_DLMTRACE, "flags: %#llx data: %p getlk: %p\n", + flags, data, getlk); + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (flags & LDLM_FL_FAILED) + goto granted; + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "sleeping"); + fwd.fwd_lock = lock; + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, there is no import */ + if (NULL != obd) + imp = obd->u.cli.cl_import; + + if (NULL != imp) { + spin_lock(&imp->imp_lock); + fwd.fwd_generation = imp->imp_generation; + spin_unlock(&imp->imp_lock); + } + + lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi); + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT4)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT4, 4); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT5)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FAIL_LOC | + LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT5, 4); + } + + lock_res_and_lock(lock); + + + /* Protect against race where lock could have been just destroyed + * due to overlap in ldlm_process_flock_lock(). + */ + if (ldlm_is_destroyed(lock)) { + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. 
*/ + RETURN(-EIO); + } + + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + ldlm_resource_unlink_lock(lock); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. */ + /* Do the same for DEADLOCK'ed locks. */ + if (ldlm_is_failed(lock) || ldlm_is_flock_deadlock(lock)) { + int mode; + + if (flags & LDLM_FL_TEST_LOCK) + LASSERT(ldlm_is_test_lock(lock)); + + if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) + mode = flock_type(getlk); + else + mode = lock->l_granted_mode; + + if (ldlm_is_flock_deadlock(lock)) { + LDLM_DEBUG(lock, "client-side enqueue deadlock " + "received"); + rc = -EDEADLK; + } + ldlm_flock_destroy(lock, mode, LDLM_FL_WAIT_NOREPROC); + unlock_res_and_lock(lock); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + + /* An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. */ + RETURN(rc ? : -EIO); + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + if (flags & LDLM_FL_TEST_LOCK) { + /* fcntl(F_GETLK) request */ + /* The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount.*/ + LASSERT(ldlm_is_test_lock(lock)); + ldlm_flock_destroy(lock, flock_type(getlk), + LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + flock_set_type(getlk, F_RDLCK); + break; + case LCK_PW: + flock_set_type(getlk, F_WRLCK); + break; + default: + flock_set_type(getlk, F_UNLCK); + } + flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid); + flock_set_start(getlk, + (loff_t)lock->l_policy_data.l_flock.start); + flock_set_end(getlk, + (loff_t)lock->l_policy_data.l_flock.end); + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + RETURN(0); +} + +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. 
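The F_GETLK reply path above reports the conflicting lock back to fcntl() by translating LDLM modes into POSIX lock types. A tiny standalone restatement of that mapping, with the enum acting as an illustrative stand-in for the LDLM mode values:

#include <fcntl.h>

enum mode { MODE_PR, MODE_PW, MODE_NONE };	/* illustrative stand-ins */

/* Read locks report F_RDLCK, write locks F_WRLCK; anything else means
 * "no conflicting lock" and reports F_UNLCK, as in the switch above. */
static int getlk_type(enum mode granted)
{
	switch (granted) {
	case MODE_PR: return F_RDLCK;
	case MODE_PW: return F_WRLCK;
	default:      return F_UNLCK;
	}
}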
+ */ +static unsigned +ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + atomic_inc(&flock->blocking_refs); +} + +static void +ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (atomic_dec_and_test(&flock->blocking_refs)) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + if( strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0) + RETURN(0); + + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + RETURN(-ENOMEM); + + RETURN(0); +} + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 0000000000000..90e34a612d7c8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_inodebits.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of IBITS lock type + * + * IBITS lock type contains a bit mask determining various properties of an + * object. The meanings of specific bits are specific to the caller and are + * opaque to LDLM code. + * + * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) + * are considered conflicting. See the lock mode compatibility matrix + * in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. + * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + * + * IBITS locks in granted queue are organized in bunches of + * same-mode/same-bits locks called "skip lists". The First lock in the + * bunch contains a pointer to the end of the bunch. This allows us to + * skip an entire bunch when iterating the list in search for conflicting + * locks if first lock of the bunch is not conflicting with us. + */ +static int +ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, + struct list_head *work_list) +{ + struct list_head *tmp; + struct ldlm_lock *lock; + __u64 req_bits = req->l_policy_data.l_inodebits.bits; + int compat = 1; + ENTRY; + + /* There is no sense in lock with no bits set, I think. + * Also, such a lock would be compatible with any other bit lock */ + LASSERT(req_bits != 0); + + list_for_each(tmp, queue) { + struct list_head *mode_tail; + + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + /* We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. */ + if (req == lock) + RETURN(compat); + + /* last lock in mode group */ + LASSERT(lock->l_sl_mode.prev != NULL); + mode_tail = &list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link; + + /* if reqest lock is not COS_INCOMPAT and COS is disabled, + * they are compatible, IOW this request is from a local + * transaction on a DNE system. */ + if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) && + !ldlm_is_cos_enabled(req)) { + /* jump to last lock in mode group */ + tmp = mode_tail; + continue; + } + + /* locks' mode are compatible, bits don't matter */ + if (lockmode_compat(lock->l_req_mode, req->l_req_mode)) { + /* jump to last lock in mode group */ + tmp = mode_tail; + continue; + } + + for (;;) { + struct list_head *head; + + /* Advance loop cursor to last lock in policy group. */ + tmp = &list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy)->l_res_link; + + /* Locks with overlapping bits conflict. 
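A standalone sketch of the IBITS conflict rule described here and applied just below: two inodebits locks can conflict only when their bit masks intersect and their modes are incompatible. The bit names and the two-level mode model are illustrative only, not taken from the patch.

#include <stdbool.h>
#include <stdint.h>

#define BIT_LOOKUP 0x01ULL	/* hypothetical bit assignments */
#define BIT_UPDATE 0x02ULL
#define BIT_LAYOUT 0x08ULL

struct ibits_lock {
	uint64_t bits;
	bool exclusive;		/* very rough stand-in for the lock mode */
};

static bool ibits_conflict(const struct ibits_lock *a, const struct ibits_lock *b)
{
	if (!(a->bits & b->bits))
		return false;			/* disjoint bits never conflict */
	return a->exclusive || b->exclusive;	/* shared/shared is compatible */
}

So {LOOKUP|UPDATE, exclusive} vs {LAYOUT, shared} is compatible, while {UPDATE, exclusive} vs {UPDATE, shared} is not.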
*/ + if (lock->l_policy_data.l_inodebits.bits & req_bits) { + /* COS lock mode has a special compatibility + * requirement: it is only compatible with + * locks from the same client. */ + if (lock->l_req_mode == LCK_COS && + !ldlm_is_cos_incompat(req) && + ldlm_is_cos_enabled(req) && + lock->l_client_cookie == req->l_client_cookie) + goto not_conflicting; + /* Found a conflicting policy group. */ + if (!work_list) + RETURN(0); + + compat = 0; + + /* Add locks of the policy group to @work_list + * as blocking locks for @req */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + head = &lock->l_sl_policy; + list_for_each_entry(lock, head, l_sl_policy) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + not_conflicting: + if (tmp == mode_tail) + break; + + tmp = tmp->next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* Loop over policy groups within one mode group. */ + } /* Loop over mode groups within @queue. */ + + RETURN(compat); +} + +/** + * Process a granting attempt for IBITS lock. + * Must be called with ns lock held + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head rpc_list; + int rc; + ENTRY; + + LASSERT(lock->l_granted_mode != lock->l_req_mode); + LASSERT(list_empty(&res->lr_converting)); + INIT_LIST_HEAD(&rpc_list); + check_res_locked(res); + + /* (*flags & LDLM_FL_BLOCK_NOWAIT) is for layout lock right now. */ + if (intention == LDLM_PROCESS_RESCAN || + (*flags & LDLM_FL_BLOCK_NOWAIT)) { + *err = ELDLM_LOCK_ABORTED; + if (*flags & LDLM_FL_BLOCK_NOWAIT) + *err = ELDLM_LOCK_WOULDBLOCK; + + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, work_list); + + *err = ELDLM_OK; + RETURN(LDLM_ITER_CONTINUE); + } + + LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || + (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); + restart: + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, &rpc_list); + rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, &rpc_list); + + if (rc != 2) { + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); + if (rc == -ERESTART) + GOTO(restart, rc); + *err = rc; + } else { + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, work_list); + rc = 0; + } + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; +} + +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h new file mode 100644 index 0000000000000..83cd89e5960fe --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -0,0 +1,415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define MAX_STRING_SIZE 128 + +extern int ldlm_srv_namespace_nr; +extern int ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_active_namespace_list; +extern struct list_head ldlm_cli_inactive_namespace_list; +extern unsigned int ldlm_cancel_unused_locks_before_replay; + +static inline int ldlm_namespace_nr_read(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; +} + +static inline void ldlm_namespace_nr_inc(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr++; + else + ldlm_cli_namespace_nr++; +} + +static inline void ldlm_namespace_nr_dec(enum ldlm_side client) +{ + if (client == LDLM_NAMESPACE_SERVER) + ldlm_srv_namespace_nr--; + else + ldlm_cli_namespace_nr--; +} + +static inline struct list_head *ldlm_namespace_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; +} + +static inline +struct list_head *ldlm_namespace_inactive_list(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(enum ldlm_side client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ns_bref is the number of resources in this namespace */ +static inline int ldlm_ns_empty(struct ldlm_namespace *ns) +{ + return atomic_read(&ns->ns_bref) == 0; +} + +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, + enum ldlm_side); +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *, + enum ldlm_side); +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); + +/* ldlm_request.c */ +/* Cancel lru flag, it indicates we cancel aged locks. 
*/ +enum ldlm_lru_flags { + LDLM_LRU_FLAG_AGED = 0x01, /* Cancel aged locks (non LRU resize) */ + LDLM_LRU_FLAG_PASSED = 0x02, /* Cancel passed number of locks */ + LDLM_LRU_FLAG_SHRINK = 0x04, /* Cancel locks from shrinker */ + LDLM_LRU_FLAG_LRUR = 0x08, /* Cancel locks from lru resize */ + LDLM_LRU_FLAG_NO_WAIT = 0x10, /* Cancel locks w/o blocking (neither + * sending nor waiting for any RPCs) */ + LDLM_LRU_FLAG_CLEANUP = 0x20, /* Used when clearing lru, tells + * prepare_lru_list to set discard flag + * on PR extent locks so we don't waste + * time saving pages that will be + * discarded momentarily */ +}; + +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int count, int max, + enum ldlm_cancel_flags cancel_flags, + enum ldlm_lru_flags lru_flags); +extern unsigned int ldlm_enqueue_min; +/* ldlm_resource.c */ +extern struct kmem_cache *ldlm_resource_slab; +extern struct kmem_cache *ldlm_lock_slab; +extern struct kmem_cache *ldlm_interval_tree_slab; + +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); + +/* ldlm_lock.c */ + +typedef enum { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +} ldlm_desc_ast_t; + +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + enum ldlm_type type, enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, + void *cookie, __u64 *flags); +void ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +#ifdef HAVE_SERVER_SUPPORT +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention); +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list, __u64 grant_flags); +void ldlm_discard_bl_list(struct list_head *bl_list); +#endif +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); +#define ldlm_lock_remove_from_lru(lock) \ + ldlm_lock_remove_from_lru_check(lock, ktime_set(0, 0)) +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +int ldlm_export_cancel_blocked_locks(struct obd_export *exp); +int ldlm_export_cancel_locks(struct obd_export *exp); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + 
struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags); +int ldlm_bl_thread_wakeup(void); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + +#ifdef HAVE_SERVER_SUPPORT +/* ldlm_plain.c */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); + +/* ldlm_inodebits.c */ +int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, + struct list_head *work_list); +/* ldlm_extent.c */ +int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); +#endif +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern struct proc_dir_entry *ldlm_svc_proc_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ptlrpc_connection *ldlm_server_conn; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. 
*/ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); +extern void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +#define LDLM_POOL_SYSFS_PRINT_int(v) sprintf(buf, "%d\n", v) +#define LDLM_POOL_SYSFS_SET_int(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_u64(v) sprintf(buf, "%lld\n", v) +#define LDLM_POOL_SYSFS_SET_u64(a, b) { a = b; } +#define LDLM_POOL_SYSFS_PRINT_atomic(v) sprintf(buf, "%d\n", atomic_read(&v)) +#define LDLM_POOL_SYSFS_SET_atomic(a, b) atomic_set(&a, b) + +#define LDLM_POOL_SYSFS_READER_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(tmp); \ + } \ + struct __##var##__dummy_read {;} /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + unsigned long count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + spin_unlock(&pl->pl_lock); \ + \ + return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(var, type) \ + static ssize_t var##_show(struct kobject *kobj, \ + struct attribute *attr, \ + char *buf) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + \ + return LDLM_POOL_SYSFS_PRINT_##type(pl->pl_##var); \ + } \ + struct __##var##__dummy_read {; } /* semicolon catcher */ + +#define LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(var, type) \ + static ssize_t var##_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buffer, \ + unsigned long count) \ + { \ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ + pl_kobj); \ + unsigned long tmp; \ + int rc; \ + \ + rc = kstrtoul(buffer, 10, &tmp); \ + if (rc < 0) { \ + return rc; \ + } \ + \ + LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ + \ + return count; \ + } \ + struct __##var##__dummy_write {; } /* semicolon catcher */ + +static inline void +ldlm_add_var(struct lprocfs_vars *vars, struct proc_dir_entry *proc_dir, + const char *name, void *data, const struct file_operations *ops) +{ + snprintf((char *)vars->name, MAX_STRING_SIZE, "%s", name); + vars->data = data; + vars->fops = ops; + lprocfs_add_vars(proc_dir, vars, NULL); +} + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + 
+ lock_res_and_lock(lock); + if ((lock->l_req_mode == lock->l_granted_mode) && + !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + unlock_res_and_lock(lock); + + return ret; +} + +static inline bool is_bl_done(struct ldlm_lock *lock) +{ + bool bl_done = true; + + if (!ldlm_is_bl_done(lock)) { + lock_res_and_lock(lock); + bl_done = ldlm_is_bl_done(lock); + unlock_res_and_lock(lock); + } + + return bl_done; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, + union ldlm_policy_data *); +typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, + union ldlm_wire_policy_data *); +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); +void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy); +void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy); + +/* ldlm_reclaim.c */ +#ifdef HAVE_SERVER_SUPPORT +extern __u64 ldlm_reclaim_threshold; +extern __u64 ldlm_lock_limit; +extern __u64 ldlm_reclaim_threshold_mb; +extern __u64 ldlm_lock_limit_mb; +extern struct percpu_counter ldlm_granted_total; +#endif +int ldlm_reclaim_setup(void); +void ldlm_reclaim_cleanup(void); +void ldlm_reclaim_add(struct ldlm_lock *lock); +void ldlm_reclaim_del(struct ldlm_lock *lock); +bool ldlm_reclaim_full(void); + +static inline bool ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return memcmp(res0, res1, sizeof(*res0)) == 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c new file mode 100644 index 0000000000000..3836f99d01aaf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,3272 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +/* @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + lnet_nid_t nid4refnet = LNET_NID_ANY; + int rc = 0; + ENTRY; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + RETURN(-EINVAL); + } + + if (imp->imp_connection && + imp->imp_connection->c_remote_uuid.uuid[0] == 0) + /* nid4refnet is used to restrict network connections */ + nid4refnet = imp->imp_connection->c_self; + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, nid4refnet); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + RETURN(-ENOENT); + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) + GOTO(out_put, rc = -ENOMEM); + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_del(&item->oic_item); + list_add(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? ", moved to head" : "")); + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = 0); + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? 
"head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = -ENOENT); + } + + spin_unlock(&imp->imp_lock); + RETURN(0); +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + RETURN(rc); +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + ENTRY; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + GOTO(out, rc); + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + GOTO(out, rc = -EBUSY); + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + + if (dlmexp != NULL) + class_export_put(dlmexp); + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. + */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + ENTRY; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. */ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* Configure an RPC client OBD device. 
+ * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + * 4 - restrictive net + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obddev->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + char *name = obddev->obd_type->typ_name; + enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN; + char *cli_name = lustre_cfg_buf(lcfg, 0); + struct ptlrpc_connection fake_conn = { .c_self = 0, + .c_remote_uuid.uuid[0] = 0 }; + int rc; + ENTRY; + + /* In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. */ + if (!strcmp(name, LUSTRE_OSC_NAME)) { + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME)) { + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + if (is_lwp_on_ost(cli_name)) + cli->cl_sp_me = LUSTRE_SP_OST; + else if (is_lwp_on_mdt(cli_name)) + cli->cl_sp_me = LUSTRE_SP_MDT; + else + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_OSP_NAME)) { + if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) { + /* OSP_on_MDT for other MDTs */ + connect_op = MDS_CONNECT; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + rq_portal = OUT_PORTAL; + } else { + /* OSP on MDT for OST */ + connect_op = OST_CONNECT; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + rq_portal = OST_REQUEST_PORTAL; + } + rp_portal = OSC_REPLY_PORTAL; + cli->cl_sp_me = LUSTRE_SP_MDT; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + init_rwsem(&cli->cl_sem); + mutex_init(&cli->cl_mgc_mutex); + cli->cl_seq = NULL; + init_rwsem(&cli->cl_seq_rwsem); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty_pages = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ + /* cl_dirty_max_pages may be changed at connect time in + * ptlrpc_connect_interpret(). 
*/ + client_adjust_max_dirty(cli); + INIT_LIST_HEAD(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + spin_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. */ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_long_set(&cli->cl_lru_busy, 0); + atomic_long_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + spin_lock_init(&cli->cl_lru_list_lock); + atomic_long_set(&cli->cl_unstable_count, 0); + INIT_LIST_HEAD(&cli->cl_shrink_list); + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); +#ifdef ENABLE_CHECKSUM + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; +#endif + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* Set it to possible maximum size. It may be reduced by ocd_brw_size + * from OFD after connecting. */ + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + + /* set cl_chunkbits default value to PAGE_SHIFT, + * it will be updated at OSC connection time. 
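+ * (with 4 KiB pages PAGE_SHIFT is 12, i.e. an initial chunk size of 4 KiB)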
*/ + cli->cl_chunkbits = PAGE_SHIFT; + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obddev->obd_name)) + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; + else + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; + } + + spin_lock_init(&cli->cl_mod_rpcs_lock); + spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); + cli->cl_max_mod_rpcs_in_flight = 0; + cli->cl_mod_rpcs_in_flight = 0; + cli->cl_close_rpcs_in_flight = 0; + init_waitqueue_head(&cli->cl_mod_rpcs_waitq); + cli->cl_mod_tag_bitmap = NULL; + + INIT_LIST_HEAD(&cli->cl_chg_dev_linkage); + + if (connect_op == MDS_CONNECT) { + cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1; + OBD_ALLOC(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + if (cli->cl_mod_tag_bitmap == NULL) + GOTO(err, rc = -ENOMEM); + } + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + GOTO(err, rc); + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obddev->obd_ldlm_client); + + imp = class_new_import(obddev); + if (imp == NULL) + GOTO(err_ldlm, rc = -ENOENT); + imp->imp_client = &obddev->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + if (lustre_cfg_buf(lcfg, 4)) { + __u32 refnet = libcfs_str2net(lustre_cfg_string(lcfg, 4)); + + if (refnet == LNET_NIDNET(LNET_NID_ANY)) { + rc = -EINVAL; + CERROR("%s: bad mount option 'network=%s': rc = %d\n", + obddev->obd_name, lustre_cfg_string(lcfg, 4), + rc); + GOTO(err_import, rc); + } + fake_conn.c_self = LNET_MKNID(refnet, 0); + imp->imp_connection = &fake_conn; + } + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + GOTO(err_import, rc); + } + imp->imp_connection = NULL; + + cli->cl_import = imp; + /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obddev->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (obddev->obd_namespace == NULL) { + CERROR("Unable to create client namespace - %s\n", + obddev->obd_name); + GOTO(err_import, rc = -ENOMEM); + } + + RETURN(rc); + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + if (cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + RETURN(rc); + +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obddev) +{ + struct client_obd *cli = &obddev->u.cli; + ENTRY; + + ldlm_namespace_free_post(obddev->obd_namespace); + obddev->obd_namespace = NULL; + + obd_cleanup_client_import(obddev); + LASSERT(obddev->u.cli.cl_import == NULL); + + ldlm_put_ref(); + + if 
(cli->cl_mod_tag_bitmap != NULL) + OBD_FREE(cli->cl_mod_tag_bitmap, + BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long)); + cli->cl_mod_tag_bitmap = NULL; + + RETURN(0); +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + ENTRY; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0) + GOTO(out_sem, rc = -EALREADY); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + GOTO(out_sem, rc); + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + GOTO(out_ldlm, rc); + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + imp->imp_connect_flags2_orig = data->ocd_connect_flags2; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); + GOTO(out_ldlm, rc); + } + LASSERT(*exp != NULL && (*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + } + + ptlrpc_pinger_add_import(imp); + + EXIT; + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + ENTRY; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + RETURN(-EINVAL); + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name, + cli->cl_conn_count); + + if (cli->cl_conn_count == 0) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + GOTO(out_disconnect, rc = -EINVAL); + } + + cli->cl_conn_count--; + if (cli->cl_conn_count != 0) + GOTO(out_disconnect, rc = 0); + + /* Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force); + } + + /* There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. 
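+ * The code below therefore drops cl_sem around ptlrpc_disconnect_import()
+ * and re-takes it before invalidating the import.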
*/ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + + EXIT; + +out_disconnect: + /* Use server style - class_disconnect should be always called for + * o_disconnect. */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_disconnect_export); + +#ifdef HAVE_SERVER_SUPPORT +int server_disconnect_export(struct obd_export *exp) +{ + int rc; + ENTRY; + + /* Disconnect early so that clients can't keep using export. */ + rc = class_disconnect(exp); + /* Close import to avoid sending any requests. */ + if (exp->exp_imp_reverse) + ptlrpc_cleanup_imp(exp->exp_imp_reverse); + + ldlm_bl_thread_wakeup(); + + /* complete all outstanding replies */ + spin_lock(&exp->exp_lock); + while (!list_empty(&exp->exp_outstanding_replies)) { + struct ptlrpc_reply_state *rs = + list_entry(exp->exp_outstanding_replies.next, + struct ptlrpc_reply_state, rs_exp_list); + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + + list_del_init(&rs->rs_exp_list); + + spin_lock(&rs->rs_lock); + /* clear rs_convert_lock to make sure rs is handled and put */ + rs->rs_convert_lock = 0; + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + + spin_unlock(&svcpt->scp_rep_lock); + } + spin_unlock(&exp->exp_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(server_disconnect_export); + +/* -------------------------------------------------------------------------- + * from old lib/target.c + * -------------------------------------------------------------------------- */ + +static int target_handle_reconnect(struct lustre_handle *conn, + struct obd_export *exp, + struct obd_uuid *cluuid) +{ + struct obd_device *target; + struct lustre_handle *hdl; + cfs_time_t now; + cfs_time_t deadline; + int timeout; + int rc = 0; + ENTRY; + + hdl = &exp->exp_imp_reverse->imp_remote_handle; + if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { + conn->cookie = exp->exp_handle.h_cookie; + CDEBUG(D_HA, "connect export for UUID '%s' at %p," + " cookie %#llx\n", cluuid->uuid, exp, conn->cookie); + RETURN(0); + } + + target = exp->exp_obd; + + /* Might be a re-connect after a partition. */ + if (memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { + LCONSOLE_WARN("%s: already connected client %s (at %s) " + "with handle %#llx. Rejecting client " + "with the same UUID trying to reconnect " + "with handle %#llx\n", target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + hdl->cookie, conn->cookie); + memset(conn, 0, sizeof *conn); + /* target_handle_connect() treats EALREADY and + * -EALREADY differently. -EALREADY is an error + * (same UUID, different handle). 
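+ * The positive EALREADY returned at out_already below means a valid
+ * reconnect from the same client instead.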
*/ + RETURN(-EALREADY); + } + + if (!target->obd_recovering) { + LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n", + target->obd_name, obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + GOTO(out_already, rc); + } + + now = cfs_time_current(); + deadline = target->obd_recovery_timer.expires; + if (cfs_time_before(now, deadline)) { + struct target_distribute_txn_data *tdtd = + class_exp2tgt(exp)->lut_tdtd; + int size = 0; + int count = 0; + char *buf = NULL; + + timeout = cfs_duration_sec(cfs_time_sub(deadline, now)); + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + + if (count > 0) + LCONSOLE_WARN("%s: Recovery already passed deadline " + "%d:%.02d. It is due to DNE recovery " + "failed/stuck on the %d MDT(s):%s. " + "Please wait until all MDTs recovered " + "or abort the recovery by force.\n", + target->obd_name, timeout / 60, + timeout % 60, count, + buf ? buf : "unknown (not enough RAM)"); + else + LCONSOLE_WARN("%s: Recovery already passed deadline " + "%d:%.02d. If you do not want to wait " + "more, please abort the recovery by " + "force.\n", target->obd_name, + timeout / 60, timeout % 60); + + if (buf != NULL) + OBD_FREE(buf, size); + } else { + timeout = cfs_duration_sec(cfs_time_sub(now, deadline)); + LCONSOLE_WARN("%s: Recovery already passed deadline" + " %d:%.02d, It is most likely due to DNE" + " recovery is failed or stuck, please wait a" + " few more minutes or abort the recovery.\n", + target->obd_name, timeout / 60, timeout % 60); + } + +out_already: + conn->cookie = exp->exp_handle.h_cookie; + /* target_handle_connect() treats EALREADY and + * -EALREADY differently. EALREADY means we are + * doing a valid reconnect from the same client. */ + RETURN(EALREADY); +} + +static void +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, int new_client); + +/** + * update flags for import during reconnect process + */ +static int rev_import_flags_update(struct obd_import *revimp, + struct ptlrpc_request *req) +{ + int rc; + struct obd_connect_data *data; + + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + + if (data->ocd_connect_flags & OBD_CONNECT_AT) + revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + + rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr); + if (rc) { + CERROR("%s: cannot get reverse import %s security: rc = %d\n", + revimp->imp_client->cli_name, + libcfs_id2str(req->rq_peer), rc); + return rc; + } + + return 0; +} + +/** + * Allocate a new reverse import for an export. + * + * \retval -errno in case error hit + * \retval 0 if reverse import correctly init + **/ +int rev_import_init(struct obd_export *export) +{ + struct obd_device *obd = export->exp_obd; + struct obd_import *revimp; + + LASSERT(export->exp_imp_reverse == NULL); + + revimp = class_new_import(obd); + if (revimp == NULL) + return -ENOMEM; + + revimp->imp_remote_handle.cookie = 0ULL; + revimp->imp_client = &obd->obd_ldlm_client; + revimp->imp_dlm_fake = 1; + + /* it is safe to connect import in new state as no sends possible */ + spin_lock(&export->exp_lock); + export->exp_imp_reverse = revimp; + spin_unlock(&export->exp_lock); + class_import_put(revimp); + + return 0; +} +EXPORT_SYMBOL(rev_import_init); + +/** + * Handle reconnect for an export. 
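+ * Takes the new remote handle from the reconnect request and the connection
+ * from the export, refreshes the reverse import security flavor via
+ * rev_import_flags_update() and resends queued RPCs through the import
+ * recovery state machine.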
+ * + * \param exp export to handle reconnect process + * \param req client reconnect request + * + * \retval -rc in case securitfy flavor can't be changed + * \retval 0 in case none problems + */ +static int rev_import_reconnect(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct obd_import *revimp = exp->exp_imp_reverse; + struct lustre_handle *lh; + int rc; + + /* avoid sending a request until import flags are changed */ + ptlrpc_import_enter_resend(revimp); + + if (revimp->imp_connection != NULL) + ptlrpc_connection_put(revimp->imp_connection); + + /* + * client from recovery don't have a handle so we need to take from + * request. it may produce situation when wrong client connected + * to recovery as we trust a client uuid + */ + lh = req_capsule_client_get(&req->rq_pill, &RMF_CONN); + revimp->imp_remote_handle = *lh; + + /* unknown versions will be caught in + * ptlrpc_handle_server_req_in->lustre_unpack_msg() */ + revimp->imp_msg_magic = req->rq_reqmsg->lm_magic; + + revimp->imp_connection = ptlrpc_connection_addref(exp->exp_connection); + + rc = rev_import_flags_update(revimp, req); + if (rc != 0) { + /* it is safe to still be in RECOVERY phase as we are not able + * to setup correct security flavor so requests are not able to + * be delivered correctly */ + return rc; + } + + /* resend all rpc's via new connection */ + return ptlrpc_import_recovery_state_machine(revimp); +} + +int target_handle_connect(struct ptlrpc_request *req) +{ + struct obd_device *target = NULL; + struct obd_export *export = NULL; + /* connect handle - filled from target_handle_reconnect in + * reconnect case */ + struct lustre_handle conn; + struct lustre_handle *tmp; + struct obd_uuid cluuid; + char *str; + int rc = 0; + char *target_start; + int target_len; + bool mds_conn = false, lw_client = false, initial_conn = false; + bool mds_mds_conn = false; + bool new_mds_mds_conn = false; + struct obd_connect_data *data, *tmpdata; + int size, tmpsize; + lnet_nid_t *client_nid = NULL; + ENTRY; + + OBD_RACE(OBD_FAIL_TGT_CONN_RACE); + + str = req_capsule_client_get(&req->rq_pill, &RMF_TGTUUID); + if (str == NULL) { + DEBUG_REQ(D_ERROR, req, "bad target UUID for connect"); + GOTO(out, rc = -EINVAL); + } + + target = class_dev_by_str(str); + if (!target) { + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " + "from %s (no target). If you are running " + "an HA pair check that the target is " + "mounted on the other server.\n", str, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -ENODEV); + } + + spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + + if (target->obd_stopping || !target->obd_set_up) { + spin_unlock(&target->obd_dev_lock); + + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_INFO("%.*s: Not available for connect from %s (%s)\n", + target_len, target_start, + libcfs_nid2str(req->rq_peer.nid), + (target->obd_stopping ? 
+ "stopping" : "not set up")); + GOTO(out, rc = -ENODEV); + } + + if (target->obd_no_conn) { + spin_unlock(&target->obd_dev_lock); + + CDEBUG(D_INFO, "%s: Temporarily refusing client connection " + "from %s\n", target->obd_name, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EAGAIN); + } + + spin_unlock(&target->obd_dev_lock); + + str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); + if (str == NULL) { + DEBUG_REQ(D_ERROR, req, "bad client UUID for connect"); + GOTO(out, rc = -EINVAL); + } + + obd_str2uuid(&cluuid, str); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN); + if (tmp == NULL) + GOTO(out, rc = -EPROTO); + + conn = *tmp; + + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* Don't allow clients to connect that are using old 1.8 format + * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18, + * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc). The + * FULL20 flag should be set on all connections since 2.0, but no + * longer affects behaviour. + * + * Later this check will be disabled and the flag can be retired + * completely once interop with 3.0 is no longer needed. + */ + if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) + GOTO(out, rc = -EPROTO); +#endif + + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + if (data->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_ALLOWED_OFFSET || + data->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_ALLOWED_OFFSET) { + DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) " + "libclient connection attempt", + data->ocd_version < LUSTRE_VERSION_CODE ? + "old" : "new", + OBD_OCD_VERSION_MAJOR(data->ocd_version), + OBD_OCD_VERSION_MINOR(data->ocd_version), + OBD_OCD_VERSION_PATCH(data->ocd_version), + OBD_OCD_VERSION_FIX(data->ocd_version)); + data = req_capsule_server_sized_get(&req->rq_pill, + &RMF_CONNECT_DATA, + offsetof(typeof(*data), ocd_version) + + sizeof(data->ocd_version)); + if (data) { + data->ocd_connect_flags = OBD_CONNECT_VERSION; + data->ocd_version = LUSTRE_VERSION_CODE; + } + GOTO(out, rc = -EPROTO); + } + } + + /* Note: lw_client is needed in MDS-MDS failover during update log + * processing, so we needs to allow lw_client to be connected at + * anytime, instead of only the initial connection */ + lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0; + + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { + initial_conn = true; + mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0; + mds_mds_conn = (data->ocd_connect_flags & + OBD_CONNECT_MDS_MDS) != 0; + + /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS + * for Imperative Recovery connection from MGC to MGS. + * + * Via check OBD_CONNECT_FID, we can distinguish whether + * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from + * MGC or MDT. 
*/ + if (!lw_client && + (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + (data->ocd_connect_flags & OBD_CONNECT_FID) && + (data->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version); + + /* We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the " + "connection from different version MDT " + "(%d.%d.%d.%d) %s %s\n", + target->obd_name, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX, + major, minor, patch, + OBD_OCD_VERSION_FIX(data->ocd_version), + libcfs_nid2str(req->rq_peer.nid), str); + + GOTO(out, rc = -EPROTO); + } + } + } + + /* lctl gets a backstage, all-access pass. */ + if (obd_uuid_equals(&cluuid, &target->obd_uuid)) + goto dont_check_exports; + + export = cfs_hash_lookup(target->obd_uuid_hash, &cluuid); + if (!export) + goto no_export; + + /* We've found an export in the hash. */ + + spin_lock(&export->exp_lock); + + if (export->exp_connecting) { /* bug 9635, et. al. */ + spin_unlock(&export->exp_lock); + LCONSOLE_WARN("%s: Export %p already connecting from %s\n", + export->exp_obd->obd_name, export, + libcfs_nid2str(req->rq_peer.nid)); + class_export_put(export); + export = NULL; + rc = -EALREADY; + } else if ((mds_conn || (lw_client && initial_conn) || + data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + export->exp_connection != NULL) { + spin_unlock(&export->exp_lock); + if (req->rq_peer.nid != export->exp_connection->c_peer.nid) { + /* MDS or LWP reconnected after failover. */ + LCONSOLE_WARN("%s: Received %s connection from " + "%s, removing former export from %s\n", + target->obd_name, mds_conn ? "MDS" : "LWP", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); + } else { + /* New MDS connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from " + "%s, removing former export from same NID\n", + target->obd_name, mds_conn ? "MDS" : "LWP", + libcfs_nid2str(req->rq_peer.nid)); + } + + if (req->rq_peer.nid == export->exp_connection->c_peer.nid && + data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) { + /* Because exports between MDTs will always be + * kept, let's do not fail such export if they + * come from the same NID, otherwise it might + * cause eviction between MDTs, which might + * cause namespace inconsistency */ + spin_lock(&export->exp_lock); + export->exp_connecting = 1; + spin_unlock(&export->exp_lock); + conn.cookie = export->exp_handle.h_cookie; + rc = EALREADY; + } else { + class_fail_export(export); + class_export_put(export); + export = NULL; + rc = 0; + } + } else if (export->exp_connection != NULL && initial_conn && + req->rq_peer.nid != export->exp_connection->c_peer.nid) { + spin_unlock(&export->exp_lock); + /* In MDS failover we have static UUID but NID can change. 
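+ * Such a reconnect from a new NID during an initial connect is refused
+ * with -EALREADY below rather than evicting the existing export.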
*/ + LCONSOLE_WARN("%s: Client %s seen on new nid %s when " + "existing nid %s is already connected\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str( + export->exp_connection->c_peer.nid)); + rc = -EALREADY; + class_export_put(export); + export = NULL; + } else { + export->exp_connecting = 1; + spin_unlock(&export->exp_lock); + LASSERT(export->exp_obd == target); + + rc = target_handle_reconnect(&conn, export, &cluuid); + } + + /* If we found an export, we already unlocked. */ + if (!export) { +no_export: + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); + } else if (req->rq_export == NULL && + atomic_read(&export->exp_rpc_count) > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) refused connection, " + "still busy with %d references\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + atomic_read(&export->exp_refcount)); + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } + + if (rc < 0) { + GOTO(out, rc); + } + + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, (long)cfs_time_current_sec(), + export ? (long)export->exp_last_request_time : 0); + + /* If this is the first time a client connects, reset the recovery + * timer. Discard lightweight connections which might be local. */ + if (!lw_client && rc == 0 && target->obd_recovering) + check_and_start_recovery_timer(target, req, export == NULL); + + /* We want to handle EALREADY but *not* -EALREADY from + * target_handle_reconnect(), return reconnection state in a flag. */ + if (rc == EALREADY) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } else { + LASSERT(rc == 0); + } + + /* Tell the client if we support replayable requests. 
*/ + if (target->obd_replayable) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); + client_nid = &req->rq_peer.nid; + + if (export == NULL) { + /* allow lightweight connections during recovery */ + /* allow "new" MDT to be connected during recovery, since we + * need retrieve recovery update records from it */ + if (target->obd_recovering && !lw_client && !mds_mds_conn) { + cfs_time_t t; + int c; /* connected */ + int i; /* in progress */ + int k; /* known */ + int s; /* stale/evicted */ + + c = atomic_read(&target->obd_connected_clients); + i = atomic_read(&target->obd_lock_replay_clients); + k = target->obd_max_recoverable_clients; + s = target->obd_stale_clients; + t = target->obd_recovery_timer.expires; + t = cfs_time_sub(t, cfs_time_current()); + t = cfs_duration_sec(t); + LCONSOLE_WARN("%s: Denying connection for new client %s" + "(at %s), waiting for %d known clients " + "(%d recovered, %d in progress, and %d " + "evicted) to recover in %d:%.02d\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), k, + c - i, i, s, (int)t / 60, + (int)t % 60); + rc = -EBUSY; + } else { +dont_check_exports: + rc = obd_connect(req->rq_svc_thread->t_env, + &export, target, &cluuid, data, + client_nid); + if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG)) + lustre_msg_add_op_flags(req->rq_repmsg, + MSG_CONNECT_RECOVERING); + if (rc == 0) { + conn.cookie = export->exp_handle.h_cookie; + rc = rev_import_init(export); + } + + if (mds_mds_conn) + new_mds_mds_conn = true; + } + } else { + rc = obd_reconnect(req->rq_svc_thread->t_env, + export, target, &cluuid, data, client_nid); + } + if (rc) + GOTO(out, rc); + + LASSERT(target->u.obt.obt_magic == OBT_MAGIC); + data->ocd_instance = target->u.obt.obt_instance; + + /* Return only the parts of obd_connect_data that we understand, so the + * client knows that we don't understand the rest. */ + if (data) { + tmpsize = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + tmpdata = req_capsule_server_get(&req->rq_pill, + &RMF_CONNECT_DATA); + /* Don't use struct assignment here, because the client reply + * buffer may be smaller/larger than the local struct + * obd_connect_data. */ + memcpy(tmpdata, data, min(tmpsize, size)); + } + + /* If the client and the server are the same node, we will already + * have an export that really points to the client's DLM export, + * because we have a shared handles table. + * + * XXX this will go away when shaver stops sending the "connect" handle + * in the real "remote handle" field of the request --phik 24 Apr 2003 + */ + ptlrpc_request_change_export(req, export); + + spin_lock(&export->exp_lock); + if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + spin_unlock(&export->exp_lock); + CDEBUG(D_RPCTRACE, "%s: %s already connected at greater " + "or equal conn_cnt: %d >= %d\n", + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + GOTO(out, rc = -EALREADY); + } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + + /* Don't evict liblustre clients for not pinging. 
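+ * (such exports are taken off the timed exports chain below, so they are
+ * never evicted for failing to ping)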
*/ + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + export->exp_libclient = 1; + spin_unlock(&export->exp_lock); + + spin_lock(&target->obd_dev_lock); + list_del_init(&export->exp_obd_chain_timed); + spin_unlock(&target->obd_dev_lock); + } else { + spin_unlock(&export->exp_lock); + } + + if (export->exp_connection != NULL) { + /* Check to see if connection came from another NID. */ + if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && + !hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + ptlrpc_connection_put(export->exp_connection); + } + + export->exp_connection = ptlrpc_connection_get(req->rq_peer, + req->rq_self, + &cluuid); + if (hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_add(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + lustre_msg_set_handle(req->rq_repmsg, &conn); + + rc = rev_import_reconnect(export, req); + if (rc != 0) + GOTO(out, rc); + + if (target->obd_recovering && !export->exp_in_recovery && !lw_client) { + int has_transno; + __u64 transno = data->ocd_transno; + + spin_lock(&export->exp_lock); + /* possible race with class_disconnect_stale_exports, + * export may be already in the eviction process */ + if (export->exp_failed) { + spin_unlock(&export->exp_lock); + GOTO(out, rc = -ENODEV); + } + export->exp_in_recovery = 1; + export->exp_req_replay_needed = 1; + export->exp_lock_replay_needed = 1; + spin_unlock(&export->exp_lock); + + has_transno = !!(lustre_msg_get_op_flags(req->rq_reqmsg) & + MSG_CONNECT_TRANSNO); + if (has_transno && transno == 0) + CWARN("Connect with zero transno!\n"); + + if (has_transno && transno > 0 && + transno < target->obd_next_recovery_transno && + transno > target->obd_last_committed) { + /* Another way is to use cmpxchg() to be lock-free. */ + spin_lock(&target->obd_recovery_task_lock); + if (transno < target->obd_next_recovery_transno) + target->obd_next_recovery_transno = transno; + spin_unlock(&target->obd_recovery_task_lock); + } + + atomic_inc(&target->obd_req_replay_clients); + atomic_inc(&target->obd_lock_replay_clients); + /* Note: MDS-MDS connection is allowed to be connected during + * recovery, no matter if the exports needs to be recoveried. + * Because we need retrieve updates logs from all other MDTs. + * So if the MDS-MDS export is new, obd_max_recoverable_clients + * also needs to be increased to match other recovery checking + * condition. */ + if (new_mds_mds_conn) + target->obd_max_recoverable_clients++; + if (atomic_inc_return(&target->obd_connected_clients) == + target->obd_max_recoverable_clients) + wake_up(&target->obd_next_transno_waitq); + } + + /* Tell the client we're in recovery, when client is involved in it. */ + if (target->obd_recovering && !lw_client) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + +out: + if (export) { + spin_lock(&export->exp_lock); + export->exp_connecting = 0; + spin_unlock(&export->exp_lock); + + class_export_put(export); + } + if (target != NULL) { + spin_lock(&target->obd_dev_lock); + target->obd_conn_inprogress--; + spin_unlock(&target->obd_dev_lock); + class_decref(target, "find", current); + } + req->rq_status = rc; + RETURN(rc); +} + +int target_handle_disconnect(struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + /* Keep the rq_export around so we can send the reply. 
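+ * (the extra reference taken via class_export_get() is consumed by
+ * obd_disconnect(), while the request keeps its own export reference
+ * for the reply)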
*/ + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + + RETURN(0); +} + +void target_destroy_export(struct obd_export *exp) +{ + struct obd_import *imp = NULL; + /* exports created from last_rcvd data, and "fake" + exports created by lctl don't have an import */ + spin_lock(&exp->exp_lock); + if (exp->exp_imp_reverse != NULL) { + imp = exp->exp_imp_reverse; + exp->exp_imp_reverse = NULL; + } + spin_unlock(&exp->exp_lock); + if (imp != NULL) + client_destroy_import(imp); + + LASSERT_ATOMIC_ZERO(&exp->exp_locks_count); + LASSERT_ATOMIC_ZERO(&exp->exp_rpc_count); + LASSERT_ATOMIC_ZERO(&exp->exp_cb_count); + LASSERT_ATOMIC_ZERO(&exp->exp_replay_count); +} +EXPORT_SYMBOL(target_destroy_export); + +/* + * Recovery functions + */ +static void target_request_copy_get(struct ptlrpc_request *req) +{ + class_export_rpc_inc(req->rq_export); + LASSERT(list_empty(&req->rq_list)); + INIT_LIST_HEAD(&req->rq_replay_list); + + /* Increase refcount to keep request in queue. */ + atomic_inc(&req->rq_refcount); + /* Let export know it has replays to be handled. */ + atomic_inc(&req->rq_export->exp_replay_count); +} + +static void target_request_copy_put(struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_replay_list)); + LASSERT_ATOMIC_POS(&req->rq_export->exp_replay_count); + + atomic_dec(&req->rq_export->exp_replay_count); + class_export_rpc_dec(req->rq_export); + ptlrpc_server_drop_request(req); +} + +static int target_exp_enqueue_req_replay(struct ptlrpc_request *req) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct obd_export *exp = req->rq_export; + struct ptlrpc_request *reqiter; + struct ptlrpc_request *dup_req = NULL; + int dup = 0; + + LASSERT(exp); + + spin_lock(&exp->exp_lock); + list_for_each_entry(reqiter, &exp->exp_req_replay_queue, + rq_replay_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) { + dup_req = reqiter; + dup = 1; + break; + } + } + + if (dup) { + /* We expect it with RESENT and REPLAY flags. */ + if ((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY)) + CERROR("invalid flags %x of resent replay\n", + lustre_msg_get_flags(req->rq_reqmsg)); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u32 new_conn; + + new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg); + if (new_conn > + lustre_msg_get_conn_cnt(dup_req->rq_reqmsg)) + lustre_msg_set_conn_cnt(dup_req->rq_reqmsg, + new_conn); + } + } else { + list_add_tail(&req->rq_replay_list, + &exp->exp_req_replay_queue); + } + + spin_unlock(&exp->exp_lock); + return dup; +} + +static void target_exp_dequeue_req_replay(struct ptlrpc_request *req) +{ + LASSERT(!list_empty(&req->rq_replay_list)); + LASSERT(req->rq_export); + + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_replay_list); + spin_unlock(&req->rq_export->exp_lock); +} + +static void target_finish_recovery(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + ENTRY; + + /* Only log a recovery message when recovery has occurred. */ + if (obd->obd_recovery_start) { + time64_t now = ktime_get_real_seconds(); + time64_t elapsed_time; + + elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, 1); + LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients " + "%d recovered and %d %s evicted.\n", obd->obd_name, + (s64)elapsed_time / 60, (s64)elapsed_time % 60, + obd->obd_max_recoverable_clients, + atomic_read(&obd->obd_connected_clients), + obd->obd_stale_clients, + obd->obd_stale_clients == 1 ? 
"was" : "were"); + } + + ldlm_reprocess_recovery_done(obd->obd_namespace); + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue) || + !list_empty(&obd->obd_lock_replay_queue) || + !list_empty(&obd->obd_final_req_queue)) { + CERROR("%s: Recovery queues ( %s%s%s) are not empty\n", + obd->obd_name, + list_empty(&obd->obd_req_replay_queue) ? "" : "req ", + list_empty(&obd->obd_lock_replay_queue) ? \ + "" : "lock ", + list_empty(&obd->obd_final_req_queue) ? \ + "" : "final "); + spin_unlock(&obd->obd_recovery_task_lock); + LBUG(); + } + spin_unlock(&obd->obd_recovery_task_lock); + + obd->obd_recovery_end = ktime_get_real_seconds(); + + /* When recovery finished, cleanup orphans on MDS and OST. */ + if (OBT(obd) && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } + EXIT; +} + +static void abort_req_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + struct list_head abort_list; + + INIT_LIST_HEAD(&abort_list); + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_req_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_WARNING, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_req_reply; skipping"); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } +} + +static void abort_lock_replay_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + struct list_head abort_list; + + INIT_LIST_HEAD(&abort_list); + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &abort_list); + spin_unlock(&obd->obd_recovery_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + if (ptlrpc_error(req)) { + DEBUG_REQ(D_ERROR, req, + "failed abort_lock_reply; skipping"); + } + target_request_copy_put(req); + } +} + +/* Called from a cleanup function if the device is being cleaned up + forcefully. The exports should all have been disconnected already, + the only thing left to do is + - clear the recovery flags + - cancel the timer + - free queued requests and replies, but don't send replies + Because the obd_stopping flag is set, no new requests should be received. 
+ +*/ +void target_cleanup_recovery(struct obd_device *obd) +{ + struct ptlrpc_request *req, *n; + struct list_head clean_list; + + INIT_LIST_HEAD(&clean_list); + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering) { + spin_unlock(&obd->obd_dev_lock); + EXIT; + return; + } + obd->obd_recovering = obd->obd_abort_recovery = 0; + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + list_splice_init(&obd->obd_req_replay_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + } + + spin_lock(&obd->obd_recovery_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &clean_list); + list_splice_init(&obd->obd_final_req_queue, &clean_list); + spin_unlock(&obd->obd_recovery_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == NULL); + target_request_copy_put(req); + } + + EXIT; +} +EXPORT_SYMBOL(target_cleanup_recovery); + +/* obd_recovery_task_lock should be held */ +void target_cancel_recovery_timer(struct obd_device *obd) +{ + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + del_timer(&obd->obd_recovery_timer); +} + +static void target_start_recovery_timer(struct obd_device *obd) +{ + if (obd->obd_recovery_start != 0) + return; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + LASSERT(obd->obd_recovery_timeout != 0); + + if (obd->obd_recovery_start != 0) { + spin_unlock(&obd->obd_dev_lock); + return; + } + + mod_timer(&obd->obd_recovery_timer, + cfs_time_shift(obd->obd_recovery_timeout)); + obd->obd_recovery_start = ktime_get_real_seconds(); + spin_unlock(&obd->obd_dev_lock); + + LCONSOLE_WARN("%s: Will be in recovery for at least %llu:%02llu, or until %d client%s reconnect%s\n", + obd->obd_name, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, + obd->obd_max_recoverable_clients, + (obd->obd_max_recoverable_clients == 1) ? "" : "s", + (obd->obd_max_recoverable_clients == 1) ? "s": ""); +} + +/** + * extend recovery window. + * + * if @extend is true, extend recovery window to have @drt remaining at least; + * otherwise, make sure the recovery timeout value is not less than @drt. 
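+ *
+ * Example (illustrative): with obd_recovery_timeout = 300s and 200s of the
+ * window already elapsed (left = 100s), extend = true and drt = 150s grow
+ * the timeout by drt - left = 50s to 350s, capped at obd_recovery_time_hard.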
+ */ +static void extend_recovery_timer(struct obd_device *obd, int drt, + bool extend) +{ + time64_t now; + time64_t end; + time64_t left; + time64_t to; + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { + spin_unlock(&obd->obd_dev_lock); + return; + } + LASSERT(obd->obd_recovery_start != 0); + + now = ktime_get_real_seconds(); + to = obd->obd_recovery_timeout; + end = obd->obd_recovery_start + to; + left = end - now; + + if (extend && (drt > left)) { + to += drt - left; + } else if (!extend && (drt > to)) { + to = drt; + } + + if (to > obd->obd_recovery_time_hard) { + to = obd->obd_recovery_time_hard; + CWARN("%s: extended recovery timer reaching hard limit: %lld, extend: %d\n", + obd->obd_name, to, extend); + } + + if (obd->obd_recovery_timeout < to) { + obd->obd_recovery_timeout = to; + end = obd->obd_recovery_start + to; + mod_timer(&obd->obd_recovery_timer, + cfs_time_shift(end - now)); + } + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_HA, "%s: recovery timer will expire in %lld seconds\n", + obd->obd_name, (s64)(end - now)); +} + +/* Reset the timer with each new client connection */ +/* + * This timer is actually reconnect_timer, which is for making sure + * the total recovery window is at least as big as my reconnect + * attempt timing. So the initial recovery time_out will be set to + * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming + * from client is bigger than this, then the recovery time_out will + * be extended to make sure the client could be reconnected, in the + * process, the timeout from the new client should be ignored. + */ + +static void +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client) +{ + int service_time = lustre_msg_get_service_time(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; + + if (!new_client && service_time) + /* Teach server about old server's estimates, as first guess + * at how long new requests will take. */ + at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, + service_time); + + target_start_recovery_timer(obd); + + /* Convert the service time to RPC timeout, + * and reuse service_time to limit stack usage. */ + service_time = at_est2timeout(service_time); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && + service_time < at_extra) + service_time = at_extra; + + /* We expect other clients to timeout within service_time, then try + * to reconnect, then try the failover server. The max delay between + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. 
*/ + service_time += 2 * INITIAL_CONNECT_TIMEOUT; + + LASSERT(obt->obt_magic == OBT_MAGIC); + service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_time > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_time, false); +} + +/** Health checking routines */ +static inline int exp_connect_healthy(struct obd_export *exp) +{ + return (exp->exp_in_recovery); +} + +/** if export done req_replay or has replay in queue */ +static inline int exp_req_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_req_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + + +static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_req_replay_healthy(exp); +} + +/** if export done lock_replay or has replay in queue */ +static inline int exp_lock_replay_healthy(struct obd_export *exp) +{ + return (!exp->exp_lock_replay_needed || + atomic_read(&exp->exp_replay_count) > 0); +} + +static inline int exp_vbr_healthy(struct obd_export *exp) +{ + return (!exp->exp_vbr_failed); +} + +static inline int exp_finished(struct obd_export *exp) +{ + return (exp->exp_in_recovery && !exp->exp_lock_replay_needed); +} + +static inline int exp_finished_or_from_mdt(struct obd_export *exp) +{ + return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) || + exp_finished(exp); +} + +static int check_for_next_transno(struct lu_target *lut) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + int wake_up = 0, connected, completed, queue_len; + __u64 req_transno = 0; + __u64 update_transno = 0; + __u64 next_transno = 0; + ENTRY; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_req_replay_queue)) { + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + req_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + if (tdtd != NULL) + update_transno = distribute_txn_get_next_transno(tdtd); + + connected = atomic_read(&obd->obd_connected_clients); + completed = connected - atomic_read(&obd->obd_req_replay_clients); + queue_len = obd->obd_requests_queued_for_recovery; + next_transno = obd->obd_next_recovery_transno; + + CDEBUG(D_HA, "max: %d, connected: %d, completed: %d, queue_len: %d, " + "req_transno: %llu, next_transno: %llu\n", + obd->obd_max_recoverable_clients, connected, completed, + queue_len, req_transno, next_transno); + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } else if (tdtd != NULL && req != NULL && + is_req_replayed_by_update(req)) { + LASSERTF(req_transno < next_transno, "req_transno %llu" + "next_transno%llu\n", req_transno, next_transno); + CDEBUG(D_HA, "waking for duplicate req (%llu)\n", + req_transno); + wake_up = 1; + } else if (req_transno == next_transno || + (update_transno != 0 && update_transno <= next_transno)) { + CDEBUG(D_HA, "waking for next (%lld)\n", next_transno); + wake_up = 1; + } else if (queue_len > 0 && + queue_len == atomic_read(&obd->obd_req_replay_clients)) { + /** handle gaps occured due to lost reply or VBR */ + LASSERTF(req_transno >= next_transno, + "req_transno: %llu, next_transno: %llu\n", + req_transno, next_transno); + CDEBUG(D_HA, + "%s: waking for gap in transno, VBR is %s (skip: " + "%lld, ql: %d, comp: %d, conn: %d, next: %lld" 
+ ", next_update %lld last_committed: %lld)\n", + obd->obd_name, obd->obd_version_recov ? "ON" : "OFF", + next_transno, queue_len, completed, connected, + req_transno, update_transno, obd->obd_last_committed); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } else if (atomic_read(&obd->obd_req_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed recovery\n"); + wake_up = 1; + } else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) { + CDEBUG(D_HA, "accepting transno gaps is explicitly allowed" + " by fail_lock, waking up (%lld)\n", next_transno); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + return wake_up; +} + +static int check_for_next_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + int wake_up = 0; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + CDEBUG(D_HA, "waking for next lock\n"); + wake_up = 1; + } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) { + CDEBUG(D_HA, "waking for completed lock replay\n"); + wake_up = 1; + } else if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + wake_up = 1; + } else if (obd->obd_recovery_expired) { + CDEBUG(D_HA, "waking for expired recovery\n"); + wake_up = 1; + } + spin_unlock(&obd->obd_recovery_task_lock); + + return wake_up; +} + +/** + * wait for recovery events, + * check its status with help of check_routine + * evict dead clients via health_check + */ +static int target_recovery_overseer(struct lu_target *lut, + int (*check_routine)(struct lu_target *), + int (*health_check)(struct obd_export *)) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; +repeat: + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for " + "the first client to connect\n", + obd->obd_name); + last = now; + } + } + if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= + (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { + __u64 next_update_transno = 0; + + /* Only abort the recovery if there are no update recovery + * left in the queue */ + spin_lock(&obd->obd_recovery_task_lock); + if (lut->lut_tdtd != NULL) { + next_update_transno = + distribute_txn_get_next_transno(lut->lut_tdtd); + + tdtd = lut->lut_tdtd; + /* If next_update_transno == 0, it probably because + * updatelog retrieve threads did not get any records + * yet, let's wait those threads stopped */ + if (next_update_transno == 0) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(tdtd->tdtd_recovery_threads_waitq, + atomic_read( + &tdtd->tdtd_recovery_threads_count) == 0, + &lwi); + + next_update_transno = + distribute_txn_get_next_transno( + lut->lut_tdtd); + } + } + + if (next_update_transno != 0 && !obd->obd_abort_recovery) { + obd->obd_next_recovery_transno = next_update_transno; + spin_unlock(&obd->obd_recovery_task_lock); + /* Disconnect unfinished exports from clients, and + * keep connection from MDT to make sure the update + * recovery will still keep trying until some one + * manually abort the recovery */ + class_disconnect_stale_exports(obd, + exp_finished_or_from_mdt); + /* Abort all of replay and replay lock req from + * clients */ + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + CDEBUG(D_HA, "%s: there are still update replay (%#llx" + ")in the queue.\n", 
obd->obd_name, + next_update_transno); + } else { + obd->obd_abort_recovery = 1; + spin_unlock(&obd->obd_recovery_task_lock); + CWARN("%s recovery is aborted by hard timeout\n", + obd->obd_name); + } + } + + while (wait_event_timeout(obd->obd_next_transno_waitq, + check_routine(lut), + msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0) + /* wait indefinitely for event, but don't trigger watchdog */; + + if (obd->obd_abort_recovery) { + CWARN("recovery is aborted, evict exports in recovery\n"); + if (lut->lut_tdtd != NULL) { + struct l_wait_info lwi = { 0 }; + + tdtd = lut->lut_tdtd; + /* Let's wait all of the update log recovery thread + * finished */ + l_wait_event(tdtd->tdtd_recovery_threads_waitq, + atomic_read(&tdtd->tdtd_recovery_threads_count) == 0, + &lwi); + /* Then abort the update recovery list */ + dtrq_list_destroy(lut->lut_tdtd); + } + + /** evict exports which didn't finish recovery yet */ + class_disconnect_stale_exports(obd, exp_finished); + return 1; + } else if (obd->obd_recovery_expired) { + obd->obd_recovery_expired = 0; + /** If some clients died being recovered, evict them */ + LCONSOLE_WARN("%s: recovery is timed out, " + "evict stale exports\n", obd->obd_name); + /** evict cexports with no replay in queue, they are stalled */ + class_disconnect_stale_exports(obd, health_check); + + /** continue with VBR */ + spin_lock(&obd->obd_dev_lock); + obd->obd_version_recov = 1; + spin_unlock(&obd->obd_dev_lock); + /** + * reset timer, recovery will proceed with versions now, + * timeout is set just to handle reconnection delays + */ + extend_recovery_timer(obd, RECONNECT_DELAY_MAX, true); + /** Wait for recovery events again, after evicting bad clients */ + goto repeat; + } + return 0; +} + +static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + + CDEBUG(D_HA, "Waiting for lock\n"); + if (target_recovery_overseer(lut, check_for_next_lock, + exp_lock_replay_healthy)) + abort_lock_replay_queue(obd); + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_lock_replay_queue)) { + req = list_entry(obd->obd_lock_replay_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + LASSERT(list_empty(&obd->obd_lock_replay_queue)); + LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + } + return req; +} + +static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd) +{ + struct ptlrpc_request *req = NULL; + + spin_lock(&obd->obd_recovery_task_lock); + if (!list_empty(&obd->obd_final_req_queue)) { + req = list_entry(obd->obd_final_req_queue.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + spin_unlock(&obd->obd_recovery_task_lock); + if (req->rq_export->exp_in_recovery) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_in_recovery = 0; + spin_unlock(&req->rq_export->exp_lock); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); + } + return req; +} + +static void handle_recovery_req(struct ptlrpc_thread *thread, + struct ptlrpc_request *req, + svc_handler_t handler) +{ + ENTRY; + + /** + * export can be evicted during recovery, no need to handle replays for + * it after that, discard such request silently + */ + if (req->rq_export->exp_disconnected) + RETURN_EXIT; + + req->rq_session.lc_thread = 
thread; + req->rq_svc_thread = thread; + req->rq_svc_thread->t_env->le_ses = &req->rq_session; + + /* thread context */ + lu_context_enter(&thread->t_env->le_ctx); + (void)handler(req); + lu_context_exit(&thread->t_env->le_ctx); + + /* don't reset timer for final stage */ + if (!exp_finished(req->rq_export)) { + int to = obd_timeout; + + /** + * Add request timeout to the recovery time so next request from + * this client may come in recovery time + */ + if (!AT_OFF) { + struct ptlrpc_service_part *svcpt; + + svcpt = req->rq_rqbd->rqbd_svcpt; + /* If the server sent early reply for this request, + * the client will recalculate the timeout according to + * current server estimate service time, so we will + * use the maxium timeout here for waiting the client + * sending the next req */ + to = max((int)at_est2timeout( + at_get(&svcpt->scp_at_estimate)), + (int)lustre_msg_get_timeout(req->rq_reqmsg)); + /* Add 2 net_latency, one for balance rq_deadline + * (see ptl_send_rpc), one for resend the req to server, + * Note: client will pack net_latency in replay req + * (see ptlrpc_replay_req) */ + to += 2 * lustre_msg_get_service_time(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), to, true); + } + EXIT; +} + +/** Checking routines for recovery */ +static int check_for_recovery_ready(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + unsigned int clnts = atomic_read(&obd->obd_connected_clients); + + CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d" + " abort %d expired %d\n", clnts, obd->obd_stale_clients, + obd->obd_max_recoverable_clients, obd->obd_abort_recovery, + obd->obd_recovery_expired); + + if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { + LASSERT(clnts <= obd->obd_max_recoverable_clients); + if (clnts + obd->obd_stale_clients < + obd->obd_max_recoverable_clients) + return 0; + } + + if (lut->lut_tdtd != NULL) { + if (!lut->lut_tdtd->tdtd_replay_ready && + !obd->obd_abort_recovery) { + /* Let's extend recovery timer, in case the recovery + * timer expired, and some clients got evicted */ + extend_recovery_timer(obd, obd->obd_recovery_timeout, + true); + CDEBUG(D_HA, "%s update recovery is not ready, extend recovery %llu\n", + obd->obd_name, obd->obd_recovery_timeout); + return 0; + } + } + + return 1; +} + +enum { + REQUEST_RECOVERY = 1, + UPDATE_RECOVERY = 2, +}; + +static __u64 get_next_replay_req_transno(struct obd_device *obd) +{ + __u64 transno = 0; + + if (!list_empty(&obd->obd_req_replay_queue)) { + struct ptlrpc_request *req; + + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + return transno; +} + +static __u64 get_next_transno(struct lu_target *lut, int *type) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + __u64 transno = 0; + __u64 update_transno; + ENTRY; + + transno = get_next_replay_req_transno(obd); + if (type != NULL) + *type = REQUEST_RECOVERY; + + if (tdtd == NULL) + RETURN(transno); + + update_transno = distribute_txn_get_next_transno(tdtd); + if (transno == 0 || (transno >= update_transno && + update_transno != 0)) { + transno = update_transno; + if (type != NULL) + *type = UPDATE_RECOVERY; + } + + RETURN(transno); +} + +/** + * drop duplicate replay request + * + * Because the operation has been replayed by update recovery, the request + * with the same transno will be dropped and also notify the client to send + * next replay request. 
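+ *
+ * Only MDS_REINT replays are expected to share a transno with update
+ * replay; any other opcode reaching this path is reported as an error.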
+ * + * \param[in] env execution environment + * \param[in] obd failover obd device + * \param[in] req request to be dropped + */ +static void drop_duplicate_replay_req(struct lu_env *env, + struct obd_device *obd, + struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "remove t%lld from %s because of duplicate" + " update records are found.\n", + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + /* Right now, only for MDS reint operation update replay and + * normal request replay can have the same transno */ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) { + req_capsule_set(&req->rq_pill, &RQF_MDS_REINT); + req->rq_status = req_capsule_server_pack(&req->rq_pill); + if (likely(req->rq_export)) + target_committed_to_req(req); + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + target_send_reply(req, req->rq_status, 0); + } else { + DEBUG_REQ(D_ERROR, req, "wrong opc" "from %s\n", + libcfs_nid2str(req->rq_peer.nid)); + } + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; +} + +static void replay_request_or_update(struct lu_env *env, + struct lu_target *lut, + struct target_recovery_data *trd, + struct ptlrpc_thread *thread) +{ + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req = NULL; + int type; + __u64 transno; + ENTRY; + + CDEBUG(D_HA, "Waiting for transno %lld\n", + obd->obd_next_recovery_transno); + + /* Replay all of request and update by transno */ + do { + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val); + + /** It is needed to extend recovery window above + * recovery_time_soft. Extending is possible only in the + * end of recovery window (see more details in + * handle_recovery_req()). 
+ */ + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300); + + if (target_recovery_overseer(lut, check_for_next_transno, + exp_req_replay_healthy_or_from_mdt)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + goto abort; + } + + spin_lock(&obd->obd_recovery_task_lock); + transno = get_next_transno(lut, &type); + if (type == REQUEST_RECOVERY && transno != 0) { + /* Drop replay request from client side, if the + * replay has been executed by update with the + * same transno */ + req = list_entry(obd->obd_req_replay_queue.next, + struct ptlrpc_request, rq_list); + + list_del_init(&req->rq_list); + obd->obd_requests_queued_for_recovery--; + spin_unlock(&obd->obd_recovery_task_lock); + + /* Let's check if the request has been redone by + * update replay */ + if (is_req_replayed_by_update(req)) { + struct distribute_txn_replay_req *dtrq; + + dtrq = distribute_txn_lookup_finish_list(tdtd, + req->rq_xid); + LASSERT(dtrq != NULL); + spin_lock(&tdtd->tdtd_replay_list_lock); + list_del_init(&dtrq->dtrq_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + + drop_duplicate_replay_req(env, obd, req); + + continue; + } + + LASSERT(trd->trd_processing_task == current_pid()); + DEBUG_REQ(D_HA, req, "processing t%lld from %s", + lustre_msg_get_transno(req->rq_reqmsg), + libcfs_nid2str(req->rq_peer.nid)); + + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + /** + * bz18031: increase next_recovery_transno before + * target_request_copy_put() will drop exp_rpc reference + */ + spin_lock(&obd->obd_recovery_task_lock); + obd->obd_next_recovery_transno++; + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + obd->obd_replayed_requests++; + } else if (type == UPDATE_RECOVERY && transno != 0) { + struct distribute_txn_replay_req *dtrq; + int rc; + + spin_unlock(&obd->obd_recovery_task_lock); + + LASSERT(tdtd != NULL); + dtrq = distribute_txn_get_next_req(tdtd); + lu_context_enter(&thread->t_env->le_ctx); + rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); + lu_context_exit(&thread->t_env->le_ctx); + extend_recovery_timer(obd, obd_timeout, true); + + if (rc == 0 && dtrq->dtrq_xid != 0) { + CDEBUG(D_HA, "Move x%llu t%llu" + " to finish list\n", dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + + /* Add it to the replay finish list */ + spin_lock(&tdtd->tdtd_replay_list_lock); + list_add(&dtrq->dtrq_list, + &tdtd->tdtd_replay_finish_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + + spin_lock(&obd->obd_recovery_task_lock); + if (transno == obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno++; + else if (transno > + obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno = + transno + 1; + spin_unlock(&obd->obd_recovery_task_lock); + } else { + dtrq_destroy(dtrq); + } + } else { + spin_unlock(&obd->obd_recovery_task_lock); +abort: + LASSERT(list_empty(&obd->obd_req_replay_queue)); + LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, exp_vbr_healthy); + break; + } + } while (1); +} + +static int target_recovery_thread(void *arg) +{ + struct lu_target *lut = arg; + struct obd_device *obd = lut->lut_obd; + struct ptlrpc_request *req; + struct target_recovery_data *trd = &obd->obd_recovery_data; + unsigned long delta; + struct lu_env *env; + struct ptlrpc_thread *thread = NULL; + int rc = 0; + ENTRY; + + unshare_fs_struct(); + OBD_ALLOC_PTR(thread); + if (thread == NULL) + RETURN(-ENOMEM); + + 
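+	/*
+	 * The recovery thread carries its own lu_env/lu_context (set up
+	 * below); both are freed again before the thread exits.
+	 */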
OBD_ALLOC_PTR(env); + if (env == NULL) { + OBD_FREE_PTR(thread); + RETURN(-ENOMEM); + } + + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) { + OBD_FREE_PTR(thread); + OBD_FREE_PTR(env); + RETURN(rc); + } + + thread->t_env = env; + thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ + env->le_ctx.lc_thread = thread; + tgt_io_thread_init(thread); /* init thread_big_cache for IO requests */ + thread->t_watchdog = NULL; + + CDEBUG(D_HA, "%s: started recovery thread pid %d\n", obd->obd_name, + current_pid()); + trd->trd_processing_task = current_pid(); + + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = 1; + spin_unlock(&obd->obd_dev_lock); + complete(&trd->trd_starting); + + /* first of all, we have to know the first transno to replay */ + if (target_recovery_overseer(lut, check_for_recovery_ready, + exp_connect_healthy)) { + abort_req_replay_queue(obd); + abort_lock_replay_queue(obd); + if (lut->lut_tdtd != NULL) + dtrq_list_destroy(lut->lut_tdtd); + } + + /* next stage: replay requests or update */ + delta = jiffies; + CDEBUG(D_INFO, "1: request replay stage - %d clients from t%llu\n", + atomic_read(&obd->obd_req_replay_clients), + obd->obd_next_recovery_transno); + replay_request_or_update(env, lut, trd, thread); + + /** + * The second stage: replay locks + */ + CDEBUG(D_INFO, "2: lock replay stage - %d clients\n", + atomic_read(&obd->obd_lock_replay_clients)); + while ((req = target_next_replay_lock(lut))) { + LASSERT(trd->trd_processing_task == current_pid()); + DEBUG_REQ(D_HA, req, "processing lock from %s: ", + libcfs_nid2str(req->rq_peer.nid)); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + target_request_copy_put(req); + obd->obd_replayed_locks++; + } + + /** + * The third stage: reply on final pings, at this moment all clients + * must have request in final queue + */ + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val); + CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n"); + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + /* We drop recoverying flag to forward all new requests + * to regular mds_handle() since now */ + spin_lock(&obd->obd_dev_lock); + obd->obd_recovering = obd->obd_abort_recovery = 0; + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd->obd_recovery_task_lock); + target_cancel_recovery_timer(obd); + spin_unlock(&obd->obd_recovery_task_lock); + while ((req = target_next_final_ping(obd))) { + LASSERT(trd->trd_processing_task == current_pid()); + DEBUG_REQ(D_HA, req, "processing final ping from %s: ", + libcfs_nid2str(req->rq_peer.nid)); + handle_recovery_req(thread, req, + trd->trd_recovery_handler); + /* Because the waiting client can not send ping to server, + * so we need refresh the last_request_time, to avoid the + * export is being evicted */ + ptlrpc_update_export_timer(req->rq_export, 0); + target_request_copy_put(req); + } + + delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC; + CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n", + delta, obd->obd_replayed_requests, obd->obd_replayed_locks); + if (delta > OBD_RECOVERY_TIME_SOFT) { + CWARN("too long recovery - read logs\n"); + libcfs_debug_dumplog(); + } + + target_finish_recovery(lut); + + lu_context_fini(&env->le_ctx); + trd->trd_processing_task = 0; + complete(&trd->trd_finishing); + + tgt_io_thread_done(thread); + OBD_FREE_PTR(thread); + OBD_FREE_PTR(env); + RETURN(rc); +} + +static int target_start_recovery_thread(struct lu_target *lut, + svc_handler_t 
handler) +{ + struct obd_device *obd = lut->lut_obd; + int rc = 0; + struct target_recovery_data *trd = &obd->obd_recovery_data; + int index; + + memset(trd, 0, sizeof(*trd)); + init_completion(&trd->trd_starting); + init_completion(&trd->trd_finishing); + trd->trd_recovery_handler = handler; + + rc = server_name2index(obd->obd_name, &index, NULL); + if (rc < 0) + return rc; + + if (!IS_ERR(kthread_run(target_recovery_thread, + lut, "tgt_recover_%d", index))) { + wait_for_completion(&trd->trd_starting); + LASSERT(obd->obd_recovering != 0); + } else { + rc = -ECHILD; + } + + return rc; +} + +void target_stop_recovery_thread(struct obd_device *obd) +{ + if (obd->obd_recovery_data.trd_processing_task > 0) { + struct target_recovery_data *trd = &obd->obd_recovery_data; + /** recovery can be done but postrecovery is not yet */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_recovering) { + CERROR("%s: Aborting recovery\n", obd->obd_name); + obd->obd_abort_recovery = 1; + wake_up(&obd->obd_next_transno_waitq); + } + spin_unlock(&obd->obd_dev_lock); + wait_for_completion(&trd->trd_finishing); + } +} +EXPORT_SYMBOL(target_stop_recovery_thread); + +void target_recovery_fini(struct obd_device *obd) +{ + class_disconnect_exports(obd); + target_stop_recovery_thread(obd); + target_cleanup_recovery(obd); +} +EXPORT_SYMBOL(target_recovery_fini); + +static void target_recovery_expired(unsigned long castmeharder) +{ + struct obd_device *obd = (struct obd_device *)castmeharder; + CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery" + " after %llus (%d clients connected)\n", + obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), + (s64)(ktime_get_real_seconds() - obd->obd_recovery_start), + atomic_read(&obd->obd_connected_clients)); + + obd->obd_recovery_expired = 1; + wake_up(&obd->obd_next_transno_waitq); +} + +void target_recovery_init(struct lu_target *lut, svc_handler_t handler) +{ + struct obd_device *obd = lut->lut_obd; + + if (lut->lut_bottom->dd_rdonly) + return; + + if (obd->obd_max_recoverable_clients == 0) { + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + return; + } + + CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " + "last_transno %llu\n", obd->obd_name, + obd->obd_max_recoverable_clients, obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + setup_timer(&obd->obd_recovery_timer, target_recovery_expired, + (unsigned long)obd); + target_start_recovery_thread(lut, handler); +} +EXPORT_SYMBOL(target_recovery_init); + +static int target_process_req_flags(struct obd_device *obd, + struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + LASSERT(exp != NULL); + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay locks */ + spin_lock(&exp->exp_lock); + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + LASSERT_ATOMIC_POS(&obd->obd_req_replay_clients); + atomic_dec(&obd->obd_req_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + /* client declares he's ready to complete recovery + * so, we put the request on th final queue */ + spin_lock(&exp->exp_lock); + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + + 
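+			/*
+			 * Once this counter drops to zero,
+			 * check_for_next_lock() treats lock replay as
+			 * complete.
+			 */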
LASSERT_ATOMIC_POS(&obd->obd_lock_replay_clients); + atomic_dec(&obd->obd_lock_replay_clients); + } else { + spin_unlock(&exp->exp_lock); + } + } + return 0; +} + +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct ptlrpc_request *reqiter; + int inserted = 0; + ENTRY; + + if (obd->obd_recovery_data.trd_processing_task == current_pid()) { + /* Processing the queue right now, don't re-add. */ + RETURN(1); + } + + target_process_req_flags(obd, req); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + /* client declares he's ready to complete recovery + * so, we put the request on th final queue */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue final req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + struct ptlrpc_request *tmp; + struct ptlrpc_request *duplicate = NULL; + + if (likely(!req->rq_export->exp_replay_done)) { + req->rq_export->exp_replay_done = 1; + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* XXX O(n), but only happens if final ping is + * timed out, probably reorganize the list as + * a hash list later */ + list_for_each_entry_safe(reqiter, tmp, + &obd->obd_final_req_queue, + rq_list) { + if (reqiter->rq_export == req->rq_export) { + list_del_init(&reqiter->rq_list); + duplicate = reqiter; + break; + } + } + + list_add_tail(&req->rq_list, + &obd->obd_final_req_queue); + req->rq_export->exp_replay_done = 1; + spin_unlock(&obd->obd_recovery_task_lock); + + if (duplicate != NULL) { + DEBUG_REQ(D_HA, duplicate, + "put prev final req\n"); + target_request_copy_put(duplicate); + } + RETURN(0); + } else { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(obd->obd_stopping ? -ENOTCONN : 1); + } + } + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REQ_REPLAY_DONE) { + /* client declares he's ready to replay locks */ + target_request_copy_get(req); + DEBUG_REQ(D_HA, req, "queue lock replay req"); + wake_up(&obd->obd_next_transno_waitq); + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + /* usually due to recovery abort */ + if (!req->rq_export->exp_in_recovery) { + spin_unlock(&obd->obd_recovery_task_lock); + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_lock_replay_needed); + list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(0); + } + + /* CAVEAT EMPTOR: The incoming request message has been swabbed + * (i.e. buflens etc are in my own byte order), but type-dependent + * buffers (eg mdt_body, ost_body etc) have NOT been swabbed. */ + + if (!transno) { + INIT_LIST_HEAD(&req->rq_list); + DEBUG_REQ(D_HA, req, "not queueing"); + RETURN(1); + } + + /* If we're processing the queue, we want don't want to queue this + * message. + * + * Also, if this request has a transno less than the one we're waiting + * for, we should process it now. It could (and currently always will) + * be an open request for a descriptor that was opened some time ago. + * + * Also, a resent, replayed request that has already been + * handled will pass through here and be processed immediately. 
+ */ + CDEBUG(D_HA, "Next recovery transno: %llu" + ", current: %llu, replaying\n", + obd->obd_next_recovery_transno, transno); + + /* If the request has been replayed by update replay, then sends this + * request to the recovery thread (replay_request_or_update()), where + * it will be handled */ + spin_lock(&obd->obd_recovery_task_lock); + if (transno < obd->obd_next_recovery_transno && + !is_req_replayed_by_update(req)) { + /* Processing the queue right now, don't re-add. */ + LASSERT(list_empty(&req->rq_list)); + spin_unlock(&obd->obd_recovery_task_lock); + RETURN(1); + } + spin_unlock(&obd->obd_recovery_task_lock); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP)) + RETURN(0); + + target_request_copy_get(req); + if (!req->rq_export->exp_in_recovery) { + target_request_copy_put(req); + RETURN(-ENOTCONN); + } + LASSERT(req->rq_export->exp_req_replay_needed); + + if (target_exp_enqueue_req_replay(req)) { + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + target_request_copy_put(req); + RETURN(0); + } + + /* XXX O(n^2) */ + spin_lock(&obd->obd_recovery_task_lock); + LASSERT(obd->obd_recovering); + list_for_each_entry(reqiter, &obd->obd_req_replay_queue, rq_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) > transno) { + list_add_tail(&req->rq_list, &reqiter->rq_list); + inserted = 1; + goto added; + } + + if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) == + transno)) { + DEBUG_REQ(D_ERROR, req, "dropping replay: transno " + "has been claimed by another client"); + spin_unlock(&obd->obd_recovery_task_lock); + target_exp_dequeue_req_replay(req); + target_request_copy_put(req); + RETURN(0); + } + } +added: + if (!inserted) + list_add_tail(&req->rq_list, &obd->obd_req_replay_queue); + + obd->obd_requests_queued_for_recovery++; + spin_unlock(&obd->obd_recovery_task_lock); + wake_up(&obd->obd_next_transno_waitq); + RETURN(0); +} + +int target_handle_ping(struct ptlrpc_request *req) +{ + obd_ping(req->rq_svc_thread->t_env, req->rq_export); + return req_capsule_server_pack(&req->rq_pill); +} + +void target_committed_to_req(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + + if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL) + lustre_msg_set_last_committed(req->rq_repmsg, + exp->exp_last_committed); + else + DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/" + "%d)", exp->exp_obd->obd_no_transno, + req->rq_repmsg == NULL); + + CDEBUG(D_INFO, "last_committed %llu, transno %llu, xid %llu\n", + exp->exp_last_committed, req->rq_transno, req->rq_xid); +} + +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + ENTRY; + + /* Check that we still have all structures alive as this may + * be some late RPC at shutdown time. */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + RETURN(0); + } + + /* OBD is alive here as export is alive, which we checked above. 
*/ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + RETURN(0); +} + +static int target_send_reply_msg(struct ptlrpc_request *req, + int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return -ECOMM; + } + /* We can have a null rq_reqmsg in the event of bad signature or + * no context when unwrapping */ + if (req->rq_reqmsg && + unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP))) + return -ECOMM; + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return ptlrpc_send_error(req, 1); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + } + + return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + ENTRY; + + if (req->rq_no_reply) { + EXIT; + return; + } + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg (req, rc, fail_id); + EXIT; + return; + } + + /* must be an export if locks saved */ + LASSERT(req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT(!rs->rs_scheduled); + LASSERT(!rs->rs_scheduled_ever); + LASSERT(!rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(rs->rs_export == NULL); + LASSERT(list_empty(&rs->rs_obd_list)); + LASSERT(list_empty(&rs->rs_exp_list)); + + exp = class_export_get(req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_on_net = 1; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* error sending: reply is off the net. 
Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) */ + rs->rs_on_net = 0; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (!rs->rs_on_net && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + EXIT; +} + +enum ldlm_mode lck_compat_array[] = { + [LCK_EX] = LCK_COMPAT_EX, + [LCK_PW] = LCK_COMPAT_PW, + [LCK_PR] = LCK_COMPAT_PR, + [LCK_CW] = LCK_COMPAT_CW, + [LCK_CR] = LCK_COMPAT_CR, + [LCK_NL] = LCK_COMPAT_NL, + [LCK_GROUP] = LCK_COMPAT_GROUP, + [LCK_COS] = LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(enum ldlm_error error) +{ + int result; + + switch (error) { + case ELDLM_OK: + case ELDLM_LOCK_MATCHED: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) { /* cast to signed type */ + result = error; /* as ldlm_error can be unsigned */ + } else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to enum ldlm_error. + */ +enum ldlm_error ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p," + "ignore if the unmount doesn't hang\n", exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif + +#ifdef HAVE_SERVER_SUPPORT +static int target_bulk_timeout(void *data) +{ + ENTRY; + /* We don't fail the connection here, because having the export + * killed makes the (vital) call to commitrw very sad. 
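+	 * Returning 1 simply lets the l_wait_event() in target_bulk_io()
+	 * finish with -ETIMEDOUT; the caller decides how to handle the
+	 * timed-out bulk transfer.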
+ */ + RETURN(1); +} + +static inline const char *bulk2type(struct ptlrpc_request *req) +{ + if (req->rq_bulk_read) + return "READ"; + if (req->rq_bulk_write) + return "WRITE"; + return "UNKNOWN"; +} + +int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, + struct l_wait_info *lwi) +{ + struct ptlrpc_request *req = desc->bd_req; + time_t start = cfs_time_current_sec(); + time_t deadline; + int rc = 0; + + ENTRY; + + /* If there is eviction in progress, wait for it to finish. */ + if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) { + *lwi = LWI_INTR(NULL, NULL); + rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd-> + obd_evict_inprogress), + lwi); + } + + /* Check if client was evicted or reconnected already. */ + if (exp->exp_failed || + exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + rc = -ENOTCONN; + } else { + if (req->rq_bulk_read) + rc = sptlrpc_svc_wrap_bulk(req, desc); + + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) + req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); + else /* old version, bulk matchbits is rq_xid */ + req->rq_mbits = req->rq_xid; + + if (rc == 0) + rc = ptlrpc_start_bulk_transfer(desc); + } + + if (rc < 0) { + DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d", + bulk2type(req), rc); + RETURN(rc); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) { + ptlrpc_abort_bulk(desc); + RETURN(0); + } + + /* limit actual bulk transfer to bulk_timeout seconds */ + deadline = start + bulk_timeout; + if (deadline > req->rq_deadline) + deadline = req->rq_deadline; + + do { + long timeoutl = deadline - cfs_time_current_sec(); + cfs_duration_t timeout = timeoutl <= 0 ? + CFS_TICK : cfs_time_seconds(timeoutl); + time_t rq_deadline; + + *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), + target_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc) || + exp->exp_failed || + exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg), + lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + /* Wait again if we changed rq_deadline. */ + rq_deadline = ACCESS_ONCE(req->rq_deadline); + deadline = start + bulk_timeout; + if (deadline > rq_deadline) + deadline = rq_deadline; + } while ((rc == -ETIMEDOUT) && + (deadline > cfs_time_current_sec())); + + if (rc == -ETIMEDOUT) { + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds", + bulk2type(req), deadline - start, + cfs_time_current_sec() - deadline); + ptlrpc_abort_bulk(desc); + } else if (exp->exp_failed) { + DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", + bulk2type(req)); + rc = -ENOTCONN; + ptlrpc_abort_bulk(desc); + } else if (exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s", + bulk2type(req)); + /* We don't reply anyway. */ + rc = -ETIMEDOUT; + ptlrpc_abort_bulk(desc); + } else if (desc->bd_failure) { + DEBUG_REQ(D_ERROR, req, "network error on bulk %s", + bulk2type(req)); + /* XXX should this be a different errno? */ + rc = -ETIMEDOUT; + } else { + if (req->rq_bulk_write) + rc = sptlrpc_svc_unwrap_bulk(req, desc); + if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) { + DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)", + bulk2type(req), desc->bd_nob_transferred, + desc->bd_nob); + /* XXX should this be a different errno? 
*/ + rc = -ETIMEDOUT; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(target_bulk_io); + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c new file mode 100644 index 0000000000000..ca171fe485f0b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2866 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include + +#include +#include + +#include "ldlm_internal.h" + +/* lock types */ +char *ldlm_lockname[] = { + [0] = "--", + [LCK_EX] = "EX", + [LCK_PW] = "PW", + [LCK_PR] = "PR", + [LCK_CW] = "CW", + [LCK_CR] = "CR", + [LCK_NL] = "NL", + [LCK_GROUP] = "GROUP", + [LCK_COS] = "COS" +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] = "PLN", + [LDLM_EXTENT] = "EXT", + [LDLM_FLOCK] = "FLK", + [LDLM_IBITS] = "IBT", +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(enum ldlm_type type, + const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, + const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + + convert = ldlm_policy_wire_to_local[type - LDLM_MIN_TYPE]; + + convert(wpolicy, lpolicy); +} + +const char *ldlm_it2str(enum ldlm_intent_flags it) +{ 
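+	/* For logging only: unknown intents fall through to the CERROR below. */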
+ switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_UNLINK: + return "unlink"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent 0x%08x\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + +extern struct kmem_cache *ldlm_lock_slab; + +#ifdef HAVE_SERVER_SUPPORT +static ldlm_processing_policy ldlm_processing_policy_table[] = { + [LDLM_PLAIN] = ldlm_process_plain_lock, + [LDLM_EXTENT] = ldlm_process_extent_lock, + [LDLM_FLOCK] = ldlm_process_flock_lock, + [LDLM_IBITS] = ldlm_process_inodebits_lock, +}; + +ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) +{ + return ldlm_processing_policy_table[res->lr_type]; +} +EXPORT_SYMBOL(ldlm_get_processing_policy); +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + atomic_inc(&lock->l_refc); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + ENTRY; + + LASSERT(lock->l_resource != LP_POISON); + LASSERT(atomic_read(&lock->l_refc) > 0); + if (atomic_dec_and_test(&lock->l_refc)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(ldlm_is_destroyed(lock)); + LASSERT(list_empty(&lock->l_exp_list)); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + ldlm_resource_putref(res); + lock->l_resource = NULL; + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + + ldlm_interval_free(ldlm_interval_detach(lock)); + lu_ref_fini(&lock->l_reference); + OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); + } + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. + */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + * + * If \a last_use is non-zero, it will remove the lock from LRU only if + * it matches lock's l_last_used. + * + * \retval 0 if \a last_use is set, the lock is not in LRU list or \a last_use + * doesn't match lock's l_last_used; + * otherwise, the lock hasn't been in the LRU list. + * \retval 1 the lock was in LRU list and removed. 
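+ *
+ * The \a last_use check lets a caller notice that the lock was touched
+ * (and re-added to the LRU) after l_last_used was sampled, in which case
+ * the lock is left where it is.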
+ */ +int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc = 0; + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + RETURN(0); + } + + spin_lock(&ns->ns_lock); + if (!ktime_compare(last_use, ktime_set(0, 0)) || + !ktime_compare(last_use, lock->l_last_used)) + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + RETURN(rc); +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. + */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = ktime_get(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + ldlm_clear_skipped(lock); + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + if (ldlm_is_ns_srv(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil + */ +static int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + ENTRY; + + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (ldlm_is_destroyed(lock)) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return 0; + } + ldlm_set_destroyed(lock); + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. 
*/ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + + EXIT; + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. + */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + ENTRY; + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + ENTRY; + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/* this is called by portals_handle2object with the handle lock taken */ +static void lock_handle_addref(void *lock) +{ + LDLM_LOCK_GET((struct ldlm_lock *)lock); +} + +static void lock_handle_free(void *lock, int size) +{ + LASSERT(size == sizeof(struct ldlm_lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, size); +} + +static struct portals_handle_ops lock_handle_ops = { + .hop_addref = lock_handle_addref, + .hop_free = lock_handle_free, +}; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. + * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + ENTRY; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS); + if (lock == NULL) + RETURN(NULL); + + spin_lock_init(&lock->l_lock); + lock->l_resource = resource; + lu_ref_add(&resource->lr_reference, "lock", lock); + + atomic_set(&lock->l_refc, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_LIST_HEAD(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, &lock_handle_ops); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timeout = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + RETURN(lock); +} + +/** + * Moves LDLM lock \a lock to another resource. 
+ * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres = lock->l_resource; + struct ldlm_resource *newres; + int type; + ENTRY; + + LASSERT(ns_is_client(ns)); + + lock_res_and_lock(lock); + if (memcmp(new_resid, &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + RETURN(0); + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (IS_ERR(newres)) + RETURN(PTR_ERR(newres)); + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, lock, oldres and + * newres have to be locked. Resource spin-locks are nested within + * lock->l_lock, and are taken in the memory address order to avoid + * dead-locks. + */ + spin_lock(&lock->l_lock); + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof oldres->lr_name) != 0); + lock->l_resource = newres; + unlock_res(oldres); + unlock_res_and_lock(lock); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + RETURN(0); +} + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. + */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + ENTRY; + + LASSERT(handle); + + lock = class_handle2object(handle->cookie, NULL); + if (lock == NULL) + RETURN(NULL); + + if (lock->l_export != NULL && lock->l_export->exp_failed) { + CDEBUG(D_INFO, "lock export failed: lock %p, exp %p\n", + lock, lock->l_export); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if ((flags == 0) && !ldlm_is_destroyed(lock)) { + lu_ref_add(&lock->l_reference, "handle", current); + RETURN(lock); + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", current); + if (unlikely(ldlm_is_destroyed(lock))) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + /* If we're setting flags, make sure none of them are already set. 
*/ + if (flags != 0) { + if ((lock->l_flags & flags) != 0) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + lock->l_flags |= flags; + } + + unlock_res_and_lock(lock); + RETURN(lock); +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. + */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); +} + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if (!ldlm_is_ast_sent(lock)) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + ldlm_set_ast_sent(lock); + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (ldlm_is_ast_discard_data(new)) + ldlm_set_discard_data(lock); + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. + */ +static void ldlm_add_cp_work_item(struct ldlm_lock *lock, + struct list_head *work_list) +{ + if (!ldlm_is_cp_reqd(lock)) { + ldlm_set_cp_reqd(lock); + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + ENTRY; + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); + EXIT; +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. 
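+ * Also takes a 'user' reference on \a lock, released again in
+ * ldlm_lock_decref_internal_nolock().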
+ */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. + */ +int ldlm_lock_addref_try(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !ldlm_is_cbpending(lock)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references left to accomodate flock locks + * that cannot be placed in LRU. + */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, + enum ldlm_mode mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If the lock is determined to be client lock on a client and r/w refcount + * drops to zero and the lock is not blocked, the lock is added to LRU lock + * on the namespace. + * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) +{ + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + if ((ldlm_is_local(lock) || lock->l_req_mode == LCK_GROUP) && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. + * + * Group locks are special: + * They must not go in LRU, but they are not called back + * like non-group locks, instead they are manually released. 
+ * They have an l_writers reference which they keep until + * they are manually released, so we remove them when they have + * no more reader or writer references. - LU-6368 */ + ldlm_set_cbpending(lock); + } + + if (!lock->l_readers && !lock->l_writers && ldlm_is_cbpending(lock)) { + /* If we received a blocked AST and this was the last reference, + * run the callback. */ + if (ldlm_is_ns_srv(lock) && lock->l_export) + CERROR("FL_CBPENDING set on non-local lock--just a " + "warning\n"); + + LDLM_DEBUG(lock, "final decref done on cbpending lock"); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if (ldlm_is_atomic_cb(lock) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !ldlm_is_no_lru(lock) && + !ldlm_is_bl_ast(lock)) { + + LDLM_DEBUG(lock, "add lock into lru list"); + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + + if (ldlm_is_fail_loc(lock)) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE + * are not supported by the server, otherwise, it is done on + * enqueue. */ + if (!exp_connect_cancelset(lock->l_conn_export) && + !ns_connect_lru_resize(ns)) + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } + + EXIT; +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + */ +void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, + enum ldlm_mode mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + ENTRY; + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. 
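+ * (LDLM_PLAIN and LDLM_IBITS resources; other lock types are granted
+ * through different paths).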
+ * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + ENTRY; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + EXIT; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else { + LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); + + EXIT; +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. 
+ */ +static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + ENTRY; + + LASSERT(lock->l_req_mode == lock->l_granted_mode); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); + EXIT; +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * - ldlm_lock_convert + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else if (res->lr_type == LDLM_FLOCK) { + /* We should not add locks to granted list in the following + * cases: + * - this is an UNLOCK but not a real lock; + * - this is a TEST lock; + * - this is a F_CANCELLK lock (async flock has req_mode == 0) + * - this is a deadlock (flock cannot be granted) */ + if (lock->l_req_mode == 0 || + lock->l_req_mode == LCK_NL || + ldlm_is_test_lock(lock) || + ldlm_is_flock_deadlock(lock)) + RETURN_EXIT; + ldlm_resource_add_lock(res, &res->lr_granted, lock); + } else { + LBUG(); + } + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); + EXIT; +} + +/** + * Describe the overlap between two locks. itree_overlap_cb data. + */ +struct lock_match_data { + struct ldlm_lock *lmd_old; + struct ldlm_lock *lmd_lock; + enum ldlm_mode *lmd_mode; + union ldlm_policy_data *lmd_policy; + __u64 lmd_flags; + int lmd_unref; +}; + +/** + * Check if the given @lock meets the criteria for a match. + * A reference on the lock is taken if matched. + * + * \param lock test-against this lock + * \param data parameters + */ +static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) +{ + union ldlm_policy_data *lpol = &lock->l_policy_data; + enum ldlm_mode match; + + if (lock == data->lmd_old) + return INTERVAL_ITER_STOP; + + /* Check if this lock can be matched. + * Used by LU-2919(exclusive open) for open lease lock */ + if (ldlm_is_excl(lock)) + return INTERVAL_ITER_CONT; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. 
*/ + if (ldlm_is_cbpending(lock) && + !(data->lmd_flags & LDLM_FL_CBPENDING)) + return INTERVAL_ITER_CONT; + if (!data->lmd_unref && ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + return INTERVAL_ITER_CONT; + + if (!(lock->l_req_mode & *data->lmd_mode)) + return INTERVAL_ITER_CONT; + match = lock->l_req_mode; + + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + if (lpol->l_extent.start > data->lmd_policy->l_extent.start || + lpol->l_extent.end < data->lmd_policy->l_extent.end) + return INTERVAL_ITER_CONT; + + if (unlikely(match == LCK_GROUP) && + data->lmd_policy->l_extent.gid != LDLM_GID_ANY && + lpol->l_extent.gid != data->lmd_policy->l_extent.gid) + return INTERVAL_ITER_CONT; + break; + case LDLM_IBITS: + /* We match if we have existing lock with same or wider set + of bits. */ + if ((lpol->l_inodebits.bits & + data->lmd_policy->l_inodebits.bits) != + data->lmd_policy->l_inodebits.bits) + return INTERVAL_ITER_CONT; + break; + default: + ; + } + + /* We match if we have existing lock with same or wider set + of bits. */ + if (!data->lmd_unref && LDLM_HAVE_MASK(lock, GONE)) + return INTERVAL_ITER_CONT; + + if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) + return INTERVAL_ITER_CONT; + + if (data->lmd_flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + + *data->lmd_mode = match; + data->lmd_lock = lock; + + return INTERVAL_ITER_STOP; +} + +static unsigned int itree_overlap_cb(struct interval_node *in, void *args) +{ + struct ldlm_interval *node = to_ldlm_interval(in); + struct lock_match_data *data = args; + struct ldlm_lock *lock; + int rc; + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + rc = lock_matches(lock, data); + if (rc == INTERVAL_ITER_STOP) + return INTERVAL_ITER_STOP; + } + return INTERVAL_ITER_CONT; +} + +/** + * Search for a lock with given parameters in interval trees. + * + * \param res search for a lock in this resource + * \param data parameters + * + * \retval a referenced lock or NULL. + */ +static struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct lock_match_data *data) +{ + struct interval_node_extent ext = { + .start = data->lmd_policy->l_extent.start, + .end = data->lmd_policy->l_extent.end + }; + int idx; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct ldlm_interval_tree *tree = &res->lr_itree[idx]; + + if (tree->lit_root == NULL) + continue; + + if (!(tree->lit_mode & *data->lmd_mode)) + continue; + + interval_search(tree->lit_root, &ext, + itree_overlap_cb, data); + } + return data->lmd_lock; +} + + +/** + * Search for a lock with given properties in a queue. + * + * \param queue search for a lock in this queue + * \param data parameters + * + * \retval a referenced lock or NULL. 
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+                                      struct lock_match_data *data)
+{
+        struct ldlm_lock *lock;
+        int rc;
+
+        list_for_each_entry(lock, queue, l_res_link) {
+                rc = lock_matches(lock, data);
+                if (rc == INTERVAL_ITER_STOP)
+                        return data->lmd_lock;
+        }
+        return NULL;
+}
+
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+        if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) {
+                lock->l_flags |= LDLM_FL_FAIL_NOTIFIED;
+                wake_up_all(&lock->l_waitq);
+        }
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+        lock_res_and_lock(lock);
+        ldlm_lock_fail_match_locked(lock);
+        unlock_res_and_lock(lock);
+}
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+        ldlm_set_lvb_ready(lock);
+        wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ * Locks the lock and then \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+        lock_res_and_lock(lock);
+        ldlm_lock_allow_match_locked(lock);
+        unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to the matched lock unless LDLM_FL_TEST_LOCK
+ * is set in \a flags.
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ * server (i.e., connh is NULL).
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ * list will be considered.
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ * to be canceled can still be matched as long as they still have reader
+ * or writer references.
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ * just tell us if we would have matched.
+ *
+ * \retval 1 if it finds an already-existing lock that is compatible; in this
+ * case, lockh is filled in with an addref()ed lock
+ *
+ * We also check the security context, and if that fails we simply return 0
+ * (to keep caller code unchanged); the context failure will be discovered by
+ * the caller sometime later.
+ */ +enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, int unref) +{ + struct lock_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_unref = unref, + }; + struct ldlm_resource *res; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + if (ns == NULL) { + data.lmd_old = ldlm_handle2lock(lockh); + LASSERT(data.lmd_old != NULL); + + ns = ldlm_lock_to_ns(data.lmd_old); + res_id = &data.lmd_old->l_resource->lr_name; + type = data.lmd_old->l_resource->lr_type; + *data.lmd_mode = data.lmd_old->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (IS_ERR(res)) { + LASSERT(data.lmd_old == NULL); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + + if (res->lr_type == LDLM_EXTENT) + lock = search_itree(res, &data); + else + lock = search_queue(&res->lr_granted, &data); + if (lock != NULL) + GOTO(out, rc = 1); + if (flags & LDLM_FL_BLOCK_GRANTED) + GOTO(out, rc = 0); + lock = search_queue(&res->lr_converting, &data); + if (lock != NULL) + GOTO(out, rc = 1); + lock = search_queue(&res->lr_waiting, &data); + if (lock != NULL) + GOTO(out, rc = 1); + + EXIT; + out: + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!ldlm_is_lvb_ready(lock))) { + __u64 wait_flags = LDLM_FL_LVB_READY | + LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; + struct l_wait_info lwi; + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, + mode); + rc = 0; + goto out2; + } + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ + l_wait_event(lock->l_waitq, + lock->l_flags & wait_flags, + &lwi); + if (!ldlm_is_lvb_ready(lock)) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + } + } + out2: + if (rc) { + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) { + if (!(flags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + + } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + "%llu/%llu (%llu %llu)", ns, + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] :policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } + if (data.lmd_old != NULL) + LDLM_LOCK_PUT(data.lmd_old); + + return rc ? 
mode : 0; +} +EXPORT_SYMBOL(ldlm_lock_match); + +enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + enum ldlm_mode mode = 0; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (LDLM_HAVE_MASK(lock, GONE)) + GOTO(out, mode); + + if (ldlm_is_cbpending(lock) && + lock->l_readers == 0 && lock->l_writers == 0) + GOTO(out, mode); + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + + EXIT; + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + ENTRY; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type); + libcfs_debug_dumpstack(NULL); + RETURN(-EINVAL); + } + + RETURN(0); +} + +/** + * Create and fill in new LDLM lock with specified properties. 
+ * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + enum ldlm_mode mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (IS_ERR(res)) + RETURN(ERR_CAST(res)); + + lock = ldlm_lock_new(res); + if (lock == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current_pid(); + if (ns_is_server(ns)) + ldlm_set_ns_srv(lock); + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + } + + lock->l_tree_node = NULL; + /* if this is the extent lock, allocate the interval tree node */ + if (type == LDLM_EXTENT) + if (ldlm_interval_alloc(lock) == NULL) + GOTO(out, rc = -ENOMEM); + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC_LARGE(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + GOTO(out, rc = -ENOMEM); + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + GOTO(out, rc = -ENOENT); + + RETURN(lock); + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(ERR_PTR(rc)); +} + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res = lock->l_resource; + int local = ns_is_client(ldlm_res_to_ns(res)); +#ifdef HAVE_SERVER_SUPPORT + ldlm_processing_policy policy; +#endif + enum ldlm_error rc = ELDLM_OK; + struct ldlm_interval *node = NULL; + ENTRY; + + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, + NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. */ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(0); + } else if (rc != ELDLM_OK && + lock->l_req_mode == lock->l_granted_mode) { + LASSERT(*flags & LDLM_FL_RESENT); + /* It may happen that ns_policy returns an error in + * resend case, object may be unlinked or just some + * error occurs. It is unclear if lock reached the + * client in the original reply, just leave the lock on + * server, not returning it again to client. Due to + * LU-6529, the server will not OOM. */ + RETURN(rc); + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + RETURN(rc); + } + } + + if (*flags & LDLM_FL_RESENT) { + /* Reconstruct LDLM_FL_SRV_ENQ_MASK @flags for reply. + * Set LOCK_CHANGED always. + * Check if the lock is granted for BLOCK_GRANTED. 
+ * Take NO_TIMEOUT from the lock as it is inherited through + * LDLM_FL_INHERIT_MASK */ + *flags |= LDLM_FL_LOCK_CHANGED; + if (lock->l_req_mode != lock->l_granted_mode) + *flags |= LDLM_FL_BLOCK_GRANTED; + *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; + RETURN(ELDLM_OK); + } + + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay */ + if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + + lock_res_and_lock(lock); + if (local && lock->l_req_mode == lock->l_granted_mode) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + GOTO(out, rc = ELDLM_OK); + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + GOTO(out, rc = -ENOMEM); + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + if (*flags & LDLM_FL_AST_DISCARD_DATA) + ldlm_set_ast_discard_data(lock); + if (*flags & LDLM_FL_TEST_LOCK) + ldlm_set_test_lock(lock); + if (*flags & LDLM_FL_COS_INCOMPAT) + ldlm_set_cos_incompat(lock); + if (*flags & LDLM_FL_COS_ENABLED) + ldlm_set_cos_enabled(lock); + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted/converting queues. */ + if (local) { + if (*flags & LDLM_FL_BLOCK_CONV) + ldlm_resource_add_lock(res, &res->lr_converting, lock); + else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); +#ifdef HAVE_SERVER_SUPPORT + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_CONV) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + GOTO(out, rc = ELDLM_OK); + } else if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + GOTO(out, rc = ELDLM_OK); + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); + } + /* If no flags, fall through to normal enqueue path. */ + } + + policy = ldlm_processing_policy_table[res->lr_type]; + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, NULL); + GOTO(out, rc); +#else + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + +out: + unlock_res_and_lock(lock); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return rc; +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Iterate through all waiting locks on a given resource queue and attempt to + * grant them. + * + * Must be called with resource lock held. 
+ */ +int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention) +{ + struct list_head *tmp, *pos; + ldlm_processing_policy policy; + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + ENTRY; + + check_res_locked(res); + + policy = ldlm_processing_policy_table[res->lr_type]; + LASSERT(policy); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + + list_for_each_safe(tmp, pos, queue) { + struct ldlm_lock *pending; + + pending = list_entry(tmp, struct ldlm_lock, l_res_link); + + CDEBUG(D_INFO, "Reprocessing lock %p\n", pending); + + flags = 0; + rc = policy(pending, &flags, intention, &err, work_list); + /* + * When this is called from recovery done, we always want + * to scan the whole list no matter what 'rc' is returned. + */ + if (rc != LDLM_ITER_CONTINUE && + intention == LDLM_PROCESS_RESCAN) + break; + } + + RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE); +} + +/** + * Conflicting locks are detected for a lock to be enqueued, add the lock + * into waiting list and send blocking ASTs to the conflicting locks. + * + * \param[in] lock The lock to be enqueued. + * \param[out] flags Lock flags for the lock to be enqueued. + * \param[in] rpc_list Conflicting locks list. + * \param[in] grant_flags extra flags when granting a lock. + * + * \retval -ERESTART: Some lock was instantly canceled while sending + * blocking ASTs, caller needs to re-check conflicting + * locks. + * \retval -EAGAIN: Lock was destroyed, caller should return error. + * \reval 0: Lock is successfully added in waiting list. + */ +int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, + struct list_head *rpc_list, __u64 grant_flags) +{ + struct ldlm_resource *res = lock->l_resource; + int rc; + ENTRY; + + check_res_locked(res); + + /* If either of the compat_queue()s returned failure, then we + * have ASTs to send and must go onto the waiting list. + * + * bug 2322: we used to unlink and re-add here, which was a + * terrible folly -- if we goto restart, we could get + * re-ordered! Causes deadlock, because ASTs aren't sent! */ + if (list_empty(&lock->l_res_link)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), rpc_list, + LDLM_WORK_BL_AST); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_OST_FAIL_RACE) && + !ns_is_client(ldlm_res_to_ns(res))) + class_fail_export(lock->l_export); + + lock_res(res); + if (rc == -ERESTART) { + /* 15715: The lock was granted and destroyed after + * resource lock was dropped. Interval node was freed + * in ldlm_lock_destroy. Anyway, this always happens + * when a client is being evicted. So it would be + * ok to return an error. -jay */ + if (ldlm_is_destroyed(lock)) + RETURN(-EAGAIN); + + /* lock was granted while resource was unlocked. */ + if (lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: if the lock has been granted, + * break earlier because otherwise, we will go + * to restart and ldlm_resource_unlink will be + * called and it causes the interval node to be + * freed. Then we will fail at + * ldlm_extent_add_lock() */ + *flags &= ~LDLM_FL_BLOCKED_MASK; + RETURN(0); + } + + RETURN(rc); + } + *flags |= (LDLM_FL_BLOCK_GRANTED | grant_flags); + + RETURN(0); +} + +/** + * Discard all AST work items from list. + * + * If for whatever reason we do not want to send ASTs to conflicting locks + * anymore, disassemble the list with this function. 
+ */ +void ldlm_discard_bl_list(struct list_head *bl_list) +{ + struct list_head *tmp, *pos; + ENTRY; + + list_for_each_safe(pos, tmp, bl_list) { + struct ldlm_lock *lock = + list_entry(pos, struct ldlm_lock, l_bl_ast); + + list_del_init(&lock->l_bl_ast); + LASSERT(ldlm_is_ast_sent(lock)); + ldlm_clear_ast_sent(lock); + LASSERT(lock->l_bl_ast_run == 0); + LASSERT(lock->l_blocking_lock); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +#endif + +/** + * Process a call to blocking AST callback for a lock in ast_work list + */ +static int +ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc d; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); + + /* nobody should touch l_bl_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_bl_ast); + + LASSERT(ldlm_is_ast_sent(lock)); + LASSERT(lock->l_bl_ast_run == 0); + LASSERT(lock->l_blocking_lock); + lock->l_bl_ast_run++; + unlock_res_and_lock(lock); + + ldlm_lock2desc(lock->l_blocking_lock, &d); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + int rc = 0; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. 
*/ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + arg->gl_interpret_reply = gl_work->gl_interpret_reply; + arg->gl_interpret_data = gl_work->gl_interpret_data; + + /* invoke the actual glimpse callback */ + if (lock->l_glimpse_ast(lock, (void*)arg) == 0) + rc = 1; + + LDLM_LOCK_RELEASE(lock); + + if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + OBD_FREE_PTR(gl_work); + + RETURN(rc); +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + RETURN(0); + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + RETURN(-ENOMEM); + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? 
: UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) + GOTO(out, rc = -ENOMEM); + + ptlrpc_set_wait(arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? -ERESTART : 0; + GOTO(out, rc); +out: + OBD_FREE_PTR(arg); + return rc; +} + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on converting and waiting queues. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +static void __ldlm_reprocess_all(struct ldlm_resource *res, + enum ldlm_process_intention intention) +{ + struct list_head rpc_list; +#ifdef HAVE_SERVER_SUPPORT + struct obd_device *obd; + int rc; + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } + + /* Disable reprocess during lock replay stage but allow during + * request replay stage. + */ + obd = ldlm_res_to_ns(res)->ns_obd; + if (obd->obd_recovering && + atomic_read(&obd->obd_req_replay_clients) == 0) + RETURN_EXIT; +restart: + lock_res(res); + rc = ldlm_reprocess_queue(res, &res->lr_converting, &rpc_list, + intention); + if (rc == LDLM_ITER_CONTINUE) + ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, + intention); + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { + LASSERT(list_empty(&rpc_list)); + goto restart; + } +#else + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + EXIT; +} + +void ldlm_reprocess_all(struct ldlm_resource *res) +{ + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN); +} +EXPORT_SYMBOL(ldlm_reprocess_all); + +static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + /* This is only called once after recovery done. LU-8306. */ + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY); + return 0; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns) +{ + ENTRY; + + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL, 0); + } + EXIT; +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!ldlm_is_cancel(lock)) { + ldlm_set_cancel(lock); + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + + /* only canceller can set bl_done bit */ + ldlm_set_bl_done(lock); + wake_up_all(&lock->l_waitq); + } else if (!ldlm_is_bl_done(lock)) { + struct l_wait_info lwi = { 0 }; + + /* The lock is guaranteed to have been canceled once + * returning from this function. 
*/ + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi); + lock_res_and_lock(lock); + } +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. */ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + * running with no res lock in ldlm_cancel_callback */ + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (lock->l_granted_mode == lock->l_req_mode) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. + */ +int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + ENTRY; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + struct obd_export *ecl_exp; + int ecl_loop; +}; + +static void ldlm_cancel_lock_for_export(struct obd_export *exp, + struct ldlm_lock *lock, + struct export_cl_data *ecl) +{ + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + + ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lock_cancel(lock); + if (!exp->exp_obd->obd_stopping) + ldlm_reprocess_all(res); + ldlm_resource_putref(res); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, "Export %p, %d locks cancelled.\n", + exp, ecl->ecl_loop); + } +} + +/** + * Iterator function for ldlm_export_cancel_locks. + * Cancels passed locks. + */ +static int +ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + LDLM_LOCK_GET(lock); + ldlm_cancel_lock_for_export(exp, lock, ecl); + LDLM_LOCK_RELEASE(lock); + + return 0; +} + +/** + * Cancel all blocked locks for given export. 
+ * + * Typically called on client disconnection/eviction + */ +int ldlm_export_cancel_blocked_locks(struct obd_export *exp) +{ + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + + while (!list_empty(&exp->exp_bl_list)) { + struct ldlm_lock *lock; + + spin_lock_bh(&exp->exp_bl_list_lock); + if (!list_empty(&exp->exp_bl_list)) { + lock = list_entry(exp->exp_bl_list.next, + struct ldlm_lock, l_exp_list); + LDLM_LOCK_GET(lock); + list_del_init(&lock->l_exp_list); + } else { + lock = NULL; + } + spin_unlock_bh(&exp->exp_bl_list_lock); + + if (lock == NULL) + break; + + ldlm_cancel_lock_for_export(exp, lock, &ecl); + LDLM_LOCK_RELEASE(lock); + } + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + return ecl.ecl_loop; +} + +/** + * Cancel all locks for given export. + * + * Typically called after client disconnection/eviction + */ +int ldlm_export_cancel_locks(struct obd_export *exp) +{ + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + + cfs_hash_for_each_empty(exp->exp_lock_hash, + ldlm_cancel_locks_for_export_cb, &ecl); + + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " + "left on hash table %d.\n", exp, ecl.ecl_loop, + atomic_read(&exp->exp_lock_hash->hs_count)); + + if (ecl.ecl_loop > 0 && + atomic_read(&exp->exp_lock_hash->hs_count) == 0 && + exp->exp_obd->obd_stopping) + ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + + return ecl.ecl_loop; +} + +/** + * Downgrade an exclusive lock. + * + * A fast variant of ldlm_lock_convert for convertion of exclusive locks. The + * convertion may fail if lock was canceled before downgrade, but it doesn't + * indicate any problem, because such lock has no reader or writer, and will + * be released soon. + * Used by Commit on Sharing (COS) code. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) +{ + ENTRY; + + LASSERT(new_mode == LCK_COS); + + lock_res_and_lock(lock); + + if (!(lock->l_granted_mode & (LCK_PW | LCK_EX))) { + unlock_res_and_lock(lock); + + LASSERT(lock->l_granted_mode == LCK_MINMODE); + LDLM_DEBUG(lock, "lock was canceled before downgrade"); + RETURN_EXIT; + } + + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + + unlock_res_and_lock(lock); + + ldlm_reprocess_all(lock->l_resource); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_downgrade); + +/** + * Attempt to convert already granted lock to a different mode. + * + * While lock conversion is not currently used, future client-side + * optimizations could take advantage of it to avoid discarding cached + * pages on a file. + */ +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, + enum ldlm_mode new_mode, __u32 *flags) +{ + struct list_head rpc_list; + struct ldlm_resource *res; + struct ldlm_namespace *ns; + int granted = 0; +#ifdef HAVE_SERVER_SUPPORT + int old_mode; + struct sl_insert_point prev; +#endif + struct ldlm_interval *node; + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + /* Just return if mode is unchanged. 
*/ + if (new_mode == lock->l_granted_mode) { + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(lock->l_resource); + } + + /* I can't check the type of lock here because the bitlock of lock + * is not held here, so do the allocation blindly. -jay */ + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ + RETURN(NULL); + + LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), + "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + +#ifdef HAVE_SERVER_SUPPORT + old_mode = lock->l_req_mode; +#endif + lock->l_req_mode = new_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { +#ifdef HAVE_SERVER_SUPPORT + /* remember the lock position where the lock might be + * added back to the granted list later and also + * remember the join mode for skiplist fixing. */ + prev.res_link = lock->l_res_link.prev; + prev.mode_link = lock->l_sl_mode.prev; + prev.policy_link = lock->l_sl_policy.prev; +#endif + ldlm_resource_unlink_lock(lock); + } else { + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT) { + /* FIXME: ugly code, I have to attach the lock to a + * interval node again since perhaps it will be granted + * soon */ + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + } + + /* + * Remove old lock from the pool before adding the lock with new + * mode below in ->policy() + */ + ldlm_pool_del(&ns->ns_pool, lock); + + /* If this is a local resource, put it on the appropriate list. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + } else { + /* This should never happen, because of the way the + * server handles conversions. */ + LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", + *flags); + LBUG(); + + ldlm_grant_lock(lock, &rpc_list); + granted = 1; + /* FIXME: completion handling not with lr_lock held ! */ + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + } +#ifdef HAVE_SERVER_SUPPORT + } else { + int rc; + enum ldlm_error err; + __u64 pflags = 0; + ldlm_processing_policy policy; + + policy = ldlm_processing_policy_table[res->lr_type]; + rc = policy(lock, &pflags, LDLM_PROCESS_RESCAN, &err, + &rpc_list); + if (rc == LDLM_ITER_STOP) { + lock->l_req_mode = old_mode; + if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else + ldlm_granted_list_add_lock(lock, &prev); + + res = NULL; + } else { + *flags |= LDLM_FL_BLOCK_GRANTED; + granted = 1; + } + } +#else + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + unlock_res_and_lock(lock); + + if (granted) + ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + RETURN(res); +} + +/** + * Print lock with lock handle \a lockh description into debug log. + * + * Used when printing all locks on a resource for debug purposes. 
+ */ +void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + return; + + LDLM_DEBUG_LIMIT(level, lock, "###"); + + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_dump_handle); + +/** + * Print lock information with custom message into debug log. + * Helper function. + */ +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = NULL; + char *nid = "local"; + + /* on server-side resource of lock doesn't change */ + if ((lock->l_flags & LDLM_FL_NS_SRV) != 0) { + if (lock->l_resource != NULL) + resource = ldlm_resource_getref(lock->l_resource); + } else if (spin_trylock(&lock->l_lock)) { + if (lock->l_resource != NULL) + resource = ldlm_resource_getref(lock->l_resource); + spin_unlock(&lock->l_lock); + } + + va_start(args, fmt); + + if (exp && exp->exp_connection) { + nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); + } + + if (resource == NULL) { + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s " + "remote: %#llx expref: %d pid: %u timeout: %lu " + "lvb_type: %d\n", + lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] " + "(req %llu->%llu) flags: %#llx nid: %s remote: " + "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" rrc: %d type: %s pid: %d " + "[%llu->%llu] flags: %#llx nid: %s " + "remote: %#llx expref: %d pid: %u timeout: %lu\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? 
atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout); + break; + + case LDLM_IBITS: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" bits %#llx rrc: %d type: %s " + "flags: %#llx nid: %s remote: %#llx expref: %d " + "pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + + default: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " + "res: "DLDLMRES" rrc: %d type: %s flags: %#llx " + "nid: %s remote: %#llx expref: %d pid: %u " + "timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, + lock->l_lvb_type); + break; + } + va_end(args); + ldlm_resource_putref(resource); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 0000000000000..356a30231142b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,3258 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +module_param(ldlm_num_threads, int, 0444); +MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); + +static char *ldlm_cpts; +module_param(ldlm_cpts, charp, 0444); +MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); + +static DEFINE_MUTEX(ldlm_ref_mutex); +static int ldlm_refcount; + +struct kobject *ldlm_kobj; +struct kset *ldlm_ns_kset; +struct kset *ldlm_svc_kset; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +static inline cfs_time_t round_timeout(cfs_time_t timeout) +{ + return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see bug 13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. + */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + enum ldlm_cancel_flags blwi_flags; + int blwi_mem_pressure; +}; + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Protects both waiting_locks_list and expired_lock_thread. + */ +static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ + +/** + * List for contended locks. + * + * As soon as a lock is contended, it gets placed on this list and + * expected time to get a response is filled in the lock. A special + * thread walks the list looking for locks that should be released and + * schedules client evictions for those that have not been released in + * time. + * + * All access to it should be under waiting_locks_spinlock. + */ +static LIST_HEAD(waiting_locks_list); +static void waiting_locks_callback(unsigned long unused); +static DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); + +enum elt_state { + ELT_STOPPED, + ELT_READY, + ELT_TERMINATE, +}; + +static DECLARE_WAIT_QUEUE_HEAD(expired_lock_wait_queue); +static enum elt_state expired_lock_thread_state = ELT_STOPPED; +static int expired_lock_dump; +static LIST_HEAD(expired_lock_list); + +static inline int have_expired_locks(void) +{ + int need_to_run; + + ENTRY; + spin_lock_bh(&waiting_locks_spinlock); + need_to_run = !list_empty(&expired_lock_list); + spin_unlock_bh(&waiting_locks_spinlock); + + RETURN(need_to_run); +} + +/** + * Check expired lock list for expired locks and time them out. 
+ */ +static int expired_lock_main(void *arg) +{ + struct list_head *expired = &expired_lock_list; + struct l_wait_info lwi = { 0 }; + int do_dump; + + ENTRY; + + expired_lock_thread_state = ELT_READY; + wake_up(&expired_lock_wait_queue); + + while (1) { + l_wait_event(expired_lock_wait_queue, + have_expired_locks() || + expired_lock_thread_state == ELT_TERMINATE, + &lwi); + + spin_lock_bh(&waiting_locks_spinlock); + if (expired_lock_dump) { + spin_unlock_bh(&waiting_locks_spinlock); + + /* from waiting_locks_callback, but not in timer */ + libcfs_debug_dumplog(); + + spin_lock_bh(&waiting_locks_spinlock); + expired_lock_dump = 0; + } + + do_dump = 0; + + while (!list_empty(expired)) { + struct obd_export *export; + struct ldlm_lock *lock; + + lock = list_entry(expired->next, struct ldlm_lock, + l_pending_chain); + if ((void *)lock < LP_POISON + PAGE_SIZE && + (void *)lock >= LP_POISON) { + spin_unlock_bh(&waiting_locks_spinlock); + CERROR("free lock on elt list %p\n", lock); + LBUG(); + } + list_del_init(&lock->l_pending_chain); + if ((void *)lock->l_export < + LP_POISON + PAGE_SIZE && + (void *)lock->l_export >= LP_POISON) { + CERROR("lock with free export on elt list %p\n", + lock->l_export); + lock->l_export = NULL; + LDLM_ERROR(lock, "free export"); + /* release extra ref grabbed by + * ldlm_add_waiting_lock() or + * ldlm_failed_ast() */ + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (ldlm_is_destroyed(lock)) { + /* release the lock refcount where + * waiting_locks_callback() founds */ + LDLM_LOCK_RELEASE(lock); + continue; + } + export = class_export_lock_get(lock->l_export, lock); + spin_unlock_bh(&waiting_locks_spinlock); + + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + do_dump++; + class_fail_export(export); + class_export_lock_put(export, lock); + + /* release extra ref grabbed by ldlm_add_waiting_lock() + * or ldlm_failed_ast() */ + LDLM_LOCK_RELEASE(lock); + + spin_lock_bh(&waiting_locks_spinlock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (do_dump && obd_dump_on_eviction) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + if (expired_lock_thread_state == ELT_TERMINATE) + break; + } + + expired_lock_thread_state = ELT_STOPPED; + wake_up(&expired_lock_wait_queue); + RETURN(0); +} + +static int ldlm_add_waiting_lock(struct ldlm_lock *lock); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds); + +/** + * Check if there is a request in the export request list + * which prevents the lock canceling. 
+ */ +static int ldlm_lock_busy(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + int match = 0; + ENTRY; + + if (lock->l_export == NULL) + return 0; + + spin_lock_bh(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + if (req->rq_ops->hpreq_lock_match) { + match = req->rq_ops->hpreq_lock_match(req, lock); + if (match) + break; + } + } + spin_unlock_bh(&lock->l_export->exp_rpc_lock); + RETURN(match); +} + +/* This is called from within a timer interrupt and cannot schedule */ +static void waiting_locks_callback(unsigned long unused) +{ + struct ldlm_lock *lock; + int need_dump = 0; + + spin_lock_bh(&waiting_locks_spinlock); + while (!list_empty(&waiting_locks_list)) { + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + if (cfs_time_after(lock->l_callback_timeout, + cfs_time_current()) || + (lock->l_req_mode == LCK_GROUP)) + break; + + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + ldlm_lock_busy(lock)) { + int cont = 1; + + if (lock->l_pending_chain.next == &waiting_locks_list) + cont = 0; + + LDLM_LOCK_GET(lock); + + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "prolong the busy lock"); + ldlm_refresh_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + spin_lock_bh(&waiting_locks_spinlock); + + if (!cont) { + LDLM_LOCK_RELEASE(lock); + break; + } + + LDLM_LOCK_RELEASE(lock); + continue; + } + ldlm_lock_to_ns(lock)->ns_timeouts++; + LDLM_ERROR(lock, "lock callback timer expired after %llds: " + "evicting client at %s ", + ktime_get_real_seconds() - lock->l_last_activity, + libcfs_nid2str( + lock->l_export->exp_connection->c_peer.nid)); + + /* no needs to take an extra ref on the lock since it was in + * the waiting_locks_list and ldlm_add_waiting_lock() + * already grabbed a ref */ + list_del(&lock->l_pending_chain); + list_add(&lock->l_pending_chain, &expired_lock_list); + need_dump = 1; + } + + if (!list_empty(&expired_lock_list)) { + if (obd_dump_on_timeout && need_dump) + expired_lock_dump = __LINE__; + + wake_up(&expired_lock_wait_queue); + } + + /* + * Make sure the timer will fire again if we have any locks + * left. + */ + if (!list_empty(&waiting_locks_list)) { + cfs_time_t timeout_rounded; + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout); + mod_timer(&waiting_locks_timer, timeout_rounded); + } + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Add lock to the list of contended locks. + * + * Indicate that we're waiting for a client to call us back cancelling a given + * lock. We add it to the pending-callback chain, and schedule the lock-timeout + * timer to fire appropriately. (We round up to the next second, to avoid + * floods of timer firings during periods of high lock contention and traffic). + * As done by ldlm_add_waiting_lock(), the caller must grab a lock reference + * if it has been added to the waiting list (1 is returned). + * + * Called with the namespace lock held. 
+ */ +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) +{ + cfs_time_t timeout; + cfs_time_t timeout_rounded; + + if (!list_empty(&lock->l_pending_chain)) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + seconds = 1; + + timeout = cfs_time_shift(seconds); + if (likely(cfs_time_after(timeout, lock->l_callback_timeout))) + lock->l_callback_timeout = timeout; + + timeout_rounded = round_timeout(lock->l_callback_timeout); + + if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) { + mod_timer(&waiting_locks_timer, timeout_rounded); + } + /* if the new lock has a shorter timeout than something earlier on + the list, we'll wait the longer amount of time; no big deal. */ + /* FIFO */ + list_add_tail(&lock->l_pending_chain, &waiting_locks_list); + return 1; +} + +static void ldlm_add_blocked_lock(struct ldlm_lock *lock) +{ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + if (list_empty(&lock->l_exp_list)) { + if (lock->l_granted_mode != lock->l_req_mode) + list_add_tail(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + else + list_add(&lock->l_exp_list, + &lock->l_export->exp_bl_list); + } + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + /* A blocked lock is added. Adjust the position in + * the stale list if the export is in the list. + * If export is stale and not in the list - it is being + * processed and will be placed on the right position + * on obd_stale_export_put(). */ + if (!list_empty(&lock->l_export->exp_stale_list)) + obd_stale_export_adjust(lock->l_export); +} + +static int ldlm_add_waiting_lock(struct ldlm_lock *lock) +{ + int ret; + int timeout = ldlm_bl_timeout(lock); + + /* NB: must be called with hold of lock_res_and_lock() */ + LASSERT(ldlm_is_res_locked(lock)); + LASSERT(!ldlm_is_cancel_on_block(lock)); + + /* Do not put cross-MDT lock in the waiting list, since we + * will not evict it due to timeout for now */ + if (lock->l_export != NULL && + (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS)) + return 0; + + spin_lock_bh(&waiting_locks_spinlock); + if (ldlm_is_cancel(lock)) { + spin_unlock_bh(&waiting_locks_spinlock); + return 0; + } + + if (ldlm_is_destroyed(lock)) { + static cfs_time_t next; + + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)"); + if (cfs_time_after(cfs_time_current(), next)) { + next = cfs_time_shift(14400); + libcfs_debug_dumpstack(NULL); + } + return 0; + } + + ldlm_set_waited(lock); + lock->l_last_activity = ktime_get_real_seconds(); + ret = __ldlm_add_waiting_lock(lock, timeout); + if (ret) { + /* grab ref on the lock if it has been added to the + * waiting list */ + LDLM_LOCK_GET(lock); + } + spin_unlock_bh(&waiting_locks_spinlock); + + if (ret) + ldlm_add_blocked_lock(lock); + + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)", + ret == 0 ? "not re-" : "", timeout, + AT_OFF ? "off" : "on"); + return ret; +} + +/** + * Remove a lock from the pending list, likely because it had its cancellation + * callback arrive without incident. This adjusts the lock-timeout timer if + * needed. Returns 0 if the lock wasn't pending after all, 1 if it was. + * As done by ldlm_del_waiting_lock(), the caller must release the lock + * reference when the lock is removed from any list (1 is returned). + * + * Called with namespace lock held. 
+ */ +static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + struct list_head *list_next; + + if (list_empty(&lock->l_pending_chain)) + return 0; + + list_next = lock->l_pending_chain.next; + if (lock->l_pending_chain.prev == &waiting_locks_list) { + /* Removing the head of the list, adjust timer. */ + if (list_next == &waiting_locks_list) { + /* No more, just cancel. */ + del_timer(&waiting_locks_timer); + } else { + struct ldlm_lock *next; + next = list_entry(list_next, struct ldlm_lock, + l_pending_chain); + mod_timer(&waiting_locks_timer, + round_timeout(next->l_callback_timeout)); + } + } + list_del_init(&lock->l_pending_chain); + + return 1; +} + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + int ret; + + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + CDEBUG(D_DLMTRACE, "Client lock %p : no-op\n", lock); + return 0; + } + + spin_lock_bh(&waiting_locks_spinlock); + ret = __ldlm_del_waiting_lock(lock); + ldlm_clear_waited(lock); + spin_unlock_bh(&waiting_locks_spinlock); + + /* remove the lock out of export blocking list */ + spin_lock_bh(&lock->l_export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&lock->l_export->exp_bl_list_lock); + + if (ret) { + /* release lock ref if it has indeed been removed + * from a list */ + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG(lock, "%s", ret == 0 ? "wasn't waiting" : "removed"); + return ret; +} + +/** + * Prolong the contended lock waiting time. + * + * Called with namespace lock held. + */ +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +{ + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + LDLM_DEBUG(lock, "client lock: no-op"); + return 0; + } + + if (exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) { + /* We don't have a "waiting locks list" on OSP. */ + LDLM_DEBUG(lock, "MDS-MDS lock: no-op"); + return 0; + } + + spin_lock_bh(&waiting_locks_spinlock); + + if (list_empty(&lock->l_pending_chain)) { + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "wasn't waiting"); + return 0; + } + + /* we remove/add the lock to the waiting list, so no needs to + * release/take a lock reference */ + __ldlm_del_waiting_lock(lock); + __ldlm_add_waiting_lock(lock, timeout); + spin_unlock_bh(&waiting_locks_spinlock); + + LDLM_DEBUG(lock, "refreshed"); + return 1; +} +EXPORT_SYMBOL(ldlm_refresh_waiting_lock); + +#else /* HAVE_SERVER_SUPPORT */ + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + RETURN(0); +} + +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +{ + RETURN(0); +} + +#endif /* !HAVE_SERVER_SUPPORT */ + +#ifdef HAVE_SERVER_SUPPORT + +/** + * Calculate the per-export Blocking timeout (covering BL AST, data flush, + * lock cancel, and their replies). Used for lock callback timeout and AST + * re-send period. + * + * \param[in] lock lock which is getting the blocking callback + * + * \retval timeout in seconds to wait for the client reply + */ +unsigned int ldlm_bl_timeout(struct ldlm_lock *lock) +{ + unsigned int timeout; + + if (AT_OFF) + return obd_timeout / 2; + + /* Since these are non-updating timeouts, we should be conservative. + * Take more than usually, 150% + * It would be nice to have some kind of "early reply" mechanism for + * lock callbacks too... */ + timeout = at_get(&lock->l_export->exp_bl_lock_at); + return max(timeout + (timeout >> 1), ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_bl_timeout); + +/** + * Perform lock cleanup if AST sending failed. 
+ */ +static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, + const char *ast_type) +{ + LCONSOLE_ERROR_MSG(0x138, "%s: A client on nid %s was evicted due " + "to a lock %s callback time out: rc %d\n", + lock->l_export->exp_obd->obd_name, + obd_export_nid2str(lock->l_export), ast_type, rc); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + spin_lock_bh(&waiting_locks_spinlock); + if (__ldlm_del_waiting_lock(lock) == 0) + /* the lock was not in any list, grab an extra ref before adding + * the lock to the expired list */ + LDLM_LOCK_GET(lock); + list_add(&lock->l_pending_chain, &expired_lock_list); + wake_up(&expired_lock_wait_queue); + spin_unlock_bh(&waiting_locks_spinlock); +} + +/** + * Perform lock cleanup if AST reply came with error. + */ +static int ldlm_handle_ast_error(struct ldlm_lock *lock, + struct ptlrpc_request *req, int rc, + const char *ast_type) +{ + struct lnet_process_id peer = req->rq_import->imp_connection->c_peer; + + if (!req->rq_replied || (rc && rc != -EINVAL)) { + if (lock->l_export && lock->l_export->exp_libclient) { + LDLM_DEBUG(lock, + "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock", + ast_type, req, req->rq_xid, + libcfs_nid2str(peer.nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, + "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", + ast_type, req, req->rq_xid, + libcfs_nid2str(peer.nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else if (rc == -ENODEV || rc == -ESHUTDOWN || + (rc == -EIO && + req->rq_import->imp_state == LUSTRE_IMP_CLOSED)) { + /* Upon umount process the AST fails because cannot be + * sent. This shouldn't lead to the client eviction. + * -ENODEV error is returned by ptl_send_rpc() for + * new request in such import. + * -SHUTDOWN is returned by ptlrpc_import_delay_req() + * if imp_invalid is set or obd_no_recov. + * Meanwhile there is also check for LUSTRE_IMP_CLOSED + * in ptlrpc_import_delay_req() as well with -EIO code. + * In all such cases errors are ignored. + */ + LDLM_DEBUG(lock, "%s AST can't be sent due to a server" + " %s failure or umount process: rc = %d\n", + ast_type, + req->rq_import->imp_obd->obd_name, rc); + } else { + LDLM_ERROR(lock, + "client (nid %s) %s %s AST (req@%p x%llu status %d rc %d), evict it", + libcfs_nid2str(peer.nid), + req->rq_replied ? "returned error from" : + "failed to reply to", + ast_type, req, req->rq_xid, + (req->rq_repmsg != NULL) ? + lustre_msg_get_status(req->rq_repmsg) : 0, + rc); + ldlm_failed_ast(lock, rc, ast_type); + } + return rc; + } + + if (rc == -EINVAL) { + struct ldlm_resource *res = lock->l_resource; + + LDLM_DEBUG(lock, + "client (nid %s) returned %d from %s AST (req@%p x%llu) - normal race", + libcfs_nid2str(peer.nid), + req->rq_repmsg ? + lustre_msg_get_status(req->rq_repmsg) : -1, + ast_type, req, req->rq_xid); + if (res) { + /* update lvbo to return proper attributes. 
+ * see bug 23174 */ + ldlm_resource_getref(res); + ldlm_res_lvbo_update(res, NULL, 1); + ldlm_resource_putref(res); + } + ldlm_lock_cancel(lock); + rc = -ERESTART; + } + + return rc; +} + +static int ldlm_cb_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_set_arg *arg = ca->ca_set_arg; + ENTRY; + + LASSERT(lock != NULL); + + switch (arg->type) { + case LDLM_GL_CALLBACK: + /* Update the LVB from disk if the AST failed + * (this is a legal race) + * + * - Glimpse callback of local lock just returns + * -ELDLM_NO_LOCK_DATA. + * - Glimpse callback of remote lock might return + * -ELDLM_NO_LOCK_DATA when inode is cleared. LU-274 + */ + if (unlikely(arg->gl_interpret_reply)) { + rc = arg->gl_interpret_reply(env, req, data, rc); + } else if (rc == -ELDLM_NO_LOCK_DATA) { + LDLM_DEBUG(lock, "lost race - client has a lock but no " + "inode"); + ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + } else if (rc != 0) { + rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); + } else { + rc = ldlm_res_lvbo_update(lock->l_resource, req, 1); + } + break; + case LDLM_BL_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "blocking"); + break; + case LDLM_CP_CALLBACK: + if (rc != 0) + rc = ldlm_handle_ast_error(lock, req, rc, "completion"); + break; + default: + LDLM_ERROR(lock, "invalid opcode for lock callback %d", + arg->type); + LBUG(); + } + + /* release extra reference taken in ldlm_ast_fini() */ + LDLM_LOCK_RELEASE(lock); + + if (rc == -ERESTART) + atomic_inc(&arg->restart); + + RETURN(0); +} + +static void ldlm_update_resend(struct ptlrpc_request *req, void *data) +{ + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; + + ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); +} + +static inline int ldlm_ast_fini(struct ptlrpc_request *req, + struct ldlm_cb_set_arg *arg, + struct ldlm_lock *lock, + int instant_cancel) +{ + int rc = 0; + ENTRY; + + if (unlikely(instant_cancel)) { + rc = ptl_send_rpc(req, 1); + ptlrpc_req_finished(req); + if (rc == 0) + atomic_inc(&arg->restart); + } else { + LDLM_LOCK_GET(lock); + ptlrpc_set_add_req(arg->set, req); + } + + RETURN(rc); +} + +/** + * Check if there are requests in the export request list which prevent + * the lock canceling and make these requests high priority ones. + */ +static void ldlm_lock_reorder_req(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + ENTRY; + + if (lock->l_export == NULL) { + LDLM_DEBUG(lock, "client lock: no-op"); + RETURN_EXIT; + } + + spin_lock_bh(&lock->l_export->exp_rpc_lock); + list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, + rq_exp_list) { + /* Do not process requests that were not yet added to there + * incoming queue or were already removed from there for + * processing. We evaluate ptlrpc_nrs_req_can_move() without + * holding svcpt->scp_req_lock, and then redo the check with + * the lock held once we need to obtain a reliable result. + */ + if (ptlrpc_nrs_req_can_move(req) && + req->rq_ops->hpreq_lock_match && + req->rq_ops->hpreq_lock_match(req, lock)) + ptlrpc_nrs_req_hp_move(req); + } + spin_unlock_bh(&lock->l_export->exp_rpc_lock); + EXIT; +} + +/** + * ->l_blocking_ast() method for server-side locks. This is invoked when newly + * enqueued server lock conflicts with given one. + * + * Sends blocking AST RPC to the client owning that lock; arms timeout timer + * to wait for client response. 
+ */ +int ldlm_server_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + int instant_cancel = 0; + int rc = 0; + ENTRY; + + if (flag == LDLM_CB_CANCELING) + /* Don't need to do anything here. */ + RETURN(0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_BL_AST)) { + LDLM_DEBUG(lock, "dropping BL AST"); + RETURN(0); + } + + LASSERT(lock); + LASSERT(data != NULL); + if (lock->l_export->exp_obd->obd_recovering != 0) + LDLM_ERROR(lock, "BUG 6063: lock collide during recovery"); + + ldlm_lock_reorder_req(lock); + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + + lock_res_and_lock(lock); + if (ldlm_is_destroyed(lock)) { + /* What's the point? */ + unlock_res_and_lock(lock); + ptlrpc_req_finished(req); + RETURN(0); + } + + if (lock->l_granted_mode != lock->l_req_mode) { + /* this blocking AST will be communicated as part of the + * completion AST instead */ + ldlm_add_blocked_lock(lock); + ldlm_set_waited(lock); + unlock_res_and_lock(lock); + + ptlrpc_req_finished(req); + LDLM_DEBUG(lock, "lock not granted, not sending blocking AST"); + RETURN(0); + } + + if (ldlm_is_cancel_on_block(lock)) + instant_cancel = 1; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + body->lock_desc = *desc; + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & LDLM_FL_AST_MASK); + + LDLM_DEBUG(lock, "server preparing blocking AST"); + + ptlrpc_request_set_replen(req); + ldlm_set_cbpending(lock); + if (instant_cancel) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + req->rq_no_resend = 1; + } else { + LASSERT(lock->l_granted_mode == lock->l_req_mode); + ldlm_add_waiting_lock(lock); + unlock_res_and_lock(lock); + + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + lock->l_last_activity = ktime_get_real_seconds(); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(rc); +} + +/** + * ->l_completion_ast callback for a remote lock in server namespace. + * + * Sends AST to the client notifying it of lock granting. 
If initial + * lock response was not sent yet, instead of sending another RPC, just + * mark the lock as granted and client will understand + */ +int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int instant_cancel = 0; + int rc = 0; + int lvb_len; + ENTRY; + + LASSERT(lock != NULL); + LASSERT(data != NULL); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_SRV_CP_AST)) { + LDLM_DEBUG(lock, "dropping CP AST"); + RETURN(0); + } + + req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse, + &RQF_LDLM_CP_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); + + /* server namespace, doesn't need lock */ + lvb_len = ldlm_lvbo_size(lock); + /* LU-3124 & LU-2187: to not return layout in completion AST because + * it may deadlock for LU-2187, or client may not have enough space + * for large layout. The layout will be returned to client with an + * extra RPC to fetch xattr.lov */ + if (ldlm_has_layout(lock)) + lvb_len = 0; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + + body->lock_handle[0] = lock->l_remote_handle; + body->lock_flags = ldlm_flags_to_wire(flags); + ldlm_lock2desc(lock, &body->lock_desc); + if (lvb_len > 0) { + void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); + + lvb_len = ldlm_lvbo_fill(lock, lvb, lvb_len); + if (lvb_len < 0) { + /* We still need to send the RPC to wake up the blocked + * enqueue thread on the client. + * + * Consider old client, there is no better way to notify + * the failure, just zero-sized the LVB, then the client + * will fail out as "-EPROTO". */ + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, 0, + RCL_CLIENT); + instant_cancel = 1; + } else { + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, lvb_len, + RCL_CLIENT); + } + } + + lock->l_last_activity = ktime_get_real_seconds(); + + LDLM_DEBUG(lock, "server preparing completion AST"); + + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + /* We only send real blocking ASTs after the lock is granted */ + lock_res_and_lock(lock); + if (ldlm_is_ast_sent(lock)) { + body->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + /* Copy AST flags like LDLM_FL_DISCARD_DATA. */ + body->lock_flags |= ldlm_flags_to_wire(lock->l_flags & + LDLM_FL_AST_MASK); + + /* We might get here prior to ldlm_handle_enqueue setting + * LDLM_FL_CANCEL_ON_BLOCK flag. 
Then we will put this lock + * into waiting list, but this is safe and similar code in + * ldlm_handle_enqueue will call ldlm_lock_cancel() still, + * that would not only cancel the lock, but will also remove + * it from waiting list */ + if (ldlm_is_cancel_on_block(lock)) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + + instant_cancel = 1; + req->rq_no_resend = 1; + + lock_res_and_lock(lock); + } else { + /* start the lock-timeout clock */ + ldlm_add_waiting_lock(lock); + /* Do not resend after lock callback timeout */ + req->rq_delay_limit = ldlm_bl_timeout(lock); + req->rq_resend_cb = ldlm_update_resend; + } + } + unlock_res_and_lock(lock); + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, instant_cancel); + + RETURN(lvb_len < 0 ? lvb_len : rc); +} + +/** + * Server side ->l_glimpse_ast handler for client locks. + * + * Sends glimpse AST to the client and waits for reply. Then updates + * lvbo with the result. + */ +int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct ldlm_cb_set_arg *arg = data; + struct ldlm_request *body; + struct ptlrpc_request *req; + struct ldlm_cb_async_args *ca; + int rc; + struct req_format *req_fmt; + ENTRY; + + LASSERT(lock != NULL); + + if (arg->gl_desc != NULL) + /* There is a glimpse descriptor to pack */ + req_fmt = &RQF_LDLM_GL_DESC_CALLBACK; + else + req_fmt = &RQF_LDLM_GL_CALLBACK; + + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + req_fmt, LUSTRE_DLM_VERSION, + LDLM_GL_CALLBACK); + + if (req == NULL) + RETURN(-ENOMEM); + + if (arg->gl_desc != NULL) { + /* copy the GL descriptor */ + union ldlm_gl_desc *desc; + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + *desc = *arg->gl_desc; + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); + + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; + + /* server namespace, doesn't need lock */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + ldlm_lvbo_size(lock)); + ptlrpc_request_set_replen(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + /* ptlrpc_request_alloc_pack already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); + + lock->l_last_activity = ktime_get_real_seconds(); + + req->rq_interpret_reply = ldlm_cb_interpret; + + if (lock->l_export && lock->l_export->exp_nid_stats && + lock->l_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC); + + rc = ldlm_ast_fini(req, arg, lock, 0); + + RETURN(rc); +} + +int ldlm_glimpse_locks(struct ldlm_resource *res, + struct list_head *gl_work_list) +{ + int rc; + ENTRY; + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, + LDLM_WORK_GL_AST); + if (rc == -ERESTART) + ldlm_reprocess_all(res); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_glimpse_locks); + +/* return LDLM lock associated with a lock callback request */ +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) +{ + struct ldlm_cb_async_args *ca; + struct ldlm_lock *lock; + ENTRY; + + ca = ptlrpc_req_async_args(req); + lock = ca->ca_lock; + if (lock == NULL) + RETURN(ERR_PTR(-EFAULT)); + + RETURN(lock); +} 
+EXPORT_SYMBOL(ldlm_request_lock); + +static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + switch (lock_type) { + case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) + op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE; + else + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op) + lprocfs_counter_incr(srv_stats, op); + + return; +} + +/** + * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc + * service threads to carry out client lock enqueueing requests. + */ +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, + struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs) +{ + struct ldlm_reply *dlm_rep; + __u64 flags; + enum ldlm_error err = ELDLM_OK; + struct ldlm_lock *lock = NULL; + void *cookie = NULL; + int rc = 0; + struct ldlm_resource *res = NULL; + ENTRY; + + LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); + + ldlm_request_cancel(req, dlm_req, LDLM_ENQUEUE_CANCEL_OFF, LATF_SKIP); + flags = ldlm_flags_from_wire(dlm_req->lock_flags); + + LASSERT(req->rq_export); + + if (ptlrpc_req2svc(req)->srv_stats != NULL) + ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC); + + if (unlikely(dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE || + dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE)) { + DEBUG_REQ(D_ERROR, req, "invalid lock request type %d", + dlm_req->lock_desc.l_resource.lr_type); + GOTO(out, rc = -EFAULT); + } + + if (unlikely(dlm_req->lock_desc.l_req_mode <= LCK_MINMODE || + dlm_req->lock_desc.l_req_mode >= LCK_MAXMODE || + dlm_req->lock_desc.l_req_mode & + (dlm_req->lock_desc.l_req_mode-1))) { + DEBUG_REQ(D_ERROR, req, "invalid lock request mode %d", + dlm_req->lock_desc.l_req_mode); + GOTO(out, rc = -EFAULT); + } + + if (exp_connect_flags(req->rq_export) & OBD_CONNECT_IBITS) { + if (unlikely(dlm_req->lock_desc.l_resource.lr_type == + LDLM_PLAIN)) { + DEBUG_REQ(D_ERROR, req, + "PLAIN lock request from IBITS client?"); + GOTO(out, rc = -EPROTO); + } + } else if (unlikely(dlm_req->lock_desc.l_resource.lr_type == + LDLM_IBITS)) { + DEBUG_REQ(D_ERROR, req, + "IBITS lock request from unaware client?"); + GOTO(out, rc = -EPROTO); + } + + if (unlikely((flags & LDLM_FL_REPLAY) || + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) { + /* Find an existing lock in the per-export lock hash */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + lock = cfs_hash_lookup(req->rq_export->exp_lock_hash, + (void *)&dlm_req->lock_handle[0]); + if (lock != NULL) { + DEBUG_REQ(D_DLMTRACE, req, "found existing lock cookie %#llx", + lock->l_handle.h_cookie); + flags |= LDLM_FL_RESENT; + GOTO(existing_lock, rc = 0); + } + } else { + if (ldlm_reclaim_full()) { + DEBUG_REQ(D_DLMTRACE, req, "Too many granted locks, " + "reject current enqueue request and let the " + "client retry later.\n"); + GOTO(out, rc = 
-EINPROGRESS); + } + } + + /* The lock's callback data might be set in the policy function */ + lock = ldlm_lock_create(ns, &dlm_req->lock_desc.l_resource.lr_name, + dlm_req->lock_desc.l_resource.lr_type, + dlm_req->lock_desc.l_req_mode, + cbs, NULL, 0, LVB_T_NONE); + if (IS_ERR(lock)) { + rc = PTR_ERR(lock); + lock = NULL; + GOTO(out, rc); + } + + lock->l_remote_handle = dlm_req->lock_handle[0]; + LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); + + /* Initialize resource lvb but not for a lock being replayed since + * Client already got lvb sent in this case. + * This must occur early since some policy methods assume resource + * lvb is available (lr_lvb_data != NULL). + */ + res = lock->l_resource; + if (!(flags & LDLM_FL_REPLAY)) { + /* non-replayed lock, delayed lvb init may need to be done */ + rc = ldlm_lvbo_init(res); + if (rc < 0) { + LDLM_DEBUG(lock, "delayed lvb init failed (rc %d)", rc); + GOTO(out, rc); + } + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2); + /* Don't enqueue a lock onto the export if it is been disonnected + * due to eviction (bug 3822) or server umount (bug 24324). + * Cancel it now instead. */ + if (req->rq_export->exp_disconnected) { + LDLM_ERROR(lock, "lock on disconnected export %p", + req->rq_export); + GOTO(out, rc = -ENOTCONN); + } + + lock->l_export = class_export_lock_get(req->rq_export, lock); + if (lock->l_export->exp_lock_hash) + cfs_hash_add(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, + &lock->l_exp_hash); + + /* Inherit the enqueue flags before the operation, because we do not + * keep the res lock on return and next operations (BL AST) may proceed + * without them. */ + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_INHERIT_MASK); + + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) + lock->l_req_extent = lock->l_policy_data.l_extent; + +existing_lock: + + if (flags & LDLM_FL_HAS_INTENT) { + /* In this case, the reply buffer is allocated deep in + * local_lock_enqueue by the policy function. */ + cookie = req; + } else { + /* based on the assumption that lvb size never changes during + * resource life time otherwise it need resource->lr_lock's + * protection */ + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER, ldlm_lvbo_size(lock)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + } + + err = ldlm_lock_enqueue(ns, &lock, cookie, &flags); + if (err) { + if ((int)err < 0) + rc = (int)err; + GOTO(out, err); + } + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + + ldlm_lock2desc(lock, &dlm_rep->lock_desc); + ldlm_lock2handle(lock, &dlm_rep->lock_handle); + + if (lock && lock->l_resource->lr_type == LDLM_EXTENT) + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6); + + /* We never send a blocking AST until the lock is granted, but + * we can tell it right now */ + lock_res_and_lock(lock); + + /* Now take into account flags to be inherited from original lock + request both in reply to client and in our own lock flags. */ + dlm_rep->lock_flags = ldlm_flags_to_wire(flags); + lock->l_flags |= flags & LDLM_FL_INHERIT_MASK; + + /* Don't move a pending lock onto the export if it has already been + * disconnected due to eviction (bug 5683) or server umount (bug 24324). 
+ * Cancel it now instead. */ + if (unlikely(req->rq_export->exp_disconnected || + OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT))) { + LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); + rc = -ENOTCONN; + } else if (ldlm_is_ast_sent(lock)) { + dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); + if (lock->l_granted_mode == lock->l_req_mode) { + /* + * Only cancel lock if it was granted, because it would + * be destroyed immediately and would never be granted + * in the future, causing timeouts on client. Not + * granted lock will be cancelled immediately after + * sending completion AST. + */ + if (dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK) { + unlock_res_and_lock(lock); + ldlm_lock_cancel(lock); + lock_res_and_lock(lock); + } else + ldlm_add_waiting_lock(lock); + } + } + /* Make sure we never ever grant usual metadata locks to liblustre + clients */ + if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN || + dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) && + req->rq_export->exp_libclient) { + if (unlikely(!ldlm_is_cancel_on_block(lock) || + !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){ + CERROR("Granting sync lock to libclient. " + "req fl %d, rep fl %d, lock fl %#llx\n", + dlm_req->lock_flags, dlm_rep->lock_flags, + lock->l_flags); + LDLM_ERROR(lock, "sync lock"); + if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) { + struct ldlm_intent *it; + + it = req_capsule_client_get(&req->rq_pill, + &RMF_LDLM_INTENT); + if (it != NULL) { + CERROR("This is intent %s (%llu)\n", + ldlm_it2str(it->opc), it->opc); + } + } + } + } + + unlock_res_and_lock(lock); + + EXIT; + out: + req->rq_status = rc ?: err; /* return either error - bug 11190 */ + if (!req->rq_packed_final) { + err = lustre_pack_reply(req, 1, NULL, NULL); + if (rc == 0) + rc = err; + } + + /* The LOCK_CHANGED code in ldlm_lock_enqueue depends on this + * ldlm_reprocess_all. If this moves, revisit that code. -phil */ + if (lock != NULL) { + LDLM_DEBUG(lock, "server-side enqueue handler, sending reply" + "(err=%d, rc=%d)", err, rc); + + if (rc == 0) { + if (req_capsule_has_field(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER) && + ldlm_lvbo_size(lock) > 0) { + void *buf; + int buflen; + + buf = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_LVB); + LASSERTF(buf != NULL, "req %p, lock %p\n", + req, lock); + buflen = req_capsule_get_size(&req->rq_pill, + &RMF_DLM_LVB, RCL_SERVER); + /* non-replayed lock, delayed lvb init may + * need to be occur now */ + if ((buflen > 0) && !(flags & LDLM_FL_REPLAY)) { + buflen = ldlm_lvbo_fill(lock, buf, + buflen); + if (buflen >= 0) + req_capsule_shrink( + &req->rq_pill, + &RMF_DLM_LVB, + buflen, RCL_SERVER); + else + rc = buflen; + } else if (flags & LDLM_FL_REPLAY) { + /* no LVB resend upon replay */ + if (buflen > 0) + req_capsule_shrink( + &req->rq_pill, + &RMF_DLM_LVB, + 0, RCL_SERVER); + else + rc = buflen; + } else { + rc = buflen; + } + } + } + + if (rc != 0 && !(flags & LDLM_FL_RESENT)) { + if (lock->l_export) { + ldlm_lock_cancel(lock); + } else { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + + } + } + + if (!err && dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource); + + LDLM_LOCK_RELEASE(lock); + } + + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); + + return rc; +} + +/** + * Old-style LDLM main entry point for server code enqueue. 
+ */ +int ldlm_handle_enqueue(struct ptlrpc_request *req, + ldlm_completion_callback completion_callback, + ldlm_blocking_callback blocking_callback, + ldlm_glimpse_callback glimpse_callback) +{ + struct ldlm_request *dlm_req; + struct ldlm_callback_suite cbs = { + .lcs_completion = completion_callback, + .lcs_blocking = blocking_callback, + .lcs_glimpse = glimpse_callback + }; + int rc; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + rc = ldlm_handle_enqueue0(req->rq_export->exp_obd->obd_namespace, + req, dlm_req, &cbs); + } else { + rc = -EFAULT; + } + return rc; +} + +/** + * Main LDLM entry point for server code to process lock conversion requests. + */ +int ldlm_handle_convert0(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req) +{ + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + int rc; + ENTRY; + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + req->rq_status = LUSTRE_EINVAL; + } else { + void *res = NULL; + + LDLM_DEBUG(lock, "server-side convert handler START"); + + res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, + &dlm_rep->lock_flags); + if (res) { + if (ldlm_del_waiting_lock(lock)) + LDLM_DEBUG(lock, "converted waiting lock"); + req->rq_status = 0; + } else { + req->rq_status = LUSTRE_EDEADLK; + } + } + + if (lock) { + if (!req->rq_status) + ldlm_reprocess_all(lock->l_resource); + LDLM_DEBUG(lock, "server-side convert handler END"); + LDLM_LOCK_PUT(lock); + } else + LDLM_DEBUG_NOLOCK("server-side convert handler END"); + + RETURN(0); +} + +/** + * Old-style main LDLM entry point for server code to process lock conversion + * requests. + */ +int ldlm_handle_convert(struct ptlrpc_request *req) +{ + int rc; + struct ldlm_request *dlm_req; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + rc = ldlm_handle_convert0(req, dlm_req); + } else { + CERROR ("Can't unpack dlm_req\n"); + rc = -EFAULT; + } + return rc; +} + +/** + * Cancel all the locks whose handles are packed into ldlm_request + * + * Called by server code expecting such combined cancel activity + * requests. + */ +int ldlm_request_cancel(struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + int first, enum lustre_at_flags flags) +{ + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + ENTRY; + + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); + + if (count == 1 && dlm_req->lock_handle[0].cookie == 0) + RETURN(0); + + /* There is no lock on the server at the replay time, + * skip lock cancelling to make replay tests to pass. 
*/ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + LDLM_DEBUG_NOLOCK("server-side cancel handler START: %d locks, " + "starting at %d", count, first); + + for (i = first; i < count; i++) { + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server-side cancel handler stale " + "lock (cookie %llu)", + dlm_req->lock_handle[i].cookie); + continue; + } + + res = lock->l_resource; + done++; + + /* This code is an optimization to only attempt lock + * granting on the resource (that could be CPU-expensive) + * after we are done cancelling lock in that resource. */ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + ldlm_res_lvbo_update(res, NULL, 1); + } + pres = res; + } + + if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock)) { + time64_t delay = ktime_get_real_seconds() - + lock->l_last_activity; + LDLM_DEBUG(lock, "server cancels blocked lock after %llds", + (s64)delay); + at_measured(&lock->l_export->exp_bl_lock_at, delay); + } + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); +} +EXPORT_SYMBOL(ldlm_request_cancel); + +/** + * Main LDLM entry point for server code to cancel locks. + * + * Typically gets called from service handler on LDLM_CANCEL opc. + */ +int ldlm_handle_cancel(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc; + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + RETURN(-EFAULT); + } + + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); + + if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) + req->rq_status = LUSTRE_ESTALE; + + RETURN(ptlrpc_reply(req)); +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. + */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + ENTRY; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + + if (ldlm_is_cancel_on_block(lock)) + ldlm_set_cancel(lock); + + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n", + lock, lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +/** + * Callback handler for receiving incoming completion ASTs. + * + * This only can happen on client side. 
+ */ +static void ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + struct list_head ast_list; + int lvb_len; + int rc = 0; + ENTRY; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + INIT_LIST_HEAD(&ast_list); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + int to = cfs_time_seconds(1); + while (to > 0) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(to); + if (lock->l_granted_mode == lock->l_req_mode || + ldlm_is_destroyed(lock)) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + GOTO(out, rc = lvb_len); + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. */ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than " + "expectation, expected = %d, " + "replied = %d", + lock->l_lvb_len, lvb_len); + GOTO(out, rc = -EINVAL); + } + } + } + + lock_res_and_lock(lock); + if (ldlm_is_destroyed(lock) || + lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + GOTO(out, rc = 0); + } + + /* If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) != 0) { + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + CERROR("change resource!\n"); + lock_res_and_lock(lock); + } + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. 
*/ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + GOTO(out, rc); + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + GOTO(out, rc); + +out: + if (rc < 0) { + lock_res_and_lock(lock); + ldlm_set_failed(lock); + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. + */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int rc = -ENOSYS; + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + ktime_after(ktime_get(), + ktime_add(lock->l_last_used, + ktime_set(10, 0)))) { + unlock_res_and_lock(lock); + if (ldlm_bl_to_thread_lock(ns, NULL, lock)) + ldlm_handle_bl_callback(ns, NULL, lock); + + EXIT; + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + ENTRY; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + ldlm_is_discard_data(blwi->blwi_lock)) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* can not check blwi->blwi_flags as blwi could be already freed in + LCF_ASYNC mode */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + RETURN(0); +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (memory_pressure_get()) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_add(&blwi->blwi_head, 
cancels); + list_del_init(cancels); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. + */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + ENTRY; + + if (cancels && count == 0) + RETURN(0); + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + RETURN(-ENOMEM); + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + RETURN(__ldlm_bl_to_thread(blwi, cancel_flags)); + } else { + /* if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags)); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +int ldlm_bl_thread_wakeup(void) +{ + wake_up(&ldlm_state->ldlm_bl_pool->blp_waitq); + return 0; +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + ENTRY; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + RETURN(-EFAULT); + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + RETURN(-EFAULT); + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + const struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s: [nid %s] [rc %d] [lock %#llx]", + msg, libcfs_id2str(req->rq_peer), rc, + handle ? 
handle->cookie : 0); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause bug 21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause bug 21636.\n"); +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + ENTRY; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + RETURN(0); + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) { + if (cfs_fail_err) + ldlm_callback_reply(req, -(int)cfs_fail_err); + RETURN(0); + } + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_CREATE: + req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_open(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_NEXT_BLOCK: + req_capsule_set(&req->rq_pill, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_next_block(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_READ_HEADER: + req_capsule_set(&req->rq_pill, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_read_header(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_CLOSE: + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_close(req); + ldlm_callback_reply(req, rc); + RETURN(0); + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + RETURN(0); + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + RETURN(0); + } + + /* Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. 
*/ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, "callback on lock %#llx - lock " + "disappeared\n", dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + + if (ldlm_is_fail_loc(lock) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_FL_AST_MASK); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. */ + if ((ldlm_is_canceling(lock) && ldlm_is_bl_done(lock)) || + ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, "callback on lock %llx - lock disappeared", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + ldlm_set_bl_ast(lock); + } + unlock_res_and_lock(lock); + + /* We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!ldlm_is_cancel_on_block(lock)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + ldlm_callback_reply(req, 0); + ldlm_handle_cp_callback(req, ns, dlm_req, lock); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + RETURN(0); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Main handler for canceld thread. + * + * Separated into its own thread to avoid deadlocks. + */ +static int ldlm_cancel_handler(struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
*/ + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %lu with bad export cookie " + "%llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + + /* XXX FIXME move this back to mds/handler.c, bug 249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || + CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || + CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) + RETURN(0); + rc = ldlm_handle_cancel(req); + if (rc) + break; + RETURN(0); + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + ldlm_callback_reply(req, -EINVAL); + } + + RETURN(0); +} + +static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; + ENTRY; + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); + + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, + "Prio raised by lock %#llx.", lockh.cookie); + + rc = 1; + break; + } + } + + RETURN(rc); + +} + +static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) +{ + struct ldlm_request *dlm_req; + int rc = 0; + int i; + ENTRY; + + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); + + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; + + rc = ldlm_is_ast_sent(lock) ? 
1 : 0; + if (rc) + LDLM_DEBUG(lock, "hpreq cancel lock"); + LDLM_LOCK_PUT(lock); + + if (rc) + break; + } + + RETURN(rc); +} + +static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_check = ldlm_cancel_hpreq_check, + .hpreq_fini = NULL, +}; + +static int ldlm_hpreq_handler(struct ptlrpc_request *req) +{ + ENTRY; + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) + RETURN(0); + + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); +} + +static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) + +{ + struct list_head *rpc_list = data; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + + lock_res_and_lock(lock); + + if (lock->l_req_mode != lock->l_granted_mode) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_resource); + if (lock->l_resource->lr_type != LDLM_IBITS && + lock->l_resource->lr_type != LDLM_PLAIN) { + unlock_res_and_lock(lock); + return 0; + } + + if (ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + return 0; + } + + LASSERT(lock->l_blocking_ast); + LASSERT(!lock->l_blocking_lock); + + ldlm_set_ast_sent(lock); + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + list_add_tail(&lock->l_rk_ast, rpc_list); + LDLM_LOCK_GET(lock); + + unlock_res_and_lock(lock); + return 0; +} + +void ldlm_revoke_export_locks(struct obd_export *exp) +{ + struct list_head rpc_list; + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + cfs_hash_for_each_nolock(exp->exp_lock_hash, + ldlm_revoke_lock_cb, &rpc_list, 0); + ldlm_run_ast_work(exp->exp_obd->obd_namespace, &rpc_list, + LDLM_WORK_REVOKE_AST); + + EXIT; +} +EXPORT_SYMBOL(ldlm_revoke_export_locks); +#endif /* HAVE_SERVER_SUPPORT */ + +static int ldlm_bl_get_work(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item **p_blwi, + struct obd_export **p_exp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl = 0; + static unsigned int num_stale; + int num_th = atomic_read(&blp->blp_num_threads); + + *p_exp = obd_stale_export_get(); + + spin_lock(&blp->blp_lock); + if (*p_exp != NULL) { + if (num_th == 1 || ++num_stale < num_th) { + spin_unlock(&blp->blp_lock); + return 1; + } else { + num_stale = 0; + } + } + + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= num_th) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + *p_blwi = blwi; + + if (*p_exp != NULL && *p_blwi != NULL) { + obd_stale_export_put(*p_exp); + *p_exp = NULL; + } + + return (*p_blwi != NULL || *p_exp != NULL) ? 
1 : 0; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp, bool check_busy) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + struct task_struct *task; + + init_completion(&bltd.bltd_comp); + + bltd.bltd_num = atomic_inc_return(&blp->blp_num_threads); + if (bltd.bltd_num >= blp->blp_max_threads) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + LASSERTF(bltd.bltd_num > 0, "thread num:%d\n", bltd.bltd_num); + if (check_busy && + atomic_read(&blp->blp_busy_threads) < (bltd.bltd_num - 1)) { + atomic_dec(&blp->blp_num_threads); + return 0; + } + + task = kthread_run(ldlm_bl_thread_main, &bltd, "ldlm_bl_%02d", + bltd.bltd_num); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + bltd.bltd_num, PTR_ERR(task)); + atomic_dec(&blp->blp_num_threads); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/* Not fatal if racy and have a few too many threads */ +static int ldlm_bl_thread_need_create(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + if (atomic_read(&blp->blp_num_threads) >= blp->blp_max_threads) + return 0; + + if (atomic_read(&blp->blp_busy_threads) < + atomic_read(&blp->blp_num_threads)) + return 0; + + if (blwi != NULL && (blwi->blwi_ns == NULL || + blwi->blwi_mem_pressure)) + return 0; + + return 1; +} + +static int ldlm_bl_thread_blwi(struct ldlm_bl_pool *blp, + struct ldlm_bl_work_item *blwi) +{ + ENTRY; + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + RETURN(LDLM_ITER_STOP); + + if (blwi->blwi_mem_pressure) + memory_pressure_set(); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL2, 4); + + if (blwi->blwi_count) { + int count; + /* The special case when we cancel locks in lru + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } + if (blwi->blwi_mem_pressure) + memory_pressure_clr(); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + + RETURN(0); +} + +/** + * Cancel stale locks on export. Cancel blocked locks first. + * If the given export has blocked locks, the next in the list may have + * them too, thus cancel not blocked locks only if the current export has + * no blocked locks. + **/ +static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, + struct obd_export *exp) +{ + int num; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 4); + + num = ldlm_export_cancel_blocked_locks(exp); + if (num == 0) + ldlm_export_cancel_locks(exp); + + obd_stale_export_put(exp); + + RETURN(0); +} + + +/** + * Main blocking requests processing thread. + * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. 
+ */ +static int ldlm_bl_thread_main(void *arg) +{ + struct ldlm_bl_pool *blp; + struct ldlm_bl_thread_data *bltd = arg; + ENTRY; + + blp = bltd->bltd_blp; + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ldlm_bl_work_item *blwi = NULL; + struct obd_export *exp = NULL; + int rc; + + rc = ldlm_bl_get_work(blp, &blwi, &exp); + + if (rc == 0) + l_wait_event_exclusive(blp->blp_waitq, + ldlm_bl_get_work(blp, &blwi, + &exp), + &lwi); + atomic_inc(&blp->blp_busy_threads); + + if (ldlm_bl_thread_need_create(blp, blwi)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp, true); + + if (exp) + rc = ldlm_bl_thread_exports(blp, exp); + else if (blwi) + rc = ldlm_bl_thread_blwi(blp, blwi); + + atomic_dec(&blp->blp_busy_threads); + + if (rc == LDLM_ITER_STOP) + break; + + /* If there are many namespaces, we will not sleep waiting for + * work, and must do a cond_resched to avoid holding the CPU + * for too long */ + cond_resched(); + } + + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + RETURN(0); +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + RETURN(rc); +} + +void ldlm_put_ref(void) +{ + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + EXIT; +} + +/* + * Export handle<->lock hash operations. + */ +static unsigned +ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static struct cfs_hash_ops ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + int rc; + ENTRY; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + 
HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + RETURN(-ENOMEM); + + rc = ldlm_init_flock_export(exp); + if (rc) + GOTO(err, rc); + + RETURN(0); +err: + ldlm_destroy_export(exp); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + ENTRY; + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); + EXIT; +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static ssize_t cancel_unused_locks_before_replay_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", ldlm_cancel_unused_locks_before_replay); +} + +static ssize_t cancel_unused_locks_before_replay_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + int rc; + unsigned long val; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + ldlm_cancel_unused_locks_before_replay = val; + + return count; +} +LUSTRE_RW_ATTR(cancel_unused_locks_before_replay); + +static struct attribute *ldlm_attrs[] = { + &lustre_attr_cancel_unused_locks_before_replay.attr, + NULL, +}; + +static struct attribute_group ldlm_attr_group = { + .attrs = ldlm_attrs, +}; + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; +#ifdef HAVE_SERVER_SUPPORT + struct task_struct *task; +#endif /* HAVE_SERVER_SUPPORT */ + int i; + int rc = 0; + + ENTRY; + + if (ldlm_state != NULL) + RETURN(-EALREADY); + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + RETURN(-ENOMEM); + + ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj); + if (!ldlm_kobj) + GOTO(out, -ENOMEM); + + rc = sysfs_create_group(ldlm_kobj, &ldlm_attr_group); + if (rc) + GOTO(out, rc); + + ldlm_ns_kset = kset_create_and_add("namespaces", NULL, ldlm_kobj); + if (!ldlm_ns_kset) + GOTO(out, -ENOMEM); + + ldlm_svc_kset = kset_create_and_add("services", NULL, ldlm_kobj); + if (!ldlm_svc_kset) + GOTO(out, -ENOMEM); + +#ifdef CONFIG_PROC_FS + rc = ldlm_proc_setup(); + if (rc != 0) + GOTO(out, rc); +#endif /* CONFIG_PROC_FS */ + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_affinity = 1, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = \ + ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_canceld", + .psc_watchdog_factor = 6, + .psc_buf = { + .bc_nbufs = LDLM_SERVER_NBUFS, + .bc_buf_size 
= LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CANCEL_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CANCEL_REPLY_PORTAL, + + }, + .psc_thr = { + .tc_thr_name = "ldlm_cn", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_affinity = 1, + .tc_ctx_tags = LCT_MD_THREAD | \ + LCT_DT_THREAD | \ + LCT_CL_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + }, + .psc_ops = { + .so_req_handler = ldlm_cancel_handler, + .so_hpreq_handler = ldlm_hpreq_handler, + }, + }; + ldlm_state->ldlm_cancel_service = \ + ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + if (IS_ERR(ldlm_state->ldlm_cancel_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cancel_service); + ldlm_state->ldlm_cancel_service = NULL; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) + GOTO(out, rc = -ENOMEM); + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = \ + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp, false); + if (rc < 0) + GOTO(out, rc); + } + +#ifdef HAVE_SERVER_SUPPORT + task = kthread_run(expired_lock_main, NULL, "ldlm_elt"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); + GOTO(out, rc); + } + + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_READY); +#endif /* HAVE_SERVER_SUPPORT */ + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + GOTO(out, rc); + } + + rc = ldlm_reclaim_setup(); + if (rc) { + CERROR("Failed to setup reclaim thread: rc = %d\n", rc); + GOTO(out, rc); + } + RETURN(0); + + out: + ldlm_cleanup(); + RETURN(rc); +} + +static int ldlm_cleanup(void) +{ + ENTRY; + + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(-EBUSY); + } + + ldlm_reclaim_cleanup(); + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); +#ifdef HAVE_SERVER_SUPPORT + if (ldlm_state->ldlm_cancel_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cancel_service); +#endif + + if (ldlm_ns_kset) + kset_unregister(ldlm_ns_kset); 
+ if (ldlm_svc_kset) + kset_unregister(ldlm_svc_kset); + if (ldlm_kobj) + kobject_put(ldlm_kobj); + + ldlm_proc_cleanup(); + +#ifdef HAVE_SERVER_SUPPORT + if (expired_lock_thread_state != ELT_STOPPED) { + expired_lock_thread_state = ELT_TERMINATE; + wake_up(&expired_lock_wait_queue); + wait_event(expired_lock_wait_queue, + expired_lock_thread_state == ELT_STOPPED); + } +#endif + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + RETURN(0); +} + +int ldlm_init(void) +{ + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + if (ldlm_lock_slab == NULL) + goto out_resource; + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) + goto out_lock; + + ldlm_interval_tree_slab = kmem_cache_create("interval_tree", + sizeof(struct ldlm_interval_tree) * LCK_MODE_NUM, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_tree_slab == NULL) + goto out_interval; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; + +out_interval: + kmem_cache_destroy(ldlm_interval_slab); +out_lock: + kmem_cache_destroy(ldlm_lock_slab); +out_resource: + kmem_cache_destroy(ldlm_resource_slab); + + return -ENOMEM; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + kmem_cache_destroy(ldlm_resource_slab); + /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * synchronize_rcu() to wait a grace period elapsed, so that + * ldlm_lock_free() get a chance to be called. */ + synchronize_rcu(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); + kmem_cache_destroy(ldlm_interval_tree_slab); +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c new file mode 100644 index 0000000000000..6453cabf1921f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. 
+ * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + +#ifdef HAVE_SERVER_SUPPORT +/** + * Determine if the lock is compatible with all locks on the queue. + * + * If \a work_list is provided, conflicting locks are linked there. + * If \a work_list is not provided, we exit this function on first conflict. + * + * \retval 0 if there are conflicting locks in the \a queue + * \retval 1 if the lock is compatible to all locks in \a queue + */ +static inline int +ldlm_plain_compat_queue(struct list_head *queue, struct ldlm_lock *req, + struct list_head *work_list) +{ + enum ldlm_mode req_mode = req->l_req_mode; + struct ldlm_lock *lock, *next_lock; + int compat = 1; + ENTRY; + + lockmode_verify(req_mode); + + list_for_each_entry_safe(lock, next_lock, queue, l_res_link) { + + /* We stop walking the queue if we hit ourselves so we don't + * take conflicting locks enqueued after us into account, + * or we'd wait forever. */ + if (req == lock) + RETURN(compat); + + /* Advance loop cursor to last lock of mode group. */ + next_lock = list_entry(list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link.next, + struct ldlm_lock, l_res_link); + + if (lockmode_compat(lock->l_req_mode, req_mode)) + continue; + + if (!work_list) + RETURN(0); + + compat = 0; + + /* Add locks of the mode group to \a work_list as + * blocking locks for \a req. */ + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, work_list); + + { + struct list_head *head; + + head = &lock->l_sl_mode; + list_for_each_entry(lock, head, l_sl_mode) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + } + + RETURN(compat); +} + +/** + * Process a granting attempt for plain lock. + * Must be called with ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. 
+ */ +int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, + enum ldlm_process_intention intention, + enum ldlm_error *err, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head rpc_list; + int rc; + ENTRY; + + LASSERT(lock->l_granted_mode != lock->l_req_mode); + check_res_locked(res); + LASSERT(list_empty(&res->lr_converting)); + INIT_LIST_HEAD(&rpc_list); + + if (intention == LDLM_PROCESS_RESCAN) { + LASSERT(work_list != NULL); + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_plain_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, work_list); + RETURN(LDLM_ITER_CONTINUE); + } + + LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || + (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); + restart: + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, &rpc_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, &rpc_list); + + if (rc != 2) { + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); + if (rc == -ERESTART) + GOTO(restart, rc); + *err = rc; + } else { + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, work_list); + rc = 0; + } + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, + union ldlm_policy_data *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, + union ldlm_wire_policy_data *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c new file mode 100644 index 0000000000000..2afed77ea5f70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1621 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. 
It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much sensitive + * client should be about last SLV from server. The higher LVF is the more locks + * will be canceled on client. Default value for it is 1. Setting LVF to 2 means + * that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas + * Dilger (adilger@clusterfs.com) proposed few nice ideas like using LVF and many + * cleanups. Flow definition to allow more easy understanding of the logic belongs + * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes. + * And design and implementation are done by Yury Umanets (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via sysfs. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via sysfs tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +#ifdef HAVE_LRU_RESIZE_SUPPORT + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. 
+ */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +extern struct proc_dir_entry *ldlm_ns_proc_dir; + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +static inline int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. 
+ */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = ldlm_pool_granted(pl); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT); + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) { + slv = ldlm_pool_slv_max(limit); + } else if (slv < ldlm_pool_slv_min(limit)) { + slv = ldlm_pool_slv_min(limit); + } + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = ldlm_pool_granted(pl); + int grant_rate = atomic_read(&pl->pl_grant_rate); + int cancel_rate = atomic_read(&pl->pl_cancel_rate); + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) +{ + time64_t recalc_interval_sec; + ENTRY; + + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. 
+ */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = ktime_get_real_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(0); +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return ldlm_pool_granted(pl); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (ldlm_pool_granted(pl) == 0) + RETURN(0); + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. + */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) +{ + time64_t recalc_interval_sec; + int ret; + ENTRY; + + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. 
+ */ + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) + GOTO(out, ret = 0); + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, + LDLM_LRU_FLAG_LRUR); + +out: + spin_lock(&pl->pl_lock); + /* + * Time of LRU resizing might be longer than period, + * so update after LRU resizing rather than before it. + */ + pl->pl_recalc_time = ktime_get_real_seconds(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(ret); +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + struct ldlm_namespace *ns; + int unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + RETURN(0); + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr == 0) + return (unused / 100) * sysctl_vfs_cache_pressure; + else + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_LRU_FLAG_SHRINK); +} + +static struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +static struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_recalc(struct ldlm_pool *pl) +{ + time64_t recalc_interval_sec; + int count; + + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + if (recalc_interval_sec > 0) { + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; + + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every 1s. + */ + ldlm_pool_recalc_stats(pl); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + } + + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + } + + recalc_interval_sec = pl->pl_recalc_time - ktime_get_real_seconds() + + pl->pl_recalc_period; + if (recalc_interval_sec <= 0) { + /* DEBUG: should be re-removed after LU-4536 is fixed */ + CDEBUG(D_DLMTRACE, "%s: Negative interval(%lld), too short period(%lld)\n", + pl->pl_name, recalc_interval_sec, + (s64)pl->pl_recalc_period); + + /* Prevent too frequent recalculation. 
*/ + recalc_interval_sec = 1; + } + + return recalc_interval_sec; +} + +/** + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, " + "shrunk %d\n", pl->pl_name, nr, cancel); + } + } + return cancel; +} + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. + */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return(pl->pl_ops->po_setup(pl, limit)); + return 0; +} + +static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) +{ + int granted, grant_rate, cancel_rate, grant_step; + int grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = m->private; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = ldlm_pool_granted(pl); + grant_rate = atomic_read(&pl->pl_grant_rate); + cancel_rate = atomic_read(&pl->pl_cancel_rate); + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + seq_printf(m, "LDLM pool state (%s):\n" + " SLV: %llu\n" + " CLV: %llu\n" + " LVF: %d\n", + pl->pl_name, slv, clv, lvf); + + if (ns_is_server(ldlm_pl2ns(pl))) { + seq_printf(m, " GSP: %d%%\n", grant_step); + seq_printf(m, " GP: %d\n", grant_plan); + } + + seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n G: %d\n L: %d\n", + grant_rate, cancel_rate, grant_speed, + granted, limit); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_pool_state); + +static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + int grant_speed; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + grant_speed = atomic_read(&pl->pl_grant_rate) - + atomic_read(&pl->pl_cancel_rate); + spin_unlock(&pl->pl_lock); + return sprintf(buf, "%d\n", grant_speed); +} +LUSTRE_RO_ATTR(grant_speed); + +LDLM_POOL_SYSFS_READER_SHOW(grant_plan, int); +LUSTRE_RO_ATTR(grant_plan); + +LDLM_POOL_SYSFS_READER_SHOW(recalc_period, int); +LDLM_POOL_SYSFS_WRITER_STORE(recalc_period, int); +LUSTRE_RW_ATTR(recalc_period); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(server_lock_volume, u64); +LUSTRE_RO_ATTR(server_lock_volume); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(limit, atomic); +LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(limit, atomic); +LUSTRE_RW_ATTR(limit); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(granted, atomic); +LUSTRE_RO_ATTR(granted); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(cancel_rate, atomic); +LUSTRE_RO_ATTR(cancel_rate); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(grant_rate, atomic); +LUSTRE_RO_ATTR(grant_rate); + +LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(lock_volume_factor, atomic); +LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(lock_volume_factor, atomic); +LUSTRE_RW_ATTR(lock_volume_factor); + +/* These are for pools in /sys/fs/lustre/ldlm/namespaces/.../pool */ 
+static struct attribute *ldlm_pl_attrs[] = { + &lustre_attr_grant_speed.attr, + &lustre_attr_grant_plan.attr, + &lustre_attr_recalc_period.attr, + &lustre_attr_server_lock_volume.attr, + &lustre_attr_limit.attr, + &lustre_attr_granted.attr, + &lustre_attr_cancel_rate.attr, + &lustre_attr_grant_rate.attr, + &lustre_attr_lock_volume_factor.attr, + NULL, +}; + +static void ldlm_pl_release(struct kobject *kobj) +{ + struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, + pl_kobj); + complete(&pl->pl_kobj_unregister); +} + +static struct kobj_type ldlm_pl_ktype = { + .default_attrs = ldlm_pl_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_pl_release, +}; + +static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + int err; + + init_completion(&pl->pl_kobj_unregister); + err = kobject_init_and_add(&pl->pl_kobj, &ldlm_pl_ktype, &ns->ns_kobj, + "pool"); + + return err; +} + +static int ldlm_pool_proc_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct proc_dir_entry *parent_ns_proc; + struct lprocfs_vars pool_vars[2]; + char *var_name = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC(var_name, MAX_STRING_SIZE + 1); + if (!var_name) + RETURN(-ENOMEM); + + parent_ns_proc = ns->ns_proc_dir_entry; + if (parent_ns_proc == NULL) { + CERROR("%s: proc entry is not initialized\n", + ldlm_ns_name(ns)); + GOTO(out_free_name, rc = -EINVAL); + } + pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, + NULL, NULL); + if (IS_ERR(pl->pl_proc_dir)) { + rc = PTR_ERR(pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + CERROR("%s: cannot create 'pool' proc entry: rc = %d\n", + ldlm_ns_name(ns), rc); + GOTO(out_free_name, rc); + } + + var_name[MAX_STRING_SIZE] = '\0'; + memset(pool_vars, 0, sizeof(pool_vars)); + pool_vars[0].name = var_name; + + ldlm_add_var(&pool_vars[0], pl->pl_proc_dir, "state", pl, + &lprocfs_pool_state_fops); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) + GOTO(out_free_name, rc = -ENOMEM); + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + 
"recalc_timing", "sec"); + rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); + + EXIT; +out_free_name: + OBD_FREE(var_name, MAX_STRING_SIZE + 1); + return rc; +} + +static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) +{ + kobject_put(&pl->pl_kobj); + wait_for_completion(&pl->pl_kobj_unregister); +} + +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + if (pl->pl_proc_dir != NULL) { + lprocfs_remove(&pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + } +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + int rc; + ENTRY; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + pl->pl_recalc_time = ktime_get_real_seconds(); + atomic_set(&pl->pl_lock_volume_factor, 1); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_proc_init(pl); + if (rc) + RETURN(rc); + + rc = ldlm_pool_sysfs_init(pl); + if (rc) + RETURN(rc); + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + RETURN(rc); +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ENTRY; + ldlm_pool_sysfs_fini(pl); + ldlm_pool_proc_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. + */ + POISON(pl, 0x5a, sizeof(*pl)); + EXIT; +} + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + * + * PLAIN locks are used by config and quota, the quantity is small + * and usually they are not in LRU. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_add(lock); + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK & PLAIN locks. Read above comment in + * ldlm_pool_add(). 
+ */ + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) + return; + + ldlm_reclaim_del(lock); + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} + +/** + * Returns current \a pl limit. + */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} + +static struct ptlrpc_thread *ldlm_pools_thread; +static struct shrinker *ldlm_pools_srv_shrinker; +static struct shrinker *ldlm_pools_cli_shrinker; +static struct completion ldlm_pools_comp; + +/* +* count locks from all namespaces (if possible). Returns number of +* cached locks. +*/ +static unsigned long ldlm_pools_count(enum ldlm_side client, gfp_t gfp_mask) +{ + unsigned long total = 0; + int nr_ns; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; /* loop detection */ + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return 0; + + CDEBUG(D_DLMTRACE, "Request to count %s locks from all pools\n", + client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); + + /* + * Find out how many resources we may release. 
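The SLV/CLV getters and setters above take pl_lock around a single 64-bit field, presumably to keep the value consistent with concurrent updates (and to avoid torn reads on 32-bit hosts). A minimal pthread-based sketch of the same accessor contract, with invented names and types:

#include <pthread.h>
#include <stdint.h>

struct demo_pool_slv {
	pthread_mutex_t lock;             /* stand-in for pl_lock */
	uint64_t server_lock_volume;
};

/* Caller must not already hold the lock, matching the \pre above. */
static uint64_t demo_get_slv(struct demo_pool_slv *pl)
{
	uint64_t slv;

	pthread_mutex_lock(&pl->lock);
	slv = pl->server_lock_volume;
	pthread_mutex_unlock(&pl->lock);
	return slv;
}

static void demo_set_slv(struct demo_pool_slv *pl, uint64_t slv)
{
	pthread_mutex_lock(&pl->lock);
	pl->server_lock_volume = slv;
	pthread_mutex_unlock(&pl->lock);
}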
+ */ + for (nr_ns = ldlm_namespace_nr_read(client); + nr_ns > 0; nr_ns--) { + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + return 0; + } + ns = ldlm_namespace_first_locked(client); + + if (ns == ns_old) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns); + } + + return total; +} + +static unsigned long ldlm_pools_scan(enum ldlm_side client, int nr, + gfp_t gfp_mask) +{ + unsigned long freed = 0; + int tmp, nr_ns; + struct ldlm_namespace *ns; + + if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS)) + return -1; + + /* + * Shrink at least ldlm_namespace_nr_read(client) namespaces. + */ + for (tmp = nr_ns = ldlm_namespace_nr_read(client); + tmp > 0; tmp--) { + int cancel, nr_locks; + + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + nr_locks = ldlm_pool_granted(&ns->ns_pool); + /* + * We use to shrink propotionally but with new shrinker API, + * we lost the total number of freeable locks. + */ + cancel = 1 + min_t(int, nr_locks, nr / nr_ns); + freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); + ldlm_namespace_put(ns); + } + /* + * we only decrease the SLV in server pools shrinker, return + * SHRINK_STOP to kernel to avoid needless loop. LU-1128 + */ + return (client == LDLM_NAMESPACE_SERVER) ? SHRINK_STOP : freed; +} + +#ifdef HAVE_SHRINKER_COUNT +static unsigned long ldlm_pools_srv_count(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask); +} + +static unsigned long ldlm_pools_srv_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan, + sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_count(struct shrinker *s, struct shrink_control *sc) +{ + return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask); +} + +static unsigned long ldlm_pools_cli_scan(struct shrinker *s, + struct shrink_control *sc) +{ + return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan, + sc->gfp_mask); +} + +#else +/* + * Cancel \a nr locks from all namespaces (if possible). Returns number of + * cached locks after shrink is finished. All namespaces are asked to + * cancel approximately equal amount of locks to keep balancing. + */ +static int ldlm_pools_shrink(enum ldlm_side client, int nr, gfp_t gfp_mask) +{ + unsigned long total = 0; + + if (client == LDLM_NAMESPACE_CLIENT && nr != 0 && + !(gfp_mask & __GFP_FS)) + return -1; + + CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n", + nr, client == LDLM_NAMESPACE_CLIENT ? 
"client" : "server"); + + total = ldlm_pools_count(client, gfp_mask); + + if (nr == 0 || total == 0) + return total; + + return ldlm_pools_scan(client, nr, gfp_mask); +} + +static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, + shrink_param(sc, nr_to_scan), + shrink_param(sc, gfp_mask)); +} + +static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, + shrink_param(sc, nr_to_scan), + shrink_param(sc, gfp_mask)); +} + +#endif /* HAVE_SHRINKER_COUNT */ + +int ldlm_pools_recalc(enum ldlm_side client) +{ + unsigned long nr_l = 0, nr_p = 0, l; + struct ldlm_namespace *ns; + struct ldlm_namespace *ns_old = NULL; + int nr, equal = 0; + /* seconds of sleep if no active namespaces */ + int time = client ? LDLM_POOL_CLI_DEF_RECALC_PERIOD : + LDLM_POOL_SRV_DEF_RECALC_PERIOD; + + /* + * No need to setup pool limit for client pools. + */ + if (client == LDLM_NAMESPACE_SERVER) { + /* + * Check all modest namespaces first. + */ + mutex_lock(ldlm_namespace_lock(client)); + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure that modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("\"Modest\" pools eat out 2/3 of server locks " + "limit (%lu of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* + * The rest is given to greedy namespaces. + */ + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(client); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(client) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(client)); + } + + /* + * Recalc at least ldlm_namespace_nr(client) namespaces. + */ + for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock what is really good as we get + * rid of potential deadlock on client nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + + if (ns_old == ns) { /* Full pass complete */ + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + + /* We got an empty namespace, need to move it back to inactive + * list. 
+ * The race with parallel resource creation is fine: + * - If they do namespace_get before our check, we fail the + * check and they move this item to the end of the list anyway + * - If we do the check and then they do namespace_get, then + * we move the namespace to inactive and they will move + * it back to active (synchronised by the lock, so no clash + * there). + */ + if (ldlm_ns_empty(ns)) { + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + continue; + } + + if (ns_old == NULL) + ns_old = ns; + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + /* + * After setup is done - recalc the pool. + */ + if (!skip) { + int ttime = ldlm_pool_recalc(&ns->ns_pool); + + if (ttime < time) + time = ttime; + + ldlm_namespace_put(ns); + } + } + + /* Wake up the blocking threads from time to time. */ + ldlm_bl_thread_wakeup(); + + return time; +} + +static int ldlm_pools_thread_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + int s_time, c_time; + ENTRY; + + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", + "ldlm_poold", current_pid()); + + while (1) { + struct l_wait_info lwi; + + /* + * Recal all pools on this tick. + */ + s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); + c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); + + /* + * Wait until the next check time, or until we're + * stopped. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + else + thread_test_and_clear_flags(thread, SVC_EVENT); + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", + "ldlm_poold", current_pid()); + + complete_and_exit(&ldlm_pools_comp, 0); +} + +static int ldlm_pools_thread_start(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + ENTRY; + + if (ldlm_pools_thread != NULL) + RETURN(-EALREADY); + + OBD_ALLOC_PTR(ldlm_pools_thread); + if (ldlm_pools_thread == NULL) + RETURN(-ENOMEM); + + init_completion(&ldlm_pools_comp); + init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + + task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, + "ldlm_poold"); + if (IS_ERR(task)) { + CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); + OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); + ldlm_pools_thread = NULL; + RETURN(PTR_ERR(task)); + } + l_wait_event(ldlm_pools_thread->t_ctl_waitq, + thread_is_running(ldlm_pools_thread), &lwi); + RETURN(0); +} + +static void ldlm_pools_thread_stop(void) +{ + ENTRY; + + if (ldlm_pools_thread == NULL) { + EXIT; + return; + } + + thread_set_flags(ldlm_pools_thread, SVC_STOPPING); + wake_up(&ldlm_pools_thread->t_ctl_waitq); + + /* + * Make sure that pools thread is finished before freeing @thread. + * This fixes possible race and oops due to accessing freed memory + * in pools thread. 
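The namespace walk used by ldlm_pools_recalc() and ldlm_pools_count() — lock the list, take the first entry, move it to the tail, drop the lock, then do the heavy work outside the lock, stopping once the first entry comes around again — can be sketched standalone as below. The pthread mutex, singly linked list and demo_* names are stand-ins for the LDLM primitives, not the real implementation.

#include <pthread.h>
#include <stddef.h>

struct demo_ns {
	struct demo_ns *next;
	int id;
};

static pthread_mutex_t demo_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct demo_ns *demo_list;           /* head = next namespace to visit */

static void demo_process(struct demo_ns *ns) { (void)ns; /* recalc stand-in */ }

static void demo_walk_all(void)
{
	struct demo_ns *first = NULL;

	for (;;) {
		struct demo_ns *ns, **pp;

		pthread_mutex_lock(&demo_list_lock);
		ns = demo_list;
		if (!ns || ns == first) {       /* empty list or full pass done */
			pthread_mutex_unlock(&demo_list_lock);
			break;
		}
		if (!first)
			first = ns;

		/* unlink the head and append it to the tail */
		demo_list = ns->next;
		ns->next = NULL;
		for (pp = &demo_list; *pp; pp = &(*pp)->next)
			;
		*pp = ns;
		pthread_mutex_unlock(&demo_list_lock);

		demo_process(ns);               /* heavy work outside the lock */
	}
}

The "remember the first namespace and stop when it reappears" check is what prevents the loop from running forever once every active namespace has been visited.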
+ */ + wait_for_completion(&ldlm_pools_comp); + OBD_FREE_PTR(ldlm_pools_thread); + ldlm_pools_thread = NULL; + EXIT; +} + +int ldlm_pools_init(void) +{ + int rc; + DEF_SHRINKER_VAR(shsvar, ldlm_pools_srv_shrink, + ldlm_pools_srv_count, ldlm_pools_srv_scan); + DEF_SHRINKER_VAR(shcvar, ldlm_pools_cli_shrink, + ldlm_pools_cli_count, ldlm_pools_cli_scan); + ENTRY; + + rc = ldlm_pools_thread_start(); + if (rc == 0) { + ldlm_pools_srv_shrinker = + set_shrinker(DEFAULT_SEEKS, &shsvar); + ldlm_pools_cli_shrinker = + set_shrinker(DEFAULT_SEEKS, &shcvar); + } + RETURN(rc); +} + +void ldlm_pools_fini(void) +{ + if (ldlm_pools_srv_shrinker != NULL) { + remove_shrinker(ldlm_pools_srv_shrinker); + ldlm_pools_srv_shrinker = NULL; + } + if (ldlm_pools_cli_shrinker != NULL) { + remove_shrinker(ldlm_pools_cli_shrinker); + ldlm_pools_cli_shrinker = NULL; + } + ldlm_pools_thread_stop(); +} + +#else /* !HAVE_LRU_RESIZE_SUPPORT */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + return 0; +} + +int ldlm_pool_recalc(struct ldlm_pool *pl) +{ + return 0; +} + +int ldlm_pool_shrink(struct ldlm_pool *pl, + int nr, gfp_t gfp_mask) +{ + return 0; +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, enum ldlm_side client) +{ + return 0; +} + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + return; +} + +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + return; +} + +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + return; +} + +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + return 1; +} + +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + return; +} + +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return 0; +} + +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + return; +} + +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return 0; +} + +int ldlm_pools_init(void) +{ + return 0; +} + +void ldlm_pools_fini(void) +{ + return; +} + +int ldlm_pools_recalc(enum ldlm_side client) +{ + return 0; +} +#endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c new file mode 100644 index 0000000000000..cf4c87f9e2312 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_reclaim.c @@ -0,0 +1,411 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Niu Yawei + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include "ldlm_internal.h" + +/* + * To avoid ldlm lock exhausting server memory, two global parameters: + * ldlm_reclaim_threshold & ldlm_lock_limit are used for reclaiming + * granted locks and rejecting incoming enqueue requests defensively. + * + * ldlm_reclaim_threshold: When the amount of granted locks reaching this + * threshold, server start to revoke locks gradually. + * + * ldlm_lock_limit: When the amount of granted locks reaching this + * threshold, server will return -EINPROGRESS to any incoming enqueue + * request until the lock count is shrunk below the threshold again. + * + * ldlm_reclaim_threshold & ldlm_lock_limit is set to 20% & 30% of the + * total memory by default. It is tunable via proc entry, when it's set + * to 0, the feature is disabled. + */ + +#ifdef HAVE_SERVER_SUPPORT + +/* Lock count is stored in ldlm_reclaim_threshold & ldlm_lock_limit */ +__u64 ldlm_reclaim_threshold; +__u64 ldlm_lock_limit; + +/* Represents ldlm_reclaim_threshold & ldlm_lock_limit in MB, used for + * proc interface. */ +__u64 ldlm_reclaim_threshold_mb; +__u64 ldlm_lock_limit_mb; + +struct percpu_counter ldlm_granted_total; +static atomic_t ldlm_nr_reclaimer; +static s64 ldlm_last_reclaim_age_ns; +static ktime_t ldlm_last_reclaim_time; + +struct ldlm_reclaim_cb_data { + struct list_head rcd_rpc_list; + int rcd_added; + int rcd_total; + int rcd_cursor; + int rcd_start; + bool rcd_skip; + s64 rcd_age_ns; + struct cfs_hash_bd *rcd_prev_bd; +}; + +static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + /* FLOCK & PLAIN lock are not reclaimable. FLOCK is + * explicitly controlled by application, PLAIN lock + * is used by quota global lock and config lock. + */ + if (ns->ns_client == LDLM_NAMESPACE_SERVER && + (lock->l_resource->lr_type == LDLM_IBITS || + lock->l_resource->lr_type == LDLM_EXTENT)) + return true; + return false; +} + +/** + * Callback function for revoking locks from certain resource. 
+ * + * \param [in] hs ns_rs_hash + * \param [in] bd current bucket of ns_rsh_hash + * \param [in] hnode hnode of the resource + * \param [in] arg opaque data + * + * \retval 0 continue the scan + * \retval 1 stop the iteration + */ +static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res; + struct ldlm_reclaim_cb_data *data; + struct ldlm_lock *lock; + struct ldlm_ns_bucket *nsb; + int rc = 0; + + data = (struct ldlm_reclaim_cb_data *)arg; + + LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n", + data->rcd_added, data->rcd_total); + + nsb = cfs_hash_bd_extra_get(hs, bd); + res = cfs_hash_object(hs, hnode); + + if (data->rcd_prev_bd != bd) { + if (data->rcd_prev_bd != NULL) + ldlm_res_to_ns(res)->ns_reclaim_start++; + data->rcd_prev_bd = bd; + data->rcd_cursor = 0; + data->rcd_start = nsb->nsb_reclaim_start % + cfs_hash_bd_count_get(bd); + } + + if (data->rcd_skip && data->rcd_cursor < data->rcd_start) { + data->rcd_cursor++; + return 0; + } + + nsb->nsb_reclaim_start++; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (!ldlm_lock_reclaimable(lock)) + continue; + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) && + ktime_before(ktime_get(), + ktime_add_ns(lock->l_last_used, + data->rcd_age_ns))) + continue; + + if (!ldlm_is_ast_sent(lock)) { + ldlm_set_ast_sent(lock); + LASSERT(list_empty(&lock->l_rk_ast)); + list_add(&lock->l_rk_ast, &data->rcd_rpc_list); + LDLM_LOCK_GET(lock); + if (++data->rcd_added == data->rcd_total) { + rc = 1; /* stop the iteration */ + break; + } + } + } + unlock_res(res); + + return rc; +} + +/** + * Revoke locks from the resources of a namespace in a roundrobin + * manner. + * + * \param[in] ns namespace to do the lock revoke on + * \param[in] count count of lock to be revoked + * \param[in] age only revoke locks older than the 'age' + * \param[in] skip scan from the first lock on resource if the + * 'skip' is false, otherwise, continue scan + * from the last scanned position + * \param[out] count count of lock still to be revoked + */ +static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count, + s64 age_ns, bool skip) +{ + struct ldlm_reclaim_cb_data data; + int idx, type, start; + ENTRY; + + LASSERT(*count != 0); + + if (ns->ns_obd) { + type = server_name2index(ns->ns_obd->obd_name, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + EXIT; + return; + } + } + + if (atomic_read(&ns->ns_bref) == 0) { + EXIT; + return; + } + + INIT_LIST_HEAD(&data.rcd_rpc_list); + data.rcd_added = 0; + data.rcd_total = *count; + data.rcd_age_ns = age_ns; + data.rcd_skip = skip; + data.rcd_prev_bd = NULL; + start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data, + start); + + CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d " + "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added, + data.rcd_total); + + LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count, + data.rcd_added); + + ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST); + *count -= data.rcd_added; + EXIT; +} + +#define LDLM_RECLAIM_BATCH 512 +#define LDLM_RECLAIM_AGE_MIN (300 * NSEC_PER_SEC) +#define LDLM_RECLAIM_AGE_MAX (LDLM_DEFAULT_MAX_ALIVE * NSEC_PER_SEC * 3 / 4) + +static inline s64 ldlm_reclaim_age(void) +{ + s64 age_ns = ldlm_last_reclaim_age_ns; + ktime_t now = ktime_get(); + ktime_t diff; + + diff = 
ktime_sub(now, ldlm_last_reclaim_time); + age_ns += ktime_to_ns(diff); + if (age_ns > LDLM_RECLAIM_AGE_MAX) + age_ns = LDLM_RECLAIM_AGE_MAX; + else if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + return age_ns; +} + +/** + * Revoke certain amount of locks from all the server namespaces + * in a roundrobin manner. Lock age is used to avoid reclaim on + * the non-aged locks. + */ +static void ldlm_reclaim_ns(void) +{ + struct ldlm_namespace *ns; + int count = LDLM_RECLAIM_BATCH; + int ns_nr, nr_processed; + enum ldlm_side ns_cli = LDLM_NAMESPACE_SERVER; + s64 age_ns; + bool skip = true; + ENTRY; + + if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) { + EXIT; + return; + } + + age_ns = ldlm_reclaim_age(); +again: + nr_processed = 0; + ns_nr = ldlm_namespace_nr_read(ns_cli); + while (count > 0 && nr_processed < ns_nr) { + mutex_lock(ldlm_namespace_lock(ns_cli)); + + if (list_empty(ldlm_namespace_list(ns_cli))) { + mutex_unlock(ldlm_namespace_lock(ns_cli)); + goto out; + } + + ns = ldlm_namespace_first_locked(ns_cli); + ldlm_namespace_move_to_active_locked(ns, ns_cli); + mutex_unlock(ldlm_namespace_lock(ns_cli)); + + ldlm_reclaim_res(ns, &count, age_ns, skip); + ldlm_namespace_put(ns); + nr_processed++; + } + + if (count > 0 && age_ns > LDLM_RECLAIM_AGE_MIN) { + age_ns >>= 1; + if (age_ns < (LDLM_RECLAIM_AGE_MIN * 2)) + age_ns = LDLM_RECLAIM_AGE_MIN; + skip = false; + goto again; + } + + ldlm_last_reclaim_age_ns = age_ns; + ldlm_last_reclaim_time = ktime_get(); +out: + atomic_add_unless(&ldlm_nr_reclaimer, -1, 0); + EXIT; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_add(&ldlm_granted_total, 1); + lock->l_last_used = ktime_get(); +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_sub(&ldlm_granted_total, 1); +} + +/** + * Check on the total granted locks: return true if it reaches the + * high watermark (ldlm_lock_limit), otherwise return false; It also + * triggers lock reclaim if the low watermark (ldlm_reclaim_threshold) + * is reached. + * + * \retval true high watermark reached. + * \retval false high watermark not reached. 
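A stripped-down model of the two-watermark policy documented above: crossing the low watermark kicks off reclaim, crossing the high watermark makes new enqueues fail (the caller returns -EINPROGRESS). The numbers and demo_* names are invented, a value of 0 disables a watermark as in the real code, and the fault-injection hooks are omitted.

#include <stdbool.h>
#include <stdint.h>

static uint64_t demo_granted;                 /* total granted server locks */
static const uint64_t demo_low  = 200000;     /* start reclaiming here */
static const uint64_t demo_high = 300000;     /* reject enqueues here  */

static void demo_reclaim_some(void) { /* revoke a batch of aged locks */ }

/* Returns true when a new enqueue should be rejected. */
static bool demo_reclaim_full(void)
{
	if (demo_low != 0 && demo_granted > demo_low)
		demo_reclaim_some();              /* low watermark: reclaim */

	return demo_high != 0 && demo_granted > demo_high;
}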
+ */ +bool ldlm_reclaim_full(void) +{ + __u64 high = ldlm_lock_limit; + __u64 low = ldlm_reclaim_threshold; + + if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + low = cfs_fail_val; + + if (low != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > low) + ldlm_reclaim_ns(); + + if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + high = cfs_fail_val; + + if (high != 0 && + percpu_counter_sum_positive(&ldlm_granted_total) > high) + return true; + + return false; +} + +static inline __u64 ldlm_ratio2locknr(int ratio) +{ + __u64 locknr; + + locknr = ((__u64)NUM_CACHEPAGES << PAGE_SHIFT) * ratio; + do_div(locknr, 100 * sizeof(struct ldlm_lock)); + + return locknr; +} + +static inline __u64 ldlm_locknr2mb(__u64 locknr) +{ + return (locknr * sizeof(struct ldlm_lock) + 512 * 1024) >> 20; +} + +#define LDLM_WM_RATIO_LOW_DEFAULT 20 +#define LDLM_WM_RATIO_HIGH_DEFAULT 30 + +int ldlm_reclaim_setup(void) +{ + atomic_set(&ldlm_nr_reclaimer, 0); + + ldlm_reclaim_threshold = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT); + ldlm_reclaim_threshold_mb = ldlm_locknr2mb(ldlm_reclaim_threshold); + ldlm_lock_limit = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT); + ldlm_lock_limit_mb = ldlm_locknr2mb(ldlm_lock_limit); + + ldlm_last_reclaim_age_ns = LDLM_RECLAIM_AGE_MAX; + ldlm_last_reclaim_time = ktime_get(); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + return percpu_counter_init(&ldlm_granted_total, 0, GFP_KERNEL); +#else + return percpu_counter_init(&ldlm_granted_total, 0); +#endif +} + +void ldlm_reclaim_cleanup(void) +{ + percpu_counter_destroy(&ldlm_granted_total); +} + +#else /* HAVE_SERVER_SUPPORT */ + +bool ldlm_reclaim_full(void) +{ + return false; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ +} + +int ldlm_reclaim_setup(void) +{ + return 0; +} + +void ldlm_reclaim_cleanup(void) +{ +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c new file mode 100644 index 0000000000000..14ac08ade0809 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2415 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. 
There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include + +#include +#include +#include + +#include "ldlm_internal.h" + +unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, uint, 0644); +MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +static void interrupted_completion_wait(void *data) +{ +} + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +/** + * ldlm_request_bufsize + * + * If opcode=LDLM_ENQUEUE, 1 slot is already occupied, + * LDLM_LOCKREQ_HANDLE -1 slots are available. + * Otherwise, LDLM_LOCKREQ_HANDLE slots are available. 
+ * + * \param[in] count + * \param[in] type + * + * \retval size of the request buffer + */ + +int ldlm_request_bufsize(int count, int type) +{ + int avail = LDLM_LOCKREQ_HANDLES; + if (type == LDLM_ENQUEUE) + avail -= LDLM_ENQUEUE_CANCEL_OFF; + + if (count > avail) + avail = (count - avail) * sizeof(struct lustre_handle); + else + avail = 0; + + return sizeof(struct ldlm_request) + avail; +} + +int ldlm_expired_completion_wait(void *data) +{ + struct lock_wait_data *lwd = data; + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + ENTRY; + if (lock->l_conn_export == NULL) { + static cfs_time_t next_dump = 0, last_dump = 0; + + LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); " + "not entering recovery in server code, just going back to sleep", + (s64)lock->l_last_activity, + (s64)(ktime_get_real_seconds() - + lock->l_last_activity)); + if (cfs_time_after(cfs_time_current(), next_dump)) { + last_dump = next_dump; + next_dump = cfs_time_shift(300); + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + RETURN(0); + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s", + (s64)lock->l_last_activity, + (s64)(ktime_get_real_seconds() - lock->l_last_activity), + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + RETURN(0); +} + +/** + * Calculate the Completion timeout (covering enqueue, BL AST, data flush, + * lock cancel, and their replies). Used for lock completion timeout on the + * client side. + * + * \param[in] lock lock which is waiting the completion callback + * + * \retval timeout in seconds to wait for the server reply + */ + +/* We use the same basis for both server side and client side functions + from a single node. */ +static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) +{ + unsigned int timeout; + + if (AT_OFF) + return obd_timeout; + + /* Wait a long time for enqueue - server may have to callback a + * lock from another client. Server will evict the other client if it + * doesn't respond reasonably, and then give us the lock. */ + timeout = at_get(ldlm_lock_to_ns_at(lock)); + return max(3 * timeout, ldlm_enqueue_min); +} + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. + */ +static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) +{ + time64_t delay; + int result = 0; + + if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else if (data == NULL) { + LDLM_DEBUG(lock, "client-side enqueue: granted"); + } else { + /* Take into AT only CP RPC, not immediately granted locks */ + delay = ktime_get_real_seconds() - lock->l_last_activity; + LDLM_DEBUG(lock, "client-side enqueue: granted after %llds", + (s64)delay); + + /* Update our time estimate */ + at_measured(ldlm_lock_to_ns_at(lock), delay); + } + return result; +} + +/** + * Implementation of ->l_completion_ast() for a client, that doesn't wait + * until lock is granted. Suitable for locks enqueued through ptlrpcd, of + * other threads that cannot block for long. 
+ */ +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) +{ + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + RETURN(0); + } + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + wake_up(&lock->l_waitq); + RETURN(ldlm_completion_tail(lock, data)); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "going forward"); + ldlm_reprocess_all(lock->l_resource); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_completion_ast_async); + +/** + * Generic LDLM "completion" AST. This is called in several cases: + * + * - when a reply to an ENQUEUE RPC is received from the server + * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at + * this point (determined by flags); + * + * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has + * been granted; + * + * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock + * gets correct lvb; + * + * - to force all locks when resource is destroyed (cleanup_resource()); + * + * - during lock conversion (not used currently). + * + * If lock is not granted in the first case, this function waits until second + * or penultimate cases happen in some other thread. + * + */ +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + /* XXX ALLOCATE - 160 bytes */ + struct lock_wait_data lwd; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct l_wait_info lwi; + __u32 timeout; + int rc = 0; + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + goto noreproc; + } + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "sleeping"); + +noreproc: + + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) { + imp = obd->u.cli.cl_import; + } + + timeout = ldlm_cp_timeout(lock); + + lwd.lwd_lock = lock; + lock->l_last_activity = cfs_time_current_sec(); + + if (ldlm_is_no_timeout(lock)) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + lwi = LWI_INTR(interrupted_completion_wait, &lwd); + } else { + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + ldlm_expired_completion_wait, + interrupted_completion_wait, &lwd); + } + + if (imp != NULL) { + spin_lock(&imp->imp_lock); + lwd.lwd_conn_cnt = imp->imp_conn_cnt; + spin_unlock(&imp->imp_lock); + } + + if (ns_is_client(ldlm_lock_to_ns(lock)) && + OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, + OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { + ldlm_set_fail_loc(lock); + rc = -EINTR; + } else { + /* Go to sleep until the lock is granted or cancelled. */ + rc = l_wait_event(lock->l_waitq, + is_granted_or_cancelled(lock), &lwi); + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + + RETURN(ldlm_completion_tail(lock, data)); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * defferred lock cancellation. 
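The deferred-cancel pattern described here — cancel immediately when the lock is unused, otherwise mark it and cancel when the last reference is dropped — can be modelled in a few lines. Everything below is an illustrative stand-in with invented fields, not the LDLM lock structure or its locking.

#include <stdbool.h>

struct demo_lock {
	int  readers;
	int  writers;
	bool cb_pending;        /* a cancel has been requested */
};

static void demo_cancel(struct demo_lock *lk) { lk->cb_pending = false; }

/* Blocking callback: another client wants a conflicting lock. */
static void demo_blocking_ast(struct demo_lock *lk)
{
	lk->cb_pending = true;
	if (lk->readers == 0 && lk->writers == 0)
		demo_cancel(lk);            /* already unused: cancel now */
	/* otherwise the cancel happens on the last release below */
}

/* Reference release path: the deferred cancel fires on the last user. */
static void demo_put_reader(struct demo_lock *lk)
{
	if (--lk->readers == 0 && lk->writers == 0 && lk->cb_pending)
		demo_cancel(lk);
}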
+ * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + ENTRY; + + ldlm_set_cbpending(lock); + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, "Lock still has references, will be " + "cancelled later"); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + lock_res_and_lock(lock); + /* Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + RETURN(ldlm_blocking_ast_nocheck(lock)); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * Implements ldlm_lock::l_glimpse_ast for extent locks acquired on the server. + * + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for that is + * rather subtle: with OST-side locking, it may so happen that _all_ extent + * locks are held by the OST. If client wants to obtain the current file size + * it calls ll_glimpse_size(), and (as all locks are held only on the server), + * this dummy glimpse callback fires and does nothing. The client still + * receives the correct file size due to the following fragment of code in + * ldlm_cb_interpret(): + * + * if (rc == -ELDLM_NO_LOCK_DATA) { + * LDLM_DEBUG(lock, "lost race - client has a lock but no" + * "inode"); + * ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + * } + * + * That is, after the glimpse returns this error, ofd_lvbo_update() is called + * and returns the updated file attributes from the inode to the client. + * + * See also comment in ofd_intent_policy() on why servers must set a non-NULL + * l_glimpse_ast when grabbing DLM locks. Otherwise, the server will assume + * that the object is in the process of being destroyed. + * + * \param[in] lock DLM lock being glimpsed, unused + * \param[in] reqp pointer to ptlrpc_request, unused + * + * \retval -ELDLM_NO_LOCK_DATA to get attributes from disk object + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + return -ELDLM_NO_LOCK_DATA; +} + +/** + * Enqueue a local lock (typically on a server). 
+ */ +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + ENTRY; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (IS_ERR(lock)) + GOTO(out_nolock, err = PTR_ERR(lock)); + + err = ldlm_lvbo_init(lock->l_resource); + if (err < 0) { + LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); + GOTO(out, err); + } + + ldlm_lock2handle(lock, lockh); + + /* NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock */ + ldlm_lock_addref_internal_nolock(lock, mode); + ldlm_set_local(lock); + if (*flags & LDLM_FL_ATOMIC_CB) + ldlm_set_atomic_cb(lock); + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + + err = ldlm_lock_enqueue(ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + EXIT; + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (bug 407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if ((lock->l_req_mode != lock->l_granted_mode) && + !ldlm_is_failed(lock)) { + /* Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. + * b=17645*/ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | + LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; + need_cancel = 1; + } + unlock_res_and_lock(lock); + + if (need_cancel) + LDLM_DEBUG(lock, + "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | " + "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); + else + LDLM_DEBUG(lock, "lock was granted or failed in race"); + + /* XXX - HACK because we shouldn't call ldlm_lock_destroy() + * from llite/file.c/ll_file_flock(). */ + /* This code makes for the fact that we do not have blocking handler on + * a client for flock locks. As such this is the place where we must + * completely kill failed locks. (interrupted and those that + * were waiting to be granted when server evicted us. 
*/ + if (lock->l_resource->lr_type == LDLM_FLOCK) { + lock_res_and_lock(lock); + if (!ldlm_is_destroyed(lock)) { + ldlm_resource_unlink_lock(lock); + ldlm_lock_decref_internal_nolock(lock, mode); + ldlm_lock_destroy_nolock(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref_internal(lock, mode); + } +} + +/** + * Finishing portion of client lock enqueue code. + * + * Called after receiving reply from server. + */ +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + enum ldlm_type type, __u8 with_policy, + enum ldlm_mode mode, __u64 *flags, void *lvb, + __u32 lvb_len, const struct lustre_handle *lockh, + int rc) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + ENTRY; + + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + if (!lock) { + LASSERT(type == LDLM_FLOCK); + RETURN(-ENOLCK); + } + + LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), + "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); + + if (rc != ELDLM_OK) { + LASSERT(!is_replay); + LDLM_DEBUG(lock, "client-side enqueue END (%s)", + rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + + if (rc != ELDLM_LOCK_ABORTED) + GOTO(cleanup, rc); + } + + /* Before we return, swab the reply */ + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(cleanup, rc = -EPROTO); + + if (lvb_len > 0) { + int size = 0; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER); + if (size < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); + GOTO(cleanup, rc = size); + } else if (unlikely(size > lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than " + "expectation, expected = %d, replied = %d", + lvb_len, size); + GOTO(cleanup, rc = -EINVAL); + } + lvb_len = size; + } + + if (rc == ELDLM_LOCK_ABORTED) { + if (lvb_len > 0 && lvb != NULL) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lvb, lvb_len); + GOTO(cleanup, rc = rc ? : ELDLM_LOCK_ABORTED); + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_INHERIT_MASK); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: %#llx\n", + lock, reply->lock_handle.cookie, *flags); + + /* If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. 
*/ + if ((*flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + CDEBUG(D_INFO, "remote intent success, locking "DLDLMRES + " instead of "DLDLMRES"\n", + PLDLMRES(&reply->lock_desc.l_resource), + PLDLMRES(lock->l_resource)); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) + GOTO(cleanup, rc = -ENOMEM); + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + if (with_policy) + if (!(type == LDLM_IBITS && + !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); + } + + if ((*flags) & LDLM_FL_AST_SENT) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ + if (lvb_len > 0) { + /* We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. + * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in */ + lock_res_and_lock(lock); + if (lock->l_req_mode != lock->l_granted_mode) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, lvb_len); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + GOTO(cleanup, rc); + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len > 0 && lvb != NULL) { + /* Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); + EXIT; +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. 
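With assumed sizes — 4 KiB pages, an 8-byte lustre_handle and two handles in the fixed request body — the estimate described above works out as in this small standalone program. The constants are assumptions chosen for illustration, not values taken from this patch, and the min() against LDLM_MAXREQSIZE is left out.

#include <stdio.h>

#define DEMO_PAGE_SIZE        4096
#define DEMO_HANDLE_SIZE      8      /* sizeof(struct lustre_handle), assumed */
#define DEMO_LOCKREQ_HANDLES  2      /* handles in the fixed request body, assumed */

static int demo_handles_avail(int req_size, int off)
{
	int avail = DEMO_PAGE_SIZE - 512 - req_size;

	avail = avail > 0 ? avail / DEMO_HANDLE_SIZE : 0;
	return avail + DEMO_LOCKREQ_HANDLES - off;
}

int main(void)
{
	/* e.g. a 1 KiB enqueue request with one slot reserved for the new lock:
	 * (4096 - 512 - 1024) / 8 + 2 - 1 = 321 cancel handles fit */
	printf("%d cancel handles fit\n", demo_handles_avail(1024, 1));
	return 0;
}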
+ */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + __u32 size = req_capsule_msg_size(pill, loc); + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + __u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) + { + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + struct list_head head = LIST_HEAD_INIT(head); + enum ldlm_lru_flags lru_flags; + int avail, to_free, pack = 0; + int rc; + ENTRY; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + lru_flags = LDLM_LRU_FLAG_NO_WAIT | (ns_connect_lru_resize(ns) ? + LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED); + to_free = !ns_connect_lru_resize(ns) && + opc == LDLM_ENQUEUE ? 1 : 0; + + /* Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, + lru_flags); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + RETURN(rc); + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* Skip first lock handler in ldlm_request_pack(), + * this method will increment @lock_count according + * to the lock handle amount actually written to + * the buffer. */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. 
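The packing decision in ldlm_prep_elc_req() — put as many cancel handles as fit into the enqueue request and ship the rest in a dedicated cancel RPC — reduces to the arithmetic below. The demo_* names are invented for the sketch; it only models the split, not the LRU scan that produces the cancel list.

struct demo_elc_plan {
	int packed;     /* handles packed into the enqueue request */
	int separate;   /* handles sent via a dedicated cancel RPC */
};

static struct demo_elc_plan demo_plan_elc(int avail_slots, int cancel_count)
{
	struct demo_elc_plan plan;

	plan.packed   = cancel_count < avail_slots ? cancel_count : avail_slots;
	plan.separate = cancel_count - plan.packed;
	return plan;
}

For example, 40 pending cancels with room for 25 handles gives packed = 25 and separate = 15, which is why the code issues ldlm_cli_cancel_list() twice: once against the request buffer and once with no request for the remainder.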
*/ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + union ldlm_policy_data const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + struct ptlrpc_request *req; + ENTRY; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. 
*/ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (IS_ERR(lock)) + RETURN(PTR_ERR(lock)); + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) + lock->l_policy_data = *policy; + + if (einfo->ei_type == LDLM_EXTENT) { + /* extent lock without policy is a bug */ + if (policy == NULL) + LBUG(); + + lock->l_req_extent = policy->l_extent; + } + LDLM_DEBUG(lock, "client-side enqueue START, flags %#llx", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); + lock->l_last_activity = cfs_time_current_sec(); + + /* lock not sent to server yet */ + if (reqp == NULL || *reqp == NULL) { + req = ldlm_enqueue_pack(exp, lvb_len); + if (IS_ERR(req)) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + RETURN(PTR_ERR(req)); + } + + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + if (async) { + LASSERT(reqp != NULL); + RETURN(0); + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 
1 : 0, + einfo->ei_mode, flags, lvb, lvb_len, + lockh, rc); + + /* If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + struct ldlm_resource *res; + int rc; + ENTRY; + if (ns_is_client(ldlm_lock_to_ns(lock))) { + CERROR("Trying to cancel local lock\n"); + LBUG(); + } + LDLM_DEBUG(lock, "client-side local convert"); + + res = ldlm_lock_convert(lock, new_mode, flags); + if (res) { + ldlm_reprocess_all(res); + rc = 0; + } else { + rc = LUSTRE_EDEADLK; + } + LDLM_DEBUG(lock, "client-side local convert handler END"); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* FIXME: one of ldlm_cli_convert or the server side should reject attempted + * conversion of locks which are on the waiting or converting queue */ +/* Caller of this code is supposed to take care of lock readers/writers + accounting */ +int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, + __u32 *flags) +{ + struct ldlm_request *body; + struct ldlm_reply *reply; + struct ldlm_lock *lock; + struct ldlm_resource *res; + struct ptlrpc_request *req; + int rc; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (!lock) { + LBUG(); + RETURN(-EINVAL); + } + *flags = 0; + + if (lock->l_conn_export == NULL) + RETURN(ldlm_cli_convert_local(lock, new_mode, flags)); + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) { + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = new_mode; + body->lock_flags = ldlm_flags_to_wire(*flags); + + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc != ELDLM_OK) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + if (req->rq_status) + GOTO(out, rc = req->rq_status); + + res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); + if (res != NULL) { + ldlm_reprocess_all(res); + /* Go to sleep until the lock is granted. */ + /* FIXME: or cancelled. */ + if (lock->l_completion_ast) { + rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, + NULL); + if (rc) + GOTO(out, rc); + } + } else { + rc = LUSTRE_EDEADLK; + } + EXIT; + out: + LDLM_LOCK_PUT(lock); + ptlrpc_req_finished(req); + return rc; +} + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + ENTRY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + ldlm_set_cbpending(lock); + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (ldlm_is_bl_ast(lock)) ? 
+ LDLM_FL_BL_AST : LDLM_FL_CANCELING; + unlock_res_and_lock(lock); + + if (local_only) { + CDEBUG(D_DLMTRACE, "not sending request (at caller's " + "instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource); + } + + RETURN(rc); +} + +/** + * Pack \a count locks in \a head into ldlm_request buffer of request \a req. + */ +static void ldlm_cancel_pack(struct ptlrpc_request *req, + struct list_head *head, int count) +{ + struct ldlm_request *dlm; + struct ldlm_lock *lock; + int max, packed = 0; + ENTRY; + + dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + LASSERT(dlm != NULL); + + /* Check the room in the request buffer. */ + max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - + sizeof(struct ldlm_request); + max /= sizeof(struct lustre_handle); + max += LDLM_LOCKREQ_HANDLES; + LASSERT(max >= dlm->lock_count + count); + + /* XXX: it would be better to pack lock handles grouped by resource. + * so that the server cancel would call filter_lvbo_update() less + * frequently. */ + list_for_each_entry(lock, head, l_bl_ast) { + if (!count--) + break; + LASSERT(lock->l_conn_export); + /* Pack the lock handle to the given request buffer. */ + LDLM_DEBUG(lock, "packing"); + dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle; + packed++; + } + CDEBUG(D_DLMTRACE, "%d locks packed\n", packed); + EXIT; +} + +/** + * Prepare and send a batched cancel RPC. It will include \a count lock + * handles of locks given in \a cancels list. */ +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, + int count, enum ldlm_cancel_flags flags) +{ + struct ptlrpc_request *req = NULL; + struct obd_import *imp; + int free, sent = 0; + int rc = 0; + ENTRY; + + LASSERT(exp != NULL); + LASSERT(count > 0); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val); + + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) + RETURN(count); + + free = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, RCL_CLIENT, 0); + if (count > free) + count = free; + + while (1) { + imp = class_exp2cliimp(exp); + if (imp == NULL || imp->imp_invalid) { + CDEBUG(D_DLMTRACE, + "skipping cancel on invalid import %p\n", imp); + RETURN(count); + } + + req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(count, LDLM_CANCEL)); + + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + /* If OSP want cancel cross-MDT lock, let's not block it in + * in recovery, otherwise the lock will not released, if + * the remote target is also in recovery, and it also need + * this lock, it might cause deadlock. 
*/ + if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS && + exp->exp_obd->obd_lu_dev != NULL && + exp->exp_obd->obd_lu_dev->ld_site != NULL) { + struct lu_device *top_dev; + + top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev; + if (top_dev != NULL && + top_dev->ld_obd->obd_recovering) + req->rq_allow_replay = 1; + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req); + sent = count; + GOTO(out, 0); + } + + rc = ptlrpc_queue_wait(req); + if (rc == LUSTRE_ESTALE) { + CDEBUG(D_DLMTRACE, "client/server (nid %s) " + "out of sync -- not fatal\n", + libcfs_nid2str(req->rq_import-> + imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + /* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: " + "canceling anyway\n", rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); + EXIT; +out: + return sent ? sent : rc; +} + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct obd_device *obd; + __u64 new_slv; + __u32 new_limit; + ENTRY; + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) + { + /* + * Do nothing for corner cases. + */ + RETURN(0); + } + + /* In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, "Zero SLV or Limit found " + "(SLV: %llu, Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + RETURN(0); + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + /* Set new SLV and limit in OBD fields to make them accessible + * to the pool thread. We do not access obd_namespace and pool + * directly here as there is no reliable way to make sure that + * they are still alive at cleanup time. Evil races are possible + * which may cause Oops at that time. */ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + RETURN(0); +} + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. 
+ */ +int ldlm_cli_cancel(const struct lustre_handle *lockh, + enum ldlm_cancel_flags cancel_flags) +{ + struct obd_export *exp; + enum ldlm_lru_flags lru_flags; + int avail, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct list_head cancels = LIST_HEAD_INIT(cancels); + ENTRY; + + lock = ldlm_handle2lock_long(lockh, 0); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed"); + RETURN(0); + } + + lock_res_and_lock(lock); + /* Lock is being canceled and the caller doesn't want to wait */ + if (ldlm_is_canceling(lock)) { + if (cancel_flags & LCF_ASYNC) { + unlock_res_and_lock(lock); + } else { + struct l_wait_info lwi = { 0 }; + + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi); + } + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + ldlm_set_canceling(lock); + unlock_res_and_lock(lock); + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + lru_flags = ns_connect_lru_resize(ns) ? + LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED; + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, lru_flags); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. + */ +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + enum ldlm_cancel_flags cancel_flags) +{ + struct list_head head = LIST_HEAD_INIT(head); + struct ldlm_lock *lock, *next; + int left = 0, bl_ast = 0; + __u64 rc; + + left = count; + list_for_each_entry_safe(lock, next, cancels, l_bl_ast) { + if (left-- == 0) + break; + + if (cancel_flags & LCF_LOCAL) { + rc = LDLM_FL_LOCAL_ONLY; + ldlm_lock_cancel(lock); + } else { + rc = ldlm_cli_cancel_local(lock); + } + /* Until we have compound requests and can send LDLM_CANCEL + * requests batched with generic RPCs, we need to send cancels + * with the LDLM_FL_BL_AST flag in a separate RPC from + * the one being generated now. */ + if (!(cancel_flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) { + LDLM_DEBUG(lock, "Cancel lock separately"); + list_del_init(&lock->l_bl_ast); + list_add(&lock->l_bl_ast, &head); + bl_ast++; + continue; + } + if (rc == LDLM_FL_LOCAL_ONLY) { + /* CANCEL RPC should not be sent to server. */ + list_del_init(&lock->l_bl_ast); + LDLM_LOCK_RELEASE(lock); + count--; + } + } + if (bl_ast > 0) { + count -= bl_ast; + ldlm_cli_cancel_list(&head, bl_ast, NULL, 0); + } + + RETURN(count); +} + +/** + * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g. + * readahead requests, ...) 
+ */ +static enum ldlm_policy_res +ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int unused, int added, int count) +{ + enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK; + + /* don't check added & count since we want to process all locks + * from unused list. + * It's fine to not take lock to access lock->l_resource since + * the lock has already been granted so it won't change. */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) + break; + default: + result = LDLM_POLICY_SKIP_LOCK; + lock_res_and_lock(lock); + ldlm_set_skipped(lock); + unlock_res_and_lock(lock); + break; + } + + RETURN(result); +} + +/** + * Callback function for LRU-resize policy. Decides whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current + * scan \a added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + ktime_t cur = ktime_get(); + struct ldlm_pool *pl = &ns->ns_pool; + u64 slv, lvf, lv; + s64 la; + + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + if (count && added >= count) + return LDLM_POLICY_KEEP_LOCK; + + /* Despite of the LV, It doesn't make sense to keep the lock which + * is unused for ns_max_age time. + */ + if (ktime_after(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_CANCEL_LOCK; + + slv = ldlm_pool_get_slv(pl); + lvf = ldlm_pool_get_lvf(pl); + la = ktime_to_ns(ktime_sub(cur, lock->l_last_used)) / NSEC_PER_SEC; + lv = lvf * la * unused; + + /* Inform pool about current CLV to see it via proc. */ + ldlm_pool_set_clv(pl, lv); + + /* Stop when SLV is not yet come from server or lv is smaller than + * it is. */ + if (slv == 0 || lv < slv) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_lrur_policy(ns, lock, unused, added, count); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); +} + +/** + * Callback function for proc used policy. Makes decision whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current scan \a + * added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for aged policy. Makes decision whether to keep \a lock in + * LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. 
+ * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + if ((added >= count) && + ktime_before(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_max_age))) + return LDLM_POLICY_KEEP_LOCK; + + return LDLM_POLICY_CANCEL_LOCK; +} + +static enum ldlm_policy_res +ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, int count) +{ + enum ldlm_policy_res result; + + result = ldlm_cancel_aged_policy(ns, lock, unused, added, count); + if (result == LDLM_POLICY_KEEP_LOCK) + return result; + + return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); +} + +/** + * Callback function for default policy. Makes decision whether to keep \a lock + * in LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static +enum ldlm_policy_res ldlm_cancel_default_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +typedef enum ldlm_policy_res +(*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, + int unused, int added, int count); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) +{ + if (ns_connect_lru_resize(ns)) { + if (lru_flags & LDLM_LRU_FLAG_SHRINK) + /* We kill passed number of old locks. */ + return ldlm_cancel_passed_policy; + if (lru_flags & LDLM_LRU_FLAG_LRUR) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; + } + if (lru_flags & LDLM_LRU_FLAG_PASSED) + return ldlm_cancel_passed_policy; + } else { + if (lru_flags & LDLM_LRU_FLAG_AGED) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; + } + } + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_no_wait_policy; + + return ldlm_cancel_default_policy; +} + +/** + * - Free space in LRU for \a count new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. 
+ *
+ * Calling policies for enabled LRU resize:
+ * ----------------------------------------
+ * flags & LDLM_LRU_FLAG_LRUR - use LRU resize policy (SLV from server) to
+ * cancel not more than \a count locks;
+ *
+ * flags & LDLM_LRU_FLAG_PASSED - cancel \a count number of old locks (located
+ * at the beginning of LRU list);
+ *
+ * flags & LDLM_LRU_FLAG_SHRINK - cancel not more than \a count locks according
+ * to memory pressure policy function;
+ *
+ * flags & LDLM_LRU_FLAG_AGED - cancel \a count locks according to "aged policy"
+ *
+ * flags & LDLM_LRU_FLAG_NO_WAIT - cancel as many unused locks as possible
+ * (typically before replaying locks) w/o
+ * sending any RPCs or waiting for any
+ * outstanding RPC to complete.
+ *
+ * flags & LDLM_LRU_FLAG_CLEANUP - when cancelling read locks, do not check for
+ * other read locks covering the same pages, just
+ * discard those pages.
+ */
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
+ struct list_head *cancels, int count, int max,
+ enum ldlm_lru_flags lru_flags)
+{
+ ldlm_cancel_lru_policy_t pf;
+ struct ldlm_lock *lock, *next;
+ int added = 0, unused, remained;
+ int no_wait = lru_flags & LDLM_LRU_FLAG_NO_WAIT;
+ ENTRY;
+
+ spin_lock(&ns->ns_lock);
+ unused = ns->ns_nr_unused;
+ remained = unused;
+
+ if (!ns_connect_lru_resize(ns))
+ count += unused - ns->ns_max_unused;
+
+ pf = ldlm_cancel_lru_policy(ns, lru_flags);
+ LASSERT(pf != NULL);
+
+ while (!list_empty(&ns->ns_unused_list)) {
+ enum ldlm_policy_res result;
+ ktime_t last_use = ktime_set(0, 0);
+
+ /* all unused locks */
+ if (remained-- <= 0)
+ break;
+
+ /* For any flags, stop scanning if @max is reached. */
+ if (max && added >= max)
+ break;
+
+ list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+ l_lru) {
+ /* No locks which got blocking requests. */
+ LASSERT(!ldlm_is_bl_ast(lock));
+
+ if (no_wait && ldlm_is_skipped(lock))
+ /* already processed */
+ continue;
+
+ last_use = lock->l_last_used;
+
+ /* Somebody is already doing CANCEL. No need for this
+ * lock in LRU, do not traverse it again. */
+ if (!ldlm_is_canceling(lock))
+ break;
+
+ ldlm_lock_remove_from_lru_nolock(lock);
+ }
+ if (&lock->l_lru == &ns->ns_unused_list)
+ break;
+
+ LDLM_LOCK_GET(lock);
+ spin_unlock(&ns->ns_lock);
+ lu_ref_add(&lock->l_reference, __FUNCTION__, current);
+
+ /* Pass the lock through the policy filter and see if it
+ * should stay in LRU.
+ *
+ * Even for shrinker policy we stop scanning if
+ * we find a lock that should stay in the cache.
+ * We should take into account lock age anyway
+ * as a new lock is a valuable resource even if
+ * it has a low weight.
+ *
+ * That is, for shrinker policy we drop only
+ * old locks, but additionally choose them by
+ * their weight. Big extent locks will stay in
+ * the cache. */
+ result = pf(ns, lock, unused, added, count);
+ if (result == LDLM_POLICY_KEEP_LOCK) {
+ lu_ref_del(&lock->l_reference,
+ __FUNCTION__, current);
+ LDLM_LOCK_RELEASE(lock);
+ spin_lock(&ns->ns_lock);
+ break;
+ }
+ if (result == LDLM_POLICY_SKIP_LOCK) {
+ lu_ref_del(&lock->l_reference,
+ __func__, current);
+ LDLM_LOCK_RELEASE(lock);
+ spin_lock(&ns->ns_lock);
+ continue;
+ }
+
+ lock_res_and_lock(lock);
+ /* Check flags again under the lock. 
*/
+ if (ldlm_is_canceling(lock) ||
+ ldlm_lock_remove_from_lru_check(lock, last_use) == 0) {
+ /* Another thread is removing lock from LRU, or
+ * somebody is already doing CANCEL, or there
+ * is a blocking request which will send cancel
+ * by itself, or the lock is no longer unused or
+ * the lock has been used since the pf() call and
+ * pages could be put under it. */
+ unlock_res_and_lock(lock);
+ lu_ref_del(&lock->l_reference, __FUNCTION__, current);
+ LDLM_LOCK_RELEASE(lock);
+ spin_lock(&ns->ns_lock);
+ continue;
+ }
+ LASSERT(!lock->l_readers && !lock->l_writers);
+
+ /* If we have chosen to cancel this lock voluntarily, we
+ * better send cancel notification to server, so that it
+ * frees appropriate state. This might lead to a race
+ * where while we are doing cancel here, server is also
+ * silently cancelling this lock. */
+ ldlm_clear_cancel_on_block(lock);
+
+ /* Setting the CBPENDING flag is a little misleading,
+ * but prevents an important race; namely, once
+ * CBPENDING is set, the lock can accumulate no more
+ * readers/writers. Since readers and writers are
+ * already zero here, ldlm_lock_decref() won't see
+ * this flag and call l_blocking_ast */
+ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
+
+ if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) &&
+ lock->l_resource->lr_type == LDLM_EXTENT &&
+ lock->l_granted_mode == LCK_PR)
+ ldlm_set_discard_data(lock);
+
+ /* We can't re-add to l_lru as it confuses the
+ * refcounting in ldlm_lock_remove_from_lru() if an AST
+ * arrives after we drop lr_lock below. We use l_bl_ast
+ * and can't use l_pending_chain as it is used both on
+ * server and client nevertheless bug 5666 says it is
+ * used only on server */
+ LASSERT(list_empty(&lock->l_bl_ast));
+ list_add(&lock->l_bl_ast, cancels);
+ unlock_res_and_lock(lock);
+ lu_ref_del(&lock->l_reference, __FUNCTION__, current);
+ spin_lock(&ns->ns_lock);
+ added++;
+ unused--;
+ }
+ spin_unlock(&ns->ns_lock);
+ RETURN(added);
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
+ int count, int max,
+ enum ldlm_cancel_flags cancel_flags,
+ enum ldlm_lru_flags lru_flags)
+{
+ int added;
+
+ added = ldlm_prepare_lru_list(ns, cancels, count, max, lru_flags);
+ if (added <= 0)
+ return added;
+
+ return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
+}
+
+/**
+ * Cancel at least \a nr locks from given namespace LRU.
+ *
+ * When called with LCF_ASYNC the blocking callback will be handled
+ * in a thread and this function will return after the thread has been
+ * asked to call the callback. When called without LCF_ASYNC the blocking
+ * callback will be performed in this function.
+ */
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+ enum ldlm_cancel_flags cancel_flags,
+ enum ldlm_lru_flags lru_flags)
+{
+ struct list_head cancels = LIST_HEAD_INIT(cancels);
+ int count, rc;
+ ENTRY;
+
+ /* Just prepare the list of locks, do not actually cancel them yet.
+ * Locks are cancelled later in a separate thread. */
+ count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, lru_flags);
+ rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags);
+ if (rc == 0)
+ RETURN(count);
+
+ RETURN(0);
+}
+
+/**
+ * Find and cancel locally unused locks found on resource, matched to the
+ * given policy, mode. GET the found locks and add them into the \a cancels
+ * list. 
+ */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 lock_flags, + enum ldlm_cancel_flags cancel_flags, + void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + //LBUG(); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* If somebody is already doing CANCEL, or blocking AST came, + * skip this lock. */ + if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + !(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, + enum ldlm_cancel_flags flags) +{ + struct ldlm_lock *lock; + int res = 0; + ENTRY; + + if (list_empty(cancels) || count == 0) + RETURN(0); + + /* XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. 
*/ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_resource *res; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (IS_ERR(res)) { + /* This is not a problem. */ + CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("canceling unused lock "DLDLMRES": rc = %d\n", + PLDLMRES(res), rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int +ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + + ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, lc->lc_flags, + lc->lc_opaque); + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + ENTRY; + + if (ns == NULL) + RETURN(ELDLM_OK); + + if (res_id != NULL) { + RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque)); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg, 0); + RETURN(ELDLM_OK); + } +} + +/* Lock iterators. 
*/
+
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+ void *closure)
+{
+ struct list_head *tmp, *next;
+ struct ldlm_lock *lock;
+ int rc = LDLM_ITER_CONTINUE;
+
+ ENTRY;
+
+ if (!res)
+ RETURN(LDLM_ITER_CONTINUE);
+
+ lock_res(res);
+ list_for_each_safe(tmp, next, &res->lr_granted) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (iter(lock, closure) == LDLM_ITER_STOP)
+ GOTO(out, rc = LDLM_ITER_STOP);
+ }
+
+ list_for_each_safe(tmp, next, &res->lr_converting) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (iter(lock, closure) == LDLM_ITER_STOP)
+ GOTO(out, rc = LDLM_ITER_STOP);
+ }
+
+ list_for_each_safe(tmp, next, &res->lr_waiting) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (iter(lock, closure) == LDLM_ITER_STOP)
+ GOTO(out, rc = LDLM_ITER_STOP);
+ }
+ out:
+ unlock_res(res);
+ RETURN(rc);
+}
+
+struct iter_helper_data {
+ ldlm_iterator_t iter;
+ void *closure;
+};
+
+static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
+{
+ struct iter_helper_data *helper = closure;
+ return helper->iter(lock, helper->closure);
+}
+
+static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+ return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+ LDLM_ITER_STOP;
+}
+
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+ ldlm_iterator_t iter, void *closure)
+
+{
+ struct iter_helper_data helper = { .iter = iter, .closure = closure };
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ ldlm_res_iter_helper, &helper, 0);
+
+}
+
+/* non-blocking function to manipulate a lock whose cb_data is being put away.
+ * return 0: no resource found
+ * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE.
+ * < 0: errors
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_iterator_t iter, void *data)
+{
+ struct ldlm_resource *res;
+ int rc;
+ ENTRY;
+
+ LASSERTF(ns != NULL, "must pass in namespace\n");
+
+ res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+ if (IS_ERR(res))
+ RETURN(0);
+
+ LDLM_RESOURCE_ADDREF(res);
+ rc = ldlm_resource_foreach(res, iter, data);
+ LDLM_RESOURCE_DELREF(res);
+ ldlm_resource_putref(res);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
+
+/* Lock replay */
+
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+ struct list_head *list = closure;
+
+ /* we use l_pending_chain here, because it's unused on clients. */
+ LASSERTF(list_empty(&lock->l_pending_chain),
+ "lock %p next %p prev %p\n",
+ lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev);
+ /* bug 9573: don't replay locks left after eviction, or
+ * bug 17614: locks being actively cancelled. Get a reference
+ * on a lock so that it does not disappear under us (e.g. 
due to cancel) + */ + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) { + list_add(&lock->l_pending_chain, list); + LDLM_LOCK_GET(lock); + } + + return LDLM_ITER_CONTINUE; +} + +static int replay_lock_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct ldlm_async_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct ldlm_reply *reply; + struct obd_export *exp; + + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (rc != ELDLM_OK) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + lock = ldlm_handle2lock(&aa->lock_handle); + if (!lock) { + CERROR("received replay ack for unknown local cookie %#llx" + " remote cookie %#llx from server %s id %s\n", + aa->lock_handle.cookie, reply->lock_handle.cookie, + req->rq_export->exp_client_uuid.uuid, + libcfs_id2str(req->rq_peer)); + GOTO(out, rc = -ESTALE); + } + + /* Key change rehash lock in per-export hash with new key */ + exp = req->rq_export; + if (exp && exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + LDLM_DEBUG(lock, "replayed lock:"); + ptlrpc_import_recovery_state_machine(req->rq_import); + LDLM_LOCK_PUT(lock); +out: + if (rc != ELDLM_OK) + ptlrpc_connect_import(req->rq_import); + + RETURN(rc); +} + +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + struct ldlm_async_args *aa; + struct ldlm_request *body; + int flags; + ENTRY; + + + /* Bug 11974: Do not replay a lock which is actively being canceled */ + if (ldlm_is_bl_done(lock)) { + LDLM_DEBUG(lock, "Not replaying canceled lock:"); + RETURN(0); + } + + /* If this is reply-less callback lock, we cannot replay it, since + * server might have long dropped it, but notification of that event was + * lost by network. (and server granted conflicting lock already) */ + if (ldlm_is_cancel_on_block(lock)) { + LDLM_DEBUG(lock, "Not replaying reply-less lock:"); + ldlm_lock_cancel(lock); + RETURN(0); + } + + /* + * If granted mode matches the requested mode, this lock is granted. + * + * If they differ, but we have a granted mode, then we were granted + * one mode and now want another: ergo, converting. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (lock->l_granted_mode == lock->l_req_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (lock->l_granted_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + /* We're part of recovery, so don't wait for it. 
*/
+ req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ ldlm_lock2desc(lock, &body->lock_desc);
+ body->lock_flags = ldlm_flags_to_wire(flags);
+
+ ldlm_lock2handle(lock, &body->lock_handle[0]);
+ if (lock->l_lvb_len > 0)
+ req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+ lock->l_lvb_len);
+ ptlrpc_request_set_replen(req);
+ /* notify the server we've replayed all requests.
+ * also, we mark the request to be put on a dedicated
+ * queue to be processed after all request replays.
+ * bug 6063 */
+ lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+ LDLM_DEBUG(lock, "replaying lock:");
+
+ atomic_inc(&req->rq_import->imp_replay_inflight);
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->lock_handle = body->lock_handle[0];
+ req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
+ ptlrpcd_add_req(req);
+
+ RETURN(0);
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. since we are
+ * in recovery, we can't wait for any outstanding RPCs to send any RPC
+ * to the server.
+ *
+ * Called only in recovery before replaying locks. there is no need to
+ * replay locks that are unused. since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+ int canceled;
+ struct list_head cancels = LIST_HEAD_INIT(cancels);
+
+ CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+ "replay for namespace %s (%d)\n",
+ ldlm_ns_name(ns), ns->ns_nr_unused);
+
+ /* We don't need to care whether or not LRU resize is enabled
+ * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the
+ * count parameter */
+ canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+ LCF_LOCAL, LDLM_LRU_FLAG_NO_WAIT);
+
+ CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+ canceled, ldlm_ns_name(ns));
+}
+
+int ldlm_replay_locks(struct obd_import *imp)
+{
+ struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+ struct list_head list = LIST_HEAD_INIT(list);
+ struct ldlm_lock *lock, *next;
+ int rc = 0;
+
+ ENTRY;
+
+ LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+
+ /* don't replay locks if import failed recovery */
+ if (imp->imp_vbr_failed)
+ RETURN(0);
+
+ /* ensure this doesn't fall to 0 before all have been queued */
+ atomic_inc(&imp->imp_replay_inflight);
+
+ if (ldlm_cancel_unused_locks_before_replay)
+ ldlm_cancel_unused_locks_for_replay(ns);
+
+ ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+
+ list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+ list_del_init(&lock->l_pending_chain);
+ if (rc) {
+ LDLM_LOCK_RELEASE(lock);
+ continue; /* or try to do the rest? */
+ }
+ rc = replay_one_lock(imp, lock);
+ LDLM_LOCK_RELEASE(lock);
+ }
+
+ atomic_dec(&imp->imp_replay_inflight);
+
+ RETURN(rc);
+}
diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c
new file mode 100644
index 0000000000000..225d3a7f01df7
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c
@@ -0,0 +1,1708 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include +#include +#include +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; +struct kmem_cache *ldlm_interval_tree_slab; + +int ldlm_srv_namespace_nr = 0; +int ldlm_cli_namespace_nr = 0; + +DEFINE_MUTEX(ldlm_srv_namespace_lock); +LIST_HEAD(ldlm_srv_namespace_list); + +DEFINE_MUTEX(ldlm_cli_namespace_lock); +/* Client Namespaces that have active resources in them. + * Once all resources go away, ldlm_poold moves such namespaces to the + * inactive list */ +LIST_HEAD(ldlm_cli_active_namespace_list); +/* Client namespaces that don't have any locks in them */ +LIST_HEAD(ldlm_cli_inactive_namespace_list); + +static struct proc_dir_entry *ldlm_type_proc_dir; +static struct proc_dir_entry *ldlm_ns_proc_dir; +struct proc_dir_entry *ldlm_svc_proc_dir; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +static unsigned int ldlm_dump_granted_max = 256; + +#ifdef CONFIG_PROC_FS +static ssize_t +lprocfs_dump_ns_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(count); +} +LPROC_SEQ_FOPS_WO_TYPE(ldlm, dump_ns); + +LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); + +#ifdef HAVE_SERVER_SUPPORT + +static int seq_watermark_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", *(__u64 *)m->private); + return 0; +} + +static ssize_t seq_watermark_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + __s64 value; + __u64 watermark; + __u64 *data = ((struct seq_file *)file->private_data)->private; + bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &value, 'M'); + if (rc) { + CERROR("Failed to set %s, rc = %d.\n", + wm_low ? "lock_reclaim_threshold_mb" : "lock_limit_mb", + rc); + return rc; + } else if (value != 0 && value < (1 << 20)) { + CERROR("%s should be greater than 1MB.\n", + wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb"); + return -EINVAL; + } + watermark = value >> 20; + + if (wm_low) { + if (ldlm_lock_limit_mb != 0 && watermark > ldlm_lock_limit_mb) { + CERROR("lock_reclaim_threshold_mb must be smaller than " + "lock_limit_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_reclaim_threshold = watermark; + } else { + if (ldlm_reclaim_threshold_mb != 0 && + watermark < ldlm_reclaim_threshold_mb) { + CERROR("lock_limit_mb must be greater than " + "lock_reclaim_threshold_mb.\n"); + return -EINVAL; + } + + *data = watermark; + if (watermark != 0) { + watermark <<= 20; + do_div(watermark, sizeof(struct ldlm_lock)); + } + ldlm_lock_limit = watermark; + } + + return count; +} + +static int seq_watermark_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_watermark_show, PDE_DATA(inode)); +} + +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + .release = lprocfs_single_release, +}; + +static int seq_granted_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%llu\n", percpu_counter_sum_positive( + (struct percpu_counter *)m->private)); + return 0; +} + +static int seq_granted_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_granted_show, PDE_DATA(inode)); +} + +static const struct file_operations ldlm_granted_fops = { + .owner = THIS_MODULE, + .open = seq_granted_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* HAVE_SERVER_SUPPORT */ + +int ldlm_proc_setup(void) +{ + int rc; + struct lprocfs_vars list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, +#ifdef HAVE_SERVER_SUPPORT + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, +#endif + { NULL }}; + ENTRY; + LASSERT(ldlm_ns_proc_dir == NULL); + + ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(ldlm_type_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_type_proc_dir); + GOTO(err, rc); + } + + ldlm_ns_proc_dir = lprocfs_register("namespaces", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_ns_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_ns_proc_dir); + GOTO(err_type, rc); + } + + ldlm_svc_proc_dir = lprocfs_register("services", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_svc_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_svc_proc_dir); + GOTO(err_ns, rc); + } + + rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); + if (rc != 0) { + CERROR("LProcFS failed in ldlm-init\n"); + GOTO(err_svc, rc); + } + + RETURN(0); + +err_svc: + lprocfs_remove(&ldlm_svc_proc_dir); +err_ns: + lprocfs_remove(&ldlm_ns_proc_dir); +err_type: + lprocfs_remove(&ldlm_type_proc_dir); +err: + ldlm_svc_proc_dir = NULL; + RETURN(rc); +} + +void ldlm_proc_cleanup(void) +{ + if (ldlm_svc_proc_dir) + lprocfs_remove(&ldlm_svc_proc_dir); + + if (ldlm_ns_proc_dir) 
+ lprocfs_remove(&ldlm_ns_proc_dir);
+
+ if (ldlm_type_proc_dir)
+ lprocfs_remove(&ldlm_type_proc_dir);
+}
+
+static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace,
+ ns_kobj);
+ __u64 res = 0;
+ struct cfs_hash_bd bd;
+ int i;
+
+ /* result is not strictly consistent */
+ cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i)
+ res += cfs_hash_bd_count_get(&bd);
+ return sprintf(buf, "%lld\n", res);
+}
+LUSTRE_RO_ATTR(resource_count);
+
+static ssize_t lock_count_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace,
+ ns_kobj);
+ __u64 locks;
+
+ locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS,
+ LPROCFS_FIELDS_FLAGS_SUM);
+ return sprintf(buf, "%lld\n", locks);
+}
+LUSTRE_RO_ATTR(lock_count);
+
+static ssize_t lock_unused_count_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace,
+ ns_kobj);
+
+ return sprintf(buf, "%d\n", ns->ns_nr_unused);
+}
+LUSTRE_RO_ATTR(lock_unused_count);
+
+static ssize_t lru_size_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace,
+ ns_kobj);
+ __u32 *nr = &ns->ns_max_unused;
+
+ if (ns_connect_lru_resize(ns))
+ nr = &ns->ns_nr_unused;
+ return sprintf(buf, "%u\n", *nr);
+}
+
+static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr,
+ const char *buffer, size_t count)
+{
+ struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace,
+ ns_kobj);
+ unsigned long tmp;
+ int lru_resize;
+ int err;
+
+ if (strncmp(buffer, "clear", 5) == 0) {
+ CDEBUG(D_DLMTRACE,
+ "dropping all unused locks from namespace %s\n",
+ ldlm_ns_name(ns));
+ if (ns_connect_lru_resize(ns)) {
+ /* Try to cancel all @ns_nr_unused locks. */
+ ldlm_cancel_lru(ns, ns->ns_nr_unused, 0,
+ LDLM_LRU_FLAG_PASSED |
+ LDLM_LRU_FLAG_CLEANUP);
+ } else {
+ tmp = ns->ns_max_unused;
+ ns->ns_max_unused = 0;
+ ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED |
+ LDLM_LRU_FLAG_CLEANUP);
+ ns->ns_max_unused = tmp;
+ }
+ return count;
+ }
+
+ err = kstrtoul(buffer, 10, &tmp);
+ if (err != 0) {
+ CERROR("lru_size: invalid value written\n");
+ return -EINVAL;
+ }
+ lru_resize = (tmp == 0);
+
+ if (ns_connect_lru_resize(ns)) {
+ if (!lru_resize)
+ ns->ns_max_unused = (unsigned int)tmp;
+
+ if (tmp > ns->ns_nr_unused)
+ tmp = ns->ns_nr_unused;
+ tmp = ns->ns_nr_unused - tmp;
+
+ CDEBUG(D_DLMTRACE,
+ "changing namespace %s unused locks from %u to %u\n",
+ ldlm_ns_name(ns), ns->ns_nr_unused,
+ (unsigned int)tmp);
+ ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_LRU_FLAG_PASSED);
+
+ if (!lru_resize) {
+ CDEBUG(D_DLMTRACE,
+ "disable lru_resize for namespace %s\n",
+ ldlm_ns_name(ns));
+ ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE;
+ }
+ } else {
+ CDEBUG(D_DLMTRACE,
+ "changing namespace %s max_unused from %u to %u\n",
+ ldlm_ns_name(ns), ns->ns_max_unused,
+ (unsigned int)tmp);
+ ns->ns_max_unused = (unsigned int)tmp;
+ ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_LRU_FLAG_PASSED);
+
+ /* Make sure that LRU resize was originally supported before
+ * turning it on here. 
+ */ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + } + + return count; +} +LUSTRE_RW_ATTR(lru_size); + +static ssize_t lru_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%lld\n", ktime_to_ms(ns->ns_max_age)); +} + +static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + int scale = NSEC_PER_MSEC; + unsigned long long tmp; + char *buf; + int err; + + /* Did the user ask in seconds or milliseconds. Default is in ms */ + buf = strstr(buffer, "ms"); + if (!buf) { + buf = strchr(buffer, 's'); + if (buf) + scale = NSEC_PER_SEC; + } + + if (buf) + *buf = '\0'; + + err = kstrtoull(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_age = ktime_set(0, tmp * scale); + + return count; +} +LUSTRE_RW_ATTR(lru_max_age); + +static ssize_t early_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns_connect_cancelset(ns)); +} + +static ssize_t early_lock_cancel_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long supp = -1; + int rc; + + rc = kstrtoul(buffer, 10, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} +LUSTRE_RW_ATTR(early_lock_cancel); + +#ifdef HAVE_SERVER_SUPPORT +static ssize_t ctime_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_ctime_age_limit); +} + +static ssize_t ctime_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_ctime_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(ctime_age_limit); + +static ssize_t lock_timeouts_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%d\n", ns->ns_timeouts); +} +LUSTRE_RO_ATTR(lock_timeouts); + +static ssize_t max_nolock_bytes_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_nolock_size); +} + +static ssize_t max_nolock_bytes_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_nolock_size = tmp; + + return count; +} 
+LUSTRE_RW_ATTR(max_nolock_bytes); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_contention_time = tmp; + + return count; +} +LUSTRE_RW_ATTR(contention_seconds); + +static ssize_t contended_locks_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_contended_locks); +} + +static ssize_t contended_locks_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_contended_locks = tmp; + + return count; +} +LUSTRE_RW_ATTR(contended_locks); + +static ssize_t max_parallel_ast_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%u\n", ns->ns_max_parallel_ast); +} + +static ssize_t max_parallel_ast_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long tmp; + int err; + + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) + return -EINVAL; + + ns->ns_max_parallel_ast = tmp; + + return count; +} +LUSTRE_RW_ATTR(max_parallel_ast); + +#endif /* HAVE_SERVER_SUPPORT */ + +/* These are for namespaces in /sys/fs/lustre/ldlm/namespaces/ */ +static struct attribute *ldlm_ns_attrs[] = { + &lustre_attr_resource_count.attr, + &lustre_attr_lock_count.attr, + &lustre_attr_lock_unused_count.attr, + &lustre_attr_lru_size.attr, + &lustre_attr_lru_max_age.attr, + &lustre_attr_early_lock_cancel.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_attr_ctime_age_limit.attr, + &lustre_attr_lock_timeouts.attr, + &lustre_attr_max_nolock_bytes.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_contended_locks.attr, + &lustre_attr_max_parallel_ast.attr, +#endif + NULL, +}; + +static void ldlm_ns_release(struct kobject *kobj) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + complete(&ns->ns_kobj_unregister); +} + +static struct kobj_type ldlm_ns_ktype = { + .default_attrs = ldlm_ns_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ldlm_ns_release, +}; + +static void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) +{ + if (ns->ns_proc_dir_entry == NULL) + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + else + lprocfs_remove(&ns->ns_proc_dir_entry); + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +void ldlm_namespace_sysfs_unregister(struct ldlm_namespace *ns) +{ + kobject_put(&ns->ns_kobj); + wait_for_completion(&ns->ns_kobj_unregister); +} + +int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) +{ + int err; + + ns->ns_kobj.kset = ldlm_ns_kset; + 
init_completion(&ns->ns_kobj_unregister); + err = kobject_init_and_add(&ns->ns_kobj, &ldlm_ns_ktype, NULL, + "%s", ldlm_ns_name(ns)); + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (!ns->ns_stats) { + kobject_put(&ns->ns_kobj); + return -ENOMEM; + } + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + return err; +} + +static int ldlm_namespace_proc_register(struct ldlm_namespace *ns) +{ + struct proc_dir_entry *ns_pde; + + LASSERT(ns != NULL); + LASSERT(ns->ns_rs_hash != NULL); + + if (ns->ns_proc_dir_entry != NULL) { + ns_pde = ns->ns_proc_dir_entry; + } else { + ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir); + if (ns_pde == NULL) + return -ENOMEM; + ns->ns_proc_dir_entry = ns_pde; + } + + return 0; +} +#undef MAX_STRING_SIZE +#else /* CONFIG_PROC_FS */ + +#define ldlm_namespace_proc_unregister(ns) ({;}) +#define ldlm_namespace_proc_register(ns) ({0;}) + +#endif /* CONFIG_PROC_FS */ + +static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + unsigned val = 0; + unsigned i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned ldlm_res_hop_fid_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + hash += (val >> 5) + (val << 11); + } else { + val = fid_oid(&fid); + } + hash = hash_long(hash, hs->hs_bkt_bits); + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, val % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void +ldlm_res_hop_get_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +static struct cfs_hash_ops ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put = ldlm_res_hop_put +}; + +static struct cfs_hash_ops ldlm_ns_fid_hash_ops = { + .hs_hash = ldlm_res_hop_fid_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy 
= NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put = ldlm_res_hop_put +}; + +typedef struct ldlm_ns_hash_def { + enum ldlm_ns_type nsd_type; + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; + /** hash operations */ + struct cfs_hash_ops *nsd_hops; +} ldlm_ns_hash_def_t; + +static struct ldlm_ns_hash_def ldlm_ns_hash_defs[] = +{ + { + .nsd_type = LDLM_NS_TYPE_MDC, + .nsd_bkt_bits = 11, + .nsd_all_bits = 16, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MDT, + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OSC, + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OST, + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGC, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGT, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_UNKNOWN, + }, +}; + +/** + * Create and initialize new empty namespace. + */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + enum ldlm_side client, + enum ldlm_appetite apt, + enum ldlm_ns_type ns_type) +{ + struct ldlm_namespace *ns = NULL; + struct ldlm_ns_bucket *nsb; + struct ldlm_ns_hash_def *nsd; + struct cfs_hash_bd bd; + int idx; + int rc; + ENTRY; + + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + RETURN(NULL); + } + + for (idx = 0;;idx++) { + nsd = &ldlm_ns_hash_defs[idx]; + if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) { + CERROR("Unknown type %d for ns %s\n", ns_type, name); + GOTO(out_ref, NULL); + } + + if (nsd->nsd_type == ns_type) + break; + } + + OBD_ALLOC_PTR(ns); + if (!ns) + GOTO(out_ref, NULL); + + ns->ns_rs_hash = cfs_hash_create(name, + nsd->nsd_all_bits, nsd->nsd_all_bits, + nsd->nsd_bkt_bits, sizeof(*nsb), + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + nsd->nsd_hops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (ns->ns_rs_hash == NULL) + GOTO(out_ns, NULL); + + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) { + nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + nsb->nsb_reclaim_start = 0; + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + ns->ns_reclaim_start = 0; + + rc = ldlm_namespace_sysfs_register(ns); + if (rc) { + CERROR("Can't initialize ns sysfs, rc %d\n", rc); + GOTO(out_hash, rc); + } + + rc = ldlm_namespace_proc_register(ns); + if (rc) { + CERROR("Can't initialize ns proc, rc %d\n", rc); + 
GOTO(out_sysfs, rc); + } + + idx = ldlm_namespace_nr_read(client); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("Can't initialize lock pool, rc %d\n", rc); + GOTO(out_proc, rc); + } + + ldlm_namespace_register(ns, client); + RETURN(ns); +out_proc: + ldlm_namespace_proc_unregister(ns); +out_sysfs: + ldlm_namespace_sysfs_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + ldlm_put_ref(); + RETURN(NULL); +} +EXPORT_SYMBOL(ldlm_namespace_new); + +extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. + */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_is_cleaned(lock)) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + ldlm_set_cleaned(lock); + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + ldlm_set_cbpending(lock); + ldlm_set_failed(lock); + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + ldlm_set_local_only(lock); + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... 
*/ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_flags & LDLM_FL_FAIL_LOC) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(4)); + set_current_state(TASK_RUNNING); + } + if (lock->l_completion_ast) + lock->l_completion_ast(lock, + LDLM_FL_FAILED, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_LOCAL); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + unlock_res(res); + LDLM_DEBUG(lock, "Freeing a lock still held by a " + "client node"); + ldlm_lock_cancel(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_converting, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("%s: namespace resource "DLDLMRES" (%p) refcount nonzero " + "(%d) after lock cleanup; forcing cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, + atomic_read(&res->lr_refcount) - 1); + + ldlm_resource_dump(D_ERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. + * + * Typically used during evictions when server notified client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. + */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, + &flags, 0); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, + NULL, 0); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. + */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + ENTRY; + + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + lwi = LWI_TIMEOUT(msecs_to_jiffies(obd_timeout * + MSEC_PER_SEC) / 4, NULL, NULL); + + rc = l_wait_event(ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, &lwi); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... 
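 *
 * Illustrative note: when "force" is set, the wait above is bounded to a
 * quarter of obd_timeout and, on -ETIMEDOUT, retried via the force_wait
 * label until ns_bref finally drops to zero. With a hypothetical
 * obd_timeout of 100 seconds, each pass therefore waits at most 25
 * seconds before logging and looping again.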
*/ + if (force && rc == -ETIMEDOUT) { + LCONSOLE_ERROR("Forced cleanup waiting for %s " + "namespace with %d resources in use, " + "(rc=%d)\n", ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + GOTO(force_wait, rc); + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace " + "with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + RETURN(ELDLM_NAMESPACE_EXISTS); + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + RETURN(ELDLM_OK); +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make inaccessible for potential + * users like pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + ENTRY; + if (!ns) { + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are gaurenteed all reference will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_prior); + +/** + * Performs freeing memory structures related to \a ns. This is only done + * when ldlm_namespce_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + ENTRY; + if (!ns) { + EXIT; + return; + } + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. */ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_proc_unregister(ns); + ldlm_namespace_sysfs_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); + EXIT; +} +EXPORT_SYMBOL(ldlm_namespace_free_post); + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). + * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. 
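 *
 * Illustrative note: ldlm_namespace_free() below is just the convenience
 * wrapper that runs the two phases back to back. A caller that has to
 * drop cl_sem (or other lprocfs users) in between performs the phases
 * separately, in this order:
 *
 *	ldlm_namespace_free_prior(ns, imp, force);
 *	... release cl_sem / lprocfs users here ...
 *	ldlm_namespace_free_post(ns);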
+ */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} + +/* This is only for callers that care about refcount */ +static int ldlm_namespace_get_return(struct ldlm_namespace *ns) +{ + return atomic_inc_return(&ns->ns_bref); +} + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client)); + ldlm_namespace_nr_inc(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + ldlm_namespace_nr_dec(client); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, + enum ldlm_side client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, + ldlm_namespace_inactive_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) +{ + struct ldlm_resource *res; + int idx; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); + if (res == NULL) + return NULL; + + if (ldlm_type == LDLM_EXTENT) { + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; + } + /* Initialize interval trees for each lock mode. 
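 * (Illustrative aside: tree index idx serves the lock mode whose bit is
 * 1 << idx, so idx 0 covers mode bit 0x1 and idx 3 covers 0x8.)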
*/ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + } + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_converting); + INIT_LIST_HEAD(&res->lr_waiting); + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* Since LVB init can be delayed now, there is no longer need to + * immediatelly acquire mutex here. */ + mutex_init(&res->lr_lvb_mutex); + res->lr_lvb_initialized = false; + + return res; +} + +/** + * Return a reference to resource with given name, creating it if necessary. + * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or NULL + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, enum ldlm_type type, + int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res = NULL; + struct cfs_hash_bd bd; + __u64 version; + int ns_refcount = 0; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + GOTO(found, res); + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return ERR_PTR(-ENOENT); + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(type); + if (res == NULL) + return ERR_PTR(-ENOMEM); + + res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + res->lr_name = *name; + res->lr_type = type; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); +found: + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return res; + } + /* We won! Let's add the resource. */ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (cfs_hash_bd_count_get(&bd) == 1) + ns_refcount = ldlm_namespace_get_return(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + + /* Let's see if we happened to be the very first resource in this + * namespace. If so, and this is a client namespace, we need to move + * the namespace into the active namespaces list to be patrolled by + * the ldlm_poold. 
*/ + if (ns_is_client(ns) && ns_refcount == 1) { + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + } + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_converting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. */ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct cfs_hash_bd bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + check_res_locked(res); + + LDLM_DEBUG(lock, "About to add this lock"); + + if (ldlm_is_destroyed(lock)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + list_add_tail(&lock->l_res_link, head); +} + +/** + * Insert a lock into resource after specified lock. + * + * Obtain resource description from the lock we are inserting after. 
+ */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + struct ldlm_resource *res = original->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(new, "About to insert this lock after %p: ", original); + + if (ldlm_is_destroyed(new)) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + goto out; + } + + LASSERT(list_empty(&new->l_res_link)); + + list_add(&new->l_res_link, &original->l_res_link); + out:; +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + if (type == LDLM_IBITS || type == LDLM_PLAIN) + ldlm_unlink_lock_skiplist(lock); + else if (type == LDLM_EXTENT) + ldlm_extent_unlink_lock(lock); + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. + */ +void ldlm_dump_all_namespaces(enum ldlm_side client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} + +static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. + */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (cfs_time_before(cfs_time_current(), ns->ns_next_dump)) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level, 0); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = cfs_time_shift(10); + spin_unlock(&ns->ns_lock); +} + +/** + * Print information about all locks in this resource to debug log. 
+ */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + CLASSERT(RES_NAME_SIZE == 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n", + PLDLMRES(res), res, atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, "only dump %d granted locks to " + "avoid DDOS.\n", granted); + break; + } + } + } + if (!list_empty(&res->lr_converting)) { + CDEBUG(level, "Converting locks:\n"); + list_for_each_entry(lock, &res->lr_converting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} +EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c new file mode 100644 index 0000000000000..6da6b5956ab4e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -0,0 +1,383 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + ENTRY; + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + RETURN_EXIT; + + if (lld->lld_it) { + ll_intent_release(lld->lld_it); + OBD_FREE(lld->lld_it, sizeof(*lld->lld_it)); + } + + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); + + EXIT; +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. 
+ * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). */ +#ifdef HAVE_D_COMPARE_7ARGS +static int ll_dcompare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, + const struct qstr *name) +#elif defined(HAVE_D_COMPARE_5ARGS) +static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +#elif defined(HAVE_D_COMPARE_4ARGS) +static int ll_dcompare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +#else +static int ll_dcompare(struct dentry *parent, struct qstr *d_name, + struct qstr *name) +#endif +{ +#if !defined(HAVE_D_COMPARE_7ARGS) && !defined(HAVE_D_COMPARE_5ARGS) && !defined(HAVE_D_COMPARE_4ARGS) + /* XXX: (ugh !) d_name must be in-dentry structure */ + struct dentry *dentry = container_of(d_name, struct dentry, d_name); + unsigned int len = d_name->len; + const char *str = d_name->name; +#endif + ENTRY; + + if (len != name->len) + RETURN(1); + + if (memcmp(str, name->name, len)) + RETURN(1); + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + ll_d_count(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + RETURN(0); + + if (d_lustre_invalid(dentry)) + RETURN(1); + + RETURN(0); +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(HAVE_D_DELETE_CONST struct dentry *de) +{ + ENTRY; + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + d_unhashed((struct dentry *)de) ? "" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + +#ifdef HAVE_DCACHE_LOCK + LASSERT(ll_d_count(de) == 0); +#else + /* kernel >= 2.6.38 last refcount is decreased after this function. 
*/ + LASSERT(ll_d_count(de) == 1); +#endif + + if (d_lustre_invalid((struct dentry *)de)) + RETURN(1); + RETURN(0); +} + +int ll_d_init(struct dentry *de) +{ + ENTRY; + LASSERT(de != NULL); + + CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + ll_d_count(de)); + + if (de->d_fsdata == NULL) { + struct ll_dentry_data *lld; + + OBD_ALLOC_PTR(lld); + if (likely(lld != NULL)) { + spin_lock(&de->d_lock); + if (likely(de->d_fsdata == NULL)) { +#ifdef HAVE_DCACHE_LOCK + /* kernel >= 2.6.38 d_op is set in d_alloc() */ + de->d_op = &ll_d_ops; + smp_mb(); +#endif + de->d_fsdata = lld; + __d_lustre_invalidate(de); + } else { + OBD_FREE_PTR(lld); + } + spin_unlock(&de->d_lock); + } else { + RETURN(-ENOMEM); + } + } + LASSERT(de->d_op == &ll_d_ops); + + RETURN(0); +} + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n", + handle.cookie, it); + ldlm_lock_decref(&handle, it->it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->it_lock_mode = 0; + if (it->it_remote_lock_mode != 0) { + handle.cookie = it->it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing remote lock with cookie" + "%#llx from it %p\n", handle.cookie, it); + ldlm_lock_decref(&handle, + it->it_remote_lock_mode); + it->it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + ENTRY; + + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->it_request); /* ll_file_open */ + + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->it_request); + + it->it_disposition = 0; + it->it_request = NULL; + EXIT; +} + +void ll_invalidate_aliases(struct inode *inode) +{ + struct dentry *dentry; + DECLARE_LL_D_HLIST_NODE_PTR(p); + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for inode "DFID"(%p) invalid\n", + PFID(ll_inode2fid(inode)), inode); + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry) { + CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p " + "inode %p flags %d\n", dentry->d_name.len, + dentry->d_name.name, dentry, dentry->d_parent, + dentry->d_inode, dentry->d_flags); + + d_lustre_invalidate(dentry, 0); + } + ll_unlock_dcache(inode); + + EXIT; +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct dentry *de) +{ + int rc = 0; + ENTRY; + + if (!request) + RETURN(0); + + if (it_disposition(it, DISP_LOOKUP_NEG)) + RETURN(-ENOENT); + + rc = ll_prep_inode(&de->d_inode, request, NULL, it); + + RETURN(rc); +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) +{ + LASSERT(it != NULL); + LASSERT(dentry != NULL); + + if (it->it_lock_mode && dentry->d_inode != NULL) { + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == 
IT_GETATTR) { + /* on 2.6 there are situation when several lookups and + * revalidations may be requested during single operation. + * therefore, we don't release intent here -bzzz */ + ll_intent_drop_lock(it); + } +} + +static int ll_revalidate_dentry(struct dentry *dentry, + unsigned int lookup_flags) +{ + struct inode *dir = dentry->d_parent->d_inode; + + /* If this is intermediate component path lookup and we were able to get + * to this dentry, then its lock has not been revoked and the + * path component is valid. */ + if (lookup_flags & (LOOKUP_CONTINUE | LOOKUP_PARENT)) + return 1; + + /* Symlink - always valid as long as the dentry was found */ +#ifdef HAVE_IOP_GET_LINK + if (dentry->d_inode && dentry->d_inode->i_op->get_link) +#else + if (dentry->d_inode && dentry->d_inode->i_op->follow_link) +#endif + return 1; + + /* + * VFS warns us that this is the second go around and previous + * operation failed (most likely open|creat), so this time + * we better talk to the server via the lookup path by name, + * not by fid. + */ + if (lookup_flags & LOOKUP_REVAL) + return 0; + +#ifndef HAVE_DCACHE_LOCK + if (lookup_flags & LOOKUP_RCU) + return -ECHILD; +#endif + + if (dentry_may_statahead(dir, dentry)) + ll_statahead(dir, &dentry, dentry->d_inode == NULL); + + return 1; +} + +/* + * Always trust cached dentries. Update statahead window if necessary. + */ +#ifdef HAVE_IOP_ATOMIC_OPEN +static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) +{ + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s, flags=%u\n", + dentry->d_name.name, flags); + + rc = ll_revalidate_dentry(dentry, flags); + RETURN(rc); +} +#else +static int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd) +{ + int rc; + ENTRY; + + /* + * this is normally called from NFS export, and we don't know whether + * this is the last component. + */ + if (nd == NULL) + RETURN(1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s, flags=%u\n", + dentry->d_name.name, nd->flags); + + rc = ll_revalidate_dentry(dentry, nd->flags); + RETURN(rc); +} +#endif + +const struct dentry_operations ll_d_ops = { + .d_revalidate = ll_revalidate_nd, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_compare = ll_dcompare, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c new file mode 100644 index 0000000000000..6e987fe2f7387 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -0,0 +1,1846 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include // for wait_on_buffer +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" + +/* + * (new) readdir implementation overview. + * + * Original lustre readdir implementation cached exact copy of raw directory + * pages on the client. These pages were indexed in client page cache by + * logical offset in the directory file. This design, while very simple and + * intuitive had some inherent problems: + * + * . it implies that byte offset to the directory entry serves as a + * telldir(3)/seekdir(3) cookie, but that offset is not stable: in + * ext3/htree directory entries may move due to splits, and more + * importantly, + * + * . it is incompatible with the design of split directories for cmd3, + * that assumes that names are distributed across nodes based on their + * hash, and so readdir should be done in hash order. + * + * New readdir implementation does readdir in hash order, and uses hash of a + * file name as a telldir/seekdir cookie. This led to number of complications: + * + * . hash is not unique, so it cannot be used to index cached directory + * pages on the client (note, that it requires a whole pageful of hash + * collided entries to cause two pages to have identical hashes); + * + * . hash is not unique, so it cannot, strictly speaking, be used as an + * entry cookie. ext3/htree has the same problem and lustre implementation + * mimics their solution: seekdir(hash) positions directory at the first + * entry with the given hash. + * + * Client side. + * + * 0. caching + * + * Client caches directory pages using hash of the first entry as an index. As + * noted above hash is not unique, so this solution doesn't work as is: + * special processing is needed for "page hash chains" (i.e., sequences of + * pages filled with entries all having the same hash value). + * + * First, such chains have to be detected. To this end, server returns to the + * client the hash of the first entry on the page next to one returned. When + * client detects that this hash is the same as hash of the first entry on the + * returned page, page hash collision has to be handled. Pages in the + * hash chain, except first one, are termed "overflow pages". + * + * Solution to index uniqueness problem is to not cache overflow + * pages. Instead, when page hash collision is detected, all overflow pages + * from emerging chain are immediately requested from the server and placed in + * a special data structure (struct ll_dir_chain). This data structure is used + * by ll_readdir() to process entries from overflow pages. When readdir + * invocation finishes, overflow pages are discarded. If page hash collision + * chain weren't completely processed, next call to readdir will again detect + * page hash collision, again read overflow pages in, process next portion of + * entries and again discard the pages. This is not as wasteful as it looks, + * because, given reasonable hash, page hash collisions are extremely rare. + * + * 1. directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. 
+ * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * mdc_adjust_dirpages(). + * + */ +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, struct ll_dir_chain *chain) +{ + struct md_callback cb_op; + struct page *page; + int rc; + + cb_op.md_blocking_ast = ll_md_blocking_ast; + rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); + if (rc != 0) + return ERR_PTR(rc); + + return page; +} + +void ll_release_page(struct inode *inode, struct page *page, + bool remove) +{ + kunmap(page); + + /* Always remove the page for striped dir, because the page is + * built from temporarily in LMV layer */ + if (inode != NULL && S_ISDIR(inode->i_mode) && + ll_i2info(inode)->lli_lsm_md != NULL) { + __free_page(page); + return; + } + + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + put_page(page); +} + +/** + * return IF_* type for given lu_dirent entry. + * IF_* flag shld be converted to particular OS file type in + * platform llite module. + */ +static u16 ll_dirent_type_get(struct lu_dirent *ent) +{ + u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + + len = le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = IFTODT(le16_to_cpu(lt->lt_type)); + } + + return type; +} + +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + struct dir_context *ctx) +{ +#else +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + void *cookie, filldir_t filldir) +{ +#endif + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = *ppos; + bool is_api32 = ll_need_32bit_api(sbi); + bool is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + struct page *page; + struct ll_dir_chain chain; + bool done = false; + int rc = 0; + ENTRY; + + ll_dir_chain_init(&chain); + + page = ll_get_dir_page(inode, op_data, pos, &chain); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + __u64 hash; + __u64 next; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + + hash = MDS_DIR_END_OFF; + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) /* Skip until we find target hash */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) /* Skip dummy record */ + continue; + + if (is_api32 && is_hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, is_api32); + type = ll_dirent_type_get(ent); + /* For ll_nfs_get_name_filldir(), it will try to access + * 'ent' through 'lde_name', so the parameter 'name' + * for 'filldir()' must be part of the 'ent'. 
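 *
 * Illustrative note: lhash is also what userspace sees as its
 * telldir()/seekdir() cookie. When the client uses 64-bit hashes but the
 * caller is a 32-bit API process, only the upper half is exposed, e.g. a
 * directory hash of 0x12345678abcdef00 becomes cookie 0x12345678.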
*/ +#ifdef HAVE_DIR_CONTEXT + ctx->pos = lhash; + done = !dir_emit(ctx, ent->lde_name, namelen, ino, + type); +#else + done = filldir(cookie, ent->lde_name, namelen, lhash, + ino, type); +#endif + } + + if (done) { + pos = hash; + ll_release_page(inode, page, false); + break; + } + + next = le64_to_cpu(dp->ldp_hash_end); + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(inode, page, false); + } else { + /* + * Normal case: continue to the next + * page. + */ + ll_release_page(inode, page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, op_data, pos, + &chain); + } + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + *ppos = pos; +#endif + ll_dir_chain_fini(&chain); + RETURN(rc); +} + +#ifdef HAVE_DIR_CONTEXT +static int ll_iterate(struct file *filp, struct dir_context *ctx) +#else +static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) +#endif +{ + struct inode *inode = file_inode(filp); + struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + int api32 = ll_need_32bit_api(sbi); + struct md_op_data *op_data; + __u64 pos; + int rc; + ENTRY; + + if (lfd != NULL) + pos = lfd->lfd_pos; + else + pos = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos/size" + "%lu/%llu 32bit_api %d\n", PFID(ll_inode2fid(inode)), + inode, (unsigned long)pos, i_size_read(inode), api32); + + if (pos == MDS_DIR_END_OFF) + /* + * end-of-file. + */ + GOTO(out, rc = 0); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + if (unlikely(op_data->op_mea1 != NULL)) { + /* This is only needed for striped dir to fill .., + * see lmv_read_entry */ + if (file_dentry(filp)->d_parent != NULL && + file_dentry(filp)->d_parent->d_inode != NULL) { + __u64 ibits = MDS_INODELOCK_UPDATE; + struct inode *parent = + file_dentry(filp)->d_parent->d_inode; + + if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) + op_data->op_fid3 = *ll_inode2fid(parent); + } + + /* If it can not find in cache, do lookup .. 
on the master + * object */ + if (fid_is_zero(&op_data->op_fid3)) { + rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); + if (rc != 0) { + ll_finish_md_op_data(op_data); + RETURN(rc); + } + } + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; + rc = ll_dir_read(inode, &pos, op_data, ctx); + pos = ctx->pos; +#else + rc = ll_dir_read(inode, &pos, op_data, cookie, filldir); +#endif + if (lfd != NULL) + lfd->lfd_pos = pos; + + if (pos == MDS_DIR_END_OFF) { + if (api32) + pos = LL_DIR_END_OFF_32BIT; + else + pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + pos = pos >> 32; + } +#ifdef HAVE_DIR_CONTEXT + ctx->pos = pos; +#else + filp->f_pos = pos; +#endif + ll_finish_md_op_data(op_data); + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); + + RETURN(rc); +} + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +static int ll_send_mgc_param(struct obd_export *mgc, char *string) +{ + struct mgs_send_param *msp; + int rc = 0; + + OBD_ALLOC_PTR(msp); + if (!msp) + return -ENOMEM; + + strlcpy(msp->mgs_param, string, sizeof(msp->mgs_param)); + rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, + sizeof(struct mgs_send_param), msp, NULL); + if (rc) + CERROR("Failed to set parameter: %d\n", rc); + OBD_FREE_PTR(msp); + + return rc; +} +#endif + +/** + * Create striped directory with specified stripe(@lump) + * + * \param[in] dparent the parent of the directory. + * \param[in] lump the specified stripes. + * \param[in] dirname the name of the directory. + * \param[in] mode the specified mode of the directory. + * + * \retval =0 if striped directory is being created successfully. + * <0 if the creation is failed. + */ +static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, + const char *dirname, umode_t mode) +{ + struct inode *parent = dparent->d_inode; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(parent); + struct inode *inode = NULL; + struct dentry dentry = { + .d_parent = dparent, + .d_name = { + .name = dirname, + .len = strlen(dirname), + .hash = ll_full_name_hash(dparent, dirname, + strlen(dirname)), + }, + }; + int err; + ENTRY; + + if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s " + "stripe_offset %d, stripe_count: %u\n", + PFID(ll_inode2fid(parent)), parent, dirname, + (int)lump->lum_stripe_offset, lump->lum_stripe_count); + + if (lump->lum_stripe_count > 1 && + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE)) + RETURN(-EINVAL); + + if (IS_DEADDIR(parent) && + !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) + RETURN(-ENOENT); + + if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md(lump); + + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, + strlen(dirname), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + /* selinux_dentry_init_security() uses dentry->d_parent and name + * to determine the security context for the file. So our fake + * dentry should be real enough for this purpose. 
*/ + err = ll_dentry_init_security(&dentry, mode, &dentry.d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size); + if (err < 0) + GOTO(out_op_data, err); + } + + op_data->op_cli_flags |= CLI_SET_MEA; + err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), 0, &request); + if (err) + GOTO(out_request, err); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, request, parent->i_sb, NULL); + if (err) + GOTO(out_inode, err); + + dentry.d_inode = inode; + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + inode_lock(inode); + err = security_inode_notifysecctx(inode, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + inode_unlock(inode); + } else { + err = ll_inode_init_security(&dentry, inode, parent); + } + if (err) + GOTO(out_inode, err); + +out_inode: + if (inode != NULL) + iput(inode); +out_request: + ptlrpc_req_finished(request); +out_op_data: + ll_finish_md_op_data(op_data); + + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; +#endif + int lum_size; + ENTRY; + + if (lump != NULL) { + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(lump); + lum_size = sizeof(struct lov_user_md_v1); + break; + } + case LOV_USER_MAGIC_V3: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lump); + lum_size = sizeof(struct lov_user_md_v3); + break; + } + case LOV_USER_MAGIC_COMP_V1: { + if (lump->lmm_magic != + cpu_to_le32(LOV_USER_MAGIC_COMP_V1)) + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lump); + lum_size = le32_to_cpu( + ((struct lov_comp_md_v1 *)lump)->lcm_size); + break; + } + case LMV_USER_MAGIC: { + if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md( + (struct lmv_user_md *)lump); + lum_size = sizeof(struct lmv_user_md); + break; + } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + } + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + /* + * 2.9 server has stored filesystem default stripe in ROOT xattr, + * and it's stored into system config for backward compatibility. 
+ * + * In the following we use the fact that LOV_USER_MAGIC_V1 and + * LOV_USER_MAGIC_V3 have the same initial fields so we do not + * need the make the distiction between the 2 versions + */ + if (set_default && mgc->u.cli.cl_mgc_mgsexp && + (lump == NULL || + le32_to_cpu(lump->lmm_magic) == LOV_USER_MAGIC_V1 || + le32_to_cpu(lump->lmm_magic) == LOV_USER_MAGIC_V3)) { + char *param = NULL; + char *buf; + + OBD_ALLOC(param, MGS_PARAM_MAXLEN); + if (param == NULL) + GOTO(end, rc = -ENOMEM); + + buf = param; + /* Get fsname and assume devname to be -MDT0000. */ + ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); + strcat(buf, "-MDT0000.lov"); + buf += strlen(buf); + + /* Set root stripesize */ + snprintf(buf, MGS_PARAM_MAXLEN, ".stripesize=%u", + lump ? le32_to_cpu(lump->lmm_stripe_size) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + GOTO(end, rc); + + /* Set root stripecount */ + snprintf(buf, MGS_PARAM_MAXLEN, ".stripecount=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + GOTO(end, rc); + + /* Set root stripeoffset */ + snprintf(buf, MGS_PARAM_MAXLEN, ".stripeoffset=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_offset) : + (typeof(lump->lmm_stripe_offset))(-1)); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + +end: + if (param != NULL) + OBD_FREE(param, MGS_PARAM_MAXLEN); + } +#endif + RETURN(rc); +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int rc, lmm_size; + struct md_op_data *op_data; + ENTRY; + + rc = ll_get_default_mdsize(sbi, &lmm_size); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, lmm_size, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode " + DFID": rc %d\n", PFID(ll_inode2fid(inode)), rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmm_size = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmm_size == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmm_size); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
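 *
 * Illustrative note: the guards below of the form
 * "if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))" test host endianness rather
 * than the on-wire magic; the condition only holds on a big-endian host,
 * so the swab helpers run exactly when host byte order differs from the
 * little-endian wire format.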
+ */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + case LOV_MAGIC_COMP_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmm); + break; + case LMV_MAGIC_V1: + if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) + lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); + break; + case LMV_USER_MAGIC: + if (LMV_USER_MAGIC != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); + break; + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + return rc; +} + +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) +{ + struct md_op_data *op_data; + int rc; + int mdt_index; + ENTRY; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + op_data->op_flags |= MF_GET_MDT_IDX; + op_data->op_fid1 = *fid; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdt_index = op_data->op_mds; + OBD_FREE_PTR(op_data); + if (rc < 0) + RETURN(rc); + + RETURN(mdt_index); +} + +/* + * Get MDT index for the inode. + */ +int ll_get_mdt_idx(struct inode *inode) +{ + return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); +} + +/** + * Generic handler to do any pre-copy work. + * + * It sends a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. */ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (" + "%#llx) could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. 
*/ + copy->hc_data_version = data_version; + } + +progress: + /* On error, the request should be considered as completed */ + if (hpk.hpk_errval > 0) + hpk.hpk_flags |= HP_FLAG_COMPLETED; + + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? rc : rc2); +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc = 0; + int rc2; + ENTRY; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. " + "Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store in the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. */ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. " + "File content was changed during archiving. " + DFID", start:%#llx current:%#llx\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. 
+ * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + rc = -EBUSY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + } + +progress: + rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + /* Return first error */ + RETURN(rc != 0 ? rc : rc2); +} + + +static int copy_and_ioctl(int cmd, struct obd_export *exp, + const void __user *data, size_t size) +{ + void *copy; + int rc; + + OBD_ALLOC(copy, size); + if (copy == NULL) + return -ENOMEM; + + if (copy_from_user(copy, data, size)) { + rc = -EFAULT; + goto out; + } + + rc = obd_iocontrol(cmd, exp, size, copy, NULL); +out: + OBD_FREE(copy, size); + + return rc; +} + +static int check_owner(int type, int id) +{ + switch (type) { + case USRQUOTA: + if (!uid_eq(current_euid(), make_kuid(&init_user_ns, id))) + return -EPERM; + break; + case GRPQUOTA: + if (!in_egroup_p(make_kgid(&init_user_ns, id))) + return -EPERM; + break; + case PRJQUOTA: + break; + } + return 0; +} + +static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) +{ + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + ENTRY; + + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + break; + case Q_GETQUOTA: + if (check_owner(type, id) && + (!cfs_capable(CFS_CAP_SYS_ADMIN))) + RETURN(-EPERM); + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + RETURN(-ENOTTY); + } + + if (valid != QC_GENERAL) { + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA) + qctl->qc_cmd = Q_GETOQUOTA; + else + RETURN(-EINVAL); + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + RETURN(rc); + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + OBD_FREE_PTR(oqctl); + RETURN(rc); + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if (cmd == Q_GETQUOTA && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + + OBD_ALLOC_PTR(oqctl_tmp); + if (oqctl_tmp == NULL) + GOTO(out, rc = -ENOMEM); + + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + 
oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE_PTR(oqctl_tmp); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE_PTR(oqctl); + } + + RETURN(rc); +} + +/* This function tries to get a single name component, + * to send to the server. No actual path traversal involved, + * so we limit to NAME_MAX */ +static char *ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp; + + OBD_ALLOC(tmp, NAME_MAX + 1); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, NAME_MAX + 1); + if (len < 0) + ret = -ENOENT; + else if (len > NAME_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + OBD_FREE(tmp, NAME_MAX + 1); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) OBD_FREE(filename, NAME_MAX + 1); + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch(cmd) { + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; + char *buf = NULL; + char *filename; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc != 0) + RETURN(rc); + data = (void *)buf; + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(out_free, rc = -EINVAL); + } + + rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL); + if (rc < 0) { + CERROR("%s: lookup %.*s failed: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), namelen, + filename, rc); + GOTO(out_free, rc); + } +out_free: + OBD_FREE_LARGE(buf, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *buf = NULL; + char *filename; + int namelen = 0; + int lumlen = 0; + umode_t mode; + int len; + int rc; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc) + RETURN(rc); + + data = (void *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(lmv_out_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(lmv_out_free, rc = -EINVAL); + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if (lum->lum_magic != LMV_USER_MAGIC || + lumlen != sizeof(*lum)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + GOTO(lmv_out_free, rc = -EINVAL); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 50, 0) + mode = data->ioc_type != 0 ? 
data->ioc_type : S_IRWXUGO; +#else + mode = data->ioc_type; +#endif + rc = ll_dir_setdirstripe(dentry, lum, filename, mode); +lmv_out_free: + OBD_FREE_LARGE(buf, len); + RETURN(rc); + + } + case LL_IOC_LMV_SET_DEFAULT_STRIPE: { + struct lmv_user_md lum; + struct lmv_user_md __user *ulump = + (struct lmv_user_md __user *)arg; + int rc; + + if (copy_from_user(&lum, ulump, sizeof(lum))) + RETURN(-EFAULT); + + if (lum.lum_magic != LMV_USER_MAGIC) + RETURN(-EINVAL); + + rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); + + RETURN(rc); + } + case LL_IOC_LOV_SETSTRIPE_NEW: + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 __user *lumv1p = + (struct lov_user_md_v1 __user *)arg; + struct lov_user_md_v3 __user *lumv3p = + (struct lov_user_md_v3 __user *)arg; + + int set_default = 0; + + CLASSERT(sizeof(struct lov_user_md_v3) > + sizeof(struct lov_comp_md_v1)); + LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3.lmm_objects[0]) == + sizeof(lumv3p->lmm_objects[0])); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) + RETURN(-EFAULT); + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) + if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) + RETURN(-EFAULT); + + if (inode->i_sb->s_root == file_dentry(file)) + set_default = 1; + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1, set_default); + + RETURN(rc); + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md __user *ulmv = + (struct lmv_user_md __user *)arg; + struct lmv_user_md lum; + struct ptlrpc_request *request = NULL; + union lmv_mds_md *lmm = NULL; + int lmmsize; + u64 valid = 0; + struct lmv_user_md *tmp = NULL; + int mdt_index; + int lum_size; + int stripe_count; + int max_stripe_count; + int i; + int rc; + + if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) + RETURN(-EFAULT); + + max_stripe_count = lum.lum_stripe_count; + /* lum_magic will indicate which stripe the ioctl will like + * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC + * is for default LMV stripe */ + if (lum.lum_magic == LMV_MAGIC_V1) + valid |= OBD_MD_MEA; + else if (lum.lum_magic == LMV_USER_MAGIC) + valid |= OBD_MD_DEFAULT_MEA; + else + RETURN(-EINVAL); + + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, + valid); + if (rc != 0) + GOTO(finish_req, rc); + + /* Get default LMV EA */ + if (lum.lum_magic == LMV_USER_MAGIC) { + if (lmmsize > sizeof(*ulmv)) + GOTO(finish_req, rc = -EINVAL); + + if (copy_to_user(ulmv, lmm, lmmsize)) + GOTO(finish_req, rc = -EFAULT); + + GOTO(finish_req, rc); + } + + stripe_count = lmv_mds_md_stripe_count_get(lmm); + if (max_stripe_count < stripe_count) { + lum.lum_stripe_count = stripe_count; + if (copy_to_user(ulmv, &lum, sizeof(lum))) + GOTO(finish_req, rc = -EFAULT); + GOTO(finish_req, rc = -E2BIG); + } + + lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); + OBD_ALLOC(tmp, lum_size); + if (tmp == NULL) + GOTO(finish_req, rc = -ENOMEM); + + mdt_index = ll_get_mdt_idx(inode); + if (mdt_index < 0) + GOTO(out_tmp, rc = -ENOMEM); + + tmp->lum_magic = LMV_MAGIC_V1; + tmp->lum_stripe_count = 0; + tmp->lum_stripe_offset = mdt_index; + tmp->lum_hash_type = lmv_mds_md_hash_type_get(lmm); + for (i = 0; i < stripe_count; i++) { + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + + 
tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + tmp->lum_stripe_count++; + } + + if (copy_to_user(ulmv, tmp, lum_size)) + GOTO(out_tmp, rc = -EFAULT); +out_tmp: + OBD_FREE(tmp, lum_size); +finish_req: + ptlrpc_req_finished(request); + return rc; + } + + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + RETURN(-ENOTSUPP); + + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + namelen = strlen(filename); + if (namelen < 1) + GOTO(out_rmdir, rc = -EINVAL); + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + RETURN(rc); + } + case LL_IOC_LOV_SWAP_LAYOUTS: + RETURN(-EPERM); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + case LL_IOC_MDC_GETINFO: + case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct lov_user_md __user *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, + &request, 0); + } + + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } + + if (rc < 0) { + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO)) + GOTO(skip_lmm, rc = 0); + else + GOTO(out_req, rc); + } + + if (cmd == IOC_MDC_GETFILESTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE_NEW) { + lump = (struct lov_user_md __user *)arg; + } else { + struct lov_user_mds_data __user *lmdp; + lmdp = (struct lov_user_mds_data __user *)arg; + lump = &lmdp->lmd_lmm; + } + if (copy_to_user(lump, lmm, lmmsize)) { + if (copy_to_user(lump, lmm, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + skip_lmm: + if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { + struct lov_user_mds_data __user *lmdp; + lstat_t st = { 0 }; + + st.st_dev = inode->i_sb->s_dev; + st.st_mode = body->mbo_mode; + st.st_nlink = body->mbo_nlink; + st.st_uid = body->mbo_uid; + st.st_gid = body->mbo_gid; + st.st_rdev = body->mbo_rdev; + st.st_size = body->mbo_size; + st.st_blksize = PAGE_SIZE; + st.st_blocks = body->mbo_blocks; + st.st_atime = body->mbo_atime; + st.st_mtime = body->mbo_mtime; + st.st_ctime = body->mbo_ctime; + st.st_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & + LL_SBI_32BIT_API); + + lmdp = (struct lov_user_mds_data __user *)arg; + if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; + out_req: + ptlrpc_req_finished(request); + if (filename) + ll_putname(filename); + return rc; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl; + + OBD_ALLOC_PTR(qctl); + if (!qctl) + 
RETURN(-ENOMEM); + + if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) + GOTO(out_quotactl, rc = -EFAULT); + + rc = quotactl_ioctl(sbi, qctl); + + if (rc == 0 && + copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) + rc = -EFAULT; + + out_quotactl: + OBD_FREE_PTR(qctl); + RETURN(rc); + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_GETOBDCOUNT: { + u32 count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int __user *)arg, sizeof(int))) + RETURN(-EFAULT); + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count); + if (rc) { + CERROR("get target count failed: %d\n", rc); + RETURN(rc); + } + + if (copy_to_user((int __user *)arg, &count, sizeof(int))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_PATH2FID: + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + RETURN(0); + case LL_IOC_GET_CONNECT_FLAGS: { + RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, + (void __user *)arg)); + } + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (void __user *)arg)); + case LL_IOC_FID2MDTIDX: { + struct obd_export *exp = ll_i2mdexp(inode); + struct lu_fid fid; + __u32 index; + + if (copy_from_user(&fid, (const struct lu_fid __user *)arg, + sizeof(fid))) + RETURN(-EFAULT); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, + (__u32 __user *)&index); + if (rc != 0) + RETURN(rc); + + RETURN(index); + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + ssize_t totalsize; + + OBD_ALLOC_PTR(hur); + if (hur == NULL) + RETURN(-ENOMEM); + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void __user *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + RETURN(-EFAULT); + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + if (totalsize < 0) + RETURN(-E2BIG); + + /* Final size will be more than double totalsize */ + if (totalsize >= MDS_MAXREQSIZE / 3) + RETURN(-E2BIG); + + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + RETURN(-ENOMEM); + + /* Copy the whole struct */ + if (copy_from_user(hur, (void __user *)arg, totalsize)) + GOTO(out_hur, rc = -EFAULT); + + if (hur->hur_request.hr_action == HUA_RELEASE) { + const struct lu_fid *fid; + struct inode *f; + int i; + + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + fid = &hur->hur_user_item[i].hui_fid; + f = search_inode_for_lustre(inode->i_sb, fid); + if (IS_ERR(f)) { + rc = PTR_ERR(f); + break; + } + + rc = ll_hsm_release(f); + iput(f); + if (rc != 0) + break; + } + } else { + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + } + +out_hur: + OBD_FREE_LARGE(hur, totalsize); + + RETURN(rc); + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void __user *)arg, sizeof(hp))) + RETURN(-EFAULT); + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = 
obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + RETURN(rc); + } + case LL_IOC_HSM_CT_START: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, + sizeof(struct lustre_kernelcomm)); + RETURN(rc); + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_HSM_COPY_END: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char __user *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_end(inode->i_sb, copy); + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_MIGRATE: { + char *buf = NULL; + const char *filename; + int namelen = 0; + int len; + int rc; + int mdtidx; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc < 0) + RETURN(rc); + + data = (struct obd_ioctl_data *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(migrate_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + /* \0 is packed at the end of filename */ + if (namelen < 1 || namelen != strlen(filename) + 1) + GOTO(migrate_free, rc = -EINVAL); + + if (data->ioc_inllen2 != sizeof(mdtidx)) + GOTO(migrate_free, rc = -EINVAL); + mdtidx = *(int *)data->ioc_inlbuf2; + + rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); +migrate_free: + OBD_FREE_LARGE(buf, len); + + RETURN(rc); + } + case LL_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case LL_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + default: + RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, + (void __user *)arg)); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + ENTRY; + + inode_lock(inode); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + GOTO(out, ret); + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + GOTO(out, ret); + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + GOTO(out, ret); + +out: + inode_unlock(inode); + return ret; +} + +static int ll_dir_open(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_open(inode, file)); +} + +static int ll_dir_release(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_release(inode, file)); +} + +const struct 
file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = generic_read_dir, +#ifdef HAVE_DIR_CONTEXT + .iterate = ll_iterate, +#else + .readdir = ll_readdir, +#endif + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c new file mode 100644 index 0000000000000..1fefb8f63dc0c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -0,0 +1,4550 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +static int +ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); + +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken); + +static struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS); + if (fd == NULL) + return NULL; + + fd->fd_write_failed = false; + + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +/** + * Packs all the attributes into @op_data for the CLOSE rpc. 
+ */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + ENTRY; + + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET; + op_data->op_attr_blocks = inode->i_blocks; + op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); + op_data->op_handle = och->och_fh; + + if (och->och_flags & FMODE_WRITE && + ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED)) + /* For HSM: if inode data has been modified, pack it so that + * MDT can set data dirty flag in the archive. */ + op_data->op_bias |= MDS_DATA_MODIFIED; + + EXIT; +} + +/** + * Perform a close, possibly with a bias. + * The meaning of "data" depends on the value of "bias". + * + * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. + * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to + * swap layouts with. + */ +static int ll_close_inode_openhandle(struct inode *inode, + struct obd_client_handle *och, + enum mds_op_bias bias, void *data) +{ + struct obd_export *md_exp = ll_i2mdexp(inode); + const struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + if (class_exp2obd(md_exp) == NULL) { + CERROR("%s: invalid MDC connection handle closing "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + GOTO(out, rc = 0); + } + + OBD_ALLOC_PTR(op_data); + /* We leak openhandle and request here on error, but not much to be + * done in OOM case since app won't retry close on error either. 
*/ + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + ll_prepare_close(inode, op_data, och); + switch (bias) { + case MDS_CLOSE_LAYOUT_SWAP: + LASSERT(data != NULL); + op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP; + op_data->op_data_version = 0; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_fid2 = *ll_inode2fid(data); + break; + + case MDS_HSM_RELEASE: + LASSERT(data != NULL); + op_data->op_bias |= MDS_HSM_RELEASE; + op_data->op_data_version = *(__u64 *)data; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + break; + + default: + LASSERT(data == NULL); + break; + } + + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc != 0 && rc != -EINTR) + CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", + md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); + + if (rc == 0 && + op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) { + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) + rc = -EBUSY; + } + + ll_finish_md_op_data(op_data); + EXIT; +out: + + md_clear_open_replay_data(md_exp, och); + och->och_fh.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + + ptlrpc_req_finished(req); /* This is close request */ + return rc; +} + +int ll_md_real_close(struct inode *inode, fmode_t fmode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + if (fmode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (fmode & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(fmode & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount > 0) { + /* There are still users of this handle, so skip + * freeing it. */ + mutex_unlock(&lli->lli_och_mutex); + RETURN(0); + } + + och = *och_p; + *och_p = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (och != NULL) { + /* There might be a race and this handle may already + * be closed. */ + rc = ll_close_inode_openhandle(inode, och, 0, NULL); + } + + RETURN(rc); +} + +static int ll_md_close(struct inode *inode, struct file *file) +{ + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_OPEN }, + }; + __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_handle lockh; + enum ldlm_mode lockmode; + int rc = 0; + ENTRY; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid); + + if (fd->fd_lease_och != NULL) { + bool lease_broken; + + /* Usually the lease is not released when the + * application crashed, we need to release here. */ + rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken); + CDEBUG(rc ? 
D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n", + PFID(&lli->lli_fid), rc, lease_broken); + + fd->fd_lease_och = NULL; + } + + if (fd->fd_och != NULL) { + rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL); + fd->fd_och = NULL; + GOTO(out, rc); + } + + /* Let's see if we have good enough OPEN lock on the file and if + we can skip talking to MDS */ + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_omode & FMODE_WRITE) { + lockmode = LCK_CW; + LASSERT(lli->lli_open_fd_write_count); + lli->lli_open_fd_write_count--; + } else if (fd->fd_omode & FMODE_EXEC) { + lockmode = LCK_PR; + LASSERT(lli->lli_open_fd_exec_count); + lli->lli_open_fd_exec_count--; + } else { + lockmode = LCK_CR; + LASSERT(lli->lli_open_fd_read_count); + lli->lli_open_fd_read_count--; + } + mutex_unlock(&lli->lli_och_mutex); + + if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), + LDLM_IBITS, &policy, lockmode, &lockh)) + rc = ll_md_real_close(inode, fd->fd_omode); + +out: + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + + RETURN(rc); +} + +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ +int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + if (inode->i_sb->s_root != file_dentry(file)) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); + fd = LUSTRE_FPRIVATE(file); + LASSERT(fd != NULL); + + /* The last ref on @file, maybe not the the owner pid of statahead, + * because parent and child process can share the same file handle. 
*/ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + + if (inode->i_sb->s_root == file_dentry(file)) { + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + RETURN(0); + } + + if (!S_ISDIR(inode->i_mode)) { + if (lli->lli_clob != NULL) + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + rc = ll_md_close(inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + + RETURN(rc); +} + +static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, + struct lookup_intent *itp) +{ + struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); + struct dentry *parent = de->d_parent; + const char *name = NULL; + int len = 0; + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LASSERT(parent != NULL); + LASSERT(itp->it_flags & MDS_OPEN_BY_FID); + + /* if server supports open-by-fid, or file name is invalid, don't pack + * name in open request */ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && + lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { + name = de->d_name.name; + len = de->d_name.len; + } + + op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, + name, len, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + op_data->op_data = lmm; + op_data->op_data_size = lmmsize; + + rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, + &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with -ESTALE errors. + */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + GOTO(out, rc); + ll_release_openhandle(de, itp); + GOTO(out, rc); + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) + GOTO(out, rc = -ENOENT); + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + GOTO(out, rc); + } + + rc = ll_prep_inode(&de->d_inode, req, NULL, itp); + if (!rc && itp->it_lock_mode) + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL); + +out: + ptlrpc_req_finished(req); + ll_intent_drop_lock(itp); + + /* We did open by fid, but by the time we got to the server, + * the object disappeared. If this is a create, we cannot really + * tell the userspace that the file it was trying to create + * does not exist. Instead let's return -ESTALE, and the VFS will + * retry the create with LOOKUP_REVAL that we are going to catch + * in ll_revalidate_dentry() and use lookup then. 
+ */ + if (rc == -ENOENT && itp->it_op & IT_CREAT) + rc = -ESTALE; + + RETURN(rc); +} + +static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, + struct obd_client_handle *och) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); + och->och_fh = body->mbo_handle; + och->och_fid = body->mbo_fid1; + och->och_lease_handle.cookie = it->it_lock_handle; + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_flags = it->it_flags; + + return md_set_open_replay_data(md_exp, och, it); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file_inode(file); + ENTRY; + + LASSERT(!LUSTRE_FPRIVATE(file)); + + LASSERT(fd != NULL); + + if (och) { + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc != 0) + RETURN(rc); + } + + LUSTRE_FPRIVATE(file) = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + /* ll_cl_context initialize */ + rwlock_init(&fd->fd_lock); + INIT_LIST_HEAD(&fd->fd_lccs); + + RETURN(0); +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. + */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n", + PFID(ll_inode2fid(inode)), inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + fd = ll_file_data_get(); + if (fd == NULL) + GOTO(out_openerr, rc = -ENOMEM); + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) + ll_authorize_statahead(inode, fd); + + if (inode->i_sb->s_root == file_dentry(file)) { + LUSTRE_FPRIVATE(file) = fd; + RETURN(0); + } + + if (!it || !it->it_disposition) { + /* Convert f_flags into access mode. We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? 
*/ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + let's close it somehow. This will decref request. */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + + ll_release_openhandle(file_dentry(file), it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + } else { + LASSERT(*och_usecount == 0); + if (!it->it_disposition) { + struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry); + /* We cannot just request lock handle now, new ELC code + means that one of other OPEN locks for this file + could be cancelled, and since blocking ast handler + would attempt to grab och_mutex as well, that would + result in a deadlock */ + mutex_unlock(&lli->lli_och_mutex); + /* + * Normally called under two situations: + * 1. NFS export. + * 2. A race/condition on MDS resulting in no open + * handle to be returned from LOOKUP|OPEN request, + * for example if the target entry was a symlink. + * + * Only fetch MDS_OPEN_LOCK if this is in NFS path, + * marked by a bit set in ll_iget_for_nfs. Clear the + * bit so that it's not confusing later callers. + * + * NB; when ldd is NULL, it must have come via normal + * lookup path only, since ll_iget_for_nfs always calls + * ll_d_init(). + */ + if (ldd && ldd->lld_nfs_dentry) { + ldd->lld_nfs_dentry = 0; + it->it_flags |= MDS_OPEN_LOCK; + } + + /* + * Always specify MDS_OPEN_BY_FID because we don't want + * to get file with different fid. + */ + it->it_flags |= MDS_OPEN_BY_FID; + rc = ll_intent_file_open(file_dentry(file), NULL, 0, + it); + if (rc) + GOTO(out_openerr, rc); + + goto restart; + } + OBD_ALLOC(*och_p, sizeof (struct obd_client_handle)); + if (!*och_p) + GOTO(out_och_free, rc = -ENOMEM); + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? 
*/ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc != 0) + GOTO(out_och_free, rc); + + LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF), + "inode %p: disposition %x, status %d\n", inode, + it_disposition(it, ~0), it->it_status); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + GOTO(out_och_free, rc); + } + mutex_unlock(&lli->lli_och_mutex); + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + GOTO(out_och_free, rc); + + cl_lov_delay_create_clear(&file->f_flags); + GOTO(out_och_free, rc); + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof (struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); + } + + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: + /* do nothing */ + break; + } + RETURN(0); +} + +/** + * When setting a lease on a file, we take ownership of the lli_mds_*_och + * and save it as fd->fd_och so as to force client to reopen the file even + * if it has an open lock in cache already. + */ +static int ll_lease_och_acquire(struct inode *inode, struct file *file, + struct lustre_handle *old_handle) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle **och_p; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + /* Get the openhandle of the file */ + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) + GOTO(out_unlock, rc = -EBUSY); + + if (fd->fd_och == NULL) { + if (file->f_mode & FMODE_WRITE) { + LASSERT(lli->lli_mds_write_och != NULL); + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + LASSERT(lli->lli_mds_read_och != NULL); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + if (*och_usecount > 1) + GOTO(out_unlock, rc = -EBUSY); + + fd->fd_och = *och_p; + *och_usecount = 0; + *och_p = NULL; + } + + *old_handle = fd->fd_och->och_fh; + + EXIT; +out_unlock: + mutex_unlock(&lli->lli_och_mutex); + return rc; +} + +/** + * Release ownership on lli_mds_*_och when putting back a file lease. 
+ */ +static int ll_lease_och_release(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle **och_p; + struct obd_client_handle *old_och = NULL; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (file->f_mode & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + /* The file may have been open by another process (broken lease) so + * *och_p is not NULL. In this case we should simply increase usecount + * and close fd_och. + */ + if (*och_p != NULL) { + old_och = fd->fd_och; + (*och_usecount)++; + } else { + *och_p = fd->fd_och; + *och_usecount = 1; + } + fd->fd_och = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (old_och != NULL) + rc = ll_close_inode_openhandle(inode, old_och, 0, NULL); + + RETURN(rc); +} + +/** + * Acquire a lease and open the file. + */ +static struct obd_client_handle * +ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, + __u64 open_flags) +{ + struct lookup_intent it = { .it_op = IT_OPEN }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct lustre_handle old_handle = { 0 }; + struct obd_client_handle *och = NULL; + int rc; + int rc2; + ENTRY; + + if (fmode != FMODE_WRITE && fmode != FMODE_READ) + RETURN(ERR_PTR(-EINVAL)); + + if (file != NULL) { + if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) + RETURN(ERR_PTR(-EPERM)); + + rc = ll_lease_och_acquire(inode, file, &old_handle); + if (rc) + RETURN(ERR_PTR(rc)); + } + + OBD_ALLOC_PTR(och); + if (och == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + /* To tell the MDT this openhandle is from the same owner */ + op_data->op_handle = old_handle; + + it.it_flags = fmode | open_flags; + it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_lease_ast, + /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise + * it can be cancelled which may mislead applications that the lease is + * broken; + * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal + * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast + * doesn't deal with openhandle, so normal openhandle will be leaked. */ + LDLM_FL_NO_LRU | LDLM_FL_EXCL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc < 0) + GOTO(out_release_it, rc); + + if (it_disposition(&it, DISP_LOOKUP_NEG)) + GOTO(out_release_it, rc = -ENOENT); + + rc = it_open_error(DISP_OPEN_OPEN, &it); + if (rc) + GOTO(out_release_it, rc); + + LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); + ll_och_fill(sbi->ll_md_exp, &it, och); + + if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? 
*/ + GOTO(out_close, rc = -EOPNOTSUPP); + + /* already get lease, handle lease lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + if (it.it_lock_mode == 0 || + it.it_lock_bits != MDS_INODELOCK_OPEN) { + /* open lock must return for lease */ + CERROR(DFID "lease granted but no open lock, %d/%llu.\n", + PFID(ll_inode2fid(inode)), it.it_lock_mode, + it.it_lock_bits); + GOTO(out_close, rc = -EPROTO); + } + + ll_intent_release(&it); + RETURN(och); + +out_close: + /* Cancel open lock */ + if (it.it_lock_mode != 0) { + ldlm_lock_decref_and_cancel(&och->och_lease_handle, + it.it_lock_mode); + it.it_lock_mode = 0; + och->och_lease_handle.cookie = 0ULL; + } + rc2 = ll_close_inode_openhandle(inode, och, 0, NULL); + if (rc2 < 0) + CERROR("%s: error closing file "DFID": %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&ll_i2info(inode)->lli_fid), rc2); + och = NULL; /* och has been freed in ll_close_inode_openhandle() */ +out_release_it: + ll_intent_release(&it); +out: + if (och != NULL) + OBD_FREE_PTR(och); + RETURN(ERR_PTR(rc)); +} + +/** + * Check whether a layout swap can be done between two inodes. + * + * \param[in] inode1 First inode to check + * \param[in] inode2 Second inode to check + * + * \retval 0 on success, layout swap can be performed between both inodes + * \retval negative error code if requirements are not met + */ +static int ll_check_swap_layouts_validity(struct inode *inode1, + struct inode *inode2) +{ + if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) + return -EINVAL; + + if (inode_permission(inode1, MAY_WRITE) || + inode_permission(inode2, MAY_WRITE)) + return -EPERM; + + if (inode1->i_sb != inode2->i_sb) + return -EXDEV; + + return 0; +} + +static int ll_swap_layouts_close(struct obd_client_handle *och, + struct inode *inode, struct inode *inode2) +{ + const struct lu_fid *fid1 = ll_inode2fid(inode); + const struct lu_fid *fid2; + int rc; + ENTRY; + + CDEBUG(D_INODE, "%s: biased close of file "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1)); + + rc = ll_check_swap_layouts_validity(inode, inode2); + if (rc < 0) + GOTO(out_free_och, rc); + + /* We now know that inode2 is a lustre inode */ + fid2 = ll_inode2fid(inode2); + + rc = lu_fid_cmp(fid1, fid2); + if (rc == 0) + GOTO(out_free_och, rc = -EINVAL); + + /* Close the file and swap layouts between inode & inode2. + * NB: lease lock handle is released in mdc_close_layout_swap_pack() + * because we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, + inode2); + + och = NULL; /* freed in ll_close_inode_openhandle() */ + +out_free_och: + if (och != NULL) + OBD_FREE_PTR(och); + + RETURN(rc); +} + +/** + * Release lease and close the file. + * It will check if the lease has ever broken. + */ +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + struct ldlm_lock *lock; + bool cancelled = true; + int rc; + ENTRY; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + cancelled = ldlm_is_cancel(lock); + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + + CDEBUG(D_INODE, "lease for "DFID" broken? 
%d\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled); + + if (!cancelled) + ldlm_cli_cancel(&och->och_lease_handle, 0); + + if (lease_broken != NULL) + *lease_broken = cancelled; + + rc = ll_close_inode_openhandle(inode, och, 0, NULL); + RETURN(rc); +} + +int ll_merge_attr(const struct lu_env *env, struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct cl_attr *attr = vvp_env_thread_attr(env); + s64 atime; + s64 mtime; + s64 ctime; + int rc = 0; + + ENTRY; + + ll_inode_size_lock(inode); + + /* Merge timestamps the most recently obtained from MDS with + * timestamps obtained from OSTs. + * + * Do not overwrite atime of inode because it may be refreshed + * by file_accessed() function. If the read was served by cache + * data, there is no RPC to be sent so that atime may not be + * transferred to OSTs at all. MDT only updates atime at close time + * if it's at least 'mdd.*.atime_diff' older. + * All in all, the atime in Lustre does not strictly comply with + * POSIX. Solving this problem needs to send an RPC to MDT for each + * read, this will hurt performance. */ + if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) { + LTIME_S(inode->i_atime) = lli->lli_atime; + lli->lli_update_atime = 0; + } + LTIME_S(inode->i_mtime) = lli->lli_mtime; + LTIME_S(inode->i_ctime) = lli->lli_ctime; + + atime = LTIME_S(inode->i_atime); + mtime = LTIME_S(inode->i_mtime); + ctime = LTIME_S(inode->i_ctime); + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + if (rc != 0) + GOTO(out_size_unlock, rc); + + if (atime < attr->cat_atime) + atime = attr->cat_atime; + + if (ctime < attr->cat_ctime) + ctime = attr->cat_ctime; + + if (mtime < attr->cat_mtime) + mtime = attr->cat_mtime; + + CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n", + PFID(&lli->lli_fid), attr->cat_size); + + i_size_write(inode, attr->cat_size); + inode->i_blocks = attr->cat_blocks; + + LTIME_S(inode->i_atime) = atime; + LTIME_S(inode->i_mtime) = mtime; + LTIME_S(inode->i_ctime) = ctime; + +out_size_unlock: + ll_inode_size_unlock(inode); + + RETURN(rc); +} + +static bool file_is_noatime(const struct file *file) +{ + const struct vfsmount *mnt = file->f_path.mnt; + const struct inode *inode = file_inode((struct file *)file); + + /* Adapted from file_accessed() and touch_atime().*/ + if (file->f_flags & O_NOATIME) + return true; + + if (inode->i_flags & S_NOATIME) + return true; + + if (IS_NOATIME(inode)) + return true; + + if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) + return true; + + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + return true; + + return false; +} + +static int ll_file_io_ptask(struct cfs_ptask *ptask); + +static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) +{ + struct inode *inode = file_inode(file); + + memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter)); + init_sync_kiocb(&io->u.ci_rw.rw_iocb, file); + io->u.ci_rw.rw_file = file; + io->u.ci_rw.rw_ptask = ll_file_io_ptask; + io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK); + if (iot == CIT_WRITE) { + io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND); + io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode)); + } + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = 
CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } + io->ci_noatime = file_is_noatime(file); + if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO) + io->ci_pio = !io->u.ci_rw.rw_append; + else + io->ci_pio = 0; +} + +static int ll_file_io_ptask(struct cfs_ptask *ptask) +{ + struct cl_io_pt *pt = ptask->pt_cbdata; + struct file *file = pt->cip_file; + struct lu_env *env; + struct cl_io *io; + loff_t pos = pt->cip_pos; + int rc; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", + file_dentry(file)->d_name.name, + pt->cip_iot == CIT_READ ? "read" : "write", + pos, pos + pt->cip_count); + +restart: + io = vvp_env_thread_io(env); + ll_io_init(io, file, pt->cip_iot); + io->u.ci_rw.rw_iter = pt->cip_iter; + io->u.ci_rw.rw_iocb = pt->cip_iocb; + io->ci_pio = 0; /* It's already in parallel task */ + + rc = cl_io_rw_init(env, io, pt->cip_iot, pos, + pt->cip_count - pt->cip_result); + if (!rc) { + struct vvp_io *vio = vvp_env_io(env); + + vio->vui_io_subtype = IO_NORMAL; + vio->vui_fd = LUSTRE_FPRIVATE(file); + + ll_cl_add(file, env, io, LCC_RW); + rc = cl_io_loop(env, io); + ll_cl_remove(file, env); + } else { + /* cl_io_rw_init() handled IO */ + rc = io->ci_result; + } + + if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) { + if (io->ci_nob > 0) + io->ci_nob /= 2; + rc = -EIO; + } + + if (io->ci_nob > 0) { + pt->cip_result += io->ci_nob; + iov_iter_advance(&pt->cip_iter, io->ci_nob); + pos += io->ci_nob; + pt->cip_iocb.ki_pos = pos; +#ifdef HAVE_KIOCB_KI_LEFT + pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result; +#elif defined(HAVE_KI_NBYTES) + pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result; +#endif + } + + cl_io_fini(env, io); + + if ((rc == 0 || rc == -ENODATA) && + pt->cip_result < pt->cip_count && + io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + pt->cip_iot == CIT_READ ? "read" : "write", + pos, pos + pt->cip_count - pt->cip_result, + pt->cip_result, rc); + goto restart; + } + + CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + pt->cip_iot == CIT_READ ? "read" : "write", + pt->cip_result, rc); + + cl_env_put(env, &refcheck); + RETURN(pt->cip_result > 0 ? 0 : rc); +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t count) +{ + struct range_lock range; + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct cl_io *io; + loff_t pos = *ppos; + ssize_t result = 0; + int rc = 0; + + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? 
"read" : "write", pos, pos + count); + +restart: + io = vvp_env_thread_io(env); + ll_io_init(io, file, iot); + if (args->via_io_subtype == IO_NORMAL) { + io->u.ci_rw.rw_iter = *args->u.normal.via_iter; + io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb; + } else { + io->ci_pio = 0; + } + + if (cl_io_rw_init(env, io, iot, pos, count) == 0) { + bool range_locked = false; + + if (file->f_flags & O_APPEND) + range_lock_init(&range, 0, LUSTRE_EOF); + else + range_lock_init(&range, pos, pos + count - 1); + + vio->vui_fd = LUSTRE_FPRIVATE(file); + vio->vui_io_subtype = args->via_io_subtype; + + switch (vio->vui_io_subtype) { + case IO_NORMAL: + /* Direct IO reads must also take range lock, + * or multiple reads will try to work on the same pages + * See LU-6227 for details. */ + if (((iot == CIT_WRITE) || + (iot == CIT_READ && (file->f_flags & O_DIRECT))) && + !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n", + RL_PARA(&range)); + rc = range_lock(&lli->lli_write_tree, &range); + if (rc < 0) + GOTO(out, rc); + + range_locked = true; + } + break; + case IO_SPLICE: + vio->u.splice.vui_pipe = args->u.splice.via_pipe; + vio->u.splice.vui_flags = args->u.splice.via_flags; + break; + default: + CERROR("unknown IO subtype %u\n", vio->vui_io_subtype); + LBUG(); + } + + ll_cl_add(file, env, io, LCC_RW); + if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) && + !lli->lli_inode_locked) { + inode_lock(inode); + lli->lli_inode_locked = 1; + } + rc = cl_io_loop(env, io); + if (lli->lli_inode_locked) { + lli->lli_inode_locked = 0; + inode_unlock(inode); + } + ll_cl_remove(file, env); + + if (range_locked) { + CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n", + RL_PARA(&range)); + range_unlock(&lli->lli_write_tree, &range); + } + } else { + /* cl_io_rw_init() handled IO */ + rc = io->ci_result; + } + + if (io->ci_nob > 0) { + result += io->ci_nob; + count -= io->ci_nob; + + if (args->via_io_subtype == IO_NORMAL) { + iov_iter_advance(args->u.normal.via_iter, io->ci_nob); + + /* CLIO is too complicated. See LU-11069. */ + if (cl_io_is_append(io)) + pos = io->u.ci_rw.rw_iocb.ki_pos; + else + pos += io->ci_nob; + + args->u.normal.via_iocb->ki_pos = pos; +#ifdef HAVE_KIOCB_KI_LEFT + args->u.normal.via_iocb->ki_left = count; +#elif defined(HAVE_KI_NBYTES) + args->u.normal.via_iocb->ki_nbytes = count; +#endif + } else { + /* for splice */ + pos = io->u.ci_rw.rw_range.cir_pos; + } + } +out: + cl_io_fini(env, io); + + if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + pos, pos + count, result, rc); + goto restart; + } + + if (iot == CIT_READ) { + if (result > 0) + ll_stats_ops_tally(ll_i2sbi(inode), + LPROC_LL_READ_BYTES, result); + } else if (iot == CIT_WRITE) { + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), + LPROC_LL_WRITE_BYTES, result); + fd->fd_write_failed = false; + } else if (result == 0 && rc == 0) { + rc = io->ci_result; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } else if (rc != -ERESTARTSYS) { + fd->fd_write_failed = true; + } + } + + CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc); + + *ppos = pos; + + RETURN(result > 0 ? 
result : rc); +} + +/** + * The purpose of fast read is to overcome per I/O overhead and improve IOPS + * especially for small I/O. + * + * To serve a read request, CLIO has to create and initialize a cl_io and + * then request DLM lock. This has turned out to have siginificant overhead + * and affects the performance of small I/O dramatically. + * + * It's not necessary to create a cl_io for each I/O. Under the help of read + * ahead, most of the pages being read are already in memory cache and we can + * read those pages directly because if the pages exist, the corresponding DLM + * lock must exist so that page content must be valid. + * + * In fast read implementation, the llite speculatively finds and reads pages + * in memory cache. There are three scenarios for fast read: + * - If the page exists and is uptodate, kernel VM will provide the data and + * CLIO won't be intervened; + * - If the page was brought into memory by read ahead, it will be exported + * and read ahead parameters will be updated; + * - Otherwise the page is not in memory, we can't do fast read. Therefore, + * it will go back and invoke normal read, i.e., a cl_io will be created + * and DLM lock will be requested. + * + * POSIX compliance: posix standard states that read is intended to be atomic. + * Lustre read implementation is in line with Linux kernel read implementation + * and neither of them complies with POSIX standard in this matter. Fast read + * doesn't make the situation worse on single node but it may interleave write + * results from multiple nodes due to short read handling in ll_file_aio_read(). + * + * \param env - lu_env + * \param iocb - kiocb from kernel + * \param iter - user space buffers where the data will be copied + * + * \retval - number of bytes have been read, or error code if error occurred. + */ +static ssize_t +ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb, + struct iov_iter *iter) +{ + ssize_t result; + + if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp)))) + return 0; + + /* NB: we can't do direct IO for fast read because it will need a lock + * to make IO engine happy. */ + if (iocb->ki_filp->f_flags & O_DIRECT) + return 0; + + ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW); + result = generic_file_read_iter(iocb, iter); + ll_cl_remove(iocb->ki_filp, env); + + /* If the first page is not in cache, generic_file_aio_read() will be + * returned with -ENODATA. + * See corresponding code in ll_readpage(). */ + if (result == -ENODATA) + result = 0; + + if (result > 0) + ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)), + LPROC_LL_READ_BYTES, result); + + return result; +} + +/* + * Read from a file (through the page cache). + */ +static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + ssize_t rc2; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + result = ll_do_fast_read(env, iocb, to); + if (result < 0 || iov_iter_count(to) == 0) + GOTO(out, result); + + args = ll_env_args(env, IO_NORMAL); + args->u.normal.via_iter = to; + args->u.normal.via_iocb = iocb; + + rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + &iocb->ki_pos, iov_iter_count(to)); + if (rc2 > 0) + result += rc2; + else if (result == 0) + result = rc2; + +out: + cl_env_put(env, &refcheck); + return result; +} + +/* + * Write to a file (through the page cache). 
+ */ +static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct vvp_io_args *args; + struct lu_env *env; + ssize_t result; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + args = ll_env_args(env, IO_NORMAL); + args->u.normal.via_iter = from; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + cl_env_put(env, &refcheck); + return result; +} + +#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +/* + * XXX: exact copy from kernel code (__generic_file_aio_write_nolock) + */ +static int ll_file_get_iov_count(const struct iovec *iov, + unsigned long *nr_segs, size_t *count) +{ + size_t cnt = 0; + unsigned long seg; + + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} + +static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter to; + size_t iov_count; + ssize_t result; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &iov_count); + if (result) + RETURN(result); + +# ifdef HAVE_IOV_ITER_INIT_DIRECTION + iov_iter_init(&to, READ, iov, nr_segs, iov_count); +# else /* !HAVE_IOV_ITER_INIT_DIRECTION */ + iov_iter_init(&to, iov, nr_segs, iov_count, 0); +# endif /* HAVE_IOV_ITER_INIT_DIRECTION */ + + result = ll_file_read_iter(iocb, &to); + + RETURN(result); +} + +static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct lu_env *env; + struct iovec iov = { .iov_base = buf, .iov_len = count }; + struct kiocb *kiocb; + ssize_t result; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + kiocb = &ll_env_info(env)->lti_kiocb; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; +#ifdef HAVE_KIOCB_KI_LEFT + kiocb->ki_left = count; +#elif defined(HAVE_KI_NBYTES) + kiocb->ki_nbytes = count; +#endif + + result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; + + cl_env_put(env, &refcheck); + RETURN(result); +} + +/* + * Write to a file (through the page cache). 
+ * AIO stuff + */ +static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter from; + size_t iov_count; + ssize_t result; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &iov_count); + if (result) + RETURN(result); + +# ifdef HAVE_IOV_ITER_INIT_DIRECTION + iov_iter_init(&from, WRITE, iov, nr_segs, iov_count); +# else /* !HAVE_IOV_ITER_INIT_DIRECTION */ + iov_iter_init(&from, iov, nr_segs, iov_count, 0); +# endif /* HAVE_IOV_ITER_INIT_DIRECTION */ + + result = ll_file_write_iter(iocb, &from); + + RETURN(result); +} + +static ssize_t ll_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lu_env *env; + struct iovec iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + struct kiocb *kiocb; + ssize_t result; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + kiocb = &ll_env_info(env)->lti_kiocb; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; +#ifdef HAVE_KIOCB_KI_LEFT + kiocb->ki_left = count; +#elif defined(HAVE_KI_NBYTES) + kiocb->ki_nbytes = count; +#endif + + result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; + + cl_env_put(env, &refcheck); + RETURN(result); +} +#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + +/* + * Send file content (through pagecache) somewhere with helper + */ +static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = ll_env_args(env, IO_SPLICE); + args->u.splice.via_pipe = pipe; + args->u.splice.via_flags = flags; + + result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + RETURN(result); +} + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, int lum_size) +{ + struct lookup_intent oit = { + .it_op = IT_OPEN, + .it_flags = flags | MDS_OPEN_BY_FID, + }; + int rc; + ENTRY; + + ll_inode_size_lock(inode); + rc = ll_intent_file_open(dentry, lum, lum_size, &oit); + if (rc < 0) + GOTO(out_unlock, rc); + + ll_release_openhandle(dentry, &oit); + +out_unlock: + ll_inode_size_unlock(inode); + ll_intent_release(&oit); + + RETURN(rc); +} + +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed " + "on %s: rc %d\n", filename, rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->mbo_eadatasize; + + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | 
OBD_MD_FLDIREA)) || + lmmsize == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) && + lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1)) + GOTO(out, rc = -EPROTO); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + int stripe_count; + + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_pattern) & + LOV_PATTERN_F_RELEASED) + stripe_count = 0; + } + + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { + lustre_swab_lov_user_md_v1( + (struct lov_user_md_v1 *)lmm); + if (S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lmm); + if (S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == + cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmm); + } + } + +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + void __user *arg) +{ + __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + ENTRY; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(lump, arg, lum_size)) + GOTO(out_lump, rc = -EFAULT); + + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump, + lum_size); + cl_lov_delay_create_clear(&file->f_flags); + +out_lump: + OBD_FREE_LARGE(lump, lum_size); + RETURN(rc); +} + +static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size) +{ + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + void __user *arg) +{ + struct lov_user_md __user *lum = (struct lov_user_md __user *)arg; + struct lov_user_md *klum; + int lum_size, rc; + __u64 flags = FMODE_WRITE; + ENTRY; + + rc = ll_copy_user_md(lum, &klum); + if (rc < 0) + RETURN(rc); + + lum_size = rc; + rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum, + lum_size); + if (!rc) { + __u32 gen; + + rc = put_user(0, &lum->lmm_stripe_count); + if (rc) + GOTO(out, rc); + + rc = ll_layout_refresh(inode, &gen); + if (rc) + GOTO(out, rc); + + rc = ll_file_getstripe(inode, arg, lum_size); + } + cl_lov_delay_create_clear(&file->f_flags); + +out: + OBD_FREE(klum, lum_size); + RETURN(rc); +} + +static int +ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = 
lli->lli_clob; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_grouplock grouplock; + int rc; + ENTRY; + + if (arg == 0) { + CWARN("group id for group lock must not be 0\n"); + RETURN(-EINVAL); + } + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + fd->fd_grouplock.lg_gid); + spin_unlock(&lli->lli_lock); + RETURN(-EINVAL); + } + LASSERT(fd->fd_grouplock.lg_lock == NULL); + spin_unlock(&lli->lli_lock); + + /** + * XXX: group lock needs to protect all OST objects while PFL + * can add new OST objects during the IO, so we'd instantiate + * all OST objects before getting its group lock. + */ + if (obj) { + struct lu_env *env; + __u16 refcheck; + struct cl_layout cl = { + .cl_is_composite = false, + }; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (!rc && cl.cl_is_composite) + rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF); + + cl_env_put(env, &refcheck); + if (rc) + RETURN(rc); + } + + rc = cl_get_grouplock(ll_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + if (rc) + RETURN(rc); + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + spin_unlock(&lli->lli_lock); + CERROR("another thread just won the race\n"); + cl_put_grouplock(&grouplock); + RETURN(-EINVAL); + } + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + spin_unlock(&lli->lli_lock); + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); + RETURN(0); +} + +static int ll_put_grouplock(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_grouplock grouplock; + ENTRY; + + spin_lock(&lli->lli_lock); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + spin_unlock(&lli->lli_lock); + CWARN("no group lock held\n"); + RETURN(-EINVAL); + } + + LASSERT(fd->fd_grouplock.lg_lock != NULL); + + if (fd->fd_grouplock.lg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.lg_gid); + spin_unlock(&lli->lli_lock); + RETURN(-EINVAL); + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + spin_unlock(&lli->lli_lock); + + cl_put_grouplock(&grouplock); + CDEBUG(D_INFO, "group lock %lu released\n", arg); + RETURN(0); +} + +/** + * Close inode open handle + * + * \param dentry [in] dentry which contains the inode + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) +{ + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (dentry->d_inode->i_sb->s_root == dentry) + RETURN(0); + + /* No open handle to close? 
Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); + + ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + + rc = ll_close_inode_openhandle(inode, och, 0, NULL); +out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->it_request); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + RETURN(rc); +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. + * \param fiemap kernel buffer to hold extens + * \param num_bytes kernel buffer size + */ +static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap, + size_t num_bytes) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, }; + ENTRY; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (i_size_read(inode) == 0) { + rc = ll_glimpse_size(inode); + if (rc) + GOTO(out, rc); + } + + fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid); + + /* If filesize is 0, then there would be no objects for mapping */ + if (fmkey.lfik_oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + GOTO(out, rc = 0); + } + + fmkey.lfik_fiemap = *fiemap; + + rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob, + &fmkey, fiemap, &num_bytes); +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +int ll_fid2path(struct inode *inode, void __user *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + const struct getinfo_fid2path __user *gfin = arg; + __u32 pathlen; + struct getinfo_fid2path *gfout; + size_t outsize; + int rc; + + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + + /* Only need to get the buflen */ + if (get_user(pathlen, &gfin->gf_pathlen)) + RETURN(-EFAULT); + + if (pathlen > PATH_MAX) + RETURN(-EINVAL); + + outsize = sizeof(*gfout) + pathlen; + OBD_ALLOC(gfout, outsize); + if (gfout == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(gfout, arg, sizeof(*gfout))) + GOTO(gf_free, rc = -EFAULT); + /* append root FID after gfout to let MDT know the root FID so that it + * can lookup the correct path, this is mainly for fileset. + * old server without fileset mount support will ignore this. */ + *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc != 0) + GOTO(gf_free, rc); + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + RETURN(rc); +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. 
+ * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int result; + + ENTRY; + + /* If no file object initialized, we consider its version is 0. */ + if (obj == NULL) { + *data_version = 0; + RETURN(0); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_flags = flags; + +restart: + if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + + *data_version = io->u.ci_data_version.dv_data_version; + + cl_io_fini(env, io); + + if (unlikely(io->ci_need_restart)) + goto restart; + + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * Trigger a HSM release request for the provided inode. + */ +int ll_hsm_release(struct inode *inode) +{ + struct lu_env *env; + struct obd_client_handle *och = NULL; + __u64 data_version = 0; + int rc; + __u16 refcheck; + ENTRY; + + CDEBUG(D_INODE, "%s: Releasing file "DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&ll_i2info(inode)->lli_fid)); + + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + if (IS_ERR(och)) + GOTO(out, rc = PTR_ERR(och)); + + /* Grab latest data_version and [am]time values */ + rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH); + if (rc != 0) + GOTO(out, rc); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); + + ll_merge_attr(env, inode); + cl_env_put(env, &refcheck); + + /* Release the file. + * NB: lease lock handle is released in mdc_hsm_release_pack() because + * we still need it to pack l_remote_handle to MDT. */ + rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, + &data_version); + och = NULL; + + EXIT; +out: + if (och != NULL && !IS_ERR(och)) /* close the file */ + ll_lease_close(och, inode, NULL); + + return rc; +} + +struct ll_swap_stack { + __u64 dv1; + __u64 dv2; + struct inode *inode1; + struct inode *inode2; + bool check_dv1; + bool check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + OBD_ALLOC_PTR(llss); + if (llss == NULL) + RETURN(-ENOMEM); + + llss->inode1 = file_inode(file1); + llss->inode2 = file_inode(file2); + + rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2); + if (rc < 0) + GOTO(free, rc); + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! 
*/ + GOTO(free, rc); + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + GOTO(free, rc); + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + GOTO(free, rc); + } + } + + /* ultimate check, before swaping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv1) + GOTO(putgl, rc = -EAGAIN); + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv2) + GOTO(putgl, rc = -EAGAIN); + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (IS_ERR(op_data)) + GOTO(free, rc = PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + + if (rc < 0) + GOTO(putgl, rc); + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + RETURN(rc); +} + +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) +{ + struct md_op_data *op_data; + int rc; + ENTRY; + + /* Detect out-of range masks */ + if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK) + RETURN(-EINVAL); + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. 
*/ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && + !cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE)) + RETURN(-EINVAL); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +static int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) +{ + struct hsm_state_set *hss = NULL; + struct iattr *attr = NULL; + int rc; + ENTRY; + + if (!S_ISREG(inode->i_mode)) + RETURN(-EINVAL); + + /* set HSM flags */ + OBD_ALLOC_PTR(hss); + if (hss == NULL) + GOTO(out, rc = -ENOMEM); + + hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; + hss->hss_archive_id = hui->hui_archive_id; + hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; + rc = ll_hsm_state_set(inode, hss); + if (rc != 0) + GOTO(out, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out, rc = -ENOMEM); + + attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + attr->ia_mode |= S_IFREG; + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); + attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); + attr->ia_size = hui->hui_size; + attr->ia_mtime.tv_sec = hui->hui_mtime; + attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; + attr->ia_atime.tv_sec = hui->hui_atime; + attr->ia_atime.tv_nsec = hui->hui_atime_ns; + + attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | + ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_ATIME | ATTR_ATIME_SET; + + inode_lock(inode); + + rc = ll_setattr_raw(file_dentry(file), attr, true); + if (rc == -ENODATA) + rc = 0; + + inode_unlock(inode); + +out: + if (hss != NULL) + OBD_FREE_PTR(hss); + + if (attr != NULL) + OBD_FREE_PTR(attr); + + RETURN(rc); +} + +static inline long ll_lease_type_from_fmode(fmode_t fmode) +{ + return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | + ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0); +} + +static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) +{ + struct inode *inode = file_inode(file); + struct iattr ia = { + .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET, + .ia_atime = { + .tv_sec = lfu->lfu_atime_sec, + .tv_nsec = lfu->lfu_atime_nsec, + }, + .ia_mtime = { + .tv_sec = lfu->lfu_mtime_sec, + .tv_nsec = lfu->lfu_mtime_nsec, + }, + .ia_ctime = { + .tv_sec = lfu->lfu_ctime_sec, + .tv_nsec = lfu->lfu_ctime_nsec, + }, + }; + int rc; + ENTRY; + + if (!capable(CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (!S_ISREG(inode->i_mode)) + RETURN(-EINVAL); + + inode_lock(inode); + rc = ll_setattr_raw(file_dentry(file), &ia, false); + inode_unlock(inode); + + RETURN(rc); +} + +/* + * Give file access advices + * + * The ladvise interface is similar to Linux fadvise() system call, except it + * forwards the advices directly from Lustre client to server. The server side + * codes will apply appropriate read-ahead and caching techniques for the + * corresponding files. + * + * A typical workload for ladvise is e.g. a bunch of different clients are + * doing small random reads of a file, so prefetching pages into OSS cache + * with big linear reads before the random IO is a net benefit. 
Fetching + * all that data into each client cache with fadvise() may not be, due to + * much more data being sent to the client. + */ +static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, + struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_ladvise_io *lio; + int rc; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + + /* initialize parameters for ladvise */ + lio = &io->u.ci_ladvise; + lio->li_start = ladvise->lla_start; + lio->li_end = ladvise->lla_end; + lio->li_fid = ll_inode2fid(inode); + lio->li_advice = ladvise->lla_advice; + lio->li_flags = flags; + + if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0) + rc = cl_io_loop(env, io); + else + rc = io->ci_result; + + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + struct fsxattr fsxattr; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags); + fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; + if (copy_to_user((struct fsxattr __user *)arg, + &fsxattr, sizeof(fsxattr))) + RETURN(-EFAULT); + + RETURN(0); +} + +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct fsxattr fsxattr; + struct cl_object *obj; + + /* only root could change project ID */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + GOTO(out_fsxattr1, rc = -EFAULT); + + op_data->op_attr_flags = fsxattr.fsx_xflags; + op_data->op_projid = fsxattr.fsx_projid; + op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG); + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, + 0, &req); + ptlrpc_req_finished(req); + + obj = ll_i2info(inode)->lli_clob; + if (obj) { + struct iattr *attr; + + inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags); + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out_fsxattr1, rc = -ENOMEM); + attr->ia_valid = ATTR_ATTR_FLAG; + rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags); + + OBD_FREE_PTR(attr); + } +out_fsxattr1: + ll_finish_md_op_data(op_data); + RETURN(rc); + + +} + +static long +ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int flags, rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", + PFID(ll_inode2fid(inode)), inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); + + switch(cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int __user *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. 
+ */ + if (get_user(flags, (int __user *) arg)) + RETURN(-EFAULT); + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on " + "non-O_DIRECT file\n", current->comm); + RETURN(-EINVAL); + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + RETURN(0); + case LL_IOC_LOV_SETSTRIPE: + case LL_IOC_LOV_SETSTRIPE_NEW: + RETURN(ll_lov_setstripe(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SETEA: + RETURN(ll_lov_setea(inode, file, (void __user *)arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char __user *)arg, + sizeof(struct lustre_swap_layouts))) + RETURN(-EFAULT); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + RETURN(-EPERM); + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + RETURN(-EBADF); + + /* O_WRONLY or O_RDWR */ + if ((file2->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) { + struct inode *inode2; + struct ll_inode_info *lli; + struct obd_client_handle *och = NULL; + + if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) + GOTO(out, rc = -EINVAL); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och == NULL) + GOTO(out, rc = -ENOLCK); + inode2 = file_inode(file2); + rc = ll_swap_layouts_close(och, inode, inode2); + } else { + rc = ll_swap_layouts(file, file2, &lsl); + } +out: + fput(file2); + RETURN(rc); + } + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_LOV_GETSTRIPE_NEW: + RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int __user *)arg)); + case LL_IOC_GROUP_LOCK: + RETURN(ll_get_grouplock(inode, file, arg)); + case LL_IOC_GROUP_UNLOCK: + RETURN(ll_put_grouplock(inode, file, arg)); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void __user *)arg)); + + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_PATH2FID: { + if (copy_to_user((void __user *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_GETPARENT: + RETURN(ll_getparent(file, (struct getparent __user *)arg)); + + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void __user *)arg)); + case LL_IOC_DATA_VERSION: { + struct ioc_data_version idv; + int rc; + + if (copy_from_user(&idv, (char __user *)arg, sizeof(idv))) + RETURN(-EFAULT); + + idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; + rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags); + + if (rc == 0 && + copy_to_user((char __user *)arg, &idv, sizeof(idv))) + RETURN(-EFAULT); + + RETURN(rc); + } + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int __user *)arg)) + RETURN(-EFAULT); + + RETURN(0); + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + case LL_IOC_SET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + bool lease_broken; + fmode_t fmode; + + switch (arg) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + RETURN(-ENOLCK); + + fmode = och->och_flags; + rc = ll_lease_close(och, inode, &lease_broken); + if (rc < 0) + RETURN(rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + RETURN(rc); + + if (lease_broken) + fmode = 0; + + RETURN(ll_lease_type_from_fmode(fmode)); + default: + RETURN(-EINVAL); + } + + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + + /* apply for 
lease */ + och = ll_lease_open(inode, file, fmode, 0); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + RETURN(rc); + } + case LL_IOC_GET_LEASE: { + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_lock *lock = NULL; + fmode_t fmode = 0; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + struct obd_client_handle *och = fd->fd_lease_och; + + lock = ldlm_handle2lock(&och->och_lease_handle); + if (lock != NULL) { + lock_res_and_lock(lock); + if (!ldlm_is_cancel(lock)) + fmode = och->och_flags; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + } + mutex_unlock(&lli->lli_och_mutex); + + RETURN(ll_lease_type_from_fmode(fmode)); + } + case LL_IOC_HSM_IMPORT: { + struct hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + OBD_FREE_PTR(hui); + RETURN(rc); + } + case LL_IOC_FUTIMES_3: { + struct ll_futimes_3 lfu; + + if (copy_from_user(&lfu, + (const struct ll_futimes_3 __user *)arg, + sizeof(lfu))) + RETURN(-EFAULT); + + RETURN(ll_file_futimes_3(file, &lfu)); + } + case LL_IOC_LADVISE: { + struct llapi_ladvise_hdr *ladvise_hdr; + int i; + int num_advise; + int alloc_size = sizeof(*ladvise_hdr); + + rc = 0; + OBD_ALLOC_PTR(ladvise_hdr); + if (ladvise_hdr == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(ladvise_hdr, + (const struct llapi_ladvise_hdr __user *)arg, + alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + if (ladvise_hdr->lah_magic != LADVISE_MAGIC || + ladvise_hdr->lah_count < 1) + GOTO(out_ladvise, rc = -EINVAL); + + num_advise = ladvise_hdr->lah_count; + if (num_advise >= LAH_COUNT_MAX) + GOTO(out_ladvise, rc = -EFBIG); + + OBD_FREE_PTR(ladvise_hdr); + alloc_size = offsetof(typeof(*ladvise_hdr), + lah_advise[num_advise]); + OBD_ALLOC(ladvise_hdr, alloc_size); + if (ladvise_hdr == NULL) + RETURN(-ENOMEM); + + /* + * TODO: submit multiple advices to one server in a single RPC + */ + if (copy_from_user(ladvise_hdr, + (const struct llapi_ladvise_hdr __user *)arg, + alloc_size)) + GOTO(out_ladvise, rc = -EFAULT); + + for (i = 0; i < num_advise; i++) { + rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags, + &ladvise_hdr->lah_advise[i]); + if (rc) + break; + } + +out_ladvise: + OBD_FREE(ladvise_hdr, alloc_size); + RETURN(rc); + } + case LL_IOC_FSGETXATTR: + RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); + case LL_IOC_FSSETXATTR: + RETURN(ll_ioctl_fssetxattr(inode, cmd, arg)); + case BLKSSZGET: + RETURN(put_user(PAGE_SIZE, (int __user *)arg)); + default: + RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void __user *)arg)); + } +} + +#ifndef HAVE_FILE_LLSEEK_SIZE +static inline loff_t +llseek_execute(struct file *file, loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} + +static loff_t +generic_file_llseek_size(struct file *file, loff_t offset, int origin, + loff_t maxsize, loff_t eof) +{ + struct inode *inode = file_inode(file); + + switch (origin) { + case 
SEEK_END: + offset += eof; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + inode_lock(inode); + offset = llseek_execute(file, file->f_pos + offset, maxsize); + inode_unlock(inode); + return offset; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as long as + * offset isn't at the end of the file then the offset is data. + */ + if (offset >= eof) + return -ENXIO; + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so as long as + * offset isn't i_size or larger, return i_size. + */ + if (offset >= eof) + return -ENXIO; + offset = eof; + break; + } + + return llseek_execute(file, offset, maxsize); +} +#endif + +static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file_inode(file); + loff_t retval, eof = 0; + + ENTRY; + retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : + (origin == SEEK_CUR) ? file->f_pos : 0); + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n", + PFID(ll_inode2fid(inode)), inode, retval, retval, + origin); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); + + if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { + retval = ll_glimpse_size(inode); + if (retval != 0) + RETURN(retval); + eof = i_size_read(inode); + } + + retval = ll_generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + RETURN(retval); +} + +static int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? -EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. + * + * Return how many pages have been written. 
+ */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_fsync_io *fio; + int result; + __u16 refcheck; + ENTRY; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + RETURN(-EINVAL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_ignore_layout = ignore_layout; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + RETURN(result); +} + +/* + * When dentry is provided (the 'else' case), file_dentry() may be + * null and dentry must be used directly rather than pulled from + * file_dentry() as is done otherwise. + */ + +#ifdef HAVE_FILE_FSYNC_4ARGS +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file_dentry(file); + bool lock_inode; +#elif defined(HAVE_FILE_FSYNC_2ARGS) +int ll_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file_dentry(file); + loff_t start = 0; + loff_t end = LLONG_MAX; +#else +int ll_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + loff_t start = 0; + loff_t end = LLONG_MAX; +#endif + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + int rc, err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); + +#ifdef HAVE_FILE_FSYNC_4ARGS + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + lock_inode = !lli->lli_inode_locked; + if (lock_inode) + inode_lock(inode); +#else + /* fsync's caller has already called _fdata{sync,write}, we want + * that IO to finish before calling the osc and mdc sync methods */ + rc = filemap_fdatawait(inode->i_mapping); +#endif + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. 
*/ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + if (lli->lli_clob != NULL) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + } + + err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); + if (rc == 0 && err < 0) + rc = err; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } + +#ifdef HAVE_FILE_FSYNC_4ARGS + if (lock_inode) + inode_unlock(inode); +#endif + RETURN(rc); +} + +static int +ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct inode *inode = file_inode(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_FLOCK, + .ei_cb_cp = ldlm_flock_completion_ast, + .ei_cbdata = file_lock, + }; + struct md_op_data *op_data; + struct lustre_handle lockh = { 0 }; + union ldlm_policy_data flock = { { 0 } }; + int fl_type = file_lock->fl_type; + __u64 flags = 0; + int rc; + int rc2 = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n", + PFID(ll_inode2fid(inode)), file_lock); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); + + if (file_lock->fl_flags & FL_FLOCK) { + LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); + /* flocks are whole-file locks */ + flock.l_flock.end = OFFSET_MAX; + /* For flocks owner is determined by the local file desctiptor*/ + flock.l_flock.owner = (unsigned long)file_lock->fl_file; + } else if (file_lock->fl_flags & FL_POSIX) { + flock.l_flock.owner = (unsigned long)file_lock->fl_owner; + flock.l_flock.start = file_lock->fl_start; + flock.l_flock.end = file_lock->fl_end; + } else { + RETURN(-EINVAL); + } + flock.l_flock.pid = file_lock->fl_pid; + + /* Somewhat ugly workaround for svc lockd. + * lockd installs custom fl_lmops->lm_compare_owner that checks + * for the fl_owner to be the same (which it always is on local node + * I guess between lockd processes) and then compares pid. + * As such we assign pid to the owner field to make it all work, + * conflict with normal locks is unlikely since pid space and + * pointer space for current->files are not intersecting */ + if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) + flock.l_flock.owner = (unsigned long)file_lock->fl_pid; + + switch (fl_type) { + case F_RDLCK: + einfo.ei_mode = LCK_PR; + break; + case F_UNLCK: + /* An unlock request may or may not have any relation to + * existing locks so we may not be able to pass a lock handle + * via a normal ldlm_lock_cancel() request. The request may even + * unlock a byte range in the middle of an existing lock. In + * order to process an unlock request we need all of the same + * information that is given with a normal read or write record + * lock request. To avoid creating another ldlm unlock (cancel) + * message we'll treat a LCK_NL flock request as an unlock. 
*/ + einfo.ei_mode = LCK_NL; + break; + case F_WRLCK: + einfo.ei_mode = LCK_PW; + break; + default: + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type); + RETURN (-ENOTSUPP); + } + + switch (cmd) { + case F_SETLKW: +#ifdef F_SETLKW64 + case F_SETLKW64: +#endif + flags = 0; + break; + case F_SETLK: +#ifdef F_SETLK64 + case F_SETLK64: +#endif + flags = LDLM_FL_BLOCK_NOWAIT; + break; + case F_GETLK: +#ifdef F_GETLK64 + case F_GETLK64: +#endif + flags = LDLM_FL_TEST_LOCK; + break; + default: + CERROR("unknown fcntl lock command: %d\n", cmd); + RETURN (-EINVAL); + } + + /* Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. */ + file_lock->fl_type = einfo.ei_mode; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, " + "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)), + flock.l_flock.pid, flags, einfo.ei_mode, + flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh, + flags); + + /* Restore the file lock type if not TEST lock. */ + if (!(flags & LDLM_FL_TEST_LOCK)) + file_lock->fl_type = fl_type; + +#ifdef HAVE_LOCKS_LOCK_FILE_WAIT + if ((rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = locks_lock_file_wait(file, file_lock); +#else + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); +#endif /* HAVE_LOCKS_LOCK_FILE_WAIT */ + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, + &lockh, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, + struct inode **inode) +{ + struct md_op_data *op_data = NULL; + struct mdt_body *body; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE; + rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out_req, rc = -EFAULT); + if (fid != NULL) + *fid = body->mbo_fid1; + + if (inode != NULL) + rc = ll_prep_inode(inode, req, parent->i_sb, NULL); +out_req: + ptlrpc_req_finished(req); + RETURN(rc); +} + +int ll_migrate(struct inode *parent, struct file *file, int mdtidx, + const char *name, int namelen) +{ + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; + struct ptlrpc_request *request = NULL; + struct obd_client_handle *och = NULL; + struct qstr qstr; + struct mdt_body *body; + int rc; + __u64 data_version = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n", + name, PFID(ll_inode2fid(parent)), mdtidx); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + /* Get child FID first */ + 
qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); + qstr.name = name; + qstr.len = namelen; + dchild = d_lookup(file_dentry(file), &qstr); + if (dchild != NULL) { + if (dchild->d_inode != NULL) + child_inode = igrab(dchild->d_inode); + dput(dchild); + } + + if (child_inode == NULL) { + rc = ll_get_fid_by_name(parent, name, namelen, + &op_data->op_fid3, &child_inode); + if (rc != 0) + GOTO(out_free, rc); + } + + if (child_inode == NULL) + GOTO(out_free, rc = -EINVAL); + + /* + * lfs migrate command needs to be blocked on the client + * by checking the migrate FID against the FID of the + * filesystem root. + */ + if (child_inode == parent->i_sb->s_root->d_inode) + GOTO(out_iput, rc = -EINVAL); + + inode_lock(child_inode); + op_data->op_fid3 = *ll_inode2fid(child_inode); + if (!fid_is_sane(&op_data->op_fid3)) { + CERROR("%s: migrate %s, but FID "DFID" is insane\n", + ll_get_fsname(parent->i_sb, NULL, 0), name, + PFID(&op_data->op_fid3)); + GOTO(out_unlock, rc = -EINVAL); + } + + rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); + if (rc < 0) + GOTO(out_unlock, rc); + + if (rc == mdtidx) { + CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name, + PFID(&op_data->op_fid3), mdtidx); + GOTO(out_unlock, rc = 0); + } +again: + if (S_ISREG(child_inode->i_mode)) { + och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); + if (IS_ERR(och)) { + rc = PTR_ERR(och); + och = NULL; + GOTO(out_unlock, rc); + } + + rc = ll_data_version(child_inode, &data_version, + LL_DV_WR_FLUSH); + if (rc != 0) + GOTO(out_close, rc); + + op_data->op_handle = och->och_fh; + op_data->op_data = och->och_mod; + op_data->op_data_version = data_version; + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_bias |= MDS_RENAME_MIGRATE; + } + + op_data->op_mds = mdtidx; + op_data->op_cli_flags = CLI_MIGRATE; + rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, + namelen, name, namelen, &request); + if (rc == 0) { + LASSERT(request != NULL); + ll_update_times(request, parent); + + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + /* If the server does release layout lock, then we cleanup + * the client och here, otherwise release it in out_close: */ + if (och != NULL && + body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + obd_mod_put(och->och_mod); + md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, + och); + och->och_fh.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + och = NULL; + } + } + + if (request != NULL) { + ptlrpc_req_finished(request); + request = NULL; + } + + /* Try again if the file layout has changed. 
*/ + if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) + goto again; + +out_close: + if (och != NULL) /* close the file */ + ll_lease_close(och, child_inode, NULL); + if (rc == 0) + clear_nlink(child_inode); +out_unlock: + inode_unlock(child_inode); +out_iput: + iput(child_inode); +out_free: + ll_finish_md_op_data(op_data); + RETURN(rc); +} + +static int +ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + ENTRY; + + RETURN(-ENOSYS); +} + +/** + * test if some locks matching bits and l_req_mode are acquired + * - bits can be in different locks + * - if found clear the common lock bits in *bits + * - the bits not found, are kept in *bits + * \param inode [IN] + * \param bits [IN] searched lock bits [IN] + * \param l_req_mode [IN] searched lock mode + * \retval boolean, true iff all bits are found + */ +int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode) +{ + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ? + (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode; + struct lu_fid *fid; + __u64 flags; + int i; + ENTRY; + + if (!inode) + RETURN(0); + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid), + ldlm_lockname[mode]); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; + for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { + policy.l_inodebits.bits = *bits & (1 << i); + if (policy.l_inodebits.bits == 0) + continue; + + if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, + &policy, mode, &lockh)) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&lockh); + if (lock) { + *bits &= + ~(lock->l_policy_data.l_inodebits.bits); + LDLM_LOCK_PUT(lock); + } else { + *bits &= ~policy.l_inodebits.bits; + } + } + } + RETURN(*bits == 0); +} + +enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode) +{ + union ldlm_policy_data policy = { .l_inodebits = { bits } }; + struct lu_fid *fid; + enum ldlm_mode rc; + ENTRY; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); + + rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags, + fid, LDLM_IBITS, &policy, mode, lockh); + + RETURN(rc); +} + +static int ll_inode_revalidate_fini(struct inode *inode, int rc) +{ + /* Already unlinked. Just update nlink and return success */ + if (rc == -ENOENT) { + clear_nlink(inode); + /* If it is striped directory, and there is bad stripe + * Let's revalidate the dentry again, instead of returning + * error */ + if (S_ISDIR(inode->i_mode) && + ll_i2info(inode)->lli_lsm_md != NULL) + return 0; + + /* This path cannot be hit for regular files unless in + * case of obscure races, so no need to to validate + * size. */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return 0; + } else if (rc != 0) { + CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? 
D_INFO : D_ERROR, + "%s: revalidate FID "DFID" error: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *req = NULL; + struct obd_export *exp; + int rc = 0; + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", + PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); + + exp = ll_i2mdexp(inode); + + /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. + * But under CMD case, it caused some lock issues, should be fixed + * with new CMD ibits lock. See bug 12718 */ + if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { + struct lookup_intent oit = { .it_op = IT_GETATTR }; + struct md_op_data *op_data; + + if (ibits == MDS_INODELOCK_LOOKUP) + oit.it_op = IT_LOOKUP; + + /* Call getattr by fid, so do not provide name at all. */ + op_data = ll_prep_md_op_data(NULL, dentry->d_inode, + dentry->d_inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_intent_lock(exp, op_data, &oit, &req, + &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO (out, rc); + } + + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + do_lookup() -> ll_revalidate_it(). We cannot use d_drop + here to preserve get_cwd functionality on 2.6. + Bug 10503 */ + if (!dentry->d_inode->i_nlink) { + ll_lock_dcache(inode); + d_lustre_invalidate(dentry, 0); + ll_unlock_dcache(inode); + } + + ll_lookup_finish_locks(&oit, dentry); + } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) { + struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + u64 valid = OBD_MD_FLGETATTR; + struct md_op_data *op_data; + int ealen = 0; + + if (S_ISREG(inode->i_mode)) { + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + RETURN(rc); + valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, ealen, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = valid; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + rc = ll_inode_revalidate_fini(inode, rc); + RETURN(rc); + } + + rc = ll_prep_inode(&inode, req, NULL, NULL); + } +out: + ptlrpc_req_finished(req); + return rc; +} + +static int ll_merge_md_attr(struct inode *inode) +{ + struct cl_attr attr = { 0 }; + int rc; + + LASSERT(ll_i2info(inode)->lli_lsm_md != NULL); + rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, + &attr, ll_md_blocking_ast); + if (rc != 0) + RETURN(rc); + + set_nlink(inode, attr.cat_nlink); + inode->i_blocks = attr.cat_blocks; + i_size_write(inode, attr.cat_size); + + ll_i2info(inode)->lli_atime = attr.cat_atime; + ll_i2info(inode)->lli_mtime = attr.cat_mtime; + ll_i2info(inode)->lli_ctime = attr.cat_ctime; + + RETURN(0); +} + +static int +ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = dentry->d_inode; + int rc; + ENTRY; + + rc = __ll_inode_revalidate(dentry, ibits); + if (rc != 0) + RETURN(rc); + + /* if object isn't regular file, don't validate size */ + if (!S_ISREG(inode->i_mode)) { + if (S_ISDIR(inode->i_mode) && + ll_i2info(inode)->lli_lsm_md != NULL) { + rc = 
ll_merge_md_attr(inode); + if (rc != 0) + RETURN(rc); + } + + LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime; + LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime; + LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime; + } else { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING)) + rc = ll_glimpse_size(inode); + } + RETURN(rc); +} + +static inline dev_t ll_compat_encode_dev(dev_t dev) +{ + /* The compat_sys_*stat*() syscalls will fail unless the + * device majors and minors are both less than 256. Note that + * the value returned here will be passed through + * old_encode_dev() in cp_compat_stat(). And so we are not + * trying to return a valid compat (u16) device number, just + * one that will pass the old_valid_dev() check. */ + + return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff); +} + +#ifdef HAVE_INODEOPS_ENHANCED_GETATTR +int ll_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) + +{ + struct dentry *de = path->dentry; +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ +#endif + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int res = 0; + + res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP); + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + + if (res) + return res; + + OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); + + if (ll_need_32bit_api(sbi)) { + stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); + stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev); + stat->rdev = ll_compat_encode_dev(inode->i_rdev); + } else { + stat->ino = inode->i_ino; + stat->dev = inode->i_sb->s_dev; + stat->rdev = inode->i_rdev; + } + + stat->mode = inode->i_mode; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits; + + stat->nlink = inode->i_nlink; + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + + return 0; +} + +static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int rc; + size_t num_bytes; + struct fiemap *fiemap; + unsigned int extent_count = fieinfo->fi_extents_max; + + num_bytes = sizeof(*fiemap) + (extent_count * + sizeof(struct fiemap_extent)); + OBD_ALLOC_LARGE(fiemap, num_bytes); + + if (fiemap == NULL) + RETURN(-ENOMEM); + + fiemap->fm_flags = fieinfo->fi_flags; + fiemap->fm_extent_count = fieinfo->fi_extents_max; + fiemap->fm_start = start; + fiemap->fm_length = len; + if (extent_count > 0 && + copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start, + sizeof(struct fiemap_extent)) != 0) + GOTO(out, rc = -EFAULT); + + rc = ll_do_fiemap(inode, fiemap, num_bytes); + + fieinfo->fi_flags = fiemap->fm_flags; + fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents; + if (extent_count > 0 && + copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0], + fiemap->fm_mapped_extents * + sizeof(struct fiemap_extent)) != 0) + GOTO(out, rc = -EFAULT); +out: + OBD_FREE_LARGE(fiemap, num_bytes); + return 
rc; +} + +struct posix_acl *ll_get_acl(struct inode *inode, int type) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + ENTRY; + + spin_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + RETURN(acl); +} + +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_FS_POSIX_ACL +int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *name = NULL; + char *value = NULL; + size_t value_size = 0; + int rc; + ENTRY; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) { + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) + GOTO(out, rc); + } + + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + GOTO(out, rc = acl ? -EACCES : 0); + + break; + default: + GOTO(out, rc = -EINVAL); + } + + if (acl) { + value_size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(value_size, GFP_NOFS); + if (value == NULL) + GOTO(out, rc = -ENOMEM); + + rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size); + if (rc < 0) + GOTO(out_value, rc); + } + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), + value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, + name, value, value_size, 0, 0, 0, &req); + + ptlrpc_req_finished(req); +out_value: + kfree(value); +out: + if (!rc) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); + RETURN(rc); +} +#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* HAVE_IOP_SET_ACL */ + +#ifndef HAVE_GENERIC_PERMISSION_2ARGS +static int +# ifdef HAVE_GENERIC_PERMISSION_4ARGS +ll_check_acl(struct inode *inode, int mask, unsigned int flags) +# else +ll_check_acl(struct inode *inode, int mask) +# endif +{ +# ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *acl; + int rc; + ENTRY; + +# ifdef HAVE_GENERIC_PERMISSION_4ARGS + if (flags & IPERM_FLAG_RCU) + return -ECHILD; +# endif + acl = ll_get_acl(inode, ACL_TYPE_ACCESS); + + if (!acl) + RETURN(-EAGAIN); + + rc = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + + RETURN(rc); +# else /* !CONFIG_FS_POSIX_ACL */ + return -EAGAIN; +# endif /* CONFIG_FS_POSIX_ACL */ +} +#endif /* HAVE_GENERIC_PERMISSION_2ARGS */ + +#ifdef HAVE_GENERIC_PERMISSION_4ARGS +int ll_inode_permission(struct inode *inode, int mask, unsigned int flags) +#else +# ifdef HAVE_INODE_PERMISION_2ARGS +int ll_inode_permission(struct inode *inode, int mask) +# else +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) +# endif +#endif +{ + int rc = 0; + struct ll_sb_info *sbi; + struct root_squash_info *squash; + struct cred *cred = NULL; + const struct cred *old_cred = NULL; + cfs_cap_t cap; + bool squash_id = false; + ENTRY; + +#ifdef MAY_NOT_BLOCK + if (mask & MAY_NOT_BLOCK) + return -ECHILD; +#elif defined(HAVE_GENERIC_PERMISSION_4ARGS) + if (flags & IPERM_FLAG_RCU) + return -ECHILD; +#endif + + /* as root inode are NOT getting validated in lookup operation, + * need to do it before permission check. 
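+ * (hence the __ll_inode_revalidate() call on s_root just below).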
*/ + + if (inode == inode->i_sb->s_root->d_inode) { + rc = __ll_inode_revalidate(inode->i_sb->s_root, + MDS_INODELOCK_LOOKUP); + if (rc) + RETURN(rc); + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n", + PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); + + /* squash fsuid/fsgid if needed */ + sbi = ll_i2sbi(inode); + squash = &sbi->ll_squash; + if (unlikely(squash->rsi_uid != 0 && + uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && + !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) { + squash_id = true; + } + if (squash_id) { + CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", + __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), + squash->rsi_uid, squash->rsi_gid); + + /* update current process's credentials + * and FS capability */ + cred = prepare_creds(); + if (cred == NULL) + RETURN(-ENOMEM); + + cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); + cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); + for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) { + if ((1 << cap) & CFS_CAP_FS_MASK) + cap_lower(cred->cap_effective, cap); + } + old_cred = override_creds(cred); + } + + ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1); + rc = ll_generic_permission(inode, mask, flags, ll_check_acl); + /* restore current process's credentials and FS capability */ + if (squash_id) { + revert_creds(old_cred); + put_cred(cred); + } + + RETURN(rc); +} + +/* -o localflock - only provides locally consistent flock locks */ +struct file_operations ll_file_operations = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush +}; + +struct file_operations ll_file_operations_flock = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +struct file_operations ll_file_operations_noflock = { +#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER +# ifdef HAVE_SYNC_READ_WRITE + .read = new_sync_read, + .write = new_sync_write, +# endif /* HAVE_SYNC_READ_WRITE */ + .read_iter = ll_file_read_iter, + .write_iter = ll_file_write_iter, +#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .read = ll_file_read, + .aio_read = ll_file_aio_read, + .write = ll_file_write, + .aio_write = ll_file_aio_write, +#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ + .unlocked_ioctl = ll_file_ioctl, + .open 
= ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_noflock, + .lock = ll_file_noflock +}; + +struct inode_operations ll_file_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, + .fiemap = ll_fiemap, +#ifdef HAVE_IOP_GET_ACL + .get_acl = ll_get_acl, +#endif +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_env *env; + int rc; + __u16 refcheck; + ENTRY; + + if (obj == NULL) + RETURN(0); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_conf_set(env, lli->lli_clob, conf); + if (rc < 0) + GOTO(out, rc); + + if (conf->coc_opc == OBJECT_CONF_SET) { + struct ldlm_lock *lock = conf->coc_lock; + struct cl_layout cl = { + .cl_layout_gen = 0, + }; + + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + /* it can only be allowed to match after layout is + * applied to inode otherwise false layout would be + * seen. Applying layout shoud happen before dropping + * the intent lock. */ + ldlm_lock_allow_match(lock); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out, rc); + + CDEBUG(D_VFSTRACE, + DFID": layout version change: %u -> %u\n", + PFID(&lli->lli_fid), ll_layout_version_get(lli), + cl.cl_layout_gen); + ll_layout_version_set(lli, cl.cl_layout_gen); + } + +out: + cl_env_put(env, &refcheck); + + RETURN(rc); +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req; + struct mdt_body *body; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + ENTRY; + + CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", + PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock), + lock->l_lvb_data, lock->l_lvb_len); + + if (lock->l_lvb_data != NULL) + RETURN(0); + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. 
Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc == 0) + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), + OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, + lmmsize, 0, &req); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lmmsize = body->mbo_eadatasize; + if (lmmsize == 0) /* empty layout */ + GOTO(out, rc = 0); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) + GOTO(out, rc = -EFAULT); + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (unlikely(lock->l_lvb_data == NULL)) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + lvbdata = NULL; + } + unlock_res_and_lock(lock); + + if (lvbdata) + OBD_FREE_LARGE(lvbdata, lmmsize); + + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + ENTRY; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured", + PFID(&lli->lli_fid), inode); + + /* in case this is a caching lock and reinstate with new inode */ + md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL); + + lock_res_and_lock(lock); + lvb_ready = ldlm_is_lvb_ready(lock); + unlock_res_and_lock(lock); + + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready) + GOTO(out, rc = 0); + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + GOTO(out, rc); + + /* for layout lock, lmm is stored in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. + * + * set layout to file. Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_layout.lb_buf = lock->l_lvb_data; + conf.u.coc_layout.lb_len = lock->l_lvb_len; + rc = ll_layout_conf(inode, &conf); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + EXIT; +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), inode); + + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), rc); + } + RETURN(rc); +} + +/** + * Issue layout intent RPC to MDS. 
+ * \param inode [in] file inode + * \param intent [in] layout intent + * + * \retval 0 on success + * \retval < 0 error code + */ +static int ll_layout_intent(struct inode *inode, struct layout_intent *intent) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct ptlrpc_request *req; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_data = intent; + op_data->op_data_size = sizeof(*intent); + + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + if (intent->li_opc == LAYOUT_INTENT_WRITE || + intent->li_opc == LAYOUT_INTENT_TRUNC) + it.it_flags = FMODE_WRITE; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), inode); + + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_ast, 0); + if (it.it_request != NULL) + ptlrpc_req_finished(it.it_request); + it.it_request = NULL; + + ll_finish_md_op_data(op_data); + + /* set lock data in case this is a new lock */ + if (!rc) + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + + ll_intent_drop_lock(&it); + + RETURN(rc); +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lustre_handle lockh; + struct layout_intent intent = { + .li_opc = LAYOUT_INTENT_ACCESS, + }; + enum ldlm_mode mode; + int rc; + ENTRY; + + *gen = ll_layout_version_get(lli); + if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE) + RETURN(0); + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + LASSERT(S_ISREG(inode->i_mode)); + + /* take layout lock mutex to enqueue layout lock exclusively. */ + mutex_lock(&lli->lli_layout_mutex); + + while (1) { + /* mostly layout lock is caching on the local side, so try to + * match it before grabbing layout lock mutex. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, + LCK_CR | LCK_CW | LCK_PR | LCK_PW); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode); + if (rc == -EAGAIN) + continue; + break; + } + + rc = ll_layout_intent(inode, &intent); + if (rc != 0) + break; + } + + if (rc == 0) + *gen = ll_layout_version_get(lli); + mutex_unlock(&lli->lli_layout_mutex); + + RETURN(rc); +} + +/** + * Issue layout intent RPC indicating where in a file an IO is about to write. + * + * \param[in] inode file inode. + * \param[in] start start offset of fille in bytes where an IO is about to + * write. + * \param[in] end exclusive end offset in bytes of the write range. 
+ * + * \retval 0 on success + * \retval < 0 error code + */ +int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end) +{ + struct layout_intent intent = { + .li_opc = LAYOUT_INTENT_WRITE, + .li_start = start, + .li_end = end, + }; + int rc; + ENTRY; + + rc = ll_layout_intent(inode, &intent); + + RETURN(rc); +} + +/** + * This function send a restore request to the MDT + */ +int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length) +{ + struct hsm_user_request *hur; + int len, rc; + ENTRY; + + len = sizeof(struct hsm_user_request) + + sizeof(struct hsm_user_item); + OBD_ALLOC(hur, len); + if (hur == NULL) + RETURN(-ENOMEM); + + hur->hur_request.hr_action = HUA_RESTORE; + hur->hur_request.hr_archive_id = 0; + hur->hur_request.hr_flags = 0; + memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, + sizeof(hur->hur_user_item[0].hui_fid)); + hur->hur_user_item[0].hui_extent.offset = offset; + hur->hur_user_item[0].hui_extent.length = length; + hur->hur_request.hr_itemcount = 1; + rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp, + len, hur, NULL); + OBD_FREE(hur, len); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c new file mode 100644 index 0000000000000..d34be28747bdd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -0,0 +1,208 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * glimpse code shared between vvp and liblustre (and other Lustre clients in + * the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cl_object.h" +#include "llite_internal.h" +#include "vvp_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwritten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct vvp_object *vob = cl_inode2vvp(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 
1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct cl_lock *lock = vvp_env_lock(env); + struct cl_lock_descr *descr = &lock->cll_descr; + int result; + + ENTRY; + result = 0; + + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_ASYNC flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. */ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_READ; + descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_AGL; + /* + * CEF_ASYNC is used because glimpse sub-locks cannot + * deadlock (because they never conflict with other + * locks) and, hence, can be enqueued out-of-order. + * + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + result = cl_lock_request(env, io, lock); + if (result < 0) + RETURN(result); + + if (!agl) { + ll_merge_attr(env, inode); + if (i_size_read(inode) > 0 && inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + } + + cl_lock_release(env, lock); + + RETURN(result); +} + +static int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(inode->i_mode)) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = vvp_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = +1; + } else + result = PTR_ERR(env); + } else + result = 0; + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + __u16 refcheck; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result > 0) { + again: + io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. 
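+ * cl_io_init() returns a positive value in that case and the
+ * final status is taken from io->ci_result.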
+ */ + result = io->ci_result; + else if (result == 0) + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + cl_env_put(env, &refcheck); + } + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c new file mode 100644 index 0000000000000..feaf1769b6e87 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -0,0 +1,280 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +/** + * An `emergency' environment used by cl_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by cl_inode_fini_guard + * mutex. + */ +struct lu_env *cl_inode_fini_env; +__u16 cl_inode_fini_refcheck; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. 
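+ * cl_inode_fini() below takes this mutex around its use of
+ * cl_inode_fini_env when cl_env_get() fails.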
+ */ +static DEFINE_MUTEX(cl_inode_fini_guard); + +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + unsigned int attr_flags) +{ + struct lu_env *env; + struct cl_io *io; + int result; + __u16 refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_verify_layout = 1; + + io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime); + io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime); + io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_attr_flags = attr_flags; + io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); + +again: + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct vvp_io *vio = vvp_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* populate the file descriptor for ftruncate to honor + * group lock - see LU-787 */ + vio->vui_fd = LUSTRE_FPRIVATE(attr->ia_file); + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + + cl_env_put(env, &refcheck); + RETURN(result); +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct ll_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_layout = md->layout, + } + }; + int result = 0; + __u16 refcheck; + + LASSERT(md->body->mbo_valid & OBD_MD_FLID); + LASSERT(S_ISREG(inode->i_mode)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = ll_i2sbi(inode)->ll_site; + lli = ll_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. */ + LASSERT(inode->i_state & I_NEW); + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. + */ + lli->lli_clob = clob; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else + result = PTR_ERR(clob); + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + } + + cl_env_put(env, &refcheck); + + if (result != 0) + CERROR("Failure to initialize cl object "DFID": %d\n", + PFID(fid), result); + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. 
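+ * The wait parks on the site bucket's lsb_marche_funebre waitqueue
+ * until loh_ref drops to one, after which the final reference is
+ * put here.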
+ */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + wait_queue_entry_t waiter; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + struct lu_site_bkt_data *bkt; + + bkt = lu_site_bkt_from_fid(site, &header->loh_fid); + + init_waitqueue_entry(&waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, &waiter); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&header->loh_ref) == 1) + break; + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *clob = lli->lli_clob; + __u16 refcheck; + int emergency; + + if (clob != NULL) { + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&cl_inode_fini_guard); + LASSERT(cl_inode_fini_env != NULL); + env = cl_inode_fini_env; + } + + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) + mutex_unlock(&cl_inode_fini_guard); + else + cl_env_put(env, &refcheck); + } +} + +/** + * build inode number from passed @fid. + * + * For 32-bit systems or syscalls limit the inode number to a 32-bit value + * to avoid EOVERFLOW errors. This will inevitably result in inode number + * collisions, but fid_flatten32() tries hard to avoid this if possible. + */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + RETURN(fid_flatten32(fid)); + + RETURN(fid_flatten(fid)); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. + */ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + if (fid_is_igif(fid)) + RETURN(lu_igif_gen(fid)); + + RETURN(fid_flatten(fid) >> 32); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c new file mode 100644 index 0000000000000..ced348a36b42a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -0,0 +1,185 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). + * + */ +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include + +#include "llite_internal.h" + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */ +static int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + u32 val_size; + u32 max_easize; + u32 def_easize; + int rc; + ENTRY; + + val_size = sizeof(max_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_MAX_EASIZE), KEY_MAX_EASIZE, + &val_size, &max_easize); + if (rc != 0) + RETURN(rc); + + val_size = sizeof(def_easize); + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &val_size, &def_easize); + if (rc != 0) + RETURN(rc); + + /* default cookiesize is 0 because from 2.4 server doesn't send + * llog cookies to client. */ + CDEBUG(D_HA, "updating def/max_easize: %d/%d\n", + def_easize, max_easize); + + rc = md_init_ea_size(md_exp, max_easize, def_easize); + RETURN(rc); +} + +/** + * This function is used as an upcall-callback hooked llite clients + * into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See lustre_common_fill_super(). + */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + __u64 flags; + int result; + + ENTRY; + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) && + watched->obd_set_up && !watched->obd_stopping) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s" + "(setup:%d,stopping:%d)!\n", + watched->obd_type->typ_name, + watched->obd_name, watched->obd_set_up, + watched->obd_stopping); + result = -EINVAL; + } + RETURN(result); +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + __u32 enqflags; + __u16 refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = vvp_env_thread_io(env); + io->ci_obj = obj; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc != 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + /* Does not make sense to take GL for released layout */ + if (rc > 0) + rc = -ENOTSUPP; + return rc; + } + + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? 
CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + rc = cl_lock_request(env, io, lock); + if (rc < 0) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return rc; + } + + lg->lg_env = env; + lg->lg_io = io; + lg->lg_lock = lock; + lg->lg_gid = gid; + + return 0; +} + +void cl_put_grouplock(struct ll_grouplock *lg) +{ + struct lu_env *env = lg->lg_env; + struct cl_io *io = lg->lg_io; + struct cl_lock *lock = lg->lg_lock; + + LASSERT(lg->lg_env != NULL); + LASSERT(lg->lg_gid != 0); + + cl_lock_release(env, lock); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h new file mode 100644 index 0000000000000..4acb7cdcf2aff --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -0,0 +1,1440 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include +#include +#include /* for s2sbi */ +#include +#include + +/* for struct cl_lock_descr and struct cl_io */ +#include +#include +#include +#include +#include +#include + +#include +#include "vvp_internal.h" +#include "range_lock.h" + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS 22 + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") +#define LUSTRE_FPRIVATE(file) ((file)->private_data) + +struct ll_dentry_data { + struct lookup_intent *lld_it; + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + unsigned int lld_nfs_dentry:1; + struct rcu_head lld_rcu_head; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +struct ll_getname_data { +#ifdef HAVE_DIR_CONTEXT + struct dir_context ctx; +#endif + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? 
*/ +}; + +struct ll_grouplock { + struct lu_env *lg_env; + struct cl_io *lg_io; + struct cl_lock *lg_lock; + unsigned long lg_gid; +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + spinlock_t lli_lock; + + volatile unsigned long lli_flags; + struct posix_acl *lli_posix_acl; + + /* identifying fields for both metadata and data stacks. */ + struct lu_fid lli_fid; + /* master inode fid for stripe directory */ + struct lu_fid lli_pfid; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + s64 lli_atime; + s64 lli_mtime; + s64 lli_ctime; + spinlock_t lli_agl_lock; + + /* update atime from MDS no matter if it's older than + * local inode atime. */ + unsigned int lli_update_atime:1, + lli_inode_locked:1; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* serialize normal readdir and statahead-readdir. */ + struct mutex lli_readdir_mutex; + + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *lli_opendir_key; + struct ll_statahead_info *lli_sai; + /* protect statahead stuff. */ + spinlock_t lli_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t lli_opendir_pid; + /* stat will try to access statahead entries or start + * statahead if this flag is set, and this flag will be + * set upon dir open, and cleared when dir is closed, + * statahead hit ratio is too low, or start statahead + * thread failed. */ + unsigned int lli_sa_enabled:1; + /* generation for statahead */ + unsigned int lli_sa_generation; + /* directory stripe information */ + struct lmv_stripe_md *lli_lsm_md; + /* default directory stripe offset. This is extracted + * from the "dmv" xattr in order to decide which MDT to + * create a subdirectory on. The MDS itself fetches + * "dmv" and gets the rest of the default layout itself + * (count, hash, etc). */ + __u32 lli_def_stripe_offset; + }; + + /* for non-directory */ + struct { + struct mutex lli_size_mutex; + char *lli_symlink_name; + /* + * struct rw_semaphore { + * signed long count; // align d.d_def_acl + * spinlock_t wait_lock; // align d.d_sa_lock + * struct list_head wait_list; + * } + */ + struct rw_semaphore lli_trunc_sem; + struct range_lock_tree lli_write_tree; + + struct rw_semaphore lli_glimpse_sem; + cfs_time_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; + + /* for writepage() only to communicate to fsync */ + int lli_async_rc; + + /* + * whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * so the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. 
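+ * lli_jobid below therefore only records the jobid of the most
+ * recent accessor.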
+ */ + char lli_jobid[LUSTRE_JOBID_SIZE]; + }; + }; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. */ + struct mutex lli_layout_mutex; + /* Layout version, protected by lli_layout_lock */ + __u32 lli_layout_gen; + spinlock_t lli_layout_lock; + + __u32 lli_projid; /* project id */ + + struct rw_semaphore lli_xattrs_list_rwsem; + struct mutex lli_xattrs_enq_lock; + struct list_head lli_xattrs; /* ll_xattr_entry->xe_list */ +}; + +static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) +{ + __u32 gen; + + spin_lock(&lli->lli_layout_lock); + gen = lli->lli_layout_gen; + spin_unlock(&lli->lli_layout_lock); + + return gen; +} + +static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) +{ + spin_lock(&lli->lli_layout_lock); + lli->lli_layout_gen = gen; + spin_unlock(&lli->lli_layout_lock); +} + +enum ll_file_flags { + /* File data is modified. */ + LLIF_DATA_MODIFIED = 0, + /* File is being restored */ + LLIF_FILE_RESTORING = 1, + /* Xattr cache is attached to the file */ + LLIF_XATTR_CACHE = 2, +}; + +static inline void ll_file_set_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + set_bit(flag, &lli->lli_flags); +} + +static inline void ll_file_clear_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + clear_bit(flag, &lli->lli_flags); +} + +static inline bool ll_file_test_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + return test_bit(flag, &lli->lli_flags); +} + +static inline bool ll_file_test_and_clear_flag(struct ll_inode_info *lli, + enum ll_file_flags flag) +{ + return test_and_clear_bit(flag, &lli->lli_flags); +} + +int ll_xattr_cache_destroy(struct inode *inode); + +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid); + +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, void **secctx, + __u32 *secctx_size); +int ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir); + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. + */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +// FIXME: replace the name of this with LL_I to conform to kernel stuff +// static inline struct ll_inode_info *LL_I(struct inode *inode) +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +/* default to about 64M of readahead on a given system. 
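+ * (64UL << (20 - PAGE_SHIFT) pages, i.e. 16384 pages with a 4K
+ * page size).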
*/ +#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT)) + +/* default to read-ahead full files smaller than 2MB on the second read */ +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT)) + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + RA_STAT_FAILED_REACH_END, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_max_read_ahead_whole_pages; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + unsigned long ria_start; /* start offset of read-ahead*/ + unsigned long ria_end; /* end offset of read-ahead*/ + unsigned long ria_reserved; /* reserved pages for read-ahead */ + unsigned long ria_end_min; /* minimum end to cover current read */ + bool ria_eof; /* reach end of file */ + /* If stride read pattern is detected, ria_stoff means where + * stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + pgoff_t ria_stoff; + /* ria_length and ria_pages are the length and pages length in the + * stride I/O mode. And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + unsigned long ria_length; + unsigned long ria_pages; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ +#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ +#define LL_SBI_FLOCK 0x04 +#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ +#define LL_SBI_ACL 0x10 /* support ACL */ +/* LL_SBI_RMT_CLIENT 0x40 remote client */ +#define LL_SBI_MDS_CAPA 0x80 /* support mds capa, obsolete */ +#define LL_SBI_OSS_CAPA 0x100 /* support oss capa, obsolete */ +#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ +#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ +#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ +/* LL_SBI_SOM_PREVIEW 0x1000 SOM preview mount option, obsolete */ +#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. 
*/ +#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ +#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ +#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ +#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ +#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ +#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ +#define LL_SBI_NOROOTSQUASH 0x100000 /* do not apply root squash */ +#define LL_SBI_ALWAYS_PING 0x200000 /* always ping even if server + * suppress_pings */ +#define LL_SBI_FAST_READ 0x400000 /* fast read support */ +#define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ +#define LL_SBI_PIO 0x1000000 /* parallel IO support */ + +#define LL_SBI_FLAGS { \ + "nolck", \ + "checksum", \ + "flock", \ + "user_xattr", \ + "acl", \ + "???", \ + "???", \ + "mds_capa", \ + "oss_capa", \ + "flock", \ + "lru_resize", \ + "lazy_statfs", \ + "som", \ + "32bit_api", \ + "64bit_hash", \ + "agl", \ + "verbose", \ + "layout", \ + "user_fid2path",\ + "xattr_cache", \ + "norootsquash", \ + "always_ping", \ + "fast_read", \ + "file_secctx", \ + "pio", \ +} + +/* This is embedded into llite super-blocks to keep track of connect + * flags (capabilities) supported by all imports given mount is + * connected to. */ +struct lustre_client_ocd { + /* This is conjunction of connect_flags across all imports + * (LOVs) this mount is connected to. This field is updated by + * cl_ocd_update() under ->lco_lock. */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +struct ll_sb_info { + /* this protects pglist and ra_info. It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct proc_dir_entry* ll_proc_root; + struct lu_fid ll_root_fid; /* root object fid */ + + int ll_flags; + unsigned int ll_umounting:1, + ll_xattr_cache_enabled:1, + ll_xattr_cache_set:1, /* already set to 0/1 */ + ll_client_common_fill_super_succeeded:1; + + struct lustre_client_ocd ll_lco; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + /* Used to track "unstable" pages on a client, and maintain a + * LRU list of clean pages. An "unstable" page is defined as + * any page which is sent to a server as part of a bulk request, + * but is uncommitted to stable storage. 
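The LL_SBI_* bits above are mirrored, one string per bit position, by the LL_SBI_FLAGS name table (the "???" entries keep the retired bit slots aligned). A minimal userspace sketch of decoding such a flags word against the table, using hypothetical EX_* names and only a handful of the real bits, looks like this:

/* Illustration only, not part of the patch: decode a flags word via a
 * parallel name table, as ll_flags / LL_SBI_FLAGS are laid out above. */
#include <stdio.h>

#define EX_SBI_CHECKSUM   0x02   /* mirrors LL_SBI_CHECKSUM */
#define EX_SBI_FLOCK      0x04   /* mirrors LL_SBI_FLOCK */
#define EX_SBI_USER_XATTR 0x08   /* mirrors LL_SBI_USER_XATTR */
#define EX_SBI_ACL        0x10   /* mirrors LL_SBI_ACL */

static const char *ex_sbi_names[] = {
        "nolck", "checksum", "flock", "user_xattr", "acl",
};

int main(void)
{
        int flags = EX_SBI_CHECKSUM | EX_SBI_ACL;   /* sample mount flags */
        int i;

        /* bit i of the flags word corresponds to ex_sbi_names[i] */
        for (i = 0; i < (int)(sizeof(ex_sbi_names) / sizeof(ex_sbi_names[0])); i++)
                if (flags & (1 << i))
                        printf("option set: %s\n", ex_sbi_names[i]);
        return 0;
}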
*/ + struct cl_client_cache *ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + struct file_operations *ll_fop; + + struct lu_site *ll_site; + struct cl_device *ll_cl; + /* Statistics */ + struct ll_rw_extents_info ll_rw_extents_info; + int ll_extent_process_count; + struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; + unsigned int ll_offset_process_count; + struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_running_max;/* max concurrent + * statahead instances */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_sa_running; /* running statahead thread + * count */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustred nfs */ + /* root squash */ + struct root_squash_info ll_squash; + struct path ll_mnt; + + /* st_blksize returned by stat(2), when non-zero */ + unsigned int ll_stat_blksize; +}; + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* + * index of the last page that read(2) needed and that wasn't in the + * cache. Used by ras_update() to detect seeks. + * + * XXX nikita: if access seeks into cached region, Lustre doesn't see + * this. + */ + unsigned long ras_last_readpage; + /* + * number of pages read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + unsigned long ras_consecutive_pages; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. + */ + unsigned long ras_window_start, ras_window_len; + /* + * Optimal RPC size. It decides how many pages will be sent + * for each read-ahead. + */ + unsigned long ras_rpc_size; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + unsigned long ras_next_readahead; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. 
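The read-ahead limits referenced above are counted in pages rather than bytes. Purely as an illustration (a standalone calculation assuming 4 KiB pages, i.e. PAGE_SHIFT == 12, and not part of the patch itself), the two defaults from this header work out as follows:

/* Illustration only: what SBI_DEFAULT_READAHEAD_MAX and
 * SBI_DEFAULT_READAHEAD_WHOLE_MAX evaluate to with 4 KiB pages. */
#include <stdio.h>

#define EX_PAGE_SHIFT          12UL                            /* 4 KiB pages */
#define EX_READAHEAD_MAX       (64UL << (20 - EX_PAGE_SHIFT))  /* 64 MiB in pages */
#define EX_READAHEAD_WHOLE_MAX (2UL  << (20 - EX_PAGE_SHIFT))  /*  2 MiB in pages */

int main(void)
{
        printf("per-file read-ahead limit:    %lu pages (%lu MiB)\n",
               EX_READAHEAD_MAX, EX_READAHEAD_MAX >> (20 - EX_PAGE_SHIFT));
        printf("whole-file read-ahead cutoff: %lu pages (%lu MiB)\n",
               EX_READAHEAD_WHOLE_MAX,
               EX_READAHEAD_WHOLE_MAX >> (20 - EX_PAGE_SHIFT));
        return 0;
}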
+ */ + unsigned long ras_requests; + /* + * Page index with respect to the current request, these value + * will not be accurate when dealing with reads issued via mmap. + */ + unsigned long ras_request_index; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_pages + stride_gap; + * ras_stride_pages = stride_pages; + * Note: all these three items are counted by pages. + */ + unsigned long ras_stride_length; + unsigned long ras_stride_pages; + pgoff_t ras_stride_offset; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + struct ll_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + fmode_t fd_omode; + /* openhandle if lease exists for this file. + * Borrow lli->lli_och_mutex to protect assignment */ + struct obd_client_handle *fd_lease_och; + struct obd_client_handle *fd_och; + struct file *fd_file; + /* Indicate whether need to report failure when close. + * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; + rwlock_t fd_lock; /* protect lcc list */ + struct list_head fd_lccs; /* list of ll_cl_context */ +}; + +extern struct proc_dir_entry *proc_lustre_fs_root; + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#elif defined(CONFIG_COMPAT) + if (unlikely(sbi->ll_flags & LL_SBI_32BIT_API)) + return true; + +# ifdef CONFIG_X86_X32 + /* in_compat_syscall() returns true when called from a kthread + * and CONFIG_X86_X32 is enabled, which is wrong. So check + * whether the caller comes from a syscall (ie. not a kthread) + * before calling in_compat_syscall(). 
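The ras_stride_* fields described above record a periodic access pattern: a run of data pages followed by a gap, repeating from ras_stride_offset with period ras_stride_length. A minimal userspace sketch of that bookkeeping, with hypothetical ex_* names and made-up numbers rather than the kernel's ras_* helpers, is:

/* Illustration only: a page index falls in the "data" part of the stride
 * pattern when its offset within one period is below the data length. */
#include <stdio.h>
#include <stdbool.h>

struct ex_stride {
        unsigned long stride_offset; /* page where the stride pattern starts */
        unsigned long stride_length; /* data pages + gap pages (one period)  */
        unsigned long stride_pages;  /* data pages per period                */
};

static bool ex_index_in_stride_data(const struct ex_stride *st,
                                    unsigned long idx)
{
        if (idx < st->stride_offset)
                return false;
        return (idx - st->stride_offset) % st->stride_length < st->stride_pages;
}

int main(void)
{
        /* hypothetical pattern: 16 data pages followed by a 48-page gap */
        struct ex_stride st = { .stride_offset = 100,
                                .stride_length = 16 + 48,
                                .stride_pages  = 16 };
        unsigned long idx;

        for (idx = 100; idx <= 180; idx += 20)
                printf("page %lu: %s\n", idx,
                       ex_index_in_stride_data(&st, idx) ? "data" : "gap");
        return 0;
}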
*/ + if (current->flags & PF_KTHREAD) + return false; +# endif + + return unlikely(in_compat_syscall()); +#else + return unlikely(sbi->ll_flags & LL_SBI_32BIT_API); +#endif +} + +static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) +{ + return !!(sbi->ll_flags & LL_SBI_FAST_READ); +} + +void ll_ras_enter(struct file *f); + +/* llite/lcommon_misc.c */ +int cl_ocd_update(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner); +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ll_grouplock *lg); +void cl_put_grouplock(struct ll_grouplock *lg); + +/* llite/lproc_llite.c */ +#ifdef CONFIG_PROC_FS +int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb); +int lprocfs_ll_register_obd(struct super_block *sb, const char *obdname); +void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); +extern struct lprocfs_vars lprocfs_llite_obd_vars[]; +#else +static inline int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb) {return 0; } +static inline int lprocfs_ll_register_obd(struct super_block *sb, + const char *obdname) {return 0; } +static inline void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) {} +static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} +#endif + +enum { + LPROC_LL_DIRTY_HITS, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_FAULT, + LPROC_LL_MKWRITE, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + +/* llite/dir.c */ +struct ll_dir_chain { +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +extern const struct file_operations ll_dir_operations; +extern const struct inode_operations ll_dir_inode_operations; +#ifdef HAVE_DIR_CONTEXT +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + struct dir_context *ctx); +#else +int ll_dir_read(struct inode *inode, __u64 *pos, struct md_op_data *op_data, + void *cookie, filldir_t filldir); +#endif +int ll_get_mdt_idx(struct inode *inode); +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset, struct ll_dir_chain *chain); +void ll_release_page(struct inode *inode, struct page *page, bool remove); + +/* llite/namei.c */ +extern const struct inode_operations ll_special_inode_operations; + +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_test_inode_by_fid(struct inode *inode, void *opaque); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); +void 
ll_update_times(struct ptlrpc_request *request, struct inode *inode); + +/* llite/rw.c */ +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +int ll_readpage(struct file *file, struct page *page); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); + +enum lcc_type; +void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io, + enum lcc_type type); +void ll_cl_remove(struct file *file, const struct lu_env *env); +struct ll_cl_context *ll_cl_find(struct file *file); + +extern const struct address_space_operations ll_aops; + +/* llite/file.c */ +extern struct file_operations ll_file_operations; +extern struct file_operations ll_file_operations_flock; +extern struct file_operations ll_file_operations_noflock; +extern struct inode_operations ll_file_inode_operations; +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + enum ldlm_mode l_req_mode); +extern enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags, + enum ldlm_mode mode); + +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_release_openhandle(struct dentry *, struct lookup_intent *); +int ll_md_real_close(struct inode *inode, fmode_t fmode); +extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +#ifdef HAVE_INODEOPS_ENHANCED_GETATTR +int ll_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); +#else +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +#endif +struct posix_acl *ll_get_acl(struct inode *inode, int type); +#ifdef HAVE_IOP_SET_ACL +#ifdef CONFIG_FS_POSIX_ACL +int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type); +#else /* !CONFIG_FS_POSIX_ACL */ +#define ll_set_acl NULL +#endif /* CONFIG_FS_POSIX_ACL */ + +#endif +int ll_migrate(struct inode *parent, struct file *file, int mdtidx, + const char *name, int namelen); +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid, struct inode **inode); +#ifdef HAVE_GENERIC_PERMISSION_4ARGS +int ll_inode_permission(struct inode *inode, int mask, unsigned int flags); +#else +# ifndef HAVE_INODE_PERMISION_2ARGS +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd); +# else +int ll_inode_permission(struct inode *inode, int mask); +# endif +#endif +int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, + __u64 flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_getstripe(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + u64 valid); +#ifdef HAVE_FILE_FSYNC_4ARGS +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +#elif defined(HAVE_FILE_FSYNC_2ARGS) +int ll_fsync(struct file *file, int data); +#else +int ll_fsync(struct file *file, 
struct dentry *dentry, int data); +#endif +int ll_merge_attr(const struct lu_env *env, struct inode *inode); +int ll_fid2path(struct inode *inode, void __user *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int flags); +int ll_hsm_release(struct inode *inode); +int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); + +/* llite/dcache.c */ + +int ll_d_init(struct dentry *de); +extern const struct dentry_operations ll_d_ops; +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_invalidate_aliases(struct inode *); +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct dentry *de); + +/* llite/llite_lib.c */ +extern struct super_operations lustre_super_operations; + +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +void ll_dir_clear_lsm_md(struct inode *inode); +void ll_clear_inode(struct inode *inode); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); +int ll_setattr(struct dentry *de, struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags); +int ll_update_inode(struct inode *inode, struct lustre_md *md); +int ll_read_inode2(struct inode *inode, void *opaque); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +#ifdef HAVE_SUPEROPS_USE_DENTRY +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +#else +int ll_show_options(struct seq_file *seq, struct vfsmount *vfs); +#endif +void ll_dirty_page_discard_warn(struct page *page, int ioret); +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *, struct lookup_intent *); +int ll_obd_statfs(struct inode *inode, void __user *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); +int ll_process_config(struct lustre_cfg *lcfg); + +enum { + LUSTRE_OPC_MKDIR = 0, + LUSTRE_OPC_SYMLINK = 1, + LUSTRE_OPC_MKNOD = 2, + LUSTRE_OPC_CREATE = 3, + LUSTRE_OPC_ANY = 5, +}; + +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, __u32 opc, void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); +void ll_compute_rootsquash_state(struct ll_sb_info *sbi); +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf); +void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); + +/* Compute expected user md size when passing in a md from user space */ +static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) +{ + switch 
(lum->lmm_magic) { + case LOV_USER_MAGIC_V1: + return sizeof(struct lov_user_md_v1); + case LOV_USER_MAGIC_V3: + return sizeof(struct lov_user_md_v3); + case LOV_USER_MAGIC_SPECIFIC: + if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + return -EINVAL; + + return lov_user_md_size(lum->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + case LOV_USER_MAGIC_COMP_V1: + return ((struct lov_comp_md_v1 *)lum)->lcm_size; + } + + return -EINVAL; +} + +/* llite/llite_nfs.c */ +extern struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); + +/* llite/symlink.c */ +extern struct inode_operations ll_fast_symlink_inode_operations; + +/** + * IO arguments for various VFS I/O interfaces. + */ +struct vvp_io_args { + /** normal/sendfile/splice */ + enum vvp_io_subtype via_io_subtype; + + union { + struct { + struct kiocb *via_iocb; + struct iov_iter *via_iter; + } normal; + struct { + struct pipe_inode_info *via_pipe; + unsigned int via_flags; + } splice; + } u; +}; + +enum lcc_type { + LCC_RW = 1, + LCC_MMAP +}; + +struct ll_cl_context { + struct list_head lcc_list; + void *lcc_cookie; + const struct lu_env *lcc_env; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + enum lcc_type lcc_type; +}; + +struct ll_thread_info { + struct iov_iter lti_iter; + struct vvp_io_args lti_args; + struct ra_io_arg lti_ria; + struct kiocb lti_kiocb; + struct ll_cl_context lti_io_ctx; +}; + +extern struct lu_context_key ll_thread_key; + +static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) +{ + struct ll_thread_info *lti; + + lti = lu_context_key_get(&env->le_ctx, &ll_thread_key); + LASSERT(lti != NULL); + + return lti; +} + +static inline struct vvp_io_args *ll_env_args(const struct lu_env *env, + enum vvp_io_subtype type) +{ + struct vvp_io_args *via = &ll_env_info(env)->lti_args; + + via->via_io_subtype = type; + + return via; +} + +/* llite/llite_mmap.c */ + +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); +int ll_file_mmap(struct file * file, struct vm_area_struct * vma); +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +static inline void ll_invalidate_page(struct page *vmpage) +{ + struct address_space *mapping = vmpage->mapping; + loff_t offset = vmpage->index << PAGE_SHIFT; + + LASSERT(PageLocked(vmpage)); + if (mapping == NULL) + return; + + /* + * truncate_complete_page() calls + * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). 
+ */ + ll_teardown_mmaps(mapping, offset, offset + PAGE_SIZE); + truncate_complete_page(mapping, vmpage); +} + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +// FIXME: replace the name of this with LL_SB to conform to kernel stuff +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + + return fid; +} + +static inline loff_t ll_file_maxbytes(struct inode *inode) +{ + struct cl_object *obj = ll_i2info(inode)->lli_clob; + + if (obj == NULL) + return MAX_LFS_FILESIZE; + + return min_t(loff_t, cl_object_maxbytes(obj), MAX_LFS_FILESIZE); +} + +/* llite/xattr.c */ +extern const struct xattr_handler *ll_xattr_handlers[]; + +#define XATTR_USER_T 1 +#define XATTR_TRUSTED_T 2 +#define XATTR_SECURITY_T 3 +#define XATTR_ACL_ACCESS_T 4 +#define XATTR_ACL_DEFAULT_T 5 +#define XATTR_LUSTRE_T 6 +#define XATTR_OTHER_T 7 + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_xattr_list(struct inode *inode, const char *name, int type, + void *buffer, size_t size, u64 valid); +const struct xattr_handler *get_xattr_type(const char *name); + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); + +enum ras_update_flags { + LL_RAS_HIT = 0x1, + LL_RAS_MMAP = 0x2 +}; +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 512 + +/* XXX: If want to support more concurrent statahead instances, + * please consider to decentralize the RPC lists attached + * on related import, such as imp_{sending,delayed}_list. + * LU-11079 */ +#define LL_SA_RUNNING_MAX 256 +#define LL_SA_RUNNING_DEF 16 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct dentry *sai_dentry; + atomic_t sai_refcount; /* when access this struct, hold + * refcount */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of entry which is the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. 
+ * "sai_miss_hidden" is used for + * the later case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count + */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_agl_valid:1,/* AGL is valid for the dir */ + sai_in_readpage:1;/* statahead is in readdir()*/ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct ptlrpc_thread sai_thread; /* stat-ahead thread */ + struct ptlrpc_thread sai_agl_thread; /* AGL thread */ + struct list_head sai_interim_entries; /* entries which got async + * stat reply, but not + * instantiated */ + struct list_head sai_entries; /* completed entries */ + struct list_head sai_agls; /* AGLs to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int ll_statahead(struct inode *dir, struct dentry **dentry, bool unplug); +void ll_authorize_statahead(struct inode *dir, void *key); +void ll_deauthorize_statahead(struct inode *dir, void *key); + +/* glimpse.c */ +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +static inline int ll_glimpse_size(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = cfs_time_current(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +/* dentry may statahead when statahead is enabled and current process has opened + * parent directory, and this dentry hasn't accessed statahead cache before */ +static inline bool +dentry_may_statahead(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return false; + + lli = ll_i2info(dir); + + /* statahead is not allowed for this dir, there may be three causes: + * 1. dir is not opened. + * 2. statahead hit ratio is too low. + * 3. previous stat started statahead thread failed. */ + if (!lli->lli_sa_enabled) + return false; + + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current_pid()) + return false; + + /* + * When stating a dentry, kernel may trigger 'revalidate' or 'lookup' + * multiple times, eg. for 'getattr', 'getxattr' and etc. + * For patchless client, lookup intent is not accurate, which may + * misguide statahead. For example: + * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will + * have the same intent -- IT_GETATTR, while one dentry should access + * statahead cache once, otherwise statahead windows is messed up. + * The solution is as following: + * Assign 'lld_sa_generation' with 'lli_sa_generation' when a dentry + * IT_GETATTR for the first time, and subsequent IT_GETATTR will + * bypass interacting with statahead cache by checking + * 'lld_sa_generation == lli->lli_sa_generation'. 
+ */ + ldd = ll_d2d(dentry); + if (ldd != NULL && ldd->lld_sa_generation == lli->lli_sa_generation) + return false; + + return true; +} + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode, int ignore_layout); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct inode *inode = file_inode((struct file *)file); + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces, Master MDT, + * where the name entry is, will grant LOOKUP lock, + * remote MDT, where the object is, will grant + * UPDATE|PERM lock. The inode will be attched to both + * LOOKUP and PERM locks, so revoking either locks will + * case the dcache being cleared */ + if (it->it_remote_lock_mode) { + handle.cookie = it->it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID + "(%p) for remote lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, + handle.cookie); + md_set_lock_data(exp, &handle, inode, NULL); + } + + handle.cookie = it->it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)" + " for lock %#llx\n", + PFID(ll_inode2fid(inode)), inode, handle.cookie); + + md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); + it->it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->it_lock_bits; +} + +static inline void ll_lock_dcache(struct inode *inode) +{ +#ifdef HAVE_DCACHE_LOCK + spin_lock(&dcache_lock); +#else + spin_lock(&inode->i_lock); +#endif +} + +static inline void ll_unlock_dcache(struct inode *inode) +{ +#ifdef HAVE_DCACHE_LOCK + spin_unlock(&dcache_lock); +#else + spin_unlock(&inode->i_lock); +#endif +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + return (lld == NULL) || lld->lld_invalid; +} + +static inline void __d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + if (lld != NULL) + lld->lld_invalid = 1; +} + +/* + * Mark dentry INVALID, if dentry refcount is zero (this is normally case for + * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; + * else dput() of the last refcount will unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry, int nested) +{ + CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p " + "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry, + dentry->d_parent, dentry->d_inode, ll_d_count(dentry)); + + spin_lock_nested(&dentry->d_lock, + nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); + __d_lustre_invalidate(dentry); + /* + * We should be careful about dentries created by d_obtain_alias(). + * These dentries are not put in the dentry tree, instead they are + * linked to sb->s_anon through dentry->d_hash. + * shrink_dcache_for_umount() shrinks the tree and sb->s_anon list. + * If we unhashed such a dentry, unmount would not be able to find + * it and busy inodes would be reported. 
+ */ + if (ll_d_count(dentry) == 0 && !(dentry->d_flags & DCACHE_DISCONNECTED)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry) != NULL); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); +int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); +int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end); + +int ll_xattr_init(void); +void ll_xattr_fini(void); + +int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, enum cl_req_type crt); + +int ll_getparent(struct file *file, struct getparent __user *arg); + +/* lcommon_cl.c */ +int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, + unsigned int attr_flags); + +extern struct lu_env *cl_inode_fini_env; +extern __u16 cl_inode_fini_refcheck; + +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); + +u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +u32 cl_fid_build_gen(const struct lu_fid *fid); + +#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c new file mode 100644 index 0000000000000..b1dbb7d0c3175 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -0,0 +1,2841 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + class_uuid_t uuid; + int i; + ENTRY; + + OBD_ALLOC_PTR(sbi); + if (sbi == NULL) + RETURN(NULL); + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + lru_page_max = pages / 2; + + /* initialize ll_cache data */ + sbi->ll_cache = cl_cache_init(lru_page_max); + if (sbi->ll_cache == NULL) { + OBD_FREE(sbi, sizeof(*sbi)); + RETURN(NULL); + } + + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, + SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1; + + ll_generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); + + sbi->ll_flags |= LL_SBI_VERBOSE; +#ifdef ENABLE_CHECKSUM + sbi->ll_flags |= LL_SBI_CHECKSUM; +#endif + +#ifdef HAVE_LRU_RESIZE_SUPPORT + sbi->ll_flags |= LL_SBI_LRU_RESIZE; +#endif + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. + pp_r_hist.oh_lock); + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
+ pp_w_hist.oh_lock); + } + + /* metadata statahead is enabled by default */ + sbi->ll_sa_running_max = LL_SA_RUNNING_DEF; + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_sa_running, 0); + atomic_set(&sbi->ll_agl_total, 0); + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + sbi->ll_flags |= LL_SBI_FAST_READ; + + /* root squash */ + sbi->ll_squash.rsi_uid = 0; + sbi->ll_squash.rsi_gid = 0; + INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); + init_rwsem(&sbi->ll_squash.rsi_sem); + + RETURN(sbi); +} + +static void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + if (sbi != NULL) { + if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) + cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); + if (sbi->ll_cache != NULL) { + cl_cache_decref(sbi->ll_cache); + sbi->ll_cache = NULL; + } + OBD_FREE(sbi, sizeof(*sbi)); + } + EXIT; +} + +static inline int obd_connect_has_secctx(struct obd_connect_data *data) +{ + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +} + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt, + struct vfsmount *mnt) +{ + struct inode *root = NULL; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + u64 valid; + int size, err, checksum; + ENTRY; + + obd = class_name2obd(md); + if (!obd) { + CERROR("MD %s: not setup or attached\n", md); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(data); + if (data == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(osfs); + if (osfs == NULL) { + OBD_FREE_PTR(data); + RETURN(-ENOMEM); + } + + /* indicate the features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS| + OBD_CONNECT_MAX_EASIZE | + OBD_CONNECT_FLOCK_DEAD | + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | + OBD_CONNECT_OPEN_BY_FID | + OBD_CONNECT_DIR_STRIPE | + OBD_CONNECT_BULK_MBITS | + OBD_CONNECT_SUBTREE | + OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS; + + data->ocd_connect_flags2 = 0; + +#ifdef HAVE_LRU_RESIZE_SUPPORT + if (sbi->ll_flags & LL_SBI_LRU_RESIZE) + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif +#ifdef CONFIG_FS_POSIX_ACL + data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK | + OBD_CONNECT_LARGE_ACL; +#endif + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & MS_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if (sbi->ll_flags & LL_SBI_USER_XATTR) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef MS_NOSEC + /* Setting this indicates we correctly support S_NOSEC (See kernel + * commit 9e1f1de02c2275d7172e18dc4e7c2065777611bf) + */ + sb->s_flags 
|= MS_NOSEC; +#endif + + if (sbi->ll_flags & LL_SBI_FLOCK) + sbi->ll_fop = &ll_file_operations_flock; + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + sbi->ll_fop = &ll_file_operations; + else + sbi->ll_fop = &ll_file_operations_noflock; + + /* always ping even if server suppress_pings */ + if (sbi->ll_flags & LL_SBI_ALWAYS_PING) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to complete," + " abort, or time out.\n", md); + GOTO(out, err); + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + GOTO(out, err); + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, " + "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md, err); + } + + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * avaible */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_FOR_MDT0); + if (err) + GOTO(out_md_fid, err); + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + OBD_ALLOC_WAIT(buf, PAGE_SIZE); + obd_connect_flags2str(buf, PAGE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, 0, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " + "feature(s) needed for correct operation " + "of this client (%s). 
Please upgrade " + "server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_SIZE); + GOTO(out_md_fid, err = -EPROTO); + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md_fid, err); + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_mnt.mnt = current->fs->root.mnt; + + if ((sbi->ll_flags & LL_SBI_USER_XATTR) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef MS_POSIXACL + sb->s_flags |= MS_POSIXACL; +#endif + sbi->ll_flags |= LL_SBI_ACL; + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef MS_POSIXACL + sb->s_flags &= ~MS_POSIXACL; +#endif + sbi->ll_flags &= ~LL_SBI_ACL; + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + sbi->ll_flags |= LL_SBI_64BIT_HASH; + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) + sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; + + if (obd_connect_has_secctx(data)) + sbi->ll_flags |= LL_SBI_FILE_SECCTX; + + if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { + if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { + LCONSOLE_INFO("%s: disabling xattr cache due to " + "unknown maximum xattr size.\n", dt); + } else if (!sbi->ll_xattr_cache_set) { + /* If xattr_cache is already set (no matter 0 or 1) + * during processing llog, it won't be enabled here. */ + sbi->ll_flags |= LL_SBI_XATTR_CACHE; + sbi->ll_xattr_cache_enabled = 1; + } + } + + obd = class_name2obd(dt); + if (!obd) { + CERROR("DT %s: not setup or attached\n", dt); + GOTO(out_md_fid, err = -ENODEV); + } + + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_AT | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | + OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | + OBD_CONNECT_BULK_MBITS; + + data->ocd_connect_flags2 = 0; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) + data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; + + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /proc. 
As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = cksum_types_supported_client(); + +#ifdef HAVE_LRU_RESIZE_SUPPORT + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#endif + /* always ping even if server suppress_pings */ + if (sbi->ll_flags & LL_SBI_ALWAYS_PING) + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d " + "ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, + NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to " + "complete, abort, or time out.\n", dt); + GOTO(out_md, err); + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + GOTO(out_md, err); + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + /* Don't change value if it was specified in the config log */ + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX, + (data->ocd_brw_size >> PAGE_SHIFT)); + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, " + "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_dt, err); + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_get_root(sbi->ll_md_exp, get_mount_fileset(sb), + &sbi->ll_root_fid); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_lock_cn_cb, err); + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + GOTO(out_lock_cn_cb, err = -EINVAL); + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; +#ifdef HAVE_XATTR_HANDLER_FLAGS + sb->s_xattr = ll_xattr_handlers; +#endif +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif + + /* make root inode + * XXX: move this to after cbd setup? 
*/ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE; + if (sbi->ll_flags & LL_SBI_ACL) + valid |= OBD_MD_FLACL; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out_lock_cn_cb, err = -ENOMEM); + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_lock_cn_cb, err); + } + + err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + GOTO(out_lock_cn_cb, err); + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, + sbi->ll_flags & LL_SBI_32BIT_API), + &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + ptlrpc_req_finished(request); + + if (IS_ERR(root)) { +#ifdef CONFIG_FS_POSIX_ACL + if (lmd.posix_acl) { + posix_acl_release(lmd.posix_acl); + lmd.posix_acl = NULL; + } +#endif + err = IS_ERR(root) ? PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("lustre_lite: bad iget4 for root\n"); + GOTO(out_root, err); + } + + checksum = sbi->ll_flags & LL_SBI_CHECKSUM; + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(checksum), &checksum, + NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } + cl_sb_init(sb); + + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, sizeof(*sbi->ll_cache), + sbi->ll_cache, NULL); + if (err) { + CERROR("%s: Set cache_set failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + CERROR("%s: can't make root dentry\n", + ll_get_fsname(sb, NULL, 0)); + GOTO(out_root, err = -ENOMEM); + } +#ifdef HAVE_DCACHE_LOCK + sb->s_root->d_op = &ll_d_ops; +#endif + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison. 
*/ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + if (sbi->ll_proc_root != NULL) { + err = lprocfs_ll_register_obd(sb, dt); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + dt, ll_get_fsname(sb, NULL, 0), err); + err = 0; + } + err = lprocfs_ll_register_obd(sb, md); + if (err < 0) { + CERROR("%s: could not register %s in llite: rc = %d\n", + md, ll_get_fsname(sb, NULL, 0), err); + err = 0; + } + } + + RETURN(err); +out_root: + if (root) + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(*lmmsize); + rc = obd_get_info(NULL, sbi->ll_dt_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc != 0) { + CERROR("%s: cannot get max LOV EA size: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, rc); + RETURN(rc); + } + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get max mdsize error rc %d\n", rc); + + RETURN(rc); +} + +/** + * Get the value of the default_easize parameter. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[out] lmmsize pointer to storage location for value + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, &size, lmmsize); + if (rc) + CERROR("Get default mdsize error rc %d\n", rc); + + RETURN(rc); +} + +/** + * Set the default_easize parameter to the given value. 
+ * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[in] lmmsize the size to set + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) +{ + int rc; + + if (lmmsize < sizeof(struct lov_mds_md) || + lmmsize > OBD_MAX_DEFAULT_EA_SIZE) + return -EINVAL; + + rc = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_DEFAULT_EASIZE), KEY_DEFAULT_EASIZE, + sizeof(int), &lmmsize, NULL); + + RETURN(rc); +} + +static void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + cl_sb_fini(sb); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + + lprocfs_ll_unregister_mountpoint(sbi); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + + EXIT; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + ENTRY; + + /* not init sb ?*/ + if (!(sb->s_flags & MS_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need restore s_dev from changed for clustred NFS before put_super + * because new kernels have cached s_dev and change sb->s_dev in + * put_super not affected real removing devices */ + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + sbi->ll_umounting = 1; + + /* wait running statahead threads to quit */ + while (atomic_read(&sbi->ll_sa_running) > 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC >> 3)); + } + } + + EXIT; +} + +static inline int ll_set_opt(const char *opt, char *data, int fl) +{ + if (strncmp(opt, data, strlen(opt)) != 0) + return(0); + else + return(fl); +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, int *flags) +{ + int tmp; + char *s1 = options, *s2; + ENTRY; + + if (!options) + RETURN(0); + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("context", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("fscontext", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("defcontext", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("rootcontext", s1, 1); + if (tmp) + goto next; + tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags &= ~tmp; + goto next; + } + + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + 
*flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("always_ping", s1, LL_SBI_ALWAYS_PING); + if (tmp) { + *flags |= tmp; + goto next; + } + LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", + s1); + RETURN(-EINVAL); + +next: + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) + break; + s1 = s2 + 1; + } + RETURN(0); +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + spin_lock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + spin_lock_init(&lli->lli_layout_lock); + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + lli->lli_clob = NULL; + + init_rwsem(&lli->lli_xattrs_list_rwsem); + mutex_init(&lli->lli_xattrs_enq_lock); + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + mutex_init(&lli->lli_readdir_mutex); + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + lli->lli_def_stripe_offset = -1; + } else { + mutex_init(&lli->lli_size_mutex); + lli->lli_symlink_name = NULL; + init_rwsem(&lli->lli_trunc_sem); + range_lock_tree_init(&lli->lli_write_tree); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = 0; + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + } + mutex_init(&lli->lli_layout_mutex); + memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE); +} + +#ifndef HAVE_SUPER_SETUP_BDI_NAME + +#define LSI_BDI_INITIALIZED 0x00400000 + +#ifndef HAVE_BDI_CAP_MAP_COPY +# define BDI_CAP_MAP_COPY 0 +#endif + +#define MAX_STRING_SIZE 128 + +static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
+{ + struct lustre_sb_info *lsi = s2lsi(sb); + char buf[MAX_STRING_SIZE]; + va_list args; + int err; + + err = bdi_init(&lsi->lsi_bdi); + if (err) + return err; + + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.name = "lustre"; + va_start(args, fmt); + vsnprintf(buf, MAX_STRING_SIZE, fmt, args); + va_end(args); + err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf); + va_end(args); + if (!err) + sb->s_bdi = &lsi->lsi_bdi; + + return err; +} +#endif /* !HAVE_SUPER_SETUP_BDI_NAME */ + +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; + int md_len = 0; + int dt_len = 0; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + RETURN(-ENOMEM); + + try_module_get(THIS_MODULE); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (!sbi) { + module_put(THIS_MODULE); + OBD_FREE_PTR(cfg); + RETURN(-ENOMEM); + } + + err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); + if (err) + GOTO(out_free, err); + + err = super_setup_bdi_name(sb, "lustre-%p", sb); + if (err) + GOTO(out_free, err); + +#ifndef HAVE_DCACHE_LOCK + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + sb->s_d_op = &ll_d_ops; +#endif + + /* Call lprocfs_ll_register_mountpoint() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. */ + if (proc_lustre_fs_root != NULL) { + err = lprocfs_ll_register_mountpoint(proc_lustre_fs_root, sb); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: " + "rc = %d\n", ll_get_fsname(sb, NULL, 0), err); + err = 0; + } + } + + /* Generate a string unique to this super, in case some joker tries + to mount the same fs at two mount points. + Use the address of the super itself.*/ + cfg->cfg_instance = sb; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) + GOTO(out_proc, err); + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" + " read from the MGS. 
Does that filesystem " + "exist?\n", profilenm); + GOTO(out_proc, err = -EINVAL); + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + dt_len = strlen(lprof->lp_dt) + instlen + 2; + OBD_ALLOC(dt, dt_len); + if (!dt) + GOTO(out_proc, err = -ENOMEM); + snprintf(dt, dt_len - 1, "%s-%p", lprof->lp_dt, cfg->cfg_instance); + + md_len = strlen(lprof->lp_md) + instlen + 2; + OBD_ALLOC(md, md_len); + if (!md) + GOTO(out_proc, err = -ENOMEM); + snprintf(md, md_len - 1, "%s-%p", lprof->lp_md, cfg->cfg_instance); + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt, mnt); + if (err < 0) + GOTO(out_proc, err); + + sbi->ll_client_common_fill_super_succeeded = 1; + +out_proc: + if (err < 0) + lprocfs_ll_unregister_mountpoint(sbi); +out_free: + if (md) + OBD_FREE(md, md_len); + if (dt) + OBD_FREE(dt, dt_len); + if (lprof != NULL) + class_put_profile(lprof); + if (err) + ll_put_super(sb); + else if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Mounted %s\n", profilenm); + + OBD_FREE_PTR(cfg); + RETURN(err); +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg, params_cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + long ccc_count; + int next, force = 1, rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); + + cfg.cfg_instance = sb; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = sb; + lustre_end_log(sb, PARAMS_FILENAME, &params_cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* Wait for unstable pages to be committed to stable storage */ + if (force == 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sbi->ll_cache->ccc_unstable_waitq, + atomic_long_read(&sbi->ll_cache->ccc_unstable_nr) == 0, + &lwi); + } + + ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); + if (force == 0 && rc != -EINTR) + LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); + + + /* We need to set force before the lov_disconnect in + lustre_common_put_super, since l_d cleans up osc's as well. */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_client_common_fill_super_succeeded) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { + class_manual_cleanup(obd); + } + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Unmounted %s\n", profilenm ?
profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + +#ifndef HAVE_SUPER_SETUP_BDI_NAME + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } +#endif + + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; + + lustre_common_put_super(sb); + + cl_env_cache_purge(~0); + + module_put(THIS_MODULE); + + EXIT; +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info * lli; + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is " + "bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_dir_clear_lsm_md(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(S_ISDIR(inode->i_mode)); + + if (lli->lli_lsm_md != NULL) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } +} + +static struct inode *ll_iget_anon_dir(struct super_block *sb, + const struct lu_fid *fid, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct mdt_body *body = md->body; + struct inode *inode; + ino_t ino; + ENTRY; + + ino = cl_fid_build_ino(fid, sbi->ll_flags & LL_SBI_32BIT_API); + inode = iget_locked(sb, ino); + if (inode == NULL) { + CERROR("%s: failed get simple inode "DFID": rc = -ENOENT\n", + ll_get_fsname(sb, NULL, 0), PFID(fid)); + RETURN(ERR_PTR(-ENOENT)); + } + + if (inode->i_state & I_NEW) { + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode "DFID"\n", + PFID(fid)); + + LTIME_S(inode->i_mtime) = 0; + LTIME_S(inode->i_atime) = 0; + LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = + &s2lsi(inode->i_sb)->lsi_bdi; +#endif + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + lli->lli_fid = *fid; + ll_lli_init(lli); + + LASSERT(lsm != NULL); + /* master object FID */ + lli->lli_pfid = body->mbo_fid1; + CDEBUG(D_INODE, "lli %p slave "DFID" master "DFID"\n", + lli, PFID(fid), PFID(&lli->lli_pfid)); + unlock_new_inode(inode); + } + + RETURN(inode); +} + +static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct lu_fid *fid; + struct lmv_stripe_md *lsm = md->lmv; + int i; + + LASSERT(lsm != NULL); + /* XXX sigh, this lsm_root initialization should be in + * LMV layer, but it needs ll_iget right now, so we + * put this here right now. */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + fid = &lsm->lsm_md_oinfo[i].lmo_fid; + LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL); + /* Unfortunately ll_iget will call ll_update_inode, + * where the initialization of slave inode is slightly + * different, so it reset lsm_md to NULL to avoid + * initializing lsm for slave inode. 
*/ + /* For migrating inode, master stripe and master object will + * be same, so we only need assign this inode */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && i == 0) + lsm->lsm_md_oinfo[i].lmo_root = inode; + else + lsm->lsm_md_oinfo[i].lmo_root = + ll_iget_anon_dir(inode->i_sb, fid, md); + + if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { + int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); + + lsm->lsm_md_oinfo[i].lmo_root = NULL; + return rc; + } + } + + return 0; +} + +static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, + const struct lmv_stripe_md *lsm_md2) +{ + return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && + lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && + lsm_md1->lsm_md_master_mdt_index == + lsm_md2->lsm_md_master_mdt_index && + lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && + lsm_md1->lsm_md_layout_version == + lsm_md2->lsm_md_layout_version && + strcmp(lsm_md1->lsm_md_pool_name, + lsm_md2->lsm_md_pool_name) == 0; +} + +static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + int rc; + ENTRY; + + LASSERT(S_ISDIR(inode->i_mode)); + CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, + PFID(ll_inode2fid(inode))); + + /* no striped information from request. */ + if (lsm == NULL) { + if (lli->lli_lsm_md == NULL) { + RETURN(0); + } else if (lli->lli_lsm_md->lsm_md_hash_type & + LMV_HASH_FLAG_MIGRATION) { + /* migration is done, the temporay MIGRATE layout has + * been removed */ + CDEBUG(D_INODE, DFID" finish migration.\n", + PFID(ll_inode2fid(inode))); + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + RETURN(0); + } else { + /* The lustre_md from req does not include stripeEA, + * see ll_md_setattr */ + RETURN(0); + } + } + + /* set the directory layout */ + if (lli->lli_lsm_md == NULL) { + struct cl_attr *attr; + + rc = ll_init_lsm_md(inode, md); + if (rc != 0) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md + * will not free this lsm */ + md->lmv = NULL; + lli->lli_lsm_md = lsm; + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, + ll_md_blocking_ast); + if (rc != 0) { + OBD_FREE_PTR(attr); + RETURN(rc); + } + + if (md->body->mbo_valid & OBD_MD_FLNLINK) + md->body->mbo_nlink = attr->cat_nlink; + if (md->body->mbo_valid & OBD_MD_FLSIZE) + md->body->mbo_size = attr->cat_size; + if (md->body->mbo_valid & OBD_MD_FLATIME) + md->body->mbo_atime = attr->cat_atime; + if (md->body->mbo_valid & OBD_MD_FLCTIME) + md->body->mbo_ctime = attr->cat_ctime; + if (md->body->mbo_valid & OBD_MD_FLMTIME) + md->body->mbo_mtime = attr->cat_mtime; + + OBD_FREE_PTR(attr); + + CDEBUG(D_INODE, "Set lsm %p magic %x to "DFID"\n", lsm, + lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); + RETURN(0); + } + + /* Compare the old and new stripe information */ + if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { + struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; + int idx; + + CERROR("%s: inode "DFID"(%p)'s lmv layout mismatch (%p)/(%p)" + "magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d" + "hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), + inode, lsm, old_lsm, + lsm->lsm_md_magic, old_lsm->lsm_md_magic, + lsm->lsm_md_stripe_count, + old_lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + old_lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, 
old_lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version, + old_lsm->lsm_md_layout_version, + lsm->lsm_md_pool_name, + old_lsm->lsm_md_pool_name); + + for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { + CERROR("%s: sub FIDs in old lsm idx %d, old: "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), idx, + PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); + } + + for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { + CERROR("%s: sub FIDs in new lsm idx %d, new: "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), idx, + PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); + } + + RETURN(-EIO); + } + + RETURN(0); +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } + + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + ll_xattr_cache_destroy(inode); + +#ifdef CONFIG_FS_POSIX_ACL + forget_all_cached_acls(inode); + if (lli->lli_posix_acl) { + LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +#endif + lli->lli_inode_magic = LLI_INODE_DEAD; + + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + else if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. + */ + cl_inode_fini(inode); + + EXIT; +} + +static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) +{ + struct lustre_md md; + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + int rc, ia_valid; + ENTRY; + + op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &request); + if (rc) { + ptlrpc_req_finished(request); + if (rc == -ENOENT) { + clear_nlink(inode); + /* Unlinked special device node? Or just a race? + * Pretend we done everything. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ia_valid = op_data->op_attr.ia_valid; + op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + } + } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { + CERROR("md_setattr fails: rc = %d\n", rc); + } + RETURN(rc); + } + + rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + RETURN(rc); + } + + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. 
*/ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + if (S_ISREG(inode->i_mode)) + inode_lock(inode); + rc = simple_setattr(dentry, &op_data->op_attr); + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); + op_data->op_attr.ia_valid = ia_valid; + + rc = ll_update_inode(inode, &md); + ptlrpc_req_finished(request); + + RETURN(rc); +} + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't to the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + * + * In case of HSMimport, we only set attr on MDS. + */ +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) +{ + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data = NULL; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, " + "valid %x, hsm_import %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), + inode, i_size_read(inode), attr->ia_size, attr->ia_valid, + hsm_import); + + if (attr->ia_valid & ATTR_SIZE) { + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, attr->ia_size); + if (rc) + RETURN(rc); + + /* The maximum Lustre file size is variable, based on the + * OST maximum object size and number of stripes. This + * needs another check in addition to the VFS check above. 
*/ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE,"file "DFID" too large %llu > %llu\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if ((!uid_eq(current_fsuid(), inode->i_uid)) && + !cfs_capable(CFS_CAP_FOWNER)) + RETURN(-EPERM); + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(attr->ia_valid & ATTR_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { + attr->ia_ctime = current_time(inode); + attr->ia_valid |= ATTR_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = current_time(inode); + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = current_time(inode); + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %llu\n", + LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), + (s64)ktime_get_real_seconds()); + + if (S_ISREG(inode->i_mode)) { + if (attr->ia_valid & ATTR_SIZE) + inode_dio_write_done(inode); + inode_unlock(inode); + } + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + if (!hsm_import && attr->ia_valid & ATTR_SIZE) { + /* If we are changing file size, file content is + * modified, flag it. */ + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + op_data->op_bias |= MDS_DATA_MODIFIED; + ll_file_clear_flag(lli, LLIF_DATA_MODIFIED); + } + + op_data->op_attr = *attr; + + rc = ll_md_setattr(dentry, op_data); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(inode->i_mode) || hsm_import) + GOTO(out, rc = 0); + + if (attr->ia_valid & (ATTR_SIZE | + ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET)) { + /* For truncate and utimes sending attributes to OSTs, setting + * mtime/atime to the past will be performed under PW [0:EOF] + * extent lock (new_size:EOF for truncate). It may seem + * excessive to send mtime/atime updates to OSTs when not + * setting times to past, but it is necessary due to possible + * time de-synchronization between MDT inode and OST objects */ + rc = cl_setattr_ost(lli->lli_clob, attr, 0); + } + + /* If the file was restored, it needs to set dirty flag. + * + * We've already sent MDS_DATA_MODIFIED flag in + * ll_md_setattr() for truncate. However, the MDT refuses to + * set the HS_DIRTY flag on released files, so we have to set + * it again if the file has been restored. Please check how + * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini(). + * + * Please notice that if the file is not released, the previous + * MDS_DATA_MODIFIED has taken effect and usually + * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()). + * This way we can save an RPC for common open + trunc + * operation. 
*/ + if (ll_file_test_and_clear_flag(lli, LLIF_DATA_MODIFIED)) { + struct hsm_state_set hss = { + .hss_valid = HSS_SETMASK, + .hss_setmask = HS_DIRTY, + }; + int rc2; + + rc2 = ll_hsm_state_set(inode, &hss); + /* truncate and write can happen at the same time, so that + * the file can be set modified even though the file is not + * restored from released state, and ll_hsm_state_set() is + * not applicable for the file, and rc2 < 0 is normal in this + * case. */ + if (rc2 < 0) + CDEBUG(D_INFO, DFID "HSM set dirty failed: rc2 = %d\n", + PFID(ll_inode2fid(inode)), rc2); + } + + EXIT; +out: + if (op_data != NULL) + ll_finish_md_op_data(op_data); + + if (S_ISREG(inode->i_mode)) { + inode_lock(inode); + if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) + inode_dio_wait(inode); + /* Once we've got the i_mutex, it's safe to set the S_NOSEC + * flag. ll_update_inode (called from ll_md_setattr), clears + * inode flags, so there is a gap where S_NOSEC is not set. + * This can cause a writer to take the i_mutex unnecessarily, + * but this is safe to do and should be rare. */ + inode_has_no_xattr(inode); + } + + ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? + LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); + + return rc; +} + +int ll_setattr(struct dentry *de, struct iattr *attr) +{ + int mode = de->d_inode->i_mode; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((attr->ia_valid & ATTR_MODE) && + (mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if ((attr->ia_valid & ATTR_MODE) && + ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + /* avoid polluted from ATTR_TIMES_SET, + * projid is not expected to be set here */ + attr->ia_valid &= ~MDS_ATTR_PROJID; + + return ll_setattr_raw(de, attr, false); +} + +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs obd_osfs; + int rc; + ENTRY; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) { + CERROR("md_statfs fails: rc = %d\n", rc); + RETURN(rc); + } + + osfs->os_type = sb->s_magic; + + CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. 
+ */ + if (obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + + RETURN(rc); +} +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + __u64 fsid = huge_encode_dev(sb->s_dev); + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). */ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + sfs->f_fsid.val[0] = (__u32)fsid; + sfs->f_fsid.val[1] = (__u32)(fsid >> 32); + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + mutex_lock(&lli->lli_size_mutex); +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + mutex_unlock(&lli->lli_size_mutex); +} + +int ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc = 0; + + if (body->mbo_valid & OBD_MD_FLEASIZE) { + rc = cl_file_inode_init(inode, md); + if (rc) + return rc; + } + + if (S_ISDIR(inode->i_mode)) { + rc = ll_update_lsm_md(inode, md); + if (rc != 0) + return rc; + } + +#ifdef CONFIG_FS_POSIX_ACL + if (body->mbo_valid & OBD_MD_FLACL) { + spin_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + spin_unlock(&lli->lli_lock); + } +#endif + inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & LL_SBI_32BIT_API); + inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); + + if (body->mbo_valid & OBD_MD_FLATIME) { + if (body->mbo_atime > LTIME_S(inode->i_atime)) + LTIME_S(inode->i_atime) = body->mbo_atime; + lli->lli_atime = body->mbo_atime; + } + + if (body->mbo_valid & OBD_MD_FLMTIME) { + if (body->mbo_mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu " + "to %llu\n", inode->i_ino, + LTIME_S(inode->i_mtime), body->mbo_mtime); + LTIME_S(inode->i_mtime) = body->mbo_mtime; + } + lli->lli_mtime = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME) { + if (body->mbo_ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->mbo_ctime; + lli->lli_ctime = body->mbo_ctime; + } + + /* Clear i_flags to remove S_NOSEC before permissions are updated */ + if (body->mbo_valid & OBD_MD_FLFLAGS) + inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); + if (body->mbo_valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT) | + (body->mbo_mode & ~S_IFMT); + + if (body->mbo_valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode 
& S_IFMT); + + LASSERT(inode->i_mode != 0); + if (S_ISREG(inode->i_mode)) + inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, + LL_MAX_BLKSIZE_BITS); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + if (body->mbo_valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); + if (body->mbo_valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); + if (body->mbo_valid & OBD_MD_FLPROJID) + lli->lli_projid = body->mbo_projid; + if (body->mbo_valid & OBD_MD_FLNLINK) + set_nlink(inode, body->mbo_nlink); + if (body->mbo_valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->mbo_rdev); + + if (body->mbo_valid & OBD_MD_FLID) { + /* FID shouldn't be changed! */ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), + "Trying to change FID "DFID + " to the "DFID", inode "DFID"(%p)\n", + PFID(&lli->lli_fid), PFID(&body->mbo_fid1), + PFID(ll_inode2fid(inode)), inode); + } else { + lli->lli_fid = body->mbo_fid1; + } + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + if (body->mbo_valid & OBD_MD_FLSIZE) { + i_size_write(inode, body->mbo_size); + + CDEBUG(D_VFSTRACE, "inode="DFID", updating i_size %llu\n", + PFID(ll_inode2fid(inode)), + (unsigned long long)body->mbo_size); + + if (body->mbo_valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->mbo_blocks; + } + + if (body->mbo_valid & OBD_MD_TSTATE) { + /* Set LLIF_FILE_RESTORING if restore ongoing and + * clear it when done to ensure to start again + * glimpsing updated attrs + */ + if (body->mbo_t_state & MS_RESTORE) + ll_file_set_flag(lli, LLIF_FILE_RESTORING); + else + ll_file_clear_flag(lli, LLIF_FILE_RESTORING); + } + + return 0; +} + +int ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. */ + LTIME_S(inode->i_mtime) = 0; + LTIME_S(inode->i_atime) = 0; + LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; + rc = ll_update_inode(inode, md); + if (rc != 0) + RETURN(rc); + + /* OIDEBUG(inode); */ + +#ifdef HAVE_BACKING_DEV_INFO + /* initializing backing dev info. 
*/ + inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; +#endif + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + EXIT; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + EXIT; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + EXIT; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + + EXIT; + } + + return 0; +} + +void ll_delete_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) + /* It is last chance to write out dirty pages, + * otherwise we may lose data while umount */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); + + truncate_inode_pages_final(&inode->i_data); + + LASSERTF(inode->i_data.nrpages == 0, "inode="DFID"(%p) nrpages=%lu, " + "see https://jira.hpdd.intel.com/browse/LU-118\n", + PFID(ll_inode2fid(inode)), inode, inode->i_data.nrpages); + +#ifdef HAVE_SBOPS_EVICT_INODE + ll_clear_inode(inode); +#endif + clear_inode(inode); + + EXIT; +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; + + switch(cmd) { + case FSFILT_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("%s: failure inode "DFID": rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(ll_inode2fid(inode)), rc); + RETURN(-abs(rc)); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->mbo_flags; + + ptlrpc_req_finished(req); + + RETURN(put_user(flags, (int __user *)arg)); + } + case FSFILT_IOC_SETFLAGS: { + struct iattr *attr; + struct md_op_data *op_data; + struct cl_object *obj; + + if (get_user(flags, (int __user *)arg)) + RETURN(-EFAULT); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_attr_flags = flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + inode->i_flags = ll_ext_to_inode_flags(flags); + + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + RETURN(0); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + attr->ia_valid = ATTR_ATTR_FLAG; + rc = cl_setattr_ost(obj, attr, flags); + + OBD_FREE_PTR(attr); + RETURN(rc); + } + default: + RETURN(-ENOSYS); + } + + RETURN(0); +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", + from_kuid(&init_user_ns, current_uid())); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + 
+/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + struct l_wait_info lwi; + wait_queue_head_t waitq; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle %#llx\n", + sbi->ll_md_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle %#llx\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + OBD_ALLOC_PTR(ioc_data); + if (ioc_data) { + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof *ioc_data, ioc_data, NULL); + + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof *ioc_data, ioc_data, NULL); + + OBD_FREE_PTR(ioc_data); + } + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just periodically checking for vfs + * to decrement mnt_cnt and hope to finish it within 10sec. + */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(10), + cfs_time_seconds(1), NULL, NULL); + l_wait_event(waitq, may_umount(sbi->ll_mnt.mnt), &lwi); + + EXIT; +} + +int ll_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } + + if (read_only) + sb->s_flags |= MS_RDONLY; + else + sb->s_flags &= ~MS_RDONLY; + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? "read-only" : "read-write"); + } + return 0; +} + +/** + * Cleanup the open handle that is cached on MDT-side. + * + * For open case, the client side open handling thread may hit error + * after the MDT grant the open. Under such case, the client should + * send close RPC to the MDT as cleanup; otherwise, the open handle + * on the MDT will be leaked there until the client umount or evicted. + * + * In further, if someone unlinked the file, because the open handle + * holds the reference on such file/object, then it will block the + * subsequent threads that want to locate such object via FID. 
+ * + * \param[in] sb super block for this file-system + * \param[in] open_req pointer to the original open request + */ +void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) +{ + struct mdt_body *body; + struct md_op_data *op_data; + struct ptlrpc_request *close_req = NULL; + struct obd_export *exp = ll_s2sbi(sb)->ll_md_exp; + ENTRY; + + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + CWARN("%s: cannot allocate op_data to release open handle for " + DFID"\n", + ll_get_fsname(sb, NULL, 0), PFID(&body->mbo_fid1)); + + RETURN_EXIT; + } + + op_data->op_fid1 = body->mbo_fid1; + op_data->op_handle = body->mbo_handle; + op_data->op_mod_time = ktime_get_real_seconds(); + md_close(exp, op_data, NULL, &close_req); + ptlrpc_req_finished(close_req); + ll_finish_md_op_data(op_data); + + EXIT; +} + +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md = { NULL }; + int rc; + ENTRY; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc != 0) + GOTO(cleanup, rc); + + if (*inode) { + rc = ll_update_inode(*inode, &md); + if (rc != 0) + GOTO(out, rc); + } else { + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. + */ + if (!fid_is_sane(&md.body->mbo_fid1)) { + CERROR("%s: Fid is insane "DFID"\n", + ll_get_fsname(sb, NULL, 0), + PFID(&md.body->mbo_fid1)); + GOTO(out, rc = -EINVAL); + } + + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->mbo_fid1, + sbi->ll_flags & LL_SBI_32BIT_API), + &md); + if (IS_ERR(*inode)) { +#ifdef CONFIG_FS_POSIX_ACL + if (md.posix_acl) { + posix_acl_release(md.posix_acl); + md.posix_acl = NULL; + } +#endif + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + GOTO(out, rc); + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. proc1: to apply a stale layout */ + if (it != NULL && it->it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_layout = md.layout; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + + GOTO(out, rc = 0); + +out: + md_free_lustre_md(sbi->ll_md_exp, &md); + +cleanup: + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) + ll_open_cleanup(sb != NULL ? 
sb : (*inode)->i_sb, req); + + return rc; +} + +int ll_obd_statfs(struct inode *inode, void __user *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + char *buf = NULL; + struct obd_ioctl_data *data = NULL; + __u32 type; + int len = 0, rc; + + if (!inode || !(sbi = ll_i2sbi(inode))) + GOTO(out_statfs, rc = -EINVAL); + + rc = obd_ioctl_getdata(&buf, &len, arg); + if (rc) + GOTO(out_statfs, rc); + + data = (void*)buf; + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) + GOTO(out_statfs, rc = -EINVAL); + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) + GOTO(out_statfs, rc = -EINVAL); + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else + GOTO(out_statfs, rc = -ENODEV); + + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, NULL); + if (rc) + GOTO(out_statfs, rc); +out_statfs: + OBD_FREE_LARGE(buf, len); + return rc; +} + +int ll_process_config(struct lustre_cfg *lcfg) +{ + struct super_block *sb; + unsigned long x; + int rc = 0; + char *ptr; + + /* The instance name contains the sb: lustre-client-aacfe000 */ + ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!ptr || !*(++ptr)) + return -EINVAL; + if (sscanf(ptr, "%lx", &x) != 1) + return -EINVAL; + sb = (struct super_block *)x; + /* This better be a real Lustre superblock! */ + LASSERT(s2lsi(sb)->lsi_lmd->lmd_magic == LMD_MAGIC); + + /* Note we have not called client_common_fill_super yet, so + proc fns must be able to handle that! */ + rc = class_process_proc_param(PARAM_LLITE, lprocfs_llite_obd_vars, + lcfg, sb); + if (rc > 0) + rc = 0; + return rc; +} + +/* this function prepares md_op_data hint for passing it down to MD stack. */ +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, size_t namelen, + __u32 mode, __u32 opc, void *data) +{ + LASSERT(i1 != NULL); + + if (name == NULL) { + /* Do not reuse namelen for something else. 
*/ + if (namelen != 0) + return ERR_PTR(-EINVAL); + } else { + if (namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + if (!lu_name_is_valid_2(name, namelen)) + return ERR_PTR(-EINVAL); + } + + if (op_data == NULL) + OBD_ALLOC_PTR(op_data); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + op_data->op_fid1 = *ll_inode2fid(i1); + op_data->op_default_stripe_offset = -1; + if (S_ISDIR(i1->i_mode)) { + op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; + if (opc == LUSTRE_OPC_MKDIR) + op_data->op_default_stripe_offset = + ll_i2info(i1)->lli_def_stripe_offset; + } + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + if (S_ISDIR(i2->i_mode)) + op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } else { + fid_zero(&op_data->op_fid2); + } + + if (ll_i2sbi(i1)->ll_flags & LL_SBI_64BIT_HASH) + op_data->op_cli_flags |= CLI_HASH64; + + if (ll_need_32bit_api(ll_i2sbi(i1))) + op_data->op_cli_flags |= CLI_API32; + + op_data->op_name = name; + op_data->op_namelen = namelen; + op_data->op_mode = mode; + op_data->op_mod_time = cfs_time_current_sec(); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, &op_data->op_mds)) { + op_data->op_bias |= MDS_CREATE_VOLATILE; + } else { + op_data->op_mds = 0; + } + op_data->op_data = data; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + security_release_secctx(op_data->op_file_secctx, + op_data->op_file_secctx_size); + OBD_FREE_PTR(op_data); +} + +#ifdef HAVE_SUPEROPS_USE_DENTRY +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +#else +int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) +#endif +{ + struct ll_sb_info *sbi; + +#ifdef HAVE_SUPEROPS_USE_DENTRY + LASSERT((seq != NULL) && (dentry != NULL)); + sbi = ll_s2sbi(dentry->d_sb); +#else + LASSERT((seq != NULL) && (vfs != NULL)); + sbi = ll_s2sbi(vfs->mnt_sb); +#endif + + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); + + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + + if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); + + if (sbi->ll_flags & LL_SBI_USER_FID2PATH) + seq_puts(seq, ",user_fid2path"); + + if (sbi->ll_flags & LL_SBI_ALWAYS_PING) + seq_puts(seq, ",always_ping"); + + RETURN(0); +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + ENTRY; + + if (cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + RETURN(-EINVAL); + + if (!obd) + RETURN(-ENOENT); + + if (copy_to_user((void __user *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + RETURN(-EFAULT); + + RETURN(0); +} + +/** + * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the + * fsname will be returned in this buffer; otherwise, a static buffer will be + * used to store the fsname and returned to caller. 
+ */ +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) +{ + static char fsname_static[MTI_NAME_MAXLEN]; + struct lustre_sb_info *lsi = s2lsi(sb); + char *ptr; + int len; + + if (buf == NULL) { + /* this means the caller wants to use static buffer + * and it doesn't care about race. Usually this is + * in error reporting path */ + buf = fsname_static; + buflen = sizeof(fsname_static); + } + + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (unlikely(len >= buflen)) + len = buflen - 1; + strncpy(buf, lsi->lsi_lmd->lmd_profile, len); + buf[len] = '\0'; + + return buf; +} + +static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize) +{ + char *path = NULL; + + struct path p; + + p.dentry = dentry; + p.mnt = current->fs->root.mnt; + path_get(&p); + path = d_path(&p, buf, bufsize); + path_put(&p); + return path; +} + +void ll_dirty_page_discard_warn(struct page *page, int ioret) +{ + char *buf, *path = NULL; + struct dentry *dentry = NULL; + struct inode *inode = page->mapping->host; + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + buf = (char *)__get_free_page(GFP_ATOMIC); + if (buf != NULL) { + dentry = d_find_alias(page->mapping->host); + if (dentry != NULL) + path = ll_d_path(dentry, buf, PAGE_SIZE); + } + + CDEBUG(D_WARNING, + "%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted " + "(rc %d)\n", ll_get_fsname(page->mapping->host->i_sb, NULL, 0), + s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, + PFID(ll_inode2fid(inode)), + (path && !IS_ERR(path)) ? path : "", ioret); + + if (dentry != NULL) + dput(dentry); + + if (buf != NULL) + free_page((unsigned long)buf); +} + +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf) +{ + struct lov_user_md lum; + ssize_t lum_size; + ENTRY; + + if (copy_from_user(&lum, md, sizeof(lum))) + RETURN(-EFAULT); + + lum_size = ll_lov_user_md_size(&lum); + if (lum_size < 0) + RETURN(lum_size); + + OBD_ALLOC(*kbuf, lum_size); + if (*kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(*kbuf, md, lum_size) != 0) { + OBD_FREE(*kbuf, lum_size); + RETURN(-EFAULT); + } + + RETURN(lum_size); +} + +/* + * Compute llite root squash state after a change of root squash + * configuration setting or add/remove of a lnet nid + */ +void ll_compute_rootsquash_state(struct ll_sb_info *sbi) +{ + struct root_squash_info *squash = &sbi->ll_squash; + int i; + bool matched; + struct lnet_process_id id; + + /* Update norootsquash flag */ + down_write(&squash->rsi_sem); + if (list_empty(&squash->rsi_nosquash_nids)) + sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; + else { + /* Do not apply root squash as soon as one of our NIDs is + * in the nosquash_nids list */ + matched = false; + i = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { + matched = true; + break; + } + } + if (matched) + sbi->ll_flags |= LL_SBI_NOROOTSQUASH; + else + sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; + } + up_write(&squash->rsi_sem); +} + +/** + * Parse linkea content to extract information about a given hardlink + * + * \param[in] ldata - Initialized linkea data + * \param[in] linkno - Link identifier + * \param[out] parent_fid - The entry's parent FID + * \param[out] ln - Entry name destination buffer + * + * \retval 0 on success + * \retval Appropriate negative error code on failure + */ +static int 
ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno, + struct lu_fid *parent_fid, struct lu_name *ln) +{ + unsigned int idx; + int rc; + ENTRY; + + rc = linkea_init_with_rec(ldata); + if (rc < 0) + RETURN(rc); + + if (linkno >= ldata->ld_leh->leh_reccount) + /* beyond last link */ + RETURN(-ENODATA); + + linkea_first_entry(ldata); + for (idx = 0; ldata->ld_lee != NULL; idx++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln, + parent_fid); + if (idx == linkno) + break; + + linkea_next_entry(ldata); + } + + if (idx < linkno) + RETURN(-ENODATA); + + RETURN(0); +} + +/** + * Get parent FID and name of an identified link. Operation is performed for + * a given link number, letting the caller iterate over linkno to list one or + * all links of an entry. + * + * \param[in] file - File descriptor against which to perform the operation + * \param[in,out] arg - User-filled structure containing the linkno to operate + * on and the available size. It is eventually filled with + * the requested information or left untouched on error + * + * \retval - 0 on success + * \retval - Appropriate negative error code on failure + */ +int ll_getparent(struct file *file, struct getparent __user *arg) +{ + struct inode *inode = file_inode(file); + struct linkea_data *ldata; + struct lu_buf buf = LU_BUF_NULL; + struct lu_name ln; + struct lu_fid parent_fid; + __u32 linkno; + __u32 name_size; + int rc; + + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + + if (get_user(name_size, &arg->gp_name_size)) + RETURN(-EFAULT); + + if (get_user(linkno, &arg->gp_linkno)) + RETURN(-EFAULT); + + if (name_size > PATH_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(ldata, sizeof(*ldata)); + if (ldata == NULL) + RETURN(-ENOMEM); + + rc = linkea_data_new(ldata, &buf); + if (rc < 0) + GOTO(ldata_free, rc); + +#ifdef HAVE_XATTR_HANDLER_FLAGS + rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, + buf.lb_len, OBD_MD_FLXATTR); +#else + rc = ll_getxattr(file_dentry(file), XATTR_NAME_LINK, buf.lb_buf, + buf.lb_len); +#endif /* HAVE_XATTR_HANDLER_FLAGS */ + if (rc < 0) + GOTO(lb_free, rc); + + rc = ll_linkea_decode(ldata, linkno, &parent_fid, &ln); + if (rc < 0) + GOTO(lb_free, rc); + + if (ln.ln_namelen >= name_size) + GOTO(lb_free, rc = -EOVERFLOW); + + if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) + GOTO(lb_free, rc = -EFAULT); + + if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + + if (put_user('\0', arg->gp_name + ln.ln_namelen)) + GOTO(lb_free, rc = -EFAULT); + +lb_free: + lu_buf_free(&buf); +ldata_free: + OBD_FREE(ldata, sizeof(*ldata)); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c new file mode 100644 index 0000000000000..0fff9b9663a9f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -0,0 +1,511 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +static const struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, + unsigned long addr, size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) + + (vma->vm_pgoff << PAGE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~PAGE_MASK; +} + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + ENTRY; + + /* mmap_sem must have been held by caller. */ + LASSERT(!down_write_trylock(&mm->mmap_sem)); + + for(vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + RETURN(ret); +} + +/** + * API independent part for page fault initialization. + * \param env - corespondent lu_env to processing + * \param vma - virtual memory area addressed to page fault + * \param index - page index corespondent to fault. + * \parm ra_flags - vma readahead flags. + * + * \return error codes from cl_io_init. + */ +static struct cl_io * +ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, + pgoff_t index, unsigned long *ra_flags) +{ + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct cl_io *io; + struct cl_fault_io *fio; + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(ERR_PTR(-EOPNOTSUPP)); + +restart: + io = vvp_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags&VM_EXEC; + + /* + * disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. + */ + if (ra_flags != NULL) + *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + vma->vm_flags &= ~VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + + CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, + fio->ft_index, fio->ft_executable); + + rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); + if (rc == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(vio->vui_cl.cis_io == io); + + /* mmap lock must be MANDATORY it has to cache + * pages. 
*/ + io->ci_lockreq = CILR_MANDATORY; + vio->vui_fd = fd; + } else { + LASSERT(rc < 0); + cl_io_fini(env, io); + if (io->ci_need_restart) + goto restart; + + io = ERR_PTR(rc); + } + + RETURN(io); +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + int result; + __u16 refcheck; + sigset_t set; + struct inode *inode; + struct ll_inode_info *lli; + ENTRY; + + LASSERT(vmpage != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = ll_fault_io_init(env, vma, vmpage->index, NULL); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result < 0) + GOTO(out_io, result); + + io->u.ci_fault.ft_mkwrite = 1; + io->u.ci_fault.ft_writable = 1; + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + inode = vvp_object_inode(io->ci_obj); + lli = ll_i2info(inode); + + result = cl_io_loop(env, io); + + cfs_restore_sigs(set); + + if (result == 0) { + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) + ll_file_set_flag(lli, LLIF_DATA_MODIFIED); + } + EXIT; + +out_io: + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + LASSERT(ergo(result == 0, PageLocked(vmpage))); + + return result; +} + +static inline int to_fault_error(int result) +{ + switch(result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). 
+ * + * \param vma - is virtiual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + unsigned long ra_flags; + int result = 0; + int fault_ret = 0; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) { + /* do fast fault */ + ll_cl_add(vma->vm_file, env, NULL, LCC_MMAP); + fault_ret = ll_filemap_fault(vma, vmf); + ll_cl_remove(vma->vm_file, env); + + /* - If there is no error, then the page was found in cache and + * uptodate; + * - If VM_FAULT_RETRY is set, the page existed but failed to + * lock. It will return to kernel and retry; + * - Otherwise, it should try normal fault under DLM lock. */ + if ((fault_ret & VM_FAULT_RETRY) || + !(fault_ret & VM_FAULT_ERROR)) + GOTO(out, result = 0); + + fault_ret = 0; + } + + io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.ft_vmf = vmf; + vio->u.fault.ft_flags = 0; + vio->u.fault.ft_flags_valid = 0; + + /* May call ll_readpage() */ + ll_cl_add(vma->vm_file, env, io, LCC_MMAP); + + result = cl_io_loop(env, io); + + ll_cl_remove(vma->vm_file, env); + + /* ft_flags are only valid if we reached + * the call to filemap_fault */ + if (vio->u.fault.ft_flags_valid) + fault_ret = vio->u.fault.ft_flags; + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + put_page(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + + vma->vm_flags |= ra_flags; + +out: + cl_env_put(env, &refcheck); + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result); + RETURN(fault_ret); +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static int ll_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + int result; + sigset_t set; + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause segfault by + * other signals. 
*/ + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_FAULT, 1); + +restart: + result = ll_fault0(vma, vmf); + if (!(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) { + struct page *vmpage = vmf->page; + + /* check if this page has been truncated */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL)) { /* unlucky */ + unlock_page(vmpage); + put_page(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + CWARN("the page is under heavy contention," + "maybe your app(%s) needs revising :-)\n", + current->comm); + printed = true; + } + + goto restart; + } + + result |= VM_FAULT_LOCKED; + } + cfs_restore_sigs(set); + return result; +} + +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +static int ll_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; +#else +static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#endif + int count = 0; + bool printed = false; + bool retry; + int result; + + ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), + LPROC_LL_MKWRITE, 1); + + file_update_time(vma->vm_file); + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + const struct dentry *de = file_dentry(vma->vm_file); + + CWARN("app(%s): the page %lu of file "DFID" is under" + " heavy contention\n", + current->comm, vmf->pgoff, + PFID(ll_inode2fid(de->d_inode))); + printed = true; + } + } while (retry); + + switch(result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in vvp_object::vob_mmap_cnt. + */ +static void ll_vm_open(struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + atomic_inc(&vob->vob_mmap_cnt); + EXIT; +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct vvp_object *vob = cl_inode2vvp(inode); + + ENTRY; + atomic_dec(&vob->vob_mmap_cnt); + LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); + EXIT; +} + +/* XXX put nice comment here. 
talk about __free_pte -> dirty pages and + * nopage's reference passing to the pte */ +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) +{ + int rc = -ENOENT; + ENTRY; + + LASSERTF(last > first, "last %llu first %llu\n", last, first); + if (mapping_mapped(mapping)) { + rc = 0; + unmap_mapping_range(mapping, first + PAGE_SIZE - 1, + last - first + 1, 0); + } + + RETURN(rc); +} + +static const struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct * vma) +{ + struct inode *inode = file_inode(file); + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + rc = ll_glimpse_size(inode); + } + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c new file mode 100644 index 0000000000000..c24f7f6498ba0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -0,0 +1,377 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include "llite_internal.h" +#include + +__u32 get_uuid2int(const char *name, int len) +{ + __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + while (len--) { + __u32 key = key1 + (key0 ^ (*name++ * 7152373)); + if (key & 0x80000000) key -= 0x7fffffff; + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, + ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid); + if (inode) + RETURN(inode); + + rc = ll_get_default_mdsize(sbi, &eadatalen); + if (rc) + RETURN(ERR_PTR(rc)); + + /* Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves */ + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + /* Suppress erroneous/confusing messages when NFS + * is out of sync and requests old data. */ + CDEBUG(D_INFO, "can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + RETURN(ERR_PTR(rc)); + } + rc = ll_prep_inode(&inode, req, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(inode); +} + +struct lustre_nfs_fid { + struct lu_fid lnf_child; + struct lu_fid lnf_parent; +}; + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(ERR_PTR(-ESTALE)); + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + RETURN(ERR_PTR(PTR_ERR(inode))); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + RETURN(ERR_PTR(-ESTALE)); + } + + /* N.B. d_obtain_alias() drops inode ref on error */ + result = d_obtain_alias(inode); + if (!IS_ERR(result)) { + int rc; + + rc = ll_d_init(result); + if (rc < 0) { + dput(result); + result = ERR_PTR(rc); + } else { + struct ll_dentry_data *ldd = ll_d2d(result); + + /* + * Need to signal to the ll_file_open that + * we came from NFS and so opencache needs to be + * enabled for this one + */ + spin_lock(&result->d_lock); + ldd->lld_nfs_dentry = 1; + spin_unlock(&result->d_lock); + } + } + + RETURN(result); +} + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif +#ifndef FILEID_LUSTRE +#define FILEID_LUSTRE 0x97 +#endif + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. 
+ */ +#ifndef HAVE_ENCODE_FH_PARENT +static int ll_encode_fh(struct dentry *de, __u32 *fh, int *plen, + int connectable) +{ + struct inode *inode = de->d_inode; + struct inode *parent = de->d_parent->d_inode; +#else +static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, + struct inode *parent) +{ +#endif + int fileid_len = sizeof(struct lustre_nfs_fid) / 4; + struct lustre_nfs_fid *nfs_fid = (void *)fh; + ENTRY; + + CDEBUG(D_INFO, "%s: encoding for ("DFID") maxlen=%d minlen=%d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), *plen, fileid_len); + + if (*plen < fileid_len) { + *plen = fileid_len; + RETURN(FILEID_INVALID); + } + + nfs_fid->lnf_child = *ll_inode2fid(inode); + if (parent != NULL) + nfs_fid->lnf_parent = *ll_inode2fid(parent); + else + fid_zero(&nfs_fid->lnf_parent); + *plen = fileid_len; + + RETURN(FILEID_LUSTRE); +} + +static int +#ifndef HAVE_FILLDIR_USE_CTX +ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen, + loff_t hash, u64 ino, unsigned type) +{ + struct ll_getname_data *lgd = cookie; +#else +ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t hash, u64 ino, unsigned type) +{ + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); +#endif /* HAVE_FILLDIR_USE_CTX */ + /* It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent'. */ + struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +static int ll_get_name(struct dentry *dentry, char *name, + struct dentry *child) +{ + struct inode *dir = dentry->d_inode; + struct ll_getname_data lgd = { + .lgd_name = name, + .lgd_fid = ll_i2info(child->d_inode)->lli_fid, +#ifdef HAVE_DIR_CONTEXT + .ctx.actor = ll_nfs_get_name_filldir, +#endif + .lgd_found = 0, + }; + struct md_op_data *op_data; + __u64 pos = 0; + int rc; + ENTRY; + + if (!dir || !S_ISDIR(dir->i_mode)) + GOTO(out, rc = -ENOTDIR); + + if (!dir->i_fop) + GOTO(out, rc = -EINVAL); + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + inode_lock(dir); +#ifdef HAVE_DIR_CONTEXT + rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx); +#else + rc = ll_dir_read(dir, &pos, op_data, &lgd, ll_nfs_get_name_filldir); +#endif + inode_unlock(dir); + ll_finish_md_op_data(op_data); + if (!rc && !lgd.lgd_found) + rc = -ENOENT; + EXIT; +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent)); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != FILEID_LUSTRE) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL)); +} + +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) +{ + struct ptlrpc_request *req = NULL; + struct ll_sb_info *sbi; + struct mdt_body *body; + static const char dotdot[] = ".."; + struct md_op_data 
*op_data; + int rc; + int lmmsize; + ENTRY; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "%s: getting parent for ("DFID")\n", + ll_get_fsname(dir->i_sb, NULL, 0), + PFID(ll_inode2fid(dir))); + + rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc != 0) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc != 0) { + CERROR("%s: failure inode "DFID" get parent: rc = %d\n", + ll_get_fsname(dir->i_sb, NULL, 0), + PFID(ll_inode2fid(dir)), rc); + RETURN(rc); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + /* + * LU-3952: MDT may lost the FID of its parent, we should not crash + * the NFS server, ll_iget_for_nfs() will handle the error. + */ + if (body->mbo_valid & OBD_MD_FLID) { + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); + *parent_fid = body->mbo_fid1; + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct lu_fid parent_fid = { 0 }; + int rc; + struct dentry *dentry; + ENTRY; + + rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); + if (rc != 0) + RETURN(ERR_PTR(rc)); + + dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); + + RETURN(dentry); +} + +struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c new file mode 100755 index 0000000000000..7dfb36d2873cd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -0,0 +1,1832 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include + +#include "llite_internal.h" +#include "vvp_internal.h" + +struct proc_dir_entry *proc_lustre_fs_root; + +#ifdef CONFIG_PROC_FS +/* /proc/lustre/llite mount point registration */ +static const struct file_operations ll_rw_extents_stats_fops; +static const struct file_operations ll_rw_extents_stats_pp_fops; +static const struct file_operations ll_rw_offset_stats_fops; +static __s64 ll_stats_pid_write(const char __user *buf, size_t len); + +static int ll_blksize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + return rc; +} +LPROC_SEQ_FOPS_RO(ll_blksize); + +static int ll_stat_blksize_seq_show(struct seq_file *m, void *v) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + seq_printf(m, "%u\n", sbi->ll_stat_blksize); + + return 0; +} + +static ssize_t ll_stat_blksize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + __s64 val; + int rc; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val != 0 && (val < PAGE_SIZE || (val & (val - 1))) != 0) + return -ERANGE; + + sbi->ll_stat_blksize = val; + + return count; +} +LPROC_SEQ_FOPS(ll_stat_blksize); + +static int ll_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytestotal); + +static int ll_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytesfree); + +static int ll_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +LPROC_SEQ_FOPS_RO(ll_kbytesavail); + +static int ll_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filestotal); + +static int 
ll_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filesfree); + +static int ll_client_type_seq_show(struct seq_file *m, void *v) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + LASSERT(sbi != NULL); + + seq_puts(m, "local client\n"); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_client_type); + +static int ll_fstype_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", sb->s_type->name); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_fstype); + +static int ll_sb_uuid_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_sb_uuid); + +static int ll_xattr_cache_seq_show(struct seq_file *m, void *v) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled); + return 0; +} + +static ssize_t ll_xattr_cache_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + __s64 val; + int rc; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; +} +LPROC_SEQ_FOPS(ll_xattr_cache); + +static int ll_site_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. 
+ */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); +} +LPROC_SEQ_FOPS_RO(ll_site_stats); + +static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t +ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + __s64 pages_number; + int rc; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0 || pages_number > totalram_pages / 2) { + /* 1/2 of RAM */ + CERROR("%s: can't set max_readahead_mb=%lu > %luMB\n", + ll_get_fsname(sb, NULL, 0), + (unsigned long)pages_number >> (20 - PAGE_SHIFT), + totalram_pages >> (20 - PAGE_SHIFT + 1)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + return count; +} +LPROC_SEQ_FOPS(ll_max_readahead_mb); + +static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t +ll_max_readahead_per_file_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + __s64 pages_number; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("%s: can't set max_readahead_per_file_mb=%lu > " + "max_read_ahead_mb=%lu\n", ll_get_fsname(sb, NULL, 0), + (unsigned long)pages_number >> (20 - PAGE_SHIFT), + sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + return count; +} +LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb); + +static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); +} + +static ssize_t +ll_max_read_ahead_whole_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + __s64 pages_number; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + 
/* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. */ + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + int pages_shift = 20 - PAGE_SHIFT; + CERROR("%s: can't set max_read_ahead_whole_mb=%lu > " + "max_read_ahead_per_file_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), + (unsigned long)pages_number >> pages_shift, + sbi->ll_ra_info.ra_max_pages_per_file >> pages_shift); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + return count; +} +LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb); + +static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + int shift = 20 - PAGE_SHIFT; + long max_cached_mb; + long unused_mb; + + max_cached_mb = cache->ccc_lru_max >> shift; + unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; + seq_printf(m, "users: %d\n" + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers); + return 0; +} + +static ssize_t +ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + struct lu_env *env; + long diff = 0; + long nrpages = 0; + __u16 refcheck; + __s64 pages_number; + long rc; + char kernbuf[128]; + ENTRY; + + if (count >= sizeof(kernbuf)) + RETURN(-EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + RETURN(rc); + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0 || pages_number > totalram_pages) { + CERROR("%s: can't set max cache more than %lu MB\n", + ll_get_fsname(sb, NULL, 0), + totalram_pages >> (20 - PAGE_SHIFT)); + RETURN(-ERANGE); + } + /* Allow enough cache so clients can make well-formed RPCs */ + pages_number = max_t(long, pages_number, PTLRPC_MAX_BRW_PAGES); + + spin_lock(&sbi->ll_lock); + diff = pages_number - cache->ccc_lru_max; + spin_unlock(&sbi->ll_lock); + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_long_add(diff, &cache->ccc_lru_left); + GOTO(out, rc = 0); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(rc); + + diff = -diff; + while (diff > 0) { + long tmp; + + /* reduce LRU budget from free slots. */ + do { + long ov, nv; + + ov = atomic_long_read(&cache->ccc_lru_left); + if (ov == 0) + break; + + nv = ov > diff ? ov - diff : 0; + rc = atomic_long_cmpxchg(&cache->ccc_lru_left, ov, nv); + if (likely(ov == rc)) { + diff -= ov - nv; + nrpages += ov - nv; + break; + } + } while (1); + + if (diff <= 0) + break; + + if (sbi->ll_dt_exp == NULL) { /* being initialized */ + rc = -ENODEV; + break; + } + + /* difficult - have to ask OSCs to drop LRU slots. 
*/ + tmp = diff << 1; + rc = obd_set_info_async(env, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + cl_env_put(env, &refcheck); + +out: + if (rc >= 0) { + spin_lock(&sbi->ll_lock); + cache->ccc_lru_max = pages_number; + spin_unlock(&sbi->ll_lock); + rc = count; + } else { + atomic_long_add(nrpages, &cache->ccc_lru_left); + } + return rc; +} +LPROC_SEQ_FOPS(ll_max_cached_mb); + +static int ll_checksum_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); + return 0; +} + +static ssize_t ll_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + int rc; + __s64 val; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val) + sbi->ll_flags |= LL_SBI_CHECKSUM; + else + sbi->ll_flags &= ~LL_SBI_CHECKSUM; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(val), &val, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} +LPROC_SEQ_FOPS(ll_checksum); + +static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) +{ + struct super_block *sb = m->private; + + if (ll_s2sbi(sb)->ll_stats_track_type == type) { + seq_printf(m, "%d\n", + ll_s2sbi(sb)->ll_stats_track_id); + } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) { + seq_puts(m, "0 (all)\n"); + } else { + seq_puts(m, "untracked\n"); + } + return 0; +} + +static int ll_wr_track_id(const char __user *buffer, unsigned long count, + void *data, enum stats_track_type type) +{ + struct super_block *sb = data; + int rc; + __s64 pid; + + rc = lprocfs_str_to_s64(buffer, count, &pid); + if (rc) + return rc; + if (pid > INT_MAX || pid < 0) + return -ERANGE; + + ll_s2sbi(sb)->ll_stats_track_id = pid; + if (pid == 0) + ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; + else + ll_s2sbi(sb)->ll_stats_track_type = type; + lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); + return count; +} + +static int ll_track_pid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_PID); +} + +static ssize_t ll_track_pid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PID); +} +LPROC_SEQ_FOPS(ll_track_pid); + +static int ll_track_ppid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_PPID); +} + +static ssize_t ll_track_ppid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PPID); +} +LPROC_SEQ_FOPS(ll_track_ppid); + +static int ll_track_gid_seq_show(struct seq_file *m, void *v) +{ + return ll_rd_track_id(m, STATS_TRACK_GID); +} + +static ssize_t ll_track_gid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_GID); +} +LPROC_SEQ_FOPS(ll_track_gid); + +static int ll_statahead_running_max_seq_show(struct 
seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_sa_running_max); + return 0; +} + +static ssize_t ll_statahead_running_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val >= 0 || val <= LL_SA_RUNNING_MAX) + sbi->ll_sa_running_max = val; + else + CERROR("%s: bad statahead_running_max value %lld. Valid values " + "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), + val, LL_SA_RUNNING_MAX); + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_running_max); + +static int ll_statahead_max_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", sbi->ll_sa_max); + return 0; +} + +static ssize_t ll_statahead_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val >= 0 && val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("%s: bad statahead_max value %lld. Valid values are in " + "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), + val, LL_SA_RPC_MAX); + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_max); + +static int ll_statahead_agl_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", + sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); + return 0; +} + +static ssize_t ll_statahead_agl_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + else + sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; + + return count; +} +LPROC_SEQ_FOPS(ll_statahead_agl); + +static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_statahead_stats); + +static int ll_lazystatfs_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", + (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 
1 : 0); + return 0; +} + +static ssize_t ll_lazystatfs_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + else + sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; + + return count; +} +LPROC_SEQ_FOPS(ll_lazystatfs); + +static int ll_max_easize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + seq_printf(m, "%u\n", ealen); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_max_easize); + +/** + * Get default_easize. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] m seq_file handle + * \param[in] v unused for single entry + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int ll_default_easize_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + return rc; + + seq_printf(m, "%u\n", ealen); + return 0; +} + +/** + * Set default_easize. + * + * Range checking on the passed value is handled by + * ll_set_default_mdsize(). + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] file proc file + * \param[in] buffer string passed from user space + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * + * \retval positive \a count on success + * \retval negative negated errno on failure + */ +static ssize_t ll_default_easize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *unused) +{ + struct seq_file *seq = file->private_data; + struct super_block *sb = (struct super_block *)seq->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + __s64 val; + int rc; + + if (count == 0) + return 0; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; + + rc = ll_set_default_mdsize(sbi, val); + if (rc) + return rc; + + return count; +} +LPROC_SEQ_FOPS(ll_default_easize); + +static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) +{ + const char *str[] = LL_SBI_FLAGS; + struct super_block *sb = m->private; + int flags = ll_s2sbi(sb)->ll_flags; + int i = 0; + + while (flags != 0) { + if (ARRAY_SIZE(str) <= i) { + CERROR("%s: Revise array LL_SBI_FLAGS to match sbi " + "flags please.\n", ll_get_fsname(sb, NULL, 0)); + return -EINVAL; + } + + if (flags & 0x1) + seq_printf(m, "%s ", str[i]); + flags >>= 1; + ++i; + } + seq_printf(m, "\b\n"); + return 0; +} +LPROC_SEQ_FOPS_RO(ll_sbi_flags); + +static int ll_fast_read_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); + return 0; +} + +static ssize_t +ll_fast_read_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val == 1) + sbi->ll_flags |= LL_SBI_FAST_READ; + 
else + sbi->ll_flags &= ~LL_SBI_FAST_READ; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_fast_read); + +static int ll_pio_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_PIO)); + return 0; +} + +static ssize_t ll_pio_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val == 1) + sbi->ll_flags |= LL_SBI_PIO; + else + sbi->ll_flags &= ~LL_SBI_PIO; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_pio); + +static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = sbi->ll_cache; + long pages; + int mb; + + pages = atomic_long_read(&cache->ccc_unstable_nr); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_check: %8d\n" + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", + cache->ccc_unstable_check, pages, mb); + return 0; +} + +static ssize_t ll_unstable_stats_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *unused) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); + char kernbuf[128]; + int rc; + __s64 val; + + if (count == 0) + return 0; + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - + kernbuf; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + /* borrow lru lock to set the value */ + spin_lock(&sbi->ll_cache->ccc_lru_lock); + sbi->ll_cache->ccc_unstable_check = !!val; + spin_unlock(&sbi->ll_cache->ccc_lru_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_unstable_stats); + +static int ll_root_squash_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); + return 0; +} + +static ssize_t ll_root_squash_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + + return lprocfs_wr_root_squash(buffer, count, squash, + ll_get_fsname(sb, NULL, 0)); +} +LPROC_SEQ_FOPS(ll_root_squash); + +static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int len; + + down_read(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) { + len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, + &squash->rsi_nosquash_nids); + m->count += len; + seq_putc(m, '\n'); + } else { + seq_puts(m, "NONE\n"); + } + up_read(&squash->rsi_sem); + + return 0; +} + +static ssize_t ll_nosquash_nids_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file 
*m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int rc; + + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, + ll_get_fsname(sb, NULL, 0)); + if (rc < 0) + return rc; + + ll_compute_rootsquash_state(sbi); + + return rc; +} +LPROC_SEQ_FOPS(ll_nosquash_nids); + +struct lprocfs_vars lprocfs_llite_obd_vars[] = { + { .name = "uuid", + .fops = &ll_sb_uuid_fops }, + { .name = "fstype", + .fops = &ll_fstype_fops }, + { .name = "site", + .fops = &ll_site_stats_fops }, + { .name = "blocksize", + .fops = &ll_blksize_fops }, + { .name = "stat_blocksize", + .fops = &ll_stat_blksize_fops }, + { .name = "kbytestotal", + .fops = &ll_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &ll_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &ll_kbytesavail_fops }, + { .name = "filestotal", + .fops = &ll_filestotal_fops }, + { .name = "filesfree", + .fops = &ll_filesfree_fops }, + { .name = "client_type", + .fops = &ll_client_type_fops }, + { .name = "max_read_ahead_mb", + .fops = &ll_max_readahead_mb_fops }, + { .name = "max_read_ahead_per_file_mb", + .fops = &ll_max_readahead_per_file_mb_fops }, + { .name = "max_read_ahead_whole_mb", + .fops = &ll_max_read_ahead_whole_mb_fops }, + { .name = "max_cached_mb", + .fops = &ll_max_cached_mb_fops }, + { .name = "checksum_pages", + .fops = &ll_checksum_fops }, + { .name = "stats_track_pid", + .fops = &ll_track_pid_fops }, + { .name = "stats_track_ppid", + .fops = &ll_track_ppid_fops }, + { .name = "stats_track_gid", + .fops = &ll_track_gid_fops }, + { .name = "statahead_max", + .fops = &ll_statahead_max_fops }, + { .name = "statahead_running_max", + .fops = &ll_statahead_running_max_fops }, + { .name = "statahead_agl", + .fops = &ll_statahead_agl_fops }, + { .name = "statahead_stats", + .fops = &ll_statahead_stats_fops }, + { .name = "lazystatfs", + .fops = &ll_lazystatfs_fops }, + { .name = "max_easize", + .fops = &ll_max_easize_fops }, + { .name = "default_easize", + .fops = &ll_default_easize_fops }, + { .name = "sbi_flags", + .fops = &ll_sbi_flags_fops }, + { .name = "xattr_cache", + .fops = &ll_xattr_cache_fops }, + { .name = "unstable_stats", + .fops = &ll_unstable_stats_fops }, + { .name = "root_squash", + .fops = &ll_root_squash_fops }, + { .name = "nosquash_nids", + .fops = &ll_nosquash_nids_fops }, + { .name = "fast_read", + .fops = &ll_fast_read_fops, }, + { .name = "pio", + .fops = &ll_pio_fops, }, + { NULL } +}; + +#define MAX_STRING_SIZE 128 + +static const struct llite_file_opcode { + __u32 opcode; + __u32 type; + const char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, + { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, + { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "write_bytes" }, + { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_read" }, + { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, + { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, + { LPROC_LL_FAULT, LPROCFS_TYPE_REGS, "page_fault" }, + { LPROC_LL_MKWRITE, LPROCFS_TYPE_REGS, "page_mkwrite" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, + 
{ LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, + /* special inode operation */ + { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, + { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, + { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" }, + { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) +{ + if (!sbi->ll_stats) + return; + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + sbi->ll_stats_track_id == + from_kgid(&init_user_ns, current_gid())) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed lock match", + [RA_STAT_DISCARDED] = "read but discarded", + [RA_STAT_ZERO_LEN] = "zero length file", + [RA_STAT_ZERO_WINDOW] = "zero size window", + [RA_STAT_EOF] = "read-ahead to EOF", + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", + [RA_STAT_FAILED_REACH_END] = "failed to reach end" +}; + +LPROC_SEQ_FOPS_RO_TYPE(llite, name); +LPROC_SEQ_FOPS_RO_TYPE(llite, uuid); + +int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb) +{ + struct lprocfs_vars lvars[2]; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char name[MAX_STRING_SIZE + 1], *ptr; + int err, id, len, rc; + ENTRY; + + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; + + LASSERT(sbi != NULL); + + /* Get fsname */ + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, + lsi->lsi_lmd->lmd_profile, sb); + + sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); + if (IS_ERR(sbi->ll_proc_root)) { + err = 
PTR_ERR(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; + RETURN(err); + } + + rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, + &vvp_dump_pgcache_file_ops, sbi); + if (rc) + CWARN("Error adding the dump_page_cache file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); + if (rc) + CWARN("Error adding the extent_stats file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", + 0644, &ll_rw_extents_stats_pp_fops, sbi); + if (rc) + CWARN("Error adding the extents_stats_per_process file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); + if (rc) + CWARN("Error adding the offset_stats file\n"); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) + GOTO(out, err = -ENOMEM); + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + __u32 type = llite_opcode_table[id].type; + void *ptr = NULL; + if (type & LPROCFS_TYPE_REGS) + ptr = "regs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_PAGES) + ptr = "pages"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, + (type & LPROCFS_CNTR_AVGMINMAX), + llite_opcode_table[id].opname, ptr); + } + err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); + if (err) + GOTO(out, err); + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) + GOTO(out, err = -ENOMEM); + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + GOTO(out, err); + + + err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + if (err) + GOTO(out, err); + +out: + if (err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } + RETURN(err); +} + +int lprocfs_ll_register_obd(struct super_block *sb, const char *obdname) +{ + struct lprocfs_vars lvars[2]; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct proc_dir_entry *dir; + char name[MAX_STRING_SIZE + 1]; + int err; + ENTRY; + + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; + + LASSERT(sbi != NULL); + LASSERT(obdname != NULL); + + obd = class_name2obd(obdname); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); + if (dir == NULL) + GOTO(out, err = -ENOMEM); + + snprintf(name, MAX_STRING_SIZE, "common_name"); + lvars[0].fops = &llite_name_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + GOTO(out, err); + + snprintf(name, MAX_STRING_SIZE, "uuid"); + lvars[0].fops = &llite_uuid_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + GOTO(out, err); + +out: + if (err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } + RETURN(err); +} + +void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) +{ + if (sbi->ll_proc_root) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } +} +#undef 
MAX_STRING_SIZE + +#define pct(a,b) (b ? a * 100 / b : 0) + +static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info = &io_extents->pp_extents[which]; + + read_cum = 0; + write_cum = 0; + start = 0; + + for(i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for(i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | " + "%14lu %4lu %4lu\n", start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? '+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == 1<<10) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int k; + + ktime_get_real_ts64(&now); + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %llu.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_pp_extent_lock); + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (io_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + io_extents->pp_extents[k].pid); + ll_display_extents_info(io_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char __user *buf, + size_t len, + loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + return len; +} + +LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + ktime_get_real_ts64(&now); + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %llu.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, 
"%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_lock); + ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} +LPROC_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + if(!sbi->ll_rw_stats_on) + return; + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + + spin_lock(&sbi->ll_pp_extent_lock); + /* Extent statistics */ + for(i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if(io_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + io_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); + } + + for(i = 0; (count >= (1 << LL_HIST_START << i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); + return; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + 
process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if(process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if(process[i].rw_largest_extent < count) + process[i].rw_largest_extent = count; + process[i].rw_last_file_pos = pos + count; + spin_unlock(&sbi->ll_process_lock); + return; + } + } + *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; + struct ll_rw_process_info *process = sbi->ll_rw_process_info; + int i; + + ktime_get_real_ts64(&now); + + if (!sbi->ll_rw_stats_on) { + seq_puts(seq, "disabled\n write anything to this file to activate, then '0' or 'disable' to deactivate\n"); + return 0; + } + spin_lock(&sbi->ll_process_lock); + + seq_printf(seq, "snapshot_time: %llu.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + + /* We stored the discontiguous offsets here; print them first */ + for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + offset[i].rw_op == READ ? 'R' : 'W', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + + /* Then print the current offsets for each process */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq, + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + process[i].rw_op == READ ? 
'R' : 'W', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; + struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; + __s64 value; + + if (len == 0) + return -EINVAL; + + value = ll_stats_pid_write(buf, len); + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + memset(process_info, 0, sizeof(struct ll_rw_process_info) * + LL_PROCESS_HIST_MAX); + memset(offset_info, 0, sizeof(struct ll_rw_process_info) * + LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. + */ +static __s64 ll_stats_pid_write(const char __user *buf, size_t len) +{ + __s64 value = 1; + int rc; + char kernbuf[16]; + + rc = lprocfs_str_to_s64(buf, len, &value); + + if (rc < 0 && len < sizeof(kernbuf)) { + + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; +} + +LPROC_SEQ_FOPS(ll_rw_offset_stats); +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c new file mode 100644 index 0000000000000..bf8b76efefb85 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -0,0 +1,1536 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
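[Editor's aside, not part of the patch] The enable/disable decision documented for ll_stats_pid_write() above can be mimicked in user space. This sketch assumes lprocfs_str_to_s64() behaves like a strict strtoll() that rejects non-numeric input, which is how the kernel helper is used in that function; the behaviour shown is therefore an approximation, not the exact kernel code path.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>

/* Mimics ll_stats_pid_write(): returns 0 to disable stats collection,
 * non-zero to enable it. */
static long long stats_toggle(const char *buf)
{
	char *end;
	long long value;

	errno = 0;
	value = strtoll(buf, &end, 0);
	if (errno == 0 && end != buf && (*end == '\0' || *end == '\n'))
		return value;			/* numeric input: "0" disables */

	/* Non-numeric input: only "disable" turns collection off. */
	if (strncasecmp(buf, "disable", 7) == 0)
		return 0;
	return 1;				/* anything else enables */
}

int main(void)
{
	printf("'1'       -> %lld\n", stats_toggle("1"));	/* enable  */
	printf("'0'       -> %lld\n", stats_toggle("0"));	/* disable */
	printf("'disable' -> %lld\n", stats_toggle("disable"));	/* disable */
	printf("'on'      -> %lld\n", stats_toggle("on"));	/* enable  */
	return 0;
}
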
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include "llite_internal.h" + +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it, + void *secctx, __u32 secctxlen); + +/* called from iget5_locked->find_inode() under inode_lock spinlock */ +static int ll_test_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_md *md = opaque; + + if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return 0; + } + + if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1)) + return 0; + + return 1; +} + +static int ll_set_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = ((struct lustre_md *)opaque)->body; + + if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return -EINVAL; + } + + lli->lli_fid = body->mbo_fid1; + if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) { + CERROR("Can not initialize inode "DFID" without object type: " + "valid = %#llx\n", + PFID(&lli->lli_fid), body->mbo_valid); + return -EINVAL; + } + + inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT); + if (unlikely(inode->i_mode == 0)) { + CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid)); + return -EINVAL; + } + + ll_lli_init(lli); + + return 0; +} + + +/** + * Get an inode by inode number(@hash), which is already instantiated by + * the intent lookup). + */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + int rc = 0; + + ENTRY; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + if (inode == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (inode->i_state & I_NEW) { + rc = ll_read_inode2(inode, md); + if (rc == 0 && S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) + rc = cl_file_inode_init(inode, md); + + if (rc != 0) { + /* Let's clear directory lsm here, otherwise + * make_bad_inode() will reset the inode mode + * to regular, then ll_clear_inode will not + * be able to clear lsm_md */ + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else { + inode_has_no_xattr(inode); + unlock_new_inode(inode); + } + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + rc = ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", + PFID(&md->body->mbo_fid1), inode, rc); + if (rc != 0) { + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + iput(inode); + inode = ERR_PTR(rc); + } + } + + RETURN(inode); +} + +static void ll_invalidate_negative_children(struct inode *dir) +{ + struct dentry *dentry, *tmp_subdir; + DECLARE_LL_D_HLIST_NODE_PTR(p); + + ll_lock_dcache(dir); + ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry) { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + struct dentry *child; + + list_for_each_entry_safe(child, tmp_subdir, + &dentry->d_subdirs, + d_child) { + if (child->d_inode == NULL) + d_lustre_invalidate(child, 1); + } + } + spin_unlock(&dentry->d_lock); + } + ll_unlock_dcache(dir); +} + +int ll_test_inode_by_fid(struct 
inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: { + struct inode *inode = ll_inode_from_resource_lock(lock); + __u64 bits = lock->l_policy_data.l_inodebits.bits; + + /* Inode is set to lock->l_resource->lr_lvb_inode + * for mdc - bug 24555 */ + LASSERT(lock->l_ast_data == NULL); + + if (inode == NULL) + break; + + /* Invalidate all dentries associated with this inode */ + LASSERT(ldlm_is_canceling(lock)); + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + if (S_ISDIR(inode->i_mode)) + ll_i2info(inode)->lli_def_stripe_offset = -1; + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } + + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } + + ll_md_real_close(inode, fmode); + + bits &= ~MDS_INODELOCK_OPEN; + } + + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_UPDATE) { + struct ll_inode_info *lli = ll_i2info(inode); + lli->lli_update_atime = 1; + } + + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); + + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); + + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; + + /* This is slave inode, since all of the child + * dentry is connected on the master inode, so + * we have to invalidate the negative children + * on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), + PFID(&lli->lli_pfid)); + + hash = cl_fid_build_ino(&lli->lli_pfid, + ll_need_32bit_api(ll_i2sbi(inode))); + + /* Do not lookup the inode with ilookup5, + * otherwise it will cause dead lock, + * + * 1. Client1 send chmod req to the MDT0, then + * on MDT0, it enqueues master and all of its + * slaves lock, (mdt_attr_set() -> + * mdt_lock_slaves()), after gets master and + * stripe0 lock, it will send the enqueue req + * (for stripe1) to MDT1, then MDT1 finds the + * lock has been granted to client2. 
Then MDT1 + * sends blocking ast to client2. + * + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and + * during lookup, it will hold the master inode + * of the striped directory, whose inode state + * is NEW, then tries to revalidate all of its + * slaves, (ll_prep_inode()->ll_iget()-> + * ll_read_inode2()-> ll_update_inode().). And + * it will be blocked on the server side because + * of 1. + * + * 3. Then the client get the blocking_ast req, + * cancel the lock, but being blocked if using + * ->ilookup5()), because master inode state is + * NEW. */ + master_inode = ilookup5_nowait(inode->i_sb, + hash, ll_test_inode_by_fid, + (void *)&lli->lli_pfid); + if (master_inode) { + ll_invalidate_negative_children( + master_inode); + iput(master_inode); + } + } else { + ll_invalidate_negative_children(inode); + } + } + + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + inode != inode->i_sb->s_root->d_inode) + ll_invalidate_aliases(inode); + + iput(inode); + break; + } + default: + LBUG(); + } + + RETURN(0); +} + +__u32 ll_i2suppgid(struct inode *i) +{ + if (in_group_p(i->i_gid)) + return (__u32)from_kgid(&init_user_ns, i->i_gid); + else + return (__u32) __kgid_val(INVALID_GID); +} + +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + suppgids[0] = ll_i2suppgid(i1); + + if (i2) + suppgids[1] = ll_i2suppgid(i2); + else + suppgids[1] = -1; +} + +/* + * try to reuse three types of dentry: + * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid + * by concurrent .revalidate). + * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may + * be cleared by others calling d_lustre_revalidate). + * 3. DISCONNECTED alias. + */ +static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *alias, *discon_alias, *invalid_alias; + DECLARE_LL_D_HLIST_NODE_PTR(p); + + if (ll_d_hlist_empty(&inode->i_dentry)) + return NULL; + + discon_alias = invalid_alias = NULL; + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry) { + LASSERT(alias != dentry); + + spin_lock(&alias->d_lock); + if ((alias->d_flags & DCACHE_DISCONNECTED) && + S_ISDIR(inode->i_mode)) + /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ + discon_alias = alias; + else if (alias->d_parent == dentry->d_parent && + alias->d_name.hash == dentry->d_name.hash && + alias->d_name.len == dentry->d_name.len && + memcmp(alias->d_name.name, dentry->d_name.name, + dentry->d_name.len) == 0) + invalid_alias = alias; + spin_unlock(&alias->d_lock); + + if (invalid_alias) + break; + } + alias = invalid_alias ?: discon_alias ?: NULL; + if (alias) { + spin_lock(&alias->d_lock); + dget_dlock(alias); + spin_unlock(&alias->d_lock); + } + ll_unlock_dcache(inode); + + return alias; +} + +/* + * Similar to d_splice_alias(), but lustre treats invalid alias + * similar to DCACHE_DISCONNECTED, and tries to use it anyway. 
+ */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + int rc; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + rc = ll_d_init(new); + if (rc < 0) { + dput(new); + return ERR_PTR(rc); + } + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, new->d_inode, ll_d_count(new), new->d_flags); + return new; + } + } + rc = ll_d_init(de); + if (rc < 0) + return ERR_PTR(rc); + d_add(de, inode); + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, de->d_inode, ll_d_count(de), de->d_flags); + return de; +} + +static int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct inode *parent, struct dentry **de) +{ + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + struct dentry *alias; + ENTRY; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + + /* We used to query real size from OSTs here, but actually + this is not needed. For stat() calls size would be updated + from subsequent do_revalidate()->ll_inode_revalidate_it() in + 2.4 and + vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + Everybody else who needs correct file size would call + ll_glimpse_size or some equivalent themselves anyway. + Also see bug 7198. */ + } + + /* Only hash *de if it is unhashed (new dentry). + * Atoimc_open may passin hashed dentries for open. + */ + alias = ll_splice_alias(inode, *de); + if (IS_ERR(alias)) + GOTO(out, rc = PTR_ERR(alias)); + + *de = alias; + + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + /* we have lookup look - unhide dentry */ + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(*de); + } else if (!it_disposition(it, DISP_OPEN_CREATE)) { + /* If file created on server, don't depend on parent UPDATE + * lock to unhide it. It is left hidden and next lookup can + * find it in ll_splice_alias. + */ + /* Check that parent has UPDATE lock. 
*/ + struct lookup_intent parent_it = { + .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct lu_fid fid = ll_i2info(parent)->lli_fid; + + /* If it is striped directory, get the real stripe parent */ + if (unlikely(ll_i2info(parent)->lli_lsm_md != NULL)) { + rc = md_get_fid_from_lsm(ll_i2mdexp(parent), + ll_i2info(parent)->lli_lsm_md, + (*de)->d_name.name, + (*de)->d_name.len, &fid); + if (rc != 0) + GOTO(out, rc); + } + + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid, + NULL)) { + d_lustre_revalidate(*de); + ll_intent_release(&parent_it); + } + } + + GOTO(out, rc = 0); + +out: + if (rc != 0 && it->it_op & IT_OPEN) + ll_open_cleanup((*de)->d_sb, request); + + return rc; +} + +static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, + struct lookup_intent *it, + void **secctx, __u32 *secctxlen) +{ + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct dentry *save = dentry, *retval; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data = NULL; + __u32 opc; + int rc; + ENTRY; + + if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) + RETURN(ERR_PTR(-ENAMETOOLONG)); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), intent=%s\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it)); + + if (d_mountpoint(dentry)) + CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); + + if (it == NULL || it->it_op == IT_GETXATTR) + it = &lookup_it; + + if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) { + rc = ll_statahead(parent, &dentry, 0); + if (rc == 1) + RETURN(dentry == save ? NULL : dentry); + } + + if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && + dentry->d_sb->s_flags & MS_RDONLY) + RETURN(ERR_PTR(-EROFS)); + + if (it->it_op & IT_CREAT) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_ANY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, + dentry->d_name.len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(out, retval = ERR_CAST(op_data)); + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + if (it->it_op & IT_CREAT && + ll_i2sbi(parent)->ll_flags & LL_SBI_FILE_SECCTX) { + rc = ll_dentry_init_security(dentry, it->it_create_mode, + &dentry->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size); + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + if (secctx != NULL) + *secctx = op_data->op_file_secctx; + if (secctxlen != NULL) + *secctxlen = op_data->op_file_secctx_size; + } + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + /* If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the + * client does not know which suppgid should be sent to the MDS, or + * some other(s) changed the target file's GID after this RPC sent + * to the MDS with the suppgid as the original GID, then we should + * try again with right suppgid. 
*/ + if (rc == -EACCES && it->it_op & IT_OPEN && + it_disposition(it, DISP_OPEN_DENY)) { + struct mdt_body *body; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (op_data->op_suppgids[0] == body->mbo_gid || + op_data->op_suppgids[1] == body->mbo_gid || + !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) + GOTO(out, retval = ERR_PTR(-EACCES)); + + fid_zero(&op_data->op_fid2); + op_data->op_suppgids[1] = body->mbo_gid; + ptlrpc_req_finished(req); + req = NULL; + ll_intent_release(it); + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + } + + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + + rc = ll_lookup_it_finish(req, it, parent, &dentry); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } + + if ((it->it_op & IT_OPEN) && dentry->d_inode && + !S_ISREG(dentry->d_inode->i_mode) && + !S_ISDIR(dentry->d_inode->i_mode)) { + ll_release_openhandle(dentry, it); + } + ll_lookup_finish_locks(it, dentry); + + GOTO(out, retval = (dentry == save) ? NULL : dentry); + +out: + if (op_data != NULL && !IS_ERR(op_data)) { + if (secctx != NULL && secctxlen != NULL) { + /* caller needs sec ctx info, so reset it in op_data to + * prevent it from being freed */ + op_data->op_file_secctx = NULL; + op_data->op_file_secctx_size = 0; + } + ll_finish_md_op_data(op_data); + } + + ptlrpc_req_finished(req); + return retval; +} + +#ifdef HAVE_IOP_ATOMIC_OPEN +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + unsigned int flags) +{ + struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; + struct dentry *de; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), flags=%u\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(parent)), parent, flags); + + /* + * Optimize away (CREATE && !OPEN). Let .create handle the race. + * but only if we have write permissions there, otherwise we need + * to proceed with lookup. LU-4185 + */ + if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && + (inode_permission(parent, MAY_WRITE | MAY_EXEC) == 0)) + return NULL; + + if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + itp = NULL; + else + itp = ⁢ + de = ll_lookup_it(parent, dentry, itp, NULL, NULL); + + if (itp != NULL) + ll_intent_release(itp); + + return de; +} + +/* + * For cached negative dentry and new dentry, handle lookup/create/open + * together. + */ +static int ll_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) +{ + struct lookup_intent *it; + struct dentry *de; + long long lookup_flags = LOOKUP_OPEN; + void *secctx = NULL; + __u32 secctxlen = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), file %p," + "open_flags %x, mode %x opened %d\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, *opened); + + /* Only negative dentries enter here */ + LASSERT(dentry->d_inode == NULL); + + if (!d_unhashed(dentry)) { + /* A valid negative dentry that just passed revalidation, + * there's little point to try and open it server-side, + * even though there's a minuscule chance it might succeed. + * Either way it's a valid race to just return -ENOENT here. 
+ */ + if (!(open_flags & O_CREAT)) + return -ENOENT; + + /* Otherwise we just unhash it to be rehashed afresh via + * lookup if necessary + */ + d_drop(dentry); + } + + OBD_ALLOC(it, sizeof(*it)); + if (!it) + RETURN(-ENOMEM); + + it->it_op = IT_OPEN; + if (open_flags & O_CREAT) { + it->it_op |= IT_CREAT; + lookup_flags |= LOOKUP_CREATE; + } + it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; + it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + it->it_flags &= ~MDS_OPEN_FL_INTERNAL; + + /* Dentry added to dcache tree in ll_lookup_it */ + de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen); + if (IS_ERR(de)) + rc = PTR_ERR(de); + else if (de != NULL) + dentry = de; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + if (!rc) { + if (it_disposition(it, DISP_OPEN_CREATE)) { + /* Dentry instantiated in ll_create_it. */ + rc = ll_create_it(dir, dentry, it, secctx, secctxlen); + security_release_secctx(secctx, secctxlen); + if (rc) { + /* We dget in ll_splice_alias. */ + if (de != NULL) + dput(de); + goto out_release; + } + + *opened |= FILE_CREATED; + } + if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) { + /* Open dentry. */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it might + * deadlock. This case is unreachable in + * practice because of OBD_CONNECT_NODEVOH. */ + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = finish_open(file, dentry, NULL, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + RETURN(rc); +} + +#else /* !HAVE_IOP_ATOMIC_OPEN */ +static struct lookup_intent * +ll_convert_intent(struct open_intent *oit, int lookup_flags, bool is_readonly) +{ + struct lookup_intent *it; + + OBD_ALLOC_PTR(it); + if (!it) + return ERR_PTR(-ENOMEM); + + if (lookup_flags & LOOKUP_OPEN) { + it->it_op = IT_OPEN; + /* Avoid file creation for ro bind mount point(is_readonly) */ + if ((lookup_flags & LOOKUP_CREATE) && !is_readonly) + it->it_op |= IT_CREAT; + it->it_create_mode = (oit->create_mode & S_IALLUGO) | S_IFREG; + it->it_flags = ll_namei_to_lookup_intent_flag(oit->flags & + ~(is_readonly ? O_CREAT : 0)); + it->it_flags &= ~MDS_OPEN_FL_INTERNAL; + } else { + it->it_op = IT_GETATTR; + } + + return it; +} + +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *de; + ENTRY; + + if (nd && !(nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))) { + struct lookup_intent *it; + + if (ll_d2d(dentry) && ll_d2d(dentry)->lld_it) { + it = ll_d2d(dentry)->lld_it; + ll_d2d(dentry)->lld_it = NULL; + } else { + /* + * Optimize away (CREATE && !OPEN). Let .create handle + * the race. But only if we have write permissions + * there, otherwise we need to proceed with lookup. 
+ * LU-4185 + */ + if ((nd->flags & LOOKUP_CREATE) && + !(nd->flags & LOOKUP_OPEN) && + (inode_permission(parent, + MAY_WRITE | MAY_EXEC) == 0)) + RETURN(NULL); + + it = ll_convert_intent(&nd->intent.open, nd->flags, + (nd->path.mnt->mnt_flags & MNT_READONLY) || + (nd->path.mnt->mnt_sb->s_flags & MS_RDONLY)); + if (IS_ERR(it)) + RETURN((struct dentry *)it); + } + + de = ll_lookup_it(parent, dentry, it, NULL, NULL); + if (de) + dentry = de; + if ((nd->flags & LOOKUP_OPEN) && !IS_ERR(dentry)) { /* Open */ + if (dentry->d_inode && + it_disposition(it, DISP_OPEN_OPEN)) { /* nocreate */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it might + * deadlock. This case is unreachable in + * practice because of + * OBD_CONNECT_NODEVOH. */ + } else { + struct file *filp; + + nd->intent.open.file->private_data = it; + filp = lookup_instantiate_filp(nd, + dentry, + NULL); + if (IS_ERR(filp)) { + if (de) + dput(de); + de = (struct dentry *)filp; + } + } + } else if (it_disposition(it, DISP_OPEN_CREATE)) { + /* XXX This can only reliably work on assumption + * that there are NO hashed negative dentries.*/ + ll_d2d(dentry)->lld_it = it; + it = NULL; /* Will be freed in ll_create_nd */ + /* We absolutely depend on ll_create_nd to be + * called to not leak this intent and possible + * data attached to it */ + } + } + + if (it) { + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + } + } else { + de = ll_lookup_it(parent, dentry, NULL, NULL, NULL); + } + + RETURN(de); +} +#endif /* HAVE_IOP_ATOMIC_OPEN */ + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + ENTRY; + + LASSERT(it && it->it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->it_request; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, request, dir->i_sb, it); + if (rc) + GOTO(out, inode = ERR_PTR(rc)); + + /* Pause to allow for a race with concurrent access by fid */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_NODE_PAUSE, cfs_fail_val); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode "DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + EXIT; + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. + * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it, + void *secctx, __u32 secctxlen) +{ + struct inode *inode; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), intent=%s\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + RETURN(rc); + + inode = ll_create_node(dir, it); + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && + secctx != NULL) { + inode_lock(inode); + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet */ + rc = security_inode_notifysecctx(inode, secctx, secctxlen); + inode_unlock(inode); + if (rc) + RETURN(rc); + } + + d_instantiate(dentry, inode); + + if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX)) { + rc = ll_inode_init_security(dentry, inode, dir); + if (rc) + RETURN(rc); + } + + RETURN(0); +} + +void ll_update_times(struct ptlrpc_request *request, struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->mbo_valid & OBD_MD_FLMTIME && + body->mbo_mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting fid "DFID" mtime from %lu to %llu" + "\n", PFID(ll_inode2fid(inode)), + LTIME_S(inode->i_mtime), body->mbo_mtime); + LTIME_S(inode->i_mtime) = body->mbo_mtime; + } + + if (body->mbo_valid & OBD_MD_FLCTIME && + body->mbo_ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->mbo_ctime; +} + +static int ll_new_node(struct inode *dir, struct dentry *dchild, + const char *tgt, umode_t mode, int rdev, __u32 opc) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int tgt_len = 0; + int err; + + ENTRY; + if (unlikely(tgt != NULL)) + tgt_len = strlen(tgt) + 1; + +again: + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, + name->len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(err_exit, err = PTR_ERR(op_data)); + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + err = ll_dentry_init_security(dchild, mode, &dchild->d_name, + &op_data->op_file_secctx_name, + &op_data->op_file_secctx, + &op_data->op_file_secctx_size); + if (err < 0) + GOTO(err_exit, err); + } + + err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, + from_kuid(&init_user_ns, current_fsuid()), + from_kgid(&init_user_ns, current_fsgid()), + cfs_curproc_cap_pack(), rdev, &request); + if (err < 0 && err != -EREMOTE) + GOTO(err_exit, err); + + /* If the client doesn't know where to create a subdirectory (or + * in case of a race that sends the RPC to the wrong MDS), the + * MDS will return -EREMOTE and the client will fetch the layout + * of the directory, then create the directory on the right MDT. 
*/ + if (unlikely(err == -EREMOTE)) { + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_user_md *lum; + int lumsize; + int err2; + + ptlrpc_req_finished(request); + request = NULL; + + err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, + OBD_MD_DEFAULT_MEA); + if (err2 == 0) { + /* Update stripe_offset and retry */ + lli->lli_def_stripe_offset = lum->lum_stripe_offset; + } else if (err2 == -ENODATA && + lli->lli_def_stripe_offset != -1) { + /* If there are no default stripe EA on the MDT, but the + * client has default stripe, then it probably means + * default stripe EA has just been deleted. */ + lli->lli_def_stripe_offset = -1; + } else { + GOTO(err_exit, err); + } + + ptlrpc_req_finished(request); + request = NULL; + ll_finish_md_op_data(op_data); + goto again; + } + + ll_update_times(request, dir); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_NEWNODE_PAUSE, cfs_fail_val); + + err = ll_prep_inode(&inode, request, dchild->d_sb, NULL); + if (err) + GOTO(err_exit, err); + + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { + inode_lock(inode); + /* must be done before d_instantiate, because it calls + * security_d_instantiate, which means a getxattr if security + * context is not set yet */ + err = security_inode_notifysecctx(inode, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + inode_unlock(inode); + if (err) + GOTO(err_exit, err); + } + + d_instantiate(dchild, inode); + + if (!(sbi->ll_flags & LL_SBI_FILE_SECCTX)) { + err = ll_inode_init_security(dchild, inode, dir); + if (err) + GOTO(err_exit, err); + } + + EXIT; +err_exit: + if (request != NULL) + ptlrpc_req_finished(request); + + if (!IS_ERR_OR_NULL(op_data)) + ll_finish_md_op_data(op_data); + + return err; +} + +static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, + dev_t rdev) +{ + struct qstr *name = &dchild->d_name; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p) mode %o dev %x\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir, + mode, rdev); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + + RETURN(err); +} + +#ifdef HAVE_IOP_ATOMIC_OPEN +/* + * Plain create. Intent create is handled in atomic_open. + */ +static int ll_create_nd(struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + int rc; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), " + "flags=%u, excl=%d\n", dentry->d_name.len, + dentry->d_name.name, PFID(ll_inode2fid(dir)), + dir, mode, want_excl); + + /* Using mknod(2) to create a regular file is designed to not recognize + * volatile file name, so we use ll_mknod() here. 
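[Editor's aside, not part of the patch] A minimal user-space sketch of the mode normalization ll_mknod() performs when the client applies the umask itself: strip umask bits, treat a zero mode as a regular file, and refuse directories (which must go through mkdir). The function and values below are illustrative only.

#include <stdio.h>
#include <sys/stat.h>

/* Editor's sketch of ll_mknod()'s mode dispatch. */
static int classify(mode_t mode, mode_t umask_bits)
{
	mode &= ~umask_bits;

	switch (mode & S_IFMT) {
	case 0:
		mode |= S_IFREG;	/* mode = 0 means "regular file" */
		/* fallthrough */
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		printf("mode %#o -> mknod allowed\n", (unsigned int)mode);
		return 0;
	case S_IFDIR:
		return -1;		/* ll_mknod() returns -EPERM here */
	default:
		return -2;		/* -EINVAL */
	}
}

int main(void)
{
	classify(0, 022);		/* becomes S_IFREG with mode 0 */
	classify(S_IFIFO | 0644, 022);	/* named pipe; 0644 & ~022 = 0644 */
	return 0;
}
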
*/ + rc = ll_mknod(dir, dentry, mode, 0); + + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n", + dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry)); + + return rc; +} +#else /* !HAVE_IOP_ATOMIC_OPEN */ +static int ll_create_nd(struct inode *dir, struct dentry *dentry, + ll_umode_t mode, struct nameidata *nd) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + struct lookup_intent *it = NULL; + int rc; + + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE, cfs_fail_val); + + if (lld != NULL) + it = lld->lld_it; + + if (!it) { + /* LU-8559: use LUSTRE_OPC_CREATE for non atomic open case + * so that volatile file name is recoginized. + * Mknod(2), however, is designed to not recognize volatile + * file name to avoid inode leak under orphan directory until + * MDT reboot */ + return ll_new_node(dir, dentry, NULL, mode, 0, + LUSTRE_OPC_CREATE); + } + + lld->lld_it = NULL; + + /* Was there an error? Propagate it! */ + if (it->it_status) { + rc = it->it_status; + goto out; + } + + rc = ll_create_it(dir, dentry, it, NULL, 0); + if (nd && (nd->flags & LOOKUP_OPEN) && dentry->d_inode) { /* Open */ + struct file *filp; + + nd->intent.open.file->private_data = it; + filp = lookup_instantiate_filp(nd, dentry, NULL); + if (IS_ERR(filp)) + rc = PTR_ERR(filp); + } + +out: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + if (!rc) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + return rc; +} +#endif /* HAVE_IOP_ATOMIC_OPEN */ + +static int ll_symlink(struct inode *dir, struct dentry *dchild, + const char *oldpath) +{ + struct qstr *name = &dchild->d_name; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), target=%.*s\n", + name->len, name->name, PFID(ll_inode2fid(dir)), + dir, 3000, oldpath); + + err = ll_new_node(dir, dchild, oldpath, S_IFLNK | S_IRWXUGO, 0, + LUSTRE_OPC_SYMLINK); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); + + RETURN(err); +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct inode *src = old_dentry->d_inode; + struct qstr *name = &new_dentry->d_name; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int err; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op: inode="DFID"(%p), dir="DFID"(%p), " + "target=%.*s\n", PFID(ll_inode2fid(src)), src, + PFID(ll_inode2fid(dir)), dir, name->len, name->name); + + op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(out, err); + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); + EXIT; +out: + ptlrpc_req_finished(request); + RETURN(err); +} + +static int ll_mkdir(struct inode *dir, struct dentry *dchild, ll_umode_t mode) +{ + struct qstr *name = &dchild->d_name; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + + err = ll_new_node(dir, dchild, NULL, mode, 0, LUSTRE_OPC_MKDIR); + if (err == 0) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); + + RETURN(err); +} + +static int ll_rmdir(struct inode *dir, struct dentry *dchild) +{ + 
struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir); + + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (dchild->d_inode != NULL) + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + RETURN(rc); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + namelen, name, PFID(ll_inode2fid(dir)), dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + RETURN(rc); +} + +static int ll_unlink(struct inode *dir, struct dentry *dchild) +{ + struct qstr *name = &dchild->d_name; + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct mdt_body *body; + int rc; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n", + name->len, name->name, PFID(ll_inode2fid(dir)), dir); + + /* + * XXX: unlink bind mountpoint maybe call to here, + * just check it as vfs_unlink does. + */ + if (unlikely(d_mountpoint(dchild))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + GOTO(out, rc); + + /* + * The server puts attributes in on the last unlink, use them to update + * the link count so the inode can be freed immediately. 
+ */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_FLNLINK) + set_nlink(dchild->d_inode, body->mbo_nlink); + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); + +out: + ptlrpc_req_finished(request); + RETURN(rc); +} + +static int ll_rename(struct inode *src, struct dentry *src_dchild, + struct inode *tgt, struct dentry *tgt_dchild +#ifdef HAVE_IOPS_RENAME_WITH_FLAGS + , unsigned int flags +#endif + ) +{ + struct qstr *src_name = &src_dchild->d_name; + struct qstr *tgt_name = &tgt_dchild->d_name; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(src); + struct md_op_data *op_data; + int err; + ENTRY; + +#ifdef HAVE_IOPS_RENAME_WITH_FLAGS + if (flags) + return -EINVAL; +#endif + + CDEBUG(D_VFSTRACE, "VFS Op:oldname=%.*s, src_dir="DFID + "(%p), newname=%.*s, tgt_dir="DFID"(%p)\n", + src_name->len, src_name->name, + PFID(ll_inode2fid(src)), src, tgt_name->len, + tgt_name->name, PFID(ll_inode2fid(tgt)), tgt); + + if (unlikely(d_mountpoint(src_dchild) || d_mountpoint(tgt_dchild))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (src_dchild->d_inode != NULL) + op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); + + if (tgt_dchild->d_inode != NULL) + op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); + + err = md_rename(sbi->ll_md_exp, op_data, + src_name->name, src_name->len, + tgt_name->name, tgt_name->len, &request); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, src); + ll_update_times(request, tgt); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); + } + + ptlrpc_req_finished(request); + + if (err == 0) + d_move(src_dchild, tgt_dchild); + + RETURN(err); +} + +const struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, +#ifdef HAVE_IOP_ATOMIC_OPEN + .atomic_open = ll_atomic_open, +#endif + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +#ifdef HAVE_IOP_GET_ACL + .get_acl = ll_get_acl, +#endif +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; + +const struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +#ifdef HAVE_IOP_GET_ACL + .get_acl = ll_get_acl, +#endif +#ifdef HAVE_IOP_SET_ACL + .set_acl = ll_set_acl, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.c b/drivers/staging/lustrefsx/lustre/llite/range_lock.c new file mode 100644 index 0000000000000..56e129165c4be --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.c @@ -0,0 +1,244 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara : https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya + * Author: Bobi Jam + */ +#include "range_lock.h" +#include + +/** + * Initialize a range lock tree + * + * \param tree [in] an empty range lock tree + * + * Pre: Caller should have allocated the range lock tree. + * Post: The range lock tree is ready to function. + */ +void range_lock_tree_init(struct range_lock_tree *tree) +{ + tree->rlt_root = NULL; + tree->rlt_sequence = 0; + spin_lock_init(&tree->rlt_lock); +} + +/** + * Intialize a range lock node + * + * \param lock [in] an empty range lock node + * \param start [in] start of the covering region + * \param end [in] end of the covering region + * + * Pre: Caller should have allocated the range lock node. + * Post: The range lock node is meant to cover [start, end] region + */ +int range_lock_init(struct range_lock *lock, __u64 start, __u64 end) +{ + int rc; + + interval_init(&lock->rl_node); + if (end != LUSTRE_EOF) + end >>= PAGE_SHIFT; + rc = interval_set(&lock->rl_node, start >> PAGE_SHIFT, end); + if (rc) + return rc; + + INIT_LIST_HEAD(&lock->rl_next_lock); + lock->rl_task = NULL; + lock->rl_lock_count = 0; + lock->rl_blocking_ranges = 0; + lock->rl_sequence = 0; + return rc; +} + +static inline struct range_lock *next_lock(struct range_lock *lock) +{ + return list_entry(lock->rl_next_lock.next, typeof(*lock), rl_next_lock); +} + +/** + * Helper function of range_unlock() + * + * \param node [in] a range lock found overlapped during interval node + * search + * \param arg [in] the range lock to be tested + * + * \retval INTERVAL_ITER_CONT indicate to continue the search for next + * overlapping range node + * \retval INTERVAL_ITER_STOP indicate to stop the search + */ +static enum interval_iter range_unlock_cb(struct interval_node *node, void *arg) +{ + struct range_lock *lock = arg; + struct range_lock *overlap = node2rangelock(node); + struct range_lock *iter; + ENTRY; + + list_for_each_entry(iter, &overlap->rl_next_lock, rl_next_lock) { + if (iter->rl_sequence > lock->rl_sequence) { + --iter->rl_blocking_ranges; + LASSERT(iter->rl_blocking_ranges > 0); + } + } + if (overlap->rl_sequence > lock->rl_sequence) { + --overlap->rl_blocking_ranges; + if (overlap->rl_blocking_ranges == 0) + wake_up_process(overlap->rl_task); + } + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Unlock a range lock, wake up locks blocked by this lock. 
+ * + * \param tree [in] range lock tree + * \param lock [in] range lock to be deleted + * + * If this lock has been granted, relase it; if not, just delete it from + * the tree or the same region lock list. Wake up those locks only blocked + * by this lock through range_unlock_cb(). + */ +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock) +{ + ENTRY; + + spin_lock(&tree->rlt_lock); + if (!list_empty(&lock->rl_next_lock)) { + struct range_lock *next; + + if (interval_is_intree(&lock->rl_node)) { /* first lock */ + /* Insert the next same range lock into the tree */ + next = next_lock(lock); + next->rl_lock_count = lock->rl_lock_count - 1; + interval_erase(&lock->rl_node, &tree->rlt_root); + interval_insert(&next->rl_node, &tree->rlt_root); + } else { + /* find the first lock in tree */ + list_for_each_entry(next, &lock->rl_next_lock, + rl_next_lock) { + if (!interval_is_intree(&next->rl_node)) + continue; + + LASSERT(next->rl_lock_count > 0); + next->rl_lock_count--; + break; + } + } + list_del_init(&lock->rl_next_lock); + } else { + LASSERT(interval_is_intree(&lock->rl_node)); + interval_erase(&lock->rl_node, &tree->rlt_root); + } + + interval_search(tree->rlt_root, &lock->rl_node.in_extent, + range_unlock_cb, lock); + spin_unlock(&tree->rlt_lock); + + EXIT; +} + +/** + * Helper function of range_lock() + * + * \param node [in] a range lock found overlapped during interval node + * search + * \param arg [in] the range lock to be tested + * + * \retval INTERVAL_ITER_CONT indicate to continue the search for next + * overlapping range node + * \retval INTERVAL_ITER_STOP indicate to stop the search + */ +static enum interval_iter range_lock_cb(struct interval_node *node, void *arg) +{ + struct range_lock *lock = (struct range_lock *)arg; + struct range_lock *overlap = node2rangelock(node); + + lock->rl_blocking_ranges += overlap->rl_lock_count + 1; + RETURN(INTERVAL_ITER_CONT); +} + +/** + * Lock a region + * + * \param tree [in] range lock tree + * \param lock [in] range lock node containing the region span + * + * \retval 0 get the range lock + * \retval <0 error code while not getting the range lock + * + * If there exists overlapping range lock, the new lock will wait and + * retry, if later it find that it is not the chosen one to wake up, + * it wait again. + */ +int range_lock(struct range_lock_tree *tree, struct range_lock *lock) +{ + struct interval_node *node; + int rc = 0; + ENTRY; + + spin_lock(&tree->rlt_lock); + /* + * We need to check for all conflicting intervals + * already in the tree. + */ + interval_search(tree->rlt_root, &lock->rl_node.in_extent, + range_lock_cb, lock); + /* + * Insert to the tree if I am unique, otherwise I've been linked to + * the rl_next_lock of another lock which has the same range as mine + * in range_lock_cb(). 
+ */ + node = interval_insert(&lock->rl_node, &tree->rlt_root); + if (node != NULL) { + struct range_lock *tmp = node2rangelock(node); + + list_add_tail(&lock->rl_next_lock, &tmp->rl_next_lock); + tmp->rl_lock_count++; + } + lock->rl_sequence = ++tree->rlt_sequence; + + while (lock->rl_blocking_ranges > 0) { + lock->rl_task = current; + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&tree->rlt_lock); + schedule(); + + if (signal_pending(current)) { + range_unlock(tree, lock); + GOTO(out, rc = -ERESTARTSYS); + } + spin_lock(&tree->rlt_lock); + } + spin_unlock(&tree->rlt_lock); +out: + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.h b/drivers/staging/lustrefsx/lustre/llite/range_lock.h new file mode 100644 index 0000000000000..5266db71bb676 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara : https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya + * Author: Bobi Jam + */ +#ifndef _RANGE_LOCK_H +#define _RANGE_LOCK_H + +#include +#include + +#define RL_FMT "[%llu, %llu]" +#define RL_PARA(range) \ + (range)->rl_node.in_extent.start, \ + (range)->rl_node.in_extent.end + +struct range_lock { + struct interval_node rl_node; + /** + * Process to enqueue this lock. + */ + struct task_struct *rl_task; + /** + * List of locks with the same range. + */ + struct list_head rl_next_lock; + /** + * Number of locks in the list rl_next_lock + */ + unsigned int rl_lock_count; + /** + * Number of ranges which are blocking acquisition of the lock + */ + unsigned int rl_blocking_ranges; + /** + * Sequence number of range lock. This number is used to get to know + * the order the locks are queued; this is required for range_cancel(). 
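[Editor's aside, not part of the patch] A caller-side sketch of how this range-lock API is meant to be used. It is not runnable standalone (it needs the kernel build environment), and the names io_tree and locked_write are illustrative; in llite proper the tree is expected to live in per-inode state and the locking is done on the buffered-write path.

/* Editor's sketch of the calling convention for the range lock API. */
#include "range_lock.h"

/* Must be set up once with range_lock_tree_init(&io_tree) before use. */
static struct range_lock_tree io_tree;

static int locked_write(__u64 pos, __u64 count)
{
	struct range_lock range;
	int rc;

	/* Covers [pos, pos + count - 1] in bytes; range_lock_init()
	 * shifts both ends down to page indexes internally, so writers
	 * touching disjoint pages do not block each other. */
	rc = range_lock_init(&range, pos, pos + count - 1);
	if (rc)
		return rc;

	/* May sleep; returns -ERESTARTSYS if interrupted by a signal. */
	rc = range_lock(&io_tree, &range);
	if (rc)
		return rc;

	/* ... perform the buffered write for this region ... */

	range_unlock(&io_tree, &range);
	return 0;
}

Page granularity is the natural choice here because conflicts are ultimately about ownership of page-cache pages, not arbitrary byte spans.
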
+ */ + __u64 rl_sequence; +}; + +static inline struct range_lock *node2rangelock(const struct interval_node *n) +{ + return container_of(n, struct range_lock, rl_node); +} + +struct range_lock_tree { + struct interval_node *rlt_root; + spinlock_t rlt_lock; + __u64 rlt_sequence; +}; + +void range_lock_tree_init(struct range_lock_tree *tree); +int range_lock_init(struct range_lock *lock, __u64 start, __u64 end); +int range_lock(struct range_lock_tree *tree, struct range_lock *lock); +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock); +#endif diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c b/drivers/staging/lustrefsx/lustre/llite/rw.c new file mode 100644 index 0000000000000..a00ccef398702 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw.c @@ -0,0 +1,1251 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/rw.c + * + * Lustre Lite I/O page cache routines shared by different kernel revs + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +/* current_is_kswapd() */ +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/** + * Get readahead pages from the filesystem readahead pool of the client for a + * thread. + * + * /param sbi superblock for filesystem readahead state ll_ra_info + * /param ria per-thread readahead state + * /param pages number of pages requested for readahead for the thread. + * + * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. + * It should work well if the ra_max_pages is much greater than the single + * file's read-ahead window, and not too many threads contending for + * these readahead pages. + * + * TODO: There may be a 'global sync problem' if many threads are trying + * to get an ra budget that is larger than the remaining readahead pages + * and reach here at exactly the same time. They will compute /a ret to + * consume the remaining pages, but will fail at atomic_add_return() and + * get a zero ra window, although there is still ra space remaining. 
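
The comment above describes a lock-free budget grab that can transiently report a zero window even while read-ahead space remains. A small standalone sketch of that optimistic add-then-roll-back pattern, using C11 atomics and made-up limits rather than the kernel's atomic_t API:

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-in for the shared read-ahead budget in ll_ra_info. */
static atomic_long ra_cur_pages;
static const long ra_max_pages = 1024;

/* Grab up to "pages" pages from the shared budget without taking a lock.
 * Overshoot is detected after the add and rolled back, which is exactly
 * the race the "global sync problem" note above refers to. */
static long ra_count_get(long pages)
{
	long ret = ra_max_pages - atomic_load(&ra_cur_pages);

	if (ret > pages)
		ret = pages;
	if (ret <= 0)
		return 0;

	if (atomic_fetch_add(&ra_cur_pages, ret) + ret > ra_max_pages) {
		atomic_fetch_sub(&ra_cur_pages, ret);
		ret = 0;
	}
	return ret;
}

static void ra_count_put(long pages)
{
	atomic_fetch_sub(&ra_cur_pages, pages);
}

int main(void)
{
	long got = ra_count_get(256);

	printf("reserved %ld read-ahead pages\n", got);
	ra_count_put(got);
	return 0;
}
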
- Jay */ + +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, + struct ra_io_arg *ria, + unsigned long pages, unsigned long min) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + long ret; + ENTRY; + + /* If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. */ + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), + pages); + if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) + GOTO(out, ret = 0); + + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } + +out: + if (ret < min) { + /* override ra limit for maximum performance */ + atomic_add(min - ret, &ra->ra_cur_pages); + ret = min; + } + RETURN(ret); +} + +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + atomic_sub(len, &ra->ra_cur_pages); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) +{ + LASSERTF(which < _NR_RA_STAT, "which: %u\n", which); + lprocfs_counter_incr(sbi->ll_ra_stats, which); +} + +void ll_ra_stats_inc(struct inode *inode, enum ra_stat which) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + ll_ra_stats_inc_sbi(sbi, which); +} + +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, \ + "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu " \ + "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n", \ + ras->ras_last_readpage, ras->ras_consecutive_requests, \ + ras->ras_consecutive_pages, ras->ras_window_start, \ + ras->ras_window_len, ras->ras_next_readahead, \ + ras->ras_rpc_size, \ + ras->ras_requests, ras->ras_request_index, \ + ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ + ras->ras_stride_pages, ras->ras_stride_length) + +static int index_in_window(unsigned long index, unsigned long point, + unsigned long before, unsigned long after) +{ + unsigned long start = point - before, end = point + after; + + if (start > point) + start = 0; + if (end < point) + end = ~0; + + return start <= index && index <= end; +} + +void ll_ras_enter(struct file *f) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(f); + struct ll_readahead_state *ras = &fd->fd_ras; + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_request_index = 0; + ras->ras_consecutive_requests++; + spin_unlock(&ras->ras_lock); +} + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was already uptodate so it will be skipped + * from being added; + * \retval -ve: page wasn't added to \a queue for error; + * \retval 0: page was added into \a queue for read ahead. 
+ */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, pgoff_t index) +{ + struct cl_object *clob = io->ci_obj; + struct inode *inode = vvp_object_inode(clob); + struct page *vmpage; + struct cl_page *page; + struct vvp_page *vpg; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + int rc = 0; + const char *msg = NULL; + ENTRY; + + vmpage = grab_cache_page_nowait(inode->i_mapping, index); + if (vmpage == NULL) { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + GOTO(out, rc = -EBUSY); + } + + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping != inode->i_mapping) { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; + GOTO(out, rc = -EBUSY); + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + GOTO(out, rc = PTR_ERR(page)); + } + + lu_ref_add(&page->cp_reference, "ra", current); + cl_page_assume(env, io, page); + vpg = cl2vvp_page(cl_object_page_slice(clob, page)); + if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) { + vpg->vpg_defer_uptodate = 1; + vpg->vpg_ra_used = 0; + cl_page_list_add(queue, page); + } else { + /* skip completed pages */ + cl_page_unassume(env, io, page); + /* This page is already uptodate, returning a positive number + * to tell the callers about this */ + rc = 1; + } + + lu_ref_del(&page->cp_reference, "ra", current); + cl_page_put(env, page); + +out: + if (vmpage != NULL) { + if (rc != 0) + unlock_page(vmpage); + put_page(vmpage); + } + if (msg != NULL) { + ll_ra_stats_inc(inode, which); + CDEBUG(D_READA, "%s\n", msg); + + } + + RETURN(rc); +} + +#define RIA_DEBUG(ria) \ + CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ + ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ + ria->ria_pages) + +static inline int stride_io_mode(struct ll_readahead_state *ras) +{ + return ras->ras_consecutive_stride_requests > 1; +} + +/* The function calculates how much pages will be read in + * [off, off + length], in such stride IO area, + * stride_offset = st_off, stride_lengh = st_len, + * stride_pages = st_pgs + * + * |------------------|*****|------------------|*****|------------|*****|.... + * st_off + * |--- st_pgs ---| + * |----- st_len -----| + * + * How many pages it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_pgs * i end_left + */ +static unsigned long +stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, + unsigned long off, unsigned long length) +{ + __u64 start = off > st_off ? off - st_off : 0; + __u64 end = off + length > st_off ? 
off + length - st_off : 0; + unsigned long start_left = 0; + unsigned long end_left = 0; + unsigned long pg_count; + + if (st_len == 0 || length == 0 || end == 0) + return length; + + start_left = do_div(start, st_len); + if (start_left < st_pgs) + start_left = st_pgs - start_left; + else + start_left = 0; + + end_left = do_div(end, st_len); + if (end_left > st_pgs) + end_left = st_pgs; + + CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu\n", + start, end, start_left, end_left); + + if (start == end) + pg_count = end_left - (st_pgs - start_left); + else + pg_count = start_left + st_pgs * (end - start - 1) + end_left; + + CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu" + "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count); + + return pg_count; +} + +static int ria_page_count(struct ra_io_arg *ria) +{ + __u64 length = ria->ria_end >= ria->ria_start ? + ria->ria_end - ria->ria_start + 1 : 0; + + return stride_pg_count(ria->ria_stoff, ria->ria_length, + ria->ria_pages, ria->ria_start, + length); +} + +static unsigned long ras_align(struct ll_readahead_state *ras, + unsigned long index, + unsigned long *remainder) +{ + unsigned long rem = index % ras->ras_rpc_size; + if (remainder != NULL) + *remainder = rem; + return index - rem; +} + +/*Check whether the index is in the defined ra-window */ +static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) +{ + /* If ria_length == ria_pages, it means non-stride I/O mode, + * idx should always inside read-ahead window in this case + * For stride I/O mode, just check whether the idx is inside + * the ria_pages. */ + return ria->ria_length == 0 || ria->ria_length == ria->ria_pages || + (idx >= ria->ria_stoff && (idx - ria->ria_stoff) % + ria->ria_length < ria->ria_pages); +} + +static unsigned long +ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct ll_readahead_state *ras, + struct ra_io_arg *ria, pgoff_t *ra_end) +{ + struct cl_read_ahead ra = { 0 }; + int rc = 0, count = 0; + bool stride_ria; + pgoff_t page_idx; + + LASSERT(ria != NULL); + RIA_DEBUG(ria); + + stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0; + for (page_idx = ria->ria_start; + page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) { + if (ras_inside_ra_window(page_idx, ria)) { + if (ra.cra_end == 0 || ra.cra_end < page_idx) { + unsigned long end; + + cl_read_ahead_release(env, &ra); + + rc = cl_io_read_ahead(env, io, page_idx, &ra); + if (rc < 0) + break; + + CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n", + page_idx, ra.cra_end, ra.cra_rpc_size); + LASSERTF(ra.cra_end >= page_idx, + "object: %p, indcies %lu / %lu\n", + io->ci_obj, ra.cra_end, page_idx); + /* update read ahead RPC size. + * NB: it's racy but doesn't matter */ + if (ras->ras_rpc_size > ra.cra_rpc_size && + ra.cra_rpc_size > 0) + ras->ras_rpc_size = ra.cra_rpc_size; + /* trim it to align with optimal RPC size */ + end = ras_align(ras, ria->ria_end + 1, NULL); + if (end > 0 && !ria->ria_eof) + ria->ria_end = end - 1; + if (ria->ria_end < ria->ria_end_min) + ria->ria_end = ria->ria_end_min; + if (ria->ria_end > ra.cra_end) + ria->ria_end = ra.cra_end; + } + if (page_idx > ria->ria_end) + break; + + /* If the page is inside the read-ahead window */ + rc = ll_read_ahead_page(env, io, queue, page_idx); + if (rc < 0) + break; + + *ra_end = page_idx; + /* Only subtract from reserve & count the page if we + * really did readahead on that page. 
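
stride_pg_count() above computes, in closed form, how many pages of [off, off + length) fall on the "st_pgs pages out of every st_len pages, starting at st_off" pattern. As a cross-check of that arithmetic, a brute-force page-by-page count (standalone, with invented example numbers) looks like this; the membership test is the same one ras_inside_ra_window() applies to a single index.

#include <stdio.h>

static unsigned long stride_pages_bruteforce(unsigned long st_off,
					     unsigned long st_len,
					     unsigned long st_pgs,
					     unsigned long off,
					     unsigned long length)
{
	unsigned long idx, count = 0;

	if (st_len == 0 || length == 0)
		return length;

	for (idx = off; idx < off + length; idx++) {
		/* page belongs to the stride pattern */
		if (idx >= st_off && (idx - st_off) % st_len < st_pgs)
			count++;
	}
	return count;
}

int main(void)
{
	/* 3 pages of data every 8 pages, starting at page 16 */
	printf("%lu pages to read\n",
	       stride_pages_bruteforce(16, 8, 3, 20, 32));
	return 0;
}
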
*/ + if (rc == 0) { + ria->ria_reserved--; + count++; + } + } else if (stride_ria) { + /* If it is not in the read-ahead window, and it is + * read-ahead mode, then check whether it should skip + * the stride gap */ + pgoff_t offset; + /* FIXME: This assertion only is valid when it is for + * forward read-ahead, it will be fixed when backward + * read-ahead is implemented */ + LASSERTF(page_idx >= ria->ria_stoff, + "Invalid page_idx %lu rs %lu re %lu ro %lu " + "rl %lu rp %lu\n", page_idx, + ria->ria_start, ria->ria_end, ria->ria_stoff, + ria->ria_length, ria->ria_pages); + offset = page_idx - ria->ria_stoff; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; + CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, + ria->ria_length - offset); + continue; + } + } + } + + cl_read_ahead_release(env, &ra); + + return count; +} + +static int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, + struct ll_readahead_state *ras, bool hit) +{ + struct vvp_io *vio = vvp_env_io(env); + struct ll_thread_info *lti = ll_env_info(env); + struct cl_attr *attr = vvp_env_thread_attr(env); + unsigned long len, mlen = 0; + pgoff_t ra_end = 0, start = 0, end = 0; + struct inode *inode; + struct ra_io_arg *ria = <i->lti_ria; + struct cl_object *clob; + int ret = 0; + __u64 kms; + ENTRY; + + clob = io->ci_obj; + inode = vvp_object_inode(clob); + + memset(ria, 0, sizeof *ria); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + RETURN(ret); + kms = attr->cat_kms; + if (kms == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN); + RETURN(0); + } + + spin_lock(&ras->ras_lock); + + /** + * Note: other thread might rollback the ras_next_readahead, + * if it can not get the full size of prepared pages, see the + * end of this function. For stride read ahead, it needs to + * make sure the offset is no less than ras_stride_offset, + * so that stride read ahead can work correctly. + */ + if (stride_io_mode(ras)) + start = max(ras->ras_next_readahead, ras->ras_stride_offset); + else + start = ras->ras_next_readahead; + + if (ras->ras_window_len > 0) + end = ras->ras_window_start + ras->ras_window_len - 1; + + /* Enlarge the RA window to encompass the full read */ + if (vio->vui_ra_valid && + end < vio->vui_ra_start + vio->vui_ra_count - 1) + end = vio->vui_ra_start + vio->vui_ra_count - 1; + + if (end != 0) { + unsigned long end_index; + + /* Truncate RA window to end of file */ + end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT); + if (end_index <= end) { + end = end_index; + ria->ria_eof = true; + } + } + ria->ria_start = start; + ria->ria_end = end; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_pages = ras->ras_stride_pages; + } + spin_unlock(&ras->ras_lock); + + if (end == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + len = ria_page_count(ria); + if (len == 0) { + ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + + RAS_CDEBUG(ras); + CDEBUG(D_READA, DFID": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n", + PFID(lu_object_fid(&clob->co_lu)), + ria->ria_start, ria->ria_end, + vio->vui_ra_valid ? vio->vui_ra_start : 0, + vio->vui_ra_valid ? 
vio->vui_ra_count : 0, + hit); + + /* at least to extend the readahead window to cover current read */ + if (!hit && vio->vui_ra_valid && + vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) { + unsigned long remainder; + + /* to the end of current read window. */ + mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start; + /* trim to RPC boundary */ + ras_align(ras, ria->ria_start, &remainder); + mlen = min(mlen, ras->ras_rpc_size - remainder); + ria->ria_end_min = ria->ria_start + mlen; + } + + ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen); + if (ria->ria_reserved < len) + ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); + + CDEBUG(D_READA, "reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", + ria->ria_reserved, len, mlen, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end); + + if (ria->ria_reserved != 0) + ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved); + + if (ra_end == end && ra_end == (kms >> PAGE_SHIFT)) + ll_ra_stats_inc(inode, RA_STAT_EOF); + + CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n", + ra_end, end, ria->ria_end, ret); + + if (ra_end != end) + ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END); + if (ra_end > 0) { + /* update the ras so that the next read-ahead tries from + * where we left off. */ + spin_lock(&ras->ras_lock); + ras->ras_next_readahead = ra_end + 1; + spin_unlock(&ras->ras_lock); + RAS_CDEBUG(ras); + } + + RETURN(ret); +} + +static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_window_start = ras_align(ras, index, NULL); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_last_readpage = index; + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_pages = 0; + ras->ras_window_len = 0; + ras_set_start(inode, ras, index); + ras->ras_next_readahead = max(ras->ras_window_start, index + 1); + + RAS_CDEBUG(ras); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_stride_reset(struct ll_readahead_state *ras) +{ + ras->ras_consecutive_stride_requests = 0; + ras->ras_stride_length = 0; + ras->ras_stride_pages = 0; + RAS_CDEBUG(ras); +} + +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) +{ + spin_lock_init(&ras->ras_lock); + ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES; + ras_reset(inode, ras, 0); + ras->ras_requests = 0; +} + +/* + * Check whether the read request is in the stride window. + * If it is in the stride window, return 1, otherwise return 0. 
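
ll_readahead() above trims the mandatory read length so it does not cross the current read-ahead RPC, using ras_align() to find the offset inside the RPC. A tiny standalone model of that trimming, with invented page numbers and RPC size:

#include <stdio.h>

/* Cap a mandatory read-ahead length (mlen) at the next RPC boundary,
 * where one RPC covers rpc_pages consecutive pages. */
static unsigned long trim_to_rpc(unsigned long start, unsigned long mlen,
				 unsigned long rpc_pages)
{
	unsigned long remainder = start % rpc_pages;	/* what ras_align() returns */
	unsigned long room = rpc_pages - remainder;	/* pages left in this RPC */

	return mlen < room ? mlen : room;
}

int main(void)
{
	/* starting at page 1000 with a 256-page RPC leaves 24 pages in it */
	printf("mlen capped to %lu pages\n", trim_to_rpc(1000, 100, 256));
	return 0;
}
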
+ */ +static int index_in_stride_window(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap; + + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || + ras->ras_stride_pages == ras->ras_stride_length) + return 0; + + stride_gap = index - ras->ras_last_readpage - 1; + + /* If it is contiguous read */ + if (stride_gap == 0) + return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; + + /* Otherwise check the stride by itself */ + return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && + ras->ras_consecutive_pages == ras->ras_stride_pages; +} + +static void ras_update_stride_detector(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap = index - ras->ras_last_readpage - 1; + + if (!stride_io_mode(ras) && (stride_gap != 0 || + ras->ras_consecutive_stride_requests == 0)) { + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + } + LASSERT(ras->ras_request_index == 0); + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (index <= ras->ras_last_readpage) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + + RAS_CDEBUG(ras); + return; +} + +static unsigned long +stride_page_count(struct ll_readahead_state *ras, unsigned long len) +{ + return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, + ras->ras_stride_pages, ras->ras_stride_offset, + len); +} + +/* Stride Read-ahead window will be increased inc_len according to + * stride I/O pattern */ +static void ras_stride_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, + unsigned long inc_len) +{ + unsigned long left, step, window_len; + unsigned long stride_len; + + LASSERT(ras->ras_stride_length > 0); + LASSERTF(ras->ras_window_start + ras->ras_window_len + >= ras->ras_stride_offset, "window_start %lu, window_len %lu" + " stride_offset %lu\n", ras->ras_window_start, + ras->ras_window_len, ras->ras_stride_offset); + + stride_len = ras->ras_window_start + ras->ras_window_len - + ras->ras_stride_offset; + + left = stride_len % ras->ras_stride_length; + window_len = ras->ras_window_len - left; + + if (left < ras->ras_stride_pages) + left += inc_len; + else + left = ras->ras_stride_pages + inc_len; + + LASSERT(ras->ras_stride_pages != 0); + + step = left / ras->ras_stride_pages; + left %= ras->ras_stride_pages; + + window_len += step * ras->ras_stride_length + left; + + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) + ras->ras_window_len = window_len; + + RAS_CDEBUG(ras); +} + +static void ras_increase_window(struct inode *inode, + struct ll_readahead_state *ras, + struct ll_ra_info *ra) +{ + /* The stretch of ra-window should be aligned with max rpc_size + * but current clio architecture does not support retrieve such + * information from lower layer. 
FIXME later + */ + if (stride_io_mode(ras)) { + ras_stride_increase_window(ras, ra, ras->ras_rpc_size); + } else { + unsigned long wlen; + + wlen = min(ras->ras_window_len + ras->ras_rpc_size, + ra->ra_max_pages_per_file); + ras->ras_window_len = ras_align(ras, wlen, NULL); + } +} + +static void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + enum ras_update_flags flags) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + bool hit = flags & LL_RAS_HIT; + int zero = 0, stride_detect = 0, ra_miss = 0; + ENTRY; + + spin_lock(&ras->ras_lock); + + if (!hit) + CDEBUG(D_READA, DFID " pages at %lu miss.\n", + PFID(ll_inode2fid(inode)), index); + ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); + + /* reset the read-ahead window in two cases. First when the app seeks + * or reads to some other part of the file. Secondly if we get a + * read-ahead miss that we think we've previously issued. This can + * be a symptom of there being so many read-ahead pages that the VM is + * reclaiming it before we get to it. */ + if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { + zero = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (!hit && ras->ras_window_len && + index < ras->ras_next_readahead && + index_in_window(index, ras->ras_window_start, 0, + ras->ras_window_len)) { + ra_miss = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); + } + + /* On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. The mmap case does not increment + * ras_requests and thus can never trigger this behavior. */ + if (ras->ras_requests >= 2 && !ras->ras_request_index) { + __u64 kms_pages; + + kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + + CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start = 0; + ras->ras_next_readahead = index + 1; + ras->ras_window_len = min(ra->ra_max_pages_per_file, + ra->ra_max_read_ahead_whole_pages); + GOTO(out_unlock, 0); + } + } + if (zero) { + /* check whether it is in stride I/O mode*/ + if (!index_in_stride_window(ras, index)) { + if (ras->ras_consecutive_stride_requests == 0 && + ras->ras_request_index == 0) { + ras_update_stride_detector(ras, index); + ras->ras_consecutive_stride_requests++; + } else { + ras_stride_reset(ras); + } + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + GOTO(out_unlock, 0); + } else { + ras->ras_consecutive_pages = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = 1; + RAS_CDEBUG(ras); + } + } else { + if (ra_miss) { + if (index_in_stride_window(ras, index) && + stride_io_mode(ras)) { + if (index != ras->ras_last_readpage + 1) + ras->ras_consecutive_pages = 0; + ras_reset(inode, ras, index); + + /* If stride-RA hit cache miss, the stride + * detector will not be reset to avoid the + * overhead of redetecting read-ahead mode, + * but on the condition that the stride window + * is still intersect with normal sequential + * read-ahead window. 
*/
+				if (ras->ras_window_start <
+				    ras->ras_stride_offset)
+					ras_stride_reset(ras);
+				RAS_CDEBUG(ras);
+			} else {
+				/* Reset both stride window and normal RA
+				 * window */
+				ras_reset(inode, ras, index);
+				ras->ras_consecutive_pages++;
+				ras_stride_reset(ras);
+				GOTO(out_unlock, 0);
+			}
+		} else if (stride_io_mode(ras)) {
+			/* If this is a contiguous read but we are currently
+			 * in stride I/O mode, check whether the stride step
+			 * is still valid; if not, reset the stride
+			 * read-ahead window. */
+			if (!index_in_stride_window(ras, index)) {
+				/* Shrink stride read-ahead window to be zero */
+				ras_stride_reset(ras);
+				ras->ras_window_len = 0;
+				ras->ras_next_readahead = index;
+			}
+		}
+	}
+	ras->ras_consecutive_pages++;
+	ras->ras_last_readpage = index;
+	ras_set_start(inode, ras, index);
+
+	if (stride_io_mode(ras)) {
+		/* Since stride read-ahead is sensitive to the read offset,
+		 * use the original offset here instead of ras_window_start,
+		 * which is RPC aligned. */
+		ras->ras_next_readahead = max(index + 1,
+					      ras->ras_next_readahead);
+		ras->ras_window_start = max(ras->ras_stride_offset,
+					    ras->ras_window_start);
+	} else {
+		if (ras->ras_next_readahead < ras->ras_window_start)
+			ras->ras_next_readahead = ras->ras_window_start;
+		if (!hit)
+			ras->ras_next_readahead = index + 1;
+	}
+	RAS_CDEBUG(ras);
+
+	/* Trigger RA in the mmap case where ras_consecutive_requests
+	 * is not incremented and thus can't be used to trigger RA */
+	if (ras->ras_consecutive_pages >= 4 && flags & LL_RAS_MMAP) {
+		ras_increase_window(inode, ras, ra);
+		/* reset consecutive pages so that the readahead window can
+		 * grow gradually. */
+		ras->ras_consecutive_pages = 0;
+		GOTO(out_unlock, 0);
+	}
+
+	/* Initially reset the stride window offset to next_readahead */
+	if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
+		/**
+		 * Once stride IO mode is detected, next_readahead should be
+		 * reset to make sure next_readahead > stride offset
+		 */
+		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+		ras->ras_stride_offset = index;
+		ras->ras_window_start = max(index, ras->ras_window_start);
+	}
+
+	/* The initial ras_window_len is set to the request size.  To avoid
+	 * uselessly reading and discarding pages for random IO, the window
+	 * is only increased once per consecutive request received.
*/ + if ((ras->ras_consecutive_requests > 1 || stride_detect) && + !ras->ras_request_index) + ras_increase_window(inode, ras, ra); + EXIT; +out_unlock: + RAS_CDEBUG(ras); + ras->ras_request_index++; + spin_unlock(&ras->ras_lock); + return; +} + +int ll_writepage(struct page *vmpage, struct writeback_control *wbc) +{ + struct inode *inode = vmpage->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + bool redirtied = false; + bool unlocked = false; + int result; + __u16 refcheck; + ENTRY; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + LASSERT(ll_i2dtexp(inode) != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io = vvp_env_thread_io(env); + io->ci_obj = clob; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + current); + cl_page_assume(env, io, page); + result = cl_page_flush(env, io, page); + if (result != 0) { + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) { + redirty_page_for_writepage(wbc, vmpage); + result = 0; + redirtied = true; + } + } + cl_page_disown(env, io, page); + unlocked = true; + lu_ref_del(&page->cp_reference, + "writepage", current); + cl_page_put(env, page); + } else { + result = PTR_ERR(page); + } + } + cl_io_fini(env, io); + + if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { + loff_t offset = cl_offset(clob, vmpage->index); + + /* Flush page failed because the extent is being written out. + * Wait for the write of extent to be finished to avoid + * breaking kernel which assumes ->writepage should mark + * PageWriteback or clean the page. */ + result = cl_sync_file_range(inode, offset, + offset + PAGE_SIZE - 1, + CL_FSYNC_LOCAL, 1); + if (result > 0) { + /* actually we may have written more than one page. + * decreasing this page because the caller will count + * it. */ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_put(env, &refcheck); + GOTO(out, result); + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + ENTRY; + + if (wbc->range_cyclic) { + start = mapping->writeback_index << PAGE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + if (ll_i2info(inode)->lli_clob == NULL) + RETURN(0); + + /* for directio, it would call writepages() to evict cached pages + * inside the IO context of write, which will cause deadlock at + * layout_conf since it waits for active IOs to complete. 
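
ll_writepages() above maps the writeback control parameters onto a byte range before calling cl_sync_file_range(): cyclic writeback resumes at writeback_index, and an explicit range ending at LLONG_MAX means "to end of object". A rough standalone model of that mapping; the struct and constant names below are stand-ins for the kernel types, not the real API.

#include <limits.h>
#include <stdio.h>

#define PAGE_SHIFT_SKETCH 12
#define EOF_SENTINEL ULLONG_MAX		/* plays the role of OBD_OBJECT_EOF */

struct wbc_sketch {
	int range_cyclic;
	long long range_start;
	long long range_end;
	unsigned long writeback_index;
};

/* Turn writeback parameters into [start, end] and remember whether the
 * whole file was requested, so writeback_index can be reset afterwards. */
static void pick_range(const struct wbc_sketch *wbc,
		       unsigned long long *start, unsigned long long *end,
		       int *range_whole)
{
	*range_whole = 0;
	if (wbc->range_cyclic) {
		*start = (unsigned long long)wbc->writeback_index
			 << PAGE_SHIFT_SKETCH;
		*end = EOF_SENTINEL;
	} else {
		*start = wbc->range_start;
		*end = wbc->range_end;
		if (wbc->range_end == LLONG_MAX) {
			*end = EOF_SENTINEL;
			*range_whole = (*start == 0);
		}
	}
}

int main(void)
{
	struct wbc_sketch wbc = { .range_cyclic = 0, .range_start = 0,
				  .range_end = LLONG_MAX };
	unsigned long long start, end;
	int whole;

	pick_range(&wbc, &start, &end, &whole);
	printf("start=%llu end=%llx whole=%d\n", start, end, whole);
	return 0;
}
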
*/ + result = cl_sync_file_range(inode, start, end, mode, 1); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + mapping->writeback_index = 0; + else + mapping->writeback_index = (end >> PAGE_SHIFT) + 1; + } + RETURN(result); +} + +struct ll_cl_context *ll_cl_find(struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_cl_context *lcc; + struct ll_cl_context *found = NULL; + + read_lock(&fd->fd_lock); + list_for_each_entry(lcc, &fd->fd_lccs, lcc_list) { + if (lcc->lcc_cookie == current) { + found = lcc; + break; + } + } + read_unlock(&fd->fd_lock); + + return found; +} + +void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io, + enum lcc_type type) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + memset(lcc, 0, sizeof(*lcc)); + INIT_LIST_HEAD(&lcc->lcc_list); + lcc->lcc_cookie = current; + lcc->lcc_env = env; + lcc->lcc_io = io; + lcc->lcc_type = type; + + write_lock(&fd->fd_lock); + list_add(&lcc->lcc_list, &fd->fd_lccs); + write_unlock(&fd->fd_lock); +} + +void ll_cl_remove(struct file *file, const struct lu_env *env) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; + + write_lock(&fd->fd_lock); + list_del_init(&lcc->lcc_list); + write_unlock(&fd->fd_lock); +} + +static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file) +{ + struct inode *inode = vvp_object_inode(page->cp_obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_readahead_state *ras = &fd->fd_ras; + struct cl_2queue *queue = &io->ci_queue; + struct vvp_page *vpg; + int rc = 0; + bool uptodate; + ENTRY; + + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); + uptodate = vpg->vpg_defer_uptodate; + + if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && + sbi->ll_ra_info.ra_max_pages > 0 && + !vpg->vpg_ra_updated) { + struct vvp_io *vio = vvp_env_io(env); + enum ras_update_flags flags = 0; + + if (uptodate) + flags |= LL_RAS_HIT; + if (!vio->vui_ra_valid) + flags |= LL_RAS_MMAP; + ras_update(sbi, inode, ras, vvp_index(vpg), flags); + } + + cl_2queue_init(queue); + if (uptodate) { + vpg->vpg_ra_used = 1; + cl_page_export(env, page, 1); + cl_page_disown(env, io, page); + } else { + cl_2queue_add(queue, page); + } + + if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && + sbi->ll_ra_info.ra_max_pages > 0) { + int rc2; + + rc2 = ll_readahead(env, io, &queue->c2_qin, ras, + uptodate); + CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n", + PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg)); + } + + if (queue->c2_qin.pl_nr > 0) { + int count = queue->c2_qin.pl_nr; + rc = cl_io_submit_rw(env, io, CRT_READ, queue); + if (rc == 0) + task_io_account_read(PAGE_SIZE * count); + } + + /* + * Unlock unsent pages in case of error. 
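
ll_cl_find()/ll_cl_add()/ll_cl_remove() above keep a per-file list of IO contexts keyed by the current task, with lookups under a read lock and list changes under a write lock. A simplified userspace analogue using pthreads; the thread id plays the role of the "current" cookie, the names are invented, and removal would take the write lock the same way ctx_add() does.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct ctx {
	pthread_t cookie;	/* which thread registered this context */
	void *io;		/* opaque per-IO state */
	struct ctx *next;
};

static struct ctx *ctx_list;
static pthread_rwlock_t ctx_lock = PTHREAD_RWLOCK_INITIALIZER;

static void ctx_add(struct ctx *c, void *io)
{
	memset(c, 0, sizeof(*c));
	c->cookie = pthread_self();
	c->io = io;

	pthread_rwlock_wrlock(&ctx_lock);
	c->next = ctx_list;
	ctx_list = c;
	pthread_rwlock_unlock(&ctx_lock);
}

/* Find the context the calling thread registered, if any. */
static struct ctx *ctx_find(void)
{
	struct ctx *c, *found = NULL;

	pthread_rwlock_rdlock(&ctx_lock);
	for (c = ctx_list; c != NULL; c = c->next) {
		if (pthread_equal(c->cookie, pthread_self())) {
			found = c;
			break;
		}
	}
	pthread_rwlock_unlock(&ctx_lock);
	return found;
}

int main(void)
{
	struct ctx c;
	int io;

	ctx_add(&c, &io);
	printf("found own context: %s\n", ctx_find() == &c ? "yes" : "no");
	return 0;
}
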
+ */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + RETURN(rc); +} + +int ll_readpage(struct file *file, struct page *vmpage) +{ + struct inode *inode = file_inode(file); + struct cl_object *clob = ll_i2info(inode)->lli_clob; + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result; + ENTRY; + + lcc = ll_cl_find(file); + if (lcc == NULL) { + unlock_page(vmpage); + RETURN(-EIO); + } + + env = lcc->lcc_env; + io = lcc->lcc_io; + if (io == NULL) { /* fast read */ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_readahead_state *ras = &fd->fd_ras; + struct vvp_page *vpg; + + result = -ENODATA; + + /* TODO: need to verify the layout version to make sure + * the page is not invalid due to layout change. */ + page = cl_vmpage_page(vmpage, clob); + if (page == NULL) { + unlock_page(vmpage); + RETURN(result); + } + + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); + if (vpg->vpg_defer_uptodate) { + enum ras_update_flags flags = LL_RAS_HIT; + + if (lcc->lcc_type == LCC_MMAP) + flags |= LL_RAS_MMAP; + + /* For fast read, it updates read ahead state only + * if the page is hit in cache because non cache page + * case will be handled by slow read later. */ + ras_update(ll_i2sbi(inode), inode, ras, vvp_index(vpg), + flags); + /* avoid duplicate ras_update() call */ + vpg->vpg_ra_updated = 1; + + /* Check if we can issue a readahead RPC, if that is + * the case, we can't do fast IO because we will need + * a cl_io to issue the RPC. */ + if (ras->ras_window_start + ras->ras_window_len < + ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES) { + /* export the page and skip io stack */ + vpg->vpg_ra_used = 1; + cl_page_export(env, page, 1); + result = 0; + } + } + + unlock_page(vmpage); + cl_page_put(env, page); + RETURN(result); + } + + LASSERT(io->ci_state == CIS_IO_GOING); + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + result = ll_io_read_page(env, io, page, file); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + cl_page_put(env, page); + } else { + unlock_page(vmpage); + result = PTR_ERR(page); + } + RETURN(result); +} + +int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, enum cl_req_type crt) +{ + struct cl_2queue *queue; + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + queue = &io->ci_queue; + cl_2queue_init_page(queue, page); + + result = cl_io_submit_sync(env, io, crt, queue, 0); + LASSERT(cl_page_is_owned(page, io)); + + if (crt == CRT_READ) + /* + * in CRT_WRITE case page is left locked even in case of + * error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + return result; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c new file mode 100644 index 0000000000000..528f2892e3b40 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -0,0 +1,810 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_MIGRATE_H +#include +#elif defined(HAVE_MIGRATE_MODE_H) +#include +#endif + +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" +#include + +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncate from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, offset] bytes of the page remain valid (this is for a case of not-page + * aligned truncate). Lustre leaves partially truncated page in the cache, + * relying on struct inode::i_size to limit further accesses. + */ +static void ll_invalidatepage(struct page *vmpage, +#ifdef HAVE_INVALIDATE_RANGE + unsigned int offset, unsigned int length +#else + unsigned long offset +#endif + ) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ +#ifdef HAVE_INVALIDATE_RANGE + if (offset == 0 && length == PAGE_SIZE) { +#else + if (offset == 0) { +#endif + /* See the comment in ll_releasepage() */ + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + cl_page_delete(env, page); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + + cl_env_percpu_put(env); + } +} + +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + struct lu_env *env; + struct cl_object *obj; + struct cl_page *page; + struct address_space *mapping; + int result = 0; + + LASSERT(PageLocked(vmpage)); + if (PageWriteback(vmpage) || PageDirty(vmpage)) + return 0; + + mapping = vmpage->mapping; + if (mapping == NULL) + return 1; + + obj = ll_i2info(mapping->host)->lli_clob; + if (obj == NULL) + return 1; + + /* 1 for caller, 1 for cl_page and 1 for page cache */ + if (page_count(vmpage) > 3) + return 0; + + page = cl_vmpage_page(vmpage, obj); + if (page == NULL) + return 1; + + env = cl_env_percpu_get(); + LASSERT(!IS_ERR(env)); + + if 
(!cl_page_in_use(page)) { + result = 1; + cl_page_delete(env, page); + } + + /* To use percpu env array, the call path can not be rescheduled; + * otherwise percpu array will be messed if ll_releaspage() called + * again on the same CPU. + * + * If this page holds the last refc of cl_object, the following + * call path may cause reschedule: + * cl_page_put -> cl_page_free -> cl_object_put -> + * lu_object_put -> lu_object_free -> lov_delete_raid0. + * + * However, the kernel can't get rid of this inode until all pages have + * been cleaned up. Now that we hold page lock here, it's pretty safe + * that we won't get into object delete path. + */ + LASSERT(cl_object_refc(obj) > 1); + cl_page_put(env, page); + + cl_env_percpu_put(env); + return result; +} + +#define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL + +static ssize_t +ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io, int rw, + struct inode *inode, size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct cl_page *clp; + struct cl_2queue *queue; + struct cl_object *obj = io->ci_obj; + int i; + ssize_t rc = 0; + size_t page_size = cl_page_size(obj); + size_t orig_size = size; + bool do_io; + int io_pages = 0; + + ENTRY; + queue = &io->ci_queue; + cl_2queue_init(queue); + for (i = 0; i < page_count; i++) { + LASSERT(!(file_offset & (page_size - 1))); + clp = cl_page_find(env, obj, cl_index(obj, file_offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + do_io = true; + + /* check the page type: if the page is a host page, then do + * write directly + */ + if (clp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cl_page_vmpage(clp); + struct page *src_page; + struct page *dst_page; + void *src; + void *dst; + + src_page = (rw == WRITE) ? pages[i] : vmpage; + dst_page = (rw == WRITE) ? vmpage : pages[i]; + + src = ll_kmap_atomic(src_page, KM_USER0); + dst = ll_kmap_atomic(dst_page, KM_USER1); + memcpy(dst, src, min(page_size, size)); + ll_kunmap_atomic(dst, KM_USER1); + ll_kunmap_atomic(src, KM_USER0); + + /* make sure page will be added to the transfer by + * cl_io_submit()->...->vvp_page_prep_write(). + */ + if (rw == WRITE) + set_page_dirty(vmpage); + + if (rw == READ) { + /* do not issue the page for read, since it + * may reread a ra page which has NOT uptodate + * bit set. + */ + cl_page_disown(env, io, clp); + do_io = false; + } + } + + if (likely(do_io)) { + cl_2queue_add(queue, clp); + + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. + */ + cl_page_clip(env, clp, 0, min(size, page_size)); + + ++io_pages; + } + + /* drop the reference count for cl_page_find */ + cl_page_put(env, clp); + size -= page_size; + file_offset += page_size; + } + + if (rc == 0 && io_pages) { + rc = cl_io_submit_sync(env, io, + rw == READ ? 
CRT_READ : CRT_WRITE, + queue, 0); + } + if (rc == 0) + rc = orig_size; + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + RETURN(rc); +} + +/* ll_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer */ +static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + + for (i = 0; i < npages; i++) { + if (pages[i] == NULL) + break; + if (do_dirty) + set_page_dirty_lock(pages[i]); + put_page(pages[i]); + } + +#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) + kvfree(pages); +#else + OBD_FREE_LARGE(pages, npages * sizeof(*pages)); +#endif +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_SIZE) & \ + ~(DT_MAX_BRW_SIZE - 1)) + +#ifndef HAVE_IOV_ITER_RW +# define iov_iter_rw(iter) rw +#endif + +#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) +static ssize_t +ll_direct_IO( +# ifndef HAVE_IOV_ITER_RW + int rw, +# endif + struct kiocb *iocb, struct iov_iter *iter +# ifndef HAVE_DIRECTIO_2ARGS + , loff_t file_offset +# endif + ) +{ +#ifdef HAVE_DIRECTIO_2ARGS + loff_t file_offset = iocb->ki_pos; +#endif + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t count = iov_iter_count(iter); + ssize_t tot_bytes = 0, result = 0; + size_t size = MAX_DIO_SIZE; + + /* Check EOF by ourselves */ + if (iov_iter_rw(iter) == READ && file_offset >= i_size_read(inode)) + return 0; + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) + return -EINVAL; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " + "offset=%lld=%llx, pages %zd (max %lu)\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_SHIFT, + MAX_DIO_SIZE >> PAGE_SHIFT); + + /* Check that all user buffers are aligned as well */ + if (iov_iter_alignment(iter) & ~PAGE_MASK) + return -EINVAL; + + lcc = ll_cl_find(file); + if (lcc == NULL) + RETURN(-EIO); + + env = lcc->lcc_env; + LASSERT(!IS_ERR(env)); + io = lcc->lcc_io; + LASSERT(io != NULL); + + /* 0. Need locking between buffered and direct access. and race with + * size changing by concurrent truncates and writes. + * 1. Need inode mutex to operate transient pages. 
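
The MAX_DIO_SIZE comment above derives the largest O_DIRECT request from the kmalloc limit: one allocation must hold a brw_page descriptor for every PAGE_SIZE of user data, and the result is rounded down to a whole RPC. The arithmetic can be reproduced standalone; the descriptor size and RPC size below are assumptions chosen for illustration, not the real kernel values.

#include <stdio.h>

#define MAX_MALLOC_SKETCH	(128 * 1024UL)	/* 128kB kmalloc limit */
#define BRW_PAGE_SIZE_SKETCH	32UL		/* assumed sizeof(struct brw_page) */
#define PAGE_SIZE_SKETCH	4096UL
#define DT_MAX_BRW_SIZE_SKETCH	(4UL << 20)	/* assumed 4MB RPC size */

int main(void)
{
	/* bytes of user data covered by one full descriptor array,
	 * truncated to a multiple of the RPC size */
	unsigned long max_dio =
		(MAX_MALLOC_SKETCH / BRW_PAGE_SIZE_SKETCH * PAGE_SIZE_SKETCH) &
		~(DT_MAX_BRW_SIZE_SKETCH - 1);

	printf("max O_DIRECT request: %lu MB\n", max_dio >> 20);
	return 0;
}
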
+ */ + if (iov_iter_rw(iter) == READ) + inode_lock(inode); + + while (iov_iter_count(iter)) { + struct page **pages; + size_t offs; + + count = min_t(size_t, iov_iter_count(iter), size); + if (iov_iter_rw(iter) == READ) { + if (file_offset >= i_size_read(inode)) + break; + + if (file_offset + count > i_size_read(inode)) + count = i_size_read(inode) - file_offset; + } + + result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); + if (likely(result > 0)) { + int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); + + result = ll_direct_IO_seg(env, io, iov_iter_rw(iter), + inode, result, file_offset, + pages, n); + ll_free_user_pages(pages, n, + iov_iter_rw(iter) == READ); + + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_SIZE / sizeof(*pages)) * + PAGE_SIZE) { + size = ((((size / 2) - 1) | + ~PAGE_MASK) + 1) & PAGE_MASK; + CDEBUG(D_VFSTRACE, "DIO size now %zu\n", + size); + continue; + } + + GOTO(out, result); + } + + iov_iter_advance(iter, result); + tot_bytes += result; + file_offset += result; + } +out: + if (iov_iter_rw(iter) == READ) + inode_unlock(inode); + + if (tot_bytes > 0) { + struct vvp_io *vio = vvp_env_io(env); + + /* no commit async for direct IO */ + vio->u.write.vui_written += tot_bytes; + } + + return tot_bytes ? : result; +} +#else /* !HAVE_DIRECTIO_ITER && !HAVE_IOV_ITER_RW */ + +static inline int ll_get_user_pages(int rw, unsigned long user_addr, + size_t size, struct page ***pages, + int *max_pages) +{ + int result = -ENOMEM; + + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; + } + + *max_pages = (user_addr + size + PAGE_SIZE - 1) >> + PAGE_SHIFT; + *max_pages -= user_addr >> PAGE_SHIFT; + + OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages)); + if (*pages) { + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + *max_pages, (rw == READ), 0, *pages, + NULL); + up_read(¤t->mm->mmap_sem); + if (unlikely(result <= 0)) + OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages)); + } + + return result; +} + +static ssize_t +ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t file_offset, unsigned long nr_segs) +{ + struct ll_cl_context *lcc; + const struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t count = iov_length(iov, nr_segs); + ssize_t tot_bytes = 0, result = 0; + unsigned long seg = 0; + size_t size = MAX_DIO_SIZE; + ENTRY; + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? 
*/ + if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " + "offset=%lld=%llx, pages %zd (max %lu)\n", + PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_SHIFT, + MAX_DIO_SIZE >> PAGE_SHIFT); + + /* Check that all user buffers are aligned as well */ + for (seg = 0; seg < nr_segs; seg++) { + if (((unsigned long)iov[seg].iov_base & ~PAGE_MASK) || + (iov[seg].iov_len & ~PAGE_MASK)) + RETURN(-EINVAL); + } + + lcc = ll_cl_find(file); + if (lcc == NULL) + RETURN(-EIO); + + env = lcc->lcc_env; + LASSERT(!IS_ERR(env)); + io = lcc->lcc_io; + LASSERT(io != NULL); + + for (seg = 0; seg < nr_segs; seg++) { + size_t iov_left = iov[seg].iov_len; + unsigned long user_addr = (unsigned long)iov[seg].iov_base; + + if (rw == READ) { + if (file_offset >= i_size_read(inode)) + break; + if (file_offset + iov_left > i_size_read(inode)) + iov_left = i_size_read(inode) - file_offset; + } + + while (iov_left > 0) { + struct page **pages; + int page_count, max_pages = 0; + size_t bytes; + + bytes = min(size, iov_left); + page_count = ll_get_user_pages(rw, user_addr, bytes, + &pages, &max_pages); + if (likely(page_count > 0)) { + if (unlikely(page_count < max_pages)) + bytes = page_count << PAGE_SHIFT; + result = ll_direct_IO_seg(env, io, rw, inode, + bytes, file_offset, + pages, page_count); + ll_free_user_pages(pages, max_pages, rw==READ); + } else if (page_count == 0) { + GOTO(out, result = -EFAULT); + } else { + result = page_count; + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_SIZE / sizeof(*pages)) * + PAGE_SIZE) { + size = ((((size / 2) - 1) | + ~PAGE_MASK) + 1) & + PAGE_MASK; + CDEBUG(D_VFSTRACE, "DIO size now %zu\n", + size); + continue; + } + + GOTO(out, result); + } + + tot_bytes += result; + file_offset += result; + iov_left -= result; + user_addr += result; + } + } +out: + if (tot_bytes > 0) { + struct vvp_io *vio = vvp_env_io(env); + + /* no commit async for direct IO */ + vio->u.write.vui_written += tot_bytes; + } + + RETURN(tot_bytes ? tot_bytes : result); +} +#endif /* HAVE_DIRECTIO_ITER || HAVE_IOV_ITER_RW */ + +/** + * Prepare partially written-to page for a write. + */ +static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct cl_object *obj = io->ci_obj; + struct vvp_page *vpg = cl_object_page_slice(obj, pg); + loff_t offset = cl_offset(obj, vvp_index(vpg)); + int result; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result == 0) { + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. 
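
ll_prepare_partial_page(), whose body follows, chooses between zeroing a brand-new page and reading back the old contents, based on the known minimum size (kms). A minimal sketch of that decision, with invented names and a fixed page size:

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

enum prep { PREP_ZERO, PREP_READ };

/* If kms shows the page lies entirely beyond existing data, zero it;
 * otherwise the caller must read the old contents in first. */
static enum prep prepare_partial_page(long long kms, long long page_offset,
				      unsigned char *page)
{
	if (kms <= page_offset) {
		memset(page, 0, PAGE_SZ);	/* nothing on disk to preserve */
		return PREP_ZERO;
	}
	return PREP_READ;			/* needs a synchronous read */
}

int main(void)
{
	unsigned char page[PAGE_SZ];

	printf("%s\n", prepare_partial_page(8192, 12288, page) == PREP_ZERO ?
	       "zeroed" : "needs read");
	return 0;
}
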
+ */ + if (attr->cat_kms <= offset) { + char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); + + memset(kaddr, 0, cl_page_size(obj)); + ll_kunmap_atomic(kaddr, KM_USER0); + } else if (vpg->vpg_defer_uptodate) + vpg->vpg_ra_used = 1; + else + result = ll_page_sync_io(env, io, pg, CRT_READ); + } + return result; +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct ll_cl_context *lcc; + const struct lu_env *env = NULL; + struct cl_io *io; + struct cl_page *page = NULL; + + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + pgoff_t index = pos >> PAGE_SHIFT; + struct page *vmpage = NULL; + unsigned from = pos & (PAGE_SIZE - 1); + unsigned to = from + len; + int result = 0; + ENTRY; + + CDEBUG(D_PAGE, "Writing %lu of %d to %d bytes\n", index, from, len); + + lcc = ll_cl_find(file); + if (lcc == NULL) { + io = NULL; + GOTO(out, result = -EIO); + } + + env = lcc->lcc_env; + io = lcc->lcc_io; + + /* To avoid deadlock, try to lock page first. */ + vmpage = grab_cache_page_nowait(mapping, index); + + if (unlikely(vmpage == NULL || + PageDirty(vmpage) || PageWriteback(vmpage))) { + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *plist = &vio->u.write.vui_queue; + + /* if the page is already in dirty cache, we have to commit + * the pages right now; otherwise, it may cause deadlock + * because it holds page lock of a dirty page and request for + * more grants. It's okay for the dirty page to be the first + * one in commit page list, though. */ + if (vmpage != NULL && plist->pl_nr > 0) { + unlock_page(vmpage); + put_page(vmpage); + vmpage = NULL; + } + + /* commit pages and then wait for page lock */ + result = vvp_io_write_commit(env, io); + if (result < 0) + GOTO(out, result); + + if (vmpage == NULL) { + vmpage = grab_cache_page_write_begin(mapping, index, + flags); + if (vmpage == NULL) + GOTO(out, result = -ENOMEM); + } + } + + page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + + cl_page_assume(env, io, page); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, + * so _don't_ set it up to date until commit_write + */ + if (from == 0 && to == PAGE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n"); + POISON_PAGE(vmpage, 0x11); + } else { + /* TODO: can be optimized at OSC layer to check if it + * is a lockless IO. In that case, it's not necessary + * to read the data. 
*/ + result = ll_prepare_partial_page(env, io, page); + if (result == 0) + SetPageUptodate(vmpage); + } + } + if (result < 0) + cl_page_unassume(env, io, page); + EXIT; +out: + if (result < 0) { + if (vmpage != NULL) { + unlock_page(vmpage); + put_page(vmpage); + } + if (!IS_ERR_OR_NULL(page)) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + if (io) + io->ci_result = result; + } else { + *pagep = vmpage; + *fsdata = lcc; + } + RETURN(result); +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *vmpage, void *fsdata) +{ + struct ll_cl_context *lcc = fsdata; + const struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_page *page; + unsigned from = pos & (PAGE_SIZE - 1); + bool unplug = false; + int result = 0; + ENTRY; + + put_page(vmpage); + + LASSERT(lcc != NULL); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + vio = vvp_env_io(env); + + LASSERT(cl_page_is_owned(page, io)); + if (copied > 0) { + struct cl_page_list *plist = &vio->u.write.vui_queue; + + lcc->lcc_page = NULL; /* page will be queued */ + + /* Add it into write queue */ + cl_page_list_add(plist, page); + if (plist->pl_nr == 1) /* first page */ + vio->u.write.vui_from = from; + else + LASSERT(from == 0); + vio->u.write.vui_to = from + copied; + + /* To address the deadlock in balance_dirty_pages() where + * this dirty page may be written back in the same thread. */ + if (PageDirty(vmpage)) + unplug = true; + + /* We may have one full RPC, commit it soon */ + if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) + unplug = true; + + CL_PAGE_DEBUG(D_PAGE, env, page, + "queued page: %d.\n", plist->pl_nr); + } else { + cl_page_disown(env, io, page); + + lcc->lcc_page = NULL; + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + + /* page list is not contiguous now, commit it now */ + unplug = true; + } + if (unplug || io->u.ci_rw.rw_sync) + result = vvp_io_write_commit(env, io); + + if (result < 0) + io->ci_result = result; + RETURN(result >= 0 ? copied : result); +} + +#ifdef CONFIG_MIGRATION +static int ll_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page +#ifdef HAVE_MIGRATEPAGE_4ARGS + , enum migrate_mode mode +#endif + ) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +const struct address_space_operations ll_aops = { + .readpage = ll_readpage, + .direct_IO = ll_direct_IO, + .writepage = ll_writepage, + .writepages = ll_writepages, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = ll_write_begin, + .write_end = ll_write_end, + .invalidatepage = ll_invalidatepage, + .releasepage = (void *)ll_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = ll_migratepage, +#endif +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c new file mode 100644 index 0000000000000..5b2af025d28f9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -0,0 +1,1664 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ +} se_state_t; + +/* sa_entry is not refcounted: statahead thread allocates it and do async stat, + * and in async stat callback ll_statahead_interpret() will add it into + * sai_interim_entries, later statahead thread will call sa_handle_callback() to + * instantiate entry and move it into sai_entries, and then only scanner process + * can access and free it. */ +struct sa_entry { + /* link into sai_interim_entries or sai_entries */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_state_t se_state; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; + /* entry fid */ + struct lu_fid se_fid; +}; + +static unsigned int sai_generation = 0; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int sa_unhashed(struct sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* sa_entry is ready to use */ +static inline int sa_ready(struct sa_entry *entry) +{ + smp_rmb(); + return (entry->se_state != SA_ENTRY_INIT); +} + +/* hash value to put in sai_cache */ +static inline int sa_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* hash entry into sai_cache */ +static inline void +sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* unhash entry from sai_cache */ +static inline void +sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + int i = sa_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return 
list_entry(sai->sai_agls.next, struct ll_inode_info, + lli_agl_list); +} + +/* statahead window is full */ +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +/* got async stat replies */ +static inline int sa_has_callback(struct ll_statahead_info *sai) +{ + return !list_empty(&sai->sai_interim_entries); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_agls); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * if the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* allocate sa_entry and hash it to allow scanner process to find it */ +static struct sa_entry * +sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, + const char *name, int len, const struct lu_fid *fid) +{ + struct ll_inode_info *lli; + struct sa_entry *entry; + int entry_size; + char *dname; + ENTRY; + + entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; + OBD_ALLOC(entry, entry_size); + if (unlikely(entry == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", + len, name, entry, index); + + entry->se_index = index; + + entry->se_state = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = ll_full_name_hash(parent, name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + entry->se_fid = *fid; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + spin_lock(&lli->lli_sa_lock); + INIT_LIST_HEAD(&entry->se_list); + sa_rehash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + RETURN(entry); +} + +/* free sa_entry, which should have been unhashed and not in any list */ +static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_list)); + LASSERT(sa_unhashed(entry)); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); +} + +/* + * find sa_entry by name, used by directory scanner, lock is not needed because + * only scanner can remove the entry from cache. 
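+ * New entries are hashed in by the statahead thread under sai_cache_lock,
+ * see sa_rehash().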
+ */ +static struct sa_entry * +sa_get(struct ll_statahead_info *sai, const struct qstr *qstr) +{ + struct sa_entry *entry; + int i = sa_hash(qstr->hash); + + list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { + if (entry->se_qstr.hash == qstr->hash && + entry->se_qstr.len == qstr->len && + memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) + return entry; + } + return NULL; +} + +/* unhash and unlink sa_entry, and then free it */ +static inline void +sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + + LASSERT(!sa_unhashed(entry)); + LASSERT(!list_empty(&entry->se_list)); + LASSERT(sa_ready(entry)); + + sa_unhash(sai, entry); + + spin_lock(&lli->lli_sa_lock); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + if (entry->se_inode != NULL) + iput(entry->se_inode); + + sa_free(sai, entry); +} + +/* called by scanner after use, sa_entry will be killed */ +static void +sa_put(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + struct sa_entry *tmp, *next; + + if (entry != NULL && entry->se_state == SA_ENTRY_SUCC) { + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + sai->sai_miss++; + sai->sai_consecutive_miss++; + } + + if (entry != NULL) + sa_kill(sai, entry); + + /* kill old completed entries, only scanner process does this, no need + * to lock */ + list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) { + if (!is_omitted_entry(sai, tmp->se_index)) + break; + sa_kill(sai, tmp); + } + + wake_up(&sai->sai_thread.t_ctl_waitq); +} + +/* update state and sort add entry to sai_entries by index, return true if + * scanner is waiting on this entry. */ +static bool +__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct sa_entry *se; + struct list_head *pos = &sai->sai_entries; + __u64 index = entry->se_index; + + LASSERT(!sa_ready(entry)); + LASSERT(list_empty(&entry->se_list)); + + list_for_each_entry_reverse(se, &sai->sai_entries, se_list) { + if (se->se_index < entry->se_index) { + pos = &se->se_list; + break; + } + } + list_add(&entry->se_list, pos); + entry->se_state = ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC; + + return (index == sai->sai_index_wait); +} + +/* + * release resources used in async stat RPC, update entry state and wakeup if + * scanner process it waiting on this entry. 
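+ * The wakeup target is sai_waitq, which revalidate_statahead_dentry()
+ * sleeps on until the entry becomes ready.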
+ */ +static void +sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + bool wakeup; + + /* release resources used in RPC */ + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } + + spin_lock(&lli->lli_sa_lock); + wakeup = __sa_make_ready(sai, entry, ret); + spin_unlock(&lli->lli_sa_lock); + + if (wakeup) + wake_up(&sai->sai_waitq); +} + +/* insert inode into the list of sai_agls */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); + int added = 0; + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + igrab(inode); + spin_lock(&parent->lli_agl_lock); + if (agl_list_empty(sai)) + added = 1; + list_add_tail(&child->lli_agl_list, &sai->sai_agls); + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } + + if (added > 0) + wake_up(&sai->sai_agl_thread.t_ctl_waitq); +} + +/* allocate sai */ +static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) +{ + struct ll_statahead_info *sai; + struct ll_inode_info *lli = ll_i2info(dentry->d_inode); + int i; + ENTRY; + + OBD_ALLOC_PTR(sai); + if (!sai) + RETURN(NULL); + + sai->sai_dentry = dget(dentry); + atomic_set(&sai->sai_refcount, 1); + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); + init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); + + INIT_LIST_HEAD(&sai->sai_interim_entries); + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_agls); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + spin_lock(&sai_generation_lock); + lli->lli_sa_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + lli->lli_sa_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + + RETURN(sai); +} + +/* free sai */ +static inline void ll_sai_free(struct ll_statahead_info *sai) +{ + LASSERT(sai->sai_dentry != NULL); + dput(sai->sai_dentry); + OBD_FREE_PTR(sai); +} + +/* + * take refcount of sai if sai for @dir exists, which means statahead is on for + * this directory. + */ +static inline struct ll_statahead_info *ll_sai_get(struct inode *dir) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + + spin_lock(&lli->lli_sa_lock); + sai = lli->lli_sai; + if (sai != NULL) + atomic_inc(&sai->sai_refcount); + spin_unlock(&lli->lli_sa_lock); + + return sai; +} + +/* + * put sai refcount after use, if refcount reaches zero, free sai and sa_entries + * attached to it. 
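+ * References are held by the scanner (ll_statahead()), the statahead
+ * thread and the AGL thread, see ll_sai_get().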
+ */ +static void ll_sai_put(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + + if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { + struct sa_entry *entry, *next; + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + lli->lli_sai = NULL; + spin_unlock(&lli->lli_sa_lock); + + LASSERT(thread_is_stopped(&sai->sai_thread)); + LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + LASSERT(sai->sai_sent == sai->sai_replied); + LASSERT(!sa_has_callback(sai)); + + list_for_each_entry_safe(entry, next, &sai->sai_entries, + se_list) + sa_kill(sai, entry); + + LASSERT(atomic_read(&sai->sai_cache_count) == 0); + LASSERT(agl_list_empty(sai)); + + ll_sai_free(sai); + atomic_dec(&sbi->ll_sa_running); + } +} + +/* Do NOT forget to drop inode refcount when into sai_agls. */ +static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(inode); + __u64 index = lli->lli_agl_index; + int rc; + ENTRY; + + LASSERT(list_empty(&lli->lli_agl_list)); + + /* AGL maybe fall behind statahead with one entry */ + if (is_omitted_entry(sai, index + 1)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* In case of restore, the MDT has the right size and has already + * sent it back without granting the layout lock, inode is up-to-date. + * Then AGL (async glimpse lock) is useless. + * Also to glimpse we need the layout, in case of a runninh restore + * the MDT holds the layout lock so the glimpse will block up to the + * end of restore (statahead/agl will block) */ + if (ll_file_test_flag(lli, LLIF_FILE_RESTORING)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* Someone is in glimpse (sync or async), do nothing. */ + rc = down_write_trylock(&lli->lli_glimpse_sem); + if (rc == 0) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* + * Someone triggered glimpse within 1 sec before. + * 1) The former glimpse succeeded with glimpse lock granted by OST, and + * if the lock is still cached on client, AGL needs to do nothing. If + * it is cancelled by other client, AGL maybe cannot obtaion new lock + * for no glimpse callback triggered by AGL. + * 2) The former glimpse succeeded, but OST did not grant glimpse lock. + * Under such case, it is quite possible that the OST will not grant + * glimpse lock for AGL also. + * 3) The former glimpse failed, compared with other two cases, it is + * relative rare. AGL can ignore such case, and it will not muchly + * affect the performance. + */ + if (lli->lli_glimpse_time != 0 && + cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + CDEBUG(D_READA, "Handling (init) async glimpse: inode = " + DFID", idx = %llu\n", PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = cfs_time_current(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, "Handled (init) async glimpse: inode= " + DFID", idx = %llu, rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); + + EXIT; +} + +/* + * prepare inode for sa entry, add it into agl list, now sa_entry is ready + * to be used by scanner process. 
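+ * On failure the entry is still made ready (in an error state) via
+ * sa_make_ready(), so a waiting scanner is woken rather than left hanging.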
+ */ +static void sa_instantiate(struct ll_statahead_info *sai, + struct sa_entry *entry) +{ + struct inode *dir = sai->sai_dentry->d_inode; + struct inode *child; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + ENTRY; + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + child = entry->se_inode; + if (child != NULL) { + /* revalidate; unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, + &body->mbo_fid1))) { + entry->se_inode = NULL; + iput(child); + child = NULL; + } + } + + it->it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) + GOTO(out, rc = -EAGAIN); + + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + GOTO(out, rc); + + CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", + ll_get_fsname(child->i_sb, NULL, 0), + entry->se_qstr.len, entry->se_qstr.name, + PFID(ll_inode2fid(child)), child); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + + EXIT; + +out: + /* sa_make_ready() will drop ldlm ibits lock refcount by calling + * ll_intent_drop_lock() in spite of failures. Do not worry about + * calling ll_intent_drop_lock() more than once. */ + sa_make_ready(sai, entry, rc); +} + +/* once there are async stat replies, instantiate sa_entry from replies */ +static void sa_handle_callback(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + while (sa_has_callback(sai)) { + struct sa_entry *entry; + + spin_lock(&lli->lli_sa_lock); + if (unlikely(!sa_has_callback(sai))) { + spin_unlock(&lli->lli_sa_lock); + break; + } + entry = list_entry(sai->sai_interim_entries.next, + struct sa_entry, se_list); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + sa_instantiate(sai, entry); + } +} + +/* + * callback for async stat RPC, because this is called in ptlrpcd context, we + * only put sa_entry in sai_interim_entries, and wake up statahead thread to + * really prepare inode and instantiate sa_entry later. + */ +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; + __u64 handle = 0; + wait_queue_head_t *waitq = NULL; + ENTRY; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + /* because statahead thread will wait for all inflight RPC to finish, + * sai should be always valid, no need to refcount */ + LASSERT(sai != NULL); + LASSERT(!thread_is_stopped(&sai->sai_thread)); + LASSERT(entry != NULL); + + CDEBUG(D_READA, "sa_entry %.*s rc %d\n", + entry->se_qstr.len, entry->se_qstr.name, rc); + + if (rc != 0) { + ll_intent_release(it); + iput(dir); + OBD_FREE_PTR(minfo); + } else { + /* release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. 
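+		 * The lock handle is saved below so that sa_instantiate() can
+		 * still pass it to md_revalidate_lock() after the intent lock
+		 * is dropped here.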
*/ + handle = it->it_lock_handle; + ll_intent_drop_lock(it); + } + + spin_lock(&lli->lli_sa_lock); + if (rc != 0) { + if (__sa_make_ready(sai, entry, rc)) + waitq = &sai->sai_waitq; + } else { + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. */ + entry->se_handle = handle; + if (!sa_has_callback(sai)) + waitq = &sai->sai_thread.t_ctl_waitq; + + list_add_tail(&entry->se_list, &sai->sai_interim_entries); + } + sai->sai_replied++; + + smp_mb(); + if (waitq != NULL) + wake_up(waitq); + spin_unlock(&lli->lli_sa_lock); + + RETURN(rc); +} + +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo) +{ + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); +} + +/* + * prepare arguments for async stat RPC. + */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (minfo == NULL) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (child == NULL) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + return minfo; +} + +/* async stat for file not found in dcache */ +static int sa_lookup(struct inode *dir, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + int rc; + ENTRY; + + minfo = sa_prep_data(dir, NULL, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) + sa_fini_data(minfo); + + RETURN(rc); +} + +/** + * async stat for file found in dcache, similar to .revalidate + * + * \retval 1 dentry valid, no RPC sent + * \retval 0 dentry invalid, will send async stat RPC + * \retval negative number upon error + */ +static int sa_revalidate(struct inode *dir, struct sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + int rc; + ENTRY; + + if (unlikely(inode == NULL)) + RETURN(1); + + if (d_mountpoint(dentry)) + RETURN(1); + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), + NULL); + if (rc == 1) { + entry->se_handle = it.it_lock_handle; + ll_intent_release(&it); + RETURN(1); + } + + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) { + entry->se_inode = NULL; + iput(inode); + RETURN(PTR_ERR(minfo)); + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); + if (rc < 0) { + entry->se_inode = NULL; + iput(inode); + sa_fini_data(minfo); + } + + RETURN(rc); +} + +/* async stat for file with @name */ +static void sa_statahead(struct dentry *parent, const char *name, int len, + const struct lu_fid *fid) +{ + struct inode *dir 
= parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct sa_entry *entry; + int rc; + ENTRY; + + entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid); + if (IS_ERR(entry)) + RETURN_EXIT; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = sa_lookup(dir, entry); + } else { + rc = sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, dentry->d_inode)) + ll_agl_add(sai, dentry->d_inode, entry->se_index); + } + + if (dentry != NULL) + dput(dentry); + + if (rc != 0) + sa_make_ready(sai, entry, rc); + else + sai->sai_sent++; + + sai->sai_index++; + + EXIT; +} + +/* async glimpse (agl) thread main function */ +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + ENTRY; + + + sai = ll_sai_get(dir); + thread = &sai->sai_agl_thread; + thread->t_pid = current_pid(); + CDEBUG(D_READA, "agl thread started: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + atomic_inc(&sbi->ll_agl_total); + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 1; + if (thread_is_init(thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. */ + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + + while (1) { + l_wait_event(thread->t_ctl_waitq, + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + + if (!thread_is_running(thread)) + break; + + spin_lock(&plli->lli_agl_lock); + /* The statahead thread maybe help to process AGL entries, + * so check whether list empty again. 
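+		 * Both the check and the dequeue of the first entry are done
+		 * under lli_agl_lock.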
*/ + if (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + } else { + spin_unlock(&plli->lli_agl_lock); + } + } + + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 0; + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + CDEBUG(D_READA, "agl thread stopped: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + RETURN(0); +} + +/* start agl thread */ +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *plli; + struct task_struct *task; + ENTRY; + + CDEBUG(D_READA, "start agl thread: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + plli = ll_i2info(parent->d_inode); + task = kthread_run(ll_agl_thread, parent, + "ll_agl_%u", plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + thread_set_flags(thread, SVC_STOPPED); + RETURN_EXIT; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + EXIT; +} + +/* statahead thread main function */ +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai; + struct ptlrpc_thread *sa_thread; + struct ptlrpc_thread *agl_thread; + int first = 0; + struct md_op_data *op_data; + struct ll_dir_chain chain; + struct l_wait_info lwi = { 0 }; + struct page *page = NULL; + __u64 pos = 0; + int rc = 0; + ENTRY; + + sai = ll_sai_get(dir); + sa_thread = &sai->sai_thread; + agl_thread = &sai->sai_agl_thread; + sa_thread->t_pid = current_pid(); + CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + if (sbi->ll_flags & LL_SBI_AGL_ENABLED) + ll_start_agl(parent, sai); + + atomic_inc(&sbi->ll_sa_total); + spin_lock(&lli->lli_sa_lock); + if (thread_is_init(sa_thread)) + /* If someone else has changed the thread state + * (e.g. already changed to SVC_STOPPING), we can't just + * blindly overwrite that setting. 
*/ + thread_set_flags(sa_thread, SVC_RUNNING); + spin_unlock(&lli->lli_sa_lock); + wake_up(&sa_thread->t_ctl_waitq); + + ll_dir_chain_init(&chain); + while (pos != MDS_DIR_END_OFF && thread_is_running(sa_thread)) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + sai->sai_in_readpage = 1; + page = ll_get_dir_page(dir, op_data, pos, &chain); + sai->sai_in_readpage = 0; + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, "error reading dir "DFID" at %llu" + "/%llu opendir_pid = %u: rc = %d\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); + ent != NULL && thread_is_running(sa_thread) && + !sa_low_hit(sai); + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + struct lu_fid fid; + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." + */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + + fid_le_to_cpu(&fid, &ent->lde_fid); + + /* wait for spare statahead window */ + do { + l_wait_event(sa_thread->t_ctl_waitq, + !sa_sent_full(sai) || + sa_has_callback(sai) || + !agl_list_empty(sai) || + !thread_is_running(sa_thread), + &lwi); + + sa_handle_callback(sai); + + spin_lock(&lli->lli_agl_lock); + while (sa_sent_full(sai) && + !agl_list_empty(sai)) { + struct ll_inode_info *clli; + + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&lli->lli_agl_lock); + + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + + spin_lock(&lli->lli_agl_lock); + } + spin_unlock(&lli->lli_agl_lock); + } while (sa_sent_full(sai) && + thread_is_running(sa_thread)); + + sa_statahead(parent, name, namelen, &fid); + } + + pos = le64_to_cpu(dp->ldp_hash_end); + ll_release_page(dir, page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + + if (sa_low_hit(sai)) { + rc = -EFAULT; + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, "Statahead for dir "DFID" hit " + "ratio too low: hit/miss %llu/%llu" + ", sent/replied %llu/%llu, stopping " + "statahead thread: pid %d\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied, current_pid()); + break; + } + } + ll_dir_chain_fini(&chain); + ll_finish_md_op_data(op_data); + + if (rc < 0) { + spin_lock(&lli->lli_sa_lock); + thread_set_flags(sa_thread, SVC_STOPPING); + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + } + + /* statahead is finished, but statahead entries need to be cached, wait + * for file release to stop me. 
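+	 * On file release, ll_deauthorize_statahead() sets SVC_STOPPING and
+	 * wakes this thread up.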
*/ + while (thread_is_running(sa_thread)) { + l_wait_event(sa_thread->t_ctl_waitq, + sa_has_callback(sai) || + !thread_is_running(sa_thread), + &lwi); + + sa_handle_callback(sai); + } + + EXIT; +out: + if (sai->sai_agl_valid) { + spin_lock(&lli->lli_agl_lock); + thread_set_flags(agl_thread, SVC_STOPPING); + spin_unlock(&lli->lli_agl_lock); + wake_up(&agl_thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", + sai, (unsigned int)agl_thread->t_pid); + l_wait_event(agl_thread->t_ctl_waitq, + thread_is_stopped(agl_thread), + &lwi); + } else { + /* Set agl_thread flags anyway. */ + thread_set_flags(agl_thread, SVC_STOPPED); + } + + /* wait for inflight statahead RPCs to finish, and then we can free sai + * safely because statahead RPC will access sai data */ + while (sai->sai_sent != sai->sai_replied) { + /* in case we're not woken up, timeout wait */ + lwi = LWI_TIMEOUT(msecs_to_jiffies(MSEC_PER_SEC >> 3), + NULL, NULL); + l_wait_event(sa_thread->t_ctl_waitq, + sai->sai_sent == sai->sai_replied, &lwi); + } + + /* release resources held by statahead RPCs */ + sa_handle_callback(sai); + + spin_lock(&lli->lli_sa_lock); + thread_set_flags(sa_thread, SVC_STOPPED); + spin_unlock(&lli->lli_sa_lock); + + CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %.*s\n", + sai, parent->d_name.len, parent->d_name.name); + + wake_up(&sai->sai_waitq); + wake_up(&sa_thread->t_ctl_waitq); + ll_sai_put(sai); + + return rc; +} + +/* authorize opened dir handle @key to statahead */ +void ll_authorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL) { + /* + * if lli_sai is not NULL, it means previous statahead is not + * finished yet, we'd better not start a new statahead for now. + */ + LASSERT(lli->lli_opendir_pid == 0); + lli->lli_opendir_key = key; + lli->lli_opendir_pid = current_pid(); + lli->lli_sa_enabled = 1; + } + spin_unlock(&lli->lli_sa_lock); +} + +/* + * deauthorize opened dir handle @key to statahead, and notify statahead thread + * to quit if it's running. + */ +void ll_deauthorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai; + + LASSERT(lli->lli_opendir_key == key); + LASSERT(lli->lli_opendir_pid != 0); + + CDEBUG(D_READA, "deauthorize statahead for "DFID"\n", + PFID(&lli->lli_fid)); + + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + sai = lli->lli_sai; + if (sai != NULL && thread_is_running(&sai->sai_thread)) { + /* + * statahead thread may not quit yet because it needs to cache + * entries, now it's time to tell it to quit. + * + * In case sai is released, wake_up() is called inside spinlock, + * so we have to call smp_mb() explicitely to serialize ops. + */ + thread_set_flags(&sai->sai_thread, SVC_STOPPING); + smp_mb(); + wake_up(&sai->sai_thread.t_ctl_waitq); + } + spin_unlock(&lli->lli_sa_lock); +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NOT_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." 
+ */ + LS_FIRST_DOT_DE +}; + +/* file is first dirent under @dir */ +static int is_first_dirent(struct inode *dir, struct dentry *dentry) +{ + struct ll_dir_chain chain; + struct qstr *target = &dentry->d_name; + struct md_op_data *op_data; + int dot_de; + struct page *page = NULL; + int rc = LS_NOT_FIRST_DE; + __u64 pos = 0; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + /** + *FIXME choose the start offset of the readdir + */ + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, op_data, 0, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + struct ll_inode_info *lli = ll_i2info(dir); + + rc = PTR_ERR(page); + CERROR("%s: reading dir "DFID" at %llu" + "opendir_pid = %u : rc = %d\n", + ll_get_fsname(dir->i_sb, NULL, 0), + PFID(ll_inode2fid(dir)), pos, + lli->lli_opendir_pid, rc); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + /* The ll_get_dir_page() can return any page containing + * the given hash which may be not the start hash. */ + if (unlikely(hash < pos)) + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) + /* + * skip "." + */ + continue; + else if (name[1] == '.' && namelen == 2) + /* + * skip ".." + */ + continue; + else + dot_de = 1; + } else { + dot_de = 0; + } + + if (dot_de && target->name[0] != '.') { + CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", + target->len, target->name, + namelen, name); + continue; + } + + if (target->len != namelen || + memcmp(target->name, name, namelen) != 0) + rc = LS_NOT_FIRST_DE; + else if (!dot_de) + rc = LS_FIRST_DE; + else + rc = LS_FIRST_DOT_DE; + + ll_release_page(dir, page, false); + GOTO(out, rc); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(dir, page, false); + GOTO(out, rc); + } else { + /* + * chain is exhausted + * Normal case: continue to the next page. + */ + ll_release_page(dir, page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, op_data, pos, &chain); + } + } + EXIT; +out: + ll_dir_chain_fini(&chain); + ll_finish_md_op_data(op_data); + return rc; +} + +/** + * revalidate @dentryp from statahead cache + * + * \param[in] dir parent directory + * \param[in] sai sai structure + * \param[out] dentryp pointer to dentry which will be revalidated + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success, dentry is saved in @dentryp + * \retval 0 if revalidation failed (no proper lock on client) + * \retval negative number upon error + */ +static int revalidate_statahead_dentry(struct inode *dir, + struct ll_statahead_info *sai, + struct dentry **dentryp, + bool unplug) +{ + struct sa_entry *entry = NULL; + struct l_wait_info lwi = { 0 }; + struct ll_dentry_data *ldd; + struct ll_inode_info *lli = ll_i2info(dir); + int rc = 0; + ENTRY; + + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. 
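+			 * In this case fall through to the normal statahead
+			 * cache lookup below.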
+ */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; + + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; + RETURN(-EAGAIN); + } + } + + if (unplug) + GOTO(out, rc = 1); + + entry = sa_get(sai, &(*dentryp)->d_name); + if (entry == NULL) + GOTO(out, rc = -EAGAIN); + + /* if statahead is busy in readdir, help it do post-work */ + if (!sa_ready(entry) && sai->sai_in_readpage) + sa_handle_callback(sai); + + if (!sa_ready(entry)) { + spin_lock(&lli->lli_sa_lock); + sai->sai_index_wait = entry->se_index; + spin_unlock(&lli->lli_sa_lock); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sai->sai_waitq, sa_ready(entry), &lwi); + if (rc < 0) { + /* + * entry may not be ready, so it may be used by inflight + * statahead RPC, don't free it. + */ + entry = NULL; + GOTO(out, rc = -EAGAIN); + } + } + + if (entry->se_state == SA_ENTRY_SUCC && entry->se_inode != NULL) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if ((*dentryp)->d_inode == NULL) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *dentryp); + if (IS_ERR(alias)) { + ll_intent_release(&it); + GOTO(out, rc = PTR_ERR(alias)); + } + *dentryp = alias; + /* statahead prepared this inode, transfer inode + * refcount from sa_entry to dentry */ + entry->se_inode = NULL; + } else if ((*dentryp)->d_inode != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "%s: stale dentry %.*s inode " + DFID", statahead inode "DFID + "\n", + ll_get_fsname((*dentryp)->d_inode->i_sb, + NULL, 0), + (*dentryp)->d_name.len, + (*dentryp)->d_name.name, + PFID(ll_inode2fid((*dentryp)->d_inode)), + PFID(ll_inode2fid(inode))); + ll_intent_release(&it); + GOTO(out, rc = -ESTALE); + } + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) + d_lustre_revalidate(*dentryp); + ll_intent_release(&it); + } + } +out: + /* + * statahead cached sa_entry can be used only once, and will be killed + * right after use, so if lookup/revalidate accessed statahead cache, + * set dentry ldd_sa_generation to parent lli_sa_generation, later if we + * stat this file again, we know we've done statahead before, see + * dentry_may_statahead(). + */ + ldd = ll_d2d(*dentryp); + /* ldd can be NULL if llite lookup failed. */ + if (ldd != NULL) + ldd->lld_sa_generation = lli->lli_sa_generation; + sa_put(sai, entry); + + RETURN(rc); +} + +/** + * start statahead thread + * + * \param[in] dir parent directory + * \param[in] dentry dentry that triggers statahead, normally the first + * dirent under @dir + * \retval -EAGAIN on success, because when this function is + * called, it's already in lookup call, so client should + * do it itself instead of waiting for statahead thread + * to do it asynchronously. 
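+ * \retval -EMFILE too many statahead instances already running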
+ * \retval negative number upon error + */ +static int start_statahead_thread(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct dentry *parent = dentry->d_parent; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + struct ll_sb_info *sbi = ll_i2sbi(parent->d_inode); + int first = LS_FIRST_DE; + int rc = 0; + ENTRY; + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + first = is_first_dirent(dir, dentry); + if (first == LS_NOT_FIRST_DE) + /* It is not "ls -{a}l" operation, no need statahead for it. */ + GOTO(out, rc = -EFAULT); + + if (unlikely(atomic_inc_return(&sbi->ll_sa_running) > + sbi->ll_sa_running_max)) { + CDEBUG(D_READA, + "Too many concurrent statahead instances, " + "avoid new statahead instance temporarily.\n"); + GOTO(out, rc = -EMFILE); + } + + sai = ll_sai_alloc(parent); + if (sai == NULL) + GOTO(out, rc = -ENOMEM); + + sai->sai_ls_all = (first == LS_FIRST_DOT_DE); + + /* if current lli_opendir_key was deauthorized, or dir re-opened by + * another process, don't start statahead, otherwise the newly spawned + * statahead thread won't be notified to quit. */ + spin_lock(&lli->lli_sa_lock); + if (unlikely(lli->lli_sai != NULL || + lli->lli_opendir_key == NULL || + lli->lli_opendir_pid != current->pid)) { + spin_unlock(&lli->lli_sa_lock); + GOTO(out, rc = -EPERM); + } + lli->lli_sai = sai; + spin_unlock(&lli->lli_sa_lock); + + CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + + task = kthread_run(ll_statahead_thread, parent, "ll_sa_%u", + lli->lli_opendir_pid); + thread = &sai->sai_thread; + if (IS_ERR(task)) { + spin_lock(&lli->lli_sa_lock); + lli->lli_sai = NULL; + spin_unlock(&lli->lli_sa_lock); + atomic_dec(&ll_i2sbi(parent->d_inode)->ll_sa_running); + rc = PTR_ERR(task); + CERROR("can't start ll_sa thread, rc: %d\n", rc); + GOTO(out, rc); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + ll_sai_put(sai); + + /* + * We don't stat-ahead for the first dirent since we are already in + * lookup. + */ + RETURN(-EAGAIN); + +out: + /* once we start statahead thread failed, disable statahead so that + * subsequent stat won't waste time to try it. */ + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_pid == current->pid) + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + + if (sai != NULL) + ll_sai_free(sai); + if (first != LS_NOT_FIRST_DE) + atomic_dec(&sbi->ll_sa_running); + + RETURN(rc); +} + +/** + * statahead entry function, this is called when client getattr on a file, it + * will start statahead thread if this is the first dir entry, else revalidate + * dentry from statahead cache. 
+ * + * \param[in] dir parent directory + * \param[out] dentryp dentry to getattr + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success + * \retval 0 revalidation from statahead cache failed, caller needs + * to getattr from server directly + * \retval negative number on error, caller often ignores this and + * then getattr from server + */ +int ll_statahead(struct inode *dir, struct dentry **dentryp, bool unplug) +{ + struct ll_statahead_info *sai; + + sai = ll_sai_get(dir); + if (sai != NULL) { + int rc; + + rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); + CDEBUG(D_READA, "revalidate statahead %.*s: %d.\n", + (*dentryp)->d_name.len, (*dentryp)->d_name.name, rc); + ll_sai_put(sai); + return rc; + } + return start_statahead_thread(dir, *dentryp); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c new file mode 100644 index 0000000000000..0ce267546688c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -0,0 +1,212 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS); + if (lli == NULL) + return NULL; + + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +#ifdef HAVE_INODE_I_RCU +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} +#else +static void ll_destroy_inode(struct inode *inode) +{ + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} +#endif + +/* exported operations */ +struct super_operations lustre_super_operations = +{ + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, +#ifdef HAVE_SBOPS_EVICT_INODE + .evict_inode = ll_delete_inode, +#else + .clear_inode = ll_clear_inode, + .delete_inode = ll_delete_inode, +#endif + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; + + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); + +static int __init lustre_init(void) +{ + struct proc_dir_entry *entry; + struct lnet_process_id lnet_id; + struct timespec64 ts; + int i, rc, seed[2]; + + CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1); + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre client module (%p).\n", + &lustre_super_operations); + + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", + sizeof(struct ll_inode_info), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ll_inode_cachep == NULL) + GOTO(out_cache, rc = -ENOMEM); + + ll_file_data_slab = kmem_cache_create("ll_file_data", + sizeof(struct ll_file_data), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ll_file_data_slab == NULL) + GOTO(out_cache, rc = -ENOMEM); + + entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n", + rc); + GOTO(out_cache, rc); + } + + proc_lustre_fs_root = entry; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy. The NID for this + * node gives the most entropy in the low bits. 
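+	 * Each non-loopback NID address found below is XORed into seed[0].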
*/ + for (i = 0;; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) + seed[0] ^= LNET_NIDADDR(lnet_id.nid); + } + + ktime_get_ts64(&ts); + cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); + + rc = vvp_global_init(); + if (rc != 0) + GOTO(out_proc, rc); + + cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, + LCT_REMEMBER | LCT_NOREF); + if (IS_ERR(cl_inode_fini_env)) + GOTO(out_vvp, rc = PTR_ERR(cl_inode_fini_env)); + + cl_inode_fini_env->le_ctx.lc_cookie = 0x4; + + rc = ll_xattr_init(); + if (rc != 0) + GOTO(out_inode_fini_env, rc); + + lustre_register_client_fill_super(ll_fill_super); + lustre_register_kill_super_cb(ll_kill_super); + lustre_register_client_process_config(ll_process_config); + + RETURN(0); + +out_inode_fini_env: + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); +out_vvp: + vvp_global_fini(); +out_proc: + lprocfs_remove(&proc_lustre_fs_root); +out_cache: + if (ll_inode_cachep != NULL) + kmem_cache_destroy(ll_inode_cachep); + + if (ll_file_data_slab != NULL) + kmem_cache_destroy(ll_file_data_slab); + + return rc; +} + +static void __exit lustre_exit(void) +{ + lustre_register_client_fill_super(NULL); + lustre_register_kill_super_cb(NULL); + lustre_register_client_process_config(NULL); + + lprocfs_remove(&proc_lustre_fs_root); + + ll_xattr_fini(); + cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); + vvp_global_fini(); + + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Client File System"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lustre_init); +module_exit(lustre_exit); diff --git a/drivers/staging/lustrefsx/lustre/llite/symlink.c b/drivers/staging/lustrefsx/lustre/llite/symlink.c new file mode 100644 index 0000000000000..8e12995873cb8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/symlink.c @@ -0,0 +1,242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include "llite_internal.h" + +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + ENTRY; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + RETURN(0); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("%s: inode "DFID": rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + GOTO (failed, rc); + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + GOTO(failed, rc = -EPROTO); + } + + LASSERT(symlen != 0); + if (body->mbo_eadatasize != symlen) { + CERROR("%s: inode "DFID": symlink length %d not expected %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), body->mbo_eadatasize - 1, + symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (*symname == NULL || + strnlen(*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR("%s: inode "DFID": symlink not NULL terminated string" + "of length %d\n", ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + OBD_ALLOC(lli->lli_symlink_name, symlen); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + RETURN(0); + +failed: + RETURN (rc); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void ll_put_link(struct dentry *dentry, + struct nameidata *nd, void *cookie) +#else +# ifdef HAVE_IOP_GET_LINK +static void ll_put_link(void *cookie) +# else +static void ll_put_link(struct inode *unused, void *cookie) +# endif +#endif +{ + ptlrpc_req_finished(cookie); +} + +#ifdef HAVE_SYMLINK_OPS_USE_NAMEIDATA +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *request = NULL; + int rc; + char *symname = NULL; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. 
*/ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + RETURN(request); +} +#else +# ifdef HAVE_IOP_GET_LINK +static const char *ll_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op\n"); + if (!dentry) + RETURN(ERR_PTR(-ECHILD)); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* symname may contain a pointer to the request message buffer, + * we delay request releasing then. + */ + set_delayed_call(done, ll_put_link, request); + RETURN(symname); +} +# else +static const char *ll_follow_link(struct dentry *dentry, void **cookie) +{ + struct inode *inode = d_inode(dentry); + struct ptlrpc_request *request; + char *symname = NULL; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + if (rc < 0) { + ptlrpc_req_finished(request); + return ERR_PTR(rc); + } + + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + *cookie = request; + RETURN(symname); +} +# endif /* HAVE_IOP_GET_LINK */ +#endif /* HAVE_SYMLINK_OPS_USE_NAMEIDATA */ + +struct inode_operations ll_fast_symlink_inode_operations = { +#ifdef HAVE_IOP_GENERIC_READLINK + .readlink = generic_readlink, +#endif + .setattr = ll_setattr, +#ifdef HAVE_IOP_GET_LINK + .get_link = ll_get_link, +#else + .follow_link = ll_follow_link, + .put_link = ll_put_link, +#endif + .getattr = ll_getattr, + .permission = ll_inode_permission, +#ifdef HAVE_IOP_XATTR + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .removexattr = ll_removexattr, +#endif + .listxattr = ll_listxattr, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c new file mode 100644 index 0000000000000..ab92d303fc1e9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -0,0 +1,655 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. + */ + +static struct kmem_cache *ll_thread_kmem; +struct kmem_cache *vvp_lock_kmem; +struct kmem_cache *vvp_object_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct kmem_cache *vvp_thread_kmem; + +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &ll_thread_kmem, + .ckd_name = "ll_thread_kmem", + .ckd_size = sizeof(struct ll_thread_info), + }, + { + .ckd_cache = &vvp_lock_kmem, + .ckd_name = "vvp_lock_kmem", + .ckd_size = sizeof(struct vvp_lock), + }, + { + .ckd_cache = &vvp_object_kmem, + .ckd_name = "vvp_object_kmem", + .ckd_size = sizeof(struct vvp_object), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof (struct vvp_session) + }, + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof(struct vvp_thread_info), + }, + { + .ckd_cache = NULL + } +}; + +static void *ll_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ll_thread_info *lti; + + OBD_SLAB_ALLOC_PTR_GFP(lti, ll_thread_kmem, GFP_NOFS); + if (lti == NULL) + lti = ERR_PTR(-ENOMEM); + + return lti; +} + +static void ll_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ll_thread_info *lti = data; + + OBD_SLAB_FREE_PTR(lti, ll_thread_kmem); +} + +struct lu_context_key ll_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ll_thread_key_init, + .lct_fini = ll_thread_key_fini, +}; + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +static void *vvp_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *vti; + + OBD_SLAB_ALLOC_PTR_GFP(vti, vvp_thread_kmem, GFP_NOFS); + if (vti == NULL) + vti = ERR_PTR(-ENOMEM); + return vti; +} + +static void vvp_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *vti = data; + OBD_SLAB_FREE_PTR(vti, vvp_thread_kmem); +} + +struct lu_context_key vvp_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_thread_key_init, + .lct_fini = vvp_thread_key_fini, +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). 
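+ * They are generated by LU_TYPE_INIT_FINI() below from the three
+ * lu_context keys (ll_thread_key, vvp_session_key, vvp_thread_key).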
*/ +LU_TYPE_INIT_FINI(vvp, &ll_thread_key, &vvp_session_key, &vvp_thread_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static struct lu_device *vvp_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct vvp_device *vdv = lu2vvp_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->vdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct vvp_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + ENTRY; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lud = &vdv->vdv_cl.cd_lu_dev; + cl_device_init(&vdv->vdv_cl, t); + vvp2lu_dev(vdv)->ld_ops = &vvp_lu_ops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->vdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + vvp_device_free(env, lud); + lud = ERR_PTR(rc); + } + RETURN(lud); +} + +static int vvp_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct vvp_device *vdv; + int rc; + ENTRY; + + vdv = lu2vvp_dev(d); + vdv->vdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + RETURN(rc); +} + +static struct lu_device *vvp_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2vvp_dev(d)->vdv_next); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = vvp_device_free, + .ldto_device_init = vvp_device_init, + .ldto_device_fini = vvp_device_fini, +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int rc; + + rc = lu_kmem_init(vvp_caches); + if (rc != 0) + return rc; + + rc = lu_device_type_init(&vvp_device_type); + if (rc != 0) + goto out_kmem; + + return 0; + +out_kmem: + lu_kmem_fini(vvp_caches); + + return rc; +} + +void vvp_global_fini(void) +{ + lu_device_type_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + +/***************************************************************************** + * + * mirror obd-devices into cl devices. 
+ * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + __u16 refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + RETURN(rc); +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + __u16 refcheck; + int result; + + ENTRY; + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + + RETURN(result); +} + +/**************************************************************************** + * + * /proc/fs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +/* + * To represent contents of a page cache as a byte stream, following + * information if encoded in 64bit offset: + * + * - file hash bucket in lu_site::ls_hash[] 28bits + * + * - how far file is from bucket head 4bits + * + * - page index 32bits + * + * First two data identify a file in the cache uniquely. + */ + +#define PGC_OBJ_SHIFT (32 + 4) +#define PGC_DEPTH_SHIFT (32) + +struct vvp_pgcache_id { + unsigned vpi_bucket; + unsigned vpi_depth; + uint32_t vpi_index; + + unsigned vpi_curdep; + struct lu_object_header *vpi_obj; +}; + +static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) +{ + CLASSERT(sizeof(pos) == sizeof(__u64)); + + id->vpi_index = pos & 0xffffffff; + id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; + id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT); +} + +static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) +{ + return + ((__u64)id->vpi_index) | + ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | + ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); +} + +static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct vvp_pgcache_id *id = data; + struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + + if (id->vpi_curdep-- > 0) + return 0; /* continue */ + + if (lu_object_is_dying(hdr)) + return 1; + + cfs_hash_get(hs, hnode); + id->vpi_obj = hdr; + return 1; +} + +static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, + struct lu_device *dev, + struct vvp_pgcache_id *id) +{ + LASSERT(lu_device_is_cl(dev)); + + id->vpi_depth &= 0xf; + id->vpi_obj = NULL; + id->vpi_curdep = id->vpi_depth; + + cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, + vvp_pgcache_obj_get, id); + if (id->vpi_obj != NULL) { + struct lu_object *lu_obj; + + lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); + if (lu_obj != NULL) { + lu_object_ref_add(lu_obj, "dump", current); + return lu2cl(lu_obj); + } + lu_object_put(env, lu_object_top(id->vpi_obj)); + + } else if (id->vpi_curdep > 0) { + id->vpi_depth = 0xf; + } + return NULL; +} + +static loff_t vvp_pgcache_find(const struct lu_env *env, + struct lu_device *dev, loff_t pos) +{ + struct cl_object *clob; + struct lu_site *site; + struct vvp_pgcache_id id; + + site = dev->ld_site; + vvp_pgcache_id_unpack(pos, &id); + + 
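+	/*
+	 * Walk the hash buckets and per-bucket object chains starting from
+	 * the unpacked position, probing each object's page cache for the
+	 * next resident page.  Return the re-packed position, or ~0ULL once
+	 * the whole site has been scanned.
+	 */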
while (1) { + if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) + return ~0ULL; + clob = vvp_pgcache_obj(env, dev, &id); + if (clob != NULL) { + struct inode *inode = vvp_object_inode(clob); + struct page *vmpage; + int nr; + + nr = find_get_pages_contig(inode->i_mapping, + id.vpi_index, 1, &vmpage); + if (nr > 0) { + id.vpi_index = vmpage->index; + /* Cant support over 16T file */ + nr = !(vmpage->index > 0xffffffff); + put_page(vmpage); + } + + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + if (nr > 0) + return vvp_pgcache_id_pack(&id); + } + /* to the next object. */ + ++id.vpi_depth; + id.vpi_depth &= 0xf; + if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) + return ~0ULL; + id.vpi_index = 0; + } +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ + has_flags = 1; \ + } \ +} while(0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct vvp_page *vpg; + struct page *vmpage; + int has_flags; + + vpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + vmpage = vpg->vpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s | %p "DFID"(%p) %lu %u [", + 0 /* gen */, + vpg, page, + "none", + vpg->vpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, + PFID(ll_inode2fid(vmpage->mapping->host)), + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? 
"" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + loff_t pos; + struct ll_sb_info *sbi; + struct cl_object *clob; + struct lu_env *env; + struct vvp_pgcache_id id; + __u16 refcheck; + int result; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + pos = *(loff_t *) v; + vvp_pgcache_id_unpack(pos, &id); + sbi = f->private; + clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); + if (clob != NULL) { + struct inode *inode = vvp_object_inode(clob); + struct cl_page *page = NULL; + struct page *vmpage; + + result = find_get_pages_contig(inode->i_mapping, + id.vpi_index, 1, &vmpage); + if (result > 0) { + lock_page(vmpage); + page = cl_vmpage_page(vmpage, clob); + unlock_page(vmpage); + + put_page(vmpage); + } + + seq_printf(f, "%8x@"DFID": ", id.vpi_index, + PFID(lu_object_fid(&clob->co_lu))); + if (page != NULL) { + vvp_pgcache_page_show(env, f, page); + cl_page_put(env, page); + } else + seq_puts(f, "missing\n"); + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + } else + seq_printf(f, "%llx missing\n", pos); + cl_env_put(env, &refcheck); + result = 0; + } else + result = PTR_ERR(env); + return result; +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + __u16 refcheck; + + sbi = f->private; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) + pos = ERR_PTR(-EFBIG); + else { + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, + *pos); + if (*pos == ~0ULL) + pos = NULL; + } + cl_env_put(env, &refcheck); + } + return pos; +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); + if (*pos == ~0ULL) + pos = NULL; + cl_env_put(env, &refcheck); + } + return pos; +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct ll_sb_info *sbi = PDE_DATA(inode); + struct seq_file *seq; + int result; + + result = seq_open(filp, &vvp_pgcache_ops); + if (result == 0) { + seq = filp->private_data; + seq->private = sbi; + } + return result; +} + +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h new file mode 100644 index 0000000000000..645b4b5cfca6b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -0,0 +1,333 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + +#include +#include + +enum obd_notify_event; +struct inode; +struct lustre_md; +struct obd_device; +struct obd_export; +struct page; + +enum vvp_io_subtype { + /** normal IO */ + IO_NORMAL, + /** io started from splice_{read|write} */ + IO_SPLICE, +}; + +/** + * IO state private to VVP layer. + */ +struct vvp_io { + /** super class */ + struct cl_io_slice vui_cl; + struct cl_io_lock_link vui_link; + /** Total size for the left IO. */ + size_t vui_tot_count; + + union { + struct vvp_fault_io { + /** + * Inode modification time that is checked across DLM + * lock request. + */ + time64_t ft_mtime; + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. + */ + unsigned int ft_flags; + /** + * check that flags are from filemap_fault + */ + bool ft_flags_valid; + } fault; + struct { + struct pipe_inode_info *vui_pipe; + unsigned int vui_flags; + } splice; + struct { + struct cl_page_list vui_queue; + unsigned long vui_written; + int vui_from; + int vui_to; + } write; + } u; + + enum vvp_io_subtype vui_io_subtype; + + /** + * Layout version when this IO is initialized + */ + __u32 vui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *vui_fd; + + /* Readahead state. */ + pgoff_t vui_ra_start; + pgoff_t vui_ra_count; + /* Set when vui_ra_{start,count} have been initialized. 
*/ + bool vui_ra_valid; +}; + +extern struct lu_device_type vvp_device_type; + +extern struct lu_context_key vvp_session_key; +extern struct lu_context_key vvp_thread_key; + +extern struct kmem_cache *vvp_lock_kmem; +extern struct kmem_cache *vvp_object_kmem; + +struct vvp_thread_info { + struct cl_lock vti_lock; + struct cl_lock_descr vti_descr; + struct cl_io vti_io; + struct cl_attr vti_attr; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + struct vvp_thread_info *vti; + + vti = lu_context_key_get(&env->le_ctx, &vvp_thread_key); + LASSERT(vti != NULL); + + return vti; +} + +static inline struct cl_lock *vvp_env_lock(const struct lu_env *env) +{ + struct cl_lock *lock = &vvp_env_info(env)->vti_lock; + + memset(lock, 0, sizeof(*lock)); + + return lock; +} + +static inline struct cl_attr *vvp_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &vvp_env_info(env)->vti_attr; + + memset(attr, 0, sizeof(*attr)); + + return attr; +} + +static inline struct cl_io *vvp_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &vvp_env_info(env)->vti_io; + + memset(io, 0, sizeof(*io)); + + return io; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +/** + * VPP-private object state. + */ +struct vvp_object { + struct cl_object_header vob_header; + struct cl_object vob_cl; + struct inode *vob_inode; + + /** + * Number of transient pages. This is no longer protected by i_sem, + * and needs to be atomic. This is not actually used for anything, + * and can probably be removed. + */ + atomic_t vob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t vob_mmap_cnt; + + /** + * various flags + * vob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int vob_discard_page_warned:1; +}; + +/** + * VVP-private page state. 
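+ *
+ * Ties the backing VM page (vpg_page) to its cl_page slice, together
+ * with the read-ahead related bits (vpg_defer_uptodate, vpg_ra_updated,
+ * vpg_ra_used).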
+ */ +struct vvp_page { + struct cl_page_slice vpg_cl; + unsigned vpg_defer_uptodate:1, + vpg_ra_updated:1, + vpg_ra_used:1; + /** VM page */ + struct page *vpg_page; +}; + +static inline struct vvp_page *cl2vvp_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct vvp_page, vpg_cl); +} + +static inline pgoff_t vvp_index(struct vvp_page *vpg) +{ + return vpg->vpg_cl.cpl_index; +} + +struct vvp_device { + struct cl_device vdv_cl; + struct cl_device *vdv_next; +}; + +struct vvp_lock { + struct cl_lock_slice vlk_cl; +}; + +static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) +{ + return &vdv->vdv_cl.cd_lu_dev; +} + +static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) +{ + return container_of0(d, struct vvp_device, vdv_cl.cd_lu_dev); +} + +static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) +{ + return container_of0(d, struct vvp_device, vdv_cl); +} + +static inline struct vvp_object *cl2vvp(const struct cl_object *obj) +{ + return container_of0(obj, struct vvp_object, vob_cl); +} + +static inline struct vvp_object *lu2vvp(const struct lu_object *obj) +{ + return container_of0(obj, struct vvp_object, vob_cl.co_lu); +} + +static inline struct inode *vvp_object_inode(const struct cl_object *obj) +{ + return cl2vvp(obj)->vob_inode; +} + +int vvp_object_invariant(const struct cl_object *obj); +struct vvp_object *cl_inode2vvp(struct inode *inode); + +static inline struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2vvp_page(slice)->vpg_page; +} + +static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice) +{ + return container_of(slice, struct vvp_lock, vlk_cl); +} + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define CLOBINVRNT(env, clob, expr) \ + do { \ + if (unlikely(!(expr))) { \ + LU_OBJECT_DEBUG(D_ERROR, (env), &(clob)->co_lu, \ + #expr); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) +#endif /* CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +int lov_read_and_clear_async_rc(struct cl_object *clob); + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +int vvp_global_init(void); +void vvp_global_fini(void); + +extern const struct file_operations vvp_dump_pgcache_file_ops; + +#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c new file mode 100644 index 0000000000000..9de5f9b40cf20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -0,0 +1,1478 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct vvp_io *vio; + + vio = container_of(slice, struct vvp_io, vui_cl); + LASSERT(vio == vvp_env_io(env)); + + return vio; +} + +/** + * True, if \a io is a normal io, False for splice_{read,write} + */ +static int cl_is_normalio(const struct lu_env *env, const struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + return vio->vui_io_subtype == IO_NORMAL; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct vvp_io *vio = vvp_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (ll_layout_version_get(lli) != vio->vui_layout_gen || + OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) { + io->ci_need_restart = 1; + /* this will cause a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +static void vvp_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + ll_inode_size_lock(inode); + cl_object_attr_lock(obj); +} + +static void vvp_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + + cl_object_attr_unlock(obj); + ll_inode_size_unlock(inode); +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: i_size_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. 
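+ *
+ * If the accessed range extends past the known minimum size (KMS), a
+ * glimpse lock is taken to fetch the authoritative size from the OSTs;
+ * otherwise i_size is raised to KMS so the generic read path can
+ * proceed.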
+ */ +static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, + int *exceed) +{ + struct cl_attr *attr = vvp_env_thread_attr(env); + struct inode *inode = vvp_object_inode(obj); + loff_t pos = start + count - 1; + loff_t kms; + int result; + + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being accessed and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock already acquired by + * the caller, because to change the class, other client has to take + * DLM lock conflicting with our lock. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. + */ + vvp_object_size_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + vvp_object_size_unlock(obj); + result = cl_glimpse_lock(env, io, inode, obj, 0); + if (result == 0 && exceed != NULL) { + /* If objective page index exceed end-of-file + * page index, return directly. Do not expect + * kernel will check such case correctly. + * linux-2.6.18-128.1.1 miss to do that. + * --bug 17336 */ + loff_t size = i_size_read(inode); + unsigned long cur_index = start >> + PAGE_SHIFT; + + if ((size == 0 && cur_index != 0) || + (((size - 1) >> PAGE_SHIFT) < + cur_index)) + *exceed = 1; + } + + return result; + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + if (i_size_read(inode) < kms) { + i_size_write(inode, kms); + CDEBUG(D_VFSTRACE, + DFID" updating i_size %llu\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)i_size_read(inode)); + } + } + } + + vvp_object_size_unlock(obj); + + return result; +} + +/***************************************************************************** + * + * io operations. 
+ * + */ + +static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct vvp_io *vio = vvp_env_io(env); + struct cl_lock_descr *descr = &vio->vui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&vio->vui_link, 0, sizeof vio->vui_link); + + if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid; + enqflags |= CEF_LOCK_MATCH; + } else { + descr->cld_mode = mode; + } + + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &vio->vui_link); + + RETURN(0); +} + +static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return vvp_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +static int vvp_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + cl_page_list_init(&vio->u.write.vui_queue); + vio->u.write.vui_written = 0; + vio->u.write.vui_from = 0; + vio->u.write.vui_to = PAGE_SIZE; + + return 0; +} + +static void vvp_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + LASSERT(vio->u.write.vui_queue.pl_nr == 0); +} + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(ios->cis_obj); + + LASSERT(inode == file_inode(vio->vui_fd->fd_file)); + vio->u.fault.ft_mtime = inode->i_mtime.tv_sec; + + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(obj); + int rc; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "need write layout %d, restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_need_write_intent, + io->ci_restore_needed); + + if (io->ci_restore_needed) { + /* file was detected release, we need to restore it + * before finishing the io + */ + rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); + /* if restore registration failed, no restart, + * we will return -ENODATA */ + /* The layout will change after restore, so we need to + * block on layout lock held by the MDT + * as MDT will not send new layout in lvb (see LU-3124) + * we have to explicitly fetch it, all this will be done + * by ll_layout_refresh() + */ + if (rc == 0) { + io->ci_restore_needed = 0; + io->ci_need_restart = 1; + io->ci_verify_layout = 1; + } else { + io->ci_restore_needed = 1; + io->ci_need_restart = 0; + io->ci_verify_layout = 0; + io->ci_result = rc; + } + } + + /** + * dynamic layout change needed, send layout intent + * RPC. 
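+ * The intent covers only the range this IO touches: the written range
+ * for a plain CIT_WRITE, [0, new size) for truncate, the faulting page
+ * for mkwrite, and the whole file for append.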
+ */ + if (io->ci_need_write_intent) { + loff_t start = 0; + loff_t end = OBD_OBJECT_EOF; + + io->ci_need_write_intent = 0; + + LASSERT(io->ci_type == CIT_WRITE || + cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); + + if (io->ci_type == CIT_WRITE) { + if (!cl_io_is_append(io)) { + start = io->u.ci_rw.rw_range.cir_pos; + end = start + io->u.ci_rw.rw_range.cir_count; + } + } else if (cl_io_is_trunc(io)) { + end = io->u.ci_setattr.sa_attr.lvb_size; + } else { /* mkwrite */ + pgoff_t index = io->u.ci_fault.ft_index; + + start = cl_offset(io->ci_obj, index); + end = cl_offset(io->ci_obj, index + 1); + } + + CDEBUG(D_VFSTRACE, DFID" write layout, type %u [%llu, %llu)\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + start, end); + rc = ll_layout_write_intent(inode, start, end); + io->ci_result = rc; + if (!rc) + io->ci_need_restart = 1; + } + + if (!io->ci_ignore_layout && io->ci_verify_layout) { + __u32 gen = 0; + + /* check layout version */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), + LLIF_FILE_RESTORING); + } + } +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) +{ + struct vvp_thread_info *vti = vvp_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &vti->vti_descr; + union ldlm_policy_data policy; + struct iovec iov; + struct iov_iter i; + int result = 0; + ENTRY; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (!cl_is_normalio(env, io)) + RETURN(0); + + /* No MM (e.g. NFS)? No vmas too. */ + if (mm == NULL) + RETURN(0); + + iov_for_each(iov, i, io->u.ci_rw.rw_iter) { + unsigned long addr = (unsigned long)iov.iov_base; + size_t count = iov.iov_len; + + if (count == 0) + continue; + + count += addr & ~PAGE_MASK; + addr &= PAGE_MASK; + + down_read(&mm->mmap_sem); + while((vma = our_vma(mm, addr, count)) != NULL) { + struct dentry *de = file_dentry(vma->vm_file); + struct inode *inode = de->d_inode; + int flags = CEF_MUST; + + if (ll_file_nolock(vma->vm_file)) { + /* + * For no lock case is not allowed for mmap + */ + result = -EINVAL; + break; + } + + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. 
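+			 * For now the mode is simply derived from the vma
+			 * flags via vvp_mode_from_vma().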
+ */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); + + if (result < 0) + break; + + if (vma->vm_end - addr >= count) + break; + + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + up_read(&mm->mmap_sem); + if (result < 0) + break; + } + RETURN(result); +} + +static void vvp_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, + size_t nob) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = ios->cis_io->ci_obj; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + if (!cl_is_normalio(env, io)) + return; + + vio->vui_tot_count -= nob; + if (io->ci_pio) { + iov_iter_advance(&io->u.ci_rw.rw_iter, nob); + io->u.ci_rw.rw_iocb.ki_pos = io->u.ci_rw.rw_range.cir_pos; +#ifdef HAVE_KIOCB_KI_LEFT + io->u.ci_rw.rw_iocb.ki_left = vio->vui_tot_count; +#elif defined(HAVE_KI_NBYTES) + io->u.ci_rw.rw_iocb.ki_nbytes = vio->vui_tot_count; +#endif + } else { + /* It was truncated to stripe size in vvp_io_rw_lock() */ + iov_iter_reexpand(&io->u.ci_rw.rw_iter, vio->vui_tot_count); + } +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + if (cl_is_normalio(env, io)) + iov_iter_truncate(&io->u.ci_rw.rw_iter, + io->u.ci_rw.rw_range.cir_count); + + if (io->u.ci_rw.rw_nonblock) + ast_flags |= CEF_NONBLOCK; + + result = vvp_mmap_locks(env, io); + if (result == 0) + result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); + + RETURN(result); +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + int rc; + + ENTRY; + rc = vvp_io_rw_lock(env, io, CLM_READ, range->cir_pos, + range->cir_pos + range->cir_count - 1); + RETURN(rc); +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return vvp_io_one_lock_index(env, + io, 0, + vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, + io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + int rc; + + ENTRY; + if (io->u.ci_rw.rw_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_rw.rw_range.cir_pos; + end = start + io->u.ci_rw.rw_range.cir_count - 1; + } + rc = vvp_io_rw_lock(env, io, CLM_WRITE, start, end); + RETURN(rc); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. 
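+ *
+ * A truncate to size zero additionally requests CEF_DISCARD_DATA so
+ * cached pages are dropped; timestamp-only updates take no extent lock
+ * unless a time older than the new ctime is being set.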
+ */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + __u64 new_size; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + unsigned int valid = io->u.ci_setattr.sa_valid; + + if (!(valid & TIMES_SET_FLAGS)) + return 0; + + if ((!(valid & ATTR_MTIME) || + io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) && + (!(valid & ATTR_ATIME) || + io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + + new_size = 0; + } + + return vvp_io_one_lock(env, io, enqflags, CLM_WRITE, + new_size, OBD_OBJECT_EOF); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + + /* + * Only ll_inode_size_lock is taken at this level. + */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + i_size_write(inode, size); + + ll_truncate_pagecache(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = vvp_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (cl_io_is_trunc(io)) { + down_write(&lli->lli_trunc_sem); + inode_lock(inode); + inode_dio_wait(inode); + } else { + inode_lock(inode); + } + + if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS) + return vvp_io_setattr_time(env, ios); + + return 0; +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = vvp_object_inode(io->ci_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (cl_io_is_trunc(io)) { + /* Truncate in memory pages - they must be clean pages + * because osc has already notified to destroy osc_extents. 
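+		 * vvp_do_vmtruncate() below updates i_size under
+		 * ll_inode_size_lock() and drops the page cache beyond the
+		 * new size via ll_truncate_pagecache().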
*/ + vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); + inode_dio_write_done(inode); + inode_unlock(inode); + up_write(&lli->lli_trunc_sem); + } else { + inode_unlock(inode); + } +} + +static void vvp_io_setattr_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + bool restore_needed = ios->cis_io->ci_restore_needed; + struct inode *inode = vvp_object_inode(ios->cis_obj); + + vvp_io_fini(env, ios); + + if (restore_needed && !ios->cis_io->ci_restore_needed) { + /* restore finished, set data modified flag for HSM */ + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + } +} + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct file *file = vio->vui_fd->fd_file; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + loff_t pos = range->cir_pos; /* for generic_file_splice_read() only */ + size_t tot = vio->vui_tot_count; + int exceed = 0; + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", + file_dentry(file)->d_name.name, + range->cir_pos, range->cir_pos + range->cir_count); + + if (vio->vui_io_subtype == IO_NORMAL) + down_read(&lli->lli_trunc_sem); + + if (!can_populate_pages(env, io, inode)) + return 0; + + result = vvp_prep_size(env, obj, io, range->cir_pos, tot, &exceed); + if (result != 0) + return result; + else if (exceed != 0) + goto out; + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %lu bytes, offset %lld, size %llu\n", + inode->i_ino, range->cir_count, range->cir_pos, + i_size_read(inode)); + + /* turn off the kernel's read-ahead */ + vio->vui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->vui_ra_valid) { + vio->vui_ra_valid = true; + vio->vui_ra_start = cl_index(obj, range->cir_pos); + vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); + ll_ras_enter(file); + } + + /* BUG: 5972 */ + file_accessed(file); + switch (vio->vui_io_subtype) { + case IO_NORMAL: + LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, + "ki_pos %lld [%lld, %lld)\n", + io->u.ci_rw.rw_iocb.ki_pos, + range->cir_pos, range->cir_pos + range->cir_count); + result = generic_file_read_iter(&io->u.ci_rw.rw_iocb, + &io->u.ci_rw.rw_iter); + break; + case IO_SPLICE: + result = generic_file_splice_read(file, &pos, + vio->u.splice.vui_pipe, + range->cir_count, + vio->u.splice.vui_flags); + /* LU-1109: do splice read stripe by stripe otherwise if it + * may make nfsd stuck if this read occupied all internal pipe + * buffers. 
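+		 * ci_continue is cleared below so the splice read stops at
+		 * the stripe boundary; any remaining bytes are handled as a
+		 * fresh IO.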
*/ + io->ci_continue = 0; + break; + default: + CERROR("Wrong IO type %u\n", vio->vui_io_subtype); + LBUG(); + } + +out: + if (result >= 0) { + if (result < range->cir_count) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, vio->vui_fd, + range->cir_pos, result, READ); + result = 0; + } + + return result; +} + +static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist, int from, int to) +{ + struct cl_2queue *queue = &io->ci_queue; + struct cl_page *page; + unsigned int bytes = 0; + int rc = 0; + ENTRY; + + if (plist->pl_nr == 0) + RETURN(0); + + if (from > 0 || to != PAGE_SIZE) { + page = cl_page_list_first(plist); + if (plist->pl_nr == 1) { + cl_page_clip(env, page, from, to); + } else { + if (from > 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) { + page = cl_page_list_last(plist); + cl_page_clip(env, page, 0, to); + } + } + } + + cl_2queue_init(queue); + cl_page_list_splice(plist, &queue->c2_qin); + rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0); + + /* plist is not sorted any more */ + cl_page_list_splice(&queue->c2_qin, plist); + cl_page_list_splice(&queue->c2_qout, plist); + cl_2queue_fini(env, queue); + + if (rc == 0) { + /* calculate bytes */ + bytes = plist->pl_nr << PAGE_SHIFT; + bytes -= from + PAGE_SIZE - to; + + while (plist->pl_nr > 0) { + page = cl_page_list_first(plist); + cl_page_list_del(env, plist, page); + + cl_page_clip(env, page, 0, PAGE_SIZE); + + SetPageUptodate(cl_page_vmpage(page)); + cl_page_disown(env, io, page); + + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + } + + RETURN(bytes > 0 ? bytes : rc); +} + +static void write_commit_callback(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + struct page *vmpage = page->cp_vmpage; + + SetPageUptodate(vmpage); + set_page_dirty(vmpage); + + cl_page_disown(env, io, page); + + lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); + cl_page_put(env, page); +} + +/* make sure the page list is contiguous */ +static bool page_list_sanity_check(struct cl_object *obj, + struct cl_page_list *plist) +{ + struct cl_page *page; + pgoff_t index = CL_PAGE_EOF; + + cl_page_list_for_each(page, plist) { + struct vvp_page *vpg = cl_object_page_slice(obj, page); + + if (index == CL_PAGE_EOF) { + index = vvp_index(vpg); + continue; + } + + ++index; + if (index == vvp_index(vpg)) + continue; + + return false; + } + return true; +} + +/* Return how many bytes have queued or written */ +int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) +{ + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct vvp_io *vio = vvp_env_io(env); + struct cl_page_list *queue = &vio->u.write.vui_queue; + struct cl_page *page; + int rc = 0; + int bytes = 0; + unsigned int npages = vio->u.write.vui_queue.pl_nr; + ENTRY; + + if (npages == 0) + RETURN(0); + + CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n", + npages, vio->u.write.vui_from, vio->u.write.vui_to); + + LASSERT(page_list_sanity_check(obj, queue)); + + /* submit IO with async write */ + rc = cl_io_commit_async(env, io, queue, + vio->u.write.vui_from, vio->u.write.vui_to, + write_commit_callback); + npages -= queue->pl_nr; /* already committed pages */ + if (npages > 0) { + /* calculate how many bytes were written */ + bytes = npages << PAGE_SHIFT; + + /* first page */ + bytes -= vio->u.write.vui_from; + if (queue->pl_nr == 0) /* last page */ + bytes -= 
PAGE_SIZE - vio->u.write.vui_to; + LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages); + + vio->u.write.vui_written += bytes; + + CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n", + npages, bytes, vio->u.write.vui_written); + + /* the first page must have been written. */ + vio->u.write.vui_from = 0; + } + LASSERT(page_list_sanity_check(obj, queue)); + LASSERT(ergo(rc == 0, queue->pl_nr == 0)); + + /* out of quota, try sync write */ + if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) { + rc = vvp_io_commit_sync(env, io, queue, + vio->u.write.vui_from, + vio->u.write.vui_to); + if (rc > 0) { + vio->u.write.vui_written += rc; + rc = 0; + } + } + + /* update inode size */ + ll_merge_attr(env, inode); + + /* Now the pages in queue were failed to commit, discard them + * unless they were dirtied before. */ + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + cl_page_list_del(env, queue, page); + + if (!PageDirty(cl_page_vmpage(page))) + cl_page_discard(env, io, page); + + cl_page_disown(env, io, page); + + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + cl_page_list_fini(env, queue); + + RETURN(rc); +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct file *file = vio->vui_fd->fd_file; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + bool lock_inode = !lli->lli_inode_locked && + !IS_NOSEC(inode); + ssize_t result = 0; + ENTRY; + + if (vio->vui_io_subtype == IO_NORMAL) + down_read(&lli->lli_trunc_sem); + + if (!can_populate_pages(env, io, inode)) + RETURN(0); + + if (cl_io_is_append(io)) { + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. + */ + ll_merge_attr(env, inode); + range->cir_pos = i_size_read(inode); + io->u.ci_rw.rw_iocb.ki_pos = range->cir_pos; + } else { + LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, + "ki_pos %lld [%lld, %lld)\n", + io->u.ci_rw.rw_iocb.ki_pos, + range->cir_pos, range->cir_pos + range->cir_count); + } + + CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", + file_dentry(file)->d_name.name, + range->cir_pos, range->cir_pos + range->cir_count); + + /* The maximum Lustre file size is variable, based on the OST maximum + * object size and number of stripes. This needs another check in + * addition to the VFS checks earlier. */ + if (range->cir_pos + range->cir_count > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, + "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", + ll_get_fsname(inode->i_sb, NULL, 0), + file_dentry(file)->d_name.name, + PFID(ll_inode2fid(inode)), + range->cir_pos + range->cir_count, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + /* Tests to verify we take the i_mutex correctly */ + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_SEC) && !lock_inode) + RETURN(-EINVAL); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) + RETURN(-EINVAL); + + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. 
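+	 * Hence __generic_file_write_iter() is called below, with the inode
+	 * lock taken only when lock_inode is set (i.e. the caller does not
+	 * already hold it and the inode is not IS_NOSEC).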
+ */ + if (lock_inode) + inode_lock(inode); + result = __generic_file_write_iter(&io->u.ci_rw.rw_iocb, + &io->u.ci_rw.rw_iter); + if (lock_inode) + inode_unlock(inode); + + if (result > 0 || result == -EIOCBQUEUED) +#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS + result = generic_write_sync(&io->u.ci_rw.rw_iocb, result); +#else + { + ssize_t err; + + err = generic_write_sync(io->u.ci_rw.rw_iocb.ki_filp, + range->cir_pos, result); + if (err < 0 && result > 0) + result = err; + } +#endif + + if (result > 0) { + result = vvp_io_write_commit(env, io); + if (vio->u.write.vui_written > 0) { + result = vio->u.write.vui_written; + CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", + file_dentry(file)->d_name.name, + io->ci_nob, result); + io->ci_nob += result; + } + } + if (result > 0) { + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + + if (result < range->cir_count) + io->ci_continue = 0; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + vio->vui_fd, range->cir_pos, result, WRITE); + result = 0; + } + + RETURN(result); +} + +static void vvp_io_rw_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + if (vio->vui_io_subtype == IO_NORMAL) + up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->ft_vmf; + + cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags_valid = 1; + + if (vmf->page) { + LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n", + get_vmf_address(vmf)); + if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->ft_flags |= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + + return 0; + } + + if (cfio->ft_flags & VM_FAULT_SIGBUS) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf)); + return -EFAULT; + } + + if (cfio->ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf)); + return -ENOMEM; + } + + if (cfio->ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("unknown error in page fault %d\n", cfio->ft_flags); + + return -EINVAL; +} + +static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + set_page_dirty(page->cp_vmpage); +} + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int result = 0; + struct page *vmpage = NULL; + struct cl_page *page; + loff_t size; + pgoff_t last_index; + ENTRY; + + down_read(&lli->lli_trunc_sem); + + /* offset of the last byte on the page */ + offset = cl_offset(obj, fio->ft_index + 1) - 1; + LASSERT(cl_index(obj, offset) == fio->ft_index); + result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL); + if (result != 0) + RETURN(result); + + /* must return locked page */ + if (fio->ft_mkwrite) { + LASSERT(cfio->ft_vmpage != NULL); + lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + RETURN(result); + } + + vmpage = cfio->ft_vmpage; + LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); 
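+	/*
+	 * The OBD_FAIL_LLITE_FAULT_TRUNC_RACE check above is a
+	 * fault-injection hook that invalidates the page to simulate a
+	 * truncate racing with this fault; the checks below catch that case.
+	 */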
+ + size = i_size_read(inode); + /* Though we have already held a cl_lock upon this page, but + * it still can be truncated locally. */ + if (unlikely((vmpage->mapping != inode->i_mapping) || + (page_offset(vmpage) > size))) { + CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); + + /* return +1 to stop cl_io_loop() and ll_fault() will catch + * and retry. */ + GOTO(out, result = +1); + } + + last_index = cl_index(obj, size - 1); + + if (fio->ft_mkwrite ) { + /* + * Capture the size while holding the lli_trunc_sem from above + * we want to make sure that we complete the mkwrite action + * while holding this lock. We need to make sure that we are + * not past the end of the file. + */ + if (last_index < fio->ft_index) { + CDEBUG(D_PAGE, + "llite: mkwrite and truncate race happened: " + "%p: 0x%lx 0x%lx\n", + vmpage->mapping,fio->ft_index,last_index); + /* + * We need to return if we are + * passed the end of the file. This will propagate + * up the call stack to ll_page_mkwrite where + * we will return VM_FAULT_NOPAGE. Any non-negative + * value returned here will be silently + * converted to 0. If the vmpage->mapping is null + * the error code would be converted back to ENODATA + * in ll_page_mkwrite0. Thus we return -ENODATA + * to handle both cases + */ + GOTO(out, result = -ENODATA); + } + } + + page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + /* if page is going to be written, we should add this page into cache + * earlier. */ + if (fio->ft_mkwrite) { + wait_on_page_writeback(vmpage); + if (!PageDirty(vmpage)) { + struct cl_page_list *plist = &io->ci_queue.c2_qin; + struct vvp_page *vpg = cl_object_page_slice(obj, page); + int to = PAGE_SIZE; + + /* vvp_page_assume() calls wait_on_page_writeback(). */ + cl_page_assume(env, io, page); + + cl_page_list_init(plist); + cl_page_list_add(plist, page); + + /* size fixup */ + if (last_index == vvp_index(vpg)) + to = size & ~PAGE_MASK; + + /* Do not set Dirty bit here so that in case IO is + * started before the page is really made dirty, we + * still have chance to detect it. */ + result = cl_io_commit_async(env, io, plist, 0, to, + mkwrite_commit_callback); + LASSERT(cl_page_is_owned(page, io)); + cl_page_list_fini(env, plist); + + vmpage = NULL; + if (result < 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + GOTO(out, result); + } else + cl_page_disown(env, io, page); + } + } + + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index); + if (fio->ft_index == last_index) + /* + * Last page is mapped partially. 
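+		 * ft_nob is clipped to the bytes actually backed by the
+		 * file; fully mapped pages use cl_page_size() instead.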
+ */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + EXIT; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + + cfio->ft_flags &= ~VM_FAULT_LOCKED; + + return result; +} + +static void vvp_io_fault_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct inode *inode = vvp_object_inode(ios->cis_obj); + struct ll_inode_info *lli = ll_i2info(inode); + + CLOBINVRNT(env, ios->cis_io->ci_obj, + vvp_object_invariant(ios->cis_io->ci_obj)); + up_read(&lli->lli_trunc_sem); +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. */ + return 0; +} + +static int vvp_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + int result = 0; + ENTRY; + + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_FAULT) { + struct vvp_io *vio = cl2vvp_io(env, ios); + + if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + ra->cra_end = CL_PAGE_EOF; + result = +1; /* no need to call down */ + } + } + + RETURN(result); +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_iter_init = vvp_io_write_iter_init, + .cio_iter_fini = vvp_io_write_iter_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_end = vvp_io_rw_end, + .cio_advance = vvp_io_advance, + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = vvp_io_fault_end, + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = vvp_io_fini + }, + }, + .cio_read_ahead = vvp_io_read_ahead +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct inode *inode = vvp_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " + "restore needed %d\n", + PFID(lu_object_fid(&obj->co_lu)), + io->ci_ignore_layout, io->ci_verify_layout, + vio->vui_layout_gen, io->ci_restore_needed); + + CL_IO_SLICE_CLEAN(vio, vui_cl); + cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops); + vio->vui_ra_valid = false; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + struct ll_inode_info *lli = ll_i2info(inode); + + vio->vui_tot_count = io->u.ci_rw.rw_range.cir_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." 
-- Single Unix Spec */ + if (vio->vui_tot_count == 0) + result = 1; + + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. */ + if (result == 0 && !io->ci_ignore_layout) { + result = ll_layout_refresh(inode, &vio->vui_layout_gen); + if (result == -ENOENT) + /* If the inode on MDS has been removed, but the objects + * on OSTs haven't been destroyed (async unlink), layout + * fetch will return -ENOENT, we'd ingore this error + * and continue with dirty flush. LU-3230. */ + result = 0; + if (result < 0) + CERROR("%s: refresh file layout " DFID " error %d.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(lu_object_fid(&obj->co_lu)), result); + } + + RETURN(result); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c b/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c new file mode 100644 index 0000000000000..651b8e128239d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp lock functions. 
+ * + */ + +static void vvp_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct vvp_lock *vlk = cl2vvp_lock(slice); + + OBD_SLAB_FREE_PTR(vlk, vvp_lock_kmem); +} + +static int vvp_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + CLOBINVRNT(env, slice->cls_obj, vvp_object_invariant(slice->cls_obj)); + + return 0; +} + +static const struct cl_lock_operations vvp_lock_ops = { + .clo_fini = vvp_lock_fini, + .clo_enqueue = vvp_lock_enqueue, +}; + +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *unused) +{ + struct vvp_lock *vlk; + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR_GFP(vlk, vvp_lock_kmem, GFP_NOFS); + if (vlk != NULL) { + cl_lock_slice_add(lock, &vlk->vlk_cl, obj, &vvp_lock_ops); + result = 0; + } else { + result = -ENOMEM; + } + + return result; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c new file mode 100644 index 0000000000000..8904e45918386 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -0,0 +1,315 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. 
+ * + */ + +int vvp_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + struct ll_inode_info *lli = ll_i2info(inode); + + return (S_ISREG(inode->i_mode) || inode->i_mode == 0) && + lli->lli_clob == obj; +} + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct vvp_object *obj = lu2vvp(o); + struct inode *inode = obj->vob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%d %d) inode: %p ", + atomic_read(&obj->vob_transient_pages), + atomic_read(&obj->vob_mmap_cnt), + inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = vvp_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. + */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = inode->i_mtime.tv_sec; + attr->cat_atime = inode->i_atime.tv_sec; + attr->cat_ctime = inode->i_ctime.tv_sec; + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); + attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); + attr->cat_projid = ll_i2info(inode)->lli_projid; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = vvp_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); + if (valid & CAT_GID) + inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); + if (valid & CAT_ATIME) + inode->i_atime.tv_sec = attr->cat_atime; + if (valid & CAT_MTIME) + inode->i_mtime.tv_sec = attr->cat_mtime; + if (valid & CAT_CTIME) + inode->i_ctime.tv_sec = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + i_size_write(inode, attr->cat_size); + if (valid & CAT_PROJID) + ll_i2info(inode)->lli_projid = attr->cat_projid; + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE|CAT_PROJID)) + mark_inode_dirty(inode); + return 0; +} + +static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", + PFID(&lli->lli_fid)); + + ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); + + /* Clean up page mmap for this inode. + * The reason for us to do this is that if the page has + * already been installed into memory space, the process + * can access it without interacting with lustre, so this + * page may be stale due to layout change, and the process + * will never be notified. + * This operation is expensive but mmap processes have to pay + * a price themselves. 
*/ + unmap_mapping_range(conf->coc_inode->i_mapping, + 0, OBD_OBJECT_EOF, 0); + } + return 0; +} + +static int vvp_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct inode *inode = vvp_object_inode(obj); + int rc; + ENTRY; + + rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); + if (rc < 0) { + CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n", + PFID(lu_object_fid(&obj->co_lu)), rc); + RETURN(rc); + } + + truncate_inode_pages(inode->i_mapping, 0); + RETURN(0); +} + +static int vvp_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = vvp_object_inode(obj); + + ENTRY; + lvb->lvb_mtime = LTIME_S(inode->i_mtime); + lvb->lvb_atime = LTIME_S(inode->i_atime); + lvb->lvb_ctime = LTIME_S(inode->i_ctime); + + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. + */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + + RETURN(0); +} + +static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct inode *inode; + struct obdo *oa; + u64 valid_flags = OBD_MD_FLTYPE; + + oa = attr->cra_oa; + inode = vvp_object_inode(obj); + + if (attr->cra_type == CRT_WRITE) { + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLUID | OBD_MD_FLGID; + obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); + } + obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) + oa->o_parent_oid++; + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, LUSTRE_JOBID_SIZE); +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_lock_init = vvp_lock_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_update = vvp_attr_update, + .coo_conf_set = vvp_conf_set, + .coo_prune = vvp_prune, + .coo_glimpse = vvp_object_glimpse, + .coo_req_attr_set = vvp_req_attr_set +}; + +static int vvp_object_init0(const struct lu_env *env, + struct vvp_object *vob, + const struct cl_object_conf *conf) +{ + vob->vob_inode = conf->coc_inode; + atomic_set(&vob->vob_transient_pages, 0); + cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); + return 0; +} + +static int vvp_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct vvp_device *dev = lu2vvp_dev(obj->lo_dev); + struct vvp_object *vob = lu2vvp(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->vdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + lu_object_add(obj, below); + result = vvp_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + + return result; +} + +static void vvp_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct vvp_object *vob = lu2vvp(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_SLAB_FREE_PTR(vob, vvp_object_kmem); +} + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = vvp_object_init, + .loo_object_free = vvp_object_free, + .loo_object_print = vvp_object_print, +}; + +struct vvp_object *cl_inode2vvp(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + 
struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + + return lu2vvp(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct vvp_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, vvp_object_kmem, GFP_NOFS); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = &vob->vob_cl.co_lu; + hdr = &vob->vob_header; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + vob->vob_cl.co_ops = &vvp_ops; + obj->lo_ops = &vvp_lu_obj_ops; + } else + obj = NULL; + return obj; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c new file mode 100644 index 0000000000000..47d48639ad43c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -0,0 +1,544 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include + +#include +#include "llite_internal.h" +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ + +static void vvp_page_fini_common(struct vvp_page *vpg) +{ + struct page *vmpage = vpg->vpg_page; + + LASSERT(vmpage != NULL); + put_page(vmpage); +} + +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. 
+ */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + vvp_page_fini_common(vpg); +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + + return 0; +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct vvp_page *vpg = cl2vvp_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used) + ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); + + ll_invalidate_page(vmpage); +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct inode *inode = vmpage->mapping->host; + struct cl_object *obj = slice->cpl_obj; + struct cl_page *page = slice->cpl_page; + int refc; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == page); + LASSERT(inode == vvp_object_inode(obj)); + + /* Drop the reference count held in vvp_page_init */ + refc = atomic_dec_return(&page->cp_ref); + LASSERTF(refc >= 1, "page = %p, refc = %d\n", page, refc); + + ClearPagePrivate(vmpage); + vmpage->private = 0; + /* + * Reference from vmpage to cl_page is removed, but the reference back + * is still here. It is removed later in vvp_page_fini(). + */ +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* Skip the page already marked as PG_uptodate. */ + RETURN(PageUptodate(cl2vm_page(slice)) ? 
-EALREADY : 0); +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageDirty(vmpage)); + + /* ll_writepage path is not a sync write, so need to set page writeback + * flag */ + if (pg->cp_sync_io == NULL) + set_page_writeback(vmpage); + + return 0; +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. + */ +static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret) +{ + struct vvp_object *obj = cl_inode2vvp(inode); + + if (ioret == 0) { + ClearPageError(vmpage); + obj->vob_discard_page_warned = 0; + } else { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + + if ((ioret == -ESHUTDOWN || ioret == -EINTR) && + obj->vob_discard_page_warned == 0) { + obj->vob_discard_page_warned = 1; + ll_dirty_page_discard_warn(vmpage, ioret); + } + } +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + struct cl_page *page = slice->cpl_page; + struct inode *inode = vvp_object_inode(page->cp_obj); + ENTRY; + + LASSERT(PageLocked(vmpage)); + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (vpg->vpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + if (!vpg->vpg_defer_uptodate) + cl_page_export(env, page, 1); + } else { + vpg->vpg_defer_uptodate = 0; + } + + if (page->cp_sync_io == NULL) + unlock_page(vmpage); + + EXIT; +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = vpg->vpg_page; + ENTRY; + + CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret); + + if (pg->cp_sync_io != NULL) { + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + } else { + LASSERT(PageWriteback(vmpage)); + /* + * Only mark the page error only when it's an async write + * because applications won't wait for IO to finish. + */ + vvp_vmpage_error(vvp_object_inode(pg->cp_obj), vmpage, ioret); + + end_page_writeback(vmpage); + } + EXIT; +} + +/** + * Implements cl_page_operations::cpo_make_ready() method. + * + * This is called to yank a page from the transfer cache and to send it out as + * a part of transfer. This function try-locks the page. If try-lock failed, + * page is owned by some concurrent IO, and should be skipped (this is bad, + * but hopefully rare situation, as it usually results in transfer being + * shorter than possible). + * + * \retval 0 success, page can be placed into transfer + * + * \retval -EAGAIN page is either used by concurrent IO has been + * truncated. Skip it. 
+ */ +static int vvp_page_make_ready(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + int result = 0; + + lock_page(vmpage); + if (clear_page_dirty_for_io(vmpage)) { + LASSERT(pg->cp_state == CPS_CACHED); + /* This actually clears the dirty bit in the radix + * tree. */ + set_page_writeback(vmpage); + CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); + } else if (pg->cp_state == CPS_PAGEOUT) { + /* is it possible for osc_flush_async_page() to already + * make it ready? */ + result = -EALREADY; + } else { + CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", + pg->cp_state); + LBUG(); + } + unlock_page(vmpage); + RETURN(result); +} + +static int vvp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct page *vmpage = vpg->vpg_page; + + (*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d) " + "vm@%p ", + vpg, vpg->vpg_defer_uptodate, vpg->vpg_ra_used, vmpage); + + if (vmpage != NULL) { + (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", + (long)vmpage->flags, page_count(vmpage), + page_mapcount(vmpage), vmpage->private, + page_index(vmpage), + list_empty(&vmpage->lru) ? "not-" : ""); + } + + (*printer)(env, cookie, "\n"); + + return 0; +} + +static int vvp_page_fail(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + /* + * Cached read? + */ + LBUG(); + + return 0; +} + +static const struct cl_page_operations vvp_page_ops = { + .cpo_own = vvp_page_own, + .cpo_assume = vvp_page_assume, + .cpo_unassume = vvp_page_unassume, + .cpo_disown = vvp_page_disown, + .cpo_discard = vvp_page_discard, + .cpo_delete = vvp_page_delete, + .cpo_export = vvp_page_export, + .cpo_is_vmlocked = vvp_page_is_vmlocked, + .cpo_fini = vvp_page_fini, + .cpo_print = vvp_page_print, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_page_prep_read, + .cpo_completion = vvp_page_completion_read, + .cpo_make_ready = vvp_page_fail, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_page_prep_write, + .cpo_completion = vvp_page_completion_write, + .cpo_make_ready = vvp_page_make_ready, + }, + }, +}; + +static int vvp_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* transient page should always be sent. */ + RETURN(0); +} + +static void vvp_transient_page_verify(const struct cl_page *page) +{ +} + +static int vvp_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, int nonblock) +{ + vvp_transient_page_verify(slice->cpl_page); + return 0; +} + +static void vvp_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + vvp_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. 
+ */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct inode *inode = vvp_object_inode(slice->cpl_obj); + int locked; + + locked = !inode_trylock(inode); + if (!locked) + inode_unlock(inode); + return locked ? -EBUSY : -ENODATA; +} + +static void +vvp_transient_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct vvp_page *vpg = cl2vvp_page(slice); + struct cl_page *clp = slice->cpl_page; + struct vvp_object *clobj = cl2vvp(clp->cp_obj); + + vvp_page_fini_common(vpg); + atomic_dec(&clobj->vob_transient_pages); +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_own = vvp_transient_page_own, + .cpo_assume = vvp_transient_page_assume, + .cpo_unassume = vvp_transient_page_unassume, + .cpo_disown = vvp_transient_page_disown, + .cpo_discard = vvp_transient_page_discard, + .cpo_fini = vvp_transient_page_fini, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + } + } +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct vvp_page *vpg = cl_object_page_slice(obj, page); + struct page *vmpage = page->cp_vmpage; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + vpg->vpg_page = vmpage; + get_page(vmpage); + + if (page->cp_type == CPT_CACHEABLE) { + /* in cache, decref in vvp_page_delete */ + atomic_inc(&page->cp_ref); + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &vpg->vpg_cl, obj, index, + &vvp_page_ops); + } else { + struct vvp_object *clobj = cl2vvp(obj); + + cl_page_slice_add(page, &vpg->vpg_cl, obj, index, + &vvp_transient_page_ops); + atomic_inc(&clobj->vob_transient_pages); + } + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c new file mode 100644 index 0000000000000..67cc5139f7366 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -0,0 +1,819 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" + +const struct xattr_handler *get_xattr_type(const char *name) +{ + int i = 0; + + while (ll_xattr_handlers[i]) { + size_t len = strlen(ll_xattr_handlers[i]->prefix); + + if (!strncmp(ll_xattr_handlers[i]->prefix, name, len)) + return ll_xattr_handlers[i]; + i++; + } + return NULL; +} + +static int xattr_type_filter(struct ll_sb_info *sbi, + const struct xattr_handler *handler) +{ + /* No handler means XATTR_OTHER_T */ + if (!handler) + return -EOPNOTSUPP; + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_USER_T && + !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + + if (handler->flags == XATTR_TRUSTED_T && + !capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +static int ll_xattr_set_common(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *pv = value; + char *fullname; + u64 valid; + int rc; + ENTRY; + + if (flags == XATTR_REPLACE) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + valid = OBD_MD_FLXATTRRM; + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + valid = OBD_MD_FLXATTR; + } + + /* FIXME: enable IMA when the conditions are ready */ + if (handler->flags == XATTR_SECURITY_T && + (!strcmp(name, "ima") || !strcmp(name, "evm"))) + RETURN(-EOPNOTSUPP); + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && +#ifdef HAVE_INODE_OWNER_OR_CAPABLE + !inode_owner_or_capable(inode)) +#else + !is_owner_or_cap(inode)) +#endif + RETURN(-EPERM); + + /* b10667: ignore lustre special xattr for now */ + if (!strcmp(name, "hsm") || + ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || + (handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov")))) + RETURN(0); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "selinux") == 0) + RETURN(-EOPNOTSUPP); + + /* + * In user.* namespace, only regular files and directories can have + * extended attributes. 
+ */ + if (handler->flags == XATTR_USER_T) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EPERM); + } + + fullname = kasprintf(GFP_KERNEL, "%s%s", handler->prefix, name); + if (!fullname) + RETURN(-ENOMEM); + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, + pv, size, 0, flags, ll_i2suppgid(inode), &req); + kfree(fullname); + if (rc) { + if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static int get_hsm_state(struct inode *inode, u32 *hus_states) +{ + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (!hus) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (!IS_ERR(op_data)) { + rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + if (!rc) + *hus_states = hus->hus_states; + else + CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", + rc); + + ll_finish_md_op_data(op_data); + } else { + rc = PTR_ERR(op_data); + CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", + rc); + } + OBD_FREE_PTR(hus); + return rc; +} + +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) +{ + struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; + struct lov_user_md *v1 = lump; + bool need_clear_release = false; + bool release_checked = false; + bool is_composite = false; + u16 entry_count = 1; + int rc = 0; + int i; + + if (!lump) + return 0; + + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + entry_count = comp_v1->lcm_entry_count; + is_composite = true; + } + + for (i = 0; i < entry_count; i++) { + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + void *ptr = comp_v1; + + ptr += comp_v1->lcm_entries[i].lcme_offset; + v1 = (struct lov_user_md *)ptr; + } + + /* + * Attributes that are saved via getxattr will always + * have the stripe_offset as 0. Instead, the MDS + * should be allowed to pick the starting OST index. + * b=17846 + */ + if (!is_composite && v1->lmm_stripe_offset == 0) + v1->lmm_stripe_offset = -1; + + /* Avoid anyone directly setting the RELEASED flag. */ + if (v1->lmm_pattern & LOV_PATTERN_F_RELEASED) { + if (!release_checked) { + u32 state = HS_NONE; + + rc = get_hsm_state(inode, &state); + if (rc) + return rc; + + if (!(state & HS_ARCHIVED)) + need_clear_release = true; + release_checked = true; + } + if (need_clear_release) + v1->lmm_pattern ^= LOV_PATTERN_F_RELEASED; + } + } + + return rc; +} + +static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + /* + * It is possible to set an xattr to a "" value of zero size. + * For this case we are going to treat it as a removal. + */ + if (!size && lump) + lump = NULL; + + rc = ll_adjust_lum(inode, lump); + if (rc) + return rc; + + if (lump && S_ISREG(inode->i_mode)) { + u64 it_flags = FMODE_WRITE; + ssize_t lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return -ERANGE; + + rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, + lum_size); + /** + * b=10667: ignore -EEXIST. + * Silently eat error on setting trusted.lov/lustre.lov + * attribute for platforms that added the default option + * to copy all attributes in 'cp' command. 
Both rsync and + * tar --xattrs also will try to set LOVEA for existing + * files. + */ + if (rc == -EEXIST) + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + if (size != 0 && size < sizeof(struct lov_user_md)) + return -EINVAL; + + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; +} + +static int ll_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + /* lustre/trusted.lov.xxx would be passed through xattr API */ + if (!strcmp(name, "lov")) { + int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; + + ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); + + return ll_setstripe_ea(dentry, (struct lov_user_md *)value, + size); + } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { + int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; + + ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); + return 0; + } + + return ll_xattr_set_common(handler, dentry, inode, name, value, size, + flags); +} + +int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, + size_t size, u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + void *xdata; + int rc; + ENTRY; + + if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T && + (type != XATTR_SECURITY_T || strcmp(name, "security.selinux"))) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + GOTO(out_xattr, rc); + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + GOTO(out_xattr, rc = -ERANGE); + } + } + } else { +getxattr_nocache: + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), + valid, name, NULL, 0, size, 0, &req); + if (rc < 0) + GOTO(out_xattr, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc = body->mbo_eadatasize); + + if (size < body->mbo_eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->mbo_eadatasize, (int)size); + GOTO(out, rc = -ERANGE); + } + + if (body->mbo_eadatasize == 0) + GOTO(out, rc = -ENODATA); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->mbo_eadatasize); + if (!xdata) + GOTO(out, rc = -EFAULT); + + memcpy(buffer, xdata, body->mbo_eadatasize); + rc = body->mbo_eadatasize; + } + + EXIT; + +out_xattr: + if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { + LCONSOLE_INFO("%s: disabling user_xattr feature because " + "it is not supported on the server: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), rc); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } +out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int ll_xattr_get_common(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + char 
*fullname; + int rc; + + ENTRY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + rc = xattr_type_filter(sbi, handler); + if (rc) + RETURN(rc); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + !strcmp(name, "selinux")) + RETURN(-EOPNOTSUPP); + +#ifdef CONFIG_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. + */ + if (handler->flags == XATTR_ACL_ACCESS_T) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + + fullname = kasprintf(GFP_KERNEL, "%s%s", handler->prefix, name); + if (!fullname) + RETURN(-ENOMEM); + + rc = ll_xattr_list(inode, fullname, handler->flags, buffer, size, + OBD_MD_FLXATTR); + kfree(fullname); + RETURN(rc); +} + +static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) +{ + ssize_t rc; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct cl_layout cl = { + .cl_buf.lb_buf = buf, + .cl_buf.lb_len = buf_size, + }; + struct lu_env *env; + u16 refcheck; + + if (!obj) + RETURN(-ENODATA); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out_env, rc); + + if (!cl.cl_size) + GOTO(out_env, rc = -ENODATA); + + rc = cl.cl_size; + + if (!buf_size) + GOTO(out_env, rc); + + LASSERT(buf && rc <= buf_size); + + /* + * Do not return layout gen for getxattr() since + * otherwise it would confuse tar --xattr by + * recognizing layout gen as stripe offset when the + * file is restored. See LU-2809. 
+ */ + if (((struct lov_mds_md *)buf)->lmm_magic == LOV_MAGIC_COMP_V1) + goto out_env; + + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; +out_env: + cl_env_put(env, &refcheck); + + RETURN(rc); + } else if (S_ISDIR(inode->i_mode)) { + struct ptlrpc_request *req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, + &req, 0); + if (rc < 0) + GOTO(out_req, rc); + + if (!buf_size) + GOTO(out_req, rc = lmm_size); + + if (buf_size < lmm_size) + GOTO(out_req, rc = -ERANGE); + + memcpy(buf, lmm, lmm_size); + GOTO(out_req, rc = lmm_size); +out_req: + if (req) + ptlrpc_req_finished(req); + + RETURN(rc); + } else { + RETURN(-ENODATA); + } +} + +static int ll_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + if (!strcmp(name, "lov")) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + return ll_getxattr_lov(inode, buffer, size); + } + + return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name; + ssize_t rc, rc2; + size_t len, rem; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, + OBD_MD_FLXATTRLS); + if (rc < 0) + RETURN(rc); + + /* + * If we're being called to get the size of the xattr list + * (size == 0) then just assume that a lustre.lov xattr + * exists. + */ + if (!size) + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); + + xattr_name = buffer; + rem = rc; + + while (rem > 0) { + len = strnlen(xattr_name, rem - 1) + 1; + rem -= len; + if (!xattr_type_filter(sbi, get_xattr_type(xattr_name))) { + /* Skip OK xattr type, leave it in buffer. */ + xattr_name += len; + continue; + } + + /* + * Move up remaining xattrs in buffer + * removing the xattr that is not OK. 
+ */ + memmove(xattr_name, xattr_name + len, rem); + rc -= len; + } + + rc2 = ll_getxattr_lov(inode, NULL, 0); + if (rc2 == -ENODATA) + RETURN(rc); + + if (rc2 < 0) + RETURN(rc2); + + if (size < rc + sizeof(XATTR_LUSTRE_LOV)) + RETURN(-ERANGE); + + memcpy(buffer + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); + + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); +} + +#ifdef HAVE_XATTR_HANDLER_SIMPLIFIED +static int ll_xattr_get_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set_common(handler, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_4_3(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return ll_xattr_set(handler, dentry, dentry->d_inode, name, value, + size, flags); +} + +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +const struct xattr_handler *get_xattr_handler(int handler_flag) +{ + int i = 0; + + while (ll_xattr_handlers[i]) { + if (ll_xattr_handlers[i]->flags == handler_flag) + return ll_xattr_handlers[i]; + i++; + } + return NULL; +} + +static int ll_xattr_get_common_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get_common(handler, dentry, dentry->d_inode, name, + buffer, size); +} + +static int ll_xattr_get_3_11(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_get(handler, dentry, dentry->d_inode, name, buffer, + size); +} + +static int ll_xattr_set_common_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set_common(handler, dentry, dentry->d_inode, name, + value, size, flags); +} + +static int ll_xattr_set_3_11(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + const struct xattr_handler *handler = get_xattr_handler(handler_flags); + + if (!handler) + return -ENXIO; + + return ll_xattr_set(handler, dentry, dentry->d_inode, name, value, + size, flags); +} +#endif + +static const struct xattr_handler ll_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = XATTR_USER_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = XATTR_TRUSTED_T, 
+#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +static const struct xattr_handler ll_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = XATTR_SECURITY_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_access_xattr_handler = { + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = XATTR_ACL_ACCESS_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_acl_default_xattr_handler = { + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = XATTR_ACL_DEFAULT_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_common_4_3, + .set = ll_xattr_set_common_4_3, +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_common_3_11, + .set = ll_xattr_set_common_3_11, +#else + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +#endif +}; + +static const struct xattr_handler ll_lustre_xattr_handler = { + .prefix = XATTR_LUSTRE_PREFIX, + .flags = XATTR_LUSTRE_T, +#if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + .get = ll_xattr_get_4_3, + .set = ll_xattr_set_4_3, +#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) + .get = ll_xattr_get_3_11, + .set = ll_xattr_set_3_11, +#else + .get = ll_xattr_get, + .set = ll_xattr_set, +#endif +}; + +const struct xattr_handler *ll_xattr_handlers[] = { + &ll_user_xattr_handler, + &ll_trusted_xattr_handler, + &ll_security_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &ll_acl_access_xattr_handler, + &ll_acl_default_xattr_handler, +#endif + &ll_lustre_xattr_handler, + NULL, +}; diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr26.c b/drivers/staging/lustrefsx/lustre/llite/xattr26.c new file mode 100644 index 0000000000000..84e9b8bcbe915 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr26.c @@ -0,0 +1,603 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" + +/* xattr related to IMA(Integrity Measurement Architecture) */ +#ifndef XATTR_NAME_IMA +#define XATTR_NAME_IMA "security.ima" +#endif +#ifndef XATTR_NAME_EVM +#define XATTR_NAME_EVM "security.evm" +#endif + +static +int get_xattr26_type(const char *name) +{ + if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)) + return XATTR_ACL_ACCESS_T; + + if (!strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) + return XATTR_ACL_DEFAULT_T; + + if (!strncmp(name, XATTR_USER_PREFIX, + sizeof(XATTR_USER_PREFIX) - 1)) + return XATTR_USER_T; + + if (!strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1)) + return XATTR_TRUSTED_T; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1)) + return XATTR_SECURITY_T; + + if (!strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1)) + return XATTR_LUSTRE_T; + + return XATTR_OTHER_T; +} + +static +int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) +{ + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + if (xattr_type == XATTR_TRUSTED_T && !cfs_capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + if (xattr_type == XATTR_OTHER_T) + return -EOPNOTSUPP; + + return 0; +} + +static +int ll_setxattr_common(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int xattr_type, rc; + const char *pv = value; + ENTRY; + + /*FIXME: enable IMA when the conditions are ready */ + if (strncmp(name, XATTR_NAME_IMA, + sizeof(XATTR_NAME_IMA)) == 0 || + strncmp(name, XATTR_NAME_EVM, + sizeof(XATTR_NAME_EVM)) == 0) + return -EOPNOTSUPP; + + xattr_type = get_xattr26_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + RETURN(rc); + + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && +#ifdef HAVE_INODE_OWNER_OR_CAPABLE + !inode_owner_or_capable(inode)) +#else + !is_owner_or_cap(inode)) +#endif + return -EPERM; + + /* b10667: ignore lustre special xattr for now */ + if (strcmp(name, XATTR_NAME_HSM) == 0 || + (xattr_type == XATTR_TRUSTED_T && + strcmp(name, XATTR_NAME_LOV) == 0) || + (xattr_type == XATTR_LUSTRE_T && + strcmp(name, "lustre.lov") == 0)) + RETURN(0); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + RETURN(-EOPNOTSUPP); + + /* In user.* namespace, only regular files and directories can have + * extended attributes. 
*/ + if (xattr_type == XATTR_USER_T) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + RETURN(-EPERM); + } + + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, name, pv, + size, 0, flags, ll_i2suppgid(inode), &req); + if (rc) { + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +static int get_hsm_state(struct inode *inode, __u32 *hus_states) +{ + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + return -ENOMEM; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (!IS_ERR(op_data)) { + rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); + if (rc == 0) + *hus_states = hus->hus_states; + else + CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", + rc); + + ll_finish_md_op_data(op_data); + } else { + rc = PTR_ERR(op_data); + CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", + rc); + } + OBD_FREE_PTR(hus); + return rc; +} + +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) +{ + struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; + struct lov_user_md *v1 = lump; + bool release_checked = false; + bool need_clear_release = false; + __u16 entry_count = 1; + bool is_composite = false; + int rc = 0; + int i; + + if (lump == NULL) + return 0; + + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + entry_count = comp_v1->lcm_entry_count; + is_composite = true; + } + + for (i = 0; i < entry_count; i++) { + if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) + v1 = (struct lov_user_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + + /* Attributes that are saved via getxattr will always + * have the stripe_offset as 0. Instead, the MDS + * should be allowed to pick the starting OST index. + * b=17846 */ + if (!is_composite && v1->lmm_stripe_offset == 0) + v1->lmm_stripe_offset = -1; + + /* Avoid anyone directly setting the RELEASED flag. */ + if (v1->lmm_pattern & LOV_PATTERN_F_RELEASED) { + if (!release_checked) { + __u32 state = HS_NONE; + rc = get_hsm_state(inode, &state); + if (rc) + return rc; + if (!(state & HS_ARCHIVED)) + need_clear_release = true; + release_checked = true; + } + if (need_clear_release) + v1->lmm_pattern ^= LOV_PATTERN_F_RELEASED; + } + } + + return rc; +} + +static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + rc = ll_adjust_lum(inode, lump); + if (rc) + return rc; + + if (lump != NULL && S_ISREG(inode->i_mode)) { + u64 it_flags = FMODE_WRITE; + int lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return -ERANGE; + + rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, + lum_size); + /** + * b=10667: ignore -EEXIST. + * Silently eat error on setting trusted.lov/lustre.lov + * attribute for SuSE 9, it added default option to copy + * all attributes in 'cp' command. rsync, tar --xattrs + * also will try to set LOVEA for existing files. 
+ */ + if (rc == -EEXIST) + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; +} + +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + + /* lustre/trusted.lov.xxx would be passed through xattr API */ + if (strcmp(name, XATTR_NAME_LOV) == 0 || + strcmp(name, XATTR_LUSTRE_LOV) == 0) + return ll_setstripe_ea(dentry, (struct lov_user_md *)value, + size); + else if (strcmp(name, XATTR_NAME_LMA) == 0 || + strcmp(name, XATTR_NAME_LINK) == 0) + return 0; + + return ll_setxattr_common(inode, name, value, size, flags, + OBD_MD_FLXATTR); +} + +int ll_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + return ll_setxattr_common(inode, name, NULL, 0, 0, + OBD_MD_FLXATTRRM); +} + +int ll_getxattr_common(struct inode *inode, const char *name, + void *buffer, size_t size, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + int xattr_type, rc; + void *xdata; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + /* listxattr have slightly different behavior from of ext3: + * without 'user_xattr' ext3 will list all xattr names but + * filtered out "^user..*"; we list them all for simplicity. + */ + if (!name) { + xattr_type = XATTR_OTHER_T; + goto do_getxattr; + } + + xattr_type = get_xattr26_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + RETURN(rc); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + RETURN(-EOPNOTSUPP); + +#ifdef CONFIG_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. 
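+ * XATTR_ACL_ACCESS_T is therefore served straight from
+ * lli->lli_posix_acl under lli_lock below, without issuing an RPC.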
+ */ + if (xattr_type == XATTR_ACL_ACCESS_T) { + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + +do_getxattr: + if (sbi->ll_xattr_cache_enabled && + xattr_type != XATTR_ACL_ACCESS_T && + (xattr_type != XATTR_SECURITY_T || + strcmp(name, "security.selinux") != 0)) { + rc = ll_xattr_cache_get(inode, name, buffer, size, valid); + if (rc == -EAGAIN) + goto getxattr_nocache; + if (rc < 0) + GOTO(out_xattr, rc); + + /* Add "system.posix_acl_access" to the list */ + if (lli->lli_posix_acl != NULL && valid & OBD_MD_FLXATTRLS) { + if (size == 0) { + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { + memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, + sizeof(XATTR_NAME_ACL_ACCESS)); + rc += sizeof(XATTR_NAME_ACL_ACCESS); + } else { + GOTO(out_xattr, rc = -ERANGE); + } + } + } else { +getxattr_nocache: + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), + valid, name, NULL, 0, size, 0, &req); + + if (rc < 0) + GOTO(out_xattr, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc = body->mbo_eadatasize); + + if (size < body->mbo_eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->mbo_eadatasize, (int)size); + GOTO(out, rc = -ERANGE); + } + + if (body->mbo_eadatasize == 0) + GOTO(out, rc = -ENODATA); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->mbo_eadatasize); + if (!xdata) + GOTO(out, rc = -EFAULT); + + memcpy(buffer, xdata, body->mbo_eadatasize); + rc = body->mbo_eadatasize; + } + + EXIT; + +out_xattr: + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("%s: disabling user_xattr feature because " + "it is not supported on the server: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), rc); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } +out: + ptlrpc_req_finished(req); + return rc; +} + +static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) +{ + ssize_t rc; + + if (S_ISREG(inode->i_mode)) { + struct cl_object *obj = ll_i2info(inode)->lli_clob; + struct lu_env *env; + struct cl_layout cl = { + .cl_buf.lb_buf = buf, + .cl_buf.lb_len = buf_size, + }; + __u16 refcheck; + + if (obj == NULL) + RETURN(-ENODATA); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_object_layout_get(env, obj, &cl); + if (rc < 0) + GOTO(out_env, rc); + + if (cl.cl_size == 0) + GOTO(out_env, rc = -ENODATA); + + rc = cl.cl_size; + + if (buf_size == 0) + GOTO(out_env, rc); + + LASSERT(buf != NULL && rc <= buf_size); + + /* Do not return layout gen for getxattr() since + * otherwise it would confuse tar --xattr by + * recognizing layout gen as stripe offset when the + * file is restored. See LU-2809. 
*/ + if (((struct lov_mds_md *)buf)->lmm_magic == LOV_MAGIC_COMP_V1) + goto out_env; + + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; +out_env: + cl_env_put(env, &refcheck); + + RETURN(rc); + } else if (S_ISDIR(inode->i_mode)) { + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + struct ptlrpc_request *req = NULL; + + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, + &req, 0); + if (rc < 0) + GOTO(out_req, rc); + + if (buf_size == 0) + GOTO(out_req, rc = lmm_size); + + if (buf_size < lmm_size) + GOTO(out_req, rc = -ERANGE); + + memcpy(buf, lmm, lmm_size); + GOTO(out_req, rc = lmm_size); +out_req: + if (req != NULL) + ptlrpc_req_finished(req); + + return rc; + } else { + RETURN(-ENODATA); + } +} + +ssize_t ll_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t buf_size) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", + PFID(ll_inode2fid(inode)), inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + if (strcmp(name, XATTR_LUSTRE_LOV) == 0 || + strcmp(name, XATTR_NAME_LOV) == 0) + return ll_getxattr_lov(inode, buf, buf_size); + else + return ll_getxattr_common(inode, name, buf, buf_size, + OBD_MD_FLXATTR); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buf, size_t buf_size) +{ + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name; + ssize_t rc, rc2; + size_t len, rem; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_getxattr_common(inode, NULL, buf, buf_size, OBD_MD_FLXATTRLS); + if (rc < 0) + RETURN(rc); + + /* If we're being called to get the size of the xattr list + * (buf_size == 0) then just assume that a lustre.lov xattr + * exists. */ + if (buf_size == 0) + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); + + xattr_name = buf; + rem = rc; + + while (rem > 0) { + len = strnlen(xattr_name, rem - 1) + 1; + rem -= len; + if (xattr_type_filter(sbi, get_xattr26_type(xattr_name)) == 0) { + /* Skip OK xattr type, leave it in buffer. */ + xattr_name += len; + continue; + } + + /* Move up remaining xattrs in buffer removing the + * xattr that is not OK. */ + memmove(xattr_name, xattr_name + len, rem); + rc -= len; + } + + rc2 = ll_getxattr_lov(inode, NULL, 0); + if (rc2 == -ENODATA) + RETURN(rc); + + if (rc2 < 0) + RETURN(rc2); + + if (buf_size < rc + sizeof(XATTR_LUSTRE_LOV)) + RETURN(-ERANGE); + + memcpy(buf + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); + + RETURN(rc + sizeof(XATTR_LUSTRE_LOV)); +} diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c new file mode 100644 index 0000000000000..a001e5c2d8a7b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -0,0 +1,553 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2013, 2016, Intel Corporation. + * + * Author: Andrew Perepechko + * + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +/* If we ever have hundreds of extended attributes, we might want to consider + * using a hash or a tree structure instead of list for faster lookups. + */ +struct ll_xattr_entry { + struct list_head xe_list; /* protected with + * lli_xattrs_list_rwsem */ + char *xe_name; /* xattr name, \0-terminated */ + char *xe_value; /* xattr value */ + unsigned xe_namelen; /* strlen(xe_name) + 1 */ + unsigned xe_vallen; /* xattr value length */ +}; + +static struct kmem_cache *xattr_kmem; +static struct lu_kmem_descr xattr_caches[] = { + { + .ckd_cache = &xattr_kmem, + .ckd_name = "xattr_kmem", + .ckd_size = sizeof(struct ll_xattr_entry) + }, + { + .ckd_cache = NULL + } +}; + +int ll_xattr_init(void) +{ + return lu_kmem_init(xattr_caches); +} + +void ll_xattr_fini(void) +{ + lu_kmem_fini(xattr_caches); +} + +/** + * Initializes xattr cache for an inode. + * + * This initializes the xattr list and marks cache presence. + */ +static void ll_xattr_cache_init(struct ll_inode_info *lli) +{ + ENTRY; + + LASSERT(lli != NULL); + + INIT_LIST_HEAD(&lli->lli_xattrs); + ll_file_set_flag(lli, LLIF_XATTR_CACHE); +} + +/** + * This looks for a specific extended attribute. + * + * Find in @cache and return @xattr_name attribute in @xattr, + * for the NULL @xattr_name return the first cached @xattr. + * + * \retval 0 success + * \retval -ENODATA if not found + */ +static int ll_xattr_cache_find(struct list_head *cache, + const char *xattr_name, + struct ll_xattr_entry **xattr) +{ + struct ll_xattr_entry *entry; + + ENTRY; + + list_for_each_entry(entry, cache, xe_list) { + /* xattr_name == NULL means look for any entry */ + if (xattr_name == NULL || + strcmp(xattr_name, entry->xe_name) == 0) { + *xattr = entry; + CDEBUG(D_CACHE, "find: [%s]=%.*s\n", + entry->xe_name, entry->xe_vallen, + entry->xe_value); + RETURN(0); + } + } + + RETURN(-ENODATA); +} + +/** + * This adds an xattr. 
+ * + * Add @xattr_name attr with @xattr_val value and @xattr_val_len length, + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for the cached attr + * \retval -EPROTO if duplicate xattr is being added + */ +static int ll_xattr_cache_add(struct list_head *cache, + const char *xattr_name, + const char *xattr_val, + unsigned xattr_val_len) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); + RETURN(-EPROTO); + } + + OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS); + if (xattr == NULL) { + CDEBUG(D_CACHE, "failed to allocate xattr\n"); + RETURN(-ENOMEM); + } + + xattr->xe_namelen = strlen(xattr_name) + 1; + + OBD_ALLOC(xattr->xe_name, xattr->xe_namelen); + if (!xattr->xe_name) { + CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", + xattr->xe_namelen); + goto err_name; + } + OBD_ALLOC(xattr->xe_value, xattr_val_len); + if (!xattr->xe_value) { + CDEBUG(D_CACHE, "failed to alloc xattr value %d\n", + xattr_val_len); + goto err_value; + } + + memcpy(xattr->xe_name, xattr_name, xattr->xe_namelen); + memcpy(xattr->xe_value, xattr_val, xattr_val_len); + xattr->xe_vallen = xattr_val_len; + list_add(&xattr->xe_list, cache); + + CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, + xattr_val_len, xattr_val); + + RETURN(0); +err_value: + OBD_FREE(xattr->xe_name, xattr->xe_namelen); +err_name: + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(-ENOMEM); +} + +/** + * This removes an extended attribute from cache. + * + * Remove @xattr_name attribute from @cache. + * + * \retval 0 success + * \retval -ENODATA if @xattr_name is not cached + */ +static int ll_xattr_cache_del(struct list_head *cache, + const char *xattr_name) +{ + struct ll_xattr_entry *xattr; + + ENTRY; + + CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); + + if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { + list_del(&xattr->xe_list); + OBD_FREE(xattr->xe_name, xattr->xe_namelen); + OBD_FREE(xattr->xe_value, xattr->xe_vallen); + OBD_SLAB_FREE_PTR(xattr, xattr_kmem); + + RETURN(0); + } + + RETURN(-ENODATA); +} + +/** + * This iterates cached extended attributes. + * + * Walk over cached attributes in @cache and + * fill in @xld_buffer or only calculate buffer + * size if @xld_buffer is NULL. + * + * \retval >= 0 buffer list size + * \retval -ENODATA if the list cannot fit @xld_size buffer + */ +static int ll_xattr_cache_list(struct list_head *cache, + char *xld_buffer, + int xld_size) +{ + struct ll_xattr_entry *xattr, *tmp; + int xld_tail = 0; + + ENTRY; + + list_for_each_entry_safe(xattr, tmp, cache, xe_list) { + CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", + xld_buffer, xld_tail, xattr->xe_name); + + if (xld_buffer) { + xld_size -= xattr->xe_namelen; + if (xld_size < 0) + break; + memcpy(&xld_buffer[xld_tail], + xattr->xe_name, xattr->xe_namelen); + } + xld_tail += xattr->xe_namelen; + } + + if (xld_size < 0) + RETURN(-ERANGE); + + RETURN(xld_tail); +} + +/** + * Check if the xattr cache is initialized (filled). + * + * \retval 0 @cache is not initialized + * \retval 1 @cache is initialized + */ +static int ll_xattr_cache_valid(struct ll_inode_info *lli) +{ + return ll_file_test_flag(lli, LLIF_XATTR_CACHE); +} + +/** + * This finalizes the xattr cache. + * + * Free all xattr memory. @lli is the inode info pointer. 
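+ * The caller is expected to hold lli_xattrs_list_rwsem for write, as
+ * ll_xattr_cache_destroy() and the refill error path do.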
+ * + * \retval 0 no error occured + */ +static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) +{ + ENTRY; + + if (!ll_xattr_cache_valid(lli)) + RETURN(0); + + while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) + /* empty loop */ ; + + ll_file_clear_flag(lli, LLIF_XATTR_CACHE); + + RETURN(0); +} + +int ll_xattr_cache_destroy(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ENTRY; + + down_write(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_destroy_locked(lli); + up_write(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + +/** + * Match or enqueue a PR lock. + * + * Find or request an LDLM lock with xattr data. + * Since LDLM does not provide API for atomic match_or_enqueue, + * the function handles it with a separate enq lock. + * If successful, the function exits with the list lock held. + * + * \retval 0 no error occured + * \retval -ENOMEM not enough memory + */ +static int ll_xattr_find_get_lock(struct inode *inode, + struct lookup_intent *oit, + struct ptlrpc_request **req) +{ + enum ldlm_mode mode; + struct lustre_handle lockh = { 0 }; + struct md_op_data *op_data; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_export *exp = sbi->ll_md_exp; + int rc; + + ENTRY; + + mutex_lock(&lli->lli_xattrs_enq_lock); + /* inode may have been shrunk and recreated, so data is gone, match lock + * only when data exists. */ + if (ll_xattr_cache_valid(lli)) { + /* Try matching first. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, + LCK_PR); + if (mode != 0) { + /* fake oit in mdc_revalidate_lock() manner */ + oit->it_lock_handle = lockh.cookie; + oit->it_lock_mode = mode; + goto out; + } + } + + /* Enqueue if the lock isn't cached locally. */ + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_xattrs_enq_lock); + RETURN(PTR_ERR(op_data)); + } + + op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; + + rc = md_intent_lock(exp, op_data, oit, req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + *req = oit->it_request; + + if (rc < 0) { + CDEBUG(D_CACHE, "md_intent_lock failed with %d for fid "DFID"\n", + rc, PFID(ll_inode2fid(inode))); + mutex_unlock(&lli->lli_xattrs_enq_lock); + RETURN(rc); + } + +out: + down_write(&lli->lli_xattrs_list_rwsem); + mutex_unlock(&lli->lli_xattrs_enq_lock); + + RETURN(0); +} + +/** + * Refill the xattr cache. + * + * Fetch and cache the whole of xattrs for @inode, acquiring a read lock. + * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + */ +static int ll_xattr_cache_refill(struct inode *inode) +{ + struct lookup_intent oit = { .it_op = IT_GETXATTR }; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + const char *xdata, *xval, *xtail, *xvtail; + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body; + __u32 *xsizes; + int rc = 0, i; + + ENTRY; + + rc = ll_xattr_find_get_lock(inode, &oit, &req); + if (rc) + GOTO(err_req, rc); + + /* Do we have the data at this point? */ + if (ll_xattr_cache_valid(lli)) { + ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); + ll_intent_drop_lock(&oit); + GOTO(err_req, rc = 0); + } + + /* Matched but no cache? Cancelled on error by a parallel refill. 
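+ * In that case drop the lock and return -EAGAIN so that
+ * ll_getxattr_common() falls back to the uncached getxattr path.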
*/ + if (unlikely(req == NULL)) { + CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); + ll_intent_drop_lock(&oit); + GOTO(err_unlock, rc = -EAGAIN); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("no MDT BODY in the refill xattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->mbo_eadatasize); + xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, + body->mbo_aclsize); + xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, + body->mbo_max_mdsize * + sizeof(__u32)); + if (xdata == NULL || xval == NULL || xsizes == NULL) { + CERROR("wrong setxattr reply\n"); + GOTO(err_cancel, rc = -EPROTO); + } + + xtail = xdata + body->mbo_eadatasize; + xvtail = xval + body->mbo_aclsize; + + CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); + + ll_xattr_cache_init(lli); + + for (i = 0; i < body->mbo_max_mdsize; i++) { + CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); + /* Perform consistency checks: attr names and vals in pill */ + if (memchr(xdata, 0, xtail - xdata) == NULL) { + CERROR("xattr protocol violation (names are broken)\n"); + rc = -EPROTO; + } else if (xval + *xsizes > xvtail) { + CERROR("xattr protocol violation (vals are broken)\n"); + rc = -EPROTO; + } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { + rc = -ENOMEM; + } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { + /* Filter out ACL ACCESS since it's cached separately */ + CDEBUG(D_CACHE, "not caching %s\n", + XATTR_NAME_ACL_ACCESS); + rc = 0; + } else if (!strcmp(xdata, "security.selinux")) { + /* Filter out security.selinux, it is cached in slab */ + CDEBUG(D_CACHE, "not caching security.selinux\n"); + rc = 0; + } else { + rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, + *xsizes); + } + if (rc < 0) { + ll_xattr_cache_destroy_locked(lli); + GOTO(err_cancel, rc); + } + xdata += strlen(xdata) + 1; + xval += *xsizes; + xsizes++; + } + + if (xdata != xtail || xval != xvtail) + CERROR("a hole in xattr data\n"); + + ll_set_lock_data(sbi->ll_md_exp, inode, &oit, NULL); + ll_intent_drop_lock(&oit); + + ptlrpc_req_finished(req); + RETURN(0); + +err_cancel: + ldlm_lock_decref_and_cancel((struct lustre_handle *) + &oit.it_lock_handle, + oit.it_lock_mode); +err_unlock: + up_write(&lli->lli_xattrs_list_rwsem); +err_req: + if (rc == -ERANGE) + rc = -EAGAIN; + + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Get an xattr value or list xattrs using the write-through cache. + * + * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or + * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. + * The resulting value/list is stored in @buffer if the former + * is not larger than @size. 
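+ *
+ * A minimal call sketch (illustrative only; "buf" is any caller buffer):
+ *
+ *	rc = ll_xattr_cache_get(inode, "user.foo", buf, sizeof(buf),
+ *				OBD_MD_FLXATTR);
+ *
+ * A negative rc is an error; otherwise rc is the value length. Passing
+ * size == 0 only probes the size needed for the value or name list.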
+ * + * \retval 0 no error occured + * \retval -EPROTO network protocol error + * \retval -ENOMEM not enough memory for the cache + * \retval -ERANGE the buffer is not large enough + * \retval -ENODATA no such attr or the list is empty + */ +int ll_xattr_cache_get(struct inode *inode, + const char *name, + char *buffer, + size_t size, + __u64 valid) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + + ENTRY; + + LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); + + down_read(&lli->lli_xattrs_list_rwsem); + if (!ll_xattr_cache_valid(lli)) { + up_read(&lli->lli_xattrs_list_rwsem); + rc = ll_xattr_cache_refill(inode); + if (rc) + RETURN(rc); + downgrade_write(&lli->lli_xattrs_list_rwsem); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); + } + + if (valid & OBD_MD_FLXATTR) { + struct ll_xattr_entry *xattr; + + rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); + if (rc == 0) { + rc = xattr->xe_vallen; + /* zero size means we are only requested size in rc */ + if (size != 0) { + if (size >= xattr->xe_vallen) + memcpy(buffer, xattr->xe_value, + xattr->xe_vallen); + else + rc = -ERANGE; + } + } + } else if (valid & OBD_MD_FLXATTRLS) { + rc = ll_xattr_cache_list(&lli->lli_xattrs, + size ? buffer : NULL, size); + } + + GOTO(out, rc); +out: + up_read(&lli->lli_xattrs_list_rwsem); + + RETURN(rc); +} + diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c new file mode 100644 index 0000000000000..82019cc8caef6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * GPL HEADER END + */ + +/* + * Copyright (c) 2014 Bull SAS + * + * Copyright (c) 2015, 2016, Intel Corporation. + * Author: Sebastien Buisson sebastien.buisson@bull.net + */ + +/* + * lustre/llite/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include "llite_internal.h" + +#ifndef XATTR_SELINUX_SUFFIX +# define XATTR_SELINUX_SUFFIX "selinux" +#endif + +#ifndef XATTR_NAME_SELINUX +# define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX +#endif + +/* + * Check for LL_SBI_FILE_SECCTX before calling. + */ +int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, + const char **secctx_name, void **secctx, + __u32 *secctx_size) +{ +#ifdef HAVE_SECURITY_DENTRY_INIT_SECURITY + int rc; + + /* security_dentry_init_security() is strange. 
Like + * security_inode_init_security() it may return a context (provided a + * Linux security module is enabled) but unlike + * security_inode_init_security() it does not return to us the name of + * the extended attribute to store the context under (for example + * "security.selinux"). So we only call it when we think we know what + * the name of the extended attribute will be. This is OK-ish since + * SELinux is the only module that implements + * security_dentry_init_security(). Note that the NFS client code just + * calls it and assumes that if anything is returned then it must come + * from SELinux. */ + + if (!selinux_is_enabled()) + return 0; + + rc = security_dentry_init_security(dentry, mode, name, secctx, + secctx_size); + if (rc < 0) + return rc; + + *secctx_name = XATTR_NAME_SELINUX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ + + return 0; +} + +#ifdef HAVE_SECURITY_IINITSEC_CALLBACK +/** + * A helper function for ll_security_inode_init_security() + * that takes care of setting xattrs + * + * Get security context of @inode from @xattr_array, + * and put it in 'security.xxx' xattr of dentry + * stored in @fs_info. + * + * \retval 0 success + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to set xattr + */ +static int +ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + struct dentry *dentry = fs_info; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name; xattr++) { + char *full_name; + + full_name = kasprintf(GFP_KERNEL, "%s%s", + XATTR_SECURITY_PREFIX, xattr->name); + if (!full_name) { + err = -ENOMEM; + break; + } + + err = __vfs_setxattr(dentry, inode, full_name, xattr->value, + xattr->value_len, XATTR_CREATE); + kfree(full_name); + if (err < 0) + break; + } + return err; +} + +/** + * Initializes security context + * + * Get security context of @inode in @dir, + * and put it in 'security.xxx' xattr of @dentry. + * + * \retval 0 success, or SELinux is disabled + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to get security context or set xattr + */ +int +ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + if (!selinux_is_enabled()) + return 0; + + return ll_security_inode_init_security(inode, dir, NULL, NULL, 0, + &ll_initxattrs, dentry); +} +#else /* !HAVE_SECURITY_IINITSEC_CALLBACK */ +/** + * Initializes security context + * + * Get security context of @inode in @dir, + * and put it in 'security.xxx' xattr of @dentry. 
+ * + * \retval 0 success, or SELinux is disabled + * \retval -ENOMEM if no memory could be allocated for xattr name + * \retval < 0 failure to get security context or set xattr + */ +int +ll_inode_init_security(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + char *full_name; + void *value; + char *name; + size_t len; + int err; + + if (!selinux_is_enabled()) + return 0; + + err = ll_security_inode_init_security(inode, dir, &name, &value, &len, + NULL, dentry); + if (err != 0) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + full_name = kasprintf(GFP_KERNEL, "%s%s", XATTR_SECURITY_PREFIX, name); + if (!full_name) + GOTO(out_free, err = -ENOMEM); + + err = __vfs_setxattr(dentry, inode, full_name, value, len, + XATTR_CREATE); + kfree(full_name); +out_free: + kfree(name); + kfree(value); + + return err; +} +#endif /* HAVE_SECURITY_IINITSEC_CALLBACK */ diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c new file mode 100644 index 0000000000000..b5ec306dcc224 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -0,0 +1,84 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) +{ + struct obd_device *obd = lmv2obd_dev(lmv); + int rc; + ENTRY; + + /* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed */ + if (!fid_is_sane(fid) || !(fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid)))) { + CERROR("%s: invalid FID "DFID"\n", obd->obd_name, PFID(fid)); + RETURN(-EINVAL); + } + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("Error while looking for mds number. 
Seq %#llx" + ", err = %d\n", fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->desc.ld_tgt_count) { + CERROR("FLD lookup got invalid mds #%x (max: %x) " + "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count, + PFID(fid)); + rc = -EINVAL; + } + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c new file mode 100644 index 0000000000000..08a5a609e3fdb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -0,0 +1,516 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, + const struct lu_fid *parent_fid, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + ENTRY; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + LASSERT((body->mbo_valid & OBD_MD_MDS)); + + /* + * Unfortunately, we have to lie to MDC/MDS to retrieve + * attributes llite needs and provideproper locking. + */ + if (it->it_op & IT_LOOKUP) + it->it_op = IT_GETATTR; + + /* + * We got LOOKUP lock, but we really need attrs. 
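+ * Stash the LOOKUP lock handle in plock, re-enqueue the intent on the
+ * MDT that actually holds the object, and restore the parent lock once
+ * the attributes have been fetched.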
+ */ + pmode = it->it_lock_mode; + if (pmode) { + plock.cookie = it->it_lock_handle; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + LASSERT(fid_is_sane(&body->mbo_fid1)); + + tgt = lmv_find_target(lmv, &body->mbo_fid1); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + op_data->op_fid1 = body->mbo_fid1; + /* Sent the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + } + + op_data->op_bias = MDS_CROSS_REF; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", + PFID(&body->mbo_fid1), tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, + extra_lock_flags); + if (rc) + GOTO(out_free_op_data, rc); + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. + */ + if (it->it_lock_mode != 0) { + it->it_remote_lock_handle = + it->it_lock_handle; + it->it_remote_lock_mode = it->it_lock_mode; + } + + if (pmode) { + it->it_lock_handle = plock.cookie; + it->it_lock_mode = pmode; + } + + EXIT; +out_free_op_data: + OBD_FREE_PTR(op_data); +out: + if (rc && pmode) + ldlm_lock_decref(&plock, pmode); + + ptlrpc_req_finished(*reqp); + *reqp = req; + return rc; +} + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int i; + int rc = 0; + + ENTRY; + + /** + * revalidate slaves has some problems, temporarily return, + * we may not need that + */ + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + /** + * Loop over the stripe information, check validity and update them + * from MDS if needed. + */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lu_fid fid; + struct lookup_intent it = { .it_op = IT_GETATTR }; + struct lustre_handle *lockh = NULL; + struct lmv_tgt_desc *tgt = NULL; + struct inode *inode; + + fid = lsm->lsm_md_oinfo[i].lmo_fid; + inode = lsm->lsm_md_oinfo[i].lmo_root; + + /* + * Prepare op_data for revalidating. Note that @fid2 shluld be + * defined otherwise it will go to server and take new lock + * which is not needed here. 
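+ * (Both op_fid1 and op_fid2 are set to the slave FID below for exactly
+ * that reason.)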
+ */ + memset(op_data, 0, sizeof(*op_data)); + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + + tgt = lmv_locate_mds(lmv, op_data, &fid); + if (IS_ERR(tgt)) + GOTO(cleanup, rc = PTR_ERR(tgt)); + + CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n", + PFID(&fid), tgt->ltd_idx); + + if (req != NULL) { + ptlrpc_req_finished(req); + req = NULL; + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, + cb_blocking, extra_lock_flags); + if (rc < 0) + GOTO(cleanup, rc); + + lockh = (struct lustre_handle *)&it.it_lock_handle; + if (rc > 0 && req == NULL) { + /* slave inode is still valid */ + CDEBUG(D_INODE, "slave "DFID" is still valid.\n", + PFID(&fid)); + rc = 0; + } else { + /* refresh slave from server */ + body = req_capsule_server_get(&req->rq_pill, + &RMF_MDT_BODY); + if (body == NULL) { + if (it.it_lock_mode && lockh) { + ldlm_lock_decref(lockh, + it.it_lock_mode); + it.it_lock_mode = 0; + } + GOTO(cleanup, rc = -ENOENT); + } + + i_size_write(inode, body->mbo_size); + inode->i_blocks = body->mbo_blocks; + set_nlink(inode, body->mbo_nlink); + LTIME_S(inode->i_atime) = body->mbo_atime; + LTIME_S(inode->i_ctime) = body->mbo_ctime; + LTIME_S(inode->i_mtime) = body->mbo_mtime; + } + + md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL); + if (it.it_lock_mode != 0 && lockh != NULL) { + ldlm_lock_decref(lockh, it.it_lock_mode); + it.it_lock_mode = 0; + } + } + +cleanup: + if (req != NULL) + ptlrpc_req_finished(req); + + OBD_FREE_PTR(op_data); + RETURN(rc); +} + + +/* + * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) + * may be split dir. + */ +static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + ENTRY; + + if (it->it_flags & MDS_OPEN_BY_FID) { + LASSERT(fid_is_sane(&op_data->op_fid2)); + + /* for striped directory, we can't know parent stripe fid + * without name, but we can set it to child fid, and MDT + * will obtain it from linkea in open in such case. */ + if (op_data->op_mea1 != NULL) + op_data->op_fid1 = op_data->op_fid2; + + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_idx; + } else { + LASSERT(fid_is_sane(&op_data->op_fid1)); + LASSERT(fid_is_zero(&op_data->op_fid2)); + LASSERT(op_data->op_name != NULL); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For lookup(IT_CREATE) cases allocate new fid and setup FLD + * for it. + */ + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc != 0) + RETURN(rc); + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID"," + " name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. 
+ */ + if ((it->it_disposition & DISP_LOOKUP_NEG) && + !(it->it_disposition & DISP_OPEN_CREATE) && + !(it->it_disposition & DISP_OPEN_OPEN)) + RETURN(rc); + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, + cb_blocking, extra_lock_flags); + if (rc != 0) + RETURN(rc); + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +/* + * Handler for: getattr, lookup and revalidate cases. + */ +static int +lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + struct lmv_stripe_md *lsm = op_data->op_mea1; + int rc = 0; + ENTRY; + + /* If it returns ERR_PTR(-EBADFD) then it is an unknown hash type + * it will try all stripes to locate the object */ + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt) && (PTR_ERR(tgt) != -EBADFD)) + RETURN(PTR_ERR(tgt)); + + /* Both migrating dir and unknown hash dir need to try + * all of sub-stripes */ + if (lsm != NULL && !lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { + struct lmv_oinfo *oinfo; + + oinfo = &lsm->lsm_md_oinfo[0]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%u lsm=%p lsm_magic=%x\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_idx, lsm, lsm == NULL ? -1 : lsm->lsm_md_magic); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + if (*reqp == NULL) { + /* If RPC happens, lsm information will be revalidated + * during update_inode process (see ll_update_lsm_md) */ + if (op_data->op_mea2 != NULL) { + rc = lmv_revalidate_slaves(exp, op_data->op_mea2, + cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + } + RETURN(rc); + } else if (it_disposition(it, DISP_LOOKUP_NEG) && lsm != NULL && + lmv_need_try_all_stripes(lsm)) { + /* For migrating and unknown hash type directory, it will + * try to target the entry on other stripes */ + int stripe_index; + + for (stripe_index = 1; + stripe_index < lsm->lsm_md_stripe_count && + it_disposition(it, DISP_LOOKUP_NEG); stripe_index++) { + struct lmv_oinfo *oinfo; + + /* release the previous request */ + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + *reqp = NULL; + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + tgt = lmv_find_target(lmv, &oinfo->lmo_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "Try other stripes " DFID"\n", + PFID(&oinfo->lmo_fid)); + + op_data->op_fid1 = oinfo->lmo_fid; + it->it_disposition &= ~DISP_ENQ_COMPLETE; + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); + if (rc != 0) + RETURN(rc); + } + } + + if (!it_has_reply_body(it)) + RETURN(0); + + /* + * MDS has returned success. 
Probably name has been resolved in + * remote inode. Let's check this. + */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, + extra_lock_flags); + if (rc != 0) + RETURN(rc); + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + } + + RETURN(rc); +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for "DFID" '%.*s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT | IT_GETXATTR)) + rc = lmv_intent_lookup(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + + if (rc < 0) { + struct lustre_handle lock_handle; + + if (it->it_lock_mode != 0) { + lock_handle.cookie = it->it_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_lock_mode); + } + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + + if (it->it_remote_lock_mode != 0) { + lock_handle.cookie = it->it_remote_lock_handle; + ldlm_lock_decref_and_cancel(&lock_handle, + it->it_remote_lock_mode); + } + + it->it_remote_lock_handle = 0; + it->it_remote_lock_mode = 0; + } + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h new file mode 100644 index 0000000000000..a9dd6644a2697 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -0,0 +1,161 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include +#include +#include + +#define LMV_MAX_TGT_COUNT 128 + +#define LL_IT2STR(it) \ + ((it) ? 
ldlm_it2str((it)->it_op) : "0") + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags); + +static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) +{ + return container_of0(lmv, struct obd_device, u.lmv); +} + +static inline struct lmv_tgt_desc * +lmv_get_target(struct lmv_obd *lmv, u32 mdt_idx, int *index) +{ + int i; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL) + continue; + + if (lmv->tgts[i]->ltd_idx == mdt_idx) { + if (index != NULL) + *index = i; + return lmv->tgts[i]; + } + } + + return ERR_PTR(-ENODEV); +} + +static inline int +lmv_find_target_index(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + struct lmv_tgt_desc *ltd; + u32 mdt_idx = 0; + int index = 0; + + if (lmv->desc.ld_tgt_count > 1) { + int rc; + rc = lmv_fld_lookup(lmv, fid, &mdt_idx); + if (rc < 0) + return rc; + } + + ltd = lmv_get_target(lmv, mdt_idx, &index); + if (IS_ERR(ltd)) + return PTR_ERR(ltd); + + return index; +} + +static inline struct lmv_tgt_desc * +lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + int index; + + index = lmv_find_target_index(lmv, fid); + if (index < 0) + return ERR_PTR(index); + + return lmv->tgts[index]; +} + +static inline int lmv_stripe_md_size(int stripe_count) +{ + struct lmv_stripe_md *lsm; + + return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); +} + +static inline const struct lmv_oinfo * +lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, + int namelen) +{ + int stripe_index; + + stripe_index = lmv_name_to_stripe_index(lsm->lsm_md_hash_type, + lsm->lsm_md_stripe_count, + name, namelen); + if (stripe_index < 0) + return ERR_PTR(stripe_index); + + LASSERTF(stripe_index < lsm->lsm_md_stripe_count, + "stripe_index = %d, stripe_count = %d hash_type = %x" + "name = %.*s\n", stripe_index, lsm->lsm_md_stripe_count, + lsm->lsm_md_hash_type, namelen, name); + + return &lsm->lsm_md_oinfo[stripe_index]; +} + +static inline bool lmv_need_try_all_stripes(const struct lmv_stripe_md *lsm) +{ + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type) || + lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid); +/* lproc_lmv.c */ +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_lmv_obd_vars[]; +#endif +extern struct file_operations lmv_proc_target_fops; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c new file mode 100644 index 0000000000000..3fed10fc1a1c0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -0,0 +1,3193 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_check_connect(struct obd_device *obd); + +static void lmv_activate_target(struct lmv_obd *lmv, + struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); + + tgt->ltd_exp->exp_obd->obd_inactive = !activate; +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, + const struct obd_uuid *uuid, + int activate) +{ + struct lmv_tgt_desc *tgt = NULL; + struct obd_device *obd; + __u32 i; + int rc = 0; + ENTRY; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i, + tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (i == lmv->desc.ld_tgt_count) + GOTO(out_lmv_lock, rc = -EINVAL); + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) + GOTO(out_lmv_lock, rc = -ENOTCONN); + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, i); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + GOTO(out_lmv_lock, rc); + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? "" : "in"); + lmv_activate_target(lmv, tgt, activate); + EXIT; + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + + return (tgt == NULL) ? 
NULL : obd_get_uuid(tgt->ltd_exp); +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + ENTRY; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + RETURN(rc); + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. --umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } + + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev); + + RETURN(rc); +} + +static int lmv_connect(const struct lu_env *env, + struct obd_export **pexp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + struct obd_export *exp; + int rc; + ENTRY; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + RETURN(rc); + } + + exp = class_conn2export(&conn); + + lmv->connected = 0; + lmv->conn_data = *data; + + if (lmv->targets_proc_entry == NULL) { + lmv->targets_proc_entry = lprocfs_register("target_obds", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lmv->targets_proc_entry)) { + CERROR("%s: cannot register " + "/proc/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name); + lmv->targets_proc_entry = NULL; + } + } + + rc = lmv_check_connect(obd); + if (rc != 0) + GOTO(out_proc, rc); + + *pexp = exp; + + RETURN(rc); + +out_proc: + if (lmv->targets_proc_entry != NULL) + lprocfs_remove(&lmv->targets_proc_entry); + + class_disconnect(exp); + + return rc; +} + +static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + __u32 i; + int rc = 0; + int change = 0; + ENTRY; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + + if (change == 0) + RETURN(0); + + if (lmv->connected == 0) + RETURN(0); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + struct lmv_tgt_desc *tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { + CWARN("%s: NULL export for %d\n", obd->obd_name, i); + continue; + } + + rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d:" + " rc = %d\n", obd->obd_name, i, rc); + break; + } + } + RETURN(rc); +} + +#define MAX_STRING_SIZE 128 + +int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + struct obd_export 
*mdc_exp; + struct lu_fld_target target; + int rc; + ENTRY; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid, + &lmv->conn_data, NULL); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + RETURN(rc); + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_idx; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. + */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + if (lmv->targets_proc_entry != NULL) { + struct proc_dir_entry *mdc_symlink; + + LASSERT(mdc_obd->obd_type != NULL); + LASSERT(mdc_obd->obd_type->typ_name != NULL); + mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, + lmv->targets_proc_entry, + "../../../%s/%s", + mdc_obd->obd_type->typ_name, + mdc_obd->obd_name); + if (mdc_symlink == NULL) { + CERROR("cannot register LMV target " + "/proc/fs/lustre/%s/%s/target_obds/%s\n", + obd->obd_type->typ_name, obd->obd_name, + mdc_obd->obd_name); + } + } + RETURN(0); +} + +static void lmv_del_target(struct lmv_obd *lmv, int index) +{ + if (lmv->tgts[index] == NULL) + return; + + OBD_FREE_PTR(lmv->tgts[index]); + lmv->tgts[index] = NULL; + return; +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct obd_device *mdc_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int orig_tgt_count = 0; + int rc = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + RETURN(-EINVAL); + } + + mutex_lock(&lmv->lmv_init_mutex); + if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { + tgt = lmv->tgts[index]; + CERROR("%s: UUID %s already assigned at LOV target index %d:" + " rc = %d\n", obd->obd_name, + obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); + mutex_unlock(&lmv->lmv_init_mutex); + RETURN(-EEXIST); + } + + if (index >= lmv->tgts_size) { + /* We need to reallocate the lmv target array. 
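+ * Grow it to the next power of two, copy the old entries across and
+ * only then free the old array.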
*/ + struct lmv_tgt_desc **newtgts, **old = NULL; + __u32 newsize = 1; + __u32 oldsize = 0; + + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + mutex_unlock(&lmv->lmv_init_mutex); + RETURN(-ENOMEM); + } + + if (lmv->tgts_size) { + memcpy(newtgts, lmv->tgts, + sizeof(*newtgts) * lmv->tgts_size); + old = lmv->tgts; + oldsize = lmv->tgts_size; + } + + lmv->tgts = newtgts; + lmv->tgts_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts, + lmv->tgts_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lmv->lmv_init_mutex); + RETURN(-ENOMEM); + } + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_idx = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + lmv->tgts[index] = tgt; + if (index >= lmv->desc.ld_tgt_count) { + orig_tgt_count = lmv->desc.ld_tgt_count; + lmv->desc.ld_tgt_count = index + 1; + } + + if (lmv->connected == 0) { + /* lmv_check_connect() will connect this target. */ + mutex_unlock(&lmv->lmv_init_mutex); + RETURN(0); + } + + /* Otherwise let's connect it ourselves */ + mutex_unlock(&lmv->lmv_init_mutex); + rc = lmv_connect_mdc(obd, tgt); + if (rc != 0) { + spin_lock(&lmv->lmv_lock); + if (lmv->desc.ld_tgt_count == index + 1) + lmv->desc.ld_tgt_count = orig_tgt_count; + memset(tgt, 0, sizeof(*tgt)); + spin_unlock(&lmv->lmv_lock); + } else { + int easize = sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * sizeof(struct lu_fid); + lmv_init_ea_size(obd->obd_self_export, easize, 0); + } + + RETURN(rc); +} + +static int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + __u32 i; + int rc; + int easize; + ENTRY; + + if (lmv->connected) + RETURN(0); + + mutex_lock(&lmv->lmv_init_mutex); + if (lmv->connected) { + mutex_unlock(&lmv->lmv_init_mutex); + RETURN(0); + } + + if (lmv->desc.ld_tgt_count == 0) { + mutex_unlock(&lmv->lmv_init_mutex); + CERROR("%s: no targets configured.\n", obd->obd_name); + RETURN(-EINVAL); + } + + LASSERT(lmv->tgts != NULL); + + if (lmv->tgts[0] == NULL) { + mutex_unlock(&lmv->lmv_init_mutex); + CERROR("%s: no target configured for index 0.\n", + obd->obd_name); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + obd->obd_uuid.uuid, obd->obd_name); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + rc = lmv_connect_mdc(obd, tgt); + if (rc) + GOTO(out_disc, rc); + } + + lmv->connected = 1; + easize = lmv_mds_md_size(lmv->desc.ld_tgt_count, LMV_MAGIC); + lmv_init_ea_size(obd->obd_self_export, easize, 0); + mutex_unlock(&lmv->lmv_init_mutex); + RETURN(0); + + out_disc: + while (i-- > 0) { + int rc2; + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + tgt->ltd_active = 0; + if (tgt->ltd_exp) { + --lmv->desc.ld_active_tgt_count; + rc2 = obd_disconnect(tgt->ltd_exp); + if (rc2) { + CERROR("LMV target %s disconnect on " + "MDC idx %d: error %d\n", + tgt->ltd_uuid.uuid, i, rc2); + } + } + } + + mutex_unlock(&lmv->lmv_init_mutex); + + RETURN(rc); +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + ENTRY; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = 
obd->obd_no_recov; + + if (lmv->targets_proc_entry != NULL) + lprocfs_remove_proc_entry(mdc_obd->obd_name, + lmv->targets_proc_entry); + } + + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finanize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + __u32 i; + ENTRY; + + if (!lmv->tgts) + goto out_local; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + lmv_disconnect_mdc(obd, lmv->tgts[i]); + } + + if (lmv->targets_proc_entry != NULL) + lprocfs_remove(&lmv->targets_proc_entry); + else + CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", + obd->obd_type->typ_name, obd->obd_name); + +out_local: + /* + * This is the case when no real connection is established by + * lmv_check_connect(). + */ + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + lmv->connected = 0; + + RETURN(rc); +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, + void __user *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + struct lu_fid root_fid; + int remote_gf_size = 0; + int rc; + + gf = karg; + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + root_fid = *gf->gf_u.gf_root_fid; + LASSERT(fid_is_sane(&root_fid)); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); + if (rc != 0 && rc != -EREMOTE) + GOTO(out_fid2path, rc); + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segement to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_u.gf_path) + + strlen(gf->gf_u.gf_path) > ori_gf->gf_pathlen) + GOTO(out_fid2path, rc = -EOVERFLOW); + + ptr = ori_gf->gf_u.gf_path; + + memmove(ptr + strlen(gf->gf_u.gf_path) + 1, ptr, + strlen(ori_gf->gf_u.gf_path)); + + strncpy(ptr, gf->gf_u.gf_path, + strlen(gf->gf_u.gf_path)); + ptr += strlen(gf->gf_u.gf_path); + *ptr = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_u.gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + GOTO(out_fid2path, rc); + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) + GOTO(out_fid2path, rc = -ENOMEM); + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + GOTO(out_fid2path, rc = -EINVAL); + } + + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + GOTO(out_fid2path, rc = -EINVAL); + + remote_gf->gf_fid = 
gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_u.gf_path, 0, remote_gf->gf_pathlen); + *remote_gf->gf_u.gf_root_fid = root_fid; + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + RETURN(rc); +} + +static int lmv_hsm_req_count(struct lmv_obd *lmv, + const struct hsm_user_request *hur, + const struct lmv_tgt_desc *tgt_mds) +{ + __u32 i; + int nr = 0; + struct lmv_tgt_desc *curr_tgt; + + /* count how many requests must be sent to the given target */ + for (i = 0; i < hur->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) + nr++; + } + return nr; +} + +static int lmv_hsm_req_build(struct lmv_obd *lmv, + struct hsm_user_request *hur_in, + const struct lmv_tgt_desc *tgt_mds, + struct hsm_user_request *hur_out) +{ + __u32 i, nr_out; + struct lmv_tgt_desc *curr_tgt; + + /* build the hsm_user_request for the given target */ + hur_out->hur_request = hur_in->hur_request; + nr_out = 0; + for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { + curr_tgt = lmv_find_target(lmv, + &hur_in->hur_user_item[i].hui_fid); + if (IS_ERR(curr_tgt)) + RETURN(PTR_ERR(curr_tgt)); + if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { + hur_out->hur_user_item[nr_out] = + hur_in->hur_user_item[i]; + nr_out++; + } + } + hur_out->hur_request.hr_itemcount = nr_out; + memcpy(hur_data(hur_out), hur_data(hur_in), + hur_in->hur_request.hr_data_len); + + RETURN(0); +} + +static int lmv_hsm_ct_unregister(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + __u32 i; + int rc; + ENTRY; + + /* unregister request (call from llapi_hsm_copytool_fini) */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + struct lmv_tgt_desc *tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + /* best effort: try to clean as much as possible + * (continue on error) */ + obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + } + + /* Whatever the result, remove copytool from kuc groups. + * Unreached coordinators will get EPIPE on next requests + * and will unregister automatically. + */ + rc = libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + + RETURN(rc); +} + +static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, + int len, struct lustre_kernelcomm *lk, + void __user *uarg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct file *filp; + __u32 i, j; + int err; + bool any_set = false; + struct kkuc_ct_data kcd = { + .kcd_magic = KKUC_CT_DATA_MAGIC, + .kcd_archive = lk->lk_data, + }; + int rc = 0; + ENTRY; + + filp = fget(lk->lk_wfd); + if (!filp) + RETURN(-EBADF); + + rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, + lk->lk_group, &kcd, sizeof(kcd)); + if (rc) + GOTO(err_fput, rc); + + /* All or nothing: try to register to all MDS. + * In case of failure, unregister from previous MDS, + * except if it because of inactive target. 
*/ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + struct lmv_tgt_desc *tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); + if (err) { + if (tgt->ltd_active) { + /* permanent error */ + CERROR("%s: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + lmv2obd_dev(lmv)->obd_name, + tgt->ltd_uuid.uuid, i, cmd, err); + rc = err; + lk->lk_flags |= LK_FLG_STOP; + /* unregister from previous MDS */ + for (j = 0; j < i; j++) { + tgt = lmv->tgts[j]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + obd_iocontrol(cmd, tgt->ltd_exp, len, + lk, uarg); + } + GOTO(err_kkuc_rem, rc); + } + /* else: transient error. + * kuc will register to the missing MDT + * when it is back */ + } else { + any_set = true; + } + } + + if (!any_set) + /* no registration done: return error */ + GOTO(err_kkuc_rem, rc = -ENOTCONN); + + RETURN(0); + +err_kkuc_rem: + libcfs_kkuc_group_rem(&obd->obd_uuid, lk->lk_uid, lk->lk_group); + +err_fput: + fput(filp); + return rc; +} + + + + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void __user *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + __u32 i = 0; + int rc = 0; + int set = 0; + __u32 count = lmv->desc.ld_tgt_count; + ENTRY; + + if (count == 0) + RETURN(-ENOTTY); + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if ((index >= count)) + RETURN(-ENODEV); + + tgt = lmv->tgts[index]; + if (tgt == NULL || !tgt->ltd_active) + RETURN(-ENODATA); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + if (!mdc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + if (count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lmv->tgts[qctl->qc_idx]; + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt != NULL && tgt->ltd_exp != NULL); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + tgt = lmv->tgts[0]; + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-ENODATA); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_FID2MDTIDX: { + struct lu_fid *fid = karg; + int mdt_index; + + rc = lmv_fld_lookup(lmv, fid, &mdt_index); + if (rc != 0) + 
RETURN(rc); + + /* Note: this is from llite(see ll_dir_ioctl()), @uarg does not + * point to user space memory for FID2MDTIDX. */ + *(__u32 *)uarg = mdt_index; + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data = karg; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_PROGRESS: { + const struct hsm_progress_kernel *hpk = karg; + + tgt = lmv_find_target(lmv, &hpk->hpk_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur = karg; + unsigned int reqcount = hur->hur_request.hr_itemcount; + + if (reqcount == 0) + RETURN(0); + + /* if the request is about a single fid + * or if there is a single MDS, no need to split + * the request. */ + if (reqcount == 1 || count == 1) { + tgt = lmv_find_target(lmv, + &hur->hur_user_item[0].hui_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + } else { + /* split fid list to their respective MDS */ + for (i = 0; i < count; i++) { + int nr, rc1; + size_t reqlen; + struct hsm_user_request *req; + + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + nr = lmv_hsm_req_count(lmv, hur, tgt); + if (nr < 0) + RETURN(nr); + if (nr == 0) /* nothing for this MDS */ + continue; + + /* build a request with fids for this MDS */ + reqlen = offsetof(typeof(*hur), + hur_user_item[nr]) + + hur->hur_request.hr_data_len; + OBD_ALLOC_LARGE(req, reqlen); + if (req == NULL) + RETURN(-ENOMEM); + rc1 = lmv_hsm_req_build(lmv, hur, tgt, req); + if (rc1 < 0) + GOTO(hsm_req_err, rc1); + rc1 = obd_iocontrol(cmd, tgt->ltd_exp, reqlen, + req, uarg); +hsm_req_err: + if (rc1 != 0 && rc == 0) + rc = rc1; + OBD_FREE_LARGE(req, reqlen); + } + } + break; + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + RETURN(PTR_ERR(tgt1)); + + tgt2 = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + RETURN(PTR_ERR(tgt2)); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + RETURN(-EINVAL); + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_idx != tgt2->ltd_idx) + RETURN(-EPERM); + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_CT_START: { + struct lustre_kernelcomm *lk = karg; + if (lk->lk_flags & LK_FLG_STOP) + rc = lmv_hsm_ct_unregister(obddev, cmd, len, lk, uarg); + else + rc = lmv_hsm_ct_register(obddev, cmd, len, lk, uarg); + break; + } + default: + for (i = 0; i < count; i++) { + struct obd_device *mdc_obd; + int err; + + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. 
Let's pass it through */ + mdc_obd = class_exp2obd(tgt->ltd_exp); + mdc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + if (err) { + if (tgt->ltd_active) { + CERROR("error: iocontrol MDC %s on MDT" + " idx %d cmd %x: err = %d\n", + tgt->ltd_uuid.uuid, i, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + RETURN(rc); +} + +/** + * This is _inode_ placement policy function (not name). + */ +static int lmv_placement_policy(struct obd_device *obd, + struct md_op_data *op_data, u32 *mds) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_user_md *lum; + + ENTRY; + + LASSERT(mds != NULL); + + if (lmv->desc.ld_tgt_count == 1) { + *mds = 0; + RETURN(0); + } + + lum = op_data->op_data; + /* Choose MDS by + * 1. See if the stripe offset is specified by lum. + * 2. Then check if there is default stripe offset. + * 3. Finally choose MDS by name hash if the parent + * is striped directory. (see lmv_locate_mds()). */ + if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && + le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { + *mds = le32_to_cpu(lum->lum_stripe_offset); + } else if (op_data->op_default_stripe_offset != (__u32)-1) { + *mds = op_data->op_default_stripe_offset; + op_data->op_mds = *mds; + /* Correct the stripe offset in lum */ + if (lum != NULL) + lum->lum_stripe_offset = cpu_to_le32(*mds); + } else { + *mds = op_data->op_mds; + } + + RETURN(0); +} + +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) +{ + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_get_target(lmv, mds, NULL); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + + if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) + GOTO(out, rc = -ENODEV); + + /* + * Asking underlying tgt layer to allocate new fid. 
+ */ + rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + + EXIT; +out: + mutex_unlock(&tgt->ltd_fid_mutex); + return rc; +} + +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + u32 mds = 0; + int rc; + ENTRY; + + LASSERT(op_data != NULL); + LASSERT(fid != NULL); + + rc = lmv_placement_policy(obd, op_data, &mds); + if (rc) { + CERROR("Can't get target for allocating fid, " + "rc %d\n", rc); + RETURN(rc); + } + + rc = __lmv_fid_alloc(lmv, fid, mds); + if (rc) { + CERROR("Can't alloc new fid, rc %d\n", rc); + RETURN(rc); + } + + RETURN(rc); +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_desc *desc; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + lmv->tgts_size = 32U; + OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); + if (lmv->tgts == NULL) + RETURN(-ENOMEM); + + obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); + lmv->desc.ld_tgt_count = 0; + lmv->desc.ld_active_tgt_count = 0; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + + spin_lock_init(&lmv->lmv_lock); + mutex_init(&lmv->lmv_init_mutex); + +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_lmv_obd_vars; + lprocfs_obd_setup(obd); + lprocfs_alloc_md_stats(obd, 0); + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); +#endif + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) { + CERROR("Can't init FLD, err %d\n", rc); + GOTO(out, rc); + } + + RETURN(0); + +out: + return rc; +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + ENTRY; + + fld_client_fini(&lmv->lmv_fld); + if (lmv->tgts != NULL) { + int i; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL) + continue; + lmv_del_target(lmv, i); + } + OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); + lmv->tgts_size = 0; + } + RETURN(0); +} + +static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + ENTRY; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) + GOTO(out, rc = -EINVAL); + rc = lmv_add_target(obd, &obd_uuid, index, gen); + GOTO(out, rc); + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + } +out: + RETURN(rc); +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device *obd 
= class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + int rc = 0; + __u32 i; + ENTRY; + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, + max_age, flags); + if (rc) { + CERROR("can't stat MDS #%d (%s), error %d\n", i, + lmv->tgts[i]->ltd_exp->exp_obd->obd_name, + rc); + GOTO(out_free_temp, rc); + } + + if (i == 0) { + *osfs = *temp; + /* If the statfs is from mount, it will needs + * retrieve necessary information from MDT0. + * i.e. mount does not need the merged osfs + * from all of MDT. + * And also clients can be mounted as long as + * MDT0 is in service*/ + if (flags & OBD_STATFS_FOR_MDT0) + GOTO(out_free_temp, rc); + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + } + } + + EXIT; +out_free_temp: + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + ENTRY; + + rc = md_get_root(lmv->tgts[0]->ltd_exp, fileset, fid); + RETURN(rc); +} + +static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getxattr(tgt->ltd_exp, fid, valid, name, input, + input_size, output_size, flags, request); + + RETURN(rc); +} + +static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setxattr(tgt->ltd_exp, fid, valid, name, input, + input_size, output_size, flags, suppgid, + request); + + RETURN(rc); +} + +static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = tgt->ltd_idx; + RETURN(0); + } + + rc = md_getattr(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + __u32 i; + ENTRY; + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. 
+ */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + md_null_inode(lmv->tgts[i]->ltd_exp, fid); + } + + RETURN(0); +} + +static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); + rc = md_close(tgt->ltd_exp, op_data, mod, request); + RETURN(rc); +} + +/** + * Choosing the MDT by name or FID in @op_data. + * For non-striped directory, it will locate MDT by fid. + * For striped-directory, it will locate MDT by name. And also + * it will reset op_fid1 with the FID of the choosen stripe. + **/ +struct lmv_tgt_desc * +lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + u32 *mds) +{ + struct lmv_tgt_desc *tgt; + const struct lmv_oinfo *oinfo; + + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { + if (cfs_fail_val >= lsm->lsm_md_stripe_count) + RETURN(ERR_PTR(-EBADF)); + oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; + } else { + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + if (IS_ERR(oinfo)) + RETURN(ERR_CAST(oinfo)); + } + + if (fid != NULL) + *fid = oinfo->lmo_fid; + if (mds != NULL) + *mds = oinfo->lmo_mds; + + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + + CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, + PFID(&oinfo->lmo_fid)); + return tgt; +} + +/** + * Locate mds by fid or name + * + * For striped directory (lsm != NULL), it will locate the stripe + * by name hash (see lsm_name_to_stripe_info()). Note: if the hash_type + * is unknown, it will return -EBADFD, and lmv_intent_lookup might need + * walk through all of stripes to locate the entry. + * + * For normal direcotry, it will locate MDS by FID directly. + * \param[in] lmv LMV device + * \param[in] op_data client MD stack parameters, name, namelen + * mds_num etc. + * \param[in] fid object FID used to locate MDS. + * + * retval pointer to the lmv_tgt_desc if succeed. + * ERR_PTR(errno) if failed. + */ +struct lmv_tgt_desc* +lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid) +{ + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tgt; + + /* During creating VOLATILE file, it should honor the mdt + * index if the file under striped dir is being restored, see + * ct_restore(). 
*/ + if (op_data->op_bias & MDS_CREATE_VOLATILE && + (int)op_data->op_mds != -1) { + int i; + tgt = lmv_get_target(lmv, op_data->op_mds, NULL); + if (IS_ERR(tgt)) + return tgt; + + if (lsm != NULL) { + /* refill the right parent fid */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lmv_oinfo *oinfo; + + oinfo = &lsm->lsm_md_oinfo[i]; + if (oinfo->lmo_mds == op_data->op_mds) { + *fid = oinfo->lmo_fid; + break; + } + } + + if (i == lsm->lsm_md_stripe_count) + *fid = lsm->lsm_md_oinfo[0].lmo_fid; + } + + return tgt; + } + + if (lsm == NULL || op_data->op_namelen == 0) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + op_data->op_mds = tgt->ltd_idx; + return tgt; + } + + return lmv_locate_target_for_name(lmv, lsm, op_data->op_name, + op_data->op_namelen, fid, + &op_data->op_mds); +} + +int lmv_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, uid_t uid, + gid_t gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + if (!lmv->desc.ld_active_tgt_count) + RETURN(-EIO); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "CREATE name '%.*s' on "DFID" -> mds #%x\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), op_data->op_mds); + + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc) + RETURN(rc); + if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { + /* Send the create request to the MDT where the object + * will be located */ + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_idx; + } else { + CDEBUG(D_CONFIG, "Server doesn't support striped dirs\n"); + } + + CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n", + PFID(&op_data->op_fid2), op_data->op_mds); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, + cap_effective, rdev, request); + if (rc == 0) { + if (*request == NULL) + RETURN(rc); + CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2)); + } + RETURN(rc); +} + +static int +lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), tgt->ltd_idx); + + rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh, + extra_lock_flags); + + RETURN(rc); +} + +static int +lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, + struct ptlrpc_request **preq) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + ENTRY; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", + (int)op_data->op_namelen, op_data->op_name, + 
PFID(&op_data->op_fid1), tgt->ltd_idx); + + rc = md_getattr_name(tgt->ltd_exp, op_data, preq); + if (rc != 0) + RETURN(rc); + + body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (body->mbo_valid & OBD_MD_MDS) { + struct lu_fid rid = body->mbo_fid1; + CDEBUG(D_INODE, "Request attrs for "DFID"\n", + PFID(&rid)); + + tgt = lmv_find_target(lmv, &rid); + if (IS_ERR(tgt)) { + ptlrpc_req_finished(*preq); + preq = NULL; + RETURN(PTR_ERR(tgt)); + } + + op_data->op_fid1 = rid; + op_data->op_valid |= OBD_MD_FLCROSSREF; + op_data->op_namelen = 0; + op_data->op_name = NULL; + rc = md_getattr_name(tgt->ltd_exp, op_data, &req); + ptlrpc_req_finished(*preq); + *preq = req; + } + + RETURN(rc); +} + +#define md_op_data_fid(op_data, fl) \ + (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ + fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ + fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ + fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \ + NULL) + +static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, + struct md_op_data *op_data, __u32 op_tgt, + enum ldlm_mode mode, int bits, int flag) +{ + struct lu_fid *fid = md_op_data_fid(op_data, flag); + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + union ldlm_policy_data policy = { { 0 } }; + int rc = 0; + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(0); + + if (tgt == NULL) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } + + if (tgt->ltd_idx != op_tgt) { + CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); + policy.l_inodebits.bits = bits; + rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, + mode, LCF_ASYNC, NULL); + } else { + CDEBUG(D_INODE, + "EARLY_CANCEL skip operation target %d on "DFID"\n", + op_tgt, PFID(fid)); + op_data->op_flags |= flag; + rc = 0; + } + + RETURN(rc); +} + +/* + * llite passes fid of an target inode in op_data->op_fid1 and id of directory in + * op_data->op_fid2 + */ +static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + LASSERT(op_data->op_namelen != 0); + + CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", + PFID(&op_data->op_fid2), (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid1)); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + if (op_data->op_mea2 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea2; + const struct lmv_oinfo *oinfo; + + oinfo = lsm_name_to_stripe_info(lsm, op_data->op_name, + op_data->op_namelen); + if (IS_ERR(oinfo)) + RETURN(PTR_ERR(oinfo)); + + op_data->op_fid2 = oinfo->lmo_fid; + } + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * Cancel UPDATE lock on child (fid1). 
+ */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + rc = md_link(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *src_tgt; + struct lmv_tgt_desc *tgt_tgt; + struct obd_export *target_exp; + struct mdt_body *body; + int rc; + ENTRY; + + LASSERT(oldlen != 0); + + CDEBUG(D_INODE, "RENAME %.*s in "DFID":%d to %.*s in "DFID":%d\n", + (int)oldlen, old, PFID(&op_data->op_fid1), + op_data->op_mea1 ? op_data->op_mea1->lsm_md_stripe_count : 0, + (int)newlen, new, PFID(&op_data->op_fid2), + op_data->op_mea2 ? op_data->op_mea2->lsm_md_stripe_count : 0); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + if (op_data->op_cli_flags & CLI_MIGRATE) { + LASSERTF(fid_is_sane(&op_data->op_fid3), "invalid FID "DFID"\n", + PFID(&op_data->op_fid3)); + + if (op_data->op_mea1 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tmp; + + /* Fix the parent fid for striped dir */ + tmp = lmv_locate_target_for_name(lmv, lsm, old, + oldlen, + &op_data->op_fid1, + NULL); + if (IS_ERR(tmp)) + RETURN(PTR_ERR(tmp)); + } + + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc != 0) + RETURN(rc); + + src_tgt = lmv_find_target(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); + + target_exp = src_tgt->ltd_exp; + } else { + if (op_data->op_mea1 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea1; + + src_tgt = lmv_locate_target_for_name(lmv, lsm, old, + oldlen, + &op_data->op_fid1, + &op_data->op_mds); + } else { + src_tgt = lmv_find_target(lmv, &op_data->op_fid1); + } + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); + + + if (op_data->op_mea2 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea2; + + tgt_tgt = lmv_locate_target_for_name(lmv, lsm, new, + newlen, + &op_data->op_fid2, + &op_data->op_mds); + } else { + tgt_tgt = lmv_find_target(lmv, &op_data->op_fid2); + + } + if (IS_ERR(tgt_tgt)) + RETURN(PTR_ERR(tgt_tgt)); + + target_exp = tgt_tgt->ltd_exp; + } + + /* + * LOOKUP lock on src child (fid3) should also be cancelled for + * src_tgt in mdc_rename. + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its + * own target. + */ + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_UPDATE, + MF_MDC_CANCEL_FID2); + + if (rc != 0) + RETURN(rc); + /* + * Cancel LOOKUP locks on source child (fid3) for parent tgt_tgt. + */ + if (fid_is_sane(&op_data->op_fid3)) { + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* Cancel LOOKUP lock on its parent */ + rc = lmv_early_cancel(exp, tgt, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + +retry_rename: + /* + * Cancel all the locks on tgt child (fid4). 
+ */ + if (fid_is_sane(&op_data->op_fid4)) { + struct lmv_tgt_desc *tgt; + + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + + tgt = lmv_find_target(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* Since the target child might be destroyed, and it might + * become orphan, and we can only check orphan on the local + * MDT right now, so we send rename request to the MDT where + * target child is located. If target child does not exist, + * then it will send the request to the target parent */ + target_exp = tgt->ltd_exp; + } + + rc = md_rename(target_exp, op_data, old, oldlen, new, newlen, + request); + + if (rc != 0 && rc != -EXDEV) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + CDEBUG(D_INODE, "%s: try rename to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); + + op_data->op_fid4 = body->mbo_fid1; + ptlrpc_req_finished(*request); + *request = NULL; + goto retry_rename; +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + ENTRY; + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, request); + + RETURN(rc); +} + +static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_fsync(tgt->ltd_exp, fid, request); + RETURN(rc); +} + +/** + * Get current minimum entry from striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to @hash_offset, from all of sub-stripes, and it is + * only being called for striped directory. + * + * \param[in] exp export of LMV + * \param[in] op_data parameters transferred beween client MD stack + * stripe_information will be included in this + * parameter + * \param[in] cb_op ldlm callback being used in enqueue in + * mdc_read_page + * \param[in] hash_offset the hash value, which is used to locate + * minum(closet) dir entry + * \param[in|out] stripe_offset the caller use this to indicate the stripe + * index of last entry, so to avoid hash conflict + * between stripes. It will also be used to + * return the stripe index of current dir entry. 
+ * \param[in|out] entp the minum entry and it also is being used + * to input the last dir entry to resolve the + * hash conflict + * + * \param[out] ppage the page which holds the minum entry + * + * \retval = 0 get the entry successfully + * negative errno (< 0) does not get the entry + */ +static int lmv_get_min_striped_entry(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 hash_offset, int *stripe_offset, + struct lu_dirent **entp, + struct page **ppage) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tgt; + int stripe_count; + struct lu_dirent *min_ent = NULL; + struct page *min_page = NULL; + int min_idx = 0; + int i; + int rc = 0; + ENTRY; + + stripe_count = lsm->lsm_md_stripe_count; + for (i = 0; i < stripe_count; i++) { + struct lu_dirent *ent = NULL; + struct page *page = NULL; + struct lu_dirpage *dp; + __u64 stripe_hash = hash_offset; + + tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + /* op_data will be shared by each stripe, so we need + * reset these value for each stripe */ + op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid; + op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid; + op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root; +next: + rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash, + &page); + if (rc != 0) + GOTO(out, rc); + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; + + if (le64_to_cpu(ent->lde_hash) < hash_offset) + continue; + + if (le64_to_cpu(ent->lde_hash) == hash_offset && + (*entp == ent || i < *stripe_offset)) + continue; + + /* skip . and .. for other stripes */ + if (i != 0 && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; + break; + } + + if (ent == NULL) { + stripe_hash = le64_to_cpu(dp->ldp_hash_end); + + kunmap(page); + put_page(page); + page = NULL; + + /* reach the end of current stripe, go to next stripe */ + if (stripe_hash == MDS_DIR_END_OFF) + continue; + else + goto next; + } + + if (min_ent != NULL) { + if (le64_to_cpu(min_ent->lde_hash) > + le64_to_cpu(ent->lde_hash)) { + min_ent = ent; + kunmap(min_page); + put_page(min_page); + min_idx = i; + min_page = page; + } else { + kunmap(page); + put_page(page); + page = NULL; + } + } else { + min_ent = ent; + min_page = page; + min_idx = i; + } + } + +out: + if (*ppage != NULL) { + kunmap(*ppage); + put_page(*ppage); + } + *stripe_offset = min_idx; + *entp = min_ent; + *ppage = min_page; + RETURN(rc); +} + +/** + * Build dir entry page from a striped directory + * + * This function gets one entry by @offset from a striped directory. It will + * read entries from all of stripes, and choose one closest to the required + * offset(&offset). A few notes + * 1. skip . and .. for non-zero stripes, because there can only have one . + * and .. in a directory. + * 2. op_data will be shared by all of stripes, instead of allocating new + * one, so need to restore before reusing. + * 3. release the entry page if that is not being chosen. 
+ * + * \param[in] exp obd export refer to LMV + * \param[in] op_data hold those MD parameters of read_entry + * \param[in] cb_op ldlm callback being used in enqueue in mdc_read_entry + * \param[out] ldp the entry being read + * \param[out] ppage the page holding the entry. Note: because the entry + * will be accessed in upper layer, so we need hold the + * page until the usages of entry is finished, see + * ll_dir_entry_next. + * + * retval =0 if get entry successfully + * <0 cannot get entry + */ +static int lmv_read_striped_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 offset, struct page **ppage) +{ + struct lu_fid master_fid = op_data->op_fid1; + struct inode *master_inode = op_data->op_data; + __u64 hash_offset = offset; + struct lu_dirpage *dp; + struct page *min_ent_page = NULL; + struct page *ent_page = NULL; + struct lu_dirent *ent; + void *area; + int ent_idx = 0; + struct lu_dirent *min_ent = NULL; + struct lu_dirent *last_ent; + size_t left_bytes; + int rc; + ENTRY; + + /* Allocate a page and read entries from all of stripes and fill + * the page by hash order */ + ent_page = alloc_page(GFP_KERNEL); + if (ent_page == NULL) + RETURN(-ENOMEM); + + /* Initialize the entry page */ + dp = kmap(ent_page); + memset(dp, 0, sizeof(*dp)); + dp->ldp_hash_start = cpu_to_le64(offset); + dp->ldp_flags |= LDF_COLLIDE; + + area = dp + 1; + left_bytes = PAGE_SIZE - sizeof(*dp); + ent = area; + last_ent = ent; + do { + __u16 ent_size; + + /* Find the minum entry from all sub-stripes */ + rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset, + &ent_idx, &min_ent, + &min_ent_page); + if (rc != 0) + GOTO(out, rc); + + /* If it can not get minum entry, it means it already reaches + * the end of this directory */ + if (min_ent == NULL) { + last_ent->lde_reclen = 0; + hash_offset = MDS_DIR_END_OFF; + GOTO(out, rc); + } + + ent_size = le16_to_cpu(min_ent->lde_reclen); + + /* the last entry lde_reclen is 0, but it might not + * the end of this entry of this temporay entry */ + if (ent_size == 0) + ent_size = lu_dirent_calc_size( + le16_to_cpu(min_ent->lde_namelen), + le32_to_cpu(min_ent->lde_attrs)); + if (ent_size > left_bytes) { + last_ent->lde_reclen = cpu_to_le16(0); + hash_offset = le64_to_cpu(min_ent->lde_hash); + GOTO(out, rc); + } + + memcpy(ent, min_ent, ent_size); + + /* Replace . with master FID and Replace .. 
with the parent FID + * of master object */ + if (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 && + le16_to_cpu(ent->lde_namelen) == 1) + fid_cpu_to_le(&ent->lde_fid, &master_fid); + else if (strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0 && + le16_to_cpu(ent->lde_namelen) == 2) + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); + + left_bytes -= ent_size; + ent->lde_reclen = cpu_to_le16(ent_size); + last_ent = ent; + ent = (void *)ent + ent_size; + hash_offset = le64_to_cpu(min_ent->lde_hash); + if (hash_offset == MDS_DIR_END_OFF) { + last_ent->lde_reclen = 0; + break; + } + } while (1); +out: + if (min_ent_page != NULL) { + kunmap(min_ent_page); + put_page(min_ent_page); + } + + if (unlikely(rc != 0)) { + __free_page(ent_page); + ent_page = NULL; + } else { + if (ent == area) + dp->ldp_flags |= LDF_EMPTY; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(hash_offset); + } + + /* We do not want to allocate md_op_data during each + * dir entry reading, so op_data will be shared by every stripe, + * then we need to restore it back to original value before + * return to the upper layer */ + op_data->op_fid1 = master_fid; + op_data->op_fid2 = master_fid; + op_data->op_data = master_inode; + + *ppage = ent_page; + + RETURN(rc); +} + +int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_callback *cb_op, __u64 offset, + struct page **ppage) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + if (unlikely(lsm != NULL)) { + rc = lmv_read_striped_page(exp, op_data, cb_op, offset, ppage); + RETURN(rc); + } + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_read_page(tgt->ltd_exp, op_data, cb_op, offset, ppage); + + RETURN(rc); +} + +/** + * Unlink a file/directory + * + * Unlink a file or directory under the parent dir. The unlink request + * usually will be sent to the MDT where the child is located, but if + * the client does not have the child FID then request will be sent to the + * MDT where the parent is located. + * + * If the parent is a striped directory then it also needs to locate which + * stripe the name of the child is located, and replace the parent FID + * (@op->op_fid1) with the stripe FID. Note: if the stripe is unknown, + * it will walk through all of sub-stripes until the child is being + * unlinked finally. + * + * \param[in] exp export refer to LMV + * \param[in] op_data different parameters transferred beween client + * MD stacks, name, namelen, FIDs etc. + * op_fid1 is the parent FID, op_fid2 is the child + * FID. + * \param[out] request point to the request of unlink. + * + * retval 0 if succeed + * negative errno if failed. 
+ */ +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct lmv_tgt_desc *parent_tgt = NULL; + struct mdt_body *body; + int rc; + int stripe_index = 0; + struct lmv_stripe_md *lsm = op_data->op_mea1; + ENTRY; + +retry_unlink: + /* For striped dir, we need to locate the parent as well */ + if (lsm != NULL) { + struct lmv_tgt_desc *tmp; + + LASSERT(op_data->op_name != NULL && + op_data->op_namelen != 0); + + tmp = lmv_locate_target_for_name(lmv, lsm, + op_data->op_name, + op_data->op_namelen, + &op_data->op_fid1, + &op_data->op_mds); + + /* return -EBADFD means unknown hash type, might + * need try all sub-stripe here */ + if (IS_ERR(tmp) && PTR_ERR(tmp) != -EBADFD) + RETURN(PTR_ERR(tmp)); + + /* Note: both migrating dir and unknown hash dir need to + * try all of sub-stripes, so we need start search the + * name from stripe 0, but migrating dir is already handled + * inside lmv_locate_target_for_name(), so we only check + * unknown hash type directory here */ + if (!lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { + struct lmv_oinfo *oinfo; + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + } + } + +try_next_stripe: + /* Send unlink requests to the MDT where the child is located */ + if (likely(!fid_is_zero(&op_data->op_fid2))) + tgt = lmv_find_target(lmv, &op_data->op_fid2); + else if (lsm != NULL) + tgt = lmv_get_target(lmv, op_data->op_mds, NULL); + else + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel FULL locks on child (fid3). + */ + parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (parent_tgt != tgt) { + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + } + + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc != 0 && rc != -EREMOTE && rc != -ENOENT) + RETURN(rc); + + /* Try next stripe if it is needed. */ + if (rc == -ENOENT && lsm != NULL && lmv_need_try_all_stripes(lsm)) { + struct lmv_oinfo *oinfo; + + stripe_index++; + if (stripe_index >= lsm->lsm_md_stripe_count) + RETURN(rc); + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + + ptlrpc_req_finished(*request); + *request = NULL; + + goto try_next_stripe; + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. 
*/ + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + RETURN(rc); + + CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); + + /* This is a remote object, try remote MDT, Note: it may + * try more than 1 time here, Considering following case + * /mnt/lustre is root on MDT0, remote1 is on MDT1 + * 1. Initially A does not know where remote1 is, it send + * unlink RPC to MDT0, MDT0 return -EREMOTE, it will + * resend unlink RPC to MDT1 (retry 1st time). + * + * 2. During the unlink RPC in flight, + * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 + * and create new remote1, but on MDT0 + * + * 3. MDT1 get unlink RPC(from A), then do remote lock on + * /mnt/lustre, then lookup get fid of remote1, and find + * it is remote dir again, and replay -EREMOTE again. + * + * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). + * + * In theory, it might try unlimited time here, but it should + * be very rare case. */ + op_data->op_fid2 = body->mbo_fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + goto retry_unlink; +} + +static int lmv_precleanup(struct obd_device *obd) +{ + ENTRY; + libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); + fld_client_proc_fini(&obd->u.lmv.lmv_fld); + lprocfs_obd_cleanup(obd); + lprocfs_free_md_stats(obd); + RETURN(0); +} + +/** + * Get by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. + * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to get value for + * \param[in] vallen size of \a val + * \param[out] val pointer to storage location for value + * \param[in] lsm optional striping metadata of object + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + int i; + + LASSERT(*vallen == sizeof(__u32)); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + struct lmv_tgt_desc *tgt = lmv->tgts[i]; + /* + * All tgts should be connected when this gets called. + */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val)) + RETURN(0); + } + RETURN(-EINVAL); + } else if (KEY_IS(KEY_MAX_EASIZE) || + KEY_IS(KEY_DEFAULT_EASIZE) || + KEY_IS(KEY_CONN_DATA)) { + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key, + vallen, val); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + RETURN(rc); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->desc.ld_tgt_count; + RETURN(0); + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + RETURN(-EINVAL); +} + +/** + * Asynchronously set by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. 
+ * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to store value for + * \param[in] vallen size of value to store + * \param[in] val pointer to data to be stored + * \param[in] set optional list of related ptlrpc requests + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt = NULL; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || + KEY_IS(KEY_DEFAULT_EASIZE)) { + int i, err = 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + RETURN(rc); + } + + RETURN(-EINVAL); +} + +static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, + const struct lmv_mds_md_v1 *lmm1) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int stripe_count; + int cplen; + int i; + int rc = 0; + ENTRY; + + lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index); + if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE)) + lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN; + else + lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); + lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, + sizeof(lsm->lsm_md_pool_name)); + + if (cplen >= sizeof(lsm->lsm_md_pool_name)) + RETURN(-E2BIG); + + CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %d" + "layout_version %d\n", lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version); + + stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + for (i = 0; i < stripe_count; i++) { + fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, + &lmm1->lmv_stripe_fids[i]); + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, + &lsm->lsm_md_oinfo[i].lmo_mds); + if (rc != 0) + RETURN(rc); + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, + PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); + } + + RETURN(rc); +} + +static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, size_t lmm_size) +{ + struct lmv_stripe_md *lsm; + int lsm_size; + int rc; + bool allocated = false; + ENTRY; + + LASSERT(lsmp != NULL); + + lsm = *lsmp; + /* Free memmd */ + if (lsm != NULL && lmm == NULL) { + int i; + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + /* For migrating inode, the master stripe and master + * object will be the same, so do not need iput, see + * ll_update_lsm_md */ + if (!(lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && + i == 0) && lsm->lsm_md_oinfo[i].lmo_root != NULL) + iput(lsm->lsm_md_oinfo[i].lmo_root); + } + lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + if (le32_to_cpu(lmm->lmv_magic) == 
LMV_MAGIC_STRIPE) + RETURN(-EPERM); + + /* Unpack memmd */ + if (le32_to_cpu(lmm->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmm->lmv_magic) != LMV_USER_MAGIC) { + CERROR("%s: invalid lmv magic %x: rc = %d\n", + exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic), + -EIO); + RETURN(-EIO); + } + + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_V1) + lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); + else + /** + * Unpack default dirstripe(lmv_user_md) to lmv_stripe_md, + * stripecount should be 0 then. + */ + lsm_size = lmv_stripe_md_size(0); + + lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); + if (lsm == NULL) { + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + RETURN(-ENOMEM); + allocated = true; + *lsmp = lsm; + } + + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); + break; + default: + CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, + le32_to_cpu(lmm->lmv_magic)); + rc = -EINVAL; + break; + } + + if (rc != 0 && allocated) { + OBD_FREE(lsm, lsm_size); + *lsmp = NULL; + lsm_size = rc; + } + RETURN(lsm_size); +} + +void lmv_free_memmd(struct lmv_stripe_md *lsm) +{ + lmv_unpackmd(NULL, &lsm, NULL, 0); +} +EXPORT_SYMBOL(lmv_free_memmd); + +static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, + enum ldlm_mode mode, enum ldlm_cancel_flags flags, + void *opaque) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int rc = 0; + __u32 i; + ENTRY; + + LASSERT(fid != NULL); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + struct lmv_tgt_desc *tgt = lmv->tgts[i]; + int err; + + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) + continue; + + err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags, + opaque); + if (!rc) + rc = err; + } + RETURN(rc); +} + +static int lmv_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + int rc; + ENTRY; + + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + rc = md_set_lock_data(tgt->ltd_exp, lockh, data, bits); + RETURN(rc); +} + +enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + enum ldlm_mode rc; + int tgt; + int i; + ENTRY; + + CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. Try the MDT that the FID maps to first, + * since this can be easily found, and only try others if that fails. 
+ */ + for (i = 0, tgt = lmv_find_target_index(lmv, fid); + i < lmv->desc.ld_tgt_count; + i++, tgt = (tgt + 1) % lmv->desc.ld_tgt_count) { + if (tgt < 0) { + CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", + obd->obd_name, PFID(fid), tgt); + tgt = 0; + } + + if (lmv->tgts[tgt] == NULL || + lmv->tgts[tgt]->ltd_exp == NULL || + lmv->tgts[tgt]->ltd_active == 0) + continue; + + rc = md_lock_match(lmv->tgts[tgt]->ltd_exp, flags, fid, + type, policy, mode, lockh); + if (rc) + RETURN(rc); + } + + RETURN(0); +} + +int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md); +} + +int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + ENTRY; + + if (md->lmv != NULL) { + lmv_free_memmd(md->lmv); + md->lmv = NULL; + } + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md)); +} + +int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ENTRY; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_set_open_replay_data(tgt->ltd_exp, och, it)); +} + +int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ENTRY; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); +} + +int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt = NULL; + struct lmv_tgt_desc *ctgt = NULL; + int rc; + ENTRY; + + if (!fid_is_sane(&op_data->op_fid2)) + RETURN(-EINVAL); + + ptgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(ptgt)) + RETURN(PTR_ERR(ptgt)); + + ctgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(ctgt)) + RETURN(PTR_ERR(ctgt)); + + /* + * if child is on remote MDT, we need 2 async RPCs to fetch both LOOKUP + * lock on parent, and UPDATE lock on child MDT, which makes all + * complicated. Considering remote dir is rare case, and not supporting + * it in statahead won't cause any issue, drop its support for now. 
+ */ + if (ptgt != ctgt) + RETURN(-ENOTSUPP); + + rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); + RETURN(rc); +} + +int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + RETURN(rc); +} + +int lmv_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid) +{ + const struct lmv_oinfo *oinfo; + + LASSERT(lsm != NULL); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + *fid = oinfo->lmo_fid; + + RETURN(0); +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + int rc = 0; + __u32 i; + __u64 curspace, curinodes; + ENTRY; + + if (tgt == NULL || + tgt->ltd_exp == NULL || + !tgt->ltd_active || + lmv->desc.ld_tgt_count == 0) { + CERROR("master lmv inactive\n"); + RETURN(-EIO); + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); + } + + curspace = curinodes = 0; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) + continue; + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", i, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + RETURN(rc); +} + +static int lmv_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb_blocking) +{ + int rc; + int i; + + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); + if (rc < 0) + return rc; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + + CDEBUG(D_INFO, ""DFID" size %llu, blocks %llu nlink %u," + " atime %lu ctime %lu, mtime %lu.\n", + PFID(&lsm->lsm_md_oinfo[i].lmo_fid), + i_size_read(inode), (unsigned long long)inode->i_blocks, + inode->i_nlink, LTIME_S(inode->i_atime), + LTIME_S(inode->i_ctime), LTIME_S(inode->i_mtime)); + + /* for slave stripe, it needs to subtract nlink for . and .. 
*/ + if (i != 0) + attr->cat_nlink += inode->i_nlink - 2; + else + attr->cat_nlink = inode->i_nlink; + + attr->cat_size += i_size_read(inode); + attr->cat_blocks += inode->i_blocks; + + if (attr->cat_atime < LTIME_S(inode->i_atime)) + attr->cat_atime = LTIME_S(inode->i_atime); + + if (attr->cat_ctime < LTIME_S(inode->i_ctime)) + attr->cat_ctime = LTIME_S(inode->i_ctime); + + if (attr->cat_mtime < LTIME_S(inode->i_mtime)) + attr->cat_mtime = LTIME_S(inode->i_mtime); + } + return 0; +} + +struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_iocontrol = lmv_iocontrol, + .o_quotactl = lmv_quotactl +}; + +struct md_ops lmv_md_ops = { + .m_get_root = lmv_get_root, + .m_null_inode = lmv_null_inode, + .m_close = lmv_close, + .m_create = lmv_create, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_fsync = lmv_fsync, + .m_read_page = lmv_read_page, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_merge_attr = lmv_merge_attr, + .m_set_open_replay_data = lmv_set_open_replay_data, + .m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock, + .m_get_fid_from_lsm = lmv_get_fid_from_lsm, + .m_unpackmd = lmv_unpackmd, +}; + +static int __init lmv_init(void) +{ + return class_register_type(&lmv_obd_ops, &lmv_md_ops, true, NULL, + LUSTRE_LMV_NAME, NULL); +} + +static void __exit lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c new file mode 100644 index 0000000000000..cefa71d34a12d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -0,0 +1,172 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include "lmv_internal.h" + +#ifndef CONFIG_PROC_FS +static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +#else +static int lmv_numobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_numobd); + +static int lmv_activeobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_activeobd); + +static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device*)m->private; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(lmv_desc_uuid); + +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + + while (*pos < lmv->tgts_size) { + if (lmv->tgts[*pos] != NULL) + return lmv->tgts[*pos]; + + ++*pos; + } + + return NULL; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ + return; +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + + ++*pos; + while (*pos < lmv->tgts_size) { + if (lmv->tgts[*pos] != NULL) + return lmv->tgts[*pos]; + + ++*pos; + } + + return NULL; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (tgt == NULL) + return 0; + seq_printf(p, "%u: %s %sACTIVE\n", tgt->ltd_idx, + tgt->ltd_uuid.uuid, tgt->ltd_active ? 
"" : "IN"); + return 0; +} + +static const struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid); + +struct lprocfs_vars lprocfs_lmv_obd_vars[] = { + { .name = "numobd", + .fops = &lmv_numobd_fops }, + { .name = "activeobd", + .fops = &lmv_activeobd_fops }, + { .name = "uuid", + .fops = &lmv_uuid_fops }, + { .name = "desc_uuid", + .fops = &lmv_desc_uuid_fops }, + { NULL } +}; + +struct file_operations lmv_proc_target_fops = { + .owner = THIS_MODULE, + .open = lmv_target_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h new file mode 100644 index 0000000000000..0e84ab38e189a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -0,0 +1,661 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#include +#include +#include +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. 
+ * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; +struct lovsub_lock; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = 1 << 0 +}; + +/* + * Upper half. + */ + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + LLT_EMPTY, /** empty file without body (mknod + truncate) */ + LLT_RELEASED, /** file with no objects (data in HSM) */ + LLT_COMP, /** support composite layout */ + LLT_NR +}; + +static inline char *llt2str(enum lov_layout_type llt) +{ + switch (llt) { + case LLT_EMPTY: + return "EMPTY"; + case LLT_RELEASED: + return "RELEASED"; + case LLT_COMP: + return "COMPOSITE"; + case LLT_NR: + LBUG(); + } + LBUG(); + return ""; +} + +struct lov_layout_raid0 { + unsigned lo_nr; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + int lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. + * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; +}; + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. 
+ * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * True if layout is invalid. This bit is cleared when layout lock + * is lost. + */ + bool lo_layout_invalid; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_released { + } released; + struct lov_layout_composite { + /** + * Current valid entry count of lo_entries. + */ + unsigned int lo_entry_count; + struct lov_layout_entry { + struct lu_extent lle_extent; + struct lov_layout_raid0 lle_raid0; + } *lo_entries; + } composite; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + struct task_struct *lo_owner; +}; + +#define lov_foreach_layout_entry(lov, entry) \ + for (entry = &lov->u.composite.lo_entries[0]; \ + entry < &lov->u.composite.lo_entries \ + [lov->u.composite.lo_entry_count]; \ + entry++) + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct cl_lock sub_lock; + /** Set if the sublock has ever been enqueued, meaning it may + * hold resources of underlying layers */ + unsigned int sub_is_enqueued:1, + sub_initialized:1; + int sub_index; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** sublock array */ + struct lov_lock_sub lls_sub[0]; +}; + +struct lov_page { + struct cl_page_slice lps_cl; + /** layout_entry + stripe index, composed using lov_comp_index() */ + unsigned int lps_index; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * Lock state at lovsub layer. + */ +struct lovsub_lock { + struct cl_lock_slice lss_cl; +}; + +/** + * Describe the environment settings for sublocks. + */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; +}; + +struct lovsub_page { + struct cl_page_slice lsb_cl; +}; + + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_page_list lti_plist; + wait_queue_entry_t lti_waiter; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + /** + * Linkage into a list (hanging off lov_io::lis_subios) + */ + struct list_head sub_list; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + unsigned int sub_subio_index; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io sub_io; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. 
+ * + * \see cl_env_get() + */ + __u16 sub_refcheck; + __u16 sub_reenter; +}; + +/** + * IO state private for LOV. + */ +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + loff_t lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). + */ + loff_t lis_endpos; + int lis_nr_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct lov_io_sub lis_single_subio; + + /** + * List of active sub-io's. Active sub-io's are under the range + * of [lis_pos, lis_endpos). + */ + struct list_head lis_active; + /** + * All sub-io's created in this lov_io. + */ + struct list_head lis_subios; +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; + +extern struct kmem_cache *lovsub_lock_kmem; +extern struct kmem_cache *lovsub_object_kmem; + +int lov_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lovsub_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lov_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lovsub_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); + +int lov_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, pgoff_t index); +int lovsub_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, pgoff_t index); +int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); 
+struct lu_object *lov_object_alloc (const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); +int lov_page_stripe(const struct cl_page *page); +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. + * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of0(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of0(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of0(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of0(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lovsub_lock * +cl2lovsub_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lovsub_lock, lss_cl); +} + +static inline struct lovsub_lock *cl2sub_lock(const struct 
cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + slice = cl_lock_at(lock, &lovsub_device_type); + LASSERT(slice != NULL); + return cl2lovsub_lock(slice); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lov_page, lps_cl); +} + +static inline struct lovsub_page * +cl2lovsub_page(const struct cl_page_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lovsub_page, lsb_cl); +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return info; +} + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + +/* lov_pack.c */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size); + +/** @} lov */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c new file mode 100644 index 0000000000000..2506c39ec7296 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -0,0 +1,392 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOV layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include + +#include "lov_cl_internal.h" + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; + +struct kmem_cache *lovsub_lock_kmem; +struct kmem_cache *lovsub_object_kmem; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof (struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof (struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof (struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof (struct lov_session) + }, + { + .ckd_cache = &lovsub_lock_kmem, + .ckd_name = "lovsub_lock_kmem", + .ckd_size = sizeof (struct lovsub_lock) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof (struct lovsub_object) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov device and device type functions. + * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + int i; + struct lov_device *ld = lu2lov_dev(d); + + LASSERT(ld->ld_lov != NULL); + if (ld->ld_target == NULL) + RETURN(NULL); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd != NULL) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + RETURN(NULL); +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + LASSERT(d->ld_site != NULL); + if (ld->ld_target == NULL) + RETURN(rc); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (desc == NULL) + continue; + + cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if 
(IS_ERR(cl)) { + rc = PTR_ERR(cl); + break; + } + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } + + if (rc) + lov_device_fini(env, d); + else + ld->ld_flags |= LOV_DEV_INITIALIZED; + + RETURN(rc); +} + +/* Free the lov specific data created for the back end lu_device. */ +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target != NULL) + OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); + + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + ENTRY; + + if (ld->ld_target[index] != NULL) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + ENTRY; + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + const size_t sz = sizeof(newd[0]); + + OBD_ALLOC(newd, tgt_size * sz); + if (newd != NULL) { + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + } else { + result = -ENOMEM; + } + } + + RETURN(result); +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + ENTRY; + + obd_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + LASSERT(dev->ld_site != NULL); + + cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + obd_putref(obd); + RETURN(rc); +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + __u32 index; + + obd_getref(obd); + + cmd = cfg->lcfg_command; + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc == 0) { + switch(cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + } + } + obd_putref(obd); + RETURN(rc); +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (ld == 
NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) { + lov_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + + ld->ld_lov = &obd->u.lov; + RETURN(d); +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c new file mode 100644 index 0000000000000..5b50b0a9294dc --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -0,0 +1,546 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include + +#include +#include +#include + +#include "lov_internal.h" + +static inline void +lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) +{ + dst->e_start = le64_to_cpu(src->e_start); + dst->e_end = le64_to_cpu(src->e_end); +} + +/* Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. 
*/ +static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) +{ + struct obd_import *imp; + loff_t maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + if (!tgt->ltd_active) + return maxbytes; + + imp = tgt->ltd_obd->u.cli.cl_import; + if (imp == NULL) + return maxbytes; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && + imp->imp_connect_data.ocd_maxbytes > 0) + maxbytes = imp->imp_connect_data.ocd_maxbytes; + + spin_unlock(&imp->imp_lock); + + return maxbytes; +} + +static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size, + u16 stripe_count) +{ + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CERROR("bad stripe count %d\n", stripe_count); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm_oi_id(&lmm->lmm_oi) == 0) { + CERROR("zero object id\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + CERROR("bad striping pattern\n"); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm->lmm_stripe_size == 0 || + (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + CERROR("bad stripe size %u\n", + le32_to_cpu(lmm->lmm_stripe_size)); + lov_dump_lmm_common(D_WARNING, lmm); + return -EINVAL; + } + return 0; +} + +static void lsme_free(struct lov_stripe_md_entry *lsme) +{ + unsigned int stripe_count = lsme->lsme_stripe_count; + unsigned int i; + size_t lsme_size; + + if (!lsme_inited(lsme) || + lsme->lsme_pattern & LOV_PATTERN_F_RELEASED) + stripe_count = 0; + for (i = 0; i < stripe_count; i++) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_FREE_LARGE(lsme, lsme_size); +} + +void lsm_free(struct lov_stripe_md *lsm) +{ + unsigned int entry_count = lsm->lsm_entry_count; + unsigned int i; + size_t lsm_size; + + for (i = 0; i < entry_count; i++) + lsme_free(lsm->lsm_entries[i]); + + lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_FREE(lsm, lsm_size); +} + +/** + * Unpack a struct lov_mds_md into a struct lov_stripe_md_entry. + * + * The caller should set id and extent. + */ +static struct lov_stripe_md_entry * +lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, + const char *pool_name, bool inited, struct lov_ost_data_v1 *objects, + loff_t *maxbytes) +{ + struct lov_stripe_md_entry *lsme; + size_t lsme_size; + loff_t min_stripe_maxbytes = 0; + loff_t lov_bytes; + u32 magic; + u32 pattern; + unsigned int stripe_count; + unsigned int i; + int rc; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) + RETURN(ERR_PTR(-EINVAL)); + + pattern = le32_to_cpu(lmm->lmm_pattern); + if (pattern & LOV_PATTERN_F_RELEASED || !inited) + stripe_count = 0; + else + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + + if (buf_size < (magic == LOV_MAGIC_V1 ? sizeof(struct lov_mds_md_v1) : + sizeof(struct lov_mds_md_v3))) { + CERROR("LOV EA %s too small: %zu, need %u\n", + magic == LOV_MAGIC_V1 ? "V1" : "V3", buf_size, + lov_mds_md_size(stripe_count, magic == LOV_MAGIC_V1 ? 
+ LOV_MAGIC_V1 : LOV_MAGIC_V3)); + lov_dump_lmm_common(D_WARNING, lmm); + return ERR_PTR(-EINVAL); + } + + rc = lsm_lmm_verify_v1v3(lmm, buf_size, stripe_count); + if (rc < 0) + return ERR_PTR(rc); + + lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); + OBD_ALLOC_LARGE(lsme, lsme_size); + if (lsme == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lsme->lsme_magic = magic; + lsme->lsme_pattern = pattern; + lsme->lsme_flags = 0; + lsme->lsme_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + /* preserve the possible -1 stripe count for uninstantiated component */ + lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + + if (pool_name != NULL) { + size_t pool_name_len; + + pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, + sizeof(lsme->lsme_pool_name)); + if (pool_name_len >= sizeof(lsme->lsme_pool_name)) + GOTO(out_lsme, rc = -E2BIG); + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_tgt_desc *ltd; + + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + GOTO(out_lsme, rc = -ENOMEM); + + lsme->lsme_oinfo[i] = loi; + + ostid_le_to_cpu(&objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(objects[i].l_ost_gen); + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count && + !lov2obd(lov)->obd_process_conf) { + CERROR("%s: OST index %d more than OST count %d\n", + (char*)lov->desc.ld_uuid.uuid, + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + GOTO(out_lsme, rc = -EINVAL); + } + + ltd = lov->lov_tgts[loi->loi_ost_idx]; + if (ltd == NULL) { + CERROR("%s: OST index %d missing\n", + (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + continue; + } + + lov_bytes = lov_tgt_maxbytes(ltd); + if (min_stripe_maxbytes == 0 || lov_bytes < min_stripe_maxbytes) + min_stripe_maxbytes = lov_bytes; + } + + if (min_stripe_maxbytes == 0) + min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + + lov_bytes = min_stripe_maxbytes * stripe_count; + + if (maxbytes != NULL) { + if (lov_bytes < min_stripe_maxbytes) /* handle overflow */ + *maxbytes = MAX_LFS_FILESIZE; + else + *maxbytes = lov_bytes; + } + + return lsme; + +out_lsme: + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + if (loi != NULL) + OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); + } + OBD_FREE_LARGE(lsme, lsme_size); + + return ERR_PTR(rc); +} + +static struct +lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t buf_size, const char *pool_name, + struct lov_ost_data_v1 *objects) +{ + struct lov_stripe_md *lsm; + struct lov_stripe_md_entry *lsme; + size_t lsm_size; + loff_t maxbytes; + u32 pattern; + int rc; + + pattern = le32_to_cpu(lmm->lmm_pattern); + + lsme = lsme_unpack(lov, lmm, buf_size, pool_name, true, objects, + &maxbytes); + if (IS_ERR(lsme)) + RETURN(ERR_CAST(lsme)); + + lsme->lsme_flags = LCME_FL_INIT; + lsme->lsme_extent.e_start = 0; + lsme->lsme_extent.e_end = LUSTRE_EOF; + + lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + GOTO(out_lsme, rc = -ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_maxbytes = maxbytes; + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_magic = le32_to_cpu(lmm->lmm_magic); + lsm->lsm_layout_gen = 
le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_entry_count = 1; + lsm->lsm_is_released = pattern & LOV_PATTERN_F_RELEASED; + lsm->lsm_entries[0] = lsme; + + return lsm; + +out_lsme: + lsme_free(lsme); + + return ERR_PTR(rc); +} + +static inline struct lov_stripe_md * +lsm_unpackmd_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_mds_md_v1 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, NULL, lmm->lmm_objects); +} + +const struct lsm_operations lsm_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static inline +struct lov_stripe_md *lsm_unpackmd_v3(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + struct lov_mds_md_v3 *lmm = buf; + + return lsm_unpackmd_v1v3(lov, buf, buf_size, lmm->lmm_pool_name, + lmm->lmm_objects); +} + +const struct lsm_operations lsm_v3_ops = { + .lsm_unpackmd = lsm_unpackmd_v3, +}; + +static int lsm_verify_comp_md_v1(struct lov_comp_md_v1 *lcm, + size_t lcm_buf_size) +{ + unsigned int entry_count; + unsigned int i; + size_t lcm_size; + + lcm_size = le32_to_cpu(lcm->lcm_size); + if (lcm_buf_size < lcm_size) { + CERROR("bad LCM buffer size %zu, expected %zu\n", + lcm_buf_size, lcm_size); + RETURN(-EINVAL); + } + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + size_t blob_offset; + size_t blob_size; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + + if (lcm_size < blob_offset || lcm_size < blob_size || + lcm_size < blob_offset + blob_size) { + CERROR("LCM entry %u has invalid blob: " + "LCM size = %zu, offset = %zu, size = %zu\n", + le32_to_cpu(lcme->lcme_id), + lcm_size, blob_offset, blob_size); + RETURN(-EINVAL); + } + } + + return 0; +} + +static struct lov_stripe_md_entry * +lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, + size_t lmm_buf_size, bool inited, loff_t *maxbytes) +{ + unsigned int magic; + unsigned int stripe_count; + + stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + if (stripe_count == 0) + RETURN(ERR_PTR(-EINVAL)); + /* un-instantiated lmm contains no ost id info, i.e. 
lov_ost_data_v1 */ + if (!inited) + stripe_count = 0; + + magic = le32_to_cpu(lmm->lmm_magic); + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) + RETURN(ERR_PTR(-EINVAL)); + + if (lmm_buf_size < lov_mds_md_size(stripe_count, magic)) + RETURN(ERR_PTR(-EINVAL)); + + if (magic == LOV_MAGIC_V1) { + return lsme_unpack(lov, lmm, lmm_buf_size, NULL, + inited, lmm->lmm_objects, maxbytes); + } else { + struct lov_mds_md_v3 *lmm3 = (struct lov_mds_md_v3 *)lmm; + + return lsme_unpack(lov, lmm, lmm_buf_size, lmm3->lmm_pool_name, + inited, lmm3->lmm_objects, maxbytes); + } +} + +static struct lov_stripe_md * +lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) +{ + struct lov_comp_md_v1 *lcm = buf; + struct lov_stripe_md *lsm; + size_t lsm_size; + unsigned int entry_count = 0; + unsigned int i; + loff_t maxbytes; + int rc; + + rc = lsm_verify_comp_md_v1(buf, buf_size); + if (rc < 0) + return ERR_PTR(rc); + + entry_count = le16_to_cpu(lcm->lcm_entry_count); + + lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); + OBD_ALLOC(lsm, lsm_size); + if (lsm == NULL) + return ERR_PTR(-ENOMEM); + + atomic_set(&lsm->lsm_refc, 1); + spin_lock_init(&lsm->lsm_lock); + lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); + lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); + lsm->lsm_entry_count = entry_count; + lsm->lsm_is_released = true; + lsm->lsm_maxbytes = LLONG_MIN; + + for (i = 0; i < entry_count; i++) { + struct lov_comp_md_entry_v1 *lcme = &lcm->lcm_entries[i]; + struct lov_stripe_md_entry *lsme; + size_t blob_offset; + size_t blob_size; + void *blob; + + blob_offset = le32_to_cpu(lcme->lcme_offset); + blob_size = le32_to_cpu(lcme->lcme_size); + blob = (char *)lcm + blob_offset; + + lsme = lsme_unpack_comp(lov, blob, blob_size, + le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT, + (i == entry_count - 1) ? &maxbytes : + NULL); + if (IS_ERR(lsme)) + GOTO(out_lsm, rc = PTR_ERR(lsme)); + + if (!(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + lsm->lsm_is_released = false; + + lsm->lsm_entries[i] = lsme; + lsme->lsme_id = le32_to_cpu(lcme->lcme_id); + lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); + lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); + + if (i == entry_count - 1) { + lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + + maxbytes; + /* the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
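+	 * (for example, if e_start is already very large, e_start + maxbytes
+	 * can wrap negative; the check below treats that case as overflow)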
*/ + if (lsme->lsme_extent.e_end != LUSTRE_EOF || + lsm->lsm_maxbytes < + (loff_t)lsme->lsme_extent.e_start) + lsm->lsm_maxbytes = MAX_LFS_FILESIZE; + } + } + + RETURN(lsm); + +out_lsm: + for (i = 0; i < entry_count; i++) + if (lsm->lsm_entries[i] != NULL) + lsme_free(lsm->lsm_entries[i]); + + OBD_FREE(lsm, lsm_size); + + RETURN(ERR_PTR(rc)); +} + +const struct lsm_operations lsm_comp_md_v1_ops = { + .lsm_unpackmd = lsm_unpackmd_comp_md_v1, +}; + +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm) +{ + int i, j; + + CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes %#llx, magic 0x%08X, " + "refc: %d, entry: %u, layout_gen %u\n", + lsm, POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + atomic_read(&lsm->lsm_refc), lsm->lsm_entry_count, + lsm->lsm_layout_gen); + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + CDEBUG(level, DEXT ": id: %u, flags: %x, " + "magic 0x%08X, layout_gen %u, " + "stripe count %u, sstripe size %u, " + "pool: ["LOV_POOLNAMEF"]\n", + PEXT(&lse->lsme_extent), lse->lsme_id, lse->lsme_flags, + lse->lsme_magic, lse->lsme_layout_gen, + lse->lsme_stripe_count, lse->lsme_stripe_size, + lse->lsme_pool_name); + if (!lsme_inited(lse) || + lse->lsme_pattern & LOV_PATTERN_F_RELEASED) + continue; + for (j = 0; j < lse->lsme_stripe_count; j++) { + CDEBUG(level, " oinfo:%p: ostid: "DOSTID + " ost idx: %d gen: %d\n", + lse->lsme_oinfo[j], + POSTID(&lse->lsme_oinfo[j]->loi_oi), + lse->lsme_oinfo[j]->loi_ost_idx, + lse->lsme_oinfo[j]->loi_ost_gen); + } + } +} + +int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset) +{ + int i; + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + if ((offset >= lse->lsme_extent.e_start && + offset < lse->lsme_extent.e_end) || + (offset == OBD_OBJECT_EOF && + lse->lsme_extent.e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h new file mode 100644 index 0000000000000..4ced4d31f76b6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -0,0 +1,364 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LOV_INTERNAL_H +#define LOV_INTERNAL_H + +#include +#include + +/* If we are unable to get the maximum object size from the OST in + * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using + * the old maximum object size from ext3. */ +#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL + +struct lov_stripe_md_entry { + struct lu_extent lsme_extent; + u32 lsme_id; + u32 lsme_magic; + u32 lsme_flags; + u32 lsme_pattern; + u32 lsme_stripe_size; + u16 lsme_stripe_count; + u16 lsme_layout_gen; + char lsme_pool_name[LOV_MAXPOOLNAME + 1]; + struct lov_oinfo *lsme_oinfo[]; +}; + +static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst, + struct lov_stripe_md_entry *src) +{ + unsigned i; + + for (i = 0; i < src->lsme_stripe_count; i++) + *dst->lsme_oinfo[i] = *src->lsme_oinfo[i]; + memcpy(dst, src, offsetof(typeof(*src), lsme_oinfo)); +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + /* maximum possible file size, might change as OSTs status changes, + * e.g. disconnected, deactivated */ + loff_t lsm_maxbytes; + struct ost_id lsm_oi; + u32 lsm_magic; + u32 lsm_layout_gen; + u32 lsm_entry_count; + bool lsm_is_released; + struct lov_stripe_md_entry *lsm_entries[]; +}; + +static inline bool lsme_inited(const struct lov_stripe_md_entry *lsme) +{ + return lsme->lsme_flags & LCME_FL_INIT; +} + +static inline bool lsm_entry_inited(const struct lov_stripe_md *lsm, int index) +{ + return lsme_inited(lsm->lsm_entries[index]); +} + +static inline bool lsm_is_composite(__u32 magic) +{ + return magic == LOV_MAGIC_COMP_V1; +} + +static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) +{ + struct lov_stripe_md_entry *lsme; + size_t size; + int entry; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_entries[0]->lsme_magic); + + LASSERT(lsm->lsm_magic == LOV_MAGIC_COMP_V1); + + size = sizeof(struct lov_comp_md_v1); + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + + if (lsme_inited(lsme)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + size += sizeof(*lsme); + size += lov_mds_md_size(lsme->lsme_stripe_count, + lsme->lsme_magic); + } + + return size; +} + +static inline bool lsm_has_objects(struct lov_stripe_md *lsm) +{ + return lsm != NULL && !lsm->lsm_is_released; +} + +static inline unsigned int lov_comp_index(int entry, int stripe) +{ + LASSERT(entry >= 0 && entry <= SHRT_MAX); + LASSERT(stripe >= 0 && stripe < USHRT_MAX); + + return entry << 16 | stripe; +} + +static inline int lov_comp_stripe(int index) +{ + return index & 0xffff; +} + +static inline int lov_comp_entry(int index) +{ + return index >> 16; +} + +struct lsm_operations { + struct lov_stripe_md *(*lsm_unpackmd)(struct lov_obd *, void *, size_t); +}; + +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_v3_ops; +extern const struct lsm_operations lsm_comp_md_v1_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) +{ + switch (magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + case LOV_MAGIC_COMP_V1: + return &lsm_comp_md_v1_ops; + default: + CERROR("unrecognized lsm_magic %08x\n", magic); + return NULL; + } +} + +void lsm_free(struct lov_stripe_md *lsm); + +/* lov_do_div64(a, b) returns a % b, and a = a / b. 
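+ * For example (hypothetical values): with n = 5 MiB + 123 and base = 1 MiB,
+ * lov_do_div64(n, base) returns 123 and leaves n == 5.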
+ * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. */ +#if BITS_PER_LONG == 64 +# define lov_do_div64(n, base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ +}) +#elif BITS_PER_LONG == 32 +# define lov_do_div64(n, base) ({ \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ + "division %llu / %llu\n", (n), (uint64_t)(base)); \ + __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ + (n) >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(n, base); \ + } \ + __rem; \ +}) +#endif + +#define pool_tgt_count(p) ((p)->pool_obds.op_count) +#define pool_tgt_array(p) ((p)->pool_obds.op_array) +#define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem) + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; + struct ost_pool pool_obds; + atomic_t pool_refcount; + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + struct proc_dir_entry *pool_proc_entry; + struct obd_device *pool_lobd; /* owner */ +}; + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + struct list_head rq_link; + int rq_idx; /* index in lov->tgts array */ +}; + +struct lov_request_set { + struct obd_info *set_oi; + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + struct list_head set_list; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, + u64 ost_size, int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obd_off); +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off); +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe); + +/* lov_request.c */ +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +int lov_fini_statfs_set(struct lov_request_set *set); + +/* lov_obd.c */ +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, + __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct 
obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen); + +/* lov_pack.c */ +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size); +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); +void lov_dump_lmm_common(int level, void *lmmp); +void lov_dump_lmm(int level, void *lmm); + +/* lov_ea.c */ +void lsm_free_plain(struct lov_stripe_md *lsm); +void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); + +/* lproc_lov.c */ +extern struct file_operations lov_proc_target_fops; +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_lov_obd_vars[]; +#endif + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +/* pools */ +extern struct cfs_hash_ops pool_hash_operations; +/* ost_pool methods */ +int lov_ost_pool_init(struct ost_pool *op, unsigned int count); +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); +int lov_ost_pool_free(struct ost_pool *op); + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char *poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); +void lov_dump_pool(int level, struct pool_desc *pool); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) +{ + if (unlikely(loi->loi_oi.oi.oi_id == 0 && + loi->loi_oi.oi.oi_seq == 0 && + loi->loi_ost_idx == 0 && + loi->loi_ost_gen == 0)) + return true; + + return false; +} + +static inline struct obd_device *lov2obd(const struct lov_obd *lov) +{ + return container_of0(lov, struct obd_device, u.lov); +} + +static inline void lov_lsm2layout(struct lov_stripe_md *lsm, + struct lov_stripe_md_entry *lsme, + struct ost_layout *ol) +{ + ol->ol_stripe_size = lsme->lsme_stripe_size; + ol->ol_stripe_count = lsme->lsme_stripe_count; + if (lsm->lsm_magic == LOV_MAGIC_COMP_V1) { + ol->ol_comp_start = lsme->lsme_extent.e_start; + ol->ol_comp_end = lsme->lsme_extent.e_end; + ol->ol_comp_id = lsme->lsme_id; + } else { + ol->ol_comp_start = 0; + ol->ol_comp_end = 0; + ol->ol_comp_id = 0; + } +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c new file mode 100644 index 0000000000000..f40dfa274c356 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -0,0 +1,1224 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + + if (lio->lis_nr_subios == 0) { + LASSERT(lio->lis_single_subio_index == -1); + sub = &lio->lis_single_subio; + lio->lis_single_subio_index = index; + memset(sub, 0, sizeof(*sub)); + } else { + OBD_ALLOC_PTR(sub); + } + + if (sub != NULL) { + INIT_LIST_HEAD(&sub->sub_list); + INIT_LIST_HEAD(&sub->sub_linkage); + sub->sub_subio_index = index; + } + + return sub; +} + +static inline void lov_sub_free(struct lov_io *lio, struct lov_io_sub *sub) +{ + if (sub->sub_subio_index == lio->lis_single_subio_index) { + LASSERT(sub == &lio->lis_single_subio); + lio->lis_single_subio_index = -1; + } else { + OBD_FREE_PTR(sub); + } +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + ENTRY; + + cl_io_fini(sub->sub_env, &sub->sub_io); + + if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } + EXIT; +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + int result = 0; + LASSERT(sub->sub_env == NULL); + ENTRY; + + if (unlikely(!lov_r0(lov, index)->lo_sub || + !lov_r0(lov, index)->lo_sub[stripe])) + RETURN(-EIO); + + /* obtain new environment */ + sub->sub_env = cl_env_get(&sub->sub_refcheck); + if (IS_ERR(sub->sub_env)) + result = PTR_ERR(sub->sub_env); + + sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); + sub_io = &sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + sub_io->ci_noatime = io->ci_noatime; + sub_io->ci_pio = io->ci_pio; + + result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); + + if (result < 0) + lov_io_sub_fini(env, lio, sub); + + RETURN(result); +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int index) +{ + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + + list_for_each_entry(sub, &lio->lis_subios, sub_list) { + if (sub->sub_subio_index == index) { + rc = 1; + break; + } + } + + if (rc == 0) { + sub = lov_sub_alloc(lio, index); + if (sub == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lov_io_sub_init(env, lio, sub); + if (rc < 0) { + lov_sub_free(lio, sub); + GOTO(out, rc); + } + + list_add_tail(&sub->sub_list, 
&lio->lis_subios); + lio->lis_nr_subios++; + } +out: + if (rc < 0) + sub = ERR_PTR(rc); + RETURN(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ + +int lov_page_index(const struct cl_page *page) +{ + const struct cl_page_slice *slice; + ENTRY; + + slice = cl_page_at(page, &lov_device_type); + LASSERT(slice != NULL); + LASSERT(slice->cpl_obj != NULL); + + RETURN(cl2lov_page(slice)->lps_index); +} + +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + ENTRY; + + LASSERT(lio->lis_object != NULL); + + INIT_LIST_HEAD(&lio->lis_subios); + lio->lis_single_subio_index = -1; + lio->lis_nr_subios = 0; + + RETURN(0); +} + +static int lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + ENTRY; + + io->ci_result = 0; + lio->lis_object = obj; + + LASSERT(obj->lo_lsm != NULL); + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.rw_range.cir_pos; + lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + + /* If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. */ + if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & + LOV_PATTERN_F_HOLE)) + RETURN(-EIO); + + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_DATA_VERSION: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_FSYNC: { + lio->lis_pos = io->u.ci_fsync.fi_start; + lio->lis_endpos = io->u.ci_fsync.fi_end; + break; + } + + case CIT_LADVISE: { + lio->lis_pos = io->u.ci_ladvise.li_start; + lio->lis_endpos = io->u.ci_ladvise.li_end; + break; + } + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + RETURN(0); +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *lov = cl2lov(ios->cis_obj); + + ENTRY; + + LASSERT(list_empty(&lio->lis_active)); + + while (!list_empty(&lio->lis_subios)) { + struct lov_io_sub *sub = list_entry(lio->lis_subios.next, + struct lov_io_sub, + sub_list); + + list_del_init(&sub->sub_list); + lio->lis_nr_subios--; + + lov_io_sub_fini(env, lio, sub); + lov_sub_free(lio, sub); + } + LASSERT(lio->lis_nr_subios == 0); + + LASSERT(atomic_read(&lov->lo_active_ios) > 0); + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); + EXIT; +} + +static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, + loff_t start, loff_t end) +{ + struct cl_io *io = &sub->sub_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + int index = lov_comp_entry(sub->sub_subio_index); + int stripe = lov_comp_stripe(sub->sub_subio_index); + + io->ci_pio = parent->ci_pio; + switch (io->ci_type) { + case CIT_SETATTR: { + io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; + io->u.ci_setattr.sa_attr_flags = + parent->u.ci_setattr.sa_attr_flags; + 
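+		/* The cl_io_is_trunc() branch below converts the file-level
+		 * truncate size into a per-stripe size via lov_size_to_stripe();
+		 * e.g. with a hypothetical 3-stripe, 1 MiB stripe_size layout,
+		 * truncating to 5 MiB truncates the stripe objects to
+		 * 2 MiB, 2 MiB and 1 MiB respectively. */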
io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_stripe_index = stripe; + io->u.ci_setattr.sa_parent_fid = + parent->u.ci_setattr.sa_parent_fid; + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; + + new_size = lov_size_to_stripe(lsm, index, new_size, + stripe); + io->u.ci_setattr.sa_attr.lvb_size = new_size; + } + lov_lsm2layout(lsm, lsm->lsm_entries[index], + &io->u.ci_setattr.sa_layout); + break; + } + case CIT_DATA_VERSION: { + io->u.ci_data_version.dv_data_version = 0; + io->u.ci_data_version.dv_flags = + parent->u.ci_data_version.dv_flags; + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, index, off, stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask; + io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter; + io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb; + io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file; + io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync; + if (cl_io_is_append(parent)) { + io->u.ci_rw.rw_append = 1; + } else { + io->u.ci_rw.rw_range.cir_pos = start; + io->u.ci_rw.rw_range.cir_count = end - start; + } + break; + } + case CIT_LADVISE: { + io->u.ci_ladvise.li_start = start; + io->u.ci_ladvise.li_end = end; + io->u.ci_ladvise.li_fid = parent->u.ci_ladvise.li_fid; + io->u.ci_ladvise.li_advice = parent->u.ci_ladvise.li_advice; + io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; + break; + } + default: + break; + } +} + +static loff_t lov_offset_mod(loff_t val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + struct lov_layout_entry *le; + struct lu_extent ext; + int index; + int rc = 0; + + ENTRY; + + ext.e_start = lio->lis_pos; + ext.e_end = lio->lis_endpos; + + index = 0; + lov_foreach_layout_entry(lio->lis_object, le) { + struct lov_layout_raid0 *r0 = &le->lle_raid0; + u64 start; + u64 end; + int stripe; + + index++; + if (!lu_extent_is_overlapped(&ext, &le->lle_extent)) + continue; + + CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", + index - 1, lsm->lsm_entries[index - 1]->lsme_flags); + if (!lsm_entry_inited(lsm, index - 1)) { + /* truncate IO will trigger write intent as well, and + * it's handled in lov_io_setattr_iter_init() */ + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) { + io->ci_need_write_intent = 1; + /* execute it in main thread */ + io->ci_pio = 0; + rc = -ENODATA; + break; + } + + /* Read from uninitialized components should return + * zero filled pages. 
*/ + continue; + } + + for (stripe = 0; stripe < r0->lo_nr; stripe++) { + if (!lov_stripe_intersects(lsm, index - 1, stripe, + &ext, &start, &end)) + continue; + + if (unlikely(r0->lo_sub[stripe] == NULL)) { + if (ios->cis_io->ci_type == CIT_READ || + ios->cis_io->ci_type == CIT_WRITE || + ios->cis_io->ci_type == CIT_FAULT) + RETURN(-EIO); + + continue; + } + + end = lov_offset_mod(end, 1); + sub = lov_sub_get(env, lio, + lov_comp_index(index - 1, stripe)); + if (IS_ERR(sub)) { + rc = PTR_ERR(sub); + break; + } + + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) + cl_io_iter_fini(sub->sub_env, &sub->sub_io); + if (rc != 0) + break; + + CDEBUG(D_VFSTRACE, + "shrink stripe: {%d, %d} range: [%llu, %llu)\n", + index, stripe, start, end); + + list_add_tail(&sub->sub_linkage, &lio->lis_active); + } + if (rc != 0) + break; + } + RETURN(rc); +} + +static int lov_io_rw_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_stripe_md_entry *lse; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + loff_t start = range->cir_pos; + loff_t next; + int index; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + if (cl_io_is_append(io)) + RETURN(lov_io_iter_init(env, ios)); + + index = lov_lsm_entry(lsm, range->cir_pos); + if (index < 0) { /* non-existing layout component */ + if (io->ci_type == CIT_READ) { + /* TODO: it needs to detect the next component and + * then set the next pos */ + io->ci_continue = 0; + /* execute it in main thread */ + io->ci_pio = 0; + + RETURN(lov_io_iter_init(env, ios)); + } + + RETURN(-ENODATA); + } + + lse = lov_lse(lio->lis_object, index); + + next = MAX_LFS_FILESIZE; + if (lse->lsme_stripe_count > 1) { + unsigned long ssize = lse->lsme_stripe_size; + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = MAX_LFS_FILESIZE; + } + + LASSERTF(range->cir_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", range->cir_pos, + lse->lsme_extent.e_start, lse->lsme_extent.e_end); + next = min_t(__u64, next, lse->lsme_extent.e_end); + next = min_t(loff_t, next, lio->lis_io_endpos); + + io->ci_continue = next < lio->lis_io_endpos; + range->cir_count = next - range->cir_pos; + lio->lis_pos = range->cir_pos; + lio->lis_endpos = range->cir_pos + range->cir_count; + CDEBUG(D_VFSTRACE, + "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n", + index, start, lio->lis_pos, lio->lis_endpos, + lio->lis_io_endpos, range->cir_count); + + if (!io->ci_continue) { + /* the last piece of IO, execute it in main thread */ + io->ci_pio = 0; + } + + if (io->ci_pio) { + /* it only splits IO here for parallel IO, + * there will be no actual IO going to occur, + * so it doesn't need to invoke lov_io_iter_init() + * to initialize sub IOs. */ + if (!lsm_entry_inited(lsm, index)) { + io->ci_need_write_intent = 1; + RETURN(-ENODATA); + } + RETURN(0); + } + + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. 
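+	 * (the clamping of cir_count above guarantees the range stays within
+	 * a single stripe of a single component)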
+ */ + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + int index; + ENTRY; + + if (cl_io_is_trunc(io) && lio->lis_pos > 0) { + index = lov_lsm_entry(lsm, lio->lis_pos - 1); + if (index > 0 && !lsm_entry_inited(lsm, index)) { + io->ci_need_write_intent = 1; + RETURN(io->ci_result = -ENODATA); + } + } + + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + rc = iofunc(sub->sub_env, &sub->sub_io); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + RETURN(rc); +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + RETURN(0); +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + RETURN(0); +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void +lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + lov_io_end_wrapper(env, &sub->sub_io); + + parent->u.ci_data_version.dv_data_version += + sub->sub_io.u.ci_data_version.dv_data_version; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io.ci_result; + } + + EXIT; +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); + EXIT; +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; +} + +static int lov_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *loo = lio->lis_object; + struct cl_object *obj = lov2cl(loo); + struct lov_layout_raid0 *r0; + struct lov_io_sub *sub; + loff_t offset; + loff_t suboff; + 
pgoff_t ra_end; + unsigned int pps; /* pages per stripe */ + int stripe; + int index; + int rc; + ENTRY; + + offset = cl_offset(obj, start); + index = lov_lsm_entry(loo->lo_lsm, offset); + if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) + RETURN(-ENODATA); + + stripe = lov_stripe_number(loo->lo_lsm, index, offset); + + r0 = lov_r0(loo, index); + if (unlikely(r0->lo_sub[stripe] == NULL)) + RETURN(-EIO); + + sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + + lov_stripe_offset(loo->lo_lsm, index, offset, stripe, &suboff); + rc = cl_io_read_ahead(sub->sub_env, &sub->sub_io, + cl_index(lovsub2cl(r0->lo_sub[stripe]), suboff), + ra); + + CDEBUG(D_READA, DFID " cra_end = %lu, stripes = %d, rc = %d\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end, r0->lo_nr, rc); + if (rc != 0) + RETURN(rc); + + /** + * Adjust the stripe index by layout of comp. ra->cra_end is the + * maximum page index covered by an underlying DLM lock. + * This function converts cra_end from stripe level to file level, and + * make sure it's not beyond stripe and component boundary. + */ + + /* cra_end is stripe level, convert it into file level */ + ra_end = ra->cra_end; + if (ra_end != CL_PAGE_EOF) + ra->cra_end = lov_stripe_pgoff(loo->lo_lsm, index, + ra_end, stripe); + + /* boundary of current component */ + ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end); + if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end) + ra->cra_end = ra_end - 1; + + if (r0->lo_nr == 1) /* single stripe file */ + RETURN(0); + + pps = lov_lse(loo, index)->lsme_stripe_size >> PAGE_SHIFT; + + CDEBUG(D_READA, DFID " max_index = %lu, pps = %u, index = %u, " + "stripe_size = %u, stripe no = %u, start index = %lu\n", + PFID(lu_object_fid(lov2lu(loo))), ra->cra_end, pps, index, + lov_lse(loo, index)->lsme_stripe_size, stripe, start); + + /* never exceed the end of the stripe */ + ra->cra_end = min_t(pgoff_t, + ra->cra_end, start + pps - start % pps - 1); + RETURN(0); +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. + * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. 
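+ * Rough flow, as implemented below: peel the leading run of pages sharing
+ * lov_page_index() off qin, submit that run through the matching sub-io
+ * with cl_io_submit_rw(), splice submitted pages to c2_qout and leftovers
+ * back to qin, and repeat until qin is empty.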
+ */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page_list *qin = &queue->c2_qin; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct cl_page *page; + int index; + int rc = 0; + ENTRY; + + if (lio->lis_nr_subios == 1) { + int idx = lio->lis_single_subio_index; + + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub == &lio->lis_single_subio); + rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, + crt, queue); + RETURN(rc); + } + + cl_page_list_init(plist); + while (qin->pl_nr > 0) { + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + + cl_2queue_init(cl2q); + + page = cl_page_list_first(qin); + cl_page_list_move(&cl2q->c2_qin, qin, page); + + index = lov_page_index(page); + while (qin->pl_nr > 0) { + page = cl_page_list_first(qin); + if (index != lov_page_index(page)) + break; + + cl_page_list_move(&cl2q->c2_qin, qin, page); + } + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, + crt, cl2q); + } else { + rc = PTR_ERR(sub); + } + + cl_page_list_splice(&cl2q->c2_qin, plist); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + cl_2queue_fini(env, cl2q); + + if (rc != 0) + break; + } + + cl_page_list_splice(plist, qin); + cl_page_list_fini(env, plist); + + RETURN(rc); +} + +static int lov_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + struct cl_page_list *plist = &lov_env_info(env)->lti_plist; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + struct cl_page *page; + int rc = 0; + ENTRY; + + if (lio->lis_nr_subios == 1) { + int idx = lio->lis_single_subio_index; + + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub == &lio->lis_single_subio); + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, queue, + from, to, cb); + RETURN(rc); + } + + cl_page_list_init(plist); + while (queue->pl_nr > 0) { + int stripe_to = to; + int index; + + LASSERT(plist->pl_nr == 0); + page = cl_page_list_first(queue); + cl_page_list_move(plist, queue, page); + + index = lov_page_index(page); + while (queue->pl_nr > 0) { + page = cl_page_list_first(queue); + if (index != lov_page_index(page)) + break; + + cl_page_list_move(plist, queue, page); + } + + if (queue->pl_nr > 0) /* still has more pages */ + stripe_to = PAGE_SIZE; + + sub = lov_sub_get(env, lio, index); + if (!IS_ERR(sub)) { + rc = cl_io_commit_async(sub->sub_env, &sub->sub_io, + plist, from, stripe_to, cb); + } else { + rc = PTR_ERR(sub); + break; + } + + if (plist->pl_nr > 0) /* short write */ + break; + + from = 0; + } + + /* for error case, add the page back into the qin list */ + LASSERT(ergo(rc == 0, plist->pl_nr == 0)); + while (plist->pl_nr > 0) { + /* error occurred, add the uncommitted pages back into queue */ + page = cl_page_list_last(plist); + cl_page_list_move_head(queue, plist, page); + } + + RETURN(rc); +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + + ENTRY; + + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + sub = lov_sub_get(env, lio, lov_page_index(fio->ft_page)); + sub->sub_io.u.ci_fault.ft_nob = fio->ft_nob; + + RETURN(lov_io_start(env, ios)); +} + +static void 
lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + ENTRY; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = &sub->sub_io; + + lov_io_end_wrapper(sub->sub_env, subio); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } + RETURN_EXIT; +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_setattr_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_DATA_VERSION] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_LADVISE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .cio_read_ahead = lov_io_read_ahead, + .cio_submit = lov_io_submit, + .cio_commit_async = lov_io_commit_async, +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + ENTRY; + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); + EXIT; +} + +static int lov_empty_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + return -EBADF; +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
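+ * lov_io_init_empty() rejects writes (-EBADF) and page faults (-EFAULT)
+ * before these operations are reached, so only io types that can complete
+ * without touching a stripe end up here.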
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_LADVISE] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .cio_submit = lov_empty_io_submit, + .cio_commit_async = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + + ENTRY; + INIT_LIST_HEAD(&lio->lis_active); + io->ci_result = lov_io_slice_init(lio, lov, io); + if (io->ci_result != 0) + RETURN(io->ci_result); + + if (io->ci_result == 0) { + io->ci_result = lov_io_subio_init(env, lio, io); + if (io->ci_result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + } + RETURN(io->ci_result); +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_SETATTR: + case CIT_DATA_VERSION: + result = +1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? 
result : 0; + RETURN(result); +} + +int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + LASSERT(lov->lo_lsm != NULL); + lio->lis_object = lov; + + switch (io->ci_type) { + default: + LASSERTF(0, "invalid type %d\n", io->ci_type); + result = -EOPNOTSUPP; + break; + case CIT_MISC: + case CIT_FSYNC: + case CIT_LADVISE: + case CIT_DATA_VERSION: + result = 1; + break; + case CIT_SETATTR: + /* the truncate to 0 is managed by MDT: + * - in open, for open O_TRUNC + * - in setattr, for truncate + */ + /* the truncate is for size > 0 so triggers a restore */ + if (cl_io_is_trunc(io)) { + io->ci_restore_needed = 1; + result = -ENODATA; + } else + result = 1; + break; + case CIT_READ: + case CIT_WRITE: + case CIT_FAULT: + io->ci_restore_needed = 1; + result = -ENODATA; + break; + } + + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + RETURN(result); +} +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c new file mode 100644 index 0000000000000..efa4cc11ea94e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -0,0 +1,377 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, + const struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO infomation(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. 
As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ + if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { + subenv->lse_env = env; + subenv->lse_io = io; + } else { + sub = lov_sub_get(env, lio, lls->sub_index); + if (!IS_ERR(sub)) { + subenv->lse_env = sub->sub_env; + subenv->lse_io = &sub->sub_io; + } else { + subenv = (void *)sub; + } + } + return subenv; +} + +static int lov_sublock_init(const struct lu_env *env, + const struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + int result; + ENTRY; + + subenv = lov_sublock_env_get(env, parent, lls); + if (!IS_ERR(subenv)) { + result = cl_lock_init(subenv->lse_env, &lls->sub_lock, + subenv->lse_io); + } else { + /* error occurs. */ + result = PTR_ERR(subenv); + } + RETURN(result); +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of top-object, and creates sub-locks on every + * sub-object intersecting with top-lock extent. This is complicated by the + * fact that top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). + */ +static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, + const struct cl_object *obj, + struct cl_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_lock *lovlck; + struct lu_extent ext; + loff_t start; + loff_t end; + int result = 0; + int i; + int index; + int nr; + + ENTRY; + + ext.e_start = cl_offset(obj, lock->cll_descr.cld_start); + if (lock->cll_descr.cld_end == CL_PAGE_EOF) + ext.e_end = OBD_OBJECT_EOF; + else + ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1); + + nr = 0; + for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); + index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + /* assume lsm entries are sorted. */ + if (!lu_extent_is_overlapped(&ext, + &lov_lse(lov, index)->lsme_extent)) + break; + + for (i = 0; i < r0->lo_nr; i++) { + if (likely(r0->lo_sub[i] != NULL) && /* spare layout */ + lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) + nr++; + } + } + /** + * Aggressive lock request (from cl_setattr_ost) which asks for + * [eof, -1) lock, could come across uninstantiated layout extent, + * hence a 0 nr is possible. + */ + + OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); + if (lovlck == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lovlck->lls_nr = nr; + nr = 0; + for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); + index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + /* assume lsm entries are sorted. 
*/ + if (!lu_extent_is_overlapped(&ext, + &lov_lse(lov, index)->lsme_extent)) + break; + for (i = 0; i < r0->lo_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[nr]; + struct cl_lock_descr *descr = &lls->sub_lock.cll_descr; + + if (unlikely(r0->lo_sub[i] == NULL) || + !lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) + continue; + + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = lock->cll_descr.cld_mode; + descr->cld_gid = lock->cll_descr.cld_gid; + descr->cld_enq_flags = lock->cll_descr.cld_enq_flags; + + lls->sub_index = lov_comp_index(index, i); + + /* initialize sub lock */ + result = lov_sublock_init(env, lock, lls); + if (result < 0) + break; + + lls->sub_initialized = 1; + nr++; + } + } + LASSERT(ergo(result == 0, nr == lovlck->lls_nr)); + + if (result != 0) { + for (i = 0; i < nr; ++i) { + if (!lovlck->lls_sub[i].sub_initialized) + break; + + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[nr])); + lovlck = ERR_PTR(result); + } + + RETURN(lovlck); +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lovlck; + int i; + + ENTRY; + lovlck = cl2lov_lock(slice); + for (i = 0; i < lovlck->lls_nr; ++i) { + LASSERT(!lovlck->lls_sub[i].sub_is_enqueued); + if (lovlck->lls_sub[i].sub_initialized) + cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); + } + OBD_FREE_LARGE(lovlck, + offsetof(struct lov_lock, lls_sub[lovlck->lls_nr])); + EXIT; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. 
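+ * Each sub-lock is enqueued through its own sub-environment; a failure
+ * stops the loop, and only sub-locks already flagged sub_is_enqueued are
+ * later undone by lov_lock_cancel().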
+ */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, struct cl_sync_io *anchor) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct lov_sublock_env *subenv; + + subenv = lov_sublock_env_get(env, lock, lls); + if (IS_ERR(subenv)) { + rc = PTR_ERR(subenv); + break; + } + + rc = cl_lock_enqueue(subenv->lse_env, subenv->lse_io, + &lls->sub_lock, anchor); + if (rc != 0) + break; + + lls->sub_is_enqueued = 1; + } + RETURN(rc); +} + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + + ENTRY; + + for (i = 0; i < lovlck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; + + if (!lls->sub_is_enqueued) + continue; + + lls->sub_is_enqueued = 0; + subenv = lov_sublock_env_get(env, lock, lls); + if (!IS_ERR(subenv)) { + cl_lock_cancel(subenv->lse_env, sublock); + } else { + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %ld.\n", + PTR_ERR(subenv)); + } + } +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); + cl_lock_print(env, cookie, p, &sub->sub_lock); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print +}; + +int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = 0; + + ENTRY; + lck = lov_lock_sub_init(env, obj, lock); + if (!IS_ERR(lck)) + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + else + result = PTR_ERR(lck); + RETURN(result); +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. 
*/ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + result = 0; + } + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c new file mode 100644 index 0000000000000..de9e4298dd884 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -0,0 +1,109 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. 
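+ * For example (hypothetical stripes): if the per-stripe sizes map back to
+ * file sizes of 3 MiB and 5 MiB, the merged lvb_size is 5 MiB, lvb_blocks
+ * is the sum of the per-stripe blocks, and each timestamp is the newest
+ * value seen across stripes.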
+ */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, + struct ost_lvb *lvb, __u64 *kms_place) +{ + struct lov_stripe_md_entry *lse = lsm->lsm_entries[index]; + u64 size = 0; + u64 kms = 0; + u64 blocks = 0; + s64 current_mtime = lvb->lvb_mtime; + s64 current_atime = lvb->lvb_atime; + s64 current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + assert_spin_locked(&lsm->lsm_lock); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, + lvb->lvb_blocks); + for (i = 0; i < lse->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lse->lsme_oinfo[i]; + u64 lov_size; + u64 tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, index, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu" + " a=%llu c=%llu b=%llu\n", POSTID(&lsm->lsm_oi), + loi->loi_ost_idx, loi->loi_lvb.lvb_size, + loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime, + loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c new file mode 100644 index 0000000000000..e494abbaedf88 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -0,0 +1,1474 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +static void lov_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); + return; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +static void lov_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + struct list_head kill = LIST_HEAD_INIT(kill); + struct lov_tgt_desc *tgt, *n; + int i; + + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lov_ost_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev); +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev); + +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + int rc; + ENTRY; + + if (lov->lov_tgts[index] == NULL) + RETURN(-EINVAL); + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + RETURN(-EINVAL); + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ + ptlrpc_activate_import(imp); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(rc); + } + + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "not connecting OSC %s; administratively " + "disabled\n", obd_uuid2str(tgt_uuid)); + RETURN(0); + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, NULL); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(-ENODEV); + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + if (lov->targets_proc_entry != NULL) { + struct proc_dir_entry *osc_symlink; + struct obd_device *osc_obd; + + osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; + + LASSERT(osc_obd != NULL); + LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(osc_obd->obd_type->typ_name != NULL); + + osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, + lov->targets_proc_entry, + "../../../%s/%s", + osc_obd->obd_type->typ_name, + osc_obd->obd_name); + if (osc_symlink == NULL) { + CERROR("cannot register LOV target " + "/proc/fs/lustre/%s/%s/target_obds/%s\n", + obd->obd_type->typ_name, obd->obd_name, + osc_obd->obd_name); + } + } + RETURN(0); +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + lov->targets_proc_entry = lprocfs_register("target_obds", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->targets_proc_entry)) { + CERROR("%s: cannot register " + "/proc/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); + lov->targets_proc_entry = NULL; + } + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + obd_putref(obd); + + RETURN(0); +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + ENTRY; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", + obd->obd_name, osc_obd->obd_name); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + if (osc_obd) { + /* Pass it on to our clients. 
+ * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. Ah well. + */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + + if (lov->targets_proc_entry != NULL) + lprocfs_remove_proc_entry(osc_obd->obd_name, + lov->targets_proc_entry); + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i, rc; + ENTRY; + + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CERROR("disconnect #%d\n", lov->lov_connects); + goto out; + } + + /* Let's hold another reference so lov_del_obd doesn't spin through + putref every time */ + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { + /* Disconnection is the last we know about an obd */ + lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); + } + } + obd_putref(obd); + + if (lov->targets_proc_entry != NULL) + lprocfs_remove(&lov->targets_proc_entry); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index, activate, active; + ENTRY; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + obd_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (!tgt) + continue; + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL && + obd_uuid_equals(uuid, &tgt->ltd_uuid)) { + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, NULL); + } + if (!tgt->ltd_exp) + continue; + + CDEBUG(D_INFO, "lov idx %d is %s conn %#llx\n", + index, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_exp->exp_handle.h_cookie); + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) + GOTO(out, index = -EINVAL); + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? 
"" : "in"); + GOTO(out, index); + } else { + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? "" : "in"); + } + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("%s: unknown event %d for uuid %s\n", obd->obd_name, + ev, uuid->uuid); + } + + out: + obd_putref(obd); + RETURN(index); +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + ENTRY; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) + GOTO(out_notify_lock, rc = 0); + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("unexpected notification of %s %s\n", + watched->obd_type->typ_name, watched->obd_name); + GOTO(out_notify_lock, rc = -EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + CERROR("%s: event %d failed: rc = %d\n", obd->obd_name, + ev, rc); + GOTO(out_notify_lock, rc); + } + } + + /* Pass the notification up the chain. */ + rc = obd_notify_observer(obd, watched, ev); + +out_notify_lock: + up_read(&lov->lov_notify_lock); + + RETURN(rc); +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + ENTRY; + + CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("request to add OBD %s with invalid generation: %d\n", + uuidp->uuid, gen); + RETURN(-EINVAL); + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, + &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + CERROR("UUID %s already assigned at LOV target index %d\n", + obd_uuid2str(&tgt->ltd_uuid), index); + mutex_unlock(&lov->lov_lock); + RETURN(-EEXIST); + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. 
*/ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max(lov->lov_tgt_size, (__u32)2); + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + RETURN(rc); + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_obd on this target when that fn first runs, + because we don't know the connect flags yet. */ + RETURN(0); + } + + obd_getref(obd); + + rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); + if (rc) + GOTO(out, rc); + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) + GOTO(out, rc = 0); + + if (lov->lov_cache != NULL) { + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), KEY_CACHE_SET, + sizeof(struct cl_client_cache), lov->lov_cache, + NULL); + if (rc < 0) + GOTO(out, rc); + } + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); + +out: + if (rc) { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_del_target(obd, index, NULL, 0); + } + obd_putref(obd); + RETURN(rc); +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + ENTRY; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + RETURN(-EINVAL); + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + obd_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + GOTO(out, rc = -EINVAL); + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from obd_putref */ +out: + obd_putref(obd); + up_write(&lov->lov_notify_lock); + + RETURN(rc); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. 
*/ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_MIN_STRIPE_SIZE) { + if (*val != 0) + LCONSOLE_INFO("Increasing default stripe size to " + "minimum %u\n", + LOV_DESC_STRIPE_SIZE_DEFAULT); + *val = LOV_DESC_STRIPE_SIZE_DEFAULT; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to %llu (a " + "multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { + LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + if (*val == 0) + *val = LOV_DESC_QOS_MAXAGE_DEFAULT; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; +#ifdef CONFIG_PROC_FS + struct obd_type *type; +#endif + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + RETURN(-EINVAL); + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, + HASH_POOLS_MAX_BITS, + HASH_POOLS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &pool_hash_operations, + CFS_HASH_DEFAULT); + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_ost_pool_init(&lov->lov_packed, 0); + if (rc) + GOTO(out, rc); + +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_lov_obd_vars; + /* If this is true then both client (lov) and server + * (lod) are on the same node. The lod layer if loaded + * first will register the lov proc directory. In that + * case obd->obd_type->typ_procroot will be not set. + * Instead we use type->typ_procsym as the parent. 
*/ + type = class_search_type(LUSTRE_LOD_NAME); + if (type != NULL && type->typ_procsym != NULL) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } else { + rc = lprocfs_obd_setup(obd); + } + + if (rc == 0) { + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lov_proc_target_fops, obd); + if (rc) + CWARN("Error adding the target_obd file\n"); + + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->lov_pool_proc_entry)) { + rc = PTR_ERR(lov->lov_pool_proc_entry); + CERROR("error %d setting up lprocfs for pools\n", rc); + lov->lov_pool_proc_entry = NULL; + } + } +#endif + RETURN(0); + +out: + return rc; +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, *tmp; + struct pool_desc *pool; + ENTRY; + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + cfs_hash_putref(lov->lov_pools_hash_body); + lov_ost_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if (lov->lov_tgts[i]->ltd_active || + atomic_read(&lov->lov_refcount)) + /* We should never get here - these + * should have been removed in the + * disconnect. */ + CERROR("%s: lov tgt %d not cleaned! 
" + "deathrow=%d, lovrc=%d\n", + obd->obd_name, i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, NULL, 0); + } + obd_putref(obd); + OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * + lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + + if (lov->lov_cache != NULL) { + cl_cache_decref(lov->lov_cache); + lov->lov_cache = NULL; + } + + RETURN(0); +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + ENTRY; + + switch(cmd = lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + __u32 index; + int gen; + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", indexp) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) + GOTO(out, rc = -EINVAL); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + GOTO(out, rc); + } + case LCFG_PARAM: { + struct lov_desc *desc = &(obd->u.lov.desc); + + if (!desc) + GOTO(out, rc = -EINVAL); + + rc = class_process_proc_param(PARAM_LOV, obd->obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + GOTO(out, rc); + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + GOTO(out, rc); + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static int +lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + atomic_set(&lovset->set_completes, 0); + + err = lov_fini_statfs_set(lovset); + RETURN(rc ? rc : err); +} + +static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + ENTRY; + + LASSERT(oinfo != NULL); + LASSERT(oinfo->oi_osfs != NULL); + + lov = &obd->u.lov; + rc = lov_prep_statfs_set(obd, oinfo, &set); + if (rc) + RETURN(rc); + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc) + break; + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_statfs_set(set); + RETURN(rc ? 
rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_statfs_interpret; + rqset->set_arg = (void *)set; + RETURN(0); +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { + .oi_osfs = osfs, + .oi_flags = flags, + }; + int rc = 0; + + ENTRY; + + /* for obdclass we forbid using obd_statfs_rqset, but prefer using async + * statfs requests */ + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = lov_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + + ptlrpc_set_destroy(set); + + RETURN(rc); +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; + ENTRY; + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if ((index >= count)) + RETURN(-ENODEV); + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); + if (!lov->lov_tgts[index]->ltd_active) + RETURN(-ENODATA); + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min_t(unsigned long, data->ioc_plen2, + sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); + flags = flags & LL_STATFS_NODELAY ? 
OBD_STATFS_NODELAY : 0; + + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + flags); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min_t(unsigned long, data->ioc_plen1, + sizeof(struct obd_statfs)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_LOV_GET_CONFIG: { + struct obd_ioctl_data *data; + struct lov_desc *desc; + char *buf = NULL; + __u32 *genp; + + len = 0; + if (obd_ioctl_getdata(&buf, &len, uarg)) + RETURN(-EINVAL); + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + OBD_FREE_LARGE(buf, len); + RETURN(-EINVAL); + } + + if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { + OBD_FREE_LARGE(buf, len); + RETURN(-EINVAL); + } + + if (sizeof(__u32) * count > data->ioc_inllen3) { + OBD_FREE_LARGE(buf, len); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + memcpy(desc, &(lov->desc), sizeof(*desc)); + + uuidp = (struct obd_uuid *)data->ioc_inlbuf2; + genp = (__u32 *)data->ioc_inlbuf3; + /* the uuid will be empty for deleted OSTs */ + for (i = 0; i < count; i++, uuidp++, genp++) { + if (!lov->lov_tgts[i]) + continue; + *uuidp = lov->lov_tgts[i]->ltd_uuid; + *genp = lov->lov_tgts[i]->ltd_gen; + } + + if (copy_to_user(uarg, buf, len)) + rc = -EFAULT; + OBD_FREE_LARGE(buf, len); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + RETURN(-ENOTTY); + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force flag but for lov, not + * osc. Let's pass it through */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG(err == -ENOTTY ? 
+ D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST " + "idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + RETURN(rc); +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + struct lov_desc *ld = &lov->desc; + int rc = 0; + ENTRY; + + if (vallen == NULL || val == NULL) + RETURN(-EFAULT); + + obd_getref(obddev); + + if (KEY_IS(KEY_MAX_EASIZE)) { + u32 max_stripe_count = min_t(u32, ld->ld_active_tgt_count, + LOV_MAX_STRIPE_COUNT); + + *((u32 *)val) = lov_mds_md_size(max_stripe_count, LOV_MAGIC_V3); + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + u32 def_stripe_count = min_t(u32, ld->ld_default_stripe_count, + LOV_MAX_STRIPE_COUNT); + + *((u32 *)val) = lov_mds_md_size(def_stripe_count, LOV_MAGIC_V3); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + } else { + rc = -EINVAL; + } + + obd_putref(obddev); + + RETURN(rc); +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + struct lov_tgt_desc *tgt; + int do_inactive = 0; + int no_set = 0; + u32 count; + u32 i; + int rc = 0; + int err; + ENTRY; + + if (set == NULL) { + no_set = 1; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + obd_getref(obddev); + count = lov->desc.ld_tgt_count; + + if (KEY_IS(KEY_CHECKSUM)) { + do_inactive = 1; + } else if (KEY_IS(KEY_CACHE_SET)) { + LASSERT(lov->lov_cache == NULL); + lov->lov_cache = val; + do_inactive = 1; + cl_cache_incref(lov->lov_cache); + } + + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + + /* OST was disconnected */ + if (!tgt || !tgt->ltd_exp) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); + if (!rc) + rc = err; + } + + obd_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(set); + if (!rc) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); +} + +void lov_stripe_lock(struct lov_stripe_md *md) +__acquires(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner != current_pid()); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current_pid(); +} + +void lov_stripe_unlock(struct lov_stripe_md *md) +__releases(&md->lsm_lock) +{ + LASSERT(md->lsm_lock_owner == current_pid()); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + ENTRY; + + if (oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA) { + CERROR("%s: bad quota opc %x for lov obd\n", + obd->obd_name, oqctl->qc_cmd); + RETURN(-EFAULT); + } + + /* for lov tgt */ + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + tgt = lov->lov_tgts[i]; + + if (!tgt) + continue; + + if (!tgt->ltd_active || tgt->ltd_reap) { + if (oqctl->qc_cmd == Q_GETOQUOTA && + lov->lov_tgts[i]->ltd_activate) { + rc = -ENETDOWN; + CERROR("ost %d is 
inactive\n", i); + } else { + CDEBUG(D_HA, "ost %d is inactive\n", i); + } + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + if (tgt->ltd_active && !rc) + rc = err; + continue; + } + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + curspace += oqctl->qc_dqblk.dqb_curspace; + bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; + } + } + obd_putref(obd); + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; + } + RETURN(rc); +} + +static struct obd_ops lov_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lov_setup, + .o_cleanup = lov_cleanup, + .o_connect = lov_connect, + .o_disconnect = lov_disconnect, + .o_statfs = lov_statfs, + .o_statfs_async = lov_statfs_async, + .o_iocontrol = lov_iocontrol, + .o_get_info = lov_get_info, + .o_set_info_async = lov_set_info_async, + .o_notify = lov_notify, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, + .o_getref = lov_getref, + .o_putref = lov_putref, + .o_quotactl = lov_quotactl, +}; + +struct kmem_cache *lov_oinfo_slab; + +static int __init lov_init(void) +{ + bool enable_proc = true; + struct obd_type *type; + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + + lov_oinfo_slab = kmem_cache_create("lov_oinfo", + sizeof(struct lov_oinfo), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); + return -ENOMEM; + } + + type = class_search_type(LUSTRE_LOD_NAME); + if (type != NULL && type->typ_procsym != NULL) + enable_proc = false; + + rc = class_register_type(&lov_obd_ops, NULL, enable_proc, NULL, + LUSTRE_LOV_NAME, &lov_device_type); + + if (rc) { + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); + } + + RETURN(rc); +} + +static void __exit lov_exit(void) +{ + class_unregister_type(LUSTRE_LOV_NAME); + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Object Volume"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(lov_init); +module_exit(lov_exit); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c new file mode 100644 index 0000000000000..87d496d8a68e4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -0,0 +1,1778 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +static inline struct lov_device *lov_object_dev(struct lov_object *obj) +{ + return lu2lov_dev(obj->lo_cl.co_lu.lo_dev); +} + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); + +static void lov_lsm_put(struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} + +/***************************************************************************** + * + * Lov object layout operations. + * + */ +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + ENTRY; + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *subobj, struct lov_layout_raid0 *r0, + struct lov_oinfo *oinfo, int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + int entry = lov_comp_entry(idx); + int stripe = lov_comp_stripe(idx); + int result; + + if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { + /* For sanity:test_206. + * Do not leave the object in cache to avoid accessing + * freed memory. This is because osc_object is referring to + * lov_oinfo of lsm_stripe_data which will be freed due to + * this failure. 
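+ * OBD_FAIL_CHECK() is a fault injection hook: it fires only when the
+ * corresponding fail_loc value has been armed from user space, so this
+ * branch is a no-op during normal operation.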
*/ + cl_object_kill(env, subobj); + cl_object_put(env, subobj); + return -EIO; + } + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(subobj); + + CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID + " ost idx: %d gen: %d\n", + PFID(lu_object_fid(&subobj->co_lu)), subhdr, entry, stripe, + PFID(lu_object_fid(lov2lu(lov))), hdr, POSTID(&oinfo->loi_oi), + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + /* reuse ->coh_attr_guard to protect coh_parent change */ + spin_lock(&subhdr->coh_attr_guard); + parent = subhdr->coh_parent; + if (parent == NULL) { + subhdr->coh_parent = hdr; + spin_unlock(&subhdr->coh_attr_guard); + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&subobj->co_lu, "lov-parent", lov); + r0->lo_sub[stripe] = cl2lovsub(subobj); + r0->lo_sub[stripe]->lso_super = lov; + r0->lo_sub[stripe]->lso_index = idx; + result = 0; + } else { + struct lu_object *old_obj; + struct lov_object *old_lov; + unsigned int mask = D_INODE; + + spin_unlock(&subhdr->coh_attr_guard); + old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); + LASSERT(old_obj != NULL); + old_lov = cl2lov(lu2cl(old_obj)); + if (old_lov->lo_layout_invalid) { + /* the object's layout has already changed but isn't + * refreshed */ + lu_object_unhash(env, &subobj->co_lu); + result = -EAGAIN; + } else { + mask = D_ERROR; + result = -EIO; + } + + LU_OBJECT_DEBUG(mask, env, &subobj->co_lu, + "stripe %d is already owned.", idx); + LU_OBJECT_DEBUG(mask, env, old_obj, "owned."); + LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); + cl_object_put(env, subobj); + } + return result; +} + +static int lov_page_slice_fixup(struct lov_object *lov, + struct cl_object *stripe) +{ + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + struct cl_object *o; + + if (stripe == NULL) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + + cl_object_for_each(o, stripe) + o->co_slice_off += hdr->coh_page_bufsize; + + return cl_object_header(stripe)->coh_page_bufsize; +} + +static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, int index, + struct lov_layout_raid0 *r0) +{ + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lu_fid *ofid = <i->lti_fid; + struct cl_object *stripe; + struct lov_stripe_md_entry *lse = lov_lse(lov, index); + int result; + int psz; + int i; + + ENTRY; + + spin_lock_init(&r0->lo_sub_lock); + r0->lo_nr = lse->lsme_stripe_count; + LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + if (r0->lo_sub == NULL) + GOTO(out, result = -ENOMEM); + + psz = 0; + result = 0; + memset(subconf, 0, sizeof(*subconf)); + + /* + * Create stripe cl_objects. 
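+ * Each stripe is looked up as its own sub-object via the FID derived
+ * from the stripe's OST object id; -EAGAIN from lov_init_sub() means
+ * the stripe is still owned by a stale layout and the lookup is retried.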
+ */ + for (i = 0; i < r0->lo_nr; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lse->lsme_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, oinfo->loi_ost_idx); + if (result != 0) + GOTO(out, result); + + if (dev->ld_target[ost_idx] == NULL) { + CERROR("%s: OST %04x is not initialized\n", + lov2obd(dev->ld_lov)->obd_name, ost_idx); + GOTO(out, result = -EIO); + } + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (IS_ERR(stripe)) + GOTO(out, result = PTR_ERR(stripe)); + + result = lov_init_sub(env, lov, stripe, r0, oinfo, + lov_comp_index(index, i)); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + continue; + } + + if (result == 0) { + int sz = lov_page_slice_fixup(lov, stripe); + LASSERT(ergo(psz > 0, psz == sz)); + psz = sz; + } + } + if (result == 0) + result = psz; +out: + RETURN(result); +} + +static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + unsigned int entry_count; + unsigned int psz = 0; + int result = 0; + int i; + + ENTRY; + + LASSERT(lsm->lsm_entry_count > 0); + LASSERT(lov->lo_lsm == NULL); + lov->lo_lsm = lsm_addref(lsm); + lov->lo_layout_invalid = true; + + entry_count = lsm->lsm_entry_count; + comp->lo_entry_count = entry_count; + + OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries)); + if (comp->lo_entries == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < entry_count; i++) { + struct lov_layout_entry *le = &comp->lo_entries[i]; + + le->lle_extent = lsm->lsm_entries[i]->lsme_extent; + /** + * If the component has not been init-ed on MDS side, for + * PFL layout, we'd know that the components beyond this one + * will be dynamically init-ed later on file write/trunc ops. + */ + if (!lsm_entry_inited(lsm, i)) + continue; + + result = lov_init_raid0(env, dev, lov, i, &le->lle_raid0); + if (result < 0) + break; + + LASSERT(ergo(psz > 0, psz == result)); + psz = result; + } + if (psz > 0) + cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; + + return result > 0 ? 
0 : result; +} + +static int lov_init_released(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + LASSERT(lsm != NULL); + LASSERT(lsm->lsm_is_released); + LASSERT(lov->lo_lsm == NULL); + + lov->lo_lsm = lsm_addref(lsm); + return 0; +} + +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); + + lov_layout_wait(env, lov); + return 0; +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + struct lu_site_bkt_data *bkt; + wait_queue_entry_t *waiter; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_fini() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). */ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(&bkt->lsb_marche_funebre, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0) +{ + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. 
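+ * lov_subobject_kill() below waits until the sub-object is actually
+ * destroyed, so a stripe object never outlives its parent's layout.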
+ */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + +static int lov_delete_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_entry *entry; + struct lov_layout_composite *comp = &state->composite; + + ENTRY; + + dump_lsm(D_INODE, lov->lo_lsm); + + lov_layout_wait(env, lov); + if (comp->lo_entries) + lov_foreach_layout_entry(lov, entry) + lov_delete_raid0(env, lov, &entry->lle_raid0); + + RETURN(0); +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); +} + +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_raid0 *r0) +{ + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + r0->lo_sub = NULL; + } +} + +static void lov_fini_composite(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_composite *comp = &state->composite; + ENTRY; + + if (comp->lo_entries != NULL) { + struct lov_layout_entry *entry; + + lov_foreach_layout_entry(lov, entry) + lov_fini_raid0(env, &entry->lle_raid0); + + OBD_FREE(comp->lo_entries, + comp->lo_entry_count * sizeof(*comp->lo_entries)); + comp->lo_entries = NULL; + } + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + + EXIT; +} + +static void lov_fini_released(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + ENTRY; + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + EXIT; +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid); + return 0; +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, struct lov_layout_raid0 *r0) +{ + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_print_composite(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n", + lsm->lsm_entry_count, + lov->lo_layout_invalid ? "invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + + (*p)(env, cookie, DEXT ": { 0x%08X, %u, %u, %#x, %u, %u }\n", + PEXT(&lse->lsme_extent), lse->lsme_magic, + lse->lsme_id, lse->lsme_layout_gen, lse->lsme_flags, + lse->lsme_stripe_count, lse->lsme_stripe_size); + lov_print_raid0(env, cookie, p, lov_r0(lov, i)); + } + + return 0; +} + +static int lov_print_released(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_stripe_md *lsm = lov->lo_lsm; + + (*p)(env, cookie, + "released: %s, lsm{%p 0x%08X %d %u}:\n", + lov->lo_layout_invalid ? 
"invalid" : "valid", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_layout_gen); + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. + */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_raid0 *r0) + +{ + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) + return 0; + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + } + + return result; +} + +static int lov_attr_get_composite(const struct lu_env *env, + struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *entry; + int result = 0; + int index = 0; + + ENTRY; + + attr->cat_size = 0; + attr->cat_blocks = 0; + lov_foreach_layout_entry(lov, entry) { + struct lov_layout_raid0 *r0 = &entry->lle_raid0; + struct cl_attr *lov_attr = &r0->lo_attr; + + /* PFL: This component has not been init-ed. 
*/ + if (!lsm_entry_inited(lov->lo_lsm, index)) + break; + + result = lov_attr_get_raid0(env, lov, index, r0); + if (result != 0) + break; + + index++; + + /* merge results */ + attr->cat_blocks += lov_attr->cat_blocks; + if (attr->cat_size < lov_attr->cat_size) + attr->cat_size = lov_attr->cat_size; + if (attr->cat_kms < lov_attr->cat_kms) + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + RETURN(result); +} + +const static struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty, + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, + .llo_getattr = lov_attr_get_empty, + }, + [LLT_COMP] = { + .llo_init = lov_init_composite, + .llo_delete = lov_delete_composite, + .llo_fini = lov_fini_composite, + .llo_print = lov_print_composite, + .llo_page_init = lov_page_init_composite, + .llo_lock_init = lov_lock_init_composite, + .llo_io_init = lov_io_init_composite, + .llo_getattr = lov_attr_get_composite, + }, +}; + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +/** + * Return lov_layout_type associated with a given lsm + */ +static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return LLT_EMPTY; + + if (lsm->lsm_is_released) + return LLT_RELEASED; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || + lsm->lsm_magic == LOV_MAGIC_V3 || + lsm->lsm_magic == LOV_MAGIC_COMP_V1) + return LLT_COMP; + + return LLT_EMPTY; +} + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To take share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release share lov(%p) owner %p/%p\n", + lov, lov->lo_owner, current); + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) 
\ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; + CDEBUG(D_INODE, "Took exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + CDEBUG(D_INODE, "To release exclusive lov(%p) owner %p\n", + lov, lov->lo_owner); + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + struct l_wait_info lwi = { 0 }; + ENTRY; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + l_wait_event(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0, &lwi); + } + RETURN(0); +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf) +{ + enum lov_layout_type llt = lov_type(lsm); + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + struct lov_device *lov_dev = lov_object_dev(lov); + struct lu_env *env; + __u16 refcheck; + int rc; + ENTRY; + + LASSERT(lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + LASSERT(llt < ARRAY_SIZE(lov_dispatch)); + + CDEBUG(D_INODE, DFID" from %s to %s\n", + PFID(lu_object_fid(lov2lu(lov))), + llt2str(lov->lo_type), llt2str(llt)); + + old_ops = &lov_dispatch[lov->lo_type]; + new_ops = &lov_dispatch[llt]; + + rc = cl_object_prune(env, &lov->lo_cl); + if (rc != 0) + GOTO(out, rc); + + rc = old_ops->llo_delete(env, lov, &lov->u); + if (rc != 0) + GOTO(out, rc); + + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + + CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", + PFID(lu_object_fid(lov2lu(lov))), lov, llt); + + lov->lo_type = LLT_EMPTY; + + /* page bufsize fixup */ + cl_object_header(&lov->lo_cl)->coh_page_bufsize -= + lov_page_slice_fixup(lov, NULL); + + rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); + if (rc != 0) { + struct obd_device *obd = lov2obd(lov_dev->ld_lov); + + CERROR("%s: cannot apply new layout on "DFID" : rc = %d\n", + obd->obd_name, PFID(lu_object_fid(lov2lu(lov))), rc); + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + GOTO(out, rc); + } + + lov->lo_type = llt; + +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/***************************************************************************** + * + * Lov object operations. 
+ * + */ +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_object *lov = lu2lov(obj); + struct lov_device *dev = lov_object_dev(lov); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + struct lov_stripe_md *lsm = NULL; + int rc; + ENTRY; + + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + lov->lo_type = LLT_EMPTY; + if (cconf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(dev->ld_lov, + cconf->u.coc_layout.lb_buf, + cconf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + + dump_lsm(D_INODE, lsm); + } + + /* no locking is necessary, as object is being created */ + lov->lo_type = lov_type(lsm); + ops = &lov_dispatch[lov->lo_type]; + rc = ops->llo_init(env, dev, lov, lsm, cconf, set); + if (rc != 0) + GOTO(out_lsm, rc); + +out_lsm: + lov_lsm_put(lsm); + + RETURN(rc); +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + ENTRY; + + if (conf->coc_opc == OBJECT_CONF_SET && + conf->u.coc_layout.lb_buf != NULL) { + lsm = lov_unpackmd(lov_object_dev(lov)->ld_lov, + conf->u.coc_layout.lb_buf, + conf->u.coc_layout.lb_len); + if (IS_ERR(lsm)) + RETURN(PTR_ERR(lsm)); + dump_lsm(D_INODE, lsm); + } + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + lov->lo_layout_invalid = true; + GOTO(out, result = 0); + } + + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (lov->lo_layout_invalid && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + GOTO(out, result); + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if ((lsm == NULL && lov->lo_lsm == NULL) || + ((lsm != NULL && lov->lo_lsm != NULL) && + (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && + (lov->lo_lsm->lsm_entries[0]->lsme_pattern == + lsm->lsm_entries[0]->lsme_pattern))) { + /* same version of layout */ + lov->lo_layout_invalid = false; + GOTO(out, result = 0); + } + + /* will change layout - check if there still exists active IO. 
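+ * A layout change must not race with in-flight IO: if any is found, the + * layout is flagged invalid and -EBUSY is returned to the caller.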
*/ + if (atomic_read(&lov->lo_active_ios) > 0) { + lov->lo_layout_invalid = true; + GOTO(out, result = -EBUSY); + } + + result = lov_layout_change(env, lov, lsm, conf); + lov->lo_layout_invalid = result != 0; + EXIT; + +out: + lov_conf_unlock(lov); + lov_lsm_put(lsm); + CDEBUG(D_INODE, DFID" lo_layout_invalid=%d\n", + PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid); + RETURN(result); +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); + EXIT; +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); + EXIT; +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); +} + +int lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_page_init, env, obj, page, + index); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + + CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n", + PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, + io->ci_ignore_layout, io->ci_verify_layout); + + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + !io->ci_ignore_layout, env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. + */ + return 0; +} + +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + /* No need to lock because we've taken one refcount of layout. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock, + io); +} + +/** + * We calculate on which OST the mapping will end. If the length of mapping + * is greater than (stripe_size * stripe_count) then the last_stripe will + * will be one just before start_stripe. Else we check if the mapping + * intersects each OST and find last_stripe. 
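+ * For example, with lsme_stripe_count = 3 and start_stripe = 1, a mapping + * longer than 3 * stripe_size ends on stripe 0, i.e. the stripe just before + * start_stripe.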
+ * This function returns the last_stripe and also sets the stripe_count + * over which the mapping is spread + * + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [in] starting stripe of the mapping + * \param stripe_count [out] the number of stripes across which to map is + * returned + * + * \retval last_stripe return the last stripe of the mapping + */ +static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, int index, + struct lu_extent *ext, + int start_stripe, int *stripe_count) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + int last_stripe; + u64 obd_start; + u64 obd_end; + int i, j; + + if (ext->e_end - ext->e_start > + lsme->lsme_stripe_size * lsme->lsme_stripe_count) { + last_stripe = (start_stripe < 1 ? lsme->lsme_stripe_count - 1 : + start_stripe - 1); + *stripe_count = lsme->lsme_stripe_count; + } else { + for (j = 0, i = start_stripe; j < lsme->lsme_stripe_count; + i = (i + 1) % lsme->lsme_stripe_count, j++) { + if ((lov_stripe_intersects(lsm, index, i, ext, + &obd_start, &obd_end)) == 0) + break; + } + *stripe_count = j; + last_stripe = (start_stripe + j - 1) % lsme->lsme_stripe_count; + } + + return last_stripe; +} + +/** + * Set fe_device and copy extents from local buffer into main return buffer. + * + * \param fiemap [out] fiemap to hold all extents + * \param lcl_fm_ext [in] array of fiemap extents get from OSC layer + * \param ost_index [in] OST index to be written into the fm_device + * field for each extent + * \param ext_count [in] number of extents to be copied + * \param current_extent [in] where to start copying in the extent array + */ +static void fiemap_prepare_and_copy_exts(struct fiemap *fiemap, + struct fiemap_extent *lcl_fm_ext, + int ost_index, unsigned int ext_count, + int current_extent) +{ + char *to; + unsigned int ext; + + for (ext = 0; ext < ext_count; ext++) { + lcl_fm_ext[ext].fe_device = ost_index; + lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET; + } + + /* Copy fm_extent's from fm_local to return buffer */ + to = (char *)fiemap + fiemap_count_to_size(current_extent); + memcpy(to, lcl_fm_ext, ext_count * sizeof(struct fiemap_extent)); +} + +#define FIEMAP_BUFFER_SIZE 4096 + +/** + * Non-zero fe_logical indicates that this is a continuation FIEMAP + * call. The local end offset and the device are sent in the first + * fm_extent. This function calculates the stripe number from the index. + * This function returns a stripe_no on which mapping is to be restarted. + * + * This function returns fm_end_offset which is the in-OST offset at which + * mapping should be restarted. If fm_end_offset=0 is returned then caller + * will re-calculate proper offset in next stripe. + * Note that the first extent is passed to lov_get_info via the value field. 
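+ * For example, if the first extent carries fe_device = 3 and that stripe is + * not yet fully mapped, mapping resumes there at the in-OST offset given in + * fe_logical; otherwise it moves on to the next stripe.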
+ * + * \param fiemap [in] fiemap request header + * \param lsm [in] striping information for the file + * \param index [in] stripe component index + * \param ext [in] logical extent of mapping + * \param start_stripe [out] starting stripe will be returned in this + */ +static u64 fiemap_calc_fm_end_offset(struct fiemap *fiemap, + struct lov_stripe_md *lsm, + int index, struct lu_extent *ext, + int *start_stripe) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + u64 local_end = fiemap->fm_extents[0].fe_logical; + u64 lun_start; + u64 lun_end; + u64 fm_end_offset; + int stripe_no = -1; + int i; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + /* Find out stripe_no from ost_index saved in the fe_device */ + for (i = 0; i < lsme->lsme_stripe_count; i++) { + struct lov_oinfo *oinfo = lsme->lsme_oinfo[i]; + + if (lov_oinfo_is_dummy(oinfo)) + continue; + + if (oinfo->loi_ost_idx == fiemap->fm_extents[0].fe_device) { + stripe_no = i; + break; + } + } + + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if (lov_stripe_intersects(lsm, index, stripe_no, ext, + &lun_start, &lun_end) != 0 && + local_end < lun_end) { + fm_end_offset = local_end; + *start_stripe = stripe_no; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. */ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsme->lsme_stripe_count; + } + + return fm_end_offset; +} + +struct fiemap_state { + struct fiemap *fs_fm; + struct lu_extent fs_ext; + u64 fs_length; + u64 fs_end_offset; + int fs_cur_extent; + int fs_cnt_need; + int fs_start_stripe; + int fs_last_stripe; + bool fs_device_done; + bool fs_finish_stripe; + bool fs_enough; +}; + +int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, + struct lov_stripe_md *lsm, struct fiemap *fiemap, + size_t *buflen, struct ll_fiemap_info_key *fmkey, + int index, int stripeno, struct fiemap_state *fs) +{ + struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index]; + struct cl_object *subobj; + struct lov_obd *lov = lu2lov_dev(obj->co_lu.lo_dev)->ld_lov; + struct fiemap_extent *fm_ext = &fs->fs_fm->fm_extents[0]; + u64 req_fm_len; /* Stores length of required mapping */ + u64 len_mapped_single_call; + u64 lun_start; + u64 lun_end; + u64 obd_object_end; + unsigned int ext_count; + /* EOF for object */ + bool ost_eof = false; + /* done with required mapping for this OST? 
*/ + bool ost_done = false; + int ost_index; + int rc = 0; + + fs->fs_device_done = false; + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, index, stripeno, &fs->fs_ext, + &lun_start, &obd_object_end)) == 0) + return 0; + + if (lov_oinfo_is_dummy(lsme->lsme_oinfo[stripeno])) + return -EIO; + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then lun_start needs to be set to + * end_offset */ + if (fs->fs_end_offset != 0 && stripeno == fs->fs_start_stripe) + lun_start = fs->fs_end_offset; + lun_end = lov_size_to_stripe(lsm, index, fs->fs_ext.e_end, stripeno); + if (lun_start == lun_end) + return 0; + + req_fm_len = obd_object_end - lun_start; + fs->fs_fm->fm_length = 0; + len_mapped_single_call = 0; + + /* find lobsub object */ + subobj = lov_find_subobj(env, cl2lov(obj), lsm, + lov_comp_index(index, stripeno)); + if (IS_ERR(subobj)) + return PTR_ERR(subobj); + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + do { + if (fiemap->fm_extent_count > 0) { + /* Don't get too many extents. */ + if (fs->fs_cur_extent + fs->fs_cnt_need > + fiemap->fm_extent_count) + fs->fs_cnt_need = fiemap->fm_extent_count - + fs->fs_cur_extent; + } + + lun_start += len_mapped_single_call; + fs->fs_fm->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fs->fs_fm->fm_length; + /** + * If we've collected enough extent map, we'd request 1 more, + * to see whether we coincidentally finished all available + * extent map, so that FIEMAP_EXTENT_LAST would be set. + */ + fs->fs_fm->fm_extent_count = fs->fs_enough ? + 1 : fs->fs_cnt_need; + fs->fs_fm->fm_mapped_extents = 0; + fs->fs_fm->fm_flags = fiemap->fm_flags; + + ost_index = lsme->lsme_oinfo[stripeno]->loi_ost_idx; + + if (ost_index < 0 || ost_index >= lov->desc.ld_tgt_count) + GOTO(obj_put, rc = -EINVAL); + /* If OST is inactive, return extent with UNKNOWN flag. */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fs->fs_fm->fm_flags |= FIEMAP_EXTENT_LAST; + fs->fs_fm->fm_mapped_extents = 1; + + fm_ext[0].fe_logical = lun_start; + fm_ext[0].fe_length = obd_object_end - lun_start; + fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fs->fs_fm->fm_start = lun_start; + fs->fs_fm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fmkey->lfik_fiemap, fs->fs_fm, sizeof(*fs->fs_fm)); + *buflen = fiemap_count_to_size(fs->fs_fm->fm_extent_count); + + rc = cl_object_fiemap(env, subobj, fmkey, fs->fs_fm, buflen); + if (rc != 0) + GOTO(obj_put, rc); +inactive_tgt: + ext_count = fs->fs_fm->fm_mapped_extents; + if (ext_count == 0) { + ost_done = true; + fs->fs_device_done = true; + /* If last stripe has hold at the end, + * we need to return */ + if (stripeno == fs->fs_last_stripe) { + fiemap->fm_mapped_extents = 0; + fs->fs_finish_stripe = true; + GOTO(obj_put, rc); + } + break; + } else if (fs->fs_enough) { + /* + * We've collected enough extents and there are + * more extents after it. + */ + GOTO(obj_put, rc); + } + + /* If we just need num of extents, got to next device */ + if (fiemap->fm_extent_count == 0) { + fs->fs_cur_extent += ext_count; + break; + } + + /* prepare to copy retrived map extents */ + len_mapped_single_call = fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length - + lun_start; + + /* Have we finished mapping on this device? 
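+ * That is the case once the extents copied in this loop cover the whole + * range requested from this stripe.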
*/ + if (req_fm_len <= len_mapped_single_call) { + ost_done = true; + fs->fs_device_done = true; + } + + /* Clear the EXTENT_LAST flag which can be present on + * the last extent */ + if (fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST) + fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST; + if (lov_stripe_size(lsm, index, + fm_ext[ext_count - 1].fe_logical + + fm_ext[ext_count - 1].fe_length, + stripeno) >= fmkey->lfik_oa.o_size) { + ost_eof = true; + fs->fs_device_done = true; + } + + fiemap_prepare_and_copy_exts(fiemap, fm_ext, ost_index, + ext_count, fs->fs_cur_extent); + fs->fs_cur_extent += ext_count; + + /* Ran out of available extents? */ + if (fs->fs_cur_extent >= fiemap->fm_extent_count) + fs->fs_enough = true; + } while (!ost_done && !ost_eof); + + if (stripeno == fs->fs_last_stripe) + fs->fs_finish_stripe = true; +obj_put: + cl_object_put(env, subobj); + + return rc; +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + * + * \param env [in] lustre environment + * \param obj [in] file object + * \param fmkey [in] fiemap request header and other info + * \param fiemap [out] fiemap buffer holding retrived map extents + * \param buflen [in/out] max buffer length of @fiemap, when iterate + * each OST, it is used to limit max map needed + * \retval 0 success + * \retval < 0 error + */ +static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct lov_stripe_md_entry *lsme; + struct lov_stripe_md *lsm; + struct fiemap *fm_local = NULL; + loff_t whole_start; + loff_t whole_end; + int entry; + int start_entry; + int end_entry; + int cur_stripe = 0; + int stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + int rc = 0; + struct fiemap_state fs = { 0 }; + ENTRY; + + lsm = lov_lsm_addref(cl2lov(obj)); + if (lsm == NULL) + RETURN(-ENODATA); + + if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { + /** + * If the entry count > 1 or stripe_count > 1 and the + * application does not understand DEVICE_ORDER flag, + * it cannot interpret the extents correctly. + */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + GOTO(out_lsm, rc = -ENOTSUPP); + } + + if (lsm->lsm_is_released) { + if (fiemap->fm_start < fmkey->lfik_oa.o_size) { + /** + * released file, return a minimal FIEMAP if + * request fits in file-size. + */ + fiemap->fm_mapped_extents = 1; + fiemap->fm_extents[0].fe_logical = fiemap->fm_start; + if (fiemap->fm_start + fiemap->fm_length < + fmkey->lfik_oa.o_size) + fiemap->fm_extents[0].fe_length = + fiemap->fm_length; + else + fiemap->fm_extents[0].fe_length = + fmkey->lfik_oa.o_size - + fiemap->fm_start; + fiemap->fm_extents[0].fe_flags |= + FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_LAST; + } + GOTO(out_lsm, rc = 0); + } + + /* buffer_size is small to hold fm_extent_count of extents. */ + if (fiemap_count_to_size(fiemap->fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fiemap->fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) + GOTO(out_lsm, rc = -ENOMEM); + + /** + * Requested extent count exceeds the fiemap buffer size, shrink our + * ambition. 
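+ * The requested count is clamped so that no more extents are asked for + * than the caller's buffer can return.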
+ */ + if (fiemap_count_to_size(fiemap->fm_extent_count) > *buflen) + fiemap->fm_extent_count = fiemap_size_to_count(*buflen); + if (fiemap->fm_extent_count == 0) + fs.fs_cnt_need = 0; + + fs.fs_enough = false; + fs.fs_cur_extent = 0; + fs.fs_fm = fm_local; + fs.fs_cnt_need = fiemap_size_to_count(buffer_size); + + whole_start = fiemap->fm_start; + /* whole_start is beyond the end of the file */ + if (whole_start > fmkey->lfik_oa.o_size) + GOTO(out_fm_local, rc = -EINVAL); + whole_end = (fiemap->fm_length == OBD_OBJECT_EOF) ? + fmkey->lfik_oa.o_size : + whole_start + fiemap->fm_length - 1; + /** + * If fiemap->fm_length != OBD_OBJECT_EOF but whole_end exceeds file + * size + */ + if (whole_end > fmkey->lfik_oa.o_size) + whole_end = fmkey->lfik_oa.o_size; + + start_entry = lov_lsm_entry(lsm, whole_start); + end_entry = lov_lsm_entry(lsm, whole_end); + if (end_entry == -1) + end_entry = lsm->lsm_entry_count - 1; + + if (start_entry == -1 || end_entry == -1) + GOTO(out_fm_local, rc = -EINVAL); + + for (entry = start_entry; entry <= end_entry; entry++) { + lsme = lsm->lsm_entries[entry]; + + if (!lsme_inited(lsme)) + break; + + if (entry == start_entry) + fs.fs_ext.e_start = whole_start; + else + fs.fs_ext.e_start = lsme->lsme_extent.e_start; + if (entry == end_entry) + fs.fs_ext.e_end = whole_end; + else + fs.fs_ext.e_end = lsme->lsme_extent.e_end - 1; + fs.fs_length = fs.fs_ext.e_end - fs.fs_ext.e_start + 1; + + /* Calculate start stripe, last stripe and length of mapping */ + fs.fs_start_stripe = lov_stripe_number(lsm, entry, + fs.fs_ext.e_start); + fs.fs_last_stripe = fiemap_calc_last_stripe(lsm, entry, + &fs.fs_ext, fs.fs_start_stripe, + &stripe_count); + fs.fs_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, entry, + &fs.fs_ext, &fs.fs_start_stripe); + /* Check each stripe */ + for (cur_stripe = fs.fs_start_stripe; stripe_count > 0; + --stripe_count, + cur_stripe = (cur_stripe + 1) % lsme->lsme_stripe_count) { + rc = fiemap_for_stripe(env, obj, lsm, fiemap, buflen, + fmkey, entry, cur_stripe, &fs); + if (rc < 0) + GOTO(out_fm_local, rc); + if (fs.fs_enough) + GOTO(finish, rc); + if (fs.fs_finish_stripe) + break; + } /* for each stripe */ + } /* for covering layout component */ + /* + * We've traversed all components, set @entry to the last component + * entry, it's for the last stripe check. + */ + entry--; +finish: + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_entry_count > 1 || + (lsm->lsm_entry_count == 1 && + lsm->lsm_entries[0]->lsme_stripe_count > 1)) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (fiemap->fm_extent_count == 0) + goto skip_last_device_calc; + + /* Check if we have reached the last stripe and whether mapping for that + * stripe is done. 
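+ * If so, the final extent copied out is flagged FIEMAP_EXTENT_LAST so that + * userspace knows the mapping is complete.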
*/ + if ((cur_stripe == fs.fs_last_stripe) && fs.fs_device_done) + fiemap->fm_extents[fs.fs_cur_extent - 1].fe_flags |= + FIEMAP_EXTENT_LAST; +skip_last_device_calc: + fiemap->fm_mapped_extents = fs.fs_cur_extent; +out_fm_local: + OBD_FREE_LARGE(fm_local, buffer_size); + +out_lsm: + lov_lsm_put(lsm); + return rc; +} + +static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum, size_t size) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm; + int rc = 0; + ENTRY; + + lsm = lov_lsm_addref(lov); + if (lsm == NULL) + RETURN(-ENODATA); + + rc = lov_getstripe(env, cl2lov(obj), lsm, lum, size); + lov_lsm_put(lsm); + RETURN(rc); +} + +static int lov_object_layout_get(const struct lu_env *env, + struct cl_object *obj, + struct cl_layout *cl) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm = lov_lsm_addref(lov); + struct lu_buf *buf = &cl->cl_buf; + ssize_t rc; + ENTRY; + + if (lsm == NULL) { + cl->cl_size = 0; + cl->cl_layout_gen = CL_LAYOUT_GEN_EMPTY; + + RETURN(0); + } + + cl->cl_size = lov_comp_md_size(lsm); + cl->cl_layout_gen = lsm->lsm_layout_gen; + cl->cl_is_composite = lsm_is_composite(lsm->lsm_magic); + + rc = lov_lsm_pack(lsm, buf->lb_buf, buf->lb_len); + lov_lsm_put(lsm); + + RETURN(rc < 0 ? rc : 0); +} + +static loff_t lov_object_maxbytes(struct cl_object *obj) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm = lov_lsm_addref(lov); + loff_t maxbytes; + + if (lsm == NULL) + return LLONG_MAX; + + maxbytes = lsm->lsm_maxbytes; + + lov_lsm_put(lsm); + + return maxbytes; +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_update = lov_attr_update, + .coo_conf_set = lov_conf_set, + .coo_getstripe = lov_object_getstripe, + .coo_layout_get = lov_object_layout_get, + .coo_maxbytes = lov_object_maxbytes, + .coo_fiemap = lov_object_fiemap, +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. 
+ */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + lov->lo_layout_invalid, current); + } + lov_conf_thaw(lov); + return lsm; +} + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + ENTRY; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_COMP: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_entry_count; i++) { + struct lov_stripe_md_entry *lse = + lsm->lsm_entries[i]; + int j; + + if (!lsme_inited(lse)) + break; + + for (j = 0; j < lse->lsme_stripe_count; j++) { + struct lov_oinfo *loi = + lse->lsme_oinfo[j]; + + if (lov_oinfo_is_dummy(loi)) + continue; + + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + } + case LLT_RELEASED: + case LLT_EMPTY: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + RETURN(rc); +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c new file mode 100644 index 0000000000000..3ff0a38a7e263 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -0,0 +1,288 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include + +#include "lov_internal.h" + +static loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + + LASSERT(index < lsm->lsm_entry_count); + + return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; +} + +/* compute object size given "stripeno" and the ost size */ +u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + unsigned long stripe_size; + loff_t swidth; + loff_t lov_size; + ENTRY; + + if (ost_size == 0) + RETURN(0); + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + RETURN(lov_size); +} + +/** + * Compute file level page index by stripe level page offset + */ +pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, + pgoff_t stripe_index, int stripe) +{ + loff_t offset; + + offset = lov_stripe_size(lsm, index, + (stripe_index << PAGE_SHIFT) + 1, + stripe); + return offset >> PAGE_SHIFT; +} + +/* we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. + * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. 
this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. */ +int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, + int stripeno, loff_t *obdoff) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t this_stripe; + loff_t swidth; + int ret = 0; + + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(lov_off, swidth); + + this_stripe = (loff_t)stripeno * ssize; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; +} + +/* Given a whole-file size and a stripe number, give the file size which + * corresponds to the individual object of that stripe. + * + * This behaves basically in the same was as lov_stripe_offset, except that + * file sizes falling before the beginning of a stripe are clamped to the end + * of the previous stripe, not the beginning of the next: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * if clamped to stripe 2 becomes: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + */ +loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t this_stripe; + loff_t swidth; + + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; + + swidth = stripe_width(lsm, index); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(file_size, swidth); + + this_stripe = (loff_t)stripeno * ssize; + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); +} + +/* given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. 
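+ * obd_start and obd_end are filled with the extent boundaries translated + * (and clamped) into offsets within that stripe's OST object.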
*/ +int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, + struct lu_extent *ext, u64 *obd_start, u64 *obd_end) +{ + struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; + u64 start, end; + int start_side, end_side; + + if (!lu_extent_is_overlapped(ext, &entry->lsme_extent)) + return 0; + + start = max_t(__u64, ext->e_start, entry->lsme_extent.e_start); + end = min_t(__u64, ext->e_end, entry->lsme_extent.e_end); + if (end != OBD_OBJECT_EOF) + end--; + + start_side = lov_stripe_offset(lsm, index, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, index, end, stripeno, obd_end); + + CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); + + /* this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) */ + if (end_side != 0) + (*obd_end)--; + + return 1; +} + +/* compute which stripe number "lov_off" will be written into */ +int lov_stripe_number(struct lov_stripe_md *lsm, int index, loff_t lov_off) +{ + unsigned long ssize = lsm->lsm_entries[index]->lsme_stripe_size; + loff_t stripe_off; + loff_t swidth; + + swidth = stripe_width(lsm, index); + + stripe_off = lov_do_div64(lov_off, swidth); + + /* Puts stripe_off/ssize result into stripe_off */ + lov_do_div64(stripe_off, ssize); + + return stripe_off; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c new file mode 100644 index 0000000000000..940888afffdac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -0,0 +1,461 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include + +#include +#include +#include +#include +#include + +#include "lov_cl_internal.h" +#include "lov_internal.h" + +void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + return; + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) +{ + lov_dump_lmm_common(level, lmm); + CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm(int level, void *lmm) +{ + int magic; + + magic = le32_to_cpu(((struct lov_mds_md *)lmm)->lmm_magic); + switch (magic) { + case LOV_MAGIC_V1: + lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); + break; + default: + CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); + lov_dump_lmm_common(level, lmm); + break; + } +} + +/** + * Pack LOV striping metadata for disk storage format (in little + * endian byte order). + * + * This follows the getxattr() conventions. If \a buf_size is zero + * then return the size needed. If \a buf_size is too small then + * return -ERANGE. Otherwise return the size of the result. 
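+ * Callers typically probe with \a buf_size == 0 to learn the required size, + * allocate a buffer, and then call again to do the actual packing.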
+ */ +ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_mds_md_v1 *lmmv1 = buf; + struct lov_mds_md_v3 *lmmv3 = buf; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int i; + ENTRY; + + lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, + lsm->lsm_magic); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + /* lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32( + lsm->lsm_entries[0]->lsme_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16( + lsm->lsm_entries[0]->lsme_stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_entries[0]->lsme_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + + if (lsm->lsm_magic == LOV_MAGIC_V3) { + CLASSERT(sizeof(lsm->lsm_entries[0]->lsme_pool_name) == + sizeof(lmmv3->lmm_pool_name)); + strlcpy(lmmv3->lmm_pool_name, + lsm->lsm_entries[0]->lsme_pool_name, + sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + if (lsm->lsm_is_released) + RETURN(lmm_size); + + for (i = 0; i < lsm->lsm_entries[0]->lsme_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_entries[0]->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + RETURN(lmm_size); +} + +ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, + size_t buf_size) +{ + struct lov_comp_md_v1 *lcmv1 = buf; + struct lov_comp_md_entry_v1 *lcme; + struct lov_ost_data_v1 *lmm_objects; + size_t lmm_size; + unsigned int entry; + unsigned int offset; + unsigned int size; + unsigned int i; + ENTRY; + + if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) + return lov_lsm_pack_v1v3(lsm, buf, buf_size); + + lmm_size = lov_comp_md_size(lsm); + if (buf_size == 0) + RETURN(lmm_size); + + if (buf_size < lmm_size) + RETURN(-ERANGE); + + lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); + lcmv1->lcm_size = cpu_to_le32(lmm_size); + lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); + lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); + + offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; + + for (entry = 0; entry < lsm->lsm_entry_count; entry++) { + struct lov_stripe_md_entry *lsme; + struct lov_mds_md *lmm; + __u16 stripe_count; + + lsme = lsm->lsm_entries[entry]; + lcme = &lcmv1->lcm_entries[entry]; + + lcme->lcme_id = cpu_to_le32(lsme->lsme_id); + lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); + lcme->lcme_extent.e_start = + cpu_to_le64(lsme->lsme_extent.e_start); + lcme->lcme_extent.e_end = + cpu_to_le64(lsme->lsme_extent.e_end); + lcme->lcme_offset = cpu_to_le32(offset); + + lmm = (struct lov_mds_md *)((char *)lcmv1 + offset); + lmm->lmm_magic = cpu_to_le32(lsme->lsme_magic); + /* lmm->lmm_oi not set */ + lmm->lmm_pattern = cpu_to_le32(lsme->lsme_pattern); + lmm->lmm_stripe_size = cpu_to_le32(lsme->lsme_stripe_size); + lmm->lmm_stripe_count = cpu_to_le16(lsme->lsme_stripe_count); + lmm->lmm_layout_gen = cpu_to_le16(lsme->lsme_layout_gen); + + if (lsme->lsme_magic == LOV_MAGIC_V3) { + struct lov_mds_md_v3 *lmmv3 = + (struct lov_mds_md_v3 *)lmm; + + strlcpy(lmmv3->lmm_pool_name, lsme->lsme_pool_name, + 
sizeof(lmmv3->lmm_pool_name)); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = + ((struct lov_mds_md_v1 *)lmm)->lmm_objects; + } + + if (lsme_inited(lsme) && + !(lsme->lsme_pattern & LOV_PATTERN_F_RELEASED)) + stripe_count = lsme->lsme_stripe_count; + else + stripe_count = 0; + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsme->lsme_oinfo[i]; + + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = + cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = + cpu_to_le32(loi->loi_ost_idx); + } + + size = lov_mds_md_size(stripe_count, lsme->lsme_magic); + lcme->lcme_size = cpu_to_le32(size); + offset += size; + } /* for each layout component */ + + RETURN(lmm_size); +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count = lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* stripe count is based on whether ldiskfs can handle + * larger EA sizes */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_max_stripe_count( + lov->lov_ocd.ocd_max_easize, magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + refc = atomic_dec_return(&lsm->lsm_refc); + LASSERT(refc >= 0); + if (refc == 0) + lsm_free(lsm); + + return refc; +} + +/* Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, + size_t buf_size) +{ + const struct lsm_operations *op; + struct lov_stripe_md *lsm; + u32 magic; + ENTRY; + + if (buf_size < sizeof(magic)) + RETURN(ERR_PTR(-EINVAL)); + + magic = le32_to_cpu(*(u32 *)buf); + op = lsm_op_find(magic); + if (op == NULL) + RETURN(ERR_PTR(-EINVAL)); + + lsm = op->lsm_unpackmd(lov, buf, buf_size); + + RETURN(lsm); +} + +/* Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + * + * If @size > 0, User specified limited buffer size, usually the buffer is from + * ll_lov_setstripe(), and the buffer can only hold basic layout template info. 
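+ * In that case only a plain V1/V3 view of the layout is copied back; for a + * composite file this is a single selected component.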
+ */ +int lov_getstripe(const struct lu_env *env, struct lov_object *obj, + struct lov_stripe_md *lsm, struct lov_user_md __user *lump, + size_t size) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_mds_md *lmmk, *lmm; + struct lov_user_md_v1 lum; + size_t lmmk_size; + ssize_t lmm_size, lum_size = 0; + static bool printed; + int rc = 0; + ENTRY; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && + lsm->lsm_magic != LOV_MAGIC_COMP_V1) { + CERROR("bad LSM MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lsm->lsm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + GOTO(out, rc = -EIO); + } + + if (!printed) { + LCONSOLE_WARN("%s: using old ioctl(LL_IOC_LOV_GETSTRIPE) on " + DFID", use llapi_layout_get_by_path()\n", + current->comm, + PFID(&obj->lo_cl.co_lu.lo_header->loh_fid)); + printed = true; + } + + lmmk_size = lov_comp_md_size(lsm); + + OBD_ALLOC_LARGE(lmmk, lmmk_size); + if (lmmk == NULL) + GOTO(out, rc = -ENOMEM); + + lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); + if (lmm_size < 0) + GOTO(out_free, rc = lmm_size); + + if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { + if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || + lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data *)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } else if (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmmk); + } + } + + /* Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. */ + if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) + GOTO(out_free, rc = -EFAULT); + + if (lum.lmm_magic == LOV_USER_MAGIC_V1 || + lum.lmm_magic == LOV_USER_MAGIC_V3) + lum_size = lov_user_md_size(lum.lmm_stripe_count, + lum.lmm_magic); + + if (lum_size != 0) { + struct lov_mds_md *comp_md = lmmk; + + /* Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. */ + if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *comp_v1; + struct cl_object *cl_obj; + struct cl_attr attr; + int i; + + attr.cat_size = 0; + cl_obj = cl_object_top(&obj->lo_cl); + cl_object_attr_lock(cl_obj); + cl_object_attr_get(env, cl_obj, &attr); + cl_object_attr_unlock(cl_obj); + + /* return the last instantiated component if file size + * is non-zero, otherwise, return the last component.*/ + comp_v1 = (struct lov_comp_md_v1 *)lmmk; + i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; + for (; i < comp_v1->lcm_entry_count; i++) { + if (!(comp_v1->lcm_entries[i].lcme_flags & + LCME_FL_INIT)) + break; + } + if (i > 0) + i--; + comp_md = (struct lov_mds_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + } + + lmm = comp_md; + lmm_size = lum_size; + } else { + lmm = lmmk; + lmm_size = lmmk_size; + } + /** + * User specified limited buffer size, usually the buffer is + * from ll_lov_setstripe(), and the buffer can only hold basic + * layout template info. 
+ */ + if (size == 0 || size > lmm_size) + size = lmm_size; + if (copy_to_user(lump, lmm, size)) + GOTO(out_free, rc = -EFAULT); + +out_free: + OBD_FREE_LARGE(lmmk, lmmk_size); +out: + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c new file mode 100644 index 0000000000000..869c0b8478760 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. 
+ * + */ + +static int lov_comp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, + LUSTRE_LOV_NAME"-page@%p, comp index: %x\n", + lp, lp->lps_index); +} + +static const struct cl_page_operations lov_comp_page_ops = { + .cpo_print = lov_comp_page_print +}; + +int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_layout_raid0 *r0; + loff_t offset; + loff_t suboff; + int entry; + int stripe; + int rc; + ENTRY; + + offset = cl_offset(obj, index); + entry = lov_lsm_entry(loo->lo_lsm, offset); + if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { + /* non-existing layout component */ + lov_page_init_empty(env, obj, page, index); + RETURN(0); + } + + r0 = lov_r0(loo, entry); + stripe = lov_stripe_number(loo->lo_lsm, entry, offset); + LASSERT(stripe < r0->lo_nr); + rc = lov_stripe_offset(loo->lo_lsm, entry, offset, stripe, &suboff); + LASSERT(rc == 0); + + lpg->lps_index = lov_comp_index(entry, stripe); + cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops); + + sub = lov_sub_get(env, lio, lpg->lps_index); + if (IS_ERR(sub)) + RETURN(PTR_ERR(sub)); + + subobj = lovsub2cl(r0->lo_sub[stripe]); + list_for_each_entry(o, &subobj->co_lu.lo_header->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + rc = o->co_ops->coo_page_init(sub->sub_env, o, page, + cl_index(subobj, suboff)); + if (rc != 0) + break; + } + } + + RETURN(rc); +} + +static int lov_empty_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); +} + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_print = lov_empty_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + ENTRY; + + cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops); + addr = kmap(page->cp_vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(page->cp_vmpage); + cl_page_export(env, page, 1); + RETURN(0); +} + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c new file mode 100644 index 0000000000000..7a2b9ac32e92b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -0,0 +1,619 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +static void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(hlist_unhashed(&pool->pool_hash)); + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lov_ost_pool_free(&(pool->pool_obds)); + OBD_FREE_PTR(pool); + EXIT; + } +} + +static void lov_pool_putref_locked(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + LASSERT(atomic_read(&pool->pool_refcount) > 1); + + atomic_dec(&pool->pool_refcount); +} + +/* + * hash function using a Rotating Hash algorithm + * Knuth, D. The Art of Computer Programming, + * Volume 3: Sorting and Searching, + * Chapter 6.4. + * Addison Wesley, 1973 + */ +static __u32 pool_hashfn(struct cfs_hash *hash_body, const void *key, + unsigned mask) +{ + int i; + __u32 result; + char *poolname; + + result = 0; + poolname = (char *)key; + for (i = 0; i < LOV_MAXPOOLNAME; i++) { + if (poolname[i] == '\0') + break; + result = (result << 4)^(result >> 28) ^ poolname[i]; + } + return (result % mask); +} + +static void *pool_key(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return (pool->pool_name); +} + +static int +pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode) +{ + char *pool_name; + struct pool_desc *pool; + + pool_name = (char *)key; + pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash); + return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME); +} + +static void *pool_hashobject(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct pool_desc, pool_hash); +} + +static void pool_hashrefcount_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_getref(pool); +} + +static void pool_hashrefcount_put_locked(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_putref_locked(pool); +} + +struct cfs_hash_ops pool_hash_operations = { + .hs_hash = pool_hashfn, + .hs_key = pool_key, + .hs_keycmp = pool_hashkey_keycmp, + .hs_object = pool_hashobject, + .hs_get = pool_hashrefcount_get, + .hs_put_locked = pool_hashrefcount_put_locked, + +}; + +#ifdef CONFIG_PROC_FS +/* ifdef needed for liblustre support */ +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to 
the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 +struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + + /* test if end of file */ + if (*pos >= pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + down_read(&pool_tgt_rw_sem(iter->pool)); + iter->idx++; + if (iter->idx == pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + up_read(&pool_tgt_rw_sem(iter->pool)); + return NULL; + } + up_read(&pool_tgt_rw_sem(iter->pool)); + (*pos)++; + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } + return; +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + down_read(&pool_tgt_rw_sem(iter->pool)); + tgt = pool_tgt(iter->pool, iter->idx); + up_read(&pool_tgt_rw_sem(iter->pool)); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = PDE_DATA(inode); + } + return rc; +} + +static struct file_operations pool_proc_operations = { + .open = pool_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +void lov_dump_pool(int level, struct pool_desc *pool) +{ + int i; + + lov_pool_getref(pool); + + CDEBUG(level, "pool 
"LOV_POOLNAMEF" has %d members\n", + pool->pool_name, pool->pool_obds.op_count); + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool) ; i++) { + if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp) + continue; + CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n", + pool->pool_name, i, + obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid))); + } + + up_read(&pool_tgt_rw_sem(pool)); + lov_pool_putref(pool); +} + +#define LOV_POOL_INIT_COUNT 2 +int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +{ + ENTRY; + + if (count == 0) + count = LOV_POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count * sizeof(op->op_array[0]); + OBD_ALLOC(op->op_array, op->op_size); + if (op->op_array == NULL) { + op->op_size = 0; + RETURN(-ENOMEM); + } + EXIT; + return 0; +} + +/* Caller must hold write op_rwlock */ +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +{ + __u32 *new; + __u32 new_size; + + LASSERT(min_count != 0); + + if (op->op_count * sizeof(op->op_array[0]) < op->op_size) + return 0; + + new_size = max_t(__u32, min_count * sizeof(op->op_array[0]), + 2 * op->op_size); + OBD_ALLOC(new, new_size); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size); + OBD_FREE(op->op_array, op->op_size); + op->op_array = new; + op->op_size = new_size; + return 0; +} + +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +{ + int rc = 0, i; + ENTRY; + + down_write(&op->op_rw_sem); + + rc = lov_ost_pool_extend(op, min_count); + if (rc) + GOTO(out, rc); + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) + GOTO(out, rc = -EEXIST); + } + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; + EXIT; +out: + up_write(&op->op_rw_sem); + return rc; +} + +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +{ + int i; + ENTRY; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + EXIT; + return 0; + } + } + + up_write(&op->op_rw_sem); + RETURN(-EINVAL); +} + +int lov_ost_pool_free(struct ost_pool *op) +{ + ENTRY; + + if (op->op_size == 0) + RETURN(0); + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + RETURN(0); +} + + +int lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + RETURN(-ENAMETOOLONG); + + OBD_ALLOC_PTR(new_pool); + if (new_pool == NULL) + RETURN(-ENOMEM); + + strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lov_ost_pool_init(&new_pool->pool_obds, 0); + if (rc) + GOTO(out_err, rc); + + INIT_HLIST_NODE(&new_pool->pool_hash); + +#ifdef CONFIG_PROC_FS + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, new_pool, + &pool_proc_operations); + if (IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc 
pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", + new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* add to find only when it fully ready */ + rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) + GOTO(out_err, rc = -EEXIST); + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + RETURN(0); + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + lprocfs_remove(&new_pool->pool_proc_entry); + lov_ost_pool_free(&new_pool->pool_obds); + OBD_FREE_PTR(new_pool); + + return rc; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + RETURN(0); +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + obd_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + GOTO(out, rc); + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + + EXIT; +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + ENTRY; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + obd_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + lov_ost_pool_remove(&pool->pool_obds, lov_idx); + + CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + + EXIT; +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} diff --git 
a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c new file mode 100644 index 0000000000000..fe74af4b7f82d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ -0,0 +1,370 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include + +#include "lov_internal.h" + +static void lov_init_set(struct lov_request_set *set) +{ + set->set_count = 0; + atomic_set(&set->set_completes, 0); + atomic_set(&set->set_success, 0); + INIT_LIST_HEAD(&set->set_list); +} + +static void lov_finish_set(struct lov_request_set *set) +{ + struct list_head *pos, *n; + struct lov_request *req; + ENTRY; + + LASSERT(set != NULL); + list_for_each_safe(pos, n, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + list_del_init(&req->rq_link); + + if (req->rq_oi.oi_osfs != NULL) + OBD_FREE_PTR(req->rq_oi.oi_osfs); + + OBD_FREE_PTR(req); + } + + OBD_FREE_PTR(set); + EXIT; +} + +static void +lov_update_set(struct lov_request_set *set, struct lov_request *req, int rc) +{ + atomic_inc(&set->set_completes); + if (rc == 0) + atomic_inc(&set->set_success); +} + +static void +lov_set_add_req(struct lov_request *req, struct lov_request_set *set) +{ + list_add_tail(&req->rq_link, &set->set_list); + set->set_count++; + req->rq_rqset = set; +} + +static int lov_check_set(struct lov_obd *lov, int idx) +{ + int rc = 0; + mutex_lock(&lov->lov_lock); + + if (lov->lov_tgts[idx] == NULL || + lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp != NULL && + class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) + rc = 1; + + mutex_unlock(&lov->lov_lock); + return rc; +} + +/* Check if the OSC connection exists and is active. + * If the OSC has not yet had a chance to connect to the OST the first time, + * wait once for it to connect instead of returning an error. 
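+ *
+ * Note added for clarity: the wait is bounded. lov_check_set() is polled
+ * once per second for at most obd_timeout seconds, and it treats the
+ * target as settled once it has been deleted, has become active, or its
+ * import has at least attempted a connection (imp_connect_tried).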
+ */ +static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + struct lov_tgt_desc *tgt; + int rc = 0; + + mutex_lock(&lov->lov_lock); + + tgt = lov->lov_tgts[ost_idx]; + + if (unlikely(tgt == NULL)) + GOTO(out, rc = 0); + + if (likely(tgt->ltd_active)) + GOTO(out, rc = 1); + + if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) + GOTO(out, rc = 0); + + mutex_unlock(&lov->lov_lock); + + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout), + cfs_time_seconds(1), NULL, NULL); + + rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi); + if (tgt->ltd_active) + return 1; + + return 0; + +out: + mutex_unlock(&lov->lov_lock); + return rc; +} + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while(0) + +static int +lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) +{ + ENTRY; + + if (success) { + __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, + LOV_MAGIC, 0); + if (osfs->os_files != LOV_U64_MAX) + lov_do_div64(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + lov_do_div64(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + RETURN(0); + } + + RETURN(-EIO); +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + + if (atomic_read(&set->set_completes)) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + atomic_read(&set->set_success)); + } + + lov_finish_set(set); + + RETURN(rc); +} + +static void +lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success) +{ + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + else + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } +#ifdef MIN_DF + /* Sandia requested that df (and so, statfs) only + returned minimal available space on + a single OST, so people would be able to + write this much data guaranteed. */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* Presumably if new bavail is smaller, + new bfree is bigger as well */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } +#else + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; +#endif + osfs->os_blocks += lov_sfs->os_blocks; + /* XXX not sure about this one - depends on policy. 
+ * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* The callback for osc_statfs_async that finilizes a request info when a + * response is received. */ +static int cb_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + ENTRY; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + set = lovreq->rq_rqset; + lovobd = set->set_obd; + lov = &lovobd->u.lov; + osfs = set->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + success = atomic_read(&set->set_success); + /* XXX: the same is done in lov_update_common_set, however + lovset->set_exp is not initialized. */ + lov_update_set(set, lovreq, rc); + if (rc) + GOTO(out, rc); + + obd_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); + + tgtobd = class_exp2obd(tgt->ltd_exp); + spin_lock(&tgtobd->obd_osfs_lock); + memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); + if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) + tgtobd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&tgtobd->obd_osfs_lock); + +out_update: + lov_update_statfs(osfs, lov_sfs, success); + obd_putref(lovobd); + +out: + RETURN(0); +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_request *req; + + if (lov->lov_tgts[i] == NULL || + (oinfo->oi_flags & OBD_STATFS_NODELAY && + !lov->lov_tgts[i]->ltd_active)) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + /* skip targets that have been explicitely disabled by the + * administrator */ + if (!lov->lov_tgts[i]->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", + i); + continue; + } + + if (!lov->lov_tgts[i]->ltd_active) + lov_check_and_wait_active(lov, i); + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (req->rq_oi.oi_osfs == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_statfs_set(set); + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c new file mode 100644 index 0000000000000..0ada9b5b9ce53 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c @@ -0,0 +1,149 @@ +/* + * 
GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + ENTRY; + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_next = NULL; + RETURN(next); +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + if (atomic_read(&d->ld_ref) && d->ld_site) { + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); + lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd != NULL) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = 
lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ + diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c new file mode 100644 index 0000000000000..de8b5c72260d7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub lock operations. + * + */ + +static void lovsub_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lovsub_lock *lsl; + + ENTRY; + lsl = cl2lovsub_lock(slice); + OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); + EXIT; +} + +static const struct cl_lock_operations lovsub_lock_ops = { + .clo_fini = lovsub_lock_fini, +}; + +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lovsub_lock *lsk; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS); + if (lsk != NULL) { + cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c new file mode 100644 index 0000000000000..1471de7915162 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -0,0 +1,194 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); + result = 0; + } else + result = -ENOMEM; + RETURN(result); + +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + ENTRY; + + /* We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov != NULL) { + int index = lov_comp_entry(los->lso_index); + int stripe = lov_comp_stripe(los->lso_index); + struct lov_layout_raid0 *r0 = lov_r0(lov, index); + + LASSERT(lov->lo_type == LLT_COMP); + LASSERT(r0->lo_sub[stripe] == los); + spin_lock(&r0->lo_sub_lock); + r0->lo_sub[stripe] = NULL; + spin_unlock(&r0->lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lovsub_object *los = cl2lovsub(obj); + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + ENTRY; + lov_r0(lov, lov_comp_entry(los->lso_index))->lo_attr_valid = 0; + RETURN(0); +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); +} + +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. 
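+ *
+ * Note added for clarity: besides o_stripe_idx, the implementation below
+ * also copies the component's layout into obdo::o_layout via
+ * lov_lsm2layout() and adds OBD_MD_FLOSTLAYOUT to o_valid.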
+ */ +static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lovsub_object *subobj = cl2lovsub(obj); + struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; + ENTRY; + cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); + + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = lov_comp_stripe(subobj->lso_index); + lov_lsm2layout(lsm, lsm->lsm_entries[lov_comp_entry(subobj->lso_index)], + &attr->cra_oa->o_layout); + attr->cra_oa->o_valid |= OBD_MD_FLOSTLAYOUT; + EXIT; +} + +static const struct cl_object_operations lovsub_ops = { + .coo_page_init = lovsub_page_init, + .coo_lock_init = lovsub_lock_init, + .coo_attr_update = lovsub_attr_update, + .coo_glimpse = lovsub_object_glimpse, + .coo_req_attr_set = lovsub_req_attr_set +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); + if (los != NULL) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c new file mode 100644 index 0000000000000..c10a3dfa38c1e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub page operations. 
+ * + */ + +static void lovsub_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ +} + +static const struct cl_page_operations lovsub_page_ops = { + .cpo_fini = lovsub_page_fini +}; + +int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lovsub_page *lsb = cl_object_page_slice(obj, page); + ENTRY; + + cl_page_slice_add(page, &lsb->lsb_cl, obj, index, &lovsub_page_ops); + RETURN(0); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c new file mode 100644 index 0000000000000..c101c64b66c20 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -0,0 +1,332 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "lov_internal.h" + +#ifdef CONFIG_PROC_FS +static int lov_stripesize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + + seq_printf(m, "%llu\n", desc->ld_default_stripe_size); + return 0; +} + +static ssize_t lov_stripesize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __s64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + lov_fix_desc_stripe_size(&val); + desc->ld_default_stripe_size = val; + + return count; +} +LPROC_SEQ_FOPS(lov_stripesize); + +static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%lld\n", desc->ld_default_stripe_offset); + return 0; +} + +static ssize_t lov_stripeoffset_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __s64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < -1) + return -ERANGE; + + desc->ld_default_stripe_offset = val; + + return count; +} +LPROC_SEQ_FOPS(lov_stripeoffset); + +static int lov_stripetype_seq_show(struct seq_file *m, void *v) +{ + struct obd_device* dev = (struct obd_device*)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_pattern); + return 0; +} + +static ssize_t lov_stripetype_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int pattern, rc; + __s64 val; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < INT_MIN || val > INT_MAX) + return -ERANGE; + + pattern = val; + lov_fix_desc_pattern(&pattern); + desc->ld_pattern = pattern; + + return count; +} +LPROC_SEQ_FOPS(lov_stripetype); + +static int lov_stripecount_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); + return 0; +} + +static ssize_t lov_stripecount_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int rc; + __u32 stripe_count; + __s64 val; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < -1) + return -ERANGE; + + stripe_count = val; + lov_fix_desc_stripe_count(&stripe_count); + desc->ld_default_stripe_count = stripe_count; + + return count; +} +LPROC_SEQ_FOPS(lov_stripecount); + +static int lov_numobd_seq_show(struct seq_file *m, void *v) +{ + 
struct obd_device *dev = (struct obd_device*)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_numobd); + +static int lov_activeobd_seq_show(struct seq_file *m, void *v) +{ + struct obd_device* dev = (struct obd_device*)m->private; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_activeobd); + +static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct lov_obd *lov; + + LASSERT(dev != NULL); + lov = &dev->u.lov; + seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); + return 0; +} +LPROC_SEQ_FOPS_RO(lov_desc_uuid); + +static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + ++*pos; + } + return NULL; +} + +static void lov_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (++*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + } + return NULL; +} + +static int lov_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lov_tgt_desc *tgt = v; + seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, + obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? "" : "IN"); + return 0; +} + +static const struct seq_operations lov_tgt_sops = { + .start = lov_tgt_seq_start, + .stop = lov_tgt_seq_stop, + .next = lov_tgt_seq_next, + .show = lov_tgt_seq_show, +}; + +static int lov_target_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &lov_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +LPROC_SEQ_FOPS_RO_TYPE(lov, uuid); +LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, blksize); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail); + +struct lprocfs_vars lprocfs_lov_obd_vars[] = { + { .name = "uuid", + .fops = &lov_uuid_fops }, + { .name = "stripesize", + .fops = &lov_stripesize_fops }, + { .name = "stripeoffset", + .fops = &lov_stripeoffset_fops }, + { .name = "stripecount", + .fops = &lov_stripecount_fops }, + { .name = "stripetype", + .fops = &lov_stripetype_fops }, + { .name = "numobd", + .fops = &lov_numobd_fops }, + { .name = "activeobd", + .fops = &lov_activeobd_fops }, + { .name = "filestotal", + .fops = &lov_filestotal_fops }, + { .name = "filesfree", + .fops = &lov_filesfree_fops }, + { .name = "blocksize", + .fops = &lov_blksize_fops }, + { .name = "kbytestotal", + .fops = &lov_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &lov_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &lov_kbytesavail_fops }, + { .name = "desc_uuid", + .fops = &lov_desc_uuid_fops }, + { NULL } +}; + +struct file_operations lov_proc_target_fops = { + .owner = THIS_MODULE, + .open = lov_target_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +#endif /* CONFIG_PROC_FS */ diff --git 
a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c new file mode 100644 index 0000000000000..2ede98f67846d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -0,0 +1,231 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include "mdc_internal.h" + +#ifdef CONFIG_PROC_FS +static int mdc_active_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + LPROCFS_CLIMP_CHECK(dev); + seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + return 0; +} + +static ssize_t mdc_active_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + int rc; + __s64 val; + + dev = ((struct seq_file *)file->private_data)->private; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > 1) + return -ERANGE; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %llu: ignoring repeat request\n", + val); + + return count; +} +LPROC_SEQ_FOPS(mdc_active); + +static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + __u32 max; + + max = obd_get_max_rpcs_in_flight(&dev->u.cli); + seq_printf(m, "%u\n", max); + + return 0; +} + +static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + __s64 val; + int rc; + + dev = ((struct seq_file *)file->private_data)->private; + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val < 0 || val > UINT_MAX) + return -ERANGE; + + rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); + if (rc) + return rc; + + return count; +} +LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight); + +static int mdc_max_mod_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + __u16 max; + + max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); + seq_printf(m, "%hu\n", max); + + return 0; +} + +static ssize_t mdc_max_mod_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = + ((struct seq_file *)file->private_data)->private; + __s64 val; + int rc; + + rc 
= lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val < 0 || val > USHRT_MAX) + return -ERANGE; + + rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + + return count; +} +LPROC_SEQ_FOPS(mdc_max_mod_rpcs_in_flight); + +static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + + return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); +} + +static ssize_t mdc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + + return len; +} +LPROC_SEQ_FOPS(mdc_rpc_stats); + +LPROC_SEQ_FOPS_WO_TYPE(mdc, ping); + +LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(mdc, state); +LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); +LPROC_SEQ_FOPS_RW_TYPE(mdc, import); +LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); + +struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { .name = "uuid", + .fops = &mdc_uuid_fops }, + { .name = "ping", + .fops = &mdc_ping_fops, + .proc_mode = 0222 }, + { .name = "connect_flags", + .fops = &mdc_connect_flags_fops }, + { .name = "blocksize", + .fops = &mdc_blksize_fops }, + { .name = "kbytestotal", + .fops = &mdc_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &mdc_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &mdc_kbytesavail_fops }, + { .name = "filestotal", + .fops = &mdc_filestotal_fops }, + { .name = "filesfree", + .fops = &mdc_filesfree_fops }, + { .name = "mds_server_uuid", + .fops = &mdc_server_uuid_fops }, + { .name = "mds_conn_uuid", + .fops = &mdc_conn_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_rpcs_in_flight", + .fops = &mdc_max_rpcs_in_flight_fops }, + { .name = "max_mod_rpcs_in_flight", + .fops = &mdc_max_mod_rpcs_in_flight_fops }, + { .name = "timeouts", + .fops = &mdc_timeouts_fops }, + { .name = "import", + .fops = &mdc_import_fops }, + { .name = "state", + .fops = &mdc_state_fops }, + { .name = "pinger_recov", + .fops = &mdc_pinger_recov_fops }, + { .name = "rpc_stats", + .fops = &mdc_rpc_stats_fops }, + { .name = "active", + .fops = &mdc_active_fops }, + { NULL } +}; +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c new file mode 100644 index 0000000000000..8431b1c26622b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -0,0 +1,724 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies + * Alternatives. + * + * Author: Henri Doreau + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include + +#include + +#include "mdc_internal.h" + + +/* + * -- Changelog delivery through character device -- + */ + +/** + * Mutex to protect chlg_registered_devices below + */ +static DEFINE_MUTEX(chlg_registered_dev_lock); + +/** + * Global linked list of all registered devices (one per MDT). + */ +static LIST_HEAD(chlg_registered_devices); + + +struct chlg_registered_dev { + /* Device name of the form "changelog-{MDTNAME}" */ + char ced_name[32]; + /* Misc device descriptor */ + struct miscdevice ced_misc; + /* OBDs referencing this device (multiple mount point) */ + struct list_head ced_obds; + /* Reference counter for proper deregistration */ + struct kref ced_refs; + /* Link within the global chlg_registered_devices */ + struct list_head ced_link; +}; + +struct chlg_reader_state { + /* Shortcut to the corresponding OBD device */ + struct obd_device *crs_obd; + /* Producer thread (if any) */ + struct task_struct *crs_prod_task; + /* An error occurred that prevents from reading further */ + bool crs_err; + /* EOF, no more records available */ + bool crs_eof; + /* Desired start position */ + __u64 crs_start_offset; + /* Wait queue for the catalog processing thread */ + wait_queue_head_t crs_waitq_prod; + /* Wait queue for the record copy threads */ + wait_queue_head_t crs_waitq_cons; + /* Mutex protecting crs_rec_count and crs_rec_queue */ + struct mutex crs_lock; + /* Number of item in the list */ + __u64 crs_rec_count; + /* List of prefetched enqueued_record::enq_linkage_items */ + struct list_head crs_rec_queue; +}; + +struct chlg_rec_entry { + /* Link within the chlg_reader_state::crs_rec_queue list */ + struct list_head enq_linkage; + /* Data (enq_record) field length */ + __u64 enq_length; + /* Copy of a changelog record (see struct llog_changelog_rec) */ + struct changelog_rec enq_record[]; +}; + +enum { + /* Number of records to prefetch locally. */ + CDEV_CHLG_MAX_PREFETCH = 1024, +}; + +/** + * ChangeLog catalog processing callback invoked on each record. + * If the current record is eligible to userland delivery, push + * it into the crs_rec_queue where the consumer code will fetch it. + * + * @param[in] env (unused) + * @param[in] llh Client-side handle used to identify the llog + * @param[in] hdr Header of the current llog record + * @param[in,out] data chlg_reader_state passed from caller + * + * @return 0 or LLOG_PROC_* control code on success, negated error on failure. 
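+ *
+ * Flow control with the consumer side, summarized here for clarity (the
+ * authoritative logic is the code below and chlg_read()):
+ *
+ *	producer (this callback):
+ *		wait until crs_rec_count < CDEV_CHLG_MAX_PREFETCH
+ *		       or kthread_should_stop();
+ *		enqueue record, crs_rec_count++, wake_up_all(crs_waitq_cons);
+ *
+ *	consumer (chlg_read()):
+ *		wait until crs_rec_count > 0 or crs_eof or crs_err;
+ *		dequeue records, crs_rec_count--, wake_up_all(crs_waitq_prod);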
+ */ +static int chlg_read_cat_process_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct llog_changelog_rec *rec; + struct chlg_reader_state *crs = data; + struct chlg_rec_entry *enq; + struct l_wait_info lwi = { 0 }; + size_t len; + int rc; + ENTRY; + + LASSERT(crs != NULL); + LASSERT(hdr != NULL); + + rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); + + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { + rc = -EINVAL; + CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n", + crs->crs_obd->obd_name, rec->cr_hdr.lrh_type, + rec->cr.cr_type, + PFID(lu_object_fid(&llh->lgh_obj->do_lu)), rc); + RETURN(rc); + } + + /* Skip undesired records */ + if (rec->cr.cr_index < crs->crs_start_offset) + RETURN(0); + + CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID" %.*s\n", + rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + l_wait_event(crs->crs_waitq_prod, + (crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()), &lwi); + + if (kthread_should_stop()) + RETURN(LLOG_PROC_BREAK); + + len = changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + OBD_ALLOC(enq, sizeof(*enq) + len); + if (enq == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&enq->enq_linkage); + enq->enq_length = len; + memcpy(enq->enq_record, &rec->cr, len); + + mutex_lock(&crs->crs_lock); + list_add_tail(&enq->enq_linkage, &crs->crs_rec_queue); + crs->crs_rec_count++; + mutex_unlock(&crs->crs_lock); + + wake_up_all(&crs->crs_waitq_cons); + + RETURN(0); +} + +/** + * Remove record from the list it is attached to and free it. + */ +static void enq_record_delete(struct chlg_rec_entry *rec) +{ + list_del(&rec->enq_linkage); + OBD_FREE(rec, sizeof(*rec) + rec->enq_length); +} + +/** + * Record prefetch thread entry point. Opens the changelog catalog and starts + * reading records. + * + * @param[in,out] args chlg_reader_state passed from caller. + * @return 0 on success, negated error code on failure. + */ +static int chlg_load(void *args) +{ + struct chlg_reader_state *crs = args; + struct obd_device *obd = crs->crs_obd; + struct llog_ctxt *ctx = NULL; + struct llog_handle *llh = NULL; + struct l_wait_info lwi = { 0 }; + int rc; + ENTRY; + + ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctx == NULL) + GOTO(err_out, rc = -ENOENT); + + rc = llog_open(NULL, ctx, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT|LLOG_F_EXT_JOBID, NULL); + if (rc) { + CERROR("%s: fail to init llog handle: rc = %d\n", + obd->obd_name, rc); + GOTO(err_out, rc); + } + + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, 0, 0); + if (rc < 0) { + CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); + GOTO(err_out, rc); + } + + crs->crs_eof = true; + +err_out: + if (rc < 0) + crs->crs_err = true; + + wake_up_all(&crs->crs_waitq_cons); + + if (llh != NULL) + llog_cat_close(NULL, llh); + + if (ctx != NULL) + llog_ctxt_put(ctx); + + l_wait_event(crs->crs_waitq_prod, kthread_should_stop(), &lwi); + + RETURN(rc); +} + +/** + * Read handler, dequeues records from the chlg_reader_state if any. 
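+ *
+ * Minimal user-space consumer sketch (illustrative only; the device path
+ * is an assumption derived from the "changelog-{MDTNAME}" misc device
+ * name, and error handling is omitted):
+ *
+ *	char buf[65536];
+ *	int fd = open("/dev/changelog-lustre-MDT0000", O_RDONLY);
+ *	lseek(fd, start_index, SEEK_SET);	/* optional: jump to a record index */
+ *	ssize_t nb = read(fd, buf, sizeof(buf));	/* whole records only */
+ *	close(fd);
+ *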
+ * No partial records are copied to userland so this function can return less + * data than required (short read). + * + * @param[in] file File pointer to the character device. + * @param[out] buff Userland buffer where to copy the records. + * @param[in] count Userland buffer size. + * @param[out] ppos File position, updated with the index number of the next + * record to read. + * @return number of copied bytes on success, negated error code on failure. + */ +static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, + loff_t *ppos) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + struct l_wait_info lwi = { 0 }; + ssize_t written_total = 0; + LIST_HEAD(consumed); + ENTRY; + + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) + RETURN(-EAGAIN); + + l_wait_event(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err, + &lwi); + + mutex_lock(&crs->crs_lock); + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + if (written_total + rec->enq_length > count) + break; + + if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { + if (written_total == 0) + written_total = -EFAULT; + break; + } + + buff += rec->enq_length; + written_total += rec->enq_length; + + crs->crs_rec_count--; + list_move_tail(&rec->enq_linkage, &consumed); + + crs->crs_start_offset = rec->enq_record->cr_index + 1; + } + mutex_unlock(&crs->crs_lock); + + if (written_total > 0) + wake_up_all(&crs->crs_waitq_prod); + + list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) + enq_record_delete(rec); + + *ppos = crs->crs_start_offset; + + RETURN(written_total); +} + +/** + * Jump to a given record index. Helper for chlg_llseek(). + * + * @param[in,out] crs Internal reader state. + * @param[in] offset Desired offset (index record). + * @return 0 on success, negated error code on failure. + */ +static int chlg_set_start_offset(struct chlg_reader_state *crs, __u64 offset) +{ + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + + mutex_lock(&crs->crs_lock); + if (offset < crs->crs_start_offset) { + mutex_unlock(&crs->crs_lock); + return -ERANGE; + } + + crs->crs_start_offset = offset; + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { + struct changelog_rec *cr = rec->enq_record; + + if (cr->cr_index >= crs->crs_start_offset) + break; + + crs->crs_rec_count--; + enq_record_delete(rec); + } + + mutex_unlock(&crs->crs_lock); + wake_up_all(&crs->crs_waitq_prod); + return 0; +} + +/** + * Move read pointer to a certain record index, encoded as an offset. + * + * @param[in,out] file File pointer to the changelog character device + * @param[in] off Offset to skip, actually a record index, not byte count + * @param[in] whence Relative/Absolute interpretation of the offset + * @return the resulting position on success or negated error code on failure. + */ +static loff_t chlg_llseek(struct file *file, loff_t off, int whence) +{ + struct chlg_reader_state *crs = file->private_data; + loff_t pos; + int rc; + + switch (whence) { + case SEEK_SET: + pos = off; + break; + case SEEK_CUR: + pos = file->f_pos + off; + break; + case SEEK_END: + default: + return -EINVAL; + } + + /* We cannot go backward */ + if (pos < file->f_pos) + return -EINVAL; + + rc = chlg_set_start_offset(crs, pos); + if (rc != 0) + return rc; + + file->f_pos = pos; + return pos; +} + +/** + * Clear record range for a given changelog reader. + * + * @param[in] crs Current internal state. 
+ * @param[in] reader Changelog reader ID (cl1, cl2...) + * @param[in] record Record index up which to clear + * @return 0 on success, negated error code on failure. + */ +static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) +{ + struct obd_device *obd = crs->crs_obd; + struct changelog_setinfo cs = { + .cs_recno = record, + .cs_id = reader + }; + + return obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); +} + +/** Maximum changelog control command size */ +#define CHLG_CONTROL_CMD_MAX 64 + +/** + * Handle writes() into the changelog character device. Write() can be used + * to request special control operations. + * + * @param[in] file File pointer to the changelog character device + * @param[in] buff User supplied data (written data) + * @param[in] count Number of written bytes + * @param[in] off (unused) + * @return number of written bytes on success, negated error code on failure. + */ +static ssize_t chlg_write(struct file *file, const char __user *buff, + size_t count, loff_t *off) +{ + struct chlg_reader_state *crs = file->private_data; + char *kbuf; + __u64 record; + __u32 reader; + int rc = 0; + ENTRY; + + if (count > CHLG_CONTROL_CMD_MAX) + RETURN(-EINVAL); + + OBD_ALLOC(kbuf, CHLG_CONTROL_CMD_MAX); + if (kbuf == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(kbuf, buff, count)) + GOTO(out_kbuf, rc = -EFAULT); + + kbuf[CHLG_CONTROL_CMD_MAX - 1] = '\0'; + + if (sscanf(kbuf, "clear:cl%u:%llu", &reader, &record) == 2) + rc = chlg_clear(crs, reader, record); + else + rc = -EINVAL; + + EXIT; +out_kbuf: + OBD_FREE(kbuf, CHLG_CONTROL_CMD_MAX); + return rc < 0 ? rc : count; +} + +/** + * Find the OBD device associated to a changelog character device. + * @param[in] cdev character device instance descriptor + * @return corresponding OBD device or NULL if none was found. + */ +static struct obd_device *chlg_obd_get(dev_t cdev) +{ + int minor = MINOR(cdev); + struct obd_device *obd = NULL; + struct chlg_registered_dev *curr; + + mutex_lock(&chlg_registered_dev_lock); + list_for_each_entry(curr, &chlg_registered_devices, ced_link) { + if (curr->ced_misc.minor == minor) { + /* take the first available OBD device attached */ + obd = list_first_entry(&curr->ced_obds, + struct obd_device, + u.cli.cl_chg_dev_linkage); + break; + } + } + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + +/** + * Open handler, initialize internal CRS state and spawn prefetch thread if + * needed. + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. 
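+ *
+ * A typical consumer session might look as follows (an illustrative
+ * sketch only: the device node name and the "cl1" reader id are examples,
+ * and walk_records() is a hypothetical helper parsing the buffer as
+ * described for chlg_read() above):
+ *
+ *   #include <fcntl.h>
+ *   #include <stdio.h>
+ *   #include <unistd.h>
+ *
+ *   char buf[65536];
+ *   unsigned long long last_index = 0;
+ *   ssize_t nb;
+ *   int fd = open("/dev/changelog-testfs-MDT0000", O_RDWR);
+ *
+ *   while ((nb = read(fd, buf, sizeof(buf))) > 0)
+ *           last_index = walk_records(buf, nb);
+ *
+ *   // acknowledge consumption so processed records can be purged
+ *   dprintf(fd, "clear:cl1:%llu", last_index);
+ *   close(fd);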
+ */ +static int chlg_open(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs; + struct obd_device *obd = chlg_obd_get(inode->i_rdev); + struct task_struct *task; + int rc; + ENTRY; + + if (!obd) + RETURN(-ENODEV); + + OBD_ALLOC_PTR(crs); + if (!crs) + RETURN(-ENOMEM); + + crs->crs_obd = obd; + crs->crs_err = false; + crs->crs_eof = false; + + mutex_init(&crs->crs_lock); + INIT_LIST_HEAD(&crs->crs_rec_queue); + init_waitqueue_head(&crs->crs_waitq_prod); + init_waitqueue_head(&crs->crs_waitq_cons); + + if (file->f_mode & FMODE_READ) { + task = kthread_run(chlg_load, crs, "chlg_load_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start changelog thread: rc = %d\n", + obd->obd_name, rc); + GOTO(err_crs, rc); + } + crs->crs_prod_task = task; + } + + file->private_data = crs; + RETURN(0); + +err_crs: + OBD_FREE_PTR(crs); + return rc; +} + +/** + * Close handler, release resources. + * + * @param[in] inode Inode struct for the open character device. + * @param[in] file Corresponding file pointer. + * @return 0 on success, negated error code on failure. + */ +static int chlg_release(struct inode *inode, struct file *file) +{ + struct chlg_reader_state *crs = file->private_data; + struct chlg_rec_entry *rec; + struct chlg_rec_entry *tmp; + + if (crs->crs_prod_task) + kthread_stop(crs->crs_prod_task); + + list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) + enq_record_delete(rec); + + OBD_FREE_PTR(crs); + return 0; +} + +/** + * Poll handler, indicates whether the device is readable (new records) and + * writable (always). + * + * @param[in] file Device file pointer. + * @param[in] wait (opaque) + * @return combination of the poll status flags. + */ +static unsigned int chlg_poll(struct file *file, poll_table *wait) +{ + struct chlg_reader_state *crs = file->private_data; + unsigned int mask = 0; + + mutex_lock(&crs->crs_lock); + poll_wait(file, &crs->crs_waitq_cons, wait); + if (crs->crs_rec_count > 0) + mask |= POLLIN | POLLRDNORM; + if (crs->crs_err) + mask |= POLLERR; + if (crs->crs_eof) + mask |= POLLHUP; + mutex_unlock(&crs->crs_lock); + return mask; +} + +static const struct file_operations chlg_fops = { + .owner = THIS_MODULE, + .llseek = chlg_llseek, + .read = chlg_read, + .write = chlg_write, + .open = chlg_open, + .release = chlg_release, + .poll = chlg_poll, +}; + +/** + * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" + * and returns a name of the form: "changelog-testfs-MDT0000". + */ +static void get_chlg_name(char *name, size_t name_len, struct obd_device *obd) +{ + int i; + + snprintf(name, name_len, "changelog-%s", obd->obd_name); + + /* Find the 2nd '-' from the end and truncate on it */ + for (i = 0; i < 2; i++) { + char *p = strrchr(name, '-'); + + if (p == NULL) + return; + *p = '\0'; + } +} + +/** + * Find a changelog character device by name. + * All devices registered during MDC setup are listed in a global list with + * their names attached. + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_name(const char *name) +{ + struct chlg_registered_dev *dit; + + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + if (strcmp(name, dit->ced_name) == 0) + return dit; + return NULL; +} + +/** + * Find chlg_registered_dev structure for a given OBD device. 
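+ * Walks every registered changelog device and, for each of them, every OBD
+ * referencing it until a match is found.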
+ * This is bad O(n^2) but for each filesystem: + * - N is # of MDTs times # of mount points + * - this only runs at shutdown + */ +static struct chlg_registered_dev * +chlg_registered_dev_find_by_obd(const struct obd_device *obd) +{ + struct chlg_registered_dev *dit; + struct obd_device *oit; + + list_for_each_entry(dit, &chlg_registered_devices, ced_link) + list_for_each_entry(oit, &dit->ced_obds, + u.cli.cl_chg_dev_linkage) + if (oit == obd) + return dit; + return NULL; +} + +/** + * Changelog character device initialization. + * Register a misc character device with a dynamic minor number, under a name + * of the form: 'changelog-fsname-MDTxxxx'. Reference this OBD device with it. + * + * @param[in] obd This MDC obd_device. + * @return 0 on success, negated error code on failure. + */ +int mdc_changelog_cdev_init(struct obd_device *obd) +{ + struct chlg_registered_dev *exist; + struct chlg_registered_dev *entry; + int rc; + ENTRY; + + OBD_ALLOC_PTR(entry); + if (entry == NULL) + RETURN(-ENOMEM); + + get_chlg_name(entry->ced_name, sizeof(entry->ced_name), obd); + + entry->ced_misc.minor = MISC_DYNAMIC_MINOR; + entry->ced_misc.name = entry->ced_name; + entry->ced_misc.fops = &chlg_fops; + + kref_init(&entry->ced_refs); + INIT_LIST_HEAD(&entry->ced_obds); + INIT_LIST_HEAD(&entry->ced_link); + + mutex_lock(&chlg_registered_dev_lock); + exist = chlg_registered_dev_find_by_name(entry->ced_name); + if (exist != NULL) { + kref_get(&exist->ced_refs); + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &exist->ced_obds); + GOTO(out_unlock, rc = 0); + } + + /* Register new character device */ + rc = misc_register(&entry->ced_misc); + if (rc != 0) + GOTO(out_unlock, rc); + + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); + + entry = NULL; /* prevent it from being freed below */ + +out_unlock: + mutex_unlock(&chlg_registered_dev_lock); + if (entry) + OBD_FREE_PTR(entry); + RETURN(rc); +} + +/** + * Deregister a changelog character device whose refcount has reached zero. + */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry = container_of(kref, + struct chlg_registered_dev, + ced_refs); + ENTRY; + + list_del(&entry->ced_link); + misc_deregister(&entry->ced_misc); + OBD_FREE_PTR(entry); + EXIT; +} + +/** + * Release OBD, decrease reference count of the corresponding changelog device. + */ +void mdc_changelog_cdev_finish(struct obd_device *obd) +{ + struct chlg_registered_dev *dev = chlg_registered_dev_find_by_obd(obd); + ENTRY; + + mutex_lock(&chlg_registered_dev_lock); + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + kref_put(&dev->ced_refs, chlg_dev_clear); + mutex_unlock(&chlg_registered_dev_lock); + EXIT; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h new file mode 100644 index 0000000000000..98773524caee9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -0,0 +1,166 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _MDC_INTERNAL_H +#define _MDC_INTERNAL_H + +#include + +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_mdc_obd_vars[]; +#endif + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + u64 valid, size_t ea_size, u32 suppgid, u32 flags); +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data); +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, + const struct lu_fid *fid); +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, + struct md_op_data *data, size_t ea_size); +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, size_t ealen); +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t capability, __u64 rdev); +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, + const void *data, size_t datalen); +void mdc_file_secctx_pack(struct ptlrpc_request *req, + const char *secctx_name, + const void *secctx, size_t secctx_size); + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen); +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); + +/* mdc/mdc_locks.c */ +int mdc_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits); + +int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); + +int mdc_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags); + +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum ldlm_mode mode, + __u64 bits); +int mdc_save_lovea(struct ptlrpc_request *req, + const struct req_msg_field *field, + void *data, u32 size); +/* mdc/mdc_request.c */ +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); + +struct obd_client_handle; + +int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *lmv_exp, + struct lustre_md *md); + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md); + +int 
mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it); + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och); +void mdc_commit_open(struct ptlrpc_request *req); +void mdc_replay_open(struct ptlrpc_request *req); + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + cfs_cap_t capability, __u64 rdev, + struct ptlrpc_request **request); +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque); + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo); + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh); + + +int mdc_changelog_cdev_init(struct obd_device *obd); + +void mdc_changelog_cdev_finish(struct obd_device *obd); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 with hash 1 */ + return ~0UL - (hash + !hash); +} + +#endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c new file mode 100644 index 0000000000000..f02a8de80d4f9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -0,0 +1,551 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_MDC +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include +#include +#include +#include "mdc_internal.h" + +static void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) +{ + LASSERT (b != NULL); + + b->mbo_suppgid = suppgid; + b->mbo_uid = from_kuid(&init_user_ns, current_uid()); + b->mbo_gid = from_kgid(&init_user_ns, current_gid()); + b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->mbo_capability = cfs_curproc_cap_pack(); +} + +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + __mdc_pack_body(b, op_data->op_suppgids[0]); + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; +} + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + u64 valid, size_t ea_size, u32 suppgid, u32 flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + b->mbo_valid = valid; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, suppgid); + if (fid) { + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + } +} + +/** + * Pack a name (path component) into a request + * + * \param[in] req request + * \param[in] field request field (usually RMF_NAME) + * \param[in] name path component + * \param[in] name_len length of path component + * + * \a field must be present in \a req and of size \a name_len + 1. + * + * \a name must be '\0' terminated of length \a name_len and represent + * a single path component (not contain '/'). + */ +static void mdc_pack_name(struct ptlrpc_request *req, + const struct req_msg_field *field, + const char *name, size_t name_len) +{ + char *buf; + size_t buf_size; + size_t cpy_len; + + buf = req_capsule_client_get(&req->rq_pill, field); + buf_size = req_capsule_get_size(&req->rq_pill, field, RCL_CLIENT); + + LASSERT(name != NULL && name_len != 0 && + buf != NULL && buf_size == name_len + 1); + + cpy_len = strlcpy(buf, name, buf_size); + + LASSERT(lu_name_is_valid_2(buf, cpy_len)); + if (cpy_len != name_len) + CDEBUG(D_DENTRY, "%s: %s len %zd != %zd, concurrent rename?\n", + req->rq_export->exp_obd->obd_name, buf, name_len, + cpy_len); +} + +void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, + const void *secctx, size_t secctx_size) +{ + void *buf; + size_t buf_size; + + if (secctx_name == NULL) + return; + + buf = req_capsule_client_get(&req->rq_pill, &RMF_FILE_SECCTX_NAME); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT); + + LASSERT(buf_size == strlen(secctx_name) + 1); + memcpy(buf, secctx_name, buf_size); + + buf = req_capsule_client_get(&req->rq_pill, &RMF_FILE_SECCTX); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_CLIENT); + + LASSERT(buf_size == secctx_size); + memcpy(buf, secctx, buf_size); +} + +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, + const struct lu_fid *fid) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + b->mbo_size = pgoff; /* !! */ + b->mbo_nlink = size; /* !! 
*/ + __mdc_pack_body(b, -1); + b->mbo_mode = LUDA_FID | LUDA_TYPE; +} + +/* packing of MDS records */ +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = 0; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + if (data) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } + + mdc_file_secctx_pack(req, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); +} + +static inline __u64 mds_pack_open_flags(__u64 flags) +{ + __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | + MDS_OPEN_FL_INTERNAL)); + + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; +#ifdef FMODE_EXEC + if (flags & FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; +#endif + if (cl_is_lov_delay_create(flags)) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if (flags & O_NONBLOCK) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + umode_t mode, __u64 rdev, __u64 flags, const void *lmm, + size_t lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->cr_cap = cfs_curproc_cap_pack(); + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags); + rec->cr_rdev = rdev; + rec->cr_umask = current_umask(); + if (op_data != NULL) { + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_old_handle = op_data->op_handle; + + if (op_data->op_name) { + mdc_pack_name(req, &RMF_NAME, op_data->op_name, + op_data->op_namelen); + + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + mdc_file_secctx_pack(req, op_data->op_file_secctx_name, + op_data->op_file_secctx, + op_data->op_file_secctx_size); + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, lmm, 
lmmlen); + } + set_mrc_cr_flags(rec, cr_flags); +} + +static inline __u64 attr_pack(unsigned int ia_valid) { + __u64 sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid & ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_valid & ATTR_ATTR_FLAG) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_valid & ATTR_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_FROM_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_valid & ATTR_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_valid & MDS_OPEN_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_valid & MDS_ATTR_PROJID) + sa_valid |= MDS_ATTR_PROJID; + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sa_cap = cfs_curproc_cap_pack(); + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); + rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + rec->sa_projid = op_data->op_projid; + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime); + rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime); + rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime); + rec->sa_attr_flags = op_data->op_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + in_group_p(op_data->op_attr.ia_gid)) + rec->sa_suppgid = + from_kgid(&init_user_ns, op_data->op_attr.ia_gid); + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + epoch->mio_handle = op_data->op_handle; + epoch->mio_unused1 = 0; + epoch->mio_unused2 = 0; + epoch->mio_padding = 0; +} + +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, size_t ealen) +{ + struct mdt_rec_setattr *rec; + struct lov_user_md *lum = NULL; + + CLASSERT(sizeof(struct mdt_rec_reint) == + sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + if (ealen == 0) + return; + + lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1); + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); + } else { + memcpy(lum, ea, ealen); + } +} + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + + 
CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1= op_data->op_suppgids[0]; + rec->ul_suppgid2= -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); +} + +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT (rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid;//current->fsuid; + rec->lk_fsgid = op_data->op_fsgid;//current->fsgid; + rec->lk_cap = op_data->op_cap;//current->cap_effective; + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); +} + +static void mdc_intent_close_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct close_data *data; + struct ldlm_lock *lock; + enum mds_op_bias bias = op_data->op_bias; + + if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | + MDS_RENAME_MIGRATE))) + return; + + data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); + LASSERT(data != NULL); + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + data->cd_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); + + data->cd_data_version = op_data->op_data_version; + data->cd_fid = op_data->op_fid2; +} + +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen) +{ + struct mdt_rec_rename *rec; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = op_data->op_cli_flags & CLI_MIGRATE ? 
+ REINT_MIGRATE : REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, old, oldlen); + + if (new != NULL) + mdc_pack_name(req, &RMF_SYMTGT, new, newlen); + + if (op_data->op_cli_flags & CLI_MIGRATE && + op_data->op_bias & MDS_RENAME_MIGRATE) { + struct mdt_ioepoch *epoch; + + mdc_intent_close_pack(req, op_data); + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } +} + +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, + struct md_op_data *op_data, size_t ea_size) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + b->mbo_valid = valid; + if (op_data->op_bias & MDS_CHECK_SPLIT) + b->mbo_valid |= OBD_MD_FLCKSPLIT; + if (op_data->op_bias & MDS_CROSS_REF) + b->mbo_valid |= OBD_MD_FLCROSSREF; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; + + if (op_data->op_name != NULL) + mdc_pack_name(req, &RMF_NAME, op_data->op_name, + op_data->op_namelen); +} + +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + /* + * The client will zero out local timestamps when losing the IBITS lock + * so any new RPC timestamps will update the client inode's timestamps. + * There was a defect on the server side which allowed the atime to be + * overwritten by a zeroed-out atime packed into the close RPC. + * + * Proactively clear the MDS_ATTR_ATIME flag in the RPC in this case + * to avoid zeroing the atime on old unpatched servers. See LU-8041. + */ + if (rec->sa_atime == 0) + rec->sa_valid &= ~MDS_ATTR_ATIME; + + mdc_ioepoch_pack(epoch, op_data); + mdc_intent_close_pack(req, op_data); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c new file mode 100644 index 0000000000000..4a532f0a7b500 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -0,0 +1,1282 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +struct mdc_getattr_args { + struct obd_export *ga_exp; + struct md_enqueue_info *ga_minfo; +}; + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_LEASE)) { + if (phase >= DISP_OPEN_LEASE) + return it->it_status; + else + return 0; + } + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase >= DISP_OPEN_OPEN) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase >= DISP_OPEN_CREATE) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase >= DISP_LOOKUP_EXECD) + return it->it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase >= DISP_IT_EXECD) + return it->it_status; + else + return 0; + } + + CERROR("it disp: %X, status: %d\n", it->it_disposition, it->it_status); + LBUG(); + + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +int mdc_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, + void *data, __u64 *bits) +{ + struct ldlm_lock *lock; + struct inode *new_inode = data; + ENTRY; + + if(bits) + *bits = 0; + + if (!lustre_handle_is_used(lockh)) + RETURN(0); + + lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode && + lock->l_resource->lr_lvb_inode != data) { + struct inode *old_inode = lock->l_resource->lr_lvb_inode; + LASSERTF(old_inode->i_state & I_FREEING, + "Found existing inode %p/%lu/%u state %lu in lock: " + "setting data to %p/%lu/%u\n", old_inode, + old_inode->i_ino, old_inode->i_generation, + old_inode->i_state, + new_inode, new_inode->i_ino, new_inode->i_generation); + } + lock->l_resource->lr_lvb_inode = new_inode; + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + + RETURN(0); +} + +enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + enum ldlm_mode rc; + ENTRY; + + fid_build_reg_res_name(fid, &res_id); + /* LU-4405: Clear bits not supported by server */ + policy->l_inodebits.bits &= exp_connect_ibits(exp); + rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, + &res_id, type, policy, mode, lockh, 0); + RETURN(rc); +} + +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + union ldlm_policy_data *policy, enum ldlm_mode mode, + enum ldlm_cancel_flags flags, void *opaque) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_res_id res_id; + int rc; + + ENTRY; + + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + RETURN(rc); +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + ENTRY; + + LASSERTF(ns != NULL, "no 
namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + RETURN(0); +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc); + LBUG(); + } +} + +/* Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (bug 5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... */ +int mdc_save_lovea(struct ptlrpc_request *req, + const struct req_msg_field *field, + void *data, u32 size) +{ + struct req_capsule *pill = &req->rq_pill; + void *lmm; + int rc = 0; + + if (req_capsule_get_size(pill, field, RCL_CLIENT) < size) { + rc = sptlrpc_cli_enlarge_reqbuf(req, field, size); + if (rc) { + CERROR("%s: Can't enlarge ea size to %d: rc = %d\n", + req->rq_export->exp_obd->obd_name, + size, rc); + return rc; + } + } else { + req_capsule_shrink(pill, field, size, RCL_CLIENT); + } + + req_capsule_set_size(pill, field, RCL_CLIENT, size); + lmm = req_capsule_client_get(pill, field); + if (lmm) + memcpy(lmm, data, size); + + return rc; +} + +static struct ptlrpc_request * +mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + const void *lmm = op_data->op_data; + __u32 lmmsize = op_data->op_data_size; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count = 0; + enum ldlm_mode mode; + int rc; + ENTRY; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. */ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ + if (it->it_flags & FMODE_WRITE) + mode = LCK_EX; + else + mode = LCK_PR; + } else { + if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + mode = LCK_CW; +#ifdef FMODE_EXEC + else if (it->it_flags & FMODE_EXEC) + mode = LCK_PR; +#endif + else + mode = LCK_CR; + } + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. 
*/ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(ERR_PTR(-ENOMEM)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? + strlen(op_data->op_file_secctx_name) + 1 : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, + lmmsize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); + return req; +} + +#define GA_DEFAULT_EA_NAME_LEN 20 +#define GA_DEFAULT_EA_VAL_LEN 250 +#define GA_DEFAULT_EA_NUM 10 + +static struct ptlrpc_request * +mdc_intent_getxattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct ldlm_intent *lit; + int rc, count = 0; + struct list_head cancels = LIST_HEAD_INIT(cancels); + u32 min_buf_size = 0; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETXATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = IT_GETXATTR; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* If the supplied buffer is too small then the server will + * return -ERANGE and llite will fallback to using non cached + * xattr operations. On servers before 2.10.1 a (non-cached) + * listxattr RPC for an orphan or dead file causes an oops. So + * let's try to avoid sending too small a buffer to too old a + * server. This is effectively undoing the memory conservation + * of LU-9417 when it would be *more* likely to crash the + * server. See LU-9856. 
*/ + if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) + min_buf_size = exp->exp_connect_data.ocd_max_easize; +#endif + + /* pack the intended request */ + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + max_t(u32, min_buf_size, + GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM), + -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + max_t(u32, min_buf_size, + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM)); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, + max_t(u32, min_buf_size, + GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM)); + + req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, + max_t(u32, min_buf_size, + sizeof(__u32) * GA_DEFAULT_EA_NUM)); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_UNLINK); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | + OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA | + OBD_MD_MEA | OBD_MD_FLACL; + struct ldlm_intent *lit; + int rc; + __u32 easize; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + if (obddev->u.cli.cl_default_mds_easize > 0) + easize = obddev->u.cli.cl_default_mds_easize; + else + easize = obddev->u.cli.cl_max_mds_easize; + + /* pack the intended request */ + mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + 
RETURN(ERR_PTR(-ENOMEM)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + LASSERT(op_data->op_data != NULL); + LASSERT(op_data->op_data_size == sizeof(*layout)); + memcpy(layout, op_data->op_data, sizeof(*layout)); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request * +mdc_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, + int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct ldlm_lock *lock; + void *lvb_data = NULL; + __u32 lvb_len = 0; + ENTRY; + + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + it->it_disposition = (int)lockrep->lock_policy_res1; + it->it_status = (int)lockrep->lock_policy_res2; + it->it_lock_mode = einfo->ei_mode; + it->it_lock_handle = lockh->cookie; + it->it_request = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant */ + if ((!req->rq_transno || it->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, it->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! 
Otherwise we might exit the + * function without doing so, and try to replay a failed create + * (bug 3440) */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) || it->it_status != 0)) + mdc_clear_replay_flag(req, it->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d", + it->it_op, it->it_disposition, it->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it_has_reply_body(it)) { + struct mdt_body *body; + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR ("Can't swab mdt_body\n"); + RETURN (-EPROTO); + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. + */ + mdc_set_open_replay_data(NULL, NULL, it); + } + + if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + + /* save lvb data and length in case this is for layout + * lock */ + lvb_data = eadata; + lvb_len = body->mbo_eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + rc = mdc_save_lovea(req, &RMF_EADATA, eadata, + body->mbo_eadatasize); + if (rc) { + body->mbo_valid &= ~OBD_MD_FLEASIZE; + body->mbo_eadatasize = 0; + rc = 0; + } + } + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + RETURN(-EPROTO); + + /** + * save replied layout data to the request buffer for + * recovery consideration (lest MDS reinitialize + * another set of OST objects). + */ + if (req->rq_transno) + (void)mdc_save_lovea(req, &RMF_EADATA, lvb_data, + lvb_len); + } + } + + /* fill in stripe data for layout lock. + * LU-6581: trust layout data only if layout lock is granted. The MDT + * has stopped sending layout unless the layout lock is granted. The + * client still does this checking in case it's talking with an old + * server. 
- Jinshan */ + lock = ldlm_handle2lock(lockh); + if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL && + !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) { + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_type = LVB_T_LAYOUT; + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + if (lock != NULL) + LDLM_LOCK_PUT(lock); + + RETURN(rc); +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. */ +static int mdc_enqueue_base(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + __u64 extra_lock_flags) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request *req = NULL; + __u64 flags, saved_flags = extra_lock_flags; + struct ldlm_res_id res_id; + static const union ldlm_policy_data lookup_policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP } }; + static const union ldlm_policy_data update_policy = { + .l_inodebits = { MDS_INODELOCK_UPDATE } }; + static const union ldlm_policy_data layout_policy = { + .l_inodebits = { MDS_INODELOCK_LAYOUT } }; + static const union ldlm_policy_data getxattr_policy = { + .l_inodebits = { MDS_INODELOCK_XATTR } }; + int generation, resends = 0; + struct ldlm_reply *lockrep; + enum lvb_type lvb_type = 0; + int rc; + ENTRY; + + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it != NULL) { + LASSERT(policy == NULL); + + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) + policy = &getxattr_policy; + else + policy = &lookup_policy; + } + + generation = obddev->u.cli.cl_import->imp_generation; +resend: + flags = saved_flags; + if (it == NULL) { + /* The only way right now is FLOCK. 
*/ + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + res_id.name[3] = LDLM_FLOCK; + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data); + } else if (it->it_op & IT_UNLINK) { + req = mdc_intent_unlink_pack(exp, it, op_data); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(class_exp2cliimp(exp))) + RETURN(-EOPNOTSUPP); + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else if (it->it_op & IT_GETXATTR) { + req = mdc_intent_getxattr_pack(exp, it, op_data); + } else { + LBUG(); + RETURN(-EINVAL); + } + + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + + /* It is important to obtain modify RPC slot first (if applicable), so + * that threads that are waiting for a modify RPC slot are not polluting + * our rpcs in flight counter. + * We do not do flock request limiting, though */ + if (it) { + mdc_get_mod_rpc_slot(req, it); + rc = obd_get_request_slot(&obddev->u.cli); + if (rc != 0) { + mdc_put_mod_rpc_slot(req, it); + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + if (!it) { + /* For flock requests we immediatelly return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock + requests anyway. But in case of problem during comms with + Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we + can not rely on caller and this mainly for F_UNLCKs + (explicits or automatically generated by Kernel to clean + current FLocks upon exit) that can't be trashed */ + if (((rc == -EINTR) || (rc == -ETIMEDOUT)) && + (einfo->ei_type == LDLM_FLOCK) && + (einfo->ei_mode == LCK_NL)) + goto resend; + RETURN(rc); + } + + obd_put_request_slot(&obddev->u.cli); + mdc_put_mod_rpc_slot(req, it); + + if (rc < 0) { + CDEBUG(D_INFO, + "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n", + obddev->obd_name, PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name ?: "", rc); + + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + RETURN(rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + /* Retry infinitely when the server returns -EINPROGRESS for the + * intent operation, when server returns -EINPROGRESS for acquiring + * intent lock, we'll retry in after_reply(). 
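+ * Each retry bumps the resends counter, which is also used above to push
+ * rq_sent that many seconds into the future, so repeated resends back off
+ * until the import generation changes or a signal is pending.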
*/ + if (it && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + if (generation == obddev->u.cli.cl_import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obddev->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + it->it_request = NULL; + } + + RETURN(rc); +} + +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const union ldlm_policy_data *policy, + struct md_op_data *op_data, + struct lustre_handle *lockh, __u64 extra_lock_flags) +{ + return mdc_enqueue_base(exp, einfo, policy, NULL, + op_data, lockh, extra_lock_flags); +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (it->it_op & IT_READDIR) + RETURN(0); + + if (it->it_op & (IT_GETXATTR | IT_LAYOUT)) { + if (it->it_status != 0) + GOTO(out, rc = it->it_status); + } else { + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing + * the intent, i.e. because it couldn't unpack the + * request. + */ + LASSERT(it->it_status != 0); + GOTO(out, rc = it->it_status); + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + GOTO(out, rc); + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + GOTO(out, rc); + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the + * call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + /* balanced in ll_create_node */ + ptlrpc_request_addref(request); + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + /* balanced in ll_file_open */ + ptlrpc_request_addref(request); + /* BUG 11546 - eviction in the middle of open rpc + * processing + */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, + obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP)); + } + } + + /* If we already have a matching lock, then cancel the new + * one. We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget().) 
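+ * If ldlm_lock_match() below finds another granted lock for the same
+ * ibits policy, the freshly granted lock is cancelled and it_lock_handle
+ * is switched to the matched handle.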
*/ + lock = ldlm_handle2lock(lockh); + if (lock) { + union ldlm_policy_data policy = lock->l_policy_data; + LDLM_DEBUG(lock, "matching against this"); + + if (it_has_reply_body(it)) { + struct mdt_body *body; + + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + /* mdc_enqueue checked */ + LASSERT(body != NULL); + LASSERTF(fid_res_name_eq(&body->mbo_fid1, + &lock->l_resource->lr_name), + "Lock res_id: "DLDLMRES", fid: "DFID"\n", + PLDLMRES(lock->l_resource), + PFID(&body->mbo_fid1)); + } + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) { + ldlm_lock_decref_and_cancel(lockh, it->it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->it_lock_handle = lockh->cookie; + } + } + + EXIT; +out: + CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", + (int)op_data->op_namelen, op_data->op_name, + ldlm_it2str(it->it_op), it->it_status, + it->it_disposition, rc); + return rc; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + union ldlm_policy_data policy; + enum ldlm_mode mode; + ENTRY; + + if (it->it_lock_handle) { + lockh.cookie = it->it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + /* File attributes are held under multiple bits: + * nlink is under lookup lock, size and times are + * under UPDATE lock and recently we've also got + * a separate permissions lock for owner/group/acl that + * were protected by lookup lock before. + * Getattr must provide all of that information, + * so we need to ensure we have all of those locks. + * Unfortunately, if the bits are split across multiple + * locks, there's no easy way to match all of them here, + * so an extra RPC would be performed to fetch all + * of those bits at once for now. */ + /* For new MDTs(> 2.4), UPDATE|PERM should be enough, + * but for old MDTs (< 2.4), permission is covered + * by LOOKUP lock, so it needs to match all bits here.*/ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_PERM; + break; + case IT_READDIR: + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + + mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, + LDLM_IBITS, &policy, + LCK_CR | LCK_CW | LCK_PR | LCK_PW, + &lockh); + } + + if (mode) { + it->it_lock_handle = lockh.cookie; + it->it_lock_mode = mode; + } else { + it->it_lock_handle = 0; + it->it_lock_mode = 0; + } + + RETURN(!!mode); +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. + * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. 
+ * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what it_status refers to. + * + * If DISP_OPEN_OPEN is set, then it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) +{ + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(it), + .ei_cb_bl = cb_blocking, + .ei_cb_cp = ldlm_completion_ast, + }; + struct lustre_handle lockh; + int rc = 0; + ENTRY; + LASSERT(it); + + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#llo\n", (int)op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_READDIR))) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + it->it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + (from inode_revalidate) */ + if (rc || op_data->op_namelen != 0) + RETURN(rc); + } + + /* For case if upper layer did not alloc fid, do it now. 
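+ * Note that mdc_fid_alloc() may return 1 when switching to a new
+ * sequence, which is why only rc < 0 is treated as an error below.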
*/ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + RETURN(rc); + } + } + + rc = mdc_enqueue_base(exp, &einfo, NULL, it, op_data, &lockh, + extra_lock_flags); + if (rc < 0) + RETURN(rc); + + *reqp = it->it_request; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + RETURN(rc); +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = &minfo->mi_einfo; + struct lookup_intent *it; + struct lustre_handle *lockh; + struct obd_device *obddev; + struct ldlm_reply *lockrep; + __u64 flags = LDLM_FL_HAS_INTENT; + ENTRY; + + it = &minfo->mi_it; + lockh = &minfo->mi_lockh; + + obddev = class_exp2obd(exp); + + obd_put_request_slot(&obddev->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, + &flags, NULL, 0, lockh, rc); + if (rc < 0) { + CERROR("ldlm_cli_enqueue_fini: %d\n", rc); + mdc_clear_replay_flag(req, rc); + GOTO(out, rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + lockrep->lock_policy_res2 = + ptlrpc_status_ntoh(lockrep->lock_policy_res2); + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + GOTO(out, rc); + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + EXIT; + +out: + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_res_id res_id; + union ldlm_policy_data policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE } }; + int rc = 0; + __u64 flags = LDLM_FL_HAS_INTENT; + ENTRY; + + CDEBUG(D_DLMTRACE, "name: %.*s in inode "DFID", intent: %s flags %#llo\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + req = mdc_intent_getattr_pack(exp, it, op_data); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + rc = obd_get_request_slot(&obddev->u.cli); + if (rc != 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, + &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + obd_put_request_slot(&obddev->u.cli); + ptlrpc_req_finished(req); + RETURN(rc); + } + + CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args)); + ga = ptlrpc_req_async_args(req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c new file mode 100644 index 0000000000000..75ed568153305 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -0,0 +1,434 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include +#include "mdc_internal.h" +#include + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, int level) +{ + int rc; + + request->rq_send_state = level; + + mdc_get_mod_rpc_slot(request, NULL); + rc = ptlrpc_queue_wait(request); + mdc_put_mod_rpc_slot(request, NULL); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) { + rc = -EPROTO; + } + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, + struct list_head *cancels, enum ldlm_mode mode, + __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + union ldlm_policy_data policy = { {0} }; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + fid_build_reg_res_name(fid, &res_id); + res = ldlm_resource_get(exp->exp_obd->obd_namespace, + NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. 
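+ * Only local locks on this resource that match the requested @bits and
+ * @mode are gathered into the @cancels list for early lock cancellation.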
*/ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, + mode, 0, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, size_t ealen, struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + __u64 bits; + ENTRY; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %ld, ctime %ld\n", + LTIME_S(op_data->op_attr.ia_mtime), + LTIME_S(op_data->op_attr.ia_ctime)); + mdc_setattr_pack(req, op_data, ea, ealen); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + + *request = req; + + RETURN(rc); +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, size_t datalen, + umode_t mode, uid_t uid, gid_t gid, + cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + struct list_head cancels = LIST_HEAD_INIT(cancels); + ENTRY; + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) + RETURN(rc); + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? datalen : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, op_data->op_file_secctx_name != NULL ? 
+ strlen(op_data->op_file_secctx_name) + 1 : 0); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, + op_data->op_file_secctx_size); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(req, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = ktime_get_real_seconds() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code or interrupted. */ + ptlrpc_req_finished(req); + if (generation == import->imp_generation) { + if (signal_pending(current)) + RETURN(-EINTR); + + resends++; + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2)); + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + *request = req; + RETURN(rc); +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + ENTRY; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + RETURN(rc); +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if 
(req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_link_pack(req, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + op_data->op_cli_flags & CLI_MIGRATE ? + &RQF_MDS_REINT_MIGRATE : &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (op_data->op_cli_flags & CLI_MIGRATE && op_data->op_data != NULL) { + struct md_open_data *mod = op_data->op_data; + + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c new file mode 100644 index 0000000000000..681e5bd94a6c9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -0,0 +1,2647 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +static int mdc_cleanup(struct obd_device *obd); + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* obd_get_request_slot() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously inf light + * against an MDT. */ + rc = obd_get_request_slot(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(cli); + + return rc; +} + +/* + * Send MDS_GET_ROOT RPC to fetch root FID. + * + * If \a fileset is not NULL it should contain a subdirectory off + * the ROOT/ directory to be mounted on the client. Return the FID + * of the subdirectory to the client to mount onto its mountpoint. 
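+ *
+ * The fileset name is only packed when the server advertises
+ * OBD_CONNECT_SUBTREE; otherwise -ENOTSUPP is returned.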
+ * + * \param[in] imp MDC import + * \param[in] fileset fileset name, which could be NULL + * \param[out] rootfid root FID of this mountpoint + * \param[out] pc root capa will be unpacked and saved in this pointer + * + * \retval 0 on success, negative errno on failure + */ +static int mdc_get_root(struct obd_export *exp, const char *fileset, + struct lu_fid *rootfid) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + ENTRY; + + if (fileset && !(exp_connect_flags(exp) & OBD_CONNECT_SUBTREE)) + RETURN(-ENOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GET_ROOT); + if (req == NULL) + RETURN(-ENOMEM); + + if (fileset != NULL) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(fileset) + 1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_ROOT); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + mdc_pack_body(req, NULL, 0, 0, -1, 0); + if (fileset != NULL) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + + memcpy(name, fileset, strlen(fileset)); + } + lustre_msg_add_flags(req->rq_reqmsg, LUSTRE_IMP_FULL); + req->rq_send_state = LUSTRE_IMP_FULL; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *rootfid = body->mbo_fid1; + CDEBUG(D_NET, "root fid="DFID", last_committed=%llu\n", + PFID(rootfid), lustre_msg_get_last_committed(req->rq_repmsg)); + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +/* + * This function now is known to always saying that it will receive 4 buffers + * from server. Even for cases when acl_size and md_size is zero, RPC header + * will contain 4 fields and RPC itself will contain zero size fields. This is + * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed + * and thus zero, it shrinks it, making zero size. The same story about + * md_size. And this is course of problem when client waits for smaller number + * of fields. This issue will be fixed later when client gets aware of RPC + * layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + ENTRY; + + /* Request message already built. 
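+ * This helper only sends it, waits for the reply, and sanity-checks the
+ * reply body plus any EA payload that came back.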
*/ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + RETURN(rc); + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + CDEBUG(D_NET, "mode: %o\n", body->mbo_mode); + + mdc_update_max_ea_from_body(exp, body); + if (body->mbo_eadatasize != 0) { + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + } + + RETURN(0); +} + +static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + RETURN(0); + } + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GETATTR_NAME); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, + const struct lu_fid *fid, int opcode, u64 valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + RETURN(-ENOMEM); + + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) { + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + } + + /* Flush local XATTR locks to get rid of a possible cancel RPC */ + if (opcode == MDS_REINT && fid_is_sane(fid) && + 
exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count; + + /* Without that packing would fail */ + if (input_size == 0) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_CLIENT, 0); + + count = mdc_resource_get_unused(exp, fid, + &cancels, LCK_EX, + MDS_INODELOCK_XATTR); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } else { + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sx_cap = cfs_curproc_cap_pack(); + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = ktime_get_real_seconds(); + rec->sx_size = output_size; + rec->sx_flags = flags; + } else { + mdc_pack_body(req, fid, valid, output_size, suppgid, flags); + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + mdc_get_mod_rpc_slot(req, NULL); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + mdc_put_mod_rpc_slot(req, NULL); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, MDS_REINT, valid, xattr_name, + input, input_size, output_size, flags, + suppgid, request); +} + +static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, + fid, MDS_GETXATTR, valid, xattr_name, + input, input_size, output_size, flags, + -1, request); +} + +#ifdef CONFIG_FS_POSIX_ACL +static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body = md->body; + struct posix_acl *acl; + void *buf; + int rc; + ENTRY; + + if (!body->mbo_aclsize) + RETURN(0); + + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->mbo_aclsize); + + if (!buf) + RETURN(-EPROTO); + + acl = posix_acl_from_xattr(&init_user_ns, buf, body->mbo_aclsize); + if (acl == NULL) + RETURN(0); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + CERROR("convert xattr to acl: %d\n", rc); + RETURN(rc); + } + + rc = posix_acl_valid(&init_user_ns, acl); + if (rc) { + CERROR("validate acl: %d\n", rc); + posix_acl_release(acl); + RETURN(rc); + } + + md->posix_acl = acl; + RETURN(0); +} +#else +#define 
mdc_unpack_acl(req, md) 0 +#endif + +int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + int rc; + ENTRY; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->mbo_valid & OBD_MD_FLEASIZE) { + if (!S_ISREG(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a " + "regular file, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md->body->mbo_eadatasize == 0) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, " + "but eadatasize 0\n"); + GOTO(out, rc = -EPROTO); + } + + md->layout.lb_len = md->body->mbo_eadatasize; + md->layout.lb_buf = req_capsule_server_sized_get(pill, + &RMF_MDT_MD, + md->layout.lb_len); + if (md->layout.lb_buf == NULL) + GOTO(out, rc = -EPROTO); + } else if (md->body->mbo_valid & OBD_MD_FLDIREA) { + const union lmv_mds_md *lmv; + size_t lmv_size; + + if (!S_ISDIR(md->body->mbo_mode)) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a " + "directory, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + lmv_size = md->body->mbo_eadatasize; + if (lmv_size == 0) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " + "but eadatasize 0\n"); + RETURN(-EPROTO); + } + + if (md->body->mbo_valid & OBD_MD_MEA) { + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmv_size); + if (lmv == NULL) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->lmv, lmv, lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (typeof(rc))sizeof(*md->lmv)) { + CDEBUG(D_INFO, "size too small: " + "rc < sizeof(*md->lmv) (%d < %d)\n", + rc, (int)sizeof(*md->lmv)); + GOTO(out, rc = -EPROTO); + } + } + } + rc = 0; + + if (md->body->mbo_valid & OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. 
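+ * With a zero aclsize the posix_acl pointer is simply left NULL.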
+ */ + if (md->body->mbo_aclsize) { + rc = mdc_unpack_acl(req, md); + if (rc) + GOTO(out, rc); +#ifdef CONFIG_FS_POSIX_ACL + } else { + md->posix_acl = NULL; +#endif + } + } + + EXIT; +out: + if (rc) { +#ifdef CONFIG_FS_POSIX_ACL + posix_acl_release(md->posix_acl); +#endif + } + return rc; +} + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + ENTRY; + RETURN(0); +} + +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old; + struct mdt_body *body; + ENTRY; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + EXIT; + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + spin_lock(&req->rq_lock); + och = mod->mod_och; + if (och && och->och_fh.cookie) + req->rq_early_free_repbuf = 1; + else + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_early_free_repbuf) { + struct lustre_handle *file_fh; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_fh = &och->och_fh; + CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", + file_fh->cookie, body->mbo_handle.cookie); + old = *file_fh; + *file_fh = body->mbo_handle; + } + + close_req = mod->mod_close_req; + if (close_req) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (req->rq_early_free_repbuf) + LASSERT(!memcmp(&old, &epoch->mio_handle, sizeof(old))); + + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->mio_handle = body->mbo_handle; + } + EXIT; +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. + */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct lookup_intent *it) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct ptlrpc_request *open_req = it->it_request; + struct obd_import *imp = open_req->rq_import; + ENTRY; + + if (!open_req->rq_replay) + RETURN(0); + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. 
*/ + LASSERT(body != NULL); + + /* Only if the import is replayable, we set replay_open data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "Can't allocate md_open_data"); + RETURN(0); + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || + it_disposition(it, DISP_OPEN_STRIPE); + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + open_req->rq_early_free_repbuf = 1; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->mbo_fid1; + rec->cr_ioepoch = body->mbo_ioepoch; + rec->cr_old_handle.cookie = body->mbo_handle.cookie; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->mbo_fid1)) { + DEBUG_REQ(D_ERROR, open_req, "Saving replay request with " + "insane fid"); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); +} + +static void mdc_free_open(struct md_open_data *mod) +{ + int committed = 0; + + if (mod->mod_is_create == 0 && + imp_connect_disp_stripe(mod->mod_open_req->rq_import)) + committed = 1; + + /** + * No reason to asssert here if the open request has + * rq_replay == 1. It means that mdc_close failed, and + * close request wasn`t sent. It is not fatal to client. + * The worst thing is eviction if the client gets open lock + **/ + + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request rq_replay" + "= %d\n", mod->mod_open_req->rq_replay); + + ptlrpc_request_committed(mod->mod_open_req, committed); + if (mod->mod_close_req) + ptlrpc_request_committed(mod->mod_close_req, committed); +} + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + ENTRY; + + /** + * It is possible to not have \var mod in a case of eviction between + * lookup and ll_file_open(). 
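+ * In that case there is nothing to clear and 0 is returned.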
+ **/ + if (mod == NULL) + RETURN(0); + + LASSERT(mod != LP_POISON); + LASSERT(mod->mod_open_req != NULL); + + spin_lock(&mod->mod_open_req->rq_lock); + if (mod->mod_och) + mod->mod_och->och_fh.cookie = 0; + mod->mod_open_req->rq_early_free_repbuf = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + mdc_free_open(mod); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + RETURN(0); +} + +static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct req_format *req_fmt; + int rc; + int saved_rc = 0; + ENTRY; + + if (op_data->op_bias & MDS_HSM_RELEASE) { + req_fmt = &RQF_MDS_INTENT_CLOSE; + + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("%s: "DFID" failed to allocate FID: %d\n", + obd->obd_name, PFID(&op_data->op_fid1), rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) { + req_fmt = &RQF_MDS_INTENT_CLOSE; + } else { + req_fmt = &RQF_MDS_CLOSE; + } + + *request = NULL; + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE)) + req = NULL; + else + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); + } + if (req == NULL) { + /** + * TODO: repeat close after errors + */ + CWARN("%s: close of FID "DFID" failed, file reference will be " + "dropped when this client unmounts or is evicted\n", + obd->obd_name, PFID(&op_data->op_fid1)); + GOTO(out, rc = -ENOMEM); + } + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + + mdc_close_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err " + "= %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server 
failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + +out: + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + + RETURN(rc < 0 ? rc : saved_rc); +} + +static int mdc_getpage(struct obd_export *exp, const struct lu_fid *fid, + u64 offset, struct page **pages, int npages, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + wait_queue_head_t waitq; + int resends = 0; + struct l_wait_info lwi; + int rc; + ENTRY; + + *request = NULL; + init_waitqueue_head(&waitq); + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, npages, 1, + PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + mdc_readdir_pack(req, offset, PAGE_SIZE * npages, fid); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + RETURN(rc); + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("%s: too many resend retries: rc = %d\n", + exp->exp_obd->obd_name, -EIO); + RETURN(-EIO); + } + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, + NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("%s: unexpected bytes transferred: %d (%ld expected)\n", + exp->exp_obd->obd_name, req->rq_bulk->bd_nob_transferred, + PAGE_SIZE * npages); + ptlrpc_req_finished(req); + RETURN(-EPROTO); + } + + *request = req; + RETURN(0); +} + +static void mdc_release_page(struct page *page, int remove) +{ + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + put_page(page); +} + +static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, + __u64 *start, __u64 *end, int hash64) +{ + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. 
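+ * A page found this way is only kept if its [ldp_hash_start, ldp_hash_end]
+ * range really covers the wanted hash; otherwise it is released and
+ * mdc_read_page_remote() will fetch the right page from the MDS.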
+ */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + int found; + + spin_lock_irq(&mapping->tree_lock); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !radix_tree_exceptional_entry(page)) { + struct lu_dirpage *dp; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * mdc_read_page_remote does synchronous io. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + if (unlikely(*start == 1 && *hash == 0)) + *hash = *start; + else + LASSERTF(*start <= *hash, "start = %#llx" + ",end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx]," + " hash %#llx\n", offset, *start, *end, *hash); + if (*hash > *end) { + kunmap(page); + mdc_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * mdc_read_page_remote() will issue RPC to + * fetch the page we want. + */ + kunmap(page); + mdc_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + put_page(page); + page = ERR_PTR(-EIO); + } + } else { + spin_unlock_irq(&mapping->tree_lock); + page = NULL; + } + return page; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. + * | next PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next PAGE'. 
+ * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. + */ +#if PAGE_SIZE > LU_PAGE_SIZE +static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) +{ + int i; + + for (i = 0; i < cfs_pgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = le64_to_cpu(dp->ldp_hash_end); + __u32 flags = le32_to_cpu(dp->ldp_flags); + + while (--lu_pgs > 0) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent != NULL; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the PAGE. */ + if (!((unsigned long)dp & ~PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = le64_to_cpu(dp->ldp_hash_end); + flags = le32_to_cpu(dp->ldp_flags); + + /* Check if lu_dirpage contains no entries. */ + if (end_dirent == NULL) + break; + + /* Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. */ + LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } + LASSERTF(lu_pgs == 0, "left = %d\n", lu_pgs); +} +#else +#define mdc_adjust_dirpages(pages, cfs_pgs, lu_pgs) do {} while (0) +#endif /* PAGE_SIZE > LU_PAGE_SIZE */ + +/* parameters for readdir page */ +struct readpage_param { + struct md_op_data *rp_mod; + __u64 rp_off; + int rp_hash64; + struct obd_export *rp_exp; + struct md_callback *rp_cb; +}; + +#ifndef HAVE_DELETE_FROM_PAGE_CACHE +static inline void delete_from_page_cache(struct page *page) +{ + remove_from_page_cache(page); + put_page(page); +} +#endif + +/** + * Read pages from server. + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. 
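+ *
+ * \param[in] data	opaque pointer to the struct readpage_param built by
+ *			mdc_read_page()
+ * \param[in] page0	first page of the batch; it has already been inserted
+ *			into the page cache by the caller, so on failure it is
+ *			removed from the cache again, and it is unlocked before
+ *			returning in either case
+ *
+ * \retval 0		on success
+ * \retval negative	errno propagated from mdc_getpage() on failure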
+ **/ +static int mdc_read_page_remote(void *data, struct page *page0) +{ + struct readpage_param *rp = data; + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + struct md_op_data *op_data = rp->rp_mod; + struct ptlrpc_request *req; + int max_pages; + struct inode *inode; + struct lu_fid *fid; + int rd_pgs = 0; /* number of pages actually read */ + int npages; + int i; + int rc; + ENTRY; + + max_pages = rp->rp_exp->exp_obd->u.cli.cl_max_pages_per_rpc; + inode = op_data->op_data; + fid = &op_data->op_fid1; + LASSERT(inode != NULL); + + OBD_ALLOC(page_pool, sizeof(page_pool[0]) * max_pages); + if (page_pool != NULL) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + + for (npages = 1; npages < max_pages; npages++) { + page = __page_cache_alloc(mapping_gfp_mask(inode->i_mapping) + | __GFP_COLD); + if (page == NULL) + break; + page_pool[npages] = page; + } + + rc = mdc_getpage(rp->rp_exp, fid, rp->rp_off, page_pool, npages, &req); + if (rc < 0) { + /* page0 is special, which was added into page cache early */ + delete_from_page_cache(page0); + } else { + int lu_pgs; + + rd_pgs = (req->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1) >> + PAGE_SHIFT; + lu_pgs = req->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; + LASSERT(!(req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + + CDEBUG(D_INODE, "read %d(%d) pages\n", rd_pgs, lu_pgs); + + mdc_adjust_dirpages(page_pool, rd_pgs, lu_pgs); + + SetPageUptodate(page0); + } + unlock_page(page0); + + ptlrpc_req_finished(req); + CDEBUG(D_CACHE, "read %d/%d pages\n", rd_pgs, npages); + for (i = 1; i < npages; i++) { + unsigned long offset; + __u64 hash; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= rd_pgs) { + put_page(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, rp->rp_hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (ret == 0) + unlock_page(page); + else + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:" + " rc = %d\n", offset, ret); + put_page(page); + } + + if (page_pool != &page0) + OBD_FREE(page_pool, sizeof(page_pool[0]) * max_pages); + + RETURN(rc); +} + +/** + * Read dir page from cache first, if it can not find it, read it from + * server and add into the cache. + * + * \param[in] exp MDC export + * \param[in] op_data client MD stack parameters, transfering parameters + * between different layers on client MD stack. 
+ * \param[in] cb_op callback required for ldlm lock enqueue during + * read page + * \param[in] hash_offset the hash offset of the page to be read + * \param[in] ppage the page to be read + * + * retval = 0 get the page successfully + * errno(<0) get the page failed + */ +static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_callback *cb_op, __u64 hash_offset, + struct page **ppage) +{ + struct lookup_intent it = { .it_op = IT_READDIR }; + struct page *page; + struct inode *dir = op_data->op_data; + struct address_space *mapping; + struct lu_dirpage *dp; + __u64 start = 0; + __u64 end = 0; + struct lustre_handle lockh; + struct ptlrpc_request *enq_req = NULL; + struct readpage_param rp_param; + int rc; + + ENTRY; + + *ppage = NULL; + + LASSERT(dir != NULL); + mapping = dir->i_mapping; + + rc = mdc_intent_lock(exp, op_data, &it, &enq_req, + cb_op->md_blocking_ast, 0); + if (enq_req != NULL) + ptlrpc_req_finished(enq_req); + + if (rc < 0) { + CERROR("%s: "DFID" lock enqueue fails: rc = %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), rc); + RETURN(rc); + } + + rc = 0; + lockh.cookie = it.it_lock_handle; + mdc_set_lock_data(exp, &lockh, dir, NULL); + + rp_param.rp_off = hash_offset; + rp_param.rp_hash64 = op_data->op_cli_flags & CLI_HASH64; + page = mdc_page_locate(mapping, &rp_param.rp_off, &start, &end, + rp_param.rp_hash64); + if (IS_ERR(page)) { + CERROR("%s: dir page locate: "DFID" at %llu: rc %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + GOTO(out_unlock, rc = PTR_ERR(page)); + } else if (page != NULL) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. 
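+		 *
+		 * As the code stands, the cached page is simply handed to
+		 * the hash_collision label below: a page-wide collision
+		 * (ldp_hash_start == ldp_hash_end) is failed with -EIO,
+		 * since fetching the whole overflow chain is not implemented,
+		 * and any other cached page is returned as is.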
+ */ + GOTO(hash_collision, page); + } + + rp_param.rp_exp = exp; + rp_param.rp_mod = op_data; + page = read_cache_page(mapping, + hash_x_index(rp_param.rp_off, + rp_param.rp_hash64), + mdc_read_page_remote, &rp_param); + if (IS_ERR(page)) { + CDEBUG(D_INFO, "%s: read cache page: "DFID" at %llu: %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + GOTO(out_unlock, rc = PTR_ERR(page)); + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("%s: page not updated: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + if (!PageChecked(page)) + SetPageChecked(page); + if (PageError(page)) { + CERROR("%s: page error: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && rp_param.rp_hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + rp_param.rp_off = hash_offset >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + rp_param.rp_off = hash_offset; + } + if (end == start) { + LASSERT(start == rp_param.rp_off); + CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); +#if BITS_PER_LONG == 32 + CWARN("Real page-wide hash collision at [%llu %llu] with " + "hash %llu\n", le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash_offset); +#endif + + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } + *ppage = page; +out_unlock: + ldlm_lock_decref(&lockh, it.it_lock_mode); + return rc; +fail: + kunmap(page); + mdc_release_page(page, 1); + rc = -EIO; + goto out_unlock; +} + + +static int mdc_statfs(const struct lu_env *env, + struct obd_export *exp, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + ENTRY; + + /* + * Since the request might also come from lprocfs, so we need + * sync this with client_disconnect_export Bug15684 + */ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); + + ptlrpc_request_set_replen(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stay in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* check connection error first */ + if (imp->imp_connect_error) + rc = imp->imp_connect_error; + GOTO(out, rc); + } + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *osfs = *msfs; + EXIT; +out: + ptlrpc_req_finished(req); +output: + class_import_put(imp); + return rc; +} + +static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) +{ + __u32 keylen, vallen; + void *key; + int rc; + + if (gf->gf_pathlen > PATH_MAX) + RETURN(-ENAMETOOLONG); + if (gf->gf_pathlen < 2) + RETURN(-EOVERFLOW); + + /* Key is KEY_FID2PATH + getinfo_fid2path description */ + keylen = cfs_size_round(sizeof(KEY_FID2PATH) + sizeof(*gf) + + sizeof(struct lu_fid)); + OBD_ALLOC(key, keylen); + 
if (key == NULL) + RETURN(-ENOMEM); + memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf), + gf->gf_u.gf_root_fid, sizeof(struct lu_fid)); + CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); + + if (!fid_is_sane(&gf->gf_fid)) + GOTO(out, rc = -EINVAL); + + /* Val is struct getinfo_fid2path result plus path */ + vallen = sizeof(*gf) + gf->gf_pathlen; + + rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf); + if (rc != 0 && rc != -EREMOTE) + GOTO(out, rc); + + if (vallen <= sizeof(*gf)) + GOTO(out, rc = -EPROTO); + if (vallen > sizeof(*gf) + gf->gf_pathlen) + GOTO(out, rc = -EOVERFLOW); + + CDEBUG(D_IOCTL, "path got "DFID" from %llu #%d: %s\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, + gf->gf_pathlen < 512 ? gf->gf_u.gf_path : + /* only log the last 512 characters of the path */ + gf->gf_u.gf_path + gf->gf_pathlen - 512); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) + GOTO(out, rc = -EPROTO); + + *req_hpk = *hpk; + req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) +{ + __u32 *archive_mask; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_REGISTER); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + /* Copy hsm_progress struct */ + archive_mask = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_mask == NULL) + GOTO(out, rc = -EPROTO); + + *archive_mask = archives; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) + GOTO(out, rc = -EPROTO); + + *hca = *req_hca; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int 
mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_UNREGISTER); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) + GOTO(out, rc = -EPROTO); + + *hus = *req_hus; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, 0, 0, + op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) + GOTO(out, rc = -EPROTO); + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) + GOTO(out, rc = -EPROTO); + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, 
&RMF_GENERIC_DATA); + if (req_opaque == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + mdc_get_mod_rpc_slot(req, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_mod_rpc_slot(req, NULL); + + GOTO(out, rc); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + ENTRY; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks held by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips. + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_EX, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_swap_layouts_pack(req, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + /* ignore if it was already registered on this MDS. 
*/ + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + GOTO(out, rc); + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + GOTO(out, rc); + GOTO(out, rc = 0); + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + GOTO(out, rc); + case OBD_IOC_PING_TARGET: + rc = ptlrpc_obd_ping(obd); + GOTO(out, rc); + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. + */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) + GOTO(out, rc = -ENODEV); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min((int)data->ioc_plen2, + (int)sizeof(struct obd_uuid)))) + GOTO(out, rc = -EFAULT); + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc != 0) + GOTO(out, rc); + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + GOTO(out, rc = -ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + + OBD_FREE_PTR(oqctl); + GOTO(out, rc); + } + case LL_IOC_GET_CONNECT_FLAGS: + if (copy_to_user(uarg, exp_connect_flags_ptr(exp), + sizeof(*exp_connect_flags_ptr(exp)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + case LL_IOC_LOV_SWAP_LAYOUTS: + rc = mdc_ioc_swap_layouts(exp, karg); + GOTO(out, rc); + default: + CERROR("unrecognised ioctl: cmd = %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } +out: + module_put(THIS_MODULE); + + return rc; +} + +static int mdc_get_info_rpc(struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(vallen)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(vallen)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path 
part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (ptlrpc_rep_need_swab(req)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + __u32 i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_first(h); + for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) + lustre_swab_hai(hai); +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport */ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + __u32 archive = lk->lk_data; + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + /* Unregister with the coordinator */ + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + rc = mdc_ioc_hsm_ct_register(imp, archive); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, + size_t len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + ENTRY; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %zu < %zu\n", len, + sizeof(*lh) + sizeof(*hal)); + RETURN(-EPROTO); + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + RETURN(-EPROTO); + } + + CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d " + "on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(uuid, KUC_GRP_HSM, lh); + + RETURN(rc); +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. + * @param data copytool registration data + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(void *data, void *cb_arg) +{ + struct kkuc_ct_data *kcd = data; + struct obd_import *imp = (struct obd_import *)cb_arg; + int rc; + + if (kcd == NULL || kcd->kcd_magic != KKUC_CT_DATA_MAGIC) + return -EPROTO; + + CDEBUG(D_HA, "%s: recover copytool registration to MDT (archive=%#x)\n", + imp->imp_obd->obd_name, kcd->kcd_archive); + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_archive); + + /* ignore error if the copytool is already registered */ + return (rc == -EEXIST) ? 
0 : rc; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. + */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(&imp->imp_obd->obd_uuid, KUC_GRP_HSM, + mdc_hsm_ct_reregister, imp); +} + +static int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + ENTRY; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(exp->exp_obd); + RETURN(0); + } + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(&imp->imp_obd->obd_uuid, vallen, + val); + RETURN(rc); + } + + if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize = val; + + exp->exp_obd->u.cli.cl_default_mds_easize = *default_easize; + RETURN(0); + } + + CERROR("Unknown key %s\n", (char *)key); + RETURN(-EINVAL); +} + +static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + __u32 mdsize, *max_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + mdsize = *(__u32 *)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { + __u32 *default_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + default_easize = val; + *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + RETURN(-EINVAL); + + *data = imp->imp_connect_data; + RETURN(0); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((__u32 *)val) = 1; + RETURN(0); + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + RETURN(rc); +} + +static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, fid, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) 
+{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + + case IMP_EVENT_INACTIVE: { + struct client_obd *cli = &obd->u.cli; + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + seq_client_flush(cli->cl_seq); + up_read(&cli->cl_seq_rwsem); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + break; + } + case IMP_EVENT_ACTIVE: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + /* redo the kuc registration after reconnecting */ + if (rc == 0) + rc = mdc_kuc_reregister(imp); + break; + case IMP_EVENT_OCD: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + break; + case IMP_EVENT_DISCON: + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %x\n", event); + LBUG(); + } + RETURN(rc); +} + +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc = -EIO; + + ENTRY; + + down_read(&cli->cl_seq_rwsem); + if (cli->cl_seq) + rc = seq_client_alloc_fid(env, cli->cl_seq, fid); + up_read(&cli->cl_seq_rwsem); + + RETURN(rc); +} + +static struct obd_uuid *mdc_get_uuid(struct obd_export *exp) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + return &cli->cl_target_uuid; +} + +/** + * Determine whether the lock can be canceled before replaying it during + * recovery, non zero value will be return if the lock can be canceled, + * or zero returned for not + */ +static int mdc_cancel_weight(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + RETURN(0); + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to reget it */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + RETURN(0); + + RETURN(1); +} + +static int mdc_resource_inode_free(struct ldlm_resource *res) +{ + if (res->lr_lvb_inode) + res->lr_lvb_inode = NULL; + + return 0; +} + +static struct ldlm_valblock_ops inode_lvbo = { + .lvbo_free = mdc_resource_inode_free +}; + +static int mdc_llog_init(struct obd_device *obd) +{ + struct obd_llog_group *olg = &obd->obd_olg; + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd, + &llog_client_ops); + if (rc < 0) + RETURN(rc); + + ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static void mdc_llog_finish(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt != NULL) + llog_cleanup(NULL, ctxt); + + EXIT; +} + +static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc < 0) + RETURN(rc); + + rc = client_obd_setup(obd, cfg); + if (rc) + GOTO(err_ptlrpcd_decref, rc); +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_mdc_obd_vars; + lprocfs_obd_setup(obd); + lprocfs_alloc_md_stats(obd, 0); +#endif + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + + ns_register_cancel(obd->obd_namespace, mdc_cancel_weight); + + obd->obd_namespace->ns_lvbo = &inode_lvbo; + + rc = 
mdc_llog_init(obd); + if (rc) { + CERROR("%s: failed to setup llogging subsystems: rc = %d\n", + obd->obd_name, rc); + GOTO(err_mdc_cleanup, rc); + } + + rc = mdc_changelog_cdev_init(obd); + if (rc) { + CERROR("%s: failed to setup changelog char device: rc = %d\n", + obd->obd_name, rc); + GOTO(err_mdc_cleanup, rc); + } + + EXIT; +err_mdc_cleanup: + if (rc) + client_obd_cleanup(obd); + +err_ptlrpcd_decref: + if (rc) + ptlrpcd_decref(); + return rc; +} + +/* Initialize the default and maximum LOV EA sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold a default + * sized EA without having to calculate this (via a call into the + * LOV + OSCs) each time we make an RPC. The maximum size is also tracked + * but not used to avoid wastefully vmalloc()'ing large reply buffers when + * a large number of stripes is possible. If a larger reply buffer is + * required it will be reallocated in the ptlrpc layer due to overflow. + */ +static int mdc_init_ea_size(struct obd_export *exp, __u32 easize, + __u32 def_easize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + ENTRY; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + RETURN(0); +} + +static int mdc_precleanup(struct obd_device *obd) +{ + ENTRY; + + mdc_changelog_cdev_finish(obd); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_free_md_stats(obd); + mdc_llog_finish(obd); + RETURN(0); +} + +static int mdc_cleanup(struct obd_device *obd) +{ + ptlrpcd_decref(); + + return client_obd_cleanup(obd); +} + +static int mdc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd); + return (rc > 0 ? 
0: rc); +} + +static struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_process_config = mdc_process_config, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, +}; + +static struct md_ops mdc_md_ops = { + .m_get_root = mdc_get_root, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_fsync = mdc_fsync, + .m_read_page = mdc_read_page, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock +}; + +static int __init mdc_init(void) +{ + return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, + LUSTRE_MDC_NAME, NULL); +} + +static void __exit mdc_exit(void) +{ + class_unregister_type(LUSTRE_MDC_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c new file mode 100644 index 0000000000000..ab1985d9d9d24 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -0,0 +1,77 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include "mgc_internal.h" + +#ifdef CONFIG_PROC_FS + +LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, import); +LPROC_SEQ_FOPS_RO_TYPE(mgc, state); + +LPROC_SEQ_FOPS_WO_TYPE(mgc, ping); + +static int mgc_ir_state_seq_show(struct seq_file *m, void *v) +{ + return lprocfs_mgc_rd_ir_state(m, m->private); +} +LPROC_SEQ_FOPS_RO(mgc_ir_state); + +struct lprocfs_vars lprocfs_mgc_obd_vars[] = { + { .name = "uuid", + .fops = &mgc_uuid_fops }, + { .name = "ping", + .fops = &mgc_ping_fops, + .proc_mode = 0222 }, + { .name = "connect_flags", + .fops = &mgc_connect_flags_fops }, + { .name = "mgs_server_uuid", + .fops = &mgc_server_uuid_fops }, + { .name = "mgs_conn_uuid", + .fops = &mgc_conn_uuid_fops }, + { .name = "import", + .fops = &mgc_import_fops }, + { .name = "state", + .fops = &mgc_state_fops }, + { .name = "ir_state", + .fops = &mgc_ir_state_fops }, + { NULL } +}; +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h new file mode 100644 index 0000000000000..1a37720e901eb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_mgc_obd_vars[]; +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); +#endif /* CONFIG_PROC_FS */ + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +static inline int cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_SPTLRPC; +} + +static inline int cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_RECOVER; +} + +static inline int cld_is_nodemap(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_NODEMAP; +} + +static inline int cld_is_barrier(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_BARRIER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c new file mode 100644 index 0000000000000..151283328e485 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -0,0 +1,2310 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/mgc/mgc_request.c + * + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mgc_internal.h" + +static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, + int type) +{ + __u64 resname = 0; + + if (len > sizeof(resname)) { + CERROR("name too long: %s\n", name); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing name: %s\n", name); + return -EINVAL; + } + memcpy(&resname, name, len); + + /* Always use the same endianness for the resid */ + memset(res_id, 0, sizeof(*res_id)); + res_id->name[0] = cpu_to_le64(resname); + /* XXX: unfortunately, sptlprc and config llog share one lock */ + switch(type) { + case CONFIG_T_CONFIG: + case CONFIG_T_SPTLRPC: + resname = 0; + break; + case CONFIG_T_RECOVER: + case CONFIG_T_PARAMS: + case CONFIG_T_NODEMAP: + case CONFIG_T_BARRIER: + resname = type; + break; + default: + LBUG(); + } + res_id->name[1] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) +{ + /* fsname is at most 8 chars long, maybe contain "-". + * e.g. "lustre", "SUN-000" */ + return mgc_name2resid(fsname, strlen(fsname), res_id, type); +} +EXPORT_SYMBOL(mgc_fsname2resid); + +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type) +{ + char *name_end; + int len; + + /* logname consists of "fsname-nodetype". + * e.g. "lustre-MDT0001", "SUN-000-client" + * there is an exception: llog "params" */ + name_end = strrchr(logname, '-'); + if (!name_end) + len = strlen(logname); + else + len = name_end - logname; + return mgc_name2resid(logname, len, res_id, type); +} +EXPORT_SYMBOL(mgc_logname2resid); + +/********************** config llog list **********************/ +static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list); +static DEFINE_SPINLOCK(config_list_lock); + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + ENTRY; + atomic_inc(&cld->cld_refcount); + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + RETURN(0); +} + +/* Drop a reference to a config log. 
When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + ENTRY; + + if (unlikely(!cld)) + RETURN_EXIT; + + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + config_log_put(cld->cld_barrier); + config_log_put(cld->cld_recover); + config_log_put(cld->cld_params); + config_log_put(cld->cld_nodemap); + config_log_put(cld->cld_sptlrpc); + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_stop(cld->cld_logname); + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } + + EXIT; +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + void * instance; + ENTRY; + + LASSERT(logname != NULL); + + instance = cfg ? cfg->cfg_instance : NULL; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if instance equals */ + if (instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + config_log_get(found); + break; + } + } + spin_unlock(&config_list_lock); + RETURN(found); +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + int type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + + ENTRY; + + CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, + cfg ? 
cfg->cfg_instance : NULL); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + RETURN(ERR_PTR(-ENOMEM)); + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + if (rc) { + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + RETURN(ERR_PTR(rc)); + } + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_start(logname); + cld->cld_cfg.cfg_obdname = obd->obd_name; + } + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (cld_is_sptlrpc(cld) || cld_is_nodemap(cld) || cld_is_barrier(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("%s: failed processing log, type %d: rc = %d\n", + obd->obd_name, type, rc); + } + + RETURN(cld); +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* we have to use different llog for clients and mdts for cmd + * where only clients are notified if one of cmd server restarts */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strcpy(logname, fsname); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == NULL); + lcfg.cfg_instance = sb; + strcat(logname, "-mdtir"); + } else { + LASSERT(lcfg.cfg_instance != NULL); + strcat(logname, "-cliir"); + } + + cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); + return cld; +} + +static struct config_llog_data *config_log_find_or_add(struct obd_device *obd, + char *logname, struct super_block *sb, int type, + struct config_llog_instance *cfg) +{ + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; + + lcfg.cfg_instance = sb != NULL ? (void *)sb : (void *)obd; + + if (type == CONFIG_T_SPTLRPC) + lcfg.cfg_instance = NULL; + + cld = config_log_find(logname, &lcfg); + if (unlikely(cld != NULL)) + return cld; + + return do_config_log_add(obd, logname, type, &lcfg, sb); +} + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. 
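+ *
+ * Besides the main CONFIG_T_CONFIG log itself, this attaches the dependent
+ * logs selected by cfg->cfg_sub_clds: the shared "-sptlrpc" log, the
+ * nodemap log (except on an MGS), the params log, a barrier log on MDTs,
+ * and the imperative recovery ("-mdtir"/"-cliir") log unless the mount has
+ * LMD_FLG_NOIR set.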
+ */ +static struct config_llog_data * +config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld = NULL; + struct config_llog_data *sptlrpc_cld = NULL; + struct config_llog_data *params_cld = NULL; + struct config_llog_data *nodemap_cld = NULL; + struct config_llog_data *barrier_cld = NULL; + char seclogname[32]; + char *ptr; + int rc; + bool locked = false; + ENTRY; + + CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + RETURN(ERR_PTR(-EINVAL)); + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + if (cfg->cfg_sub_clds & CONFIG_SUB_SPTLRPC) { + sptlrpc_cld = config_log_find_or_add(obd, seclogname, NULL, + CONFIG_T_SPTLRPC, cfg); + if (IS_ERR(sptlrpc_cld)) { + CERROR("%s: can't create sptlrpc log %s: rc = %ld\n", + obd->obd_name, seclogname, PTR_ERR(sptlrpc_cld)); + RETURN(sptlrpc_cld); + } + } + + if (!IS_MGS(lsi) && cfg->cfg_sub_clds & CONFIG_SUB_NODEMAP) { + nodemap_cld = config_log_find_or_add(obd, LUSTRE_NODEMAP_NAME, + NULL, CONFIG_T_NODEMAP, + cfg); + if (IS_ERR(nodemap_cld)) { + rc = PTR_ERR(nodemap_cld); + CERROR("%s: cannot create nodemap log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_sptlrpc, rc); + } + } + + if (cfg->cfg_sub_clds & CONFIG_SUB_PARAMS) { + params_cld = config_log_find_or_add(obd, PARAMS_FILENAME, sb, + CONFIG_T_PARAMS, cfg); + if (IS_ERR(params_cld)) { + rc = PTR_ERR(params_cld); + CERROR("%s: can't create params log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_nodemap, rc); + } + } + + if (IS_MDT(s2lsi(sb)) && cfg->cfg_sub_clds & CONFIG_SUB_BARRIER) { + snprintf(seclogname + (ptr - logname), sizeof(seclogname) - 1, + "-%s", BARRIER_FILENAME); + barrier_cld = config_log_find_or_add(obd, seclogname, sb, + CONFIG_T_BARRIER, cfg); + if (IS_ERR(barrier_cld)) { + rc = PTR_ERR(barrier_cld); + CERROR("%s: can't create barrier log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_params, rc); + } + } + + cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + CERROR("%s: can't create log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_barrier, rc = PTR_ERR(cld)); + } + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) && + cfg->cfg_sub_clds & CONFIG_SUB_RECOVER) { + struct config_llog_data *recover_cld; + + ptr = strrchr(seclogname, '-'); + if (ptr != NULL) { + *ptr = 0; + } else { + CERROR("%s: sptlrpc log name not correct, %s: " + "rc = %d\n", obd->obd_name, seclogname, -EINVAL); + GOTO(out_cld, rc = -EINVAL); + } + + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + rc = PTR_ERR(recover_cld); + CERROR("%s: can't create recover log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_cld, rc); + } + + mutex_lock(&cld->cld_lock); + locked = true; + cld->cld_recover = recover_cld; + } + + if (!locked) + mutex_lock(&cld->cld_lock); + cld->cld_params = params_cld; + cld->cld_barrier = barrier_cld; + cld->cld_nodemap = nodemap_cld; + cld->cld_sptlrpc = sptlrpc_cld; + mutex_unlock(&cld->cld_lock); + + RETURN(cld); + +out_cld: + config_log_put(cld); +out_barrier: + config_log_put(barrier_cld); +out_params: + 
config_log_put(params_cld); +out_nodemap: + config_log_put(nodemap_cld); +out_sptlrpc: + config_log_put(sptlrpc_cld); + + return ERR_PTR(rc); +} + +DEFINE_MUTEX(llog_process_lock); + +static inline void config_mark_cld_stop(struct config_llog_data *cld) +{ + if (cld) { + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + cld->cld_stopping = 1; + spin_unlock(&config_list_lock); + mutex_unlock(&cld->cld_lock); + } +} + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_params = NULL; + struct config_llog_data *cld_recover = NULL; + struct config_llog_data *cld_nodemap = NULL; + struct config_llog_data *cld_barrier = NULL; + int rc = 0; + + ENTRY; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + RETURN(-ENOENT); + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. + */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + RETURN(rc); + } + + spin_lock(&config_list_lock); + cld->cld_stopping = 1; + spin_unlock(&config_list_lock); + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + cld_params = cld->cld_params; + cld->cld_params = NULL; + cld_nodemap = cld->cld_nodemap; + cld->cld_nodemap = NULL; + cld_barrier = cld->cld_barrier; + cld->cld_barrier = NULL; + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + mutex_unlock(&cld->cld_lock); + + config_mark_cld_stop(cld_recover); + config_log_put(cld_recover); + + config_mark_cld_stop(cld_params); + config_log_put(cld_params); + + /* don't set cld_stopping on nm lock as other targets may be active */ + config_log_put(cld_nodemap); + + if (cld_barrier) { + mutex_lock(&cld_barrier->cld_lock); + cld_barrier->cld_stopping = 1; + mutex_unlock(&cld_barrier->cld_lock); + config_log_put(cld_barrier); + } + + config_log_put(cld_sptlrpc); + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + RETURN(rc); +} + +#ifdef CONFIG_PROC_FS +int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + struct obd_connect_data *ocd; + struct config_llog_data *cld; + ENTRY; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + ocd = &imp->imp_connect_data; + + seq_printf(m, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? 
"ENABLED" : "DISABLED"); + seq_printf(m, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + LPROCFS_CLIMP_EXIT(obd); + RETURN(0); +} +#endif + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +#define RQ_PRECLEANUP 0x10 +static int rq_state = 0; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); +static DECLARE_COMPLETION(rq_start); + +static void do_requeue(struct config_llog_data *cld) +{ + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* + * Do not run mgc_process_log on a disconnected export or an + * export which is being disconnected. Take the client + * semaphore to make the check non-racy. + */ + down_read_nested(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem, + OBD_CLI_SEM_MGC); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing log: %d\n", rc); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + + EXIT; +} + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. + */ +#define MGC_TIMEOUT_MIN_SECONDS 5 +#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */ + +static int mgc_requeue_thread(void *data) +{ + int rc = 0; + bool first = true; + ENTRY; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (!(rq_state & RQ_STOP)) { + struct l_wait_info lwi; + struct config_llog_data *cld, *cld_prev; + int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC; + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + if (first) { + first = false; + complete(&rq_start); + } + + /* Always wait a few seconds to allow the server who + caused the lock revocation to finish its setup, plus some + random so everyone doesn't try to reconnect at once. */ + to = msecs_to_jiffies(MGC_TIMEOUT_MIN_SECONDS * MSEC_PER_SEC); + /* rand is centi-seconds */ + to += msecs_to_jiffies(rand * MSEC_PER_SEC / 100); + lwi = LWI_TIMEOUT(to, NULL, NULL); + l_wait_event(rq_waitq, rq_state & (RQ_STOP | RQ_PRECLEANUP), + &lwi); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + rq_state &= ~RQ_PRECLEANUP; + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock || cld->cld_stopping) + continue; + + /* hold reference to avoid being freed during + * subsequent processing. 
*/ + config_log_get(cld); + cld->cld_lostlock = 0; + spin_unlock(&config_list_lock); + + config_log_put(cld_prev); + cld_prev = cld; + + if (likely(!(rq_state & RQ_STOP))) { + do_requeue(cld); + spin_lock(&config_list_lock); + } else { + spin_lock(&config_list_lock); + break; + } + } + spin_unlock(&config_list_lock); + config_log_put(cld_prev); + + /* Wait a bit to see if anyone else needs a requeue */ + lwi = (struct l_wait_info) { 0 }; + l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP), + &lwi); + spin_lock(&config_list_lock); + } + + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + RETURN(rc); +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + bool wakeup = false; + ENTRY; + + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + if (!(rq_state & RQ_STOP) && !cld->cld_stopping && !cld->cld_lostlock) { + cld->cld_lostlock = 1; + rq_state |= RQ_NOW; + wakeup = true; + } + spin_unlock(&config_list_lock); + mutex_unlock(&cld->cld_lock); + if (wakeup) + wake_up(&rq_waitq); + + EXIT; +} + +/********************** class fns **********************/ +static int mgc_local_llog_init(const struct lu_env *env, + struct obd_device *obd, + struct obd_device *disk) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT, disk, + &llog_osd_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = obd->u.cli.cl_mgc_configs_dir; + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_local_llog_fini(const struct lu_env *env, + struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + llog_cleanup(env, ctxt); + + RETURN(0); +} + +static int mgc_fs_setup(const struct lu_env *env, struct obd_device *obd, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct client_obd *cli = &obd->u.cli; + struct lu_fid rfid, fid; + struct dt_object *root, *dto; + int rc = 0; + + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_dt_dev); + + /* The mgc fs exclusion mutex. Only one fs can be setup at a time. 
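+ * The mutex is taken immediately below and held across a successful setup until mgc_fs_cleanup(); the error paths in this function unlock it before returning.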
*/ + mutex_lock(&cli->cl_mgc_mutex); + + /* Setup the configs dir */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, lsi->lsi_dt_dev, &fid, + &cli->cl_mgc_los); + if (rc) + RETURN(rc); + + rc = dt_root_get(env, lsi->lsi_dt_dev, &rfid); + if (rc) + GOTO(out_los, rc); + + root = dt_locate_at(env, lsi->lsi_dt_dev, &rfid, + &cli->cl_mgc_los->los_dev->dd_lu_dev, NULL); + if (unlikely(IS_ERR(root))) + GOTO(out_los, rc = PTR_ERR(root)); + + dto = local_file_find_or_create(env, cli->cl_mgc_los, root, + MOUNT_CONFIGS_DIR, + S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + dt_object_put_nocache(env, root); + if (IS_ERR(dto)) + GOTO(out_los, rc = PTR_ERR(dto)); + + cli->cl_mgc_configs_dir = dto; + + LASSERT(lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt); + rc = mgc_local_llog_init(env, obd, lsi->lsi_osd_exp->exp_obd); + if (rc) + GOTO(out_llog, rc); + + /* We take an obd ref to insure that we can't get to mgc_cleanup + * without calling mgc_fs_cleanup first. */ + class_incref(obd, "mgc_fs", obd); + + /* We keep the cl_mgc_sem until mgc_fs_cleanup */ + EXIT; +out_llog: + if (rc) { + dt_object_put(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + } +out_los: + if (rc < 0) { + local_oid_storage_fini(env, cli->cl_mgc_los); + cli->cl_mgc_los = NULL; + mutex_unlock(&cli->cl_mgc_mutex); + } + return rc; +} + +static int mgc_fs_cleanup(const struct lu_env *env, struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + LASSERT(cli->cl_mgc_los != NULL); + + mgc_local_llog_fini(env, obd); + + dt_object_put_nocache(env, cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + + local_oid_storage_fini(env, cli->cl_mgc_los); + cli->cl_mgc_los = NULL; + + class_decref(obd, "mgc_fs", obd); + mutex_unlock(&cli->cl_mgc_mutex); + + RETURN(0); +} + +static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + /* setup only remote ctxt, the local disk context is switched per each + * filesystem during mgc_fs_setup() */ + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, + &llog_client_ops); + if (rc) + RETURN(rc); + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(env, ctxt); + + RETURN(0); +} + + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd) +{ + int rc = 0; + int temp; + ENTRY; + + if (atomic_dec_and_test(&mgc_count)) { + LASSERT(rq_state & RQ_RUNNING); + /* stop requeue thread */ + temp = RQ_STOP; + } else { + /* wakeup requeue thread to clean our cld */ + temp = RQ_NOW | RQ_PRECLEANUP; + } + + spin_lock(&config_list_lock); + rq_state |= temp; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + if (temp & RQ_STOP) + wait_for_completion(&rq_exit); + obd_cleanup_client_import(obd); + + rc = mgc_llog_fini(NULL, obd); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + int rc; + ENTRY; + + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (obd->obd_type->typ_refcnt <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); 
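+	/* balance the ptlrpcd_addref() and client_obd_setup() done in mgc_setup() */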
+ + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct task_struct *task; + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc < 0) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(err_decref, rc); + + rc = mgc_llog_init(NULL, obd); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_mgc_obd_vars; + lprocfs_obd_setup(obd); +#endif + sptlrpc_lprocfs_cliobd_attach(obd); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + task = kthread_run(mgc_requeue_thread, NULL, "ll_cfg_requeue"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start requeue thread: rc = %d; " + "no more log updates\n", + obd->obd_name, rc); + GOTO(err_cleanup, rc); + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + wait_for_completion(&rq_start); + } + + RETURN(rc); + +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + RETURN(rc); +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... */ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n", + PLDLMRES(lock->l_resource), + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + lock->l_ast_data = NULL; + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (lock->l_conn_export == NULL || + lock->l_conn_export->exp_obd->u.cli.cl_conn_count == 0) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + RETURN(rc); +} + +/* Not sure where this should go... */ +/* This is the timeout value for MGS_CONNECT request plus a ping interval, such + * that we can have a chance to try the secondary MGS if any. */ +#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 
0 : at_min) \ + + PING_INTERVAL) +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_TARGET_REG_LIMIT_MAX RECONNECT_DELAY_MAX +#define MGC_SEND_PARAM_LIMIT 10 + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +/* Send parameter to MGS*/ +static int mgc_set_mgs_param(struct obd_export *exp, + struct mgs_send_param *msp) +{ + struct ptlrpc_request *req; + struct mgs_send_param *req_msp, *rep_msp; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION, + MGS_SET_INFO); + if (!req) + RETURN(-ENOMEM); + + req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + if (!req_msp) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_msp, msp, sizeof(*req_msp)); + ptlrpc_request_set_replen(req); + + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_SEND_PARAM_LIMIT; + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + memcpy(msp, rep_msp, sizeof(*rep_msp)); + } + + ptlrpc_req_finished(req); + + RETURN(rc); +} +#endif + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, ldlm_glimpse_callback glimpse_callback, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { + .ei_type = type, + .ei_mode = mode, + .ei_cb_bl = mgc_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = glimpse_callback, + }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + ENTRY; + + CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mgc_cancel(struct obd_export *exp, enum ldlm_mode mode, + struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. 
-jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + RETURN(-ENOMEM); + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + /* if the target needs to regenerate the config log in MGS, it's better + * to use some longer limit to let MGC have time to change connection to + * another MGS (or try again with the same MGS) for the target (server) + * will fail and exit if the request expired due to delay limit. */ + if (mti->mti_flags & (LDD_F_UPDATE | LDD_F_NEED_INDEX)) + req->rq_delay_limit = MGC_TARGET_REG_LIMIT_MAX; + + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + memcpy(mti, rep_mti, sizeof(*rep_mti)); + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + ENTRY; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + RETURN(-EINVAL); + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect the import immediately if + * 1. we previously got disconnected, + * 2. value > 1 (at the same node with MGS) + * */ + if (imp->imp_state == LUSTRE_IMP_DISCON || value > 1) + ptlrpc_reconnect_import(imp); + + RETURN(0); + } + + /* FIXME move this to mgc_process_config */ + if (KEY_IS(KEY_REGISTER_TARGET)) { + struct mgs_target_info *mti; + if (vallen != sizeof(struct mgs_target_info)) + RETURN(-EINVAL); + mti = (struct mgs_target_info *)val; + CDEBUG(D_MGC, "register_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(exp, mti); + RETURN(rc); + } + if (KEY_IS(KEY_SET_FS)) { + struct super_block *sb = (struct super_block *)val; + + if (vallen != sizeof(struct super_block)) + RETURN(-EINVAL); + + rc = mgc_fs_setup(env, exp->exp_obd, sb); + RETURN(rc); + } + if (KEY_IS(KEY_CLEAR_FS)) { + if (vallen != 0) + RETURN(-EINVAL); + rc = mgc_fs_cleanup(env, exp->exp_obd); + RETURN(rc); + } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + if (KEY_IS(KEY_SET_INFO)) { + struct mgs_send_param *msp; + + msp = (struct mgs_send_param *)val; + rc = mgc_set_mgs_param(exp, msp); + RETURN(rc); + } +#endif + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. 
+ * + * if flavor has been set previously, check the asking flavor + * must match the existing one. + */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + RETURN(0); + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + RETURN(rc); + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but " + "currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + RETURN(rc); + } + + RETURN(rc); +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + RETURN(rc); +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; + ENTRY; + + LASSERT(cfg->cfg_instance != NULL); + LASSERT(cfg->cfg_sb == cfg->cfg_instance); + + OBD_ALLOC(inst, PAGE_SIZE); + if (inst == NULL) + RETURN(-ENOMEM); + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, PAGE_SIZE, "%p", cfg->cfg_instance); + if (pos >= PAGE_SIZE) { + OBD_FREE(inst, PAGE_SIZE); + return -E2BIG; + } + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + PAGE_SIZE); + if (rc) { + OBD_FREE(inst, PAGE_SIZE); + RETURN(-EINVAL); + } + pos = strlen(inst); + } + + ++pos; + buf = inst + pos; + bufsz = PAGE_SIZE - pos; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost, i; + struct obd_device *obd; + char *obdname; + char *cname; + char *params; + char *uuid; + + rc = -EINVAL; + if 
(datalen < sizeof(*entry)) + break; + + entry = (typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += sprintf(obdname + pos, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + /* client does not connect to the OST yet */ + up_read(&obd->u.cli.cl_sem); + rc = 0; + continue; + } + + /* iterate all nids to find one */ + /* find uuid by nid */ + rc = -ENOENT; + for (i = 0; i < entry->mne_nid_count; i++) { + rc = client_import_find_conn(obd->u.cli.cl_import, + entry->u.nids[i], + (struct obd_uuid *)uuid); + if (rc == 0) + break; + } + + up_read(&obd->u.cli.cl_sem); + if (rc < 0) { + CERROR("mgc: cannot find uuid by nid %s\n", + libcfs_nid2str(entry->u.nids[0])); + break; + } + + CDEBUG(D_INFO, "Find uuid %s by nid %s\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg) { + rc = -ENOMEM; + break; + } + lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); + + CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ + } + + 
OBD_FREE(inst, PAGE_SIZE); + RETURN(rc); +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery or + * nodemap logs. + */ +static int mgc_process_recover_nodemap_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_connection *mgc_conn; + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct nodemap_config *new_config = NULL; + struct lu_nodemap *recent_nodemap = NULL; + struct ptlrpc_bulk_desc *desc; + struct page **pages = NULL; + __u64 config_read_offset = 0; + __u8 nodemap_cur_pass = 0; + int nrpages = 0; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + ENTRY; + + mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; + + /* don't need to get local config */ + if (cld_is_nodemap(cld) && + (LNET_NETTYP(LNET_NIDNET(mgc_conn->c_peer.nid)) == LOLND)) + GOTO(out, rc = 0); + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. + */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0 || cld_is_nodemap(cld)) + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC(pages, sizeof(*pages) * nrpages); + if (pages == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + +again: +#ifdef HAVE_SERVER_SUPPORT + if (cld_is_nodemap(cld) && config_read_offset == 0) { + new_config = nodemap_config_alloc(); + if (IS_ERR(new_config)) { + rc = PTR_ERR(new_config); + new_config = NULL; + GOTO(out, rc); + } + } +#endif + LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + GOTO(out, rc); + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) + GOTO(out, rc = -E2BIG); + if (cld_is_nodemap(cld)) + body->mcb_offset = config_read_offset; + else + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_SHIFT; + body->mcb_units = nrpages; + body->mcb_nm_cur_pass = nodemap_cur_pass; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, + PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV, + MGS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, + PAGE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (!res) + GOTO(out, rc = -EPROTO); + + if (cld_is_nodemap(cld)) { + config_read_offset = res->mcr_offset; + eof = config_read_offset == II_END_OFF; + nodemap_cur_pass = res->mcr_nm_cur_pass; + } else { + if (res->mcr_size 
< res->mcr_offset) + GOTO(out, rc = -EINVAL); + + /* always update the index even though it might have errors with + * handling the recover logs + */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version %lld, more %d.\n", + res->mcr_offset, eof == false); + } + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) + GOTO(out, rc = ealen); + + if (ealen > nrpages << PAGE_SHIFT) + GOTO(out, rc = -EINVAL); + + if (ealen == 0) { /* no logs transferred */ +#ifdef HAVE_SERVER_SUPPORT + /* config changed since first read RPC */ + if (cld_is_nodemap(cld) && config_read_offset == 0) { + recent_nodemap = NULL; + nodemap_config_dealloc(new_config); + new_config = NULL; + + CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); + + /* setting eof to false, we request config again */ + eof = false; + GOTO(out, rc = 0); + } +#endif + if (!eof) + rc = -EINVAL; + GOTO(out, rc); + } + + mne_swab = !!ptlrpc_rep_need_swab(req); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* This import flag means the server did an extra swab of IR MNE + * records (fixed in LU-1252), reverse it here if needed. LU-1644 */ + if (unlikely(req->rq_import->imp_need_mne_swab)) + mne_swab = !mne_swab; +#endif + + /* When a nodemap config is received, we build a new nodemap config, + * with new nodemap structs. We keep track of the most recently added + * nodemap since the config is read ordered by nodemap_id, and so it + * is likely that the next record will be related. Because access to + * the nodemaps is single threaded until the nodemap_config is active, + * we don't need to reference count with recent_nodemap, though + * recent_nodemap should be set to NULL when the nodemap_config + * is either destroyed or set active. + */ + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + union lu_page *ptr; + + ptr = kmap(pages[i]); + if (cld_is_nodemap(cld)) + rc2 = nodemap_process_idx_pages(new_config, ptr, + &recent_nodemap); + else + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, + ptr, + min_t(int, ealen, + PAGE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("%s: error processing %s log %s: rc = %d\n", + obd->obd_name, + cld_is_nodemap(cld) ? 
"nodemap" : "recovery", + cld->cld_logname, + rc2); + GOTO(out, rc = rc2); + } + + ealen -= PAGE_SIZE; + } + +out: + if (req) { + ptlrpc_req_finished(req); + req = NULL; + } + + if (rc == 0 && !eof) + goto again; + +#ifdef HAVE_SERVER_SUPPORT + if (new_config != NULL) { + /* recent_nodemap cannot be used after set_active/dealloc */ + if (rc == 0) + nodemap_config_set_active_mgc(new_config); + else + nodemap_config_dealloc(new_config); + } +#endif + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE(pages, sizeof(*pages) * nrpages); + } + return rc; +} + +static int mgc_barrier_glimpse_ast(struct ldlm_lock *lock, void *data) +{ + struct config_llog_data *cld = lock->l_ast_data; + int rc; + ENTRY; + + if (cld->cld_stopping) + RETURN(-ENODEV); + + rc = barrier_handler(s2lsi(cld->cld_cfg.cfg_sb)->lsi_dt_dev, + (struct ptlrpc_request *)data); + + RETURN(rc); +} + +/* Copy a remote log locally */ +static int mgc_llog_local_copy(const struct lu_env *env, + struct obd_device *obd, + struct llog_ctxt *rctxt, + struct llog_ctxt *lctxt, char *logname) +{ + char *temp_log; + int rc; + + ENTRY; + + /* + * - copy it to backup using llog_backup() + * - copy remote llog to logname using llog_backup() + * - if failed then move bakup to logname again + */ + + OBD_ALLOC(temp_log, strlen(logname) + 2); + if (!temp_log) + RETURN(-ENOMEM); + sprintf(temp_log, "%sT", logname); + + /* make a copy of local llog at first */ + rc = llog_backup(env, obd, lctxt, lctxt, logname, temp_log); + if (rc < 0 && rc != -ENOENT) + GOTO(out, rc); + /* copy remote llog to the local copy */ + rc = llog_backup(env, obd, rctxt, lctxt, logname, logname); + if (rc == -ENOENT) { + /* no remote llog, delete local one too */ + llog_erase(env, lctxt, NULL, logname); + } else if (rc < 0) { + /* error during backup, get local one back from the copy */ + llog_backup(env, obd, lctxt, lctxt, temp_log, logname); +out: + CERROR("%s: failed to copy remote log %s: rc = %d\n", + obd->obd_name, logname, rc); + } + llog_erase(env, lctxt, NULL, temp_log); + OBD_FREE(temp_log, strlen(logname) + 2); + return rc; +} + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, int local_only) +{ + struct llog_ctxt *ctxt, *lctxt = NULL; + struct client_obd *cli = &mgc->u.cli; + struct lustre_sb_info *lsi = NULL; + int rc = 0; + struct lu_env *env; + + ENTRY; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_MG_THREAD); + if (rc) + GOTO(out_free, rc); + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(ctxt); + + lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + + /* Copy the setup log locally if we can. Don't mess around if we're + * running an MGS though (logs are already local). */ + if (lctxt && lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + cli->cl_mgc_configs_dir != NULL && + lu2dt_dev(cli->cl_mgc_configs_dir->do_lu.lo_dev) == + lsi->lsi_dt_dev) { + if (!local_only && !lsi->lsi_dt_dev->dd_rdonly) + /* Only try to copy log if we have the lock. */ + rc = mgc_llog_local_copy(env, mgc, ctxt, lctxt, + cld->cld_logname); + if (local_only || rc) { + if (strcmp(cld->cld_logname, PARAMS_FILENAME) != 0 && + llog_is_empty(env, lctxt, cld->cld_logname)) { + LCONSOLE_ERROR_MSG(0x13a, "Failed to get MGS " + "log %s and no local copy." 
+ "\n", cld->cld_logname); + GOTO(out_pop, rc = -ENOENT); + } + CDEBUG(D_MGC, "Failed to get MGS log %s, using local " + "copy for now, will try to update later.\n", + cld->cld_logname); + rc = 0; + } + /* Now, whether we copied or not, start using the local llog. + * If we failed to copy, we'll start using whatever the old + * log has. */ + llog_ctxt_put(ctxt); + ctxt = lctxt; + lctxt = NULL; + } else { + if (local_only) /* no local log at client side */ + GOTO(out_pop, rc = -EIO); + } + + rc = -EAGAIN; + if (lsi && IS_SERVER(lsi) && !IS_MGS(lsi) && + lsi->lsi_dt_dev->dd_rdonly) { + struct llog_ctxt *rctxt; + + /* Under readonly mode, we may have no local copy or local + * copy is incomplete, so try to use remote llog firstly. */ + rctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + LASSERT(rctxt); + + rc = class_config_parse_llog(env, rctxt, cld->cld_logname, + &cld->cld_cfg); + llog_ctxt_put(rctxt); + } + + if (rc && rc != -ENOENT) + rc = class_config_parse_llog(env, ctxt, cld->cld_logname, + &cld->cld_cfg); + + /* + * update settings on existing OBDs. doing it inside + * of llog_process_lock so no device is attaching/detaching + * in parallel. + * the logname must be -sptlrpc + */ + if (rc == 0 && cld_is_sptlrpc(cld)) + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + EXIT; + +out_pop: + __llog_ctxt_put(env, ctxt); + if (lctxt) + __llog_ctxt_put(env, lctxt); + + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); + return rc; +} + +static bool mgc_import_in_recovery(struct obd_import *imp) +{ + bool in_recovery = true; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED) + in_recovery = false; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} + +/** + * Get a configuration log from the MGS and process it. + * + * This function is called for both clients and servers to process the + * configuration log from the MGS. The MGC enqueues a DLM lock on the + * log from the MGS, and if the lock gets revoked the MGC will be notified + * by the lock cancellation callback that the config log has changed, + * and will enqueue another MGS lock on it, and then continue processing + * the new additions to the end of the log. + * + * Since the MGC import is not replayable, if the import is being evicted + * (rcl == -ESHUTDOWN, \see ptlrpc_import_delay_req()), retry to process + * the log until recovery is finished or the import is closed. + * + * Make a local copy of the log before parsing it if appropriate (non-MGS + * server) so that the server can start even when the MGS is down. + * + * There shouldn't be multiple processes running process_log at once -- + * sounds like badness. It actually might be fine, as long as they're not + * trying to update from the same log simultaneously, in which case we + * should use a per-log semaphore instead of cld_lock. + * + * \param[in] mgc MGC device by which to fetch the configuration log + * \param[in] cld log processing state (stored in lock callback data) + * + * \retval 0 on success + * \retval negative errno on failure + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + bool retry = false; + ENTRY; + + LASSERT(cld != NULL); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. 
It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) */ +restart: + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, LDLM_PLAIN, NULL, + LCK_CR, &flags, + cld_is_barrier(cld) ? mgc_barrier_glimpse_ast : NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(rc == 0); + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + + if (rcl == -ESHUTDOWN && + atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) { + struct obd_import *imp; + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + + mutex_unlock(&cld->cld_lock); + imp = class_exp2cliimp(mgc->u.cli.cl_mgc_mgsexp); + + /* Let's force the pinger, and wait the import to be + * connected, note: since mgc import is non-replayable, + * and even the import state is disconnected, it does + * not mean the "recovery" is stopped, so we will keep + * waitting until timeout or the import state is + * FULL or closed */ + ptlrpc_pinger_force(imp); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + l_wait_event(imp->imp_recovery_waitq, + !mgc_import_in_recovery(imp), &lwi); + + if (imp->imp_state == LUSTRE_IMP_FULL) { + retry = true; + goto restart; + } else { + mutex_lock(&cld->cld_lock); + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } else { + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + } + + if (cld_is_recover(cld) || cld_is_nodemap(cld)) { + if (!rcl) + rc = mgc_process_recover_nodemap_log(mgc, cld); + else if (cld_is_nodemap(cld)) + rc = rcl; + + if (cld_is_recover(cld) && rc) { + if (!rcl) { + CERROR("%s: recover log %s failed, not fatal: rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); + spin_lock(&config_list_lock); + cld->cld_lostlock = 1; + spin_unlock(&config_list_lock); + } + rc = 0; /* this is not a fatal error for recover log */ + } + } else if (!cld_is_barrier(cld)) { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc); + + mutex_unlock(&cld->cld_lock); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) { + rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh); + if (rcl) + CERROR("Can't drop cfg lock: %d\n", rcl); + } + + RETURN(rc); +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). 
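+ * LCFG_LOG_END reverses this via config_log_end(), which stops watching the log and drops the references taken at start.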
+ */ +static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + ENTRY; + + switch(lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) + GOTO(out, rc = -EINVAL); + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + cld = config_log_add(obd, logname, cfg, sb); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + break; + } + + /* COMPAT_146 */ + /* FIXME only set this for old logs! Right now this forces + us to always skip the "inside markers" check */ + cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir; + + mutex_lock(&cld->cld_lock); + cir = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + config_log_put(cir); + } + + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + + if (rc == 0 && cld->cld_params != NULL) { + rc = mgc_process_log(obd, cld->cld_params); + if (rc == -ENOENT) { + CDEBUG(D_MGC, "There is no params " + "config file yet\n"); + rc = 0; + } + /* params log is optional */ + if (rc) + CERROR("%s: can't process params llog: rc = %d\n", + obd->obd_name, rc); + } + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_process_config = mgc_process_config, +}; + +static int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, true, NULL, + LUSTRE_MGC_NAME, NULL); +} + +static void __exit mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/acl.c b/drivers/staging/lustrefsx/lustre/obdclass/acl.c new file mode 100644 index 0000000000000..77ea22644e27b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/acl.c @@ -0,0 +1,282 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif /* HAVE_SERVER_SUPPORT */ + +#ifdef CONFIG_FS_POSIX_ACL + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +/*static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +}*/ + +/* + * Check permission based on POSIX ACL. 
+ */ +int lustre_posix_acl_permission(struct lu_ucred *mu, const struct lu_attr *la, + int want, posix_acl_xattr_entry *entry, + int count) +{ + posix_acl_xattr_entry *pa, *pe, *mask_obj; + posix_acl_xattr_entry ae, me; + int found = 0; + + if (count <= 0) + return -EACCES; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ + if (la->la_uid == mu->uc_fsuid) + goto check_perm; + break; + case ACL_USER: + if (ae.e_id == mu->uc_fsuid) + goto mask; + break; + case ACL_GROUP_OBJ: + if (lustre_in_group_p(mu, la->la_gid)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (lustre_in_group_p(mu, ae.e_id)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + else + goto check_perm; + default: + return -EIO; + } + } + return -EIO; + +mask: + for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { + lustre_posix_acl_le_to_cpu(&me, mask_obj); + if (me.e_tag == ACL_MASK) { + if ((ae.e_perm & me.e_perm & want) == want) + return 0; + + return -EACCES; + } + } + +check_perm: + if ((ae.e_perm & want) == want) + return 0; + + return -EACCES; +} +EXPORT_SYMBOL(lustre_posix_acl_permission); + +/* + * Modify the ACL for the chmod. + */ +int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, __u32 mode, + int count) +{ + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + switch (le16_to_cpu(pa->e_tag)) { + case ACL_USER_OBJ: + pa->e_perm = cpu_to_le16((mode & S_IRWXU) >> 6); + break; + case ACL_USER: + case ACL_GROUP: + break; + case ACL_GROUP_OBJ: + group_obj = pa; + break; + case ACL_MASK: + mask_obj = pa; + break; + case ACL_OTHER: + pa->e_perm = cpu_to_le16(mode & S_IRWXO); + break; + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm = cpu_to_le16((mode & S_IRWXG) >> 3); + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm = cpu_to_le16((mode & S_IRWXG) >> 3); + } + + return 0; +} +EXPORT_SYMBOL(lustre_posix_acl_chmod_masq); + +/* + * Returns 0 if the acl can be exactly represented in the traditional + * file mode permission bits, or else 1. Returns -E... on error. + */ +int +lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, + int count) +{ + posix_acl_xattr_entry *pa, *pe; + mode_t mode = 0; + int not_equiv = 0; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + __u16 perm = le16_to_cpu(pa->e_perm); + switch (le16_to_cpu(pa->e_tag)) { + case ACL_USER_OBJ: + mode |= (perm & S_IRWXO) << 6; + break; + case ACL_GROUP_OBJ: + mode |= (perm & S_IRWXO) << 3; + break; + case ACL_OTHER: + mode |= perm & S_IRWXO; + break; + case ACL_MASK: + mode = (mode & ~S_IRWXG) | + ((perm & S_IRWXO) << 3); + not_equiv = 1; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + default: + return -EINVAL; + } + } + if (mode_p) + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} +EXPORT_SYMBOL(lustre_posix_acl_equiv_mode); + +/* + * Modify acl when creating a new object. 
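+ * Masks the entries against the create mode and narrows *pmode to match; returns 1 if the ACL is not exactly representable by the mode bits, 0 if it is, or -EIO on a malformed entry.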
+ */ +int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, __u32 *pmode, + int count) +{ + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + posix_acl_xattr_entry ae; + __u32 mode = *pmode; + int not_equiv = 0; + + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + ae.e_perm &= (mode >> 6) | ~S_IRWXO; + pa->e_perm = cpu_to_le16(ae.e_perm); + mode &= (ae.e_perm << 6) | ~S_IRWXU; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + case ACL_GROUP_OBJ: + group_obj = pa; + break; + case ACL_OTHER: + ae.e_perm &= mode | ~S_IRWXO; + pa->e_perm = cpu_to_le16(ae.e_perm); + mode &= ae.e_perm | ~S_IRWXO; + break; + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + default: + return -EIO; + } + } + + if (mask_obj) { + ae.e_perm = le16_to_cpu(mask_obj->e_perm) & + ((mode >> 3) | ~S_IRWXO); + mode &= (ae.e_perm << 3) | ~S_IRWXG; + mask_obj->e_perm = cpu_to_le16(ae.e_perm); + } else { + if (!group_obj) + return -EIO; + ae.e_perm = le16_to_cpu(group_obj->e_perm) & + ((mode >> 3) | ~S_IRWXO); + mode &= (ae.e_perm << 3) | ~S_IRWXG; + group_obj->e_perm = cpu_to_le16(ae.e_perm); + } + + *pmode = (*pmode & ~S_IRWXUGO) | mode; + return not_equiv; +} +EXPORT_SYMBOL(lustre_posix_acl_create_masq); +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h new file mode 100644 index 0000000000000..0f95caf310755 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /** + * Used for submitting a sync I/O. 
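+ * The submitting thread waits on this anchor until every page of the transfer has completed.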
+ */ + struct cl_sync_io clt_anchor; +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c new file mode 100644 index 0000000000000..fc22b2c89f17d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -0,0 +1,1362 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" +#include + +/***************************************************************************** + * + * cl_io interface. + * + */ + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. 
+ */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + case CIT_DATA_VERSION: + case CIT_FAULT: + break; + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + break; + case CIT_LADVISE: + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj != cl_object_top(obj)); + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + LASSERT(obj == cl_object_top(obj)); + + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. + * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + if (cfs_ptengine_weight(cl_io_engine) < 2) + io->ci_pio = 0; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io %s range: [%llu, %llu) %s %s %s %s\n", + iot == CIT_READ ? "read" : "write", + pos, pos + count, + io->u.ci_rw.rw_nonblock ? "nonblock" : "block", + io->u.ci_rw.rw_append ? "append" : "-", + io->u.ci_rw.rw_sync ? "sync" : "-", + io->ci_pio ? "pio" : "-"); + + io->u.ci_rw.rw_range.cir_pos = pos; + io->u.ci_rw.rw_range.cir_count = count; + + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), + lu_object_fid(&d1->cld_obj->co_lu)); +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + ENTRY; + /* hidden treasure: bubble sort for now. 
*/ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); + EXIT; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, &set->cls_todo, cill_linkage) { + if (!cl_object_same(scan->cill_descr.cld_obj, need->cld_obj)) + continue; + + /* Merge locks for the same object because ldlm lock server + * may expand the lock extent, otherwise there is a deadlock + * case if two conflicted locks are queueud for the same object + * and lock server expands one lock to overlap the another. + * The side effect is that it can generate a multi-stripe lock + * that may cause casacading problem */ + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + RETURN(+1); + } + RETURN(0); +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + result = cl_lock_request(env, io, &link->cill_lock); + if (result < 0) + break; + + list_move(&link->cill_linkage, &set->cls_done); + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. + */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. 
+ */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + list_del_init(&link->cill_linkage); + if (link->cill_fini != NULL) + link->cill_fini(env, link); + } + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + list_del_init(&link->cill_linkage); + cl_lock_release(env, &link->cill_lock); + if (link->cill_fini != NULL) + link->cill_fini(env, link); + } + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + + io->u.ci_rw.rw_range.cir_pos += nob; + io->u.ci_rw.rw_range.cir_count -= nob; + + /* layers have to be notified. */ + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} + +/** + * Adds a lock to a lockset. 
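+ *
+ * Returns +1 if the descriptor is merged into a lock already queued for
+ * the same object (see cl_lockset_merge()), 0 if a new link is added.
+ *
+ * Illustrative sketch only (the descriptor values are hypothetical);
+ * callers that do not manage the link themselves normally go through
+ * cl_io_lock_alloc_add() below, which also frees the link when the lock
+ * is merged:
+ *
+ *	struct cl_lock_descr descr = {
+ *		.cld_obj   = obj,
+ *		.cld_mode  = CLM_READ,
+ *		.cld_start = start,
+ *		.cld_end   = end,
+ *	};
+ *	rc = cl_io_lock_alloc_add(env, io, &descr);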
+ */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +/** + * Called by read io, to decide the readahead extent + * + * \see cl_io_operations::cio_read_ahead() + */ +int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, + pgoff_t start, struct cl_read_ahead *ra) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_read_ahead == NULL) + continue; + + result = scan->cis_iop->cio_read_ahead(env, scan, start, ra); + if (result != 0) + break; + } + RETURN(result > 0 ? 0 : result); +} +EXPORT_SYMBOL(cl_io_read_ahead); + +/** + * Commit a list of contiguous pages into writeback cache. + * + * \returns 0 if all pages committed, or errcode if error occurred. 
+ * \see cl_io_operations::cio_commit_async() + */ +int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) +{ + const struct cl_io_slice *scan; + int result = 0; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_commit_async == NULL) + continue; + result = scan->cis_iop->cio_commit_async(env, scan, queue, + from, to, cb); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_async); + +/** + * Submits a list of pages for immediate io. + * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_submit == NULL) + continue; + result = scan->cis_iop->cio_submit(env, scan, crt, queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr, &cl_sync_io_end); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(env, anchor, 1); + } + + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, anchor, timeout); + cl_page_list_assume(env, io, &queue->c2_qout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + return rc; +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. 
+ */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page trasmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} + +static +struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) +{ + struct cl_io_pt *pt; + int rc; + + OBD_ALLOC(pt, sizeof(*pt)); + if (pt == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + pt->cip_next = NULL; + init_sync_kiocb(&pt->cip_iocb, io->u.ci_rw.rw_file); + pt->cip_iocb.ki_pos = pos; +#ifdef HAVE_KIOCB_KI_LEFT + pt->cip_iocb.ki_left = count; +#elif defined(HAVE_KI_NBYTES) + pt->cip_iocb.ki_nbytes = count; +#endif + pt->cip_iter = io->u.ci_rw.rw_iter; + iov_iter_truncate(&pt->cip_iter, count); + pt->cip_file = io->u.ci_rw.rw_file; + pt->cip_iot = io->ci_type; + pt->cip_pos = pos; + pt->cip_count = count; + pt->cip_result = 0; + + rc = cfs_ptask_init(&pt->cip_task, io->u.ci_rw.rw_ptask, pt, + PTF_ORDERED | PTF_COMPLETE | + PTF_USER_MM | PTF_RETRY, smp_processor_id()); + if (rc) + GOTO(out_error, rc); + + CDEBUG(D_VFSTRACE, "submit %s range: [%llu, %llu)\n", + io->ci_type == CIT_READ ? "read" : "write", + pos, pos + count); + + rc = cfs_ptask_submit(&pt->cip_task, cl_io_engine); + if (rc) + GOTO(out_error, rc); + + RETURN(pt); + +out_error: + OBD_FREE(pt, sizeof(*pt)); + RETURN(ERR_PTR(rc)); +} + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_pt *pt = NULL, *head = NULL; + struct cl_io_pt **tail = &head; + loff_t pos; + size_t count; + size_t last_chunk_count = 0; + bool short_io = false; + int rc = 0; + ENTRY; + + LINVRNT(cl_io_is_loopable(io)); + + do { + io->ci_continue = 0; + + rc = cl_io_iter_init(env, io); + if (rc) { + cl_io_iter_fini(env, io); + break; + } + + pos = io->u.ci_rw.rw_range.cir_pos; + count = io->u.ci_rw.rw_range.cir_count; + + if (io->ci_pio) { + /* submit this range for parallel execution */ + pt = cl_io_submit_pt(io, pos, count); + if (IS_ERR(pt)) { + cl_io_iter_fini(env, io); + rc = PTR_ERR(pt); + break; + } + + *tail = pt; + tail = &pt->cip_next; + } else { + size_t nob = io->ci_nob; + + CDEBUG(D_VFSTRACE, + "execute type %u range: [%llu, %llu) nob: %zu %s\n", + io->ci_type, pos, pos + count, nob, + io->ci_continue ? "continue" : "stop"); + + rc = cl_io_lock(env, io); + if (rc) { + cl_io_iter_fini(env, io); + break; + } + + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + rc = cl_io_start(env, io); + + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + + count = io->ci_nob - nob; + last_chunk_count = count; + } + + cl_io_rw_advance(env, io, count); + cl_io_iter_fini(env, io); + } while (!rc && io->ci_continue); + + CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n", + io->ci_type, io->ci_nob, rc, + io->ci_continue ? 
"continue" : "stop"); + + while (head != NULL) { + int rc2; + + pt = head; + head = head->cip_next; + + rc2 = cfs_ptask_wait_for(&pt->cip_task); + LASSERTF(!rc2, "wait for task error: %d\n", rc2); + + rc2 = cfs_ptask_result(&pt->cip_task); + CDEBUG(D_VFSTRACE, + "done %s range: [%llu, %llu) ret: %zd, rc: %d\n", + pt->cip_iot == CIT_READ ? "read" : "write", + pt->cip_pos, pt->cip_pos + pt->cip_count, + pt->cip_result, rc2); + if (rc2) + rc = rc ? rc : rc2; + if (!short_io) { + if (!rc2) /* IO is done by this task successfully */ + io->ci_nob += pt->cip_result; + if (pt->cip_result < pt->cip_count) { + /* short IO happened. + * Not necessary to be an error */ + CDEBUG(D_VFSTRACE, + "incomplete range: [%llu, %llu) " + "last_chunk_count: %zu\n", + pt->cip_pos, + pt->cip_pos + pt->cip_count, + last_chunk_count); + io->ci_nob -= last_chunk_count; + short_io = true; + } + } + OBD_FREE(pt, sizeof(*pt)); + } + + CDEBUG(D_VFSTRACE, "return nob: %zu (%s io), rc: %d\n", + io->ci_nob, short_io ? "short" : "full", rc); + + RETURN(rc < 0 ? rc : io->ci_result); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LASSERT(cl_page_is_vmlocked(env, page)); + LINVRNT(plist->pl_owner == current); + + ENTRY; + list_del_init(&page->cp_batch); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. 
+ */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * Moves a page from one page list to the head of another list. + */ +void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", + src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move_head); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +/** + * Disowns pages in a queue. + */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", + plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. 
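+ *
+ * For the common single-page case, cl_2queue_init_page() below performs
+ * the cl_2queue_init() and cl_2queue_add() steps in one call.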
+ */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_page_list_add(&queue->c2_qin, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top() + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct cl_object *scan; + ENTRY; + + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_req_attr_set != NULL) + scan->co_ops->coo_req_attr_set(env, scan, attr); + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* cl_sync_io_callback assumes the caller must call cl_sync_io_wait() to + * wait for the IO to finish. */ +void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor) +{ + wake_up_all(&anchor->csi_waitq); + + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); +} +EXPORT_SYMBOL(cl_sync_io_end); + +/** + * Initialize synchronous io wait anchor + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nr, + void (*end)(const struct lu_env *, struct cl_sync_io *)) +{ + ENTRY; + memset(anchor, 0, sizeof(*anchor)); + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nr); + atomic_set(&anchor->csi_barrier, nr > 0); + anchor->csi_sync_rc = 0; + anchor->csi_end_io = end; + LASSERT(end != NULL); + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all IO completes. Transfer completion routine has to call + * cl_sync_io_note() for every entity. 
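+ *
+ * Typical anchor life cycle (a sketch modelled on cl_io_submit_sync()
+ * above; "nr" stands for however many transfers the caller starts):
+ *
+ *	cl_sync_io_init(anchor, nr, cl_sync_io_end);
+ *	... start nr transfers, each completion (or skipped page) calling
+ *	    cl_sync_io_note(env, anchor, ioret) ...
+ *	rc = cl_sync_io_wait(env, anchor, timeout);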
+ */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + ENTRY; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("IO failed: %d, still wait for %d remaining entries\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, + int ioret) +{ + ENTRY; + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + LASSERT(anchor->csi_end_io != NULL); + anchor->csi_end_io(env, anchor); + /* Can't access anchor any more */ + } + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c new file mode 100644 index 0000000000000..e92dbaf4fda68 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c @@ -0,0 +1,288 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "cl_internal.h" + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p (%p/%d) at %s():%d\n", + prefix, lock, env, h->coh_nesting, func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__) + +/** + * Adds lock slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "destroy lock", lock); + + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + POISON(lock, 0x5a, sizeof(*lock)); + EXIT; +} +EXPORT_SYMBOL(cl_lock_fini); + +int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, + const struct cl_io *io) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object *scan; + int result = 0; + ENTRY; + + /* Make sure cl_lock::cll_descr is initialized. */ + LASSERT(obj != NULL); + + INIT_LIST_HEAD(&lock->cll_layers); + list_for_each_entry(scan, &obj->co_lu.lo_header->loh_layers, + co_lu.lo_linkage) { + result = scan->co_ops->coo_lock_init(env, scan, lock, io); + if (result != 0) { + cl_lock_fini(env, lock); + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_init); + +/** + * Returns a slice with a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Enqueue a lock. + * \param anchor: if we need to wait for resources before getting the lock, + * use @anchor for the purpose. 
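+ *	(cl_lock_request() below sets up such an anchor for CEF_ASYNC
+ *	enqueues and waits on it once the enqueue has been issued)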
+ * \retval 0 enqueue successfully + * \retval <0 error code + */ +int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock, struct cl_sync_io *anchor) +{ + const struct cl_lock_slice *slice; + int rc = -ENOSYS; + + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue == NULL) + continue; + + rc = slice->cls_ops->clo_enqueue(env, slice, io, anchor); + if (rc != 0) + break; + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_enqueue); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +int cl_lock_request(const struct lu_env *env, struct cl_io *io, + struct cl_lock *lock) +{ + struct cl_sync_io *anchor = NULL; + __u32 enq_flags = lock->cll_descr.cld_enq_flags; + int rc; + ENTRY; + + rc = cl_lock_init(env, lock, io); + if (rc < 0) + RETURN(rc); + + if ((enq_flags & CEF_ASYNC) && !(enq_flags & CEF_AGL)) { + anchor = &cl_env_info(env)->clt_anchor; + cl_sync_io_init(anchor, 1, cl_sync_io_end); + } + + rc = cl_lock_enqueue(env, io, lock, anchor); + + if (anchor != NULL) { + int rc2; + + /* drop the reference count held at initialization time */ + cl_sync_io_note(env, anchor, 0); + rc2 = cl_sync_io_wait(env, anchor, 0); + if (rc2 < 0 && rc == 0) + rc = rc2; + } + + if (rc < 0) + cl_lock_release(env, lock); + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_cancel(env, lock); + cl_lock_fini(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char * const names[] = { + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + CLASSERT(CLM_MAX == ARRAY_SIZE(names)); + return names[mode]; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + (*printer)(env, cookie, "lock@%p", lock); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c new file mode 100644 index 0000000000000..ddf97fc2cf057 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -0,0 +1,1107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +/* + * Locking. + * + * i_mutex + * PG_locked + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +/* class_put_type() */ +#include +#include +#include +#include /* for cfs_hash stuff */ +#include +#include +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_page_bufsize = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + lu_object_header_fini(&h->coh_lu); +} + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. 
+ * + * \see cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_update(). + */ +void cl_object_attr_lock(struct cl_object *o) +__acquires(cl_object_attr_guard(o)) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +__releases(cl_object_attr_guard(o)) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_upd_attr() on every layer, bottom + * to top. + */ +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + assert_spin_locked(cl_object_attr_guard(obj)); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_update != NULL) { + result = obj->co_ops->coo_attr_update(env, obj, attr, + v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_update); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. 
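+ * The fields in question are the ost_lvb size, mtime, atime, ctime and
+ * blocks counters; cl_attr2lvb() and cl_lvb2attr() below give the mapping
+ * between them and struct cl_attr.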
+ * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. + */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Prunes caches of pages and locks for this object. + */ +int cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *top; + struct cl_object *o; + int result; + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(o, &top->loh_layers, co_lu.lo_linkage) { + if (o->co_ops->coo_prune != NULL) { + result = o->co_ops->coo_prune(env, o); + if (result != 0) + break; + } + } + + RETURN(result); +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Get stripe information of this object. + */ +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *uarg, size_t size) +{ + struct lu_object_header *top; + int result = 0; + ENTRY; + + top = obj->co_lu.lo_header; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_getstripe != NULL) { + result = obj->co_ops->coo_getstripe(env, obj, uarg, + size); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_getstripe); + +/** + * Get fiemap extents from file object. 
+ * + * \param env [in] lustre environment + * \param obj [in] file object + * \param key [in] fiemap request argument + * \param fiemap [out] fiemap extents mapping retrived + * \param buflen [in] max buffer length of @fiemap + * + * \retval 0 success + * \retval < 0 error + */ +int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *key, + struct fiemap *fiemap, size_t *buflen) +{ + struct lu_object_header *top; + int result = 0; + ENTRY; + + top = obj->co_lu.lo_header; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_fiemap != NULL) { + result = obj->co_ops->coo_fiemap(env, obj, key, fiemap, + buflen); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_fiemap); + +int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, + struct cl_layout *cl) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_layout_get != NULL) + return obj->co_ops->coo_layout_get(env, obj, cl); + } + + RETURN(-EOPNOTSUPP); +} +EXPORT_SYMBOL(cl_object_layout_get); + +loff_t cl_object_maxbytes(struct cl_object *obj) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + loff_t maxbytes = LLONG_MAX; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_maxbytes != NULL) + maxbytes = min_t(loff_t, obj->co_ops->coo_maxbytes(obj), + maxbytes); + } + + RETURN(maxbytes); +} +EXPORT_SYMBOL(cl_object_maxbytes); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr = cl_object_header(obj); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); +} +EXPORT_SYMBOL(cl_object_kill); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +static int cache_stats_print(const struct cache_stats *cs, + struct seq_file *m, int h) +{ + int i; + + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +static void cl_env_percpu_refill(void); + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + size_t i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + cl_env_percpu_refill(); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). 
+ */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + size_t i; + +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_seq_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: ony on RedHat kernel. + * - ... + * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * If there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +static unsigned cl_envs_cached_max = 32; /* XXX: prototype: arbitrary limit + * for now. */ +static struct cl_env_cache { + rwlock_t cec_guard; + unsigned cec_count; + struct list_head cec_envs; +} *cl_envs = NULL; + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. 
+ */ + void *ce_debug; +}; + +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING +#define CL_ENV_INC(counter) atomic_inc(&cl_env_stats.cs_stats[CS_##counter]) + +#define CL_ENV_DEC(counter) do { \ + LASSERT(atomic_read(&cl_env_stats.cs_stats[CS_##counter]) > 0); \ + atomic_dec(&cl_env_stats.cs_stats[CS_##counter]); \ +} while (0) +#else +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) +#endif + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + CL_ENV_INC(busy); +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(create); + CL_ENV_INC(total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static struct lu_env *cl_env_obtain(void *debug) +{ + struct cl_env *cle; + struct lu_env *env; + int cpu = get_cpu(); + + ENTRY; + + read_lock(&cl_envs[cpu].cec_guard); + LASSERT(equi(cl_envs[cpu].cec_count == 0, + list_empty(&cl_envs[cpu].cec_envs))); + if (cl_envs[cpu].cec_count > 0) { + int rc; + + cle = container_of(cl_envs[cpu].cec_envs.next, struct cl_env, + ce_linkage); + list_del_init(&cle->ce_linkage); + cl_envs[cpu].cec_count--; + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + + env = &cle->ce_lu; + rc = lu_env_refill(env); + if (rc == 0) { + cl_env_init0(cle, debug); + lu_context_enter(&env->le_ctx); + lu_context_enter(&cle->ce_ses); + } else { + cl_env_fini(cle); + env = ERR_PTR(rc); + } + } else { + read_unlock(&cl_envs[cpu].cec_guard); + put_cpu(); + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, debug); + } + RETURN(env); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * Allocations are amortized through the global cache of environments. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(__u16 *refcheck) +{ + struct lu_env *env; + + env = cl_env_obtain(__builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. 
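+ *
+ * The environment is released with cl_env_put(), passing the same
+ * refcheck counter that the allocation call filled in. A minimal usage
+ * sketch (shown for cl_env_get(); cl_env_alloc() follows the same
+ * pattern):
+ *
+ *	__u16 refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (IS_ERR(env))
+ *		return PTR_ERR(env);
+ *	... use env ...
+ *	cl_env_put(env, &refcheck);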
+ * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(__u16 *refcheck, __u32 tags) +{ + struct lu_env *env; + + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Finalizes and frees a given number of cached environments. This is done to + * (1) free some memory (not currently hooked into VM), or (2) release + * references to modules. + */ +unsigned cl_env_cache_purge(unsigned nr) +{ + struct cl_env *cle; + unsigned i; + + ENTRY; + for_each_possible_cpu(i) { + write_lock(&cl_envs[i].cec_guard); + for (; !list_empty(&cl_envs[i].cec_envs) && nr > 0; --nr) { + cle = container_of(cl_envs[i].cec_envs.next, + struct cl_env, ce_linkage); + list_del_init(&cle->ce_linkage); + LASSERT(cl_envs[i].cec_count > 0); + cl_envs[i].cec_count--; + write_unlock(&cl_envs[i].cec_guard); + + cl_env_fini(cle); + write_lock(&cl_envs[i].cec_guard); + } + LASSERT(equi(cl_envs[i].cec_count == 0, + list_empty(&cl_envs[i].cec_envs))); + write_unlock(&cl_envs[i].cec_guard); + } + RETURN(nr); +} +EXPORT_SYMBOL(cl_env_cache_purge); + +/** + * Release an environment. + * + * Decrement \a env reference counter. When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, __u16 *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + int cpu = get_cpu(); + + CL_ENV_DEC(busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. + */ + if (cl_envs[cpu].cec_count < cl_envs_cached_max && + (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD && + (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) { + read_lock(&cl_envs[cpu].cec_guard); + list_add(&cle->ce_linkage, &cl_envs[cpu].cec_envs); + cl_envs[cpu].cec_count++; + read_unlock(&cl_envs[cpu].cec_guard); + } else + cl_env_fini(cle); + put_cpu(); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + ENTRY; + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; + EXIT; +} + +/** + * Converts struct ost_lvb to struct cl_attr. 
+ * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + ENTRY; + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_lvb2attr); + +static struct cl_env cl_env_percpu[NR_CPUS]; + +static int cl_env_percpu_init(void) +{ + struct cl_env *cle; + int tags = LCT_REMEMBER | LCT_NOREF; + int i, j; + int rc = 0; + + for_each_possible_cpu(i) { + struct lu_env *env; + + rwlock_init(&cl_envs[i].cec_guard); + INIT_LIST_HEAD(&cl_envs[i].cec_envs); + cl_envs[i].cec_count = 0; + + cle = &cl_env_percpu[i]; + env = &cle->ce_lu; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + rc = lu_env_init(env, LCT_CL_THREAD | tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, LCT_SESSION | tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + } else { + lu_env_fini(env); + } + } + if (rc != 0) + break; + } + if (rc != 0) { + /* Indices 0 to i (excluding i) were correctly initialized, + * thus we must uninitialize up to i, the rest are undefined. */ + for (j = 0; j < i; j++) { + cle = &cl_env_percpu[j]; + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } + } + + return rc; +} + +static void cl_env_percpu_fini(void) +{ + int i; + + for_each_possible_cpu(i) { + struct cl_env *cle = &cl_env_percpu[i]; + + lu_context_exit(&cle->ce_ses); + lu_context_fini(&cle->ce_ses); + lu_env_fini(&cle->ce_lu); + } +} + +static void cl_env_percpu_refill(void) +{ + int i; + + for_each_possible_cpu(i) + lu_env_refill(&cl_env_percpu[i].ce_lu); +} + +void cl_env_percpu_put(struct lu_env *env) +{ + struct cl_env *cle; + int cpu; + + cpu = smp_processor_id(); + cle = cl_env_container(env); + LASSERT(cle == &cl_env_percpu[cpu]); + + cle->ce_ref--; + LASSERT(cle->ce_ref == 0); + + CL_ENV_DEC(busy); + cle->ce_debug = NULL; + + put_cpu(); +} +EXPORT_SYMBOL(cl_env_percpu_put); + +struct lu_env *cl_env_percpu_get() +{ + struct cl_env *cle; + + cle = &cl_env_percpu[get_cpu()]; + cl_env_init0(cle, __builtin_return_address(0)); + + return &cle->ce_lu; +} +EXPORT_SYMBOL(cl_env_percpu_get); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. + * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). 
+ */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl, struct cl_thread_info); + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +struct cfs_ptask_engine *cl_io_engine; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + OBD_ALLOC(cl_envs, sizeof(*cl_envs) * num_possible_cpus()); + if (cl_envs == NULL) + GOTO(out, result = -ENOMEM); + + result = lu_kmem_init(cl_object_caches); + if (result) + GOTO(out_envs, result); + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + GOTO(out_kmem, result); + + result = cl_env_percpu_init(); + if (result) /* no cl_env_percpu_fini on error */ + GOTO(out_keys, result); + + cl_io_engine = cfs_ptengine_init("clio", cpu_online_mask); + if (IS_ERR(cl_io_engine)) { + result = PTR_ERR(cl_io_engine); + cl_io_engine = NULL; + GOTO(out_percpu, result); + } + + return 0; + +out_percpu: + cl_env_percpu_fini(); +out_keys: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_envs: + OBD_FREE(cl_envs, sizeof(*cl_envs) * num_possible_cpus()); +out: + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + cfs_ptengine_fini(cl_io_engine); + cl_io_engine = NULL; + cl_env_percpu_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + OBD_FREE(cl_envs, sizeof(*cl_envs) * num_possible_cpus()); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c new file mode 100644 index 0000000000000..74f9225ec1d59 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -0,0 +1,1141 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); + +#ifdef LIBCFS_DEBUG +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) +#else /* !LIBCFS_DEBUG */ +# define PASSERT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +# define PINVRNT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ + +/* Disable page statistic by default due to huge performance penalty. */ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING +#define CS_PAGE_INC(o, item) \ + atomic_inc(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) +#define CS_PAGE_DEC(o, item) \ + atomic_dec(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) +#define CS_PAGESTATE_INC(o, state) \ + atomic_inc(&cl_object_site(o)->cs_pages_state[state]) +#define CS_PAGESTATE_DEC(o, state) \ + atomic_dec(&cl_object_site(o)->cs_pages_state[state]) +#else +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) +#endif + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. + * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + ENTRY; + + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + ENTRY; + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + if (unlikely(slice->cpl_ops->cpo_fini != NULL)) + slice->cpl_ops->cpo_fini(env, slice); + } + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); + EXIT; +} + +/** + * Helper function updating page state. 
This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. */ + *(enum cl_page_state *)&page->cp_state = state; +} + +struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + ENTRY; + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + GFP_NOFS); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + page->cp_obj = o; + cl_object_get(o); + lu_object_ref_add_at(&o->co_lu, &page->cp_obj_ref, "cl_page", + page); + page->cp_vmpage = vmpage; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, page, + ind); + if (result != 0) { + cl_page_delete0(env, page); + cl_page_free(env, page); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + RETURN(page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +struct cl_page *cl_page_find(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page = NULL; + struct cl_object_header *hdr; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + CS_PAGE_INC(o, lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + if (page != NULL) { + CS_PAGE_INC(o, hit); + RETURN(page); + } + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + RETURN(page); +} +EXPORT_SYMBOL(cl_page_find); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + return cl_page_in_use_noref(pg); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. 
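+	 *
+	 * For instance, a typical write-out pass (illustrative only) walks
+	 * CPS_CACHED -> CPS_OWNED -> CPS_PAGEOUT -> CPS_CACHED, and finally
+	 * CPS_FREEING on eviction; every step has a 1 in the matrix below,
+	 * while any 0 entry trips the PASSERT() further down.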
+ */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner != NULL)); + + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + CS_PAGESTATE_INC(page->cp_obj, state); + cl_page_state_set_trust(page, state); + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *page; + + ENTRY; + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. 
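+	 *
+	 * The expected calling pattern therefore mirrors the fast path in
+	 * cl_page_find() above (sketch only):
+	 *
+	 *	KLASSERT(PageLocked(vmpage));
+	 *	page = cl_vmpage_page(vmpage, obj);
+	 *	if (page != NULL)
+	 *		... a reference has already been taken ...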
+ */ + + page = (struct cl_page *)vmpage->private; + if (page != NULL) { + cl_page_get_trust(page); + LASSERT(page->cp_type == CPT_CACHEABLE); + } + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + enum cl_page_state state; + + ENTRY; + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg) || state == CPS_FREEING); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_disown != NULL) + (*slice->cpl_ops->cpo_disown)(env, slice, io); + } + + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + struct cl_io *top = cl_io_top((struct cl_io *)io); + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == top); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. + * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result = 0; + const struct cl_page_slice *slice; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + ENTRY; + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + goto out; + } + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_own) + result = (*slice->cpl_ops->cpo_own)(env, slice, + io, nonblock); + + if (result != 0) + break; + + } + if (result > 0) + result = 0; + + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = cl_io_top(io); + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + +out: + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + RETURN(result); +} + +/** + * Own a page, might be blocked. 
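+ *
+ * A minimal usage sketch (illustrative; error handling and the actual work
+ * on the page are elided):
+ *
+ *	if (cl_page_own(env, io, pg) == 0) {
+ *		... operate on the owned page ...
+ *		cl_page_disown(env, io, pg);
+ *	}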
+ * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + ENTRY; + io = cl_io_top(io); + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_assume != NULL) + (*slice->cpl_ops->cpo_assume)(env, slice, io); + } + + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = cl_io_top(io); + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_unassume != NULL) + (*slice->cpl_ops->cpo_unassume)(env, slice, io); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io) || + pg->cp_state == CPS_FREEING); + + ENTRY; + io = cl_io_top(io); + cl_page_disown0(env, io, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. + * + * Calls cl_page_operations::cpo_discard() top-to-bottom. + * + * \pre cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_discard() + */ +void cl_page_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_discard != NULL) + (*slice->cpl_ops->cpo_discard)(env, slice, io); + } +} +EXPORT_SYMBOL(cl_page_discard); + +/** + * Version of cl_page_delete() that can be called for not fully constructed + * pages, e.g. in an error handling cl_page_find()->cl_page_delete0() + * path. Doesn't check page invariant. 
+ */ +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg) +{ + const struct cl_page_slice *slice; + + ENTRY; + + PASSERT(env, pg, pg->cp_state != CPS_FREEING); + + /* + * Severe all ways to obtain new pointers to @pg. + */ + cl_page_owner_clear(pg); + cl_page_state_set0(env, pg, CPS_FREEING); + + list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_delete != NULL) + (*slice->cpl_ops->cpo_delete)(env, slice); + } + + EXIT; +} + +/** + * Called when a decision is made to throw page out of memory. + * + * Notifies all layers about page destruction by calling + * cl_page_operations::cpo_delete() method top-to-bottom. + * + * Moves page into cl_page_state::CPS_FREEING state (this is the only place + * where transition to this state happens). + * + * Eliminates all venues through which new references to the page can be + * obtained: + * + * - removes page from the radix trees, + * + * - breaks linkage from VM page to cl_page. + * + * Once page reaches cl_page_state::CPS_FREEING, all remaining references will + * drain after some time, at which point page will be recycled. + * + * \pre VM page is locked + * \post pg->cp_state == CPS_FREEING + * + * \see cl_page_operations::cpo_delete() + */ +void cl_page_delete(const struct lu_env *env, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + ENTRY; + cl_page_delete0(env, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_delete); + +/** + * Marks page up-to-date. + * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + const struct cl_page_slice *slice; + + PINVRNT(env, pg, cl_page_invariant(pg)); + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_export != NULL) + (*slice->cpl_ops->cpo_export)(env, slice, uptodate); + } +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. + */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + const struct cl_page_slice *slice; + int result; + + ENTRY; + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + RETURN(result == -EBUSY); +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + ENTRY; + RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN); +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. + */ + ENTRY; + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); + EXIT; +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). 
Layer
+ * handling interactions with the VM also has to inform VM that page is under
+ * transfer now.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page *pg, enum cl_req_type crt)
+{
+	const struct cl_page_slice *slice;
+	int result = 0;
+
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	PINVRNT(env, pg, crt < CRT_NR);
+
+	/*
+	 * XXX this has to be called bottom-to-top, so that llite can set up
+	 * PG_writeback without risking other layers deciding to skip this
+	 * page.
+	 */
+	if (crt >= CRT_NR)
+		return -EINVAL;
+
+	list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->cpo_own)
+			result = (*slice->cpl_ops->io[crt].cpo_prep)(env,
+								     slice,
+								     io);
+
+		if (result != 0)
+			break;
+
+	}
+
+	if (result >= 0) {
+		result = 0;
+		cl_page_io_start(env, pg, crt);
+	}
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+	return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part, has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for the VFS/VM interaction, runs last
+ * and can release locks safely.
+ *
+ * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+			struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+	const struct cl_page_slice *slice;
+	struct cl_sync_io *anchor = pg->cp_sync_io;
+
+	PASSERT(env, pg, crt < CRT_NR);
+	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+	ENTRY;
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+	cl_page_state_set(env, pg, CPS_CACHED);
+	if (crt >= CRT_NR)
+		return;
+
+	list_for_each_entry_reverse(slice, &pg->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->io[crt].cpo_completion != NULL)
+			(*slice->cpl_ops->io[crt].cpo_completion)(env, slice,
+								  ioret);
+	}
+
+	if (anchor != NULL) {
+		LASSERT(pg->cp_sync_io == anchor);
+		pg->cp_sync_io = NULL;
+		cl_sync_io_note(env, anchor, ioret);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that transfer formation engine decided to yank this page from
+ * the cache and to make it a part of a transfer.
+ *
+ * \pre pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+		       enum cl_req_type crt)
+{
+	const struct cl_page_slice *sli;
+	int result = 0;
+
+	PINVRNT(env, pg, crt < CRT_NR);
+
+	ENTRY;
+	if (crt >= CRT_NR)
+		RETURN(-EINVAL);
+
+	list_for_each_entry(sli, &pg->cp_layers, cpl_linkage) {
+		if (sli->cpl_ops->io[crt].cpo_make_ready != NULL)
+			result = (*sli->cpl_ops->io[crt].cpo_make_ready)(env,
+									 sli);
+		if (result != 0)
+			break;
+	}
+
+	if (result >= 0) {
+		result = 0;
+		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+		cl_page_io_start(env, pg, crt);
+	}
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Called when a page is being written back at the kernel's initiative.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+		  struct cl_page *pg)
+{
+	const struct cl_page_slice *slice;
+	int result = 0;
+
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+
+	list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->cpo_flush != NULL)
+			result = (*slice->cpl_ops->cpo_flush)(env, slice, io);
+		if (result != 0)
+			break;
+	}
+	if (result > 0)
+		result = 0;
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+		  int from, int to)
+{
+	const struct cl_page_slice *slice;
+
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+	list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->cpo_clip != NULL)
+			(*slice->cpl_ops->cpo_clip)(env, slice, from, to);
+	}
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints human readable representation of \a pg to the \a f.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t printer, const struct cl_page *pg)
+{
+	(*printer)(env, cookie,
+		   "page@%p[%d %p %d %d %p]\n",
+		   pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+		   pg->cp_state, pg->cp_type,
+		   pg->cp_owner);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints human readable representation of \a pg to the \a f.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+		   lu_printer_t printer, const struct cl_page *pg)
+{
+	const struct cl_page_slice *slice;
+	int result = 0;
+
+	cl_page_header_print(env, cookie, printer, pg);
+	list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->cpo_print != NULL)
+			result = (*slice->cpl_ops->cpo_print)(env, slice,
+							      cookie, printer);
+		if (result != 0)
+			break;
+	}
+	(*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+	const struct cl_page_slice *slice;
+	int result = 0;
+
+	list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+		if (slice->cpl_ops->cpo_cancel != NULL)
+			result = (*slice->cpl_ops->cpo_cancel)(env, slice);
+		if (result != 0)
+			break;
+	}
+	if (result > 0)
+		result = 0;
+
+	return result;
+}
+
+/**
+ * Converts a page index within object \a obj into a byte offset.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+	return (loff_t)idx << PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+	return offset >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+size_t cl_page_size(const struct cl_object *obj)
+{
+	return 1UL << PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
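+ *
+ * An illustrative sketch of how a layer's coo_page_init() method typically
+ * uses this helper (the foo_* names are invented for the example; how the
+ * layer carves its private slice out of the page buffer is up to the layer):
+ *
+ *	static int foo_page_init(const struct lu_env *env,
+ *				 struct cl_object *obj,
+ *				 struct cl_page *page, pgoff_t index)
+ *	{
+ *		struct foo_page *fp = ...layer-private storage...;
+ *
+ *		cl_page_slice_add(page, &fp->fp_cl, obj, index,
+ *				  &foo_page_ops);
+ *		return 0;
+ *	}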
+ * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, pgoff_t index, + const struct cl_page_operations *ops) +{ + ENTRY; + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_index = index; + slice->cpl_ops = ops; + slice->cpl_page = page; + EXIT; +} +EXPORT_SYMBOL(cl_page_slice_add); + +/** + * Allocate and initialize cl_cache, called by ll_init_sbi(). + */ +struct cl_client_cache *cl_cache_init(unsigned long lru_page_max) +{ + struct cl_client_cache *cache = NULL; + + ENTRY; + OBD_ALLOC(cache, sizeof(*cache)); + if (cache == NULL) + RETURN(NULL); + + /* Initialize cache data */ + atomic_set(&cache->ccc_users, 1); + cache->ccc_lru_max = lru_page_max; + atomic_long_set(&cache->ccc_lru_left, lru_page_max); + spin_lock_init(&cache->ccc_lru_lock); + INIT_LIST_HEAD(&cache->ccc_lru); + + /* turn unstable check off by default as it impacts performance */ + cache->ccc_unstable_check = 0; + atomic_long_set(&cache->ccc_unstable_nr, 0); + init_waitqueue_head(&cache->ccc_unstable_waitq); + + RETURN(cache); +} +EXPORT_SYMBOL(cl_cache_init); + +/** + * Increase cl_cache refcount + */ +void cl_cache_incref(struct cl_client_cache *cache) +{ + atomic_inc(&cache->ccc_users); +} +EXPORT_SYMBOL(cl_cache_incref); + +/** + * Decrease cl_cache refcount and free the cache if refcount=0. + * Since llite, lov and osc all hold cl_cache refcount, + * the free will not cause race. (LU-6173) + */ +void cl_cache_decref(struct cl_client_cache *cache) +{ + if (atomic_dec_and_test(&cache->ccc_users)) + OBD_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(cl_cache_decref); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c new file mode 100644 index 0000000000000..913ae54465b4f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -0,0 +1,713 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include +# include +#endif /* HAVE_SERVER_SUPPORT */ +#include +#include "llog_internal.h" + +#ifdef CONFIG_PROC_FS +static __u64 obd_max_alloc; +#else +__u64 obd_max_alloc; +#endif + +static DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. */ +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned long obd_max_dirty_pages; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_long_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* bulk transfer timeout, give up after 100s by default */ +unsigned int bulk_timeout = 100; /* seconds */ +EXPORT_SYMBOL(bulk_timeout); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +atomic_long_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; + +#ifdef CONFIG_PROC_FS +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); +#endif + +char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; + +/* Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * TODO: + * It's better to cache the jobid for later use if there is any + * efficient way, the cl_env code probably could be reused for this + * purpose. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/api. Then, the jobid must be cached. + */ +int lustre_get_jobid(char *jobid) +{ + int jobid_len = LUSTRE_JOBID_SIZE; + char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 }; + int rc = 0; + ENTRY; + + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + GOTO(out, rc = 0); + + /* Whole node dedicated to single job */ + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); + GOTO(out, rc = 0); + } + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", + current_comm(), + from_kuid(&init_user_ns, current_fsuid())); + GOTO(out, rc = 0); + } + + rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len); + if (rc) { + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. 
That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static bool printed; + if (unlikely(!printed)) { + LCONSOLE_ERROR_MSG(0x16b, "%s value too large " + "for JobID buffer (%d)\n", + obd_jobid_var, jobid_len); + printed = true; + } + } else { + CDEBUG((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "Get jobid for (%s) failed: rc = %d\n", + obd_jobid_var, rc); + } + } + +out: + if (rc != 0) + RETURN(rc); + + /* Only replace the job ID if it changed. */ + if (strcmp(jobid, tmp_jobid) != 0) + memcpy(jobid, tmp_jobid, jobid_len); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_get_jobid); + +static int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + ENTRY; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data*)arg; + libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void __user *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + case OBD_GET_VERSION: { + static bool warned; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if (strlen(LUSTRE_VERSION_STRING) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + if (!warned) { + warned = true; + CWARN("%s: ioctl(OBD_GET_VERSION) is deprecated, " + "use llapi_get_version_string() and/or relink\n", + current->comm); + } + memcpy(data->ioc_bulk, LUSTRE_VERSION_STRING, + strlen(LUSTRE_VERSION_STRING) + 1); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + GOTO(out, err); + } +#endif + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. 
+ */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. + */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + if (copy_to_user((void __user *)arg, data, sizeof(*data))) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + + GOTO(out, err); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + switch(cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + GOTO(out, err = -ENODEV); + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + obd->obd_no_transno = 1; + GOTO(out, err = 0); + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + if (copy_to_user((void __user *)arg, data, len)) + err = -EFAULT; + GOTO(out, err); + } + } + +out: + OBD_FREE_LARGE(buf, len); + RETURN(err); +} /* class_handle_ioctl */ + +#define OBD_INIT_CHECK +#ifdef OBD_INIT_CHECK +static int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val 
OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div(%#llx,256) %llu != %llu\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), "%#llx", u64val); + if (len != 18) { + CWARN("u64 hex wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%llu", u64val); + if (len != 20) { + CWARN("u64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), "%lld", u64val); + if (len != 2) { + CWARN("s64 wrong length! strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) { + CWARN("mask failed: u64val %llu >= %llu\n", u64val, + (__u64)PAGE_SIZE); + ret = -EINVAL; + } + + return ret; +} +#else +#define obd_init_checks() do {} while(0) +#endif + +static int __init obdclass_init(void) +{ + int err; + + LCONSOLE_INFO("Lustre: Build Version: "LUSTRE_VERSION_STRING"\n"); + + libcfs_kkuc_init(); + + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + +#ifdef CONFIG_PROC_FS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + return -ENOMEM; + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); +#endif + err = obd_zombie_impexp_init(); + if (err) + goto cleanup_obd_memory; + + err = class_handle_init(); + if (err) + goto cleanup_zombie_impexp; + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register OBD miscdevice: err = %d\n", err); + goto cleanup_class_handle; + } + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). 
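+	 * Note: 512 << (20 - PAGE_SHIFT) is 512 MiB expressed in pages, so
+	 * hosts with at most 512 MiB of RAM fall back to the smaller
+	 * 1/4-of-memory cap below.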
*/ + if (totalram_pages <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = totalram_pages / 4; + else + obd_max_dirty_pages = totalram_pages / 2; + + err = obd_init_caches(); + if (err) + goto cleanup_deregister; + + err = class_procfs_init(); + if (err) + goto cleanup_caches; + + err = lu_global_init(); + if (err) + goto cleanup_class_procfs; + + err = cl_global_init(); + if (err != 0) + goto cleanup_lu_global; + +#ifdef HAVE_SERVER_SUPPORT + err = dt_global_init(); + if (err != 0) + goto cleanup_cl_global; + + err = lu_ucred_global_init(); + if (err != 0) + goto cleanup_dt_global; +#endif /* HAVE_SERVER_SUPPORT */ + + err = llog_info_init(); + if (err) +#ifdef HAVE_SERVER_SUPPORT + goto cleanup_lu_ucred_global; +#else /* !HAVE_SERVER_SUPPORT */ + goto cleanup_cl_global; +#endif /* HAVE_SERVER_SUPPORT */ + + err = lustre_register_fs(); + + /* simulate a late OOM situation now to require all + * alloc'ed/initialized resources to be freed */ + if (OBD_FAIL_CHECK(OBD_FAIL_OBDCLASS_MODULE_LOAD)) { + /* fake error but filesystem has been registered */ + lustre_unregister_fs(); + /* force error to ensure module will be unloaded/cleaned */ + err = -ENOMEM; + } + + if (err) + goto cleanup_llog_info; + + return 0; + +cleanup_llog_info: + llog_info_fini(); + +#ifdef HAVE_SERVER_SUPPORT +cleanup_lu_ucred_global: + lu_ucred_global_fini(); + +cleanup_dt_global: + dt_global_fini(); +#endif /* HAVE_SERVER_SUPPORT */ + +cleanup_cl_global: + cl_global_fini(); + +cleanup_lu_global: + lu_global_fini(); + +cleanup_class_procfs: + obd_sysctl_clean(); + class_procfs_clean(); + +cleanup_caches: + obd_cleanup_caches(); + +cleanup_deregister: + misc_deregister(&obd_psdev); + +cleanup_class_handle: + class_handle_cleanup(); + +cleanup_zombie_impexp: + obd_zombie_impexp_stop(); + +cleanup_obd_memory: +#ifdef CONFIG_PROC_FS + lprocfs_free_stats(&obd_memory); +#endif + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max; + + max = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max > obd_max_alloc) + obd_max_alloc = max; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef CONFIG_PROC_FS +__u64 obd_memory_max(void) +{ + __u64 ret; + + obd_update_maxusage(); + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static void __exit obdclass_exit(void) +{ +#ifdef CONFIG_PROC_FS + __u64 memory_leaked; + __u64 memory_max; +#endif /* CONFIG_PROC_FS */ + ENTRY; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + llog_info_fini(); +#ifdef HAVE_SERVER_SUPPORT + lu_ucred_global_fini(); + dt_global_fini(); +#endif /* HAVE_SERVER_SUPPORT */ + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + obd_sysctl_clean(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_del_uuid(NULL); /* Delete all UUIDs. */ + obd_zombie_impexp_stop(); + +#ifdef CONFIG_PROC_FS + memory_leaked = obd_memory_sum(); + memory_max = obd_memory_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: %llu, leaked: %llu\n", + memory_max, memory_leaked); +#endif /* CONFIG_PROC_FS */ + + EXIT; +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Class Driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdclass_init); +module_exit(obdclass_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/debug.c b/drivers/staging/lustrefsx/lustre/obdclass/debug.c new file mode 100644 index 0000000000000..bfa1bad3dcb4a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/debug.c @@ -0,0 +1,106 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include +#include +#include + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset=%lld, len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->lnb_len, nb->lnb_page, nb->lnb_rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->lnb_page ? 
page_index(nb->lnb_page) : -1); +} + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu off: %#llx != " + "%#llx\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end off: %#llx != " + "%#llx\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id %#llx offset %llu end id: %#llx != " + "%#llx\n", who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c new file mode 100644 index 0000000000000..a8f144fd4c0e0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -0,0 +1,1097 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. 
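+ *
+ * Example round trip for the block_debug_setup()/block_debug_check()
+ * helpers in debug.c above (illustrative sketch only; the buffer size
+ * and the "demo" tag are made-up values, not part of this code):
+ *
+ *     char buf[4096];
+ *     block_debug_setup(buf, sizeof(buf), 1024, 0x1234);
+ *     rc = block_debug_check("demo", buf, sizeof(buf), 1024, 0x1234);
+ *
+ * setup stamps (offset, id) at both ends of the buffer; check returns 0
+ * when neither stamp has been overwritten and -EINVAL otherwise.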
+ * Generic functions from dt_object.h + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +/* fid_be_to_cpu() */ +#include +#include +#include +#include + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; + +/* no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + struct thandle *dtc_th = th; + + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + + /* Usually dt_txn_hook_start is called from bottom device, + * and if the thandle has th_top, then we need use top + * thandle for the callback in the top thandle layer */ + if (th->th_top != NULL) + dtc_th = th->th_top; + + rc = cb->dtc_txn_start(env, dtc_th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) +{ + struct dt_device *dev = th->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (th->th_local) + return 0; + + if (OBD_FAIL_CHECK(OBD_FAIL_DT_TXN_STOP)) + return -EIO; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + struct thandle *dtc_th = th; + + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + + /* Usually dt_txn_hook_stop is called from bottom device, + * and if the thandle has th_top, then we need use top + * thandle for the callback in the top thandle layer */ + if (th->th_top != NULL) + dtc_th = th->th_top; + + rc = cb->dtc_txn_stop(env, dtc_th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +void dt_txn_hook_commit(struct thandle *th) +{ + struct dt_txn_callback *cb; + + if (th->th_local) + return; + + list_for_each_entry(cb, &th->th_dev->dd_txn_callbacks, + dtc_linkage) { + /* Right now, the bottom device (OSD) will use this hook + * commit to notify OSP, so we do not check and replace + * the thandle to top thandle now */ + if (cb->dtc_txn_commit) + cb->dtc_txn_commit(th, cb->dtc_cookie); + } +} +EXPORT_SYMBOL(dt_txn_hook_commit); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} +EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ 
+ lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LASSERTF(0, "invalid mode %o\n", mode); + result = 0; /* Just for satisfying compiler. */ + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev, + const struct lu_object_conf *conf) +{ + struct lu_object *lo; + struct lu_object *n; + + lo = lu_object_find_at(env, top_dev, fid, conf); + if (IS_ERR(lo)) + return ERR_PTR(PTR_ERR(lo)); + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find an object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, + void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int rc; + + rc = dt_lookup_dir(env, obj, entry, fid); + dt_object_put(env, obj); + if (rc == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + rc = PTR_ERR(obj); + } + dfh->dfh_o = obj; + + return rc; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
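+ *
+ * Usage sketch (illustrative only; the callback and the path below are
+ * made up). The path buffer is modified in place by strsep(), so it
+ * must be writable:
+ *
+ *     static int print_entry(const struct lu_env *env, const char *e,
+ *                            void *data)
+ *     {
+ *             CDEBUG(D_OTHER, "component: %s\n", e);
+ *             return 0;
+ *     }
+ *
+ *     char path[] = "ROOT/fld/seq";
+ *     rc = dt_path_parser(env, path, print_entry, NULL);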
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, info->dti_buf, + dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. + * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, + const char *dirname, const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, filename, fid); + dt_object_put(env, dir); + } else { + file = dir; + } + + return file; +} + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + dt_object_put(env, dto); + dto = ERR_PTR(rc); + } + + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
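+ *
+ * Minimal call sketch (illustrative only; "env" is a valid lu_env and
+ * "obj" an already-located dt_object, as elsewhere in this file):
+ *
+ *     char data[128];
+ *     struct lu_buf lb = { .lb_buf = data, .lb_len = sizeof(data) };
+ *     loff_t pos = 0;
+ *     rc = dt_read(env, obj, &lb, &pos);
+ *
+ * On success rc is the number of bytes read and pos has been advanced;
+ * a negative value is an errno from the underlying dbo_read method.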
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + size = dt->do_body_ops->dbo_read(env, dt, buf, pos); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + ssize_t size; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, 1); + if (size < 0) + return size; + return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck layout orphan */ +const struct dt_index_features dt_lfsck_layout_orphan_features = { + .dif_flags = 0, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v2), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v2), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); + +/* lfsck layout dangling */ +const struct dt_index_features dt_lfsck_layout_dangling_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lfsck_layout_dangling_key), + .dif_keysize_max = sizeof(struct lfsck_layout_dangling_key), + .dif_recsize_min = sizeof(struct lu_fid), + .dif_recsize_max = sizeof(struct lu_fid), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_layout_dangling_features); + +/* lfsck namespace */ +const struct dt_index_features dt_lfsck_namespace_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_namespace_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + 
.dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* nodemap files, nodemap_rec size asserted in nodemap_storage.c */ +const struct dt_index_features dt_nodemap_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit nodemap/record id */ + .dif_keysize_max = sizeof(__u64), /* 64-bit nodemap/record id */ + .dif_recsize_min = sizeof(union nodemap_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(union nodemap_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_nodemap_features); + +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. This is used by OBD_IDX_READ RPC */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq == FID_SEQ_LAYOUT_RBTREE){ + return &dt_lfsck_layout_orphan_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + size_t nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + size_t size; + int rc; + ENTRY; + + if (nob < LIP_HDR_SIZE) + return -EINVAL; + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize + ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); + + entry = lip->lip_entries; + do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + __u16 keysize; + __u16 recsize; + + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr 
!= 0) + GOTO(out, rc = 0); + } + + if (nob < size) { + if (lip->lip_nr == 0) + GOTO(out, rc = -EINVAL); + GOTO(out, rc = 0); + } + + if (!(ii->ii_flags & II_FL_NOHASH)) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + if (ii->ii_flags & II_FL_VARKEY) + keysize = iops->key_size(env, it); + else + keysize = ii->ii_keysize; + + if (!(ii->ii_flags & II_FL_NOKEY)) { + /* then the key value */ + key = iops->key(env, it); + memcpy(tmp_entry, key, keysize); + tmp_entry += keysize; + } + + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + GOTO(out, rc); + + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); + else + recsize = ii->ii_recsize; + + entry = tmp_entry + recsize; + nob -= size; + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller. + * If NULL, uses dt_index_page_build + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + if (filler == NULL) + filler = dt_index_page_build; + + nob = rdpg->rp_count; + if (nob == 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } else { + if (rc == -ENODATA) + rc = 0; + GOTO(out, rc); + } + + /* Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. 
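+ *
+ * Call sketch (illustrative only; "obj", "rdpg" and "ii" are assumed to
+ * be prepared by the caller as in dt_index_read() below):
+ *
+ *     rc = dt_index_walk(env, obj, rdpg, NULL, ii);
+ *
+ * A NULL filler selects dt_index_page_build() above, which packs
+ * hash/key/record tuples (subject to ii_flags) into each 4KB
+ * lu_idxpage; the return value is the total size in bytes of the
+ * filled containers, or a negative errno.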
*/ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(size_t, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + +out: + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(size_t, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* rp_count shouldn't be null and should be a multiple of the container + * size */ + if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (!fid_is_quota(&ii->ii_fid) && !fid_is_layout_rbtree(&ii->ii_fid) && + !fid_is_norm(&ii->ii_fid)) + RETURN(-EOPNOTSUPP); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= (II_FL_NOHASH | II_FL_NOKEY | II_FL_VARKEY | + II_FL_VARREC); + + if (!(feat->dif_flags & DT_IND_VARKEY)) + ii->ii_keysize = feat->dif_keysize_max; + + if (!(feat->dif_flags & DT_IND_VARREC)) + ii->ii_recsize = feat->dif_recsize_max; + + if (feat->dif_flags & DT_IND_NONUNQ) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + if (!fid_is_layout_rbtree(&ii->ii_fid)) { + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + } + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii); + if (!fid_is_layout_rbtree(&ii->ii_fid)) + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + GOTO(out, rc); +out: + dt_object_put(env, obj); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef CONFIG_PROC_FS +int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%u\n", (unsigned) 
osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_blksize_seq_show); + +int lprocfs_dt_kbytestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytestotal_seq_show); + +int lprocfs_dt_kbytesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesfree_seq_show); + +int lprocfs_dt_kbytesavail_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_kbytesavail_seq_show); + +int lprocfs_dt_filestotal_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filestotal_seq_show); + +int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) +{ + struct dt_device *dt = m->private; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); + +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c new file mode 100644 index 0000000000000..7fb129d889900 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -0,0 +1,2520 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
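+ *
+ * A worked example for the lprocfs_dt_kbytes*_seq_show() helpers at the
+ * end of dt_object.c above (illustrative numbers only): with
+ * os_bsize = 4096, blk_size = 4096 >> 10 = 4, and the shift loop
+ *
+ *     while (blk_size >>= 1)
+ *             result <<= 1;
+ *
+ * doubles the block count twice, i.e. it reports os_blocks * 4 KiB.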
+ * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(obd_types_lock); +static LIST_HEAD(obd_types); +DEFINE_RWLOCK(obd_dev_lock); +static struct obd_device *obd_devs[MAX_OBD_DEVICES]; + +static struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +static struct kmem_cache *import_cachep; + +static LIST_HEAD(obd_zombie_imports); +static LIST_HEAD(obd_zombie_exports); +static DEFINE_SPINLOCK(obd_zombie_impexp_lock); + +static void obd_zombie_impexp_notify(void); +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks, int debug_level); + +static LIST_HEAD(obd_stale_exports); +static DEFINE_SPINLOCK(obd_stale_export_lock); +static atomic_t obd_stale_export_num = ATOMIC_INIT(0); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + +#ifdef HAVE_MODULE_LOADING_SUPPORT + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } +#endif + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} + +#define CLASS_MAX_NAME 1024 + +int class_register_type(struct 
obd_ops *dt_ops, struct md_ops *md_ops, + bool enable_proc, struct lprocfs_vars *vars, + const char *name, struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc = 0; + ENTRY; + + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(rc); + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + GOTO (failed, rc); + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + +#ifdef CONFIG_PROC_FS + if (enable_proc) { + type->typ_procroot = lprocfs_register(type->typ_name, + proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO(failed, rc); + } + } +#endif + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + GOTO (failed, rc); + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + RETURN (0); + +failed: + if (type->typ_name != NULL) { +#ifdef CONFIG_PROC_FS + if (type->typ_procroot != NULL) + remove_proc_subtree(type->typ_name, proc_lustre_root); +#endif + OBD_FREE(type->typ_name, strlen(name) + 1); + } + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + RETURN(-EBUSY); + } + + /* we do not use type->typ_procroot as for compatibility purposes + * other modules can share names (i.e. lod can use lov entry). so + * we can't reference pointer as it can get invalided when another + * module removes the entry */ +#ifdef CONFIG_PROC_FS + if (type->typ_procroot != NULL) + remove_proc_subtree(type->typ_name, proc_lustre_root); + if (type->typ_procsym != NULL) + lprocfs_remove(&type->typ_procsym); +#endif + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(0); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Allocate the new obd_device and initialize it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. 
+ * \param[in] uuid obd device UUID + * + * \retval newdev pointer to created obd_device + * \retval ERR_PTR(errno) on error + */ +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid) +{ + struct obd_device *newdev; + struct obd_type *type = NULL; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) { + class_put_type(type); + RETURN(ERR_PTR(-ENOMEM)); + } + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); + newdev->obd_type = type; + newdev->obd_minor = -1; + + rwlock_init(&newdev->obd_pool_lock); + newdev->obd_pool_limit = 0; + newdev->obd_pool_slv = 0; + + INIT_LIST_HEAD(&newdev->obd_exports); + INIT_LIST_HEAD(&newdev->obd_unlinked_exports); + INIT_LIST_HEAD(&newdev->obd_delayed_exports); + INIT_LIST_HEAD(&newdev->obd_exports_timed); + INIT_LIST_HEAD(&newdev->obd_nid_stats); + spin_lock_init(&newdev->obd_nid_lock); + spin_lock_init(&newdev->obd_dev_lock); + mutex_init(&newdev->obd_dev_mutex); + spin_lock_init(&newdev->obd_osfs_lock); + /* newdev->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + newdev->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&newdev->obd_observer_link_sem); + /* recovery data */ + init_timer(&newdev->obd_recovery_timer); + spin_lock_init(&newdev->obd_recovery_task_lock); + init_waitqueue_head(&newdev->obd_next_transno_waitq); + init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&newdev->obd_req_replay_queue); + INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); + INIT_LIST_HEAD(&newdev->obd_final_req_queue); + INIT_LIST_HEAD(&newdev->obd_evict_list); + INIT_LIST_HEAD(&newdev->obd_lwp_list); + + llog_group_init(&newdev->obd_olg); + /* Detach drops this */ + atomic_set(&newdev->obd_refcount, 1); + lu_ref_init(&newdev->obd_reference); + lu_ref_add(&newdev->obd_reference, "newdev", newdev); + + newdev->obd_conn_inprogress = 0; + + strncpy(newdev->obd_uuid.uuid, uuid, strlen(uuid)); + + CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", + newdev->obd_name, newdev); + + return newdev; +} + +/** + * Free obd device. 
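+ *
+ * Lifecycle sketch (illustrative only; the type/name/uuid strings are
+ * made up): a device is allocated by class_newdev() with one reference
+ * and published in obd_devs[] by class_register_device():
+ *
+ *     struct obd_device *obd = class_newdev("osc", "demo-osc", "demo_uuid");
+ *     if (IS_ERR(obd))
+ *             return PTR_ERR(obd);
+ *     rc = class_register_device(obd);
+ *
+ * The initial reference is dropped at detach time, and class_free_dev()
+ * itself runs only once the reference count has reached zero.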
+ * + * \param[in] obd obd_device to be freed + * + * \retval none + */ +void class_free_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " + "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERTF(atomic_read(&obd->obd_refcount) == 0, + "obd_refcount should be 0, not %d\n", + atomic_read(&obd->obd_refcount)); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", + obd->obd_name, obd->obd_type->typ_name); + + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + if (obd->obd_stopping) { + int err; + + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + + obd_device_free(obd); + + class_put_type(obd_type); +} + +/** + * Unregister obd device. + * + * Free slot in obd_dev[] used by \a obd. + * + * \param[in] new_obd obd_device to be unregistered + * + * \retval none + */ +void class_unregister_device(struct obd_device *obd) +{ + write_lock(&obd_dev_lock); + if (obd->obd_minor >= 0) { + LASSERT(obd_devs[obd->obd_minor] == obd); + obd_devs[obd->obd_minor] = NULL; + obd->obd_minor = -1; + } + write_unlock(&obd_dev_lock); +} + +/** + * Register obd device. + * + * Find free slot in obd_devs[], fills it with \a new_obd. + * + * \param[in] new_obd obd_device to be registered + * + * \retval 0 success + * \retval -EEXIST device with this name is registered + * \retval -EOVERFLOW obd_devs[] is full + */ +int class_register_device(struct obd_device *new_obd) +{ + int ret = 0; + int i; + int new_obd_minor = 0; + bool minor_assign = false; + bool retried = false; + +again: + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd != NULL && + (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { + + if (!retried) { + write_unlock(&obd_dev_lock); + + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". 
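+ * In other words, a just-removed device with the same name may still be
+ * queued for destruction, so flush the zombie lists once with
+ * obd_zombie_barrier() and rescan before concluding the name is really
+ * in use and returning -EEXIST.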
+ */ + obd_zombie_barrier(); + retried = true; + goto again; + } + + CERROR("%s: already exists, won't add\n", + obd->obd_name); + /* in case we found a free slot before duplicate */ + minor_assign = false; + ret = -EEXIST; + break; + } + if (!minor_assign && obd == NULL) { + new_obd_minor = i; + minor_assign = true; + } + } + + if (minor_assign) { + new_obd->obd_minor = new_obd_minor; + LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " + "%p\n", new_obd_minor, obd_devs[new_obd_minor]); + obd_devs[new_obd_minor] = new_obd; + } else { + if (ret == 0) { + ret = -EOVERFLOW; + CERROR("%s: all %u/%u devices used, increase " + "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, + i, class_devno_max(), ret); + } + } + write_unlock(&obd_dev_lock); + + RETURN(ret); +} + +static int class_name2dev_nolock(const char *name) +{ + int i; + + if (!name) + return -1; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + return i; + } + break; + } + } + + return -1; +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + i = class_name2dev_nolock(name); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev_nolock(struct obd_uuid *uuid) +{ + int i; + + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + return i; + } + } + + return -1; +} + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + i = class_uuid2dev_nolock(uuid); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from ::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} + +/** + * Find obd in obd_dev[] by name or uuid. + * + * Increment obd's refcount if found. 
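+ *
+ * Usage sketch (the device name below is made up); the reference taken
+ * here must be released with class_decref() using the same "find" tag:
+ *
+ *     struct obd_device *obd = class_dev_by_str("lustre-MDT0000");
+ *     if (obd != NULL) {
+ *             ... use obd ...
+ *             class_decref(obd, "find", current);
+ *     }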
+ * + * \param[in] str obd name or uuid + * + * \retval NULL if not found + * \retval target pointer to found obd_device + */ +struct obd_device *class_dev_by_str(const char *str) +{ + struct obd_device *target = NULL; + struct obd_uuid tgtuuid; + int rc; + + obd_str2uuid(&tgtuuid, str); + + read_lock(&obd_dev_lock); + rc = class_uuid2dev_nolock(&tgtuuid); + if (rc < 0) + rc = class_name2dev_nolock(str); + + if (rc >= 0) + target = class_num2obd(rc); + + if (target != NULL) + class_incref(target, "find", current); + read_unlock(&obd_dev_lock); + + RETURN(target); +} +EXPORT_SYMBOL(class_dev_by_str); + +/** + * Get obd devices count. Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); + return; +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + specified, then only the client with that uuid is returned, + otherwise any client connected to the tgt is returned. */ +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, typ_name, + strlen(typ_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + searching at *next, and if a device is found, the next index to look + at is saved in *next. If next is NULL, then the first matching device + will always be returned. */ +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. 
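+ *
+ * Call sketch (illustrative only, using a made-up filesystem name):
+ *
+ *     rc = class_notify_sptlrpc_conf("lustre", strlen("lustre"));
+ *
+ * This walks obd_devs[] and pushes KEY_SPTLRPC_CONF via
+ * obd_set_info_async() to every matching mdc/osc/osp/lwp/mdt/ost device.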
+ */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, osp, lwp, mdt, ost + * because only these have a -sptlrpc llog */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_OSP_NAME) != 0 && + strcmp(type, LUSTRE_LWP_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } + + EXIT; +} + +int obd_init_caches(void) +{ + int rc; + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, NULL); + if (!obd_device_cachep) + GOTO(out, rc = -ENOMEM); + + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + GOTO(out, rc = -ENOMEM); + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + GOTO(out, rc = -ENOMEM); + + RETURN(0); +out: + obd_cleanup_caches(); + RETURN(rc); +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); + export = class_handle2object(conn->cookie, NULL); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} + +/* Export management functions */ +static void 
class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */ + if (exp->exp_connection) + ptlrpc_put_connection_superhack(exp->exp_connection); + + LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_uncommitted_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); + LASSERT(list_empty(&exp->exp_hp_rpcs)); + obd_destroy_export(exp); + /* self export doesn't hold a reference to an obd, although it + * exists until freeing of the obd */ + if (exp != obd->obd_self_export) + class_decref(obd, "export", exp); + + OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); + EXIT; +} + +static void export_handle_addref(void *export) +{ + class_export_get(export); +} + +static struct portals_handle_ops export_handle_ops = { + .hop_addref = export_handle_addref, + .hop_free = NULL, +}; + +struct obd_export *class_export_get(struct obd_export *exp) +{ + atomic_inc(&exp->exp_refcount); + CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount)); + return exp; +} +EXPORT_SYMBOL(class_export_get); + +void class_export_put(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount) - 1); + + if (atomic_dec_and_test(&exp->exp_refcount)) { + struct obd_device *obd = exp->exp_obd; + + CDEBUG(D_IOCTL, "final put %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + /* release nid stat refererence */ + lprocfs_exp_cleanup(exp); + + if (exp == obd->obd_self_export) { + /* self export should be destroyed without + * zombie thread as it doesn't hold a + * reference to obd and doesn't hold any + * resources */ + class_export_destroy(exp); + /* self export is destroyed, no class + * references exist and it is safe to free + * obd */ + class_free_dev(obd); + } else { + LASSERT(!list_empty(&exp->exp_obd_chain)); + obd_zombie_export_add(exp); + } + + } +} +EXPORT_SYMBOL(class_export_put); +/* Creates a new export, adds it to the hash table, and returns a + * pointer to it. The refcount is 2: one for the hash reference, and + * one for the pointer returned by this function. 
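+ *
+ * Caller-side sketch (illustrative only; "obd" and "cluuid" are assumed
+ * to be valid). The returned pointer owns one of the two references and
+ * is released with class_export_put():
+ *
+ *     struct obd_export *exp = class_new_export(obd, &cluuid);
+ *     if (IS_ERR(exp))
+ *             return PTR_ERR(exp);
+ *     ... hand exp to the request handlers ...
+ *     class_export_put(exp);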
*/ +struct obd_export *__class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid, bool is_self) +{ + struct obd_export *export; + struct cfs_hash *hash = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + /* 2 = class_handle_hash + last */ + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + INIT_LIST_HEAD(&export->exp_reg_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = cfs_time_current_sec(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + INIT_HLIST_NODE(&export->exp_gen_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + INIT_LIST_HEAD(&export->exp_stale_list); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + obd->obd_name, cluuid->uuid, rc); + GOTO(exit_err, rc = -EALREADY); + } + } + + at_init(&export->exp_bl_lock_at, obd_timeout, 0); + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + if (hash) + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ESHUTDOWN); + } + + if (!is_self) { + class_incref(obd, "export", export); + list_add_tail(&export->exp_obd_chain_timed, + &obd->obd_exports_timed); + list_add(&export->exp_obd_chain, &obd->obd_exports); + obd->obd_num_exports++; + } else { + INIT_LIST_HEAD(&export->exp_obd_chain_timed); + INIT_LIST_HEAD(&export->exp_obd_chain); + } + spin_unlock(&obd->obd_dev_lock); + if (hash) + cfs_hash_putref(hash); + RETURN(export); + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); +exit_err: + if (hash) + cfs_hash_putref(hash); + class_handle_unhash(&export->exp_handle); + LASSERT(hlist_unhashed(&export->exp_uuid_hash)); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} + +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, false); +} +EXPORT_SYMBOL(class_new_export); + +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, true); +} + +void class_unlink_export(struct obd_export *exp) +{ + 
class_handle_unhash(&exp->exp_handle); + + if (exp->exp_obd->obd_self_export == exp) { + class_export_put(exp); + return; + } + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (!hlist_unhashed(&exp->exp_uuid_hash)) + cfs_hash_del(exp->exp_obd->obd_uuid_hash, + &exp->exp_client_uuid, + &exp->exp_uuid_hash); + +#ifdef HAVE_SERVER_SUPPORT + if (!hlist_unhashed(&exp->exp_gen_hash)) { + struct tg_export_data *ted = &exp->exp_target_data; + struct cfs_hash *hash; + + /* Because obd_gen_hash will not be released until + * class_cleanup(), so hash should never be NULL here */ + hash = cfs_hash_getref(exp->exp_obd->obd_gen_hash); + LASSERT(hash != NULL); + cfs_hash_del(hash, &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + cfs_hash_putref(hash); + } +#endif /* HAVE_SERVER_SUPPORT */ + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + atomic_inc(&obd_stale_export_num); + + /* A reference is kept by obd_stale_exports list */ + obd_stale_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +static void class_import_destroy(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT_ATOMIC_ZERO(&imp->imp_refcount); + + ptlrpc_put_connection_superhack(imp->imp_connection); + + while (!list_empty(&imp->imp_conn_list)) { + struct obd_import_conn *imp_conn; + + imp_conn = list_entry(imp->imp_conn_list.next, + struct obd_import_conn, oic_item); + list_del_init(&imp_conn->oic_item); + ptlrpc_put_connection_superhack(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); + EXIT; +} + +static void import_handle_addref(void *import) +{ + class_import_get(import); +} + +static struct portals_handle_ops import_handle_ops = { + .hop_addref = import_handle_addref, + .hop_free = NULL, +}; + +struct obd_import *class_import_get(struct obd_import *import) +{ + atomic_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + ENTRY; + + LASSERT(list_empty(&imp->imp_zombie_chain)); + LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + atomic_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (atomic_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + /* catch possible import put race */ + LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); + EXIT; +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. 
(But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + struct pid_namespace *curr_pid_ns = ll_task_pid_ns(current); + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_committed_list); + INIT_LIST_HEAD(&imp->imp_unreplied_list); + imp->imp_known_replied_xid = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + + if (curr_pid_ns->child_reaper) + imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; + else + imp->imp_sec_refpid = 1; + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. */ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + class_handle_unhash(&import->imp_handle); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, " + "mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. 
This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. */ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +static void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if ((exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + + spin_lock(&exp->exp_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + exp->exp_req_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + exp->exp_lock_replay_needed = 0; + + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } + spin_unlock(&exp->exp_lock); +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; + /* We hold references of export for uuid hash + * and nid_hash and export link at least. So + * it is safe to call cfs_hash_del in there. */ + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). 
*/ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + GOTO(no_disconn, already_disconnected); + } + + CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n", + export->exp_handle.h_cookie); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + int connected = 0; + + if (exp) { + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0) && !exp->exp_failed; + spin_unlock(&exp->exp_lock); + } + return connected; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. */ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at %ld\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? "" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0, D_HA); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + struct cfs_hash *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative " + "request\n", obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const 
char *uuid) +{ + struct cfs_hash *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks, int debug_level) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(debug_level, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: " + "%p %s %llu stale:%d\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), + exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed, !list_empty(&exp->exp_stale_list)); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks, int debug_level) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks, debug_level); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks, debug_level); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks, debug_level); + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks, debug_level); + spin_unlock(&obd_zombie_impexp_lock); +} + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(waited)); + if (waited > 5 && is_power_of_2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " + "more than %d seconds. " + "The obd refcount = %d. 
Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1, D_CONSOLE | D_WARNING); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/* Total amount of zombies to be destroyed */ +static int zombies_count = 0; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + ENTRY; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); + EXIT; +} + +static DECLARE_COMPLETION(obd_zombie_start); +static DECLARE_COMPLETION(obd_zombie_stop); +static unsigned long obd_zombie_flags; +static DECLARE_WAIT_QUEUE_HEAD(obd_zombie_waitq); +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + RETURN(rc); +} + +/** + * Add export to the obd_zombe thread and notify it. + */ +static void obd_zombie_export_add(struct obd_export *exp) { + atomic_dec(&obd_stale_export_num); + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * Add import to the obd_zombe thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zomebie_impexp_thread get this notification. 
+ * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +struct obd_export *obd_stale_export_get(void) +{ + struct obd_export *exp = NULL; + ENTRY; + + spin_lock(&obd_stale_export_lock); + if (!list_empty(&obd_stale_exports)) { + exp = list_entry(obd_stale_exports.next, + struct obd_export, exp_stale_list); + list_del_init(&exp->exp_stale_list); + } + spin_unlock(&obd_stale_export_lock); + + if (exp) { + CDEBUG(D_DLMTRACE, "Get export %p: total %d\n", exp, + atomic_read(&obd_stale_export_num)); + } + RETURN(exp); +} +EXPORT_SYMBOL(obd_stale_export_get); + +void obd_stale_export_put(struct obd_export *exp) +{ + ENTRY; + + LASSERT(list_empty(&exp->exp_stale_list)); + if (exp->exp_lock_hash && + atomic_read(&exp->exp_lock_hash->hs_count)) { + CDEBUG(D_DLMTRACE, "Put export %p: total %d\n", exp, + atomic_read(&obd_stale_export_num)); + + spin_lock_bh(&exp->exp_bl_list_lock); + spin_lock(&obd_stale_export_lock); + /* Add to the tail if there is no blocked locks, + * to the head otherwise. */ + if (list_empty(&exp->exp_bl_list)) + list_add_tail(&exp->exp_stale_list, + &obd_stale_exports); + else + list_add(&exp->exp_stale_list, + &obd_stale_exports); + + spin_unlock(&obd_stale_export_lock); + spin_unlock_bh(&exp->exp_bl_list_lock); + } else { + class_export_put(exp); + } + EXIT; +} +EXPORT_SYMBOL(obd_stale_export_put); + +/** + * Adjust the position of the export in the stale list, + * i.e. move to the head of the list if is needed. + **/ +void obd_stale_export_adjust(struct obd_export *exp) +{ + LASSERT(exp != NULL); + spin_lock_bh(&exp->exp_bl_list_lock); + spin_lock(&obd_stale_export_lock); + + if (!list_empty(&exp->exp_stale_list) && + !list_empty(&exp->exp_bl_list)) + list_move(&exp->exp_stale_list, &obd_stale_exports); + + spin_unlock(&obd_stale_export_lock); + spin_unlock_bh(&exp->exp_bl_list_lock); +} +EXPORT_SYMBOL(obd_stale_export_adjust); + +/** + * destroy zombie export/import thread. + */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. 
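+ * obd_zombie_is_idle() re-checks zombies_count under
+ * obd_zombie_impexp_lock before the barrier returns.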
+ */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + RETURN(0); +} + + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + struct task_struct *task; + + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + + wait_for_completion(&obd_zombie_start); + RETURN(0); +} +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); + LASSERT(list_empty(&obd_stale_exports)); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr * kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} +EXPORT_SYMBOL(kuc_free); + +struct obd_request_slot_waiter { + struct list_head orsw_entry; + wait_queue_head_t orsw_waitq; + bool orsw_signaled; +}; + +static bool obd_request_slot_avail(struct client_obd *cli, + struct obd_request_slot_waiter *orsw) +{ + bool avail; + + spin_lock(&cli->cl_loi_list_lock); + avail = !!list_empty(&orsw->orsw_entry); + spin_unlock(&cli->cl_loi_list_lock); + + return avail; +}; + +/* + * For network flow control, the RPC sponsor needs to acquire a credit + * before sending the RPC. The credits count for a connection is defined + * by the "cl_max_rpcs_in_flight". If all the credits are occpuied, then + * the subsequent RPC sponsors need to wait until others released their + * credits, or the administrator increased the "cl_max_rpcs_in_flight". + */ +int obd_get_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter orsw; + struct l_wait_info lwi; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_r_in_flight++; + spin_unlock(&cli->cl_loi_list_lock); + return 0; + } + + init_waitqueue_head(&orsw.orsw_waitq); + list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list); + orsw.orsw_signaled = false; + spin_unlock(&cli->cl_loi_list_lock); + + lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(orsw.orsw_waitq, + obd_request_slot_avail(cli, &orsw) || + orsw.orsw_signaled, + &lwi); + + /* Here, we must take the lock to avoid the on-stack 'orsw' to be + * freed but other (such as obd_put_request_slot) is using it. 
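+ * Once orsw_entry has been removed from cl_loi_read_list no other
+ * thread can reference the on-stack 'orsw' and it is safe to return.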
*/ + spin_lock(&cli->cl_loi_list_lock); + if (rc != 0) { + if (!orsw.orsw_signaled) { + if (list_empty(&orsw.orsw_entry)) + cli->cl_r_in_flight--; + else + list_del(&orsw.orsw_entry); + } + } + + if (orsw.orsw_signaled) { + LASSERT(list_empty(&orsw.orsw_entry)); + + rc = -EINTR; + } + spin_unlock(&cli->cl_loi_list_lock); + + return rc; +} +EXPORT_SYMBOL(obd_get_request_slot); + +void obd_put_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter *orsw; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + + /* If there is free slot, wakeup the first waiter. */ + if (!list_empty(&cli->cl_loi_read_list) && + likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_entry(cli->cl_loi_read_list.next, + struct obd_request_slot_waiter, orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_r_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); +} +EXPORT_SYMBOL(obd_put_request_slot); + +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_rpcs_in_flight); + +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) +{ + struct obd_request_slot_waiter *orsw; + __u32 old; + int diff; + int i; + char *typ_name; + int rc; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + typ_name = cli->cl_import->imp_obd->obd_type->typ_name; + if (strcmp(typ_name, LUSTRE_MDC_NAME) == 0) { + /* adjust max_mod_rpcs_in_flight to ensure it is always + * strictly lower that max_rpcs_in_flight */ + if (max < 2) { + CERROR("%s: cannot set max_rpcs_in_flight to 1 " + "because it must be higher than " + "max_mod_rpcs_in_flight value", + cli->cl_import->imp_obd->obd_name); + return -ERANGE; + } + if (max <= cli->cl_max_mod_rpcs_in_flight) { + rc = obd_set_max_mod_rpcs_in_flight(cli, max - 1); + if (rc != 0) + return rc; + } + } + + spin_lock(&cli->cl_loi_list_lock); + old = cli->cl_max_rpcs_in_flight; + cli->cl_max_rpcs_in_flight = max; + diff = max - old; + + /* We increase the max_rpcs_in_flight, then wakeup some waiters. 
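+ * If the limit was raised, up to 'diff' waiters are woken up below,
+ * one for each newly added slot.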
*/ + for (i = 0; i < diff; i++) { + if (list_empty(&cli->cl_loi_read_list)) + break; + + orsw = list_entry(cli->cl_loi_read_list.next, + struct obd_request_slot_waiter, orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_r_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_rpcs_in_flight); + +__u16 obd_get_max_mod_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_mod_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); + +int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) +{ + struct obd_connect_data *ocd; + __u16 maxmodrpcs; + __u16 prev; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + /* cannot exceed or equal max_rpcs_in_flight */ + if (max >= cli->cl_max_rpcs_in_flight) { + CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " + "higher or equal to max_rpcs_in_flight value (%u)\n", + cli->cl_import->imp_obd->obd_name, + max, cli->cl_max_rpcs_in_flight); + return -ERANGE; + } + + /* cannot exceed max modify RPCs in flight supported by the server */ + ocd = &cli->cl_import->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + maxmodrpcs = ocd->ocd_maxmodrpcs; + else + maxmodrpcs = 1; + if (max > maxmodrpcs) { + CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " + "higher than max_mod_rpcs_per_client value (%hu) " + "returned by the server at connection\n", + cli->cl_import->imp_obd->obd_name, + max, maxmodrpcs); + return -ERANGE; + } + + spin_lock(&cli->cl_mod_rpcs_lock); + + prev = cli->cl_max_mod_rpcs_in_flight; + cli->cl_max_mod_rpcs_in_flight = max; + + /* wakeup waiters if limit has been increased */ + if (cli->cl_max_mod_rpcs_in_flight > prev) + wake_up(&cli->cl_mod_rpcs_waitq); + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); + + +#define pct(a, b) (b ? a * 100 / b : 0) +int obd_mod_rpc_stats_seq_show(struct client_obd *cli, + struct seq_file *seq) +{ + unsigned long mod_tot = 0, mod_cum; + struct timespec64 now; + int i; + + ktime_get_real_ts64(&now); + + spin_lock(&cli->cl_mod_rpcs_lock); + + seq_printf(seq, "snapshot_time: %llu.%9lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "modify_RPCs_in_flight: %hu\n", + cli->cl_mod_rpcs_in_flight); + + seq_printf(seq, "\n\t\t\tmodify\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %%\n"); + + mod_tot = lprocfs_oh_sum(&cli->cl_mod_rpcs_hist); + + mod_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; + mod_cum += mod; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu\n", + i, mod, pct(mod, mod_tot), + pct(mod_cum, mod_tot)); + if (mod_cum == mod_tot) + break; + } + + spin_unlock(&cli->cl_mod_rpcs_lock); + + return 0; +} +EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); +#undef pct + + +/* The number of modify RPCs sent in parallel is limited + * because the server has a finite number of slots per client to + * store request result and ensure reply reconstruction when needed. + * On the client, this limit is stored in cl_max_mod_rpcs_in_flight + * that takes into account server limit and cl_max_rpcs_in_flight + * value. + * On the MDC client, to avoid a potential deadlock (see Bugzilla 3462), + * one close request is allowed above the maximum. 
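+ * The availability check itself is implemented in
+ * obd_mod_rpc_slot_avail_locked() below.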
+ */ +static inline bool obd_mod_rpc_slot_avail_locked(struct client_obd *cli, + bool close_req) +{ + bool avail; + + /* A slot is available if + * - number of modify RPCs in flight is less than the max + * - it's a close RPC and no other close request is in flight + */ + avail = cli->cl_mod_rpcs_in_flight < cli->cl_max_mod_rpcs_in_flight || + (close_req && cli->cl_close_rpcs_in_flight == 0); + + return avail; +} + +static inline bool obd_mod_rpc_slot_avail(struct client_obd *cli, + bool close_req) +{ + bool avail; + + spin_lock(&cli->cl_mod_rpcs_lock); + avail = obd_mod_rpc_slot_avail_locked(cli, close_req); + spin_unlock(&cli->cl_mod_rpcs_lock); + return avail; +} + +static inline bool obd_skip_mod_rpc_slot(const struct lookup_intent *it) +{ + if (it != NULL && + (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_READDIR || + (it->it_op == IT_LAYOUT && !(it->it_flags & FMODE_WRITE)))) + return true; + return false; +} + +/* Get a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that is going to be sent + * and the intent @it of the operation if it applies. + * If the maximum number of modify RPCs in flight is reached + * the thread is put to sleep. + * Returns the tag to be set in the request message. Tag 0 + * is reserved for non-modifying requests. + */ +__u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it) +{ + struct l_wait_info lwi = LWI_INTR(NULL, NULL); + bool close_req = false; + __u16 i, max; + + /* read-only metadata RPCs don't consume a slot on MDT + * for reply reconstruction + */ + if (obd_skip_mod_rpc_slot(it)) + return 0; + + if (opc == MDS_CLOSE) + close_req = true; + + do { + spin_lock(&cli->cl_mod_rpcs_lock); + max = cli->cl_max_mod_rpcs_in_flight; + if (obd_mod_rpc_slot_avail_locked(cli, close_req)) { + /* there is a slot available */ + cli->cl_mod_rpcs_in_flight++; + if (close_req) + cli->cl_close_rpcs_in_flight++; + lprocfs_oh_tally(&cli->cl_mod_rpcs_hist, + cli->cl_mod_rpcs_in_flight); + /* find a free tag */ + i = find_first_zero_bit(cli->cl_mod_tag_bitmap, + max + 1); + LASSERT(i < OBD_MAX_RIF_MAX); + LASSERT(!test_and_set_bit(i, cli->cl_mod_tag_bitmap)); + spin_unlock(&cli->cl_mod_rpcs_lock); + /* tag 0 is reserved for non-modify RPCs */ + return i + 1; + } + spin_unlock(&cli->cl_mod_rpcs_lock); + + CDEBUG(D_RPCTRACE, "%s: sleeping for a modify RPC slot " + "opc %u, max %hu\n", + cli->cl_import->imp_obd->obd_name, opc, max); + + l_wait_event(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, close_req), &lwi); + } while (true); +} +EXPORT_SYMBOL(obd_get_mod_rpc_slot); + +/* Put a modify RPC slot from the obd client @cli according + * to the kind of operation @opc that has been sent and the + * intent @it of the operation if it applies. 
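+ * The @tag argument must be the value previously returned by
+ * obd_get_mod_rpc_slot() for the same operation.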
+ */ +void obd_put_mod_rpc_slot(struct client_obd *cli, __u32 opc, + struct lookup_intent *it, __u16 tag) +{ + bool close_req = false; + + if (obd_skip_mod_rpc_slot(it)) + return; + + if (opc == MDS_CLOSE) + close_req = true; + + spin_lock(&cli->cl_mod_rpcs_lock); + cli->cl_mod_rpcs_in_flight--; + if (close_req) + cli->cl_close_rpcs_in_flight--; + /* release the tag in the bitmap */ + LASSERT(tag - 1 < OBD_MAX_RIF_MAX); + LASSERT(test_and_clear_bit(tag - 1, cli->cl_mod_tag_bitmap) != 0); + spin_unlock(&cli->cl_mod_rpcs_lock); + wake_up(&cli->cl_mod_rpcs_waitq); +} +EXPORT_SYMBOL(obd_put_mod_rpc_slot); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c new file mode 100644 index 0000000000000..b45c6d6a55357 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -0,0 +1,171 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include +#include + +#define lustre_get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define lustre_put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. 
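+ * Returns 1 if @grp is present in @group_info, 0 otherwise.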
+ */ +static int lustre_groups_search(struct group_info *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - + from_kgid(&init_user_ns, CFS_GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist) +{ +#ifdef HAVE_GROUP_INFO_GID + memcpy(ginfo->gid, glist, ginfo->ngroups * sizeof(__u32)); +#else + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +#endif +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! */ +/* a simple shell-metzner sort */ +void lustre_groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, right)); + + while (left >= 0 && + tmp < from_kgid(&init_user_ns, + CFS_GROUP_AT(group_info, left))) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = + make_kgid(&init_user_ns, tmp); + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + struct group_info *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + lustre_get_group_info(group_info); + rc = lustre_groups_search(group_info, grp); + lustre_put_group_info(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c new file mode 100644 index 0000000000000..79d176dcd3d53 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -0,0 +1,261 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_KUC D_OTHER + +#include +#include + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + ssize_t count = kuch->kuc_msglen; + loff_t offset = 0; + int rc = 0; + + if (IS_ERR_OR_NULL(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + while (count > 0) { + rc = cfs_kernel_write(filp, payload, count, &offset); + if (rc < 0) + break; + count -= rc; + payload += rc; + rc = 0; + } + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group registration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + struct obd_uuid kr_uuid; + int kr_uid; + struct file *kr_fp; + char kr_data[0]; +}; + +static struct list_head kkuc_groups[KUC_GRP_MAX + 1]; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +static inline bool libcfs_kkuc_group_is_valid(int group) +{ + return 0 <= group && group < ARRAY_SIZE(kkuc_groups); +} + +void libcfs_kkuc_init(void) +{ + int group; + + for (group = 0; group < ARRAY_SIZE(kkuc_groups); group++) + INIT_LIST_HEAD(&kkuc_groups[group]); +} + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identifier for this receiver + * @param group group number + * @param data user data + */ +int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, + int uid, int group, void *data, size_t data_len) +{ + struct kkuc_reg *reg; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kzalloc(sizeof(*reg) + data_len, 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_uuid = *uuid; + reg->kr_fp = filp; + reg->kr_uid = uid; + memcpy(reg->kr_data, data, data_len); + + down_write(&kg_sem); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) +{ + struct kkuc_reg *reg, *next; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + if (uid == 0) { + /* Broadcast a shutdown message */ + 
struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(uuid, group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + (uid == 0 || uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + kfree(reg); + } + } + up_write(&kg_sem); + + RETURN(0); +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(const struct obd_uuid *uuid, int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + down_write(&kg_sem); + + if (unlikely(list_empty(&kkuc_groups[group])) || + unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_CT_REGISTER_NET))) { + /* no agent have fully registered, CDT will retry */ + up_write(&kg_sem); + RETURN(-EAGAIN); + } + + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && + reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_write(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg extra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(const struct obd_uuid *uuid, int group, + libcfs_kkuc_cb_t cb_func, void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + ENTRY; + + if (!libcfs_kkuc_group_is_valid(group)) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + RETURN(-EINVAL); + } + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (obd_uuid_equals(uuid, ®->kr_uuid) && reg->kr_fp != NULL) + rc = cb_func(reg->kr_data, cb_arg); + } + up_read(&kg_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c new file mode 100644 index 0000000000000..a1bcc3d7de608 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -0,0 +1,307 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. + * Use is subject to license terms. + * + * Author: Di Wang + */ + +#include +#include +#include + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_reccount = 0; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_overflow_time = 0; + ldata->ld_leh->leh_padding = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + /* individual entries are swabbed by linkea_entry_unpack() */ + } + + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + + if (leh->leh_reccount == 0 && leh->leh_overflow_time == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +int linkea_init_with_rec(struct linkea_data *ldata) +{ + int rc; + + rc = linkea_init(ldata); + if (!rc && ldata->ld_leh->leh_reccount == 0) + rc = -ENODATA; + + return rc; +} +EXPORT_SYMBOL(linkea_init_with_rec); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. 
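+ * The 16-bit record length is stored across lee_reclen[0..1].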
+ * Numbers are always big-endian + * \retval record length + */ +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + tmpfid = *pfid; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_MUL_REF)) + tmpfid.f_oid--; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + fid_cpu_to_be(&tmpfid, &tmpfid); + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} +EXPORT_SYMBOL(linkea_entry_pack); + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + LASSERT(lee != NULL); + + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + if (lname != NULL) { + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); + } +} +EXPORT_SYMBOL(linkea_entry_unpack); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct link_ea_header *leh = ldata->ld_leh; + int reclen; + + LASSERT(leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { + /* Use 32-bits to save the overflow time, although it will + * shrink the cfs_time_current_sec() returned 64-bits value + * to 32-bits value, it is still quite large and can be used + * for about 140 years. That is enough. 
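+ * A leh_overflow_time of zero means "no overflow", hence the bump
+ * to 1 below when the timestamp happens to be zero.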
*/ + leh->leh_overflow_time = cfs_time_current_sec(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + + CDEBUG(D_INODE, "No enough space to hold linkea entry '" + DFID": %.*s' at %u\n", PFID(pfid), lname->ln_namelen, + lname->ln_name, leh->leh_overflow_time); + return 0; + } + + if (leh->leh_len + reclen > ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + leh->leh_len + reclen) < 0) + return -ENOMEM; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + } + + ldata->ld_lee = ldata->ld_buf->lb_buf + leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + leh->leh_len += ldata->ld_reclen; + leh->leh_reccount++; + CDEBUG(D_INODE, "New link_ea name '"DFID":%.*s' is added\n", + PFID(pfid), lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) +{ + LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL); + LASSERT(ldata->ld_leh->leh_reccount > 0); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); + + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; +} +EXPORT_SYMBOL(linkea_del_buf); + +int linkea_links_new(struct linkea_data *ldata, struct lu_buf *buf, + const struct lu_name *cname, const struct lu_fid *pfid) +{ + int rc; + + rc = linkea_data_new(ldata, buf); + if (!rc) + rc = linkea_add_buf(ldata, cname, pfid); + + return rc; +} +EXPORT_SYMBOL(linkea_links_new); + +/** + * Mark the linkEA as overflow with current timestamp, + * and remove the last linkEA entry. + * + * Return the new linkEA size. + */ +int linkea_overflow_shrink(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + struct lu_name tname; + struct lu_fid tfid; + int count; + + leh = ldata->ld_leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_overflow_time = __swab32(leh->leh_overflow_time); + leh->leh_padding = __swab32(leh->leh_padding); + } + + LASSERT(leh->leh_reccount > 0); + + leh->leh_len = sizeof(struct link_ea_header); + leh->leh_reccount--; + if (unlikely(leh->leh_reccount == 0)) + return 0; + + leh->leh_overflow_time = cfs_time_current_sec(); + if (unlikely(leh->leh_overflow_time == 0)) + leh->leh_overflow_time++; + ldata->ld_reclen = 0; + ldata->ld_lee = (struct link_ea_entry *)(leh + 1); + for (count = 0; count < leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tname, &tfid); + leh->leh_len += ldata->ld_reclen; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, &tname, &tfid); + CDEBUG(D_INODE, "No enough space to hold the last linkea entry '" + DFID": %.*s', shrink it, left %d linkea entries, size %llu\n", + PFID(&tfid), tname.ln_namelen, tname.ln_name, + leh->leh_reccount, leh->leh_len); + + return leh->leh_len; +} +EXPORT_SYMBOL(linkea_overflow_shrink); + +/** + * Check if such a link exists in linkEA. 
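+ * On success ldata->ld_lee points at the matching entry and
+ * ldata->ld_reclen holds its record length.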
+ * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0, if leh_reccount == 0 we skip the loop and return -ENOENT */ + if (likely(ldata->ld_leh->leh_reccount > 0)) + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + ldata->ld_reclen = 0; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c new file mode 100644 index 0000000000000..dabbf58057caf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,582 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > BIT(30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + ENTRY; + + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. 
LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + ENTRY; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef ENABLE_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) { + len = sprintf(buf, "LBUG\n"); + healthy = false; + } + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + len = sprintf(buf, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if 
(healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_node)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_node); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + /* clear previous value */ + memset(obd_jobid_node, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_node, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_node[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_node[count - 1] = 0; + } + + return count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + 
return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct kobject *lustre_kobj; +EXPORT_SYMBOL_GPL(lustre_kobj); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + ENTRY; + + lustre_kobj = kobject_create_and_add("lustre", fs_kobj); + if (lustre_kobj == NULL) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(lustre_kobj, &lustre_attr_group); + if (rc) { + kobject_put(lustre_kobj); + goto out; + } + + rc = obd_sysctl_init(); + if (rc) { + kobject_put(lustre_kobj); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + if (IS_ERR_OR_NULL(debugfs_lustre_root)) { + rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) + : -ENOMEM; + debugfs_lustre_root = NULL; + kobject_put(lustre_kobj); + goto out; + } + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + if (IS_ERR_OR_NULL(file)) { + rc = file ? PTR_ERR(file) : -ENOMEM; + kobject_put(lustre_kobj); + goto out; + } + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + kobject_put(lustre_kobj); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + kobject_put(lustre_kobj); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 0000000000000..5f8e2b55d7258 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 0000000000000..19f95b8187ca9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,190 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + {__ATTR(name, 0644, \ + static_uintvalue_show, \ + static_uintvalue_store),\ + value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((totalram_pages / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); + +#ifdef HAVE_SERVER_SUPPORT +LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return 
sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static struct attribute *lustre_attrs[] = { + &lustre_sattr_timeout.u.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, +#endif + NULL, +}; + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +int obd_sysctl_init(void) +{ + return sysfs_create_group(lustre_kobj, &lustre_attr_group); +} + +void obd_sysctl_clean(void) +{ + sysfs_remove_group(lustre_kobj, &lustre_attr_group); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c new file mode 100644 index 0000000000000..61c9a1d1f4e8a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -0,0 +1,1359 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include +#include +#include "llog_internal.h" +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +static struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return NULL; + + init_rwsem(&loghandle->lgh_lock); + mutex_init(&loghandle->lgh_hdr_mutex); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
Used in llog_close() only + */ +static void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (loghandle->lgh_hdr == NULL) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); +out: + OBD_FREE_PTR(loghandle); +} + +void llog_handle_get(struct llog_handle *loghandle) +{ + atomic_inc(&loghandle->lgh_refcount); +} + +void llog_handle_put(struct llog_handle *loghandle) +{ + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) + llog_free_handle(loghandle); +} + +static int llog_declare_destroy(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_destroy == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_declare_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + LASSERT(handle->lgh_obj != NULL); + if (!dt_object_exists(handle->lgh_obj)) + RETURN(0); + + rc = lop->lop_destroy(env, handle, th); + + RETURN(rc); +} + +int llog_destroy(const struct lu_env *env, struct llog_handle *handle) +{ + struct llog_operations *lop; + struct dt_device *dt; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc < 0) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + if (handle->lgh_obj == NULL) { + /* if lgh_obj == NULL, then it is from client side destroy */ + rc = lop->lop_destroy(env, handle, NULL); + RETURN(rc); + } + + if (!dt_object_exists(handle->lgh_obj)) + RETURN(0); + + dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_destroy(env, handle, th); + if (rc != 0) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + rc = lop->lop_destroy(env, handle, th); + +out_trans: + dt_trans_stop(env, dt, th); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_destroy); + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_device *dt; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct thandle *th; + int rc; + int rc1; + bool subtract_count = false; + + ENTRY; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DFID"\n", index, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + RETURN(-EINVAL); + } + + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, &llh->llh_hdr, index, th); + if (rc < 0) + GOTO(out_trans, rc); + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY)) 
{ + rc = llog_declare_destroy(env, loghandle, th); + if (rc < 0) + GOTO(out_trans, rc); + } + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + /* clear bitmap */ + mutex_lock(&loghandle->lgh_hdr_mutex); + if (!ext2_clear_bit(index, LLOG_HDR_BITMAP(llh))) { + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + GOTO(out_unlock, rc); + } + + loghandle->lgh_hdr->llh_count--; + subtract_count = true; + /* Pass this index to llog_osd_write_rec(), which will use the index + * to only update the necesary bitmap. */ + lgi->lgi_cookie.lgc_index = index; + /* update header */ + rc = llog_write_rec(env, loghandle, &llh->llh_hdr, &lgi->lgi_cookie, + LLOG_HEADER_IDX, th); + if (rc != 0) + GOTO(out_unlock, rc); + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + ((loghandle->lgh_last_idx == LLOG_HDR_BITMAP_SIZE(llh) - 1) || + (loghandle->u.phd.phd_cat_handle != NULL && + loghandle->u.phd.phd_cat_handle->u.chd.chd_current_log != + loghandle))) { + /* never try to destroy it again */ + llh->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; + rc = llog_trans_destroy(env, loghandle, th); + if (rc < 0) { + /* Sigh, can not destroy the final plain llog, but + * the bitmap has been clearly, so the record can not + * be accessed anymore, let's return 0 for now, and + * the orphan will be handled by LFSCK. */ + CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); + GOTO(out_unlock, rc = 0); + } + rc = LLOG_DEL_PLAIN; + } + +out_unlock: + mutex_unlock(&loghandle->lgh_hdr_mutex); + up_write(&loghandle->lgh_lock); +out_trans: + rc1 = dt_trans_stop(env, dt, th); + if (rc == 0) + rc = rc1; + if (rc < 0 && subtract_count) { + mutex_lock(&loghandle->lgh_hdr_mutex); + loghandle->lgh_hdr->llh_count++; + ext2_set_bit(index, LLOG_HDR_BITMAP(llh)); + mutex_unlock(&loghandle->lgh_hdr_mutex); + } + RETURN(rc); +} + +int llog_read_header(const struct lu_env *env, struct llog_handle *handle, + const struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + /* lrh_len should be initialized in llog_init_handle */ + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + LASSERT(handle->lgh_ctxt->loc_chunk_size >= + LLOG_MIN_CHUNK_SIZE); + llh->llh_hdr.lrh_len = handle->lgh_ctxt->loc_chunk_size; + llh->llh_hdr.lrh_index = 0; + llh->llh_timestamp = ktime_get_real_seconds(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + /* Since update llog header might also call this function, + * let's reset the bitmap to 0 here */ + memset(LLOG_HDR_BITMAP(llh), 0, llh->llh_hdr.lrh_len - + llh->llh_bitmap_offset - + sizeof(llh->llh_tail)); + ext2_set_bit(0, LLOG_HDR_BITMAP(llh)); + LLOG_HDR_TAIL(llh)->lrt_len = llh->llh_hdr.lrh_len; + LLOG_HDR_TAIL(llh)->lrt_index = llh->llh_hdr.lrh_index; + rc = 0; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_read_header); + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct 
llog_log_hdr *llh; + enum llog_flag fmt = flags & LLOG_F_EXT_MASK; + int rc; + int chunk_size = handle->lgh_ctxt->loc_chunk_size; + ENTRY; + + LASSERT(handle->lgh_hdr == NULL); + + LASSERT(chunk_size >= LLOG_MIN_CHUNK_SIZE); + OBD_ALLOC_LARGE(llh, chunk_size); + if (llh == NULL) + RETURN(-ENOMEM); + + handle->lgh_hdr = llh; + handle->lgh_hdr_size = chunk_size; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + handle->lgh_ctxt->loc_obd->obd_name, + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? "catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + handle->lgh_ctxt->loc_obd->obd_name, + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + llh->llh_flags |= LLOG_F_IS_FIXSIZE; + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + rc = -EINVAL; + } + llh->llh_flags |= fmt; +out: + if (rc) { + OBD_FREE_LARGE(llh, chunk_size); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + size_t chunk_size; + __u64 cur_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + bool repeated = false; + + ENTRY; + + if (llh == NULL) + RETURN(-EINVAL); + + cur_offset = chunk_size = llh->llh_hdr.lrh_len; + /* expect chunk_size to be power of two */ + LASSERT(is_power_of_2(chunk_size)); + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + off_t chunk_offset = 0; + unsigned int buf_offset = 0; + bool partial_chunk; + int lh_last_idx; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) + ++index; + + /* There are no indices prior the last_index */ + if (index > last_index) + break; + + CDEBUG(D_OTHER, "index: %d last_index %d\n", index, + last_index); + +repeat: + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + /* the record index for outdated chunk data */ + 
lh_last_idx = loghandle->lgh_last_idx + 1; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, chunk_size); + if (repeated && rc) + CDEBUG(D_OTHER, "cur_offset %llu, chunk_offset %llu," + " buf_offset %u, rc = %d\n", cur_offset, + (__u64)chunk_offset, buf_offset, rc); + /* we`ve tried to reread the chunk, but there is no + * new records */ + if (rc == -EIO && repeated && (chunk_offset + buf_offset) == + cur_offset) + GOTO(out, rc = 0); + if (rc != 0) + GOTO(out, rc); + + /* NB: after llog_next_block() call the cur_offset is the + * offset of the next block after read one. + * The absolute offset of the current chunk is calculated + * from cur_offset value and stored in chunk_offset variable. + */ + if ((cur_offset & (chunk_size - 1)) != 0) { + partial_chunk = true; + chunk_offset = cur_offset & ~(chunk_size - 1); + } else { + partial_chunk = false; + chunk_offset = cur_offset - chunk_size; + } + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. */ + for (rec = (struct llog_rec_hdr *)(buf + buf_offset); + (char *)rec < buf + chunk_size; + rec = llog_rec_hdr_next(rec)) { + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + /* the bitmap could be changed during processing + * records from the chunk. For wrapped catalog + * it means we can read deleted record and try to + * process it. Check this case and reread the chunk. */ + + /* for partial chunk the end of it is zeroed, check + * for index 0 to distinguish it. */ + if ((partial_chunk && rec->lrh_index == 0) || + (index == lh_last_idx && + lh_last_idx != (loghandle->lgh_last_idx + 1))) { + /* concurrent llog_add() might add new records + * while llog_processing, check this is not + * the case and re-read the current chunk + * otherwise. */ + int records; + /* lgh_last_idx could be less then index + * for catalog, if catalog is wrapped */ + if ((index > loghandle->lgh_last_idx && + !(loghandle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT)) || repeated || + (loghandle->lgh_obj != NULL && + dt_object_remote(loghandle->lgh_obj))) + GOTO(out, rc = 0); + /* <2 records means no more records + * if the last record we processed was + * the final one, then the underlying + * object might have been destroyed yet. + * we better don't access that.. 
*/ + mutex_lock(&loghandle->lgh_hdr_mutex); + records = loghandle->lgh_hdr->llh_count; + mutex_unlock(&loghandle->lgh_hdr_mutex); + if (records <= 1) + GOTO(out, rc = 0); + CDEBUG(D_OTHER, "Re-read last llog buffer for " + "new records, index %u, last %u\n", + index, loghandle->lgh_last_idx); + /* save offset inside buffer for the re-read */ + buf_offset = (char *)rec - (char *)buf; + cur_offset = chunk_offset; + repeated = true; + goto repeat; + } + + repeated = false; + + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CWARN("%s: invalid length %d in llog "DFID + "record for index %d/%d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rec->lrh_len, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + + GOTO(out, rc = -EINVAL); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + if (rec->lrh_index != index) { + CERROR("%s: "DFID" Invalid record: index %u" + " but expected %u\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + GOTO(out, rc = -ERANGE); + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + chunk_size - (char *)rec)); + + loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + chunk_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + } + if (rc) + GOTO(out, rc); + /* some stupid callbacks directly cancel records + * and delete llog. Check it and stop + * processing. */ + if (loghandle->lgh_hdr == NULL || + loghandle->lgh_hdr->llh_count == 1) + GOTO(out, rc = 0); + } + /* exit if the last index is reached */ + if (index >= last_index) + GOTO(out, rc = 0); + ++index; + } + } + +out: + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + if (unlikely(rc == -EIO && loghandle->lgh_obj != NULL)) { + if (dt_object_remote(loghandle->lgh_obj)) { + /* If it is remote object, then -EIO might means + * disconnection or eviction, let's return -EAGAIN, + * so for update recovery log processing, it will + * retry until the umount or abort recovery, see + * lod_sub_recovery_thread() */ + CERROR("%s retry remote llog process\n", + loghandle->lgh_ctxt->loc_obd->obd_name); + rc = -EAGAIN; + } else { + /* something bad happened to the processing of a local + * llog file, probably I/O error or the log got + * corrupted to be able to finally release the log we + * discard any remaining bits in the header */ + CERROR("%s: Local llog found corrupted #"DOSTID":%x" + " %s index %d count %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + ((llh->llh_flags & LLOG_F_IS_CAT) ? 
"catalog" : + "plain"), index, llh->llh_count); + + while (index <= last_index) { + if (ext2_test_bit(index, + LLOG_HDR_BITMAP(llh)) != 0) + llog_cancel_rec(lpi->lpi_env, loghandle, + index); + index++; + } + rc = 0; + } + } + + OBD_FREE_LARGE(buf, chunk_size); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + struct nsproxy *new_ns, *curr_ns = current->nsproxy; + + task_lock(lpi->lpi_reftask); + new_ns = lpi->lpi_reftask->nsproxy; + if (curr_ns != new_ns) { + get_nsproxy(new_ns); + + current->nsproxy = new_ns; + /* XXX: we should call put_nsproxy() instead of + * atomic_dec(&ns->count) directly. But put_nsproxy() cannot be + * used outside of the kernel itself, because it calls + * free_nsproxy() which is not exported by the kernel + * (defined in kernel/nsproxy.c) */ + atomic_dec(&curr_ns->count); + } + task_unlock(lpi->lpi_reftask); + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + struct task_struct *task; + + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. */ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + /* take reference to current, so that + * llog_process_thread_daemonize() can use it to switch to + * namespace associated with current */ + lpi->lpi_reftask = current; + task = kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start thread: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + GOTO(out_lpi, rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + +out_lpi: + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + int rc; + rc = llog_process_or_fork(env, loghandle, cb, data, catdata, true); + return rc == LLOG_DEL_PLAIN ? 
0 : rc; +} +EXPORT_SYMBOL(llog_process); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + __u32 chunk_size = llh->llh_hdr.lrh_len; + ENTRY; + + OBD_ALLOC_LARGE(buf, chunk_size); + if (buf == NULL) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_HDR_BITMAP_SIZE(llh) - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, chunk_size); + rc = llog_prev_block(env, loghandle, index, buf, chunk_size); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(env, loghandle, + tail->lrt_index); + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? 
*/ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf != NULL) + OBD_FREE_LARGE(buf, chunk_size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + ENTRY; + + /* API sanity checks */ + if (handle == NULL) { + CERROR("loghandle is missed\n"); + RETURN(-EPROTO); + } else if (handle->lgh_obj == NULL) { + CERROR("loghandle %p with NULL object\n", + handle); + RETURN(-EPROTO); + } else if (th == NULL) { + CERROR("%s: missed transaction handle\n", + handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name); + RETURN(-EPROTO); + } else if (handle->lgh_hdr == NULL) { + CERROR("%s: loghandle %p with no header\n", + handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name, + handle); + RETURN(-EPROTO); + } + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if 
(lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_declare_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. + */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct dt_device *d; + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + RETURN(rc); + + if (llog_exist(*res)) + RETURN(0); + + LASSERT((*res)->lgh_obj != NULL); + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + th->th_sync = 1; + + th->th_wait_submit = 1; + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); +out: + if (rc) + llog_close(env, *res); + RETURN(rc); +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + ENTRY; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + RETURN(0); + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + RETURN(rc); + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + RETURN(rc); +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, int idx) +{ + struct dt_device *dt; + struct thandle *th; + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + LASSERT(loghandle->lgh_obj != NULL); + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + th->th_wait_submit = 1; + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + GOTO(out, rc); + if (lop->lop_close == NULL) + GOTO(out, rc = -EOPNOTSUPP); + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_close); + +/** + * Helper function to get the llog size in records. It is used by MGS + * mostly to check that config llog exists and contains data. 
+ * + * \param[in] env execution environment + * \param[in] ctxt llog context + * \param[in] name llog name + * + * \retval true if there are records in llog besides a header + * \retval false on error or llog without records + */ +int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name) +{ + struct llog_handle *llh; + int rc = 0; + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (likely(rc == -ENOENT)) + rc = 0; + GOTO(out, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + rc = llog_get_size(llh); + +out_close: + llog_close(env, llh); +out: + /* The header is record 1, the llog is still considered as empty + * if there is only header */ + return (rc <= 1); +} +EXPORT_SYMBOL(llog_is_empty); + +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_handle *copy_llh = data; + + /* Append all records */ + return llog_write(env, copy_llh, rec, LLOG_NEXT_IDX); +} + +/* backup plain llog */ +int llog_backup(const struct lu_env *env, struct obd_device *obd, + struct llog_ctxt *ctxt, struct llog_ctxt *bctxt, + char *name, char *backup) +{ + struct llog_handle *llh, *bllh; + int rc; + + ENTRY; + + /* open original log */ + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc < 0) { + /* the -ENOENT case is also reported to the caller + * but silently so it should handle that if needed. + */ + if (rc != -ENOENT) + CERROR("%s: failed to open log %s: rc = %d\n", + obd->obd_name, name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + /* Make sure there's no old backup log */ + rc = llog_erase(env, bctxt, NULL, backup); + if (rc < 0 && rc != -ENOENT) + GOTO(out_close, rc); + + /* open backup log */ + rc = llog_open_create(env, bctxt, &bllh, NULL, backup); + if (rc) { + CERROR("%s: failed to open backup logfile %s: rc = %d\n", + obd->obd_name, backup, rc); + GOTO(out_close, rc); + } + + /* check that backup llog is not the same object as original one */ + if (llh->lgh_obj == bllh->lgh_obj) { + CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n", + obd->obd_name, name, backup, llh->lgh_obj, + bllh->lgh_obj); + GOTO(out_backup, rc = -EEXIST); + } + + rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_backup, rc); + + /* Copy log record by record */ + rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh, + NULL, false); + if (rc) + CERROR("%s: failed to backup log %s: rc = %d\n", + obd->obd_name, name, rc); +out_backup: + llog_close(env, bllh); +out_close: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(llog_backup); + +/* Get size of llog */ +__u64 llog_size(const struct lu_env *env, struct llog_handle *llh) +{ + int rc; + struct lu_attr la; + + rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); + if (rc) { + CERROR("%s: attr_get failed, rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + return 0; + } + + return la.la_size; +} +EXPORT_SYMBOL(llog_size); + diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c new file mode 100644 index 0000000000000..058d87e7fb3d1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -0,0 +1,1157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *rec = &lgi->lgi_logid; + struct thandle *handle = NULL; + struct dt_device *dt = NULL; + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int rc, index; + + ENTRY; + + index = (cathandle->lgh_last_idx + 1) % + (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ? (cfs_fail_val + 1) : + LLOG_HDR_BITMAP_SIZE(llh)); + + /* check that new llog index will not overlap with the first one. + * - llh_cat_idx is the index just before the first/oldest still in-use + * index in catalog + * - lgh_last_idx is the last/newest used index in catalog + * + * When catalog is not wrapped yet then lgh_last_idx is always larger + * than llh_cat_idx. 
After the wrap around lgh_last_idx re-starts + * from 0 and llh_cat_idx becomes the upper limit for it + * + * Check if catalog has already wrapped around or not by comparing + * last_idx and cat_idx */ + if ((index == llh->llh_cat_idx + 1 && llh->llh_count > 1) || + (index == 0 && llh->llh_cat_idx == 0)) { + if (cathandle->lgh_name == NULL) { + CWARN("%s: there are no more free slots in catalog " + DFID":%x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + cathandle->lgh_id.lgl_ogen); + } else { + CWARN("%s: there are no more free slots in " + "catalog %s\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + cathandle->lgh_name); + } + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + if (loghandle->lgh_hdr != NULL) { + /* If llog object is remote and creation is failed, lgh_hdr + * might be left over here, free it first */ + LASSERT(!llog_exist(loghandle)); + OBD_FREE_LARGE(loghandle->lgh_hdr, loghandle->lgh_hdr_size); + loghandle->lgh_hdr = NULL; + } + + if (th == NULL) { + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + handle = dt_trans_create(env, dt); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + /* Create update llog object synchronously, which + * happens during inialization process see + * lod_sub_prep_llog(), to make sure the update + * llog object is created before corss-MDT writing + * updates into the llog object */ + if (cathandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) + handle->th_sync = 1; + + handle->th_wait_submit = 1; + + rc = llog_declare_create(env, loghandle, handle); + if (rc != 0) + GOTO(out, rc); + + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + rc = llog_declare_write_rec(env, cathandle, &rec->lid_hdr, -1, + handle); + if (rc != 0) + GOTO(out, rc); + + rc = dt_trans_start_local(env, dt, handle); + if (rc != 0) + GOTO(out, rc); + + th = handle; + } + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + GOTO(out, rc = 0); + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + GOTO(out, rc); + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc < 0) + GOTO(out, rc); + + /* build the record for this log in the catalog */ + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + + /* append the new record into catalog. 
The new index will be + * assigned to the record and updated in rec header */ + rc = llog_write_rec(env, cathandle, &rec->lid_hdr, + &loghandle->u.phd.phd_cookie, LLOG_NEXT_IDX, th); + if (rc < 0) + GOTO(out_destroy, rc); + + CDEBUG(D_OTHER, "new plain log "DFID".%u of catalog "DFID"\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rec->lid_hdr.lrh_index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + + loghandle->lgh_hdr->llh_cat_idx = rec->lid_hdr.lrh_index; + + /* limit max size of plain llog so that space can be + * released sooner, especially on small filesystems */ + /* 2MB for the cases when free space hasn't been learned yet */ + loghandle->lgh_max_size = 2 << 20; + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + rc = dt_statfs(env, dt, &lgi->lgi_statfs); + if (rc == 0 && lgi->lgi_statfs.os_bfree > 0) { + __u64 freespace = (lgi->lgi_statfs.os_bfree * + lgi->lgi_statfs.os_bsize) >> 6; + if (freespace < loghandle->lgh_max_size) + loghandle->lgh_max_size = freespace; + /* shouldn't be > 128MB in any case? + * it's 256K records of 512 bytes each */ + if (freespace > (128 << 20)) + loghandle->lgh_max_size = 128 << 20; + } + rc = 0; + +out: + if (handle != NULL) { + handle->th_result = rc >= 0 ? 0 : rc; + dt_trans_stop(env, dt, handle); + } + RETURN(rc); + +out_destroy: + /* to signal llog_cat_close() it shouldn't try to destroy the llog, + * we want to destroy it in this transaction, otherwise the object + * becomes an orphan */ + loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; + /* this is to mimic full log, so another llog_cat_current_log() + * can skip it and ask for another onet */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) + 1; + llog_trans_destroy(env, loghandle, th); + RETURN(rc); +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + enum llog_flag fmt; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + fmt = cathandle->lgh_hdr->llh_flags & LLOG_F_EXT_MASK; + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CWARN("%s: log "DFID" generation %x != %x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + PFID(&logid->lgl_oi.oi_fid), + cgl->lgl_ogen, logid->lgl_ogen); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + GOTO(out, rc = 0); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DFID":%x: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | fmt, NULL); + if (rc < 0) { + llog_close(env, loghandle); + loghandle = NULL; + RETURN(rc); + } + + down_write(&cathandle->lgh_lock); + list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + EXIT; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2)) { + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + GOTO(next, loghandle); + } + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || !llog_is_full(loghandle)) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + LASSERT(llh); + if (!llog_is_full(loghandle)) + GOTO(out_unlock, loghandle); + else + up_write(&loghandle->lgh_lock); + } + +next: + /* Sigh, the chd_next_log and chd_current_log is initialized + * in declare phase, and we do not serialize the catlog + * accessing, so it might be possible the llog creation + * thread (see llog_cat_declare_add_rec()) did not create + * llog successfully, then the following thread might + * meet this situation. */ + if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { + CERROR("%s: next log does not exist!\n", + cathandle->lgh_ctxt->loc_obd->obd_name); + loghandle = ERR_PTR(-EIO); + if (cathandle->u.chd.chd_next_log == NULL) { + /* Store the error in chd_next_log, so + * the following process can get correct + * failure value */ + cathandle->u.chd.chd_next_log = loghandle; + } + GOTO(out_unlock, loghandle); + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + +out_unlock: + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +static int llog_cat_update_header(const struct lu_env *env, + struct llog_handle *cathandle) +{ + struct llog_handle *loghandle; + int rc; + ENTRY; + + /* refresh llog */ + down_write(&cathandle->lgh_lock); + if (!cathandle->lgh_stale) { + up_write(&cathandle->lgh_lock); + RETURN(0); + } + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + if (!llog_exist(loghandle)) + continue; + + rc = llog_read_header(env, loghandle, NULL); + if (rc != 0) { + up_write(&cathandle->lgh_lock); + GOTO(out, rc); + } + } + rc = llog_read_header(env, cathandle, NULL); + if (rc == 0) + cathandle->lgh_stale = 0; + up_write(&cathandle->lgh_lock); + if (rc != 0) + GOTO(out, rc); +out: + RETURN(rc); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. 
+ */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + struct thandle *th) +{ + struct llog_handle *loghandle; + int rc, retried = 0; + ENTRY; + + LASSERT(rec->lrh_len <= cathandle->lgh_ctxt->loc_chunk_size); + +retry: + loghandle = llog_cat_current_log(cathandle, th); + if (IS_ERR(loghandle)) + RETURN(PTR_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + /* nobody should be trying to use this llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + up_write(&cathandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, LLOG_NEXT_IDX, th); + if (rc < 0) { + CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + /* -ENOSPC is returned if no empty records left + * and when it's lack of space on the stogage. + * there is no point to try again if it's the second + * case. many callers (like llog test) expect ENOSPC, + * so we preserve this error code, but look for the + * actual cause here */ + if (rc == -ENOSPC && llog_is_full(loghandle)) + rc = -ENOBUFS; + } + up_write(&loghandle->lgh_lock); + + if (rc == -ENOBUFS) { + if (retried++ == 0) + GOTO(retry, rc); + CERROR("%s: error on 2nd llog: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, rc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + struct llog_handle *loghandle, *next; + int rc = 0; + + ENTRY; + + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } else if (cathandle->u.chd.chd_next_log == NULL || + IS_ERR(cathandle->u.chd.chd_next_log)) { + /* declare next plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_next_log == NULL || + IS_ERR(cathandle->u.chd.chd_next_log)) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_next_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } + if (rc) + GOTO(out, rc); + + lirec->lid_hdr.lrh_len = sizeof(*lirec); + + if (!llog_exist(cathandle->u.chd.chd_current_log)) { + if (dt_object_remote(cathandle->lgh_obj)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this transaction. 
So let's + * create the llog object synchronously here to + * remove the dependency. */ +create_again: + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + if (cathandle->lgh_stale) { + up_write(&loghandle->lgh_lock); + up_read(&cathandle->lgh_lock); + GOTO(out, rc = -EIO); + } + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, + NULL); + if (rc == -ESTALE) + cathandle->lgh_stale = 1; + } + up_write(&loghandle->lgh_lock); + up_read(&cathandle->lgh_lock); + if (rc == -ESTALE) { + rc = llog_cat_update_header(env, cathandle); + if (rc != 0) + GOTO(out, rc); + goto create_again; + } else if (rc < 0) { + GOTO(out, rc); + } + } else { + rc = llog_declare_create(env, + cathandle->u.chd.chd_current_log, th); + if (rc) + GOTO(out, rc); + llog_declare_write_rec(env, cathandle, + &lirec->lid_hdr, -1, th); + } + } + +write_again: + /* declare records in the llogs */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, + rec, -1, th); + if (rc == -ESTALE) { + down_write(&cathandle->lgh_lock); + if (cathandle->lgh_stale) { + up_write(&cathandle->lgh_lock); + GOTO(out, rc = -EIO); + } + + cathandle->lgh_stale = 1; + up_write(&cathandle->lgh_lock); + rc = llog_cat_update_header(env, cathandle); + if (rc != 0) + GOTO(out, rc); + goto write_again; + } else if (rc < 0) { + GOTO(out, rc); + } + + next = cathandle->u.chd.chd_next_log; + if (!IS_ERR_OR_NULL(next)) { + if (!llog_exist(next)) { + if (dt_object_remote(cathandle->lgh_obj)) { + /* For remote operation, if we put the llog + * object creation in the current transaction, + * then the llog object will not be created on + * the remote target until the transaction stop, + * if other operations start before the + * transaction stop, and use the same llog + * object, will be dependent on the success of + * this transaction. So let's create the llog + * object synchronously here to remove the + * dependency. 
*/ + down_write_nested(&cathandle->lgh_lock, + LLOGH_CAT); + next = cathandle->u.chd.chd_next_log; + if (IS_ERR_OR_NULL(next)) { + /* Sigh, another thread just tried, + * let's fail as well */ + up_write(&cathandle->lgh_lock); + if (next == NULL) + rc = -EIO; + else + rc = PTR_ERR(next); + GOTO(out, rc); + } + + down_write_nested(&next->lgh_lock, LLOGH_LOG); + if (!llog_exist(next)) { + rc = llog_cat_new_log(env, cathandle, + next, NULL); + if (rc < 0) + cathandle->u.chd.chd_next_log = + ERR_PTR(rc); + } + up_write(&next->lgh_lock); + up_write(&cathandle->lgh_lock); + if (rc < 0) + GOTO(out, rc); + } else { + rc = llog_declare_create(env, next, th); + llog_declare_write_rec(env, cathandle, + &lirec->lid_hdr, -1, th); + } + } + /* XXX: we hope for declarations made for existing llog + * this might be not correct with some backends + * where declarations are expected against specific + * object like ZFS with full debugging enabled */ + /*llog_declare_write_rec(env, next, rec, -1, th);*/ + } +out: + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_declare_add_rec); + +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt; + struct thandle *th = NULL; + int rc; + + ctxt = cathandle->lgh_ctxt; + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + + LASSERT(cathandle->lgh_obj != NULL); + dt = lu2dt_dev(cathandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, th); +out_trans: + dt_trans_stop(env, dt, th); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add); + +/* For each cookie in the cookie array, we clear the log in-use bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, index, rc = 0, failed = 0; + + ENTRY; + + for (i = 0; i < count; i++, cookies++) { + struct llog_handle *loghandle; + struct llog_logid *lgl = &cookies->lgc_lgl; + int lrc; + + rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); + if (rc) { + CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x" + ": rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); + failed++; + continue; + } + + if ((cathandle->lgh_ctxt->loc_flags & + LLOG_CTXT_FLAG_NORMAL_FID) && !llog_exist(loghandle)) { + /* For update log, some of loghandles of cathandle + * might not exist because remote llog creation might + * be failed, so let's skip the record cancellation + * for these non-exist llogs. 
+ */ + lrc = -ENOENT; + CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" + ": rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, lrc); + failed++; + if (rc == 0) + rc = lrc; + continue; + } + + lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index); + if (lrc == LLOG_DEL_PLAIN) { /* log has been destroyed */ + index = loghandle->u.phd.phd_cookie.lgc_index; + lrc = llog_cat_cleanup(env, cathandle, loghandle, + index); + if (rc == 0) + rc = lrc; + } else if (lrc == -ENOENT) { + if (rc == 0) /* ENOENT shouldn't rewrite any error */ + rc = lrc; + } else if (lrc < 0) { + failed++; + if (rc == 0) + rc = lrc; + } + llog_handle_put(loghandle); + } + if (rc) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, + rc); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + struct llog_log_hdr *hdr; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " + DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* After a server crash, a stub of index + * record in catlog could be kept, because + * plain log destroy + catlog index record + * deletion are not atomic. So we end up with + * an index but no actual record. Destroy the + * index and move on. 
*/ + rc = llog_cat_cleanup(env, cat_llh, NULL, + rec->lrh_index); + } + + RETURN(rc); + } + + /* clean old empty llogs, do not consider current llog in use */ + /* ignore remote (lgh_obj=NULL) llogs */ + hdr = llh->lgh_hdr; + if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + hdr->llh_count == 1 && cat_llh->lgh_obj != NULL && + llh != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, llh); + if (rc) + CERROR("%s: fail to destroy empty log: rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + GOTO(out, rc = LLOG_DEL_PLAIN); + } + + if (rec->lrh_index < d->lpd_startcat) { + /* Skip processing of the logs until startcat */ + rc = 0; + } else if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + +out: + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + llog_handle_put(llh); + + RETURN(rc); +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cat_cb, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + struct llog_process_cat_data cd; + + CWARN("%s: catlog "DFID" crosses index zero\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + } else { + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, NULL, fork); + } + + RETURN(rc); +} + +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx) +{ + return llog_cat_process_or_fork(env, cat_llh, llog_cat_process_cb, + cb, data, startcat, startidx, false); +} +EXPORT_SYMBOL(llog_cat_process); + +static int llog_cat_size_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + __u64 *cum_size = d->lpd_data; + __u64 size; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("%s: invalid record in catalog, rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, -EINVAL); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " + DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CWARN("%s: cannot find handle for llog "DFID": rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + RETURN(0); + } + size = llog_size(env, llh); + 
*cum_size += size; + + CDEBUG(D_INFO, "Add llog entry "DFID" size %llu\n", + PFID(&llh->lgh_id.lgl_oi.oi_fid), size); + + llog_handle_put(llh); + + RETURN(0); + +} + +__u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) +{ + __u64 size = llog_size(env, cat_llh); + + llog_cat_process_or_fork(env, cat_llh, llog_cat_size_cb, + NULL, &size, 0, 0, false); + + return size; +} +EXPORT_SYMBOL(llog_cat_size); + +static int llog_cat_reverse_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + struct llog_log_hdr *hdr; + int rc; + + if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " + DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* After a server crash, a stub of index + * record in catlog could be kept, because + * plain log destroy + catlog index record + * deletion are not atomic. So we end up with + * an index but no actual record. Destroy the + * index and move on. */ + rc = llog_cat_cleanup(env, cat_llh, NULL, + rec->lrh_index); + } + + RETURN(rc); + } + + /* clean old empty llogs, do not consider current llog in use */ + hdr = llh->lgh_hdr; + if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + hdr->llh_count == 1 && + llh != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, llh); + if (rc) + CERROR("%s: fail to destroy empty log: rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + GOTO(out, rc = LLOG_DEL_PLAIN); + } + + rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); + +out: + /* The empty plain was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + + llog_handle_put(llh); + RETURN(rc); +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && + llh->llh_count > 1) { + CWARN("%s: catalog "DFID" crosses index zero\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int bitmap_size; + + ENTRY; + + bitmap_size = 
LLOG_HDR_BITMAP_SIZE(llh); + /* + * The llh_cat_idx equals to the first used index minus 1 + * so if we canceled the first index then llh_cat_idx + * must be renewed. + */ + if (llh->llh_cat_idx == (idx - 1)) { + llh->llh_cat_idx = idx; + + while (idx != cathandle->lgh_last_idx) { + idx = (idx + 1) % bitmap_size; + if (!ext2_test_bit(idx, LLOG_HDR_BITMAP(llh))) { + /* update llh_cat_idx for each unset bit, + * expecting the next one is set */ + llh->llh_cat_idx = idx; + } else if (idx == 0) { + /* skip header bit */ + llh->llh_cat_idx = 0; + continue; + } else { + /* the first index is found */ + break; + } + } + + CDEBUG(D_RPCTRACE, "catlog "DFID" first idx %u, last_idx %u\n", + PFID(&cathandle->lgh_id.lgl_oi.oi_fid), + llh->llh_cat_idx, cathandle->lgh_last_idx); + } + + RETURN(0); +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + + /* do not attempt to cleanup on-disk llog if on client side */ + if (cathandle->lgh_obj == NULL) + return 0; + + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, "cancel plain log at index %u of catalog "DFID"\n", + index, PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h new file mode 100644 index 0000000000000..eb9526ad504d0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_logid_rec lgi_logid; + struct dt_insert_rec lgi_dt_rec; + struct lu_seq_range lgi_range; + struct llog_cookie lgi_cookie; + struct obd_statfs lgi_statfs; + char lgi_name[32]; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +int llog_info_init(void); +void llog_info_fini(void); + +void llog_handle_get(struct llog_handle *loghandle); +void llog_handle_put(struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); + +static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) +{ + return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); +} +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c new file mode 100644 index 0000000000000..906e6e64ef4e6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,496 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include "llog_internal.h" + +static int str2logid(struct llog_logid *logid, char *str, int len) +{ + char *start, *end, *endp; + __u64 id, seq; + + ENTRY; + start = str; + if (start[0] == '[') { + struct lu_fid *fid = &logid->lgl_oi.oi_fid; + int num; + + fid_zero(fid); + logid->lgl_ogen = 0; + num = sscanf(start + 1, SFID, RFID(fid)); + CDEBUG(D_INFO, DFID":%x\n", PFID(fid), logid->lgl_ogen); + RETURN(num == 3 && fid_is_sane(fid) ? 0 : -EINVAL); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) + /* logids used to be input in the form "#id#seq:ogen" before they + * were changed over to accept the FID [seq:oid:ver] format. + * This is accepted for compatibility reasons, though I doubt + * anyone is actually using this for anything. */ + if (start[0] != '#') + RETURN(-EINVAL); + + start++; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + id = simple_strtoull(start, &endp, 0); + if (endp != end) + RETURN(-EINVAL); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + seq = simple_strtoull(start, &endp, 0); + if (endp != end) + RETURN(-EINVAL); + + ostid_set_seq(&logid->lgl_oi, seq); + if (ostid_set_id(&logid->lgl_oi, id)) + RETURN(-EINVAL); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + logid->lgl_ogen = simple_strtoul(start, &endp, 16); + if (*endp != '\0') + RETURN(-EINVAL); + + RETURN(0); +#else + RETURN(-EINVAL); +#endif +} + +static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + static int l, remains; + static long from, to; + static char *out; + char *endp; + int cur_index, rc = 0; + + ENTRY; + + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + cfs_size_round(ioc_data->ioc_inllen1) + + cfs_size_round(ioc_data->ioc_inllen2) + + cfs_size_round(ioc_data->ioc_inllen3); + from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); + rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + RETURN(rc); + } + rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); + llog_handle_put(loghandle); + } else { + bool ok; + + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: + case MDS_UNLINK64_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: + case LLOG_GEN_REC: + case LLOG_HDR_MAGIC: + ok = true; + break; + default: + ok = false; + } + + l = snprintf(out, remains, 
"[index]: %05d [type]: " + "%02x [len]: %04d %s\n", + cur_index, rec->lrh_type, rec->lrh_len, + ok ? "ok" : "failed"); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: no space to print log records\n", + handle->lgh_ctxt->loc_obd->obd_name); + RETURN(-LLOG_EEMPTY); + } + } + RETURN(rc); +} + +static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + static int l, remains; + static long from, to; + static char *out; + char *endp; + int cur_index; + + ENTRY; + if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + cfs_size_round(ioc_data->ioc_inllen1) + + cfs_size_round(ioc_data->ioc_inllen2) + + cfs_size_round(ioc_data->ioc_inllen3); + from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + l = snprintf(out, remains, + "[index]: %05d [logid]: "DFID":%x\n", + cur_index, PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen); + } else if (rec->lrh_type == OBD_CFG_REC) { + int rc; + + rc = class_config_yaml_output(rec, out, remains); + if (rc < 0) + RETURN(rc); + l = rc; + } else { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d\n", + cur_index, rec->lrh_type, rec->lrh_len); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); +} +static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, + struct llog_logid *logid) +{ + struct llog_handle *log; + int rc; + + ENTRY; + + rc = llog_cat_id2handle(env, cat, &log, logid); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + RETURN(-ENOENT); + } + + rc = llog_destroy(env, log); + if (rc) { + CDEBUG(D_IOCTL, "cannot destroy log "DFID":%x\n", + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen); + GOTO(out, rc); + } + llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); +out: + llog_handle_put(log); + RETURN(rc); + +} + +static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) + RETURN(-EINVAL); + rc = llog_remove_log(env, handle, &lir->lid_id); + + RETURN(rc); +} + + +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data) +{ + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname; + + ENTRY; + + logname = data->ioc_inlbuf1; + if (logname[0] == '#' || logname[0] == '[') { + rc = str2logid(&logid, logname, data->ioc_inllen1); + if (rc) + RETURN(rc); + rc = llog_open(env, ctxt, &handle, &logid, NULL, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else if (logname[0] == '$' || isalpha(logname[0])) { + if (logname[0] == '$') + logname++; + + rc = 
llog_open(env, ctxt, &handle, NULL, logname, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else { + RETURN(-EINVAL); + } + + rc = llog_init_handle(env, handle, 0, NULL); + if (rc) + GOTO(out_close, rc = -ENOENT); + + switch (cmd) { + case OBD_IOC_LLOG_INFO: { + int l; + int remains = data->ioc_inllen2 + + cfs_size_round(data->ioc_inllen1); + char *out = data->ioc_bulk; + + l = snprintf(out, remains, + "logid: "DFID":%x\n" + "flags: %x (%s)\n" + "records_count: %d\n" + "last_index: %d\n", + PFID(&handle->lgh_id.lgl_oi.oi_fid), + handle->lgh_id.lgl_ogen, + handle->lgh_hdr->llh_flags, + handle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT ? "cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + char *endp; + + cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + GOTO(out_close, rc = -EINVAL); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(env, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 > 0) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, handle, llog_delete_cb, NULL, + NULL); + if (rc) + GOTO(out_close, rc); + } + break; + } + default: + CERROR("%s: Unknown ioctl cmd %#x\n", + ctxt->loc_obd->obd_name, cmd); + GOTO(out_close, rc = -ENOTTY); + } + +out_close: + if (handle->lgh_hdr && + handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + llog_cat_close(env, handle); + else + llog_close(env, handle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_ioctl); + +int llog_catalog_list(const struct lu_env *env, struct dt_device *d, + int count, struct obd_ioctl_data *data, + const struct lu_fid *fid) +{ + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; + + ENTRY; + + if (count == 0) { /* get total number of logs */ + rc = llog_osd_get_cat_list(env, d, 0, 0, NULL, fid); + if (rc < 0) + RETURN(rc); + count = rc; + } + + size = sizeof(*idarray) * count; + + OBD_ALLOC_LARGE(idarray, size); + if (!idarray) + 
RETURN(-ENOMEM); + + rc = llog_osd_get_cat_list(env, d, 0, count, idarray, fid); + if (rc) + GOTO(out, rc); + + out = data->ioc_bulk; + remains = data->ioc_inllen1; + for (i = 0; i < count; i++) { + id = &idarray[i].lci_logid; + l = snprintf(out, remains, "catalog_log: "DFID":%x\n", + PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + out += l; + remains -= l; + if (remains <= 0) + break; + } +out: + OBD_FREE_LARGE(idarray, size); + RETURN(rc); +} +EXPORT_SYMBOL(llog_catalog_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c new file mode 100644 index 0000000000000..a5cdc6e184185 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -0,0 +1,266 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
*/ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + ENTRY; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. + */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + if (disk_obd != NULL) + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + else + ctxt->loc_exp = class_export_get(obd->obd_self_export); + + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + if (disk_obd != NULL) + LASSERT(ctxt->loc_exp == + disk_obd->obd_self_export); + else + LASSERT(ctxt->loc_exp == + obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + ENTRY; + + if (!ctxt) + RETURN(0); + + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags) +{ + int rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, 
ctxt, cookies, flags); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cancel); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c new file mode 100644 index 0000000000000..ffa1ad0149b25 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -0,0 +1,2143 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * lustre/obdclass/llog_osd.c + * + * Low level llog routines on top of OSD API + * + * This file provides set of methods for llog operations on top of + * dt_device. It contains all supported llog_operations interfaces and + * supplimental functions. + * + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include +#include + +#include "llog_internal.h" +#include "local_storage.h" + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function is a wrapper over local_storage API function + * local_object_declare_create(). + * + * \param[in] env execution environment + * \param[in] los local_storage for bottom storage device + * \param[in] o dt_object to create + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration of the new object + * \retval negative error if declaration was failed + */ +static int llog_osd_declare_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_declare_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function is a wrapper over local_storage API function + * local_object_create(). 
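+ * The attributes mirror llog_osd_declare_new_object() above: a plain
+ * regular file with owner write and world read access, created inside
+ * the transaction \a th.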
+ * + * \param[in] env execution environment + * \param[in] los local_storage for bottom storage device + * \param[in] o dt_object to create + * \param[in] th current transaction handle + * + * \retval 0 on successful creation of the new object + * \retval negative error if creation was failed + */ +static int llog_osd_create_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +/** + * Implementation of the llog_operations::lop_exist + * + * This function checks that llog exists on storage. + * + * \param[in] handle llog handle of the current llog + * + * \retval true if llog object exists and is not just destroyed + * \retval false if llog doesn't exist or just destroyed + */ +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return dt_object_exists(handle->lgh_obj) && + !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header); +} + +static void *rec_tail(struct llog_rec_hdr *rec) +{ + return (void *)((char *)rec + rec->lrh_len - + sizeof(struct llog_rec_tail)); +} + +/** + * Write a padding record to the llog + * + * This function writes a padding record to the end of llog. That may + * be needed if llog contains records of variable size, e.g. config logs + * or changelogs. + * The padding record just aligns llog to the llog chunk_size boundary if + * the current record doesn't fit in the remaining space. + * + * It allocates full length to avoid two separate writes for header and tail. + * Such 2-steps scheme needs extra protection and complex error handling. + * + * \param[in] env execution environment + * \param[in] o dt_object to create + * \param[in,out] off pointer to the padding start offset + * \param[in] len padding length + * \param[in] index index of the padding record in a llog + * \param[in] th current transaction handle + * + * \retval 0 on successful padding write + * \retval negative error if write failed + */ +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + OBD_ALLOC(rec, len); + if (rec == NULL) + RETURN(-ENOMEM); + + rec->lrh_len = len; + rec->lrh_index = index; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = len; + tail->lrt_index = index; + + lgi->lgi_buf.lb_buf = rec; + lgi->lgi_buf.lb_len = len; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + + OBD_FREE(rec, len); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_read_header + * + * This function reads the current llog header from the bottom storage + * device. 
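+ * The header is read into llog_handle::lgh_hdr and validated (magic,
+ * chunk size, tail index) before lgh_last_idx and the in-memory flags
+ * are updated from it.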
+ * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful header read + * \retval negative error if read failed + */ +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + enum llog_flag flags; + int rc; + + ENTRY; + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + RETURN(rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + RETURN(LLOG_EEMPTY); + } + + flags = handle->lgh_hdr->llh_flags; + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = handle->lgh_hdr_size; + rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) { + CERROR("%s: error reading "DFID" log header size %d: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc, + -EFAULT); + + if (rc >= 0) + rc = -EFAULT; + + RETURN(rc); + } + + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + RETURN(-EIO); + } else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected at least %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE); + RETURN(-EIO); + } else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index > + LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) || + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len != + llh_hdr->lrh_len) { + CERROR("%s: incorrectly sized log %s "DFID" tailer: " + "%#x : rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO); + RETURN(-EIO); + } + + handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + + RETURN(0); +} + +/** + * Implementation of the llog_operations::lop_declare_write + * + * This function declares the new record write. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of the full + * llog record to write. This is the beginning of buffer + * to write, the length of buffer is stored in + * \a rec::lrh_len + * \param[in] idx index of the llog record. 
If \a idx == -1 then this is + * append case, otherwise \a idx is the index of record + * to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful declaration + * \retval negative error if declaration failed + */ +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + __u32 chunk_size; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + LASSERT(rec); + LASSERT(rec->lrh_len <= loghandle->lgh_ctxt->loc_chunk_size); + + o = loghandle->lgh_obj; + LASSERT(o); + + chunk_size = loghandle->lgh_ctxt->loc_chunk_size; + lgi->lgi_buf.lb_len = chunk_size; + lgi->lgi_buf.lb_buf = NULL; + /* each time we update header */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + /** + * the pad record can be inserted so take into account double + * record size + */ + lgi->lgi_buf.lb_len = chunk_size * 2; + lgi->lgi_buf.lb_buf = NULL; + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); + + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_write + * + * This function writes the new record in the llog or modify the existed one. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] rec llog record header. This is a real header of + * the full llog record to write. This is + * the beginning of buffer to write, the length + * of buffer is stored in \a rec::lrh_len + * \param[out] reccookie pointer to the cookie to return back if needed. + * It is used for further cancel of this llog + * record. + * \param[in] idx index of the llog record. If \a idx == -1 then + * this is append case, otherwise \a idx is + * the index of record to modify + * \param[in] th current transaction handle + * + * \retval 0 on successful write && \a reccookie == NULL + * 1 on successful write && \a reccookie != NULL + * \retval negative error if write failed + */ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc; + struct llog_rec_tail *lrt; + struct dt_object *o; + __u32 chunk_size; + size_t left; + __u32 orig_last_idx; + ENTRY; + + llh = loghandle->lgh_hdr; + o = loghandle->lgh_obj; + + chunk_size = llh->llh_hdr.lrh_len; + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + if (!llog_osd_exist(loghandle)) + RETURN(-ENOENT); + + /* record length should not bigger than */ + if (reclen > loghandle->lgh_hdr->llh_hdr.lrh_len) + RETURN(-E2BIG); + + /* sanity check for fixed-records llog */ + if (idx != LLOG_HEADER_IDX && (llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + LASSERT(llh->llh_size != 0); + LASSERT(llh->llh_size == reclen); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + RETURN(rc); + + /** + * The modification case. + * If idx set then the record with that index must be modified. + * There are three cases possible: + * 1) the common case is the llog header update (idx == 0) + * 2) the llog record modification during llog process. 
+ * This is indicated by the \a loghandle::lgh_cur_idx > 0. + * In that case the \a loghandle::lgh_cur_offset + * 3) otherwise this is assumed that llog consist of records of + * fixed size, i.e. catalog. The llog header must has llh_size + * field equal to record size. The record offset is calculated + * just by /a idx value + * + * During modification we don't need extra header update because + * the bitmap and record count are not changed. The record header + * and tail remains the same too. + */ + if (idx != LLOG_NEXT_IDX) { + /* llog can be empty only when first record is being written */ + LASSERT(ergo(idx > 0, lgi->lgi_attr.la_size > 0)); + + if (!ext2_test_bit(idx, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + RETURN(-ENOENT); + } + + if (idx != rec->lrh_index) { + CERROR("%s: modify index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + RETURN(-EFAULT); + } + + if (idx == LLOG_HEADER_IDX) { + /* llog header update */ + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + lgi->lgi_off = 0; + + /* If it does not indicate the bitmap index + * (reccookie == NULL), then it means update + * the whole update header. Otherwise only + * update header and bits needs to be updated, + * and in DNE cases, it will signaficantly + * shrink the RPC size. + * see distribute_txn_cancel_records()*/ + if (reccookie == NULL) { + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + RETURN(rc); + } + + /* update the header */ + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = llh; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + if (rc != 0) + RETURN(rc); + + /* update the bitmap */ + index = reccookie->lgc_index; + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * + sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = + &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + + RETURN(rc); + } else if (loghandle->lgh_cur_idx > 0) { + /** + * The lgh_cur_offset can be used only if index is + * the same. + */ + if (idx != loghandle->lgh_cur_idx) { + CERROR("%s: modify index mismatch %d %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + + lgi->lgi_off = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " + "len:%u offset %llu\n", + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, + rec->lrh_len, (long long)lgi->lgi_off); + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; + } else { + /* This can be result of lgh_cur_idx is not set during + * llog processing or llh_size is not set to proper + * record size for fixed records llog. Therefore it is + * impossible to get record offset. */ + CERROR("%s: can't get record offset, idx:%d, " + "len:%u.\n", o->do_lu.lo_dev->ld_obd->obd_name, + idx, rec->lrh_len); + RETURN(-EFAULT); + } + + /* update only data, header and tail remain the same */ + lgi->lgi_off += sizeof(struct llog_rec_hdr); + lgi->lgi_buf.lb_len = REC_DATA_LEN(rec); + lgi->lgi_buf.lb_buf = REC_DATA(rec); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /** + * The append case. 
+ * The most common case of using llog. The new index is assigned to + * the new record, new bit is set in llog bitmap and llog count is + * incremented. + * + * Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + */ + + + /* simulate ENOSPC when new plain llog is being added to the + * catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2) && + llh->llh_flags & LLOG_F_IS_CAT) + RETURN(-ENOSPC); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + orig_last_idx = loghandle->lgh_last_idx; + lgi->lgi_off = lgi->lgi_attr.la_size; + + if (loghandle->lgh_max_size > 0 && + lgi->lgi_off >= loghandle->lgh_max_size) { + CDEBUG(D_OTHER, "llog is getting too large (%u > %u) at %u " + DFID"\n", (unsigned)lgi->lgi_off, + loghandle->lgh_max_size, (int)loghandle->lgh_last_idx, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); + /* this is to signal that this llog is full */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + RETURN(-ENOSPC); + } + + left = chunk_size - (lgi->lgi_off & (chunk_size - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + + loghandle->lgh_last_idx++; /* for pad rec */ + } + /* if it's the last idx in log file, then return -ENOSPC + * or wrap around if a catalog */ + if (llog_is_full(loghandle) || + unlikely(llh->llh_flags & LLOG_F_IS_CAT && + OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) && + loghandle->lgh_last_idx >= cfs_fail_val)) { + if (llh->llh_flags & LLOG_F_IS_CAT) + loghandle->lgh_last_idx = 0; + else + RETURN(-ENOSPC); + } + + /* increment the last_idx along with llh_tail index, they should + * be equal for a llog lifetime */ + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LLOG_HDR_TAIL(llh)->lrt_index = index; + /** + * NB: the caller should make sure only 1 process access + * the lgh_last_idx, e.g. append should be exclusive. + * Otherwise it might hit the assert. + */ + LASSERT(index < LLOG_HDR_BITMAP_SIZE(llh)); + rec->lrh_index = index; + lrt = rec_tail(rec); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + + /* the lgh_hdr_mutex protects llog header data from concurrent + * update/cancel, the llh_count and llh_bitmap are protected */ + mutex_lock(&loghandle->lgh_hdr_mutex); + if (ext2_set_bit(index, LLOG_HDR_BITMAP(llh))) { + CERROR("%s: index %u already set in log bitmap\n", + o->do_lu.lo_dev->ld_obd->obd_name, index); + mutex_unlock(&loghandle->lgh_hdr_mutex); + LBUG(); /* should never happen */ + } + llh->llh_count++; + + if (!(llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + /* Update the minimum size of the llog record */ + if (llh->llh_size == 0) + llh->llh_size = reclen; + else if (reclen < llh->llh_size) + llh->llh_size = reclen; + } + + if (lgi->lgi_attr.la_size == 0) { + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_hdr.lrh_len; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } else { + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + + /* Note: If this is not initialization (size == 0), then do not + * write the whole header (8k bytes), only update header/tail + * and bits needs to be updated. 
Because this update might be + * part of cross-MDT operation, which needs to write these + * updates into the update log(32KB limit) and also pack inside + * the RPC (1MB limit), if we write 8K for each operation, which + * will cost a lot space, and keep us adding more updates to one + * update log.*/ + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + + lgi->lgi_off = (unsigned long)LLOG_HDR_TAIL(llh) - + (unsigned long)llh; + lgi->lgi_buf.lb_len = sizeof(llh->llh_tail); + lgi->lgi_buf.lb_buf = LLOG_HDR_TAIL(llh); + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + GOTO(out_unlock, rc); + } + +out_unlock: + /* unlock here for remote object */ + mutex_unlock(&loghandle->lgh_hdr_mutex); + if (rc) + GOTO(out, rc); + + /* computed index can be used to determine offset for fixed-size + * records. This also allows to handle Catalog wrap around case */ + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + (index - 1) * reclen; + } else { + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = max_t(__u64, lgi->lgi_attr.la_size, + lgi->lgi_off); + } + + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc < 0) + GOTO(out, rc); + + CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n", + PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, + lgi->lgi_off); + if (reccookie != NULL) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + RETURN(rc); +out: + /* cleanup llog for error case */ + mutex_lock(&loghandle->lgh_hdr_mutex); + ext2_clear_bit(index, LLOG_HDR_BITMAP(llh)); + llh->llh_count--; + mutex_unlock(&loghandle->lgh_hdr_mutex); + + /* restore llog last_idx */ + if (dt_object_remote(o)) { + loghandle->lgh_last_idx = orig_last_idx; + } else if (--loghandle->lgh_last_idx == 0 && + (llh->llh_flags & LLOG_F_IS_CAT) && llh->llh_cat_idx != 0) { + /* catalog had just wrap-around case */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + } + + LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; + + RETURN(rc); +} + +/** + * We can skip reading at least as many log blocks as the number of + * minimum sized log records we are skipping. If it turns out + * that we are not far enough along the log (because the + * actual records are larger than minimum size) we just skip + * some more records. 
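+ *
+ * For fixed-size records the target offset is simply
+ * chunk_size + (goal - 1) * llh_size; otherwise only
+ * (goal - curr - 1) minimum-sized records are skipped, and the
+ * resulting offset is rounded down to a chunk boundary.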
+ * + * Note: in llog_process_thread, it will use bitmap offset as + * the index to locate the record, which also includs some pad + * records, whose record size is very small, and it also does not + * consider pad record when recording minimum record size (otherwise + * min_record size might be too small), so in some rare cases, + * it might skip too much record for @goal, see llog_osd_next_block(). + * + * When force_mini_rec is true, it means we have to use LLOG_MIN_REC_SIZE + * as the min record size to skip over, usually because in the previous + * try, it skip too much record, see loog_osd_next(prev)_block(). + */ +static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, + int curr, int goal, __u32 chunk_size, + bool force_mini_rec) +{ + struct llog_log_hdr *llh = lgh->lgh_hdr; + + /* Goal should not bigger than the record count */ + if (goal > lgh->lgh_last_idx) + goal = lgh->lgh_last_idx; + + if (goal > curr) { + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + *off = chunk_size + (goal - 1) * llh->llh_size; + } else { + __u64 min_rec_size = LLOG_MIN_REC_SIZE; + + if (llh->llh_size > 0 && !force_mini_rec) + min_rec_size = llh->llh_size; + + *off = *off + (goal - curr - 1) * min_rec_size; + } + } + /* always align with lower chunk boundary*/ + *off &= ~(chunk_size - 1); +} + +/** + * Remove optional fields that the client doesn't expect. + * This is typically in order to ensure compatibility with older clients. + * It is assumed that since we exclusively remove fields, the block will be + * big enough to handle the remapped records. It is also assumed that records + * of a block have the same format (i.e.: the same features enabled). + * + * \param[in,out] hdr Header of the block of records to remap. + * \param[in,out] last_hdr Last header, don't read past this point. + * \param[in] flags Flags describing the fields to keep. + */ +static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, + struct llog_rec_hdr *last_hdr, + enum changelog_rec_flags flags) +{ + if (hdr->lrh_type != CHANGELOG_REC) + return; + + do { + struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1); + + if (unlikely(hdr->lrh_len == 0)) { + /* It is corruption case, we cannot know the next rec, + * jump to the last one directly to avoid dead loop. */ + LCONSOLE(D_WARNING, "Hit invalid llog record: " + "idx %u, type %u, id %u\n", + hdr->lrh_index, hdr->lrh_type, hdr->lrh_id); + hdr = llog_rec_hdr_next(last_hdr); + if (unlikely(hdr == last_hdr)) + LCONSOLE(D_WARNING, "The last record crashed: " + "idx %u, type %u, id %u\n", + hdr->lrh_index, hdr->lrh_type, + hdr->lrh_id); + break; + } + + changelog_remap_rec(rec, rec->cr_flags & flags); + hdr = llog_rec_hdr_next(hdr); + /* Yield CPU to avoid soft-lockup if there are too many records + * to be handled. */ + cond_resched(); + } while ((char *)hdr <= (char *)last_hdr); +} + +/** + * Implementation of the llog_operations::lop_next_block + * + * This function finds the the next llog block to return which contains + * record with required index. It is main part of llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in,out] cur_idx index preceeding cur_offset + * \param[in] next_idx target index to find + * \param[in,out] cur_offset furtherst point read in the file + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is + * usually llog chunk_size. 
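+ * It must be a multiple of the llog chunk size;
+ * each read stops at the next chunk boundary.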
+ * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + __u32 chunk_size; + int last_idx = *cur_idx; + __u64 last_offset = *cur_offset; + bool force_mini_rec = false; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off" + "%llu), size %llu\n", next_idx, *cur_idx, + *cur_offset, lgi->lgi_attr.la_size); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(loghandle, cur_offset, *cur_idx, + next_idx, chunk_size, force_mini_rec); + + /* read up to next llog chunk_size block */ + lgi->lgi_buf.lb_len = chunk_size - + (*cur_offset & (chunk_size - 1)); + lgi->lgi_buf.lb_buf = buf; + + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + if (rc < 0) { + if (rc == -EBADR && !force_mini_rec) + goto retry; + + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) { /* end of file, nothing to do */ + if (!force_mini_rec) + goto retry; + GOTO(out, rc); + } + + if (rc < sizeof(*tail)) { + if (!force_mini_rec) + goto retry; + + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + tail->lrt_len); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + + if (last_rec->lrh_index != tail->lrt_index) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu last_rec idx %u tail idx %u" + "lrt len %u read_size %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, + last_rec->lrh_index, tail->lrt_index, + tail->lrt_len, rc); + GOTO(out, rc = -EINVAL); + } + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DFID":%x " + "offset %llu bytes %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, *cur_offset, rc); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) { + last_idx = *cur_idx; + last_offset = *cur_offset; + continue; + } + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. 
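+ * If it does, the skip heuristic above may have jumped over the
+ * target; in that case we retry with LLOG_MIN_REC_SIZE steps,
+ * otherwise fail with -ENOENT.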
*/ + if (rec->lrh_index > next_idx) { + if (!force_mini_rec && next_idx > last_idx) + goto retry; + + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + changelog_block_trim_ext(rec, last_rec, + CLF_VERSION | CLF_RENAME); + + GOTO(out, rc = 0); + +retry: + /* Note: because there are some pad records in the + * llog, so llog_skip_over() might skip too much + * records, let's try skip again with minimum record */ + force_mini_rec = true; + *cur_offset = last_offset; + *cur_idx = last_idx; + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +/** + * Implementation of the llog_operations::lop_prev_block + * + * This function finds the llog block to return which contains + * record with required index but in reverse order - from end of llog + * to the beginning. + * It is main part of reverse llog processing. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * \param[in] prev_idx target index to find + * \param[in] buf pointer to data buffer to fill + * \param[in] len required len to read, it is llog_chunk_size usually. + * + * \retval 0 on successful buffer read + * \retval negative value on error + */ +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + __u32 chunk_size; + int rc; + + ENTRY; + + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + /* Let's only use mini record size for previous block read + * for now XXX */ + cur_offset = chunk_size; + llog_skip_over(loghandle, &cur_offset, 0, prev_idx, + chunk_size, true); + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset %llu: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: 
invalid llog tail at log id "DFID":%x " + "offset %llu\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + + /* Trim unsupported extensions for compat w/ older clients */ + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + changelog_block_trim_ext(rec, last_rec, + CLF_VERSION | CLF_RENAME); + + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +/** + * This is helper function to get llog directory object. It is used by named + * llog operations to find/insert/delete llog entry from llog directory. + * + * \param[in] env execution environment + * \param[in] ctxt llog context + * + * \retval dt_object of llog directory + * \retval ERR_PTR of negative value on error + */ +static struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + + if (!IS_ERR(dir) && !dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + return ERR_PTR(-ENOTDIR); + } + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +/** + * Implementation of the llog_operations::lop_open + * + * This function opens the llog by its logid or by name, it may open also + * non existent llog and assing then new id to it. + * The llog_open/llog_close pair works similar to lu_object_find/put, + * the object may not exist prior open. The result of open is just dt_object + * in the llog header. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * \param[in] logid logid of llog to open (nameless llog) + * \param[in] name name of llog to open (named llog) + * \param[in] open_param + * LLOG_OPEN_NEW - new llog, may not exist + * LLOG_OPEN_EXIST - old llog, must exist + * + * \retval 0 on successful open, llog_handle::lgh_obj + * contains the dt_object of the llog. + * \retval negative value on error + */ +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los = NULL; + int rc = 0; + bool new_id = false; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct lu_object_conf conf = { 0 }; + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else { + /* If logid == NULL, then it means the caller needs + * to allocate new FID (llog_cat_declare_add_rec()). 
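+ * A newly allocated FID implies the backing object does not
+ * exist yet, which is why LOC_F_NEW is set in the
+ * lu_object_conf below.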
*/ + rc = obd_fid_alloc(env, ctxt->loc_exp, + &lgi->lgi_fid, NULL); + if (rc < 0) + RETURN(rc); + rc = 0; + conf.loc_flags = LOC_F_NEW; + } + + o = dt_locate_at(env, dt, &lgi->lgi_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + goto after_open; + } + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + new_id = true; + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ +generate: + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + new_id = true; + } + + o = ls_locate(env, ls, &lgi->lgi_fid, NULL); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + if (dt_object_exists(o) && new_id) { + /* llog exists with just generated ID, e.g. some old llog file + * still is in use or is orphan, drop a warn and skip it. */ + CDEBUG(D_INFO, "%s: llog exists with the same FID: "DFID + ", skipping\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu))); + dt_object_put(env, o); + /* just skip this llog ID, we shouldn't delete it because we + * don't know exactly what is its purpose and state. */ + goto generate; + } + +after_open: + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) { + CDEBUG(D_INFO, "%s: llog FID: "DFID" obj %p doesn`t exist\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), o); + GOTO(out_put, rc = -ENOENT); + } + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + dt_object_put(env, o); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + if (los != NULL) + dt_los_put(los); + RETURN(rc); +} + +/** + * Get dir for regular fid log object + * + * Get directory for regular fid log object, and these regular fid log + * object will be inserted under this directory, to satisfy the FS + * consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * + * \retval pointer to the directory if it is found. + * \retval ERR_PTR(negative errno) if it fails. 
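+ *
+ * The directory FID is derived from the FLD lookup of the sequence of
+ * \a dto, i.e. from the index of the MDT that owns that sequence.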
+ */ +struct dt_object *llog_osd_get_regular_fid_dir(const struct lu_env *env, + struct dt_object *dto) +{ + struct llog_thread_info *lgi = llog_info(env); + struct seq_server_site *ss = dto->do_lu.lo_dev->ld_site->ld_seq_site; + struct lu_seq_range *range = &lgi->lgi_range; + struct lu_fid *dir_fid = &lgi->lgi_fid; + struct dt_object *dir; + int rc; + ENTRY; + + fld_range_set_any(range); + LASSERT(ss != NULL); + rc = ss->ss_server_fld->lsf_seq_lookup(env, ss->ss_server_fld, + fid_seq(lu_object_fid(&dto->do_lu)), range); + if (rc < 0) + RETURN(ERR_PTR(rc)); + + lu_update_log_dir_fid(dir_fid, range->lsr_index); + dir = dt_locate(env, lu2dt_dev(dto->do_lu.lo_dev), dir_fid); + if (IS_ERR(dir)) + RETURN(dir); + + if (!dt_try_as_dir(env, dir)) { + dt_object_put(env, dir); + RETURN(ERR_PTR(-ENOTDIR)); + } + + RETURN(dir); +} + +/** + * Add llog object with regular FID to name entry + * + * Add llog object with regular FID to name space, and each llog + * object on each MDT will be /update_log_dir/[seq:oid:ver], + * so to satisfy the namespace consistency check, e2fsck etc. + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. + */ +static int +llog_osd_regular_fid_add_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + rec->rec_fid = fid; + rec->rec_type = S_IFREG; + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th); + } else { + rc = dt_insert(env, dir, (struct dt_rec *)rec, + (struct dt_key *)name, th, 1); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_declare_create + * + * This function declares the llog create. It declares also name insert + * into llog directory in case of named llog. 
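+ * For llogs with regular FIDs (LLOG_CTXT_FLAG_NORMAL_FID) the name
+ * entry is declared under the update_log_dir instead, via
+ * llog_osd_regular_fid_add_name_entry().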
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create declaration + * \retval negative value on error + */ +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc < 0) + RETURN(rc); + + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, true); + + RETURN(rc); + } + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + /* do not declare header initialization here as it's declared + * in llog_osd_declare_write_rec() which is always called */ + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, th); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_create + * + * This function creates the llog according with llog_handle::lgh_obj + * and llog_handle::lgh_name. 
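+ * The actual creation is done under the object write lock, so if a
+ * racing thread created the object first this function returns
+ * -EEXIST.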
+ * + * \param[in] env execution environment + * \param[in] res llog handle of the current llog + * \param[in] th current transaction handle + * + * \retval 0 on successful create + * \retval negative value on error + */ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_insert_rec *rec = &lgi->lgi_dt_rec; + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + if (res->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE | LA_SIZE | LA_TYPE; + lgi->lgi_attr.la_size = 0; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + dt_write_lock(env, o, 0); + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); + if (rc < 0) + RETURN(rc); + + rc = llog_osd_regular_fid_add_name_entry(env, o, th, false); + + RETURN(rc); + } + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rec->rec_fid = &lgi->lgi_fid; + rec->rec_type = S_IFREG; + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, + (struct dt_key *)res->lgh_name, + th, 1); + dt_read_unlock(env, llog_dir); + dt_object_put(env, llog_dir); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_close + * + * This function closes the llog. It just put llog object and referenced + * local storage. + * + * \param[in] env execution environment + * \param[in] handle llog handle of the current llog + * + * \retval 0 on successful llog close + * \retval negative value on error + */ +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + if (handle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + /* Remove the object from the cache, otherwise it may + * hold LOD being released during cleanup process */ + dt_object_put_nocache(env, handle->lgh_obj); + LASSERT(handle->private_data == NULL); + RETURN(rc); + } else { + dt_object_put(env, handle->lgh_obj); + } + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +/** + * delete llog object name entry + * + * Delete llog object (with regular FID) from name space (under + * update_log_dir). + * + * \param [in] env execution environment + * \param [in] dto llog object + * \param [in] th thandle + * \param [in] declare if it is declare or execution + * + * \retval 0 if deletion succeeds. + * \retval negative errno if deletion fails. 
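+ *
+ * The name entry is the string form of the llog FID, matching what
+ * llog_osd_regular_fid_add_name_entry() inserted.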
+ */ +static int +llog_osd_regular_fid_del_name_entry(const struct lu_env *env, + struct dt_object *dto, + struct thandle *th, bool declare) +{ + struct llog_thread_info *lgi = llog_info(env); + const struct lu_fid *fid = lu_object_fid(&dto->do_lu); + struct dt_object *dir; + char *name = lgi->lgi_name; + int rc; + ENTRY; + + if (!fid_is_norm(fid)) + RETURN(0); + + dir = llog_osd_get_regular_fid_dir(env, dto); + if (IS_ERR(dir)) + RETURN(PTR_ERR(dir)); + + snprintf(name, sizeof(lgi->lgi_name), DFID, PFID(fid)); + dt_write_lock(env, dir, 0); + if (declare) { + rc = dt_declare_delete(env, dir, (struct dt_key *)name, + th); + } else { + rc = dt_delete(env, dir, (struct dt_key *)name, th); + } + dt_write_unlock(env, dir); + + dt_object_put(env, dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_declare_destroy + * + * This function declare destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_declare_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + if (rc < 0) + GOTO(out_put, rc); + } + + rc = dt_declare_ref_del(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + rc = dt_declare_destroy(env, o, th); + if (rc < 0) + GOTO(out_put, rc); + + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, true); + if (rc < 0) + GOTO(out_put, rc); + } + +out_put: + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + + RETURN(rc); +} + + +/** + * Implementation of the llog_operations::lop_destroy + * + * This function destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * Destroy method is not part of external transaction and does everything + * inside. 
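+ * The object is destroyed under its write lock; a named llog is first
+ * unlinked from the llog directory, and a regular-FID llog also drops
+ * its update_log_dir name entry.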
+ * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt != NULL); + + o = loghandle->lgh_obj; + LASSERT(o != NULL); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + GOTO(out_unlock, rc = 0); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_unlock, rc = PTR_ERR(llog_dir)); + + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + loghandle->lgh_name, rc); + GOTO(out_unlock, rc); + } + } + + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc < 0) + GOTO(out_unlock, rc); + + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); + if (rc < 0) + GOTO(out_unlock, rc); + } + +out_unlock: + dt_write_unlock(env, o); + if (!(IS_ERR_OR_NULL(llog_dir))) + dt_object_put(env, llog_dir); + RETURN(rc); +} + +/** + * Implementation of the llog_operations::lop_setup + * + * This function setup the llog on local storage. + * + * \param[in] env execution environment + * \param[in] obd obd device the llog belongs to + * \param[in] olg the llog group, it is always zero group now. + * \param[in] ctxt_idx the llog index, it defines the purpose of this llog. + * Every new llog type have to use own index. + * \param[in] disk_obd the storage obd, where llog is stored. + * + * \retval 0 on successful llog setup + * \retval negative value on error + */ +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + if (disk_obd == NULL) + GOTO(out, rc = 0); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_nameless); + if (rc != 0) + GOTO(out, rc); + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, + &ctxt->loc_los_named); + if (rc != 0) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + GOTO(out, rc); + +out: + llog_ctxt_put(ctxt); + return rc; +} + +/** + * Implementation of the llog_operations::lop_cleanup + * + * This function cleanups the llog on local storage. 
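+ * It releases the nameless and named local OID storages that were
+ * initialized in llog_osd_setup(), if present.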
+ * + * \param[in] env execution environment + * \param[in] ctxt the llog context + * + * \retval 0 + */ +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + if (ctxt->loc_los_nameless != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_nameless); + ctxt->loc_los_nameless = NULL; + } + + if (ctxt->loc_los_named != NULL) { + local_oid_storage_fini(env, ctxt->loc_los_named); + ctxt->loc_los_named = NULL; + } + + return 0; +} + +struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +struct llog_operations llog_common_cat_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, + .lop_add = llog_cat_add_rec, + .lop_declare_add = llog_cat_declare_add_rec, +}; +EXPORT_SYMBOL(llog_common_cat_ops); + +/** + * Read the special file which contains the list of llog catalogs IDs + * + * This function reads the CATALOGS file which contains the array of llog + * catalogs IDs. The main purpose of this file is to store OSP llogs indexed + * by OST/MDT number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to read + * \param[out] idarray the buffer for the data. If it is NULL then + * function returns just number of catalog IDs + * in the file. 
+ * \param[in] fid LLOG_CATALOGS_OID for CATALOG object + * + * \retval 0 on successful read of catalog IDs + * \retval negative value on error + * \retval positive value which is number of records in + * the file if \a idarray is NULL + */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray, + const struct lu_fid *fid) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lgi->lgi_fid = *fid; + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + th->th_wait_submit = 1; + /* Make the llog object creation synchronization, so + * it will be reliable to the reference, especially + * for remote reference */ + th->th_sync = 1; + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (rc) + GOTO(out, rc); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", + (int)lgi->lgi_attr.la_size, size); + + /* return just number of llogs */ + if (idarray == NULL) { + rc = lgi->lgi_attr.la_size / sizeof(*idarray); + GOTO(out, rc); + } + + /* read for new ost index or for empty file */ + memset(idarray, 0, size); + if (lgi->lgi_attr.la_size <= lgi->lgi_off) + GOTO(out, rc = 0); + if (lgi->lgi_attr.la_size < lgi->lgi_off + size) + size = lgi->lgi_attr.la_size - lgi->lgi_off; + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + /* -EFAULT means the llog is a sparse file. This is not an error + * after arbitrary OST index is supported. */ + if (rc < 0 && rc != -EFAULT) { + CERROR("%s: error reading CATALOGS: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + EXIT; +out: + dt_object_put(env, o); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_get_cat_list); + +/** + * Write the special file which contains the list of llog catalogs IDs + * + * This function writes the CATALOG file which contains the array of llog + * catalogs IDs. It is used mostly to store OSP llogs indexed by OST/MDT + * number. + * + * \param[in] env execution environment + * \param[in] d corresponding storage device + * \param[in] idx position to start from, usually OST/MDT index + * \param[in] count how many catalog IDs to write + * \param[out] idarray the buffer with the data to write. 
+ * \param[in] fid	LLOG_CATALOGS_OID for CATALOG object
+ *
+ * \retval 0 on successful write of catalog IDs
+ * \retval negative value on error
+ */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray,
+			  const struct lu_fid *fid)
+{
+	struct llog_thread_info *lgi = llog_info(env);
+	struct dt_object *o = NULL;
+	struct thandle *th;
+	int rc, size;
+
+	if (count == 0)
+		RETURN(0);
+
+	LASSERT(d);
+
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx * sizeof(*idarray);
+	lgi->lgi_fid = *fid;
+
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o))
+		GOTO(out, rc = -ENOENT);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	lgi->lgi_buf.lb_len = size;
+	lgi->lgi_buf.lb_buf = idarray;
+	rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	/* For the update log this happens during initialization, see
+	 * lod_sub_prep_llog(). We need to make sure the catalog file ID
+	 * is written (committed) to the CATALOGS file before any
+	 * cross-MDT operation writes update records to that catalog;
+	 * otherwise these update records might be missing after a
+	 * failover. */
+	if (fid_is_update_log(fid))
+		th->th_sync = 1;
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	th->th_wait_submit = 1;
+
+	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+	if (rc)
+		CDEBUG(D_INODE, "can't write CATALOGS at index %d: rc = %d\n",
+		       idx, rc);
+out_trans:
+	dt_trans_stop(env, d, th);
+out:
+	dt_object_put(env, o);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_put_cat_list);
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c
new file mode 100644
index 0000000000000..3ab0b430fca14
--- /dev/null
+++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c
@@ -0,0 +1,472 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ * + * Author: jacob berkman + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi.oi_fid: "DFID"\n", + PFID(&d->lgd_logid.lgl_oi.oi_fid)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s (&fid->f_seq); + __swab32s (&fid->f_oid); + __swab32s (&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq) || + fid_seq_is_default(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { + lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s (&d->lgd_ctxt_idx); + __swab32s (&d->lgd_llh_flags); + __swab32s (&d->lgd_index); + __swab32s (&d->lgd_saved_index); + __swab32s (&d->lgd_len); + __swab64s (&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s (&d->lgdc_gen.mnt_cnt); + __swab64s (&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s (&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); +} + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + __swab64s (&range->lsr_start); + __swab64s (&range->lsr_end); + __swab32s (&range->lsr_index); + __swab32s (&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_update_ops(struct update_ops *uops, unsigned int op_count) +{ + unsigned int i; + unsigned int j; + + for (i = 0; i < op_count; i++) { + lustre_swab_lu_fid(&uops->uops_op[i].uop_fid); + __swab16s(&uops->uops_op[i].uop_type); + __swab16s(&uops->uops_op[i].uop_param_count); + for (j = 0; j < uops->uops_op[i].uop_param_count; j++) + __swab16s(&uops->uops_op[i].uop_params_off[j]); + } +} +EXPORT_SYMBOL(lustre_swab_update_ops); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + 
(struct llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = + (struct llog_changelog_rec *)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (cr->cr.cr_flags & CLF_RENAME) { + struct changelog_ext_rename *rnm = + changelog_rec_rename(&cr->cr); + + lustre_swab_lu_fid(&rnm->cr_sfid); + lustre_swab_lu_fid(&rnm->cr_spfid); + } + /* Because the tail follows a variable-length structure we need + * to compute its location at runtime */ + tail = (struct llog_rec_tail *)((char *)&cr->cr + + changelog_rec_size(&cr->cr) + + cr->cr.cr_namelen); + break; + } + + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec*)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + tail = &cur->cur_tail; + break; + } + + case HSM_AGENT_REC: { + struct llog_agent_req_rec *arr = + (struct llog_agent_req_rec *)rec; + + __swab32s(&arr->arr_hai.hai_len); + __swab32s(&arr->arr_hai.hai_action); + lustre_swab_lu_fid(&arr->arr_hai.hai_fid); + lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); + __swab64s(&arr->arr_hai.hai_cookie); + __swab64s(&arr->arr_hai.hai_extent.offset); + __swab64s(&arr->arr_hai.hai_extent.length); + __swab64s(&arr->arr_hai.hai_gid); + /* no swabing for opaque data */ + /* hai_data[0]; */ + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + __swab64s(&lsr->lsr_valid); + + if (rec->lrh_len > sizeof(struct llog_setattr64_rec)) { + struct llog_setattr64_rec_v2 *lsr2 = + (struct llog_setattr64_rec_v2 *)rec; + + __swab32s(&lsr2->lsr_projid); + tail = &lsr2->lsr_tail; + } else { + tail = &lsr->lsr_tail; + } + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = LLOG_HDR_TAIL(llh); + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + case UPDATE_REC: + { + struct llog_update_record *lur = + (struct llog_update_record *)rec; + struct update_records *record = &lur->lur_update_rec; + + __swab32s(&record->ur_flags); + __swab64s(&record->ur_batchid); + __swab64s(&record->ur_master_transno); + __swab32s(&record->ur_param_count); + __swab32s(&record->ur_update_count); + lustre_swab_update_ops(&record->ur_ops, + record->ur_update_count); + + /* Compute tail location. 
*/ + tail = (struct llog_rec_tail *)((char *)record + + update_records_size(record)); + break; + } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", + LLOG_HDR_TAIL(h)->lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", + LLOG_HDR_TAIL(h)->lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d %s\n", + i, lcfg->lcfg_buflens[i], + lustre_cfg_string(lcfg, i)); + } + + EXIT; +} +EXPORT_SYMBOL(print_lustre_cfg); + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; +} + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the 
original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " + "for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c new file mode 100644 index 0000000000000..27f52aa15078b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c @@ -0,0 +1,2151 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_test.c + * + * Author: Phil Schwan + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include + +/* This is slightly more than the number of records that can fit into a + * single llog file, because the llog_log_header takes up some of the + * space in the first block that cannot be used for the bitmap. 
*/ +#define LLOG_TEST_RECNUM (LLOG_MIN_CHUNK_SIZE * 8) + +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + if (ext2_test_bit(i, LLOG_HDR_BITMAP(llh->lgh_hdr))) { + last_idx = i; + active_recs++; + } + } + + /* check the llog is sane at first, llh_count and lgh_last_idx*/ + if (llh->lgh_hdr->llh_count != active_recs) { + CERROR("%s: handle->count is %d, but there are %d recs found\n", + test, llh->lgh_hdr->llh_count, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx != LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index || + (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) && + llh->lgh_last_idx < last_idx)) { + CERROR("%s: lgh_last_idx is %d (%d in the header), last found %d\n", + test, llh->lgh_last_idx, + LLOG_HDR_TAIL(llh->lgh_hdr)->lrt_index, last_idx); + RETURN(-ERANGE); + } + + /* finally checks against expected value from the caller */ + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + return LLOG_DEL_RECORD; +} + +/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &lgh, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, 
&uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = lgh->lgh_id; + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf02f02; + + /* Check llog header values are correct after record add/cancel */ + CWARN("2b: write 1 llog records, check llh_count\n"); + rc = llog_write(env, lgh, &lmr.lmr_hdr, LLOG_NEXT_IDX); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values after record addition */ + rc = verify_handle("2b", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to read on-disk values */ + llog_close(env, lgh); + + CWARN("2c: re-open the log by LOGID and verify llh_count\n"); + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc < 0) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* check values just read from disk */ + rc = verify_handle("2c", lgh, 2); + if (rc < 0) + GOTO(out_close, rc); + + rc = llog_process(env, lgh, test_2_cancel_cb, NULL, NULL); + if (rc < 0) + GOTO(out_close, rc); + + /* in-memory values */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + /* re-open llog to get on-disk values */ + llog_close(env, lgh); + + rc = llog_open(env, ctxt, &lgh, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + /* on-disk values after llog re-open */ + rc = verify_handle("2c", lgh, 1); + if (rc < 0) + GOTO(out_close, rc); + + CWARN("2d: destroy this log\n"); + rc = llog_destroy(env, lgh); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, lgh); +out_close_llh: + if (rc) + llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +static int test_3_rec_num; +static off_t test_3_rec_off; +static int test_3_paddings; +static int test_3_start_idx; + +/* + * Test 3 callback. + * - check lgh_cur_offset correctness + * - check record index consistency + * - modify each record in-place + * - add new record during *last_idx processing + */ +static int test3_check_n_add_cb(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + int *last_rec = data; + unsigned cur_idx = test_3_start_idx + test_3_rec_num; + int rc; + + if (lgh->lgh_hdr->llh_flags & LLOG_F_IS_FIXSIZE) { + LASSERT(lgh->lgh_hdr->llh_size > 0); + if (lgh->lgh_cur_offset != lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size) + CERROR("Wrong record offset in cur_off: %llu, should be %u\n", + lgh->lgh_cur_offset, + lgh->lgh_hdr->llh_hdr.lrh_len + + (cur_idx - 1) * lgh->lgh_hdr->llh_size); + } else { + size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; + + /* For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. 
*/ + if (test_3_rec_off == 0) + test_3_rec_off = lgh->lgh_cur_offset; + + if (lgh->lgh_cur_offset != test_3_rec_off) { + __u64 tmp = lgh->lgh_cur_offset; + + /* there can be padding record */ + if ((do_div(tmp, chunk_size) == 0) && + (lgh->lgh_cur_offset - test_3_rec_off < + rec->lrh_len + LLOG_MIN_REC_SIZE)) { + test_3_rec_off = lgh->lgh_cur_offset; + test_3_paddings++; + } else { + CERROR("Wrong record offset in cur_off: %llu" + ", should be %lld (rec len %u)\n", + lgh->lgh_cur_offset, + (long long)test_3_rec_off, + rec->lrh_len); + } + } + test_3_rec_off += rec->lrh_len; + } + + cur_idx += test_3_paddings; + if (cur_idx != rec->lrh_index) + CERROR("Record with wrong index was read: %u, expected %u\n", + rec->lrh_index, cur_idx); + + /* modify all records in place */ + lgr->lgr_gen.conn_cnt = rec->lrh_index; + rc = llog_write(env, lgh, rec, rec->lrh_index); + if (rc < 0) + CERROR("cb_test_3: cannot modify record while processing\n"); + + /* Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing */ + if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { + rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); + if (rc < 0) + CERROR("cb_test_3: cannot add new record while " + "processing\n"); + } + test_3_rec_num++; + + return rc; +} + +/* Check in-place modifications were done for all records*/ +static int test3_check_cb(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + if (lgr->lgr_gen.conn_cnt != rec->lrh_index) { + CERROR("cb_test_3: record %u is not modified\n", + rec->lrh_index); + return -EINVAL; + } + test_3_rec_num++; + return 0; +} + +static int llog_test3_process(const struct lu_env *env, + struct llog_handle *lgh, + llog_cb_t cb, int start) +{ + struct llog_process_cat_data cd; + int last_idx; /* new record will be injected here */ + int rc = 0; + + CWARN("test3: processing records from index %d to the end\n", + start); + cd.lpcd_first_idx = start - 1; + cd.lpcd_last_idx = 0; + test_3_rec_num = test_3_paddings = 0; + last_idx = lgh->lgh_last_idx; + rc = llog_process(env, lgh, cb, &last_idx, &cd); + if (rc < 0) + return rc; + CWARN("test3: total %u records processed with %u paddings\n", + test_3_rec_num, test_3_paddings); + return test_3_rec_num; +} + +/* Test plain llog functionality */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + char buf[128]; + struct llog_rec_hdr *hdr = (void *)buf; + int rc, i; + int num_recs = 1; /* 1 for the header */ + int expected; + + ENTRY; + + hdr->lrh_len = sizeof(struct llog_gen_rec); + hdr->lrh_type = LLOG_GEN_REC; + llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); + llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; + + /* Fill the llog with 64-bytes records, use 1023 records, + * so last chunk will be partially full. Don't change this + * value until record size is changed. 
+	 */
+	CWARN("3a: write 1023 fixed-size llog records\n");
+	for (i = 0; i < 1023; i++) {
+		rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX);
+		if (rc < 0) {
+			CERROR("3a: write 1023 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3a", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	/*
+	 * Test fixed-size records processing:
+	 * - search the needed index
+	 * - go through all records from that index
+	 * - check all indices are growing monotonically and exist
+	 * - modify each record
+	 *
+	 * NB: test3_check_n_add adds two new records while processing
+	 * after the last record. There were 1023 records created so the last
+	 * chunk misses exactly one record. Therefore one of the new records
+	 * will be the last in the current chunk and the second causes a new
+	 * chunk to be created.
+	 */
+	test_3_rec_off = 0;
+	test_3_start_idx = 501;
+	expected = 525;
+	rc = llog_test3_process(env, llh, test3_check_n_add_cb,
+				test_3_start_idx);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* extra record is created during llog_process() */
+	if (rc != expected) {
+		CERROR("3a: process total %d records but expect %d\n",
+		       rc, expected);
+		RETURN(-ERANGE);
+	}
+
+	num_recs += 2;
+
+	/* test modification in place */
+	rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx);
+	if (rc < 0)
+		RETURN(rc);
+
+	if (rc != expected) {
+		CERROR("3a: process total %d records but expect %d\n",
+		       rc, expected);
+		RETURN(-ERANGE);
+	}
+
+	CWARN("3b: write 566 variable size llog records\n");
+
+	/* Drop llh_size to 0 to mark llog as variable-size and write
+	 * header to make this change permanent. */
+	llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE;
+	llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX);
+
+	hdr->lrh_type = OBD_CFG_REC;
+
+	/* there are 1025 64-byte records in the llog already,
+	 * the last chunk contains a single record, i.e. 64 bytes.
+	 * Each pair of variable size records is 200 bytes, so
+	 * we will have the following distribution per chunk:
+	 * block 1: 64 + 80(80/120) + 80 + 48(pad) = 81 iterations
+	 * block 2: 80(120/80) + 120 + 72(pad) = 81 iterations
+	 * block 3: 80(80/120) + 80 + 112(pad) = 81 iterations
+	 * -- the same as block 2 again and so on.
+	 * block 7: 80(80/120) = 80 iterations and 192 bytes remain
+	 * Total 6 * 81 + 80 = 566 iterations.
+	 * Callback will add another 120 bytes in the end of the last chunk
+	 * and another 120 bytes will cause padding (72 bytes) plus 120
+	 * bytes in the new block.
+ */ + for (i = 0; i < 566; i++) { + if ((i % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 120; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("3b: write 566 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + test_3_start_idx = 1026; + expected = 568; + rc = llog_test3_process(env, llh, test3_check_n_add_cb, + test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + num_recs += 2; + + /* test modification in place */ + rc = llog_test3_process(env, llh, test3_check_cb, test_3_start_idx); + if (rc < 0) + RETURN(rc); + + if (rc != expected) { + CERROR("3b: process total %d records but expect %d\n", + rc, expected); + RETURN(-ERANGE); + } + + CWARN("3c: write records with variable size until BITMAP_SIZE, " + "return -ENOSPC\n"); + while (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + if ((num_recs % 2) == 0) + hdr->lrh_len = 80; + else + hdr->lrh_len = 128; + + rc = llog_write(env, llh, hdr, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3c: write recs failed at #%d: %d\n", + num_recs, rc); + RETURN(rc); + } + num_recs++; + } + + if (rc != -ENOSPC) { + CWARN("3c: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3c: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3c", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog 
appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_MIN_CHUNK_SIZE; + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec = (void *)buf; + rec->lrh_len = buflen; + rec->lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, rec, NULL); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == LLOG_TEST_RECNUM) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: print the catalog entries.. 
we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with llog_cancel_rec_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed: " + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY\n", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + GOTO(parse_out, rc); + } + + 
plain_counter = 1; /* llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_setattr64_rec_v2 lsr64_v2; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec lcur; /* CHANGELOG_USER_REC */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: processed %d records from %d 
total\n", + process_count, num_recs); + GOTO(out_close, rc = -EINVAL); + } + + plain_counter = 0; + rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL); + if (rc && rc != LLOG_DEL_PLAIN) { + CERROR("7_sub: reverse llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + if (process_count != plain_counter) { + CERROR("7_sub: Reverse/direct processing found different" + "number of records: %d/%d\n", + plain_counter, process_count); + GOTO(out_close, rc = -EINVAL); + } + if (llog_exist(llh)) { + CERROR("7_sub: llog exists but should be zapped\n"); + GOTO(out_close, rc = -EEXIST); + } + + rc = verify_handle("7_sub", llh, 1); +out_close: + if (rc) + llog_destroy(env, llh); + llog_close(env, llh); + RETURN(rc); +} + +/* Test all llog records writing and processing */ +static int llog_test_7(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("7a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7b: test llog_unlink64_rec\n"); + llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur); + llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur); + llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7b: llog_unlink_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7c: test llog_setattr64_rec\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7c: llog_setattr64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7d: test llog_size_change_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7d: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7e: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7e: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7f: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7f: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7g: test llog_gen_rec\n"); + llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7g: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7h: test llog_setattr64_rec_v2\n"); + 
llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64_v2); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64_v2); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7h: llog_setattr64_rec_v2 test failed\n"); + GOTO(out, rc); + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int llog_truncate(const struct lu_env *env, struct dt_object *o) +{ + struct lu_attr la; + struct thandle *th; + struct dt_device *d; + int rc; + ENTRY; + + LASSERT(o); + d = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(d); + + rc = dt_attr_get(env, o, &la); + if (rc) + RETURN(rc); + + CDEBUG(D_OTHER, "original size %llu\n", la.la_size); + rc = sizeof(struct llog_log_hdr) + sizeof(struct llog_mini_rec); + if (la.la_size < rc) { + CERROR("too small llog: %llu\n", la.la_size); + RETURN(0); + } + + /* drop 2 records */ + la.la_size = la.la_size - (sizeof(struct llog_mini_rec) * 2); + la.la_valid = LA_SIZE; + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_attr_set(env, o, &la, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(stop, rc); + + rc = dt_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + + rc = dt_attr_set(env, o, &la, th); + if (rc) + GOTO(stop, rc); + +stop: + dt_trans_stop(env, d, th); + + RETURN(rc); +} + +static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + return 0; +} + +static int llog_test_8(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2, i; + int orig_counter; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct dt_object *obj = NULL; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + CWARN("8a: fill the first plain llog\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + plain_counter = 0; + rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0); + if (rc != 0) { + CERROR("5a: process with test_8_cb failed: %d\n", rc); + GOTO(out, rc); + } + orig_counter = plain_counter; + + for (i = 0; i < 100; i++) { + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("5a: add record failed\n"); + GOTO(out, rc); + } + } + + /* grab the current plain llog, we'll corrupt it later */ + obj = llh->u.chd.chd_current_log->lgh_obj; + LASSERT(obj); + lu_object_get(&obj->do_lu); + CWARN("8a: pin llog "DFID"\n", PFID(lu_object_fid(&obj->do_lu))); + + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8a: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + GOTO(out_put, rc); + } + + CWARN("8b: fill the second plain llog\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8b: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8b: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + for (i = 0; i < 100; i++) 
{ + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("8b: add record failed\n"); + GOTO(out, rc); + } + } + CWARN("8b: second llog "DFID"\n", + PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); + + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + GOTO(out_put, rc); + } + + CWARN("8c: drop two records from the first plain llog\n"); + llog_truncate(env, obj); + + CWARN("8d: count survived records\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("8d: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("8d: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + plain_counter = 0; + rc = llog_cat_process(env, llh, test_8_cb, "foobar", 0, 0); + if (rc != 0) { + CERROR("8d: process with test_8_cb failed: %d\n", rc); + GOTO(out, rc); + } + + if (orig_counter + 200 - 2 != plain_counter) { + CERROR("found %d records (expected %d)\n", plain_counter, + orig_counter + 200 - 2); + rc = -EIO; + } + +out: + CWARN("8d: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("8d: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + if (obj != NULL) + dt_object_put(env, obj); + + RETURN(rc); +} + +static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc != 0) { + CERROR("9_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc != 0) { + CERROR("9_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid_to_fid(&llh->lgh_id, &fid); + fid_to_logid(&fid, &llog_records.llr.lid_id); + rc = llog_write(env, llh, &llog_records.lrh, LLOG_NEXT_IDX); + if (rc < 0) { + CERROR("9_sub: write recs failed at #1: %d\n", rc); + GOTO(out_close, rc); + } + CWARN("9_sub: record type %x in log "DFID_NOBRACE"\n", + llog_records.lrh.lrh_type, PFID(&fid)); +out_close: + llog_close(env, llh); + RETURN(rc); +} + +/* Prepare different types of llog records for llog_reader test*/ +static int llog_test_9(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("9a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9b: test llog_obd_cfg_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OBD_CFG_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9b: llog_obd_cfg_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9c: test llog_changelog_rec\n"); + /* Direct access to cr_do_not_use: peculiar case for this test */ + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_do_not_use.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + 
rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9c: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("9d: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_9_sub(env, ctxt); + if (rc != 0) { + CERROR("9d: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* test catalog wrap around */ +static int llog_test_10(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + snprintf(name, sizeof(name), "%x", llog_test_rand + 2); + CWARN("10a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("10a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("10a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + cat_logid = cath->lgh_id; + dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* force catalog wrap for 5th plain LLOG */ + cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; + cfs_fail_val = 4; + + CWARN("10b: write %d log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10b: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10b", cath, 3); + if (rc) + GOTO(out, rc); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10b: sync failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("10c: write %d more log records\n", 2 * LLOG_TEST_RECNUM); + for (i = 0; i < 2 * LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc) { + CERROR("10c: write %d records failed at #%d: %d\n", + 2*LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + } + + /* make sure 2 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10c: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog */ + enospc = 0; + eok = 0; + CWARN("10c: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; 
i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10c: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + /* after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + CERROR("10c: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10c: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure no new record in Catalog */ + rc = verify_handle("10c", cath, 5); + if (rc) + GOTO(out, rc); + + /* Catalog should have reached its max size for test */ + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10c: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + cat_max_size = la.la_size; + + /* cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e */ + CWARN("10d: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10d: print the catalog entries.. we expect 3\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10d: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10d: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10d", cath, 4); + if (rc) + GOTO(out, rc); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10d: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10e: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10e: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + /* after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + CERROR("10e: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10e: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + CWARN("10e: print the catalog entries.. 
we expect 4\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10d: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 4) { + CERROR("10d: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10e", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify catalog has wrap around */ + if (cath->lgh_last_idx > cath->lgh_hdr->llh_cat_idx) { + CERROR("10e: catalog failed to wrap around\n"); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10e: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10e: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10e: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records */ + CWARN("10f: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10f: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10f: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10f: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 4); + if (rc) + GOTO(out, rc); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + enospc = 0; + eok = 0; + CWARN("10f: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10f: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + /* after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + CERROR("10f: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10f: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + /* make sure 1 new plain llog appears in catalog (+1 with hdr) */ + rc = verify_handle("10f", cath, 5); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = llh_cat_idx = 2 now */ + if (cath->lgh_last_idx != cath->lgh_hdr->llh_cat_idx || + cath->lgh_last_idx != 2) { + CERROR("10f: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 2\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + rc = dt_attr_get(env, cath->lgh_obj, &la); + if (rc) { + CERROR("10f: failed to get catalog attrs: %d\n", rc); + GOTO(out, rc); + } + + if (la.la_size != cat_max_size) { + CERROR("10f: catalog size has changed after it has wrap around," + " current size = %llu, expected size = %llu\n", + la.la_size, cat_max_size); + GOTO(out, rc = -EINVAL); + } + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10f: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* will llh_cat_idx also successfully wrap ? */ + + /* cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog */ + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 3\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 3) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 4); + if (rc) + GOTO(out, rc); + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 3); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 0 now */ + if (cath->lgh_hdr->llh_cat_idx != 0 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 0\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + /* sync device to commit all recent LLOG changes to disk and avoid + * to consume a huge space with delayed journal commit callbacks + * particularly on low memory nodes or VMs */ + rc = dt_sync(env, dt); + if (rc) { + CERROR("10g: sync failed: %d\n", rc); + GOTO(out, rc); + } + + /* cancel more records to free one more slot in Catalog */ + CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ + if (rc == 0) + rc = -ERANGE; + GOTO(out, rc); + } + + CWARN("10g: print the catalog entries.. 
we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + if (rc) { + CERROR("10g: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("10g: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + /* verify one down in catalog (+1 with hdr) */ + rc = verify_handle("10g", cath, 2); + if (rc) + GOTO(out, rc); + + /* verify lgh_last_idx = 2 and llh_cat_idx = 1 now */ + if (cath->lgh_hdr->llh_cat_idx != 1 || + cath->lgh_last_idx != 2) { + CERROR("10g: lgh_last_idx = %d vs 2, llh_cat_idx = %d vs 1\n", + cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); + GOTO(out, rc = -EINVAL); + } + + CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); + +out: + cfs_fail_loc = 0; + cfs_fail_val = 0; + + CWARN("10: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("10: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_8(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_9(env, obd); + if (rc != 0) + GOTO(cleanup, rc); + + rc = llog_test_10(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = 
lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SERVER_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = cfs_rand(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + return class_register_type(&llog_obd_ops, NULL, true, NULL, + "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Log test module"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c new file mode 100644 index 0000000000000..89b227b0cfa09 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -0,0 +1,973 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. 
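+ * Provides the refcounted ls_device wrapper on top of an OSD, local OID
+ * storage for FID generation, and helpers to create, look up and unlink
+ * named local files and index objects.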
+ * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PTR(obj); +} + +static struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +static struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct lu_object_header *h; + struct ls_object *o; + struct lu_object *l; + + LASSERT(_h == NULL); + + OBD_ALLOC_PTR(o); + if (o != NULL) { + l = &o->ls_obj.do_lu; + h = &o->ls_header; + + lu_object_header_init(h); + dt_object_init(&o->ls_obj, h, d); + lu_object_add_top(h, l); + + l->lo_ops = &ls_lu_obj_ops; + + return l; + } else { + return NULL; + } +} + +static struct lu_device_operations ls_lu_dev_ops = { + .ldo_object_alloc = ls_object_alloc +}; + +static struct ls_device *__ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls, *ret = NULL; + + list_for_each_entry(ls, &ls_list_head, ls_linkage) { + if (ls->ls_osd == dev) { + atomic_inc(&ls->ls_refcount); + ret = ls; + break; + } + } + return ret; +} + +struct ls_device *ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + mutex_unlock(&ls_list_mutex); + + return ls; +} + +static struct lu_device_type_operations ls_device_type_ops = { + .ldto_start = NULL, + .ldto_stop = NULL, +}; + +static struct lu_device_type ls_lu_type = { + .ldt_name = "local_storage", + .ldt_ops = &ls_device_type_ops, +}; + +struct ls_device *ls_device_get(struct dt_device *dev) +{ + struct ls_device *ls; + + ENTRY; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + if (ls) + GOTO(out_ls, ls); + + /* not found, then create */ + OBD_ALLOC_PTR(ls); + if (ls == NULL) + GOTO(out_ls, ls = ERR_PTR(-ENOMEM)); + + atomic_set(&ls->ls_refcount, 1); + INIT_LIST_HEAD(&ls->ls_los_list); + mutex_init(&ls->ls_los_mutex); + + ls->ls_osd = dev; + + LASSERT(dev->dd_lu_dev.ld_site); + lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type); + ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops; + ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site; + + /* finally add ls to the list */ + list_add(&ls->ls_linkage, &ls_list_head); +out_ls: + mutex_unlock(&ls_list_mutex); + RETURN(ls); +} + +void ls_device_put(const struct lu_env *env, struct ls_device *ls) +{ + LASSERT(env); + if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + 
OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = ++los->los_last_oid; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(struct los_ondisk); + rc = dt_declare_record_write(env, los->los_obj, + &dti->dti_lb, 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + u64 lastid; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number on disk so that + * we know the last one used after reboot */ + lastid = cpu_to_le64(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +static struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, + struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct lu_object_conf *conf = &dti->dti_conf; + struct dt_insert_rec *rec = &dti->dti_dt_rec; + struct dt_object *dto; + struct thandle *th; + int rc; + + /* We know that the target object does not exist, to be created, + * then give some hints - LOC_F_NEW to help low layer to handle + * that efficiently and properly. 
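+	 * The whole creation runs in one local transaction: the child is
+	 * write-locked with LOS_CHILD and the parent with LOS_PARENT while
+	 * the name (and, for directories, the "." and ".." entries) is
+	 * inserted; on any failure the half-created object is destroyed
+	 * within the same transaction.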
*/ + memset(conf, 0, sizeof(*conf)); + conf->loc_flags = LOC_F_NEW; + dto = ls_locate(env, ls, fid, conf); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + rc = dt_declare_ref_add(env, dto, th); + if (rc < 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, parent, th); + if (rc < 0) + GOTO(trans_stop, rc); + } + + rec->rec_fid = fid; + rec->rec_type = attr->la_mode & S_IFMT; + rc = dt_declare_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(trans_stop, rc = -ENOTDIR); + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_declare_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th); + if (rc != 0) + GOTO(trans_stop, rc); + + rc = dt_declare_ref_add(env, dto, th); + if (rc != 0) + GOTO(trans_stop, rc); + } + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, LOS_CHILD); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + + rec->rec_type = S_IFDIR; + rec->rec_fid = fid; + /* Add "." and ".." for newly created dir */ + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)".", th, 1); + if (rc != 0) + GOTO(destroy, rc); + + dt_ref_add(env, dto, th); + rec->rec_fid = lu_object_fid(&parent->do_lu); + rc = dt_insert(env, dto, (const struct dt_rec *)rec, + (const struct dt_key *)"..", th, 1); + if (rc != 0) + GOTO(destroy, rc); + } + + rec->rec_fid = fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + dt_write_lock(env, parent, LOS_PARENT); + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th, 1); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + dt_object_put_nocache(env, dto); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +struct dt_object *local_file_find(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (!rc) + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + else + dto = ERR_PTR(rc); + + return dto; +} +EXPORT_SYMBOL(local_file_find); + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. 
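+ * Returns the existing object if the name is already present, a newly
+ * created object (with a FID generated from the local OID storage)
+ * otherwise, or an ERR_PTR() on failure.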
+ */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + dto = local_file_find(env, los, parent, name); + if (!IS_ERR(dto) || PTR_ERR(dto) != -ENOENT) + return dto; + + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), parent, name, + &dti->dti_attr, &dti->dti_dof); + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it does not exist) a local named index file in parent + * directory. 
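+ * If the index does not yet exist it is created with the DFT_INDEX object
+ * format and the index features passed in 'ft'; lookup or creation errors
+ * are returned as an ERR_PTR().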
+ */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), + &dti->dti_fid, NULL); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + dt_object_put_nocache(env, dto); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(stop, rc); + + dt_write_lock(env, dto, 0); + 
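+	/* Remove the name from the parent first, then drop the child's
+	 * reference; if dt_ref_del() fails the name is re-inserted so the
+	 * namespace stays consistent, otherwise the object is destroyed. */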
rc = dt_delete(env, parent, (struct dt_key *)name, th); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 0) { + struct dt_insert_rec *rec = &dti->dti_dt_rec; + + rec->rec_fid = &dti->dti_fid; + rec->rec_type = dto->do_lu.lo_header->loh_attr; + rc = dt_insert(env, parent, (const struct dt_rec *)rec, + (const struct dt_key *)name, th, 1); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + dt_object_put_nocache(env, dto); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); + return; +} + +/* after Lustre 2.3 release there may be old file to store last generated FID + * If such file exists then we have to read its content + */ +static int lastid_compat_check(const struct lu_env *env, struct dt_device *dev, + __u64 lastid_seq, __u32 *first_oid, + struct ls_device *ls) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *root = NULL; + struct los_ondisk losd; + struct dt_object *o = NULL; + int rc = 0; + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + return rc; + + root = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* find old last_id file */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%#llx-lastid", + lastid_seq); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + dt_object_put_nocache(env, root); + if (rc == -ENOENT) { + /* old llog lastid accessed by FID only */ + if (lastid_seq != FID_SEQ_LLOG) + return 0; + dti->dti_fid.f_seq = FID_SEQ_LLOG; + dti->dti_fid.f_oid = 1; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + + if (!dt_object_exists(o)) { + dt_object_put_nocache(env, o); + return 0; + } + CDEBUG(D_INFO, "Found old llog lastid file\n"); + } else if (rc < 0) { + return rc; + } else { + CDEBUG(D_INFO, "Found old lastid file for sequence %#llx\n", + lastid_seq); + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + return PTR_ERR(o); + } + /* let's read seq-NNNNNN-lastid file value */ + LASSERT(dt_object_exists(o)); + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("%s: wrong content of seq-%#llx-lastid file, magic %x\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, + le32_to_cpu(losd.lso_magic)); + rc = -EINVAL; + } else if (rc < 0) { + CERROR("%s: failed to read seq-%#llx-lastid: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc); + } + dt_object_put_nocache(env, o); + if (rc == 0) + *first_oid = le32_to_cpu(losd.lso_next_oid); + return rc; +} + +/** + * Initialize local OID storage for required sequence. + * That may be needed for services that uses local files and requires + * dynamic OID allocation for them. 
+ * + * Per each sequence we have an object with 'first_fid' identificator + * containing the counter for OIDs of locally created files with that + * sequence. + * + * It is used now by llog subsystem and MGS for NID tables + * + * Function gets first_fid to create counter object. + * All dynamic fids will be generated with the same sequence and incremented + * OIDs + * + * Returned local_oid_storage is in-memory representaion of OID storage + */ +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los) +{ + struct dt_thread_info *dti = dt_info(env); + struct ls_device *ls; + u64 lastid; + struct dt_object *o = NULL; + struct thandle *th; + __u32 first_oid = fid_oid(first_fid); + int rc = 0; + + ENTRY; + + ls = ls_device_get(dev); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + *los = dt_los_find(ls, fid_seq(first_fid)); + if (*los != NULL) + GOTO(out, rc = 0); + + /* not found, then create */ + OBD_ALLOC_PTR(*los); + if (*los == NULL) + GOTO(out, rc = -ENOMEM); + + atomic_set(&(*los)->los_refcount, 1); + mutex_init(&(*los)->los_id_lock); + (*los)->los_dev = &ls->ls_top_dev; + atomic_inc(&ls->ls_refcount); + list_add(&(*los)->los_list, &ls->ls_los_list); + + /* Use {seq, 0, 0} to create the LAST_ID file for every + * sequence. OIDs start at LUSTRE_FID_INIT_OID. + */ + dti->dti_fid.f_seq = fid_seq(first_fid); + dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid, NULL); + if (IS_ERR(o)) + GOTO(out_los, rc = PTR_ERR(o)); + + if (!dt_object_exists(o)) { + rc = lastid_compat_check(env, dev, fid_seq(first_fid), + &first_oid, ls); + if (rc < 0) + GOTO(out_los, rc); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out_los, rc = PTR_ERR(th)); + + dti->dti_attr.la_valid = LA_MODE | LA_TYPE; + dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &dti->dti_attr, NULL, + &dti->dti_dof, th); + if (rc) + GOTO(out_trans, rc); + + lastid = cpu_to_le64(first_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_declare_record_write(env, o, &dti->dti_lb, dti->dti_off, + th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) + GOTO(out_lock, rc = 0); + + rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof, + th); + if (rc) + GOTO(out_lock, rc); + + rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th); + if (rc) + GOTO(out_lock, rc); +out_lock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, dev, th); + } else { + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) { + CERROR("%s: bad oid %llu is read from LAST_ID\n", + o->do_lu.lo_dev->ld_obd->obd_name, + le64_to_cpu(lastid)); + rc = -EINVAL; + } + } +out_los: + if (rc != 0) { + list_del(&(*los)->los_list); + atomic_dec(&ls->ls_refcount); + OBD_FREE_PTR(*los); + *los = NULL; + if (o != NULL && !IS_ERR(o)) + dt_object_put_nocache(env, o); + } else { + (*los)->los_seq = fid_seq(first_fid); + (*los)->los_last_oid = le64_to_cpu(lastid); + (*los)->los_obj = o; + /* Read value should not be less than 
initial one + * but possible after upgrade from older fs. + * In this case just switch to the first_oid in memory and + * it will be updated on disk with first object generated */ + if ((*los)->los_last_oid < first_oid) + (*los)->los_last_oid = first_oid; + } +out: + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); + return rc; +} +EXPORT_SYMBOL(local_oid_storage_init); + +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los) +{ + struct ls_device *ls; + + LASSERT(env); + LASSERT(los->los_dev); + ls = dt2ls_dev(los->los_dev); + + /* Take the mutex before decreasing the reference to avoid race + * conditions as described in LU-4721. */ + mutex_lock(&ls->ls_los_mutex); + if (!atomic_dec_and_test(&los->los_refcount)) { + mutex_unlock(&ls->ls_los_mutex); + return; + } + + if (los->los_obj) + dt_object_put_nocache(env, los->los_obj); + list_del(&los->los_list); + OBD_FREE_PTR(los); + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); +} +EXPORT_SYMBOL(local_oid_storage_fini); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h new file mode 100644 index 0000000000000..caf26bfec6e28 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.h @@ -0,0 +1,102 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. 
+ * + * Author: Mikhail Pershin + */ +#ifndef __LOCAL_STORAGE_H +#define __LOCAL_STORAGE_H + +#include +#include +#include +#include + +struct ls_device { + struct dt_device ls_top_dev; + /* all initialized ls_devices on this node linked by this */ + struct list_head ls_linkage; + /* how many handle's reference this local storage */ + atomic_t ls_refcount; + /* underlaying OSD device */ + struct dt_device *ls_osd; + /* list of all local OID storages */ + struct list_head ls_los_list; + struct mutex ls_los_mutex; +}; + +static inline struct ls_device *dt2ls_dev(struct dt_device *d) +{ + return container_of0(d, struct ls_device, ls_top_dev); +} + +struct ls_object { + struct lu_object_header ls_header; + struct dt_object ls_obj; +}; + +static inline struct ls_object *lu2ls_obj(struct lu_object *o) +{ + return container_of0(o, struct ls_object, ls_obj.do_lu); +} + +static inline struct dt_object *ls_locate(const struct lu_env *env, + struct ls_device *ls, + const struct lu_fid *fid, + const struct lu_object_conf *conf) +{ + return dt_locate_at(env, ls->ls_osd, fid, + &ls->ls_top_dev.dd_lu_dev, conf); +} + +struct ls_device *ls_device_get(struct dt_device *dev); +void ls_device_put(const struct lu_env *env, struct ls_device *ls); +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq); +void dt_los_put(struct local_oid_storage *los); + +/* Lustre 2.3 on-disk structure describing local object OIDs storage + * the structure to be used with any sequence managed by + * local object library. + * Obsoleted since 2.4 but is kept for compatibility reasons, + * see lastid_compat_check() in obdclass/local_storage.c */ +struct los_ondisk { + __u32 lso_magic; + __u32 lso_next_oid; +}; + +#define LOS_MAGIC 0xdecafbee + +/** + * Used in __local_file_create() for object lock role + **/ +enum los_object_role { + LOS_PARENT, + LOS_CHILD, +}; + +#endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c new file mode 100644 index 0000000000000..bbdcfd47ebad9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_counters.c @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lprocfs_counters.c + * + * Lustre lprocfs counter routines + * + * Author: Andreas Dilger + */ +#include +#include +#include + +#ifdef CONFIG_PROC_FS +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + LASSERTF(0 <= idx && idx < stats->ls_num, + "idx %d, ls_num %hu\n", idx, stats->ls_num); + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 0000000000000..a341794dc4226 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,670 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include + +#ifdef CONFIG_PROC_FS + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. + * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. + */ + +struct job_stat { + struct hlist_node js_hash; /* hash struct for this jobid */ + struct list_head js_list; /* on ojs_list, with ojs_lock */ + atomic_t js_refcount; /* num users of this struct */ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name */ + time_t js_timestamp; /* seconds of most recent stat*/ + struct lprocfs_stats *js_stats; /* per-job statistics */ + struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ +}; + +static unsigned +job_stat_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats != NULL); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void 
job_stat_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + CERROR("should not have any items\n"); +} + +static struct cfs_hash_ops job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +/** + * Jobstats expiry iterator to clean up old jobids + * + * Called for each job_stat structure on this device, it should delete stats + * older than the specified \a oldest_time in seconds. If \a oldest_time is + * in the future then this will delete all statistics (e.g. during shutdown). + * + * \param[in] hs hash of all jobids on this device + * \param[in] bd hash bucket containing this jobid + * \param[in] hnode hash structure for this jobid + * \param[in] data pointer to stats expiry time in seconds + */ +static int job_cleanup_iter_callback(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + time_t oldest_time = *((time_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (job->js_timestamp < oldest_time) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +/** + * Clean up jobstats that were updated more than \a before seconds ago. + * + * Since this function may be called frequently, do not scan all of the + * jobstats on each call, only twice per cleanup interval. That means stats + * may be around on average cleanup_interval / 4 longer than necessary, + * but that is not considered harmful. + * + * If \a before is negative then this will force clean up all jobstats due + * to the expiry time being in the future (e.g. at shutdown). + * + * If there is already another thread doing jobstats cleanup, don't try to + * do this again in the current thread unless this is a force cleanup. + * + * \param[in] stats stucture tracking all job stats for this device + * \param[in] before expire jobstats updated more than this many seconds ago + */ +static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) +{ + time_t now = cfs_time_current_sec(); + time_t oldest; + + if (likely(before >= 0)) { + unsigned int cleanup_interval = stats->ojs_cleanup_interval; + + if (cleanup_interval == 0 || before == 0) + return; + + if (now < stats->ojs_last_cleanup + cleanup_interval / 2) + return; + + if (stats->ojs_cleaning) + return; + } + + write_lock(&stats->ojs_lock); + if (before >= 0 && stats->ojs_cleaning) { + write_unlock(&stats->ojs_lock); + return; + } + + stats->ojs_cleaning = true; + write_unlock(&stats->ojs_lock); + + /* Can't hold ojs_lock over hash iteration, since it is grabbed by + * job_cleanup_iter_callback() + * ->cfs_hash_bd_del_locked() + * ->job_putref() + * ->job_free() + * + * Holding ojs_lock isn't necessary for safety of the hash iteration, + * since locking of the hash is handled internally, but there isn't + * any benefit to having multiple threads doing cleanup at one time. 
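+	 * Concurrent cleanups are instead throttled by the ojs_cleaning
+	 * flag, which is set under ojs_lock before the scan and cleared
+	 * when it finishes.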
+ */ + oldest = now - before; + cfs_hash_for_each_safe(stats->ojs_hash, job_cleanup_iter_callback, + &oldest); + + write_lock(&stats->ojs_lock); + stats->ojs_cleaning = false; + stats->ojs_last_cleanup = cfs_time_current_sec(); + write_unlock(&stats->ojs_lock); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats); + + memcpy(job->js_jobid, jobid, LUSTRE_JOBID_SIZE); + job->js_timestamp = cfs_time_current_sec(); + job->js_jobstats = jobs; + INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats != NULL); + LASSERT(stats->ojs_hash != NULL); + + if (event >= stats->ojs_cntr_num) + RETURN(-EINVAL); + + if (jobid == NULL || strlen(jobid) == 0) + RETURN(-EINVAL); + + if (strlen(jobid) >= LUSTRE_JOBID_SIZE) { + CERROR("Invalid jobid size (%lu), expect(%d)\n", + (unsigned long)strlen(jobid) + 1, LUSTRE_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. 
LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + job->js_timestamp = cfs_time_current_sec(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + + if (stats->ojs_hash == NULL) + return; + + lprocfs_job_cleanup(stats, -99); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: dd.4854 + * snapshot_time: 1322494486 + * open: { samples: 1, unit: reqs } + * close: { samples: 1, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 0, unit: reqs } + * getattr: { samples: 1, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id dd.4854 + * snapshot_time: 1322494602 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter_header *cntr_header; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + /* Replace the non-printable character in jobid with '?', so + * that the output of jobid will be confined in single line. 
*/ + seq_printf(p, "- %-16s ", "job_id:"); + for (i = 0; i < strlen(job->js_jobid); i++) { + if (isprint(job->js_jobid[i]) != 0) + seq_putc(p, job->js_jobid[i]); + else + seq_putc(p, '?'); + } + seq_putc(p, '\n'); + + seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11llu", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if (cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min:%8llu, max:%8llu," + " sum:%16llu", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18llu", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + return 0; +} + +static const struct seq_operations lprocfs_jobstats_seq_sops = { + .start = lprocfs_jobstats_seq_start, + .stop = lprocfs_jobstats_seq_stop, + .next = lprocfs_jobstats_seq_next, + .show = lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[LUSTRE_JOBID_SIZE]; + struct job_stat *job; + + if (len == 0 || len >= LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (stats->ojs_hash == NULL) + return -ENODEV; + + if (copy_from_user(jobid, buf, len)) + return -EFAULT; + jobid[len] = 0; + + /* Trim '\n' if any */ + if (jobid[len - 1] == '\n') + jobid[len - 1] = 0; + + if (strcmp(jobid, "clear") == 0) { + lprocfs_job_cleanup(stats, -99); + + return len; + } + + if (strlen(jobid) == 0) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +/** + * Clean up the seq file state when the /proc file is closed. + * + * This also expires old job stats from the cache after they have been + * printed in case the system is idle and not generating new jobstats. 
+ * + * \param[in] inode struct inode for seq file being closed + * \param[in] file struct file for seq file being closed + * + * \retval 0 on success + * \retval negative errno on failure + */ +static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + + lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); + + return lprocfs_seq_release(inode, file); +} + +static const struct file_operations lprocfs_jobstats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_jobstats_seq_open, + .read = seq_read, + .write = lprocfs_jobstats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_jobstats_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (cntr_num <= 0) + RETURN(-EINVAL); + + if (init_fn == NULL) + RETURN(-EINVAL); + + /* Currently needs to be a target due to the use of obt_jobstats. */ + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) { + CERROR("%s: invalid device type %s for job stats: rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, -EINVAL); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + stats->ojs_cleanup_interval = 600; /* 10 mins by default */ + stats->ojs_last_cleanup = cfs_time_current_sec(); + + entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, + &lprocfs_jobstats_seq_fops); + if (IS_ERR(entry)) { + lprocfs_job_stats_fini(obd); + RETURN(-ENOMEM); + } + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_init); + +int lprocfs_job_interval_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct obd_job_stats *stats; + + if (obd == NULL) + return -ENODEV; + + stats = &obd->u.obt.obt_jobstats; + seq_printf(m, "%d\n", stats->ojs_cleanup_interval); + return 0; +} +EXPORT_SYMBOL(lprocfs_job_interval_seq_show); + +ssize_t +lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd; + struct obd_job_stats *stats; + int rc; + __s64 val; + + obd = ((struct seq_file *)file->private_data)->private; + if (obd == NULL) + return -ENODEV; + + stats = &obd->u.obt.obt_jobstats; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > UINT_MAX) + return -ERANGE; + + stats->ojs_cleanup_interval = val; + lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); + return count; +} +EXPORT_SYMBOL(lprocfs_job_interval_seq_write); +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c new file mode 100644 index 0000000000000..0c0badb13a95d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2479 @@ +/* + * GPL HEADER START + * + * 
DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +#ifdef CONFIG_PROC_FS + +static int lprocfs_no_percpu_stats = 0; +module_param(lprocfs_no_percpu_stats, int, 0644); +MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +struct proc_dir_entry * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + void *data, const struct file_operations *fops) +{ + struct proc_dir_entry *proc; + mode_t mode = 0; + + if (root == NULL || name == NULL || fops == NULL) + return ERR_PTR(-EINVAL); + + if (fops->read) + mode = 0444; + if (fops->write) + mode |= 0200; + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s\n", + name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, const char *format, ...) +{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (parent == NULL || format == NULL) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (entry == NULL) + CERROR("LprocFS: Could not create symbolic link from " + "%s to %s\n", name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static const struct file_operations lprocfs_generic_fops = { }; + +int ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *list, + void *data) +{ + if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) + return -EINVAL; + + while (list->name) { + struct dentry *entry; + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + entry = debugfs_create_file(list->name, mode, parent, + list->data ? 
: data, + list->fops ? : &lprocfs_generic_fops); + if (IS_ERR_OR_NULL(entry)) + return entry ? PTR_ERR(entry) : -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL_GPL(ldebugfs_add_vars); + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. + * + * \retval 0 on success + * < 0 on error + */ +int +lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (root == NULL || list == NULL) + return -EINVAL; + + while (list->name != NULL) { + struct proc_dir_entry *proc; + mode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_generic_fops, + list->data ?: data); + if (proc == NULL) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void ldebugfs_remove(struct dentry **entryp) +{ + debugfs_remove(*entryp); + *entryp = NULL; +} +EXPORT_SYMBOL_GPL(ldebugfs_remove); + +#ifndef HAVE_REMOVE_PROC_SUBTREE +/* for b=10866, global variable */ +DECLARE_RWSEM(_lprocfs_lock); +EXPORT_SYMBOL(_lprocfs_lock); + +static void lprocfs_remove_nolock(struct proc_dir_entry **proot) +{ + struct proc_dir_entry *root = *proot; + struct proc_dir_entry *temp = root; + struct proc_dir_entry *rm_entry; + struct proc_dir_entry *parent; + + *proot = NULL; + if (root == NULL || IS_ERR(root)) + return; + + parent = root->parent; + LASSERT(parent != NULL); + + while (1) { + while (temp->subdir != NULL) + temp = temp->subdir; + + rm_entry = temp; + temp = temp->parent; + + /* Memory corruption once caused this to fail, and + without this LASSERT we would loop here forever. 
*/ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, temp->name, + rm_entry->name, (int)strlen(rm_entry->name)); + + remove_proc_entry(rm_entry->name, temp); + if (temp == parent) + break; + } +} + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry *t = NULL; + struct proc_dir_entry **p; + int len, busy = 0; + + LASSERT(parent != NULL); + len = strlen(name); + + down_write(&_lprocfs_lock); + /* lookup target name */ + for (p = &parent->subdir; *p; p = &(*p)->next) { + if ((*p)->namelen != len) + continue; + if (memcmp(name, (*p)->name, len)) + continue; + t = *p; + break; + } + + if (t) { + /* verify it's empty: do not count "num_refs" */ + for (p = &t->subdir; *p; p = &(*p)->next) { + if ((*p)->namelen != strlen("num_refs")) { + busy = 1; + break; + } + if (memcmp("num_refs", (*p)->name, + strlen("num_refs"))) { + busy = 1; + break; + } + } + } + + if (busy == 0) + lprocfs_remove_nolock(&t); + + up_write(&_lprocfs_lock); + return 0; +} +#endif /* !HAVE_REMOVE_PROC_SUBTREE */ + +#ifndef HAVE_PROC_REMOVE +void proc_remove(struct proc_dir_entry *de) +{ +#ifndef HAVE_REMOVE_PROC_SUBTREE + down_write(&_lprocfs_lock); /* search vs remove race */ + lprocfs_remove_nolock(&de); + up_write(&_lprocfs_lock); +#else + if (de) + remove_proc_subtree(de->name, de->parent); +#endif +} +#endif + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct dentry *ldebugfs_register(const char *name, struct dentry *parent, + struct lprocfs_vars *list, void *data) +{ + struct dentry *entry; + + entry = debugfs_create_dir(name, parent); + if (IS_ERR_OR_NULL(entry)) { + entry = entry ?: ERR_PTR(-ENOMEM); + goto out; + } + + if (!IS_ERR_OR_NULL(list)) { + int rc; + + rc = ldebugfs_add_vars(entry, list, data); + if (rc) { + debugfs_remove(entry); + entry = ERR_PTR(rc); + } + } +out: + return entry; +} +EXPORT_SYMBOL_GPL(ldebugfs_register); + +struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *newchild; + + newchild = proc_mkdir(name, parent); + if (newchild == NULL) + return ERR_PTR(-ENOMEM); + + if (list != NULL) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_uint_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", *(unsigned int *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_uint_seq_show); + +int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1]; + char *end; + unsigned long tmp; + + if (count >= sizeof(dummy)) + return -EINVAL; + + if (count == 0) + return 0; + + if (copy_from_user(dummy, buffer, count)) + return -EFAULT; + + dummy[count] = 0; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +ssize_t lprocfs_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + int *data = ((struct seq_file 
*)file->private_data)->private; + int rc; + __s64 val = 0; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + return lprocfs_wr_uint(file, buffer, count, data); +} +EXPORT_SYMBOL(lprocfs_uint_seq_write); + +int lprocfs_u64_seq_show(struct seq_file *m, void *data) +{ + LASSERT(data != NULL); + seq_printf(m, "%llu\n", *(__u64 *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_u64_seq_show); + +int lprocfs_atomic_seq_show(struct seq_file *m, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + seq_printf(m, "%d\n", atomic_read(atom)); + return 0; +} +EXPORT_SYMBOL(lprocfs_atomic_seq_show); + +ssize_t +lprocfs_atomic_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + atomic_t *atm = ((struct seq_file *)file->private_data)->private; + __s64 val = 0; + int rc; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0 || val > INT_MAX) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_atomic_seq_write); + +int lprocfs_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%s\n", obd->obd_uuid.uuid); + return 0; +} +EXPORT_SYMBOL(lprocfs_uuid_seq_show); + +int lprocfs_name_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + + LASSERT(dev != NULL); + seq_printf(m, "%s\n", dev->obd_name); + return 0; +} +EXPORT_SYMBOL(lprocfs_name_seq_show); + +int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_blksize_seq_show); + +int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_kbytestotal_seq_show); + +int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_kbytesfree_seq_show); + +int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + seq_printf(m, "%llu\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_kbytesavail_seq_show); + +int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + 
cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; +} +EXPORT_SYMBOL(lprocfs_filestotal_seq_show); + +int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_filesfree_seq_show); + +int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + int rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? "\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_server_uuid_seq_show); + +int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + int rc = 0; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); + else + seq_printf(m, "%s\n", ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_conn_uuid_seq_show); + +/** add up per-cpu counters */ + +/** + * Lock statistics structure for access, possibly only on this CPU. + * + * The statistics struct may be allocated with per-CPU structures for + * efficient concurrent update (usually only on server-wide stats), or + * as a single global struct (e.g. for per-client or per-job statistics), + * so the required locking depends on the type of structure allocated. + * + * For per-CPU statistics, pin the thread to the current cpuid so that + * will only access the statistics for that CPU. If the stats structure + * for the current CPU has not been allocated (or previously freed), + * allocate it now. The per-CPU statistics do not need locking since + * the thread is pinned to the CPU during update. + * + * For global statistics, lock the stats structure to prevent concurrent update. + * + * \param[in] stats statistics structure to lock + * \param[in] opc type of operation: + * LPROCFS_GET_SMP_ID: "lock" and return current CPU index + * for incrementing statistics for that CPU + * LPROCFS_GET_NUM_CPU: "lock" and return number of used + * CPU indices to iterate over all indices + * \param[out] flags CPU interrupt saved state for IRQ-safe locking + * + * \retval cpuid of current thread or number of allocated structs + * \retval negative on error (only for opc LPROCFS_GET_SMP_ID + per-CPU stats) + */ +int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return opc == LPROCFS_GET_NUM_CPU ? 
1 : 0; + } + + switch (opc) { + case LPROCFS_GET_SMP_ID: { + unsigned int cpuid = get_cpu(); + + if (unlikely(!stats->ls_percpu[cpuid])) { + int rc = lprocfs_stats_alloc_one(stats, cpuid); + + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + case LPROCFS_GET_NUM_CPU: + return stats->ls_biggest_alloc_num; + default: + LBUG(); + } +} + +/** + * Unlock statistics structure after access. + * + * Unlock the lock acquired via lprocfs_stats_lock() for global statistics, + * or unpin this thread from the current cpuid for per-CPU statistics. + * + * This function must be called using the same arguments as used when calling + * lprocfs_stats_lock() so that the correct operation can be performed. + * + * \param[in] stats statistics structure to unlock + * \param[in] opc type of operation (current cpuid or number of structs) + * \param[in] flags CPU interrupt saved state for IRQ-safe locking + */ +void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, + unsigned long *flags) +{ + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, *flags); + else + spin_unlock(&stats->ls_lock); + } else if (opc == LPROCFS_GET_SMP_ID) { + put_cpu(); + } +} + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (stats == NULL) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag) \ + do { \ + if (imp->imp_##flag) { \ + seq_printf(m, "%s" #flag, first ? 
"" : ", "); \ + first = false; \ + } \ + } while (0) +static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(invalid); + flag2str(deactive); + flag2str(replayable); + flag2str(delayed_recovery); + flag2str(no_lock_replay); + flag2str(vbr_failed); + flag2str(pingable); + flag2str(resend_replay); + flag2str(no_pinger_recover); + flag2str(need_mne_swab); + flag2str(connect_tried); +} +#undef flag2str + +static const char *obd_connect_names[] = { + /* flags names */ + "read_only", + "lov_index", + "connect_from_mds", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "barrier", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "flock_deadlock", + "disp_stripe", + "open_by_fid", + "lfsck", + "unknown", + "unlink_close", + "multi_mod_rpcs", + "dir_stripe", + "subtree", + "lock_ahead", + "bulk_mbits", + "compact_obdo", + "second_flags", + /* flags2 names */ + "file_secctx", + NULL +}; + +static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, + __u64 flags2, const char *sep) +{ + bool first = true; + __u64 mask; + int i; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags & ~(mask - 1)) { + seq_printf(m, "%sunknown_%#llx", + first ? "" : sep, flags & ~(mask - 1)); + first = false; + } + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) { + seq_printf(m, "%s%s", + first ? "" : sep, obd_connect_names[i]); + first = false; + } + } + + if (flags2 & ~(mask - 1)) { + seq_printf(m, "%sunknown2_%#llx", + first ? "" : sep, flags2 & ~(mask - 1)); + first = false; + } +} + +int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, + const char *sep) +{ + __u64 mask; + int i, ret = 0; + + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown_%#llx", + ret ? sep : "", flags & ~(mask - 1)); + + if (!(flags & OBD_CONNECT_FLAGS2) || flags2 == 0) + return ret; + + for (i = 64, mask = 1; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags2 & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + + if (flags2 & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown2_%#llx", + ret ? 
sep : "", flags2 & ~(mask - 1)); + + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +static void obd_connect_data_seqprint(struct seq_file *m, + struct obd_connect_data *ocd) +{ + __u64 flags; + + LASSERT(ocd != NULL); + flags = ocd->ocd_connect_flags; + + seq_printf(m, " connect_data:\n" + " flags: %#llx\n" + " instance: %u\n", + ocd->ocd_connect_flags, + ocd->ocd_instance); + if (flags & OBD_CONNECT_VERSION) + seq_printf(m, " target_version: %u.%u.%u.%u\n", + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version)); + if (flags & OBD_CONNECT_MDS) + seq_printf(m, " mdt_index: %d\n", ocd->ocd_group); + if (flags & OBD_CONNECT_GRANT) + seq_printf(m, " initial_grant: %d\n", ocd->ocd_grant); + if (flags & OBD_CONNECT_INDEX) + seq_printf(m, " target_index: %u\n", ocd->ocd_index); + if (flags & OBD_CONNECT_BRW_SIZE) + seq_printf(m, " max_brw_size: %d\n", ocd->ocd_brw_size); + if (flags & OBD_CONNECT_IBITS) + seq_printf(m, " ibits_known: %#llx\n", + ocd->ocd_ibits_known); + if (flags & OBD_CONNECT_GRANT_PARAM) + seq_printf(m, " grant_block_size: %d\n" + " grant_inode_size: %d\n" + " grant_max_extent_size: %d\n" + " grant_extent_tax: %d\n", + 1 << ocd->ocd_grant_blkbits, + 1 << ocd->ocd_grant_inobits, + ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits, + ocd->ocd_grant_tax_kb << 10); + if (flags & OBD_CONNECT_TRANSNO) + seq_printf(m, " first_transno: %#llx\n", + ocd->ocd_transno); + if (flags & OBD_CONNECT_CKSUM) + seq_printf(m, " cksum_types: %#x\n", + ocd->ocd_cksum_types); + if (flags & OBD_CONNECT_MAX_EASIZE) + seq_printf(m, " max_easize: %d\n", ocd->ocd_max_easize); + if (flags & OBD_CONNECT_MAXBYTES) + seq_printf(m, " max_object_bytes: %llu\n", + ocd->ocd_maxbytes); + if (flags & OBD_CONNECT_MULTIMODRPCS) + seq_printf(m, " max_mod_rpcs: %hu\n", + ocd->ocd_maxmodrpcs); +} + +int lprocfs_import_seq_show(struct seq_file *m, void *data) +{ + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + ocd = &imp->imp_connect_data; + + seq_printf(m, "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " connect_flags: [ ", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state)); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, + imp->imp_connect_data.ocd_connect_flags2, + ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " import_flags: [ "); + obd_import_flags2str(imp, m); + + seq_printf(m, " ]\n" + " connection:\n" + " failover_nids: [ "); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + libcfs_nid2str_r(conn->oic_conn->c_peer.nid, + nidstr, sizeof(nidstr)); + seq_printf(m, "%s%s", j ? 
", " : "", nidstr); + j++; + } + if (imp->imp_connection != NULL) + libcfs_nid2str_r(imp->imp_connection->c_peer.nid, + nidstr, sizeof(nidstr)); + else + strncpy(nidstr, "", sizeof(nidstr)); + seq_printf(m, " ]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n", + nidstr, + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count)); + spin_unlock(&imp->imp_lock); + + if (obd->obd_svc_stats == NULL) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + seq_printf(m, " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: %llu %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, " transactions:\n" + " last_replay: %llu\n" + " peer_committed: %llu\n" + " last_checked: %llu\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, " %s_data_averages:\n" + " bytes_per_rpc: %llu\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, " %s_per_rpc: %llu\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_import_seq_show); + +int lprocfs_state_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - [ %lld, %s ]\n", (s64)ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_state_seq_show); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_timeouts_show_seq */ +int lprocfs_timeouts_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time64_t now, worstt; + int i; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + now = ktime_get_real_seconds(); + + /* Some network health info for kicks */ + seq_printf(m, "%-10s : %lld, %llds ago\n", + "last reply", (s64)imp->imp_last_reply_time, + (s64)(now - imp->imp_last_reply_time)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + seq_printf(m, "%-10s : cur %3u worst %3u (at %lld, %llds ago) ", + "network", cur, worst, (s64)worstt, (s64)(now - worstt)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %lld, %llds ago) ", + imp->imp_at.iat_portal[i], cur, worst, (s64)worstt, + (s64)(now - worstt)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_timeouts_seq_show); + +int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + __u64 flags2; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + flags2 = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags2; + seq_printf(m, "flags=%#llx\n", flags); + seq_printf(m, "flags2=%#llx\n", flags2); + 
obd_connect_seq_flags2str(m, flags, flags2, "\n"); + seq_printf(m, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); + +int +lprocfs_obd_setup(struct obd_device *obd) +{ + int rc = 0; + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid] != NULL) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + /* initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + return rc; +} + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (stats == NULL) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (stats->ls_cnt_header == NULL) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (stats->ls_percpu[0] == NULL) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + 
return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i] != NULL) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header != NULL) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ + unsigned long flags = 0; + unsigned int num_cpu; + unsigned int i; + u64 ret = 0; + + LASSERT(stats); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + struct lprocfs_counter *cntr; + + if (!stats->ls_percpu[i]) + continue; + + cntr = lprocfs_stats_counter_get(stats, i, idx); + ret += lprocfs_read_helper(cntr, &stats->ls_cnt_header[idx], + stats->ls_flags, field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} +EXPORT_SYMBOL(lprocfs_stats_collector); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + for (j = 0; j < stats->ls_num; j++) { + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? 
pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + + return lprocfs_stats_seq_start(p, pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + + if (idx == 0) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + seq_printf(p, "%-25s %llu.%09lu secs.nsecs\n", + "snapshot_time", (s64)now.tv_sec, now.tv_nsec); + } + + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count == 0) + return 0; + + seq_printf(p, "%-25s %lld samples [%s]", hdr->lc_name, + ctr.lc_count, hdr->lc_units); + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && ctr.lc_count > 0) { + seq_printf(p, " %lld %lld %lld", + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + seq_printf(p, " %llu", ctr.lc_sumsquare); + } + seq_putc(p, '\n'); + return 0; +} + +static const struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = inode->i_private ? : PDE_DATA(inode); + return 0; +} + +static const struct file_operations lprocfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int ldebugfs_register_stats(struct dentry *parent, const char *name, + struct lprocfs_stats *stats) +{ + struct dentry *entry; + + LASSERT(!IS_ERR_OR_NULL(parent)); + + entry = debugfs_create_file(name, 0644, parent, stats, + &lprocfs_stats_seq_fops); + if (IS_ERR_OR_NULL(entry)) + return entry ? 
PTR_ERR(entry) : -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ldebugfs_register_stats); + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (entry == NULL) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +/* Note that we only init md counters for ops whose offset is less + * than NUM_MD_STATS. This is explained in a comment in the definition + * of struct md_ops. */ +#define LPROCFS_MD_OP_INIT(base, stats, op) \ + do { \ + unsigned int _idx = base + MD_COUNTER_OFFSET(op); \ + \ + if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) { \ + LASSERT(_idx < stats->ls_num); \ + lprocfs_counter_init(stats, _idx, 0, #op, "reqs"); \ + } \ + } while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_root); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, fsync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, read_page); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, merge_attr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + 
LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned int num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + CLASSERT(offsetof(struct md_ops, MD_STATS_FIRST_OP) == 0); + CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) == 0); + CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) > 0); + + /* TODO Ensure that this function is only used where + * appropriate by adding an assertion to the effect that + * obd->obd_type->typ_md_ops is not NULL. We can't do this now + * because mdt_procfs_init() uses this function to allocate + * the stats backing /proc/fs/lustre/mdt/.../md_stats but the + * mdt layer does not use the md_ops interface. This is + * confusing and a waste of memory. See LU-2484. + */ + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_md_stats == NULL); + LASSERT(obd->obd_md_cntr_base == 0); + + num_stats = NUM_MD_STATS + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op " + "operation at offset %d. Aborting.\n", + i - num_private_stats); + LBUG(); + } + } + + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_md_stats = stats; + obd->obd_md_cntr_base = num_private_stats; + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->obd_md_stats; + + if (stats != NULL) { + obd->obd_md_stats = NULL; + obd->obd_md_cntr_base = 0; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (lc == NULL || header == NULL) + RETURN(0); + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case LPROCFS_FIELDS_FLAGS_COUNT: + ret = lc->lc_count; + break; + default: + break; + }; + RETURN(ret); +} 
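+/*
+ * Usage sketch: lprocfs_read_helper() is normally driven over every
+ * allocated per-CPU slot, exactly as lprocfs_stats_collector() earlier in
+ * this file does. A minimal caller, using the hypothetical name
+ * read_field_sum() and the helpers defined above (lprocfs_stats_lock()
+ * returns the number of slots to scan, lprocfs_stats_unlock() releases or
+ * unpins), might look like:
+ *
+ *  static u64 read_field_sum(struct lprocfs_stats *stats, int idx,
+ *                            enum lprocfs_fields_flags field)
+ *  {
+ *      unsigned long flags = 0;
+ *      unsigned int i, num;
+ *      u64 sum = 0;
+ *
+ *      num = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ *      for (i = 0; i < num; i++) {
+ *          if (!stats->ls_percpu[i])
+ *              continue;
+ *          sum += lprocfs_read_helper(
+ *              lprocfs_stats_counter_get(stats, i, idx),
+ *              &stats->ls_cnt_header[idx],
+ *              stats->ls_flags, field);
+ *      }
+ *      lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ *      return sum;
+ *  }
+ */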
+EXPORT_SYMBOL(lprocfs_read_helper); + +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1 ) { + /* only reserved 2 bits fraction */ + buffer[prtn++] ='0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. + */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while(buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] ='\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) +{ + long decimal_val, frac_val; + + decimal_val = val / mult; + seq_printf(m, "%ld", decimal_val); + frac_val = val % mult; + + if (frac_val > 0) { + frac_val *= 100; + frac_val /= mult; + } + if (frac_val > 0) { + /* Three cases: x0, xx, 0x */ + if ((frac_val % 10) != 0) + seq_printf(m, ".%ld", frac_val); + else + seq_printf(m, ".%ld", frac_val / 10); + } + + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); + +/* Obtains the conversion factor for the unit specified */ +static int get_mult(char unit, __u64 *mult) +{ + __u64 units = 1; + + switch (unit) { + /* peta, tera, giga, mega, and kilo */ + case 'p': + case 'P': + units <<= 10; + case 't': + case 'T': + units <<= 10; + case 'g': + case 'G': + units <<= 10; + case 'm': + case 'M': + units <<= 10; + case 'k': + case 'K': + units <<= 10; + break; + /* some tests expect % to be accepted */ + case '%': + units = 1; + break; + default: + return -EINVAL; + } + + *mult = units; + + return 0; +} + +/* + * Ensures the numeric string is valid. The function provides the final + * multiplier in the case a unit exists at the end of the string. It also + * locates the start of the whole and fractional parts (if any). This + * function modifies the string so kstrtoull can be used to parse both + * the whole and fraction portions. This function also figures out + * the base of the number. 
+ */ +static int preprocess_numeric_str(char *buffer, __u64 *mult, __u64 def_mult, + bool allow_units, char **whole, char **frac, + unsigned int *base) +{ + bool hit_decimal = false; + bool hit_unit = false; + int rc = 0; + char *start; + *mult = def_mult; + *whole = NULL; + *frac = NULL; + *base = 10; + + /* a hex string if it starts with "0x" */ + if (buffer[0] == '0' && tolower(buffer[1]) == 'x') { + *base = 16; + buffer += 2; + } + + start = buffer; + + while (*buffer) { + /* allow for a single new line before the null terminator */ + if (*buffer == '\n') { + *buffer = '\0'; + buffer++; + + if (*buffer) + return -EINVAL; + + break; + } + + /* any chars after our unit indicates a malformed string */ + if (hit_unit) + return -EINVAL; + + /* ensure we only hit one decimal */ + if (*buffer == '.') { + if (hit_decimal) + return -EINVAL; + + /* if past start, there's a whole part */ + if (start != buffer) + *whole = start; + + *buffer = '\0'; + start = buffer + 1; + hit_decimal = true; + } else if (!isdigit(*buffer) && + !(*base == 16 && isxdigit(*buffer))) { + if (allow_units) { + /* if we allow units, attempt to get mult */ + hit_unit = true; + rc = get_mult(*buffer, mult); + if (rc) + return rc; + + /* string stops here, but keep processing */ + *buffer = '\0'; + } else { + /* bad string */ + return -EINVAL; + } + } + + buffer++; + } + + if (hit_decimal) { + /* hit a decimal, make sure there's a fractional part */ + if (!*start) + return -EINVAL; + + *frac = start; + } else { + /* didn't hit a decimal, but may have a whole part */ + if (start != buffer && *start) + *whole = start; + } + + /* malformed string if we didn't get anything */ + if (!*frac && !*whole) + return -EINVAL; + + return 0; +} + +/* + * Parses a numeric string which can contain a whole and fraction portion + * into a __u64. Accepts a multiplier to apply to the value parsed. Also + * allows the string to have a unit at the end. The function handles + * wrapping of the final unsigned value. + */ +static int str_to_u64_parse(char *buffer, unsigned long count, + __u64 *val, __u64 def_mult, bool allow_units) +{ + __u64 whole = 0; + __u64 frac = 0; + unsigned int frac_d = 1; + __u64 wrap_indicator = ULLONG_MAX; + int rc = 0; + __u64 mult; + char *strwhole; + char *strfrac; + unsigned int base = 10; + + rc = preprocess_numeric_str(buffer, &mult, def_mult, allow_units, + &strwhole, &strfrac, &base); + + if (rc) + return rc; + + if (mult == 0) { + *val = 0; + return 0; + } + + /* the multiplier limits how large the value can be */ + wrap_indicator /= mult; + + if (strwhole) { + rc = kstrtoull(strwhole, base, &whole); + if (rc) + return rc; + + if (whole > wrap_indicator) + return -ERANGE; + + whole *= mult; + } + + if (strfrac) { + if (strlen(strfrac) > 10) + strfrac[10] = '\0'; + + rc = kstrtoull(strfrac, base, &frac); + if (rc) + return rc; + + /* determine power of fractional portion */ + while (*strfrac) { + frac_d *= base; + strfrac++; + } + + /* fractional portion is too large to perform calculation */ + if (frac > wrap_indicator) + return -ERANGE; + + frac *= mult; + do_div(frac, frac_d); + } + + /* check that the sum of whole and fraction fits in u64 */ + if (whole > (ULLONG_MAX - frac)) + return -ERANGE; + + *val = whole + frac; + + return 0; +} + +/* + * This function parses numeric/hex strings into __s64. It accepts a multiplier + * which will apply to the value parsed. It also can allow the string to + * have a unit as the last character. The function handles overflow/underflow + * of the signed integer. 
+ */ +static int str_to_s64_internal(const char __user *buffer, unsigned long count, + __s64 *val, __u64 def_mult, bool allow_units) +{ + char kernbuf[22]; + __u64 tmp; + unsigned int offset = 0; + int signed sign = 1; + __u64 max = LLONG_MAX; + int rc = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + /* keep track of our sign */ + if (*kernbuf == '-') { + sign = -1; + offset++; + /* equivalent to max = -LLONG_MIN, avoids overflow */ + max++; + } + + rc = str_to_u64_parse(kernbuf + offset, count - offset, + &tmp, def_mult, allow_units); + if (rc) + return rc; + + /* check for overflow/underflow */ + if (max < tmp) + return -ERANGE; + + *val = (__s64)tmp * sign; + + return 0; +} + +/** + * Convert a user string into a signed 64 bit number. This function produces + * an error when the value parsed from the string underflows or + * overflows. This function accepts strings which contain digits and + * optionally a decimal or hex strings which are prefixed with "0x". + * + * \param[in] buffer string consisting of numbers and optionally a decimal + * \param[in] count buffer length + * \param[in] val if successful, the value represented by the string + * + * \retval 0 on success + * \retval negative number on error + */ +int lprocfs_str_to_s64(const char __user *buffer, unsigned long count, + __s64 *val) +{ + return str_to_s64_internal(buffer, count, val, 1, false); +} +EXPORT_SYMBOL(lprocfs_str_to_s64); + +/** + * Convert a user string into a signed 64 bit number. This function produces + * an error when the value parsed from the string times multiplier underflows or + * overflows. This function only accepts strings that contains digits, an + * optional decimal, and a char representing a unit at the end. If a unit is + * specified in the string, the multiplier provided by the caller is ignored. + * This function can also accept hexadecimal strings which are prefixed with + * "0x". + * + * \param[in] buffer string consisting of numbers, a decimal, and a unit + * \param[in] count buffer length + * \param[in] val if successful, the value represented by the string + * \param[in] defunit default unit if string doesn't contain one + * + * \retval 0 on success + * \retval negative number on error + */ +int lprocfs_str_with_units_to_s64(const char __user *buffer, + unsigned long count, __s64 *val, char defunit) +{ + __u64 mult = 1; + int rc; + + if (defunit != '1') { + rc = get_mult(defunit, &mult); + if (rc) + return rc; + } + + return str_to_s64_internal(buffer, count, val, mult, true); +} +EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); + +char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +EXPORT_SYMBOL(lprocfs_strnstr); + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. 
+ */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + size_t *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (val == NULL) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int ldebugfs_seq_create(struct dentry *parent, const char *name, umode_t mode, + const struct file_operations *seq_fops, void *data) +{ + struct dentry *entry; + + /* Disallow secretly (un)writable entries. */ + LASSERT((!seq_fops->write) == (!(mode & 0222))); + + entry = debugfs_create_file(name, mode, parent, data, seq_fops); + if (IS_ERR_OR_NULL(entry)) + return entry ? PTR_ERR(entry) : -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(ldebugfs_seq_create); + +int lprocfs_seq_create(struct proc_dir_entry *parent, + const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (entry == NULL) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + return (lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data)); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val = 0; + + if (likely(value != 0)) + val = min(fls(value - 1), OBD_HIST_MAX); + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +ssize_t lustre_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->show ? a->show(kobj, attr, buf) : 0; +} +EXPORT_SYMBOL_GPL(lustre_attr_show); + +ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); + + return a->store ? 
a->store(kobj, attr, buf, len) : len; +} +EXPORT_SYMBOL_GPL(lustre_attr_store); + +const struct sysfs_ops lustre_sysfs_ops = { + .show = lustre_attr_show, + .store = lustre_attr_store, +}; +EXPORT_SYMBOL_GPL(lustre_sysfs_ops); + +int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_show); + +ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = + ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; + int chunk_mask, rc; + __s64 val; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_SHIFT; + + LPROCFS_CLIMP_CHECK(dev); + + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || (ocd->ocd_brw_size != 0 && + val > ocd->ocd_brw_size >> PAGE_SHIFT)) { + LPROCFS_CLIMP_EXIT(dev); + return -ERANGE; + } + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); + +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char kernbuf[64], *tmp, *errmsg; + unsigned long uid, gid; + ENTRY; + + if (count >= sizeof(kernbuf)) { + errmsg = "string too long"; + GOTO(failed_noprint, rc = -EINVAL); + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed_noprint, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + /* look for uid gid separator */ + tmp = strchr(kernbuf, ':'); + if (tmp == NULL) { + errmsg = "needs uid:gid format"; + GOTO(failed, rc = -EINVAL); + } + *tmp = '\0'; + tmp++; + + /* parse uid */ + if (kstrtoul(kernbuf, 0, &uid) != 0) { + errmsg = "bad uid"; + GOTO(failed, rc = -EINVAL); + } + + /* parse gid */ + if (kstrtoul(tmp, 0, &gid) != 0) { + errmsg = "bad gid"; + GOTO(failed, rc = -EINVAL); + } + + squash->rsi_uid = uid; + squash->rsi_gid = gid; + + LCONSOLE_INFO("%s: root_squash is set to %u:%u\n", + name, squash->rsi_uid, squash->rsi_gid); + RETURN(count); + +failed: + if (tmp != NULL) { + tmp--; + *tmp = ':'; + } + CWARN("%s: failed to set root_squash to \"%s\", %s, rc = %d\n", + name, kernbuf, errmsg, rc); + RETURN(rc); +failed_noprint: + CWARN("%s: failed to set root_squash due to %s, rc = %d\n", + name, errmsg, rc); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_root_squash); + + +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + int rc; + char *kernbuf = NULL; + char *errmsg; + struct list_head tmp; + int len = count; + ENTRY; + + if (count > 4096) { + errmsg = "string too long"; + GOTO(failed, rc = -EINVAL); + } + + OBD_ALLOC(kernbuf, count + 1); + if (kernbuf == NULL) { + errmsg = "no memory"; + GOTO(failed, rc = -ENOMEM); + } + if 
(copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + GOTO(failed, rc = -EFAULT); + } + kernbuf[count] = '\0'; + + if (count > 0 && kernbuf[count - 1] == '\n') + len = count - 1; + + if ((len == 4 && strncmp(kernbuf, "NONE", len) == 0) || + (len == 5 && strncmp(kernbuf, "clear", len) == 0)) { + /* empty string is special case */ + down_write(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + up_write(&squash->rsi_sem); + LCONSOLE_INFO("%s: nosquash_nids is cleared\n", name); + OBD_FREE(kernbuf, count + 1); + RETURN(count); + } + + INIT_LIST_HEAD(&tmp); + if (cfs_parse_nidlist(kernbuf, count, &tmp) <= 0) { + errmsg = "can't parse"; + GOTO(failed, rc = -EINVAL); + } + LCONSOLE_INFO("%s: nosquash_nids set to %s\n", + name, kernbuf); + OBD_FREE(kernbuf, count + 1); + kernbuf = NULL; + + down_write(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + list_splice(&tmp, &squash->rsi_nosquash_nids); + up_write(&squash->rsi_sem); + + RETURN(count); + +failed: + if (kernbuf) { + CWARN("%s: failed to set nosquash_nids to \"%s\", %s rc = %d\n", + name, kernbuf, errmsg, rc); + OBD_FREE(kernbuf, count + 1); + } else { + CWARN("%s: failed to set nosquash_nids due to %s rc = %d\n", + name, errmsg, rc); + } + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_nosquash_nids); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c new file mode 100644 index 0000000000000..46ad92df952f2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -0,0 +1,803 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lprocfs_status_server.c + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS + +int lprocfs_evict_client_open(struct inode *inode, struct file *f) +{ + struct obd_device *obd = PDE_DATA(file_inode(f)); + + atomic_inc(&obd->obd_evict_inprogress); + return 0; +} + +int lprocfs_evict_client_release(struct inode *inode, struct file *f) +{ + struct obd_device *obd = PDE_DATA(file_inode(f)); + + atomic_dec(&obd->obd_evict_inprogress); + wake_up(&obd->obd_evict_inprogress_waitq); + + return 0; +} + +#define BUFLEN (UUID_MAX + 5) + +ssize_t +lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + char *tmpbuf, *kbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. + */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count)); + class_incref(obd, __func__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __func__, current); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_evict_client_seq_write); + +#undef BUFLEN + +int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + seq_printf(m, "%u\n", obd->obd_num_exports); + return 0; +} +EXPORT_SYMBOL(lprocfs_num_exports_seq_show); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + struct cfs_hash *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +static int +lprocfs_exp_print_uuid_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_nid_stats != NULL) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + return 0; +} + +static int +lprocfs_exp_print_nodemap_seq(struct cfs_hash *hs, struct 
cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct lu_nodemap *nodemap = exp->exp_target_data.ted_nodemap; + + if (nodemap != NULL) + seq_printf(m, "%s\n", nodemap->nm_name); + return 0; +} + +static int +lprocfs_exp_nodemap_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_nodemap_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_nodemap); + +static int lprocfs_exp_uuid_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_uuid); + +static int +lprocfs_exp_print_hash_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + cfs_hash_debug_header(m); + cfs_hash_debug_str(hs, m); + } + return 0; +} + +static int lprocfs_exp_hash_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_hash); + +int lprocfs_exp_print_replydata_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "reply_cnt: %d\n" + "reply_max: %d\n" + "reply_released_by_xid: %d\n" + "reply_released_by_tag: %d\n\n", + ted->ted_reply_cnt, + ted->ted_reply_max, + ted->ted_release_xid, + ted->ted_release_tag); + return 0; +} + +int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_replydata_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); + +int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) +{ + seq_puts(m, "Write into this file to clear all nid stats and stale nid entries\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_show); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. 
*/ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +ssize_t +lprocfs_nid_stats_clear_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct nid_stat *client_stat; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_seq_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + struct proc_dir_entry *entry; + char nidstr[LNET_NIDSTR_SIZE]; + int rc = 0; + ENTRY; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. + * Anything else is nonsense.*/ + if (nid == NULL || *nid == LNET_NID_ANY) + RETURN(-EALREADY); + + libcfs_nid2str_r(*nid, nidstr, sizeof(nidstr)); + + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats != NULL) { + spin_unlock(&exp->exp_lock); + RETURN(-EALREADY); + } + spin_unlock(&exp->exp_lock); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, nidstr, atomic_read(&old_stat->nid_exp_ref_count)); + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + spin_lock(&exp->exp_lock); + if (exp->exp_nid_stats) { + LASSERT(exp->exp_nid_stats == old_stat); + nidstat_putref(exp->exp_nid_stats); + } + exp->exp_nid_stats = old_stat; + spin_unlock(&exp->exp_lock); + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + new_stat->nid_proc = lprocfs_register(nidstr, + obd->obd_proc_exports_entry, + NULL, NULL); + + if (IS_ERR(new_stat->nid_proc)) { + rc = PTR_ERR(new_stat->nid_proc); + new_stat->nid_proc = NULL; + CERROR("%s: cannot create proc entry for export %s: rc = %d\n", + obd->obd_name, nidstr, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "nodemap", new_stat, + &lprocfs_exp_nodemap_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("Error adding the nodemap file: rc = %d\n", rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", new_stat, + &lprocfs_exp_uuid_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("Error adding the NID stats file: rc = %d\n", rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", new_stat, + 
&lprocfs_exp_hash_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("Error adding the hash file: rc = %d\n", rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "reply_data", new_stat, + &lprocfs_exp_replydata_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: Error adding the reply_data file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + spin_lock(&exp->exp_lock); + exp->exp_nid_stats = new_stat; + spin_unlock(&exp->exp_lock); + + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(0); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if (!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} + +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + 
LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); + + CLASSERT(NUM_OBD_STATS == OBD_COUNTER_OFFSET(putref) + 1); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + + num_stats = NUM_OBD_STATS + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * , and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. */ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op " + "operation at offset %d.\n", i - num_private_stats); + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +int lprocfs_hash_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + cfs_hash_debug_header(m); + cfs_hash_debug_str(obd->obd_uuid_hash, m); + cfs_hash_debug_str(obd->obd_nid_hash, m); + cfs_hash_debug_str(obd->obd_nid_stats_hash, m); + return 0; +} +EXPORT_SYMBOL(lprocfs_hash_seq_show); + +int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct target_distribute_txn_data *tdtd; + + LASSERT(obd != NULL); + + seq_printf(m, "status: "); + if (obd->obd_max_recoverable_clients == 0) { + seq_printf(m, "INACTIVE\n"); + goto out; + } + + /* sampled unlocked, but really... */ + if (obd->obd_recovering == 0) { + seq_printf(m, "COMPLETE\n"); + seq_printf(m, "recovery_start: %lld\n", + (s64)obd->obd_recovery_start); + seq_printf(m, "recovery_duration: %lld\n", + obd->obd_recovery_end ? + obd->obd_recovery_end - obd->obd_recovery_start : + ktime_get_real_seconds() - obd->obd_recovery_start); + /* Number of clients that have completed recovery */ + seq_printf(m, "completed_clients: %d/%d\n", + obd->obd_max_recoverable_clients - + obd->obd_stale_clients, + obd->obd_max_recoverable_clients); + seq_printf(m, "replayed_requests: %d\n", + obd->obd_replayed_requests); + seq_printf(m, "last_transno: %lld\n", + obd->obd_next_recovery_transno - 1); + seq_printf(m, "VBR: %s\n", obd->obd_version_recov ? + "ENABLED" : "DISABLED"); + seq_printf(m, "IR: %s\n", obd->obd_no_ir ? 
+ "DISABLED" : "ENABLED"); + goto out; + } + + tdtd = obd->u.obt.obt_lut->lut_tdtd; + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) { + char *buf; + int size = 0; + int count = 0; + + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + if (count > 0) { + seq_printf(m, "WAITING\n"); + seq_printf(m, "non-ready MDTs: %s\n", + buf ? buf : "unknown (not enough RAM)"); + seq_printf(m, "recovery_start: %lld\n", + (s64)obd->obd_recovery_start); + seq_printf(m, "time_waited: %lld\n", + (s64)(ktime_get_real_seconds() - + obd->obd_recovery_start)); + } + + if (buf != NULL) + OBD_FREE(buf, size); + + if (likely(count > 0)) + goto out; + } + + /* recovery won't start until the clients connect */ + if (obd->obd_recovery_start == 0) { + seq_printf(m, "WAITING_FOR_CLIENTS\n"); + goto out; + } + + seq_printf(m, "RECOVERING\n"); + seq_printf(m, "recovery_start: %lld\n", (s64)obd->obd_recovery_start); + seq_printf(m, "time_remaining: %lld\n", + ktime_get_real_seconds() >= + obd->obd_recovery_start + + obd->obd_recovery_timeout ? 0 : + (s64)(obd->obd_recovery_start + + obd->obd_recovery_timeout - + ktime_get_real_seconds())); + seq_printf(m, "connected_clients: %d/%d\n", + atomic_read(&obd->obd_connected_clients), + obd->obd_max_recoverable_clients); + /* Number of clients that have completed recovery */ + seq_printf(m, "req_replay_clients: %d\n", + atomic_read(&obd->obd_req_replay_clients)); + seq_printf(m, "lock_repay_clients: %d\n", + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "completed_clients: %d\n", + atomic_read(&obd->obd_connected_clients) - + atomic_read(&obd->obd_lock_replay_clients)); + seq_printf(m, "evicted_clients: %d\n", obd->obd_stale_clients); + seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); + seq_printf(m, "queued_requests: %d\n", + obd->obd_requests_queued_for_recovery); + seq_printf(m, "next_transno: %lld\n", + obd->obd_next_recovery_transno); +out: + return 0; +} +EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); + +int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%d\n", obd->obd_recovery_ir_factor); + return 0; +} +EXPORT_SYMBOL(lprocfs_ir_factor_seq_show); + +ssize_t +lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + int rc; + __s64 val; + + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val < OBD_IR_FACTOR_MIN || val > OBD_IR_FACTOR_MAX) + return -EINVAL; + + obd->obd_recovery_ir_factor = val; + return count; +} +EXPORT_SYMBOL(lprocfs_ir_factor_seq_write); + +int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%d\n", obd->obd_checksum_dump); + return 0; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_show); + +ssize_t +lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + int rc; + __s64 val; + + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + obd->obd_checksum_dump = !!val; + return count; +} +EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); + +int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data) +{ + 
struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%llu\n", obd->obd_recovery_timeout); + return 0; +} +EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_show); + +ssize_t +lprocfs_recovery_time_soft_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + int rc; + __s64 val; + + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_write); + +int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + + LASSERT(obd != NULL); + seq_printf(m, "%lld\n", obd->obd_recovery_time_hard); + return 0; +} +EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_show); + +ssize_t +lprocfs_recovery_time_hard_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + int rc; + __s64 val; + + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; + + obd->obd_recovery_time_hard = val; + return count; +} +EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_write); + +int lprocfs_target_instance_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = m->private; + struct obd_device_target *target = &obd->u.obt; + + LASSERT(obd != NULL); + LASSERT(target->obt_magic == OBT_MAGIC); + seq_printf(m, "%u\n", obd->u.obt.obt_instance); + return 0; +} +EXPORT_SYMBOL(lprocfs_target_instance_seq_show); + +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c new file mode 100644 index 0000000000000..f6a043b7af9e8 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -0,0 +1,2371 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. 
+ * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include /* hash_long() */ +#include +#include +#include +#include +#include +#include + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +#define LU_CACHE_NR_MAX_ADJUST 512 +#define LU_CACHE_NR_UNLIMITED -1 +#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED +#define LU_CACHE_NR_LDISKFS_LIMIT LU_CACHE_NR_UNLIMITED +/** This is set to roughly (20 * OSS_NTHRS_MAX) to prevent thrashing */ +#define LU_CACHE_NR_ZFS_LIMIT 10240 + +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +#define LU_SITE_BITS_MAX_CL 19 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +static long lu_cache_nr = LU_CACHE_NR_DEFAULT; +module_param(lu_cache_nr, long, 0644); +MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; + struct cfs_hash_bd bd; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + fid = lu_object_fid(o); + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + if (lu_object_is_dying(top)) { + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). + */ + wake_up_all(&bkt->lsb_marche_funebre); + } + return; + } + + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + if (!lu_object_is_dying(top) && + (lu_object_exists(orig) || lu_object_is_cl(orig))) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + bkt->lsb_lru_len++; + percpu_counter_inc(&site->ls_lru_len_counter); + CDEBUG(D_INODE, "Add %p to site lru. 
hash: %p, bkt: %p, " + "lru_len: %ld\n", + o, site->ls_obj_hash, bkt, bkt->lsb_lru_len); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached) then remove it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + struct lu_site *site = o->lo_dev->ld_site; + struct cfs_hash *obj_hash = site->ls_obj_hash; + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + if (!list_empty(&top->loh_lru)) { + struct lu_site_bkt_data *bkt; + + list_del_init(&top->loh_lru); + bkt = cfs_hash_bd_extra_get(obj_hash, &bd); + bkt->lsb_lru_len--; + percpu_counter_dec(&site->ls_lru_len_counter); + } + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct lu_object *top; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + ENTRY; + + /* + * Create top-level object slice. This will also create + * lu_object_header. + */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + RETURN(ERR_PTR(-ENOMEM)); + if (IS_ERR(top)) + RETURN(top); + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; + + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. 
+ */ + clean = 1; + init_flag = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (init_mask & init_flag) + goto next; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + init_mask |= init_flag; +next: + init_flag <<= 1; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + RETURN(top); +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. + */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + * if canblock is 0, then don't block awaiting for another + * instance of lu_site_purge() to complete + */ +int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, + int nr, int canblock) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + struct cfs_hash_bd bd2; + struct list_head dispose; + int did_sth; + unsigned int start = 0; + int count; + int bnr; + unsigned int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + if (nr != ~0) + start = s->ls_purge_start; + bnr = (nr == ~0) ? -1 : nr / (int)CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + /* + * It doesn't make any sense to make purge threads parallel, that can + * only bring troubles to us. See LU-5331. 
+ */ + if (canblock != 0) + mutex_lock(&s->ls_purge_mutex); + else if (mutex_trylock(&s->ls_purge_mutex) == 0) + goto out; + + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + bkt->lsb_lru_len--; + percpu_counter_dec(&s->ls_lru_len_counter); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + mutex_unlock(&s->ls_purge_mutex); + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + +out: + return nr; +} +EXPORT_SYMBOL(lu_site_purge_objects); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +static struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) +{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. 
+ */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s\n", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist" : ""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth = 4; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{\n"); + + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + + if (o->lo_ops->loo_object_print != NULL) + (*o->lo_ops->loo_object_print)(env, cookie, printer, o); + + (*printer)(env, cookie, "\n"); + } + + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} + +static struct lu_object *htable_lookup(struct lu_site *s, + struct cfs_hash_bd *bd, + const struct lu_fid *f, + __u64 *version) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return ERR_PTR(-ENOENT); + + *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (!hnode) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return ERR_PTR(-ENOENT); + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + if (!list_empty(&h->loh_lru)) { + list_del_init(&h->loh_lru); + bkt->lsb_lru_len--; + percpu_counter_dec(&s->ls_lru_len_counter); + } + return lu_object_top(h); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. 
+ */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +/* + * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because + * the calculation for the number of objects to reclaim is not covered by + * a lock the maximum number of objects is capped by LU_CACHE_MAX_ADJUST. + * This ensures that many concurrent threads will not accidentally purge + * the entire cache. + */ +static void lu_object_limit(const struct lu_env *env, + struct lu_device *dev) +{ + __u64 size, nr; + + if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) + return; + + size = cfs_hash_size_get(dev->ld_site->ls_obj_hash); + nr = (__u64)lu_cache_nr; + if (size <= nr) + return; + + lu_site_purge_objects(env, dev->ld_site, + MIN(size - nr, LU_CACHE_NR_MAX_ADJUST), 0); +} + +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + + return o; +} + +/** + * Core logic of lu_object_find*() functions. + * + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + __u64 version = 0; + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). + */ + if (conf && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + + s = dev->ld_site; + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT) + return o; + + /* + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. 
+ */ + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + cfs_hash_bd_lock(hs, &bd, 1); + + shadow = htable_lookup(s, &bd, f, &version); + if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) { + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + + return o; + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + return shadow; +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. + */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (IS_ERR(top)) + return top; + + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (unlikely(obj == NULL)) { + lu_object_put(env, top); + obj = ERR_PTR(-ENOENT); + } + + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + atomic_set(&ldt->ldt_device_nr, 0); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DECLARE_RWSEM(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +/** + * Return desired hash table order. + */ +static unsigned long lu_htable_order(struct lu_device *top) +{ + unsigned long cache_size; + unsigned long bits; + unsigned long bits_max = LU_SITE_BITS_MAX; + + /* + * For ZFS based OSDs the cache should be disabled by default. This + * allows the ZFS ARC maximum flexibility in determining what buffers + * to cache. If Lustre has objects or buffer which it wants to ensure + * always stay cached it must maintain a hold on them. 
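+ *
+ * In that case the hash table stays at its minimum order and the number
+ * of cached objects is capped at LU_CACHE_NR_ZFS_LIMIT.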
+ */ + if (strcmp(top->ld_type->ldt_name, LUSTRE_OSD_ZFS_NAME) == 0) { + lu_cache_percent = 1; + lu_cache_nr = LU_CACHE_NR_ZFS_LIMIT; + return LU_SITE_BITS_MIN; + } + + if (strcmp(top->ld_type->ldt_name, LUSTRE_VVP_NAME) == 0) + bits_max = LU_SITE_BITS_MAX_CL; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = totalram_pages; + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_SHIFT)) + cache_size = 1 << (30 - PAGE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. */ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in" + " the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + + return clamp_t(typeof(bits), bits, LU_SITE_BITS_MIN, bits_max); +} + +static unsigned lu_obj_hop_hash(struct cfs_hash *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + atomic_inc(&h->loh_ref); +} + +static void lu_obj_hop_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +static struct cfs_hash_ops lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. 
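+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the per-cpu LRU counter, the object hash table or
+ *         the stats block cannot be allocated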
+ */ +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + struct cfs_hash_bd bd; + char name[16]; + unsigned long bits; + unsigned int i; + int rc; + ENTRY; + + memset(s, 0, sizeof *s); + mutex_init(&s->ls_purge_mutex); + +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&s->ls_lru_len_counter, 0, GFP_NOFS); +#else + rc = percpu_counter_init(&s->ls_lru_len_counter, 0); +#endif + if (rc) + return -ENOMEM; + + snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name); + for (bits = lu_htable_order(top); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY | + CFS_HASH_COUNTER); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %lu\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_marche_funebre); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + down_write(&lu_sites_guard); + list_del_init(&s->ls_linkage); + up_write(&lu_sites_guard); + + percpu_counter_destroy(&s->ls_lru_len_counter); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + down_write(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + up_write(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. 
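+ *
+ * Only the reference counter is dropped here; the device itself is
+ * finalized later by lu_device_fini(), which asserts that the counter
+ * has reached zero.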
+ */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (atomic_inc_return(&t->ldt_device_nr) == 1 && + t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + + memset(d, 0, sizeof *d); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t = d->ld_type; + + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(atomic_read(&t->ldt_device_nr) > 0); + + if (atomic_dec_and_test(&t->ldt_device_nr) && + t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, struct lu_object_header *h, + struct lu_device *d) +{ + memset(o, 0, sizeof(*o)); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, + "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. 
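+ *
+ * \retval NULL if no layer of \a h was created by a device of type \a dtype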
+ */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +DEFINE_RWLOCK(lu_keys_guard); +static atomic_t lu_key_initing_cnt = ATOMIC_INIT(0); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version = 0; + +/** + * Register new key. + */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + unsigned int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + write_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + write_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + atomic_dec(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { + LINVRNT(module_refcount(key->lct_owner) > 0); + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. 
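+ *
+ * The key is quiesced first, then this blocks until every transient
+ * context referencing it has run lu_context_key::lct_fini(), so the key
+ * can be freed safely once this function returns.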
+ */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + write_lock(&lu_keys_guard); + ++key_set_version; + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + + /** + * Wait until all transient contexts referencing this key have + * run lu_context_key::lct_fini() method. + */ + while (atomic_read(&key->lct_used) > 1) { + write_unlock(&lu_keys_guard); + CDEBUG(D_INFO, "lu_context_key_degister: \"%s\" %p, %d\n", + key->lct_owner ? key->lct_owner->name : "", key, + atomic_read(&key->lct_used)); + schedule(); + write_lock(&lu_keys_guard); + } + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + write_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 1, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. 
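+ *
+ * The key is also tagged LCT_QUIESCENT, which stops keys_fill() from
+ * instantiating new values for it (and thereby from pinning the module
+ * that owns the key).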
+ */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX memory barrier has to go here. + */ + write_lock(&lu_keys_guard); + key->lct_tags |= LCT_QUIESCENT; + + /** + * Wait until all lu_context_key::lct_init() methods + * have completed. + */ + while (atomic_read(&lu_key_initing_cnt) > 0) { + write_unlock(&lu_keys_guard); + CDEBUG(D_INFO, "lu_context_key_quiesce: \"%s\"" + " %p, %d (%d)\n", + key->lct_owner ? key->lct_owner->name : "", + key, atomic_read(&key->lct_used), + atomic_read(&lu_key_initing_cnt)); + schedule(); + write_lock(&lu_keys_guard); + } + + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + + ++key_set_version; + write_unlock(&lu_keys_guard); + } +} + +void lu_context_key_revive(struct lu_context_key *key) +{ + write_lock(&lu_keys_guard); + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; + write_unlock(&lu_keys_guard); +} + +static void keys_fini(struct lu_context *ctx) +{ + unsigned int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + unsigned int i; + unsigned pre_version; + + /* + * A serialisation with lu_context_key_quiesce() is needed, but some + * "key->lct_init()" are calling kernel memory allocation routine and + * can't be called while holding a spin_lock. + * "lu_keys_guard" is held while incrementing "lu_key_initing_cnt" + * to ensure the start of the serialisation. + * An atomic_t variable is still used, in order not to reacquire the + * lock when decrementing the counter. + */ + read_lock(&lu_keys_guard); + atomic_inc(&lu_key_initing_cnt); + pre_version = key_set_version; + read_unlock(&lu_keys_guard); + +refill: + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. + */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF) && + try_module_get(key->lct_owner) == 0) { + /* module is unloading, skip this key */ + continue; + } + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) { + atomic_dec(&lu_key_initing_cnt); + return PTR_ERR(value); + } + + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + } + + read_lock(&lu_keys_guard); + if (pre_version != key_set_version) { + pre_version = key_set_version; + read_unlock(&lu_keys_guard); + goto refill; + } + + ctx->lc_version = key_set_version; + + atomic_dec(&lu_key_initing_cnt); + read_unlock(&lu_keys_guard); + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. 
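+ *
+ * \retval 0 on success, negative errno otherwise; on failure the context
+ * has already been finalized and must not be used.
+ *
+ * A minimal usage sketch (illustrative only, error handling omitted):
+ *
+ *     struct lu_context ctx;
+ *
+ *     lu_context_init(&ctx, LCT_NOREF);
+ *     lu_context_enter(&ctx);
+ *     ... use lu_context_key_get(&ctx, key) ...
+ *     lu_context_exit(&ctx);
+ *     lu_context_fini(&ctx);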
+ */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + write_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + write_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + write_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + write_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + unsigned int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + /* could race with key quiescency */ + if (ctx->lc_tags & LCT_REMEMBER) + read_lock(&lu_keys_guard); + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + + if (ctx->lc_tags & LCT_REMEMBER) + read_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. + */ +int lu_context_refill(struct lu_context *ctx) +{ + read_lock(&lu_keys_guard); + if (likely(ctx->lc_version == key_set_version)) { + read_unlock(&lu_keys_guard); + return 0; + } + + read_unlock(&lu_keys_guard); + return keys_fill(ctx); +} + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. 
+ */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + write_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + write_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. + */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + +static struct shrinker *lu_site_shrinker; + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(struct cfs_hash *hs, + lu_site_stats_t *stats, int populated) +{ + struct cfs_hash_bd bd; + unsigned int i; + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += + cfs_hash_bd_count_get(&bd) - bkt->lsb_lru_len; + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * lu_cache_shrink_count() returns an approximate number of cached objects + * that can be freed by shrink_slab(). A counter, which tracks the + * number of items in the site's lru, is maintained in a percpu_counter + * for each site. 
The percpu values are incremented and decremented as + * objects are added or removed from the lru. The percpu values are summed + * and saved whenever a percpu value exceeds a threshold. Thus the saved, + * summed value at any given time may not accurately reflect the current + * lru length. But this value is sufficiently accurate for the needs of + * a shrinker. + * + * Using a per cpu counter is a compromise solution to concurrent access: + * lu_object_put() can update the counter without locking the site and + * lu_cache_shrink_count can sum the counters without locking each + * ls_obj_hash bucket. + */ +static unsigned long lu_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long cached = 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return 0; + + down_read(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) + cached += percpu_counter_read_positive(&s->ls_lru_len_counter); + up_read(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n", + cached, sysctl_vfs_cache_pressure); + + return cached; +} + +static unsigned long lu_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct lu_site *s; + struct lu_site *tmp; + unsigned long remain = sc->nr_to_scan; + LIST_HEAD(splice); + + if (!(sc->gfp_mask & __GFP_FS)) + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return SHRINK_STOP; + + down_write(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + list_splice(&splice, lu_sites.prev); + up_write(&lu_sites_guard); + + return sc->nr_to_scan - remain; +} + +#ifndef HAVE_SHRINKER_COUNT +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. 
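+ *
+ * The lu_cache_shrink() wrapper below is only compiled for kernels that
+ * lack the split count/scan shrinker interface (HAVE_SHRINKER_COUNT not
+ * defined); it simply forwards to lu_cache_shrink_scan() and
+ * lu_cache_shrink_count() above.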
+ */ +static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + int cached = 0; + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker* shrinker = NULL; +#endif + + + CDEBUG(D_INODE, "Shrink %lu objects\n", scv.nr_to_scan); + + if (scv.nr_to_scan != 0) + lu_cache_shrink_scan(shrinker, &scv); + + cached = lu_cache_shrink_count(shrinker, &scv); + return cached; +} + +#endif /* HAVE_SHRINKER_COUNT */ + + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +static struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +int lu_debugging_setup(void) +{ + return lu_env_init(&lu_debugging_env, ~0); +} + +void lu_context_keys_dump(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (key != NULL) { + CERROR("[%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n", + i, key, key->lct_tags, + key->lct_init, key->lct_fini, key->lct_exit, + key->lct_index, atomic_read(&key->lct_used), + key->lct_owner ? key->lct_owner->name : "", + key->lct_owner); + lu_ref_print(&key->lct_reference); + } + } +} + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + DEF_SHRINKER_VAR(shvar, lu_cache_shrink, + lu_cache_shrink_count, lu_cache_shrink_scan); + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. + */ + down_write(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + up_write(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, &shvar); + if (lu_site_shrinker == NULL) + return -ENOMEM; + + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + if (lu_site_shrinker != NULL) { + remove_shrinker(lu_site_shrinker); + lu_site_shrinker = NULL; + } + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + down_write(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + up_write(&lu_sites_guard); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef CONFIG_PROC_FS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. 
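+ *
+ * The columns are, in order:
+ *   busy/total populated/hash_lists max_search
+ *   created cache_hit cache_miss cache_race cache_death_race lru_purged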
+ */ +int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); + + seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); + return 0; +} +EXPORT_SYMBOL(lu_site_stats_seq_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. + */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). + */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + + LASSERT(fid_is_zero(old)); + + /* supposed to be unique */ + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + { + __u64 version = 0; + struct lu_object *shadow; + + shadow = htable_lookup(s, &bd, fid, &version); + /* supposed to be unique */ + LASSERT(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT); + } +#endif + *old = *fid; + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assiged) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct lu_fid fid; + struct lu_object *o; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid, conf); + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, size_t size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void 
lu_buf_realloc(struct lu_buf *buf, size_t size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. + * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c new file mode 100644 index 0000000000000..bef29033f30ee --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -0,0 +1,444 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#ifdef USE_LU_REF + +/** + * Asserts a condition for a given lu_ref. Must be called with + * lu_ref::lf_guard held. + */ +#define REFASSERT(ref, expr) do { \ + struct lu_ref *__tmp = (ref); \ + \ + if (unlikely(!(expr))) { \ + lu_ref_print(__tmp); \ + spin_unlock(&__tmp->lf_guard); \ + lu_ref_print_all(); \ + LASSERT(0); \ + spin_lock(&__tmp->lf_guard); \ + } \ +} while (0) + +static struct kmem_cache *lu_ref_link_kmem; + +static struct lu_kmem_descr lu_ref_caches[] = { + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof (struct lu_ref_link) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global list of active (initialized, but not finalized) lu_ref's. + * + * Protected by lu_ref_refs_guard. 
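+ *
+ * lu_ref_marker below is a dummy entry that the /proc seq_file iterator
+ * adds to this list and moves forward to remember its position between
+ * reads; lu_ref_is_marker() is used to skip it when printing.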
+ */ +static LIST_HEAD(lu_ref_refs); +static DEFINE_SPINLOCK(lu_ref_refs_guard); +static struct lu_ref lu_ref_marker = { + .lf_guard = __SPIN_LOCK_UNLOCKED(lu_ref_marker.lf_guard), + .lf_list = LIST_HEAD_INIT(lu_ref_marker.lf_list), + .lf_linkage = LIST_HEAD_INIT(lu_ref_marker.lf_linkage) +}; + +void lu_ref_print(const struct lu_ref *ref) +{ + struct lu_ref_link *link; + + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } +} + +static int lu_ref_is_marker(const struct lu_ref *ref) +{ + return (ref == &lu_ref_marker); +} + +void lu_ref_print_all(void) +{ + struct lu_ref *ref; + + spin_lock(&lu_ref_refs_guard); + list_for_each_entry(ref, &lu_ref_refs, lf_linkage) { + if (lu_ref_is_marker(ref)) + continue; + + spin_lock(&ref->lf_guard); + lu_ref_print(ref); + spin_unlock(&ref->lf_guard); + } + spin_unlock(&lu_ref_refs_guard); +} + +void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line) +{ + ref->lf_refs = 0; + ref->lf_func = func; + ref->lf_line = line; + spin_lock_init(&ref->lf_guard); + INIT_LIST_HEAD(&ref->lf_list); + spin_lock(&lu_ref_refs_guard); + list_add(&ref->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_init_loc); + +void lu_ref_fini(struct lu_ref *ref) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, list_empty(&ref->lf_list)); + REFASSERT(ref, ref->lf_refs == 0); + spin_unlock(&ref->lf_guard); + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); +} +EXPORT_SYMBOL(lu_ref_fini); + +static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, + int flags, + const char *scope, + const void *source) +{ + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem != NULL) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link != NULL) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); + } + } + + if (link == NULL) { + spin_lock(&ref->lf_guard); + ref->lf_failed++; + spin_unlock(&ref->lf_guard); + link = ERR_PTR(-ENOMEM); + } + + return link; +} + +void lu_ref_add(struct lu_ref *ref, const char *scope, const void *source) +{ + might_sleep(); + lu_ref_add_context(ref, GFP_NOFS, scope, source); +} +EXPORT_SYMBOL(lu_ref_add); + +void lu_ref_add_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; + spin_lock(&ref->lf_guard); + list_add_tail(&link->ll_linkage, &ref->lf_list); + ref->lf_refs++; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_add_at); + +/** + * Version of lu_ref_add() to be used in non-blockable contexts. + */ +void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, + const void *source) +{ + lu_ref_add_context(ref, GFP_ATOMIC, scope, source); +} +EXPORT_SYMBOL(lu_ref_add_atomic); + +static inline int lu_ref_link_eq(const struct lu_ref_link *link, + const char *scope, const void *source) +{ + return link->ll_source == source && !strcmp(link->ll_scope, scope); +} + +/** + * Maximal chain length seen so far. + */ +static unsigned lu_ref_chain_max_length = 127; + +/** + * Searches for a lu_ref_link with given [scope, source] within given lu_ref. 
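+ *
+ * Must be called with lu_ref::lf_guard held. Returns NULL when no
+ * matching link exists; unusually long chains are reported with CWARN().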
+ */ +static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, + const void *source) +{ + struct lu_ref_link *link; + unsigned iterations; + + iterations = 0; + list_for_each_entry(link, &ref->lf_list, ll_linkage) { + ++iterations; + if (lu_ref_link_eq(link, scope, source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; +} + +void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) +{ + struct lu_ref_link *link; + + spin_lock(&ref->lf_guard); + link = lu_ref_find(ref, scope, source); + if (link != NULL) { + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); + OBD_SLAB_FREE(link, lu_ref_link_kmem, sizeof(*link)); + } else { + REFASSERT(ref, ref->lf_failed > 0); + ref->lf_failed--; + spin_unlock(&ref->lf_guard); + } +} +EXPORT_SYMBOL(lu_ref_del); + +void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, + const void *source0, const void *source1) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, link != NULL && !IS_ERR(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source0)); + link->ll_source = source1; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_set_at); + +void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ + spin_lock(&ref->lf_guard); + REFASSERT(ref, link != NULL && !IS_ERR(link)); + REFASSERT(ref, link->ll_ref == ref); + REFASSERT(ref, lu_ref_link_eq(link, scope, source)); + list_del(&link->ll_linkage); + ref->lf_refs--; + spin_unlock(&ref->lf_guard); +} +EXPORT_SYMBOL(lu_ref_del_at); + +#ifdef CONFIG_PROC_FS + +static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct lu_ref *ref = seq->private; + + spin_lock(&lu_ref_refs_guard); + if (list_empty(&ref->lf_linkage)) + ref = NULL; + spin_unlock(&lu_ref_refs_guard); + + return ref; +} + +static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + LASSERT(seq->private == p); + LASSERT(!list_empty(&ref->lf_linkage)); + + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if (&next->lf_linkage == &lu_ref_refs) { + p = NULL; + } else { + (*pos)++; + list_move(&ref->lf_linkage, &next->lf_linkage); + } + spin_unlock(&lu_ref_refs_guard); + return p; +} + +static void lu_ref_seq_stop(struct seq_file *seq, void *p) +{ + /* Nothing to do */ +} + + +static int lu_ref_seq_show(struct seq_file *seq, void *p) +{ + struct lu_ref *ref = p; + struct lu_ref *next; + + spin_lock(&lu_ref_refs_guard); + next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage); + if ((&next->lf_linkage == &lu_ref_refs) || lu_ref_is_marker(next)) { + spin_unlock(&lu_ref_refs_guard); + return 0; + } + + /* print the entry */ + spin_lock(&next->lf_guard); + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_printf(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; + + list_for_each_entry(link, &next->lf_list, ll_linkage) + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } + spin_unlock(&next->lf_guard); + spin_unlock(&lu_ref_refs_guard); + + return 0; +} + +static struct seq_operations 
lu_ref_seq_ops = { + .start = lu_ref_seq_start, + .stop = lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show +}; + +static int lu_ref_seq_open(struct inode *inode, struct file *file) +{ + struct lu_ref *marker = &lu_ref_marker; + int result = 0; + + result = seq_open(file, &lu_ref_seq_ops); + if (result == 0) { + spin_lock(&lu_ref_refs_guard); + if (!list_empty(&marker->lf_linkage)) + result = -EAGAIN; + else + list_add(&marker->lf_linkage, &lu_ref_refs); + spin_unlock(&lu_ref_refs_guard); + + if (result == 0) { + struct seq_file *f = file->private_data; + f->private = marker; + } else { + seq_release(inode, file); + } + } + + return result; +} + +static int lu_ref_seq_release(struct inode *inode, struct file *file) +{ + struct lu_ref *ref = ((struct seq_file *)file->private_data)->private; + + spin_lock(&lu_ref_refs_guard); + list_del_init(&ref->lf_linkage); + spin_unlock(&lu_ref_refs_guard); + + return seq_release(inode, file); +} + +static struct file_operations lu_ref_dump_fops = { + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release +}; + +#endif /* CONFIG_PROC_FS */ + +int lu_ref_global_init(void) +{ + int result; + + CDEBUG(D_CONSOLE, + "lu_ref tracking is enabled. Performance isn't.\n"); + + result = lu_kmem_init(lu_ref_caches); + +#ifdef CONFIG_PROC_FS + if (result == 0) { + result = lprocfs_seq_create(proc_lustre_root, "lu_refs", + 0444, &lu_ref_dump_fops, NULL); + if (result) + lu_kmem_fini(lu_ref_caches); + } +#endif /* CONFIG_PROC_FS */ + + return result; +} + +void lu_ref_global_fini(void) +{ +#ifdef CONFIG_PROC_FS + lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); +#endif /* CONFIG_PROC_FS */ + lu_kmem_fini(lu_ref_caches); +} + +#endif /* USE_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c new file mode 100644 index 0000000000000..44a69f730e1cb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ucred.c @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ucred.c + * + * Lustre user credentials context infrastructure. 
+ * + * Author: Nikita Danilov + * Author: Fan Yong + * Author: Vitaly Fertman + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. + */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c new file mode 100644 index 0000000000000..bd149ddf7a967 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -0,0 +1,261 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static DEFINE_SPINLOCK(handle_base_lock); + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. 
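+ *
+ * The cookie stored in portals_handle::h_cookie can later be resolved
+ * back to the object with class_handle2object() and the handle removed
+ * from the table again with class_handle_unhash().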
+ */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + ENTRY; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. + */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle (%#llx)\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + ENTRY; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + EXIT; +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie, const void *owner) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + ENTRY; + + LASSERT(handle_hash != NULL); + + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie || h->h_owner != owner) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(struct rcu_head *rcu) +{ + struct portals_handle *h; + void *ptr; + + h = container_of(rcu, struct portals_handle, h_rcu); + ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timespec64 ts; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + ktime_get_ts64(&ts); + cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle %#llx addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c new file mode 100644 index 0000000000000..95716e1ccac88 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include + +#define NIDS_MAX 32 + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static LIST_HEAD(g_uuid_list); +static DEFINE_SPINLOCK(g_uuid_lock); + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. */ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; +} + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + struct uuid_nid_data *data; + struct list_head deathrow; + + INIT_LIST_HEAD(&deathrow); + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while (!list_empty(&deathrow)) { + data = list_entry(deathrow.next, struct uuid_nid_data, + un_list); + list_del(&data->un_list); + + CDEBUG(D_INFO, "del uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); 
+ list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c new file mode 100644 index 0000000000000..85003937e7466 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -0,0 +1,186 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + * Use is subject to license terms. + * + * Author: Johann Lombardi + */ + +#include + +#include +#include +#include +#include + +/** + * Initialize new \a lma. Only fid is stored. + * + * \param lma - is the new LMA structure to be initialized + * \param fid - is the FID of the object this LMA belongs to + * \param incompat - features that MDS must understand to access object + */ +void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + lma->lma_compat = compat; + lma->lma_incompat = incompat; + lma->lma_self_fid = *fid; + + /* If a field is added in struct lustre_mdt_attrs, zero it explicitly + * and change the test below. */ + LASSERT(sizeof(*lma) == + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); +} +EXPORT_SYMBOL(lustre_lma_init); + +/** + * Swab, if needed, LMA structure which is stored on-disk in little-endian order. + * + * \param lma - is a pointer to the LMA structure to be swabbed. + */ +void lustre_lma_swab(struct lustre_mdt_attrs *lma) +{ +#ifdef __BIG_ENDIAN + __swab32s(&lma->lma_compat); + __swab32s(&lma->lma_incompat); + lustre_swab_lu_fid(&lma->lma_self_fid); +#endif +} +EXPORT_SYMBOL(lustre_lma_swab); + +void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, + __u32 compat, __u32 incompat) +{ + CLASSERT(sizeof(*loa) == LMA_OLD_SIZE); + + memset(&loa->loa_parent_fid, 0, + sizeof(*loa) - offsetof(typeof(*loa), loa_parent_fid)); + lustre_lma_init(&loa->loa_lma, fid, compat, incompat); +} +EXPORT_SYMBOL(lustre_loa_init); + +/** + * Swab, if needed, LOA (for OST-object only) structure with LMA EA and PFID EA + * combined together are stored on-disk in little-endian order. + * + * \param[in] loa - the pointer to the LOA structure to be swabbed. + * \param[in] to_cpu - to indicate swab for CPU order or not. 
+ */ +void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) +{ + struct lustre_mdt_attrs *lma = &loa->loa_lma; +#ifdef __BIG_ENDIAN + __u32 compat = lma->lma_compat; +#endif + + lustre_lma_swab(lma); +#ifdef __BIG_ENDIAN + if (to_cpu) + compat = lma->lma_compat; + + if (compat & LMAC_STRIPE_INFO) { + lustre_swab_lu_fid(&loa->loa_parent_fid); + __swab32s(&loa->loa_stripe_size); + } + if (compat & LMAC_COMP_INFO) { + __swab32s(&loa->loa_comp_id); + __swab64s(&loa->loa_comp_start); + __swab64s(&loa->loa_comp_end); + } +#endif +} +EXPORT_SYMBOL(lustre_loa_swab); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); +#endif +} + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. + */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, const struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c new file mode 100644 index 0000000000000..84f6a7ad0c146 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -0,0 +1,2236 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "llog_internal.h" + +static struct cfs_hash_ops uuid_hash_ops; +static struct cfs_hash_ops nid_hash_ops; +static struct cfs_hash_ops nid_stat_hash_ops; +static struct cfs_hash_ops gen_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + if ((ptr = strstr(buf, key)) == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. + * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + RETURN(NULL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. 
+ * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. */ +int class_match_param(char *buf, const char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} + +/* 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified nids */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; 
+} + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified networks */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} + +char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index) +{ + char *s; + + if (!lcfg->lcfg_buflens[index]) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (!s) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. + */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + size_t last = ALIGN(lcfg->lcfg_buflens[index], 8) - 1; + char lost; + + /* Use the smaller value */ + if (last > lcfg->lcfg_buflens[index]) + last = lcfg->lcfg_buflens[index]; + + lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} +EXPORT_SYMBOL(lustre_cfg_string); + +/********************** class fns **********************/ + +/** + * Create a new obd device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. + */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_export *exp; + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + + uuid = lustre_cfg_string(lcfg, 2); + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("%s: uuid must be < %d bytes long\n", + name, (int)sizeof(obd->obd_uuid)); + RETURN(-EINVAL); + } + + obd = class_newdev(typename, name, uuid); + if (IS_ERR(obd)) { /* Already exists or out of obds */ + rc = PTR_ERR(obd); + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + RETURN(rc); + } + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + exp = class_new_export_self(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + rc = PTR_ERR(exp); + class_free_dev(obd); + RETURN(rc); + } + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + rc = class_register_device(obd); + if (rc != 0) { + class_decref(obd, "newdev", obd); + RETURN(rc); + } + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + + RETURN(0); +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. 
+ */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* just leave this on forever. I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. */ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + obd->obd_gen_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) + GOTO(err_exit, err = -ENOMEM); + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) + GOTO(err_exit, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_exit, err = -ENOMEM); + + /* create a client_generation-export lustre hash */ + obd->obd_gen_hash = cfs_hash_create("UUID_HASH", + HASH_GEN_CUR_BITS, + HASH_GEN_MAX_BITS, + HASH_GEN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &gen_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_gen_hash) + GOTO(err_exit, err = -ENOMEM); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exit, err); + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); +err_exit: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. 
+ */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + /* cleanup in progress. we don't like to find this device after now */ + class_unregister_device(obd); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "newdev", obd); + + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). + */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + /* function can't return error after that point, so clear setup flag + * as early as possible to avoid finding via obd_devs / hash */ + obd->obd_set_up = 0; + spin_unlock(&obd->obd_dev_lock); + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) + yield(); + smp_rmb(); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", + obd->obd_name, obd->obd_num_exports, + atomic_read(&obd->obd_refcount) - 2); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); + + /* Precleanup, we must make sure all exports get destroyed. 
*/ + err = obd_precleanup(obd); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + /* destroy a client_generation-export hash body */ + if (obd->obd_gen_hash) { + cfs_hash_putref(obd->obd_gen_hash); + obd->obd_gen_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int last; + + CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + LASSERT(obd->obd_num_exports >= 0); + last = atomic_dec_and_test(&obd->obd_refcount); + lu_ref_del(&obd->obd_reference, scope, source); + + if (last) { + struct obd_export *exp; + + LASSERT(!obd->obd_attached); + /* All exports have been destroyed; there should + * be no more in-progress ops by this point.*/ + exp = obd->obd_self_export; + + if (exp) { + exp->exp_flags |= exp_flags_from_obd(obd); + class_unlink_export(exp); + } + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. + */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} + +/** Remove a failover nid location. 
+ */ +static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +static LIST_HEAD(lustre_profile_list); +static DEFINE_SPINLOCK(lustre_profile_list_lock); + +struct lustre_profile *class_get_profile(const char * prof) +{ + struct lustre_profile *lprof; + + ENTRY; + spin_lock(&lustre_profile_list_lock); + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + lprof->lp_refs++; + spin_unlock(&lustre_profile_list_lock); + RETURN(lprof); + } + } + spin_unlock(&lustre_profile_list_lock); + RETURN(NULL); +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. + */ +static int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + spin_lock(&lustre_profile_list_lock); + lprof->lp_refs = 1; + lprof->lp_list_deleted = false; + + list_add(&lprof->lp_list, &lustre_profile_list); + spin_unlock(&lustre_profile_list_lock); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + spin_lock(&lustre_profile_list_lock); + /* because get profile increments the ref counter */ + lprof->lp_refs--; + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +void class_put_profile(struct lustre_profile *lprof) +{ + spin_lock(&lustre_profile_list_lock); + if ((--lprof->lp_refs) > 0) { + LASSERT(lprof->lp_refs > 0); + spin_unlock(&lustre_profile_list_lock); + return; + } + spin_unlock(&lustre_profile_list_lock); + + /* confirm not a negative number */ + LASSERT(lprof->lp_refs == 0); + 
+ /* At least one class_del_profile/profiles must be called + * on the target profile or lustre_profile_list will corrupt */ + LASSERT(lprof->lp_list_deleted); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md != NULL) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof(*lprof)); +} +EXPORT_SYMBOL(class_put_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + spin_lock(&lustre_profile_list_lock); + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + lprof->lp_list_deleted = true; + spin_unlock(&lustre_profile_list_lock); + + class_put_profile(lprof); + + spin_lock(&lustre_profile_list_lock); + } + spin_unlock(&lustre_profile_list_lock); + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + ENTRY; + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + RETURN(-EINVAL); + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + RETURN(0); +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. */ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. 
+ * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + ENTRY; + + if (!cfg || !new_name) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (!param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (!new_param) + GOTO(out_nocfg, new_cfg = ERR_PTR(-ENOMEM)); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (!bufs) + GOTO(out_free_param, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + OBD_ALLOC(new_cfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!new_cfg) + GOTO(out_free_buf, new_cfg = ERR_PTR(-ENOMEM)); + + lustre_cfg_init(new_cfg, cfg->lcfg_command, bufs); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; +out_free_buf: + OBD_FREE_PTR(bufs); +out_free_param: + OBD_FREE(new_param, new_len); +out_nocfg: + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +static int process_param2_config(struct lustre_cfg *lcfg) +{ + char *param = lustre_cfg_string(lcfg, 1); + char *upcall = lustre_cfg_string(lcfg, 2); + char *argv[] = { + [0] = "/usr/sbin/lctl", + [1] = "set_param", + [2] = param, + [3] = NULL + }; + ktime_t start; + ktime_t end; + int rc; + ENTRY; + + /* Add upcall processing here. Now only lctl is supported */ + if (strcmp(upcall, LCTL_UPCALL) != 0) { + CERROR("Unsupported upcall %s\n", upcall); + RETURN(-EINVAL); + } + + start = ktime_get(); + rc = call_usermodehelper(argv[0], argv, NULL, UMH_WAIT_PROC); + end = ktime_get(); + + if (rc < 0) { + CERROR("lctl: error invoking upcall %s %s %s: rc = %d; " + "time %ldus\n", argv[0], argv[1], argv[2], rc, + (long)ktime_us_delta(end, start)); + } else { + CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", + argv[0], argv[1], argv[2], + (long)ktime_us_delta(end, start)); + rc = 0; + } + + RETURN(rc); +} + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. 
+ */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch(lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid %#llx" + " (%s)\n", lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. */ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + /* llite has no obd */ + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
+ */ + if (err != 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); + } + + break; + } + case LCFG_SET_PARAM: { + err = process_param2_config(lcfg); + GOTO(out, err = 0); + } + } + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + + } + } +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) +{ + struct lprocfs_vars *var; + struct file fakefile; + struct seq_file fake_seqfile; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + ENTRY; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + RETURN(-EINVAL); + } + + /* fake a seq file so that var->fops->write can work... */ + fakefile.private_data = &fake_seqfile; + fake_seqfile.private = data; + /* e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + if (class_match_param(key, prefix, &key)) + /* If the prefix doesn't match, return error so we + * can pass it down the stack */ + RETURN(-ENOSYS); + sval = strchr(key, '='); + if (!sval || *(sval + 1) == 0) { + CERROR("%s: can't parse param '%s' (missing '=')\n", + lustre_cfg_string(lcfg, 0), + lustre_cfg_string(lcfg, i)); + /* rc = -EINVAL; continue parsing other params */ + continue; + } + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, var->name, NULL) == 0 && + keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + + if (var->fops && var->fops->write) { + mm_segment_t oldfs; + oldfs = get_fs(); + set_fs(KERNEL_DS); + rc = (var->fops->write)(&fakefile, sval, + vallen, NULL); + set_fs(oldfs); + } + break; + } + j++; + } + if (!matched) { + /* It was upgraded from old MDT/OST device, + * ignore the obsolete "sec_level" parameter. */ + if (strncmp("sec_level", key, keylen) == 0) + continue; + + CERROR("%s: unknown config parameter '%s'\n", + lustre_cfg_string(lcfg, 0), + lustre_cfg_string(lcfg, i)); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("%s: error writing parameter '%s': rc = %d\n", + lustre_cfg_string(lcfg, 0), key, rc); + rc = 0; + } else { + CDEBUG(D_CONFIG, "%s: set parameter '%s'\n", + lustre_cfg_string(lcfg, 0), key); + } + } + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + RETURN(rc); +} +EXPORT_SYMBOL(class_process_proc_param); + +/* + * Supplemental functions for config logs, it allocates lustre_cfg + * buffers plus initialized llog record header at the beginning. + */ +struct llog_cfg_rec *lustre_cfg_rec_new(int cmd, struct lustre_cfg_bufs *bufs) +{ + struct llog_cfg_rec *lcr; + int reclen; + + ENTRY; + + reclen = lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen); + reclen = llog_data_len(reclen) + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + + OBD_ALLOC(lcr, reclen); + if (lcr == NULL) + RETURN(NULL); + + lustre_cfg_init(&lcr->lcr_cfg, cmd, bufs); + + lcr->lcr_hdr.lrh_len = reclen; + lcr->lcr_hdr.lrh_type = OBD_CFG_REC; + + RETURN(lcr); +} +EXPORT_SYMBOL(lustre_cfg_rec_new); + +void lustre_cfg_rec_free(struct llog_cfg_rec *lcr) +{ + ENTRY; + OBD_FREE(lcr, lcr->lcr_hdr.lrh_len); + EXIT; +} +EXPORT_SYMBOL(lustre_cfg_rec_free); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. 
+ */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + int rc = 0; + ENTRY; + + /* class_config_dump_handler(handle, rec, data); */ + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + cfg->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + cfg->cfg_flags = CFG_F_MARKER; + server_name2index(marker->cm_tgtname, + &cfg->cfg_lwp_idx, NULL); + if (marker->cm_flags & CM_SKIP) { + cfg->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (cfg->cfg_sb && + lustre_check_exclusion(cfg->cfg_sb, + marker->cm_tgtname))) { + cfg->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + /* A config command without a start marker before it is + illegal (post 146) */ + if (!(cfg->cfg_flags & CFG_F_COMPAT146) && + !(cfg->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! " + "(inst: %p, uuid: %s, flags: %#x)\n", + cfg->cfg_instance, + cfg->cfg_uuid.uuid, cfg->cfg_flags); + cfg->cfg_flags |= CFG_F_SKIP; + } + if (cfg->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + cfg->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". 
+ */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + +#ifdef HAVE_SERVER_SUPPORT + /* newer MDS replaces LOV/OSC with LOD/OSP */ + { + char *typename = lustre_cfg_string(lcfg, 1); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, LUSTRE_LOV_NAME) == 0) && + cfg->cfg_sb && IS_MDT(s2lsi(cfg->cfg_sb))) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from lov to lod (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_LOD_NAME); + } + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, LUSTRE_OSC_NAME) == 0) && + cfg->cfg_sb && IS_MDT(s2lsi(cfg->cfg_sb))) { + CDEBUG(D_CONFIG, + "For 2.x interoperability, rename obd " + "type from osc to osp (%s)\n", + s2lsi(cfg->cfg_sb)->lsi_svname); + strcpy(typename, LUSTRE_OSP_NAME); + } + } +#endif /* HAVE_SERVER_SUPPORT */ + + if (cfg->cfg_flags & CFG_F_EXCLUDE) { + CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", + lcfg->lcfg_command); + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + } + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_init(&bufs, lcfg); + + if (cfg->cfg_instance && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(cfg->cfg_instance) * 2 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) + GOTO(out, rc = -ENOMEM); + snprintf(inst_name, inst_len, "%s-%p", + lustre_cfg_string(lcfg, 0), + cfg->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* we override the llog's uuid for clients, to insure they + are unique */ + if (cfg->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { + lustre_cfg_bufs_set_string(&bufs, 2, + cfg->cfg_uuid.uuid); + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (cfg->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + cfg->cfg_obdname); + } + + /* Add net info to setup command + * if given on command line. 
+ * So config log will be: + * [0]: client name + * [1]: client UUID + * [2]: server UUID + * [3]: inactive-on-startup + * [4]: restrictive net + */ + if (cfg && cfg->cfg_sb && s2lsi(cfg->cfg_sb) && + !IS_SERVER(s2lsi(cfg->cfg_sb))) { + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + char *nidnet = lsi->lsi_lmd->lmd_nidnet; + + if (lcfg->lcfg_command == LCFG_SETUP && + lcfg->lcfg_bufcount != 2 && nidnet) { + CDEBUG(D_CONFIG, "Adding net %s info to setup " + "command for client %s\n", nidnet, + lustre_cfg_string(lcfg, 0)); + lustre_cfg_bufs_set_string(&bufs, 4, nidnet); + } + } + + /* Skip add_conn command if uuid is + * not on restricted net */ + if (cfg && cfg->cfg_sb && s2lsi(cfg->cfg_sb) && + !IS_SERVER(s2lsi(cfg->cfg_sb))) { + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + char *uuid_str = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_command == LCFG_ADD_CONN && + lsi->lsi_lmd->lmd_nidnet && + LNET_NIDNET(libcfs_str2nid(uuid_str)) != + libcfs_str2net(lsi->lsi_lmd->lmd_nidnet)) { + CDEBUG(D_CONFIG, "skipping add_conn for %s\n", + uuid_str); + rc = 0; + /* No processing! */ + break; + } + } + + OBD_ALLOC(lcfg_new, lustre_cfg_len(bufs.lcfg_bufcount, + bufs.lcfg_buflen)); + if (!lcfg_new) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_init(lcfg_new, lcfg->lcfg_command, &bufs); + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + OBD_FREE(lcfg_new, lustre_cfg_len(lcfg_new->lcfg_bufcount, + lcfg_new->lcfg_buflens)); + if (inst_name) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = { + .lpcd_first_idx = 0, + }; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +static struct lcfg_type_data { + 
__u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +} lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } , }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + +/** + * Parse config record and output dump in supplied buffer. 
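+ * The "event" name and the per-buffer labels in the dump come from the + * lcfg_data_table mapping above, looked up via lcfg_cmd2data().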
+ * + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + * + * Sample Output: + * - { index: 4, event: attach, device: lustrewt-clilov, type: lov, + * UUID: lustrewt-clilov_UUID } + */ +int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0, i; + struct lcfg_type_data *ldata; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + return rc; + + ldata = lcfg_cmd2data(lcfg->lcfg_command); + if (ldata == NULL) + return -ENOTTY; + + if (lcfg->lcfg_command == LCFG_MARKER) + return 0; + + /* form YAML entity */ + ptr += snprintf(ptr, end - ptr, "- { index: %u, event: %s", + rec->lrh_index, ldata->ltd_name); + + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end - ptr, ", flags: %#08x", + lcfg->lcfg_flags); + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end - ptr, ", num: %#08x", + lcfg->lcfg_num); + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end - ptr, ", nid: %s(%#llx)", + nidstr, lcfg->lcfg_nid); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) + ptr += snprintf(ptr, end - ptr, ", device: %s", + lustre_cfg_string(lcfg, 0)); + + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) + ptr += snprintf(ptr, end - ptr, ", %s: %s", + ldata->ltd_bufs[i - 1], + lustre_cfg_string(lcfg, i)); + } + + ptr += snprintf(ptr, end - ptr, " }\n"); + /* return consumed bytes */ + rc = ptr - buf; + return rc; +} + +/** + * parse config record and output dump in supplied buffer. + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +static int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) { + char nidstr[LNET_NIDSTR_SIZE]; + + libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); + ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx)\n ", + nidstr, lcfg->lcfg_nid); + } + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + ptr += snprintf(ptr, end - ptr, "\n"); + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", 
rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. + */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_CLEANUP, &bufs); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(&exp->exp_connection->c_peer.nid); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + 
RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed); +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + +static void +nid_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static struct cfs_hash_ops nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; + + +/* + * client_generation<->export hash operations + */ + +static unsigned +gen_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static void * +gen_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(&exp->exp_target_data.ted_lcd->lcd_generation); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +gen_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + + RETURN(exp->exp_target_data.ted_lcd->lcd_generation == *(__u32 *)key && + !exp->exp_failed); +} + +static void * +gen_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_gen_hash); +} + +static void +gen_export_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_get(exp); +} + +static void +gen_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_gen_hash); + class_export_put(exp); +} + +static struct cfs_hash_ops gen_hash_ops = { + .hs_hash = gen_hash, + .hs_key = gen_key, + .hs_keycmp = gen_kepcmp, + .hs_object = gen_export_object, + .hs_get = gen_export_get, + .hs_put_locked = gen_export_put_locked, +}; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c 
b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c new file mode 100644 index 0000000000000..ed1a1d7eea343 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -0,0 +1,1664 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include +#include +#include +#include +#include +#include +#include + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. + */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + RETURN(-ENOMEM); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_LOG_START, bufs); + + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); +out: + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'" + "failed from the MGS (%d). 
Make sure this " + "client and the MGS are running compatible " + "versions of Lustre.\n", + mgc->obd_name, logname, rc); + else if (rc != 0) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' " + "failed (%d). This may be the result of " + "communication errors between this node and " + "the MGS, a bad configuration, or other " + "errors. See the syslog for more " + "information.\n", mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + RETURN(rc); +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + RETURN(-ENOMEM); + lustre_cfg_init(lcfg, LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. + */ +static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen)); + if (!lcfg) + return -ENOMEM; + lustre_cfg_init(lcfg, cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); + return rc; +} + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. 
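+ * + * Rough usage sketch, matching how the MGC is started later in this file: + * + *   rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, + *                            LUSTRE_MGS_OBDNAME, niduuid, NULL, NULL); + * + * On success the obd is attached and set up; if the LCFG_SETUP step fails, + * lustre_start_simple() detaches the obd again before returning the error.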
+ */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); + } + return rc; +} + +static DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + class_uuid_t uuidc; + lnet_nid_t nid; + char nidstr[LNET_NIDSTR_SIZE]; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int rc = 0, i = 0, j; + size_t len; + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + struct lnet_process_id id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + libcfs_nid2str_r(nid, nidstr, sizeof(nidstr)); + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(nidstr) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (mgcname == NULL || niduuid == NULL) + GOTO(out_free, rc = -ENOMEM); + snprintf(mgcname, len, "%s%s", LUSTRE_MGC_OBDNAME, nidstr); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + int recov_bk; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting " + "not compatible with current mgc. " + "Force to use current mgc setting that is " + "IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. 
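+ A non-zero recov_bk is passed to the MGC below via KEY_INIT_RECOV_BACKUP, + telling it to try all of its connections once instead of holding on to + the previous one.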
*/ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. */ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + snprintf(niduuid, len + 2, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + CDEBUG(D_MOUNT, "mgs nids %s.\n", ptr); + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + struct lnet_process_id id; + + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + GOTO(out_free, rc = -EINVAL); + } + /* + * Add primary MGS nid(s). + * Multiple nids on one MGS node are separated + * by commas. + */ + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out_free, rc = -ENOMEM); + + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, NULL, NULL); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++j; + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, NULL, NULL, NULL); + if (rc == 0) + ++i; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. 
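+ The re-use path above takes its reference with atomic_inc(), and + lustre_stop_mgc() drops one reference per caller, cleaning the MGC up + only once cl_mgc_refcount reaches zero.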
*/ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_BARRIER; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (uuid) + OBD_FREE_PTR(uuid); + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = NULL, *ptr = NULL; + int i, rc = 0, len = 0; + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* The MGC has no recoverable data in any case. 
+ * force shotdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + GOTO(out, rc); + + /* Clean the nid uuids */ + if (!niduuid) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, NULL, NULL, NULL); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections */ + mutex_unlock(&mgc_start_lock); + RETURN(rc); +} + +/***************** lustre superblock **************/ + +static struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + ENTRY; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + RETURN(NULL); + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + RETURN(NULL); + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + INIT_LIST_HEAD(&lsi->lsi_lwp_list); + spin_lock_init(&lsi->lsi_lwp_lock); + + RETURN(lsi); +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_fileset != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_fileset, + strlen(lsi->lsi_lmd->lmd_fileset) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + if (lsi->lsi_lmd->lmd_nidnet != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_nidnet, + strlen(lsi->lsi_lmd->lmd_nidnet) + 1); + + OBD_FREE_PTR(lsi->lsi_lmd); + } + + LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE_PTR(lsi); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +/* The lsi has one reference for every server that is using the disk - + e.g. 
MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev); + lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL; + lsi->lsi_dt_dev = NULL; + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} + +/* + * The goal of this function is to extract the file system name + * from the obd name. This can come in two flavors. One is + * fsname-MDTXXXX or fsname-XXXXXXX were X is a hexadecimal + * number. In both cases we should return fsname. If it is + * not a valid obd name it is assumed to be the file system + * name itself. + */ +void obdname2fsname(const char *tgt, char *fsname, size_t buflen) +{ + const char *ptr; + const char *tmp; + size_t len = 0; + + /* First we have to see if the @tgt has '-' at all. It is + * valid for the user to request something like + * lctl set_param -P llite.lustre*.xattr_cache=0 + */ + ptr = strrchr(tgt, '-'); + if (!ptr) { + /* No '-' means it could end in '*' */ + ptr = strchr(tgt, '*'); + if (!ptr) { + /* No '*' either. Assume tgt = fsname */ + len = strlen(tgt); + goto valid_obd_name; + } + len = ptr - tgt; + goto valid_obd_name; + } + + /* tgt format fsname-MDT0000-* */ + if ((!strncmp(ptr, "-MDT", 4) || + !strncmp(ptr, "-OST", 4)) && + (isxdigit(ptr[4]) && isxdigit(ptr[5]) && + isxdigit(ptr[6]) && isxdigit(ptr[7]))) { + len = ptr - tgt; + goto valid_obd_name; + } + + /* tgt_format fsname-cli'dev'-'uuid' except for the llite case + * which are named fsname-'uuid'. Examples: + * + * lustre-clilov-ffff88104db5b800 + * lustre-ffff88104db5b800 (for llite device) + * + * The length of the obd uuid can vary on different platforms. + * This test if any invalid characters are in string. Allow + * wildcards with '*' character. + */ + ptr++; + if (!strspn(ptr, "0123456789abcdefABCDEF*")) { + len = 0; + goto no_fsname; + } + + /* Now that we validated the device name lets extract the + * file system name. Most of the names in this class will + * have '-cli' in its name which needs to be dropped. If + * it doesn't have '-cli' then its a llite device which + * ptr already points to the start of the uuid string. + */ + tmp = strstr(tgt, "-cli"); + if (tmp) + ptr = tmp; + else + ptr--; + len = ptr - tgt; +valid_obd_name: + len = min_t(size_t, len, LUSTRE_MAXFSNAME); + snprintf(fsname, buflen, "%.*s", (int)len, tgt); +no_fsname: + fsname[len] = '\0'; +} +EXPORT_SYMBOL(obdname2fsname); + +/*** SERVER NAME *** + * + * FSNAME is between 1 and 8 characters (inclusive). + * Excluded characters are '/' and ':' + * SEPARATOR is either ':' or '-' + * TYPE: "OST", "MDT", etc. + * INDEX: Hex representation of the index + */ + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. 
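+ * (e.g. svname "lustre-OST003F" yields fsname "lustre")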
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash; + + dash = svname + strnlen(svname, LUSTRE_MAXFSNAME); + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (endptr != NULL) + *endptr = dash; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + +/** + * check server name is OST. + **/ +int server_name_is_ost(const char *svname) +{ + const char *dash; + int rc; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + + if (strncmp(dash, "OST", 3) == 0) + return 1; + return 0; +} +EXPORT_SYMBOL(server_name_is_ost); + +/** + * Get the index from the target name MDTXXXX/OSTXXXX + * rc = server type, or rc < 0 on error + **/ +int target_name2index(const char *tgtname, __u32 *idx, const char **endptr) +{ + const char *dash = tgtname; + unsigned long index; + int rc; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strncmp(dash, "all", 3) == 0) { + if (endptr != NULL) + *endptr = dash + 3; + return rc | LDD_F_SV_ALL; + } + + index = simple_strtoul(dash, (char **)endptr, 16); + if (idx != NULL) + *idx = index; + return rc; +} +EXPORT_SYMBOL(target_name2index); + +/* Get the index from the obd name. + rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + const char *dash; + int rc; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + dash++; + rc = target_name2index(dash, idx, endptr); + if (rc < 0) + return rc; + + /* Account for -mdc after index that is possible when specifying mdt */ + if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1, + sizeof(LUSTRE_MDC_NAME)-1) == 0) + *endptr += sizeof(LUSTRE_MDC_NAME); + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common betweeen server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + RETURN(rc); + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. 
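+ The -EBUSY comes from the cl_mgc_refcount check in lustre_stop_mgc().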
*/ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for(i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 *exclude_list; + __u32 index = 0; + int rc = 0, devmax; + ENTRY; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. 
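+ For the example above, "exclude=lustre-OST0001:lustre-OST0002" parses + into two entries, OST indices 0x0001 and 0x0002, via server_name2index().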
*/ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s': rc = %d\n", + s1, rc); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", + (uint)(s2-s1), s1, rc); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_network(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_nidnet != NULL) { + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_nidnet, length + 1); + if (lmd->lmd_nidnet == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_nidnet, ptr, length); + lmd->lmd_nidnet[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + 
lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** + * Find the first delimiter (comma or colon) from the specified \a buf and + * make \a *endh point to the string starting with the delimiter. The commas + * in expression list [...] will be skipped. + * + * \param[in] buf a delimiter-separated string + * \param[in] endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * \retval 0 if delimiter is found + * \retval 1 if delimiter is not found + */ +static int lmd_find_delimiter(char *buf, char **endh) +{ + char *c = buf; + int skip = 0; + + if (buf == NULL) + return 1; + + while (*c != '\0') { + if (*c == '[') + skip++; + else if (*c == ']') + skip--; + + if ((*c == ',' || *c == ':') && skip == 0) { + if (endh != NULL) + *endh = c; + return 0; + } + + c++; + } + + return 1; +} + +/** + * Find the first valid string delimited by comma or colon from the specified + * \a buf and parse it to see whether it's a valid nid list. If yes, \a *endh + * will point to the next string starting with the delimiter. + * + * \param[in] buf a delimiter-separated string + * \param[in] endh a pointer to a pointer that will point to the string + * starting with the delimiter + * + * \retval 0 if the string is a valid nid list + * \retval 1 if the string is not a valid nid list + */ +static int lmd_parse_nidlist(char *buf, char **endh) +{ + struct list_head nidlist; + char *endp = buf; + char tmp; + int rc = 0; + + if (buf == NULL) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + if (lmd_find_delimiter(buf, &endp) != 0) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + + INIT_LIST_HEAD(&nidlist); + if (cfs_parse_nidlist(buf, strlen(buf), &nidlist) <= 0) + rc = 1; + cfs_free_nidlist(&nidlist); + + *endp = tmp; + if (rc != 0) + return rc; + if (endh != NULL) + *endh = endp; + return 0; +} + +/** Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that " + "/sbin/mount.lustre is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of " + "/sbin/mount.lustre. Please install " + "version %s\n", LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, LMD_PARAMS_MAXLEN); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + char *s3; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + s3 = s1; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. 
Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = + max_t(int, simple_strtoul(s1 + 19, NULL, 10), + time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = + max_t(int, simple_strtoul(s1 + 19, NULL, 10), + time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, "skip_lfsck", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_SKIP_LFSCK; + clear++; + } else if (strncmp(s1, "rdonly_dev", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_DEV_RDONLY; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. */ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + s3 = s2; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + size_t length, params_length; + char *tail = s1; + if (lmd_find_delimiter(s1 + 6, &tail) != 0) + length = strlen(s1); + else { + char *param_str = tail + 1; + int supplementary = 1; + while (lmd_parse_nidlist(param_str, + ¶m_str) == 0) { + supplementary = 0; + } + length = param_str - s1 - supplementary; + } + length -= 6; + params_length = strlen(lmd->lmd_params); + if (params_length + length + 1 >= LMD_PARAMS_MAXLEN) + RETURN(-E2BIG); + strncat(lmd->lmd_params, s1 + 6, length); + lmd->lmd_params[params_length + length] = '\0'; + strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN); + s3 = s1 + 6 + length; + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. 
*/ + *s1 = '\0'; + break; + } else if (strncmp(s1, "network=", 8) == 0) { + rc = lmd_parse_network(lmd, s1 + 8); + if (rc) + goto invalid; + clear++; + } + + /* Find next opt */ + s2 = strchr(s3, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name " + "(need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') + ; + s2 = s1; + while (*s2 != '/' && *s2 != '\0') + s2++; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, s2 - s1 + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + + strncat(lmd->lmd_profile, s1, s2 - s1); + strncat(lmd->lmd_profile, "-client", 7); + + s1 = s2; + s2 = s1 + strlen(s1) - 1; + /* Remove padding /s from fileset */ + while (*s2 == '/') + s2--; + if (s2 > s1) { + OBD_ALLOC(lmd->lmd_fileset, s2 - s1 + 2); + if (lmd->lmd_fileset == NULL) { + OBD_FREE(lmd->lmd_profile, + strlen(lmd->lmd_profile) + 1); + RETURN(-ENOMEM); + } + strncat(lmd->lmd_fileset, s1, s2 - s1 + 1); + } + } else { + /* server mount */ + if (lmd->lmd_nidnet != NULL) { + /* 'network=' mount option forbidden for server */ + OBD_FREE(lmd->lmd_nidnet, strlen(lmd->lmd_nidnet) + 1); + lmd->lmd_nidnet = NULL; + rc = -EINVAL; + CERROR("%s: option 'network=' not allowed for Lustre " + "servers: rc = %d\n", devname, rc); + RETURN(rc); + } + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strncpy(lmd->lmd_dev, devname, strlen(devname)+1); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strncpy(lmd->lmd_opts, options, strlen(options)+1); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. -o flock,abort_recov) + */ +static int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. + */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (client_fill_super == NULL) + request_module("lustre"); + if (client_fill_super == NULL) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for " + "client mount! 
Is the 'lustre' " + "module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_common_put_super(sb); + GOTO(out, rc); + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { +#ifdef HAVE_SERVER_SUPPORT + CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev); + rc = server_fill_super(sb); + /* s_f_s calls lustre_start_mgc after the mount because we need + the MGS nids which are stored on disk. Plus, we may + need to start the MGS first. */ + /* s_f_s will call server_put_super on failure */ +#else + CERROR("This is client-side-only module, " + "cannot handle server mount.\n"); + rc = -EINVAL; +#endif + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + GOTO(out, rc); +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +#ifdef HAVE_FSTYPE_MOUNT +static struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { + .lmd2_data = data, + }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} +#else +static int lustre_get_sb(struct file_system_type *fs_type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + struct lustre_mount_data2 lmd2 = { + .lmd2_data = data, + .lmd2_mnt = mnt, + }; + + return get_sb_nodev(fs_type, flags, &lmd2, lustre_fill_super, mnt); +} +#endif + +static void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +static struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", +#ifdef HAVE_FSTYPE_MOUNT + .mount = lustre_mount, +#else + .get_sb = lustre_get_sb, +#endif + .kill_sb = lustre_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, +}; +MODULE_ALIAS_FS("lustre"); + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c new file mode 100644 index 0000000000000..dc2d192dcb048 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -0,0 +1,1963 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount_server.c + * + * Server mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */) +#define PRINT_CMD CDEBUG +#define PRINT_MASK (D_SUPER | D_CONFIG) + +#include +#include +#include +#include +#ifdef HAVE_KERNEL_LOCKED +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/*********** mount lookup *********/ + +static DEFINE_MUTEX(lustre_mount_info_lock); +static LIST_HEAD(server_mount_info_list); + +static struct lustre_mount_info *server_find_mount(const char *name) +{ + struct list_head *tmp; + struct lustre_mount_info *lmi; + ENTRY; + + list_for_each(tmp, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, + lmi_list_chain); + if (strcmp(name, lmi->lmi_name) == 0) + RETURN(lmi); + } + RETURN(NULL); +} + +/* we must register an obd for a mount before we call the setup routine. + *_setup will call lustre_get_mount to get the mnt struct + by obd_name, since we can't pass the pointer to setup. 
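+ * The (name, sb) pair is kept on server_mount_info_list under
+ * lustre_mount_info_lock until server_deregister_mount() removes it;
+ * server_get_mount() looks it up by obd name and takes a reference
+ * on the lsi.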
*/ +static int server_register_mount(const char *name, struct super_block *sb) +{ + struct lustre_mount_info *lmi; + char *name_cp; + ENTRY; + + LASSERT(sb); + + OBD_ALLOC(lmi, sizeof(*lmi)); + if (!lmi) + RETURN(-ENOMEM); + OBD_ALLOC(name_cp, strlen(name) + 1); + if (!name_cp) { + OBD_FREE(lmi, sizeof(*lmi)); + RETURN(-ENOMEM); + } + strcpy(name_cp, name); + + mutex_lock(&lustre_mount_info_lock); + + if (server_find_mount(name)) { + mutex_unlock(&lustre_mount_info_lock); + OBD_FREE(lmi, sizeof(*lmi)); + OBD_FREE(name_cp, strlen(name) + 1); + CERROR("Already registered %s\n", name); + RETURN(-EEXIST); + } + lmi->lmi_name = name_cp; + lmi->lmi_sb = sb; + list_add(&lmi->lmi_list_chain, &server_mount_info_list); + + mutex_unlock(&lustre_mount_info_lock); + + CDEBUG(D_MOUNT, "register mount %p from %s\n", sb, name); + + RETURN(0); +} + +/* when an obd no longer needs a mount */ +static int server_deregister_mount(const char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + if (!lmi) { + mutex_unlock(&lustre_mount_info_lock); + CERROR("%s not registered\n", name); + RETURN(-ENOENT); + } + + CDEBUG(D_MOUNT, "deregister mount %p from %s\n", lmi->lmi_sb, name); + + OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + list_del(&lmi->lmi_list_chain); + OBD_FREE(lmi, sizeof(*lmi)); + mutex_unlock(&lustre_mount_info_lock); + + RETURN(0); +} + +/* obd's look up a registered mount using their obdname. This is just + for initial obd setup to find the mount struct. It should not be + called every time you want to mntget. */ +struct lustre_mount_info *server_get_mount(const char *name) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(NULL); + } + lsi = s2lsi(lmi->lmi_sb); + + atomic_inc(&lsi->lsi_mounts); + + CDEBUG(D_MOUNT, "get mount %p from %s, refs=%d\n", lmi->lmi_sb, + name, atomic_read(&lsi->lsi_mounts)); + + RETURN(lmi); +} +EXPORT_SYMBOL(server_get_mount); + +/** + * server_put_mount: to be called from obd_cleanup methods + * @name: obd name + * @dereg_mnt: 0 or 1 depending on whether the mount is to be deregistered or + * not + * + * The caller decides whether server_deregister_mount() needs to be called or + * not. Calling of server_deregister_mount() does not depend on refcounting on + * lsi because we could have say the mgs and mds on the same node and we + * unmount the mds, then the ref on the lsi would still be non-zero but we + * would still want to deregister the mds mount. 
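+ *
+ * Return: 0 on success, -ENOENT if no mount is registered under @name.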
+ */ +int server_put_mount(const char *name, bool dereg_mnt) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(-ENOENT); + } + lsi = s2lsi(lmi->lmi_sb); + + CDEBUG(D_MOUNT, "put mount %p from %s, refs=%d\n", + lmi->lmi_sb, name, atomic_read(&lsi->lsi_mounts)); + + if (lustre_put_lsi(lmi->lmi_sb)) + CDEBUG(D_MOUNT, "Last put of mount %p from %s\n", + lmi->lmi_sb, name); + + if (dereg_mnt) + /* this obd should never need the mount again */ + server_deregister_mount(name); + + RETURN(0); +} +EXPORT_SYMBOL(server_put_mount); + +/* Set up a MGS to serve startup logs */ +static int server_start_mgs(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_info *lmi; + int rc = 0; + ENTRY; + + /* It is impossible to have more than 1 MGS per node, since + MGC wouldn't know which to connect to */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi) { + lsi = s2lsi(lmi->lmi_sb); + LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started" + " from server\n"); + RETURN(-EALREADY); + } + + CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME); + + rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb); + + if (!rc) { + rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, + LUSTRE_MGS_OBDNAME, NULL, NULL, + lsi->lsi_osd_obdname, NULL); + /* server_deregister_mount() is not called previously, for lsi + * and other stuff can't be freed cleanly when mgs calls + * server_put_mount() in error handling case (see b=17758), + * this problem is caused by a bug in mgs_init0, which forgot + * calling server_put_mount in error case. */ + + if (rc) + server_deregister_mount(LUSTRE_MGS_OBDNAME); + } + + if (rc) + LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). " + "Is the 'mgs' module loaded?\n", + LUSTRE_MGS_OBDNAME, rc); + RETURN(rc); +} + +static int server_stop_mgs(struct super_block *sb) +{ + struct obd_device *obd; + int rc; + struct lustre_mount_info *lmi; + ENTRY; + + /* Do not stop MGS if this device is not the running MGT */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi != NULL && lmi->lmi_sb != sb) + RETURN(0); + + CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME); + + /* There better be only one MGS */ + obd = class_name2obd(LUSTRE_MGS_OBDNAME); + if (!obd) { + CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME); + RETURN(-EALREADY); + } + + /* The MGS should always stop when we say so */ + obd->obd_force = 1; + rc = class_manual_cleanup(obd); + RETURN(rc); +} + +/* Since there's only one mgc per node, we have to change it's fs to get + access to the right disk. 
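+ * The switch is done with a KEY_SET_FS obd_set_info_async() call on the
+ * MGC self export and is undone again in server_mgc_clear_fs().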
*/ +static int server_mgc_set_fs(const struct lu_env *env, + struct obd_device *mgc, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev); + + /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ + rc = obd_set_info_async(env, mgc->obd_self_export, + sizeof(KEY_SET_FS), KEY_SET_FS, + sizeof(*sb), sb, NULL); + if (rc != 0) + CERROR("can't set_fs %d\n", rc); + + RETURN(rc); +} + +static int server_mgc_clear_fs(const struct lu_env *env, + struct obd_device *mgc) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Unassign mgc disk\n"); + + rc = obd_set_info_async(env, mgc->obd_self_export, + sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS, + 0, NULL, NULL); + RETURN(rc); +} + +static inline bool is_mdc_device(const char *devname) +{ + char *ptr; + + ptr = strrchr(devname, '-'); + return ptr != NULL && strcmp(ptr, "-mdc") == 0; +} + +static inline bool tgt_is_mdt(const char *tgtname, __u32 *idx) +{ + int type; + + type = server_name2index(tgtname, idx, NULL); + + return type == LDD_F_SV_TYPE_MDT; +} + +/** + * Convert OST/MDT name(fsname-{MDT,OST}xxxx) to a lwp name with the @idx:yyyy + * (fsname-MDTyyyy-lwp-{MDT,OST}xxxx) + **/ +int tgt_name2lwp_name(const char *tgt_name, char *lwp_name, int len, __u32 idx) +{ + char *fsname; + const char *tgt; + int rc; + ENTRY; + + OBD_ALLOC(fsname, MTI_NAME_MAXLEN); + if (fsname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(tgt_name, fsname, &tgt); + if (rc != 0) { + CERROR("%s: failed to get fsname from tgt_name: rc = %d\n", + tgt_name, rc); + GOTO(cleanup, rc); + } + + if (*tgt != '-' && *tgt != ':') { + CERROR("%s: invalid tgt_name name!\n", tgt_name); + GOTO(cleanup, rc = -EINVAL); + } + + tgt++; + if (strncmp(tgt, "OST", 3) != 0 && strncmp(tgt, "MDT", 3) != 0) { + CERROR("%s is not an OST or MDT target!\n", tgt_name); + GOTO(cleanup, rc = -EINVAL); + } + snprintf(lwp_name, len, "%s-MDT%04x-%s-%s", + fsname, idx, LUSTRE_LWP_NAME, tgt); + + GOTO(cleanup, rc = 0); + +cleanup: + if (fsname != NULL) + OBD_FREE(fsname, MTI_NAME_MAXLEN); + + return rc; +} +EXPORT_SYMBOL(tgt_name2lwp_name); + +static LIST_HEAD(lwp_register_list); +static DEFINE_SPINLOCK(lwp_register_list_lock); + +static void lustre_put_lwp_item(struct lwp_register_item *lri) +{ + if (atomic_dec_and_test(&lri->lri_ref)) { + LASSERT(list_empty(&lri->lri_list)); + + if (*lri->lri_exp != NULL) + class_export_put(*lri->lri_exp); + OBD_FREE_PTR(lri); + } +} + +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data) +{ + struct obd_device *lwp; + struct lwp_register_item *lri; + bool cb = false; + ENTRY; + + LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n", + lwpname); + LASSERT(exp != NULL && *exp == NULL); + + OBD_ALLOC_PTR(lri); + if (lri == NULL) + RETURN(-ENOMEM); + + lwp = class_name2obd(lwpname); + if (lwp != NULL && lwp->obd_set_up == 1) { + struct obd_uuid *uuid; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) { + OBD_FREE_PTR(lri); + RETURN(-ENOMEM); + } + memcpy(uuid->uuid, lwpname, strlen(lwpname)); + *exp = cfs_hash_lookup(lwp->obd_uuid_hash, uuid); + OBD_FREE_PTR(uuid); + } + + memcpy(lri->lri_name, lwpname, strlen(lwpname)); + lri->lri_exp = exp; + lri->lri_cb_func = cb_func; + lri->lri_cb_data = cb_data; + INIT_LIST_HEAD(&lri->lri_list); + /* + * Initialize the lri_ref at 2, one will be released before + * current function returned via lustre_put_lwp_item(), the + * other will be released in 
lustre_deregister_lwp_item(). + */ + atomic_set(&lri->lri_ref, 2); + + spin_lock(&lwp_register_list_lock); + list_add(&lri->lri_list, &lwp_register_list); + if (*exp != NULL) + cb = true; + spin_unlock(&lwp_register_list_lock); + + if (cb && cb_func != NULL) + cb_func(cb_data); + lustre_put_lwp_item(lri); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_register_lwp_item); + +void lustre_deregister_lwp_item(struct obd_export **exp) +{ + struct lwp_register_item *lri; + bool removed = false; + int repeat = 0; + + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (exp == lri->lri_exp) { + list_del_init(&lri->lri_list); + removed = true; + break; + } + } + spin_unlock(&lwp_register_list_lock); + + if (!removed) + return; + + /* See lustre_notify_lwp_list(), in some extreme race conditions, + * the notify callback could be still on the fly, we need to wait + * for the callback done before moving on to free the data used + * by callback. */ + while (atomic_read(&lri->lri_ref) > 1) { + CDEBUG(D_MOUNT, "lri reference count %u, repeat: %d\n", + atomic_read(&lri->lri_ref), repeat); + repeat++; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + lustre_put_lwp_item(lri); +} +EXPORT_SYMBOL(lustre_deregister_lwp_item); + +struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + struct obd_device *lwp; + struct obd_export *exp = NULL; + char fsname[16]; + char lwp_name[24]; + int rc; + + lmi = server_get_mount(dev); + if (lmi == NULL) + return NULL; + + lsi = s2lsi(lmi->lmi_sb); + rc = server_name2fsname(lsi->lsi_svname, fsname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname: rc = %d\n", + lsi->lsi_svname, rc); + goto err_lmi; + } + + snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); + spin_lock(&lsi->lsi_lwp_lock); + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + char *ptr = strstr(lwp->obd_name, lwp_name); + + if (ptr != NULL && lwp->obd_lwp_export != NULL) { + exp = class_export_get(lwp->obd_lwp_export); + break; + } + } + spin_unlock(&lsi->lsi_lwp_lock); + +err_lmi: + server_put_mount(dev, false); + + return exp; +} +EXPORT_SYMBOL(lustre_find_lwp_by_index); + +void lustre_notify_lwp_list(struct obd_export *exp) +{ + struct lwp_register_item *lri; + LASSERT(exp != NULL); + +again: + spin_lock(&lwp_register_list_lock); + list_for_each_entry(lri, &lwp_register_list, lri_list) { + if (strcmp(exp->exp_obd->obd_name, lri->lri_name)) + continue; + if (*lri->lri_exp != NULL) + continue; + *lri->lri_exp = class_export_get(exp); + if (lri->lri_cb_func == NULL) + continue; + atomic_inc(&lri->lri_ref); + spin_unlock(&lwp_register_list_lock); + + lri->lri_cb_func(lri->lri_cb_data); + lustre_put_lwp_item(lri); + + /* Others may have changed the list after we unlock, we have + * to rescan the list from the beginning. Usually, the list + * 'lwp_register_list' is very short, and there is 'guard' + * lri::lri_exp that will prevent the callback to be done + * repeatedly. So rescanning the list has no problem. 
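+ * The extra reference taken on @lri above keeps it alive while the
+ * callback runs outside of lwp_register_list_lock.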
*/ + goto again; + } + spin_unlock(&lwp_register_list_lock); +} +EXPORT_SYMBOL(lustre_notify_lwp_list); + +static int lustre_lwp_connect(struct obd_device *lwp) +{ + struct lu_env env; + struct lu_context session_ctx; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + struct obd_connect_data *data = NULL; + int rc; + ENTRY; + + /* log has been fully processed, let clients connect */ + rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags); + if (rc != 0) + RETURN(rc); + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; + data->ocd_version = LUSTRE_VERSION_CODE; + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE | + OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LFSCK | + OBD_CONNECT_BULK_MBITS; + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out, rc = -ENOMEM); + + if (strlen(lwp->obd_name) > sizeof(uuid->uuid)) { + CERROR("%s: Too long lwp name %s, max_size is %d\n", + lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid)); + GOTO(out, rc = -EINVAL); + } + + /* Use lwp name as the uuid, so we find the export by lwp name later */ + memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name)); + rc = obd_connect(&env, &exp, lwp, uuid, data, NULL); + if (rc != 0) { + CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc); + } else { + if (unlikely(lwp->obd_lwp_export != NULL)) + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = class_export_get(exp); + } + + GOTO(out, rc); + +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (uuid != NULL) + OBD_FREE_PTR(uuid); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + return rc; +} + +/** + * lwp is used by slaves (Non-MDT0 targets) to manage the connection to MDT0, + * or from the OSTx to MDTy. 
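+ *
+ * The lwp device is named by tgt_name2lwp_name(), started via
+ * lustre_start_simple() with the uuid/nid taken from the config record,
+ * connected, and linked onto lsi->lsi_lwp_list on success.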
+ **/ +static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, + __u32 idx) +{ + struct obd_device *obd; + char *lwpname = NULL; + char *lwpuuid = NULL; + int rc; + ENTRY; + + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc != 0) { + CERROR("%s: Can't add uuid: rc =%d\n", lsi->lsi_svname, rc); + RETURN(rc); + } + + OBD_ALLOC(lwpname, MTI_NAME_MAXLEN); + if (lwpname == NULL) + GOTO(out, rc = -ENOMEM); + + rc = tgt_name2lwp_name(lsi->lsi_svname, lwpname, MTI_NAME_MAXLEN, idx); + if (rc != 0) { + CERROR("%s: failed to generate lwp name: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc); + } + + OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN); + if (lwpuuid == NULL) + GOTO(out, rc = -ENOMEM); + + sprintf(lwpuuid, "%s_UUID", lwpname); + rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME, + lwpuuid, lustre_cfg_string(lcfg, 1), + NULL, NULL, NULL); + if (rc) { + CERROR("%s: setup up failed: rc %d\n", lwpname, rc); + GOTO(out, rc); + } + + obd = class_name2obd(lwpname); + LASSERT(obd != NULL); + + rc = lustre_lwp_connect(obd); + if (rc == 0) { + obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; + spin_lock(&lsi->lsi_lwp_lock); + list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list); + spin_unlock(&lsi->lsi_lwp_lock); + } else { + CERROR("%s: connect failed: rc = %d\n", lwpname, rc); + } + + GOTO(out, rc); + +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + if (lwpuuid != NULL) + OBD_FREE(lwpuuid, MTI_NAME_MAXLEN); + + return rc; +} + +/* the caller is responsible for memory free */ +static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi, + char **lwpname, __u32 idx) +{ + struct obd_device *lwp; + int rc = 0; + ENTRY; + + LASSERT(lwpname != NULL); + LASSERT(IS_OST(lsi) || IS_MDT(lsi)); + + OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN); + if (*lwpname == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = tgt_name2lwp_name(lsi->lsi_svname, *lwpname, MTI_NAME_MAXLEN, idx); + if (rc != 0) { + CERROR("%s: failed to generate lwp name: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + lwp = class_name2obd(*lwpname); + +out: + if (rc != 0) { + if (*lwpname != NULL) { + OBD_FREE(*lwpname, MTI_NAME_MAXLEN); + *lwpname = NULL; + } + lwp = ERR_PTR(rc); + } + + RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT)); +} + +static int lustre_lwp_add_conn(struct lustre_cfg *cfg, + struct lustre_sb_info *lsi, __u32 idx) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *lcfg = NULL; + char *lwpname = NULL; + struct obd_device *lwp; + int rc; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, idx); + if (IS_ERR(lwp)) { + CERROR("%s: can't find lwp device.\n", lsi->lsi_svname); + GOTO(out, rc = PTR_ERR(lwp)); + } + LASSERT(lwpname != NULL); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_bufs_reset(bufs, lwpname); + lustre_cfg_bufs_set_string(bufs, 1, + lustre_cfg_string(cfg, 1)); + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out_cfg, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_ADD_CONN, bufs); + + rc = class_add_conn(lwp, lcfg); + if (rc) + CERROR("%s: can't add conn: rc = %d\n", lwpname, rc); + + if (lcfg) + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); +out_cfg: + if (bufs != NULL) + OBD_FREE_PTR(bufs); +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + RETURN(rc); +} + +/** + * Retrieve MDT nids from the client log, then start the lwp device. + * there are only two scenarios which would include mdt nid. 
+ * 1. + * marker 5 (flags=0x01, v2.1.54.0) lustre-MDTyyyy 'add mdc' xxx- + * add_uuid nid=192.168.122.162@tcp(0x20000c0a87aa2) 0: 1:192.168.122.162@tcp + * attach 0:lustre-MDTyyyy-mdc 1:mdc 2:lustre-clilmv_UUID + * setup 0:lustre-MDTyyyy-mdc 1:lustre-MDTyyyy_UUID 2:192.168.122.162@tcp + * add_uuid nid=192.168.172.1@tcp(0x20000c0a8ac01) 0: 1:192.168.172.1@tcp + * add_conn 0:lustre-MDTyyyy-mdc 1:192.168.172.1@tcp + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDTyyyy_UUID xxxx + * marker 5 (flags=0x02, v2.1.54.0) lustre-MDTyyyy 'add mdc' xxxx- + * 2. + * marker 7 (flags=0x01, v2.1.54.0) lustre-MDTyyyy 'add failnid' xxxx- + * add_uuid nid=192.168.122.2@tcp(0x20000c0a87a02) 0: 1:192.168.122.2@tcp + * add_conn 0:lustre-MDTyyyy-mdc 1:192.168.122.2@tcp + * marker 7 (flags=0x02, v2.1.54.0) lustre-MDTyyyy 'add failnid' xxxx- + **/ +static int client_lwp_config_process(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *cfg = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + struct lustre_cfg *lcfg = NULL; + struct lustre_sb_info *lsi; + int rc = 0, swab = 0; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + RETURN(-EINVAL); + } + + if (cfg->cfg_sb == NULL) + GOTO(out, rc = -EINVAL); + lsi = s2lsi(cfg->cfg_sb); + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + switch (lcfg->lcfg_command) { + case LCFG_MARKER: { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + if (marker->cm_flags & CM_SKIP || + marker->cm_flags & CM_EXCLUDE) + GOTO(out, rc = 0); + + if (!tgt_is_mdt(marker->cm_tgtname, &cfg->cfg_lwp_idx)) + GOTO(out, rc = 0); + + if (IS_MDT(lsi) && cfg->cfg_lwp_idx != 0) + GOTO(out, rc = 0); + + if (!strncmp(marker->cm_comment, "add mdc", 7) || + !strncmp(marker->cm_comment, "add failnid", 11)) { + if (marker->cm_flags & CM_START) { + cfg->cfg_flags = CFG_F_MARKER; + /* This hack is to differentiate the + * ADD_UUID is come from "add mdc" record + * or from "add failnid" record. 
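+ * With CFG_F_SKIP already set, the LCFG_ADD_UUID case below only calls
+ * class_add_uuid() instead of starting another lwp device.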
*/ + if (!strncmp(marker->cm_comment, + "add failnid", 11)) + cfg->cfg_flags |= CFG_F_SKIP; + } else if (marker->cm_flags & CM_END) { + cfg->cfg_flags = 0; + } + } + break; + } + case LCFG_ADD_UUID: { + if (cfg->cfg_flags == CFG_F_MARKER) { + rc = lustre_lwp_setup(lcfg, lsi, cfg->cfg_lwp_idx); + /* XXX: process only the first nid as + * we don't need another instance of lwp */ + cfg->cfg_flags |= CFG_F_SKIP; + } else if (cfg->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) { + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc) + CERROR("%s: Fail to add uuid, rc:%d\n", + lsi->lsi_svname, rc); + } + break; + } + case LCFG_ADD_CONN: { + char *devname = lustre_cfg_string(lcfg, 0); + char *ptr; + __u32 idx = 0; + + if (!is_mdc_device(devname)) + break; + + ptr = strrchr(devname, '-'); + if (ptr == NULL) + break; + + *ptr = 0; + if (!tgt_is_mdt(devname, &idx)) { + *ptr = '-'; + break; + } + *ptr = '-'; + + if (IS_MDT(lsi) && idx != 0) + break; + + rc = lustre_lwp_add_conn(lcfg, lsi, idx); + break; + } + default: + break; + } +out: + RETURN(rc); +} + +static int lustre_disconnect_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + char *logname = NULL; + struct lustre_cfg_bufs *bufs = NULL; + struct config_llog_instance *cfg = NULL; + int rc = 0; + int rc1 = 0; + ENTRY; + + if (likely(lsi->lsi_lwp_started)) { + OBD_ALLOC(logname, MTI_NAME_MAXLEN); + if (logname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(lsi->lsi_svname, logname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname: " + "rc = %d\n", lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + strcat(logname, "-client"); + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + /* end log first */ + cfg->cfg_instance = sb; + rc = lustre_end_log(sb, logname, cfg); + if (rc != 0 && rc != -ENOENT) + GOTO(out, rc); + + lsi->lsi_lwp_started = 0; + } + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { + struct lustre_cfg *lcfg; + + if (likely(lwp->obd_lwp_export != NULL)) { + class_export_put(lwp->obd_lwp_export); + lwp->obd_lwp_export = NULL; + } + + lustre_cfg_bufs_reset(bufs, lwp->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, NULL); + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) + GOTO(out, rc = -ENOMEM); + lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs); + + /* Disconnect import first. NULL is passed for the '@env', + * since it will not be used. */ + rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL, + lwp->obd_lu_dev, lcfg); + OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens)); + if (rc != 0 && rc != -ETIMEDOUT) { + CERROR("%s: fail to disconnect LWP: rc = %d\n", + lwp->obd_name, rc); + rc1 = rc; + } + } + + GOTO(out, rc); + +out: + if (bufs != NULL) + OBD_FREE_PTR(bufs); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + if (logname != NULL) + OBD_FREE(logname, MTI_NAME_MAXLEN); + + return rc1 != 0 ? rc1 : rc; +} + +/** + * Stop the lwp for an OST/MDT target. 
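+ *
+ * Each lwp device left on lsi->lsi_lwp_list is force-cleaned with
+ * class_manual_cleanup(); if any cleanup fails, an error is returned.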
+ **/ +static int lustre_stop_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + int rc = 0; + int rc1 = 0; + ENTRY; + + while (!list_empty(&lsi->lsi_lwp_list)) { + lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device, + obd_lwp_list); + list_del_init(&lwp->obd_lwp_list); + lwp->obd_force = 1; + rc = class_manual_cleanup(lwp); + if (rc != 0) { + CERROR("%s: fail to stop LWP: rc = %d\n", + lwp->obd_name, rc); + rc1 = rc; + } + } + + RETURN(rc1 != 0 ? rc1 : rc); +} + +/** + * Start the lwp(fsname-MDTyyyy-lwp-{MDT,OST}xxxx) for a MDT/OST or MDT target. + **/ +static int lustre_start_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance *cfg = NULL; + char *logname; + int rc; + ENTRY; + + if (unlikely(lsi->lsi_lwp_started)) + RETURN(0); + + OBD_ALLOC(logname, MTI_NAME_MAXLEN); + if (logname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(lsi->lsi_svname, logname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname: rc = %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + strcat(logname, "-client"); + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + cfg->cfg_callback = client_lwp_config_process; + cfg->cfg_instance = sb; + rc = lustre_process_log(sb, logname, cfg); + /* need to remove config llog from mgc */ + lsi->lsi_lwp_started = 1; + + GOTO(out, rc); + +out: + OBD_FREE(logname, MTI_NAME_MAXLEN); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + + return rc; +} + +static DEFINE_MUTEX(server_start_lock); + +/* Stop MDS/OSS if nobody is using them */ +static int server_stop_servers(int lsiflags) +{ + struct obd_device *obd = NULL; + struct obd_type *type = NULL; + int rc = 0; + ENTRY; + + mutex_lock(&server_start_lock); + + /* Either an MDT or an OST or neither */ + /* if this was an MDT, and there are no more MDT's, clean up the MDS */ + if (lsiflags & LDD_F_SV_TYPE_MDT) { + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + if (obd != NULL) + type = class_search_type(LUSTRE_MDT_NAME); + } + + /* if this was an OST, and there are no more OST's, clean up the OSS */ + if (lsiflags & LDD_F_SV_TYPE_OST) { + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + if (obd != NULL) + type = class_search_type(LUSTRE_OST_NAME); + } + + if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) { + obd->obd_force = 1; + /* obd_fail doesn't mean much on a server obd */ + rc = class_manual_cleanup(obd); + } + + mutex_unlock(&server_start_lock); + + RETURN(rc); +} + +int server_mti_print(const char *title, struct mgs_target_info *mti) +{ + PRINT_CMD(PRINT_MASK, "mti %s\n", title); + PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); + PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname); + PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid); + PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n", + mti->mti_config_ver, mti->mti_flags); + return 0; +} + +/* Generate data for registration */ +static int server_lsi2mti(struct lustre_sb_info *lsi, + struct mgs_target_info *mti) +{ + struct lnet_process_id id; + int rc, i = 0; + int cplen = 0; + ENTRY; + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname)) + >= sizeof(mti->mti_svname)) + RETURN(-E2BIG); + + mti->mti_nid_count = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + + /* server use --servicenode param, only allow specified + * nids be registered */ + if ((lsi->lsi_lmd->lmd_flags & 
LMD_FLG_NO_PRIMNODE) != 0 && + class_match_nid(lsi->lsi_lmd->lmd_params, + PARAM_FAILNODE, id.nid) < 1) + continue; + + /* match specified network */ + if (!class_match_net(lsi->lsi_lmd->lmd_params, + PARAM_NETWORK, LNET_NIDNET(id.nid))) + continue; + + mti->mti_nids[mti->mti_nid_count] = id.nid; + mti->mti_nid_count++; + if (mti->mti_nid_count >= MTI_NIDS_MAX) { + CWARN("Only using first %d nids for %s\n", + mti->mti_nid_count, mti->mti_svname); + break; + } + } + + if (mti->mti_nid_count == 0) { + CERROR("Failed to get NID for server %s, please check whether " + "the target is specifed with improper --servicenode or " + "--network options.\n", mti->mti_svname); + RETURN(-EINVAL); + } + + mti->mti_lustre_ver = LUSTRE_VERSION_CODE; + mti->mti_config_ver = 0; + + rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL); + if (rc != 0) + return rc; + + rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL); + if (rc < 0) + return rc; + /* Orion requires index to be set */ + LASSERT(!(rc & LDD_F_NEED_INDEX)); + /* keep only LDD flags */ + mti->mti_flags = lsi->lsi_flags & LDD_F_MASK; + if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN)) + mti->mti_flags |= LDD_F_UPDATE; + cplen = strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params, + sizeof(mti->mti_params)); + if (cplen >= sizeof(mti->mti_params)) + return -E2BIG; + return 0; +} + +/* Register an old or new target with the MGS. If needed MGS will construct + startup logs and assign index */ +static int server_register_target(struct lustre_sb_info *lsi) +{ + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + bool writeconf; + int rc; + int tried = 0; + ENTRY; + + LASSERT(mgc); + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n", + mti->mti_svname, mti->mti_fsname, + libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index, + mti->mti_flags); + + /* if write_conf is true, the registration must succeed */ + writeconf = !!(lsi->lsi_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE)); + mti->mti_flags |= LDD_F_OPC_REG; + +again: + /* Register the target */ + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + if (rc) { + if (mti->mti_flags & LDD_F_ERROR) { + LCONSOLE_ERROR_MSG(0x160, + "%s: the MGS refuses to allow this server " + "to start: rc = %d. Please see messages on " + "the MGS.\n", lsi->lsi_svname, rc); + } else if (writeconf) { + if ((rc == -ESHUTDOWN || rc == -EIO) && ++tried < 5) { + /* The connection with MGS is not established. + * Try again after 2 seconds. Interruptable. */ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout( + msecs_to_jiffies(MSEC_PER_SEC) * 2); + set_current_state(TASK_RUNNING); + if (!signal_pending(current)) + goto again; + } + + LCONSOLE_ERROR_MSG(0x15f, + "%s: cannot register this server with the MGS: " + "rc = %d. Is the MGS running?\n", + lsi->lsi_svname, rc); + } else { + CDEBUG(D_HA, "%s: error registering with the MGS: " + "rc = %d (not fatal)\n", lsi->lsi_svname, rc); + /* reset the error code for non-fatal error. */ + rc = 0; + } + GOTO(out, rc); + } + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); +} + +/** + * Notify the MGS that this target is ready. + * Used by IR - if the MGS receives this message, it will notify clients. 
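+ * If the MGS reports the target as IR capable, LDD_F_IR_CAPABLE is set
+ * in lsi->lsi_flags so that server_calc_timeout() can shrink the
+ * recovery window.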
+ */ +static int server_notify_target(struct super_block *sb, struct obd_device *obd) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!(IS_SERVER(lsi))) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + mti->mti_instance = obd->u.obt.obt_instance; + mti->mti_flags |= LDD_F_OPC_READY; + + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + + /* Imperative recovery: if the mgs informs us to use IR? */ + if (!rc && !(mti->mti_flags & LDD_F_ERROR) && + (mti->mti_flags & LDD_F_IR_CAPABLE)) + lsi->lsi_flags |= LDD_F_IR_CAPABLE; + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); + +} + +/** Start server targets: MDTs and OSTs + */ +static int server_start_targets(struct super_block *sb) +{ + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance cfg; + struct lu_env mgc_env; + struct lu_device *dev; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname); + + if (IS_MDT(lsi)) { + /* make sure the MDS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + if (!obd) { + rc = lustre_start_simple(LUSTRE_MDS_OBDNAME, + LUSTRE_MDS_NAME, + LUSTRE_MDS_OBDNAME"_uuid", + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start MDS: %d\n", rc); + RETURN(rc); + } + } + mutex_unlock(&server_start_lock); + } + + /* If we're an OST, make sure the global OSS is running */ + if (IS_OST(lsi)) { + /* make sure OSS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + if (!obd) { + rc = lustre_start_simple(LUSTRE_OSS_OBDNAME, + LUSTRE_OSS_NAME, + LUSTRE_OSS_OBDNAME"_uuid", + NULL, NULL, NULL, NULL); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start OSS: %d\n", rc); + RETURN(rc); + } + } + mutex_unlock(&server_start_lock); + } + + rc = lu_env_init(&mgc_env, LCT_MG_THREAD); + if (rc != 0) + GOTO(out_stop_service, rc); + + /* Set the mgc fs to our server disk. This allows the MGC to + * read and write configs locally, in case it can't talk to the MGS. */ + rc = server_mgc_set_fs(&mgc_env, lsi->lsi_mgc, sb); + if (rc) + GOTO(out_env, rc); + + /* Register with MGS */ + rc = server_register_target(lsi); + if (rc) + GOTO(out_mgc, rc); + + /* Let the target look up the mount using the target's name + (we can't pass the sb or mnt through class_process_config.) */ + rc = server_register_mount(lsi->lsi_svname, sb); + if (rc) + GOTO(out_mgc, rc); + + /* Start targets using the llog named for the target */ + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_callback = class_config_llog_handler; + cfg.cfg_sub_clds = CONFIG_SUB_SERVER; + rc = lustre_process_log(sb, lsi->lsi_svname, &cfg); + if (rc) { + CERROR("failed to start server %s: %d\n", + lsi->lsi_svname, rc); + /* Do NOT call server_deregister_mount() here. This makes it + * impossible to find mount later in cleanup time and leaves + * @lsi and othder stuff leaked. 
-umka */ + GOTO(out_mgc, rc); + } + + obd = class_name2obd(lsi->lsi_svname); + if (!obd) { + CERROR("no server named %s was started\n", lsi->lsi_svname); + GOTO(out_mgc, rc = -ENXIO); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + rc = lustre_start_lwp(sb); + if (rc) { + CERROR("%s: failed to start LWP: %d\n", + lsi->lsi_svname, rc); + GOTO(out_mgc, rc); + } + } + + server_notify_target(sb, obd); + + /* calculate recovery timeout, do it after lustre_process_log */ + server_calc_timeout(lsi, obd); + + /* log has been fully processed, let clients connect */ + dev = obd->obd_lu_dev; + if (dev && dev->ld_ops->ldo_prepare) { + struct lu_env env; + + rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags); + if (rc == 0) { + struct lu_context session_ctx; + + lu_context_init(&session_ctx, LCT_SERVER_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + rc = dev->ld_ops->ldo_prepare(&env, NULL, dev); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + } + } + + /* abort recovery only on the complete stack: + * many devices can be involved */ + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) && + (OBP(obd, iocontrol))) { + obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0, + NULL, NULL); + } + +out_mgc: + /* Release the mgc fs for others to use */ + server_mgc_clear_fs(&mgc_env, lsi->lsi_mgc); +out_env: + lu_env_fini(&mgc_env); +out_stop_service: + if (rc != 0) + server_stop_servers(lsi->lsi_flags); + + RETURN(rc); +} + +static int lsi_prepare(struct lustre_sb_info *lsi) +{ + const char *osd_type; + const char *fstype; + __u32 index; + int rc; + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_lmd); + + /* The server name is given as a mount line option */ + if (lsi->lsi_lmd->lmd_profile == NULL) { + LCONSOLE_ERROR("Can't determine server name\n"); + RETURN(-EINVAL); + } + + /* Determine osd type */ + if (lsi->lsi_lmd->lmd_osd_type == NULL) { + osd_type = LUSTRE_OSD_LDISKFS_NAME; + fstype = "ldiskfs"; + } else { + osd_type = lsi->lsi_lmd->lmd_osd_type; + fstype = lsi->lsi_lmd->lmd_osd_type; + } + + if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname) || + strlen(osd_type) >= sizeof(lsi->lsi_osd_type) || + strlen(fstype) >= sizeof(lsi->lsi_fstype)) + RETURN(-ENAMETOOLONG); + + strlcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile, + sizeof(lsi->lsi_svname)); + strlcpy(lsi->lsi_osd_type, osd_type, sizeof(lsi->lsi_osd_type)); + /* XXX: a temp. solution for components using ldiskfs + * to be removed in one of the subsequent patches */ + strlcpy(lsi->lsi_fstype, fstype, sizeof(lsi->lsi_fstype)); + + /* Determine server type */ + rc = server_name2index(lsi->lsi_svname, &index, NULL); + if (rc < 0) { + if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) { + /* Assume we're a bare MGS */ + rc = 0; + lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC; + } else { + LCONSOLE_ERROR("Can't determine server type of '%s'\n", + lsi->lsi_svname); + RETURN(rc); + } + } + lsi->lsi_flags |= rc; + + /* Add mount line flags that used to be in ldd: + * writeconf, mgs, anything else? + */ + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ? + LDD_F_WRITECONF : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ? + LDD_F_VIRGIN : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ? + LDD_F_UPDATE : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ? + LDD_F_SV_TYPE_MGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ? 
+ LDD_F_NO_PRIMNODE : 0; + + RETURN(0); +} + +/*************** server mount ******************/ + +/** Start the shutdown of servers at umount. + */ +static void server_put_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *tmpname, *extraname = NULL; + int tmpname_sz; + int lsiflags = lsi->lsi_flags; + ENTRY; + + LASSERT(IS_SERVER(lsi)); + + tmpname_sz = strlen(lsi->lsi_svname) + 1; + OBD_ALLOC(tmpname, tmpname_sz); + memcpy(tmpname, lsi->lsi_svname, tmpname_sz); + CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); + if (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC)) + snprintf(tmpname, tmpname_sz, "MGS"); + + /* disconnect the lwp first to drain off the inflight request */ + if (IS_OST(lsi) || IS_MDT(lsi)) { + int rc; + + rc = lustre_disconnect_lwp(sb); + if (rc != 0 && rc != -ETIMEDOUT && + rc != -ENOTCONN && rc != -ESHUTDOWN) + CWARN("%s: failed to disconnect lwp: rc= %d\n", + tmpname, rc); + } + + /* Stop the target */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_MDT(lsi) || IS_OST(lsi))) { + struct lustre_profile *lprof = NULL; + + /* tell the mgc to drop the config log */ + lustre_end_log(sb, lsi->lsi_svname, NULL); + + /* COMPAT_146 - profile may get deleted in mgc_cleanup. + If there are any setup/cleanup errors, save the lov + name for safety cleanup later. */ + lprof = class_get_profile(lsi->lsi_svname); + if (lprof != NULL) { + if (lprof->lp_dt != NULL) { + OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1); + strncpy(extraname, lprof->lp_dt, + strlen(lprof->lp_dt) + 1); + } + class_put_profile(lprof); + } + + obd = class_name2obd(lsi->lsi_svname); + if (obd) { + CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); + if (lsiflags & LSI_UMOUNT_FAILOVER) + obd->obd_fail = 1; + /* We can't seem to give an error return code + * to .put_super, so we better make sure we clean up! */ + obd->obd_force = 1; + class_manual_cleanup(obd); + } else { + CERROR("no obd %s\n", lsi->lsi_svname); + server_deregister_mount(lsi->lsi_svname); + } + } + + /* If they wanted the mgs to stop separately from the mdt, they + should have put it on a different device. */ + if (IS_MGS(lsi)) { + /* if MDS start with --nomgs, don't stop MGS then */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) + server_stop_mgs(sb); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + if (lustre_stop_lwp(sb) < 0) + CERROR("%s: failed to stop lwp!\n", tmpname); + } + + /* Clean the mgc and sb */ + lustre_common_put_super(sb); + + /* wait till all in-progress cleanups are done + * specifically we're interested in ofd cleanup + * as it pins OSS */ + obd_zombie_barrier(); + + /* Stop the servers (MDS, OSS) if no longer needed. We must wait + until the target is really gone so that our type refcount check + is right. 
*/ + server_stop_servers(lsiflags); + + /* In case of startup or cleanup err, stop related obds */ + if (extraname) { + obd = class_name2obd(extraname); + if (obd) { + CWARN("Cleaning orphaned obd %s\n", extraname); + obd->obd_force = 1; + class_manual_cleanup(obd); + } + OBD_FREE(extraname, strlen(extraname) + 1); + } + + LCONSOLE_WARN("server umount %s complete\n", tmpname); + OBD_FREE(tmpname, tmpname_sz); + EXIT; +} + +/** Called only for 'umount -f' + */ +static void server_umount_begin(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + CDEBUG(D_MOUNT, "umount -f\n"); + /* umount = failover + umount -f = force + no third way to do non-force, non-failover */ + lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; + EXIT; +} + +static int server_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_statfs statfs; + int rc; + ENTRY; + + if (lsi->lsi_dt_dev) { + rc = dt_statfs(NULL, lsi->lsi_dt_dev, &statfs); + if (rc == 0) { + statfs_unpack(buf, &statfs); + buf->f_type = sb->s_magic; + RETURN(0); + } + } + + /* just return 0 */ + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = 1; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 1; + buf->f_ffree = 0; + buf->f_namelen = NAME_MAX; + RETURN(0); +} + +/** The operations we support directly on the superblock: + * mount, umount, and df. + */ +static struct super_operations server_ops = { + .put_super = server_put_super, + .umount_begin = server_umount_begin, /* umount -f */ + .statfs = server_statfs, +}; + +/* + * Xattr support for Lustre servers + */ +#ifdef HAVE_IOP_XATTR +static ssize_t lustre_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + if (!selinux_is_enabled()) + return -EOPNOTSUPP; + return -ENODATA; +} + +static int lustre_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} +#endif + +static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, + size_t size) +{ + return -EOPNOTSUPP; +} + +static const struct inode_operations server_inode_operations = { +#ifdef HAVE_IOP_XATTR + .setxattr = lustre_setxattr, + .getxattr = lustre_getxattr, +#endif + .listxattr = lustre_listxattr, +}; + +#define log2(n) ffz(~(n)) +#define LUSTRE_SUPER_MAGIC 0x0BD00BD1 + +static int server_fill_super_common(struct super_block *sb) +{ + struct inode *root = NULL; + ENTRY; + + CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = log2(sb->s_blocksize); + sb->s_magic = LUSTRE_SUPER_MAGIC; + sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */ + sb->s_flags |= MS_RDONLY; + sb->s_op = &server_ops; + + root = new_inode(sb); + if (!root) { + CERROR("Can't make root inode\n"); + RETURN(-EIO); + } + + /* returns -EIO for every operation */ + /* make_bad_inode(root); -- badness - can't umount */ + /* apparently we need to be a directory for the mount to finish */ + root->i_mode = S_IFDIR; + root->i_op = &server_inode_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) { + CERROR("%s: can't make root dentry\n", sb->s_id); + RETURN(-EIO); + } + + RETURN(0); +} + +static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) +{ + struct lustre_mount_data *lmd = lsi->lsi_lmd; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[16]; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, + "Attempting to start %s, 
type=%s, lsifl=%x, mountfl=%lx\n", + lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags); + + sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); + strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); + strcat(lsi->lsi_osd_uuid, "_UUID"); + sprintf(flagstr, "%lu:%lu", mflags, (unsigned long) lmd->lmd_flags); + + obd = class_name2obd(lsi->lsi_osd_obdname); + if (obd == NULL) { + rc = lustre_start_simple(lsi->lsi_osd_obdname, + lsi->lsi_osd_type, + lsi->lsi_osd_uuid, lmd->lmd_dev, + flagstr, lsi->lsi_lmd->lmd_opts, + lsi->lsi_svname); + if (rc) + GOTO(out, rc); + obd = class_name2obd(lsi->lsi_osd_obdname); + LASSERT(obd); + } else { + CDEBUG(D_MOUNT, "%s already started\n", lsi->lsi_osd_obdname); + /* but continue setup to allow special case of MDT and internal + * MGT being started separately. */ + if (!((IS_MGS(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOMGS)) || + (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & + LMD_FLG_NOSVC)))) + RETURN(-EALREADY); + } + + rc = obd_connect(NULL, &lsi->lsi_osd_exp, + obd, &obd->obd_uuid, NULL, NULL); + + if (rc) { + obd->obd_force = 1; + class_manual_cleanup(obd); + lsi->lsi_dt_dev = NULL; + RETURN(rc); + } + + LASSERT(obd->obd_lu_dev); + lu_device_get(obd->obd_lu_dev); + lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev); + LASSERT(lsi->lsi_dt_dev); + + /* set disk context for llog usage */ + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lsi->lsi_dt_dev; + + dt_conf_get(NULL, lsi->lsi_dt_dev, &p); +out: + RETURN(rc); +} + +/** Fill in the superblock info for a Lustre server. + * Mount the device with the correct options. + * Read the on-disk config file. + * Start the services. + */ +int server_fill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + /* to simulate target mount race */ + OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); + + rc = lsi_prepare(lsi); + if (rc) + RETURN(rc); + + /* Start low level OSD */ + rc = osd_start(lsi, sb->s_flags); + if (rc) { + CERROR("Unable to start osd on %s: %d\n", + lsi->lsi_lmd->lmd_dev, rc); + lustre_put_lsi(sb); + RETURN(rc); + } + + CDEBUG(D_MOUNT, "Found service %s on device %s\n", + lsi->lsi_svname, lsi->lsi_lmd->lmd_dev); + + if (class_name2obd(lsi->lsi_svname)) { + LCONSOLE_ERROR_MSG(0x161, "The target named %s is already " + "running. Double-mount may have compromised" + " the disk journal.\n", + lsi->lsi_svname); + lustre_put_lsi(sb); + RETURN(-EALREADY); + } + + /* Start MGS before MGC */ + if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { + rc = server_start_mgs(sb); + if (rc) + GOTO(out_mnt, rc); + } + + /* Start MGC before servers */ + rc = lustre_start_mgc(sb); + if (rc) + GOTO(out_mnt, rc); + + /* Set up all obd devices for service */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_OST(lsi) || IS_MDT(lsi))) { + rc = server_start_targets(sb); + if (rc < 0) { + CERROR("Unable to start targets: %d\n", rc); + GOTO(out_mnt, rc); + } + /* FIXME overmount client here, or can we just start a + * client log and client_fill_super on this sb? We + * need to make sure server_put_super gets called too + * - ll_put_super calls lustre_common_put_super; check + * there for LSI_SERVER flag, call s_p_s if so. + * + * Probably should start client from new thread so we + * can return. Client will not finish until all + * servers are connected. 
Note - MGS-only server does + * NOT get a client, since there is no lustre fs + * associated - the MGS is for all lustre fs's */ + } + + rc = server_fill_super_common(sb); + if (rc) + GOTO(out_mnt, rc); + + RETURN(0); +out_mnt: + /* We jump here in case of failure while starting targets or MGS. + * In this case we can't just put @mnt and have to do real cleanup + * with stoping targets, etc. */ + server_put_super(sb); + return rc; +} + +/* + * Calculate timeout value for a target. + */ +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd) +{ + struct lustre_mount_data *lmd; + int soft = 0; + int hard = 0; + int factor = 0; + bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE); + int min = OBD_RECOVERY_TIME_MIN; + + LASSERT(IS_SERVER(lsi)); + + lmd = lsi->lsi_lmd; + if (lmd) { + soft = lmd->lmd_recovery_time_soft; + hard = lmd->lmd_recovery_time_hard; + has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR); + obd->obd_no_ir = !has_ir; + } + + if (soft == 0) + soft = OBD_RECOVERY_TIME_SOFT; + if (hard == 0) + hard = OBD_RECOVERY_TIME_HARD; + + /* target may have ir_factor configured. */ + factor = OBD_IR_FACTOR_DEFAULT; + if (obd->obd_recovery_ir_factor) + factor = obd->obd_recovery_ir_factor; + + if (has_ir) { + int new_soft = soft; + + /* adjust timeout value by imperative recovery */ + new_soft = (soft * factor) / OBD_IR_FACTOR_MAX; + /* make sure the timeout is not too short */ + new_soft = max(min, new_soft); + + LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery " + "window shrunk from %d-%d down to %d-%d\n", + obd->obd_name, soft, hard, new_soft, hard); + + soft = new_soft; + } else { + LCONSOLE_INFO("%s: Imperative Recovery not enabled, recovery " + "window %d-%d\n", obd->obd_name, soft, hard); + } + + /* we're done */ + obd->obd_recovery_timeout = max_t(time64_t, obd->obd_recovery_timeout, + soft); + obd->obd_recovery_time_hard = hard; + obd->obd_recovery_ir_factor = factor; +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c new file mode 100644 index 0000000000000..dddb24b036fee --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -0,0 +1,217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include +#include + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +void obdo_set_o_projid(struct obdo *dst, u32 projid) +{ + dst->o_projid = projid; + dst->o_valid |= OBD_MD_FLPROJID; +} +EXPORT_SYMBOL(obdo_set_o_projid); + +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks). */ +void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) +{ + u64 newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %#llx, new time %lu/%lu\n", + valid, LTIME_S(src->i_mtime), + LTIME_S(src->i_ctime)); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = LTIME_S(src->i_atime); + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = LTIME_S(src->i_mtime); + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = LTIME_S(src->i_ctime); + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = 1U << src->i_blkbits; + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = from_kuid(&init_user_ns, src->i_uid); + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = from_kgid(&init_user_ns, src->i_gid); + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = src->i_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + 
if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +/** + * Create an obdo to send over the wire + */ +void lustre_set_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *wobdo, + const struct obdo *lobdo) +{ + *wobdo = *lobdo; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} +EXPORT_SYMBOL(lustre_set_wire_obdo); + +/** + * Create a local obdo from a wire based odbo + */ +void lustre_get_wire_obdo(const struct obd_connect_data *ocd, + struct obdo *lobdo, + const struct obdo *wobdo) +{ + *lobdo = *wobdo; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} +EXPORT_SYMBOL(lustre_get_wire_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c new file mode 100644 index 0000000000000..2a36051e52356 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -0,0 +1,73 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c new file mode 100644 index 0000000000000..2112733e50c54 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -0,0 +1,449 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/upcall_cache.c + * + * Supplementary groups cache. 
+ */ +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include + +static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry; + + LIBCFS_ALLOC(entry, sizeof(*entry)); + if (!entry) + return NULL; + + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + entry->ue_key = key; + atomic_set(&entry->ue_refcount, 0); + init_waitqueue_head(&entry->ue_waitq); + if (cache->uc_ops->init_entry) + cache->uc_ops->init_entry(entry, args); + return entry; +} + +/* protected by cache lock */ +static void free_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (cache->uc_ops->free_entry) + cache->uc_ops->free_entry(cache, entry); + + list_del(&entry->ue_hash); + CDEBUG(D_OTHER, "destroy cache entry %p for key %llu\n", + entry, entry->ue_key); + LIBCFS_FREE(entry, sizeof(*entry)); +} + +static inline int upcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->upcall_compare) + return cache->uc_ops->upcall_compare(cache, entry, key, args); + + return 0; +} + +static inline int downcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->downcall_compare) + return cache->uc_ops->downcall_compare(cache, entry, key, args); + + return 0; +} + +static inline void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static inline void put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->ue_refcount) && + (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) { + free_entry(cache, entry); + } +} + +static int check_unlink_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (UC_CACHE_IS_VALID(entry) && + cfs_time_before(cfs_time_current(), entry->ue_expire)) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry)) { + if (entry->ue_acquire_expire == 0 || + cfs_time_before(cfs_time_current(), + entry->ue_acquire_expire)) + return 0; + + UC_CACHE_SET_EXPIRED(entry); + wake_up_all(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + return 1; +} + +static inline int refresh_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + LASSERT(cache->uc_ops->do_upcall); + return cache->uc_ops->do_upcall(cache, entry); +} + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry = NULL, *new = NULL, *next; + struct list_head *head; + wait_queue_entry_t wait; + int rc, found; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; +find_again: + found = 0; + spin_lock(&cache->uc_lock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + /* check invalid & expired items */ + if (check_unlink_entry(cache, entry)) + continue; + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (!found) { + if (!new) { + spin_unlock(&cache->uc_lock); + new = alloc_entry(cache, key, args); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else 
{ + if (new) { + free_entry(cache, new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* acquire for new one */ + if (UC_CACHE_IS_NEW(entry)) { + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + spin_unlock(&cache->uc_lock); + rc = refresh_entry(cache, entry); + spin_lock(&cache->uc_lock); + entry->ue_acquire_expire = + cfs_time_shift(cache->uc_acquire_expire); + if (rc < 0) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + wake_up_all(&entry->ue_waitq); + if (unlikely(rc == -EREMCHG)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + } + /* someone (and only one) is doing upcall upon this item, + * wait it to complete */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + long expiry = (entry == new) ? + cfs_time_seconds(cache->uc_acquire_expire) : + MAX_SCHEDULE_TIMEOUT; + long left; + + init_waitqueue_entry(&wait, current); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&cache->uc_lock); + + left = schedule_timeout(expiry); + + spin_lock(&cache->uc_lock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + /* we're interrupted or upcall failed in the middle */ + rc = left > 0 ? -EINTR : -ETIMEDOUT; + CERROR("acquire for key %llu: error %d\n", + entry->ue_key, rc); + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + + /* invalid means error, don't need to try again */ + if (UC_CACHE_IS_INVALID(entry)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + + /* check expired + * We can't refresh the existing one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(cache, entry)) { + /* if expired, try again. but if this entry is + * created by me but too quickly turn to expired + * without any error, should at least give a + * chance to use it once. 
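+		 * In other words, only go back to find_again when the expired
+		 * entry came from the hash (entry != new); an entry this thread
+		 * just allocated is still returned for a single use.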
+ */ + if (entry != new) { + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ +out: + spin_unlock(&cache->uc_lock); + RETURN(entry); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + ENTRY; + + if (!entry) { + EXIT; + return; + } + + LASSERT(atomic_read(&entry->ue_refcount) > 0); + spin_lock(&cache->uc_lock); + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args) +{ + struct upcall_cache_entry *entry = NULL; + struct list_head *head; + int found = 0, rc = 0; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (downcall_compare(cache, entry, key, args) == 0) { + found = 1; + get_entry(entry); + break; + } + } + + if (!found) { + CDEBUG(D_OTHER, "%s: upcall for key %llu not expected\n", + cache->uc_name, key); + /* haven't found, it's possible */ + spin_unlock(&cache->uc_lock); + RETURN(-EINVAL); + } + + if (err) { + CDEBUG(D_OTHER, "%s: upcall for key %llu returned %d\n", + cache->uc_name, entry->ue_key, err); + GOTO(out, rc = -EINVAL); + } + + if (!UC_CACHE_IS_ACQUIRING(entry)) { + CDEBUG(D_RPCTRACE, "%s: found uptodate entry %p (key %llu)" + "\n", cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = 0); + } + + if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) { + CERROR("%s: found a stale entry %p (key %llu) in ioctl\n", + cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = -EINVAL); + } + + spin_unlock(&cache->uc_lock); + if (cache->uc_ops->parse_downcall) + rc = cache->uc_ops->parse_downcall(cache, entry, args); + spin_lock(&cache->uc_lock); + if (rc) + GOTO(out, rc); + + entry->ue_expire = cfs_time_shift(cache->uc_entry_expire); + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", + cache->uc_name, entry, entry->ue_key); +out: + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + } + UC_CACHE_CLEAR_ACQUIRING(entry); + spin_unlock(&cache->uc_lock); + wake_up_all(&entry->ue_waitq); + put_entry(cache, entry); + + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +void upcall_cache_flush(struct upcall_cache *cache, int force) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + spin_lock(&cache->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) { + list_for_each_entry_safe(entry, next, + &cache->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(cache, entry); + } + } + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_flush); + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + int found = 0; + ENTRY; + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (found) { + CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " + "cur %lu, ex %ld/%ld\n", + cache->uc_name, entry, entry->ue_key, + atomic_read(&entry->ue_refcount), 
entry->ue_flags, + cfs_time_current_sec(), entry->ue_acquire_expire, + entry->ue_expire); + UC_CACHE_SET_EXPIRED(entry); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + } + spin_unlock(&cache->uc_lock); +} +EXPORT_SYMBOL(upcall_cache_flush_one); + +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops) +{ + struct upcall_cache *cache; + int i; + ENTRY; + + LIBCFS_ALLOC(cache, sizeof(*cache)); + if (!cache) + RETURN(ERR_PTR(-ENOMEM)); + + spin_lock_init(&cache->uc_lock); + init_rwsem(&cache->uc_upcall_rwsem); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) + INIT_LIST_HEAD(&cache->uc_hashtable[i]); + strlcpy(cache->uc_name, name, sizeof(cache->uc_name)); + /* upcall pathname proc tunable */ + strlcpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall)); + cache->uc_entry_expire = 20 * 60; + cache->uc_acquire_expire = 30; + cache->uc_ops = ops; + + RETURN(cache); +} +EXPORT_SYMBOL(upcall_cache_init); + +void upcall_cache_cleanup(struct upcall_cache *cache) +{ + if (!cache) + return; + upcall_cache_flush_all(cache); + LIBCFS_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(upcall_cache_cleanup); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/uuid.c b/drivers/staging/lustrefsx/lustre/obdclass/uuid.c new file mode 100644 index 0000000000000..cc0092687511b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/uuid.c @@ -0,0 +1,78 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +static inline size_t consume(size_t nob, __u8 **ptr) +{ + size_t value; + + LASSERT(nob <= sizeof(value)); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +static void uuid_unpack(class_uuid_t in, __u16 *uu, size_t nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c new file mode 100644 index 0000000000000..de7fd77920392 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -0,0 +1,674 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo.c + * + * Author: Peter Braam + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include +#include +#include +#include + +#include "echo_internal.h" + +/* The echo objid needs to be below 2^32, because regular FID numbers are + * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. 
b=23335 */ +#define ECHO_INIT_OID 0x10000000ULL +#define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL + +#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_SHIFT) +static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES]; + +enum { + LPROC_ECHO_READ_BYTES = 1, + LPROC_ECHO_WRITE_BYTES = 2, + LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1 +}; + +static int echo_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lustre_handle conn = { 0 }; + int rc; + + data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED; + + if (data->ocd_connect_flags & OBD_CONNECT_FLAGS2) + data->ocd_connect_flags2 &= ECHO_CONNECT_SUPPORTED2; + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("can't connect %d\n", rc); + return rc; + } + *exp = class_conn2export(&conn); + + return 0; +} + +static int echo_disconnect(struct obd_export *exp) +{ + LASSERT (exp != NULL); + + return server_disconnect_export(exp); +} + +static int echo_init_export(struct obd_export *exp) +{ + return ldlm_init_export(exp); +} + +static int echo_destroy_export(struct obd_export *exp) +{ + ENTRY; + + target_destroy_export(exp); + ldlm_destroy_export(exp); + + RETURN(0); +} + +static u64 echo_next_id(struct obd_device *obddev) +{ + u64 id; + + spin_lock(&obddev->u.echo.eo_lock); + id = ++obddev->u.echo.eo_lastino; + spin_unlock(&obddev->u.echo.eo_lock); + + return id; +} + +static int echo_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + + if (!(oa->o_mode & S_IFMT)) { + CERROR("echo obd: no type!\n"); + return -ENOENT; + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("invalid o_valid %#llx\n", oa->o_valid); + return -EINVAL; + } + + ostid_set_seq_echo(&oa->o_oi); + if (ostid_set_id(&oa->o_oi, echo_next_id(obd))) { + CERROR("Bad %llu to set " DOSTID "\n", + echo_next_id(obd), POSTID(&oa->o_oi)); + return -EINVAL; + } + oa->o_valid = OBD_MD_FLID; + + return 0; +} + +static int echo_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static int echo_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + u64 id = ostid_id(&oa->o_oi); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); + RETURN(-EINVAL); + } + + obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid); + ostid_set_seq_echo(&oa->o_oi); + if (ostid_set_id(&oa->o_oi, id)) { + CERROR("Bad %llu to set " DOSTID "\n", + id, POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static int echo_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = 
class_exp2obd(exp); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); + RETURN(-EINVAL); + } + + obd->u.echo.eo_oa = *oa; + + RETURN(0); +} + +static void +echo_page_debug_setup(struct page *page, int rw, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + if (rw & OBD_BRW_READ) + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + else + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + 0xecc0ecc0ecc0ecc0ULL, + 0xecc0ecc0ecc0ecc0ULL); + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); +} + +static int +echo_page_debug_check(struct page *page, u64 id, + __u64 offset, int len) +{ + int page_offset = offset & ~PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + int rc = 0; + int rc2; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + + if (rc2 != 0 && rc == 0) + rc = rc2; + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); + + return rc; +} + +static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *lb, int cmd, int *left) +{ + gfp_t gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ? + GFP_HIGHUSER : GFP_KERNEL; + int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID; + int debug_setup = (!ispersistent && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + struct niobuf_local *res = lb; + u64 offset = nb->rnb_offset; + int len = nb->rnb_len; + + while (len > 0) { + int plen = PAGE_SIZE - (offset & (PAGE_SIZE-1)); + if (len < plen) + plen = len; + + /* check for local buf overflow */ + if (*left == 0) + return -EINVAL; + + res->lnb_file_offset = offset; + res->lnb_len = plen; + LASSERT((res->lnb_file_offset & ~PAGE_MASK) + + res->lnb_len <= PAGE_SIZE); + + if (ispersistent && + ((res->lnb_file_offset >> PAGE_SHIFT) < + ECHO_PERSISTENT_PAGES)) { + res->lnb_page = + echo_persistent_pages[res->lnb_file_offset >> + PAGE_SHIFT]; + /* Take extra ref so __free_pages() can be called OK */ + get_page(res->lnb_page); + } else { + res->lnb_page = alloc_page(gfp_mask); + if (res->lnb_page == NULL) { + CERROR("can't get page for id " DOSTID"\n", + POSTID(&obj->ioo_oid)); + return -ENOMEM; + } + } + + CDEBUG(D_PAGE, "$$$$ get page %p @ %llu for %d\n", + res->lnb_page, res->lnb_file_offset, res->lnb_len); + + if (cmd & OBD_BRW_READ) + res->lnb_rc = res->lnb_len; + + if (debug_setup) + echo_page_debug_setup(res->lnb_page, cmd, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + + offset += plen; + len -= plen; + res++; + + (*left)--; + (*pages)++; + } + + return 0; +} + +static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *rb, int *pgs, + struct niobuf_local *lb, int verify) +{ + struct niobuf_local *res = lb; + u64 start = rb->rnb_offset >> PAGE_SHIFT; + u64 end = (rb->rnb_offset + rb->rnb_len + PAGE_SIZE - 1) >> + PAGE_SHIFT; + int count = (int)(end - start); + int rc = 0; + int i; + + for (i = 0; i < 
count; i++, (*pgs) ++, res++) { + struct page *page = res->lnb_page; + void *addr; + + if (page == NULL) { + CERROR("null page objid %llu:%p, buf %d/%d\n", + ostid_id(&obj->ioo_oid), page, i, + obj->ioo_bufcnt); + return -EFAULT; + } + + addr = kmap(page); + + CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@%llu\n", + res->lnb_page, addr, res->lnb_file_offset); + + if (verify) { + int vrc = echo_page_debug_check(page, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->lnb_len); + /* check all the pages always */ + if (vrc != 0 && rc == 0) + rc = vrc; + } + + kunmap(page); + /* NB see comment above regarding persistent pages */ + __free_page(page); + } + + return rc; +} + +static int echo_preprw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *res) +{ + struct obd_device *obd; + int tot_bytes = 0; + int rc = 0; + int i, left; + ENTRY; + + obd = export->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + + /* Temp fix to stop falling foul of osc_announce_cached() */ + oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT); + + memset(res, 0, sizeof(*res) * *pages); + + CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n", + cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages); + + left = *pages; + *pages = 0; + + for (i = 0; i < objcount; i++, obj++) { + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) { + + rc = echo_map_nb_to_lb(oa, obj, nb, pages, + res + *pages, cmd, &left); + if (rc) + GOTO(preprw_cleanup, rc); + + tot_bytes += nb->rnb_len; + } + } + + atomic_add(*pages, &obd->u.echo.eo_prep); + + if (cmd & OBD_BRW_READ) + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES, + tot_bytes); + else + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + tot_bytes); + + CDEBUG(D_PAGE, "%d pages allocated after prep\n", + atomic_read(&obd->u.echo.eo_prep)); + + RETURN(0); + +preprw_cleanup: + /* It is possible that we would rather handle errors by allow + * any already-set-up pages to complete, rather than tearing them + * all down again. I believe that this is what the in-kernel + * prep/commit operations do. 
+ */ + CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount); + for (i = 0; i < *pages; i++) { + kunmap(res[i].lnb_page); + /* NB if this is a persistent page, __free_page() will just + * lose the extra ref gained above */ + __free_page(res[i].lnb_page); + res[i].lnb_page = NULL; + atomic_dec(&obd->u.echo.eo_prep); + } + + return rc; +} + +static int echo_commitrw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rb, int niocount, + struct niobuf_local *res, int rc) +{ + struct obd_device *obd; + int pgs = 0; + int i; + ENTRY; + + obd = export->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + + if (rc) + GOTO(commitrw_cleanup, rc); + + if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) { + CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n", + objcount, niocount); + } else { + CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n", + objcount, niocount); + } + + if (niocount && res == NULL) { + CERROR("NULL res niobuf with niocount %d\n", niocount); + RETURN(-EINVAL); + } + + for (i = 0; i < objcount; i++, obj++) { + int verify = (rc == 0 && + ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) { + int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs], + verify); + if (vrc == 0) + continue; + + if (vrc == -EFAULT) + GOTO(commitrw_cleanup, rc = vrc); + + if (rc == 0) + rc = vrc; + } + + } + + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CDEBUG(D_PAGE, "%d pages remain after commit\n", + atomic_read(&obd->u.echo.eo_prep)); + RETURN(rc); + +commitrw_cleanup: + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CERROR("cleaning up %d pages (%d obdos)\n", + niocount - pgs - 1, objcount); + + while (pgs < niocount) { + struct page *page = res[pgs++].lnb_page; + + if (page == NULL) + continue; + + /* NB see comment above regarding persistent pages */ + __free_page(page); + atomic_dec(&obd->u.echo.eo_prep); + } + return rc; +} + +LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { .name = "uuid", + .fops = &echo_uuid_fops }, + { NULL } +}; + +static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int rc; + __u64 lock_flags = 0; + struct ldlm_res_id res_id = {.name = {1}}; + char ns_name[48]; + ENTRY; + + obd->u.echo.eo_obt.obt_magic = OBT_MAGIC; + spin_lock_init(&obd->u.echo.eo_lock); + obd->u.echo.eo_lastino = ECHO_INIT_OID; + + sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (obd->obd_namespace == NULL) { + LBUG(); + RETURN(-ENOMEM); + } + + rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN, + NULL, LCK_NL, &lock_flags, NULL, + ldlm_completion_ast, NULL, NULL, 0, + LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock); + LASSERT (rc == ELDLM_OK); + + obd->obd_vars = lprocfs_echo_obd_vars; + if (lprocfs_obd_setup(obd) == 0 && + lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "read_bytes", "bytes"); + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "write_bytes", "bytes"); + } + + ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "echo_ldlm_cb_client", &obd->obd_ldlm_client); + RETURN(0); +} + +static int echo_cleanup(struct 
obd_device *obd) +{ + int leaked; + ENTRY; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + + ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL); + + /* XXX Bug 3413; wait for a bit to ensure the BL callback has + * happened before calling ldlm_namespace_free() */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + leaked = atomic_read(&obd->u.echo.eo_prep); + if (leaked != 0) + CERROR("%d prep/commitrw pages leaked\n", leaked); + + RETURN(0); +} + +struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_create = echo_create, + .o_destroy = echo_destroy, + .o_getattr = echo_getattr, + .o_setattr = echo_setattr, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, + .o_setup = echo_setup, + .o_cleanup = echo_cleanup +}; + +void echo_persistent_pages_fini(void) +{ + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) + if (echo_persistent_pages[i] != NULL) { + __free_page(echo_persistent_pages[i]); + echo_persistent_pages[i] = NULL; + } +} + +int echo_persistent_pages_init(void) +{ + struct page *pg; + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) { + gfp_t gfp_mask = (i < ECHO_PERSISTENT_PAGES/2) ? + GFP_KERNEL : GFP_HIGHUSER; + + pg = alloc_page(gfp_mask); + if (pg == NULL) { + echo_persistent_pages_fini(); + return -ENOMEM; + } + + memset(kmap(pg), 0, PAGE_SIZE); + kunmap(pg); + + echo_persistent_pages[i] = pg; + } + + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c new file mode 100644 index 0000000000000..1b558f7e0e641 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -0,0 +1,3120 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SERVER_SUPPORT +# include + +#define ETI_NAME_LEN 20 + +#endif /* HAVE_SERVER_SUPPORT */ + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct lu_site *ed_site; + struct lu_device *ed_next; + int ed_next_ismd; + struct lu_client_seq *ed_cl_seq; +#ifdef HAVE_SERVER_SUPPORT + struct local_oid_storage *ed_los; + struct lu_fid ed_root_fid; +#endif /* HAVE_SERVER_SUPPORT */ +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_oinfo *eo_oinfo; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_oinfo **eoc_oinfo; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + struct mutex ep_lock; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +#ifdef HAVE_SERVER_SUPPORT +static const char echo_md_root_dir_name[] = "ROOT_ECHO"; + +/** + * In order to use the values of members in struct mdd_device, + * we define an alias structure here. + */ +struct echo_md_device { + struct md_device emd_md_dev; + struct obd_export *emd_child_exp; + struct dt_device *emd_child; + struct dt_device *emd_bottom; + struct lu_fid emd_root_fid; + struct lu_fid emd_local_root_fid; +}; +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obddev); + + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of0(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +#ifdef HAVE_SERVER_SUPPORT +static inline struct echo_md_device *lu2emd_dev(struct lu_device *d) +{ + return 
container_of0(d, struct echo_md_device, emd_md_dev.md_lu_dev); +} + +static inline struct lu_device *emd2lu_dev(struct echo_md_device *d) +{ + return &d->emd_md_dev.md_lu_dev; +} + +static inline struct seq_server_site *echo_md_seq_site(struct echo_md_device *d) +{ + return emd2lu_dev(d)->ld_site->ld_seq_site; +} + +static inline struct obd_device *emd2obd_dev(struct echo_md_device *d) +{ + return d->emd_md_dev.md_lu_dev.ld_obd; +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** @} echo_helpers */ + +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + struct lustre_md eti_md; + + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock eti_lock; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; +#ifdef HAVE_SERVER_SUPPORT + struct md_op_spec eti_spec; + struct lov_mds_md_v3 eti_lmm; + struct lov_user_md_v3 eti_lum; + struct md_attr eti_ma; + struct lu_name eti_lname; + /* per-thread values, can be re-used */ + void *eti_big_lmm; /* may be vmalloc'd */ + int eti_big_lmmsize; + char eti_name[ETI_NAME_LEN]; + struct lu_buf eti_buf; + /* If we want to test large ACL, then need to enlarge the buffer. */ + char eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE_OLD]; +#endif +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; +/* static struct kmem_cache *echo_req_kmem; */ + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof (struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof (struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof (struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof (struct echo_session_info) + }, + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. 
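+ * Each echo page slice carries a per-page mutex (ep_lock): cpo_own and
+ * cpo_disown take and release it, and cpo_is_vmlocked reports whether it
+ * is currently held.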
+ * + * @{ + */ +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) + mutex_lock(&ep->ep_lock); + else if (!mutex_trylock(&ep->ep_lock)) + return -EAGAIN; + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(mutex_is_locked(&ep->ep_lock)); + mutex_unlock(&ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (mutex_is_locked(&cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + ENTRY; + + atomic_dec(&eco->eo_npages); + put_page(slice->cpl_page->cp_vmpage); + EXIT; +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, mutex_is_locked(&ep->ep_lock), + slice->cpl_page->cp_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } +}; +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + ENTRY; + + get_page(page->cp_vmpage); + mutex_init(&ep->ep_lock); + cl_page_slice_add(page, &ep->ep_cl, obj, index, &echo_page_ops); + atomic_inc(&eco->eo_npages); + RETURN(0); +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + ENTRY; + + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, GFP_NOFS); + if (el != NULL) { 
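+		/* A new lock slice starts unreferenced and unlinked; references
+		 * and the ec_locks linkage are added later in cl_echo_enqueue0(). */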
+ cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + RETURN(el == NULL ? -ENOMEM : 0); +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + ENTRY; + + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (below == NULL) + RETURN(-ENOMEM); + lu_object_add(obj, below); + } + + if (!ed->ed_next_ismd) { + const struct cl_object_conf *cconf = lu2cl_conf(conf); + struct echo_object_conf *econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_oinfo != NULL); + + /* Transfer the oinfo pointer to eco that it won't be + * freed. */ + eco->eo_oinfo = *econf->eoc_oinfo; + *econf->eoc_oinfo = NULL; + } else { + eco->eo_oinfo = NULL; + } + + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + RETURN(0); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + ENTRY; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + if (eco->eo_oinfo != NULL) + OBD_FREE_PTR(eco->eo_oinfo); + + OBD_SLAB_FREE_PTR(eco, echo_object_kmem); + EXIT; +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops lu_device operations + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + ENTRY; + + /* we're the top dev. 
*/ + LASSERT(hdr == NULL); + OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS); + if (eco != NULL) { + struct cl_object_header *hdr = &eco->eo_hdr; + + obj = &echo_obj2cl(eco)->co_lu; + cl_object_header_init(hdr); + hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); + + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + eco->eo_cl.co_ops = &echo_cl_obj_ops; + obj->lo_ops = &echo_lu_obj_ops; + } + RETURN(obj); +} + +static struct lu_device_operations echo_device_lu_ops = { + .ldo_object_alloc = echo_object_alloc, +}; + +/** @} echo_lu_dev_ops */ + +/** \defgroup echo_init Setup and teardown + * + * Init and fini functions for echo client. + * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initialize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) { + cl_site_fini(site); + return rc; + } + + ed->ed_site = &site->cs_lu; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + if (!ed->ed_next_ismd) + lu_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +#ifdef HAVE_SERVER_SUPPORT +# define ECHO_SEQ_WIDTH 0xffffffff +static int echo_fid_init(struct echo_device *ed, char *obd_name, + struct seq_server_site *ss) +{ + char *prefix; + int rc; + ENTRY; + + OBD_ALLOC_PTR(ed->ed_cl_seq); + if (ed->ed_cl_seq == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(ed->ed_cl_seq, NULL, + LUSTRE_SEQ_METADATA, + prefix, ss->ss_server_seq); + ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH; + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + GOTO(out_free_seq, rc); + + RETURN(0); + +out_free_seq: + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + RETURN(rc); +} + +static int echo_fid_fini(struct obd_device *obddev) +{ + struct echo_device *ed = 
obd2echo_dev(obddev); + ENTRY; + + if (ed->ed_cl_seq != NULL) { + seq_client_fini(ed->ed_cl_seq); + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + } + + RETURN(0); +} + +static void echo_ed_los_fini(const struct lu_env *env, struct echo_device *ed) +{ + ENTRY; + + if (ed != NULL && ed->ed_next_ismd && ed->ed_los != NULL) { + local_oid_storage_fini(env, ed->ed_los); + ed->ed_los = NULL; + } +} + +static int +echo_md_local_file_create(const struct lu_env *env, struct echo_md_device *emd, + struct local_oid_storage *los, + const struct lu_fid *pfid, const char *name, + __u32 mode, struct lu_fid *fid) +{ + struct dt_object *parent = NULL; + struct dt_object *dto = NULL; + int rc = 0; + ENTRY; + + LASSERT(!fid_is_zero(pfid)); + parent = dt_locate(env, emd->emd_bottom, pfid); + if (unlikely(IS_ERR(parent))) + RETURN(PTR_ERR(parent)); + + /* create local file with @fid */ + dto = local_file_find_or_create_with_fid(env, emd->emd_bottom, fid, + parent, name, mode); + if (IS_ERR(dto)) + GOTO(out_put, rc = PTR_ERR(dto)); + + *fid = *lu_object_fid(&dto->do_lu); + /* since stack is not fully set up the local_storage uses own stack + * and we should drop its object from cache */ + dt_object_put_nocache(env, dto); + + EXIT; +out_put: + dt_object_put(env, parent); + RETURN(rc); +} + +static int +echo_md_root_get(const struct lu_env *env, struct echo_md_device *emd, + struct echo_device *ed) +{ + struct lu_fid fid; + int rc = 0; + ENTRY; + + /* Setup local dirs */ + fid.f_seq = FID_SEQ_LOCAL_NAME; + fid.f_oid = 1; + fid.f_ver = 0; + rc = local_oid_storage_init(env, emd->emd_bottom, &fid, &ed->ed_los); + if (rc != 0) + RETURN(rc); + + lu_echo_root_fid(&fid); + if (echo_md_seq_site(emd)->ss_node_id == 0) { + rc = echo_md_local_file_create(env, emd, ed->ed_los, + &emd->emd_local_root_fid, + echo_md_root_dir_name, S_IFDIR | + S_IRUGO | S_IWUSR | S_IXUGO, + &fid); + if (rc != 0) { + CERROR("%s: create md echo root fid failed: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out_los, rc); + } + } + ed->ed_root_fid = fid; + + RETURN(0); +out_los: + echo_ed_los_fini(env, ed); + + RETURN(rc); +} +#endif /* HAVE_SERVER_SUPPORT */ + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + ENTRY; + + OBD_ALLOC_PTR(ed); + if (ed == NULL) + GOTO(out, rc = -ENOMEM); + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + GOTO(out, rc); + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + + cleanup = 2; + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (tgt == NULL) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -ENODEV); + } + + next = tgt->obd_lu_dev; + + if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + ed->ed_next_ismd = 1; + } else if (strcmp(tgt->obd_type->typ_name, LUSTRE_OST_NAME) == 0 || + strcmp(tgt->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) { + ed->ed_next_ismd = 0; + rc = echo_site_init(env, ed); + if (rc) + GOTO(out, rc); + } else { + GOTO(out, rc = -EINVAL); + } + + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + GOTO(out, rc); + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + if (ed->ed_next_ismd) { 
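+		/* Metadata case: only possible with HAVE_SERVER_SUPPORT. The
+		 * branch below locates the named MD layer on the MDS stack,
+		 * reuses its lu_site and sets up the echo root FID. */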
+#ifdef HAVE_SERVER_SUPPORT + /* Suppose to connect to some Metadata layer */ + struct lu_site *ls = NULL; + struct lu_device *ld = NULL; + struct md_device *md = NULL; + struct echo_md_device *emd = NULL; + int found = 0; + + if (next == NULL) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + tgt_type_name = lustre_cfg_string(cfg, 2); + if (!tgt_type_name) { + CERROR("%s no type name for echo %s setup\n", + lustre_cfg_string(cfg, 1), + tgt->obd_type->typ_name); + GOTO(out, rc = -EINVAL); + } + + ls = next->ld_site; + + spin_lock(&ls->ls_ld_lock); + list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) { + if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) { + found = 1; + break; + } + } + spin_unlock(&ls->ls_ld_lock); + + if (found == 0) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + next = ld; + /* For MD echo client, it will use the site in MDS stack */ + ed->ed_site = ls; + ed->ed_cl.cd_lu_dev.ld_site = ls; + rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls)); + if (rc) { + CERROR("echo fid init error %d\n", rc); + GOTO(out, rc); + } + + md = lu2md_dev(next); + emd = lu2emd_dev(&md->md_lu_dev); + rc = echo_md_root_get(env, emd, ed); + if (rc != 0) { + CERROR("%s: get root error: rc = %d\n", + emd2obd_dev(emd)->obd_name, rc); + GOTO(out, rc); + } +#else /* !HAVE_SERVER_SUPPORT */ + CERROR("Local operations are NOT supported on client side. " + "Only remote operations are supported. Metadata client " + "must be run on server side.\n"); + GOTO(out, rc = -EOPNOTSUPP); +#endif /* HAVE_SERVER_SUPPORT */ + } else { + /* if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next != NULL) { + LASSERT(next != NULL); + if (next->ld_site != NULL) + GOTO(out, rc = -EBUSY); + + next->ld_site = ed->ed_site; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + GOTO(out, rc); + } else + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + + ed->ed_next = next; + RETURN(&cd->cd_lu_dev); +out: + switch(cleanup) { + case 4: { + int rc2; + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + + case 3: + echo_site_fini(env, ed); + case 2: + cl_device_fini(&ed->ed_cl); + case 1: + OBD_FREE_PTR(ed); + case 0: + default: + break; + } + return(ERR_PTR(rc)); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_release(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be 
freed, next = %p\n", + ed, next); + + lu_site_purge(env, ed->ed_site, -1); + + /* check if there are objects still alive. + * It shouldn't have any object because lu_site_purge would cleanup + * all of cached objects. Anyway, probably the echo device is being + * parallelly accessed. + */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, ed->ed_site, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped. */ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR("echo_client still has objects at cleanup time, " + "wait for 1 second\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + lu_site_purge(env, ed->ed_site, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); +#ifdef HAVE_SERVER_SUPPORT + echo_fid_fini(d->ld_obd); + echo_ed_los_fini(env, ed); +#endif + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == d->ld_site); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + cl_env_cache_purge(~0); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object * +cl_echo_object_find(struct echo_device *d, const struct ost_id *oi) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct echo_object *eco; + struct cl_object *obj; + struct lov_oinfo *oinfo = NULL; + struct lu_fid *fid; + __u16 refcheck; + int rc; + ENTRY; + + LASSERTF(ostid_id(oi) != 0, DOSTID"\n", POSTID(oi)); + LASSERTF(ostid_seq(oi) == FID_SEQ_ECHO, DOSTID"\n", POSTID(oi)); + + /* Never return an object if the obd is to be freed. */ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + RETURN(ERR_PTR(-ENODEV)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN((void *)env); + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + OBD_ALLOC_PTR(oinfo); + if (oinfo == NULL) + GOTO(out, eco = ERR_PTR(-ENOMEM)); + + oinfo->loi_oi = *oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } + + /* If echo_object_init() is successful then ownership of oinfo + * is transferred to the object. 
*/ + conf->eoc_oinfo = &oinfo; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, oi, 0); + if (rc != 0) + GOTO(out, eco = ERR_PTR(rc)); + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) + GOTO(out, eco = (void*)obj); + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + if (oinfo != NULL) + OBD_FREE_PTR(oinfo); + + cl_env_put(env, &refcheck); + RETURN(eco); +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* an external function to kill an object? */ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + RETURN(0); +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + u64 start, u64 end, int mode, + __u64 *cookie , __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + ENTRY; + + info = echo_env_info(env); + io = &info->eti_io; + lck = &info->eti_lock; + obj = echo_obj2cl(eco); + + memset(lck, 0, sizeof(*lck)); + descr = &lck->cll_descr; + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + rc = cl_lock_request(env, io, lck); + if (rc == 0) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } + RETURN(rc); +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + ENTRY; + + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each(el, &ec->ec_locks) { + ecl = list_entry(el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + RETURN(-ENOENT); + + echo_lock_release(env, ecl, still_used); + RETURN(0); +} + +static void echo_commit_callback(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + struct echo_thread_info *info; + struct cl_2queue *queue; + + info = echo_env_info(env); + LASSERT(io == &info->eti_io); + + queue = &info->eti_queue; + cl_page_list_add(&queue->c2_qout, page); +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = 
eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int rc; + int i; + __u16 refcheck; + ENTRY; + + LASSERT((offset & ~PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + GOTO(out, rc); + LASSERT(rc == 0); + + + rc = cl_echo_enqueue0(env, eco, offset, + offset + npages * PAGE_SIZE - 1, + rw == READ ? LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + GOTO(error_lock, rc); + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_io_commit_async(env, io, &queue->c2_qin, + 0, PAGE_SIZE, + echo_commit_callback); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); + EXIT; +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + + +static u64 last_object_id; + +#ifdef HAVE_SERVER_SUPPORT +static inline void echo_md_build_name(struct lu_name *lname, char *name, + __u64 id) +{ + snprintf(name, ETI_NAME_LEN, "%llu", id); + lname->ln_name = name; + lname->ln_namelen = strlen(name); +} + +/* similar to mdt_attr_get_complex */ +static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + int rc; + + ENTRY; + + LASSERT(ma->ma_lmm_size > 0); + + LASSERT(ma->ma_need & (MA_LOV | MA_LMV)); + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LMV); + + if (rc < 0) + RETURN(rc); + + /* big_lmm may need to be grown */ + if (info->eti_big_lmmsize < rc) { + int size = size_roundup_power2(rc); + + if (info->eti_big_lmmsize > 0) { + /* free old buffer */ + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, + info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; + } + + OBD_ALLOC_LARGE(info->eti_big_lmm, size); + if (info->eti_big_lmm == NULL) + RETURN(-ENOMEM); + info->eti_big_lmmsize = size; + } + LASSERT(info->eti_big_lmmsize >= rc); + + info->eti_buf.lb_buf = info->eti_big_lmm; + info->eti_buf.lb_len = info->eti_big_lmmsize; + if (ma->ma_need & MA_LOV) + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV); + else + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LMV); + if (rc < 0) + RETURN(rc); + + if (ma->ma_need & MA_LOV) + ma->ma_valid |= 
MA_LOV; + else + ma->ma_valid |= MA_LMV; + + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = rc; + + RETURN(0); +} + +static int echo_attr_get_complex(const struct lu_env *env, + struct md_object *next, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_buf *buf = &info->eti_buf; + umode_t mode = lu_object_attr(&next->mo_lu); + int need = ma->ma_need; + int rc = 0, rc2; + + ENTRY; + + ma->ma_valid = 0; + + if (need & MA_INODE) { + ma->ma_need = MA_INODE; + rc = mo_attr_get(env, next, ma); + if (rc) + GOTO(out, rc); + ma->ma_valid |= MA_INODE; + } + + if (need & MA_LOV) { + if (S_ISREG(mode) || S_ISDIR(mode)) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LOV; + } else if (rc2 == -ENODATA) { + /* no LOV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + } + + if (need & MA_LMV && S_ISDIR(mode)) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LMV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LMV; + } else if (rc2 == -ENODATA) { + /* no LMV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + +#ifdef CONFIG_FS_POSIX_ACL + if (need & MA_ACL_DEF && S_ISDIR(mode)) { + buf->lb_buf = ma->ma_acl; + buf->lb_len = ma->ma_acl_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT); + if (rc2 > 0) { + ma->ma_acl_size = rc2; + ma->ma_valid |= MA_ACL_DEF; + } else if (rc2 == -ENODATA) { + /* no ACLs */ + ma->ma_acl_size = 0; + } else { + GOTO(out, rc = rc2); + } + } +#endif +out: + ma->ma_need = need; + CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = %#llx ma_lmm=%p\n", + rc, ma->ma_valid, ma->ma_lmm); + RETURN(rc); +} + +static int +echo_md_create_internal(const struct lu_env *env, struct echo_device *ed, + struct md_object *parent, struct lu_fid *fid, + struct lu_name *lname, struct md_op_spec *spec, + struct md_attr *ma) +{ + struct lu_object *ec_child, *child; + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid2 = &info->eti_fid2; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + int rc; + + ENTRY; + + rc = mdo_lookup(env, parent, lname, fid2, spec); + if (rc == 0) + return -EEXIST; + else if (rc != -ENOENT) + return rc; + + ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, + fid, &conf); + if (IS_ERR(ec_child)) { + CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid), + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child "DFID"\n", PFID(fid)); + GOTO(out_put, rc = -EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + /* + * Do not perform lookup sanity check. We know that name does not exist. 
+ */ + spec->sp_cr_lookup = 0; + rc = mdo_create(env, parent, lname, lu2md(child), spec, ma); + if (rc) { + CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc = %d\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc); + EXIT; +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + ma->ma_lmm = (void *)&info->eti_lmm; + ma->ma_lmm_size = sizeof(info->eti_lmm); + } else { + LASSERT(info->eti_big_lmmsize); + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = info->eti_big_lmmsize; + } + + return 0; +} + +static int +echo_md_dir_stripe_choose(const struct lu_env *env, struct echo_device *ed, + struct lu_object *obj, const char *name, + unsigned int namelen, __u64 id, + struct lu_object **new_parent) +{ + struct echo_thread_info *info = echo_env_info(env); + struct md_attr *ma = &info->eti_ma; + struct lmv_mds_md_v1 *lmv; + struct lu_device *ld = ed->ed_next; + unsigned int idx; + struct lu_name tmp_ln_name; + struct lu_fid stripe_fid; + struct lu_object *stripe_obj; + int rc; + + LASSERT(obj != NULL); + LASSERT(S_ISDIR(obj->lo_header->loh_attr)); + + memset(ma, 0, sizeof(*ma)); + echo_set_lmm_size(env, ld, ma); + ma->ma_need = MA_LMV; + rc = echo_attr_get_complex(env, lu2md(obj), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(obj)), rc); + return rc; + } + + if (!(ma->ma_valid & MA_LMV)) { + *new_parent = obj; + return 0; + } + + lmv = (struct lmv_mds_md_v1 *)ma->ma_lmm; + if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1) { + rc = -EINVAL; + CERROR("Invalid mds md magic %x "DFID": rc = %d\n", + le32_to_cpu(lmv->lmv_magic), PFID(lu_object_fid(obj)), + rc); + return rc; + } + + if (name != NULL) { + tmp_ln_name.ln_name = name; + tmp_ln_name.ln_namelen = namelen; + } else { + LASSERT(id != -1); + echo_md_build_name(&tmp_ln_name, info->eti_name, id); + } + + idx = lmv_name_to_stripe_index(LMV_HASH_TYPE_FNV_1A_64, + le32_to_cpu(lmv->lmv_stripe_count), + tmp_ln_name.ln_name, tmp_ln_name.ln_namelen); + + LASSERT(idx < le32_to_cpu(lmv->lmv_stripe_count)); + fid_le_to_cpu(&stripe_fid, &lmv->lmv_stripe_fids[idx]); + + stripe_obj = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, &stripe_fid, + NULL); + if (IS_ERR(stripe_obj)) { + rc = PTR_ERR(stripe_obj); + CERROR("Can not find the parent "DFID": rc = %d\n", + PFID(&stripe_fid), rc); + return rc; + } + + *new_parent = lu_object_locate(stripe_obj->lo_header, ld->ld_type); + if (*new_parent == NULL) { + lu_object_put(env, stripe_obj); + RETURN(-ENXIO); + } + + return rc; +} + +static int echo_create_md_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + struct lu_fid *fid, + char *name, int namelen, + __u64 id, __u32 mode, int count, + int stripe_count, int stripe_offset) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_op_spec *spec = &info->eti_spec; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + rc = 
echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + LASSERT(new_parent != NULL); + memset(ma, 0, sizeof(*ma)); + memset(spec, 0, sizeof(*spec)); + echo_set_lmm_size(env, ld, ma); + if (stripe_count != 0) { + spec->sp_cr_flags |= FMODE_WRITE; + if (stripe_count != -1) { + if (S_ISDIR(mode)) { + struct lmv_user_md *lmu; + + lmu = (struct lmv_user_md *)&info->eti_lum; + lmu->lum_magic = LMV_USER_MAGIC; + lmu->lum_stripe_offset = stripe_offset; + lmu->lum_stripe_count = stripe_count; + lmu->lum_hash_type = LMV_HASH_TYPE_FNV_1A_64; + spec->u.sp_ea.eadata = lmu; + spec->u.sp_ea.eadatalen = sizeof(*lmu); + } else { + struct lov_user_md_v3 *lum = &info->eti_lum; + + lum->lmm_magic = LOV_USER_MAGIC_V3; + lum->lmm_stripe_count = stripe_count; + lum->lmm_stripe_offset = stripe_offset; + lum->lmm_pattern = LOV_PATTERN_NONE; + spec->u.sp_ea.eadata = lum; + spec->u.sp_ea.eadatalen = sizeof(*lum); + } + spec->sp_cr_flags |= MDS_OPEN_HAS_EA; + } + } + + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME | LA_MODE; + ma->ma_attr.la_ctime = cfs_time_current_64(); + + if (name != NULL) { + lname->ln_name = name; + lname->ln_namelen = namelen; + /* If name is specified, only create one object by name */ + rc = echo_md_create_internal(env, ed, lu2md(new_parent), fid, + lname, spec, ma); + GOTO(out_put, rc); + } + + /* Create multiple object sequenced by id */ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_create_internal(env, ed, lu2md(new_parent), + fid, lname, spec, ma); + if (rc) { + CERROR("Can not create child %s: rc = %d\n", tmp_name, + rc); + break; + } + id++; + fid->f_oid++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_md_lookup(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_object *child; + int rc; + ENTRY; + + CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name, + PFID(fid), parent); + + rc = mdo_lookup(env, parent, lname, fid, NULL); + if (rc) { + CERROR("lookup %s: rc = %d\n", lname->ln_name, rc); + RETURN(ERR_PTR(rc)); + } + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + + RETURN(child); +} + +static int echo_setattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_device *ld = ed->ed_next; + struct lu_buf *buf = &info->eti_buf; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + echo_md_build_name(lname, name, id); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + rc = PTR_ERR(ec_child); + CERROR("Can't find child %s: rc = %d\n", + 
lname->ln_name, rc); + break; + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + rc = -EINVAL; + break; + } + + CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n", + PFID(lu_object_fid(child))); + + buf->lb_buf = info->eti_xattr_buf; + buf->lb_len = sizeof(info->eti_xattr_buf); + + sprintf(name, "%s.test1", XATTR_USER_PREFIX); + rc = mo_xattr_set(env, lu2md(child), buf, name, + LU_XATTR_CREATE); + if (rc < 0) { + CERROR("Can not setattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_getattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF; + ma->ma_acl = info->eti_xattr_buf; + ma->ma_acl_size = sizeof(info->eti_xattr_buf); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + ma->ma_valid = 0; + echo_md_build_name(lname, name, id); + echo_set_lmm_size(env, ld, ma); + + ec_child = echo_md_lookup(env, ed, lu2md(new_parent), lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", + lname->ln_name, PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + RETURN(-EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n", + PFID(lu_object_fid(child))); + rc = echo_attr_get_complex(env, lu2md(child), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static int echo_lookup_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct lu_object *new_parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_fid *fid = &info->eti_fid; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + return -ENXIO; + + rc = echo_md_dir_stripe_choose(env, ed, parent, NULL, 0, id, + &new_parent); + if (rc != 0) + RETURN(rc); + + /*prepare the requests*/ + 
for (i = 0; i < count; i++) { + echo_md_build_name(lname, name, id); + + CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + new_parent); + + rc = mdo_lookup(env, lu2md(new_parent), lname, fid, NULL); + if (rc) { + CERROR("Can not lookup child %s: rc = %d\n", name, rc); + break; + } + + CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n", + PFID(lu_object_fid(new_parent)), lname->ln_name, + new_parent); + + id++; + } + + if (new_parent != parent) + lu_object_put(env, new_parent); + + return rc; +} + +static int echo_md_destroy_internal(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname, + struct md_attr *ma) +{ + struct lu_device *ld = ed->ed_next; + struct lu_object *ec_child; + struct lu_object *child; + int rc; + + ENTRY; + + ec_child = echo_md_lookup(env, ed, parent, lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", lname->ln_name, + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + GOTO(out_put, rc = -EINVAL); + } + + if (lu_object_remote(child)) { + CERROR("Can not destroy remote object %s: rc = %d\n", + lname->ln_name, -EPERM); + GOTO(out_put, rc = -EPERM); + } + CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", + lname->ln_name, rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_destroy_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + char *name, int namelen, + __u64 id, __u32 mode, + int count) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + struct lu_object *parent; + struct lu_object *new_parent; + int rc = 0; + int i; + ENTRY; + + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-EINVAL); + + rc = echo_md_dir_stripe_choose(env, ed, parent, name, namelen, + id, &new_parent); + if (rc != 0) + RETURN(rc); + + memset(ma, 0, sizeof(*ma)); + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME; + ma->ma_attr.la_ctime = cfs_time_current_64(); + ma->ma_need = MA_INODE; + ma->ma_valid = 0; + + if (name != NULL) { + lname->ln_name = name; + lname->ln_namelen = namelen; + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + GOTO(out_put, rc); + } + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + ma->ma_valid = 0; + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_destroy_internal(env, ed, lu2md(new_parent), lname, + ma); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", name, rc); + break; + } + id++; + } + +out_put: + if (new_parent != parent) + lu_object_put(env, new_parent); + + RETURN(rc); +} + +static struct lu_object *echo_resolve_path(const struct lu_env *env, + struct echo_device *ed, char *path, + int path_len) +{ + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid 
*fid = &info->eti_fid; + struct lu_name *lname = &info->eti_lname; + struct lu_object *parent = NULL; + struct lu_object *child = NULL; + int rc = 0; + ENTRY; + + *fid = ed->ed_root_fid; + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + if (IS_ERR(parent)) { + CERROR("Can not find the parent "DFID": rc = %ld\n", + PFID(fid), PTR_ERR(parent)); + RETURN(parent); + } + + while (1) { + struct lu_object *ld_parent; + char *e; + + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + + lname->ln_name = e; + lname->ln_namelen = strlen(e); + + ld_parent = lu_object_locate(parent->lo_header, ld->ld_type); + if (ld_parent == NULL) { + lu_object_put(env, parent); + rc = -EINVAL; + break; + } + + child = echo_md_lookup(env, ed, lu2md(ld_parent), lname); + lu_object_put(env, parent); + if (IS_ERR(child)) { + rc = (int)PTR_ERR(child); + CERROR("lookup %s under parent "DFID": rc = %d\n", + lname->ln_name, PFID(lu_object_fid(ld_parent)), + rc); + break; + } + parent = child; + } + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(parent); +} + +static void echo_ucred_init(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + + ucred->uc_valid = UCRED_INVALID; + + ucred->uc_suppgids[0] = -1; + ucred->uc_suppgids[1] = -1; + + ucred->uc_uid = ucred->uc_o_uid = + from_kuid(&init_user_ns, current_uid()); + ucred->uc_gid = ucred->uc_o_gid = + from_kgid(&init_user_ns, current_gid()); + ucred->uc_fsuid = ucred->uc_o_fsuid = + from_kuid(&init_user_ns, current_fsuid()); + ucred->uc_fsgid = ucred->uc_o_fsgid = + from_kgid(&init_user_ns, current_fsgid()); + ucred->uc_cap = cfs_curproc_cap_pack(); + + /* remove fs privilege for non-root user. 
*/ + if (ucred->uc_fsuid) + ucred->uc_cap &= ~CFS_CAP_FS_MASK; + ucred->uc_valid = UCRED_NEW; +} + +static void echo_ucred_fini(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + ucred->uc_valid = UCRED_INIT; +} + +#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD) +#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION | LCT_SERVER_SESSION) +static int echo_md_handler(struct echo_device *ed, int command, + char *path, int path_len, __u64 id, int count, + struct obd_ioctl_data *data) +{ + struct echo_thread_info *info; + struct lu_device *ld = ed->ed_next; + struct lu_env *env; + __u16 refcheck; + struct lu_object *parent; + char *name = NULL; + int namelen = data->ioc_plen2; + int rc = 0; + ENTRY; + + if (ld == NULL) { + CERROR("MD echo client is not being initialized properly\n"); + RETURN(-EINVAL); + } + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + CERROR("Only support MDD layer right now!\n"); + RETURN(-EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG); + if (rc != 0) + GOTO(out_env, rc); + + /* init big_lmm buffer */ + info = echo_env_info(env); + LASSERT(info->eti_big_lmm == NULL); + OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE); + if (info->eti_big_lmm == NULL) + GOTO(out_env, rc = -ENOMEM); + info->eti_big_lmmsize = MIN_MD_SIZE; + + parent = echo_resolve_path(env, ed, path, path_len); + if (IS_ERR(parent)) { + CERROR("Can not resolve the path %s: rc = %ld\n", path, + PTR_ERR(parent)); + GOTO(out_free, rc = PTR_ERR(parent)); + } + + if (namelen > 0) { + OBD_ALLOC(name, namelen + 1); + if (name == NULL) + GOTO(out_put, rc = -ENOMEM); + if (copy_from_user(name, data->ioc_pbuf2, namelen)) + GOTO(out_name, rc = -EFAULT); + } + + echo_ucred_init(env); + + switch (command) { + case ECHO_MD_CREATE: + case ECHO_MD_MKDIR: { + struct echo_thread_info *info = echo_env_info(env); + __u32 mode = data->ioc_obdo2.o_mode; + struct lu_fid *fid = &info->eti_fid; + int stripe_count = (int)data->ioc_obdo2.o_misc; + int stripe_index = (int)data->ioc_obdo2.o_stripe_idx; + + rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0); + if (rc != 0) + break; + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + rc = echo_create_md_object(env, ed, parent, fid, name, namelen, + id, mode, count, stripe_count, + stripe_index); + break; + } + case ECHO_MD_DESTROY: + case ECHO_MD_RMDIR: { + __u32 mode = data->ioc_obdo2.o_mode; + + rc = echo_destroy_object(env, ed, parent, name, namelen, + id, mode, count); + break; + } + case ECHO_MD_LOOKUP: + rc = echo_lookup_object(env, ed, parent, id, count); + break; + case ECHO_MD_GETATTR: + rc = echo_getattr_object(env, ed, parent, id, count); + break; + case ECHO_MD_SETATTR: + rc = echo_setattr_object(env, ed, parent, id, count); + break; + default: + CERROR("unknown command %d\n", command); + rc = -EINVAL; + break; + } + echo_ucred_fini(env); + +out_name: + if (name != NULL) + OBD_FREE(name, namelen + 1); +out_put: + lu_object_put(env, parent); +out_free: + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; +out_env: + cl_env_put(env, &refcheck); + return rc; +} +#endif /* HAVE_SERVER_SUPPORT */ + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + int created = 0; + 
int rc; + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + !fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) == 0) { + rc = ostid_set_id(&oa->o_oi, ++last_object_id); + if (rc) + GOTO(failed, rc); + } + + rc = obd_create(env, ec->ec_exp, oa); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + GOTO(failed, rc); + } + + created = 1; + + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &oa->o_oi); + if (IS_ERR(eco)) + GOTO(failed, rc = PTR_ERR(eco)); + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + EXIT; + +failed: + if (created && rc != 0) + obd_destroy(env, ec->ec_exp, oa); + + if (rc != 0) + CERROR("create object failed with: rc = %d\n", rc); + + return rc; +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct echo_object *eco; + int rc; + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID) || + !(oa->o_valid & OBD_MD_FLGROUP) || + ostid_id(&oa->o_oi) == 0) { + CERROR("invalid oid "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + rc = 0; + eco = cl_echo_object_find(ed, &oa->o_oi); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + + RETURN(rc); +} + +static void echo_put_object(struct echo_object *eco) +{ + int rc; + + rc = cl_echo_object_put(eco); + if (rc) + CERROR("%s: echo client drop an object failed: rc = %d\n", + eco->eo_dev->ed_ec->ec_exp->exp_obd->obd_name, rc); +} + +static void echo_client_page_debug_setup(struct page *page, int rw, u64 id, + u64 offset, u64 count) +{ + char *addr; + u64 stripe_off; + u64 stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset + delta; + stripe_id = id; + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int +echo_client_page_debug_check(struct page *page, u64 id, u64 offset, u64 count) +{ + u64 stripe_off; + u64 stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR("Error in echo object %#llx\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, u64 offset, + u64 count, int async) +{ + size_t npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + u64 off; + size_t i; + int rc; + int verify; + gfp_t gfp_mask; + u32 brw_flags = 0; + ENTRY; + + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? 
GFP_KERNEL : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + + if ((count & (~PAGE_MASK)) != 0) + RETURN(-EINVAL); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(pages, npages * sizeof(*pages)); + if (pages == NULL) { + OBD_FREE(pga, npages * sizeof(*pga)); + RETURN(-ENOMEM); + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_SIZE) { + + LASSERT(pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + pgp->pg = alloc_page(gfp_mask); + if (pgp->pg == NULL) + goto out; + + pages[i] = pgp->pg; + pgp->count = PAGE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (pgp->pg == NULL) + continue; + + if (verify) { + int vrc; + vrc = echo_client_page_debug_check(pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + __free_page(pgp->pg); + } + OBD_FREE(pga, npages * sizeof(*pga)); + OBD_FREE(pages, npages * sizeof(*pages)); + RETURN(rc); +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + u64 offset, u64 count, + u64 batch, int async) +{ + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; + int i, ret = 0, brw_flags = 0; + + ENTRY; + + if (count <= 0 || (count & ~PAGE_MASK) != 0) + RETURN(-EINVAL); + + apc = npages = batch >> PAGE_SHIFT; + tot_pages = count >> PAGE_SHIFT; + + OBD_ALLOC(lnb, apc * sizeof(struct niobuf_local)); + if (lnb == NULL) + RETURN(-ENOMEM); + + if (rw == OBD_BRW_WRITE && async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for (; tot_pages > 0; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + rnb.rnb_offset = off; + rnb.rnb_len = npages * PAGE_SIZE; + rnb.rnb_flags = brw_flags; + ioo.ioo_bufcnt = 1; + off += npages * PAGE_SIZE; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, &rnb, &lpages, lnb); + if (ret != 0) + GOTO(out, ret); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].lnb_page; + + /* read past eof? */ + if (page == NULL && lnb[i].lnb_rc == 0) + continue; + + if (async) + lnb[i].lnb_flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(page, rw, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + else + echo_client_page_debug_check(page, + ostid_id(&oa->o_oi), + lnb[i].lnb_file_offset, + lnb[i].lnb_len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, &rnb, npages, lnb, + ret); + if (ret != 0) + break; + + /* Reuse env context. 
*/ + lu_context_exit((struct lu_context *)&env->le_ctx); + lu_context_enter((struct lu_context *)&env->le_ctx); + } + +out: + OBD_FREE(lnb, apc * sizeof(struct niobuf_local)); + + RETURN(ret); +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 0; + long test_mode; + ENTRY; + + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + RETURN(rc); + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (ed->ed_next == NULL && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + if (test_mode == 3) + async = 1; + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + /* fall through */ + case 2: + rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset, + data->ioc_count, async); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, eco, + data->ioc_offset, data->ioc_count, + data->ioc_plen1, async); + break; + default: + rc = -EINVAL; + } + + echo_put_object(eco); + + RETURN(rc); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ +#ifdef HAVE_SERVER_SUPPORT + struct tgt_session_info *tsi; +#endif + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct lu_env *env; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; +#ifdef HAVE_SERVER_SUPPORT + struct lu_context echo_session; +#endif + ENTRY; + + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + RETURN(rc); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_alloc, rc = -ENOMEM); + +#ifdef HAVE_SERVER_SUPPORT + env->le_ses = &echo_session; + rc = lu_context_init(env->le_ses, LCT_SERVER_SESSION | LCT_NOREF); + if (unlikely(rc < 0)) + GOTO(out_env, rc); + lu_context_enter(env->le_ses); + + tsi = tgt_ses_info(env); + tsi->tsi_exp = ec->ec_exp; + tsi->tsi_jobid = NULL; +#endif + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_create_object(env, ed, oa); + GOTO(out, rc); + +#ifdef HAVE_SERVER_SUPPORT + case OBD_IOC_ECHO_MD: { + int count; + int cmd; + char *dir = NULL; + int dirlen; + __u64 id; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + count = data->ioc_count; + cmd = data->ioc_command; + + id = data->ioc_obdo2.o_oi.oi.oi_id; + dirlen = data->ioc_plen1; + OBD_ALLOC(dir, dirlen + 1); + if (dir == NULL) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) { + OBD_FREE(dir, data->ioc_plen1 + 1); + GOTO(out, rc = -EFAULT); + } + + rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data); + OBD_FREE(dir, dirlen + 1); + 
GOTO(out, rc); + } + case OBD_IOC_ECHO_ALLOC_SEQ: { + struct lu_env *cl_env; + __u16 refcheck; + __u64 seq; + int max_count; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + cl_env = cl_env_get(&refcheck); + if (IS_ERR(cl_env)) + GOTO(out, rc = PTR_ERR(cl_env)); + + rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG, + ECHO_MD_SES_TAG); + if (rc != 0) { + cl_env_put(cl_env, &refcheck); + GOTO(out, rc); + } + + rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq); + cl_env_put(cl_env, &refcheck); + if (rc < 0) { + CERROR("%s: Can not alloc seq: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1)) + return -EFAULT; + + max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH; + if (copy_to_user(data->ioc_pbuf2, &max_count, + data->ioc_plen2)) + return -EFAULT; + GOTO(out, rc); + } +#endif /* HAVE_SERVER_SUPPORT */ + case OBD_IOC_DESTROY: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_getattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_SETATTR: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_setattr(env, ec->ec_exp, oa); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_BRW_WRITE: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rw = OBD_BRW_WRITE; + /* fall through */ + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data); + GOTO(out, rc); + + default: + CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO (out, rc = -ENOTTY); + } + + EXIT; +out: +#ifdef HAVE_SERVER_SUPPORT + lu_context_exit(env->le_ses); + lu_context_fini(env->le_ses); +out_env: +#endif + lu_env_fini(env); +out_alloc: + OBD_FREE_PTR(env); + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + ENTRY; + + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD(&ec->ec_objects); + INIT_LIST_HEAD(&ec->ec_locks); + ec->ec_unique = 0; + + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_update(ECHO_MD_CTX_TAG); + lu_session_tags_update(ECHO_MD_SES_TAG); +#else + CERROR("Local operations are NOT supported on client side. " + "Only remote operations are supported. 
Metadata client " + "must be run on server side.\n"); +#endif + RETURN(0); + } + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_FID; + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. */ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return (rc); + } + + RETURN(rc); +} + +static int echo_client_cleanup(struct obd_device *obddev) +{ + struct echo_device *ed = obd2echo_dev(obddev); + struct echo_client_obd *ec = &obddev->u.echo_client; + int rc; + ENTRY; + + /*Do nothing for Metadata echo client*/ + if (ed == NULL ) + RETURN(0); + + if (ed->ed_next_ismd) { +#ifdef HAVE_SERVER_SUPPORT + lu_context_tags_clear(ECHO_MD_CTX_TAG); + lu_session_tags_clear(ECHO_MD_SES_TAG); +#else + CERROR("This is client-side only module, does not support " + "metadata echo client.\n"); +#endif + RETURN(0); + } + + if (!list_empty(&obddev->obd_exports)) { + CERROR("still has clients!\n"); + RETURN(-EBUSY); + } + + LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + RETURN(rc); +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + ENTRY; + rc = class_connect(&conn, src, cluuid); + if (rc == 0) { + *exp = class_conn2export(&conn); + } + + RETURN (rc); +} + +static int echo_client_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + + if (exp == NULL) + GOTO(out, rc = -EINVAL); + + rc = class_disconnect(exp); + GOTO(out, rc); + out: + return rc; +} + +static struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +static int __init obdecho_init(void) +{ + int rc; + + ENTRY; + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + +# ifdef HAVE_SERVER_SUPPORT + rc = echo_persistent_pages_init(); + if (rc != 0) + goto failed_0; + + rc = class_register_type(&echo_obd_ops, NULL, true, NULL, + LUSTRE_ECHO_NAME, NULL); + if (rc != 0) + goto failed_1; +# endif + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL, true, NULL, + LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + +# ifdef HAVE_SERVER_SUPPORT + if (rc == 0) + RETURN(0); + + class_unregister_type(LUSTRE_ECHO_NAME); +failed_1: + echo_persistent_pages_fini(); +failed_0: +# endif + RETURN(rc); +} + +static void __exit obdecho_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); + +#ifdef HAVE_SERVER_SUPPORT + 
class_unregister_type(LUSTRE_ECHO_NAME); + echo_persistent_pages_fini(); +#endif +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Echo Client test driver"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(obdecho_init); +module_exit(obdecho_exit); + +/** @} echo_client */ diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h new file mode 100644 index 0000000000000..8c72c40ebb767 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -0,0 +1,52 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014 Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + +#ifdef HAVE_SERVER_SUPPORT +extern struct obd_ops echo_obd_ops; +int echo_persistent_pages_init(void); +void echo_persistent_pages_fini(void); +#endif /* HAVE_SERVER_SUPPORT */ + +#endif diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c new file mode 100644 index 0000000000000..c7c3f0cd5bb23 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -0,0 +1,845 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "osc_internal.h" + +#ifdef CONFIG_PROC_FS +static int osc_active_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + LPROCFS_CLIMP_CHECK(dev); + seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + return 0; +} + +static ssize_t osc_active_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > 1) + return -ERANGE; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", + (int)val); + + return count; +} +LPROC_SEQ_FOPS(osc_active); + +static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} + +static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int rc; + int adding, added, req_count; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 1 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(dev); + + adding = (int)val - cli->cl_max_rpcs_in_flight; + req_count = atomic_read(&osc_pool_req_count); + if (adding > 0 && req_count < osc_reqpool_maxreqcount) { + /* + * There might be some race which will cause over-limit + * allocation, but it is fine. 
+ */ + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = osc_rq_pool->prp_populate(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_adjust_max_dirty(cli); + spin_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} +LPROC_SEQ_FOPS(osc_max_rpcs_in_flight); + +static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long val; + int mult; + + spin_lock(&cli->cl_loi_list_lock); + val = cli->cl_dirty_max_pages; + spin_unlock(&cli->cl_loi_list_lock); + + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, val, mult); +} + +static ssize_t osc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + int rc; + __s64 pages_number; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number <= 0 || + pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || + pages_number > totalram_pages / 4) /* 1/4 of RAM */ + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(osc_max_dirty_mb); + +static int osc_cached_mb_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); + + return 0; +} + +/* shrink the number of caching pages to a specific number */ +static ssize_t +osc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &dev->u.cli; + __s64 pages_number; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} +LPROC_SEQ_FOPS(osc_cached_mb); + +static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes); + +static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + 
struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_avail_grant); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} + +static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct client_obd *cli = &obd->u.cli; + int rc; + __s64 val; + + if (obd == NULL) + return 0; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + /* this is only for shrinking grant */ + spin_lock(&cli->cl_loi_list_lock); + if (val >= cli->cl_avail_grant) { + spin_unlock(&cli->cl_loi_list_lock); + return 0; + } + + spin_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_CHECK(obd); + if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + LPROCFS_CLIMP_EXIT(obd); + if (rc) + return rc; + return count; +} +LPROC_SEQ_FOPS(osc_cur_grant_bytes); + +static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_lost_grant); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); + +static int osc_cur_dirty_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_dirty_grant); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_dirty_grant_bytes); + +static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + seq_printf(m, "%d\n", + obd->u.cli.cl_grant_shrink_interval); + return 0; +} + +static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int rc; + __s64 val; + + if (obd == NULL) + return 0; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val <= 0 || val > INT_MAX) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + + return count; +} +LPROC_SEQ_FOPS(osc_grant_shrink_interval); + +static int osc_checksum_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 
1 : 0); + return 0; +} + +static ssize_t osc_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int rc; + __s64 val; + + if (obd == NULL) + return 0; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = !!val; + + return count; +} +LPROC_SEQ_FOPS(osc_checksum); + +static int osc_checksum_type_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + int i; + DECLARE_CKSUM_NAME; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == (1 << i)) + seq_printf(m, "[%s] ", cksum_name[i]); + else + seq_printf(m, "%s ", cksum_name[i]); + } + seq_printf(m, "\n"); + return 0; +} + +static ssize_t osc_checksum_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int i; + DECLARE_CKSUM_NAME; + char kernbuf[10]; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (!strcmp(kernbuf, cksum_name[i])) { + obd->u.cli.cl_cksum_type = 1 << i; + return count; + } + } + return -EINVAL; +} +LPROC_SEQ_FOPS(osc_checksum_type); + +static int osc_resend_count_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends)); + return 0; +} + +static ssize_t osc_resend_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + if (val < 0 || val > INT_MAX) + return -EINVAL; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} +LPROC_SEQ_FOPS(osc_resend_count); + +static int osc_checksum_dump_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; + + seq_printf(m, "%d\n", obd->u.cli.cl_checksum_dump ? 1 : 0); + return 0; +} + +static ssize_t osc_checksum_dump_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd; + int rc; + __s64 val; + + obd = ((struct seq_file *)file->private_data)->private; + if (obd == NULL) + return 0; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum_dump = (val ? 
1 : 0); + + return count; +} +LPROC_SEQ_FOPS(osc_checksum_dump); + +static int osc_contention_seconds_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); + + seq_printf(m, "%u\n", od->od_contention_time); + return 0; +} + +static ssize_t osc_contention_seconds_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; + + od->od_contention_time = val; + + return count; +} +LPROC_SEQ_FOPS(osc_contention_seconds); + +static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); + + seq_printf(m, "%u\n", od->od_lockless_truncate); + return 0; +} + +static ssize_t osc_lockless_truncate_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + od->od_lockless_truncate = !!val; + + return count; +} +LPROC_SEQ_FOPS(osc_lockless_truncate); + +static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *obd = m->private; + seq_printf(m, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_destroys_in_flight); + +LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); + +static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", + pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_unstable_stats); + +LPROC_SEQ_FOPS_RO_TYPE(osc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); +LPROC_SEQ_FOPS_RO_TYPE(osc, state); + +LPROC_SEQ_FOPS_WO_TYPE(osc, ping); + +LPROC_SEQ_FOPS_RW_TYPE(osc, import); +LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); + +struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { .name = "uuid", + .fops = &osc_uuid_fops }, + { .name = "ping", + .fops = &osc_ping_fops, + .proc_mode = 0222 }, + { .name = "connect_flags", + .fops = &osc_connect_flags_fops }, + { .name = "blocksize", + .fops = &osc_blksize_fops }, + { .name = "kbytestotal", + .fops = &osc_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &osc_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &osc_kbytesavail_fops }, + { .name = "filestotal", + .fops = &osc_filestotal_fops }, + { .name = "filesfree", + .fops = &osc_filesfree_fops }, + { .name = "ost_server_uuid", + .fops = &osc_server_uuid_fops }, + { .name = "ost_conn_uuid", + .fops = &osc_conn_uuid_fops }, + { .name = 
"active", + .fops = &osc_active_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, + { .name = "max_rpcs_in_flight", + .fops = &osc_max_rpcs_in_flight_fops }, + { .name = "destroys_in_flight", + .fops = &osc_destroys_in_flight_fops }, + { .name = "max_dirty_mb", + .fops = &osc_max_dirty_mb_fops }, + { .name = "osc_cached_mb", + .fops = &osc_cached_mb_fops }, + { .name = "cur_dirty_bytes", + .fops = &osc_cur_dirty_bytes_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, + { .name = "cur_lost_grant_bytes", + .fops = &osc_cur_lost_grant_bytes_fops }, + { .name = "cur_dirty_grant_bytes", + .fops = &osc_cur_dirty_grant_bytes_fops }, + { .name = "grant_shrink_interval", + .fops = &osc_grant_shrink_interval_fops }, + { .name = "checksums", + .fops = &osc_checksum_fops }, + { .name = "checksum_type", + .fops = &osc_checksum_type_fops }, + { .name = "checksum_dump", + .fops = &osc_checksum_dump_fops }, + { .name = "resend_count", + .fops = &osc_resend_count_fops }, + { .name = "timeouts", + .fops = &osc_timeouts_fops }, + { .name = "contention_seconds", + .fops = &osc_contention_seconds_fops }, + { .name = "lockless_truncate", + .fops = &osc_lockless_truncate_fops }, + { .name = "import", + .fops = &osc_import_fops }, + { .name = "state", + .fops = &osc_state_fops }, + { .name = "pinger_recov", + .fops = &osc_pinger_recov_fops }, + { .name = "unstable_stats", + .fops = &osc_unstable_stats_fops }, + { NULL } +}; + +#define pct(a,b) (b ? a * 100 / b : 0) + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + ktime_get_real_ts64(&now); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "read RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + i, r, pct(r, 
read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +#undef pct + +static ssize_t osc_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + + return len; +} +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + ktime_get_real_ts64(&now); + + seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lproc_osc_attach_seqstat(struct obd_device *dev) +{ + int rc; + + rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, dev); + if (rc == 0) + rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644, + &osc_rpc_stats_fops, dev); + + return rc; +} +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c new file mode 100644 index 0000000000000..06da0c5333a3b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -0,0 +1,3323 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * osc cache management. + * + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" +#include "osc_internal.h" + +static int extent_debug; /* set it to be true for more debug */ + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta); +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state); +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc); +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd); +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd); +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc); +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant); + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line); +#define osc_extent_tree_dump(lvl, obj) \ + osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) + +static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, + unsigned int unused); + +/** \addtogroup osc + * @{ + */ + +/* ------------------ osc extent ------------------ */ +static inline char *ext_flags(struct osc_extent *ext, char *flags) +{ + char *buf = flags; + *buf++ = ext->oe_rw ? 'r' : 'w'; + if (ext->oe_intree) + *buf++ = 'i'; + if (ext->oe_sync) + *buf++ = 'S'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +static inline char list_empty_marker(struct list_head *list) +{ + return list_empty(list) ? '-' : '+'; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end +static const char *oes_strings[] = { + "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; + +#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ + struct osc_extent *__ext = (extent); \ + char __buf[16]; \ + \ + CDEBUG(lvl, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? 
'+' : '-', \ + __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ + if (lvl == D_ERROR && __ext->oe_dlmlock != NULL) \ + LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ + else \ + LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ +} while (0) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + if (n == NULL) + return NULL; + + return container_of(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. */ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + size_t page_count; + int rc = 0; + + if (!osc_object_is_locked(obj)) + GOTO(out, rc = 9); + + if (ext->oe_state >= OES_STATE_MAX) + GOTO(out, rc = 10); + + if (atomic_read(&ext->oe_refc) <= 0) + GOTO(out, rc = 20); + + if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + GOTO(out, rc = 30); + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + GOTO(out, rc = 35); + GOTO(out, rc = 0); + break; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) + GOTO(out, rc = 40); + if (ext->oe_hp) + GOTO(out, rc = 50); + if (ext->oe_fsync_wait && !ext->oe_urgent) + GOTO(out, rc = 55); + break; + case OES_CACHE: + if (ext->oe_grants == 0) + GOTO(out, rc = 60); + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) + GOTO(out, rc = 65); + default: + if (atomic_read(&ext->oe_users) > 0) + GOTO(out, rc = 70); + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) + GOTO(out, rc = 80); + + if (ext->oe_sync && ext->oe_grants > 0) + GOTO(out, rc = 90); + + if (ext->oe_dlmlock != NULL && !ldlm_is_failed(ext->oe_dlmlock)) { + struct ldlm_extent *extent; + + extent = &ext->oe_dlmlock->l_policy_data.l_extent; + if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && + extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) + GOTO(out, rc = 100); + + if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) + GOTO(out, rc = 102); + } + + if (ext->oe_nr_pages > ext->oe_mppr) + GOTO(out, rc = 105); + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. 
*/ + if (ext->oe_state > OES_CACHE) + GOTO(out, rc = 0); + + if (!extent_debug) + GOTO(out, rc = 0); + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) + GOTO(out, rc = 110); + } + if (page_count != ext->oe_nr_pages) + GOTO(out, rc = 120); + +out: + if (rc != 0) + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s:%d sanity check %p failed with rc = %d\n", + func, line, ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + + +/** + * sanity check - to make sure there is no overlapped extent in the tree. + */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + LASSERT(osc_object_is_locked(obj)); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (tmp->oe_end >= ext->oe_start && + tmp->oe_start <= ext->oe_end) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + ext->oe_state = state; + wake_up_all(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_NOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + cl_object_get(osc2cl(obj)); + atomic_set(&ext->oe_refc, 1); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_dlmlock = NULL; + + return ext; +} + +static void osc_extent_free(struct osc_extent *ext) +{ + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) >= 0); + atomic_inc(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 0); + if (atomic_dec_and_test(&ext->oe_refc)) { + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(!ext->oe_intree); + + if (ext->oe_dlmlock != NULL) { + lu_ref_add(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; + } + cl_object_put(env, osc2cl(ext->oe_obj)); + osc_extent_free(ext); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 1); + LASSERT(osc_object_is_locked(ext->oe_obj)); + atomic_dec(&ext->oe_refc); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. 
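+ *
+ * Illustrative example (hypothetical extents): with cached extents covering
+ * pages [0..63] and [128..191] in the tree, a search for index 130 returns
+ * [128..191] directly, while a search for index 100 matches neither and
+ * returns [0..63], the closest extent that ends before the requested index.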
+ */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + LASSERT(osc_object_is_locked(obj)); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. + */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. */ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(ext->oe_intree == 0); + LASSERT(ext->oe_obj == obj); + LASSERT(osc_object_is_locked(obj)); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR"\n", EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); + ext->oe_intree = 1; +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + LASSERT(osc_object_is_locked(obj)); + if (ext->oe_intree) { + rb_erase(&ext->oe_node, &obj->oo_root); + ext->oe_intree = 0; + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(osc_object_is_locked(obj)); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at block level. 
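+ *
+ * "Contiguous" is checked at chunk granularity (cl_chunkbits), not page
+ * granularity: e.g. assuming 256 pages per chunk, an extent ending in
+ * chunk 4 may absorb a victim starting in chunk 5. Both extents must be
+ * idle OES_CACHE extents under the same DLM lock and RPC slot (oe_max_end),
+ * their combined size must stay within cl_max_extent_pages, and the merged
+ * extent pays the per-extent grant tax only once.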
+ */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + struct client_obd *cli = osc_cli(obj); + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + LASSERT(osc_object_is_locked(obj)); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + /* overall extent size should not exceed the max supported limit + * reported by the server */ + if (cur->oe_end - cur->oe_start + 1 + + victim->oe_end - victim->oe_start + 1 > cli->cl_max_extent_pages) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + /* per-extent tax should be accounted only once for the whole extent */ + cur->oe_grants += victim->oe_grants - cli->cl_grant_extent_tax; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + list_del_init(&victim->oe_link); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + } else { + int grant = 0; + + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. */ + if (osc_extent_merge(env, ext, prev_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); + + if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + else if (ext->oe_nr_pages == ext->oe_mppr) { + list_move_tail(&ext->oe_link, + &obj->oo_full_exts); + } + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + } + osc_extent_put(env, ext); + RETURN(rc); +} + +static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. 
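+ *
+ * On success a held extent covering @index is returned: either an existing
+ * OES_CACHE extent (possibly grown by a front or rear merge), or a new one
+ * carved out of the caller's reserved @grants and inserted into the tree.
+ * If the target chunk overlaps an extent that is being written out or is
+ * waiting for fsync, the function waits for that extent to drain and then
+ * retries the search. Errors are returned as ERR_PTR().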
+ */ +static struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + unsigned int *grants) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_lock *olck; + struct cl_lock_descr *descr; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + unsigned int max_pages; /* max_pages_per_rpc */ + unsigned int chunksize; + int ppc_bits; /* pages per chunk bits */ + pgoff_t chunk_mask; + int rc; + ENTRY; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + olck = osc_env_io(env)->oi_write_osclock; + LASSERTF(olck != NULL, "page %lu is not covered by lock\n", index); + LASSERT(olck->ols_state == OLS_GRANTED); + + descr = &olck->ols_cl.cls_lock->cll_descr; + LASSERT(descr->cld_mode >= CLM_WRITE); + + LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT, + "chunkbits: %u\n", cli->cl_chunkbits); + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to RPC edge. */ + max_pages = cli->cl_max_pages_per_rpc; + if ((max_pages & ~chunk_mask) != 0) { + CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n", + max_pages, cli->cl_chunkbits, chunk_mask); + RETURN(ERR_PTR(-EINVAL)); + } + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, descr->cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < descr->cld_start) + cur->oe_start = descr->cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_grants = 0; + cur->oe_mppr = max_pages; + if (olck->ols_dlmlock != NULL) { + LASSERT(olck->ols_hold); + cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock); + lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur); + } + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_grant_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_grant_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR"\n", + EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (ext == NULL) + ext = first_extent(obj); + while (ext != NULL) { + pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; + pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1) + break; + + /* if covering by different locks, no chance to match */ + if (olck->ols_dlmlock != ext->oe_dlmlock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR"\n", EXTPARA(cur)); + + ext = next_extent(ext); + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) { + ext = next_extent(ext); + continue; + } + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR"\n", EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. 
*/ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + ext = next_extent(ext); + continue; + } + + /* check if they belong to the same rpc slot before trying to + * merge. the extents are not overlapped and contiguous at + * chunk level to get here. */ + if (ext->oe_max_end != max_end) { + /* if they don't belong to the same RPC slot or + * max_pages_per_rpc has ever changed, do not merge. */ + ext = next_extent(ext); + continue; + } + + /* check whether maximum extent size will be hit */ + if ((ext_chk_end - ext_chk_start + 1 + 1) << ppc_bits > + cli->cl_max_extent_pages) { + ext = next_extent(ext); + continue; + } + + /* it's required that an extent must be contiguous at chunk + * level so that we know the whole extent is covered by grant + * (the pages in the extent are NOT required to be contiguous). + * Otherwise, it will be too much difficult to know which + * chunks have grants allocated. */ + + /* try to do front merge - extend ext's start */ + if (chunk + 1 == ext_chk_start) { + /* ext must be chunk size aligned */ + EASSERT((ext->oe_start & ~chunk_mask) == 0, ext); + + /* pull ext's start back to cover cur */ + ext->oe_start = cur->oe_start; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + + found = osc_extent_hold(ext); + } else if (chunk == ext_chk_end + 1) { + /* rear merge */ + ext->oe_end = cur->oe_end; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + + /* try to merge with the next one because we just fill + * in a gap */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_grant_extent_tax; + + found = osc_extent_hold(ext); + } + if (found != NULL) + break; + + ext = next_extent(ext); + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_dlmlock == cur->oe_dlmlock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + cur->oe_grants = chunksize + cli->cl_grant_extent_tax; + LASSERT(*grants >= cur->oe_grants); + *grants -= cur->oe_grants; + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, descr->cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) + GOTO(out, found = ERR_PTR(rc)); + + goto restart; + } + EXIT; + +out: + osc_extent_put(env, cur); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? 
: 4096; + loff_t last_off = 0; + int last_count = -1; + ENTRY; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + + osc_lru_add_batch(cli, &ext->oe_pages); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (blocksize < PAGE_SIZE && + last_count != PAGE_SIZE) { + /* For short writes we shouldn't count parts of pages that + * span a whole chunk on the OST side, or our accounting goes + * wrong. Should match the code in filter_grant_check. */ + int offset = last_off & ~PAGE_MASK; + int count = last_count + (offset & (blocksize - 1)); + int end = (offset + last_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + RETURN(0); +} + +static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state) +{ + int ret; + + osc_object_lock(ext->oe_obj); + ret = ext->oe_state == state; + osc_object_unlock(ext->oe_obj); + + return ret; +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + enum osc_extent_state state) +{ + struct osc_object *obj = ext->oe_obj; + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + int rc = 0; + ENTRY; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); + if (rc == -ETIMEDOUT) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %u timedout, recovery in progress?\n", + cli_name(osc_cli(obj)), state); + + lwi = LWI_INTR(NULL, NULL); + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), + &lwi); + } + if (rc == 0 && ext->oe_rc < 0) + rc = ext->oe_rc; + RETURN(rc); +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. 
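+ *
+ * In terms of the function arguments: pages beyond @trunc_index are always
+ * discarded, and the page at @trunc_index itself is kept only when @partial
+ * is true (the truncate point falls inside that page). Grants backing the
+ * chunks that become empty are handed back via osc_free_grant().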
+ */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - + PAGE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + __u16 refcheck; + ENTRY; + + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_state == OES_TRUNC); + LASSERT(!ext->oe_urgent); + + /* Request new lu_env. + * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &osc_env_info(env)->oti_io; + io->ci_obj = cl_object_top(osc2cl(obj)); + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + GOTO(out, rc); + + /* discard all pages with index greater than trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_page *page = oap2cl_page(oap); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... */ + if (index < trunc_index || + (index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. */ + if (index >> ppc_bits == trunc_chunk) + ++pages_in_chunk; + continue; + } + + list_del_init(&oap->oap_pending_item); + + cl_page_get(page); + lu_ref_add(&page->cp_reference, "truncate", current); + + if (cl_page_own(env, io, page) == 0) { + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + LASSERT(0); + } + + lu_ref_del(&page->cp_reference, "truncate", current); + cl_page_put(env, page); + + --ext->oe_nr_pages; + ++nr_pages; + } + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, + ext->oe_nr_pages == 0), + ext, "trunc_index %lu, partial %d\n", trunc_index, partial); + + osc_object_lock(obj); + if (ext->oe_nr_pages == 0) { + LASSERT(pages_in_chunk == 0); + grants = ext->oe_grants; + ext->oe_grants = 0; + } else { /* calculate how many grants we can free */ + int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; + pgoff_t last_index; + + + /* if there is no pages in this chunk, we can also free grants + * for the last chunk */ + if (pages_in_chunk == 0) { + /* if this is the 1st chunk and no pages in this chunk, + * ext->oe_nr_pages must be zero, so we should be in + * the other if-clause. */ + LASSERT(trunc_chunk > 0); + --trunc_chunk; + ++chunks; + } + + /* this is what we can free from this extent */ + grants = chunks << cli->cl_chunkbits; + ext->oe_grants -= grants; + last_index = ((trunc_chunk + 1) << ppc_bits) - 1; + ext->oe_end = min(last_index, ext->oe_max_end); + LASSERT(ext->oe_end >= ext->oe_start); + LASSERT(ext->oe_grants > 0); + } + osc_object_unlock(obj); + + if (grants > 0 || nr_pages > 0) + osc_free_grant(cli, nr_pages, grants, grants); + +out: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * This function is used to make the extent prepared for transfer. + * A race with flusing page - ll_writepage() has to be handled cautiously. 
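+ *
+ * Every page in the extent ends up flagged ASYNC_READY and
+ * ASYNC_COUNT_STABLE; only the last page has its transfer count refreshed
+ * against the current file size, since it may end short of a full page.
+ * The extent then moves to OES_RPC with an extra reference taken for the
+ * RPC.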
+ */ +static int osc_extent_make_ready(const struct lu_env *env, + struct osc_extent *ext) +{ + struct osc_async_page *oap; + struct osc_async_page *last = NULL; + struct osc_object *obj = ext->oe_obj; + unsigned int page_count = 0; + int rc; + ENTRY; + + /* we're going to grab page lock, so object lock must not be taken. */ + LASSERT(sanity_check(ext) == 0); + /* in locking state, any process should not touch this extent. */ + EASSERT(ext->oe_state == OES_LOCKING, ext); + EASSERT(ext->oe_owner != NULL, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); + + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + if (last == NULL || last->oap_obj_off < oap->oap_obj_off) + last = oap; + + /* checking ASYNC_READY is race safe */ + if ((oap->oap_async_flags & ASYNC_READY) != 0) + continue; + + rc = osc_make_ready(env, oap, OBD_BRW_WRITE); + switch (rc) { + case 0: + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY; + spin_unlock(&oap->oap_lock); + break; + case -EALREADY: + LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); + break; + default: + LASSERTF(0, "unknown return code: %d\n", rc); + } + } + + LASSERT(page_count == ext->oe_nr_pages); + LASSERT(last != NULL); + /* the last page is the only one we need to refresh its count by + * the size of file. */ + if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { + int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + LASSERT(last_oap_count > 0); + LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE); + last->oap_count = last_oap_count; + spin_lock(&last->oap_lock); + last->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&last->oap_lock); + } + + /* for the rest of pages, we don't need to call osf_refresh_count() + * because it's known they are not the last page */ + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { + oap->oap_count = PAGE_SIZE - oap->oap_page_off; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); + } + } + + osc_object_lock(obj); + osc_extent_state_set(ext, OES_RPC); + osc_object_unlock(obj); + /* get a refcount for RPC. */ + osc_extent_get(ext); + + RETURN(0); +} + +/** + * Quick and simple version of osc_extent_find(). This function is frequently + * called to expand the extent for the same IO. To expand the extent, the + * page index must be in the same or next chunk of ext->oe_end. 
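+ *
+ * Illustrative example, assuming 256 pages per chunk (4 KiB pages, 1 MiB
+ * chunks): an extent ending at page 511 (chunk 1) already covers any index
+ * up to 511, can grow to cover an index in chunk 2 (pages 512..767) at the
+ * cost of one more chunk of @grants, and returns -ERANGE for an index in
+ * chunk 3 or beyond. -EAGAIN is returned when growing would run into the
+ * next extent in the tree.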
+ */ +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, + unsigned int *grants) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *next; + int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t chunk = index >> ppc_bits; + pgoff_t end_chunk; + pgoff_t end_index; + unsigned int chunksize = 1 << cli->cl_chunkbits; + int rc = 0; + ENTRY; + + LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + end_chunk = ext->oe_end >> ppc_bits; + if (chunk > end_chunk + 1) + GOTO(out, rc = -ERANGE); + + if (end_chunk >= chunk) + GOTO(out, rc = 0); + + LASSERT(end_chunk + 1 == chunk); + + /* try to expand this extent to cover @index */ + end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + + /* don't go over the maximum extent size reported by server */ + if (end_index - ext->oe_start + 1 > cli->cl_max_extent_pages) + GOTO(out, rc = -ERANGE); + + next = next_extent(ext); + if (next != NULL && next->oe_start <= end_index) + /* complex mode - overlapped with the next extent, + * this case will be handled by osc_extent_find() */ + GOTO(out, rc = -EAGAIN); + + ext->oe_end = end_index; + ext->oe_grants += chunksize; + LASSERT(*grants >= chunksize); + *grants -= chunksize; + EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, + "overlapped after expanding for %lu.\n", index); + EXIT; + +out: + osc_object_unlock(obj); + RETURN(rc); +} + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line) +{ + struct osc_extent *ext; + int cnt; + + if (!cfs_cdebug_show(level, DEBUG_SUBSYSTEM)) + return; + + CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); + + /* osc_object_lock(obj); */ + cnt = 1; + for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) + OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + /* osc_object_unlock(obj); */ +} + +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) 
\ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + ENTRY; + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = cfs_time_current(); + RETURN(result); +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, index + 1) > kms) + /* catch sub-page write at end of file */ + return kms % PAGE_SIZE; + else + return PAGE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + enum cl_req_type crt; + int srvlock; + + ENTRY; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERT(opg->ops_transfer_pinned); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + opg->ops_submit_time = 0; + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; + size_t bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + cl_page_put(env, page); + + RETURN(0); +} + +#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) 
do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + "dropped: %ld avail: %ld, dirty_grant: %ld, " \ + "reserved: %ld, flight: %d } lru {in list: %ld, " \ + "left: %ld, waiters: %d }" fmt "\n", \ + cli_name(__tmp), \ + __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ + atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_dirty_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ + atomic_long_read(&__tmp->cl_lru_in_list), \ + atomic_long_read(&__tmp->cl_lru_busy), \ + atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + assert_spin_locked(&cli->cl_loi_list_lock); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + atomic_long_inc(&obd_dirty_pages); + cli->cl_dirty_pages++; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_SIZE, pga, pga->pg); + osc_update_next_shrink(cli); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + ENTRY; + + assert_spin_locked(&cli->cl_loi_list_lock); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + EXIT; + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_long_dec(&obd_dirty_pages); + cli->cl_dirty_pages--; + if (pga->flag & OBD_BRW_NOCACHE) { + pga->flag &= ~OBD_BRW_NOCACHE; + atomic_long_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit--; + } + EXIT; +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + cli->cl_dirty_grant -= unused - reserved; + } else { + cli->cl_avail_grant += unused; + cli->cl_dirty_grant += reserved - unused; + } +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. blocksize at OST is less than PAGE_SIZE and a partial page was + * written. In this case OST may use less chunks to serve this partial + * write. OSTs don't actually know the page size on the client side. 
so + * clients have to calculate lost grant by the blocksize on the OST. + * See filter_grant_check() for details. + */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant, unsigned int dirty_grant) +{ + unsigned long grant; + + grant = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + spin_lock(&cli->cl_loi_list_lock); + atomic_long_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty_pages -= nr_pages; + cli->cl_lost_grant += lost_grant; + cli->cl_dirty_grant -= dirty_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT, + cli->cl_dirty_grant); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + spin_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes, int transient) +{ + int rc; + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && + 1 + atomic_long_read(&obd_dirty_pages) <= obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit++; + atomic_long_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; + } + rc = 1; + } else { + __osc_unreserve_grant(cli, bytes, bytes); + rc = 0; + } + return rc; +} + +static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +{ + int rc; + spin_lock(&cli->cl_loi_list_lock); + rc = list_empty(&ocw->ocw_entry); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} + +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + struct osc_cache_waiter ocw; + struct l_wait_info lwi; + int rc = -EDQUOT; + ENTRY; + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); + + spin_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. 
this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max_pages == 0 || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n"); + GOTO(out, rc = -EDQUOT); + } + + /* Hopefully normal case - cache space and write credits available */ + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + GOTO(out, rc = 0); + } + + /* We can get here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. + * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition is no avail grants and no dirty pages caching, + * that really means there is no space on the OST. */ + init_waitqueue_head(&ocw.ocw_waitq); + ocw.ocw_oap = oap; + ocw.ocw_grant = bytes; + while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { + list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); + ocw.ocw_rc = 0; + spin_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug_async(env, cli, NULL); + + CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", + cli_name(cli), &ocw, oap); + + rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); + + spin_lock(&cli->cl_loi_list_lock); + + if (rc < 0) { + /* l_wait_event is interrupted by signal or timed out */ + list_del_init(&ocw.ocw_entry); + break; + } + LASSERT(list_empty(&ocw.ocw_entry)); + rc = ocw.ocw_rc; + + if (rc != -EDQUOT) + break; + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + rc = 0; + break; + } + } + + switch (rc) { + case 0: + OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); + break; + case -ETIMEDOUT: + OSC_DUMP_GRANT(D_CACHE, cli, + "timeout, fall back to sync i/o\n"); + osc_extent_tree_dump(D_CACHE, osc); + /* fall back to synchronous I/O */ + rc = -EDQUOT; + break; + case -EINTR: + /* Ensures restartability - LU-3581 */ + OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); + rc = -ERESTARTSYS; + break; + case -EDQUOT: + OSC_DUMP_GRANT(D_CACHE, cli, + "no grant space, fall back to sync i/o\n"); + break; + default: + CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " + "due to %d, fall back to sync i/o\n", + cli_name(cli), &ocw, rc); + break; + } + EXIT; +out: + spin_unlock(&cli->cl_loi_list_lock); + RETURN(rc); +} + +/* caller must hold loi_list_lock */ +void osc_wake_cache_waiters(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct osc_cache_waiter *ocw; + + ENTRY; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); + list_del_init(&ocw->ocw_entry); + + ocw->ocw_rc = -EDQUOT; + /* we can't dirty more */ + if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) || + (1 + atomic_long_read(&obd_dirty_pages) > + obd_max_dirty_pages)) { + CDEBUG(D_CACHE, "no dirty room: dirty: %ld " + "osc max %ld, sys max %ld\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages, + obd_max_dirty_pages); + goto wakeup; + } + + if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + ocw->ocw_rc = 0; +wakeup: + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", + ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } + + EXIT; +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + 
hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + ENTRY; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + RETURN(1); + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. */ + if (!list_empty(&cli->cl_cache_waiters)) { + CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_full_exts)) { + CDEBUG(D_CACHE, "full extent ready, make an RPC\n"); + RETURN(1); + } + } else { + if (atomic_read(&osc->oo_nr_reads) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + /* all read are urgent. */ + if (!list_empty(&osc->oo_reading_exts)) + RETURN(1); + } + + RETURN(0); +} + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta) +{ + struct client_obd *cli = osc_cli(obj); + if (cmd & OBD_BRW_WRITE) { + atomic_add(delta, &obj->oo_nr_writes); + atomic_add(delta, &cli->cl_pending_w_pages); + LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); + } else { + atomic_add(delta, &obj->oo_nr_reads); + atomic_add(delta, &cli->cl_pending_r_pages); + LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); + } + OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); +} + +static int osc_makes_hprpc(struct osc_object *obj) +{ + return !list_empty(&obj->oo_hp_exts); +} + +static void on_list(struct list_head *item, struct list_head *list, + int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + if (osc_makes_hprpc(osc)) { + /* HP rpc */ + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); + } else { + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, + osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || + osc_makes_rpc(cli, osc, OBD_BRW_READ)); + } + + on_list(&osc->oo_write_item, &cli->cl_loi_write_list, + atomic_read(&osc->oo_nr_writes) > 0); + + on_list(&osc->oo_read_item, &cli->cl_loi_read_list, + atomic_read(&osc->oo_nr_reads) > 0); + + return osc_is_ready(osc); +} + +static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + int 
is_ready; + + spin_lock(&cli->cl_loi_list_lock); + is_ready = __osc_list_maint(cli, osc); + spin_unlock(&cli->cl_loi_list_lock); + + return is_ready; +} + +/* this is trying to propogate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. */ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + ENTRY; + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + oap->oap_interrupted = 0; + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + spin_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + spin_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); + + EXIT; +} + +struct extent_rpc_data { + struct list_head *erd_rpc_list; + unsigned int erd_page_count; + unsigned int erd_max_pages; + unsigned int erd_max_chunks; + unsigned int erd_max_extents; +}; + +static inline unsigned osc_extent_chunks(const struct osc_extent *ext) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; + + return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; +} + +/** + * Try to add extent to one RPC. 
We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, + struct extent_rpc_data *data) +{ + struct osc_extent *tmp; + unsigned int chunk_count; + struct osc_async_page *oap = list_first_entry(&ext->oe_pages, + struct osc_async_page, + oap_pending_item); + ENTRY; + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + OSC_EXTENT_DUMP(D_CACHE, ext, "trying to add this extent\n"); + + if (data->erd_max_extents == 0) + RETURN(0); + + chunk_count = osc_extent_chunks(ext); + EASSERTF(data->erd_page_count != 0 || + chunk_count <= data->erd_max_chunks, ext, + "The first extent to be fit in a RPC contains %u chunks, " + "which is over the limit %u.\n", chunk_count, + data->erd_max_chunks); + if (chunk_count > data->erd_max_chunks) + RETURN(0); + + data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages); + EASSERTF(data->erd_page_count != 0 || + ext->oe_nr_pages <= data->erd_max_pages, ext, + "The first extent to be fit in a RPC contains %u pages, " + "which is over the limit %u.\n", ext->oe_nr_pages, + data->erd_max_pages); + if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages) + RETURN(0); + + list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { + struct osc_async_page *oap2; + oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, + oap_pending_item); + EASSERT(tmp->oe_owner == current, tmp); +#if 0 + if (overlapped(tmp, ext)) { + OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); + EASSERT(0, ext); + } +#endif + if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { + CDEBUG(D_CACHE, "Do not permit different types of IO " + "in one RPC\n"); + RETURN(0); + } + + if (tmp->oe_srvlock != ext->oe_srvlock || + !tmp->oe_grants != !ext->oe_grants || + tmp->oe_no_merge || ext->oe_no_merge) + RETURN(0); + + /* remove break for strict check */ + break; + } + + data->erd_max_extents--; + data->erd_max_chunks -= chunk_count; + data->erd_page_count += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, data->erd_rpc_list); + ext->oe_owner = current; + RETURN(1); +} + +static inline unsigned osc_max_write_chunks(const struct client_obd *cli) +{ + /* + * LU-8135: + * + * The maximum size of a single transaction is about 64MB in ZFS. + * #define DMU_MAX_ACCESS (64 * 1024 * 1024) + * + * Since ZFS is a copy-on-write file system, a single dirty page in + * a chunk will result in the rewrite of the whole chunk, therefore + * an RPC shouldn't be allowed to contain too many chunks otherwise + * it will make transaction size much bigger than 64MB, especially + * with big block size for ZFS. + * + * This piece of code is to make sure that OSC won't send write RPCs + * with too many chunks. The maximum chunk size that an RPC can cover + * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally + * OST should tell the client what the biggest transaction size is, + * but it's good enough for now. + * + * This limitation doesn't apply to ldiskfs, which allows as many + * chunks in one RPC as we want. However, it won't have any benefits + * to have too many discontiguous pages in one RPC. + * + * An osc_extent won't cover over a RPC size, so the chunks in an + * osc_extent won't bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits. 
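[Editor's illustration, not part of the patch] For concreteness, the budget described in this comment reduces to a single shift: with the 16MB PTLRPC_MAX_BRW_SIZE quoted above, the number of chunks allowed in one write RPC is that size shifted down by cl_chunkbits. A minimal sketch follows; the chunk-bit values in the loop are illustrative assumptions, not values taken from the code.

/* Standalone sketch of the chunk budget returned by osc_max_write_chunks(). */
#include <stdio.h>

#define PTLRPC_MAX_BRW_SIZE_SKETCH (16U << 20)	/* 16MB, per the comment */

static unsigned int max_write_chunks(unsigned int chunkbits)
{
	return PTLRPC_MAX_BRW_SIZE_SKETCH >> chunkbits;
}

int main(void)
{
	unsigned int bits;

	/* e.g. 64KiB chunks -> 256 chunks per write RPC, 1MiB chunks -> 16 */
	for (bits = 16; bits <= 20; bits += 2)
		printf("chunkbits=%u -> max %u chunks per RPC\n",
		       bits, max_write_chunks(bits));
	return 0;
}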
+ */ + return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. + */ +static unsigned int get_write_extents(struct osc_object *obj, + struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct extent_rpc_data data = { + .erd_rpc_list = rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = osc_max_write_chunks(cli), + .erd_max_extents = 256, + }; + + LASSERT(osc_object_is_locked(obj)); + while (!list_empty(&obj->oo_hp_exts)) { + ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, + oe_link); + LASSERT(ext->oe_state == OES_CACHE); + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + while (!list_empty(&obj->oo_urgent_exts)) { + ext = list_entry(obj->oo_urgent_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + /* One key difference between full extents and other extents: full + * extents can usually only be added if the rpclist was empty, so if we + * can't add one, we continue on to trying to add normal extents. This + * is so we don't miss adding extra extents to an RPC containing high + * priority or urgent extents. 
*/ + while (!list_empty(&obj->oo_full_exts)) { + ext = list_entry(obj->oo_full_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + } + if (data.erd_page_count == data.erd_max_pages) + return data.erd_page_count; + + ext = first_extent(obj); + while (ext != NULL) { + if ((ext->oe_state != OES_CACHE) || + /* this extent may be already in current rpclist */ + (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { + ext = next_extent(ext); + continue; + } + + if (!try_to_add_extent_for_io(cli, ext, &data)) + return data.erd_page_count; + + ext = next_extent(ext); + } + return data.erd_page_count; +} + +static int +osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + struct list_head rpclist = LIST_HEAD_INIT(rpclist); + struct osc_extent *ext; + struct osc_extent *tmp; + struct osc_extent *first = NULL; + unsigned int page_count = 0; + int srvlock = 0; + int rc = 0; + ENTRY; + + LASSERT(osc_object_is_locked(osc)); + + page_count = get_write_extents(osc, &rpclist); + LASSERT(equi(page_count == 0, list_empty(&rpclist))); + + if (list_empty(&rpclist)) + RETURN(0); + + osc_update_pending(osc, OBD_BRW_WRITE, -page_count); + + list_for_each_entry(ext, &rpclist, oe_link) { + LASSERT(ext->oe_state == OES_CACHE || + ext->oe_state == OES_LOCK_DONE); + if (ext->oe_state == OES_CACHE) + osc_extent_state_set(ext, OES_LOCKING); + else + osc_extent_state_set(ext, OES_RPC); + } + + /* we're going to grab page lock, so release object lock because + * lock order is page lock -> object lock. */ + osc_object_unlock(osc); + + list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { + if (ext->oe_state == OES_LOCKING) { + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + continue; + } + } + if (first == NULL) { + first = ext; + srvlock = ext->oe_srvlock; + } else { + LASSERT(srvlock == ext->oe_srvlock); + } + } + + if (!list_empty(&rpclist)) { + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE); + LASSERT(list_empty(&rpclist)); + } + + osc_object_lock(osc); + RETURN(rc); +} + +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cmd OBD_BRW_* macroses + * \param lop pending pages + * + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. 
+ */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +__must_hold(osc) +{ + struct osc_extent *ext; + struct osc_extent *next; + struct list_head rpclist = LIST_HEAD_INIT(rpclist); + struct extent_rpc_data data = { + .erd_rpc_list = &rpclist, + .erd_page_count = 0, + .erd_max_pages = cli->cl_max_pages_per_rpc, + .erd_max_chunks = UINT_MAX, + .erd_max_extents = UINT_MAX, + }; + int rc = 0; + ENTRY; + + LASSERT(osc_object_is_locked(osc)); + list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &data)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); + } + LASSERT(data.erd_page_count <= data.erd_max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + RETURN(rc); +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + ENTRY; + + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item)); + if (!list_empty(&cli->cl_loi_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item)); + + /* then if we have cache waiters, return all objects with queued + * writes. This is especially important when many small files + * have filled up the cache and not been fired into rpcs because + * they don't pass the nr_pending/object threshhold */ + if (!list_empty(&cli->cl_cache_waiters) && + !list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); + + /* then return all queued objects when we have an invalid import + * so that they get flushed */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { + if (!list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, + write_item)); + if (!list_empty(&cli->cl_loi_read_list)) + RETURN(list_to_obj(&cli->cl_loi_read_list, + read_item)); + } + RETURN(NULL); +} + +/* called with the loi list lock held */ +static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) +__must_hold(&cli->cl_loi_list_lock) +{ + struct osc_object *osc; + int rc = 0; + ENTRY; + + while ((osc = osc_next_obj(cli)) != NULL) { + struct cl_object *obj = osc2cl(osc); + struct lu_ref_link link; + + OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); + + if (osc_max_rpc_in_flight(cli, osc)) { + __osc_list_maint(cli, osc); + break; + } + + cl_object_get(obj); + spin_unlock(&cli->cl_loi_list_lock); + lu_object_ref_add_at(&obj->co_lu, &link, "check", current); + + /* attempt some read/write balancing by alternating between + * reads and writes in an object. The makes_rpc checks here + * would be redundant if we were getting read/write work items + * instead of objects. 
we don't want send_oap_rpc to drain a + * partial read pending queue when we're given this object to + * do io on writes while there are cache waiters */ + osc_object_lock(osc); + if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { + rc = osc_send_write_rpc(env, cli, osc); + if (rc < 0) { + CERROR("Write request failed with %d\n", rc); + + /* osc_send_write_rpc failed, mostly because of + * memory pressure. + * + * It can't break here, because if: + * - a page was submitted by osc_io_submit, so + * page locked; + * - no request in flight + * - no subsequent request + * The system will be in live-lock state, + * because there is no chance to call + * osc_io_unplug() and osc_check_rpcs() any + * more. pdflush can't help in this case, + * because it might be blocked at grabbing + * the page lock as we mentioned. + * + * Anyway, continue to drain pages. */ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, &link, "check", current); + cl_object_put(env, obj); + + spin_lock(&cli->cl_loi_list_lock); + } +} + +static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) +{ + int rc = 0; + + if (osc != NULL && osc_list_maint(cli, osc) == 0) + return 0; + + if (!async) { + spin_lock(&cli->cl_loi_list_lock); + osc_check_rpcs(env, cli); + spin_unlock(&cli->cl_loi_list_lock); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + return rc; +} + +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} + +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + ENTRY; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = page; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~PAGE_MASK)); + + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags = OBD_BRW_NOQUOTA; + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", + oap, page, oap->oap_obj_off); + RETURN(0); +} + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + pgoff_t index; + unsigned int tmp; + unsigned int grants = 0; + u32 brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + ENTRY; + + if (oap->oap_magic != OAP_MAGIC) + RETURN(-EINVAL); + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + RETURN(-EIO); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + RETURN(-EBUSY); + + /* Set the OBD_BRW_SRVLOCK before the page is queued. 
*/ + brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + brw_flags |= OBD_BRW_NOQUOTA; + cmd |= OBD_BRW_NOQUOTA; + } + + /* check if the file's owner/group is over quota */ + if (!(cmd & OBD_BRW_NOQUOTA)) { + struct cl_object *obj; + struct cl_attr *attr; + unsigned int qid[LL_MAXQUOTAS]; + + obj = cl_object_top(&osc->oo_cl); + attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + qid[USRQUOTA] = attr->cat_uid; + qid[GRPQUOTA] = attr->cat_gid; + qid[PRJQUOTA] = attr->cat_projid; + if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) + rc = -EDQUOT; + if (rc) + RETURN(rc); + } + + oap->oap_cmd = cmd; + oap->oap_page_off = ops->ops_from; + oap->oap_count = ops->ops_to - ops->ops_from; + /* No need to hold a lock here, + * since this page is not in any list yet. */ + oap->oap_async_flags = 0; + oap->oap_brw_flags = brw_flags; + + OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", + oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); + + index = osc_index(oap2osc(oap)); + + /* Add this page into extent by the following steps: + * 1. if there exists an active extent for this IO, mostly this page + * can be added to the active extent and sometimes we need to + * expand extent to accomodate this page; + * 2. otherwise, a new extent will be allocated. */ + + ext = oio->oi_active; + if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { + /* one chunk plus extent overhead must be enough to write this + * page */ + grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + if (ext->oe_end >= index) + grants = 0; + + /* it doesn't need any grant to dirty this page */ + spin_lock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants, 0); + spin_unlock(&cli->cl_loi_list_lock); + if (rc == 0) { /* try failed */ + grants = 0; + need_release = 1; + } else if (ext->oe_end < index) { + tmp = grants; + /* try to expand this extent */ + rc = osc_extent_expand(ext, index, &tmp); + if (rc < 0) { + need_release = 1; + /* don't free reserved grant */ + } else { + OSC_EXTENT_DUMP(D_CACHE, ext, + "expanded for %lu.\n", index); + osc_unreserve_grant(cli, grants, tmp); + grants = 0; + } + } + rc = 0; + } else if (ext != NULL) { + /* index is located outside of active extent */ + need_release = 1; + } + if (need_release) { + osc_extent_release(env, ext); + oio->oi_active = NULL; + ext = NULL; + } + + if (ext == NULL) { + tmp = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; + + /* try to find new extent to cover this page */ + LASSERT(oio->oi_active == NULL); + /* we may have allocated grant for this page if we failed + * to expand the previous active extent. */ + LASSERT(ergo(grants > 0, grants >= tmp)); + + rc = 0; + if (grants == 0) { + /* we haven't allocated grant for this page. 
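[Editor's illustration, not part of the patch] Before a page is queued, osc_queue_async_io() above sizes its grant request as one full chunk plus the per-extent tax, and asks for nothing when the page already falls inside the written part of the active extent. The sketch below restates that sizing rule; the chunk-bit and extent-tax values in main() are hypothetical examples, not values from the code.

/* Standalone sketch of the grant asked for per newly dirtied page. */
#include <stdio.h>

static unsigned long page_grant_bytes(unsigned int chunkbits,
				      unsigned long extent_tax,
				      int covered_by_active_extent)
{
	if (covered_by_active_extent)
		return 0;	/* no new grant needed to dirty this page */
	return (1UL << chunkbits) + extent_tax;
}

int main(void)
{
	/* e.g. 64KiB chunks with a hypothetical 4KiB extent tax */
	printf("new chunk: %lu bytes of grant\n",
	       page_grant_bytes(16, 4096, 0));
	printf("inside active extent: %lu bytes of grant\n",
	       page_grant_bytes(16, 4096, 1));
	return 0;
}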
*/ + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + } + RETURN(rc); +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + int rc = 0; + ENTRY; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, osc_index(oap2osc(oap))); + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + struct osc_extent *ext = NULL; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); + osc_object_unlock(obj); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + osc_index(oap2osc(oap))); + rc = -EBUSY; + } + if (ext != NULL) + osc_extent_put(env, ext); + } + RETURN(rc); +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = osc_index(ops); + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + case OES_ACTIVE: + /* The extent is active so we need to abort and let the caller + * re-dirty the page. If we continued on here, and we were the + * one making the extent active, we could deadlock waiting for + * the page writeback to clear but it won't because the extent + * is active and won't be written out. 
*/ + GOTO(out, rc = -EAGAIN); + default: + break; + } + + rc = cl_page_prep(env, io, cp, CRT_WRITE); + if (rc) + GOTO(out, rc); + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (memory_pressure_get()) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + EXIT; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +/** + * this is called when a sync waiter receives an interruption. Its job is to + * get the caller woken as soon as possible. If its page hasn't been put in an + * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as + * desiring interruption which will forcefully complete the rpc once the rpc + * has timed out. + */ +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_object *obj = oap->oap_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *found = NULL; + struct list_head *plist; + pgoff_t index = osc_index(ops); + int rc = -EBUSY; + int cmd; + ENTRY; + + LASSERT(!oap->oap_interrupted); + oap->oap_interrupted = 1; + + /* Find out the caching extent */ + osc_object_lock(obj); + if (oap->oap_cmd & OBD_BRW_WRITE) { + plist = &obj->oo_urgent_exts; + cmd = OBD_BRW_WRITE; + } else { + plist = &obj->oo_reading_exts; + cmd = OBD_BRW_READ; + } + list_for_each_entry(ext, plist, oe_link) { + if (ext->oe_start <= index && ext->oe_end >= index) { + LASSERT(ext->oe_state == OES_LOCK_DONE); + /* For OES_LOCK_DONE state extent, it has already held + * a refcount for RPC. */ + found = osc_extent_get(ext); + break; + } + } + if (found != NULL) { + list_del_init(&found->oe_link); + osc_update_pending(obj, cmd, -found->oe_nr_pages); + osc_object_unlock(obj); + + osc_extent_finish(env, found, 0, -EINTR); + osc_extent_put(env, found); + rc = 0; + } else { + osc_object_unlock(obj); + /* ok, it's been put in an rpc. 
only one oap gets a request + * reference */ + if (oap->oap_request != NULL) { + ptlrpc_mark_interrupted(oap->oap_request); + ptlrpcd_wake(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + } + + osc_list_maint(cli, obj); + RETURN(rc); +} + +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + bool can_merge = true; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + ENTRY; + + list_for_each_entry(oap, list, oap_pending_item) { + struct osc_page *opg = oap2osc_page(oap); + pgoff_t index = osc_index(opg); + + if (index > end) + end = index; + if (index < start) + start = index; + ++page_count; + mppr <<= (page_count > mppr); + + if (unlikely(opg->ops_from > 0 || opg->ops_to < PAGE_SIZE)) + can_merge = false; + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + struct osc_async_page *tmp; + + list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + RETURN(-ENOMEM); + } + + ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_sync = 1; + ext->oe_no_merge = !can_merge; + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_nr_pages = page_count; + ext->oe_mppr = mppr; + list_splice_init(list, &ext->oe_pages); + + osc_object_lock(obj); + /* Reuse the initial refcount for RPC, don't drop it */ + osc_extent_state_set(ext, OES_LOCK_DONE); + if (cmd & OBD_BRW_WRITE) { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + osc_update_pending(obj, OBD_BRW_WRITE, page_count); + } else { + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + osc_update_pending(obj, OBD_BRW_READ, page_count); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, cli, obj); + RETURN(0); +} + +/** + * Called by osc_io_setattr_start() to freeze and destroy covering extents. + */ +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *waiting = NULL; + pgoff_t index; + struct list_head list = LIST_HEAD_INIT(list); + int result = 0; + bool partial; + ENTRY; + + /* pages with index greater or equal to index will be truncated. */ + index = cl_index(osc2cl(obj), size); + partial = size > cl_offset(osc2cl(obj), index); + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + EASSERT(ext->oe_state != OES_TRUNC, ext); + + if (ext->oe_state > OES_CACHE || ext->oe_urgent) { + /* if ext is in urgent state, it means there must exist + * a page already having been flushed by write_page(). + * We have to wait for this extent because we can't + * truncate that page. 
*/ + OSC_EXTENT_DUMP(D_CACHE, ext, + "waiting for busy extent\n"); + waiting = osc_extent_get(ext); + break; + } + + OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); + + osc_extent_get(ext); + if (ext->oe_state == OES_ACTIVE) { + /* though we grab inode mutex for write path, but we + * release it before releasing extent(in osc_io_end()), + * so there is a race window that an extent is still + * in OES_ACTIVE when truncate starts. */ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + /* This extent could be on the full extents list, that's OK */ + EASSERT(!ext->oe_hp && !ext->oe_urgent, ext); + if (!list_empty(&ext->oe_link)) + list_move_tail(&ext->oe_link, &list); + else + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while (!list_empty(&list)) { + int rc; + + ext = list_entry(list.next, struct osc_extent, oe_link); + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. + * Only partial truncate can reach here, if @size is + * not zero, the caller should provide a valid @extp. */ + LASSERT(*extp == NULL); + *extp = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at %llu\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. */ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + RETURN(result); +} + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext) +{ + if (ext != NULL) { + struct osc_object *obj = ext->oe_obj; + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. 
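[Editor's illustration, not part of the patch] The boundary computed at the top of osc_cache_truncate_start() above is a page index plus a "partial" flag for a first page that is only partly cut off. The sketch below shows the same computation, assuming cl_index()/cl_offset() convert between byte offsets and page indices at PAGE_SIZE granularity (4096 is used here purely for illustration).

/* Standalone sketch of the truncate boundary (index, partial) computation. */
#include <stdio.h>
#include <stdbool.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_PAGE_SIZE  (1UL << SKETCH_PAGE_SHIFT)

int main(void)
{
	unsigned long long sizes[] = { 0, 4096, 10000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long long size = sizes[i];
		/* first page index affected by the truncate */
		unsigned long index = size >> SKETCH_PAGE_SHIFT;
		/* true when the first affected page is only partially cut */
		bool partial = size > (unsigned long long)index *
					SKETCH_PAGE_SIZE;

		printf("size=%llu -> truncate from page %lu%s\n",
		       size, index, partial ? " (partial first page)" : "");
	}
	return 0;
}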
+ * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. + * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. + */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + ENTRY; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + RETURN(result); +} + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. + */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + struct list_head discard_list = LIST_HEAD_INIT(discard_list); + bool unplug = false; + int result = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + /* the only discarder is lock cancelling, so + * [start, end] must contain this extent */ + EASSERT(ext->oe_start >= start && + ext->oe_max_end <= end, ext); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). 
*/ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); + RETURN(result); +} + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata) +{ + struct osc_page *ops; + void **pvec; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + bool tree_lock = true; + ENTRY; + + idx = start; + pvec = osc_env_info(env)->oti_pvec; + spin_lock(&osc->oo_tree_lock); + while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, + idx, OTI_PVEC_SIZE)) > 0) { + struct cl_page *page; + bool end_of_region = false; + + for (i = 0, j = 0; i < nr; ++i) { + ops = pvec[i]; + pvec[i] = NULL; + + idx = osc_index(ops); + if (idx > end) { + end_of_region = true; + break; + } + + page = ops->ops_cl.cpl_page; + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_state == CPS_FREEING) + continue; + + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = ops; + } + ++idx; + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&osc->oo_tree_lock); + tree_lock = false; + + for (i = 0; i < j; ++i) { + ops = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, ops, cbdata); + + page = ops->ops_cl.cpl_page; + lu_ref_del(&page->cp_reference, "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < OTI_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&osc->oo_tree_lock); + tree_lock = true; + } + if (tree_lock) + spin_unlock(&osc->oo_tree_lock); + RETURN(res); +} + +/** + * Check if page @page is covered by an extra lock or discard it. 
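[Editor's illustration, not part of the patch] osc_page_gang_lookup() above walks the page radix tree in batches of OTI_PVEC_SIZE entries, takes references under the tree lock, drops the lock, runs the callback, and resumes from the last index plus one until a short batch or the end of the range. The sketch below imitates only that batching pattern: a sorted array stands in for the radix tree, the pvec size is shrunk to 4, and the locking, page references and rescheduling are omitted.

/* Standalone sketch of the batched index walk used by the gang lookup. */
#include <stdio.h>

#define SKETCH_PVEC_SIZE 4	/* stands in for OTI_PVEC_SIZE (256) */

/* gather up to 'max' indices from the sorted array that are >= start */
static unsigned int gang_lookup(const unsigned long *tree, unsigned int nr,
				unsigned long start,
				unsigned long *pvec, unsigned int max)
{
	unsigned int i, n = 0;

	for (i = 0; i < nr && n < max; i++)
		if (tree[i] >= start)
			pvec[n++] = tree[i];
	return n;
}

int main(void)
{
	const unsigned long tree[] = { 1, 2, 3, 5, 8, 9, 10, 11, 13 };
	unsigned int tree_nr = sizeof(tree) / sizeof(tree[0]);
	unsigned long pvec[SKETCH_PVEC_SIZE];
	unsigned long idx = 0, end = 11;
	unsigned int nr, i;

	while ((nr = gang_lookup(tree, tree_nr, idx, pvec,
				 SKETCH_PVEC_SIZE)) > 0) {
		int end_of_region = 0;

		for (i = 0; i < nr; i++) {
			idx = pvec[i];
			if (idx > end) {
				end_of_region = 1;
				break;
			}
			printf("visit page %lu\n", idx);	/* the callback */
		}
		++idx;	/* the next lookup resumes after the last index seen */

		/* a short batch means the range is exhausted */
		if (nr < SKETCH_PVEC_SIZE || end_of_region)
			break;
	}
	return 0;
}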
+ */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = osc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK); + if (tmp != NULL) { + __u64 end = tmp->l_policy_data.l_extent.end; + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, oti_fn_index). This is safe + * because if tmp lock is canceled, it will discard + * these pages. */ + info->oti_fn_index = cl_index(osc2cl(osc), end + 1); + if (end == OBD_OBJECT_EOF) + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_page *page = ops->ops_cl.cpl_page; + + /* page is top page. */ + info->oti_next_index = osc_index(ops) + 1; + if (cl_page_own(env, io, page) == 0) { + if (!ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(page)))) + CL_PAGE_DEBUG(D_ERROR, env, page, + "discard dirty page?\n"); + + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int res; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? discard_cb : check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + do { + res = osc_page_gang_lookup(env, io, osc, + info->oti_next_index, end, cb, osc); + if (info->oti_next_index > end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} + + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h new file mode 100644 index 0000000000000..7e6cbc017dfde --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h @@ -0,0 +1,679 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef OSC_CL_INTERNAL_H +#define OSC_CL_INTERNAL_H + +#include +#include +/* osc_build_res_name() */ +#include +#include "osc_internal.h" + +/** \defgroup osc osc + * @{ + */ + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. */ + unsigned int oi_lockless:1, + /** true if this io is counted as active IO */ + oi_is_active:1; + /** how many LRU pages are reserved for this IO */ + unsigned long oi_lru_reserved; + + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + /** write osc_lock for this IO, used by osc_extent_find(). */ + struct osc_lock *oi_write_osclock; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 256 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + union ldlm_policy_data oti_policy; + struct cl_lock_descr oti_descr; + struct cl_attr oti_attr; + struct lustre_handle oti_handle; + struct cl_page_list oti_plist; + struct cl_io oti_io; + void *oti_pvec[OTI_PVEC_SIZE]; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t oti_next_index; + pgoff_t oti_fn_index; /* first non-overlapped index */ + struct cl_sync_io oti_anchor; + struct cl_req_attr oti_req_attr; + struct lu_buf oti_ladvise_buf; +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + cfs_time_t oo_contention_time; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; +#endif + /** + * used by the osc to keep track of what objects to build into rpcs. 
+ * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_full_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. */ + spinlock_t oo_lock; + + /** + * Radix tree for caching pages + */ + spinlock_t oo_tree_lock; + struct radix_tree_root oo_tree; + unsigned long oo_npages; + + /* Protect osc_lock this osc_object has */ + struct list_head oo_ol_list; + spinlock_t oo_ol_spin; + + /** number of active IOs of this object */ + atomic_t oo_nr_ios; + wait_queue_head_t oo_io_waitq; +}; + +static inline void osc_object_lock(struct osc_object *obj) +{ + spin_lock(&obj->oo_lock); +} + +static inline int osc_object_trylock(struct osc_object *obj) +{ + return spin_trylock(&obj->oo_lock); +} + +static inline void osc_object_unlock(struct osc_object *obj) +{ + spin_unlock(&obj->oo_lock); +} + +static inline int osc_object_is_locked(struct osc_object *obj) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + return spin_is_locked(&obj->oo_lock); +#else + /* + * It is not perfect to return true all the time. + * But since this function is only used for assertion + * and checking, it seems OK. + */ + return 1; +#endif +} + +/* + * Lock "micro-states" for osc layer. + */ +enum osc_lock_state { + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_CANCELLED +}; + +/** + * osc-private state of cl_lock. + * + * Interaction with DLM. + * + * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in + * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_dlmlock. + * + * This pointer is protected through a reference, acquired by + * osc_lock_upcall0(). Also, an additional reference is acquired by + * ldlm_lock_addref() call protecting the lock from cancellation, until + * osc_lock_unuse() releases it. + * + * Below is a description of how lock references are acquired and released + * inside of DLM. + * + * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) + * - ldlm_lock_create() + * - ldlm_lock_new(): initializes a lock with 2 references. One for + * the caller (released when reply from the server is received, or on + * error), and another for the hash table. + * - ldlm_lock_addref_internal(): protects the lock from cancellation. + * + * - When reply is received from the server (osc_enqueue_interpret()) + * - ldlm_cli_enqueue_fini() + * - LDLM_LOCK_PUT(): releases caller reference acquired by + * ldlm_lock_new(). + * - if (rc != 0) + * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). + * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). + * + * - When lock is being cancelled (ldlm_lock_cancel()) + * - ldlm_lock_destroy() + * - LDLM_LOCK_PUT(): releases hash-table reference acquired by + * ldlm_lock_new(). + * + * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called + * either when lock is cancelled (osc_lock_blocking()), or when locks is + * deleted without cancellation (e.g., from cl_locks_prune()). 
In the latter + * case ldlm lock remains in memory, and can be re-attached to osc_lock in the + * future. + */ +struct osc_lock { + struct cl_lock_slice ols_cl; + /** Internal lock to protect states, etc. */ + spinlock_t ols_lock; + /** Owner sleeps on this channel for state change */ + struct cl_sync_io *ols_owner; + /** waiting list for this lock to be cancelled */ + struct list_head ols_waiting_list; + /** wait entry of ols_waiting_list */ + struct list_head ols_wait_entry; + /** list entry for osc_object::oo_ol_list */ + struct list_head ols_nextlock_oscobj; + + /** underlying DLM lock */ + struct ldlm_lock *ols_dlmlock; + /** DLM flags with which osc_lock::ols_lock was enqueued */ + __u64 ols_flags; + /** osc_lock::ols_lock handle */ + struct lustre_handle ols_handle; + struct ldlm_enqueue_info ols_einfo; + enum osc_lock_state ols_state; + /** lock value block */ + struct ost_lvb ols_lvb; + + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as torerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is + * granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1; +}; + + +/** + * Page state private for osc layer. + */ +struct osc_page { + struct cl_page_slice ops_cl; + /** + * Page queues used by osc to detect when RPC can be formed. + */ + struct osc_async_page ops_oap; + /** + * An offset within page from which next transfer starts. This is used + * by cl_page_clip() to submit partial page transfers. + */ + int ops_from; + /** + * An offset within page at which next transfer ends. + * + * \see osc_page::ops_from. + */ + int ops_to; + /** + * Boolean, true iff page is under transfer. Used for sanity checking. + */ + unsigned ops_transfer_pinned:1, + /** + * in LRU? + */ + ops_in_lru:1, + /** + * Set if the page must be transferred with OBD_BRW_SRVLOCK. + */ + ops_srvlock:1, + /** + * If the page is in osc_object::oo_tree. + */ + ops_intree:1; + /** + * lru page list. See osc_lru_{del|use}() in osc_page.c for usage. + */ + struct list_head ops_lru; + /** + * Submit time - the time when the page is starting RPC. For debugging. 
+ */ + cfs_time_t ops_submit_time; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_extent_kmem; + +extern struct lu_device_type osc_device_type; +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int osc_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t ind); + +void osc_index2policy(union ldlm_policy_data *policy, + const struct cl_object *obj, pgoff_t start, pgoff_t end); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); + +void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags); +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + u32 async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, + __u64 size, struct osc_extent **extp); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc); +int lru_queue_work(const struct lu_env *env, void *data); + +void osc_object_set_contended (struct osc_object *obj); +void osc_object_clear_contended(struct osc_object *obj); +int osc_object_is_contended (struct osc_object *obj); + +int osc_lock_is_lockless (const struct osc_lock *olck); + +/***************************************************************************** + * + * Accessors. 
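+ *
+ * A recurring pattern in the code below is to take scratch state from
+ * the per-thread osc_thread_info rather than from the stack, e.g.
+ * (illustrative sketch only):
+ *
+ *	struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ *
+ * The cl_* <-> osc_* conversions rely on container_of0(), which acts
+ * like container_of() but passes NULL and IS_ERR() pointers through
+ * unchanged.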
+ * + */ + +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &osc_device_type); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + LINVRNT(osc_is_object(&obj->co_lu)); + return container_of0(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) +{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); + if (mode == CLM_READ) + return LCK_PR; + if (mode == CLM_WRITE) + return LCK_PW; + return LCK_GROUP; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); + if (mode == LCK_PR) + return CLM_READ; + if (mode == LCK_PW) + return CLM_WRITE; + return CLM_GROUP; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct osc_page, ops_cl); +} + +static inline struct osc_page *oap2osc(struct osc_async_page *oap) +{ + return container_of0(oap, struct osc_page, ops_oap); +} + +static inline pgoff_t osc_index(struct osc_page *opg) +{ + return opg->ops_cl.cpl_index; +} + +static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) +{ + return oap2osc(oap)->ops_cl.cpl_page; +} + +static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) +{ + return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); +} + +static inline struct osc_page * +osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) +{ + const struct cl_page_slice *slice; + + LASSERT(osc != NULL); + slice = cl_object_page_slice(&osc->oo_cl, page); + return cl2osc_page(slice); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +static inline int osc_io_srvlock(struct osc_io *oio) +{ + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); +} + +enum osc_extent_state { + OES_INV = 0, /** extent is just initialized or destroyed */ + OES_ACTIVE = 1, /** process is using this extent */ + OES_CACHE = 2, /** extent is ready for IO */ + OES_LOCKING = 3, /** locking page to prepare IO */ + 
OES_LOCK_DONE = 4, /** locking finished, ready to send */ + OES_RPC = 5, /** in RPC */ + OES_TRUNC = 6, /** being truncated */ + OES_STATE_MAX +}; + +/** + * osc_extent data to manage dirty pages. + * osc_extent has the following attributes: + * 1. all pages in the same must be in one RPC in write back; + * 2. # of pages must be less than max_pages_per_rpc - implied by 1; + * 3. must be covered by only 1 osc_lock; + * 4. exclusive. It's impossible to have overlapped osc_extent. + * + * The lifetime of an extent is from when the 1st page is dirtied to when + * all pages inside it are written out. + * + * LOCKING ORDER + * ============= + * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock) + */ +struct osc_extent { + /** red-black tree node */ + struct rb_node oe_node; + /** osc_object of this extent */ + struct osc_object *oe_obj; + /** refcount, removed from red-black tree if reaches zero. */ + atomic_t oe_refc; + /** busy if non-zero */ + atomic_t oe_users; + /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ + struct list_head oe_link; + /** state of this extent */ + enum osc_extent_state oe_state; + /** flags for this extent. */ + unsigned int oe_intree:1, + /** 0 is write, 1 is read */ + oe_rw:1, + /** sync extent, queued by osc_queue_sync_pages() */ + oe_sync:1, + /** set if this extent has partial, sync pages. + * Extents with partial page(s) can't merge with others in RPC */ + oe_no_merge:1, + oe_srvlock:1, + oe_memalloc:1, + /** an ACTIVE extent is going to be truncated, so when this extent + * is released, it will turn into TRUNC state instead of CACHE. */ + oe_trunc_pending:1, + /** this extent should be written asap and someone may wait for the + * write to finish. This bit is usually set along with urgent if + * the extent was CACHE state. + * fsync_wait extent can't be merged because new extent region may + * exceed fsync range. */ + oe_fsync_wait:1, + /** covering lock is being canceled */ + oe_hp:1, + /** this extent should be written back asap. set if one of pages is + * called by page WB daemon, or sync write or reading requests. */ + oe_urgent:1; + /** how many grants allocated for this extent. + * Grant allocated for this extent. There is no grant allocated + * for reading extents and sync write extents. */ + unsigned int oe_grants; + /** # of dirty pages in this extent */ + unsigned int oe_nr_pages; + /** list of pending oap pages. Pages in this list are NOT sorted. */ + struct list_head oe_pages; + /** Since an extent has to be written out in atomic, this is used to + * remember the next page need to be locked to write this extent out. + * Not used right now. + */ + struct osc_page *oe_next_page; + /** start and end index of this extent, include start and end + * themselves. Page offset here is the page index of osc_pages. + * oe_start is used as keyword for red-black tree. */ + pgoff_t oe_start; + pgoff_t oe_end; + /** maximum ending index of this extent, this is limited by + * max_pages_per_rpc, lock extent and chunk size. */ + pgoff_t oe_max_end; + /** waitqueue - for those who want to be notified if this extent's + * state has changed. */ + wait_queue_head_t oe_waitq; + /** lock covering this extent */ + struct ldlm_lock *oe_dlmlock; + /** terminator of this extent. Must be true if this extent is in IO. */ + struct task_struct *oe_owner; + /** return value of writeback. If somebody is waiting for this extent, + * this value can be known by outside world. 
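+ * A waiter typically sleeps on oe_waitq until the extent reaches the
+ * state it cares about and then reads oe_rc, along the lines of this
+ * illustrative sketch (the real wait helpers live in osc_cache.c):
+ *
+ *	wait_event(ext->oe_waitq, ext->oe_state == OES_INV);
+ *	rc = ext->oe_rc;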
*/ + int oe_rc; + /** max pages per rpc when this extent was created */ + unsigned int oe_mppr; +}; + +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); + +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard_pages); + +typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + struct osc_page *, void *); +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata); +/** @} osc */ + +#endif /* OSC_CL_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c new file mode 100644 index 0000000000000..c06a5deb339b7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -0,0 +1,246 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device, for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +struct kmem_cache *osc_object_kmem; +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof (struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof (struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof (struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof (struct osc_session) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof (struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Type conversions. 
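+ *
+ * (The osc_caches[] table above lists this layer's slab caches; it is
+ * expected to be handed to lu_kmem_init() when the OSC module loads
+ * and torn down with lu_kmem_fini() on unload, roughly:
+ *
+ *	rc = lu_kmem_init(osc_caches);
+ *	...
+ *	lu_kmem_fini(osc_caches);
+ *
+ * The registration code itself is not part of this hunk.)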
+ * + */ + +static struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + + lu_buf_free(&info->oti_ladvise_buf); + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + ENTRY; + RETURN(osc_process_config_base(d->ld_obd, cfg)); +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_cl_process_config, + .ldo_recovery_complete = NULL +}; + +static int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + RETURN(0); +} + +static struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return NULL; +} + +static struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *od = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; +} + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &osc_lu_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; + RETURN(d); +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = &osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git 
a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h new file mode 100644 index 0000000000000..24766263514a6 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +extern atomic_t osc_pool_req_count; +extern unsigned int osc_reqpool_maxreqcount; +extern struct ptlrpc_request_pool *osc_rq_pool; + +struct lu_env; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return (struct osc_async_page *)container_of(pga, struct osc_async_page, + oap_brw_page); +} + +struct osc_cache_waiter { + struct list_head ocw_entry; + wait_queue_head_t ocw_waitq; + struct osc_async_page *ocw_oap; + int ocw_grant; + int ocw_rc; +}; + +void osc_wake_cache_waiters(struct client_obd *cli); +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_update_next_shrink(struct client_obd *cli); + +/* + * cl integration. 
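+ *
+ * struct osc_async_page above is embedded in struct osc_page (see
+ * osc_cl_internal.h), so the RPC machinery and the cl_page layer share
+ * a single per-page state; oap2osc() and oap2cl_page() convert from
+ * the async page back to the osc/cl side.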
+ */ +#include + +extern struct ptlrpc_request_set *PTLRPCD_SET; + +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, int agl); + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, void *data, + struct lustre_handle *lockh, int unref); + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_punch_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); + +extern struct lu_kmem_descr osc_caches[]; + +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); + +int osc_cleanup(struct obd_device *obd); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_osc_obd_vars[]; +int lproc_osc_attach_seqstat(struct obd_device *dev); +#else +static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} +#endif + +extern struct lu_device_type osc_device_type; + +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || + rc == -EAGAIN || rc == -EINPROGRESS); +} + +static inline unsigned long rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_r_in_flight + cli->cl_w_in_flight; +} + +static inline char *cli_name(struct client_obd *cli) +{ + return cli->cl_import->imp_obd->obd_name; +} + +#ifndef min_t +#define min_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) +#endif + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. 
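+ * (osc_io_setattr_end() in osc_io.c currently increments
+ * os_lockless_truncates without taking that lock; see the
+ * "XXX: Need a lock" note there.)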
*/ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + int od_contention_time; + int od_lockless_truncate; +}; + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); +} + +extern struct kmem_cache *osc_quota_kmem; +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + u32 oqi_id; +}; + +struct osc_async_args { + struct obd_info *aa_oi; +}; + +int osc_quota_setup(struct obd_device *obd); +int osc_quota_cleanup(struct obd_device *obd); +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + u64 valid, u32 flags); +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +void osc_inc_unstable_pages(struct ptlrpc_request *req); +void osc_dec_unstable_pages(struct ptlrpc_request *req); +bool osc_over_unstable_soft_limit(struct client_obd *cli); +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = 1 << 0, + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = 1 << 1 +}; +struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags flags); +void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa); +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); + +/** osc shrink list to link all osc client obd */ +extern struct list_head osc_shrink_list; +/** spin lock to protect osc_shrink_list */ +extern spinlock_t osc_shrink_lock; +extern unsigned long osc_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc); +extern unsigned long osc_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc); + +#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c new file mode 100644 index 0000000000000..38fe2532829fd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -0,0 +1,1001 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + LINVRNT(oio == osc_env_io(env)); + return oio; +} + +/***************************************************************************** + * + * io operations. + * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +static void osc_read_ahead_release(const struct lu_env *env, + void *cbdata) +{ + struct ldlm_lock *dlmlock = cbdata; + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_decref(&lockh, LCK_PR); + LDLM_LOCK_PUT(dlmlock); +} + +static int osc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct ldlm_lock *dlmlock; + int result = -ENODATA; + ENTRY; + + dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock != NULL) { + LASSERT(dlmlock->l_ast_data == osc); + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end = cl_index(osc2cl(osc), + dlmlock->l_policy_data.l_extent.end); + ra->cra_release = osc_read_ahead_release; + ra->cra_cbdata = dlmlock; + result = 0; + } + + RETURN(result); +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +static int osc_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + struct list_head list = LIST_HEAD_INIT(list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + unsigned int queued = 0; + int result = 0; + int cmd; + int brw_flags; + unsigned int max_pages; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE|D_READA, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + + cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. 
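+ * page->cp_owner is the cl_io that currently owns this top-level
+ * page; per the NOTE above, no sub-page list is built here.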
*/ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page, osc); + oap = &opg->ops_oap; + LASSERT(osc == oap->oap_obj); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + result = cl_page_prep(env, io, page, crt); + if (result != 0) { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + continue; + } + + spin_lock(&oap->oap_lock); + oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); + + osc_page_submit(env, opg, crt, brw_flags); + list_add_tail(&oap->oap_pending_item, &list); + + if (page->cp_sync_io != NULL) + cl_page_list_move(qout, qin, page); + else /* async IO */ + cl_page_list_del(env, qin, page); + + if (++queued == max_pages) { + queued = 0; + result = osc_queue_sync_pages(env, osc, &list, cmd, + brw_flags); + if (result < 0) + break; + } + } + + if (queued > 0) + result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); + + /* Update c/mtime for sync write. LU-7310 */ + if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) { + struct cl_object *obj = ios->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + } + + CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 0 : result; +} + +/** + * This is called when a page is accessed within file in a way that creates + * new page, if one were missing (i.e., if there were a hole at that place in + * the file, or accessed page is beyond the current file size). + * + * Expand stripe KMS if necessary. + */ +static void osc_page_touch_at(const struct lu_env *env, + struct cl_object *obj, pgoff_t idx, size_t to) +{ + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; + + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); + /* + * XXX old code used + * + * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); + * + * here + */ + CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", + kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); + + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + valid = CAT_MTIME | CAT_CTIME; + if (kms > loi->loi_kms) { + attr->cat_kms = kms; + valid |= CAT_KMS; + } + if (kms > loi->loi_lvb.lvb_size) { + attr->cat_size = kms; + valid |= CAT_SIZE; + } + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); +} + +static int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb) +{ + struct cl_io *io = ios->cis_io; + struct osc_io *oio = cl2osc_io(env, ios); + struct osc_object *osc = cl2osc(ios->cis_obj); + struct cl_page *page; + struct cl_page *last_page; + struct osc_page *opg; + int result = 0; + ENTRY; + + LASSERT(qin->pl_nr > 0); + + /* Handle partial page cases */ + last_page = cl_page_list_last(qin); + if (oio->oi_lockless) { + page = cl_page_list_first(qin); + if (page == last_page) { + cl_page_clip(env, page, from, to); + } else { + if (from != 0) + cl_page_clip(env, page, from, PAGE_SIZE); + if (to != PAGE_SIZE) + cl_page_clip(env, last_page, 0, to); + } + } + + while (qin->pl_nr > 0) { + struct osc_async_page *oap; + + page = cl_page_list_first(qin); + opg = osc_cl_page_osc(page, osc); + oap = &opg->ops_oap; + + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + /* The page may be already in dirty cache. */ + if (list_empty(&oap->oap_pending_item)) { + result = osc_page_cache_add(env, &opg->ops_cl, io); + if (result != 0) + break; + } + + osc_page_touch_at(env, osc2cl(osc), osc_index(opg), + page == last_page ? to : PAGE_SIZE); + + cl_page_list_del(env, qin, page); + + (*cb)(env, io, page); + /* Can't access page any more. Page can be in transfer and + * complete at any time. */ + } + + /* for sync write, kernel will wait for this page to be flushed before + * osc_io_end() is called, so release it earlier. + * for mkwrite(), it's known there is no further pages. 
*/ + if (cl_io_is_sync_write(io) && oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + + CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); + RETURN(result); +} + +static int osc_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct obd_import *imp = osc_cli(osc)->cl_import; + int rc = -EIO; + + spin_lock(&imp->imp_lock); + if (likely(!imp->imp_invalid)) { + struct osc_io *oio = osc_env_io(env); + + atomic_inc(&osc->oo_nr_ios); + oio->oi_is_active = 1; + rc = 0; + } + spin_unlock(&imp->imp_lock); + + return rc; +} + +static int osc_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(ios->cis_obj); + unsigned long npages; + ENTRY; + + if (cl_io_is_append(io)) + RETURN(osc_io_iter_init(env, ios)); + + npages = io->u.ci_rw.rw_range.cir_count >> PAGE_SHIFT; + if (io->u.ci_rw.rw_range.cir_pos & ~PAGE_MASK) + ++npages; + + oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); + + RETURN(osc_io_iter_init(env, ios)); +} + +static void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + + if (oio->oi_is_active) { + struct osc_object *osc = cl2osc(ios->cis_obj); + + oio->oi_is_active = 0; + LASSERT(atomic_read(&osc->oo_nr_ios) > 0); + if (atomic_dec_and_test(&osc->oo_nr_ios)) + wake_up_all(&osc->oo_io_waitq); + } +} + +static void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(ios->cis_obj); + + if (oio->oi_lru_reserved > 0) { + osc_lru_unreserve(osc_cli(osc), oio->oi_lru_reserved); + oio->oi_lru_reserved = 0; + } + oio->oi_write_osclock = NULL; + + osc_io_iter_fini(env, ios); +} + +static int osc_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + ENTRY; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %zu\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + RETURN(0); +} + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. 
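+ * Used as the osc_page_gang_cbt callback from osc_trunc_check() below;
+ * it only complains and returns CLP_GANG_OKAY so that the gang lookup
+ * keeps walking the remaining pages.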
+ */ +static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops , void *cbdata) +{ + struct cl_page *page = ops->ops_cl.cpl_page; + struct osc_async_page *oap; + __u64 start = *(__u64 *)cbdata; + + oap = &ops->ops_oap; + if (oap->oap_cmd & OBD_BRW_WRITE && + !list_empty(&oap->oap_pending_item)) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", + start, current->comm); + + if (PageLocked(page->cp_vmpage)) + CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", + ops, osc_index(ops), oap->oap_cmd & OBD_BRW_RWMASK); + + return CLP_GANG_OKAY; +} + +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, __u64 size) +{ + struct cl_object *clob; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + + /* + * Complain if there are pages in the truncated region. + */ + osc_page_gang_lookup(env, io, cl2osc(clob), + start + partial, CL_PAGE_EOF, + trunc_check_cb, (void *)&size); +} + +static int osc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_valid = io->u.ci_setattr.sa_valid; + int result = 0; + ENTRY; + + /* truncate cache dirty pages first */ + if (cl_io_is_trunc(io)) + result = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + + if (result == 0 && oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_valid & ATTR_SIZE) { + attr->cat_size = attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_valid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_valid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_valid & ATTR_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + result = cl_object_attr_update(env, obj, attr, + cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); + oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; + oa->o_layout = io->u.ci_setattr.sa_layout; + oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | + OBD_MD_FLOSTLAYOUT; + if (ia_valid & ATTR_CTIME) { + oa->o_valid |= OBD_MD_FLCTIME; + oa->o_ctime = attr->cat_ctime; + } + if (ia_valid & ATTR_ATIME) { + oa->o_valid |= OBD_MD_FLATIME; + oa->o_atime = attr->cat_atime; + } + if (ia_valid & ATTR_MTIME) { + oa->o_valid |= OBD_MD_FLMTIME; + oa->o_mtime = attr->cat_mtime; + } + if (ia_valid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + if (ia_valid & ATTR_ATTR_FLAG) { + oa->o_flags = io->u.ci_setattr.sa_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + init_completion(&cbargs->opc_sync); + + if (ia_valid & 
ATTR_SIZE) + result = osc_punch_base(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, PTLRPCD_SET); + else + result = osc_setattr_async(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, PTLRPCD_SET); + + cbargs->opc_rpc_sent = result == 0; + } + + RETURN(result); +} + +static void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. */ + osd->od_stats.os_lockless_truncates++; + } + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + osc_trunc_check(env, io, oio, size); + osc_cache_truncate_end(env, oio->oi_trunc); + oio->oi_trunc = NULL; + } +} + +struct osc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_data_version_args *dva = arg; + struct osc_io *oio = dva->dva_oio; + const struct ost_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa, + &body->oa); + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + + return 0; +} + +static int osc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct lov_oinfo *loi = obj->oo_oinfo; + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_data_version_args *dva; + int rc; + + ENTRY; + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + oa->o_flags |= OBD_FL_FLUSH; + } + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_data_version_interpret; + CLASSERT(sizeof(*dva) <= sizeof(req->rq_async_args)); + dva = ptlrpc_req_async_args(req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void osc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = 
&oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) { + slice->cis_io->ci_result = -EOPNOTSUPP; + } else { + dv->dv_data_version = oio->oi_oa.o_data_version; + slice->cis_io->ci_result = 0; + } + + EXIT; +} + +static int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + if (!slice->cis_io->ci_noatime) { + cl_object_attr_lock(obj); + attr->cat_atime = ktime_get_real_seconds(); + rc = cl_object_attr_update(env, obj, attr, CAT_ATIME); + cl_object_attr_unlock(obj); + } + + RETURN(rc); +} + +static int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int rc = 0; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); + cl_object_attr_lock(obj); + attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); + rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); + cl_object_attr_unlock(obj); + + RETURN(rc); +} + +static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) +{ + struct osc_io *oio = osc_env_io(env); + struct obdo *oa = &oio->oi_oa; + struct lov_oinfo *loi = obj->oo_oinfo; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + ENTRY; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + /* reload size abd blocks for start and end of sync range */ + oa->o_size = fio->fi_start; + oa->o_blocks = fio->fi_end; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + obdo_set_parent_fid(oa, fio->fi_fid); + + init_completion(&cbargs->opc_sync); + + rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); + RETURN(rc); +} + +static int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + ENTRY; + + if (fio->fi_end == OBD_OBJECT_EOF) + end = CL_PAGE_EOF; + + result = osc_cache_writeback_range(env, osc, start, end, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + /* we have to wait for writeback to finish before we can + * send OST_SYNC RPC. This is bad because it causes extents + * to be written osc by osc. However, we usually start + * writeback before CL_FSYNC_ALL so this won't have any real + * problem. 
*/ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +static void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +static int osc_io_ladvise_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + int result = 0; + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct lu_ladvise *ladvise; + struct ladvise_hdr *ladvise_hdr; + int buf_size; + int num_advise = 1; + ENTRY; + + /* TODO: add multiple ladvise support in CLIO */ + buf_size = offsetof(typeof(*ladvise_hdr), lah_advise[num_advise]); + if (osc_env_info(env)->oti_ladvise_buf.lb_len < buf_size) + lu_buf_realloc(&osc_env_info(env)->oti_ladvise_buf, buf_size); + + ladvise_hdr = osc_env_info(env)->oti_ladvise_buf.lb_buf; + if (ladvise_hdr == NULL) + RETURN(-ENOMEM); + + memset(ladvise_hdr, 0, buf_size); + ladvise_hdr->lah_magic = LADVISE_MAGIC; + ladvise_hdr->lah_count = num_advise; + ladvise_hdr->lah_flags = lio->li_flags; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID; + obdo_set_parent_fid(oa, lio->li_fid); + + ladvise = ladvise_hdr->lah_advise; + ladvise->lla_start = lio->li_start; + ladvise->lla_end = lio->li_end; + ladvise->lla_advice = lio->li_advice; + + if (lio->li_flags & LF_ASYNC) { + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, NULL, NULL, NULL); + } else { + init_completion(&cbargs->opc_sync); + result = osc_ladvise_base(osc_export(cl2osc(obj)), oa, + ladvise_hdr, osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + RETURN(result); +} + +static void osc_io_ladvise_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + struct cl_ladvise_io *lio = &io->u.ci_ladvise; + + if ((!(lio->li_flags & LF_ASYNC)) && cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +static void osc_io_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_write_iter_init, + 
.cio_iter_fini = osc_io_write_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_DATA_VERSION] = { + .cio_start = osc_io_data_version_start, + .cio_end = osc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_LADVISE] = { + .cio_start = osc_io_ladvise_start, + .cio_end = osc_io_ladvise_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .cio_read_ahead = osc_io_read_ahead, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c new file mode 100644 index 0000000000000..6d53b5b80c580 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -0,0 +1,1249 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include +/* fid_build_reg_res_name() */ +#include + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. + * + */ + +static const struct cl_lock_operations osc_lock_ops; +static const struct cl_lock_operations osc_lock_lockless_ops; +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force); + +int osc_lock_is_lockless(const struct osc_lock *olck) +{ + return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); +} + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. 
Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_dlmlock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + if (ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_dlmlock == NULL)) + return 1; + + /* + * If all the following "ergo"s are true, return 1, otherwise 0 + */ + if (! ergo(olock != NULL, handle_used)) + return 0; + + if (! ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie)) + return 0; + + if (! ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL))) + return 0; + /* + * Check that ->ols_handle and ->ols_dlmlock are consistent, but + * take into account that they are set at the different time. + */ + if (! ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used)) + return 0; + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + if (! ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + !ldlm_is_destroyed(olock))) + return 0; + + if (! ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + olock->l_req_mode == olock->l_granted_mode && + ols->ols_hold)) + return 0; + return 1; +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +static void osc_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + LASSERT(ols->ols_dlmlock == NULL); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + union ldlm_policy_data *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +static __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_ASYNC) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + return result; +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. 
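+ *
+ * KMS is only ever extended up to the end of the granted extent plus one
+ * byte, and is never moved backwards here.  As an illustration (example
+ * numbers only): with a granted extent of [0, 1048575] and an lvb_size of
+ * 4 MiB, KMS is capped at 1048576; with an lvb_size of 64 KiB the smaller
+ * value is used instead.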
+ */ +static void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid; + + ENTRY; + + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = dlmlock->l_lvb_data; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + + LASSERT(lvb == dlmlock->l_lvb_data); + size = lvb->lvb_size; + + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu" + ", kms=%llu", lvb->lvb_size, size); + valid |= CAT_KMS; + attr->cat_kms = size; + } else { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=" + "%llu; leaving kms=%llu, end=%llu", + lvb->lvb_size, oinfo->loi_kms, + dlmlock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match_locked(dlmlock); + } + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + EXIT; +} + +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh, bool lvb_update) +{ + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + + /* no lvb update for matched lock */ + if (lvb_update) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj), + dlmlock, NULL); + } + LINVRNT(osc_lock_invariant(oscl)); + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. 
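+ *
+ * Some enqueue errors are tolerated here: -EUSERS on a locklessable lock
+ * turns it into a lockless lock, and -ENAVAIL on a glimpse lock is hidden
+ * after updating the attributes from ols_lvb.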
+ */ +static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + if (rc == 0) + osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops == &osc_lock_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, oscl, 1); + oscl->ols_state = OLS_GRANTED; + rc = 0; + } else if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + osc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_object *osc = cookie; + struct ldlm_lock *dlmlock; + struct lu_env *env; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + + if (errcode == ELDLM_LOCK_MATCHED) + GOTO(out, errcode = ELDLM_OK); + + if (errcode != ELDLM_OK) + GOTO(out, errcode); + + dlmlock = ldlm_handle2lock(lockh); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + + /* there is no osc_lock associated with AGL lock */ + osc_lock_lvb_update(env, osc, dlmlock, NULL); + + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + +out: + cl_object_put(env, osc2cl(osc)); + cl_env_put(env, &refcheck); + RETURN(ldlm_error2errno(errcode)); +} + +static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, + enum cl_lock_mode mode, bool discard) +{ + struct lu_env *env; + __u16 refcheck; + int rc = 0; + int rc2 = 0; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (mode == CLM_WRITE) { + rc = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, rc, + discard ? "discarded" : "written back"); + if (rc > 0) + rc = 0; + } + + rc2 = osc_lock_discard_pages(env, obj, start, end, discard); + if (rc == 0 && rc2 < 0) + rc = rc2; + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. 
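+ *
+ * Only called with LDLM_CB_CANCELING: pages covered by the cancelled
+ * extent are written back or discarded via osc_lock_flush(), and the
+ * object's KMS is recomputed from the remaining granted locks.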
+ */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + dlmlock->l_ast_data = NULL; + + cl_object_get(obj); + } + + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + /* Destroy pages covered by the extent of the DLM lock */ + result = osc_lock_flush(cl2osc(obj), + cl_index(obj, extent->start), + cl_index(obj, extent->end), + mode, discard); + + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid race. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + + cl_object_put(env, obj); + } + RETURN(result); +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueuing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). 
+ * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + int result = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + result = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (result == -ENODATA) + result = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + result = PTR_ERR(env); + break; + } + + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(result); +} + +static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + struct cl_object *obj = NULL; + int result; + __u16 refcheck; + + ENTRY; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + + lock_res_and_lock(dlmlock); + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + + if (obj != NULL) { + /* Do not grab the mutex of cl_lock for glimpse. + * See LU-1274 for details. + * BTW, it's okay for cl_lock to be cancelled during + * this period because server can handle this race. + * See ldlm_server_glimpse_ast() for details. 
+ * cl_lock_mutex_get(env, lock); */ + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + result = cl_object_glimpse(env, obj, lvb); + } + if (!exp_connect_lvb_type(req->rq_export)) + req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, + sizeof(struct ost_lvb_v1), RCL_SERVER); + cl_object_put(env, obj); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_put(env, &refcheck); + EXIT; + +out: + req->rq_status = result; + RETURN(result); +} + +static int weigh_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct cl_page *page = ops->ops_cl.cpl_page; + + if (cl_page_is_vmlocked(env, page) + || PageDirty(page->cp_vmpage) || PageWriteback(page->cp_vmpage) + ) + return CLP_GANG_ABORT; + + *(pgoff_t *)cbdata = osc_index(ops) + 1; + return CLP_GANG_OKAY; +} + +static unsigned long osc_lock_weight(const struct lu_env *env, + struct osc_object *oscobj, + struct ldlm_extent *extent) +{ + struct cl_io *io = &osc_env_info(env)->oti_io; + struct cl_object *obj = cl_object_top(&oscobj->oo_cl); + pgoff_t page_index; + int result; + ENTRY; + + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + RETURN(result); + + page_index = cl_index(obj, extent->start); + do { + result = osc_page_gang_lookup(env, io, oscobj, + page_index, + cl_index(obj, extent->end), + weigh_cb, (void *)&page_index); + if (result == CLP_GANG_ABORT) + break; + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + cl_io_fini(env, io); + + return result == CLP_GANG_ABORT ? 1 : 0; +} + +/** + * Get the weight of dlm lock for early cancellation. + */ +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) +{ + struct lu_env *env; + struct osc_object *obj; + struct osc_lock *oscl; + unsigned long weight; + bool found = false; + __u16 refcheck; + ENTRY; + + might_sleep(); + /* + * osc_ldlm_weigh_ast has a complex context since it might be called + * because of lock canceling, or from user's input. We have to make + * a new environment for it. Probably it is implementation safe to use + * the upper context because cl_lock_put don't modify environment + * variables. But just in case .. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + /* Mostly because lack of memory, do not eliminate this lock */ + RETURN(1); + + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); + lock_res_and_lock(dlmlock); + obj = dlmlock->l_ast_data; + if (obj) + cl_object_get(osc2cl(obj)); + unlock_res_and_lock(dlmlock); + + if (obj == NULL) + GOTO(out, weight = 1); + + spin_lock(&obj->oo_ol_spin); + list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { + if (oscl->ols_dlmlock != NULL && oscl->ols_dlmlock != dlmlock) + continue; + found = true; + } + spin_unlock(&obj->oo_ol_spin); + if (found) { + /* + * If the lock is being used by an IO, definitely not cancel it. 
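+		 * A non-zero weight tells ldlm to keep the lock for now; only
+		 * a zero weight marks it as a cheap candidate for early
+		 * cancellation.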
+ */ + GOTO(out, weight = 1); + } + + weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent); + EXIT; + +out: + if (obj) + cl_object_put(env, osc2cl(obj)); + + cl_env_put(env, &refcheck); + return weight; +} + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ +} + +/** + * Determine if the lock should be converted into a lockless lock. + * + * Steps to check: + * - if the lock has an explicite requirment for a non-lockless lock; + * - if the io lock request type ci_lockreq; + * - send the enqueue rpc to ost to make the further decision; + * - special treat to truncate lockless lock + * + * Additional policy can be implemented here, e.g., never do lockless-io + * for large extents. + */ +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) +{ + struct cl_lock_slice *slice = &ols->ols_cl; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } else { + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && + osd->od_lockless_truncate)) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); +} + +static bool osc_lock_compatible(const struct osc_lock *qing, + const struct osc_lock *qed) +{ + struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr; + struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; + + if (qed->ols_glimpse) + return true; + + if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) + return true; + + if (qed->ols_state < OLS_GRANTED) + return true; + + if (qed_descr->cld_mode >= qing_descr->cld_mode && + qed_descr->cld_start <= qing_descr->cld_start && + qed_descr->cld_end >= qing_descr->cld_end) + return true; + + return false; +} + +static void osc_lock_wake_waiters(const struct lu_env *env, + struct osc_object *osc, + struct osc_lock *oscl) +{ + spin_lock(&osc->oo_ol_spin); + list_del_init(&oscl->ols_nextlock_oscobj); + spin_unlock(&osc->oo_ol_spin); + + spin_lock(&oscl->ols_lock); + while (!list_empty(&oscl->ols_waiting_list)) { + struct osc_lock *scan; + + scan = list_entry(oscl->ols_waiting_list.next, struct osc_lock, + ols_wait_entry); + list_del_init(&scan->ols_wait_entry); + + cl_sync_io_note(env, scan->ols_owner, 0); + } + 
spin_unlock(&oscl->ols_lock); +} + +static int osc_lock_enqueue_wait(const struct lu_env *env, + struct osc_object *obj, struct osc_lock *oscl) +{ + struct osc_lock *tmp_oscl; + struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; + struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; + int rc = 0; + ENTRY; + + spin_lock(&obj->oo_ol_spin); + list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list); + +restart: + list_for_each_entry(tmp_oscl, &obj->oo_ol_list, + ols_nextlock_oscobj) { + struct cl_lock_descr *descr; + + if (tmp_oscl == oscl) + break; + + descr = &tmp_oscl->ols_cl.cls_lock->cll_descr; + if (descr->cld_start > need->cld_end || + descr->cld_end < need->cld_start) + continue; + + /* We're not supposed to give up group lock */ + if (descr->cld_mode == CLM_GROUP) + break; + + if (!osc_lock_is_lockless(oscl) && + osc_lock_compatible(oscl, tmp_oscl)) + continue; + + /* wait for conflicting lock to be canceled */ + cl_sync_io_init(waiter, 1, cl_sync_io_end); + oscl->ols_owner = waiter; + + spin_lock(&tmp_oscl->ols_lock); + /* add oscl into tmp's ols_waiting list */ + list_add_tail(&oscl->ols_wait_entry, + &tmp_oscl->ols_waiting_list); + spin_unlock(&tmp_oscl->ols_lock); + + spin_unlock(&obj->oo_ol_spin); + rc = cl_sync_io_wait(env, waiter, 0); + spin_lock(&obj->oo_ol_spin); + + if (rc < 0) + break; + + oscl->ols_owner = NULL; + goto restart; + } + spin_unlock(&obj->oo_ol_spin); + + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = osc_lock_upcall; + void *cookie = oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_agl, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. 
*/ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * if glimpse or AGL lock, async of osc_enqueue_base() must be true, + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. + */ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + if (oscl->ols_agl) { + oscl->ols_einfo.ei_cbdata = NULL; + /* hold a reference for callback */ + cl_object_get(osc2cl(osc)); + upcall = osc_lock_upcall_agl; + cookie = osc; + } + result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags, + policy, &oscl->ols_lvb, + osc->oo_oinfo->loi_kms_valid, + upcall, cookie, + &oscl->ols_einfo, PTLRPCD_SET, async, + oscl->ols_agl); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } else if (oscl->ols_agl) { + cl_object_put(env, osc2cl(osc)); + result = 0; + } + +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +/** + * Breaks a link between osc_lock and dlm_lock. + */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = olck->ols_dlmlock; + if (dlmlock == NULL) + RETURN_EXIT; + + if (olck->ols_hold) { + olck->ols_hold = 0; + ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_handle.cookie = 0ULL; + } + + olck->ols_dlmlock = NULL; + + /* release a reference taken in osc_lock_upcall(). */ + LASSERT(olck->ols_has_ref); + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); + olck->ols_has_ref = 0; + + EXIT; +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). 
+ */ +static void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + + ENTRY; + + LINVRNT(osc_lock_invariant(oscl)); + + osc_lock_detach(env, oscl); + oscl->ols_state = OLS_CANCELLED; + oscl->ols_flags &= ~LDLM_FL_LVB_READY; + + osc_lock_wake_waiters(env, obj, oscl); + EXIT; +} + +static int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + (*p)(env, cookie, "%p %#llx %#llx %d %p ", + lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int result; + + LASSERT(ols->ols_dlmlock == NULL); + result = osc_lock_flush(osc, descr->cld_start, descr->cld_end, + descr->cld_mode, false); + if (result) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, result); + + osc_lock_wake_waiters(env, osc, ols); +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +static void osc_lock_set_writer(const struct lu_env *env, + const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl) +{ + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + pgoff_t io_start; + pgoff_t io_end; + + if (!cl_object_same(io->ci_obj, obj)) + return; + + if (likely(io->ci_type == CIT_WRITE)) { + io_start = cl_index(obj, io->u.ci_rw.rw_range.cir_pos); + io_end = cl_index(obj, io->u.ci_rw.rw_range.cir_pos + + io->u.ci_rw.rw_range.cir_count - 1); + } else { + LASSERT(cl_io_is_mkwrite(io)); + io_start = io_end = io->u.ci_fault.ft_index; + } + + if (descr->cld_mode >= CLM_WRITE && + (cl_io_is_append(io) || + (descr->cld_start <= io_start && descr->cld_end >= io_end))) { + struct osc_io *oio = osc_env_io(env); + + /* There must be only one lock to match the write region */ + LASSERT(oio->oi_write_osclock == NULL); + oio->oi_write_osclock = oscl; + } +} + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io) +{ + struct osc_lock *oscl; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + OBD_SLAB_ALLOC_PTR_GFP(oscl, osc_lock_kmem, GFP_NOFS); + if (oscl == NULL) + return -ENOMEM; + + oscl->ols_state = OLS_NEW; + spin_lock_init(&oscl->ols_lock); + INIT_LIST_HEAD(&oscl->ols_waiting_list); + INIT_LIST_HEAD(&oscl->ols_wait_entry); + INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); + + oscl->ols_flags = osc_enq2ldlm_flags(enqflags); + oscl->ols_agl = !!(enqflags & CEF_AGL); + if (oscl->ols_agl) + oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT; + if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { + oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; + oscl->ols_glimpse = 1; + } + osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); + + cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags 
& CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER)); + if (oscl->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + oscl->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, oscl); + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %#llx", + lock, oscl, oscl->ols_flags); + + return 0; +} + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. + */ +struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + + ENTRY; + + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_index2policy(policy, osc2cl(obj), index, index); + policy->l_extent.gid = LDLM_GID_ANY; + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + /* + * It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too + */ +again: + mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, + LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh, + dap_flags & OSC_DAP_FL_CANCELING); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c new file mode 100644 index 0000000000000..052f8bc90525c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -0,0 +1,482 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +static struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static struct osc_object *lu2osc(const struct lu_object *obj) +{ + LINVRNT(osc_is_object(obj)); + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +static int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + + osc->oo_oinfo = cconf->u.coc_oinfo; +#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + mutex_init(&osc->oo_debug_mutex); +#endif + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_full_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + spin_lock_init(&osc->oo_tree_lock); + spin_lock_init(&osc->oo_ol_spin); + INIT_LIST_HEAD(&osc->oo_ol_list); + + atomic_set(&osc->oo_nr_ios, 0); + init_waitqueue_head(&osc->oo_io_waitq); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} + +static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_full_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + LASSERT(list_empty(&osc->oo_ol_list)); + LASSERT(atomic_read(&osc->oo_nr_ios) == 0); + + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " + "ctime: %llu blocks: %llu", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} + +static int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: "DOSTID" " + "idx: %d gen: %d kms_valid: %u kms %llu " + "rc: %d force_sync: %d min_xid: %llu ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} + + +static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; + return 0; +} + +static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct ost_lvb *lvb = &oinfo->loi_lvb; + + if (valid & CAT_SIZE) + lvb->lvb_size = attr->cat_size; + if (valid & CAT_MTIME) + lvb->lvb_mtime = attr->cat_mtime; + if (valid & CAT_ATIME) + lvb->lvb_atime = attr->cat_atime; + if (valid & CAT_CTIME) + lvb->lvb_ctime = attr->cat_ctime; + if (valid & CAT_BLOCKS) + lvb->lvb_blocks = attr->cat_blocks; + if (valid & CAT_KMS) { + CDEBUG(D_CACHE, "set kms from %lluto %llu\n", + oinfo->loi_kms, (__u64)attr->cat_kms); + loi_kms_set(oinfo, attr->cat_kms); + } + return 0; +} + +static int osc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + ENTRY; + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + RETURN(0); +} + +static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + ENTRY; + + if (lock->l_ast_data == data) + lock->l_ast_data = NULL; + RETURN(LDLM_ITER_CONTINUE); +} + +static int osc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. */ + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + osc_object_ast_clear, osc); + return 0; +} + +static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, + struct ll_fiemap_info_key *fmkey, + struct fiemap *fiemap, size_t *buflen) +{ + struct obd_export *exp = osc_export(cl2osc(obj)); + struct ldlm_res_id resid; + union ldlm_policy_data policy; + struct lustre_handle lockh; + enum ldlm_mode mode = LCK_MINMODE; + struct ptlrpc_request *req; + struct fiemap *reply; + char *tmp; + int rc; + ENTRY; + + fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi; + if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC)) + goto skip_locking; + + policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK; + + if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <= + fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = (fmkey->lfik_fiemap.fm_start + + fmkey->lfik_fiemap.fm_length + + PAGE_SIZE - 1) & PAGE_MASK; + + ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid); + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY, + &resid, LDLM_EXTENT, &policy, + LCK_PR | LCK_PW, &lockh, 0); + if (mode) { /* lock is cached on client */ + if (mode != LCK_PR) { + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_PW); + } + } else { /* no cached lock, needs acquire lock on server side */ + fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS; + fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK; + } + +skip_locking: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) + GOTO(drop_lock, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, + sizeof(*fmkey)); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT, + *buflen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER, + *buflen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); 
+ if (rc != 0) { + ptlrpc_request_free(req); + GOTO(drop_lock, rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, fmkey, sizeof(*fmkey)); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, fiemap, *buflen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc != 0) + GOTO(fini_req, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) + GOTO(fini_req, rc = -EPROTO); + + memcpy(fiemap, reply, *buflen); +fini_req: + ptlrpc_req_finished(req); +drop_lock: + if (mode) + ldlm_lock_decref(&lockh, LCK_PR); + RETURN(rc); +} + +void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = cfs_time_current(); + /* mb(); */ + obj->oo_contended = 1; +} + +void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +int osc_object_is_contended(struct osc_object *obj) +{ + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + int osc_contention_time = dev->od_contention_time; + cfs_time_t cur_time = cfs_time_current(); + cfs_time_t retry_time; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) + return 1; + + if (!obj->oo_contended) + return 0; + + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = cfs_time_add(obj->oo_contention_time, + cfs_time_seconds(osc_contention_time)); + if (cfs_time_after(cur_time, retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; +} + +/** + * Implementation of struct cl_object_operations::coo_req_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + struct lov_oinfo *oinfo; + struct obdo *oa; + struct ost_lvb *lvb; + u64 flags = attr->cra_flags; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + if ((flags & OBD_MD_FLATIME) != 0) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + int rc; + + rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + if (rc) { + CERROR("Bad %llu to set " DOSTID " : rc %d\n", + (unsigned long long)ostid_id(&oinfo->loi_oi), + POSTID(&oa->o_oi), rc); + } + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + struct ldlm_lock *lock; + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), + OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); + if (lock == NULL && !opg->ops_srvlock) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + + resname = &osc_env_info(env)->oti_resname; + ostid_build_res_name(&oinfo->loi_oi, resname); + res = ldlm_resource_get( + osc_export(cl2osc(obj))->exp_obd->obd_namespace, + NULL, resname, LDLM_EXTENT, 0); + ldlm_resource_dump(D_ERROR, res); + + libcfs_debug_dumpstack(NULL); + LBUG(); + } + + /* check for lockless io. 
*/ + if (lock != NULL) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + LDLM_LOCK_PUT(lock); + } + } +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_prune = osc_object_prune, + .coo_fiemap = osc_object_fiemap, + .coo_req_attr_set = osc_req_attr_set +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) +{ + struct l_wait_info lwi = { 0 }; + ENTRY; + + CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n", + osc, atomic_read(&osc->oo_nr_ios)); + + l_wait_event(osc->oo_io_waitq, atomic_read(&osc->oo_nr_ios) == 0, &lwi); + + /* Discard all dirty pages of this object. */ + osc_cache_truncate_start(env, osc, 0, NULL); + + /* Discard all caching pages */ + osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, true); + + /* Clear ast data of dlm lock. Do this after discarding all pages */ + osc_object_prune(env, osc2cl(osc)); + + RETURN(0); +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c new file mode 100644 index 0000000000000..20ed2d75db79f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -0,0 +1,1107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for OSC layer. 
+ * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Page operations. + */ +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = opg->ops_cl.cpl_page; + + if (opg->ops_transfer_pinned) { + opg->ops_transfer_pinned = 0; + lu_ref_del(&page->cp_reference, "transfer", page); + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). + */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + /* ops_lru and ops_inflight share the same field, so take it from LRU + * first and then use it as inflight. */ + osc_lru_use(osc_cli(obj), opg); +} + +int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int result; + ENTRY; + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + RETURN(result); +} + +void osc_index2policy(union ldlm_policy_data *policy, + const struct cl_object *obj, pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof *policy); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static const char *osc_list(struct list_head *head) +{ + return list_empty(head) ? 
"-" : "+"; +} + +static inline cfs_time_t osc_submit_duration(struct osc_page *opg) +{ + if (opg->ops_submit_time == 0) + return 0; + + return (cfs_time_current() - opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " + "1< %#x %d %u %s %s > " + "2< %lld %u %u %#x %#x | %p %p %p > " + "3< %d %lu %d > " + "4< %d %d %d %lu %s | %s %s %s %s > " + "5< %s %s %s %s | %d %s | %d %s %s>\n", + opg, osc_index(opg), + /* 1 */ + oap->oap_magic, oap->oap_cmd, + oap->oap_interrupted, + osc_list(&oap->oap_pending_item), + osc_list(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + osc_list(&cli->cl_cache_waiters), + osc_list(&cli->cl_loi_ready_list), + osc_list(&cli->cl_loi_hp_ready_list), + osc_list(&cli->cl_loi_write_list), + osc_list(&cli->cl_loi_read_list), + /* 5 */ + osc_list(&obj->oo_ready_item), + osc_list(&obj->oo_hp_ready_item), + osc_list(&obj->oo_write_item), + osc_list(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + osc_list(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + osc_list(&obj->oo_hp_exts), + osc_list(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + ENTRY; + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + osc_lru_del(osc_cli(obj), opg); + + if (slice->cpl_page->cp_type == CPT_CACHEABLE) { + void *value = NULL; + + spin_lock(&obj->oo_tree_lock); + if (opg->ops_intree) { + value = radix_tree_delete(&obj->oo_tree, + osc_index(opg)); + if (value != NULL) { + --obj->oo_npages; + opg->ops_intree = 0; + } + } + spin_unlock(&obj->oo_tree_lock); + + LASSERT(ergo(value != NULL, value == opg)); + } + + EXIT; +} + +static void osc_page_clip(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + opg->ops_from = from; + opg->ops_to = to; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_cancel(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + /* Check if the transferring against this page + * is completed, or not even queued. */ + if (opg->ops_transfer_pinned) + /* FIXME: may not be interrupted.. 
*/ + rc = osc_cancel_async_page(env, opg); + LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); + return rc; +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + ENTRY; + rc = osc_flush_async_page(env, io, opg); + RETURN(rc); +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_clip = osc_page_clip, + .cpo_cancel = osc_page_cancel, + .cpo_flush = osc_page_flush +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, page); + struct osc_io *oio = osc_env_io(env); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_SIZE; + + INIT_LIST_HEAD(&opg->ops_lru); + + result = osc_prep_async_page(osc, opg, page->cp_vmpage, + cl_offset(obj, index)); + if (result != 0) + return result; + + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(page, &opg->ops_cl, obj, index, + &osc_page_ops); + + + /* reserve an LRU space for this page */ + if (page->cp_type == CPT_CACHEABLE) { + result = osc_lru_alloc(env, osc_cli(osc), opg); + if (result == 0) { + result = radix_tree_preload(GFP_NOFS); + if (result == 0) { + spin_lock(&osc->oo_tree_lock); + result = radix_tree_insert(&osc->oo_tree, + index, opg); + if (result == 0) { + ++osc->oo_npages; + opg->ops_intree = 1; + } + spin_unlock(&osc->oo_tree_lock); + + radix_tree_preload_end(); + } + } + } + + return result; +} + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags) +{ + struct osc_async_page *oap = &opg->ops_oap; + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " + "magic 0x%x\n", oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + oap->oap_cmd |= OBD_BRW_NOQUOTA; + } + + opg->ops_submit_time = cfs_time_current(); + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. + */ + +static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq); + +/** + * LRU pages are freed in batch mode. OSC should at least free this + * number of pages to avoid running out of LRU slots. + */ +static inline int lru_shrink_min(struct client_obd *cli) +{ + return cli->cl_max_pages_per_rpc * 2; +} + +/** + * free this number at most otherwise it will take too long time to finsih. 
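+ * For example (illustrative values only): with cl_max_pages_per_rpc = 256
+ * and cl_max_rpcs_in_flight = 8, at most 2048 pages are dropped per pass.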
+ */ +static inline int lru_shrink_max(struct client_obd *cli) +{ + return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; +} + +/** + * Check if we can free LRU slots from this OSC. If there exists LRU waiters, + * we should free slots aggressively. In this way, slots are freed in a steady + * step to maintain fairness among OSCs. + * + * Return how many LRU pages should be freed. + */ +static int osc_cache_too_much(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + long pages = atomic_long_read(&cli->cl_lru_in_list); + unsigned long budget; + + LASSERT(cache != NULL); + budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2); + + /* if it's going to run out LRU slots, we should free some, but not + * too much to maintain faireness among OSCs. */ + if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) { + if (pages >= budget) + return lru_shrink_max(cli); + else if (pages >= budget / 2) + return lru_shrink_min(cli); + } else { + time64_t duration = ktime_get_real_seconds(); + long timediff; + + /* knock out pages by duration of no IO activity */ + duration -= cli->cl_lru_last_used; + /* + * The difference shouldn't be more than 70 years + * so we can safely case to a long. Round to + * approximately 1 minute. + */ + timediff = (long)(duration >> 6); + if (timediff > 0 && pages >= budget / timediff) + return lru_shrink_min(cli); + } + return 0; +} + +int lru_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + int count; + + CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli)); + count = osc_cache_too_much(cli); + if (count > 0) { + int rc = osc_lru_shrink(env, cli, count, false); + + CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n", + cli_name(cli), rc, count); + if (rc >= count) { + CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli)); + ptlrpcd_queue_work(cli->cl_lru_work); + } + } + + RETURN(0); +} + +void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist) +{ + struct list_head lru = LIST_HEAD_INIT(lru); + struct osc_async_page *oap; + long npages = 0; + + list_for_each_entry(oap, plist, oap_pending_item) { + struct osc_page *opg = oap2osc_page(oap); + + if (!opg->ops_in_lru) + continue; + + ++npages; + LASSERT(list_empty(&opg->ops_lru)); + list_add(&opg->ops_lru, &lru); + } + + if (npages > 0) { + spin_lock(&cli->cl_lru_list_lock); + list_splice_tail(&lru, &cli->cl_lru_list); + atomic_long_sub(npages, &cli->cl_lru_busy); + atomic_long_add(npages, &cli->cl_lru_in_list); + cli->cl_lru_last_used = ktime_get_real_seconds(); + spin_unlock(&cli->cl_lru_list_lock); + + if (waitqueue_active(&osc_lru_waitq)) + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } +} + +static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg) +{ + LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0); + list_del_init(&opg->ops_lru); + atomic_long_dec(&cli->cl_lru_in_list); +} + +/** + * Page is being destroyed. The page may be not in LRU list, if the transfer + * has never finished(error occurred). 
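+ * For pages that were accounted in the LRU (ops_in_lru), the freed slot is
+ * returned to cl_lru_left and any waiters on osc_lru_waitq are woken.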
+ */ +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg) +{ + if (opg->ops_in_lru) { + spin_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + __osc_lru_del(cli, opg); + } else { + LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0); + atomic_long_dec(&cli->cl_lru_busy); + } + spin_unlock(&cli->cl_lru_list_lock); + + atomic_long_inc(cli->cl_lru_left); + /* this is a great place to release more LRU pages if + * this osc occupies too many LRU pages and kernel is + * stealing one of them. */ + if (osc_cache_too_much(cli)) { + CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli)); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + wake_up(&osc_lru_waitq); + } else { + LASSERT(list_empty(&opg->ops_lru)); + } +} + +/** + * Delete page from LRU list for redirty. + */ +static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) +{ + /* If page is being transferred for the first time, + * ops_lru should be empty */ + if (opg->ops_in_lru) { + spin_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + __osc_lru_del(cli, opg); + atomic_long_inc(&cli->cl_lru_busy); + } + spin_unlock(&cli->cl_lru_list_lock); + } +} + +static void discard_pagevec(const struct lu_env *env, struct cl_io *io, + struct cl_page **pvec, int max_index) +{ + int i; + + for (i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; + + LASSERT(cl_page_is_owned(page, io)); + cl_page_delete(env, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + cl_page_put(env, page); + + pvec[i] = NULL; + } +} + +/** + * Check if a cl_page can be released, i.e, it's not being used. + * + * If unstable account is turned on, bulk transfer may hold one refcount + * for recovery so we need to check vmpage refcount as well; otherwise, + * even we can destroy cl_page but the corresponding vmpage can't be reused. + */ +static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page) +{ + if (cl_page_in_use_noref(page)) + return true; + + if (cli->cl_cache->ccc_unstable_check) { + struct page *vmpage = cl_page_vmpage(page); + + /* vmpage have two known users: cl_page and VM page cache */ + if (page_count(vmpage) - page_mapcount(vmpage) > 2) + return true; + } + return false; +} + +/** + * Drop @target of pages from LRU at most. 
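+ *
+ * \retval number of pages actually freed
+ * \retval -EBUSY if force is not set and another thread is already shrinking
+ *         this client_obd
+ * \retval negative error code from cl_io_init() if nothing could be freed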
+ */ +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force) +{ + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + long count = 0; + int maxscan = 0; + int index = 0; + int rc = 0; + ENTRY; + + LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); + if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) + RETURN(0); + + CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n", + cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force); + if (!force) { + if (atomic_read(&cli->cl_lru_shrinkers) > 0) + RETURN(-EBUSY); + + if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) { + atomic_dec(&cli->cl_lru_shrinkers); + RETURN(-EBUSY); + } + } else { + atomic_inc(&cli->cl_lru_shrinkers); + } + + pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; + io = &osc_env_info(env)->oti_io; + + spin_lock(&cli->cl_lru_list_lock); + if (force) + cli->cl_lru_reclaim++; + maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + bool will_free = false; + + if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1) + break; + + if (--maxscan < 0) + break; + + opg = list_entry(cli->cl_lru_list.next, struct osc_page, + ops_lru); + page = opg->ops_cl.cpl_page; + if (lru_page_busy(cli, page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + spin_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + if (cl_page_own_try(env, io, page) == 0) { + if (!lru_page_busy(cli, page)) { + /* remove it from lru list earlier to avoid + * lock contention */ + __osc_lru_del(cli, opg); + opg->ops_in_lru = 0; /* will be discarded */ + + cl_page_get(page); + will_free = true; + } else { + cl_page_disown(env, io, page); + } + } + + if (!will_free) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + /* Don't discard and free the page with cl_lru_list held */ + pvec[index++] = page; + if (unlikely(index == OTI_PVEC_SIZE)) { + spin_unlock(&cli->cl_lru_list_lock); + discard_pagevec(env, io, pvec, index); + index = 0; + + spin_lock(&cli->cl_lru_list_lock); + } + + if (++count >= target) + break; + } + spin_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + + atomic_dec(&cli->cl_lru_shrinkers); + if (count > 0) { + atomic_long_add(count, cli->cl_lru_left); + wake_up_all(&osc_lru_waitq); + } + RETURN(count > 0 ? count : rc); +} + +/** + * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least + * \@npages of LRU slots. For performance consideration, it's better to drop + * LRU pages in batch. Therefore, the actual number is adjusted at least + * max_pages_per_rpc. 
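+ * If this OSC cannot free enough slots from its own list, the remaining
+ * demand is served by walking the other client_obd instances that share the
+ * same cl_client_cache (the loop over ccc_lru below).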
+ */ +static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages) +{ + struct lu_env *env; + struct cl_client_cache *cache = cli->cl_cache; + int max_scans; + __u16 refcheck; + long rc = 0; + ENTRY; + + LASSERT(cache != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(rc); + + npages = max_t(int, npages, cli->cl_max_pages_per_rpc); + CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n", + cli_name(cli), npages); + rc = osc_lru_shrink(env, cli, npages, true); + if (rc >= npages) { + CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n", + cli_name(cli), rc, npages); + if (osc_cache_too_much(cli) > 0) + ptlrpcd_queue_work(cli->cl_lru_work); + GOTO(out, rc); + } else if (rc > 0) { + npages -= rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n", + cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy), npages); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + LASSERT(!list_empty(&cache->ccc_lru)); + + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users) - 2; + while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { + cli = list_entry(cache->ccc_lru.next, struct client_obd, + cl_lru_osc); + + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", + cli_name(cli), cli, + atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy)); + + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + if (osc_cache_too_much(cli) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(env, cli, npages, true); + spin_lock(&cache->ccc_lru_lock); + if (rc >= npages) + break; + if (rc > 0) + npages -= rc; + } + } + spin_unlock(&cache->ccc_lru_lock); + +out: + cl_env_put(env, &refcheck); + CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", + cli_name(cli), cli, rc); + return rc; +} + +/** + * osc_lru_alloc() is called to allocate an LRU slot for a cl_page. + * + * Usually the LRU slots are reserved in osc_io_iter_rw_init(). + * Only in the case that the LRU slots are in extreme shortage, it should + * have reserved enough slots for an IO. + */ +static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, + struct osc_page *opg) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct osc_io *oio = osc_env_io(env); + int rc = 0; + ENTRY; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + RETURN(0); + + if (oio->oi_lru_reserved > 0) { + --oio->oi_lru_reserved; + goto out; + } + + LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); + while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli, 1); + if (rc < 0) + break; + if (rc > 0) + continue; + + cond_resched(); + rc = l_wait_event(osc_lru_waitq, + atomic_long_read(cli->cl_lru_left) > 0, + &lwi); + if (rc < 0) + break; + } + +out: + if (rc >= 0) { + atomic_long_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + RETURN(rc); +} + +/** + * osc_lru_reserve() is called to reserve enough LRU slots for I/O. + * + * The benefit of doing this is to reduce contention against atomic counter + * cl_lru_left by changing it from per-page access to per-IO access. 
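+ *
+ * The reservation below is opportunistic: after optionally kicking
+ * osc_lru_reclaim(), a cmpxchg loop claims either all of the (possibly
+ * capped) @npages slots or none at all, and the number actually reserved is
+ * returned to the caller.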
+ */ +unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages) +{ + unsigned long reserved = 0; + unsigned long max_pages; + unsigned long c; + + /* reserve a full RPC window at most to avoid that a thread accidentally + * consumes too many LRU slots */ + max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; + if (npages > max_pages) + npages = max_pages; + + c = atomic_long_read(cli->cl_lru_left); + if (c < npages && osc_lru_reclaim(cli, npages) > 0) + c = atomic_long_read(cli->cl_lru_left); + while (c >= npages) { + if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { + reserved = npages; + break; + } + c = atomic_long_read(cli->cl_lru_left); + } + if (atomic_long_read(cli->cl_lru_left) < max_pages) { + /* If there aren't enough pages in the per-OSC LRU then + * wake up the LRU thread to try and clear out space, so + * we don't block if pages are being dirtied quickly. */ + CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n", + cli_name(cli), atomic_long_read(cli->cl_lru_left), + max_pages); + (void)ptlrpcd_queue_work(cli->cl_lru_work); + } + + return reserved; +} + +/** + * osc_lru_unreserve() is called to unreserve LRU slots. + * + * LRU slots reserved by osc_lru_reserve() may have entries left due to several + * reasons such as page already existing or I/O error. Those reserved slots + * should be freed by calling this function. + */ +void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) +{ + atomic_long_add(npages, cli->cl_lru_left); + wake_up_all(&osc_lru_waitq); +} + +/** + * Atomic operations are expensive. We accumulate the accounting for the + * same page zone to get better performance. + * In practice this can work pretty good because the pages in the same RPC + * are likely from the same page zone. + */ +static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + int factor) +{ + int page_count = desc->bd_iov_count; + void *zone = NULL; + int count = 0; + int i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + for (i = 0; i < page_count; i++) { + void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); + + if (likely(pz == zone)) { + ++count; + continue; + } + + if (count > 0) { + mod_zone_page_state(zone, NR_UNSTABLE_NFS, + factor * count); + count = 0; + } + zone = pz; + ++count; + } + if (count > 0) + mod_zone_page_state(zone, NR_UNSTABLE_NFS, factor * count); +} + +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +{ + unstable_page_accounting(desc, 1); +} + +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +{ + unstable_page_accounting(desc, -1); +} + +/** + * Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. Thus at this point, the pages + * involved in the bulk transfer are no longer considered unstable. + * + * If this function is called, the request should have been committed + * or req:rq_unstable must have been set; it implies that the unstable + * statistic have been added. 
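+ *
+ * When the last unstable page of the cache is dropped, any thread sleeping
+ * on ccc_unstable_waitq is woken up; independently, the LRU work item is
+ * re-queued whenever threads are waiting for LRU slots, so that the freed
+ * slots can be reused promptly.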
+ */ +void osc_dec_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int page_count = desc->bd_iov_count; + long unstable_count; + + LASSERT(page_count >= 0); + dec_unstable_page_accounting(desc); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_unstable_count); + LASSERT(unstable_count >= 0); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_cache->ccc_unstable_nr); + LASSERT(unstable_count >= 0); + if (unstable_count == 0) + wake_up_all(&cli->cl_cache->ccc_unstable_waitq); + + if (waitqueue_active(&osc_lru_waitq)) + (void)ptlrpcd_queue_work(cli->cl_lru_work); +} + +/** + * "unstable" page accounting. See: osc_dec_unstable_pages. + */ +void osc_inc_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + long page_count = desc->bd_iov_count; + + /* No unstable page tracking */ + if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) + return; + + add_unstable_page_accounting(desc); + atomic_long_add(page_count, &cli->cl_unstable_count); + atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); + + /* If the request has already been committed (i.e. brw_commit + * called via rq_commit_cb), we need to undo the unstable page + * increments we just performed because rq_commit_cb wont be + * called again. */ + spin_lock(&req->rq_lock); + if (unlikely(req->rq_committed)) { + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_unstable = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Check if it piggybacks SOFT_SYNC flag to OST from this OSC. + * This function will be called by every BRW RPC so it's critical + * to make this function fast. + */ +bool osc_over_unstable_soft_limit(struct client_obd *cli) +{ + long unstable_nr, osc_unstable_count; + + /* Can't check cli->cl_unstable_count, therefore, no soft limit */ + if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) + return false; + + osc_unstable_count = atomic_long_read(&cli->cl_unstable_count); + unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr); + + CDEBUG(D_CACHE, + "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n", + cli_name(cli), cli, unstable_nr, osc_unstable_count); + + /* If the LRU slots are in shortage - 25% remaining AND this OSC + * has one full RPC window of unstable pages, it's a good chance + * to piggyback a SOFT_SYNC flag. + * Please notice that the OST won't take immediate response for the + * SOFT_SYNC request so active OSCs will have more chance to carry + * the flag, this is reasonable. 
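+ *
+ * Purely illustrative example (hypothetical values, not defaults enforced
+ * here): with ccc_lru_max = 4096, cl_max_pages_per_rpc = 256 and
+ * cl_max_rpcs_in_flight = 8, SOFT_SYNC would be piggybacked once the
+ * cache-wide unstable page count exceeds 1024 (25% of ccc_lru_max) and this
+ * OSC by itself holds more than 2048 unstable pages (one full RPC window).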
*/ + return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 && + osc_unstable_count > cli->cl_max_pages_per_rpc * + cli->cl_max_rpcs_in_flight; +} + +/** + * Return how many LRU pages in the cache of all OSC devices + * + * \retval return # of cached LRU pages times reclaimation tendency + * \retval SHRINK_STOP if it cannot do any scanning in this time + */ +unsigned long osc_cache_shrink_count(struct shrinker *sk, + struct shrink_control *sc) +{ + struct client_obd *cli; + unsigned long cached = 0; + + spin_lock(&osc_shrink_lock); + list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list) + cached += atomic_long_read(&cli->cl_lru_in_list); + spin_unlock(&osc_shrink_lock); + + return (cached * sysctl_vfs_cache_pressure) / 100; +} + +/** + * Scan and try to reclaim sc->nr_to_scan cached LRU pages + * + * \retval number of cached LRU pages reclaimed + * \retval SHRINK_STOP if it cannot do any scanning in this time + * + * Linux kernel will loop calling this shrinker scan routine with + * sc->nr_to_scan = SHRINK_BATCH(128 for now) until kernel got enough memory. + * + * If sc->nr_to_scan is 0, the VM is querying the cache size, we don't need + * to scan and try to reclaim LRU pages, just return 0 and + * osc_cache_shrink_count() will report the LRU page number. + */ +unsigned long osc_cache_shrink_scan(struct shrinker *sk, + struct shrink_control *sc) +{ + struct client_obd *cli; + struct client_obd *stop_anchor = NULL; + struct lu_env *env; + long shrank = 0; + int rc; + __u16 refcheck; + + if (sc->nr_to_scan == 0) + return 0; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return SHRINK_STOP; + + spin_lock(&osc_shrink_lock); + while (!list_empty(&osc_shrink_list)) { + cli = list_entry(osc_shrink_list.next, struct client_obd, + cl_shrink_list); + + if (stop_anchor == NULL) + stop_anchor = cli; + else if (cli == stop_anchor) + break; + + list_move_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* shrink no more than max_pages_per_rpc for an OSC */ + rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) > + cli->cl_max_pages_per_rpc ? + cli->cl_max_pages_per_rpc : + sc->nr_to_scan - shrank, true); + if (rc > 0) + shrank += rc; + + if (shrank >= sc->nr_to_scan) + goto out; + + spin_lock(&osc_shrink_lock); + } + spin_unlock(&osc_shrink_lock); + +out: + cl_env_put(env, &refcheck); + + return shrank; +} + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c new file mode 100644 index 0000000000000..7dcbbd79a5de0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c @@ -0,0 +1,303 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2015, Intel Corporation. + * + * Code originally extracted from quota directory + */ + +#include +#include "osc_internal.h" + +static inline struct osc_quota_info *osc_oqi_alloc(u32 id) +{ + struct osc_quota_info *oqi; + + OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem); + if (oqi != NULL) + oqi->oqi_id = id; + + return oqi; +} + +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) +{ + int type; + ENTRY; + + for (type = 0; type < LL_MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if (oqi) { + /* do not try to access oqi here, it could have been + * freed by osc_quota_setdq() */ + + /* the slot is busy, the user is about to run out of + * quota space on this OST */ + CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", + type == USRQUOTA ? "user" : "grout", qid[type]); + RETURN(NO_QUOTA); + } + } + + RETURN(QUOTA_OK); +} + +static inline u32 md_quota_flag(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return OBD_MD_FLUSRQUOTA; + case GRPQUOTA: + return OBD_MD_FLGRPQUOTA; + case PRJQUOTA: + return OBD_MD_FLPRJQUOTA; + default: + return 0; + } +} + +static inline u32 fl_quota_flag(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return OBD_FL_NO_USRQUOTA; + case GRPQUOTA: + return OBD_FL_NO_GRPQUOTA; + case PRJQUOTA: + return OBD_FL_NO_PRJQUOTA; + default: + return 0; + } +} + +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + u64 valid, u32 flags) +{ + int type; + int rc = 0; + + ENTRY; + + if ((valid & (OBD_MD_FLALLQUOTA)) == 0) + RETURN(0); + + for (type = 0; type < LL_MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + if ((valid & md_quota_flag(type)) == 0) + continue; + + /* lookup the ID in the per-type hash table */ + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if ((flags & fl_quota_flag(type)) != 0) { + /* This ID is getting close to its quota limit, let's + * switch to sync I/O */ + if (oqi != NULL) + continue; + + oqi = osc_oqi_alloc(qid[type]); + if (oqi == NULL) { + rc = -ENOMEM; + break; + } + + rc = cfs_hash_add_unique(cli->cl_quota_hash[type], + &qid[type], &oqi->oqi_hash); + /* race with others? 
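+ * cfs_hash_add_unique() returns -EALREADY when a concurrent
+ * thread inserted the same ID first; the duplicate oqi
+ * allocated above is then simply freed again below.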
*/ + if (rc == -EALREADY) { + rc = 0; + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + } + + CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n", + cli_name(cli), qtype_name(type), qid[type], rc); + } else { + /* This ID is now off the hook, let's remove it from + * the hash table */ + if (oqi == NULL) + continue; + + oqi = cfs_hash_del_key(cli->cl_quota_hash[type], + &qid[type]); + if (oqi) + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + + CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n", + cli_name(cli), qtype_name(type), qid[type], oqi); + } + } + + RETURN(rc); +} + +/* + * Hash operations for uid/gid <-> osc_quota_info + */ +static unsigned +oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_u32_hash(*((__u32*)key), mask); +} + +static int +oqi_keycmp(const void *key, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + u32 uid; + + LASSERT(key != NULL); + uid = *((u32 *)key); + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + return uid == oqi->oqi_id; +} + +static void * +oqi_key(struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + return &oqi->oqi_id; +} + +static void * +oqi_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct osc_quota_info, oqi_hash); +} + +static void +oqi_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); +} + +#define HASH_QUOTA_BKT_BITS 5 +#define HASH_QUOTA_CUR_BITS 5 +#define HASH_QUOTA_MAX_BITS 15 + +static struct cfs_hash_ops quota_hash_ops = { + .hs_hash = oqi_hashfn, + .hs_keycmp = oqi_keycmp, + .hs_key = oqi_key, + .hs_object = oqi_object, + .hs_get = oqi_get, + .hs_put_locked = oqi_put_locked, + .hs_exit = oqi_exit, +}; + +int osc_quota_setup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int i, type; + ENTRY; + + for (type = 0; type < LL_MAXQUOTAS; type++) { + cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", + HASH_QUOTA_CUR_BITS, + HASH_QUOTA_MAX_BITS, + HASH_QUOTA_BKT_BITS, + 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + "a_hash_ops, + CFS_HASH_DEFAULT); + if (cli->cl_quota_hash[type] == NULL) + break; + } + + if (type == LL_MAXQUOTAS) + RETURN(0); + + for (i = 0; i < type; i++) + cfs_hash_putref(cli->cl_quota_hash[i]); + + RETURN(-ENOMEM); +} + +int osc_quota_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int type; + ENTRY; + + for (type = 0; type < LL_MAXQUOTAS; type++) + cfs_hash_putref(cli->cl_quota_hash[type]); + + RETURN(0); +} + +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION, + OST_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } 
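+	/* ptlrpc_queue_wait() succeeded but the reply could not be unpacked */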
else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c new file mode 100644 index 0000000000000..db6ca2da2e4db --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -0,0 +1,3117 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osc_cl_internal.h" +#include "osc_internal.h" + +atomic_t osc_pool_req_count; +unsigned int osc_reqpool_maxreqcount; +struct ptlrpc_request_pool *osc_rq_pool; + +/* max memory used for request pool, unit is MB */ +static unsigned int osc_reqpool_mem_max = 5; +module_param(osc_reqpool_mem_max, uint, 0444); + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; +}; + +#define osc_grant_args osc_brw_async_args + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct osc_object *fa_obj; + struct obdo *fa_oa; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_ladvise_args { + struct obdo *la_oa; + obd_enqueue_update_f la_upcall; + void *la_cookie; +}; + +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + unsigned int oa_agl:1; +}; + +static void osc_release_ppga(struct brw_page **ppga, size_t count); +static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *data, int rc); + +void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + req = 
ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + EXIT; +out: + ptlrpc_req_finished(req); + + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + + ENTRY; + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + EXIT; +out: + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_setattr_args *sa, int rc) +{ + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, + &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + RETURN(rc); +} + +int osc_setattr_async(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oa); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + } else { + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_setattr_interpret; + + CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + } + + RETURN(0); +} + +static int osc_ladvise_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_ladvise_args *la = arg; + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + *la->la_oa = body->oa; +out: + rc = la->la_upcall(la->la_cookie, rc); + RETURN(rc); +} + +/** + * If rqset is NULL, do not wait for response. 
Upcall and cookie could also + * be NULL in this case + */ +int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, + struct ladvise_hdr *ladvise_hdr, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_ladvise_args *la; + int rc; + struct lu_ladvise *req_ladvise; + struct lu_ladvise *ladvise = ladvise_hdr->lah_advise; + int num_advise = ladvise_hdr->lah_count; + struct ladvise_hdr *req_ladvise_hdr; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_LADVISE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT, + num_advise * sizeof(*ladvise)); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, + oa); + + req_ladvise_hdr = req_capsule_client_get(&req->rq_pill, + &RMF_OST_LADVISE_HDR); + memcpy(req_ladvise_hdr, ladvise_hdr, sizeof(*ladvise_hdr)); + + req_ladvise = req_capsule_client_get(&req->rq_pill, &RMF_OST_LADVISE); + memcpy(req_ladvise, ladvise, sizeof(*ladvise) * num_advise); + ptlrpc_request_set_replen(req); + + if (rqset == NULL) { + /* Do not wait for response. */ + ptlrpcd_add_req(req); + RETURN(0); + } + + req->rq_interpret_reply = osc_ladvise_interpret; + CLASSERT(sizeof(*la) <= sizeof(req->rq_async_args)); + la = ptlrpc_req_async_args(req); + la->la_oa = oa; + la->la_upcall = upcall; + la->la_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + + RETURN(0); +} + +static int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + ENTRY; + + LASSERT(oa != NULL); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out_req, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out_req, rc = -EPROTO); + + CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + CDEBUG(D_HA, "transno: %lld\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int osc_punch_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct ost_body *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc) { + 
ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; + CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + + RETURN(0); +} + +static int osc_sync_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_fsync_args *fa = arg; + struct ost_body *body; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + CERROR("can't unpack ost_body\n"); + GOTO(out, rc = -EPROTO); + } + + *fa->fa_oa = body->oa; + obj = osc2cl(fa->fa_obj); + + /* Update osc object's blocks attribute */ + cl_object_attr_lock(obj); + if (body->oa.o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = body->oa.o_blocks; + valid |= CAT_BLOCKS; + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + +out: + rc = fa->fa_upcall(fa->fa_cookie, rc); + RETURN(rc); +} + +int osc_sync_base(struct osc_object *obj, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_sync_interpret; + + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_obj = obj; + fa->fa_oa = oa; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + + RETURN (0); +} + +/* Find and cancel locally locks matched by @mode in the resource found by + * @objid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, + struct list_head *cancels, + enum ldlm_mode mode, __u64 lock_flags) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. 
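+ * Note that ldlm_resource_get() below may fail, e.g. when no lock has
+ * ever been created for this object; that case is likewise treated as
+ * having nothing to cancel and 0 is returned.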
*/ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (IS_ERR(res)) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, cancels, NULL, mode, + lock_flags, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +static int osc_destroy_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, + int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; +} + +static int osc_can_send_destroy(struct client_obd *cli) +{ + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int rc, count; + ENTRY; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + rc = l_wait_event_exclusive(cli->cl_destroy_waitq, + osc_can_send_destroy(cli), &lwi); + if (rc) { + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req); + RETURN(0); +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + u64 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + spin_lock(&cli->cl_loi_list_lock); + if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, GRANT_PARAM)) + oa->o_dirty = cli->cl_dirty_grant; + else + oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; + if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > + cli->cl_dirty_max_pages)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty_pages, cli->cl_dirty_transit, + cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else if (unlikely(atomic_long_read(&obd_dirty_pages) - + atomic_long_read(&obd_dirty_transit_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in 
a small fudge factor (+1). */ + CERROR("%s: dirty %ld - %ld > system dirty_max %ld\n", + cli_name(cli), atomic_long_read(&obd_dirty_pages), + atomic_long_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > + 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); + oa->o_undirty = 0; + } else { + unsigned long nrpages; + + nrpages = cli->cl_max_pages_per_rpc; + nrpages *= cli->cl_max_rpcs_in_flight + 1; + nrpages = max(nrpages, cli->cl_dirty_max_pages); + oa->o_undirty = nrpages << PAGE_SHIFT; + if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, + GRANT_PARAM)) { + int nrextents; + + /* take extent tax into account when asking for more + * grant space */ + nrextents = (nrpages + cli->cl_max_extent_pages - 1) / + cli->cl_max_extent_pages; + oa->o_undirty += nrextents * cli->cl_grant_extent_tax; + } + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + oa->o_dropped = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", + oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = + cfs_time_shift(cli->cl_grant_shrink_interval); + CDEBUG(D_CACHE, "next time %ld to shrink grant \n", + cli->cl_next_shrink_grant); +} + +static void __osc_update_grant(struct client_obd *cli, u64 grant) +{ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + spin_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set); + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *aa, int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBDO_FREE(oa); + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + spin_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + spin_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. 
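+ *
+ * Illustrative example (hypothetical values, not defaults dictated by this
+ * code): with 4 KiB pages, cl_max_pages_per_rpc = 256 and
+ * cl_max_rpcs_in_flight = 8, the first shrink target is (8 + 1) * 1 MiB =
+ * 9 MiB; once cl_avail_grant is already at or below that, the target drops
+ * to a single RPC worth of grant, i.e. 1 MiB.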
*/ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_SHIFT); + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + spin_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + ENTRY; + + spin_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. */ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + spin_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + spin_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); + + osc_announce_cached(cli, &body->oa, 0); + + spin_lock(&cli->cl_loi_list_lock); + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + spin_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); + OBD_FREE_PTR(body); + RETURN(rc); +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + cfs_time_t time = cfs_time_current(); + cfs_time_t next_shrink = client->cl_next_shrink_grant; + + if ((client->cl_import->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_GRANT_SHRINK) == 0) + return 0; + + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. 
*/ + int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } + return 0; +} + +static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) +{ + struct client_obd *client; + + list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { + if (osc_should_shrink_grant(client)) + osc_shrink_grant(client); + } + return 0; +} + +static int osc_add_shrink_grant(struct client_obd *client) +{ + int rc; + + rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, + TIMEOUT_GRANT, + osc_grant_shrink_grant_cb, NULL, + &client->cl_grant_shrink_list); + if (rc) { + CERROR("add grant client %s error %d\n", cli_name(client), rc); + return rc; + } + CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client)); + osc_update_next_shrink(client); + return 0; +} + +static int osc_del_shrink_grant(struct client_obd *client) +{ + return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, + TIMEOUT_GRANT); +} + +static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +{ + /* + * ocd_grant is the total grant amount we're expect to hold: if we've + * been evicted, it's the new avail_grant amount, cl_dirty_pages will + * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant + + * dirty. + * + * race is tolerable here: if we're evicted, but imp_state already + * left EVICTED state, then cl_dirty_pages must be 0 already. + */ + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = ocd->ocd_grant; + if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { + cli->cl_avail_grant -= cli->cl_reserved_grant; + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) + cli->cl_avail_grant -= cli->cl_dirty_grant; + else + cli->cl_avail_grant -= + cli->cl_dirty_pages << PAGE_SHIFT; + } + + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { + u64 size; + int chunk_mask; + + /* overhead for each extent insertion */ + cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10; + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_SHIFT, + ocd->ocd_grant_blkbits); + /* max_pages_per_rpc must be chunk aligned */ + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); + cli->cl_max_pages_per_rpc = (cli->cl_max_pages_per_rpc + + ~chunk_mask) & chunk_mask; + /* determine maximum extent size, in #pages */ + size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits; + cli->cl_max_extent_pages = size >> PAGE_SHIFT; + if (cli->cl_max_extent_pages == 0) + cli->cl_max_extent_pages = 1; + } else { + cli->cl_grant_extent_tax = 0; + cli->cl_chunkbits = PAGE_SHIFT; + cli->cl_max_extent_pages = DT_MAX_BRW_PAGES; + } + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld." + "chunk bits: %d cl_max_extent_pages: %d\n", + cli_name(cli), + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, + cli->cl_max_extent_pages); + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); +} + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. 
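+ *
+ * Hypothetical example: a three-page read whose nob_read covers only the
+ * first page and half of the second leaves the tail of the second page and
+ * the whole third page to be zero-filled by the loops below.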
*/ +static void handle_short_read(int nob_read, size_t page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + size_t page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return(-EPROTO); + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) + return(remote_rcs[i]); + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return(-EPROTO); + } + } + + if (req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return(-EPROTO); + } + + return (0); +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC | + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please " + "report this at https://jira.hpdd.intel.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +static u32 osc_checksum_bulk(int nob, size_t pg_count, + struct brw_page **pga, int opc, + cksum_type_t cksum_type) +{ + u32 cksum; + int i = 0; + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = sizeof(cksum); + cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + return cksum; +} + +static int +osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, + u32 page_count, struct brw_page **pga, + struct ptlrpc_request **reqp, int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + + ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + RETURN(-ENOMEM); /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + RETURN(-EINVAL); /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + osc_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + RETURN(-ENOMEM); + + for (niocount = i = 1; i < page_count; i++) { + if (!can_merge_pages(pga[i - 1], pga[i])) + niocount++; + } + + pill = &req->rq_pill; + req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, + sizeof(*ioobj)); + req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, + niocount * sizeof(*niobuf)); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : + PTLRPC_BULK_PUT_SINK) | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + /* NB request now owns desc and will free it when it gets freed */ + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). 
It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. LU-1431 */ + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: %llu, count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off %llu" + " prev_pg %p [pri %lu ind %lu] off %llu\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count); + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->rnb_len += pg->count; + } else { + niobuf->rnb_offset = pg->off; + niobuf->rnb_len = pg->count; + niobuf->rnb_flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + cksum_type_t cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + + body->oa.o_flags |= cksum_type_pack(cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + body->oa.o_cksum = osc_checksum_bulk(requested_nob, + page_count, pga, + OST_WRITE, + cksum_type); + CDEBUG(D_PAGE, "checksum at write origin: %x\n", + body->oa.o_cksum); + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= cksum_type_pack(cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. 
b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + + /* Client cksum has been already copied to wire obdo in previous + * lustre_set_wire_obdo(), and in the case a bulk-read is being + * resent due to cksum error, this will allow Server to + * check+dump pages on its side */ + } + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + + *reqp = req; + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + CDEBUG(D_RPCTRACE, "brw rpc %p - object "DOSTID" offset %lld<>%lld\n", + req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, + niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); + RETURN(0); + + out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, + struct brw_page **pga, __u32 server_cksum, + __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 ? + libcfs_debug_file_path_arr : + LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1, + client_cksum, server_cksum); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < page_count; i++) { + len = pga[i]->count; + buf = kmap(pga[i]->pg); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + CDEBUG(D_INFO, "%s: wrote %d bytes\n", + dbgcksum_file_name, rc); + } + kunmap(pga[i]->pg); + } + + rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + return; +} + +static int +check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + __u32 new_cksum; + char *msg; + cksum_type_t cksum_type; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + if (aa->aa_cli->cl_checksum_dump) + dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, + server_cksum, client_cksum); + + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + new_cksum = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type); + + if (cksum_type != cksum_type_unpack(aa->aa_oa->o_flags)) + msg = "the server did not use the checksum type specified in " + "the original request - likely a protocol problem"; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - " + "likely false positive due to mmap IO (bug 11742)"; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - " + "likely false positive due to mmap IO (bug 11742)"; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD WRITE CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], original " + "client csum %x (type %x), server csum %x (type %x)," + " client csum now %x\n", + aa->aa_cli->cl_import->imp_obd->obd_name, + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count - 1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - 1, + client_cksum, cksum_type_unpack(aa->aa_oa->o_flags), + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + const struct lnet_process_id *peer = + &req->rq_import->imp_connection->c_peer; + struct client_obd *cli = aa->aa_cli; + struct ost_body *body; + u32 client_cksum = 0; + ENTRY; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); + RETURN(rc); + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); + RETURN(-EPROTO); + } + + /* set/clear over quota flag for a uid/gid/projid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLALLQUOTA)) { + unsigned qid[LL_MAXQUOTAS] = { + body->oa.o_uid, body->oa.o_gid, + body->oa.o_projid }; + CDEBUG(D_QUOTA, "setdq for [%u %u %u] with valid %#llx, flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, + body->oa.o_valid, body->oa.o_flags); + osc_quota_setdq(cli, qid, body->oa.o_valid, + body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + RETURN(rc); + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("Unexpected +ve rc %d\n", rc); + RETURN(-EPROTO); + } + LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); + + if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + RETURN(-EAGAIN); + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa)) + RETURN(-EAGAIN); + + rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count, + aa->aa_page_count, aa->aa_ppga); + GOTO(out, rc); + } + + /* The rest of this function executes only for OST_READs */ + + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + if (rc < 0) + GOTO(out, rc = -EAGAIN); + + if (rc > aa->aa_requested_nob) { + CERROR("Unexpected rc %d (%d requested)\n", rc, + aa->aa_requested_nob); + RETURN(-EPROTO); + } + + if (rc != req->rq_bulk->bd_nob_transferred) { + CERROR ("Unexpected rc %d (%d transferred)\n", + rc, req->rq_bulk->bd_nob_transferred); + return (-EPROTO); + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + u32 server_cksum = body->oa.o_cksum; + char *via = ""; + char *router = ""; + cksum_type_t cksum_type; + + cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? 
+ body->oa.o_flags : 0); + client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, + aa->aa_ppga, OST_READ, + cksum_type); + + if (peer->nid != req->rq_bulk->bd_sender) { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum != client_cksum) { + struct ost_body *clbody; + u32 page_count = aa->aa_page_count; + + clbody = req_capsule_client_get(&req->rq_pill, + &RMF_OST_BODY); + if (cli->cl_checksum_dump) + dump_all_bulk_pages(&clbody->oa, page_count, + aa->aa_ppga, server_cksum, + client_cksum); + + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent [%llu-%llu], client %x, " + "server %x, cksum_type %x\n", + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(peer->nid), + via, router, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_seq : 0ULL, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_oid : 0, + clbody->oa.o_valid & OBD_MD_FLFID ? + clbody->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[page_count-1]->off + + aa->aa_ppga[page_count-1]->count - 1, + client_cksum, server_cksum, + cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("Checksum %u requested from %s but not sent\n", + cksum_missed, libcfs_nid2str(peer->nid)); + } else { + rc = 0; + } +out: + if (rc >= 0) + lustre_get_wire_obdo(&req->rq_import->imp_connect_data, + aa->aa_oa, &body->oa); + + RETURN(rc); +} + +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + struct ptlrpc_request *new_req; + struct osc_brw_async_args *new_aa; + struct osc_async_page *oap; + ENTRY; + + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); + + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, aa->aa_page_count, + aa->aa_ppga, &new_req, 1); + if (rc) + RETURN(rc); + + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request != NULL) { + LASSERTF(request == oap->oap_request, + "request %p != oap_request %p\n", + request, oap->oap_request); + if (oap->oap_interrupted) { + ptlrpc_req_finished(new_req); + RETURN(-EINTR); + } + } + } + /* New request takes over pga and oaps from old request. + * Note that copying a list_head doesn't work, need to move it... 
*/ + aa->aa_resends++; + new_req->rq_interpret_reply = request->rq_interpret_reply; + new_req->rq_async_args = request->rq_async_args; + new_req->rq_commit_cb = request->rq_commit_cb; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout; + else + new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; + + new_aa = ptlrpc_req_async_args(new_req); + + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; + + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request) { + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = ptlrpc_request_addref(new_req); + } + } + + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. */ + ptlrpcd_add_req(new_req); + + DEBUG_REQ(D_INFO, new_req, "new request"); + RETURN(0); +} + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. + */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static void osc_release_ppga(struct brw_page **ppga, size_t count) +{ + LASSERT(ppga != NULL); + OBD_FREE(ppga, sizeof(*ppga) * count); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct client_obd *cli = aa->aa_cli; + ENTRY; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. 
*/ + if (osc_recoverable_error(rc)) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + "%llu:%llu, rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (rc == 0) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + struct cl_object *obj; + struct osc_async_page *last; + + last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); + obj = osc2cl(last->oap_obj); + + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + loff_t last_off = last->oap_count + last->oap_obj_off + + last->oap_page_off; + + /* Change file size if this is an out of quota or + * direct IO write and it extends the file size */ + if (loi->loi_lvb.lvb_size < last_off) { + attr->cat_size = last_off; + valid |= CAT_SIZE; + } + /* Extend KMS if it's not a lockless write */ + if (loi->loi_kms < last_off && + oap2osc_page(last)->ops_srvlock == 0) { + attr->cat_kms = last_off; + valid |= CAT_KMS; + } + } + + if (valid != 0) + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + OBDO_FREE(aa->aa_oa); + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) + osc_inc_unstable_pages(req); + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + + spin_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. */ + spin_lock(&req->rq_lock); + if (likely(req->rq_unstable)) { + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Build an RPC by the list of extent @ext_list. 
The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_object *obj = NULL; + struct cl_req_attr *crattr = NULL; + loff_t starting_offset = OBD_OBJECT_EOF; + loff_t ending_offset = 0; + int mpflag = 0; + int mem_tight = 0; + int page_count = 0; + bool soft_sync = false; + bool interrupted = false; + int i; + int grant = 0; + int rc; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + struct ost_body *body; + ENTRY; + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + grant += ext->oe_grants; + page_count += ext->oe_nr_pages; + if (obj == NULL) + obj = ext->oe_obj; + } + + soft_sync = osc_over_unstable_soft_limit(cli); + if (mem_tight) + mpflag = cfs_memory_pressure_get_and_set(); + + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (pga == NULL) + GOTO(out, rc = -ENOMEM); + + OBDO_ALLOC(oa); + if (oa == NULL) + GOTO(out, rc = -ENOMEM); + + i = 0; + list_for_each_entry(ext, ext_list, oe_link) { + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + if (soft_sync) + oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + i++; + + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset == OBD_OBJECT_EOF || + starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_SIZE); + if (oap->oap_interrupted) + interrupted = true; + } + } + + /* first page in the list */ + oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item); + + crattr = &osc_env_info(env)->oti_req_attr; + memset(crattr, 0, sizeof(*crattr)); + crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; + crattr->cra_flags = ~0ULL; + crattr->cra_page = oap2cl_page(oap); + crattr->cra_oa = oa; + cl_req_attr_set(env, osc2cl(obj), crattr); + + if (cmd == OBD_BRW_WRITE) + oa->o_grant_used = grant; + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + GOTO(out, rc); + } + + req->rq_commit_cb = brw_commit; + req->rq_interpret_reply = brw_interpret; + req->rq_memalloc = mem_tight != 0; + oap->oap_request = ptlrpc_request_addref(req); + if (interrupted && !req->rq_intr) + ptlrpc_mark_interrupted(req); + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. 
bug 10150 */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + crattr->cra_oa = &body->oa; + crattr->cra_flags = OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME; + cl_req_attr_set(env, osc2cl(obj), crattr); + lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + INIT_LIST_HEAD(&aa->aa_oaps); + list_splice_init(&rpc_list, &aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_exts); + list_splice_init(ext_list, &aa->aa_exts); + + spin_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + spin_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%uw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val); + + ptlrpcd_add_req(req); + rc = 0; + EXIT; + +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBDO_FREE(oa); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!list_empty(ext_list)) { + ext = list_entry(ext_list->next, struct osc_extent, + oe_link); + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + } + RETURN(rc); +} + +static int osc_set_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +static int osc_enqueue_fini(struct ptlrpc_request *req, + osc_enqueue_upcall_f upcall, void *cookie, + struct lustre_handle *lockh, enum ldlm_mode mode, + __u64 *flags, int agl, int errcode) +{ + bool intent = *flags & LDLM_FL_HAS_INTENT; + int rc; + ENTRY; + + /* The request was created before ldlm_cli_enqueue call. */ + if (intent && errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(rep != NULL); + + rep->lock_policy_res1 = + ptlrpc_status_ntoh(rep->lock_policy_res1); + if (rep->lock_policy_res1) + errcode = rep->lock_policy_res1; + if (!agl) + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. 
*/ + rc = (*upcall)(cookie, lockh, errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +static int osc_enqueue_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + struct ost_lvb *lvb = aa->oa_lvb; + __u32 lvb_len = sizeof(*lvb); + __u64 flags = 0; + + ENTRY; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + if (aa->oa_agl) { + LASSERT(aa->oa_lvb == NULL); + LASSERT(aa->oa_flags == NULL); + aa->oa_flags = &flags; + } + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, + aa->oa_mode, aa->oa_flags, lvb, lvb_len, + lockh, rc); + /* Complete osc stuff. */ + rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, + aa->oa_flags, aa->oa_agl, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is evicted from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, union ldlm_policy_data *policy, + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, int agl) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + enum ldlm_mode mode; + int rc; + ENTRY; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* + * kms is not valid when either object is completely fresh (so that no + * locks are cached), or object was evicted. In the latter case cached + * lock cannot be used, because it would prime inode state with + * potentially stale LVB. 
+ */ + if (!kms_valid) + goto no_match; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. + * + * There are problems with conversion deadlocks, so instead of + * converting a read lock to a write lock, we'll just enqueue a new + * one. + * + * At some point we should cancel the read lock instead of making them + * send us a blocking callback, but there are problems with canceling + * locks out from other users right now, too. */ + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + if (agl == 0) + match_flags |= LDLM_FL_LVB_READY; + if (intent != 0) + match_flags |= LDLM_FL_BLOCK_GRANTED; + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh, 0); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + if (agl) { + /* AGL enqueues DLM locks speculatively. Therefore if + * it already exists a DLM lock, it wll just inform the + * caller to cancel the AGL process for this stripe. */ + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(-ECANCELED); + } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. */ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } else { + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + +no_match: + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + if (intent) { + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE_LVB); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + ptlrpc_request_set_replen(req); + } + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_agl = !!agl; + if (!agl) { + aa->oa_flags = flags; + aa->oa_lvb = lvb; + } else { + /* AGL is essentially to enqueue an DLM lock + * in advance, so we don't care about the + * result of AGL enqueue. 
*/ + aa->oa_lvb = NULL; + aa->oa_flags = NULL; + } + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_enqueue_interpret; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); + } else if (intent) { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, agl, rc); + if (intent) + ptlrpc_req_finished(req); + + RETURN(rc); +} + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, void *data, + struct lustre_handle *lockh, int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + RETURN(-EIO); + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; + policy->l_extent.end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + rc = mode; + if (mode == LCK_PR) + rc |= LCK_PW; + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, rc, lockh, unref); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (data != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (!osc_set_lock_data(lock, data)) { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct obd_statfs *msfs; + ENTRY; + + if (rc == -EBADR) + /* The request has in fact never been sent + * due to issues at a higher level (LOV). + * Exit immediately since the caller is + * aware of the problem and takes care + * of the clean up */ + RETURN(rc); + + if ((rc == -ENOTCONN || rc == -EAGAIN) && + (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) + GOTO(out, rc = 0); + + if (rc != 0) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + GOTO(out, rc = -EPROTO); + } + + *aa->aa_oi->oi_osfs = *msfs; +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + ENTRY; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
*/ + req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + RETURN(0); +} + +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + ENTRY; + + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + + class_import_put(imp); + + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + GOTO(out, rc = -EPROTO); + } + + *osfs = *msfs; + + EXIT; + out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void __user *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + int err = 0; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("%s: cannot get module '%s'\n", obd->obd_name, + module_name(THIS_MODULE)); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_CLIENT_RECOVER: + err = ptlrpc_recover_import(obd->u.cli.cl_import, + data->ioc_inlbuf1, 0); + if (err > 0) + err = 0; + GOTO(out, err); + case IOC_OSC_SET_ACTIVE: + err = ptlrpc_set_import_active(obd->u.cli.cl_import, + data->ioc_offset); + GOTO(out, err); + case OBD_IOC_PING_TARGET: + err = ptlrpc_obd_ping(obd); + GOTO(out, err); + default: + CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", + cmd, current_comm()); + GOTO(out, err = -ENOTTY); + } +out: + module_put(THIS_MODULE); + return err; +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + 
struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct obd_device *obd = exp->exp_obd; + struct obd_import *imp = class_exp2cliimp(exp); + char *tmp; + int rc; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); + + if (KEY_IS(KEY_CHECKSUM)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; + RETURN(0); + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + RETURN(0); + } + + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_SET)) { + struct client_obd *cli = &obd->u.cli; + + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)val; + cl_cache_incref(cli->cl_cache); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1; + long target = *(long *)val; + + nr = osc_lru_shrink(env, cli, min(nr, target), true); + *(long *)val -= nr; + RETURN(0); + } + + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) + RETURN(-EINVAL); + + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. + Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? + &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? 
+ &RMF_OST_BODY : + &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + if (KEY_IS(KEY_GRANT_SHRINK)) { + struct osc_grant_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBDO_ALLOC(oa); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } + + ptlrpc_request_set_replen(req); + if (!KEY_IS(KEY_GRANT_SHRINK)) { + LASSERT(set != NULL); + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + ptlrpcd_add_req(req); + } + + RETURN(0); +} + +static int osc_reconnect(const struct lu_env *env, + struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + long grant; + + spin_lock(&cli->cl_loi_list_lock); + grant = cli->cl_avail_grant + cli->cl_reserved_grant; + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) + grant += cli->cl_dirty_grant; + else + grant += cli->cl_dirty_pages << PAGE_SHIFT; + data->ocd_grant = grant ? : 2 * cli_brw_size(obd); + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d" + " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } + + RETURN(0); +} + +static int osc_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + int rc; + + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * init_grant_shrink + * add this client to shrink list + * cleanup_osc + * Bang! pinger trigger the shrink. + * So the osc should be disconnected from the shrink list, after we + * are sure the import has been destroyed. BUG18662 + */ + if (obd->u.cli.cl_import == NULL) + osc_del_shrink_grant(&obd->u.cli); + return rc; +} + +static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, + struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) +{ + struct lu_env *env = arg; + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_lock *lock; + struct osc_object *osc = NULL; + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (lock->l_ast_data != NULL && osc == NULL) { + osc = lock->l_ast_data; + cl_object_get(osc2cl(osc)); + } + + /* clear LDLM_FL_CLEANED flag to make sure it will be canceled + * by the 2nd round of ldlm_namespace_clean() call in + * osc_import_event(). 
*/ + ldlm_clear_cleaned(lock); + } + unlock_res(res); + + if (osc != NULL) { + osc_object_invalidate(env, osc); + cl_object_put(env, osc2cl(osc)); + } + + RETURN(0); +} + +static int osc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli; + int rc = 0; + + ENTRY; + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { + cli = &obd->u.cli; + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + } + case IMP_EVENT_INACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + osc_io_unplug(env, &obd->u.cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else + rc = PTR_ERR(env); + break; + } + case IMP_EVENT_ACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + break; + } + case IMP_EVENT_DEACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE); + break; + } + case IMP_EVENT_ACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE); + break; + } + default: + CERROR("Unknown import event %d\n", event); + LBUG(); + } + RETURN(rc); +} + +/** + * Determine whether the lock can be canceled before replaying the lock + * during recovery, see bug16774 for detailed information. + * + * \retval zero the lock can't be canceled + * \retval other ok to cancel + */ +static int osc_cancel_weight(struct ldlm_lock *lock) +{ + /* + * Cancel all unused and granted extent lock. 
+ */ + if (lock->l_resource->lr_type == LDLM_EXTENT && + lock->l_granted_mode == lock->l_req_mode && + osc_ldlm_weigh_ast(lock) == 0) + RETURN(1); + + RETURN(0); +} + +static int brw_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); + + osc_io_unplug(env, cli, NULL); + RETURN(0); +} + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_type *type; + void *handler; + int rc; + int adding; + int added; + int req_count; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(out_ptlrpcd, rc); + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_client_setup, rc = PTR_ERR(handler)); + cli->cl_writeback_work = handler; + + handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); + cli->cl_lru_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + GOTO(out_ptlrpcd_work, rc); + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_osc_obd_vars; +#endif + /* If this is true then both client (osc) and server (osp) are on the + * same node. The osp layer if loaded first will register the osc proc + * directory. In that case this obd_device will be attached its proc + * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. */ + type = class_search_type(LUSTRE_OSP_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } else { + rc = lprocfs_obd_setup(obd); + } + + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. */ + if (rc == 0) { + lproc_osc_attach_seqstat(obd); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + } + + /* + * We try to control the total number of requests with a upper limit + * osc_reqpool_maxreqcount. There might be some race which will cause + * over-limit allocation, but it is fine. 
+ */ + req_count = atomic_read(&osc_pool_req_count); + if (req_count < osc_reqpool_maxreqcount) { + adding = cli->cl_max_rpcs_in_flight + 2; + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } + + INIT_LIST_HEAD(&cli->cl_grant_shrink_list); + ns_register_cancel(obd->obd_namespace, osc_cancel_weight); + + spin_lock(&osc_shrink_lock); + list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); + spin_unlock(&osc_shrink_lock); + + RETURN(0); + +out_ptlrpcd_work: + if (cli->cl_writeback_work != NULL) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + if (cli->cl_lru_work != NULL) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } +out_client_setup: + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); +} + +static int osc_precleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + + if (cli->cl_lru_work) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + RETURN(0); +} + +int osc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + ENTRY; + + spin_lock(&osc_shrink_lock); + list_del(&cli->cl_shrink_list); + spin_unlock(&osc_shrink_lock); + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + cl_cache_decref(cli->cl_cache); + cli->cl_cache = NULL; + } + + /* free memory of osc quota cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + RETURN(rc); +} + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int rc = class_process_proc_param(PARAM_OSC, obd->obd_vars, lcfg, obd); + return rc > 0 ? 
0: rc; +} + +static int osc_process_config(struct obd_device *obd, size_t len, void *buf) +{ + return osc_process_config_base(obd, buf); +} + +static struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_setattr = osc_setattr, + .o_iocontrol = osc_iocontrol, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_process_config = osc_process_config, + .o_quotactl = osc_quotactl, +}; + +static struct shrinker *osc_cache_shrinker; +struct list_head osc_shrink_list = LIST_HEAD_INIT(osc_shrink_list); +DEFINE_SPINLOCK(osc_shrink_lock); + +#ifndef HAVE_SHRINKER_COUNT +static int osc_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker *shrinker = NULL; +#endif + + (void)osc_cache_shrink_scan(shrinker, &scv); + + return osc_cache_shrink_count(shrinker, &scv); +} +#endif + +static int __init osc_init(void) +{ + bool enable_proc = true; + struct obd_type *type; + unsigned int reqpool_size; + unsigned int reqsize; + int rc; + DEF_SHRINKER_VAR(osc_shvar, osc_cache_shrink, + osc_cache_shrink_count, osc_cache_shrink_scan); + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + if (rc) + RETURN(rc); + + type = class_search_type(LUSTRE_OSP_NAME); + if (type != NULL && type->typ_procsym != NULL) + enable_proc = false; + + rc = class_register_type(&osc_obd_ops, NULL, enable_proc, NULL, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) + GOTO(out_kmem, rc); + + osc_cache_shrinker = set_shrinker(DEFAULT_SEEKS, &osc_shvar); + + /* This is obviously too much memory, only prevent overflow here */ + if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) + GOTO(out_type, rc = -EINVAL); + + reqpool_size = osc_reqpool_mem_max << 20; + + reqsize = 1; + while (reqsize < OST_IO_MAXREQSIZE) + reqsize = reqsize << 1; + + /* + * We don't enlarge the request count in OSC pool according to + * cl_max_rpcs_in_flight. The allocation from the pool will only be + * tried after normal allocation failed. So a small OSC pool won't + * cause much performance degression in most of cases. + */ + osc_reqpool_maxreqcount = reqpool_size / reqsize; + + atomic_set(&osc_pool_req_count, 0); + osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + if (osc_rq_pool != NULL) + GOTO(out, rc); + rc = -ENOMEM; +out_type: + class_unregister_type(LUSTRE_OSC_NAME); +out_kmem: + lu_kmem_fini(osc_caches); +out: + RETURN(rc); +} + +static void __exit osc_exit(void) +{ + remove_shrinker(osc_cache_shrinker); + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); + ptlrpc_free_rq_pool(osc_rq_pool); +} + +MODULE_AUTHOR("OpenSFS, Inc. 
"); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(osc_init); +module_exit(osc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c new file mode 100644 index 0000000000000..d127e5e63bfdb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -0,0 +1,3478 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** Implementation of client-side PortalRPC interfaces */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = { + .add_kiov_frag = ptlrpc_prep_bulk_page_pin, + .release_frags = ptlrpc_release_bulk_page_pin, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops); + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = { + .add_kiov_frag = ptlrpc_prep_bulk_page_nopin, + .release_frags = ptlrpc_release_bulk_noop, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops); + +const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops = { + .add_iov_frag = ptlrpc_prep_bulk_frag, +}; +EXPORT_SYMBOL(ptlrpc_bulk_kvec_ops); + +static int ptlrpc_send_new_req(struct ptlrpc_request *req); +static int ptlrpcd_check_work(struct ptlrpc_request *req); +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async); + +/** + * Initialize passed in client structure \a cl. + */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *cl) +{ + cl->cli_request_portal = req_portal; + cl->cli_reply_portal = rep_portal; + cl->cli_name = name; +} +EXPORT_SYMBOL(ptlrpc_init_client); + +/** + * Return PortalRPC connection for remore uud \a uuid + */ +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid, + lnet_nid_t nid4refnet) +{ + struct ptlrpc_connection *c; + lnet_nid_t self; + struct lnet_process_id peer; + int err; + + /* ptlrpc_uuid_to_peer() initializes its 2nd parameter + * before accessing its values. 
*/ + /* coverity[uninit_use_in_call] */ + peer.nid = nid4refnet; + err = ptlrpc_uuid_to_peer(uuid, &peer, &self); + if (err != 0) { + CNETERR("cannot find peer %s!\n", uuid->uuid); + return NULL; + } + + c = ptlrpc_connection_get(peer, self, uuid); + if (c) { + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); + } + + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); + + return c; +} + +/** + * Allocate and initialize new bulk descriptor on the sender. + * Returns pointer to the descriptor or NULL on error. + */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, + enum ptlrpc_bulk_op_type type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops *ops) +{ + struct ptlrpc_bulk_desc *desc; + int i; + + /* ensure that only one of KIOV or IOVEC is set but not both */ + LASSERT((ptlrpc_is_bulk_desc_kiov(type) && + ops->add_kiov_frag != NULL) || + (ptlrpc_is_bulk_desc_kvec(type) && + ops->add_iov_frag != NULL)); + + OBD_ALLOC_PTR(desc); + if (desc == NULL) + return NULL; + if (type & PTLRPC_BULK_BUF_KIOV) { + OBD_ALLOC_LARGE(GET_KIOV(desc), + nfrags * sizeof(*GET_KIOV(desc))); + if (GET_KIOV(desc) == NULL) + goto out; + } else { + OBD_ALLOC_LARGE(GET_KVEC(desc), + nfrags * sizeof(*GET_KVEC(desc))); + if (GET_KVEC(desc) == NULL) + goto out; + } + + spin_lock_init(&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + desc->bd_max_iov = nfrags; + desc->bd_iov_count = 0; + desc->bd_portal = portal; + desc->bd_type = type; + desc->bd_md_count = 0; + desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *) ops; + LASSERT(max_brw > 0); + desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this + * node. Negotiated ocd_brw_size will always be <= this number. */ + for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + LNetInvalidateMDHandle(&desc->bd_mds[i]); + + return desc; +out: + OBD_FREE_PTR(desc); + return NULL; +} + +/** + * Prepare bulk descriptor for specified outgoing request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on client-side. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. 
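+ * A rough caller-side sketch (npages/pages/i are illustrative local names,
+ * not part of this API): after allocation the fragments are attached through
+ * the chosen frag ops, e.g.
+ *
+ *	desc = ptlrpc_prep_bulk_imp(req, npages, 1, type, portal,
+ *				    &ptlrpc_bulk_kiov_pin_ops);
+ *	for (i = 0; i < npages; i++)
+ *		desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, PAGE_SIZE);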
+ */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_passive(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (desc == NULL) + RETURN(NULL); + + desc->bd_import_generation = req->rq_import_generation; + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); + +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, + int pin) +{ + lnet_kiov_t *kiov; + + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(page != NULL); + LASSERT(pageoffset >= 0); + LASSERT(len > 0); + LASSERT(pageoffset + len <= PAGE_SIZE); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + kiov = &BD_GET_KIOV(desc, desc->bd_iov_count); + + desc->bd_nob += len; + + if (pin) + get_page(page); + + kiov->kiov_page = page; + kiov->kiov_offset = pageoffset; + kiov->kiov_len = len; + + desc->bd_iov_count++; +} +EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); + +int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, + void *frag, int len) +{ + struct kvec *iovec; + ENTRY; + + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(frag != NULL); + LASSERT(len > 0); + LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type)); + + iovec = &BD_GET_KVEC(desc, desc->bd_iov_count); + + desc->bd_nob += len; + + iovec->iov_base = frag; + iovec->iov_len = len; + + desc->bd_iov_count++; + + RETURN(desc->bd_nob); +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_frag); + +void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) +{ + ENTRY; + + LASSERT(desc != NULL); + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_md_count == 0); /* network hands off */ + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + LASSERT(desc->bd_frag_ops != NULL); + + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + sptlrpc_enc_pool_put_pages(desc); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); + + if (desc->bd_frag_ops->release_frags != NULL) + desc->bd_frag_ops->release_frags(desc); + + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + OBD_FREE_LARGE(GET_KIOV(desc), + desc->bd_max_iov * sizeof(*GET_KIOV(desc))); + else + OBD_FREE_LARGE(GET_KVEC(desc), + desc->bd_max_iov * sizeof(*GET_KVEC(desc))); + OBD_FREE_PTR(desc); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. + */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + __u32 serv_est; + int idx; + struct imp_at *at; + + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? 
+ obd_timeout / 2 : obd_timeout; + } else { + at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + req->rq_timeout = at_est2timeout(serv_est); + } + /* We could get even fancier here, using history to predict increased + loading... */ + + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + unsigned int serv_est) +{ + int idx; + unsigned int oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name,req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time) +{ + unsigned int nl, oldnl; + struct imp_at *at; + time64_t now = ktime_get_real_seconds(); + + LASSERT(req->rq_import); + + if (service_time > now - req->rq_sent + 3) { + /* bz16408, however, this can also happen if early reply + * is lost and client RPC is expired and resent, early reply + * or reply of original RPC can still be fit in reply buffer + * of resent RPC, now client is measuring time from the + * resent time, but server sent back service time of original + * RPC. + */ + CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? + D_ADAPTTO : D_WARNING, + "Reported service time %u > total measured time %lld\n", + service_time, now - req->rq_sent); + return; + } + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - + service_time, 0) + 1; /* st rounding */ + at = &req->rq_import->imp_at; + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return(-EPROTO); + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return(-EPROTO); + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. 
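+ * On success the deadline is recomputed from the timeout carried by the
+ * early reply, roughly
+ *
+ *	rq_deadline = rq_sent + rq_timeout + ptlrpc_at_get_net_latency(req)
+ *
+ * where rq_timeout has just been refreshed from the early reply message.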
+ * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +__must_hold(&req->rq_lock) +{ + struct ptlrpc_request *early_req; + time64_t olddl; + int rc; + + ENTRY; + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + RETURN(rc); + } + + rc = unpack_reply(early_req); + if (rc != 0) { + sptlrpc_cli_finish_early_reply(early_req); + spin_lock(&req->rq_lock); + RETURN(rc); + } + + /* Use new timeout value just to adjust the local value for this + * request, don't include it into at_history. It is unclear yet why + * service time increased and should it be counted or skipped, e.g. + * that can be recovery case or some error or server, the real reply + * will add all new data if it is worth to add. */ + req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg); + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + + /* Network latency can be adjusted, it is pure network delays */ + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(early_req->rq_repmsg)); + + sptlrpc_cli_finish_early_reply(early_req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when the request + * arrived, so the client should give it at least that long. + * since we don't know the arrival time we'll use the original + * sent time */ + req->rq_deadline = req->rq_sent + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in %llds (%llds)", + req->rq_early_count, + req->rq_deadline - ktime_get_real_seconds(), + req->rq_deadline - olddl); + + RETURN(rc); +} + +static struct kmem_cache *request_cache; + +int ptlrpc_request_cache_init(void) +{ + request_cache = kmem_cache_create("ptlrpc_cache", + sizeof(struct ptlrpc_request), + 0, SLAB_HWCACHE_ALIGN, NULL); + return request_cache == NULL ? -ENOMEM : 0; +} + +void ptlrpc_request_cache_fini(void) +{ + kmem_cache_destroy(request_cache); +} + +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) +{ + struct ptlrpc_request *req; + + OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); + return req; +} + +void ptlrpc_request_cache_free(struct ptlrpc_request *req) +{ + OBD_SLAB_FREE_PTR(req, request_cache); +} + +/** + * Wind down request pool \a pool. 
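+ * (The OSC module, for instance, tears down its I/O request pool this way:
+ * osc_exit() calls ptlrpc_free_rq_pool(osc_rq_pool).)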
+ * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct list_head *l, *tmp; + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + list_for_each_safe(l, tmp, &pool->prp_req_list) { + req = list_entry(l, struct ptlrpc_request, rq_list); + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + ptlrpc_request_cache_free(req); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool " + "from %d to %d bytes\n", pool->prp_rq_size, size); + + spin_lock(&pool->prp_lock); + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + spin_unlock(&pool->prp_lock); + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + return i; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + ptlrpc_request_cache_free(req); + return i; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + } + spin_unlock(&pool->prp_lock); + return num_rq; +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. + */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + int (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool)); + if (!pool) + return NULL; + + /* Request next power of two for the allocation, because internally + kernel would do exactly this */ + + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. 
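+ * Note that callers such as __ptlrpc_request_alloc() only fall back to
+ * the pool after a normal GFP_NOFS allocation has already failed.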
*/ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. + */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +void ptlrpc_add_unreplied(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct list_head *tmp; + struct ptlrpc_request *iter; + + assert_spin_locked(&imp->imp_lock); + LASSERT(list_empty(&req->rq_unreplied_list)); + + /* unreplied list is sorted by xid in ascending order */ + list_for_each_prev(tmp, &imp->imp_unreplied_list) { + iter = list_entry(tmp, struct ptlrpc_request, + rq_unreplied_list); + + LASSERT(req->rq_xid != iter->rq_xid); + if (req->rq_xid < iter->rq_xid) + continue; + list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list); + return; + } + list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list); +} + +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req) +{ + req->rq_xid = ptlrpc_next_xid(); + ptlrpc_add_unreplied(req); +} + +static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_import->imp_lock); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); +} + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + struct obd_import *imp; + __u32 *lengths; + int rc; + + ENTRY; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + imp = request->rq_import; + lengths = request->rq_pill.rc_area[RCL_CLIENT]; + + if (ctx != NULL) { + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + } else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + GOTO(out_free, rc); + } + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) + GOTO(out_ctx, rc); + + lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_bulk_deadline = 0; + request->rq_req_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + ptlrpc_assign_next_xid(request); + + /* Let's setup deadline for req/reply/bulk unlink for opcode. 
*/ + if (cfs_fail_val == opcode) { + time64_t *fail_t = NULL, *fail2_t = NULL; + + if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) + fail_t = &request->rq_bulk_deadline; + else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) + fail_t = &request->rq_reply_deadline; + else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) + fail_t = &request->rq_req_deadline; + else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) { + fail_t = &request->rq_reply_deadline; + fail2_t = &request->rq_bulk_deadline; + } + + if (fail_t) { + *fail_t = ktime_get_real_seconds() + LONG_UNLINK; + + if (fail2_t) + *fail2_t = ktime_get_real_seconds() + + LONG_UNLINK; + + /* + * The RPC is infected, let the test to change the + * fail_loc + */ + msleep(4 * MSEC_PER_SEC); + } + } + + RETURN(0); + +out_ctx: + LASSERT(!request->rq_pool); + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + class_import_put(imp); + + return rc; + +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + int rc; + rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); + if (rc) + return rc; + + /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of + * ptlrpc_body sent from server equal to local ptlrpc_body size, so we + * have to send old ptlrpc_body to keep interoprability with these + * clients. + * + * Only three kinds of server->client RPCs so far: + * - LDLM_BL_CALLBACK + * - LDLM_CP_CALLBACK + * - LDLM_GL_CALLBACK + * + * XXX This should be removed whenever we drop the interoprability with + * the these old clients. + */ + if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || + opcode == LDLM_GL_CALLBACK) + req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, + sizeof(struct ptlrpc_body_v2), RCL_CLIENT); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. + */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + request = ptlrpc_request_cache_alloc(GFP_NOFS); + + if (!request && pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (request) { + ptlrpc_cli_req_init(request); + + LASSERTF((unsigned long)imp > 0x1000, "%p", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +/** + * Helper function for creating a request. + * Calls __ptlrpc_request_alloc to allocate new request sturcture and inits + * buffer structures according to capsule template \a format. + * Returns allocated request structure pointer or NULL on error. 
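+ * The usual caller-side pattern, which ptlrpc_request_alloc_pack() below
+ * wraps (format, version and opcode are caller-specific):
+ *
+ *	req = ptlrpc_request_alloc(imp, format);
+ *	if (req == NULL)
+ *		return -ENOMEM;
+ *	rc = ptlrpc_request_pack(req, version, opcode);
+ *	if (rc) {
+ *		ptlrpc_request_free(req);
+ *		return rc;
+ *	}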
+ */ +static struct ptlrpc_request * +ptlrpc_request_alloc_internal(struct obd_import *imp, + struct ptlrpc_request_pool * pool, + const struct req_format *format) +{ + struct ptlrpc_request *request; + + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + req_capsule_init(&request->rq_pill, request, RCL_CLIENT); + req_capsule_set(&request->rq_pill, format); + return request; +} + +/** + * Allocate new request structure for import \a imp and initialize its + * buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, NULL, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc); + +/** + * Allocate new request structure for import \a imp from pool \a pool and + * initialize its buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool * pool, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, pool, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pool); + +/** + * For requests not from pool, free memory of the request structure. + * For requests obtained from a pool earlier, return request back to pool. + */ +void ptlrpc_request_free(struct ptlrpc_request *request) +{ + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); +} +EXPORT_SYMBOL(ptlrpc_request_free); + +/** + * Allocate new request for operatione \a opcode and immediatelly pack it for + * network transfer. + * Only used for simple requests like OBD_PING where the only important + * part of the request is operation itself. + * Returns allocated request or NULL on error. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode) +{ + struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); + int rc; + + if (req) { + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + } + } + return req; +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pack); + +/** + * Allocate and initialize new request set structure on the current CPT. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + int cpt; + + ENTRY; + cpt = cfs_cpt_current(cfs_cpt_table, 0); + OBD_CPT_ALLOC(set, cfs_cpt_table, cpt, sizeof *set); + if (!set) + RETURN(NULL); + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + INIT_LIST_HEAD(&set->set_cblist); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + RETURN(set); +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. 
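+ * A minimal producer sketch (my_build_req() is a hypothetical helper that
+ * returns the next request to send, or NULL when nothing is left):
+ *
+ *	static int my_producer(struct ptlrpc_request_set *set, void *arg)
+ *	{
+ *		struct ptlrpc_request *req = my_build_req(arg);
+ *
+ *		if (req == NULL)
+ *			return -ENOENT;
+ *		ptlrpc_set_add_req(set, req);
+ *		return 0;
+ *	}
+ *
+ * ptlrpc_set_producer() keeps invoking the callback until @max_inflight
+ * requests are outstanding, and treats -ENOENT as "no more requests to
+ * produce".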
+ */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + RETURN(NULL); + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + RETURN(set); +} + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. + */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct list_head *next; + int expected_phase; + int n = 0; + ENTRY; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? + RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished (req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a callback function \a fn to the set. + * This function would be called when all requests on this set are completed. + * The function will be passed \a data argument. + */ +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data) +{ + struct ptlrpc_set_cbdata *cbdata; + + OBD_ALLOC_PTR(cbdata); + if (cbdata == NULL) + RETURN(-ENOMEM); + + cbdata->psc_interpret = fn; + cbdata->psc_data = data; + list_add_tail(&cbdata->psc_item, &set->set_cblist); + + RETURN(0); +} + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. + */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_set_chain)); + + if (req->rq_allow_intr) + set->set_allow_intr = 1; + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = cfs_time_current(); + + if (req->rq_reqmsg != NULL) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer != NULL) + /* If the request set has a producer callback, the RPC must be + * sent straight away */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request with dedicated server thread + * and wake the thread to make any necessary processing. + * Currently only used for ptlrpcd. 
+ */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = cfs_time_current(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request can not be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. + */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + ENTRY; + + LASSERT (status != NULL); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + /* pings may safely race with umount */ + DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? + D_HA : D_ERROR, req, "IMP_CLOSED "); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR after initial testing*/ + DEBUG_REQ(D_HA, req, "send limit expired "); + *status = -ETIMEDOUT; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + /* allow CONNECT even if import is invalid */ ; + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* bz 12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (req->rq_no_delay) { + *status = -EWOULDBLOCK; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery.\n"); + } else { + delay = 1; + } + } + + RETURN(delay); +} + +/** + * Decide if the error message should be printed to the console or not. + * Makes its decision based on request type, status, and failure frequency. 
+ * + * \param[in] req request that failed and may need a console message + * + * \retval false if no message should be printed + * \retval true if console message should be printed + */ +static bool ptlrpc_console_allow(struct ptlrpc_request *req) +{ + __u32 opc; + + LASSERT(req->rq_reqmsg != NULL); + opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Suppress particular reconnect errors which are to be expected. */ + if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + int err; + + /* Suppress timed out reconnect requests */ + if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || + req->rq_timedout) + return false; + + /* Suppress most unavailable/again reconnect requests, but + * print occasionally so it is clear client is trying to + * connect to a server where no target is running. */ + err = lustre_msg_get_status(req->rq_repmsg); + if ((err == -ENODEV || err == -EAGAIN) && + req->rq_import->imp_conn_cnt % 30 != 20) + return false; + } + + return true; +} + +/** + * Check request processing status. + * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + ENTRY; + + err = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + lnet_nid_t nid = imp->imp_connection->c_peer.nid; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* -EAGAIN is normal when using POSIX flocks */ + if (ptlrpc_console_allow(req) && + !(opc == LDLM_ENQUEUE && err == -EAGAIN)) + LCONSOLE_ERROR_MSG(0x11, "%s: operation %s to node %s " + "failed: rc = %d\n", + imp->imp_obd->obd_name, + ll_opcode2str(opc), + libcfs_nid2str(nid), err); + RETURN(err < 0 ? err : -EINVAL); + } + + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + + RETURN(err); +} + +/** + * save pre-versions of objects into request for replay. + * Versions are obtained from server reply. + * used for VBR. + */ +static void ptlrpc_save_versions(struct ptlrpc_request *req) +{ + struct lustre_msg *repmsg = req->rq_repmsg; + struct lustre_msg *reqmsg = req->rq_reqmsg; + __u64 *versions = lustre_msg_get_versions(repmsg); + ENTRY; + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + return; + + LASSERT(versions); + lustre_msg_set_versions(reqmsg, versions); + CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", + versions[0], versions[1]); + + EXIT; +} + +__u64 ptlrpc_known_replied_xid(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + assert_spin_locked(&imp->imp_lock); + if (list_empty(&imp->imp_unreplied_list)) + return 0; + + req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request, + rq_unreplied_list); + LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid); + + if (imp->imp_known_replied_xid < req->rq_xid - 1) + imp->imp_known_replied_xid = req->rq_xid - 1; + + return req->rq_xid - 1; +} + +/** + * Callback function called when client receives RPC reply for \a req. + * Returns 0 on success or error code. + * The return alue would be assigned to req->rq_status by the caller + * as request processing status. + * This function also decides if the request needs to be saved for later replay. 
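+ * (On a replayable import a reply is retained for replay when its transno is
+ * above the server's last committed transno, or when rq_replay is set; see
+ * the imp_replayable handling below.)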
+ */ +static int after_reply(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct obd_device *obd = req->rq_import->imp_obd; + ktime_t work_start; + u64 committed; + s64 timediff; + int rc; + + ENTRY; + LASSERT(obd != NULL); + /* repbuf must be unlinked */ + LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked); + + if (req->rq_reply_truncated) { + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_ERROR, req, "reply buffer overflow," + " expected: %d, actual size: %d", + req->rq_nob_received, req->rq_repbuf_len); + RETURN(-EOVERFLOW); + } + + sptlrpc_cli_free_repbuf(req); + /* Pass the required reply buffer size (include + * space for early reply). + * NB: no need to roundup because alloc_repbuf + * will roundup it */ + req->rq_replen = req->rq_nob_received; + req->rq_nob_received = 0; + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + + work_start = ktime_get_real(); + timediff = ktime_us_delta(work_start, req->rq_sent_ns); + + /* + * NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. + */ + rc = sptlrpc_cli_unwrap_reply(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); + RETURN(rc); + } + + /* + * Security layer unwrap might ask resend this request. + */ + if (req->rq_resend) + RETURN(0); + + rc = unpack_reply(req); + if (rc) + RETURN(rc); + + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time64_t now = ktime_get_real_seconds(); + + DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + req->rq_nr_resend++; + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* delay resend to give a chance to the server to get ready. + * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + /* Resend for EINPROGRESS will use a new XID */ + spin_lock(&imp->imp_lock); + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + RETURN(0); + } + + if (obd->obd_svc_stats != NULL) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + RETURN(-EPROTO); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + imp->imp_connect_error = rc; + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. 
+ */ + if (ptlrpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(rc); + } + ptlrpc_request_handle_notconn(req); + RETURN(rc); + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + spin_lock(&imp->imp_lock); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb != NULL && + list_empty(&req->rq_replay_list)) { + /* NB: don't call rq_commit_cb if it's already on + * rq_replay_list, ptlrpc_free_committed() will call + * it later, see LU-3618 for details */ + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + spin_lock(&imp->imp_lock); + } + + /* + * Replay-enabled imports return commit-status information. + */ + committed = lustre_msg_get_last_committed(req->rq_repmsg); + if (likely(committed > imp->imp_peer_committed_transno)) + imp->imp_peer_committed_transno = committed; + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + RETURN(rc); +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. 
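+ * (Normally the request moves from RQ_PHASE_NEW to RQ_PHASE_RPC and is
+ * queued on imp_sending_list, or on imp_delayed_list while
+ * ptlrpc_import_delay_req() holds it back.)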
+ */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + __u64 min_xid = 0; + int rc; + ENTRY; + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + + /* do not try to go further if there is not enough memory in enc_pool */ + if (req->rq_sent && req->rq_bulk != NULL) + if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() && + pool_is_at_full_capacity()) + RETURN(-ENOMEM); + + if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + RETURN (0); + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + LASSERT(req->rq_xid != 0); + LASSERT(!list_empty(&req->rq_unreplied_list)); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " + "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + RETURN(rc); + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + + /* find the known replied XID from the unreplied list, CONNECT + * and DISCONNECT requests are skipped to make the sanity check + * on server side happy. see process_req_last_xid(). + * + * For CONNECT: Because replay requests have lower XID, it'll + * break the sanity check if CONNECT bump the exp_last_xid on + * server. + * + * For DISCONNECT: Since client will abort inflight RPC before + * sending DISCONNECT, DISCONNECT may carry an XID which higher + * than the inflight RPC. 
+ */ + if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req)) + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(req->rq_reqmsg, min_xid); + + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + rc = sptlrpc_req_refresh_ctx(req, -1); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + RETURN(1); + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + RETURN(0); + } + } + + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc" + " %s:%s:%d:%llu:%s:%d\n", current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&req->rq_import->imp_inflight); + } + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + RETURN(rc); + } + if (rc) { + DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + RETURN(0); +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, rc; + ENTRY; + + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + RETURN(0); + } + } + + RETURN((atomic_read(&set->set_remaining) - remaining)); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + * + * NOTE: This function contains a potential schedule point (cond_resched()). + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *next; + struct list_head comp_reqs; + int force_timer_recalc = 0; + ENTRY; + + if (atomic_read(&set->set_remaining) == 0) + RETURN(1); + + INIT_LIST_HEAD(&comp_reqs); + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int async = 1; + int rc = 0; + + if (req->rq_phase == RQ_PHASE_COMPLETE) { + list_move_tail(&req->rq_set_chain, &comp_reqs); + continue; + } + + /* This schedule point is mainly for the ptlrpcd caller of this + * function. Most ptlrpc sets are not long-lived and unbounded + * in length, but at the least the set used by the ptlrpcd is. + * Since the processing time is unbounded, we need to insert an + * explicit schedule point to make the thread well-behaved. + */ + cond_resched(); + + /* If the caller requires to allow to be interpreted by force + * and it has really been interpreted, then move the request + * to RQ_PHASE_INTERPRET phase in spite of what the current + * phase is. 
*/ + if (unlikely(req->rq_allow_intr && req->rq_intr)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + /* Since it is interpreted and we have to wait for + * the reply to be unlinked, then use sync mode. */ + async = 0; + + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_NEW && ptlrpc_send_new_req(req)) + force_timer_recalc = 1; + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > ktime_get_real_seconds()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREG_RPC || + req->rq_phase == RQ_PHASE_UNREG_BULK) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + if (req->rq_req_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) + req->rq_req_deadline = 0; + if (req->rq_reply_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) + req->rq_reply_deadline = 0; + if (req->rq_bulk_deadline && + !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) + req->rq_bulk_deadline = 0; + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (req->rq_phase == RQ_PHASE_UNREG_RPC && + ptlrpc_client_recv_or_unlink(req)) + continue; + if (req->rq_phase == RQ_PHASE_UNREG_BULK && + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_INTERPRET) + GOTO(interpret, req->rq_status); + + /* + * Note that this also will start async reply unlink. + */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. */ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } else { + continue; + } + } + + if (req->rq_err) { + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. 
+ */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); + continue; + } + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, &status)){ + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp-> + imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* This is re-sending anyways, + * let's mark req as resend. */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + + if (req->rq_bulk != NULL && + !ptlrpc_unregister_bulk(req, 1)) + continue; + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. + */ + status = sptlrpc_req_refresh_ctx(req, -1); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + rc = ptl_send_rpc(req, 0); + if (rc == -ENOMEM) { + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) + list_del_init(&req->rq_list); + spin_unlock(&imp->imp_lock); + ptlrpc_rqphase_move(req, RQ_PHASE_NEW); + continue; + } + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + continue; + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* unlink from net because we are going to + * swab in-place of reply buffer */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) + continue; + + /* If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. 
+ */ + if (req->rq_bulk == NULL || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* This moves to "unregistering" phase we need to wait for + * reply unlink. */ + if (!unregistered && !ptlrpc_unregister_reply(req, async)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, async)) + continue; + + /* When calling interpret receiving already should be + * finished. */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + if (ptlrpcd_check_work(req)) { + atomic_dec(&set->set_remaining); + continue; + } + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + if (req->rq_reqmsg != NULL) + CDEBUG(D_RPCTRACE, + "Completed RPC pname:cluuid:pid:xid:nid:" + "opc %s:%s:%d:%llu:%s:%d\n", current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), + req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + spin_lock(&imp->imp_lock); + /* Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. */ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + list_del_init(&req->rq_unreplied_list); + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up_all(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* free the request that has just been completed + * in order not to pollute set->set_requests */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } else { + list_move_tail(&req->rq_set_chain, &comp_reqs); + } + } + + /* move completed request at the head of list so it's easier for + * caller to find them */ + list_splice(&comp_reqs, &set->set_requests); + + /* If we hit an error, we want to recover promptly. */ + RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc); +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. 
+ */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + int rc = 0; + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + req->rq_real_sent < req->rq_sent || + req->rq_real_sent >= req->rq_deadline) ? + "timed out for sent delay" : "timed out for slow reply"), + (s64)req->rq_sent, (s64)req->rq_real_sent); + + if (imp != NULL && obd_debug_peer_on_timeout) + LNetDebugPeer(imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (imp == NULL) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + RETURN(1); + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. */ + if (imp->imp_dlm_fake) + RETURN(1); + + /* If this request is for recovery or other primordial tasks, + * then error it out here. */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(1); + } + + /* if a request can't be resent we can't wait for an answer after + the timeout */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + RETURN(rc); +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * Callback used when waiting on sets with l_wait_event. + * Always returns 1. + */ +int ptlrpc_expired_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + time64_t now = ktime_get_real_seconds(); + + ENTRY; + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. */ + ptlrpc_expire_one_request(req, 1); + } + + /* + * When waiting for a whole set, we always break out of the + * sleep so we can recalculate the timeout, or enable interrupts + * if everyone's timed out. + */ + RETURN(1); +} + +/** + * Sets rq_intr flag in \a req under spinlock. + */ +void ptlrpc_mark_interrupted(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_mark_interrupted); + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. Callback for l_wait_event for interruptible waits. 
+ */ +static void ptlrpc_interrupted_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + if (req->rq_intr) + continue; + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREG_RPC && + !req->rq_allow_intr) + continue; + + ptlrpc_mark_interrupted(req); + } +} + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + time64_t now = ktime_get_real_seconds(); + int timeout = 0; + struct ptlrpc_request *req; + time64_t deadline; + + ENTRY; + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + /* + * Request in-flight? + */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* + * Already timed out. + */ + if (req->rq_timedout) + continue; + + /* + * Waiting for ctx. + */ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + RETURN(timeout); +} + +/** + * Send all unset request from the set and then wait untill all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. + */ +int ptlrpc_set_wait(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + int rc, timeout; + ENTRY; + + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + RETURN(0); + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* wait until all complete, interrupted, or an in-flight + * req times out */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", + set, timeout); + + if ((timeout == 0 && !signal_pending(current)) || + set->set_allow_intr) + /* No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. */ + lwi = LWI_TIMEOUT_INTR_ALL( + cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, + ptlrpc_interrupted_set, set); + else + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? 
timeout : 1), + ptlrpc_expired_set, set); + + rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); + + /* LU-769 - if we ignored the signal because it was already + * pending when we started, we need to handle it now or we risk + * it being ignored forever */ + if (rc == -ETIMEDOUT && + (!lwi.lwi_allow_intr || set->set_allow_intr) && + signal_pending(current)) { + sigset_t blocked_sigs = + cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + /* In fact we only interrupt for the "fatal" signals + * like SIGINT or SIGKILL. We still ignore less + * important signals since ptlrpc set is not easily + * reentrant from userspace again */ + if (signal_pending(current)) + ptlrpc_interrupted_set(set); + cfs_restore_sigs(blocked_sigs); + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + if (set->set_interpret != NULL) { + int (*interpreter)(struct ptlrpc_request_set *set,void *,int) = + set->set_interpret; + rc = interpreter (set, set->set_arg, rc); + } else { + struct ptlrpc_set_cbdata *cbdata, *n; + int err; + + list_for_each_entry_safe(cbdata, n, + &set->set_cblist, psc_item) { + list_del_init(&cbdata->psc_item); + err = cbdata->psc_interpret(set, cbdata->psc_data, rc); + if (err && !rc) + rc = err; + OBD_FREE_PTR(cbdata); + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper fuction for request freeing. + * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + ENTRY; + + if (request == NULL) + RETURN_EXIT; + + LASSERT(!request->rq_srv_req); + LASSERT(request->rq_export == NULL); + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. 
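+ * The list removal below is done under the import's imp_lock unless the
+ * caller signalled via \a locked that it already holds it.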
*/ + if (request->rq_import != NULL) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + list_del_init(&request->rq_unreplied_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf != NULL) + sptlrpc_cli_free_repbuf(request); + + if (request->rq_import != NULL) { + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk != NULL) + ptlrpc_free_bulk(request->rq_bulk); + + if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + ptlrpc_request_cache_free(request); + EXIT; +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, request is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + assert_spin_locked(&request->rq_import->imp_lock); + (void)__ptlrpc_req_finished(request, 1); +} + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request whe reference count reaches zero. + * + * \retval 1 the request is freed + * \retval 0 some others still hold references on the request + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + int count; + ENTRY; + + if (!request) + RETURN(1); + + LASSERT(request != LP_POISON); + LASSERT(request->rq_reqmsg != LP_POISON); + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + spin_lock(&request->rq_lock); + count = atomic_dec_return(&request->rq_refcount); + LASSERTF(count >= 0, "Invalid ref count %d\n", count); + + /* For open RPC, the client does not know the EA size (LOV, ACL, and + * so on) before replied, then the client has to reserve very large + * reply buffer. Such buffer will not be released until the RPC freed. + * Since The open RPC is replayable, we need to keep it in the replay + * list until close. If there are a lot of files opened concurrently, + * then the client may be OOM. + * + * If fact, it is unnecessary to keep reply buffer for open replay, + * related EAs have already been saved via mdc_save_lovea() before + * coming here. So it is safe to free the reply buffer some earlier + * before releasing the RPC to avoid client OOM. LU-9514 */ + if (count == 1 && request->rq_early_free_repbuf && request->rq_repbuf) { + spin_lock(&request->rq_early_free_lock); + sptlrpc_cli_free_repbuf(request); + request->rq_repbuf = NULL; + request->rq_repbuf_len = 0; + request->rq_repdata = NULL; + request->rq_reqdata_len = 0; + spin_unlock(&request->rq_early_free_lock); + } + spin_unlock(&request->rq_lock); + + if (!count) + __ptlrpc_free_req(request, locked); + + RETURN(!count); +} + +/** + * Drops one reference count for a request. 
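+ * Unlike ptlrpc_req_finished_with_imp_lock() above, this variant takes
+ * the import's imp_lock itself when freeing a request that is still
+ * attached to an import.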
+ */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + int rc; + struct l_wait_info lwi; + + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0 && cfs_fail_val == 0) + request->rq_reply_deadline = ktime_get_real_seconds() + + LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + + LNetMDUnlink(request->rq_reply_md_h); + + /* + * Let's check it once again. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + + /* Move to "Unregistering" phase as reply was not unlinked yet. */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC); + + /* + * Do not wait for unlink to finish. + */ + if (async) + RETURN(0); + + /* + * We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + for (;;) { + /* The wq argument is ignored by user-space wait_event macros */ + wait_queue_head_t *wq = (request->rq_set != NULL) ? + &request->rq_set->set_waitq : + &request->rq_reply_waitq; + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " + "receiving_reply=%d req_ulinked=%d reply_unlinked=%d", + request->rq_receiving_reply, + request->rq_req_unlinked, + request->rq_reply_unlinked); + } + RETURN(0); +} + +static void ptlrpc_free_request(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_commit_cb != NULL) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + + __ptlrpc_req_finished(req, 1); +} + +/** + * the request is committed and dropped from the replay list of its import + */ +void ptlrpc_request_committed(struct ptlrpc_request *req, int force) +{ + struct obd_import *imp = req->rq_import; + + spin_lock(&imp->imp_lock); + if (list_empty(&req->rq_replay_list)) { + spin_unlock(&imp->imp_lock); + return; + } + + if (force || req->rq_transno <= imp->imp_peer_committed_transno) + ptlrpc_free_request(req); + + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_request_committed); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. 
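+ * Requests that still have rq_replay set are moved to the import's
+ * imp_committed_list instead of being freed.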
+ * Since requests are sorted in transno order, stops when meetign first + * transno bigger than last_committed. + * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct ptlrpc_request *req, *saved; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + bool skip_committed_list = true; + ENTRY; + + LASSERT(imp != NULL); + assert_spin_locked(&imp->imp_lock); + + if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && + imp->imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + RETURN_EXIT; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno, + imp->imp_generation); + + if (imp->imp_generation != imp->imp_last_generation_checked || + imp->imp_last_transno_checked == 0) + skip_committed_list = false; + + imp->imp_last_transno_checked = imp->imp_peer_committed_transno; + imp->imp_last_generation_checked = imp->imp_generation; + + list_for_each_entry_safe(req, saved, &imp->imp_replay_list, + rq_replay_list) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + GOTO(free_req, 0); + } + + /* not yet committed */ + if (req->rq_transno > imp->imp_peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + list_move_tail(&req->rq_replay_list, + &imp->imp_committed_list); + continue; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", + imp->imp_peer_committed_transno); +free_req: + ptlrpc_free_request(req); + } + + if (skip_committed_list) + GOTO(out, 0); + + list_for_each_entry_safe(req, saved, &imp->imp_committed_list, + rq_replay_list) { + LASSERT(req->rq_transno != 0); + if (req->rq_import_generation < imp->imp_generation || + !req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "free %s open request", + req->rq_import_generation < + imp->imp_generation ? "stale" : "closed"); + + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = + req->rq_replay_list.next; + + ptlrpc_free_request(req); + } + } +out: + EXIT; +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ + ENTRY; + EXIT; +} + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + spin_lock(&req->rq_lock); + + /* Request got reply but linked to the import list still. + Let ptlrpc_check_set() to process it. 
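+ In that case we just log it below and return without marking the
+ request for resend.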
*/ + if (ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_HA, req, "it has reply, so skip it"); + return; + } + + lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); + req->rq_status = -EAGAIN; + + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + ENTRY; + atomic_inc(&req->rq_refcount); + RETURN(req); +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct list_head *tmp; + + assert_spin_locked(&imp->imp_lock); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* clear this for new requests that were resent as well + as resent replayed requests. */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + spin_lock(&req->rq_lock); + req->rq_resend = 0; + spin_unlock(&req->rq_lock); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_prev(tmp, &imp->imp_replay_list) { + struct ptlrpc_request *iter = list_entry(tmp, + struct ptlrpc_request, + rq_replay_list); + + /* We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! + */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_replay_list, &iter->rq_replay_list); + return; + } + + list_add(&req->rq_replay_list, &imp->imp_replay_list); +} + +/** + * Send request and wait until it completes. + * Returns request processing status. + */ +int ptlrpc_queue_wait(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set; + int rc; + ENTRY; + + LASSERT(req->rq_set == NULL); + LASSERT(!req->rq_receiving_reply); + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM); + RETURN(-ENOMEM); + } + + /* for distributed debugging */ + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_queue_wait); + +/** + * Callback used for replayed requests reply processing. + * In case of successful reply calls registered request replay callback. + * In case of error restart replay process. 
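+ * Installed as rq_interpret_reply by ptlrpc_replay_req() below.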
+ */ +static int ptlrpc_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void * data, int rc) +{ + struct ptlrpc_replay_async_args *aa = data; + struct obd_import *imp = req->rq_import; + + ENTRY; + atomic_dec(&imp->imp_replay_inflight); + + /* Note: if it is bulk replay (MDS-MDS replay), then even if + * server got the request, but bulk transfer timeout, let's + * replay the bulk req again */ + if (!ptlrpc_client_replied(req) || + (req->rq_bulk != NULL && + lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) { + DEBUG_REQ(D_ERROR, req, "request replay timed out.\n"); + GOTO(out, rc = -ETIMEDOUT); + } + + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && + (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || + lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) + GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg)); + + /** VBR: check version failure */ + if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { + /** replay was failed due to version mismatch */ + DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 1; + imp->imp_no_lock_replay = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } else { + /** The transno had better not change over replay. */ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + "%#llx/%#llx\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + /** if replays by version then gap occur on server, no trust to locks */ + if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) + imp->imp_no_lock_replay = 1; + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno %llu is bigger than the " + "replayed one: %llu", req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + GOTO(out, rc = -EINVAL); + } + + DEBUG_REQ(D_HA, req, "got rep"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + + /* Note: If the replay fails for MDT-MDT recovery, let's + * abort all of the following requests in the replay + * and sending list, because MDT-MDT update requests + * are dependent on each other, see LU-7039 */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + spin_lock(&imp->imp_lock); + list_for_each_entry_safe(free_req, tmp, + &imp->imp_replay_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_committed_list, + rq_replay_list) { + ptlrpc_free_request(free_req); + } + + list_for_each_entry_safe(free_req, tmp, + &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + + list_for_each_entry_safe(free_req, tmp, + 
&imp->imp_sending_list, + rq_list) { + spin_lock(&free_req->rq_lock); + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + spin_unlock(&free_req->rq_lock); + } + spin_unlock(&imp->imp_lock); + } + } else { + /* Put it back for re-replay. */ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + RETURN(rc); +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. + */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + ENTRY; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof *aa); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server the net_latency, so the server can calculate how long + * it should wait for next replay */ + lustre_msg_set_service_time(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + spin_lock(&req->rq_lock); + req->rq_early_free_repbuf = 0; + spin_unlock(&req->rq_lock); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req); + RETURN(0); +} + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct list_head *tmp, *n; + ENTRY; + + /* Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + spin_lock(&imp->imp_lock); + + /* XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? 
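+ * For now we only flag each request with -EIO under its own rq_lock
+ * while holding imp_lock across the two loops below.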
+ */ + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + struct ptlrpc_request *req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. */ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + spin_unlock(&imp->imp_lock); + + EXIT; +} + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + + LASSERT(set != NULL); + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +static __u64 ptlrpc_last_xid; +static spinlock_t ptlrpc_last_xid_lock; + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. + * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time64_t now = ktime_get_real_seconds(); + + spin_lock_init(&ptlrpc_last_xid_lock); + if (now < YEAR_2004) { + cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); + ptlrpc_last_xid >>= 2; + ptlrpc_last_xid |= (1ULL << 61); + } else { + ptlrpc_last_xid = (__u64)now << 20; + } + + /* Need to always be aligned to a power-of-two for mutli-bulk BRW */ + CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); + ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. 
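+ * (For instance, if PTLRPC_BULK_OPS_COUNT were 16, each call below would
+ * advance ptlrpc_last_xid by 16, reserving match bits N..N+15 for a
+ * single RPC's bulk transfers.)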
+ * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + ptlrpc_last_xid = next; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +} + +/** + * If request has a new allocated XID (new request or EINPROGRESS resend), + * use this XID as matchbits of bulk, otherwise allocate a new matchbits for + * request to ensure previous bulk fails and avoid problems with lost replies + * and therefore several transfers landing into the same buffer from different + * sending attempts. + */ +void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *bd = req->rq_bulk; + + LASSERT(bd != NULL); + + /* Generate new matchbits for all resend requests, including + * resend replay. */ + if (req->rq_resend) { + __u64 old_mbits = req->rq_mbits; + + /* First time resend on -EINPROGRESS will generate new xid, + * so we can actually use the rq_xid as rq_mbits in such case, + * however, it's bit hard to distinguish such resend with a + * 'resend for the -EINPROGRESS resend'. To make it simple, + * we opt to generate mbits for all resend cases. */ + if (OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)){ + req->rq_mbits = ptlrpc_next_xid(); + } else { + /* Old version transfers rq_xid to peer as + * matchbits. */ + spin_lock(&req->rq_import->imp_lock); + list_del_init(&req->rq_unreplied_list); + ptlrpc_assign_next_xid_nolock(req); + spin_unlock(&req->rq_import->imp_lock); + req->rq_mbits = req->rq_xid; + } + CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", + old_mbits, req->rq_mbits); + } else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + /* Request being sent first time, use xid as matchbits. */ + req->rq_mbits = req->rq_xid; + } else { + /* Replay request, xid and matchbits have already been + * correctly assigned. */ + return; + } + + /* For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so + * that server can infer the number of bulks that were prepared, + * see LU-1431 */ + req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) / + LNET_MAX_IOV) - 1; + + /* Set rq_xid as rq_mbits to indicate the final bulk for the old + * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. + * + * It's ok to directly set the rq_xid here, since this xid bump + * won't affect the request position in unreplied list. */ + if (!OCD_HAS_FLAG(&bd->bd_import->imp_connect_data, BULK_MBITS)) + req->rq_xid = req->rq_mbits; +} + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. + */ +__u64 ptlrpc_sample_next_xid(void) +{ +#if BITS_PER_LONG == 32 + /* need to avoid possible word tearing on 32-bit systems */ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +#else + /* No need to lock, since returned value is racy anyways */ + return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; +#endif +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. 
after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. + */ +struct ptlrpc_work_async_args { + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +static void ptlrpcd_add_work_req(struct ptlrpc_request *req) +{ + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = ktime_get_real_seconds(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req); +} + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct ptlrpc_work_async_args *arg = data; + + LASSERT(ptlrpcd_check_work(req)); + LASSERT(arg->cb != NULL); + + rc = arg->cb(env, arg->cbdata); + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + + if (atomic_dec_return(&req->rq_refcount) > 1) { + atomic_set(&req->rq_refcount, 2); + ptlrpcd_add_work_req(req); + } + return rc; +} + +static int worker_format; + +static int ptlrpcd_check_work(struct ptlrpc_request *req) +{ + return req->rq_pill.rc_fmt == (void *)&worker_format; +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + ENTRY; + + might_sleep(); + + if (cb == NULL) + RETURN(ERR_PTR(-EINVAL)); + + /* copy some code from deprecated fakereq. */ + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (req == NULL) { + CERROR("ptlrpc: run out of memory!\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + ptlrpc_cli_req_init(req); + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_no_delay = req->rq_no_resend = 1; + req->rq_pill.rc_fmt = (void *)&worker_format; + + CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args)); + args = ptlrpc_req_async_args(req); + args->cb = cb; + args->cbdata = cbdata; + + RETURN(req); +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. 
- Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_inc_return(&req->rq_refcount) == 2) + ptlrpcd_add_work_req(req); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c new file mode 100644 index 0000000000000..369eace7f9233 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/connection.c @@ -0,0 +1,240 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct cfs_hash *conn_hash; +static struct cfs_hash_ops conn_hash_ops; + +struct ptlrpc_connection * +ptlrpc_connection_get(struct lnet_process_id peer, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + ENTRY; + + peer.nid = LNetPrimaryNID(peer.nid); + conn = cfs_hash_lookup(conn_hash, &peer); + if (conn) + GOTO(out, conn); + + OBD_ALLOC_PTR(conn); + if (!conn) + RETURN(NULL); + + conn->c_peer = peer; + conn->c_self = self; + INIT_HLIST_NODE(&conn->c_hash); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the hash will be + * returned and may be compared against out object. + */ + /* In the function below, .hs_keycmp resolves to + * conn_keycmp() */ + /* coverity[overrun-buffer-val] */ + conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash); + if (conn != conn2) { + OBD_FREE_PTR(conn); + conn = conn2; + } + EXIT; +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + return conn; +} + +int ptlrpc_connection_put(struct ptlrpc_connection *conn) +{ + int rc = 0; + ENTRY; + + if (!conn) + RETURN(rc); + + LASSERT(atomic_read(&conn->c_refcount) > 1); + + /* + * We do not remove connection from hashtable and + * do not free it even if last caller released ref, + * as we want to have it cached for the case it is + * needed again. + * + * Deallocating it and later creating new connection + * again would be wastful. This way we also avoid + * expensive locking to protect things from get/put + * race when found cached connection is freed by + * ptlrpc_connection_put(). 
+ * + * It will be freed later in module unload time, + * when ptlrpc_connection_fini()->lh_exit->conn_exit() + * path is called. + */ + if (atomic_dec_return(&conn->c_refcount) == 1) + rc = 1; + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + RETURN(rc); +} + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + ENTRY; + + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + RETURN(conn); +} + +int ptlrpc_connection_init(void) +{ + ENTRY; + + conn_hash = cfs_hash_create("CONN_HASH", + HASH_CONN_CUR_BITS, + HASH_CONN_MAX_BITS, + HASH_CONN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &conn_hash_ops, CFS_HASH_DEFAULT); + if (!conn_hash) + RETURN(-ENOMEM); + + RETURN(0); +} + +void ptlrpc_connection_fini(void) { + ENTRY; + cfs_hash_putref(conn_hash); + EXIT; +} + +/* + * Hash operations for net_peer<->connection + */ +static unsigned +conn_hashfn(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct lnet_process_id), mask); +} + +static int +conn_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + const struct lnet_process_id *conn_key; + + LASSERT(key != NULL); + conn_key = (struct lnet_process_id *)key; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + + return conn_key->nid == conn->c_peer.nid && + conn_key->pid == conn->c_peer.pid; +} + +static void * +conn_key(struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + return &conn->c_peer; +} + +static void * +conn_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ptlrpc_connection, c_hash); +} + +static void +conn_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_inc(&conn->c_refcount); +} + +static void +conn_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_dec(&conn->c_refcount); +} + +static void +conn_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +static struct cfs_hash_ops conn_hash_ops = { + .hs_hash = conn_hashfn, + .hs_keycmp = conn_keycmp, + .hs_key = conn_key, + .hs_object = conn_object, + .hs_get = conn_get, + .hs_put_locked = conn_put_locked, + .hs_exit = conn_exit, +}; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c new file mode 100644 index 0000000000000..fb302c70d08be --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -0,0 +1,384 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * GPL HEADER END + */ +/* + * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. + * + * Copyright (c) 2013, Intel Corporation. + */ + +#include +#include + +#ifdef LUSTRE_TRANSLATE_ERRNOS + +/* + * The two translation tables below must define a one-to-one mapping between + * host and network errnos. + * + * EWOULDBLOCK is equal to EAGAIN on all architectures except for parisc, which + * appears irrelevant. Thus, existing references to EWOULDBLOCK are fine. + * + * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc + * host has no context-free way to determine if a LUSTRE_EDEADLK represents an + * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK + * that need to be transferred on wire have been replaced with EDEADLK. + */ +static int lustre_errno_hton_mapping[] = { + [EPERM] = LUSTRE_EPERM, + [ENOENT] = LUSTRE_ENOENT, + [ESRCH] = LUSTRE_ESRCH, + [EINTR] = LUSTRE_EINTR, + [EIO] = LUSTRE_EIO, + [ENXIO] = LUSTRE_ENXIO, + [E2BIG] = LUSTRE_E2BIG, + [ENOEXEC] = LUSTRE_ENOEXEC, + [EBADF] = LUSTRE_EBADF, + [ECHILD] = LUSTRE_ECHILD, + [EAGAIN] = LUSTRE_EAGAIN, + [ENOMEM] = LUSTRE_ENOMEM, + [EACCES] = LUSTRE_EACCES, + [EFAULT] = LUSTRE_EFAULT, + [ENOTBLK] = LUSTRE_ENOTBLK, + [EBUSY] = LUSTRE_EBUSY, + [EEXIST] = LUSTRE_EEXIST, + [EXDEV] = LUSTRE_EXDEV, + [ENODEV] = LUSTRE_ENODEV, + [ENOTDIR] = LUSTRE_ENOTDIR, + [EISDIR] = LUSTRE_EISDIR, + [EINVAL] = LUSTRE_EINVAL, + [ENFILE] = LUSTRE_ENFILE, + [EMFILE] = LUSTRE_EMFILE, + [ENOTTY] = LUSTRE_ENOTTY, + [ETXTBSY] = LUSTRE_ETXTBSY, + [EFBIG] = LUSTRE_EFBIG, + [ENOSPC] = LUSTRE_ENOSPC, + [ESPIPE] = LUSTRE_ESPIPE, + [EROFS] = LUSTRE_EROFS, + [EMLINK] = LUSTRE_EMLINK, + [EPIPE] = LUSTRE_EPIPE, + [EDOM] = LUSTRE_EDOM, + [ERANGE] = LUSTRE_ERANGE, + [EDEADLK] = LUSTRE_EDEADLK, + [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, + [ENOLCK] = LUSTRE_ENOLCK, + [ENOSYS] = LUSTRE_ENOSYS, + [ENOTEMPTY] = LUSTRE_ENOTEMPTY, + [ELOOP] = LUSTRE_ELOOP, + [ENOMSG] = LUSTRE_ENOMSG, + [EIDRM] = LUSTRE_EIDRM, + [ECHRNG] = LUSTRE_ECHRNG, + [EL2NSYNC] = LUSTRE_EL2NSYNC, + [EL3HLT] = LUSTRE_EL3HLT, + [EL3RST] = LUSTRE_EL3RST, + [ELNRNG] = LUSTRE_ELNRNG, + [EUNATCH] = LUSTRE_EUNATCH, + [ENOCSI] = LUSTRE_ENOCSI, + [EL2HLT] = LUSTRE_EL2HLT, + [EBADE] = LUSTRE_EBADE, + [EBADR] = LUSTRE_EBADR, + [EXFULL] = LUSTRE_EXFULL, + [ENOANO] = LUSTRE_ENOANO, + [EBADRQC] = LUSTRE_EBADRQC, + [EBADSLT] = LUSTRE_EBADSLT, + [EBFONT] = LUSTRE_EBFONT, + [ENOSTR] = LUSTRE_ENOSTR, + [ENODATA] = LUSTRE_ENODATA, + [ETIME] = LUSTRE_ETIME, + [ENOSR] = LUSTRE_ENOSR, + [ENONET] = LUSTRE_ENONET, + [ENOPKG] = LUSTRE_ENOPKG, + [EREMOTE] = LUSTRE_EREMOTE, + [ENOLINK] = LUSTRE_ENOLINK, + [EADV] = LUSTRE_EADV, + [ESRMNT] = LUSTRE_ESRMNT, + [ECOMM] = LUSTRE_ECOMM, + [EPROTO] = LUSTRE_EPROTO, + [EMULTIHOP] = LUSTRE_EMULTIHOP, + [EDOTDOT] = LUSTRE_EDOTDOT, + [EBADMSG] = 
LUSTRE_EBADMSG, + [EOVERFLOW] = LUSTRE_EOVERFLOW, + [ENOTUNIQ] = LUSTRE_ENOTUNIQ, + [EBADFD] = LUSTRE_EBADFD, + [EREMCHG] = LUSTRE_EREMCHG, + [ELIBACC] = LUSTRE_ELIBACC, + [ELIBBAD] = LUSTRE_ELIBBAD, + [ELIBSCN] = LUSTRE_ELIBSCN, + [ELIBMAX] = LUSTRE_ELIBMAX, + [ELIBEXEC] = LUSTRE_ELIBEXEC, + [EILSEQ] = LUSTRE_EILSEQ, + [ERESTART] = LUSTRE_ERESTART, + [ESTRPIPE] = LUSTRE_ESTRPIPE, + [EUSERS] = LUSTRE_EUSERS, + [ENOTSOCK] = LUSTRE_ENOTSOCK, + [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, + [EMSGSIZE] = LUSTRE_EMSGSIZE, + [EPROTOTYPE] = LUSTRE_EPROTOTYPE, + [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, + [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, + [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, + [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, + [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, + [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, + [EADDRINUSE] = LUSTRE_EADDRINUSE, + [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, + [ENETDOWN] = LUSTRE_ENETDOWN, + [ENETUNREACH] = LUSTRE_ENETUNREACH, + [ENETRESET] = LUSTRE_ENETRESET, + [ECONNABORTED] = LUSTRE_ECONNABORTED, + [ECONNRESET] = LUSTRE_ECONNRESET, + [ENOBUFS] = LUSTRE_ENOBUFS, + [EISCONN] = LUSTRE_EISCONN, + [ENOTCONN] = LUSTRE_ENOTCONN, + [ESHUTDOWN] = LUSTRE_ESHUTDOWN, + [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, + [ETIMEDOUT] = LUSTRE_ETIMEDOUT, + [ECONNREFUSED] = LUSTRE_ECONNREFUSED, + [EHOSTDOWN] = LUSTRE_EHOSTDOWN, + [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, + [EALREADY] = LUSTRE_EALREADY, + [EINPROGRESS] = LUSTRE_EINPROGRESS, + [ESTALE] = LUSTRE_ESTALE, + [EUCLEAN] = LUSTRE_EUCLEAN, + [ENOTNAM] = LUSTRE_ENOTNAM, + [ENAVAIL] = LUSTRE_ENAVAIL, + [EISNAM] = LUSTRE_EISNAM, + [EREMOTEIO] = LUSTRE_EREMOTEIO, + [EDQUOT] = LUSTRE_EDQUOT, + [ENOMEDIUM] = LUSTRE_ENOMEDIUM, + [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, + [ECANCELED] = LUSTRE_ECANCELED, + [ENOKEY] = LUSTRE_ENOKEY, + [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, + [EKEYREVOKED] = LUSTRE_EKEYREVOKED, + [EKEYREJECTED] = LUSTRE_EKEYREJECTED, + [EOWNERDEAD] = LUSTRE_EOWNERDEAD, + [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, + [ERESTARTSYS] = LUSTRE_ERESTARTSYS, + [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, + [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, + [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, + [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, + [EBADHANDLE] = LUSTRE_EBADHANDLE, + [ENOTSYNC] = LUSTRE_ENOTSYNC, + [EBADCOOKIE] = LUSTRE_EBADCOOKIE, + [ENOTSUPP] = LUSTRE_ENOTSUPP, + [ETOOSMALL] = LUSTRE_ETOOSMALL, + [ESERVERFAULT] = LUSTRE_ESERVERFAULT, + [EBADTYPE] = LUSTRE_EBADTYPE, + [EJUKEBOX] = LUSTRE_EJUKEBOX, + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED +}; + +static int lustre_errno_ntoh_mapping[] = { + [LUSTRE_EPERM] = EPERM, + [LUSTRE_ENOENT] = ENOENT, + [LUSTRE_ESRCH] = ESRCH, + [LUSTRE_EINTR] = EINTR, + [LUSTRE_EIO] = EIO, + [LUSTRE_ENXIO] = ENXIO, + [LUSTRE_E2BIG] = E2BIG, + [LUSTRE_ENOEXEC] = ENOEXEC, + [LUSTRE_EBADF] = EBADF, + [LUSTRE_ECHILD] = ECHILD, + [LUSTRE_EAGAIN] = EAGAIN, + [LUSTRE_ENOMEM] = ENOMEM, + [LUSTRE_EACCES] = EACCES, + [LUSTRE_EFAULT] = EFAULT, + [LUSTRE_ENOTBLK] = ENOTBLK, + [LUSTRE_EBUSY] = EBUSY, + [LUSTRE_EEXIST] = EEXIST, + [LUSTRE_EXDEV] = EXDEV, + [LUSTRE_ENODEV] = ENODEV, + [LUSTRE_ENOTDIR] = ENOTDIR, + [LUSTRE_EISDIR] = EISDIR, + [LUSTRE_EINVAL] = EINVAL, + [LUSTRE_ENFILE] = ENFILE, + [LUSTRE_EMFILE] = EMFILE, + [LUSTRE_ENOTTY] = ENOTTY, + [LUSTRE_ETXTBSY] = ETXTBSY, + [LUSTRE_EFBIG] = EFBIG, + [LUSTRE_ENOSPC] = ENOSPC, + [LUSTRE_ESPIPE] = ESPIPE, + [LUSTRE_EROFS] = EROFS, + [LUSTRE_EMLINK] = EMLINK, + [LUSTRE_EPIPE] = EPIPE, + [LUSTRE_EDOM] = EDOM, + [LUSTRE_ERANGE] = ERANGE, + [LUSTRE_EDEADLK] = EDEADLK, + [LUSTRE_ENAMETOOLONG] = 
ENAMETOOLONG, + [LUSTRE_ENOLCK] = ENOLCK, + [LUSTRE_ENOSYS] = ENOSYS, + [LUSTRE_ENOTEMPTY] = ENOTEMPTY, + [LUSTRE_ELOOP] = ELOOP, + [LUSTRE_ENOMSG] = ENOMSG, + [LUSTRE_EIDRM] = EIDRM, + [LUSTRE_ECHRNG] = ECHRNG, + [LUSTRE_EL2NSYNC] = EL2NSYNC, + [LUSTRE_EL3HLT] = EL3HLT, + [LUSTRE_EL3RST] = EL3RST, + [LUSTRE_ELNRNG] = ELNRNG, + [LUSTRE_EUNATCH] = EUNATCH, + [LUSTRE_ENOCSI] = ENOCSI, + [LUSTRE_EL2HLT] = EL2HLT, + [LUSTRE_EBADE] = EBADE, + [LUSTRE_EBADR] = EBADR, + [LUSTRE_EXFULL] = EXFULL, + [LUSTRE_ENOANO] = ENOANO, + [LUSTRE_EBADRQC] = EBADRQC, + [LUSTRE_EBADSLT] = EBADSLT, + [LUSTRE_EBFONT] = EBFONT, + [LUSTRE_ENOSTR] = ENOSTR, + [LUSTRE_ENODATA] = ENODATA, + [LUSTRE_ETIME] = ETIME, + [LUSTRE_ENOSR] = ENOSR, + [LUSTRE_ENONET] = ENONET, + [LUSTRE_ENOPKG] = ENOPKG, + [LUSTRE_EREMOTE] = EREMOTE, + [LUSTRE_ENOLINK] = ENOLINK, + [LUSTRE_EADV] = EADV, + [LUSTRE_ESRMNT] = ESRMNT, + [LUSTRE_ECOMM] = ECOMM, + [LUSTRE_EPROTO] = EPROTO, + [LUSTRE_EMULTIHOP] = EMULTIHOP, + [LUSTRE_EDOTDOT] = EDOTDOT, + [LUSTRE_EBADMSG] = EBADMSG, + [LUSTRE_EOVERFLOW] = EOVERFLOW, + [LUSTRE_ENOTUNIQ] = ENOTUNIQ, + [LUSTRE_EBADFD] = EBADFD, + [LUSTRE_EREMCHG] = EREMCHG, + [LUSTRE_ELIBACC] = ELIBACC, + [LUSTRE_ELIBBAD] = ELIBBAD, + [LUSTRE_ELIBSCN] = ELIBSCN, + [LUSTRE_ELIBMAX] = ELIBMAX, + [LUSTRE_ELIBEXEC] = ELIBEXEC, + [LUSTRE_EILSEQ] = EILSEQ, + [LUSTRE_ERESTART] = ERESTART, + [LUSTRE_ESTRPIPE] = ESTRPIPE, + [LUSTRE_EUSERS] = EUSERS, + [LUSTRE_ENOTSOCK] = ENOTSOCK, + [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, + [LUSTRE_EMSGSIZE] = EMSGSIZE, + [LUSTRE_EPROTOTYPE] = EPROTOTYPE, + [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, + [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, + [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, + [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, + [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, + [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, + [LUSTRE_EADDRINUSE] = EADDRINUSE, + [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, + [LUSTRE_ENETDOWN] = ENETDOWN, + [LUSTRE_ENETUNREACH] = ENETUNREACH, + [LUSTRE_ENETRESET] = ENETRESET, + [LUSTRE_ECONNABORTED] = ECONNABORTED, + [LUSTRE_ECONNRESET] = ECONNRESET, + [LUSTRE_ENOBUFS] = ENOBUFS, + [LUSTRE_EISCONN] = EISCONN, + [LUSTRE_ENOTCONN] = ENOTCONN, + [LUSTRE_ESHUTDOWN] = ESHUTDOWN, + [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, + [LUSTRE_ETIMEDOUT] = ETIMEDOUT, + [LUSTRE_ECONNREFUSED] = ECONNREFUSED, + [LUSTRE_EHOSTDOWN] = EHOSTDOWN, + [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, + [LUSTRE_EALREADY] = EALREADY, + [LUSTRE_EINPROGRESS] = EINPROGRESS, + [LUSTRE_ESTALE] = ESTALE, + [LUSTRE_EUCLEAN] = EUCLEAN, + [LUSTRE_ENOTNAM] = ENOTNAM, + [LUSTRE_ENAVAIL] = ENAVAIL, + [LUSTRE_EISNAM] = EISNAM, + [LUSTRE_EREMOTEIO] = EREMOTEIO, + [LUSTRE_EDQUOT] = EDQUOT, + [LUSTRE_ENOMEDIUM] = ENOMEDIUM, + [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, + [LUSTRE_ECANCELED] = ECANCELED, + [LUSTRE_ENOKEY] = ENOKEY, + [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, + [LUSTRE_EKEYREVOKED] = EKEYREVOKED, + [LUSTRE_EKEYREJECTED] = EKEYREJECTED, + [LUSTRE_EOWNERDEAD] = EOWNERDEAD, + [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, + [LUSTRE_ERESTARTSYS] = ERESTARTSYS, + [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, + [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, + [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, + [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, + [LUSTRE_EBADHANDLE] = EBADHANDLE, + [LUSTRE_ENOTSYNC] = ENOTSYNC, + [LUSTRE_EBADCOOKIE] = EBADCOOKIE, + [LUSTRE_ENOTSUPP] = ENOTSUPP, + [LUSTRE_ETOOSMALL] = ETOOSMALL, + [LUSTRE_ESERVERFAULT] = ESERVERFAULT, + [LUSTRE_EBADTYPE] = EBADTYPE, + [LUSTRE_EJUKEBOX] = EJUKEBOX, + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED +}; + +unsigned int 
lustre_errno_hton(unsigned int h) +{ + unsigned int n; + + if (h == 0) { + n = 0; + } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { + n = lustre_errno_hton_mapping[h]; + if (n == 0) + goto generic; + } else { +generic: + /* + * A generic errno is better than the unknown one that could + * mean anything to a different host. + */ + n = LUSTRE_EIO; + } + + return n; +} +EXPORT_SYMBOL(lustre_errno_hton); + +unsigned int lustre_errno_ntoh(unsigned int n) +{ + unsigned int h; + + if (n == 0) { + h = 0; + } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { + h = lustre_errno_ntoh_mapping[n]; + if (h == 0) + goto generic; + } else { +generic: + /* + * Similar to the situation in lustre_errno_hton(), an unknown + * network errno could coincide with anything. Hence, it is + * better to return a generic errno. + */ + h = EIO; + } + + return h; +} +EXPORT_SYMBOL(lustre_errno_ntoh); + +#endif /* LUSTRE_TRANSLATE_ERRNOS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c new file mode 100644 index 0000000000000..28533cca19a32 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -0,0 +1,640 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +struct lnet_handle_eq ptlrpc_eq_h; + +/* + * Client's outgoing request callback + */ +void request_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + bool wakeup = false; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + sptlrpc_request_out_callback(req); + + spin_lock(&req->rq_lock); + req->rq_real_sent = ktime_get_real_seconds(); + req->rq_req_unlinked = 1; + /* reply_in_callback happened before request_out_callback? */ + if (req->rq_reply_unlinked) + wakeup = true; + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... 
*/ + req->rq_net_err = 1; + wakeup = true; + } + + if (wakeup) + ptlrpc_client_wake_req(req); + + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished(req); + EXIT; +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + ENTRY; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->md.start == req->rq_repbuf); + LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. */ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_reply_unlinked = 1; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength ) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncated = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d " + "replied=%d unlinked=%d", ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + /* already got the real reply or buffers are already unlinked */ + if (req->rq_replied || + req->rq_reply_unlinked == 1) + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + /* Got reply, no resend required */ + req->rq_resend = 0; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + EXIT; +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + ENTRY; + + LASSERT((ptlrpc_is_bulk_put_sink(desc->bd_type) && + ev->type == LNET_EVENT_PUT) || + (ptlrpc_is_bulk_get_source(desc->bd_type) && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + req = desc->bd_req; + LASSERT(desc->bd_md_count > 0); + desc->bd_md_count--; + + if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } else { + /* start reconnect and resend if network error hit */ + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + if (desc->bd_md_count == 0) + ptlrpc_client_wake_req(desc->bd_req); + + spin_unlock(&desc->bd_lock); + EXIT; +} + +/* + * We will have percpt request history list for ptlrpc service in upcoming + * patches because we don't want to be serialized by current per-service + * history operations. So we require history ID can (somehow) show arriving + * order w/o grabbing global lock, and user can sort them in userspace. + * + * This is how we generate history ID for ptlrpc_request: + * ---------------------------------------------------- + * | 32 bits | 16 bits | (16 - X)bits | X bits | + * ---------------------------------------------------- + * | seconds | usec / 16 | sequence | CPT id | + * ---------------------------------------------------- + * + * it might not be precise but should be good enough. + */ + +#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) + +#define REQS_SEC_SHIFT 32 +#define REQS_USEC_SHIFT 16 +#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) + +static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + u64 sec = req->rq_arrival_time.tv_sec; + u32 usec = req->rq_arrival_time.tv_nsec / NSEC_PER_USEC / 16; /* usec / 16 */ + u64 new_seq; + + /* set sequence ID for request and add it to history list, + * it must be called with hold svcpt::scp_lock */ + + new_seq = (sec << REQS_SEC_SHIFT) | + (usec << REQS_USEC_SHIFT) | + (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); + + if (new_seq > svcpt->scp_hist_seq) { + /* This handles the initial case of scp_hist_seq == 0 or + * we just jumped into a new time window */ + svcpt->scp_hist_seq = new_seq; + } else { + LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); + /* NB: increase sequence number in current usec bucket, + * however, it's possible that we used up all bits for + * sequence and jumped into the next usec bucket (future time), + * then we hope there will be less RPCs per bucket at some + * point, and sequence will catch up again */ + svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt)); + new_seq = svcpt->scp_hist_seq; + } + + req->rq_history_seq = new_seq; + + list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); +} + +/* + * Server's incoming request callback + */ +void request_in_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *service = svcpt->scp_service; + struct ptlrpc_request *req; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer); + LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof (*req)); + } else { + LASSERT (ev->type == LNET_EVENT_PUT); + if (ev->status != 0) { + /* We moaned above already... */ + return; + } + req = ptlrpc_request_cache_alloc(GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: " + "Dropping %s RPC from %s\n", + service->srv_name, + libcfs_id2str(ev->initiator)); + return; + } + } + + ptlrpc_srv_req_init(req); + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md.start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + ktime_get_real_ts64(&req->rq_arrival_time); + /* Multi-Rail: keep track of both initiator and source NID. */ + req->rq_peer = ev->initiator; + req->rq_source = ev->source; + req->rq_self = ev->target.nid; + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n", + libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); + EXIT; +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' */ + LASSERT (ev->unlinked); + ptlrpc_rs_decref(rs); + EXIT; + return; + } + + LASSERT (rs->rs_on_net); + + if (ev->unlinked) { + /* Last network callback. 
The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_on_net = 0; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed || + list_empty(&rs->rs_obd_list)) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } + EXIT; +} + +#ifdef HAVE_SERVER_SUPPORT +/* + * Server's bulk completion callback + */ +void server_bulk_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK || + (ptlrpc_is_bulk_put_source(desc->bd_type) && + ev->type == LNET_EVENT_ACK) || + (ptlrpc_is_bulk_get_sink(desc->bd_type) && + ev->type == LNET_EVENT_REPLY)); + + CDEBUG((ev->status == 0) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + + LASSERT(desc->bd_md_count > 0); + + if ((ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_REPLY) && + ev->status == 0) { + /* We heard back from the peer, so even if we get this + * before the SENT event (oh yes we can), we know we + * read/wrote the peer buffer and how much... */ + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } + + if (ev->status != 0) + desc->bd_failure = 1; + + if (ev->unlinked) { + desc->bd_md_count--; + /* This is the last callback no matter what... */ + if (desc->bd_md_count == 0) + wake_up(&desc->bd_waitq); + } + + spin_unlock(&desc->bd_lock); + EXIT; +} +#endif + +static void ptlrpc_master_callback(struct lnet_event *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + void (*callback)(struct lnet_event *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT (cbid->cbid_arg != LP_POISON); + LASSERT (callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback +#ifdef HAVE_SERVER_SUPPORT + || callback == server_bulk_callback +#endif + ); + + callback (ev); +} + +int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + struct lnet_process_id *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + peer->pid = LNET_PID_LUSTRE; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + if (peer->nid != LNET_NID_ANY && LNET_NIDADDR(peer->nid) == 0 && + LNET_NIDNET(dst_nid) != LNET_NIDNET(peer->nid)) + continue; + + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! use loopback LND */ + peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); + rc = 0; + break; + } + + if (rc < 0 || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + + peer->nid = dst_nid; + *self = src_nid; + rc = 0; + } + } + + CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + return rc; +} + +void ptlrpc_ni_fini(void) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + int rc; + int retries; + + /* Wait for the event queue to become idle since there may still be + * messages in flight with pending events (i.e. 
the fire-and-forget + * messages == client requests and "non-difficult" server + * replies */ + + for (retries = 0;; retries++) { + rc = LNetEQFree(ptlrpc_eq_h); + switch (rc) { + default: + LBUG(); + + case 0: + LNetNIFini(); + return; + + case -EBUSY: + if (retries != 0) + CWARN("Event queue still busy\n"); + + /* Wait for a bit */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL); + l_wait_event(waitq, 0, &lwi); + break; + } + } + /* notreached */ +} + +lnet_pid_t ptl_get_pid(void) +{ + return LNET_PID_LUSTRE; +} + +int ptlrpc_ni_init(void) +{ + int rc; + lnet_pid_t pid; + + pid = ptl_get_pid(); + CDEBUG(D_NET, "My pid is: %x\n", pid); + + /* We're not passing any limits yet... */ + rc = LNetNIInit(pid); + if (rc < 0) { + CDEBUG (D_NET, "Can't init network interface: %d\n", rc); + return rc; + } + + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... */ + /* kernel LNet calls our master callback when there are new event, + * because we are guaranteed to get every event via callback, + * so we just set EQ size to 0 to avoid overhread of serializing + * enqueue/dequeue operations in LNet. */ + rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h); + if (rc == 0) + return 0; + + CERROR ("Failed to allocate event queue: %d\n", rc); + LNetNIFini(); + + return rc; +} + + +int ptlrpc_init_portals(void) +{ + int rc = ptlrpc_ni_init(); + + if (rc != 0) { + CERROR("network initialisation failed\n"); + return rc; + } + rc = ptlrpcd_addref(); + if (rc == 0) + return 0; + + CERROR("rpcd initialisation failed\n"); + ptlrpc_ni_fini(); + return rc; +} + +void ptlrpc_exit_portals(void) +{ + ptlrpcd_decref(); + ptlrpc_ni_fini(); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h new file mode 100644 index 0000000000000..a5bbaea6065d3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h @@ -0,0 +1,179 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Somewhat simplified version of the gss api. + * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + * + */ + +#ifndef __PTLRPC_GSS_GSS_API_H_ +#define __PTLRPC_GSS_GSS_API_H_ + +struct gss_api_mech; + +/* The mechanism-independent gss-api context: */ +struct gss_ctx { + struct gss_api_mech *mech_type; + void *internal_ctx_id; +}; + +#define GSS_C_NO_BUFFER ((rawobj_t) 0) +#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_NULL_OID ((rawobj_t) 0) + +/* + * gss-api prototypes; note that these are somewhat simplified versions of + * the prototypes specified in RFC 2744. 
+ */ +__u32 lgss_import_sec_context( + rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx); +__u32 lgss_copy_reverse_context( + struct gss_ctx *ctx, + struct gss_ctx **ctx_new); +__u32 lgss_inquire_context( + struct gss_ctx *ctx, + unsigned long *endtime); +__u32 lgss_get_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); +__u32 lgss_verify_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); +__u32 lgss_wrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); +__u32 lgss_unwrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); +__u32 lgss_prep_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); +__u32 lgss_wrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_unwrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_delete_sec_context( + struct gss_ctx **ctx); +int lgss_display( + struct gss_ctx *ctx, + char *buf, + int bufsize); + +struct subflavor_desc { + __u32 sf_subflavor; + __u32 sf_qop; + __u32 sf_service; + char *sf_name; +}; + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + struct module *gm_owner; + char *gm_name; + rawobj_t gm_oid; + atomic_t gm_count; + struct gss_api_ops *gm_ops; + int gm_sf_num; + struct subflavor_desc *gm_sfs; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + __u32 (*gss_import_sec_context)( + rawobj_t *input_token, + struct gss_ctx *ctx); + __u32 (*gss_copy_reverse_context)( + struct gss_ctx *ctx, + struct gss_ctx *ctx_new); + __u32 (*gss_inquire_context)( + struct gss_ctx *ctx, + unsigned long *endtime); + __u32 (*gss_get_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); + __u32 (*gss_verify_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); + __u32 (*gss_wrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); + __u32 (*gss_unwrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); + __u32 (*gss_prep_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); + __u32 (*gss_wrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + __u32 (*gss_unwrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + void (*gss_delete_sec_context)( + void *ctx); + int (*gss_display)( + struct gss_ctx *ctx, + char *buf, + int bufsize); +}; + +int lgss_mech_register(struct gss_api_mech *mech); +void lgss_mech_unregister(struct gss_api_mech *mech); + +struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid); +struct gss_api_mech * lgss_name_to_mech(char *name); +struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor); + +struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech); +void lgss_mech_put(struct gss_api_mech *mech); + +#endif /* __PTLRPC_GSS_GSS_API_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h new file mode 100644 index 0000000000000..1f535485bd0f3 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_asn1.h @@ -0,0 +1,84 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +__u32 g_verify_token_header(rawobj_t *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +__u32 g_get_mech_oid(rawobj_t *mech, + rawobj_t *in_buf); + +int g_token_size(rawobj_t *mech, + unsigned int body_size); + +void g_make_token_header(rawobj_t *mech, + int body_size, + unsigned char **buf); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c new file mode 100644 index 0000000000000..3f703372d272f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -0,0 +1,522 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *msg; + struct ptlrpc_bulk_sec_desc *bsd; + rawobj_t token; + __u32 maj; + int offset; + int rc; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + LASSERT(req->rq_reqbuf->lm_bufcount >= 3); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 1; + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(req->rq_reqbuf->lm_bufcount >= 4); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 2; + break; + case SPTLRPC_SVC_PRIV: + LASSERT(req->rq_clrbuf->lm_bufcount >= 2); + msg = req->rq_clrbuf; + offset = msg->lm_bufcount - 1; + break; + default: + LBUG(); + } + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + if (req->rq_bulk_read) { + /* + * bulk read: prepare receiving pages only for privacy mode. + */ + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + return gss_cli_prep_bulk(req, desc); + } else { + /* + * bulk write: sign or encrypt bulk pages. 
+ */ + bsd->bsd_nob = desc->bd_nob; + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + /* integrity mode */ + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), + &token); + if (maj != GSS_S_COMPLETE) { + CWARN("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + } else { + /* privacy mode */ + if (desc->bd_iov_count == 0) + RETURN(0); + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + CERROR("bulk write: failed to allocate " + "encryption pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + CWARN("fail to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + } + } + + RETURN(0); +} + +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *rmsg, *vmsg; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int roff, voff; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 3); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 3); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 4); + voff = vmsg->lm_bufcount - 2; + + rmsg = req->rq_reqbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 4); + roff = rmsg->lm_bufcount - 2; /* second last segment */ + break; + case SPTLRPC_SVC_PRIV: + vmsg = req->rq_repdata; + LASSERT(vmsg != NULL && vmsg->lm_bufcount >= 2); + voff = vmsg->lm_bufcount - 1; + + rmsg = req->rq_clrbuf; + LASSERT(rmsg != NULL && rmsg->lm_bufcount >= 2); + roff = rmsg->lm_bufcount - 1; /* last segment */ + break; + default: + LBUG(); + } + + bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); + bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv)); + LASSERT(bsdr && bsdv); + + if (bsdr->bsd_version != bsdv->bsd_version || + bsdr->bsd_type != bsdv->bsd_type || + bsdr->bsd_svc != bsdv->bsd_svc) { + CERROR("bulk security descriptor mismatch: " + "(%u,%u,%u) != (%u,%u,%u)\n", + bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc, + bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc); + RETURN(-EPROTO); + } + + LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + /* + * in privacy mode if return success, make sure bd_nob_transferred + * is the actual size of the clear text, otherwise upper layer + * may be surprised. + */ + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) { + CERROR("server reported bulk i/o failure\n"); + RETURN(-EIO); + } + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + desc->bd_nob_transferred = desc->bd_nob; + } else { + /* + * bulk read, upon return success, bd_nob_transferred is + * the size of plain text actually received. 
+ */ + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + int i, nob; + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_len + nob > + desc->bd_nob_transferred) { + BD_GET_KIOV(desc, i).kiov_len = + desc->bd_nob_transferred - nob; + } + nob += BD_GET_KIOV(desc, i).kiov_len; + } + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdv); + + maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), + &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to verify bulk read: %x\n", maj); + RETURN(-EACCES); + } + } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) { + desc->bd_nob = bsdv->bsd_nob; + if (desc->bd_nob == 0) + RETURN(0); + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdr); + + maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, + &token, 1); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to decrypt bulk read: %x\n", + maj); + RETURN(-EACCES); + } + + desc->bd_nob_transferred = desc->bd_nob; + } + } + + RETURN(0); +} + +static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc, + struct gss_ctx *mechctx) +{ + int rc; + + if (desc->bd_iov_count == 0) + return 0; + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) + return rc; + + if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE) + return -EACCES; + + return 0; +} + +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + int rc; + ENTRY; + + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx); + if (rc) + CERROR("bulk read: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsd; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsd = grctx->src_reqbsd; + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx); + if (rc) + CERROR("bulk write: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - 
sizeof(*bsdr); + + maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to verify bulk signature: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + if (bsdr->bsd_nob != desc->bd_nob) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("prepared nob %d doesn't match the actual " + "nob %d\n", desc->bd_nob, bsdr->bsd_nob); + RETURN(-EPROTO); + } + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed decrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + + /* mimic gss_cli_ctx_unwrap_bulk */ + desc->bd_nob_transferred = desc->bd_nob; + + break; + } + + RETURN(0); +} + +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, + GET_KIOV(desc), &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + bsdv->bsd_nob = desc->bd_nob; + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk read: failed to allocate encryption " + "pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 1); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c new file mode 100644 index 0000000000000..d1fa9200452ba --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -0,0 +1,443 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_cli_upcall.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +/********************************************** + * gss context init/fini helper * + **********************************************/ + +static +int ctx_init_pack_request(struct obd_import *imp, + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); + if (copy_from_user(p, token, token_size)) { + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + cfs_size_round4(token_size); + + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; +} + +static +int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed, + char __user *outbuf, long outlen) +{ + struct gss_rep_header *ghdr; + __u32 obj_len, round_len; + __u32 status, effective = 0; + + if (msg->lm_bufcount != 3) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + return -EPROTO; + } + + ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("unable to extract gss reply header\n"); + return -EPROTO; + } + + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("invalid gss version %u\n", ghdr->gh_version); + return -EPROTO; + } + + if (outlen < (4 + 2) * 4 + cfs_size_round4(ghdr->gh_handle.len) + + cfs_size_round4(msg->lm_buflens[2])) { + CERROR("output buffer size %ld too small\n", outlen); + return -EFAULT; + } + + status = 0; + effective = 0; + + if (copy_to_user(outbuf, &status, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_major, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_minor, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4)) + return -EFAULT; + outbuf += 4; + effective += 4 * 4; + + /* handle */ + obj_len = ghdr->gh_handle.len; + round_len = (obj_len + 3) & ~3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + /* out token */ + obj_len = msg->lm_buflens[2]; + round_len = (obj_len + 3) & ~3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + return effective; +} + +/* XXX move to where lgssd could see */ +struct lgssd_ioctl_param { + int version; /* in */ + int secid; /* in */ + char __user *uuid; /* in */ + int lustre_svc; /* in */ + uid_t uid; /* in */ + gid_t gid; /* in */ + long send_token_size;/* in */ + char __user *send_token; /* in */ + long reply_buf_size; /* in */ + char __user *reply_buf; /* in */ + long status; /* out */ + long reply_length; /* out */ +}; + +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count) +{ + struct obd_import *imp; + struct ptlrpc_request *req; + struct lgssd_ioctl_param param; + struct obd_device *obd; + char obdname[64]; + long lsize; + int rc; + + if (count != sizeof(param)) { + CERROR("ioctl size %lu, expect %lu, please check lgss_keyring " + "version\n", count, (unsigned long) sizeof(param)); + RETURN(-EINVAL); + } + if (copy_from_user(¶m, buffer, sizeof(param))) { + CERROR("failed copy data from lgssd\n"); + RETURN(-EFAULT); + } + + if (param.version != GSSD_INTERFACE_VERSION) { + CERROR("gssd interface version %d (expect %d)\n", + param.version, GSSD_INTERFACE_VERSION); + RETURN(-EINVAL); + } + + /* take name */ + if (strncpy_from_user(obdname, param.uuid, sizeof(obdname)) <= 0) { + CERROR("Invalid obdname pointer\n"); + RETURN(-EFAULT); + } + + obd = class_name2obd(obdname); + if (!obd) { + CERROR("no such obd %s\n", obdname); + RETURN(-EINVAL); + } + + if 
(unlikely(!obd->obd_set_up)) { + CERROR("obd %s not setup\n", obdname); + RETURN(-EINVAL); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + CERROR("obd %s has stopped\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME)) { + CERROR("obd %s is not a client device\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + spin_unlock(&obd->obd_dev_lock); + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + CERROR("obd %s: import has gone\n", obd->obd_name); + up_read(&obd->u.cli.cl_sem); + RETURN(-EINVAL); + } + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + + if (imp->imp_deactive) { + CERROR("import has been deactivated\n"); + class_import_put(imp); + RETURN(-EINVAL); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION, + SEC_CTX_INIT); + if (req == NULL) { + param.status = -ENOMEM; + goto out_copy; + } + + if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) { + CWARN("original secid %d, now has changed to %d, " + "cancel this negotiation\n", param.secid, + req->rq_cli_ctx->cc_sec->ps_id); + param.status = -EINVAL; + goto out_copy; + } + + /* get token */ + rc = ctx_init_pack_request(imp, req, + param.lustre_svc, + param.uid, param.gid, + param.send_token_size, + param.send_token); + if (rc) { + param.status = rc; + goto out_copy; + } + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* If any _real_ denial be made, we expect server return + * -EACCES reply or return success but indicate gss error + * inside reply messsage. All other errors are treated as + * timeout, caller might try the negotiation repeatedly, + * leave recovery decisions to general ptlrpc layer. + * + * FIXME maybe some other error code shouldn't be treated + * as timeout. */ + param.status = rc; + if (rc != -EACCES) + param.status = -ETIMEDOUT; + goto out_copy; + } + + LASSERT(req->rq_repdata); + lsize = ctx_init_parse_reply(req->rq_repdata, + ptlrpc_rep_need_swab(req), + param.reply_buf, param.reply_buf_size); + if (lsize < 0) { + param.status = (int) lsize; + goto out_copy; + } + + param.status = 0; + param.reply_length = lsize; + +out_copy: + if (copy_to_user(buffer, ¶m, sizeof(param))) + rc = -EFAULT; + else + rc = 0; + + class_import_put(imp); + ptlrpc_req_finished(req); + RETURN(rc); +} + +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + struct obd_import *imp = ctx->cc_sec->ps_import; + struct ptlrpc_request *req; + struct ptlrpc_user_desc *pud; + int rc; + ENTRY; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) { + CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, " + "don't send destroy rpc\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + RETURN(0); + } + + might_sleep(); + + CWARN("%s ctx %p idx %#llx (%u->%s)\n", + sec_is_reverse(ctx->cc_sec) ? 
+ "server finishing reverse" : "client finishing forward", + ctx, gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY; + + req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX); + if (req == NULL) { + CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n", + ctx, ctx->cc_vcred.vc_uid); + GOTO(out, rc = -ENOMEM); + } + + rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI, + NULL, ctx); + if (rc) + GOTO(out_ref, rc); + + /* fix the user desc */ + if (req->rq_pack_udesc) { + /* we rely the fact that this request is in AUTH mode, + * and user_desc at offset 2. */ + pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid; + pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + } + + req->rq_phase = RQ_PHASE_RPC; + rc = ptl_send_rpc(req, 1); + if (rc) + CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc); + +out_ref: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int __init gss_init_cli_upcall(void) +{ + return 0; +} + +void gss_exit_cli_upcall(void) +{ +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c new file mode 100644 index 0000000000000..17fd9cf3c00c1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -0,0 +1,491 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include "gss_internal.h" +#include "gss_crypto.h" + +int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, + const int alg_mode) +{ + int rc; + + kb->kb_tfm = crypto_alloc_blkcipher(alg_name, alg_mode, 0); + if (IS_ERR(kb->kb_tfm)) { + rc = PTR_ERR(kb->kb_tfm); + kb->kb_tfm = NULL; + CERROR("failed to alloc tfm: %s, mode %d: rc = %d\n", alg_name, + alg_mode, rc); + return rc; + } + + rc = crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); + if (rc) { + CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, + kb->kb_key.len, rc); + return rc; + } + + return 0; +} + +void gss_keyblock_free(struct gss_keyblock *kb) +{ + rawobj_free(&kb->kb_key); + if (kb->kb_tfm) + crypto_free_blkcipher(kb->kb_tfm); +} + +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) +{ + return rawobj_dup(&new->kb_key, &kb->kb_key); +} + +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len) +{ + char *p, *q; + p = *ptr; + q = p + len; + if (q > end || q < p) + return -EINVAL; + memcpy(res, p, len); + *ptr = q; + return 0; +} + +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res) +{ + char *p, *q; + __u32 len; + + p = *ptr; + if (gss_get_bytes(&p, end, &len, sizeof(len))) + return -EINVAL; + + q = p + len; + if (q > end || q < p) + return -EINVAL; + + /* Support empty objects */ + if (len != 0) { + OBD_ALLOC_LARGE(res->data, len); + if (!res->data) + return -ENOMEM; + } else { + res->len = len; + res->data = NULL; + return 0; + } + + res->len = len; + memcpy(res->data, p, len); + *ptr = q; + return 0; +} + +int gss_get_keyblock(char **ptr, const char *end, + struct gss_keyblock *kb, __u32 keysize) +{ + char *buf; + int rc; + + OBD_ALLOC_LARGE(buf, keysize); + if (buf == NULL) + return -ENOMEM; + + rc = gss_get_bytes(ptr, end, buf, keysize); + if (rc) { + OBD_FREE_LARGE(buf, keysize); + return rc; + } + + kb->kb_key.len = keysize; + kb->kb_key.data = buf; + return 0; +} + +/* + * Should be used for buffers allocated with k/vmalloc(). + * + * Dispose of @sgt with gss_teardown_sgtable(). + * + * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() + * in cases where a single sg is sufficient. No attempt to reduce the + * number of sgs by squeezing physically contiguous pages together is + * made though, for simplicity. + * + * This function is copied from the ceph filesystem code. 
+ */ +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len) +{ + struct scatterlist *sg; + const bool is_vmalloc = is_vmalloc_addr(buf); + unsigned int off = offset_in_page(buf); + unsigned int chunk_cnt = 1; + unsigned int chunk_len = PAGE_ALIGN(off + buf_len); + int i; + int rc; + + if (buf_len == 0) { + memset(sgt, 0, sizeof(*sgt)); + return -EINVAL; + } + + if (is_vmalloc) { + chunk_cnt = chunk_len >> PAGE_SHIFT; + chunk_len = PAGE_SIZE; + } + + if (chunk_cnt > 1) { + rc = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); + if (rc) + return rc; + } else { + WARN_ON_ONCE(chunk_cnt != 1); + sg_init_table(prealloc_sg, 1); + sgt->sgl = prealloc_sg; + sgt->nents = sgt->orig_nents = 1; + } + + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { + struct page *page; + unsigned int len = min(chunk_len - off, buf_len); + + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + sg_set_page(sg, page, len, off); + + off = 0; + buf += len; + buf_len -= len; + } + + WARN_ON_ONCE(buf_len != 0); + + return 0; +} + +void gss_teardown_sgtable(struct sg_table *sgt) +{ + if (sgt->orig_nents > 1) + sg_free_table(sgt); +} + +int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, + const void *in, void *out, size_t length) +{ + struct blkcipher_desc desc; + struct scatterlist sg; + struct sg_table sg_out; + __u8 local_iv[16] = {0}; + __u32 ret = -EINVAL; + + LASSERT(tfm); + desc.tfm = tfm; + desc.info = local_iv; + desc.flags = 0; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) { + CERROR("output length %zu mismatch blocksize %d\n", + length, crypto_blkcipher_blocksize(tfm)); + goto out; + } + + if (crypto_blkcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", crypto_blkcipher_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm)); + + memcpy(out, in, length); + + ret = gss_setup_sgtable(&sg_out, &sg, out, length); + if (ret != 0) + goto out; + + if (decrypt) + ret = crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length); + else + ret = crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length); + + gss_teardown_sgtable(&sg_out); +out: + return ret; +} + +int gss_digest_hmac(struct crypto_hash *tfm, + rawobj_t *key, + rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) +{ + struct hash_desc desc = { + .tfm = tfm, + .flags = 0, + }; + struct scatterlist sg[1]; + struct sg_table sgt; + int i; + int rc; + + rc = crypto_hash_setkey(tfm, key->data, key->len); + if (rc) + return rc; + + rc = crypto_hash_init(&desc); + if (rc) + return rc; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + return rc; + rc = crypto_hash_update(&desc, sg, msgs[i].len); + if (rc) + return rc; + + gss_teardown_sgtable(&sgt); + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, + iovs[i].kiov_offset); + rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); + if (rc != 0) + return rc; + rc = crypto_hash_update(&desc, sg, sizeof(hdr->len)); + if (rc) + return rc; + + gss_teardown_sgtable(&sgt); + } + + return crypto_hash_final(&desc, cksum->data); +} + +int gss_digest_norm(struct crypto_hash *tfm, + struct 
gss_keyblock *kb, + rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + struct sg_table sgt; + int i; + int rc; + + LASSERT(kb->kb_tfm); + desc.tfm = tfm; + desc.flags = 0; + + rc = crypto_hash_init(&desc); + if (rc) + return rc; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + return rc; + + rc = crypto_hash_update(&desc, sg, msgs[i].len); + if (rc) + return rc; + + gss_teardown_sgtable(&sgt); + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, + iovs[i].kiov_offset); + rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); + if (rc) + return rc; + } + + if (hdr) { + rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); + if (rc != 0) + return rc; + + rc = crypto_hash_update(&desc, sg, sizeof(*hdr)); + if (rc) + return rc; + + gss_teardown_sgtable(&sgt); + } + + rc = crypto_hash_final(&desc, cksum->data); + if (rc) + return rc; + + return gss_crypt_generic(kb->kb_tfm, 0, NULL, cksum->data, + cksum->data, cksum->len); +} + +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) +{ + int padding; + + padding = (blocksize - (msg->len & (blocksize - 1))) & + (blocksize - 1); + if (!padding) + return 0; + + if (msg->len + padding > msg_buflen) { + CERROR("bufsize %u too small: datalen %u, padding %u\n", + msg_buflen, msg->len, padding); + return -EINVAL; + } + + memset(msg->data + msg->len, padding, padding); + msg->len += padding; + return 0; +} + +int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc) +{ + struct blkcipher_desc desc; + struct scatterlist src; + struct scatterlist dst; + struct sg_table sg_dst; + struct sg_table sg_src; + __u8 *buf; + __u32 datalen = 0; + int i, rc; + ENTRY; + + buf = outobj->data; + desc.tfm = tfm; + desc.info = iv; + desc.flags = 0; + + for (i = 0; i < inobj_cnt; i++) { + LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); + + rc = gss_setup_sgtable(&sg_src, &src, inobjs[i].data, + inobjs[i].len); + if (rc != 0) + RETURN(rc); + + rc = gss_setup_sgtable(&sg_dst, &dst, buf, + outobj->len - datalen); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + RETURN(rc); + } + + if (iv) { + if (enc) + rc = crypto_blkcipher_encrypt_iv(&desc, &dst, + &src, + src.length); + else + rc = crypto_blkcipher_decrypt_iv(&desc, &dst, + &src, + src.length); + } else { + if (enc) + rc = crypto_blkcipher_encrypt(&desc, &dst, &src, + src.length); + else + rc = crypto_blkcipher_decrypt(&desc, &dst, &src, + src.length); + } + + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + if (rc) { + CERROR("encrypt error %d\n", rc); + RETURN(rc); + } + + datalen += inobjs[i].len; + buf += inobjs[i].len; + } + + outobj->len = datalen; + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h new file mode 100644 index 0000000000000..ad15cdedd66d5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -0,0 +1,35 @@ +#ifndef PTLRPC_GSS_CRYPTO_H +#define PTLRPC_GSS_CRYPTO_H + +#include "gss_internal.h" + +struct gss_keyblock { + rawobj_t kb_key; + struct crypto_blkcipher *kb_tfm; +}; + +int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, + const int alg_mode); 
+void gss_keyblock_free(struct gss_keyblock *kb); +int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); +int gss_get_bytes(char **ptr, const char *end, void *res, size_t len); +int gss_get_rawobj(char **ptr, const char *end, rawobj_t *res); +int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, + __u32 keysize); +int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len); +void gss_teardown_sgtable(struct sg_table *sgt); +int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, + const void *in, void *out, size_t length); +int gss_digest_hmac(struct crypto_hash *tfm, rawobj_t *key, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum); +int gss_digest_norm(struct crypto_hash *tfm, struct gss_keyblock *kb, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, rawobj_t *cksum); +int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); +int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, + int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, + int enc); + +#endif /* PTLRPC_GSS_CRYPTO_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h new file mode 100644 index 0000000000000..bcf81304ff750 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_err.h @@ -0,0 +1,193 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __PTLRPC_GSS_GSS_ERR_H_ +#define __PTLRPC_GSS_GSS_ERR_H_ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. 
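+ * These flag values follow the standard GSS-API C bindings (RFC 2744), from + * which this header is ultimately derived.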
+ */ +#define GSS_C_DELEG_FLAG (1) +#define GSS_C_MUTUAL_FLAG (2) +#define GSS_C_REPLAY_FLAG (4) +#define GSS_C_SEQUENCE_FLAG (8) +#define GSS_C_CONF_FLAG (16) +#define GSS_C_INTEG_FLAG (32) +#define GSS_C_ANON_FLAG (64) +#define GSS_C_PROT_READY_FLAG (128) +#define GSS_C_TRANS_FLAG (256) + +/* + * Credential usage options + */ +#define GSS_C_BOTH (0) +#define GSS_C_INITIATE (1) +#define GSS_C_ACCEPT (2) + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE (1) +#define GSS_C_MECH_CODE (2) + + +/* + * Define the default Quality of Protection for per-message services. Note + * that an implementation that offers multiple levels of QOP may either reserve + * a value (for example zero, as assumed here) to mean "default protection", or + * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit + * QOP value. However a value of 0 should always be interpreted by a GSSAPI + * implementation as a request for the default protection level. + */ +#define GSS_C_QOP_DEFAULT (0) + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE (0) + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET (24) +#define GSS_C_ROUTINE_ERROR_OFFSET (16) +#define GSS_C_SUPPLEMENTARY_OFFSET (0) +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. 
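+ * + * A status word packs a calling-error byte in bits 24-31, a routine-error + * byte in bits 16-23 and supplementary-info bits in bits 0-15; the *_FIELD() + * macros near the end of this header extract the individual parts.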
+ */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH \ + (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME \ + (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE \ + (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS \ + (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS \ + (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG \ + (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED \ + (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT \ + (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN \ + (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE \ + (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP \ + (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED \ + (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE \ + (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0)) +#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1)) +#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2)) +#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3)) +#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4)) + +/* XXXX these are not part of the GSSAPI C bindings! 
(but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __PTLRPC_GSS_GSS_ERR_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c new file mode 100644 index 0000000000000..3c4e63b992bee --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -0,0 +1,285 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. 
*/ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static +int der_length_size(int length) +{ + if (length < (1 << 7)) + return 1; + else if (length < (1 << 8)) + return 2; +#if (SIZEOF_INT == 2) + else + return 3; +#else + else if (length < (1 << 16)) + return 3; + else if (length < (1 << 24)) + return 4; + else + return 5; +#endif +} + +static +void der_write_length(unsigned char **buf, int length) +{ + if (length < (1 << 7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length) + 127); +#if (SIZEOF_INT > 2) + if (length >= (1 << 24)) + *(*buf)++ = (unsigned char) (length >> 24); + if (length >= (1 << 16)) + *(*buf)++ = (unsigned char) ((length >> 16) & 0xff); +#endif + if (length >= (1 << 8)) + *(*buf)++ = (unsigned char) ((length >> 8) & 0xff); + *(*buf)++ = (unsigned char) (length & 0xff); + } +} + +/* + * returns decoded length, or < 0 on failure. Advances buf and + * decrements bufsize + */ +static +int der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return -1; + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize) - 1)) + return -1; + if (sf > SIZEOF_INT) + return -1; + ret = 0; + for (; sf; sf--) { + ret = (ret << 8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return ret; +} + +/* + * returns the length of a token, given the mech oid and the body size + */ +int g_token_size(rawobj_t *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return (1 + der_length_size(body_size) + body_size); +} + +/* + * fills in a buffer with the token header. The buffer is assumed to + * be the right size. buf is advanced past the token header + */ +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. 
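+ * + * The length octets use the DER definite form produced by der_write_length(): + * lengths below 128 occupy a single byte, while e.g. a length of 300 is + * encoded as 0x82 0x01 0x2c (0x82 announcing that two more length octets + * follow).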
+ */ +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + rawobj_t toid; + int ret = 0; + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize -= toid.len) < 0) + return (G_BAD_TOK_HEADER); + toid.data = buf; + buf += toid.len; + + if (!g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more + * important to return G_BAD_TOK_HEADER if the token header is + * in fact bad + */ + if ((toksize -= 2) < 0) + return (G_BAD_TOK_HEADER); + + if (ret) + return (ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return (ret); +} + +/* + * Given a buffer containing a token, returns a copy of the mech oid in + * the parameter mech. + */ +__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf) +{ + unsigned char *buf = in_buf->data; + int len = in_buf->len; + int ret = 0; + int seqsize; + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &len)) < 0) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + mech->len = *buf++; + + if ((len -= mech->len) < 0) + return (G_BAD_TOK_HEADER); + OBD_ALLOC_LARGE(mech->data, mech->len); + if (!mech->data) + return (G_BUFFER_ALLOC); + memcpy(mech->data, buf, mech->len); + + return ret; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h new file mode 100644 index 0000000000000..95d00f5f7c1a7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -0,0 +1,557 @@ +/* + * Modified from NFSv4 project for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
+ * + * Author: Eric Mei + */ + +#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ +#define __PTLRPC_GSS_GSS_INTERNAL_H_ + +#include +#include + +/* + * rawobj stuff + */ +typedef struct netobj_s { + __u32 len; + __u8 data[0]; +} netobj_t; + +#define NETOBJ_EMPTY ((netobj_t) { 0 }) + +typedef struct rawobj_s { + __u32 len; + __u8 *data; +} rawobj_t; + +#define RAWOBJ_EMPTY ((rawobj_t) { 0, NULL }) + +typedef struct rawobj_buf_s { + __u32 dataoff; + __u32 datalen; + __u32 buflen; + __u8 *buf; +} rawobj_buf_t; + +int rawobj_empty(rawobj_t *obj); +int rawobj_alloc(rawobj_t *obj, char *buf, int len); +void rawobj_free(rawobj_t *obj); +int rawobj_equal(rawobj_t *a, rawobj_t *b); +int rawobj_dup(rawobj_t *dest, rawobj_t *src); +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj); +int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj); + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen); + +/* + * several timeout values. client refresh upcall timeout we using + * default in pipefs implemnetation. + */ +#define __TIMEOUT_DELTA (10) + +#define GSS_SECINIT_RPC_TIMEOUT \ + (obd_timeout < __TIMEOUT_DELTA ? \ + __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA) + +#define GSS_SECFINI_RPC_TIMEOUT (__TIMEOUT_DELTA) +#define GSS_SECSVC_UPCALL_TIMEOUT (GSS_SECINIT_RPC_TIMEOUT) + +/* + * default gc interval + */ +#define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ + +static inline +unsigned long gss_round_ctx_expiry(unsigned long expiry, + unsigned long sec_flags) +{ + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; + + if (get_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; + + return expiry; +} + +/* + * Max encryption element in block cipher algorithms. 
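+ * (16 bytes matches the AES block size; DES and 3DES use 8-byte blocks.)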
+ */ +#define GSS_MAX_CIPHER_BLOCK (16) + +/* + * XXX make it visible of kernel and lgssd/lsvcgssd + */ +enum { + GSSD_INTERFACE_VERSION_V1 = 1, + GSSD_INTERFACE_VERSION_V2 = 2, + GSSD_INTERFACE_VERSION = GSSD_INTERFACE_VERSION_V2, +}; + +#define PTLRPC_GSS_VERSION (1) + + +enum ptlrpc_gss_proc { + PTLRPC_GSS_PROC_DATA = 0, + PTLRPC_GSS_PROC_INIT = 1, + PTLRPC_GSS_PROC_CONTINUE_INIT = 2, + PTLRPC_GSS_PROC_DESTROY = 3, + PTLRPC_GSS_PROC_ERR = 4, +}; + +enum ptlrpc_gss_tgt { + LUSTRE_GSS_TGT_MGS = 0, + LUSTRE_GSS_TGT_MDS = 1, + LUSTRE_GSS_TGT_OSS = 2, +}; + +enum ptlrpc_gss_header_flags { + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, +}; + +static inline +__u32 import_to_gss_svc(struct obd_import *imp) +{ + int cl_sp_to = LUSTRE_SP_ANY; + + if (imp->imp_obd) + cl_sp_to = imp->imp_obd->u.cli.cl_sp_to; + + switch (cl_sp_to) { + case LUSTRE_SP_MDT: + return LUSTRE_GSS_TGT_MDS; + case LUSTRE_SP_OST: + return LUSTRE_GSS_TGT_OSS; + case LUSTRE_SP_MGC: + case LUSTRE_SP_MGS: + return LUSTRE_GSS_TGT_MGS; + case LUSTRE_SP_CLI: + case LUSTRE_SP_ANY: + default: + return 0; + } +} + +/* + * following 3 header must have the same size and offset + */ +struct gss_header { + __u8 gh_version; /* gss version */ + __u8 gh_sp; /* sec part */ + __u16 gh_pad0; + __u32 gh_flags; /* wrap flags */ + __u32 gh_proc; /* proc */ + __u32 gh_seq; /* sequence */ + __u32 gh_svc; /* service */ + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; /* context handle */ +}; + +struct gss_rep_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_seqwin; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +struct gss_err_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +/* + * part of wire context information send from client which be saved and + * used later by server. + */ +struct gss_wire_ctx { + __u32 gw_flags; + __u32 gw_proc; + __u32 gw_seq; + __u32 gw_svc; + rawobj_t gw_handle; +}; + +#define PTLRPC_GSS_MAX_HANDLE_SIZE (8) +#define PTLRPC_GSS_HEADER_SIZE (sizeof(struct gss_header) + \ + PTLRPC_GSS_MAX_HANDLE_SIZE) + + +static inline __u64 gss_handle_to_u64(rawobj_t *handle) +{ + if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE) + return -1; + return *((__u64 *) handle->data); +} + +#define GSS_SEQ_WIN (2048) +#define GSS_SEQ_WIN_MAIN GSS_SEQ_WIN +#define GSS_SEQ_WIN_BACK (128) +#define GSS_SEQ_REPACK_THRESHOLD (GSS_SEQ_WIN_MAIN / 2 + \ + GSS_SEQ_WIN_MAIN / 4) + +struct gss_svc_seq_data { + spinlock_t ssd_lock; + /* + * highest sequence number seen so far, for main and back window + */ + __u32 ssd_max_main; + __u32 ssd_max_back; + /* + * main and back window + * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit + * of ssd_win is nonzero iff sequence number i has been seen already. 
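+ * (Presumably gss_check_seq_num() indexes the bitmaps as bit i % GSS_SEQ_WIN_MAIN + * of ssd_win_main and bit i % GSS_SEQ_WIN_BACK of ssd_win_back; the + * implementation lives outside this header.)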
+ */ + unsigned long ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG]; + unsigned long ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG]; +}; + +struct gss_svc_ctx { + struct gss_ctx *gsc_mechctx; + struct gss_svc_seq_data gsc_seqdata; + rawobj_t gsc_rvs_hdl; + __u32 gsc_rvs_seq; + uid_t gsc_uid; + gid_t gsc_gid; + uid_t gsc_mapped_uid; + unsigned int gsc_usr_root:1, + gsc_usr_mds:1, + gsc_usr_oss:1, + gsc_remote:1, + gsc_reverse:1; +}; + +struct gss_svc_reqctx { + struct ptlrpc_svc_ctx src_base; + /* + * context + */ + struct gss_wire_ctx src_wirectx; + struct gss_svc_ctx *src_ctx; + /* + * record place of bulk_sec_desc in request/reply buffer + */ + struct ptlrpc_bulk_sec_desc *src_reqbsd; + int src_reqbsd_size; + struct ptlrpc_bulk_sec_desc *src_repbsd; + int src_repbsd_size; + /* + * flags + */ + unsigned int src_init:1, + src_init_continue:1, + src_err_notify:1; + int src_reserve_len; +}; + +struct gss_cli_ctx { + struct ptlrpc_cli_ctx gc_base; + __u32 gc_flavor; + __u32 gc_proc; + __u32 gc_win; + atomic_t gc_seq; + rawobj_t gc_handle; + struct gss_ctx *gc_mechctx; + /* handle for the buddy svc ctx */ + rawobj_t gc_svc_handle; +}; + +struct gss_cli_ctx_keyring { + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list *gck_timer; +}; + +struct gss_sec { + struct ptlrpc_sec gs_base; + struct gss_api_mech *gs_mech; + spinlock_t gs_lock; + __u64 gs_rvs_hdl; +}; + +struct gss_sec_pipefs { + struct gss_sec gsp_base; + int gsp_chash_size; /* must be 2^n */ + struct hlist_head gsp_chash[0]; +}; + +/* + * FIXME cleanup the keyring upcall mutexes + */ +#define HAVE_KEYRING_UPCALL_SERIALIZED 1 + +struct gss_sec_keyring { + struct gss_sec gsk_base; + /* + * all contexts listed here. access is protected by sec spinlock. + */ + struct hlist_head gsk_clist; + /* + * specially point to root ctx (only one at a time). access is + * protected by sec spinlock. + */ + struct ptlrpc_cli_ctx *gsk_root_ctx; + /* + * specially serialize upcalls for root context. + */ + struct mutex gsk_root_uc_lock; + +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + struct mutex gsk_uc_lock; /* serialize upcalls */ +#endif +}; + +static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx, struct gss_cli_ctx, gc_base); +} + +static inline +struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx2gctx(ctx), + struct gss_cli_ctx_keyring, gck_base); +} + +static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct gss_sec, gs_base); +} + +static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base); +} + +static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); +} + + +#define GSS_CTX_INIT_MAX_LEN (1024) + +/* + * This only guaranteed be enough for current krb5 des-cbc-crc . We might + * adjust this when new enc type or mech added in. 
+ */ +#define GSS_PRIVBUF_PREFIX_LEN (32) +#define GSS_PRIVBUF_SUFFIX_LEN (32) + +static inline +struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return container_of(ctx, struct gss_svc_reqctx, src_base); +} + +static inline +struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return gss_svc_ctx2reqctx(ctx)->src_ctx; +} + +/* sec_gss.c */ +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred); +int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); + +int gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int segment, int newsize); + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, + struct ptlrpc_request *req); +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx); +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int gss_svc_authorize(struct ptlrpc_request *req); +void gss_svc_free_rs(struct ptlrpc_reply_state *rs); +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx); + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx); + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx); + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed); +netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment); + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx); +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor); +int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set); + +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf); +void gss_sec_destroy_common(struct gss_sec *gsec); +void gss_sec_kill(struct ptlrpc_sec *sec); + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred); +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize); + +/* gss_keyring.c */ +#ifndef HAVE_GSS_KEYRING +static inline int __init gss_init_keyring(void) { return 0; } +static inline void __exit gss_exit_keyring(void) { return; } +#else +int __init gss_init_keyring(void); +void __exit gss_exit_keyring(void); +#endif + +/* gss_pipefs.c */ +#ifndef HAVE_GSS_PIPEFS +static inline int __init gss_init_pipefs(void) { return 0; } +static inline void __exit gss_exit_pipefs(void) { return; } +#else +int __init gss_init_pipefs(void); +void __exit gss_exit_pipefs(void); +#endif + +/* gss_bulk.c */ +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct 
ptlrpc_bulk_desc *desc); +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* gss_generic_token.c */ +int g_token_size(rawobj_t *mech, unsigned int body_size); +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf); +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize); + + +/* gss_cli_upcall.c */ +int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count); +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx); + +int __init gss_init_cli_upcall(void); +void gss_exit_cli_upcall(void); + +/* gss_svc_upcall.c */ +__u64 gss_get_next_ctx_index(void); +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx); +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle); +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx); +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq); +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token); +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw); +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx); +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); + +int __init gss_init_svc_upcall(void); +void gss_exit_svc_upcall(void); + +/* lproc_gss.c */ +void gss_stat_oos_record_cli(int behind); +void gss_stat_oos_record_svc(int phase, int replay); + +int __init gss_init_lproc(void); +void gss_exit_lproc(void); + +/* gss_null_mech.c */ +int __init init_null_module(void); +void cleanup_null_module(void); + +/* gss_krb5_mech.c */ +int __init init_kerberos_module(void); +void cleanup_kerberos_module(void); + +/* gss_sk_mech.c */ +#ifdef HAVE_OPENSSL_SSK +int __init init_sk_module(void); +void cleanup_sk_module(void); +#else +static inline int init_sk_module(void) { return 0; } +static inline void cleanup_sk_module(void) { return; } +#endif /* HAVE_OPENSSL_SSK */ + +/* debug */ +static inline +void __dbg_memdump(char *name, void *ptr, int size) +{ + char *buf, *p = (char *) ptr; + int bufsize = size * 2 + 1, i; + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize); + return; + } + + for (i = 0; i < size; i++) + sprintf(&buf[i+i], "%02x", (__u8) p[i]); + buf[size + size] = '\0'; + LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf); + OBD_FREE(buf, bufsize); +} + +#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c new file mode 100644 index 0000000000000..81aad1ffea6e2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -0,0 +1,1614 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_keyring.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_keyring; +static struct ptlrpc_ctx_ops gss_keyring_ctxops; +static struct key_type gss_key_type; + +static int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx); + +/* + * the timeout is only for the case that upcall child process die abnormally. + * in any other cases it should finally update kernel key. + * + * FIXME we'd better to incorporate the client & server side upcall timeouts + * into the framework of Adaptive Timeouts, but we need to figure out how to + * make sure that kernel knows the upcall processes is in-progress or died + * unexpectedly. + */ +#define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout) + +/**************************************** + * internal helpers * + ****************************************/ + +#define DUMP_PROCESS_KEYRINGS(tsk) \ +{ \ + CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): " \ + "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n", \ + tsk->comm, tsk->pid, tsk->uid, tsk->fsuid, \ + tsk->parent->comm, tsk->parent->pid, \ + tsk->parent->uid, tsk->parent->fsuid, \ + tsk->request_key_auth ? \ + tsk->request_key_auth->serial : 0, \ + key_cred(tsk)->thread_keyring ? \ + key_cred(tsk)->thread_keyring->serial : 0, \ + key_tgcred(tsk)->process_keyring ? \ + key_tgcred(tsk)->process_keyring->serial : 0, \ + key_tgcred(tsk)->session_keyring ? \ + key_tgcred(tsk)->session_keyring->serial : 0, \ + key_cred(tsk)->user->uid_keyring ? \ + key_cred(tsk)->user->uid_keyring->serial : 0, \ + key_cred(tsk)->user->session_keyring ? \ + key_cred(tsk)->user->session_keyring->serial : 0, \ + key_cred(tsk)->jit_keyring \ + ); \ +} + +#define DUMP_KEY(key) \ +{ \ + CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n", \ + key, key->serial, atomic_read(&key->usage), \ + key->uid, key->gid, \ + key->description ? 
key->description : "n/a" \ + ); \ +} + +#define key_cred(tsk) ((tsk)->cred) +#ifdef HAVE_CRED_TGCRED +#define key_tgcred(tsk) ((tsk)->cred->tgcred) +#else +#define key_tgcred(tsk) key_cred(tsk) +#endif + +static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_lock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_unlock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void key_revoke_locked(struct key *key) +{ + set_bit(KEY_FLAG_REVOKED, &key->flags); +} + +static void ctx_upcall_timeout_kr(unsigned long data) +{ + struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data; + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + CWARN("ctx %p, key %p\n", ctx, key); + + LASSERT(key); + + cli_ctx_expire(ctx); + key_revoke_locked(key); +} + +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = gctx_kr->gck_timer; + + LASSERT(timer); + + CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout); + timeout = msecs_to_jiffies(timeout * MSEC_PER_SEC) + + cfs_time_current(); + + init_timer(timer); + timer->expires = timeout; + timer->data = (unsigned long ) ctx; + timer->function = ctx_upcall_timeout_kr; + + add_timer(timer); +} + +/* + * caller should make sure no race with other threads + */ +static +void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = gctx_kr->gck_timer; + + if (timer == NULL) + return; + + CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); + + gctx_kr->gck_timer = NULL; + + del_singleshot_timer_sync(timer); + + OBD_FREE_PTR(timer); +} + +static +struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; + + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; + + OBD_ALLOC_PTR(gctx_kr->gck_timer); + if (gctx_kr->gck_timer == NULL) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } + init_timer(gctx_kr->gck_timer); + + ctx = &gctx_kr->gck_base.gc_base; + + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr->gck_timer); + OBD_FREE_PTR(gctx_kr); + return NULL; + } + + ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT; + clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); /* for the caller */ + + return ctx; +} + +static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + + CDEBUG(D_SEC, "destroying ctx %p\n", ctx); + + /* at this time the association with key has been broken. 
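+ * The key association (gck_key) has presumably been severed already by + * unbind_key_ctx(), which also dropped the key reference; only the context's + * own resources remain to be freed here.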
*/ + LASSERT(sec); + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(gctx_kr->gck_key == NULL); + + ctx_clear_timer_kr(ctx); + LASSERT(gctx_kr->gck_timer == NULL); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx_kr); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + if (sync) { + ctx_destroy_kr(ctx); + } else { + atomic_inc(&ctx->cc_refcount); + sptlrpc_gc_add_ctx(ctx); + } +} + +static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (atomic_dec_and_test(&ctx->cc_refcount)) + ctx_release_kr(ctx, sync); +} + +/* + * key <-> ctx association and rules: + * - ctx might not bind with any key + * - key/ctx binding is protected by key semaphore (if the key present) + * - key and ctx each take a reference of the other + * - ctx enlist/unlist is protected by ctx spinlock + * - never enlist a ctx after it's been unlisted + * - whoever do enlist should also do bind, lock key before enlist: + * - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key + * - whoever do unlist should also do unbind: + * - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key + * - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key + */ + +static inline void spin_lock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_lock(lock); +} + +static inline void spin_unlock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_unlock(lock); +} + +static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + spin_lock_if(&sec->ps_lock, !locked); + + atomic_inc(&ctx->cc_refcount); + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist); + if (is_root) + gsec_kr->gsk_root_ctx = ctx; + + spin_unlock_if(&sec->ps_lock, !locked); +} + +/* + * Note after this get called, caller should not access ctx again because + * it might have been freed, unless caller hold at least one refcount of + * the ctx. + * + * return non-zero if we indeed unlist this ctx. + */ +static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + /* if hashed bit has gone, leave the job to somebody who is doing it */ + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0) + return 0; + + /* drop ref inside spin lock to prevent race with other operations */ + spin_lock_if(&sec->ps_lock, !locked); + + if (gsec_kr->gsk_root_ctx == ctx) + gsec_kr->gsk_root_ctx = NULL; + hlist_del_init(&ctx->cc_cache); + atomic_dec(&ctx->cc_refcount); + + spin_unlock_if(&sec->ps_lock, !locked); + + return 1; +} + +/* + * Get specific payload. Newer kernels support 4 slots. + */ +static void * +key_get_payload(struct key *key, unsigned int index) +{ + void *key_ptr = NULL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + key_ptr = key->payload.data[index]; +#else + if (!index) + key_ptr = key->payload.data; +#endif + return key_ptr; +} + +/* + * Set specific payload. Newer kernels support 4 slots. 
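+ * Without HAVE_KEY_PAYLOAD_DATA_ARRAY only slot 0 (key->payload.data) exists, + * so any non-zero index is rejected with -EINVAL.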
+ */ +static int key_set_payload(struct key *key, unsigned int index, + struct ptlrpc_cli_ctx *ctx) +{ + int rc = -EINVAL; + +#ifdef HAVE_KEY_PAYLOAD_DATA_ARRAY + if (index < 4) { + key->payload.data[index] = ctx; +#else + if (!index) { + key->payload.data = ctx; +#endif + rc = 0; + } + return rc; +} + +/* + * bind a key with a ctx together. + * caller must hold write lock of the key, as well as ref on key & ctx. + */ +static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(atomic_read(&key->usage) > 0); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); + LASSERT(!key_get_payload(key, 0)); + + /* at this time context may or may not in list. */ + key_get(key); + atomic_inc(&ctx->cc_refcount); + ctx2gctx_keyring(ctx)->gck_key = key; + LASSERT(!key_set_payload(key, 0, ctx)); +} + +/* + * unbind a key and a ctx. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(key_get_payload(key, 0) == ctx); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + + /* must revoke the key, or others may treat it as newly created */ + key_revoke_locked(key); + + key_set_payload(key, 0, NULL); + ctx2gctx_keyring(ctx)->gck_key = NULL; + + /* once ctx get split from key, the timer is meaningless */ + ctx_clear_timer_kr(ctx); + + ctx_put_kr(ctx, 1); + key_put(key); +} + +/* + * given a ctx, unbind with its coupled key, if any. + * unbind could only be called once, so we don't worry the key be released + * by someone else. + */ +static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + if (key) { + LASSERT(key_get_payload(key, 0) == ctx); + + key_get(key); + down_write(&key->sem); + unbind_key_ctx(key, ctx); + up_write(&key->sem); + key_put(key); + } +} + +/* + * given a key, unbind with its coupled ctx, if any. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx) + unbind_key_ctx(key, ctx); +} + +/* + * unlist a ctx, and unbind from coupled key + */ +static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_unlist_kr(ctx, 0)) + unbind_ctx_kr(ctx); +} + +/* + * given a key, unlist and unbind with the coupled ctx (if any). + * caller must hold write lock, as well as a ref of the key. + */ +static void kill_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + + if (ctx && ctx_unlist_kr(ctx, 0)) + unbind_key_locked(key); +} + +/* + * caller should hold one ref on contexts in freelist. + */ +static void dispose_ctx_list_kr(struct hlist_head *freelist) +{ + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + + cfs_hlist_for_each_entry_safe(ctx, pos, next, freelist, cc_cache) { + hlist_del_init(&ctx->cc_cache); + + /* reverse ctx: update current seq to buddy svcctx if exist. + * ideally this should be done at gss_cli_ctx_finalize(), but + * the ctx destroy could be delayed by: + * 1) ctx still has reference; + * 2) ctx destroy is asynchronous; + * and reverse import call inval_all_ctx() require this be done + * _immediately_ otherwise newly created reverse ctx might copy + * the very old sequence number from svcctx. 
*/ + gctx = ctx2gctx(ctx); + if (!rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32) atomic_read(&gctx->gc_seq)); + } + + /* we need to wakeup waiting reqs here. the context might + * be forced released before upcall finished, then the + * late-arrived downcall can't find the ctx even. */ + sptlrpc_cli_ctx_wakeup(ctx); + + unbind_ctx_kr(ctx); + ctx_put_kr(ctx, 0); + } +} + +/* + * lookup a root context directly in a sec, return root ctx with a + * reference taken or NULL. + */ +static +struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + + spin_lock(&sec->ps_lock); + + ctx = gsec_kr->gsk_root_ctx; + + if (ctx == NULL && unlikely(sec_is_reverse(sec))) { + struct hlist_node __maybe_unused *node; + struct ptlrpc_cli_ctx *tmp; + + /* reverse ctx, search root ctx in list, choose the one + * with shortest expire time, which is most possibly have + * an established peer ctx at client side. */ + cfs_hlist_for_each_entry(tmp, node, &gsec_kr->gsk_clist, + cc_cache) { + if (ctx == NULL || ctx->cc_expire == 0 || + ctx->cc_expire > tmp->cc_expire) { + ctx = tmp; + /* promote to be root_ctx */ + gsec_kr->gsk_root_ctx = ctx; + } + } + } + + if (ctx) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(!hlist_empty(&gsec_kr->gsk_clist)); + atomic_inc(&ctx->cc_refcount); + } + + spin_unlock(&sec->ps_lock); + + return ctx; +} + +#define RVS_CTX_EXPIRE_NICE (10) + +static +void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *new_ctx, + struct key *key) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *hnode; + struct ptlrpc_cli_ctx *ctx; + cfs_time_t now; + ENTRY; + + LASSERT(sec_is_reverse(sec)); + + spin_lock(&sec->ps_lock); + + now = cfs_time_current_sec(); + + /* set all existing ctxs short expiry */ + cfs_hlist_for_each_entry(ctx, hnode, &gsec_kr->gsk_clist, cc_cache) { + if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) { + ctx->cc_early_expire = 1; + ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE; + } + } + + /* if there's root_ctx there, instead obsolete the current + * immediately, we leave it continue operating for a little while. + * hopefully when the first backward rpc with newest ctx send out, + * the client side already have the peer ctx well established. */ + ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 
0 : 1, 1); + + if (key) + bind_key_ctx(key, new_ctx); + + spin_unlock(&sec->ps_lock); +} + +static void construct_key_desc(void *buf, int bufsize, + struct ptlrpc_sec *sec, uid_t uid) +{ + snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id); + ((char *)buf)[bufsize - 1] = '\0'; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_keyring *gsec_kr; + ENTRY; + + OBD_ALLOC(gsec_kr, sizeof(*gsec_kr)); + if (gsec_kr == NULL) + RETURN(NULL); + + INIT_HLIST_HEAD(&gsec_kr->gsk_clist); + gsec_kr->gsk_root_ctx = NULL; + mutex_init(&gsec_kr->gsk_root_uc_lock); +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_init(&gsec_kr->gsk_uc_lock); +#endif + + if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring, + imp, svcctx, sf)) + goto err_free; + + if (svcctx != NULL && + sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) { + gss_sec_destroy_common(&gsec_kr->gsk_base); + goto err_free; + } + + RETURN(&gsec_kr->gsk_base.gs_base); + +err_free: + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); + RETURN(NULL); +} + +static +void gss_sec_destroy_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec *gsec = sec2gsec(sec); + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + LASSERT(hlist_empty(&gsec_kr->gsk_clist)); + LASSERT(gsec_kr->gsk_root_ctx == NULL); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); +} + +static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) +{ + /* except the ROOTONLY flag, treat it as root user only if real uid + * is 0, euid/fsuid being 0 are handled as setuid scenarios */ + if (sec_is_rootonly(sec) || (vcred->vc_uid == 0)) + return 1; + else + return 0; +} + +/* + * unlink request key from it's ring, which is linked during request_key(). + * sadly, we have to 'guess' which keyring it's linked to. + * + * FIXME this code is fragile, depend on how request_key_link() is implemented. 
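+ * The switch below is presumed to mirror the jit_keyring preference order the + * kernel itself uses in request_key(), falling through to the next keyring + * whenever the more specific one is absent.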
+ */ +static void request_key_unlink(struct key *key) +{ + struct task_struct *tsk = current; + struct key *ring; + + switch (key_cred(tsk)->jit_keyring) { + case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_THREAD_KEYRING: + ring = key_get(key_cred(tsk)->thread_keyring); + if (ring) + break; + case KEY_REQKEY_DEFL_PROCESS_KEYRING: + ring = key_get(key_tgcred(tsk)->process_keyring); + if (ring) + break; + case KEY_REQKEY_DEFL_SESSION_KEYRING: + rcu_read_lock(); + ring = key_get(rcu_dereference(key_tgcred(tsk) + ->session_keyring)); + rcu_read_unlock(); + if (ring) + break; + case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: + ring = key_get(key_cred(tsk)->user->session_keyring); + break; + case KEY_REQKEY_DEFL_USER_KEYRING: + ring = key_get(key_cred(tsk)->user->uid_keyring); + break; + case KEY_REQKEY_DEFL_GROUP_KEYRING: + default: + LBUG(); + } + + LASSERT(ring); + key_unlink(ring, key); + key_put(ring); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct obd_import *imp = sec->ps_import; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + unsigned int is_root = 0, create_new = 0; + struct key *key; + char desc[24]; + char *coinfo; + int coinfo_size; + const char *sec_part_flags = ""; + char svc_flag = '-'; + ENTRY; + + LASSERT(imp != NULL); + + is_root = user_is_root(sec, vcred); + + /* a little bit optimization for root context */ + if (is_root) { + ctx = sec_lookup_root_ctx_kr(sec); + /* + * Only lookup directly for REVERSE sec, which should + * always succeed. + */ + if (ctx || sec_is_reverse(sec)) + RETURN(ctx); + } + + LASSERT(create != 0); + + /* for root context, obtain lock and check again, this time hold + * the root upcall lock, make sure nobody else populated new root + * context after last check. */ + if (is_root) { + mutex_lock(&gsec_kr->gsk_root_uc_lock); + + ctx = sec_lookup_root_ctx_kr(sec); + if (ctx) + goto out; + + /* update reverse handle for root user */ + sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index(); + + switch (sec->ps_part) { + case LUSTRE_SP_MDT: + sec_part_flags = "m"; + break; + case LUSTRE_SP_OST: + sec_part_flags = "o"; + break; + case LUSTRE_SP_MGC: + sec_part_flags = "rmo"; + break; + case LUSTRE_SP_CLI: + sec_part_flags = "r"; + break; + case LUSTRE_SP_MGS: + default: + LBUG(); + } + + switch (SPTLRPC_FLVR_SVC(sec->ps_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + svc_flag = 'n'; + break; + case SPTLRPC_SVC_AUTH: + svc_flag = 'a'; + break; + case SPTLRPC_SVC_INTG: + svc_flag = 'i'; + break; + case SPTLRPC_SVC_PRIV: + svc_flag = 'p'; + break; + default: + LBUG(); + } + } + + /* in case of setuid, key will be constructed as owner of fsuid/fsgid, + * but we do authentication based on real uid/gid. the key permission + * bits will be exactly as POS_ALL, so only processes who subscribed + * this key could have the access, although the quota might be counted + * on others (fsuid/fsgid). + * + * keyring will use fsuid/fsgid as upcall parameters, so we have to + * encode real uid/gid into callout info. 
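+ * + * A callout string for a regular user context might look like (values purely + * illustrative): "5:krb5:1000:1000::-:1:<peer_nid>:<target_obd>:<self_nid>:<pid>", + * following the secid:mech:uid:gid:sec_flags:svc_flag:svc_type:peer_nid: + * target_uuid:self_nid:pid layout built below.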
+ */ + + /* But first we need to make sure the obd type is supported */ + if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MGC_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSP_NAME)) { + CERROR("obd %s is not a supported device\n", + imp->imp_obd->obd_name); + GOTO(out, ctx = NULL); + } + + construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid); + + /* callout info format: + * secid:mech:uid:gid:sec_flags:svc_flag:svc_type:peer_nid:target_uuid: + * self_nid:pid + */ + coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64; + OBD_ALLOC(coinfo, coinfo_size); + if (coinfo == NULL) + goto out; + + /* Last callout parameter is pid of process whose namespace will be used + * for credentials' retrieval. + * For user's credentials (in which case sec_part_flags is empty), use + * current PID instead of import's reference PID to get reference + * namespace. */ + snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%c:%d:%#llx:%s:%#llx:%d", + sec->ps_id, sec2gsec(sec)->gs_mech->gm_name, + vcred->vc_uid, vcred->vc_gid, + sec_part_flags, svc_flag, import_to_gss_svc(imp), + imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name, + imp->imp_connection->c_self, + sec_part_flags[0] == '\0' ? + current_pid() : imp->imp_sec_refpid); + + CDEBUG(D_SEC, "requesting key for %s\n", desc); + + keyring_upcall_lock(gsec_kr); + key = request_key(&gss_key_type, desc, coinfo); + keyring_upcall_unlock(gsec_kr); + + OBD_FREE(coinfo, coinfo_size); + + if (IS_ERR(key)) { + CERROR("failed request key: %ld\n", PTR_ERR(key)); + goto out; + } + CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc); + + /* once payload.data was pointed to a ctx, it never changes until + * we de-associate them; but parallel request_key() may return + * a key with payload.data == NULL at the same time. so we still + * need wirtelock of key->sem to serialize them. */ + down_write(&key->sem); + + ctx = key_get_payload(key, 0); + if (likely(ctx)) { + LASSERT(atomic_read(&ctx->cc_refcount) >= 1); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); + LASSERT(atomic_read(&key->usage) >= 2); + + /* simply take a ref and return. it's upper layer's + * responsibility to detect & replace dead ctx. */ + atomic_inc(&ctx->cc_refcount); + } else { + /* pre initialization with a cli_ctx. this can't be done in + * key_instantiate() because we'v no enough information + * there. */ + ctx = ctx_create_kr(sec, vcred); + if (ctx != NULL) { + ctx_enlist_kr(ctx, is_root, 0); + bind_key_ctx(key, ctx); + + ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT); + + CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n", + key, ctx, sec); + } else { + /* we'd prefer to call key_revoke(), but we more like + * to revoke it within this key->sem locked period. */ + key_revoke_locked(key); + } + + create_new = 1; + } + + up_write(&key->sem); + + if (is_root && create_new) + request_key_unlink(key); + + key_put(key); +out: + if (is_root) + mutex_unlock(&gsec_kr->gsk_root_uc_lock); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + ctx_release_kr(ctx, sync); +} + +/* + * flush context of normal user, we must resort to keyring itself to find out + * contexts which belong to me. 
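+ * (i.e. request_key() is called with the same key description the context was + * created under, and whatever key comes back is killed and unlinked.)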
+ * + * Note here we suppose only to flush _my_ context, the "uid" will + * be ignored in the search. + */ +static +void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct key *key; + char desc[24]; + + /* nothing to do for reverse or rootonly sec */ + if (sec_is_reverse(sec) || sec_is_rootonly(sec)) + return; + + construct_key_desc(desc, sizeof(desc), sec, uid); + + /* there should be only one valid key, but we put it in the + * loop in case of any weird cases */ + for (;;) { + key = request_key(&gss_key_type, desc, NULL); + if (IS_ERR(key)) { + CDEBUG(D_SEC, "No more key found for current user\n"); + break; + } + + down_write(&key->sem); + + kill_key_locked(key); + + /* kill_key_locked() should usually revoke the key, but we + * revoke it again to make sure, e.g. some case the key may + * not well coupled with a context. */ + key_revoke_locked(key); + + up_write(&key->sem); + + request_key_unlink(key); + + key_put(key); + } +} + +/* + * flush context of root or all, we iterate through the list. + */ +static +void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec, uid_t uid, int grace, + int force) +{ + struct gss_sec_keyring *gsec_kr; + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + gsec_kr = sec2gsec_keyring(sec); + + spin_lock(&sec->ps_lock); + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_kr->gsk_clist, cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + /* at this moment there's at least 2 base reference: + * key association and in-list. */ + if (atomic_read(&ctx->cc_refcount) > 2) { + if (!force) + continue; + CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n", + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), + atomic_read(&ctx->cc_refcount) - 2); + } + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + atomic_inc(&ctx->cc_refcount); + + if (ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + ENTRY; + + CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n", + sec, atomic_read(&sec->ps_refcount), + atomic_read(&sec->ps_nctx), + uid, grace, force); + + if (uid != -1 && uid != 0) + flush_user_ctx_cache_kr(sec, uid, grace, force); + else + flush_spec_ctx_cache_kr(sec, uid, grace, force); + + RETURN(0); +} + +static +void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + CWARN("running gc\n"); + + spin_lock(&sec->ps_lock); + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_kr->gsk_clist, cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + atomic_inc(&ctx->cc_refcount); + + if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + CWARN("unhashed ctx %p\n", ctx); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; + return; 
+} + +static +int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time_t now = cfs_time_current_sec(); + ENTRY; + + spin_lock(&sec->ps_lock); + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_kr->gsk_clist, cc_cache) { + struct key *key; + char flags_str[40]; + char mech[40]; + + gctx = ctx2gctx(ctx); + key = ctx2gctx_keyring(ctx)->gck_key; + + gss_cli_ctx_flags2str(ctx->cc_flags, + flags_str, sizeof(flags_str)); + + if (gctx->gc_mechctx) + lgss_display(gctx->gc_mechctx, mech, sizeof(mech)); + else + snprintf(mech, sizeof(mech), "N/A"); + mech[sizeof(mech) - 1] = '\0'; + + seq_printf(seq, "%p: uid %u, ref %d, expire %lu(%+ld), fl %s, " + "seq %d, win %u, key %08x(ref %d), " + "hdl %#llx:%#llx, mech: %s\n", + ctx, ctx->cc_vcred.vc_uid, + atomic_read(&ctx->cc_refcount), + ctx->cc_expire, + ctx->cc_expire ? ctx->cc_expire - now : 0, + flags_str, + atomic_read(&gctx->gc_seq), + gctx->gc_win, + key ? key->serial : 0, + key ? atomic_read(&key->usage) : 0, + gss_handle_to_u64(&gctx->gc_handle), + gss_handle_to_u64(&gctx->gc_svc_handle), + mech); + } + spin_unlock(&sec->ps_lock); + + RETURN(0); +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) +{ + /* upcall is already on the way */ + return 0; +} + +static +int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + if (cli_ctx_check_death(ctx)) { + kill_ctx_kr(ctx); + return 1; + } + + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + cli_ctx_expire(ctx); + kill_ctx_kr(ctx); +} + +/**************************************** + * (reverse) service * + ****************************************/ + +/* + * reverse context could have nothing to do with keyrings. here we still keep + * the version which bind to a key, for future reference. + */ +#define HAVE_REVERSE_CTX_NOKEY + +#ifdef HAVE_REVERSE_CTX_NOKEY + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx; + struct vfs_cred vcred = { .vc_uid = 0 }; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) + return -ENOMEM; + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + + ctx_put_kr(cli_ctx, 1); + return rc; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL); + + ctx_put_kr(cli_ctx, 1); + + return 0; +} + +#else /* ! 
HAVE_REVERSE_CTX_NOKEY */ + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx = NULL; + struct key *key; + struct vfs_cred vcred = { .vc_uid = 0 }; + char desc[64]; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + CWARN("called\n"); + + construct_key_desc(desc, sizeof(desc), sec, 0); + + key = key_alloc(&gss_key_type, desc, 0, 0, + KEY_POS_ALL | KEY_USR_ALL, 1); + if (IS_ERR(key)) { + CERROR("failed to alloc key: %ld\n", PTR_ERR(key)); + return PTR_ERR(key); + } + + rc = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (rc) { + CERROR("failed to instantiate key: %d\n", rc); + goto err_revoke; + } + + down_write(&key->sem); + + LASSERT(!key_get_payload(key, 0)); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) { + rc = -ENOMEM; + goto err_up; + } + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + goto err_put; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, key); + + ctx_put_kr(cli_ctx, 1); + up_write(&key->sem); + + rc = 0; + CWARN("ok!\n"); +out: + key_put(key); + return rc; + +err_put: + ctx_put_kr(cli_ctx, 1); +err_up: + up_write(&key->sem); +err_revoke: + key_revoke(key); + goto out; +} + +#endif /* HAVE_REVERSE_CTX_NOKEY */ + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_kr(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_keyring, req); +} + +static +int gss_svc_install_rctx_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + + rc = sec_install_rctx_kr(sec, svc_ctx); + sptlrpc_sec_put(sec); + + return rc; +} + +/**************************************** + * key apis * + ****************************************/ + +static +#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +int gss_kt_instantiate(struct key *key, struct key_preparsed_payload *prep) +{ + const void *data = prep->data; + size_t datalen = prep->datalen; +#else +int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) +{ +#endif + int rc; + ENTRY; + + if (data != NULL || datalen != 0) { + CERROR("invalid: data %p, len %lu\n", data, (long)datalen); + RETURN(-EINVAL); + } + + if (key_get_payload(key, 0)) { + CERROR("key already have payload\n"); + RETURN(-EINVAL); + } + + /* link the key to session keyring, so following context negotiation + * rpc fired from user space could find this key. This will be unlinked + * automatically when upcall processes die. + * + * we can't do this through keyctl from userspace, because the upcall + * might be neither possessor nor owner of the key (setuid). + * + * the session keyring is created upon upcall, and don't change all + * the way until upcall finished, so rcu lock is not needed here. + */ + LASSERT(key_tgcred(current)->session_keyring); + + lockdep_off(); + rc = key_link(key_tgcred(current)->session_keyring, key); + lockdep_on(); + if (unlikely(rc)) { + CERROR("failed to link key %08x to keyring %08x: %d\n", + key->serial, + key_tgcred(current)->session_keyring->serial, rc); + RETURN(rc); + } + + CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, + key_get_payload(key, 0)); + RETURN(0); +} + +/* + * called with key semaphore write locked. it means we can operate + * on the context without fear of loosing refcount. 
+ */ +static +#ifdef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +int gss_kt_update(struct key *key, struct key_preparsed_payload *prep) +{ + const void *data = prep->data; + __u32 datalen32 = (__u32) prep->datalen; +#else +int gss_kt_update(struct key *key, const void *data, size_t datalen) +{ + __u32 datalen32 = (__u32) datalen; +#endif + struct ptlrpc_cli_ctx *ctx = key_get_payload(key, 0); + struct gss_cli_ctx *gctx; + rawobj_t tmpobj = RAWOBJ_EMPTY; + int rc; + ENTRY; + + if (data == NULL || datalen32 == 0) { + CWARN("invalid: data %p, len %lu\n", data, (long)datalen32); + RETURN(-EINVAL); + } + + /* if upcall finished negotiation too fast (mostly likely because + * of local error happened) and call kt_update(), the ctx + * might be still NULL. but the key will finally be associate + * with a context, or be revoked. if key status is fine, return + * -EAGAIN to allow userspace sleep a while and call again. */ + if (ctx == NULL) { + CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n", + key, key->serial, key->flags); + + rc = key_validate(key); + if (rc == 0) + RETURN(-EAGAIN); + else + RETURN(rc); + } + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + ctx_clear_timer_kr(ctx); + + /* don't proceed if already refreshed */ + if (cli_ctx_is_refreshed(ctx)) { + CWARN("ctx already done refresh\n"); + RETURN(0); + } + + sptlrpc_cli_ctx_get(ctx); + gctx = ctx2gctx(ctx); + + rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win, + sizeof(gctx->gc_win)); + if (rc) { + CERROR("failed extract seq_win\n"); + goto out; + } + + if (gctx->gc_win == 0) { + __u32 nego_rpc_err, nego_gss_err; + + rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err, + sizeof(nego_rpc_err)); + if (rc) { + CERROR("cannot extract RPC: rc = %d\n", rc); + goto out; + } + + rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err, + sizeof(nego_gss_err)); + if (rc) { + CERROR("failed to extract gss rc = %d\n", rc); + goto out; + } + + CERROR("negotiation: rpc err %d, gss err %x\n", + nego_rpc_err, nego_gss_err); + + rc = nego_rpc_err ? nego_rpc_err : -EACCES; + } else { + rc = rawobj_extract_local_alloc(&gctx->gc_handle, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract handle\n"); + goto out; + } + + rc = rawobj_extract_local(&tmpobj, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract mech\n"); + goto out; + } + + rc = lgss_import_sec_context(&tmpobj, + sec2gsec(ctx->cc_sec)->gs_mech, + &gctx->gc_mechctx); + if (rc != GSS_S_COMPLETE) + CERROR("failed import context\n"); + else + rc = 0; + } +out: + /* we don't care what current status of this ctx, even someone else + * is operating on the ctx at the same time. we just add up our own + * opinions here. */ + if (rc == 0) { + gss_cli_ctx_uptodate(gctx); + } else { + /* this will also revoke the key. has to be done before + * wakeup waiters otherwise they can find the stale key */ + kill_key_locked(key); + + cli_ctx_expire(ctx); + + if (rc != -ERESTART) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } + + /* let user space think it's a success */ + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +#ifndef HAVE_KEY_MATCH_DATA +static int +gss_kt_match(const struct key *key, const void *desc) +{ + return strcmp(key->description, (const char *) desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} +#else /* ! 
HAVE_KEY_MATCH_DATA */ +static bool +gss_kt_match(const struct key *key, const struct key_match_data *match_data) +{ + const char *desc = match_data->raw_data; + + return strcmp(key->description, desc) == 0 && + !test_bit(KEY_FLAG_REVOKED, &key->flags); +} + +/* + * Preparse the match criterion. + */ +static int gss_kt_match_preparse(struct key_match_data *match_data) +{ + match_data->lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT; + match_data->cmp = gss_kt_match; + return 0; +} +#endif /* HAVE_KEY_MATCH_DATA */ + +static +void gss_kt_destroy(struct key *key) +{ + ENTRY; + LASSERT(!key_get_payload(key, 0)); + CDEBUG(D_SEC, "destroy key %p\n", key); + EXIT; +} + +static +void gss_kt_describe(const struct key *key, struct seq_file *s) +{ + if (key->description == NULL) + seq_puts(s, "[null]"); + else + seq_puts(s, key->description); +} + +static struct key_type gss_key_type = +{ + .name = "lgssc", + .def_datalen = 0, + .instantiate = gss_kt_instantiate, + .update = gss_kt_update, +#ifdef HAVE_KEY_MATCH_DATA + .match_preparse = gss_kt_match_preparse, +#else + .match = gss_kt_match, +#endif + .destroy = gss_kt_destroy, + .describe = gss_kt_describe, +}; + +/**************************************** + * lustre gss keyring policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_keyring_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_kr, + .validate = gss_cli_ctx_validate_kr, + .die = gss_cli_ctx_die_kr, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_keyring_cops = { + .create_sec = gss_sec_create_kr, + .destroy_sec = gss_sec_destroy_kr, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_kr, + .release_ctx = gss_sec_release_ctx_kr, + .flush_ctx_cache = gss_sec_flush_ctx_cache_kr, + .gc_ctx = gss_sec_gc_ctx_kr, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, + .display = gss_sec_display_kr, +}; + +static struct ptlrpc_sec_sops gss_sec_keyring_sops = { + .accept = gss_svc_accept_kr, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .prep_bulk = gss_svc_prep_bulk, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_kr, +}; + +static struct ptlrpc_sec_policy gss_policy_keyring = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.keyring", + .sp_policy = SPTLRPC_POLICY_GSS, + .sp_cops = &gss_sec_keyring_cops, + .sp_sops = &gss_sec_keyring_sops, +}; + + +int __init gss_init_keyring(void) +{ + int rc; + + rc = register_key_type(&gss_key_type); + if (rc) { + CERROR("failed to register keyring type: %d\n", rc); + return rc; + } + + rc = sptlrpc_register_policy(&gss_policy_keyring); + if (rc) { + unregister_key_type(&gss_key_type); + return rc; + } + + return 0; +} + +void __exit gss_exit_keyring(void) +{ + unregister_key_type(&gss_key_type); + sptlrpc_unregister_policy(&gss_policy_keyring); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h new file mode 100644 index 0000000000000..97ad55e3025c0 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -0,0 +1,160 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/include/linux/sunrpc/gss_krb5_types.h + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#ifndef PTLRPC_GSS_KRB5_H +#define PTLRPC_GSS_KRB5_H + +#include "gss_crypto.h" + +/* + * RFC 4142 + */ + +#define KG_USAGE_ACCEPTOR_SEAL 22 +#define KG_USAGE_ACCEPTOR_SIGN 23 +#define KG_USAGE_INITIATOR_SEAL 24 +#define KG_USAGE_INITIATOR_SIGN 25 + +#define KG_TOK_MIC_MSG 0x0404 +#define KG_TOK_WRAP_MSG 0x0504 + +#define FLAG_SENDER_IS_ACCEPTOR 0x01 +#define FLAG_WRAP_CONFIDENTIAL 0x02 +#define FLAG_ACCEPTOR_SUBKEY 0x04 + +struct krb5_header { + __u16 kh_tok_id; /* token id */ + __u8 kh_flags; /* acceptor flags */ + __u8 kh_filler; /* 0xff */ + __u16 kh_ec; /* extra count */ + __u16 kh_rrc; /* right rotation count */ + __u64 kh_seq; /* sequence number */ + __u8 kh_cksum[0]; /* checksum */ +}; + +struct krb5_ctx { + unsigned int kc_initiate:1, + kc_cfx:1, + kc_seed_init:1, + kc_have_acceptor_subkey:1; + __s32 kc_endtime; + __u8 kc_seed[16]; + __u64 kc_seq_send; + __u64 kc_seq_recv; + __u32 kc_enctype; + struct gss_keyblock kc_keye; /* encryption */ + struct gss_keyblock kc_keyi; /* integrity */ + struct gss_keyblock kc_keyc; /* checksum */ + rawobj_t kc_mech_used; +}; + +enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, + SGN_ALG_DES_MAC = 0x0002, + SGN_ALG_3 = 0x0003, /* not published */ + SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */ + SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004 +}; + +enum seal_alg { + SEAL_ALG_NONE = 0xffff, + SEAL_ALG_DES = 0x0000, + SEAL_ALG_1 = 0x0001, /* not published */ + SEAL_ALG_MICROSOFT_RC4 = 0x0010, /* microsoft w2k; no support */ + SEAL_ALG_DES3KD = 0x0002 +}; + +#define CKSUMTYPE_CRC32 0x0001 +#define CKSUMTYPE_RSA_MD4 0x0002 +#define CKSUMTYPE_RSA_MD4_DES 0x0003 +#define CKSUMTYPE_DESCBC 0x0004 +/* des-mac-k */ +/* rsa-md4-des-k */ +#define CKSUMTYPE_RSA_MD5 0x0007 +#define CKSUMTYPE_RSA_MD5_DES 
0x0008 +#define CKSUMTYPE_NIST_SHA 0x0009 +#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c +#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f +#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 +#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT (39756040L) +#define KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 +#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 +#define ENCTYPE_ARCFOUR_HMAC 0x0017 +#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 +#define ENCTYPE_UNKNOWN 0x01ff + +#endif /* PTLRPC_GSS_KRB5_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c new file mode 100644 index 0000000000000..000d7a8e87b47 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -0,0 +1,1554 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2015, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" +#include "gss_krb5.h" +#include "gss_crypto.h" + +static spinlock_t krb5_seq_lock; + +struct krb5_enctype { + char *ke_dispname; + char *ke_enc_name; /* linux tfm name */ + char *ke_hash_name; /* linux tfm name */ + int ke_enc_mode; /* linux tfm mode */ + int ke_hash_size; /* checksum size */ + int ke_conf_size; /* confounder size */ + unsigned int ke_hash_hmac:1; /* is hmac? */ +}; + +/* + * NOTE: for aes128-cts and aes256-cts, MIT implementation use CTS encryption. + * but currently we simply CBC with padding, because linux doesn't support CTS + * yet. this need to be fixed in the future. + */ +static struct krb5_enctype enctypes[] = { + [ENCTYPE_DES_CBC_RAW] = { /* des-cbc-md5 */ + .ke_dispname = "des-cbc-md5", + .ke_enc_name = "cbc(des)", + .ke_hash_name = "md5", + .ke_hash_size = 16, + .ke_conf_size = 8, + }, + [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ + .ke_dispname = "des3-hmac-sha1", + .ke_enc_name = "cbc(des3_ede)", + .ke_hash_name = "hmac(sha1)", + .ke_hash_size = 20, + .ke_conf_size = 8, + .ke_hash_hmac = 1, + }, + [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ + .ke_dispname = "aes128-cts-hmac-sha1-96", + .ke_enc_name = "cbc(aes)", + .ke_hash_name = "hmac(sha1)", + .ke_hash_size = 12, + .ke_conf_size = 16, + .ke_hash_hmac = 1, + }, + [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ + .ke_dispname = "aes256-cts-hmac-sha1-96", + .ke_enc_name = "cbc(aes)", + .ke_hash_name = "hmac(sha1)", + .ke_hash_size = 12, + .ke_conf_size = 16, + .ke_hash_hmac = 1, + }, + [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ + .ke_dispname = "arcfour-hmac-md5", + .ke_enc_name = "ecb(arc4)", + .ke_hash_name = "hmac(md5)", + .ke_hash_size = 16, + .ke_conf_size = 8, + .ke_hash_hmac = 1, + } +}; + +#define MAX_ENCTYPES sizeof(enctypes)/sizeof(struct krb5_enctype) + +static const char * enctype2str(__u32 enctype) +{ + if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; + + return "unknown"; +} + +static +int krb5_init_keys(struct krb5_ctx *kctx) +{ + struct krb5_enctype *ke; + + if (kctx->kc_enctype >= MAX_ENCTYPES || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } + + ke = &enctypes[kctx->kc_enctype]; + + /* tfm arc4 is stateful, user should alloc-use-free by his own */ + if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC && + gss_keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + /* tfm hmac is stateful, user should alloc-use-free by his own */ + if (ke->ke_hash_hmac == 0 && + gss_keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + if (ke->ke_hash_hmac == 0 && + gss_keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, 
ke->ke_enc_mode)) + return -1; + + return 0; +} + +static +void delete_context_kerberos(struct krb5_ctx *kctx) +{ + rawobj_free(&kctx->kc_mech_used); + + gss_keyblock_free(&kctx->kc_keye); + gss_keyblock_free(&kctx->kc_keyi); + gss_keyblock_free(&kctx->kc_keyc); +} + +static +__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* seed_init flag */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seed_init = (tmp_uint != 0); + + /* seed */ + if (gss_get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed))) + goto out_err; + + /* sign/seal algorithm, not really used now */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + /* end time */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + goto out_err; + + /* seq send */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seq_send = tmp_uint; + + /* mech oid */ + if (gss_get_rawobj(&p, end, &kctx->kc_mech_used)) + goto out_err; + + /* old style enc/seq keys in format: + * - enctype (u32) + * - keysize (u32) + * - keydata + * we decompose them to fit into the new context + */ + + /* enc key */ + if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + if (gss_get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + + /* seq key */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != kctx->kc_enctype) + goto out_err; + + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != keysize) + goto out_err; + + if (gss_get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + /* old style fallback */ + if (gss_keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc)) + goto out_err; + + if (p != end) + goto out_err; + + CDEBUG(D_SEC, "successfully imported rfc1964 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* Flags for version 2 context flags */ +#define KRB5_CTX_FLAG_INITIATOR 0x00000001 +#define KRB5_CTX_FLAG_CFX 0x00000002 +#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 + +static +__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* end time */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + goto out_err; + + /* flags */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint & KRB5_CTX_FLAG_INITIATOR) + kctx->kc_initiate = 1; + if (tmp_uint & KRB5_CTX_FLAG_CFX) + kctx->kc_cfx = 1; + if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) + kctx->kc_have_acceptor_subkey = 1; + + /* seq send */ + if (gss_get_bytes(&p, end, &kctx->kc_seq_send, + sizeof(kctx->kc_seq_send))) + goto out_err; + + /* enctype */ + if (gss_get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + /* size of each key */ + if (gss_get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + /* number of keys - should always be 3 */ + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint != 3) { + CERROR("Invalid number of keys: %u\n", tmp_uint); + goto out_err; + } + + /* ke */ + if (gss_get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + /* ki */ + if (gss_get_keyblock(&p, end, &kctx->kc_keyi, keysize)) + goto out_err; + /* ki */ + if (gss_get_keyblock(&p, end, 
&kctx->kc_keyc, keysize)) + goto out_err; + + CDEBUG(D_SEC, "successfully imported v2 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* + * The whole purpose here is trying to keep user level gss context parsing + * from nfs-utils unchanged as possible as we can, they are not quite mature + * yet, and many stuff still not clear, like heimdal etc. + */ +static +__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf, + struct gss_ctx *gctx) +{ + struct krb5_ctx *kctx; + char *p = (char *)inbuf->data; + char *end = (char *)(inbuf->data + inbuf->len); + unsigned int tmp_uint, rc; + + if (gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) { + CERROR("Fail to read version\n"); + return GSS_S_FAILURE; + } + + /* only support 0, 1 for the moment */ + if (tmp_uint > 2) { + CERROR("Invalid version %u\n", tmp_uint); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(kctx); + if (!kctx) + return GSS_S_FAILURE; + + if (tmp_uint == 0 || tmp_uint == 1) { + kctx->kc_initiate = tmp_uint; + rc = import_context_rfc1964(kctx, p, end); + } else { + rc = import_context_rfc4121(kctx, p, end); + } + + if (rc == 0) + rc = krb5_init_keys(kctx); + + if (rc) { + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); + + return GSS_S_FAILURE; + } + + gctx->internal_ctx_id = kctx; + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, + struct gss_ctx *gctx_new) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_ctx *knew; + + OBD_ALLOC_PTR(knew); + if (!knew) + return GSS_S_FAILURE; + + knew->kc_initiate = kctx->kc_initiate ? 0 : 1; + knew->kc_cfx = kctx->kc_cfx; + knew->kc_seed_init = kctx->kc_seed_init; + knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey; + knew->kc_endtime = kctx->kc_endtime; + + memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed)); + knew->kc_seq_send = kctx->kc_seq_recv; + knew->kc_seq_recv = kctx->kc_seq_send; + knew->kc_enctype = kctx->kc_enctype; + + if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used)) + goto out_err; + + if (gss_keyblock_dup(&knew->kc_keye, &kctx->kc_keye)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi)) + goto out_err; + if (gss_keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc)) + goto out_err; + if (krb5_init_keys(knew)) + goto out_err; + + gctx_new->internal_ctx_id = knew; + CDEBUG(D_SEC, "successfully copied reverse context\n"); + return GSS_S_COMPLETE; + +out_err: + delete_context_kerberos(knew); + OBD_FREE_PTR(knew); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, + unsigned long *endtime) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + *endtime = (unsigned long)((__u32) kctx->kc_endtime); + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_kerberos(void *internal_ctx) +{ + struct krb5_ctx *kctx = internal_ctx; + + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); +} + +/* + * compute (keyed/keyless) checksum against the plain text which appended + * with krb5 wire token header. 
+ */ +static +__s32 krb5_make_checksum(__u32 enctype, + struct gss_keyblock *kb, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) +{ + struct krb5_enctype *ke = &enctypes[enctype]; + struct crypto_hash *tfm; + rawobj_t hdr; + __u32 code = GSS_S_FAILURE; + int rc; + + if (!(tfm = crypto_alloc_hash(ke->ke_hash_name, 0, 0))) { + CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name); + return GSS_S_FAILURE; + } + + cksum->len = crypto_hash_digestsize(tfm); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + goto out_tfm; + } + + hdr.data = (__u8 *)khdr; + hdr.len = sizeof(*khdr); + + if (ke->ke_hash_hmac) + rc = gss_digest_hmac(tfm, &kb->kb_key, + &hdr, msgcnt, msgs, iovcnt, iovs, cksum); + else + rc = gss_digest_norm(tfm, kb, + &hdr, msgcnt, msgs, iovcnt, iovs, cksum); + + if (rc == 0) + code = GSS_S_COMPLETE; +out_tfm: + crypto_free_hash(tfm); + return code; +} + +static void fill_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + + acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR; + + if (privacy) { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); + khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; + khdr->kh_ec = cpu_to_be16(0); + khdr->kh_rrc = cpu_to_be16(0); + } else { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); + khdr->kh_flags = acceptor_flag; + khdr->kh_ec = cpu_to_be16(0xffff); + khdr->kh_rrc = cpu_to_be16(0xffff); + } + + khdr->kh_filler = 0xff; + spin_lock(&krb5_seq_lock); + khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); + spin_unlock(&krb5_seq_lock); +} + +static __u32 verify_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + __u16 tok_id, ec_rrc; + + acceptor_flag = kctx->kc_initiate ? 
FLAG_SENDER_IS_ACCEPTOR : 0; + + if (privacy) { + tok_id = KG_TOK_WRAP_MSG; + ec_rrc = 0x0; + } else { + tok_id = KG_TOK_MIC_MSG; + ec_rrc = 0xffff; + } + + /* sanity checks */ + if (be16_to_cpu(khdr->kh_tok_id) != tok_id) { + CERROR("bad token id\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { + CERROR("bad direction flag\n"); + return GSS_S_BAD_SIG; + } + if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { + CERROR("missing confidential flag\n"); + return GSS_S_BAD_SIG; + } + if (khdr->kh_filler != 0xff) { + CERROR("bad filler\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if (be16_to_cpu(khdr->kh_ec) != ec_rrc || + be16_to_cpu(khdr->kh_rrc) != ec_rrc) { + CERROR("bad EC or RRC\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + return GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 0); + + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) + return GSS_S_FAILURE; + + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +static +__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + __u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + return GSS_S_FAILURE; + } + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) { + CERROR("failed to make checksum\n"); + return GSS_S_FAILURE; + } + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } + + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +/* + * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. 
+ */ +static +int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) +{ + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count); + LASSERT(GET_ENC_KIOV(desc)); + + blocksize = crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + /* encrypt confounder */ + rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, + sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + return rc; + } + + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { + sg_init_table(&src, 1); + sg_set_page(&src, BD_GET_KIOV(desc, i).kiov_page, + (BD_GET_KIOV(desc, i).kiov_len + + blocksize - 1) & + (~(blocksize - 1)), + BD_GET_KIOV(desc, i).kiov_offset); + if (adj_nob) + nob += src.length; + sg_init_table(&dst, 1); + sg_set_page(&dst, BD_GET_ENC_KIOV(desc, i).kiov_page, + src.length, src.offset); + + BD_GET_ENC_KIOV(desc, i).kiov_offset = dst.offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = dst.length; + + rc = crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + return rc; + } + } + + /* encrypt krb5 header */ + rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to encrypt krb5 header: %d\n", rc); + return rc; + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +/* + * desc->bd_nob_transferred is the size of cipher text received. + * desc->bd_nob is the target size of plain text supposed to be. + * + * if adj_nob != 0, we adjust each page's kiov_len to the actual + * plain text size. + * - for client read: we don't know data size for each page, so + * bd_iov[]->kiov_len is set to PAGE_SIZE, but actual data received might + * be smaller, so we need to adjust it according to + * bd_u.bd_kiov.bd_enc_vec[]->kiov_len. + * this means we DO NOT support the situation that server send an odd size + * data in a page which is not the last one. + * - for server write: we knows exactly data size for each page being expected, + * thus kiov_len is accurate already, so we should not adjust it at all. + * and bd_u.bd_kiov.bd_enc_vec[]->kiov_len should be + * round_up(bd_iov[]->kiov_len) which + * should have been done by prep_bulk(). 
+ */ +static +int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) +{ + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count); + LASSERT(GET_ENC_KIOV(desc)); + LASSERT(desc->bd_nob_transferred); + + blocksize = crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } + + /* decrypt head (confounder) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, plain->data, blocksize); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, + sg_src.sgl, blocksize); + + gss_teardown_sgtable(&sg_dst); + gss_teardown_sgtable(&sg_src); + + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + return rc; + } + + for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; + i++) { + if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize + != 0 || + BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize + != 0) { + CERROR("page %d: odd offset %u len %u, blocksize %d\n", + i, BD_GET_ENC_KIOV(desc, i).kiov_offset, + BD_GET_ENC_KIOV(desc, i).kiov_len, + blocksize); + return -EFAULT; + } + + if (adj_nob) { + if (ct_nob + BD_GET_ENC_KIOV(desc, i).kiov_len > + desc->bd_nob_transferred) + BD_GET_ENC_KIOV(desc, i).kiov_len = + desc->bd_nob_transferred - ct_nob; + + BD_GET_KIOV(desc, i).kiov_len = + BD_GET_ENC_KIOV(desc, i).kiov_len; + if (pt_nob + BD_GET_ENC_KIOV(desc, i).kiov_len > + desc->bd_nob) + BD_GET_KIOV(desc, i).kiov_len = + desc->bd_nob - pt_nob; + } else { + /* this should be guaranteed by LNET */ + LASSERT(ct_nob + BD_GET_ENC_KIOV(desc, i). + kiov_len <= + desc->bd_nob_transferred); + LASSERT(BD_GET_KIOV(desc, i).kiov_len <= + BD_GET_ENC_KIOV(desc, i).kiov_len); + } + + if (BD_GET_ENC_KIOV(desc, i).kiov_len == 0) + continue; + + sg_init_table(&src, 1); + sg_set_page(&src, BD_GET_ENC_KIOV(desc, i).kiov_page, + BD_GET_ENC_KIOV(desc, i).kiov_len, + BD_GET_ENC_KIOV(desc, i).kiov_offset); + dst = src; + if (BD_GET_KIOV(desc, i).kiov_len % blocksize == 0) + sg_assign_page(&dst, + BD_GET_KIOV(desc, i).kiov_page); + + rc = crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + return rc; + } + + if (BD_GET_KIOV(desc, i).kiov_len % blocksize != 0) { + memcpy(page_address(BD_GET_KIOV(desc, i).kiov_page) + + BD_GET_KIOV(desc, i).kiov_offset, + page_address(BD_GET_ENC_KIOV(desc, i). 
+ kiov_page) + + BD_GET_KIOV(desc, i).kiov_offset, + BD_GET_KIOV(desc, i).kiov_len); + } + + ct_nob += BD_GET_ENC_KIOV(desc, i).kiov_len; + pt_nob += BD_GET_KIOV(desc, i).kiov_len; + } + + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + return -EFAULT; + } + + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + return -EFAULT; + } + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + BD_GET_KIOV(desc, i++).kiov_len = 0; + + /* decrypt tail (krb5 header) */ + rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) + return rc; + + rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, + sizeof(*khdr)); + if (rc != 0) { + gss_teardown_sgtable(&sg_src); + return rc; + } + + rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + + gss_teardown_sgtable(&sg_src); + gss_teardown_sgtable(&sg_dst); + + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } + + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } + + return 0; +} + +static +__u32 gss_wrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + __u8 local_iv[16] = {0}; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * --------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. 
note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); + + /* padding the message */ + if (gss_add_padding(msg, msg_buflen, blocksize)) + return GSS_S_FAILURE; + + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ + cipher.data = (__u8 *)(khdr + 1); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct crypto_blkcipher *arc4_tfm; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, + &cipher, 1); +arc4_out_tfm: + crypto_free_blkcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + do {} while(0); /* just to avoid compile warning */ + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, + data_desc, &cipher, 1); + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + int blocksize, i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count); + LASSERT(GET_ENC_KIOV(desc)); + LASSERT(kctx->kc_keye.kb_tfm); + + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page); + /* + * 
offset should always start at page boundary of either + * client or server side. + */ + if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { + CERROR("odd offset %d in page %d\n", + BD_GET_KIOV(desc, i).kiov_offset, i); + return GSS_S_FAILURE; + } + + BD_GET_ENC_KIOV(desc, i).kiov_offset = + BD_GET_KIOV(desc, i).kiov_offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = + (BD_GET_KIOV(desc, i).kiov_len + + blocksize - 1) & (~(blocksize - 1)); + } + + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *)token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. 
+ * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksize <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, GET_KIOV(desc), + &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + cipher.data = (__u8 *)(khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksize, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; + __u8 local_iv[16] = {0}; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + + if (bodysize % blocksize) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - 
ke->ke_conf_size); + return GSS_S_FAILURE; + } + + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; + + major = GSS_S_FAILURE; + + cipher_in.data = (__u8 *)(khdr + 1); + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct crypto_blkcipher *arc4_tfm; + + cksum.data = token->data + token->len - ke->ke_hash_size; + cksum.len = ke->ke_hash_size; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (crypto_blkcipher_setkey(arc4_tfm, + arc4_keye.data, arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, + &plain_out, 0); +arc4_out_tfm: + crypto_free_blkcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + cksum = RAWOBJ_EMPTY; + } else { + rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, + &cipher_in, &plain_out, 0); + } + + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); + + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } + + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *)token->data; + + 
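/*
 * Illustrative aside, not part of the patch: the token parsed here is
 * expected to be laid out as | krb5 header | head/tail cipher text |
 * checksum |, and the size check below rejects anything shorter than
 * sizeof(*khdr) + blocksize + sizeof(*khdr) + ke->ke_hash_size. A
 * standalone sketch of that minimum, assuming the aes*-cts entries from
 * the enctypes[] table (16-byte cbc(aes) blocks, 12-byte truncated HMAC):
 */
#include <stdint.h>
#include <stdio.h>

/* mirrors the fixed part of struct krb5_header; the flexible kh_cksum[]
 * tail does not contribute to sizeof */
struct krb5_header_sketch {
	uint16_t kh_tok_id;
	uint8_t  kh_flags;
	uint8_t  kh_filler;
	uint16_t kh_ec;
	uint16_t kh_rrc;
	uint64_t kh_seq;
};

int main(void)
{
	const size_t blocksize = 16;	/* cbc(aes) block size */
	const size_t hash_size = 12;	/* ke_hash_size for aes*-cts-hmac-sha1-96 */
	const size_t hdr = sizeof(struct krb5_header_sketch);	/* 16 bytes */
	const size_t min_token = hdr + blocksize + hdr + hash_size;

	/* prints 16 and 60 under these assumptions */
	printf("krb5 header %zu bytes, minimum bulk token %zu bytes\n",
	       hdr, min_token);
	return 0;
}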
major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksize; + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, + GET_KIOV(desc), + &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + if (memcmp(plain.data + blocksize + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } + + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +int gss_display_kerberos(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int written; + + written = snprintf(buf, bufsize, "krb5 (%s)", + enctype2str(kctx->kc_enctype)); + return written; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_copy_reverse_context = gss_copy_reverse_context_kerberos, + .gss_inquire_context = gss_inquire_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_prep_bulk = gss_prep_bulk_kerberos, + .gss_wrap_bulk = gss_wrap_bulk_kerberos, + .gss_unwrap_bulk = gss_unwrap_bulk_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + .gss_display = gss_display_kerberos, +}; + +static struct subflavor_desc gss_kerberos_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5N, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "krb5n" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5A, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "krb5a" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5I, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "krb5i" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5P, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "krb5p" + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "krb5", + .gm_oid = (rawobj_t) + {9, "\052\206\110\206\367\022\001\002\002"}, + .gm_ops = &gss_kerberos_ops, + .gm_sf_num = 4, + .gm_sfs = gss_kerberos_sfs, +}; + +int __init init_kerberos_module(void) +{ 
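/*
 * Illustrative aside, not part of the patch: gm_oid above carries the raw
 * 9-byte body of the Kerberos V5 GSS-API mechanism OID. A standalone
 * decoder for that base-128 encoding, showing the octal escapes
 * "\052\206\110\206\367\022\001\002\002" spell out 1.2.840.113554.1.2.2:
 */
#include <stdio.h>

int main(void)
{
	/* the same bytes as the gm_oid initializer, written in hex */
	const unsigned char oid[] = {
		0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02
	};
	unsigned long arc = 0;
	size_t i;

	/* the first byte packs the first two arcs as 40 * arc1 + arc2 */
	printf("%d.%d", oid[0] / 40, oid[0] % 40);

	/* remaining arcs are base-128; the high bit marks "more bytes follow" */
	for (i = 1; i < sizeof(oid); i++) {
		arc = (arc << 7) | (oid[i] & 0x7f);
		if (!(oid[i] & 0x80)) {
			printf(".%lu", arc);
			arc = 0;
		}
	}
	printf("\n");	/* prints 1.2.840.113554.1.2.2 */
	return 0;
}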
+ int status; + + spin_lock_init(&krb5_seq_lock); + + status = lgss_mech_register(&gss_kerberos_mech); + if (status) + CERROR("Failed to register kerberos gss mechanism!\n"); + return status; +} + +void cleanup_kerberos_module(void) +{ + lgss_mech_unregister(&gss_kerberos_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c new file mode 100644 index 0000000000000..be66ffde266d4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -0,0 +1,359 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct list_head registered_mechs = LIST_HEAD_INIT(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +int lgss_mech_register(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + CWARN("Register %s mechanism\n", gm->gm_name); + return 0; +} + +void lgss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + CWARN("Unregister %s mechanism\n", gm->gm_name); +} + + +struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +struct gss_api_mech *lgss_name_to_mech(char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (!try_module_get(pos->gm_owner)) + continue; + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +static inline +int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor) +{ + int i; + + for (i = 0; i < gm->gm_sf_num; i++) { + if (gm->gm_sfs[i].sf_subflavor == subflavor) + return 1; + } + return 0; +} + +struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!try_module_get(pos->gm_owner)) + continue; + if (!mech_supports_subflavor(pos, subflavor)) { + module_put(pos->gm_owner); + continue; + } + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +void lgss_mech_put(struct gss_api_mech *gm) +{ + module_put(gm->gm_owner); +} + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +__u32 lgss_import_sec_context(rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; + + (*ctx_id)->mech_type = lgss_mech_get(mech); + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); +} + +__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id, + struct gss_ctx **ctx_id_new) +{ + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; + + LASSERT(mech); + + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; + + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); + + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; +} + +/* + * this interface is much simplified, currently we only need endtime. 
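+ * Like the other lgss_* wrappers below (get_mic, verify_mic, wrap, unwrap
+ * and the bulk variants), it only asserts that the mechanism's ops table is
+ * present and dispatches to the per-mechanism implementation.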
+ */ +__u32 lgss_inquire_context(struct gss_ctx *context_handle, + unsigned long *endtime) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context); + + return context_handle->mech_type->gm_ops + ->gss_inquire_context(context_handle, + endtime); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ +__u32 lgss_get_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_get_mic); + + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ +__u32 lgss_verify_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic); + + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +__u32 lgss_wrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap); + + return context_handle->mech_type->gm_ops + ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token); +} + +__u32 lgss_unwrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap); + + return context_handle->mech_type->gm_ops + ->gss_unwrap(context_handle, gsshdr, token, out_msg); +} + + +__u32 lgss_prep_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk); + + return context_handle->mech_type->gm_ops + ->gss_prep_bulk(context_handle, desc); +} + +__u32 lgss_wrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_wrap_bulk(context_handle, desc, token, adj_nob); +} + +__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_unwrap_bulk(context_handle, desc, token, adj_nob); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. 
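+ * The mechanism-specific context is torn down first, then the module
+ * reference on the mechanism is dropped and the wrapper itself is freed.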
+ * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +__u32 lgss_delete_sec_context(struct gss_ctx **context_handle) +{ + struct gss_api_mech *mech; + + if (!*context_handle) + return GSS_S_NO_CONTEXT; + + CDEBUG(D_SEC, "deleting %p\n", *context_handle); + + mech = (*context_handle)->mech_type; + if ((*context_handle)->internal_ctx_id != NULL) { + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_delete_sec_context); + mech->gm_ops->gss_delete_sec_context( + (*context_handle)->internal_ctx_id); + } + if (mech) + lgss_mech_put(mech); + + OBD_FREE_PTR(*context_handle); + *context_handle = NULL; + return GSS_S_COMPLETE; +} + +int lgss_display(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + LASSERT(ctx); + LASSERT(ctx->mech_type); + LASSERT(ctx->mech_type->gm_ops); + LASSERT(ctx->mech_type->gm_ops->gss_display); + + return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c new file mode 100644 index 0000000000000..fddd3ed3443c1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, Intel Corporation. 
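+ *
+ * The "gssnull" mechanism below provides the GSS API surface with no
+ * cryptographic protection: context import just stores a 64-bit token,
+ * and the wrap/unwrap/MIC and bulk operations are no-ops that return
+ * GSS_S_COMPLETE.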
+ * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +struct null_ctx { + __u64 nc_token; +}; + +static +__u32 gss_import_sec_context_null(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct null_ctx *null_context; + + if (inbuf == NULL || inbuf->data == NULL || + inbuf->len != sizeof(*null_context)) { + CDEBUG(D_SEC, "Invalid input buffer for null context\n"); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(null_context); + if (null_context == NULL) + return GSS_S_FAILURE; + + memcpy(&null_context->nc_token, inbuf->data, inbuf->len); + + gss_context->internal_ctx_id = null_context; + CDEBUG(D_SEC, "successfully imported null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct null_ctx *null_context_old; + struct null_ctx *null_context_new; + + OBD_ALLOC_PTR(null_context_new); + if (null_context_new == NULL) + return GSS_S_FAILURE; + + null_context_old = gss_context_old->internal_ctx_id; + memcpy(null_context_new, null_context_old, sizeof(*null_context_new)); + gss_context_new->internal_ctx_id = null_context_new; + CDEBUG(D_SEC, "successfully copied reverse null context\n"); + + return GSS_S_COMPLETE; +} + +static +__u32 gss_inquire_context_null(struct gss_ctx *gss_context, + unsigned long *endtime) +{ + /* quick timeout for testing purposes */ + *endtime = cfs_time_current_sec() + 60; + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_null(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_null(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_null(void *internal_context) +{ + struct null_ctx *null_context = internal_context; + + OBD_FREE_PTR(null_context); +} + +int gss_display_null(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return snprintf(buf, bufsize, "null"); +} + +static +__u32 gss_get_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static +__u32 gss_verify_mic_null(struct gss_ctx *gss_context, int message_count, + rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token) +{ + return GSS_S_COMPLETE; +} + +static struct gss_api_ops gss_null_ops = { + .gss_import_sec_context = gss_import_sec_context_null, + .gss_copy_reverse_context = gss_copy_reverse_context_null, + .gss_inquire_context = gss_inquire_context_null, + .gss_get_mic = gss_get_mic_null, + .gss_verify_mic = gss_verify_mic_null, + .gss_wrap = gss_wrap_null, + .gss_unwrap = gss_unwrap_null, + .gss_prep_bulk = gss_prep_bulk_null, + .gss_wrap_bulk = 
gss_wrap_bulk_null, + .gss_unwrap_bulk = gss_unwrap_bulk_null, + .gss_delete_sec_context = gss_delete_sec_context_null, + .gss_display = gss_display_null, +}; + +static struct subflavor_desc gss_null_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_GSSNULL, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "gssnull" + }, +}; + +static struct gss_api_mech gss_null_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "gssnull", + .gm_oid = (rawobj_t) { + 12, + "\053\006\001\004\001\311\146\215\126\001\000\000" + }, + .gm_ops = &gss_null_ops, + .gm_sf_num = 1, + .gm_sfs = gss_null_sfs, +}; + +int __init init_null_module(void) +{ + int status; + + status = lgss_mech_register(&gss_null_mech); + if (status) + CERROR("Failed to register null gss mechanism!\n"); + + return status; +} + +void cleanup_null_module(void) +{ + lgss_mech_unregister(&gss_null_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c new file mode 100644 index 0000000000000..016d455040972 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -0,0 +1,1254 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2016, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +struct rpc_clnt; /* for rpc_pipefs */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_pipefs; +static struct ptlrpc_ctx_ops gss_pipefs_ctxops; + +static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx); + +static int gss_sec_pipe_upcall_init(struct gss_sec *gsec) +{ + return 0; +} + +static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec) +{ +} + +/**************************************** + * internal context helpers * + ****************************************/ + +static +struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx; + int rc; + + OBD_ALLOC_PTR(gctx); + if (gctx == NULL) + return NULL; + + rc = gss_cli_ctx_init_common(sec, &gctx->gc_base, + &gss_pipefs_ctxops, vcred); + if (rc) { + OBD_FREE_PTR(gctx); + return NULL; + } + + return &gctx->gc_base; +} + +static +void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash) +{ + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); + hlist_add_head(&ctx->cc_cache, hash); +} + +/* + * caller must hold spinlock + */ +static +void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist) +{ + assert_spin_locked(&ctx->cc_sec->ps_lock); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + + clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + + if (atomic_dec_and_test(&ctx->cc_refcount)) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, freelist); + } else { + hlist_del_init(&ctx->cc_cache); + } +} + +/* + * return 1 if the context is dead. 
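+ * If a freelist is given, a dead context is also unhashed from the cache
+ * (and queued on the freelist once unreferenced) so the caller can destroy
+ * it after dropping the cache spinlock.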
+ */ +static +int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + if (cli_ctx_check_death(ctx)) { + if (freelist) + ctx_unhash_pf(ctx, freelist); + return 1; + } + + return 0; +} + +static inline +int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + + return ctx_check_death_pf(ctx, freelist); +} + +static inline +int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + /* a little bit optimization for null policy */ + if (!ctx->cc_ops->match) + return 1; + + return ctx->cc_ops->match(ctx, vcred); +} + +static +void ctx_list_destroy_pf(struct hlist_head *head) +{ + struct ptlrpc_cli_ctx *ctx; + + while (!hlist_empty(head)) { + ctx = cfs_hlist_entry(head->first, struct ptlrpc_cli_ctx, + cc_cache); + + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, + &ctx->cc_flags) == 0); + + hlist_del_init(&ctx->cc_cache); + ctx_destroy_pf(ctx->cc_sec, ctx); + } +} + +/**************************************** + * context apis * + ****************************************/ + +static +int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_check_death_pf(ctx, NULL)) + return 1; + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + cli_ctx_expire(ctx); + + spin_lock(&ctx->cc_sec->ps_lock); + + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) { + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + LASSERT(atomic_read(&ctx->cc_refcount) > 1); + + hlist_del_init(&ctx->cc_cache); + if (atomic_dec_and_test(&ctx->cc_refcount)) + LBUG(); + } + + spin_unlock(&ctx->cc_sec->ps_lock); +} + +/**************************************** + * reverse context installation * + ****************************************/ + +static inline +unsigned int ctx_hash_index(int hashsize, __u64 key) +{ + return (unsigned int) (key & ((__u64) hashsize - 1)); +} + +static +void gss_sec_ctx_replace_pf(struct gss_sec *gsec, + struct ptlrpc_cli_ctx *new) +{ + struct hlist_node __maybe_unused *pos, *next; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + HLIST_HEAD(freelist); + unsigned int hash; + ENTRY; + + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) new->cc_vcred.vc_uid); + LASSERT(hash < gsec_pf->gsp_chash_size); + + spin_lock(&gsec->gs_base.ps_lock); + + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[hash], cc_cache) { + if (!ctx_match_pf(ctx, &new->cc_vcred)) + continue; + + cli_ctx_expire(ctx); + ctx_unhash_pf(ctx, &freelist); + break; + } + + ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]); + + spin_unlock(&gsec->gs_base.ps_lock); + + ctx_list_destroy_pf(&freelist); + EXIT; +} + +static +int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct vfs_cred vcred; + struct ptlrpc_cli_ctx *cli_ctx; + int rc; + ENTRY; + + vcred.vc_uid = 0; + vcred.vc_gid = 0; + + cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred); + if (!cli_ctx) + RETURN(-ENOMEM); + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx); + RETURN(rc); + } + + gss_sec_ctx_replace_pf(gsec, cli_ctx); + RETURN(0); +} + +static +void 
gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf, + struct hlist_head *freelist) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos; + struct hlist_node *next; + int i; + ENTRY; + + sec = &gsec_pf->gsp_base.gs_base; + + CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec); + + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], cc_cache) + ctx_check_death_locked_pf(ctx, freelist); + } + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + EXIT; +} + +static +struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_pipefs *gsec_pf; + int alloc_size, hash_size, i; + ENTRY; + +#define GSS_SEC_PIPEFS_CTX_HASH_SIZE (32) + + if (ctx || + sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE)) + hash_size = 1; + else + hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE; + + alloc_size = sizeof(*gsec_pf) + + sizeof(struct hlist_head) * hash_size; + + OBD_ALLOC(gsec_pf, alloc_size); + if (!gsec_pf) + RETURN(NULL); + + gsec_pf->gsp_chash_size = hash_size; + for (i = 0; i < hash_size; i++) + INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]); + + if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs, + imp, ctx, sf)) + goto err_free; + + if (ctx == NULL) { + if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base)) + goto err_destroy; + } else { + if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx)) + goto err_destroy; + } + + RETURN(&gsec_pf->gsp_base.gs_base); + +err_destroy: + gss_sec_destroy_common(&gsec_pf->gsp_base); +err_free: + OBD_FREE(gsec_pf, alloc_size); + RETURN(NULL); +} + +static +void gss_sec_destroy_pf(struct ptlrpc_sec *sec) +{ + struct gss_sec_pipefs *gsec_pf; + struct gss_sec *gsec; + + CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + LASSERT(gsec_pf->gsp_chash); + LASSERT(gsec_pf->gsp_chash_size); + + gss_sec_pipe_upcall_fini(gsec); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec, sizeof(*gsec_pf) + + sizeof(struct hlist_head) * gsec_pf->gsp_chash_size); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx = NULL, *new = NULL; + struct hlist_head *hash_head; + struct hlist_node __maybe_unused *pos, *next; + unsigned int hash, gc = 0, found = 0; + HLIST_HEAD(freelist); + ENTRY; + + might_sleep(); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) vcred->vc_uid); + hash_head = &gsec_pf->gsp_chash[hash]; + LASSERT(hash < gsec_pf->gsp_chash_size); + +retry: + spin_lock(&sec->ps_lock); + + /* gc_next == 0 means never do gc */ + if (remove_dead && sec->ps_gc_next && + (ktime_get_real_seconds() > sec->ps_gc_next)) { + gss_ctx_cache_gc_pf(gsec_pf, &freelist); + gc = 1; + } + + cfs_hlist_for_each_entry_safe(ctx, pos, next, hash_head, cc_cache) { + if (gc == 0 && + ctx_check_death_locked_pf(ctx, + remove_dead ? 
&freelist : NULL)) + continue; + + if (ctx_match_pf(ctx, vcred)) { + found = 1; + break; + } + } + + if (found) { + if (new && new != ctx) { + /* lost the race, just free it */ + hlist_add_head(&new->cc_cache, &freelist); + new = NULL; + } + + /* hot node, move to head */ + if (hash_head->first != &ctx->cc_cache) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, hash_head); + } + } else { + /* don't allocate for reverse sec */ + if (sec_is_reverse(sec)) { + spin_unlock(&sec->ps_lock); + RETURN(NULL); + } + + if (new) { + ctx_enhash_pf(new, hash_head); + ctx = new; + } else if (create) { + spin_unlock(&sec->ps_lock); + new = ctx_create_pf(sec, vcred); + if (new) { + clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags); + goto retry; + } + } else { + ctx = NULL; + } + } + + /* hold a ref */ + if (ctx) + atomic_inc(&ctx->cc_refcount); + + spin_unlock(&sec->ps_lock); + + /* the allocator of the context must give the first push to refresh */ + if (new) { + LASSERT(new == ctx); + gss_cli_ctx_refresh_pf(new); + } + + ctx_list_destroy_pf(&freelist); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(hlist_unhashed(&ctx->cc_cache)); + + /* if required async, we must clear the UPTODATE bit to prevent extra + * rpcs during destroy procedure. */ + if (!sync) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + /* destroy this context */ + ctx_destroy_pf(sec, ctx); +} + +/* + * @uid: which user. "-1" means flush all. + * @grace: mark context DEAD, allow graceful destroy like notify + * server side, etc. + * @force: also flush busy entries. + * + * return the number of busy context encountered. + * + * In any cases, never touch "eternal" contexts. 
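+ *
+ * For example, (uid == -1, grace == 1, force == 0) marks every cached
+ * context that is not currently in use DEAD, while busy contexts are only
+ * counted and left in place.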
+ */ +static +int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node __maybe_unused *pos, *next; + HLIST_HEAD(freelist); + int i, busy = 0; + ENTRY; + + might_sleep_if(grace); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + spin_lock(&sec->ps_lock); + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + cfs_hlist_for_each_entry_safe(ctx, pos, next, + &gsec_pf->gsp_chash[i], + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + if (atomic_read(&ctx->cc_refcount) > 1) { + busy++; + if (!force) + continue; + + CWARN("flush busy(%d) ctx %p(%u->%s) by force, " + "grace %d\n", + atomic_read(&ctx->cc_refcount), + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), grace); + } + ctx_unhash_pf(ctx, &freelist); + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, + &ctx->cc_flags); + } + } + spin_unlock(&sec->ps_lock); + + ctx_list_destroy_pf(&freelist); + RETURN(busy); +} + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_pf(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_pipefs, req); +} + +static +int gss_svc_install_rctx_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx); + + sptlrpc_sec_put(sec); + return rc; +} + +/**************************************** + * rpc_pipefs definitions * + ****************************************/ + +#define LUSTRE_PIPE_ROOT "/lustre" +#define LUSTRE_PIPE_KRB5 LUSTRE_PIPE_ROOT"/krb5" + +struct gss_upcall_msg_data { + __u32 gum_seq; + __u32 gum_uid; + __u32 gum_gid; + __u32 gum_svc; /* MDS/OSS... 
*/ + __u64 gum_nid; /* peer NID */ + __u8 gum_obd[64]; /* client obd name */ +}; + +struct gss_upcall_msg { + struct rpc_pipe_msg gum_base; + atomic_t gum_refcount; + struct list_head gum_list; + __u32 gum_mechidx; + struct gss_sec *gum_gsec; + struct gss_cli_ctx *gum_gctx; + struct gss_upcall_msg_data gum_data; +}; + +static atomic_t upcall_seq = ATOMIC_INIT(0); + +static inline +__u32 upcall_get_sequence(void) +{ + return (__u32) atomic_inc_return(&upcall_seq); +} + +enum mech_idx_t { + MECH_KRB5 = 0, + MECH_MAX +}; + +static inline +__u32 mech_name2idx(const char *name) +{ + LASSERT(!strcmp(name, "krb5")); + return MECH_KRB5; +} + +/* pipefs dentries for each mechanisms */ +static struct dentry *de_pipes[MECH_MAX] = { NULL, }; +/* all upcall messgaes linked here */ +static struct list_head upcall_lists[MECH_MAX]; +/* and protected by this */ +static spinlock_t upcall_locks[MECH_MAX]; + +static inline +void upcall_list_lock(int idx) +{ + spin_lock(&upcall_locks[idx]); +} + +static inline +void upcall_list_unlock(int idx) +{ + spin_unlock(&upcall_locks[idx]); +} + +static +void upcall_msg_enlist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_add(&msg->gum_list, &upcall_lists[idx]); + upcall_list_unlock(idx); +} + +static +void upcall_msg_delist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_del_init(&msg->gum_list); + upcall_list_unlock(idx); +} + +/**************************************** + * rpc_pipefs upcall helpers * + ****************************************/ + +static +void gss_release_msg(struct gss_upcall_msg *gmsg) +{ + ENTRY; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + if (!atomic_dec_and_test(&gmsg->gum_refcount)) { + EXIT; + return; + } + + if (gmsg->gum_gctx) { + sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base); + sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1); + gmsg->gum_gctx = NULL; + } + + LASSERT(list_empty(&gmsg->gum_list)); + LASSERT(list_empty(&gmsg->gum_base.list)); + OBD_FREE_PTR(gmsg); + EXIT; +} + +static +void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + assert_spin_locked(&upcall_locks[idx]); + + if (list_empty(&gmsg->gum_list)) + return; + + list_del_init(&gmsg->gum_list); + LASSERT(atomic_read(&gmsg->gum_refcount) > 1); + atomic_dec(&gmsg->gum_refcount); +} + +static +void gss_unhash_msg(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + upcall_list_lock(idx); + gss_unhash_msg_nolock(gmsg); + upcall_list_unlock(idx); +} + +static +void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg) +{ + if (gmsg->gum_gctx) { + struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + sptlrpc_cli_ctx_expire(ctx); + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } +} + +static +struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq) +{ + struct gss_upcall_msg *gmsg; + + upcall_list_lock(mechidx); + list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) { + if (gmsg->gum_data.gum_seq != seq) + continue; + + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + LASSERT(gmsg->gum_mechidx == mechidx); + + atomic_inc(&gmsg->gum_refcount); + upcall_list_unlock(mechidx); + return gmsg; + } + upcall_list_unlock(mechidx); + return NULL; +} + +static +int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return 
-EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} + +/**************************************** + * rpc_pipefs apis * + ****************************************/ + +static +ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + ENTRY; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + RETURN(left); + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + RETURN(mlen); +} + +static +ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(file_inode(filp)); + struct gss_upcall_msg *gss_msg; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx = NULL; + char *buf, *data; + int datalen; + int timeout, rc; + __u32 mechidx, seq, gss_err; + ENTRY; + + mechidx = (__u32) (long) rpci->private; + LASSERT(mechidx < MECH_MAX); + + OBD_ALLOC(buf, mlen); + if (!buf) + RETURN(-ENOMEM); + + if (copy_from_user(buf, src, mlen)) { + CERROR("failed copy user space data\n"); + GOTO(out_free, rc = -EFAULT); + } + data = buf; + datalen = mlen; + + /* data passed down format: + * - seq + * - timeout + * - gc_win / error + * - wire_ctx (rawobj) + * - mech_ctx (rawobj) + */ + if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) { + CERROR("fail to get seq\n"); + GOTO(out_free, rc = -EFAULT); + } + + gss_msg = gss_find_upcall(mechidx, seq); + if (!gss_msg) { + CERROR("upcall %u has aborted earlier\n", seq); + GOTO(out_free, rc = -EINVAL); + } + + gss_unhash_msg(gss_msg); + gctx = gss_msg->gum_gctx; + LASSERT(gctx); + LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0); + + /* timeout is not in use for now */ + if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout))) + GOTO(out_msg, rc = -EFAULT); + + /* lgssd signal an error by gc_win == 0 */ + if (simple_get_bytes(&data, &datalen, &gctx->gc_win, + sizeof(gctx->gc_win))) + GOTO(out_msg, rc = -EFAULT); + + if (gctx->gc_win == 0) { + /* followed by: + * - rpc error + * - gss error + */ + if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc))) + GOTO(out_msg, rc = -EFAULT); + if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err))) + GOTO(out_msg, rc = -EFAULT); + + if (rc == 0 && gss_err == GSS_S_COMPLETE) { + CWARN("both rpc & gss error code not set\n"); + rc = -EPERM; + } + } else { + rawobj_t tmpobj; + + /* handle */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + if (rawobj_dup(&gctx->gc_handle, &tmpobj)) + GOTO(out_msg, rc = -ENOMEM); + + /* mechctx */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + gss_err = lgss_import_sec_context(&tmpobj, + gss_msg->gum_gsec->gs_mech, + &gctx->gc_mechctx); + rc = 0; + } + + if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) { + gss_cli_ctx_uptodate(gctx); + } else { + ctx = &gctx->gc_base; + sptlrpc_cli_ctx_expire(ctx); + if (rc != -ERESTART || gss_err != GSS_S_COMPLETE) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + + CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n", + ctx, ctx->cc_vcred.vc_uid, rc, gss_err, + test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ? 
+ "fatal error" : "non-fatal"); + } + + rc = mlen; + +out_msg: + gss_release_msg(gss_msg); + +out_free: + OBD_FREE(buf, mlen); + /* FIXME + * hack pipefs: always return asked length unless all following + * downcalls might be messed up. */ + rc = mlen; + RETURN(rc); +} + +static +void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + static time64_t ratelimit; + ENTRY; + + LASSERT(list_empty(&msg->list)); + + /* normally errno is >= 0 */ + if (msg->errno >= 0) { + EXIT; + return; + } + + gmsg = container_of(msg, struct gss_upcall_msg, gum_base); + gumd = &gmsg->gum_data; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + CERROR("failed msg %p (seq %u, uid %u, svc %u, nid %#llx, obd %.*s): " + "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd, msg->errno); + + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg(gmsg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + time64_t now = ktime_get_real_seconds(); + + if (now > ratelimit) { + CWARN("upcall timed out, is lgssd running?\n"); + ratelimit = now + 15; + } + } + gss_msg_fail_ctx(gmsg); + gss_release_msg(gmsg); + EXIT; +} + +static +void gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + __u32 idx; + ENTRY; + + idx = (__u32) (long) rpci->private; + LASSERT(idx < MECH_MAX); + + upcall_list_lock(idx); + while (!list_empty(&upcall_lists[idx])) { + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + + gmsg = list_entry(upcall_lists[idx].next, + struct gss_upcall_msg, gum_list); + gumd = &gmsg->gum_data; + LASSERT(list_empty(&gmsg->gum_base.list)); + + CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, " + "nid %#llx, obd %.*s\n", gmsg, + gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd); + + gmsg->gum_base.errno = -EPIPE; + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg_nolock(gmsg); + + gss_msg_fail_ctx(gmsg); + + upcall_list_unlock(idx); + gss_release_msg(gmsg); + upcall_list_lock(idx); + } + upcall_list_unlock(idx); + EXIT; +} + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/**************************************** + * upcall helper functions * + ****************************************/ + +static +int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp; + struct gss_sec *gsec; + struct gss_upcall_msg *gmsg; + int rc = 0; + ENTRY; + + might_sleep(); + + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_import); + LASSERT(ctx->cc_sec->ps_import->imp_obd); + + imp = ctx->cc_sec->ps_import; + if (!imp->imp_connection) { + CERROR("import has no connection set\n"); + RETURN(-EINVAL); + } + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + + OBD_ALLOC_PTR(gmsg); + if (!gmsg) + RETURN(-ENOMEM); + + /* initialize pipefs base msg */ + INIT_LIST_HEAD(&gmsg->gum_base.list); + gmsg->gum_base.data = &gmsg->gum_data; + gmsg->gum_base.len = sizeof(gmsg->gum_data); + gmsg->gum_base.copied = 0; + gmsg->gum_base.errno = 0; + + /* init upcall msg */ + atomic_set(&gmsg->gum_refcount, 1); + gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name); + gmsg->gum_gsec = gsec; + gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx), + struct gss_cli_ctx, gc_base); + gmsg->gum_data.gum_seq = upcall_get_sequence(); + 
gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid; + gmsg->gum_data.gum_gid = 0; /* not used for now */ + gmsg->gum_data.gum_svc = import_to_gss_svc(imp); + gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid; + strlcpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name, + sizeof(gmsg->gum_data.gum_obd)); + + /* This only could happen when sysadmin set it dead/expired + * using lctl by force. */ + if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) { + CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_flags); + + LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE)); + ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR; + + rc = -EIO; + goto err_free; + } + + upcall_msg_enlist(gmsg); + + rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode, + &gmsg->gum_base); + if (rc) { + CERROR("rpc_queue_upcall failed: %d\n", rc); + + upcall_msg_delist(gmsg); + goto err_free; + } + + RETURN(0); +err_free: + OBD_FREE_PTR(gmsg); + RETURN(rc); +} + +static +int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + /* if we are refreshing for root, also update the reverse + * handle index, do not confuse reverse contexts. */ + if (ctx->cc_vcred.vc_uid == 0) { + struct gss_sec *gsec; + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + gsec->gs_rvs_hdl = gss_get_next_ctx_index(); + } + + return gss_ctx_refresh_pf(ctx); +} + +/**************************************** + * lustre gss pipefs policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_pipefs_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_pf, + .validate = gss_cli_ctx_validate_pf, + .die = gss_cli_ctx_die_pf, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_pipefs_cops = { + .create_sec = gss_sec_create_pf, + .destroy_sec = gss_sec_destroy_pf, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_pf, + .release_ctx = gss_sec_release_ctx_pf, + .flush_ctx_cache = gss_sec_flush_ctx_cache_pf, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops gss_sec_pipefs_sops = { + .accept = gss_svc_accept_pf, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_pf, +}; + +static struct ptlrpc_sec_policy gss_policy_pipefs = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.pipefs", + .sp_policy = SPTLRPC_POLICY_GSS_PIPEFS, + .sp_cops = &gss_sec_pipefs_cops, + .sp_sops = &gss_sec_pipefs_sops, +}; + +static +int __init gss_init_pipefs_upcall(void) +{ + struct dentry *de; + + /* pipe dir */ + de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL); + if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) { + CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de)); + return PTR_ERR(de); + } + + /* FIXME hack pipefs: dput will sometimes cause oops during module + * unload and lgssd close the pipe fds. 
*/ + + /* krb5 mechanism */ + de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops, + RPC_PIPE_WAIT_FOR_OPEN); + if (!de || IS_ERR(de)) { + CERROR("failed to make rpc_pipe %s: %ld\n", + LUSTRE_PIPE_KRB5, PTR_ERR(de)); + rpc_rmdir(LUSTRE_PIPE_ROOT); + return PTR_ERR(de); + } + + de_pipes[MECH_KRB5] = de; + INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]); + spin_lock_init(&upcall_locks[MECH_KRB5]); + + return 0; +} + +static +void __exit gss_exit_pipefs_upcall(void) +{ + __u32 i; + + for (i = 0; i < MECH_MAX; i++) { + LASSERT(list_empty(&upcall_lists[i])); + + /* dput pipe dentry here might cause lgssd oops. */ + de_pipes[i] = NULL; + } + + rpc_unlink(LUSTRE_PIPE_KRB5); + rpc_rmdir(LUSTRE_PIPE_ROOT); +} + +int __init gss_init_pipefs(void) +{ + int rc; + + rc = gss_init_pipefs_upcall(); + if (rc) + return rc; + + rc = sptlrpc_register_policy(&gss_policy_pipefs); + if (rc) { + gss_exit_pipefs_upcall(); + return rc; + } + + return 0; +} + +void __exit gss_exit_pipefs(void) +{ + gss_exit_pipefs_upcall(); + sptlrpc_unregister_policy(&gss_policy_pipefs); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c new file mode 100644 index 0000000000000..79930bb67419d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_rawobj.c @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
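+ *
+ * Helpers for the rawobj_t buffer type: allocation, duplication, comparison
+ * and (de)serialization to the 4-byte-rounded little-endian wire format.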
+ * + * lustre/ptlrpc/gss/gss_rawobj.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include + +#include "gss_internal.h" + +int rawobj_empty(rawobj_t *obj) +{ + LASSERT(equi(obj->len, obj->data)); + return (obj->len == 0); +} + +int rawobj_alloc(rawobj_t *obj, char *buf, int len) +{ + LASSERT(obj); + LASSERT(len >= 0); + + obj->len = len; + if (len) { + OBD_ALLOC_LARGE(obj->data, len); + if (!obj->data) { + obj->len = 0; + RETURN(-ENOMEM); + } + memcpy(obj->data, buf, len); + } else + obj->data = NULL; + return 0; +} + +void rawobj_free(rawobj_t *obj) +{ + LASSERT(obj); + + if (obj->len) { + LASSERT(obj->data); + OBD_FREE_LARGE(obj->data, obj->len); + obj->len = 0; + obj->data = NULL; + } else + LASSERT(!obj->data); +} + +int rawobj_equal(rawobj_t *a, rawobj_t *b) +{ + LASSERT(a && b); + + return (a->len == b->len && + (!a->len || !memcmp(a->data, b->data, a->len))); +} + +int rawobj_dup(rawobj_t *dest, rawobj_t *src) +{ + LASSERT(src && dest); + + dest->len = src->len; + if (dest->len) { + OBD_ALLOC_LARGE(dest->data, dest->len); + if (!dest->data) { + dest->len = 0; + return -ENOMEM; + } + memcpy(dest->data, src->data, dest->len); + } else + dest->data = NULL; + return 0; +} + +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + __u32 len; + + LASSERT(obj); + LASSERT(buf); + LASSERT(buflen); + + len = cfs_size_round4(obj->len); + + if (*buflen < 4 + len) { + CERROR("buflen %u < %u\n", *buflen, 4 + len); + return -EINVAL; + } + + *(*buf)++ = cpu_to_le32(obj->len); + memcpy(*buf, obj->data, obj->len); + *buf += (len >> 2); + *buflen -= (4 + len); + + return 0; +} + +static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen, + int alloc, int local) +{ + __u32 len; + + if (*buflen < sizeof(__u32)) { + CERROR("buflen %u\n", *buflen); + return -EINVAL; + } + + obj->len = *(*buf)++; + if (!local) + obj->len = le32_to_cpu(obj->len); + *buflen -= sizeof(__u32); + + if (!obj->len) { + obj->data = NULL; + return 0; + } + + len = local ? 
obj->len : cfs_size_round4(obj->len); + if (*buflen < len) { + CERROR("buflen %u < %u\n", *buflen, len); + obj->len = 0; + return -EINVAL; + } + + if (!alloc) + obj->data = (__u8 *) *buf; + else { + OBD_ALLOC_LARGE(obj->data, obj->len); + if (!obj->data) { + CERROR("fail to alloc %u bytes\n", obj->len); + obj->len = 0; + return -ENOMEM; + } + memcpy(obj->data, *buf, obj->len); + } + + *((char **)buf) += len; + *buflen -= len; + + return 0; +} + +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 0); +} + +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 0); +} + +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 1); +} + +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 1); +} + +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = netobj->len; + rawobj->data = netobj->data; + return 0; +} + +int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = 0; + rawobj->data = NULL; + + if (netobj->len == 0) + return 0; + + OBD_ALLOC_LARGE(rawobj->data, netobj->len); + if (rawobj->data == NULL) + return -ENOMEM; + + rawobj->len = netobj->len; + memcpy(rawobj->data, netobj->data, netobj->len); + return 0; +} + +/**************************************** + * misc more * + ****************************************/ + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c new file mode 100644 index 0000000000000..fd1b071d6f549 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -0,0 +1,969 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, 2015, Trustees of Indiana University + * + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Author: Jeremy Filizetti + * Author: Andrew Korty + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_crypto.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" + +#define SK_INTERFACE_VERSION 1 +#define SK_MSG_VERSION 1 +#define SK_MIN_SIZE 8 +#define SK_IV_SIZE 16 + +/* Starting number for reverse contexts. 
It is critical to security + * that reverse contexts use a different range of numbers than regular + * contexts because they are using the same key. Therefore the IV/nonce + * combination must be unique for them. To accomplish this reverse contexts + * use the the negative range of a 64-bit number and regular contexts use the + * postive range. If the same IV/nonce combination were reused it would leak + * information about the plaintext. */ +#define SK_IV_REV_START (1ULL << 63) + +struct sk_ctx { + __u16 sc_hmac; + __u16 sc_crypt; + __u32 sc_expire; + __u32 sc_host_random; + __u32 sc_peer_random; + atomic64_t sc_iv; + rawobj_t sc_hmac_key; + struct gss_keyblock sc_session_kb; +}; + +struct sk_hdr { + __u64 skh_version; + __u64 skh_iv; +} __attribute__((packed)); + +/* The format of SK wire data is similar to that of RFC3686 ESP Payload + * (section 3) except instead of just an IV there is a struct sk_hdr. + * --------------------------------------------------------------------- + * | struct sk_hdr | ciphertext (variable size) | HMAC (variable size) | + * --------------------------------------------------------------------- */ +struct sk_wire { + rawobj_t skw_header; + rawobj_t skw_cipher; + rawobj_t skw_hmac; +}; + +static struct sk_crypt_type sk_crypt_types[] = { + [SK_CRYPT_AES256_CTR] = { + .sct_name = "ctr(aes)", + .sct_bytes = 32, + }, +}; + +static struct sk_hmac_type sk_hmac_types[] = { + [SK_HMAC_SHA256] = { + .sht_name = "hmac(sha256)", + .sht_bytes = 32, + }, + [SK_HMAC_SHA512] = { + .sht_name = "hmac(sha512)", + .sht_bytes = 64, + }, +}; + +static inline unsigned long sk_block_mask(unsigned long len, int blocksize) +{ + return (len + blocksize - 1) & (~(blocksize - 1)); +} + +static int sk_fill_header(struct sk_ctx *skc, struct sk_hdr *skh) +{ + __u64 tmp_iv; + skh->skh_version = be64_to_cpu(SK_MSG_VERSION); + + /* Always using inc_return so we don't use our initial numbers which + * could be the reuse detecting numbers */ + tmp_iv = atomic64_inc_return(&skc->sc_iv); + skh->skh_iv = be64_to_cpu(tmp_iv); + if (tmp_iv == 0 || tmp_iv == SK_IV_REV_START) { + CERROR("Counter looped, connection must be reset to avoid " + "plaintext information\n"); + return GSS_S_FAILURE; + } + + return GSS_S_COMPLETE; +} + +static int sk_verify_header(struct sk_hdr *skh) +{ + if (cpu_to_be64(skh->skh_version) != SK_MSG_VERSION) + return GSS_S_DEFECTIVE_TOKEN; + + return GSS_S_COMPLETE; +} + +void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv) +{ + __u32 ctr = cpu_to_be32(1); + + memcpy(iv, &nonce, CTR_RFC3686_NONCE_SIZE); + iv += CTR_RFC3686_NONCE_SIZE; + memcpy(iv, &partial_iv, CTR_RFC3686_IV_SIZE); + iv += CTR_RFC3686_IV_SIZE; + memcpy(iv, &ctr, sizeof(ctr)); +} + +static int sk_init_keys(struct sk_ctx *skc) +{ + return gss_keyblock_init(&skc->sc_session_kb, + sk_crypt_types[skc->sc_crypt].sct_name, 0); +} + +static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) +{ + char *ptr = inbuf->data; + char *end = inbuf->data + inbuf->len; + __u32 tmp; + + /* see sk_serialize_kctx() for format from userspace side */ + /* 1. Version */ + if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { + CERROR("Failed to read shared key interface version"); + return -1; + } + if (tmp != SK_INTERFACE_VERSION) { + CERROR("Invalid shared key interface version: %d\n", tmp); + return -1; + } + + /* 2. 
HMAC type */ + if (gss_get_bytes(&ptr, end, &skc->sc_hmac, sizeof(skc->sc_hmac))) { + CERROR("Failed to read HMAC algorithm type"); + return -1; + } + if (skc->sc_hmac <= SK_HMAC_EMPTY || skc->sc_hmac >= SK_HMAC_MAX) { + CERROR("Invalid hmac type: %d\n", skc->sc_hmac); + return -1; + } + + /* 3. crypt type */ + if (gss_get_bytes(&ptr, end, &skc->sc_crypt, sizeof(skc->sc_crypt))) { + CERROR("Failed to read crypt algorithm type"); + return -1; + } + if (skc->sc_crypt <= SK_CRYPT_EMPTY || skc->sc_crypt >= SK_CRYPT_MAX) { + CERROR("Invalid crypt type: %d\n", skc->sc_crypt); + return -1; + } + + /* 4. expiration time */ + if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { + CERROR("Failed to read context expiration time"); + return -1; + } + skc->sc_expire = tmp + cfs_time_current_sec(); + + /* 5. host random is used as nonce for encryption */ + if (gss_get_bytes(&ptr, end, &skc->sc_host_random, + sizeof(skc->sc_host_random))) { + CERROR("Failed to read host random "); + return -1; + } + + /* 6. peer random is used as nonce for decryption */ + if (gss_get_bytes(&ptr, end, &skc->sc_peer_random, + sizeof(skc->sc_peer_random))) { + CERROR("Failed to read peer random "); + return -1; + } + + /* 7. HMAC key */ + if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) { + CERROR("Failed to read HMAC key"); + return -1; + } + if (skc->sc_hmac_key.len <= SK_MIN_SIZE) { + CERROR("HMAC key must key must be larger than %d bytes\n", + SK_MIN_SIZE); + return -1; + } + + /* 8. Session key, can be empty if not using privacy mode */ + if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) { + CERROR("Failed to read session key"); + return -1; + } + + return 0; +} + +static void sk_delete_context(struct sk_ctx *skc) +{ + if (!skc) + return; + + rawobj_free(&skc->sc_hmac_key); + gss_keyblock_free(&skc->sc_session_kb); + OBD_FREE_PTR(skc); +} + +static +__u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context) +{ + struct sk_ctx *skc; + bool privacy = false; + + if (inbuf == NULL || inbuf->data == NULL) + return GSS_S_FAILURE; + + OBD_ALLOC_PTR(skc); + if (!skc) + return GSS_S_FAILURE; + + atomic64_set(&skc->sc_iv, 0); + + if (sk_fill_context(inbuf, skc)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc->sc_session_kb.kb_key.len > 0) { + privacy = true; + if (sk_init_keys(skc)) + goto out_err; + } + + gss_context->internal_ctx_id = skc; + CDEBUG(D_SEC, "successfully imported sk%s context\n", + privacy ? 
"pi" : "i"); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc); + return GSS_S_FAILURE; +} + +static +__u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, + struct gss_ctx *gss_context_new) +{ + struct sk_ctx *skc_old = gss_context_old->internal_ctx_id; + struct sk_ctx *skc_new; + + OBD_ALLOC_PTR(skc_new); + if (!skc_new) + return GSS_S_FAILURE; + + skc_new->sc_hmac = skc_old->sc_hmac; + skc_new->sc_crypt = skc_old->sc_crypt; + skc_new->sc_expire = skc_old->sc_expire; + skc_new->sc_host_random = skc_old->sc_host_random; + skc_new->sc_peer_random = skc_old->sc_peer_random; + + atomic64_set(&skc_new->sc_iv, SK_IV_REV_START); + + if (rawobj_dup(&skc_new->sc_hmac_key, &skc_old->sc_hmac_key)) + goto out_err; + if (gss_keyblock_dup(&skc_new->sc_session_kb, &skc_old->sc_session_kb)) + goto out_err; + + /* Only privacy mode needs to initialize keys */ + if (skc_new->sc_session_kb.kb_key.len > 0) + if (sk_init_keys(skc_new)) + goto out_err; + + gss_context_new->internal_ctx_id = skc_new; + CDEBUG(D_SEC, "successfully copied reverse sk context\n"); + + return GSS_S_COMPLETE; + +out_err: + sk_delete_context(skc_new); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_sk(struct gss_ctx *gss_context, + unsigned long *endtime) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + + *endtime = skc->sc_expire; + return GSS_S_COMPLETE; +} + +static +__u32 sk_make_hmac(char *alg_name, rawobj_t *key, int msg_count, rawobj_t *msgs, + int iov_count, lnet_kiov_t *iovs, rawobj_t *token) +{ + struct crypto_hash *tfm; + int rc; + + tfm = crypto_alloc_hash(alg_name, 0, 0); + if (IS_ERR(tfm)) + return GSS_S_FAILURE; + + rc = GSS_S_FAILURE; + LASSERT(token->len >= crypto_hash_digestsize(tfm)); + if (!gss_digest_hmac(tfm, key, NULL, msg_count, msgs, iov_count, iovs, + token)) + rc = GSS_S_COMPLETE; + + crypto_free_hash(tfm); + return rc; +} + +static +__u32 gss_get_mic_sk(struct gss_ctx *gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + return sk_make_hmac(sk_hmac_types[skc->sc_hmac].sht_name, + &skc->sc_hmac_key, message_count, messages, + iov_count, iovs, token); +} + +static +__u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, + rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + __u32 rc = GSS_S_FAILURE; + + checksum.len = sht->sht_bytes; + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return rc; + + if (sk_make_hmac(sht->sht_name, key, message_count, messages, + iov_count, iovs, &checksum)) { + CDEBUG(D_SEC, "Failed to create checksum to validate\n"); + goto cleanup; + } + + if (memcmp(token->data, checksum.data, checksum.len)) { + CERROR("checksum mismatch\n"); + rc = GSS_S_BAD_SIG; + goto cleanup; + } + + rc = GSS_S_COMPLETE; + +cleanup: + OBD_FREE(checksum.data, checksum.len); + return rc; +} + +/* sk_verify_bulk_hmac() differs slightly from sk_verify_hmac() because all + * encrypted pages in the bulk descriptor are populated although we only need + * to decrypt up to the number of bytes actually specified from the sender + * (bd_nob) otherwise the calulated HMAC will be incorrect. 
*/ +static +__u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) +{ + rawobj_t checksum = RAWOBJ_EMPTY; + struct crypto_hash *tfm; + struct hash_desc desc = { + .tfm = NULL, + .flags = 0, + }; + struct scatterlist sg[1]; + struct sg_table sgt; + int bytes; + int i; + int rc = GSS_S_FAILURE; + + checksum.len = sht->sht_bytes; + if (token->len < checksum.len) { + CDEBUG(D_SEC, "Token received too short, expected %d " + "received %d\n", token->len, checksum.len); + return GSS_S_DEFECTIVE_TOKEN; + } + + OBD_ALLOC_LARGE(checksum.data, checksum.len); + if (!checksum.data) + return rc; + + tfm = crypto_alloc_hash(sht->sht_name, 0, 0); + if (IS_ERR(tfm)) + goto cleanup; + + desc.tfm = tfm; + + LASSERT(token->len >= crypto_hash_digestsize(tfm)); + + rc = crypto_hash_setkey(tfm, key->data, key->len); + if (rc) + goto hash_cleanup; + + rc = crypto_hash_init(&desc); + if (rc) + goto hash_cleanup; + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + + rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + goto hash_cleanup; + + rc = crypto_hash_update(&desc, sg, msgs[i].len); + if (rc) { + gss_teardown_sgtable(&sgt); + goto hash_cleanup; + } + + gss_teardown_sgtable(&sgt); + } + + for (i = 0; i < iovcnt && iov_bytes > 0; i++) { + if (iovs[i].kiov_len == 0) + continue; + + bytes = min_t(int, iov_bytes, iovs[i].kiov_len); + iov_bytes -= bytes; + + sg_init_table(sg, 1); + sg_set_page(&sg[0], iovs[i].kiov_page, bytes, + iovs[i].kiov_offset); + rc = crypto_hash_update(&desc, sg, bytes); + if (rc) + goto hash_cleanup; + } + + crypto_hash_final(&desc, checksum.data); + + if (memcmp(token->data, checksum.data, checksum.len)) { + rc = GSS_S_BAD_SIG; + goto hash_cleanup; + } + + rc = GSS_S_COMPLETE; + +hash_cleanup: + crypto_free_hash(tfm); + +cleanup: + OBD_FREE_LARGE(checksum.data, checksum.len); + + return rc; +} + +static +__u32 gss_verify_mic_sk(struct gss_ctx *gss_context, + int message_count, + rawobj_t *messages, + int iov_count, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + return sk_verify_hmac(&sk_hmac_types[skc->sc_hmac], &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token); +} + +static +__u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *message, int message_buffer_length, + rawobj_t *token) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + struct sk_wire skw; + struct sk_hdr skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + + LASSERT(skc->sc_session_kb.kb_tfm); + + blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (gss_add_padding(message, message_buffer_length, blocksize)) + return GSS_S_FAILURE; + + memset(token->data, 0, token->len); + + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, + &skw.skw_cipher, 1)) + return GSS_S_FAILURE; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + 
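+	/* Note the GSS header itself is not carried inside the token; it is
+	 * only bound to it through the MAC computed below.  The token sent on
+	 * the wire is | sk_hdr | ciphertext | HMAC |. */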
msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht->sht_bytes; + if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 3, msgbufs, 0, + NULL, &skw.skw_hmac)) + return GSS_S_FAILURE; + + token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, + rawobj_t *token, rawobj_t *message) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + struct sk_wire skw; + struct sk_hdr *skh; + rawobj_t msgbufs[3]; + __u8 local_iv[SK_IV_SIZE]; + unsigned int blocksize; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht->sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht->sht_bytes; + + blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + if (skw.skw_cipher.len % blocksize != 0) + return GSS_S_DEFECTIVE_TOKEN; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + /* HMAC covers the SK header, GSS header, and ciphertext */ + msgbufs[0] = skw.skw_header; + msgbufs[1] = *gss_header; + msgbufs[2] = skw.skw_cipher; + rc = sk_verify_hmac(sht, &skc->sc_hmac_key, 3, msgbufs, 0, NULL, + &skw.skw_hmac); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + message->len = skw.skw_cipher.len; + if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, + 1, &skw.skw_cipher, message, 0)) + return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + int blocksize; + int i; + + LASSERT(skc->sc_session_kb.kb_tfm); + blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { + CERROR("offset %d not blocksize aligned\n", + BD_GET_KIOV(desc, i).kiov_offset); + return GSS_S_FAILURE; + } + + BD_GET_ENC_KIOV(desc, i).kiov_offset = + BD_GET_KIOV(desc, i).kiov_offset; + BD_GET_ENC_KIOV(desc, i).kiov_len = + sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, blocksize); + } + + return GSS_S_COMPLETE; +} + +static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct blkcipher_desc cdesc = { + .tfm = tfm, + .info = iv, + .flags = 0, + }; + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int nob = 0; + + blocksize = crypto_blkcipher_blocksize(tfm); + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + for (i = 0; i < desc->bd_iov_count; i++) { + sg_set_page(&ptxt, BD_GET_KIOV(desc, i).kiov_page, + sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, + blocksize), + BD_GET_KIOV(desc, i).kiov_offset); + nob += ptxt.length; + + sg_set_page(&ctxt, BD_GET_ENC_KIOV(desc, i).kiov_page, + ptxt.length, ptxt.offset); + + BD_GET_ENC_KIOV(desc, i).kiov_offset = ctxt.offset; + BD_GET_ENC_KIOV(desc, 
i).kiov_len = ctxt.length; + + rc = crypto_blkcipher_encrypt_iv(&cdesc, &ctxt, &ptxt, + ptxt.length); + if (rc) { + CERROR("failed to encrypt page: %d\n", rc); + return rc; + } + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, + struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, + int adj_nob) +{ + struct blkcipher_desc cdesc = { + .tfm = tfm, + .info = iv, + .flags = 0, + }; + struct scatterlist ptxt; + struct scatterlist ctxt; + int blocksize; + int i; + int rc; + int pnob = 0; + int cnob = 0; + + sg_init_table(&ptxt, 1); + sg_init_table(&ctxt, 1); + + blocksize = crypto_blkcipher_blocksize(tfm); + if (desc->bd_nob_transferred % blocksize != 0) { + CERROR("Transfer not a multiple of block size: %d\n", + desc->bd_nob_transferred); + return GSS_S_DEFECTIVE_TOKEN; + } + + for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; + i++) { + lnet_kiov_t *piov = &BD_GET_KIOV(desc, i); + lnet_kiov_t *ciov = &BD_GET_ENC_KIOV(desc, i); + + if (ciov->kiov_offset % blocksize != 0 || + ciov->kiov_len % blocksize != 0) { + CERROR("Invalid bulk descriptor vector\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + + /* Must adjust bytes here because we know the actual sizes after + * decryption. Similar to what gss_cli_ctx_unwrap_bulk does for + * integrity only mode */ + if (adj_nob) { + /* cipher text must not exceed transferred size */ + if (ciov->kiov_len + cnob > desc->bd_nob_transferred) + ciov->kiov_len = + desc->bd_nob_transferred - cnob; + + piov->kiov_len = ciov->kiov_len; + + /* plain text must not exceed bulk's size */ + if (ciov->kiov_len + pnob > desc->bd_nob) + piov->kiov_len = desc->bd_nob - pnob; + } else { + /* Taken from krb5_decrypt since it was not verified + * whether or not LNET guarantees these */ + if (ciov->kiov_len + cnob > desc->bd_nob_transferred || + piov->kiov_len > ciov->kiov_len) { + CERROR("Invalid decrypted length\n"); + return GSS_S_FAILURE; + } + } + + if (ciov->kiov_len == 0) + continue; + + sg_init_table(&ctxt, 1); + sg_set_page(&ctxt, ciov->kiov_page, ciov->kiov_len, + ciov->kiov_offset); + ptxt = ctxt; + + /* In the event the plain text size is not a multiple + * of blocksize we decrypt in place and copy the result + * after the decryption */ + if (piov->kiov_len % blocksize == 0) + sg_assign_page(&ptxt, piov->kiov_page); + + rc = crypto_blkcipher_decrypt_iv(&cdesc, &ptxt, &ctxt, + ctxt.length); + if (rc) { + CERROR("Decryption failed for page: %d\n", rc); + return GSS_S_FAILURE; + } + + if (piov->kiov_len % blocksize != 0) { + memcpy(page_address(piov->kiov_page) + + piov->kiov_offset, + page_address(ciov->kiov_page) + + ciov->kiov_offset, + piov->kiov_len); + } + + cnob += ciov->kiov_len; + pnob += piov->kiov_len; + } + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + BD_GET_KIOV(desc, i++).kiov_len = 0; + + if (unlikely(cnob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, cnob); + return GSS_S_FAILURE; + } + + if (unlikely(!adj_nob && pnob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pnob); + return GSS_S_FAILURE; + } + + return 0; +} + +static +__u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, rawobj_t *token, + int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + struct sk_wire skw; + struct 
sk_hdr skh; + __u8 local_iv[SK_IV_SIZE]; + + LASSERT(skc->sc_session_kb.kb_tfm); + + memset(token->data, 0, token->len); + if (sk_fill_header(skc, &skh) != GSS_S_COMPLETE) + return GSS_S_FAILURE; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(skh); + memcpy(skw.skw_header.data, &skh, sizeof(skh)); + + sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob)) + return GSS_S_FAILURE; + + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht->sht_bytes; + if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac)) + return GSS_S_FAILURE; + + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct sk_ctx *skc = gss_context->internal_ctx_id; + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + struct sk_wire skw; + struct sk_hdr *skh; + __u8 local_iv[SK_IV_SIZE]; + int rc; + + LASSERT(skc->sc_session_kb.kb_tfm); + + if (token->len < sizeof(skh) + sht->sht_bytes) + return GSS_S_DEFECTIVE_TOKEN; + + skw.skw_header.data = token->data; + skw.skw_header.len = sizeof(struct sk_hdr); + skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; + skw.skw_hmac.len = sht->sht_bytes; + + skh = (struct sk_hdr *)skw.skw_header.data; + rc = sk_verify_header(skh); + if (rc != GSS_S_COMPLETE) + return rc; + + rc = sk_verify_bulk_hmac(&sk_hmac_types[skc->sc_hmac], + &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), + desc->bd_nob, &skw.skw_hmac); + if (rc) + return rc; + + sk_construct_rfc3686_iv(local_iv, skc->sc_peer_random, skh->skh_iv); + rc = sk_decrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, + desc, &skw.skw_cipher, adj_nob); + if (rc) + return rc; + + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_sk(void *internal_context) +{ + struct sk_ctx *sk_context = internal_context; + sk_delete_context(sk_context); +} + +int gss_display_sk(struct gss_ctx *gss_context, char *buf, int bufsize) +{ + return snprintf(buf, bufsize, "sk"); +} + +static struct gss_api_ops gss_sk_ops = { + .gss_import_sec_context = gss_import_sec_context_sk, + .gss_copy_reverse_context = gss_copy_reverse_context_sk, + .gss_inquire_context = gss_inquire_context_sk, + .gss_get_mic = gss_get_mic_sk, + .gss_verify_mic = gss_verify_mic_sk, + .gss_wrap = gss_wrap_sk, + .gss_unwrap = gss_unwrap_sk, + .gss_prep_bulk = gss_prep_bulk_sk, + .gss_wrap_bulk = gss_wrap_bulk_sk, + .gss_unwrap_bulk = gss_unwrap_bulk_sk, + .gss_delete_sec_context = gss_delete_sec_context_sk, + .gss_display = gss_display_sk, +}; + +static struct subflavor_desc gss_sk_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKN, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "skn" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKA, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "ska" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKI, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "ski" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_SKPI, + .sf_qop = 0, + .sf_service = 
SPTLRPC_SVC_PRIV, + .sf_name = "skpi" + }, +}; + +static struct gss_api_mech gss_sk_mech = { + /* .gm_owner uses default NULL value for THIS_MODULE */ + .gm_name = "sk", + .gm_oid = (rawobj_t) { + .len = 12, + .data = "\053\006\001\004\001\311\146\215\126\001\000\001", + }, + .gm_ops = &gss_sk_ops, + .gm_sf_num = 4, + .gm_sfs = gss_sk_sfs, +}; + +int __init init_sk_module(void) +{ + int status; + + status = lgss_mech_register(&gss_sk_mech); + if (status) + CERROR("Failed to register sk gss mechanism!\n"); + + return status; +} + +void cleanup_sk_module(void) +{ + lgss_mech_unregister(&gss_sk_mech); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c new file mode 100644 index 0000000000000..4798711dbe983 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -0,0 +1,1199 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * Neil Brown + * J. Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. 
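+ * (In the code below, the rpcsec_init cache is implemented as the "rsi"
+ * cache and the rpcsec_context cache as the "rsc" cache.)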
+ * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#define GSS_SVC_UPCALL_TIMEOUT (20) + +static spinlock_t __ctx_index_lock; +static __u64 __ctx_index; + +__u64 gss_get_next_ctx_index(void) +{ + __u64 idx; + + spin_lock(&__ctx_index_lock); + idx = __ctx_index++; + spin_unlock(&__ctx_index_lock); + + return idx; +} + +static inline unsigned long hash_mem(char *buf, int length, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + + do { + if (len == length) { + c = (char) len; + len = -1; + } else + c = *buf++; + + l = (l << 8) | c; + len++; + + if ((len & (BITS_PER_LONG/8-1)) == 0) + hash = hash_long(hash^l, BITS_PER_LONG); + } while (len); + + return hash >> (BITS_PER_LONG - bits); +} + +/* This compatibility can be removed once kernel 3.3 is used, + * since cache_register_net/cache_unregister_net are exported. + * Note that since kernel 3.4 cache_register and cache_unregister + * are removed. +*/ +static inline int _cache_register_net(struct cache_detail *cd, struct net *net) +{ +#ifdef HAVE_CACHE_REGISTER + return cache_register(cd); +#else + return cache_register_net(cd, net); +#endif +} +static inline void _cache_unregister_net(struct cache_detail *cd, + struct net *net) +{ +#ifdef HAVE_CACHE_REGISTER + cache_unregister(cd); +#else + cache_unregister_net(cd, net); +#endif +} +/**************************************** + * rpc sec init (rsi) cache * + ****************************************/ + +#define RSI_HASHBITS (6) +#define RSI_HASHMAX (1 << RSI_HASHBITS) +#define RSI_HASHMASK (RSI_HASHMAX - 1) + +struct rsi { + struct cache_head h; + __u32 lustre_svc; + __u64 nid; + char nm_name[LUSTRE_NODEMAP_NAME_LENGTH + 1]; + wait_queue_head_t waitq; + rawobj_t in_handle, in_token; + rawobj_t out_handle, out_token; + int major_status, minor_status; +}; + +#ifdef HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsi_table[RSI_HASHMAX]; +#else +static struct cache_head *rsi_table[RSI_HASHMAX]; +#endif +static struct cache_detail rsi_cache; +static struct rsi *rsi_update(struct rsi *new, struct rsi *old); +static struct rsi *rsi_lookup(struct rsi *item); + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem((char *)item->in_handle.data, item->in_handle.len, + RSI_HASHBITS) ^ + hash_mem((char *)item->in_token.data, item->in_token.len, + RSI_HASHBITS); +} + +static inline int __rsi_match(struct rsi *item, struct rsi *tmp) +{ + return (rawobj_equal(&item->in_handle, &tmp->in_handle) && + rawobj_equal(&item->in_token, &tmp->in_token)); +} + +static void rsi_free(struct rsi *rsi) +{ + rawobj_free(&rsi->in_handle); + rawobj_free(&rsi->in_token); + rawobj_free(&rsi->out_handle); + rawobj_free(&rsi->out_token); +} + +/* See handle_channel_req() userspace for where the upcall data is read */ +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsi = container_of(h, struct rsi, h); + __u64 index = 0; + + /* if in_handle is null, provide kernel suggestion */ + if (rsi->in_handle.len == 0) + index = gss_get_next_ctx_index(); + + qword_addhex(bpp, blen, (char *) &rsi->lustre_svc, + sizeof(rsi->lustre_svc)); + 
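+	/* remaining fields written for the userspace daemon, in order:
+	 * nid, suggested context index, nodemap name, in_handle, in_token */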
qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid)); + qword_addhex(bpp, blen, (char *) &index, sizeof(index)); + qword_addhex(bpp, blen, (char *) rsi->nm_name, + strlen(rsi->nm_name) + 1); + qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len); + qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len); + (*bpp)[-1] = '\n'; +} + +#ifdef HAVE_SUNRPC_UPCALL_HAS_3ARGS +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, rsi_request); +} +#else + +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} +#endif + +static inline void __rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle = RAWOBJ_EMPTY; + new->out_token = RAWOBJ_EMPTY; + + new->in_handle = item->in_handle; + item->in_handle = RAWOBJ_EMPTY; + new->in_token = item->in_token; + item->in_token = RAWOBJ_EMPTY; + + new->lustre_svc = item->lustre_svc; + new->nid = item->nid; + memcpy(new->nm_name, item->nm_name, sizeof(item->nm_name)); + init_waitqueue_head(&new->waitq); +} + +static inline void __rsi_update(struct rsi *new, struct rsi *item) +{ + LASSERT(new->out_handle.len == 0); + LASSERT(new->out_token.len == 0); + + new->out_handle = item->out_handle; + item->out_handle = RAWOBJ_EMPTY; + new->out_token = item->out_token; + item->out_token = RAWOBJ_EMPTY; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(rsi->h.cache_list.next == NULL); +#else + LASSERT(rsi->h.next == NULL); +#endif + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} + +static int rsi_match(struct cache_head *a, struct cache_head *b) +{ + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); + + return __rsi_match(item, tmp); +} + +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_init(new, item); +} + +static void update_rsi(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_update(new, item); +} + +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsi; + + OBD_ALLOC_PTR(rsi); + if (rsi) + return &rsi->h; + else + return NULL; +} + +static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + char *ep; + int len; + struct rsi rsii, *rsip = NULL; + time_t expiry; + int status = -EINVAL; + ENTRY; + + + memset(&rsii, 0, sizeof(rsii)); + + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + if (expiry == 0) + goto out; + + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + + /* major */ + rsii.major_status = simple_strtol(buf, &ep, 10); + if (*ep) + goto out; + + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtol(buf, 
&ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsii.h.expiry_time = expiry; + rsip = rsi_update(&rsii, rsip); + status = 0; +out: + rsi_free(&rsii); + if (rsip) { + wake_up_all(&rsip->waitq); + cache_put(&rsip->h, &rsi_cache); + } else { + status = -ENOMEM; + } + + if (status) + CERROR("rsi parse error %d\n", status); + RETURN(status); +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.sptlrpc.init", + .cache_put = rsi_put, +#ifndef HAVE_SUNRPC_UPCALL_HAS_3ARGS + .cache_request = rsi_request, +#endif + .cache_upcall = rsi_upcall, + .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = rsi_alloc, +}; + +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +/**************************************** + * rpc sec context (rsc) cache * + ****************************************/ + +#define RSC_HASHBITS (10) +#define RSC_HASHMAX (1 << RSC_HASHBITS) +#define RSC_HASHMASK (RSC_HASHMAX - 1) + +struct rsc { + struct cache_head h; + struct obd_device *target; + rawobj_t handle; + struct gss_svc_ctx ctx; +}; + +#ifdef HAVE_CACHE_HEAD_HLIST +static struct hlist_head rsc_table[RSC_HASHMAX]; +#else +static struct cache_head *rsc_table[RSC_HASHMAX]; +#endif +static struct cache_detail rsc_cache; +static struct rsc *rsc_update(struct rsc *new, struct rsc *old); +static struct rsc *rsc_lookup(struct rsc *item); + +static void rsc_free(struct rsc *rsci) +{ + rawobj_free(&rsci->handle); + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); +} + +static inline int rsc_hash(struct rsc *rsci) +{ + return hash_mem((char *)rsci->handle.data, + rsci->handle.len, RSC_HASHBITS); +} + +static inline int __rsc_match(struct rsc *new, struct rsc *tmp) +{ + return rawobj_equal(&new->handle, &tmp->handle); +} + +static inline void __rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle = tmp->handle; + tmp->handle = RAWOBJ_EMPTY; + + new->target = NULL; + memset(&new->ctx, 0, sizeof(new->ctx)); + new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; +} + +static inline void __rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->ctx = tmp->ctx; + tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; + tmp->ctx.gsc_mechctx = NULL; + + memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata)); + spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock); +} + +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + +#ifdef HAVE_CACHE_HEAD_HLIST + LASSERT(rsci->h.cache_list.next == NULL); +#else + LASSERT(rsci->h.next == NULL); +#endif + rsc_free(rsci); + OBD_FREE_PTR(rsci); +} + +static int rsc_match(struct cache_head *a, struct cache_head *b) +{ + struct rsc *new = container_of(a, struct rsc, 
h); + struct rsc *tmp = container_of(b, struct rsc, h); + + return __rsc_match(new, tmp); +} + +static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_init(new, tmp); +} + +static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_update(new, tmp); +} + +static struct cache_head * rsc_alloc(void) +{ + struct rsc *rsc; + + OBD_ALLOC_PTR(rsc); + if (rsc) + return &rsc->h; + else + return NULL; +} + +static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len, rv, tmp_int; + struct rsc rsci, *rscp = NULL; + time_t expiry; + int status = -EINVAL; + struct gss_api_mech *gm = NULL; + + memset(&rsci, 0, sizeof(rsci)); + + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* remote flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get remote flag\n"); + goto out; + } + rsci.ctx.gsc_remote = (tmp_int != 0); + + /* root user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get root user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_root = (tmp_int != 0); + + /* mds user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get mds user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_mds = (tmp_int != 0); + + /* oss user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get oss user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_oss = (tmp_int != 0); + + /* mapped uid */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); + if (rv) { + CERROR("fail to get mapped uid\n"); + goto out; + } + + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) { + CERROR("NOENT? set rsc entry negative\n"); + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + } else { + rawobj_t tmp_buf; + time64_t ctx_expiry; + + /* gid */ + if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid)) + goto out; + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = lgss_name_to_mech(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + + tmp_buf.len = len; + tmp_buf.data = (unsigned char *)buf; + if (lgss_import_sec_context(&tmp_buf, gm, + &rsci.ctx.gsc_mechctx)) + goto out; + + /* set to seconds since machine booted */ + expiry = ktime_get_seconds(); + + /* currently the expiry time passed down from user-space + * is invalid, here we retrive it from mech. + */ + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, + (unsigned long *)&ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + goto out; + } + + /* ctx_expiry is the number of seconds since Jan 1 1970. + * We want just the number of seconds into the future. 
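+		 * Subtracting the current wall-clock time
+		 * (ktime_get_real_seconds()) gives that delta, which is then
+		 * added to the boot-relative expiry captured above via
+		 * ktime_get_seconds().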
+ */ + expiry += ctx_expiry - ktime_get_real_seconds(); + } + + rsci.h.expiry_time = expiry; + rscp = rsc_update(&rsci, rscp); + status = 0; +out: + if (gm) + lgss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; + + if (status) + CERROR("parse rsc error %d\n", status); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.sptlrpc.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, +}; + +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +#define COMPAT_RSC_PUT(item, cd) cache_put((item), (cd)) + +/**************************************** + * rsc cache flush * + ****************************************/ + +typedef int rsc_entry_match(struct rsc *rscp, long data); + +static void rsc_flush(rsc_entry_match *match, long data) +{ +#ifdef HAVE_CACHE_HEAD_HLIST + struct cache_head *ch = NULL; + struct hlist_head *head; +#else + struct cache_head **ch; +#endif + struct rsc *rscp; + int n; + ENTRY; + + write_lock(&rsc_cache.hash_lock); + for (n = 0; n < RSC_HASHMAX; n++) { +#ifdef HAVE_CACHE_HEAD_HLIST + head = &rsc_cache.hash_table[n]; + hlist_for_each_entry(ch, head, cache_list) { + rscp = container_of(ch, struct rsc, h); +#else + for (ch = &rsc_cache.hash_table[n]; *ch;) { + rscp = container_of(*ch, struct rsc, h); +#endif + + if (!match(rscp, data)) { +#ifndef HAVE_CACHE_HEAD_HLIST + ch = &((*ch)->next); +#endif + continue; + } + + /* it seems simply set NEGATIVE doesn't work */ +#ifdef HAVE_CACHE_HEAD_HLIST + hlist_del_init(&ch->cache_list); +#else + *ch = (*ch)->next; + rscp->h.next = NULL; +#endif + cache_get(&rscp->h); + set_bit(CACHE_NEGATIVE, &rscp->h.flags); + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + rsc_cache.entries--; + } + } + write_unlock(&rsc_cache.hash_lock); + EXIT; +} + +static int match_uid(struct rsc *rscp, long uid) +{ + if ((int) uid == -1) + return 1; + return ((int) rscp->ctx.gsc_uid == (int) uid); +} + +static int match_target(struct rsc *rscp, long target) +{ + return (rscp->target == (struct obd_device *) target); +} + +static inline void rsc_flush_uid(int uid) +{ + if (uid == -1) + CWARN("flush all gss contexts...\n"); + + rsc_flush(match_uid, (long) uid); +} + +static inline void rsc_flush_target(struct obd_device *target) +{ + rsc_flush(match_target, (long) target); +} + +void gss_secsvc_flush(struct obd_device *target) +{ + rsc_flush_target(target); +} + +static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (rawobj_dup(&rsci.handle, handle)) + return NULL; + + found = rsc_lookup(&rsci); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + struct rsc rsci, *rscp = NULL; + unsigned long ctx_expiry; + __u32 
major; + int rc; + ENTRY; + + memset(&rsci, 0, sizeof(rsci)); + + if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl, + sizeof(gsec->gs_rvs_hdl))) + GOTO(out, rc = -ENOMEM); + + rscp = rsc_lookup(&rsci); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + major = lgss_copy_reverse_context(gctx->gc_mechctx, + &rsci.ctx.gsc_mechctx); + if (major != GSS_S_COMPLETE) + GOTO(out, rc = -ENOMEM); + + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + GOTO(out, rc = -EINVAL); + } + rsci.h.expiry_time = (time_t) ctx_expiry; + + switch (imp->imp_obd->u.cli.cl_sp_to) { + case LUSTRE_SP_MDT: + rsci.ctx.gsc_usr_mds = 1; + break; + case LUSTRE_SP_OST: + rsci.ctx.gsc_usr_oss = 1; + break; + case LUSTRE_SP_CLI: + rsci.ctx.gsc_usr_root = 1; + default: + break; + } + + rscp = rsc_update(&rsci, rscp); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + rscp->target = imp->imp_obd; + rawobj_dup(&gctx->gc_svc_handle, &rscp->handle); + + CWARN("create reverse svc ctx %p to %s: idx %#llx\n", + &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl); + rc = 0; +out: + if (rscp) + cache_put(&rscp->h, &rsc_cache); + rsc_free(&rsci); + + if (rc) + CERROR("create reverse svc ctx: idx %#llx, rc %d\n", + gsec->gs_rvs_hdl, rc); + RETURN(rc); +} + +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) +{ + const cfs_time_t expire = 20; + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", + &rscp->ctx, rscp); + + rscp->h.expiry_time = cfs_time_current_sec() + expire; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx) +{ + struct rsc *rscp = container_of(ctx, struct rsc, ctx); + + return rawobj_dup(handle, &rscp->handle); +} + +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq) +{ + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n", + &rscp->ctx, rscp, seq + 1); + + rscp->ctx.gsc_rvs_seq = seq + 1; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req) +{ + return NULL; +} +static struct cache_req cache_upcall_chandle = { cache_upcall_defer }; + +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token) +{ + struct ptlrpc_reply_state *rs; + struct rsc *rsci = NULL; + struct rsi *rsip = NULL, rsikey; + wait_queue_entry_t wait; + int replen = sizeof(struct ptlrpc_body); + struct gss_rep_header *rephdr; + int first_check = 1; + int rc = SECSVC_DROP; + ENTRY; + + memset(&rsikey, 0, sizeof(rsikey)); + rsikey.lustre_svc = lustre_svc; + rsikey.nid = (__u64) req->rq_peer.nid; + nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, + sizeof(rsikey.nm_name)); + + /* duplicate context handle. 
for INIT it always 0 */ + if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) { + CERROR("fail to dup context handle\n"); + GOTO(out, rc); + } + + if (rawobj_dup(&rsikey.in_token, in_token)) { + CERROR("can't duplicate token\n"); + rawobj_free(&rsikey.in_handle); + GOTO(out, rc); + } + + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) { + CERROR("error in rsi_lookup.\n"); + + if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } + + cache_get(&rsip->h); /* take an extra ref */ + init_waitqueue_head(&rsip->waitq); + init_waitqueue_entry(&wait, current); + add_wait_queue(&rsip->waitq, &wait); + +cache_check: + /* Note each time cache_check() will drop a reference if return + * non-zero. We hold an extra reference on initial rsip, but must + * take care of following calls. */ + rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle); + switch (rc) { + case -ETIMEDOUT: + case -EAGAIN: { + int valid; + + if (first_check) { + first_check = 0; + + read_lock(&rsi_cache.hash_lock); + valid = test_bit(CACHE_VALID, &rsip->h.flags); + if (valid == 0) + set_current_state(TASK_INTERRUPTIBLE); + read_unlock(&rsi_cache.hash_lock); + + if (valid == 0) { + unsigned long jiffies; + jiffies = msecs_to_jiffies(MSEC_PER_SEC * + GSS_SVC_UPCALL_TIMEOUT); + schedule_timeout(jiffies); + } + cache_get(&rsip->h); + goto cache_check; + } + CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT); + break; + } + case -ENOENT: + CDEBUG(D_SEC, "cache_check return ENOENT, drop\n"); + break; + case 0: + /* if not the first check, we have to release the extra + * reference we just added on it. */ + if (!first_check) + cache_put(&rsip->h, &rsi_cache); + CDEBUG(D_SEC, "cache_check is good\n"); + break; + } + + remove_wait_queue(&rsip->waitq, &wait); + cache_put(&rsip->h, &rsi_cache); + + if (rc) + GOTO(out, rc = SECSVC_DROP); + + rc = SECSVC_DROP; + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + CERROR("authentication failed\n"); + + /* gss mechanism returned major and minor code so we return + * those in error message */ + if (!gss_pack_err_notify(req, rsip->major_status, + rsip->minor_status)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } else { + cache_get(&rsci->h); + grctx->src_ctx = &rsci->ctx; + } + + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { + CERROR("failed duplicate reverse handle\n"); + GOTO(out, rc); + } + + rsci->target = target; + + CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n", + rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) { + CERROR("handle size %u too large\n", rsip->out_handle.len); + GOTO(out, rc = SECSVC_DROP); + } + + grctx->src_init = 1; + grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len); + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("failed to pack reply: %d\n", rc); + GOTO(out, rc = SECSVC_DROP); + } + + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_bufcount == 3); + LASSERT(rs->rs_repbuf->lm_buflens[0] >= + sizeof(*rephdr) + rsip->out_handle.len); + LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len); + + rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + rephdr->gh_version = PTLRPC_GSS_VERSION; + rephdr->gh_flags = 0; + rephdr->gh_proc = PTLRPC_GSS_PROC_ERR; + rephdr->gh_major = rsip->major_status; + rephdr->gh_minor = rsip->minor_status; + rephdr->gh_seqwin = GSS_SEQ_WIN; + rephdr->gh_handle.len = rsip->out_handle.len; + memcpy(rephdr->gh_handle.data, rsip->out_handle.data, + 
rsip->out_handle.len); + + memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data, + rsip->out_token.len); + + rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2, + rsip->out_token.len, 0); + + rc = SECSVC_OK; + +out: + /* it looks like here we should put rsip also, but this mess up + * with NFS cache mgmt code... FIXME + * something like: + * if (rsip) + * rsi_put(&rsip->h, &rsi_cache); */ + + if (rsci) { + /* if anything went wrong, we don't keep the context too */ + if (rc != SECSVC_OK) + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else + CDEBUG(D_SEC, "create rsc with idx %#llx\n", + gss_handle_to_u64(&rsci->handle)); + + COMPAT_RSC_PUT(&rsci->h, &rsc_cache); + } + RETURN(rc); +} + +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct rsc *rsc; + + rsc = gss_svc_searchbyctx(&gw->gw_handle); + if (!rsc) { + CWARN("Invalid gss ctx idx %#llx from %s\n", + gss_handle_to_u64(&gw->gw_handle), + libcfs_nid2str(req->rq_peer.nid)); + return NULL; + } + + return &rsc->ctx; +} + +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + COMPAT_RSC_PUT(&rsc->h, &rsc_cache); +} + +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + /* can't be found */ + set_bit(CACHE_NEGATIVE, &rsc->h.flags); + /* to be removed at next scan */ + rsc->h.expiry_time = 1; +} + +int __init gss_init_svc_upcall(void) +{ + int i, rc; + + spin_lock_init(&__ctx_index_lock); + /* + * this helps reducing context index confliction. after server reboot, + * conflicting request from clients might be filtered out by initial + * sequence number checking, thus no chance to sent error notification + * back to clients. + */ + cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index)); + + rc = _cache_register_net(&rsi_cache, &init_net); + if (rc != 0) + return rc; + + rc = _cache_register_net(&rsc_cache, &init_net); + if (rc != 0) { + _cache_unregister_net(&rsi_cache, &init_net); + return rc; + } + + /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open + * the init upcall channel, otherwise there's big chance that the first + * upcall issued before the channel be opened thus nfsv4 cache code will + * drop the request direclty, thus lead to unnecessary recovery time. + * here we wait at miximum 1.5 seconds. */ + for (i = 0; i < 6; i++) { + if (atomic_read(&rsi_cache.readers) > 0) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + LASSERT(msecs_to_jiffies(MSEC_PER_SEC) >= 4); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC / 4)); + } + + if (atomic_read(&rsi_cache.readers) == 0) + CWARN("Init channel is not opened by lsvcgssd, following " + "request might be dropped until lsvcgssd is active\n"); + + return 0; +} + +void gss_exit_svc_upcall(void) +{ + cache_purge(&rsi_cache); + _cache_unregister_net(&rsi_cache, &init_net); + + cache_purge(&rsc_cache); + _cache_unregister_net(&rsc_cache, &init_net); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c new file mode 100644 index 0000000000000..610f0b38c8d4f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct proc_dir_entry *gss_proc_root = NULL; +static struct proc_dir_entry *gss_proc_lk = NULL; + +/* + * statistic of "out-of-sequence-window" + */ +static struct { + spinlock_t oos_lock; + atomic_t oos_cli_count; /* client occurrence */ + int oos_cli_behind; /* client max seqs behind */ + atomic_t oos_svc_replay[3]; /* server replay detected */ + atomic_t oos_svc_pass[3]; /* server verified ok */ +} gss_stat_oos = { + .oos_cli_count = ATOMIC_INIT(0), + .oos_cli_behind = 0, + .oos_svc_replay = { ATOMIC_INIT(0), }, + .oos_svc_pass = { ATOMIC_INIT(0), }, +}; + +void gss_stat_oos_record_cli(int behind) +{ + atomic_inc(&gss_stat_oos.oos_cli_count); + + spin_lock(&gss_stat_oos.oos_lock); + if (behind > gss_stat_oos.oos_cli_behind) + gss_stat_oos.oos_cli_behind = behind; + spin_unlock(&gss_stat_oos.oos_lock); +} + +void gss_stat_oos_record_svc(int phase, int replay) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (replay) + atomic_inc(&gss_stat_oos.oos_svc_replay[phase]); + else + atomic_inc(&gss_stat_oos.oos_svc_pass[phase]); +} + +static int gss_proc_oos_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "seqwin: %u\n" + "backwin: %u\n" + "client fall behind seqwin\n" + " occurrence: %d\n" + " max seq behind: %d\n" + "server replay detected:\n" + " phase 0: %d\n" + " phase 1: %d\n" + " phase 2: %d\n" + "server verify ok:\n" + " phase 2: %d\n", + GSS_SEQ_WIN_MAIN, + GSS_SEQ_WIN_BACK, + atomic_read(&gss_stat_oos.oos_cli_count), + gss_stat_oos.oos_cli_behind, + atomic_read(&gss_stat_oos.oos_svc_replay[0]), + atomic_read(&gss_stat_oos.oos_svc_replay[1]), + atomic_read(&gss_stat_oos.oos_svc_replay[2]), + atomic_read(&gss_stat_oos.oos_svc_pass[2])); + return 0; +} +LPROC_SEQ_FOPS_RO(gss_proc_oos); + +static ssize_t +gss_proc_write_secinit(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + int rc; + + rc = gss_do_ctx_init_rpc((char *) buffer, count); + if (rc) { + LASSERT(rc < 0); + return rc; + } + return count; +} + +static const struct file_operations gss_proc_secinit = { + .write = gss_proc_write_secinit, +}; + +static struct lprocfs_vars gss_lprocfs_vars[] = { + { .name = "replays", + .fops = &gss_proc_oos_fops }, + { .name = "init_channel", + .fops = &gss_proc_secinit, + .proc_mode = 0222 }, + { NULL } 
+}; + +/* + * for userspace helper lgss_keyring. + * + * debug_level: [0, 4], defined in utils/gss/lgss_utils.h + */ +static int gss_lk_debug_level = 1; + +static int gss_lk_proc_dl_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", gss_lk_debug_level); + return 0; +} + +static ssize_t +gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0 || val > 4) + return -ERANGE; + + gss_lk_debug_level = val; + + return count; +} +LPROC_SEQ_FOPS(gss_lk_proc_dl); + +static struct lprocfs_vars gss_lk_lprocfs_vars[] = { + { .name = "debug_level", + .fops = &gss_lk_proc_dl_fops }, + { NULL } +}; + +void gss_exit_lproc(void) +{ + if (gss_proc_lk) { + lprocfs_remove(&gss_proc_lk); + gss_proc_lk = NULL; + } + + if (gss_proc_root) { + lprocfs_remove(&gss_proc_root); + gss_proc_root = NULL; + } +} + +int gss_init_lproc(void) +{ + int rc; + + spin_lock_init(&gss_stat_oos.oos_lock); + + gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root, + gss_lprocfs_vars, NULL); + if (IS_ERR(gss_proc_root)) { + rc = PTR_ERR(gss_proc_root); + gss_proc_root = NULL; + GOTO(out, rc); + } + + gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root, + gss_lk_lprocfs_vars, NULL); + if (IS_ERR(gss_proc_lk)) { + rc = PTR_ERR(gss_proc_lk); + gss_proc_lk = NULL; + GOTO(out, rc); + } + + return 0; + +out: + CERROR("failed to initialize gss lproc entries: %d\n", rc); + gss_exit_lproc(); + + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c new file mode 100644 index 0000000000000..bee52f3751356 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -0,0 +1,2926 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2016, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#include +#include + +/* + * early reply have fixed size, respectively in privacy and integrity mode. + * so we calculate them only once. + */ +static int gss_at_reply_off_integ; +static int gss_at_reply_off_priv; + + +static inline int msg_last_segidx(struct lustre_msg *msg) +{ + LASSERT(msg->lm_bufcount > 0); + return msg->lm_bufcount - 1; +} +static inline int msg_last_seglen(struct lustre_msg *msg) +{ + return msg->lm_buflens[msg_last_segidx(msg)]; +} + +/******************************************** + * wire data swabber * + ********************************************/ + +static +void gss_header_swabber(struct gss_header *ghdr) +{ + __swab32s(&ghdr->gh_flags); + __swab32s(&ghdr->gh_proc); + __swab32s(&ghdr->gh_seq); + __swab32s(&ghdr->gh_svc); + __swab32s(&ghdr->gh_pad1); + __swab32s(&ghdr->gh_handle.len); +} + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed) +{ + struct gss_header *ghdr; + + ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr)); + if (ghdr == NULL) + return NULL; + + if (swabbed) + gss_header_swabber(ghdr); + + if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) { + CERROR("gss header has length %d, now %u received\n", + (int) sizeof(*ghdr) + ghdr->gh_handle.len, + msg->lm_buflens[segment]); + return NULL; + } + + return ghdr; +} + +/* + * payload should be obtained from mechanism. but currently since we + * only support kerberos, we could simply use fixed value. + * krb5 "meta" data: + * - krb5 header: 16 + * - krb5 checksum: 20 + * + * for privacy mode, payload also include the cipher text which has the same + * size as plain text, plus possible confounder, padding both at maximum cipher + * block size. + */ +#define GSS_KRB5_INTEG_MAX_PAYLOAD (40) + +static inline +int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy) +{ + if (privacy) + return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize; + else + return GSS_KRB5_INTEG_MAX_PAYLOAD; +} + +/* + * return signature size, otherwise < 0 to indicate error + */ +static int gss_sign_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + enum lustre_sec_part sp, + __u32 flags, __u32 proc, __u32 seq, __u32 svc, + rawobj_t *handle) +{ + struct gss_header *ghdr; + rawobj_t text[4], mic; + int textcnt, max_textcnt, mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + /* gss hdr */ + LASSERT(msg->lm_buflens[0] >= + sizeof(*ghdr) + (handle ? 
handle->len : 0)); + ghdr = lustre_msg_buf(msg, 0, 0); + + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) sp; + ghdr->gh_flags = flags; + ghdr->gh_proc = proc; + ghdr->gh_seq = seq; + ghdr->gh_svc = svc; + if (!handle) { + /* fill in a fake one */ + ghdr->gh_handle.len = 0; + } else { + ghdr->gh_handle.len = handle->len; + memcpy(ghdr->gh_handle.data, handle->data, handle->len); + } + + /* no actual signature for null mode */ + if (svc == SPTLRPC_SVC_NULL) + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + /* MIC */ + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) { + CERROR("fail to generate MIC: %08x\n", major); + return -EPERM; + } + LASSERT(mic.len <= msg->lm_buflens[mic_idx]); + + return lustre_shrink_msg(msg, mic_idx, mic.len, 0); +} + +/* + * return gss error + */ +static +__u32 gss_verify_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + __u32 svc) +{ + rawobj_t text[4], mic; + int textcnt, max_textcnt; + int mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + if (svc == SPTLRPC_SVC_NULL) + return GSS_S_COMPLETE; + + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) + CERROR("mic verify error: %08x\n", major); + + return major; +} + +/* + * return gss error code + */ +static +__u32 gss_unseal_msg(struct gss_ctx *mechctx, + struct lustre_msg *msgbuf, + int *msg_len, int msgbuf_len) +{ + rawobj_t clear_obj, hdrobj, token; + __u8 *clear_buf; + int clear_buflen; + __u32 major; + ENTRY; + + if (msgbuf->lm_bufcount != 2) { + CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount); + RETURN(GSS_S_FAILURE); + } + + /* allocate a temporary clear text buffer, same sized as token, + * we assume the final clear text size <= token size */ + clear_buflen = lustre_msg_buflen(msgbuf, 1); + OBD_ALLOC_LARGE(clear_buf, clear_buflen); + if (!clear_buf) + RETURN(GSS_S_FAILURE); + + /* buffer objects */ + hdrobj.len = lustre_msg_buflen(msgbuf, 0); + hdrobj.data = lustre_msg_buf(msgbuf, 0, 0); + token.len = lustre_msg_buflen(msgbuf, 1); + token.data = lustre_msg_buf(msgbuf, 1, 0); + clear_obj.len = clear_buflen; + clear_obj.data = clear_buf; + + major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj); + if (major != GSS_S_COMPLETE) { + CERROR("unwrap message error: %08x\n", major); + GOTO(out_free, major = GSS_S_FAILURE); + } + LASSERT(clear_obj.len <= clear_buflen); + LASSERT(clear_obj.len <= msgbuf_len); + + /* now the decrypted message */ + memcpy(msgbuf, clear_obj.data, clear_obj.len); + *msg_len = clear_obj.len; + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(clear_buf, clear_buflen); + RETURN(major); +} + +/******************************************** + * gss client context manipulation helpers * + ********************************************/ + +int cli_ctx_expire(struct 
ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount)); + + if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) { + if (!ctx->cc_early_expire) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire == 0 ? 0 : + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + + sptlrpc_cli_ctx_wakeup(ctx); + return 1; + } + + return 0; +} + +/* + * return 1 if the context is dead. + */ +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) +{ + if (unlikely(cli_ctx_is_dead(ctx))) + return 1; + + /* expire is 0 means never expire. a newly created gss context + * which during upcall may has 0 expiration */ + if (ctx->cc_expire == 0) + return 0; + + /* check real expiration */ + if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec())) + return 0; + + cli_ctx_expire(ctx); + return 1; +} + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + unsigned long ctx_expiry; + + if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { + CERROR("ctx %p(%u): unable to inquire, expire it now\n", + gctx, ctx->cc_vcred.vc_uid); + ctx_expiry = 1; /* make it expired now */ + } + + ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry, + ctx->cc_sec->ps_flvr.sf_flags); + + /* At this point this ctx might have been marked as dead by + * someone else, in which case nobody will make further use + * of it. we don't care, and mark it UPTODATE will help + * destroying server side context when it be destroyed. */ + set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + if (sec_is_reverse(ctx->cc_sec)) { + CWARN("server installed reverse ctx %p idx %#llx, " + "expiry %lu(%+lds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_expire, + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + } else { + CWARN("client refreshed ctx %p idx %#llx (%u->%s), " + "expiry %lu(%+lds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + + /* install reverse svc ctx for root context */ + if (ctx->cc_vcred.vc_uid == 0) + gss_sec_install_rctx(ctx->cc_sec->ps_import, + ctx->cc_sec, ctx); + } + + sptlrpc_cli_ctx_wakeup(ctx); +} + +static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx) +{ + LASSERT(gctx->gc_base.cc_sec); + + if (gctx->gc_mechctx) { + lgss_delete_sec_context(&gctx->gc_mechctx); + gctx->gc_mechctx = NULL; + } + + if (!rawobj_empty(&gctx->gc_svc_handle)) { + /* forward ctx: mark buddy reverse svcctx soon-expire. */ + if (!sec_is_reverse(gctx->gc_base.cc_sec) && + !rawobj_empty(&gctx->gc_svc_handle)) + gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle); + + rawobj_free(&gctx->gc_svc_handle); + } + + rawobj_free(&gctx->gc_handle); +} + +/** + * Based on sequence number algorithm as specified in RFC 2203. + * + * Modified for our own problem: arriving request has valid sequence number, + * but unwrapping request might cost a long time, after that its sequence + * are not valid anymore (fall behind the window). It rarely happen, mostly + * under extreme load. + * + * Note we should not check sequence before verifying the integrity of incoming + * request, because just one attacking request with high sequence number might + * cause all following requests be dropped. 
+ * + * So here we use a multi-phase approach: prepare 2 sequence windows, + * "main window" for normal sequence and "back window" for fall behind sequence. + * and 3-phase checking mechanism: + * 0 - before integrity verification, perform an initial sequence checking in + * main window, which only tries and doesn't actually set any bits. if the + * sequence is high above the window or fits in the window and the bit + * is 0, then accept and proceed to integrity verification. otherwise + * reject this sequence. + * 1 - after integrity verification, check in main window again. if this + * sequence is high above the window or fits in the window and the bit + * is 0, then set the bit and accept; if it fits in the window but bit + * already set, then reject; if it falls behind the window, then proceed + * to phase 2. + * 2 - check in back window. if it is high above the window or fits in the + * window and the bit is 0, then set the bit and accept. otherwise reject. + * + * \return 1: looks like a replay + * \return 0: is ok + * \return -1: is a replay + * + * Note phase 0 is necessary, because otherwise replay attacking request of + * sequence which between the 2 windows can't be detected. + * + * This mechanism can't totally solve the problem, but could help reduce the + * number of valid requests be dropped. + */ +static +int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq, + __u32 seq_num, int phase) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (seq_num > *max_seq) { + /* + * 1. high above the window + */ + if (phase == 0) + return 0; + + if (seq_num >= *max_seq + win_size) { + memset(window, 0, win_size / 8); + *max_seq = seq_num; + } else { + while(*max_seq < seq_num) { + (*max_seq)++; + __clear_bit((*max_seq) % win_size, window); + } + } + __set_bit(seq_num % win_size, window); + } else if (seq_num + win_size <= *max_seq) { + /* + * 2. low behind the window + */ + if (phase == 0 || phase == 2) + goto replay; + + CWARN("seq %u is %u behind (size %d), check backup window\n", + seq_num, *max_seq - win_size - seq_num, win_size); + return 1; + } else { + /* + * 3. fit into the window + */ + switch (phase) { + case 0: + if (test_bit(seq_num % win_size, window)) + goto replay; + break; + case 1: + case 2: + if (__test_and_set_bit(seq_num % win_size, window)) + goto replay; + break; + } + } + + return 0; + +replay: + CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n", + seq_num, + seq_num + win_size > *max_seq ? "in" : "behind", + phase == 2 ? "backup " : "main", + *max_seq, win_size); + return -1; +} + +/* + * Based on sequence number algorithm as specified in RFC 2203. 
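+ *
+ * This is the entry point for the multi-phase scheme described above
+ * gss_do_check_seq(): @set == 0 runs phase 0 (peek only) against the
+ * main window, while @set != 0 runs phase 1 against the main window
+ * and, if the sequence has fallen behind it, phase 2 against the back
+ * window.
+ *
+ * For illustration, assuming a 1024-slot main window with max_seq at
+ * 5000: seq 5100 is ahead of the window and, on the final check,
+ * slides it forward; seq 4500 fits inside and is accepted only if its
+ * bit is still clear; seq 3000 already falls behind and is rejected at
+ * the initial peek. The back window only comes into play when a
+ * sequence that passed the initial peek falls behind the main window
+ * while the request is being verified.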
+ * + * if @set == 0: initial check, don't set any bit in window + * if @sec == 1: final check, set bit in window + */ +int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set) +{ + int rc = 0; + + spin_lock(&ssd->ssd_lock); + + if (set == 0) { + /* + * phase 0 testing + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 0); + if (unlikely(rc)) + gss_stat_oos_record_svc(0, 1); + } else { + /* + * phase 1 checking main window + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 1); + switch (rc) { + case -1: + gss_stat_oos_record_svc(1, 1); + /* fall through */ + case 0: + goto exit; + } + /* + * phase 2 checking back window + */ + rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK, + &ssd->ssd_max_back, seq_num, 2); + if (rc) + gss_stat_oos_record_svc(2, 1); + else + gss_stat_oos_record_svc(2, 0); + } +exit: + spin_unlock(&ssd->ssd_lock); + return rc; +} + +/*************************************** + * cred APIs * + ***************************************/ + +static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, + int msgsize, int privacy) +{ + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx, + struct sptlrpc_flavor *flvr, + int reply, int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT); + + if ((!reply && !read) || (reply && read)) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_cli_payload(ctx, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_cli_payload(ctx, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + return (ctx->cc_vcred.vc_uid == vcred->vc_uid); +} + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_CTX_NEW) + strlcat(buf, "new,", bufsize); + if (flags & PTLRPC_CTX_UPTODATE) + strlcat(buf, "uptodate,", bufsize); + if (flags & PTLRPC_CTX_DEAD) + strlcat(buf, "dead,", bufsize); + if (flags & PTLRPC_CTX_ERROR) + strlcat(buf, "error,", bufsize); + if (flags & PTLRPC_CTX_CACHED) + strlcat(buf, "cached,", bufsize); + if (flags & PTLRPC_CTX_ETERNAL) + strlcat(buf, "eternal,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); +} + +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + __u32 flags = 0, seq, svc; + int rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(req->rq_cli_ctx == ctx); + + /* nothing to do for context negotiation RPCs */ + if (req->rq_ctx_init) + RETURN(0); + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + flags |= LUSTRE_GSS_PACK_USER; + +redo: + seq = atomic_inc_return(&gctx->gc_seq); + + rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx, + ctx->cc_sec->ps_part, + flags, gctx->gc_proc, seq, svc, + &gctx->gc_handle); + if (rc < 0) + RETURN(rc); + + /* gss_sign_msg() msg might take long time to finish, in which period + * more rpcs could be wrapped up and sent out. 
if we found too many + * of them we should repack this rpc, because sent it too late might + * lead to the sequence number fall behind the window on server and + * be dropped. also applies to gss_cli_ctx_seal(). + * + * Note: null mode doesn't check sequence number. */ + if (svc != SPTLRPC_SVC_NULL && + atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) { + int behind = atomic_read(&gctx->gc_seq) - seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry signing\n", req, behind); + goto redo; + } + + req->rq_reqdata_len = rc; + RETURN(0); +} + +static +int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct gss_header *ghdr) +{ + struct gss_err_header *errhdr; + int rc; + + LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR); + + errhdr = (struct gss_err_header *) ghdr; + + CWARN("req x%llu/t%llu, ctx %p idx %#llx(%u->%s): " + "%sserver respond (%08x/%08x)\n", + req->rq_xid, req->rq_transno, ctx, + gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + sec_is_reverse(ctx->cc_sec) ? "reverse" : "", + errhdr->gh_major, errhdr->gh_minor); + + /* context fini rpc, let it failed */ + if (req->rq_ctx_fini) { + CWARN("context fini rpc failed\n"); + return -EINVAL; + } + + /* reverse sec, just return error, don't expire this ctx because it's + * crucial to callback rpcs. note if the callback rpc failed because + * of bit flip during network transfer, the client will be evicted + * directly. so more gracefully we probably want let it retry for + * number of times. */ + if (sec_is_reverse(ctx->cc_sec)) + return -EINVAL; + + if (errhdr->gh_major != GSS_S_NO_CONTEXT && + errhdr->gh_major != GSS_S_BAD_SIG) + return -EACCES; + + /* server return NO_CONTEXT might be caused by context expire + * or server reboot/failover. we try to refresh a new ctx which + * be transparent to upper layer. + * + * In some cases, our gss handle is possible to be incidentally + * identical to another handle since the handle itself is not + * fully random. In krb5 case, the GSS_S_BAD_SIG will be + * returned, maybe other gss error for other mechanism. + * + * if we add new mechanism, make sure the correct error are + * returned in this case. */ + CWARN("%s: server might lost the context, retrying\n", + errhdr->gh_major == GSS_S_NO_CONTEXT ? "NO_CONTEXT" : "BAD_SIG"); + + sptlrpc_cli_ctx_expire(ctx); + + /* we need replace the ctx right here, otherwise during + * resent we'll hit the logic in sptlrpc_req_refresh_ctx() + * which keep the ctx with RESEND flag, thus we'll never + * get rid of this ctx. */ + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc == 0) + req->rq_resend = 1; + + return rc; +} + +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr, *reqhdr; + struct lustre_msg *msg = req->rq_repdata; + __u32 major; + int pack_bulk, swabbed, rc = 0; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* special case for context negotiation, rq_repmsg/rq_replen actually + * are not used currently. 
but early reply always be treated normally */ + if (req->rq_ctx_init && !req->rq_early) { + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + RETURN(0); + } + + if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = ptlrpc_rep_need_swab(req); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr)); + LASSERT(reqhdr); + + if (ghdr->gh_version != reqhdr->gh_version) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, reqhdr->gh_version); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (ghdr->gh_seq != reqhdr->gh_seq) { + CERROR("seqnum %u mismatch, expect %u\n", + ghdr->gh_seq, reqhdr->gh_seq); + RETURN(-EPROTO); + } + + if (ghdr->gh_svc != reqhdr->gh_svc) { + CERROR("svc %u mismatch, expect %u\n", + ghdr->gh_svc, reqhdr->gh_svc); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc); + if (major != GSS_S_COMPLETE) { + CERROR("failed to verify reply: %x\n", major); + RETURN(-EPERM); + } + + if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { + __u32 cksum; + + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, 1, 0), + lustre_msg_buflen(msg, 1)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: " + "%08x != %08x\n", cksum, msg->lm_cksum); + RETURN(-EPROTO); + } + } + + if (pack_bulk) { + /* bulk checksum is right after the lustre msg */ + if (msg->lm_bufcount < 3) { + CERROR("Invalid reply bufcount %u\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + rc = bulk_sec_desc_unpack(msg, 2, swabbed); + if (rc) { + CERROR("unpack bulk desc: %d\n", rc); + RETURN(rc); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unknown gss proc %d\n", ghdr->gh_proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u32 buflens[2], major; + int wiresize, rc; + ENTRY; + + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_reqlen); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* final clear data length */ + req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount, + req->rq_clrbuf->lm_buflens); + + /* calculate wire data length */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1); + wiresize = lustre_msg_size_v2(2, buflens); + + /* allocate wire buffer */ + if (req->rq_pool) { + /* pre-allocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf != req->rq_clrbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + } else { + OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); 
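+		/* freshly allocated wire buffer (not from the request pool);
+		 * record its size so the error path below and
+		 * gss_free_reqbuf() can release it later */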
+ req->rq_reqbuf_len = wiresize; + } + + lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + /* gss header */ + ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = gctx->gc_proc; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = gctx->gc_handle.len; + memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len); + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + +redo: + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + + /* buffer objects */ + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = req->rq_clrdata_len; + msgobj.data = (__u8 *) req->rq_clrbuf; + token.len = lustre_msg_buflen(req->rq_reqbuf, 1); + token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj, + req->rq_clrbuf_len, &token); + if (major != GSS_S_COMPLETE) { + CERROR("priv: wrap message error: %08x\n", major); + GOTO(err_free, rc = -EPERM); + } + LASSERT(token.len <= buflens[1]); + + /* see explain in gss_cli_ctx_sign() */ + if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq > + GSS_SEQ_REPACK_THRESHOLD)) { + int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry sealing\n", req, behind); + + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + goto redo; + } + + /* now set the final wire data length */ + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0); + RETURN(0); + +err_free: + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + RETURN(rc); +} + +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr; + struct lustre_msg *msg = req->rq_repdata; + int msglen, pack_bulk, swabbed, rc; + __u32 major; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_ctx_init == 0); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + swabbed = ptlrpc_rep_need_swab(req); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, PTLRPC_GSS_VERSION); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + /* use rq_repdata_len as buffer size, which assume unseal + * doesn't need extra memory space. 
for precise control, we'd + * better calculate out actual buffer size as + * (repbuf_len - offset - repdata_len) */ + major = gss_unseal_msg(gctx->gc_mechctx, msg, + &msglen, req->rq_repdata_len); + if (major != GSS_S_COMPLETE) { + CERROR("failed to unwrap reply: %x\n", major); + rc = -EPERM; + break; + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EPROTO); + } + + if (msg->lm_bufcount < 1) { + CERROR("Invalid reply buffer: empty\n"); + RETURN(-EPROTO); + } + + if (pack_bulk) { + if (msg->lm_bufcount < 2) { + CERROR("bufcount %u: missing bulk sec desc\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + /* bulk checksum is the last segment */ + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1, + swabbed)) + RETURN(-EPROTO); + } + + req->rq_repmsg = lustre_msg_buf(msg, 0, 0); + req->rq_replen = msg->lm_buflens[0]; + + rc = 0; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unexpected proc %d\n", ghdr->gh_proc); + rc = -EPERM; + } + + RETURN(rc); +} + +/********************************************* + * reverse context installation * + *********************************************/ + +static inline +int gss_install_rvs_svc_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx); +} + +/********************************************* + * GSS security APIs * + *********************************************/ +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct ptlrpc_sec *sec; + + LASSERT(imp); + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); + + gsec->gs_mech = lgss_subflavor_to_mech( + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + if (!gsec->gs_mech) { + CERROR("gss backend 0x%x not found\n", + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + return -EOPNOTSUPP; + } + + spin_lock_init(&gsec->gs_lock); + gsec->gs_rvs_hdl = 0ULL; + + /* initialize upper ptlrpc_sec */ + sec = &gsec->gs_base; + sec->ps_policy = policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_flvr = *sf; + sec->ps_import = class_import_get(imp); + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + + if (!svcctx) { + sec->ps_gc_interval = GSS_GC_INTERVAL; + } else { + LASSERT(sec_is_reverse(sec)); + + /* never do gc on reverse sec */ + sec->ps_gc_interval = 0; + } + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_add_user(); + + CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? 
"reverse " : ""), + policy->sp_name, gsec); + return 0; +} + +void gss_sec_destroy_common(struct gss_sec *gsec) +{ + struct ptlrpc_sec *sec = &gsec->gs_base; + ENTRY; + + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + + if (gsec->gs_mech) { + lgss_mech_put(gsec->gs_mech); + gsec->gs_mech = NULL; + } + + class_import_put(sec->ps_import); + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_del_user(); + + EXIT; +} + +void gss_sec_kill(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + gctx->gc_win = 0; + atomic_set(&gctx->gc_seq, 0); + + INIT_HLIST_NODE(&ctx->cc_cache); + atomic_set(&ctx->cc_refcount, 0); + ctx->cc_sec = sec; + ctx->cc_ops = ctxops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_NEW; + ctx->cc_vcred = *vcred; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + /* take a ref on belonging sec, balanced in ctx destroying */ + atomic_inc(&sec->ps_refcount); + /* statistic only */ + atomic_inc(&sec->ps_nctx); + + CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + return 0; +} + +/* + * return value: + * 1: the context has been taken care of by someone else + * 0: proceed to really destroy the context locally + */ +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + /* + * remove UPTODATE flag of reverse ctx thus we won't send fini rpc, + * this is to avoid potential problems of client side reverse svc ctx + * be mis-destroyed in various recovery senarios. anyway client can + * manage its reverse ctx well by associating it with its buddy ctx. + */ + if (sec_is_reverse(sec)) + ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE; + + if (gctx->gc_mechctx) { + /* the final context fini rpc will use this ctx too, and it's + * asynchronous which finished by request_out_callback(). so + * we add refcount, whoever drop finally drop the refcount to + * 0 should responsible for the rest of destroy. */ + atomic_inc(&ctx->cc_refcount); + + gss_do_ctx_fini_rpc(gctx); + gss_cli_ctx_finalize(gctx); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return 1; + } + + if (sec_is_reverse(sec)) + CWARN("reverse sec %p: destroy ctx %p\n", + ctx->cc_sec, ctx); + else + CWARN("%s@%p: destroy ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + return 0; +} + +static +int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int bufsize, txtsize; + int bufcnt = 2; + __u32 buflens[5]; + ENTRY; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - user descriptor (optional) + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_udesc) { + buflens[bufcnt] = sptlrpc_current_user_desc_size(); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 0, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + bufsize = lustre_msg_size_v2(bufcnt, buflens); + + if (!req->rq_reqbuf) { + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = bufsize; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= bufsize); + memset(req->rq_reqbuf, 0, bufsize); + } + + lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize); + LASSERT(req->rq_reqmsg); + + /* pack user desc here, later we might leave current user's process */ + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, 2); + + RETURN(0); +} + +static +int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 ibuflens[3], wbuflens[2]; + int ibufcnt; + int clearsize, wiresize; + ENTRY; + + LASSERT(req->rq_clrbuf == NULL); + LASSERT(req->rq_clrbuf_len == 0); + + /* Inner (clear) buffers + * - lustre message + * - user descriptor (optional) + * - bulk checksum (optional) + */ + ibufcnt = 1; + ibuflens[0] = msgsize; + + if (req->rq_pack_udesc) + ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size(); + if (req->rq_pack_bulk) + ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, 0, + req->rq_bulk_read); + + clearsize = lustre_msg_size_v2(ibufcnt, ibuflens); + /* to allow append padding during encryption */ + clearsize += GSS_MAX_CIPHER_BLOCK; + + /* Wrapper (wire) buffers + * - gss header + * - cipher text + */ + wbuflens[0] = PTLRPC_GSS_HEADER_SIZE; + wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1); + wiresize = lustre_msg_size_v2(2, wbuflens); + + if (req->rq_pool) { + /* rq_reqbuf is preallocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + + memset(req->rq_reqbuf, 0, req->rq_reqbuf_len); + + /* if the pre-allocated buffer is big enough, we just pack + * both clear buf & request buf in it, to avoid more alloc. 
*/ + if (clearsize + wiresize <= req->rq_reqbuf_len) { + req->rq_clrbuf = + (void *) (((char *) req->rq_reqbuf) + wiresize); + } else { + CWARN("pre-allocated buf size %d is not enough for " + "both clear (%d) and cipher (%d) text, proceed " + "with extra allocation\n", req->rq_reqbuf_len, + clearsize, wiresize); + } + } + + if (!req->rq_clrbuf) { + clearsize = size_roundup_power2(clearsize); + + OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize); + if (!req->rq_clrbuf) + RETURN(-ENOMEM); + } + req->rq_clrbuf_len = clearsize; + + lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_clrbuf, 1); + + RETURN(0); +} + +/* + * NOTE: any change of request buffer allocation should also consider + * changing enlarge_reqbuf() series functions. + */ +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_reqbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_reqbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + int privacy; + ENTRY; + + LASSERT(!req->rq_pool || req->rq_reqbuf); + privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; + + if (!req->rq_clrbuf) + goto release_reqbuf; + + /* release clear buffer */ + LASSERT(privacy); + LASSERT(req->rq_clrbuf_len); + + if (req->rq_pool == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + + req->rq_clrbuf = NULL; + req->rq_clrbuf_len = 0; + +release_reqbuf: + if (!req->rq_pool && req->rq_reqbuf) { + LASSERT(req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + + EXIT; +} + +static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize) +{ + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, bufsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = bufsize; + return 0; +} + +static +int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int txtsize; + __u32 buflens[4]; + int bufcnt = 2; + int alloc_size; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. 
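+	 *
+	 * e.g. an INTG reply that carries a bulk descriptor ends up with
+	 * 4 segments: 0 = gss header, 1 = lustre msg, 2 = bulk sec desc,
+	 * 3 = signature.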
+ */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_integ; + + return do_alloc_repbuf(req, alloc_size); +} + +static +int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int txtsize; + __u32 buflens[2]; + int bufcnt; + int alloc_size; + + /* inner buffers */ + bufcnt = 1; + buflens[0] = msgsize; + + if (req->rq_pack_bulk) + buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + txtsize = lustre_msg_size_v2(bufcnt, buflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffers */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + /* add space for early reply */ + alloc_size += gss_at_reply_off_priv; + + return do_alloc_repbuf(req, alloc_size); +} + +int gss_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + ENTRY; + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_repbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_repbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_repdata_len = 0; +} + +static int get_enlarged_msgsize(struct lustre_msg *msg, + int segment, int newsize) +{ + int save, newmsg_size; + + LASSERT(newsize >= msg->lm_buflens[segment]); + + save = msg->lm_buflens[segment]; + msg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment] = save; + + return newmsg_size; +} + +static int get_enlarged_msgsize2(struct lustre_msg *msg, + int segment1, int newsize1, + int segment2, int newsize2) +{ + int save1, save2, newmsg_size; + + LASSERT(newsize1 >= msg->lm_buflens[segment1]); + LASSERT(newsize2 >= msg->lm_buflens[segment2]); + + save1 = msg->lm_buflens[segment1]; + save2 = msg->lm_buflens[segment2]; + msg->lm_buflens[segment1] = newsize1; + msg->lm_buflens[segment2] = newsize2; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment1] = save1; + msg->lm_buflens[segment2] = save2; + + return newmsg_size; +} + +static +int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int txtsize, sigsize = 0, i; + int newmsg_size, newbuf_size; + + /* + * gss header is at seg 0; + * embedded msg is at seg 1; + * signature (if any) is at the 
last seg + */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len > req->rq_reqlen); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg); + + /* 1. compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]); + + /* 2. compute new wrapper msg size */ + if (svc == SPTLRPC_SVC_NULL) { + /* no signature, get size directly */ + newbuf_size = get_enlarged_msgsize(req->rq_reqbuf, + 1, newmsg_size); + } else { + txtsize = req->rq_reqbuf->lm_buflens[0]; + + if (svc == SPTLRPC_SVC_INTG) { + for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++) + txtsize += req->rq_reqbuf->lm_buflens[i]; + txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1]; + } + + sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0); + LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf)); + + newbuf_size = get_enlarged_msgsize2( + req->rq_reqbuf, + 1, newmsg_size, + msg_last_segidx(req->rq_reqbuf), + sigsize); + } + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + /* do enlargement, from wrapper to embedded, from end to begin */ + if (svc != SPTLRPC_SVC_NULL) + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, + msg_last_segidx(req->rq_reqbuf), + sigsize); + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +static +int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newclrbuf; + int newmsg_size, newclrbuf_size, newcipbuf_size; + __u32 buflens[3]; + + /* + * embedded msg is at seg 0 of clear buffer; + * cipher text is at seg 2 of cipher buffer; + */ + LASSERT(req->rq_pool || + (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0)); + LASSERT(req->rq_reqbuf == NULL || + (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3)); + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_clrbuf_len > req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg); + + /* compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + + /* compute new clear buffer size */ + newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size); + newclrbuf_size += GSS_MAX_CIPHER_BLOCK; + + /* compute new cipher buffer size */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0); + buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1); + newcipbuf_size = 
lustre_msg_size_v2(3, buflens); + + /* handle the case that we put both clear buf and cipher buf into + * pre-allocated single buffer. */ + if (unlikely(req->rq_pool) && + req->rq_clrbuf >= req->rq_reqbuf && + (char *) req->rq_clrbuf < + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + /* it couldn't be better we still fit into the + * pre-allocated buffer. */ + if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) { + void *src, *dst; + + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + /* move clear text backward. */ + src = req->rq_clrbuf; + dst = (char *) req->rq_reqbuf + newcipbuf_size; + + memmove(dst, src, req->rq_clrbuf_len); + + req->rq_clrbuf = (struct lustre_msg *) dst; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } else { + /* sadly we have to split out the clear buffer */ + LASSERT(req->rq_reqbuf_len >= newcipbuf_size); + LASSERT(req->rq_clrbuf_len < newclrbuf_size); + } + } + + if (req->rq_clrbuf_len < newclrbuf_size) { + newclrbuf_size = size_roundup_power2(newclrbuf_size); + + OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size); + if (newclrbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len); + + if (req->rq_reqbuf == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + } + + req->rq_clrbuf = newclrbuf; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + RETURN(0); +} + +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize); + case SPTLRPC_SVC_PRIV: + return gss_enlarge_reqbuf_priv(sec, req, segment, newsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +int gss_sec_install_rctx(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_sec *gsec; + struct gss_cli_ctx *gctx; + int rc; + + gsec = container_of(sec, struct gss_sec, gs_base); + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + rc = gss_install_rvs_svc_ctx(imp, gsec, gctx); + return rc; +} + +/******************************************** + * server side API * + ********************************************/ + +static inline +int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx) +{ + LASSERT(grctx); + return (grctx->src_init || grctx->src_init_continue || + grctx->src_err_notify); +} + +static +void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx) +{ + if (grctx->src_ctx) + 
gss_svc_upcall_put_ctx(grctx->src_ctx); + + sptlrpc_policy_put(grctx->src_base.sc_policy); + OBD_FREE_PTR(grctx); +} + +static inline +void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + atomic_inc(&grctx->src_base.sc_refcount); +} + +static inline +void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + + if (atomic_dec_and_test(&grctx->src_base.sc_refcount)) + gss_svc_reqctx_free(grctx); +} + +static +int gss_svc_sign(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx, + __u32 svc) +{ + __u32 flags = 0; + int rc; + ENTRY; + + LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0)); + + /* embedded lustre_msg might have been shrunk */ + if (req->rq_replen != rs->rs_repbuf->lm_buflens[1]) + lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1); + + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + + rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx, + LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA, + grctx->src_wirectx.gw_seq, svc, NULL); + if (rc < 0) + RETURN(rc); + + rs->rs_repdata_len = rc; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = gss_at_reply_off_integ; + else + req->rq_reply_off = 0; + } else { + if (svc == SPTLRPC_SVC_NULL) + rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(rs->rs_repbuf, 1, 0), + lustre_msg_buflen(rs->rs_repbuf, 1)); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct ptlrpc_reply_state *rs; + struct gss_err_header *ghdr; + int replen = sizeof(struct ptlrpc_body); + int rc; + ENTRY; + + //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE)) + // RETURN(-EINVAL); + + grctx->src_err_notify = 1; + grctx->src_reserve_len = 0; + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("could not pack reply, err %d\n", rc); + RETURN(rc); + } + + /* gss hdr */ + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr)); + ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_ERR; + ghdr->gh_major = major; + ghdr->gh_minor = minor; + ghdr->gh_handle.len = 0; /* fake context handle */ + + rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n", + major, minor, libcfs_nid2str(req->rq_peer.nid)); + RETURN(0); +} + +static +int gss_svc_handle_init(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct lustre_msg *reqbuf = req->rq_reqbuf; + struct obd_uuid *uuid; + struct obd_device *target; + rawobj_t uuid_obj, rvs_hdl, in_token; + __u32 lustre_svc; + __u32 *secdata, seclen; + int swabbed, rc; + ENTRY; + + CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc, + libcfs_nid2str(req->rq_peer.nid)); + + req->rq_ctx_init = 1; + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + CERROR("unexpected bulk flag\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) { + CERROR("proc %u: invalid handle length %u\n", + gw->gw_proc, gw->gw_handle.len); + 
RETURN(SECSVC_DROP); + } + + if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4){ + CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + /* ctx initiate payload is in last segment */ + secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0); + seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1]; + + if (seclen < 4 + 4) { + CERROR("sec size %d too small\n", seclen); + RETURN(SECSVC_DROP); + } + + /* lustre svc type */ + lustre_svc = le32_to_cpu(*secdata++); + seclen -= 4; + + /* extract target uuid, note this code is somewhat fragile + * because touched internal structure of obd_uuid */ + if (rawobj_extract(&uuid_obj, &secdata, &seclen)) { + CERROR("failed to extract target uuid\n"); + RETURN(SECSVC_DROP); + } + uuid_obj.data[uuid_obj.len - 1] = '\0'; + + uuid = (struct obd_uuid *) uuid_obj.data; + target = class_uuid2obd(uuid); + if (!target || target->obd_stopping || !target->obd_set_up) { + CERROR("target '%s' is not available for context init (%s)\n", + uuid->uuid, target == NULL ? "no target" : + (target->obd_stopping ? "stopping" : "not set up")); + RETURN(SECSVC_DROP); + } + + /* extract reverse handle */ + if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) { + CERROR("failed extract reverse handle\n"); + RETURN(SECSVC_DROP); + } + + /* extract token */ + if (rawobj_extract(&in_token, &secdata, &seclen)) { + CERROR("can't extract token\n"); + RETURN(SECSVC_DROP); + } + + rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc, + &rvs_hdl, &in_token); + if (rc != SECSVC_OK) + RETURN(rc); + + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? "oss" : "root")); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor\n"); + RETURN(SECSVC_DROP); + } + if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0); + } + + req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0); + req->rq_reqlen = lustre_msg_buflen(reqbuf, 1); + + RETURN(rc); +} + +/* + * last segment must be the gss signature. 
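+ *
+ * The incoming request buffer mirrors the layout built by the client in
+ * gss_alloc_reqbuf_intg(): seg 0 is the gss header, seg 1 the embedded
+ * lustre msg, followed by the optional user descriptor and bulk sec
+ * descriptor, with the MIC in the last segment.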
+ */ +static +int gss_svc_verify_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int offset = 2; + int swabbed; + ENTRY; + + *major = GSS_S_COMPLETE; + + if (msg->lm_bufcount < 2) { + CERROR("Too few segments (%u) in request\n", msg->lm_bufcount); + RETURN(-EINVAL); + } + + if (gw->gw_svc == SPTLRPC_SVC_NULL) + goto verified; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to verify request: %x\n", *major); + RETURN(-EACCES); + } + + if (gctx->gsc_reverse == 0 && + gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + +verified: + swabbed = ptlrpc_req_need_swab(req); + + /* user descriptor */ + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("no user desc included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + /* check bulk_sec_desc data */ + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("missing bulk sec descriptor\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(msg, 1, 0); + req->rq_reqlen = msg->lm_buflens[1]; + RETURN(0); +} + +static +int gss_svc_unseal_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int swabbed, msglen, offset = 1; + ENTRY; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_unseal_msg(gctx->gsc_mechctx, msg, + &msglen, req->rq_reqdata_len); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to unwrap request: %x\n", *major); + RETURN(-EACCES); + } + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EINVAL); + } + req->rq_reqdata_len = msglen; + + if (msg->lm_bufcount < 1) { + CERROR("Invalid buffer: is empty\n"); + RETURN(-EINVAL); + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < offset + 1) { + CERROR("no user descriptor included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if 
(msg->lm_bufcount < offset + 1) { + CERROR("no bulk checksum included\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0); + req->rq_reqlen = req->rq_reqbuf->lm_buflens[0]; + RETURN(0); +} + +static +int gss_svc_handle_data(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major = 0; + int rc = 0; + ENTRY; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + major = GSS_S_NO_CONTEXT; + goto error; + } + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_verify_request(req, grctx, gw, &major); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_unseal_request(req, grctx, gw, &major); + break; + default: + CERROR("unsupported gss service %d\n", gw->gw_svc); + rc = -EINVAL; + } + + if (rc == 0) + RETURN(SECSVC_OK); + + CERROR("svc %u failed: major 0x%08x: req xid %llu ctx %p idx " + "%#llx(%u->%s)\n", gw->gw_svc, major, req->rq_xid, + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); +error: + /* we only notify client in case of NO_CONTEXT/BAD_SIG, which + * might happen after server reboot, to allow recovery. */ + if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) && + gss_pack_err_notify(req, major, 0) == 0) + RETURN(SECSVC_COMPLETE); + + RETURN(SECSVC_DROP); +} + +static +int gss_svc_handle_destroy(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major; + ENTRY; + + req->rq_ctx_fini = 1; + req->rq_no_reply = 1; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + CDEBUG(D_SEC, "invalid gss context handle for destroy.\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_svc != SPTLRPC_SVC_INTG) { + CERROR("svc %u is not supported in destroy.\n", gw->gw_svc); + RETURN(SECSVC_DROP); + } + + if (gss_svc_verify_request(req, grctx, gw, &major)) + RETURN(SECSVC_DROP); + + CWARN("destroy svc ctx %p idx %#llx (%u->%s)\n", + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (req->rq_reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2, + ptlrpc_req_need_swab(req))) { + CERROR("Mal-formed user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0); + } + + RETURN(SECSVC_OK); +} + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req) +{ + struct gss_header *ghdr; + struct gss_svc_reqctx *grctx; + struct gss_wire_ctx *gw; + int swabbed, rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_svc_ctx == NULL); + + if (req->rq_reqbuf->lm_bufcount < 2) { + CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(SECSVC_DROP); + } + + /* sanity checks 
*/ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u, expect %u\n", ghdr->gh_version, + PTLRPC_GSS_VERSION); + RETURN(SECSVC_DROP); + } + + req->rq_sp_from = ghdr->gh_sp; + + /* alloc grctx data */ + OBD_ALLOC_PTR(grctx); + if (!grctx) + RETURN(SECSVC_DROP); + + grctx->src_base.sc_policy = sptlrpc_policy_get(policy); + atomic_set(&grctx->src_base.sc_refcount, 1); + req->rq_svc_ctx = &grctx->src_base; + gw = &grctx->src_wirectx; + + /* save wire context */ + gw->gw_flags = ghdr->gh_flags; + gw->gw_proc = ghdr->gh_proc; + gw->gw_seq = ghdr->gh_seq; + gw->gw_svc = ghdr->gh_svc; + rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle); + + /* keep original wire header which subject to checksum verification */ + if (swabbed) + gss_header_swabber(ghdr); + + switch(ghdr->gh_proc) { + case PTLRPC_GSS_PROC_INIT: + case PTLRPC_GSS_PROC_CONTINUE_INIT: + rc = gss_svc_handle_init(req, gw); + break; + case PTLRPC_GSS_PROC_DATA: + rc = gss_svc_handle_data(req, gw); + break; + case PTLRPC_GSS_PROC_DESTROY: + rc = gss_svc_handle_destroy(req, gw); + break; + default: + CERROR("unknown proc %u\n", gw->gw_proc); + rc = SECSVC_DROP; + break; + } + + switch (rc) { + case SECSVC_OK: + LASSERT (grctx->src_ctx); + + req->rq_auth_gss = 1; + req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds; + req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss; + req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root; + req->rq_auth_uid = grctx->src_ctx->gsc_uid; + req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid; + break; + case SECSVC_COMPLETE: + break; + case SECSVC_DROP: + gss_svc_reqctx_free(grctx); + req->rq_svc_ctx = NULL; + break; + } + + RETURN(rc); +} + +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_svc_reqctx *grctx; + ENTRY; + + if (svc_ctx == NULL) { + EXIT; + return; + } + + grctx = gss_svc_ctx2reqctx(svc_ctx); + + CWARN("gss svc invalidate ctx %p(%u)\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid); + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + EXIT; +} + +static inline +int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, + int msgsize, int privacy) +{ + /* we should treat early reply normally, but which is actually sharing + * the same ctx with original request, so in this case we should + * ignore the special ctx's special flags */ + if (early == 0 && gss_svc_reqctx_is_special(grctx)) + return grctx->src_reserve_len; + + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx, + struct sptlrpc_flavor *flvr, + int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + if (read) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_mech_payload(NULL, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_mech_payload(NULL, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_reply_state *rs; + int early, privacy, svc, bsd_off = 0; + __u32 ibuflens[2], buflens[4]; + int ibufcnt = 0, bufcnt; + int txtsize, wmsg_size, rs_size; + ENTRY; + + LASSERT(msglen % 8 == 0); + + if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) { + CERROR("client request bulk sec on non-bulk rpc\n"); + RETURN(-EPROTO); + } + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + early = (req->rq_packed_final == 0); + + grctx = 
gss_svc_ctx2reqctx(req->rq_svc_ctx); + if (!early && gss_svc_reqctx_is_special(grctx)) + privacy = 0; + else + privacy = (svc == SPTLRPC_SVC_PRIV); + + if (privacy) { + /* inner clear buffers */ + ibufcnt = 1; + ibuflens[0] = msglen; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = ibufcnt; + ibuflens[ibufcnt++] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + } + + txtsize = lustre_msg_size_v2(ibufcnt, ibuflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffer */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_svc_payload(grctx, early, txtsize, 1); + } else { + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = msglen; + + txtsize = buflens[0]; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = bufcnt; + buflens[bufcnt] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if ((!early && gss_svc_reqctx_is_special(grctx)) || + svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_svc_payload(grctx, early, + txtsize, 0); + } + + wmsg_size = lustre_msg_size_v2(bufcnt, buflens); + + rs_size = sizeof(*rs) + wmsg_size; + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = wmsg_size; + + /* initialize the buffer */ + if (privacy) { + lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL); + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen); + } else { + lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0); + } + + if (bsd_off) { + grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0); + grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf, + bsd_off); + } + + gss_svc_reqctx_addref(grctx); + rs->rs_svc_ctx = req->rq_svc_ctx; + + LASSERT(rs->rs_msg); + req->rq_reply_state = rs; + RETURN(0); +} + +static int gss_svc_seal(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u8 *token_buf; + int token_buflen; + __u32 buflens[2], major; + int msglen, rc; + ENTRY; + + /* get clear data length. 
note embedded lustre_msg might + * have been shrunk */ + if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0)) + msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1); + else + msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + /* temporarily use tail of buffer to hold gss header data */ + LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len); + ghdr = (struct gss_header *) ((char *) rs->rs_repbuf + + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = LUSTRE_SP_ANY; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_DATA; + ghdr->gh_seq = grctx->src_wirectx.gw_seq; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = 0; + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + + /* allocate temporary cipher buffer */ + token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1); + OBD_ALLOC_LARGE(token_buf, token_buflen); + if (token_buf == NULL) + RETURN(-ENOMEM); + + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = msglen; + msgobj.data = (__u8 *) rs->rs_repbuf; + token.len = token_buflen; + token.data = token_buf; + + major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj, + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token); + if (major != GSS_S_COMPLETE) { + CERROR("wrap message error: %08x\n", major); + GOTO(out_free, rc = -EPERM); + } + LASSERT(token.len <= token_buflen); + + /* we are about to override data at rs->rs_repbuf, nullify pointers + * to which to catch further illegal usage. */ + if (req->rq_pack_bulk) { + grctx->src_repbsd = NULL; + grctx->src_repbsd_size = 0; + } + + /* now fill the actual wire data + * - gss header + * - gss token + */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = token.len; + + rs->rs_repdata_len = lustre_msg_size_v2(2, buflens); + LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len); + + lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr, + PTLRPC_GSS_HEADER_SIZE); + memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len); + + /* reply offset */ + if (req->rq_packed_final && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) + req->rq_reply_off = gss_at_reply_off_priv; + else + req->rq_reply_off = 0; + + /* to catch upper layer's further access */ + rs->rs_msg = NULL; + req->rq_repmsg = NULL; + req->rq_replen = 0; + + rc = 0; +out_free: + OBD_FREE_LARGE(token_buf, token_buflen); + RETURN(rc); +} + +int gss_svc_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct gss_wire_ctx *gw = &grctx->src_wirectx; + int early, rc; + ENTRY; + + early = (req->rq_packed_final == 0); + + if (!early && gss_svc_reqctx_is_special(grctx)) { + LASSERT(rs->rs_repdata_len != 0); + + req->rq_reply_off = gss_at_reply_off_integ; + RETURN(0); + } + + /* early reply could happen in many cases */ + if (!early && + gw->gw_proc != PTLRPC_GSS_PROC_DATA && + gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) { + CERROR("proc %d not support\n", gw->gw_proc); + RETURN(-EINVAL); + } + + LASSERT(grctx->src_ctx); + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_sign(req, rs, grctx, gw->gw_svc); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_seal(req, rs, grctx); + break; + default: + CERROR("Unknown service 
%d\n", gw->gw_svc); + GOTO(out, rc = -EINVAL); + } + rc = 0; + +out: + RETURN(rc); +} + +void gss_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct gss_svc_reqctx *grctx; + + LASSERT(rs->rs_svc_ctx); + grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base); + + gss_svc_reqctx_decref(grctx); + rs->rs_svc_ctx = NULL; + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->sc_refcount) == 0); + gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx)); +} + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_cli_ctx *cli_gctx = ctx2gctx(cli_ctx); + struct gss_svc_ctx *svc_gctx = gss_svc_ctx2gssctx(svc_ctx); + struct gss_ctx *mechctx = NULL; + + LASSERT(cli_gctx); + LASSERT(svc_gctx && svc_gctx->gsc_mechctx); + + cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA; + cli_gctx->gc_win = GSS_SEQ_WIN; + + /* The problem is the reverse ctx might get lost in some recovery + * situations, and the same svc_ctx will be used to re-create it. + * if there's callback be sentout before that, new reverse ctx start + * with sequence 0 will lead to future callback rpc be treated as + * replay. + * + * each reverse root ctx will record its latest sequence number on its + * buddy svcctx before be destroyed, so here we continue use it. + */ + atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq); + + if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) { + CERROR("failed to dup svc handle\n"); + goto err_out; + } + + if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) != + GSS_S_COMPLETE) { + CERROR("failed to copy mech context\n"); + goto err_svc_handle; + } + + if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) { + CERROR("failed to dup reverse handle\n"); + goto err_ctx; + } + + cli_gctx->gc_mechctx = mechctx; + gss_cli_ctx_uptodate(cli_gctx); + + return 0; + +err_ctx: + lgss_delete_sec_context(&mechctx); +err_svc_handle: + rawobj_free(&cli_gctx->gc_svc_handle); +err_out: + return -ENOMEM; +} + +static void gss_init_at_reply_offset(void) +{ + __u32 buflens[3]; + int clearsize; + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = lustre_msg_early_size(); + buflens[2] = gss_cli_payload(NULL, buflens[1], 0); + gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens); + + buflens[0] = lustre_msg_early_size(); + clearsize = lustre_msg_size_v2(1, buflens); + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(NULL, clearsize, 0); + buflens[2] = gss_cli_payload(NULL, clearsize, 1); + gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens); +} + +static int __init sptlrpc_gss_init(void) +{ + int rc; + + rc = gss_init_lproc(); + if (rc) + return rc; + + rc = gss_init_cli_upcall(); + if (rc) + goto out_lproc; + + rc = gss_init_svc_upcall(); + if (rc) + goto out_cli_upcall; + + rc = init_null_module(); + if (rc) + goto out_svc_upcall; + + rc = init_kerberos_module(); + if (rc) + goto out_null; + + rc = init_sk_module(); + if (rc) + goto out_kerberos; + + /* register policy after all other stuff be initialized, because it + * might be in used immediately after the registration. 
*/ + + rc = gss_init_keyring(); + if (rc) + goto out_sk; + + rc = gss_init_pipefs(); + if (rc) + goto out_keyring; + + gss_init_at_reply_offset(); + + return 0; + +out_keyring: + gss_exit_keyring(); +out_sk: + cleanup_sk_module(); +out_kerberos: + cleanup_kerberos_module(); +out_null: + cleanup_null_module(); +out_svc_upcall: + gss_exit_svc_upcall(); +out_cli_upcall: + gss_exit_cli_upcall(); +out_lproc: + gss_exit_lproc(); + return rc; +} + +static void __exit sptlrpc_gss_exit(void) +{ + gss_exit_keyring(); + gss_exit_pipefs(); + cleanup_kerberos_module(); + gss_exit_svc_upcall(); + gss_exit_cli_upcall(); + gss_exit_lproc(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre GSS security policy"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(sptlrpc_gss_init); +module_exit(sptlrpc_gss_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c new file mode 100644 index 0000000000000..8338095a43bb1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -0,0 +1,1784 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/import.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct ptlrpc_connect_async_args { + __u64 pcaa_peer_committed; + int pcaa_initial_connect; +}; + +/** + * Updates import \a imp current state to provided \a state value + * Helper function. Must be called under imp_lock. + */ +static void __import_set_state(struct obd_import *imp, + enum lustre_imp_state state) +{ + switch (state) { + case LUSTRE_IMP_CLOSED: + case LUSTRE_IMP_NEW: + case LUSTRE_IMP_DISCON: + case LUSTRE_IMP_CONNECTING: + break; + case LUSTRE_IMP_REPLAY_WAIT: + imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; + break; + default: + imp->imp_replay_state = LUSTRE_IMP_REPLAY; + } + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + ktime_get_real_seconds(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +/* A CLOSED import should remain so. 
*/ +#define IMPORT_SET_STATE_NOLOCK(imp, state) \ +do { \ + if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ + imp, obd2cli_tgt(imp->imp_obd), \ + ptlrpc_import_state_name(imp->imp_state), \ + ptlrpc_import_state_name(state)); \ + __import_set_state(imp, state); \ + } \ +} while(0) + +#define IMPORT_SET_STATE(imp, state) \ +do { \ + spin_lock(&imp->imp_lock); \ + IMPORT_SET_STATE_NOLOCK(imp, state); \ + spin_unlock(&imp->imp_lock); \ +} while(0) + +void ptlrpc_import_enter_resend(struct obd_import *imp) +{ + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); +} +EXPORT_SYMBOL(ptlrpc_import_enter_resend); + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void * data, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. 
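+ *
+ * Editorial sketch, not part of the original Lustre source: callers are
+ * expected to use the return value to decide whether to kick recovery.
+ * For example, ptlrpc_fail_import() below does essentially
+ *	if (ptlrpc_set_import_discon(imp, conn_cnt))
+ *		ptlrpc_pinger_force(imp);
+ * so a request that races with an already-handled disconnection does not
+ * trigger a second reconnect.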
+ */ +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " + "lost; in progress operations using this " + "service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) +{ + ENTRY; + assert_spin_locked(&imp->imp_lock); + + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + + ptlrpc_abort_inflight(imp); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + EXIT; +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_and_unlock_import(imp); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) +{ + long dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time64_t now = ktime_get_real_seconds(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + unsigned int timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). 
+ */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct list_head *tmp, *n; + struct ptlrpc_request *req; + struct l_wait_info lwi; + unsigned int timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. */ + do { + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still " + "unregistering: %d\n", count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 0; + } else { + list_for_each_safe(tmp, n, + &imp->imp_sending_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_safe(tmp, n, + &imp->imp_delayed_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: Unregistering RPCs found (%d). " + "Network is sluggish? Waiting them " + "to error out.\n", cli_tgt, + atomic_read(&imp->imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc != 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. 
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up_all(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + if (imp->imp_deactive != 0) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_invalid = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +void ptlrpc_pinger_force(struct obd_import *imp) +{ + CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + if (imp->imp_state != LUSTRE_IMP_CONNECTING) + ptlrpc_pinger_wake_up(); +} +EXPORT_SYMBOL(ptlrpc_pinger_force); + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + ENTRY; + + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt)) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, " + "auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + + ptlrpc_pinger_force(imp); + } + EXIT; +} + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ +#ifdef ENABLE_PINGER + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + int rc; + + ptlrpc_pinger_force(imp); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state)); + return rc; +#else + ptlrpc_set_import_discon(imp, 0); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0), + &lwi); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +#endif +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). 
We typically choose the connection that we have not tried to connect to
+ * for the longest time
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+	struct obd_import_conn *imp_conn = NULL, *conn;
+	struct obd_export *dlmexp;
+	char *target_start;
+	int target_len, tried_all = 1;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+
+	if (list_empty(&imp->imp_conn_list)) {
+		CERROR("%s: no connections available\n",
+		       imp->imp_obd->obd_name);
+		spin_unlock(&imp->imp_lock);
+		RETURN(-EINVAL);
+	}
+
+	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+		CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n",
+		       imp->imp_obd->obd_name,
+		       libcfs_nid2str(conn->oic_conn->c_peer.nid),
+		       conn->oic_last_attempt);
+
+		/* If we have not tried this connection since
+		   the last successful attempt, go with this one */
+		if ((conn->oic_last_attempt == 0) ||
+		    cfs_time_beforeq_64(conn->oic_last_attempt,
+					imp->imp_last_success_conn)) {
+			imp_conn = conn;
+			tried_all = 0;
+			break;
+		}
+
+		/* If all of the connections have already been tried
+		   since the last successful connection, just choose the
+		   least recently used */
+		if (!imp_conn)
+			imp_conn = conn;
+		else if (cfs_time_before_64(conn->oic_last_attempt,
+					    imp_conn->oic_last_attempt))
+			imp_conn = conn;
+	}
+
+	/* if not found, simply choose the current one */
+	if (!imp_conn || imp->imp_force_reconnect) {
+		LASSERT(imp->imp_conn_current);
+		imp_conn = imp->imp_conn_current;
+		tried_all = 0;
+	}
+	LASSERT(imp_conn->oic_conn);
+
+	/* If we've tried everything, and we're back to the beginning of the
+	   list, increase our timeout and try again. It will be reset when
+	   we do finally connect. (FIXME: really we should wait for all network
+	   state associated with the last connection attempt to drain before
+	   trying to reconnect on it.)
*/ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, "%s: tried all connections, increasing latency " + "to %ds\n", imp->imp_obd->obd_name, at_get(at)); + } + + imp_conn->oic_last_attempt = cfs_time_current_64(); + + /* switch connection, don't mind if it's same as the current one */ + if (imp->imp_connection) + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + LASSERT(dlmexp != NULL); + if (dlmexp->exp_connection) + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + class_export_put(dlmexp); + + if (imp->imp_conn_current != imp_conn) { + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to" + " %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + } + + imp->imp_conn_current = imp_conn; + } + + CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", + imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + + spin_unlock(&imp->imp_lock); + + RETURN(0); +} + +/* + * must be called under imp_lock + */ +static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + struct list_head *tmp; + + /* The requests in committed_list always have smaller transnos than + * the requests in replay_list */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in committed_list"); + LBUG(); + } + return 1; + } + if (!list_empty(&imp->imp_replay_list)) { + tmp = imp->imp_replay_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); + LBUG(); + } + return 1; + } + return 0; +} + +/** + * Attempt to (re)connect import \a imp. This includes all preparations, + * initializing CONNECT RPC request and passing it to ptlrpcd for + * actual sending. + * Returns 0 on success or error code. 
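+ *
+ * Editorial note, not part of the original Lustre source: the request is
+ * packed with the RQF_MDS_CONNECT format regardless of target type, carries
+ * the target UUID, client UUID, DLM handle and obd_connect_data as buffers,
+ * and is queued through ptlrpcd_add_req() with ptlrpc_connect_interpret()
+ * as the reply callback, so the connect itself completes asynchronously.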
+ */ +int ptlrpc_connect_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&imp->imp_connect_data }; + struct ptlrpc_connect_async_args *aa; + int rc; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + RETURN(-EINVAL); + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + RETURN(0); + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING || + imp->imp_connected) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + RETURN(-EALREADY); + } + + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + GOTO(out, rc); + + rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. */ + imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; + imp->imp_connect_data.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &imp->imp_connect_data, NULL); + if (rc) + GOTO(out, rc); + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. 
+ * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args)); + aa = ptlrpc_req_async_args(request); + memset(aa, 0, sizeof *aa); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request); + rc = 0; +out: + if (rc != 0) + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +static int ptlrpc_connect_set_flags(struct obd_import *imp, + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) +{ + static bool warned; + struct client_obd *cli = &imp->imp_obd->u.cli; + + if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && + !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { + LCONSOLE_WARN("%s: MDS %s does not support ibits " + "lock, either very old or invalid: " + "requested %#llx, replied %#llx\n", + imp->imp_obd->obd_name, + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + return -EPROTO; + } + + spin_lock(&imp->imp_lock); + list_del(&imp->imp_conn_current->oic_item); + list_add(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + + if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN || + ocd->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_OFFSET_WARN)) { + /* Sigh, some compilers do not like #ifdef in the middle + of macro arguments */ + const char *older = "older than client. " + "Consider upgrading server"; + const char *newer = "newer than client. " + "Consider recompiling application"; + + LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) " + "is much %s (%s)\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + ocd->ocd_version > LUSTRE_VERSION_CODE ? 
+ newer : older, LUSTRE_VERSION_STRING); + warned = true; + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* Check if server has LU-1252 fix applied to not always swab + * the IR MNE entries. Do this only once per connection. This + * fixup is version-limited, because we don't want to carry the + * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we + * need interop with unpatched 2.2 servers. For newer servers, + * the client will do MNE swabbing only as needed. LU-1644 */ + if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && + OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && + strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) == 0)) + imp->imp_need_mne_swab = 1; + else /* clear if server was upgraded since last connect */ + imp->imp_need_mne_swab = 0; +#endif + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. The server masked off + * the checksum types it doesn't support */ + if ((ocd->ocd_cksum_types & + cksum_types_supported_client()) == 0) { + LCONSOLE_ERROR("The negotiation of the checksum " + "alogrithm to use with server %s " + "failed (%x/%x)\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + cksum_types_supported_client()); + return -EPROTO; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility*/ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + + client_adjust_max_dirty(cli); + + /* Update client max modify RPCs in flight with value returned + * by the server */ + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + cli->cl_max_mod_rpcs_in_flight = min( + cli->cl_max_mod_rpcs_in_flight, + ocd->ocd_maxmodrpcs); + else + cli->cl_max_mod_rpcs_in_flight = 1; + + /* Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. */ + if (old_connect_flags != exp_connect_flags(exp) || init_connect) { + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " + "flags: %#llx\n", imp->imp_obd->obd_name, + ocd->ocd_connect_flags); + imp->imp_obd->obd_namespace->ns_connect_flags = + ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_orig_connect_flags = + ocd->ocd_connect_flags; + } + + if (ocd->ocd_connect_flags & OBD_CONNECT_AT) + /* We need a per-message support flag, because + * a. we don't know if the incoming connect reply + * supports AT or not (in reply_in_callback) + * until we unpack it. + * b. failovered server means export and flags are gone + * (in ptlrpc_send_reply). 
+ * Can only be set when we know AT is supported at + * both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + + return 0; +} + +/** + * Add all replay requests back to unreplied list before start replay, + * so that we can make sure the known replied XID is always increased + * only even if when replaying requests. + */ +static void ptlrpc_prepare_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + if (imp->imp_state != LUSTRE_IMP_REPLAY || + imp->imp_resend_replay) + return; + + /* If the server was restart during repaly, the requests may + * have been added to the unreplied list in former replay. */ + spin_lock(&imp->imp_lock); + + list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) { + if (list_empty(&req->rq_unreplied_list)) + ptlrpc_add_unreplied(req); + } + + list_for_each_entry(req, &imp->imp_replay_list, rq_replay_list) { + if (list_empty(&req->rq_unreplied_list)) + ptlrpc_add_unreplied(req); + } + + imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); +} + +/** + * interpret_reply callback for connect RPCs. + * Looks into returned status of connect operation and decides + * what to do with the import - i.e enter recovery, promote it to + * full state for normal operations of disconnect it due to an error. + */ +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc) +{ + struct ptlrpc_connect_async_args *aa = data; + struct obd_import *imp = request->rq_import; + struct lustre_handle old_hdl; + __u64 old_connect_flags; + int msg_flags; + struct obd_connect_data *ocd; + struct obd_export *exp = NULL; + int ret; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc) { + /* if this reconnect to busy export - not need select new target + * for connecting*/ + imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); + spin_unlock(&imp->imp_lock); + ptlrpc_maybe_ping_import_soon(imp); + GOTO(out, rc); + } + + /* LU-7558: indicate that we are interpretting connect reply, + * pltrpc_connect_import() will not try to reconnect until + * interpret will finish. */ + imp->imp_connected = 1; + spin_unlock(&imp->imp_lock); + + LASSERT(imp->imp_conn_current); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + + ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + /* server replied obd_connect_data is always bigger */ + ocd = req_capsule_server_sized_get(&request->rq_pill, + &RMF_CONNECT_DATA, ret); + + if (ocd == NULL) { + CERROR("%s: no connect data from server\n", + imp->imp_obd->obd_name); + rc = -EPROTO; + GOTO(out, rc); + } + + spin_lock(&imp->imp_lock); + + /* All imports are pingable */ + imp->imp_pingable = 1; + imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; + + imp->imp_connect_data = *ocd; + + CDEBUG(D_HA, "%s: connect to target with instance %u\n", + imp->imp_obd->obd_name, ocd->ocd_instance); + exp = class_conn2export(&imp->imp_dlm_handle); + + spin_unlock(&imp->imp_lock); + + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + CERROR("%s: missing export after connect\n", + imp->imp_obd->obd_name); + GOTO(out, rc = -ENODEV); + } + + /* check that server granted subset of flags we asked for. 
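+	 * Editorial note, not part of the original Lustre source: the test
+	 * below is equivalent to requiring (granted & asked) == granted,
+	 * i.e. every flag the server reports back must be one the client
+	 * actually requested; anything else fails the connect with -EPROTO.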
*/ + if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != + ocd->ocd_connect_flags) { + CERROR("%s: Server didn't grant requested subset of flags: " + "asked=%#llx granted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + GOTO(out, rc = -EPROTO); + } + + if ((ocd->ocd_connect_flags2 & imp->imp_connect_flags2_orig) != + ocd->ocd_connect_flags2) { + CERROR("%s: Server didn't grant requested subset of flags2: " + "asked=%#llx granted=%#llx\n", + imp->imp_obd->obd_name, imp->imp_connect_flags2_orig, + ocd->ocd_connect_flags2); + GOTO(out, rc = -EPROTO); + } + + if (!(imp->imp_connect_flags_orig & OBD_CONNECT_LIGHTWEIGHT) && + (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) && + (imp->imp_connect_flags_orig & OBD_CONNECT_FID) && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION)) { + __u32 major = OBD_OCD_VERSION_MAJOR(ocd->ocd_version); + __u32 minor = OBD_OCD_VERSION_MINOR(ocd->ocd_version); + __u32 patch = OBD_OCD_VERSION_PATCH(ocd->ocd_version); + + /* We do not support the MDT-MDT interoperations with + * different version MDT because of protocol changes. */ + if (unlikely(major != LUSTRE_MAJOR || + minor != LUSTRE_MINOR || + abs(patch - LUSTRE_PATCH) > 3)) { + LCONSOLE_WARN("%s: import %p (%u.%u.%u.%u) tried the " + "connection to different version MDT " + "(%d.%d.%d.%d) %s\n", + imp->imp_obd->obd_name, imp, LUSTRE_MAJOR, + LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX, + major, minor, patch, + OBD_OCD_VERSION_FIX(ocd->ocd_version), + imp->imp_connection->c_remote_uuid.uuid); + + GOTO(out, rc = -EPROTO); + } + } + + old_connect_flags = exp_connect_flags(exp); + exp->exp_connect_data = *ocd; + imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + + /* The net statistics after (re-)connect is not valid anymore, + * because may reflect other routing, etc. */ + at_reinit(&imp->imp_at.iat_net_latency, 0, 0); + ptlrpc_at_adj_net_latency(request, + lustre_msg_get_service_time(request->rq_repmsg)); + + /* Import flags should be updated before waking import at FULL state */ + rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, + aa->pcaa_initial_connect); + class_export_put(exp); + exp = NULL; + + if (rc != 0) + GOTO(out, rc); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "connected to replayable target: %s\n", + obd2cli_tgt(imp->imp_obd)); + } else { + imp->imp_replayable = 0; + spin_unlock(&imp->imp_lock); + } + + /* if applies, adjust the imp->imp_msg_magic here + * according to reply flags */ + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + /* Initial connects are allowed for clients with non-random + * uuids when servers are in recovery. Simply signal the + * servers replay is complete and wait in REPLAY_WAIT. */ + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "connect to %s during recovery\n", + obd2cli_tgt(imp->imp_obd)); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + } + + GOTO(finish, rc = 0); + } + + /* Determine what recovery state to move the import to. 
*/ + if (MSG_CONNECT_RECONNECT & msg_flags) { + memset(&old_hdl, 0, sizeof(old_hdl)); + if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), + sizeof (old_hdl))) { + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due " + "bad handle %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); + GOTO(out, rc = -ENOTCONN); + } + + if (memcmp(&imp->imp_remote_handle, + lustre_msg_get_handle(request->rq_repmsg), + sizeof(imp->imp_remote_handle))) { + int level = msg_flags & MSG_CONNECT_RECOVERING ? + D_HA : D_WARNING; + + /* Bug 16611/14775: if server handle have changed, + * that means some sort of disconnection happened. + * If the server is not in recovery, that also means it + * already erased all of our state because of previous + * eviction. If it is in recovery - we are safe to + * participate since we can reestablish all of our state + * with server again */ + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG(level,"%s@%s changed server handle from " + "%#llx to %#llx" + " but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) " + "after server handle changed from " + "%#llx to %#llx\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> \ + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + if (!(MSG_CONNECT_RECOVERING & msg_flags)) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + GOTO(finish, rc = 0); + } + + } else { + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + } + + if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; " + "marking evicted\n", imp->imp_obd->obd_name); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + + IMPORT_SET_STATE(imp, imp->imp_replay_state); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + LASSERT(imp->imp_replayable); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_last_replay_transno = 0; + imp->imp_replay_cursor = &imp->imp_committed_list; + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { + DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" + " not set: %x)", imp->imp_obd->obd_name, msg_flags); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } + + /* Sanity checks for a reconnected import. */ + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { + CERROR("imp_replayable flag does not match server " + "after reconnect. We should LBUG right here.\n"); + } + + if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && + lustre_msg_get_last_committed(request->rq_repmsg) < + aa->pcaa_peer_committed) { + CERROR("%s went back in time (transno %lld" + " was previously committed, server now claims %lld" + ")! See https://bugzilla.lustre.org/show_bug.cgi?" 
+ "id=9646\n", + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, + lustre_msg_get_last_committed(request->rq_repmsg)); + } + +finish: + ptlrpc_prepare_replay(imp); + rc = ptlrpc_import_recovery_state_machine(imp); + if (rc == -ENOTCONN) { + CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;" + "invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + spin_lock(&imp->imp_lock); + imp->imp_connected = 0; + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + +out: + spin_lock(&imp->imp_lock); + imp->imp_connected = 0; + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + + if (exp != NULL) + class_export_put(exp); + + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import(imp); + } + + if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) + RETURN(-EPROTO); + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + /* Actually servers are only supposed to refuse + connection from liblustre clients, so we should + never see this from VFS context */ + LCONSOLE_ERROR_MSG(0x16a, "Server %s version " + "(%d.%d.%d.%d)" + " refused connection from this client " + "with an incompatible version (%s). " + "Client must be recompiled\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + ptlrpc_deactivate_import(imp); + IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); + } + RETURN(-EPROTO); + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } + + wake_up_all(&imp->imp_recovery_waitq); + RETURN(rc); +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void * data, int rc) +{ + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && + !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + RETURN(0); +} + +/** + * Let server know that we have no requests to replay anymore. 
* Achieved by just sending a PING request
+ */
+static int signal_completed_replay(struct obd_import *imp)
+{
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+		RETURN(0);
+
+	LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+	atomic_inc(&imp->imp_replay_inflight);
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
+					OBD_PING);
+	if (req == NULL) {
+		atomic_dec(&imp->imp_replay_inflight);
+		RETURN(-ENOMEM);
+	}
+
+	ptlrpc_request_set_replen(req);
+	req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+	lustre_msg_add_flags(req->rq_reqmsg,
+			     MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+	if (AT_OFF)
+		req->rq_timeout *= 3;
+	req->rq_interpret_reply = completed_replay_interpret;
+
+	ptlrpcd_add_req(req);
+	RETURN(0);
+}
+
+/**
+ * In kernel code all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * a problem could still be killed or otherwise continue
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+	struct obd_import *imp = data;
+
+	ENTRY;
+
+	unshare_fs_struct();
+
+	CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_connection->c_remote_uuid.uuid);
+
+	ptlrpc_invalidate_import(imp);
+
+	if (obd_dump_on_eviction) {
+		CERROR("dump the log upon eviction\n");
+		libcfs_debug_dumplog();
+	}
+
+	IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+	ptlrpc_import_recovery_state_machine(imp);
+
+	class_import_put(imp);
+	RETURN(0);
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically we have two possible paths. If we came to server and it is not
+ * in recovery, we just enter IMP_EVICTED state, invalidate our import
+ * state and reconnect from scratch.
+ * If we came to server that is in recovery, we enter IMP_REPLAY import state.
+ * We go through our list of requests to replay and send them to server one by
+ * one.
+ * After sending all requests from the list we change import state to
+ * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server
+ * and also all the locks we don't yet have and wait for server to grant us.
+ * After that we send a special "replay completed" request and change import
+ * state to IMP_REPLAY_WAIT.
+ * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER
+ * state and resend all requests from sending list.
+ * After that we promote import to FULL state and send all delayed requests
+ * and import is fully operational after that.
+ *
+ */
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
+{
+	int rc = 0;
+	int inflight;
+	char *target_start;
+	int target_len;
+
+	ENTRY;
+	if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);
+		/* Don't care about MGC eviction */
+		if (strcmp(imp->imp_obd->obd_type->typ_name,
+			   LUSTRE_MGC_NAME) != 0) {
+			LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted "
+					   "by %.*s; in progress operations "
+					   "using this service will fail.\n",
+					   imp->imp_obd->obd_name, target_len,
+					   target_start);
+		}
+		CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid);
+		/* reset vbr_failed flag upon eviction */
+		spin_lock(&imp->imp_lock);
+		imp->imp_vbr_failed = 0;
+		spin_unlock(&imp->imp_lock);
+
+		{
+		struct task_struct *task;
+		/* bug 17802: XXX client_disconnect_export vs connect request
+		 * race.
+		 * if client is evicted at this time then we start
+		 * invalidate thread without reference to import and import can
+		 * be freed at same time. */
+		class_import_get(imp);
+		task = kthread_run(ptlrpc_invalidate_import_thread, imp,
+				   "ll_imp_inval");
+		if (IS_ERR(task)) {
+			class_import_put(imp);
+			rc = PTR_ERR(task);
+			CERROR("error starting invalidate thread: %d\n", rc);
+		} else {
+			rc = 0;
+		}
+		RETURN(rc);
+		}
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY) {
+		CDEBUG(D_HA, "replay requested by %s\n",
+		       obd2cli_tgt(imp->imp_obd));
+		rc = ptlrpc_replay_next(imp, &inflight);
+		if (inflight == 0 &&
+		    atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+			rc = ldlm_replay_locks(imp);
+			if (rc)
+				GOTO(out, rc);
+		}
+		rc = 0;
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
+		if (atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
+			rc = signal_completed_replay(imp);
+			if (rc)
+				GOTO(out, rc);
+		}
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
+		if (atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+		}
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+		struct ptlrpc_connection *conn = imp->imp_connection;
+
+		rc = ptlrpc_resend(imp);
+		if (rc)
+			GOTO(out, rc);
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+		ptlrpc_activate_import(imp);
+
+		LCONSOLE_INFO("%s: Connection restored to %s (at %s)\n",
+			      imp->imp_obd->obd_name,
+			      obd_uuid2str(&conn->c_remote_uuid),
+			      libcfs_nid2str(imp->imp_connection->c_peer.nid));
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_FULL) {
+		wake_up_all(&imp->imp_recovery_waitq);
+		ptlrpc_wake_delayed(imp);
+	}
+
+out:
+	RETURN(rc);
+}
+
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
+{
+	struct ptlrpc_request *req;
+	int rq_opc, rc = 0;
+	ENTRY;
+
+	if (imp->imp_obd->obd_force)
+		GOTO(set_state, rc);
+
+	switch (imp->imp_connect_op) {
+	case OST_CONNECT:
+		rq_opc = OST_DISCONNECT;
+		break;
+	case MDS_CONNECT:
+		rq_opc = MDS_DISCONNECT;
+		break;
+	case MGS_CONNECT:
+		rq_opc = MGS_DISCONNECT;
+		break;
+	default:
+		rc = -EINVAL;
+		CERROR("%s: don't know how to disconnect from %s "
+		       "(connect_op %d): rc = %d\n",
+		       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connect_op, rc);
+		RETURN(rc);
+	}
+
+	if (ptlrpc_import_in_recovery(imp)) {
+		struct l_wait_info lwi;
+		cfs_duration_t timeout;
+
+		if (AT_OFF) {
+			if (imp->imp_server_timeout)
+				timeout = cfs_time_seconds(obd_timeout / 2);
+			else
+				timeout = cfs_time_seconds(obd_timeout);
+		} else {
+			int idx = import_at_get_index(imp,
+				imp->imp_client->cli_request_portal);
+			timeout = cfs_time_seconds(
+				at_get(&imp->imp_at.iat_service_estimate[idx]));
+		}
+
+		lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
+				       back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  !ptlrpc_import_in_recovery(imp), &lwi);
+
+	}
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state != LUSTRE_IMP_FULL)
+		GOTO(out, rc);
+	spin_unlock(&imp->imp_lock);
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
+					LUSTRE_OBD_VERSION, rq_opc);
+	if (req) {
+		/* We are disconnecting, do not retry a failed DISCONNECT rpc if
+		 * it fails. We can get through the above with a down server
+		 * if the client doesn't know the server is gone yet. */
+		req->rq_no_resend = 1;
+
+		/* We want client umounts to happen quickly, no matter the
+		   server state...
*/ + req->rq_timeout = min_t(int, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + else + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) + rc = 0; + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_disconnect_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + ENTRY; + + spin_lock(&imp->imp_lock); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + ptlrpc_abort_inflight(imp); + + EXIT; +} + +/* Adaptive Timeout utils */ +extern unsigned int at_min, at_max, at_history; + +/* Update at_current with the specified value (bounded by at_min and at_max), + * as well as the AT history "bins". + * - Bin into timeslices using AT_BINS bins. + * - This gives us a max of the last at_history seconds without the storage, + * but still smoothing out a return to normalcy from a slow response. + * - (E.g. remember the maximum latency in each minute of the last 4 minutes.) + */ +int at_measured(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; + time64_t now = ktime_get_real_seconds(); + long binlimit = max_t(long, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, (long)(now - at->at_binstart), at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + + /* move bins over */ + shift = (u32)(now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + + if (at_max > 0) + at->at_current = min(at->at_current, at_max); + at->at_current = max(at->at_current, at_min); + + if (at->at_current != old) + CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d " + "(val=%u) hist %u %u %u %u\n", at, + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old value */ + old = (at->at_current != old) ? 
old : 0; + + spin_unlock(&at->at_lock); + return old; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c new file mode 100644 index 0000000000000..d720645bafc16 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -0,0 +1,2541 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include + +#include +#include +#include +#include +#include +#include + +/* struct ptlrpc_request, lustre_msg* */ +#include +#include + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. 
*/ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; +#endif + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *log_cancel_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LOGCOOKIES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *mdt_intent_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CLOSE_DATA +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, 
+ &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_migrate_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_MDT_EPOCH, + &RMF_CLOSE_DATA +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field 
*ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA, + &RMF_FILE_SECCTX_NAME, + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *ldlm_intent_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, + &RMF_CAPA1, +}; + +static const struct req_msg_field *ldlm_intent_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ + &RMF_EADATA, + &RMF_EAVALS, + &RMF_EAVALS_LENS +}; + +static const struct req_msg_field *mds_get_root_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_update_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OUT_UPDATE_HEADER, + &RMF_OUT_UPDATE_BUF, +}; + +static const struct req_msg_field *mds_update_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OUT_UPDATE_REPLY, +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llogd_conn_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_CONN_BODY +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, 
+ &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_ladvise[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OST_LADVISE_HDR, + &RMF_OST_LADVISE, +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *obd_lfsck_request[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REQUEST, +}; + +static const struct req_msg_field *obd_lfsck_reply[] = { + &RMF_PTLRPC_BODY, + &RMF_LFSCK_REPLY, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_OBD_IDX_READ, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) + &RQF_MGS_SET_INFO, +#endif + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_FLD_READ, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GET_ROOT, + &RQF_MDS_STATFS, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_INTENT_CLOSE, + &RQF_MDS_READPAGE, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + 
&RQF_MDS_REINT_CREATE_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_MIGRATE, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_OUT_UPDATE, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_OST_LADVISE, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_DESC_CALLBACK, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_UNLINK, + &RQF_LDLM_INTENT_GETXATTR, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LOG_CANCEL, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_LLOG_ORIGIN_CONNECT, + &RQF_CONNECT, + &RQF_LFSCK_NOTIFY, + &RQF_LFSCK_QUERY, +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = 1 << 0, + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = 1 << 1, + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = 1 << 2 +}; + +struct req_capsule; + +/* + * Request fields. 
+ */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void*))(swabber), \ + .rmf_dumper = (void (*)(void*))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); +#endif + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + +struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGF("obd_quotactl", 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_CLOSE_DATA = + DEFINE_MSGF("data_version", 0, + sizeof(struct close_data), lustre_swab_close_data, NULL); +EXPORT_SYMBOL(RMF_CLOSE_DATA); + +struct req_msg_field RMF_OBD_STATFS = + 
DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_FILE_SECCTX_NAME = + DEFINE_MSGF("file_secctx_name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); + +struct req_msg_field RMF_FILE_SECCTX = + DEFINE_MSGF("file_secctx", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FILE_SECCTX); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. 
+ */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, + sizeof(struct obd_connect_data), + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_EAVALS); + +struct req_msg_field RMF_ACL = DEFINE_MSGF("acl", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +/* + * OST request field. 
+ */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, + dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_EAVALS_LENS = + DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_EAVALS_LENS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("obd_id", 0, + sizeof(__u64), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_OUT_UPDATE = DEFINE_MSGF("object_update", 0, -1, + lustre_swab_object_update_request, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE); + +struct req_msg_field RMF_OUT_UPDATE_REPLY = + DEFINE_MSGF("object_update_reply", 0, -1, + lustre_swab_object_update_reply, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_REPLY); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + lustre_swab_swap_layouts, NULL); 
+EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); + +struct req_msg_field RMF_LFSCK_REQUEST = + DEFINE_MSGF("lfsck_request", 0, sizeof(struct lfsck_request), + lustre_swab_lfsck_request, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REQUEST); + +struct req_msg_field RMF_LFSCK_REPLY = + DEFINE_MSGF("lfsck_reply", 0, sizeof(struct lfsck_reply), + lustre_swab_lfsck_reply, NULL); +EXPORT_SYMBOL(RMF_LFSCK_REPLY); + +struct req_msg_field RMF_OST_LADVISE_HDR = + DEFINE_MSGF("ladvise_request", 0, + sizeof(struct ladvise_hdr), + lustre_swab_ladvise_hdr, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE_HDR); + +struct req_msg_field RMF_OST_LADVISE = + DEFINE_MSGF("ladvise_request", RMF_F_STRUCT_ARRAY, + sizeof(struct lu_ladvise), + lustre_swab_ladvise, NULL); +EXPORT_SYMBOL(RMF_OST_LADVISE); + +struct req_msg_field RMF_OUT_UPDATE_HEADER = DEFINE_MSGF("out_update_header", 0, + -1, lustre_swab_out_update_header, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_HEADER); + +struct req_msg_field RMF_OUT_UPDATE_BUF = DEFINE_MSGF("update_buf", + RMF_F_STRUCT_ARRAY, sizeof(struct out_update_buffer), + lustre_swab_out_update_buffer, NULL); +EXPORT_SYMBOL(RMF_OUT_UPDATE_BUF); + +/* + * Request formats. + */ + +struct req_format { + const char *rf_name; + size_t rf_idx; + struct { + size_t nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); +#endif + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +/* The 'fld_read_server' uses 'RMF_GENERIC_DATA' to hold the 'FLD_QUERY' + * RPC reply that is composed of 'struct lu_seq_range_array'. But there + * is not registered swabber function for 'RMF_GENERIC_DATA'. So the RPC + * peers need to handle the RPC reply with fixed little-endian format. + * + * In theory, we can define new structure with some swabber registered to + * handle the 'FLD_QUERY' RPC reply result automatically. 
But from the + * implementation view, it is not easy to be done within current "struct + * req_msg_field" framework. Because the sequence range array in the RPC + * reply is not fixed length, instead, its length depends on 'lu_seq_range' + * count, that is unknown when prepare the RPC buffer. Generally, for such + * flexible length RPC usage, there will be a field in the RPC layout to + * indicate the data length. But for the 'FLD_READ' RPC, we have no way to + * do that unless we add new length filed that will broken the on-wire RPC + * protocol and cause interoperability trouble with old peer. */ +struct req_format RQF_FLD_READ = + DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server); +EXPORT_SYMBOL(RQF_FLD_READ); + +struct req_format RQF_LOG_CANCEL = + DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); +EXPORT_SYMBOL(RQF_LOG_CANCEL); + +struct req_format RQF_MDS_QUOTACTL = + DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_MDS_QUOTACTL); + +struct req_format RQF_OST_QUOTACTL = + DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_OST_QUOTACTL); + +struct req_format RQF_QUOTA_DQACQ = + DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only); +EXPORT_SYMBOL(RQF_QUOTA_DQACQ); + +struct req_format RQF_LDLM_INTENT_QUOTA = + DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA", + ldlm_intent_quota_client, + ldlm_intent_quota_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA); + +struct req_format RQF_MDS_GET_ROOT = + DEFINE_REQ_FMT0("MDS_GET_ROOT", mds_get_root_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_GET_ROOT); + +struct req_format RQF_MDS_STATFS = + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS); + +struct req_format RQF_MDS_SYNC = + DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_SYNC); + +struct req_format RQF_MDS_GETATTR = + DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR); + +struct req_format RQF_MDS_GETXATTR = + DEFINE_REQ_FMT0("MDS_GETXATTR", + mds_getxattr_client, mds_getxattr_server); +EXPORT_SYMBOL(RQF_MDS_GETXATTR); + +struct req_format RQF_MDS_GETATTR_NAME = + DEFINE_REQ_FMT0("MDS_GETATTR_NAME", + mds_getattr_name_client, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME); + +struct req_format RQF_MDS_REINT = + DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT); + +struct req_format RQF_MDS_REINT_CREATE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE", + mds_reint_create_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE); + +struct req_format RQF_MDS_REINT_CREATE_ACL = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_ACL", + mds_reint_create_acl_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_ACL); + +struct req_format RQF_MDS_REINT_CREATE_SLAVE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA", + mds_reint_create_slave_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE); + +struct req_format RQF_MDS_REINT_CREATE_SYM = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM", + mds_reint_create_sym_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM); + +struct req_format RQF_MDS_REINT_OPEN = + DEFINE_REQ_FMT0("MDS_REINT_OPEN", + mds_reint_open_client, mds_reint_open_server); +EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); + +struct req_format RQF_MDS_REINT_UNLINK = + DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); + +struct req_format RQF_MDS_REINT_LINK = + 
DEFINE_REQ_FMT0("MDS_REINT_LINK", + mds_reint_link_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_MIGRATE = + DEFINE_REQ_FMT0("MDS_REINT_MIGRATE", mds_reint_migrate_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_MIGRATE); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_OUT_UPDATE = + DEFINE_REQ_FMT0("OUT_UPDATE", mds_update_client, + mds_update_server); +EXPORT_SYMBOL(RQF_OUT_UPDATE); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_DESC_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + 
DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_UNLINK = + DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", + ldlm_intent_unlink_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); + +struct req_format RQF_LDLM_INTENT_GETXATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", + ldlm_intent_getxattr_client, + ldlm_intent_getxattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_INTENT_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_intent_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", + llogd_body_only, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_LLOG_ORIGIN_CONNECT = + DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); + +struct 
req_format RQF_CONNECT = + DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); +EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", ost_get_last_fid_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +struct req_format RQF_LFSCK_NOTIFY = + DEFINE_REQ_FMT0("LFSCK_NOTIFY", obd_lfsck_request, empty); +EXPORT_SYMBOL(RQF_LFSCK_NOTIFY); + +struct req_format RQF_LFSCK_QUERY = + DEFINE_REQ_FMT0("LFSCK_QUERY", obd_lfsck_request, obd_lfsck_reply); +EXPORT_SYMBOL(RQF_LFSCK_QUERY); + +struct req_format RQF_OST_LADVISE = + DEFINE_REQ_FMT0("OST_LADVISE", ost_ladvise, ost_body_only); +EXPORT_SYMBOL(RQF_OST_LADVISE); + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. 
+ */ +int req_layout_init(void) +{ + size_t i; + size_t j; + size_t k; + struct req_format *rf = NULL; + + for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { + rf = req_formats[i]; + rf->rf_idx = i; + for (j = 0; j < RCL_NR; ++j) { + LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); + for (k = 0; k < rf->rf_fields[j].nr; ++k) { + struct req_msg_field *field; + + field = (typeof(field))rf->rf_fields[j].d[k]; + LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) + || field->rmf_size > 0); + LASSERT(field->rmf_offset[i][j] == 0); + /* + * k + 1 to detect unused format/field + * combinations. + */ + field->rmf_offset[i][j] = k + 1; + } + } + } + return 0; +} +EXPORT_SYMBOL(req_layout_init); + +void req_layout_fini(void) +{ +} +EXPORT_SYMBOL(req_layout_fini); + +/** + * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. + * + * Actual/expected field sizes are set elsewhere in functions in this file: + * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and + * req_capsule_msg_size(). The \a rc_area information is used by. + * ptlrpc_request_set_replen(). + */ +void req_capsule_init_area(struct req_capsule *pill) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { + pill->rc_area[RCL_CLIENT][i] = -1; + pill->rc_area[RCL_SERVER][i] = -1; + } +} +EXPORT_SYMBOL(req_capsule_init_area); + +/** + * Initialize a pill. + * + * The \a location indicates whether the caller is executing on the client side + * (RCL_CLIENT) or server side (RCL_SERVER).. + */ +void req_capsule_init(struct req_capsule *pill, + struct ptlrpc_request *req, + enum req_location location) +{ + LASSERT(location == RCL_SERVER || location == RCL_CLIENT); + + /* + * Today all capsules are embedded in ptlrpc_request structs, + * but just in case that ever isn't the case, we don't reach + * into req unless req != NULL and pill is the one embedded in + * the req. + * + * The req->rq_pill_init flag makes it safe to initialize a pill + * twice, which might happen in the OST paths as a result of the + * high-priority RPC queue getting peeked at before ost_handle() + * handles an OST RPC. + */ + if (req != NULL && pill == &req->rq_pill && req->rq_pill_init) + return; + + memset(pill, 0, sizeof *pill); + pill->rc_req = req; + pill->rc_loc = location; + req_capsule_init_area(pill); + + if (req != NULL && pill == &req->rq_pill) + req->rq_pill_init = 1; +} +EXPORT_SYMBOL(req_capsule_init); + +void req_capsule_fini(struct req_capsule *pill) +{ +} +EXPORT_SYMBOL(req_capsule_fini); + +static int __req_format_is_sane(const struct req_format *fmt) +{ + return fmt->rf_idx < ARRAY_SIZE(req_formats) && + req_formats[fmt->rf_idx] == fmt; +} + +static struct lustre_msg *__req_msg(const struct req_capsule *pill, + enum req_location loc) +{ + struct ptlrpc_request *req; + + req = pill->rc_req; + return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; +} + +/** + * Set the format (\a fmt) of a \a pill; format changes are not allowed here + * (see req_capsule_extend()). + */ +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) +{ + LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt); + LASSERT(__req_format_is_sane(fmt)); + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_set); + +/** + * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in + * yet. + + * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of + * variable-sized fields. 
The field sizes come from the declared \a rmf_size + * field of a \a pill's \a rc_fmt's RMF's. + */ +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + size_t i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format `%s': ", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). + */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + unsigned int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset--; + + LASSERT(offset < REQ_MAX_FIELD_NR); + return offset; +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static +void +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, int dump, void (*swabber)( void *)) +{ + void *p; + int i; + int n; + int do_swab; + int inout = loc == RCL_CLIENT; + + swabber = swabber ?: field->rmf_swabber; + + if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) && + swabber != NULL && value != NULL) + do_swab = 1; + else + do_swab = 0; + + if (!field->rmf_dumper) + dump = 0; + + if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n", + do_swab ? "unswabbed " : "", field->rmf_name); + field->rmf_dumper(value); + } + if (!do_swab) + return; + swabber(value); + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed field %s " + "follows\n", field->rmf_name); + field->rmf_dumper(value); + } + + return; + } + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. + */ + LASSERT((len % field->rmf_size) == 0); + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, " + "element %d follows\n", + do_swab ? 
"unswabbed " : "", field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) + continue; + swabber(p); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, " + "element %d follows\n", field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)( void *), + int dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + __u32 len; + __u32 offset; + + void *(*getter)(struct lustre_msg *m, __u32 n, __u32 minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? + (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & (RMF_F_STRUCT_ARRAY|RMF_F_NO_SIZE_CHECK)) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if (!(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch " + "%d modulo %u != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max_t(typeof(field->rmf_size), field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field `%s' (%u of %u) " + "in format `%s': %u vs. %u (%s)\n", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + __u32 len; + size_t i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, "Field %s has no dumper function;" + "field size is %u\n", field->rmf_name, len); + } else { + /* It's the dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, 1); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. 
+ */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. + */ +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_server_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_get); + +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 len, void *swabber) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_swab_get); + +/** + * Returns the buffer of a \a pill corresponding to the given \a field from the + * request (if the caller is executing on the server-side) or reply (if the + * caller is executing on the client-side). 
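+ * In other words, it returns the buffer at the location opposite to the
+ * \a pill's own \a rc_loc (pill->rc_loc ^ 1).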
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+ const struct req_msg_field *field)
+{
+ return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc, __u32 size)
+{
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+ if ((size != (__u32)field->rmf_size) &&
+ (field->rmf_size != -1) &&
+ !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+ (size > 0)) {
+ __u32 rmf_size = (__u32)field->rmf_size;
+ if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+ (size % rmf_size != 0)) {
+ CERROR("%s: array field size mismatch "
+ "%u %% %u != 0 (%d)\n",
+ field->rmf_name, size, rmf_size, loc);
+ LBUG();
+ } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+ size < rmf_size) {
+ CERROR("%s: field size mismatch %u != %u (%d)\n",
+ field->rmf_name, size, rmf_size, loc);
+ LBUG();
+ }
+ }
+
+ pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function doesn't correspond with req_capsule_set_size(), which
+ * actually sets the size in pill.rc_area[loc][offset], but this function
+ * returns the message buflen[offset], maybe we should use another name.
+ */
+__u32 req_capsule_get_size(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+ return lustre_msg_buflen(__req_msg(pill, loc),
+ __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+__u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+ return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+ pill->rc_fmt->rf_fields[loc].nr,
+ pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+__u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+ enum req_location loc)
+{
+ __u32 size;
+ size_t i = 0;
+
+ /*
+ * This function should probably LASSERT() that fmt has no fields with
+ * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many
+ * elements in the array there will ultimately be, but then, we could
+ * assume that there will be at least one element, and that's just what
+ * we do.
+ */ + size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); + if (size == 0) + return size; + + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; +} + +/** + * Changes the format of an RPC. + * + * The pill must already have been initialized, which means that it already has + * a request format. The new format \a fmt must be an extension of the pill's + * old format. Specifically: the new format must have as many request and reply + * fields as the old one, and all fields shared by the old and new format must + * be at least as large in the new format. + * + * The new format's fields may be of different "type" than the old format, but + * only for fields that are "opaque" blobs: fields which have a) have no + * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a + * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, + * OBD_SET_INFO has a key field and an opaque value field that gets interpreted + * according to the key field. When the value, according to the key, contains a + * structure (or array thereof) to be swabbed, the format should be changed to + * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set + * accordingly. + */ +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) +{ + int i; + size_t j; + + const struct req_format *old; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + + old = pill->rc_fmt; + /* + * Sanity checking... + */ + for (i = 0; i < RCL_NR; ++i) { + LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); + for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { + const struct req_msg_field *ofield = FMT_FIELD(old, i, j); + + /* "opaque" fields can be transmogrified */ + if (ofield->rmf_swabber == NULL && + (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && + (ofield->rmf_size == -1 || + ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) + continue; + LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); + } + /* + * Last field in old format can be shorter than in new. + */ + LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= + FMT_FIELD(old, i, j)->rmf_size); + } + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_extend); + +/** + * This function returns a non-zero value if the given \a field is present in + * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it + * returns 0. + */ +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; +} +EXPORT_SYMBOL(req_capsule_has_field); + +/** + * Returns a non-zero value if the given \a field is present in the given \a + * pill's PTLRPC request or reply (\a loc), else it returns 0. + */ +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + __u32 offset; + + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + LASSERT(req_capsule_has_field(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; +} +EXPORT_SYMBOL(req_capsule_field_present); + +/** + * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC + * request or reply (\a loc). + * + * This is not the opposite of req_capsule_extend(). 
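+ *
+ * A typical pattern (illustrative only, not tied to a specific caller): a
+ * server handler packs a variable-sized field such as RMF_EADATA at its
+ * maximum size, fills in fewer bytes, and then calls
+ *
+ *	req_capsule_shrink(pill, &RMF_EADATA, bytes_used, RCL_SERVER);
+ *
+ * so that only the bytes actually used are sent back in the reply.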
+ */ +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen, + enum req_location loc) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + __u32 len; + int offset; + + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + LASSERT(req_capsule_has_field(pill, field, loc)); + LASSERT(req_capsule_field_present(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + len = lustre_msg_buflen(msg, offset); + LASSERTF(newlen <= len, "%s:%s, oldlen=%u, newlen=%u\n", + fmt->rf_name, field->rmf_name, len, newlen); + + if (loc == RCL_CLIENT) + pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, + 1); + else + pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, + 1); +} +EXPORT_SYMBOL(req_capsule_shrink); + +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + __u32 newlen) +{ + struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs; + char *from, *to; + int rc; + __u32 offset, len; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(pill->rc_fmt)); + LASSERT(req_capsule_has_field(pill, field, RCL_SERVER)); + LASSERT(req_capsule_field_present(pill, field, RCL_SERVER)); + + len = req_capsule_get_size(pill, field, RCL_SERVER); + offset = __req_capsule_offset(pill, field, RCL_SERVER); + if ((__u32)pill->rc_req->rq_repbuf_len >= + lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen) + CERROR("Inplace repack might be done\n"); + + pill->rc_req->rq_reply_state = NULL; + req_capsule_set_size(pill, field, RCL_SERVER, newlen); + rc = req_capsule_server_pack(pill); + if (rc) { + /* put old rs back, the caller will decide what to do */ + pill->rc_req->rq_reply_state = rs; + return rc; + } + nrs = pill->rc_req->rq_reply_state; + /* Now we need only buffers, copy first chunk */ + to = lustre_msg_buf(nrs->rs_msg, 0, 0); + from = lustre_msg_buf(rs->rs_msg, 0, 0); + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from; + memcpy(to, from, len); + /* check if we have tail and copy it too */ + if (rs->rs_msg->lm_bufcount > offset + 1) { + to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0); + from = lustre_msg_buf(rs->rs_msg, offset + 1, 0); + offset = rs->rs_msg->lm_bufcount - 1; + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) + + cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from; + memcpy(to, from, len); + } + /* drop old reply if everything is fine */ + if (rs->rs_difficult) { + /* copy rs data */ + int i; + + nrs->rs_difficult = 1; + nrs->rs_no_ack = rs->rs_no_ack; + nrs->rs_convert_lock = rs->rs_convert_lock; + for (i = 0; i < rs->rs_nlocks; i++) { + nrs->rs_locks[i] = rs->rs_locks[i]; + nrs->rs_modes[i] = rs->rs_modes[i]; + nrs->rs_nlocks++; + } + rs->rs_nlocks = 0; + rs->rs_difficult = 0; + rs->rs_no_ack = 0; + } + ptlrpc_rs_decref(rs); + return 0; +} +EXPORT_SYMBOL(req_capsule_server_grow); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c new file mode 100644 index 0000000000000..a39db55028dc5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c @@ -0,0 +1,374 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_client.c + * + * remote api for llog - client side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include + +#include +#include +#include + +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_unlock(&ctxt->loc_mutex); \ + return (-EINVAL); \ + } \ + mutex_unlock(&ctxt->loc_mutex); \ +} while(0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p\n", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_unlock(&ctxt->loc_mutex); \ +} while(0) + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
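+ *
+ * Each handler below takes a reference on the context's import via
+ * LLOG_CLIENT_ENTRY() and drops it with LLOG_CLIENT_EXIT() on every return
+ * path, so the import cannot be freed while an llog RPC is in flight.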
*/ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; + EXIT; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_DESTROY); + if (req == NULL) + GOTO(err_exit, rc =-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, + body->lgd_llh_flags); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + RETURN(rc); +} + + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (req == NULL) + GOTO(err_exit, rc =-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + /* -EIO has a special meaning here. 
If llog_osd_next_block() + * reaches the end of the log without finding the desired + * record then it updates *cur_offset and *cur_idx and returns + * -EIO. In llog_process_thread() we use this to detect + * EOF. But we must be careful to distinguish between -EIO + * coming from llog_osd_next_block() and -EIO coming from + * ptlrpc or below. */ + if (rc == -EIO) { + if (req->rq_repmsg == NULL || + lustre_msg_get_status(req->rq_repmsg) != -EIO) + GOTO(out, rc); + } else if (rc < 0) { + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + if (rc < 0) + GOTO(out, rc); + + /* The log records are swabbed as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) + GOTO(out, rc =-EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (req == NULL) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc =-EFAULT); + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) + GOTO(out, rc =-EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp,&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (req == NULL) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (hdr == NULL) + GOTO(out, rc =-EFAULT); + + if (handle->lgh_hdr_size < hdr->llh_hdr.lrh_len) + GOTO(out, rc = -EFAULT); + + memcpy(handle->lgh_hdr, hdr, hdr->llh_hdr.lrh_len); + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + 
CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len || + (llh_hdr->lrh_len & (llh_hdr->lrh_len - 1)) != 0 || + llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { + CERROR("incorrectly sized log header: %#x, " + "expecting %#x (power of two > 8192)\n", + llh_hdr->lrh_len, + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + the servers all close the file at the end of every + other LLOG_ RPC. */ + return(0); +} + +struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_destroy = llog_client_destroy, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c new file mode 100644 index 0000000000000..9036491a1a89a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_net.c @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. 
+ * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + ENTRY; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + RETURN(0); +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c new file mode 100644 index 0000000000000..4864b499120df --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -0,0 +1,338 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/llog_server.c + * + * remote api for llog - server side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include +#include + +static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh) +{ + if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + return llog_cat_close(env, lgh); + else + return llog_close(env, lgh); +} + +/* Only open is supported, no new llog can be created remotely */ +int llog_origin_handle_open(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + char *name = NULL; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) { + name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + if (name == NULL) + RETURN(-EFAULT); + CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name); + } + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d name=%s\n", + obd->obd_name, body->lgd_ctxt_idx, name); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + if (ctxt == NULL) { + CDEBUG(D_WARNING, "%s: no ctxt. group=%p idx=%d name=%s\n", + obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name); + RETURN(-ENODEV); + } + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid, + name, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + + llog_origin_close(req->rq_svc_thread->t_env, loghandle); + EXIT; +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_destroy(struct ptlrpc_request *req) +{ + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc < 0) + RETURN(err_serious(-ENOMEM)); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", + req->rq_export->exp_obd->obd_name, body->lgd_llh_flags); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL); + llog_ctxt_put(ctxt); + RETURN(rc); +} + +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = 
req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_next_block(req->rq_svc_thread->t_env, loghandle, + &repbody->lgd_saved_index, repbody->lgd_index, + &repbody->lgd_cur_offset, ptr, + LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_MIN_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle, + body->lgd_index, ptr, LLOG_MIN_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_ctxt *ctxt; + __u32 flags; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_ctxt, rc); + + /* + * 
llog_init_handle() reads the llog header + */ + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + flags = loghandle->lgh_hdr->llh_flags; + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + *hdr = *loghandle->lgh_hdr; + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +int llog_origin_handle_close(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 0000000000000..ed2058a159ed2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1364 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + + +static struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { OST_LADVISE, "ost_ladvise" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GET_ROOT, "mds_get_root" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" }, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { OBD_LOG_CANCEL, "llog_cancel" }, + { OBD_QC_CALLBACK, "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, + { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, + { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, + { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, + { LLOG_CATINFO, "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, + { SEQ_QUERY, "seq_query" }, + { SEC_CTX_INIT, "sec_ctx_init" }, + { SEC_CTX_INIT_CONT,"sec_ctx_init_cont" }, 
+ { SEC_CTX_FINI, "sec_ctx_fini" }, + { FLD_QUERY, "fld_query" }, + { FLD_READ, "fld_read" }, + { OUT_UPDATE, "out_update" }, + { LFSCK_NOTIFY, "lfsck_notify" }, + { LFSCK_QUERY, "lfsck_query" }, +}; + +static struct ll_eopcode { + __u32 opcode; + const char *opname; +} ll_eopcode_table[EXTRA_LAST_OPC] = { + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, +}; + +const char *ll_opcode2str(__u32 opcode) +{ + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internal.h needs to be modified. + */ + __u32 offset = opcode_offset(opcode); + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); + return ll_rpc_opcode_table[offset].opname; +} + +const int ll_str2opcode(const char *ops) +{ + int i; + + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + if (ll_rpc_opcode_table[i].opname != NULL && + strcmp(ll_rpc_opcode_table[i].opname, ops) == 0) + return ll_rpc_opcode_table[i].opcode; + } + + return -EINVAL; +} + +static const char *ll_eopcode2str(__u32 opcode) +{ + LASSERT(ll_eopcode_table[opcode].opcode == opcode); + return ll_eopcode_table[opcode].opname; +} + +#ifdef CONFIG_PROC_FS +static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, + char *name, struct proc_dir_entry **procroot_ret, + struct lprocfs_stats **stats_ret) +{ + struct proc_dir_entry *svc_procroot; + struct lprocfs_stats *svc_stats; + int i, rc; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; + + LASSERT(*procroot_ret == NULL); + LASSERT(*stats_ret == NULL); + + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,0); + if (svc_stats == NULL) + return; + + if (dir) { + svc_procroot = lprocfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_procroot)) { + lprocfs_free_stats(&svc_stats); + return; + } + } else { + svc_procroot = root; + } + + lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, + svc_counter_config, "req_waittime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + svc_counter_config, "reqbuf_avail", "bufs"); + for (i = 0; i < EXTRA_LAST_OPC; i++) { + char *units; + + switch(i) { + case BRW_WRITE_BYTES: + case BRW_READ_BYTES: + units = "bytes"; + break; + default: + units = "reqs"; + break; + } + lprocfs_counter_init(svc_stats, 
PTLRPC_LAST_CNTR + i, + svc_counter_config, + ll_eopcode2str(i), units); + } + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + lprocfs_counter_init(svc_stats, + EXTRA_MAX_OPCODES + i, svc_counter_config, + ll_opcode2str(opcode), "usec"); + } + + rc = lprocfs_register_stats(svc_procroot, name, svc_stats); + if (rc < 0) { + if (dir) + lprocfs_remove(&svc_procroot); + lprocfs_free_stats(&svc_stats); + } else { + if (dir) + *procroot_ret = svc_procroot; + *stats_ret = svc_stats; + } +} + +static int +ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_hist_nrqbds; + + seq_printf(m, "%d\n", total); + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); + +static int +ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svc->srv_hist_nrqbds_cpt_max; + + seq_printf(m, "%d\n", total); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + int bufpages; + __s64 val; + int rc; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0 || val > INT_MAX) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. */ + bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (val > totalram_pages/(2 * bufpages)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = + max(1, ((int)val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); + +static int +ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + + seq_printf(m, "%d\n", + svc->srv_nthrs_cpt_init * svc->srv_ncpts); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_threads_min_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + __s64 val; + int rc = lprocfs_str_to_s64(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min); + +static int +ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + seq_printf(m, "%d\n", total); + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started); + +static int +ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc 
= m->private; + + seq_printf(m, "%d\n", + svc->srv_nthrs_cpt_limit * svc->srv_ncpts); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_threads_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + __s64 val; + int rc = lprocfs_str_to_s64(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = (int)val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max); + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. + * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + LASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned num_pols; + unsigned pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + ENTRY; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. 
+ */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC(infos, num_pols * sizeof(*infos)); + if (infos == NULL) + GOTO(out, rc = -ENOMEM); +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(infos[pol_idx].pi_arg, tmp.pi_arg, + sizeof(tmp.pi_arg)); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. + */ + } else { + LASSERT(strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) == 0); + LASSERT(strncmp(infos[pol_idx].pi_arg, + tmp.pi_arg, + sizeof(tmp.pi_arg)) == 0); + /** + * Not asserting ptlrpc_nrs_pol_info::pi_state, + * because it may be different between + * instances of the same policy in different + * service partitions. + */ + LASSERT(infos[pol_idx].pi_fallback == + tmp.pi_fallback); + } + + infos[pol_idx].pi_req_queued += tmp.pi_req_queued; + infos[pol_idx].pi_req_started += tmp.pi_req_started; + + pol_idx++; + } + spin_unlock(&nrs->nrs_lock); + } + + /** + * Policy status information output is in YAML format. + * For example: + * + * regular_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 0 + * + * - name: crrn + * state: started + * fallback: no + * queued: 2015 + * active: 384 + * + * high_priority_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 2 + * + * - name: crrn + * state: stopped + * fallback: no + * queued: 0 + * active: 0 + */ + seq_printf(m, "%s\n", !hp ? "\nregular_requests:" : + "high_priority_requests:"); + + for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { + if (strlen(infos[pol_idx].pi_arg) > 0) + seq_printf(m, " - name: %s %s\n", + infos[pol_idx].pi_name, + infos[pol_idx].pi_arg); + else + seq_printf(m, " - name: %s\n", + infos[pol_idx].pi_name); + + + seq_printf(m, " state: %s\n" + " fallback: %s\n" + " queued: %-20d\n" + " active: %-20d\n\n", + nrs_state2str(infos[pol_idx].pi_state), + infos[pol_idx].pi_fallback ? "yes" : "no", + (int)infos[pol_idx].pi_req_queued, + (int)infos[pol_idx].pi_req_started); + } + + if (!hp && nrs_svc_has_hp(svc)) { + memset(infos, 0, num_pols * sizeof(*infos)); + + /** + * Redo the processing for the service's HP NRS heads' policies. + */ + hp = true; + goto again; + } + +out: + if (infos) + OBD_FREE(infos, num_pols * sizeof(*infos)); + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + + +#define LPROCFS_NRS_WR_MAX_ARG (1024) +/** + * The longest valid command string is the maxium policy name size, plus the + * length of the " reg" substring, plus the lenght of argument + */ +#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1 \ + + LPROCFS_NRS_WR_MAX_ARG) + +/** + * Starts and stops a given policy on a PTLRPC service. 
+ * + * Commands consist of the policy name, followed by an optional [reg|hp] token; + * if the optional token is omitted, the operation is performed on both the + * regular and high-priority (if the service has one) NRS head. + */ +static ssize_t +ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *policy_name; + char *queue_name; + int rc = 0; + ENTRY; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) + GOTO(out, rc = -EFAULT); + + cmd[count] = '\0'; + + policy_name = strsep(&cmd, " "); + + if (strlen(policy_name) > NRS_POL_NAME_MAX - 1) + GOTO(out, rc = -EINVAL); + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + queue_name = strsep(&cmd, " "); + /** + * The second token is either an optional [reg|hp] string, + * or arguments + */ + if (strcmp(queue_name, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(queue_name, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else { + if (cmd != NULL) + *(cmd - 1) = ' '; + cmd = queue_name; + } + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, policy_name, + PTLRPC_NRS_CTL_START, + false, cmd); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + RETURN(rc < 0 ? rc : count); +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +static int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. more + * recent), search from it onwards. + * Since the service history is LRU (i.e. 
culled reqs will + * be near the head), we shouldn't have to do long + * re-scans */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq %llu, request seq %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset %llu, request seq %llu, " + "last culled %llu\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t " + "%d can't match size of u64\n", (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! 
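The four PTLRPC_REQ_*POS* macros above are plain 64-bit rotations: SEQ2POS rotates the history sequence right by srv_cpt_bits, so the CPT id that lives in the low bits of the sequence lands in the top bits of the seq_file position, and POS2SEQ rotates it back; the srv_cpt_bits == 0 ternaries only guard the single-partition case, where a 64-bit shift would otherwise be undefined. A standalone sketch with srv_cpt_bits fixed at 2 purely for illustration:

/*
 * Standalone mirror of PTLRPC_REQ_SEQ2POS/POS2SEQ with srv_cpt_bits == 2,
 * showing why seq_read()'s pos++ can no longer disturb the CPT id.
 */
#include <stdio.h>
#include <stdint.h>

#define CPT_BITS 2

static uint64_t seq2pos(uint64_t seq)
{
	/* rotate right: the CPT id in the low bits moves to the top */
	return (seq >> CPT_BITS) | (seq << (64 - CPT_BITS));
}

static uint64_t pos2seq(uint64_t pos)
{
	/* inverse rotation */
	return (pos << CPT_BITS) | (pos >> (64 - CPT_BITS));
}

int main(void)
{
	uint64_t seq = (1234ULL << CPT_BITS) | 3;	/* timestamp 1234, CPT id 3 */
	uint64_t pos = seq2pos(seq);

	printf("seq %#llx -> pos %#llx -> seq %#llx\n",
	       (unsigned long long)seq, (unsigned long long)pos,
	       (unsigned long long)pos2seq(pos));
	/* pos + 1 only advances the timestamp part; the CPT id sits safely
	 * in the top two bits, which is the point of the encoding */
	return 0;
}

This is also why ptlrpc_lprocfs_svc_req_history_next() above steps the raw sequence by (1 << svc->srv_cpt_bits): in position space that is simply pos + 1.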
*/ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle() */ + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + struct timespec64 arrival, sent, arrivaldiff; + char nidstr[LNET_NIDSTR_SIZE]; + + req = srhi->srhi_req; + + libcfs_nid2str_r(req->rq_self, nidstr, sizeof(nidstr)); + arrival.tv_sec = req->rq_arrival_time.tv_sec; + arrival.tv_nsec = req->rq_arrival_time.tv_nsec; + sent.tv_sec = req->rq_sent; + sent.tv_nsec = 0; + arrivaldiff = timespec64_sub(sent, arrival); + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! 
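One subtlety in the show path here: scp_lock is dropped between the seq_file callbacks, so ptlrpc_lprocfs_svc_req_history_show() re-runs the seek for srhi_seq under the lock rather than trusting the cached srhi_req pointer, since that entry may have been culled from the history in the meantime.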
+ */ + seq_printf(s, "%lld:%s:%s:x%llu:%d:%s:%lld.%06lld:%lld.%06llds(%+lld.0s) ", + req->rq_history_seq, nidstr, + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + (s64)req->rq_arrival_time.tv_sec, + (s64)(req->rq_arrival_time.tv_nsec / NSEC_PER_USEC), + (s64)arrivaldiff.tv_sec, + (s64)(arrivaldiff.tv_nsec / NSEC_PER_USEC), + (s64)(req->rq_sent - req->rq_deadline)); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct seq_file *seqf; + int rc; + + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + + rc = seq_open(file, &sops); + if (rc) + return rc; + + seqf = file->private_data; + seqf->private = PDE_DATA(inode); + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) +{ + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + time64_t worstt; + unsigned int cur; + unsigned int worst; + int i; + + if (AT_OFF) { + seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return 0; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur = at_get(&svcpt->scp_at_estimate); + worst = svcpt->scp_at_estimate.at_worst_ever; + worstt = svcpt->scp_at_estimate.at_worst_time; + + seq_printf(m, "%10s : cur %3u worst %3u (at %lld, %llds ago) ", + "service", cur, worst, (s64)worstt, + (s64)(ktime_get_real_seconds() - worstt)); + + lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); + } + + return 0; +} +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v) +{ + struct ptlrpc_service *svc = m->private; + seq_printf(m, "%d\n", svc->srv_hpreq_ratio); + return 0; +} + +static ssize_t +ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0 || val > INT_MAX) + return -ERANGE; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio); + +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, + struct ptlrpc_service *svc) +{ + struct lprocfs_vars lproc_vars[] = { + { .name = "high_priority_ratio", + .fops = &ptlrpc_lprocfs_hp_ratio_fops, + .data = svc }, + { .name = "req_buffer_history_len", + .fops = &ptlrpc_lprocfs_req_history_len_fops, + .data = svc }, + { .name = "req_buffer_history_max", + .fops = &ptlrpc_lprocfs_req_history_max_fops, + .data = svc }, + { .name = "threads_min", + .fops = &ptlrpc_lprocfs_threads_min_fops, + .data = svc }, + { .name = "threads_max", + .fops = &ptlrpc_lprocfs_threads_max_fops, + .data = svc }, + { .name = "threads_started", + .fops = &ptlrpc_lprocfs_threads_started_fops, + .data = svc }, + { .name = "timeouts", + .fops = &ptlrpc_lprocfs_timeouts_fops, + .data = svc }, + { .name = "nrs_policies", + .fops = 
&ptlrpc_lprocfs_nrs_fops, + .data = svc }, + { NULL } + }; + static struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + int rc; + + ptlrpc_lprocfs_register(entry, svc->srv_name, + "stats", &svc->srv_procroot, + &svc->srv_stats); + if (svc->srv_procroot == NULL) + return; + + lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + + rc = lprocfs_seq_create(svc->srv_procroot, "req_history", + 0400, &req_history_fops, svc); + if (rc) + CWARN("Error adding the req_history file\n"); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) +{ + ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", + &obddev->obd_svc_procroot, + &obddev->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (svc_stats == NULL || opc <= 0) + return; + LASSERT(opc < LUSTRE_MAX_OPCODES); + if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) + lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); +} + +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) +{ + struct lprocfs_stats *svc_stats; + int idx; + + if (!req->rq_import) + return; + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (!svc_stats) + return; + idx = lustre_msg_get_opc(req->rq_reqmsg); + switch (idx) { + case OST_READ: + idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; + break; + case OST_WRITE: + idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; + break; + default: + LASSERTF(0, "unsupported opcode %u\n", idx); + break; + } + + lprocfs_counter_add(svc_stats, idx, bytes); +} + +EXPORT_SYMBOL(ptlrpc_lprocfs_brw); + +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot != NULL) + lprocfs_remove(&svc->srv_procroot); + + if (svc->srv_stats) + lprocfs_free_stats(&svc->srv_stats); +} + +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) +{ + /* cleanup first to allow concurrent access to device's + * stats via debugfs to complete safely + */ + lprocfs_obd_cleanup(obd); + + if (obd->obd_svc_procroot) + lprocfs_remove(&obd->obd_svc_procroot); + + if (obd->obd_svc_stats) + lprocfs_free_stats(&obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); + +ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct ptlrpc_request *req; + int rc; + ENTRY; + + LPROCFS_CLIMP_CHECK(obd); + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + LPROCFS_CLIMP_EXIT(obd); + if (req == NULL) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + if (rc >= 0) + RETURN(count); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_ping_seq_write); + +/* Write the connection UUID to this file to attempt to connect to that node. + * The connection UUID is a node's primary NID. For example, + * "echo connection=192.168.0.1@tcp0::instance > .../import". 
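As a usage note for two of the per-service files registered above (the parent directory depends on where the caller roots the proc entry, so it is not spelled out here): writing "<policy> [reg|hp] [arguments]" to nrs_policies goes through ptlrpc_lprocfs_nrs_seq_write(), e.g. "crrn hp" asks for the crrn policy to be started on the high-priority head only, while a bare policy name targets both heads; reading it back yields the YAML document shown earlier. Each req_history line is produced by ptlrpc_lprocfs_svc_req_history_show() and, following its format string, decodes as history-sequence : local NID : client id : xid : request length : RPC phase : arrival time : seconds from arrival to reply send : (signed seconds the reply went out relative to the deadline), followed by whatever the service's so_req_printer appends (the opcode, in the case of target_print_req).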
+ */ +ssize_t +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct obd_import *imp = obd->u.cli.cl_import; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + + if (count > PAGE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) + GOTO(out, count = -EFAULT); + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) + GOTO(out, count = -EINVAL); + + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + __u32 inst; + char *endptr; + + *ptr = 0; + do_reconn = 0; + ptr += 2; /* Skip :: */ + inst = simple_strtol(ptr, &endptr, 10); + if (*endptr) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted " + "target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, "IR: %s has already been connecting to " + "new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + +out: + OBD_FREE(kbuf, count + 1); + return count; +} +EXPORT_SYMBOL(lprocfs_import_seq_write); + +int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) +{ + struct obd_device *obd = m->private; + struct obd_import *imp = obd->u.cli.cl_import; + + LPROCFS_CLIMP_CHECK(obd); + seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_show); + +ssize_t +lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + int rc; + __s64 val; + + rc = lprocfs_str_to_s64(buffer, count, &val); + if (rc < 0) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(obd); + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + LPROCFS_CLIMP_EXIT(obd); + return count; +} +EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); + +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c new file mode 100644 index 0000000000000..999869000c35b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ -0,0 +1,992 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * over \a conn connection to portal \a portal. + * Returns 0 on success or error code. + */ +static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len, + enum lnet_ack_req ack, struct ptlrpc_cb_id *cbid, + lnet_nid_t self, struct lnet_process_id peer_id, + int portal, __u64 xid, unsigned int offset, + struct lnet_handle_md *bulk_cookie) +{ + int rc; + struct lnet_md md; + ENTRY; + + LASSERT (portal != 0); + CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.eq_handle = ptlrpc_eq_h; + LNetInvalidateMDHandle(&md.bulk_handle); + + if (bulk_cookie) { + md.bulk_handle = *bulk_cookie; + md.options |= LNET_MD_BULK_HANDLE; + } + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){ + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind (md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + RETURN (-ENOMEM); + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", + len, portal, xid, offset); + + rc = LNetPut(self, *mdh, ack, + peer_id, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, %lld) failed: %d\n", + libcfs_id2str(peer_id), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + RETURN (0); +} + +static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count) +{ + int i; + + for (i = 0; i < count; i++) + LNetMDUnlink(bd_mds[i]); +} + +#ifdef HAVE_SERVER_SUPPORT +/** + * Prepare bulk descriptor for specified incoming request \a req that + * can fit \a nfrags * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on server-side after request was already + * received. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. 
+ */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + unsigned nfrags, unsigned max_brw, + unsigned int type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops) +{ + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(ptlrpc_is_bulk_op_active(type)); + + desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); + if (desc == NULL) + RETURN(NULL); + + desc->bd_export = class_export_get(exp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = server_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* NB we don't assign rq_bulk here; server-side requests are + * re-used, and the handler frees the bulk desc explicitly. */ + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); + +/** + * Starts bulk transfer for descriptor \a desc on the server. + * Returns 0 on success or error code. + */ +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) +{ + struct obd_export *exp = desc->bd_export; + lnet_nid_t self_nid; + struct lnet_process_id peer_id; + int rc = 0; + __u64 mbits; + int posted_md; + int total_md; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_PUT_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_md_count == 0); + LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); + + LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* + * Multi-Rail: get the preferred self and peer NIDs from the + * request, so they are based on the route taken by the + * message. + */ + self_nid = desc->bd_req->rq_self; + peer_id = desc->bd_req->rq_source; + + /* NB total length may be 0 for a read past EOF, so we send 0 + * length bulks, since the client expects bulk events. + * + * The client may not need all of the bulk mbits for the RPC. The RPC + * used the mbits of the highest bulk mbits needed, and the server masks + * off high bits to get bulk count for this RPC. LU-1431 */ + mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); + total_md = desc->bd_req->rq_mbits - mbits + 1; + + desc->bd_md_count = total_md; + desc->bd_failure = 0; + + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 2; /* SENT and ACK/REPLY */ + + for (posted_md = 0; posted_md < total_md; mbits++) { + md.options = PTLRPC_MD_OPTIONS; + + /* NB it's assumed that source and sink buffer frags are + * page-aligned. 
Otherwise we'd have to send client bulk + * sizes over and split server buffer accordingly */ + ptlrpc_fill_bulk_md(&md, desc, posted_md); + rc = LNetMDBind(md, LNET_UNLINK, &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDBind failed for MD %u: rc = %d\n", + exp->exp_obd->obd_name, posted_md, rc); + LASSERT(rc == -ENOMEM); + if (posted_md == 0) { + desc->bd_md_count = 0; + RETURN(-ENOMEM); + } + break; + } + + /* LU-6441: last md is not sent and desc->bd_md_count == 1 */ + if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB3, + CFS_FAIL_ONCE) && + total_md > 1 && posted_md == total_md - 1) { + posted_md++; + continue; + } + + /* Network is about to get at the memory */ + if (ptlrpc_is_bulk_put_source(desc->bd_type)) + rc = LNetPut(self_nid, desc->bd_mds[posted_md], + LNET_ACK_REQ, peer_id, + desc->bd_portal, mbits, 0, 0); + else + rc = LNetGet(self_nid, desc->bd_mds[posted_md], + peer_id, desc->bd_portal, mbits, 0); + + posted_md++; + if (rc != 0) { + CERROR("%s: failed bulk transfer with %s:%u x%llu: " + "rc = %d\n", exp->exp_obd->obd_name, + libcfs_id2str(peer_id), desc->bd_portal, + mbits, rc); + break; + } + } + + if (rc != 0) { + /* Can't send, so we unlink the MD bound above. The UNLINK + * event this creates will signal completion with failure, + * so we return SUCCESS here! */ + spin_lock(&desc->bd_lock); + desc->bd_md_count -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_md_count >= 0); + + mdunlink_iterate_helper(desc->bd_mds, posted_md); + RETURN(0); + } + + CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d " + "id %s mbits %#llx-%#llx\n", desc->bd_iov_count, + desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id), + mbits - posted_md, mbits - 1); + + RETURN(0); +} + +/** + * Server side bulk abort. Idempotent. Not thread-safe (i.e. only + * serialises with completion callback) + */ +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) +{ + struct l_wait_info lwi; + int rc; + + LASSERT(!in_interrupt()); /* might sleep */ + + if (!ptlrpc_server_bulk_active(desc)) /* completed or */ + return; /* never started */ + + /* We used to poison the pages with 0xab here because we did not want to + * send any meaningful data over the wire for evicted clients (bug 9297) + * However, this is no longer safe now that we use the page cache on the + * OSS (bug 20560) */ + + /* The unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case, to give liblustre + * a chance to run server_bulk_callback()*/ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc), &lwi); + if (rc == 0) + return; + + LASSERT(rc == -ETIMEDOUT); + CWARN("Unexpectedly long timeout: desc %p\n", desc); + } +} +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. 
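Both ends of a bulk derive the same MD set from rq_mbits, just from opposite directions: ptlrpc_start_bulk_transfer() above masks off the low bits with bd_md_max_brw to recover the first matchbits and hence the count, while the client-side ptlrpc_register_bulk() that follows counts the MDs it needs and subtracts. A small standalone sketch with made-up values, assuming at most 4 MDs per bulk so matchbits blocks are 4-aligned:

/*
 * Illustrative numbers only: client and server arriving at the same
 * first-matchbits / MD-count pair for one bulk RPC.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_BRW 4ULL			/* stand-in for desc->bd_md_max_brw */

int main(void)
{
	uint64_t rq_mbits = 0x1002;	/* matchbits of the last MD used */

	/* client, as in ptlrpc_register_bulk(): count, then subtract.
	 * In the real code total_md is bd_iov_count divided by LNET_MAX_IOV,
	 * rounded up; assume that works out to 3 MDs here. */
	int total_md_cli = 3;
	uint64_t first_cli = rq_mbits - total_md_cli + 1;

	/* server, as in ptlrpc_start_bulk_transfer(): mask off the low bits */
	uint64_t first_srv = rq_mbits & ~(MAX_BRW - 1);
	int total_md_srv = (int)(rq_mbits - first_srv + 1);

	printf("client: first %#llx, count %d\n",
	       (unsigned long long)first_cli, total_md_cli);
	printf("server: first %#llx, count %d\n",
	       (unsigned long long)first_srv, total_md_srv);
	/* both report first 0x1000 and a count of 3 */
	return 0;
}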
+ */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct lnet_process_id peer; + int rc = 0; + int rc2; + int posted_md; + int total_md; + __u64 mbits; + struct lnet_handle_me me_h; + struct lnet_md md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_count == 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(ptlrpc_is_bulk_op_passive(desc->bd_type)); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else if (desc->bd_nob_transferred != 0) + /* If the network failed after an RPC was sent, this condition + * could happen. Rather than assert (was here before), return + * an EIO error. */ + RETURN(-EIO); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; + /* rq_mbits is matchbits of the final bulk */ + mbits = req->rq_mbits - total_md + 1; + + LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), + "first mbits = x%llu, last mbits = x%llu\n", + mbits, req->rq_mbits); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + mbits != desc->bd_last_mbits, + "registered: %d rq_mbits: %llu bd_last_mbits: %llu\n", + desc->bd_registered, mbits, desc->bd_last_mbits); + + desc->bd_registered = 1; + desc->bd_last_mbits = mbits; + desc->bd_md_count = total_md; + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < total_md; posted_md++, mbits++) { + md.options = PTLRPC_MD_OPTIONS | + (ptlrpc_is_bulk_op_get(desc->bd_type) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, + LNET_UNLINK, LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + break; + } + + /* About to let the network at it... */ + rc = LNetMDAttach(me_h, md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", + desc->bd_import->imp_obd->obd_name, mbits, + posted_md, rc); + rc2 = LNetMEUnlink(me_h); + LASSERT(rc2 == 0); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_md_count -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_md_count >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + RETURN(-ENOMEM); + } + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the mbits */ + if (desc->bd_md_count != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), + total_md - desc->bd_md_count); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, " + "mbits x%#llx-%#llx, portal %u\n", desc->bd_md_count, + ptlrpc_is_bulk_op_get(desc->bd_type) ? 
"get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); + + RETURN(0); +} + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct l_wait_info lwi; + int rc; + ENTRY; + + LASSERT(!in_interrupt()); /* might sleep */ + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0) + req->rq_bulk_deadline = ktime_get_real_seconds() + LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case to give liblustre + * a chance to run client_bulk_callback() */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK); + + /* Do not wait for unlink to finish. */ + if (async) + RETURN(0); + + for (;;) { + /* The wq argument is ignored by user-space wait_event macros */ + wait_queue_head_t *wq = (req->rq_set != NULL) ? + &req->rq_set->set_waitq : + &req->rq_reply_waitq; + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + RETURN(0); +} + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int service_time = max_t(int, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1); + + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate */ + int oldse = at_measured(&svcpt->scp_at_estimate, service_time); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_time(req->rq_repmsg, service_time); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's an error reply during recovery. 
+ * b=15815 + */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || + req->rq_export->exp_obd->obd_recovering)) { + lustre_msg_set_timeout(req->rq_repmsg, 0); + } else { + time64_t timeout; + + if (req->rq_export && req->rq_reqmsg != NULL && + (flags & PTLRPC_REPLY_EARLY) && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *exp_obd = req->rq_export->exp_obd; + + timeout = ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec + + min_t(time64_t, at_extra, + exp_obd->obd_recovery_timeout / 4); + } else { + timeout = at_get(&svcpt->scp_at_estimate); + } + lustre_msg_set_timeout(req->rq_repmsg, timeout); + } + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on success or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). + */ + LASSERT (req->rq_no_reply == 0); + LASSERT (req->rq_reqbuf != NULL); + LASSERT (rs != NULL); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT (req->rq_repmsg != NULL); + LASSERT (req->rq_repmsg == rs->rs_msg); + LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT (rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + /* In order to keep interoprability with the client (< 2.3) which + * doesn't have pb_jobid in ptlrpc_body, We have to shrink the + * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the + * reply buffer on client will be overflow. + * + * XXX Remove this whenver we drop the interoprability with such client. + */ + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, + sizeof(struct ptlrpc_body_v2), 1); + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, + ptlrpc_status_hton(req->rq_status)); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? 
lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = ktime_get_real_seconds(); + + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, req->rq_self, req->rq_source, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_xid, req->rq_reply_off, NULL); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} + +int ptlrpc_reply (struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + else + return (ptlrpc_send_reply(req, 0)); +} + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + ENTRY; + + if (req->rq_no_reply) + RETURN(0); + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + RETURN(rc); + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + RETURN(rc); +} + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. 
+ */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + int rc2; + int mpflag = 0; + struct lnet_handle_md bulk_cookie; + struct ptlrpc_connection *connection; + struct lnet_handle_me reply_me_h; + struct lnet_md reply_md; + struct obd_import *imp = request->rq_import; + struct obd_device *obd = imp->imp_obd; + ENTRY; + + LNetInvalidateMDHandle(&bulk_cookie); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + RETURN(0); + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && + (imp->imp_state == LUSTRE_IMP_FULL))); + + if (unlikely(obd != NULL && obd->obd_fail)) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = -ENODEV; + RETURN(-ENODEV); + } + + connection = imp->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &imp->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + imp->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + imp->imp_msghdr_flags); + + /* If it's the first time to resend the request for EINPROGRESS, + * we need to allocate a new XID (see after_reply()), it's different + * from the resend for reply timeout. */ + if (request->rq_nr_resend != 0 && + list_empty(&request->rq_unreplied_list)) { + __u64 min_xid = 0; + /* resend for EINPROGRESS, allocate new xid to avoid reply + * reconstruction */ + spin_lock(&imp->imp_lock); + ptlrpc_assign_next_xid_nolock(request); + min_xid = ptlrpc_known_replied_xid(imp); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_last_xid(request->rq_reqmsg, min_xid); + DEBUG_REQ(D_RPCTRACE, request, "Allocating new xid for " + "resend on EINPROGRESS"); + } + + if (request->rq_bulk != NULL) { + ptlrpc_set_bulk_mbits(request); + lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits); + } + + if (list_empty(&request->rq_unreplied_list) || + request->rq_xid <= imp->imp_known_replied_xid) { + DEBUG_REQ(D_ERROR, request, "xid: %llu, replied: %llu, " + "list_empty:%d\n", request->rq_xid, + imp->imp_known_replied_xid, + list_empty(&request->rq_unreplied_list)); + LBUG(); + } + + /** For enabled AT all request should have AT_SUPPORT in the + * FULL import state when OBD_CONNECT_AT is set */ + LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL || + (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) || + !(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_AT)); + + if (request->rq_resend) { + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (request->rq_resend_cb != NULL) + request->rq_resend_cb(request, &request->rq_async_args); + } + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + GOTO(out, rc); + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk (request); + if (rc != 0) + GOTO(out, rc); + /* + * All the mds in the request will have the same cpt + * encoded in the cookie. So we can just get the first + * one. 
+ */ + bulk_cookie = request->rq_bulk->bd_mds[0]; + } + + if (!noreply) { + LASSERT (request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + spin_lock(&request->rq_lock); + request->rq_err = 1; + spin_unlock(&request->rq_lock); + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ + connection->c_peer, request->rq_xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + } + + spin_lock(&request->rq_lock); + /* We are responsible for unlinking the reply buffer */ + request->rq_reply_unlinked = noreply; + request->rq_receiving_reply = !noreply; + /* Clear any flags that may be present from previous sends. */ + request->rq_req_unlinked = 0; + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncated = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eq_handle = ptlrpc_eq_h; + + /* We must see the unlink callback to set rq_reply_unlinked, + * so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + GOTO(cleanup_me, rc = -ENOMEM); + } + + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu" + ", portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd != NULL && obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&imp->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + request->rq_sent_ns = ktime_get_real(); + request->rq_sent = ktime_get_real_seconds(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. 
*/ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + ptlrpc_pinger_sending_on_import(imp); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + LNET_NID_ANY, connection->c_peer, + request->rq_request_portal, + request->rq_xid, 0, &bulk_cookie); + if (likely(rc == 0)) + GOTO(out, rc); + + request->rq_req_unlinked = 1; + ptlrpc_req_finished(request); + if (noreply) + GOTO(out, rc); + + cleanup_me: + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + out: + if (rc == -ENOMEM) { + /* set rq_sent so that this request is treated + * as a delayed send in the upper layers */ + request->rq_sent = ktime_get_real_seconds(); + } + + if (request->rq_memalloc) + cfs_memory_pressure_restore(mpflag); + + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. + */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static struct lnet_process_id match_id = {LNET_NID_ANY, LNET_PID_ANY}; + int rc; + struct lnet_md md; + struct lnet_handle_me me_h; + + CDEBUG(D_NET, "LNetMEAttach: portal %d\n", + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return (-ENOMEM); + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + rc = LNetMEAttach(service->srv_req_portal, + match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + return (-ENOMEM); + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.eq_handle = ptlrpc_eq_h; + + rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) + return (0); + + CERROR("LNetMDAttach failed: %d; \n", rc); + LASSERT (rc == -ENOMEM); + rc = LNetMEUnlink (me_h); + LASSERT (rc == 0); + rqbd->rqbd_refcount = 0; + + return (-ENOMEM); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h new file mode 100644 index 0000000000000..851bdc0dc354a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -0,0 +1,206 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2013, 2014, Intel Corporation. + * + * Author: Joshua Walgenbach + */ + +#ifndef _NODEMAP_INTERNAL_H +#define _NODEMAP_INTERNAL_H + +#include +#include + +#define DEFAULT_NODEMAP "default" + +/* Turn on proc debug interface to allow OSS and + * MDS nodes to configure nodemap independently of + * MGS (since the nodemap distribution is not written + * yet */ +#define NODEMAP_PROC_DEBUG 1 + +/* Default nobody uid and gid values */ + +#define NODEMAP_NOBODY_UID 99 +#define NODEMAP_NOBODY_GID 99 + +struct lprocfs_static_vars; + +/* nodemap root proc directory under fs/lustre */ +extern struct proc_dir_entry *proc_lustre_nodemap_root; +/* flag if nodemap is active */ +extern bool nodemap_active; + +extern struct mutex active_config_lock; +extern struct nodemap_config *active_config; + +struct lu_nid_range { + /* unique id set by mgs */ + unsigned int rn_id; + /* lu_nodemap containing this range */ + struct lu_nodemap *rn_nodemap; + /* list for nodemap */ + struct list_head rn_list; + /* nid interval tree */ + struct interval_node rn_node; +}; + +struct lu_idmap { + /* uid/gid of client */ + __u32 id_client; + /* uid/gid on filesystem */ + __u32 id_fs; + /* tree mapping client ids to filesystem ids */ + struct rb_node id_client_to_fs; + /* tree mappung filesystem to client */ + struct rb_node id_fs_to_client; +}; + +/* first 4 bits of the nodemap_id is the index type */ +struct nodemap_key { + __u32 nk_nodemap_id; + union { + __u32 nk_range_id; + __u32 nk_id_client; + __u32 nk_unused; + }; +}; + +enum nodemap_idx_type { + NODEMAP_EMPTY_IDX = 0, /* index created with blank record */ + NODEMAP_CLUSTER_IDX = 1, /* a nodemap cluster of nodes */ + NODEMAP_RANGE_IDX = 2, /* nid range assigned to a nm cluster */ + NODEMAP_UIDMAP_IDX = 3, /* uid map assigned to a nm cluster */ + NODEMAP_GIDMAP_IDX = 4, /* gid map assigned to a nm cluster */ + NODEMAP_GLOBAL_IDX = 15, /* stores nodemap activation status */ +}; + +#define NM_TYPE_MASK 0x0FFFFFFF +#define NM_TYPE_SHIFT 28 + +static inline enum nodemap_idx_type nm_idx_get_type(unsigned int id) +{ + return id >> NM_TYPE_SHIFT; +} + +static inline __u32 nm_idx_set_type(unsigned int id, enum nodemap_idx_type t) +{ + return (id & NM_TYPE_MASK) | (t << NM_TYPE_SHIFT); +} + +void nodemap_config_set_active(struct nodemap_config *config); +struct lu_nodemap *nodemap_create(const char *name, + struct nodemap_config *config, + bool is_default); +void nodemap_putref(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_lookup(const char *name); + +int nodemap_procfs_init(void); +void nodemap_procfs_exit(void); +int lprocfs_nodemap_register(struct lu_nodemap *nodemap, + bool is_default_nodemap); +void lprocfs_nodemap_remove(struct nodemap_pde *nodemap_pde); +struct lu_nid_range *nodemap_range_find(lnet_nid_t start_nid, + lnet_nid_t end_nid); +struct lu_nid_range *range_create(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t start_nid, lnet_nid_t end_nid, + 
struct lu_nodemap *nodemap, + unsigned int range_id); +void range_destroy(struct lu_nid_range *range); +int range_insert(struct nodemap_range_tree *nm_range_tree, + struct lu_nid_range *data); +void range_delete(struct nodemap_range_tree *nm_range_tree, + struct lu_nid_range *data); +struct lu_nid_range *range_search(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t nid); +struct lu_nid_range *range_find(struct nodemap_range_tree *nm_range_tree, + lnet_nid_t start_nid, lnet_nid_t end_nid); +int range_parse_nidstring(char *range_string, lnet_nid_t *start_nid, + lnet_nid_t *end_nid); +void range_init_tree(void); +struct lu_idmap *idmap_create(__u32 client_id, __u32 fs_id); +struct lu_idmap *idmap_insert(enum nodemap_id_type id_type, + struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete(enum nodemap_id_type id_type, struct lu_idmap *idmap, + struct lu_nodemap *nodemap); +void idmap_delete_tree(struct lu_nodemap *nodemap); +struct lu_idmap *idmap_search(struct lu_nodemap *nodemap, + enum nodemap_tree_type, + enum nodemap_id_type id_type, + __u32 id); +int nm_member_add(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_del(struct lu_nodemap *nodemap, struct obd_export *exp); +void nm_member_delete_list(struct lu_nodemap *nodemap); +struct lu_nodemap *nodemap_classify_nid(lnet_nid_t nid); +void nm_member_reclassify_nodemap(struct lu_nodemap *nodemap); +void nm_member_revoke_locks(struct lu_nodemap *nodemap); +void nm_member_revoke_locks_always(struct lu_nodemap *nodemap); +void nm_member_revoke_all(void); + +int nodemap_add_idmap_helper(struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_add_range_helper(struct nodemap_config *config, + struct lu_nodemap *nodemap, + const lnet_nid_t nid[2], + unsigned int range_id); + +struct rb_node *nm_rb_next_postorder(const struct rb_node *node); +struct rb_node *nm_rb_first_postorder(const struct rb_root *root); +void nodemap_getref(struct lu_nodemap *nodemap); +void nodemap_putref(struct lu_nodemap *nodemap); +int nm_hash_list_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, + void *nodemap_list_head); + +#define nm_rbtree_postorder_for_each_entry_safe(pos, n, \ + root, field) \ + for (pos = nm_rb_first_postorder(root) ? \ + rb_entry(nm_rb_first_postorder(root), typeof(*pos), \ + field) : NULL, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? \ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL; \ + pos != NULL; \ + pos = n, \ + n = (pos && nm_rb_next_postorder(&pos->field)) ? 
\ + rb_entry(nm_rb_next_postorder(&pos->field), \ + typeof(*pos), field) : NULL) + +int nodemap_idx_nodemap_add(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_update(const struct lu_nodemap *nodemap); +int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap); +int nodemap_idx_idmap_add(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_idmap_del(const struct lu_nodemap *nodemap, + enum nodemap_id_type id_type, + const __u32 map[2]); +int nodemap_idx_range_add(const struct lu_nid_range *range, + const lnet_nid_t nid[2]); +int nodemap_idx_range_del(const struct lu_nid_range *range); +int nodemap_idx_nodemap_activate(bool value); +#endif /* _NODEMAP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c new file mode 100644 index 0000000000000..52d3225deba6b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs.c @@ -0,0 +1,1854 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2016, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs.c + * + * Network Request Scheduler (NRS) + * + * Allows to reorder the handling of RPCs at servers. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * NRS core object. + */ +struct nrs_core nrs_core; + +static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_desc->pd_ops->op_policy_init != NULL ? + policy->pol_desc->pd_ops->op_policy_init(policy) : 0; +} + +static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref == 0); + LASSERT(policy->pol_req_queued == 0); + + if (policy->pol_desc->pd_ops->op_policy_fini != NULL) + policy->pol_desc->pd_ops->op_policy_fini(policy); +} + +static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + /** + * The policy may be stopped, but the lprocfs files and + * ptlrpc_nrs_policy instances remain present until unregistration time. + * Do not perform the ctl operation if the policy is stopped, as + * policy->pol_private will be NULL in such a case. + */ + if (policy->pol_state == NRS_POL_STATE_STOPPED) + RETURN(-ENODEV); + + RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ? 
+ policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : + -ENOSYS); +} + +static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) +{ + ENTRY; + + if (policy->pol_desc->pd_ops->op_policy_stop != NULL) + policy->pol_desc->pd_ops->op_policy_stop(policy); + + LASSERT(list_empty(&policy->pol_list_queued)); + LASSERT(policy->pol_req_queued == 0 && + policy->pol_req_started == 0); + + policy->pol_private = NULL; + + policy->pol_state = NRS_POL_STATE_STOPPED; + + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + EXIT; +} + +static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + ENTRY; + + if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTING) + RETURN(-EAGAIN); + + /* In progress or already stopped */ + if (policy->pol_state != NRS_POL_STATE_STARTED) + RETURN(0); + + policy->pol_state = NRS_POL_STATE_STOPPING; + + /* Immediately make it invisible */ + if (nrs->nrs_policy_primary == policy) { + nrs->nrs_policy_primary = NULL; + + } else { + LASSERT(nrs->nrs_policy_fallback == policy); + nrs->nrs_policy_fallback = NULL; + } + + /* I have the only refcount */ + if (policy->pol_ref == 1) + nrs_policy_stop0(policy); + + RETURN(0); +} + +/** + * Transitions the \a nrs NRS head's primary policy to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no + * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. + * + * \param[in] nrs the NRS head to carry out this operation on + */ +static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; + ENTRY; + + if (tmp == NULL) { + /** + * XXX: This should really be RETURN_EXIT, but the latter does + * not currently print anything out, and possibly should be + * fixed to do so. + */ + EXIT; + return; + } + + nrs->nrs_policy_primary = NULL; + + LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); + tmp->pol_state = NRS_POL_STATE_STOPPING; + + if (tmp->pol_ref == 0) + nrs_policy_stop0(tmp); + EXIT; +} + +/** + * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in + * response to an lprocfs command to start a policy. + * + * If a primary policy different to the current one is specified, this function + * will transition the new policy to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition + * the old primary policy (if there is one) to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. + * + * If the fallback policy is specified, this is taken to indicate an instruction + * to stop the current primary policy, without substituting it with another + * primary policy, so the primary policy (if any) is transitioned to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. In + * this case, the fallback policy is only left active in the NRS head. + */ +static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + int rc = 0; + ENTRY; + + /** + * Don't allow multiple starting which is too complex, and has no real + * benefit. 
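+ * Concurrent start attempts are instead serialized through the
+ * ptlrpc_nrs::nrs_policy_starting flag, which is set below and cleared
+ * again once the policy has reached either the STARTED or the STOPPED
+ * state.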
+ */ + if (nrs->nrs_policy_starting) + RETURN(-EAGAIN); + + LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); + + if (policy->pol_state == NRS_POL_STATE_STOPPING) + RETURN(-EAGAIN); + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This is for cases in which the user sets the policy to the + * fallback policy (currently fifo for all services); i.e. the + * user is resetting the policy to the default; so we stop the + * primary policy, if any. + */ + if (policy == nrs->nrs_policy_fallback) { + nrs_policy_stop_primary(nrs); + RETURN(0); + } + + /** + * If we reach here, we must be setting up the fallback policy + * at service startup time, and only a single policy with the + * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can + * register with NRS core. + */ + LASSERT(nrs->nrs_policy_fallback == NULL); + } else { + /** + * Shouldn't start primary policy if w/o fallback policy. + */ + if (nrs->nrs_policy_fallback == NULL) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTED) { + /** + * If the policy argument now is different from the last time, + * stop the policy first and start it again with the new + * argument. + */ + if ((arg != NULL) && (strlen(arg) >= NRS_POL_ARG_MAX)) + return -EINVAL; + + if ((arg == NULL && strlen(policy->pol_arg) == 0) || + (arg != NULL && strcmp(policy->pol_arg, arg) == 0)) + RETURN(0); + + rc = nrs_policy_stop_locked(policy); + if (rc) + RETURN(-EAGAIN); + } + } + + /** + * Increase the module usage count for policies registering from other + * modules. + */ + if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && + !try_module_get(policy->pol_desc->pd_owner)) { + atomic_dec(&policy->pol_desc->pd_refs); + CERROR("NRS: cannot get module for policy %s; is it alive?\n", + policy->pol_desc->pd_name); + RETURN(-ENODEV); + } + + /** + * Serialize policy starting across the NRS head + */ + nrs->nrs_policy_starting = 1; + + policy->pol_state = NRS_POL_STATE_STARTING; + + if (policy->pol_desc->pd_ops->op_policy_start) { + spin_unlock(&nrs->nrs_lock); + + rc = policy->pol_desc->pd_ops->op_policy_start(policy, arg); + + spin_lock(&nrs->nrs_lock); + if (rc != 0) { + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + policy->pol_state = NRS_POL_STATE_STOPPED; + GOTO(out, rc); + } + } + + if (arg != NULL) { + if (strlcpy(policy->pol_arg, arg, sizeof(policy->pol_arg)) >= + sizeof(policy->pol_arg)) { + CERROR("NRS: arg '%s' is too long\n", arg); + GOTO(out, rc = -E2BIG); + } + } else { + policy->pol_arg[0] = '\0'; + } + + policy->pol_state = NRS_POL_STATE_STARTED; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This path is only used at PTLRPC service setup time. + */ + nrs->nrs_policy_fallback = policy; + } else { + /* + * Try to stop the current primary policy if there is one. + */ + nrs_policy_stop_primary(nrs); + + /** + * And set the newly-started policy as the primary one. + */ + nrs->nrs_policy_primary = policy; + } + +out: + nrs->nrs_policy_starting = 0; + + RETURN(rc); +} + +/** + * Increases the policy's usage reference count. + */ +static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) +{ + policy->pol_ref++; +} + +/** + * Decreases the policy's usage reference count, and stops the policy in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). 
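+ *
+ * The caller must hold ptlrpc_nrs::nrs_lock; nrs_policy_put() below is the
+ * wrapper that takes and releases the lock around this call.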
+ */ +static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref > 0); + + policy->pol_ref--; + if (unlikely(policy->pol_ref == 0 && + policy->pol_state == NRS_POL_STATE_STOPPING)) + nrs_policy_stop0(policy); +} + +static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) +{ + spin_lock(&policy->pol_nrs->nrs_lock); + nrs_policy_put_locked(policy); + spin_unlock(&policy->pol_nrs->nrs_lock); +} + +/** + * Find and return a policy by name. + */ +static struct ptlrpc_nrs_policy * nrs_policy_find_locked(struct ptlrpc_nrs *nrs, + char *name) +{ + struct ptlrpc_nrs_policy *tmp; + + list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { + if (strncmp(tmp->pol_desc->pd_name, name, + NRS_POL_NAME_MAX) == 0) { + nrs_policy_get_locked(tmp); + return tmp; + } + } + return NULL; +} + +/** + * Release references for the resource hierarchy moving upwards towards the + * policy instance resource. + */ +static void nrs_resource_put(struct ptlrpc_nrs_resource *res) +{ + struct ptlrpc_nrs_policy *policy = res->res_policy; + + if (policy->pol_desc->pd_ops->op_res_put != NULL) { + struct ptlrpc_nrs_resource *parent; + + for (; res != NULL; res = parent) { + parent = res->res_parent; + policy->pol_desc->pd_ops->op_res_put(policy, res); + } + } +} + +/** + * Obtains references for each resource in the resource hierarchy for request + * \a nrq if it is to be handled by \a policy. + * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). 
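+ *
+ * The fallback policy is expected to always provide a resource, while the
+ * primary policy (if any) may decline to serve the request, in which case
+ * the reference taken on it is dropped again.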
+ */ +static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct ptlrpc_nrs_policy *primary = NULL; + struct ptlrpc_nrs_policy *fallback = NULL; + + memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); + + /** + * Obtain policy references. + */ + spin_lock(&nrs->nrs_lock); + + fallback = nrs->nrs_policy_fallback; + nrs_policy_get_locked(fallback); + + primary = nrs->nrs_policy_primary; + if (primary != NULL) + nrs_policy_get_locked(primary); + + spin_unlock(&nrs->nrs_lock); + + /** + * Obtain resource hierarchy references. + */ + resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); + LASSERT(resp[NRS_RES_FALLBACK] != NULL); + + if (primary != NULL) { + resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, + moving_req); + /** + * A primary policy may exist which may not wish to serve a + * particular request for different reasons; release the + * reference on the policy as it will not be used for this + * request. + */ + if (resp[NRS_RES_PRIMARY] == NULL) + nrs_policy_put(primary); + } +} + +/** + * Releases references to resource hierarchies and policies, because they are no + * longer required; used when request handling has been completed, or the + * request is moving to the high priority NRS head. + * + * \param resp the resource hierarchy that is being released + * + * \see ptlrpcnrs_req_hp_move() + * \see ptlrpc_nrs_req_finalize() + */ +static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp) +{ + struct ptlrpc_nrs_policy *pols[NRS_RES_MAX]; + struct ptlrpc_nrs *nrs = NULL; + int i; + + for (i = 0; i < NRS_RES_MAX; i++) { + if (resp[i] != NULL) { + pols[i] = resp[i]->res_policy; + nrs_resource_put(resp[i]); + resp[i] = NULL; + } else { + pols[i] = NULL; + } + } + + for (i = 0; i < NRS_RES_MAX; i++) { + if (pols[i] == NULL) + continue; + + if (nrs == NULL) { + nrs = pols[i]->pol_nrs; + spin_lock(&nrs->nrs_lock); + } + nrs_policy_put_locked(pols[i]); + } + + if (nrs != NULL) + spin_unlock(&nrs->nrs_lock); +} + +/** + * Obtains an NRS request from \a policy for handling or examination; the + * request should be removed in the 'handling' case. + * + * Calling into this function implies we already know the policy has a request + * waiting to be handled. + * + * \param[in] policy the policy from which a request + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the NRS request to be handled + */ +static inline +struct ptlrpc_nrs_request * nrs_request_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct ptlrpc_nrs_request *nrq; + + LASSERT(policy->pol_req_queued > 0); + + nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force); + + LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy)); + + return nrq; +} + +/** + * Enqueues request \a nrq for later handling, via one one the policies for + * which resources where earlier obtained via nrs_resource_get_safe(). The + * function attempts to enqueue the request first on the primary policy + * (if any), since this is the preferred choice. 
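+ *
+ * Failure to enqueue the request on any of the policies is treated as a
+ * bug (LBUG).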
+ * + * \param nrq the request being enqueued + * + * \see nrs_resource_get_safe() + */ +static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy; + int rc; + int i; + + /** + * Try in descending order, because the primary policy (if any) is + * the preferred choice. + */ + for (i = NRS_RES_MAX - 1; i >= 0; i--) { + if (nrq->nr_res_ptrs[i] == NULL) + continue; + + nrq->nr_res_idx = i; + policy = nrq->nr_res_ptrs[i]->res_policy; + + rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); + if (rc == 0) { + policy->pol_nrs->nrs_req_queued++; + policy->pol_req_queued++; + return; + } + } + /** + * Should never get here, as at least the primary policy's + * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always + * succeed. + */ + LBUG(); +} + +/** + * Called when a request has been handled + * + * \param[in] nrs the request that has been handled; can be used for + * job/resource control. + * + * \see ptlrpc_nrs_req_stop_nolock() + */ +static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); + + if (policy->pol_desc->pd_ops->op_req_stop) + policy->pol_desc->pd_ops->op_req_stop(policy, nrq); + + LASSERT(policy->pol_nrs->nrs_req_started > 0); + LASSERT(policy->pol_req_started > 0); + + policy->pol_nrs->nrs_req_started--; + policy->pol_req_started--; +} + +/** + * Handler for operations that can be carried out on policies. + * + * Handles opcodes that are common to all policy types within NRS core, and + * passes any unknown opcodes to the policy-specific control function. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name. + * \param[in] opc the opcode of the operation being carried out. + * \param[in,out] arg can be used to pass information in and out between when + * carrying an operation; usually data that is private to + * the policy at some level, or generic policy status + * information. + * + * \retval -ve error condition + * \retval 0 operation was carried out successfully + */ +static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct ptlrpc_nrs_policy *policy; + int rc = 0; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) + GOTO(out, rc = -ENOENT); + + if (policy->pol_state != NRS_POL_STATE_STARTED && + policy->pol_state != NRS_POL_STATE_STOPPED) + GOTO(out, rc = -EAGAIN); + + switch (opc) { + /** + * Unknown opcode, pass it down to the policy-specific control + * function for handling. + */ + default: + rc = nrs_policy_ctl_locked(policy, opc, arg); + break; + + /** + * Start \e policy + */ + case PTLRPC_NRS_CTL_START: + rc = nrs_policy_start_locked(policy, arg); + break; + } +out: + if (policy != NULL) + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + RETURN(rc); +} + +/** + * Unregisters a policy by name. + * + * \param[in] nrs the NRS head this policy belongs to. 
+ * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) +{ + struct ptlrpc_nrs_policy *policy = NULL; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + spin_unlock(&nrs->nrs_lock); + + CERROR("Can't find NRS policy %s\n", name); + RETURN(-ENOENT); + } + + if (policy->pol_ref > 1) { + CERROR("Policy %s is busy with %d references\n", name, + (int)policy->pol_ref); + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + RETURN(-EBUSY); + } + + LASSERT(policy->pol_req_queued == 0); + LASSERT(policy->pol_req_started == 0); + + if (policy->pol_state != NRS_POL_STATE_STOPPED) { + nrs_policy_stop_locked(policy); + LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); + } + + list_del(&policy->pol_list); + nrs->nrs_num_pols--; + + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + nrs_policy_fini(policy); + + LASSERT(policy->pol_private == NULL); + OBD_FREE_PTR(policy); + + RETURN(0); +} + +/** + * Register a policy from \policy descriptor \a desc with NRS head \a nrs. + * + * \param[in] nrs the NRS head on which the policy will be registered. + * \param[in] desc the policy descriptor from which the information will be + * obtained to register the policy. + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), GFP_NOFS); + if (policy == NULL) + RETURN(-ENOMEM); + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + RETURN(rc); + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it " + "for %s\n", policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + RETURN(-EEXIST); + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy, NULL); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + RETURN(rc); +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. 
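+ *
+ * The caller must hold ptlrpc_service_part::scp_req_lock; see
+ * ptlrpc_nrs_req_add() for the wrapper that takes the lock.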
+ */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); + EXIT; +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. + * + * \param[in] nrs the NRS head + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * + * \see ptlrpc_service_nrs_setup() + */ +static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_pol_desc *desc; + /* for convenience */ + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int rc = -EINVAL; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (nrs_policy_compatible(svc, desc)) { + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svc->srv_name, rc); + /** + * Fail registration if any of the policies' + * registration fails. + */ + break; + } + } + } + + RETURN(rc); +} + +/** + * Initializes NRS head \a nrs of service partition \a svcpt, and registers all + * compatible policies in NRS core, with the NRS head. 
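+ *
+ * The head's queue type is set to PTLRPC_NRS_QUEUE_REG or
+ * PTLRPC_NRS_QUEUE_HP depending on whether \a nrs is svcpt->scp_nrs_reg or
+ * svcpt->scp_nrs_hp.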
+ * + * \param[in] nrs the NRS head + * \param[in] svcpt the PTLRPC service partition to setup + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, + struct ptlrpc_service_part *svcpt) +{ + int rc; + enum ptlrpc_nrs_queue_type queue; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + if (nrs == &svcpt->scp_nrs_reg) + queue = PTLRPC_NRS_QUEUE_REG; + else if (nrs == svcpt->scp_nrs_hp) + queue = PTLRPC_NRS_QUEUE_HP; + else + LBUG(); + + nrs->nrs_svcpt = svcpt; + nrs->nrs_queue_type = queue; + spin_lock_init(&nrs->nrs_lock); + INIT_LIST_HEAD(&nrs->nrs_policy_list); + INIT_LIST_HEAD(&nrs->nrs_policy_queued); + nrs->nrs_throttling = 0; + + rc = nrs_register_policies_locked(nrs); + + RETURN(rc); +} + +/** + * Allocates a regular and optionally a high-priority NRS head (if the service + * handles high-priority RPCs), and then registers all available compatible + * policies on those NRS heads. + * + * \param[in,out] svcpt the PTLRPC service partition to setup + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + int rc; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + /** + * Initialize the regular NRS head. + */ + nrs = nrs_svcpt2nrs(svcpt, false); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + if (rc < 0) + GOTO(out, rc); + + /** + * Optionally allocate a high-priority NRS head. + */ + if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL) + GOTO(out, rc); + + OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp, + svcpt->scp_service->srv_cptable, + svcpt->scp_cpt); + if (svcpt->scp_nrs_hp == NULL) + GOTO(out, rc = -ENOMEM); + + nrs = nrs_svcpt2nrs(svcpt, true); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + +out: + RETURN(rc); +} + +/** + * Unregisters all policies on all available NRS heads in a service partition; + * called at PTLRPC service unregistration time. + * + * \param[in] svcpt the PTLRPC service partition + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + int rc; + bool hp = false; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + +again: + /* scp_nrs_hp could be NULL due to short of memory. */ + nrs = hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; + /* check the nrs_svcpt to see if nrs is initialized. */ + if (!nrs || !nrs->nrs_svcpt) { + EXIT; + return; + } + nrs->nrs_stopping = 1; + + list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, + pol_list) { + rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + LASSERT(rc == 0); + } + + /** + * If the service partition has an HP NRS head, clean that up as well. + */ + if (!hp && nrs_svcpt_has_hp(svcpt)) { + hp = true; + goto again; + } + + if (hp) + OBD_FREE_PTR(nrs); + + EXIT; +} + +/** + * Returns the descriptor for a policy as identified by by \a name. 
+ * + * \param[in] name the policy name + * + * \retval the policy descriptor + * \retval NULL + */ +static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) +{ + struct ptlrpc_nrs_pol_desc *tmp; + ENTRY; + + list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { + if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) + RETURN(tmp); + } + RETURN(NULL); +} + +/** + * Removes the policy from all supported NRS heads of all partitions of all + * PTLRPC services. + * + * \param[in] desc the policy descriptor to unregister + * + * \retval -ve error + * \retval 0 successfully unregistered policy on all supported NRS heads + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * \pre mutex_is_locked(&ptlrpc_all_services_mutex) + */ +static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_service *svc; + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + bool hp = false; + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_unregister(nrs, desc->pd_name); + /** + * Ignore -ENOENT as the policy may not have registered + * successfully on all service partitions. + */ + if (rc == -ENOENT) { + rc = 0; + } else if (rc != 0) { + CERROR("Failed to unregister NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + RETURN(rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + RETURN(rc); +} + +/** + * Registers a new policy with NRS core. + * + * The function will only succeed if policy registration with all compatible + * service partitions (if any) is successful. + * + * N.B. This function should be called either at ptlrpc module initialization + * time when registering a policy that ships with NRS core, or in a + * module's init() function for policies registering from other modules. + * + * \param[in] conf configuration information for the new policy to register + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_service *svc; + struct ptlrpc_nrs_pol_desc *desc; + int rc = 0; + ENTRY; + + LASSERT(conf != NULL); + LASSERT(conf->nc_ops != NULL); + LASSERT(conf->nc_compat != NULL); + LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, + conf->nc_compat_svc_name != NULL)); + LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, + conf->nc_owner != NULL)); + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + /** + * External policies are not allowed to start immediately upon + * registration, as there is a relatively higher chance that their + * registration might fail. In such a case, some policy instances may + * already have requests queued wen unregistration needs to happen as + * part o cleanup; since there is currently no way to drain requests + * from a policy unless the service is unregistering, we just disallow + * this. 
+ */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && + (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START))) { + CERROR("NRS: failing to register policy %s. Please check " + "policy flags; external policies cannot act as fallback " + "policies, or be started immediately upon registration " + "without interaction with lprocfs\n", conf->nc_name); + RETURN(-EINVAL); + } + + mutex_lock(&nrs_core.nrs_mutex); + + if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) { + CERROR("NRS: failing to register policy %s which has already " + "been registered with NRS core!\n", + conf->nc_name); + GOTO(fail, rc = -EEXIST); + } + + OBD_ALLOC_PTR(desc); + if (desc == NULL) + GOTO(fail, rc = -ENOMEM); + + if (strlcpy(desc->pd_name, conf->nc_name, sizeof(desc->pd_name)) >= + sizeof(desc->pd_name)) { + OBD_FREE_PTR(desc); + GOTO(fail, rc = -E2BIG); + } + desc->pd_ops = conf->nc_ops; + desc->pd_compat = conf->nc_compat; + desc->pd_compat_svc_name = conf->nc_compat_svc_name; + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) + desc->pd_owner = conf->nc_owner; + desc->pd_flags = conf->nc_flags; + atomic_set(&desc->pd_refs, 0); + + /** + * For policies that are held in the same module as NRS (currently + * ptlrpc), do not register the policy with all compatible services, + * as the services will not have started at this point, since we are + * calling from ptlrpc module initialization code. In such cases each + * service will register all compatible policies later, via + * ptlrpc_service_nrs_setup(). + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) + goto internal; + + /** + * Register the new policy on all compatible services + */ + mutex_lock(&ptlrpc_all_services_mutex); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + struct ptlrpc_service_part *svcpt; + int i; + int rc2; + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + struct ptlrpc_nrs *nrs; + bool hp = false; +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + /** + * No need to take a reference to other modules here, as we + * will be calling from the module's init() function. + */ + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) { + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + } + } + + mutex_unlock(&ptlrpc_all_services_mutex); +internal: + list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); +fail: + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_register); + +/** + * Unregisters a previously registered policy with NRS core. All instances of + * the policy on all NRS heads of all supported services are removed. + * + * N.B. This function should only be called from a module's exit() function. 
+ * Although it can be used for policies that ship alongside NRS core, the + * function is primarily intended for policies that register externally, + * from other modules. + * + * \param[in] conf configuration information for the policy to unregister + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_nrs_pol_desc *desc; + int rc; + ENTRY; + + LASSERT(conf != NULL); + + if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) { + CERROR("Unable to unregister a fallback policy, unless the " + "PTLRPC service is stopping.\n"); + RETURN(-EPERM); + } + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + mutex_lock(&nrs_core.nrs_mutex); + + desc = nrs_policy_find_desc_locked(conf->nc_name); + if (desc == NULL) { + CERROR("Failing to unregister NRS policy %s which has " + "not been registered with NRS core!\n", + conf->nc_name); + GOTO(not_exist, rc = -ENOENT); + } + + mutex_lock(&ptlrpc_all_services_mutex); + + rc = nrs_policy_unregister_locked(desc); + if (rc < 0) { + if (rc == -EBUSY) + CERROR("Please first stop policy %s on all service " + "partitions and then retry to unregister the " + "policy.\n", conf->nc_name); + GOTO(fail, rc); + } + + CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n", + conf->nc_name); + + list_del(&desc->pd_list); + OBD_FREE_PTR(desc); + +fail: + mutex_unlock(&ptlrpc_all_services_mutex); + +not_exist: + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister); + +/** + * Setup NRS heads on all service partitions of service \a svc, and register + * all compatible policies on those NRS heads. + * + * To be called from withing ptl + * \param[in] svc the service to setup + * + * \retval -ve error, the calling logic should eventually call + * ptlrpc_service_nrs_cleanup() to undo any work performed + * by this function. + * + * \see ptlrpc_register_service() + * \see ptlrpc_service_nrs_cleanup() + */ +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + int rc = 0; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Initialize NRS heads on all service CPTs. + */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + rc = nrs_svcpt_setup_locked(svcpt); + if (rc != 0) + GOTO(failed, rc); + } + + /** + * Set up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) + GOTO(failed, rc); + } + } + +failed: + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + +/** + * Unregisters all policies on all service partitions of service \a svc. + * + * \param[in] svc the PTLRPC service to unregister + */ +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Clean up NRS heads on all service partitions + */ + ptlrpc_service_for_each_part(svcpt, i, svc) + nrs_svcpt_cleanup_locked(svcpt); + + /** + * Clean up lprocfs interfaces for all supported policies for the + * service. 
+ */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. + * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. 
+ * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the request to be handled + * \retval NULL the head has no requests to serve + */ +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_request *nrq; + + /** + * Always try to drain requests from all NRS polices even if they are + * inactive, because the user can change policy status at runtime. + */ + list_for_each_entry(policy, &nrs->nrs_policy_queued, + pol_list_queued) { + nrq = nrs_request_get(policy, peek, force); + if (nrq != NULL) { + if (likely(!peek)) { + nrq->nr_started = 1; + + policy->pol_req_started++; + policy->pol_nrs->nrs_req_started++; + + nrs_request_removed(policy); + } + + return container_of(nrq, struct ptlrpc_request, rq_nrq); + } + } + + return NULL; +} + +/** + * Dequeues request \a req from the policy it has been enqueued on. + * + * \param[in] req the request + */ +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq); + + policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq); + + req->rq_nrq.nr_enqueued = 0; + + nrs_request_removed(policy); +} + +/** + * Returns whether there are any requests currently enqueued on any of the + * policies of service partition's \a svcpt NRS head specified by \a hp. Should + * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable + * result. + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return nrs->nrs_req_queued > 0; +}; + +/** + * Returns whether NRS policy is throttling reqeust + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt, + bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return !!nrs->nrs_throttling; +}; + +/** + * Moves request \a req from the regular to the high-priority NRS head. + * + * \param[in] req the request to move + */ +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + struct ptlrpc_nrs_resource *res1[NRS_RES_MAX]; + struct ptlrpc_nrs_resource *res2[NRS_RES_MAX]; + ENTRY; + + /** + * Obtain the high-priority NRS head resources. 
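+ * This is done before scp_req_lock is taken; the move below then only has
+ * to swap the resource pointer arrays while holding the lock.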
+ */ + nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true); + + spin_lock(&svcpt->scp_req_lock); + + if (!ptlrpc_nrs_req_can_move(req)) + goto out; + + ptlrpc_nrs_req_del_nolock(req); + + memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0])); + memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0])); + + ptlrpc_nrs_hpreq_add_nolock(req); + + memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0])); +out: + spin_unlock(&svcpt->scp_req_lock); + + /** + * Release either the regular NRS head resources if we moved the + * request, or the high-priority NRS head resources if we took a + * reference earlier in this function and ptlrpc_nrs_req_can_move() + * returned false. + */ + nrs_resource_put_safe(res1); + EXIT; +} + +/** + * Carries out a control operation \a opc on the policy identified by the + * human-readable \a name, on either all partitions, or only on the first + * partition of service \a svc. + * + * \param[in] svc the service the policy belongs to. + * \param[in] queue whether to carry out the command on the policy which + * belongs to the regular, high-priority, or both NRS + * heads of service partitions of \a svc. + * \param[in] name the policy to act upon, by human-readable name + * \param[in] opc the opcode of the operation to carry out + * \param[in] single when set, the operation will only be carried out on the + * NRS heads of the first service partition of \a svc. + * This is useful for some policies which e.g. share + * identical values on the same parameters of different + * service partitions; when reading these parameters via + * lprocfs, these policies may just want to obtain and + * print out the values from the first service partition. + * Storing these values centrally elsewhere then could be + * another solution for this. + * \param[in,out] arg can be used as a generic in/out buffer between control + * operations and the user environment. + * + *\retval -ve error condition + *\retval 0 operation was carried out successfully + */ +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg) +{ + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(opc != PTLRPC_NRS_CTL_INVALID); + + if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) + return -EINVAL; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, + opc, arg); + if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && + single)) + GOTO(out, rc); + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + /** + * XXX: We could optionally check for + * nrs_svc_has_hp(svc) here, and return an error if it + * is false. Right now we rely on the policies' lprocfs + * handlers that call the present function to make this + * check; if they fail to do so, they might hit the + * assertion inside nrs_svcpt2nrs() below. + */ + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, + opc, arg); + if (rc != 0 || single) + GOTO(out, rc); + } + } +out: + RETURN(rc); +} + +/** + * Adds all policies that ship with the ptlrpc module, to NRS core's list of + * policies \e nrs_core.nrs_policies. 
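+ *
+ * This covers the fifo policy, and additionally the crrn, orr, trr, tbf and
+ * delay policies when built with HAVE_SERVER_SUPPORT.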
+ * + * \retval 0 all policies have been registered successfully + * \retval -ve error + */ +int ptlrpc_nrs_init(void) +{ + int rc; + ENTRY; + + mutex_init(&nrs_core.nrs_mutex); + INIT_LIST_HEAD(&nrs_core.nrs_policies); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); + if (rc != 0) + GOTO(fail, rc); + +#ifdef HAVE_SERVER_SUPPORT + rc = ptlrpc_nrs_policy_register(&nrs_conf_crrn); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_orr); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_trr); + if (rc != 0) + GOTO(fail, rc); + rc = ptlrpc_nrs_policy_register(&nrs_conf_tbf); + if (rc != 0) + GOTO(fail, rc); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_delay); + if (rc != 0) + GOTO(fail, rc); +#endif /* HAVE_SERVER_SUPPORT */ + + RETURN(rc); +fail: + /** + * Since no PTLRPC services have been started at this point, all we need + * to do for cleanup is to free the descriptors. + */ + ptlrpc_nrs_fini(); + + RETURN(rc); +} + +/** + * Removes all policy descriptors from nrs_core::nrs_policies, and frees the + * policy descriptors. + * + * Since all PTLRPC services are stopped at this point, there are no more + * instances of any policies, because each service will have stopped its policy + * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the + * descriptors here. + */ +void ptlrpc_nrs_fini(void) +{ + struct ptlrpc_nrs_pol_desc *desc; + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, + pd_list) { + list_del_init(&desc->pd_list); + OBD_FREE_PTR(desc); + } +} + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c new file mode 100644 index 0000000000000..6a54fa0f775a5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -0,0 +1,883 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_crr.c + * + * Network Request Scheduler (NRS) CRR-N policy + * + * Request ordering in a batched Round-Robin manner over client NIDs + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ +#ifdef HAVE_SERVER_SUPPORT + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name CRR-N policy + * + * Client Round-Robin scheduling over client NIDs + * + * @{ + * + */ + +#define NRS_POL_NAME_CRRN "crrn" + +/** + * Binary heap predicate. 
+ * + * Uses ptlrpc_nrs_request::nr_u::crr::cr_round and + * ptlrpc_nrs_request::nr_u::crr::cr_sequence to compare two binheap nodes and + * produce a binary predicate that shows their relative priority, so that the + * binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 <= e2 + */ +static int +crrn_req_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + if (nrq1->nr_u.crr.cr_round < nrq2->nr_u.crr.cr_round) + return 1; + else if (nrq1->nr_u.crr.cr_round > nrq2->nr_u.crr.cr_round) + return 0; + + return nrq1->nr_u.crr.cr_sequence < nrq2->nr_u.crr.cr_sequence; +} + +static struct cfs_binheap_ops nrs_crrn_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = crrn_req_compare, +}; + +/** + * libcfs_hash operations for nrs_crrn_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_crrn_client objects. + */ +#define NRS_NID_BKT_BITS 8 +#define NRS_NID_BITS 16 + +static unsigned nrs_crrn_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static int nrs_crrn_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + lnet_nid_t *nid = (lnet_nid_t *)key; + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + return *nid == cli->cc_nid; +} + +static void *nrs_crrn_hop_key(struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + return &cli->cc_nid; +} + +static void *nrs_crrn_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_crrn_client, cc_hnode); +} + +static void nrs_crrn_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + atomic_inc(&cli->cc_ref); +} + +static void nrs_crrn_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + atomic_dec(&cli->cc_ref); +} + +static void nrs_crrn_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_crrn_client *cli = hlist_entry(hnode, + struct nrs_crrn_client, + cc_hnode); + LASSERTF(atomic_read(&cli->cc_ref) == 0, + "Busy CRR-N object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->cc_nid), atomic_read(&cli->cc_ref)); + + OBD_FREE_PTR(cli); +} + +static struct cfs_hash_ops nrs_crrn_hash_ops = { + .hs_hash = nrs_crrn_hop_hash, + .hs_keycmp = nrs_crrn_hop_keycmp, + .hs_key = nrs_crrn_hop_key, + .hs_object = nrs_crrn_hop_object, + .hs_get = nrs_crrn_hop_get, + .hs_put = nrs_crrn_hop_put, + .hs_put_locked = nrs_crrn_hop_put, + .hs_exit = nrs_crrn_hop_exit, +}; + +/** + * Called when a CRR-N policy instance is started. 
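+ *
+ * Allocates the private nrs_crrn_net instance together with its binary heap
+ * of queued requests and its per-NID hash of nrs_crrn_client objects.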
+ * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_crrn_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_crrn_net *net; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(net, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (net == NULL) + RETURN(-ENOMEM); + + net->cn_binheap = cfs_binheap_create(&nrs_crrn_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (net->cn_binheap == NULL) + GOTO(out_net, rc = -ENOMEM); + + net->cn_cli_hash = cfs_hash_create("nrs_crrn_nid_hash", + NRS_NID_BITS, NRS_NID_BITS, + NRS_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_crrn_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (net->cn_cli_hash == NULL) + GOTO(out_binheap, rc = -ENOMEM); + + /** + * Set default quantum value to max_rpcs_in_flight for non-MDS OSCs; + * there may be more RPCs pending from each struct nrs_crrn_client even + * with the default max_rpcs_in_flight value, as we are scheduling over + * NIDs, and there may be more than one mount point per client. + */ + net->cn_quantum = OBD_MAX_RIF_DEFAULT; + /** + * Set to 1 so that the test inside nrs_crrn_req_add() can evaluate to + * true. + */ + net->cn_sequence = 1; + + policy->pol_private = net; + + RETURN(rc); + +out_binheap: + cfs_binheap_destroy(net->cn_binheap); +out_net: + OBD_FREE_PTR(net); + + RETURN(rc); +} + +/** + * Called when a CRR-N policy instance is stopped. + * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more pending + * requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_crrn_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_crrn_net *net = policy->pol_private; + ENTRY; + + LASSERT(net != NULL); + LASSERT(net->cn_binheap != NULL); + LASSERT(net->cn_cli_hash != NULL); + LASSERT(cfs_binheap_is_empty(net->cn_binheap)); + + cfs_binheap_destroy(net->cn_binheap); + cfs_hash_putref(net->cn_cli_hash); + + OBD_FREE_PTR(net); +} + +/** + * Performs a policy-specific ctl function on CRR-N policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_crrn_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, + void *arg) +{ + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch((enum nrs_ctl_crr)opc) { + default: + RETURN(-EINVAL); + + /** + * Read Round Robin quantum size of a policy instance. + */ + case NRS_CTL_CRRN_RD_QUANTUM: { + struct nrs_crrn_net *net = policy->pol_private; + + *(__u16 *)arg = net->cn_quantum; + } + break; + + /** + * Write Round Robin quantum size of a policy instance. + */ + case NRS_CTL_CRRN_WR_QUANTUM: { + struct nrs_crrn_net *net = policy->pol_private; + + net->cn_quantum = *(__u16 *)arg; + LASSERT(net->cn_quantum != 0); + } + break; + } + + RETURN(0); +} + +/** + * Obtains resources from CRR-N policy instances. The top-level resource lives + * inside \e nrs_crrn_net and the second-level resource inside + * \e nrs_crrn_client object instances. 
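+ *
+ * For example, the first request that arrives from a given client NID causes
+ * an nrs_crrn_client object to be allocated and inserted into the instance's
+ * NID hash table; subsequent requests from the same NID simply take a
+ * reference on that existing object.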
+ * + * \param[in] policy the policy for which resources are being taken for + * request \a nrq + * \param[in] nrq the request for which resources are being taken + * \param[in] parent parent resource, embedded in nrs_crrn_net for the + * CRR-N policy + * \param[out] resp resources references are placed in this array + * \param[in] moving_req signifies limited caller context; used to perform + * memory allocations in an atomic context in this + * policy + * + * \retval 0 we are returning a top-level, parent resource, one that is + * embedded in an nrs_crrn_net object + * \retval 1 we are returning a bottom-level resource, one that is embedded + * in an nrs_crrn_client object + * + * \see nrs_resource_get_safe() + */ +static int nrs_crrn_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + struct nrs_crrn_client *tmp; + struct ptlrpc_request *req; + + if (parent == NULL) { + *resp = &((struct nrs_crrn_net *)policy->pol_private)->cn_res; + return 0; + } + + net = container_of(parent, struct nrs_crrn_net, cn_res); + req = container_of(nrq, struct ptlrpc_request, rq_nrq); + + cli = cfs_hash_lookup(net->cn_cli_hash, &req->rq_peer.nid); + if (cli != NULL) + goto out; + + OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy), + sizeof(*cli), moving_req ? GFP_ATOMIC : GFP_NOFS); + if (cli == NULL) + return -ENOMEM; + + cli->cc_nid = req->rq_peer.nid; + + atomic_set(&cli->cc_ref, 1); + tmp = cfs_hash_findadd_unique(net->cn_cli_hash, &cli->cc_nid, + &cli->cc_hnode); + if (tmp != cli) { + OBD_FREE_PTR(cli); + cli = tmp; + } +out: + *resp = &cli->cc_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using the CRR-N policy. + * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_crrn_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + + /** + * Do nothing for freeing parent, nrs_crrn_net resources + */ + if (res->res_parent == NULL) + return; + + cli = container_of(res, struct nrs_crrn_client, cc_res); + net = container_of(res->res_parent, struct nrs_crrn_net, cn_res); + + cfs_hash_put(net->cn_cli_hash, &cli->cc_hnode); +} + +/** + * Called when getting a request from the CRR-N policy for handlingso that it can be served + * + * \param[in] policy the policy being polled + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force force the policy to return a request; unused in this policy + * + * \retval the request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_crrn_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_crrn_net *net = policy->pol_private; + struct cfs_binheap_node *node = cfs_binheap_root(net->cn_binheap); + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(node == NULL) ? 
NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(!peek && nrq != NULL)) { + struct nrs_crrn_client *cli; + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + cfs_binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request from %s, with round " + "%llu\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); + + /** Peek at the next request to be served */ + node = cfs_binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + struct ptlrpc_nrs_request *next; + + next = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < next->nr_u.crr.cr_round) + net->cn_round = next->nr_u.crr.cr_round; + } + } + + return nrq; +} + +/** + * Adds request \a nrq to a CRR-N \a policy instance's set of queued requests + * + * A scheduling round is a stream of requests that have been sorted in batches + * according to the client that they originate from (as identified by its NID); + * there can be only one batch for each client in each round. The batches are of + * maximum size nrs_crrn_net:cn_quantum. When a new request arrives for + * scheduling from a client that has exhausted its quantum in its current round, + * it will start scheduling requests on the next scheduling round. Clients are + * allowed to schedule requests against a round until all requests for the round + * are serviced, so a client might miss a round if it is not generating requests + * for a long enough period of time. Clients that miss a round will continue + * with scheduling the next request that they generate, starting at the round + * that requests are being dispatched for, at the time of arrival of this new + * request. + * + * Requests are tagged with the round number and a sequence number; the sequence + * number indicates the relative ordering amongst the batches of requests in a + * round, and is identical for all requests in a batch, as is the round number. + * The round and sequence numbers are used by crrn_req_compare() in order to + * maintain an ordered set of rounds, with each round consisting of an ordered + * set of batches of requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_crrn_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + int rc; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + if (cli->cc_quantum == 0 || cli->cc_round < net->cn_round || + (cli->cc_active == 0 && cli->cc_quantum > 0)) { + + /** + * If the client has no pending requests, and still some of its + * quantum remaining unused, which implies it has not had a + * chance to schedule up to its maximum allowed batch size of + * requests in the previous round it participated, schedule this + * next request on a new round; this avoids fragmentation of + * request batches caused by client inactivity, at the expense + * of potentially slightly increased service time for the + * request batch this request will be a part of. 
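+ *
+ * For example, a client that queued only part of its allowed batch and then
+ * went idle will have its next burst of requests start on a fresh round,
+ * rather than completing the stale, partially filled batch.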
+ */ + if (cli->cc_active == 0 && cli->cc_quantum > 0) + cli->cc_round++; + + /** A new scheduling round has commenced */ + if (cli->cc_round < net->cn_round) + cli->cc_round = net->cn_round; + + /** I was not the last client through here */ + if (cli->cc_sequence < net->cn_sequence) + cli->cc_sequence = ++net->cn_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + cli->cc_quantum = net->cn_quantum; + } + + nrq->nr_u.crr.cr_round = cli->cc_round; + nrq->nr_u.crr.cr_sequence = cli->cc_sequence; + + rc = cfs_binheap_insert(net->cn_binheap, &nrq->nr_node); + if (rc == 0) { + cli->cc_active++; + if (--cli->cc_quantum == 0) + cli->cc_round++; + } + return rc; +} + +/** + * Removes request \a nrq from a CRR-N \a policy instance's set of queued + * requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_crrn_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_crrn_net *net; + struct nrs_crrn_client *cli; + bool is_root; + + cli = container_of(nrs_request_resource(nrq), + struct nrs_crrn_client, cc_res); + net = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_crrn_net, cn_res); + + LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round); + + is_root = &nrq->nr_node == cfs_binheap_root(net->cn_binheap); + + cfs_binheap_remove(net->cn_binheap, &nrq->nr_node); + cli->cc_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct cfs_binheap_node *node = cfs_binheap_root(net->cn_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + net->cn_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (net->cn_round < nrq->nr_u.crr.cr_round) + net->cn_round = nrq->nr_u.crr.cr_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by CRR-N policy + * instance \a policy. + * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from %s, with round %llu" + "\n", NRS_POL_NAME_CRRN, + libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); +} + +#ifdef CONFIG_PROC_FS + +/** + * lprocfs interface + */ + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for CRR-N policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_crrn_ctl(). + * + * Quantum values are in # of RPCs, and output is in YAML format. 
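+ *
+ * The current values can be read with a command of the form:
+ *
+ * lctl get_param *.*.*.nrs_crrn_quantum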
+ * + * For example: + * + * reg_quantum:8 + * hp_quantum:4 + */ +static int +ptlrpc_lprocfs_nrs_crrn_quantum_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG + "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. the maximum batch size) + * for CRR-N policy instances of a service. The user can set the quantum size + * for the regular or high priority NRS head individually by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_crrn_quantum=reg_quantum:32, to set the regular + * request quantum size on all PTLRPC services to 32 + * + * lctl set_param *.*.*.nrs_crrn_quantum=hp_quantum:16, to set the high + * priority request quantum size on all PTLRPC services to 16, and + * + * lctl set_param *.*.ost_io.nrs_crrn_quantum=16, to set both the regular and + * high priority request quantum sizes of the ost_io service to 16. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_crrn_ctl(). 
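+ *
+ * A single write that names both tokens, e.g.
+ * "reg_quantum:32 hp_quantum:16", should likewise set the two values
+ * independently in one invocation.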
+ */ +static ssize_t +ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + quantum_reg = simple_strtol(val, NULL, 10); + + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + quantum_hp = simple_strtol(val, NULL, 10); + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + if (!isdigit(kernbuf[0])) + return -EINVAL; + + quantum_reg = simple_strtol(kernbuf, NULL, 10); + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return -EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_CRRN, + NRS_CTL_CRRN_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); + +/** + * Initializes a CRR-N policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) +{ + struct lprocfs_vars nrs_crrn_lprocfs_vars[] = { + { .name = "nrs_crrn_quantum", + .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, + .data = svc }, + { NULL } + }; + + if (svc->srv_procroot == NULL) + return 0; + + return lprocfs_add_vars(svc->srv_procroot, nrs_crrn_lprocfs_vars, NULL); +} + +/** + * Cleans up a CRR-N policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + */ +static void nrs_crrn_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_crrn_quantum", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + +/** + * CRR-N policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { + .op_policy_start = nrs_crrn_start, + .op_policy_stop = nrs_crrn_stop, + .op_policy_ctl = nrs_crrn_ctl, + .op_res_get = nrs_crrn_res_get, + .op_res_put = nrs_crrn_res_put, + .op_req_get = nrs_crrn_req_get, + .op_req_enqueue = nrs_crrn_req_add, + .op_req_dequeue = nrs_crrn_req_del, + .op_req_stop = nrs_crrn_req_stop, +#ifdef CONFIG_PROC_FS + .op_lprocfs_init = nrs_crrn_lprocfs_init, + .op_lprocfs_fini = nrs_crrn_lprocfs_fini, +#endif +}; + +/** + * CRR-N policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_crrn = { + .nc_name = NRS_POL_NAME_CRRN, + .nc_ops = &nrs_crrn_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} CRR-N policy */ + +/** @} nrs */ + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c new file mode 100644 index 0000000000000..77da1c1bfacf0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -0,0 +1,842 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Cray Inc. All Rights Reserved. + * + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/ptlrpc/nrs_delay.c + * + * Network Request Scheduler (NRS) Delay policy + * + * This policy will delay request handling for some configurable amount of + * time. + * + * Author: Chris Horn + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name delay + * + * The delay policy schedules RPCs so that they are only processed after some + * configurable amount of time (in seconds) has passed. + * + * The defaults were chosen arbitrarily. + * + * @{ + */ + +#define NRS_POL_NAME_DELAY "delay" + +/* Default minimum delay in seconds. 
*/ +#define NRS_DELAY_MIN_DEFAULT 5 +/* Default maximum delay, in seconds. */ +#define NRS_DELAY_MAX_DEFAULT 300 +/* Default percentage of delayed RPCs. */ +#define NRS_DELAY_PCT_DEFAULT 100 + +/** + * Binary heap predicate. + * + * Elements are sorted according to the start time assigned to the requests + * upon enqueue. An element with an earlier start time is "less than" an + * element with a later start time. + * + * \retval 0 start_time(e1) > start_time(e2) + * \retval 1 start_time(e1) <= start_time(e2) + */ +static int delay_req_compare(struct cfs_binheap_node *e1, + struct cfs_binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + return nrq1->nr_u.delay.req_start_time <= + nrq2->nr_u.delay.req_start_time; +} + +static struct cfs_binheap_ops nrs_delay_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = delay_req_compare, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes + * the delay-specific private data structure. + * + * \param[in] policy The policy to start + * \param[in] Generic char buffer; unused in this policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_delay_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_delay_data *delay_data; + + ENTRY; + + OBD_CPT_ALLOC_PTR(delay_data, nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (delay_data == NULL) + RETURN(-ENOMEM); + + delay_data->delay_binheap = cfs_binheap_create(&nrs_delay_heap_ops, + CBH_FLAG_ATOMIC_GROW, + 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + + if (delay_data->delay_binheap == NULL) { + OBD_FREE_PTR(delay_data); + RETURN(-ENOMEM); + } + + delay_data->min_delay = NRS_DELAY_MIN_DEFAULT; + delay_data->max_delay = NRS_DELAY_MAX_DEFAULT; + delay_data->delay_pct = NRS_DELAY_PCT_DEFAULT; + + policy->pol_private = delay_data; + + RETURN(0); +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the delay-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_delay_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + LASSERT(delay_data != NULL); + LASSERT(delay_data->delay_binheap != NULL); + LASSERT(cfs_binheap_is_empty(delay_data->delay_binheap)); + + cfs_binheap_destroy(delay_data->delay_binheap); + + OBD_FREE_PTR(delay_data); +} + +/** + * Is called for obtaining a delay policy resource. 
+ * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The delay policy only has a one-level resource hierarchy + * + * \see nrs_resource_get_safe() + */ +static int nrs_delay_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_delay_data, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_delay_data *)policy->pol_private)->delay_res; + return 1; +} + +/** + * Called when getting a request from the delay policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * Requests are only removed from this policy when their start time has + * passed. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request + * + * \retval The request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_delay_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + struct cfs_binheap_node *node; + struct ptlrpc_nrs_request *nrq; + + node = cfs_binheap_root(delay_data->delay_binheap); + nrq = unlikely(node == NULL) ? NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(nrq != NULL)) { + if (!force && + ktime_get_real_seconds() < nrq->nr_u.delay.req_start_time) + nrq = NULL; + else if (likely(!peek)) + cfs_binheap_remove(delay_data->delay_binheap, + &nrq->nr_node); + } + + return nrq; +} + +/** + * Adds request \a nrq to a delay \a policy instance's set of queued requests + * + * A percentage (delay_pct) of incoming requests are delayed by this policy. + * If selected for delay a request start time is calculated. A start time + * is the current time plus a random offset in the range [min_delay, max_delay] + * The start time is recorded in the request, and is then used by + * delay_req_compare() to maintain a set of requests ordered by their start + * times. + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 request added + * \retval 1 request not added + * + */ +static int nrs_delay_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + if (delay_data->delay_pct == 0 || /* Not delaying anything */ + (delay_data->delay_pct != 100 && + delay_data->delay_pct < cfs_rand() % 100)) + return 1; + + nrq->nr_u.delay.req_start_time = ktime_get_real_seconds() + cfs_rand() % + (delay_data->max_delay - + delay_data->min_delay + 1) + + delay_data->min_delay; + + return cfs_binheap_insert(delay_data->delay_binheap, &nrq->nr_node); +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. 
+ * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_delay_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + + cfs_binheap_remove(delay_data->delay_binheap, &nrq->nr_node); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_delay_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + DEBUG_REQ(D_RPCTRACE, req, + "NRS: finished delayed request from %s after %llds", + libcfs_id2str(req->rq_peer), + (s64)(nrq->nr_u.delay.req_start_time - + req->rq_srv.sr_arrival_time.tv_sec)); +} + +/** + * Performs ctl functions specific to delay policy instances; similar to ioctl + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct nrs_delay_data *delay_data = policy->pol_private; + __u32 *val = (__u32 *)arg; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch ((enum nrs_ctl_delay)opc) { + default: + RETURN(-EINVAL); + + case NRS_CTL_DELAY_RD_MIN: + *val = delay_data->min_delay; + break; + + case NRS_CTL_DELAY_WR_MIN: + if (*val > delay_data->max_delay) + RETURN(-EINVAL); + + delay_data->min_delay = *val; + break; + + case NRS_CTL_DELAY_RD_MAX: + *val = delay_data->max_delay; + break; + + case NRS_CTL_DELAY_WR_MAX: + if (*val < delay_data->min_delay) + RETURN(-EINVAL); + + delay_data->max_delay = *val; + break; + + case NRS_CTL_DELAY_RD_PCT: + *val = delay_data->delay_pct; + break; + + case NRS_CTL_DELAY_WR_PCT: + if (*val < 0 || *val > 100) + RETURN(-EINVAL); + + delay_data->delay_pct = *val; + break; + } + RETURN(0); +} + +/** + * lprocfs interface + */ + +#ifdef CONFIG_PROC_FS + +/* nrs_delay_min and nrs_delay_max are bounded by these values */ +#define LPROCFS_NRS_DELAY_LOWER_BOUND 0 +#define LPROCFS_NRS_DELAY_UPPER_BOUND 65535 + +#define LPROCFS_NRS_DELAY_MIN_NAME "delay_min:" +#define LPROCFS_NRS_DELAY_MIN_NAME_REG "reg_delay_min:" +#define LPROCFS_NRS_DELAY_MIN_NAME_HP "hp_delay_min:" + +/** + * Max size of the nrs_delay_min seq_write buffer. Needs to be large enough + * to hold the string: "reg_min_delay:65535 hp_min_delay:65535" + */ +#define LPROCFS_NRS_DELAY_MIN_SIZE \ + sizeof(LPROCFS_NRS_DELAY_MIN_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND) \ + " " LPROCFS_NRS_DELAY_MIN_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)) + +#define LPROCFS_NRS_DELAY_MAX_NAME "delay_max:" +#define LPROCFS_NRS_DELAY_MAX_NAME_REG "reg_delay_max:" +#define LPROCFS_NRS_DELAY_MAX_NAME_HP "hp_delay_max:" + +/** + * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_max + * variable. 
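+ * It needs to be large enough to hold the string:
+ * "reg_delay_max:65535 hp_delay_max:65535".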
+ */ +#define LPROCFS_NRS_DELAY_MAX_SIZE \ + sizeof(LPROCFS_NRS_DELAY_MAX_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND) \ + " " LPROCFS_NRS_DELAY_MAX_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)) + +#define LPROCFS_NRS_DELAY_PCT_MIN_VAL 0 +#define LPROCFS_NRS_DELAY_PCT_MAX_VAL 100 +#define LPROCFS_NRS_DELAY_PCT_NAME "delay_pct:" +#define LPROCFS_NRS_DELAY_PCT_NAME_REG "reg_delay_pct:" +#define LPROCFS_NRS_DELAY_PCT_NAME_HP "hp_delay_pct:" + +/** + * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_pct + * variable. + */ +#define LPROCFS_NRS_DELAY_PCT_SIZE \ + sizeof(LPROCFS_NRS_DELAY_PCT_NAME_REG \ + __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL) \ + " " LPROCFS_NRS_DELAY_PCT_NAME_HP \ + __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL)) + +/** + * Helper for delay's seq_write functions. + */ +static ssize_t +lprocfs_nrs_delay_seq_write_common(const char __user *buffer, + unsigned int bufsize, size_t count, + const char *var_name, unsigned int min_val, + unsigned int max_val, + struct ptlrpc_service *svc, char *pol_name, + enum ptlrpc_nrs_ctl opc, bool single) +{ + enum ptlrpc_nrs_queue_type queue = 0; + char *kernbuf; + char *val_str; + long unsigned int val_reg; + long unsigned int val_hp; + size_t count_copy; + int rc = 0; + char *tmp = NULL; + int tmpsize = 0; + + if (count > bufsize - 1) + return -EINVAL; + + OBD_ALLOC(kernbuf, bufsize); + if (kernbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(free_kernbuf, rc = -EFAULT); + + tmpsize = strlen("reg_") + strlen(var_name) + 1; + OBD_ALLOC(tmp, tmpsize); + if (tmp == NULL) + GOTO(free_tmp, rc = -ENOMEM); + + /* look for "reg_" in kernbuf */ + snprintf(tmp, tmpsize, "reg_%s", var_name); + count_copy = count; + val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy); + if (val_str != kernbuf) { + rc = kstrtoul(val_str, 10, &val_reg); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + queue |= PTLRPC_NRS_QUEUE_REG; + } + + /* look for "hp_" in kernbuf */ + snprintf(tmp, tmpsize, "hp_%s", var_name); + count_copy = count; + val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy); + if (val_str != kernbuf) { + if (!nrs_svc_has_hp(svc)) + GOTO(free_tmp, rc = -ENODEV); + + rc = kstrtoul(val_str, 10, &val_hp); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if (queue == 0) { + if (!isdigit(kernbuf[0])) + GOTO(free_tmp, rc = -EINVAL); + + rc = kstrtoul(kernbuf, 10, &val_reg); + if (rc != 0) + GOTO(free_tmp, rc = -EINVAL); + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + val_hp = val_reg; + } + } + + if (queue & PTLRPC_NRS_QUEUE_REG) { + if (val_reg > max_val || val_reg < min_val) + GOTO(free_tmp, rc = -EINVAL); + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + pol_name, opc, single, &val_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + GOTO(free_tmp, rc); + } + + if (queue & PTLRPC_NRS_QUEUE_HP) { + int rc2 = 0; + if (val_hp > max_val || val_hp < min_val) + GOTO(free_tmp, rc = -EINVAL); + + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + pol_name, opc, single, &val_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + GOTO(free_tmp, rc = rc2); + } + + /* If we've reached here then we want to return count */ + rc = count; + +free_tmp: + OBD_FREE(tmp, tmpsize); +free_kernbuf: + OBD_FREE(kernbuf, bufsize); + + return rc; +} + +/** + * Retrieves the value of the minimum delay 
for delay policy instances on both + * the regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_min_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int min_delay; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MIN, + true, &min_delay); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_REG"%-5d\n", + min_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MIN, + true, &min_delay); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_HP"%-5d\n", + min_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the minimum request delay for delay policy instances of a + * service. The user can set the minimum request delay for the regular or high + * priority NRS head individually by specifying each value, or both together in + * a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_delay_min=reg_delay_min:5, to set the regular + * request minimum delay on all PtlRPC services to 5 seconds + * + * lctl set_param *.*.*.nrs_delay_min=hp_delay_min:2, to set the high-priority + * request minimum delay on all PtlRPC services to 2 seconds, and + * + * lctl set_param *.*.ost_io.nrs_delay_min=8, to set both the regular and + * high priority request minimum delay of the ost_io service to 8 seconds. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_MIN_SIZE, + count, + LPROCFS_NRS_DELAY_MIN_NAME, + LPROCFS_NRS_DELAY_LOWER_BOUND, + LPROCFS_NRS_DELAY_UPPER_BOUND, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_MIN, false); +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); + +/** + * Retrieves the value of the maximum delay for delay policy instances on both + * the regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_max_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int max_delay; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MAX, + true, &max_delay); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_REG"%-5d\n", + max_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. 
+ */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_MAX, + true, &max_delay); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_HP"%-5d\n", + max_delay); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the maximum request delay for delay policy instances of a + * service. The user can set the maximum request delay for the regular or high + * priority NRS head individually by specifying each value, or both together in + * a single invocation. + * + * For example: + * + * lctl set_param *.*.*.nrs_delay_max=reg_delay_max:20, to set the regular + * request maximum delay on all PtlRPC services to 20 seconds + * + * lctl set_param *.*.*.nrs_delay_max=hp_delay_max:10, to set the high-priority + * request maximum delay on all PtlRPC services to 10 seconds, and + * + * lctl set_param *.*.ost_io.nrs_delay_max=35, to set both the regular and + * high priority request maximum delay of the ost_io service to 35 seconds. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_MAX_SIZE, + count, + LPROCFS_NRS_DELAY_MAX_NAME, + LPROCFS_NRS_DELAY_LOWER_BOUND, + LPROCFS_NRS_DELAY_UPPER_BOUND, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_MAX, false); +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); + +/** + * Retrieves the value of the percentage of requests which should be delayed + * for delay policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + */ +static int +ptlrpc_lprocfs_nrs_delay_pct_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + unsigned int delay_pct; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_PCT, + true, &delay_pct); + + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_REG"%-3d\n", + delay_pct); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc != -ENODEV) + return rc; + + if (!nrs_svc_has_hp(svc)) + return 0; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_RD_PCT, + true, &delay_pct); + if (rc == 0) + seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_HP"%-3d\n", + delay_pct); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in + * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + else if (rc == -ENODEV) + rc = 0; + + return rc; +} + +/** + * Sets the value of the percentage of requests to be delayed for delay policy + * instances of a service. The user can set the percentage for the regular or + * high-priority NRS head individually by specifying each value, or both + * together in a single invocation. 
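+ *
+ * RPCs that are not selected for delay (i.e. the remaining 100 - delay_pct
+ * percent) are not queued by this policy at all, and are instead handled by
+ * the fallback FIFO policy without any added delay.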
+ * + * For example: + * + * lctl set_param *.*.*.nrs_delay_pct=reg_delay_pct:5, to delay 5 percent of + * regular requests on all PtlRPC services + * + * lctl set_param *.*.*.nrs_delay_pct=hp_delay_pct:2, to delay 2 percent of + * high-priority requests on all PtlRPC services, and + * + * lctl set_param *.*.ost_io.nrs_delay_pct=8, to delay 8 percent of both + * regular and high-priority requests of the ost_io service. + */ +static ssize_t +ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + + return lprocfs_nrs_delay_seq_write_common(buffer, + LPROCFS_NRS_DELAY_PCT_SIZE, + count, + LPROCFS_NRS_DELAY_PCT_NAME, + LPROCFS_NRS_DELAY_PCT_MIN_VAL, + LPROCFS_NRS_DELAY_PCT_MAX_VAL, + svc, NRS_POL_NAME_DELAY, + NRS_CTL_DELAY_WR_PCT, false); +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); + +static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) +{ + struct lprocfs_vars nrs_delay_lprocfs_vars[] = { + { .name = "nrs_delay_min", + .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, + .data = svc }, + { .name = "nrs_delay_max", + .fops = &ptlrpc_lprocfs_nrs_delay_max_fops, + .data = svc }, + { .name = "nrs_delay_pct", + .fops = &ptlrpc_lprocfs_nrs_delay_pct_fops, + .data = svc }, + { NULL } + }; + + if (svc->srv_procroot == NULL) + return 0; + + return lprocfs_add_vars(svc->srv_procroot, nrs_delay_lprocfs_vars, + NULL); +} + +static void nrs_delay_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_delay_min", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_delay_max", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_delay_pct", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + +/** + * Delay policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { + .op_policy_start = nrs_delay_start, + .op_policy_stop = nrs_delay_stop, + .op_policy_ctl = nrs_delay_ctl, + .op_res_get = nrs_delay_res_get, + .op_req_get = nrs_delay_req_get, + .op_req_enqueue = nrs_delay_req_add, + .op_req_dequeue = nrs_delay_req_del, + .op_req_stop = nrs_delay_req_stop, +#ifdef CONFIG_PROC_FS + .op_lprocfs_init = nrs_delay_lprocfs_init, + .op_lprocfs_fini = nrs_delay_lprocfs_fini, +#endif +}; + +/** + * Delay policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_delay = { + .nc_name = NRS_POL_NAME_DELAY, + .nc_ops = &nrs_delay_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} delay */ + +/** @} nrs */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 0000000000000..369b59978b47f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,271 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
+ + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_fifo.c + * + * Network Request Scheduler (NRS) FIFO policy + * + * Handles RPCs in a FIFO manner, as received from the network. This policy is + * a logical wrapper around previous, non-NRS functionality. It is used as the + * default and fallback policy for all types of RPCs on all PTLRPC service + * partitions, for both regular and high-priority NRS heads. Default here means + * the policy is the one enabled at PTLRPC service partition startup time, and + * fallback means the policy is used to handle RPCs that are not handled + * successfully or are not handled at all by any primary policy that may be + * enabled on a given NRS head. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name fifo + * + * The FIFO policy is a logical wrapper around previous, non-NRS functionality. + * It schedules RPCs in the same order as they are queued from LNet. + * + * @{ + */ + +#define NRS_POL_NAME_FIFO "fifo" + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. + * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_fifo_head *head; + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&head->fh_list); + policy->pol_private = head; + return 0; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head = policy->pol_private; + + LASSERT(head != NULL); + LASSERT(list_empty(&head->fh_list)); + + OBD_FREE_PTR(head); +} + +/** + * Is called for obtaining a FIFO policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The FIFO policy only has a one-level resource hierarchy, as since + * it implements a simple scheduling algorithm in which request + * priority is determined on the request arrival order, it does not + * need to maintain a set of resources that would otherwise be used + * to calculate a request's priority. 
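+ *
+ * All requests therefore share the single nrs_fifo_head resource that is
+ * embedded in the policy instance's private data.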
+ * + * \see nrs_resource_get_safe() + */ +static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_fifo_head, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(list_empty(&head->fh_list)) ? NULL : + list_entry(head->fh_list.next, struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu" + "\n", policy->pol_desc->pd_name, + libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. 
+ * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); +} + +/** + * FIFO policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { + .op_policy_start = nrs_fifo_start, + .op_policy_stop = nrs_fifo_stop, + .op_res_get = nrs_fifo_res_get, + .op_req_get = nrs_fifo_req_get, + .op_req_enqueue = nrs_fifo_req_add, + .op_req_dequeue = nrs_fifo_req_del, + .op_req_stop = nrs_fifo_req_stop, +}; + +/** + * FIFO policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { + .nc_name = NRS_POL_NAME_FIFO, + .nc_ops = &nrs_fifo_ops, + .nc_compat = nrs_policy_compat_all, + .nc_flags = PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START +}; + +/** @} fifo */ + +/** @} nrs */ + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c new file mode 100644 index 0000000000000..857b333a50b01 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -0,0 +1,2000 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_orr.c + * + * Network Request Scheduler (NRS) ORR and TRR policies + * + * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs + * respectively + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +#ifdef HAVE_SERVER_SUPPORT + +/** + * \addtogoup nrs + * @{ + */ +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name ORR/TRR policy + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * + * ORR performs batched Round Robin shceduling of brw RPCs, based on the FID of + * the backend-fs object that the brw RPC pertains to; the TRR policy performs + * batched Round Robin scheduling of brw RPCs, based on the OST index that the + * RPC pertains to. Both policies also order RPCs in each batch in ascending + * offset order, which is lprocfs-tunable between logical file offsets, and + * physical disk offsets, as reported by fiemap. + * + * The TRR policy reuses much of the functionality of ORR. 
These two scheduling + * algorithms could alternatively be implemented under a single NRS policy, that + * uses an lprocfs tunable in order to switch between the two types of + * scheduling behaviour. The two algorithms have been implemented as separate + * policies for reasons of clarity to the user, and to avoid issues that would + * otherwise arise at the point of switching between behaviours in the case of + * having a single policy, such as resource cleanup for nrs_orr_object + * instances. It is possible that this may need to be re-examined in the future, + * along with potentially coalescing other policies that perform batched request + * scheduling in a Round-Robin manner, all into one policy. + * + * @{ + */ + +#define NRS_POL_NAME_ORR "orr" +#define NRS_POL_NAME_TRR "trr" + +/** + * Checks if the RPC type of \a nrq is currently handled by an ORR/TRR policy + * + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] nrq the request + * \param[out] opcode the opcode is saved here, just in order to avoid calling + * lustre_msg_get_opc() again later + * + * \retval true request type is supported by the policy instance + * \retval false request type is not supported by the policy instance + */ +static bool nrs_orr_req_supported(struct nrs_orr_data *orrd, + struct ptlrpc_nrs_request *nrq, __u32 *opcode) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + bool rc = false; + + /** + * XXX: nrs_orr_data::od_supp accessed unlocked. + */ + switch (opc) { + case OST_READ: + rc = orrd->od_supp & NOS_OST_READ; + break; + case OST_WRITE: + rc = orrd->od_supp & NOS_OST_WRITE; + break; + } + + if (rc) + *opcode = opc; + + return rc; +} + +/** + * Returns the ORR/TRR key fields for the request \a nrq in \a key. + * + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] nrq the request + * \param[in] opc the request's opcode + * \param[in] name the policy name + * \param[out] key fields of the key are returned here. + * + * \retval 0 key filled successfully + * \retval < 0 error + */ +static int nrs_orr_key_fill(struct nrs_orr_data *orrd, + struct ptlrpc_nrs_request *nrq, __u32 opc, + char *name, struct nrs_orr_key *key) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + struct ost_body *body; + __u32 ost_idx; + bool is_orr = strncmp(name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0; + + LASSERT(req != NULL); + + /** + * This is an attempt to fill in the request key fields while + * moving a request from the regular to the high-priority NRS + * head (via ldlm_lock_reorder_req()), but the request key has + * been adequately filled when nrs_orr_res_get() was called through + * ptlrpc_nrs_req_initialize() for the regular NRS head's ORR/TRR + * policy, so there is nothing to do. + */ + if ((is_orr && nrq->nr_u.orr.or_orr_set) || + (!is_orr && nrq->nr_u.orr.or_trr_set)) { + *key = nrq->nr_u.orr.or_key; + return 0; + } + + /* Bounce unconnected requests to the default policy. 
*/ + if (req->rq_export == NULL) + return -ENOTCONN; + + if (nrq->nr_u.orr.or_orr_set || nrq->nr_u.orr.or_trr_set) + memset(&nrq->nr_u.orr.or_key, 0, sizeof(nrq->nr_u.orr.or_key)); + + ost_idx = class_server_data(req->rq_export->exp_obd)->lsd_osd_index; + + if (is_orr) { + int rc; + /** + * The request pill for OST_READ and OST_WRITE requests is + * initialized in the ost_io service's + * ptlrpc_service_ops::so_hpreq_handler, ost_io_hpreq_handler(), + * so no need to redo it here. + */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + RETURN(-EFAULT); + + rc = ostid_to_fid(&key->ok_fid, &body->oa.o_oi, ost_idx); + if (rc < 0) + return rc; + + nrq->nr_u.orr.or_orr_set = 1; + } else { + key->ok_idx = ost_idx; + nrq->nr_u.orr.or_trr_set = 1; + } + + return 0; +} + +/** + * Populates the range values in \a range with logical offsets obtained via + * \a nb. + * + * \param[in] nb niobuf_remote struct array for this request + * \param[in] niocount count of niobuf_remote structs for this request + * \param[out] range the offset range is returned here + */ +static void nrs_orr_range_fill_logical(struct niobuf_remote *nb, int niocount, + struct nrs_orr_req_range *range) +{ + /* Should we do this at page boundaries ? */ + range->or_start = nb[0].rnb_offset & PAGE_MASK; + range->or_end = (nb[niocount - 1].rnb_offset + + nb[niocount - 1].rnb_len - 1) | ~PAGE_MASK; +} + +/** + * We obtain information just for a single extent, as the request can only be in + * a single place in the binary heap anyway. + */ +#define ORR_NUM_EXTENTS 1 + +/** + * Converts the logical file offset range in \a range, to a physical disk offset + * range in \a range, for a request. Uses obd_get_info() in order to carry out a + * fiemap call and obtain backend-fs extent information. The returned range is + * in physical block numbers. + * + * \param[in] nrq the request + * \param[in] oa obdo struct for this request + * \param[in,out] range the offset range in bytes; logical range in, physical + * range out + * + * \retval 0 physical offsets obtained successfully + * \retvall < 0 error + */ +static int nrs_orr_range_fill_physical(struct ptlrpc_nrs_request *nrq, + struct obdo *oa, + struct nrs_orr_req_range *range) +{ + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + char fiemap_buf[offsetof(struct fiemap, + fm_extents[ORR_NUM_EXTENTS])]; + struct fiemap *fiemap = (struct fiemap *)fiemap_buf; + struct ll_fiemap_info_key key; + loff_t start; + loff_t end; + int rc; + + key = (typeof(key)) { + .lfik_name = KEY_FIEMAP, + .lfik_oa = *oa, + .lfik_fiemap = { + .fm_start = range->or_start, + .fm_length = range->or_end - range->or_start, + .fm_extent_count = ORR_NUM_EXTENTS + } + }; + + rc = obd_get_info(req->rq_svc_thread->t_env, req->rq_export, + sizeof(key), &key, NULL, fiemap); + if (rc < 0) + GOTO(out, rc); + + if (fiemap->fm_mapped_extents == 0 || + fiemap->fm_mapped_extents > ORR_NUM_EXTENTS) + GOTO(out, rc = -EFAULT); + + /** + * Calculate the physical offset ranges for the request from the extent + * information and the logical request offsets. + */ + start = fiemap->fm_extents[0].fe_physical + range->or_start - + fiemap->fm_extents[0].fe_logical; + end = start + range->or_end - range->or_start; + + range->or_start = start; + range->or_end = end; + + nrq->nr_u.orr.or_physical_set = 1; +out: + return rc; +} + +/** + * Sets the offset range the request covers; either in logical file + * offsets or in physical disk offsets. 
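+ * For illustration (made-up values, 4 KiB pages): a brw with two niobufs covering bytes [8192, 12288) and [12288, 16384) gets the logical range or_start = 8192, or_end = 16383 from nrs_orr_range_fill_logical(); if physical ordering is enabled and this is an OST_READ, that range is then translated into physical disk offsets via the fiemap call in nrs_orr_range_fill_physical().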
+ * + * \param[in] nrq the request + * \param[in] orrd the ORR/TRR policy scheduler instance + * \param[in] opc the request's opcode + * \param[in] moving_req is the request in the process of moving onto the + * high-priority NRS head? + * + * \retval 0 range filled successfully + * \retval != 0 error + */ +static int nrs_orr_range_fill(struct ptlrpc_nrs_request *nrq, + struct nrs_orr_data *orrd, __u32 opc, + bool moving_req) +{ + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + struct obd_ioobj *ioo; + struct niobuf_remote *nb; + struct ost_body *body; + struct nrs_orr_req_range range; + int niocount; + int rc = 0; + + /** + * If we are scheduling using physical disk offsets, but we have filled + * the offset information in the request previously + * (i.e. ldlm_lock_reorder_req() is moving the request to the + * high-priority NRS head), there is no need to do anything, and we can + * exit. Moreover than the lack of need, we would be unable to perform + * the obd_get_info() call required in nrs_orr_range_fill_physical(), + * because ldlm_lock_reorder_lock() calls into here while holding a + * spinlock, and retrieving fiemap information via obd_get_info() is a + * potentially sleeping operation. + */ + if (orrd->od_physical && nrq->nr_u.orr.or_physical_set) + return 0; + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + if (ioo == NULL) + GOTO(out, rc = -EFAULT); + + niocount = ioo->ioo_bufcnt; + + nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + if (nb == NULL) + GOTO(out, rc = -EFAULT); + + /** + * Use logical information from niobuf_remote structures. + */ + nrs_orr_range_fill_logical(nb, niocount, &range); + + /** + * Obtain physical offsets if selected, and this is an OST_READ RPC + * RPC. We do not enter this block if moving_req is set which indicates + * that the request is being moved to the high-priority NRS head by + * ldlm_lock_reorder_req(), as that function calls in here while holding + * a spinlock, and nrs_orr_range_physical() can sleep, so we just use + * logical file offsets for the range values for such requests. + */ + if (orrd->od_physical && opc == OST_READ && !moving_req) { + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + /** + * Translate to physical block offsets from backend filesystem + * extents. + * Ignore return values; if obtaining the physical offsets + * fails, use the logical offsets. + */ + nrs_orr_range_fill_physical(nrq, &body->oa, &range); + } + + nrq->nr_u.orr.or_range = range; +out: + return rc; +} + +/** + * Generates a character string that can be used in order to register uniquely + * named libcfs_hash and slab objects for ORR/TRR policy instances. The + * character string is unique per policy instance, as it includes the policy's + * name, the CPT number, and a {reg|hp} token, and there is one policy instance + * per NRS head on each CPT, and the policy is only compatible with the ost_io + * service. + * + * \param[in] policy the policy instance + * \param[out] name the character array that will hold the generated name + */ +static void nrs_orr_genobjname(struct ptlrpc_nrs_policy *policy, char *name) +{ + snprintf(name, NRS_ORR_OBJ_NAME_MAX, "%s%s%s%d", + "nrs_", policy->pol_desc->pd_name, + policy->pol_nrs->nrs_queue_type == PTLRPC_NRS_QUEUE_REG ? 
+ "_reg_" : "_hp_", nrs_pol2cptid(policy)); +} + +/** + * ORR/TRR hash operations + */ +#define NRS_ORR_BITS 24 +#define NRS_ORR_BKT_BITS 12 +#define NRS_ORR_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | CFS_HASH_ASSERT_EMPTY) + +#define NRS_TRR_BITS 4 +#define NRS_TRR_BKT_BITS 2 +#define NRS_TRR_HASH_FLAGS CFS_HASH_SPIN_BKTLOCK + +static unsigned +nrs_orr_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct nrs_orr_key), mask); +} + +static void *nrs_orr_hop_key(struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + return &orro->oo_key; +} + +static int nrs_orr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return lu_fid_eq(&orro->oo_key.ok_fid, + &((struct nrs_orr_key *)key)->ok_fid); +} + +static void *nrs_orr_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_orr_object, oo_hnode); +} + +static void nrs_orr_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref++; +} + +/** + * Removes an nrs_orr_object the hash and frees its memory, if the object has + * no active users. + */ +static void nrs_orr_hop_put_free(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + struct cfs_hash_bd bd; + + cfs_hash_bd_get_and_lock(hs, &orro->oo_key, &bd, 1); + + if (--orro->oo_ref > 1) { + cfs_hash_bd_unlock(hs, &bd, 1); + + return; + } + LASSERT(orro->oo_ref == 1); + + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_bd_unlock(hs, &bd, 1); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static void nrs_orr_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + orro->oo_ref--; +} + +static int nrs_trr_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + + return orro->oo_key.ok_idx == ((struct nrs_orr_key *)key)->ok_idx; +} + +static void nrs_trr_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_orr_object *orro = hlist_entry(hnode, + struct nrs_orr_object, + oo_hnode); + struct nrs_orr_data *orrd = container_of(orro->oo_res.res_parent, + struct nrs_orr_data, od_res); + + LASSERTF(orro->oo_ref == 0, + "Busy NRS TRR policy object for OST with index %u, with %ld " + "refs\n", orro->oo_key.ok_idx, orro->oo_ref); + + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); +} + +static struct cfs_hash_ops nrs_orr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_orr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put_free, + .hs_put_locked = nrs_orr_hop_put, +}; + +static struct cfs_hash_ops nrs_trr_hash_ops = { + .hs_hash = nrs_orr_hop_hash, + .hs_key = nrs_orr_hop_key, + .hs_keycmp = nrs_trr_hop_keycmp, + .hs_object = nrs_orr_hop_object, + .hs_get = nrs_orr_hop_get, + .hs_put = nrs_orr_hop_put, + .hs_put_locked = nrs_orr_hop_put, + .hs_exit = nrs_trr_hop_exit, +}; + +#define NRS_ORR_QUANTUM_DFLT 256 + +/** + * Binary heap predicate. 
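+ * In effect this is an ascending lexicographic comparison on the tuple (or_round, or_sequence, or_range.or_start, or_range.or_end), so the heap root is always the request belonging to the earliest round and batch, at the lowest offset.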
+ * + * Uses + * ptlrpc_nrs_request::nr_u::orr::or_round, + * ptlrpc_nrs_request::nr_u::orr::or_sequence, and + * ptlrpc_nrs_request::nr_u::orr::or_range to compare two binheap nodes and + * produce a binary predicate that indicates their relative priority, so that + * the binary heap can perform the necessary sorting operations. + * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +orr_req_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) +{ + struct ptlrpc_nrs_request *nrq1; + struct ptlrpc_nrs_request *nrq2; + + nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node); + nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node); + + /** + * Requests have been scheduled against a different scheduling round. + */ + if (nrq1->nr_u.orr.or_round < nrq2->nr_u.orr.or_round) + return 1; + else if (nrq1->nr_u.orr.or_round > nrq2->nr_u.orr.or_round) + return 0; + + /** + * Requests have been scheduled against the same scheduling round, but + * belong to a different batch, i.e. they pertain to a different + * backend-fs object (for ORR policy instances) or OST (for TRR policy + * instances). + */ + if (nrq1->nr_u.orr.or_sequence < nrq2->nr_u.orr.or_sequence) + return 1; + else if (nrq1->nr_u.orr.or_sequence > nrq2->nr_u.orr.or_sequence) + return 0; + + /** + * If round numbers and sequence numbers are equal, the two requests + * have been scheduled on the same round, and belong to the same batch, + * which means they pertain to the same backend-fs object (if this is an + * ORR policy instance), or to the same OST (if this is a TRR policy + * instance), so these requests should be sorted by ascending offset + * order. + */ + if (nrq1->nr_u.orr.or_range.or_start < + nrq2->nr_u.orr.or_range.or_start) { + return 1; + } else if (nrq1->nr_u.orr.or_range.or_start > + nrq2->nr_u.orr.or_range.or_start) { + return 0; + } else { + /** + * Requests start from the same offset; Dispatch the shorter one + * first; perhaps slightly more chances of hitting caches like + * this. + */ + return nrq1->nr_u.orr.or_range.or_end < + nrq2->nr_u.orr.or_range.or_end; + } +} + +/** + * ORR binary heap operations + */ +static struct cfs_binheap_ops nrs_orr_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = orr_req_compare, +}; + +/** + * Prints a warning message if an ORR/TRR policy is started on a service with + * more than one CPT. Not printed on the console for now, since we don't + * have any performance metrics in the first place, and it is annoying. + * + * \param[in] policy the policy instance + * + * \retval 0 success + */ +static int nrs_orr_init(struct ptlrpc_nrs_policy *policy) +{ + if (policy->pol_nrs->nrs_svcpt->scp_service->srv_ncpts > 1) + CDEBUG(D_CONFIG, "%s: The %s NRS policy was registered on a " + "service with multiple service partitions. This policy " + "may perform better with a single partition.\n", + policy->pol_nrs->nrs_svcpt->scp_service->srv_name, + policy->pol_desc->pd_name); + + return 0; +} + +/** + * Called when an ORR policy instance is started. 
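+ * This allocates and initializes the per-instance nrs_orr_data: the binary heap that holds queued requests, the slab cache for nrs_orr_object entries, and the object/OST hash, and sets the default quantum, supported RPC types and offset ordering.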
+ * + * \param[in] policy the policy + * + * \retval -ENOMEM OOM error + * \retval 0 success + */ +static int nrs_orr_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_orr_data *orrd; + struct cfs_hash_ops *ops; + unsigned cur_bits; + unsigned max_bits; + unsigned bkt_bits; + unsigned flags; + int rc = 0; + ENTRY; + + OBD_CPT_ALLOC_PTR(orrd, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (orrd == NULL) + RETURN(-ENOMEM); + + /* + * Binary heap instance for sorted incoming requests. + */ + orrd->od_binheap = cfs_binheap_create(&nrs_orr_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (orrd->od_binheap == NULL) + GOTO(out_orrd, rc = -ENOMEM); + + nrs_orr_genobjname(policy, orrd->od_objname); + + /** + * Slab cache for NRS ORR/TRR objects. + */ + orrd->od_cache = kmem_cache_create(orrd->od_objname, + sizeof(struct nrs_orr_object), + 0, 0, NULL); + if (orrd->od_cache == NULL) + GOTO(out_binheap, rc = -ENOMEM); + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) { + ops = &nrs_orr_hash_ops; + cur_bits = NRS_ORR_BITS; + max_bits = NRS_ORR_BITS; + bkt_bits = NRS_ORR_BKT_BITS; + flags = NRS_ORR_HASH_FLAGS; + } else { + ops = &nrs_trr_hash_ops; + cur_bits = NRS_TRR_BITS; + max_bits = NRS_TRR_BITS; + bkt_bits = NRS_TRR_BKT_BITS; + flags = NRS_TRR_HASH_FLAGS; + } + + /** + * Hash for finding objects by struct nrs_orr_key. + * XXX: For TRR, it might be better to avoid using libcfs_hash? + * All that needs to be resolved are OST indices, and they + * will stay relatively stable during an OSS node's lifetime. + */ + orrd->od_obj_hash = cfs_hash_create(orrd->od_objname, cur_bits, + max_bits, bkt_bits, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, ops, flags); + if (orrd->od_obj_hash == NULL) + GOTO(out_cache, rc = -ENOMEM); + + /* XXX: Fields accessed unlocked */ + orrd->od_quantum = NRS_ORR_QUANTUM_DFLT; + orrd->od_supp = NOS_DFLT; + orrd->od_physical = true; + /** + * Set to 1 so that the test inside nrs_orr_req_add() can evaluate to + * true. + */ + orrd->od_sequence = 1; + + policy->pol_private = orrd; + + RETURN(rc); + +out_cache: + kmem_cache_destroy(orrd->od_cache); +out_binheap: + cfs_binheap_destroy(orrd->od_binheap); +out_orrd: + OBD_FREE_PTR(orrd); + + RETURN(rc); +} + +/** + * Called when an ORR/TRR policy instance is stopped. + * + * Called when the policy has been instructed to transition to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more + * pending requests to serve. + * + * \param[in] policy the policy + */ +static void nrs_orr_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_orr_data *orrd = policy->pol_private; + ENTRY; + + LASSERT(orrd != NULL); + LASSERT(orrd->od_binheap != NULL); + LASSERT(orrd->od_obj_hash != NULL); + LASSERT(orrd->od_cache != NULL); + LASSERT(cfs_binheap_is_empty(orrd->od_binheap)); + + cfs_binheap_destroy(orrd->od_binheap); + cfs_hash_putref(orrd->od_obj_hash); + kmem_cache_destroy(orrd->od_cache); + + OBD_FREE_PTR(orrd); +} + +/** + * Performs a policy-specific ctl function on ORR/TRR policy instances; similar + * to ioctl. 
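+ * For example, the lprocfs handlers further down read and update nrs_orr_data::od_quantum by passing NRS_CTL_ORR_RD_QUANTUM or NRS_CTL_ORR_WR_QUANTUM to ptlrpc_nrs_policy_control(), which invokes this function with nrs_lock held.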
+ * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried successfully + * \retval -ve error + */ +static int nrs_orr_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch((enum nrs_ctl_orr)opc) { + default: + RETURN(-EINVAL); + + case NRS_CTL_ORR_RD_QUANTUM: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(__u16 *)arg = orrd->od_quantum; + } + break; + + case NRS_CTL_ORR_WR_QUANTUM: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_quantum = *(__u16 *)arg; + LASSERT(orrd->od_quantum != 0); + } + break; + + case NRS_CTL_ORR_RD_OFF_TYPE: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(bool *)arg = orrd->od_physical; + } + break; + + case NRS_CTL_ORR_WR_OFF_TYPE: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_physical = *(bool *)arg; + } + break; + + case NRS_CTL_ORR_RD_SUPP_REQ: { + struct nrs_orr_data *orrd = policy->pol_private; + + *(enum nrs_orr_supp *)arg = orrd->od_supp; + } + break; + + case NRS_CTL_ORR_WR_SUPP_REQ: { + struct nrs_orr_data *orrd = policy->pol_private; + + orrd->od_supp = *(enum nrs_orr_supp *)arg; + LASSERT((orrd->od_supp & NOS_OST_RW) != 0); + } + break; + } + RETURN(0); +} + +/** + * Obtains resources for ORR/TRR policy instances. The top-level resource lives + * inside \e nrs_orr_data and the second-level resource inside + * \e nrs_orr_object instances. + * + * \param[in] policy the policy for which resources are being taken for + * request \a nrq + * \param[in] nrq the request for which resources are being taken + * \param[in] parent parent resource, embedded in nrs_orr_data for the + * ORR/TRR policies + * \param[out] resp used to return resource references + * \param[in] moving_req signifies limited caller context; used to perform + * memory allocations in an atomic context in this + * policy + * + * \retval 0 we are returning a top-level, parent resource, one that is + * embedded in an nrs_orr_data object + * \retval 1 we are returning a bottom-level resource, one that is embedded + * in an nrs_orr_object object + * + * \see nrs_resource_get_safe() + */ +static int nrs_orr_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + struct nrs_orr_object *tmp; + struct nrs_orr_key key = { { { 0 } } }; + __u32 opc; + int rc = 0; + + /** + * struct nrs_orr_data is requested. + */ + if (parent == NULL) { + *resp = &((struct nrs_orr_data *)policy->pol_private)->od_res; + return 0; + } + + orrd = container_of(parent, struct nrs_orr_data, od_res); + + /** + * If the request type is not supported, fail the enqueuing; the RPC + * will be handled by the fallback NRS policy. + */ + if (!nrs_orr_req_supported(orrd, nrq, &opc)) + return -1; + + /** + * Fill in the key for the request; OST FID for ORR policy instances, + * and OST index for TRR policy instances. 
+ */ + rc = nrs_orr_key_fill(orrd, nrq, opc, policy->pol_desc->pd_name, &key); + if (rc < 0) + RETURN(rc); + + /** + * Set the offset range the request covers + */ + rc = nrs_orr_range_fill(nrq, orrd, opc, moving_req); + if (rc < 0) + RETURN(rc); + + orro = cfs_hash_lookup(orrd->od_obj_hash, &key); + if (orro != NULL) + goto out; + + OBD_SLAB_CPT_ALLOC_PTR_GFP(orro, orrd->od_cache, + nrs_pol2cptab(policy), nrs_pol2cptid(policy), + moving_req ? GFP_ATOMIC : GFP_NOFS); + if (orro == NULL) + RETURN(-ENOMEM); + + orro->oo_key = key; + orro->oo_ref = 1; + + tmp = cfs_hash_findadd_unique(orrd->od_obj_hash, &orro->oo_key, + &orro->oo_hnode); + if (tmp != orro) { + OBD_SLAB_FREE_PTR(orro, orrd->od_cache); + orro = tmp; + } +out: + /** + * For debugging purposes + */ + nrq->nr_u.orr.or_key = orro->oo_key; + + *resp = &orro->oo_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using ORR/TRR policy instances + * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_orr_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + + /** + * Do nothing for freeing parent, nrs_orr_data resources. + */ + if (res->res_parent == NULL) + return; + + orro = container_of(res, struct nrs_orr_object, oo_res); + orrd = container_of(res->res_parent, struct nrs_orr_data, od_res); + + cfs_hash_put(orrd->od_obj_hash, &orro->oo_hnode); +} + +/** + * Called when polling an ORR/TRR policy instance for a request so that it can + * be served. Returns the request that is at the root of the binary heap, as + * that is the lowest priority one (i.e. libcfs_heap is an implementation of a + * min-heap) + * + * \param[in] policy the policy instance being polled + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force force the policy to return a request; unused in this policy + * + * \retval the request to be handled + * \retval NULL no request available + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_orr_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_orr_data *orrd = policy->pol_private; + struct cfs_binheap_node *node = cfs_binheap_root(orrd->od_binheap); + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(node == NULL) ? 
NULL : + container_of(node, struct ptlrpc_nrs_request, nr_node); + + if (likely(!peek && nrq != NULL)) { + struct nrs_orr_object *orro; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + cfs_binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request for object " + "with FID "DFID", from OST with index %u, with " + "round %llu\n", NRS_POL_NAME_ORR, + PFID(&orro->oo_key.ok_fid), + nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); + else + CDEBUG(D_RPCTRACE, + "NRS: starting to handle %s request from OST " + "with index %u, with round %llu\n", + NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); + + /** Peek at the next request to be served */ + node = cfs_binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + struct ptlrpc_nrs_request *next; + + next = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < next->nr_u.orr.or_round) + orrd->od_round = next->nr_u.orr.or_round; + } + } + + return nrq; +} + +/** + * Sort-adds request \a nrq to an ORR/TRR \a policy instance's set of queued + * requests in the policy's binary heap. + * + * A scheduling round is a stream of requests that have been sorted in batches + * according to the backend-fs object (for ORR policy instances) or OST (for TRR + * policy instances) that they pertain to (as identified by its IDIF FID or OST + * index respectively); there can be only one batch for each object or OST in + * each round. The batches are of maximum size nrs_orr_data:od_quantum. When a + * new request arrives for scheduling for an object or OST that has exhausted + * its quantum in its current round, the request will be scheduled on the next + * scheduling round. Requests are allowed to be scheduled against a round until + * all requests for the round are serviced, so an object or OST might miss a + * round if requests are not scheduled for it for a long enough period of time. + * Objects or OSTs that miss a round will continue with having their next + * request scheduled, starting at the round that requests are being dispatched + * for, at the time of arrival of this request. + * + * Requests are tagged with the round number and a sequence number; the sequence + * number indicates the relative ordering amongst the batches of requests in a + * round, and is identical for all requests in a batch, as is the round number. + * The round and sequence numbers are used by orr_req_compare() in order to use + * nrs_orr_data::od_binheap in order to maintain an ordered set of rounds, with + * each round consisting of an ordered set of batches of requests, and each + * batch consisting of an ordered set of requests according to their logical + * file or physical disk offsets. 
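+ * A purely illustrative example, with a quantum of 2: five reads for objects A and B arriving as A1 A2 A3 B1 B2 (ascending offsets within each object) are batched as {A1, A2} and {B1, B2} in the current round, while A3 starts A's next-round batch because A has exhausted its quantum; the heap therefore dispatches A1 A2 B1 B2 A3.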
+ * + * \param[in] policy the policy + * \param[in] nrq the request to add + * + * \retval 0 request successfully added + * \retval != 0 error + */ +static int nrs_orr_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + int rc; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + if (orro->oo_quantum == 0 || orro->oo_round < orrd->od_round || + (orro->oo_active == 0 && orro->oo_quantum > 0)) { + + /** + * If there are no pending requests for the object/OST, but some + * of its quantum still remains unused, which implies we did not + * get a chance to schedule up to its maximum allowed batch size + * of requests in the previous round this object/OST + * participated in, schedule this next request on a new round; + * this avoids fragmentation of request batches caused by + * intermittent inactivity on the object/OST, at the expense of + * potentially slightly increased service time for the request + * batch this request will be a part of. + */ + if (orro->oo_active == 0 && orro->oo_quantum > 0) + orro->oo_round++; + + /** A new scheduling round has commenced */ + if (orro->oo_round < orrd->od_round) + orro->oo_round = orrd->od_round; + + /** I was not the last object/OST that scheduled a request */ + if (orro->oo_sequence < orrd->od_sequence) + orro->oo_sequence = ++orrd->od_sequence; + /** + * Reset the quantum if we have reached the maximum quantum + * size for this batch, or even if we have not managed to + * complete a batch size up to its maximum allowed size. + * XXX: Accessed unlocked + */ + orro->oo_quantum = orrd->od_quantum; + } + + nrq->nr_u.orr.or_round = orro->oo_round; + nrq->nr_u.orr.or_sequence = orro->oo_sequence; + + rc = cfs_binheap_insert(orrd->od_binheap, &nrq->nr_node); + if (rc == 0) { + orro->oo_active++; + if (--orro->oo_quantum == 0) + orro->oo_round++; + } + return rc; +} + +/** + * Removes request \a nrq from an ORR/TRR \a policy instance's set of queued + * requests. + * + * \param[in] policy the policy + * \param[in] nrq the request to remove + */ +static void nrs_orr_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_orr_data *orrd; + struct nrs_orr_object *orro; + bool is_root; + + orro = container_of(nrs_request_resource(nrq), + struct nrs_orr_object, oo_res); + orrd = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_orr_data, od_res); + + LASSERT(nrq->nr_u.orr.or_round <= orro->oo_round); + + is_root = &nrq->nr_node == cfs_binheap_root(orrd->od_binheap); + + cfs_binheap_remove(orrd->od_binheap, &nrq->nr_node); + orro->oo_active--; + + /** + * If we just deleted the node at the root of the binheap, we may have + * to adjust round numbers. + */ + if (unlikely(is_root)) { + /** Peek at the next request to be served */ + struct cfs_binheap_node *node = cfs_binheap_root(orrd->od_binheap); + + /** No more requests */ + if (unlikely(node == NULL)) { + orrd->od_round++; + } else { + nrq = container_of(node, struct ptlrpc_nrs_request, + nr_node); + + if (orrd->od_round < nrq->nr_u.orr.or_round) + orrd->od_round = nrq->nr_u.orr.or_round; + } + } +} + +/** + * Called right after the request \a nrq finishes being handled by ORR policy + * instance \a policy. 
+ * + * \param[in] policy the policy that handled the request + * \param[in] nrq the request that was handled + */ +static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + /** NB: resource control, credits etc can be added here */ + if (strncmp(policy->pol_desc->pd_name, NRS_POL_NAME_ORR, + NRS_POL_NAME_MAX) == 0) + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request for object with FID " + DFID", from OST with index %u, with round %llu\n", + NRS_POL_NAME_ORR, PFID(&nrq->nr_u.orr.or_key.ok_fid), + nrq->nr_u.orr.or_key.ok_idx, nrq->nr_u.orr.or_round); + else + CDEBUG(D_RPCTRACE, + "NRS: finished handling %s request from OST with index %u," + " with round %llu\n", + NRS_POL_NAME_TRR, nrq->nr_u.orr.or_key.ok_idx, + nrq->nr_u.orr.or_round); +} + +/** + * lprocfs interface + */ + +#ifdef CONFIG_PROC_FS + +/** + * This allows to bundle the policy name into the lprocfs_vars::data pointer + * so that lprocfs read/write functions can be used by both the ORR and TRR + * policies. + */ +static struct nrs_lprocfs_orr_data { + struct ptlrpc_service *svc; + char *name; +} lprocfs_orr_data = { + .name = NRS_POL_NAME_ORR +}, lprocfs_trr_data = { + .name = NRS_POL_NAME_TRR +}; + +/** + * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size) + * for ORR/TRR policy instances on both the regular and high-priority NRS head + * of a service, as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_orr_ctl(). + * + * Quantum values are in # of RPCs, and the output is in YAML format. + * + * For example: + * + * reg_quantum:256 + * hp_quantum:8 + * + * XXX: the CRR-N version of this, ptlrpc_lprocfs_rd_nrs_crrn_quantum() is + * almost identical; it can be reworked and then reused for ORR/TRR. + */ +static int +ptlrpc_lprocfs_nrs_orr_quantum_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + __u16 quantum; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_REG "%-5d\n", quantum); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. + */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, NRS_CTL_ORR_RD_QUANTUM, + true, &quantum); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + + return rc; +} + +/** + * Sets the value of the Round Robin quantum (i.e. the maximum batch size) + * for ORR/TRR policy instances of a service. 
The user can set the quantum size + * for the regular and high priority NRS head separately by specifying each + * value, or both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=req_quantum:64, to set the + * request quantum size of the ORR policy instance on the regular NRS head of + * the ost_io service to 64 + * + * lctl set_param ost.OSS.ost_io.nrs_trr_quantum=hp_quantum:8 to set the request + * quantum size of the TRR policy instance on the high priority NRS head of the + * ost_io service to 8 + * + * lctl set_param ost.OSS.ost_io.nrs_orr_quantum=32, to set both the request + * quantum size of the ORR policy instance on both the regular and the high + * priority NRS head of the ost_io service to 32 + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state + * are skipped later by nrs_orr_ctl(). + * + * XXX: the CRR-N version of this, ptlrpc_lprocfs_wr_nrs_crrn_quantum() is + * almost identical; it can be reworked and then reused for ORR/TRR. + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_QUANTUM_MAX_CMD]; + char *val; + long quantum_reg; + long quantum_hp; + /** lprocfs_find_named_value() modifies its argument, so keep a copy */ + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, + &count_copy); + if (val != kernbuf) { + quantum_reg = simple_strtol(val, NULL, 10); + + queue |= PTLRPC_NRS_QUEUE_REG; + } + + count_copy = count; + + /** + * Check if the high priority quantum value has been specified + */ + val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_HP, + &count_copy); + if (val != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + quantum_hp = simple_strtol(val, NULL, 10); + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, look for a valid numerical + * value + */ + if (queue == 0) { + if (!isdigit(kernbuf[0])) + return -EINVAL; + + quantum_reg = simple_strtol(kernbuf, NULL, 10); + + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) { + queue |= PTLRPC_NRS_QUEUE_HP; + quantum_hp = quantum_reg; + } + } + + if ((((queue & PTLRPC_NRS_QUEUE_REG) != 0) && + ((quantum_reg > LPROCFS_NRS_QUANTUM_MAX || quantum_reg <= 0))) || + (((queue & PTLRPC_NRS_QUEUE_HP) != 0) && + ((quantum_hp > LPROCFS_NRS_QUANTUM_MAX || quantum_hp <= 0)))) + return -EINVAL; + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. 
+ */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_QUANTUM, false, + &quantum_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_QUANTUM, false, + &quantum_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); + +#define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:" +#define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:" + +#define LPROCFS_NRS_OFF_NAME_PHYSICAL "physical" +#define LPROCFS_NRS_OFF_NAME_LOGICAL "logical" + +/** + * Retrieves the offset type used by ORR/TRR policy instances on both the + * regular and high-priority NRS head of a service, as long as a policy + * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; + * policy instances in this state are skipped later by nrs_orr_ctl(). + * + * Offset type information is a (physical|logical) string, and output is + * in YAML format. + * + * For example: + * + * reg_offset_type:physical + * hp_offset_type:logical + */ +static int +ptlrpc_lprocfs_nrs_orr_offset_type_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + bool physical; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED + * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE, + true, &physical); + if (rc == 0) { + seq_printf(m, LPROCFS_NRS_OFF_NAME_REG"%s\n", + physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL : + LPROCFS_NRS_OFF_NAME_LOGICAL); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. + */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, NRS_CTL_ORR_RD_OFF_TYPE, + true, &physical); + if (rc == 0) { + seq_printf(m, LPROCFS_NRS_OFF_NAME_HP"%s\n", + physical ? LPROCFS_NRS_OFF_NAME_PHYSICAL : + LPROCFS_NRS_OFF_NAME_LOGICAL); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + return rc; +} + +/** + * Max valid command string is the size of the labels, plus "physical" twice. + * plus a separating ' ' + */ +#define LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD \ + sizeof(LPROCFS_NRS_OFF_NAME_REG LPROCFS_NRS_OFF_NAME_PHYSICAL " " \ + LPROCFS_NRS_OFF_NAME_HP LPROCFS_NRS_OFF_NAME_PHYSICAL) + +/** + * Sets the type of offsets used to order RPCs in ORR/TRR policy instances. The + * user can set offset type for the regular or high priority NRS head + * separately by specifying each value, or both together in a single invocation. 
+ * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_offset_type= + * reg_offset_type:physical, to enable the ORR policy instance on the regular + * NRS head of the ost_io service to use physical disk offset ordering. + * + * lctl set_param ost.OSS.ost_io.nrs_trr_offset_type=logical, to enable the TRR + * policy instances on both the regular ang high priority NRS heads of the + * ost_io service to use logical file offset ordering. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state are + * are skipped later by nrs_orr_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_OFF_TYPE_MAX_CMD]; + char *val_reg; + char *val_hp; + bool physical_reg; + bool physical_hp; + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular offset type has been specified + */ + val_reg = lprocfs_find_named_value(kernbuf, + LPROCFS_NRS_OFF_NAME_REG, + &count_copy); + if (val_reg != kernbuf) + queue |= PTLRPC_NRS_QUEUE_REG; + + count_copy = count; + + /** + * Check if the high priority offset type has been specified + */ + val_hp = lprocfs_find_named_value(kernbuf, LPROCFS_NRS_OFF_NAME_HP, + &count_copy); + if (val_hp != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, there may be a valid + * command string at the start of the buffer. + */ + if (queue == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_PHYSICAL, + sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0) + physical_reg = true; + else if (strncmp(val_reg, LPROCFS_NRS_OFF_NAME_LOGICAL, + sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0) + physical_reg = false; + else + return -EINVAL; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_PHYSICAL, + sizeof(LPROCFS_NRS_OFF_NAME_PHYSICAL) - 1) == 0) + physical_hp = true; + else if (strncmp(val_hp, LPROCFS_NRS_OFF_NAME_LOGICAL, + sizeof(LPROCFS_NRS_OFF_NAME_LOGICAL) - 1) == 0) + physical_hp = false; + else + return -EINVAL; + } + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. 
+ */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_OFF_TYPE, false, + &physical_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_OFF_TYPE, false, + &physical_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); + +#define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:" +#define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:" + +#define LPROCFS_NRS_SUPP_NAME_READS "reads" +#define LPROCFS_NRS_SUPP_NAME_WRITES "writes" +#define LPROCFS_NRS_SUPP_NAME_READWRITES "reads_and_writes" + +/** + * Translates enum nrs_orr_supp values to a corresponding string. + */ +static const char *nrs_orr_supp2str(enum nrs_orr_supp supp) +{ + switch(supp) { + default: + LBUG(); + case NOS_OST_READ: + return LPROCFS_NRS_SUPP_NAME_READS; + case NOS_OST_WRITE: + return LPROCFS_NRS_SUPP_NAME_WRITES; + case NOS_OST_RW: + return LPROCFS_NRS_SUPP_NAME_READWRITES; + } +} + +/** + * Translates strings to the corresponding enum nrs_orr_supp value + */ +static enum nrs_orr_supp nrs_orr_str2supp(const char *val) +{ + if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READWRITES, + sizeof(LPROCFS_NRS_SUPP_NAME_READWRITES) - 1) == 0) + return NOS_OST_RW; + else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_READS, + sizeof(LPROCFS_NRS_SUPP_NAME_READS) - 1) == 0) + return NOS_OST_READ; + else if (strncmp(val, LPROCFS_NRS_SUPP_NAME_WRITES, + sizeof(LPROCFS_NRS_SUPP_NAME_WRITES) - 1) == 0) + return NOS_OST_WRITE; + else + return -EINVAL; +} + +/** + * Retrieves the type of RPCs handled at the point of invocation by ORR/TRR + * policy instances on both the regular and high-priority NRS head of a service, + * as long as a policy instance is not in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this + * state are skipped later by nrs_orr_ctl(). + * + * Supported RPC type information is a (reads|writes|reads_and_writes) string, + * and output is in YAML format. + * + * For example: + * + * reg_supported:reads + * hp_supported:reads_and_writes + */ +static int +ptlrpc_lprocfs_nrs_orr_supported_seq_show(struct seq_file *m, void *data) +{ + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum nrs_orr_supp supported; + int rc; + + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED + * or ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. + */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_RD_SUPP_REQ, true, + &supported); + + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_REG"%s\n", + nrs_orr_supp2str(supported)); + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + /** + * We know the ost_io service which is the only one ORR/TRR policies are + * compatible with, do have an HP NRS head, but it may be best to guard + * against a possible change of this in the future. 
+ */ + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_RD_SUPP_REQ, true, + &supported); + if (rc == 0) { + seq_printf(m, NRS_LPROCFS_REQ_SUPP_NAME_HP"%s\n", + nrs_orr_supp2str(supported)); + /** + * Ignore -ENODEV as the high priority NRS head's policy may be + * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + +no_hp: + + return rc; +} + +/** + * Max valid command string is the size of the labels, plus "reads_and_writes" + * twice, plus a separating ' ' + */ +#define LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD \ + sizeof(NRS_LPROCFS_REQ_SUPP_NAME_REG LPROCFS_NRS_SUPP_NAME_READWRITES \ + NRS_LPROCFS_REQ_SUPP_NAME_HP LPROCFS_NRS_SUPP_NAME_READWRITES \ + " ") + +/** + * Sets the type of RPCs handled by ORR/TRR policy instances. The user can + * modify this setting for the regular or high priority NRS heads separately, or + * both together in a single invocation. + * + * For example: + * + * lctl set_param ost.OSS.ost_io.nrs_orr_supported= + * "reg_supported:reads", to enable the ORR policy instance on the regular NRS + * head of the ost_io service to handle OST_READ RPCs. + * + * lctl set_param ost.OSS.ost_io.nrs_trr_supported=reads_and_writes, to enable + * the TRR policy instances on both the regular ang high priority NRS heads of + * the ost_io service to use handle OST_READ and OST_WRITE RPCs. + * + * policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state are + * are skipped later by nrs_orr_ctl(). + */ +static ssize_t +ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, + const char __user *buffer, + size_t count, + loff_t *off) +{ + struct seq_file *m = file->private_data; + struct nrs_lprocfs_orr_data *orr_data = m->private; + struct ptlrpc_service *svc = orr_data->svc; + enum ptlrpc_nrs_queue_type queue = 0; + char kernbuf[LPROCFS_NRS_WR_REQ_SUPP_MAX_CMD]; + char *val_reg; + char *val_hp; + enum nrs_orr_supp supp_reg; + enum nrs_orr_supp supp_hp; + size_t count_copy; + int rc = 0; + int rc2 = 0; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + + count_copy = count; + + /** + * Check if the regular supported requests setting has been specified + */ + val_reg = lprocfs_find_named_value(kernbuf, + NRS_LPROCFS_REQ_SUPP_NAME_REG, + &count_copy); + if (val_reg != kernbuf) + queue |= PTLRPC_NRS_QUEUE_REG; + + count_copy = count; + + /** + * Check if the high priority supported requests setting has been + * specified + */ + val_hp = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_REQ_SUPP_NAME_HP, + &count_copy); + if (val_hp != kernbuf) { + if (!nrs_svc_has_hp(svc)) + return -ENODEV; + + queue |= PTLRPC_NRS_QUEUE_HP; + } + + /** + * If none of the queues has been specified, there may be a valid + * command string at the start of the buffer. 
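+ * For example, writing just "reads" applies to the regular head, and also to the HP head when the service has one.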
+ */ + if (queue == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + + if (nrs_svc_has_hp(svc)) + queue |= PTLRPC_NRS_QUEUE_HP; + } + + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + supp_reg = nrs_orr_str2supp(val_reg); + if (supp_reg == -EINVAL) + return -EINVAL; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + supp_hp = nrs_orr_str2supp(val_hp); + if (supp_hp == -EINVAL) + return -EINVAL; + } + + /** + * We change the values on regular and HP NRS heads separately, so that + * we do not exit early from ptlrpc_nrs_policy_control() with an error + * returned by nrs_policy_ctl_locked(), in cases where the user has not + * started the policy on either the regular or HP NRS head; i.e. we are + * ignoring -ENODEV within nrs_policy_ctl_locked(). -ENODEV is returned + * only if the operation fails with -ENODEV on all heads that have been + * specified by the command; if at least one operation succeeds, + * success is returned. + */ + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + orr_data->name, + NRS_CTL_ORR_WR_SUPP_REQ, false, + &supp_reg); + if ((rc < 0 && rc != -ENODEV) || + (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG)) + return rc; + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + orr_data->name, + NRS_CTL_ORR_WR_SUPP_REQ, false, + &supp_hp); + if ((rc2 < 0 && rc2 != -ENODEV) || + (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP)) + return rc2; + } + + return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); + +static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) +{ + int i; + + struct lprocfs_vars nrs_orr_lprocfs_vars[] = { + { .name = "nrs_orr_quantum", + .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, + { .name = "nrs_orr_offset_type", + .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops }, + { .name = "nrs_orr_supported", + .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops }, + { NULL } + }; + + if (svc->srv_procroot == NULL) + return 0; + + lprocfs_orr_data.svc = svc; + + for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++) + nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data; + + return lprocfs_add_vars(svc->srv_procroot, nrs_orr_lprocfs_vars, NULL); +} + +static void nrs_orr_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_orr_quantum", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_orr_offset_type", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_orr_supported", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + +static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, +#ifdef CONFIG_PROC_FS + .op_lprocfs_init = nrs_orr_lprocfs_init, + .op_lprocfs_fini = nrs_orr_lprocfs_fini, +#endif +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_orr = { + .nc_name = NRS_POL_NAME_ORR, + .nc_ops = &nrs_orr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** + * TRR, Target-based Round Robin policy + * + * TRR reuses much of the functions and data structures of ORR + */ + +#ifdef CONFIG_PROC_FS + +static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) +{ + int i; + + struct 
lprocfs_vars nrs_trr_lprocfs_vars[] = { + { .name = "nrs_trr_quantum", + .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, + { .name = "nrs_trr_offset_type", + .fops = &ptlrpc_lprocfs_nrs_orr_offset_type_fops }, + { .name = "nrs_trr_supported", + .fops = &ptlrpc_lprocfs_nrs_orr_supported_fops }, + { NULL } + }; + + if (svc->srv_procroot == NULL) + return 0; + + lprocfs_trr_data.svc = svc; + + for (i = 0; i < ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) + nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; + + return lprocfs_add_vars(svc->srv_procroot, nrs_trr_lprocfs_vars, NULL); +} + +static void nrs_trr_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_trr_quantum", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_trr_offset_type", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_trr_supported", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + +/** + * Reuse much of the ORR functionality for TRR. + */ +static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { + .op_policy_init = nrs_orr_init, + .op_policy_start = nrs_orr_start, + .op_policy_stop = nrs_orr_stop, + .op_policy_ctl = nrs_orr_ctl, + .op_res_get = nrs_orr_res_get, + .op_res_put = nrs_orr_res_put, + .op_req_get = nrs_orr_req_get, + .op_req_enqueue = nrs_orr_req_add, + .op_req_dequeue = nrs_orr_req_del, + .op_req_stop = nrs_orr_req_stop, +#ifdef CONFIG_PROC_FS + .op_lprocfs_init = nrs_trr_lprocfs_init, + .op_lprocfs_fini = nrs_trr_lprocfs_fini, +#endif +}; + +struct ptlrpc_nrs_pol_conf nrs_conf_trr = { + .nc_name = NRS_POL_NAME_TRR, + .nc_ops = &nrs_trr_ops, + .nc_compat = nrs_policy_compat_one, + .nc_compat_svc_name = "ost_io", +}; + +/** @} ORR/TRR policy */ + +/** @} nrs */ + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c new file mode 100644 index 0000000000000..6a042feb143e7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -0,0 +1,3088 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (C) 2013 DataDirect Networks, Inc. + * + * Copyright (c) 2014, 2016, Intel Corporation. 
+ */ +/* + * lustre/ptlrpc/nrs_tbf.c + * + * Network Request Scheduler (NRS) Token Bucket Filter(TBF) policy + * + */ + +#ifdef HAVE_SERVER_SUPPORT + +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name tbf + * + * Token Bucket Filter over client NIDs + * + * @{ + */ + +#define NRS_POL_NAME_TBF "tbf" + +static int tbf_jobid_cache_size = 8192; +module_param(tbf_jobid_cache_size, int, 0644); +MODULE_PARM_DESC(tbf_jobid_cache_size, "The size of jobid cache"); + +static int tbf_rate = 10000; +module_param(tbf_rate, int, 0644); +MODULE_PARM_DESC(tbf_rate, "Default rate limit in RPCs/s"); + +static int tbf_depth = 3; +module_param(tbf_depth, int, 0644); +MODULE_PARM_DESC(tbf_depth, "How many tokens that a client can save up"); + +static enum hrtimer_restart nrs_tbf_timer_cb(struct hrtimer *timer) +{ + struct nrs_tbf_head *head = container_of(timer, struct nrs_tbf_head, + th_timer); + struct ptlrpc_nrs *nrs = head->th_res.res_policy->pol_nrs; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + + nrs->nrs_throttling = 0; + wake_up(&svcpt->scp_waitq); + + return HRTIMER_NORESTART; +} + +#define NRS_TBF_DEFAULT_RULE "default" + +static void nrs_tbf_rule_fini(struct nrs_tbf_rule *rule) +{ + LASSERT(atomic_read(&rule->tr_ref) == 0); + LASSERT(list_empty(&rule->tr_cli_list)); + LASSERT(list_empty(&rule->tr_linkage)); + + rule->tr_head->th_ops->o_rule_fini(rule); + OBD_FREE_PTR(rule); +} + +/** + * Decreases the rule's usage reference count, and stops the rule in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). + */ +static void nrs_tbf_rule_put(struct nrs_tbf_rule *rule) +{ + if (atomic_dec_and_test(&rule->tr_ref)) + nrs_tbf_rule_fini(rule); +} + +/** + * Increases the rule's usage reference count. 
+ */ +static inline void nrs_tbf_rule_get(struct nrs_tbf_rule *rule) +{ + atomic_inc(&rule->tr_ref); +} + +static void +nrs_tbf_cli_rule_put(struct nrs_tbf_client *cli) +{ + LASSERT(!list_empty(&cli->tc_linkage)); + LASSERT(cli->tc_rule); + spin_lock(&cli->tc_rule->tr_rule_lock); + list_del_init(&cli->tc_linkage); + spin_unlock(&cli->tc_rule->tr_rule_lock); + nrs_tbf_rule_put(cli->tc_rule); + cli->tc_rule = NULL; +} + +static void +nrs_tbf_cli_reset_value(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) + +{ + struct nrs_tbf_rule *rule = cli->tc_rule; + + cli->tc_rpc_rate = rule->tr_rpc_rate; + cli->tc_nsecs = rule->tr_nsecs; + cli->tc_depth = rule->tr_depth; + cli->tc_ntoken = rule->tr_depth; + cli->tc_check_time = ktime_to_ns(ktime_get()); + cli->tc_rule_sequence = atomic_read(&head->th_rule_sequence); + cli->tc_rule_generation = rule->tr_generation; + + if (cli->tc_in_heap) + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); +} + +static void +nrs_tbf_cli_reset(struct nrs_tbf_head *head, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + spin_lock(&cli->tc_rule_lock); + if (cli->tc_rule != NULL && !list_empty(&cli->tc_linkage)) { + LASSERT(rule != cli->tc_rule); + nrs_tbf_cli_rule_put(cli); + } + LASSERT(cli->tc_rule == NULL); + LASSERT(list_empty(&cli->tc_linkage)); + /* Rule's ref is added before called */ + cli->tc_rule = rule; + spin_lock(&rule->tr_rule_lock); + list_add_tail(&cli->tc_linkage, &rule->tr_cli_list); + spin_unlock(&rule->tr_rule_lock); + spin_unlock(&cli->tc_rule_lock); + nrs_tbf_cli_reset_value(head, cli); +} + +static int +nrs_tbf_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + return rule->tr_head->th_ops->o_rule_dump(rule, m); +} + +static int +nrs_tbf_rule_dump_all(struct nrs_tbf_head *head, struct seq_file *m) +{ + struct nrs_tbf_rule *rule; + int rc = 0; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + /* List the rules from newest to oldest */ + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + rc = nrs_tbf_rule_dump(rule, m); + if (rc) { + rc = -ENOSPC; + break; + } + } + spin_unlock(&head->th_rule_lock); + + return rc; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find_nolock(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + list_for_each_entry(rule, &head->th_list, tr_linkage) { + LASSERT((rule->tr_flags & NTRS_STOPPING) == 0); + if (strcmp(rule->tr_name, name) == 0) { + nrs_tbf_rule_get(rule); + return rule; + } + } + return NULL; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_find(struct nrs_tbf_head *head, + const char *name) +{ + struct nrs_tbf_rule *rule; + + LASSERT(head != NULL); + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static struct nrs_tbf_rule * +nrs_tbf_rule_match(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *tmp_rule; + + spin_lock(&head->th_rule_lock); + /* Match the newest rule in the list */ + list_for_each_entry(tmp_rule, &head->th_list, tr_linkage) { + LASSERT((tmp_rule->tr_flags & NTRS_STOPPING) == 0); + if (head->th_ops->o_rule_match(tmp_rule, cli)) { + rule = tmp_rule; + break; + } + } + + if (rule == NULL) + rule = head->th_rule; + + nrs_tbf_rule_get(rule); + spin_unlock(&head->th_rule_lock); + return rule; +} + +static void +nrs_tbf_cli_init(struct nrs_tbf_head *head, + struct nrs_tbf_client 
*cli, + struct ptlrpc_request *req) +{ + struct nrs_tbf_rule *rule; + + memset(cli, 0, sizeof(*cli)); + cli->tc_in_heap = false; + head->th_ops->o_cli_init(cli, req); + INIT_LIST_HEAD(&cli->tc_list); + INIT_LIST_HEAD(&cli->tc_linkage); + spin_lock_init(&cli->tc_rule_lock); + atomic_set(&cli->tc_ref, 1); + rule = nrs_tbf_rule_match(head, cli); + nrs_tbf_cli_reset(head, rule, cli); +} + +static void +nrs_tbf_cli_fini(struct nrs_tbf_client *cli) +{ + LASSERT(list_empty(&cli->tc_list)); + LASSERT(!cli->tc_in_heap); + LASSERT(atomic_read(&cli->tc_ref) == 0); + spin_lock(&cli->tc_rule_lock); + nrs_tbf_cli_rule_put(cli); + spin_unlock(&cli->tc_rule_lock); + OBD_FREE_PTR(cli); +} + +static int +nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_rule *rule; + struct nrs_tbf_rule *tmp_rule; + struct nrs_tbf_rule *next_rule; + char *next_name = start->u.tc_start.ts_next_name; + int rc; + + rule = nrs_tbf_rule_find(head, start->tc_name); + if (rule) { + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + OBD_CPT_ALLOC_PTR(rule, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (rule == NULL) + return -ENOMEM; + + memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name)); + rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_nsecs = NSEC_PER_SEC; + do_div(rule->tr_nsecs, rule->tr_rpc_rate); + rule->tr_depth = tbf_depth; + atomic_set(&rule->tr_ref, 1); + INIT_LIST_HEAD(&rule->tr_cli_list); + INIT_LIST_HEAD(&rule->tr_nids); + INIT_LIST_HEAD(&rule->tr_linkage); + spin_lock_init(&rule->tr_rule_lock); + rule->tr_head = head; + + rc = head->th_ops->o_rule_init(policy, rule, start); + if (rc) { + OBD_FREE_PTR(rule); + return rc; + } + + /* Add as the newest rule */ + spin_lock(&head->th_rule_lock); + tmp_rule = nrs_tbf_rule_find_nolock(head, start->tc_name); + if (tmp_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(tmp_rule); + nrs_tbf_rule_put(rule); + return -EEXIST; + } + + if (next_name) { + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) { + spin_unlock(&head->th_rule_lock); + nrs_tbf_rule_put(rule); + return -ENOENT; + } + + list_add(&rule->tr_linkage, next_rule->tr_linkage.prev); + nrs_tbf_rule_put(next_rule); + } else { + /* Add on the top of the rule list */ + list_add(&rule->tr_linkage, &head->th_list); + } + spin_unlock(&head->th_rule_lock); + atomic_inc(&head->th_rule_sequence); + if (start->u.tc_start.ts_rule_flags & NTRS_DEFAULT) { + rule->tr_flags |= NTRS_DEFAULT; + LASSERT(head->th_rule == NULL); + head->th_rule = rule; + } + + CDEBUG(D_RPCTRACE, "TBF starts rule@%p rate %llu gen %llu\n", + rule, rule->tr_rpc_rate, rule->tr_generation); + + return 0; +} + +/** + * Change the rank of a rule in the rule list + * + * The matched rule will be moved to the position right before another + * given rule. 
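+ *
+ * The rule list lock is held for the whole operation; if \a name and
+ * \a next_name refer to the same rule, the list is left unchanged and 0 is
+ * returned.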
+ * + * \param[in] policy the policy instance + * \param[in] head the TBF policy instance + * \param[in] name the rule name to be moved + * \param[in] next_name the rule name before which the matched rule will be + * moved + * + */ +static int +nrs_tbf_rule_change_rank(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + char *next_name) +{ + struct nrs_tbf_rule *rule = NULL; + struct nrs_tbf_rule *next_rule = NULL; + int rc = 0; + + LASSERT(head != NULL); + + spin_lock(&head->th_rule_lock); + rule = nrs_tbf_rule_find_nolock(head, name); + if (!rule) + GOTO(out, rc = -ENOENT); + + if (strcmp(name, next_name) == 0) + GOTO(out_put, rc); + + next_rule = nrs_tbf_rule_find_nolock(head, next_name); + if (!next_rule) + GOTO(out_put, rc = -ENOENT); + + list_move(&rule->tr_linkage, next_rule->tr_linkage.prev); + nrs_tbf_rule_put(next_rule); +out_put: + nrs_tbf_rule_put(rule); +out: + spin_unlock(&head->th_rule_lock); + return rc; +} + +static int +nrs_tbf_rule_change_rate(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + char *name, + __u64 rate) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + rule = nrs_tbf_rule_find(head, name); + if (rule == NULL) + return -ENOENT; + + rule->tr_rpc_rate = rate; + rule->tr_nsecs = NSEC_PER_SEC; + do_div(rule->tr_nsecs, rule->tr_rpc_rate); + rule->tr_generation++; + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_rule_change(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *change) +{ + __u64 rate = change->u.tc_change.tc_rpc_rate; + char *next_name = change->u.tc_change.tc_next_name; + int rc; + + if (rate != 0) { + rc = nrs_tbf_rule_change_rate(policy, head, change->tc_name, + rate); + if (rc) + return rc; + } + + if (next_name) { + rc = nrs_tbf_rule_change_rank(policy, head, change->tc_name, + next_name); + if (rc) + return rc; + } + + return 0; +} + +static int +nrs_tbf_rule_stop(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *stop) +{ + struct nrs_tbf_rule *rule; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + if (strcmp(stop->tc_name, NRS_TBF_DEFAULT_RULE) == 0) + return -EPERM; + + rule = nrs_tbf_rule_find(head, stop->tc_name); + if (rule == NULL) + return -ENOENT; + + list_del_init(&rule->tr_linkage); + rule->tr_flags |= NTRS_STOPPING; + nrs_tbf_rule_put(rule); + nrs_tbf_rule_put(rule); + + return 0; +} + +static int +nrs_tbf_command(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head, + struct nrs_tbf_cmd *cmd) +{ + int rc; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_valid_type != head->th_type_flag) + return -EINVAL; + + spin_unlock(&policy->pol_nrs->nrs_lock); + rc = nrs_tbf_rule_start(policy, head, cmd); + spin_lock(&policy->pol_nrs->nrs_lock); + return rc; + case NRS_CTL_TBF_CHANGE_RULE: + rc = nrs_tbf_rule_change(policy, head, cmd); + return rc; + case NRS_CTL_TBF_STOP_RULE: + rc = nrs_tbf_rule_stop(policy, head, cmd); + /* Take it as a success, if not exists at all */ + return rc == -ENOENT ? 0 : rc; + default: + return -EFAULT; + } +} + +/** + * Binary heap predicate. 
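+ *
+ * Orders TBF clients by the time at which their next token becomes
+ * available (tc_check_time + tc_nsecs), so that the root of the binheap is
+ * always the client which may be serviced earliest.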
+ * + * \param[in] e1 the first binheap node to compare + * \param[in] e2 the second binheap node to compare + * + * \retval 0 e1 > e2 + * \retval 1 e1 < e2 + */ +static int +tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) +{ + struct nrs_tbf_client *cli1; + struct nrs_tbf_client *cli2; + + cli1 = container_of(e1, struct nrs_tbf_client, tc_node); + cli2 = container_of(e2, struct nrs_tbf_client, tc_node); + + if (cli1->tc_check_time + cli1->tc_nsecs < + cli2->tc_check_time + cli2->tc_nsecs) + return 1; + else if (cli1->tc_check_time + cli1->tc_nsecs > + cli2->tc_check_time + cli2->tc_nsecs) + return 0; + + if (cli1->tc_check_time < cli2->tc_check_time) + return 1; + else if (cli1->tc_check_time > cli2->tc_check_time) + return 0; + + /* Maybe need more comparasion, e.g. request number in the rules */ + return 1; +} + +/** + * TBF binary heap operations + */ +static struct cfs_binheap_ops nrs_tbf_heap_ops = { + .hop_enter = NULL, + .hop_exit = NULL, + .hop_compare = tbf_cli_compare, +}; + +static unsigned nrs_tbf_jobid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_jobid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_jobid, key) == 0); +} + +static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return cli->tc_jobid; +} + +static void *nrs_tbf_jobid_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + +static void nrs_tbf_jobid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_jobid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_jobid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = { + .hs_hash = nrs_tbf_jobid_hop_hash, + .hs_keycmp = nrs_tbf_jobid_hop_keycmp, + .hs_key = nrs_tbf_jobid_hop_key, + .hs_object = nrs_tbf_jobid_hop_object, + .hs_get = nrs_tbf_jobid_hop_get, + .hs_put = nrs_tbf_jobid_hop_put, + .hs_put_locked = nrs_tbf_jobid_hop_put, + .hs_exit = nrs_tbf_jobid_hop_exit, +}; + +#define NRS_TBF_JOBID_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \ + CFS_HASH_NO_ITEMREF | \ + CFS_HASH_DEPTH) + +static struct nrs_tbf_client * +nrs_tbf_jobid_hash_lookup(struct cfs_hash *hs, + struct cfs_hash_bd *bd, + const char *jobid) +{ + struct hlist_node *hnode; + struct nrs_tbf_client *cli; + + hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)jobid); + if (hnode == NULL) + return NULL; + + cli = container_of0(hnode, struct nrs_tbf_client, tc_hnode); + if (!list_empty(&cli->tc_lru)) + list_del_init(&cli->tc_lru); + return cli; +} + +#define NRS_TBF_JOBID_NULL "" + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + const char *jobid; + struct nrs_tbf_client *cli; + struct cfs_hash *hs = head->th_cli_hash; + struct 
cfs_hash_bd bd; + + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + cli = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_jobid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *jobid; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + jobid = cli->tc_jobid; + cfs_hash_bd_get_and_lock(hs, (void *)jobid, &bd, 1); + ret = nrs_tbf_jobid_hash_lookup(hs, &bd, jobid); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void +nrs_tbf_jobid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + struct list_head zombies; + + INIT_LIST_HEAD(&zombies); + cfs_hash_bd_get(hs, &cli->tc_jobid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /* + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> + (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of0(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_jobid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char *jobid = lustre_msg_get_jobid(req->rq_reqmsg); + + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + LASSERT(strlen(jobid) < LUSTRE_JOBID_SIZE); + INIT_LIST_HEAD(&cli->tc_lru); + memcpy(cli->tc_jobid, jobid, strlen(jobid)); +} + +static int nrs_tbf_jobid_hash_order(void) +{ + int bits; + + for (bits = 1; (1 << bits) < tbf_jobid_cache_size; ++bits) + ; + + return bits; +} + +#define NRS_TBF_JOBID_BKT_BITS 10 + +static int +nrs_tbf_jobid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_JOBID_BKT_BITS) + bits = NRS_TBF_JOBID_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, + bits, + NRS_TBF_JOBID_BKT_BITS, + sizeof(*bkt), + 0, + 0, + &nrs_tbf_jobid_hash_ops, + NRS_TBF_JOBID_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_jobids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_jobids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + 
head->th_cli_hash = NULL; + } + + return rc; +} + +/** + * Frees jobid of \a list. + * + */ +static void +nrs_tbf_jobid_list_free(struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid, *n; + + list_for_each_entry_safe(jobid, n, jobid_list, tj_linkage) { + OBD_FREE(jobid->tj_id, strlen(jobid->tj_id) + 1); + list_del(&jobid->tj_linkage); + OBD_FREE(jobid, sizeof(struct nrs_tbf_jobid)); + } +} + +static int +nrs_tbf_jobid_list_add(struct cfs_lstr *id, struct list_head *jobid_list) +{ + struct nrs_tbf_jobid *jobid; + char *ptr; + + OBD_ALLOC(jobid, sizeof(struct nrs_tbf_jobid)); + if (jobid == NULL) + return -ENOMEM; + + OBD_ALLOC(jobid->tj_id, id->ls_len + 1); + if (jobid->tj_id == NULL) { + OBD_FREE(jobid, sizeof(struct nrs_tbf_jobid)); + return -ENOMEM; + } + + memcpy(jobid->tj_id, id->ls_str, id->ls_len); + ptr = lprocfs_strnstr(id->ls_str, "*", id->ls_len); + if (ptr == NULL) + jobid->tj_match_flag = NRS_TBF_MATCH_FULL; + else + jobid->tj_match_flag = NRS_TBF_MATCH_WILDCARD; + + list_add_tail(&jobid->tj_linkage, jobid_list); + return 0; +} + +static bool +cfs_match_wildcard(const char *pattern, const char *content) +{ + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && *content == '\0') + return false; + + while (*pattern == *content) { + pattern++; + content++; + if (*pattern == '\0' && *content == '\0') + return true; + + if (*pattern == '*' && *(pattern + 1) != '\0' && + *content == '\0') + return false; + } + + if (*pattern == '*') + return (cfs_match_wildcard(pattern + 1, content) || + cfs_match_wildcard(pattern, content + 1)); + + return false; +} + +static inline bool +nrs_tbf_jobid_match(const struct nrs_tbf_jobid *jobid, const char *id) +{ + if (jobid->tj_match_flag == NRS_TBF_MATCH_FULL) + return strcmp(jobid->tj_id, id) == 0; + + if (jobid->tj_match_flag == NRS_TBF_MATCH_WILDCARD) + return cfs_match_wildcard(jobid->tj_id, id); + + return false; +} + +static int +nrs_tbf_jobid_list_match(struct list_head *jobid_list, char *id) +{ + struct nrs_tbf_jobid *jobid; + + list_for_each_entry(jobid, jobid_list, tj_linkage) { + if (nrs_tbf_jobid_match(jobid, id)) + return 1; + } + return 0; +} + +static int +nrs_tbf_jobid_list_parse(char *str, int len, struct list_head *jobid_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(jobid_list); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_jobid_list_add(&res, jobid_list); + if (rc) + break; + } + if (rc) + nrs_tbf_jobid_list_free(jobid_list); + RETURN(rc); +} + +static void nrs_tbf_jobid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_jobids)) + nrs_tbf_jobid_list_free(&cmd->u.tc_start.ts_jobids); + if (cmd->u.tc_start.ts_jobids_str) + OBD_FREE(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str) + 1); +} + +static int nrs_tbf_check_id_value(struct cfs_lstr *src, char *key) +{ + struct cfs_lstr res; + int keylen = strlen(key); + int rc; + + rc = cfs_gettok(src, '=', &res); + if (rc == 0 || res.ls_len != keylen || + strncmp(res.ls_str, key, keylen) != 0 || + src->ls_len <= 2 || src->ls_str[0] != '{' || + src->ls_str[src->ls_len - 1] != '}') + return -EINVAL; + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + return 0; +} + +static int nrs_tbf_jobid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = 
strlen(id); + rc = nrs_tbf_check_id_value(&src, "jobid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_jobids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_jobids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_jobids_str, src.ls_str, src.ls_len); + + /* parse jobid list */ + rc = nrs_tbf_jobid_list_parse(cmd->u.tc_start.ts_jobids_str, + strlen(cmd->u.tc_start.ts_jobids_str), + &cmd->u.tc_start.ts_jobids); + if (rc) + nrs_tbf_jobid_cmd_fini(cmd); + + return rc; +} + +static int nrs_tbf_jobid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_jobids_str); + OBD_ALLOC(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + if (rule->tr_jobids_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_jobids_str, + start->u.tc_start.ts_jobids_str, + strlen(start->u.tc_start.ts_jobids_str)); + + INIT_LIST_HEAD(&rule->tr_jobids); + if (!list_empty(&start->u.tc_start.ts_jobids)) { + rc = nrs_tbf_jobid_list_parse(rule->tr_jobids_str, + strlen(rule->tr_jobids_str), + &rule->tr_jobids); + if (rc) + CERROR("jobids {%s} illegal\n", rule->tr_jobids_str); + } + if (rc) + OBD_FREE(rule->tr_jobids_str, + strlen(start->u.tc_start.ts_jobids_str) + 1); + return rc; +} + +static int +nrs_tbf_jobid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_jobids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_jobid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_jobid_list_match(&rule->tr_jobids, cli->tc_jobid); +} + +static void nrs_tbf_jobid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_jobids)) + nrs_tbf_jobid_list_free(&rule->tr_jobids); + LASSERT(rule->tr_jobids_str != NULL); + OBD_FREE(rule->tr_jobids_str, strlen(rule->tr_jobids_str) + 1); +} + +static struct nrs_tbf_ops nrs_tbf_jobid_ops = { + .o_name = NRS_TBF_TYPE_JOBID, + .o_startup = nrs_tbf_jobid_startup, + .o_cli_find = nrs_tbf_jobid_cli_find, + .o_cli_findadd = nrs_tbf_jobid_cli_findadd, + .o_cli_put = nrs_tbf_jobid_cli_put, + .o_cli_init = nrs_tbf_jobid_cli_init, + .o_rule_init = nrs_tbf_jobid_rule_init, + .o_rule_dump = nrs_tbf_jobid_rule_dump, + .o_rule_match = nrs_tbf_jobid_rule_match, + .o_rule_fini = nrs_tbf_jobid_rule_fini, +}; + +/** + * libcfs_hash operations for nrs_tbf_net::cn_cli_hash + * + * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash + * nrs_tbf_client objects. 
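+ *
+ * Unlike the jobid hash, which parks idle clients on a per-bucket LRU,
+ * entries here are released as soon as their last reference is dropped;
+ * see nrs_tbf_nid_cli_put().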
+ */ +#define NRS_TBF_NID_BKT_BITS 8 +#define NRS_TBF_NID_BITS 16 + +static unsigned nrs_tbf_nid_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static int nrs_tbf_nid_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + lnet_nid_t *nid = (lnet_nid_t *)key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *nid == cli->tc_nid; +} + +static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_nid; +} + +static void *nrs_tbf_nid_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + +static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_nid_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with NID %s, with %d refs\n", + libcfs_nid2str(cli->tc_nid), atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { + .hs_hash = nrs_tbf_nid_hop_hash, + .hs_keycmp = nrs_tbf_nid_hop_keycmp, + .hs_key = nrs_tbf_nid_hop_key, + .hs_object = nrs_tbf_nid_hop_object, + .hs_get = nrs_tbf_nid_hop_get, + .hs_put = nrs_tbf_nid_hop_put, + .hs_put_locked = nrs_tbf_nid_hop_put, + .hs_exit = nrs_tbf_nid_hop_exit, +}; + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + return cfs_hash_lookup(head->th_cli_hash, &req->rq_peer.nid); +} + +static struct nrs_tbf_client * +nrs_tbf_nid_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_nid, + &cli->tc_hnode); +} + +static void +nrs_tbf_nid_cli_put(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + cfs_hash_put(head->th_cli_hash, &cli->tc_hnode); +} + +static int +nrs_tbf_nid_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_nid_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_nids_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_nids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static void +nrs_tbf_nid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_nid = req->rq_peer.nid; +} + +static int nrs_tbf_nid_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + 
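+	/*
+	 * Keep a private copy of the NID list string: it is shown by
+	 * nrs_tbf_nid_rule_dump() and released in nrs_tbf_nid_rule_fini().
+	 */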
LASSERT(start->u.tc_start.ts_nids_str); + OBD_ALLOC(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + if (rule->tr_nids_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_nids_str, + start->u.tc_start.ts_nids_str, + strlen(start->u.tc_start.ts_nids_str)); + + INIT_LIST_HEAD(&rule->tr_nids); + if (!list_empty(&start->u.tc_start.ts_nids)) { + if (cfs_parse_nidlist(rule->tr_nids_str, + strlen(rule->tr_nids_str), + &rule->tr_nids) <= 0) { + CERROR("nids {%s} illegal\n", + rule->tr_nids_str); + OBD_FREE(rule->tr_nids_str, + strlen(start->u.tc_start.ts_nids_str) + 1); + return -EINVAL; + } + } + return 0; +} + +static int +nrs_tbf_nid_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_nids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_nid_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return cfs_match_nid(cli->tc_nid, &rule->tr_nids); +} + +static void nrs_tbf_nid_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_nids)) + cfs_free_nidlist(&rule->tr_nids); + LASSERT(rule->tr_nids_str != NULL); + OBD_FREE(rule->tr_nids_str, strlen(rule->tr_nids_str) + 1); +} + +static void nrs_tbf_nid_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_nids)) + cfs_free_nidlist(&cmd->u.tc_start.ts_nids); + if (cmd->u.tc_start.ts_nids_str) + OBD_FREE(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str) + 1); +} + +static int nrs_tbf_nid_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "nid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_nids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_nids_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_nids_str, src.ls_str, src.ls_len); + + /* parse NID list */ + if (cfs_parse_nidlist(cmd->u.tc_start.ts_nids_str, + strlen(cmd->u.tc_start.ts_nids_str), + &cmd->u.tc_start.ts_nids) <= 0) { + nrs_tbf_nid_cmd_fini(cmd); + return -EINVAL; + } + + return 0; +} + +static struct nrs_tbf_ops nrs_tbf_nid_ops = { + .o_name = NRS_TBF_TYPE_NID, + .o_startup = nrs_tbf_nid_startup, + .o_cli_find = nrs_tbf_nid_cli_find, + .o_cli_findadd = nrs_tbf_nid_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_nid_cli_init, + .o_rule_init = nrs_tbf_nid_rule_init, + .o_rule_dump = nrs_tbf_nid_rule_dump, + .o_rule_match = nrs_tbf_nid_rule_match, + .o_rule_fini = nrs_tbf_nid_rule_fini, +}; + +static unsigned nrs_tbf_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static int nrs_tbf_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return (strcmp(cli->tc_key, key) == 0); +} + +static void *nrs_tbf_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return cli->tc_key; +} + +static void *nrs_tbf_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + +static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct 
nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_hash_ops = { + .hs_hash = nrs_tbf_hop_hash, + .hs_keycmp = nrs_tbf_hop_keycmp, + .hs_key = nrs_tbf_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_hop_get, + .hs_put = nrs_tbf_hop_put, + .hs_put_locked = nrs_tbf_hop_put, + .hs_exit = nrs_tbf_hop_exit, +}; + +#define NRS_TBF_GENERIC_BKT_BITS 10 +#define NRS_TBF_GENERIC_HASH_FLAGS (CFS_HASH_SPIN_BKTLOCK | \ + CFS_HASH_NO_ITEMREF | \ + CFS_HASH_DEPTH) + +static int +nrs_tbf_startup(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + struct nrs_tbf_bucket *bkt; + int bits; + int i; + int rc; + struct cfs_hash_bd bd; + + bits = nrs_tbf_jobid_hash_order(); + if (bits < NRS_TBF_GENERIC_BKT_BITS) + bits = NRS_TBF_GENERIC_BKT_BITS; + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + bits, bits, + NRS_TBF_GENERIC_BKT_BITS, + sizeof(*bkt), 0, 0, + &nrs_tbf_hash_ops, + NRS_TBF_GENERIC_HASH_FLAGS); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + cfs_hash_for_each_bucket(head->th_cli_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(head->th_cli_hash, &bd); + INIT_LIST_HEAD(&bkt->ntb_lru); + } + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_conds_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_conds); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) + cfs_hash_putref(head->th_cli_hash); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd, + const char *key) +{ + struct hlist_node *hnode; + struct nrs_tbf_client *cli; + + hnode = cfs_hash_bd_lookup_locked(hs, bd, (void *)key); + if (hnode == NULL) + return NULL; + + cli = container_of0(hnode, struct nrs_tbf_client, tc_hnode); + if (!list_empty(&cli->tc_lru)) + list_del_init(&cli->tc_lru); + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req) +{ + struct nrs_tbf_client *cli; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + char keystr[NRS_TBF_KEY_LEN] = { '\0' }; + const char *jobid; + __u32 opc; + + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + opc = lustre_msg_get_opc(req->rq_reqmsg); + snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, + libcfs_nid2str(req->rq_peer.nid), opc); + LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); + cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1); + cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr); + cfs_hash_bd_unlock(hs, &bd, 1); + + return cli; +} + +static struct nrs_tbf_client * +nrs_tbf_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + const char *key; + struct nrs_tbf_client *ret; + struct cfs_hash *hs = head->th_cli_hash; + struct cfs_hash_bd bd; + + key = cli->tc_key; + cfs_hash_bd_get_and_lock(hs, (void *)key, &bd, 1); + ret = nrs_tbf_cli_hash_lookup(hs, &bd, key); + if (ret == NULL) { + cfs_hash_bd_add_locked(hs, &bd, &cli->tc_hnode); + ret = cli; + } + cfs_hash_bd_unlock(hs, &bd, 1); + + return ret; +} + +static void 
+nrs_tbf_cli_put(struct nrs_tbf_head *head, struct nrs_tbf_client *cli) +{ + struct cfs_hash_bd bd; + struct cfs_hash *hs = head->th_cli_hash; + struct nrs_tbf_bucket *bkt; + int hw; + struct list_head zombies; + + INIT_LIST_HEAD(&zombies); + cfs_hash_bd_get(hs, &cli->tc_key, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + if (!cfs_hash_bd_dec_and_lock(hs, &bd, &cli->tc_ref)) + return; + LASSERT(list_empty(&cli->tc_lru)); + list_add_tail(&cli->tc_lru, &bkt->ntb_lru); + + /** + * Check and purge the LRU, there is at least one client in the LRU. + */ + hw = tbf_jobid_cache_size >> (hs->hs_cur_bits - hs->hs_bkt_bits); + while (cfs_hash_bd_count_get(&bd) > hw) { + if (unlikely(list_empty(&bkt->ntb_lru))) + break; + cli = list_entry(bkt->ntb_lru.next, + struct nrs_tbf_client, + tc_lru); + LASSERT(atomic_read(&cli->tc_ref) == 0); + cfs_hash_bd_del_locked(hs, &bd, &cli->tc_hnode); + list_move(&cli->tc_lru, &zombies); + } + cfs_hash_bd_unlock(head->th_cli_hash, &bd, 1); + + while (!list_empty(&zombies)) { + cli = container_of0(zombies.next, + struct nrs_tbf_client, tc_lru); + list_del_init(&cli->tc_lru); + nrs_tbf_cli_fini(cli); + } +} + +static void +nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + char keystr[NRS_TBF_KEY_LEN]; + const char *jobid; + __u32 opc; + + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + opc = lustre_msg_get_opc(req->rq_reqmsg); + snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, + libcfs_nid2str(req->rq_peer.nid), opc); + + LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); + INIT_LIST_HEAD(&cli->tc_lru); + memcpy(cli->tc_key, keystr, strlen(keystr)); + memcpy(cli->tc_jobid, jobid, strlen(jobid)); + cli->tc_nid = req->rq_peer.nid; + cli->tc_opcode = opc; +} + +static void +nrs_tbf_expression_free(struct nrs_tbf_expression *expr) +{ + LASSERT(expr->te_field >= NRS_TBF_FIELD_NID && + expr->te_field < NRS_TBF_FIELD_MAX); + switch (expr->te_field) { + case NRS_TBF_FIELD_NID: + cfs_free_nidlist(&expr->te_cond); + break; + case NRS_TBF_FIELD_JOBID: + nrs_tbf_jobid_list_free(&expr->te_cond); + break; + case NRS_TBF_FIELD_OPCODE: + CFS_FREE_BITMAP(expr->te_opcodes); + break; + default: + LBUG(); + } + OBD_FREE_PTR(expr); +} + +static void +nrs_tbf_conjunction_free(struct nrs_tbf_conjunction *conjunction) +{ + struct nrs_tbf_expression *expression; + struct nrs_tbf_expression *n; + + LASSERT(list_empty(&conjunction->tc_linkage)); + list_for_each_entry_safe(expression, n, + &conjunction->tc_expressions, + te_linkage) { + list_del_init(&expression->te_linkage); + nrs_tbf_expression_free(expression); + } + OBD_FREE_PTR(conjunction); +} + +static void +nrs_tbf_conds_free(struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct nrs_tbf_conjunction *n; + + list_for_each_entry_safe(conjunction, n, cond_list, tc_linkage) { + list_del_init(&conjunction->tc_linkage); + nrs_tbf_conjunction_free(conjunction); + } +} + +static void +nrs_tbf_generic_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (!list_empty(&cmd->u.tc_start.ts_conds)) + nrs_tbf_conds_free(&cmd->u.tc_start.ts_conds); + if (cmd->u.tc_start.ts_conds_str) + OBD_FREE(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str) + 1); +} + +#define NRS_TBF_DISJUNCTION_DELIM (',') +#define NRS_TBF_CONJUNCTION_DELIM ('&') +#define NRS_TBF_EXPRESSION_DELIM ('=') + +static inline bool +nrs_tbf_check_field(struct cfs_lstr *field, char *str) +{ + int len = strlen(str); + + return (field->ls_len == len && + 
strncmp(field->ls_str, str, len) == 0); +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr); + +static int +nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_expression *expr; + struct cfs_lstr field; + int rc = 0; + + OBD_ALLOC(expr, sizeof(struct nrs_tbf_expression)); + if (expr == NULL) + return -ENOMEM; + + rc = cfs_gettok(src, NRS_TBF_EXPRESSION_DELIM, &field); + if (rc == 0 || src->ls_len <= 2 || src->ls_str[0] != '{' || + src->ls_str[src->ls_len - 1] != '}') + GOTO(out, rc = -EINVAL); + + /* Skip '{' and '}' */ + src->ls_str++; + src->ls_len -= 2; + + if (nrs_tbf_check_field(&field, "nid")) { + if (cfs_parse_nidlist(src->ls_str, + src->ls_len, + &expr->te_cond) <= 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_NID; + } else if (nrs_tbf_check_field(&field, "jobid")) { + if (nrs_tbf_jobid_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_JOBID; + } else if (nrs_tbf_check_field(&field, "opcode")) { + if (nrs_tbf_opcode_list_parse(src->ls_str, + src->ls_len, + &expr->te_opcodes) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_OPCODE; + } else + GOTO(out, rc = -EINVAL); + + list_add_tail(&expr->te_linkage, cond_list); + return 0; +out: + OBD_FREE_PTR(expr); + return rc; +} + +static int +nrs_tbf_conjunction_parse(struct cfs_lstr *src, struct list_head *cond_list) +{ + struct nrs_tbf_conjunction *conjunction; + struct cfs_lstr expr; + int rc = 0; + + OBD_ALLOC(conjunction, sizeof(struct nrs_tbf_conjunction)); + if (conjunction == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&conjunction->tc_expressions); + list_add_tail(&conjunction->tc_linkage, cond_list); + + while (src->ls_str) { + rc = cfs_gettok(src, NRS_TBF_CONJUNCTION_DELIM, &expr); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_expression_parse(&expr, + &conjunction->tc_expressions); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_conds_parse(char *str, int len, struct list_head *cond_list) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(cond_list); + while (src.ls_str) { + rc = cfs_gettok(&src, NRS_TBF_DISJUNCTION_DELIM, &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_conjunction_parse(&res, cond_list); + if (rc) + break; + } + return rc; +} + +static int +nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) +{ + int rc; + + OBD_ALLOC(cmd->u.tc_start.ts_conds_str, strlen(id) + 1); + if (cmd->u.tc_start.ts_conds_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_conds_str, id, strlen(id)); + + /* Parse hybird NID and JOBID conditions */ + rc = nrs_tbf_conds_parse(cmd->u.tc_start.ts_conds_str, + strlen(cmd->u.tc_start.ts_conds_str), + &cmd->u.tc_start.ts_conds); + if (rc) + nrs_tbf_generic_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_expression_match(struct nrs_tbf_expression *expr, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + switch (expr->te_field) { + case NRS_TBF_FIELD_NID: + return cfs_match_nid(cli->tc_nid, &expr->te_cond); + case NRS_TBF_FIELD_JOBID: + return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); + case NRS_TBF_FIELD_OPCODE: + return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); + default: + return 0; + } +} + +static int +nrs_tbf_conjunction_match(struct nrs_tbf_conjunction *conjunction, + struct nrs_tbf_rule *rule, + struct nrs_tbf_client 
*cli) +{ + struct nrs_tbf_expression *expr; + int matched; + + list_for_each_entry(expr, &conjunction->tc_expressions, te_linkage) { + matched = nrs_tbf_expression_match(expr, rule, cli); + if (!matched) + return 0; + } + + return 1; +} + +static int +nrs_tbf_cond_match(struct nrs_tbf_rule *rule, struct nrs_tbf_client *cli) +{ + struct nrs_tbf_conjunction *conjunction; + int matched; + + list_for_each_entry(conjunction, &rule->tr_conds, tc_linkage) { + matched = nrs_tbf_conjunction_match(conjunction, rule, cli); + if (matched) + return 1; + } + + return 0; +} + +static void +nrs_tbf_generic_rule_fini(struct nrs_tbf_rule *rule) +{ + if (!list_empty(&rule->tr_conds)) + nrs_tbf_conds_free(&rule->tr_conds); + LASSERT(rule->tr_conds_str != NULL); + OBD_FREE(rule->tr_conds_str, strlen(rule->tr_conds_str) + 1); +} + +static int +nrs_tbf_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_conds_str); + OBD_ALLOC(rule->tr_conds_str, + strlen(start->u.tc_start.ts_conds_str) + 1); + if (rule->tr_conds_str == NULL) + return -ENOMEM; + + memcpy(rule->tr_conds_str, + start->u.tc_start.ts_conds_str, + strlen(start->u.tc_start.ts_conds_str)); + + INIT_LIST_HEAD(&rule->tr_conds); + if (!list_empty(&start->u.tc_start.ts_conds)) { + rc = nrs_tbf_conds_parse(rule->tr_conds_str, + strlen(rule->tr_conds_str), + &rule->tr_conds); + } + if (rc) + nrs_tbf_generic_rule_fini(rule); + + return rc; +} + +static int +nrs_tbf_generic_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s %s %llu, ref %d\n", rule->tr_name, + rule->tr_conds_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static int +nrs_tbf_generic_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_cond_match(rule, cli); +} + +static struct nrs_tbf_ops nrs_tbf_generic_ops = { + .o_name = NRS_TBF_TYPE_GENERIC, + .o_startup = nrs_tbf_startup, + .o_cli_find = nrs_tbf_cli_find, + .o_cli_findadd = nrs_tbf_cli_findadd, + .o_cli_put = nrs_tbf_cli_put, + .o_cli_init = nrs_tbf_generic_cli_init, + .o_rule_init = nrs_tbf_rule_init, + .o_rule_dump = nrs_tbf_generic_rule_dump, + .o_rule_match = nrs_tbf_generic_rule_match, + .o_rule_fini = nrs_tbf_generic_rule_fini, +}; + +static void nrs_tbf_opcode_rule_fini(struct nrs_tbf_rule *rule) +{ + if (rule->tr_opcodes != NULL) + CFS_FREE_BITMAP(rule->tr_opcodes); + + LASSERT(rule->tr_opcodes_str != NULL); + OBD_FREE(rule->tr_opcodes_str, strlen(rule->tr_opcodes_str) + 1); +} + +static unsigned nrs_tbf_opcode_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(__u32), mask); +} + +static int nrs_tbf_opcode_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const __u32 *opc = key; + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return *opc == cli->tc_opcode; +} + +static void *nrs_tbf_opcode_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + return &cli->tc_opcode; +} + +static void *nrs_tbf_opcode_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + +static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_put(struct cfs_hash 
*hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void nrs_tbf_opcode_hop_exit(struct cfs_hash *hs, + struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERTF(atomic_read(&cli->tc_ref) == 0, + "Busy TBF object from client with opcode %s, with %d refs\n", + ll_opcode2str(cli->tc_opcode), + atomic_read(&cli->tc_ref)); + + nrs_tbf_cli_fini(cli); +} +static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { + .hs_hash = nrs_tbf_opcode_hop_hash, + .hs_keycmp = nrs_tbf_opcode_hop_keycmp, + .hs_key = nrs_tbf_opcode_hop_key, + .hs_object = nrs_tbf_opcode_hop_object, + .hs_get = nrs_tbf_opcode_hop_get, + .hs_put = nrs_tbf_opcode_hop_put, + .hs_put_locked = nrs_tbf_opcode_hop_put, + .hs_exit = nrs_tbf_opcode_hop_exit, +}; + +static int +nrs_tbf_opcode_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start = { 0 }; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_opcode_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + start.u.tc_start.ts_opcodes = NULL; + start.u.tc_start.ts_opcodes_str = "*"; + + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + rc = nrs_tbf_rule_start(policy, head, &start); + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + __u32 opc; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + return cfs_hash_lookup(head->th_cli_hash, &opc); +} + +static struct nrs_tbf_client * +nrs_tbf_opcode_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_opcode, + &cli->tc_hnode); +} + +static void +nrs_tbf_opcode_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + cli->tc_opcode = lustre_msg_get_opc(req->rq_reqmsg); +} + +#define MAX_OPCODE_LEN 32 +static int +nrs_tbf_opcode_set_bit(const struct cfs_lstr *id, struct cfs_bitmap *opcodes) +{ + int op = 0; + char opcode_str[MAX_OPCODE_LEN]; + + if (id->ls_len + 1 > MAX_OPCODE_LEN) + return -EINVAL; + + memcpy(opcode_str, id->ls_str, id->ls_len); + opcode_str[id->ls_len] = '\0'; + + op = ll_str2opcode(opcode_str); + if (op < 0) + return -EINVAL; + + cfs_bitmap_set(opcodes, op); + return 0; +} + +static int +nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr) +{ + struct cfs_bitmap *opcodes; + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + ENTRY; + + opcodes = CFS_ALLOCATE_BITMAP(LUSTRE_MAX_OPCODES); + if (opcodes == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + rc = -EINVAL; + break; + } + rc = nrs_tbf_opcode_set_bit(&res, opcodes); + if (rc) + break; + } + + if (rc == 0) + *bitmaptr = opcodes; + else + CFS_FREE_BITMAP(opcodes); + + RETURN(rc); +} + +static void nrs_tbf_opcode_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->u.tc_start.ts_opcodes) + CFS_FREE_BITMAP(cmd->u.tc_start.ts_opcodes); + + if (cmd->u.tc_start.ts_opcodes_str) + OBD_FREE(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str) + 1); + +} + +static int 
nrs_tbf_opcode_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + + src.ls_str = id; + src.ls_len = strlen(id); + rc = nrs_tbf_check_id_value(&src, "opcode"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_opcodes_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_opcodes_str == NULL) + return -ENOMEM; + + memcpy(cmd->u.tc_start.ts_opcodes_str, src.ls_str, src.ls_len); + + /* parse opcode list */ + rc = nrs_tbf_opcode_list_parse(cmd->u.tc_start.ts_opcodes_str, + strlen(cmd->u.tc_start.ts_opcodes_str), + &cmd->u.tc_start.ts_opcodes); + if (rc) + nrs_tbf_opcode_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_opcode_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + if (rule->tr_opcodes == NULL) + return 0; + + return cfs_bitmap_check(rule->tr_opcodes, cli->tc_opcode); +} + +static int nrs_tbf_opcode_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + int rc = 0; + + LASSERT(start->u.tc_start.ts_opcodes_str != NULL); + OBD_ALLOC(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + if (rule->tr_opcodes_str == NULL) + return -ENOMEM; + + strncpy(rule->tr_opcodes_str, start->u.tc_start.ts_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + /* Default rule '*' */ + if (start->u.tc_start.ts_opcodes == NULL) + return 0; + + rc = nrs_tbf_opcode_list_parse(rule->tr_opcodes_str, + strlen(rule->tr_opcodes_str), + &rule->tr_opcodes); + if (rc) + OBD_FREE(rule->tr_opcodes_str, + strlen(start->u.tc_start.ts_opcodes_str) + 1); + + return rc; +} + +static int +nrs_tbf_opcode_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_opcodes_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + + +struct nrs_tbf_ops nrs_tbf_opcode_ops = { + .o_name = NRS_TBF_TYPE_OPCODE, + .o_startup = nrs_tbf_opcode_startup, + .o_cli_find = nrs_tbf_opcode_cli_find, + .o_cli_findadd = nrs_tbf_opcode_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_opcode_cli_init, + .o_rule_init = nrs_tbf_opcode_rule_init, + .o_rule_dump = nrs_tbf_opcode_rule_dump, + .o_rule_match = nrs_tbf_opcode_rule_match, + .o_rule_fini = nrs_tbf_opcode_rule_fini, +}; + +static struct nrs_tbf_type nrs_tbf_types[] = { + { + .ntt_name = NRS_TBF_TYPE_JOBID, + .ntt_flag = NRS_TBF_FLAG_JOBID, + .ntt_ops = &nrs_tbf_jobid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_NID, + .ntt_flag = NRS_TBF_FLAG_NID, + .ntt_ops = &nrs_tbf_nid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_OPCODE, + .ntt_flag = NRS_TBF_FLAG_OPCODE, + .ntt_ops = &nrs_tbf_opcode_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GENERIC, + .ntt_flag = NRS_TBF_FLAG_GENERIC, + .ntt_ops = &nrs_tbf_generic_ops, + }, +}; + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. 
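+ *
+ * The TBF variant is selected by matching \a arg against the names in
+ * nrs_tbf_types[] (the generic type is used when no argument is given).
+ * The selected type's o_startup() handler creates the client hash and
+ * installs a catch-all "default" rule running at the module parameter
+ * tbf_rate.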
+ * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_tbf_start(struct ptlrpc_nrs_policy *policy, char *arg) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_ops *ops; + __u32 type; + char *name; + int found = 0; + int i; + int rc = 0; + + if (arg == NULL) + name = NRS_TBF_TYPE_GENERIC; + else if (strlen(arg) < NRS_TBF_TYPE_MAX_LEN) + name = arg; + else + GOTO(out, rc = -EINVAL); + + for (i = 0; i < ARRAY_SIZE(nrs_tbf_types); i++) { + if (strcmp(name, nrs_tbf_types[i].ntt_name) == 0) { + ops = nrs_tbf_types[i].ntt_ops; + type = nrs_tbf_types[i].ntt_flag; + found = 1; + break; + } + } + if (found == 0) + GOTO(out, rc = -ENOTSUPP); + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(head->th_type, name, strlen(name)); + head->th_type[strlen(name)] = '\0'; + head->th_ops = ops; + head->th_type_flag = type; + + head->th_binheap = cfs_binheap_create(&nrs_tbf_heap_ops, + CBH_FLAG_ATOMIC_GROW, 4096, NULL, + nrs_pol2cptab(policy), + nrs_pol2cptid(policy)); + if (head->th_binheap == NULL) + GOTO(out_free_head, rc = -ENOMEM); + + atomic_set(&head->th_rule_sequence, 0); + spin_lock_init(&head->th_rule_lock); + INIT_LIST_HEAD(&head->th_list); + hrtimer_init(&head->th_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + head->th_timer.function = nrs_tbf_timer_cb; + rc = head->th_ops->o_startup(policy, head); + if (rc) + GOTO(out_free_heap, rc); + + policy->pol_private = head; + return 0; +out_free_heap: + cfs_binheap_destroy(head->th_binheap); +out_free_head: + OBD_FREE_PTR(head); +out: + return rc; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_tbf_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs *nrs = policy->pol_nrs; + struct nrs_tbf_rule *rule, *n; + + LASSERT(head != NULL); + LASSERT(head->th_cli_hash != NULL); + hrtimer_cancel(&head->th_timer); + /* Should cleanup hash first before free rules */ + cfs_hash_putref(head->th_cli_hash); + list_for_each_entry_safe(rule, n, &head->th_list, tr_linkage) { + list_del_init(&rule->tr_linkage); + nrs_tbf_rule_put(rule); + } + LASSERT(list_empty(&head->th_list)); + LASSERT(head->th_binheap != NULL); + LASSERT(cfs_binheap_is_empty(head->th_binheap)); + cfs_binheap_destroy(head->th_binheap); + OBD_FREE_PTR(head); + nrs->nrs_throttling = 0; + wake_up(&policy->pol_nrs->nrs_svcpt->scp_waitq); +} + +/** + * Performs a policy-specific ctl function on TBF policy instances; similar + * to ioctl. + * + * \param[in] policy the policy instance + * \param[in] opc the opcode + * \param[in,out] arg used for passing parameters and information + * + * \pre assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * \post assert_spin_locked(&policy->pol_nrs->->nrs_lock) + * + * \retval 0 operation carried out successfully + * \retval -ve error + */ +static int nrs_tbf_ctl(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, + void *arg) +{ + int rc = 0; + ENTRY; + + assert_spin_locked(&policy->pol_nrs->nrs_lock); + + switch ((enum nrs_ctl_tbf)opc) { + default: + RETURN(-EINVAL); + + /** + * Read RPC rate size of a policy instance. 
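+	 * The rules of this policy instance are dumped, newest first, into
+	 * the seq_file passed in \a arg via nrs_tbf_rule_dump_all().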
+ */ + case NRS_CTL_TBF_RD_RULE: { + struct nrs_tbf_head *head = policy->pol_private; + struct seq_file *m = (struct seq_file *) arg; + struct ptlrpc_service_part *svcpt; + + svcpt = policy->pol_nrs->nrs_svcpt; + seq_printf(m, "CPT %d:\n", svcpt->scp_cpt); + + rc = nrs_tbf_rule_dump_all(head, m); + } + break; + + /** + * Write RPC rate of a policy instance. + */ + case NRS_CTL_TBF_WR_RULE: { + struct nrs_tbf_head *head = policy->pol_private; + struct nrs_tbf_cmd *cmd; + + cmd = (struct nrs_tbf_cmd *)arg; + rc = nrs_tbf_command(policy, + head, + cmd); + } + break; + /** + * Read the TBF policy type of a policy instance. + */ + case NRS_CTL_TBF_RD_TYPE_FLAG: { + struct nrs_tbf_head *head = policy->pol_private; + + *(__u32 *)arg = head->th_type_flag; + } + break; + } + + RETURN(rc); +} + +/** + * Is called for obtaining a TBF policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * + * \see nrs_resource_get_safe() + */ +static int nrs_tbf_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + struct nrs_tbf_client *tmp; + struct ptlrpc_request *req; + + if (parent == NULL) { + *resp = &((struct nrs_tbf_head *)policy->pol_private)->th_res; + return 0; + } + + head = container_of(parent, struct nrs_tbf_head, th_res); + req = container_of(nrq, struct ptlrpc_request, rq_nrq); + cli = head->th_ops->o_cli_find(head, req); + if (cli != NULL) { + spin_lock(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + LASSERT(cli->tc_rule); + if (cli->tc_rule_sequence != + atomic_read(&head->th_rule_sequence) || + cli->tc_rule->tr_flags & NTRS_STOPPING) { + struct nrs_tbf_rule *rule; + + CDEBUG(D_RPCTRACE, + "TBF class@%p rate %llu sequence %d, " + "rule flags %d, head sequence %d\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_sequence, + cli->tc_rule->tr_flags, + atomic_read(&head->th_rule_sequence)); + rule = nrs_tbf_rule_match(head, cli); + if (rule != cli->tc_rule) { + nrs_tbf_cli_reset(head, rule, cli); + } else { + if (cli->tc_rule_generation != rule->tr_generation) + nrs_tbf_cli_reset_value(head, cli); + nrs_tbf_rule_put(rule); + } + } else if (cli->tc_rule_generation != + cli->tc_rule->tr_generation) { + nrs_tbf_cli_reset_value(head, cli); + } + spin_unlock(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + goto out; + } + + OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy), + sizeof(*cli), moving_req ? GFP_ATOMIC : __GFP_IO); + if (cli == NULL) + return -ENOMEM; + + nrs_tbf_cli_init(head, cli, req); + tmp = head->th_ops->o_cli_findadd(head, cli); + if (tmp != cli) { + atomic_dec(&cli->tc_ref); + nrs_tbf_cli_fini(cli); + cli = tmp; + } +out: + *resp = &cli->tc_res; + + return 1; +} + +/** + * Called when releasing references to the resource hierachy obtained for a + * request for scheduling using the TBF policy. 
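+ *
+ * Releasing the parent (policy head) resource is a no-op; per-client
+ * resources are returned to the client hash through the type-specific
+ * o_cli_put() handler.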
+ * + * \param[in] policy the policy the resource belongs to + * \param[in] res the resource to be released + */ +static void nrs_tbf_res_put(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + /** + * Do nothing for freeing parent, nrs_tbf_net resources + */ + if (res->res_parent == NULL) + return; + + cli = container_of(res, struct nrs_tbf_client, tc_res); + head = container_of(res->res_parent, struct nrs_tbf_head, th_res); + + head->th_ops->o_cli_put(head, cli); +} + +/** + * Called when getting a request from the TBF policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the TBF + * rule + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_tbf_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq = NULL; + struct nrs_tbf_client *cli; + struct cfs_binheap_node *node; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + if (!peek && policy->pol_nrs->nrs_throttling) + return NULL; + + node = cfs_binheap_root(head->th_binheap); + if (unlikely(node == NULL)) + return NULL; + + cli = container_of(node, struct nrs_tbf_client, tc_node); + LASSERT(cli->tc_in_heap); + if (peek) { + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + } else { + __u64 now = ktime_to_ns(ktime_get()); + __u64 passed; + __u64 ntoken; + __u64 deadline; + + deadline = cli->tc_check_time + + cli->tc_nsecs; + LASSERT(now >= cli->tc_check_time); + passed = now - cli->tc_check_time; + ntoken = passed * cli->tc_rpc_rate; + do_div(ntoken, NSEC_PER_SEC); + ntoken += cli->tc_ntoken; + if (ntoken > cli->tc_depth) + ntoken = cli->tc_depth; + if (ntoken > 0) { + struct ptlrpc_request *req; + nrq = list_entry(cli->tc_list.next, + struct ptlrpc_nrs_request, + nr_u.tbf.tr_list); + req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + ntoken--; + cli->tc_ntoken = ntoken; + cli->tc_check_time = now; + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + cfs_binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + } + CDEBUG(D_RPCTRACE, + "TBF dequeues: class@%p rate %llu gen %llu " + "token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + } else { + ktime_t time; + + policy->pol_nrs->nrs_throttling = 1; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, HRTIMER_MODE_ABS); + } + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, + struct 
ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + int rc = 0; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + if (list_empty(&cli->tc_list)) { + LASSERT(!cli->tc_in_heap); + rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node); + if (rc == 0) { + cli->tc_in_heap = true; + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + if (policy->pol_nrs->nrs_throttling) { + __u64 deadline = cli->tc_check_time + + cli->tc_nsecs; + if ((head->th_deadline > deadline) && + (hrtimer_try_to_cancel(&head->th_timer) + >= 0)) { + ktime_t time; + head->th_deadline = deadline; + time = ktime_set(0, 0); + time = ktime_add_ns(time, deadline); + hrtimer_start(&head->th_timer, time, + HRTIMER_MODE_ABS); + } + } + } + } else { + LASSERT(cli->tc_in_heap); + nrq->nr_u.tbf.tr_sequence = head->th_sequence++; + list_add_tail(&nrq->nr_u.tbf.tr_list, + &cli->tc_list); + } + + if (rc == 0) + CDEBUG(D_RPCTRACE, + "TBF enqueues: class@%p rate %llu gen %llu " + "token %llu, rule@%p rate %llu gen %llu\n", + cli, cli->tc_rpc_rate, + cli->tc_rule_generation, cli->tc_ntoken, + cli->tc_rule, cli->tc_rule->tr_rpc_rate, + cli->tc_rule->tr_generation); + + return rc; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_tbf_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_tbf_head *head; + struct nrs_tbf_client *cli; + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + cli = container_of(nrs_request_resource(nrq), + struct nrs_tbf_client, tc_res); + head = container_of(nrs_request_resource(nrq)->res_parent, + struct nrs_tbf_head, th_res); + + LASSERT(!list_empty(&nrq->nr_u.tbf.tr_list)); + list_del_init(&nrq->nr_u.tbf.tr_list); + if (list_empty(&cli->tc_list)) { + cfs_binheap_remove(head->th_binheap, + &cli->tc_node); + cli->tc_in_heap = false; + } else { + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + } +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + assert_spin_locked(&policy->pol_nrs->nrs_svcpt->scp_req_lock); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.tbf.tr_sequence); +} + +#ifdef CONFIG_PROC_FS + +/** + * lprocfs interface + */ + +/** + * The maximum RPC rate. + */ +#define LPROCFS_NRS_RATE_MAX 65535 + +static int +ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data) +{ + struct ptlrpc_service *svc = m->private; + int rc; + + seq_printf(m, "regular_requests:\n"); + /** + * Perform two separate calls to this as only one of the NRS heads' + * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state. 
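+ * The first call below targets the regular NRS head; the second, made
+ * only when the service has a high-priority head, targets the HP one.
+ * -ENODEV from a head whose TBF policy is stopped is silently ignored.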
+ */ + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_RULE, + false, m); + if (rc == 0) { + /** + * -ENOSPC means buf in the parameter m is overflow, return 0 + * here to let upper layer function seq_read alloc a larger + * memory area and do this process again. + */ + } else if (rc == -ENOSPC) { + return 0; + + /** + * Ignore -ENODEV as the regular NRS head's policy may be in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state. + */ + } else if (rc != -ENODEV) { + return rc; + } + + if (!nrs_svc_has_hp(svc)) + goto no_hp; + + seq_printf(m, "high_priority_requests:\n"); + rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_RULE, + false, m); + if (rc == 0) { + /** + * -ENOSPC means buf in the parameter m is overflow, return 0 + * here to let upper layer function seq_read alloc a larger + * memory area and do this process again. + */ + } else if (rc == -ENOSPC) { + return 0; + } + +no_hp: + + return rc; +} + +static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) +{ + int rc; + + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: + rc = nrs_tbf_jobid_parse(cmd, token); + break; + case NRS_TBF_FLAG_NID: + rc = nrs_tbf_nid_parse(cmd, token); + break; + case NRS_TBF_FLAG_OPCODE: + rc = nrs_tbf_opcode_parse(cmd, token); + break; + case NRS_TBF_FLAG_GENERIC: + rc = nrs_tbf_generic_parse(cmd, token); + break; + default: + RETURN(-EINVAL); + } + + return rc; +} + +static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_JOBID) + nrs_tbf_jobid_cmd_fini(cmd); + else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_NID) + nrs_tbf_nid_cmd_fini(cmd); + else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_OPCODE) + nrs_tbf_opcode_cmd_fini(cmd); + else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_GENERIC) + nrs_tbf_generic_cmd_fini(cmd); + } +} + +static bool name_is_valid(const char *name) +{ + int i; + + for (i = 0; i < strlen(name); i++) { + if ((!isalnum(name[i])) && + (name[i] != '_')) + return false; + } + return true; +} + +static int +nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *key; + char *val; + int rc; + __u64 rate; + + val = buffer; + key = strsep(&val, "="); + if (val == NULL || strlen(val) == 0) + return -EINVAL; + + /* Key of the value pair */ + if (strcmp(key, "rate") == 0) { + rc = kstrtoull(val, 10, &rate); + if (rc) + return rc; + + if (rate <= 0 || rate >= LPROCFS_NRS_RATE_MAX) + return -EINVAL; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_rpc_rate = rate; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_rpc_rate = rate; + else + return -EINVAL; + } else if (strcmp(key, "rank") == 0) { + if (!name_is_valid(val)) + return -EINVAL; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) + cmd->u.tc_start.ts_next_name = val; + else if (cmd->tc_cmd == NRS_CTL_TBF_CHANGE_RULE) + cmd->u.tc_change.tc_next_name = val; + else + return -EINVAL; + } else { + return -EINVAL; + } + return 0; +} + +static int +nrs_tbf_parse_value_pairs(struct nrs_tbf_cmd *cmd, char *buffer) +{ + char *val; + char *token; + int rc; + + val = buffer; + while (val != NULL && strlen(val) != 0) { + token = strsep(&val, " "); + rc = nrs_tbf_parse_value_pair(cmd, token); + if (rc) + return rc; + } + + switch (cmd->tc_cmd) { + case NRS_CTL_TBF_START_RULE: + if (cmd->u.tc_start.ts_rpc_rate == 0) + cmd->u.tc_start.ts_rpc_rate = 
tbf_rate; + break; + case NRS_CTL_TBF_CHANGE_RULE: + if (cmd->u.tc_change.tc_rpc_rate == 0 && + cmd->u.tc_change.tc_next_name == NULL) + return -EINVAL; + break; + case NRS_CTL_TBF_STOP_RULE: + break; + default: + return -EINVAL; + } + return 0; +} + +static struct nrs_tbf_cmd * +nrs_tbf_parse_cmd(char *buffer, unsigned long count, __u32 type_flag) +{ + static struct nrs_tbf_cmd *cmd; + char *token; + char *val; + int rc = 0; + + OBD_ALLOC_PTR(cmd); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + memset(cmd, 0, sizeof(*cmd)); + + val = buffer; + token = strsep(&val, " "); + if (val == NULL || strlen(val) == 0) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Type of the command */ + if (strcmp(token, "start") == 0) { + cmd->tc_cmd = NRS_CTL_TBF_START_RULE; + cmd->u.tc_start.ts_valid_type = type_flag; + } else if (strcmp(token, "stop") == 0) + cmd->tc_cmd = NRS_CTL_TBF_STOP_RULE; + else if (strcmp(token, "change") == 0) + cmd->tc_cmd = NRS_CTL_TBF_CHANGE_RULE; + else + GOTO(out_free_cmd, rc = -EINVAL); + + /* Name of the rule */ + token = strsep(&val, " "); + if ((val == NULL && cmd->tc_cmd != NRS_CTL_TBF_STOP_RULE) || + !name_is_valid(token)) + GOTO(out_free_cmd, rc = -EINVAL); + cmd->tc_name = token; + + if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { + /* List of ID */ + LASSERT(val); + token = val; + val = strrchr(token, '}'); + if (!val) + GOTO(out_free_cmd, rc = -EINVAL); + + /* Skip '}' */ + val++; + if (*val == '\0') { + val = NULL; + } else if (*val == ' ') { + *val = '\0'; + val++; + } else + GOTO(out_free_cmd, rc = -EINVAL); + + rc = nrs_tbf_id_parse(cmd, token); + if (rc) + GOTO(out_free_cmd, rc); + } + + rc = nrs_tbf_parse_value_pairs(cmd, val); + if (rc) + GOTO(out_cmd_fini, rc = -EINVAL); + goto out; +out_cmd_fini: + nrs_tbf_cmd_fini(cmd); +out_free_cmd: + OBD_FREE_PTR(cmd); +out: + if (rc) + cmd = ERR_PTR(rc); + return cmd; +} + +/** + * Get the TBF policy type (nid, jobid, etc) preset by + * proc entry 'nrs_policies' for command buffer parsing. 
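+ * The flag is fetched through the NRS_CTL_TBF_RD_TYPE_FLAG opcode
+ * handled in nrs_tbf_ctl(), e.g. NRS_TBF_FLAG_JOBID for a jobid-typed
+ * instance; NRS_TBF_FLAG_INVALID is returned when the policy cannot be
+ * queried.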
+ * + * \param[in] svc the PTLRPC service + * \param[in] queue the NRS queue type + * + * \retval the preset TBF policy type flag + */ +static __u32 +nrs_tbf_type_flag(struct ptlrpc_service *svc, enum ptlrpc_nrs_queue_type queue) +{ + __u32 type; + int rc; + + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_RD_TYPE_FLAG, + true, &type); + if (rc != 0) + type = NRS_TBF_FLAG_INVALID; + + return type; +} + +extern struct nrs_core nrs_core; +#define LPROCFS_WR_NRS_TBF_MAX_CMD (4096) +static ssize_t +ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + char *kernbuf; + char *val; + int rc; + static struct nrs_tbf_cmd *cmd; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + unsigned long length; + char *token; + + OBD_ALLOC(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); + if (kernbuf == NULL) + GOTO(out, rc = -ENOMEM); + + if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (copy_from_user(kernbuf, buffer, count)) + GOTO(out_free_kernbuff, rc = -EFAULT); + + val = kernbuf; + token = strsep(&val, " "); + if (val == NULL) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (strcmp(token, "reg") == 0) { + queue = PTLRPC_NRS_QUEUE_REG; + } else if (strcmp(token, "hp") == 0) { + queue = PTLRPC_NRS_QUEUE_HP; + } else { + kernbuf[strlen(token)] = ' '; + val = kernbuf; + } + length = strlen(val); + + if (length == 0) + GOTO(out_free_kernbuff, rc = -EINVAL); + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out_free_kernbuff, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + cmd = nrs_tbf_parse_cmd(val, length, nrs_tbf_type_flag(svc, queue)); + if (IS_ERR(cmd)) + GOTO(out_free_kernbuff, rc = PTR_ERR(cmd)); + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + rc = ptlrpc_nrs_policy_control(svc, queue, + NRS_POL_NAME_TBF, + NRS_CTL_TBF_WR_RULE, + false, cmd); + mutex_unlock(&nrs_core.nrs_mutex); + + nrs_tbf_cmd_fini(cmd); + OBD_FREE_PTR(cmd); +out_free_kernbuff: + OBD_FREE(kernbuf, LPROCFS_WR_NRS_TBF_MAX_CMD); +out: + return rc ? 
rc : count; +} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); + +/** + * Initializes a TBF policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + * + * \retval 0 success + * \retval != 0 error + */ +static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) +{ + struct lprocfs_vars nrs_tbf_lprocfs_vars[] = { + { .name = "nrs_tbf_rule", + .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, + .data = svc }, + { NULL } + }; + + if (svc->srv_procroot == NULL) + return 0; + + return lprocfs_add_vars(svc->srv_procroot, nrs_tbf_lprocfs_vars, NULL); +} + +/** + * Cleans up a TBF policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + */ +static void nrs_tbf_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_tbf_rule", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + +/** + * TBF policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { + .op_policy_start = nrs_tbf_start, + .op_policy_stop = nrs_tbf_stop, + .op_policy_ctl = nrs_tbf_ctl, + .op_res_get = nrs_tbf_res_get, + .op_res_put = nrs_tbf_res_put, + .op_req_get = nrs_tbf_req_get, + .op_req_enqueue = nrs_tbf_req_add, + .op_req_dequeue = nrs_tbf_req_del, + .op_req_stop = nrs_tbf_req_stop, +#ifdef CONFIG_PROC_FS + .op_lprocfs_init = nrs_tbf_lprocfs_init, + .op_lprocfs_fini = nrs_tbf_lprocfs_fini, +#endif +}; + +/** + * TBF policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_tbf = { + .nc_name = NRS_POL_NAME_TBF, + .nc_ops = &nrs_tbf_ops, + .nc_compat = nrs_policy_compat_all, +}; + +/** @} tbf */ + +/** @} nrs */ + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c new file mode 100644 index 0000000000000..0b2a13753430c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,2821 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static inline __u32 lustre_msg_hdr_size_v2(__u32 count) +{ + return cfs_size_round(offsetof(struct lustre_msg_v2, + lm_buflens[count])); +} + +__u32 lustre_msg_hdr_size(__u32 magic, __u32 count) +{ + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return 0; + } +} + +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + __u32 index) +{ + if (inout) + lustre_set_req_swabbed(req, index); + else + lustre_set_rep_swabbed(req, index); +} + +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index) +{ + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + else + return (ptlrpc_rep_need_swab(req) && + !lustre_rep_swabbed(req, index)); +} + +static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, + __u32 version) +{ + __u32 ver = lustre_msg_get_version(msg); + return (ver & LUSTRE_VERSION_MASK) != version; +} + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) +{ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + CERROR("msg v1 not supported - please upgrade you system\n"); + return -EINVAL; + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_check_version_v2(msg, version); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return -EPROTO; + } +#undef LUSTRE_MSG_MAGIC_V1 +} + +/* early reply size */ +__u32 lustre_msg_early_size() +{ + static __u32 size; + if (!size) { + /* Always reply old ptlrpc_body_v2 to keep interoprability + * with the old client (< 2.3) which doesn't have pb_jobid + * in the ptlrpc_body. + * + * XXX Remove this whenever we dorp interoprability with such + * client. + */ + __u32 pblen = sizeof(struct ptlrpc_body_v2); + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); + } + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + +__u32 lustre_msg_size_v2(int count, __u32 *lengths) +{ + __u32 size; + int i; + + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); + + return size; +} +EXPORT_SYMBOL(lustre_msg_size_v2); + +/* This returns the size of the buffer that is required to hold a lustre_msg + * with the given sub-buffer lengths. + * NOTE: this should only be used for NEW requests, and should always be + * in the form of a v2 request. If this is a connection to a v1 + * target then the first buffer will be stripped because the ptlrpc + * data is part of the lustre_msg_v1 header. b=14043 */ +__u32 lustre_msg_size(__u32 magic, int count, __u32 *lens) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(count, lens); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return 0; + } +} + +/* This is used to determine the size of a buffer that was already packed + * and will correctly handle the different message formats. 
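+ * The arithmetic matches lustre_msg_size_v2(): the fixed v2 header plus
+ * the lm_buflens[] array, rounded up to 8 bytes, plus each buffer length
+ * rounded up to 8 bytes.  For instance, a message with two buffers of
+ * 152 and 20 bytes packs into 40 + 152 + 24 = 216 bytes, assuming the
+ * usual 32-byte fixed part of struct lustre_msg_v2.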
*/ +__u32 lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + + if (tmp) + memcpy(ptr, tmp, lens[i]); + ptr += cfs_size_round(lens[i]); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + magic = LUSTRE_MSG_MAGIC_V2; + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_pack_request_v2(req, count, lens, bufs); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} + +#if RS_DEBUG +struct list_head ptlrpc_rs_debug_lru = + LIST_HEAD_INIT(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + struct l_wait_info lwi; + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL); + rc = l_wait_event(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), &lwi); + if (rc != 0) + goto out; + spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_size = svcpt->scp_service->srv_max_reply_size; + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + struct 
ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; + + LASSERT(req->rq_reply_state == NULL); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) +{ + __u32 i, offset, buflen, bufcount; + + LASSERT(m != NULL); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m)); + return NULL; + } + + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); + + return (char *)m + offset; +} + +void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buf_v2(m, n, min_size); + default: + LASSERTF(0, "incorrect message magic: %08x (msg:%p)\n", + m->lm_magic, m); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_buf); + +static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, __u32 segment, + unsigned int newlen, int move_data) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] >= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (move_data && msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + LASSERT(newpos <= tail); + if (newpos != tail) + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, shrink @segment to size @newlen. if @move_data is non-zero, + * we also move data forward from @segment + 1. + * + * if @newlen == 0, we remove the segment completely, but we still keep the + * totally bufcount the same to save possible data moving. this will leave a + * unused segment with size 0 at the tail, but that's ok. + * + * return new msg size after shrinking. + * + * CAUTION: + * + if any buffers higher than @segment has been filled in, must call shrink + * with non-zero @move_data. + * + caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. + */ +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_shrink_msg_v2(msg, segment, newlen, move_data); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_shrink_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT(atomic_read(&rs->rs_refcount) == 0); + LASSERT(!rs->rs_difficult || rs->rs_handled); + LASSERT(!rs->rs_on_net); + LASSERT(!rs->rs_scheduled); + LASSERT(rs->rs_export == NULL); + LASSERT(rs->rs_nlocks == 0); + LASSERT(list_empty(&rs->rs_exp_list)); + LASSERT(list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i; + + /* Now we know the sender speaks my language. 
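+ * Unpacking below proceeds in three steps: check that the fixed header
+ * fits in len, byte-swap the header fields if lm_magic arrived as
+ * LUSTRE_MSG_MAGIC_V2_SWABBED, then swab and sum the individual
+ * lm_buflens[] entries to verify the whole message fits.  The return
+ * value tells the caller whether swabbing was performed.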
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } + + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR ("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + required_len += cfs_size_round(m->lm_buflens[i]); + } + + if (len < required_len) { + CERROR("len: %d, required_len %d\n", len, required_len); + CERROR("bufcount: %d\n", m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + ENTRY; + + /* We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. + * + */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + RETURN(-EINVAL); + } + + rc = lustre_unpack_msg_v2(m, len); + + RETURN(rc); +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} + +static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + const int inout, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m = inout ? 
req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (ptlrpc_buf_need_swab(req, inout, offset)) { + lustre_swab_ptlrpc_body(pb); + ptlrpc_buf_set_swabbed(req, inout, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + if (!inout) + pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 1, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 0, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline __u32 lustre_msg_buflen_v2(struct lustre_msg_v2 *m, __u32 n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +__u32 lustre_msg_buflen(struct lustre_msg *m, __u32 n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, __u32 n, __u32 len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, __u32 n, __u32 len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +/* NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
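+ * In this tree only LUSTRE_MSG_MAGIC_V2 is actually handled; any other
+ * magic is reported with CERROR() and a bufcount of 0 is returned.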
*/ +__u32 lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return 0; + } +} + +char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + __u32 slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in " + "msg %p buffer[%d] len %d\n", m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p " + "buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p " + "buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, __u32 index, + __u32 min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr != NULL && swabber != NULL) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + /* no break */ + default: + /* flags might be printed in debug code while message + * uninitialized */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + 
LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_op_flags; + + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + /* no break */ + default: + return 0; + } +} + +void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +__u32 lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_add_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x (msg:%p)\n", + msg->lm_magic, msg); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if 
(pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u16 lustre_msg_get_tag(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_tag; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_tag); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb != NULL) + return pb->pb_status; + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + } + /* no break */ + default: + /* status might be printed in debug code while message + * uninitialized */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_limit; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return -EINVAL; + } +} + + +void lustre_msg_set_limit(struct lustre_msg 
*msg, __u64 limit) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_limit = limit; + return; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return; + } +} + +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_conn_cnt; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_conn_cnt); + +__u32 lustre_msg_get_magic(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_magic; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +char *lustre_msg_get_jobid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = + lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + if (!pb) + return NULL; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u64 lustre_msg_get_mbits(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_mbits; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + __u32 len = lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF); + + unsigned int hsize = 4; + __u32 crc; + + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, + len, NULL, 0, (unsigned char *)&crc, + &hsize); + return crc; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg 
%p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_tag = tag; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_tag); + +void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_committed = last_committed; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_pre_versions[0] = versions[0]; + pb->pb_pre_versions[1] = versions[1]; + pb->pb_pre_versions[2] = versions[2]; + pb->pb_pre_versions[3] = versions[3]; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_versions); + +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_transno = transno; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_transno); + +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_status = status; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_status); + +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) +{ + switch 
(msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_conn_cnt = conn_cnt; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_time; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + __u32 opc = lustre_msg_get_opc(msg); + struct ptlrpc_body *pb; + + /* Don't set jobid for ldlm ast RPCs, they've been shrinked. + * See the comment in ptlrpc_request_pack(). */ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, LUSTRE_JOBID_SIZE); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_mbits = mbits; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. 
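+ * A typical caller just packs one key/value pair and waits, roughly
+ * (illustrative only, constants depend on the target service):
+ *
+ *     rc = do_set_info_async(imp, OST_SET_INFO, LUSTRE_OST_VERSION,
+ *                            keylen, key, vallen, val, NULL);
+ *
+ * With \a set == NULL the request is sent synchronously through
+ * ptlrpc_queue_wait(); otherwise it is only added to \a set and
+ * ptlrpc_check_set() is kicked.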
+ */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + size_t keylen, void *key, + size_t vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + RETURN(rc); +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. + */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) +{ + __swab32s (&b->pb_type); + __swab32s (&b->pb_version); + __swab32s (&b->pb_opc); + __swab32s (&b->pb_status); + __swab64s (&b->pb_last_xid); + __swab16s (&b->pb_tag); + __swab64s (&b->pb_last_committed); + __swab64s (&b->pb_transno); + __swab32s (&b->pb_flags); + __swab32s (&b->pb_op_flags); + __swab32s (&b->pb_conn_cnt); + __swab32s (&b->pb_timeout); + __swab32s (&b->pb_service_time); + __swab32s (&b->pb_limit); + __swab64s (&b->pb_slv); + __swab64s (&b->pb_pre_versions[0]); + __swab64s (&b->pb_pre_versions[1]); + __swab64s (&b->pb_pre_versions[2]); + __swab64s (&b->pb_pre_versions[3]); + __swab64s(&b->pb_mbits); + CLASSERT(offsetof(typeof(*b), pb_padding0) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding1) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding64_0) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding64_1) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding64_2) != 0); + /* While we need to maintain compatibility between + * clients and servers without ptlrpc_body_v2 (< 2.3) + * do not swab any fields beyond pb_jobid, as we are + * using this swab function for both ptlrpc_body + * and ptlrpc_body_v2. */ + CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); +} + +void lustre_swab_connect(struct obd_connect_data *ocd) +{ + __swab64s(&ocd->ocd_connect_flags); + __swab32s(&ocd->ocd_version); + __swab32s(&ocd->ocd_grant); + __swab64s(&ocd->ocd_ibits_known); + __swab32s(&ocd->ocd_index); + __swab32s(&ocd->ocd_brw_size); + /* ocd_blocksize and ocd_inodespace don't need to be swabbed because + * they are 8-byte values */ + __swab16s(&ocd->ocd_grant_tax_kb); + __swab32s(&ocd->ocd_grant_max_blks); + __swab64s(&ocd->ocd_transno); + __swab32s(&ocd->ocd_group); + __swab32s(&ocd->ocd_cksum_types); + __swab32s(&ocd->ocd_instance); + /* Fields after ocd_cksum_types are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. 
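+ * For example, ocd_max_easize below is only swabbed when
+ * OBD_CONNECT_MAX_EASIZE is set, because an older peer that never sent
+ * the field may not have allocated space for it in the message.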
*/ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + __swab16s(&ocd->ocd_maxmodrpcs); + CLASSERT(offsetof(typeof(*ocd), padding0) != 0); + CLASSERT(offsetof(typeof(*ocd), padding1) != 0); + if (ocd->ocd_connect_flags & OBD_CONNECT_FLAGS2) + __swab64s(&ocd->ocd_connect_flags2); + CLASSERT(offsetof(typeof(*ocd), padding3) != 0); + CLASSERT(offsetof(typeof(*ocd), padding4) != 0); + CLASSERT(offsetof(typeof(*ocd), padding5) != 0); + CLASSERT(offsetof(typeof(*ocd), padding6) != 0); + CLASSERT(offsetof(typeof(*ocd), padding7) != 0); + CLASSERT(offsetof(typeof(*ocd), padding8) != 0); + CLASSERT(offsetof(typeof(*ocd), padding9) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingA) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingB) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingC) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingD) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingE) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingF) != 0); +} + +static void lustre_swab_ost_layout(struct ost_layout *ol) +{ + __swab32s(&ol->ol_stripe_size); + __swab32s(&ol->ol_stripe_count); + __swab64s(&ol->ol_comp_start); + __swab64s(&ol->ol_comp_end); + __swab32s(&ol->ol_comp_id); +} + +void lustre_swab_obdo (struct obdo *o) +{ + __swab64s(&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s(&o->o_parent_seq); + __swab64s(&o->o_size); + __swab64s(&o->o_mtime); + __swab64s(&o->o_atime); + __swab64s(&o->o_ctime); + __swab64s(&o->o_blocks); + __swab64s(&o->o_grant); + __swab32s(&o->o_blksize); + __swab32s(&o->o_mode); + __swab32s(&o->o_uid); + __swab32s(&o->o_gid); + __swab32s(&o->o_flags); + __swab32s(&o->o_nlink); + __swab32s(&o->o_parent_oid); + __swab32s(&o->o_misc); + __swab64s(&o->o_ioepoch); + __swab32s(&o->o_stripe_idx); + __swab32s(&o->o_parent_ver); + lustre_swab_ost_layout(&o->o_layout); + CLASSERT(offsetof(typeof(*o), o_padding_3) != 0); + __swab32s(&o->o_uid_h); + __swab32s(&o->o_gid_h); + __swab64s(&o->o_data_version); + __swab32s(&o->o_projid); + CLASSERT(offsetof(typeof(*o), o_padding_4) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_5) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_6) != 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs (struct obd_statfs *os) +{ + __swab64s (&os->os_type); + __swab64s (&os->os_blocks); + __swab64s (&os->os_bfree); + __swab64s (&os->os_bavail); + __swab64s (&os->os_files); + __swab64s (&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s (&os->os_bsize); + __swab32s (&os->os_namelen); + __swab64s (&os->os_maxbytes); + __swab32s (&os->os_state); + CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); + CLASSERT(offsetof(typeof(*os), os_spare2) != 0); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); +} + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} + +void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) +{ + __swab64s(&nbr->rnb_offset); + __swab32s(&nbr->rnb_len); + __swab32s(&nbr->rnb_flags); +} + +void lustre_swab_ost_body (struct 
ost_body *b) +{ + lustre_swab_obdo (&b->oa); +} + +void lustre_swab_ost_last_id(u64 *id) +{ + __swab64s(id); +} + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} + +void lustre_swab_gl_lquota_desc(struct ldlm_gl_lquota_desc *desc) +{ + lustre_swab_lu_fid(&desc->gl_id.qid_fid); + __swab64s(&desc->gl_flags); + __swab64s(&desc->gl_ver); + __swab64s(&desc->gl_hardlimit); + __swab64s(&desc->gl_softlimit); + __swab64s(&desc->gl_time); + CLASSERT(offsetof(typeof(*desc), gl_pad2) != 0); +} +EXPORT_SYMBOL(lustre_swab_gl_lquota_desc); + +void lustre_swab_gl_barrier_desc(struct ldlm_gl_barrier_desc *desc) +{ + __swab32s(&desc->lgbd_status); + __swab32s(&desc->lgbd_timeout); + CLASSERT(offsetof(typeof(*desc), lgbd_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_gl_barrier_desc); + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_barrier_lvb(struct barrier_lvb *lvb) +{ + __swab32s(&lvb->lvb_status); + __swab32s(&lvb->lvb_index); + CLASSERT(offsetof(typeof(*lvb), lvb_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_barrier_lvb); + +void lustre_swab_mdt_body (struct mdt_body *b) +{ + lustre_swab_lu_fid(&b->mbo_fid1); + lustre_swab_lu_fid(&b->mbo_fid2); + /* handle is opaque */ + __swab64s(&b->mbo_valid); + __swab64s(&b->mbo_size); + __swab64s(&b->mbo_mtime); + __swab64s(&b->mbo_atime); + __swab64s(&b->mbo_ctime); + __swab64s(&b->mbo_blocks); + __swab64s(&b->mbo_ioepoch); + __swab64s(&b->mbo_t_state); + __swab32s(&b->mbo_fsuid); + __swab32s(&b->mbo_fsgid); + __swab32s(&b->mbo_capability); + __swab32s(&b->mbo_mode); + __swab32s(&b->mbo_uid); + __swab32s(&b->mbo_gid); + __swab32s(&b->mbo_flags); + __swab32s(&b->mbo_rdev); + __swab32s(&b->mbo_nlink); + CLASSERT(offsetof(typeof(*b), mbo_unused2) != 0); + __swab32s(&b->mbo_suppgid); + __swab32s(&b->mbo_eadatasize); + __swab32s(&b->mbo_aclsize); + __swab32s(&b->mbo_max_mdsize); + CLASSERT(offsetof(typeof(*b), mbo_unused3) != 0); + __swab32s(&b->mbo_uid_h); + __swab32s(&b->mbo_gid_h); + __swab32s(&b->mbo_projid); + CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0); +} + +void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) +{ + /* mio_handle is opaque */ + CLASSERT(offsetof(typeof(*b), mio_unused1) != 0); + CLASSERT(offsetof(typeof(*b), mio_unused2) != 0); + CLASSERT(offsetof(typeof(*b), mio_padding) != 0); +} + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + 
__swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); +} + +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) +{ + __u8 i; + + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. */ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } +} +EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); + +void lustre_swab_mgs_config_body(struct mgs_config_body *body) +{ + __swab64s(&body->mcb_offset); + __swab32s(&body->mcb_units); + __swab16s(&body->mcb_type); +} + +void lustre_swab_mgs_config_res(struct mgs_config_res *body) +{ + __swab64s(&body->mcr_offset); + __swab64s(&body->mcr_size); +} + +static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i) +{ + __swab64s (&i->dqi_bgrace); + __swab64s (&i->dqi_igrace); + __swab32s (&i->dqi_flags); + __swab32s (&i->dqi_valid); +} + +static void lustre_swab_obd_dqblk (struct obd_dqblk *b) +{ + __swab64s (&b->dqb_ihardlimit); + __swab64s (&b->dqb_isoftlimit); + __swab64s (&b->dqb_curinodes); + __swab64s (&b->dqb_bhardlimit); + __swab64s (&b->dqb_bsoftlimit); + __swab64s (&b->dqb_curspace); + __swab64s (&b->dqb_btime); + __swab64s (&b->dqb_itime); + __swab32s (&b->dqb_valid); + CLASSERT(offsetof(typeof(*b), dqb_padding) != 0); +} + +void lustre_swab_obd_quotactl (struct obd_quotactl *q) +{ + __swab32s (&q->qc_cmd); + __swab32s (&q->qc_type); + __swab32s (&q->qc_id); + __swab32s (&q->qc_stat); + lustre_swab_obd_dqinfo (&q->qc_dqinfo); + lustre_swab_obd_dqblk (&q->qc_dqblk); +} + +void lustre_swab_fid2path(struct getinfo_fid2path *gf) +{ + lustre_swab_lu_fid(&gf->gf_fid); + __swab64s(&gf->gf_recno); + __swab32s(&gf->gf_linkno); + __swab32s(&gf->gf_pathlen); +} +EXPORT_SYMBOL(lustre_swab_fid2path); + +static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) +{ + __swab64s(&fm_extent->fe_logical); + __swab64s(&fm_extent->fe_physical); + __swab64s(&fm_extent->fe_length); + __swab32s(&fm_extent->fe_flags); + __swab32s(&fm_extent->fe_device); +} + +void lustre_swab_fiemap(struct fiemap *fiemap) +{ + __u32 i; + + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); +} + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + __swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint 
*rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + + CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); +}; + +void lustre_swab_lov_desc (struct lov_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_stripe_size); + __swab64s (&ld->ld_default_stripe_offset); + __swab32s (&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc (struct lmv_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_hash_size); + __swab32s (&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +/* This structure is always in little-endian */ +static void lustre_swab_lmv_mds_md_v1(struct lmv_mds_md_v1 *lmm1) +{ + int i; + + __swab32s(&lmm1->lmv_magic); + __swab32s(&lmm1->lmv_stripe_count); + __swab32s(&lmm1->lmv_master_mdt_index); + __swab32s(&lmm1->lmv_hash_type); + __swab32s(&lmm1->lmv_layout_version); + for (i = 0; i < lmm1->lmv_stripe_count; i++) + lustre_swab_lu_fid(&lmm1->lmv_stripe_fids[i]); +} + +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) +{ + switch (lmm->lmv_magic) { + case LMV_MAGIC_V1: + lustre_swab_lmv_mds_md_v1(&lmm->lmv_md_v1); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_mds_md); + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + __swab32s(&lum->lum_type); + CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void lustre_print_v1v3(unsigned int lvl, struct lov_user_md *lum, + const char *msg) +{ + CDEBUG(lvl, "%s lov_user_md %p:\n", msg, lum); + CDEBUG(lvl, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(lvl, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(lvl, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(lvl, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(lvl, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(lvl, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); + if (lum->lmm_magic == LOV_USER_MAGIC_V3) { + struct lov_user_md_v3 *v3 = (void *)lum; + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + } + if (lum->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + struct lov_user_md_v3 *v3 = (void *)lum; + int i; + + if (v3->lmm_pool_name[0] != '\0') + CDEBUG(lvl, "\tlmm_pool_name: %s\n", v3->lmm_pool_name); + + CDEBUG(lvl, "\ttarget list:\n"); + for (i = 0; i < v3->lmm_stripe_count; i++) + CDEBUG(lvl, "\t\t%u\n", v3->lmm_objects[i].l_ost_idx); + } +} + +void lustre_print_user_md(unsigned int lvl, 
struct lov_user_md *lum, + const char *msg) +{ + struct lov_comp_md_v1 *comp_v1; + int i; + + if (likely(!cfs_cdebug_show(lvl, DEBUG_SUBSYSTEM))) + return; + + if (lum->lmm_magic == LOV_USER_MAGIC_V1 || + lum->lmm_magic == LOV_USER_MAGIC_V3) { + lustre_print_v1v3(lvl, lum, msg); + return; + } + + if (lum->lmm_magic != LOV_USER_MAGIC_COMP_V1) { + CDEBUG(lvl, "%s: bad magic: %x\n", msg, lum->lmm_magic); + return; + } + + comp_v1 = (struct lov_comp_md_v1 *)lum; + CDEBUG(lvl, "%s: lov_comp_md_v1 %p:\n", msg, lum); + CDEBUG(lvl, "\tlcm_magic: %#x\n", comp_v1->lcm_magic); + CDEBUG(lvl, "\tlcm_size: %#x\n", comp_v1->lcm_size); + CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); + CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); + CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); + + for (i = 0; i < comp_v1->lcm_entry_count; i++) { + struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; + struct lov_user_md *v1; + + CDEBUG(lvl, "\tentry %d:\n", i); + CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); + CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); + CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", + ent->lcme_extent.e_start); + CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", + ent->lcme_extent.e_end); + CDEBUG(lvl, "\tlcme_offset: %#x\n", ent->lcme_offset); + CDEBUG(lvl, "\tlcme_size: %#x\n\n", ent->lcme_size); + + v1 = (struct lov_user_md *)((char *)comp_v1 + + comp_v1->lcm_entries[i].lcme_offset); + lustre_print_v1v3(lvl, v1, msg); + } +} +EXPORT_SYMBOL(lustre_print_user_md); + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + +static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) +{ + ENTRY; + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + lustre_swab_lmm_oi(&lum->lmm_oi); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + EXIT; +} + +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); + +void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) +{ + struct lov_comp_md_entry_v1 *ent; + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + int i; + bool cpu_endian; + __u32 off, size; + __u16 ent_count, stripe_count; + ENTRY; + + cpu_endian = lum->lcm_magic == LOV_USER_MAGIC_COMP_V1; + ent_count = lum->lcm_entry_count; + if (!cpu_endian) + __swab16s(&ent_count); + + CDEBUG(D_IOCTL, "swabbing lov_user_comp_md v1\n"); + __swab32s(&lum->lcm_magic); + __swab32s(&lum->lcm_size); + __swab32s(&lum->lcm_layout_gen); + __swab16s(&lum->lcm_flags); + __swab16s(&lum->lcm_entry_count); + CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0); + CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0); + + for (i = 0; i < ent_count; i++) { + ent = &lum->lcm_entries[i]; + off = ent->lcme_offset; + size = ent->lcme_size; + + if (!cpu_endian) { + __swab32s(&off); + __swab32s(&size); + } + __swab32s(&ent->lcme_id); + __swab32s(&ent->lcme_flags); + __swab64s(&ent->lcme_extent.e_start); + __swab64s(&ent->lcme_extent.e_end); + __swab32s(&ent->lcme_offset); + 
__swab32s(&ent->lcme_size); + CLASSERT(offsetof(typeof(*ent), lcme_padding) != 0); + + v1 = (struct lov_user_md_v1 *)((char *)lum + off); + stripe_count = v1->lmm_stripe_count; + if (!cpu_endian) + __swab16s(&stripe_count); + + if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V1) || + v1->lmm_magic == LOV_USER_MAGIC_V1) { + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + } else if (v1->lmm_magic == __swab32(LOV_USER_MAGIC_V3) || + v1->lmm_magic == LOV_USER_MAGIC_V3 || + v1->lmm_magic == __swab32(LOV_USER_MAGIC_SPECIFIC) || + v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { + v3 = (struct lov_user_md_v3 *)v1; + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + } else { + CERROR("Invalid magic %#x\n", v1->lmm_magic); + } + } +} +EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + ENTRY; + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + +void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s (&id->name[i]); +} + +void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. 
*/ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} + +void lustre_swab_ldlm_intent (struct ldlm_intent *i) +{ + __swab64s(&i->opc); +} + +void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) +{ + __swab32s(&r->lr_type); + CLASSERT(offsetof(typeof(*r), lr_pad) != 0); + lustre_swab_ldlm_res_id(&r->lr_name); +} + +void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc (&l->l_resource); + __swab32s (&l->l_req_mode); + __swab32s (&l->l_granted_mode); + lustre_swab_ldlm_policy_data (&l->l_policy_data); +} + +void lustre_swab_ldlm_request (struct ldlm_request *rq) +{ + __swab32s (&rq->lock_flags); + lustre_swab_ldlm_lock_desc (&rq->lock_desc); + __swab32s (&rq->lock_count); + /* lock_handle[] opaque */ +} + +void lustre_swab_ldlm_reply (struct ldlm_reply *r) +{ + __swab32s (&r->lock_flags); + CLASSERT(offsetof(typeof(*r), lock_padding) != 0); + lustre_swab_ldlm_lock_desc (&r->lock_desc); + /* lock_handle opaque */ + __swab64s (&r->lock_policy_res1); + __swab64s (&r->lock_policy_res2); +} + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, " + "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", + nb->rnb_offset, nb->rnb_len, nb->rnb_flags); +} + +void dump_obdo(struct obdo *oa) +{ + u64 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %#llx\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) + CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLGENER) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLEPOCH) + CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", + oa->o_ioepoch); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", + oa->o_handle.cookie); +} + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + if (unlikely(!req->rq_repmsg)) + return 0; + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, const char *fmt, ...) +{ + bool req_ok = req->rq_reqmsg != NULL; + bool rep_ok = false; + lnet_nid_t nid = LNET_NID_ANY; + va_list args; + int rep_flags = -1; + int rep_status = -1; + + spin_lock(&req->rq_early_free_lock); + if (req->rq_repmsg) + rep_ok = true; + + if (ptlrpc_req_need_swab(req)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (rep_ok) { + rep_flags = lustre_msg_get_flags(req->rq_repmsg); + rep_status = lustre_msg_get_status(req->rq_repmsg); + } + spin_unlock(&req->rq_early_free_lock); + + if (req->rq_import && req->rq_import->imp_connection) + nid = req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %lld dl %lld ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d\n", + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + libcfs_nid2str(nid), + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, (s64)req->rq_timedout, + (s64)req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? 
lustre_msg_get_flags(req->rq_reqmsg) : -1, + rep_flags, req->rq_status, rep_status); + va_end(args); +} +EXPORT_SYMBOL(_debug_req); + +void lustre_swab_lustre_capa(struct lustre_capa *c) +{ + lustre_swab_lu_fid(&c->lc_fid); + __swab64s (&c->lc_opc); + __swab64s (&c->lc_uid); + __swab64s (&c->lc_gid); + __swab32s (&c->lc_flags); + __swab32s (&c->lc_keyid); + __swab32s (&c->lc_timeout); + __swab32s (&c->lc_expiry); +} + +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k) +{ + __swab64s (&k->lk_seq); + __swab32s (&k->lk_keyid); + CLASSERT(offsetof(typeof(*k), lk_padding) != 0); +} + +void lustre_swab_hsm_user_state(struct hsm_user_state *state) +{ + __swab32s(&state->hus_states); + __swab32s(&state->hus_archive_id); +} + +void lustre_swab_hsm_state_set(struct hsm_state_set *hss) +{ + __swab32s(&hss->hss_valid); + __swab64s(&hss->hss_setmask); + __swab64s(&hss->hss_clearmask); + __swab32s(&hss->hss_archive_id); +} + +static void lustre_swab_hsm_extent(struct hsm_extent *extent) +{ + __swab64s(&extent->offset); + __swab64s(&extent->length); +} + +void lustre_swab_hsm_current_action(struct hsm_current_action *action) +{ + __swab32s(&action->hca_state); + __swab32s(&action->hca_action); + lustre_swab_hsm_extent(&action->hca_location); +} + +void lustre_swab_hsm_user_item(struct hsm_user_item *hui) +{ + lustre_swab_lu_fid(&hui->hui_fid); + lustre_swab_hsm_extent(&hui->hui_extent); +} + +void lustre_swab_layout_intent(struct layout_intent *li) +{ + __swab32s(&li->li_opc); + __swab32s(&li->li_flags); + __swab64s(&li->li_start); + __swab64s(&li->li_end); +} + +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) +{ + lustre_swab_lu_fid(&hpk->hpk_fid); + __swab64s(&hpk->hpk_cookie); + __swab64s(&hpk->hpk_extent.offset); + __swab64s(&hpk->hpk_extent.length); + __swab16s(&hpk->hpk_flags); + __swab16s(&hpk->hpk_errval); +} + +void lustre_swab_hsm_request(struct hsm_request *hr) +{ + __swab32s(&hr->hr_action); + __swab32s(&hr->hr_archive_id); + __swab64s(&hr->hr_flags); + __swab32s(&hr->hr_itemcount); + __swab32s(&hr->hr_data_len); +} + +void lustre_swab_object_update(struct object_update *ou) +{ + struct object_update_param *param; + size_t i; + + __swab16s(&ou->ou_type); + __swab16s(&ou->ou_params_count); + __swab32s(&ou->ou_result_size); + __swab32s(&ou->ou_flags); + __swab32s(&ou->ou_padding1); + __swab64s(&ou->ou_batchid); + lustre_swab_lu_fid(&ou->ou_fid); + param = &ou->ou_params[0]; + for (i = 0; i < ou->ou_params_count; i++) { + __swab16s(¶m->oup_len); + __swab16s(¶m->oup_padding); + __swab32s(¶m->oup_padding2); + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } +} + +void lustre_swab_object_update_request(struct object_update_request *our) +{ + size_t i; + __swab32s(&our->ourq_magic); + __swab16s(&our->ourq_count); + __swab16s(&our->ourq_padding); + for (i = 0; i < our->ourq_count; i++) { + struct object_update *ou; + + ou = object_update_request_get(our, i, NULL); + if (ou == NULL) + return; + lustre_swab_object_update(ou); + } +} + +void lustre_swab_object_update_result(struct object_update_result *our) +{ + __swab32s(&our->our_rc); + __swab16s(&our->our_datalen); + __swab16s(&our->our_padding); +} + +void lustre_swab_object_update_reply(struct object_update_reply *our) +{ + size_t i; + + __swab32s(&our->ourp_magic); + __swab16s(&our->ourp_count); + __swab16s(&our->ourp_padding); + for (i = 0; i < our->ourp_count; i++) { + struct object_update_result *ourp; + + __swab16s(&our->ourp_lens[i]); + ourp = 
object_update_result_get(our, i, NULL); + if (ourp == NULL) + return; + lustre_swab_object_update_result(ourp); + } +} + +void lustre_swab_out_update_header(struct out_update_header *ouh) +{ + __swab32s(&ouh->ouh_magic); + __swab32s(&ouh->ouh_count); + __swab32s(&ouh->ouh_inline_length); + __swab32s(&ouh->ouh_reply_size); +} +EXPORT_SYMBOL(lustre_swab_out_update_header); + +void lustre_swab_out_update_buffer(struct out_update_buffer *oub) +{ + __swab32s(&oub->oub_size); + __swab32s(&oub->oub_padding); +} +EXPORT_SYMBOL(lustre_swab_out_update_buffer); + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) +{ + __swab64s(&msl->msl_flags); +} + +void lustre_swab_close_data(struct close_data *cd) +{ + lustre_swab_lu_fid(&cd->cd_fid); + __swab64s(&cd->cd_data_version); +} + +void lustre_swab_lfsck_request(struct lfsck_request *lr) +{ + __swab32s(&lr->lr_event); + __swab32s(&lr->lr_index); + __swab32s(&lr->lr_flags); + __swab32s(&lr->lr_valid); + __swab32s(&lr->lr_speed); + __swab16s(&lr->lr_version); + __swab16s(&lr->lr_active); + __swab16s(&lr->lr_param); + __swab16s(&lr->lr_async_windows); + __swab32s(&lr->lr_flags); + lustre_swab_lu_fid(&lr->lr_fid); + lustre_swab_lu_fid(&lr->lr_fid2); + __swab32s(&lr->lr_comp_id); + CLASSERT(offsetof(typeof(*lr), lr_padding_0) != 0); + CLASSERT(offsetof(typeof(*lr), lr_padding_1) != 0); + CLASSERT(offsetof(typeof(*lr), lr_padding_2) != 0); + CLASSERT(offsetof(typeof(*lr), lr_padding_3) != 0); +} + +void lustre_swab_lfsck_reply(struct lfsck_reply *lr) +{ + __swab32s(&lr->lr_status); + CLASSERT(offsetof(typeof(*lr), lr_padding_1) != 0); + __swab64s(&lr->lr_repaired); +} + +static void lustre_swab_orphan_rec(struct lu_orphan_rec *rec) +{ + lustre_swab_lu_fid(&rec->lor_fid); + __swab32s(&rec->lor_uid); + __swab32s(&rec->lor_gid); +} + +void lustre_swab_orphan_ent(struct lu_orphan_ent *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent); + +void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); + +void lustre_swab_ladvise(struct lu_ladvise *ladvise) +{ + __swab16s(&ladvise->lla_advice); + __swab16s(&ladvise->lla_value1); + __swab32s(&ladvise->lla_value2); + __swab64s(&ladvise->lla_start); + __swab64s(&ladvise->lla_end); + __swab32s(&ladvise->lla_value3); + __swab32s(&ladvise->lla_value4); +} +EXPORT_SYMBOL(lustre_swab_ladvise); + +void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr) +{ + __swab32s(&ladvise_hdr->lah_magic); + __swab32s(&ladvise_hdr->lah_count); + __swab64s(&ladvise_hdr->lah_flags); + __swab32s(&ladvise_hdr->lah_value1); + __swab32s(&ladvise_hdr->lah_value2); + __swab64s(&ladvise_hdr->lah_value3); +} +EXPORT_SYMBOL(lustre_swab_ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c new file mode 100644 index 0000000000000..51e17e2c2b459 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdidx) +{ + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); + + LASSERT(mdidx < desc->bd_md_max_brw); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | + LNET_MD_PHYS))); + + md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV); + md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); + + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) { + md->options |= LNET_MD_KIOV; + if (GET_ENC_KIOV(desc)) + md->start = &BD_GET_ENC_KIOV(desc, mdidx * + LNET_MAX_IOV); + else + md->start = &BD_GET_KIOV(desc, mdidx * LNET_MAX_IOV); + } else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) { + md->options |= LNET_MD_IOVEC; + if (GET_ENC_KVEC(desc)) + md->start = &BD_GET_ENC_KVEC(desc, mdidx * + LNET_MAX_IOV); + else + md->start = &BD_GET_KVEC(desc, mdidx * LNET_MAX_IOV); + } +} + + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c new file mode 100644 index 0000000000000..15fb0965241eb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c @@ -0,0 +1,709 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pinger.c + * + * Portal-RPC reconnection and replay operations, for use in recovery. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include "ptlrpc_internal.h" + +static int suppress_pings; +module_param(suppress_pings, int, 0644); +MODULE_PARM_DESC(suppress_pings, "Suppress pings"); + +struct mutex pinger_mutex; +static struct list_head pinger_imports = + LIST_HEAD_INIT(pinger_imports); +static struct list_head timeout_list = + LIST_HEAD_INIT(timeout_list); + +int ptlrpc_pinger_suppress_pings() +{ + return suppress_pings; +} +EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings); + +struct ptlrpc_request * +ptlrpc_prep_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, + LUSTRE_OBD_VERSION, OBD_PING); + if (req) { + ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + } + return req; +} + +int ptlrpc_obd_ping(struct obd_device *obd) +{ + int rc; + struct ptlrpc_request *req; + ENTRY; + + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + if (req == NULL) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_obd_ping); + +static int ptlrpc_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + req = ptlrpc_prep_ping(imp); + if (req == NULL) { + CERROR("OOM trying to ping %s->%s\n", + imp->imp_obd->obd_uuid.uuid, + obd2cli_tgt(imp->imp_obd)); + RETURN(-ENOMEM); + } + + DEBUG_REQ(D_INFO, req, "pinging %s->%s", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ +#ifdef ENABLE_PINGER + int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + int dtime = max_t(int, CONNECTION_SWITCH_MIN, + AT_OFF ? 
0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = cfs_time_shift(time); +#endif /* ENABLE_PINGER */ +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = cfs_time_current(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return (imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); +} + +static inline int ptlrpc_next_reconnect(struct obd_import *imp) +{ + if (imp->imp_server_timeout) + return cfs_time_shift(obd_timeout / 2); + else + return cfs_time_shift(obd_timeout); +} + +static cfs_duration_t pinger_check_timeout(cfs_time_t time) +{ + struct timeout_item *item; + cfs_time_t timeout = PING_INTERVAL; + + /* This list is sorted in increasing timeout order */ + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + int ti_timeout = item->ti_timeout; + if (timeout > ti_timeout) + timeout = ti_timeout; + break; + } + mutex_unlock(&pinger_mutex); + + return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), + cfs_time_current()); +} + + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + unsigned long this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". + */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && + !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + spin_unlock(&imp->imp_lock); + + CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u " + "force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + if (!imp->imp_no_pinger_recover) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, "%s->%s: not pinging (in recovery " + "or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + if (force) { + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + } + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + ptlrpc_ping(imp); + } +} + +static int ptlrpc_pinger_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + ENTRY; + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + /* And now, loop forever, pinging as needed. 
*/ + while (1) { + cfs_time_t this_ping = cfs_time_current(); + struct l_wait_info lwi; + cfs_duration_t time_to_next_wake; + struct timeout_item *item; + struct list_head *iter; + + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) + item->ti_cb(item, item->ti_cb_data); + + list_for_each(iter, &pinger_imports) { + struct obd_import *imp = list_entry(iter, + struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + cfs_time_after(imp->imp_next_ping, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL)))) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + /* update memory usage info */ + obd_update_maxusage(); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + say .01 second after this. + ptlrpc_pinger_sending_on_import will then set the + next ping time to next_ping + .01 sec, which means + we will SKIP the next ping at next_ping, and the + ping will get sent 2 timeouts from now! Beware. */ + CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (%ld)\n", + time_to_next_wake, + cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL))); + if (time_to_next_wake > 0) { + lwi = LWI_TIMEOUT(max_t(cfs_duration_t, + time_to_next_wake, + cfs_time_seconds(1)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { + EXIT; + break; + } else { + /* woken after adding import to reset timer */ + thread_test_and_clear_flags(thread, SVC_EVENT); + } + } + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); + return 0; +} + +static struct ptlrpc_thread pinger_thread; + +int ptlrpc_start_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + int rc; +#ifndef ENABLE_PINGER + return 0; +#endif + ENTRY; + + if (!thread_is_init(&pinger_thread) && + !thread_is_stopped(&pinger_thread)) + RETURN(-EALREADY); + + init_waitqueue_head(&pinger_thread.t_ctl_waitq); + + strcpy(pinger_thread.t_name, "ll_ping"); + + task = kthread_run(ptlrpc_pinger_main, &pinger_thread, + pinger_thread.t_name); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("cannot start pinger thread: rc = %d\n", rc); + RETURN(rc); + } + + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_running(&pinger_thread), &lwi); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the " + "administrator. The configuration shall meet the " + "additional requirements described in the manual. 
" + "(Search for the \"suppress_pings\" kernel module " + "parameter.)\n"); + + RETURN(0); +} + +int ptlrpc_pinger_remove_timeouts(void); + +int ptlrpc_stop_pinger(void) +{ + struct l_wait_info lwi = { 0 }; +#ifndef ENABLE_PINGER + return 0; +#endif + ENTRY; + + if (thread_is_init(&pinger_thread) || + thread_is_stopped(&pinger_thread)) + RETURN(-EALREADY); + + ptlrpc_pinger_remove_timeouts(); + + thread_set_flags(&pinger_thread, SVC_STOPPING); + wake_up(&pinger_thread.t_ctl_waitq); + + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_stopped(&pinger_thread), &lwi); + RETURN(0); +} + +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 0); +} + +void ptlrpc_pinger_commit_expected(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 1); + assert_spin_locked(&imp->imp_lock); + /* + * Avoid reading stale imp_connect_data. When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. + */ + if (imp->imp_state != LUSTRE_IMP_FULL || + OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) + imp->imp_force_next_verify = 1; +} + +int ptlrpc_pinger_add_import(struct obd_import *imp) +{ + ENTRY; + if (!list_empty(&imp->imp_pinger_chain)) + RETURN(-EALREADY); + + mutex_lock(&pinger_mutex); + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; + ptlrpc_update_next_ping(imp, 0); + /* XXX sort, blah blah */ + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + + ptlrpc_pinger_wake_up(); + mutex_unlock(&pinger_mutex); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_add_import); + +int ptlrpc_pinger_del_import(struct obd_import *imp) +{ + ENTRY; + + if (list_empty(&imp->imp_pinger_chain)) + RETURN(-ENOENT); + + mutex_lock(&pinger_mutex); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; + class_import_put(imp); + mutex_unlock(&pinger_mutex); + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_del_import); + +/** + * Register a timeout callback to the pinger list, and the callback will + * be called when timeout happens. + */ +static struct timeout_item *ptlrpc_new_timeout(int time, + enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *ti; + + OBD_ALLOC_PTR(ti); + if (!ti) + return(NULL); + + INIT_LIST_HEAD(&ti->ti_obd_list); + INIT_LIST_HEAD(&ti->ti_chain); + ti->ti_timeout = time; + ti->ti_event = event; + ti->ti_cb = cb; + ti->ti_cb_data = data; + + return ti; +} + +/** + * Register timeout event on the the pinger thread. + * Note: the timeout list is an sorted list with increased timeout value. 
+ */ +static struct timeout_item* +ptlrpc_pinger_register_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *item, *tmp; + + LASSERT(mutex_is_locked(&pinger_mutex)); + + list_for_each_entry(item, &timeout_list, ti_chain) + if (item->ti_event == event) + goto out; + + item = ptlrpc_new_timeout(time, event, cb, data); + if (item) { + list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { + if (tmp->ti_timeout < time) { + list_add(&item->ti_chain, &tmp->ti_chain); + goto out; + } + } + list_add(&item->ti_chain, &timeout_list); + } +out: + return item; +} + +/* Add a client_obd to the timeout event list, when timeout(@time) + * happens, the callback(@cb) will be called. + */ +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list) +{ + struct timeout_item *ti; + + mutex_lock(&pinger_mutex); + ti = ptlrpc_pinger_register_timeout(time, event, cb, data); + if (!ti) { + mutex_unlock(&pinger_mutex); + return (-EINVAL); + } + list_add(obd_list, &ti->ti_obd_list); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_add_timeout_client); + +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event) +{ + struct timeout_item *ti = NULL, *item; + + if (list_empty(obd_list)) + return 0; + mutex_lock(&pinger_mutex); + list_del_init(obd_list); + /** + * If there are no obd attached to the timeout event + * list, remove this timeout event from the pinger + */ + list_for_each_entry(item, &timeout_list, ti_chain) { + if (item->ti_event == event) { + ti = item; + break; + } + } + LASSERTF(ti != NULL, "ti is NULL !\n"); + if (list_empty(&ti->ti_obd_list)) { + list_del(&ti->ti_chain); + OBD_FREE_PTR(ti); + } + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_del_timeout_client); + +int ptlrpc_pinger_remove_timeouts(void) +{ + struct timeout_item *item, *tmp; + + mutex_lock(&pinger_mutex); + list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { + LASSERT(list_empty(&item->ti_obd_list)); + list_del(&item->ti_chain); + OBD_FREE_PTR(item); + } + mutex_unlock(&pinger_mutex); + return 0; +} + +void ptlrpc_pinger_wake_up() +{ +#ifdef ENABLE_PINGER + thread_add_flags(&pinger_thread, SVC_EVENT); + wake_up(&pinger_thread.t_ctl_waitq); +#endif +} + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount = 0; +static int pet_state; +static wait_queue_head_t pet_waitq; +static struct list_head pet_list; +static DEFINE_SPINLOCK(pet_lock); + +int ping_evictor_wake(struct obd_export *exp) +{ + struct obd_device *obd; + + spin_lock(&pet_lock); + if (pet_state != PET_READY) { + /* eventually the new obd will call here again. 
*/ + spin_unlock(&pet_lock); + return 1; + } + + obd = class_exp2obd(exp); + if (list_empty(&obd->obd_evict_list)) { + class_incref(obd, "evictor", obd); + list_add(&obd->obd_evict_list, &pet_list); + } + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + ENTRY; + + unshare_fs_struct(); + + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, (!list_empty(&pet_list)) || + (pet_state == PET_TERMINATE), &lwi); + + /* loop until all obd's will be removed */ + if ((pet_state == PET_TERMINATE) && list_empty(&pet_list)) + break; + + /* we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. */ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. */ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s" + " (at %s) in %ld seconds. I think" + " it's dead, and I am evicting" + " it. exp %p, cur %ld expire %ld" + " last %ld\n", + obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + (long)(cfs_time_current_sec() - + exp->exp_last_request_time), + exp, (long)cfs_time_current_sec(), + (long)expire_time, + (long)exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %ld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + RETURN(0); +} + +void ping_evictor_start(void) +{ + struct task_struct *task; + + if (++pet_refcount > 1) + return; + + INIT_LIST_HEAD(&pet_list); + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 0000000000000..cfd1de5bb3d45 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,414 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* Intramodule declarations for ptlrpc. */ + +#ifndef PTLRPC_INTERNAL_H +#define PTLRPC_INTERNAL_H + +#include "../ldlm/ldlm_internal.h" + +struct ldlm_namespace; +struct obd_import; +struct ldlm_res_id; +struct ptlrpc_request_set; +extern int test_req_buffer_pressure; +extern struct list_head ptlrpc_all_services; +extern struct mutex ptlrpc_all_services_mutex; +extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; + +#ifdef HAVE_SERVER_SUPPORT +extern struct ptlrpc_nrs_pol_conf nrs_conf_crrn; +extern struct ptlrpc_nrs_pol_conf nrs_conf_orr; +extern struct ptlrpc_nrs_pol_conf nrs_conf_trr; +extern struct ptlrpc_nrs_pol_conf nrs_conf_tbf; +extern struct ptlrpc_nrs_pol_conf nrs_conf_delay; +#endif /* HAVE_SERVER_SUPPORT */ + +/** + * \addtogoup nrs + * @{ + */ +extern struct nrs_core nrs_core; + +extern struct mutex ptlrpcd_mutex; +extern struct mutex pinger_mutex; + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); +/* ptlrpcd.c */ +int ptlrpcd_start(struct ptlrpcd_ctl *pc); + +/* client.c */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time); +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + enum ptlrpc_bulk_op_type type, + unsigned portal, + const struct ptlrpc_bulk_frag_ops + *ops); +int ptlrpc_request_cache_init(void); +void ptlrpc_request_cache_fini(void); +struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); +void ptlrpc_request_cache_free(struct ptlrpc_request *req); +void ptlrpc_init_xid(void); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); +int ptlrpc_expired_set(void *data); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +void ptlrpc_resend_req(struct ptlrpc_request *request); +void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req); +void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); +__u64 ptlrpc_known_replied_xid(struct obd_import *imp); +void ptlrpc_add_unreplied(struct ptlrpc_request *req); + +/* events.c */ +int ptlrpc_init_portals(void); +void ptlrpc_exit_portals(void); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *); +void lustre_assert_wire_constants(void); +int ptlrpc_import_in_recovery(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_handle_failed_import(struct obd_import *imp); +int ptlrpc_replay_next(struct obd_import *imp, int *inflight); +void ptlrpc_initiate_recovery(struct obd_import *imp); + +int 
lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); + +#ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, + struct ptlrpc_service *svc); +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); +void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, + long q_usec, long work_usec); +#else +#define ptlrpc_lprocfs_register_service(params...) do{}while(0) +#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) +#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0) +#define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0) +#endif /* CONFIG_PROC_FS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. + */ + struct list_head nrs_policies; +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); +bool ptlrpc_nrs_req_throttling_nolock(struct ptlrpc_service_part *svcpt, + bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? 
svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. + */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. + */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); + +/* sec_lproc.c */ +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +/* layout.c */ +__u32 __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); + +static inline bool ptlrpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +#ifdef HAVE_SERVER_SUPPORT +int tgt_mod_init(void); +void tgt_mod_exit(void); +int nodemap_mod_init(void); +void nodemap_mod_exit(void); +#else /* HAVE_SERVER_SUPPORT */ +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} 
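
As a quick illustration (a standalone user-space sketch, not code from this file), the sizeof arithmetic behind LPROCFS_NRS_WR_QUANTUM_MAX_CMD above, both labels plus "65535" twice plus one separating space, can be checked like this; STRINGIFY stands in for the kernel's __stringify() and the remaining names are invented for the example:

#include <stdio.h>

#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:"
#define NRS_LPROCFS_QUANTUM_NAME_HP  "hp_quantum:"
#define LPROCFS_NRS_QUANTUM_MAX      65535

/* User-space stand-in for the kernel's __stringify(). */
#define STRINGIFY_1(x) #x
#define STRINGIFY(x)   STRINGIFY_1(x)

/* Same construction as LPROCFS_NRS_WR_QUANTUM_MAX_CMD: the two labels,
 * "65535" twice and one space, measured with sizeof (includes the NUL). */
#define WR_QUANTUM_MAX_CMD						\
	sizeof(NRS_LPROCFS_QUANTUM_NAME_REG				\
	       STRINGIFY(LPROCFS_NRS_QUANTUM_MAX) " "			\
	       NRS_LPROCFS_QUANTUM_NAME_HP				\
	       STRINGIFY(LPROCFS_NRS_QUANTUM_MAX))

int main(void)
{
	/* Longest command string the lprocfs handler has to parse. */
	const char longest[] = NRS_LPROCFS_QUANTUM_NAME_REG "65535 "
			       NRS_LPROCFS_QUANTUM_NAME_HP "65535";

	printf("buffer bound = %zu bytes, longest command = %zu bytes\n",
	       (size_t)WR_QUANTUM_MAX_CMD, sizeof(longest));
	return 0;
}
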
+ +static inline int nodemap_mod_init(void) +{ + return 0; +} + +static inline void nodemap_mod_exit(void) +{ + return; +} +#endif /* !HAVE_SERVER_SUPPORT */ + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} + +/** initialise ptlrpc common fields */ +static inline void ptlrpc_req_comm_init(struct ptlrpc_request *req) +{ + spin_lock_init(&req->rq_lock); + spin_lock_init(&req->rq_early_free_lock); + atomic_set(&req->rq_refcount, 1); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); +} + +/** initialise client side ptlrpc request */ +static inline void ptlrpc_cli_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_req *cr = &req->rq_cli; + + ptlrpc_req_comm_init(req); + + req->rq_receiving_reply = 0; + req->rq_req_unlinked = req->rq_reply_unlinked = 1; + + INIT_LIST_HEAD(&cr->cr_set_chain); + INIT_LIST_HEAD(&cr->cr_ctx_chain); + INIT_LIST_HEAD(&cr->cr_unreplied_list); + init_waitqueue_head(&cr->cr_reply_waitq); + init_waitqueue_head(&cr->cr_set_waitq); +} + +/** initialise server side ptlrpc request */ +static inline void ptlrpc_srv_req_init(struct ptlrpc_request *req) +{ + struct ptlrpc_srv_req *sr = &req->rq_srv; + + ptlrpc_req_comm_init(req); + req->rq_srv_req = 1; + INIT_LIST_HEAD(&sr->sr_exp_list); + INIT_LIST_HEAD(&sr->sr_timed_list); + INIT_LIST_HEAD(&sr->sr_hist_list); +} + +static inline bool ptlrpc_req_is_connect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_CONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_CONNECT) + return true; + else + return false; +} + +static inline bool ptlrpc_req_is_disconnect(struct ptlrpc_request *req) +{ + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == OST_DISCONNECT || + lustre_msg_get_opc(req->rq_reqmsg) == MGS_DISCONNECT) + return true; + else + return false; +} + +#endif /* PTLRPC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 0000000000000..b11c07d54ba23 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +extern spinlock_t ptlrpc_last_xid_lock; +#if RS_DEBUG +extern spinlock_t ptlrpc_rs_debug_lock; +#endif + +static __init int ptlrpc_init(void) +{ + int rc; + + ENTRY; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + INIT_LIST_HEAD(&ptlrpc_all_services); + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + + rc = req_layout_init(); + if (rc) + RETURN(rc); + + rc = tgt_mod_init(); + if (rc) + GOTO(err_layout, rc); + + rc = ptlrpc_hr_init(); + if (rc) + GOTO(err_tgt, rc); + + rc = ptlrpc_request_cache_init(); + if (rc) + GOTO(err_hr, rc); + + rc = ptlrpc_init_portals(); + if (rc) + GOTO(err_cache, rc); + + rc = ptlrpc_connection_init(); + if (rc) + GOTO(err_portals, rc); + + ptlrpc_put_connection_superhack = ptlrpc_connection_put; + + rc = ptlrpc_start_pinger(); + if (rc) + GOTO(err_conn, rc); + + rc = ldlm_init(); + if (rc) + GOTO(err_pinger, rc); + + rc = sptlrpc_init(); + if (rc) + GOTO(err_ldlm, rc); + + rc = ptlrpc_nrs_init(); + if (rc) + GOTO(err_sptlrpc, rc); + + rc = nodemap_mod_init(); + if (rc) + GOTO(err_nrs, rc); + + RETURN(0); +err_nrs: + ptlrpc_nrs_fini(); +err_sptlrpc: + sptlrpc_fini(); +err_ldlm: + ldlm_exit(); +err_pinger: + ptlrpc_stop_pinger(); +err_conn: + ptlrpc_connection_fini(); +err_portals: + ptlrpc_exit_portals(); +err_cache: + ptlrpc_request_cache_fini(); +err_hr: + ptlrpc_hr_fini(); +err_tgt: + tgt_mod_exit(); +err_layout: + req_layout_fini(); + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + nodemap_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_request_cache_fini(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); + tgt_mod_exit(); + req_layout_fini(); +} + +MODULE_AUTHOR("OpenSFS, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_VERSION(LUSTRE_VERSION_STRING); +MODULE_LICENSE("GPL"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 0000000000000..0532c4d22d8bd --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,965 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include /* for obd_zombie */ +#include /* for OBD_FAIL_CHECK */ +#include /* cl_env_{get,put}() */ +#include + +#include "ptlrpc_internal.h" + +/* One of these per CPT. */ +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_cpt; + int pd_cursor; + int pd_nthreads; + int pd_groupsize; + struct ptlrpcd_ctl pd_threads[0]; +}; + +/* + * max_ptlrpcds is obsolete, but retained to ensure that the kernel + * module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_per_cpt_max. + */ +static int max_ptlrpcds; +module_param(max_ptlrpcds, int, 0644); +MODULE_PARM_DESC(max_ptlrpcds, + "Max ptlrpcd thread count to be started (obsolete)."); + +/* + * ptlrpcd_bind_policy is obsolete, but retained to ensure that + * the kernel module will load on a system where it has been tuned. + * A value other than 0 implies it was tuned, in which case the value + * is used to derive a setting for ptlrpcd_partner_group_size. + */ +static int ptlrpcd_bind_policy; +module_param(ptlrpcd_bind_policy, int, 0644); +MODULE_PARM_DESC(ptlrpcd_bind_policy, + "Ptlrpcd threads binding mode (obsolete)."); + +/* + * ptlrpcd_per_cpt_max: The maximum number of ptlrpcd threads to run + * in a CPT. + */ +static int ptlrpcd_per_cpt_max; +module_param(ptlrpcd_per_cpt_max, int, 0644); +MODULE_PARM_DESC(ptlrpcd_per_cpt_max, + "Max ptlrpcd thread count to be started per CPT."); + +/* + * ptlrpcd_partner_group_size: The desired number of threads in each + * ptlrpcd partner thread group. Default is 2, corresponding to the + * old PDB_POLICY_PAIR. A negative value makes all ptlrpcd threads in + * a CPT partners of each other. + */ +static int ptlrpcd_partner_group_size; +module_param(ptlrpcd_partner_group_size, int, 0644); +MODULE_PARM_DESC(ptlrpcd_partner_group_size, + "Number of ptlrpcd threads in a partner group."); + +/* + * ptlrpcd_cpts: A CPT string describing the CPU partitions that + * ptlrpcd threads should run on. Used to make ptlrpcd threads run on + * a subset of all CPTs. + * + * ptlrpcd_cpts=2 + * ptlrpcd_cpts=[2] + * run ptlrpcd threads only on CPT 2. + * + * ptlrpcd_cpts=0-3 + * ptlrpcd_cpts=[0-3] + * run ptlrpcd threads on CPTs 0, 1, 2, and 3. + * + * ptlrpcd_cpts=[0-3,5,7] + * run ptlrpcd threads on CPTS 0, 1, 2, 3, 5, and 7. + */ +static char *ptlrpcd_cpts; +module_param(ptlrpcd_cpts, charp, 0644); +MODULE_PARM_DESC(ptlrpcd_cpts, + "CPU partitions ptlrpcd threads should run in"); + +/* ptlrpcds_cpt_idx maps cpt numbers to an index in the ptlrpcds array. 
*/ +static int *ptlrpcds_cpt_idx; + +/* ptlrpcds_num is the number of entries in the ptlrpcds array. */ +static int ptlrpcds_num; +static struct ptlrpcd **ptlrpcds; + +/* + * In addition to the regular thread pool above, there is a single + * global recovery thread. Recovery isn't critical for performance, + * and doesn't block, but must always be able to proceed, and it is + * possible that all normal ptlrpcd threads are blocked. Hence the + * need for a dedicated thread. + */ +static struct ptlrpcd_ctl ptlrpcd_rcv; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users = 0; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = req->rq_set; + + LASSERT(set != NULL); + wake_up(&set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req) +{ + struct ptlrpcd *pd; + int cpt; + int idx; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcd_rcv; + + cpt = cfs_cpt_current(cfs_cpt_table, 1); + if (ptlrpcds_cpt_idx == NULL) + idx = cpt; + else + idx = ptlrpcds_cpt_idx[cpt]; + pd = ptlrpcds[idx]; + + /* We do not care whether it is strict load balance. */ + idx = pd->pd_cursor; + if (++idx == pd->pd_nthreads) + idx = 0; + pd->pd_cursor = idx; + + return &pd->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = cfs_time_current(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_safe(pos, tmp, &src->set_new_requests) { + req = list_entry(pos, struct ptlrpc_request, + rq_set_chain); + req->rq_set = des; + } + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). 
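
For reference, the wrapping-cursor selection used by ptlrpcd_select_pc() above can be exercised in isolation. This is a minimal user-space sketch with invented names, not code from this file, showing how successive requests rotate across the threads of one CPT without any strict load balancing:

#include <stdio.h>

/* Toy stand-in for struct ptlrpcd: only the fields the cursor needs. */
struct toy_pd {
	int pd_nthreads;
	int pd_cursor;
};

/* Mirrors the cursor advance in ptlrpcd_select_pc(): a simple
 * wrap-around walk over the per-CPT thread array. */
static int toy_select(struct toy_pd *pd)
{
	int idx = pd->pd_cursor;

	if (++idx == pd->pd_nthreads)
		idx = 0;
	pd->pd_cursor = idx;
	return idx;
}

int main(void)
{
	struct toy_pd pd = { .pd_nthreads = 4, .pd_cursor = 0 };
	int i;

	for (i = 0; i < 8; i++)
		printf("request %d -> thread %d\n", i, toy_select(&pd));
	return 0;
}
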
+ */ +void ptlrpcd_add_req(struct ptlrpc_request *req) +{ + struct ptlrpcd_ctl *pc; + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + spin_lock(&req->rq_lock); + if (req->rq_invalid_rqset) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5), + back_to_sleep, NULL); + + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi); + } else if (req->rq_set) { + /* If we have a vaid "rq_set", just reuse it to avoid double + * linked. */ + LASSERT(req->rq_phase == RQ_PHASE_NEW); + LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); + + /* ptlrpc_check_set will decrease the count */ + atomic_inc(&req->rq_set->set_remaining); + spin_unlock(&req->rq_lock); + wake_up(&req->rq_set->set_waitq); + return; + } else { + spin_unlock(&req->rq_lock); + } + + pc = ptlrpcd_select_pc(req); + + DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]", + req, pc->pc_name, pc->pc_index); + + ptlrpc_set_add_new_req(pc, req); +} +EXPORT_SYMBOL(ptlrpcd_add_req); + +static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) +{ + atomic_inc(&set->set_refcount); +} + +/** + * Check if there is more work to do on ptlrpcd set. + * Returns 1 if yes. + */ +static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + struct ptlrpc_request_set *set = pc->pc_set; + int rc = 0; + int rc2; + ENTRY; + + if (atomic_read(&set->set_new_count)) { + spin_lock(&set->set_new_req_lock); + if (likely(!list_empty(&set->set_new_requests))) { + list_splice_init(&set->set_new_requests, + &set->set_requests); + atomic_add(atomic_read(&set->set_new_count), + &set->set_remaining); + atomic_set(&set->set_new_count, 0); + /* + * Need to calculate its timeout. + */ + rc = 1; + } + spin_unlock(&set->set_new_req_lock); + } + + /* We should call lu_env_refill() before handling new requests to make + * sure that env key the requests depending on really exists. + */ + rc2 = lu_env_refill(env); + if (rc2 != 0) { + /* + * XXX This is very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. + */ + CERROR("Failure to refill session: %d\n", rc2); + RETURN(rc); + } + + if (atomic_read(&set->set_remaining)) + rc |= ptlrpc_check_set(env, set); + + /* NB: ptlrpc_check_set has already moved complted request at the + * head of seq::set_requests */ + list_for_each_safe(pos, tmp, &set->set_requests) { + req = list_entry(pos, struct ptlrpc_request, rq_set_chain); + if (req->rq_phase != RQ_PHASE_COMPLETE) + break; + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpc_req_finished(req); + } + + if (rc == 0) { + /* + * If new requests have been added, make sure to wake up. + */ + rc = atomic_read(&set->set_new_count); + + /* If we have nothing to do, check whether we can take some + * work from our partner threads. 
*/ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, "transfer %d" + " async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + RETURN(rc || test_bit(LIOD_STOP, &pc->pc_flags)); +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + * + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set; + struct lu_context ses = { 0 }; + struct lu_env env = { .le_ses = &ses }; + int rc = 0; + int exit = 0; + ENTRY; + + unshare_fs_struct(); + + if (cfs_cpt_bind(cfs_cpt_table, pc->pc_cpt) != 0) + CWARN("Failed to bind %s on CPT %d\n", pc->pc_name, pc->pc_cpt); + + /* + * Allocate the request set after the thread has been bound + * above. This is safe because no requests will be queued + * until all ptlrpcd threads have confirmed that they have + * successfully started. + */ + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(failed, rc = -ENOMEM); + spin_lock(&pc->pc_lock); + pc->pc_set = set; + spin_unlock(&pc->pc_lock); + + /* Both client and server (MDT/OST) may use the environment. */ + rc = lu_context_init(&env.le_ctx, LCT_MD_THREAD | + LCT_DT_THREAD | + LCT_CL_THREAD | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) + GOTO(failed, rc); + rc = lu_context_init(env.le_ses, LCT_SESSION | + LCT_REMEMBER | + LCT_NOREF); + if (rc != 0) { + lu_context_fini(&env.le_ctx); + GOTO(failed, rc); + } + + complete(&pc->pc_starting); + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + struct l_wait_info lwi; + int timeout; + + timeout = ptlrpc_set_next_timeout(set); + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout), + ptlrpc_expired_set, set); + + lu_context_enter(&env.le_ctx); + lu_context_enter(env.le_ses); + l_wait_event(set->set_waitq, ptlrpcd_check(&env, pc), &lwi); + lu_context_exit(&env.le_ctx); + lu_context_exit(env.le_ses); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. 
+ */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(set); + lu_context_fini(&env.le_ctx); + lu_context_fini(env.le_ses); + + complete(&pc->pc_finishing); + + return 0; + +failed: + pc->pc_error = rc; + complete(&pc->pc_starting); + RETURN(rc); +} + +static void ptlrpcd_ctl_init(struct ptlrpcd_ctl *pc, int index, int cpt) +{ + ENTRY; + + pc->pc_index = index; + pc->pc_cpt = cpt; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + + if (index < 0) { + /* Recovery thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), "ptlrpcd_rcv"); + } else { + /* Regular thread. */ + snprintf(pc->pc_name, sizeof(pc->pc_name), + "ptlrpcd_%02d_%02d", cpt, index); + } + + EXIT; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we + * start many ptlrpcd threads. We also want to reduce the ptlrpcd + * overhead caused by data transfer cross-CPU cores. So we bind + * all ptlrpcd threads to a CPT, in the expectation that CPTs + * will be defined in a way that matches these boundaries. Within + * a CPT a ptlrpcd thread can be scheduled on any available core. + * + * Each ptlrpcd thread has its own request queue. This can cause + * response delay if the thread is already busy. To help with + * this we define partner threads: these are other threads bound + * to the same CPT which will check for work in each other's + * request queues if they have no work to do. + * + * The desired number of partner threads can be tuned by setting + * ptlrpcd_partner_group_size. The default is to create pairs of + * partner threads. + */ +static int ptlrpcd_partners(struct ptlrpcd *pd, int index) +{ + struct ptlrpcd_ctl *pc; + struct ptlrpcd_ctl **ppc; + int first; + int i; + int rc = 0; + ENTRY; + + LASSERT(index >= 0 && index < pd->pd_nthreads); + pc = &pd->pd_threads[index]; + pc->pc_npartners = pd->pd_groupsize - 1; + + if (pc->pc_npartners <= 0) + GOTO(out, rc); + + OBD_CPT_ALLOC(pc->pc_partners, cfs_cpt_table, pc->pc_cpt, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + GOTO(out, rc = -ENOMEM); + } + + first = index - index % pd->pd_groupsize; + ppc = pc->pc_partners; + for (i = first; i < first + pd->pd_groupsize; i++) { + if (i != index) + *ppc++ = &pd->pd_threads[i]; + } +out: + RETURN(rc); +} + +int ptlrpcd_start(struct ptlrpcd_ctl *pc) +{ + struct task_struct *task; + int rc = 0; + ENTRY; + + /* + * Do not allow starting a second thread for one pc. 
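
The partner wiring done by ptlrpcd_partners() above is pure index arithmetic: a thread's group is the pd_groupsize-sized block that contains its index, and its partners are the other members of that block (ptlrpcd_init() rounds the per-CPT thread count up so every block is full). A standalone sketch of just that grouping, with names invented for the example:

#include <stdio.h>

/* Toy version of the grouping in ptlrpcd_partners(): with group size G,
 * thread "index" belongs to the block [first, first + G) where
 * first = index - index % G, and its partners are the other members.
 * No allocation or Lustre structures involved. */
static void toy_print_partners(int nthreads, int groupsize)
{
	int index, i, first;

	for (index = 0; index < nthreads; index++) {
		first = index - index % groupsize;
		printf("thread %d partners:", index);
		for (i = first; i < first + groupsize; i++)
			if (i != index)
				printf(" %d", i);
		printf("\n");
	}
}

int main(void)
{
	/* Six threads in a CPT, default pair-wise grouping. */
	toy_print_partners(6, 2);
	/* The same six threads with triplet grouping. */
	toy_print_partners(6, 3);
	return 0;
}
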
+ */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + pc->pc_name, pc); + RETURN(0); + } + + task = kthread_run(ptlrpcd, pc, pc->pc_name); + if (IS_ERR(task)) + GOTO(out_set, rc = PTR_ERR(task)); + + wait_for_completion(&pc->pc_starting); + rc = pc->pc_error; + if (rc != 0) + GOTO(out_set, rc); + + RETURN(0); + +out_set: + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + clear_bit(LIOD_START, &pc->pc_flags); + RETURN(rc); +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); + +out: + EXIT; +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; + pc->pc_error = 0; + EXIT; +} + +static void ptlrpcd_fini(void) +{ + int i; + int j; + int ncpts; + ENTRY; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds_num; i++) { + if (ptlrpcds[i] == NULL) + break; + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_stop(&ptlrpcds[i]->pd_threads[j], 0); + for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) + ptlrpcd_free(&ptlrpcds[i]->pd_threads[j]); + OBD_FREE(ptlrpcds[i], ptlrpcds[i]->pd_size); + ptlrpcds[i] = NULL; + } + OBD_FREE(ptlrpcds, sizeof(ptlrpcds[0]) * ptlrpcds_num); + } + ptlrpcds_num = 0; + + ptlrpcd_stop(&ptlrpcd_rcv, 0); + ptlrpcd_free(&ptlrpcd_rcv); + + if (ptlrpcds_cpt_idx != NULL) { + ncpts = cfs_cpt_number(cfs_cpt_table); + OBD_FREE(ptlrpcds_cpt_idx, ncpts * sizeof(ptlrpcds_cpt_idx[0])); + ptlrpcds_cpt_idx = NULL; + } + + EXIT; +} + +static int ptlrpcd_init(void) +{ + int nthreads; + int groupsize; + int size; + int i; + int j; + int rc = 0; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + struct ptlrpcd *pd; + ENTRY; + + /* + * Determine the CPTs that ptlrpcd threads will run on. + */ + cptable = cfs_cpt_table; + ncpts = cfs_cpt_number(cptable); + if (ptlrpcd_cpts != NULL) { + struct cfs_expr_list *el; + + size = ncpts * sizeof(ptlrpcds_cpt_idx[0]); + OBD_ALLOC(ptlrpcds_cpt_idx, size); + if (ptlrpcds_cpt_idx == NULL) + GOTO(out, rc = -ENOMEM); + + rc = cfs_expr_list_parse(ptlrpcd_cpts, + strlen(ptlrpcd_cpts), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + "ptlrpcd_cpts", ptlrpcd_cpts); + GOTO(out, rc = -EINVAL); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + "ptlrpcd_cpts", ptlrpcd_cpts, rc); + if (rc == 0) + rc = -EINVAL; + GOTO(out, rc); + } + + /* + * Create the cpt-to-index map. 
When there is no match + * in the cpt table, pick a cpt at random. This could + * be changed to take the topology of the system into + * account. + */ + for (cpt = 0; cpt < ncpts; cpt++) { + for (i = 0; i < rc; i++) + if (cpts[i] == cpt) + break; + if (i >= rc) + i = cpt % rc; + ptlrpcds_cpt_idx[cpt] = i; + } + + cfs_expr_list_values_free(cpts, rc); + ncpts = rc; + } + ptlrpcds_num = ncpts; + + size = ncpts * sizeof(ptlrpcds[0]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) + GOTO(out, rc = -ENOMEM); + + /* + * The max_ptlrpcds parameter is obsolete, but do something + * sane if it has been tuned, and complain if + * ptlrpcd_per_cpt_max has also been tuned. + */ + if (max_ptlrpcds != 0) { + CWARN("max_ptlrpcds is obsolete.\n"); + if (ptlrpcd_per_cpt_max == 0) { + ptlrpcd_per_cpt_max = max_ptlrpcds / ncpts; + /* Round up if there is a remainder. */ + if (max_ptlrpcds % ncpts != 0) + ptlrpcd_per_cpt_max++; + CWARN("Setting ptlrpcd_per_cpt_max = %d\n", + ptlrpcd_per_cpt_max); + } else { + CWARN("ptlrpd_per_cpt_max is also set!\n"); + } + } + + /* + * The ptlrpcd_bind_policy parameter is obsolete, but do + * something sane if it has been tuned, and complain if + * ptlrpcd_partner_group_size is also tuned. + */ + if (ptlrpcd_bind_policy != 0) { + CWARN("ptlrpcd_bind_policy is obsolete.\n"); + if (ptlrpcd_partner_group_size == 0) { + switch (ptlrpcd_bind_policy) { + case 1: /* PDB_POLICY_NONE */ + case 2: /* PDB_POLICY_FULL */ + ptlrpcd_partner_group_size = 1; + break; + case 3: /* PDB_POLICY_PAIR */ + ptlrpcd_partner_group_size = 2; + break; + case 4: /* PDB_POLICY_NEIGHBOR */ +#ifdef CONFIG_NUMA + ptlrpcd_partner_group_size = -1; /* CPT */ +#else + ptlrpcd_partner_group_size = 3; /* Triplets */ +#endif + break; + default: /* Illegal value, use the default. */ + ptlrpcd_partner_group_size = 2; + break; + } + CWARN("Setting ptlrpcd_partner_group_size = %d\n", + ptlrpcd_partner_group_size); + } else { + CWARN("ptlrpcd_partner_group_size is also set!\n"); + } + } + + if (ptlrpcd_partner_group_size == 0) + ptlrpcd_partner_group_size = 2; + else if (ptlrpcd_partner_group_size < 0) + ptlrpcd_partner_group_size = -1; + else if (ptlrpcd_per_cpt_max > 0 && + ptlrpcd_partner_group_size > ptlrpcd_per_cpt_max) + ptlrpcd_partner_group_size = ptlrpcd_per_cpt_max; + + /* + * Start the recovery thread first. + */ + set_bit(LIOD_RECOVERY, &ptlrpcd_rcv.pc_flags); + ptlrpcd_ctl_init(&ptlrpcd_rcv, -1, CFS_CPT_ANY); + rc = ptlrpcd_start(&ptlrpcd_rcv); + if (rc < 0) + GOTO(out, rc); + + for (i = 0; i < ncpts; i++) { + if (cpts == NULL) + cpt = i; + else + cpt = cpts[i]; + + nthreads = cfs_cpt_weight(cptable, cpt); + if (ptlrpcd_per_cpt_max > 0 && ptlrpcd_per_cpt_max < nthreads) + nthreads = ptlrpcd_per_cpt_max; + if (nthreads < 2) + nthreads = 2; + + if (ptlrpcd_partner_group_size <= 0) { + groupsize = nthreads; + } else if (nthreads <= ptlrpcd_partner_group_size) { + groupsize = nthreads; + } else { + groupsize = ptlrpcd_partner_group_size; + if (nthreads % groupsize != 0) + nthreads += groupsize - (nthreads % groupsize); + } + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_CPT_ALLOC(pd, cptable, cpt, size); + if (!pd) + GOTO(out, rc = -ENOMEM); + pd->pd_size = size; + pd->pd_index = i; + pd->pd_cpt = cpt; + pd->pd_cursor = 0; + pd->pd_nthreads = nthreads; + pd->pd_groupsize = groupsize; + ptlrpcds[i] = pd; + + /* + * The ptlrpcd threads in a partner group can access + * each other's struct ptlrpcd_ctl, so these must be + * initialized before any thead is started. 
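
The per-CPT sizing above (clamp the CPU weight to ptlrpcd_per_cpt_max, enforce at least two threads, then round up to a whole number of partner groups) is easy to check in isolation. A user-space sketch of that arithmetic, with all names invented for the example, which may help when predicting thread counts for a given module-parameter tuning:

#include <stdio.h>

/* Toy version of the per-CPT sizing done in ptlrpcd_init(): clamp the
 * thread count to the per-CPT maximum, enforce a minimum of two threads,
 * then round the count up to a whole number of partner groups. */
static void toy_size_cpt(int cpu_weight, int per_cpt_max, int group_size,
			 int *nthreads, int *groupsize)
{
	int n = cpu_weight;

	if (per_cpt_max > 0 && per_cpt_max < n)
		n = per_cpt_max;
	if (n < 2)
		n = 2;

	if (group_size <= 0 || n <= group_size) {
		*groupsize = n;		/* one CPT-wide group */
	} else {
		*groupsize = group_size;
		if (n % group_size != 0)
			n += group_size - (n % group_size);
	}
	*nthreads = n;
}

int main(void)
{
	int n, g;

	toy_size_cpt(7, 0, 2, &n, &g);	/* 7 CPUs, default pairs */
	printf("7 CPUs, pairs:     %d threads, groups of %d\n", n, g);
	toy_size_cpt(7, 4, 2, &n, &g);	/* capped at 4 per CPT */
	printf("7 CPUs, cap 4:     %d threads, groups of %d\n", n, g);
	toy_size_cpt(1, 0, -1, &n, &g);	/* tiny CPT, CPT-wide group */
	printf("1 CPU, CPT group:  %d threads, groups of %d\n", n, g);
	return 0;
}
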
+ */ + for (j = 0; j < nthreads; j++) { + ptlrpcd_ctl_init(&pd->pd_threads[j], j, cpt); + rc = ptlrpcd_partners(pd, j); + if (rc < 0) + GOTO(out, rc); + } + + /* XXX: We start nthreads ptlrpc daemons on this cpt. + * Each of them can process any non-recovery + * async RPC to improve overall async RPC + * efficiency. + * + * But there are some issues with async I/O RPCs + * and async non-I/O RPCs processed in the same + * set under some cases. The ptlrpcd may be + * blocked by some async I/O RPC(s), then will + * cause other async non-I/O RPC(s) can not be + * processed in time. + * + * Maybe we should distinguish blocked async RPCs + * from non-blocked async RPCs, and process them + * in different ptlrpcd sets to avoid unnecessary + * dependency. But how to distribute async RPCs + * load among all the ptlrpc daemons becomes + * another trouble. + */ + for (j = 0; j < nthreads; j++) { + rc = ptlrpcd_start(&pd->pd_threads[j]); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (rc != 0) + ptlrpcd_fini(); + + RETURN(rc); +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + ENTRY; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) { + rc = ptlrpcd_init(); + if (rc < 0) + ptlrpcd_users--; + } + mutex_unlock(&ptlrpcd_mutex); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c new file mode 100644 index 0000000000000..c526e9e5c65f7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -0,0 +1,386 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. + * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); + + EXIT; +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. 
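
ptlrpcd_addref() and ptlrpcd_decref() above use a classic first-user-initializes, last-user-tears-down pattern serialized by a mutex. A minimal user-space sketch of the same pattern, with invented names and a pthread mutex standing in for the kernel mutex:

#include <pthread.h>
#include <stdio.h>

/* Toy version of the ptlrpcd_addref()/ptlrpcd_decref() pattern: the first
 * caller pays for initialization, the last caller tears everything down,
 * and a mutex serializes the transitions. */
static pthread_mutex_t toy_mutex = PTHREAD_MUTEX_INITIALIZER;
static int toy_users;

static int toy_init(void)  { printf("init\n"); return 0; }
static void toy_fini(void) { printf("fini\n"); }

static int toy_addref(void)
{
	int rc = 0;

	pthread_mutex_lock(&toy_mutex);
	if (++toy_users == 1) {
		rc = toy_init();
		if (rc < 0)
			toy_users--;	/* init failed: drop the reference */
	}
	pthread_mutex_unlock(&toy_mutex);
	return rc;
}

static void toy_decref(void)
{
	pthread_mutex_lock(&toy_mutex);
	if (--toy_users == 0)
		toy_fini();
	pthread_mutex_unlock(&toy_mutex);
}

int main(void)
{
	toy_addref();	/* first user: prints "init" */
	toy_addref();	/* second user: no re-init */
	toy_decref();
	toy_decref();	/* last user: prints "fini" */
	return 0;
}
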
+ */ +int ptlrpc_replay_next(struct obd_import *imp, int *inflight) +{ + int rc = 0; + struct list_head *tmp, *pos; + struct ptlrpc_request *req = NULL; + __u64 last_transno; + ENTRY; + + *inflight = 0; + + /* It might have committed some after we last spoke, so make sure we + * get rid of them now. + */ + spin_lock(&imp->imp_lock); + imp->imp_last_transno_checked = 0; + ptlrpc_free_committed(imp); + last_transno = imp->imp_last_replay_transno; + + CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n", + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); + + /* Replay all the committed open requests on committed_list first */ + if (!list_empty(&imp->imp_committed_list)) { + tmp = imp->imp_committed_list.prev; + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* The last request on committed_list hasn't been replayed */ + if (req->rq_transno > last_transno) { + if (!imp->imp_resend_replay || + imp->imp_replay_cursor == &imp->imp_committed_list) + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + + while (imp->imp_replay_cursor != + &imp->imp_committed_list) { + req = list_entry(imp->imp_replay_cursor, + struct ptlrpc_request, + rq_replay_list); + if (req->rq_transno > last_transno) + break; + + req = NULL; + LASSERT(!list_empty(imp->imp_replay_cursor)); + imp->imp_replay_cursor = + imp->imp_replay_cursor->next; + } + } else { + /* All requests on committed_list have been replayed */ + imp->imp_replay_cursor = &imp->imp_committed_list; + req = NULL; + } + } + + /* All the requests in committed list have been replayed, let's replay + * the imp_replay_list */ + if (req == NULL) { + list_for_each_safe(tmp, pos, &imp->imp_replay_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + if (req->rq_transno > last_transno) + break; + req = NULL; + } + } + + /* If need to resend the last sent transno (because a reconnect + * has occurred), then stop on the matching req and send it again. + * If, however, the last sent transno has been committed then we + * continue replay from the next request. */ + if (req != NULL && imp->imp_resend_replay) + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + + /* ptlrpc_prepare_replay() may fail to add the reqeust into unreplied + * list if the request hasn't been added to replay list then. Another + * exception is that resend replay could have been removed from the + * unreplied list. */ + if (req != NULL && list_empty(&req->rq_unreplied_list)) { + DEBUG_REQ(D_HA, req, "resend_replay: %d, last_transno: %llu\n", + imp->imp_resend_replay, last_transno); + ptlrpc_add_unreplied(req); + imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); + } + + imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); + + if (req != NULL) { + LASSERT(!list_empty(&req->rq_unreplied_list)); + + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req " + "%llu\n", rc, req->rq_xid); + RETURN(rc); + } + *inflight = 1; + } + RETURN(rc); +} + +/** + * Schedule resending of request on sending_list. This is done after + * we completed replaying of requests and locks. + */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct ptlrpc_request *req, *next; + + ENTRY; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + /* Well... what if lctl recover is called twice at the same time? 
+ */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + RETURN(-1); + } + + list_for_each_entry_safe(req, next, &imp->imp_sending_list, rq_list) { + LASSERTF((long)req > PAGE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + + /* If the request is allowed to be sent during replay and it + * is not timeout yet, then it does not need to be resent. */ + if (!ptlrpc_no_resend(req) && + (req->rq_timedout || !req->rq_allow_replay)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2); + RETURN(0); +} + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + ENTRY; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, + lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, " + "auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); + + EXIT; +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + ENTRY; + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. 
*/ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator " + "request\n", obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + rc = ptlrpc_connect_import(imp); + if (rc) + GOTO(out, rc); + + if (!async) { + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + EXIT; + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED || + imp->imp_state == LUSTRE_IMP_DISCON || + imp->imp_obd->obd_no_recov) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + + return in_recovery; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c new file mode 100644 index 0000000000000..92d39ece51d16 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -0,0 +1,2580 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/*********************************************** + * policy registers * + ***********************************************/ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "gssnull")) + return SPTLRPC_FLVR_GSSNULL; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + if (!strcmp(name, "skn")) + 
return SPTLRPC_FLVR_SKN; + if (!strcmp(name, "ska")) + return SPTLRPC_FLVR_SKA; + if (!strcmp(name, "ski")) + return SPTLRPC_FLVR_SKI; + if (!strcmp(name, "skpi")) + return SPTLRPC_FLVR_SKPI; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_GSSNULL)) + return "gssnull"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKN)) + return "skn"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKA)) + return "ska"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKI)) + return "ski"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_SKPI)) + return "skpi"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc)); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strncat(buf, bspec, bufsize); + } + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/************************************************** + * client context APIs * + **************************************************/ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); + vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, create, + remove_dead); +} + +struct ptlrpc_cli_ctx 
*sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->die); + ctx->cc_ops->die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < ktime_get_real_seconds()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + spin_unlock(&imp->imp_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, NULL); +} + +/** + * Get and validate the client side ptlrpc security facilities from + * \a imp. There is a race condition on client reconnect when the import is + * being destroyed while there are outstanding client bound requests. In + * this case do not output any error messages if import secuity is not + * found. + * + * \param[in] imp obd import associated with client + * \param[out] sec client side ptlrpc security + * + * \retval 0 if security retrieved successfully + * \retval -ve errno if there was a problem + */ +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + /* Only output an error when the import is still active */ + if (*sec == NULL) { + if (list_empty(&imp->imp_zombie_chain)) + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate an appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. 
+ */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + ENTRY; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + RETURN(rc); + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + RETURN(-ECONNREFUSED); + } + + RETURN(0); +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + ENTRY; + + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* request might be asked to release earlier while still + * in the context waiting list. + */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; + EXIT; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqlen); + LASSERT(req->rq_replen); + + CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), " + "switch sec %p(%s) -> %p(%s)\n", req, + oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec), + newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec), + oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name, + newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. 
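+ *
+ * If the lookup returns the same dead context (usually because the system
+ * is too busy to create a new one) we only pause briefly; if the new
+ * context is not yet uptodate, no message switch is performed either.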
+ */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + ENTRY; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); + } else if (unlikely(test_bit(PTLRPC_CTX_UPTODATE_BIT, &newctx->cc_flags) + == 0)) { + /* + * new ctx not up to date yet + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, not up to date yet\n", + newctx, newctx->cc_flags); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. + */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +int ctx_refresh_timeout(void *data) +{ + struct ptlrpc_request *req = data; + int rc; + + /* conn_cnt is needed in expire_one_request */ + lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); + + rc = ptlrpc_expire_one_request(req, 1); + /* if we started recovery, we should mark this ctx dead; otherwise + * in case of lgssd died nobody would retire this ctx, following + * connecting will still find the same ctx thus cause deadlock. + * there's an assumption that expire time of the request should be + * later than the context refresh expire time. + */ + if (rc == 0) + req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0); + return rc; +} + +static +void ctx_refresh_interrupt(void *data) +{ + struct ptlrpc_request *req = data; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - < 0: don't wait + * - = 0: wait until success or fatal error occur + * - > 0: timeout value (in seconds) + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + struct l_wait_info lwi; + int rc; + ENTRY; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + RETURN(0); + + /* + * during the process a request's context might change type even + * (e.g. 
from gss ctx to null ctx), so each loop we need to re-check + * everything + */ +again: + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + RETURN(rc); + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", + req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); + req_off_ctx_list(req, ctx); + sptlrpc_req_replace_dead_ctx(req); + ctx = req->rq_cli_ctx; + } + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) + RETURN(0); + + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { + LASSERT(ctx->cc_ops->refresh); + ctx->cc_ops->refresh(ctx); + } + LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); + + LASSERT(ctx->cc_ops->validate); + if (ctx->cc_ops->validate(ctx) == 0) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + req_off_ctx_list(req, ctx); + RETURN(-EPERM); + } + + /* + * There's a subtle issue for resending RPCs, suppose following + * situation: + * 1. the request was sent to server. + * 2. recovery was kicked start, after finished the request was + * marked as resent. + * 3. resend the request. + * 4. old reply from server received, we accept and verify the reply. + * this has to be success, otherwise the error will be aware + * by application. + * 5. new reply from server received, dropped by LNet. + * + * Note the xid of old & new request is the same. We can't simply + * change xid for the resent request because the server replies on + * it for reply reconstruction. + * + * Commonly the original context should be uptodate because we + * have an expiry nice time; server will keep its context because + * we at least hold a ref of old context which prevent context + * from destroying RPC being sent. So server still can accept the + * request and finish the RPC. But if that's not the case: + * 1. If server side context has been trimmed, a NO_CONTEXT will + * be returned, gss_cli_ctx_verify/unseal will switch to new + * context by force. + * 2. Current context never be refreshed, then we are fine: we + * never really send request with old context before. 
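+ *
+ * In short: a resent request whose context is still uptodate keeps using
+ * that context, which is what the check below implements.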
+ */ + if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && + unlikely(req->rq_reqmsg) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + req_off_ctx_list(req, ctx); + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(-EINTR); + } + + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc) { + LASSERT(ctx == req->rq_cli_ctx); + CERROR("req %p: failed to replace dead ctx %p: %d\n", + req, ctx, rc); + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + + ctx = req->rq_cli_ctx; + goto again; + } + + /* + * Now we're sure this context is during upcall, add myself into + * waiting list + */ + spin_lock(&ctx->cc_lock); + if (list_empty(&req->rq_ctx_chain)) + list_add(&req->rq_ctx_chain, &ctx->cc_req_list); + spin_unlock(&ctx->cc_lock); + + if (timeout < 0) + RETURN(-EWOULDBLOCK); + + /* Clear any flags that may be present from previous sends */ + LASSERT(req->rq_receiving_reply == 0); + spin_lock(&req->rq_lock); + req->rq_err = 0; + req->rq_timedout = 0; + req->rq_resend = 0; + req->rq_restart = 0; + spin_unlock(&req->rq_lock); + + lwi = LWI_TIMEOUT_INTR(msecs_to_jiffies(timeout * MSEC_PER_SEC), + ctx_refresh_timeout, + ctx_refresh_interrupt, req); + rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi); + + /* + * following cases could lead us here: + * - successfully refreshed; + * - interrupted; + * - timedout, and we don't want recover from the failure; + * - timedout, and waked up upon recovery finished; + * - someone else mark this ctx dead by force; + * - someone invalidate the req and call ptlrpc_client_wake_req(), + * e.g. ptlrpc_abort_inflight(); + */ + if (!cli_ctx_is_refreshed(ctx)) { + /* timed out or interruptted */ + req_off_ctx_list(req, ctx); + + LASSERT(rc != 0); + RETURN(rc); + } + + goto again; +} + +/** + * Initialize flavor settings for \a req, according to \a opcode. 
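+ * The flavor is copied from the owning sec under ps_lock, then the rpc
+ * service level, the user descriptor flag and the bulk flag are adjusted
+ * according to the opcode and the sec flags.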
+ * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags according to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + RETURN(-ENOMEM); + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(-EACCES); + } + + req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!req) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(req); + atomic_set(&req->rq_refcount, 10000); + + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, 0); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + ptlrpc_request_cache_free(req); + + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. After this function called, + * req->rq_reqmsg is still accessible as clear text. 
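+ *
+ * Bulk data, if any, is wrapped first; the request message itself is then
+ * either signed (null/auth/integrity service) or sealed (privacy service)
+ * via the context operations.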
+ */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + RETURN(rc); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + RETURN(rc); +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x%llu\n", req->rq_xid); + RETURN(-EPROTO); + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + RETURN(-EPROTO); + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + RETURN(-EPROTO); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. + */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. 
We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + ENTRY; + + early_req = ptlrpc_request_cache_alloc(GFP_NOFS); + if (early_req == NULL) + RETURN(-ENOMEM); + + ptlrpc_cli_req_init(early_req); + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) + GOTO(err_req, rc = -ENOMEM); + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EPROTO); + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EINVAL); + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "error %d unwrap early reply", rc); + GOTO(err_ctx, rc); + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + RETURN(0); + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); +err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + ptlrpc_request_cache_free(early_req); + RETURN(rc); +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + ptlrpc_request_cache_free(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. 
null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/************************************************** + * client side high-level security APIs * + **************************************************/ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroied\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking refrence of import + */ +static +struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + ENTRY; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 0x%x\n", sf->sf_rpc); + RETURN(NULL); + } + } + + sec = policy->sp_cops->create_sec(imp, svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + RETURN(sec); +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + spin_lock(&imp->imp_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + spin_unlock(&imp->imp_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + 
spin_lock(&imp->imp_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + spin_unlock(&imp->imp_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + ENTRY; + + might_sleep(); + + if (imp == NULL) + RETURN(0); + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. + */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming reqeust */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + GOTO(out, rc); + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NIDNET(conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + mutex_lock(&imp->imp_sec_mutex); + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + + mutex_unlock(&imp->imp_sec_mutex); +out: + sptlrpc_sec_put(sec); + RETURN(rc); +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() */ + import_flush_ctx_common(imp, 0, 
1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), + 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. + */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. 
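+ *
+ * Nothing is done if the segment already has the requested size; otherwise
+ * the work is delegated to the security policy's enlarge_reqbuf hook.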
+ */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + const struct req_msg_field *field, + int newsize) +{ + struct req_capsule *pill = &req->rq_pill; + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + int segment = __req_capsule_offset(pill, field, RCL_CLIENT); + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + RETURN(0); + + policy = ctx->cc_sec->ps_policy; + RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize)); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; + EXIT; +} + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/**************************************** + * server side security * + ****************************************/ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? 
*/ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* make the new flavor as "current", and old ones as + * about-to-expire */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = ktime_get_real_seconds() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* most cases should return here, we only interested in + * gss root ctx init */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, " + "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the middle one (%lld)\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[0] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + 
CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. */ + if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { + if (exp->exp_flvr_expire[1] >= ktime_get_real_seconds()) { + if (flavor_allowed(&exp->exp_flvr_old[1], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (%lld)\n", + exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + (s64)(exp->exp_flvr_expire[1] - + ktime_get_real_seconds())); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark oldest expired\n"); + exp->exp_flvr_expire[1] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", + exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + + spin_unlock(&exp->exp_lock); + + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+lld)|%x(%+lld)\n", + exp, exp->exp_obd->obd_name, + req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, + req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (s64)(exp->exp_flvr_expire[0] - ktime_get_real_seconds()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (s64)(exp->exp_flvr_expire[1] - ktime_get_real_seconds()) : 0); + return -EACCES; +} +EXPORT_SYMBOL(sptlrpc_target_export_check); + +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct obd_export *exp; + struct sptlrpc_flavor new_flvr; + + LASSERT(obd); + + spin_lock(&obd->obd_dev_lock); + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + if (exp->exp_connection == NULL) + continue; + + /* note if this export had just been updated flavor + * (exp_flvr_changed == 1), this will override the + * previous one. 
*/ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + exp->exp_connection->c_peer.nid, + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with an incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. + */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + ENTRY; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x%llu\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + RETURN(SECSVC_DROP); + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = -1; /* set to INVALID_UID */ + req->rq_auth_mapped_uid = -1; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the comming inner msg unpacking. 
+ */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + RETURN(rc); +} + +/** + * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + if (svcpt->scp_service->srv_max_reply_size < + msglen + sizeof(struct ptlrpc_reply_state)) { + /* Just return failure if the size is too big */ + CERROR("size of message is too big (%zd), %d allowed\n", + msglen + sizeof(struct ptlrpc_reply_state), + svcpt->scp_service->srv_max_reply_size); + RETURN(-ENOMEM); + } + + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(svcpt); + if (rs == NULL) + RETURN(-ENOMEM); + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to perform transformation upon reply message. + * + * \post req->rq_reply_off is set to approriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to free reply_state. 
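+ *
+ * The reply state is released through the policy's free_rs hook; states
+ * taken from the emergency pool (rs_prealloc) are returned to it afterwards.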
+ */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + ENTRY; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); + EXIT; +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/**************************************** + * bulk security * + ****************************************/ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. + */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d\n", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + +#ifdef HAVE_SERVER_SUPPORT +/** + * Performe transformation upon outgoing bulk read. 
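+ *
+ * Only effective when the request carries bulk security (req->rq_pack_bulk);
+ * the actual work is delegated to the policy's wrap_bulk hook.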
+ */ +int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_read); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->wrap_bulk) + return ctx->sc_policy->sp_sops->wrap_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_wrap_bulk); + +/** + * Performe transformation upon incoming bulk write. + */ +int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_write); + + /* + * if it's in privacy mode, transferred should >= expected; otherwise + * transferred should == expected. + */ + if (desc->bd_nob_transferred < desc->bd_nob || + (desc->bd_nob_transferred > desc->bd_nob && + SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != + SPTLRPC_BULK_SVC_PRIV)) { + DEBUG_REQ(D_ERROR, req, "truncated bulk GET %d(%d)", + desc->bd_nob_transferred, desc->bd_nob); + return -ETIMEDOUT; + } + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->unwrap_bulk) { + rc = ctx->sc_policy->sp_sops->unwrap_bulk(req, desc); + if (rc) + CERROR("error unwrap bulk: %d\n", rc); + } + + /* return 0 to allow reply be sent */ + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_unwrap_bulk); + +/** + * Prepare buffers for incoming bulk write. + */ +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->prep_bulk) + return ctx->sc_policy->sp_sops->prep_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_prep_bulk); + +#endif /* HAVE_SERVER_SUPPORT */ + +/**************************************** + * user descriptor helpers * + ****************************************/ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = from_kuid(&init_user_ns, current_uid()); + pud->pud_gid = from_kgid(&init_user_ns, current_gid()); + pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); + pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); + pud->pud_cap = cfs_curproc_cap_pack(); + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + if (pud->pud_ngroups > current_ngroups) + pud->pud_ngroups = current_ngroups; +#ifdef HAVE_GROUP_INFO_GID + memcpy(pud->pud_groups, current_cred()->group_info->gid, + pud->pud_ngroups * sizeof(__u32)); +#else /* !HAVE_GROUP_INFO_GID */ + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); +#endif /* HAVE_GROUP_INFO_GID */ + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + 
__swab32s(&pud->pud_ngroups); + } + + if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is too large\n", pud->pud_ngroups); + return -EINVAL; + } + + if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > + msg->lm_buflens[offset]) { + CERROR("%u groups are claimed but bufsize only %u\n", + pud->pud_ngroups, msg->lm_buflens[offset]); + return -EINVAL; + } + + if (swabbed) { + for (i = 0; i < pud->pud_ngroups; i++) + __swab32s(&pud->pud_groups[i]); + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_unpack_user_desc); + +/**************************************** + * misc helpers * + ****************************************/ + +const char * sec2target_str(struct ptlrpc_sec *sec) +{ + if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) + return "*"; + if (sec_is_reverse(sec)) + return "c"; + return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); +} +EXPORT_SYMBOL(sec2target_str); + +/* + * return true if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + +/**************************************** + * crypto API helper/alloc blkciper * + ****************************************/ + +/**************************************** + * initialize/finalize * + ****************************************/ + +int sptlrpc_init(void) +{ + int rc; + + rwlock_init(&policy_lock); + + rc = sptlrpc_gc_init(); + if (rc) + goto out; + + rc = sptlrpc_conf_init(); + if (rc) + goto out_gc; + + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + + rc = sptlrpc_null_init(); + if (rc) + goto out_pool; + + rc = sptlrpc_plain_init(); + if (rc) + goto out_null; + + rc = sptlrpc_lproc_init(); + if (rc) + goto out_plain; + + return 0; + +out_plain: + sptlrpc_plain_fini(); +out_null: + sptlrpc_null_fini(); +out_pool: + sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); +out_gc: + sptlrpc_gc_fini(); +out: + return rc; +} + +void sptlrpc_fini(void) +{ + sptlrpc_lproc_fini(); + sptlrpc_plain_fini(); + sptlrpc_null_fini(); + sptlrpc_enc_pool_fini(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c new file mode 100644 index 0000000000000..748962d4f17f0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -0,0 +1,960 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int mult = 20 - PAGE_SHIFT; +static int enc_pool_max_memory_mb; +module_param(enc_pool_max_memory_mb, int, 0644); +MODULE_PARM_DESC(enc_pool_max_memory_mb, + "Encoding pool max memory (MB), 1/8 of total physical memory by default"); + +/**************************************** + * bulk encryption page pools * + ****************************************/ + + +#define PTRS_PER_PAGE (PAGE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (PTRS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + /* + * constants + */ + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + time64_t epp_last_shrink; + time64_t epp_last_access; + + /* + * in-pool pages bookkeeping + */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* + * statistics + */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + cfs_time_t epp_st_max_wait; /* in jeffies */ + unsigned long epp_st_outofmem; /* # of out of mem requests */ + /* + * pointers to pools, may be vmalloc'd + */ + struct page ***epp_pools; +} page_pools; + +/* + * memory shrinker + */ +static const int pools_shrinker_seeks = DEFAULT_SEEKS; +static struct shrinker *pools_shrinker; + + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) +{ + spin_lock(&page_pools.epp_lock); + + seq_printf(m, "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %lds\n" + "last access: %lds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time: %ld/%lu\n" + "out of mem: %lu\n", + totalram_pages, PAGES_PER_POOL, + page_pools.epp_max_pages, + 
page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + (long)(ktime_get_seconds() - page_pools.epp_last_shrink), + (long)(ktime_get_seconds() - page_pools.epp_last_access), + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, + msecs_to_jiffies(MSEC_PER_SEC), + page_pools.epp_st_outofmem); + + spin_unlock(&page_pools.epp_lock); + return 0; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 : + (page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) + sc->nr_to_scan = 0; + else + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = ktime_get_real_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. 
+ */ + if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} + +#ifndef HAVE_SHRINKER_COUNT +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker* shrinker = NULL; +#endif + + enc_pools_shrink_scan(shrinker, &scv); + + return enc_pools_shrink_count(shrinker, &scv); +} + +#endif /* HAVE_SHRINKER_COUNT */ + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. + */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. 
+ */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL -1) / + PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC(pools, npools * sizeof(*pools)); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE(pools, npools * sizeof(*pools)); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + +static inline void enc_pools_wakeup(void) +{ + assert_spin_locked(&page_pools.epp_lock); + + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(waitqueue_active(&page_pools.epp_waitq)); + wake_up_all(&page_pools.epp_waitq); + } +} + +static int enc_pools_should_grow(int page_needed, time64_t now) +{ + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just + * happened a moment ago, but this may cause deadlock if both + * client and ost live on single node. + */ + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? 
+ */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * Export the number of free pages in the pool + */ +int get_free_pages_in_pool(void) +{ + return page_pools.epp_free_pages; +} +EXPORT_SYMBOL(get_free_pages_in_pool); + +/* + * Let outside world know if enc_pool full capacity is reached + */ +int pool_is_at_full_capacity(void) +{ + return (page_pools.epp_total_pages == page_pools.epp_max_pages); +} +EXPORT_SYMBOL(pool_is_at_full_capacity); + +/* + * we allocate the requested pages atomically. + */ +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + wait_queue_entry_t waitlink; + unsigned long this_idle = -1; + cfs_time_t tick = 0; + long now; + int p_idx, g_idx; + int i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (GET_ENC_KIOV(desc) != NULL) + return 0; + + OBD_ALLOC_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * sizeof(*GET_ENC_KIOV(desc))); + if (GET_ENC_KIOV(desc) == NULL) + return -ENOMEM; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick == 0) + tick = cfs_time_current(); + + now = ktime_get_real_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += desc->bd_iov_count; + + if (enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (page_pools.epp_growing) { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_waitqueue_entry(&waitlink, current); + add_wait_queue(&page_pools.epp_waitq, + &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, + &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } else { + /* ptlrpcd thread should not sleep in that case, + * or deadlock may occur! + * Instead, return -ENOMEM so that upper layers + * will put request back in queue. 
*/ + page_pools.epp_st_outofmem++; + spin_unlock(&page_pools.epp_lock); + OBD_FREE_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * + sizeof(*GET_ENC_KIOV(desc))); + GET_ENC_KIOV(desc) = NULL; + return -ENOMEM; + } + } + + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick != 0)) { + tick = cfs_time_current() - tick; + if (tick > page_pools.epp_st_max_wait) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= desc->bd_iov_count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + BD_GET_ENC_KIOV(desc, i).kiov_page = + page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = ktime_get_real_seconds(); + + spin_unlock(&page_pools.epp_lock); + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int p_idx, g_idx; + int i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + if (GET_ENC_KIOV(desc) == NULL) + return; + + LASSERT(desc->bd_iov_count > 0); + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= + page_pools.epp_total_pages); + LASSERT(page_pools.epp_pools[p_idx]); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page != NULL); + LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); + + page_pools.epp_pools[p_idx][g_idx] = + BD_GET_ENC_KIOV(desc, i).kiov_page; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += desc->bd_iov_count; + + enc_pools_wakeup(); + + spin_unlock(&page_pools.epp_lock); + + OBD_FREE_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * sizeof(*GET_ENC_KIOV(desc))); + GET_ENC_KIOV(desc) = NULL; +} + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. 
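+ * a caller that finds the pools empty (and not already growing) seeds them
+ * with 2 * PTLRPC_MAX_BRW_PAGES pages.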
+ */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +int sptlrpc_enc_pool_init(void) +{ + DEF_SHRINKER_VAR(shvar, enc_pools_shrink, + enc_pools_shrink_count, enc_pools_shrink_scan); + + page_pools.epp_max_pages = totalram_pages / 8; + if (enc_pool_max_memory_mb > 0 && + enc_pool_max_memory_mb <= (totalram_pages >> mult)) + page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; + + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = ktime_get_real_seconds(); + page_pools.epp_last_access = ktime_get_real_seconds(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = 0; + page_pools.epp_st_outofmem = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + pools_shrinker = set_shrinker(pools_shrinker_seeks, &shvar); + if (pools_shrinker == NULL) { + enc_pools_free(); + return -ENOMEM; + } + + return 0; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(pools_shrinker); + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + remove_shrinker(pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%lu, out of mem %lu\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, + msecs_to_jiffies(MSEC_PER_SEC), + page_pools.epp_st_outofmem); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + 
[BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char * sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) { + __swab32s(&bsd->bsd_nob); + } + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +/* + * Compute the checksum of an RPC buffer payload. If the return \a buflen + * is not large enough, truncate the result to fit so that it is possible + * to use a hash function with a large hash space, but only use a part of + * the resulting hash. + */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct cfs_crypto_hash_desc *hdesc; + int hashsize; + unsigned int bufsize; + int i, err; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(hdesc); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(hdesc, + BD_GET_KIOV(desc, i).kiov_page, + BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK, + BD_GET_KIOV(desc, i).kiov_len); + } + + if (hashsize > buflen) { + unsigned char hashbuf[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + + bufsize = sizeof(hashbuf); + LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", + bufsize, hashsize); + err = cfs_crypto_hash_final(hdesc, hashbuf, &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(hdesc, buf, &bufsize); + } + + return err; +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c new file mode 100644 index 0000000000000..550abeafceea0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -0,0 +1,967 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +const char *sptlrpc_part2name(enum lustre_sec_part part) +{ + switch (part) { + case LUSTRE_SP_CLI: + return "cli"; + case LUSTRE_SP_MDT: + return "mdt"; + case LUSTRE_SP_OST: + return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; + case LUSTRE_SP_MGS: + return "mgs"; + case LUSTRE_SP_ANY: + return "any"; + default: + return "err"; + } +} +EXPORT_SYMBOL(sptlrpc_part2name); + +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) +{ + const char *type = obd->obd_type->typ_name; + + if (!strcmp(type, LUSTRE_MDT_NAME)) + return LUSTRE_SP_MDT; + if (!strcmp(type, LUSTRE_OST_NAME)) + return LUSTRE_SP_OST; + if (!strcmp(type, LUSTRE_MGS_NAME)) + return LUSTRE_SP_MGS; + + CERROR("unknown target %p(%s)\n", obd, type); + return LUSTRE_SP_ANY; +} + +/**************************************** + * user supplied flavor string parsing * + ****************************************/ + +/* + * format: [-] + */ +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) +{ + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); + + if (str == NULL || str[0] == '\0') { + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; + return 0; + } + + strlcpy(buf, str, sizeof(buf)); + + bulk = strchr(buf, '-'); + if (bulk) + *bulk++ = '\0'; + + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; + + /* + * currently only base flavor "plain" can have bulk specification. 
+ */ + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } + + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } + + flvr->sf_flags = 0; + return 0; + +err_out: + CERROR("invalid flavor string: %s\n", str); + return -EINVAL; +} +EXPORT_SYMBOL(sptlrpc_parse_flavor); + +/**************************************** + * configure rules * + ****************************************/ + +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + +static void sptlrpc_rule_init(struct sptlrpc_rule *rule) +{ + rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); + rule->sr_from = LUSTRE_SP_ANY; + rule->sr_to = LUSTRE_SP_ANY; + rule->sr_padding = 0; + + get_default_flavor(&rule->sr_flvr); +} + +/* + * format: network[.direction]=flavor + */ +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) +{ + char *flavor, *dir; + int rc; + + sptlrpc_rule_init(rule); + + flavor = strchr(param, '='); + if (flavor == NULL) { + CERROR("invalid param, no '='\n"); + RETURN(-EINVAL); + } + *flavor++ = '\0'; + + dir = strchr(param, '.'); + if (dir) + *dir++ = '\0'; + + /* 1.1 network */ + if (strcmp(param, "default")) { + rule->sr_netid = libcfs_str2net(param); + if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) { + CERROR("invalid network name: %s\n", param); + RETURN(-EINVAL); + } + } + + /* 1.2 direction */ + if (dir) { + if (!strcmp(dir, "mdt2ost")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "mdt2mdt")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_MDT; + } else if (!strcmp(dir, "cli2ost")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "cli2mdt")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_MDT; + } else { + CERROR("invalid rule dir segment: %s\n", dir); + RETURN(-EINVAL); + } + } + + /* 2.1 flavor */ + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); + if (rc) + RETURN(-EINVAL); + + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_parse_rule); + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) +{ + LASSERT(rset->srs_nslot || + (rset->srs_nrule == 0 && rset->srs_rules == NULL)); + + if (rset->srs_nslot) { + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + sptlrpc_rule_set_init(rset); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_free); + +/* + * return 0 if the rule set could accomodate one more rule. 
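+ * the slot array grows by 8 entries at a time and existing rules are
+ * copied into the new allocation, so the caller must be able to sleep.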
+ */ +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *rules; + int nslot; + + might_sleep(); + + if (rset->srs_nrule < rset->srs_nslot) + return 0; + + nslot = rset->srs_nslot + 8; + + /* better use realloc() if available */ + OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules)); + if (rules == NULL) + return -ENOMEM; + + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); + + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + } + + rset->srs_rules = rules; + rset->srs_nslot = nslot; + return 0; +} + +static inline int rule_spec_dir(struct sptlrpc_rule *rule) +{ + return (rule->sr_from != LUSTRE_SP_ANY || + rule->sr_to != LUSTRE_SP_ANY); +} +static inline int rule_spec_net(struct sptlrpc_rule *rule) +{ + return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY)); +} +static inline int rule_match_dir(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); +} +static inline int rule_match_net(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_netid == r2->sr_netid); +} + +/* + * merge @rule into @rset. + * the @rset slots might be expanded. + */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
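+ * rules are checked in order; the first rule whose network, from and to
+ * parts each match (or are wildcards) supplies the flavor.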
+ */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) && + r->sr_netid != LNET_NIDNET(LNET_NID_ANY) && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} + +static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, + struct sptlrpc_rule_set *tgt, + enum lustre_sec_part from, + enum lustre_sec_part to, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule_set *src[2] = { gen, tgt }; + struct sptlrpc_rule *rule; + int i, n, rc; + + might_sleep(); + + /* merge general rules firstly, then target-specific rules */ + for (i = 0; i < 2; i++) { + if (src[i] == NULL) + continue; + + for (n = 0; n < src[i]->srs_nrule; n++) { + rule = &src[i]->srs_rules[n]; + + if (from != LUSTRE_SP_ANY && + rule->sr_from != LUSTRE_SP_ANY && + rule->sr_from != from) + continue; + if (to != LUSTRE_SP_ANY && + rule->sr_to != LUSTRE_SP_ANY && + rule->sr_to != to) + continue; + + rc = sptlrpc_rule_set_merge(rset, rule); + if (rc) { + CERROR("can't merge: %d\n", rc); + return rc; + } + } + } + + return 0; +} + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static struct list_head sptlrpc_confs; + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + 
strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, &conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + if (strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)) >= + sizeof(conf->sc_fsname)) { + OBD_FREE_PTR(conf); + return NULL; + } + sptlrpc_rule_set_init(&conf->sc_rset); + INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. + */ +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(struct lustre_cfg *lcfg, + struct sptlrpc_conf *conf) +{ + char *target, *param; + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + int rc; + + ENTRY; + print_lustre_cfg(lcfg); + + target = lustre_cfg_string(lcfg, 1); + if (target == NULL) { + CERROR("missing target name\n"); + RETURN(-EINVAL); + } + + param = lustre_cfg_string(lcfg, 2); + if (param == NULL) { + CERROR("missing parameter\n"); + RETURN(-EINVAL); + } + + CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + RETURN(-EINVAL); + } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + RETURN(-EINVAL); + + if (conf == NULL) { + obdname2fsname(target, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + + if (rc == 0) + conf->sc_modified++; + + RETURN(rc); +} + +int sptlrpc_process_config(struct lustre_cfg *lcfg) +{ + return __sptlrpc_process_config(lcfg, NULL); +} +EXPORT_SYMBOL(sptlrpc_process_config); + +static int logname2fsname(const char *logname, char *buf, int buflen) +{ + char *ptr; + int len; + + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) { + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } + + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + 
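+/**
+ * an updated sptlrpc config log is about to be replayed for this fs:
+ * discard any locally loaded copy of the rules and clear the modified
+ * counter before the new records are processed.
+ */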
+void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + if (conf->sc_local) { + LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); + } + conf->sc_modified = 0; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); + +/** + * mark a config log has been updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static void inline flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody share a single sec. 
+ */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } + + /* Some flavors use a single uid (0) context */ + if (flvr_is_rootonly(sf->sf_rpc)) + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + + /* User descriptor might need to be cleared */ + if (flvr_allows_user_desc(sf->sf_rpc) == 0) + sf->sf_flags &= ~PTLRPC_SEC_FL_UDESC; +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + obd_uuid2fsname(name, target->uuid, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, nid, sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). + */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + ENTRY; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) == 0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + down_read_nested(&obd->u.cli.cl_sem, OBD_CLI_SEM_MDCOSC); + + imp = obd->u.cli.cl_import; + if (imp) { + spin_lock(&imp->imp_lock); + if (imp->imp_sec) + imp->imp_sec_expire = ktime_get_real_seconds() + + SEC_ADAPT_DELAY; + spin_unlock(&imp->imp_lock); + } + + up_read(&obd->u.cli.cl_sem); + EXIT; +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +/** + * called by target devices, extract sptlrpc rules which applies to + * this target, to be used for future rpc flavor checking. 
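+ * only MDT and OST obd types are accepted; the fs general rules and any
+ * target specific rules are merged into \a rset.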
+ */ +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + enum lustre_sec_part sp_dst; + char fsname[MTI_NAME_MAXLEN]; + int rc = 0; + ENTRY; + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + sp_dst = LUSTRE_SP_MDT; + } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) { + sp_dst = LUSTRE_SP_OST; + } else { + CERROR("unexpected obd type %s\n", obd->obd_type->typ_name); + RETURN(-EINVAL); + } + + obd_uuid2fsname(fsname, obd->obd_uuid.uuid, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("missing sptlrpc config log\n"); + rc = -EFAULT; + } else { + /* extract rule set for this target */ + conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0); + + rc = sptlrpc_rule_set_extract(&conf->sc_rset, + conf_tgt ? &conf_tgt->sct_rset : NULL, + LUSTRE_SP_ANY, sp_dst, rset); + } + mutex_unlock(&sptlrpc_conf_lock); + + RETURN(rc); +} + +int sptlrpc_conf_init(void) +{ + INIT_LIST_HEAD(&sptlrpc_confs); + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) + sptlrpc_conf_free(conf); + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c new file mode 100644 index 0000000000000..c93e87dcc56d2 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -0,0 +1,99 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, Intel Corporation. 
+ */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/* refine later and change to seqlock or simlar from libcfs */ +/* Debugging check only needed during development */ +#ifdef OBD_CTXT_DEBUG +# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) +# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\ + msg) +# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg) +#else +# define ASSERT_CTXT_MAGIC(magic) do {} while(0) +# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) +# define ASSERT_KERNEL_CTXT(msg) do {} while(0) +#endif + +/* push / pop to root of obd store */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx) +{ + /* if there is underlaying dt_device then push_ctxt is not needed */ + if (new_ctx->dt != NULL) + return; + + //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n"); + ASSERT_CTXT_MAGIC(new_ctx->magic); + OBD_SET_CTXT_MAGIC(save); + + save->fs = get_fs(); + LASSERT(ll_d_count(current->fs->pwd.dentry)); + LASSERT(ll_d_count(new_ctx->pwd)); + save->pwd = dget(current->fs->pwd.dentry); + save->pwdmnt = mntget(current->fs->pwd.mnt); + save->umask = current_umask(); + + LASSERT(save->pwd); + LASSERT(save->pwdmnt); + LASSERT(new_ctx->pwd); + LASSERT(new_ctx->pwdmnt); + + current->fs->umask = 0; /* umask already applied on client */ + set_fs(new_ctx->fs); + ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd); +} +EXPORT_SYMBOL(push_ctxt); + +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx) +{ + /* if there is underlaying dt_device then pop_ctxt is not needed */ + if (new_ctx->dt != NULL) + return; + + ASSERT_CTXT_MAGIC(saved->magic); + ASSERT_KERNEL_CTXT("popping non-kernel context!\n"); + + LASSERTF(current->fs->pwd.dentry == new_ctx->pwd, "%p != %p\n", + current->fs->pwd.dentry, new_ctx->pwd); + LASSERTF(current->fs->pwd.mnt == new_ctx->pwdmnt, "%p != %p\n", + current->fs->pwd.mnt, new_ctx->pwdmnt); + + set_fs(saved->fs); + ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd); + + dput(saved->pwd); + mntput(saved->pwdmnt); + current->fs->umask = saved->umask; +} +EXPORT_SYMBOL(pop_ctxt); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c new file mode 100644 index 0000000000000..f8ec60b1adb01 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c @@ -0,0 +1,252 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +#define SEC_GC_INTERVAL (30 * 60) + + +static struct mutex sec_gc_mutex; +static spinlock_t sec_gc_list_lock; +static struct list_head sec_gc_list; + +static spinlock_t sec_gc_ctx_list_lock; +static struct list_head sec_gc_ctx_list; + +static struct ptlrpc_thread sec_gc_thread; +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec->ps_gc_list, &sec_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + might_sleep(); + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + thread_add_flags(&sec_gc_thread, SVC_SIGNAL); + wake_up(&sec_gc_thread.t_ctl_waitq); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while (!list_empty(&sec_gc_ctx_list)) { + ctx = list_entry(sec_gc_ctx_list.next, + struct ptlrpc_cli_ctx, cc_gc_chain); + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (sec->ps_gc_next > ktime_get_real_seconds()) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; +} + +static int sec_gc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; + struct l_wait_info lwi; + + unshare_fs_struct(); + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + while (1) { + struct ptlrpc_sec *sec; + + thread_clear_flags(thread, SVC_SIGNAL); + sec_process_ctx_list(); +again: + /* go through sec list do gc. 
+ * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + + lwi = LWI_TIMEOUT(msecs_to_jiffies(SEC_GC_INTERVAL * + MSEC_PER_SEC), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_signal(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + return 0; +} + +int sptlrpc_gc_init(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + + mutex_init(&sec_gc_mutex); + spin_lock_init(&sec_gc_list_lock); + spin_lock_init(&sec_gc_ctx_list_lock); + + INIT_LIST_HEAD(&sec_gc_list); + INIT_LIST_HEAD(&sec_gc_ctx_list); + + /* initialize thread control */ + memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); + init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); + + task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); + if (IS_ERR(task)) { + CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); + return PTR_ERR(task); + } + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_running(&sec_gc_thread), &lwi); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + struct l_wait_info lwi = { 0 }; + + thread_set_flags(&sec_gc_thread, SVC_STOPPING); + wake_up(&sec_gc_thread.t_ctl_waitq); + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_stopped(&sec_gc_thread), &lwi); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 0000000000000..96acb183270e4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/ptlrpc/sec_lproc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +struct proc_dir_entry *sptlrpc_proc_root = NULL; +EXPORT_SYMBOL(sptlrpc_proc_root); + +static char *sec_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} + +static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + char str[32]; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); + + seq_printf(seq, "rpc flavor: %s\n", + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); + seq_printf(seq, "id: %d\n", sec->ps_id); + seq_printf(seq, "refcount: %d\n", + atomic_read(&sec->ps_refcount)); + seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); + seq_printf(seq, "gc internal %lld\n", sec->ps_gc_interval); + seq_printf(seq, "gc next %lld\n", + sec->ps_gc_interval ? 
+ (s64)(sec->ps_gc_next - ktime_get_real_seconds()) : 0ll); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_LWP_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_OSP_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ + int rc; + + if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_LWP_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_OSP_NAME) != 0) { + CERROR("can't register lproc for obd type %s\n", + dev->obd_type->typ_name); + return -EINVAL; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, + &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); + +LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool); +static struct lprocfs_vars sptlrpc_lprocfs_vars[] = { + { .name = "encrypt_page_pools", + .fops = &sptlrpc_proc_enc_pool_fops }, + { NULL } +}; + +int sptlrpc_lproc_init(void) +{ + int rc; + + LASSERT(sptlrpc_proc_root == NULL); + + sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root, + sptlrpc_lprocfs_vars, NULL); + if (IS_ERR(sptlrpc_proc_root)) { + rc = PTR_ERR(sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + return rc; + } + return 0; +} + +void sptlrpc_lproc_fini(void) +{ + if (sptlrpc_proc_root) { + lprocfs_remove(&sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + } +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c new file mode 100644 index 0000000000000..52af519a291d7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c @@ -0,0 +1,456 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_null.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static struct ptlrpc_sec_policy null_policy; +static struct ptlrpc_sec null_sec; +static struct ptlrpc_cli_ctx null_cli_ctx; +static struct ptlrpc_svc_ctx null_svc_ctx; + +/* + * we can temporarily use the topmost 8-bits of lm_secflvr to identify + * the source sec part. + */ +static inline +void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +{ + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; +} + +static inline +enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) +{ + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + + if (!req->rq_import->imp_dlm_fake) { + struct obd_device *obd = req->rq_import->imp_obd; + null_encode_sec_part(req->rq_reqbuf, + obd->u.cli.cl_sp_me); + } + req->rq_reqdata_len = req->rq_reqlen; + return 0; +} + +static +int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + req->rq_repmsg = req->rq_repdata; + req->rq_replen = req->rq_repdata_len; + + if (req->rq_early) { + cksums = lustre_msg_get_cksum(req->rq_repdata); + cksumc = lustre_msg_calc_cksum(req->rq_repmsg); + + if (cksumc != cksums) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + return 0; +} + +static +struct ptlrpc_sec *null_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); + + /* general layer has take a module reference for us, because we never + * really destroy the sec, simply release the reference here. 
+ */ + sptlrpc_policy_put(&null_policy); + return &null_sec; +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec == &null_sec); +} + +static +struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + atomic_inc(&null_cli_ctx.cc_refcount); + return &null_cli_ctx; +} + +static +int null_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + return 0; +} + +static +int null_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + if (!req->rq_reqbuf) { + int alloc_size = size_roundup_power2(msgsize); + + LASSERT(!req->rq_pool); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_size; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= msgsize); + memset(req->rq_reqbuf, 0, msgsize); + } + + req->rq_reqmsg = req->rq_reqbuf; + return 0; +} + +static +void null_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + LASSERTF(req->rq_reqmsg == req->rq_reqbuf, + "req %p: reqmsg %p is not reqbuf %p in null sec\n", + req, req->rq_reqmsg, req->rq_reqbuf); + LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, + "req %p: reqlen %d should smaller than buflen %d\n", + req, req->rq_reqlen, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int null_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + /* add space for early replied */ + msgsize += lustre_msg_early_size(); + + msgsize = size_roundup_power2(msgsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, msgsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = msgsize; + return 0; +} + +static +void null_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int null_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + struct lustre_msg *oldbuf = req->rq_reqmsg; + int oldsize, newmsg_size, alloc_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf == req->rq_reqmsg); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); + + /* compute new message size */ + oldsize = req->rq_reqbuf->lm_buflens[segment]; + req->rq_reqbuf->lm_buflens[segment] = newsize; + newmsg_size = lustre_packed_msg_size(oldbuf); + req->rq_reqbuf->lm_buflens[segment] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); + + if (req->rq_reqbuf_len < newmsg_size) { + alloc_size = size_roundup_power2(newmsg_size); + + OBD_ALLOC_LARGE(newbuf, alloc_size); + if (newbuf == NULL) + return -ENOMEM; + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. 
See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size(); + else + req->rq_reply_off = 0; + } else { + __u32 cksum; + + cksum = lustre_msg_calc_cksum(rs->rs_repbuf); + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + req->rq_reply_off = 0; + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .refresh = null_ctx_refresh, + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + 
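+/*
+ * null_init_internal() below sets up the single static null_sec and
+ * null_cli_ctx objects shared by every import using the null flavor.
+ * Both hold permanent references (refcounts pinned at 1), so the
+ * lookup/create/destroy paths above never allocate or free per-import
+ * security state for this policy.
+ */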
+static void null_init_internal(void) +{ + static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", null_policy.sp_name,rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c new file mode 100644 index 0000000000000..a0f192cecf633 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1035 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. 
+ */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/**************************************** + * bulk checksum helpers * + ****************************************/ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + for (i = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_len == 0) + continue; + + ptr = kmap(BD_GET_KIOV(desc, i).kiov_page); + off = BD_GET_KIOV(desc, i).kiov_offset & ~PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(BD_GET_KIOV(desc, i).kiov_page); + return; + } +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + ENTRY; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + RETURN(0); +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + 
struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + int swabbed; + ENTRY; + + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = ptlrpc_rep_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + RETURN(-EPROTO); + } + + if (unlikely(req->rq_early)) { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&cksum, &hsize); + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + RETURN(-EINVAL); + } + } else { + /* whether we sent with bulk or not, we expect the same + * in reply, except for early reply */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? "Missing" : "Unexpected"); + RETURN(-EPROTO); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(-EPROTO); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + RETURN(0); +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + if (req->rq_bulk_read) + RETURN(0); + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if 
(req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (BD_GET_KIOV(desc, i).kiov_len + + nob > desc->bd_nob_transferred) { + BD_GET_KIOV(desc, i).kiov_len = + desc->bd_nob_transferred - nob; + } + nob += BD_GET_KIOV(desc, i).kiov_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = sec2plsec(sec); + ENTRY; + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); + EXIT; +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + RETURN(NULL); + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + RETURN(NULL); + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + RETURN(sec); +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + RETURN(ctx); +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, 
+ struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + /* do nothing unless caller want to flush for 'all' */ + if (uid != -1) + RETURN(0); + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + RETURN(0); +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + EXIT; +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + RETURN(-ENOMEM); + + req->rq_repbuf_len = alloc_len; + RETURN(0); +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + EXIT; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. 
*/ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. */ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + /* Must lock this, so that otherwise unprotected change of + * rq_reqmsg is not racing with parallel processing of + * imp_replay_list traversing threads. See LU-3333 + * This is a bandaid at best, we really need to deal with this + * in request enlarging code before unpacking that's already + * there */ + if (req->rq_import) + spin_lock(&req->rq_import->imp_lock); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_import) + spin_unlock(&req->rq_import->imp_lock); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +/**************************************** + * service apis * + ****************************************/ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static +int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + int swabbed; + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + RETURN(-EPROTO); + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(SECSVC_DROP); + + 
req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + RETURN(SECSVC_OK); +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + ENTRY; + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + RETURN(0); +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); + EXIT; +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + ENTRY; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + else + req->rq_reply_off = 0; + } else { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + 
tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute " + "checksum: %d\n", rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c new file mode 100644 index 0000000000000..d304ec6c2416d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -0,0 +1,3309 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +module_param(test_req_buffer_pressure, int, 0444); +MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); +module_param(at_min, int, 0644); +MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); +module_param(at_max, int, 0644); +MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); +module_param(at_history, int, 0644); +MODULE_PARM_DESC(at_history, + "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_early_margin, int, 0644); +MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); +module_param(at_extra, int, 0644); +MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); + +/** Holds a list of all PTLRPC services */ +struct list_head ptlrpc_all_services; +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +static struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +static void +ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +static int +ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + + 
spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* NB: another thread might have recycled enough rqbds, we + * need to make sure it wouldn't over-allocate, see LU-1212. */ + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) + break; + + rqbd = ptlrpc_alloc_rqbd(svcpt); + + if (rqbd == NULL) { + CERROR("%s: Can't allocate request buffer\n", + svc->srv_name); + rc = -ENOMEM; + break; + } + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_rqbd_allocating == 1); + svcpt->scp_rqbd_allocating--; + + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_RPCTRACE, + "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", + svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, + svcpt->scp_nrqbds_total, rc); + + try_post: + if (post && rc == 0) + rc = ptlrpc_server_post_idle_rqbds(svcpt); + + return rc; +} + +/** + * Part of Rep-Ack logic. + * Puts a lock and its mode into reply state assotiated to request reply. + */ +void +ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, + int mode, bool no_ack, bool convert_lock) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; + + LASSERT(rs != NULL); + LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); + + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; + rs->rs_no_ack = no_ack; + rs->rs_convert_lock = convert_lock; +} +EXPORT_SYMBOL(ptlrpc_save_lock); + + +struct ptlrpc_hr_partition; + +struct ptlrpc_hr_thread { + int hrt_id; /* thread ID */ + spinlock_t hrt_lock; + wait_queue_head_t hrt_waitq; + struct list_head hrt_queue; + struct ptlrpc_hr_partition *hrt_partition; +}; + +struct ptlrpc_hr_partition { + /* # of started threads */ + atomic_t hrp_nstarted; + /* # of stopped threads */ + atomic_t hrp_nstopped; + /* cpu partition id */ + int hrp_cpt; + /* round-robin rotor for choosing thread */ + int hrp_rotor; + /* total number of threads on this partition */ + int hrp_nthrs; + /* threads table */ + struct ptlrpc_hr_thread *hrp_thrs; +}; + +#define HRT_RUNNING 0 +#define HRT_STOPPING 1 + +struct ptlrpc_hr_service { + /* CPU partition table, it's just cfs_cpt_table for now */ + struct cfs_cpt_table *hr_cpt_table; + /** controller sleep waitq */ + wait_queue_head_t hr_waitq; + unsigned int hr_stopping; + /** roundrobin rotor for non-affinity service */ + unsigned int hr_rotor; + /* partition data */ + struct ptlrpc_hr_partition **hr_partitions; +}; + +struct rs_batch { + struct list_head rsb_replies; + unsigned int rsb_n_replies; + struct ptlrpc_service_part *rsb_svcpt; +}; + +/** reply handling service. */ +static struct ptlrpc_hr_service ptlrpc_hr; + +/** + * maximum mumber of replies scheduled in one batch + */ +#define MAX_SCHEDULED 256 + +/** + * Initialize a reply batch. + * + * \param b batch + */ +static void rs_batch_init(struct rs_batch *b) +{ + memset(b, 0, sizeof *b); + INIT_LIST_HEAD(&b->rsb_replies); +} + +/** + * Choose an hr thread to dispatch requests to. 
+ */ +static struct ptlrpc_hr_thread * +ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. + * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + ENTRY; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + EXIT; +} + +void +ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); + assert_spin_locked(&rs->rs_lock); + LASSERT (rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + EXIT; + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + ENTRY; + + rs_batch_init(&batch); + /* Find any replies that have been committed and get their service + * to attend to complete them. */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
*/ + spin_lock(&exp->exp_uncommitted_replies_lock); + list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, + rs_obd_list) { + LASSERT (rs->rs_difficult); + /* VBR: per-export last_committed */ + LASSERT(rs->rs_export); + if (rs->rs_transno <= exp->exp_last_committed) { + list_del_init(&rs->rs_obd_list); + rs_batch_add(&batch, rs); + } + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + rs_batch_fini(&batch); + EXIT; +} + +static int +ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int posted = 0; + + for (;;) { + spin_lock(&svcpt->scp_lock); + + if (list_empty(&svcpt->scp_rqbd_idle)) { + spin_unlock(&svcpt->scp_lock); + return posted; + } + + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + list_del(&rqbd->rqbd_list); + + /* assume we will post successfully */ + svcpt->scp_nrqbds_posted++; + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); + + spin_unlock(&svcpt->scp_lock); + + rc = ptlrpc_register_rqbd(rqbd); + if (rc != 0) + break; + + posted = 1; + } + + spin_lock(&svcpt->scp_lock); + + svcpt->scp_nrqbds_posted--; + list_del(&rqbd->rqbd_list); + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + + /* Don't complain if no request buffers are posted right now; LNET + * won't drop requests because we set the portal lazy! */ + + spin_unlock(&svcpt->scp_lock); + + return -1; +} + +static void ptlrpc_at_timer(unsigned long castmeharder) +{ + struct ptlrpc_service_part *svcpt; + + svcpt = (struct ptlrpc_service_part *)castmeharder; + + svcpt->scp_at_check = 1; + svcpt->scp_at_checktime = cfs_time_current(); + wake_up(&svcpt->scp_waitq); +} + +static void +ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, + struct ptlrpc_service_conf *conf) +{ + struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; + unsigned init; + unsigned total; + unsigned nthrs; + int weight; + + /* + * Common code for estimating & validating threads number. + * CPT affinity service could have percpt thread-pool instead + * of a global thread-pool, which means user might not always + * get the threads number they give it in conf::tc_nthrs_user + * even they did set. It's because we need to validate threads + * number for each CPT to guarantee each pool will have enough + * threads to keep the service healthy. + */ + init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); + init = max_t(int, init, tc->tc_nthrs_init); + + /* NB: please see comments in lustre_lnet.h for definition + * details of these members */ + LASSERT(tc->tc_nthrs_max != 0); + + if (tc->tc_nthrs_user != 0) { + /* In case there is a reason to test a service with many + * threads, we give a less strict check here, it can + * be up to 8 * nthrs_max */ + total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); + nthrs = total / svc->srv_ncpts; + init = max(init, nthrs); + goto out; + } + + total = tc->tc_nthrs_max; + if (tc->tc_nthrs_base == 0) { + /* don't care about base threads number per partition, + * this is most for non-affinity service */ + nthrs = total / svc->srv_ncpts; + goto out; + } + + nthrs = tc->tc_nthrs_base; + if (svc->srv_ncpts == 1) { + int i; + + /* NB: Increase the base number if it's single partition + * and total number of cores/HTs is larger or equal to 4. 
+ * result will always < 2 * nthrs_base */ + weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); + for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ + (tc->tc_nthrs_base >> i) != 0; i++) + nthrs += tc->tc_nthrs_base >> i; + } + + if (tc->tc_thr_factor != 0) { + int factor = tc->tc_thr_factor; + const int fade = 4; + + /* + * User wants to increase number of threads with for + * each CPU core/HT, most likely the factor is larger than + * one thread/core because service threads are supposed to + * be blocked by lock or wait for IO. + */ + /* + * Amdahl's law says that adding processors wouldn't give + * a linear increasing of parallelism, so it's nonsense to + * have too many threads no matter how many cores/HTs + * there are. + */ + if (cpumask_weight(topology_sibling_cpumask(smp_processor_id())) > 1) { + /* weight is # of HTs */ + /* depress thread factor for hyper-thread */ + factor = factor - (factor >> 1) + (factor >> 3); + } + + weight = cfs_cpt_weight(svc->srv_cptable, 0); + + for (; factor > 0 && weight > 0; factor--, weight -= fade) + nthrs += min(weight, fade) * factor; + } + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + nthrs = max(tc->tc_nthrs_base, + tc->tc_nthrs_max / svc->srv_ncpts); + } + out: + nthrs = max(nthrs, tc->tc_nthrs_init); + svc->srv_nthrs_cpt_limit = nthrs; + svc->srv_nthrs_cpt_init = init; + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + CDEBUG(D_OTHER, "%s: This service may have more threads (%d) " + "than the given soft limit (%d)\n", + svc->srv_name, nthrs * svc->srv_ncpts, + tc->tc_nthrs_max); + } +} + +/** + * Initialize percpt data for a service + */ +static int +ptlrpc_service_part_init(struct ptlrpc_service *svc, + struct ptlrpc_service_part *svcpt, int cpt) +{ + struct ptlrpc_at_array *array; + int size; + int index; + int rc; + + svcpt->scp_cpt = cpt; + INIT_LIST_HEAD(&svcpt->scp_threads); + + /* rqbd and incoming request queue */ + spin_lock_init(&svcpt->scp_lock); + INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); + INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); + INIT_LIST_HEAD(&svcpt->scp_req_incoming); + init_waitqueue_head(&svcpt->scp_waitq); + /* history request & rqbd list */ + INIT_LIST_HEAD(&svcpt->scp_hist_reqs); + INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); + + /* acitve requests and hp requests */ + spin_lock_init(&svcpt->scp_req_lock); + + /* reply states */ + spin_lock_init(&svcpt->scp_rep_lock); + INIT_LIST_HEAD(&svcpt->scp_rep_active); + INIT_LIST_HEAD(&svcpt->scp_rep_idle); + init_waitqueue_head(&svcpt->scp_rep_waitq); + atomic_set(&svcpt->scp_nreps_difficult, 0); + + /* adaptive timeout */ + spin_lock_init(&svcpt->scp_at_lock); + array = &svcpt->scp_at_array; + + size = at_est2timeout(at_max); + array->paa_size = size; + array->paa_count = 0; + array->paa_deadline = -1; + + /* allocate memory for scp_at_array (ptlrpc_at_array) */ + OBD_CPT_ALLOC(array->paa_reqs_array, + svc->srv_cptable, cpt, sizeof(struct list_head) * size); + if (array->paa_reqs_array == NULL) + return -ENOMEM; + + for (index = 0; index < size; index++) + INIT_LIST_HEAD(&array->paa_reqs_array[index]); + + OBD_CPT_ALLOC(array->paa_reqs_count, + svc->srv_cptable, cpt, sizeof(__u32) * size); + if (array->paa_reqs_count == NULL) + goto failed; + + setup_timer(&svcpt->scp_at_timer, ptlrpc_at_timer, + (unsigned long)svcpt); + + /* At SOW, service time should be quick; 10s seems generous. If client + * timeout is less than this, we'll be sending an early reply. 
*/ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service * +ptlrpc_register_service(struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + ENTRY; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_table; + + if (!conf->psc_thr.tc_cpu_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + conf->psc_name, cconf->cc_pattern); + RETURN(ERR_PTR(-EINVAL)); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL)); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + RETURN(ERR_PTR(-ENOMEM)); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? 
+ 1 : conf->psc_buf.bc_nbufs; + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!conf->psc_thr.tc_cpu_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) + GOTO(failed, rc = -ENOMEM); + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + GOTO(failed, rc); + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add(&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (proc_entry != NULL) + ptlrpc_lprocfs_register_service(proc_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + GOTO(failed, rc); + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + GOTO(failed, rc); + } + + RETURN(service); +failed: + ptlrpc_unregister_service(service); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! */ + ptlrpc_request_cache_free(req); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. 
+ */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + struct list_head *tmp; + struct list_head *nxt; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_session.lc_state == LCS_ENTERED) { + lu_context_exit(&req->rq_session); + lu_context_fini(&req->rq_session); + } + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_del(&rqbd->rqbd_list); + + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_entry(svcpt->scp_hist_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* remove rqbd's reqs from svc's req history while + * I've got the service lock */ + list_for_each(tmp, &rqbd->rqbd_reqs) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { + req = list_entry(rqbd->rqbd_reqs.next, + struct ptlrpc_request, + rq_list); + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use + * or free it to drain some in excess. 
+ */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); + if (svcpt->scp_nrqbds_posted >= + svc->srv_nbuf_per_group && + !test_req_buffer_pressure) { + /* like in ptlrpc_free_rqbd() */ + svcpt->scp_nrqbds_total--; + OBD_FREE_LARGE(rqbd->rqbd_buffer, + svc->srv_buf_size); + OBD_FREE_PTR(rqbd); + } else { + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + LASSERT(!list_empty(&req->rq_exp_list)); + /* remove rq_exp_list from last export */ + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + /* export has one reference already, so it`s safe to + * add req to export queue here and get another + * reference for request later */ + spin_lock_bh(&export->exp_rpc_lock); + if (req->rq_ops != NULL) /* hp request */ + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + else + list_add(&req->rq_exp_list, &export->exp_reg_rpcs); + spin_unlock_bh(&export->exp_rpc_lock); + + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); + + return; +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish an active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. + * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +{ + struct obd_export *oldest_exp; + time_t oldest_time, new_time; + + ENTRY; + + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. */ + + /* Do not pay attention on 1sec or smaller renewals. 
*/ + new_time = cfs_time_current_sec() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + RETURN_EXIT; + + exp->exp_last_request_time = new_time; + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + RETURN_EXIT; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + EXIT; + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = + cfs_time_current_sec() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (cfs_time_current_sec() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } + + EXIT; +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(obd == NULL || obd->obd_fail)) { + /* Failing over, don't handle any more reqs, + * send error response instead. */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, (obd != NULL) ? 
obd->obd_name : "unknown"); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !obd->obd_recovering) { + DEBUG_REQ(D_ERROR, req, "Invalid req with transno " + "%llu without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + __s32 next; + + if (array->paa_count == 0) { + del_timer(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - + at_early_margin); + if (next <= 0) { + ptlrpc_at_timer((unsigned long)svcpt); + } else { + mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); + CDEBUG(D_INFO, "armed %s at %+ds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return(0); + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return(-ENOSYS); + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + div_u64_rem(req->rq_deadline, array->paa_size, &index); + if (array->paa_reqs_count[index] > 0) { + /* latest rpcs will have the latest deadlines in the list, + * so search backward. */ + list_for_each_entry_reverse(rq, + &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, + &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void +ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +/* + * Attempt to extend the request deadline by sending an early reply to the + * client. 
+ */ +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + time64_t olddl = req->rq_deadline - ktime_get_real_seconds(); + time64_t newdl; + int rc; + + ENTRY; + + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT)) { + /* don't send early reply */ + RETURN(1); + } + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+llds, margin %+llds) for " + "%d+%d", AT_OFF ? "AT off - not " : "", + (s64)olddl, (s64)(olddl - at_get(&svcpt->scp_at_estimate)), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + RETURN(0); + + if (olddl < 0) { + DEBUG_REQ(D_WARNING, req, "Already past deadline (%+llds), " + "not sending early reply. Consider increasing " + "at_early_margin (%d)?", (s64)olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + RETURN(-ETIMEDOUT); + } + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){ + DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, " + "but no AT support"); + RETURN(-ENOSYS); + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + struct obd_device *obd_exp = req->rq_export->exp_obd; + + /* During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. + */ + /* Don't account request processing time into AT history + * during recovery, it is not service time we need but + * includes also waiting time for recovering clients + */ + newdl = min_t(time64_t, at_extra, + obd_exp->obd_recovery_timeout / 4) + + ktime_get_real_seconds(); + } else { + /* We want to extend the request deadline by at_extra seconds, + * so we set our service estimate to reflect how much time has + * passed since this request arrived plus an additional + * at_extra seconds. The client will calculate the new deadline + * based on this service estimate (plus some additional time to + * account for network latency). 
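+ * e.g. if this request arrived 25s ago and at_extra is 30, the value
+ * fed to at_measured() below is 55, so the refreshed deadline ends up
+ * at least ~55s past the arrival time.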
See ptlrpc_at_recv_early_reply + */ + at_measured(&svcpt->scp_at_estimate, at_extra + + ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec); + newdl = req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate); + } + + /* Check to see if we've actually increased the deadline - + * we may be past adaptive_max */ + if (req->rq_deadline >= newdl) { + DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%lld/%lld), not sending early reply\n", + (s64)olddl, (s64)(newdl - ktime_get_real_seconds())); + RETURN(-ETIMEDOUT); + } + + reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) + GOTO(out_free, rc = -ENOMEM); + + *reqcopy = *req; + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + /* + * tgt_brw_read() and tgt_brw_write() may have decided not to reply. + * Without this check, we would fail the rq_no_reply assertion in + * ptlrpc_send_reply(). + */ + if (reqcopy->rq_no_reply) + GOTO(out, rc = -ETIMEDOUT); + + LASSERT(atomic_read(&req->rq_refcount)); + /** if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, " + "abort sending early reply\n"); + GOTO(out, rc = -EINVAL); + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) + GOTO(out, rc = -ENODEV); + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) + GOTO(out_put, rc = -ENODEV); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + GOTO(out_put, rc); + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); +out_free: + ptlrpc_request_cache_free(reqcopy); + RETURN(rc); +} + +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + struct list_head work_list; + __u32 index, count; + time64_t deadline; + time64_t now = ktime_get_real_seconds(); + cfs_duration_t delay; + int first, counter = 0; + ENTRY; + + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. 
*/ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. */ + INIT_LIST_HEAD(&work_list); + deadline = -1; + div_u64_rem(array->paa_deadline, array->paa_size, &index); + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + ptlrpc_at_remove_timed(rq); + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) + list_add(&rq->rq_timed_list, &work_list); + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " + "replies\n", first, at_extra, counter); + if (first < 0) { + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with " + "request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay); + } + + /* we took additional refcount so entries can't be deleted from list, no + * locking is needed */ + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + RETURN(1); /* return "did_something" for liblustre */ +} + +/* Check if we are already handling earlier incarnation of this request. + * Called under &req->rq_export->exp_rpc_lock locked */ +static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +{ + struct ptlrpc_request *tmp = NULL; + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) || + (atomic_read(&req->rq_export->exp_rpc_count) == 0)) + return 0; + + /* bulk request are aborted upon reconnect, don't try to + * find a match */ + if (req->rq_bulk_write || req->rq_bulk_read) + return 0; + + /* This list should not be longer than max_requests in + * flights on the client, so it is not all that long. 
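+ * (i.e. the linear scans over exp_reg_rpcs and exp_hp_rpcs below stay
+ * cheap).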
+ * Also we only hit this codepath in case of a resent + * request which makes it even more rarely hit */ + list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs, + rq_exp_list) { + /* Found duplicate one */ + if (tmp->rq_xid == req->rq_xid) + goto found; + } + return 0; + +found: + DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); + DEBUG_REQ(D_HA, tmp, "Request being processed"); + return -EBUSY; +} + +/** + * Check if a request should be assigned with a high priority. + * + * \retval < 0: error occurred + * 0: normal RPC request + * +1: high priority request + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + ENTRY; + + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + RETURN(rc); + + LASSERT(rc == 0); + } + + if (req->rq_export != NULL && req->rq_ops != NULL) { + /* Perform request specific check. We should do this + * check before the request is added into exp_hp_rpcs + * list otherwise it may hit swab race at LU-1044. */ + if (req->rq_ops->hpreq_check != NULL) { + rc = req->rq_ops->hpreq_check(req); + if (rc == -ESTALE) { + req->rq_status = rc; + ptlrpc_error(req); + } + /** can only return error, + * 0 for normal request, + * or 1 for high priority request */ + LASSERT(rc <= 1); + } + } + + RETURN(rc); +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export) { + /* refresh lock timeout again so that client has more + * room to send lock cancel RPC. */ + if (req->rq_ops && req->rq_ops->hpreq_fini) + req->rq_ops->hpreq_fini(req); + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } + EXIT; +} + +static int ptlrpc_hpreq_check(struct ptlrpc_request *req) +{ + return 1; +} + +static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { + .hpreq_check = ptlrpc_hpreq_check, +}; + +/* Hi-Priority RPC check by RPC operation code. */ +int ptlrpc_hpreq_handler(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Check for export to let only reconnects for not yet evicted + * export to become a HP rpc. 
*/ + if ((req->rq_export != NULL) && + (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) + req->rq_ops = &ptlrpc_hpreq_common; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_hpreq_handler); + +static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc; + bool hp; + ENTRY; + + rc = ptlrpc_server_hpreq_init(svcpt, req); + if (rc < 0) + RETURN(rc); + + hp = rc > 0; + ptlrpc_nrs_req_initialize(svcpt, req, hp); + + if (req->rq_export != NULL) { + struct obd_export *exp = req->rq_export; + + /* do search for duplicated xid and the adding to the list + * atomically */ + spin_lock_bh(&exp->exp_rpc_lock); + rc = ptlrpc_server_check_resend_in_progress(req); + if (rc < 0) { + spin_unlock_bh(&exp->exp_rpc_lock); + + ptlrpc_nrs_req_finalize(req); + RETURN(rc); + } + + if (hp || req->rq_ops != NULL) + list_add(&req->rq_exp_list, &exp->exp_hp_rpcs); + else + list_add(&req->rq_exp_list, &exp->exp_reg_rpcs); + spin_unlock_bh(&exp->exp_rpc_lock); + } + + /* the current thread is not the processing thread for this request + * since that, but request is in exp_hp_list and can be find there. + * Remove all relations between request and old thread. */ + req->rq_svc_thread->t_env->le_ses = NULL; + req->rq_svc_thread = NULL; + req->rq_session.lc_thread = NULL; + + ptlrpc_nrs_req_add(svcpt, req, hp); + + RETURN(0); +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, true)) + return false; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. 
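+ * (Roughly: normal requests stop being picked up once fewer than two
+ * service threads remain free, keeping headroom for high-priority
+ * work.)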
+ * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force) + return true; + + if (ptlrpc_nrs_req_throttling_nolock(svcpt, false)) + return false; + + if (svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline bool +ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. + */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + ENTRY; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + RETURN(NULL); + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + RETURN(req); +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. 
+ */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + int rc; + ENTRY; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + RETURN(0); + } + + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* Consider this still a "queued" request as far as stats are + * concerned */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s " + "x%llu\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR ("error unpacking ptlrpc body: ptl %d from %s x" + "%llu\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x%llu\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_WRITEPAGE: + case OST_WRITE: + case OUT_UPDATE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, "DROPPING req with " + "illegal security flavor,"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds", + (s64)(ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec)); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? 
+ /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + /* Skip early reply */ + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_RESEND)) + req->rq_deadline += obd_timeout; + + req->rq_svc_thread = thread; + if (thread != NULL) { + /* initialize request session, it is needed for request + * processing by target */ + rc = lu_context_init(&req->rq_session, LCT_SERVER_SESSION | + LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + thread->t_name, rc); + goto err_req; + } + req->rq_session.lc_thread = thread; + lu_context_enter(&req->rq_session); + thread->t_env->le_ses = &req->rq_session; + } + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + GOTO(err_req, rc); + + wake_up(&svcpt->scp_waitq); + RETURN(1); + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + RETURN(1); +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int +ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + ktime_t work_start; + ktime_t work_end; + ktime_t arrived; + s64 timediff_usecs; + s64 arrived_usecs; + int fail_opc = 0; + + ENTRY; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + work_start = ktime_get_real(); + arrived = timespec64_to_ktime(request->rq_arrival_time); + timediff_usecs = ktime_us_delta(work_start, arrived); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff_usecs); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, + timediff_usecs >> 19); + } + + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. */ + if (ktime_get_real_seconds() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld:%llds ago\n", + libcfs_id2str(request->rq_peer), + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x%llu:%s:%d\n", current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? 
+ atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg)); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + CDEBUG(D_NET, "got req %llu\n", request->rq_xid); + + /* re-assign request and sesson thread to the current one */ + request->rq_svc_thread = thread; + if (thread != NULL) { + LASSERT(request->rq_session.lc_thread == NULL); + request->rq_session.lc_thread = thread; + thread->t_env->le_ses = &request->rq_session; + } + svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (%lld:%llds); " + "client may timeout.", + request->rq_deadline - + request->rq_arrival_time.tv_sec, + ktime_get_real_seconds() - request->rq_deadline); + } + + work_end = ktime_get_real(); + timediff_usecs = ktime_us_delta(work_end, work_start); + arrived_usecs = ktime_us_delta(work_end, arrived); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x%llu:%s:%d Request processed in %lldus " + "(%lldus total) trans %llu rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff_usecs, + arrived_usecs, + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff_usecs); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in %llds", + request->rq_early_count, + arrived_usecs / USEC_PER_SEC); + } + + ptlrpc_server_finish_active_request(svcpt, request); + + RETURN(1); +} + +/** + * An internal function to process a single reply state object. + */ +static int +ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct obd_export *exp; + int nlocks; + int been_handled; + ENTRY; + + exp = rs->rs_export; + + LASSERT(rs->rs_difficult); + LASSERT(rs->rs_scheduled); + LASSERT(list_empty(&rs->rs_list)); + + /* The disk commit callback holds exp_uncommitted_replies_lock while it + * iterates over newly committed replies, removing them from + * exp_uncommitted_replies. It then drops this lock and schedules the + * replies it found for handling here. + * + * We can avoid contention for exp_uncommitted_replies_lock between the + * HRT threads and further commit callbacks by checking rs_committed + * which is set in the commit callback while it holds both + * rs_lock and exp_uncommitted_reples. 
+ * + * If we see rs_committed clear, the commit callback _may_ not have + * handled this reply yet and we race with it to grab + * exp_uncommitted_replies_lock before removing the reply from + * exp_uncommitted_replies. Note that if we lose the race and the + * reply has already been removed, list_del_init() is a noop. + * + * If we see rs_committed set, we know the commit callback is handling, + * or has handled this reply since store reordering might allow us to + * see rs_committed set out of sequence. But since this is done + * holding rs_lock, we can be sure it has all completed once we hold + * rs_lock, which we do right next. + */ + if (!rs->rs_committed) { + /* if rs was commited, no need to convert locks, don't check + * rs_committed here because rs may never be added into + * exp_uncommitted_replies and this flag never be set, see + * target_send_reply() */ + if (rs->rs_convert_lock && + rs->rs_transno > exp->exp_last_committed) { + struct ldlm_lock *lock; + struct ldlm_lock *ack_locks[RS_MAX_LOCKS] = { NULL }; + + spin_lock(&rs->rs_lock); + if (rs->rs_convert_lock && + rs->rs_transno > exp->exp_last_committed) { + nlocks = rs->rs_nlocks; + while (nlocks-- > 0) { + /* + * NB don't assume rs is always handled + * by the same service thread (see + * ptlrpc_hr_select, so REP-ACK hr may + * race with trans commit, while the + * latter will release locks, get locks + * here early to convert to COS mode + * safely. + */ + lock = ldlm_handle2lock( + &rs->rs_locks[nlocks]); + LASSERT(lock); + ack_locks[nlocks] = lock; + rs->rs_modes[nlocks] = LCK_COS; + } + nlocks = rs->rs_nlocks; + rs->rs_convert_lock = 0; + /* clear rs_scheduled so that commit callback + * can schedule again */ + rs->rs_scheduled = 0; + spin_unlock(&rs->rs_lock); + + while (nlocks-- > 0) { + lock = ack_locks[nlocks]; + ldlm_lock_downgrade(lock, LCK_COS); + LDLM_LOCK_PUT(lock); + } + RETURN(0); + } + spin_unlock(&rs->rs_lock); + } + + spin_lock(&exp->exp_uncommitted_replies_lock); + list_del_init(&rs->rs_obd_list); + spin_unlock(&exp->exp_uncommitted_replies_lock); + } + + spin_lock(&exp->exp_lock); + /* Noop if removed already */ + list_del_init(&rs->rs_exp_list); + spin_unlock(&exp->exp_lock); + + spin_lock(&rs->rs_lock); + + been_handled = rs->rs_handled; + rs->rs_handled = 1; + + nlocks = rs->rs_nlocks; /* atomic "steal", but */ + rs->rs_nlocks = 0; /* locks still on rs_locks! 
*/ + + if (nlocks == 0 && !been_handled) { + /* If we see this, we should already have seen the warning + * in mds_steal_ack_locks() */ + CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld" + " o%d NID %s\n", + rs, + rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + } + + if ((!been_handled && rs->rs_on_net) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + if (!been_handled && rs->rs_on_net) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + rs->rs_convert_lock = 0; + + if (!rs->rs_on_net) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put (exp); + rs->rs_export = NULL; + ptlrpc_rs_decref(rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + RETURN(1); + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + RETURN(1); +} + + +static void +ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. */ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static int +ptlrpc_retry_rqbds(void *arg) +{ + struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; + + svcpt->scp_rqbd_timeout = 0; + return -ETIMEDOUT; +} + +static inline int +ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int +ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int +ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +static inline int +ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_rqbd_idle) && + svcpt->scp_rqbd_timeout == 0; +} + +static inline int +ptlrpc_at_check(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_at_check; +} + +/** + * requests wait on preprocessing + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_req_incoming); +} + +static __attribute__((__noinline__)) int 
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + /* Don't exit while there are replies to be handled */ + struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, + ptlrpc_retry_rqbds, svcpt); + + lc_watchdog_disable(thread->t_watchdog); + + cond_resched(); + + l_wait_event_exclusive_head(svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), &lwi); + + if (ptlrpc_thread_stopping(thread)) + return -EINTR; + + lc_watchdog_touch(thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + return 0; +} + +/** + * Main thread body for service threads. + * Waits in a loop waiting for new requests to process to appear. + * Every time an incoming requests is added to its queue, a waitq + * is woken up and one of the threads will handle it. + */ +static int ptlrpc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_reply_state *rs; + struct group_info *ginfo = NULL; + struct lu_env *env; + int counter = 0, rc = 0; + ENTRY; + + thread->t_pid = current_pid(); + unshare_fs_struct(); + + /* NB: we will call cfs_cpt_bind() for all threads, because we + * might want to run lustre server only on a subset of system CPUs, + * in that case ->scp_cpt is CFS_CPT_ANY */ + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } + + ginfo = groups_alloc(0); + if (!ginfo) { + rc = -ENOMEM; + goto out; + } + + set_current_groups(ginfo); + put_group_info(ginfo); + + if (svc->srv_ops.so_thr_init != NULL) { + rc = svc->srv_ops.so_thr_init(thread); + if (rc) + goto out; + } + + OBD_ALLOC_PTR(env); + if (env == NULL) { + rc = -ENOMEM; + goto out_srv_fini; + } + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + goto out_srv_fini; + + thread->t_env = env; + env->le_ctx.lc_thread = thread; + env->le_ctx.lc_cookie = 0x6; + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rc = ptlrpc_server_post_idle_rqbds(svcpt); + if (rc >= 0) + continue; + + CERROR("Failed to post rqbd for %s on CPT %d: %d\n", + svc->srv_name, svcpt->scp_cpt, rc); + goto out_srv_fini; + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); + if (!rs) { + rc = -ENOMEM; + goto out_srv_fini; + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(thread_is_starting(thread)); + thread_clear_flags(thread, SVC_STARTING); + + LASSERT(svcpt->scp_nthrs_starting == 1); + svcpt->scp_nthrs_starting--; + + /* SVC_STOPPING may already be set here if someone else is trying + * to stop the service while this new thread has been dynamically + * forked. We still set SVC_RUNNING to let our creator know that + * we are now running, however we will exit as soon as possible */ + thread_add_flags(thread, SVC_RUNNING); + svcpt->scp_nthrs_running++; + spin_unlock(&svcpt->scp_lock); + + /* wake up our creator in case he's still waiting. 
 */
+	wake_up(&thread->t_ctl_waitq);
+
+	thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
+					     NULL, NULL);
+
+	spin_lock(&svcpt->scp_rep_lock);
+	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+	wake_up(&svcpt->scp_rep_waitq);
+	spin_unlock(&svcpt->scp_rep_lock);
+
+	CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+	       svcpt->scp_nthrs_running);
+
+	/* XXX maintain a list of all managed devices: insert here */
+	while (!ptlrpc_thread_stopping(thread)) {
+		if (ptlrpc_wait_event(svcpt, thread))
+			break;
+
+		ptlrpc_check_rqbd_pool(svcpt);
+
+		if (ptlrpc_threads_need_create(svcpt)) {
+			/* Ignore return code - we tried... */
+			ptlrpc_start_thread(svcpt, 0);
+		}
+
+		/* reset le_ses to initial state */
+		env->le_ses = NULL;
+		/* Process all incoming reqs before handling any */
+		if (ptlrpc_server_request_incoming(svcpt)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_req_in(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+
+			/* but limit ourselves in case of flood */
+			if (counter++ < 100)
+				continue;
+			counter = 0;
+		}
+
+		if (ptlrpc_at_check(svcpt))
+			ptlrpc_at_check_timed(svcpt);
+
+		if (ptlrpc_server_request_pending(svcpt, false)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_request(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+		}
+
+		if (ptlrpc_rqbd_pending(svcpt) &&
+		    ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+			/* I just failed to repost request buffers.
+			 * Wait for a timeout (unless something else
+			 * happens) before I try again */
+			svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+			CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+			       svcpt->scp_nrqbds_posted);
+		}
+	}
+
+	lc_watchdog_delete(thread->t_watchdog);
+	thread->t_watchdog = NULL;
+
+out_srv_fini:
+	/*
+	 * deconstruct service specific state created by ptlrpc_start_thread()
+	 */
+	if (svc->srv_ops.so_thr_done != NULL)
+		svc->srv_ops.so_thr_done(thread);
+
+	if (env != NULL) {
+		lu_context_fini(&env->le_ctx);
+		OBD_FREE_PTR(env);
+	}
+out:
+	CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
+	       thread, thread->t_pid, thread->t_id, rc);
+
+	spin_lock(&svcpt->scp_lock);
+	if (thread_test_and_clear_flags(thread, SVC_STARTING))
+		svcpt->scp_nthrs_starting--;
+
+	if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+		/* must know immediately */
+		svcpt->scp_nthrs_running--;
+	}
+
+	thread->t_id = rc;
+	thread_add_flags(thread, SVC_STOPPED);
+
+	wake_up(&thread->t_ctl_waitq);
+	spin_unlock(&svcpt->scp_lock);
+
+	return rc;
+}
+
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+			  struct list_head *replies)
+{
+	int result;
+
+	spin_lock(&hrt->hrt_lock);
+
+	list_splice_init(&hrt->hrt_queue, replies);
+	result = ptlrpc_hr.hr_stopping || !list_empty(replies);
+
+	spin_unlock(&hrt->hrt_lock);
+	return result;
+}
+
+/**
+ * Main body of "handle reply" function.
+ * It processes acked reply states
+ */
+static int ptlrpc_hr_main(void *arg)
+{
+	struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg;
+	struct ptlrpc_hr_partition *hrp = hrt->hrt_partition;
+	struct list_head replies;
+	int rc;
+
+	INIT_LIST_HEAD(&replies);
+	unshare_fs_struct();
+
+	rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
+	if (rc != 0) {
+		char threadname[20];
+
+		snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
+			 hrp->hrp_cpt, hrt->hrt_id);
+		CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
+		      threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
+	}
+
+	atomic_inc(&hrp->hrp_nstarted);
+	wake_up(&ptlrpc_hr.hr_waitq);
+
+	while (!ptlrpc_hr.hr_stopping) {
+		l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));
+
+		while (!list_empty(&replies)) {
+			struct ptlrpc_reply_state *rs;
+
+			rs = list_entry(replies.prev,
+					struct ptlrpc_reply_state,
+					rs_list);
+			list_del_init(&rs->rs_list);
+			ptlrpc_handle_rs(rs);
+		}
+	}
+
+	atomic_inc(&hrp->hrp_nstopped);
+	wake_up(&ptlrpc_hr.hr_waitq);
+
+	return 0;
+}
+
+static void ptlrpc_stop_hr_threads(void)
+{
+	struct ptlrpc_hr_partition *hrp;
+	int i;
+	int j;
+
+	ptlrpc_hr.hr_stopping = 1;
+
+	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+		if (hrp->hrp_thrs == NULL)
+			continue; /* uninitialized */
+		for (j = 0; j < hrp->hrp_nthrs; j++)
+			wake_up_all(&hrp->hrp_thrs[j].hrt_waitq);
+	}
+
+	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+		if (hrp->hrp_thrs == NULL)
+			continue; /* uninitialized */
+		wait_event(ptlrpc_hr.hr_waitq,
+			   atomic_read(&hrp->hrp_nstopped) ==
+			   atomic_read(&hrp->hrp_nstarted));
+	}
+}
+
+static int ptlrpc_start_hr_threads(void)
+{
+	struct ptlrpc_hr_partition *hrp;
+	int i;
+	int j;
+	ENTRY;
+
+	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+		int rc = 0;
+
+		for (j = 0; j < hrp->hrp_nthrs; j++) {
+			struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
+			struct task_struct *task;
+
+			task = kthread_run(ptlrpc_hr_main,
+					   &hrp->hrp_thrs[j],
+					   "ptlrpc_hr%02d_%03d",
+					   hrp->hrp_cpt,
+					   hrt->hrt_id);
+			if (IS_ERR(task)) {
+				rc = PTR_ERR(task);
+				break;
+			}
+		}
+
+		wait_event(ptlrpc_hr.hr_waitq,
+			   atomic_read(&hrp->hrp_nstarted) == j);
+
+		if (rc < 0) {
+			CERROR("cannot start reply handler thread %d:%d: "
+			       "rc = %d\n", i, j, rc);
+			ptlrpc_stop_hr_threads();
+			RETURN(rc);
+		}
+	}
+
+	RETURN(0);
+}
+
+static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
+{
+	struct l_wait_info lwi = { 0 };
+	struct ptlrpc_thread *thread;
+	struct list_head zombie;
+
+	ENTRY;
+
+	CDEBUG(D_INFO, "Stopping threads for service %s\n",
+	       svcpt->scp_service->srv_name);
+
+	INIT_LIST_HEAD(&zombie);
+	spin_lock(&svcpt->scp_lock);
+	/* let the thread know that we would like it to stop asap */
+	list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
+		CDEBUG(D_INFO, "Stopping thread %s #%u\n",
+		       svcpt->scp_service->srv_thread_name, thread->t_id);
+		thread_add_flags(thread, SVC_STOPPING);
+	}
+
+	wake_up_all(&svcpt->scp_waitq);
+
+	while (!list_empty(&svcpt->scp_threads)) {
+		thread = list_entry(svcpt->scp_threads.next,
+				    struct ptlrpc_thread, t_link);
+		if (thread_is_stopped(thread)) {
+			list_del(&thread->t_link);
+			list_add(&thread->t_link, &zombie);
+			continue;
+		}
+		spin_unlock(&svcpt->scp_lock);
+
+		CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
+		       svcpt->scp_service->srv_thread_name, thread->t_id);
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_stopped(thread), &lwi);
+
+		spin_lock(&svcpt->scp_lock);
+	}
+
+	spin_unlock(&svcpt->scp_lock);
+
+	while (!list_empty(&zombie)) {
+		thread = list_entry(zombie.next,
+				    struct ptlrpc_thread, t_link);
+		list_del(&thread->t_link);
+		OBD_FREE_PTR(thread);
+	}
+	EXIT;
+}
+
+/**
+ * Stops all threads of a particular service \a svc
+ */
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	int i;
+	ENTRY;
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service != NULL)
+			ptlrpc_svcpt_stop_threads(svcpt);
+	}
+
+	EXIT;
+}
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc)
+{
+	int rc = 0;
+	int i;
+	int j;
+	ENTRY;
+
+	/* We require 2 threads min, see note in ptlrpc_server_handle_request */
+	LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
+
+	for (i = 0; i < svc->srv_ncpts; i++) {
+		for (j = 0; j < svc->srv_nthrs_cpt_init; j++) {
+			rc = ptlrpc_start_thread(svc->srv_parts[i], 1);
+			if (rc == 0)
+				continue;
+
+			if (rc != -EMFILE)
+				goto failed;
+			/* We have enough threads, don't start more. b=15759 */
+			break;
+		}
+	}
+
+	RETURN(0);
+ failed:
+	CERROR("cannot start %s thread #%d_%d: rc %d\n",
+	       svc->srv_thread_name, i, j, rc);
+	ptlrpc_stop_all_threads(svc);
+	RETURN(rc);
+}
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
+{
+	struct l_wait_info lwi = { 0 };
+	struct ptlrpc_thread *thread;
+	struct ptlrpc_service *svc;
+	struct task_struct *task;
+	int rc;
+	ENTRY;
+
+	LASSERT(svcpt != NULL);
+
+	svc = svcpt->scp_service;
+
+	CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
+	       svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
+	       svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
+
+ again:
+	if (unlikely(svc->srv_is_stopping))
+		RETURN(-ESRCH);
+
+	if (!ptlrpc_threads_increasable(svcpt) ||
+	    (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
+	     svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
+		RETURN(-EMFILE);
+
+	OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
+	if (thread == NULL)
+		RETURN(-ENOMEM);
+	init_waitqueue_head(&thread->t_ctl_waitq);
+
+	spin_lock(&svcpt->scp_lock);
+	if (!ptlrpc_threads_increasable(svcpt)) {
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE_PTR(thread);
+		RETURN(-EMFILE);
+	}
+
+	if (svcpt->scp_nthrs_starting != 0) {
+		/* serialize starting because some modules (obdfilter)
+		 * might require unique and contiguous t_id */
+		LASSERT(svcpt->scp_nthrs_starting == 1);
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE_PTR(thread);
+		if (wait) {
+			CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
+			       svc->srv_thread_name, svcpt->scp_thr_nextid);
+			schedule();
+			goto again;
+		}
+
+		CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
+		       svc->srv_thread_name, svcpt->scp_thr_nextid);
+		RETURN(-EAGAIN);
+	}
+
+	svcpt->scp_nthrs_starting++;
+	thread->t_id = svcpt->scp_thr_nextid++;
+	thread_add_flags(thread, SVC_STARTING);
+	thread->t_svcpt = svcpt;
+
+	list_add(&thread->t_link, &svcpt->scp_threads);
+	spin_unlock(&svcpt->scp_lock);
+
+	if (svcpt->scp_cpt >= 0) {
+		snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
+			 svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
+	} else {
+		snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
+			 svc->srv_thread_name, thread->t_id);
+	}
+
+	CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
+	task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name);
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("cannot start thread '%s': rc = %d\n",
+		       thread->t_name, rc);
+		spin_lock(&svcpt->scp_lock);
+		--svcpt->scp_nthrs_starting;
+		if (thread_is_stopping(thread)) {
+			/* this ptlrpc_thread is being handled
+			 * by ptlrpc_svcpt_stop_threads now
+			 */
+			thread_add_flags(thread, SVC_STOPPED);
+			wake_up(&thread->t_ctl_waitq);
+			spin_unlock(&svcpt->scp_lock);
+		} else {
+			list_del(&thread->t_link);
+			spin_unlock(&svcpt->scp_lock);
+			OBD_FREE_PTR(thread);
+		}
+		RETURN(rc);
+	}
+
+	if (!wait)
+		RETURN(0);
+
+	l_wait_event(thread->t_ctl_waitq,
+		     thread_is_running(thread) || thread_is_stopped(thread),
+		     &lwi);
+
+	rc = thread_is_stopped(thread) ? thread->t_id : 0;
+	RETURN(rc);
+}
+
+int ptlrpc_hr_init(void)
+{
+	struct ptlrpc_hr_partition *hrp;
+	struct ptlrpc_hr_thread *hrt;
+	int rc;
+	int cpt;
+	int i;
+	int weight;
+	ENTRY;
+
+	memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+	ptlrpc_hr.hr_cpt_table = cfs_cpt_table;
+
+	ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+						   sizeof(*hrp));
+	if (ptlrpc_hr.hr_partitions == NULL)
+		RETURN(-ENOMEM);
+
+	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+
+	weight = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
+
+	cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
+		hrp->hrp_cpt = cpt;
+
+		atomic_set(&hrp->hrp_nstarted, 0);
+		atomic_set(&hrp->hrp_nstopped, 0);
+
+		hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt);
+		hrp->hrp_nthrs /= weight;
+		if (hrp->hrp_nthrs == 0)
+			hrp->hrp_nthrs = 1;
+
+		OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt,
+			      hrp->hrp_nthrs * sizeof(*hrt));
+		if (hrp->hrp_thrs == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		for (i = 0; i < hrp->hrp_nthrs; i++) {
+			hrt = &hrp->hrp_thrs[i];
+
+			hrt->hrt_id = i;
+			hrt->hrt_partition = hrp;
+			init_waitqueue_head(&hrt->hrt_waitq);
+			spin_lock_init(&hrt->hrt_lock);
+			INIT_LIST_HEAD(&hrt->hrt_queue);
+		}
+	}
+
+	rc = ptlrpc_start_hr_threads();
+out:
+	if (rc != 0)
+		ptlrpc_hr_fini();
+	RETURN(rc);
+}
+
+void ptlrpc_hr_fini(void)
+{
+	struct ptlrpc_hr_partition *hrp;
+	int cpt;
+
+	if (ptlrpc_hr.hr_partitions == NULL)
+		return;
+
+	ptlrpc_stop_hr_threads();
+
+	cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) {
+		if (hrp->hrp_thrs != NULL) {
+			OBD_FREE(hrp->hrp_thrs,
+				 hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0]));
+		}
+	}
+
+	cfs_percpt_free(ptlrpc_hr.hr_partitions);
+	ptlrpc_hr.hr_partitions = NULL;
+}
+
+
+/**
+ * Wait until all already scheduled replies are processed.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+	while (1) {
+		int rc;
+		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
+						     NULL, NULL);
+
+		rc = l_wait_event(svcpt->scp_waitq,
+				  atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi);
+		if (rc == 0)
+			break;
+		CWARN("Unexpectedly long timeout %s %p\n",
+		      svcpt->scp_service->srv_name, svcpt->scp_service);
+	}
+}
+
+static void
+ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	int i;
+
+	/* early disarm AT timer... */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service != NULL)
+			del_timer(&svcpt->scp_at_timer);
+	}
+}
+
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *svcpt;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	struct l_wait_info lwi;
+	int rc;
+	int i;
+
+	/* All history will be culled when the next request buffer is
+	 * freed in ptlrpc_service_purge_all() */
+	svc->srv_hist_nrqbds_cpt_max = 0;
+
+	rc = LNetClearLazyPortal(svc->srv_req_portal);
+	LASSERT(rc == 0);
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/* Unlink all the request buffers.
This forces a 'final' + * event with its 'unlink' flag set for each posted rqbd */ + list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, + rqbd_list) { + rc = LNetMDUnlink(rqbd->rqbd_md_h); + LASSERT(rc == 0 || rc == -ENOENT); + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Wait for the network to release any buffers + * it's currently filling */ + spin_lock(&svcpt->scp_lock); + while (svcpt->scp_nrqbds_posted != 0) { + spin_unlock(&svcpt->scp_lock); + /* Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility + * of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, &lwi); + if (rc == -ETIMEDOUT) { + CWARN("Service %s waiting for " + "request buffers\n", + svcpt->scp_service->srv_name); + } + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + } +} + +static void +ptlrpc_service_purge_all(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct ptlrpc_request *req; + struct ptlrpc_reply_state *rs; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + spin_lock(&svcpt->scp_rep_lock); + while (!list_empty(&svcpt->scp_rep_active)) { + rs = list_entry(svcpt->scp_rep_active.next, + struct ptlrpc_reply_state, rs_list); + spin_lock(&rs->rs_lock); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + } + spin_unlock(&svcpt->scp_rep_lock); + + /* purge the request queue. NB No new replies (rqbds + * all unlinked) and no service threads, so I'm the only + * thread noodling the request queue now */ + while (!list_empty(&svcpt->scp_req_incoming)) { + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + + list_del(&req->rq_list); + svcpt->scp_nreqs_incoming--; + ptlrpc_server_finish_request(svcpt, req); + } + + while (ptlrpc_server_request_pending(svcpt, true)) { + req = ptlrpc_server_request_get(svcpt, true); + ptlrpc_server_finish_active_request(svcpt, req); + } + + LASSERT(list_empty(&svcpt->scp_rqbd_posted)); + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* history should have been culled by + * ptlrpc_server_finish_request */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* Now free all the request buffers since nothing + * references them any more... 
*/ + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + ptlrpc_free_rqbd(rqbd); + } + ptlrpc_wait_replies(svcpt); + + while (!list_empty(&svcpt->scp_rep_idle)) { + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, + rs_list); + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + del_timer(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, + sizeof(__u32) * array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + ENTRY; + + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + + ptlrpc_service_free(service); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. */ +static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timespec64 right_now; + struct timespec64 timediff; + + ktime_get_real_ts64(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = timespec64_sub(right_now, request->rq_arrival_time); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff.tv_sec) > + (AT_OFF ? 
obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %llds\n", + svcpt->scp_service->srv_name, (s64)timediff.tv_sec); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c new file mode 100644 index 0000000000000..94828872d70ac --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -0,0 +1,5300 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + * running on Linux centss05 2.6.32.431.29.2.el6_lustre #1 SMP Tue Sep 23 16:06:38 CDT 2014 x + * with gcc version 4.4.7 20120313 (Red Hat 4.4.7-4) (GCC) */ + + + /* Constants... 
*/ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + CLASSERT(MTI_NAME_MAXLEN == 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LADVISE == 21, "found %lld\n", + (long long)OST_LADVISE); + LASSERTF(OST_LAST_OPC == 22, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GET_ROOT == 40, "found %lld\n", + (long long)MDS_GET_ROOT); + LASSERTF(MDS_STATFS == 41, "found %lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, 
"found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MIGRATE == 9, "found %lld\n", + (long long)REINT_MIGRATE); + LASSERTF(REINT_MAX == 10, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 
0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + + LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_PROJID); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_READ == 901, "found %lld\n", + (long long)FLD_READ); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 902, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); + LASSERTF(LFSCK_NOTIFY == 1101, "found %lld\n", + (long long)LFSCK_NOTIFY); + LASSERTF(LFSCK_QUERY == 1102, "found %lld\n", + (long long)LFSCK_QUERY); + LASSERTF(LFSCK_FIRST_OPC == 1101, "found %lld\n", + (long long)LFSCK_FIRST_OPC); + LASSERTF(LFSCK_LAST_OPC == 1103, "found %lld\n", + (long long)LFSCK_LAST_OPC); + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long 
long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + CLASSERT(LDLM_PLAIN == 10); + CLASSERT(LDLM_EXTENT == 11); + CLASSERT(LDLM_FLOCK == 12); + CLASSERT(LDLM_IBITS == 13); + CLASSERT(LDLM_MAX_TYPE == 14); + CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0); + CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1); + LASSERTF(OUT_UPDATE == 1000, "found %lld\n", + (long long)OUT_UPDATE); + LASSERTF(OUT_UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)OUT_UPDATE_LAST_OPC); + CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2); + CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3); + CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3); + CLASSERT(LQUOTA_TYPE_USR == 0); + CLASSERT(LQUOTA_TYPE_GRP == 1); + CLASSERT(LQUOTA_RES_MD == 1); + CLASSERT(LQUOTA_RES_DT == 2); + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", + (long long)OBD_LOG_CANCEL); + LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", + (long long)OBD_QC_CALLBACK); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found 
%lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAC_NOT_IN_OI); + LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAC_FID_ON_OST); + LASSERTF(LMAC_STRIPE_INFO == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAC_STRIPE_INFO); + LASSERTF(LMAC_COMP_INFO == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LMAC_COMP_INFO); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAI_AGENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAI_AGENT); + LASSERTF(LMAI_REMOTE_PARENT == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LMAI_REMOTE_PARENT); + LASSERTF(LMAI_STRIPED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LMAI_STRIPED); + LASSERTF(LMAI_ORPHAN == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LMAI_ORPHAN); + + /* Checks for struct lustre_ost_attrs */ + LASSERTF((int)sizeof(struct lustre_ost_attrs) == 64, "found %lld\n", + (long long)(int)sizeof(struct lustre_ost_attrs)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_lma) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_lma)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_lma)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_parent_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_parent_fid)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_parent_fid)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_stripe_size) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_stripe_size)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs 
*)0)->loa_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_stripe_size)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_id)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_id)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_start) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_start)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_start)); + LASSERTF((int)offsetof(struct lustre_ost_attrs, loa_comp_end) == 56, "found %lld\n", + (long long)(int)offsetof(struct lustre_ost_attrs, loa_comp_end)); + LASSERTF((int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_ost_attrs *)0)->loa_comp_end)); + LASSERTF(OUT_CREATE == 1, "found %lld\n", + (long long)OUT_CREATE); + LASSERTF(OUT_DESTROY == 2, "found %lld\n", + (long long)OUT_DESTROY); + LASSERTF(OUT_REF_ADD == 3, "found %lld\n", + (long long)OUT_REF_ADD); + LASSERTF(OUT_REF_DEL == 4, "found %lld\n", + (long long)OUT_REF_DEL); + LASSERTF(OUT_ATTR_SET == 5, "found %lld\n", + (long long)OUT_ATTR_SET); + LASSERTF(OUT_ATTR_GET == 6, "found %lld\n", + (long long)OUT_ATTR_GET); + LASSERTF(OUT_XATTR_SET == 7, "found %lld\n", + (long long)OUT_XATTR_SET); + LASSERTF(OUT_XATTR_GET == 8, "found %lld\n", + (long long)OUT_XATTR_GET); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OUT_INDEX_LOOKUP); + LASSERTF(OUT_INDEX_INSERT == 10, "found %lld\n", + (long long)OUT_INDEX_INSERT); + LASSERTF(OUT_INDEX_DELETE == 11, "found %lld\n", + (long long)OUT_INDEX_DELETE); + LASSERTF(OUT_WRITE == 12, "found %lld\n", + (long long)OUT_WRITE); + LASSERTF(OUT_XATTR_DEL == 13, "found %lld\n", + (long long)OUT_XATTR_DEL); + LASSERTF(OUT_PUNCH == 14, "found %lld\n", + (long long)OUT_PUNCH); + LASSERTF(OUT_READ == 15, "found %lld\n", + (long long)OUT_READ); + + /* Checks for struct hsm_attrs */ + LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_attrs)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_compat)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_flags)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs 
*)0)->hsm_arch_ver)); + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + (long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_UNUSED_START == 3, "found %lld\n", + (long long)FID_SEQ_UNUSED_START); + LASSERTF(FID_SEQ_UNUSED_END == 9, "found %lld\n", + (long long)FID_SEQ_UNUSED_END); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); + LASSERTF(FID_SEQ_LAYOUT_RBTREE == 0x0000000200000008ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LAYOUT_RBTREE); + LASSERTF(FID_SEQ_UPDATE_LOG == 0x0000000200000009ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG); + LASSERTF(FID_SEQ_UPDATE_LOG_DIR == 0x000000020000000aULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_UPDATE_LOG_DIR); + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, 
lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage 
*)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); + /* Checks for union lu_page */ + LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct 
lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == 34, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, 
pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + CLASSERT(PTLRPC_NUM_VERSIONS == 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == 128, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == 136, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 
*)0)->pb_padding64_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == 144, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2)); + CLASSERT(LUSTRE_JOBID_SIZE == 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct 
ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == (int)offsetof(struct ptlrpc_body_v2, pb_tag), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_tag), (int)offsetof(struct ptlrpc_body_v2, pb_tag)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding0), (int)offsetof(struct ptlrpc_body_v2, pb_padding0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding1), (int)offsetof(struct ptlrpc_body_v2, pb_padding1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != 
%d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == (int)offsetof(struct ptlrpc_body_v2, pb_mbits), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_mbits), (int)offsetof(struct ptlrpc_body_v2, pb_mbits)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_0), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 
*)0)->pb_padding64_0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_1), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding64_2), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", + PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", + LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", + LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", + LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", + LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", + LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", + LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", + LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)MSG_OP_FLAG_MASK); + LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", + (long long)MSG_OP_FLAG_SHIFT); + LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", + (unsigned)MSG_GEN_FLAG_MASK); + LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_LAST_REPLAY); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", + 
(unsigned)MSG_DELAY_REPLAY); + LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_VERSION_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_ASYNC); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_blkbits) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_blkbits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct 
obd_connect_data *)0)->ocd_grant_blkbits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_inobits) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_inobits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_tax_kb) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_tax_kb)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_max_blks) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_max_blks)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxmodrpcs) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxmodrpcs)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs)); + LASSERTF((int)offsetof(struct obd_connect_data, padding0) == 74, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding0)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding0) == 2, "found %lld\n", + (long 
long)(int)sizeof(((struct obd_connect_data *)0)->padding0)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 76, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2)); + LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long 
long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_LARGE_ACL == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LARGE_ACL); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_BARRIER == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BARRIER); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + 
LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_DEAD); + LASSERTF(OBD_CONNECT_OPEN_BY_FID == 0x20000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OPEN_BY_FID); + LASSERTF(OBD_CONNECT_LFSCK == 0x40000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LFSCK); + LASSERTF(OBD_CONNECT_UNLINK_CLOSE == 0x100000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UNLINK_CLOSE); + LASSERTF(OBD_CONNECT_MULTIMODRPCS == 0x200000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MULTIMODRPCS); + LASSERTF(OBD_CONNECT_DIR_STRIPE == 0x400000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DIR_STRIPE); + LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SUBTREE); + LASSERTF(OBD_CONNECT_LOCK_AHEAD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOCK_AHEAD); + LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BULK_MBITS); + LASSERTF(OBD_CONNECT_OBDOPACK == 
0x4000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OBDOPACK); + LASSERTF(OBD_CONNECT_FLAGS2 == 0x8000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLAGS2); + LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FILE_SECCTX); + LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + + /* Checks for struct ost_layout */ + LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", + (long long)(int)sizeof(struct ost_layout)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_size)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_size)); + LASSERTF((int)offsetof(struct ost_layout, ol_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_stripe_count)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_stripe_count)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_start)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_start)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_end)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_end)); + LASSERTF((int)offsetof(struct ost_layout, ol_comp_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_layout, ol_comp_id)); + LASSERTF((int)sizeof(((struct ost_layout *)0)->ol_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_layout *)0)->ol_comp_id)); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); + LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mtime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct obdo *)0)->o_mtime)); + LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ctime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); + LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blocks)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); + LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_grant)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_grant)); + LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blksize)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); + LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mode)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mode)); + LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid)); + LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid)); + LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_flags)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_flags)); + LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_nlink)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); + LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_oid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); + LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_misc)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_misc)); + LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ioepoch)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); + LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_stripe_idx)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); + LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_ver)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); + LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_handle)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_handle)); + LASSERTF((int)offsetof(struct obdo, o_layout) == 136, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout)); + LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_3)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_3)); + LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); + LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); + LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_data_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); + LASSERTF((int)offsetof(struct obdo, o_projid) == 184, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_projid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_projid)); + LASSERTF((int)offsetof(struct obdo, o_padding_4) == 188, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_4)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); + LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_5)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); + LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_6)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); + LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", + OBD_MD_FLID); + LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", + OBD_MD_FLATIME); + LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMTIME); + LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCTIME); + LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", + OBD_MD_FLSIZE); + LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLOCKS); + LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", + 
OBD_MD_FLBLKSZ); + LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODE); + LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", + OBD_MD_FLTYPE); + LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUID); + LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGID); + LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFLAGS); + LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLNLINK); + LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGENER); + LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRDEV); + LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEASIZE); + LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", + OBD_MD_LINKNAME); + LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLHANDLE); + LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLQOS); + LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGROUP); + LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFID); + LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEPOCH); + LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRANT); + LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDIREA); + LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUSRQUOTA); + LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRPQUOTA); + LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODEASIZE); + LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MDS); + LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", + OBD_MD_REINT); + LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MEA); + LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n", + OBD_MD_TSTATE); + LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTR); + LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRLS); + LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRRM); + LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLACL); + LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMDSCAPA); + LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSSCAPA); + LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSPLIT); + LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCROSSREF); + LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGETATTRLOCK); + LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDATAVERSION); + LASSERTF(OBD_MD_CLOSE_INTENT_EXECED == (0x0020000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_CLOSE_INTENT_EXECED); + LASSERTF(OBD_MD_DEFAULT_MEA == (0x0040000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_DEFAULT_MEA); + LASSERTF(OBD_MD_FLOSTLAYOUT == 
(0x0080000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSTLAYOUT); + + LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPROJID); + CLASSERT(OBD_FL_INLINEDATA == 0x00000001); + CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002); + CLASSERT(OBD_FL_DELORPHAN == 0x00000004); + CLASSERT(OBD_FL_NORPC == 0x00000008); + CLASSERT(OBD_FL_IDONLY == 0x00000010); + CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020); + CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040); + CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100); + CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200); + CLASSERT(OBD_FL_CREATE_CROW == 0x00000400); + CLASSERT(OBD_FL_SRVLOCK == 0x00000800); + CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); + CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); + CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); + CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); + CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); + CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); + CLASSERT(OBD_FL_MMAP == 0x00040000); + CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000); + CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000); + CLASSERT(OBD_FL_FLUSH == 0x00200000); + CLASSERT(OBD_FL_SHORT_IO == 0x00400000); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", + (long long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + + /* Checks for struct lov_mds_md_v1 */ + LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v1)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
lov_mds_md_v1 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V1 == (0x0BD10000 | 0x0BD0)); + + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); + CLASSERT(LOV_MAXPOOLNAME == 15); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[15 + 1])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[15 + 1])); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, 
lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V3 == (0x0BD30000 | 0x0BD0)); + LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID1); + LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_FIRST); + LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_CMOBD); + + /* Checks for struct lov_comp_md_entry_v1 */ + LASSERTF((int)sizeof(struct lov_comp_md_entry_v1) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_comp_md_entry_v1)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_id)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_id)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_flags)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_flags)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_extent)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_extent)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_offset)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_offset)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_size) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding)); + LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LCME_FL_INIT); + + /* Checks for struct lov_comp_md_v1 */ + LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_comp_md_v1)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_magic)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_magic)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_size) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_size)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_size)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_layout_gen) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_flags)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_flags)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entry_count) == 14, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding2)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_entries[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entries[0])); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0])); + CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0)); + + /* Checks for struct lmv_mds_md_v1 */ + LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", + (long long)(int)sizeof(struct lmv_mds_md_v1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_magic)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_hash_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_hash_type)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_layout_version) == 16, "found %lld\n", + (long long)(int)offsetof(struct 
lmv_mds_md_v1, lmv_layout_version)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding1)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15]) == 55, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[15])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[15])); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]) == 56, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0])); + CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0); + CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0); + CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); + CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); + CLASSERT(LMV_HASH_FLAG_DEAD == 0x40000000); + CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); + CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000); + + /* Checks for struct obd_statfs */ + LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", + (long long)(int)sizeof(struct obd_statfs)); + LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_type)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); + LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_blocks)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks)); + LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bfree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); + LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bavail)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_ffree)); + LASSERTF((int)sizeof(((struct 
obd_statfs *)0)->os_ffree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); + LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fsid)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); + LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bsize)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); + LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_namelen)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_state)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); + LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare2)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare3)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare4)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare5)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare6)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare7)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare8)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); + 
LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare9)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + + /* Checks for struct obd_ioobj */ + LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_ioobj)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); + LASSERTF(IOOBJ_MAX_BRW_BITS == 16, "found %lld\n", + (long long)IOOBJ_MAX_BRW_BITS); + + /* Checks for union lquota_id */ + LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", + (long long)(int)sizeof(union lquota_id)); + + LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n", + (long long)QUOTABLOCK_BITS); + LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n", + (long long)QUOTABLOCK_SIZE); + + /* Checks for struct obd_quotactl */ + LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", + (long long)(int)sizeof(struct obd_quotactl)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_type)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_id)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_stat)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); + + /* Checks for struct 
obd_dqinfo */ + LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_dqinfo)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); + + /* Checks for struct obd_dqblk */ + LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(struct obd_dqblk)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", + (long 
long)(int)offsetof(struct obd_dqblk, dqb_itime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); + LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", + Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", + Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", + Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", + Q_GETOQUOTA); + LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", + Q_FINVALIDATE); + + /* Checks for struct lquota_acct_rec */ + LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n", + (long long)(int)sizeof(struct lquota_acct_rec)); + LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, bspace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace)); + LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, ispace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace)); + + /* Checks for struct lquota_glb_rec */ + LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct lquota_glb_rec)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_time)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted)); + + /* Checks for struct lquota_slv_rec */ + LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n", + (long long)(int)sizeof(struct lquota_slv_rec)); + LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_slv_rec, 
qsr_granted)); + LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted)); + + /* Checks for struct idx_info */ + LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n", + (long long)(int)sizeof(struct idx_info)); + LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_magic)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_magic)); + LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_flags)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_flags)); + LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_count)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_count)); + LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad0)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0)); + LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_attrs)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs)); + LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_fid)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_fid)); + LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_version)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_version)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_start)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_end)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end)); + LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_keysize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize)); + LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_recsize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize)); + LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad1)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct idx_info *)0)->ii_pad1)); + LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad2)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2)); + LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad3)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3)); + CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37); + + /* Checks for struct lu_idxpage */ + LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n", + (long long)(int)sizeof(struct lu_idxpage)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_magic)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_flags)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_nr)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_pad0)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0)); + CLASSERT(LIP_MAGIC == 0x8A6D6B6C); + LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n", + (long long)LIP_HDR_SIZE); + LASSERTF(II_FL_NOHASH == 1, "found %lld\n", + (long long)II_FL_NOHASH); + LASSERTF(II_FL_VARKEY == 2, "found %lld\n", + (long long)II_FL_VARKEY); + LASSERTF(II_FL_VARREC == 4, "found %lld\n", + (long long)II_FL_VARREC); + LASSERTF(II_FL_NONUNQ == 8, "found %lld\n", + (long long)II_FL_NONUNQ); + + /* Checks for struct niobuf_remote */ + LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", + (long long)(int)sizeof(struct niobuf_remote)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_offset)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_len)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_len)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_flags)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_flags)); + LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", + OBD_BRW_READ); + LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", + OBD_BRW_WRITE); + LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", + OBD_BRW_SYNC); + 
LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", + OBD_BRW_CHECK); + LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", + OBD_BRW_FROM_GRANT); + LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", + OBD_BRW_GRANTED); + LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", + OBD_BRW_NOCACHE); + LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", + OBD_BRW_NOQUOTA); + LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", + OBD_BRW_SRVLOCK); + LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", + OBD_BRW_ASYNC); + LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", + OBD_BRW_MEMALLOC); + LASSERTF(OBD_BRW_OVER_USRQUOTA == 0x1000, "found 0x%.8x\n", + OBD_BRW_OVER_USRQUOTA); + LASSERTF(OBD_BRW_OVER_GRPQUOTA == 0x2000, "found 0x%.8x\n", + OBD_BRW_OVER_GRPQUOTA); + LASSERTF(OBD_BRW_SOFT_SYNC == 0x4000, "found 0x%.8x\n", + OBD_BRW_SOFT_SYNC); + + /* Checks for struct ost_body */ + LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", + (long long)(int)sizeof(struct ost_body)); + LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_body, oa)); + LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ost_body *)0)->oa)); + + /* Checks for struct ll_fid */ + LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", + (long long)(int)sizeof(struct ll_fid)); + LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, id)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->id)); + LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, generation)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->generation)); + LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, f_type)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + + /* Checks for struct mdt_body */ + LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", + (long long)(int)sizeof(struct mdt_body)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid1)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_valid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_size)); + LASSERTF((int)sizeof(((struct 
mdt_body *)0)->mbo_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mtime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_atime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_ctime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_t_state)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_t_state)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsuid) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsuid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_capability)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mode)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_flags)); + LASSERTF((int)offsetof(struct mdt_body, mbo_rdev) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, 
mbo_rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_rdev)); + LASSERTF((int)offsetof(struct mdt_body, mbo_nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); + LASSERTF((int)offsetof(struct mdt_body, mbo_unused2) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_unused2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_suppgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_aclsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_unused3) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_unused3)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused3)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_projid) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_projid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_projid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_6) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_6)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_6)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_7) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_7)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body 
*)0)->mbo_padding_7)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_8)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_9)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", + MDS_OPEN_CROSS); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); + LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NODUMP_FL == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NODUMP_FL); + LASSERTF(LUSTRE_NOATIME_FL == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_INDEX_FL == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INDEX_FL); + LASSERTF(LUSTRE_ORPHAN_FL == 0x00002000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_ORPHAN_FL); + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DIRSYNC_FL); + LASSERTF(LUSTRE_TOPDIR_FL == 
0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_TOPDIR_FL); + LASSERTF(LUSTRE_DIRECTIO_FL == 0x00100000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DIRECTIO_FL); + LASSERTF(LUSTRE_INLINE_DATA_FL == 0x10000000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_INLINE_DATA_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", + MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", + MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", + MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", + MDS_INODELOCK_LAYOUT); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_unused2)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_padding)); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + 
LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_projid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_projid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_projid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 
4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, 
cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct 
mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename 
*)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename 
*)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, 
"found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); + 
LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + + /* Checks for struct mdt_rec_reint */ + LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found 
%lld\n", + (long long)(int)sizeof(struct mdt_rec_reint)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n", + 
(long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_size)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); + + /* Checks for struct lmv_desc */ + LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lmv_desc)); + LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, 
ld_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4)); + LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); + + /* Checks for struct lov_desc */ + LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lov_desc)); + LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count)); + 
LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_0)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid)); + CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C); + + /* Checks for struct ldlm_res_id */ + LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_res_id)); + CLASSERT(RES_NAME_SIZE == 4); + LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_res_id, name[4])); + LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4])); + + /* Checks for struct ldlm_extent */ + 
LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_extent)); + LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, start)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->start)); + LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, end)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->end)); + LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, gid)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); + + /* Checks for struct ldlm_inodebits */ + LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_inodebits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + + /* Checks for struct ldlm_flock_wire */ + LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_flock_wire)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid)); + + /* Checks for struct ldlm_intent */ + LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_intent)); + LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_intent, opc)); + LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_intent *)0)->opc)); + LASSERTF(IT_OPEN == 1, "found %lld\n", + (long long)IT_OPEN); + LASSERTF(IT_CREAT == 2, "found %lld\n", + (long 
long)IT_CREAT); + LASSERTF(IT_READDIR == 4, "found %lld\n", + (long long)IT_READDIR); + LASSERTF(IT_GETATTR == 8, "found %lld\n", + (long long)IT_GETATTR); + LASSERTF(IT_LOOKUP == 16, "found %lld\n", + (long long)IT_LOOKUP); + LASSERTF(IT_UNLINK == 32, "found %lld\n", + (long long)IT_UNLINK); + LASSERTF(IT_TRUNC == 64, "found %lld\n", + (long long)IT_TRUNC); + LASSERTF(IT_GETXATTR == 128, "found %lld\n", + (long long)IT_GETXATTR); + LASSERTF(IT_EXEC == 256, "found %lld\n", + (long long)IT_EXEC); + LASSERTF(IT_PIN == 512, "found %lld\n", + (long long)IT_PIN); + LASSERTF(IT_LAYOUT == 1024, "found %lld\n", + (long long)IT_LAYOUT); + LASSERTF(IT_QUOTA_DQACQ == 2048, "found %lld\n", + (long long)IT_QUOTA_DQACQ); + LASSERTF(IT_QUOTA_CONN == 4096, "found %lld\n", + (long long)IT_QUOTA_CONN); + LASSERTF(IT_SETXATTR == 8192, "found %lld\n", + (long long)IT_SETXATTR); + + /* Checks for struct ldlm_resource_desc */ + LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", + (long long)(int)sizeof(struct ldlm_resource_desc)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_pad) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_pad)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_pad)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name)); + + /* Checks for struct ldlm_lock_desc */ + LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(struct ldlm_lock_desc)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_resource)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data)); + + /* Checks for struct ldlm_request */ + LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n", + (long long)(int)sizeof(struct ldlm_request)); + LASSERTF((int)offsetof(struct 
ldlm_request, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_count)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count)); + LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle)); + + /* Checks for struct ldlm_reply */ + LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n", + (long long)(int)sizeof(struct ldlm_reply)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2)); + + /* Checks for struct ost_lvb_v1 */ + LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb_v1)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb_v1, 
lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks)); + + /* Checks for struct ost_lvb */ + LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, 
lvb_padding)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding)); + + /* Checks for struct lquota_lvb */ + LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n", + (long long)(int)sizeof(struct lquota_lvb)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_flags)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_pad1)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1)); + LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n", + (long long)LQUOTA_FL_EDQUOT); + + /* Checks for struct ldlm_gl_lquota_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_lquota_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2)); + + /* Checks for struct ldlm_gl_barrier_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_barrier_desc) == 16, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_barrier_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_status)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_status)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_timeout)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_timeout)); + LASSERTF((int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_barrier_desc, lgbd_padding)); + LASSERTF((int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_barrier_desc *)0)->lgbd_padding)); + + /* Checks for struct barrier_lvb */ + LASSERTF((int)sizeof(struct barrier_lvb) == 16, "found %lld\n", + (long long)(int)sizeof(struct barrier_lvb)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_status)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_status)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_index)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_index)); + LASSERTF((int)offsetof(struct barrier_lvb, lvb_padding) == 8, "found %lld\n", + (long long)(int)offsetof(struct barrier_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct barrier_lvb *)0)->lvb_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct barrier_lvb *)0)->lvb_padding)); + + /* Checks for struct mgs_send_param */ + LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n", + (long long)(int)sizeof(struct mgs_send_param)); + CLASSERT(MGS_PARAM_MAXLEN == 1024); + LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n", + (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024])); + LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024])); + + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n", + (long long)(int)sizeof(struct 
cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_padding)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + + /* Checks for struct llog_logid */ + LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n", + (long long)(int)sizeof(struct llog_logid)); + LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_oi)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi)); + LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_ogen)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen)); + CLASSERT(OST_SZ_REC == 274730752); + CLASSERT(MDS_UNLINK_REC == 274801668); + CLASSERT(MDS_UNLINK64_REC == 275325956); + CLASSERT(MDS_SETATTR64_REC == 275325953); + CLASSERT(OBD_CFG_REC == 274857984); + CLASSERT(LLOG_GEN_REC == 274989056); + CLASSERT(CHANGELOG_REC == 275120128); + CLASSERT(CHANGELOG_USER_REC == 275185664); + CLASSERT(LLOG_HDR_MAGIC == 275010873); + CLASSERT(LLOG_LOGID_MAGIC == 275010875); + + /* Checks for struct llog_catid */ + LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n", + (long long)(int)sizeof(struct 
llog_catid)); + LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_logid)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); + + /* Checks for struct llog_rec_hdr */ + LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_hdr)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_len)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_index)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_id)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id)); + + /* Checks for struct llog_rec_tail */ + LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_tail)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_len)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_index)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index)); + + /* Checks for struct llog_logid_rec */ + LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_logid_rec)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_hdr)); + LASSERTF((int)sizeof(((struct 
llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_id)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail)); + + /* Checks for struct llog_unlink_rec */ + LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink_rec)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oid)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_unlink64_rec */ + LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink64_rec)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n", + (long 
long)(int)offsetof(struct llog_unlink64_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3)); + + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct 
llog_setattr64_rec, lsr_gid) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_tail) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail)); + + /* Checks for struct llog_size_change_rec */ + LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_size_change_rec)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3)); + 
LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail)); + + /* Checks for struct changelog_rec */ + LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct changelog_rec)); + LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); + + /* Checks for struct changelog_ext_rename */ + LASSERTF((int)sizeof(struct changelog_ext_rename) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_rename)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_sfid) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rename, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rename, cr_spfid) == 16, "found %lld\n", + (long 
long)(int)offsetof(struct changelog_ext_rename, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rename *)0)->cr_spfid)); + + /* Checks for struct changelog_ext_jobid */ + LASSERTF((int)sizeof(struct changelog_ext_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_jobid)); + LASSERTF((int)offsetof(struct changelog_ext_jobid, cr_jobid) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_jobid, cr_jobid)); + LASSERTF((int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_jobid *)0)->cr_jobid)); + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_do_not_use) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_do_not_use)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use)); + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct 
llog_changelog_user_rec, cur_padding)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + 
(long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long 
long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); + CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); + CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); + CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); + CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); + CLASSERT(LLOG_ORIGIN_CONNECT == 506); + CLASSERT(LLOG_CATINFO == 507); + CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); + CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); + CLASSERT(LLOG_FIRST_OPC == 501); + CLASSERT(LLOG_LAST_OPC == 510); + CLASSERT(LLOG_CONFIG_ORIG_CTXT == 0); + CLASSERT(LLOG_CONFIG_REPL_CTXT == 1); + CLASSERT(LLOG_MDS_OST_ORIG_CTXT == 2); + CLASSERT(LLOG_MDS_OST_REPL_CTXT == 3); + CLASSERT(LLOG_SIZE_ORIG_CTXT == 4); + CLASSERT(LLOG_SIZE_REPL_CTXT == 5); + CLASSERT(LLOG_TEST_ORIG_CTXT == 8); + CLASSERT(LLOG_TEST_REPL_CTXT == 9); + CLASSERT(LLOG_CHANGELOG_ORIG_CTXT == 12); + CLASSERT(LLOG_CHANGELOG_REPL_CTXT == 13); + CLASSERT(LLOG_CHANGELOG_USER_ORIG_CTXT == 14); + CLASSERT(LLOG_AGENT_ORIG_CTXT == 15); + CLASSERT(LLOG_UPDATELOG_ORIG_CTXT == 16); + CLASSERT(LLOG_UPDATELOG_REPL_CTXT == 17); + CLASSERT(LLOG_MAX_CTXTS == 18); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_name[8])); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_fiemap) == 216, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap)); + + /* Checks for struct 
quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long 
long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct mgs_nidtbl_entry */ + LASSERTF((int)sizeof(struct mgs_nidtbl_entry) == 24, "found %lld\n", + (long long)(int)sizeof(struct mgs_nidtbl_entry)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_version)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_version)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, 
mne_instance) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_instance)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_instance)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_index) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_index)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_index)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_length)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_length)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_type) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_type) == 21, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_type)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_type)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_size) == 22, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_size)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_size)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, mne_nid_count) == 23, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, mne_nid_count)); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->mne_nid_count)); + LASSERTF((int)offsetof(struct mgs_nidtbl_entry, u.nids[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_nidtbl_entry, u.nids[0])); + LASSERTF((int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_nidtbl_entry *)0)->u.nids[0])); + + /* Checks for struct mgs_config_body */ + LASSERTF((int)sizeof(struct mgs_config_body) == 80, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_body)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_name) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_name)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_name) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_name)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_offset) == 64, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_offset)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_offset)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_type) == 72, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_type)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_type)); + 
LASSERTF((int)offsetof(struct mgs_config_body, mcb_nm_cur_pass) == 74, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_nm_cur_pass)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_nm_cur_pass)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_bits) == 75, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_bits)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_bits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_bits)); + LASSERTF((int)offsetof(struct mgs_config_body, mcb_units) == 76, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_body, mcb_units)); + LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_units) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_units)); + CLASSERT(CONFIG_T_CONFIG == 0); + CLASSERT(CONFIG_T_SPTLRPC == 1); + CLASSERT(CONFIG_T_RECOVER == 2); + CLASSERT(CONFIG_T_PARAMS == 3); + CLASSERT(CONFIG_T_NODEMAP == 4); + CLASSERT(CONFIG_T_BARRIER == 5); + + /* Checks for struct mgs_config_res */ + LASSERTF((int)sizeof(struct mgs_config_res) == 16, "found %lld\n", + (long long)(int)sizeof(struct mgs_config_res)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_offset)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_offset)); + LASSERTF((int)offsetof(struct mgs_config_res, mcr_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_config_res, mcr_size)); + LASSERTF((int)sizeof(((struct mgs_config_res *)0)->mcr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mgs_config_res *)0)->mcr_size)); + + /* Checks for struct lustre_capa */ + LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa)); + LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_fid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_opc)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc)); + LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_uid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_gid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_flags)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags)); + LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, 
lc_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_timeout)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout)); + LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_expiry)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry)); + CLASSERT(CAPA_HMAC_MAX_LEN == 64); + LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_hmac[64])); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64])); + + /* Checks for struct lustre_capa_key */ + LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa_key)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_seq)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_padding)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding)); + CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_key[56])); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56])); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + 
LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_u.gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_u.gf_path[0])); + + /* Checks for struct fiemap */ + LASSERTF((int)sizeof(struct fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct fiemap)); + LASSERTF((int)offsetof(struct fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_start)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_length)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct fiemap, fm_extents)); + LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extents) == 0, "found %lld\n", + (long long)(int)sizeof(((struct fiemap *)0)->fm_extents)); + CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001); + CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002); + CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); + + /* Checks for struct fiemap_extent */ + LASSERTF((int)sizeof(struct fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct fiemap_extent)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long 
long)(int)sizeof(((struct fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct fiemap_extent, fe_reserved[0]) == 44, "found %lld\n", + (long long)(int)offsetof(struct fiemap_extent, fe_reserved[0])); + LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0])); + CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); + CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); + CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); + CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); + CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); + CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); + CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); + CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); + CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_tag)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_perm)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_id)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id)); +#endif /* CONFIG_FS_POSIX_ACL */ + +#ifdef CONFIG_FS_POSIX_ACL + /* Checks for type posix_acl_xattr_header */ + LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_header)); + LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_version)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version)); +#ifndef HAVE_STRUCT_POSIX_ACL_XATTR + LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_entries)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries)); +#endif /* HAVE_STRUCT_POSIX_ACL_XATTR */ +#endif /* CONFIG_FS_POSIX_ACL */ + 
+ /* Checks for struct link_ea_header */ + LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n", + (long long)(int)sizeof(struct link_ea_header)); + LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_magic)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic)); + LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_reccount)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount)); + LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_len)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len)); + LASSERTF((int)offsetof(struct link_ea_header, leh_overflow_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_overflow_time)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_overflow_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_overflow_time)); + LASSERTF((int)offsetof(struct link_ea_header, leh_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_padding)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_padding)); + CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL); + + /* Checks for struct link_ea_entry */ + LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n", + (long long)(int)sizeof(struct link_ea_entry)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_reclen)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_name)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name)); + + /* Checks for struct layout_intent */ + LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n", + (long long)(int)sizeof(struct layout_intent)); + LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_opc)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_opc)); + LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_flags)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); + 
LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_start)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_start)); + LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_end)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_end)); + LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", + (long long)LAYOUT_INTENT_ACCESS); + LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", + (long long)LAYOUT_INTENT_READ); + LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n", + (long long)LAYOUT_INTENT_WRITE); + LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n", + (long long)LAYOUT_INTENT_GLIMPSE); + LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n", + (long long)LAYOUT_INTENT_TRUNC); + LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n", + (long long)LAYOUT_INTENT_RELEASE); + LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n", + (long long)LAYOUT_INTENT_RESTORE); + + /* Checks for struct hsm_action_item */ + LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_item)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_len)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_action)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_fid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_dfid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_extent)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_cookie)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_gid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n", + 
(long long)(int)offsetof(struct hsm_action_item, hai_data)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data)); + + /* Checks for struct hsm_action_list */ + LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_list)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_version)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_count)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_compound_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_flags)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_archive_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id)); + LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, padding1)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_fsname)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname)); + + /* Checks for struct hsm_progress */ + LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress)); + LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_fid)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid)); + LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie)); + LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_extent)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress 
*)0)->hp_extent)); + LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_flags)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags)); + LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_errval)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval)); + LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, padding)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->padding)); + LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n", + HP_FLAG_COMPLETED); + LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n", + HP_FLAG_RETRY); + + LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_data_version)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version)); + LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_flags)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags)); + LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_errval)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval)); + LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, padding)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->padding)); + LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_hai)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai)); + + /* Checks for struct hsm_progress_kernel */ + LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress_kernel)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent)); + 
LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2)); + + /* Checks for struct hsm_user_item */ + LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_item)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_fid)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_extent)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent)); + + /* Checks for struct hsm_user_state */ + LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_states)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state)); + LASSERTF((int)offsetof(struct 
hsm_user_state, hus_in_progress_action) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location)); + + /* Checks for struct hsm_state_set */ + LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_state_set)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_valid)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_archive_id)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_setmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_clearmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask)); + + /* Checks for struct hsm_current_action */ + LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_current_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_state)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_action)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_location)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location)); + + /* Checks for struct hsm_request */ + LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_request)); + LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_action)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, 
"found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_action)); + LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_archive_id)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id)); + LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_flags)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags)); + LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_itemcount)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount)); + LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_data_len)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len)); + LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)HSM_FORCE_ACTION); + LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)HSM_GHOST_COPY); + + /* Checks for struct hsm_user_request */ + LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_request)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_user_item)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item)); + + /* Checks for struct hsm_user_import */ + LASSERTF((int)sizeof(struct hsm_user_import) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_import)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_size)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_size)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_uid)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_uid)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_gid) == 36, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_gid)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_gid)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mode)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mode) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct hsm_user_import *)0)->hui_mode)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_atime) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_atime)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_atime_ns) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_atime_ns)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mtime)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_mtime_ns) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_mtime_ns)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns)); + LASSERTF((int)offsetof(struct hsm_user_import, hui_archive_id) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_import, hui_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_import *)0)->hui_archive_id)); + + /* Checks for struct object_update_param */ + LASSERTF((int)sizeof(struct object_update_param) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_param)); + LASSERTF((int)offsetof(struct object_update_param, oup_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_len)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_len) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_len)); + LASSERTF((int)offsetof(struct object_update_param, oup_padding) == 2, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_padding)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding)); + LASSERTF((int)offsetof(struct object_update_param, oup_padding2) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_padding2)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_padding2)); + LASSERTF((int)offsetof(struct object_update_param, oup_buf) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_param, oup_buf)); + LASSERTF((int)sizeof(((struct object_update_param *)0)->oup_buf) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_param *)0)->oup_buf)); + + /* Checks for struct object_update */ + LASSERTF((int)sizeof(struct object_update) == 40, "found %lld\n", + (long long)(int)sizeof(struct object_update)); + LASSERTF((int)offsetof(struct object_update, ou_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_type)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_type)); + LASSERTF((int)offsetof(struct 
object_update, ou_params_count) == 2, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_params_count)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_params_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_params_count)); + LASSERTF((int)offsetof(struct object_update, ou_result_size) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_result_size)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_result_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_result_size)); + LASSERTF((int)offsetof(struct object_update, ou_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_flags)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_flags)); + LASSERTF((int)offsetof(struct object_update, ou_padding1) == 12, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_padding1)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_padding1)); + LASSERTF((int)offsetof(struct object_update, ou_batchid) == 16, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_batchid)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_batchid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_batchid)); + LASSERTF((int)offsetof(struct object_update, ou_fid) == 24, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_fid)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_fid)); + LASSERTF((int)offsetof(struct object_update, ou_params) == 40, "found %lld\n", + (long long)(int)offsetof(struct object_update, ou_params)); + LASSERTF((int)sizeof(((struct object_update *)0)->ou_params) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update *)0)->ou_params)); + + /* Checks for struct object_update_request */ + LASSERTF((int)sizeof(struct object_update_request) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_request)); + LASSERTF((int)offsetof(struct object_update_request, ourq_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_magic)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_magic)); + LASSERTF((int)offsetof(struct object_update_request, ourq_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_count)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_count)); + LASSERTF((int)offsetof(struct object_update_request, ourq_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_padding)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_request *)0)->ourq_padding)); + LASSERTF((int)offsetof(struct object_update_request, ourq_updates) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_request, ourq_updates)); + LASSERTF((int)sizeof(((struct object_update_request *)0)->ourq_updates) == 0, "found %lld\n", + (long long)(int)sizeof(((struct 
object_update_request *)0)->ourq_updates)); + + /* Checks for struct object_update_result */ + LASSERTF((int)sizeof(struct object_update_result) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_result)); + LASSERTF((int)offsetof(struct object_update_result, our_rc) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_rc)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_rc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_rc)); + LASSERTF((int)offsetof(struct object_update_result, our_datalen) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_datalen)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_datalen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_datalen)); + LASSERTF((int)offsetof(struct object_update_result, our_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_padding)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_padding)); + LASSERTF((int)offsetof(struct object_update_result, our_data) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_result, our_data)); + LASSERTF((int)sizeof(((struct object_update_result *)0)->our_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_result *)0)->our_data)); + + /* Checks for struct object_update_reply */ + LASSERTF((int)sizeof(struct object_update_reply) == 8, "found %lld\n", + (long long)(int)sizeof(struct object_update_reply)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_magic)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_magic)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_count)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_count)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_padding) == 6, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_padding)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_padding) == 2, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_padding)); + LASSERTF((int)offsetof(struct object_update_reply, ourp_lens) == 8, "found %lld\n", + (long long)(int)offsetof(struct object_update_reply, ourp_lens)); + LASSERTF((int)sizeof(((struct object_update_reply *)0)->ourp_lens) == 0, "found %lld\n", + (long long)(int)sizeof(((struct object_update_reply *)0)->ourp_lens)); + + /* Checks for struct out_update_header */ + LASSERTF((int)sizeof(struct out_update_header) == 16, "found %lld\n", + (long long)(int)sizeof(struct out_update_header)); + LASSERTF((int)offsetof(struct out_update_header, ouh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_magic)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_magic)); + LASSERTF((int)offsetof(struct out_update_header, ouh_count) == 4, "found %lld\n", + (long 
long)(int)offsetof(struct out_update_header, ouh_count)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_count)); + LASSERTF((int)offsetof(struct out_update_header, ouh_inline_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_inline_length)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_length) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_length)); + LASSERTF((int)offsetof(struct out_update_header, ouh_reply_size) == 12, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_reply_size)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_reply_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_reply_size)); + LASSERTF((int)offsetof(struct out_update_header, ouh_inline_data) == 16, "found %lld\n", + (long long)(int)offsetof(struct out_update_header, ouh_inline_data)); + LASSERTF((int)sizeof(((struct out_update_header *)0)->ouh_inline_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct out_update_header *)0)->ouh_inline_data)); + + /* Checks for struct out_update_buffer */ + LASSERTF((int)sizeof(struct out_update_buffer) == 8, "found %lld\n", + (long long)(int)sizeof(struct out_update_buffer)); + LASSERTF((int)offsetof(struct out_update_buffer, oub_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct out_update_buffer, oub_size)); + LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_size)); + LASSERTF((int)offsetof(struct out_update_buffer, oub_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct out_update_buffer, oub_padding)); + LASSERTF((int)sizeof(((struct out_update_buffer *)0)->oub_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct out_update_buffer *)0)->oub_padding)); + + /* Checks for struct nodemap_cluster_rec */ + LASSERTF((int)sizeof(struct nodemap_cluster_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_cluster_rec)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_name) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_name)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name) == 17, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_name)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_flags) == 17, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_flags)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_flags)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_padding1) == 18, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding1)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_padding2) == 20, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_padding2)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, 
ncr_squash_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_uid)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_uid)); + LASSERTF((int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid) == 28, "found %lld\n", + (long long)(int)offsetof(struct nodemap_cluster_rec, ncr_squash_gid)); + LASSERTF((int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_cluster_rec *)0)->ncr_squash_gid)); + + /* Checks for struct nodemap_range_rec */ + LASSERTF((int)sizeof(struct nodemap_range_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_range_rec)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_start_nid) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_start_nid)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_start_nid)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_end_nid) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_end_nid)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_end_nid)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding1) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding1)); + LASSERTF((int)offsetof(struct nodemap_range_rec, nrr_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_range_rec, nrr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_range_rec *)0)->nrr_padding2)); + + /* Checks for struct nodemap_id_rec */ + LASSERTF((int)sizeof(struct nodemap_id_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_id_rec)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_id_fs) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_id_fs)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_id_fs)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding1) == 4, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding1)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding1)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding2) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding2)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding2)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding3) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding3)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding3)); + LASSERTF((int)offsetof(struct nodemap_id_rec, nir_padding4) == 24, "found 
%lld\n", + (long long)(int)offsetof(struct nodemap_id_rec, nir_padding4)); + LASSERTF((int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_id_rec *)0)->nir_padding4)); + + /* Checks for struct nodemap_global_rec */ + LASSERTF((int)sizeof(struct nodemap_global_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct nodemap_global_rec)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_is_active) == 0, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_is_active)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_is_active)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding1) == 1, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding1)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1) == 1, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding1)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding2) == 2, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding2)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2) == 2, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding2)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding3) == 4, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding3)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding3)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding4) == 8, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding4)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding4)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding5) == 16, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding5)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding5)); + LASSERTF((int)offsetof(struct nodemap_global_rec, ngr_padding6) == 24, "found %lld\n", + (long long)(int)offsetof(struct nodemap_global_rec, ngr_padding6)); + LASSERTF((int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct nodemap_global_rec *)0)->ngr_padding6)); + + /* Checks for union nodemap_rec */ + LASSERTF((int)sizeof(union nodemap_rec) == 32, "found %lld\n", + (long long)(int)sizeof(union nodemap_rec)); + + /* Checks for struct lfsck_request */ + LASSERTF((int)sizeof(struct lfsck_request) == 96, "found %lld\n", + (long long)(int)sizeof(struct lfsck_request)); + LASSERTF((int)offsetof(struct lfsck_request, lr_event) == 0, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_event)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_event) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_event)); + LASSERTF((int)offsetof(struct lfsck_request, lr_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_index)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
lfsck_request *)0)->lr_index)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); + LASSERTF((int)offsetof(struct lfsck_request, lr_valid) == 12, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_valid)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_valid)); + LASSERTF((int)offsetof(struct lfsck_request, lr_speed) == 16, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_speed)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_speed) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_speed)); + LASSERTF((int)offsetof(struct lfsck_request, lr_version) == 20, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_version)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_version) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_version)); + LASSERTF((int)offsetof(struct lfsck_request, lr_active) == 22, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_active)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_active) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_active)); + LASSERTF((int)offsetof(struct lfsck_request, lr_param) == 24, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_param)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_param) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_param)); + LASSERTF((int)offsetof(struct lfsck_request, lr_async_windows) == 26, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_async_windows)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_async_windows) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_async_windows)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); + LASSERTF((int)offsetof(struct lfsck_request, lr_fid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_fid)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid)); + LASSERTF((int)offsetof(struct lfsck_request, lr_fid2) == 48, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_fid2)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_fid2)); + LASSERTF((int)offsetof(struct lfsck_request, lr_comp_id) == 64, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_comp_id)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_comp_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_comp_id)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_0) == 68, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_0)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct 
lfsck_request *)0)->lr_padding_0)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_1) == 72, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_1)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_2)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_2)); + LASSERTF((int)offsetof(struct lfsck_request, lr_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_padding_3)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3)); + LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_SCRUB); + LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_LAYOUT); + LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LFSCK_TYPE_NAMESPACE); + LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n", + (long long)LE_LASTID_REBUILDING); + LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n", + (long long)LE_LASTID_REBUILT); + LASSERTF(LE_PHASE1_DONE == 3, "found %lld\n", + (long long)LE_PHASE1_DONE); + LASSERTF(LE_PHASE2_DONE == 4, "found %lld\n", + (long long)LE_PHASE2_DONE); + LASSERTF(LE_START == 5, "found %lld\n", + (long long)LE_START); + LASSERTF(LE_STOP == 6, "found %lld\n", + (long long)LE_STOP); + LASSERTF(LE_QUERY == 7, "found %lld\n", + (long long)LE_QUERY); + LASSERTF(LE_PEER_EXIT == 9, "found %lld\n", + (long long)LE_PEER_EXIT); + LASSERTF(LE_CONDITIONAL_DESTROY == 10, "found %lld\n", + (long long)LE_CONDITIONAL_DESTROY); + LASSERTF(LE_PAIRS_VERIFY == 11, "found %lld\n", + (long long)LE_PAIRS_VERIFY); + LASSERTF(LE_SET_LMV_MASTER == 15, "found %lld\n", + (long long)LE_SET_LMV_MASTER); + LASSERTF(LE_SET_LMV_SLAVE == 16, "found %lld\n", + (long long)LE_SET_LMV_SLAVE); + LASSERTF(LEF_TO_OST == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LEF_TO_OST); + LASSERTF(LEF_FROM_OST == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LEF_FROM_OST); + LASSERTF(LEF_SET_LMV_HASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_HASH); + LASSERTF(LEF_SET_LMV_ALL == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_ALL); + LASSERTF(LEF_RECHECK_NAME_HASH == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)LEF_RECHECK_NAME_HASH); + LASSERTF(LEF_QUERY_ALL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)LEF_QUERY_ALL); + + /* Checks for struct lfsck_reply */ + LASSERTF((int)sizeof(struct lfsck_reply) == 16, "found %lld\n", + (long long)(int)sizeof(struct lfsck_reply)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_status) == 0, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_status)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_status)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_padding_1) == 4, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_padding_1)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_reply, lr_repaired) == 
8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_reply, lr_repaired)); + LASSERTF((int)sizeof(((struct lfsck_reply *)0)->lr_repaired) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_reply *)0)->lr_repaired)); + + /* Checks for struct update_params */ + LASSERTF((int)sizeof(struct update_params) == 0, "found %lld\n", + (long long)(int)sizeof(struct update_params)); + LASSERTF((int)offsetof(struct update_params, up_params) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_params, up_params)); + LASSERTF((int)sizeof(((struct update_params *)0)->up_params) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_params *)0)->up_params)); + + /* Checks for struct update_op */ + LASSERTF((int)sizeof(struct update_op) == 24, "found %lld\n", + (long long)(int)sizeof(struct update_op)); + LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_fid)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_fid)); + LASSERTF((int)offsetof(struct update_op, uop_type) == 16, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_type)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_type)); + LASSERTF((int)offsetof(struct update_op, uop_param_count) == 18, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_param_count)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_param_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_param_count)); + LASSERTF((int)offsetof(struct update_op, uop_params_off) == 20, "found %lld\n", + (long long)(int)offsetof(struct update_op, uop_params_off)); + LASSERTF((int)sizeof(((struct update_op *)0)->uop_params_off) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_op *)0)->uop_params_off)); + + /* Checks for struct update_ops */ + LASSERTF((int)sizeof(struct update_ops) == 0, "found %lld\n", + (long long)(int)sizeof(struct update_ops)); + LASSERTF((int)offsetof(struct update_ops, uops_op) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_ops, uops_op)); + LASSERTF((int)sizeof(((struct update_ops *)0)->uops_op) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_ops *)0)->uops_op)); + + /* Checks for struct update_records */ + LASSERTF((int)sizeof(struct update_records) == 32, "found %lld\n", + (long long)(int)sizeof(struct update_records)); + LASSERTF((int)offsetof(struct update_records, ur_master_transno) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_master_transno)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_master_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_master_transno)); + LASSERTF((int)offsetof(struct update_records, ur_batchid) == 8, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_batchid)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_batchid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_batchid)); + LASSERTF((int)offsetof(struct update_records, ur_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_flags)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_flags)); + LASSERTF((int)offsetof(struct update_records, ur_index) == 
20, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_index)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_index)); + LASSERTF((int)offsetof(struct update_records, ur_update_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_update_count)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_update_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_update_count)); + LASSERTF((int)offsetof(struct update_records, ur_param_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct update_records, ur_param_count)); + LASSERTF((int)sizeof(((struct update_records *)0)->ur_param_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_records *)0)->ur_param_count)); + LASSERTF(UPDATE_RECORD_CONTINUE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)UPDATE_RECORD_CONTINUE); + + /* Checks for struct llog_update_record */ + LASSERTF((int)sizeof(struct llog_update_record) == 48, "found %lld\n", + (long long)(int)sizeof(struct llog_update_record)); + LASSERTF((int)offsetof(struct llog_update_record, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_update_record, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_update_record *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_update_record, lur_update_rec) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_update_record, lur_update_rec)); + LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n", + (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec)); + + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise 
*)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + + /* Checks for struct ladvise_hdr */ + LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n", + LADVISE_MAGIC); + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LF_ASYNC); +} diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c new file mode 100644 index 0000000000000..6145e0e37a711 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -0,0 +1,416 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * + * lustre/target/barrier.c + * + * Currently, the Lustre barrier is implemented as write barrier on all MDTs. + * For each MDT in the system, when it starts, it registers a barrier instance + * that will be used in handling subsequent barrier requests. + * + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_SNAPSHOT + +#include + +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(barrier_instance_list); +static DEFINE_SPINLOCK(barrier_instance_lock); + +struct barrier_instance { + struct list_head bi_link; + struct dt_device *bi_bottom; + struct dt_device *bi_next; + wait_queue_head_t bi_waitq; + rwlock_t bi_rwlock; + struct percpu_counter bi_writers; + atomic_t bi_ref; + time_t bi_deadline; + __u32 bi_status; +}; + +static inline char *barrier_barrier2name(struct barrier_instance *barrier) +{ + return barrier->bi_bottom->dd_lu_dev.ld_obd->obd_name; +} + +static inline __u32 barrier_dev_idx(struct barrier_instance *barrier) +{ + return lu_site2seq(barrier->bi_bottom->dd_lu_dev.ld_site)->ss_node_id; +} + +static void barrier_instance_cleanup(struct barrier_instance *barrier) +{ + LASSERT(list_empty(&barrier->bi_link)); + + percpu_counter_destroy(&barrier->bi_writers); + OBD_FREE_PTR(barrier); +} + +static inline void barrier_instance_put(struct barrier_instance *barrier) +{ + if (atomic_dec_and_test(&barrier->bi_ref)) + barrier_instance_cleanup(barrier); +} + +static struct barrier_instance * +barrier_instance_find_locked(struct dt_device *key) +{ + struct barrier_instance *barrier; + + list_for_each_entry(barrier, &barrier_instance_list, bi_link) { + if (barrier->bi_bottom == key) + return barrier; + } + + return NULL; +} + +static void barrier_instance_add(struct barrier_instance *barrier) +{ + struct barrier_instance *tmp; + + spin_lock(&barrier_instance_lock); + tmp = barrier_instance_find_locked(barrier->bi_bottom); + LASSERT(!tmp); + + list_add_tail(&barrier->bi_link, &barrier_instance_list); + spin_unlock(&barrier_instance_lock); +} + +static struct barrier_instance *barrier_instance_find(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + atomic_inc(&barrier->bi_ref); + spin_unlock(&barrier_instance_lock); + + return barrier; +} + +static void barrier_set(struct barrier_instance *barrier, __u32 status) +{ + if (barrier->bi_status != status) { + CDEBUG(D_SNAPSHOT, "%s: change barrier status from %u to %u\n", + barrier_barrier2name(barrier), + barrier->bi_status, status); + + barrier->bi_status = status; + } +} + +/** + * Create the barrier for the given instance. + * + * We use two-phases barrier to guarantee that after the barrier setup: + * 1) All the MDT side pending async modification have been flushed. + * 2) Any subsequent modification will be blocked. + * 3) All async transactions on the MDTs have been committed. 
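The guarantee in (2) above depends on server-side modification paths bracketing their work with the exported barrier_entry()/barrier_exit() helpers defined later in this file, so that bi_writers counts the in-flight changes that phase 1 waits to drain. A minimal sketch of such a caller, assuming access to the bottom dt_device; example_modify(), example_do_update() and the -EINPROGRESS return value are illustrative, not part of this interface:

/*
 * Illustrative only: any real modification handler is expected to
 * bracket its update with barrier_entry()/barrier_exit().
 */
static int example_do_update(const struct lu_env *env, struct dt_device *dt)
{
	/* stand-in for the actual on-disk change */
	return 0;
}

static int example_modify(const struct lu_env *env, struct dt_device *dt)
{
	int rc;

	if (!barrier_entry(dt))
		/* barrier is freezing or frozen: refuse new changes */
		return -EINPROGRESS;

	rc = example_do_update(env, dt);

	/* drops the in-flight count that phase 1 waits to drain */
	barrier_exit(dt);
	return rc;
}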
+ * + * For phase1, we do the following: + * + * Firstly, it sets barrier flag on the instance that will block subsequent + * modifications from clients. (Note: server sponsored modification will be + * allowed for flush pending modifications) + * + * Secondly, it will flush all pending modification via dt_sync(), such as + * async OST-object destroy, async OST-object owner changes, and so on. + * + * If there are some on-handling clients sponsored modifications during the + * barrier freezing, then related modifications may cause pending requests + * after the first dt_sync(), so call dt_sync() again after all on-handling + * modifications done. + * + * With the phase1 barrier set, all pending cross-servers modification have + * been flushed to remote servers, and any new modification will be blocked. + * But it does not guarantees that all the updates have been committed to + * storage on remote servers. So when all the instances have done phase1 + * barrier successfully, the MGS will notify all instances to do the phase2 + * barrier as following: + * + * Every barrier instance will call dt_sync() to make all async transactions + * to be committed locally. + * + * \param[in] env pointer to the thread context + * \param[in] barrier pointer to the barrier instance + * \param[in] phase1 indicate whether it is phase1 barrier or not + * + * \retval positive number for timeout + * \retval 0 for success + * \retval negative error number on failure + */ +static int barrier_freeze(const struct lu_env *env, + struct barrier_instance *barrier, bool phase1) +{ + int left; + int rc = 0; + __s64 inflight = 0; + ENTRY; + + write_lock(&barrier->bi_rwlock); + barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2); + + /* Avoid out-of-order execution the barrier_set() + * and the check of inflight modifications count. */ + smp_mb(); + + if (phase1) + inflight = percpu_counter_sum(&barrier->bi_writers); + write_unlock(&barrier->bi_rwlock); + + rc = dt_sync(env, barrier->bi_next); + if (rc) + RETURN(rc); + + LASSERT(barrier->bi_deadline != 0); + + left = barrier->bi_deadline - cfs_time_current_sec(); + if (left <= 0) + RETURN(1); + + if (phase1 && inflight != 0) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(left), + NULL, NULL); + + rc = l_wait_event(barrier->bi_waitq, + percpu_counter_sum(&barrier->bi_writers) == 0, + &lwi); + if (rc) + RETURN(1); + + /* sync again after all inflight modifications done. */ + rc = dt_sync(env, barrier->bi_next); + if (rc) + RETURN(rc); + + if (cfs_time_beforeq(barrier->bi_deadline, + cfs_time_current_sec())) + RETURN(1); + } + + CDEBUG(D_SNAPSHOT, "%s: barrier freezing %s done.\n", + barrier_barrier2name(barrier), phase1 ? 
"phase1" : "phase2"); + + if (!phase1) + barrier_set(barrier, BS_FROZEN); + + RETURN(0); +} + +void barrier_init(void) +{ +} + +void barrier_fini(void) +{ + LASSERT(list_empty(&barrier_instance_list)); +} + +bool barrier_entry(struct dt_device *key) +{ + struct barrier_instance *barrier; + bool entered = false; + ENTRY; + + barrier = barrier_instance_find(key); + if (unlikely(!barrier)) + /* Fail open */ + RETURN(true); + + read_lock(&barrier->bi_rwlock); + if (likely(barrier->bi_status != BS_FREEZING_P1 && + barrier->bi_status != BS_FREEZING_P2 && + barrier->bi_status != BS_FROZEN) || + cfs_time_beforeq(barrier->bi_deadline, cfs_time_current_sec())) { + percpu_counter_inc(&barrier->bi_writers); + entered = true; + } + read_unlock(&barrier->bi_rwlock); + + barrier_instance_put(barrier); + return entered; +} +EXPORT_SYMBOL(barrier_entry); + +void barrier_exit(struct dt_device *key) +{ + struct barrier_instance *barrier; + + barrier = barrier_instance_find(key); + if (likely(barrier)) { + percpu_counter_dec(&barrier->bi_writers); + + /* Avoid out-of-order execution the decreasing inflight + * modifications count and the check of barrier status. */ + smp_mb(); + + if (unlikely(barrier->bi_status == BS_FREEZING_P1)) + wake_up_all(&barrier->bi_waitq); + barrier_instance_put(barrier); + } +} +EXPORT_SYMBOL(barrier_exit); + +int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) +{ + struct ldlm_gl_barrier_desc *desc; + struct barrier_instance *barrier; + struct barrier_lvb *lvb; + struct lu_env env; + int rc = 0; + ENTRY; + + /* glimpse on barrier locks always packs a glimpse descriptor */ + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_DESC_CALLBACK); + desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); + if (!desc) + GOTO(out, rc = -EPROTO); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof(struct barrier_lvb)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + + lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB); + barrier = barrier_instance_find(key); + if (!barrier) + GOTO(out, rc = -ENODEV); + + rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc) + GOTO(out_barrier, rc); + + CDEBUG(D_SNAPSHOT, + "%s: handling barrier request: status %u, timeout %u\n", + barrier_barrier2name(barrier), + desc->lgbd_status, desc->lgbd_timeout); + + switch (desc->lgbd_status) { + case BS_RESCAN: + barrier_set(barrier, BS_INIT); + break; + case BS_FREEZING_P1: + case BS_FREEZING_P2: + if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) + GOTO(fini, rc = -EINVAL); + + barrier->bi_deadline = cfs_time_current_sec() + + desc->lgbd_timeout; + rc = barrier_freeze(&env, barrier, + desc->lgbd_status == BS_FREEZING_P1); + break; + case BS_THAWING: + case BS_FAILED: + case BS_EXPIRED: + barrier_set(barrier, BS_THAWED); + break; + default: + CWARN("%s: unexpected barrier status %u\n", + barrier_barrier2name(barrier), desc->lgbd_status); + rc = -EINVAL; + break; + } + + GOTO(fini, rc); + +fini: + lu_env_fini(&env); + +out_barrier: + if (rc < 0) + barrier_set(barrier, BS_FAILED); + else if (rc > 0) + barrier_set(barrier, BS_EXPIRED); + + lvb->lvb_status = barrier->bi_status; + lvb->lvb_index = barrier_dev_idx(barrier); + + CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " + "deadline %lu: rc = %d\n", barrier_barrier2name(barrier), + lvb->lvb_status, barrier->bi_deadline, rc); + + barrier_instance_put(barrier); + rc = 0; + +out: + req->rq_status = rc; + return rc; +} +EXPORT_SYMBOL(barrier_handler); + +int 
barrier_register(struct dt_device *key, struct dt_device *next) +{ + struct barrier_instance *barrier; + int rc; + ENTRY; + + OBD_ALLOC_PTR(barrier); + if (!barrier) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&barrier->bi_link); + barrier->bi_bottom = key; + barrier->bi_next = next; + init_waitqueue_head(&barrier->bi_waitq); + rwlock_init(&barrier->bi_rwlock); + atomic_set(&barrier->bi_ref, 1); +#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + rc = percpu_counter_init(&barrier->bi_writers, 0, GFP_KERNEL); +#else + rc = percpu_counter_init(&barrier->bi_writers, 0); +#endif + if (rc) + barrier_instance_cleanup(barrier); + else + barrier_instance_add(barrier); + + RETURN(rc); +} +EXPORT_SYMBOL(barrier_register); + +void barrier_deregister(struct dt_device *key) +{ + struct barrier_instance *barrier; + + spin_lock(&barrier_instance_lock); + barrier = barrier_instance_find_locked(key); + if (barrier) + list_del_init(&barrier->bi_link); + spin_unlock(&barrier_instance_lock); + + if (barrier) + barrier_instance_put(barrier); +} +EXPORT_SYMBOL(barrier_deregister); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c b/drivers/staging/lustrefsx/lustre/target/out_handler.c new file mode 100644 index 0000000000000..c342ae41f95c0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c @@ -0,0 +1,1186 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. + * + * lustre/target/out_handler.c + * + * Object update handler between targets. 
+ * + * Author: di.wang + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "tgt_internal.h" + +static inline void orr_cpu_to_le(struct out_read_reply *orr_dst, + const struct out_read_reply *orr_src) +{ + orr_dst->orr_size = cpu_to_le32(orr_src->orr_size); + orr_dst->orr_padding = cpu_to_le32(orr_src->orr_padding); + orr_dst->orr_offset = cpu_to_le64(orr_dst->orr_offset); +} + +static void out_reconstruct(const struct lu_env *env, struct dt_device *dt, + struct dt_object *obj, + struct object_update_reply *reply, + int index) +{ + CDEBUG(D_INFO, "%s: fork reply reply %p index %d: rc = %d\n", + dt_obd_name(dt), reply, index, 0); + + object_update_result_insert(reply, NULL, 0, index, 0); + return; +} + +typedef void (*out_reconstruct_t)(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *obj, + struct object_update_reply *reply, + int index); + +static inline int out_check_resent(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *obj, + struct ptlrpc_request *req, + out_reconstruct_t reconstruct, + struct object_update_reply *reply, + int index) +{ + if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) + return 0; + + if (req_xid_is_last(req)) { + struct lsd_client_data *lcd; + + /* XXX this does not support mulitple transactions yet, i.e. + * only 1 update RPC each time betwee MDTs */ + lcd = req->rq_export->exp_target_data.ted_lcd; + + req->rq_transno = lcd->lcd_last_transno; + req->rq_status = lcd->lcd_last_result; + if (req->rq_status != 0) + req->rq_transno = 0; + lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); + lustre_msg_set_status(req->rq_repmsg, req->rq_status); + + DEBUG_REQ(D_RPCTRACE, req, "restoring resent RPC"); + + reconstruct(env, dt, obj, reply, index); + return 1; + } + DEBUG_REQ(D_HA, req, "no reply for RESENT req (have %lld)", + req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); + return 0; +} + +static int out_create(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof; + struct obdo *lobdo = &tti->tti_u.update.tti_obdo; + struct lu_attr *attr = &tti->tti_attr; + struct lu_fid *fid = NULL; + struct obdo *wobdo; + size_t size; + int rc; + + ENTRY; + + wobdo = object_update_param_get(update, 0, &size); + if (IS_ERR(wobdo) || size != sizeof(*wobdo)) { + CERROR("%s: obdo is NULL, invalid RPC: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo)); + RETURN(PTR_ERR(wobdo)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_obdo(wobdo); + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof->dof_type = dt_mode_to_dft(attr->la_mode); + if (update->ou_params_count > 1) { + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_lu_fid(fid); + if (!fid_is_sane(fid)) { + CERROR("%s: invalid fid "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + } + + if (lu_object_exists(&obj->do_lu)) + RETURN(-EEXIST); + + rc = out_tx_create(tsi->tsi_env, obj, attr, fid, dof, + &tti->tti_tea, tti->tti_tea.ta_handle, + 
tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_attr_set(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_attr *attr = &tti->tti_attr; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct obdo *lobdo = &tti->tti_u.update.tti_obdo; + struct obdo *wobdo; + size_t size; + int rc; + + ENTRY; + + wobdo = object_update_param_get(update, 0, &size); + if (IS_ERR(wobdo) || size != sizeof(*wobdo)) { + CERROR("%s: empty obdo in the update: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(wobdo)); + RETURN(PTR_ERR(wobdo)); + } + + attr->la_valid = 0; + attr->la_valid = 0; + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_obdo(wobdo); + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(tsi->tsi_env, obj, attr, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_attr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct obdo *obdo = &tti->tti_u.update.tti_obdo; + struct lu_attr *la = &tti->tti_attr; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(*obdo))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) { + /* Usually, this will be called when the master MDT try + * to init a remote object(see osp_object_init), so if + * the object does not exist on slave, we need set BANSHEE flag, + * so the object can be removed from the cache immediately */ + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_attr_get(env, obj, la); + if (rc) + GOTO(out_unlock, rc); + + obdo->o_valid = 0; + obdo_from_la(obdo, la, la->la_valid); + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "%s: insert attr get reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, obdo, + sizeof(*obdo), idx, rc); + + RETURN(rc); +} + +static int out_xattr_get(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr get: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + update_result = object_update_result_get(reply, idx, NULL); + if (update_result == NULL) { + CERROR("%s: empty name for xattr get: rc = %d\n", + tgt_name(tsi->tsi_tgt), -EPROTO); + 
RETURN(-EPROTO); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + if (lbuf->lb_len == 0) + lbuf->lb_buf = NULL; + else + lbuf->lb_buf = update_result->our_data; + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_xattr_get(env, obj, lbuf, name); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, (int)lbuf->lb_len, rc); + + /* Since we directly use update_result->our_data as the lbuf->lb_buf, + * then use NULL for result_insert to avoid unnecessary memory copy. */ + object_update_result_insert(reply, NULL, lbuf->lb_len, idx, rc); + + RETURN(0); +} + +static int out_index_lookup(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + + ENTRY; + + if (unlikely(update->ou_result_size < sizeof(tti->tti_fid1))) + return -EPROTO; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for lookup: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + dt_read_lock(env, obj, MOR_TGT_CHILD); + if (!dt_try_as_dir(env, obj)) + GOTO(out_unlock, rc = -ENOTDIR); + + rc = dt_lookup(env, obj, (struct dt_rec *)&tti->tti_fid1, + (struct dt_key *)name); + + if (rc < 0) + GOTO(out_unlock, rc); + + if (rc == 0) + rc += 1; + +out_unlock: + dt_read_unlock(env, obj); + + CDEBUG(D_INFO, "lookup "DFID" %s get "DFID" rc %d\n", + PFID(lu_object_fid(&obj->do_lu)), name, + PFID(&tti->tti_fid1), rc); + + CDEBUG(D_INFO, "%s: insert lookup reply %p index %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), tti->tti_u.update.tti_update_reply, + 0, rc); + + object_update_result_insert(tti->tti_u.update.tti_update_reply, + &tti->tti_fid1, sizeof(tti->tti_fid1), + tti->tti_u.update.tti_update_reply_index, rc); + RETURN(rc); +} + +static int out_xattr_set(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *name; + char *buf; + __u32 *tmp; + size_t buf_len = 0; + int flag; + size_t size = 0; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + /* If buffer == NULL (-ENODATA), then it might mean delete xattr */ + buf = object_update_param_get(update, 1, &buf_len); + if (IS_ERR(buf) && PTR_ERR(buf) != -ENODATA) + RETURN(PTR_ERR(buf)); + + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 2, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: emptry or wrong size %zu flag: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + __swab32s(tmp); + flag = *tmp; + + rc = out_tx_xattr_set(tsi->tsi_env, obj, lbuf, name, flag, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + 
tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_xattr_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_xattr_del(tsi->tsi_env, obj, name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +/** + * increase ref of the object + **/ +static int out_ref_add(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + rc = out_tx_ref_add(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_ref_del(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_ref_del(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_index_insert(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct dt_insert_rec *rec = &tti->tti_rec; + struct lu_fid *fid; + char *name; + __u32 *ptype; + int rc = 0; + size_t size; + ENTRY; + + name = object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + fid = object_update_param_get(update, 1, &size); + if (IS_ERR(fid) || size != sizeof(*fid)) { + CERROR("%s: invalid fid: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(fid)); + RETURN(PTR_ERR(fid)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + lustre_swab_lu_fid(fid); + + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + ptype = object_update_param_get(update, 2, &size); + if (IS_ERR(ptype) || size != sizeof(*ptype)) { + CERROR("%s: invalid type for index insert: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(ptype)); + RETURN(PTR_ERR(ptype)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + __swab32s(ptype); + + rec->rec_fid = fid; + rec->rec_type = *ptype; + + rc = out_tx_index_insert(tsi->tsi_env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)name, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_index_delete(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + char *name; + int rc = 0; + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + name = 
object_update_param_get(update, 0, NULL); + if (IS_ERR(name)) { + CERROR("%s: empty name for index delete: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(name)); + RETURN(PTR_ERR(name)); + } + + rc = out_tx_index_delete(tsi->tsi_env, obj, (const struct dt_key *)name, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_destroy(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_fid *fid; + int rc; + ENTRY; + + fid = &update->ou_fid; + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(fid), -EPROTO); + RETURN(-EPROTO); + } + + if (!lu_object_exists(&obj->do_lu)) + RETURN(-ENOENT); + + rc = out_tx_destroy(tsi->tsi_env, obj, &tti->tti_tea, + tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + + RETURN(rc); +} + +static int out_write(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct lu_buf *lbuf = &tti->tti_buf; + char *buf; + __u64 *tmp; + size_t size = 0; + size_t buf_len = 0; + loff_t pos; + int rc; + ENTRY; + + buf = object_update_param_get(update, 0, &buf_len); + if (IS_ERR(buf) || buf_len == 0) { + CERROR("%s: empty buf for xattr set: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(buf)); + RETURN(PTR_ERR(buf)); + } + lbuf->lb_buf = buf; + lbuf->lb_len = buf_len; + + tmp = object_update_param_get(update, 1, &size); + if (IS_ERR(tmp) || size != sizeof(*tmp)) { + CERROR("%s: empty or wrong size %zu pos: rc = %ld\n", + tgt_name(tsi->tsi_tgt), size, PTR_ERR(tmp)); + RETURN(PTR_ERR(tmp)); + } + + if (ptlrpc_req_need_swab(tsi->tsi_pill->rc_req)) + __swab64s(tmp); + pos = *tmp; + + rc = out_tx_write(tsi->tsi_env, obj, lbuf, pos, + &tti->tti_tea, tti->tti_tea.ta_handle, + tti->tti_u.update.tti_update_reply, + tti->tti_u.update.tti_update_reply_index); + RETURN(rc); +} + +static int out_read(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct object_update *update = tti->tti_u.update.tti_update; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + int index = tti->tti_u.update.tti_update_reply_index; + struct lu_rdbuf *rdbuf; + struct object_update_result *update_result; + struct out_read_reply *orr; + void *tmp; + size_t size; + size_t total_size = 0; + __u64 pos; + unsigned int i; + unsigned int nbufs; + int rc = 0; + ENTRY; + + update_result = object_update_result_get(reply, index, NULL); + LASSERT(update_result != NULL); + update_result->our_datalen = sizeof(*orr); + + if (!lu_object_exists(&obj->do_lu)) + GOTO(out, rc = -ENOENT); + + tmp = object_update_param_get(update, 0, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty size for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); + GOTO(out, rc = PTR_ERR(tmp)); + } + size = le64_to_cpu(*(size_t *)(tmp)); + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) { + CERROR("%s: empty pos for read: rc = %ld\n", + tgt_name(tsi->tsi_tgt), PTR_ERR(tmp)); + GOTO(out, rc = PTR_ERR(tmp)); + } + pos = 
le64_to_cpu(*(__u64 *)(tmp)); + + /* Put the offset into the begining of the buffer in reply */ + orr = (struct out_read_reply *)update_result->our_data; + + nbufs = (size + OUT_BULK_BUFFER_SIZE - 1) / OUT_BULK_BUFFER_SIZE; + OBD_ALLOC(rdbuf, sizeof(struct lu_rdbuf) + + nbufs * sizeof(rdbuf->rb_bufs[0])); + if (rdbuf == NULL) + GOTO(out, rc = -ENOMEM); + + rdbuf->rb_nbufs = 0; + total_size = 0; + for (i = 0; i < nbufs; i++) { + __u32 read_size; + + read_size = size > OUT_BULK_BUFFER_SIZE ? + OUT_BULK_BUFFER_SIZE : size; + OBD_ALLOC(rdbuf->rb_bufs[i].lb_buf, read_size); + if (rdbuf->rb_bufs[i].lb_buf == NULL) + GOTO(out_free, rc = -ENOMEM); + + rdbuf->rb_bufs[i].lb_len = read_size; + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_read(env, obj, &rdbuf->rb_bufs[i], &pos); + dt_read_unlock(env, obj); + + total_size += rc < 0 ? 0 : rc; + if (rc <= 0) + break; + + rdbuf->rb_nbufs++; + size -= read_size; + } + + /* send pages to client */ + rc = tgt_send_buffer(tsi, rdbuf); + if (rc < 0) + GOTO(out_free, rc); + + orr->orr_size = total_size; + orr->orr_offset = pos; + + orr_cpu_to_le(orr, orr); + update_result->our_datalen += orr->orr_size; +out_free: + for (i = 0; i < nbufs; i++) { + if (rdbuf->rb_bufs[i].lb_buf != NULL) { + OBD_FREE(rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + } + } + OBD_FREE(rdbuf, sizeof(struct lu_rdbuf) + + nbufs * sizeof(rdbuf->rb_bufs[0])); +out: + /* Insert read buffer */ + update_result->our_rc = ptlrpc_status_hton(rc); + reply->ourp_lens[index] = cfs_size_round(update_result->our_datalen + + sizeof(*update_result)); + RETURN(rc); +} + +static int out_noop(struct tgt_session_info *tsi) +{ + return 0; +} + +#define DEF_OUT_HNDL(opc, name, flags, fn) \ +[opc - OUT_CREATE] = { \ + .th_name = name, \ + .th_fail_id = 0, \ + .th_opc = opc, \ + .th_flags = flags, \ + .th_act = fn, \ + .th_fmt = NULL, \ + .th_version = 0, \ +} + +static struct tgt_handler out_update_ops[] = { + DEF_OUT_HNDL(OUT_CREATE, "out_create", MUTABOR | HABEO_REFERO, + out_create), + DEF_OUT_HNDL(OUT_DESTROY, "out_create", MUTABOR | HABEO_REFERO, + out_destroy), + DEF_OUT_HNDL(OUT_REF_ADD, "out_ref_add", MUTABOR | HABEO_REFERO, + out_ref_add), + DEF_OUT_HNDL(OUT_REF_DEL, "out_ref_del", MUTABOR | HABEO_REFERO, + out_ref_del), + DEF_OUT_HNDL(OUT_ATTR_SET, "out_attr_set", MUTABOR | HABEO_REFERO, + out_attr_set), + DEF_OUT_HNDL(OUT_ATTR_GET, "out_attr_get", HABEO_REFERO, + out_attr_get), + DEF_OUT_HNDL(OUT_XATTR_SET, "out_xattr_set", MUTABOR | HABEO_REFERO, + out_xattr_set), + DEF_OUT_HNDL(OUT_XATTR_DEL, "out_xattr_del", MUTABOR | HABEO_REFERO, + out_xattr_del), + DEF_OUT_HNDL(OUT_XATTR_GET, "out_xattr_get", HABEO_REFERO, + out_xattr_get), + DEF_OUT_HNDL(OUT_INDEX_LOOKUP, "out_index_lookup", HABEO_REFERO, + out_index_lookup), + DEF_OUT_HNDL(OUT_INDEX_INSERT, "out_index_insert", + MUTABOR | HABEO_REFERO, out_index_insert), + DEF_OUT_HNDL(OUT_INDEX_DELETE, "out_index_delete", + MUTABOR | HABEO_REFERO, out_index_delete), + DEF_OUT_HNDL(OUT_WRITE, "out_write", MUTABOR | HABEO_REFERO, out_write), + DEF_OUT_HNDL(OUT_READ, "out_read", HABEO_REFERO, out_read), + DEF_OUT_HNDL(OUT_NOOP, "out_noop", HABEO_REFERO, out_noop), +}; + +static struct tgt_handler *out_handler_find(__u32 opc) +{ + struct tgt_handler *h; + + h = NULL; + if (OUT_CREATE <= opc && opc < OUT_LAST) { + h = &out_update_ops[opc - OUT_CREATE]; + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + } else { + h = NULL; /* unsupported opc */ + } + return h; +} + +static int out_tx_start(const struct lu_env *env, 
struct dt_device *dt, + struct thandle_exec_args *ta, struct obd_export *exp) +{ + ta->ta_argno = 0; + ta->ta_handle = dt_trans_create(env, dt); + if (IS_ERR(ta->ta_handle)) { + int rc; + + rc = PTR_ERR(ta->ta_handle); + ta->ta_handle = NULL; + CERROR("%s: start handle error: rc = %d\n", dt_obd_name(dt), + rc); + return rc; + } + if (exp->exp_need_sync) + ta->ta_handle->th_sync = 1; + + return 0; +} + +static int out_trans_start(const struct lu_env *env, + struct thandle_exec_args *ta) +{ + return dt_trans_start(env, ta->ta_handle->th_dev, ta->ta_handle); +} + +static int out_trans_stop(const struct lu_env *env, + struct thandle_exec_args *ta, int err) +{ + int i; + int rc; + + ta->ta_handle->th_result = err; + rc = dt_trans_stop(env, ta->ta_handle->th_dev, ta->ta_handle); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + ta->ta_handle = NULL; + ta->ta_argno = 0; + + return rc; +} + +static int out_tx_end(const struct lu_env *env, struct thandle_exec_args *ta, + int declare_ret) +{ + struct tgt_session_info *tsi = tgt_ses_info(env); + int i; + int rc; + int rc1; + ENTRY; + + if (ta->ta_handle == NULL) + RETURN(0); + + if (declare_ret != 0 || ta->ta_argno == 0) + GOTO(stop, rc = declare_ret); + + LASSERT(ta->ta_handle->th_dev != NULL); + rc = out_trans_start(env, ta); + if (unlikely(rc != 0)) + GOTO(stop, rc); + + for (i = 0; i < ta->ta_argno; i++) { + rc = ta->ta_args[i]->exec_fn(env, ta->ta_handle, + ta->ta_args[i]); + if (unlikely(rc != 0)) { + CDEBUG(D_INFO, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i >= 0) { + if (ta->ta_args[i]->undo_fn != NULL) + ta->ta_args[i]->undo_fn(env, + ta->ta_handle, + ta->ta_args[i]); + else + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + break; + } + CDEBUG(D_INFO, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), i, ta->ta_argno, rc); + } + + /* Only fail for real updates, XXX right now llog updates will be + * ignore, whose updates count is usually 1, so failover test + * case will spot this FAIL_UPDATE_NET_REP precisely, and it will + * be removed after async update patch is landed. */ + if (ta->ta_argno > 1) + tsi->tsi_reply_fail_id = OBD_FAIL_OUT_UPDATE_NET_REP; + +stop: + rc1 = out_trans_stop(env, ta, rc); + if (rc == 0) + rc = rc1; + + ta->ta_handle = NULL; + ta->ta_argno = 0; + + RETURN(rc); +} + +/** + * Object updates between Targets. Because all the updates has been + * dis-assemblied into object updates at sender side, so OUT will + * call OSD API directly to execute these updates. + * + * In DNE phase I all of the updates in the request need to be executed + * in one transaction, and the transaction has to be synchronously. + * + * Please refer to lustre/include/lustre/lustre_idl.h for req/reply + * format. 
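Part of handling such a request is sizing the reply: out_handle() below walks every update, adds up the per-update reply overhead, and rejects the RPC if the client-provided ouh_reply_size cannot hold it. A minimal sketch of that accounting, assuming the updates have already been byte-swapped; example_reply_size() is a hypothetical helper name:

/*
 * Sketch of the reply-size accounting out_handle() performs in-line:
 * one length slot, one object_update_result header and ou_result_size
 * bytes per update. example_reply_size() is hypothetical.
 */
static size_t example_reply_size(struct object_update_request *our)
{
	size_t size = sizeof(struct object_update_reply);
	unsigned int i;

	for (i = 0; i < our->ourq_count; i++) {
		struct object_update *update;

		update = object_update_request_get(our, i, NULL);
		if (update == NULL)
			return 0;	/* malformed request */

		size += sizeof(((struct object_update_reply *)0)->ourp_lens[0]);
		size += sizeof(struct object_update_result);
		size += update->ou_result_size;
	}

	return size;
}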
+ */ +int out_handle(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle_exec_args *ta = &tti->tti_tea; + struct req_capsule *pill = tsi->tsi_pill; + struct dt_device *dt = tsi->tsi_tgt->lut_bottom; + struct out_update_header *ouh; + struct out_update_buffer *oub = NULL; + struct object_update *update; + struct object_update_reply *reply; + struct ptlrpc_bulk_desc *desc = NULL; + struct l_wait_info lwi; + void **update_bufs; + int current_batchid = -1; + __u32 update_buf_count; + unsigned int i; + unsigned int reply_index = 0; + int rc = 0; + int rc1 = 0; + int ouh_size, reply_size; + int updates; + ENTRY; + + req_capsule_set(pill, &RQF_OUT_UPDATE); + ouh_size = req_capsule_get_size(pill, &RMF_OUT_UPDATE_HEADER, + RCL_CLIENT); + if (ouh_size <= 0) + RETURN(err_serious(-EPROTO)); + + ouh = req_capsule_client_get(pill, &RMF_OUT_UPDATE_HEADER); + if (ouh == NULL) + RETURN(err_serious(-EPROTO)); + + if (ouh->ouh_magic != OUT_UPDATE_HEADER_MAGIC) { + CERROR("%s: invalid update buffer magic %x expect %x: " + "rc = %d\n", tgt_name(tsi->tsi_tgt), ouh->ouh_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + RETURN(err_serious(-EPROTO)); + } + + update_buf_count = ouh->ouh_count; + if (update_buf_count == 0) + RETURN(err_serious(-EPROTO)); + + OBD_ALLOC(update_bufs, sizeof(*update_bufs) * update_buf_count); + if (update_bufs == NULL) + RETURN(err_serious(-ENOMEM)); + + if (ouh->ouh_inline_length > 0) { + update_bufs[0] = ouh->ouh_inline_data; + } else { + struct out_update_buffer *tmp; + + oub = req_capsule_client_get(pill, &RMF_OUT_UPDATE_BUF); + if (oub == NULL) + GOTO(out_free, rc = err_serious(-EPROTO)); + + desc = ptlrpc_prep_bulk_exp(pill->rc_req, update_buf_count, + PTLRPC_BULK_OPS_COUNT, + PTLRPC_BULK_GET_SINK | + PTLRPC_BULK_BUF_KVEC, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kvec_ops); + if (desc == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + tmp = oub; + for (i = 0; i < update_buf_count; i++, tmp++) { + if (tmp->oub_size >= OUT_MAXREQSIZE) + GOTO(out_free, rc = err_serious(-EPROTO)); + + OBD_ALLOC_LARGE(update_bufs[i], tmp->oub_size); + if (update_bufs[i] == NULL) + GOTO(out_free, rc = err_serious(-ENOMEM)); + + desc->bd_frag_ops->add_iov_frag(desc, update_bufs[i], + tmp->oub_size); + } + + pill->rc_req->rq_bulk_write = 1; + rc = sptlrpc_svc_prep_bulk(pill->rc_req, desc); + if (rc != 0) + GOTO(out_free, rc = err_serious(rc)); + + rc = target_bulk_io(pill->rc_req->rq_export, desc, &lwi); + if (rc < 0) + GOTO(out_free, rc = err_serious(rc)); + } + /* validate the request and calculate the total update count and + * set it to reply */ + reply_size = 0; + updates = 0; + for (i = 0; i < update_buf_count; i++) { + struct object_update_request *our; + int j; + + our = update_bufs[i]; + if (ptlrpc_req_need_swab(pill->rc_req)) + lustre_swab_object_update_request(our); + + if (our->ourq_magic != UPDATE_REQUEST_MAGIC) { + CERROR("%s: invalid update buffer magic %x" + " expect %x: rc = %d\n", + tgt_name(tsi->tsi_tgt), our->ourq_magic, + UPDATE_REQUEST_MAGIC, -EPROTO); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + updates += our->ourq_count; + + /* need to calculate reply size */ + for (j = 0; j < our->ourq_count; j++) { + update = object_update_request_get(our, j, NULL); + if (update == NULL) + GOTO(out, rc = err_serious(-EPROTO)); + if (ptlrpc_req_need_swab(pill->rc_req)) + lustre_swab_object_update(update); + + if (!fid_is_sane(&update->ou_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + 
tgt_name(tsi->tsi_tgt), + PFID(&update->ou_fid), -EPROTO); + GOTO(out, rc = err_serious(-EPROTO)); + } + + /* XXX: what ou_result_size can be considered safe? */ + + reply_size += sizeof(reply->ourp_lens[0]); + reply_size += sizeof(struct object_update_result); + reply_size += update->ou_result_size; + } + } + reply_size += sizeof(*reply); + + if (unlikely(reply_size > ouh->ouh_reply_size)) { + CERROR("%s: too small reply buf %u for %u, need %u at least\n", + tgt_name(tsi->tsi_tgt), ouh->ouh_reply_size, + updates, reply_size); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + req_capsule_set_size(pill, &RMF_OUT_UPDATE_REPLY, RCL_SERVER, + ouh->ouh_reply_size); + rc = req_capsule_server_pack(pill); + if (rc != 0) { + CERROR("%s: Can't pack response: rc = %d\n", + tgt_name(tsi->tsi_tgt), rc); + GOTO(out_free, rc = err_serious(-EPROTO)); + } + + /* Prepare the update reply buffer */ + reply = req_capsule_server_get(pill, &RMF_OUT_UPDATE_REPLY); + if (reply == NULL) + GOTO(out_free, rc = -EPROTO); + reply->ourp_magic = UPDATE_REPLY_MAGIC; + reply->ourp_count = updates; + tti->tti_u.update.tti_update_reply = reply; + tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); + + /* Walk through updates in the request to execute them */ + for (i = 0; i < update_buf_count; i++) { + struct tgt_handler *h; + struct dt_object *dt_obj; + int update_count; + struct object_update_request *our; + int j; + + our = update_bufs[i]; + update_count = our->ourq_count; + for (j = 0; j < update_count; j++) { + struct lu_object_conf conf; + + update = object_update_request_get(our, j, NULL); + if (update->ou_type == OUT_CREATE) + conf.loc_flags = LOC_F_NEW; + else + conf.loc_flags = 0; + + dt_obj = dt_locate_at(env, dt, &update->ou_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); + if (IS_ERR(dt_obj)) + GOTO(out, rc = PTR_ERR(dt_obj)); + + if (dt->dd_record_fid_accessed) { + struct lfsck_req_local *lrl = &tti->tti_lrl; + + lfsck_pack_rfa(lrl, + lu_object_fid(&dt_obj->do_lu), + LEL_FID_ACCESSED, + LFSCK_TYPE_LAYOUT); + tgt_lfsck_in_notify_local(env, dt, lrl, NULL); + } + + tti->tti_u.update.tti_dt_object = dt_obj; + tti->tti_u.update.tti_update = update; + tti->tti_u.update.tti_update_reply_index = reply_index; + + h = out_handler_find(update->ou_type); + if (unlikely(h == NULL)) { + CERROR("%s: unsupported opc: 0x%x\n", + tgt_name(tsi->tsi_tgt), update->ou_type); + GOTO(next, rc = -ENOTSUPP); + } + + /* Check resend case only for modifying RPC */ + if (h->th_flags & MUTABOR) { + struct ptlrpc_request *req = tgt_ses_req(tsi); + + if (out_check_resent(env, dt, dt_obj, req, + out_reconstruct, reply, + reply_index)) + GOTO(next, rc = 0); + + if (dt->dd_rdonly) + GOTO(next, rc = -EROFS); + } + + /* start transaction for modification RPC only */ + if (h->th_flags & MUTABOR && current_batchid == -1) { + current_batchid = update->ou_batchid; + rc = out_tx_start(env, dt, ta, tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 1; + } + + /* Stop the current update transaction, if the update + * has different batchid, or read-only update */ + if (((current_batchid != update->ou_batchid) || + !(h->th_flags & MUTABOR)) && + ta->ta_handle != NULL) { + rc = out_tx_end(env, ta, rc); + current_batchid = -1; + if (rc != 0) + GOTO(next, rc); + + /* start a new transaction if needed */ + if (h->th_flags & MUTABOR) { + rc = out_tx_start(env, dt, ta, + tsi->tsi_exp); + if (rc != 0) + GOTO(next, rc); + if (update->ou_flags & UPDATE_FL_SYNC) + ta->ta_handle->th_sync = 
1; + current_batchid = update->ou_batchid; + } + } + + rc = h->th_act(tsi); +next: + reply_index++; + dt_object_put(env, dt_obj); + if (rc < 0) + GOTO(out, rc); + } + } +out: + if (current_batchid != -1) { + rc1 = out_tx_end(env, ta, rc); + if (rc == 0) + rc = rc1; + } + +out_free: + if (update_bufs != NULL) { + if (oub != NULL) { + for (i = 0; i < update_buf_count; i++, oub++) { + if (update_bufs[i] != NULL) + OBD_FREE_LARGE(update_bufs[i], + oub->oub_size); + } + } + + OBD_FREE(update_bufs, sizeof(*update_bufs) * update_buf_count); + } + + if (desc != NULL) + ptlrpc_free_bulk(desc); + + RETURN(rc); +} + +struct tgt_handler tgt_out_handlers[] = { +TGT_UPDATE_HDL(MUTABOR, OUT_UPDATE, out_handle), +}; +EXPORT_SYMBOL(tgt_out_handlers); + diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c new file mode 100644 index 0000000000000..8e60dfff9995e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -0,0 +1,1267 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * lustre/target/out_lib.c + * + * Author: Di Wang + * Author: Fan, Yong + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +const char *update_op_str(__u16 opc) +{ + static const char *opc_str[] = { + [OUT_START] = "start", + [OUT_CREATE] = "create", + [OUT_DESTROY] = "destroy", + [OUT_REF_ADD] = "ref_add", + [OUT_REF_DEL] = "ref_del" , + [OUT_ATTR_SET] = "attr_set", + [OUT_ATTR_GET] = "attr_get", + [OUT_XATTR_SET] = "xattr_set", + [OUT_XATTR_GET] = "xattr_get", + [OUT_INDEX_LOOKUP] = "lookup", + [OUT_INDEX_INSERT] = "insert", + [OUT_INDEX_DELETE] = "delete", + [OUT_WRITE] = "write", + [OUT_XATTR_DEL] = "xattr_del", + [OUT_PUNCH] = "punch", + [OUT_READ] = "read", + [OUT_NOOP] = "noop", + }; + + if (opc < ARRAY_SIZE(opc_str) && opc_str[opc] != NULL) + return opc_str[opc]; + else + return "unknown"; +} +EXPORT_SYMBOL(update_op_str); + +/** + * Fill object update header + * + * Only fill the object update header, and parameters will be filled later + * in other functions. + * + * \params[in] env execution environment + * \params[in] update object update to be filled + * \params[in,out] max_update_size maximum object update size, if the + * current update length equals or + * exceeds the size, it will return -E2BIG. + * \params[in] update_op update type + * \params[in] fid object FID of the update + * \params[in] param_count the count of the update parameters + * \params[in] param_sizes the length of each parameters + * + * \retval 0 if packing succeeds. + * \retval -E2BIG if packing exceeds the maximum length. 
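The -E2BIG convention lets a caller size its buffer iteratively: on overflow the length that would have been required is written back through max_update_size, so the buffer can be regrown and the pack retried. A minimal caller-side sketch using OBD_ALLOC/OBD_FREE; example_pack_ref_add() and its retry policy are illustrative:

/*
 * Hypothetical caller showing the -E2BIG retry convention: the needed
 * length is written back through max_size, and since the check is ">="
 * the buffer must grow past it before retrying.
 */
static int example_pack_ref_add(const struct lu_env *env,
				const struct lu_fid *fid)
{
	struct object_update *update;
	size_t max_size = sizeof(*update);
	int rc;

	OBD_ALLOC(update, max_size);
	if (update == NULL)
		return -ENOMEM;

	rc = out_update_header_pack(env, update, &max_size, OUT_REF_ADD,
				    fid, 0, NULL, 0);
	if (rc == -E2BIG) {
		size_t need = max_size + 1;

		OBD_FREE(update, sizeof(*update));
		OBD_ALLOC(update, need);
		if (update == NULL)
			return -ENOMEM;

		max_size = need;
		rc = out_update_header_pack(env, update, &max_size,
					    OUT_REF_ADD, fid, 0, NULL, 0);
	}

	OBD_FREE(update, max_size);
	return rc;
}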
+ */ +int out_update_header_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, + enum update_type update_op, + const struct lu_fid *fid, + unsigned int param_count, + __u16 *param_sizes, + __u32 reply_size) +{ + struct object_update_param *param; + unsigned int i; + size_t update_size; + + if (((reply_size + 7) >> 3) >= 1ULL << 16) + return -EINVAL; + + /* Check whether the packing exceeding the maxima update length */ + update_size = sizeof(*update); + for (i = 0; i < param_count; i++) + update_size += cfs_size_round(sizeof(*param) + param_sizes[i]); + + if (unlikely(update_size >= *max_update_size)) { + *max_update_size = update_size; + return -E2BIG; + } + + update->ou_fid = *fid; + update->ou_type = update_op; + update->ou_params_count = param_count; + update->ou_result_size = reply_size; + param = &update->ou_params[0]; + for (i = 0; i < param_count; i++) { + param->oup_len = param_sizes[i]; + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } + + return 0; +} + +/** + * Packs one update into the update_buffer. + * + * \param[in] env execution environment + * \param[in] update update to be packed + * \param[in] max_update_size *maximum size of \a update + * \param[in] op update operation (enum update_type) + * \param[in] fid object FID for this update + * \param[in] param_count number of parameters for this update + * \param[in] param_sizes array of parameters length of this update + * \param[in] param_bufs parameter buffers + * + * \retval = 0 if updates packing succeeds + * \retval negative errno if updates packing fails + **/ +int out_update_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, enum update_type op, + const struct lu_fid *fid, unsigned int param_count, + __u16 *param_sizes, const void **param_bufs, + __u32 reply_size) +{ + struct object_update_param *param; + unsigned int i; + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, op, fid, + param_count, param_sizes, reply_size); + if (rc != 0) + RETURN(rc); + + param = &update->ou_params[0]; + for (i = 0; i < param_count; i++) { + memcpy(¶m->oup_buf[0], param_bufs[i], param_sizes[i]); + param = (struct object_update_param *)((char *)param + + object_update_param_size(param)); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_update_pack); + +/** + * Pack various updates into the update_buffer. + * + * The following functions pack different updates into the update_buffer + * So parameters of these API is basically same as its correspondent OSD/OSP + * API, for detail description of these parameters see osd_handler.c or + * osp_md_object.c. + * + * \param[in] env execution environment + * \param[in] ubuf update buffer + * \param[in] fid fid of this object for the update + * + * \retval 0 if insertion succeeds. + * \retval negative errno if insertion fails. 
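As a usage sketch of these packing helpers, the following hypothetical caller packs a single create update for a directory into a preallocated buffer; example_pack_mkdir() and its parameters are illustrative, and with no allocation hint only the obdo parameter is packed:

/*
 * Hypothetical caller packing one create update (a directory) into a
 * preallocated buffer of buf_size bytes. With a NULL allocation hint
 * only the obdo parameter is packed.
 */
static int example_pack_mkdir(const struct lu_env *env,
			      struct object_update *update, size_t buf_size,
			      const struct lu_fid *fid)
{
	struct lu_attr attr = {
		.la_valid = LA_TYPE | LA_MODE,
		.la_mode  = S_IFDIR | 0755,
	};
	struct dt_object_format dof = {
		.dof_type = dt_mode_to_dft(attr.la_mode),
	};
	size_t max = buf_size;

	return out_create_pack(env, update, &max, fid, &attr, NULL, &dof);
}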
+ */ +int out_create_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + struct obdo *obdo; + __u16 sizes[2] = {sizeof(*obdo), 0}; + int buf_count = 1; + const struct lu_fid *parent_fid = NULL; + int rc; + ENTRY; + + if (hint != NULL && hint->dah_parent) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + sizes[1] = sizeof(*parent_fid); + buf_count++; + } + + rc = out_update_header_pack(env, update, max_update_size, OUT_CREATE, + fid, buf_count, sizes, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + if (parent_fid != NULL) { + struct lu_fid *tmp; + + tmp = object_update_param_get(update, 1, NULL); + if (IS_ERR(tmp)) + RETURN(PTR_ERR(tmp)); + + fid_cpu_to_le(tmp, parent_fid); + } + + RETURN(0); +} +EXPORT_SYMBOL(out_create_pack); + +int out_ref_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_DEL, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_del_pack); + +int out_ref_add_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_REF_ADD, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_ref_add_pack); + +int out_attr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo; + __u16 size = sizeof(*obdo); + int rc; + ENTRY; + + rc = out_update_header_pack(env, update, max_update_size, + OUT_ATTR_SET, fid, 1, &size, 0); + if (rc != 0) + RETURN(rc); + + obdo = object_update_param_get(update, 0, NULL); + if (IS_ERR(obdo)) + RETURN(PTR_ERR(obdo)); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + + RETURN(0); +} +EXPORT_SYMBOL(out_attr_set_pack); + +int out_xattr_set_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, __u32 flag) +{ + __u16 sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {(char *)name, (char *)buf->lb_buf, + (char *)&flag}; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_SET, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_xattr_set_pack); + +int out_xattr_del_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name) +{ + __u16 size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_DEL, + fid, 1, &size, (const void **)&name, 0); +} +EXPORT_SYMBOL(out_xattr_del_pack); + +int out_index_insert_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_rec *rec, const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + __u16 sizes[3] = { strlen((char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { (char *)key, + (char *)&rec_fid, + (char *)&type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + 
return out_update_pack(env, update, max_update_size, OUT_INDEX_INSERT, + fid, ARRAY_SIZE(sizes), sizes, bufs, 0); +} +EXPORT_SYMBOL(out_index_insert_pack); + +int out_index_delete_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct dt_key *key) +{ + __u16 size = strlen((char *)key) + 1; + const void *buf = key; + + return out_update_pack(env, update, max_update_size, OUT_INDEX_DELETE, + fid, 1, &size, &buf, 0); +} +EXPORT_SYMBOL(out_index_delete_pack); + +int out_destroy_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_DESTROY, fid, + 0, NULL, NULL, 0); +} +EXPORT_SYMBOL(out_destroy_pack); + +int out_write_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const struct lu_buf *buf, __u64 pos) +{ + __u16 sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {(char *)buf->lb_buf, (char *)&pos}; + int rc; + + pos = cpu_to_le64(pos); + + rc = out_update_pack(env, update, max_update_size, OUT_WRITE, fid, + ARRAY_SIZE(sizes), sizes, bufs, 0); + return rc; +} +EXPORT_SYMBOL(out_write_pack); + +/** + * Pack various readonly updates into the update_buffer. + * + * The following update funcs are only used by read-only ops, lookup, + * getattr etc, so it does not need transaction here. Currently they + * are only used by OSP. + * + * \param[in] env execution environment + * \param[in] fid fid of this object for the update + * \param[in] ubuf update buffer + * + * \retval = 0 pack succeed. + * < 0 pack failed. + **/ +int out_index_lookup_pack(const struct lu_env *env, + struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + struct dt_rec *rec, const struct dt_key *key) +{ + const void *name = key; + __u16 size = strlen((char *)name) + 1; + + /* XXX: this shouldn't be hardcoded */ + return out_update_pack(env, update, max_update_size, OUT_INDEX_LOOKUP, + fid, 1, &size, &name, 256); +} +EXPORT_SYMBOL(out_index_lookup_pack); + +int out_attr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid) +{ + return out_update_pack(env, update, max_update_size, OUT_ATTR_GET, + fid, 0, NULL, NULL, sizeof(struct obdo)); +} +EXPORT_SYMBOL(out_attr_get_pack); + +int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const char *name, const int bufsize) +{ + __u16 size; + + LASSERT(name != NULL); + size = strlen(name) + 1; + + return out_update_pack(env, update, max_update_size, OUT_XATTR_GET, + fid, 1, &size, (const void **)&name, bufsize); +} +EXPORT_SYMBOL(out_xattr_get_pack); + +int out_read_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + size_t size, loff_t pos) +{ + __u16 sizes[2] = {sizeof(size), sizeof(pos)}; + const void *bufs[2] = {&size, &pos}; + + LASSERT(size > 0); + size = cpu_to_le64(size); + pos = cpu_to_le64(pos); + + return out_update_pack(env, update, max_update_size, OUT_READ, fid, + ARRAY_SIZE(sizes), sizes, bufs, size); +} +EXPORT_SYMBOL(out_read_pack); + +static int tx_extend_args(struct thandle_exec_args *ta, int new_alloc_ta) +{ + struct tx_arg **new_ta; + int i; + int rc = 0; + + if (ta->ta_alloc_args >= new_alloc_ta) + return 0; + + OBD_ALLOC(new_ta, sizeof(*new_ta) * new_alloc_ta); 
+ if (new_ta == NULL) + return -ENOMEM; + + for (i = 0; i < new_alloc_ta; i++) { + if (i < ta->ta_alloc_args) { + /* copy the old args to new one */ + new_ta[i] = ta->ta_args[i]; + } else { + OBD_ALLOC_PTR(new_ta[i]); + if (new_ta[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + } + + /* free the old args */ + if (ta->ta_args != NULL) + OBD_FREE(ta->ta_args, sizeof(ta->ta_args[0]) * + ta->ta_alloc_args); + + ta->ta_args = new_ta; + ta->ta_alloc_args = new_alloc_ta; +out: + if (rc != 0) { + for (i = 0; i < new_alloc_ta; i++) { + if (new_ta[i] != NULL) + OBD_FREE_PTR(new_ta[i]); + } + OBD_FREE(new_ta, sizeof(*new_ta) * new_alloc_ta); + } + return rc; +} + +#define TX_ALLOC_STEP 8 +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line) +{ + int rc; + int i; + + LASSERT(ta != NULL); + LASSERT(func != NULL); + + if (ta->ta_argno + 1 >= ta->ta_alloc_args) { + rc = tx_extend_args(ta, ta->ta_alloc_args + TX_ALLOC_STEP); + if (rc != 0) + return ERR_PTR(rc); + } + + i = ta->ta_argno; + + ta->ta_argno++; + + ta->ta_args[i]->exec_fn = func; + ta->ta_args[i]->undo_fn = undo; + ta->ta_args[i]->file = file; + ta->ta_args[i]->line = line; + + return ta->ta_args[i]; +} + +static int out_obj_destroy(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: destroy "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_destroy(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +/** + * All of the xxx_undo will be used once execution failed, + * But because all of the required resource has been reserved in + * declare phase, i.e. if declare succeed, it should make sure + * the following executing phase succeed in anyway, so these undo + * should be useless for most of the time in Phase I + */ +static int out_tx_create_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_destroy(env, arg->object, th); + if (rc != 0) + CERROR("%s: undo failure, we are doomed!: rc = %d\n", + dt_obd_name(th->th_dev), rc); + return rc; +} + +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: create "DFID": dof %u, mode %o\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), + arg->u.create.dof.dof_type, + arg->u.create.attr.la_mode & S_IFMT); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_create(env, dt_obj, &arg->u.create.attr, + &arg->u.create.hint, &arg->u.create.dof, th); + + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert create reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +/** + * Add create update to thandle + * + * Declare create updates and add the update to the thandle updates + * exec array. 
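For orientation, the *_add_exec helpers in this file all follow the same declare-then-queue pattern: reserve transaction credits first, then append an exec/undo callback pair via tx_add_exec() so updates run (and can be rolled back) in order. A minimal standalone sketch of that queueing pattern, with simplified types and hypothetical names rather than the Lustre API:

#include <errno.h>
#include <stdlib.h>

typedef int (*tx_cb_t)(void *arg);

struct tx_step {
	tx_cb_t exec;	/* runs when the transaction executes */
	tx_cb_t undo;	/* runs if a later step fails */
	void *arg;
};

struct tx_steps {
	struct tx_step *arr;
	int nr;
	int alloc;
};

/* Append one step, growing the array in TX_ALLOC_STEP-like increments. */
static int tx_step_add(struct tx_steps *t, tx_cb_t exec, tx_cb_t undo, void *arg)
{
	if (t->nr == t->alloc) {
		int alloc = t->alloc + 8;
		struct tx_step *arr = realloc(t->arr, alloc * sizeof(*arr));

		if (arr == NULL)
			return -ENOMEM;
		t->arr = arr;
		t->alloc = alloc;
	}
	t->arr[t->nr].exec = exec;
	t->arr[t->nr].undo = undo;
	t->arr[t->nr].arg = arg;
	t->nr++;
	return 0;
}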
+ * + * \param [in] env execution environment + * \param [in] obj object to be created + * \param [in] attr attributes of the creation + * \param [in] parent_fid the fid of the parent + * \param [in] dof dt object format of the creation + * \param [in] ta thandle execuation args where all of updates + * of the transaction are stored + * \param [in] th thandle for this update + * \param [in] reply reply of the updates + * \param [in] index index of the reply + * \param [in] file the file name where the function is called, + * which is only for debugging purpose. + * \param [in] line the line number where the funtion is called, + * which is only for debugging purpose. + * + * \retval 0 if updates is added successfully. + * \retval negative errno if update adding fails. + */ +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_create(env, obj, attr, NULL, dof, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_create_exec, out_tx_create_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + /* release the object in out_trans_stop */ + lu_object_get(&obj->do_lu); + arg->object = obj; + arg->u.create.attr = *attr; + if (parent_fid != NULL) + arg->u.create.fid = *parent_fid; + memset(&arg->u.create.hint, 0, sizeof(arg->u.create.hint)); + arg->u.create.dof = *dof; + arg->reply = reply; + arg->index = index; + + return 0; +} + +static int out_tx_attr_set_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + CERROR("%s: attr set undo "DFID" unimplemented yet!: rc = %d\n", + dt_obd_name(th->th_dev), + PFID(lu_object_fid(&arg->object->do_lu)), -ENOTSUPP); + + return -ENOTSUPP; +} + +static int out_tx_attr_set_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_OTHER, "%s: attr set "DFID"\n", dt_obd_name(th->th_dev), + PFID(lu_object_fid(&dt_obj->do_lu))); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_attr_set(env, dt_obj, &arg->u.attr_set.attr, th); + dt_write_unlock(env, dt_obj); + + CDEBUG(D_INFO, "%s: insert attr_set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, + arg->index, rc); + + return rc; +} + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, + struct thandle *th, struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_attr_set(env, dt_obj, attr, th); + if (rc != 0) + return rc; + + if (attr->la_valid & LA_FLAGS && + attr->la_flags & LUSTRE_SET_SYNC_FL) + th->th_sync |= 1; + + arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.attr_set.attr = *attr; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_write_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "write "DFID" pos %llu buf %p, len %lu\n", + 
PFID(lu_object_fid(&dt_obj->do_lu)), arg->u.write.pos, + arg->u.write.buf.lb_buf, (unsigned long)arg->u.write.buf.lb_len); + + if (OBD_FAIL_CHECK(OBD_FAIL_OUT_ENOSPC)) { + rc = -ENOSPC; + } else { + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_record_write(env, dt_obj, &arg->u.write.buf, + &arg->u.write.pos, th); + dt_write_unlock(env, dt_obj); + + if (rc == 0) + rc = arg->u.write.buf.lb_len; + } + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc > 0 ? 0 : rc; +} + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_record_write(env, dt_obj, buf, pos, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_write_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.write.buf = *buf; + arg->u.write.pos = pos; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_xattr_set_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + ENTRY; + + CDEBUG(D_INFO, "%s: set xattr buf %p name %s flag %d\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.buf.lb_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags); + + if (!lu_object_exists(&dt_obj->do_lu)) { + rc = -ENOENT; + } else { + struct linkea_data ldata = { 0 }; + bool linkea; + + ldata.ld_buf = &arg->u.xattr_set.buf; + if (strcmp(arg->u.xattr_set.name, XATTR_NAME_LINK) == 0) { + struct link_ea_header *leh; + + linkea = true; + rc = linkea_init(&ldata); + if (unlikely(rc)) + GOTO(out, rc == -ENODATA ? -EINVAL : rc); + + leh = ldata.ld_leh; + LASSERT(leh != NULL); + + /* If the new linkEA contains overflow timestamp, + * then two cases: + * + * 1. The old linkEA for the object has already + * overflowed before current setting, the new + * linkEA does not contains new link entry. So + * the linkEA overflow timestamp is unchanged. + * + * 2. There are new link entry in the new linkEA, + * so its overflow timestamp is differnt from + * the old one. Usually, the overstamp in the + * given linkEA is newer. But because of clock + * drift among MDTs, the timestamp may become + * older. So here, we convert the timestamp to + * the server local time. Then namespace LFSCK + * that uses local time can handle it easily. */ + if (unlikely(leh->leh_overflow_time)) { + struct lu_buf tbuf = { 0 }; + bool update = false; + + lu_buf_alloc(&tbuf, MAX_LINKEA_SIZE); + if (tbuf.lb_buf == NULL) + GOTO(unlock, rc = -ENOMEM); + + rc = dt_xattr_get(env, dt_obj, &tbuf, + XATTR_NAME_LINK); + if (rc > 0) { + struct linkea_data tdata = { 0 }; + + tdata.ld_buf = &tbuf; + rc = linkea_init(&tdata); + if (rc || leh->leh_overflow_time != + tdata.ld_leh->leh_overflow_time) + update = true; + } else { + /* Update the timestamp by force if + * fail to load the old linkEA. 
*/ + update = true; + } + + lu_buf_free(&tbuf); + if (update) { + leh->leh_overflow_time = + cfs_time_current_sec(); + if (unlikely(!leh->leh_overflow_time)) + leh->leh_overflow_time++; + } + } + } else { + linkea = false; + } + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + +again: + rc = dt_xattr_set(env, dt_obj, ldata.ld_buf, + arg->u.xattr_set.name, arg->u.xattr_set.flags, + th); + if (unlikely(rc == -ENOSPC && linkea)) { + rc = linkea_overflow_shrink(&ldata); + if (likely(rc > 0)) { + arg->u.xattr_set.buf.lb_len = rc; + goto again; + } + } + +unlock: + dt_write_unlock(env, dt_obj); + } + + GOTO(out, rc); + +out: + CDEBUG(D_INFO, "%s: insert xattr set reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_set(env, dt_obj, buf, name, flags, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_set_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->u.xattr_set.flags = flags; + arg->u.xattr_set.buf = *buf; + arg->reply = reply; + arg->index = index; + arg->u.xattr_set.csum = 0; + return 0; +} + +static int out_tx_xattr_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + CDEBUG(D_INFO, "%s: del xattr name '%s' on "DFID"\n", + dt_obd_name(th->th_dev), arg->u.xattr_set.name, + PFID(lu_object_fid(&dt_obj->do_lu))); + + if (!lu_object_exists(&dt_obj->do_lu)) + GOTO(out, rc = -ENOENT); + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_xattr_del(env, dt_obj, arg->u.xattr_set.name, + th); + dt_write_unlock(env, dt_obj); +out: + CDEBUG(D_INFO, "%s: insert xattr del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +int out_xattr_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_xattr_del(env, dt_obj, name, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_xattr_del_exec, NULL, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->u.xattr_set.name = name; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_ref_add(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + struct thandle *th) +{ + int rc; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_ref_del(env, dt_obj, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_ref_add_exec(const struct lu_env 
*env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_add(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_add reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_ref_add_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_del(env, arg->object, th); +} + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_add(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_add_exec, out_tx_ref_add_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_tx_ref_del_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_ref_del(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert ref_del reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, 0); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_ref_del_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + return out_obj_ref_add(env, arg->object, th); +} + +int out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_ref_del(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_ref_del_exec, out_tx_ref_del_undo, file, + line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} + +static int out_obj_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index insert "DFID" name: %s fid "DFID", type %u\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key, PFID(((struct dt_insert_rec *)rec)->rec_fid), + ((struct dt_insert_rec *)rec)->rec_type); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_insert(env, dt_obj, rec, key, th, 0); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_obj_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle *th) +{ + int rc; + + CDEBUG(D_INFO, "%s: index delete "DFID" name: %s\n", + dt_obd_name(th->th_dev), PFID(lu_object_fid(&dt_obj->do_lu)), + (char *)key); + + if (dt_try_as_dir(env, dt_obj) == 0) + return -ENOTDIR; + + dt_write_lock(env, dt_obj, MOR_TGT_CHILD); + rc = dt_delete(env, dt_obj, key, th); + dt_write_unlock(env, dt_obj); + + return rc; +} + +static int out_tx_index_insert_exec(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + if 
(unlikely(!dt_object_exists(dt_obj))) + RETURN(-ESTALE); + + rc = out_obj_index_insert(env, dt_obj, + (const struct dt_rec *)&arg->u.insert.rec, + arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: insert idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + return rc; +} + +static int out_tx_index_insert_undo(const struct lu_env *env, + struct thandle *th, struct tx_arg *arg) +{ + return out_obj_index_delete(env, arg->object, arg->u.insert.key, th); +} + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + rc = dt_declare_insert(env, dt_obj, rec, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_insert_exec, + out_tx_index_insert_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.rec = *(const struct dt_insert_rec *)rec; + arg->u.insert.key = key; + + return 0; +} + +static int out_tx_index_delete_exec(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + int rc; + + rc = out_obj_index_delete(env, arg->object, arg->u.insert.key, th); + + CDEBUG(D_INFO, "%s: delete idx insert reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + return rc; +} + +static int out_tx_index_delete_undo(const struct lu_env *env, + struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: Oops, can not rollback index_delete yet: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + if (dt_try_as_dir(env, dt_obj) == 0) { + rc = -ENOTDIR; + return rc; + } + + LASSERT(ta->ta_handle != NULL); + rc = dt_declare_delete(env, dt_obj, key, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_index_delete_exec, + out_tx_index_delete_undo, file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + arg->u.insert.key = key; + return 0; +} + +static int out_tx_destroy_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + struct dt_object *dt_obj = arg->object; + int rc; + + rc = out_obj_destroy(env, dt_obj, th); + + CDEBUG(D_INFO, "%s: insert destroy reply %p index %d: rc = %d\n", + dt_obd_name(th->th_dev), arg->reply, arg->index, rc); + + if (arg->reply != NULL) + object_update_result_insert(arg->reply, NULL, 0, arg->index, + rc); + + RETURN(rc); +} + +static int out_tx_destroy_undo(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg) +{ + CERROR("%s: not support destroy undo yet!: rc = %d\n", + dt_obd_name(th->th_dev), -ENOTSUPP); + return -ENOTSUPP; +} + +int out_destroy_add_exec(const struct lu_env 
*env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line) +{ + struct tx_arg *arg; + int rc; + + rc = dt_declare_destroy(env, dt_obj, th); + if (rc != 0) + return rc; + + arg = tx_add_exec(ta, out_tx_destroy_exec, out_tx_destroy_undo, + file, line); + if (IS_ERR(arg)) + return PTR_ERR(arg); + + lu_object_get(&dt_obj->do_lu); + arg->object = dt_obj; + arg->reply = reply; + arg->index = index; + return 0; +} diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c new file mode 100644 index 0000000000000..78876235dcfd7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -0,0 +1,1507 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * lustre/target/tgt_grant.c + * + * This file provides code related to grant space management on Lustre Targets + * (OSTs and MDTs). Grant is a mechanism used by client nodes to reserve disk + * space on a target for the data writeback cache. The Lustre client is thus + * assured that enough space will be available when flushing dirty pages + * asynchronously. Each client node is granted an initial amount of reserved + * space at connect time and gets additional space back from target in bulk + * write reply. + * + * We actually support three different cases: + * - The client supports the new grant parameters (i.e. OBD_CONNECT_GRANT_PARAM) + * which means that all grant overhead calculation happens on the client side. + * The server reports at connect time the backend filesystem block size, the + * maximum extent size as well as the extent insertion cost and it is then up + * to the osc layer to the track dirty extents and consume grant accordingly + * (see osc_cache.c). In each bulk write request, the client provides how much + * grant space was consumed for this RPC. + * - The client does not support OBD_CONNECT_GRANT_PARAM and always assumes a + * a backend file system block size of 4KB. We then have two cases: + * - If the block size is really 4KB, then the client can deal with grant + * allocation for partial block writes, but won't take extent insertion cost + * into account. For such clients, we inflate grant by 100% on the server + * side. It means that when 32MB of grant is hold by the client, 64MB of + * grant space is actually reserved on the server. All grant counters + * provided by such a client are inflated by 100%. + * - The backend filesystem block size is bigger than 4KB, which isn't + * supported by the client. 
In this case, we emulate a 4KB block size and + * consume one block size on the server for each 4KB of grant returned to + * client. With a 128KB blocksize, it means that 32MB dirty pages of 4KB + * on the client will actually consume 1GB of grant on the server. + * All grant counters provided by such a client are inflated by the block + * size ratio. + * + * This file handles the core logic for: + * - grant allocation strategy + * - maintaining per-client as well as global grant space accounting + * - processing grant information packed in incoming requests + * - allocating server-side grant space for synchronous write RPCs which did not + * consume grant on the client side (OBD_BRW_FROM_GRANT flag not set). If not + * enough space is available, such RPCs fail with ENOSPC + * + * Author: Johann Lombardi + */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include + +#include "tgt_internal.h" + +/* Clients typically hold 2x their max_rpcs_in_flight of grant space */ +#define TGT_GRANT_SHRINK_LIMIT(exp) (2ULL * 8 * exp_max_brw_size(exp)) + +/* Helpers to inflate/deflate grants for clients that do not support the grant + * parameters */ +static inline u64 tgt_grant_inflate(struct tg_grants_data *tgd, u64 val) +{ + if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + /* Client does not support such large block size, grant + * is thus inflated. We already significantly overestimate + * overhead, no need to add the extent tax in this case */ + return val << (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT); + return val; +} + +/* Companion of tgt_grant_inflate() */ +static inline u64 tgt_grant_deflate(struct tg_grants_data *tgd, u64 val) +{ + if (tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + return val >> (tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT); + return val; +} + +/* Grant chunk is used as a unit for grant allocation. It should be inflated + * if the client does not support the grant paramaters. + * Check connection flag against \a data if not NULL. 
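For clients that cannot express the real backend block size, the inflate/deflate helpers above are just opposite shifts by the difference between the backend block bits and the 4KB compatibility shift. A standalone sketch of that arithmetic, with hypothetical names and constants rather than the kernel code:

#include <stdint.h>

#define COMPAT_BSIZE_SHIFT 12	/* clients assume 4KB blocks */

/* e.g. a 128KB-block backend has blockbits = 17, so grant is inflated 32x */
static uint64_t grant_inflate(unsigned int blockbits, uint64_t val)
{
	if (blockbits > COMPAT_BSIZE_SHIFT)
		return val << (blockbits - COMPAT_BSIZE_SHIFT);
	return val;
}

static uint64_t grant_deflate(unsigned int blockbits, uint64_t val)
{
	if (blockbits > COMPAT_BSIZE_SHIFT)
		return val >> (blockbits - COMPAT_BSIZE_SHIFT);
	return val;
}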
This is used during + * connection creation where exp->exp_connect_data isn't populated yet */ +static inline u64 tgt_grant_chunk(struct obd_export *exp, + struct lu_target *lut, + struct obd_connect_data *data) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 chunk = exp_max_brw_size(exp); + u64 tax; + + if (exp->exp_obd->obd_self_export == exp) + /* Grant enough space to handle a big precreate request */ + return OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + + if ((data == NULL && !(exp_grant_param_supp(exp))) || + (data != NULL && !OCD_HAS_FLAG(data, GRANT_PARAM))) + /* Try to grant enough space to send 2 full-size RPCs */ + return tgt_grant_inflate(tgd, chunk) << 1; + + /* Try to return enough to send two full-size RPCs + * = 2 * (BRW_size + #extents_in_BRW * grant_tax) */ + tax = 1ULL << tgd->tgd_blockbits; /* block size */ + tax *= lut->lut_dt_conf.ddp_max_extent_blks; /* max extent size */ + tax = (chunk + tax - 1) / tax; /* #extents in a RPC */ + tax *= lut->lut_dt_conf.ddp_extent_tax; /* extent tax for a RPC */ + chunk = (chunk + tax) * 2; /* we said two full RPCs */ + return chunk; +} + +static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, + u64 *pending, u64 *granted, u64 maxsize) +{ + struct tg_export_data *ted = &exp->exp_target_data; + int level = D_CACHE; + + if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) + level = D_ERROR; + CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + + if (ted->ted_grant + ted->ted_pending > maxsize) { + CERROR("%s: cli %s/%p ted_grant(%ld) + ted_pending(%ld)" + " > maxsize(%llu)\n", exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, exp, ted->ted_grant, + ted->ted_pending, maxsize); + return -EFAULT; + } + if (ted->ted_dirty > maxsize) { + CERROR("%s: cli %s/%p ted_dirty(%ld) > maxsize(%llu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, + exp, ted->ted_dirty, maxsize); + return -EFAULT; + } + *granted += ted->ted_grant + ted->ted_pending; + *pending += ted->ted_pending; + *dirty += ted->ted_dirty; + return 0; +} + +/** + * Perform extra sanity checks for grant accounting. + * + * This function scans the export list, sanity checks per-export grant counters + * and verifies accuracy of global grant accounting. If an inconsistency is + * found, a CERROR is printed with the function name \func that was passed as + * argument. LBUG is only called in case of serious counter corruption (i.e. + * value larger than the device size). + * Those sanity checks can be pretty expensive and are disabled if the OBD + * device has more than 100 connected exports. + * + * \param[in] obd OBD device for which grant accounting should be + * verified + * \param[in] func caller's function name + */ +void tgt_grant_sanity_check(struct obd_device *obd, const char *func) +{ + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_export *exp; + struct tg_export_data *ted; + u64 maxsize; + u64 tot_dirty = 0; + u64 tot_pending = 0; + u64 tot_granted = 0; + u64 fo_tot_granted; + u64 fo_tot_pending; + u64 fo_tot_dirty; + int error; + + if (list_empty(&obd->obd_exports)) + return; + + /* We don't want to do this for large machines that do lots of + * mounts or unmounts. It burns... 
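The chunk computed in tgt_grant_chunk() above is roughly "two full-size RPCs plus their extent-insertion overhead". A worked standalone example of that calculation, using hypothetical numbers in place of the OSD's ddp_* parameters:

#include <stdint.h>

/* Grant chunk = 2 * (max BRW size + extent tax for one such RPC). */
static uint64_t grant_chunk(uint64_t max_brw, unsigned int blockbits,
			    uint64_t max_extent_blks, uint64_t extent_tax)
{
	uint64_t max_ext = (1ULL << blockbits) * max_extent_blks;
	uint64_t nr_ext  = (max_brw + max_ext - 1) / max_ext;

	return (max_brw + nr_ext * extent_tax) * 2;
}

/* e.g. grant_chunk(4MB, 12, 32768, 24576) == 2 * (4MB + 1 * 24KB) */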
*/ + if (obd->obd_num_exports > 100) + return; + + maxsize = tgd->tgd_osfs.os_blocks << tgd->tgd_blockbits; + + spin_lock(&obd->obd_dev_lock); + spin_lock(&tgd->tgd_grant_lock); + exp = obd->obd_self_export; + ted = &exp->exp_target_data; + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + tot_granted += ted->ted_grant + ted->ted_pending; + tot_pending += ted->ted_pending; + tot_dirty += ted->ted_dirty; + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + /* exports about to be unlinked should also be taken into account since + * they might still hold pending grant space to be released at + * commit time */ + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) { + error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, + &tot_granted, maxsize); + if (error < 0) { + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + } + + fo_tot_granted = tgd->tgd_tot_granted; + fo_tot_pending = tgd->tgd_tot_pending; + fo_tot_dirty = tgd->tgd_tot_dirty; + spin_unlock(&obd->obd_dev_lock); + spin_unlock(&tgd->tgd_grant_lock); + + if (tot_granted != fo_tot_granted) + CERROR("%s: tot_granted %llu != fo_tot_granted %llu\n", + func, tot_granted, fo_tot_granted); + if (tot_pending != fo_tot_pending) + CERROR("%s: tot_pending %llu != fo_tot_pending %llu\n", + func, tot_pending, fo_tot_pending); + if (tot_dirty != fo_tot_dirty) + CERROR("%s: tot_dirty %llu != fo_tot_dirty %llu\n", + func, tot_dirty, fo_tot_dirty); + if (tot_pending > tot_granted) + CERROR("%s: tot_pending %llu > tot_granted %llu\n", + func, tot_pending, tot_granted); + if (tot_granted > maxsize) + CERROR("%s: tot_granted %llu > maxsize %llu\n", + func, tot_granted, maxsize); + if (tot_dirty > maxsize) + CERROR("%s: tot_dirty %llu > maxsize %llu\n", + func, tot_dirty, maxsize); +} +EXPORT_SYMBOL(tgt_grant_sanity_check); + +/** + * Get file system statistics of target. + * + * Helper function for statfs(), also used by grant code. + * Implements caching for statistics to avoid calling OSD device each time. + * + * \param[in] env execution environment + * \param[in] lut LU target + * \param[out] osfs statistic data to return + * \param[in] max_age maximum age for cached data + * \param[in] from_cache show that data was get from cache or not + * + * \retval 0 if successful + * \retval negative value on error + */ +int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, + struct obd_statfs *osfs, __u64 max_age, int *from_cache) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + int rc = 0; + ENTRY; + + spin_lock(&tgd->tgd_osfs_lock); + if (cfs_time_before_64(tgd->tgd_osfs_age, max_age) || max_age == 0) { + u64 unstable; + + /* statfs data are too old, get up-to-date one. + * we must be cautious here since multiple threads might be + * willing to update statfs data concurrently and we must + * grant that cached statfs data are always consistent */ + + if (tgd->tgd_statfs_inflight == 0) + /* clear inflight counter if no users, although it would + * take a while to overflow this 64-bit counter ... 
*/ + tgd->tgd_osfs_inflight = 0; + /* notify tgt_grant_commit() that we want to track writes + * completed as of now */ + tgd->tgd_statfs_inflight++; + /* record value of inflight counter before running statfs to + * compute the diff once statfs is completed */ + unstable = tgd->tgd_osfs_inflight; + spin_unlock(&tgd->tgd_osfs_lock); + + /* statfs can sleep ... hopefully not for too long since we can + * call it fairly often as space fills up */ + rc = dt_statfs(env, lut->lut_bottom, osfs); + if (unlikely(rc)) + GOTO(out, rc); + + spin_lock(&tgd->tgd_grant_lock); + spin_lock(&tgd->tgd_osfs_lock); + /* calculate how much space was written while we released the + * tgd_osfs_lock */ + unstable = tgd->tgd_osfs_inflight - unstable; + tgd->tgd_osfs_unstable = 0; + if (unstable) { + /* some writes committed while we were running statfs + * w/o the tgd_osfs_lock. Those ones got added to + * the cached statfs data that we are about to crunch. + * Take them into account in the new statfs data */ + osfs->os_bavail -= min_t(u64, osfs->os_bavail, + unstable >> tgd->tgd_blockbits); + /* However, we don't really know if those writes got + * accounted in the statfs call, so tell + * tgt_grant_space_left() there is some uncertainty + * on the accounting of those writes. + * The purpose is to prevent spurious error messages in + * tgt_grant_space_left() since those writes might be + * accounted twice. */ + tgd->tgd_osfs_unstable += unstable; + } + /* similarly, there is some uncertainty on write requests + * between prepare & commit */ + tgd->tgd_osfs_unstable += tgd->tgd_tot_pending; + spin_unlock(&tgd->tgd_grant_lock); + + /* finally udpate cached statfs data */ + tgd->tgd_osfs = *osfs; + tgd->tgd_osfs_age = cfs_time_current_64(); + + tgd->tgd_statfs_inflight--; /* stop tracking */ + if (tgd->tgd_statfs_inflight == 0) + tgd->tgd_osfs_inflight = 0; + spin_unlock(&tgd->tgd_osfs_lock); + + if (from_cache) + *from_cache = 0; + } else { + /* use cached statfs data */ + *osfs = tgd->tgd_osfs; + spin_unlock(&tgd->tgd_osfs_lock); + if (from_cache) + *from_cache = 1; + } + GOTO(out, rc); + +out: + return rc; +} +EXPORT_SYMBOL(tgt_statfs_internal); + +/** + * Update cached statfs information from the OSD layer + * + * Refresh statfs information cached in tgd::tgd_osfs if the cache is older + * than 1s or if force is set. The OSD layer is in charge of estimating data & + * metadata overhead. + * This function can sleep so it should not be called with any spinlock held. 
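The caching in tgt_statfs_internal() boils down to: serve the last statfs result unless it was collected before the acceptable window (or a zero max_age forces a refresh). A simplified standalone sketch of that decision, with hypothetical types and no locking or in-flight tracking:

#include <stdint.h>
#include <string.h>

struct statfs_cache {
	uint64_t collected_at;	/* time the cached data was gathered */
	char data[64];		/* stand-in for struct obd_statfs */
};

/* Return 1 if the cached copy is fresh enough, 0 if the OSD must be asked. */
static int statfs_from_cache(const struct statfs_cache *c, uint64_t oldest_ok,
			     char *out, size_t len)
{
	if (oldest_ok == 0 || c->collected_at < oldest_ok)
		return 0;	/* refresh forced, or data too old */
	memcpy(out, c->data, len < sizeof(c->data) ? len : sizeof(c->data));
	return 1;
}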
+ * + * \param[in] env LU environment passed by the caller + * \param[in] exp export used to print client info in debug + * messages + * \param[in] force force a refresh of statfs information + * \param[out] from_cache returns whether the statfs information are + * taken from cache + */ +static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, + int force, int *from_cache) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tgt_thread_info *tti; + struct obd_statfs *osfs; + __u64 max_age; + int rc; + + if (force) + max_age = 0; /* get fresh statfs data */ + else + max_age = cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS); + + tti = tgt_th_info(env); + osfs = &tti->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, max_age, from_cache); + if (unlikely(rc)) { + if (from_cache) + *from_cache = 0; + return; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p free: %llu avail: %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + osfs->os_bfree << tgd->tgd_blockbits, + osfs->os_bavail << tgd->tgd_blockbits); +} + +/** + * Figure out how much space is available on the backend filesystem after + * removing grant space already booked by clients. + * + * This is done by accessing cached statfs data previously populated by + * tgt_grant_statfs(), from which we withdraw the space already granted to + * clients and the reserved space. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export associated with the device for which the amount + * of available space is requested + * \retval amount of non-allocated space, in bytes + */ +static u64 tgt_grant_space_left(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 tot_granted; + u64 left; + u64 avail; + u64 unstable; + + ENTRY; + assert_spin_locked(&tgd->tgd_grant_lock); + + spin_lock(&tgd->tgd_osfs_lock); + /* get available space from cached statfs data */ + left = tgd->tgd_osfs.os_bavail << tgd->tgd_blockbits; + unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ + spin_unlock(&tgd->tgd_osfs_lock); + + tot_granted = tgd->tgd_tot_granted; + + if (left < tot_granted) { + int mask = (left + unstable < + tot_granted - tgd->tgd_tot_pending) ? + D_ERROR : D_CACHE; + + CDEBUG_LIMIT(mask, "%s: cli %s/%p left %llu < tot_grant " + "%llu unstable %llu pending %llu " + "dirty %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + left, tot_granted, unstable, + tgd->tgd_tot_pending, + tgd->tgd_tot_dirty); + RETURN(0); + } + + avail = left; + /* Withdraw space already granted to clients */ + left -= tot_granted; + + /* Align left on block size */ + left &= ~((1ULL << tgd->tgd_blockbits) - 1); + + CDEBUG(D_CACHE, "%s: cli %s/%p avail %llu left %llu unstable " + "%llu tot_grant %llu pending %llu\n", obd->obd_name, + exp->exp_client_uuid.uuid, exp, avail, left, unstable, + tot_granted, tgd->tgd_tot_pending); + + RETURN(left); +} + +/** + * Process grant information from obdo structure packed in incoming BRW + * and inflate grant counters if required. + * + * Grab the dirty and seen grant announcements from the incoming obdo and + * inflate all grant counters passed in the request if the client does not + * support the grant parameters. + * We will later calculate the client's new grant and return it. + * Caller must hold tgd_grant_lock spinlock. 
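The "space left" figure used throughout is simply the cached available bytes minus everything already granted, rounded down to a whole backend block. A standalone sketch of the tgt_grant_space_left() arithmetic, with hypothetical parameter names:

#include <stdint.h>

/* Free space still available for new grants, block-aligned; 0 if overcommitted. */
static uint64_t grant_space_left(uint64_t os_bavail, unsigned int blockbits,
				 uint64_t tot_granted)
{
	uint64_t left = os_bavail << blockbits;

	if (left < tot_granted)
		return 0;
	left -= tot_granted;
	return left & ~((1ULL << blockbits) - 1);	/* align on block size */
}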
+ * + * \param[in] env LU environment supplying osfs storage + * \param[in] exp export for which we received the request + * \param[in,out] oa incoming obdo sent by the client + */ +static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, long chunk) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + long dirty; + long dropped; + ENTRY; + + assert_spin_locked(&tgd->tgd_grant_lock); + + if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) != + (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) { + oa->o_valid &= ~OBD_MD_FLGRANT; + RETURN_EXIT; + } + + /* Add some margin, since there is a small race if other RPCs arrive + * out-or-order and have already consumed some grant. We want to + * leave this here in case there is a large error in accounting. */ + CDEBUG(D_CACHE, + "%s: cli %s/%p reports grant %llu dropped %u, local %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant, + oa->o_dropped, ted->ted_grant); + + if ((long long)oa->o_dirty < 0) + oa->o_dirty = 0; + + /* inflate grant counters if required */ + if (!exp_grant_param_supp(exp)) { + oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant); + oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty); + oa->o_dropped = tgt_grant_inflate(tgd, (u64)oa->o_dropped); + oa->o_undirty = tgt_grant_inflate(tgd, oa->o_undirty); + } + + dirty = oa->o_dirty; + dropped = oa->o_dropped; + + /* Update our accounting now so that statfs takes it into account. + * Note that ted_dirty is only approximate and can become incorrect + * if RPCs arrive out-of-order. No important calculations depend + * on ted_dirty however, but we must check sanity to not assert. */ + if (dirty > ted->ted_grant + 4 * chunk) + dirty = ted->ted_grant + 4 * chunk; + tgd->tgd_tot_dirty += dirty - ted->ted_dirty; + if (ted->ted_grant < dropped) { + CDEBUG(D_CACHE, + "%s: cli %s/%p reports %lu dropped > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, + ted->ted_grant); + dropped = 0; + } + if (tgd->tgd_tot_granted < dropped) { + CERROR("%s: cli %s/%p reports %lu dropped > tot_grant %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + dropped, tgd->tgd_tot_granted); + dropped = 0; + } + tgd->tgd_tot_granted -= dropped; + ted->ted_grant -= dropped; + ted->ted_dirty = dirty; + + if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) { + CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + EXIT; +} + +/** + * Grant shrink request handler. + * + * Client nodes can explicitly release grant space (i.e. process called grant + * shrinking). This function proceeds with the shrink request when there is + * less ungranted space remaining than the amount all of the connected clients + * would consume if they used their full grant. + * Caller must hold tgd_grant_lock spinlock. 
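The sanity clamping in tgt_grant_incoming() can be summarized as: never believe a dirty figure larger than the client's grant plus a few chunks, and never release more "dropped" grant than is actually on record. A condensed standalone sketch, with a hypothetical struct and no locking or inflation:

#include <stdint.h>

struct client_grant {
	int64_t grant;		/* space currently granted to the client */
	int64_t dirty;		/* dirty space the client reports */
};

static void grant_incoming(struct client_grant *c, uint64_t *tot_granted,
			   int64_t reported_dirty, int64_t reported_dropped,
			   long chunk)
{
	/* cap the reported dirty value: it is only approximate */
	if (reported_dirty < 0)
		reported_dirty = 0;
	if (reported_dirty > c->grant + 4 * chunk)
		reported_dirty = c->grant + 4 * chunk;

	/* never release more grant than is actually accounted */
	if (reported_dropped < 0 || reported_dropped > c->grant ||
	    (uint64_t)reported_dropped > *tot_granted)
		reported_dropped = 0;

	*tot_granted -= reported_dropped;
	c->grant -= reported_dropped;
	c->dirty = reported_dirty;
}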
+ * + * \param[in] exp export releasing grant space + * \param[in,out] oa incoming obdo sent by the client + * \param[in] left_space remaining free space with space already granted + * taken out + */ +static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa, + u64 left_space) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + long grant_shrink; + + assert_spin_locked(&tgd->tgd_grant_lock); + LASSERT(exp); + if (left_space >= tgd->tgd_tot_granted_clients * + TGT_GRANT_SHRINK_LIMIT(exp)) + return; + + grant_shrink = oa->o_grant; + + ted->ted_grant -= grant_shrink; + tgd->tgd_tot_granted -= grant_shrink; + + CDEBUG(D_CACHE, "%s: cli %s/%p shrink %ld ted_grant %ld total %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, grant_shrink, + ted->ted_grant, tgd->tgd_tot_granted); + + /* client has just released some grant, don't grant any space back */ + oa->o_grant = 0; +} + +/** + * Calculate how much space is required to write a given network buffer + * + * This function takes block alignment into account to estimate how much on-disk + * space will be required to successfully write the whole niobuf. + * Estimated space is inflated if the export does not support + * OBD_CONNECT_GRANT_PARAM and if the backend filesystem has a block size + * larger than the minimal supported page size (i.e. 4KB). + * + * \param[in] exp export associated which the write request + * if NULL, then size estimate is done for server-side + * grant allocation. + * \param[in] lut LU target handling the request + * \param[in] rnb network buffer to estimate size of + * + * \retval space (in bytes) that will be consumed to write the + * network buffer + */ +static inline u64 tgt_grant_rnb_size(struct obd_export *exp, + struct lu_target *lut, + struct niobuf_remote *rnb) +{ + struct tg_grants_data *tgd = &lut->lut_tgd; + u64 blksize; + u64 bytes; + u64 end; + + if (exp && !exp_grant_param_supp(exp) && + tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) + blksize = 1ULL << COMPAT_BSIZE_SHIFT; + else + blksize = 1ULL << tgd->tgd_blockbits; + + /* The network buffer might span several blocks, align it on block + * boundaries */ + bytes = rnb->rnb_offset & (blksize - 1); + bytes += rnb->rnb_len; + end = bytes & (blksize - 1); + if (end) + bytes += blksize - end; + + if (exp == NULL || exp_grant_param_supp(exp)) { + /* add per-extent insertion cost */ + u64 max_ext; + int nr_ext; + + max_ext = blksize * lut->lut_dt_conf.ddp_max_extent_blks; + nr_ext = (bytes + max_ext - 1) / max_ext; + bytes += nr_ext * lut->lut_dt_conf.ddp_extent_tax; + } else { + /* Inflate grant space if client does not support extent-based + * grant allocation */ + bytes = tgt_grant_inflate(tgd, (u64)bytes); + } + + return bytes; +} + +/** + * Validate grant accounting for each incoming remote network buffer. + * + * When clients have dirtied as much space as they've been granted they + * fall through to sync writes. These sync writes haven't been expressed + * in grants and need to error with ENOSPC when there isn't room in the + * filesystem for them after grants are taken into account. However, + * writeback of the dirty data that was already granted space can write + * right on through. + * The OBD_BRW_GRANTED flag will be set in the rnb_flags of each network + * buffer which has been granted enough space to proceed. Buffers without + * this flag will fail to be written with -ENOSPC (see tgt_preprw_write(). 
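Estimating the on-disk cost of one network buffer, as tgt_grant_rnb_size() does, is mostly block-boundary rounding plus a per-extent insertion tax. A standalone sketch of that estimate, with hypothetical parameters in place of the export and OSD descriptors:

#include <stdint.h>

/* Worst-case bytes consumed on disk by a write of 'len' bytes at 'offset'. */
static uint64_t rnb_disk_cost(uint64_t offset, uint64_t len,
			      unsigned int blockbits,
			      uint64_t max_extent_blks, uint64_t extent_tax)
{
	uint64_t blksize = 1ULL << blockbits;
	uint64_t bytes = (offset & (blksize - 1)) + len;
	uint64_t max_ext, nr_ext;

	if (bytes & (blksize - 1))			/* round up to a block */
		bytes += blksize - (bytes & (blksize - 1));

	max_ext = blksize * max_extent_blks;		/* max extent size */
	nr_ext = (bytes + max_ext - 1) / max_ext;	/* extents needed */
	return bytes + nr_ext * extent_tax;
}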
+ * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] env LU environment passed by the caller + * \param[in] exp export identifying the client which sent the RPC + * \param[in] oa incoming obdo in which we should return the pack the + * additional grant + * \param[in,out] rnb the list of network buffers + * \param[in] niocount the number of network buffers in the list + * \param[in] left the remaining free space with space already granted + * taken out + */ +static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct niobuf_remote *rnb, + int niocount, u64 *left) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct obd_device *obd = exp->exp_obd; + struct lu_target *lut = obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + unsigned long ungranted = 0; + unsigned long granted = 0; + int i; + bool skip = false; + + ENTRY; + + assert_spin_locked(&tgd->tgd_grant_lock); + + if (obd->obd_recovering) { + /* Replaying write. Grant info have been processed already so no + * need to do any enforcement here. It is worth noting that only + * bulk writes with all rnbs having OBD_BRW_FROM_GRANT can be + * replayed. If one page hasn't OBD_BRW_FROM_GRANT set, then + * the whole bulk is written synchronously */ + skip = true; + CDEBUG(D_CACHE, "Replaying write, skipping accounting\n"); + } else if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_RECOV_RESEND)) { + /* Recoverable resend, grant info have already been processed as + * well */ + skip = true; + CDEBUG(D_CACHE, "Recoverable resend arrived, skipping " + "accounting\n"); + } else if (exp_grant_param_supp(exp) && oa->o_grant_used > 0) { + /* Client supports the new grant parameters and is telling us + * how much grant space it consumed for this bulk write. + * Although all rnbs are supposed to have the OBD_BRW_FROM_GRANT + * flag set, we will scan the rnb list and looks for non-cache + * I/O in case it changes in the future */ + if (ted->ted_grant >= oa->o_grant_used) { + /* skip grant accounting for rnbs with + * OBD_BRW_FROM_GRANT and just used grant consumption + * claimed in the request */ + granted = oa->o_grant_used; + skip = true; + } else { + /* client has used more grants for this request that + * it owns ... */ + CERROR("%s: cli %s claims %lu GRANT, real grant %lu\n", + exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, + (unsigned long)oa->o_grant_used, ted->ted_grant); + + /* check whether we can fill the gap with unallocated + * grant */ + if (*left > (oa->o_grant_used - ted->ted_grant)) { + /* ouf .. we are safe for now */ + granted = ted->ted_grant; + ungranted = oa->o_grant_used - granted; + *left -= ungranted; + skip = true; + } + /* too bad, but we cannot afford to blow up our grant + * accounting. The loop below will handle each rnb in + * case by case. 
*/ + } + } + + for (i = 0; i < niocount; i++) { + int bytes; + + if ((rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) { + if (skip) { + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + /* compute how much grant space is actually needed for + * this rnb, inflate grant if required */ + bytes = tgt_grant_rnb_size(exp, lut, &rnb[i]); + if (ted->ted_grant >= granted + bytes) { + granted += bytes; + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d GRANT, " + "real grant %lu idx %d\n", obd->obd_name, + exp->exp_client_uuid.uuid, exp, granted, bytes, + ted->ted_grant, i); + } + + if (obd->obd_recovering) + CERROR("%s: cli %s is replaying OST_WRITE while one rnb" + " hasn't OBD_BRW_FROM_GRANT set (0x%x)\n", + obd->obd_name, exp->exp_client_uuid.uuid, + rnb[i].rnb_flags); + + /* Consume grant space on the server. + * Unlike above, tgt_grant_rnb_size() is called with exp = NULL + * so that the required grant space isn't inflated. This is + * done on purpose since the server can deal with large block + * size, unlike some clients */ + bytes = tgt_grant_rnb_size(NULL, lut, &rnb[i]); + if (*left > bytes) { + /* if enough space, pretend it was granted */ + ungranted += bytes; + *left -= bytes; + rnb[i].rnb_flags |= OBD_BRW_GRANTED; + continue; + } + + /* We can't check for already-mapped blocks here (make sense + * when backend filesystem does not use COW) as it requires + * dropping the grant lock. + * Instead, we clear OBD_BRW_GRANTED and in that case we need + * to go through and verify if all of the blocks not marked + * BRW_GRANTED are already mapped and we can ignore this error. + */ + rnb[i].rnb_flags &= ~OBD_BRW_GRANTED; + CDEBUG(D_CACHE, "%s: cli %s/%p idx %d no space for %d\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, i, bytes); + } + + /* record in o_grant_used the actual space reserved for the I/O, will be + * used later in tgt_grant_commmit() */ + oa->o_grant_used = granted + ungranted; + + /* record space used for the I/O, will be used in tgt_grant_commmit() */ + /* Now substract what the clients has used already. We don't subtract + * this from the tot_granted yet, so that other client's can't grab + * that space before we have actually allocated our blocks. That + * happens in tgt_grant_commit() after the writes are done. */ + ted->ted_grant -= granted; + ted->ted_pending += oa->o_grant_used; + tgd->tgd_tot_granted += ungranted; + tgd->tgd_tot_pending += oa->o_grant_used; + + CDEBUG(D_CACHE, + "%s: cli %s/%p granted: %lu ungranted: %lu grant: %lu dirty: %lu" + "\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, + granted, ungranted, ted->ted_grant, ted->ted_dirty); + + if (obd->obd_recovering || (oa->o_valid & OBD_MD_FLGRANT) == 0) + /* don't update dirty accounting during recovery or + * if grant information got discarded (e.g. 
during resend) */ + RETURN_EXIT; + + if (ted->ted_dirty < granted) { + CWARN("%s: cli %s/%p claims granted %lu > ted_dirty %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + granted, ted->ted_dirty); + granted = ted->ted_dirty; + } + tgd->tgd_tot_dirty -= granted; + ted->ted_dirty -= granted; + + if (ted->ted_dirty < 0 || ted->ted_grant < 0 || ted->ted_pending < 0) { + CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_dirty, ted->ted_pending, ted->ted_grant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + EXIT; +} + +/** + * Allocate additional grant space to a client + * + * Calculate how much grant space to return to client, based on how much space + * is currently free and how much of that is already granted. + * Caller must hold tgd_grant_lock spinlock. + * + * \param[in] exp export of the client which sent the request + * \param[in] curgrant current grant claimed by the client + * \param[in] want how much grant space the client would like to + * have + * \param[in] left remaining free space with granted space taken + * out + * \param[in] conservative if set to true, the server should be cautious + * and limit how much space is granted back to the + * client. Otherwise, the server should try hard to + * satisfy the client request. + * + * \retval amount of grant space allocated + */ +static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, + u64 want, u64 left, long chunk, + bool conservative) +{ + struct obd_device *obd = exp->exp_obd; + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 grant; + + ENTRY; + + /* When tgd_grant_compat_disable is set, we don't grant any space to + * clients not supporting OBD_CONNECT_GRANT_PARAM. + * Otherwise, space granted to such a client is inflated since it + * consumes PAGE_SIZE of grant space per block */ + if ((obd->obd_self_export != exp && !exp_grant_param_supp(exp) && + tgd->tgd_grant_compat_disable) || left == 0 || exp->exp_failed) + RETURN(0); + + if (want > 0x7fffffff) { + CERROR("%s: client %s/%p requesting > 2GB grant %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, want); + RETURN(0); + } + + /* Grant some fraction of the client's requested grant space so that + * they are not always waiting for write credits (not all of it to + * avoid overgranting in face of multiple RPCs in flight). This + * essentially will be able to control the OSC_MAX_RIF for a client. + * + * If we do have a large disparity between what the client thinks it + * has and what we think it has, don't grant very much and let the + * client consume its grant first. Either it just has lots of RPCs + * in flight, or it was evicted and its grants will soon be used up. 
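Putting the allocation policy of tgt_grant_alloc() into one small function: grant the client's shortfall, but in the conservative case never more than 1/8th of the remaining free space and never more than one chunk per reply. A standalone sketch with hypothetical names:

#include <stdint.h>

static uint64_t grant_alloc(uint64_t curgrant, uint64_t want, uint64_t left,
			    uint64_t chunk, unsigned int blockbits,
			    int conservative)
{
	uint64_t blksize = 1ULL << blockbits;
	uint64_t grant;

	if (left == 0 || curgrant >= want)
		return 0;
	if (conservative)
		left >>= 3;			/* at most 1/8th of free space */
	grant = want - curgrant;
	if (grant > left)
		grant = left;
	grant = (grant + blksize - 1) & ~(blksize - 1);	/* round up to a block */
	if (conservative && grant > chunk)
		grant = chunk;			/* one chunk per reply */
	return grant;
}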
*/ + if (curgrant >= want || curgrant >= ted->ted_grant + chunk) + RETURN(0); + + if (obd->obd_recovering) + conservative = false; + + if (conservative) + /* don't grant more than 1/8th of the remaining free space in + * one chunk */ + left >>= 3; + grant = min(want - curgrant, left); + /* round grant up to the next block size */ + grant = (grant + (1 << tgd->tgd_blockbits) - 1) & + ~((1ULL << tgd->tgd_blockbits) - 1); + + if (!grant) + RETURN(0); + + /* Limit to grant_chunk if not reconnect/recovery */ + if ((grant > chunk) && conservative) + grant = chunk; + + tgd->tgd_tot_granted += grant; + ted->ted_grant += grant; + + if (ted->ted_grant < 0) { + CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + ted->ted_grant, want, curgrant); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + + CDEBUG(D_CACHE, + "%s: cli %s/%p wants: %llu current grant %llu" + " granting: %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, + exp, want, curgrant, grant); + CDEBUG(D_CACHE, + "%s: cli %s/%p tot cached:%llu granted:%llu" + " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid, + exp, tgd->tgd_tot_dirty, tgd->tgd_tot_granted, + obd->obd_num_exports); + + RETURN(grant); +} + +/** + * Handle grant space allocation on client connection & reconnection. + * + * A new non-readonly connection gets an initial grant allocation equals to + * tgt_grant_chunk() (i.e. twice the max BRW size in most of the cases). + * On reconnection, grant counters between client & target are resynchronized + * and additional space might be granted back if possible. + * + * \param[in] env LU environment provided by the caller + * \param[in] exp client's export which is (re)connecting + * \param[in,out] data obd_connect_data structure sent by the client in the + * connect request + * \param[in] new_conn must set to true if this is a new connection and false + * for a reconnection + */ +void tgt_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 left = 0; + u64 want; + long chunk; + int from_cache; + int force = 0; /* can use cached data */ + + /* don't grant space to client with read-only access */ + if (OCD_HAS_FLAG(data, RDONLY) || + (!OCD_HAS_FLAG(data, GRANT_PARAM) && + tgd->tgd_grant_compat_disable)) { + data->ocd_grant = 0; + data->ocd_connect_flags &= ~(OBD_CONNECT_GRANT | + OBD_CONNECT_GRANT_PARAM); + RETURN_EXIT; + } + + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + want = data->ocd_grant; + else + want = tgt_grant_inflate(tgd, data->ocd_grant); + chunk = tgt_grant_chunk(exp, lut, data); +refresh: + tgt_grant_statfs(env, exp, force, &from_cache); + + spin_lock(&tgd->tgd_grant_lock); + + /* Grab free space from cached info and take out space already granted + * to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* get fresh statfs data if we are short in ungranted space */ + if (from_cache && left < 32 * chunk) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_CACHE, "fs has no space left and statfs too old\n"); + force = 1; + goto refresh; + } + + tgt_grant_alloc(exp, (u64)ted->ted_grant, want, left, chunk, new_conn); + + /* return to client its current grant */ + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + data->ocd_grant = ted->ted_grant; + else + /* deflate grant */ + data->ocd_grant = tgt_grant_deflate(tgd, 
(u64)ted->ted_grant);
+
+ /* reset dirty accounting */
+ tgd->tgd_tot_dirty -= ted->ted_dirty;
+ ted->ted_dirty = 0;
+
+ if (new_conn && OCD_HAS_FLAG(data, GRANT))
+ tgd->tgd_tot_granted_clients++;
+
+ spin_unlock(&tgd->tgd_grant_lock);
+
+ CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: %llu left: %llu\n",
+ exp->exp_obd->obd_name, exp->exp_client_uuid.uuid,
+ exp, data->ocd_grant, want, left);
+
+ EXIT;
+}
+EXPORT_SYMBOL(tgt_grant_connect);
+
+/**
+ * Release all grant space attached to a given export.
+ *
+ * Remove a client from the grant accounting totals. We also remove
+ * the export from the obd device under the osfs and dev locks to ensure
+ * that the tgt_grant_sanity_check() calculations are always valid.
+ * The client should do something similar when it invalidates its import.
+ *
+ * \param[in] exp client's export to remove from grant accounting
+ */
+void tgt_grant_discard(struct obd_export *exp)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+ struct tg_export_data *ted = &exp->exp_target_data;
+
+ spin_lock(&tgd->tgd_grant_lock);
+ LASSERTF(tgd->tgd_tot_granted >= ted->ted_grant,
+ "%s: tot_granted %llu cli %s/%p ted_grant %ld\n",
+ obd->obd_name, tgd->tgd_tot_granted,
+ exp->exp_client_uuid.uuid, exp, ted->ted_grant);
+ tgd->tgd_tot_granted -= ted->ted_grant;
+ ted->ted_grant = 0;
+ LASSERTF(tgd->tgd_tot_pending >= ted->ted_pending,
+ "%s: tot_pending %llu cli %s/%p ted_pending %ld\n",
+ obd->obd_name, tgd->tgd_tot_pending,
+ exp->exp_client_uuid.uuid, exp, ted->ted_pending);
+ /* tgd_tot_pending is handled in tgt_grant_commit as bulk
+ * commits */
+ LASSERTF(tgd->tgd_tot_dirty >= ted->ted_dirty,
+ "%s: tot_dirty %llu cli %s/%p ted_dirty %ld\n",
+ obd->obd_name, tgd->tgd_tot_dirty,
+ exp->exp_client_uuid.uuid, exp, ted->ted_dirty);
+ tgd->tgd_tot_dirty -= ted->ted_dirty;
+ ted->ted_dirty = 0;
+ spin_unlock(&tgd->tgd_grant_lock);
+}
+EXPORT_SYMBOL(tgt_grant_discard);
+
+/**
+ * Process grant information from incoming bulk read request.
+ *
+ * Extract grant information packed in obdo structure (OBD_MD_FLGRANT set in
+ * o_valid). Bulk reads usually come with grant announcements (number of dirty
+ * blocks, remaining amount of grant space, ...) and could also include a grant
+ * shrink request. Unlike bulk write, no additional grant space is returned on
+ * bulk read request.
+ *
+ * \param[in] env is the lu environment provided by the caller
+ * \param[in] exp is the export of the client which sent the request
+ * \param[in,out] oa is the incoming obdo sent by the client
+ */
+void tgt_grant_prepare_read(const struct lu_env *env,
+ struct obd_export *exp, struct obdo *oa)
+{
+ struct lu_target *lut = exp->exp_obd->u.obt.obt_lut;
+ struct tg_grants_data *tgd = &lut->lut_tgd;
+ int do_shrink;
+ u64 left = 0;
+
+ ENTRY;
+
+ if (!oa)
+ RETURN_EXIT;
+
+ if ((oa->o_valid & OBD_MD_FLGRANT) == 0)
+ /* The read request does not contain any grant
+ * information */
+ RETURN_EXIT;
+
+ if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+ (oa->o_flags & OBD_FL_SHRINK_GRANT)) {
+ /* To process grant shrink request, we need to know how much
+ * available space remains on the backend filesystem.
+ * Shrink requests are not so common, we always get fresh
+ * statfs information. */
+ tgt_grant_statfs(env, exp, 1, NULL);
+
+ /* protect all grant counters */
+ spin_lock(&tgd->tgd_grant_lock);
+
+ /* Grab free space from cached statfs data and take out space
+ * already granted to clients as well as reserved space */
+ left = tgt_grant_space_left(exp);
+
+ /* all set now to proceed with shrinking */
+ do_shrink = 1;
+ } else {
+ /* no grant shrinking request packed in the obdo and
+ * since we don't grant space back on reads, no point
+ * in running statfs, so just skip it and process
+ * incoming grant data directly. */
+ spin_lock(&tgd->tgd_grant_lock);
+ do_shrink = 0;
+ }
+
+ /* extract incoming grant information provided by the client and
+ * inflate grant counters if required */
+ tgt_grant_incoming(env, exp, oa, tgt_grant_chunk(exp, lut, NULL));
+
+ /* unlike writes, we don't return grants back on reads unless a grant
+ * shrink request was packed and we decided to turn it down. */
+ if (do_shrink)
+ tgt_grant_shrink(exp, oa, left);
+ else
+ oa->o_grant = 0;
+
+ if (!exp_grant_param_supp(exp))
+ oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant);
+ spin_unlock(&tgd->tgd_grant_lock);
+ EXIT;
+}
+EXPORT_SYMBOL(tgt_grant_prepare_read);
+
+/**
+ * Process grant information from incoming bulk write request.
+ *
+ * This function extracts client's grant announcements from incoming bulk write
+ * request and attempts to allocate grant space for network buffers that need it
+ * (i.e. OBD_BRW_FROM_GRANT not set in rnb_flags).
+ * Network buffers which aren't granted the OBD_BRW_GRANTED flag should not
+ * proceed further and should fail with -ENOSPC.
+ * Whenever possible, additional grant space will be returned to the client
+ * in the bulk write reply.
+ * tgt_grant_prepare_write() must be called before writing any buffers to
+ * the backend storage. This function works in pair with tgt_grant_commit()
+ * which must be invoked once all buffers have been written to disk in order
+ * to release space from the pending grant counter.
+ *
+ * \param[in] env LU environment provided by the caller
+ * \param[in] exp export of the client which sent the request
+ * \param[in] oa incoming obdo sent by the client
+ * \param[in] rnb list of network buffers
+ * \param[in] niocount number of network buffers in the list
+ */
+void tgt_grant_prepare_write(const struct lu_env *env,
+ struct obd_export *exp, struct obdo *oa,
+ struct niobuf_remote *rnb, int niocount)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lu_target *lut = obd->u.obt.obt_lut;
+ struct tg_grants_data *tgd = &lut->lut_tgd;
+ u64 left;
+ int from_cache;
+ int force = 0; /* can use cached data initially */
+ long chunk = tgt_grant_chunk(exp, lut, NULL);
+
+ ENTRY;
+
+refresh:
+ /* get statfs information from OSD layer */
+ tgt_grant_statfs(env, exp, force, &from_cache);
+
+ spin_lock(&tgd->tgd_grant_lock); /* protect all grant counters */
+
+ /* Grab free space from cached statfs data and take out space already
+ * granted to clients as well as reserved space */
+ left = tgt_grant_space_left(exp);
+
+ /* Get fresh statfs data if we are short in ungranted space */
+ if (from_cache && left < 32 * chunk) {
+ spin_unlock(&tgd->tgd_grant_lock);
+ CDEBUG(D_CACHE, "%s: fs has no space left and statfs too old\n",
+ obd->obd_name);
+ force = 1;
+ goto refresh;
+ }
+
+ /* When close to free space exhaustion, trigger a sync to force
+ * writeback cache to consume required space immediately and release as
+ * much space as possible.
*/ + if (!obd->obd_recovering && force != 2 && left < chunk) { + bool from_grant = true; + int i; + + /* That said, it is worth running a sync only if some pages did + * not consume grant space on the client and could thus fail + * with ENOSPC later in tgt_grant_check() */ + for (i = 0; i < niocount; i++) + if (!(rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) + from_grant = false; + + if (!from_grant) { + /* at least one network buffer requires acquiring grant + * space on the server */ + spin_unlock(&tgd->tgd_grant_lock); + /* discard errors, at least we tried ... */ + dt_sync(env, lut->lut_bottom); + force = 2; + goto refresh; + } + } + + /* extract incoming grant information provided by the client, + * and inflate grant counters if required */ + tgt_grant_incoming(env, exp, oa, chunk); + + /* check limit */ + tgt_grant_check(env, exp, oa, rnb, niocount, &left); + + if (!(oa->o_valid & OBD_MD_FLGRANT)) { + spin_unlock(&tgd->tgd_grant_lock); + RETURN_EXIT; + } + + /* if OBD_FL_SHRINK_GRANT is set, the client is willing to release some + * grant space. */ + if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_SHRINK_GRANT)) + tgt_grant_shrink(exp, oa, left); + else + /* grant more space back to the client if possible */ + oa->o_grant = tgt_grant_alloc(exp, oa->o_grant, oa->o_undirty, + left, chunk, true); + + if (!exp_grant_param_supp(exp)) + oa->o_grant = tgt_grant_deflate(tgd, oa->o_grant); + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_prepare_write); + +/** + * Consume grant space reserved for object creation. + * + * Grant space is allocated to the local self export for object precreation. + * This is required to prevent object precreation from consuming grant space + * allocated to client nodes for the data writeback cache. + * This function consumes enough space to create \a nr objects and allocates + * more grant space to the self export for future precreation requests, if + * possible. 
+ * + * \param[in] env LU environment provided by the caller + * \param[in] exp export holding the grant space for precreation (= self + * export currently) + * \param[in] nr number of objects to be created + * + * \retval >= 0 amount of grant space allocated to the precreate request + * \retval -ENOSPC on failure + */ +long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr) +{ + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct tg_export_data *ted = &exp->exp_target_data; + u64 left = 0; + unsigned long wanted; + unsigned long granted; + ENTRY; + + if (exp->exp_obd->obd_recovering || + lut->lut_dt_conf.ddp_inodespace == 0) + /* don't enforce grant during recovery */ + RETURN(0); + + /* Update statfs data if required */ + tgt_grant_statfs(env, exp, 1, NULL); + + /* protect all grant counters */ + spin_lock(&tgd->tgd_grant_lock); + + /* fail precreate request if there is not enough blocks available for + * writing */ + if (tgd->tgd_osfs.os_bavail - (ted->ted_grant >> tgd->tgd_blockbits) < + (tgd->tgd_osfs.os_blocks >> 10)) { + spin_unlock(&tgd->tgd_grant_lock); + CDEBUG(D_RPCTRACE, "%s: not enough space for create %llu\n", + exp->exp_obd->obd_name, + tgd->tgd_osfs.os_bavail * tgd->tgd_osfs.os_blocks); + RETURN(-ENOSPC); + } + + /* Grab free space from cached statfs data and take out space + * already granted to clients as well as reserved space */ + left = tgt_grant_space_left(exp); + + /* compute how much space is required to handle the precreation + * request */ + wanted = *nr * lut->lut_dt_conf.ddp_inodespace; + if (wanted > ted->ted_grant + left) { + /* that's beyond what remains, adjust the number of objects that + * can be safely precreated */ + wanted = ted->ted_grant + left; + *nr = wanted / lut->lut_dt_conf.ddp_inodespace; + if (*nr == 0) { + /* we really have no space any more for precreation, + * fail the precreate request with ENOSPC */ + spin_unlock(&tgd->tgd_grant_lock); + RETURN(-ENOSPC); + } + /* compute space needed for the new number of creations */ + wanted = *nr * lut->lut_dt_conf.ddp_inodespace; + } + LASSERT(wanted <= ted->ted_grant + left); + + if (wanted <= ted->ted_grant) { + /* we've enough grant space to handle this precreate request */ + ted->ted_grant -= wanted; + } else { + /* we need to take some space from the ungranted pool */ + tgd->tgd_tot_granted += wanted - ted->ted_grant; + left -= wanted - ted->ted_grant; + ted->ted_grant = 0; + } + granted = wanted; + ted->ted_pending += granted; + tgd->tgd_tot_pending += granted; + + /* grant more space for precreate purpose if possible. */ + wanted = OST_MAX_PRECREATE * lut->lut_dt_conf.ddp_inodespace / 2; + if (wanted > ted->ted_grant) { + long chunk; + + /* always try to book enough space to handle a large precreate + * request */ + chunk = tgt_grant_chunk(exp, lut, NULL); + wanted -= ted->ted_grant; + tgt_grant_alloc(exp, ted->ted_grant, wanted, left, chunk, + false); + } + spin_unlock(&tgd->tgd_grant_lock); + RETURN(granted); +} +EXPORT_SYMBOL(tgt_grant_create); + +/** + * Release grant space added to the pending counter by tgt_grant_prepare_write() + * + * Update pending grant counter once buffers have been written to the disk. 
+ * + * \param[in] exp export of the client which sent the request + * \param[in] pending amount of reserved space to be released + * \param[in] rc return code of pre-commit operations + */ +void tgt_grant_commit(struct obd_export *exp, unsigned long pending, + int rc) +{ + struct tg_grants_data *tgd = &exp->exp_obd->u.obt.obt_lut->lut_tgd; + + ENTRY; + + /* get space accounted in tot_pending for the I/O, set in + * tgt_grant_check() */ + if (pending == 0) + RETURN_EXIT; + + spin_lock(&tgd->tgd_grant_lock); + /* Don't update statfs data for errors raised before commit (e.g. + * bulk transfer failed, ...) since we know those writes have not been + * processed. For other errors hit during commit, we cannot really tell + * whether or not something was written, so we update statfs data. + * In any case, this should not be fatal since we always get fresh + * statfs data before failing a request with ENOSPC */ + if (rc == 0) { + spin_lock(&tgd->tgd_osfs_lock); + /* Take pending out of cached statfs data */ + tgd->tgd_osfs.os_bavail -= min_t(u64, + tgd->tgd_osfs.os_bavail, + pending >> tgd->tgd_blockbits); + if (tgd->tgd_statfs_inflight) + /* someone is running statfs and want to be notified of + * writes happening meanwhile */ + tgd->tgd_osfs_inflight += pending; + spin_unlock(&tgd->tgd_osfs_lock); + } + + if (exp->exp_target_data.ted_pending < pending) { + CERROR("%s: cli %s/%p ted_pending(%lu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + exp->exp_target_data.ted_pending, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + exp->exp_target_data.ted_pending -= pending; + + if (tgd->tgd_tot_granted < pending) { + CERROR("%s: cli %s/%p tot_granted(%llu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + tgd->tgd_tot_granted -= pending; + + if (tgd->tgd_tot_pending < pending) { + CERROR("%s: cli %s/%p tot_pending(%llu) < grant_used(%lu)\n", + exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_pending, pending); + spin_unlock(&tgd->tgd_grant_lock); + LBUG(); + } + tgd->tgd_tot_pending -= pending; + spin_unlock(&tgd->tgd_grant_lock); + EXIT; +} +EXPORT_SYMBOL(tgt_grant_commit); + +struct tgt_grant_cb { + /* commit callback structure */ + struct dt_txn_commit_cb tgc_cb; + /* export associated with the bulk write */ + struct obd_export *tgc_exp; + /* pending grant to be released */ + unsigned long tgc_granted; +}; + +/** + * Callback function for grant releasing + * + * Release grant space reserved by the client node. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * \param[in] cb callback data + * \param[in] err error code + */ +static void tgt_grant_commit_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_grant_cb *tgc; + + tgc = container_of(cb, struct tgt_grant_cb, tgc_cb); + + tgt_grant_commit(tgc->tgc_exp, tgc->tgc_granted, err); + class_export_cb_put(tgc->tgc_exp); + OBD_FREE_PTR(tgc); +} + +/** + * Add callback for grant releasing + * + * Register a commit callback to release grant space. 
+ * + * \param[in] th transaction handle + * \param[in] exp OBD export of client + * \param[in] granted amount of grant space to be released upon commit + * + * \retval 0 on successful callback adding + * \retval negative value on error + */ +int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, + unsigned long granted) +{ + struct tgt_grant_cb *tgc; + struct dt_txn_commit_cb *dcb; + int rc; + ENTRY; + + OBD_ALLOC_PTR(tgc); + if (tgc == NULL) + RETURN(-ENOMEM); + + tgc->tgc_exp = class_export_cb_get(exp); + tgc->tgc_granted = granted; + + dcb = &tgc->tgc_cb; + dcb->dcb_func = tgt_grant_commit_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_grant_commit_cb", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(tgc->tgc_exp); + OBD_FREE_PTR(tgc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_grant_commit_cb_add); diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c new file mode 100644 index 0000000000000..d2113af69436b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c @@ -0,0 +1,2388 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2016, Intel Corporation. + */ +/* + * lustre/target/tgt_handler.c + * + * Lustre Unified Target request handler code + * + * Author: Brian Behlendorf + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#ifdef HAVE_UIDGID_HEADER +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +char *tgt_name(struct lu_target *tgt) +{ + LASSERT(tgt->lut_obd != NULL); + return tgt->lut_obd->obd_name; +} +EXPORT_SYMBOL(tgt_name); + +/* + * Generic code handling requests that have struct mdt_body passed in: + * + * - extract mdt_body from request and save it in @tsi, if present; + * + * - create lu_object, corresponding to the fid in mdt_body, and save it in + * @tsi; + * + * - if HABEO_CORPUS flag is set for this request type check whether object + * actually exists on storage (lu_object_exists()). + * + */ +static int tgt_mdt_body_unpack(struct tgt_session_info *tsi, __u32 flags) +{ + const struct mdt_body *body; + struct lu_object *obj; + struct req_capsule *pill = tsi->tsi_pill; + int rc; + + ENTRY; + + body = req_capsule_client_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EFAULT); + + tsi->tsi_mdt_body = body; + + if (!(body->mbo_valid & OBD_MD_FLID)) + RETURN(0); + + /* mdc_pack_body() doesn't check if fid is zero and set OBD_ML_FID + * in any case in pre-2.5 clients. 
Fix that here if needed */ + if (unlikely(fid_is_zero(&body->mbo_fid1))) + RETURN(0); + + if (!fid_is_sane(&body->mbo_fid1)) { + CERROR("%s: invalid FID: "DFID"\n", tgt_name(tsi->tsi_tgt), + PFID(&body->mbo_fid1)); + RETURN(-EINVAL); + } + + obj = lu_object_find(tsi->tsi_env, + &tsi->tsi_tgt->lut_bottom->dd_lu_dev, + &body->mbo_fid1, NULL); + if (!IS_ERR(obj)) { + if ((flags & HABEO_CORPUS) && !lu_object_exists(obj)) { + lu_object_put(tsi->tsi_env, obj); + rc = -ENOENT; + } else { + tsi->tsi_corpus = obj; + rc = 0; + } + } else { + rc = PTR_ERR(obj); + } + + tsi->tsi_fid = body->mbo_fid1; + + RETURN(rc); +} + +/** + * Validate oa from client. + * If the request comes from 2.0 clients, currently only RSVD seq and IDIF + * req are valid. + * a. objects in Single MDT FS seq = FID_SEQ_OST_MDT0, oi_id != 0 + * b. Echo objects(seq = 2), old echo client still use oi_id/oi_seq to + * pack ost_id. Because non-zero oi_seq will make it diffcult to tell + * whether this is oi_fid or real ostid. So it will check + * OBD_CONNECT_FID, then convert the ostid to FID for old client. + * c. Old FID-disable osc will send IDIF. + * d. new FID-enable osc/osp will send normal FID. + * + * And also oi_id/f_oid should always start from 1. oi_id/f_oid = 0 will + * be used for LAST_ID file, and only being accessed inside OST now. + */ +int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa) +{ + struct ost_id *oi = &oa->o_oi; + u64 seq = ostid_seq(oi); + u64 id = ostid_id(oi); + int rc; + ENTRY; + + if (unlikely(!(exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_FID) && + fid_seq_is_echo(seq))) { + /* Sigh 2.[123] client still sends echo req with oi_id = 0 + * during create, and we will reset this to 1, since this + * oi_id is basically useless in the following create process, + * but oi_id == 0 will make it difficult to tell whether it is + * real FID or ost_id. */ + oi->oi_fid.f_seq = FID_SEQ_ECHO; + oi->oi_fid.f_oid = id ?: 1; + oi->oi_fid.f_ver = 0; + } else { + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + + if (unlikely((oa->o_valid & OBD_MD_FLID) && id == 0)) + GOTO(out, rc = -EPROTO); + + /* Note: this check might be forced in 2.5 or 2.6, i.e. 
+ * all of the requests are required to setup FLGROUP */ + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) { + ostid_set_seq_mdt0(oi); + oa->o_valid |= OBD_MD_FLGROUP; + seq = ostid_seq(oi); + } + + if (unlikely(!(fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq) || + fid_seq_is_norm(seq) || fid_seq_is_echo(seq)))) + GOTO(out, rc = -EPROTO); + + rc = ostid_to_fid(&tti->tti_fid1, oi, + tsi->tsi_tgt->lut_lsd.lsd_osd_index); + if (unlikely(rc != 0)) + GOTO(out, rc); + + oi->oi_fid = tti->tti_fid1; + } + + RETURN(0); + +out: + CERROR("%s: client %s sent bad object "DOSTID": rc = %d\n", + tgt_name(tsi->tsi_tgt), obd_export_nid2str(tsi->tsi_exp), + seq, id, rc); + return rc; +} +EXPORT_SYMBOL(tgt_validate_obdo); + +static int tgt_io_data_unpack(struct tgt_session_info *tsi, struct ost_id *oi) +{ + unsigned max_brw; + struct niobuf_remote *rnb; + struct obd_ioobj *ioo; + int obj_count; + + ENTRY; + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + if (ioo == NULL) + RETURN(-EPROTO); + + rnb = req_capsule_client_get(tsi->tsi_pill, &RMF_NIOBUF_REMOTE); + if (rnb == NULL) + RETURN(-EPROTO); + + max_brw = ioobj_max_brw_get(ioo); + if (unlikely((max_brw & (max_brw - 1)) != 0)) { + CERROR("%s: client %s sent bad ioobj max %u for "DOSTID + ": rc = %d\n", tgt_name(tsi->tsi_tgt), + obd_export_nid2str(tsi->tsi_exp), max_brw, + POSTID(oi), -EPROTO); + RETURN(-EPROTO); + } + ioo->ioo_oid = *oi; + + obj_count = req_capsule_get_size(tsi->tsi_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + if (obj_count == 0) { + CERROR("%s: short ioobj\n", tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } else if (obj_count > 1) { + CERROR("%s: too many ioobjs (%d)\n", tgt_name(tsi->tsi_tgt), + obj_count); + RETURN(-EPROTO); + } + + if (ioo->ioo_bufcnt == 0) { + CERROR("%s: ioo has zero bufcnt\n", tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } + + if (ioo->ioo_bufcnt > PTLRPC_MAX_BRW_PAGES) { + DEBUG_REQ(D_RPCTRACE, tgt_ses_req(tsi), + "bulk has too many pages (%d)", + ioo->ioo_bufcnt); + RETURN(-EPROTO); + } + + RETURN(0); +} + +static int tgt_ost_body_unpack(struct tgt_session_info *tsi, __u32 flags) +{ + struct ost_body *body; + struct req_capsule *pill = tsi->tsi_pill; + struct lu_nodemap *nodemap; + int rc; + + ENTRY; + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + if (body == NULL) + RETURN(-EFAULT); + + rc = tgt_validate_obdo(tsi, &body->oa); + if (rc) + RETURN(rc); + + nodemap = nodemap_get_from_exp(tsi->tsi_exp); + if (IS_ERR(nodemap)) + RETURN(PTR_ERR(nodemap)); + + body->oa.o_uid = nodemap_map_id(nodemap, NODEMAP_UID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_uid); + body->oa.o_gid = nodemap_map_id(nodemap, NODEMAP_GID, + NODEMAP_CLIENT_TO_FS, + body->oa.o_gid); + nodemap_putref(nodemap); + + tsi->tsi_ost_body = body; + tsi->tsi_fid = body->oa.o_oi.oi_fid; + + if (req_capsule_has_field(pill, &RMF_OBD_IOOBJ, RCL_CLIENT)) { + rc = tgt_io_data_unpack(tsi, &body->oa.o_oi); + if (rc < 0) + RETURN(rc); + } + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + if (flags & HABEO_CORPUS) { + CERROR("%s: OBD_MD_FLID flag is not set in ost_body " + "but OID/FID is mandatory with HABEO_CORPUS\n", + tgt_name(tsi->tsi_tgt)); + RETURN(-EPROTO); + } else { + RETURN(0); + } + } + + ost_fid_build_resid(&tsi->tsi_fid, &tsi->tsi_resid); + + /* + * OST doesn't get object in advance for further use to prevent + * situations with nested object_find which is potential deadlock. + */ + tsi->tsi_corpus = NULL; + RETURN(rc); +} + +/* + * Do necessary preprocessing according to handler ->th_flags. 
+ */ +static int tgt_request_preprocess(struct tgt_session_info *tsi, + struct tgt_handler *h, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = tsi->tsi_pill; + __u32 flags = h->th_flags; + int rc = 0; + + ENTRY; + + if (tsi->tsi_preprocessed) + RETURN(0); + + LASSERT(h->th_act != NULL); + LASSERT(h->th_opc == lustre_msg_get_opc(req->rq_reqmsg)); + LASSERT(current->journal_info == NULL); + + LASSERT(ergo(flags & (HABEO_CORPUS | HABEO_REFERO), + h->th_fmt != NULL)); + if (h->th_fmt != NULL) { + req_capsule_set(pill, h->th_fmt); + if (req_capsule_has_field(pill, &RMF_MDT_BODY, RCL_CLIENT)) { + rc = tgt_mdt_body_unpack(tsi, flags); + if (rc < 0) + RETURN(rc); + } else if (req_capsule_has_field(pill, &RMF_OST_BODY, + RCL_CLIENT)) { + rc = tgt_ost_body_unpack(tsi, flags); + if (rc < 0) + RETURN(rc); + } + } + + if (flags & MUTABOR && tgt_conn_flags(tsi) & OBD_CONNECT_RDONLY) + RETURN(-EROFS); + + if (flags & HABEO_CLAVIS) { + struct ldlm_request *dlm_req; + + LASSERT(h->th_fmt != NULL); + + dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + if (unlikely(dlm_req->lock_desc.l_resource.lr_type == + LDLM_IBITS && + dlm_req->lock_desc.l_policy_data.\ + l_inodebits.bits == 0)) { + /* + * Lock without inodebits makes no sense and + * will oops later in ldlm. If client miss to + * set such bits, do not trigger ASSERTION. + * + * For liblustre flock case, it maybe zero. + */ + rc = -EPROTO; + } else { + tsi->tsi_dlm_req = dlm_req; + } + } else { + rc = -EFAULT; + } + } + tsi->tsi_preprocessed = 1; + RETURN(rc); +} + +/* + * Invoke handler for this request opc. Also do necessary preprocessing + * (according to handler ->th_flags), and post-processing (setting of + * ->last_{xid,committed}). + */ +static int tgt_handle_request0(struct tgt_session_info *tsi, + struct tgt_handler *h, + struct ptlrpc_request *req) +{ + int serious = 0; + int rc; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + + /* When dealing with sec context requests, no export is associated yet, + * because these requests are sent before *_CONNECT requests. + * A NULL req->rq_export means the normal *_common_slice handlers will + * not be called, because there is no reference to the target. + * So deal with them by hand and jump directly to target_send_reply(). + */ + switch (opc) { + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + CFS_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, cfs_fail_val); + GOTO(out, rc = 0); + } + + /* + * Checking for various OBD_FAIL_$PREF_$OPC_NET codes. _Do_ not try + * to put same checks into handlers like mdt_close(), mdt_reint(), + * etc., without talking to mdt authors first. Checking same thing + * there again is useless and returning 0 error without packing reply + * is buggy! Handlers either pack reply or return error. + * + * We return 0 here and do not send any reply in order to emulate + * network failure. Do not send any reply in case any of NET related + * fail_id has occured. 
+ */ + if (OBD_FAIL_CHECK_ORSET(h->th_fail_id, OBD_FAIL_ONCE)) + RETURN(0); + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT && + OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET))) + RETURN(0); + + rc = tgt_request_preprocess(tsi, h, req); + /* pack reply if reply format is fixed */ + if (rc == 0 && h->th_flags & HABEO_REFERO) { + /* Pack reply */ + if (req_capsule_has_field(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_MDT_MD, + RCL_SERVER, + tsi->tsi_mdt_body->mbo_eadatasize); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, &RMF_LOGCOOKIES, + RCL_SERVER, 0); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_ACL, RCL_SERVER)) + req_capsule_set_size(tsi->tsi_pill, + &RMF_ACL, RCL_SERVER, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + + rc = req_capsule_server_pack(tsi->tsi_pill); + } + + if (likely(rc == 0)) { + /* + * Process request, there can be two types of rc: + * 1) errors with msg unpack/pack, other failures outside the + * operation itself. This is counted as serious errors; + * 2) errors during fs operation, should be placed in rq_status + * only + */ + rc = h->th_act(tsi); + if (!is_serious(rc) && + !req->rq_no_reply && req->rq_reply_state == NULL) { + DEBUG_REQ(D_ERROR, req, "%s \"handler\" %s did not " + "pack reply and returned 0 error\n", + tgt_name(tsi->tsi_tgt), h->th_name); + LBUG(); + } + serious = is_serious(rc); + rc = clear_serious(rc); + } else { + serious = 1; + } + + req->rq_status = rc; + + /* + * ELDLM_* codes which > 0 should be in rq_status only as well as + * all non-serious errors. + */ + if (rc > 0 || !serious) + rc = 0; + + LASSERT(current->journal_info == NULL); + + if (likely(rc == 0 && req->rq_export)) + target_committed_to_req(req); + +out: + target_send_reply(req, rc, tsi->tsi_reply_fail_id); + RETURN(0); +} + +static int tgt_filter_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd, int *process) +{ + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_DISCONNECT: + case OST_DISCONNECT: + case OBD_IDX_READ: + *process = 1; + RETURN(0); + case MDS_CLOSE: + case MDS_SYNC: /* used in unmounting */ + case OBD_PING: + case MDS_REINT: + case OUT_UPDATE: + case SEQ_QUERY: + case FLD_QUERY: + case FLD_READ: + case LDLM_ENQUEUE: + case OST_CREATE: + case OST_DESTROY: + case OST_PUNCH: + case OST_SETATTR: + case OST_SYNC: + case OST_WRITE: + case MDS_HSM_PROGRESS: + case MDS_HSM_STATE_SET: + case MDS_HSM_REQUEST: + *process = target_queue_recovery_request(req, obd); + RETURN(0); + + default: + DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); + *process = -EAGAIN; + RETURN(0); + } +} + +/* + * Handle recovery. 
Return: + * +1: continue request processing; + * -ve: abort immediately with the given error code; + * 0: send reply with error code in req->rq_status; + */ +static int tgt_handle_recovery(struct ptlrpc_request *req, int reply_fail_id) +{ + ENTRY; + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_CONNECT: + case OST_CONNECT: + case MGS_CONNECT: + case SEC_CTX_INIT: + case SEC_CTX_INIT_CONT: + case SEC_CTX_FINI: + RETURN(+1); + } + + if (!req->rq_export->exp_obd->obd_replayable) + RETURN(+1); + + /* sanity check: if the xid matches, the request must be marked as a + * resent or replayed */ + if (req_can_reconstruct(req, NULL)) { + if (!(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY))) { + DEBUG_REQ(D_WARNING, req, "rq_xid %llu matches " + "saved xid, expected REPLAY or RESENT flag " + "(%x)", req->rq_xid, + lustre_msg_get_flags(req->rq_reqmsg)); + req->rq_status = -ENOTCONN; + RETURN(-ENOTCONN); + } + } + /* else: note the opposite is not always true; a RESENT req after a + * failover will usually not match the last_xid, since it was likely + * never committed. A REPLAYed request will almost never match the + * last xid, however it could for a committed, but still retained, + * open. */ + + /* Check for aborted recovery... */ + if (unlikely(req->rq_export->exp_obd->obd_recovering)) { + int rc; + int should_process; + + DEBUG_REQ(D_INFO, req, "Got new replay"); + rc = tgt_filter_recovery_request(req, req->rq_export->exp_obd, + &should_process); + if (rc != 0 || !should_process) + RETURN(rc); + else if (should_process < 0) { + req->rq_status = should_process; + rc = ptlrpc_error(req); + RETURN(rc); + } + } + RETURN(+1); +} + +/* Initial check for request, it is validation mostly */ +static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) +{ + struct tgt_handler *h; + struct tgt_opc_slice *s; + struct lu_target *tgt; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + ENTRY; + + tgt = class_exp2tgt(req->rq_export); + if (unlikely(tgt == NULL)) { + DEBUG_REQ(D_ERROR, req, "%s: No target for connected export\n", + class_exp2obd(req->rq_export)->obd_name); + RETURN(ERR_PTR(-EINVAL)); + } + + for (s = tgt->lut_slice; s->tos_hs != NULL; s++) + if (s->tos_opc_start <= opc && opc < s->tos_opc_end) + break; + + /* opcode was not found in slice */ + if (unlikely(s->tos_hs == NULL)) { + CERROR("%s: no handlers for opcode 0x%x\n", tgt_name(tgt), + opc); + RETURN(ERR_PTR(-ENOTSUPP)); + } + + LASSERT(opc >= s->tos_opc_start && opc < s->tos_opc_end); + h = s->tos_hs + (opc - s->tos_opc_start); + if (unlikely(h->th_opc == 0)) { + CERROR("%s: unsupported opcode 0x%x\n", tgt_name(tgt), opc); + RETURN(ERR_PTR(-ENOTSUPP)); + } + + RETURN(h); +} + +static int process_req_last_xid(struct ptlrpc_request *req) +{ + __u64 last_xid; + ENTRY; + + /* check request's xid is consistent with export's last_xid */ + last_xid = lustre_msg_get_last_xid(req->rq_reqmsg); + if (last_xid > req->rq_export->exp_last_xid) + req->rq_export->exp_last_xid = last_xid; + + if (req->rq_xid == 0 || + (req->rq_xid <= req->rq_export->exp_last_xid)) { + DEBUG_REQ(D_ERROR, req, "Unexpected xid %llx vs. " + "last_xid %llx\n", req->rq_xid, + req->rq_export->exp_last_xid); + /* Some request is allowed to be sent during replay, + * such as OUT update requests, FLD requests, so it + * is possible that replay requests has smaller XID + * than the exp_last_xid. 
+ * + * Some non-replay requests may have smaller XID as + * well: + * + * - Client send a no_resend RPC, like statfs; + * - The RPC timedout (or some other error) on client, + * then it's removed from the unreplied list; + * - Client send some other request to bump the + * exp_last_xid on server; + * - The former RPC got chance to be processed; + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) + RETURN(-EPROTO); + } + + /* try to release in-memory reply data */ + if (tgt_is_multimodrpcs_client(req->rq_export)) { + tgt_handle_received_xid(req->rq_export, + lustre_msg_get_last_xid(req->rq_reqmsg)); + if (!(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY))) + tgt_handle_tag(req->rq_export, + lustre_msg_get_tag(req->rq_reqmsg)); + } + RETURN(0); +} + +int tgt_request_handle(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + + struct lustre_msg *msg = req->rq_reqmsg; + struct tgt_handler *h; + struct lu_target *tgt; + int request_fail_id = 0; + __u32 opc = lustre_msg_get_opc(msg); + struct obd_device *obd; + int rc; + bool is_connect = false; + ENTRY; + + /* Refill the context, to make sure all thread keys are allocated */ + lu_env_refill(req->rq_svc_thread->t_env); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + + /* if request has export then get handlers slice from corresponding + * target, otherwise that should be connect operation */ + if (opc == MDS_CONNECT || opc == OST_CONNECT || + opc == MGS_CONNECT) { + is_connect = true; + req_capsule_set(&req->rq_pill, &RQF_CONNECT); + rc = target_handle_connect(req); + if (rc != 0) { + rc = ptlrpc_error(req); + GOTO(out, rc); + } + /* recovery-small test 18c asks to drop connect reply */ + if (unlikely(opc == OST_CONNECT && + OBD_FAIL_CHECK(OBD_FAIL_OST_CONNECT_NET2))) + GOTO(out, rc = 0); + } + + if (unlikely(!class_connected_export(req->rq_export))) { + if (opc == SEC_CTX_INIT || opc == SEC_CTX_INIT_CONT || + opc == SEC_CTX_FINI) { + /* sec context initialization has to be handled + * by hand in tgt_handle_request0() */ + tsi->tsi_reply_fail_id = OBD_FAIL_SEC_CTX_INIT_NET; + h = NULL; + GOTO(handle_recov, rc = 0); + } + CDEBUG(D_HA, "operation %d on unconnected OST from %s\n", + opc, libcfs_id2str(req->rq_peer)); + req->rq_status = -ENOTCONN; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + tsi->tsi_tgt = tgt = class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + if (exp_connect_flags(req->rq_export) & OBD_CONNECT_JOBSTATS) + tsi->tsi_jobid = lustre_msg_get_jobid(req->rq_reqmsg); + else + tsi->tsi_jobid = NULL; + + if (tgt == NULL) { + DEBUG_REQ(D_ERROR, req, "%s: No target for connected export\n", + class_exp2obd(req->rq_export)->obd_name); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + /* Skip last_xid processing for the recovery thread, otherwise, the + * last_xid on same request could be processed twice: first time when + * processing the incoming request, second time when the request is + * being processed by recovery thread. */ + obd = class_exp2obd(req->rq_export); + if (is_connect) { + /* reset the exp_last_xid on each connection. 
*/ + req->rq_export->exp_last_xid = 0; + } else if (obd->obd_recovery_data.trd_processing_task != + current_pid()) { + rc = process_req_last_xid(req); + if (rc) { + req->rq_status = rc; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + } + + request_fail_id = tgt->lut_request_fail_id; + tsi->tsi_reply_fail_id = tgt->lut_reply_fail_id; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + req->rq_status = PTR_ERR(h); + rc = ptlrpc_error(req); + GOTO(out, rc); + } + + LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", + h->th_opc, opc); + + if (CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE)) + GOTO(out, rc = 0); + + rc = lustre_msg_check_version(msg, h->th_version); + if (unlikely(rc)) { + DEBUG_REQ(D_ERROR, req, "%s: drop mal-formed request, version" + " %08x, expecting %08x\n", tgt_name(tgt), + lustre_msg_get_version(msg), h->th_version); + req->rq_status = -EINVAL; + rc = ptlrpc_error(req); + GOTO(out, rc); + } + +handle_recov: + rc = tgt_handle_recovery(req, tsi->tsi_reply_fail_id); + if (likely(rc == 1)) { + rc = tgt_handle_request0(tsi, h, req); + if (rc) + GOTO(out, rc); + } + EXIT; +out: + req_capsule_fini(tsi->tsi_pill); + if (tsi->tsi_corpus != NULL) { + lu_object_put(tsi->tsi_env, tsi->tsi_corpus); + tsi->tsi_corpus = NULL; + } + return rc; +} +EXPORT_SYMBOL(tgt_request_handle); + +/** Assign high priority operations to the request if needed. */ +int tgt_hpreq_handler(struct ptlrpc_request *req) +{ + struct tgt_session_info *tsi = tgt_ses_info(req->rq_svc_thread->t_env); + struct tgt_handler *h; + int rc; + + ENTRY; + + if (req->rq_export == NULL) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + tsi->tsi_pill = &req->rq_pill; + tsi->tsi_env = req->rq_svc_thread->t_env; + tsi->tsi_tgt = class_exp2tgt(req->rq_export); + tsi->tsi_exp = req->rq_export; + + h = tgt_handler_find_check(req); + if (IS_ERR(h)) { + rc = PTR_ERR(h); + RETURN(rc); + } + + rc = tgt_request_preprocess(tsi, h, req); + if (unlikely(rc != 0)) + RETURN(rc); + + if (h->th_hp != NULL) + h->th_hp(tsi); + RETURN(0); +} +EXPORT_SYMBOL(tgt_hpreq_handler); + +void tgt_counter_incr(struct obd_export *exp, int opcode) +{ + lprocfs_counter_incr(exp->exp_obd->obd_stats, opcode); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats != NULL) + lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode); +} +EXPORT_SYMBOL(tgt_counter_incr); + +/* + * Unified target generic handlers. 
+ */ + +int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp) +{ + struct lu_target *tgt = class_exp2tgt(exp); + struct sptlrpc_flavor flvr; + int rc = 0; + + LASSERT(tgt); + LASSERT(tgt->lut_obd); + LASSERT(tgt->lut_slice); + + /* always allow ECHO client */ + if (unlikely(strcmp(exp->exp_obd->obd_type->typ_name, + LUSTRE_ECHO_NAME) == 0)) { + exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; + return 0; + } + + if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + read_lock(&tgt->lut_sptlrpc_lock); + sptlrpc_target_choose_flavor(&tgt->lut_sptlrpc_rset, + req->rq_sp_from, + req->rq_peer.nid, + &flvr); + read_unlock(&tgt->lut_sptlrpc_lock); + + spin_lock(&exp->exp_lock); + exp->exp_sp_peer = req->rq_sp_from; + exp->exp_flvr = flvr; + + /* when on mgs, if no restriction is set, or if client + * is loopback, allow any flavor */ + if ((strcmp(exp->exp_obd->obd_type->typ_name, + LUSTRE_MGS_NAME) == 0) && + (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL || + LNET_NETTYP(LNET_NIDNET(exp->exp_connection->c_peer.nid)) + == LOLND)) + exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; + + if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && + exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CERROR("%s: unauthorized rpc flavor %x from %s, " + "expect %x\n", tgt_name(tgt), + req->rq_flvr.sf_rpc, + libcfs_nid2str(req->rq_peer.nid), + exp->exp_flvr.sf_rpc); + rc = -EACCES; + } + spin_unlock(&exp->exp_lock); + } else { + if (exp->exp_sp_peer != req->rq_sp_from) { + CERROR("%s: RPC source %s doesn't match %s\n", + tgt_name(tgt), + sptlrpc_part2name(req->rq_sp_from), + sptlrpc_part2name(exp->exp_sp_peer)); + rc = -EACCES; + } else { + rc = sptlrpc_target_export_check(exp, req); + } + } + + return rc; +} + +int tgt_adapt_sptlrpc_conf(struct lu_target *tgt) +{ + struct sptlrpc_rule_set tmp_rset; + int rc; + + if (unlikely(tgt == NULL)) { + CERROR("No target passed"); + return -EINVAL; + } + + sptlrpc_rule_set_init(&tmp_rset); + rc = sptlrpc_conf_target_get_rules(tgt->lut_obd, &tmp_rset); + if (rc) { + CERROR("%s: failed get sptlrpc rules: rc = %d\n", + tgt_name(tgt), rc); + return rc; + } + + sptlrpc_target_update_exp_flavor(tgt->lut_obd, &tmp_rset); + + write_lock(&tgt->lut_sptlrpc_lock); + sptlrpc_rule_set_free(&tgt->lut_sptlrpc_rset); + tgt->lut_sptlrpc_rset = tmp_rset; + write_unlock(&tgt->lut_sptlrpc_lock); + + return 0; +} +EXPORT_SYMBOL(tgt_adapt_sptlrpc_conf); + +int tgt_connect(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_connect_data *reply; + int rc; + + ENTRY; + + /* XXX: better to call this check right after getting new export but + * before last_rcvd slot allocation to avoid server load upon insecure + * connects. This is to be fixed after unifiyng all targets. + */ + rc = tgt_connect_check_sptlrpc(req, tsi->tsi_exp); + if (rc) + GOTO(out, rc); + + /* To avoid exposing partially initialized connection flags, changes up + * to this point have been staged in reply->ocd_connect_flags. Now that + * connection handling has completed successfully, atomically update + * the connect flags in the shared export data structure. 
LU-1623 */ + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); + spin_lock(&tsi->tsi_exp->exp_lock); + *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; + tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; + spin_unlock(&tsi->tsi_exp->exp_lock); + + RETURN(0); +out: + obd_disconnect(class_export_get(tsi->tsi_exp)); + return rc; +} +EXPORT_SYMBOL(tgt_connect); + +int tgt_disconnect(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = target_handle_disconnect(tgt_ses_req(tsi)); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_disconnect); + +/* + * Unified target OBD handlers + */ +int tgt_obd_ping(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = target_handle_ping(tgt_ses_req(tsi)); + if (rc) + RETURN(err_serious(rc)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_obd_ping); + +int tgt_obd_log_cancel(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_send_buffer(struct tgt_session_info *tsi, struct lu_rdbuf *rdbuf) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + struct l_wait_info *lwi = &tti->tti_u.update.tti_wait_info; + int i; + int rc; + + ENTRY; + + desc = ptlrpc_prep_bulk_exp(req, rdbuf->rb_nbufs, 1, + PTLRPC_BULK_PUT_SOURCE | PTLRPC_BULK_BUF_KVEC, + MDS_BULK_PORTAL, &ptlrpc_bulk_kvec_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < rdbuf->rb_nbufs; i++) + desc->bd_frag_ops->add_iov_frag(desc, + rdbuf->rb_bufs[i].lb_buf, + rdbuf->rb_bufs[i].lb_len); + + rc = target_bulk_io(exp, desc, lwi); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_send_buffer); + +int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + struct l_wait_info *lwi = &tti->tti_u.rdpg.tti_wait_info; + int tmpcount; + int tmpsize; + int i; + int rc; + + ENTRY; + + desc = ptlrpc_prep_bulk_exp(req, rdpg->rp_npages, 1, + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + MDS_BULK_PORTAL, + &ptlrpc_bulk_kiov_pin_ops); + if (desc == NULL) + RETURN(-ENOMEM); + + if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)) + /* old client requires reply size in it's PAGE_SIZE, + * which is rdpg->rp_count */ + nob = rdpg->rp_count; + + for (i = 0, tmpcount = nob; i < rdpg->rp_npages && tmpcount > 0; + i++, tmpcount -= tmpsize) { + tmpsize = min_t(int, tmpcount, PAGE_SIZE); + desc->bd_frag_ops->add_kiov_frag(desc, rdpg->rp_pages[i], 0, + tmpsize); + } + + LASSERT(desc->bd_nob == nob); + rc = target_bulk_io(exp, desc, lwi); + ptlrpc_free_bulk(desc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sendpage); + +/* + * OBD_IDX_READ handler + */ +static int tgt_obd_idx_read(struct tgt_session_info *tsi) +{ + struct tgt_thread_info *tti = tgt_th_info(tsi->tsi_env); + struct lu_rdpg *rdpg = &tti->tti_u.rdpg.tti_rdpg; + struct idx_info *req_ii, *rep_ii; + int rc, i; + + ENTRY; + + memset(rdpg, 0, sizeof(*rdpg)); + req_capsule_set(tsi->tsi_pill, &RQF_OBD_IDX_READ); + + /* extract idx_info buffer from request & reply */ + req_ii = req_capsule_client_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (req_ii == NULL || req_ii->ii_magic != IDX_INFO_MAGIC) + RETURN(err_serious(-EPROTO)); + + rc = req_capsule_server_pack(tsi->tsi_pill); + if (rc) + RETURN(err_serious(rc)); + + 
rep_ii = req_capsule_server_get(tsi->tsi_pill, &RMF_IDX_INFO); + if (rep_ii == NULL) + RETURN(err_serious(-EFAULT)); + rep_ii->ii_magic = IDX_INFO_MAGIC; + + /* extract hash to start with */ + rdpg->rp_hash = req_ii->ii_hash_start; + + /* extract requested attributes */ + rdpg->rp_attrs = req_ii->ii_attrs; + + /* check that fid packed in request is valid and supported */ + if (!fid_is_sane(&req_ii->ii_fid)) + RETURN(-EINVAL); + rep_ii->ii_fid = req_ii->ii_fid; + + /* copy flags */ + rep_ii->ii_flags = req_ii->ii_flags; + + /* compute number of pages to allocate, ii_count is the number of 4KB + * containers */ + if (req_ii->ii_count <= 0) + GOTO(out, rc = -EFAULT); + rdpg->rp_count = min_t(unsigned int, req_ii->ii_count << LU_PAGE_SHIFT, + exp_max_brw_size(tsi->tsi_exp)); + rdpg->rp_npages = (rdpg->rp_count + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* allocate pages to store the containers */ + OBD_ALLOC(rdpg->rp_pages, rdpg->rp_npages * sizeof(rdpg->rp_pages[0])); + if (rdpg->rp_pages == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < rdpg->rp_npages; i++) { + rdpg->rp_pages[i] = alloc_page(GFP_NOFS); + if (rdpg->rp_pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + + /* populate pages with key/record pairs */ + rc = dt_index_read(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, rep_ii, rdpg); + if (rc < 0) + GOTO(out, rc); + + LASSERTF(rc <= rdpg->rp_count, "dt_index_read() returned more than " + "asked %d > %d\n", rc, rdpg->rp_count); + + /* send pages to client */ + rc = tgt_sendpage(tsi, rdpg, rc); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rdpg->rp_pages) { + for (i = 0; i < rdpg->rp_npages; i++) + if (rdpg->rp_pages[i]) + __free_page(rdpg->rp_pages[i]); + OBD_FREE(rdpg->rp_pages, + rdpg->rp_npages * sizeof(rdpg->rp_pages[0])); + } + return rc; +} + +struct tgt_handler tgt_obd_handlers[] = { +TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), +TGT_OBD_HDL_VAR(0, OBD_LOG_CANCEL, tgt_obd_log_cancel), +TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) +}; +EXPORT_SYMBOL(tgt_obd_handlers); + +int tgt_sync(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 start, __u64 end) +{ + int rc = 0; + + ENTRY; + + /* if no objid is specified, it means "sync whole filesystem" */ + if (obj == NULL) { + rc = dt_sync(env, tgt->lut_bottom); + } else if (dt_version_get(env, obj) > + tgt->lut_obd->obd_last_committed) { + rc = dt_object_sync(env, obj, start, end); + } + atomic_inc(&tgt->lut_sync_count); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_sync); +/* + * Unified target DLM handlers. + */ + +/** + * Unified target BAST + * + * Ensure data and metadata are synced to disk when lock is canceled if Sync on + * Cancel (SOC) is enabled. If it's extent lock, normally sync obj is enough, + * but if it's cross-MDT lock, because remote object version is not set, a + * filesystem sync is needed. 
+ * + * \param lock server side lock + * \param desc lock desc + * \param data ldlm_cb_set_arg + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 on success + * \retval negative number on error + */ +static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lu_env env; + struct lu_target *tgt; + struct dt_object *obj = NULL; + struct lu_fid fid; + int rc = 0; + + ENTRY; + + tgt = class_exp2tgt(lock->l_export); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(lock->l_export)->obd_name); + RETURN(-EINVAL); + } + + if (flag == LDLM_CB_CANCELING && + (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && + (tgt->lut_sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL || + (tgt->lut_sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL && + ldlm_is_cbpending(lock))) && + ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || + lock->l_resource->lr_type == LDLM_EXTENT)) { + __u64 start = 0; + __u64 end = OBD_OBJECT_EOF; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (unlikely(rc != 0)) + RETURN(rc); + + ost_fid_from_resid(&fid, &lock->l_resource->lr_name, + tgt->lut_lsd.lsd_osd_index); + + if (lock->l_resource->lr_type == LDLM_EXTENT) { + obj = dt_locate(&env, tgt->lut_bottom, &fid); + if (IS_ERR(obj)) + GOTO(err_env, rc = PTR_ERR(obj)); + + if (!dt_object_exists(obj)) + GOTO(err_put, rc = -ENOENT); + + start = lock->l_policy_data.l_extent.start; + end = lock->l_policy_data.l_extent.end; + } + + rc = tgt_sync(&env, tgt, obj, start, end); + if (rc < 0) { + CERROR("%s: syncing "DFID" (%llu-%llu) on lock " + "cancel: rc = %d\n", + tgt_name(tgt), PFID(&fid), + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, rc); + } +err_put: + if (obj != NULL) + dt_object_put(&env, obj); +err_env: + lu_env_fini(&env); + } + + rc = ldlm_server_blocking_ast(lock, desc, data, flag); + RETURN(rc); +} + +static struct ldlm_callback_suite tgt_dlm_cbs = { + .lcs_completion = ldlm_server_completion_ast, + .lcs_blocking = tgt_blocking_ast, + .lcs_glimpse = ldlm_server_glimpse_ast +}; + +int tgt_enqueue(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + /* + * tsi->tsi_dlm_req was already swapped and (if necessary) converted, + * tsi->tsi_dlm_cbs was set by the *_req_handle() function. 
+ */ + LASSERT(tsi->tsi_dlm_req != NULL); + rc = ldlm_handle_enqueue0(tsi->tsi_exp->exp_obd->obd_namespace, req, + tsi->tsi_dlm_req, &tgt_dlm_cbs); + if (rc) + RETURN(err_serious(rc)); + + switch (LUT_FAIL_CLASS(tsi->tsi_reply_fail_id)) { + case LUT_FAIL_MDT: + tsi->tsi_reply_fail_id = OBD_FAIL_MDS_LDLM_REPLY_NET; + break; + case LUT_FAIL_OST: + tsi->tsi_reply_fail_id = OBD_FAIL_OST_LDLM_REPLY_NET; + break; + case LUT_FAIL_MGT: + tsi->tsi_reply_fail_id = OBD_FAIL_MGS_LDLM_REPLY_NET; + break; + default: + tsi->tsi_reply_fail_id = OBD_FAIL_LDLM_REPLY; + break; + } + RETURN(req->rq_status); +} +EXPORT_SYMBOL(tgt_enqueue); + +int tgt_convert(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + int rc; + + ENTRY; + LASSERT(tsi->tsi_dlm_req); + rc = ldlm_handle_convert0(req, tsi->tsi_dlm_req); + if (rc) + RETURN(err_serious(rc)); + + RETURN(req->rq_status); +} + +int tgt_bl_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +int tgt_cp_callback(struct tgt_session_info *tsi) +{ + return err_serious(-EOPNOTSUPP); +} + +/* generic LDLM target handler */ +struct tgt_handler tgt_dlm_handlers[] = { +TGT_DLM_HDL (HABEO_CLAVIS, LDLM_ENQUEUE, tgt_enqueue), +TGT_DLM_HDL_VAR(HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), +TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) +}; +EXPORT_SYMBOL(tgt_dlm_handlers); + +/* + * Unified target LLOG handlers. + */ +int tgt_llog_open(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_open(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_open); + +int tgt_llog_close(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_close(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_close); + + +int tgt_llog_destroy(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_destroy(tgt_ses_req(tsi)); + + RETURN(rc); +} + +int tgt_llog_read_header(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_read_header(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_read_header); + +int tgt_llog_next_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_next_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_next_block); + +int tgt_llog_prev_block(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_prev_block(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_prev_block); + +/* generic llog target handler */ +struct tgt_handler tgt_llog_handlers[] = { +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_DESTROY, tgt_llog_destroy), +TGT_LLOG_HDL_VAR(0, LLOG_ORIGIN_HANDLE_CLOSE, tgt_llog_close), +}; +EXPORT_SYMBOL(tgt_llog_handlers); + +/* + * sec context handlers + */ +/* XXX: Implement based on mdt_sec_ctx_handle()? 
*/ +static int tgt_sec_ctx_handle(struct tgt_session_info *tsi) +{ + return 0; +} + +struct tgt_handler tgt_sec_ctx_handlers[] = { +TGT_SEC_HDL_VAR(0, SEC_CTX_INIT, tgt_sec_ctx_handle), +TGT_SEC_HDL_VAR(0, SEC_CTX_INIT_CONT, tgt_sec_ctx_handle), +TGT_SEC_HDL_VAR(0, SEC_CTX_FINI, tgt_sec_ctx_handle), +}; +EXPORT_SYMBOL(tgt_sec_ctx_handlers); + +int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th) = NULL; + +void tgt_register_lfsck_in_notify_local(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_req_local *, + struct thandle *)) +{ + tgt_lfsck_in_notify_local = notify; +} +EXPORT_SYMBOL(tgt_register_lfsck_in_notify_local); + +int (*tgt_lfsck_in_notify)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_request *lr) = NULL; + +void tgt_register_lfsck_in_notify(int (*notify)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *)) +{ + tgt_lfsck_in_notify = notify; +} +EXPORT_SYMBOL(tgt_register_lfsck_in_notify); + +static int (*tgt_lfsck_query)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_request *req, + struct lfsck_reply *rep, + struct lfsck_query *que) = NULL; + +void tgt_register_lfsck_query(int (*query)(const struct lu_env *, + struct dt_device *, + struct lfsck_request *, + struct lfsck_reply *, + struct lfsck_query *)) +{ + tgt_lfsck_query = query; +} +EXPORT_SYMBOL(tgt_register_lfsck_query); + +/* LFSCK request handlers */ +static int tgt_handle_lfsck_notify(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct dt_device *key = tsi->tsi_tgt->lut_bottom; + struct lfsck_request *lr; + int rc; + ENTRY; + + lr = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST); + if (lr == NULL) + RETURN(-EPROTO); + + rc = tgt_lfsck_in_notify(env, key, lr); + + RETURN(rc); +} + +static int tgt_handle_lfsck_query(struct tgt_session_info *tsi) +{ + struct lfsck_request *request; + struct lfsck_reply *reply; + int rc; + ENTRY; + + request = req_capsule_client_get(tsi->tsi_pill, &RMF_LFSCK_REQUEST); + if (request == NULL) + RETURN(-EPROTO); + + reply = req_capsule_server_get(tsi->tsi_pill, &RMF_LFSCK_REPLY); + if (reply == NULL) + RETURN(-ENOMEM); + + rc = tgt_lfsck_query(tsi->tsi_env, tsi->tsi_tgt->lut_bottom, + request, reply, NULL); + + RETURN(rc < 0 ? rc : 0); +} + +struct tgt_handler tgt_lfsck_handlers[] = { +TGT_LFSCK_HDL(HABEO_REFERO, LFSCK_NOTIFY, tgt_handle_lfsck_notify), +TGT_LFSCK_HDL(HABEO_REFERO, LFSCK_QUERY, tgt_handle_lfsck_query), +}; +EXPORT_SYMBOL(tgt_lfsck_handlers); + +/* + * initialize per-thread page pool (bug 5137). + */ +int tgt_io_thread_init(struct ptlrpc_thread *thread) +{ + struct tgt_thread_big_cache *tbc; + + ENTRY; + + LASSERT(thread != NULL); + LASSERT(thread->t_data == NULL); + + OBD_ALLOC_LARGE(tbc, sizeof(*tbc)); + if (tbc == NULL) + RETURN(-ENOMEM); + thread->t_data = tbc; + RETURN(0); +} +EXPORT_SYMBOL(tgt_io_thread_init); + +/* + * free per-thread pool created by tgt_thread_init(). + */ +void tgt_io_thread_done(struct ptlrpc_thread *thread) +{ + struct tgt_thread_big_cache *tbc; + + ENTRY; + + LASSERT(thread != NULL); + + /* + * be prepared to handle partially-initialized pools (because this is + * called from ost_io_thread_init() for cleanup. 
+ */ + tbc = thread->t_data; + if (tbc != NULL) { + OBD_FREE_LARGE(tbc, sizeof(*tbc)); + thread->t_data = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_io_thread_done); +/** + * Helper function for getting server side [start, start+count] DLM lock + * if asked by client. + */ +int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + __u64 start, __u64 end, struct lustre_handle *lh, + int mode, __u64 *flags) +{ + union ldlm_policy_data policy; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + policy.l_extent.gid = 0; + policy.l_extent.start = start & PAGE_MASK; + + /* + * If ->o_blocks is EOF it means "lock till the end of the file". + * Otherwise, it's size of an extent or hole being punched (in bytes). + */ + if (end == OBD_OBJECT_EOF || end < start) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = end | ~PAGE_MASK; + + rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_EXTENT, &policy, mode, + flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + RETURN(rc == ELDLM_OK ? 0 : -EIO); +} +EXPORT_SYMBOL(tgt_extent_lock); + +void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_extent_unlock); + +int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct obd_ioobj *obj, struct niobuf_remote *nb, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + __u64 flags = 0; + int nrbufs = obj->ioo_bufcnt; + int i; + + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT(!lustre_handle_is_used(lh)); + + if (ns->ns_obd->obd_recovering) + RETURN(0); + + if (nrbufs == 0 || !(nb[0].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(0); + + for (i = 1; i < nrbufs; i++) + if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) + RETURN(-EFAULT); + + RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, + lh, mode, &flags)); +} + +void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) +{ + ENTRY; + + LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT((obj->ioo_bufcnt > 0 && + (niob[0].rnb_flags & OBD_BRW_SRVLOCK)) == + lustre_handle_is_used(lh)); + + if (lustre_handle_is_used(lh)) + tgt_extent_unlock(lh, mode); + EXIT; +} + +static __u32 tgt_checksum_bulk(struct lu_target *tgt, + struct ptlrpc_bulk_desc *desc, int opc, + cksum_type_t cksum_type) +{ + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + int i, err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + __u32 cksum; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); + for (i = 0; i < desc->bd_iov_count; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK; + int len = BD_GET_KIOV(desc, i).kiov_len; + struct page *np = tgt_page_to_corrupt; + char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off; + + if (np) { + char *ptr2 = kmap(np) + off; + + memcpy(ptr2, ptr, len); + memcpy(ptr2, 
"bad3", min(4, len)); + kunmap(np); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = BD_GET_KIOV(desc, + i).kiov_page->index; + + BD_GET_KIOV(desc, i).kiov_page = np; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + cfs_crypto_hash_update_page(hdesc, + BD_GET_KIOV(desc, i).kiov_page, + BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK, + BD_GET_KIOV(desc, i).kiov_len); + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { + int off = BD_GET_KIOV(desc, i).kiov_offset + & ~PAGE_MASK; + int len = BD_GET_KIOV(desc, i).kiov_len; + struct page *np = tgt_page_to_corrupt; + char *ptr = + kmap(BD_GET_KIOV(desc, i).kiov_page) + off; + + if (np) { + char *ptr2 = kmap(np) + off; + + memcpy(ptr2, ptr, len); + memcpy(ptr2, "bad4", min(4, len)); + kunmap(np); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = BD_GET_KIOV(desc, + i).kiov_page->index; + + BD_GET_KIOV(desc, i).kiov_page = np; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + + bufsize = sizeof(cksum); + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + return cksum; +} + +char dbgcksum_file_name[PATH_MAX]; + +static void dump_all_bulk_pages(struct obdo *oa, int count, + lnet_kiov_t *iov, __u32 server_cksum, + __u32 client_cksum) +{ + struct file *filp; + int rc, i; + unsigned int len; + char *buf; + + /* will only keep dump of pages on first error for the same range in + * file/fid, not during the resends/retries. */ + snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name), + "%s-checksum_dump-ost-"DFID":[%llu-%llu]-%x-%x", + (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 ? + libcfs_debug_file_path_arr : + LIBCFS_DEBUG_FILE_PATH_DEFAULT), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + (__u64)iov[0].kiov_page->index << PAGE_SHIFT, + ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) + + iov[count - 1].kiov_len - 1, client_cksum, server_cksum); + filp = filp_open(dbgcksum_file_name, + O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + if (rc == -EEXIST) + CDEBUG(D_INFO, "%s: can't open to dump pages with " + "checksum error: rc = %d\n", dbgcksum_file_name, + rc); + else + CERROR("%s: can't open to dump pages with checksum " + "error: rc = %d\n", dbgcksum_file_name, rc); + return; + } + + for (i = 0; i < count; i++) { + len = iov[i].kiov_len; + buf = kmap(iov[i].kiov_page); + while (len != 0) { + rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); + if (rc < 0) { + CERROR("%s: wanted to write %u but got %d " + "error\n", dbgcksum_file_name, len, rc); + break; + } + len -= rc; + buf += rc; + CDEBUG(D_INFO, "%s: wrote %d bytes\n", + dbgcksum_file_name, rc); + } + kunmap(iov[i].kiov_page); + } + + rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); + if (rc) + CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc); + filp_close(filp, NULL); + return; +} + +static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, + const lnet_process_id_t *peer, + __u32 client_cksum, __u32 server_cksum, + cksum_type_t server_cksum_type) +{ + char *msg; + cksum_type_t cksum_type; + + /* unlikely to happen and only if resend does not occur due to cksum + * control failure on Client */ + if (unlikely(server_cksum == client_cksum)) { + CDEBUG(D_PAGE, "checksum %x confirmed upon retry\n", + client_cksum); + return 0; + } + + if (desc->bd_export->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, desc->bd_iov_count, + &BD_GET_KIOV(desc, 0), server_cksum, + client_cksum); + + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + + if (cksum_type != server_cksum_type) + msg = "the server may have not used the checksum type specified" + " in the original request - likely a protocol problem"; + else + msg = "should have changed on the client or in transit"; + + LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " + DFID " object "DOSTID" extent [%llu-%llu], client returned csum" + " %x (type %x), server csum %x (type %x)\n", + desc->bd_export->exp_obd->obd_name, + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, + POSTID(&oa->o_oi), + (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT, + ((__u64)BD_GET_KIOV(desc, + desc->bd_iov_count - 1).kiov_page->index + << PAGE_SHIFT) + + BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1, + client_cksum, cksum_type, server_cksum, server_cksum_type); + return 1; +} + +int tgt_brw_read(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = tsi->tsi_exp; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct l_wait_info lwi; + struct lustre_handle lockh = { 0 }; + int npages, nob = 0, rc, i, no_reply = 0; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { + CERROR("%s: deny read request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(-EPROTO); + } + + req->rq_bulk_read = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) + RETURN(-EIO); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? + cfs_fail_val : (obd_timeout + 1) / 4); + + /* Check if there is eviction in progress, and if so, wait for it to + * finish */ + if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) { + /* We do not care how long it takes */ + lwi = LWI_INTR(NULL, NULL); + rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq, + !atomic_read(&exp->exp_obd->obd_evict_inprogress), + &lwi); + } + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + ioo = req_capsule_client_get(tsi->tsi_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + + local_nb = tbc->local; + + rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, + remote_nb, &lockh, LCK_PR); + if (rc != 0) + RETURN(rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. 
b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = 1; + CERROR("Dropping timed-out read from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + libcfs_id2str(req->rq_peer), POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + rc = obd_preprw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, + ioo, remote_nb, &npages, local_nb); + if (rc != 0) + GOTO(out_lock, rc); + + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); + + nob = 0; + for (i = 0; i < npages; i++) { + int page_rc = local_nb[i].lnb_rc; + + if (page_rc < 0) { + rc = page_rc; + break; + } + + nob += page_rc; + if (page_rc != 0) { /* some data! */ + LASSERT(local_nb[i].lnb_page != NULL); + desc->bd_frag_ops->add_kiov_frag + (desc, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset, + page_rc); + } + + if (page_rc != local_nb[i].lnb_len) { /* short read */ + /* All subsequent pages should be 0 */ + while (++i < npages) + LASSERT(local_nb[i].lnb_rc == 0); + break; + } + } + if (OBD_FAIL_CHECK(OBD_FAIL_OST_READ_SIZE) && + nob != cfs_fail_val) + rc = -E2BIG; + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + cksum_type_t cksum_type = + cksum_type_unpack(body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0); + + repbody->oa.o_flags = cksum_type_pack(cksum_type); + repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, + OST_READ, cksum_type); + CDEBUG(D_PAGE, "checksum at read origin: %x\n", + repbody->oa.o_cksum); + + /* if a resend it could be for a cksum error, so check Server + * cksum with returned Client cksum (this should even cover + * zero-cksum case) */ + if ((body->oa.o_valid & OBD_MD_FLFLAGS) && + (body->oa.o_flags & OBD_FL_RECOV_RESEND)) + check_read_checksum(desc, &body->oa, &req->rq_peer, + body->oa.o_cksum, + repbody->oa.o_cksum, cksum_type); + } else { + repbody->oa.o_valid = 0; + } + /* We're finishing using body->oa as an input variable */ + + /* Check if client was evicted while we were doing i/o before touching + * network */ + if (likely(rc == 0 && + !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) && + !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) { + rc = target_bulk_io(exp, desc, &lwi); + no_reply = rc != 0; + } + +out_commitrw: + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, ioo, + remote_nb, npages, local_nb, rc); +out_lock: + tgt_brw_unlock(ioo, remote_nb, &lockh, LCK_PR); + + if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) + ptlrpc_free_bulk(desc); + + LASSERT(rc <= 0); + if (rc == 0) { + rc = nob; + ptlrpc_lprocfs_brw(req, nob); + } else if (no_reply) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " + "client will retry: rc %d\n", + exp->exp_obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + /* send a bulk after reply to simulate a network delay or reordering + * by a router */ + if 
(unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) { + wait_queue_head_t waitq; + struct l_wait_info lwi1; + + CDEBUG(D_INFO, "reorder BULK\n"); + init_waitqueue_head(&waitq); + + lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi1); + target_bulk_io(exp, desc, &lwi); + ptlrpc_free_bulk(desc); + } + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_read); + +static void tgt_warn_on_cksum(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + struct niobuf_local *local_nb, int npages, + u32 client_cksum, u32 server_cksum, + bool mmap) +{ + struct obd_export *exp = req->rq_export; + struct ost_body *body; + char *router = ""; + char *via = ""; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body != NULL); + + if (req->rq_peer.nid != desc->bd_sender) { + via = " via "; + router = libcfs_nid2str(desc->bd_sender); + } + + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(&body->oa, desc->bd_iov_count, + &BD_GET_KIOV(desc, 0), server_cksum, + client_cksum); + + if (mmap) { + CDEBUG_LIMIT(D_INFO, "client csum %x, server csum %x\n", + client_cksum, server_cksum); + return; + } + + LCONSOLE_ERROR_MSG(0x168, "%s: BAD WRITE CHECKSUM: from %s%s%s inode " + DFID" object "DOSTID" extent [%llu-%llu" + "]: client csum %x, server csum %x\n", + exp->exp_obd->obd_name, libcfs_id2str(req->rq_peer), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + local_nb[0].lnb_file_offset, + local_nb[npages-1].lnb_file_offset + + local_nb[npages - 1].lnb_len - 1, + client_cksum, server_cksum); +} + +int tgt_brw_write(struct tgt_session_info *tsi) +{ + struct ptlrpc_request *req = tgt_ses_req(tsi); + struct ptlrpc_bulk_desc *desc = NULL; + struct obd_export *exp = req->rq_export; + struct niobuf_remote *remote_nb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body, *repbody; + struct l_wait_info lwi; + struct lustre_handle lockh = {0}; + __u32 *rcs; + int objcount, niocount, npages; + int rc, i, j; + cksum_type_t cksum_type = OBD_CKSUM_CRC32; + bool no_reply = false, mmap; + struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + bool wait_sync = false; + + ENTRY; + + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { + CERROR("%s: deny write request from %s to portal %u\n", + tgt_name(tsi->tsi_tgt), + obd_export_nid2str(req->rq_export), + ptlrpc_req2svc(req)->srv_req_portal); + RETURN(err_serious(-EPROTO)); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOSPC)) + RETURN(err_serious(-ENOSPC)); + if (OBD_FAIL_TIMEOUT(OBD_FAIL_OST_EROFS, 1)) + RETURN(err_serious(-EROFS)); + + req->rq_bulk_write = 1; + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) + RETURN(err_serious(-EIO)); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK2)) + RETURN(err_serious(-EFAULT)); + + /* pause before transaction has been started */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? 
+ cfs_fail_val : (obd_timeout + 1) / 4); + + /* There must be big cache in current thread to process this request + * if it is NULL then something went wrong and it wasn't allocated, + * report -ENOMEM in that case */ + if (tbc == NULL) + RETURN(-ENOMEM); + + body = tsi->tsi_ost_body; + LASSERT(body != NULL); + + ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ); + LASSERT(ioo != NULL); /* must exists after tgt_ost_body_unpack */ + + objcount = req_capsule_get_size(&req->rq_pill, &RMF_OBD_IOOBJ, + RCL_CLIENT) / sizeof(*ioo); + + for (niocount = i = 0; i < objcount; i++) + niocount += ioo[i].ioo_bufcnt; + + remote_nb = req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE); + LASSERT(remote_nb != NULL); /* must exists after tgt_ost_body_unpack */ + if (niocount != req_capsule_get_size(&req->rq_pill, + &RMF_NIOBUF_REMOTE, RCL_CLIENT) / + sizeof(*remote_nb)) + RETURN(err_serious(-EPROTO)); + + if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) && + ptlrpc_connection_is_local(exp->exp_connection)) + memory_pressure_set(); + + req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER, + niocount * sizeof(*rcs)); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc != 0) + GOTO(out, rc = err_serious(rc)); + + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, cfs_fail_val); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + + local_nb = tbc->local; + + rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, + remote_nb, &lockh, LCK_PW); + if (rc != 0) + GOTO(out, rc); + + /* + * If getting the lock took more time than + * client was willing to wait, drop it. b=11330 + */ + if (ktime_get_real_seconds() > req->rq_deadline || + OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { + no_reply = true; + CERROR("%s: Dropping timed-out write from %s because locking object " DOSTID " took %lld seconds (limit was %lld).\n", + tgt_name(tsi->tsi_tgt), libcfs_id2str(req->rq_peer), + POSTID(&ioo->ioo_oid), + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); + GOTO(out_lock, rc = -ETIMEDOUT); + } + + /* Because we already sync grant info with client when reconnect, + * grant info will be cleared for resent req, then fed_grant and + * total_grant will not be modified in following preprw_write */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (repbody == NULL) + GOTO(out_lock, rc = -ENOMEM); + repbody->oa = body->oa; + + npages = PTLRPC_MAX_BRW_PAGES; + rc = obd_preprw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, &npages, local_nb); + if (rc < 0) + GOTO(out_lock, rc); + + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... 
*/ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len); + + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); + + rc = target_bulk_io(exp, desc, &lwi); + no_reply = rc != 0; + +skip_transfer: + if (body->oa.o_valid & OBD_MD_FLCKSUM && rc == 0) { + static int cksum_counter; + + if (body->oa.o_valid & OBD_MD_FLFLAGS) + cksum_type = cksum_type_unpack(body->oa.o_flags); + + repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; + repbody->oa.o_flags |= cksum_type_pack(cksum_type); + repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, + OST_WRITE, cksum_type); + cksum_counter++; + + if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { + mmap = (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_MMAP); + + tgt_warn_on_cksum(req, desc, local_nb, npages, + body->oa.o_cksum, + repbody->oa.o_cksum, mmap); + cksum_counter = 0; + } else if ((cksum_counter & (-cksum_counter)) == + cksum_counter) { + CDEBUG(D_INFO, "Checksum %u from %s OK: %x\n", + cksum_counter, libcfs_id2str(req->rq_peer), + repbody->oa.o_cksum); + } + } + + /* Must commit after prep above in all cases */ + rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, + objcount, ioo, remote_nb, npages, local_nb, rc); + if (rc == -ENOTCONN) + /* quota acquire process has been given up because + * either the client has been evicted or the client + * has timed out the request already */ + no_reply = true; + + for (i = 0; i < niocount; i++) { + if (!(local_nb[i].lnb_flags & OBD_BRW_ASYNC)) { + wait_sync = true; + break; + } + } + /* + * Disable sending mtime back to the client. If the client locked the + * whole object, then it has already updated the mtime on its side, + * otherwise it will have to glimpse anyway (see bug 21489, comment 32) + */ + repbody->oa.o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLATIME); + + if (rc == 0) { + int nob = 0; + + /* set per-requested niobuf return codes */ + for (i = j = 0; i < niocount; i++) { + int len = remote_nb[i].rnb_len; + + nob += len; + rcs[i] = 0; + do { + LASSERT(j < npages); + if (local_nb[j].lnb_rc < 0) + rcs[i] = local_nb[j].lnb_rc; + len -= local_nb[j].lnb_len; + j++; + } while (len > 0); + LASSERT(len == 0); + } + LASSERT(j == npages); + ptlrpc_lprocfs_brw(req, nob); + } +out_lock: + tgt_brw_unlock(ioo, remote_nb, &lockh, LCK_PW); + if (desc) + ptlrpc_free_bulk(desc); +out: + if (unlikely(no_reply || (exp->exp_obd->obd_no_transno && wait_sync))) { + req->rq_no_reply = 1; + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); + if (!exp->exp_obd->obd_no_transno) + LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," + " client will retry: rc = %d\n", + exp->exp_obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); + } + memory_pressure_clr(); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_brw_write); + +/* Check if request can be reconstructed from saved reply data + * A copy of the reply data is returned in @trd if the pointer is not NULL + */ +bool req_can_reconstruct(struct ptlrpc_request *req, + struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lsd_client_data *lcd = ted->ted_lcd; + bool found; + + if (tgt_is_multimodrpcs_client(req->rq_export)) + return tgt_lookup_reply(req, trd); + + mutex_lock(&ted->ted_lcd_lock); + found = req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == 
lcd->lcd_last_close_xid; + + if (found && trd != NULL) { + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + trd->trd_reply.lrd_xid = lcd->lcd_last_close_xid; + trd->trd_reply.lrd_transno = + lcd->lcd_last_close_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_close_result; + } else { + trd->trd_reply.lrd_xid = lcd->lcd_last_xid; + trd->trd_reply.lrd_transno = lcd->lcd_last_transno; + trd->trd_reply.lrd_result = lcd->lcd_last_result; + trd->trd_reply.lrd_data = lcd->lcd_last_data; + trd->trd_pre_versions[0] = lcd->lcd_pre_versions[0]; + trd->trd_pre_versions[1] = lcd->lcd_pre_versions[1]; + trd->trd_pre_versions[2] = lcd->lcd_pre_versions[2]; + trd->trd_pre_versions[3] = lcd->lcd_pre_versions[3]; + } + } + mutex_unlock(&ted->ted_lcd_lock); + + return found; +} +EXPORT_SYMBOL(req_can_reconstruct); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h new file mode 100644 index 0000000000000..981e2ab9f9ade --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -0,0 +1,291 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * lustre/target/tgt_internal.h + * + * Lustre Unified Target header file + * + * Author: Mikhail Pershin + */ + +#ifndef _TG_INTERNAL_H +#define _TG_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +extern int (*tgt_lfsck_in_notify_local)(const struct lu_env *env, + struct dt_device *key, + struct lfsck_req_local *lrl, + struct thandle *th); +/** + * Common data shared by tg-level handlers. This is allocated per-thread to + * reduce stack consumption. 
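+ *
+ * For illustration only (hypothetical names, not code from this patch):
+ * the point of such a structure is to hoist large or frequently needed
+ * buffers off the limited kernel stack into per-thread storage that is
+ * looked up through the lu_env, as tgt_th_info() does below with
+ * tgt_thread_key.  A reduced sketch:
+ *
+ *	struct demo_thread_info {
+ *		char		dti_scratch[4096];	// too big for the stack
+ *		struct lu_fid	dti_fid;
+ *	};
+ *
+ *	extern struct lu_context_key demo_thread_key;
+ *
+ *	static inline struct demo_thread_info *
+ *	demo_info(const struct lu_env *env)
+ *	{
+ *		struct demo_thread_info *info;
+ *
+ *		info = lu_context_key_get(&env->le_ctx, &demo_thread_key);
+ *		LASSERT(info != NULL);
+ *		return info;		// one instance per service thread
+ *	}
+ *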
+ */ +struct tgt_thread_info { + /* server and client data buffers */ + struct lr_server_data tti_lsd; + struct lsd_client_data tti_lcd; + struct lsd_reply_data tti_lrd; + struct lu_buf tti_buf; + loff_t tti_off; + + struct lu_attr tti_attr; + struct lu_fid tti_fid1; + + /* transno storage during last_rcvd update */ + __u64 tti_transno; + __u32 tti_has_trans:1, + tti_mult_trans:1; + + /* Updates data for OUT target */ + struct thandle_exec_args tti_tea; + union { + struct { + /* for tgt_readpage() */ + struct lu_rdpg tti_rdpg; + /* for tgt_sendpage() */ + struct l_wait_info tti_wait_info; + } rdpg; + struct { + struct dt_object_format tti_update_dof; + struct object_update_reply *tti_update_reply; + struct object_update *tti_update; + int tti_update_reply_index; + struct obdo tti_obdo; + struct dt_object *tti_dt_object; + struct l_wait_info tti_wait_info; + } update; + struct obd_statfs osfs; /* for obd_statfs() in OFD/MDT */ + } tti_u; + struct lfsck_req_local tti_lrl; + struct dt_insert_rec tti_rec; +}; + +extern struct lu_context_key tgt_thread_key; + +static inline struct tgt_thread_info *tgt_th_info(const struct lu_env *env) +{ + struct tgt_thread_info *tti; + + tti = lu_context_key_get(&env->le_ctx, &tgt_thread_key); + LASSERT(tti); + return tti; +} + +#define MGS_SERVICE_WATCHDOG_FACTOR (2) + +int tgt_request_handle(struct ptlrpc_request *req); + +/* check if request's xid is equal to last one or not*/ +static inline int req_xid_is_last(struct ptlrpc_request *req) +{ + struct lsd_client_data *lcd = req->rq_export->exp_target_data.ted_lcd; + + LASSERT(lcd != NULL); + return (req->rq_xid == lcd->lcd_last_xid || + req->rq_xid == lcd->lcd_last_close_xid); +} + +static inline char *dt_obd_name(struct dt_device *dt) +{ + return dt->dd_lu_dev.ld_obd->obd_name; +} + +/* out_lib.c */ +int out_tx_create_exec(const struct lu_env *env, struct thandle *th, + struct tx_arg *arg); +struct tx_arg *tx_add_exec(struct thandle_exec_args *ta, + tx_exec_func_t func, tx_exec_func_t undo, + const char *file, int line); + +int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, + struct lu_attr *attr, struct lu_fid *parent_fid, + struct dt_object_format *dof, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_attr *attr, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_write_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, loff_t pos, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const struct lu_buf *buf, const char *name, + int flags, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_xattr_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + const char *name, struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_ref_add_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int 
out_ref_del_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, int index, + const char *file, int line); + +int out_index_insert_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_index_delete_add_exec(const struct lu_env *env, + struct dt_object *dt_obj, + const struct dt_key *key, + struct thandle_exec_args *ta, + struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +int out_destroy_add_exec(const struct lu_env *env, struct dt_object *dt_obj, + struct thandle_exec_args *ta, struct thandle *th, + struct object_update_reply *reply, + int index, const char *file, int line); + +/* Update handlers */ +int out_handle(struct tgt_session_info *tsi); + +#define out_tx_create(env, obj, attr, fid, dof, ta, th, reply, idx) \ + out_create_add_exec(env, obj, attr, fid, dof, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_attr_set(env, obj, attr, ta, th, reply, idx) \ + out_attr_set_add_exec(env, obj, attr, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_set(env, obj, buf, name, fl, ta, th, reply, idx) \ + out_xattr_set_add_exec(env, obj, buf, name, fl, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_xattr_del(env, obj, name, ta, th, reply, idx) \ + out_xattr_del_add_exec(env, obj, name, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_add(env, obj, ta, th, reply, idx) \ + out_ref_add_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_ref_del(env, obj, ta, th, reply, idx) \ + out_ref_del_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_insert(env, obj, rec, key, ta, th, reply, idx) \ + out_index_insert_add_exec(env, obj, rec, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_index_delete(env, obj, key, ta, th, reply, idx) \ + out_index_delete_add_exec(env, obj, key, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_destroy(env, obj, ta, th, reply, idx) \ + out_destroy_add_exec(env, obj, ta, th, reply, idx, \ + __FILE__, __LINE__) + +#define out_tx_write(env, obj, buf, pos, ta, th, reply, idx) \ + out_write_add_exec(env, obj, buf, pos, ta, th, reply, idx,\ + __FILE__, __LINE__) + +const char *update_op_str(__u16 opcode); + +extern struct page *tgt_page_to_corrupt; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt); +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie); +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid); +int tgt_handle_tag(struct obd_export *exp, __u16 tag); + +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates); +int check_and_prepare_update_record(const struct lu_env *env, + struct thandle_update_records *tur); +struct update_thread_info { + struct lu_attr uti_attr; + struct lu_fid uti_fid; + struct lu_buf uti_buf; + struct thandle_update_records uti_tur; + struct obdo uti_obdo; + struct thandle_exec_args uti_tea; + struct dt_insert_rec uti_rec; + struct distribute_txn_replay_req *uti_dtrq; +}; + +extern struct lu_context_key update_thread_key; + +static inline struct 
update_thread_info * +update_env_info(const struct lu_env *env) +{ + struct update_thread_info *uti; + + uti = lu_context_key_get(&env->le_ctx, &update_thread_key); + LASSERT(uti != NULL); + return uti; +} + +void update_info_init(void); +void update_info_fini(void); +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev); +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st); +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new); +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th); + +void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); +void barrier_init(void); +void barrier_fini(void); +#endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c new file mode 100644 index 0000000000000..c7aecdf2171ea --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -0,0 +1,2117 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * Lustre Unified Target + * These are common function to work with last_received file + * + * Author: Mikhail Pershin + */ +#include +#include +#include + +#include "tgt_internal.h" + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 + +/* Allocate a bitmap for a chunk of reply data slots */ +static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk) +{ + unsigned long *bm; + + OBD_ALLOC_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + if (bm == NULL) + return -ENOMEM; + + spin_lock(&lut->lut_client_bitmap_lock); + + if (lut->lut_reply_bitmap[chunk] != NULL) { + /* someone else already allocated the bitmap for this chunk */ + spin_unlock(&lut->lut_client_bitmap_lock); + OBD_FREE_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + return 0; + } + + lut->lut_reply_bitmap[chunk] = bm; + + spin_unlock(&lut->lut_client_bitmap_lock); + + return 0; +} + +/* Look for an available reply data slot in the bitmap + * of the target @lut + * Allocate bitmap chunk when first used + * XXX algo could be improved if this routine limits performance + */ +static int tgt_find_free_reply_slot(struct lu_target *lut) +{ + unsigned long *bmp; + int chunk = 0; + int rc; + int b; + + for (chunk = 0; chunk < LUT_REPLY_SLOTS_MAX_CHUNKS; chunk++) { + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + bmp = lut->lut_reply_bitmap[chunk]; + + /* look for an available slot in this chunk */ + do { + b = find_first_zero_bit(bmp, LUT_REPLY_SLOTS_PER_CHUNK); + if (b >= LUT_REPLY_SLOTS_PER_CHUNK) + break; + + /* found one */ + if (test_and_set_bit(b, bmp) == 0) + return chunk * LUT_REPLY_SLOTS_PER_CHUNK + b; + } while (true); + } + + return -ENOSPC; +} + +/* Mark the reply data slot @idx 'used' in the corresponding bitmap chunk + * of the target @lut + * Allocate the bitmap chunk if necessary + */ +static int tgt_set_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + int rc; + + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + /* allocate the bitmap chunk if necessary */ + if (unlikely(lut->lut_reply_bitmap[chunk] == NULL)) { + rc = tgt_bitmap_chunk_alloc(lut, chunk); + if (rc != 0) + return rc; + } + + /* mark the slot 'used' in this chunk */ + if (test_and_set_bit(b, lut->lut_reply_bitmap[chunk]) != 0) { + CERROR("%s: slot %d already set in bitmap\n", + tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Mark the reply data slot @idx 'unused' in the corresponding bitmap chunk + * of the target @lut + */ +static int tgt_clear_reply_slot(struct lu_target *lut, int idx) +{ + int chunk; + int b; + + chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; + b = idx % LUT_REPLY_SLOTS_PER_CHUNK; + + LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS); + LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK); + + if (lut->lut_reply_bitmap[chunk] == NULL) { + CERROR("%s: slot %d not allocated\n", + tgt_name(lut), idx); + return -ENOENT; + } + + if (test_and_clear_bit(b, lut->lut_reply_bitmap[chunk]) == 0) { + CERROR("%s: slot %d already clear in bitmap\n", + tgt_name(lut), idx); + return -EALREADY; + } + + return 0; +} + + +/* Read header of reply_data file of target @tgt into structure @lrh */ +static int tgt_reply_header_read(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + 
struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrh->lrh_magic = le32_to_cpu(buf.lrh_magic); + lrh->lrh_header_size = le32_to_cpu(buf.lrh_header_size); + lrh->lrh_reply_size = le32_to_cpu(buf.lrh_reply_size); + + CDEBUG(D_HA, "%s: read %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + return 0; +} + +/* Write header into replay_data file of target @tgt from structure @lrh */ +static int tgt_reply_header_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_reply_header *lrh) +{ + int rc; + struct lsd_reply_header buf; + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + struct dt_object *dto; + + CDEBUG(D_HA, "%s: write %s header. magic=0x%08x " + "header_size=%d reply_size=%d\n", + tgt->lut_obd->obd_name, REPLY_DATA, + lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + buf.lrh_magic = cpu_to_le32(lrh->lrh_magic); + buf.lrh_header_size = cpu_to_le32(lrh->lrh_header_size); + buf.lrh_reply_size = cpu_to_le32(lrh->lrh_reply_size); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + return PTR_ERR(th); + th->th_sync = 1; + + tti->tti_off = 0; + tti->tti_buf.lb_buf = &buf; + tti->tti_buf.lb_len = sizeof(buf); + + rc = dt_declare_record_write(env, tgt->lut_reply_data, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + return rc; +} + +/* Write the reply data @lrd into reply_data file of target @tgt + * at offset @off + */ +static int tgt_reply_data_write(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off, + struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + struct lsd_reply_data *buf = &tti->tti_lrd; + + lrd->lrd_result = ptlrpc_status_hton(lrd->lrd_result); + + buf->lrd_transno = cpu_to_le64(lrd->lrd_transno); + buf->lrd_xid = cpu_to_le64(lrd->lrd_xid); + buf->lrd_data = cpu_to_le64(lrd->lrd_data); + buf->lrd_result = cpu_to_le32(lrd->lrd_result); + buf->lrd_client_gen = cpu_to_le32(lrd->lrd_client_gen); + + lrd->lrd_result = ptlrpc_status_ntoh(lrd->lrd_result); + + tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); +} + +/* Read the reply data from reply_data file of target @tgt at offset @off + * into structure @lrd + */ +static int tgt_reply_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_reply_data *lrd, loff_t off) +{ + int rc; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *buf = &tti->tti_lrd; + + tti->tti_off = off; + tti->tti_buf.lb_buf = buf; + tti->tti_buf.lb_len = sizeof(*buf); + + rc = dt_record_read(env, tgt->lut_reply_data, &tti->tti_buf, + &tti->tti_off); + if (rc != 0) + return rc; + + lrd->lrd_transno = le64_to_cpu(buf->lrd_transno); + lrd->lrd_xid = 
le64_to_cpu(buf->lrd_xid); + lrd->lrd_data = le64_to_cpu(buf->lrd_data); + lrd->lrd_result = le32_to_cpu(buf->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(buf->lrd_client_gen); + + return 0; +} + + +/* Free the in-memory reply data structure @trd and release + * the corresponding slot in the reply_data file of target @lut + * Called with ted_lcd_lock held + */ +static void tgt_free_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: free reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + list_del(&trd->trd_list); + ted->ted_reply_cnt--; + if (lut != NULL) + tgt_clear_reply_slot(lut, trd->trd_index); + OBD_FREE_PTR(trd); +} + +/* Release the reply data @trd from target @lut + * The reply data with the highest transno for this export + * is retained to ensure correctness of target recovery + * Called with ted_lcd_lock held + */ +static void tgt_release_reply_data(struct lu_target *lut, + struct tg_export_data *ted, + struct tg_reply_data *trd) +{ + CDEBUG(D_TRACE, "%s: release reply data %p: xid %llu, transno %llu, " + "client gen %u, slot idx %d\n", + lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid, + trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen, + trd->trd_index); + + LASSERT(mutex_is_locked(&ted->ted_lcd_lock)); + + /* Do not free the reply data corresponding to the + * highest transno of this export. + * This ensures on-disk reply data is kept and + * last committed transno can be restored from disk in case + * of target recovery + */ + if (trd->trd_reply.lrd_transno == ted->ted_lcd->lcd_last_transno) { + /* free previous retained reply */ + if (ted->ted_reply_last != NULL) + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + /* retain the reply */ + list_del_init(&trd->trd_list); + ted->ted_reply_last = trd; + } else { + tgt_free_reply_data(lut, ted, trd); + } +} + +static inline struct lu_buf *tti_buf_lsd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lsd; + tti->tti_buf.lb_len = sizeof(tti->tti_lsd); + return &tti->tti_buf; +} + +static inline struct lu_buf *tti_buf_lcd(struct tgt_thread_info *tti) +{ + tti->tti_buf.lb_buf = &tti->tti_lcd; + tti->tti_buf.lb_len = sizeof(tti->tti_lcd); + return &tti->tti_buf; +} + +/** + * Allocate in-memory data for client slot related to export. + */ +int tgt_client_alloc(struct obd_export *exp) +{ + ENTRY; + LASSERT(exp != exp->exp_obd->obd_self_export); + + spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + + OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); + if (exp->exp_target_data.ted_lcd == NULL) + RETURN(-ENOMEM); + /* Mark that slot is not yet valid, 0 doesn't work here */ + exp->exp_target_data.ted_lr_idx = -1; + INIT_LIST_HEAD(&exp->exp_target_data.ted_reply_list); + mutex_init(&exp->exp_target_data.ted_lcd_lock); + RETURN(0); +} +EXPORT_SYMBOL(tgt_client_alloc); + +/** + * Free in-memory data for client slot related to export. 
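+ *
+ * For illustration only (hypothetical names, not code from this patch):
+ * tgt_reply_data_write() and tgt_reply_data_read() above convert every
+ * field of struct lsd_reply_data to little-endian before it is written
+ * and back to CPU order after it is read, so the on-disk layout does not
+ * depend on the byte order of the server.  The same round trip for a
+ * made-up two-field record:
+ *
+ *	struct demo_rec_ondisk {		// what goes to disk
+ *		__le64	dr_transno;
+ *		__le32	dr_result;
+ *	};
+ *
+ *	struct demo_rec {			// what the code works with
+ *		__u64	dr_transno;
+ *		__u32	dr_result;
+ *	};
+ *
+ *	static void demo_rec_pack(struct demo_rec_ondisk *dst,
+ *				  const struct demo_rec *src)
+ *	{
+ *		dst->dr_transno = cpu_to_le64(src->dr_transno);
+ *		dst->dr_result  = cpu_to_le32(src->dr_result);
+ *	}
+ *
+ *	static void demo_rec_unpack(struct demo_rec *dst,
+ *				    const struct demo_rec_ondisk *src)
+ *	{
+ *		dst->dr_transno = le64_to_cpu(src->dr_transno);
+ *		dst->dr_result  = le32_to_cpu(src->dr_result);
+ *	}
+ *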
+ */ +void tgt_client_free(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + LASSERT(exp != exp->exp_obd->obd_self_export); + + /* free reply data */ + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + tgt_release_reply_data(lut, ted, trd); + } + if (ted->ted_reply_last != NULL) { + tgt_free_reply_data(lut, ted, ted->ted_reply_last); + ted->ted_reply_last = NULL; + } + mutex_unlock(&ted->ted_lcd_lock); + + if (!hlist_unhashed(&exp->exp_gen_hash)) + cfs_hash_del(exp->exp_obd->obd_gen_hash, + &ted->ted_lcd->lcd_generation, + &exp->exp_gen_hash); + + OBD_FREE_PTR(ted->ted_lcd); + ted->ted_lcd = NULL; + + /* Target may have been freed (see LU-7430) + * Slot may be not yet assigned */ + if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC || + ted->ted_lr_idx < 0) + return; + + /* Clear bit when lcd is freed */ + LASSERT(lut && lut->lut_client_bitmap); + if (!test_and_clear_bit(ted->ted_lr_idx, lut->lut_client_bitmap)) { + CERROR("%s: client %u bit already clear in bitmap\n", + exp->exp_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } + + if (tgt_is_multimodrpcs_client(exp) && !exp->exp_obd->obd_stopping) + atomic_dec(&lut->lut_num_clients); +} +EXPORT_SYMBOL(tgt_client_free); + +static inline void tgt_check_lcd(const char *obd_name, int index, + struct lsd_client_data *lcd) +{ + size_t uuid_size = sizeof(lcd->lcd_uuid); + + if (strnlen((char*)lcd->lcd_uuid, uuid_size) == uuid_size) { + lcd->lcd_uuid[uuid_size - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +static int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, int index) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti_buf_lcd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, off); + if (rc == 0) { + tgt_check_lcd(tgt->lut_obd->obd_name, index, &tti->tti_lcd); + lcd_le_to_cpu(&tti->tti_lcd, lcd); + lcd->lcd_last_result = ptlrpc_status_ntoh(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_ntoh(lcd->lcd_last_close_result); + } + + CDEBUG(D_INFO, "%s: read lcd @%lld uuid = %s, last_transno = %llu" + ", last_xid = %llu, last_result = %u, last_data = %u, " + "last_close_transno = %llu, last_close_xid = %llu, " + "last_close_result = %u, rc = %d\n", tgt->lut_obd->obd_name, + *off, lcd->lcd_uuid, lcd->lcd_last_transno, lcd->lcd_last_xid, + lcd->lcd_last_result, lcd->lcd_last_data, + lcd->lcd_last_close_transno, lcd->lcd_last_close_xid, + lcd->lcd_last_close_result, rc); + return rc; +} + +static int tgt_client_data_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + + lcd->lcd_last_result = ptlrpc_status_hton(lcd->lcd_last_result); + lcd->lcd_last_close_result = + ptlrpc_status_hton(lcd->lcd_last_close_result); + lcd_cpu_to_le(lcd, &tti->tti_lcd); + tti_buf_lcd(tti); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + return dt_record_write(env, dto, &tti->tti_buf, off, th); +} + +struct tgt_new_client_callback { + struct dt_txn_commit_cb lncc_cb; + struct obd_export *lncc_exp; +}; + +static void tgt_cb_new_client(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int 
err) +{ + struct tgt_new_client_callback *ccb; + + ccb = container_of0(cb, struct tgt_new_client_callback, lncc_cb); + + LASSERT(ccb->lncc_exp->exp_obd); + + CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", + ccb->lncc_exp->exp_obd->obd_name, + ccb->lncc_exp->exp_client_uuid.uuid); + + spin_lock(&ccb->lncc_exp->exp_lock); + + ccb->lncc_exp->exp_need_sync = 0; + + spin_unlock(&ccb->lncc_exp->exp_lock); + class_export_cb_put(ccb->lncc_exp); + + OBD_FREE_PTR(ccb); +} + +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp) +{ + struct tgt_new_client_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->lncc_exp = class_export_cb_get(exp); + + dcb = &ccb->lncc_cb; + dcb->dcb_func = tgt_cb_new_client; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_new_client", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + return rc; +} + +/** + * Update client data in last_rcvd + */ +static int tgt_client_data_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + tti_buf_lcd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, + ted->ted_lr_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + mutex_lock(&ted->ted_lcd_lock); + + /* + * Until this operations will be committed the sync is needed + * for this export. This should be done _after_ starting the + * transaction so that many connecting clients will not bring + * server down with lots of sync writes. 
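+ *
+ * For illustration only (hypothetical names, not code from this patch):
+ * the commit-callback half of this arrangement, tgt_cb_new_client()
+ * above, reduced to a skeleton.  struct demo_cb and demo_commit_cb() are
+ * made up, and reference counting and freeing are omitted.
+ *
+ *	struct demo_cb {
+ *		struct dt_txn_commit_cb	dc_cb;
+ *		struct obd_export	*dc_exp;
+ *	};
+ *
+ *	static void demo_commit_cb(struct lu_env *env, struct thandle *th,
+ *				   struct dt_txn_commit_cb *cb, int err)
+ *	{
+ *		struct demo_cb *d = container_of(cb, struct demo_cb, dc_cb);
+ *
+ *		// the client record has reached stable storage, so replies
+ *		// for this export no longer need synchronous commits
+ *		spin_lock(&d->dc_exp->exp_lock);
+ *		d->dc_exp->exp_need_sync = 0;
+ *		spin_unlock(&d->dc_exp->exp_lock);
+ *	}
+ *
+ * The caller registers the callback on the open transaction; if
+ * registration fails it falls back to th->th_sync = 1, otherwise it sets
+ * exp->exp_need_sync = 1, which is exactly what the code below does.
+ *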
+ */ + rc = tgt_new_client_cb_add(th, exp); + if (rc) { + /* can't add callback, do sync now */ + th->th_sync = 1; + } else { + spin_lock(&exp->exp_lock); + exp->exp_need_sync = 1; + spin_unlock(&exp->exp_lock); + } + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + + mutex_unlock(&ted->ted_lcd_lock); + + EXIT; +out: + dt_trans_stop(env, tgt->lut_bottom, th); + CDEBUG(D_INFO, "%s: update last_rcvd client data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + return rc; +} + +static int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + int rc; + + tti->tti_off = 0; + tti_buf_lsd(tti); + rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, + &tti->tti_off); + if (rc == 0) + lsd_le_to_cpu(&tti->tti_lsd, &tgt->lut_lsd); + + CDEBUG(D_INFO, "%s: read last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + return rc; +} + +static int tgt_server_data_write(const struct lu_env *env, + struct lu_target *tgt, struct thandle *th) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + ENTRY; + + tti->tti_off = 0; + tti_buf_lsd(tti); + lsd_cpu_to_le(&tgt->lut_lsd, &tti->tti_lsd); + + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + rc = dt_record_write(env, dto, &tti->tti_buf, &tti->tti_off, th); + + CDEBUG(D_INFO, "%s: write last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + + RETURN(rc); +} + +/** + * Update server data in last_rcvd + */ +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tgt, + int sync) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct thandle *th; + int rc = 0; + + ENTRY; + + CDEBUG(D_SUPER, + "%s: mount_count is %llu, last_transno is %llu\n", + tgt->lut_lsd.lsd_uuid, tgt->lut_obd->u.obt.obt_mount_count, + tgt->lut_last_transno); + + /* Always save latest transno to keep it fresh */ + spin_lock(&tgt->lut_translock); + tgt->lut_lsd.lsd_last_transno = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + th->th_sync = sync; + + tti_buf_lsd(tti); + rc = dt_declare_record_write(env, tgt->lut_last_rcvd, + &tti->tti_buf, tti->tti_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start(env, tgt->lut_bottom, th); + if (rc) + GOTO(out, rc); + + rc = tgt_server_data_write(env, tgt, th); +out: + dt_trans_stop(env, tgt->lut_bottom, th); + + CDEBUG(D_INFO, "%s: update last_rcvd server data for UUID = %s, " + "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, + tgt->lut_lsd.lsd_uuid, tgt->lut_lsd.lsd_last_transno, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_server_data_update); + +static int tgt_truncate_last_rcvd(const struct lu_env *env, + struct lu_target *tgt, loff_t size) +{ + struct dt_object *dt = tgt->lut_last_rcvd; + struct thandle *th; + struct lu_attr attr; + int rc; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + attr.la_size = size; + attr.la_valid = LA_SIZE; + + th = dt_trans_create(env, tgt->lut_bottom); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + rc = dt_declare_punch(env, dt, size, 
OBD_OBJECT_EOF, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_declare_attr_set(env, dt, &attr, th); + if (rc) + GOTO(cleanup, rc); + rc = dt_trans_start_local(env, tgt->lut_bottom, th); + if (rc) + GOTO(cleanup, rc); + + rc = dt_punch(env, dt, size, OBD_OBJECT_EOF, th); + if (rc == 0) + rc = dt_attr_set(env, dt, &attr, th); + +cleanup: + dt_trans_stop(env, tgt->lut_bottom, th); + + RETURN(rc); +} + +static void tgt_client_epoch_update(const struct lu_env *env, + struct obd_export *exp) +{ + struct lsd_client_data *lcd = exp->exp_target_data.ted_lcd; + struct lu_target *tgt = class_exp2tgt(exp); + + LASSERT(tgt && tgt->lut_bottom); + /** VBR: set client last_epoch to current epoch */ + if (lcd->lcd_last_epoch >= tgt->lut_lsd.lsd_start_epoch) + return; + lcd->lcd_last_epoch = tgt->lut_lsd.lsd_start_epoch; + tgt_client_data_update(env, exp); +} + +/** + * Update boot epoch when recovery ends + */ +void tgt_boot_epoch_update(struct lu_target *tgt) +{ + struct lu_env env; + struct ptlrpc_request *req; + __u32 start_epoch; + struct list_head client_list; + int rc; + + if (tgt->lut_obd->obd_stopping) + return; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) { + CERROR("%s: can't initialize environment: rc = %d\n", + tgt->lut_obd->obd_name, rc); + return; + } + + spin_lock(&tgt->lut_translock); + start_epoch = (tgt->lut_last_transno >> LR_EPOCH_BITS) + 1; + tgt->lut_last_transno = (__u64)start_epoch << LR_EPOCH_BITS; + tgt->lut_lsd.lsd_start_epoch = start_epoch; + spin_unlock(&tgt->lut_translock); + + INIT_LIST_HEAD(&client_list); + /** + * The recovery is not yet finished and final queue can still be updated + * with resend requests. Move final list to separate one for processing + */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&tgt->lut_obd->obd_final_req_queue, &client_list); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** + * go through list of exports participated in recovery and + * set new epoch for them + */ + list_for_each_entry(req, &client_list, rq_list) { + LASSERT(!req->rq_export->exp_delayed); + if (!req->rq_export->exp_vbr_failed) + tgt_client_epoch_update(&env, req->rq_export); + } + /** return list back at once */ + spin_lock(&tgt->lut_obd->obd_recovery_task_lock); + list_splice_init(&client_list, &tgt->lut_obd->obd_final_req_queue); + spin_unlock(&tgt->lut_obd->obd_recovery_task_lock); + + /** Clear MULTI RPCS incompatibility flag if + * - target is MDT and + * - there is no client to recover or the recovery was aborted + */ + if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && + (tgt->lut_obd->obd_max_recoverable_clients == 0 || + tgt->lut_obd->obd_abort_recovery)) + tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + + /** update server epoch */ + tgt_server_data_update(&env, tgt, 1); + lu_env_fini(&env); +} + +/** + * commit callback, need to update last_committed value + */ +struct tgt_last_committed_callback { + struct dt_txn_commit_cb llcc_cb; + struct lu_target *llcc_tgt; + struct obd_export *llcc_exp; + __u64 llcc_transno; +}; + +static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_last_committed_callback *ccb; + + ccb = container_of0(cb, struct tgt_last_committed_callback, llcc_cb); + + LASSERT(ccb->llcc_exp); + LASSERT(ccb->llcc_tgt != NULL); + LASSERT(ccb->llcc_exp->exp_obd == ccb->llcc_tgt->lut_obd); + + /* error hit, don't update last committed to provide chance to + * replay data after fail */ + if (err != 0) + 
goto out; + + /* Fast path w/o spinlock, if exp_last_committed was updated + * with higher transno, no need to take spinlock and check, + * also no need to update obd_last_committed. */ + if (ccb->llcc_transno <= ccb->llcc_exp->exp_last_committed) + goto out; + spin_lock(&ccb->llcc_tgt->lut_translock); + if (ccb->llcc_transno > ccb->llcc_tgt->lut_obd->obd_last_committed) + ccb->llcc_tgt->lut_obd->obd_last_committed = ccb->llcc_transno; + + if (ccb->llcc_transno > ccb->llcc_exp->exp_last_committed) { + ccb->llcc_exp->exp_last_committed = ccb->llcc_transno; + spin_unlock(&ccb->llcc_tgt->lut_translock); + + ptlrpc_commit_replies(ccb->llcc_exp); + tgt_cancel_slc_locks(ccb->llcc_tgt, ccb->llcc_transno); + } else { + spin_unlock(&ccb->llcc_tgt->lut_translock); + } + + CDEBUG(D_HA, "%s: transno %lld is committed\n", + ccb->llcc_tgt->lut_obd->obd_name, ccb->llcc_transno); + +out: + class_export_cb_put(ccb->llcc_exp); + OBD_FREE_PTR(ccb); +} + +/** + * Add commit callback function, it returns a non-zero value to inform + * caller to use sync transaction if necessary. + */ +static int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, + struct obd_export *exp, __u64 transno) +{ + struct tgt_last_committed_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->llcc_tgt = tgt; + ccb->llcc_exp = class_export_cb_get(exp); + ccb->llcc_transno = transno; + + dcb = &ccb->llcc_cb; + dcb->dcb_func = tgt_cb_last_committed; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_last_committed", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + /* report failure to force synchronous operation */ + return -EPERM; + + /* if exp_need_sync is set, return non-zero value to force + * a sync transaction. */ + return rc ? rc : exp->exp_need_sync; +} + +/** + * Add new client to the last_rcvd upon new connection. + * + * We use a bitmap to locate a free space in the last_rcvd file and initialize + * tg_export_data. 
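+ * The chosen bit index maps to a fixed slot in the last_rcvd file:
+ *   ted_lr_off = lsd_client_start + idx * lsd_client_size
+ * As an illustration, assuming the usual LR_CLIENT_START of 8192 and
+ * LR_CLIENT_SIZE of 128 bytes, slot 3 would start at offset
+ * 8192 + 3 * 128 = 8576. Clients that negotiated multiple modify RPCs
+ * also get a slot generation and force the MULTI_RPCS incompatibility
+ * flag into the on-disk server data.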
+ */ +int tgt_client_new(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc = 0, idx; + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid)) + RETURN(0); + + if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so + * there's no need for extra complication here + */ + idx = find_first_zero_bit(tgt->lut_client_bitmap, LR_MAX_CLIENTS); +repeat: + if (idx >= LR_MAX_CLIENTS || + OBD_FAIL_CHECK(OBD_FAIL_MDS_CLIENT_ADD)) { + CERROR("%s: no room for %u clients - fix LR_MAX_CLIENTS\n", + tgt->lut_obd->obd_name, idx); + RETURN(-EOVERFLOW); + } + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + idx = find_next_zero_bit(tgt->lut_client_bitmap, + LR_MAX_CLIENTS, idx); + goto repeat; + } + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + if (tgt_is_multimodrpcs_client(exp)) { + /* Set MULTI RPCS incompatibility flag to prevent previous + * Lustre versions to mount a target with reply_data file */ + atomic_inc(&tgt->lut_num_clients); + if (!(tgt->lut_lsd.lsd_feature_incompat & + OBD_INCOMPAT_MULTI_RPCS)) { + tgt->lut_lsd.lsd_feature_incompat |= + OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, tgt, 1); + if (rc < 0) { + CERROR("%s: unable to set MULTI RPCS " + "incompatibility flag\n", + exp->exp_obd->obd_name); + RETURN(rc); + } + } + + /* assign client slot generation */ + ted->ted_lcd->lcd_generation = + atomic_inc_return(&tgt->lut_client_generation); + } else { + ted->ted_lcd->lcd_generation = 0; + } + + CDEBUG(D_INFO, "%s: new client at index %d (%llu) with UUID '%s' " + "generation %d\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid, ted->ted_lcd->lcd_generation); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_ADD)) + RETURN(-ENOSPC); + + rc = tgt_client_data_update(env, exp); + if (rc) + CERROR("%s: Failed to write client lcd at idx %d, rc %d\n", + tgt->lut_obd->obd_name, idx, rc); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_client_new); + +/* Add an existing client to the MDS in-memory state based on + * a client that was previously found in the last_rcvd file and + * already has an assigned slot (idx >= 0). + * + * It should not be possible to fail adding an existing client - otherwise + * mdt_init_server_data() callsite needs to be fixed. 
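+ * Unlike tgt_client_new(), no new slot is allocated and nothing is
+ * written back to last_rcvd here: the bit for the given idx is simply
+ * claimed in lut_client_bitmap (it must not be set already) and
+ * ted_lr_idx/ted_lr_off are recomputed from the same
+ * lsd_client_start + idx * lsd_client_size formula.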
+ */ +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int idx) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + + ENTRY; + + LASSERT(tgt && tgt->lut_client_bitmap != NULL); + LASSERTF(idx >= 0, "%d\n", idx); + + if (!strcmp(ted->ted_lcd->lcd_uuid, tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + if (test_and_set_bit(idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %d: bit already set in bitmap!!\n", + tgt->lut_obd->obd_name, idx); + LBUG(); + } + atomic_inc(&tgt->lut_num_clients); + + CDEBUG(D_INFO, "%s: client at idx %d with UUID '%s' added, " + "generation %d\n", + tgt->lut_obd->obd_name, idx, ted->ted_lcd->lcd_uuid, + ted->ted_lcd->lcd_generation); + + ted->ted_lr_idx = idx; + ted->ted_lr_off = tgt->lut_lsd.lsd_client_start + + idx * tgt->lut_lsd.lsd_client_size; + + mutex_init(&ted->ted_lcd_lock); + + LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); + + RETURN(0); +} + +int tgt_client_del(const struct lu_env *env, struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *tgt = class_exp2tgt(exp); + int rc; + + ENTRY; + + LASSERT(ted->ted_lcd); + + if (unlikely(tgt == NULL)) { + CDEBUG(D_ERROR, "%s: No target for connected export\n", + class_exp2obd(exp)->obd_name); + RETURN(-EINVAL); + } + + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp((char *)ted->ted_lcd->lcd_uuid, + (char *)tgt->lut_obd->obd_uuid.uuid) || + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + RETURN(0); + + /* Slot may be not yet assigned, use case is race between Client + * reconnect and forced eviction */ + if (ted->ted_lr_idx < 0) { + CWARN("%s: client with UUID '%s' not in bitmap\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid); + RETURN(0); + } + + CDEBUG(D_INFO, "%s: del client at idx %u, off %lld, UUID '%s'\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx, ted->ted_lr_off, + ted->ted_lcd->lcd_uuid); + + /* Clear the bit _after_ zeroing out the client so we don't + race with filter_client_add and zero out new clients.*/ + if (!test_bit(ted->ted_lr_idx, tgt->lut_client_bitmap)) { + CERROR("%s: client %u: bit already clear in bitmap!!\n", + tgt->lut_obd->obd_name, ted->ted_lr_idx); + LBUG(); + } + + /* Do not erase record for recoverable client. */ + if (exp->exp_flags & OBD_OPT_FAILOVER) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_DEL)) + RETURN(0); + + /* Make sure the server's last_transno is up to date. + * This should be done before zeroing client slot so last_transno will + * be in server data or in client data in case of failure */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc != 0) { + CERROR("%s: failed to update server data, skip client %s " + "zeroing, rc %d\n", tgt->lut_obd->obd_name, + ted->ted_lcd->lcd_uuid, rc); + RETURN(rc); + } + + memset(ted->ted_lcd->lcd_uuid, 0, sizeof ted->ted_lcd->lcd_uuid); + rc = tgt_client_data_update(env, exp); + + CDEBUG(rc == 0 ? 
D_INFO : D_ERROR, + "%s: zeroing out client %s at idx %u (%llu), rc %d\n", + tgt->lut_obd->obd_name, ted->ted_lcd->lcd_uuid, + ted->ted_lr_idx, ted->ted_lr_off, rc); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_client_del); + +int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, + struct tg_export_data *ted, struct tg_reply_data *trd, + struct thandle *th, bool update_lrd_file) +{ + struct lsd_reply_data *lrd; + int i; + + lrd = &trd->trd_reply; + /* update export last transno */ + mutex_lock(&ted->ted_lcd_lock); + if (lrd->lrd_transno > ted->ted_lcd->lcd_last_transno) + ted->ted_lcd->lcd_last_transno = lrd->lrd_transno; + mutex_unlock(&ted->ted_lcd_lock); + + /* find a empty slot */ + i = tgt_find_free_reply_slot(tgt); + if (unlikely(i < 0)) { + CERROR("%s: couldn't find a slot for reply data: " + "rc = %d\n", tgt_name(tgt), i); + RETURN(i); + } + trd->trd_index = i; + + if (update_lrd_file) { + loff_t off; + int rc; + + /* write reply data to disk */ + off = sizeof(struct lsd_reply_header) + sizeof(*lrd) * i; + rc = tgt_reply_data_write(env, tgt, lrd, off, th); + if (unlikely(rc != 0)) { + CERROR("%s: can't update %s file: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + RETURN(rc); + } + } + /* add reply data to target export's reply list */ + mutex_lock(&ted->ted_lcd_lock); + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + mutex_unlock(&ted->ted_lcd_lock); + + CDEBUG(D_TRACE, "add reply %p: xid %llu, transno %llu, " + "tag %hu, client gen %u, slot idx %d\n", + trd, lrd->lrd_xid, lrd->lrd_transno, + trd->trd_tag, lrd->lrd_client_gen, i); + RETURN(0); +} +EXPORT_SYMBOL(tgt_add_reply_data); + +/* + * last_rcvd & last_committed update callbacks + */ +static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, + struct dt_object *obj, __u64 opdata, + struct thandle *th, struct ptlrpc_request *req) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct obd_export *exp = tsi->tsi_exp; + struct tg_export_data *ted; + __u64 *transno_p; + int rc = 0; + bool lw_client; + + ENTRY; + + + LASSERT(exp != NULL); + ted = &exp->exp_target_data; + + lw_client = exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT; + if (ted->ted_lr_idx < 0 && !lw_client) + /* ofd connect may cause transaction before export has + * last_rcvd slot */ + RETURN(0); + + if (req != NULL) + tti->tti_transno = lustre_msg_get_transno(req->rq_reqmsg); + else + /* From update replay, tti_transno should be set already */ + LASSERT(tti->tti_transno != 0); + + spin_lock(&tgt->lut_translock); + if (th->th_result != 0) { + if (tti->tti_transno != 0) { + CERROR("%s: replay transno %llu failed: rc = %d\n", + tgt_name(tgt), tti->tti_transno, th->th_result); + } + } else if (tti->tti_transno == 0) { + tti->tti_transno = ++tgt->lut_last_transno; + } else { + /* should be replay */ + if (tti->tti_transno > tgt->lut_last_transno) + tgt->lut_last_transno = tti->tti_transno; + } + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) { + struct dt_object *dto = dt_object_locate(obj, th->th_dev); + dt_version_set(env, dto, tti->tti_transno, th); + } + + /* filling reply data */ + CDEBUG(D_INODE, "transno = %llu, last_committed = %llu\n", + tti->tti_transno, tgt->lut_obd->obd_last_committed); + + if (req != NULL) { + req->rq_transno = tti->tti_transno; + lustre_msg_set_transno(req->rq_repmsg, 
tti->tti_transno); + } + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, tti->tti_transno); + + if (lw_client) { + /* All operations performed by LW clients are synchronous and + * we store the committed transno in the last_rcvd header */ + spin_lock(&tgt->lut_translock); + if (tti->tti_transno > tgt->lut_lsd.lsd_last_transno) { + tgt->lut_lsd.lsd_last_transno = tti->tti_transno; + spin_unlock(&tgt->lut_translock); + /* Although lightweight (LW) connections have no slot + * in the last_rcvd, we still want to maintain + * the in-memory lsd_client_data structure in order to + * properly handle reply reconstruction. */ + rc = tgt_server_data_write(env, tgt, th); + } else { + spin_unlock(&tgt->lut_translock); + } + } else if (ted->ted_lr_off == 0) { + CERROR("%s: client idx %d has offset %lld\n", + tgt_name(tgt), ted->ted_lr_idx, ted->ted_lr_off); + RETURN(-EINVAL); + } + + /* Target that supports multiple reply data */ + if (tgt_is_multimodrpcs_client(exp)) { + struct tg_reply_data *trd; + struct lsd_reply_data *lrd; + __u64 *pre_versions; + bool write_update; + + OBD_ALLOC_PTR(trd); + if (unlikely(trd == NULL)) + RETURN(-ENOMEM); + + /* fill reply data information */ + lrd = &trd->trd_reply; + lrd->lrd_transno = tti->tti_transno; + if (req != NULL) { + lrd->lrd_xid = req->rq_xid; + trd->trd_tag = lustre_msg_get_tag(req->rq_reqmsg); + pre_versions = lustre_msg_get_versions(req->rq_repmsg); + lrd->lrd_result = th->th_result; + lrd->lrd_client_gen = ted->ted_lcd->lcd_generation; + write_update = true; + } else { + LASSERT(tsi->tsi_xid != 0); + lrd->lrd_xid = tsi->tsi_xid; + lrd->lrd_result = tsi->tsi_result; + lrd->lrd_client_gen = tsi->tsi_client_gen; + trd->trd_tag = 0; + pre_versions = NULL; + write_update = false; + } + + lrd->lrd_data = opdata; + if (pre_versions) { + trd->trd_pre_versions[0] = pre_versions[0]; + trd->trd_pre_versions[1] = pre_versions[1]; + trd->trd_pre_versions[2] = pre_versions[2]; + trd->trd_pre_versions[3] = pre_versions[3]; + } + + rc = tgt_add_reply_data(env, tgt, ted, trd, th, write_update); + if (rc < 0) + OBD_FREE_PTR(trd); + return rc; + } + + /* Enough for update replay, let's return */ + if (req == NULL) + RETURN(rc); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { + transno_p = &ted->ted_lcd->lcd_last_close_transno; + ted->ted_lcd->lcd_last_close_xid = req->rq_xid; + ted->ted_lcd->lcd_last_close_result = th->th_result; + } else { + /* VBR: save versions in last_rcvd for reconstruct. */ + __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg); + + if (pre_versions) { + ted->ted_lcd->lcd_pre_versions[0] = pre_versions[0]; + ted->ted_lcd->lcd_pre_versions[1] = pre_versions[1]; + ted->ted_lcd->lcd_pre_versions[2] = pre_versions[2]; + ted->ted_lcd->lcd_pre_versions[3] = pre_versions[3]; + } + transno_p = &ted->ted_lcd->lcd_last_transno; + ted->ted_lcd->lcd_last_xid = req->rq_xid; + ted->ted_lcd->lcd_last_result = th->th_result; + /* XXX: lcd_last_data is __u32 but intent_dispostion is __u64, + * see struct ldlm_reply->lock_policy_res1; */ + ted->ted_lcd->lcd_last_data = opdata; + } + + /* Update transno in slot only if non-zero number, i.e. no errors */ + if (likely(tti->tti_transno != 0)) { + /* Don't overwrite bigger transaction number with lower one. + * That is not sign of problem in all cases, but in any case + * this value should be monotonically increased only. 
*/ + if (*transno_p > tti->tti_transno) { + if (!tgt->lut_no_reconstruct) { + CERROR("%s: trying to overwrite bigger transno:" + "on-disk: %llu, new: %llu replay: " + "%d. See LU-617.\n", tgt_name(tgt), + *transno_p, tti->tti_transno, + req_is_replay(req)); + if (req_is_replay(req)) { + spin_lock(&req->rq_export->exp_lock); + req->rq_export->exp_vbr_failed = 1; + spin_unlock(&req->rq_export->exp_lock); + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(req_is_replay(req) ? -EOVERFLOW : 0); + } + } else { + *transno_p = tti->tti_transno; + } + } + + if (!lw_client) { + tti->tti_off = ted->ted_lr_off; + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_RCVD_EIO)) + rc = -EIO; + else + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, + &tti->tti_off, th); + if (rc < 0) { + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); + } + } + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +/* + * last_rcvd update for echo client simulation. + * It updates last_rcvd client slot and version of object in + * simple way but with all locks to simulate all drawbacks + */ +static int tgt_last_rcvd_update_echo(const struct lu_env *env, + struct lu_target *tgt, + struct dt_object *obj, + struct thandle *th, + struct obd_export *exp) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct tg_export_data *ted = &exp->exp_target_data; + int rc = 0; + + ENTRY; + + tti->tti_transno = 0; + + spin_lock(&tgt->lut_translock); + if (th->th_result == 0) + tti->tti_transno = ++tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + /** VBR: set new versions */ + if (th->th_result == 0 && obj != NULL) + dt_version_set(env, obj, tti->tti_transno, th); + + /* if can't add callback, do sync write */ + th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, + tti->tti_transno); + + LASSERT(ted->ted_lr_off > 0); + + mutex_lock(&ted->ted_lcd_lock); + LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0)); + ted->ted_lcd->lcd_last_transno = tti->tti_transno; + ted->ted_lcd->lcd_last_result = th->th_result; + + tti->tti_off = ted->ted_lr_off; + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + mutex_unlock(&ted->ted_lcd_lock); + RETURN(rc); +} + +static int tgt_clients_data_init(const struct lu_env *env, + struct lu_target *tgt, + unsigned long last_size) +{ + struct obd_device *obd = tgt->lut_obd; + struct lr_server_data *lsd = &tgt->lut_lsd; + struct lsd_client_data *lcd = NULL; + struct tg_export_data *ted; + int cl_idx; + int rc = 0; + loff_t off = lsd->lsd_client_start; + __u32 generation = 0; + struct cfs_hash *hash = NULL; + + ENTRY; + + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + CLASSERT(offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE); + + OBD_ALLOC_PTR(lcd); + if (lcd == NULL) + RETURN(-ENOMEM); + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(err_out, rc = -ENODEV); + + for (cl_idx = 0; off < last_size; cl_idx++) { + struct obd_export *exp; + __u64 last_transno; + + /* Don't assume off is incremented properly by + * read_record(), in case sizeof(*lcd) + * isn't the same as fsd->lsd_client_size. 
*/ + off = lsd->lsd_client_start + cl_idx * lsd->lsd_client_size; + rc = tgt_client_data_read(env, tgt, lcd, &off, cl_idx); + if (rc) { + CERROR("%s: error reading last_rcvd %s idx %d off " + "%llu: rc = %d\n", tgt_name(tgt), LAST_RCVD, + cl_idx, off, rc); + rc = 0; + break; /* read error shouldn't cause startup to fail */ + } + + if (lcd->lcd_uuid[0] == '\0') { + CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", + cl_idx); + continue; + } + + last_transno = lcd_last_transno(lcd); + + /* These exports are cleaned up by disconnect, so they + * need to be set up like real exports as connect does. + */ + CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: %llu" + " srv lr: %llu lx: %llu gen %u\n", lcd->lcd_uuid, + cl_idx, last_transno, lsd->lsd_last_transno, + lcd_last_xid(lcd), lcd->lcd_generation); + + exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid); + if (IS_ERR(exp)) { + if (PTR_ERR(exp) == -EALREADY) { + /* export already exists, zero out this one */ + CERROR("%s: Duplicate export %s!\n", + tgt_name(tgt), lcd->lcd_uuid); + continue; + } + GOTO(err_out, rc = PTR_ERR(exp)); + } + + ted = &exp->exp_target_data; + *ted->ted_lcd = *lcd; + + rc = tgt_client_add(env, exp, cl_idx); + LASSERTF(rc == 0, "rc = %d\n", rc); /* can't fail existing */ + /* VBR: set export last committed version */ + exp->exp_last_committed = last_transno; + spin_lock(&exp->exp_lock); + exp->exp_connecting = 0; + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + obd->obd_max_recoverable_clients++; + + if (tgt->lut_lsd.lsd_feature_incompat & + OBD_INCOMPAT_MULTI_RPCS && + lcd->lcd_generation != 0) { + /* compute the highest valid client generation */ + generation = max(generation, lcd->lcd_generation); + /* fill client_generation <-> export hash table */ + rc = cfs_hash_add_unique(hash, &lcd->lcd_generation, + &exp->exp_gen_hash); + if (rc != 0) { + CERROR("%s: duplicate export for client " + "generation %u\n", + tgt_name(tgt), lcd->lcd_generation); + class_export_put(exp); + GOTO(err_out, rc); + } + } + + class_export_put(exp); + + rc = rev_import_init(exp); + if (rc != 0) { + class_unlink_export(exp); + GOTO(err_out, rc); + } + + /* Need to check last_rcvd even for duplicated exports. 
*/ + CDEBUG(D_OTHER, "client at idx %d has last_transno = %llu\n", + cl_idx, last_transno); + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(last_transno, + tgt->lut_last_transno); + spin_unlock(&tgt->lut_translock); + } + + /* record highest valid client generation */ + atomic_set(&tgt->lut_client_generation, generation); + +err_out: + if (hash != NULL) + cfs_hash_putref(hash); + OBD_FREE_PTR(lcd); + RETURN(rc); +} + +struct server_compat_data { + __u32 rocompat; + __u32 incompat; + __u32 rocinit; + __u32 incinit; +}; + +static struct server_compat_data tgt_scd[] = { + [LDD_F_SV_TYPE_MDT] = { + .rocompat = OBD_ROCOMPAT_LOVOBJID, + .incompat = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID | OBD_INCOMPAT_IAM_DIR | + OBD_INCOMPAT_LMM_VER | OBD_INCOMPAT_MULTI_OI | + OBD_INCOMPAT_MULTI_RPCS, + .rocinit = OBD_ROCOMPAT_LOVOBJID, + .incinit = OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_MULTI_OI, + }, + [LDD_F_SV_TYPE_OST] = { + .rocompat = OBD_ROCOMPAT_IDX_IN_IDIF, + .incompat = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR | + OBD_INCOMPAT_FID, + .rocinit = OBD_ROCOMPAT_IDX_IN_IDIF, + .incinit = OBD_INCOMPAT_OST | OBD_INCOMPAT_COMMON_LR, + } +}; + +int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lr_server_data *lsd = &tgt->lut_lsd; + unsigned long last_rcvd_size; + __u32 index; + int rc, type; + + rc = dt_attr_get(env, tgt->lut_last_rcvd, &tti->tti_attr); + if (rc) + RETURN(rc); + + last_rcvd_size = (unsigned long)tti->tti_attr.la_size; + + /* ensure padding in the struct is the correct size */ + CLASSERT(offsetof(struct lr_server_data, lsd_padding) + + sizeof(lsd->lsd_padding) == LR_SERVER_SIZE); + + rc = server_name2index(tgt_name(tgt), &index, NULL); + if (rc < 0) { + CERROR("%s: Can not get index from name: rc = %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + /* server_name2index() returns type */ + type = rc; + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + CERROR("%s: unknown target type %x\n", tgt_name(tgt), type); + RETURN(-EINVAL); + } + + /* last_rcvd on OST doesn't provide reconstruct support because there + * may be up to 8 in-flight write requests per single slot in + * last_rcvd client data + */ + tgt->lut_no_reconstruct = (type == LDD_F_SV_TYPE_OST); + + if (last_rcvd_size == 0) { + LCONSOLE_WARN("%s: new disk, initializing\n", tgt_name(tgt)); + + memcpy(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = 0; + lsd->lsd_mount_count = 0; + lsd->lsd_server_size = LR_SERVER_SIZE; + lsd->lsd_client_start = LR_CLIENT_START; + lsd->lsd_client_size = LR_CLIENT_SIZE; + lsd->lsd_subdir_count = OBJ_SUBDIR_COUNT; + lsd->lsd_osd_index = index; + lsd->lsd_feature_rocompat = tgt_scd[type].rocinit; + lsd->lsd_feature_incompat = tgt_scd[type].incinit; + } else { + rc = tgt_server_data_read(env, tgt); + if (rc) { + CERROR("%s: error reading LAST_RCVD: rc= %d\n", + tgt_name(tgt), rc); + RETURN(rc); + } + if (strcmp(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid)) { + if (tgt->lut_bottom->dd_rdonly) { + /* Such difference may be caused by mounting + * up snapshot with new fsname under rd_only + * mode. But even if it was NOT, it will not + * damage the system because of "rd_only". */ + memcpy(lsd->lsd_uuid, + tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + } else { + LCONSOLE_ERROR_MSG(0x157, "Trying to start " + "OBD %s using the wrong " + "disk %s. 
Were the /dev/ " + "assignments rearranged?\n", + tgt->lut_obd->obd_uuid.uuid, + lsd->lsd_uuid); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_osd_index != index) { + LCONSOLE_ERROR_MSG(0x157, "%s: index %d in last rcvd " + "is different with the index %d in" + "config log, It might be disk" + "corruption!\n", tgt_name(tgt), + lsd->lsd_osd_index, index); + RETURN(-EINVAL); + } + } + + if (lsd->lsd_feature_incompat & ~tgt_scd[type].incompat) { + CERROR("%s: unsupported incompat filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_incompat & ~tgt_scd[type].incompat); + RETURN(-EINVAL); + } + + if (type == LDD_F_SV_TYPE_MDT) + lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID; + + if (lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat) { + CERROR("%s: unsupported read-only filesystem feature(s) %x\n", + tgt_name(tgt), + lsd->lsd_feature_rocompat & ~tgt_scd[type].rocompat); + RETURN(-EINVAL); + } + /** Interop: evict all clients at first boot with 1.8 last_rcvd */ + if (type == LDD_F_SV_TYPE_MDT && + !(lsd->lsd_feature_compat & OBD_COMPAT_20)) { + if (last_rcvd_size > lsd->lsd_client_start) { + LCONSOLE_WARN("%s: mounting at first time on 1.8 FS, " + "remove all clients for interop needs\n", + tgt_name(tgt)); + rc = tgt_truncate_last_rcvd(env, tgt, + lsd->lsd_client_start); + if (rc) + RETURN(rc); + last_rcvd_size = lsd->lsd_client_start; + } + /** set 2.0 flag to upgrade/downgrade between 1.8 and 2.0 */ + lsd->lsd_feature_compat |= OBD_COMPAT_20; + } + + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = lsd->lsd_last_transno; + spin_unlock(&tgt->lut_translock); + + lsd->lsd_mount_count++; + + CDEBUG(D_INODE, "=======,=BEGIN DUMPING LAST_RCVD========\n"); + CDEBUG(D_INODE, "%s: server last_transno: %llu\n", + tgt_name(tgt), tgt->lut_last_transno); + CDEBUG(D_INODE, "%s: server mount_count: %llu\n", + tgt_name(tgt), lsd->lsd_mount_count); + CDEBUG(D_INODE, "%s: server data size: %u\n", + tgt_name(tgt), lsd->lsd_server_size); + CDEBUG(D_INODE, "%s: per-client data start: %u\n", + tgt_name(tgt), lsd->lsd_client_start); + CDEBUG(D_INODE, "%s: per-client data size: %u\n", + tgt_name(tgt), lsd->lsd_client_size); + CDEBUG(D_INODE, "%s: last_rcvd size: %lu\n", + tgt_name(tgt), last_rcvd_size); + CDEBUG(D_INODE, "%s: server subdir_count: %u\n", + tgt_name(tgt), lsd->lsd_subdir_count); + CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", tgt_name(tgt), + last_rcvd_size <= lsd->lsd_client_start ? 
0 : + (last_rcvd_size - lsd->lsd_client_start) / + lsd->lsd_client_size); + CDEBUG(D_INODE, "========END DUMPING LAST_RCVD========\n"); + + if (lsd->lsd_server_size == 0 || lsd->lsd_client_start == 0 || + lsd->lsd_client_size == 0) { + CERROR("%s: bad last_rcvd contents!\n", tgt_name(tgt)); + RETURN(-EINVAL); + } + + if (!tgt->lut_obd->obd_replayable) + CWARN("%s: recovery support OFF\n", tgt_name(tgt)); + + rc = tgt_clients_data_init(env, tgt, last_rcvd_size); + if (rc < 0) + GOTO(err_client, rc); + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + tgt->lut_obd->u.obt.obt_mount_count = lsd->lsd_mount_count; + tgt->lut_obd->u.obt.obt_instance = (__u32)lsd->lsd_mount_count; + + /* save it, so mount count and last_transno is current */ + rc = tgt_server_data_update(env, tgt, 0); + if (rc < 0) + GOTO(err_client, rc); + + RETURN(0); + +err_client: + class_disconnect_exports(tgt->lut_obd); + return rc; +} + +/* add credits for last_rcvd update */ +int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *dto; + int rc; + + /* For readonly case, the caller should have got failure + * when start the transaction. If the logic comes here, + * there must be something wrong. */ + if (unlikely(tgt->lut_bottom->dd_rdonly)) { + dump_stack(); + LBUG(); + } + + /* if there is no session, then this transaction is not result of + * request processing but some local operation */ + if (env->le_ses == NULL) + return 0; + + LASSERT(tgt->lut_last_rcvd); + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + if (tgt_is_multimodrpcs_client(tsi->tsi_exp)) { + /* + * Use maximum possible file offset for declaration to ensure + * ZFS will reserve enough credits for a write anywhere in this + * file, since we don't know where in the file the write will be + * because a replay slot has not been assigned. This should be + * replaced by dmu_tx_hold_append() when available. 
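+ * Concretely, the declaration below uses a NULL buffer of
+ * sizeof(struct lsd_reply_data) bytes at offset -1; the real slot
+ * offset is computed later in tgt_add_reply_data() as
+ * sizeof(struct lsd_reply_header) + idx * sizeof(struct lsd_reply_data)
+ * once a free reply slot has been picked.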
+ */ + tti->tti_buf.lb_buf = NULL; + tti->tti_buf.lb_len = sizeof(struct lsd_reply_data); + dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); + rc = dt_declare_record_write(env, dto, &tti->tti_buf, -1, th); + if (rc) + return rc; + } else { + dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev); + tti_buf_lcd(tti); + tti->tti_off = tsi->tsi_exp->exp_target_data.ted_lr_off; + rc = dt_declare_record_write(env, dto, &tti->tti_buf, + tti->tti_off, th); + if (rc) + return rc; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + dto = dt_object_locate(tsi->tsi_vbr_obj, th->th_dev); + rc = dt_declare_version_set(env, dto, th); + } + + return rc; +} + +/* Update last_rcvd records with latests transaction data */ +int tgt_txn_stop_cb(const struct lu_env *env, struct thandle *th, + void *cookie) +{ + struct lu_target *tgt = cookie; + struct tgt_session_info *tsi; + struct tgt_thread_info *tti = tgt_th_info(env); + struct dt_object *obj = NULL; + int rc; + bool echo_client; + + if (env->le_ses == NULL) + return 0; + + tsi = tgt_ses_info(env); + /* OFD may start transaction without export assigned */ + if (tsi->tsi_exp == NULL) + return 0; + + echo_client = (tgt_ses_req(tsi) == NULL && tsi->tsi_xid == 0); + + if (tti->tti_has_trans && !echo_client) { + if (tti->tti_mult_trans == 0) { + CDEBUG(D_HA, "More than one transaction %llu\n", + tti->tti_transno); + RETURN(0); + } + /* we need another transno to be assigned */ + tti->tti_transno = 0; + } else if (th->th_result == 0) { + tti->tti_has_trans = 1; + } + + if (tsi->tsi_vbr_obj != NULL && + !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) { + obj = tsi->tsi_vbr_obj; + } + + if (unlikely(echo_client)) /* echo client special case */ + rc = tgt_last_rcvd_update_echo(env, tgt, obj, th, + tsi->tsi_exp); + else + rc = tgt_last_rcvd_update(env, tgt, obj, tsi->tsi_opdata, th, + tgt_ses_req(tsi)); + return rc; +} + +int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lsd_reply_data *lrd = &tti->tti_lrd; + unsigned long reply_data_size; + int rc; + struct lsd_reply_header *lrh = NULL; + struct lsd_client_data *lcd = NULL; + struct tg_reply_data *trd = NULL; + int idx; + loff_t off; + struct cfs_hash *hash = NULL; + struct obd_export *exp; + struct tg_export_data *ted; + int reply_data_recovered = 0; + + rc = dt_attr_get(env, tgt->lut_reply_data, &tti->tti_attr); + if (rc) + GOTO(out, rc); + reply_data_size = (unsigned long)tti->tti_attr.la_size; + + OBD_ALLOC_PTR(lrh); + if (lrh == NULL) + GOTO(out, rc = -ENOMEM); + + if (reply_data_size == 0) { + CDEBUG(D_INFO, "%s: new reply_data file, initializing\n", + tgt_name(tgt)); + lrh->lrh_magic = LRH_MAGIC; + lrh->lrh_header_size = sizeof(struct lsd_reply_header); + lrh->lrh_reply_size = sizeof(struct lsd_reply_data); + rc = tgt_reply_header_write(env, tgt, lrh); + if (rc) { + CERROR("%s: error writing %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + } else { + rc = tgt_reply_header_read(env, tgt, lrh); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + if (lrh->lrh_magic != LRH_MAGIC || + lrh->lrh_header_size != sizeof(struct lsd_reply_header) || + lrh->lrh_reply_size != sizeof(struct lsd_reply_data)) { + CERROR("%s: invalid header in %s\n", + tgt_name(tgt), REPLY_DATA); + GOTO(out, rc = -EINVAL); + } + + hash = cfs_hash_getref(tgt->lut_obd->obd_gen_hash); + if (hash == NULL) + GOTO(out, rc = -ENODEV); + + 
OBD_ALLOC_PTR(lcd); + if (lcd == NULL) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC_PTR(trd); + if (trd == NULL) + GOTO(out, rc = -ENOMEM); + + /* Load reply_data from disk */ + for (idx = 0, off = sizeof(struct lsd_reply_header); + off < reply_data_size; + idx++, off += sizeof(struct lsd_reply_data)) { + rc = tgt_reply_data_read(env, tgt, lrd, off); + if (rc) { + CERROR("%s: error reading %s: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(out, rc); + } + + exp = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (exp == NULL) { + /* old reply data from a disconnected client */ + continue; + } + ted = &exp->exp_target_data; + mutex_lock(&ted->ted_lcd_lock); + + /* create in-memory reply_data and link it to + * target export's reply list */ + rc = tgt_set_reply_slot(tgt, idx); + if (rc != 0) { + mutex_unlock(&ted->ted_lcd_lock); + GOTO(out, rc); + } + trd->trd_reply = *lrd; + trd->trd_pre_versions[0] = 0; + trd->trd_pre_versions[1] = 0; + trd->trd_pre_versions[2] = 0; + trd->trd_pre_versions[3] = 0; + trd->trd_index = idx; + trd->trd_tag = 0; + list_add(&trd->trd_list, &ted->ted_reply_list); + ted->ted_reply_cnt++; + if (ted->ted_reply_cnt > ted->ted_reply_max) + ted->ted_reply_max = ted->ted_reply_cnt; + + CDEBUG(D_HA, "%s: restore reply %p: xid %llu, " + "transno %llu, client gen %u, slot idx %d\n", + tgt_name(tgt), trd, lrd->lrd_xid, + lrd->lrd_transno, lrd->lrd_client_gen, + trd->trd_index); + + /* update export last committed transation */ + exp->exp_last_committed = max(exp->exp_last_committed, + lrd->lrd_transno); + + mutex_unlock(&ted->ted_lcd_lock); + class_export_put(exp); + + /* update target last committed transaction */ + spin_lock(&tgt->lut_translock); + tgt->lut_last_transno = max(tgt->lut_last_transno, + lrd->lrd_transno); + spin_unlock(&tgt->lut_translock); + + reply_data_recovered++; + + OBD_ALLOC_PTR(trd); + if (trd == NULL) + GOTO(out, rc = -ENOMEM); + } + CDEBUG(D_INFO, "%s: %d reply data have been recovered\n", + tgt_name(tgt), reply_data_recovered); + } + + spin_lock(&tgt->lut_translock); + /* obd_last_committed is used for compatibility + * with other lustre recovery code */ + tgt->lut_obd->obd_last_committed = tgt->lut_last_transno; + spin_unlock(&tgt->lut_translock); + + rc = 0; + +out: + if (hash != NULL) + cfs_hash_putref(hash); + if (lcd != NULL) + OBD_FREE_PTR(lcd); + if (trd != NULL) + OBD_FREE_PTR(trd); + if (lrh != NULL) + OBD_FREE_PTR(lrh); + return rc; +} + +struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, + __u64 xid) +{ + struct tg_reply_data *found = NULL; + struct tg_reply_data *reply; + + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry(reply, &ted->ted_reply_list, trd_list) { + if (reply->trd_reply.lrd_xid == xid) { + found = reply; + break; + } + } + mutex_unlock(&ted->ted_lcd_lock); + return found; +} +EXPORT_SYMBOL(tgt_lookup_reply_by_xid); + +/* Look for a reply data matching specified request @req + * A copy is returned in @trd if the pointer is not NULL + */ +bool tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd) +{ + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct tg_reply_data *reply; + bool found = false; + + reply = tgt_lookup_reply_by_xid(ted, req->rq_xid); + if (reply != NULL) { + found = true; + if (trd != NULL) + *trd = *reply; + } + + CDEBUG(D_TRACE, "%s: lookup reply xid %llu, found %d\n", + tgt_name(class_exp2tgt(req->rq_export)), req->rq_xid, + found ? 
1 : 0); + + return found; +} +EXPORT_SYMBOL(tgt_lookup_reply); + +int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_reply.lrd_xid > rcvd_xid) + continue; + ted->ted_release_xid++; + tgt_release_reply_data(lut, ted, trd); + } + mutex_unlock(&ted->ted_lcd_lock); + + return 0; +} + +int tgt_handle_tag(struct obd_export *exp, __u16 tag) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + if (tag == 0) + return 0; + + mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_tag != tag) + continue; + ted->ted_release_tag++; + tgt_release_reply_data(lut, ted, trd); + break; + } + mutex_unlock(&ted->ted_lcd_lock); + + return 0; +} + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c new file mode 100644 index 0000000000000..4d3923723b2f1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -0,0 +1,431 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2016, Intel Corporation. + */ +/* + * lustre/target/tgt_main.c + * + * Lustre Unified Target main initialization code + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include "tgt_internal.h" +#include "../ptlrpc/ptlrpc_internal.h" + +/* + * Save cross-MDT lock in lut_slc_locks. + * + * Lock R/W count is not saved, but released in unlock (not canceled remotely), + * instead only a refcount is taken, so that the remote MDT where the object + * resides can detect conflict with this lock there. + * + * \param lut target + * \param lock cross-MDT lock to save + * \param transno when the transaction with this transno is committed, this lock + * can be canceled. + */ +void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock, + __u64 transno) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + if (ldlm_is_cbpending(lock)) { + /* if it was canceld by server, don't save, because remote MDT + * will do Sync-on-Cancel. */ + LDLM_LOCK_PUT(lock); + } else { + lock->l_transno = transno; + /* if this lock is in the list already, there are two operations + * both use this lock, and save it after use, so for the second + * one, just put the refcount. 
*/ + if (list_empty(&lock->l_slc_link)) + list_add_tail(&lock->l_slc_link, &lut->lut_slc_locks); + else + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_save_slc_lock); + +/* + * Discard cross-MDT lock from lut_slc_locks. + * + * This is called upon BAST, just remove lock from lut_slc_locks and put lock + * refcount. The BAST will cancel this lock. + * + * \param lut target + * \param lock cross-MDT lock to discard + */ +void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock) +{ + spin_lock(&lut->lut_slc_locks_guard); + lock_res_and_lock(lock); + /* may race with tgt_cancel_slc_locks() */ + if (lock->l_transno != 0) { + LASSERT(!list_empty(&lock->l_slc_link)); + LASSERT(ldlm_is_cbpending(lock)); + list_del_init(&lock->l_slc_link); + lock->l_transno = 0; + LDLM_LOCK_PUT(lock); + } + unlock_res_and_lock(lock); + spin_unlock(&lut->lut_slc_locks_guard); +} +EXPORT_SYMBOL(tgt_discard_slc_lock); + +/* + * Cancel cross-MDT locks upon transaction commit. + * + * Remove cross-MDT locks from lut_slc_locks, cancel them and put lock refcount. + * + * \param lut target + * \param transno transaction with this number was committed. + */ +void tgt_cancel_slc_locks(struct lu_target *lut, __u64 transno) +{ + struct ldlm_lock *lock, *next; + LIST_HEAD(list); + struct lustre_handle lockh; + + spin_lock(&lut->lut_slc_locks_guard); + list_for_each_entry_safe(lock, next, &lut->lut_slc_locks, + l_slc_link) { + lock_res_and_lock(lock); + LASSERT(lock->l_transno != 0); + if (lock->l_transno > transno) { + unlock_res_and_lock(lock); + continue; + } + /* ouch, another operation is using it after it's saved */ + if (lock->l_readers != 0 || lock->l_writers != 0) { + unlock_res_and_lock(lock); + continue; + } + /* set CBPENDING so that this lock won't be used again */ + ldlm_set_cbpending(lock); + lock->l_transno = 0; + list_move(&lock->l_slc_link, &list); + unlock_res_and_lock(lock); + } + spin_unlock(&lut->lut_slc_locks_guard); + + list_for_each_entry_safe(lock, next, &list, l_slc_link) { + list_del_init(&lock->l_slc_link); + ldlm_lock2handle(lock, &lockh); + ldlm_cli_cancel(&lockh, LCF_ASYNC); + LDLM_LOCK_PUT(lock); + } +} + +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt, + struct tgt_opc_slice *slice, int request_fail_id, + int reply_fail_id) +{ + struct dt_object_format dof; + struct lu_attr attr; + struct lu_fid fid; + struct dt_object *o; + int i, rc = 0; + + ENTRY; + + LASSERT(lut); + LASSERT(obd); + lut->lut_obd = obd; + lut->lut_bottom = dt; + lut->lut_last_rcvd = NULL; + lut->lut_client_bitmap = NULL; + atomic_set(&lut->lut_num_clients, 0); + atomic_set(&lut->lut_client_generation, 0); + lut->lut_reply_data = NULL; + lut->lut_reply_bitmap = NULL; + obd->u.obt.obt_lut = lut; + obd->u.obt.obt_magic = OBT_MAGIC; + + /* set request handler slice and parameters */ + lut->lut_slice = slice; + lut->lut_reply_fail_id = reply_fail_id; + lut->lut_request_fail_id = request_fail_id; + + /* sptlrcp variables init */ + rwlock_init(&lut->lut_sptlrpc_lock); + sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); + + spin_lock_init(&lut->lut_flags_lock); + lut->lut_sync_lock_cancel = NEVER_SYNC_ON_CANCEL; + + spin_lock_init(&lut->lut_slc_locks_guard); + INIT_LIST_HEAD(&lut->lut_slc_locks); + + /* last_rcvd initialization is needed by replayable targets only */ + if (!obd->obd_replayable) + RETURN(0); + + spin_lock_init(&lut->lut_translock); + 
spin_lock_init(&lut->lut_client_bitmap_lock); + + OBD_ALLOC(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + if (lut->lut_client_bitmap == NULL) + RETURN(-ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, LAST_RECV_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open LAST_RCVD: rc = %d\n", tgt_name(lut), + rc); + GOTO(out_put, rc); + } + + lut->lut_last_rcvd = o; + rc = tgt_server_data_init(env, lut); + if (rc < 0) + GOTO(out_put, rc); + + /* prepare transactions callbacks */ + lut->lut_txn_cb.dtc_txn_start = tgt_txn_start_cb; + lut->lut_txn_cb.dtc_txn_stop = tgt_txn_stop_cb; + lut->lut_txn_cb.dtc_txn_commit = NULL; + lut->lut_txn_cb.dtc_cookie = lut; + lut->lut_txn_cb.dtc_tag = LCT_DT_THREAD | LCT_MD_THREAD; + INIT_LIST_HEAD(&lut->lut_txn_cb.dtc_linkage); + + dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); + lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; + + /* reply_data is supported by MDT targets only for now */ + if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) + RETURN(0); + + OBD_ALLOC(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + if (lut->lut_reply_bitmap == NULL) + GOTO(out, rc = -ENOMEM); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof.dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(&fid, REPLY_DATA_OID); + + o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr); + if (IS_ERR(o)) { + rc = PTR_ERR(o); + CERROR("%s: cannot open REPLY_DATA: rc = %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + lut->lut_reply_data = o; + + rc = tgt_reply_data_init(env, lut); + if (rc < 0) + GOTO(out, rc); + + atomic_set(&lut->lut_sync_count, 0); + + RETURN(0); + +out: + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); +out_put: + obd->u.obt.obt_magic = 0; + obd->u.obt.obt_lut = NULL; + if (lut->lut_last_rcvd != NULL) { + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + if (lut->lut_client_bitmap != NULL) + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + if (lut->lut_reply_data != NULL) + dt_object_put(env, lut->lut_reply_data); + lut->lut_reply_data = NULL; + if (lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + return rc; +} +EXPORT_SYMBOL(tgt_init); + +void tgt_fini(const struct lu_env *env, struct lu_target *lut) +{ + int i; + int rc; + ENTRY; + + if (lut->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && + atomic_read(&lut->lut_num_clients) == 0) { + /* Clear MULTI RPCS incompatibility flag that prevents previous + * Lustre versions to mount a target with reply_data file */ + lut->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; + rc = tgt_server_data_update(env, lut, 1); + if (rc < 0) + CERROR("%s: unable to clear MULTI RPCS " + "incompatibility flag\n", + lut->lut_obd->obd_name); + } + + sptlrpc_rule_set_free(&lut->lut_sptlrpc_rset); + + if (lut->lut_reply_data != NULL) + dt_object_put(env, 
lut->lut_reply_data); + lut->lut_reply_data = NULL; + if (lut->lut_reply_bitmap != NULL) { + for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) { + if (lut->lut_reply_bitmap[i] != NULL) + OBD_FREE_LARGE(lut->lut_reply_bitmap[i], + BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * + sizeof(long)); + lut->lut_reply_bitmap[i] = NULL; + } + OBD_FREE(lut->lut_reply_bitmap, + LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *)); + } + lut->lut_reply_bitmap = NULL; + if (lut->lut_client_bitmap) { + OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3); + lut->lut_client_bitmap = NULL; + } + if (lut->lut_last_rcvd) { + dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb); + dt_object_put(env, lut->lut_last_rcvd); + lut->lut_last_rcvd = NULL; + } + EXIT; +} +EXPORT_SYMBOL(tgt_fini); + +/* context key constructor/destructor: tg_key_init, tg_key_fini */ +LU_KEY_INIT(tgt, struct tgt_thread_info); + +static void tgt_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *info = data; + struct thandle_exec_args *args = &info->tti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * + args->ta_alloc_args); + OBD_FREE_PTR(info); +} + +static void tgt_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_thread_info *tti = data; + + tti->tti_has_trans = 0; + tti->tti_mult_trans = 0; +} + +/* context key: tg_thread_key */ +struct lu_context_key tgt_thread_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD, + .lct_init = tgt_key_init, + .lct_fini = tgt_key_fini, + .lct_exit = tgt_key_exit, +}; + +LU_KEY_INIT_GENERIC(tgt); + +/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */ +LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info); + +/* context key: tgt_session_key */ +struct lu_context_key tgt_session_key = { + .lct_tags = LCT_SERVER_SESSION, + .lct_init = tgt_ses_key_init, + .lct_fini = tgt_ses_key_fini, +}; +EXPORT_SYMBOL(tgt_session_key); + +LU_KEY_INIT_GENERIC(tgt_ses); + +/* + * this page is allocated statically when module is initializing + * it is used to simulate data corruptions, see ost_checksum_bulk() + * for details. as the original pages provided by the layers below + * can be remain in the internal cache, we do not want to modify + * them. + */ +struct page *tgt_page_to_corrupt; + +int tgt_mod_init(void) +{ + ENTRY; + + tgt_page_to_corrupt = alloc_page(GFP_KERNEL); + + tgt_key_init_generic(&tgt_thread_key, NULL); + lu_context_key_register_many(&tgt_thread_key, NULL); + + tgt_ses_key_init_generic(&tgt_session_key, NULL); + lu_context_key_register_many(&tgt_session_key, NULL); + barrier_init(); + + update_info_init(); + + RETURN(0); +} + +void tgt_mod_exit(void) +{ + barrier_fini(); + if (tgt_page_to_corrupt != NULL) + put_page(tgt_page_to_corrupt); + + lu_context_key_degister(&tgt_thread_key); + lu_context_key_degister(&tgt_session_key); + update_info_fini(); +} + diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c new file mode 100644 index 0000000000000..a36d554525507 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -0,0 +1,1233 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + */ + +/* + * lustre/target/update_records.c + * + * This file implement the methods to pack updates as update records, which + * will be written to the disk as llog record, and might be used during + * recovery. + * + * For cross-MDT operation, all of updates of the operation needs to be + * recorded in the disk, then during recovery phase, the recovery thread + * will retrieve and redo these updates if it needed. + * + * See comments above struct update_records for the format of update_records. + * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +#define UPDATE_RECORDS_BUFFER_SIZE 8192 +#define UPDATE_PARAMS_BUFFER_SIZE 8192 +/** + * Dump update record. + * + * Dump all of updates in the update_records, mostly for debugging purpose. + * + * \param[in] records update records to be dumpped + * \param[in] mask debug level mask + * \param[in] dump_params if dump all of updates the updates. + * + */ +void update_records_dump(const struct update_records *records, + unsigned int mask, bool dump_updates) +{ + const struct update_ops *ops; + const struct update_op *op = NULL; + struct update_params *params = NULL; + unsigned int i; + + CDEBUG(mask, "master transno = %llu batchid = %llu flags = %x" + " ops = %d params = %d\n", records->ur_master_transno, + records->ur_batchid, records->ur_flags, records->ur_update_count, + records->ur_param_count); + + if (records->ur_update_count == 0) + return; + + if (!dump_updates) + return; + + ops = &records->ur_ops; + if (records->ur_param_count > 0) + params = update_records_get_params(records); + + op = &ops->uops_op[0]; + for (i = 0; i < records->ur_update_count; i++, + op = update_op_next_op(op)) { + unsigned int j; + + CDEBUG(mask, "update %dth "DFID" %s params_count = %hu\n", i, + PFID(&op->uop_fid), update_op_str(op->uop_type), + op->uop_param_count); + + if (params == NULL) + continue; + + for (j = 0; j < op->uop_param_count; j++) { + struct object_update_param *param; + + param = update_params_get_param(params, + (unsigned int)op->uop_params_off[j], + records->ur_param_count); + + if (param == NULL) + continue; + CDEBUG(mask, "param = %p %dth off = %hu size = %hu\n", + param, j, op->uop_params_off[j], param->oup_len); + } + } +} + +/** + * Pack parameters to update records + * + * Find and insert parameter to update records, if the parameter + * already exists in \a params, then just return the offset of this + * parameter, otherwise insert the parameter and return its offset + * + * \param[in] params update params in which to insert parameter + * \param[in] new_param parameters to be inserted. 
+ * \param[in] new_param_size the size of \a new_param + * + * \retval index inside \a params if parameter insertion + * succeeds. + * \retval negative errno if it fails. + */ +static unsigned int update_records_param_pack(struct update_params *params, + const void *new_param, + size_t new_param_size, + unsigned int *param_count) +{ + struct object_update_param *param; + unsigned int i; + + for (i = 0; i < *param_count; i++) { + struct object_update_param *param; + + param = update_params_get_param(params, i, *param_count); + if ((new_param == NULL && param->oup_len == new_param_size) || + (param->oup_len == new_param_size && + memcmp(param->oup_buf, new_param, new_param_size) == 0)) + /* Found the parameter and return its index */ + return i; + } + + param = (struct object_update_param *)((char *)params + + update_params_size(params, *param_count)); + + param->oup_len = new_param_size; + if (new_param != NULL) + memcpy(param->oup_buf, new_param, new_param_size); + + *param_count = *param_count + 1; + + return *param_count - 1; +} + +/** + * Pack update to update records + * + * Pack the update and its parameters to the update records. First it will + * insert parameters, get the offset of these parameter, then fill the + * update with these offset. If insertion exceed the maximum size of + * current update records, it will return -E2BIG here, and the caller might + * extend the update_record size \see lod_updates_pack. + * + * \param[in] env execution environment + * \param[in] fid FID of the update. + * \param[in] op_type operation type of the update + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] param_bufs buffers of parameters + * \param[in] params_buf_count the count of the parameter buffers + * \param[in] param_size sizes of parameters + * + * \retval 0 if packing succeeds + * \retval negative errno if packing fails + */ +static int update_records_update_pack(const struct lu_env *env, + const struct lu_fid *fid, + enum update_type op_type, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_op_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + unsigned int param_bufs_count, + const void **param_bufs, + size_t *param_sizes) +{ + struct update_op *op; + size_t total_param_sizes = 0; + int index; + unsigned int i; + + /* Check whether the packing exceeding the maximum update size */ + if (unlikely(*max_op_size < update_op_size(param_bufs_count))) { + CDEBUG(D_INFO, "max_op_size = %zu update_op = %zu\n", + *max_op_size, update_op_size(param_bufs_count)); + *max_op_size = update_op_size(param_bufs_count); + return -E2BIG; + } + + for (i = 0; i < param_bufs_count; i++) + total_param_sizes += + cfs_size_round(sizeof(struct object_update_param) + + param_sizes[i]); + + /* Check whether the packing exceeding the maximum parameter size */ + if (unlikely(*max_param_size < total_param_sizes)) { + CDEBUG(D_INFO, "max_param_size = %zu params size = %zu\n", + *max_param_size, total_param_sizes); + + *max_param_size = total_param_sizes; + return -E2BIG; + } + + op = update_ops_get_op(ops, *op_count, *op_count); + op->uop_fid = *fid; + op->uop_type = op_type; + op->uop_param_count = param_bufs_count; + for (i = 0; i < param_bufs_count; i++) { + 
index = update_records_param_pack(params, param_bufs[i], + param_sizes[i], param_count); + if (index < 0) + return index; + + CDEBUG(D_INFO, "%s %uth param offset = %d size = %zu\n", + update_op_str(op_type), i, index, param_sizes[i]); + + op->uop_params_off[i] = index; + } + CDEBUG(D_INFO, "%huth "DFID" %s param_count = %u\n", + *op_count, PFID(fid), update_op_str(op_type), *param_count); + + *op_count = *op_count + 1; + + return 0; +} + +/** + * Calculate update_records size + * + * Calculate update_records size by param_count and param_sizes array. + * + * \param[in] param_count the count of parameters + * \param[in] sizes the size array of these parameters + * + * \retval the size of this update + */ +static size_t update_records_update_size(__u32 param_count, size_t *sizes) +{ + int i; + size_t size; + + /* Check whether the packing exceeding the maximum update size */ + size = update_op_size(param_count); + + for (i = 0; i < param_count; i++) + size += cfs_size_round(sizeof(struct object_update_param) + + sizes[i]); + + return size; +} + +/** + * Calculate create update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval size of create update. + */ +size_t update_records_create_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + int param_count = 0; + + if (attr != NULL) { + sizes[param_count] = sizeof(struct obdo); + param_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + sizes[param_count] = sizeof(*fid); + param_count++; + } + + return update_records_update_size(param_count, sizes); +} +EXPORT_SYMBOL(update_records_create_size); + +/** + * Pack create update + * + * Pack create update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to be created + * \param[in] attr attribute of the object to be created + * \param[in] hint creation hint + * \param[in] dof creation format information + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_create_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr, + const struct dt_allocation_hint *hint, + struct dt_object_format *dof) +{ + size_t sizes[2]; + const void *bufs[2]; + int buf_count = 0; + const struct lu_fid *parent_fid = NULL; + struct lu_fid tmp_fid; + int rc; + struct obdo *obdo; + + if (attr != NULL) { + obdo = &update_env_info(env)->uti_obdo; + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + bufs[buf_count] = obdo; + sizes[buf_count] = sizeof(*obdo); + buf_count++; + } + + if (hint != NULL && hint->dah_parent != NULL) { + parent_fid = lu_object_fid(&hint->dah_parent->do_lu); + fid_cpu_to_le(&tmp_fid, parent_fid); + bufs[buf_count] = &tmp_fid; + sizes[buf_count] = sizeof(tmp_fid); + buf_count++; + } + + rc = update_records_update_pack(env, fid, OUT_CREATE, ops, op_count, + max_ops_size, params, param_count, + max_param_size, buf_count, bufs, sizes); + return rc; +} +EXPORT_SYMBOL(update_records_create_pack); + +/** + * Calculate attr set update size + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval size of attr set update. + */ +size_t update_records_attr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + size_t size = sizeof(struct obdo); + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_attr_set_size); + +/** + * Pack attr set update + * + * Pack attr_set update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set attr + * \param[in] attr attribute of attr set + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_attr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_attr *attr) +{ + struct obdo *obdo = &update_env_info(env)->uti_obdo; + size_t size = sizeof(*obdo); + + obdo->o_valid = 0; + obdo_from_la(obdo, attr, attr->la_valid); + return update_records_update_pack(env, fid, OUT_ATTR_SET, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 1, + (const void **)&obdo, &size); +} +EXPORT_SYMBOL(update_records_attr_set_pack); + +/** + * Calculate ref add update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to add reference + * + * \retval size of ref_add udpate. + */ +size_t update_records_ref_add_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_size); + +/** + * Pack ref add update + * + * Pack ref add update into update records. 
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_ref_add_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_ADD, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_add_pack); + +/** + * Pack noop update + * + * Pack no op update into update records. Note: no op means + * the update does not need do anything, which is only used + * in test case to verify large size record. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to add reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_noop_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_NOOP, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_noop_pack); + +/** + * Calculate ref del update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of ref_del update. + */ +size_t update_records_ref_del_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_size); + +/** + * Pack ref del update + * + * Pack ref del update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_ref_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_REF_DEL, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_ref_del_pack); + +/** + * Calculate object destroy update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete reference + * + * \retval size of object destroy update. + */ +size_t update_records_destroy_size(const struct lu_env *env, + const struct lu_fid *fid) +{ + return update_records_update_size(0, NULL); +} +EXPORT_SYMBOL(update_records_destroy_size); + +/** + * Pack object destroy update + * + * Pack object destroy update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete reference + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_destroy_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid) +{ + return update_records_update_pack(env, fid, OUT_DESTROY, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 0, NULL, NULL); +} +EXPORT_SYMBOL(update_records_destroy_pack); + +/** + * Calculate index insert update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval the size of index insert update. + */ +size_t update_records_index_insert_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(struct lu_fid), + sizeof(__u32) }; + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_size); + +/** + * Pack index insert update + * + * Pack index insert update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to insert index + * \param[in] rec record of insertion + * \param[in] key key of insertion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
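+ *
+ * Three parameters are packed for an index insert, in this order: the
+ * NUL-terminated name used as the key, the target FID converted to
+ * little-endian, and the 32-bit record type (also little-endian).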
+ */ +int update_records_index_insert_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_rec *rec, + const struct dt_key *key) +{ + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + struct lu_fid rec_fid; + __u32 type = cpu_to_le32(rec1->rec_type); + size_t sizes[3] = { strlen((const char *)key) + 1, + sizeof(rec_fid), + sizeof(type) }; + const void *bufs[3] = { key, + &rec_fid, + &type }; + + fid_cpu_to_le(&rec_fid, rec1->rec_fid); + + return update_records_update_pack(env, fid, OUT_INDEX_INSERT, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_index_insert_pack); + +/** + * Calculate index delete update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval the size of index delete update + */ +size_t update_records_index_delete_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_index_delete_size); + +/** + * Pack index delete update + * + * Pack index delete update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|ount] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete index + * \param[in] key key of deletion + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_index_delete_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct dt_key *key) +{ + size_t size = strlen((const char *)key) + 1; + + return update_records_update_pack(env, fid, OUT_INDEX_DELETE, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 1, (const void **)&key, &size); +} +EXPORT_SYMBOL(update_records_index_delete_pack); + +/** + * Calculate xattr set size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to set xattr + * \param[in] buf xattr to be set + * \param[in] name name of the xattr + * \param[in] flag flag for setting xattr + * + * \retval size of xattr set update. + */ +size_t update_records_xattr_set_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + const char *name, __u32 flag) +{ + size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + + return update_records_update_size(3, sizes); +} +EXPORT_SYMBOL(update_records_xattr_set_size); + +/** + * Pack xattr set update + * + * Pack xattr set update into update records. 
+ * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to set xattr + * \param[in] buf xattr to be set + * \param[in] name name of the xattr + * \param[in] flag flag for setting xattr + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_xattr_set_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, const char *name, + __u32 flag) +{ + size_t sizes[3] = {strlen(name) + 1, buf->lb_len, sizeof(flag)}; + const void *bufs[3] = {name, buf->lb_buf, &flag}; + + flag = cpu_to_le32(flag); + + return update_records_update_pack(env, fid, OUT_XATTR_SET, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 3, bufs, sizes); +} +EXPORT_SYMBOL(update_records_xattr_set_pack); + +/** + * Calculate xattr delete update size. + * + * \param[in] env execution environment + * \param[in] fid FID of the object to delete xattr + * \param[in] name name of the xattr + * + * \retval size of xattr delet updatee. + */ +size_t update_records_xattr_del_size(const struct lu_env *env, + const struct lu_fid *fid, + const char *name) +{ + size_t size = strlen(name) + 1; + + return update_records_update_size(1, &size); +} +EXPORT_SYMBOL(update_records_xattr_del_size); + +/** + * Pack xattr delete update + * + * Pack xattr delete update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to delete xattr + * \param[in] name name of the xattr + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_xattr_del_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const char *name) +{ + size_t size = strlen(name) + 1; + + return update_records_update_pack(env, fid, OUT_XATTR_DEL, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 1, (const void **)&name, &size); +} +EXPORT_SYMBOL(update_records_xattr_del_pack); + +/** + * Calculate write update size + * + * \param[in] env execution environment + * \param[in] fid FID of the object to write into + * \param[in] buf buffer to write which includes an embedded size field + * \param[in] pos offet in the object to start writing at + * + * \retval size of write udpate. 
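+ *
+ * The size accounts for two parameters: the write payload itself
+ * (\a buf->lb_len bytes) and the 64-bit file offset, matching the
+ * buffers packed by update_records_write_pack() below.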
+ */ +size_t update_records_write_size(const struct lu_env *env, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos) +{ + size_t sizes[2] = {buf->lb_len, sizeof(pos)}; + + return update_records_update_size(2, sizes); +} +EXPORT_SYMBOL(update_records_write_size); + +/** + * Pack write update + * + * Pack write update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to write into + * \param[in] buf buffer to write which includes an embedded size field + * \param[in] pos offet in the object to start writing at + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. + */ +int update_records_write_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + const struct lu_buf *buf, + __u64 pos) +{ + size_t sizes[2] = {buf->lb_len, sizeof(pos)}; + const void *bufs[2] = {buf->lb_buf, &pos}; + + pos = cpu_to_le64(pos); + + return update_records_update_pack(env, fid, OUT_WRITE, ops, + op_count, max_ops_size, params, + param_count, max_param_size, + 2, bufs, sizes); +} +EXPORT_SYMBOL(update_records_write_pack); + +/** + * Calculate size of punch update. + * + * \param[in] env execution environment + * \param[in] fid FID of the object to write into + * \param[in] start start offset of punch + * \param[in] end end offet of punch + * + * \retval size of update punch. + */ +size_t update_records_punch_size(const struct lu_env *env, + const struct lu_fid *fid, + __u64 start, __u64 end) +{ + size_t sizes[2] = {sizeof(start), sizeof(end)}; + + return update_records_update_size(2, sizes); +} +EXPORT_SYMBOL(update_records_punch_size); + +/** + * Pack punch + * + * Pack punch update into update records. + * + * \param[in] env execution environment + * \param[in] ops ur_ops in update records + * \param[in|out] op_count pointer to the count of ops + * \param[in|out] max_op_size maximum size of the update + * \param[in] params ur_params in update records + * \param[in|out] param_count pointer to the count of params + * \param[in|out] max_param_size maximum size of the parameter + * \param[in] fid FID of the object to write into + * \param[in] start start offset of punch + * \param[in] end end offet of punch + * + * \retval 0 if packing succeeds. + * \retval negative errno if packing fails. 
+ */ +int update_records_punch_pack(const struct lu_env *env, + struct update_ops *ops, + unsigned int *op_count, + size_t *max_ops_size, + struct update_params *params, + unsigned int *param_count, + size_t *max_param_size, + const struct lu_fid *fid, + __u64 start, __u64 end) +{ + size_t sizes[2] = {sizeof(start), sizeof(end)}; + const void *bufs[2] = {&start, &end}; + + start = cpu_to_le64(start); + end = cpu_to_le64(end); + + return update_records_update_pack(env, fid, OUT_PUNCH, ops, op_count, + max_ops_size, params, param_count, + max_param_size, 2, bufs, sizes); +} +EXPORT_SYMBOL(update_records_punch_pack); + +/** + * Create update records in thandle_update_records + * + * Allocate update_records for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_records will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_records_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_records != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_records, + UPDATE_RECORDS_BUFFER_SIZE); + + if (tur->tur_update_records == NULL) + return -ENOMEM; + + tur->tur_update_records_buf_size = UPDATE_RECORDS_BUFFER_SIZE; + + return 0; +} + +/** + * Extend update records + * + * Extend update_records to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_records will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_records_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct llog_update_record *record; + + OBD_ALLOC_LARGE(record, new_size); + if (record == NULL) + return -ENOMEM; + + if (tur->tur_update_records != NULL) { + memcpy(record, tur->tur_update_records, + tur->tur_update_records_buf_size); + OBD_FREE_LARGE(tur->tur_update_records, + tur->tur_update_records_buf_size); + } + + tur->tur_update_records = record; + tur->tur_update_records_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_records_extend); + +/** + * Extend update records + * + * Extend update records in thandle to make sure it is able to hold + * the update with certain update_op and params size. + * + * \param [in] tur thandle_update_records to be extend + * \param [in] new_op_size update_op size of the update record + * \param [in] new_param_size params size of the update record + * + * \retval 0 if the update_records is being extended. + * \retval negative errno if the update_records is not being + * extended. 
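+ *
+ * Illustrative caller sketch only (the in-tree caller lives in the LOD
+ * layer, see lod_updates_pack); it assumes rc, op_count, param_count,
+ * max_op_size and max_param_size were initialised from the current
+ * record, grows the buffers to the sizes reported back on -E2BIG, and
+ * re-derives ops and params from the (possibly reallocated) buffers
+ * before every retry:
+ *
+ * \code
+ *	do {
+ *		struct update_ops *ops;
+ *		struct update_params *params;
+ *
+ *		ops = &tur->tur_update_records->lur_update_rec.ur_ops;
+ *		params = tur->tur_update_params;
+ *		rc = update_records_ref_add_pack(env, ops, &op_count,
+ *						 &max_op_size, params,
+ *						 &param_count,
+ *						 &max_param_size, fid);
+ *		if (rc != -E2BIG)
+ *			break;
+ *		rc = tur_update_extend(tur, max_op_size, max_param_size);
+ *	} while (rc == 0);
+ * \endcode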
+ */ +int tur_update_extend(struct thandle_update_records *tur, + size_t new_op_size, size_t new_param_size) +{ + size_t record_size; + size_t params_size; + size_t extend_size; + int rc; + ENTRY; + + record_size = llog_update_record_size(tur->tur_update_records); + /* extend update records buffer */ + if (new_op_size >= (tur->tur_update_records_buf_size - record_size)) { + extend_size = round_up(new_op_size, UPDATE_RECORDS_BUFFER_SIZE); + rc = tur_update_records_extend(tur, + tur->tur_update_records_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + /* extend parameters buffer */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + if (new_param_size >= (tur->tur_update_params_buf_size - + params_size)) { + extend_size = round_up(new_param_size, + UPDATE_PARAMS_BUFFER_SIZE); + rc = tur_update_params_extend(tur, + tur->tur_update_params_buf_size + + extend_size); + if (rc != 0) + RETURN(rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(tur_update_extend); + +/** + * Create update params in thandle_update_records + * + * Allocate update_params for thandle_update_records, the initial size + * will be 4KB. + * + * \param[in] tur thandle_update_records where update_params will be + * allocated + * \retval 0 if allocation succeeds. + * \retval negative errno if allocation fails. + */ +static int tur_update_params_create(struct thandle_update_records *tur) +{ + if (tur->tur_update_params != NULL) + return 0; + + OBD_ALLOC_LARGE(tur->tur_update_params, UPDATE_PARAMS_BUFFER_SIZE); + if (tur->tur_update_params == NULL) + return -ENOMEM; + + tur->tur_update_params_buf_size = UPDATE_PARAMS_BUFFER_SIZE; + return 0; +} + +/** + * Extend update params + * + * Extend update_params to the new size in thandle_update_records. + * + * \param[in] tur thandle_update_records where update_params will be + * extended. + * \retval 0 if extension succeeds. + * \retval negative errno if extension fails. + */ +int tur_update_params_extend(struct thandle_update_records *tur, + size_t new_size) +{ + struct update_params *params; + + OBD_ALLOC_LARGE(params, new_size); + if (params == NULL) + return -ENOMEM; + + if (tur->tur_update_params != NULL) { + memcpy(params, tur->tur_update_params, + tur->tur_update_params_buf_size); + OBD_FREE_LARGE(tur->tur_update_params, + tur->tur_update_params_buf_size); + } + + tur->tur_update_params = params; + tur->tur_update_params_buf_size = new_size; + + return 0; +} +EXPORT_SYMBOL(tur_update_params_extend); + +/** + * Check and prepare whether it needs to record update. + * + * Checks if the transaction needs to record updates, and if it + * does, then initialize the update record buffer in the transaction. + * + * \param[in] env execution environment + * \param[in] th transaction handle + * + * \retval 0 if updates recording succeeds. + * \retval negative errno if updates recording fails. 
+ */ +int check_and_prepare_update_record(const struct lu_env *env, + struct thandle_update_records *tur) +{ + struct llog_update_record *lur; + int rc; + + if (tur->tur_update_records == NULL) { + rc = tur_update_records_create(tur); + if (rc < 0) + RETURN(rc); + } + + if (tur->tur_update_params == NULL) { + rc = tur_update_params_create(tur); + if (rc < 0) + RETURN(rc); + } + + lur = tur->tur_update_records; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_master_transno = 0; + lur->lur_update_rec.ur_batchid = 0; + lur->lur_update_rec.ur_flags = 0; + lur->lur_hdr.lrh_len = LLOG_MIN_CHUNK_SIZE; + + tur->tur_update_param_count = 0; + + RETURN(0); +} + +static void update_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct update_thread_info *info = data; + struct thandle_exec_args *args = &info->uti_tea; + int i; + + for (i = 0; i < args->ta_alloc_args; i++) { + if (args->ta_args[i] != NULL) + OBD_FREE_PTR(args->ta_args[i]); + } + + if (args->ta_args != NULL) + OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * + args->ta_alloc_args); + + if (info->uti_tur.tur_update_records != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_records, + info->uti_tur.tur_update_records_buf_size); + if (info->uti_tur.tur_update_params != NULL) + OBD_FREE_LARGE(info->uti_tur.tur_update_params, + info->uti_tur.tur_update_params_buf_size); + + OBD_FREE_PTR(info); +} + +/* context key constructor/destructor: update_key_init, update_key_fini */ +LU_KEY_INIT(update, struct update_thread_info); +/* context key: update_thread_key */ +LU_CONTEXT_KEY_DEFINE(update, LCT_MD_THREAD | LCT_MG_THREAD | + LCT_DT_THREAD | LCT_LOCAL); +EXPORT_SYMBOL(update_thread_key); +LU_KEY_INIT_GENERIC(update); + +void update_info_init(void) +{ + update_key_init_generic(&update_thread_key, NULL); + lu_context_key_register(&update_thread_key); +} + +void update_info_fini(void) +{ + lu_context_key_degister(&update_thread_key); +} diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c new file mode 100644 index 0000000000000..3769d09d19282 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -0,0 +1,1447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2016, Intel Corporation. + */ + +/* + * lustre/target/update_recovery.c + * + * This file implement the methods to handle the update recovery. + * + * During DNE recovery, the recovery thread will redo the operation according + * to the transaction no, and these replay are either from client replay req + * or update replay records(for distribute transaction) in the update log. 
+ * For distribute transaction replay, the replay thread will call + * distribute_txn_replay_handle() to handle the updates. + * + * After the Master MDT restarts, it will retrieve the update records from all + * of MDTs, for each distributed operation, it will check updates on all MDTs, + * if some updates records are missing on some MDTs, the replay thread will redo + * updates on these MDTs. + * + * Author: Di Wang + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include "tgt_internal.h" + +/** + * Lookup distribute_txn_replay req + * + * Lookup distribute_txn_replay in the replay list by batchid. + * It is assumed the list has been locked before calling this function. + * + * \param[in] tdtd distribute_txn_data, which holds the replay + * list. + * \param[in] batchid batchid used by lookup. + * + * \retval pointer of the replay if succeeds. + * \retval NULL if can not find it. + */ +static struct distribute_txn_replay_req * +dtrq_lookup(struct target_distribute_txn_data *tdtd, __u64 batchid) +{ + struct distribute_txn_replay_req *tmp; + struct distribute_txn_replay_req *dtrq = NULL; + + list_for_each_entry(tmp, &tdtd->tdtd_replay_list, dtrq_list) { + if (tmp->dtrq_batchid == batchid) { + dtrq = tmp; + break; + } + } + return dtrq; +} + +/** + * insert distribute txn replay req + * + * Insert distribute txn replay to the replay list, and it assumes the + * list has been looked. Note: the replay list is a sorted list, which + * is sorted by master transno. It is assumed the replay list has been + * locked before calling this function. + * + * \param[in] tdtd target distribute txn data where replay list is + * \param[in] new distribute txn replay to be inserted + * + * \retval 0 if insertion succeeds + * \retval EEXIST if the dtrq already exists + */ +static int dtrq_insert(struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *new) +{ + struct distribute_txn_replay_req *iter; + + /* Check if the dtrq has been added to the list */ + iter = dtrq_lookup(tdtd, new->dtrq_batchid); + if (iter != NULL) + return -EEXIST; + + list_for_each_entry_reverse(iter, &tdtd->tdtd_replay_list, dtrq_list) { + if (iter->dtrq_master_transno > new->dtrq_master_transno) + continue; + + /* If there are mulitple replay req with same transno, then + * sort them with batchid */ + if (iter->dtrq_master_transno == new->dtrq_master_transno && + iter->dtrq_batchid > new->dtrq_batchid) + continue; + + list_add(&new->dtrq_list, &iter->dtrq_list); + break; + } + + if (list_empty(&new->dtrq_list)) + list_add(&new->dtrq_list, &tdtd->tdtd_replay_list); + + return 0; +} + +/** + * create distribute txn replay req + * + * Allocate distribute txn replay req according to the update records. + * + * \param[in] tdtd target distribute txn data where replay list is. + * \param[in] record update records from the update log. + * + * \retval the pointer of distribute txn replay req if + * the creation succeeds. + * \retval NULL if the creation fails. 
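+ *
+ * If the record carries a zero master transno (it was written by the
+ * master MDT itself), the request temporarily uses the master's
+ * obd_last_committed as its master transno; the value is replaced later
+ * when records from slave MDTs arrive, see
+ * insert_update_records_to_replay_list().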
+ */ +static struct distribute_txn_replay_req * +dtrq_create(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur) +{ + struct distribute_txn_replay_req *new; + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + new->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(new->dtrq_lur, new->dtrq_lur_size); + if (new->dtrq_lur == NULL) { + OBD_FREE_PTR(new); + RETURN(ERR_PTR(-ENOMEM)); + } + + memcpy(new->dtrq_lur, lur, new->dtrq_lur_size); + + /* If the transno in the update record is 0, it means the + * update are from master MDT, and it will use the master + * last committed transno as its master transno. Later, if + * the update records are gotten from slave MDTs, then these + * transno will be replaced. + * See insert_update_records_to_replay_list(). */ + if (lur->lur_update_rec.ur_master_transno == 0) { + new->dtrq_lur->lur_update_rec.ur_master_transno = + tdtd->tdtd_lut->lut_obd->obd_last_committed; + new->dtrq_master_transno = + tdtd->tdtd_lut->lut_obd->obd_last_committed; + } else { + new->dtrq_master_transno = + lur->lur_update_rec.ur_master_transno; + } + + new->dtrq_batchid = lur->lur_update_rec.ur_batchid; + + spin_lock_init(&new->dtrq_sub_list_lock); + INIT_LIST_HEAD(&new->dtrq_sub_list); + INIT_LIST_HEAD(&new->dtrq_list); + + RETURN(new); +} + +/** + * Lookup distribute sub replay + * + * Lookup distribute sub replay in the sub list of distribute_txn_replay by + * mdt_index. + * + * \param[in] distribute_txn_replay_req the distribute txn replay req to lookup + * \param[in] mdt_index the mdt_index as the key of lookup + * + * \retval the pointer of sub replay if it can be found. + * \retval NULL if it can not find. + */ +struct distribute_txn_replay_req_sub * +dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *tmp; + + list_for_each_entry(tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + if (tmp->dtrqs_mdt_index == mdt_index) { + dtrqs = tmp; + break; + } + } + return dtrqs; +} + +/** + * Try to add cookie to sub distribute txn request + * + * Check if the update log cookie has been added to the request, if not, + * add it to the dtrqs_cookie_list. + * + * \param[in] dtrqs sub replay req where cookies to be added. + * \param[in] cookie cookie to be added. + * + * \retval 0 if the cookie is adding succeeds. + * \retval negative errno if adding fails. + */ +static int dtrq_sub_add_cookie(struct distribute_txn_replay_req_sub *dtrqs, + struct llog_cookie *cookie) +{ + struct sub_thandle_cookie *new; + + OBD_ALLOC_PTR(new); + if (new == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&new->stc_list); + new->stc_cookie = *cookie; + /* Note: only single thread will access one sub_request each time, + * so no need lock here */ + list_add(&new->stc_list, &dtrqs->dtrqs_cookie_list); + + return 0; +} + +/** + * Insert distribute txn sub req replay + * + * Allocate sub replay req and insert distribute txn replay list. + * + * \param[in] dtrq d to be added + * \param[in] cookie the cookie of the update record + * \param[in] mdt_index the mdt_index of the update record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. 
+ */ +static int +dtrq_sub_create_and_insert(struct distribute_txn_replay_req *dtrq, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req_sub *dtrqs = NULL; + struct distribute_txn_replay_req_sub *new; + int rc; + ENTRY; + + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + spin_unlock(&dtrq->dtrq_sub_list_lock); + if (dtrqs != NULL) { + rc = dtrq_sub_add_cookie(dtrqs, cookie); + RETURN(0); + } + + OBD_ALLOC_PTR(new); + if (new == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&new->dtrqs_list); + INIT_LIST_HEAD(&new->dtrqs_cookie_list); + new->dtrqs_mdt_index = mdt_index; + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs == NULL) { + list_add(&new->dtrqs_list, &dtrq->dtrq_sub_list); + dtrqs = new; + } else { + OBD_FREE_PTR(new); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + rc = dtrq_sub_add_cookie(dtrqs, cookie); + + RETURN(rc); +} + +/** + * append updates to the current replay updates + * + * Append more updates to the existent replay update. And this is only + * used when combining mulitple updates into one large updates during + * replay. + * + * \param[in] dtrq the update replay request where the new update + * records will be added. + * \param[in] lur the new update record. + * + * \retval 0 if appending succeeds. + * \retval negative errno if appending fails. + */ +static int dtrq_append_updates(struct distribute_txn_replay_req *dtrq, + struct update_records *record) +{ + struct llog_update_record *new_lur; + size_t lur_size = dtrq->dtrq_lur_size; + void *ptr; + ENTRY; + + /* Because several threads might retrieve the same records from + * different targets, and we only need one copy of records. So + * we will check if the records is in the next one, if not, just + * skip it */ + spin_lock(&dtrq->dtrq_sub_list_lock); + if (dtrq->dtrq_lur->lur_update_rec.ur_index + 1 != record->ur_index) { + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(0); + } + dtrq->dtrq_lur->lur_update_rec.ur_index++; + spin_unlock(&dtrq->dtrq_sub_list_lock); + + lur_size += update_records_size(record); + OBD_ALLOC_LARGE(new_lur, lur_size); + if (new_lur == NULL) { + spin_lock(&dtrq->dtrq_sub_list_lock); + dtrq->dtrq_lur->lur_update_rec.ur_index--; + spin_unlock(&dtrq->dtrq_sub_list_lock); + RETURN(-ENOMEM); + } + + /* Copy the old and new records to the new allocated buffer */ + memcpy(new_lur, dtrq->dtrq_lur, dtrq->dtrq_lur_size); + ptr = (char *)&new_lur->lur_update_rec + + update_records_size(&new_lur->lur_update_rec); + memcpy(ptr, &record->ur_ops, + update_records_size(record) - + offsetof(struct update_records, ur_ops)); + + new_lur->lur_update_rec.ur_update_count += record->ur_update_count; + new_lur->lur_update_rec.ur_param_count += record->ur_param_count; + new_lur->lur_hdr.lrh_len = llog_update_record_size(new_lur); + + /* Replace the records */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = new_lur; + dtrq->dtrq_lur_size = lur_size; + dtrq->dtrq_lur->lur_update_rec.ur_flags = record->ur_flags; + update_records_dump(&new_lur->lur_update_rec, D_INFO, true); + RETURN(0); +} + +/** + * Insert update records to the replay list. + * + * Allocate distribute txn replay req and insert it into the replay + * list, then insert the update records into the replay req. + * + * \param[in] tdtd distribute txn replay data where the replay list + * is. 
+ * \param[in] record the update record + * \param[in] cookie cookie of the record + * \param[in] index mdt index of the record + * + * \retval 0 if the adding succeeds. + * \retval negative errno if the adding fails. + */ +int +insert_update_records_to_replay_list(struct target_distribute_txn_data *tdtd, + struct llog_update_record *lur, + struct llog_cookie *cookie, + __u32 mdt_index) +{ + struct distribute_txn_replay_req *dtrq; + struct update_records *record = &lur->lur_update_rec; + bool replace_record = false; + int rc = 0; + ENTRY; + + CDEBUG(D_HA, "%s: insert record batchid = %llu transno = %llu" + " mdt_index %u\n", tdtd->tdtd_lut->lut_obd->obd_name, + record->ur_batchid, record->ur_master_transno, mdt_index); + + /* Update batchid if necessary */ + spin_lock(&tdtd->tdtd_batchid_lock); + if (record->ur_batchid >= tdtd->tdtd_batchid) { + CDEBUG(D_HA, "%s update batchid from %llu" " to %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_batchid, record->ur_batchid); + tdtd->tdtd_batchid = record->ur_batchid + 1; + } + spin_unlock(&tdtd->tdtd_batchid_lock); + +again: + spin_lock(&tdtd->tdtd_replay_list_lock); + /* First try to build the replay update request with the records */ + dtrq = dtrq_lookup(tdtd, record->ur_batchid); + if (dtrq == NULL) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq = dtrq_create(tdtd, lur); + if (IS_ERR(dtrq)) + RETURN(PTR_ERR(dtrq)); + + spin_lock(&tdtd->tdtd_replay_list_lock); + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + if (rc == -EEXIST) + goto again; + return rc; + } + } else { + /* If the master transno in update header is not + * matched with the one in the record, then it means + * the dtrq is originally created by master record, + * so we need update master transno and reposition + * the dtrq(by master transno) in the list and also + * replace update record */ + if (record->ur_master_transno != 0 && + dtrq->dtrq_master_transno != record->ur_master_transno && + dtrq->dtrq_lur != NULL) { + list_del_init(&dtrq->dtrq_list); + dtrq->dtrq_lur->lur_update_rec.ur_master_transno = + record->ur_master_transno; + + dtrq->dtrq_master_transno = record->ur_master_transno; + replace_record = true; + /* try to insert again */ + rc = dtrq_insert(tdtd, dtrq); + if (rc < 0) { + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + return rc; + } + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + /* Because there should be only thread access the update record, so + * we do not need lock here */ + if (replace_record) { + /* Replace the update record and master transno */ + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + dtrq->dtrq_lur = NULL; + dtrq->dtrq_lur_size = llog_update_record_size(lur); + OBD_ALLOC_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + if (dtrq->dtrq_lur == NULL) + return -ENOMEM; + + memcpy(dtrq->dtrq_lur, lur, dtrq->dtrq_lur_size); + } + + /* This is a partial update records, let's try to append + * the record to the current replay request */ + if (record->ur_flags & UPDATE_RECORD_CONTINUE) + rc = dtrq_append_updates(dtrq, record); + + /* Then create and add sub update request */ + rc = dtrq_sub_create_and_insert(dtrq, cookie, mdt_index); + + RETURN(rc); +} +EXPORT_SYMBOL(insert_update_records_to_replay_list); + +/** + * Dump updates of distribute txns. + * + * Output all of recovery updates in the distribute txn list to the + * debug log. + * + * \param[in] tdtd distribute txn data where all of distribute txn + * are listed. 
+ * \param[in] mask debug mask + */ +void dtrq_list_dump(struct target_distribute_txn_data *tdtd, unsigned int mask) +{ + struct distribute_txn_replay_req *dtrq; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(dtrq, &tdtd->tdtd_replay_list, dtrq_list) + update_records_dump(&dtrq->dtrq_lur->lur_update_rec, mask, + false); + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_dump); + +/** + * Destroy distribute txn replay req + * + * Destroy distribute txn replay req and all of subs. + * + * \param[in] dtrq distribute txn replqy req to be destroyed. + */ +void dtrq_destroy(struct distribute_txn_replay_req *dtrq) +{ + struct distribute_txn_replay_req_sub *dtrqs; + struct distribute_txn_replay_req_sub *tmp; + + LASSERT(list_empty(&dtrq->dtrq_list)); + spin_lock(&dtrq->dtrq_sub_list_lock); + list_for_each_entry_safe(dtrqs, tmp, &dtrq->dtrq_sub_list, dtrqs_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&dtrqs->dtrqs_list); + list_for_each_entry_safe(stc, tmp, &dtrqs->dtrqs_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(dtrqs); + } + spin_unlock(&dtrq->dtrq_sub_list_lock); + + if (dtrq->dtrq_lur != NULL) + OBD_FREE_LARGE(dtrq->dtrq_lur, dtrq->dtrq_lur_size); + + OBD_FREE_PTR(dtrq); +} +EXPORT_SYMBOL(dtrq_destroy); + +/** + * Destroy all of replay req. + * + * Destroy all of replay req in the replay list. + * + * \param[in] tdtd target distribute txn data where the replay list is. + */ +void dtrq_list_destroy(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq; + struct distribute_txn_replay_req *tmp; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_finish_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); +} +EXPORT_SYMBOL(dtrq_list_destroy); + +/** + * Get next req in the replay list + * + * Get next req needs to be replayed, since it is a sorted list + * (by master MDT transno) + * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the pointer of update recovery header + */ +struct distribute_txn_replay_req * +distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + list_del_init(&dtrq->dtrq_list); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + return dtrq; +} +EXPORT_SYMBOL(distribute_txn_get_next_req); + +/** + * Get next transno in the replay list, because this is the sorted + * list, so it will return the transno of next req in the list. 
+ * + * \param[in] tdtd distribute txn data where the replay list is + * + * \retval the transno of next update in the list + */ +__u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd) +{ + struct distribute_txn_replay_req *dtrq = NULL; + __u64 transno = 0; + + spin_lock(&tdtd->tdtd_replay_list_lock); + if (!list_empty(&tdtd->tdtd_replay_list)) { + dtrq = list_entry(tdtd->tdtd_replay_list.next, + struct distribute_txn_replay_req, dtrq_list); + transno = dtrq->dtrq_master_transno; + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + + CDEBUG(D_HA, "%s: Next update transno %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, transno); + return transno; +} +EXPORT_SYMBOL(distribute_txn_get_next_transno); + +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 xid) +{ + struct distribute_txn_replay_req *dtrq = NULL; + struct distribute_txn_replay_req *iter; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(iter, &tdtd->tdtd_replay_finish_list, dtrq_list) { + if (iter->dtrq_xid == xid) { + dtrq = iter; + break; + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + return dtrq; +} + +bool is_req_replayed_by_update(struct ptlrpc_request *req) +{ + struct lu_target *tgt = class_exp2tgt(req->rq_export); + struct distribute_txn_replay_req *dtrq; + + if (tgt->lut_tdtd == NULL) + return false; + + dtrq = distribute_txn_lookup_finish_list(tgt->lut_tdtd, req->rq_xid); + if (dtrq == NULL) + return false; + + return true; +} +EXPORT_SYMBOL(is_req_replayed_by_update); + +/** + * Check if the update of one object is committed + * + * Check whether the update for the object is committed by checking whether + * the correspondent sub exists in the replay req. If it is committed, mark + * the committed flag in correspondent the sub thandle. + * + * \param[in] env execution environment + * \param[in] dtrq replay request + * \param[in] dt_obj object for the update + * \param[in] top_th top thandle + * \param[in] sub_th sub thandle which the update belongs to + * + * \retval 1 if the update is not committed. + * \retval 0 if the update is committed. + * \retval negative errno if some other failures happen. 
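+ *
+ * The MDT index of the update is derived from the object FID: update-log
+ * FIDs map directly via fid_oid(), FIDs whose sequence is not in the FLDB
+ * use the local node id, and all other FIDs are resolved through an FLD
+ * sequence lookup.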
+ */ +static int update_is_committed(const struct lu_env *env, + struct distribute_txn_replay_req *dtrq, + struct dt_object *dt_obj, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct seq_server_site *seq_site; + const struct lu_fid *fid = lu_object_fid(&dt_obj->do_lu); + struct distribute_txn_replay_req_sub *dtrqs; + __u32 mdt_index; + ENTRY; + + if (st->st_sub_th != NULL) + RETURN(1); + + if (st->st_committed) + RETURN(0); + + seq_site = lu_site2seq(dt_obj->do_lu.lo_dev->ld_site); + if (fid_is_update_log(fid) || fid_is_update_log_dir(fid)) { + mdt_index = fid_oid(fid); + } else if (!fid_seq_in_fldb(fid_seq(fid))) { + mdt_index = seq_site->ss_node_id; + } else { + struct lu_server_fld *fld; + struct lu_seq_range range = {0}; + int rc; + + fld = seq_site->ss_server_fld; + fld_range_set_type(&range, LU_SEQ_RANGE_MDT); + LASSERT(fld->lsf_seq_lookup != NULL); + rc = fld->lsf_seq_lookup(env, fld, fid_seq(fid), + &range); + if (rc < 0) + RETURN(rc); + mdt_index = range.lsr_index; + } + + dtrqs = dtrq_sub_lookup(dtrq, mdt_index); + if (dtrqs != NULL || top_th->tt_multiple_thandle->tmt_committed) { + st->st_committed = 1; + if (dtrqs != NULL) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_for_each_entry_safe(stc, tmp, + &dtrqs->dtrqs_cookie_list, + stc_list) + list_move(&stc->stc_list, &st->st_cookie_list); + } + RETURN(0); + } + + CDEBUG(D_HA, "Update of "DFID "on MDT%u is not committed\n", PFID(fid), + mdt_index); + + RETURN(1); +} + +/** + * Implementation of different update methods for update recovery. + * + * These following functions update_recovery_$(update_name) implement + * different updates recovery methods. They will extract the parameters + * from the common parameters area and call correspondent dt API to redo + * the update. + * + * \param[in] env execution environment + * \param[in] op update operation to be replayed + * \param[in] params common update parameters which holds all parameters + * of the operation + * \param[in] th transaction handle + * \param[in] declare indicate it will do declare or real execution, true + * means declare, false means real execution + * + * \retval 0 if it succeeds. + * \retval negative errno if it fails. 
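+ *
+ * Each handler locates its raw parameters in \a params through
+ * op->uop_params_off[] with update_params_get_param_buf(), validates the
+ * returned size, converts fixed-width values from wire (little-endian)
+ * order where needed, and then calls the matching out_tx_* helper to
+ * queue the redo on the sub thandle.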
+ */ +static int update_recovery_create(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct lu_attr *attr = &uti->uti_attr; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct dt_object_format dof; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + if (dt_object_exists(dt_obj)) + RETURN(-EEXIST); + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + dof.dof_type = dt_mode_to_dft(attr->la_mode); + + rc = out_tx_create(env, dt_obj, attr, NULL, &dof, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_destroy(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_destroy(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_add(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_add(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_ref_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + int rc; + ENTRY; + + rc = out_tx_ref_del(env, dt_obj, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_attr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct llog_update_record *lur = uti->uti_dtrq->dtrq_lur; + struct obdo *wobdo; + struct obdo *lobdo = &uti->uti_obdo; + struct lu_attr *attr = &uti->uti_attr; + __u16 size; + unsigned int param_count; + int rc; + ENTRY; + + param_count = lur->lur_update_rec.ur_param_count; + wobdo = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (wobdo == NULL) + RETURN(-EIO); + if (size != sizeof(*wobdo)) + RETURN(-EIO); + + if (LLOG_REC_HDR_NEEDS_SWABBING(&lur->lur_hdr)) + lustre_swab_obdo(wobdo); + + lustre_get_wire_obdo(NULL, lobdo, wobdo); + la_from_obdo(attr, lobdo, lobdo->o_valid); + + rc = out_tx_attr_set(env, dt_obj, attr, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_set(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + char *name; + int fl; + __u16 size; + __u32 param_count; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, + op->uop_params_off[0], + 
param_count, &size); + if (name == NULL) + RETURN(-EIO); + + buf = update_params_get_param_buf(params, + op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = (size_t)size; + + buf = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + if (size != sizeof(fl)) + RETURN(-EIO); + + fl = le32_to_cpu(*(int *)buf); + + rc = out_tx_xattr_set(env, dt_obj, &uti->uti_buf, name, fl, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_insert(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + struct lu_fid *fid; + char *name; + __u32 param_count; + __u32 *ptype; + __u32 type; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + fid = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (fid == NULL) + RETURN(-EIO); + if (size != sizeof(*fid)) + RETURN(-EIO); + + fid_le_to_cpu(fid, fid); + + ptype = update_params_get_param_buf(params, op->uop_params_off[2], + param_count, &size); + if (ptype == NULL) + RETURN(-EIO); + if (size != sizeof(*ptype)) + RETURN(-EIO); + type = le32_to_cpu(*ptype); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + uti->uti_rec.rec_fid = fid; + uti->uti_rec.rec_type = type; + + rc = out_tx_index_insert(env, dt_obj, + (const struct dt_rec *)&uti->uti_rec, + (const struct dt_key *)name, ta, th, + NULL, 0); + + RETURN(rc); +} + +static int update_recovery_index_delete(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + if (dt_try_as_dir(env, dt_obj) == 0) + RETURN(-ENOTDIR); + + rc = out_tx_index_delete(env, dt_obj, + (const struct dt_key *)name, ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_write(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op *op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + char *buf; + __u32 param_count; + __u64 pos; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + buf = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + uti->uti_buf.lb_buf = buf; + uti->uti_buf.lb_len = size; + + buf = update_params_get_param_buf(params, op->uop_params_off[1], + param_count, &size); + if (buf == NULL) + RETURN(-EIO); + + pos = le64_to_cpu(*(__u64 *)buf); + + rc = out_tx_write(env, dt_obj, &uti->uti_buf, pos, + ta, th, NULL, 0); + + RETURN(rc); +} + +static int update_recovery_xattr_del(const struct lu_env *env, + struct dt_object *dt_obj, + const struct update_op 
*op, + const struct update_params *params, + struct thandle_exec_args *ta, + struct thandle *th) +{ + struct update_thread_info *uti = update_env_info(env); + __u32 param_count; + char *name; + __u16 size; + int rc; + ENTRY; + + param_count = uti->uti_dtrq->dtrq_lur->lur_update_rec.ur_param_count; + name = update_params_get_param_buf(params, op->uop_params_off[0], + param_count, &size); + if (name == NULL) + RETURN(-EIO); + + rc = out_tx_xattr_del(env, dt_obj, name, ta, th, NULL, 0); + + RETURN(rc); +} + +/** + * Update session information + * + * Update session information so tgt_txn_stop_cb()->tgt_last_rcvd_update() + * can be called correctly during update replay. + * + * \param[in] env execution environment. + * \param[in] tdtd distribute data structure of the recovering tgt. + * \param[in] th thandle of this update replay. + * \param[in] master_th master sub thandle. + * \param[in] ta_arg the tx arg structure to hold the update for updating + * reply data. + */ +static void update_recovery_update_ses(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct thandle *th, + struct thandle *master_th, + struct distribute_txn_replay_req *dtrq, + struct tx_arg *ta_arg) +{ + struct tgt_session_info *tsi; + struct lu_target *lut = tdtd->tdtd_lut; + struct obd_export *export; + struct cfs_hash *hash; + struct top_thandle *top_th; + struct lsd_reply_data *lrd; + size_t size; + + tsi = tgt_ses_info(env); + if (tsi->tsi_exp != NULL) + return; + + size = ta_arg->u.write.buf.lb_len; + lrd = ta_arg->u.write.buf.lb_buf; + if (size != sizeof(*lrd) || lrd == NULL) + return; + + lrd->lrd_transno = le64_to_cpu(lrd->lrd_transno); + lrd->lrd_xid = le64_to_cpu(lrd->lrd_xid); + lrd->lrd_data = le64_to_cpu(lrd->lrd_data); + lrd->lrd_result = le32_to_cpu(lrd->lrd_result); + lrd->lrd_client_gen = le32_to_cpu(lrd->lrd_client_gen); + + if (lrd->lrd_transno != tgt_th_info(env)->tti_transno) + return; + + hash = cfs_hash_getref(lut->lut_obd->obd_gen_hash); + if (hash == NULL) + return; + + export = cfs_hash_lookup(hash, &lrd->lrd_client_gen); + if (export == NULL) { + cfs_hash_putref(hash); + return; + } + + tsi->tsi_exp = export; + tsi->tsi_xid = lrd->lrd_xid; + tsi->tsi_opdata = lrd->lrd_data; + tsi->tsi_result = lrd->lrd_result; + tsi->tsi_client_gen = lrd->lrd_client_gen; + dtrq->dtrq_xid = lrd->lrd_xid; + top_th = container_of(th, struct top_thandle, tt_super); + top_th->tt_master_sub_thandle = master_th; + cfs_hash_putref(hash); +} + +/** + * Execute updates in the update replay records + * + * Declare distribute txn replay by update records and add the updates + * to the execution list. Note: it will check if the update has been + * committed, and only execute the updates if it is not committed to + * disk. + * + * \param[in] env execution environment + * \param[in] tdtd distribute txn replay data which hold all of replay + * reqs and all replay parameters. + * \param[in] dtrq distribute transaction replay req. + * \param[in] ta thandle execute args. + * + * \retval 0 if declare succeeds. + * \retval negative errno if declare fails. 
+ */ +static int update_recovery_exec(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq, + struct thandle_exec_args *ta) +{ + struct llog_update_record *lur = dtrq->dtrq_lur; + struct update_records *records = &lur->lur_update_rec; + struct update_ops *ops = &records->ur_ops; + struct update_params *params = update_records_get_params(records); + struct top_thandle *top_th = container_of(ta->ta_handle, + struct top_thandle, + tt_super); + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + struct update_op *op; + unsigned int i; + int rc = 0; + ENTRY; + + /* These records have been swabbed in llog_cat_process() */ + for (i = 0, op = &ops->uops_op[0]; i < records->ur_update_count; + i++, op = update_op_next_op(op)) { + struct lu_fid *fid = &op->uop_fid; + struct dt_object *dt_obj; + struct dt_object *sub_dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + if (op->uop_type == OUT_NOOP) + continue; + + dt_obj = dt_locate(env, tdtd->tdtd_dt, fid); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + if (rc == -EREMCHG) + LCONSOLE_WARN("%.16s: hit invalid OI mapping " + "for "DFID" during recovering, " + "that may because auto scrub is " + "disabled on related MDT, and " + "will cause recovery failure. " + "Please enable auto scrub and " + "retry the recovery.\n", + tdtd->tdtd_lut->lut_obd->obd_name, + PFID(fid)); + + break; + } + sub_dt_obj = dt_object_child(dt_obj); + + /* Create sub thandle if not */ + sub_dt = lu2dt_dev(sub_dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + if (st == NULL) { + st = create_sub_thandle(tmt, sub_dt); + if (IS_ERR(st)) + GOTO(next, rc = PTR_ERR(st)); + } + + /* check if updates on the OSD/OSP are committed */ + rc = update_is_committed(env, dtrq, dt_obj, top_th, st); + if (rc == 0) + /* If this is committed, goto next */ + goto next; + + if (rc < 0) + GOTO(next, rc); + + /* Create thandle for sub thandle if needed */ + if (st->st_sub_th == NULL) { + rc = sub_thandle_trans_create(env, top_th, st); + if (rc != 0) + GOTO(next, rc); + } + + CDEBUG(D_HA, "replay %uth update\n", i); + switch (op->uop_type) { + case OUT_CREATE: + rc = update_recovery_create(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_DESTROY: + rc = update_recovery_destroy(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_ADD: + rc = update_recovery_ref_add(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_REF_DEL: + rc = update_recovery_ref_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_ATTR_SET: + rc = update_recovery_attr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_SET: + rc = update_recovery_xattr_set(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_INSERT: + rc = update_recovery_index_insert(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_INDEX_DELETE: + rc = update_recovery_index_delete(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_WRITE: + rc = update_recovery_write(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + case OUT_XATTR_DEL: + rc = update_recovery_xattr_del(env, sub_dt_obj, + op, params, ta, + st->st_sub_th); + break; + default: + CERROR("Unknown update type %u\n", (__u32)op->uop_type); + rc = -EINVAL; + break; + } +next: + dt_object_put(env, dt_obj); + if (rc < 0) + break; + } + + ta->ta_handle->th_result = rc; + RETURN(rc); +} + +/** + * redo 
updates on MDT if needed. + * + * During DNE recovery, the recovery thread (target_recovery_thread) will call + * this function to replay distribute txn updates on all MDTs. It only replay + * updates on the MDT where the update record is missing. + * + * If the update already exists on the MDT, then it does not need replay the + * updates on that MDT, and only mark the sub transaction has been committed + * there. + * + * \param[in] env execution environment + * \param[in] tdtd target distribute txn data, which holds the replay list + * and all parameters needed by replay process. + * \param[in] dtrq distribute txn replay req. + * + * \retval 0 if replay succeeds. + * \retval negative errno if replay failes. + */ +int distribute_txn_replay_handle(struct lu_env *env, + struct target_distribute_txn_data *tdtd, + struct distribute_txn_replay_req *dtrq) +{ + struct update_records *records = &dtrq->dtrq_lur->lur_update_rec; + struct thandle_exec_args *ta; + struct lu_context session_env; + struct thandle *th = NULL; + struct top_thandle *top_th; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur = NULL; + int i; + int rc = 0; + ENTRY; + + /* initialize session, it is needed for the handler of target */ + rc = lu_context_init(&session_env, LCT_SERVER_SESSION | LCT_NOREF); + if (rc) { + CERROR("%s: failure to initialize session: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + RETURN(rc); + } + lu_context_enter(&session_env); + env->le_ses = &session_env; + lu_env_refill(env); + update_records_dump(records, D_HA, true); + th = top_trans_create(env, NULL); + if (IS_ERR(th)) + GOTO(exit_session, rc = PTR_ERR(th)); + + ta = &update_env_info(env)->uti_tea; + ta->ta_argno = 0; + + update_env_info(env)->uti_dtrq = dtrq; + /* Create distribute transaction structure for this top thandle */ + top_th = container_of(th, struct top_thandle, tt_super); + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + th->th_dev = tdtd->tdtd_dt; + ta->ta_handle = th; + + /* check if the distribute transaction has been committed */ + tmt = top_th->tt_multiple_thandle; + tmt->tmt_master_sub_dt = tdtd->tdtd_lut->lut_bottom; + tmt->tmt_batchid = dtrq->dtrq_batchid; + tgt_th_info(env)->tti_transno = dtrq->dtrq_master_transno; + + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) + tmt->tmt_committed = 1; + + rc = update_recovery_exec(env, tdtd, dtrq, ta); + if (rc < 0) + GOTO(stop_trans, rc); + + /* If no updates are needed to be replayed, then mark this records as + * committed, so commit thread distribute_txn_commit_thread() will + * delete the record */ + if (ta->ta_argno == 0) + tmt->tmt_committed = 1; + + tur = &update_env_info(env)->uti_tur; + tur->tur_update_records = dtrq->dtrq_lur; + tur->tur_update_records_buf_size = dtrq->dtrq_lur_size; + tur->tur_update_params = NULL; + tur->tur_update_param_count = 0; + tmt->tmt_update_records = tur; + + distribute_txn_insert_by_batchid(tmt); + rc = top_trans_start(env, NULL, th); + if (rc < 0) + GOTO(stop_trans, rc); + + for (i = 0; i < ta->ta_argno; i++) { + struct tx_arg *ta_arg; + struct dt_object *dt_obj; + struct dt_device *sub_dt; + struct sub_thandle *st; + + ta_arg = ta->ta_args[i]; + dt_obj = ta_arg->object; + + LASSERT(tmt->tmt_committed == 0); + sub_dt = lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + rc = ta->ta_args[i]->exec_fn(env, st->st_sub_th, + ta->ta_args[i]); + + /* If the update is to update the reply data, then 
+ * we need set the session information, so + * tgt_last_rcvd_update() can be called correctly */ + if (rc == 0 && dt_obj == tdtd->tdtd_lut->lut_reply_data) + update_recovery_update_ses(env, tdtd, th, + st->st_sub_th, dtrq, ta_arg); + + if (unlikely(rc < 0)) { + CDEBUG(D_HA, "error during execution of #%u from" + " %s:%d: rc = %d\n", i, ta->ta_args[i]->file, + ta->ta_args[i]->line, rc); + while (--i > 0) { + if (ta->ta_args[i]->undo_fn != NULL) { + dt_obj = ta->ta_args[i]->object; + sub_dt = + lu2dt_dev(dt_obj->do_lu.lo_dev); + st = lookup_sub_thandle(tmt, sub_dt); + LASSERT(st != NULL); + LASSERT(st->st_sub_th != NULL); + + ta->ta_args[i]->undo_fn(env, + st->st_sub_th, + ta->ta_args[i]); + } else { + CERROR("%s: undo for %s:%d: rc = %d\n", + dt_obd_name(ta->ta_handle->th_dev), + ta->ta_args[i]->file, + ta->ta_args[i]->line, -ENOTSUPP); + } + } + break; + } + CDEBUG(D_HA, "%s: executed %u/%u: rc = %d\n", + dt_obd_name(sub_dt), i, ta->ta_argno, rc); + } + +stop_trans: + if (rc < 0) + th->th_result = rc; + rc = top_trans_stop(env, tdtd->tdtd_dt, th); + for (i = 0; i < ta->ta_argno; i++) { + if (ta->ta_args[i]->object != NULL) { + dt_object_put(env, ta->ta_args[i]->object); + ta->ta_args[i]->object = NULL; + } + } + + if (tur != NULL) + tur->tur_update_records = NULL; + + if (tgt_ses_info(env)->tsi_exp != NULL) { + class_export_put(tgt_ses_info(env)->tsi_exp); + tgt_ses_info(env)->tsi_exp = NULL; + } +exit_session: + lu_context_exit(&session_env); + lu_context_fini(&session_env); + RETURN(rc); +} +EXPORT_SYMBOL(distribute_txn_replay_handle); diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c new file mode 100644 index 0000000000000..6c3e41438347c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -0,0 +1,1752 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, 2016, Intel Corporation. + */ +/* + * lustre/target/update_trans.c + * + * This file implements the update distribute transaction API. + * + * To manage the cross-MDT operation (distribute operation) transaction, + * the transaction will also be separated two layers on MD stack, top + * transaction and sub transaction. + * + * During the distribute operation, top transaction is created in the LOD + * layer, and represent the operation. Sub transaction is created by + * each OSD or OSP. Top transaction start/stop will trigger all of its sub + * transaction start/stop. Top transaction (the whole operation) is committed + * only all of its sub transaction are committed. + * + * there are three kinds of transactions + * 1. local transaction: All updates are in a single local OSD. + * 2. 
Remote transaction: All Updates are only in the remote OSD, + * i.e. locally all updates are in OSP. + * 3. Mixed transaction: Updates are both in local OSD and remote + * OSD. + * + * Author: Di Wang + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include + +#include +/** + * Dump top mulitple thandle + * + * Dump top multiple thandle and all of its sub thandle to the debug log. + * + * \param[in]mask debug mask + * \param[in]top_th top_thandle to be dumped + */ +static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, + __u32 mask) +{ + struct sub_thandle *st; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + CDEBUG(mask, "%s tmt %p refcount %d committed %d result %d batchid %llu\n", + tmt->tmt_master_sub_dt ? + tmt->tmt_master_sub_dt->dd_lu_dev.ld_obd->obd_name : + "NULL", + tmt, atomic_read(&tmt->tmt_refcount), tmt->tmt_committed, + tmt->tmt_result, tmt->tmt_batchid); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct sub_thandle_cookie *stc; + + CDEBUG(mask, "st %p obd %s committed %d stopped %d sub_th %p\n", + st, st->st_dt->dd_lu_dev.ld_obd->obd_name, + st->st_committed, st->st_stopped, st->st_sub_th); + + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + CDEBUG(mask, " cookie "DFID".%u\n", + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index); + } + } +} + +/** + * Declare write update to sub device + * + * Declare Write updates llog records to the sub device during distribute + * transaction. + * + * \param[in] env execution environment + * \param[in] record update records being written + * \param[in] sub_th sub transaction handle + * \param[in] record_size total update record size + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_declare_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct thandle *sub_th, size_t record_size) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt = sub_th->th_dev; + int left = record_size; + int rc; + + /* If ctxt is NULL, it means not need to write update, + * for example if the the OSP is used to connect to OST */ + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + + /* Not ready to record updates yet. */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + return 0; + } + + rc = llog_declare_add(env, ctxt->loc_handle, + &record->lur_hdr, sub_th); + if (rc < 0) + GOTO(out_put, rc); + + while (left > ctxt->loc_chunk_size) { + rc = llog_declare_add(env, ctxt->loc_handle, + &record->lur_hdr, sub_th); + if (rc < 0) + GOTO(out_put, rc); + + left -= ctxt->loc_chunk_size; + } + +out_put: + llog_ctxt_put(ctxt); + + return rc; +} + +/** + * write update to sub device + * + * Write llog update record to the sub device during distribute + * transaction. If it succeeds, llog cookie of the record will be + * returned by @cookie. + * + * \param[in] env execution environment + * \param[in] record update records being written + * \param[in] sub_th sub transaction handle + * \param[out] cookie llog cookie of the update record. 
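+ * + * If the record is larger than the llog chunk size it is split into + * chunk-sized llog records before being added, and the llog cookie of + * every written record is kept on the sub thandle's st_cookie_list so + * the record can be cancelled later.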
+ * + * \retval 1 if writing succeeds + * \retval negative errno if writing fails + */ +static int sub_updates_write(const struct lu_env *env, + struct llog_update_record *record, + struct sub_thandle *sub_th) +{ + struct dt_device *dt = sub_th->st_dt; + struct llog_ctxt *ctxt; + struct llog_update_record *lur = NULL; + __u32 update_count = 0; + __u32 param_count = 0; + __u32 last_update_count = 0; + __u32 last_param_count = 0; + char *start; + char *cur; + char *next; + struct sub_thandle_cookie *stc; + size_t reclen; + bool eof = false; + int rc; + ENTRY; + + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + /* If ctxt == NULL, then it means updates on OST (only happens + * during migration), and we do not track those updates for now */ + /* If ctxt->loc_handle == NULL, then it does not need to record + * update, usually happens in error handler path */ + if (ctxt == NULL || ctxt->loc_handle == NULL) { + llog_ctxt_put(ctxt); + RETURN(0); + } + + /* Since the cross-MDT updates will includes both local + * and remote updates, the update ops count must > 1 */ + LASSERT(record->lur_update_rec.ur_update_count > 1); + LASSERTF(record->lur_hdr.lrh_len == llog_update_record_size(record), + "lrh_len %u record_size %zu\n", record->lur_hdr.lrh_len, + llog_update_record_size(record)); + + /* + * If its size > llog chunk_size, then write current chunk to the update + * llog, NB the padding should >= LLOG_MIN_REC_SIZE. + * + * So check padding length is either >= LLOG_MIN_REC_SIZE or is 0 + * (record length just matches the chunk size). + */ + + reclen = record->lur_hdr.lrh_len; + if (reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) { + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &record->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + } + + GOTO(llog_put, rc); + } + + /* Split the records into chunk_size update record */ + OBD_ALLOC_LARGE(lur, ctxt->loc_chunk_size); + if (lur == NULL) + GOTO(llog_put, rc = -ENOMEM); + + memcpy(lur, &record->lur_hdr, sizeof(record->lur_hdr)); + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + start = (char *)&record->lur_update_rec.ur_ops; + cur = next = start; + do { + if (update_count < record->lur_update_rec.ur_update_count) + next = (char *)update_op_next_op( + (struct update_op *)cur); + else if (param_count < record->lur_update_rec.ur_param_count) + next = (char *)update_param_next_param( + (struct update_param *)cur); + else + eof = true; + + reclen = __llog_update_record_size( + __update_records_size(next - start)); + if ((reclen + LLOG_MIN_REC_SIZE <= ctxt->loc_chunk_size || + reclen == ctxt->loc_chunk_size) && + !eof) { + cur = next; + + if (update_count < + record->lur_update_rec.ur_update_count) + update_count++; + else if (param_count < + record->lur_update_rec.ur_param_count) + param_count++; + continue; + } + + lur->lur_update_rec.ur_update_count = update_count - + last_update_count; + lur->lur_update_rec.ur_param_count = param_count - + last_param_count; + memcpy(&lur->lur_update_rec.ur_ops, start, cur - start); + lur->lur_hdr.lrh_len = 
llog_update_record_size(lur); + + LASSERT(lur->lur_hdr.lrh_len == + __llog_update_record_size( + __update_records_size(cur - start))); + LASSERT(lur->lur_hdr.lrh_len <= ctxt->loc_chunk_size); + + update_records_dump(&lur->lur_update_rec, D_INFO, true); + + OBD_ALLOC_PTR(stc); + if (stc == NULL) + GOTO(llog_put, rc = -ENOMEM); + INIT_LIST_HEAD(&stc->stc_list); + + rc = llog_add(env, ctxt->loc_handle, &lur->lur_hdr, + &stc->stc_cookie, sub_th->st_sub_th); + + CDEBUG(D_INFO, "%s: Add update log "DFID".%u: rc = %d\n", + dt->dd_lu_dev.ld_obd->obd_name, + PFID(&stc->stc_cookie.lgc_lgl.lgl_oi.oi_fid), + stc->stc_cookie.lgc_index, rc); + + if (rc > 0) { + list_add(&stc->stc_list, &sub_th->st_cookie_list); + rc = 0; + } else { + OBD_FREE_PTR(stc); + GOTO(llog_put, rc); + } + + last_update_count = update_count; + last_param_count = param_count; + start = cur; + lur->lur_update_rec.ur_update_count = 0; + lur->lur_update_rec.ur_param_count = 0; + lur->lur_update_rec.ur_flags |= UPDATE_RECORD_CONTINUE; + } while (!eof); + +llog_put: + if (lur != NULL) + OBD_FREE_LARGE(lur, ctxt->loc_chunk_size); + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/** + * Prepare the update records. + * + * Merge params and ops into the update records, then initializing + * the update buffer. + * + * During transaction execution phase, parameters and update ops + * are collected in two different buffers (see lod_updates_pack()), + * during transaction stop, it needs to be merged in one buffer, + * so it will be written in the update log. + * + * \param[in] env execution environment + * \param[in] tmt top_multiple_thandle for distribute txn + * + * \retval 0 if merging succeeds. + * \retval negaitive errno if merging fails. + */ +static int prepare_writing_updates(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur = tmt->tmt_update_records; + struct llog_update_record *lur; + struct update_params *params; + size_t params_size; + size_t update_size; + + if (tur == NULL || tur->tur_update_records == NULL || + tur->tur_update_params == NULL) + return 0; + + lur = tur->tur_update_records; + /* Extends the update records buffer if needed */ + params_size = update_params_size(tur->tur_update_params, + tur->tur_update_param_count); + LASSERT(lur->lur_update_rec.ur_param_count == 0); + update_size = llog_update_record_size(lur); + if (cfs_size_round(update_size + params_size) > + tur->tur_update_records_buf_size) { + int rc; + + rc = tur_update_records_extend(tur, + cfs_size_round(update_size + params_size)); + if (rc < 0) + return rc; + + lur = tur->tur_update_records; + } + + params = update_records_get_params(&lur->lur_update_rec); + memcpy(params, tur->tur_update_params, params_size); + + lur->lur_update_rec.ur_param_count = tur->tur_update_param_count; + lur->lur_update_rec.ur_batchid = tmt->tmt_batchid; + /* Init update record header */ + lur->lur_hdr.lrh_len = llog_update_record_size(lur); + lur->lur_hdr.lrh_type = UPDATE_REC; + + /* Dump updates for debugging purpose */ + update_records_dump(&lur->lur_update_rec, D_INFO, true); + + return 0; +} + +static inline int +distribute_txn_commit_thread_running(struct lu_target *lut) +{ + return lut->lut_tdtd_commit_thread.t_flags & SVC_RUNNING; +} + +static inline int +distribute_txn_commit_thread_stopped(struct lu_target *lut) +{ + return lut->lut_tdtd_commit_thread.t_flags & SVC_STOPPED; +} + +/** + * Top thandle commit callback + * + * This callback will be called when all of sub transactions are committed. 
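+ * It marks the top_multiple_thandle as committed and wakes up the + * distribute txn commit thread, which then updates the committed batchid + * and cancels the corresponding update llog records.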
+ * + * \param[in] th top thandle to be committed. + */ +static void top_trans_committed_cb(struct top_multiple_thandle *tmt) +{ + struct lu_target *lut; + ENTRY; + + LASSERT(atomic_read(&tmt->tmt_refcount) > 0); + + top_multiple_thandle_dump(tmt, D_HA); + tmt->tmt_committed = 1; + lut = dt2lu_dev(tmt->tmt_master_sub_dt)->ld_site->ls_tgt; + if (distribute_txn_commit_thread_running(lut)) + wake_up(&lut->lut_tdtd->tdtd_commit_thread_waitq); + RETURN_EXIT; +} + +struct sub_thandle *lookup_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_dt == dt_dev) + return st; + } + return NULL; +} +EXPORT_SYMBOL(lookup_sub_thandle); + +struct sub_thandle *create_sub_thandle(struct top_multiple_thandle *tmt, + struct dt_device *dt_dev) +{ + struct sub_thandle *st; + + OBD_ALLOC_PTR(st); + if (st == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&st->st_sub_list); + INIT_LIST_HEAD(&st->st_cookie_list); + st->st_dt = dt_dev; + + list_add(&st->st_sub_list, &tmt->tmt_sub_thandle_list); + return st; +} + +static void sub_trans_commit_cb_internal(struct top_multiple_thandle *tmt, + struct thandle *sub_th, int err) +{ + struct sub_thandle *st; + bool all_committed = true; + + /* Check if all sub thandles are committed */ + spin_lock(&tmt->tmt_sub_lock); + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == sub_th) { + st->st_committed = 1; + st->st_result = err; + } + if (!st->st_committed) + all_committed = false; + } + spin_unlock(&tmt->tmt_sub_lock); + + if (tmt->tmt_result == 0) + tmt->tmt_result = err; + + if (all_committed) + top_trans_committed_cb(tmt); + + top_multiple_thandle_dump(tmt, D_INFO); + top_multiple_thandle_put(tmt); + RETURN_EXIT; +} + +/** + * sub thandle commit callback + * + * Mark the sub thandle to be committed and if all sub thandle are committed + * notify the top thandle. + * + * \param[in] env execution environment + * \param[in] sub_th sub thandle being committed + * \param[in] cb commit callback + * \param[in] err trans result + */ +static void sub_trans_commit_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct top_multiple_thandle *tmt = cb->dcb_data; + + sub_trans_commit_cb_internal(tmt, sub_th, err); +} + +static void sub_thandle_register_commit_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + LASSERT(st->st_sub_th != NULL); + top_multiple_thandle_get(tmt); + st->st_commit_dcb.dcb_func = sub_trans_commit_cb; + st->st_commit_dcb.dcb_data = tmt; + INIT_LIST_HEAD(&st->st_commit_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_commit_dcb); +} + +/** + * Sub thandle stop call back + * + * After sub thandle is stopped, it will call this callback to notify + * the top thandle. 
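+ * The matching sub thandle is marked stopped, its result is recorded in + * st_result and tmt_stop_waitq is woken so top_trans_wait_result() can + * re-check whether the whole top transaction has stopped.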
+ * + * \param[in] th sub thandle to be stopped + * \param[in] rc result of sub trans + */ +static void sub_trans_stop_cb(struct lu_env *env, + struct thandle *sub_th, + struct dt_txn_commit_cb *cb, int err) +{ + struct sub_thandle *st; + struct top_multiple_thandle *tmt = cb->dcb_data; + ENTRY; + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_stopped) + continue; + + if (st->st_dt == sub_th->th_dev) { + st->st_stopped = 1; + st->st_result = err; + break; + } + } + + wake_up(&tmt->tmt_stop_waitq); + RETURN_EXIT; +} + +static void sub_thandle_register_stop_cb(struct sub_thandle *st, + struct top_multiple_thandle *tmt) +{ + st->st_stop_dcb.dcb_func = sub_trans_stop_cb; + st->st_stop_dcb.dcb_data = tmt; + st->st_stop_dcb.dcb_flags = DCB_TRANS_STOP; + INIT_LIST_HEAD(&st->st_stop_dcb.dcb_linkage); + dt_trans_cb_add(st->st_sub_th, &st->st_stop_dcb); +} + +/** + * Create sub thandle + * + * Create transaction handle for sub_thandle + * + * \param[in] env execution environment + * \param[in] th top thandle + * \param[in] st sub_thandle + * + * \retval 0 if creation succeeds. + * \retval negative errno if creation fails. + */ +int sub_thandle_trans_create(const struct lu_env *env, + struct top_thandle *top_th, + struct sub_thandle *st) +{ + struct thandle *sub_th; + + sub_th = dt_trans_create(env, st->st_dt); + if (IS_ERR(sub_th)) + return PTR_ERR(sub_th); + + sub_th->th_top = &top_th->tt_super; + st->st_sub_th = sub_th; + + sub_th->th_wait_submit = 1; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return 0; +} + +/** + * Create the top transaction. + * + * Create the top transaction on the master device. It will create a top + * thandle and a sub thandle on the master device. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * + * \retval pointer to the created thandle. + * \retval ERR_PTR(errno) if creation failed. + */ +struct thandle * +top_trans_create(const struct lu_env *env, struct dt_device *master_dev) +{ + struct top_thandle *top_th; + struct thandle *child_th; + + OBD_ALLOC_GFP(top_th, sizeof(*top_th), __GFP_IO); + if (top_th == NULL) + return ERR_PTR(-ENOMEM); + + top_th->tt_super.th_top = &top_th->tt_super; + + if (master_dev != NULL) { + child_th = dt_trans_create(env, master_dev); + if (IS_ERR(child_th)) { + OBD_FREE_PTR(top_th); + return child_th; + } + + child_th->th_top = &top_th->tt_super; + child_th->th_wait_submit = 1; + top_th->tt_master_sub_thandle = child_th; + } + return &top_th->tt_super; +} +EXPORT_SYMBOL(top_trans_create); + +/** + * Declare write update transaction + * + * Check if there are updates being recorded in this transaction, + * it will write the record into the disk. + * + * \param[in] env execution environment + * \param[in] tmt top multiple transaction handle + * + * \retval 0 if writing succeeds + * \retval negative errno if writing fails + */ +static int declare_updates_write(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct llog_update_record *record; + struct sub_thandle *st; + int rc = 0; + + record = tmt->tmt_update_records->tur_update_records; + /* Declare update write for all other target */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + + rc = sub_declare_updates_write(env, record, st->st_sub_th, + tmt->tmt_record_size); + if (rc < 0) + break; + } + + return rc; +} + +/** + * Assign batchid to the distribute transaction. 
+ * + * Assign batchid to the distribute transaction + * + * \param[in] tmt distribute transaction + */ +static void distribute_txn_assign_batchid(struct top_multiple_thandle *new) +{ + struct target_distribute_txn_data *tdtd; + struct dt_device *dt = new->tmt_master_sub_dt; + struct sub_thandle *st; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + spin_lock(&tdtd->tdtd_batchid_lock); + new->tmt_batchid = tdtd->tdtd_batchid++; + list_add_tail(&new->tmt_commit_list, &tdtd->tdtd_list); + spin_unlock(&tdtd->tdtd_batchid_lock); + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); +} + +/** + * Insert distribute transaction to the distribute txn list. + * + * Insert distribute transaction to the distribute txn list. + * + * \param[in] new the distribute txn to be inserted. + */ +void distribute_txn_insert_by_batchid(struct top_multiple_thandle *new) +{ + struct dt_device *dt = new->tmt_master_sub_dt; + struct top_multiple_thandle *tmt; + struct target_distribute_txn_data *tdtd; + struct sub_thandle *st; + bool at_head = false; + + LASSERT(dt != NULL); + tdtd = dt2lu_dev(dt)->ld_site->ls_tgt->lut_tdtd; + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_reverse(tmt, &tdtd->tdtd_list, tmt_commit_list) { + if (new->tmt_batchid > tmt->tmt_batchid) { + list_add(&new->tmt_commit_list, &tmt->tmt_commit_list); + break; + } + } + if (list_empty(&new->tmt_commit_list)) { + at_head = true; + list_add(&new->tmt_commit_list, &tdtd->tdtd_list); + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + list_for_each_entry(st, &new->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th != NULL) + sub_thandle_register_commit_cb(st, new); + } + + top_multiple_thandle_get(new); + top_multiple_thandle_dump(new, D_INFO); + if (new->tmt_committed && at_head) + wake_up(&tdtd->tdtd_commit_thread_waitq); +} + +/** + * Prepare cross-MDT operation. + * + * Create the update record buffer to record updates for cross-MDT operation, + * add master sub transaction to tt_sub_trans_list, and declare the update + * writes. + * + * During updates packing, all of parameters will be packed in + * tur_update_params, and updates will be packed in tur_update_records. + * Then in transaction stop, parameters and updates will be merged + * into one updates buffer. + * + * And also master thandle will be added to the sub_th list, so it will be + * easy to track the commit status. + * + * \param[in] env execution environment + * \param[in] th top transaction handle + * + * \retval 0 if preparation succeeds. + * \retval negative errno if preparation fails. + */ +static int prepare_multiple_node_trans(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct thandle_update_records *tur; + int rc; + ENTRY; + + if (tmt->tmt_update_records == NULL) { + tur = &update_env_info(env)->uti_tur; + rc = check_and_prepare_update_record(env, tur); + if (rc < 0) + RETURN(rc); + + tmt->tmt_update_records = tur; + distribute_txn_assign_batchid(tmt); + } + + rc = declare_updates_write(env, tmt); + + RETURN(rc); +} + +/** + * start the top transaction. + * + * Start all of its sub transactions, then start master sub transaction. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be start + * \param[in] th top thandle + * + * \retval 0 if transaction start succeeds. 
+ * \retval negative errno if start fails. + */ +int top_trans_start(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle; + int rc = 0; + ENTRY; + + if (tmt == NULL) { + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_start(env, top_th->tt_master_sub_thandle->th_dev, + top_th->tt_master_sub_thandle); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + rc = prepare_multiple_node_trans(env, tmt); + if (rc < 0) + RETURN(rc); + + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st->st_sub_th == NULL) + continue; + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + rc = dt_trans_start(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (rc != 0) + GOTO(out, rc); + + LASSERT(st->st_started == 0); + st->st_started = 1; + } +out: + th->th_result = rc; + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_start); + +/** + * Check whether we need write updates record + * + * Check if the updates for the top_thandle needs to be writen + * to all targets. Only if the transaction succeeds and the updates + * number > 2, it will write the updates, + * + * \params [in] top_th top thandle. + * + * \retval true if it needs to write updates + * \retval false if it does not need to write updates + **/ +static bool top_check_write_updates(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + + /* Do not write updates to records if the transaction fails */ + if (top_th->tt_super.th_result != 0) + return false; + + tmt = top_th->tt_multiple_thandle; + if (tmt == NULL) + return false; + + tur = tmt->tmt_update_records; + if (tur == NULL) + return false; + + /* Hmm, false update records, since the cross-MDT operation + * should includes both local and remote updates, so the + * updates count should >= 2 */ + if (tur->tur_update_records == NULL || + tur->tur_update_records->lur_update_rec.ur_update_count <= 1) + return false; + + return true; +} + +/** + * Check if top transaction is stopped + * + * Check if top transaction is stopped, only if all sub transaction + * is stopped, then the top transaction is stopped. + * + * \param [in] top_th top thandle + * + * \retval true if the top transaction is stopped. + * \retval false if the top transaction is not stopped. + */ +static bool top_trans_is_stopped(struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + struct sub_thandle *st; + bool all_stopped = true; + + tmt = top_th->tt_multiple_thandle; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (!st->st_stopped && st->st_sub_th != NULL) { + all_stopped = false; + break; + } + + if (st->st_result != 0 && + top_th->tt_super.th_result == 0) + top_th->tt_super.th_result = st->st_result; + } + + return all_stopped; +} + +/** + * Wait result of top transaction + * + * Wait until all sub transaction get its result. + * + * \param [in] top_th top thandle. + * + * \retval the result of top thandle. 
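+ * + * The wait is on tmt_stop_waitq and is driven by sub_trans_stop_cb(), + * which wakes the queue every time one of the sub transactions stops.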
+ */ +static int top_trans_wait_result(struct top_thandle *top_th) +{ + struct l_wait_info lwi = {0}; + + l_wait_event(top_th->tt_multiple_thandle->tmt_stop_waitq, + top_trans_is_stopped(top_th), &lwi); + + RETURN(top_th->tt_super.th_result); +} + +/** + * Stop the top transaction. + * + * Stop the transaction on the master device first, then stop transactions + * on other sub devices. + * + * \param[in] env execution environment + * \param[in] master_dev master_dev the top thandle will be created + * \param[in] th top thandle + * + * \retval 0 if stop transaction succeeds. + * \retval negative errno if stop transaction fails. + */ +int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, + struct thandle *th) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + struct sub_thandle *st; + struct sub_thandle *master_st; + struct top_multiple_thandle *tmt; + struct thandle_update_records *tur; + bool write_updates = false; + int rc = 0; + ENTRY; + + if (likely(top_th->tt_multiple_thandle == NULL)) { + LASSERT(master_dev != NULL); + + if (th->th_sync) + top_th->tt_master_sub_thandle->th_sync = th->th_sync; + if (th->th_local) + top_th->tt_master_sub_thandle->th_local = th->th_local; + rc = dt_trans_stop(env, master_dev, + top_th->tt_master_sub_thandle); + OBD_FREE_PTR(top_th); + RETURN(rc); + } + + tmt = top_th->tt_multiple_thandle; + tur = tmt->tmt_update_records; + + /* Note: we need stop the master thandle first, then the stop + * callback will fill the master transno in the update logs, + * then these update logs will be sent to other MDTs */ + /* get the master sub thandle */ + master_st = lookup_sub_thandle(tmt, tmt->tmt_master_sub_dt); + write_updates = top_check_write_updates(top_th); + + /* Step 1: write the updates log on Master MDT */ + if (master_st != NULL && master_st->st_sub_th != NULL && + write_updates) { + struct llog_update_record *lur; + + /* Merge the parameters and updates into one buffer */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: cannot prepare updates: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + + lur = tur->tur_update_records; + /* Write updates to the master MDT */ + rc = sub_updates_write(env, lur, master_st); + + /* Cleanup the common parameters in the update records, + * master transno callback might add more parameters. + * and we need merge the update records again in the + * following */ + if (tur->tur_update_params != NULL) + lur->lur_update_rec.ur_param_count = 0; + + if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + write_updates = false; + GOTO(stop_master_trans, rc); + } + } + +stop_master_trans: + /* Step 2: Stop the transaction on the master MDT, and fill the + * master transno in the update logs to other MDT. 
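+ * Once the master stop callback has run, the master transno is copied + * into ur_master_transno of the update record before the record is + * written to the other MDTs (step 3).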
*/ + if (master_st != NULL && master_st->st_sub_th != NULL) { + if (th->th_local) + master_st->st_sub_th->th_local = th->th_local; + if (th->th_sync) + master_st->st_sub_th->th_sync = th->th_sync; + master_st->st_sub_th->th_result = th->th_result; + rc = dt_trans_stop(env, master_st->st_dt, master_st->st_sub_th); + /* If it does not write_updates, then we call submit callback + * here, otherwise callback is done through + * osd(osp)_trans_commit_cb() */ + if (!master_st->st_started && + !list_empty(&tmt->tmt_commit_list)) + sub_trans_commit_cb_internal(tmt, + master_st->st_sub_th, rc); + if (rc < 0) { + th->th_result = rc; + GOTO(stop_other_trans, rc); + } else if (tur != NULL && tur->tur_update_records != NULL) { + struct llog_update_record *lur; + + lur = tur->tur_update_records; + if (lur->lur_update_rec.ur_master_transno == 0) + /* Update master transno after master stop + * callback */ + lur->lur_update_rec.ur_master_transno = + tgt_th_info(env)->tti_transno; + } + } + + /* Step 3: write updates to other MDTs */ + if (write_updates) { + struct llog_update_record *lur; + + /* Stop callback of master will add more updates and also update + * master transno, so merge the parameters and updates into one + * buffer again */ + rc = prepare_writing_updates(env, tmt); + if (rc < 0) { + CERROR("%s: prepare updates failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); + th->th_result = rc; + GOTO(stop_other_trans, rc); + } + lur = tur->tur_update_records; + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, + st_sub_list) { + if (st->st_sub_th == NULL || st == master_st || + st->st_sub_th->th_result < 0) + continue; + + rc = sub_updates_write(env, lur, st); + if (rc < 0) { + th->th_result = rc; + break; + } + } + } + +stop_other_trans: + /* Step 4: Stop the transaction on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + if (st == master_st || st->st_sub_th == NULL) + continue; + + if (th->th_sync) + st->st_sub_th->th_sync = th->th_sync; + if (th->th_local) + st->st_sub_th->th_local = th->th_local; + st->st_sub_th->th_result = th->th_result; + rc = dt_trans_stop(env, st->st_sub_th->th_dev, + st->st_sub_th); + if (unlikely(rc < 0 && th->th_result == 0)) + th->th_result = rc; + } + + rc = top_trans_wait_result(top_th); + + tmt->tmt_result = rc; + + /* Balance for the refcount in top_trans_create, Note: if it is NOT + * multiple node transaction, the top transaction will be destroyed. */ + top_multiple_thandle_put(tmt); + OBD_FREE_PTR(top_th); + RETURN(rc); +} +EXPORT_SYMBOL(top_trans_stop); + +/** + * Create top_multiple_thandle for top_thandle + * + * Create top_mutilple_thandle to manage the mutiple node transaction + * for top_thandle, and it also needs to add master sub thandle to the + * sub trans list now. 
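+ * + * Typical usage, as in distribute_txn_replay_handle(): + *	top_th = container_of(th, struct top_thandle, tt_super); + *	rc = top_trans_create_tmt(env, top_th); + *	top_th->tt_multiple_thandle->tmt_master_sub_dt = tdtd->tdtd_lut->lut_bottom;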
+ * + * \param[in] env execution environment + * \param[in] top_th the top thandle + * + * \retval 0 if creation succeeds + * \retval negative errno if creation fails + */ +int top_trans_create_tmt(const struct lu_env *env, + struct top_thandle *top_th) +{ + struct top_multiple_thandle *tmt; + + OBD_ALLOC_PTR(tmt); + if (tmt == NULL) + return -ENOMEM; + + tmt->tmt_magic = TOP_THANDLE_MAGIC; + INIT_LIST_HEAD(&tmt->tmt_sub_thandle_list); + INIT_LIST_HEAD(&tmt->tmt_commit_list); + atomic_set(&tmt->tmt_refcount, 1); + spin_lock_init(&tmt->tmt_sub_lock); + init_waitqueue_head(&tmt->tmt_stop_waitq); + + top_th->tt_multiple_thandle = tmt; + + return 0; +} + +static struct sub_thandle * +create_sub_thandle_with_thandle(struct top_thandle *top_th, + struct thandle *sub_th) +{ + struct sub_thandle *st; + + /* create and init sub th to the top trans list */ + st = create_sub_thandle(top_th->tt_multiple_thandle, + sub_th->th_dev); + if (IS_ERR(st)) + return st; + + st->st_sub_th = sub_th; + + sub_th->th_top = &top_th->tt_super; + sub_thandle_register_stop_cb(st, top_th->tt_multiple_thandle); + return st; +} + +/** + * Get sub thandle. + * + * Get sub thandle from the top thandle according to the sub dt_device. + * + * \param[in] env execution environment + * \param[in] th thandle on the top layer. + * \param[in] sub_dt sub dt_device used to get sub transaction + * + * \retval thandle of sub transaction if succeed + * \retval PTR_ERR(errno) if failed + */ +struct thandle *thandle_get_sub_by_dt(const struct lu_env *env, + struct thandle *th, + struct dt_device *sub_dt) +{ + struct sub_thandle *st = NULL; + struct sub_thandle *master_st = NULL; + struct top_thandle *top_th; + struct thandle *sub_th = NULL; + int rc = 0; + ENTRY; + + top_th = container_of(th, struct top_thandle, tt_super); + + if (likely(sub_dt == top_th->tt_master_sub_thandle->th_dev)) + RETURN(top_th->tt_master_sub_thandle); + + if (top_th->tt_multiple_thandle != NULL) { + st = lookup_sub_thandle(top_th->tt_multiple_thandle, sub_dt); + if (st != NULL) + RETURN(st->st_sub_th); + } + + sub_th = dt_trans_create(env, sub_dt); + if (IS_ERR(sub_th)) + RETURN(sub_th); + + /* Create top_multiple_thandle if necessary */ + if (top_th->tt_multiple_thandle == NULL) { + struct top_multiple_thandle *tmt; + + rc = top_trans_create_tmt(env, top_th); + if (rc < 0) + GOTO(stop_trans, rc); + + tmt = top_th->tt_multiple_thandle; + + /* Add master sub th to the top trans list */ + tmt->tmt_master_sub_dt = + top_th->tt_master_sub_thandle->th_dev; + master_st = create_sub_thandle_with_thandle(top_th, + top_th->tt_master_sub_thandle); + if (IS_ERR(master_st)) { + rc = PTR_ERR(master_st); + master_st = NULL; + GOTO(stop_trans, rc); + } + } + + /* create and init sub th to the top trans list */ + st = create_sub_thandle_with_thandle(top_th, sub_th); + if (IS_ERR(st)) { + rc = PTR_ERR(st); + st = NULL; + GOTO(stop_trans, rc); + } + st->st_sub_th->th_wait_submit = 1; +stop_trans: + if (rc < 0) { + if (master_st != NULL) { + list_del(&master_st->st_sub_list); + OBD_FREE_PTR(master_st); + } + sub_th->th_result = rc; + dt_trans_stop(env, sub_dt, sub_th); + sub_th = ERR_PTR(rc); + } + + RETURN(sub_th); +} +EXPORT_SYMBOL(thandle_get_sub_by_dt); + +/** + * Top multiple thandle destroy + * + * Destroy multiple thandle and all its sub thandle. + * + * \param[in] tmt top_multiple_thandle to be destroyed. 
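+ * + * Every sub thandle on tmt_sub_thandle_list, together with its recorded + * llog cookies, is freed before the structure itself is released.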
+ */ +void top_multiple_thandle_destroy(struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + struct sub_thandle *tmp; + + LASSERT(tmt->tmt_magic == TOP_THANDLE_MAGIC); + list_for_each_entry_safe(st, tmp, &tmt->tmt_sub_thandle_list, + st_sub_list) { + struct sub_thandle_cookie *stc; + struct sub_thandle_cookie *tmp; + + list_del(&st->st_sub_list); + list_for_each_entry_safe(stc, tmp, &st->st_cookie_list, + stc_list) { + list_del(&stc->stc_list); + OBD_FREE_PTR(stc); + } + OBD_FREE_PTR(st); + } + OBD_FREE_PTR(tmt); +} +EXPORT_SYMBOL(top_multiple_thandle_destroy); + +/** + * Cancel the update log on MDTs + * + * Cancel the update log on MDTs then destroy the thandle. + * + * \param[in] env execution environment + * \param[in] tmt the top multiple thandle whose updates records + * will be cancelled. + * + * \retval 0 if cancellation succeeds. + * \retval negative errno if cancellation fails. + */ +static int distribute_txn_cancel_records(const struct lu_env *env, + struct top_multiple_thandle *tmt) +{ + struct sub_thandle *st; + ENTRY; + + top_multiple_thandle_dump(tmt, D_INFO); + /* Cancel update logs on other MDTs */ + list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { + struct llog_ctxt *ctxt; + struct obd_device *obd; + struct llog_cookie *cookie; + struct sub_thandle_cookie *stc; + int rc; + + obd = st->st_dt->dd_lu_dev.ld_obd; + ctxt = llog_get_context(obd, LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) + continue; + list_for_each_entry(stc, &st->st_cookie_list, stc_list) { + cookie = &stc->stc_cookie; + if (fid_is_zero(&cookie->lgc_lgl.lgl_oi.oi_fid)) + continue; + + rc = llog_cat_cancel_records(env, ctxt->loc_handle, 1, + cookie); + CDEBUG(D_HA, "%s: batchid %llu cancel update log " + DFID".%u: rc = %d\n", obd->obd_name, + tmt->tmt_batchid, + PFID(&cookie->lgc_lgl.lgl_oi.oi_fid), + cookie->lgc_index, rc); + } + + llog_ctxt_put(ctxt); + } + + RETURN(0); +} + +/** + * Check if there are committed transaction + * + * Check if there are committed transaction in the distribute transaction + * list, then cancel the update records for those committed transaction. + * Because the distribute transaction in the list are sorted by batchid, + * and cancellation will be done by batchid order, so we only check the first + * the transaction(with lowest batchid) in the list. + * + * \param[in] lod lod device where cancel thread is + * + * \retval true if it is ready + * \retval false if it is not ready + */ +static bool tdtd_ready_for_cancel_log(struct target_distribute_txn_data *tdtd) +{ + struct top_multiple_thandle *tmt = NULL; + struct obd_device *obd = tdtd->tdtd_lut->lut_obd; + bool ready = false; + + spin_lock(&tdtd->tdtd_batchid_lock); + if (!list_empty(&tdtd->tdtd_list)) { + tmt = list_entry(tdtd->tdtd_list.next, + struct top_multiple_thandle, tmt_commit_list); + if (tmt->tmt_committed && + (!obd->obd_recovering || (obd->obd_recovering && + tmt->tmt_batchid <= tdtd->tdtd_committed_batchid))) + ready = true; + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + return ready; +} + +struct distribute_txn_bid_data { + struct dt_txn_commit_cb dtbd_cb; + struct target_distribute_txn_data *dtbd_tdtd; + __u64 dtbd_batchid; +}; + +/** + * callback of updating commit batchid + * + * Updating commit batchid then wake up the commit thread to cancel the + * records. 
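+ * The committed batchid only ever moves forward, and the callback drops + * the tdtd_refcount reference taken when the batchid update was submitted.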
+ * + * \param[in]env execution environment + * \param[in]th thandle to updating commit batchid + * \param[in]cb commit callback + * \param[in]err result of thandle + */ +static void distribute_txn_batchid_cb(struct lu_env *env, + struct thandle *th, + struct dt_txn_commit_cb *cb, + int err) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct target_distribute_txn_data *tdtd; + + dtbd = container_of0(cb, struct distribute_txn_bid_data, dtbd_cb); + tdtd = dtbd->dtbd_tdtd; + + CDEBUG(D_HA, "%s: %llu batchid updated\n", + tdtd->tdtd_lut->lut_obd->obd_name, dtbd->dtbd_batchid); + spin_lock(&tdtd->tdtd_batchid_lock); + if (dtbd->dtbd_batchid > tdtd->tdtd_committed_batchid && + !tdtd->tdtd_lut->lut_obd->obd_no_transno) + tdtd->tdtd_committed_batchid = dtbd->dtbd_batchid; + spin_unlock(&tdtd->tdtd_batchid_lock); + atomic_dec(&tdtd->tdtd_refcount); + wake_up(&tdtd->tdtd_commit_thread_waitq); + + OBD_FREE_PTR(dtbd); +} + +/** + * Update the commit batchid in disk + * + * Update commit batchid in the disk, after this is committed, it can start + * to cancel the update records. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction structure + * \param[in] batchid commit batchid to be updated + * + * \retval 0 if update succeeds. + * \retval negative errno if update fails. + */ +static int +distribute_txn_commit_batchid_update(const struct lu_env *env, + struct target_distribute_txn_data *tdtd, + __u64 batchid) +{ + struct distribute_txn_bid_data *dtbd = NULL; + struct thandle *th; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + OBD_ALLOC_PTR(dtbd); + if (dtbd == NULL) + RETURN(-ENOMEM); + dtbd->dtbd_batchid = batchid; + dtbd->dtbd_tdtd = tdtd; + dtbd->dtbd_cb.dcb_func = distribute_txn_batchid_cb; + atomic_inc(&tdtd->tdtd_refcount); + + th = dt_trans_create(env, tdtd->tdtd_lut->lut_bottom); + if (IS_ERR(th)) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + RETURN(PTR_ERR(th)); + } + + tmp = cpu_to_le64(batchid); + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + + rc = dt_declare_record_write(env, tdtd->tdtd_batchid_obj, &buf, off, + th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_cb_add(th, &dtbd->dtbd_cb); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_record_write(env, tdtd->tdtd_batchid_obj, &buf, + &off, th); + + CDEBUG(D_INFO, "%s: update batchid %llu: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, batchid, rc); + +stop: + dt_trans_stop(env, tdtd->tdtd_lut->lut_bottom, th); + if (rc < 0) { + atomic_dec(&tdtd->tdtd_refcount); + OBD_FREE_PTR(dtbd); + } + RETURN(rc); +} + +/** + * Init commit batchid for distribute transaction. + * + * Initialize the batchid object and get commit batchid from the object. + * + * \param[in] env execution environment + * \param[in] tdtd distribute transaction whose batchid is initialized. + * + * \retval 0 if initialization succeeds. + * \retval negative errno if initialization fails. 
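+ * + * The batchid is stored as a little-endian 64-bit value in a local + * object (BATCHID_COMMITTED_OID), which is created on first use by + * dt_find_or_create().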
+ **/ +static int +distribute_txn_commit_batchid_init(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_target *lut = tdtd->tdtd_lut; + struct lu_attr *attr = &tti->tti_attr; + struct lu_fid *fid = &tti->tti_fid1; + struct dt_object_format *dof = &tti->tti_u.update.tti_update_dof; + struct dt_object *dt_obj = NULL; + struct lu_buf buf; + __u64 tmp; + __u64 off; + int rc; + ENTRY; + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dof->dof_type = dt_mode_to_dft(S_IFREG); + + lu_local_obj_fid(fid, BATCHID_COMMITTED_OID); + + dt_obj = dt_find_or_create(env, lut->lut_bottom, fid, dof, + attr); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + dt_obj = NULL; + GOTO(out_put, rc); + } + + tdtd->tdtd_batchid_obj = dt_obj; + + buf.lb_buf = &tmp; + buf.lb_len = sizeof(tmp); + off = 0; + rc = dt_read(env, dt_obj, &buf, &off); + if (rc < 0 || (rc < buf.lb_len && rc > 0)) { + CERROR("%s can't read last committed batchid: rc = %d\n", + tdtd->tdtd_lut->lut_obd->obd_name, rc); + if (rc > 0) + rc = -EINVAL; + GOTO(out_put, rc); + } else if (rc == buf.lb_len) { + tdtd->tdtd_committed_batchid = le64_to_cpu(tmp); + CDEBUG(D_HA, "%s: committed batchid %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_committed_batchid); + rc = 0; + } + +out_put: + if (rc < 0 && dt_obj != NULL) { + dt_object_put(env, dt_obj); + tdtd->tdtd_batchid_obj = NULL; + } + return rc; +} + +/** + * manage the distribute transaction thread + * + * Distribute transaction are linked to the list, and once the distribute + * transaction is committed, it will update the last committed batchid first, + * after it is committed, it will cancel the records. + * + * \param[in] _arg argument for commit thread + * + * \retval 0 if thread is running successfully + * \retval negative errno if the thread can not be run. + */ +static int distribute_txn_commit_thread(void *_arg) +{ + struct target_distribute_txn_data *tdtd = _arg; + struct lu_target *lut = tdtd->tdtd_lut; + struct ptlrpc_thread *thread = &lut->lut_tdtd_commit_thread; + struct l_wait_info lwi = { 0 }; + struct lu_env env; + struct list_head list; + int rc; + struct top_multiple_thandle *tmt; + struct top_multiple_thandle *tmp; + __u64 batchid = 0, committed; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MD_THREAD); + if (rc != 0) + RETURN(rc); + + spin_lock(&tdtd->tdtd_batchid_lock); + thread->t_flags = SVC_RUNNING; + spin_unlock(&tdtd->tdtd_batchid_lock); + wake_up(&thread->t_ctl_waitq); + INIT_LIST_HEAD(&list); + + CDEBUG(D_HA, "%s: start commit thread committed batchid %llu\n", + tdtd->tdtd_lut->lut_obd->obd_name, + tdtd->tdtd_committed_batchid); + + while (distribute_txn_commit_thread_running(lut)) { + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list, + tmt_commit_list) { + if (tmt->tmt_committed == 0) + break; + + /* Note: right now, replay is based on master MDT + * transno, but cancellation is based on batchid. + * so we do not try to cancel the update log until + * the recoverying is done, unless the update records + * batchid < committed_batchid. 
*/ + if (tmt->tmt_batchid <= tdtd->tdtd_committed_batchid) { + list_move_tail(&tmt->tmt_commit_list, &list); + } else if (!tdtd->tdtd_lut->lut_obd->obd_recovering) { + LASSERTF(tmt->tmt_batchid >= batchid, + "tmt %p tmt_batchid: %llu, batchid " + "%llu\n", tmt, tmt->tmt_batchid, + batchid); + /* There are three types of distribution + * transaction result + * + * 1. If tmt_result < 0, it means the + * distribution transaction fails, which should + * be rare, because once declare phase succeeds, + * the operation should succeeds anyway. Note in + * this case, we will still update batchid so + * cancellation would be stopped. + * + * 2. If tmt_result == 0, it means the + * distribution transaction succeeds, and we + * will update batchid. + * + * 3. If tmt_result > 0, it means distribute + * transaction is not yet committed on every + * node, but we need release this tmt before + * that, which usuually happens during umount. + */ + if (tmt->tmt_result <= 0) + batchid = tmt->tmt_batchid; + list_move_tail(&tmt->tmt_commit_list, &list); + } + } + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_HA, "%s: batchid: %llu committed batchid " + "%llu\n", tdtd->tdtd_lut->lut_obd->obd_name, batchid, + tdtd->tdtd_committed_batchid); + /* update globally committed on a storage */ + if (batchid > tdtd->tdtd_committed_batchid) { + rc = distribute_txn_commit_batchid_update(&env, tdtd, + batchid); + if (rc == 0) + batchid = 0; + } + /* cancel the records for committed batchid's */ + /* XXX: should we postpone cancel's till the end of recovery? */ + committed = tdtd->tdtd_committed_batchid; + list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) { + if (tmt->tmt_batchid > committed) + break; + list_del_init(&tmt->tmt_commit_list); + if (tmt->tmt_result <= 0) + distribute_txn_cancel_records(&env, tmt); + top_multiple_thandle_put(tmt); + } + + l_wait_event(tdtd->tdtd_commit_thread_waitq, + !distribute_txn_commit_thread_running(lut) || + committed < tdtd->tdtd_committed_batchid || + tdtd_ready_for_cancel_log(tdtd), &lwi); + }; + + l_wait_event(tdtd->tdtd_commit_thread_waitq, + atomic_read(&tdtd->tdtd_refcount) == 0, &lwi); + + spin_lock(&tdtd->tdtd_batchid_lock); + list_for_each_entry_safe(tmt, tmp, &tdtd->tdtd_list, + tmt_commit_list) + list_move_tail(&tmt->tmt_commit_list, &list); + spin_unlock(&tdtd->tdtd_batchid_lock); + + CDEBUG(D_INFO, "%s stopping distribute txn commit thread.\n", + tdtd->tdtd_lut->lut_obd->obd_name); + list_for_each_entry_safe(tmt, tmp, &list, tmt_commit_list) { + list_del_init(&tmt->tmt_commit_list); + top_multiple_thandle_dump(tmt, D_HA); + top_multiple_thandle_put(tmt); + } + + thread->t_flags = SVC_STOPPED; + lu_env_fini(&env); + wake_up(&thread->t_ctl_waitq); + + RETURN(0); +} + +/** + * Start llog cancel thread + * + * Start llog cancel(master/slave) thread on LOD + * + * \param[in]lclt cancel log thread to be started. + * + * \retval 0 if the thread is started successfully. + * \retval negative errno if the thread is not being + * started. 
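+ * + * The commit thread is started as "dist_txn-<index>", and starting it is + * skipped entirely when the bottom device is read-only.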
+ */ +int distribute_txn_init(const struct lu_env *env, + struct lu_target *lut, + struct target_distribute_txn_data *tdtd, + __u32 index) +{ + struct task_struct *task; + struct l_wait_info lwi = { 0 }; + int rc; + ENTRY; + + INIT_LIST_HEAD(&tdtd->tdtd_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_finish_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_list); + spin_lock_init(&tdtd->tdtd_batchid_lock); + spin_lock_init(&tdtd->tdtd_replay_list_lock); + tdtd->tdtd_replay_handler = distribute_txn_replay_handle; + tdtd->tdtd_replay_ready = 0; + + tdtd->tdtd_batchid = lut->lut_last_transno + 1; + + init_waitqueue_head(&lut->lut_tdtd_commit_thread.t_ctl_waitq); + init_waitqueue_head(&tdtd->tdtd_commit_thread_waitq); + init_waitqueue_head(&tdtd->tdtd_recovery_threads_waitq); + atomic_set(&tdtd->tdtd_refcount, 0); + atomic_set(&tdtd->tdtd_recovery_threads_count, 0); + + tdtd->tdtd_lut = lut; + if (lut->lut_bottom->dd_rdonly) + RETURN(0); + + rc = distribute_txn_commit_batchid_init(env, tdtd); + if (rc != 0) + RETURN(rc); + + task = kthread_run(distribute_txn_commit_thread, tdtd, "dist_txn-%u", + index); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + + l_wait_event(lut->lut_tdtd_commit_thread.t_ctl_waitq, + distribute_txn_commit_thread_running(lut) || + distribute_txn_commit_thread_stopped(lut), &lwi); + RETURN(0); +} +EXPORT_SYMBOL(distribute_txn_init); + +/** + * Stop llog cancel thread + * + * Stop llog cancel(master/slave) thread on LOD and also destory + * all of transaction in the list. + * + * \param[in]lclt cancel log thread to be stopped. + */ +void distribute_txn_fini(const struct lu_env *env, + struct target_distribute_txn_data *tdtd) +{ + struct lu_target *lut = tdtd->tdtd_lut; + + /* Stop cancel thread */ + if (lut == NULL || !distribute_txn_commit_thread_running(lut)) + return; + + spin_lock(&tdtd->tdtd_batchid_lock); + lut->lut_tdtd_commit_thread.t_flags = SVC_STOPPING; + spin_unlock(&tdtd->tdtd_batchid_lock); + wake_up(&tdtd->tdtd_commit_thread_waitq); + wait_event(lut->lut_tdtd_commit_thread.t_ctl_waitq, + lut->lut_tdtd_commit_thread.t_flags & SVC_STOPPED); + + dtrq_list_destroy(tdtd); + if (tdtd->tdtd_batchid_obj != NULL) { + dt_object_put(env, tdtd->tdtd_batchid_obj); + tdtd->tdtd_batchid_obj = NULL; + } +} +EXPORT_SYMBOL(distribute_txn_fini); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h new file mode 100644 index 0000000000000..c0f5f459ae9c1 --- /dev/null +++ b/drivers/staging/lustrefsx/undef.h @@ -0,0 +1,990 @@ + +/* enable libcfs CDEBUG, CWARN */ +#undef CDEBUG_ENABLED + +/* enable libcfs ENTRY/EXIT */ +#undef CDEBUG_ENTRY_EXIT + +/* enable page state tracking code */ +#undef CONFIG_DEBUG_PAGESTATE_TRACKING + +/* enable encryption for ldiskfs */ +#undef CONFIG_LDISKFS_FS_ENCRYPTION + +/* posix acls for ldiskfs */ +#undef CONFIG_LDISKFS_FS_POSIX_ACL + +/* enable rw access for ldiskfs */ +#undef CONFIG_LDISKFS_FS_RW + +/* fs security for ldiskfs */ +#undef CONFIG_LDISKFS_FS_SECURITY + +/* extened attributes for ldiskfs */ +#undef CONFIG_LDISKFS_FS_XATTR + +/* Max LNET payload */ +#undef CONFIG_LNET_MAX_PAYLOAD + +/* enable invariant checking */ +#undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK + +/* IOCTL Buffer Size */ +#undef CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +/* kernel has cpu affinity support */ +#undef CPU_AFFINITY + +/* both i_dentry/d_alias uses list */ +#undef DATA_FOR_LLITE_IS_LIST + +/* name of ldiskfs debug program */ +#undef DEBUGFS + +/* name of ldiskfs dump program */ +#undef DUMPE2FS + +/* name of ldiskfs fsck program */ +#undef E2FSCK + 
+/* name of ldiskfs e2fsprogs package */ +#undef E2FSPROGS + +/* name of ldiskfs label program */ +#undef E2LABEL + +/* do data checksums */ +#undef ENABLE_CHECKSUM + +/* Use the Pinger */ +#undef ENABLE_PINGER + +/* Define to 1 if you have the header file. */ +#undef HAVE_ASM_TYPES_H + +/* backing_dev_info exist */ +#undef HAVE_BACKING_DEV_INFO + +/* BDI_CAP_MAP_COPY exist */ +#undef HAVE_BDI_CAP_MAP_COPY + +/* bio_endio takes only one argument */ +#undef HAVE_BIO_ENDIO_USES_ONE_ARG + +/* bio_end_sector is defined */ +#undef HAVE_BIO_END_SECTOR + +/* 'bio_integrity_enabled' is available */ +#undef HAVE_BIO_INTEGRITY_ENABLED + +/* 'bi_bdev' is available */ +#undef HAVE_BI_BDEV + +/* struct bio has bi_cnt */ +#undef HAVE_BI_CNT + +/* struct bio has bi_rw */ +#undef HAVE_BI_RW + +/* 'bi_status' is available */ +#undef HAVE_BI_STATUS + +/* blkdev_get_by_dev is exported by the kernel */ +#undef HAVE_BLKDEV_GET_BY_DEV + +/* Define to 1 if you have the header file. */ +#undef HAVE_BLKID_BLKID_H + +/* blk_plug struct exists */ +#undef HAVE_BLK_PLUG + +/* blk_queue_max_segments is defined */ +#undef HAVE_BLK_QUEUE_MAX_SEGMENTS + +/* kernel has struct bvec_iter */ +#undef HAVE_BVEC_ITER + +/* cache_head has hlist cache_list */ +#undef HAVE_CACHE_HEAD_HLIST + +/* have cache_register */ +#undef HAVE_CACHE_REGISTER + +/* cancel_dirty_page is still available */ +#undef HAVE_CANCEL_DIRTY_PAGE + +/* kernel has clean_bdev_aliases */ +#undef HAVE_CLEAN_BDEV_ALIASES + +/* have clear_inode */ +#undef HAVE_CLEAR_INODE + +/* compat rdma found */ +#undef HAVE_COMPAT_RDMA + +/* cpumap_print_to_pagebuf is available */ +#undef HAVE_CPUMASK_PRINT_TO_PAGEBUF + +/* kernel compiled with CRC32 functions */ +#undef HAVE_CRC32 + +/* struct cred has member tgcred */ +#undef HAVE_CRED_TGCRED + +/* crypto hash helper functions are available */ +#undef HAVE_CRYPTO_HASH_HELPERS + +/* current_time() has replaced CURRENT_TIME */ +#undef HAVE_CURRENT_TIME + +/* dcache_lock is exist */ +#undef HAVE_DCACHE_LOCK + +/* kernel export delete_from_page_cache */ +#undef HAVE_DELETE_FROM_PAGE_CACHE + +/* dentry.d_child exist */ +#undef HAVE_DENTRY_D_CHILD + +/* hlist dentry.d_u.d_alias exist */ +#undef HAVE_DENTRY_D_U_D_ALIAS + +/* dentry_open uses struct path as first argument */ +#undef HAVE_DENTRY_OPEN_USE_PATH + +/* direct_IO need 2 arguments */ +#undef HAVE_DIRECTIO_2ARGS + +/* direct IO uses iov_iter */ +#undef HAVE_DIRECTIO_ITER + +/* dirty_inode super_operation takes flag */ +#undef HAVE_DIRTY_INODE_HAS_FLAG + +/* dir_context exist */ +#undef HAVE_DIR_CONTEXT + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_DLFCN_H + +/* Have dmu_object_alloc_dnsize in ZFS */ +#undef HAVE_DMU_OBJECT_ALLOC_DNSIZE + +/* Have dmu_objset_disown() with 3 args */ +#undef HAVE_DMU_OBJSET_DISOWN_3ARG + +/* Have dmu_objset_own() with 6 args */ +#undef HAVE_DMU_OBJSET_OWN_6ARG + +/* Have 6 argument dmu_pretch in ZFS */ +#undef HAVE_DMU_PREFETCH_6ARG + +/* Have dmu_read_by_dnode() in ZFS */ +#undef HAVE_DMU_READ_BY_DNODE + +/* Have dmu_tx_hold_write_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_WRITE_BY_DNODE + +/* Have dmu_tx_hold_zap_by_dnode() in ZFS */ +#undef HAVE_DMU_TX_HOLD_ZAP_BY_DNODE + +/* Have dmu_tx_mark_netfree */ +#undef HAVE_DMU_TX_MARK_NETFREE + +/* Have native dnode accounting in ZFS */ +#undef HAVE_DMU_USEROBJ_ACCOUNTING + +/* Have dmu_write_by_dnode() in ZFS */ +#undef HAVE_DMU_WRITE_BY_DNODE + +/* quotactl_ops.set_dqblk takes struct fs_disk_quota */ +#undef HAVE_DQUOT_FS_DISK_QUOTA + +/* quotactl_ops.set_dqblk takes struct kqid */ +#undef HAVE_DQUOT_KQID + +/* quotactl_ops.set_dqblk takes struct qc_dqblk */ +#undef HAVE_DQUOT_QC_DQBLK + +/* dquot_suspend is defined */ +#undef HAVE_DQUOT_SUSPEND + +/* Have dsl_pool_config_enter/exit in ZFS */ +#undef HAVE_DSL_POOL_CONFIG + +/* Have dsl_sync_task_do_nowait in ZFS */ +#undef HAVE_DSL_SYNC_TASK_DO_NOWAIT + +/* dump_trace want address argument */ +#undef HAVE_DUMP_TRACE_ADDRESS + +/* d_compare need 4 arguments */ +#undef HAVE_D_COMPARE_4ARGS + +/* d_compare need 5 arguments */ +#undef HAVE_D_COMPARE_5ARGS + +/* d_compare need 7 arguments */ +#undef HAVE_D_COMPARE_7ARGS + +/* d_count exist */ +#undef HAVE_D_COUNT + +/* d_delete first parameter declared is not const */ +#undef HAVE_D_DELETE_CONST + +/* have d_make_root */ +#undef HAVE_D_MAKE_ROOT + +/* have parent inode as parameter */ +#undef HAVE_ENCODE_FH_PARENT + +/* Define to 1 if you have the header file. */ +#undef HAVE_ENDIAN_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_EXT2FS_EXT2FS_H + +/* ext4_bread takes 4 arguments */ +#undef HAVE_EXT4_BREAD_4ARGS + +/* i_dquot is in ext4_inode_info */ +#undef HAVE_EXT4_INFO_DQUOT + +/* ext4_free_blocks do not require struct buffer_head */ +#undef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD + +/* Linux kernel has ext_pblock */ +#undef HAVE_EXT_PBLOCK + +/* file handle and related syscalls are supported */ +#undef HAVE_FHANDLE_GLIBC_SUPPORT + +/* kernel supports fhandles and related syscalls */ +#undef HAVE_FHANDLE_SYSCALLS + +/* kernel has file_dentry */ +#undef HAVE_FILE_DENTRY + +/* file_operations.fsync takes 2 arguments */ +#undef HAVE_FILE_FSYNC_2ARGS + +/* file_operations.fsync takes 4 arguments */ +#undef HAVE_FILE_FSYNC_4ARGS + +/* struct file has member f_inode */ +#undef HAVE_FILE_F_INODE + +/* file_inode() has been defined */ +#undef HAVE_FILE_INODE + +/* generic_file_llseek_size is exported by the kernel */ +#undef HAVE_FILE_LLSEEK_SIZE + +/* kernel has generic_file_llseek_size with 5 args */ +#undef HAVE_FILE_LLSEEK_SIZE_5ARGS + +/* file_operations.[read|write]_iter functions exist */ +#undef HAVE_FILE_OPERATIONS_READ_WRITE_ITER + +/* filldir_t needs struct dir_context as argument */ +#undef HAVE_FILLDIR_USE_CTX + +/* fpu/api.h is present */ +#undef HAVE_FPU_API_HEADER + +/* struct file_system_type has mount field */ +#undef HAVE_FSTYPE_MOUNT + +/* fs_struct.lock use rwlock */ +#undef HAVE_FS_STRUCT_RWLOCK + +/* fs_struct use seqcount */ +#undef HAVE_FS_STRUCT_SEQCOUNT + +/* full_name_hash need 3 arguments */ +#undef HAVE_FULL_NAME_HASH_3ARGS + +/* generic_permission taken 2 arguments */ +#undef HAVE_GENERIC_PERMISSION_2ARGS + +/* generic_permission taken 4 arguments */ +#undef HAVE_GENERIC_PERMISSION_4ARGS + +/* generic_write_sync need 2 arguments */ +#undef HAVE_GENERIC_WRITE_SYNC_2ARGS + +/* Define to 1 if you have the `gethostbyname' function. 
*/ +#undef HAVE_GETHOSTBYNAME + +/* get_user_pages takes 6 arguments */ +#undef HAVE_GET_USER_PAGES_6ARG + +/* get_user_pages takes gup_flags in arguments */ +#undef HAVE_GET_USER_PAGES_GUP_FLAGS + +/* struct group_info has member gid */ +#undef HAVE_GROUP_INFO_GID + +/* Define this is if you enable gss */ +#undef HAVE_GSS + +/* Define this if you enable gss keyring backend */ +#undef HAVE_GSS_KEYRING + +/* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ +#undef HAVE_GSS_KRB5_CCACHE_NAME + +/* Define this if you have Heimdal Kerberos libraries */ +#undef HAVE_HEIMDAL + +/* hlist_add_after is available */ +#undef HAVE_HLIST_ADD_AFTER + +/* hlist_for_each_entry has 3 args */ +#undef HAVE_HLIST_FOR_EACH_3ARG + +/* hotplug state machine is supported */ +#undef HAVE_HOTPLUG_STATE_MACHINE + +/* ib_alloc_fast_reg_mr is defined */ +#undef HAVE_IB_ALLOC_FAST_REG_MR + +/* ib_alloc_pd has 2 arguments */ +#undef HAVE_IB_ALLOC_PD_2ARGS + +/* struct ib_cq_init_attr is used by ib_create_cq */ +#undef HAVE_IB_CQ_INIT_ATTR + +/* struct ib_device.attrs is defined */ +#undef HAVE_IB_DEVICE_ATTRS + +/* ib_get_dma_mr is defined */ +#undef HAVE_IB_GET_DMA_MR + +/* function ib_inc_rkey exist */ +#undef HAVE_IB_INC_RKEY + +/* ib_map_mr_sg exists */ +#undef HAVE_IB_MAP_MR_SG + +/* ib_map_mr_sg has 5 arguments */ +#undef HAVE_IB_MAP_MR_SG_5ARGS + +/* struct ib_rdma_wr is defined */ +#undef HAVE_IB_RDMA_WR + +/* inode_operations .getattr member function can gather advance stats */ +#undef HAVE_INODEOPS_ENHANCED_GETATTR + +/* inode_operations has .truncate member function */ +#undef HAVE_INODEOPS_TRUNCATE + +/* inode_operations use umode_t as parameter */ +#undef HAVE_INODEOPS_USE_UMODE_T + +/* inode->i_alloc_sem is killed and use inode_dio_wait */ +#undef HAVE_INODE_DIO_WAIT + +/* inode.i_rcu exists */ +#undef HAVE_INODE_I_RCU + +/* inode_lock is defined */ +#undef HAVE_INODE_LOCK + +/* inode_owner_or_capable exist */ +#undef HAVE_INODE_OWNER_OR_CAPABLE + +/* inode_operations->permission has two args */ +#undef HAVE_INODE_PERMISION_2ARGS + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_INTTYPES_H + +/* address_space_operations.invalidatepage needs 3 arguments */ +#undef HAVE_INVALIDATE_RANGE + +/* have in_compat_syscall */ +#undef HAVE_IN_COMPAT_SYSCALL + +/* inode_operations->rename need flags as argument */ +#undef HAVE_IOPS_RENAME_WITH_FLAGS + +/* have iop atomic_open */ +#undef HAVE_IOP_ATOMIC_OPEN + +/* generic_readlink has been removed */ +#undef HAVE_IOP_GENERIC_READLINK + +/* inode_operations has .get_acl member function */ +#undef HAVE_IOP_GET_ACL + +/* have iop get_link */ +#undef HAVE_IOP_GET_LINK + +/* inode_operations has .set_acl member function */ +#undef HAVE_IOP_SET_ACL + +/* inode_operations has {get,set,remove}xattr members */ +#undef HAVE_IOP_XATTR + +/* iov_iter_init handles directional tag */ +#undef HAVE_IOV_ITER_INIT_DIRECTION + +/* iov_iter_rw exist */ +#undef HAVE_IOV_ITER_RW + +/* iov_iter_truncate exists */ +#undef HAVE_IOV_ITER_TRUNCATE + +/* is_sxid is defined */ +#undef HAVE_IS_SXID + +/* i_uid_read is present */ +#undef HAVE_I_UID_READ + +/* kernel_locked is defined */ +#undef HAVE_KERNEL_LOCKED + +/* 'struct sock' accept function requires bool argument */ +#undef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG + +/* struct key_match_data exist */ +#undef HAVE_KEY_MATCH_DATA + +/* payload.data is an array */ +#undef HAVE_KEY_PAYLOAD_DATA_ARRAY + +/* key_type->instantiate has two args */ +#undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS + +/* ki_left exist */ +#undef HAVE_KIOCB_KI_LEFT + +/* ki_nbytes field exist */ +#undef HAVE_KI_NBYTES + +/* have kmap_atomic has only 1 argument */ +#undef HAVE_KMAP_ATOMIC_HAS_1ARG + +/* kmap_to_page is exported by the kernel */ +#undef HAVE_KMAP_TO_PAGE + +/* Define this if you have MIT Kerberos libraries */ +#undef HAVE_KRB5 + +/* Define this if the function krb5int_derive_key is available */ +#undef HAVE_KRB5INT_DERIVE_KEY + +/* Define this if the function krb5_derive_key is available */ +#undef HAVE_KRB5_DERIVE_KEY + +/* Define this if the function krb5_get_error_message is available */ +#undef HAVE_KRB5_GET_ERROR_MESSAGE + +/* Define this if the function krb5_get_init_creds_opt_set_addressless is + available */ +#undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS + +/* kernel has kstrtoul */ +#undef HAVE_KSTRTOUL + +/* kthread_worker found */ +#undef HAVE_KTHREAD_WORK + +/* ktime_add is available */ +#undef HAVE_KTIME_ADD + +/* ktime_after is available */ +#undef HAVE_KTIME_AFTER + +/* ktime_before is available */ +#undef HAVE_KTIME_BEFORE + +/* ktime_compare is available */ +#undef HAVE_KTIME_COMPARE + +/* 'ktime_get_real_seconds' is available */ +#undef HAVE_KTIME_GET_REAL_SECONDS + +/* 'ktime_get_real_ts64' is available */ +#undef HAVE_KTIME_GET_REAL_TS64 + +/* 'ktime_get_seconds' is available */ +#undef HAVE_KTIME_GET_SECONDS + +/* 'ktime_get_ts64' is available */ +#undef HAVE_KTIME_GET_TS64 + +/* 'ktime_to_timespec64' is available */ +#undef HAVE_KTIME_TO_TIMESPEC64 + +/* enable use of ldiskfsprogs package */ +#undef HAVE_LDISKFSPROGS + +/* kernel has ext4_map_blocks */ +#undef HAVE_LDISKFS_MAP_BLOCKS + +/* Enable ldiskfs osd */ +#undef HAVE_LDISKFS_OSD + +/* libefence support is requested */ +#undef HAVE_LIBEFENCE + +/* Define to 1 if you have the `keyutils' library (-lkeyutils). */ +#undef HAVE_LIBKEYUTILS + +/* build with libmount */ +#undef HAVE_LIBMOUNT + +/* use libpthread for libcfs library */ +#undef HAVE_LIBPTHREAD + +/* readline library is available */ +#undef HAVE_LIBREADLINE + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_LINUX_RANDOM_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_TYPES_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_UNISTD_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_VERSION_H + +/* lock-manager ops renamed to lm_xxx */ +#undef HAVE_LM_XXX_LOCK_MANAGER_OPS + +/* kernel has locks_lock_file_wait */ +#undef HAVE_LOCKS_LOCK_FILE_WAIT + +/* kernel has LOOP_CTL_GET_FREE */ +#undef HAVE_LOOP_CTL_GET_FREE + +/* Enable lru resize support */ +#undef HAVE_LRU_RESIZE_SUPPORT + +/* Define this if the Kerberos GSS library supports + gss_krb5_export_lucid_sec_context */ +#undef HAVE_LUCID_CONTEXT_SUPPORT + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* address_space_operations.migratepage has 4 args */ +#undef HAVE_MIGRATEPAGE_4ARGS + +/* kernel has include/linux/migrate.h */ +#undef HAVE_MIGRATE_H + +/* kernel has include/linux/migrate_mode.h */ +#undef HAVE_MIGRATE_MODE_H + +/* kernel module loading is possible */ +#undef HAVE_MODULE_LOADING_SUPPORT + +/* Define to 1 if you have the `name_to_handle_at' function. */ +#undef HAVE_NAME_TO_HANDLE_AT + +/* Define to 1 if you have the header file. */ +#undef HAVE_NETDB_H + +/* cancel_dirty_page with one arguement is available */ +#undef HAVE_NEW_CANCEL_DIRTY_PAGE + +/* 'kernel_write' aligns with read/write helpers */ +#undef HAVE_NEW_KERNEL_WRITE + +/* with oldsize */ +#undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE + +/* OpenSSL HMAC functions needed for SSK */ +#undef HAVE_OPENSSL_SSK + +/* 'pagevec_init' takes one parameter */ +#undef HAVE_PAGEVEC_INIT_ONE_PARAM + +/* have PCLMULQDQ instruction */ +#undef HAVE_PCLMULQDQ + +/* percpu_counter_init uses GFP_* flag */ +#undef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG + +/* 'struct nsproxy' has 'pid_ns_for_children' */ +#undef HAVE_PID_NS_FOR_CHILDREN + +/* posix_acl_to_xattr takes struct user_namespace */ +#undef HAVE_POSIXACL_USER_NS + +/* 'posix_acl_update_mode' is available */ +#undef HAVE_POSIX_ACL_UPDATE_MODE + +/* posix_acl_valid takes struct user_namespace */ +#undef HAVE_POSIX_ACL_VALID_USER_NS + +/* proc_remove is defined */ +#undef HAVE_PROC_REMOVE + +/* get_projid function exists */ +#undef HAVE_PROJECT_QUOTA + +/* inode->i_nlink is protected from direct modification */ +#undef HAVE_PROTECT_I_NLINK + +/* have quota64 */ +#undef HAVE_QUOTA64 + +/* radix_tree_exceptional_entry exist */ +#undef HAVE_RADIX_EXCEPTION_ENTRY + +/* rdma_create_id wants 4 args */ +#undef HAVE_RDMA_CREATE_ID_4ARG + +/* rdma_create_id wants 5 args */ +#undef HAVE_RDMA_CREATE_ID_5ARG + +/* reinit_completion is exist */ +#undef HAVE_REINIT_COMPLETION + +/* kernel export remove_from_page_cache */ +#undef HAVE_REMOVE_FROM_PAGE_CACHE + +/* remove_proc_subtree is defined */ +#undef HAVE_REMOVE_PROC_SUBTREE + +/* Have sa_spill_alloc in ZFS */ +#undef HAVE_SA_SPILL_ALLOC + +/* super_operations.evict_inode() is exist in kernel */ +#undef HAVE_SBOPS_EVICT_INODE + +/* kernel supports wrapped FS freeze functions */ +#undef HAVE_SB_START_WRITE + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_SCHED_H + +/* linux/sched header directory exist */ +#undef HAVE_SCHED_HEADERS + +/* security_dentry_init_security' is defined */ +#undef HAVE_SECURITY_DENTRY_INIT_SECURITY + +/* security_inode_init_security takes a callback to set xattrs */ +#undef HAVE_SECURITY_IINITSEC_CALLBACK + +/* security_inode_init_security takes a 'struct qstr' parameter */ +#undef HAVE_SECURITY_IINITSEC_QSTR + +/* support for selinux */ +#undef HAVE_SELINUX + +/* Define to 1 if you have the header file. */ +#undef HAVE_SELINUX_SELINUX_H + +/* support server */ +#undef HAVE_SERVER_SUPPORT + +/* Define to 1 if you have the `setns' function. */ +#undef HAVE_SETNS + +/* Define this if the Kerberos GSS library supports + gss_krb5_set_allowable_enctypes */ +#undef HAVE_SET_ALLOWABLE_ENCTYPES + +/* shrinker has count_objects member */ +#undef HAVE_SHRINKER_COUNT + +/* shrinker want self pointer in handler */ +#undef HAVE_SHRINKER_WANT_SHRINK_PTR + +/* shrink_control is present */ +#undef HAVE_SHRINK_CONTROL + +/* simple_setattr is exported by the kernel */ +#undef HAVE_SIMPLE_SETATTR + +/* sk_data_ready uses only one argument */ +#undef HAVE_SK_DATA_READY_ONE_ARG + +/* kernel has sk_sleep */ +#undef HAVE_SK_SLEEP + +/* sock_alloc_file is exported */ +#undef HAVE_SOCK_ALLOC_FILE + +/* sock_alloc_file takes 3 arguments */ +#undef HAVE_SOCK_ALLOC_FILE_3ARGS + +/* sock_create_kern use net as first parameter */ +#undef HAVE_SOCK_CREATE_KERN_USE_NET + +/* Have spa_maxblocksize in ZFS */ +#undef HAVE_SPA_MAXBLOCKSIZE + +/* spinlock_t is defined */ +#undef HAVE_SPINLOCK_T + +/* struct stacktrace_ops exists */ +#undef HAVE_STACKTRACE_OPS + +/* stacktrace_ops.warning is exist */ +#undef HAVE_STACKTRACE_WARNING + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the `strlcat' function. */ +#undef HAVE_STRLCAT + +/* Define to 1 if you have the `strlcpy' function. */ +#undef HAVE_STRLCPY + +/* Define to 1 if you have the `strnlen' function. */ +#undef HAVE_STRNLEN + +/* struct posix_acl_xattr_{header,entry} defined */ +#undef HAVE_STRUCT_POSIX_ACL_XATTR + +/* submit_bio takes two arguments */ +#undef HAVE_SUBMIT_BIO_2ARGS + +/* sunrpc_cache_pipe_upcall takes 3 args */ +#undef HAVE_SUNRPC_UPCALL_HAS_3ARGS + +/* super_operations use dentry as parameter */ +#undef HAVE_SUPEROPS_USE_DENTRY + +/* 'super_setup_bdi_name' is available */ +#undef HAVE_SUPER_SETUP_BDI_NAME + +/* symlink inode operations need struct nameidata argument */ +#undef HAVE_SYMLINK_OPS_USE_NAMEIDATA + +/* new_sync_[read|write] is exported by the kernel */ +#undef HAVE_SYNC_READ_WRITE + +/* ctl_table has ctl_name field */ +#undef HAVE_SYSCTL_CTLNAME + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_IOCTL_H + +/* Define to 1 if you have . */ +#undef HAVE_SYS_QUOTA_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_SYS_TYPES_H + +/* tcp_sendpage use socket as first parameter */ +#undef HAVE_TCP_SENDPAGE_USE_SOCKET + +/* 'struct timespec64' is available */ +#undef HAVE_TIMESPEC64 + +/* 'timespec64_sub' is available */ +#undef HAVE_TIMESPEC64_SUB + +/* 'timespec64_to_ktime' is available */ +#undef HAVE_TIMESPEC64_TO_KTIME + +/* topology_sibling_cpumask is available */ +#undef HAVE_TOPOLOGY_SIBLING_CPUMASK + +/* kernel export truncate_complete_page */ +#undef HAVE_TRUNCATE_COMPLETE_PAGE + +/* kernel has truncate_inode_pages_final */ +#undef HAVE_TRUNCATE_INODE_PAGES_FINAL + +/* uidgid.h is present */ +#undef HAVE_UIDGID_HEADER + +/* Define to 1 if you have the header file. */ +#undef HAVE_UNISTD_H + +/* kernel has vfs_rename with 5 args */ +#undef HAVE_VFS_RENAME_5ARGS + +/* kernel has vfs_rename with 6 args */ +#undef HAVE_VFS_RENAME_6ARGS + +/* '__vfs_setxattr is available */ +#undef HAVE_VFS_SETXATTR + +/* kernel has vfs_unlink with 3 args */ +#undef HAVE_VFS_UNLINK_3ARGS + +/* virtual_address has been replaced by address field */ +#undef HAVE_VM_FAULT_ADDRESS + +/* 'struct vm_operations' remove struct vm_area_struct argument */ +#undef HAVE_VM_OPS_USE_VM_FAULT_ONLY + +/* 'wait_queue_entry_t' is available */ +#undef HAVE_WAIT_QUEUE_ENTRY + +/* flags field exist */ +#undef HAVE_XATTR_HANDLER_FLAGS + +/* needs inode parameter */ +#undef HAVE_XATTR_HANDLER_INODE_PARAM + +/* handler pointer is parameter */ +#undef HAVE_XATTR_HANDLER_SIMPLIFIED + +/* Have zap_add_by_dnode() in ZFS */ +#undef HAVE_ZAP_ADD_BY_DNODE + +/* Have zap_lookup_by_dnode() in ZFS */ +#undef HAVE_ZAP_LOOKUP_BY_DNODE + +/* Have zap_remove_by_dnode() in ZFS */ +#undef HAVE_ZAP_REMOVE_ADD_BY_DNODE + +/* Enable zfs osd */ +#undef HAVE_ZFS_OSD + +/* __add_wait_queue_exclusive exists */ +#undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE + +/* ext4_journal_start takes 3 arguments */ +#undef JOURNAL_START_HAS_3ARGS + +/* Define this as the Kerberos version number */ +#undef KRB5_VERSION + +/* enable libcfs LASSERT, LASSERTF */ +#undef LIBCFS_DEBUG + +/* use dumplog on panic */ +#undef LNET_DUMP_ON_PANIC + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#undef LT_OBJDIR + +/* Fourth number in the Lustre version */ +#undef LUSTRE_FIX + +/* First number in the Lustre version */ +#undef LUSTRE_MAJOR + +/* Second number in the Lustre version */ +#undef LUSTRE_MINOR + +/* Third number in the Lustre version */ +#undef LUSTRE_PATCH + +/* A copy of PACKAGE_VERSION */ +#undef LUSTRE_VERSION_STRING + +/* maximum number of MDS threads */ +#undef MDS_MAX_THREADS + +/* Report minimum OST free space */ +#undef MIN_DF + +/* name of ldiskfs mkfs program */ +#undef MKE2FS + +/* need pclmulqdq based crc32c */ +#undef NEED_CRC32C_ACCEL + +/* need pclmulqdq based crc32 */ +#undef NEED_CRC32_ACCEL + +/* 'ktime_get_real_ns' is not available */ +#undef NEED_KTIME_GET_REAL_NS + +/* enable nodemap proc debug support */ +#undef NODEMAP_PROC_DEBUG + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. 
*/ +#undef PACKAGE_VERSION + +/* name of parallel fsck program */ +#undef PFSCK + +/* enable randomly alloc failure */ +#undef RANDOM_FAIL_ALLOC + +/* The size of `unsigned long long', as computed by sizeof. */ +#undef SIZEOF_UNSIGNED_LONG_LONG + +/* use tunable backoff TCP */ +#undef SOCKNAL_BACKOFF + +/* tunable backoff TCP in ms */ +#undef SOCKNAL_BACKOFF_MS + +/* 'struct stacktrace_ops' address function returns an int */ +#undef STACKTRACE_OPS_ADDRESS_RETURN_INT + +/* 'struct stacktrace_ops' has 'walk_stack' field */ +#undef STACKTRACE_OPS_HAVE_WALK_STACK + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* name of ldiskfs tune program */ +#undef TUNE2FS + +/* Define this if the private function, gss_krb5_cache_name, must be used to + tell the Kerberos library which credentials cache to use. Otherwise, this + is done by setting the KRB5CCNAME environment variable */ +#undef USE_GSS_KRB5_CCACHE_NAME + +/* Write when Checking Health */ +#undef USE_HEALTH_CHECK_WRITE + +/* enable lu_ref reference tracking code */ +#undef USE_LU_REF + +/* Version number of package */ +#undef VERSION + +/* zfs fix version */ +#undef ZFS_FIX + +/* zfs major version */ +#undef ZFS_MAJOR + +/* zfs minor version */ +#undef ZFS_MINOR + +/* zfs patch version */ +#undef ZFS_PATCH From 839033d4c43deaeeba48ea91b99aa8e50a288ded Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 1 Mar 2019 17:59:57 +0000 Subject: [PATCH 030/737] Config glue for lustre client. Signed-off-by: Frank van der Linden --- drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 1 + drivers/staging/lustrefsx/Kconfig | 3 ++ drivers/staging/lustrefsx/Makefile | 3 ++ drivers/staging/lustrefsx/Makefile.rules | 6 +++ drivers/staging/lustrefsx/libcfs/Kconfig | 3 ++ drivers/staging/lustrefsx/libcfs/Makefile | 1 + .../staging/lustrefsx/libcfs/libcfs/Makefile | 19 +++++++++ drivers/staging/lustrefsx/list | 26 ++++++++++++ drivers/staging/lustrefsx/lnet/Kconfig | 37 +++++++++++++++++ drivers/staging/lustrefsx/lnet/Makefile | 3 ++ drivers/staging/lustrefsx/lnet/klnds/Makefile | 2 + .../lustrefsx/lnet/klnds/o2iblnd/Makefile | 5 +++ .../lustrefsx/lnet/klnds/socklnd/Makefile | 6 +++ drivers/staging/lustrefsx/lnet/lnet/Makefile | 8 ++++ .../staging/lustrefsx/lnet/selftest/Makefile | 6 +++ drivers/staging/lustrefsx/lustre/Kconfig | 41 +++++++++++++++++++ drivers/staging/lustrefsx/lustre/Makefile | 8 ++++ drivers/staging/lustrefsx/lustre/fid/Makefile | 5 +++ drivers/staging/lustrefsx/lustre/fld/Makefile | 8 ++++ .../staging/lustrefsx/lustre/llite/Makefile | 13 ++++++ drivers/staging/lustrefsx/lustre/lmv/Makefile | 5 +++ drivers/staging/lustrefsx/lustre/lov/Makefile | 8 ++++ drivers/staging/lustrefsx/lustre/mdc/Makefile | 6 +++ drivers/staging/lustrefsx/lustre/mgc/Makefile | 5 +++ .../lustrefsx/lustre/obdclass/Makefile | 16 ++++++++ .../staging/lustrefsx/lustre/obdecho/Makefile | 5 +++ drivers/staging/lustrefsx/lustre/osc/Makefile | 6 +++ .../staging/lustrefsx/lustre/ptlrpc/Makefile | 26 ++++++++++++ 29 files changed, 283 insertions(+) create mode 100644 drivers/staging/lustrefsx/Kconfig create mode 100644 drivers/staging/lustrefsx/Makefile create mode 100644 drivers/staging/lustrefsx/Makefile.rules create mode 100644 drivers/staging/lustrefsx/libcfs/Kconfig create mode 100644 drivers/staging/lustrefsx/libcfs/Makefile create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/Makefile create mode 100644 drivers/staging/lustrefsx/list create mode 100644 drivers/staging/lustrefsx/lnet/Kconfig create mode 100644 
drivers/staging/lustrefsx/lnet/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/klnds/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/lnet/Makefile create mode 100644 drivers/staging/lustrefsx/lnet/selftest/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/Kconfig create mode 100644 drivers/staging/lustrefsx/lustre/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/fid/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/fld/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/llite/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/lmv/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/lov/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/mdc/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/mgc/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/obdecho/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/osc/Makefile create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/Makefile diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 2d0310448eba0..7ac7b71259169 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -70,6 +70,8 @@ source "drivers/staging/fwserial/Kconfig" source "drivers/staging/goldfish/Kconfig" +source "drivers/staging/lustrefsx/Kconfig" + source "drivers/staging/netlogic/Kconfig" source "drivers/staging/gs_fpgaboot/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 757a892ab5b9a..968031df9e110 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_STAGING_BOARD) += board/ obj-$(CONFIG_LTE_GDM724X) += gdm724x/ obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/ obj-$(CONFIG_GOLDFISH) += goldfish/ +obj-$(CONFIG_LUSTREFSX_LNET) += lustrefsx/ obj-$(CONFIG_GS_FPGABOOT) += gs_fpgaboot/ obj-$(CONFIG_UNISYSSPAR) += unisys/ obj-$(CONFIG_COMMON_CLK_XLNX_CLKWZRD) += clocking-wizard/ diff --git a/drivers/staging/lustrefsx/Kconfig b/drivers/staging/lustrefsx/Kconfig new file mode 100644 index 0000000000000..81e9bc1043d76 --- /dev/null +++ b/drivers/staging/lustrefsx/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustrefsx/libcfs/Kconfig" +source "drivers/staging/lustrefsx/lnet/Kconfig" +source "drivers/staging/lustrefsx/lustre/Kconfig" diff --git a/drivers/staging/lustrefsx/Makefile b/drivers/staging/lustrefsx/Makefile new file mode 100644 index 0000000000000..20c7929213c3f --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_FS) += lustre/ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules new file mode 100644 index 0000000000000..a0d56e80f2ce7 --- /dev/null +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -0,0 +1,6 @@ +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/undef.h +ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include +ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/libcfs/Kconfig 
b/drivers/staging/lustrefsx/libcfs/Kconfig new file mode 100644 index 0000000000000..3675b8381af2e --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Kconfig @@ -0,0 +1,3 @@ +config LUSTREFSX_LIBCFS + depends on m + tristate "Lustre helper library" diff --git a/drivers/staging/lustrefsx/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/Makefile new file mode 100644 index 0000000000000..6c5ff83ac791a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs/ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile new file mode 100644 index 0000000000000..a487ba0329342 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile @@ -0,0 +1,19 @@ +obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o + +libcfs-linux-objs := linux-tracefile.o linux-debug.o linux-prim.o +libcfs-linux-objs += linux-cpu.o linux-curproc.o linux-module.o +libcfs-linux-objs += linux-crypto.o linux-crypto-adler.o +libcfs-linux-objs += linux-crypto-crc32.o + +libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) + +libcfs-all-objs := debug.o fail.o module.o tracefile.o watchdog.o +libcfs-all-objs += libcfs_string.o hash.o prng.o workitem.o +libcfs-all-objs += libcfs_cpu.o libcfs_mem.o libcfs_lock.o heap.o +libcfs-all-objs += libcfs_ptask.o + +libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs) + +ccflags-y += -I$(src) + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/list b/drivers/staging/lustrefsx/list new file mode 100644 index 0000000000000..c69f3dc259a22 --- /dev/null +++ b/drivers/staging/lustrefsx/list @@ -0,0 +1,26 @@ +./lustre/osc/Makefile +./lustre/fid/Makefile +./lustre/mdc/Makefile +./lustre/Makefile +./lustre/ptlrpc/Makefile +./lustre/obdclass/Makefile +./lustre/llite/Makefile +./lustre/obdecho/Makefile +./lustre/lov/Makefile +./lustre/lmv/Makefile +./lustre/mgc/Makefile +./lustre/fld/Makefile +./Makefile.rules +./libcfs/libcfs/Makefile +./libcfs/Makefile +./Makefile +./lnet/selftest/Makefile +./lnet/Makefile +./lnet/lnet/Makefile +./lnet/klnds/socklnd/Makefile +./lnet/klnds/Makefile +./lnet/klnds/o2iblnd/Makefile +./lustre/Kconfig +./Kconfig +./libcfs/Kconfig +./lnet/Kconfig diff --git a/drivers/staging/lustrefsx/lnet/Kconfig b/drivers/staging/lustrefsx/lnet/Kconfig new file mode 100644 index 0000000000000..0d0686a25fe1e --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Kconfig @@ -0,0 +1,37 @@ +config LUSTREFSX_LNET + tristate "Lustre networking subsystem (LNet)" + select LUSTREFSX_LIBCFS + depends on m + depends on INET + help + The Lustre network layer, also known as LNet, is a networking abstaction + level API that was initially created to allow Lustre Filesystem to utilize + very different networks like tcp and ib verbs in a uniform way. In the + case of Lustre routers only the LNet layer is required. Lately other + projects are also looking into using LNet as their networking API as well. + +config LUSTREFSX_LNET_SELFTEST + tristate "Lustre networking self testing" + depends on m + depends on LUSTREFSX_LNET + help + Choose Y here if you want to do lnet self testing. To compile this + as a module, choose M here: the module will be called lnet_selftest. + + If unsure, say N. 
+ + See also http://wiki.lustre.org/ + +config LUSTREFSX_LNET_XPRT_IB + tristate "LNET infiniband support" + depends on m + depends on LUSTREFSX_LNET && INFINIBAND && INFINIBAND_ADDR_TRANS + default LUSTREFSX_LNET && INFINIBAND + help + This option allows the LNET users to use infiniband as an + RDMA-enabled transport. + + To compile this as a kernel module, choose M here and it will be + called ko2iblnd. + + If unsure, say N. diff --git a/drivers/staging/lustrefsx/lnet/Makefile b/drivers/staging/lustrefsx/lnet/Makefile new file mode 100644 index 0000000000000..85a225e57b290 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/Makefile @@ -0,0 +1,3 @@ +subdir-$(CONFIG_LUSTREFSX_LNET) += lnet/ +subdir-$(CONFIG_LUSTREFSX_LNET) += klnds/ +subdir-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += selftest/ diff --git a/drivers/staging/lustrefsx/lnet/klnds/Makefile b/drivers/staging/lustrefsx/lnet/klnds/Makefile new file mode 100644 index 0000000000000..cd375ca2cc67f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/Makefile @@ -0,0 +1,2 @@ +obj-y += o2iblnd/ +obj-y += socklnd/ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 0000000000000..5ce6dc99ffe1a --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET_XPRT_IB) += ko2iblnd.o + +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile new file mode 100644 index 0000000000000..6e6ec925b891f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_lib.o +ksocklnd-y += socklnd_modparams.o socklnd_proto.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/lnet/Makefile b/drivers/staging/lustrefsx/lnet/lnet/Makefile new file mode 100644 index 0000000000000..330de0a670651 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/lnet/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet.o + +lnet-y := api-ni.o config.o nidstrings.o +lnet-y += lib-me.o lib-msg.o lib-eq.o lib-md.o lib-ptl.o +lnet-y += lib-socket.o lib-move.o module.o lo.o +lnet-y += router.o router_proc.o acceptor.o peer.o net_fault.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lnet/selftest/Makefile b/drivers/staging/lustrefsx/lnet/selftest/Makefile new file mode 100644 index 0000000000000..5380812715f7f --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/selftest/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o +lnet_selftest-y += rpc.o module.o ping_test.o brw_test.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/Kconfig b/drivers/staging/lustrefsx/lustre/Kconfig new file mode 100644 index 0000000000000..c565c870d805b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Kconfig @@ -0,0 +1,41 @@ +config LUSTREFSX_FS + tristate "Lustre file system client support" + depends on m + select LUSTREFSX_LIBCFS + depends on LUSTREFSX_LNET + select CRYPTO + select CRYPTO_CRC32 + select CRYPTO_CRC32_PCLMUL if X86 + select CRYPTO_CRC32C + select CRYPTO_MD5 
+ select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + depends on MULTIUSER + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. To compile + this file system support as a module, choose M here: the module will + be called lustre. + + To mount Lustre file systems, you also need to install the user space + mount.lustre and other user space commands which can be found in the + lustre-client package. + + Lustre file system is the most popular cluster file system in high + performance computing. Source code of both kernel space and user space + Lustre components can also be found at + http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTRE_DEBUG_EXPENSIVE_CHECK + bool "Enable Lustre DEBUG checks" + depends on LUSTREFSX_FS + help + This option is mainly for debug purpose. It enables Lustre code to do + expensive checks that may have a performance impact. + + Use with caution. If unsure, say N. diff --git a/drivers/staging/lustrefsx/lustre/Makefile b/drivers/staging/lustrefsx/lustre/Makefile new file mode 100644 index 0000000000000..207cab53c0633 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += fid/ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass/ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc/ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho/ +obj-$(CONFIG_LUSTREFSX_FS) += mgc/ +obj-$(CONFIG_LUSTREFSX_FS) += lov/ osc/ mdc/ lmv/ llite/ fld/ + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/fid/Makefile b/drivers/staging/lustrefsx/lustre/fid/Makefile new file mode 100644 index 0000000000000..22be6773ba08f --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fid/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += fid.o + +fid-y := fid_request.o lproc_fid.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/fld/Makefile b/drivers/staging/lustrefsx/lustre/fld/Makefile new file mode 100644 index 0000000000000..722c19fe30409 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/fld/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_LNET) += fld.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/include + +fld-y := fld_request.o fld_cache.o lproc_fld.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules + diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile new file mode 100644 index 0000000000000..96430e764665b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lustre.o + +lustre-y := dcache.o dir.o file.o llite_lib.o llite_nfs.o +lustre-y += rw.o lproc_llite.o namei.o symlink.o llite_mmap.o +lustre-y += xattr.o xattr_cache.o +lustre-y += rw26.o super25.o statahead.o xattr_security.o +lustre-y += glimpse.o +lustre-y += lcommon_cl.o +lustre-y += lcommon_misc.o +lustre-y += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o +lustre-y += range_lock.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/lmv/Makefile b/drivers/staging/lustrefsx/lustre/lmv/Makefile new file mode 100644 index 0000000000000..40626f49283fb --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lmv/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lmv.o + +lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o 
+ +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile new file mode 100644 index 0000000000000..e74389ed4c3e3 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTREFSX_FS) += lov.o + +lov-y := lov_dev.o lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o +lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o +lov-y += lov_request.o lovsub_dev.o lovsub_lock.o lovsub_object.o +lov-y += lovsub_page.o lproc_lov.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile new file mode 100644 index 0000000000000..e13d6af6f9949 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mdc.o + +mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o +mdc-y += mdc_changelog.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mgc/Makefile b/drivers/staging/lustrefsx/lustre/mgc/Makefile new file mode 100644 index 0000000000000..7353c95e42cca --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mgc/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += mgc.o + +mgc-y := mgc_request.o lproc_mgc.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile new file mode 100644 index 0000000000000..57450ea2824c1 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -0,0 +1,16 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o + +obdclass-linux-objs := linux-module.o linux-obdo.o linux-sysctl.o +obdclass-linux-objs := $(addprefix linux/,$(obdclass-linux-objs)) + +obdclass-y := $(obdclass-linux-objs) +obdclass-y += llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o debug.o genops.o uuid.o llog_ioctl.o +obdclass-y += lprocfs_status.o lprocfs_counters.o +obdclass-y += lustre_handles.o lustre_peer.o local_storage.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o +obdclass-y += lu_object.o dt_object.o +obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o +obdclass-y += linkea.o kernelcomm.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdecho/Makefile b/drivers/staging/lustrefsx/lustre/obdecho/Makefile new file mode 100644 index 0000000000000..8fdb779fdc085 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdecho/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTREFSX_FS) += obdecho.o + +obdecho-y := echo_client.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/osc/Makefile b/drivers/staging/lustrefsx/lustre/osc/Makefile new file mode 100644 index 0000000000000..223e42283bf92 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/osc/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTREFSX_FS) += osc.o + +osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o osc_page.o +osc-y += osc_lock.o osc_io.o osc_quota.o osc_cache.o + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile new file mode 100644 index 0000000000000..f192313597822 --- /dev/null +++ 
b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile @@ -0,0 +1,26 @@ +obj-$(CONFIG_LUSTREFSX_FS) += ptlrpc.o + +LDLM := ../../lustre/ldlm/ +TARGET := ../../lustre/target/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)interval_tree.o +ldlm_objs += $(LDLM)ldlm_reclaim.o + +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_crr.o nrs_orr.o +ptlrpc_objs += nrs_tbf.o nrs_delay.o errno.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) $(TARGET)barrier.o + +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/ldlm + +include $(srctree)/drivers/staging/lustrefsx/Makefile.rules From e2dcf444cb73dd25d2fcc2db98428e59f86b601f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 4 Mar 2019 19:39:33 +0000 Subject: [PATCH 031/737] lustre: change printf format strings for 64bit time in struct inode In kernel versions >= 4.18, struct inode uses timespec64 to store {c,m,a}time. Change printf format strings in that case, from %lu to %llu. Linux commit: 95582b00838837fc ("vfs: change inode times to use struct timespec64") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/include/lustre_compat.h | 6 ++++++ drivers/staging/lustrefsx/lustre/llite/llite_lib.c | 5 +++-- drivers/staging/lustrefsx/lustre/llite/namei.c | 4 ++-- drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c | 3 ++- drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c | 3 ++- drivers/staging/lustrefsx/lustre/obdclass/obdo.c | 3 ++- 6 files changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index ef9ec2af53905..e44cd462079d9 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -103,6 +103,12 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define LTIME_S(time) (time.tv_sec) +#ifdef HAVE_INODE_TIME_64BIT +#define LTIME_FMT "%llu" +#else +#define LTIME_FMT "%lu" +#endif + #ifdef HAVE_GENERIC_PERMISSION_2ARGS # define ll_generic_permission(inode, mask, flags, check_acl) \ generic_permission(inode, mask) diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index b1dbb7d0c3175..833d68d16381e 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -1611,7 +1611,8 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) } if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %llu\n", + CDEBUG(D_INODE, "setting mtime " LTIME_FMT ", ctime " + LTIME_FMT ", now = %llu\n", LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), (s64)ktime_get_real_seconds()); @@ -1894,7 +1895,7 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) if (body->mbo_valid & OBD_MD_FLMTIME) { if (body->mbo_mtime > 
LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, "setting ino %lu mtime from %lu " + CDEBUG(D_INODE, "setting ino %lu mtime from " LTIME_FMT "to %llu\n", inode->i_ino, LTIME_S(inode->i_mtime), body->mbo_mtime); LTIME_S(inode->i_mtime) = body->mbo_mtime; diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index bf8b76efefb85..81d95cca601ed 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -1005,8 +1005,8 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode) LASSERT(body); if (body->mbo_valid & OBD_MD_FLMTIME && body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, "setting fid "DFID" mtime from %lu to %llu" - "\n", PFID(ll_inode2fid(inode)), + CDEBUG(D_INODE, "setting fid "DFID" mtime from " LTIME_FMT + " to %llu\n", PFID(ll_inode2fid(inode)), LTIME_S(inode->i_mtime), body->mbo_mtime); LTIME_S(inode->i_mtime) = body->mbo_mtime; } diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c index 3fed10fc1a1c0..15589c5b9c766 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -3097,7 +3097,8 @@ static int lmv_merge_attr(struct obd_export *exp, struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; CDEBUG(D_INFO, ""DFID" size %llu, blocks %llu nlink %u," - " atime %lu ctime %lu, mtime %lu.\n", + " atime " LTIME_FMT " ctime " LTIME_FMT + ", mtime " LTIME_FMT ".\n", PFID(&lsm->lsm_md_oinfo[i].lmo_fid), i_size_read(inode), (unsigned long long)inode->i_blocks, inode->i_nlink, LTIME_S(inode->i_atime), diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c index 75ed568153305..5b7be3d8af586 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -131,7 +131,8 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, } if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime %ld, ctime %ld\n", + CDEBUG(D_INODE, "setting mtime " LTIME_FMT ", ctime " + LTIME_FMT "\n", LTIME_S(op_data->op_attr.ia_mtime), LTIME_S(op_data->op_attr.ia_ctime)); mdc_setattr_pack(req, op_data, ea, ealen); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c index dddb24b036fee..a7f7be3973222 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -69,7 +69,8 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) u64 newvalid = 0; if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) - CDEBUG(D_INODE, "valid %#llx, new time %lu/%lu\n", + CDEBUG(D_INODE, "valid %#llx, new time " + LTIME_FMT "/" LTIME_FMT "\n", valid, LTIME_S(src->i_mtime), LTIME_S(src->i_ctime)); From bcbf93166c6066e49d7bba8a3a6b216a7fa8bb0a Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 7 Mar 2019 17:53:19 +0000 Subject: [PATCH 032/737] lustre: silence printk format warnings about timespec.tv_sec tv_sec may be defined as long or long long depending on timespec64 availability and depending on whether this is a 64bit kernel or not. This leads to printk format warnings. Silence them by always casting the value to unsigned long - the value we're printing here is not larger than 32 bits in any case. 
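For illustration only, a minimal sketch of the cast pattern the watchdog.c hunks below apply; the helper name and its parameters are hypothetical, while timediff, the format string and NSEC_PER_SEC mirror the patched code:

    /* Sketch, not part of the patch: tv_sec may be long or long long
     * depending on timespec64 availability, so cast it once and keep a
     * single %lu format string on every supported kernel. */
    static void report_inactive(pid_t pid, struct timespec64 timediff)
    {
            pr_warn("Service thread pid %u was inactive for %lu.%.02lus\n",
                    (unsigned int)pid,
                    (unsigned long)timediff.tv_sec,
                    timediff.tv_nsec / (NSEC_PER_SEC / 100));
    }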
Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c index 359ca943e95d8..e432d86aae0a9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -174,7 +174,7 @@ static void lcw_dump_stack(struct lc_watchdog *lcw) lcw_recent_watchdog_count > 3) { LCONSOLE_WARN("Service thread pid %u was inactive for %lu.%.02lus. Watchdog stack traces are limited to 3 per %d seconds, skipping this one.\n", (int)lcw->lcw_pid, - timediff.tv_sec, + (unsigned long)timediff.tv_sec, timediff.tv_nsec / (NSEC_PER_SEC / 100), libcfs_watchdog_ratelimit); } else { @@ -188,7 +188,7 @@ static void lcw_dump_stack(struct lc_watchdog *lcw) LCONSOLE_WARN("Service thread pid %u was inactive for %lu.%.02lus. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n", (int)lcw->lcw_pid, - timediff.tv_sec, + (unsigned long)timediff.tv_sec, timediff.tv_nsec / (NSEC_PER_SEC / 100)); lcw_dump(lcw); } @@ -388,7 +388,7 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message) timediff = ktime_to_timespec64(lapse); LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).\n", lcw->lcw_pid, message, - timediff.tv_sec, + (unsigned long)timediff.tv_sec, timediff.tv_nsec / (NSEC_PER_SEC / 100)); } lcw->lcw_last_touched = newtime; From 65fedce16f19c30b6faaf3ca29063b53bcc54554 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 6 Mar 2019 21:39:35 +0000 Subject: [PATCH 033/737] lustre: use SB_* instead of MS_* as superblock flags. Later kernel versions (4.14 and up), have SB_* defined as the flags to be used in the superblock, not MS_* mount flags. Still later, in 4.20 and up, the MS_* flags were moved to a different include file, and no longer used in the kernel. To make the code compile in all these scenarios, use SB_* consistently in the code. If the SB_* values are not defined (for older kernels), define them as MS_*, in lustre_compat.h. Linux commit: e262e32d6bde ("vfs: Suppress MS_* flag defs within the kernel unless explicitly enabled") Signed-off-by: Frank van der Linden --- .../lustrefsx/lustre/include/lustre_compat.h | 26 ++++++++++++++++++- drivers/staging/lustrefsx/lustre/llite/file.c | 2 +- .../lustrefsx/lustre/llite/llite_lib.c | 24 ++++++++--------- .../staging/lustrefsx/lustre/llite/namei.c | 4 +-- .../lustre/obdclass/obd_mount_server.c | 2 +- 5 files changed, 41 insertions(+), 17 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index e44cd462079d9..a900bba89839e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -538,7 +538,31 @@ static inline bool is_sxid(umode_t mode) #define IS_NOSEC(inode) (!is_sxid(inode->i_mode)) #endif -#ifndef MS_NOSEC +/* + * Upstream Linux kernel commit e462ec50cb5fad19f6003a3d8087f4a0945dd2b1 + * differentiated the MS_ values from SB_* values. We use SB_* + * throughout, but account here for older kernels that do not have + * SB_*. The values below are only the ones currently used in the Lustre + * code. 
+ */ + +#ifndef SB_RDONLY + +#define SB_RDONLY MS_RDONLY +#define SB_ACTIVE MS_ACTIVE +#define SB_NODIRATIME MS_NODIRATIME + +#if defined(MS_POSIXACL) +#define SB_POSIXACL MS_POSIXACL +#endif + +#if defined(MS_NOSEC) +#define SB_NOSEC MS_NOSEC +#endif + +#endif + +#ifndef SB_NOSEC static inline void inode_has_no_xattr(struct inode *inode) { return; diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 1fefb8f63dc0c..45cd5a4094308 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -1073,7 +1073,7 @@ static bool file_is_noatime(const struct file *file) if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return true; - if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return true; return false; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 833d68d16381e..da5664db3dd51 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -236,16 +236,16 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, data->ocd_ibits_known = MDS_INODELOCK_FULL; data->ocd_version = LUSTRE_VERSION_CODE; - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & SB_RDONLY) data->ocd_connect_flags |= OBD_CONNECT_RDONLY; if (sbi->ll_flags & LL_SBI_USER_XATTR) data->ocd_connect_flags |= OBD_CONNECT_XATTR; -#ifdef MS_NOSEC +#ifdef SB_NOSEC /* Setting this indicates we correctly support S_NOSEC (See kernel * commit 9e1f1de02c2275d7172e18dc4e7c2065777611bf) */ - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; #endif if (sbi->ll_flags & LL_SBI_FLOCK) @@ -345,14 +345,14 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } if (data->ocd_connect_flags & OBD_CONNECT_ACL) { -#ifdef MS_POSIXACL - sb->s_flags |= MS_POSIXACL; +#ifdef SB_POSIXACL + sb->s_flags |= SB_POSIXACL; #endif sbi->ll_flags |= LL_SBI_ACL; } else { LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); -#ifdef MS_POSIXACL - sb->s_flags &= ~MS_POSIXACL; +#ifdef SB_POSIXACL + sb->s_flags &= ~SB_POSIXACL; #endif sbi->ll_flags &= ~LL_SBI_ACL; } @@ -724,7 +724,7 @@ void ll_kill_super(struct super_block *sb) ENTRY; /* not init sb ?*/ - if (!(sb->s_flags & MS_ACTIVE)) + if (!(sb->s_flags & SB_ACTIVE)) return; sbi = ll_s2sbi(sb); @@ -2213,8 +2213,8 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data) int err; __u32 read_only; - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { - read_only = *flags & MS_RDONLY; + if ((*flags & SB_RDONLY) != (sb->s_flags & SB_RDONLY)) { + read_only = *flags & SB_RDONLY; err = obd_set_info_async(NULL, sbi->ll_md_exp, sizeof(KEY_READ_ONLY), KEY_READ_ONLY, sizeof(read_only), @@ -2227,9 +2227,9 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data) } if (read_only) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; else - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (sbi->ll_flags & LL_SBI_VERBOSE) LCONSOLE_WARN("Remounted %s %s\n", profilenm, diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index 81d95cca601ed..cf9f3e26d9a54 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -574,7 +574,7 @@ static struct dentry *ll_lookup_it(struct inode 
*parent, struct dentry *dentry, } if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && - dentry->d_sb->s_flags & MS_RDONLY) + dentry->d_sb->s_flags & SB_RDONLY) RETURN(ERR_PTR(-EROFS)); if (it->it_op & IT_CREAT) @@ -855,7 +855,7 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, it = ll_convert_intent(&nd->intent.open, nd->flags, (nd->path.mnt->mnt_flags & MNT_READONLY) || - (nd->path.mnt->mnt_sb->s_flags & MS_RDONLY)); + (nd->path.mnt->mnt_sb->s_flags & SB_RDONLY)); if (IS_ERR(it)) RETURN((struct dentry *)it); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c index dc2d192dcb048..204fc889da45b 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -1733,7 +1733,7 @@ static int server_fill_super_common(struct super_block *sb) sb->s_blocksize_bits = log2(sb->s_blocksize); sb->s_magic = LUSTRE_SUPER_MAGIC; sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */ - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_op = &server_ops; root = new_inode(sb); From 9523fe5e3e8f036441be95062b955da849b7e7f5 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 4 Mar 2019 20:22:24 +0000 Subject: [PATCH 034/737] lustre: adapt to setup_timer -> timer_setup change Upstream Linux changed all setup_timer calls to timer_setup, and then deprecrated the setup_timer interface. Add a Lustre upstream commit ("LU-10805 libcfs: timer_setup() API changes.") to address this. --- .../libcfs/include/libcfs/linux/linux-time.h | 19 +++++++++++++++++++ .../lustrefsx/libcfs/libcfs/watchdog.c | 6 +++--- .../staging/lustrefsx/lnet/lnet/net_fault.c | 9 ++++----- .../lustrefsx/lustre/obdclass/genops.c | 1 - .../staging/lustrefsx/lustre/ptlrpc/service.c | 10 +++++----- 5 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index 3095626dea428..fa972ff9ca16d 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -281,4 +281,23 @@ static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) */ #define CFS_DURATION_T "%ld" +#ifdef HAVE_TIMER_SETUP +#define cfs_timer_cb_arg_t struct timer_list * +#define cfs_from_timer(var, callback_timer, timer_fieldname) \ + from_timer(var, callback_timer, timer_fieldname) +#define cfs_timer_setup(timer, callback, data, flags) \ + timer_setup((timer), (callback), (flags)) +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) +#define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) +#else +#define cfs_timer_cb_arg_t unsigned long +#define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) +#define cfs_timer_setup(timer, callback, data, flags) \ + setup_timer((timer), (callback), (data)) +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function), (_expires), (_data)) +#define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) +#endif + #endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c index e432d86aae0a9..f7170860f0277 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c 
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -114,9 +114,9 @@ lcw_dump(struct lc_watchdog *lcw) EXIT; } -static void lcw_cb(uintptr_t data) +static void lcw_cb(cfs_timer_cb_arg_t data) { - struct lc_watchdog *lcw = (struct lc_watchdog *)data; + struct lc_watchdog *lcw = cfs_from_timer(lcw, data, lcw_timer); ENTRY; if (lcw->lcw_state != LC_WATCHDOG_ENABLED) { @@ -359,7 +359,7 @@ struct lc_watchdog *lc_watchdog_add(int timeout, lcw->lcw_state = LC_WATCHDOG_DISABLED; INIT_LIST_HEAD(&lcw->lcw_list); - setup_timer(&lcw->lcw_timer, lcw_cb, (unsigned long)lcw); + cfs_timer_setup(&lcw->lcw_timer, lcw_cb, (unsigned long)lcw, 0); mutex_lock(&lcw_refcount_mutex); if (++lcw_refcount == 1) diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c index c43f8fe2c176e..56e62423f21c8 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -707,9 +707,9 @@ lnet_delay_rule_daemon(void *arg) } static void -delay_timer_cb(unsigned long arg) +delay_timer_cb(cfs_timer_cb_arg_t data) { - struct lnet_delay_rule *rule = (struct lnet_delay_rule *)arg; + struct lnet_delay_rule *rule = cfs_from_timer(rule, data, dl_timer); spin_lock_bh(&delay_dd.dd_lock); if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) { @@ -772,9 +772,8 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr) wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running); } - init_timer(&rule->dl_timer); - rule->dl_timer.function = delay_timer_cb; - rule->dl_timer.data = (unsigned long)rule; + cfs_timer_setup(&rule->dl_timer, delay_timer_cb, + (unsigned long)rule, 0); spin_lock_init(&rule->dl_lock); INIT_LIST_HEAD(&rule->dl_msg_list); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c index 7fb129d889900..c2b1e35fe86e1 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/genops.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -349,7 +349,6 @@ struct obd_device *class_newdev(const char *type_name, const char *name, /* XXX belongs in setup not attach */ init_rwsem(&newdev->obd_observer_link_sem); /* recovery data */ - init_timer(&newdev->obd_recovery_timer); spin_lock_init(&newdev->obd_recovery_task_lock); init_waitqueue_head(&newdev->obd_next_transno_waitq); init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c index d304ec6c2416d..6e3172cdeb5a7 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -472,11 +472,11 @@ ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) return -1; } -static void ptlrpc_at_timer(unsigned long castmeharder) +static void ptlrpc_at_timer(cfs_timer_cb_arg_t data) { struct ptlrpc_service_part *svcpt; - svcpt = (struct ptlrpc_service_part *)castmeharder; + svcpt = cfs_from_timer(svcpt, data, scp_at_timer); svcpt->scp_at_check = 1; svcpt->scp_at_checktime = cfs_time_current(); @@ -643,8 +643,8 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, if (array->paa_reqs_count == NULL) goto failed; - setup_timer(&svcpt->scp_at_timer, ptlrpc_at_timer, - (unsigned long)svcpt); + cfs_timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, + (unsigned long)svcpt, 0); /* At SOW, service time should be quick; 10s seems generous. If client * timeout is less than this, we'll be sending an early reply. 
*/ @@ -1177,7 +1177,7 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - at_early_margin); if (next <= 0) { - ptlrpc_at_timer((unsigned long)svcpt); + ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); } else { mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); CDEBUG(D_INFO, "armed %s at %+ds\n", From 98690b6694bd09ef9c73160be6249bf3e4864bc4 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 4 Mar 2019 22:29:53 +0000 Subject: [PATCH 035/737] lustre: adapt to struct posix_acl atomic_t -> refcount_t change In Linux >= 4.15, the posix_acl structure uses refcount_t instead of atomic_t. Make things work in both of these cases. Linux commit: 66717260545b ("posix_acl: convert posix_acl.a_refcount from atomic_t to refcount_t") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/llite/llite_lib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index da5664db3dd51..339dd8f1da4c4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -1459,7 +1459,11 @@ void ll_clear_inode(struct inode *inode) #ifdef CONFIG_FS_POSIX_ACL forget_all_cached_acls(inode); if (lli->lli_posix_acl) { +#ifdef HAVE_POSIX_ACL_REFCOUNT + LASSERT(refcount_read(&lli->lli_posix_acl->a_refcount) == 1); +#else LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); +#endif posix_acl_release(lli->lli_posix_acl); lli->lli_posix_acl = NULL; } From 3356084177e2a1df03172698bf48ed8d83f15c1b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 4 Mar 2019 22:39:51 +0000 Subject: [PATCH 036/737] lustre: adapt to sys_close -> ksys_close change Linux >= 4.17 uses a ksys_close wrapper instead of sys_close for in-kernel fd close calls. Make the code work with kernels before and after this change. Linux commit: 2ca2a09d6215 ("fs: add ksys_close() wrapper; remove in-kernel calls to sys_close()") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lnet/lnet/lib-socket.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index a0fcec9d8a444..a1b2b327a3840 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -100,7 +100,11 @@ lnet_sock_ioctl(int cmd, unsigned long arg) fput(sock_filp); out: if (fd >= 0) +#ifdef HAVE_KSYS_CLOSE + ksys_close(fd); +#else sys_close(fd); +#endif return rc; } From d6697fbbd87bd40e2c979c70632e64251b0ce624 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 4 Mar 2019 22:06:36 +0000 Subject: [PATCH 037/737] lustre: adapt to upstream struct address_space changes In later kernels, two changes were made to struct address_space: 1. Locking was changed and page_tree was renamed to i_pages (4.17) 2. The page storage data structure was converted from a radix tree to an Xarray. (4.20) Adapt for both these changes. The first change was made while (an older version of) Lustre was still in the tree. The second change was made after Lustre was removed from the tree, but a Lustre change was still included in an earlier version of the patch series posted. The code change was partially taken from this version of the patch (https://lore.kernel.org/patchwork/patch/933300/). 
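As an illustration only (not part of the change itself), here is a minimal sketch of how the same mapping walk looks across the three kernel generations involved; HAVE_ADDRESS_SPACE_IPAGES and HAVE_ADDRESS_SPACE_XARRAY are the config.h symbols this series uses, and the function below is made up for the example:

#include <linux/fs.h>

/* example_inspect_mapping(): made-up name, sketch only */
static void example_inspect_mapping(struct address_space *mapping)
{
#if defined(HAVE_ADDRESS_SPACE_XARRAY)
        /* 4.20+: i_pages is a real XArray, walked through an xa_state */
        XA_STATE(xas, &mapping->i_pages, 0);

        xas_lock_irq(&xas);
        /* ... xas_find() / xa_is_value() lookups go here ... */
        xas_unlock_irq(&xas);
#elif defined(HAVE_ADDRESS_SPACE_IPAGES)
        /* 4.17-4.19: still a radix tree, but renamed and locked via xa_lock */
        xa_lock_irq(&mapping->i_pages);
        /* ... radix_tree_gang_lookup() on &mapping->i_pages ... */
        xa_unlock_irq(&mapping->i_pages);
#else
        /* older kernels: radix tree named page_tree, guarded by tree_lock */
        spin_lock_irq(&mapping->tree_lock);
        /* ... radix_tree_gang_lookup() on &mapping->page_tree ... */
        spin_unlock_irq(&mapping->tree_lock);
#endif
}

The lock_mappings()/unlock_mappings() helpers added below cover the two non-XArray cases; the XArray case is handled inline with an xa_state.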
Linux commit: b93b016313b3 ("page cache: use xa_lock") Linux commit: eb797a8ee0ab ("page cache: Rearrange address_space") Signed-off-by: Frank van der Linden --- .../lustrefsx/lustre/include/lustre_compat.h | 20 +++++++++++++ .../staging/lustrefsx/lustre/llite/glimpse.c | 10 +++++++ .../lustrefsx/lustre/mdc/mdc_request.c | 28 +++++++++++++++++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index a900bba89839e..c5915f000a0af 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -694,4 +694,24 @@ static inline struct timespec current_time(struct inode *inode) #define __GFP_COLD 0 #endif +#ifndef HAVE_ADDRESS_SPACE_XARRAY +static inline void lock_mappings(struct address_space *mappings) +{ +#ifdef HAVE_ADDRESS_SPACE_IPAGES + xa_lock_irq(&mappings->i_pages); +#else + spin_lock_irq(&mappings->tree_lock); +#endif +} + +static inline void unlock_mappings(struct address_space *mappings) +{ +#ifdef HAVE_ADDRESS_SPACE_IPAGES + xa_unlock_irq(&mappings->i_pages); +#else + spin_unlock_irq(&mappings->tree_lock); +#endif +} +#endif + #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c index d34be28747bdd..55deb8236bd40 100644 --- a/drivers/staging/lustrefsx/lustre/llite/glimpse.c +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -66,12 +66,22 @@ blkcnt_t dirty_cnt(struct inode *inode) { blkcnt_t cnt = 0; struct vvp_object *vob = cl_inode2vvp(inode); +#ifndef HAVE_ADDRESS_SPACE_XARRAY void *results[1]; if (inode->i_mapping != NULL) +#ifdef HAVE_ADDRESS_SPACE_IPAGES + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages, +#else cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, +#endif results, 0, 1, PAGECACHE_TAG_DIRTY); +#else + if (inode->i_mapping && mapping_tagged(inode->i_mapping, + PAGECACHE_TAG_DIRTY)) + cnt = 1; +#endif if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) cnt = 1; diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c index 681e5bd94a6c9..1530f5e8c342a 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -990,16 +990,34 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, */ unsigned long offset = hash_x_index(*hash, hash64); struct page *page; +#ifdef HAVE_ADDRESS_SPACE_XARRAY + XA_STATE(xas, &mapping->i_pages, offset); + + xas_lock_irq(&xas); + page = xas_find(&xas, ULONG_MAX); + if (xa_is_value(page)) + page = NULL; + if (page) { +#else int found; - spin_lock_irq(&mapping->tree_lock); + lock_mappings(mapping); +#ifdef HAVE_ADDRESS_SPACE_IPAGES + found = radix_tree_gang_lookup(&mapping->i_pages, +#else found = radix_tree_gang_lookup(&mapping->page_tree, +#endif (void **)&page, offset, 1); if (found > 0 && !radix_tree_exceptional_entry(page)) { +#endif struct lu_dirpage *dp; get_page(page); - spin_unlock_irq(&mapping->tree_lock); +#ifdef HAVE_ADDRESS_SPACE_XARRAY + xas_unlock_irq(&xas); +#else + unlock_mappings(mapping); +#endif /* * In contrast to find_lock_page() we are sure that directory * page cannot be truncated (while DLM lock is held) and, @@ -1048,8 +1066,12 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, page = ERR_PTR(-EIO); } } else { - 
spin_unlock_irq(&mapping->tree_lock); +#ifdef HAVE_ADDRESS_SPACE_XARRAY + xas_unlock_irq(&xas); +#else + unlock_mappings(mapping); page = NULL; +#endif } return page; } From bb8ba55ef67baa04a5972d27db9fa7e910eded89 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 5 Mar 2019 00:27:22 +0000 Subject: [PATCH 038/737] lustre: adapt to upstream atomic_open interface change An upstream Linux patch series integrated in 4.19 removed the "opened" argument to the atomic_open fs entry point and finish_open. Adapt the code to deal with this. Linux commit: be12af3ef5e6 ("getting rid of 'opened' argument of ->atomic_open() - part 1") Signed-off-by: Frank van der Linden --- .../staging/lustrefsx/lustre/llite/namei.c | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index cf9f3e26d9a54..0a7b62dd8a5ec 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -705,7 +705,11 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, */ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, - umode_t mode, int *opened) + umode_t mode +#ifndef HAVE_ATOMIC_OPEN_NO_OPENED + , int *opened +#endif + ) { struct lookup_intent *it; struct dentry *de; @@ -715,10 +719,17 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, int rc = 0; ENTRY; +#ifdef HAVE_ATOMIC_OPEN_NO_OPENED + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), file %p," + "open_flags %x, mode %x\n", + dentry->d_name.len, dentry->d_name.name, + PFID(ll_inode2fid(dir)), dir, file, open_flags, mode); +#else CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), file %p," "open_flags %x, mode %x opened %d\n", dentry->d_name.len, dentry->d_name.name, PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, *opened); +#endif /* Only negative dentries enter here */ LASSERT(dentry->d_inode == NULL); @@ -771,8 +782,11 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, dput(de); goto out_release; } - +#ifdef HAVE_ATOMIC_OPEN_NO_OPENED + file->f_mode |= FMODE_CREATED; +#else *opened |= FILE_CREATED; +#endif } if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) { /* Open dentry. */ @@ -783,7 +797,11 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, rc = finish_no_open(file, de); } else { file->private_data = it; +#ifdef HAVE_ATOMIC_OPEN_NO_OPENED + rc = finish_open(file, dentry, NULL); +#else rc = finish_open(file, dentry, NULL, opened); +#endif /* We dget in ll_splice_alias. finish_open takes * care of dget for fd open. */ From ee7d4616489a09f1d667a5e19c2266edbaf61dec Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 8 Mar 2019 23:50:49 +0000 Subject: [PATCH 039/737] lustre: adapt to changed kernel socket interfaces Linux 4.16 changed kernel_get{sock,peer}name to return the lengh of the sockaddr, instead of using an in/out param. Adapt to this change. 
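As an illustration only, a sketch of the two calling conventions side by side; HAVE_KERNSOCK_RETURNSLEN is the config.h symbol introduced for this, and the helper name is made up:

#include <linux/net.h>
#include <linux/in.h>

/* example_local_name(): made-up helper, sketch only */
static int example_local_name(struct socket *sock, struct sockaddr_in *sin)
{
#ifdef HAVE_KERNSOCK_RETURNSLEN
        /* >= 4.16: returns the sockaddr length, or a negative errno */
        int rc = kernel_getsockname(sock, (struct sockaddr *)sin);

        return rc < 0 ? rc : 0;
#else
        /* older kernels: length is an in/out parameter, 0 on success */
        int len = sizeof(*sin);

        return kernel_getsockname(sock, (struct sockaddr *)sin, &len);
#endif
}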
Linux commit: 9b2c45d479d0 ("net: make getname() functions return length rather than use int* parameter") Signed-off-by: Denys Vlasenko Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lnet/lnet/lib-socket.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index a1b2b327a3840..6bd3f4e21429f 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -502,14 +502,23 @@ int lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) { struct sockaddr_in sin; - int len = sizeof(sin); int rc; +#ifdef HAVE_KERNSOCK_RETURNSLEN + if (remote) + rc = kernel_getpeername(sock, (struct sockaddr *)&sin); + else + rc = kernel_getsockname(sock, (struct sockaddr *)&sin); + if (rc < 0) { +#else + int len = sizeof(sin); + if (remote) rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &len); else rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &len); if (rc != 0) { +#endif CERROR("Error %d getting sock %s IP/port\n", rc, remote ? "peer" : "local"); return rc; From 943e66a1e0e99168d4e5e6f18bfc4d7f4eae71bd Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 5 Mar 2019 19:08:42 +0000 Subject: [PATCH 040/737] lustre: reintroduce ATTR_ATTR_FLAG In Linux kernels >= 4.19, the ATTR_ATTR_FLAG flag was removed because nobody was using it. But Lustre does. So define it if needed. It is safe to do so, since it's not been replaced with a different flag with the same value. Linux commit: 4cdfffc8722e ("vfs: discard ATTR_ATTR_FLAG") Signed-off-by: Frank van der Linden --- .../staging/lustrefsx/lustre/include/lustre_compat.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index c5915f000a0af..13582375848a1 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -84,6 +84,16 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, */ #define ATTR_BLOCKS (1 << 27) +/* + * In more recent kernels, this flag was removed because nobody was using it. + * But Lustre does. So define it if needed. It is safe to do so, since it's + * not been replaced with a different flag with the same value, and Lustre + * only uses it internally. + */ +#ifndef ATTR_ATTR_FLAG +#define ATTR_ATTR_FLAG (1 << 10) +#endif + #define current_ngroups current_cred()->group_info->ngroups #define current_groups current_cred()->group_info->small_block From 5f1c156d4d20447202412068adda8d7a7ded45d0 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 5 Mar 2019 20:58:46 +0000 Subject: [PATCH 041/737] lustre: adapt for totalram_pages change totalram_pages was changed to a function in Linux 5.0. Adapt for this change by hiding it behind a TOTALRAM_PAGES define, which is defined differently depending on the HAVE_TOTALRAM_PAGES_FUNC config. Add the right include files to the files hat need to check this configuration. 
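For reference, this is the wrapper the patch adds to lustre_compat.h, plus a made-up caller showing that the same expression then builds on either side of the 5.0 change:

#include <linux/mm.h>

#ifdef HAVE_TOTALRAM_PAGES_FUNC
#define TOTALRAM_PAGES  totalram_pages()        /* >= 5.0: accessor function */
#else
#define TOTALRAM_PAGES  totalram_pages          /* older kernels: plain variable */
#endif

/* example_ram_mb(): made-up caller, sketch only */
static unsigned long example_ram_mb(void)
{
        return TOTALRAM_PAGES >> (20 - PAGE_SHIFT);
}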
Linux commit: ca79b0c211af ("mm: convert totalram_pages and totalhigh_pages variables to atomic") Signed-off-by: Frank van der Linden --- .../staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h | 4 ++-- .../lustrefsx/libcfs/libcfs/linux/linux-tracefile.c | 3 ++- drivers/staging/lustrefsx/lustre/include/lustre_compat.h | 6 ++++++ drivers/staging/lustrefsx/lustre/include/obd.h | 4 ++-- drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 6 +++--- drivers/staging/lustrefsx/lustre/llite/lproc_llite.c | 8 ++++---- drivers/staging/lustrefsx/lustre/obdclass/class_obd.c | 6 +++--- .../lustrefsx/lustre/obdclass/linux/linux-sysctl.c | 2 +- drivers/staging/lustrefsx/lustre/obdclass/lu_object.c | 2 +- drivers/staging/lustrefsx/lustre/osc/lproc_osc.c | 2 +- drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 2 +- drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c | 6 +++--- 12 files changed, 29 insertions(+), 22 deletions(-) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h index c5923984d0dd0..32a629e25eb77 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -44,9 +44,9 @@ #if BITS_PER_LONG == 32 /* limit to lowmem on 32-bit systems */ # define NUM_CACHEPAGES \ - min(totalram_pages, 1UL << (30 - PAGE_SHIFT) * 3 / 4) + min(TOTALRAM_PAGES, 1UL << (30 - PAGE_SHIFT) * 3 / 4) #else -# define NUM_CACHEPAGES totalram_pages +# define NUM_CACHEPAGES TOTALRAM_PAGES #endif static inline unsigned int memory_pressure_get(void) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c index 6e08612044541..274179dd54fe9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -34,6 +34,7 @@ #define LUSTRE_TRACEFILE_PRIVATE #include +#include #include "tracefile.h" /* percents to share the total debug memory for each type */ @@ -266,7 +267,7 @@ void cfs_print_to_console(struct ptldebug_header *hdr, int mask, int cfs_trace_max_debug_mb(void) { - int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); + int total_mb = (TOTALRAM_PAGES >> (20 - PAGE_SHIFT)); return MAX(512, (total_mb * 80)/100); } diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 13582375848a1..115d7f2141488 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -724,4 +724,10 @@ static inline void unlock_mappings(struct address_space *mappings) } #endif +#ifdef HAVE_TOTALRAM_PAGES_FUNC +#define TOTALRAM_PAGES totalram_pages() +#else +#define TOTALRAM_PAGES totalram_pages +#endif + #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h index ba31e450be2e0..9910a189aef4f 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd.h +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -1220,8 +1220,8 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) cli->cl_dirty_max_pages = dirty_max; } - if (cli->cl_dirty_max_pages > totalram_pages / 8) - cli->cl_dirty_max_pages = totalram_pages / 8; + if (cli->cl_dirty_max_pages > TOTALRAM_PAGES / 8) + cli->cl_dirty_max_pages = TOTALRAM_PAGES / 8; } #endif /* __OBD_H */ diff --git 
a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c index 3836f99d01aaf..208ab5d481c84 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -415,11 +415,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) if (!strcmp(name, LUSTRE_MDC_NAME)) { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + } else if (TOTALRAM_PAGES >> (20 - PAGE_SHIFT) <= 128 /* MB */) { cli->cl_max_rpcs_in_flight = 2; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + } else if (TOTALRAM_PAGES >> (20 - PAGE_SHIFT) <= 256 /* MB */) { cli->cl_max_rpcs_in_flight = 3; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + } else if (TOTALRAM_PAGES >> (20 - PAGE_SHIFT) <= 512 /* MB */) { cli->cl_max_rpcs_in_flight = 4; } else { if (osc_on_mdt(obddev->obd_name)) diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c index 7dfb36d2873cd..d794bf9cd5f6f 100755 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -308,12 +308,12 @@ ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, pages_number >>= PAGE_SHIFT; - if (pages_number < 0 || pages_number > totalram_pages / 2) { + if (pages_number < 0 || pages_number > TOTALRAM_PAGES / 2) { /* 1/2 of RAM */ CERROR("%s: can't set max_readahead_mb=%lu > %luMB\n", ll_get_fsname(sb, NULL, 0), (unsigned long)pages_number >> (20 - PAGE_SHIFT), - totalram_pages >> (20 - PAGE_SHIFT + 1)); + TOTALRAM_PAGES >> (20 - PAGE_SHIFT + 1)); return -ERANGE; } @@ -479,10 +479,10 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, pages_number >>= PAGE_SHIFT; - if (pages_number < 0 || pages_number > totalram_pages) { + if (pages_number < 0 || pages_number > TOTALRAM_PAGES) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - totalram_pages >> (20 - PAGE_SHIFT)); + TOTALRAM_PAGES >> (20 - PAGE_SHIFT)); RETURN(-ERANGE); } /* Allow enough cache so clients can make well-formed RPCs */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c index 913ae54465b4f..8e2d803fe3bc3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -540,10 +540,10 @@ static int __init obdclass_init(void) /* Default the dirty page cache cap to 1/2 of system memory. * For clients with less memory, a larger fraction is needed * for other purposes (mostly for BGL). 
*/ - if (totalram_pages <= 512 << (20 - PAGE_SHIFT)) - obd_max_dirty_pages = totalram_pages / 4; + if (TOTALRAM_PAGES <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = TOTALRAM_PAGES / 4; else - obd_max_dirty_pages = totalram_pages / 2; + obd_max_dirty_pages = TOTALRAM_PAGES / 2; err = obd_init_caches(); if (err) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c index 19f95b8187ca9..a531c970c3fb6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c @@ -111,7 +111,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - if (val > ((totalram_pages / 10) * 9)) { + if (val > ((TOTALRAM_PAGES / 10) * 9)) { /* Somebody wants to assign too much memory to dirty pages */ return -EINVAL; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c index f6a043b7af9e8..96ebb4cf9b015 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -888,7 +888,7 @@ static unsigned long lu_htable_order(struct lu_device *top) * * Size of lu_object is (arbitrary) taken as 1K (together with inode). */ - cache_size = totalram_pages; + cache_size = TOTALRAM_PAGES; #if BITS_PER_LONG == 32 /* limit hashtable size for lowmem systems to low RAM */ diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c index c7c3f0cd5bb23..6c71d4c6b19d7 100644 --- a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -160,7 +160,7 @@ static ssize_t osc_max_dirty_mb_seq_write(struct file *file, if (pages_number <= 0 || pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || - pages_number > totalram_pages / 4) /* 1/4 of RAM */ + pages_number > TOTALRAM_PAGES / 4) /* 1/4 of RAM */ return -ERANGE; spin_lock(&cli->cl_loi_list_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index ed2058a159ed2..93871b4c7d090 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -321,7 +321,7 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, * far. 
*/ bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > totalram_pages/(2 * bufpages)) + if (val > TOTALRAM_PAGES/(2 * bufpages)) return -ERANGE; spin_lock(&svc->srv_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c index 748962d4f17f0..488b2398e4342 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -155,7 +155,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "max waitqueue depth: %u\n" "max wait time: %ld/%lu\n" "out of mem: %lu\n", - totalram_pages, PAGES_PER_POOL, + TOTALRAM_PAGES, PAGES_PER_POOL, page_pools.epp_max_pages, page_pools.epp_max_pools, page_pools.epp_total_pages, @@ -775,9 +775,9 @@ int sptlrpc_enc_pool_init(void) DEF_SHRINKER_VAR(shvar, enc_pools_shrink, enc_pools_shrink_count, enc_pools_shrink_scan); - page_pools.epp_max_pages = totalram_pages / 8; + page_pools.epp_max_pages = TOTALRAM_PAGES / 8; if (enc_pool_max_memory_mb > 0 && - enc_pool_max_memory_mb <= (totalram_pages >> mult)) + enc_pool_max_memory_mb <= (TOTALRAM_PAGES >> mult)) page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); From ab6529a101ee9537cb58ca150cca45ab7cfdd20d Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 6 Mar 2019 20:14:16 +0000 Subject: [PATCH 042/737] lustre: config.h file for Linux 5.0 Set the right config values for Linux 5.0 in the config.h file. --- drivers/staging/lustrefsx/config.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index a0c7aaa6a0472..657f3cca73c95 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -139,6 +139,9 @@ /* current_time() has replaced CURRENT_TIME */ #define HAVE_CURRENT_TIME 1 +/* inode times are timespec64 */ +#define HAVE_INODE_TIME_64BIT 1 + /* dcache_lock is exist */ /* #undef HAVE_DCACHE_LOCK */ @@ -607,7 +610,7 @@ /* #undef HAVE_OPENSSL_SSK */ /* 'pagevec_init' takes one parameter */ -/* #undef HAVE_PAGEVEC_INIT_ONE_PARAM */ +#define HAVE_PAGEVEC_INIT_ONE_PARAM 1 /* have PCLMULQDQ instruction */ #define HAVE_PCLMULQDQ 1 @@ -796,6 +799,9 @@ /* tcp_sendpage use socket as first parameter */ /* #undef HAVE_TCP_SENDPAGE_USE_SOCKET */ +/* timer_setup has replaced setup_timer */ +#define HAVE_TIMER_SETUP + /* 'struct timespec64' is available */ #define HAVE_TIMESPEC64 1 @@ -865,6 +871,27 @@ /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ +/* struct address_space uses i_pages and xa_lock */ +/* #undef HAVE_ADDRESS_SPACE_IPAGES */ + +/* struct address_space was converted to an Xarray */ +#define HAVE_ADDRESS_SPACE_XARRAY 1 + +/* posix acl uses the refcount interface */ +#define HAVE_POSIX_ACL_REFCOUNT 1 + +/* sys_close was converted to ksys_close for kernel use */ +#define HAVE_KSYS_CLOSE 1 + +/* kernel_get{sock,peer}name was converted to return the sockaddr length */ +#define HAVE_KERNSOCK_RETURNSLEN 1 + +/* the 'opened' argument to finish_open and atomic_open was removed */ +#define HAVE_ATOMIC_OPEN_NO_OPENED 1 + +/* totalram_pages was turned in to a function */ +#define HAVE_TOTALRAM_PAGES_FUNC 1 + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ From 5c4f89567a419130356d9014b60a8ea3d7c821da Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sun, 24 Mar 
2019 20:16:13 +0000 Subject: [PATCH 043/737] lustre: account for the SO_*TIMEO -> SO_*TIME_OLD rename Post 5.0, the in-kernel SO_SNDTIMEO and SO_RCVTIMEO defines got a _OLD suffix, to deal with the 2038 64bit time_t interface changes. Deal with this by defining them to their _OLD values if the _OLD values are defined. Linux commit: 45bdc66159d49bfc7f75fe02d25bc74f5d2660cf ("socket: Rename SO_RCVTIMEO/ SO_SNDTIMEO with _OLD suffixes") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lnet/lnet/lib-socket.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index 6bd3f4e21429f..faa8fc3a01101 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -43,6 +43,18 @@ #include #include +/* + * Deal with the post-5.0 rename of these in-kernel values. + */ +#if !defined(SO_RCVTIMEO) && defined(SO_RCVTIMEO_OLD) +#define SO_RCVTIMEO SO_RCVTIMEO_OLD +#endif + +#if !defined(SO_SNDTIMEO) && defined(SO_SNDTIMEO_OLD) +#define SO_SNDTIMEO SO_SNDTIMEO_OLD +#endif + + static int kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) { From 313b6209a9ac1a2834a501f7192526eeaf874d95 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sun, 24 Mar 2019 22:41:38 +0000 Subject: [PATCH 044/737] lustre: adapt for fault / page_mkwrite return type change The return type for the fault and page_mkwrite handlers was changed to vm_fault_t - adapt. Linux commit: 3d3539018d2cbd12e5af4a132636ee7fd8d43ef0 ("mm: create the new vm_fault_t type") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/include/lustre_compat.h | 8 ++++++++ drivers/staging/lustrefsx/lustre/llite/llite_mmap.c | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 115d7f2141488..2fb308a998ac1 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -730,4 +730,12 @@ static inline void unlock_mappings(struct address_space *mappings) #define TOTALRAM_PAGES totalram_pages #endif +#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY +# ifdef HAVE_VM_FAULT_T +# define VM_FAULT_T vm_fault_t +# else +# define VM_FAULT_T int +# endif +#endif + #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 0fff9b9663a9f..79adfa9378515 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -333,7 +333,7 @@ static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY -static int ll_fault(struct vm_fault *vmf) +static VM_FAULT_T ll_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #else @@ -382,7 +382,7 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY -static int ll_page_mkwrite(struct vm_fault *vmf) +static VM_FAULT_T ll_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #else From f179572d5c17738277e26b4c7956e6dfe003e90b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sun, 24 Mar 2019 22:47:54 +0000 Subject: [PATCH 045/737] lustre: add HAVE_VM_FAULT_T to config.h Add HAVE_VM_FAULT_T to config.h for 5.1 and 
later kernels Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 657f3cca73c95..5e042baf49d72 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -892,6 +892,9 @@ /* totalram_pages was turned in to a function */ #define HAVE_TOTALRAM_PAGES_FUNC 1 +/* vm_fault_t exists */ +#define HAVE_VM_FAULT_T 1 + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ From fc82b9bfbe497a674836ba04321efc070bbb3e09 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sun, 24 Mar 2019 23:24:05 +0000 Subject: [PATCH 046/737] lustre: adapt to get_ds() removal The ancient get_ds() function was removed upstream - replace it with KERNEL_DS, as the commit also did. Linux commit: 736706bee3298208343a76096370e4f6a5c55915 ("get rid of legacy 'get_ds()' function") [Note: this is one of the pieces of lustre code that makes it x86 specific - the FSx team should address this] Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c index c93e87dcc56d2..62e03bde77221 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -37,9 +37,9 @@ /* Debugging check only needed during development */ #ifdef OBD_CTXT_DEBUG # define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) -# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\ +# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), KERNEL_DS),\ msg) -# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg) +# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), KERNEL_DS), msg) #else # define ASSERT_CTXT_MAGIC(magic) do {} while(0) # define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) From 9f682873bf249af7faf2fa4efe50252f5cf151f8 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 3 Apr 2019 15:40:50 +0000 Subject: [PATCH 047/737] lustre: fix ACL handling This patch comes from Lustre 2.12 and is needed to behave properly when using Lustre 2.10 and a recent kernel like 4.14. Fix errors like: $ getfacl my-file getfacl: my-file: Invalid argument The original patch is: LU-10785 llite: use xattr_handler name for ACLs If struct xattr_handler has a name member then use it (rather than prefix) for the ACL xattrs. This avoids a bug where ACL operations failed for some kernels. Signed-off-by: John L. Hammond Change-Id: I28f6c5dbe3cdc4155e93d388d2c413092e02c082 Reviewed-on: https://review.whamcloud.com/31595 cr https://code.amazon.com/reviews/CR-4309162 Signed-off-by: Aurelien Degremont Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 ++ .../staging/lustrefsx/lustre/llite/xattr.c | 30 ++++++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 5e042baf49d72..79c23e3d239f3 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -826,6 +826,9 @@ /* Define to 1 if you have the header file. 
*/ #define HAVE_UNISTD_H 1 +/* xattr_handler has a name member */ +#define HAVE_XATTR_HANDLER_NAME 1 + /* kernel has vfs_rename with 5 args */ /* #undef HAVE_VFS_RENAME_5ARGS */ diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index 67cc5139f7366..e1e6e34dc0e02 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -45,17 +45,25 @@ #include "llite_internal.h" +#ifndef HAVE_XATTR_HANDLER_NAME +static inline const char *xattr_prefix(const struct xattr_handler *handler) +{ + return handler->prefix; +} +#endif + const struct xattr_handler *get_xattr_type(const char *name) { - int i = 0; + int i; - while (ll_xattr_handlers[i]) { - size_t len = strlen(ll_xattr_handlers[i]->prefix); + for (i = 0; ll_xattr_handlers[i]; i++) { + const char *prefix = xattr_prefix(ll_xattr_handlers[i]); + size_t prefix_len = strlen(prefix); - if (!strncmp(ll_xattr_handlers[i]->prefix, name, len)) + if (!strncmp(prefix, name, prefix_len)) return ll_xattr_handlers[i]; - i++; } + return NULL; } @@ -141,7 +149,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(-EPERM); } - fullname = kasprintf(GFP_KERNEL, "%s%s", handler->prefix, name); + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); if (!fullname) RETURN(-ENOMEM); @@ -451,7 +459,7 @@ static int ll_xattr_get_common(const struct xattr_handler *handler, RETURN(-ENODATA); #endif - fullname = kasprintf(GFP_KERNEL, "%s%s", handler->prefix, name); + fullname = kasprintf(GFP_KERNEL, "%s%s", xattr_prefix(handler), name); if (!fullname) RETURN(-ENOMEM); @@ -762,7 +770,11 @@ static const struct xattr_handler ll_security_xattr_handler = { }; static const struct xattr_handler ll_acl_access_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_ACCESS, +#else .prefix = XATTR_NAME_POSIX_ACL_ACCESS, +#endif .flags = XATTR_ACL_ACCESS_T, #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_common_4_3, @@ -777,7 +789,11 @@ static const struct xattr_handler ll_acl_access_xattr_handler = { }; static const struct xattr_handler ll_acl_default_xattr_handler = { +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_DEFAULT, +#else .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, +#endif .flags = XATTR_ACL_DEFAULT_T, #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_common_4_3, From 1760b9dca7b26214fe7348dd2e9911ee009c7d1f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 20 May 2019 17:56:30 +0000 Subject: [PATCH 048/737] lustre: adapt to stacktrace infrastructure change In 5.2, the stacktrace code was reworked. The lustre code needed to be adapted. 
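As an illustration only, the shape of the old and new APIs; HAVE_COMMON_STACKTRACE is the config.h symbol added here, and the function name is made up:

#include <linux/stacktrace.h>

#define EXAMPLE_MAX_ENTRIES     32

/* example_dump_stack(): made-up name, sketch only */
static void example_dump_stack(void)
{
        static unsigned long entries[EXAMPLE_MAX_ENTRIES];
#ifdef HAVE_COMMON_STACKTRACE
        /* >= 5.2: plain array in, number of saved entries out */
        unsigned int nr = stack_trace_save(entries, EXAMPLE_MAX_ENTRIES, 0);

        stack_trace_print(entries, nr, 0);
#else
        /* older kernels: bookkeeping lives in struct stack_trace */
        struct stack_trace trace = {
                .max_entries    = EXAMPLE_MAX_ENTRIES,
                .entries        = entries,
        };

        save_stack_trace(&trace);
        print_stack_trace(&trace, 0);
#endif
}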
Linux commit 214d8ca6ee854f696f75e75511fe66b409e656db ("stacktrace: Provide common infrastructure") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 +++ .../lustrefsx/libcfs/libcfs/linux/linux-debug.c | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 79c23e3d239f3..409cdebc71437 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -898,6 +898,9 @@ /* vm_fault_t exists */ #define HAVE_VM_FAULT_T 1 +/* Common stacktrace infrastructure exists */ +#define HAVE_COMMON_STACKTRACE 1 + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c index edd19396dd69f..cbd2187a9f63f 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -120,20 +120,29 @@ static DEFINE_SPINLOCK(st_lock); static void libcfs_call_trace(struct task_struct *tsk) { - struct stack_trace trace; static unsigned long entries[MAX_ST_ENTRIES]; +#ifdef HAVE_COMMON_STACKTRACE + unsigned int len; +#else + struct stack_trace trace; trace.nr_entries = 0; trace.max_entries = MAX_ST_ENTRIES; trace.entries = entries; trace.skip = 0; +#endif spin_lock(&st_lock); pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, init_utsname()->release, init_utsname()->version); pr_info("Call Trace:\n"); +#ifdef HAVE_COMMON_STACKTRACE + len = stack_trace_save(entries, MAX_ST_ENTRIES, 2); + stack_trace_print(entries, len, 1); +#else save_stack_trace_tsk(tsk, &trace); print_stack_trace(&trace, 0); +#endif spin_unlock(&st_lock); } From a29bcb9f3eb67320661f2f2e7abb4b1739b18774 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 13 Sep 2019 18:23:35 +0000 Subject: [PATCH 049/737] lustre: fix fall through warnings. Fix fall through warnings in the lustre code now that -Werror=implicit-fallthrough is enabled. 
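As an illustration only (made-up function), this is the annotation pattern applied throughout: gcc's implicit-fallthrough checker treats a comment such as /* Fall through */ placed directly before the next case label as confirmation that the missing break is intentional.

/* example_weight(): made-up function, sketch only */
static int example_weight(int state)
{
        int weight = 0;

        switch (state) {
        case 2:
                weight++;
                /* Fall through */
        case 1:
                weight++;
                break;
        default:
                break;
        }

        return weight;
}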
Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c | 1 + drivers/staging/lustrefsx/lnet/lnet/net_fault.c | 1 + drivers/staging/lustrefsx/lnet/selftest/conctl.c | 1 + drivers/staging/lustrefsx/lnet/selftest/module.c | 6 ++++++ drivers/staging/lustrefsx/lnet/selftest/rpc.c | 9 +++++++++ drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c | 1 + drivers/staging/lustrefsx/lustre/llite/namei.c | 1 + .../staging/lustrefsx/lustre/obdclass/lprocfs_status.c | 4 ++++ drivers/staging/lustrefsx/lustre/obdecho/echo_client.c | 5 +++++ drivers/staging/lustrefsx/lustre/osc/osc_cache.c | 1 + drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c | 6 +++--- 11 files changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c index c4700c0713948..c66870631aa98 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -2299,6 +2299,7 @@ ksocknal_base_shutdown(void) switch (ksocknal_data.ksnd_init) { default: LASSERT (0); + /* Fall through */ case SOCKNAL_INIT_ALL: case SOCKNAL_INIT_DATA: diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c index 56e62423f21c8..b3d5b907a827b 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -638,6 +638,7 @@ delayed_msg_process(struct list_head *msg_list, bool drop) case LNET_CREDIT_OK: lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, msg->msg_len, msg->msg_len); + /* Fall through */ case LNET_CREDIT_WAIT: continue; default: /* failures */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c index 5476097fbc1ba..e7b9d05d8cd32 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conctl.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -151,6 +151,7 @@ lst_debug_ioctl(struct lstio_debug_args *args) case LST_OPC_BATCHSRV: client = 0; + /* Fall through */ case LST_OPC_BATCHCLI: if (name == NULL) goto out; diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c index 2f87742142d4a..56212a840dcc4 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/module.c +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -57,10 +57,13 @@ lnet_selftest_exit(void) switch (lst_init_step) { case LST_INIT_CONSOLE: lstcon_console_fini(); + /* Fall through */ case LST_INIT_FW: sfw_shutdown(); + /* Fall through */ case LST_INIT_RPC: srpc_shutdown(); + /* Fall through */ case LST_INIT_WI_TEST: for (i = 0; i < cfs_cpt_number(lnet_cpt_table()); i++) { @@ -72,12 +75,15 @@ lnet_selftest_exit(void) sizeof(lst_sched_test[0]) * cfs_cpt_number(lnet_cpt_table())); lst_sched_test = NULL; + /* Fall through */ case LST_INIT_WI_SERIAL: cfs_wi_sched_destroy(lst_sched_serial); lst_sched_serial = NULL; + /* Fall through */ case LST_INIT_NONE: break; + /* Fall through */ default: LBUG(); } diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c index abed28104aa69..ed88dfeac7085 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -1020,6 +1020,7 @@ srpc_handle_rpc(swi_workitem_t *wi) ev->ev_status = rc; } } + /* Fall through */ case SWI_STATE_BULK_STARTED: LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); @@ -1218,6 +1219,7 @@ 
srpc_send_rpc (swi_workitem_t *wi) wi->swi_state = SWI_STATE_REQUEST_SENT; /* perhaps more events, fall thru */ + /* Fall through */ case SWI_STATE_REQUEST_SENT: { srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); @@ -1249,6 +1251,7 @@ srpc_send_rpc (swi_workitem_t *wi) wi->swi_state = SWI_STATE_REPLY_RECEIVED; } + /* Fall through */ case SWI_STATE_REPLY_RECEIVED: if (do_bulk && !rpc->crpc_bulkev.ev_fired) break; @@ -1427,6 +1430,7 @@ srpc_lnet_ev_handler(struct lnet_event *ev) srpc_data.rpc_counters.rpcs_sent++; spin_unlock(&srpc_data.rpc_glock); } + /* Fall through */ case SRPC_REPLY_RCVD: case SRPC_BULK_REQ_RCVD: crpc = rpcev->ev_data; @@ -1546,6 +1550,7 @@ srpc_lnet_ev_handler(struct lnet_event *ev) if (!ev->unlinked) break; /* wait for final event */ + /* Fall through */ case SRPC_BULK_PUT_SENT: if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { @@ -1558,6 +1563,7 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_unlock(&srpc_data.rpc_glock); } + /* Fall through */ case SRPC_REPLY_SENT: srpc = rpcev->ev_data; scd = srpc->srpc_scd; @@ -1652,6 +1658,7 @@ srpc_shutdown (void) spin_unlock(&srpc_data.rpc_glock); stt_shutdown(); + /* Fall through */ case SRPC_STATE_EQ_INIT: rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); @@ -1659,9 +1666,11 @@ srpc_shutdown (void) LASSERT (rc == 0); rc = LNetEQFree(srpc_data.rpc_lnet_eq); LASSERT (rc == 0); /* the EQ should have no user by now */ + /* Fall through */ case SRPC_STATE_NI_INIT: LNetNIFini(); + /* Fall through */ } return; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c index 14ac08ade0809..f932c9900dd29 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -1485,6 +1485,7 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, case LDLM_IBITS: if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) break; + /* Fall through */ default: result = LDLM_POLICY_SKIP_LOCK; lock_res_and_lock(lock); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index 0a7b62dd8a5ec..268e11a161e02 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -1162,6 +1162,7 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, switch (mode & S_IFMT) { case 0: mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + /* Fall through */ case S_IFREG: case S_IFCHR: case S_IFBLK: diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index 0c0badb13a95d..d2d72dcde41d5 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -1833,15 +1833,19 @@ static int get_mult(char unit, __u64 *mult) case 'p': case 'P': units <<= 10; + /* Fall through */ case 't': case 'T': units <<= 10; + /* Fall through */ case 'g': case 'G': units <<= 10; + /* Fall through */ case 'm': case 'M': units <<= 10; + /* Fall through */ case 'k': case 'K': units <<= 10; diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c index 1b558f7e0e641..769d59ed21b7e 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -962,14 +962,19 @@ static struct lu_device 
*echo_device_alloc(const struct lu_env *env, CERROR("Cleanup obd device %s error(%d)\n", obd->obd_name, rc2); } + /* Fall through */ case 3: echo_site_fini(env, ed); + /* Fall through */ case 2: cl_device_fini(&ed->ed_cl); + /* Fall through */ case 1: OBD_FREE_PTR(ed); + /* Fall through */ case 0: + /* Fall through */ default: break; } diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c index 06da0c5333a3b..178340e255ac9 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -214,6 +214,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, GOTO(out, rc = 60); if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) GOTO(out, rc = 65); + /* Fall through */ default: if (atomic_read(&ext->oe_users) > 0) GOTO(out, rc = 70); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c index 0b2a13753430c..3e97aa6332ed3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -836,7 +836,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* no break */ + /* Fall through */ default: /* flags might be printed in debug code while message * uninitialized */ @@ -899,7 +899,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* no break */ + /* Fall through */ default: return 0; } @@ -1104,7 +1104,7 @@ int lustre_msg_get_status(struct lustre_msg *msg) return pb->pb_status; CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* no break */ + /* Fall through */ default: /* status might be printed in debug code while message * uninitialized */ From 207d35e22c95f664ac377793206c4bfd58311f99 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 13 Sep 2019 18:24:33 +0000 Subject: [PATCH 050/737] lustre: fix file_lock usage Now that lm_compare_owner is gone, make the usage of it conditional, for older kernels. Linux commit f85d93385e9fe6886a751f647f6812a89bf6bee3 ("locks: Cleanup lm_compare_owner and lm_owner_key") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/llite/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 45cd5a4094308..165141dc52cb5 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -3268,6 +3268,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) } flock.l_flock.pid = file_lock->fl_pid; +#ifdef HAVE_LM_COMPARE_OWNER /* Somewhat ugly workaround for svc lockd. 
* lockd installs custom fl_lmops->lm_compare_owner that checks * for the fl_owner to be the same (which it always is on local node @@ -3277,6 +3278,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) * pointer space for current->files are not intersecting */ if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) flock.l_flock.owner = (unsigned long)file_lock->fl_pid; +#endif switch (fl_type) { case F_RDLCK: From 5369ffbc97a88b443f9fecb0c374f6aa8e43886b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 13 Sep 2019 18:38:00 +0000 Subject: [PATCH 051/737] lustre: fix lnet makefile Fix the lnet Makefile to use obj- instead of subdir- Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lnet/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/staging/lustrefsx/lnet/Makefile b/drivers/staging/lustrefsx/lnet/Makefile index 85a225e57b290..7ee52eb559025 100644 --- a/drivers/staging/lustrefsx/lnet/Makefile +++ b/drivers/staging/lustrefsx/lnet/Makefile @@ -1,3 +1,3 @@ -subdir-$(CONFIG_LUSTREFSX_LNET) += lnet/ -subdir-$(CONFIG_LUSTREFSX_LNET) += klnds/ -subdir-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += selftest/ +obj-$(CONFIG_LUSTREFSX_LNET) += lnet/ +obj-$(CONFIG_LUSTREFSX_LNET) += klnds/ +obj-$(CONFIG_LUSTREFSX_LNET_SELFTEST) += selftest/ From b182e2f18b7851f9b25e156554ea9cdd99e92746 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Sat, 12 Oct 2019 17:35:41 +0000 Subject: [PATCH 052/737] lustre: adapt to changed padata interfaces. 5.4 has changed padata interfaces, so adapt to them. Linux commits: b128a30409356df65f1a51cff3eb986cac8cfedc ("padata: allocate workqueue internally") e6ce0e0807e90d38a2cefa524ac253d7a85c3f2f ("padata: make padata_do_parallel find alternate callback CPU") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 +++ .../libcfs/include/libcfs/libcfs_ptask.h | 2 ++ .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 16 ++++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 409cdebc71437..d5cf2879b0d98 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -901,6 +901,9 @@ /* Common stacktrace infrastructure exists */ #define HAVE_COMMON_STACKTRACE 1 +/* changed padata interface in 5.4 */ +#define HAVE_PADATA_INTERFACE_54 + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h index be78b503d651e..586a04446cae8 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h @@ -25,7 +25,9 @@ struct padata_instance {}; struct cfs_ptask_engine { struct padata_instance *pte_pinst; +#ifndef HAVE_PADATA_INTERFACE_54 struct workqueue_struct *pte_wq; +#endif struct notifier_block pte_notifier; int pte_weight; }; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c index 275c01b74ad4e..1b7603cade8a9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c @@ -137,7 +137,11 @@ static int cfs_do_parallel(struct cfs_ptask_engine *engine, ptask->pt_result = -EINPROGRESS; retry: +#ifdef HAVE_PADATA_INTERFACE_54 + rc = padata_do_parallel(engine->pte_pinst, padata, 
&ptask->pt_cbcpu); +#else rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); +#endif if (rc == -EBUSY && cfs_ptask_is_retry(ptask)) { /* too many tasks already in queue */ schedule_timeout_uninterruptible(1); @@ -326,14 +330,18 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, { cpumask_var_t all_mask; cpumask_var_t par_mask; +#ifndef HAVE_PADATA_INTERFACE_54 unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE; +#endif int rc; get_online_cpus(); +#ifndef HAVE_PADATA_INTERFACE_54 engine->pte_wq = alloc_workqueue(name, wq_flags, 1); if (engine->pte_wq == NULL) GOTO(err, rc = -ENOMEM); +#endif if (!alloc_cpumask_var(&all_mask, GFP_KERNEL)) GOTO(err_destroy_workqueue, rc = -ENOMEM); @@ -384,7 +392,11 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, } engine->pte_weight = cpumask_weight(par_mask); +#ifdef HAVE_PADATA_INTERFACE_54 + engine->pte_pinst = padata_alloc_possible(name); +#else engine->pte_pinst = padata_alloc_possible(engine->pte_wq); +#endif if (engine->pte_pinst == NULL) GOTO(err_free_par_mask, rc = -ENOMEM); @@ -418,8 +430,10 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, err_free_all_mask: free_cpumask_var(all_mask); err_destroy_workqueue: +#ifndef HAVE_PADATA_INTERFACE_54 destroy_workqueue(engine->pte_wq); err: +#endif put_online_cpus(); return rc; } @@ -430,7 +444,9 @@ static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) padata_unregister_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); padata_free(engine->pte_pinst); +#ifndef HAVE_PADATA_INTERFACE_54 destroy_workqueue(engine->pte_wq); +#endif } #else /* !CONFIG_PADATA */ From b2897d40295a76c1a83f8277e67c0f14a2e7aac5 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 29 Aug 2019 19:52:28 +0000 Subject: [PATCH 053/737] iommu: use config option to specify if iommu mode should be strict Introduce a config (IOMMU_DEFAULT_STRICT) that specifies whether the iommu mode should be strict by default. The normal default is 'n'. This is setting is only used on arm64. Signed-off-by: Frank van der Linden --- drivers/iommu/Kconfig | 4 ++++ drivers/iommu/iommu.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 04878caf6da49..806af9c742ab4 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -103,6 +103,10 @@ config IOMMU_DMA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH +config IOMMU_DEFAULT_STRICT + def_bool n + depends on IOMMU_API && (ARM || ARM64) + config FSL_PAMU bool "Freescale IOMMU support" depends on PCI diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9d65557dfb2ce..1849078eb784c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -29,7 +29,12 @@ static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static unsigned int iommu_def_domain_type __read_mostly; + +#ifdef CONFIG_IOMMU_DEFAULT_STRICT static bool iommu_dma_strict __read_mostly = true; +#else +static bool iommu_dma_strict __read_mostly = false; +#endif static u32 iommu_cmd_line __read_mostly; struct iommu_group { From 50da0fb7bcb200ffa039ab757ba2eee9fff20575 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Thu, 15 Aug 2019 22:26:27 +0000 Subject: [PATCH 054/737] xen: Restore xen-pirqs on resume from hibernation shutdown_pirq is invoked during hibernation path and hence PIRQs should be restarted during resume. 
[Commit: xen: Only restore the ACPI SCI interrupt in xen_restore_pirqs] restores only ACPI SCI interrupt however, that is not the right thing to do as all pirqs should be enabled as a part of resume_device_irqs during suspend/resume device interrupts. Apparently, chip->irq_startup is called only if IRQD_IRQ_STARTED is unset during irq_startup on resume. This flag gets cleared by free_irq->irq_shutdown during suspend. free_irq() never gets explicitly called for ioapic-edge and ioapic-level interrupts as respective drivers do nothing during suspend/resume. So we shut them down explicitly in the first place in syscore_suspend path to clear IRQ<>event channel mapping. shutdown_pirq being called explicitly during suspend does not clear this flags, hence .irq_enable is called in irq_startup during resume instead and pirq's never start up. This commit exports irq_state_clr_started API to clear the flag during shutdown_pirq. Also, following the order in which ipis/virqs/pirqs are restored during xen resume, the same order should be followed for hibernation path. As per the flow of hibernation_platform_enter, we should not restore pirqs explicitly in syscore_resume ops and it should be done in resume devices path. Signed-off-by: Anchal Agarwal --- arch/x86/xen/suspend.c | 1 - drivers/xen/events/events_base.c | 42 +------------------------------- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 4 +-- 4 files changed, 5 insertions(+), 44 deletions(-) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 39644923b623e..8be6ffa6bfbea 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -133,7 +133,6 @@ static void xen_syscore_resume(void) gnttab_resume(); - xen_restore_pirqs(); } /* diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 064999298c393..d2088fae608a7 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2066,50 +2066,10 @@ void xen_shutdown_pirqs(void) continue; shutdown_pirq(irq_get_irq_data(info->irq)); + irq_state_clr_started(irq_to_desc(info->irq)); } } -/* - * For now, only restore the ACPI SCI pirq. 
- */ -void xen_restore_pirqs(void) -{ -#ifdef CONFIG_ACPI - int pirq, rc, irq, gsi; - struct physdev_map_pirq map_irq; - struct irq_info *info; - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - - pirq = info->u.pirq.pirq; - gsi = info->u.pirq.gsi; - irq = info->irq; - - if (gsi != acpi_gbl_FADT.sci_interrupt) - continue; - - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_GSI; - map_irq.index = gsi; - map_irq.pirq = pirq; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - pr_warn("xen: ACPI SCI interrupt map failed, rc=%d\n", - rc); - xen_free_irq(irq); - continue; - } - - printk(KERN_DEBUG "xen: restored ACPI SCI interrupt\n"); - - __startup_pirq(irq); - } -#endif -} - static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", diff --git a/include/linux/irq.h b/include/linux/irq.h index b89a8ac83d1bc..858fc5efd85fd 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -810,6 +810,8 @@ extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); +extern void irq_state_clr_started(struct irq_desc *desc); + static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e7d284261d450..75c7aed4deaa6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -173,11 +173,11 @@ static void irq_state_clr_masked(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_clr_started(struct irq_desc *desc) +void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); } - +EXPORT_SYMBOL_GPL(irq_state_clr_started); static void irq_state_set_started(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); From a05978c40ffca9db2263e26fc434537b5553affc Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 27 Nov 2019 22:14:28 +0000 Subject: [PATCH 055/737] block/xen-blkfront: bump the maximum number of indirect segments up to 64 Bump the maximum number of indirect segments up to 64. For blk-mq without a scheduler, which is the default for multi-queued devices, this is needed to avoid a throughput regression for 'st1' EBS volumes. On a c4.8xlarge instance: sudo fio --bs=1M --name=seqread --ioengine=libaio --rw=read --direct=1 --filename=/dev/sdg --clocksource=clock_gettime --size=1G --numjobs=8 before: READ: io=8192.0MB, aggrb=44772KB/s, minb=5596KB/s, maxb=5922KB/s, mint=177050msec, maxt=187361msec after: READ: io=8192.0MB, aggrb=136059KB/s, minb=17007KB/s, maxb=17022KB/s, mint=61600msec, maxt=61654msec Signed-off-by: Frank van der Linden --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index e9dbdf68b99b3..61520f25695ea 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -138,7 +138,7 @@ static LIST_HEAD(info_list); * by the backend driver. 
*/ -static unsigned int xen_blkif_max_segments = 32; +static unsigned int xen_blkif_max_segments = 64; module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444); MODULE_PARM_DESC(max_indirect_segments, "Maximum amount of segments in indirect requests (default is 32)"); From 009611e1bf6fd2ba6dde7f5079bdf45f5947cf9c Mon Sep 17 00:00:00 2001 From: Andy Strohman Date: Wed, 4 Dec 2019 20:04:51 +0000 Subject: [PATCH 056/737] lustre: hold lock while walking changelog dev list This patch came from the 2.10.5 branch of AmazonFSxLustreClient repo. The patch from that repo is: mdc: hold lock while walking changelog dev list (LU-12566) Prevent the following GPF which is causing stuck processes, when running mount and umount concurrently on the same host. general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:mdc_changelog_cdev_finish+0x3f/0x1b9 [mdc] ... Call Trace: mdc_precleanup+0x2a/0x3c0 [mdc] Original patch was: LU-12566 mdc: hold lock while walking changelog dev list In mdc_changelog_cdev_finish() we need chlg_registered_dev_lock while walking and changing entries on the chlog_registered_devs and ced_obds lists in chlg_registered_dev_find_by_obd(). Move the calling of chlg_registered_dev_find_by_obd() under the mutex, and add assertions to the places where the lists are walked and changed that the mutex is held. Lustre-change: https://review.whamcloud.com/35668 Lustre-commit: a260c530801db7f58efa93b774f06b0ce72649a3 Fixes: 1d40214d96dd ("LU-7659 mdc: expose changelog through char devices") Signed-off-by: Andreas Dilger Change-Id: Ib62fdff87cde6a4bcfb9bea24a2ea72a933ebbe5 Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/35835 Signed-off-by: Andy Strohman --- drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c index 8431b1c26622b..c99a3bacf24d6 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -612,6 +612,7 @@ chlg_registered_dev_find_by_name(const char *name) { struct chlg_registered_dev *dit; + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); list_for_each_entry(dit, &chlg_registered_devices, ced_link) if (strcmp(name, dit->ced_name) == 0) return dit; @@ -630,6 +631,7 @@ chlg_registered_dev_find_by_obd(const struct obd_device *obd) struct chlg_registered_dev *dit; struct obd_device *oit; + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); list_for_each_entry(dit, &chlg_registered_devices, ced_link) list_for_each_entry(oit, &dit->ced_obds, u.cli.cl_chg_dev_linkage) @@ -702,6 +704,7 @@ static void chlg_dev_clear(struct kref *kref) ced_refs); ENTRY; + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); list_del(&entry->ced_link); misc_deregister(&entry->ced_misc); OBD_FREE_PTR(entry); @@ -713,10 +716,11 @@ static void chlg_dev_clear(struct kref *kref) */ void mdc_changelog_cdev_finish(struct obd_device *obd) { - struct chlg_registered_dev *dev = chlg_registered_dev_find_by_obd(obd); - ENTRY; + struct chlg_registered_dev *dev; + ENTRY; mutex_lock(&chlg_registered_dev_lock); + dev = chlg_registered_dev_find_by_obd(obd); list_del_init(&obd->u.cli.cl_chg_dev_linkage); kref_put(&dev->ced_refs, chlg_dev_clear); mutex_unlock(&chlg_registered_dev_lock); From ae03c200ea8e9fb7faf51aba8ed845564ba364d2 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 6 Mar 2020 00:11:57 +0000 Subject: [PATCH 
057/737] ena: update to 2.2.3

Update the ena driver to version 2.2.3. Release notes for versions since 2.1.3:

**Bug Fixes**
* Revert VXLAN TX checksum offloading support due to issues with other tunnel types.
* Avoid unnecessary constant rearming of interrupt vector when busy-polling.

**Bug Fixes**
* Fix compilation error in SLES 12 SP5.

**Bug Fixes**
* Fix incorrect parameter to ena_indirection_table_get() in kernels in range [3.8, 3.16).

**New Features**
* Implement XDP support for DROP and TX actions.
* Add VXLAN TX checksum offloading support.
* Map rx buffers bidirectionally to support traffic mirroring.
* Introduce disable meta descriptor caching feature required by llq accelerated mode.
* Revert extra_properties feature implementation via ethtool priv-flags.
* Support set_channels() callback in ethtool.

**Bug Fixes**
* Fix multiple issues with the RSS feature.
* Fix uses of round_jiffies() in timer_service.
* Add missing ethtool TX timestamping indication.
* Fix ENA_REGS_RESET_DRIVER_INVALID_STATE error during hibernation.
* Fix race condition causing an incorrect wake up of a TX queue when it is down.
* Fix dim exported symbol conflicts by removing all EXPORT_SYMBOL directives from dim files.
* Fix first interrupt accounting in XDP by adding first_interrupt field to napi struct.
* Fix napi handler misbehavior when the napi budget is zero.
* Add max value check in ena_set_channels() to disallow setting the number of queues to a higher than allowed maximum number.
* Fix race condition when setting the number of queues immediately after loading the driver, which caused a crash when changing the number of queues to a larger number than currently set.
* Fix incorrect setting of number of msix vectors according to num_io_queues, causing a crash when changing the number of queues to a larger number after driver reset.
* Fix ena_tx_timeout() signature for kernels >= 5.5.

**Minor Changes**
* Add RX drops and TX drops counters to ethtool -S command.
* Aggregate accelerated mode features under struct ena_admin_accel_mode_req, currently including the new disable meta descriptor caching feature and the existing max tx burst size feature.
* Add debug prints to failed commands.
* Make ena rxfh support ETH_RSS_HASH_NO_CHANGE.
* Change MTU parameter to be unsigned in ena_com_set_dev_mtu().
* Remove unused ena_restore_ethtool_params() and relevant fields.
* Use combined channels instead of separate RX/TX channels in ethtool -l/L.
* Use SHUTDOWN as reset reason when closing interface.
* Change RSS default function on probe to Toeplitz.
* Enable setting the RSS hash function only, without changing the key, in ethtool.
* Remove superfluous print of number of queues during ena_up().
* Remove unnecessary parentheses to pass checkpatch.
* Add unmask interrupts statistics to ethtool.
* Arrange local variables in ena_com_fill_hash_function() in reverse christmas tree order.
* Separate RSS hash function retrieval and RSS key retrieval into 2 different functions.

**New Features**
* Add support for the RX offset feature - where the device writes data with an offset from the beginning of an RX buffer.
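For reviewers unfamiliar with the dynamic interrupt moderation (DIM) library this update imports (see dim.h/dim.c below), the sketch that follows shows the consumer pattern the API is designed for. It is illustrative only and not part of this patch: my_ring, my_set_rx_moderation() and the counter fields are hypothetical stand-ins for the ena-specific plumbing.

#include <linux/workqueue.h>
#include "dim.h"

/* Hypothetical per-ring state; the real driver keeps equivalent counters. */
struct my_ring {
	struct dim dim;		/* dim.mode and dim.work are set up at init time */
	u16 intr_count;
	u64 rx_packets;
	u64 rx_bytes;
};

/* Hypothetical helper: write usec/pkts to the device's moderation registers. */
static void my_set_rx_moderation(u16 usec, u16 pkts)
{
	/* device-specific register programming goes here */
}

/* Work handler run when net_dim() decides on a new moderation profile. */
static void my_dim_work(struct work_struct *w)
{
	struct dim *dim = container_of(w, struct dim, work);
	struct dim_cq_moder cur =
		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);

	my_set_rx_moderation(cur.usec, cur.pkts);
	dim->state = DIM_START_MEASURE;
}

/* Called from the napi poll path: feed the latest counters to DIM. */
static void my_napi_poll_done(struct my_ring *ring)
{
	struct dim_sample sample;

	dim_update_sample(ring->intr_count, ring->rx_packets,
			  ring->rx_bytes, &sample);
	net_dim(&ring->dim, sample);	/* may schedule ring->dim.work */
}

At init time the consumer would INIT_WORK(&ring->dim.work, my_dim_work) and pick dim.mode (e.g. DIM_CQ_PERIOD_MODE_START_FROM_EQE); both steps are omitted above for brevity.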
Signed-off-by: Frank van der Linden --- drivers/amazon/net/ena/Makefile | 3 +- drivers/amazon/net/ena/dim.c | 82 + drivers/amazon/net/ena/dim.h | 338 ++++ drivers/amazon/net/ena/ena_admin_defs.h | 65 +- drivers/amazon/net/ena/ena_com.c | 493 +++--- drivers/amazon/net/ena/ena_com.h | 261 +-- drivers/amazon/net/ena/ena_common_defs.h | 3 +- drivers/amazon/net/ena/ena_eth_com.c | 106 +- drivers/amazon/net/ena/ena_eth_com.h | 29 +- drivers/amazon/net/ena/ena_eth_io_defs.h | 7 +- drivers/amazon/net/ena/ena_ethtool.c | 384 +++-- drivers/amazon/net/ena/ena_netdev.c | 1885 ++++++++++++++++------ drivers/amazon/net/ena/ena_netdev.h | 150 +- drivers/amazon/net/ena/ena_regs_defs.h | 4 +- drivers/amazon/net/ena/ena_sysfs.c | 166 +- drivers/amazon/net/ena/kcompat.h | 102 +- drivers/amazon/net/ena/net_dim.c | 245 +++ 17 files changed, 2857 insertions(+), 1466 deletions(-) create mode 100644 drivers/amazon/net/ena/dim.c create mode 100644 drivers/amazon/net/ena/dim.h create mode 100644 drivers/amazon/net/ena/net_dim.c diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 0e671d0b389d4..33b4a08d38f4a 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o -ena-y := ena_netdev.o ena_com.o ena_eth_com.o ena_ethtool.o +ena-y := ena_netdev.o ena_com.o ena_eth_com.o ena_ethtool.o net_dim.o \ + dim.o ena-$(CONFIG_SYSFS) += ena_sysfs.o diff --git a/drivers/amazon/net/ena/dim.c b/drivers/amazon/net/ena/dim.c new file mode 100644 index 0000000000000..1b200be4b3709 --- /dev/null +++ b/drivers/amazon/net/ena/dim.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +bool dim_on_top(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + return true; + case DIM_GOING_RIGHT: + return (dim->steps_left > 1) && (dim->steps_right == 1); + default: /* DIM_GOING_LEFT */ + return (dim->steps_right > 1) && (dim->steps_left == 1); + } +} + +void dim_turn(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + dim->tune_state = DIM_GOING_LEFT; + dim->steps_left = 0; + break; + case DIM_GOING_LEFT: + dim->tune_state = DIM_GOING_RIGHT; + dim->steps_right = 0; + break; + } +} + +void dim_park_on_top(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tired = 0; + dim->tune_state = DIM_PARKING_ON_TOP; +} + +void dim_park_tired(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tune_state = DIM_PARKING_TIRED; +} + +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats) +{ + /* u32 holds up to 71 minutes, should be enough */ + u32 delta_us = ktime_us_delta(end->time, start->time); + u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr); + u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr, + start->byte_ctr); + u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr, + start->comp_ctr); + + if (!delta_us) + return; + + curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); + curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); + curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC, + delta_us); + curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us); + if 
(curr_stats->epms != 0) + curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL( + curr_stats->cpms * 100, curr_stats->epms); + else + curr_stats->cpe_ratio = 0; + +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ diff --git a/drivers/amazon/net/ena/dim.h b/drivers/amazon/net/ena/dim.h new file mode 100644 index 0000000000000..633c2473e73ad --- /dev/null +++ b/drivers/amazon/net/ena/dim.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef DIM_H +#define DIM_H + +#include +#include "kcompat.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/** + * Number of events between DIM iterations. + * Causes a moderation of the algorithm run. + */ +#define DIM_NEVENTS 64 + +/** + * Is a difference between values justifies taking an action. + * We consider 10% difference as significant. + */ +#define IS_SIGNIFICANT_DIFF(val, ref) \ + (((100UL * abs((val) - (ref))) / (ref)) > 10) + +/** + * Calculate the gap between two values. + * Take wrap-around and variable size into consideration. + */ +#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \ + & (BIT_ULL(bits) - 1)) + +/** + * Structure for CQ moderation values. + * Used for communications between DIM and its consumer. + * + * @usec: CQ timer suggestion (by DIM) + * @pkts: CQ packet counter suggestion (by DIM) + * @cq_period_mode: CQ priod count mode (from CQE/EQE) + */ +struct dim_cq_moder { + u16 usec; + u16 pkts; + u16 comps; + u8 cq_period_mode; +}; + +/** + * Structure for DIM sample data. + * Used for communications between DIM and its consumer. + * + * @time: Sample timestamp + * @pkt_ctr: Number of packets + * @byte_ctr: Number of bytes + * @event_ctr: Number of events + */ +struct dim_sample { + ktime_t time; + u32 pkt_ctr; + u32 byte_ctr; + u16 event_ctr; + u32 comp_ctr; +}; + +/** + * Structure for DIM stats. + * Used for holding current measured rates. + * + * @ppms: Packets per msec + * @bpms: Bytes per msec + * @epms: Events per msec + */ +struct dim_stats { + int ppms; /* packets per msec */ + int bpms; /* bytes per msec */ + int epms; /* events per msec */ + int cpms; /* completions per msec */ + int cpe_ratio; /* ratio of completions to events */ +}; + +/** + * Main structure for dynamic interrupt moderation (DIM). + * Used for holding all information about a specific DIM instance. + * + * @state: Algorithm state (see below) + * @prev_stats: Measured rates from previous iteration (for comparison) + * @start_sample: Sampled data at start of current iteration + * @work: Work to perform on action required + * @priv: A pointer to the struct that points to dim + * @profile_ix: Current moderation profile + * @mode: CQ period count mode + * @tune_state: Algorithm tuning state (see below) + * @steps_right: Number of steps taken towards higher moderation + * @steps_left: Number of steps taken towards lower moderation + * @tired: Parking depth counter + */ +struct dim { + u8 state; + struct dim_stats prev_stats; + struct dim_sample start_sample; + struct dim_sample measuring_sample; + struct work_struct work; + void *priv; + u8 profile_ix; + u8 mode; + u8 tune_state; + u8 steps_right; + u8 steps_left; + u8 tired; +}; + +/** + * enum dim_cq_period_mode + * + * These are the modes for CQ period count. 
+ * + * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE + * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) + * @DIM_CQ_PERIOD_NUM_MODES: Number of modes + */ +enum { + DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + DIM_CQ_PERIOD_NUM_MODES +}; + +/** + * enum dim_state + * + * These are the DIM algorithm states. + * These will determine if the algorithm is in a valid state to start an iteration. + * + * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) + * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if + * need to perform an action + * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure + */ +enum { + DIM_START_MEASURE, + DIM_MEASURE_IN_PROGRESS, + DIM_APPLY_NEW_PROFILE, +}; + +/** + * enum dim_tune_state + * + * These are the DIM algorithm tune states. + * These will determine which action the algorithm should perform. + * + * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference + * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0 + * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels + * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels + */ +enum { + DIM_PARKING_ON_TOP, + DIM_PARKING_TIRED, + DIM_GOING_RIGHT, + DIM_GOING_LEFT, +}; + +/** + * enum dim_stats_state + * + * These are the DIM algorithm statistics states. + * These will determine the verdict of current iteration. + * + * @DIM_STATS_WORSE: Current iteration shows worse performance than before + * @DIM_STATS_WORSE: Current iteration shows same performance than before + * @DIM_STATS_WORSE: Current iteration shows better performance than before + */ +enum { + DIM_STATS_WORSE, + DIM_STATS_SAME, + DIM_STATS_BETTER, +}; + +/** + * enum dim_step_result + * + * These are the DIM algorithm step results. + * These describe the result of a step. + * + * @DIM_STEPPED: Performed a regular step + * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to + * tired parking + * @DIM_ON_EDGE: Stepped to the most left/right profile + */ +enum { + DIM_STEPPED, + DIM_TOO_TIRED, + DIM_ON_EDGE, +}; + +/** + * dim_on_top - check if current state is a good place to stop (top location) + * @dim: DIM context + * + * Check if current profile is a good place to park at. + * This will result in reducing the DIM checks frequency as we assume we + * shouldn't probably change profiles, unless traffic pattern wasn't changed. + */ +bool dim_on_top(struct dim *dim); + +/** + * dim_turn - change profile alterning direction + * @dim: DIM context + * + * Go left if we were going right and vice-versa. + * Do nothing if currently parking. + */ +void dim_turn(struct dim *dim); + +/** + * dim_park_on_top - enter a parking state on a top location + * @dim: DIM context + * + * Enter parking state. + * Clear all movement history. + */ +void dim_park_on_top(struct dim *dim); + +/** + * dim_park_tired - enter a tired parking state + * @dim: DIM context + * + * Enter parking state. + * Clear all movement history and cause DIM checks frequency to reduce. + */ +void dim_park_tired(struct dim *dim); + +/** + * dim_calc_stats - calculate the difference between two samples + * @start: start sample + * @end: end sample + * @curr_stats: delta between samples + * + * Calculate the delta between two samples (in data rates). + * Takes into consideration counter wrap-around. 
+ */ +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats); + +/** + * dim_update_sample - set a sample's fields with give values + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @s: DIM sample + */ +static inline void +dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s) +{ + s->time = ktime_get(); + s->pkt_ctr = packets; + s->byte_ctr = bytes; + s->event_ctr = event_ctr; +} + +/** + * dim_update_sample_with_comps - set a sample's fields with given + * values including the completion parameter + * @event_ctr: number of events to set + * @packets: number of packets to set + * @bytes: number of bytes to set + * @comps: number of completions to set + * @s: DIM sample + */ +static inline void +dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps, + struct dim_sample *s) +{ + dim_update_sample(event_ctr, packets, bytes, s); + s->comp_ctr = comps; +} + +/* Net DIM */ + +/** + * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_rx_moderation - provide the default RX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode); + +/** + * net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile + * @cq_period_mode: CQ period mode + * @ix: Profile index + */ +struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix); + +/** + * net_dim_get_def_tx_moderation - provide the default TX moderation + * @cq_period_mode: CQ period mode + */ +struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); + +/** + * net_dim - main DIM algorithm entry point + * @dim: DIM instance information + * @end_sample: Current data measurement + * + * Called by the consumer. + * This is the main logic of the algorithm, where data is processed in order to decide on next + * required action. + */ +void net_dim(struct dim *dim, struct dim_sample end_sample); + +/* RDMA DIM */ + +/* + * RDMA DIM profile: + * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES. + */ +#define RDMA_DIM_PARAMS_NUM_PROFILES 9 +#define RDMA_DIM_START_PROFILE 0 + +/** + * rdma_dim - Runs the adaptive moderation. + * @dim: The moderation struct. + * @completions: The number of completions collected in this round. + * + * Each call to rdma_dim takes the latest amount of completions that + * have been collected and counts them as a new event. + * Once enough events have been collected the algorithm decides a new + * moderation level. + */ +void rdma_dim(struct dim *dim, u64 completions); + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ + +#endif /* DIM_H */ diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index 8da5f41d28fdb..c1836183d2934 100755 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. 
* @@ -409,6 +408,10 @@ struct ena_admin_basic_stats { u32 rx_drops_low; u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; }; struct ena_admin_acq_get_stats_resp { @@ -492,6 +495,36 @@ enum ena_admin_llq_stride_ctrl { ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY = 2, }; +enum ena_admin_accel_mode_feat { + ENA_ADMIN_DISABLE_META_CACHING = 0, + ENA_ADMIN_LIMIT_TX_BURST = 1, +}; + +struct ena_admin_accel_mode_get { + /* bit field of enum ena_admin_accel_mode_feat */ + u16 supported_flags; + + /* maximum burst size between two doorbells. The size is in bytes */ + u16 max_tx_burst_size; +}; + +struct ena_admin_accel_mode_set { + /* bit field of enum ena_admin_accel_mode_feat */ + u16 enabled_flags; + + u16 reserved; +}; + +struct ena_admin_accel_mode_req { + union { + u32 raw[2]; + + struct ena_admin_accel_mode_get get; + + struct ena_admin_accel_mode_set set; + } u; +}; + struct ena_admin_feature_llq_desc { u32 max_llq_num; @@ -537,10 +570,13 @@ struct ena_admin_feature_llq_desc { /* the stride control the driver selected to use */ u16 descriptors_stride_ctrl_enabled; - /* Maximum size in bytes taken by llq entries in a single tx burst. - * Set to 0 when there is no such limit. + /* reserved */ + u32 reserved1; + + /* accelerated low latency queues requirment. driver needs to + * support those requirments in order to use accelerated llq */ - u32 max_tx_burst_size; + struct ena_admin_accel_mode_req accel_mode; }; struct ena_admin_queue_ext_feature_fields { @@ -821,6 +857,14 @@ struct ena_admin_host_info { u16 num_cpus; u16 reserved; + + /* 0 : mutable_rss_table_size + * 1 : rx_offset + * 2 : interrupt_moderation + * 3 : map_rx_buf_bidirectional + * 31:4 : reserved + */ + u32 driver_supported_features; }; struct ena_admin_rss_ind_table_entry { @@ -1033,6 +1077,10 @@ struct ena_admin_aenq_keep_alive_desc { u32 rx_drops_low; u32 rx_drops_high; + + u32 tx_drops_low; + + u32 tx_drops_high; }; struct ena_admin_ena_mmio_req_read_less_resp { @@ -1132,6 +1180,13 @@ struct ena_admin_ena_mmio_req_read_less_resp { #define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) #define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 #define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_MUTABLE_RSS_TABLE_SIZE_MASK BIT(0) +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1 +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1) +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 +#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) +#define ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_MASK BIT(3) /* feature_rss_ind_table */ #define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) @@ -1142,4 +1197,4 @@ struct ena_admin_ena_mmio_req_read_less_resp { /* aenq_link_change_desc */ #define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) -#endif /*_ENA_ADMIN_H_ */ +#endif /* _ENA_ADMIN_H_ */ diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 0c6baa03cfba2..45278d4baf808 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -64,6 +64,15 @@ #define ENA_POLL_MS 5 +/* Default Microsoft RSS key, used for HRSS. 
*/ +static const u8 rss_hash_key[ENA_HASH_KEY_SIZE] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; + /*****************************************************************************/ /*****************************************************************************/ /*****************************************************************************/ @@ -91,7 +100,7 @@ struct ena_com_stats_ctx { struct ena_admin_acq_get_stats_resp get_resp; }; -static inline int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, +static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, struct ena_common_mem_addr *ena_addr, dma_addr_t addr) { @@ -111,11 +120,11 @@ static int ena_com_admin_init_sq(struct ena_com_admin_queue *queue) struct ena_com_admin_sq *sq = &queue->sq; u16 size = ADMIN_SQ_SIZE(queue->q_depth); - sq->entries = dma_alloc_coherent(queue->q_dmadev, size, &sq->dma_addr, + sq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &sq->dma_addr, GFP_KERNEL); if (!sq->entries) { - pr_err("memory allocation failed"); + pr_err("memory allocation failed\n"); return -ENOMEM; } @@ -133,11 +142,11 @@ static int ena_com_admin_init_cq(struct ena_com_admin_queue *queue) struct ena_com_admin_cq *cq = &queue->cq; u16 size = ADMIN_CQ_SIZE(queue->q_depth); - cq->entries = dma_alloc_coherent(queue->q_dmadev, size, &cq->dma_addr, + cq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &cq->dma_addr, GFP_KERNEL); if (!cq->entries) { - pr_err("memory allocation failed"); + pr_err("memory allocation failed\n"); return -ENOMEM; } @@ -156,11 +165,11 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *dev, dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); - aenq->entries = dma_alloc_coherent(dev->dmadev, size, &aenq->dma_addr, + aenq->entries = dma_zalloc_coherent(dev->dmadev, size, &aenq->dma_addr, GFP_KERNEL); if (!aenq->entries) { - pr_err("memory allocation failed"); + pr_err("memory allocation failed\n"); return -ENOMEM; } @@ -190,7 +199,7 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *dev, return 0; } -static inline void comp_ctxt_release(struct ena_com_admin_queue *queue, +static void comp_ctxt_release(struct ena_com_admin_queue *queue, struct ena_comp_ctx *comp_ctx) { comp_ctx->occupied = false; @@ -206,6 +215,11 @@ static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue, return NULL; } + if (unlikely(!queue->comp_ctx)) { + pr_err("Completion context is NULL\n"); + return NULL; + } + if (unlikely(queue->comp_ctx[command_id].occupied && capture)) { pr_err("Completion context is occupied\n"); return NULL; @@ -277,7 +291,7 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu return comp_ctx; } -static inline int ena_com_init_comp_ctxt(struct ena_com_admin_queue *queue) +static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *queue) { size_t size = queue->q_depth * sizeof(struct ena_comp_ctx); struct ena_comp_ctx *comp_ctx; @@ -285,7 +299,7 @@ static inline int ena_com_init_comp_ctxt(struct ena_com_admin_queue *queue) queue->comp_ctx = devm_kzalloc(queue->q_dmadev, size, GFP_KERNEL); if (unlikely(!queue->comp_ctx)) { - pr_err("memory allocation failed"); + pr_err("memory allocation failed\n"); return -ENOMEM; } @@ -345,19 +359,19 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, dev_node = 
dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->desc_addr.virt_addr = - dma_alloc_coherent(ena_dev->dmadev, size, + dma_zalloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->desc_addr.virt_addr) { io_sq->desc_addr.virt_addr = - dma_alloc_coherent(ena_dev->dmadev, size, + dma_zalloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); } if (!io_sq->desc_addr.virt_addr) { - pr_err("memory allocation failed"); + pr_err("memory allocation failed\n"); return -ENOMEM; } } @@ -383,7 +397,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); if (!io_sq->bounce_buf_ctrl.base_buffer) { - pr_err("bounce buffer memory allocation failed"); + pr_err("bounce buffer memory allocation failed\n"); return -ENOMEM; } @@ -397,6 +411,8 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, 0x0, io_sq->llq_info.desc_list_entry_size); io_sq->llq_buf_ctrl.descs_left_in_line = io_sq->llq_info.descs_num_before_header; + io_sq->disable_meta_caching = + io_sq->llq_info.disable_meta_caching; if (io_sq->llq_info.max_entries_in_tx_burst > 0) io_sq->entries_in_tx_burst_left = @@ -431,18 +447,18 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, prev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); io_cq->cdesc_addr.virt_addr = - dma_alloc_coherent(ena_dev->dmadev, size, + dma_zalloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); set_dev_node(ena_dev->dmadev, prev_node); if (!io_cq->cdesc_addr.virt_addr) { io_cq->cdesc_addr.virt_addr = - dma_alloc_coherent(ena_dev->dmadev, size, + dma_zalloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); } if (!io_cq->cdesc_addr.virt_addr) { - pr_err("memory allocation failed"); + pr_err("memory allocation failed\n"); return -ENOMEM; } @@ -520,9 +536,6 @@ static int ena_com_comp_status_to_errno(u8 comp_status) if (unlikely(comp_status != 0)) pr_err("admin command failed[%u]\n", comp_status); - if (unlikely(comp_status > ENA_ADMIN_UNKNOWN_ERROR)) - return -EINVAL; - switch (comp_status) { case ENA_ADMIN_SUCCESS: return 0; @@ -537,7 +550,7 @@ static int ena_com_comp_status_to_errno(u8 comp_status) return -EINVAL; } - return 0; + return -EINVAL; } static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, @@ -615,6 +628,14 @@ static int ena_com_set_llq(struct ena_com_dev *ena_dev) cmd.u.llq.desc_num_before_header_enabled = llq_info->descs_num_before_header; cmd.u.llq.descriptors_stride_ctrl_enabled = llq_info->desc_stride_ctrl; + if (llq_info->disable_meta_caching) + cmd.u.llq.accel_mode.u.set.enabled_flags |= + BIT(ENA_ADMIN_DISABLE_META_CACHING); + + if (llq_info->max_entries_in_tx_burst) + cmd.u.llq.accel_mode.u.set.enabled_flags |= + BIT(ENA_ADMIN_LIMIT_TX_BURST); + ret = ena_com_execute_admin_command(admin_queue, (struct ena_admin_aq_entry *)&cmd, sizeof(cmd), @@ -731,15 +752,21 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, llq_default_cfg->llq_num_decs_before_header, supported_feat, llq_info->descs_num_before_header); } + /* Check for accelerated queue supported */ + llq_info->disable_meta_caching = + llq_features->accel_mode.u.get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING); - llq_info->max_entries_in_tx_burst = - (u16)(llq_features->max_tx_burst_size / llq_default_cfg->llq_ring_entry_size_value); + if 
(llq_features->accel_mode.u.get.supported_flags & BIT(ENA_ADMIN_LIMIT_TX_BURST)) + llq_info->max_entries_in_tx_burst = + llq_features->accel_mode.u.get.max_tx_burst_size / + llq_default_cfg->llq_ring_entry_size_value; rc = ena_com_set_llq(ena_dev); if (rc) pr_err("Cannot set LLQ configuration: %d\n", rc); - return 0; + return rc; } static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *comp_ctx, @@ -763,16 +790,26 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com admin_queue->stats.no_completion++; spin_unlock_irqrestore(&admin_queue->q_lock, flags); - if (comp_ctx->status == ENA_CMD_COMPLETED) - pr_err("The ena device have completion but the driver didn't receive any MSI-X interrupt (cmd %d)\n", - comp_ctx->cmd_opcode); - else - pr_err("The ena device doesn't send any completion for the admin cmd %d status %d\n", + if (comp_ctx->status == ENA_CMD_COMPLETED) { + pr_err("The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", + comp_ctx->cmd_opcode, + admin_queue->auto_polling ? "ON" : "OFF"); + /* Check if fallback to polling is enabled */ + if (admin_queue->auto_polling) + admin_queue->polling = true; + } else { + pr_err("The ena device didn't send a completion for the admin cmd %d status %d\n", comp_ctx->cmd_opcode, comp_ctx->status); - - admin_queue->running_state = false; - ret = -ETIME; - goto err; + } + /* Check if shifted to polling mode. + * This will happen if there is a completion without an interrupt + * and autopolling mode is enabled. Continuing normal execution in such case + */ + if (!admin_queue->polling) { + admin_queue->running_state = false; + ret = -ETIME; + goto err; + } } ret = ena_com_comp_status_to_errno(comp_ctx->comp_status); @@ -830,7 +867,7 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) } if (read_resp->reg_off != offset) { - pr_err("Read failure: wrong offset provided"); + pr_err("Read failure: wrong offset provided\n"); ret = ENA_MMIO_READ_TIMEOUT; } else { ret = read_resp->reg_val; @@ -1032,12 +1069,30 @@ static int ena_com_get_feature(struct ena_com_dev *ena_dev, feature_ver); } +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev) +{ + return ena_dev->rss.hash_func; +} + +static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + (ena_dev->rss).hash_key; + + memcpy(hash_key->key, rss_hash_key, sizeof(rss_hash_key)); + /* The key is stored in the device in u32 array + * as well as the API requires the key to be passed in this + * format. 
Thus the size of our array should be divided by 4 + */ + hash_key->keys_num = sizeof(rss_hash_key) / sizeof(u32); +} + static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) { struct ena_rss *rss = &ena_dev->rss; rss->hash_key = - dma_alloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), &rss->hash_key_dma_addr, GFP_KERNEL); if (unlikely(!rss->hash_key)) @@ -1061,7 +1116,7 @@ static int ena_com_hash_ctrl_init(struct ena_com_dev *ena_dev) struct ena_rss *rss = &ena_dev->rss; rss->hash_ctrl = - dma_alloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_ctrl), &rss->hash_ctrl_dma_addr, GFP_KERNEL); if (unlikely(!rss->hash_ctrl)) @@ -1105,7 +1160,7 @@ static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, sizeof(struct ena_admin_rss_ind_table_entry); rss->rss_ind_tbl = - dma_alloc_coherent(ena_dev->dmadev, tbl_size, + dma_zalloc_coherent(ena_dev->dmadev, tbl_size, &rss->rss_ind_tbl_dma_addr, GFP_KERNEL); if (unlikely(!rss->rss_ind_tbl)) goto mem_err1; @@ -1245,64 +1300,29 @@ static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) return 0; } -static int ena_com_ind_tbl_convert_from_device(struct ena_com_dev *ena_dev) -{ - u16 dev_idx_to_host_tbl[ENA_TOTAL_NUM_QUEUES] = { (u16)-1 }; - struct ena_rss *rss = &ena_dev->rss; - u8 idx; - u16 i; - - for (i = 0; i < ENA_TOTAL_NUM_QUEUES; i++) - dev_idx_to_host_tbl[ena_dev->io_sq_queues[i].idx] = i; - - for (i = 0; i < 1 << rss->tbl_log_size; i++) { - if (rss->rss_ind_tbl[i].cq_idx > ENA_TOTAL_NUM_QUEUES) - return -EINVAL; - idx = (u8)rss->rss_ind_tbl[i].cq_idx; - - if (dev_idx_to_host_tbl[idx] > ENA_TOTAL_NUM_QUEUES) - return -EINVAL; - - rss->host_rss_ind_tbl[i] = dev_idx_to_host_tbl[idx]; - } - - return 0; -} - -static int ena_com_init_interrupt_moderation_table(struct ena_com_dev *ena_dev) -{ - size_t size; - - size = sizeof(struct ena_intr_moder_entry) * ENA_INTR_MAX_NUM_OF_LEVELS; - - ena_dev->intr_moder_tbl = - devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); - if (!ena_dev->intr_moder_tbl) - return -ENOMEM; - - ena_com_config_default_interrupt_moderation_table(ena_dev); - - return 0; -} - static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, u16 intr_delay_resolution) { - struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; - unsigned int i; + u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution; - if (!intr_delay_resolution) { + if (unlikely(!intr_delay_resolution)) { pr_err("Illegal intr_delay_resolution provided. 
Going to use default 1 usec resolution\n"); - intr_delay_resolution = 1; + intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; } - ena_dev->intr_delay_resolution = intr_delay_resolution; /* update Rx */ - for (i = 0; i < ENA_INTR_MAX_NUM_OF_LEVELS; i++) - intr_moder_tbl[i].intr_moder_interval /= intr_delay_resolution; + ena_dev->intr_moder_rx_interval = + ena_dev->intr_moder_rx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; /* update Tx */ - ena_dev->intr_moder_tx_interval /= intr_delay_resolution; + ena_dev->intr_moder_tx_interval = + ena_dev->intr_moder_tx_interval * + prev_intr_delay_resolution / + intr_delay_resolution; + + ena_dev->intr_delay_resolution = intr_delay_resolution; } /*****************************************************************************/ @@ -1654,13 +1674,24 @@ void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling) ena_dev->admin_queue.polling = polling; } +bool ena_com_get_admin_polling_mode(struct ena_com_dev * ena_dev) +{ + return ena_dev->admin_queue.polling; +} + +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling) +{ + ena_dev->admin_queue.auto_polling = polling; +} + int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) { struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; spin_lock_init(&mmio_read->lock); mmio_read->read_resp = - dma_alloc_coherent(ena_dev->dmadev, + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*mmio_read->read_resp), &mmio_read->read_resp_dma_addr, GFP_KERNEL); if (unlikely(!mmio_read->read_resp)) @@ -1882,61 +1913,6 @@ int ena_com_get_link_params(struct ena_com_dev *ena_dev, return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0); } -int ena_com_extra_properties_strings_init(struct ena_com_dev *ena_dev) -{ - struct ena_admin_get_feat_resp resp; - struct ena_extra_properties_strings *extra_properties_strings = - &ena_dev->extra_properties_strings; - u32 rc; - extra_properties_strings->size = ENA_ADMIN_EXTRA_PROPERTIES_COUNT * - ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN; - - extra_properties_strings->virt_addr = - dma_alloc_coherent(ena_dev->dmadev, - extra_properties_strings->size, - &extra_properties_strings->dma_addr, - GFP_KERNEL); - if (unlikely(!extra_properties_strings->virt_addr)) { - pr_err("Failed to allocate extra properties strings\n"); - return 0; - } - - rc = ena_com_get_feature_ex(ena_dev, &resp, - ENA_ADMIN_EXTRA_PROPERTIES_STRINGS, - extra_properties_strings->dma_addr, - extra_properties_strings->size, 0); - if (rc) { - pr_debug("Failed to get extra properties strings\n"); - goto err; - } - - return resp.u.extra_properties_strings.count; -err: - ena_com_delete_extra_properties_strings(ena_dev); - return 0; -} - -void ena_com_delete_extra_properties_strings(struct ena_com_dev *ena_dev) -{ - struct ena_extra_properties_strings *extra_properties_strings = - &ena_dev->extra_properties_strings; - - if (extra_properties_strings->virt_addr) { - dma_free_coherent(ena_dev->dmadev, - extra_properties_strings->size, - extra_properties_strings->virt_addr, - extra_properties_strings->dma_addr); - extra_properties_strings->virt_addr = NULL; - } -} - -int ena_com_get_extra_properties_flags(struct ena_com_dev *ena_dev, - struct ena_admin_get_feat_resp *resp) -{ - return ena_com_get_feature(ena_dev, resp, - ENA_ADMIN_EXTRA_PROPERTIES_FLAGS, 0); -} - int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, struct ena_com_dev_get_features_ctx *get_feat_ctx) { @@ -2059,7 +2035,7 @@ void ena_com_aenq_intr_handler(struct 
ena_com_dev *dev, void *data) struct ena_admin_aenq_entry *aenq_e; struct ena_admin_aenq_common_desc *aenq_common; struct ena_com_aenq *aenq = &dev->aenq; - unsigned long long timestamp; + u64 timestamp; ena_aenq_handler handler_cb; u16 masked_head, processed = 0; u8 phase; @@ -2077,9 +2053,8 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) */ dma_rmb(); - timestamp = - (unsigned long long)aenq_common->timestamp_low | - ((unsigned long long)aenq_common->timestamp_high << 32); + timestamp = (u64)aenq_common->timestamp_low | + ((u64)aenq_common->timestamp_high << 32); pr_debug("AENQ! Group[%x] Syndrom[%x] timestamp: [%llus]\n", aenq_common->group, aenq_common->syndrom, timestamp); @@ -2110,7 +2085,9 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) /* write the aenq doorbell after all AENQ descriptors were read */ mb(); writel_relaxed((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); +#ifndef MMIOWB_NOT_DEFINED mmiowb(); +#endif } int ena_com_dev_reset(struct ena_com_dev *ena_dev, @@ -2217,7 +2194,7 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, return ret; } -int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) { struct ena_com_admin_queue *admin_queue; struct ena_admin_set_feat_cmd cmd; @@ -2289,7 +2266,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) if (unlikely(ret)) return ret; - if (get_resp.u.flow_hash_func.supported_func & (1 << rss->hash_func)) { + if (!(get_resp.u.flow_hash_func.supported_func & BIT(rss->hash_func))) { pr_err("Func hash %d isn't supported by device, abort\n", rss->hash_func); return -EOPNOTSUPP; @@ -2332,12 +2309,14 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, enum ena_admin_hash_functions func, const u8 *key, u16 key_len, u32 init_val) { - struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_flow_hash_control *hash_key; struct ena_admin_get_feat_resp get_resp; - struct ena_admin_feature_rss_flow_hash_control *hash_key = - rss->hash_key; + enum ena_admin_hash_functions old_func; + struct ena_rss *rss = &ena_dev->rss; int rc; + hash_key = rss->hash_key; + /* Make sure size is a mult of DWs */ if (unlikely(key_len & 0x3)) return -EINVAL; @@ -2349,22 +2328,23 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, if (unlikely(rc)) return rc; - if (!((1 << func) & get_resp.u.flow_hash_func.supported_func)) { + if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { pr_err("Flow hash function %d isn't supported\n", func); return -EOPNOTSUPP; } switch (func) { case ENA_ADMIN_TOEPLITZ: - if (key_len > sizeof(hash_key->key)) { - pr_err("key len (%hu) is bigger than the max supported (%zu)\n", - key_len, sizeof(hash_key->key)); - return -EINVAL; + if (key) { + if (key_len != sizeof(hash_key->key)) { + pr_err("key len (%hu) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; + } + memcpy(hash_key->key, key, key_len); + rss->hash_init_val = init_val; + hash_key->keys_num = key_len >> 2; } - - memcpy(hash_key->key, key, key_len); - rss->hash_init_val = init_val; - hash_key->keys_num = key_len >> 2; break; case ENA_ADMIN_CRC32: rss->hash_init_val = init_val; @@ -2374,35 +2354,47 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, return -EINVAL; } + old_func = rss->hash_func; + rss->hash_func = func; rc = ena_com_set_hash_function(ena_dev); /* Restore the old function */ if (unlikely(rc)) - 
ena_com_get_hash_function(ena_dev, NULL, NULL); + rss->hash_func = old_func; return rc; } int ena_com_get_hash_function(struct ena_com_dev *ena_dev, - enum ena_admin_hash_functions *func, - u8 *key) + enum ena_admin_hash_functions *func) { struct ena_rss *rss = &ena_dev->rss; struct ena_admin_get_feat_resp get_resp; - struct ena_admin_feature_rss_flow_hash_control *hash_key = - rss->hash_key; int rc; - rc = ena_com_get_feature_ex(ena_dev, &get_resp, - ENA_ADMIN_RSS_HASH_FUNCTION, - rss->hash_key_dma_addr, - sizeof(*rss->hash_key), 0); - if (unlikely(rc)) - return rc; + if (func) { + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; + + /* ffs returns 1 in case the lsb is set */ + rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); + if (rss->hash_func) + rss->hash_func--; - rss->hash_func = get_resp.u.flow_hash_func.selected_func; - if (func) *func = rss->hash_func; + } + + return 0; +} + +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + ena_dev->rss.hash_key; if (key) memcpy(key, hash_key->key, (size_t)(hash_key->keys_num) << 2); @@ -2664,10 +2656,6 @@ int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) if (!ind_tbl) return 0; - rc = ena_com_ind_tbl_convert_from_device(ena_dev); - if (unlikely(rc)) - return rc; - for (i = 0; i < (1 << rss->tbl_log_size); i++) ind_tbl[i] = rss->host_rss_ind_tbl[i]; @@ -2688,6 +2676,8 @@ int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) if (unlikely(rc)) goto err_hash_key; + ena_com_hash_key_fill_default_key(ena_dev); + rc = ena_com_hash_ctrl_init(ena_dev); if (unlikely(rc)) goto err_hash_ctrl; @@ -2717,7 +2707,7 @@ int ena_com_allocate_host_info(struct ena_com_dev *ena_dev) struct ena_host_attribute *host_attr = &ena_dev->host_attr; host_attr->host_info = - dma_alloc_coherent(ena_dev->dmadev, SZ_4K, + dma_zalloc_coherent(ena_dev->dmadev, SZ_4K, &host_attr->host_info_dma_addr, GFP_KERNEL); if (unlikely(!host_attr->host_info)) return -ENOMEM; @@ -2735,7 +2725,7 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, struct ena_host_attribute *host_attr = &ena_dev->host_attr; host_attr->debug_area_virt_addr = - dma_alloc_coherent(ena_dev->dmadev, debug_area_size, + dma_zalloc_coherent(ena_dev->dmadev, debug_area_size, &host_attr->debug_area_dma_addr, GFP_KERNEL); if (unlikely(!host_attr->debug_area_virt_addr)) { host_attr->debug_area_size = 0; @@ -2826,42 +2816,35 @@ bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev) ENA_ADMIN_INTERRUPT_MODERATION); } -int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, - u32 tx_coalesce_usecs) +static int ena_com_update_nonadaptive_moderation_interval(u32 coalesce_usecs, + u32 intr_delay_resolution, + u32 *intr_moder_interval) { - if (!ena_dev->intr_delay_resolution) { + if (!intr_delay_resolution) { pr_err("Illegal interrupt delay granularity value\n"); return -EFAULT; } - ena_dev->intr_moder_tx_interval = tx_coalesce_usecs / - ena_dev->intr_delay_resolution; + *intr_moder_interval = coalesce_usecs / intr_delay_resolution; return 0; } -int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, - u32 rx_coalesce_usecs) -{ - if (!ena_dev->intr_delay_resolution) { - pr_err("Illegal interrupt delay granularity value\n"); - return -EFAULT; - } - - /* We use LOWEST entry of moderation table for 
storing - * nonadaptive interrupt coalescing values - */ - ena_dev->intr_moder_tbl[ENA_INTR_MODER_LOWEST].intr_moder_interval = - rx_coalesce_usecs / ena_dev->intr_delay_resolution; - return 0; +int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, + u32 tx_coalesce_usecs) +{ + return ena_com_update_nonadaptive_moderation_interval(tx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_tx_interval); } -void ena_com_destroy_interrupt_moderation(struct ena_com_dev *ena_dev) +int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, + u32 rx_coalesce_usecs) { - if (ena_dev->intr_moder_tbl) - devm_kfree(ena_dev->dmadev, ena_dev->intr_moder_tbl); - ena_dev->intr_moder_tbl = NULL; + return ena_com_update_nonadaptive_moderation_interval(rx_coalesce_usecs, + ena_dev->intr_delay_resolution, + &ena_dev->intr_moder_rx_interval); } int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) @@ -2888,62 +2871,14 @@ int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) return rc; } - rc = ena_com_init_interrupt_moderation_table(ena_dev); - if (rc) - goto err; - /* if moderation is supported by device we set adaptive moderation */ delay_resolution = get_resp.u.intr_moderation.intr_delay_resolution; ena_com_update_intr_delay_resolution(ena_dev, delay_resolution); - ena_com_enable_adaptive_moderation(ena_dev); - - return 0; -err: - ena_com_destroy_interrupt_moderation(ena_dev); - return rc; -} -void ena_com_config_default_interrupt_moderation_table(struct ena_com_dev *ena_dev) -{ - struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; - - if (!intr_moder_tbl) - return; + /* Disable adaptive moderation by default - can be enabled later */ + ena_com_disable_adaptive_moderation(ena_dev); - intr_moder_tbl[ENA_INTR_MODER_LOWEST].intr_moder_interval = - ENA_INTR_LOWEST_USECS; - intr_moder_tbl[ENA_INTR_MODER_LOWEST].pkts_per_interval = - ENA_INTR_LOWEST_PKTS; - intr_moder_tbl[ENA_INTR_MODER_LOWEST].bytes_per_interval = - ENA_INTR_LOWEST_BYTES; - - intr_moder_tbl[ENA_INTR_MODER_LOW].intr_moder_interval = - ENA_INTR_LOW_USECS; - intr_moder_tbl[ENA_INTR_MODER_LOW].pkts_per_interval = - ENA_INTR_LOW_PKTS; - intr_moder_tbl[ENA_INTR_MODER_LOW].bytes_per_interval = - ENA_INTR_LOW_BYTES; - - intr_moder_tbl[ENA_INTR_MODER_MID].intr_moder_interval = - ENA_INTR_MID_USECS; - intr_moder_tbl[ENA_INTR_MODER_MID].pkts_per_interval = - ENA_INTR_MID_PKTS; - intr_moder_tbl[ENA_INTR_MODER_MID].bytes_per_interval = - ENA_INTR_MID_BYTES; - - intr_moder_tbl[ENA_INTR_MODER_HIGH].intr_moder_interval = - ENA_INTR_HIGH_USECS; - intr_moder_tbl[ENA_INTR_MODER_HIGH].pkts_per_interval = - ENA_INTR_HIGH_PKTS; - intr_moder_tbl[ENA_INTR_MODER_HIGH].bytes_per_interval = - ENA_INTR_HIGH_BYTES; - - intr_moder_tbl[ENA_INTR_MODER_HIGHEST].intr_moder_interval = - ENA_INTR_HIGHEST_USECS; - intr_moder_tbl[ENA_INTR_MODER_HIGHEST].pkts_per_interval = - ENA_INTR_HIGHEST_PKTS; - intr_moder_tbl[ENA_INTR_MODER_HIGHEST].bytes_per_interval = - ENA_INTR_HIGHEST_BYTES; + return 0; } unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev) @@ -2953,49 +2888,7 @@ unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev * unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev) { - struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; - - if (intr_moder_tbl) - return intr_moder_tbl[ENA_INTR_MODER_LOWEST].intr_moder_interval; - - return 0; -} - -void 
ena_com_init_intr_moderation_entry(struct ena_com_dev *ena_dev, - enum ena_intr_moder_level level, - struct ena_intr_moder_entry *entry) -{ - struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; - - if (level >= ENA_INTR_MAX_NUM_OF_LEVELS) - return; - - intr_moder_tbl[level].intr_moder_interval = entry->intr_moder_interval; - if (ena_dev->intr_delay_resolution) - intr_moder_tbl[level].intr_moder_interval /= - ena_dev->intr_delay_resolution; - intr_moder_tbl[level].pkts_per_interval = entry->pkts_per_interval; - - /* use hardcoded value until ethtool supports bytecount parameter */ - if (entry->bytes_per_interval != ENA_INTR_BYTE_COUNT_NOT_SUPPORTED) - intr_moder_tbl[level].bytes_per_interval = entry->bytes_per_interval; -} - -void ena_com_get_intr_moderation_entry(struct ena_com_dev *ena_dev, - enum ena_intr_moder_level level, - struct ena_intr_moder_entry *entry) -{ - struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; - - if (level >= ENA_INTR_MAX_NUM_OF_LEVELS) - return; - - entry->intr_moder_interval = intr_moder_tbl[level].intr_moder_interval; - if (ena_dev->intr_delay_resolution) - entry->intr_moder_interval *= ena_dev->intr_delay_resolution; - entry->pkts_per_interval = - intr_moder_tbl[level].pkts_per_interval; - entry->bytes_per_interval = intr_moder_tbl[level].bytes_per_interval; + return ena_dev->intr_moder_rx_interval; } int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, @@ -3003,7 +2896,7 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, struct ena_llq_configurations *llq_default_cfg) { int rc; - int size; + struct ena_com_llq_info *llq_info = &(ena_dev->llq_info);; if (!llq_features->max_llq_num) { ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; @@ -3014,12 +2907,10 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, if (rc) return rc; - /* Validate the descriptor is not too big */ - size = ena_dev->tx_max_header_size; - size += ena_dev->llq_info.descs_num_before_header * - sizeof(struct ena_eth_io_tx_desc); + ena_dev->tx_max_header_size = llq_info->desc_list_entry_size - + (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); - if (unlikely(ena_dev->llq_info.desc_list_entry_size < size)) { + if (ena_dev->tx_max_header_size == 0) { pr_err("the size of the LLQ entry is smaller than needed\n"); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 27a85750309f7..d753c824a86db 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -72,46 +72,16 @@ /*****************************************************************************/ /* ENA adaptive interrupt moderation settings */ -#define ENA_INTR_LOWEST_USECS (0) -#define ENA_INTR_LOWEST_PKTS (3) -#define ENA_INTR_LOWEST_BYTES (2 * 1524) +#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 0 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0 +#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 -#define ENA_INTR_LOW_USECS (32) -#define ENA_INTR_LOW_PKTS (12) -#define ENA_INTR_LOW_BYTES (16 * 1024) - -#define ENA_INTR_MID_USECS (80) -#define ENA_INTR_MID_PKTS (48) -#define ENA_INTR_MID_BYTES (64 * 1024) - -#define ENA_INTR_HIGH_USECS (128) -#define ENA_INTR_HIGH_PKTS (96) -#define ENA_INTR_HIGH_BYTES (128 * 1024) - -#define ENA_INTR_HIGHEST_USECS (192) -#define ENA_INTR_HIGHEST_PKTS (128) -#define ENA_INTR_HIGHEST_BYTES (192 * 1024) - -#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 196 -#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 4 -#define ENA_INTR_DELAY_OLD_VALUE_WEIGHT 6 -#define 
ENA_INTR_DELAY_NEW_VALUE_WEIGHT 4 -#define ENA_INTR_MODER_LEVEL_STRIDE 1 -#define ENA_INTR_BYTE_COUNT_NOT_SUPPORTED 0xFFFFFF +#define ENA_HASH_KEY_SIZE 40 #define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF #define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 -enum ena_intr_moder_level { - ENA_INTR_MODER_LOWEST = 0, - ENA_INTR_MODER_LOW, - ENA_INTR_MODER_MID, - ENA_INTR_MODER_HIGH, - ENA_INTR_MODER_HIGHEST, - ENA_INTR_MAX_NUM_OF_LEVELS, -}; - struct ena_llq_configurations { enum ena_admin_llq_header_location llq_header_location; enum ena_admin_llq_ring_entry_size llq_ring_entry_size; @@ -120,12 +90,6 @@ struct ena_llq_configurations { u16 llq_ring_entry_size_value; }; -struct ena_intr_moder_entry { - unsigned int intr_moder_interval; - unsigned int pkts_per_interval; - unsigned int bytes_per_interval; -}; - enum queue_direction { ENA_COM_IO_QUEUE_DIRECTION_TX, ENA_COM_IO_QUEUE_DIRECTION_RX @@ -162,6 +126,7 @@ struct ena_com_llq_info { u16 descs_num_before_header; u16 descs_per_entry; u16 max_entries_in_tx_burst; + bool disable_meta_caching; }; struct ena_com_io_cq { @@ -226,6 +191,8 @@ struct ena_com_io_sq { enum queue_direction direction; enum ena_admin_placement_policy_type mem_queue_type; + bool disable_meta_caching; + u32 msix_vector; struct ena_com_tx_meta cached_tx_meta; struct ena_com_llq_info llq_info; @@ -288,6 +255,9 @@ struct ena_com_admin_queue { /* Indicate if the admin queue should poll for completion */ bool polling; + /* Define if fallback to polling mode should occur */ + bool auto_polling; + u16 curr_cmd_id; /* Indicate that the ena was initialized and can @@ -352,12 +322,6 @@ struct ena_host_attribute { dma_addr_t host_info_dma_addr; }; -struct ena_extra_properties_strings { - u8 *virt_addr; - dma_addr_t dma_addr; - u32 size; -}; - /* Each ena_dev is a PCI function. */ struct ena_com_dev { struct ena_com_admin_queue admin_queue; @@ -383,11 +347,16 @@ struct ena_com_dev { struct ena_host_attribute host_attr; bool adaptive_coalescing; u16 intr_delay_resolution; + + /* interrupt moderation intervals are in usec divided by + * intr_delay_resolution, which is supplied by the device. + */ u32 intr_moder_tx_interval; + u32 intr_moder_rx_interval; + struct ena_intr_moder_entry *intr_moder_tbl; struct ena_com_llq_info llq_info; - struct ena_extra_properties_strings extra_properties_strings; }; struct ena_com_dev_get_features_ctx { @@ -433,7 +402,7 @@ struct ena_aenq_handlers { */ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); -/* ena_com_set_mmio_read_mode - Enable/disable the mmio reg read mechanism +/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism * @ena_dev: ENA communication layer struct * @readless_supported: readless mode (enable/disable) */ @@ -541,7 +510,7 @@ bool ena_com_get_admin_running_state(struct ena_com_dev *ena_dev); */ void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling); -/* ena_com_set_admin_polling_mode - Get the admin completion queue polling mode +/* ena_com_get_admin_polling_mode - Get the admin completion queue polling mode * @ena_dev: ENA communication layer struct * * Get the admin completion mode. 
@@ -551,12 +520,23 @@ void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling); * * @return state */ -bool ena_com_get_ena_admin_polling_mode(struct ena_com_dev *ena_dev); +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev); + +/* ena_com_set_admin_auto_polling_mode - Enable autoswitch to polling mode + * @ena_dev: ENA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the autopolling mode. + * If autopolling is on: + * In case of missing interrupt when data is available switch to polling. + */ +void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, + bool polling); /* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler * @ena_dev: ENA communication layer struct * - * This method go over the admin completion queue and wake up all the pending + * This method goes over the admin completion queue and wakes up all the pending * threads that wait on the commands wait event. * * @note: Should be called after MSI-X interrupt. @@ -566,7 +546,7 @@ void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); /* ena_com_aenq_intr_handler - AENQ interrupt handler * @ena_dev: ENA communication layer struct * - * This method go over the async event notification queue and call the proper + * This method goes over the async event notification queue and calls the proper * aenq handler. */ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data); @@ -583,14 +563,14 @@ void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev); /* ena_com_wait_for_abort_completion - Wait for admin commands abort. * @ena_dev: ENA communication layer struct * - * This method wait until all the outstanding admin commands will be completed. + * This method waits until all the outstanding admin commands are completed. */ void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev); /* ena_com_validate_version - Validate the device parameters * @ena_dev: ENA communication layer struct * - * This method validate the device parameters are the same as the saved + * This method verifies the device parameters are the same as the saved * parameters in ena_dev. * This method is useful after device reset, to validate the device mac address * and the device offloads are the same as before the reset. @@ -611,31 +591,6 @@ int ena_com_validate_version(struct ena_com_dev *ena_dev); int ena_com_get_link_params(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp *resp); -/* ena_com_extra_properties_strings_init - Initialize the extra properties strings buffer. - * @ena_dev: ENA communication layer struct - * - * Initialize the extra properties strings buffer. - */ -int ena_com_extra_properties_strings_init(struct ena_com_dev *ena_dev); - -/* ena_com_delete_extra_properties_strings - Free the extra properties strings buffer. - * @ena_dev: ENA communication layer struct - * - * Free the allocated extra properties strings buffer. - */ -void ena_com_delete_extra_properties_strings(struct ena_com_dev *ena_dev); - -/* ena_com_get_extra_properties_flags - Retrieve extra properties flags. - * @ena_dev: ENA communication layer struct - * @resp: Extra properties flags. - * - * Retrieve the extra properties flags. - * - * @return - 0 on Success negative value otherwise. - */ -int ena_com_get_extra_properties_flags(struct ena_com_dev *ena_dev, - struct ena_admin_get_feat_resp *resp); - /* ena_com_get_dma_width - Retrieve physical dma address width the device * supports. 
* @ena_dev: ENA communication layer struct @@ -680,7 +635,7 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, * * @return: 0 on Success and negative value otherwise. */ -int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu); +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu); /* ena_com_get_offload_settings - Retrieve the device offloads capabilities * @ena_dev: ENA communication layer struct @@ -710,6 +665,14 @@ int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 log_size); */ void ena_com_rss_destroy(struct ena_com_dev *ena_dev); +/* ena_com_get_current_hash_function - Get RSS hash function + * @ena_dev: ENA communication layer struct + * + * Return the current hash function. + * @return: 0 or one of the ena_admin_hash_functions values. + */ +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev); + /* ena_com_fill_hash_function - Fill RSS hash function * @ena_dev: ENA communication layer struct * @func: The hash function (Toeplitz or crc) @@ -741,13 +704,11 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, */ int ena_com_set_hash_function(struct ena_com_dev *ena_dev); -/* ena_com_get_hash_function - Retrieve the hash function and the hash key - * from the device. +/* ena_com_get_hash_function - Retrieve the hash function from the device. * @ena_dev: ENA communication layer struct * @func: hash function - * @key: hash key * - * Retrieve the hash function and the hash key from the device. + * Retrieve the hash function from the device. * * @note: If the caller called ena_com_fill_hash_function but didn't flash * it to the device, the new configuration will be lost. @@ -755,9 +716,20 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev); * @return: 0 on Success and negative value otherwise. */ int ena_com_get_hash_function(struct ena_com_dev *ena_dev, - enum ena_admin_hash_functions *func, - u8 *key); + enum ena_admin_hash_functions *func); +/* ena_com_get_hash_key - Retrieve the hash key + * @ena_dev: ENA communication layer struct + * @key: hash key + * + * Retrieve the hash key. + * + * @note: If the caller called ena_com_fill_hash_key but didn't flash + * it to the device, the new configuration will be lost. + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key); /* ena_com_fill_hash_ctrl - Fill RSS hash control * @ena_dev: ENA communication layer struct. * @proto: The protocol to configure. @@ -792,7 +764,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev); * * Retrieve the hash control from the device. * - * @note, If the caller called ena_com_fill_hash_ctrl but didn't flash + * @note: If the caller called ena_com_fill_hash_ctrl but didn't flash * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -844,7 +816,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev); * * Retrieve the RSS indirection table from the device. * - * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flash + * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -870,14 +842,14 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, /* ena_com_delete_debug_area - Free the debug area resources. * @ena_dev: ENA communication layer struct * - * Free the allocate debug area. + * Free the allocated debug area. 
*/ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); /* ena_com_delete_host_info - Free the host info resources. * @ena_dev: ENA communication layer struct * - * Free the allocate host info. + * Free the allocated host info. */ void ena_com_delete_host_info(struct ena_com_dev *ena_dev); @@ -918,9 +890,9 @@ int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, * @cmd_completion: command completion return value. * @cmd_comp_size: command completion size. - * Submit an admin command and then wait until the device will return a + * Submit an admin command and then wait until the device returns a * completion. - * The completion will be copyed into cmd_comp. + * The completion will be copied into cmd_comp. * * @return - 0 on success, negative value on failure. */ @@ -937,11 +909,6 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, */ int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev); -/* ena_com_destroy_interrupt_moderation - Destroy interrupt moderation resources - * @ena_dev: ENA communication layer struct - */ -void ena_com_destroy_interrupt_moderation(struct ena_com_dev *ena_dev); - /* ena_com_interrupt_moderation_supported - Return if interrupt moderation * capability is supported by the device. * @@ -949,12 +916,6 @@ void ena_com_destroy_interrupt_moderation(struct ena_com_dev *ena_dev); */ bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev); -/* ena_com_config_default_interrupt_moderation_table - Restore the interrupt - * moderation table back to the default parameters. - * @ena_dev: ENA communication layer struct - */ -void ena_com_config_default_interrupt_moderation_table(struct ena_com_dev *ena_dev); - /* ena_com_update_nonadaptive_moderation_interval_tx - Update the * non-adaptive interval in Tx direction. * @ena_dev: ENA communication layer struct @@ -991,29 +952,6 @@ unsigned int ena_com_get_nonadaptive_moderation_interval_tx(struct ena_com_dev * */ unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev); -/* ena_com_init_intr_moderation_entry - Update a single entry in the interrupt - * moderation table. - * @ena_dev: ENA communication layer struct - * @level: Interrupt moderation table level - * @entry: Entry value - * - * Update a single entry in the interrupt moderation table. - */ -void ena_com_init_intr_moderation_entry(struct ena_com_dev *ena_dev, - enum ena_intr_moder_level level, - struct ena_intr_moder_entry *entry); - -/* ena_com_get_intr_moderation_entry - Init ena_intr_moder_entry. - * @ena_dev: ENA communication layer struct - * @level: Interrupt moderation table level - * @entry: Entry to fill. - * - * Initialize the entry according to the adaptive interrupt moderation table. - */ -void ena_com_get_intr_moderation_entry(struct ena_com_dev *ena_dev, - enum ena_intr_moder_level level, - struct ena_intr_moder_entry *entry); - /* ena_com_config_dev_mode - Configure the placement policy of the device. * @ena_dev: ENA communication layer struct * @llq_features: LLQ feature descriptor, retrieve via @@ -1039,80 +977,11 @@ static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_d ena_dev->adaptive_coalescing = false; } -/* ena_com_calculate_interrupt_delay - Calculate new interrupt delay - * @ena_dev: ENA communication layer struct - * @pkts: Number of packets since the last update - * @bytes: Number of bytes received since the last update. 
- * @smoothed_interval: Returned interval - * @moder_tbl_idx: Current table level as input update new level as return - * value. - */ -static inline void ena_com_calculate_interrupt_delay(struct ena_com_dev *ena_dev, - unsigned int pkts, - unsigned int bytes, - unsigned int *smoothed_interval, - unsigned int *moder_tbl_idx) -{ - enum ena_intr_moder_level curr_moder_idx, new_moder_idx; - struct ena_intr_moder_entry *curr_moder_entry; - struct ena_intr_moder_entry *pred_moder_entry; - struct ena_intr_moder_entry *new_moder_entry; - struct ena_intr_moder_entry *intr_moder_tbl = ena_dev->intr_moder_tbl; - unsigned int interval; - - /* We apply adaptive moderation on Rx path only. - * Tx uses static interrupt moderation. - */ - if (!pkts || !bytes) - /* Tx interrupt, or spurious interrupt, - * in both cases we just use same delay values - */ - return; - - curr_moder_idx = (enum ena_intr_moder_level)(*moder_tbl_idx); - if (unlikely(curr_moder_idx >= ENA_INTR_MAX_NUM_OF_LEVELS)) { - pr_err("Wrong moderation index %u\n", curr_moder_idx); - return; - } - - curr_moder_entry = &intr_moder_tbl[curr_moder_idx]; - new_moder_idx = curr_moder_idx; - - if (curr_moder_idx == ENA_INTR_MODER_LOWEST) { - if ((pkts > curr_moder_entry->pkts_per_interval) || - (bytes > curr_moder_entry->bytes_per_interval)) - new_moder_idx = - (enum ena_intr_moder_level)(curr_moder_idx + ENA_INTR_MODER_LEVEL_STRIDE); - } else { - pred_moder_entry = &intr_moder_tbl[curr_moder_idx - ENA_INTR_MODER_LEVEL_STRIDE]; - - if ((pkts <= pred_moder_entry->pkts_per_interval) || - (bytes <= pred_moder_entry->bytes_per_interval)) - new_moder_idx = - (enum ena_intr_moder_level)(curr_moder_idx - ENA_INTR_MODER_LEVEL_STRIDE); - else if ((pkts > curr_moder_entry->pkts_per_interval) || - (bytes > curr_moder_entry->bytes_per_interval)) { - if (curr_moder_idx != ENA_INTR_MODER_HIGHEST) - new_moder_idx = - (enum ena_intr_moder_level)(curr_moder_idx + ENA_INTR_MODER_LEVEL_STRIDE); - } - } - new_moder_entry = &intr_moder_tbl[new_moder_idx]; - - interval = new_moder_entry->intr_moder_interval; - *smoothed_interval = ( - (interval * ENA_INTR_DELAY_NEW_VALUE_WEIGHT + - ENA_INTR_DELAY_OLD_VALUE_WEIGHT * (*smoothed_interval)) + 5) / - 10; - - *moder_tbl_idx = new_moder_idx; -} - /* ena_com_update_intr_reg - Prepare interrupt register * @intr_reg: interrupt register to update. * @rx_delay_interval: Rx interval in usecs * @tx_delay_interval: Tx interval in usecs - * @unmask: unask enable/disable + * @unmask: unmask enable/disable * * Prepare interrupt update register with the supplied parameters. */ diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h index 450824ae7d895..77ab0c1f8c73f 100755 --- a/drivers/amazon/net/ena/ena_common_defs.h +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. 
* @@ -46,4 +45,4 @@ struct ena_common_mem_addr { u16 reserved16; }; -#endif /*_ENA_COMMON_H_ */ +#endif /* _ENA_COMMON_H_ */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index a2410241033fc..b23baf806bfdc 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -32,7 +32,7 @@ #include "ena_eth_com.h" -static inline struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( +static struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( struct ena_com_io_cq *io_cq) { struct ena_eth_io_rx_cdesc_base *cdesc; @@ -60,7 +60,7 @@ static inline struct ena_eth_io_rx_cdesc_base *ena_com_get_next_rx_cdesc( return cdesc; } -static inline void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) +static void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) { u16 tail_masked; u32 offset; @@ -72,7 +72,7 @@ static inline void *get_sq_desc_regular_queue(struct ena_com_io_sq *io_sq) return (void *)((uintptr_t)io_sq->desc_addr.virt_addr + offset); } -static inline int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, +static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, u8 *bounce_buffer) { struct ena_com_llq_info *llq_info = &io_sq->llq_info; @@ -112,7 +112,7 @@ static inline int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq return 0; } -static inline int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, +static int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, u8 *header_src, u16 header_len) { @@ -143,7 +143,7 @@ static inline int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, return 0; } -static inline void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) +static void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) { struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; u8 *bounce_buffer; @@ -163,7 +163,7 @@ static inline void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) return sq_desc; } -static inline int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) +static int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) { struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; struct ena_com_llq_info *llq_info = &io_sq->llq_info; @@ -176,8 +176,10 @@ static inline int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) if (pkt_ctrl->idx) { rc = ena_com_write_bounce_buffer_to_dev(io_sq, pkt_ctrl->curr_bounce_buf); - if (unlikely(rc)) + if (unlikely(rc)) { + pr_err("failed to write bounce buffer to device\n"); return rc; + } pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); @@ -190,7 +192,7 @@ static inline int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) return 0; } -static inline void *get_sq_desc(struct ena_com_io_sq *io_sq) +static void *get_sq_desc(struct ena_com_io_sq *io_sq) { if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) return get_sq_desc_llq(io_sq); @@ -198,7 +200,7 @@ static inline void *get_sq_desc(struct ena_com_io_sq *io_sq) return get_sq_desc_regular_queue(io_sq); } -static inline int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) +static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) { struct ena_com_llq_pkt_ctrl *pkt_ctrl = &io_sq->llq_buf_ctrl; struct ena_com_llq_info *llq_info = &io_sq->llq_info; @@ -207,8 +209,10 @@ static inline int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) if (!pkt_ctrl->descs_left_in_line) { rc = ena_com_write_bounce_buffer_to_dev(io_sq, 
pkt_ctrl->curr_bounce_buf); - if (unlikely(rc)) + if (unlikely(rc)) { + pr_err("failed to write bounce buffer to device\n"); return rc; + } pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); @@ -226,7 +230,7 @@ static inline int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) return 0; } -static inline int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) +static int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) { if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) return ena_com_sq_update_llq_tail(io_sq); @@ -240,7 +244,7 @@ static inline int ena_com_sq_update_tail(struct ena_com_io_sq *io_sq) return 0; } -static inline struct ena_eth_io_rx_cdesc_base * +static struct ena_eth_io_rx_cdesc_base * ena_com_rx_cdesc_idx_to_ptr(struct ena_com_io_cq *io_cq, u16 idx) { idx &= (io_cq->q_depth - 1); @@ -249,7 +253,7 @@ static inline struct ena_eth_io_rx_cdesc_base * idx * io_cq->cdesc_entry_size_in_bytes); } -static inline u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, +static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, u16 *first_cdesc_idx) { struct ena_eth_io_rx_cdesc_base *cdesc; @@ -287,11 +291,10 @@ static inline u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, return count; } -static inline int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, - struct ena_com_tx_ctx *ena_tx_ctx) +static int ena_com_create_meta(struct ena_com_io_sq *io_sq, + struct ena_com_tx_meta *ena_meta) { struct ena_eth_io_tx_meta_desc *meta_desc = NULL; - struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; meta_desc = get_sq_desc(io_sq); memset(meta_desc, 0x0, sizeof(struct ena_eth_io_tx_meta_desc)); @@ -311,12 +314,13 @@ static inline int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io /* Extended meta desc */ meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK; - meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; meta_desc->len_ctrl |= (io_sq->phase << ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT) & ENA_ETH_IO_TX_META_DESC_PHASE_MASK; meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_FIRST_MASK; + meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + meta_desc->word2 |= ena_meta->l3_hdr_len & ENA_ETH_IO_TX_META_DESC_L3_HDR_LEN_MASK; meta_desc->word2 |= (ena_meta->l3_hdr_offset << @@ -327,16 +331,37 @@ static inline int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT) & ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK; - meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_STORE_MASK; + return ena_com_sq_update_tail(io_sq); +} - /* Cached the meta desc */ - memcpy(&io_sq->cached_tx_meta, ena_meta, - sizeof(struct ena_com_tx_meta)); +static int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, + struct ena_com_tx_ctx *ena_tx_ctx, + bool *have_meta) +{ + struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; - return ena_com_sq_update_tail(io_sq); + /* When disable meta caching is set, don't bother to save the meta and + * compare it to the stored version, just create the meta + */ + if (io_sq->disable_meta_caching) { + if (unlikely(!ena_tx_ctx->meta_valid)) + return -EINVAL; + + *have_meta = true; + return ena_com_create_meta(io_sq, ena_meta); + } else if (ena_com_meta_desc_changed(io_sq, ena_tx_ctx)) { + *have_meta = true; + /* Cache the meta desc */ + memcpy(&io_sq->cached_tx_meta, ena_meta, + sizeof(struct ena_com_tx_meta)); + return ena_com_create_meta(io_sq, ena_meta); + } else 
{ + *have_meta = false; + return 0; + } } -static inline void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, +static void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, struct ena_eth_io_rx_cdesc_base *cdesc) { ena_rx_ctx->l3_proto = cdesc->status & @@ -397,24 +422,26 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, } if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && - !buffer_to_push)) + !buffer_to_push)) { + pr_err("push header wasn't provided on LLQ mode\n"); return -EINVAL; + } rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); if (unlikely(rc)) return rc; - have_meta = ena_tx_ctx->meta_valid && ena_com_meta_desc_changed(io_sq, - ena_tx_ctx); - if (have_meta) { - rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx); - if (unlikely(rc)) - return rc; + rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx, &have_meta); + if (unlikely(rc)) { + pr_err("failed to create and store tx meta desc\n"); + return rc; } /* If the caller doesn't want to send packets */ if (unlikely(!num_bufs && !header_len)) { rc = ena_com_close_bounce_buffer(io_sq); + if (rc) + pr_err("failed to write buffers to LLQ\n"); *nb_hw_desc = io_sq->tail - start_tail; return rc; } @@ -474,8 +501,10 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* The first desc share the same desc as the header */ if (likely(i != 0)) { rc = ena_com_sq_update_tail(io_sq); - if (unlikely(rc)) + if (unlikely(rc)) { + pr_err("failed to update sq tail\n"); return rc; + } desc = get_sq_desc(io_sq); if (unlikely(!desc)) @@ -504,10 +533,14 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; rc = ena_com_sq_update_tail(io_sq); - if (unlikely(rc)) + if (unlikely(rc)) { + pr_err("failed to update sq tail of the last descriptor\n"); return rc; + } rc = ena_com_close_bounce_buffer(io_sq); + if (rc) + pr_err("failed when closing bounce buffer\n"); *nb_hw_desc = io_sq->tail - start_tail; return rc; @@ -521,7 +554,7 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, struct ena_eth_io_rx_cdesc_base *cdesc = NULL; u16 cdesc_idx = 0; u16 nb_hw_desc; - u16 i; + u16 i = 0; WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); @@ -540,13 +573,14 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, return -ENOSPC; } - for (i = 0; i < nb_hw_desc; i++) { - cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); + ena_rx_ctx->pkt_offset = cdesc->offset; + do { ena_buf->len = cdesc->length; ena_buf->req_id = cdesc->req_id; ena_buf++; - } + } while ((++i < nb_hw_desc) && (cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i))); /* Update SQ head ptr */ io_sq->next_to_comp += nb_hw_desc; diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index 2a37463bc9569..8b1afd3b32f26 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -73,6 +73,7 @@ struct ena_com_rx_ctx { u32 hash; u16 descs; int max_bufs; + u8 pkt_offset; }; int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, @@ -95,7 +96,7 @@ static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, writel(intr_reg->intr_control, io_cq->unmask_reg); } -static inline int ena_com_free_desc(struct ena_com_io_sq *io_sq) +static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq) { u16 tail, next_to_comp, cnt; @@ -113,7 +114,7 @@ static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, int 
temp; if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - return ena_com_free_desc(io_sq) >= required_buffers; + return ena_com_free_q_entries(io_sq) >= required_buffers; /* This calculation doesn't need to be 100% accurate. So to reduce * the calculation overhead just Subtract 2 lines from the free descs @@ -122,7 +123,7 @@ static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, */ temp = required_buffers / io_sq->llq_info.descs_per_entry + 2; - return ena_com_free_desc(io_sq) > temp; + return ena_com_free_q_entries(io_sq) > temp; } static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, @@ -173,8 +174,8 @@ static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) { - u16 tail = io_sq->tail; u16 max_entries_in_tx_burst = io_sq->llq_info.max_entries_in_tx_burst; + u16 tail = io_sq->tail; pr_debug("write submission queue doorbell for queue: %d tail: %d\n", io_sq->qid, tail); @@ -195,15 +196,17 @@ static inline int ena_com_update_dev_comp_head(struct ena_com_io_cq *io_cq) u16 unreported_comp, head; bool need_update; - head = io_cq->head; - unreported_comp = head - io_cq->last_head_update; - need_update = unreported_comp > (io_cq->q_depth / ENA_COMP_HEAD_THRESH); - - if (io_cq->cq_head_db_reg && need_update) { - pr_debug("Write completion queue doorbell for queue %d: head: %d\n", - io_cq->qid, head); - writel(head, io_cq->cq_head_db_reg); - io_cq->last_head_update = head; + if (unlikely(io_cq->cq_head_db_reg)) { + head = io_cq->head; + unreported_comp = head - io_cq->last_head_update; + need_update = unreported_comp > (io_cq->q_depth / ENA_COMP_HEAD_THRESH); + + if (unlikely(need_update)) { + pr_debug("Write completion queue doorbell for queue %d: head: %d\n", + io_cq->qid, head); + writel(head, io_cq->cq_head_db_reg); + io_cq->last_head_update = head; + } } return 0; diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h index bab1591c8b9cd..4dd382e15ed34 100755 --- a/drivers/amazon/net/ena/ena_eth_io_defs.h +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. 
* @@ -265,7 +264,9 @@ struct ena_eth_io_rx_cdesc_base { u16 sub_qid; - u16 reserved; + u8 offset; + + u8 reserved; }; /* 8-word format */ @@ -413,4 +414,4 @@ struct ena_eth_io_numa_node_cfg_reg { #define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 #define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) -#endif /*_ENA_ETH_IO_H_ */ +#endif /* _ENA_ETH_IO_H_ */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 557fb2259278c..4169142bcc991 100755 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -66,6 +66,8 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(interface_up), ENA_STAT_GLOBAL_ENTRY(interface_down), ENA_STAT_GLOBAL_ENTRY(admin_q_pause), + ENA_STAT_GLOBAL_ENTRY(rx_drops), + ENA_STAT_GLOBAL_ENTRY(tx_drops), }; static const struct ena_stats ena_stats_tx_strings[] = { @@ -83,18 +85,20 @@ static const struct ena_stats ena_stats_tx_strings[] = { ENA_STAT_TX_ENTRY(bad_req_id), ENA_STAT_TX_ENTRY(llq_buffer_copy), ENA_STAT_TX_ENTRY(missed_tx), + ENA_STAT_TX_ENTRY(unmask_interrupt), }; static const struct ena_stats ena_stats_rx_strings[] = { ENA_STAT_RX_ENTRY(cnt), ENA_STAT_RX_ENTRY(bytes), + ENA_STAT_RX_ENTRY(rx_copybreak_pkt), + ENA_STAT_RX_ENTRY(csum_good), ENA_STAT_RX_ENTRY(refil_partial), ENA_STAT_RX_ENTRY(bad_csum), ENA_STAT_RX_ENTRY(page_alloc_fail), ENA_STAT_RX_ENTRY(skb_alloc_fail), ENA_STAT_RX_ENTRY(dma_mapping_err), ENA_STAT_RX_ENTRY(bad_desc_num), - ENA_STAT_RX_ENTRY(rx_copybreak_pkt), #if ENA_BUSY_POLL_SUPPORT ENA_STAT_RX_ENTRY(bp_yield), ENA_STAT_RX_ENTRY(bp_missed), @@ -137,7 +141,7 @@ static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) u64 *ptr; int i, j; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { /* Tx stats */ ring = &adapter->tx_ring[i]; @@ -202,24 +206,15 @@ static void ena_get_ethtool_stats(struct net_device *netdev, ena_dev_admin_queue_stats(adapter, &data); } -static int get_stats_sset_count(struct ena_adapter *adapter) -{ - return adapter->num_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) - + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; -} - int ena_get_sset_count(struct net_device *netdev, int sset) { struct ena_adapter *adapter = netdev_priv(netdev); - switch (sset) { - case ETH_SS_STATS: - return get_stats_sset_count(adapter); - case ETH_SS_PRIV_FLAGS: - return adapter->ena_extra_properties_count; - default: + if (sset != ETH_SS_STATS) return -EOPNOTSUPP; - } + + return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; } static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) @@ -227,7 +222,7 @@ static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) const struct ena_stats *ena_stats; int i, j; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { /* Tx stats */ for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { ena_stats = &ena_stats_tx_strings[j]; @@ -261,56 +256,25 @@ static void ena_com_dev_strings(u8 **data) } } -static void get_stats_strings(struct ena_adapter *adapter, u8 *data) +static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) { + struct ena_adapter *adapter = netdev_priv(netdev); const struct ena_stats *ena_stats; int i; + if (sset != ETH_SS_STATS) + return; + for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; memcpy(data, ena_stats->name, ETH_GSTRING_LEN); data += 
ETH_GSTRING_LEN; } + ena_queue_strings(adapter, &data); ena_com_dev_strings(&data); } -static void get_private_flags_strings(struct ena_adapter *adapter, u8 *data) -{ - struct ena_com_dev *ena_dev = adapter->ena_dev; - u8 *strings = ena_dev->extra_properties_strings.virt_addr; - int i; - - if (unlikely(!strings)) { - adapter->ena_extra_properties_count = 0; - netif_err(adapter, drv, adapter->netdev, - "Failed to allocate extra properties strings\n"); - return; - } - - for (i = 0; i < adapter->ena_extra_properties_count; i++) { - snprintf(data, ETH_GSTRING_LEN, "%s", - strings + ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN * i); - data += ETH_GSTRING_LEN; - } -} - -static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) -{ - struct ena_adapter *adapter = netdev_priv(netdev); - - switch (sset) { - case ETH_SS_STATS: - get_stats_strings(adapter, data); - break; - case ETH_SS_PRIV_FLAGS: - get_private_flags_strings(adapter, data); - break; - default: - break; - } -} - #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) static int ena_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *link_ksettings) @@ -382,35 +346,45 @@ static int ena_get_coalesce(struct net_device *net_dev, struct ena_adapter *adapter = netdev_priv(net_dev); struct ena_com_dev *ena_dev = adapter->ena_dev; - if (!ena_com_interrupt_moderation_supported(ena_dev)) { - /* the devie doesn't support interrupt moderation */ + if (!ena_com_interrupt_moderation_supported(ena_dev)) return -EOPNOTSUPP; - } + coalesce->tx_coalesce_usecs = - ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) / + ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) * ena_dev->intr_delay_resolution; - if (!ena_com_get_adaptive_moderation_enabled(ena_dev)) { - coalesce->rx_coalesce_usecs = - ena_com_get_nonadaptive_moderation_interval_rx(ena_dev) - / ena_dev->intr_delay_resolution; - } + + coalesce->rx_coalesce_usecs = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev) + * ena_dev->intr_delay_resolution; + coalesce->use_adaptive_rx_coalesce = ena_com_get_adaptive_moderation_enabled(ena_dev); return 0; } -static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter) +static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) { unsigned int val; int i; val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); - for (i = 0; i < adapter->num_queues; i++) + for (i = 0; i < adapter->num_io_queues; i++) adapter->tx_ring[i].smoothed_interval = val; } +static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) +{ + unsigned int val; + int i; + + val = ena_com_get_nonadaptive_moderation_interval_rx(adapter->ena_dev); + + for (i = 0; i < adapter->num_io_queues; i++) + adapter->rx_ring[i].smoothed_interval = val; +} + static int ena_set_coalesce(struct net_device *net_dev, struct ethtool_coalesce *coalesce) { @@ -418,63 +392,30 @@ static int ena_set_coalesce(struct net_device *net_dev, struct ena_com_dev *ena_dev = adapter->ena_dev; int rc; - if (!ena_com_interrupt_moderation_supported(ena_dev)) { - /* the devie doesn't support interrupt moderation */ + if (!ena_com_interrupt_moderation_supported(ena_dev)) return -EOPNOTSUPP; - } - - if (coalesce->rx_coalesce_usecs_irq || - coalesce->rx_max_coalesced_frames_irq || - coalesce->tx_coalesce_usecs_irq || - coalesce->tx_max_coalesced_frames || - coalesce->tx_max_coalesced_frames_irq || - coalesce->stats_block_coalesce_usecs || - coalesce->use_adaptive_tx_coalesce || - 
coalesce->pkt_rate_low || - coalesce->tx_coalesce_usecs_low || - coalesce->tx_max_coalesced_frames_low || - coalesce->pkt_rate_high || - coalesce->tx_coalesce_usecs_high || - coalesce->tx_max_coalesced_frames_high || - coalesce->rate_sample_interval) - return -EINVAL; - /* Note, adaptive coalescing settings are updated through sysfs */ - if (coalesce->rx_max_coalesced_frames || - coalesce->rx_coalesce_usecs_low || - coalesce->rx_max_coalesced_frames_low || - coalesce->rx_coalesce_usecs_high || - coalesce->rx_max_coalesced_frames_high) - return -EINVAL; rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, coalesce->tx_coalesce_usecs); if (rc) return rc; - ena_update_tx_rings_intr_moderation(adapter); + ena_update_tx_rings_nonadaptive_intr_moderation(adapter); - if (ena_com_get_adaptive_moderation_enabled(ena_dev)) { - if (!coalesce->use_adaptive_rx_coalesce) { - ena_com_disable_adaptive_moderation(ena_dev); - rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, - coalesce->rx_coalesce_usecs); - return rc; - } else { - /* was in adaptive mode and remains in it, - * allow to update only tx_usecs, rx is managed through sysfs - */ - if (coalesce->rx_coalesce_usecs) - return -EINVAL; - } - } else { /* was in non-adaptive mode */ - if (coalesce->use_adaptive_rx_coalesce) { - ena_com_enable_adaptive_moderation(ena_dev); - } else { - rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, - coalesce->rx_coalesce_usecs); - return rc; - } - } + rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, + coalesce->rx_coalesce_usecs); + if (rc) + return rc; + + ena_update_rx_rings_nonadaptive_intr_moderation(adapter); + + if (coalesce->use_adaptive_rx_coalesce && + !ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_enable_adaptive_moderation(ena_dev); + + if (!coalesce->use_adaptive_rx_coalesce && + ena_com_get_adaptive_moderation_enabled(ena_dev)) + ena_com_disable_adaptive_moderation(ena_dev); return 0; } @@ -502,20 +443,41 @@ static void ena_get_drvinfo(struct net_device *dev, strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); strlcpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); - info->n_priv_flags = adapter->ena_extra_properties_count; } static void ena_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) { struct ena_adapter *adapter = netdev_priv(netdev); - struct ena_ring *tx_ring = &adapter->tx_ring[0]; - struct ena_ring *rx_ring = &adapter->rx_ring[0]; - ring->rx_max_pending = rx_ring->ring_size; - ring->tx_max_pending = tx_ring->ring_size; - ring->rx_pending = rx_ring->ring_size; - ring->tx_pending = tx_ring->ring_size; + ring->tx_max_pending = adapter->max_tx_ring_size; + ring->rx_max_pending = adapter->max_rx_ring_size; + ring->tx_pending = adapter->tx_ring[0].ring_size; + ring->rx_pending = adapter->rx_ring[0].ring_size; +} + +static int ena_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 new_tx_size, new_rx_size; + + if (ring->rx_mini_pending || ring->rx_jumbo_pending) + return -EINVAL; + + new_tx_size = clamp_val(ring->tx_pending, ENA_MIN_RING_SIZE, + adapter->max_tx_ring_size); + new_tx_size = rounddown_pow_of_two(new_tx_size); + + new_rx_size = clamp_val(ring->rx_pending, ENA_MIN_RING_SIZE, + adapter->max_rx_ring_size); + new_rx_size = rounddown_pow_of_two(new_rx_size); + + if (new_tx_size == adapter->requested_tx_ring_size && + new_rx_size == adapter->requested_rx_ring_size) + return 0; + + 
return ena_update_queue_sizes(adapter, new_tx_size, new_rx_size); } #ifdef ETHTOOL_GRXRINGS @@ -691,7 +653,7 @@ static int ena_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, switch (info->cmd) { case ETHTOOL_GRXRINGS: - info->data = adapter->num_queues; + info->data = adapter->num_io_queues; rc = 0; break; case ETHTOOL_GRXFH: @@ -723,29 +685,82 @@ static u32 ena_get_rxfh_key_size(struct net_device *netdev) } #endif + +static int ena_indirection_table_set(struct ena_adapter *adapter, + const u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { + rc = ena_com_indirect_table_fill_entry(ena_dev, + i, + ENA_IO_RXQ_IDX(indir[i])); + if (unlikely(rc)) { + netif_err(adapter, drv, adapter->netdev, + "Cannot fill indirect table (index is too large)\n"); + return rc; + } + } + + rc = ena_com_indirect_table_set(ena_dev); + if (rc) { + netif_err(adapter, drv, adapter->netdev, + "Cannot set indirect table\n"); + return rc == -EPERM ? -EOPNOTSUPP : rc; + } + return rc; +} + +static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + if (!indir) + return 0; + + rc = ena_com_indirect_table_get(ena_dev, indir); + if (rc) + return rc; + + /* Our internal representation of the indices is: even indices + * for Tx and uneven indices for Rx. We need to convert the Rx + * indices to be consecutive + */ + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) + indir[i] = ENA_IO_RXQ_IDX_TO_COMBINED_IDX(indir[i]); + + return rc; +} + #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) { struct ena_adapter *adapter = netdev_priv(netdev); - enum ena_admin_hash_functions ena_func; + enum ena_admin_hash_functions ena_func = ENA_ADMIN_TOEPLITZ; u8 func; int rc; - rc = ena_com_indirect_table_get(adapter->ena_dev, indir); + rc = ena_indirection_table_get(adapter, indir); if (rc) return rc; - rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func, key); + rc = ena_com_get_hash_key(adapter->ena_dev, key); if (rc) return rc; + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc && rc != -EOPNOTSUPP) + return rc; + switch (ena_func) { case ENA_ADMIN_TOEPLITZ: func = ETH_RSS_HASH_TOP; break; case ENA_ADMIN_CRC32: - func = ETH_RSS_HASH_XOR; + func = ETH_RSS_HASH_CRC32; break; default: netif_err(adapter, drv, netdev, @@ -756,7 +771,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, if (hfunc) *hfunc = func; - return rc; + return 0; } #elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) @@ -764,14 +779,13 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) struct ena_adapter *adapter = netdev_priv(netdev); int rc; - rc = ena_com_indirect_table_get(adapter->ena_dev, indir); + rc = ena_indirection_table_get(adapter, indir); if (rc) return rc; - rc = ena_com_get_hash_function(adapter->ena_dev, NULL, key); - if (rc) { + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) return rc; - } return rc; } @@ -780,7 +794,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir) { struct ena_adapter *adapter = netdev_priv(netdev); - return ena_com_indirect_table_get(adapter->ena_dev, indir); + return ena_indirection_table_get(adapter, indir); } #endif /* >= 3.8.0 */ @@ -795,35 +809,24 @@ static int ena_set_rxfh(struct net_device 
*netdev, const u32 *indir, { struct ena_adapter *adapter = netdev_priv(netdev); struct ena_com_dev *ena_dev = adapter->ena_dev; - enum ena_admin_hash_functions func; - int rc, i; + enum ena_admin_hash_functions func = 0; + int rc; if (indir) { - for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { - rc = ena_com_indirect_table_fill_entry(ena_dev, - ENA_IO_RXQ_IDX(indir[i]), - i); - if (unlikely(rc)) { - netif_err(adapter, drv, netdev, - "Cannot fill indirect table (index is too large)\n"); - return rc; - } - } - - rc = ena_com_indirect_table_set(ena_dev); - if (rc) { - netif_err(adapter, drv, netdev, - "Cannot set indirect table\n"); - return rc == -EPERM ? -EOPNOTSUPP : rc; - } + rc = ena_indirection_table_set(adapter, indir); + if (rc) + return rc; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) switch (hfunc) { + case ETH_RSS_HASH_NO_CHANGE: + func = ena_com_get_current_hash_function(ena_dev); + break; case ETH_RSS_HASH_TOP: func = ENA_ADMIN_TOEPLITZ; break; - case ETH_RSS_HASH_XOR: + case ETH_RSS_HASH_CRC32: func = ENA_ADMIN_CRC32; break; default: @@ -835,7 +838,7 @@ static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, func = ENA_ADMIN_TOEPLITZ; #endif - if (key) { + if (key || func) { rc = ena_com_fill_hash_function(ena_dev, func, key, ENA_HASH_KEY_SIZE, 0xFFFFFFFF); @@ -851,30 +854,14 @@ static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, static int ena_set_rxfh(struct net_device *netdev, const u32 *indir) { struct ena_adapter *adapter = netdev_priv(netdev); - struct ena_com_dev *ena_dev = adapter->ena_dev; - int rc, i; - - if (!indir) - return 0; - for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { - rc = ena_com_indirect_table_fill_entry(ena_dev, i, - ENA_IO_RXQ_IDX(indir[i])); - if (unlikely(rc)) { - netif_err(adapter, drv, netdev, - "Cannot fill indirect table (index is too large)\n"); - return rc; - } - } + int rc = 0; - rc = ena_com_indirect_table_set(ena_dev); - if (unlikely(rc)) { - netif_err(adapter, drv, netdev, "Cannot set indirect table\n"); - return rc == -EPERM ? 
-EOPNOTSUPP : rc; - } + if (indir) + rc = ena_indirection_table_set(adapter, indir); - return 0; + return rc; } -#endif /* Kernel > 3.16 */ +#endif /* Kernel >= 3.8 */ #endif /* ETHTOOL_GRXFH */ #ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT @@ -884,14 +871,29 @@ static void ena_get_channels(struct net_device *netdev, { struct ena_adapter *adapter = netdev_priv(netdev); - channels->max_rx = adapter->num_queues; - channels->max_tx = adapter->num_queues; - channels->max_other = 0; - channels->max_combined = 0; - channels->rx_count = adapter->num_queues; - channels->tx_count = adapter->num_queues; - channels->other_count = 0; - channels->combined_count = 0; + channels->max_combined = adapter->max_num_io_queues; + channels->combined_count = adapter->num_io_queues; +} + +static int ena_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 count = channels->combined_count; + /* The check for max value is already done in ethtool */ +#ifdef ENA_XDP_SUPPORT + if (count < ENA_MIN_NUM_IO_QUEUES || + (ena_xdp_present(adapter) && + !ena_xdp_legal_queue_count(adapter, channels->combined_count))) +#else + if (count < ENA_MIN_NUM_IO_QUEUES) +#endif /* ENA_XDP_SUPPORT */ + return -EINVAL; + if (count > adapter->max_num_io_queues) + return -EINVAL; + + + return ena_update_queue_count(adapter, count); } #endif /* ETHTOOL_SCHANNELS */ @@ -941,20 +943,6 @@ static int ena_set_tunable(struct net_device *netdev, } #endif /* 3.18.0 */ -static u32 ena_get_priv_flags(struct net_device *netdev) -{ - struct ena_adapter *adapter = netdev_priv(netdev); - struct ena_com_dev *ena_dev = adapter->ena_dev; - struct ena_admin_get_feat_resp get_resp; - u32 rc; - - rc = ena_com_get_extra_properties_flags(ena_dev, &get_resp); - if (!rc) - return get_resp.u.extra_properties_flags.flags; - - return 0; -} - static const struct ethtool_ops ena_ethtool_ops = { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) .get_link_ksettings = ena_get_link_ksettings, @@ -968,6 +956,7 @@ static const struct ethtool_ops ena_ethtool_ops = { .get_coalesce = ena_get_coalesce, .set_coalesce = ena_set_coalesce, .get_ringparam = ena_get_ringparam, + .set_ringparam = ena_set_ringparam, .get_sset_count = ena_get_sset_count, .get_strings = ena_get_strings, .get_ethtool_stats = ena_get_ethtool_stats, @@ -989,13 +978,16 @@ static const struct ethtool_ops ena_ethtool_ops = { #ifndef HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT #ifdef ETHTOOL_SCHANNELS .get_channels = ena_get_channels, + .set_channels = ena_set_channels, #endif /* ETHTOOL_SCHANNELS */ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) .get_tunable = ena_get_tunable, .set_tunable = ena_set_tunable, #endif - .get_priv_flags = ena_get_priv_flags, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) + .get_ts_info = ethtool_op_get_ts_info, +#endif }; void ena_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 4e3a5cc368c2e..6d7b628adf0d0 100755 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -36,7 +36,6 @@ #include #endif /* CONFIG_RFS_ACCEL */ #include -#include #include #include #include @@ -51,6 +50,9 @@ #include #include "ena_netdev.h" +#ifdef ENA_XDP_SUPPORT +#include +#endif /* ENA_XDP_SUPPORT */ #include "ena_pci_id_tbl.h" #include "ena_sysfs.h" @@ -76,6 +78,14 @@ static int rx_queue_size = ENA_DEFAULT_RING_SIZE; module_param(rx_queue_size, int, S_IRUGO); MODULE_PARM_DESC(rx_queue_size, "Rx queue 
size. The size should be a power of 2. Max value is 8K\n"); +static int force_large_llq_header; +module_param(force_large_llq_header, int, S_IRUGO); +MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n"); + +static int num_io_queues = ENA_MAX_NUM_IO_QUEUES; +module_param(num_io_queues, int, 0444); +MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device. The maximum value depends on the device and number of online CPUs.\n"); + static struct ena_aenq_handlers aenq_handlers; static struct workqueue_struct *ena_wq; @@ -86,12 +96,44 @@ static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); static void ena_destroy_device(struct ena_adapter *adapter, bool graceful); static int ena_restore_device(struct ena_adapter *adapter); -static int ena_calc_io_queue_num(struct pci_dev *pdev, - struct ena_com_dev *ena_dev, - struct ena_com_dev_get_features_ctx *get_feat_ctx); -static int ena_calc_queue_size(struct ena_calc_queue_size_ctx *ctx); +#ifdef ENA_XDP_SUPPORT +static void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count); +static void ena_init_napi_in_range(struct ena_adapter *adapter, int first_index, + int count); +static void ena_del_napi_in_range(struct ena_adapter *adapter, int first_index, + int count); +static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid); +static int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, + int count); +static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid); +static void ena_free_tx_resources(struct ena_adapter *adapter, int qid); +static int ena_clean_xdp_irq(struct ena_ring *xdp_ring, u32 budget); +static void ena_destroy_all_tx_queues(struct ena_adapter *adapter); +static void ena_free_all_io_tx_resources(struct ena_adapter *adapter); +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, int count); +static void ena_napi_enable_in_range(struct ena_adapter *adapter, + int first_index, int count); +static int ena_up(struct ena_adapter *adapter); +static void ena_down(struct ena_adapter *adapter); +static void ena_unmask_interrupt(struct ena_ring *tx_ring, + struct ena_ring *rx_ring); +static void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring); +static void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info); +static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count); +#endif /* ENA_XDP_SUPPORT */ + +#ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) +#else static void ena_tx_timeout(struct net_device *dev) +#endif { struct ena_adapter *adapter = netdev_priv(dev); @@ -114,7 +156,7 @@ static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu) { int i; - for (i = 0; i < adapter->num_queues; i++) + for (i = 0; i < adapter->num_io_queues; i++) adapter->rx_ring[i].mtu = mtu; } @@ -123,7 +165,7 @@ static int ena_change_mtu(struct net_device *dev, int new_mtu) struct ena_adapter *adapter = netdev_priv(dev); int ret; -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +#ifndef HAVE_MTU_MIN_MAX_IN_NET_DEVICE if ((new_mtu > adapter->max_mtu) || (new_mtu < ENA_MIN_MTU)) { netif_err(adapter, drv, dev, "Invalid MTU setting. 
new_mtu: %d max mtu: %d min mtu: %d\n", @@ -144,16 +186,463 @@ static int ena_change_mtu(struct net_device *dev, int new_mtu) return ret; } +static int ena_xmit_common(struct net_device *dev, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes) +{ + struct ena_adapter *adapter = netdev_priv(dev); + int rc, nb_hw_desc; + + if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq, + ena_tx_ctx))) { + netif_dbg(adapter, tx_queued, dev, + "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", + ring->qid); + ena_com_write_sq_doorbell(ring->ena_com_io_sq); + } + + /* prepare the packet's descriptors to dma engine */ + rc = ena_com_prepare_tx(ring->ena_com_io_sq, ena_tx_ctx, + &nb_hw_desc); + + /* In case there isn't enough space in the queue for the packet, + * we simply drop it. All other failure reasons of + * ena_com_prepare_tx() are fatal and therefore require a device reset. + */ + if (unlikely(rc)) { + netif_err(adapter, tx_queued, dev, + "failed to prepare tx bufs\n"); + u64_stats_update_begin(&ring->syncp); + ring->tx_stats.prepare_ctx_err++; + u64_stats_update_end(&ring->syncp); + if (rc != -ENOMEM) { + adapter->reset_reason = + ENA_REGS_RESET_DRIVER_INVALID_STATE; + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + } + return rc; + } + + u64_stats_update_begin(&ring->syncp); + ring->tx_stats.cnt++; + ring->tx_stats.bytes += bytes; + u64_stats_update_end(&ring->syncp); + + tx_info->tx_descs = nb_hw_desc; + tx_info->last_jiffies = jiffies; + tx_info->print_once = 0; + + ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, + ring->ring_size); + return 0; +} + +#ifdef ENA_XDP_SUPPORT +/* This is the XDP napi callback. XDP queues use a separate napi callback + * than Rx/Tx queues. 
+ */ +static int ena_xdp_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + u32 xdp_work_done, xdp_budget; + struct ena_ring *xdp_ring; + int napi_comp_call = 0; + int ret; + + xdp_ring = ena_napi->xdp_ring; + xdp_ring->first_interrupt = ena_napi->first_interrupt; + + xdp_budget = budget; + + if (!test_bit(ENA_FLAG_DEV_UP, &xdp_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &xdp_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } + + xdp_work_done = ena_clean_xdp_irq(xdp_ring, xdp_budget); + + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &xdp_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + } else if (xdp_budget > xdp_work_done) { + napi_comp_call = 1; + if (napi_complete_done(napi, xdp_work_done)) + ena_unmask_interrupt(xdp_ring, NULL); + ena_update_ring_numa_node(xdp_ring, NULL); + ret = xdp_work_done; + } else { + ret = xdp_budget; + } + + u64_stats_update_begin(&xdp_ring->syncp); + xdp_ring->tx_stats.napi_comp += napi_comp_call; + xdp_ring->tx_stats.tx_poll++; + u64_stats_update_end(&xdp_ring->syncp); + + return ret; +} + +static int ena_xdp_tx_map_buff(struct ena_ring *xdp_ring, + struct ena_tx_buffer *tx_info, + struct xdp_buff *xdp, + void **push_hdr, + u32 *push_len) +{ + struct ena_adapter *adapter = xdp_ring->adapter; + struct ena_com_buf *ena_buf; + dma_addr_t dma = 0; + u32 size; + + tx_info->xdpf = convert_to_xdp_frame(xdp); + size = tx_info->xdpf->len; + ena_buf = tx_info->bufs; + + /* llq push buffer */ + *push_len = min_t(u32, size, xdp_ring->tx_max_header_size); + *push_hdr = tx_info->xdpf->data; + + if (size - *push_len > 0) { + dma = dma_map_single(xdp_ring->dev, + *push_hdr + *push_len, + size - *push_len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(xdp_ring->dev, dma))) + goto error_report_dma_error; + + tx_info->map_linear_data = 1; + tx_info->num_of_bufs = 1; + } + + ena_buf->paddr = dma; + ena_buf->len = size; + + return 0; + +error_report_dma_error: + u64_stats_update_begin(&xdp_ring->syncp); + xdp_ring->tx_stats.dma_mapping_err++; + u64_stats_update_end(&xdp_ring->syncp); + netdev_warn(adapter->netdev, "failed to map xdp buff\n"); + + xdp_return_frame_rx_napi(tx_info->xdpf); + tx_info->xdpf = NULL; + tx_info->num_of_bufs = 0; + + return -EINVAL; +} + +static int ena_xdp_xmit_buff(struct net_device *dev, + struct xdp_buff *xdp, + int qid, + struct ena_rx_buffer *rx_info) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_com_tx_ctx ena_tx_ctx = {0}; + struct ena_tx_buffer *tx_info; + struct ena_ring *xdp_ring; + struct ena_ring *rx_ring; + u16 next_to_use, req_id; + int rc; + void *push_hdr; + u32 push_len; + + xdp_ring = &adapter->tx_ring[qid]; + next_to_use = xdp_ring->next_to_use; + req_id = xdp_ring->free_ids[next_to_use]; + tx_info = &xdp_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + rx_ring = &xdp_ring->adapter->rx_ring[qid - + xdp_ring->adapter->xdp_first_ring]; + page_ref_inc(rx_info->page); + tx_info->xdp_rx_page = rx_info->page; + + rc = ena_xdp_tx_map_buff(xdp_ring, tx_info, xdp, &push_hdr, &push_len); + if (unlikely(rc)) + goto error_drop_packet; + + ena_tx_ctx.ena_bufs = tx_info->bufs; + ena_tx_ctx.push_header = push_hdr; + ena_tx_ctx.num_bufs = tx_info->num_of_bufs; + ena_tx_ctx.req_id = req_id; + ena_tx_ctx.header_len = push_len; + + rc = ena_xmit_common(dev, + xdp_ring, + 
tx_info, + &ena_tx_ctx, + next_to_use, + xdp->data_end - xdp->data); + if (rc) + goto error_unmap_dma; + /* trigger the dma engine. ena_com_write_sq_doorbell() + * has a mb + */ + ena_com_write_sq_doorbell(xdp_ring->ena_com_io_sq); + u64_stats_update_begin(&xdp_ring->syncp); + xdp_ring->tx_stats.doorbells++; + u64_stats_update_end(&xdp_ring->syncp); + + return NETDEV_TX_OK; + +error_unmap_dma: + ena_unmap_tx_buff(xdp_ring, tx_info); + tx_info->xdpf = NULL; +error_drop_packet: + + return NETDEV_TX_OK; +} + +static int ena_xdp_execute(struct ena_ring *rx_ring, + struct xdp_buff *xdp, + struct ena_rx_buffer *rx_info) +{ + struct bpf_prog *xdp_prog; + u32 verdict = XDP_PASS; + + rcu_read_lock(); + xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); + + if (!xdp_prog) + goto out; + + verdict = bpf_prog_run_xdp(xdp_prog, xdp); + + if (verdict == XDP_TX) + ena_xdp_xmit_buff(rx_ring->netdev, + xdp, + rx_ring->qid + rx_ring->adapter->num_io_queues, + rx_info); + else if (unlikely(verdict == XDP_ABORTED)) + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + else if (unlikely(verdict > XDP_TX)) + bpf_warn_invalid_xdp_action(verdict); +out: + rcu_read_unlock(); + return verdict; +} + +static void ena_init_all_xdp_queues(struct ena_adapter *adapter) +{ + adapter->xdp_first_ring = adapter->num_io_queues; + adapter->xdp_num_queues = adapter->num_io_queues; + + ena_init_io_rings(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); +} + +static int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) +{ + int rc = 0; + + rc = ena_setup_tx_resources_in_range(adapter, adapter->xdp_first_ring, + adapter->xdp_num_queues); + if (rc) + goto setup_err; + + rc = ena_create_io_tx_queues_in_range(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); + if (rc) + goto create_err; + + return 0; + +create_err: + ena_free_all_io_tx_resources(adapter); +setup_err: + return rc; +} + +/* Provides a way for both kernel and bpf-prog to know + * more about the RX-queue a given XDP frame arrived on. + */ +static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + int rc; + + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); + + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info. RX queue num %d rc: %d\n", + rx_ring->qid, rc); + goto err; + } + + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, + NULL); + + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info memory model. 
RX queue num %d rc: %d\n", + rx_ring->qid, rc); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } + +err: + return rc; +} + +static void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) +{ + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +} + +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, + int count) +{ + struct ena_ring *rx_ring; + int i = 0; + + for (i = first; i < count; i++) { + rx_ring = &adapter->rx_ring[i]; + xchg(&rx_ring->xdp_bpf_prog, prog); + if (prog) { + ena_xdp_register_rxq_info(rx_ring); + rx_ring->rx_headroom = XDP_PACKET_HEADROOM; + } else { + ena_xdp_unregister_rxq_info(rx_ring); + rx_ring->rx_headroom = 0; + } + } +} + +void ena_xdp_exchange_program(struct ena_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); + + ena_xdp_exchange_program_rx_in_range(adapter, + prog, + 0, + adapter->num_io_queues); + + if (old_bpf_prog) + bpf_prog_put(old_bpf_prog); +} + +static int ena_destroy_and_free_all_xdp_queues(struct ena_adapter *adapter) +{ + bool was_up; + int rc; + + was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + if (was_up) + ena_down(adapter); + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + ena_xdp_exchange_program(adapter, NULL); + if (was_up) { + rc = ena_up(adapter); + if (rc) + return rc; + } + return 0; +} + +static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct bpf_prog *prog = bpf->prog; + struct bpf_prog *old_bpf_prog; + int rc, prev_mtu; + bool is_up; + + is_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + rc = ena_xdp_allowed(adapter); + if (rc == ENA_XDP_ALLOWED) { + old_bpf_prog = adapter->xdp_bpf_prog; + if (prog) { + if (!is_up) { + ena_init_all_xdp_queues(adapter); + } else if (!old_bpf_prog) { + ena_down(adapter); + ena_init_all_xdp_queues(adapter); + } + ena_xdp_exchange_program(adapter, prog); + + if (is_up && !old_bpf_prog) { + rc = ena_up(adapter); + if (rc) + return rc; + } + } else if (old_bpf_prog) { + rc = ena_destroy_and_free_all_xdp_queues(adapter); + if (rc) + return rc; + } + + prev_mtu = netdev->max_mtu; + netdev->max_mtu = prog ? ENA_XDP_MAX_MTU : adapter->max_mtu; + + if (!old_bpf_prog) + netif_info(adapter, drv, adapter->netdev, + "xdp program set, changing the max_mtu from %d to %d", + prev_mtu, netdev->max_mtu); + + } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the current MTU (%d) is larger than the maximum allowed MTU (%lu) while xdp is on", + netdev->mtu, ENA_XDP_MAX_MTU); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, the current MTU is larger than the maximum allowed MTU. Check the dmesg for more info"); + return -EINVAL; + } else if (rc == ENA_XDP_NO_ENOUGH_QUEUES) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the Rx/Tx channel count should be at most half of the maximum allowed channel count. The current queue count (%d), the maximal queue count (%d)\n", + adapter->num_io_queues, adapter->max_num_io_queues); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, there is no enough space for allocating XDP queues, Check the dmesg for more info"); + return -EINVAL; + } + + return 0; +} + +/* This is the main xdp callback, it's used by the kernel to set/unset the xdp + * program as well as to query the current xdp program id. 
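(Usage illustration, not part of the patch.) The ndo_bpf callback described above is what the kernel invokes when userspace attaches or detaches an XDP program, for example with "ip link set dev eth0 xdpdrv obj xdp_pass.o sec xdp" (the device name is an example). A minimal program that exercises the XDP_SETUP_PROG path, assuming a clang/libbpf toolchain:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Pass every frame to the regular stack; enough to drive ena_xdp_set()
 * without changing traffic.
 */
SEC("xdp")
int xdp_pass_prog(struct xdp_md *ctx)
{
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";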
+ */ +static int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + switch (bpf->command) { + case XDP_SETUP_PROG: + return ena_xdp_set(netdev, bpf); + case XDP_QUERY_PROG: + bpf->prog_id = adapter->xdp_bpf_prog ? + adapter->xdp_bpf_prog->aux->id : 0; + break; + default: + return -EINVAL; + } + return 0; +} +#endif /* ENA_XDP_SUPPORT */ + static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) { #ifdef CONFIG_RFS_ACCEL u32 i; int rc; - adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_queues); + adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues); if (!adapter->netdev->rx_cpu_rmap) return -ENOMEM; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { int irq_idx = ENA_IO_IRQ_IDX(i); rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, @@ -183,14 +672,14 @@ static void ena_init_io_rings_common(struct ena_adapter *adapter, ring->adapter = adapter; ring->ena_dev = adapter->ena_dev; ring->per_napi_packets = 0; - ring->per_napi_bytes = 0; ring->cpu = 0; ring->first_interrupt = false; ring->no_interrupt_event_cnt = 0; u64_stats_init(&ring->syncp); } -static void ena_init_io_rings(struct ena_adapter *adapter) +static void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count) { struct ena_com_dev *ena_dev; struct ena_ring *txr, *rxr; @@ -198,29 +687,35 @@ static void ena_init_io_rings(struct ena_adapter *adapter) ena_dev = adapter->ena_dev; - for (i = 0; i < adapter->num_queues; i++) { + for (i = first_index; i < first_index + count; i++) { txr = &adapter->tx_ring[i]; rxr = &adapter->rx_ring[i]; - /* TX/RX common ring state */ + /* TX common ring state */ ena_init_io_rings_common(adapter, txr, i); - ena_init_io_rings_common(adapter, rxr, i); /* TX specific ring state */ - txr->ring_size = adapter->tx_ring_size; + txr->ring_size = adapter->requested_tx_ring_size; txr->tx_max_header_size = ena_dev->tx_max_header_size; txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; txr->sgl_size = adapter->max_tx_sgl_size; txr->smoothed_interval = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); - /* RX specific ring state */ - rxr->ring_size = adapter->rx_ring_size; - rxr->rx_copybreak = adapter->rx_copybreak; - rxr->sgl_size = adapter->max_rx_sgl_size; - rxr->smoothed_interval = - ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); - rxr->empty_rx_queue = 0; + /* Don't init RX queues for xdp queues */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* RX common ring state */ + ena_init_io_rings_common(adapter, rxr, i); + + /* RX specific ring state */ + rxr->ring_size = adapter->requested_rx_ring_size; + rxr->rx_copybreak = adapter->rx_copybreak; + rxr->sgl_size = adapter->max_rx_sgl_size; + rxr->smoothed_interval = + ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + rxr->empty_rx_queue = 0; + adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; + } } } @@ -249,33 +744,28 @@ static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) if (!tx_ring->tx_buffer_info) { tx_ring->tx_buffer_info = vzalloc(size); if (!tx_ring->tx_buffer_info) - return -ENOMEM; + goto err_tx_buffer_info; } size = sizeof(u16) * tx_ring->ring_size; - tx_ring->free_tx_ids = vzalloc_node(size, node); - if (!tx_ring->free_tx_ids) { - tx_ring->free_tx_ids = vzalloc(size); - if (!tx_ring->free_tx_ids) { - vfree(tx_ring->tx_buffer_info); - return -ENOMEM; - } + tx_ring->free_ids = vzalloc_node(size, node); + if (!tx_ring->free_ids) { 
+ tx_ring->free_ids = vzalloc(size); + if (!tx_ring->free_ids) + goto err_tx_free_ids; } size = tx_ring->tx_max_header_size; tx_ring->push_buf_intermediate_buf = vzalloc_node(size, node); if (!tx_ring->push_buf_intermediate_buf) { tx_ring->push_buf_intermediate_buf = vzalloc(size); - if (!tx_ring->push_buf_intermediate_buf) { - vfree(tx_ring->tx_buffer_info); - vfree(tx_ring->free_tx_ids); - return -ENOMEM; - } + if (!tx_ring->push_buf_intermediate_buf) + goto err_push_buf_intermediate_buf; } /* Req id ring for TX out of order completions */ for (i = 0; i < tx_ring->ring_size; i++) - tx_ring->free_tx_ids[i] = i; + tx_ring->free_ids[i] = i; /* Reset tx statistics */ memset(&tx_ring->tx_stats, 0x0, sizeof(tx_ring->tx_stats)); @@ -284,6 +774,15 @@ static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) tx_ring->next_to_clean = 0; tx_ring->cpu = ena_irq->cpu; return 0; + +err_push_buf_intermediate_buf: + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; +err_tx_free_ids: + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; +err_tx_buffer_info: + return -ENOMEM; } /* ena_free_tx_resources - Free I/O Tx Resources per Queue @@ -299,23 +798,20 @@ static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) vfree(tx_ring->tx_buffer_info); tx_ring->tx_buffer_info = NULL; - vfree(tx_ring->free_tx_ids); - tx_ring->free_tx_ids = NULL; + vfree(tx_ring->free_ids); + tx_ring->free_ids = NULL; vfree(tx_ring->push_buf_intermediate_buf); tx_ring->push_buf_intermediate_buf = NULL; } -/* ena_setup_all_tx_resources - allocate I/O Tx queues resources for All queues - * @adapter: private structure - * - * Return 0 on success, negative on failure - */ -static int ena_setup_all_tx_resources(struct ena_adapter *adapter) +static int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, + int count) { int i, rc = 0; - for (i = 0; i < adapter->num_queues; i++) { + for (i = first_index; i < first_index + count; i++) { rc = ena_setup_tx_resources(adapter, i); if (rc) goto err_setup_tx; @@ -329,11 +825,20 @@ static int ena_setup_all_tx_resources(struct ena_adapter *adapter) "Tx queue %d: allocation failed\n", i); /* rewind the index freeing the rings as we go */ - while (i--) + while (first_index < i--) ena_free_tx_resources(adapter, i); return rc; } +static void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) +{ + int i; + + for (i = first_index; i < first_index + count; i++) + ena_free_tx_resources(adapter, i); +} + /* ena_free_all_io_tx_resources - Free I/O Tx Resources for All Queues * @adapter: board private structure * @@ -341,13 +846,13 @@ static int ena_setup_all_tx_resources(struct ena_adapter *adapter) */ static void ena_free_all_io_tx_resources(struct ena_adapter *adapter) { - int i; - - for (i = 0; i < adapter->num_queues; i++) - ena_free_tx_resources(adapter, i); + ena_free_all_io_tx_resources_in_range(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); } -static inline int validate_rx_req_id(struct ena_ring *rx_ring, u16 req_id) +static int validate_rx_req_id(struct ena_ring *rx_ring, u16 req_id) { if (likely(req_id < rx_ring->ring_size)) return 0; @@ -398,18 +903,19 @@ static int ena_setup_rx_resources(struct ena_adapter *adapter, } size = sizeof(u16) * rx_ring->ring_size; - rx_ring->free_rx_ids = vzalloc_node(size, node); - if (!rx_ring->free_rx_ids) { - rx_ring->free_rx_ids = vzalloc(size); - if (!rx_ring->free_rx_ids) { + rx_ring->free_ids = vzalloc_node(size, node); + 
if (!rx_ring->free_ids) { + rx_ring->free_ids = vzalloc(size); + if (!rx_ring->free_ids) { vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; return -ENOMEM; } } /* Req id ring for receiving RX pkts out of order */ for (i = 0; i < rx_ring->ring_size; i++) - rx_ring->free_rx_ids[i] = i; + rx_ring->free_ids[i] = i; /* Reset rx statistics */ memset(&rx_ring->rx_stats, 0x0, sizeof(rx_ring->rx_stats)); @@ -438,8 +944,8 @@ static void ena_free_rx_resources(struct ena_adapter *adapter, vfree(rx_ring->rx_buffer_info); rx_ring->rx_buffer_info = NULL; - vfree(rx_ring->free_rx_ids); - rx_ring->free_rx_ids = NULL; + vfree(rx_ring->free_ids); + rx_ring->free_ids = NULL; } /* ena_setup_all_rx_resources - allocate I/O Rx queues resources for all queues @@ -451,7 +957,7 @@ static int ena_setup_all_rx_resources(struct ena_adapter *adapter) { int i, rc = 0; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { rc = ena_setup_rx_resources(adapter, i); if (rc) goto err_setup_rx; @@ -479,11 +985,11 @@ static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) { int i; - for (i = 0; i < adapter->num_queues; i++) + for (i = 0; i < adapter->num_io_queues; i++) ena_free_rx_resources(adapter, i); } -static inline int ena_alloc_rx_page(struct ena_ring *rx_ring, +static int ena_alloc_rx_page(struct ena_ring *rx_ring, struct ena_rx_buffer *rx_info, gfp_t gfp) { struct ena_com_buf *ena_buf; @@ -503,7 +1009,7 @@ static inline int ena_alloc_rx_page(struct ena_ring *rx_ring, } dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, - DMA_FROM_DEVICE); + DMA_BIDIRECTIONAL); if (unlikely(dma_mapping_error(rx_ring->dev, dma))) { u64_stats_update_begin(&rx_ring->syncp); rx_ring->rx_stats.dma_mapping_err++; @@ -518,8 +1024,8 @@ static inline int ena_alloc_rx_page(struct ena_ring *rx_ring, rx_info->page = page; rx_info->page_offset = 0; ena_buf = &rx_info->ena_buf; - ena_buf->paddr = dma; - ena_buf->len = ENA_PAGE_SIZE; + ena_buf->paddr = dma + rx_ring->rx_headroom; + ena_buf->len = ENA_PAGE_SIZE - rx_ring->rx_headroom; return 0; } @@ -536,7 +1042,8 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, return; } - dma_unmap_page(rx_ring->dev, ena_buf->paddr, ENA_PAGE_SIZE, + dma_unmap_page(rx_ring->dev, ena_buf->paddr - rx_ring->rx_headroom, + ENA_PAGE_SIZE, DMA_FROM_DEVICE); __free_page(page); @@ -554,14 +1061,10 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) for (i = 0; i < num; i++) { struct ena_rx_buffer *rx_info; - req_id = rx_ring->free_rx_ids[next_to_use]; - rc = validate_rx_req_id(rx_ring, req_id); - if (unlikely(rc < 0)) - break; + req_id = rx_ring->free_ids[next_to_use]; rx_info = &rx_ring->rx_buffer_info[req_id]; - rc = ena_alloc_rx_page(rx_ring, rx_info, #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) GFP_ATOMIC | __GFP_COMP); @@ -621,14 +1124,13 @@ static void ena_free_rx_bufs(struct ena_adapter *adapter, /* ena_refill_all_rx_bufs - allocate all queues Rx buffers * @adapter: board private structure - * */ static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) { struct ena_ring *rx_ring; int i, rc, bufs_num; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { rx_ring = &adapter->rx_ring[i]; bufs_num = rx_ring->ring_size - 1; rc = ena_refill_rx_bufs(rx_ring, bufs_num); @@ -644,12 +1146,12 @@ static void ena_free_all_rx_bufs(struct ena_adapter *adapter) { int i; - for (i = 0; i < adapter->num_queues; i++) + for (i = 0; i < adapter->num_io_queues; i++) 
ena_free_rx_bufs(adapter, i); } -static inline void ena_unmap_tx_skb(struct ena_ring *tx_ring, - struct ena_tx_buffer *tx_info) +static void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info) { struct ena_com_buf *ena_buf; u32 cnt; @@ -703,7 +1205,7 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) tx_ring->qid, i); } - ena_unmap_tx_skb(tx_ring, tx_info); + ena_unmap_tx_buff(tx_ring, tx_info); dev_kfree_skb_any(tx_info->skb); } @@ -716,7 +1218,7 @@ static void ena_free_all_tx_bufs(struct ena_adapter *adapter) struct ena_ring *tx_ring; int i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { tx_ring = &adapter->tx_ring[i]; ena_free_tx_bufs(tx_ring); } @@ -727,7 +1229,7 @@ static void ena_destroy_all_tx_queues(struct ena_adapter *adapter) u16 ena_qid; int i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { ena_qid = ENA_IO_TXQ_IDX(i); ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); } @@ -738,8 +1240,9 @@ static void ena_destroy_all_rx_queues(struct ena_adapter *adapter) u16 ena_qid; int i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { ena_qid = ENA_IO_RXQ_IDX(i); + cancel_work_sync(&adapter->ena_napi[i].dim.work); ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); } } @@ -750,6 +1253,32 @@ static void ena_destroy_all_io_queues(struct ena_adapter *adapter) ena_destroy_all_rx_queues(adapter); } +static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp) +{ + if (tx_info) + netif_err(ring->adapter, + tx_done, + ring->netdev, + "tx_info doesn't have valid %s", + is_xdp ? "xdp frame" : "skb"); + else + netif_err(ring->adapter, + tx_done, + ring->netdev, + "Invalid req_id: %hu\n", + req_id); + + u64_stats_update_begin(&ring->syncp); + ring->tx_stats.bad_req_id++; + u64_stats_update_end(&ring->syncp); + + /* Trigger device reset */ + ring->adapter->reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID; + set_bit(ENA_FLAG_TRIGGER_RESET, &ring->adapter->flags); + return -EFAULT; +} + static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) { struct ena_tx_buffer *tx_info = NULL; @@ -760,22 +1289,23 @@ static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) return 0; } - if (tx_info) - netif_err(tx_ring->adapter, tx_done, tx_ring->netdev, - "tx_info doesn't have valid skb\n"); - else - netif_err(tx_ring->adapter, tx_done, tx_ring->netdev, - "Invalid req_id: %hu\n", req_id); + return handle_invalid_req_id(tx_ring, req_id, tx_info, false); +} - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.bad_req_id++; - u64_stats_update_end(&tx_ring->syncp); +#ifdef ENA_XDP_SUPPORT +static int validate_xdp_req_id(struct ena_ring *xdp_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info = NULL; - /* Trigger device reset */ - tx_ring->adapter->reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID; - set_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags); - return -EFAULT; + if (likely(req_id < xdp_ring->ring_size)) { + tx_info = &xdp_ring->tx_buffer_info[req_id]; + if (likely(tx_info->xdpf)) + return 0; + } + + return handle_invalid_req_id(xdp_ring, req_id, tx_info, true); } +#endif /* ENA_XDP_SUPPORT */ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) { @@ -813,7 +1343,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) tx_info->skb = NULL; tx_info->last_jiffies = 0; - 
ena_unmap_tx_skb(tx_ring, tx_info); + ena_unmap_tx_buff(tx_ring, tx_info); netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, "tx_poll: q %d skb %p completed\n", tx_ring->qid, @@ -824,7 +1354,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) tx_pkts++; total_done += tx_info->tx_descs; - tx_ring->free_tx_ids[next_to_clean] = req_id; + tx_ring->free_ids[next_to_clean] = req_id; next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, tx_ring->ring_size); } @@ -849,7 +1379,8 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) above_thresh = ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, ENA_TX_WAKEUP_THRESH); - if (netif_tx_queue_stopped(txq) && above_thresh) { + if (netif_tx_queue_stopped(txq) && above_thresh && + test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) { netif_tx_wake_queue(txq); u64_stats_update_begin(&tx_ring->syncp); tx_ring->tx_stats.queue_wakeup++; @@ -858,9 +1389,6 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) __netif_tx_unlock(txq); } - tx_ring->per_napi_bytes += tx_bytes; - tx_ring->per_napi_packets += tx_pkts; - return tx_pkts; } @@ -889,7 +1417,8 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, bool frags) static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, struct ena_com_rx_buf_info *ena_bufs, u32 descs, - u16 *next_to_clean) + u16 *next_to_clean, + u8 offset) { struct sk_buff *skb; struct ena_rx_buffer *rx_info; @@ -898,10 +1427,17 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, bool polling; #endif void *va; + int rc; len = ena_bufs[buf].len; req_id = ena_bufs[buf].req_id; + + rc = validate_rx_req_id(rx_ring, req_id); + if (unlikely(rc < 0)) + return NULL; + rx_info = &rx_ring->rx_buffer_info[req_id]; + rx_info->page_offset = offset; if (unlikely(!rx_info->page)) { netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, @@ -942,7 +1478,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, skb_mark_napi_id(skb, rx_ring->napi); #endif skb->protocol = eth_type_trans(skb, rx_ring->netdev); - rx_ring->free_rx_ids[*next_to_clean] = req_id; + rx_ring->free_ids[*next_to_clean] = req_id; *next_to_clean = ENA_RX_RING_IDX_ADD(*next_to_clean, descs, rx_ring->ring_size); return skb; @@ -965,6 +1501,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, rx_info->page_offset, len, ENA_PAGE_SIZE); + /* The offset is non zero only for the first buffer */ + rx_info->page_offset = 0; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx skb updated. len %d. 
data_len %d\n", @@ -972,7 +1510,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, rx_info->page = NULL; - rx_ring->free_rx_ids[*next_to_clean] = req_id; + rx_ring->free_ids[*next_to_clean] = req_id; *next_to_clean = ENA_RX_RING_IDX_NEXT(*next_to_clean, rx_ring->ring_size); @@ -982,6 +1520,12 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, buf++; len = ena_bufs[buf].len; req_id = ena_bufs[buf].req_id; + + rc = validate_rx_req_id(rx_ring, req_id); + if (unlikely(rc < 0)) { + return NULL; + } + rx_info = &rx_ring->rx_buffer_info[req_id]; } while (1); @@ -1010,7 +1554,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, * @ena_rx_ctx: received packet context/metadata * @skb: skb currently being received and modified */ -static inline void ena_rx_checksum(struct ena_ring *rx_ring, +static void ena_rx_checksum(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx, struct sk_buff *skb) { @@ -1055,6 +1599,9 @@ static inline void ena_rx_checksum(struct ena_ring *rx_ring, if (likely(ena_rx_ctx->l4_csum_checked)) { skb->ip_summed = CHECKSUM_UNNECESSARY; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.csum_good++; + u64_stats_update_end(&rx_ring->syncp); } else { u64_stats_update_begin(&rx_ring->syncp); rx_ring->rx_stats.csum_unchecked++; @@ -1092,6 +1639,35 @@ static void ena_set_rx_hash(struct ena_ring *rx_ring, #endif /* NETIF_F_RXHASH */ } +#ifdef ENA_XDP_SUPPORT +int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + struct ena_rx_buffer *rx_info; + int ret; + + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + xdp->data = page_address(rx_info->page) + + rx_info->page_offset + rx_ring->rx_headroom; + xdp_set_data_meta_invalid(xdp); + xdp->data_hard_start = page_address(rx_info->page); + xdp->data_end = xdp->data + rx_ring->ena_bufs[0].len; + /* If for some reason we received a bigger packet than + * we expect, then we simply drop it + */ + if (unlikely(rx_ring->ena_bufs[0].len > ENA_XDP_MAX_MTU)) + return XDP_DROP; + + ret = ena_xdp_execute(rx_ring, xdp, rx_info); + + /* The xdp program might expand the headers */ + if (ret == XDP_PASS) { + rx_info->page_offset = xdp->data - xdp->data_hard_start; + rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data; + } + + return ret; +} +#endif /* ENA_XDP_SUPPORT */ /* ena_clean_rx_irq - Cleanup RX irq * @rx_ring: RX ring to clean * @napi: napi handler @@ -1103,26 +1679,39 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, u32 budget) { u16 next_to_clean = rx_ring->next_to_clean; - u32 res_budget, work_done; - struct ena_com_rx_ctx ena_rx_ctx; struct ena_adapter *adapter; + u32 res_budget, work_done; + int rx_copybreak_pkt = 0; + int refill_threshold; struct sk_buff *skb; int refill_required; - int refill_threshold; - int rc = 0; +#ifdef ENA_XDP_SUPPORT + struct xdp_buff xdp; +#endif /* ENA_XDP_SUPPORT */ int total_len = 0; - int rx_copybreak_pkt = 0; +#ifdef ENA_XDP_SUPPORT + int xdp_verdict; +#endif /* ENA_XDP_SUPPORT */ + int rc = 0; int i; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; +#ifdef ENA_XDP_SUPPORT + xdp.rxq = &rx_ring->xdp_rxq; +#endif /* ENA_XDP_SUPPORT */ do { +#ifdef ENA_XDP_SUPPORT + xdp_verdict = XDP_PASS; + skb = NULL; +#endif /* ENA_XDP_SUPPORT */ ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; ena_rx_ctx.max_bufs = rx_ring->sgl_size; ena_rx_ctx.descs = 0; + ena_rx_ctx.pkt_offset = 0; rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, 
rx_ring->ena_com_io_sq, &ena_rx_ctx); @@ -1137,19 +1726,40 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, ena_rx_ctx.l4_proto, ena_rx_ctx.hash); +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present_ring(rx_ring)) + xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp); + /* allocate skb and fill it */ + if (xdp_verdict == XDP_PASS) + skb = ena_rx_skb(rx_ring, + rx_ring->ena_bufs, + ena_rx_ctx.descs, + &next_to_clean, ena_rx_ctx.pkt_offset); +#else skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, - &next_to_clean); + &next_to_clean, ena_rx_ctx.pkt_offset); +#endif /* ENA_XDP_SUPPORT */ - /* exit if we failed to retrieve a buffer */ if (unlikely(!skb)) { +#ifdef ENA_XDP_SUPPORT + if (xdp_verdict == XDP_TX) { + ena_free_rx_page(rx_ring, + &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]); + res_budget--; + } +#endif /* ENA_XDP_SUPPORT */ for (i = 0; i < ena_rx_ctx.descs; i++) { - rx_ring->free_tx_ids[next_to_clean] = + rx_ring->free_ids[next_to_clean] = rx_ring->ena_bufs[i].req_id; next_to_clean = ENA_RX_RING_IDX_NEXT(next_to_clean, rx_ring->ring_size); } +#ifdef ENA_XDP_SUPPORT + if (xdp_verdict == XDP_TX || xdp_verdict == XDP_DROP) + continue; +#endif /* ENA_XDP_SUPPORT */ break; } @@ -1186,7 +1796,6 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, } while (likely(res_budget)); work_done = budget - res_budget; - rx_ring->per_napi_bytes += total_len; rx_ring->per_napi_packets += work_done; u64_stats_update_begin(&rx_ring->syncp); rx_ring->rx_stats.bytes += total_len; @@ -1196,7 +1805,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, rx_ring->next_to_clean = next_to_clean; - refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq); + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); refill_threshold = min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, ENA_RX_REFILL_THRESH_PACKET); @@ -1223,46 +1832,67 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, return 0; } -inline void ena_adjust_intr_moderation(struct ena_ring *rx_ring, - struct ena_ring *tx_ring) +static void ena_dim_work(struct work_struct *w) { - /* We apply adaptive moderation on Rx path only. - * Tx uses static interrupt moderation. 
- */ - ena_com_calculate_interrupt_delay(rx_ring->ena_dev, - rx_ring->per_napi_packets, - rx_ring->per_napi_bytes, - &rx_ring->smoothed_interval, - &rx_ring->moder_tbl_idx); - - /* Reset per napi packets/bytes */ - tx_ring->per_napi_packets = 0; - tx_ring->per_napi_bytes = 0; + struct dim *dim = container_of(w, struct dim, work); + struct dim_cq_moder cur_moder = + net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + struct ena_napi *ena_napi = container_of(dim, struct ena_napi, dim); + + ena_napi->rx_ring->smoothed_interval = cur_moder.usec; + dim->state = DIM_START_MEASURE; +} + +static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) +{ + struct dim_sample dim_sample; + struct ena_ring *rx_ring = ena_napi->rx_ring; + + if (!rx_ring->per_napi_packets) + return; + + rx_ring->non_empty_napi_events++; + + dim_update_sample(rx_ring->non_empty_napi_events, + rx_ring->rx_stats.cnt, + rx_ring->rx_stats.bytes, + &dim_sample); + + net_dim(&ena_napi->dim, dim_sample); + rx_ring->per_napi_packets = 0; - rx_ring->per_napi_bytes = 0; } -static inline void ena_unmask_interrupt(struct ena_ring *tx_ring, +static void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring) { struct ena_eth_io_intr_reg intr_reg; + u32 rx_interval = 0; + /* Rx ring can be NULL when for XDP tx queues which don't have an + * accompanying rx_ring pair. + */ + if (rx_ring) + rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ? + rx_ring->smoothed_interval : + ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev); /* Update intr register: rx intr delay, * tx intr delay and interrupt unmask */ ena_com_update_intr_reg(&intr_reg, - rx_ring->smoothed_interval, + rx_interval, tx_ring->smoothed_interval, true); /* It is a shared MSI-X. * Tx and Rx CQ have pointer to it. 
* So we use one of them to reach the intr reg + * The Tx ring is used because the rx_ring is NULL for XDP queues */ - ena_com_unmask_intr(rx_ring->ena_com_io_cq, &intr_reg); + ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg); } -static inline void ena_update_ring_numa_node(struct ena_ring *tx_ring, +static void ena_update_ring_numa_node(struct ena_ring *tx_ring, struct ena_ring *rx_ring) { int cpu = get_cpu(); @@ -1277,24 +1907,86 @@ static inline void ena_update_ring_numa_node(struct ena_ring *tx_ring, if (numa_node != NUMA_NO_NODE) { ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); - ena_com_update_numa_node(rx_ring->ena_com_io_cq, numa_node); + if (rx_ring) + ena_com_update_numa_node(rx_ring->ena_com_io_cq, + numa_node); } tx_ring->cpu = cpu; - rx_ring->cpu = cpu; + if (rx_ring) + rx_ring->cpu = cpu; return; out: put_cpu(); } +#ifdef ENA_XDP_SUPPORT +static int ena_clean_xdp_irq(struct ena_ring *xdp_ring, u32 budget) +{ + u32 total_done = 0; + u16 next_to_clean; + u32 tx_bytes = 0; + int tx_pkts = 0; + u16 req_id; + int rc; + + if (unlikely(!xdp_ring)) + return 0; + next_to_clean = xdp_ring->next_to_clean; + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct xdp_frame *xdpf; + + rc = ena_com_tx_comp_req_id_get(xdp_ring->ena_com_io_cq, + &req_id); + if (rc) + break; + + rc = validate_xdp_req_id(xdp_ring, req_id); + if (rc) + break; + + tx_info = &xdp_ring->tx_buffer_info[req_id]; + xdpf = tx_info->xdpf; + + tx_info->xdpf = NULL; + tx_info->last_jiffies = 0; + ena_unmap_tx_buff(xdp_ring, tx_info); + + netif_dbg(xdp_ring->adapter, tx_done, xdp_ring->netdev, + "tx_poll: q %d skb %p completed\n", xdp_ring->qid, + xdpf); + + tx_bytes += xdpf->len; + tx_pkts++; + total_done += tx_info->tx_descs; + + __free_page(tx_info->xdp_rx_page); + xdp_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + xdp_ring->ring_size); + } + + xdp_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(xdp_ring->ena_com_io_sq, total_done); + ena_com_update_dev_comp_head(xdp_ring->ena_com_io_cq); + + netif_dbg(xdp_ring->adapter, tx_done, xdp_ring->netdev, + "tx_poll: q %d done. total pkts: %d\n", + xdp_ring->qid, tx_pkts); + + return tx_pkts; +} +#endif /* ENA_XDP_SUPPORT */ + static int ena_io_poll(struct napi_struct *napi, int budget) { struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); struct ena_ring *tx_ring, *rx_ring; - - u32 tx_work_done; - u32 rx_work_done; + int tx_work_done; + int rx_work_done = 0; int tx_budget; int napi_comp_call = 0; int ret; @@ -1302,6 +1994,9 @@ static int ena_io_poll(struct napi_struct *napi, int budget) tx_ring = ena_napi->tx_ring; rx_ring = ena_napi->rx_ring; + tx_ring->first_interrupt = ena_napi->first_interrupt; + rx_ring->first_interrupt = ena_napi->first_interrupt; + tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER; if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || @@ -1315,7 +2010,11 @@ static int ena_io_poll(struct napi_struct *napi, int budget) #endif tx_work_done = ena_clean_tx_irq(tx_ring, tx_budget); - rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget); + /* On netpoll the budget is zero and the handler should only clean the + * tx completions. 
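(Illustrative, not driver code.) The zero-budget note above relies on the general NAPI contract: a poll routine may process at most budget Rx packets, must not return more than budget, and with budget == 0 (netpoll) should only reap Tx completions. A skeletal poll routine under those assumptions, with clean_tx()/clean_rx()/unmask_irq() as hypothetical stand-ins:

#include <linux/netdevice.h>

static void clean_tx(struct napi_struct *napi);
static int clean_rx(struct napi_struct *napi, int budget);
static void unmask_irq(struct napi_struct *napi);

static int example_io_poll(struct napi_struct *napi, int budget)
{
        int rx_done = 0;

        clean_tx(napi);                 /* Tx completions are always reaped */
        if (likely(budget))             /* netpoll passes budget == 0 */
                rx_done = clean_rx(napi, budget);

        if (rx_done < budget && napi_complete_done(napi, rx_done))
                unmask_irq(napi);       /* re-arm the device interrupt */

        return rx_done;
}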
+ */ + if (likely(budget)) + rx_work_done = ena_clean_rx_irq(rx_ring, napi, budget); /* If the device is about to reset or down, avoid unmask * the interrupt and return 0 so NAPI won't reschedule @@ -1332,15 +2031,21 @@ static int ena_io_poll(struct napi_struct *napi, int budget) * from the interrupt context (vs from sk_busy_loop) */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) - if (napi_complete_done(napi, rx_work_done)) { + if (napi_complete_done(napi, rx_work_done) && + atomic_cmpxchg(&ena_napi->unmask_interrupt, 1, 0)) { #else napi_complete_done(napi, rx_work_done); if (atomic_cmpxchg(&ena_napi->unmask_interrupt, 1, 0)) { #endif - /* Tx and Rx share the same interrupt vector */ + /* We apply adaptive moderation on Rx path only. + * Tx uses static interrupt moderation. + */ if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) - ena_adjust_intr_moderation(rx_ring, tx_ring); + ena_adjust_adaptive_rx_intr_moderation(ena_napi); + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.unmask_interrupt++; + u64_stats_update_end(&tx_ring->syncp); ena_unmask_interrupt(tx_ring, rx_ring); } @@ -1383,16 +2088,11 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data) { struct ena_napi *ena_napi = data; - ena_napi->tx_ring->first_interrupt = true; - ena_napi->rx_ring->first_interrupt = true; + ena_napi->first_interrupt = true; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) - napi_schedule_irqoff(&ena_napi->napi); -#else smp_mb__before_atomic(); atomic_set(&ena_napi->unmask_interrupt, 1); napi_schedule_irqoff(&ena_napi->napi); -#endif return IRQ_HANDLED; } @@ -1402,7 +2102,7 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data) * the number of potential io queues is the minimum of what the device * supports and the number of vCPUs. 
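(Worked example of the comment above; treat it as an assumption, since the exact vector accounting lives in the ENA headers.) One MSI-X vector is reserved for admin/management (ENA_ADMIN_MSIX_VEC is subtracted from irq_cnt in this hunk), and the I/O queue count is bounded by both the device capability and the number of online CPUs:

#include <stdint.h>

#define ADMIN_MSIX_VEC 1        /* assumed value, cf. ENA_ADMIN_MSIX_VEC */

/* Hypothetical helper mirroring the comment: reserve vectors for the
 * smaller of the device limit and the vCPU count, plus the admin vector.
 */
static inline uint32_t example_msix_vecs(uint32_t dev_max_io_queues,
                                         uint32_t online_cpus)
{
        uint32_t io_queues = dev_max_io_queues < online_cpus ?
                             dev_max_io_queues : online_cpus;

        return io_queues + ADMIN_MSIX_VEC;
}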
*/ -static int ena_enable_msix(struct ena_adapter *adapter, int num_queues) +static int ena_enable_msix(struct ena_adapter *adapter) { int msix_vecs, irq_cnt; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) @@ -1416,7 +2116,7 @@ static int ena_enable_msix(struct ena_adapter *adapter, int num_queues) } /* Reserved the max msix vectors we might need */ - msix_vecs = ENA_MAX_MSIX_VEC(num_queues); + msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues); netif_dbg(adapter, probe, adapter->netdev, "trying to enable MSI-X, vectors %d\n", msix_vecs); @@ -1449,7 +2149,7 @@ static int ena_enable_msix(struct ena_adapter *adapter, int num_queues) netif_notice(adapter, probe, adapter->netdev, "enable only %d MSI-X (out of %d), reduce the number of queues\n", irq_cnt, msix_vecs); - adapter->num_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; + adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; } if (ena_init_rx_cpu_rmap(adapter)) @@ -1488,10 +2188,12 @@ static void ena_setup_io_intr(struct ena_adapter *adapter) { struct net_device *netdev; int irq_idx, i, cpu; + int io_queue_count; netdev = adapter->netdev; + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < io_queue_count; i++) { irq_idx = ENA_IO_IRQ_IDX(i); cpu = i % num_online_cpus(); @@ -1536,6 +2238,7 @@ static int ena_request_mgmnt_irq(struct ena_adapter *adapter) static int ena_request_io_irq(struct ena_adapter *adapter) { + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; unsigned long flags = 0; struct ena_irq *irq; int rc = 0, i, k; @@ -1546,7 +2249,7 @@ static int ena_request_io_irq(struct ena_adapter *adapter) return -EINVAL; } - for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) { + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { irq = &adapter->irq_tbl[i]; rc = request_irq(irq->vector, irq->handler, flags, irq->name, irq->data); @@ -1585,6 +2288,7 @@ static void ena_free_mgmnt_irq(struct ena_adapter *adapter) static void ena_free_io_irq(struct ena_adapter *adapter) { + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; struct ena_irq *irq; int i; @@ -1595,7 +2299,7 @@ static void ena_free_io_irq(struct ena_adapter *adapter) } #endif /* CONFIG_RFS_ACCEL */ - for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) { + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { irq = &adapter->irq_tbl[i]; irq_set_affinity_hint(irq->vector, NULL); free_irq(irq->vector, irq->data); @@ -1619,55 +2323,86 @@ static void ena_disable_msix(struct ena_adapter *adapter) static void ena_disable_io_intr_sync(struct ena_adapter *adapter) { + u32 io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; int i; if (!netif_running(adapter->netdev)) return; - for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) + for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) synchronize_irq(adapter->irq_tbl[i].vector); } -static void ena_del_napi(struct ena_adapter *adapter) +static void ena_del_napi_in_range(struct ena_adapter *adapter, + int first_index, + int count) { int i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = first_index; i < first_index + count; i++) { +#ifdef ENA_XDP_SUPPORT + /* Check if napi was initialized before */ + if (!ENA_IS_XDP_INDEX(adapter, i) || + adapter->ena_napi[i].xdp_ring) { + napi_hash_del(&adapter->ena_napi[i].napi); + netif_napi_del(&adapter->ena_napi[i].napi); + } +#else 
napi_hash_del(&adapter->ena_napi[i].napi); netif_napi_del(&adapter->ena_napi[i].napi); +#endif /* ENA_XDP_SUPPORT */ +#ifdef ENA_XDP_SUPPORT + else + WARN_ON(ENA_IS_XDP_INDEX(adapter, i) && + adapter->ena_napi[i].xdp_ring); +#endif /* ENA_XDP_SUPPORT */ } } -static void ena_init_napi(struct ena_adapter *adapter) +static void ena_init_napi_in_range(struct ena_adapter *adapter, + int first_index, int count) { - struct ena_napi *napi; + struct ena_napi *napi = {0}; int i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = first_index; i < first_index + count; i++) { napi = &adapter->ena_napi[i]; netif_napi_add(adapter->netdev, &adapter->ena_napi[i].napi, +#ifdef ENA_XDP_SUPPORT + ENA_IS_XDP_INDEX(adapter, i) ? ena_xdp_io_poll : ena_io_poll, +#else ena_io_poll, +#endif /* ENA_XDP_SUPPORT */ ENA_NAPI_BUDGET); -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) - napi_hash_add(&adapter->ena_napi[i].napi); -#endif - napi->rx_ring = &adapter->rx_ring[i]; - napi->tx_ring = &adapter->tx_ring[i]; + + if (!ENA_IS_XDP_INDEX(adapter, i)) { + napi->rx_ring = &adapter->rx_ring[i]; + napi->tx_ring = &adapter->tx_ring[i]; + } else { +#ifdef ENA_XDP_SUPPORT + napi->xdp_ring = &adapter->tx_ring[i]; +#endif /* ENA_XDP_SUPPORT */ + } napi->qid = i; } } #if ENA_BUSY_POLL_SUPPORT -static void ena_napi_disable_all(struct ena_adapter *adapter) +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) { struct ena_ring *rx_ring; int i, timeout; - for (i = 0; i < adapter->num_queues; i++) { + for (i = first_index; i < first_index + count; i++) { napi_disable(&adapter->ena_napi[i].napi); + /* XDP doesn't have rx_ring */ + if (ENA_IS_XDP_INDEX(adapter, i)) + continue; rx_ring = &adapter->rx_ring[i]; timeout = 100; while (!ena_bp_disable(rx_ring)) { @@ -1685,31 +2420,27 @@ static void ena_napi_disable_all(struct ena_adapter *adapter) } } #else -static void ena_napi_disable_all(struct ena_adapter *adapter) +static void ena_napi_disable_in_range(struct ena_adapter *adapter, + int first_index, + int count) { int i; - for (i = 0; i < adapter->num_queues; i++) + for (i = first_index; i < first_index + count; i++) napi_disable(&adapter->ena_napi[i].napi); } #endif -static void ena_napi_enable_all(struct ena_adapter *adapter) +static void ena_napi_enable_in_range(struct ena_adapter *adapter, + int first_index, + int count) { int i; - for (i = 0; i < adapter->num_queues; i++) + for (i = first_index; i < first_index + count; i++) napi_enable(&adapter->ena_napi[i].napi); } -static void ena_restore_ethtool_params(struct ena_adapter *adapter) -{ - adapter->tx_usecs = 0; - adapter->rx_usecs = 0; - adapter->tx_frames = 1; - adapter->rx_frames = 1; -} - /* Configure the Rx forwarding */ static int ena_rss_configure(struct ena_adapter *adapter) { @@ -1759,9 +2490,9 @@ static int ena_up_complete(struct ena_adapter *adapter) /* enable transmits */ netif_tx_start_all_queues(adapter->netdev); - ena_restore_ethtool_params(adapter); - - ena_napi_enable_all(adapter); + ena_napi_enable_in_range(adapter, + 0, + adapter->xdp_num_queues + adapter->num_io_queues); return 0; } @@ -1787,7 +2518,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) ctx.qid = ena_qid; ctx.mem_queue_type = ena_dev->tx_mem_queue_type; ctx.msix_vector = msix_vector; - ctx.queue_size = adapter->tx_ring_size; + ctx.queue_size = tx_ring->ring_size; ctx.numa_node = cpu_to_node(tx_ring->cpu); rc = ena_com_create_io_queue(ena_dev, &ctx); @@ -1813,12 +2544,13 @@ static int ena_create_io_tx_queue(struct 
ena_adapter *adapter, int qid) return rc; } -static int ena_create_all_io_tx_queues(struct ena_adapter *adapter) +static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count) { struct ena_com_dev *ena_dev = adapter->ena_dev; int rc, i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = first_index; i < first_index + count; i++) { rc = ena_create_io_tx_queue(adapter, i); if (rc) goto create_err; @@ -1827,7 +2559,7 @@ static int ena_create_all_io_tx_queues(struct ena_adapter *adapter) return 0; create_err: - while (i--) + while (i-- > first_index) ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i)); return rc; @@ -1854,7 +2586,7 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX; ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; ctx.msix_vector = msix_vector; - ctx.queue_size = adapter->rx_ring_size; + ctx.queue_size = rx_ring->ring_size; ctx.numa_node = cpu_to_node(rx_ring->cpu); rc = ena_com_create_io_queue(ena_dev, &ctx); @@ -1872,12 +2604,14 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) netif_err(adapter, ifup, adapter->netdev, "Failed to get RX queue handlers. RX queue num %d rc: %d\n", qid, rc); - ena_com_destroy_io_queue(ena_dev, ena_qid); - return rc; + goto err; } ena_com_update_numa_node(rx_ring->ena_com_io_cq, ctx.numa_node); + return rc; +err: + ena_com_destroy_io_queue(ena_dev, ena_qid); return rc; } @@ -1886,8 +2620,9 @@ static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) struct ena_com_dev *ena_dev = adapter->ena_dev; int rc, i; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { rc = ena_create_io_rx_queue(adapter, i); + INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); if (rc) goto create_err; } @@ -1895,50 +2630,156 @@ static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) return 0; create_err: - while (i--) + while (i--) { + cancel_work_sync(&adapter->ena_napi[i].dim.work); ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i)); + } return rc; } +static void set_io_rings_size(struct ena_adapter *adapter, + int new_tx_size, int new_rx_size) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].ring_size = new_tx_size; + adapter->rx_ring[i].ring_size = new_rx_size; + } +} + +/* This function allows queue allocation to backoff when the system is + * low on memory. If there is not enough memory to allocate io queues + * the driver will try to allocate smaller queues. + * + * The heuristic is as follows: + * + * 1. Try to allocate TX and RX and if successful return success. + * 2. If TX and RX are both smaller or equal to 256 return failure. + * + * 3. If TX and RX sizes differ: + * 3.1. Divide by 2 the size of the larger one and go back to 1. + * + * 4. Else (TX and RX sizes are the same) + * 4.1 Divide both RX and TX sizes by 2 + * and go back to 1 + */ +static int create_queues_with_size_backoff(struct ena_adapter *adapter) +{ + int rc, cur_rx_ring_size, cur_tx_ring_size; + int new_rx_ring_size, new_tx_ring_size; + + /* current queue sizes might be set to smaller than the requested + * ones due to past queue allocation failures. 
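(Condensed, userspace-style sketch of the size-backoff heuristic spelled out in the comment above; try_alloc() and the function name are hypothetical, and 256 is the minimum the comment itself cites.)

#include <stdbool.h>

#define MIN_RING_SIZE 256

/* Keep retrying the allocation, halving the larger of the two sizes
 * (or both when equal) after each failure, per steps 1-4 above.
 */
static bool alloc_rings_with_backoff(bool (*try_alloc)(int tx, int rx),
                                     int tx, int rx)
{
        while (!try_alloc(tx, rx)) {
                if (tx <= MIN_RING_SIZE && rx <= MIN_RING_SIZE)
                        return false;   /* step 2: nothing smaller to try */
                if (tx > rx)
                        tx /= 2;        /* step 3.1: halve the larger side */
                else if (rx > tx)
                        rx /= 2;
                else {
                        tx /= 2;        /* step 4.1: halve both */
                        rx /= 2;
                }
        }
        return true;                    /* step 1: allocation succeeded */
}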
+ */ + set_io_rings_size(adapter, adapter->requested_tx_ring_size, + adapter->requested_rx_ring_size); + + while (1) { +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter)) { + rc = ena_setup_and_create_all_xdp_queues(adapter); + + if (rc) + goto err_setup_tx; + } +#endif /* ENA_XDP_SUPPORT */ + rc = ena_setup_tx_resources_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_setup_tx; + + rc = ena_create_io_tx_queues_in_range(adapter, + 0, + adapter->num_io_queues); + if (rc) + goto err_create_tx_queues; + + rc = ena_setup_all_rx_resources(adapter); + if (rc) + goto err_setup_rx; + + rc = ena_create_all_io_rx_queues(adapter); + if (rc) + goto err_create_rx_queues; + + return 0; + +err_create_rx_queues: + ena_free_all_io_rx_resources(adapter); +err_setup_rx: + ena_destroy_all_tx_queues(adapter); +err_create_tx_queues: + ena_free_all_io_tx_resources(adapter); +err_setup_tx: + if (rc != -ENOMEM) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with error code %d\n", + rc); + return rc; + } + + cur_tx_ring_size = adapter->tx_ring[0].ring_size; + cur_rx_ring_size = adapter->rx_ring[0].ring_size; + + netif_err(adapter, ifup, adapter->netdev, + "Not enough memory to create queues with sizes TX=%d, RX=%d\n", + cur_tx_ring_size, cur_rx_ring_size); + + new_tx_ring_size = cur_tx_ring_size; + new_rx_ring_size = cur_rx_ring_size; + + /* Decrease the size of the larger queue, or + * decrease both if they are the same size. + */ + if (cur_rx_ring_size <= cur_tx_ring_size) + new_tx_ring_size = cur_tx_ring_size / 2; + if (cur_rx_ring_size >= cur_tx_ring_size) + new_rx_ring_size = cur_rx_ring_size / 2; + + if (new_tx_ring_size < ENA_MIN_RING_SIZE || + new_rx_ring_size < ENA_MIN_RING_SIZE) { + netif_err(adapter, ifup, adapter->netdev, + "Queue creation failed with the smallest possible queue size of %d for both queues. 
Not retrying with smaller queues\n", + ENA_MIN_RING_SIZE); + return rc; + } + + netif_err(adapter, ifup, adapter->netdev, + "Retrying queue creation with sizes TX=%d, RX=%d\n", + new_tx_ring_size, + new_rx_ring_size); + + set_io_rings_size(adapter, new_tx_ring_size, + new_rx_ring_size); + } +} + static int ena_up(struct ena_adapter *adapter) { - int rc, i; + int io_queue_count, rc, i; netdev_dbg(adapter->netdev, "%s\n", __func__); + io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; ena_setup_io_intr(adapter); /* napi poll functions should be initialized before running * request_irq(), to handle a rare condition where there is a pending * interrupt, causing the ISR to fire immediately while the poll * function wasn't set yet, causing a null dereference - */ - ena_init_napi(adapter); - - rc = ena_request_io_irq(adapter); - if (rc) - goto err_req_irq; - - /* allocate transmit descriptors */ - rc = ena_setup_all_tx_resources(adapter); - if (rc) - goto err_setup_tx; - - /* allocate receive descriptors */ - rc = ena_setup_all_rx_resources(adapter); - if (rc) - goto err_setup_rx; + */ + ena_init_napi_in_range(adapter, 0, io_queue_count); - /* Create TX queues */ - rc = ena_create_all_io_tx_queues(adapter); + rc = ena_request_io_irq(adapter); if (rc) - goto err_create_tx_queues; + goto err_req_irq; - /* Create RX queues */ - rc = ena_create_all_io_rx_queues(adapter); + rc = create_queues_with_size_backoff(adapter); if (rc) - goto err_create_rx_queues; + goto err_create_queues_with_backoff; rc = ena_up_complete(adapter); if (rc) @@ -1954,35 +2795,35 @@ static int ena_up(struct ena_adapter *adapter) set_bit(ENA_FLAG_DEV_UP, &adapter->flags); /* Enable completion queues interrupt */ - for (i = 0; i < adapter->num_queues; i++) + for (i = 0; i < adapter->num_io_queues; i++) ena_unmask_interrupt(&adapter->tx_ring[i], &adapter->rx_ring[i]); /* schedule napi in case we had pending packets * from the last time we disable napi */ - for (i = 0; i < adapter->num_queues; i++) + for (i = 0; i < io_queue_count; i++) napi_schedule(&adapter->ena_napi[i].napi); return rc; err_up: - ena_destroy_all_rx_queues(adapter); -err_create_rx_queues: ena_destroy_all_tx_queues(adapter); -err_create_tx_queues: - ena_free_all_io_rx_resources(adapter); -err_setup_rx: ena_free_all_io_tx_resources(adapter); -err_setup_tx: + ena_destroy_all_rx_queues(adapter); + ena_free_all_io_rx_resources(adapter); +err_create_queues_with_backoff: ena_free_io_irq(adapter); err_req_irq: + ena_del_napi_in_range(adapter, 0, io_queue_count); return rc; } static void ena_down(struct ena_adapter *adapter) { + int io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; + netif_info(adapter, ifdown, adapter->netdev, "%s\n", __func__); clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); @@ -1995,7 +2836,7 @@ static void ena_down(struct ena_adapter *adapter) netif_tx_disable(adapter->netdev); /* After this point the napi handler won't enable the tx queue */ - ena_napi_disable_all(adapter); + ena_napi_disable_in_range(adapter, 0, io_queue_count); /* After destroy the queue there won't be any new interrupts */ @@ -2013,7 +2854,7 @@ static void ena_down(struct ena_adapter *adapter) ena_disable_io_intr_sync(adapter); ena_free_io_irq(adapter); - ena_del_napi(adapter); + ena_del_napi_in_range(adapter, 0, io_queue_count); ena_free_all_tx_bufs(adapter); ena_free_all_rx_bufs(adapter); @@ -2038,13 +2879,13 @@ static int ena_open(struct net_device *netdev) int rc; /* Notify the stack of the actual queue counts. 
*/ - rc = netif_set_real_num_tx_queues(netdev, adapter->num_queues); + rc = netif_set_real_num_tx_queues(netdev, adapter->num_io_queues); if (rc) { netif_err(adapter, ifup, netdev, "Can't set num tx queues\n"); return rc; } - rc = netif_set_real_num_rx_queues(netdev, adapter->num_queues); + rc = netif_set_real_num_rx_queues(netdev, adapter->num_io_queues); if (rc) { netif_err(adapter, ifup, netdev, "Can't set num rx queues\n"); return rc; @@ -2093,7 +2934,69 @@ static int ena_close(struct net_device *netdev) return 0; } -static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, struct sk_buff *skb) +int ena_update_queue_sizes(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size) +{ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); + adapter->requested_tx_ring_size = new_tx_size; + adapter->requested_rx_ring_size = new_rx_size; + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? ena_up(adapter) : 0; +} + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; +#ifdef ENA_XDP_SUPPORT + int prev_channel_count; +#endif /* ENA_XDP_SUPPORT */ + bool dev_was_up; + + dev_was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + ena_close(adapter->netdev); +#ifdef ENA_XDP_SUPPORT + prev_channel_count = adapter->num_io_queues; +#endif /* ENA_XDP_SUPPORT */ + adapter->num_io_queues = new_channel_count; +#ifdef ENA_XDP_SUPPORT + if (ena_xdp_present(adapter) && + ena_xdp_allowed(adapter) == ENA_XDP_ALLOWED) { + adapter->xdp_first_ring = new_channel_count; + adapter->xdp_num_queues = new_channel_count; + if (prev_channel_count > new_channel_count) + ena_xdp_exchange_program_rx_in_range(adapter, + NULL, + new_channel_count, + prev_channel_count); + else + ena_xdp_exchange_program_rx_in_range(adapter, + adapter->xdp_bpf_prog, + prev_channel_count, + new_channel_count); + } +#endif /* ENA_XDP_SUPPORT */ + + /* We need to destroy the rss table so that the indirection + * table will be reinitialized by ena_up() + */ + ena_com_rss_destroy(ena_dev); + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); + return dev_was_up ? ena_open(adapter->netdev) : 0; +} + +static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, + struct sk_buff *skb, + bool disable_meta_caching) { u32 mss = skb_shinfo(skb)->gso_size; struct ena_com_tx_meta *ena_meta = &ena_tx_ctx->ena_meta; @@ -2137,7 +3040,9 @@ static void ena_tx_csum(struct ena_com_tx_ctx *ena_tx_ctx, struct sk_buff *skb) ena_meta->l3_hdr_len = skb_network_header_len(skb); ena_meta->l3_hdr_offset = skb_network_offset(skb); ena_tx_ctx->meta_valid = 1; - + } else if (disable_meta_caching) { + memset(ena_meta, 0, sizeof(*ena_meta)); + ena_tx_ctx->meta_valid = 1; } else { ena_tx_ctx->meta_valid = 0; } @@ -2273,12 +3178,11 @@ static int ena_tx_map_skb(struct ena_ring *tx_ring, tx_info->skb = NULL; tx_info->num_of_bufs += i; - ena_unmap_tx_skb(tx_ring, tx_info); + ena_unmap_tx_buff(tx_ring, tx_info); return -EINVAL; } - /* Called with netif_tx_lock. 
*/ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -2289,7 +3193,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) struct netdev_queue *txq; void *push_hdr; u16 next_to_use, req_id, header_len; - int qid, rc, nb_hw_desc; + int qid, rc; netif_dbg(adapter, tx_queued, dev, "%s skb %p\n", __func__, skb); /* Determine which tx ring we will be placed on */ @@ -2304,7 +3208,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); next_to_use = tx_ring->next_to_use; - req_id = tx_ring->free_tx_ids[next_to_use]; + req_id = tx_ring->free_ids[next_to_use]; tx_info = &tx_ring->tx_buffer_info[req_id]; tx_info->num_of_bufs = 0; @@ -2322,49 +3226,16 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) ena_tx_ctx.header_len = header_len; /* set flags and meta data */ - ena_tx_csum(&ena_tx_ctx, skb); - - if (unlikely(ena_com_is_doorbell_needed(tx_ring->ena_com_io_sq, &ena_tx_ctx))) { - netif_dbg(adapter, tx_queued, dev, - "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", - qid); - ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); - } - - /* prepare the packet's descriptors to dma engine */ - rc = ena_com_prepare_tx(tx_ring->ena_com_io_sq, &ena_tx_ctx, - &nb_hw_desc); - - /* ena_com_prepare_tx() can't fail due to overflow of tx queue, - * since the number of free descriptors in the queue is checked - * after sending the previous packet. In case there isn't enough - * space in the queue for the next packet, it is stopped - * until there is again enough available space in the queue. - * All other failure reasons of ena_com_prepare_tx() are fatal - * and therefore require a device reset. - */ - if (unlikely(rc)) { - netif_err(adapter, tx_queued, dev, - "failed to prepare tx bufs\n"); - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.prepare_ctx_err++; - u64_stats_update_end(&tx_ring->syncp); - adapter->reset_reason = ENA_REGS_RESET_DRIVER_INVALID_STATE; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + ena_tx_csum(&ena_tx_ctx, skb, tx_ring->disable_meta_caching); + + rc = ena_xmit_common(dev, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + skb->len); + if (rc) goto error_unmap_dma; - } - - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.cnt++; - tx_ring->tx_stats.bytes += skb->len; - u64_stats_update_end(&tx_ring->syncp); - - tx_info->tx_descs = nb_hw_desc; - tx_info->last_jiffies = jiffies; - tx_info->print_once = 0; - - tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use, - tx_ring->ring_size); /* stop the queue when no more space available, the packet can have up * to sgl_size + 2. one for the meta descriptor and one for header @@ -2400,7 +3271,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) - if (netif_xmit_stopped(txq) || !skb->xmit_more) { + if (netif_xmit_stopped(txq) || !netdev_xmit_more()) { #endif /* trigger the dma engine. 
ena_com_write_sq_doorbell() * has a mb @@ -2416,7 +3287,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; error_unmap_dma: - ena_unmap_tx_skb(tx_ring, tx_info); + ena_unmap_tx_buff(tx_ring, tx_info); tx_info->skb = NULL; error_drop_packet: @@ -2424,7 +3295,10 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -#ifdef HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 +#if defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 +static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +#elif defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev, select_queue_fallback_t fallback) @@ -2448,7 +3322,9 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb) if (skb_rx_queue_recorded(skb)) qid = skb_get_rx_queue(skb); else -#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2) +#if (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3) + qid = netdev_pick_tx(dev, skb, NULL); +#elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2) qid = fallback(dev, skb, NULL); #elif (defined HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1) qid = fallback(dev, skb); @@ -2500,7 +3376,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; host_info->os_type = ENA_ADMIN_OS_LINUX; host_info->kernel_ver = LINUX_VERSION_CODE; - strncpy(host_info->kernel_ver_str, utsname()->version, + strlcpy(host_info->kernel_ver_str, utsname()->version, sizeof(host_info->kernel_ver_str) - 1); host_info->os_dist = 0; strncpy(host_info->os_dist_str, utsname()->release, @@ -2512,6 +3388,11 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd ("g"[0] << ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT); host_info->num_cpus = num_online_cpus(); + host_info->driver_supported_features = + ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | + ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | + ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_MASK; + rc = ena_com_set_host_attributes(ena_dev); if (rc) { if (rc == -EOPNOTSUPP) @@ -2565,14 +3446,6 @@ static void ena_config_debug_area(struct ena_adapter *adapter) ena_com_delete_debug_area(adapter->ena_dev); } -static void ena_extra_properties_strings_destroy(struct net_device *netdev) -{ - struct ena_adapter *adapter = netdev_priv(netdev); - - ena_com_delete_extra_properties_strings(adapter->ena_dev); - adapter->ena_extra_properties_count = 0; -} - #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) #ifdef NDO_GET_STATS_64_V2 static void ena_get_stats64(struct net_device *netdev, @@ -2586,6 +3459,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, struct ena_ring *rx_ring, *tx_ring; unsigned int start; u64 rx_drops; + u64 tx_drops; int i; if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) @@ -2595,7 +3469,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, return NULL; #endif - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { u64 bytes, packets; tx_ring = &adapter->tx_ring[i]; @@ -2624,9 +3498,11 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, do { start = u64_stats_fetch_begin_irq(&adapter->syncp); rx_drops = adapter->dev_stats.rx_drops; + tx_drops = adapter->dev_stats.tx_drops; } while (u64_stats_fetch_retry_irq(&adapter->syncp, start)); 
stats->rx_dropped = rx_drops; + stats->tx_dropped = tx_drops; stats->multicast = 0; stats->collisions = 0; @@ -2655,7 +3531,7 @@ static struct net_device_stats *ena_get_stats(struct net_device *netdev) int i; memset(stats, 0, sizeof(*stats)); - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { unsigned long bytes, packets; tx_ring = &adapter->tx_ring[i]; @@ -2751,6 +3627,9 @@ static const struct net_device_ops ena_netdev_ops = { #if ENA_BUSY_POLL_SUPPORT .ndo_busy_poll = ena_busy_poll, #endif +#ifdef ENA_XDP_SUPPORT + .ndo_bpf = ena_xdp, +#endif /* ENA_XDP_SUPPORT */ }; static int ena_device_validate_params(struct ena_adapter *adapter, @@ -2776,10 +3655,75 @@ static int ena_device_validate_params(struct ena_adapter *adapter, return 0; } +static void set_default_llq_configurations(struct ena_llq_configurations *llq_config, + struct ena_admin_feature_llq_desc *llq) +{ + llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; + llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; + llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && + force_large_llq_header) { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; + llq_config->llq_ring_entry_size_value = 256; + } else { + llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; + llq_config->llq_ring_entry_size_value = 128; + } +} + +static int ena_set_queues_placement_policy(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_admin_feature_llq_desc *llq, + struct ena_llq_configurations *llq_default_configurations) +{ + bool has_mem_bar; + int rc; + u32 llq_feature_mask; + + llq_feature_mask = 1 << ENA_ADMIN_LLQ; + if (!(ena_dev->supported_features & llq_feature_mask)) { + dev_err(&pdev->dev, + "LLQ is not supported Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + has_mem_bar = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(ENA_MEM_BAR); + + rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); + if (unlikely(rc)) { + dev_err(&pdev->dev, + "Failed to configure the device mode. Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + /* Nothing to config, exit */ + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) + return 0; + + if (!has_mem_bar) { + dev_err(&pdev->dev, + "ENA device does not expose LLQ bar. 
Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + + ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, + pci_resource_start(pdev, ENA_MEM_BAR), + pci_resource_len(pdev, ENA_MEM_BAR)); + + if (!ena_dev->mem_bar) + return -EFAULT; + + return 0; +} + static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, struct ena_com_dev_get_features_ctx *get_feat_ctx, bool *wd_state) { + struct ena_llq_configurations llq_config; struct device *dev = &pdev->dev; bool readless_supported; u32 aenq_groups; @@ -2870,6 +3814,15 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); + set_default_llq_configurations(&llq_config, &get_feat_ctx->llq); + + rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, + &llq_config); + if (rc) { + dev_err(&pdev->dev, "ena device init failed\n"); + goto err_admin_init; + } + return 0; err_admin_init: @@ -2881,14 +3834,13 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, return rc; } -static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter, - int io_vectors) +static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter) { struct ena_com_dev *ena_dev = adapter->ena_dev; struct device *dev = &adapter->pdev->dev; int rc; - rc = ena_enable_msix(adapter, io_vectors); + rc = ena_enable_msix(adapter); if (rc) { dev_err(dev, "Can not reserve msix vectors\n"); return rc; @@ -2954,70 +3906,17 @@ static void ena_destroy_device(struct ena_adapter *adapter, bool graceful) ena_com_mmio_reg_read_request_destroy(ena_dev); + /* return reset reason to default value */ adapter->reset_reason = ENA_REGS_RESET_NORMAL; clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); } -static int ena_handle_updated_queues(struct ena_adapter *adapter, - struct ena_com_dev_get_features_ctx *get_feat_ctx) -{ - struct ena_com_dev *ena_dev = adapter->ena_dev; - struct pci_dev *pdev = adapter->pdev; - struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 }; - bool are_queues_changed = false; - int io_queue_num, rc; - - calc_queue_ctx.ena_dev = ena_dev; - calc_queue_ctx.get_feat_ctx = get_feat_ctx; - calc_queue_ctx.pdev = pdev; - - io_queue_num = ena_calc_io_queue_num(pdev, ena_dev, get_feat_ctx); - rc = ena_calc_queue_size(&calc_queue_ctx); - if (unlikely(rc || io_queue_num <= 0)) - return -EFAULT; - - if (unlikely(adapter->tx_ring_size > calc_queue_ctx.tx_queue_size || - adapter->rx_ring_size > calc_queue_ctx.rx_queue_size)) { - dev_err(&pdev->dev, - "Not enough resources to allocate requested queue sizes (TX,RX)=(%d,%d), falling back to queue sizes (TX,RX)=(%d,%d)\n", - adapter->tx_ring_size, - adapter->rx_ring_size, - calc_queue_ctx.tx_queue_size, - calc_queue_ctx.rx_queue_size); - adapter->tx_ring_size = calc_queue_ctx.tx_queue_size; - adapter->rx_ring_size = calc_queue_ctx.rx_queue_size; - adapter->max_tx_sgl_size = calc_queue_ctx.max_tx_sgl_size; - adapter->max_rx_sgl_size = calc_queue_ctx.max_rx_sgl_size; - are_queues_changed = true; - } - - if (unlikely(adapter->num_queues > io_queue_num)) { - dev_err(&pdev->dev, - "Not enough resources to allocate %d queues, falling back to %d queues\n", - adapter->num_queues, io_queue_num); - adapter->num_queues = io_queue_num; - ena_com_rss_destroy(ena_dev); - rc = ena_rss_init_default(adapter); - if (unlikely(rc && (rc != -EOPNOTSUPP))) { - dev_err(&pdev->dev, "Cannot 
init RSS rc: %d\n", rc); - return rc; - } - are_queues_changed = true; - } - - if (unlikely(are_queues_changed)) - ena_init_io_rings(adapter); - - return 0; -} - static int ena_restore_device(struct ena_adapter *adapter) { struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = adapter->ena_dev; - struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; bool wd_state; int rc; @@ -3036,17 +3935,7 @@ static int ena_restore_device(struct ena_adapter *adapter) goto err_device_destroy; } - rc = ena_handle_updated_queues(adapter, &get_feat_ctx); - if (rc) - goto err_device_destroy; - - clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); - /* Make sure we don't have a race with AENQ Links state handler */ - if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) - netif_carrier_on(netdev); - - rc = ena_enable_msix_and_set_admin_interrupts(adapter, - adapter->num_queues); + rc = ena_enable_msix_and_set_admin_interrupts(adapter); if (rc) { dev_err(&pdev->dev, "Enable MSI-X failed\n"); goto err_device_destroy; @@ -3058,7 +3947,7 @@ static int ena_restore_device(struct ena_adapter *adapter) } /* If the interface was up before the reset bring it up */ if (adapter->dev_up_before_reset) { - rc = ena_open(netdev); + rc = ena_up(adapter); if (rc) { dev_err(&pdev->dev, "Failed to create I/O queues\n"); goto err_sysfs_terminate; @@ -3066,7 +3955,13 @@ static int ena_restore_device(struct ena_adapter *adapter) } set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + clear_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); + if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) + netif_carrier_on(adapter->netdev); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + adapter->last_keep_alive_jiffies = jiffies; dev_err(&pdev->dev, "Device reset completed successfully, Driver info: %s\n", version); @@ -3198,7 +4093,9 @@ static void check_for_missing_completions(struct ena_adapter *adapter) struct ena_ring *tx_ring; struct ena_ring *rx_ring; int i, budget, rc; + int io_queue_count; + io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues; /* Make sure the driver doesn't turn the device in other process */ smp_rmb(); @@ -3213,7 +4110,7 @@ static void check_for_missing_completions(struct ena_adapter *adapter) budget = ENA_MONITORED_TX_QUEUES; - for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) { + for (i = adapter->last_monitored_tx_qid; i < io_queue_count; i++) { tx_ring = &adapter->tx_ring[i]; rx_ring = &adapter->rx_ring[i]; @@ -3221,7 +4118,8 @@ static void check_for_missing_completions(struct ena_adapter *adapter) if (unlikely(rc)) return; - rc = check_for_rx_interrupt_queue(adapter, rx_ring); + rc = !ENA_IS_XDP_INDEX(adapter, i) ? 
+ check_for_rx_interrupt_queue(adapter, rx_ring) : 0; if (unlikely(rc)) return; @@ -3230,7 +4128,7 @@ static void check_for_missing_completions(struct ena_adapter *adapter) break; } - adapter->last_monitored_tx_qid = i % adapter->num_queues; + adapter->last_monitored_tx_qid = i % io_queue_count; } /* trigger napi schedule after 2 consecutive detections */ @@ -3260,10 +4158,10 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { rx_ring = &adapter->rx_ring[i]; - refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq); + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); if (unlikely(refill_required == (rx_ring->ring_size - 1))) { rx_ring->empty_rx_queue++; @@ -3295,8 +4193,8 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT) return; - keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies + - adapter->keep_alive_timeout); + keep_alive_expired = adapter->last_keep_alive_jiffies + + adapter->keep_alive_timeout; if (unlikely(time_is_before_jiffies(keep_alive_expired))) { netif_err(adapter, drv, adapter->netdev, "Keep alive watchdog timeout.\n"); @@ -3404,19 +4302,19 @@ static void ena_timer_service(unsigned long data) } /* Reset the timer */ - mod_timer(&adapter->timer_service, jiffies + HZ); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); } -static int ena_calc_io_queue_num(struct pci_dev *pdev, - struct ena_com_dev *ena_dev, - struct ena_com_dev_get_features_ctx *get_feat_ctx) +static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, + struct ena_com_dev *ena_dev, + struct ena_com_dev_get_features_ctx *get_feat_ctx) { - int io_tx_sq_num, io_tx_cq_num, io_rx_num, io_queue_num; + u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues; if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { struct ena_admin_queue_ext_feature_fields *max_queue_ext = &get_feat_ctx->max_queue_ext.max_queue_ext; - io_rx_num = min_t(int, max_queue_ext->max_rx_sq_num, + io_rx_num = min_t(u32, max_queue_ext->max_rx_sq_num, max_queue_ext->max_rx_cq_num); io_tx_sq_num = max_queue_ext->max_tx_sq_num; @@ -3426,73 +4324,25 @@ static int ena_calc_io_queue_num(struct pci_dev *pdev, &get_feat_ctx->max_queues; io_tx_sq_num = max_queues->max_sq_num; io_tx_cq_num = max_queues->max_cq_num; - io_rx_num = min_t(int, io_tx_sq_num, io_tx_cq_num); + io_rx_num = min_t(u32, io_tx_sq_num, io_tx_cq_num); } /* In case of LLQ use the llq fields for the tx SQ/CQ */ if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) io_tx_sq_num = get_feat_ctx->llq.max_llq_num; - io_queue_num = min_t(int, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); - io_queue_num = min_t(int, io_queue_num, io_rx_num); - io_queue_num = min_t(int, io_queue_num, io_tx_sq_num); - io_queue_num = min_t(int, io_queue_num, io_tx_cq_num); + max_num_io_queues = min_t(u32, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES); + max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num); + max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); /* 1 IRQ for for mgmnt and 1 IRQs for each IO direction */ - io_queue_num = min_t(int, io_queue_num, pci_msix_vec_count(pdev) - 1); - if (unlikely(!io_queue_num)) { + max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1); + 
if (unlikely(!max_num_io_queues)) { dev_err(&pdev->dev, "The device doesn't have io queues\n"); return -EFAULT; } - return io_queue_num; -} - -static int ena_set_queues_placement_policy(struct pci_dev *pdev, - struct ena_com_dev *ena_dev, - struct ena_admin_feature_llq_desc *llq, - struct ena_llq_configurations *llq_default_configurations) -{ - bool has_mem_bar; - int rc; - u32 llq_feature_mask; - - llq_feature_mask = 1 << ENA_ADMIN_LLQ; - if (!(ena_dev->supported_features & llq_feature_mask)) { - dev_err(&pdev->dev, - "LLQ is not supported Fallback to host mode policy.\n"); - ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; - return 0; - } - - has_mem_bar = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(ENA_MEM_BAR); - - rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); - if (unlikely(rc)) { - dev_err(&pdev->dev, - "Failed to configure the device mode. Fallback to host mode policy.\n"); - ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; - return 0; - } - - /* Nothing to config, exit */ - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - return 0; - - if (!has_mem_bar) { - dev_err(&pdev->dev, - "ENA device does not expose LLQ bar. Fallback to host mode policy.\n"); - ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; - return 0; - } - - ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, - pci_resource_start(pdev, ENA_MEM_BAR), - pci_resource_len(pdev, ENA_MEM_BAR)); - - if (!ena_dev->mem_bar) - return -EFAULT; - - return 0; + return max_num_io_queues; } static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, @@ -3566,7 +4416,7 @@ static void ena_set_conf_feat_params(struct ena_adapter *adapter, ena_set_dev_offloads(feat, netdev); adapter->max_mtu = feat->dev_attr.max_mtu; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#ifdef HAVE_MTU_MIN_MAX_IN_NET_DEVICE netdev->max_mtu = adapter->max_mtu; netdev->min_mtu = ENA_MIN_MTU; #endif @@ -3586,7 +4436,7 @@ static int ena_rss_init_default(struct ena_adapter *adapter) } for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) { - val = ethtool_rxfh_indir_default(i, adapter->num_queues); + val = ethtool_rxfh_indir_default(i, adapter->num_io_queues); rc = ena_com_indirect_table_fill_entry(ena_dev, i, ENA_IO_RXQ_IDX(val)); if (unlikely(rc && (rc != -EOPNOTSUPP))) { @@ -3595,7 +4445,7 @@ static int ena_rss_init_default(struct ena_adapter *adapter) } } - rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_CRC32, NULL, + rc = ena_com_fill_hash_function(ena_dev, ENA_ADMIN_TOEPLITZ, NULL, ENA_HASH_KEY_SIZE, 0xFFFFFFFF); if (unlikely(rc && (rc != -EOPNOTSUPP))) { dev_err(dev, "Cannot fill hash function\n"); @@ -3624,58 +4474,46 @@ static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) pci_release_selected_regions(pdev, release_bars); } -static inline void set_default_llq_configurations(struct ena_llq_configurations *llq_config) -{ - llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; - llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_128B; - llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; - llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; - llq_config->llq_ring_entry_size_value = 128; -} -static int ena_calc_queue_size(struct ena_calc_queue_size_ctx *ctx) +static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) { struct ena_admin_feature_llq_desc *llq = &ctx->get_feat_ctx->llq; struct ena_com_dev *ena_dev = ctx->ena_dev; u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; + u32 
max_tx_queue_size; + u32 max_rx_queue_size; - if (ctx->ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { struct ena_admin_queue_ext_feature_fields *max_queue_ext = &ctx->get_feat_ctx->max_queue_ext.max_queue_ext; - rx_queue_size = min_t(u32, rx_queue_size, - max_queue_ext->max_rx_cq_depth); - rx_queue_size = min_t(u32, rx_queue_size, - max_queue_ext->max_rx_sq_depth); - tx_queue_size = min_t(u32, tx_queue_size, - max_queue_ext->max_tx_cq_depth); + max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, + max_queue_ext->max_rx_sq_depth); + max_tx_queue_size = max_queue_ext->max_tx_cq_depth; if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) - tx_queue_size = min_t(u32, tx_queue_size, - llq->max_llq_depth); + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); else - tx_queue_size = min_t(u32, tx_queue_size, - max_queue_ext->max_tx_sq_depth); + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queue_ext->max_tx_sq_depth); - ctx->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queue_ext->max_per_packet_rx_descs); ctx->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, max_queue_ext->max_per_packet_tx_descs); + ctx->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + max_queue_ext->max_per_packet_rx_descs); } else { struct ena_admin_queue_feature_desc *max_queues = &ctx->get_feat_ctx->max_queues; - rx_queue_size = min_t(u32, rx_queue_size, - max_queues->max_cq_depth); - rx_queue_size = min_t(u32, rx_queue_size, - max_queues->max_sq_depth); - tx_queue_size = min_t(u32, tx_queue_size, - max_queues->max_cq_depth); + max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, + max_queues->max_sq_depth); + max_tx_queue_size = max_queues->max_cq_depth; if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) - tx_queue_size = min_t(u32, tx_queue_size, - llq->max_llq_depth); + max_tx_queue_size = min_t(u32, max_tx_queue_size, + llq->max_llq_depth); else - tx_queue_size = min_t(u32, tx_queue_size, - max_queues->max_sq_depth); + max_tx_queue_size = min_t(u32, max_tx_queue_size, + max_queues->max_sq_depth); ctx->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, max_queues->max_packet_tx_descs); @@ -3683,16 +4521,36 @@ static int ena_calc_queue_size(struct ena_calc_queue_size_ctx *ctx) max_queues->max_packet_rx_descs); } - tx_queue_size = rounddown_pow_of_two(tx_queue_size); - rx_queue_size = rounddown_pow_of_two(rx_queue_size); + max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); + max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); - if (unlikely(!rx_queue_size || !tx_queue_size)) { - dev_err(&ctx->pdev->dev, "Invalid queue size\n"); - return -EFAULT; + /* When forcing large headers, we multiply the entry size by 2, + * and therefore divide the queue size by 2, leaving the amount + * of memory used by the queues unchanged. 
+ */ + if (force_large_llq_header) { + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && + (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { + max_tx_queue_size /= 2; + dev_info(&ctx->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n", + max_tx_queue_size); + } else { + dev_err(&ctx->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + } } - ctx->rx_queue_size = rx_queue_size; + tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE, + max_tx_queue_size); + rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE, + max_rx_queue_size); + + tx_queue_size = rounddown_pow_of_two(tx_queue_size); + rx_queue_size = rounddown_pow_of_two(rx_queue_size); + + ctx->max_tx_queue_size = max_tx_queue_size; + ctx->max_rx_queue_size = max_rx_queue_size; ctx->tx_queue_size = tx_queue_size; + ctx->rx_queue_size = rx_queue_size; return 0; } @@ -3709,22 +4567,19 @@ static int ena_calc_queue_size(struct ena_calc_queue_size_ctx *ctx) */ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 }; - static int version_printed; - struct net_device *netdev; - struct ena_adapter *adapter; - struct ena_llq_configurations llq_config; + struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = NULL; - char *queue_type_str; + struct ena_adapter *adapter; + struct net_device *netdev; static int adapters_found; - int io_queue_num, bars, rc; + u32 max_num_io_queues; bool wd_state; + int bars, rc; dev_dbg(&pdev->dev, "%s\n", __func__); - if (version_printed++ == 0) - dev_info(&pdev->dev, "%s", version); + dev_info_once(&pdev->dev, "%s", version); rc = pci_enable_device_mem(pdev); if (rc) { @@ -3767,39 +4622,25 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free_region; } - set_default_llq_configurations(&llq_config); - - rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx.llq, - &llq_config); - if (rc) { - dev_err(&pdev->dev, "ena device init failed\n"); - goto err_device_destroy; - } - calc_queue_ctx.ena_dev = ena_dev; calc_queue_ctx.get_feat_ctx = &get_feat_ctx; calc_queue_ctx.pdev = pdev; - /* initial Tx interrupt delay, Assumes 1 usec granularity. - * Updated during device initialization with the real granularity - */ + /* initial TX and RX interrupt delay, Assumes 1 usec granularity. + * Updated during device initialization with the real granularity + */ ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; - io_queue_num = ena_calc_io_queue_num(pdev, ena_dev, &get_feat_ctx); - rc = ena_calc_queue_size(&calc_queue_ctx); - if (rc || io_queue_num <= 0) { + ena_dev->intr_moder_rx_interval = ENA_INTR_INITIAL_RX_INTERVAL_USECS; + ena_dev->intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; + max_num_io_queues = ena_calc_max_io_queue_num(pdev, ena_dev, &get_feat_ctx); + rc = ena_calc_io_queue_size(&calc_queue_ctx); + if (rc || !max_num_io_queues) { rc = -EFAULT; goto err_device_destroy; } - dev_info(&pdev->dev, "creating %d io queues. rx queue size: %d tx queue size. %d LLQ is %s\n", - io_queue_num, - calc_queue_ctx.rx_queue_size, - calc_queue_ctx.tx_queue_size, - (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) ? 
- "ENABLED" : "DISABLED"); - /* dev zeroed in init_etherdev */ - netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), io_queue_num); + netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), max_num_io_queues); if (!netdev) { dev_err(&pdev->dev, "alloc_etherdev_mq failed\n"); rc = -ENOMEM; @@ -3820,15 +4661,27 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); adapter->reset_reason = ENA_REGS_RESET_NORMAL; - adapter->tx_ring_size = calc_queue_ctx.tx_queue_size; - adapter->rx_ring_size = calc_queue_ctx.rx_queue_size; + adapter->requested_tx_ring_size = calc_queue_ctx.tx_queue_size; + adapter->requested_rx_ring_size = calc_queue_ctx.rx_queue_size; + adapter->max_tx_ring_size = calc_queue_ctx.max_tx_queue_size; + adapter->max_rx_ring_size = calc_queue_ctx.max_rx_queue_size; adapter->max_tx_sgl_size = calc_queue_ctx.max_tx_sgl_size; adapter->max_rx_sgl_size = calc_queue_ctx.max_rx_sgl_size; - adapter->num_queues = io_queue_num; + adapter->num_io_queues = clamp_val(num_io_queues, ENA_MIN_NUM_IO_QUEUES, + max_num_io_queues); + adapter->max_num_io_queues = max_num_io_queues; adapter->last_monitored_tx_qid = 0; + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK; + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) + adapter->disable_meta_caching = + !!(get_feat_ctx.llq.accel_mode.u.get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); + adapter->wd_state = wd_state; snprintf(adapter->name, ENA_NAME_MAX_LEN, "ena_%d", adapters_found); @@ -3839,7 +4692,10 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) "Failed to query interrupt moderation feature\n"); goto err_netdev_destroy; } - ena_init_io_rings(adapter); + ena_init_io_rings(adapter, + 0, + adapter->xdp_num_queues + + adapter->num_io_queues); netdev->netdev_ops = &ena_netdev_ops; netdev->watchdog_timeo = TX_TIMEOUT; @@ -3854,7 +4710,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) u64_stats_init(&adapter->syncp); - rc = ena_enable_msix_and_set_admin_interrupts(adapter, io_queue_num); + rc = ena_enable_msix_and_set_admin_interrupts(adapter); if (rc) { dev_err(&pdev->dev, "Failed to enable and set the admin interrupts\n"); @@ -3873,9 +4729,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_config_debug_area(adapter); - adapter->ena_extra_properties_count = - ena_com_extra_properties_strings_init(ena_dev); - memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); netif_carrier_off(netdev); @@ -3903,15 +4756,10 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #endif mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - queue_type_str = "Regular"; - else - queue_type_str = "Low Latency"; - dev_info(&pdev->dev, - "%s found at mem %lx, mac addr %pM Queues %d, Placement policy: %s\n", + "%s found at mem %lx, mac addr %pM\n", DEVICE_NAME, (long)pci_resource_start(pdev, 0), - netdev->dev_addr, io_queue_num, queue_type_str); + netdev->dev_addr); set_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); @@ -3920,7 +4768,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_rss: - ena_extra_properties_strings_destroy(netdev); ena_com_delete_debug_area(ena_dev); ena_com_rss_destroy(ena_dev); err_terminate_sysfs: @@ -3932,7 
+4779,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_free_mgmnt_irq(adapter); ena_disable_msix(adapter); err_worker_destroy: - ena_com_destroy_interrupt_moderation(ena_dev); del_timer(&adapter->timer_service); err_netdev_destroy: free_netdev(netdev); @@ -3976,6 +4822,7 @@ static void ena_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->reset_task); rtnl_lock(); + adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN; ena_destroy_device(adapter, true); rtnl_unlock(); @@ -3989,14 +4836,10 @@ static void ena_remove(struct pci_dev *pdev) ena_com_delete_host_info(ena_dev); - ena_extra_properties_strings_destroy(netdev); - ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); - ena_com_destroy_interrupt_moderation(ena_dev); - vfree(ena_dev); } @@ -4038,6 +4881,7 @@ static int ena_resume(struct pci_dev *pdev) u64_stats_update_end(&adapter->syncp); rtnl_lock(); + pci_set_power_state(pdev, PCI_D0); rc = ena_restore_device(adapter); rtnl_unlock(); return rc; @@ -4113,14 +4957,17 @@ static void ena_keep_alive_wd(void *adapter_data, struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; struct ena_admin_aenq_keep_alive_desc *desc; u64 rx_drops; + u64 tx_drops; desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e; adapter->last_keep_alive_jiffies = jiffies; rx_drops = ((u64)desc->rx_drops_high << 32) | desc->rx_drops_low; + tx_drops = ((u64)desc->tx_drops_high << 32) | desc->tx_drops_low; u64_stats_update_begin(&adapter->syncp); adapter->dev_stats.rx_drops = rx_drops; + adapter->dev_stats.tx_drops = tx_drops; u64_stats_update_end(&adapter->syncp); } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 077f73455bfa9..1648be44e2eb2 100755 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -33,21 +33,27 @@ #ifndef ENA_H #define ENA_H +#include "kcompat.h" #include +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) +#include "dim.h" +#else +#include +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ #include +#include #include #include #include #include #include -#include "kcompat.h" #include "ena_com.h" #include "ena_eth_com.h" #define DRV_MODULE_VER_MAJOR 2 -#define DRV_MODULE_VER_MINOR 0 -#define DRV_MODULE_VER_SUBMINOR 2 +#define DRV_MODULE_VER_MINOR 2 +#define DRV_MODULE_VER_SUBMINOR 3 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_VERSION @@ -81,6 +87,9 @@ #define ENA_BAR_MASK (BIT(ENA_REG_BAR) | BIT(ENA_MEM_BAR)) #define ENA_DEFAULT_RING_SIZE (1024) +#define ENA_MIN_RING_SIZE (256) + +#define ENA_MIN_NUM_IO_QUEUES (1) #define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) #define ENA_DEFAULT_RX_COPYBREAK (256 - NET_IP_ALIGN) @@ -127,6 +136,8 @@ #define ENA_IO_TXQ_IDX(q) (2 * (q)) #define ENA_IO_RXQ_IDX(q) (2 * (q) + 1) +#define ENA_IO_TXQ_IDX_TO_COMBINED_IDX(q) ((q) / 2) +#define ENA_IO_RXQ_IDX_TO_COMBINED_IDX(q) (((q) - 1) / 2) #define ENA_MGMNT_IRQ_IDX 0 #define ENA_IO_IRQ_FIRST_IDX 1 @@ -140,6 +151,22 @@ #define ENA_MMIO_DISABLE_REG_READ BIT(0) +/* The max MTU size is configured to be the ethernet frame size without + * the overhead of the ethernet header, which can have a VLAN header, and + * a frame check sequence (FCS). 
+ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE + */ + +#ifdef ENA_XDP_SUPPORT +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM) + +#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ + ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) +#else +#define ENA_IS_XDP_INDEX(adapter, index) (false) +#endif /* ENA_XDP_SUPPORT */ + struct ena_irq { irq_handler_t handler; void *data; @@ -153,18 +180,23 @@ struct ena_napi { struct napi_struct napi ____cacheline_aligned; struct ena_ring *tx_ring; struct ena_ring *rx_ring; -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +#ifdef ENA_XDP_SUPPORT + struct ena_ring *xdp_ring; +#endif /* ENA_XDP_SUPPORT */ + bool first_interrupt; atomic_t unmask_interrupt; -#endif u32 qid; + struct dim dim; }; struct ena_calc_queue_size_ctx { struct ena_com_dev_get_features_ctx *get_feat_ctx; struct ena_com_dev *ena_dev; struct pci_dev *pdev; - u16 rx_queue_size; - u16 tx_queue_size; + u32 tx_queue_size; + u32 rx_queue_size; + u32 max_tx_queue_size; + u32 max_rx_queue_size; u16 max_tx_sgl_size; u16 max_rx_sgl_size; }; @@ -178,6 +210,19 @@ struct ena_tx_buffer { /* num of buffers used by this skb */ u32 num_of_bufs; +#ifdef ENA_XDP_SUPPORT + /* XDP buffer structure which is used for sending packets in + * the xdp queues + */ + struct xdp_frame *xdpf; + /* The rx page for the rx buffer that was received in rx and + * re transmitted on xdp tx queues as a result of XDP_TX action. + * We need to free the page once we finished cleaning the buffer in + * clean_xdp_irq() + */ + struct page *xdp_rx_page; +#endif /* ENA_XDP_SUPPORT */ + /* Indicate if bufs[0] map the linear data of the skb. */ u8 map_linear_data; @@ -218,18 +263,20 @@ struct ena_stats_tx { u64 bad_req_id; u64 llq_buffer_copy; u64 missed_tx; + u64 unmask_interrupt; }; struct ena_stats_rx { u64 cnt; u64 bytes; + u64 rx_copybreak_pkt; + u64 csum_good; u64 refil_partial; u64 bad_csum; u64 page_alloc_fail; u64 skb_alloc_fail; u64 dma_mapping_err; u64 bad_desc_num; - u64 rx_copybreak_pkt; #if ENA_BUSY_POLL_SUPPORT u64 bp_yield; u64 bp_missed; @@ -241,13 +288,10 @@ struct ena_stats_rx { }; struct ena_ring { - union { - /* Holds the empty requests for TX/RX - * out of order completions - */ - u16 *free_tx_ids; - u16 *free_rx_ids; - }; + /* Holds the empty requests for TX/RX + * out of order completions + */ + u16 *free_ids; union { struct ena_tx_buffer *tx_buffer_info; @@ -263,10 +307,15 @@ struct ena_ring { struct ena_adapter *adapter; struct ena_com_io_cq *ena_com_io_cq; struct ena_com_io_sq *ena_com_io_sq; +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; + struct xdp_rxq_info xdp_rxq; +#endif u16 next_to_use; u16 next_to_clean; u16 rx_copybreak; + u16 rx_headroom; u16 qid; u16 mtu; u16 sgl_size; @@ -275,6 +324,7 @@ struct ena_ring { u8 tx_max_header_size; bool first_interrupt; + bool disable_meta_caching; u16 no_interrupt_event_cnt; /* cpu for TPH */ @@ -287,8 +337,7 @@ struct ena_ring { struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; u32 smoothed_interval; u32 per_napi_packets; - u32 per_napi_bytes; - enum ena_intr_moder_level moder_tbl_idx; + u16 non_empty_napi_events; struct u64_stats_sync syncp; union { struct ena_stats_tx tx_stats; @@ -319,6 +368,7 @@ struct ena_stats_dev { u64 interface_down; u64 admin_q_pause; u64 rx_drops; + u64 tx_drops; }; enum ena_flags_t { @@ -343,7 +393,8 @@ struct ena_adapter { u32 rx_copybreak; u32 max_mtu; - int num_queues; + u32 
num_io_queues; + u32 max_num_io_queues; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) struct msix_entry *msix_entries; @@ -352,11 +403,11 @@ struct ena_adapter { u32 missing_tx_completion_threshold; - u32 tx_usecs, rx_usecs; /* interrupt moderation */ - u32 tx_frames, rx_frames; /* interrupt moderation */ + u32 requested_tx_ring_size; + u32 requested_rx_ring_size; - u32 tx_ring_size; - u32 rx_ring_size; + u32 max_tx_ring_size; + u32 max_rx_ring_size; u32 msg_enable; @@ -389,6 +440,7 @@ struct ena_adapter { bool wd_state; bool dev_up_before_reset; + bool disable_meta_caching; unsigned long last_keep_alive_jiffies; struct u64_stats_sync syncp; @@ -399,7 +451,11 @@ struct ena_adapter { enum ena_regs_reset_reason_types reset_reason; - u8 ena_extra_properties_count; +#ifdef ENA_XDP_SUPPORT + struct bpf_prog *xdp_bpf_prog; +#endif + u32 xdp_first_ring; + u32 xdp_num_queues; }; void ena_set_ethtool_ops(struct net_device *netdev); @@ -408,6 +464,12 @@ void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); +int ena_update_queue_sizes(struct ena_adapter *adapter, + u32 new_tx_size, + u32 new_rx_size); + +int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); + int ena_get_sset_count(struct net_device *netdev, int sset); #if ENA_BUSY_POLL_SUPPORT @@ -512,4 +574,46 @@ static inline bool ena_bp_disable(struct ena_ring *rx_ring) } #endif /* ENA_BUSY_POLL_SUPPORT */ + +#ifdef ENA_XDP_SUPPORT +enum ena_xdp_errors_t { + ENA_XDP_ALLOWED = 0, + ENA_XDP_CURRENT_MTU_TOO_LARGE, + ENA_XDP_NO_ENOUGH_QUEUES, +}; + +static inline bool ena_xdp_queues_present(struct ena_adapter *adapter) +{ + return adapter->xdp_first_ring != 0; +} + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return !!adapter->xdp_bpf_prog; +} + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return !!ring->xdp_bpf_prog; +} + +static inline int ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return 2 * queues <= adapter->max_num_io_queues; +} + +static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) +{ + enum ena_xdp_errors_t rc = ENA_XDP_ALLOWED; + + if (adapter->netdev->mtu > ENA_XDP_MAX_MTU) + rc = ENA_XDP_CURRENT_MTU_TOO_LARGE; + else if (!ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + rc = ENA_XDP_NO_ENOUGH_QUEUES; + + return rc; +} +#endif /* ENA_XDP_SUPPORT */ + #endif /* !(ENA_H) */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index 59bd75534a627..b9e8e48660e82 100755 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. 
* @@ -49,6 +48,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_USER_TRIGGER = 12, ENA_REGS_RESET_GENERIC = 13, ENA_REGS_RESET_MISS_INTERRUPT = 14, + ENA_REGS_RESET_LAST, }; /* ena_registers offsets */ @@ -155,4 +155,4 @@ enum ena_regs_reset_reason_types { #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 -#endif /*_ENA_REGS_H_ */ +#endif /* _ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c index bea56370af77e..26850a7f31fba 100755 --- a/drivers/amazon/net/ena/ena_sysfs.c +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -39,12 +39,6 @@ #include "ena_netdev.h" #include "ena_sysfs.h" -struct dev_ext_ena_attribute { - struct device_attribute attr; - void *var; -}; - -#define to_ext_attr(x) container_of(x, struct dev_ext_ena_attribute, attr) static ssize_t ena_store_rx_copybreak(struct device *dev, struct device_attribute *attr, @@ -65,7 +59,7 @@ static ssize_t ena_store_rx_copybreak(struct device *dev, rtnl_lock(); adapter->rx_copybreak = rx_copybreak; - for (i = 0; i < adapter->num_queues; i++) { + for (i = 0; i < adapter->num_io_queues; i++) { rx_ring = &adapter->rx_ring[i]; rx_ring->rx_copybreak = rx_copybreak; } @@ -86,159 +80,13 @@ static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, ena_store_rx_copybreak); -/* adaptive interrupt moderation */ -static ssize_t ena_show_intr_moderation(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct ena_intr_moder_entry entry; - struct dev_ext_ena_attribute *ea = to_ext_attr(attr); - enum ena_intr_moder_level level = (enum ena_intr_moder_level)ea->var; - struct ena_adapter *adapter = dev_get_drvdata(dev); - ssize_t rc = 0; - - ena_com_get_intr_moderation_entry(adapter->ena_dev, level, &entry); - - rc = sprintf(buf, "%u %u %u\n", - entry.intr_moder_interval, - entry.pkts_per_interval, - entry.bytes_per_interval); - - return rc; -} - -static ssize_t ena_store_intr_moderation(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - struct ena_intr_moder_entry entry; - struct dev_ext_ena_attribute *ea = to_ext_attr(attr); - struct ena_adapter *adapter = dev_get_drvdata(dev); - enum ena_intr_moder_level level = (enum ena_intr_moder_level)ea->var; - int cnt; - - cnt = sscanf(buf, "%u %u %u", - &entry.intr_moder_interval, - &entry.pkts_per_interval, - &entry.bytes_per_interval); - - if (cnt != 3) - return -EINVAL; - - ena_com_init_intr_moderation_entry(adapter->ena_dev, level, &entry); - - return count; -} - -static ssize_t ena_store_intr_moderation_restore_default(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t len) -{ - struct ena_adapter *adapter = dev_get_drvdata(dev); - struct ena_com_dev *ena_dev = adapter->ena_dev; - unsigned long restore_default; - int err; - - err = kstrtoul(buf, 10, &restore_default); - if (err < 0) - return err; - - if (ena_com_interrupt_moderation_supported(ena_dev) && restore_default) { - ena_com_config_default_interrupt_moderation_table(ena_dev); - ena_com_enable_adaptive_moderation(ena_dev); - } - - return len; -} - -static ssize_t ena_store_enable_adaptive_intr_moderation(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t len) -{ - struct ena_adapter *adapter = dev_get_drvdata(dev); - unsigned long enable_moderation; - int err; - - err = kstrtoul(buf, 10, &enable_moderation); - if (err < 0) - return err; - - if (enable_moderation == 0) - 
ena_com_disable_adaptive_moderation(adapter->ena_dev); - else - ena_com_enable_adaptive_moderation(adapter->ena_dev); - - return len; -} - -static ssize_t ena_show_enable_adaptive_intr_moderation(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct ena_adapter *adapter = dev_get_drvdata(dev); - - return sprintf(buf, "%d\n", - ena_com_get_adaptive_moderation_enabled(adapter->ena_dev)); -} - -static DEVICE_ATTR(enable_adaptive_intr_moderation, S_IRUGO | S_IWUSR, - ena_show_enable_adaptive_intr_moderation, - ena_store_enable_adaptive_intr_moderation); - -static DEVICE_ATTR(intr_moderation_restore_default, S_IWUSR | S_IWGRP, - NULL, ena_store_intr_moderation_restore_default); - -#define INTR_MODERATION_PREPARE_ATTR(_name, _type) { \ - __ATTR(intr_moderation_##_name, (S_IRUGO | S_IWUSR | S_IWGRP), \ - ena_show_intr_moderation, ena_store_intr_moderation), \ - (void *)_type } - -/* Device attrs - intr moderation */ -static struct dev_ext_ena_attribute dev_attr_intr_moderation[] = { - INTR_MODERATION_PREPARE_ATTR(lowest, ENA_INTR_MODER_LOWEST), - INTR_MODERATION_PREPARE_ATTR(low, ENA_INTR_MODER_LOW), - INTR_MODERATION_PREPARE_ATTR(mid, ENA_INTR_MODER_MID), - INTR_MODERATION_PREPARE_ATTR(high, ENA_INTR_MODER_HIGH), - INTR_MODERATION_PREPARE_ATTR(highest, ENA_INTR_MODER_HIGHEST), -}; - /****************************************************************************** *****************************************************************************/ int ena_sysfs_init(struct device *dev) { - int i, rc; - struct ena_adapter *adapter = dev_get_drvdata(dev); - if (device_create_file(dev, &dev_attr_rx_copybreak)) dev_err(dev, "failed to create rx_copybreak sysfs entry"); - - if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) { - if (device_create_file(dev, - &dev_attr_intr_moderation_restore_default)) - dev_err(dev, - "failed to create intr_moderation_restore_default"); - - if (device_create_file(dev, - &dev_attr_enable_adaptive_intr_moderation)) - dev_err(dev, - "failed to create adaptive_intr_moderation_enable"); - - for (i = 0; i < ARRAY_SIZE(dev_attr_intr_moderation); i++) { - rc = sysfs_create_file(&dev->kobj, - &dev_attr_intr_moderation[i].attr.attr); - if (rc) { - dev_err(dev, - "%s: sysfs_create_file(intr_moderation %d) failed\n", - __func__, i); - return rc; - } - } - } - return 0; } @@ -246,17 +94,5 @@ int ena_sysfs_init(struct device *dev) *****************************************************************************/ void ena_sysfs_terminate(struct device *dev) { - struct ena_adapter *adapter = dev_get_drvdata(dev); - int i; - device_remove_file(dev, &dev_attr_rx_copybreak); - if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) { - for (i = 0; i < ARRAY_SIZE(dev_attr_intr_moderation); i++) - sysfs_remove_file(&dev->kobj, - &dev_attr_intr_moderation[i].attr.attr); - device_remove_file(dev, - &dev_attr_enable_adaptive_intr_moderation); - device_remove_file(dev, - &dev_attr_intr_moderation_restore_default); - } } diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 6bdbceb809b26..eb4b99d573670 100755 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -158,6 +158,9 @@ Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 #define SLE_VERSION(a,b,c) KERNEL_VERSION(a,b,c) #endif #ifdef CONFIG_SUSE_KERNEL +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 14) +#include +#endif #if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,28) ) /* SLES12 is at least 3.12.28+ based */ #define SLE_VERSION_CODE SLE_VERSION(12,0,0) @@ -166,6 +169,9 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #ifndef SLE_VERSION_CODE #define SLE_VERSION_CODE 0 #endif /* SLE_VERSION_CODE */ +#ifndef SUSE_VERSION +#define SUSE_VERSION 0 +#endif /* SUSE_VERSION */ /******************************************************************************/ @@ -208,9 +214,17 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #define HAVE_RHEL6_ETHTOOL_OPS_EXT_STRUCT #endif /* RHEL >= 6.4 && RHEL < 7.0 */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,48))) +#define HAVE_MTU_MIN_MAX_IN_NET_DEVICE +#endif + #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) || \ - (RHEL_RELEASE_CODE && \ - RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) || \ + (SLE_VERSION_CODE && \ + LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,50))) #define NDO_GET_STATS_64_V2 #endif @@ -347,7 +361,13 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) #endif #endif /* >= 3.8.0 */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,19,0) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V3 +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4,19,0)) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0))) || \ + (SUSE_VERSION && ((SUSE_VERSION == 12 && SUSE_PATCHLEVEL >= 5) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 1) || \ + (SUSE_VERSION > 15))) #define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V2 #else @@ -601,8 +621,82 @@ static inline void __iomem *devm_ioremap_wc(struct device *dev, #endif #if RHEL_RELEASE_CODE && \ - RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5) + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) #define ndo_change_mtu ndo_change_mtu_rh74 #endif +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) +#ifndef dma_zalloc_coherent +#define dma_zalloc_coherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +#endif +#endif + +#ifndef dev_info_once +#ifdef CONFIG_PRINTK +#define dev_info_once(dev, fmt, ...) \ +do { \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + dev_info(dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) +#else +#define dev_info_once(dev, fmt, ...) \ +do { \ + if (0) \ + dev_info(dev, fmt, ##__VA_ARGS__); \ +} while (0) +#endif +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0)) +#define netdev_xmit_more() ((skb->xmit_more)) +#endif + +#ifndef mmiowb +#define MMIOWB_NOT_DEFINED +#endif + +/* In the driver we currently only support CRC32 and Toeplitz. + * Since in kernel erlier than 4.12 the CRC32 define didn't exist + * We define it here to be XOR. Any user who wishes to select CRC32 + * as the hash function, can do so by choosing xor through ethtool. 
+ */ +#ifndef ETH_RSS_HASH_CRC32 +#define ETH_RSS_HASH_CRC32 ETH_RSS_HASH_XOR +#endif + +#ifndef _ULL +#define _ULL(x) (_AC(x, ULL)) +#endif + +#ifndef ULL +#define ULL(x) (_ULL(x)) +#endif + +#ifndef BIT_ULL +#define BIT_ULL(nr) (ULL(1) << (nr)) +#endif + +#ifndef BITS_PER_TYPE +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#endif + +#ifndef DIV_ROUND_DOWN_ULL +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) +#endif + +/* values are taken from here: https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md */ + +#if defined(CONFIG_BPF) && LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) +#define ENA_XDP_SUPPORT +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,5,0) +#define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +#endif #endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/ena/net_dim.c b/drivers/amazon/net/ena/net_dim.c new file mode 100644 index 0000000000000..af46903cd53e2 --- /dev/null +++ b/drivers/amazon/net/ena/net_dim.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include "dim.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) + +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. + * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? 
+ NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? 
DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 3, 0) */ From 6496c398834dc9969a1bf603b4a5cae5d39fea25 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 12 Feb 2020 21:16:27 +0000 Subject: [PATCH 058/737] lustre: adapt to changed padata interfaces in 5.4 -stable The 5.4 -stable branch changed padata interfaces, as part of a pcrypt fix. Adapt for this change. As can be seen in the commit message for the below commit, a wrapper for padata instances was created, so that the pcrypt code could avoid race conditions. So, the lustre code simply needs to allocate such a wrapper and pass it to the changed padata_do_parallel interface too. 
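For clarity, here is a condensed sketch of the calling pattern this change adopts. It is illustrative only, not a drop-in for the diff below: the example_* wrapper names are invented for the sketch, while cfs_ptask_engine, pte_pinst, pte_pshell and the HAVE_PADATA_INTERFACE_* macros are the ones actually touched by this patch.

    #include <linux/padata.h>

    /* Allocate the per-instance shell once the padata instance exists
     * (only on 5.6 and the 5.4 -stable backport).
     */
    static int example_alloc_shell(struct cfs_ptask_engine *engine)
    {
    #ifdef HAVE_PADATA_INTERFACE_56
            engine->pte_pshell = padata_alloc_shell(engine->pte_pinst);
            if (engine->pte_pshell == NULL)
                    return -ENOMEM;
    #endif
            return 0;
    }

    /* Submit work through whichever padata_do_parallel() variant the
     * running kernel provides.
     */
    static int example_do_parallel(struct cfs_ptask_engine *engine,
                                   struct padata_priv *padata, int *cb_cpu)
    {
    #ifdef HAVE_PADATA_INTERFACE_56
            /* shell-based interface (5.6 / 5.4 -stable) */
            return padata_do_parallel(engine->pte_pshell, padata, cb_cpu);
    #elif defined(HAVE_PADATA_INTERFACE_54)
            /* 5.4: instance pointer, callback CPU passed by reference */
            return padata_do_parallel(engine->pte_pinst, padata, cb_cpu);
    #else
            /* pre-5.4: callback CPU passed by value */
            return padata_do_parallel(engine->pte_pinst, padata, *cb_cpu);
    #endif
    }

On teardown the shell is released with padata_free_shell() before padata_free(), matching the error and fini paths changed below.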
Linux commit: bbefa1dd6a6d53537c11624752219e39959d04fb ("crypto: pcrypt - Avoid deadlock by using per-instance padata queues") Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 +++ .../libcfs/include/libcfs/libcfs_ptask.h | 3 +++ .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 19 +++++++++++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index d5cf2879b0d98..8925156518a43 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -904,6 +904,9 @@ /* changed padata interface in 5.4 */ #define HAVE_PADATA_INTERFACE_54 +/* changed padata interface in 5.6 (and the 5.4 -stable branch) */ +#define HAVE_PADATA_INTERFACE_56 + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h index 586a04446cae8..b7c791cdf1ebb 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h @@ -25,6 +25,9 @@ struct padata_instance {}; struct cfs_ptask_engine { struct padata_instance *pte_pinst; +#ifdef HAVE_PADATA_INTERFACE_56 + struct padata_shell *pte_pshell; +#endif #ifndef HAVE_PADATA_INTERFACE_54 struct workqueue_struct *pte_wq; #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c index 1b7603cade8a9..636a93f02e681 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c @@ -137,7 +137,9 @@ static int cfs_do_parallel(struct cfs_ptask_engine *engine, ptask->pt_result = -EINPROGRESS; retry: -#ifdef HAVE_PADATA_INTERFACE_54 +#ifdef HAVE_PADATA_INTERFACE_56 + rc = padata_do_parallel(engine->pte_pshell, padata, &ptask->pt_cbcpu); +#elif HAVE_PADATA_INTERFACE_54 rc = padata_do_parallel(engine->pte_pinst, padata, &ptask->pt_cbcpu); #else rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); @@ -400,11 +402,17 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, if (engine->pte_pinst == NULL) GOTO(err_free_par_mask, rc = -ENOMEM); +#ifdef HAVE_PADATA_INTERFACE_56 + engine->pte_pshell = padata_alloc_shell(engine->pte_pinst); + if (engine->pte_pshell == NULL) + GOTO(err_free_padata, rc = -ENOMEM); +#endif + engine->pte_notifier.notifier_call = cfs_ptask_cpumask_change_notify; rc = padata_register_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); if (rc) - GOTO(err_free_padata, rc); + GOTO(err_free_pashell, rc); rc = cfs_ptengine_set_cpumask(engine, par_mask); if (rc) @@ -423,6 +431,10 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, err_unregister: padata_unregister_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); +err_free_pashell: +#ifdef HAVE_PADATA_INTERFACE_56 + padata_free_shell(engine->pte_pshell); +#endif err_free_padata: padata_free(engine->pte_pinst); err_free_par_mask: @@ -443,6 +455,9 @@ static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) padata_stop(engine->pte_pinst); padata_unregister_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); +#ifdef HAVE_PADATA_INTERFACE_56 + padata_free_shell(engine->pte_pshell); +#endif padata_free(engine->pte_pinst); #ifndef HAVE_PADATA_INTERFACE_54 destroy_workqueue(engine->pte_wq); From 
d201df49afb513cc843887d3b798f97d59b2582a Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 19 Feb 2020 21:13:10 +0000 Subject: [PATCH 059/737] lustre: llite: ll_fault fixes Import upstream Lustre commit LU-11403: Various error conditions in the fault path can cause us to not return a page in vm_fault. Check if it's present before accessing it. Additionally, it's not valid to return VM_FAULT_NOPAGE for page faults. The correct return when accessing a page that does not exist is VM_FAULT_SIGBUS. Correcting this avoids looping infinitely in the testcase. Signed-off-by: Patrick Farrell Change-Id: I53fc16d91462ac5d4555855dfa067d7fd6716c90 Reviewed-on: https://review.whamcloud.com/34247 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Alexander Zarochentsev Reviewed-by: Oleg Drokin [fllinden: only pick the code part, we don't ship the tests] Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/llite/llite_mmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 79adfa9378515..bf3c8636dff52 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -230,9 +230,6 @@ static inline int to_fault_error(int result) case 0: result = VM_FAULT_LOCKED; break; - case -EFAULT: - result = VM_FAULT_NOPAGE; - break; case -ENOMEM: result = VM_FAULT_OOM; break; @@ -355,7 +352,8 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) restart: result = ll_fault0(vma, vmf); - if (!(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) { + if (vmf->page && + !(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) { struct page *vmpage = vmf->page; /* check if this page has been truncated */ From c0646c16c3510d4dd785a3df976db85e6394bef6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 7 Apr 2020 17:27:23 +0000 Subject: [PATCH 060/737] ena: update to 2.2.6 Update the ENA driver to version 2.2.6. Changelog since 2.2.3: **Bug Fixes** * Disable rx offset feature **New Features** * Enable dynamic interrupt moderation on ARM64 **Bug Fixes** * Use random key to configure RSS instead of static one. * Fix multiple issues with the RSS configuration. * Restore accidentally deleted meta-descriptor-caching-related code. **Minor Changes** * Set default tx interrupt moderation interval to 64, aligning it to upstream. * Align comments surrounding create_queues_with_size_backoff() to upstream code. * Minor cosmetic changes. * Remove redundant print from ena_init(). * Change DRV_MODULE_VERSION to DRV_MODULE_GENERATION as in upstream code. * Remove redefinition of ENA_HASH_KEY_SIZE in ena_netdev.h. * Add missing row to README. * Remove unused variable in XDP code. * Use HAVE_NETDEV_XMIT_MORE in kcompat.h. 
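As a minimal, generic sketch of the RSS key change listed above (the helper and the size macro are hypothetical; the driver's actual code is in the ena_com.c hunk below), the hard-coded default key is replaced by the kernel's per-boot random RSS key:

#include <linux/netdevice.h>

#define EXAMPLE_HASH_KEY_SIZE 40	/* same length as ENA_HASH_KEY_SIZE */

static void example_fill_rss_key(u8 *key)
{
	/* netdev_rss_key_fill() copies from a lazily generated,
	 * boot-time random key shared by all net drivers
	 */
	netdev_rss_key_fill(key, EXAMPLE_HASH_KEY_SIZE);
}
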
Signed-off-by: Frank van der Linden --- drivers/amazon/net/ena/ena_com.c | 33 ++++++++-------- drivers/amazon/net/ena/ena_com.h | 19 +++++----- drivers/amazon/net/ena/ena_eth_com.c | 5 +-- drivers/amazon/net/ena/ena_ethtool.c | 32 ++++++++++++---- drivers/amazon/net/ena/ena_netdev.c | 57 ++++++++++++++++------------ drivers/amazon/net/ena/ena_netdev.h | 18 ++++----- drivers/amazon/net/ena/kcompat.h | 38 ++++++++++++++++++- 7 files changed, 129 insertions(+), 73 deletions(-) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 45278d4baf808..e88884d2673f3 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -64,15 +64,6 @@ #define ENA_POLL_MS 5 -/* Default Microsoft RSS key, used for HRSS. */ -static const u8 rss_hash_key[ENA_HASH_KEY_SIZE] = { - 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, - 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, - 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, - 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, - 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa -}; - /*****************************************************************************/ /*****************************************************************************/ /*****************************************************************************/ @@ -1079,18 +1070,22 @@ static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) struct ena_admin_feature_rss_flow_hash_control *hash_key = (ena_dev->rss).hash_key; - memcpy(hash_key->key, rss_hash_key, sizeof(rss_hash_key)); + netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key)); /* The key is stored in the device in u32 array * as well as the API requires the key to be passed in this * format. Thus the size of our array should be divided by 4 */ - hash_key->keys_num = sizeof(rss_hash_key) / sizeof(u32); + hash_key->keys_num = sizeof(hash_key->key) / sizeof(u32); } static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) { struct ena_rss *rss = &ena_dev->rss; + if (!ena_com_check_supported_feature_id(ena_dev, + ENA_ADMIN_RSS_HASH_FUNCTION)) + return -EOPNOTSUPP; + rss->hash_key = dma_zalloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), &rss->hash_key_dma_addr, GFP_KERNEL); @@ -2194,7 +2189,7 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, return ret; } -int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) { struct ena_com_admin_queue *admin_queue; struct ena_admin_set_feat_cmd cmd; @@ -2672,12 +2667,16 @@ int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) if (unlikely(rc)) goto err_indr_tbl; + /* The following function might return unsupported in case the + * device doesn't support setting the key / hash function. We can safely + * ignore this error and have indirection table support only. 
+ */ rc = ena_com_hash_key_allocate(ena_dev); - if (unlikely(rc)) + if (likely(!rc)) + ena_com_hash_key_fill_default_key(ena_dev); + else if (rc != -EOPNOTSUPP) goto err_hash_key; - ena_com_hash_key_fill_default_key(ena_dev); - rc = ena_com_hash_ctrl_init(ena_dev); if (unlikely(rc)) goto err_hash_ctrl; @@ -2895,8 +2894,8 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, struct ena_admin_feature_llq_desc *llq_features, struct ena_llq_configurations *llq_default_cfg) { + struct ena_com_llq_info *llq_info = &ena_dev->llq_info; int rc; - struct ena_com_llq_info *llq_info = &(ena_dev->llq_info);; if (!llq_features->max_llq_num) { ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; @@ -2910,7 +2909,7 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, ena_dev->tx_max_header_size = llq_info->desc_list_entry_size - (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); - if (ena_dev->tx_max_header_size == 0) { + if (unlikely(ena_dev->tx_max_header_size == 0)) { pr_err("the size of the LLQ entry is smaller than needed\n"); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index d753c824a86db..ba117e6f9332e 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -43,6 +43,7 @@ #include #include #include +#include #include "kcompat.h" #include "ena_common_defs.h" @@ -53,9 +54,9 @@ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define ENA_MAX_NUM_IO_QUEUES 128U +#define ENA_MAX_NUM_IO_QUEUES 128U /* We need to queues for each IO (on for Tx and one for Rx) */ -#define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES)) +#define ENA_TOTAL_NUM_QUEUES (2 * (ENA_MAX_NUM_IO_QUEUES)) #define ENA_MAX_HANDLERS 256 @@ -72,15 +73,15 @@ /*****************************************************************************/ /* ENA adaptive interrupt moderation settings */ -#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 0 -#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0 -#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 +#define ENA_INTR_INITIAL_TX_INTERVAL_USECS 64 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0 +#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 -#define ENA_HASH_KEY_SIZE 40 +#define ENA_HASH_KEY_SIZE 40 -#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF +#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF -#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 +#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 struct ena_llq_configurations { enum ena_admin_llq_header_location llq_header_location; @@ -635,7 +636,7 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, * * @return: 0 on Success and negative value otherwise. 
*/ -int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu); +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu); /* ena_com_get_offload_settings - Retrieve the device offloads capabilities * @ena_dev: ENA communication layer struct diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index b23baf806bfdc..e7533dae92e33 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -216,8 +216,8 @@ static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); - memset(io_sq->llq_buf_ctrl.curr_bounce_buf, - 0x0, llq_info->desc_list_entry_size); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); pkt_ctrl->idx = 0; if (unlikely(llq_info->desc_stride_ctrl == ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) @@ -574,7 +574,6 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, } cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); - ena_rx_ctx->pkt_offset = cdesc->offset; do { ena_buf->len = cdesc->length; diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 4169142bcc991..5e0e4607270f2 100755 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -214,7 +214,7 @@ int ena_get_sset_count(struct net_device *netdev, int sset) return -EOPNOTSUPP; return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) - + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; } static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) @@ -440,7 +440,7 @@ static void ena_get_drvinfo(struct net_device *dev, struct ena_adapter *adapter = netdev_priv(dev); strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); + strlcpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); strlcpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); } @@ -739,7 +739,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) { struct ena_adapter *adapter = netdev_priv(netdev); - enum ena_admin_hash_functions ena_func = ENA_ADMIN_TOEPLITZ; + enum ena_admin_hash_functions ena_func; u8 func; int rc; @@ -747,12 +747,19 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, if (rc) return rc; - rc = ena_com_get_hash_key(adapter->ena_dev, key); - if (rc) + /* We call this function in order to check if the device + * supports getting/setting the hash function. + */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + return rc; + } - rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); - if (rc && rc != -EOPNOTSUPP) + rc = ena_com_get_hash_key(adapter->ena_dev, key); + if (rc) return rc; switch (ena_func) { @@ -783,6 +790,17 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) if (rc) return rc; + /* We call this function in order to check if the device + * supports getting/setting the hash function. 
+ */ + rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func); + if (rc) { + if (rc == -EOPNOTSUPP) + rc = 0; + + return rc; + } + rc = ena_com_get_hash_key(adapter->ena_dev, key); if (rc) return rc; diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 6d7b628adf0d0..561842a90cbab 100755 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -56,12 +56,12 @@ #include "ena_pci_id_tbl.h" #include "ena_sysfs.h" -static char version[] = DEVICE_NAME " v" DRV_MODULE_VERSION "\n"; +static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); MODULE_DESCRIPTION(DEVICE_NAME); MODULE_LICENSE("GPL"); -MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_VERSION(DRV_MODULE_GENERATION); /* Time in jiffies before concluding the transmitter is hung. */ #define TX_TIMEOUT (5 * HZ) @@ -347,7 +347,6 @@ static int ena_xdp_xmit_buff(struct net_device *dev, struct ena_com_tx_ctx ena_tx_ctx = {0}; struct ena_tx_buffer *tx_info; struct ena_ring *xdp_ring; - struct ena_ring *rx_ring; u16 next_to_use, req_id; int rc; void *push_hdr; @@ -358,8 +357,6 @@ static int ena_xdp_xmit_buff(struct net_device *dev, req_id = xdp_ring->free_ids[next_to_use]; tx_info = &xdp_ring->tx_buffer_info[req_id]; tx_info->num_of_bufs = 0; - rx_ring = &xdp_ring->adapter->rx_ring[qid - - xdp_ring->adapter->xdp_first_ring]; page_ref_inc(rx_info->page); tx_info->xdp_rx_page = rx_info->page; @@ -701,6 +698,7 @@ static void ena_init_io_rings(struct ena_adapter *adapter, txr->sgl_size = adapter->max_tx_sgl_size; txr->smoothed_interval = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + txr->disable_meta_caching = adapter->disable_meta_caching; /* Don't init RX queues for xdp queues */ if (!ENA_IS_XDP_INDEX(adapter, i)) { @@ -1522,9 +1520,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, req_id = ena_bufs[buf].req_id; rc = validate_rx_req_id(rx_ring, req_id); - if (unlikely(rc < 0)) { + if (unlikely(rc < 0)) return NULL; - } rx_info = &rx_ring->rx_buffer_info[req_id]; } while (1); @@ -2639,7 +2636,8 @@ static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) } static void set_io_rings_size(struct ena_adapter *adapter, - int new_tx_size, int new_rx_size) + int new_tx_size, + int new_rx_size) { int i; @@ -2653,17 +2651,16 @@ static void set_io_rings_size(struct ena_adapter *adapter, * low on memory. If there is not enough memory to allocate io queues * the driver will try to allocate smaller queues. * - * The heuristic is as follows: + * The backoff algorithm is as follows: + * 1. Try to allocate TX and RX and if successful. + * 1.1. return success * - * 1. Try to allocate TX and RX and if successful return success. - * 2. If TX and RX are both smaller or equal to 256 return failure. + * 2. Divide by 2 the size of the larger of RX and TX queues (or both if their size is the same). * - * 3. If TX and RX sizes differ: - * 3.1. Divide by 2 the size of the larger one and go back to 1. - * - * 4. Else (TX and RX sizes are the same) - * 4.1 Divide both RX and TX sizes by 2 - * and go back to 1 + * 3. If TX or RX is smaller than 256 + * 3.1. return failure. + * 4. else + * 4.1. go back to 1. 
*/ static int create_queues_with_size_backoff(struct ena_adapter *adapter) { @@ -2773,6 +2770,14 @@ static int ena_up(struct ena_adapter *adapter) */ ena_init_napi_in_range(adapter, 0, io_queue_count); +#ifdef CONFIG_ARM64 + /* enable DIM by default on ARM machines, also needs to happen + * before enabling IRQs since DIM is ran from napi routine + */ + if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) + ena_com_enable_adaptive_moderation(adapter->ena_dev); +#endif + rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq; @@ -3271,8 +3276,12 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#ifdef HAVE_NETDEV_XMIT_MORE if (netif_xmit_stopped(txq) || !netdev_xmit_more()) { -#endif +#else + if (netif_xmit_stopped(txq) || !skb->xmit_more) { +#endif /* HAVE_NETDEV_XMIT_MORE */ +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) */ /* trigger the dma engine. ena_com_write_sq_doorbell() * has a mb */ @@ -3382,14 +3391,13 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd strncpy(host_info->os_dist_str, utsname()->release, sizeof(host_info->os_dist_str) - 1); host_info->driver_version = - (DRV_MODULE_VER_MAJOR) | - (DRV_MODULE_VER_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | - (DRV_MODULE_VER_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) | + (DRV_MODULE_GEN_MAJOR) | + (DRV_MODULE_GEN_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) | + (DRV_MODULE_GEN_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT) | ("g"[0] << ENA_ADMIN_HOST_INFO_MODULE_TYPE_SHIFT); host_info->num_cpus = num_online_cpus(); host_info->driver_supported_features = - ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_MASK; @@ -3962,6 +3970,7 @@ static int ena_restore_device(struct ena_adapter *adapter) mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); adapter->last_keep_alive_jiffies = jiffies; + dev_err(&pdev->dev, "Device reset completed successfully, Driver info: %s\n", version); @@ -4626,7 +4635,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) calc_queue_ctx.get_feat_ctx = &get_feat_ctx; calc_queue_ctx.pdev = pdev; - /* initial TX and RX interrupt delay, Assumes 1 usec granularity. + /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. * Updated during device initialization with the real granularity */ ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; @@ -4904,8 +4913,6 @@ static struct pci_driver ena_pci_driver = { static int __init ena_init(void) { - pr_info("%s", version); - ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); if (!ena_wq) { pr_err("Failed to create workqueue\n"); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 1648be44e2eb2..48f20d54830f0 100755 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -51,16 +51,16 @@ #include "ena_com.h" #include "ena_eth_com.h" -#define DRV_MODULE_VER_MAJOR 2 -#define DRV_MODULE_VER_MINOR 2 -#define DRV_MODULE_VER_SUBMINOR 3 +#define DRV_MODULE_GEN_MAJOR 2 +#define DRV_MODULE_GEN_MINOR 2 +#define DRV_MODULE_GEN_SUBMINOR 6 #define DRV_MODULE_NAME "ena" -#ifndef DRV_MODULE_VERSION -#define DRV_MODULE_VERSION \ - __stringify(DRV_MODULE_VER_MAJOR) "." \ - __stringify(DRV_MODULE_VER_MINOR) "." 
\ - __stringify(DRV_MODULE_VER_SUBMINOR) "g" +#ifndef DRV_MODULE_GENERATION +#define DRV_MODULE_GENERATION \ + __stringify(DRV_MODULE_GEN_MAJOR) "." \ + __stringify(DRV_MODULE_GEN_MINOR) "." \ + __stringify(DRV_MODULE_GEN_SUBMINOR) "g" #endif #define DEVICE_NAME "Elastic Network Adapter (ENA)" @@ -110,8 +110,6 @@ #define ENA_RX_RSS_TABLE_LOG_SIZE 7 #define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE) -#define ENA_HASH_KEY_SIZE 40 - /* The number of tx packet completions that will be handled each NAPI poll * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER. */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index eb4b99d573670..3a801efcec67e 100755 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -652,8 +652,8 @@ do { \ #endif #endif -#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0)) -#define netdev_xmit_more() ((skb->xmit_more)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)) +#define HAVE_NETDEV_XMIT_MORE #endif #ifndef mmiowb @@ -699,4 +699,38 @@ do { \ #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,5,0) #define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER #endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) && \ + !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ + !(UBUNTU_VERSION_CODE) && \ + !defined(UEK3_RELEASE) + +#define DO_ONCE(func, ...) \ + ({ \ + static bool ___done = false; \ + if (unlikely(!___done)) { \ + func(__VA_ARGS__); \ + ___done = true; \ + } \ + }) + +#define get_random_once(buf, nbytes) \ + DO_ONCE(get_random_bytes, (buf), (nbytes)) + +#define net_get_random_once(buf, nbytes) \ + get_random_once((buf), (nbytes)) + +/* RSS keys are 40 or 52 bytes long */ +#define NETDEV_RSS_KEY_LEN 52 +static u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +static inline void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +#endif + #endif /* _KCOMPAT_H_ */ From 9b0f2bd12cc2a4f31f27f20770f785b67ad7518c Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 14 Jul 2020 16:58:18 -0700 Subject: [PATCH 061/737] ena: Update to 2.2.10 This is the cumulative update from 2.2.6 to 2.2.7, 2.2.7 to 2.2.8, 2.2.8 to 2.2.9 and then to 2.2.10. Sourced from upstream git repo: https://github.com/amzn/amzn-drivers/ Change Log from Upstream: 2.2.6 -> 2.2.7: **Minor Changes** * Expose additional PCI device ID 2.2.7 -> 2.2.8: **New Features** * Re-enable RX offset feature. **Bug Fixes** * Fix XDP PASS issue due to incorrect handling of offset in rx_info. * Add PCI shutdown handler to allow safe kexec. * Fix RHEL 8.2 compilation error. * Fix kernel 5.5 compilation error. **Minor Changes** * Reduce driver load time. 2.2.8 -> 2.2.9: **Bug Fixes** * Fix memory leak in XDP_TX when TX queue is full. * Fix napi budget accounting of XDP packets. * Fix driver loading error in kernels >= 5.7 due to unreported interrupt coalescing capabilities. * Fix is_doorbell_needed() to account for meta descriptors properly. 
2.2.9 -> 2.2.10: **New Features** * Add new device statistics to ethtool command Signed-off-by: Suraj Jitindar Singh Reviewed-by: Anchal Agarwal Reviewed-by: Benjamin Herrenschmidt Reviewed-by: Frank van der Linden --- drivers/amazon/net/ena/ena_admin_defs.h | 37 +++++++- drivers/amazon/net/ena/ena_com.c | 54 +++++++++--- drivers/amazon/net/ena/ena_com.h | 12 +++ drivers/amazon/net/ena/ena_eth_com.c | 1 + drivers/amazon/net/ena/ena_eth_com.h | 3 +- drivers/amazon/net/ena/ena_ethtool.c | 111 +++++++++++++++++++----- drivers/amazon/net/ena/ena_netdev.c | 98 ++++++++++++++++----- drivers/amazon/net/ena/ena_netdev.h | 8 +- drivers/amazon/net/ena/ena_pci_id_tbl.h | 5 ++ drivers/amazon/net/ena/kcompat.h | 6 +- 10 files changed, 276 insertions(+), 59 deletions(-) diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index c1836183d2934..bad88b822773f 100755 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -121,6 +121,8 @@ enum ena_admin_completion_policy_type { enum ena_admin_get_stats_type { ENA_ADMIN_GET_STATS_TYPE_BASIC = 0, ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, + /* extra HW stats for specific network interface */ + ENA_ADMIN_GET_STATS_TYPE_ENI = 2, }; enum ena_admin_get_stats_scope { @@ -414,10 +416,43 @@ struct ena_admin_basic_stats { u32 tx_drops_high; }; +/* ENI Statistics Command. */ +struct ena_admin_eni_stats { + /* The number of packets shaped due to inbound aggregate BW + * allowance being exceeded + */ + u64 bw_in_allowance_exceeded; + + /* The number of packets shaped due to outbound aggregate BW + * allowance being exceeded + */ + u64 bw_out_allowance_exceeded; + + /* The number of packets shaped due to PPS allowance being exceeded */ + u64 pps_allowance_exceeded; + + /* The number of packets shaped due to connection tracking + * allowance being exceeded and leading to failure in establishment + * of new connections + */ + u64 conntrack_allowance_exceeded; + + /* The number of packets shaped due to linklocal packet rate + * allowance being exceeded + */ + u64 linklocal_allowance_exceeded; +}; + struct ena_admin_acq_get_stats_resp { struct ena_admin_acq_common_desc acq_common_desc; - struct ena_admin_basic_stats basic_stats; + union { + u64 raw[7]; + + struct ena_admin_basic_stats basic_stats; + + struct ena_admin_eni_stats eni_stats; + } u; }; struct ena_admin_get_set_feature_common_desc { diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index e88884d2673f3..9eb3a2fcad1b9 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -62,7 +62,9 @@ #define ENA_REGS_ADMIN_INTR_MASK 1 -#define ENA_POLL_MS 5 +#define ENA_MIN_POLL_US 100 + +#define ENA_MAX_POLL_US 5000 /*****************************************************************************/ /*****************************************************************************/ @@ -544,12 +546,20 @@ static int ena_com_comp_status_to_errno(u8 comp_status) return -EINVAL; } +static inline void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) +{ + delay_us = max_t(u32, ENA_MIN_POLL_US, delay_us); + delay_us = min_t(u32, delay_us * (1 << exp), ENA_MAX_POLL_US); + usleep_range(delay_us, 2 * delay_us); +} + static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, struct ena_com_admin_queue *admin_queue) { unsigned long flags = 0; unsigned long timeout; int ret; + u32 exp = 0; timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); @@ -573,7 +583,7 @@ 
static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c goto err; } - msleep(ENA_POLL_MS); + ena_delay_exponential_backoff_us(exp++, admin_queue->ena_dev->ena_min_poll_delay_us); } if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { @@ -957,12 +967,13 @@ static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, u16 exp_state) { - u32 val, i; + u32 val, exp = 0; + unsigned long timeout_stamp; - /* Convert timeout from resolution of 100ms to ENA_POLL_MS */ - timeout = (timeout * 100) / ENA_POLL_MS; + /* Convert timeout from resolution of 100ms to us resolution. */ + timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout); - for (i = 0; i < timeout; i++) { + while (1) { val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { @@ -974,10 +985,11 @@ static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, exp_state) return 0; - msleep(ENA_POLL_MS); - } + if (time_is_before_jiffies(timeout_stamp)) + return -ETIME; - return -ETIME; + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); + } } static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev, @@ -1454,11 +1466,13 @@ void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) { struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; unsigned long flags = 0; + u32 exp = 0; spin_lock_irqsave(&admin_queue->q_lock, flags); while (atomic_read(&admin_queue->outstanding_cmds) != 0) { spin_unlock_irqrestore(&admin_queue->q_lock, flags); - msleep(ENA_POLL_MS); + ena_delay_exponential_backoff_us(exp++, + ena_dev->ena_min_poll_delay_us); spin_lock_irqsave(&admin_queue->q_lock, flags); } spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -1812,6 +1826,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, if (ret) goto error; + admin_queue->ena_dev = ena_dev; admin_queue->running_state = true; return 0; @@ -2174,6 +2189,21 @@ static int ena_get_dev_stats(struct ena_com_dev *ena_dev, return ret; } +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats) +{ + struct ena_com_stats_ctx ctx; + int ret; + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENI); + if (likely(ret == 0)) + memcpy(stats, &ctx.get_resp.u.eni_stats, + sizeof(ctx.get_resp.u.eni_stats)); + + return ret; +} + int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, struct ena_admin_basic_stats *stats) { @@ -2183,8 +2213,8 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, memset(&ctx, 0x0, sizeof(ctx)); ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_BASIC); if (likely(ret == 0)) - memcpy(stats, &ctx.get_resp.basic_stats, - sizeof(ctx.get_resp.basic_stats)); + memcpy(stats, &ctx.get_resp.u.basic_stats, + sizeof(ctx.get_resp.u.basic_stats)); return ret; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index ba117e6f9332e..8ee8c4864f221 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -245,6 +245,7 @@ struct ena_com_stats_admin { struct ena_com_admin_queue { void *q_dmadev; void *bus; + struct ena_com_dev *ena_dev; spinlock_t q_lock; /* spinlock for the admin queue */ struct ena_comp_ctx *comp_ctx; @@ -358,6 +359,8 @@ struct ena_com_dev { struct ena_intr_moder_entry *intr_moder_tbl; struct ena_com_llq_info llq_info; + + u32 ena_min_poll_delay_us; }; struct 
ena_com_dev_get_features_ctx { @@ -630,6 +633,15 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, struct ena_admin_basic_stats *stats); +/* ena_com_get_eni_stats - Get extended network interface statistics + * @ena_dev: ENA communication layer struct + * @stats: stats return value + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, + struct ena_admin_eni_stats *stats); + /* ena_com_set_dev_mtu - Configure the device mtu. * @ena_dev: ENA communication layer struct * @mtu: mtu value diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index e7533dae92e33..51c0df9a857e5 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -574,6 +574,7 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, } cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); + ena_rx_ctx->pkt_offset = cdesc->offset; do { ena_buf->len = cdesc->length; diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index 8b1afd3b32f26..b6592cb93b045 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -157,7 +157,8 @@ static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, llq_info = &io_sq->llq_info; num_descs = ena_tx_ctx->num_bufs; - if (unlikely(ena_com_meta_desc_changed(io_sq, ena_tx_ctx))) + if (llq_info->disable_meta_caching || + unlikely(ena_com_meta_desc_changed(io_sq, ena_tx_ctx))) ++num_descs; if (num_descs > llq_info->descs_num_before_header) { diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 5e0e4607270f2..dfce4a2fe73fb 100755 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -49,6 +49,11 @@ struct ena_stats { .stat_offset = offsetof(struct ena_stats_##stat_type, stat) \ } +#define ENA_STAT_HW_ENTRY(stat, stat_type) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_admin_##stat_type, stat) \ +} + #define ENA_STAT_RX_ENTRY(stat) \ ENA_STAT_ENTRY(stat, rx) @@ -58,6 +63,9 @@ struct ena_stats { #define ENA_STAT_GLOBAL_ENTRY(stat) \ ENA_STAT_ENTRY(stat, dev) +#define ENA_STAT_ENI_ENTRY(stat) \ + ENA_STAT_HW_ENTRY(stat, eni_stats) + static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(tx_timeout), ENA_STAT_GLOBAL_ENTRY(suspend), @@ -66,8 +74,14 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(interface_up), ENA_STAT_GLOBAL_ENTRY(interface_down), ENA_STAT_GLOBAL_ENTRY(admin_q_pause), - ENA_STAT_GLOBAL_ENTRY(rx_drops), - ENA_STAT_GLOBAL_ENTRY(tx_drops), +}; + +static const struct ena_stats ena_stats_eni_strings[] = { + ENA_STAT_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_STAT_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_STAT_ENI_ENTRY(pps_allowance_exceeded), + ENA_STAT_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded), }; static const struct ena_stats ena_stats_tx_strings[] = { @@ -117,10 +131,12 @@ static const struct ena_stats ena_stats_ena_com_strings[] = { ENA_STAT_ENA_COM_ENTRY(no_completion), }; -#define ENA_STATS_ARRAY_GLOBAL ARRAY_SIZE(ena_stats_global_strings) -#define ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) -#define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) -#define ENA_STATS_ARRAY_ENA_COM ARRAY_SIZE(ena_stats_ena_com_strings) +#define ENA_STATS_ARRAY_GLOBAL 
ARRAY_SIZE(ena_stats_global_strings) +#define ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) +#define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) +#define ENA_STATS_ARRAY_ENA_COM ARRAY_SIZE(ena_stats_ena_com_strings) +#define ENA_STATS_ARRAY_ENI(adapter) \ + (ARRAY_SIZE(ena_stats_eni_strings) * adapter->eni_stats_supported) static void ena_safe_update_stat(u64 *src, u64 *dst, struct u64_stats_sync *syncp) @@ -184,11 +200,10 @@ static void ena_dev_admin_queue_stats(struct ena_adapter *adapter, u64 **data) } } -static void ena_get_ethtool_stats(struct net_device *netdev, - struct ethtool_stats *stats, - u64 *data) +static void ena_get_stats(struct ena_adapter *adapter, + u64 *data, + bool eni_stats_needed) { - struct ena_adapter *adapter = netdev_priv(netdev); const struct ena_stats *ena_stats; u64 *ptr; int i; @@ -202,10 +217,42 @@ static void ena_get_ethtool_stats(struct net_device *netdev, ena_safe_update_stat(ptr, data++, &adapter->syncp); } + if (eni_stats_needed) { + ena_update_hw_stats(adapter); + for (i = 0; i < ENA_STATS_ARRAY_ENI(adapter); i++) { + ena_stats = &ena_stats_eni_strings[i]; + + ptr = (u64 *)((uintptr_t)&adapter->eni_stats + + (uintptr_t)ena_stats->stat_offset); + + ena_safe_update_stat(ptr, data++, &adapter->syncp); + } + } + ena_queue_stats(adapter, &data); ena_dev_admin_queue_stats(adapter, &data); } +static void ena_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, + u64 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + ena_get_stats(adapter, data, adapter->eni_stats_supported); +} + +static int ena_get_sw_stats_count(struct ena_adapter *adapter) +{ + return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; +} + +static int ena_get_hw_stats_count(struct ena_adapter *adapter) +{ + return ENA_STATS_ARRAY_ENI(adapter); +} + int ena_get_sset_count(struct net_device *netdev, int sset) { struct ena_adapter *adapter = netdev_priv(netdev); @@ -213,8 +260,7 @@ int ena_get_sset_count(struct net_device *netdev, int sset) if (sset != ETH_SS_STATS) return -EOPNOTSUPP; - return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) - + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; + return ena_get_sw_stats_count(adapter) + ena_get_hw_stats_count(adapter); } static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) @@ -256,25 +302,43 @@ static void ena_com_dev_strings(u8 **data) } } -static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) +static void ena_get_strings(struct ena_adapter *adapter, + u8 *data, + bool eni_stats_needed) { - struct ena_adapter *adapter = netdev_priv(netdev); const struct ena_stats *ena_stats; int i; - if (sset != ETH_SS_STATS) - return; - for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; memcpy(data, ena_stats->name, ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } + if (eni_stats_needed) { + for (i = 0; i < ENA_STATS_ARRAY_ENI(adapter); i++) { + ena_stats = &ena_stats_eni_strings[i]; + memcpy(data, ena_stats->name, ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + } + ena_queue_strings(adapter, &data); ena_com_dev_strings(&data); } +static void ena_get_ethtool_strings(struct net_device *netdev, + u32 sset, + u8 *data) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + if (sset != ETH_SS_STATS) + return; + + ena_get_strings(adapter, data, adapter->eni_stats_supported); +} + #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) static int 
ena_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *link_ksettings) @@ -685,7 +749,6 @@ static u32 ena_get_rxfh_key_size(struct net_device *netdev) } #endif - static int ena_indirection_table_set(struct ena_adapter *adapter, const u32 *indir) { @@ -962,6 +1025,10 @@ static int ena_set_tunable(struct net_device *netdev, #endif /* 3.18.0 */ static const struct ethtool_ops ena_ethtool_ops = { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX, +#endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) .get_link_ksettings = ena_get_link_ksettings, #else @@ -976,7 +1043,7 @@ static const struct ethtool_ops ena_ethtool_ops = { .get_ringparam = ena_get_ringparam, .set_ringparam = ena_set_ringparam, .get_sset_count = ena_get_sset_count, - .get_strings = ena_get_strings, + .get_strings = ena_get_ethtool_strings, .get_ethtool_stats = ena_get_ethtool_stats, #ifdef ETHTOOL_GRXRINGS .get_rxnfc = ena_get_rxnfc, @@ -1021,7 +1088,7 @@ static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) int strings_num; int i, rc; - strings_num = ena_get_sset_count(netdev, ETH_SS_STATS); + strings_num = ena_get_sw_stats_count(adapter); if (strings_num <= 0) { netif_err(adapter, drv, netdev, "Can't get stats num\n"); return; @@ -1046,8 +1113,8 @@ static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) return; } - ena_get_strings(netdev, ETH_SS_STATS, strings_buf); - ena_get_ethtool_stats(netdev, NULL, data_buf); + ena_get_strings(adapter, strings_buf, false); + ena_get_stats(adapter, data_buf, false); /* If there is a buffer, dump stats, otherwise print them to dmesg */ if (buf) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 561842a90cbab..1d16d33ead6d9 100755 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -392,7 +392,7 @@ static int ena_xdp_xmit_buff(struct net_device *dev, ena_unmap_tx_buff(xdp_ring, tx_info); tx_info->xdpf = NULL; error_drop_packet: - + __free_page(tx_info->xdp_rx_page); return NETDEV_TX_OK; } @@ -1415,8 +1415,7 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, bool frags) static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, struct ena_com_rx_buf_info *ena_bufs, u32 descs, - u16 *next_to_clean, - u8 offset) + u16 *next_to_clean) { struct sk_buff *skb; struct ena_rx_buffer *rx_info; @@ -1435,7 +1434,6 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, return NULL; rx_info = &rx_ring->rx_buffer_info[req_id]; - rx_info->page_offset = offset; if (unlikely(!rx_info->page)) { netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, @@ -1677,6 +1675,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, { u16 next_to_clean = rx_ring->next_to_clean; struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; struct ena_adapter *adapter; u32 res_budget, work_done; int rx_copybreak_pkt = 0; @@ -1718,6 +1717,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, if (unlikely(ena_rx_ctx.descs == 0)) break; + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + rx_info->page_offset = ena_rx_ctx.pkt_offset; + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. 
descs #: %d l3 proto %d l4 proto %d hash: %x\n", rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, @@ -1732,19 +1734,22 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, - &next_to_clean, ena_rx_ctx.pkt_offset); + &next_to_clean); #else skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, - &next_to_clean, ena_rx_ctx.pkt_offset); + &next_to_clean); #endif /* ENA_XDP_SUPPORT */ if (unlikely(!skb)) { #ifdef ENA_XDP_SUPPORT - if (xdp_verdict == XDP_TX) { + /* The page might not actually be freed here since the + * page reference count is incremented in + * ena_xdp_xmit_buff(), and it will be decreased only + * when send completion was received from the device + */ + if (xdp_verdict == XDP_TX) ena_free_rx_page(rx_ring, &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]); - res_budget--; - } #endif /* ENA_XDP_SUPPORT */ for (i = 0; i < ena_rx_ctx.descs; i++) { rx_ring->free_ids[next_to_clean] = @@ -1754,8 +1759,10 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, rx_ring->ring_size); } #ifdef ENA_XDP_SUPPORT - if (xdp_verdict == XDP_TX || xdp_verdict == XDP_DROP) + if (xdp_verdict != XDP_PASS){ + res_budget--; continue; + } #endif /* ENA_XDP_SUPPORT */ break; } @@ -3398,6 +3405,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd host_info->num_cpus = num_online_cpus(); host_info->driver_supported_features = + ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_MASK; @@ -3454,6 +3462,19 @@ static void ena_config_debug_area(struct ena_adapter *adapter) ena_com_delete_debug_area(adapter->ena_dev); } +int ena_update_hw_stats(struct ena_adapter *adapter) +{ + int rc = 0; + + rc = ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); + if (rc) { + dev_info_once(&adapter->pdev->dev, "Failed to get ENI stats\n"); + return rc; + } + + return 0; +} + #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) #ifdef NDO_GET_STATS_64_V2 static void ena_get_stats64(struct net_device *netdev, @@ -4621,6 +4642,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free_region; } + ena_dev->ena_min_poll_delay_us = ENA_POLL_DELAY_US; ena_dev->dmadev = &pdev->dev; rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state); @@ -4738,6 +4760,11 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_config_debug_area(adapter); + if (!ena_update_hw_stats(adapter)) + adapter->eni_stats_supported = true; + else + adapter->eni_stats_supported = false; + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); netif_carrier_off(netdev); @@ -4805,13 +4832,15 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /*****************************************************************************/ -/* ena_remove - Device Removal Routine +/* __ena_shutoff - Helper used in both PCI remove/shutdown routines * @pdev: PCI device information struct + * @shutdown: Is it a shutdown operation? If false, means it is a removal * - * ena_remove is called by the PCI subsystem to alert the driver - * that it should release a PCI device. + * __ena_shutoff is a helper routine that does the real work on shutdown and + * removal paths; the difference between those paths is with regards to whether + * dettach or unregister the netdevice. 
*/ -static void ena_remove(struct pci_dev *pdev) +static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) { struct ena_adapter *adapter = pci_get_drvdata(pdev); struct ena_com_dev *ena_dev; @@ -4830,14 +4859,18 @@ static void ena_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->reset_task); - rtnl_lock(); + rtnl_lock(); /* lock released inside the below if-else block */ adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN; ena_destroy_device(adapter, true); - rtnl_unlock(); - - unregister_netdev(netdev); - - free_netdev(netdev); + if (shutdown) { + netif_device_detach(netdev); + dev_close(netdev); + rtnl_unlock(); + } else { + rtnl_unlock(); + unregister_netdev(netdev); + free_netdev(netdev); + } ena_com_rss_destroy(ena_dev); @@ -4852,6 +4885,30 @@ static void ena_remove(struct pci_dev *pdev) vfree(ena_dev); } +/* ena_remove - Device Removal Routine + * @pdev: PCI device information struct + * + * ena_remove is called by the PCI subsystem to alert the driver + * that it should release a PCI device. + */ + +static void ena_remove(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, false); +} + +/* ena_shutdown - Device Shutdown Routine + * @pdev: PCI device information struct + * + * ena_shutdown is called by the PCI subsystem to alert the driver that + * a shutdown/reboot (or kexec) is happening and device must be disabled. + */ + +static void ena_shutdown(struct pci_dev *pdev) +{ + __ena_shutoff(pdev, true); +} + #ifdef CONFIG_PM /* ena_suspend - PM suspend callback * @pdev: PCI device information struct @@ -4902,6 +4959,7 @@ static struct pci_driver ena_pci_driver = { .id_table = ena_pci_tbl, .probe = ena_probe, .remove = ena_remove, + .shutdown = ena_shutdown, #ifdef CONFIG_PM .suspend = ena_suspend, .resume = ena_resume, diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 48f20d54830f0..038fd6c873a0a 100755 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -53,7 +53,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 2 -#define DRV_MODULE_GEN_SUBMINOR 6 +#define DRV_MODULE_GEN_SUBMINOR 10 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -141,6 +141,8 @@ #define ENA_IO_IRQ_FIRST_IDX 1 #define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) +#define ENA_POLL_DELAY_US 5000 + /* ENA device should send keep alive msg every 1 sec. * We wait for 6 sec just to be on the safe side. 
*/ @@ -443,6 +445,8 @@ struct ena_adapter { struct u64_stats_sync syncp; struct ena_stats_dev dev_stats; + struct ena_admin_eni_stats eni_stats; + bool eni_stats_supported; /* last queue index that was checked for uncompleted tx packets */ u32 last_monitored_tx_qid; @@ -462,6 +466,8 @@ void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); +int ena_update_hw_stats(struct ena_adapter *adapter); + int ena_update_queue_sizes(struct ena_adapter *adapter, u32 new_tx_size, u32 new_rx_size); diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h index f80d2a47fa94a..426e57e10a7f0 100755 --- a/drivers/amazon/net/ena/ena_pci_id_tbl.h +++ b/drivers/amazon/net/ena/ena_pci_id_tbl.h @@ -53,10 +53,15 @@ #define PCI_DEV_ID_ENA_LLQ_VF 0xec21 #endif +#ifndef PCI_DEV_ID_ENA_RESRV0 +#define PCI_DEV_ID_ENA_RESRV0 0x0051 +#endif + #define ENA_PCI_ID_TABLE_ENTRY(devid) \ {PCI_DEVICE(PCI_VENDOR_ID_AMAZON, devid)}, static const struct pci_device_id ena_pci_tbl[] = { + ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_RESRV0) ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_PF) ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_LLQ_PF) ENA_PCI_ID_TABLE_ENTRY(PCI_DEV_ID_ENA_VF) diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 3a801efcec67e..f155f2c65b59f 100755 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -652,7 +652,9 @@ do { \ #endif #endif -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)) || \ + (RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2)) #define HAVE_NETDEV_XMIT_MORE #endif @@ -696,7 +698,7 @@ do { \ #define ENA_XDP_SUPPORT #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,5,0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) #define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER #endif From 6cc87e8a634026269b58298bd77b61fffabd1ae2 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 5 Oct 2020 17:06:13 +0000 Subject: [PATCH 062/737] drivers/amazon: config: don't use '--help--' anymore --help-- has been deprecated. Replace it with plain 'help'. Signed-off-by: Frank van der Linden --- drivers/amazon/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig index eb0f5450bb1d8..a51d1cfea64b4 100644 --- a/drivers/amazon/Kconfig +++ b/drivers/amazon/Kconfig @@ -6,7 +6,7 @@ config AMAZON_DRIVER_UPDATES bool "Amazon Driver Updates" default y depends on PCI || EXPERIMENTAL - ---help--- + help Amazon driver updates includes out-of-tree drivers and/or modifeid versions of the drivers present in the stable kernel tree. @@ -15,7 +15,7 @@ if AMAZON_DRIVER_UPDATES config AMAZON_ENA_ETHERNET tristate "Elastic Network Adapter (ENA) support" depends on PCI_MSI && !ENA_ETHERNET - ---help--- + help This driver supports Elastic Network Adapter (ENA) To compile this driver as a module, choose M here. From 22d4bd8ae6676f52c991bdfcdbb3b6f04203be51 Mon Sep 17 00:00:00 2001 From: Andy Strohman Date: Tue, 26 May 2020 22:23:08 +0000 Subject: [PATCH 063/737] lustre: restore mgc binding for sptlrpc This patch came from the b2_10 branch of AmazonFSxLustreClient repo. The patch from that repo is: LU-10937 mgc: restore mgc binding for sptlrpc The work for LU-9034 mapped config logs to separate mgc devices. This change prevented the ability to configure sptlrpc. A later work around was introduced in LU-9567. 
Recently it was reported that the work around introduced can now cause a MGC failover panic. This patch is the proper fix in that the sptlrpc is properly bound to an mgc device. The sptlrpc config record expects 2 pieces of data: * [0]: fs_name/target_name, * [1]: rule string What was happening is that when you set cfg_instance it was used to create a new instance name of the form fsname-%p. For sptlrpc it expects it to only be fsname. The solution is to test if the config record is for sptlrpc and in that can keep the first record field as is. With this change we can drop cfg_obdname which only sptlrpc used. Test-Parameters: testlist=sanity-gss envdefinitions=ONLY=1,SHARED_KEY=true Test-Parameters: testlist=sanity-sec envdefinitions=SHARED_KEY=true Change-Id: I785f98264c6269f95c0d9a564b731d1b6ff0bcee Signed-off-by: James Simmons Reviewed-on: https://review.whamcloud.com/33311 Tested-by: Jenkins Reviewed-by: Sebastien Buisson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin Signed-off-by: Andy Strohman --- drivers/staging/lustrefsx/lustre/include/obd_class.h | 1 - drivers/staging/lustrefsx/lustre/mgc/mgc_request.c | 7 +------ drivers/staging/lustrefsx/lustre/obdclass/obd_config.c | 7 +++++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index 729d34ad91fe2..5223eedaae96c 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -173,7 +173,6 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { - char *cfg_obdname; void *cfg_instance; struct super_block *cfg_sb; struct obd_uuid cfg_uuid; diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c index 151283328e485..a2a2bdd1f0732 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -235,10 +235,8 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, /* Keep the mgc around until we are done */ cld->cld_mgcexp = class_export_get(obd->obd_self_export); - if (cld_is_sptlrpc(cld)) { + if (cld_is_sptlrpc(cld)) sptlrpc_conf_log_start(logname); - cld->cld_cfg.cfg_obdname = obd->obd_name; - } spin_lock(&config_list_lock); list_add(&cld->cld_list_chain, &config_llog_list); @@ -297,9 +295,6 @@ static struct config_llog_data *config_log_find_or_add(struct obd_device *obd, lcfg.cfg_instance = sb != NULL ? (void *)sb : (void *)obd; - if (type == CONFIG_T_SPTLRPC) - lcfg.cfg_instance = NULL; - cld = config_log_find(logname, &lcfg); if (unlikely(cld != NULL)) return cld; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 84f6a7ad0c146..936f1db0d70be 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -1558,6 +1558,7 @@ int class_config_llog_handler(const struct lu_env *env, lustre_cfg_bufs_init(&bufs, lcfg); if (cfg->cfg_instance && + lcfg->lcfg_command != LCFG_SPTLRPC_CONF && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + sizeof(cfg->cfg_instance) * 2 + 4; @@ -1586,14 +1587,16 @@ int class_config_llog_handler(const struct lu_env *env, * moving them to index [1] and [2], and insert MGC's * obdname at index [0]. 
*/ - if (cfg->cfg_instance == NULL && + if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + struct obd_device *obd = cfg->cfg_instance; + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], bufs.lcfg_buflen[1]); lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], bufs.lcfg_buflen[0]); lustre_cfg_bufs_set_string(&bufs, 0, - cfg->cfg_obdname); + obd->obd_name); } /* Add net info to setup command From 32335540535a9fcdb22045f6d8a5e42ea267baba Mon Sep 17 00:00:00 2001 From: Andy Strohman Date: Mon, 22 Jun 2020 22:57:19 +0000 Subject: [PATCH 064/737] Update lustre to tag v2.10.8-5 in AmazonFSxLustreClient Signed-off-by: Andy Strohman --- drivers/staging/lustrefsx/config.h | 122 +++--- .../lustrefsx/libcfs/include/libcfs/libcfs.h | 10 + .../libcfs/include/libcfs/libcfs_prim.h | 4 +- .../libcfs/include/libcfs/libcfs_ptask.h | 15 +- .../libcfs/include/libcfs/linux/linux-misc.h | 18 + .../libcfs/include/libcfs/linux/linux-time.h | 1 - .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 33 +- .../libcfs/libcfs/linux/linux-curproc.c | 4 +- .../libcfs/libcfs/linux/linux-debug.c | 41 +- .../libcfs/libcfs/linux/linux-prim.c | 2 +- .../libcfs/libcfs/linux/linux-tracefile.c | 3 +- .../lustrefsx/libcfs/libcfs/util/l_ioctl.c | 2 +- .../lustrefsx/libcfs/libcfs/watchdog.c | 12 +- drivers/staging/lustrefsx/lnet/LICENSE | 363 +++++++++++++++++ .../lustrefsx/lnet/include/lnet/lib-lnet.h | 24 +- .../lustrefsx/lnet/include/lnet/lnetctl.h | 2 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 77 ++-- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 36 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 18 +- .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 13 +- .../lustrefsx/lnet/klnds/socklnd/socklnd_cb.c | 5 +- .../lnet/klnds/socklnd/socklnd_proto.c | 2 +- .../staging/lustrefsx/lnet/lnet/acceptor.c | 10 +- drivers/staging/lustrefsx/lnet/lnet/config.c | 12 +- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 89 +++-- drivers/staging/lustrefsx/lnet/lnet/router.c | 2 +- .../staging/lustrefsx/lnet/lnet/router_proc.c | 2 +- drivers/staging/lustrefsx/lustre/LICENSE | 372 ++++++++++++++++++ .../lustrefsx/lustre/include/dt_object.h | 2 - .../lustre/include/lustre/lustre_idl.h | 15 +- .../lustre/include/lustre/lustre_user.h | 62 ++- .../lustrefsx/lustre/include/lustre_compat.h | 100 ++--- .../lustrefsx/lustre/include/lustre_dlm.h | 21 +- .../lustre/include/lustre_dlm_flags.h | 8 - .../lustrefsx/lustre/include/lustre_fid.h | 1 - .../lustrefsx/lustre/include/lustre_idmap.h | 2 + .../staging/lustrefsx/lustre/include/obd.h | 4 +- .../lustrefsx/lustre/include/obd_support.h | 4 + .../lustre/include/uapi/linux/lustre_ioctl.h | 1 + .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 40 +- .../staging/lustrefsx/lustre/ldlm/ldlm_lock.c | 4 +- .../lustrefsx/lustre/ldlm/ldlm_lockd.c | 117 +++--- .../lustrefsx/lustre/ldlm/ldlm_request.c | 96 +++-- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 1 + drivers/staging/lustrefsx/lustre/llite/file.c | 38 +- .../staging/lustrefsx/lustre/llite/glimpse.c | 10 - .../lustrefsx/lustre/llite/lcommon_cl.c | 69 ++-- .../lustrefsx/lustre/llite/llite_lib.c | 208 +++++----- .../lustrefsx/lustre/llite/llite_mmap.c | 15 +- .../lustrefsx/lustre/llite/lproc_llite.c | 8 +- .../staging/lustrefsx/lustre/llite/namei.c | 63 +-- .../staging/lustrefsx/lustre/llite/vvp_io.c | 15 +- .../lustrefsx/lustre/llite/vvp_object.c | 6 +- .../staging/lustrefsx/lustre/llite/xattr.c | 19 +- .../lustrefsx/lustre/llite/xattr_security.c | 20 +- .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 6 +- .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 35 +- 
.../staging/lustrefsx/lustre/lov/lov_obd.c | 46 ++- .../staging/lustrefsx/lustre/lov/lov_object.c | 7 +- .../staging/lustrefsx/lustre/lov/lov_pack.c | 29 +- .../staging/lustrefsx/lustre/mdc/mdc_lib.c | 6 +- .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 8 +- .../staging/lustrefsx/lustre/mdc/mdc_reint.c | 7 +- .../lustrefsx/lustre/mdc/mdc_request.c | 30 +- drivers/staging/lustrefsx/lustre/nodist | 9 + .../lustrefsx/lustre/obdclass/class_obd.c | 6 +- .../lustrefsx/lustre/obdclass/dt_object.c | 18 - .../lustrefsx/lustre/obdclass/genops.c | 8 +- .../lustre/obdclass/linux/linux-sysctl.c | 2 +- .../lustrefsx/lustre/obdclass/llog_cat.c | 3 +- .../lustrefsx/lustre/obdclass/lu_object.c | 2 +- .../lustrefsx/lustre/obdclass/obd_config.c | 70 +++- .../lustre/obdclass/obd_mount_server.c | 19 +- .../staging/lustrefsx/lustre/obdclass/obdo.c | 13 +- .../lustrefsx/lustre/obdecho/echo_client.c | 45 +-- .../staging/lustrefsx/lustre/osc/lproc_osc.c | 2 +- .../lustrefsx/lustre/osc/osc_request.c | 12 +- .../staging/lustrefsx/lustre/ptlrpc/client.c | 10 +- .../staging/lustrefsx/lustre/ptlrpc/import.c | 18 +- .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 2 +- .../staging/lustrefsx/lustre/ptlrpc/recover.c | 2 + .../lustrefsx/lustre/ptlrpc/sec_bulk.c | 6 +- .../staging/lustrefsx/lustre/ptlrpc/wirehdr.c | 44 +++ .../staging/lustrefsx/lustre/target/out_lib.c | 4 - .../lustrefsx/lustre/target/tgt_grant.c | 15 +- .../lustrefsx/lustre/target/tgt_main.c | 1 - drivers/staging/lustrefsx/undef.h | 54 +++ 87 files changed, 1917 insertions(+), 865 deletions(-) create mode 100644 drivers/staging/lustrefsx/lnet/LICENSE create mode 100644 drivers/staging/lustrefsx/lustre/LICENSE create mode 100644 drivers/staging/lustrefsx/lustre/nodist create mode 100644 drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 8925156518a43..d3295c5726e99 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -139,9 +139,6 @@ /* current_time() has replaced CURRENT_TIME */ #define HAVE_CURRENT_TIME 1 -/* inode times are timespec64 */ -#define HAVE_INODE_TIME_64BIT 1 - /* dcache_lock is exist */ /* #undef HAVE_DCACHE_LOCK */ @@ -296,7 +293,7 @@ #define HAVE_FILLDIR_USE_CTX 1 /* fpu/api.h is present */ -#define HAVE_FPU_API_HEADER 1 +/* #undef HAVE_FPU_API_HEADER */ /* struct file_system_type has mount field */ #define HAVE_FSTYPE_MOUNT 1 @@ -328,6 +325,9 @@ /* get_user_pages takes gup_flags in arguments */ #define HAVE_GET_USER_PAGES_GUP_FLAGS 1 +/* get_user_pages takes gup_flags in arguments with 7 args */ +/* #undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS */ + /* struct group_info has member gid */ #define HAVE_GROUP_INFO_GID 1 @@ -364,6 +364,9 @@ /* struct ib_device.attrs is defined */ #define HAVE_IB_DEVICE_ATTRS 1 +/* if struct ib_device_ops is defined */ +#define HAVE_IB_DEVICE_OPS 1 + /* ib_get_dma_mr is defined */ /* #undef HAVE_IB_GET_DMA_MR */ @@ -376,9 +379,15 @@ /* ib_map_mr_sg has 5 arguments */ #define HAVE_IB_MAP_MR_SG_5ARGS 1 +/* ib_post_send and ib_post_recv have const parameters */ +#define HAVE_IB_POST_SEND_RECV_CONST 1 + /* struct ib_rdma_wr is defined */ #define HAVE_IB_RDMA_WR 1 +/* if ib_sg_dma_address wrapper exists */ +/* #undef HAVE_IB_SG_DMA_ADDRESS */ + /* inode_operations .getattr member function can gather advance stats */ #define HAVE_INODEOPS_ENHANCED_GETATTR 1 @@ -433,6 +442,9 @@ /* inode_operations has {get,set,remove}xattr members */ /* #undef HAVE_IOP_XATTR */ +/* if iov_iter has member type */ 
+#define HAVE_IOV_ITER_HAS_TYPE_MEMBER 1 + /* iov_iter_init handles directional tag */ #define HAVE_IOV_ITER_INIT_DIRECTION 1 @@ -442,9 +454,15 @@ /* iov_iter_truncate exists */ #define HAVE_IOV_ITER_TRUNCATE 1 +/* if iov_iter_type exists */ +#define HAVE_IOV_ITER_TYPE 1 + /* is_sxid is defined */ #define HAVE_IS_SXID 1 +/* struct address_space has i_pages */ +#define HAVE_I_PAGES 1 + /* i_uid_read is present */ #define HAVE_I_UID_READ 1 @@ -454,6 +472,9 @@ /* 'struct sock' accept function requires bool argument */ #define HAVE_KERN_SOCK_ACCEPT_FLAG_ARG 1 +/* 'getname' has two args */ +#define HAVE_KERN_SOCK_GETNAME_2ARGS 1 + /* struct key_match_data exist */ #define HAVE_KEY_MATCH_DATA 1 @@ -461,7 +482,7 @@ #define HAVE_KEY_PAYLOAD_DATA_ARRAY 1 /* key_type->instantiate has two args */ -/* #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS */ +#define HAVE_KEY_TYPE_INSTANTIATE_2ARGS 1 /* ki_left exist */ /* #undef HAVE_KIOCB_KI_LEFT */ @@ -494,6 +515,9 @@ /* kernel has kstrtoul */ #define HAVE_KSTRTOUL 1 +/* kernel has ksys_close */ +#define HAVE_KSYS_CLOSE 1 + /* kthread_worker found */ /* #undef HAVE_KTHREAD_WORK */ @@ -551,6 +575,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_LINUX_RANDOM_H 1 +/* if linux/selinux.h exists */ +#define HAVE_LINUX_SELINUX_IS_ENABLED 1 + /* Define to 1 if you have the header file. */ #define HAVE_LINUX_TYPES_H 1 @@ -560,6 +587,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_LINUX_VERSION_H 1 +/* lock_manager_operations has lm_compare_owner */ +/* #undef HAVE_LM_COMPARE_OWNER */ + /* lock-manager ops renamed to lm_xxx */ #define HAVE_LM_XXX_LOCK_MANAGER_OPS 1 @@ -613,7 +643,7 @@ #define HAVE_PAGEVEC_INIT_ONE_PARAM 1 /* have PCLMULQDQ instruction */ -#define HAVE_PCLMULQDQ 1 +/* #undef HAVE_PCLMULQDQ */ /* percpu_counter_init uses GFP_* flag */ #define HAVE_PERCPU_COUNTER_INIT_GFP_FLAG 1 @@ -640,10 +670,10 @@ #define HAVE_PROTECT_I_NLINK 1 /* have quota64 */ -/* #undef HAVE_QUOTA64 */ +#define HAVE_QUOTA64 1 /* radix_tree_exceptional_entry exist */ -#define HAVE_RADIX_EXCEPTION_ENTRY 1 +/* #undef HAVE_RADIX_EXCEPTION_ENTRY */ /* rdma_create_id wants 4 args */ /* #undef HAVE_RDMA_CREATE_ID_4ARG */ @@ -685,10 +715,10 @@ /* #undef HAVE_SECURITY_IINITSEC_QSTR */ /* support for selinux */ -#define HAVE_SELINUX 1 +/* #undef HAVE_SELINUX */ /* Define to 1 if you have the header file. */ -#define HAVE_SELINUX_SELINUX_H 1 +/* #undef HAVE_SELINUX_SELINUX_H */ /* support server */ /* #undef HAVE_SERVER_SUPPORT */ @@ -739,6 +769,9 @@ /* stacktrace_ops.warning is exist */ /* #undef HAVE_STACKTRACE_WARNING */ +/* stack_trace_print() exists */ +#define HAVE_STACK_TRACE_PRINT 1 + /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 @@ -800,7 +833,7 @@ /* #undef HAVE_TCP_SENDPAGE_USE_SOCKET */ /* timer_setup has replaced setup_timer */ -#define HAVE_TIMER_SETUP +#define HAVE_TIMER_SETUP 1 /* 'struct timespec64' is available */ #define HAVE_TIMESPEC64 1 @@ -814,21 +847,24 @@ /* topology_sibling_cpumask is available */ #define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 +/* if totalram_pages is a function */ +#define HAVE_TOTALRAM_PAGES_AS_FUNC 1 + /* kernel export truncate_complete_page */ /* #undef HAVE_TRUNCATE_COMPLETE_PAGE */ /* kernel has truncate_inode_pages_final */ #define HAVE_TRUNCATE_INODE_PAGES_FINAL 1 +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#define HAVE_UAPI_LINUX_MOUNT_H 1 + /* uidgid.h is present */ #define HAVE_UIDGID_HEADER 1 /* Define to 1 if you have the header file. 
*/ #define HAVE_UNISTD_H 1 -/* xattr_handler has a name member */ -#define HAVE_XATTR_HANDLER_NAME 1 - /* kernel has vfs_rename with 5 args */ /* #undef HAVE_VFS_RENAME_5ARGS */ @@ -844,6 +880,9 @@ /* virtual_address has been replaced by address field */ #define HAVE_VM_FAULT_ADDRESS 1 +/* if vm_fault_t type exists */ +#define HAVE_VM_FAULT_T 1 + /* 'struct vm_operations' remove struct vm_area_struct argument */ #define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 @@ -856,9 +895,15 @@ /* needs inode parameter */ #define HAVE_XATTR_HANDLER_INODE_PARAM 1 +/* xattr_handler has a name member */ +#define HAVE_XATTR_HANDLER_NAME 1 + /* handler pointer is parameter */ /* #undef HAVE_XATTR_HANDLER_SIMPLIFIED */ +/* xa_is_value exist */ +#define HAVE_XA_IS_VALUE 1 + /* Have zap_add_by_dnode() in ZFS */ /* #undef HAVE_ZAP_ADD_BY_DNODE */ @@ -874,39 +919,6 @@ /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ -/* struct address_space uses i_pages and xa_lock */ -/* #undef HAVE_ADDRESS_SPACE_IPAGES */ - -/* struct address_space was converted to an Xarray */ -#define HAVE_ADDRESS_SPACE_XARRAY 1 - -/* posix acl uses the refcount interface */ -#define HAVE_POSIX_ACL_REFCOUNT 1 - -/* sys_close was converted to ksys_close for kernel use */ -#define HAVE_KSYS_CLOSE 1 - -/* kernel_get{sock,peer}name was converted to return the sockaddr length */ -#define HAVE_KERNSOCK_RETURNSLEN 1 - -/* the 'opened' argument to finish_open and atomic_open was removed */ -#define HAVE_ATOMIC_OPEN_NO_OPENED 1 - -/* totalram_pages was turned in to a function */ -#define HAVE_TOTALRAM_PAGES_FUNC 1 - -/* vm_fault_t exists */ -#define HAVE_VM_FAULT_T 1 - -/* Common stacktrace infrastructure exists */ -#define HAVE_COMMON_STACKTRACE 1 - -/* changed padata interface in 5.4 */ -#define HAVE_PADATA_INTERFACE_54 - -/* changed padata interface in 5.6 (and the 5.4 -stable branch) */ -#define HAVE_PADATA_INTERFACE_56 - /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ @@ -933,10 +945,10 @@ #define LUSTRE_MINOR 10 /* Third number in the Lustre version */ -#define LUSTRE_PATCH 5 +#define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.10.5" +#define LUSTRE_VERSION_STRING "2.10.8" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -948,10 +960,10 @@ #define MKE2FS "mke2fs" /* need pclmulqdq based crc32c */ -/* #undef NEED_CRC32C_ACCEL */ +#define NEED_CRC32C_ACCEL 1 /* need pclmulqdq based crc32 */ -/* #undef NEED_CRC32_ACCEL */ +#define NEED_CRC32_ACCEL 1 /* 'ktime_get_real_ns' is not available */ /* #undef NEED_KTIME_GET_REAL_NS */ @@ -963,13 +975,13 @@ #define PACKAGE "lustre" /* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "https://jira.hpdd.intel.com/" +#define PACKAGE_BUGREPORT "https://jira.whamcloud.com/" /* Define to the full name of this package. */ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Lustre 2.10.5" +#define PACKAGE_STRING "Lustre 2.10.8" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -978,7 +990,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.10.5" +#define PACKAGE_VERSION "2.10.8" /* name of parallel fsck program */ #define PFSCK "fsck" @@ -1019,7 +1031,7 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.10.5" +#define VERSION "2.10.8" /* zfs fix version */ /* #undef ZFS_FIX */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h index 23f29d53224ee..8055d37510921 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -72,6 +72,16 @@ void lc_watchdog_disable(struct lc_watchdog *lcw); /* Clean up the watchdog */ void lc_watchdog_delete(struct lc_watchdog *lcw); +#ifdef HAVE_TOTALRAM_PAGES_AS_FUNC + #ifndef cfs_totalram_pages + #define cfs_totalram_pages() totalram_pages() + #endif +#else + #ifndef cfs_totalram_pages + #define cfs_totalram_pages() totalram_pages + #endif +#endif + /* need both kernel and user-land acceptor */ #define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 #define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h index 32a629e25eb77..16bda0c460ebf 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -44,9 +44,9 @@ #if BITS_PER_LONG == 32 /* limit to lowmem on 32-bit systems */ # define NUM_CACHEPAGES \ - min(TOTALRAM_PAGES, 1UL << (30 - PAGE_SHIFT) * 3 / 4) + min(cfs_totalram_pages(), 1UL << (30 - PAGE_SHIFT) * 3 / 4) #else -# define NUM_CACHEPAGES TOTALRAM_PAGES +# define NUM_CACHEPAGES cfs_totalram_pages() #endif static inline unsigned int memory_pressure_get(void) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h index b7c791cdf1ebb..85925492dd5df 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h @@ -9,6 +9,16 @@ #include #include #include + +/* + * Unconditionaly disable PADATA. + * + * Padata is needed for PIO client feature. This feature is disabled by default + * and was removed from Lustre code during 2.13 development (2b0a34fe43bf). + * Instead of adapting the code to Linux 5.4+ change, just disable it. 
+ */ +#undef CONFIG_PADATA + #ifdef CONFIG_PADATA #include #else @@ -25,12 +35,7 @@ struct padata_instance {}; struct cfs_ptask_engine { struct padata_instance *pte_pinst; -#ifdef HAVE_PADATA_INTERFACE_56 - struct padata_shell *pte_pshell; -#endif -#ifndef HAVE_PADATA_INTERFACE_54 struct workqueue_struct *pte_wq; -#endif struct notifier_block pte_notifier; int pte_weight; }; diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h index 0ad585f913c94..8b3d398459c74 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -34,6 +34,8 @@ #define __LIBCFS_LINUX_MISC_H__ #include +#include + #ifdef HAVE_SYSCTL_CTLNAME #define INIT_CTL_NAME .ctl_name = CTL_UNNUMBERED, #define INIT_STRATEGY .strategy = &sysctl_intvec, @@ -42,6 +44,22 @@ #define INIT_STRATEGY #endif +#ifndef HAVE_IOV_ITER_TYPE +#ifdef HAVE_IOV_ITER_HAS_TYPE_MEMBER +#define iter_is_iovec(iter) ((iter)->type & ITER_IOVEC) +#define iov_iter_is_kvec(iter) ((iter)->type & ITER_KVEC) +#define iov_iter_is_bvec(iter) ((iter)->type & ITER_BVEC) +#define iov_iter_is_pipe(iter) ((iter)->type & ITER_PIPE) +#define iov_iter_is_discard(iter) ((iter)->type & ITER_DISCARD) +#else +#define iter_is_iovec(iter) 1 +#define iov_iter_is_kvec(iter) 0 +#define iov_iter_is_bvec(iter) 0 +#define iov_iter_is_pipe(iter) 0 +#define iov_iter_is_discard(iter) 0 +#endif +#endif /* HAVE_IOV_ITER_TYPE */ + #ifndef HAVE_UIDGID_HEADER #ifndef _LINUX_UIDGID_H diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index fa972ff9ca16d..64613de7bd6a8 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -280,7 +280,6 @@ static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) * One jiffy */ #define CFS_DURATION_T "%ld" - #ifdef HAVE_TIMER_SETUP #define cfs_timer_cb_arg_t struct timer_list * #define cfs_from_timer(var, callback_timer, timer_fieldname) \ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c index 636a93f02e681..275c01b74ad4e 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c @@ -137,13 +137,7 @@ static int cfs_do_parallel(struct cfs_ptask_engine *engine, ptask->pt_result = -EINPROGRESS; retry: -#ifdef HAVE_PADATA_INTERFACE_56 - rc = padata_do_parallel(engine->pte_pshell, padata, &ptask->pt_cbcpu); -#elif HAVE_PADATA_INTERFACE_54 - rc = padata_do_parallel(engine->pte_pinst, padata, &ptask->pt_cbcpu); -#else rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); -#endif if (rc == -EBUSY && cfs_ptask_is_retry(ptask)) { /* too many tasks already in queue */ schedule_timeout_uninterruptible(1); @@ -332,18 +326,14 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, { cpumask_var_t all_mask; cpumask_var_t par_mask; -#ifndef HAVE_PADATA_INTERFACE_54 unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE; -#endif int rc; get_online_cpus(); -#ifndef HAVE_PADATA_INTERFACE_54 engine->pte_wq = alloc_workqueue(name, wq_flags, 1); if (engine->pte_wq == NULL) GOTO(err, rc = -ENOMEM); -#endif if (!alloc_cpumask_var(&all_mask, GFP_KERNEL)) GOTO(err_destroy_workqueue, rc = -ENOMEM); @@ -394,25 +384,15 @@ 
static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, } engine->pte_weight = cpumask_weight(par_mask); -#ifdef HAVE_PADATA_INTERFACE_54 - engine->pte_pinst = padata_alloc_possible(name); -#else engine->pte_pinst = padata_alloc_possible(engine->pte_wq); -#endif if (engine->pte_pinst == NULL) GOTO(err_free_par_mask, rc = -ENOMEM); -#ifdef HAVE_PADATA_INTERFACE_56 - engine->pte_pshell = padata_alloc_shell(engine->pte_pinst); - if (engine->pte_pshell == NULL) - GOTO(err_free_padata, rc = -ENOMEM); -#endif - engine->pte_notifier.notifier_call = cfs_ptask_cpumask_change_notify; rc = padata_register_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); if (rc) - GOTO(err_free_pashell, rc); + GOTO(err_free_padata, rc); rc = cfs_ptengine_set_cpumask(engine, par_mask); if (rc) @@ -431,10 +411,6 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, err_unregister: padata_unregister_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); -err_free_pashell: -#ifdef HAVE_PADATA_INTERFACE_56 - padata_free_shell(engine->pte_pshell); -#endif err_free_padata: padata_free(engine->pte_pinst); err_free_par_mask: @@ -442,10 +418,8 @@ static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, err_free_all_mask: free_cpumask_var(all_mask); err_destroy_workqueue: -#ifndef HAVE_PADATA_INTERFACE_54 destroy_workqueue(engine->pte_wq); err: -#endif put_online_cpus(); return rc; } @@ -455,13 +429,8 @@ static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) padata_stop(engine->pte_pinst); padata_unregister_cpumask_notifier(engine->pte_pinst, &engine->pte_notifier); -#ifdef HAVE_PADATA_INTERFACE_56 - padata_free_shell(engine->pte_pshell); -#endif padata_free(engine->pte_pinst); -#ifndef HAVE_PADATA_INTERFACE_54 destroy_workqueue(engine->pte_wq); -#endif } #else /* !CONFIG_PADATA */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c index 7b2e46e61b1bf..38ca4bc97be98 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -149,7 +149,9 @@ static int cfs_access_process_vm(struct task_struct *tsk, int bytes, rc, offset; void *maddr; -#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS) + rc = get_user_pages(tsk, mm, addr, 1, write ? FOLL_WRITE : 0, &page, &vma); +#elif defined(HAVE_GET_USER_PAGES_GUP_FLAGS) rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page, &vma); #elif defined(HAVE_GET_USER_PAGES_6ARG) rc = get_user_pages(addr, 1, write, 1, &page, &vma); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c index cbd2187a9f63f..048b2f34df5ba 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -118,32 +118,59 @@ EXPORT_SYMBOL(lbug_with_loc); #define MAX_ST_ENTRIES 100 static DEFINE_SPINLOCK(st_lock); +/* + * Linux v5.1-rc5 214d8ca6ee ("stacktrace: Provide common infrastructure") + * CONFIG_ARCH_STACKWALK indicates that save_stack_trace_tsk symbol is not + * exported. Use symbol_get() to find if save_stack_trace_tsk is available. 
+ */ +#ifdef CONFIG_ARCH_STACKWALK +typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr); +static stack_trace_save_tsk_t *task_dump_stack; +#endif + static void libcfs_call_trace(struct task_struct *tsk) { +#ifdef CONFIG_ARCH_STACKWALK static unsigned long entries[MAX_ST_ENTRIES]; -#ifdef HAVE_COMMON_STACKTRACE - unsigned int len; + unsigned int i, nr_entries; + + if (!task_dump_stack) + task_dump_stack = (stack_trace_save_tsk_t *) + symbol_get("stack_trace_save_tsk"); + + spin_lock(&st_lock); + pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, + init_utsname()->release, init_utsname()->version); + pr_info("Call Trace TBD:\n"); + if (task_dump_stack) { + nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); + for (i = 0; i < nr_entries; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); + } + spin_unlock(&st_lock); #else struct stack_trace trace; + static unsigned long entries[MAX_ST_ENTRIES]; trace.nr_entries = 0; trace.max_entries = MAX_ST_ENTRIES; trace.entries = entries; trace.skip = 0; -#endif spin_lock(&st_lock); pr_info("Pid: %d, comm: %.20s %s %s\n", tsk->pid, tsk->comm, init_utsname()->release, init_utsname()->version); pr_info("Call Trace:\n"); -#ifdef HAVE_COMMON_STACKTRACE - len = stack_trace_save(entries, MAX_ST_ENTRIES, 2); - stack_trace_print(entries, len, 1); -#else save_stack_trace_tsk(tsk, &trace); +#ifdef HAVE_STACK_TRACE_PRINT + stack_trace_print(trace.entries, trace.nr_entries, 0); +#else print_stack_trace(&trace, 0); #endif spin_unlock(&st_lock); +#endif } #else /* !CONFIG_STACKTRACE */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c index a7d5679412f6c..e63f7317485d9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -106,7 +106,7 @@ int cfs_kernel_write(struct file *filp, const void *buf, size_t count, mm_segment_t __old_fs = get_fs(); int rc; - set_fs(get_ds()); + set_fs(KERNEL_DS); rc = vfs_write(filp, (__force const char __user *)buf, count, pos); set_fs(__old_fs); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c index 274179dd54fe9..e0fd4c0de04f1 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -34,7 +34,6 @@ #define LUSTRE_TRACEFILE_PRIVATE #include -#include #include "tracefile.h" /* percents to share the total debug memory for each type */ @@ -267,7 +266,7 @@ void cfs_print_to_console(struct ptldebug_header *hdr, int mask, int cfs_trace_max_debug_mb(void) { - int total_mb = (TOTALRAM_PAGES >> (20 - PAGE_SHIFT)); + int total_mb = (cfs_totalram_pages() >> (20 - PAGE_SHIFT)); return MAX(512, (total_mb * 80)/100); } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c index ceec8703a829a..c3d5556ab1557 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -3,7 +3,7 @@ * * Copyright (c) 2014, Intel Corporation. 
* - * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * This file is part of Lustre, https://wiki.whamcloud.com/ * * Portals is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c index f7170860f0277..f9e4de58b8ed2 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -172,9 +172,9 @@ static void lcw_dump_stack(struct lc_watchdog *lcw) delta_time = current_time - lcw_last_watchdog_time; if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) { - LCONSOLE_WARN("Service thread pid %u was inactive for %lu.%.02lus. Watchdog stack traces are limited to 3 per %d seconds, skipping this one.\n", + LCONSOLE_WARN("Service thread pid %u was inactive for %llu.%.02lus. Watchdog stack traces are limited to 3 per %d seconds, skipping this one.\n", (int)lcw->lcw_pid, - (unsigned long)timediff.tv_sec, + (unsigned long long)timediff.tv_sec, timediff.tv_nsec / (NSEC_PER_SEC / 100), libcfs_watchdog_ratelimit); } else { @@ -186,9 +186,9 @@ static void lcw_dump_stack(struct lc_watchdog *lcw) lcw_recent_watchdog_count = 0; } - LCONSOLE_WARN("Service thread pid %u was inactive for %lu.%.02lus. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n", + LCONSOLE_WARN("Service thread pid %u was inactive for %llu.%.02lus. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n", (int)lcw->lcw_pid, - (unsigned long)timediff.tv_sec, + (unsigned long long)timediff.tv_sec, timediff.tv_nsec / (NSEC_PER_SEC / 100)); lcw_dump(lcw); } @@ -386,9 +386,9 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message) struct timespec64 timediff; timediff = ktime_to_timespec64(lapse); - LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).\n", + LCONSOLE_WARN("Service thread pid %u %s after %llu.%.02lus. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).\n", lcw->lcw_pid, message, - (unsigned long)timediff.tv_sec, + (unsigned long long)timediff.tv_sec, timediff.tv_nsec / (NSEC_PER_SEC / 100)); } lcw->lcw_last_touched = newtime; diff --git a/drivers/staging/lustrefsx/lnet/LICENSE b/drivers/staging/lustrefsx/lnet/LICENSE new file mode 100644 index 0000000000000..92728f4d300d2 --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/LICENSE @@ -0,0 +1,363 @@ +Each file in this distribution should contain a header stating the +copyright owner(s), and the licensing terms for that module. Some +files are not eligible for copyright protection, and contain neither. + +All files in this subtree are licensed under the terms and conditions +of the GNU General Public License version 2. + +Reproduced below is the GPL v2, and Linus's clarifying statement from +the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". 
+ Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. 
+ +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h index 48db6dd08a2a3..59386c0fdee2f 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -71,6 +71,18 @@ extern struct lnet the_lnet; /* THE network */ /** exclusive lock */ #define LNET_LOCK_EX CFS_PERCPT_LOCK_EX +#ifdef HAVE_KERN_SOCK_GETNAME_2ARGS +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr) +#else +#define lnet_kernel_getpeername(sock, addr, addrlen) \ + kernel_getpeername(sock, addr, addrlen) +#define lnet_kernel_getsockname(sock, addr, addrlen) \ + kernel_getsockname(sock, addr, addrlen) +#endif + static inline int lnet_is_route_alive(struct lnet_route *route) { if (!route->lr_gateway->lpni_alive) @@ -757,7 +769,7 @@ void lnet_register_lnd(struct lnet_lnd *lnd); void lnet_unregister_lnd(struct lnet_lnd *lnd); int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, - __u32 local_ip, __u32 peer_ip, int peer_port); + __u32 local_ip, __u32 peer_ip, int peer_port, struct net *ns); void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, __u32 peer_ip, int port); int lnet_count_acceptor_nets(void); @@ -766,8 +778,9 @@ int lnet_acceptor_port(void); int lnet_acceptor_start(void); void lnet_acceptor_stop(void); -int lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); -int lnet_ipif_enumerate(char ***names); +int lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask, + struct net *ns); +int lnet_ipif_enumerate(char ***names, struct net *ns); void lnet_ipif_free_enumeration(char **names, int n); int lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); int lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); @@ -775,11 +788,12 @@ int lnet_sock_getaddr(struct socket *socket, bool remote, __u32 *ip, int *port); int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); -int lnet_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog); +int lnet_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog, + struct net *ns); int lnet_sock_accept(struct socket **newsockp, struct socket *sock); int lnet_sock_connect(struct socket **sockp, int *fatal, __u32 local_ip, int local_port, - __u32 peer_ip, int 
peer_port); + __u32 peer_ip, int peer_port, struct net *ns); int lnet_peers_start_down(void); int lnet_peer_buffer_credits(struct lnet_net *net); diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h index bdd0cb4f84083..4328135c5ec72 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h @@ -1,5 +1,5 @@ /* - * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * This file is part of Lustre, https://wiki.whamcloud.com/ * * Portals is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c index 110b6e699f095..ba4090556550f 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -728,6 +728,19 @@ kiblnd_get_scheduler(int cpt) return NULL; } +static unsigned int kiblnd_send_wrs(struct kib_conn *conn) +{ + /* + * One WR for the LNet message + * And ibc_max_frags for the transfer WRs + */ + unsigned int ret = 1 + conn->ibc_max_frags; + + /* account for a maximum of ibc_queue_depth in-flight transfers */ + ret *= conn->ibc_queue_depth; + return ret; +} + kib_conn_t * kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, int state, int version) @@ -881,8 +894,6 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, init_qp_attr->event_handler = kiblnd_qp_event; init_qp_attr->qp_context = conn; - init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; init_qp_attr->cap.max_recv_sge = 1; init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; @@ -893,11 +904,14 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, conn->ibc_sched = sched; do { + init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); - if (!rc || init_qp_attr->cap.max_send_wr < 16) + if (!rc || conn->ibc_queue_depth < 2) break; - init_qp_attr->cap.max_send_wr -= init_qp_attr->cap.max_send_wr / 4; + conn->ibc_queue_depth--; } while (rc); if (rc) { @@ -910,9 +924,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, goto failed_2; } - if (init_qp_attr->cap.max_send_wr != IBLND_SEND_WRS(conn)) - CDEBUG(D_NET, "original send wr %d, created with %d\n", - IBLND_SEND_WRS(conn), init_qp_attr->cap.max_send_wr); + if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth) + CWARN("peer %s - queue depth reduced from %u to %u" + " to allow for qp creation\n", + libcfs_nid2str(peer_ni->ibp_nid), + peer_ni->ibp_queue_depth, + conn->ibc_queue_depth); LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); @@ -971,7 +988,6 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) { struct rdma_cm_id *cmid = conn->ibc_cmid; kib_peer_ni_t *peer_ni = conn->ibc_peer; - int rc; LASSERT (!in_interrupt()); LASSERT (atomic_read(&conn->ibc_refcount) == 0); @@ -1002,11 +1018,8 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) if (cmid != NULL && cmid->qp != NULL) rdma_destroy_qp(cmid); - if (conn->ibc_cq != NULL) { - rc = ib_destroy_cq(conn->ibc_cq); - if (rc != 0) - CWARN("Error destroying CQ: %d\n", rc); - } + if (conn->ibc_cq) + ib_destroy_cq(conn->ibc_cq); if 
(conn->ibc_rx_pages != NULL) kiblnd_unmap_rx_descs(conn); @@ -1656,10 +1669,17 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) /* Check for FMR or FastReg support */ fpo->fpo_is_fmr = 0; +#ifdef HAVE_IB_DEVICE_OPS + if (fpo->fpo_hdev->ibh_ibdev->ops.alloc_fmr && + fpo->fpo_hdev->ibh_ibdev->ops.dealloc_fmr && + fpo->fpo_hdev->ibh_ibdev->ops.map_phys_fmr && + fpo->fpo_hdev->ibh_ibdev->ops.unmap_fmr) { +#else if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { +#endif LCONSOLE_INFO("Using FMR for registration\n"); fpo->fpo_is_fmr = 1; } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { @@ -1805,8 +1825,7 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) fps = fpo->fpo_owner; if (fpo->fpo_is_fmr) { if (fmr->fmr_pfmr) { - rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); - LASSERT(!rc); + ib_fmr_pool_unmap(fmr->fmr_pfmr); fmr->fmr_pfmr = NULL; } @@ -2644,7 +2663,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_dev_need_failover(kib_dev_t *dev) +kiblnd_dev_need_failover(kib_dev_t *dev, struct net *ns) { struct rdma_cm_id *cmid; struct sockaddr_in srcaddr; @@ -2666,8 +2685,8 @@ kiblnd_dev_need_failover(kib_dev_t *dev) * * a. rdma_bind_addr(), it will conflict with listener cmid * b. rdma_resolve_addr() to zero addr */ - cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, - IB_QPT_RC); + cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cmid)) { rc = PTR_ERR(cmid); CERROR("Failed to create cmid for failover: %d\n", rc); @@ -2696,7 +2715,7 @@ kiblnd_dev_need_failover(kib_dev_t *dev) } int -kiblnd_dev_failover(kib_dev_t *dev) +kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) { struct list_head zombie_tpo = LIST_HEAD_INIT(zombie_tpo); struct list_head zombie_ppo = LIST_HEAD_INIT(zombie_ppo); @@ -2715,7 +2734,7 @@ kiblnd_dev_failover(kib_dev_t *dev) dev->ibd_can_failover || dev->ibd_hdev == NULL); - rc = kiblnd_dev_need_failover(dev); + rc = kiblnd_dev_need_failover(dev, ns); if (rc <= 0) goto out; @@ -2736,7 +2755,7 @@ kiblnd_dev_failover(kib_dev_t *dev) rdma_destroy_id(cmid); } - cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, + cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cmid)) { rc = PTR_ERR(cmid); @@ -2857,7 +2876,7 @@ kiblnd_destroy_dev (kib_dev_t *dev) } static kib_dev_t * -kiblnd_create_dev(char *ifname) +kiblnd_create_dev(char *ifname, struct net *ns) { struct net_device *netdev; kib_dev_t *dev; @@ -2866,7 +2885,7 @@ kiblnd_create_dev(char *ifname) int up; int rc; - rc = lnet_ipif_query(ifname, &up, &ip, &netmask); + rc = lnet_ipif_query(ifname, &up, &ip, &netmask, ns); if (rc != 0) { CERROR("Can't query IPoIB interface %s: %d\n", ifname, rc); @@ -2882,7 +2901,7 @@ kiblnd_create_dev(char *ifname) if (dev == NULL) return NULL; - netdev = dev_get_by_name(&init_net, ifname); + netdev = dev_get_by_name(ns, ifname); if (netdev == NULL) { dev->ibd_can_failover = 0; } else { @@ -2897,7 +2916,7 @@ kiblnd_create_dev(char *ifname) strcpy(&dev->ibd_ifname[0], ifname); /* initialize the device */ - rc = kiblnd_dev_failover(dev); + rc = kiblnd_dev_failover(dev, ns); if (rc != 0) { CERROR("Can't initialize device: %d\n", rc); LIBCFS_FREE(dev, sizeof(*dev)); @@ -3056,7 +3075,7 @@ kiblnd_shutdown(struct lnet_ni *ni) } static int -kiblnd_base_startup(void) 
+kiblnd_base_startup(struct net *ns) { struct kib_sched_info *sched; int rc; @@ -3129,7 +3148,7 @@ kiblnd_base_startup(void) } if (*kiblnd_tunables.kib_dev_failover != 0) - rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, + rc = kiblnd_thread_start(kiblnd_failover_thread, ns, "kiblnd_failover"); if (rc != 0) { @@ -3262,7 +3281,7 @@ kiblnd_startup(struct lnet_ni *ni) LASSERT (ni->ni_net->net_lnd == &the_o2iblnd); if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { - rc = kiblnd_base_startup(); + rc = kiblnd_base_startup(ni->ni_net_ns); if (rc != 0) return rc; } @@ -3300,7 +3319,7 @@ kiblnd_startup(struct lnet_ni *ni) newdev = ibdev == NULL; /* hmm...create kib_dev even for alias */ if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) - ibdev = kiblnd_create_dev(ifname); + ibdev = kiblnd_create_dev(ifname, ni->ni_net_ns); if (ibdev == NULL) goto failed; diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h index d8ad1421092d6..c7dabdf6b98b4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -43,7 +43,6 @@ #endif -#include #include #include #include @@ -65,9 +64,6 @@ #include #include #include -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,32) -#include -#endif #include #include @@ -125,15 +121,16 @@ extern kib_tunables_t kiblnd_tunables; t->lnd_peercredits_hiw) #ifdef HAVE_RDMA_CREATE_ID_5ARG -# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \ - cb, dev, \ - ps, qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(ns, cb, \ + dev, ps, \ + qpt) #else # ifdef HAVE_RDMA_CREATE_ID_4ARG -# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, \ - ps, qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ + ps, qpt) # else -# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ + ps) # endif #endif @@ -161,10 +158,9 @@ extern kib_tunables_t kiblnd_tunables; /* WRs and CQEs (per connection) */ #define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) -#define IBLND_SEND_WRS(c) \ - ((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \ - c->ibc_peer->ibp_ni)) -#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) + +/* 2 = LNet msg + Transfer chain */ +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) struct kib_hca_dev; @@ -578,7 +574,7 @@ typedef struct kib_rx /* receive message */ /* message buffer (I/O addr) */ __u64 rx_msgaddr; /* for dma_unmap_single() */ - DECLARE_PCI_UNMAP_ADDR(rx_msgunmap); + DEFINE_DMA_UNMAP_ADDR(rx_msgunmap); /* receive work item... 
*/ struct ib_recv_wr rx_wrq; /* ...and its memory */ @@ -617,7 +613,7 @@ typedef struct kib_tx /* transmit message */ /* message buffer (I/O addr) */ __u64 tx_msgaddr; /* for dma_unmap_single() */ - DECLARE_PCI_UNMAP_ADDR(tx_msgunmap); + DEFINE_DMA_UNMAP_ADDR(tx_msgunmap); /** sge for tx_msgaddr */ struct ib_sge tx_msgsge; /* # send work items */ @@ -1157,6 +1153,12 @@ static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, ib_dma_unmap_sg(dev, sg, nents, direction); } +#ifndef HAVE_IB_SG_DMA_ADDRESS +#include +#define ib_sg_dma_address(dev, sg) sg_dma_address(sg) +#define ib_sg_dma_len(dev, sg) sg_dma_len(sg) +#endif + static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) { @@ -1204,7 +1206,7 @@ int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event); int kiblnd_translate_mtu(int value); -int kiblnd_dev_failover(kib_dev_t *dev); +int kiblnd_dev_failover(kib_dev_t *dev, struct net *ns); int kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid); void kiblnd_destroy_peer (kib_peer_ni_t *peer); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index 42147c7b01e68..751a4211c8356 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -192,7 +192,12 @@ kiblnd_post_rx (kib_rx_t *rx, int credit) * own this rx (and rx::rx_conn) anymore, LU-5678. */ kiblnd_conn_addref(conn); +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, + (const struct ib_recv_wr **)&bad_wrq); +#else rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); +#endif if (unlikely(rc != 0)) { CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); @@ -841,7 +846,12 @@ __must_hold(&conn->ibc_lock) libcfs_nid2str(conn->ibc_peer->ibp_nid)); bad = NULL; +#ifdef HAVE_IB_POST_SEND_RECV_CONST + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); +#else rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); +#endif } conn->ibc_last_send = jiffies; @@ -1252,8 +1262,9 @@ kiblnd_connect_peer (kib_peer_ni_t *peer_ni) LASSERT (net != NULL); LASSERT (peer_ni->ibp_connecting > 0); - cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer_ni, RDMA_PS_TCP, - IB_QPT_RC); + cmid = kiblnd_rdma_create_id(peer_ni->ibp_ni->ni_net_ns, + kiblnd_cm_callback, peer_ni, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cmid)) { CERROR("Can't create CMID for %s: %ld\n", @@ -3680,6 +3691,7 @@ kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; kib_dev_t *dev; + struct net *ns = arg; wait_queue_entry_t wait; unsigned long flags; int rc; @@ -3709,7 +3721,7 @@ kiblnd_failover_thread(void *arg) dev->ibd_failover = 1; write_unlock_irqrestore(glock, flags); - rc = kiblnd_dev_failover(dev); + rc = kiblnd_dev_failover(dev, ns); write_lock_irqsave(glock, flags); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c index c66870631aa98..541504ba88d1b 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -2621,7 +2621,7 @@ ksocknal_shutdown(struct lnet_ni *ni) } static int -ksocknal_enumerate_interfaces(ksock_net_t *net) +ksocknal_enumerate_interfaces(ksock_net_t *net, struct net *ns) { char **names; int i; @@ -2629,7 +2629,7 @@ 
ksocknal_enumerate_interfaces(ksock_net_t *net) int rc; int n; - n = lnet_ipif_enumerate(&names); + n = lnet_ipif_enumerate(&names, ns); if (n <= 0) { CERROR("Can't enumerate interfaces: %d\n", n); return n; @@ -2643,7 +2643,7 @@ ksocknal_enumerate_interfaces(ksock_net_t *net) if (!strcmp(names[i], "lo")) /* skip the loopback IF */ continue; - rc = lnet_ipif_query(names[i], &up, &ip, &mask); + rc = lnet_ipif_query(names[i], &up, &ip, &mask, ns); if (rc != 0) { CWARN("Can't get interface %s info: %d\n", names[i], rc); @@ -2830,7 +2830,7 @@ ksocknal_startup(struct lnet_ni *ni) if (ni->ni_interfaces[0] == NULL) { - rc = ksocknal_enumerate_interfaces(net); + rc = ksocknal_enumerate_interfaces(net, ni->ni_net_ns); if (rc <= 0) goto fail_1; @@ -2844,7 +2844,8 @@ ksocknal_startup(struct lnet_ni *ni) rc = lnet_ipif_query(ni->ni_interfaces[i], &up, &net->ksnn_interfaces[i].ksni_ipaddr, - &net->ksnn_interfaces[i].ksni_netmask); + &net->ksnn_interfaces[i].ksni_netmask, + ni->ni_net_ns); if (rc != 0) { CERROR("Can't get interface %s info: %d\n", @@ -2866,7 +2867,7 @@ ksocknal_startup(struct lnet_ni *ni) net->ksnn_ninterfaces = i; } - net_dev = dev_get_by_name(&init_net, + net_dev = dev_get_by_name(ni->ni_net_ns, net->ksnn_interfaces[0].ksni_name); if (net_dev != NULL) { node_id = dev_to_node(&net_dev->dev); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c index 8892aad0403dd..83c6a2da2f4ae 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -8,7 +8,7 @@ * Author: Phil Schwan * Author: Eric Barton * - * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * This file is part of Lustre, https://wiki.whamcloud.com/ * * Portals is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -1913,7 +1913,8 @@ ksocknal_connect (ksock_route_t *route) rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, route->ksnr_myipaddr, - route->ksnr_ipaddr, route->ksnr_port); + route->ksnr_ipaddr, route->ksnr_port, + peer_ni->ksnp_ni->ni_net_ns); if (rc != 0) goto failed; diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c index 98109ec2ff7bc..42dff10fdb563 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -8,7 +8,7 @@ * Author: Phil Schwan * Author: Eric Barton * - * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * This file is part of Lustre, https://wiki.whamcloud.com/ * * Portals is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c index 885cd85a8c20f..8d3d6030d7d31 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -32,6 +32,7 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include #include #include @@ -44,6 +45,7 @@ static struct { int pta_shutdown; struct socket *pta_sock; struct completion pta_signal; + struct net *pta_ns; } lnet_acceptor_state = { .pta_shutdown = 1 }; @@ -150,7 +152,7 @@ EXPORT_SYMBOL(lnet_connect_console_error); int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, - __u32 local_ip, __u32 peer_ip, int peer_port) + __u32 local_ip, __u32 peer_ip, int 
peer_port, struct net *ns) { struct lnet_acceptor_connreq cr; struct socket *sock; @@ -167,7 +169,7 @@ lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, rc = lnet_sock_connect(&sock, &fatal, local_ip, port, - peer_ip, peer_port); + peer_ip, peer_port, ns); if (rc != 0) { if (fatal) goto failed; @@ -354,7 +356,8 @@ lnet_acceptor(void *arg) cfs_block_allsigs(); rc = lnet_sock_listen(&lnet_acceptor_state.pta_sock, - 0, accept_port, accept_backlog); + 0, accept_port, accept_backlog, + lnet_acceptor_state.pta_ns); if (rc != 0) { if (rc == -EADDRINUSE) LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port" @@ -479,6 +482,7 @@ lnet_acceptor_start(void) if (lnet_count_acceptor_nets() == 0) /* not required */ return 0; + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, "acceptor_%03ld", secure); if (IS_ERR(task)) { diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c index 2c15e1f5f79a2..8b790353c60a5 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/config.c +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -173,7 +173,7 @@ lnet_net_append_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net) LIBCFS_ALLOC(net->net_cpts, sizeof(*net->net_cpts) * ncpts); if (net->net_cpts == NULL) return -ENOMEM; - memcpy(net->net_cpts, cpts, ncpts); + memcpy(net->net_cpts, cpts, ncpts * sizeof(*net->net_cpts)); net->net_ncpts = ncpts; return 0; } @@ -1607,7 +1607,7 @@ lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) } static int -lnet_ipaddr_enumerate (__u32 **ipaddrsp) +lnet_ipaddr_enumerate(__u32 **ipaddrsp, struct net *ns) { int up; __u32 netmask; @@ -1615,7 +1615,7 @@ lnet_ipaddr_enumerate (__u32 **ipaddrsp) __u32 *ipaddrs2; int nip; char **ifnames; - int nif = lnet_ipif_enumerate(&ifnames); + int nif = lnet_ipif_enumerate(&ifnames, ns); int i; int rc; @@ -1634,7 +1634,7 @@ lnet_ipaddr_enumerate (__u32 **ipaddrsp) continue; rc = lnet_ipif_query(ifnames[i], &up, - &ipaddrs[nip], &netmask); + &ipaddrs[nip], &netmask, ns); if (rc != 0) { CWARN("Can't query interface %s: %d\n", ifnames[i], rc); @@ -1676,9 +1676,11 @@ int lnet_parse_ip2nets (char **networksp, char *ip2nets) { __u32 *ipaddrs = NULL; - int nip = lnet_ipaddr_enumerate(&ipaddrs); + int nip; int rc; + nip = lnet_ipaddr_enumerate(&ipaddrs, current->nsproxy->net_ns); + if (nip < 0) { LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP " "interfaces for ip2nets to match\n", nip); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index faa8fc3a01101..c46ab84714768 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -44,16 +44,39 @@ #include /* - * Deal with the post-5.0 rename of these in-kernel values. + * kernel 5.1: commit 7f1bc6e95d7840d4305595b3e4025cddda88cee5 + * Y2038 64-bit time. + * SO_TIMESTAMP, SO_TIMESTAMPNS and SO_TIMESTAMPING options, the + * way they are currently defined, are not y2038 safe. + * Subsequent patches in the series add new y2038 safe versions + * of these options which provide 64 bit timestamps on all + * architectures uniformly. + * Hence, rename existing options with OLD tag suffixes. + * + * NOTE: When updating to timespec64 change these to '_NEW'. 
+ * */ -#if !defined(SO_RCVTIMEO) && defined(SO_RCVTIMEO_OLD) +#ifndef SO_SNDTIMEO +#define SO_SNDTIMEO SO_SNDTIMEO_OLD +#endif + +#ifndef SO_RCVTIMEO #define SO_RCVTIMEO SO_RCVTIMEO_OLD #endif -#if !defined(SO_SNDTIMEO) && defined(SO_SNDTIMEO_OLD) -#define SO_SNDTIMEO SO_SNDTIMEO_OLD +static int +lnet_sock_create_kern(struct socket **sock, struct net *ns) +{ + int rc; + +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, sock); #endif + return rc; +} static int kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) @@ -69,18 +92,14 @@ kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) } static int -lnet_sock_ioctl(int cmd, unsigned long arg) +lnet_sock_ioctl(int cmd, unsigned long arg, struct net *ns) { struct file *sock_filp; struct socket *sock; int fd = -1; int rc; -#ifdef HAVE_SOCK_CREATE_KERN_USE_NET - rc = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, 0, &sock); -#else - rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); -#endif + rc = lnet_sock_create_kern(&sock, ns); if (rc != 0) { CERROR("Can't create socket: %d\n", rc); return rc; @@ -121,7 +140,7 @@ lnet_sock_ioctl(int cmd, unsigned long arg) } int -lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) +lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask, struct net *ns) { struct ifreq ifr; int nob; @@ -140,7 +159,7 @@ lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) return -E2BIG; strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); + rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr, ns); if (rc != 0) { CERROR("Can't get flags for interface %s\n", name); return rc; @@ -159,7 +178,7 @@ lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); + rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr, ns); if (rc != 0) { CERROR("Can't get IP address for interface %s\n", name); @@ -174,7 +193,7 @@ lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); + rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr, ns); if (rc != 0) { CERROR("Can't get netmask for interface %s\n", name); return rc; @@ -202,7 +221,7 @@ lnet_ipif_free_enumeration(char **names, int n) EXPORT_SYMBOL(lnet_ipif_free_enumeration); int -lnet_ipif_enumerate(char ***namesp) +lnet_ipif_enumerate(char ***namesp, struct net *ns) { /* Allocate and fill in 'names', returning # interfaces/error */ char **names; @@ -236,7 +255,7 @@ lnet_ipif_enumerate(char ***namesp) ifc.ifc_buf = (char *)ifr; ifc.ifc_len = nalloc * sizeof(*ifr); - rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); + rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc, ns); if (rc < 0) { CERROR("Error %d enumerating interfaces\n", rc); goto out1; @@ -423,7 +442,7 @@ EXPORT_SYMBOL(lnet_sock_read); static int lnet_sock_create(struct socket **sockp, int *fatal, - __u32 local_ip, int local_port) + __u32 local_ip, int local_port, struct net *ns) { struct sockaddr_in locaddr; struct socket *sock; @@ -433,11 +452,7 @@ lnet_sock_create(struct socket **sockp, int *fatal, /* All errors are fatal except bind failure if the port is in use */ *fatal = 1; -#ifdef 
HAVE_SOCK_CREATE_KERN_USE_NET - rc = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, 0, &sock); -#else - rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); -#endif + rc = lnet_sock_create_kern(&sock, ns); *sockp = sock; if (rc != 0) { CERROR("Can't create socket: %d\n", rc); @@ -514,23 +529,18 @@ int lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) { struct sockaddr_in sin; - int rc; + int rc; +#ifndef HAVE_KERN_SOCK_GETNAME_2ARGS + int len = sizeof(sin); +#endif -#ifdef HAVE_KERNSOCK_RETURNSLEN if (remote) - rc = kernel_getpeername(sock, (struct sockaddr *)&sin); + rc = lnet_kernel_getpeername(sock, + (struct sockaddr *)&sin, &len); else - rc = kernel_getsockname(sock, (struct sockaddr *)&sin); + rc = lnet_kernel_getsockname(sock, + (struct sockaddr *)&sin, &len); if (rc < 0) { -#else - int len = sizeof(sin); - - if (remote) - rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &len); - else - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &len); - if (rc != 0) { -#endif CERROR("Error %d getting sock %s IP/port\n", rc, remote ? "peer" : "local"); return rc; @@ -561,12 +571,12 @@ EXPORT_SYMBOL(lnet_sock_getbuf); int lnet_sock_listen(struct socket **sockp, - __u32 local_ip, int local_port, int backlog) + __u32 local_ip, int local_port, int backlog, struct net *ns) { int fatal; int rc; - rc = lnet_sock_create(sockp, &fatal, local_ip, local_port); + rc = lnet_sock_create(sockp, &fatal, local_ip, local_port, ns); if (rc != 0) { if (!fatal) CERROR("Can't create socket: port %d already in use\n", @@ -640,12 +650,13 @@ lnet_sock_accept(struct socket **newsockp, struct socket *sock) int lnet_sock_connect(struct socket **sockp, int *fatal, __u32 local_ip, int local_port, - __u32 peer_ip, int peer_port) + __u32 peer_ip, int peer_port, + struct net *ns) { struct sockaddr_in srvaddr; int rc; - rc = lnet_sock_create(sockp, fatal, local_ip, local_port); + rc = lnet_sock_create(sockp, fatal, local_ip, local_port, ns); if (rc != 0) return rc; diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c index f35b67e2d7bba..bd30963a960d1 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -3,7 +3,7 @@ * * Copyright (c) 2011, 2016, Intel Corporation. * - * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * This file is part of Lustre, https://wiki.whamcloud.com/ * * Portals is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c index 2f6b0c76d7b70..da73b32ea9371 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -3,7 +3,7 @@ * * Copyright (c) 2011, 2015, Intel Corporation. * - * This file is part of Lustre, https://wiki.hpdd.intel.com/ + * This file is part of Lustre, https://wiki.whamcloud.com/ * * Portals is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public diff --git a/drivers/staging/lustrefsx/lustre/LICENSE b/drivers/staging/lustrefsx/lustre/LICENSE new file mode 100644 index 0000000000000..edb73cdedca6a --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/LICENSE @@ -0,0 +1,372 @@ +Each file in this distribution contains a header stating the copyright +owner(s), and the licensing terms for that file. 
Some files are not +eligible for copyright protection, and contain neither. + +There are many files which may be covered by a separate license that +you signed or otherwise agreed to before downloading this software. +If you did not agree to such an agreement, or if the file does not +mention that license, then you can redistribute and/or modify it under +the terms of version 2 of the GNU General Public License. Each file +is very clear about which license is applicable. + +In any case, Lustre is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the license +text for more details. + +Reproduced below is the GNU General Public License version 2, and +Linus's clarifying statement from the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h index 7d8e702d76d73..e872981b5284e 100644 --- a/drivers/staging/lustrefsx/lustre/include/dt_object.h +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -1919,7 +1919,6 @@ struct dt_txn_callback { struct thandle *txn, void *cookie); int (*dtc_txn_stop)(const struct lu_env *env, struct thandle *txn, void *cookie); - void (*dtc_txn_commit)(struct thandle *txn, void *cookie); void *dtc_cookie; __u32 dtc_tag; struct list_head dtc_linkage; @@ -1931,7 +1930,6 @@ void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb); int dt_txn_hook_start(const struct lu_env *env, struct dt_device *dev, struct thandle *txn); int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); -void dt_txn_hook_commit(struct thandle *txn); int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h index e40b90ec65a20..f2c850c0f1848 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h @@ -722,7 +722,7 @@ struct ptlrpc_body_v2 { #define OBD_CONNECT_LARGE_ACL 0x200ULL /* more than 32 ACL entries */ #define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ #define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ -#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/ +#define OBD_CONNECT_IBITS 0x1000ULL /* not checked in 2.11+ */ #define OBD_CONNECT_BARRIER 0x2000ULL /* write barrier */ #define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ #define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ @@ -870,8 +870,8 @@ struct ptlrpc_body_v2 { #define MGS_CONNECT_SUPPORTED2 0 /* Features required for this version of the client to work with server */ -#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ - OBD_CONNECT_FULL20) +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ + OBD_CONNECT_FULL20) /* This structure is used for both request and reply. * @@ -1062,7 +1062,11 @@ struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ }; -#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) +#define MAX_MD_SIZE_OLD (sizeof(struct lov_mds_md) + \ + 4 * sizeof(struct lov_ost_data)) +#define MAX_MD_SIZE (sizeof(struct lov_comp_md_v1) + \ + 4 * (sizeof(struct lov_comp_md_entry_v1) + \ + MAX_MD_SIZE_OLD)) #define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) /* This is the default MDT reply size allocated, should the striping be bigger, @@ -1267,6 +1271,8 @@ struct hsm_state_set { #define OBD_BRW_LOCALS (OBD_BRW_LOCAL1) +#define OBD_MAX_GRANT 0x7fffffffUL /* Max grant allowed to one client: 2 GiB */ + #define OBD_OBJECT_EOF LUSTRE_EOF #define OST_MIN_PRECREATE 32 @@ -1639,7 +1645,6 @@ enum { * 2. If these flags needs to be stored into inode, they will be * stored in LMA. 
see LMAI_XXXX */ LUSTRE_ORPHAN_FL = 0x00002000, - LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, }; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h index c014ed714919e..67ed9768fcb2f 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -50,12 +50,12 @@ # include /* snprintf() */ # include #else /* !__KERNEL__ */ -# define NEED_QUOTA_DEFS # include # include # include /* snprintf() */ # include -# include +# define NEED_QUOTA_DEFS +/* # include - this causes complaints about caddr_t */ # include #endif /* __KERNEL__ */ #include @@ -73,7 +73,15 @@ "project", /* PRJQUOTA */ \ "undefined", \ }; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA #define PRJQUOTA 2 +#endif #if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ defined(__craynv) || defined(__mips64__) || defined(__powerpc64__) || \ @@ -183,16 +191,51 @@ struct ost_layout { __u32 ol_comp_id; } __attribute__((packed)); -/* keep this one for compatibility */ -struct filter_fid_old { - struct lu_fid ff_parent; - __u64 ff_objid; - __u64 ff_seq; +/* The filter_fid structure has changed several times over its lifetime. + * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and + * stripe_index and the "self FID" (objid/seq) to be able to recover the + * OST objects in case of corruption. With the move to 2.4 and OSD-API for + * the OST, the "trusted.lma" xattr was added to the OST objects to store + * the "self FID" to be consistent with the MDT on-disk format, and the + * filter_fid only stored the MDT inode parent FID and stripe index. + * + * In 2.10, the addition of PFL composite layouts required more information + * to be stored into the filter_fid in order to be able to identify which + * component the OST object belonged. As well, the stripe size may vary + * between components, so it was no longer safe to assume the stripe size + * or stripe_count of a file. This is also more robust for plain layouts. + * + * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not + * enough space to store both the filter_fid and LMA in the inode, so they + * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid + * an extra seek for every OST object access. + * + * In 2.11, FLR mirror layouts also need to store the layout version and + * range so that writes to old versions of the layout are not allowed. + * That ensures that mirrored objects are not modified by evicted clients, + * and ensures that the components are correctly marked stale on the MDT. 
+ */ +struct filter_fid_18_23 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid_24_29 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ +}; + +struct filter_fid_210 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; }; struct filter_fid { - struct lu_fid ff_parent; + struct lu_fid ff_parent; /* stripe_idx in f_ver */ struct ost_layout ff_layout; + __u32 ff_layout_version; + __u32 ff_range; /* range of layout version that + * write are allowed */ } __attribute__((packed)); /* Userspace should treat lu_fid as opaque, and only use the following methods @@ -410,6 +453,9 @@ enum ll_lease_type { /* To be compatible with old statically linked binary we keep the check for * the older 0100000000 flag. This is already removed upstream. LU-812. */ #define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ +#ifndef FASYNC +#define FASYNC 00020000 /* fcntl, for BSD compatibility */ +#endif #define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) #define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ O_LOV_DELAY_CREATE_MASK) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 2fb308a998ac1..408efb8953cbc 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -111,14 +112,6 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define MODULE_ALIAS_FS(name) #endif -#define LTIME_S(time) (time.tv_sec) - -#ifdef HAVE_INODE_TIME_64BIT -#define LTIME_FMT "%llu" -#else -#define LTIME_FMT "%lu" -#endif - #ifdef HAVE_GENERIC_PERMISSION_2ARGS # define ll_generic_permission(inode, mask, flags, check_acl) \ generic_permission(inode, mask) @@ -329,6 +322,10 @@ static inline void set_nlink(struct inode *inode, unsigned int nlink) # define ll_umode_t int #endif +#ifndef HAVE_VM_FAULT_T +#define vm_fault_t int +#endif + #include #ifndef HAVE_D_MAKE_ROOT static inline struct dentry *d_make_root(struct inode *root) @@ -394,15 +391,17 @@ static inline int radix_tree_exceptional_entry(void *arg) } #endif +#ifndef HAVE_XA_IS_VALUE +static inline bool xa_is_value(void *entry) +{ + return radix_tree_exceptional_entry(entry); +} +#endif + #ifndef HAVE_TRUNCATE_INODE_PAGES_FINAL static inline void truncate_inode_pages_final(struct address_space *map) { truncate_inode_pages(map, 0); - /* Workaround for LU-118 */ - if (map->nrpages) { - spin_lock_irq(&map->tree_lock); - spin_unlock_irq(&map->tree_lock); - } /* Workaround end */ } #endif @@ -549,34 +548,23 @@ static inline bool is_sxid(umode_t mode) #endif /* - * Upstream Linux kernel commit e462ec50cb5fad19f6003a3d8087f4a0945dd2b1 - * differentiated the MS_ values from SB_* values. We use SB_* - * throughout, but account here for older kernels that do not have - * SB_*. The values below are only the ones currently used in the Lustre - * code. 
+ * mount MS_* flags split from superblock SB_* flags + * if the SB_* flags are not available use the MS_* flags */ - -#ifndef SB_RDONLY - -#define SB_RDONLY MS_RDONLY -#define SB_ACTIVE MS_ACTIVE -#define SB_NODIRATIME MS_NODIRATIME - -#if defined(MS_POSIXACL) -#define SB_POSIXACL MS_POSIXACL +#if !defined(SB_RDONLY) && defined(MS_RDONLY) +# define SB_RDONLY MS_RDONLY #endif - -#if defined(MS_NOSEC) -#define SB_NOSEC MS_NOSEC +#if !defined(SB_ACTIVE) && defined(MS_ACTIVE) +# define SB_ACTIVE MS_ACTIVE #endif - +#if !defined(SB_NOSEC) && defined(MS_NOSEC) +# define SB_NOSEC MS_NOSEC #endif - -#ifndef SB_NOSEC -static inline void inode_has_no_xattr(struct inode *inode) -{ - return; -} +#if !defined(SB_POSIXACL) && defined(MS_POSIXACL) +# define SB_POSIXACL MS_POSIXACL +#endif +#if !defined(SB_NODIRATIME) && defined(MS_NODIRATIME) +# define SB_NODIRATIME MS_NODIRATIME #endif #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER @@ -704,38 +692,22 @@ static inline struct timespec current_time(struct inode *inode) #define __GFP_COLD 0 #endif -#ifndef HAVE_ADDRESS_SPACE_XARRAY -static inline void lock_mappings(struct address_space *mappings) -{ -#ifdef HAVE_ADDRESS_SPACE_IPAGES - xa_lock_irq(&mappings->i_pages); +#ifdef HAVE_I_PAGES +#define page_tree i_pages #else - spin_lock_irq(&mappings->tree_lock); +#define i_pages tree_lock +#define xa_lock_irq(lockp) spin_lock_irq(lockp) +#define xa_unlock_irq(lockp) spin_unlock_irq(lockp) #endif -} -static inline void unlock_mappings(struct address_space *mappings) -{ -#ifdef HAVE_ADDRESS_SPACE_IPAGES - xa_unlock_irq(&mappings->i_pages); -#else - spin_unlock_irq(&mappings->tree_lock); -#endif -} -#endif - -#ifdef HAVE_TOTALRAM_PAGES_FUNC -#define TOTALRAM_PAGES totalram_pages() -#else -#define TOTALRAM_PAGES totalram_pages +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +#define selinux_is_enabled() 1 #endif -#ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY -# ifdef HAVE_VM_FAULT_T -# define VM_FAULT_T vm_fault_t -# else -# define VM_FAULT_T int -# endif +#ifndef KMEM_CACHE_USERCOPY +#define kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ + usersize, ctor) \ + kmem_cache_create(name, size, align, flags, ctor) #endif #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h index d1cb7c20cf82c..3eed4226f85a7 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -297,7 +297,7 @@ struct ldlm_valblock_ops { /* Return size of lvb data appropriate RPC size can be reserved */ int (*lvbo_size)(struct ldlm_lock *lock); /* Called to fill in lvb data to RPC buffer @buf */ - int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int *buflen); }; /** @@ -416,6 +416,7 @@ struct ldlm_namespace { struct list_head ns_unused_list; /** Number of locks in the LRU list above */ int ns_nr_unused; + struct list_head *ns_last_pos; /** * Maximum number of locks permitted in the LRU. If 0, means locks @@ -814,12 +815,6 @@ struct ldlm_lock { */ wait_queue_head_t l_waitq; - /** - * Seconds. It will be updated if there is any activity related to - * the lock, e.g. enqueue the lock or send blocking AST. - */ - time64_t l_last_activity; - /** * Time, in nanoseconds, last used by e.g. being matched by lock match. */ @@ -844,6 +839,16 @@ struct ldlm_lock { /** Private storage for lock user. Opaque to LDLM. */ void *l_ast_data; + union { + /** + * Seconds. 
It will be updated if there is any activity related to + * the lock at client, e.g. enqueue the lock. For server it is the + * time when blocking ast was sent. + */ + time64_t l_activity; + time64_t l_blast_sent; + }; + /* * Server-side-only members. */ @@ -1083,7 +1088,7 @@ static inline int ldlm_lvbo_size(struct ldlm_lock *lock) return 0; } -static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int *len) { struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); int rc; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h index 179cb71de3758..cab4e5f2f702a 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -161,14 +161,6 @@ #define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32) #define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32) -/** - * Used while processing the unused list to know that we have already - * handled this lock and decided to skip it. */ -#define LDLM_FL_SKIPPED 0x0000000200000000ULL // bit 33 -#define ldlm_is_skipped(_l) LDLM_TEST_FLAG(( _l), 1ULL << 33) -#define ldlm_set_skipped(_l) LDLM_SET_FLAG(( _l), 1ULL << 33) -#define ldlm_clear_skipped(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 33) - /** this lock is being destroyed */ #define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34 #define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h index 8759b31f91674..43d0c3419417d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -345,7 +345,6 @@ static inline void ost_layout_le_to_cpu(struct ost_layout *dst, dst->ol_comp_id = __le32_to_cpu(src->ol_comp_id); } -/* Both filter_fid_*cpu* functions not currently used */ static inline void filter_fid_cpu_to_le(struct filter_fid *dst, const struct filter_fid *src, int size) { diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h index 70d647d8a15f3..57a192359d118 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -58,6 +58,8 @@ #endif /* HAVE_GROUP_INFO_GID */ +#include + struct lu_ucred; extern void lustre_groups_from_list(struct group_info *ginfo, gid_t *glist); diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h index 9910a189aef4f..9d49ce5a2a17a 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd.h +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -1220,8 +1220,8 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) cli->cl_dirty_max_pages = dirty_max; } - if (cli->cl_dirty_max_pages > TOTALRAM_PAGES / 8) - cli->cl_dirty_max_pages = TOTALRAM_PAGES / 8; + if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8) + cli->cl_dirty_max_pages = cfs_totalram_pages() / 8; } #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h index 19c179dfb1507..59f3ab530c9ed 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_support.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -276,6 +276,8 @@ extern char obd_jobid_var[]; 
#define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 #define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 +#define OBD_FAIL_OFD_SET_OID 0x1e0 + #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 #define OBD_FAIL_OST_DISCONNECT_NET 0x202 @@ -424,6 +426,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 +#define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 +#define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h index cb4ec46373759..9fddf2b1b9bd3 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h @@ -217,6 +217,7 @@ static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) #define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) /* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ #define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_CONFIGS _IOWR('f', 198, OBD_IOC_DATA_TYPE) /* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ /* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c index 208ab5d481c84..9b84a0d0cd21e 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -415,11 +415,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) if (!strcmp(name, LUSTRE_MDC_NAME)) { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } else if (TOTALRAM_PAGES >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) { cli->cl_max_rpcs_in_flight = 2; - } else if (TOTALRAM_PAGES >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) { cli->cl_max_rpcs_in_flight = 3; - } else if (TOTALRAM_PAGES >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + } else if (cfs_totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) { cli->cl_max_rpcs_in_flight = 4; } else { if (osc_on_mdt(obddev->obd_name)) @@ -1164,6 +1164,7 @@ int target_handle_connect(struct ptlrpc_request *req) * cause namespace inconsistency */ spin_lock(&export->exp_lock); export->exp_connecting = 1; + export->exp_conn_cnt = 0; spin_unlock(&export->exp_lock); conn.cookie = export->exp_handle.h_cookie; rc = EALREADY; @@ -1205,18 +1206,19 @@ int target_handle_connect(struct ptlrpc_request *req) target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), atomic_read(&export->exp_refcount)); - GOTO(out, rc = -EBUSY); - } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { - if (!strstr(cluuid.uuid, "mdt")) - LCONSOLE_WARN("%s: Rejecting reconnect from the " - "known client %s (at %s) because it " - "is indicating it is a new client", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid)); - GOTO(out, rc = -EALREADY); - } else { - OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); - } + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new 
client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } if (rc < 0) { GOTO(out, rc); @@ -2621,9 +2623,9 @@ void target_recovery_fini(struct obd_device *obd) } EXPORT_SYMBOL(target_recovery_fini); -static void target_recovery_expired(unsigned long castmeharder) +static void target_recovery_expired(cfs_timer_cb_arg_t data) { - struct obd_device *obd = (struct obd_device *)castmeharder; + struct obd_device *obd = cfs_from_timer(obd, data, obd_recovery_timer); CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery" " after %llus (%d clients connected)\n", obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), @@ -2655,8 +2657,8 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) obd->obd_recovery_start = 0; obd->obd_recovery_end = 0; - setup_timer(&obd->obd_recovery_timer, target_recovery_expired, - (unsigned long)obd); + cfs_timer_setup(&obd->obd_recovery_timer, target_recovery_expired, + (unsigned long)obd, 0); target_start_recovery_thread(lut, handler); } EXPORT_SYMBOL(target_recovery_init); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c index ca171fe485f0b..df28b2d7b5131 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -233,6 +233,8 @@ int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + if (ns->ns_last_pos == &lock->l_lru) + ns->ns_last_pos = lock->l_lru.prev; list_del_init(&lock->l_lru); LASSERT(ns->ns_nr_unused > 0); ns->ns_nr_unused--; @@ -283,7 +285,6 @@ void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) LASSERT(list_empty(&lock->l_lru)); LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); list_add_tail(&lock->l_lru, &ns->ns_unused_list); - ldlm_clear_skipped(lock); LASSERT(ns->ns_nr_unused >= 0); ns->ns_nr_unused++; } @@ -482,6 +483,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) lu_ref_init(&lock->l_reference); lu_ref_add(&lock->l_reference, "hash", lock); lock->l_callback_timeout = 0; + lock->l_activity = 0; #if LUSTRE_TRACKS_LOCK_EXP_REFS INIT_LIST_HEAD(&lock->l_exp_refs_link); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c index 356a30231142b..465ffda035dbe 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -133,8 +133,8 @@ static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ * All access to it should be under waiting_locks_spinlock. 
*/ static LIST_HEAD(waiting_locks_list); -static void waiting_locks_callback(unsigned long unused); -static DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); +static void waiting_locks_callback(cfs_timer_cb_arg_t unused); +static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); enum elt_state { ELT_STOPPED, @@ -288,7 +288,7 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) } /* This is called from within a timer interrupt and cannot schedule */ -static void waiting_locks_callback(unsigned long unused) +static void waiting_locks_callback(cfs_timer_cb_arg_t unused) { struct ldlm_lock *lock; int need_dump = 0; @@ -329,7 +329,7 @@ static void waiting_locks_callback(unsigned long unused) ldlm_lock_to_ns(lock)->ns_timeouts++; LDLM_ERROR(lock, "lock callback timer expired after %llds: " "evicting client at %s ", - ktime_get_real_seconds() - lock->l_last_activity, + ktime_get_real_seconds() - lock->l_blast_sent, libcfs_nid2str( lock->l_export->exp_connection->c_peer.nid)); @@ -459,7 +459,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) } ldlm_set_waited(lock); - lock->l_last_activity = ktime_get_real_seconds(); + lock->l_blast_sent = ktime_get_real_seconds(); ret = __ldlm_add_waiting_lock(lock, timeout); if (ret) { /* grab ref on the lock if it has been added to the @@ -939,8 +939,6 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, if (AT_OFF) req->rq_timeout = ldlm_get_rq_timeout(); - lock->l_last_activity = ktime_get_real_seconds(); - if (lock->l_export && lock->l_export->exp_nid_stats && lock->l_export->exp_nid_stats->nid_ldlm_stats) lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats, @@ -1012,7 +1010,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) if (lvb_len > 0) { void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); - lvb_len = ldlm_lvbo_fill(lock, lvb, lvb_len); + lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); if (lvb_len < 0) { /* We still need to send the RPC to wake up the blocked * enqueue thread on the client. 
@@ -1029,8 +1027,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) } } - lock->l_last_activity = ktime_get_real_seconds(); - LDLM_DEBUG(lock, "server preparing completion AST"); ptlrpc_request_set_replen(req); @@ -1139,8 +1135,6 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) if (AT_OFF) req->rq_timeout = ldlm_get_rq_timeout(); - lock->l_last_activity = ktime_get_real_seconds(); - req->rq_interpret_reply = ldlm_cb_interpret; if (lock->l_export && lock->l_export->exp_nid_stats && @@ -1267,20 +1261,6 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, GOTO(out, rc = -EFAULT); } - if (exp_connect_flags(req->rq_export) & OBD_CONNECT_IBITS) { - if (unlikely(dlm_req->lock_desc.l_resource.lr_type == - LDLM_PLAIN)) { - DEBUG_REQ(D_ERROR, req, - "PLAIN lock request from IBITS client?"); - GOTO(out, rc = -EPROTO); - } - } else if (unlikely(dlm_req->lock_desc.l_resource.lr_type == - LDLM_IBITS)) { - DEBUG_REQ(D_ERROR, req, - "IBITS lock request from unaware client?"); - GOTO(out, rc = -EPROTO); - } - if (unlikely((flags & LDLM_FL_REPLAY) || (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) { /* Find an existing lock in the per-export lock hash */ @@ -1474,43 +1454,59 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LDLM_DEBUG(lock, "server-side enqueue handler, sending reply" "(err=%d, rc=%d)", err, rc); - if (rc == 0) { - if (req_capsule_has_field(&req->rq_pill, &RMF_DLM_LVB, - RCL_SERVER) && - ldlm_lvbo_size(lock) > 0) { - void *buf; - int buflen; - - buf = req_capsule_server_get(&req->rq_pill, - &RMF_DLM_LVB); - LASSERTF(buf != NULL, "req %p, lock %p\n", - req, lock); - buflen = req_capsule_get_size(&req->rq_pill, - &RMF_DLM_LVB, RCL_SERVER); - /* non-replayed lock, delayed lvb init may - * need to be occur now */ - if ((buflen > 0) && !(flags & LDLM_FL_REPLAY)) { - buflen = ldlm_lvbo_fill(lock, buf, - buflen); - if (buflen >= 0) - req_capsule_shrink( + if (rc == 0 && + req_capsule_has_field(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER) && + ldlm_lvbo_size(lock) > 0) { + void *buf; + int buflen; + +retry: + buf = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_LVB); + LASSERTF(buf != NULL, "req %p, lock %p\n", req, lock); + buflen = req_capsule_get_size(&req->rq_pill, + &RMF_DLM_LVB, RCL_SERVER); + /* non-replayed lock, delayed lvb init may + * need to be occur now + */ + if ((buflen > 0) && !(flags & LDLM_FL_REPLAY)) { + int rc2; + + rc2 = ldlm_lvbo_fill(lock, buf, &buflen); + if (rc2 >= 0) { + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + rc2, RCL_SERVER); + } else if (rc2 == -ERANGE) { + rc2 = req_capsule_server_grow( &req->rq_pill, - &RMF_DLM_LVB, - buflen, RCL_SERVER); - else - rc = buflen; - } else if (flags & LDLM_FL_REPLAY) { - /* no LVB resend upon replay */ - if (buflen > 0) + &RMF_DLM_LVB, buflen); + if (!rc2) { + goto retry; + } else { + /* if we can't grow the buffer, + * it's ok to return empty lvb + * to client. 
+ */ req_capsule_shrink( &req->rq_pill, - &RMF_DLM_LVB, - 0, RCL_SERVER); - else - rc = buflen; + &RMF_DLM_LVB, 0, + RCL_SERVER); + } } else { - rc = buflen; + rc = rc2; } + } else if (flags & LDLM_FL_REPLAY) { + /* no LVB resend upon replay */ + if (buflen > 0) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + 0, RCL_SERVER); + else + rc = buflen; + } else { + rc = buflen; } } @@ -1695,9 +1691,10 @@ int ldlm_request_cancel(struct ptlrpc_request *req, pres = res; } - if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock)) { + if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && + lock->l_blast_sent != 0) { time64_t delay = ktime_get_real_seconds() - - lock->l_last_activity; + lock->l_blast_sent; LDLM_DEBUG(lock, "server cancels blocked lock after %llds", (s64)delay); at_measured(&lock->l_export->exp_bl_lock_at, delay); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c index f932c9900dd29..d15cff5fb27b6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -125,9 +125,9 @@ int ldlm_expired_completion_wait(void *data) LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); " "not entering recovery in server code, just going back to sleep", - (s64)lock->l_last_activity, + (s64)lock->l_activity, (s64)(ktime_get_real_seconds() - - lock->l_last_activity)); + lock->l_activity)); if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; next_dump = cfs_time_shift(300); @@ -143,8 +143,8 @@ int ldlm_expired_completion_wait(void *data) imp = obd->u.cli.cl_import; ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s", - (s64)lock->l_last_activity, - (s64)(ktime_get_real_seconds() - lock->l_last_activity), + (s64)lock->l_activity, + (s64)(ktime_get_real_seconds() - lock->l_activity), obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); RETURN(0); @@ -192,7 +192,7 @@ static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) LDLM_DEBUG(lock, "client-side enqueue: granted"); } else { /* Take into AT only CP RPC, not immediately granted locks */ - delay = ktime_get_real_seconds() - lock->l_last_activity; + delay = ktime_get_real_seconds() - lock->l_activity; LDLM_DEBUG(lock, "client-side enqueue: granted after %llds", (s64)delay); @@ -285,7 +285,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) timeout = ldlm_cp_timeout(lock); lwd.lwd_lock = lock; - lock->l_last_activity = cfs_time_current_sec(); + lock->l_activity = cfs_time_current_sec(); if (ldlm_is_no_timeout(lock)) { LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); @@ -671,14 +671,15 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, GOTO(cleanup, rc = -ENOMEM); LDLM_DEBUG(lock, "client-side enqueue, new resource"); } - if (with_policy) - if (!(type == LDLM_IBITS && - !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) - /* We assume lock type cannot change on server*/ - ldlm_convert_policy_to_local(exp, + + if (with_policy) { + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, lock->l_resource->lr_type, &reply->lock_desc.l_policy_data, &lock->l_policy_data); + } + if (type != LDLM_PLAIN) LDLM_DEBUG(lock,"client-side enqueue, new policy data"); } @@ -947,7 +948,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lock->l_export = NULL; lock->l_blocking_ast 
= einfo->ei_cb_bl; lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); - lock->l_last_activity = cfs_time_current_sec(); + lock->l_activity = cfs_time_current_sec(); /* lock not sent to server yet */ if (reqp == NULL || *reqp == NULL) { @@ -1488,9 +1489,6 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, /* Fall through */ default: result = LDLM_POLICY_SKIP_LOCK; - lock_res_and_lock(lock); - ldlm_set_skipped(lock); - unlock_res_and_lock(lock); break; } @@ -1711,53 +1709,47 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) { ldlm_cancel_lru_policy_t pf; - struct ldlm_lock *lock, *next; - int added = 0, unused, remained; + int added = 0; int no_wait = lru_flags & LDLM_LRU_FLAG_NO_WAIT; - ENTRY; - spin_lock(&ns->ns_lock); - unused = ns->ns_nr_unused; - remained = unused; + ENTRY; if (!ns_connect_lru_resize(ns)) - count += unused - ns->ns_max_unused; + count += ns->ns_nr_unused - ns->ns_max_unused; pf = ldlm_cancel_lru_policy(ns, lru_flags); LASSERT(pf != NULL); - while (!list_empty(&ns->ns_unused_list)) { + /* For any flags, stop scanning if @max is reached. */ + while (!list_empty(&ns->ns_unused_list) && (max == 0 || added < max)) { + struct ldlm_lock *lock; + struct list_head *item, *next; enum ldlm_policy_res result; ktime_t last_use = ktime_set(0, 0); - /* all unused locks */ - if (remained-- <= 0) - break; - - /* For any flags, stop scanning if @max is reached. */ - if (max && added >= max) - break; + spin_lock(&ns->ns_lock); + item = no_wait ? ns->ns_last_pos : &ns->ns_unused_list; + for (item = item->next, next = item->next; + item != &ns->ns_unused_list; + item = next, next = item->next) { + lock = list_entry(item, struct ldlm_lock, l_lru); - list_for_each_entry_safe(lock, next, &ns->ns_unused_list, - l_lru) { /* No locks which got blocking requests. */ LASSERT(!ldlm_is_bl_ast(lock)); - if (no_wait && ldlm_is_skipped(lock)) - /* already processed */ - continue; - - last_use = lock->l_last_used; - - /* Somebody is already doing CANCEL. No need for this - * lock in LRU, do not traverse it again. */ if (!ldlm_is_canceling(lock)) break; + /* Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. */ ldlm_lock_remove_from_lru_nolock(lock); } - if (&lock->l_lru == &ns->ns_unused_list) + if (item == &ns->ns_unused_list) { + spin_unlock(&ns->ns_lock); break; + } + + last_use = lock->l_last_used; LDLM_LOCK_GET(lock); spin_unlock(&ns->ns_lock); @@ -1776,19 +1768,23 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, * old locks, but additionally choose them by * their weight. Big extent locks will stay in * the cache. 
*/ - result = pf(ns, lock, unused, added, count); + result = pf(ns, lock, ns->ns_nr_unused, added, count); if (result == LDLM_POLICY_KEEP_LOCK) { - lu_ref_del(&lock->l_reference, - __FUNCTION__, current); + lu_ref_del(&lock->l_reference, __func__, current); LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_lock); break; } + if (result == LDLM_POLICY_SKIP_LOCK) { - lu_ref_del(&lock->l_reference, - __func__, current); + lu_ref_del(&lock->l_reference, __func__, current); LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_lock); + if (no_wait) { + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru) && + lock->l_lru.prev == ns->ns_last_pos) + ns->ns_last_pos = &lock->l_lru; + spin_unlock(&ns->ns_lock); + } continue; } @@ -1805,7 +1801,6 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, unlock_res_and_lock(lock); lu_ref_del(&lock->l_reference, __FUNCTION__, current); LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_lock); continue; } LASSERT(!lock->l_readers && !lock->l_writers); @@ -1840,11 +1835,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, list_add(&lock->l_bl_ast, cancels); unlock_res_and_lock(lock); lu_ref_del(&lock->l_reference, __FUNCTION__, current); - spin_lock(&ns->ns_lock); added++; - unused--; } - spin_unlock(&ns->ns_lock); RETURN(added); } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 225d3a7f01df7..8467a9b8abff4 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -951,6 +951,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_connect_flags = 0; ns->ns_stopping = 0; ns->ns_reclaim_start = 0; + ns->ns_last_pos = &ns->ns_unused_list; rc = ldlm_namespace_sysfs_register(ns); if (rc) { diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 165141dc52cb5..04cc72f451861 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -1008,17 +1008,19 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) * if it's at least 'mdd.*.atime_diff' older. * All in all, the atime in Lustre does not strictly comply with * POSIX. Solving this problem needs to send an RPC to MDT for each - * read, this will hurt performance. */ - if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) { - LTIME_S(inode->i_atime) = lli->lli_atime; + * read, this will hurt performance. 
+ */ + if (inode->i_atime.tv_sec < lli->lli_atime || + lli->lli_update_atime) { + inode->i_atime.tv_sec = lli->lli_atime; lli->lli_update_atime = 0; } - LTIME_S(inode->i_mtime) = lli->lli_mtime; - LTIME_S(inode->i_ctime) = lli->lli_ctime; + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; - atime = LTIME_S(inode->i_atime); - mtime = LTIME_S(inode->i_mtime); - ctime = LTIME_S(inode->i_ctime); + mtime = inode->i_mtime.tv_sec; + atime = inode->i_atime.tv_sec; + ctime = inode->i_ctime.tv_sec; cl_object_attr_lock(obj); rc = cl_object_attr_get(env, obj, attr); @@ -1042,9 +1044,9 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) i_size_write(inode, attr->cat_size); inode->i_blocks = attr->cat_blocks; - LTIME_S(inode->i_atime) = atime; - LTIME_S(inode->i_mtime) = mtime; - LTIME_S(inode->i_ctime) = ctime; + inode->i_mtime.tv_sec = mtime; + inode->i_atime.tv_sec = atime; + inode->i_ctime.tv_sec = ctime; out_size_unlock: ll_inode_size_unlock(inode); @@ -1300,11 +1302,13 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, pos += io->ci_nob; args->u.normal.via_iocb->ki_pos = pos; + if (io->ci_pio) { #ifdef HAVE_KIOCB_KI_LEFT - args->u.normal.via_iocb->ki_left = count; + args->u.normal.via_iocb->ki_left = count; #elif defined(HAVE_KI_NBYTES) - args->u.normal.via_iocb->ki_nbytes = count; + args->u.normal.via_iocb->ki_nbytes = count; #endif + } } else { /* for splice */ pos = io->u.ci_rw.rw_range.cir_pos; @@ -3268,7 +3272,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) } flock.l_flock.pid = file_lock->fl_pid; -#ifdef HAVE_LM_COMPARE_OWNER +#if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner) /* Somewhat ugly workaround for svc lockd. * lockd installs custom fl_lmops->lm_compare_owner that checks * for the fl_owner to be the same (which it always is on local node @@ -3782,9 +3786,9 @@ ll_inode_revalidate(struct dentry *dentry, __u64 ibits) RETURN(rc); } - LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime; - LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime; - LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime; + inode->i_atime.tv_sec = ll_i2info(inode)->lli_atime; + inode->i_mtime.tv_sec = ll_i2info(inode)->lli_mtime; + inode->i_ctime.tv_sec = ll_i2info(inode)->lli_ctime; } else { /* In case of restore, the MDT has the right size and has * already send it back without granting the layout lock, diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c index 55deb8236bd40..d34be28747bdd 100644 --- a/drivers/staging/lustrefsx/lustre/llite/glimpse.c +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -66,22 +66,12 @@ blkcnt_t dirty_cnt(struct inode *inode) { blkcnt_t cnt = 0; struct vvp_object *vob = cl_inode2vvp(inode); -#ifndef HAVE_ADDRESS_SPACE_XARRAY void *results[1]; if (inode->i_mapping != NULL) -#ifdef HAVE_ADDRESS_SPACE_IPAGES - cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages, -#else cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, -#endif results, 0, 1, PAGECACHE_TAG_DIRTY); -#else - if (inode->i_mapping && mapping_tagged(inode->i_mapping, - PAGECACHE_TAG_DIRTY)) - cnt = 1; -#endif if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) cnt = 1; diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c index feaf1769b6e87..f6e429ba182c3 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c +++ 
b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -86,9 +86,9 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, io->ci_obj = obj; io->ci_verify_layout = 1; - io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime); - io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime); - io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); + io->u.ci_setattr.sa_attr.lvb_atime = attr->ia_atime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_mtime = attr->ia_mtime.tv_sec; + io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; io->u.ci_setattr.sa_attr_flags = attr_flags; io->u.ci_setattr.sa_valid = attr->ia_valid; @@ -149,37 +149,48 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md) site = ll_i2sbi(inode)->ll_site; lli = ll_i2info(inode); - fid = &lli->lli_fid; - LASSERT(fid_is_sane(fid)); - - if (lli->lli_clob == NULL) { - /* clob is slave of inode, empty lli_clob means for new inode, - * there is no clob in cache with the given fid, so it is - * unnecessary to perform lookup-alloc-lookup-insert, just - * alloc and insert directly. */ - LASSERT(inode->i_state & I_NEW); - conf.coc_lu.loc_flags = LOC_F_NEW; - clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), - fid, &conf); - if (!IS_ERR(clob)) { - /* - * No locking is necessary, as new inode is - * locked by I_NEW bit. - */ - lli->lli_clob = clob; - lu_object_ref_add(&clob->co_lu, "inode", inode); - } else - result = PTR_ERR(clob); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. + */ + if (!(inode->i_state & I_NEW)) { + result = -EIO; + CERROR("%s: unexpected not-NEW inode "DFID": rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid), + result); + goto out; + } + + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. 
+ */ + lli->lli_clob = clob; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else { + result = PTR_ERR(clob); + } } else { result = cl_conf_set(env, lli->lli_clob, &conf); } - cl_env_put(env, &refcheck); + if (result != 0) + CERROR("%s: failed to initialize cl_object "DFID": rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid), result); + +out: + cl_env_put(env, &refcheck); - if (result != 0) - CERROR("Failure to initialize cl object "DFID": %d\n", - PFID(fid), result); - return result; + return result; } /** diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 339dd8f1da4c4..297622d3d88f6 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -49,6 +49,9 @@ #include #include +#ifdef HAVE_UAPI_LINUX_MOUNT_H +#include +#endif #include #include #include @@ -231,15 +234,15 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) /* flag mdc connection as lightweight, only used for test * purpose, use with care */ - data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; - data->ocd_ibits_known = MDS_INODELOCK_FULL; - data->ocd_version = LUSTRE_VERSION_CODE; + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; - if (sb->s_flags & SB_RDONLY) - data->ocd_connect_flags |= OBD_CONNECT_RDONLY; - if (sbi->ll_flags & LL_SBI_USER_XATTR) - data->ocd_connect_flags |= OBD_CONNECT_XATTR; + if (sb->s_flags & SB_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if (sbi->ll_flags & LL_SBI_USER_XATTR) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; #ifdef SB_NOSEC /* Setting this indicates we correctly support S_NOSEC (See kernel @@ -248,12 +251,12 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sb->s_flags |= SB_NOSEC; #endif - if (sbi->ll_flags & LL_SBI_FLOCK) - sbi->ll_fop = &ll_file_operations_flock; - else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - sbi->ll_fop = &ll_file_operations; - else - sbi->ll_fop = &ll_file_operations_noflock; + if (sbi->ll_flags & LL_SBI_FLOCK) + sbi->ll_fop = &ll_file_operations_flock; + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + sbi->ll_fop = &ll_file_operations; + else + sbi->ll_fop = &ll_file_operations_noflock; /* always ping even if server suppress_pings */ if (sbi->ll_flags & LL_SBI_ALWAYS_PING) @@ -266,16 +269,16 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, data->ocd_brw_size = MD_MAX_BRW_SIZE; err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); - if (err == -EBUSY) { - LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " - "recovery, of which this client is not a " - "part. Please wait for recovery to complete," - " abort, or time out.\n", md); - GOTO(out, err); - } else if (err) { - CERROR("cannot connect to %s: rc = %d\n", md, err); - GOTO(out, err); - } + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " + "recovery, of which this client is not a " + "part. 
Please wait for recovery to complete," + " abort, or time out.\n", md); + GOTO(out, err); + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + GOTO(out, err); + } sbi->ll_md_exp->exp_connect_data = *data; @@ -337,28 +340,28 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sbi->ll_namelen = osfs->os_namelen; sbi->ll_mnt.mnt = current->fs->root.mnt; - if ((sbi->ll_flags & LL_SBI_USER_XATTR) && - !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { - LCONSOLE_INFO("Disabling user_xattr feature because " - "it is not supported on the server\n"); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } + if ((sbi->ll_flags & LL_SBI_USER_XATTR) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } - if (data->ocd_connect_flags & OBD_CONNECT_ACL) { + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { #ifdef SB_POSIXACL - sb->s_flags |= SB_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif - sbi->ll_flags |= LL_SBI_ACL; - } else { - LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); + sbi->ll_flags |= LL_SBI_ACL; + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); #ifdef SB_POSIXACL - sb->s_flags &= ~SB_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; #endif - sbi->ll_flags &= ~LL_SBI_ACL; - } + sbi->ll_flags &= ~LL_SBI_ACL; + } - if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) - sbi->ll_flags |= LL_SBI_64BIT_HASH; + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + sbi->ll_flags |= LL_SBI_64BIT_HASH; if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; @@ -723,7 +726,7 @@ void ll_kill_super(struct super_block *sb) struct ll_sb_info *sbi; ENTRY; - /* not init sb ?*/ + /* not init sb ?*/ if (!(sb->s_flags & SB_ACTIVE)) return; @@ -1230,9 +1233,9 @@ static struct inode *ll_iget_anon_dir(struct super_block *sb, LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode "DFID"\n", PFID(fid)); - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; inode->i_rdev = 0; #ifdef HAVE_BACKING_DEV_INFO @@ -1459,11 +1462,6 @@ void ll_clear_inode(struct inode *inode) #ifdef CONFIG_FS_POSIX_ACL forget_all_cached_acls(inode); if (lli->lli_posix_acl) { -#ifdef HAVE_POSIX_ACL_REFCOUNT - LASSERT(refcount_read(&lli->lli_posix_acl->a_refcount) == 1); -#else - LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); -#endif posix_acl_release(lli->lli_posix_acl); lli->lli_posix_acl = NULL; } @@ -1615,10 +1613,9 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) } if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime " LTIME_FMT ", ctime " - LTIME_FMT ", now = %llu\n", - LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), - (s64)ktime_get_real_seconds()); + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld, now = %lld\n", + (s64)attr->ia_mtime.tv_sec, (s64)attr->ia_ctime.tv_sec, + ktime_get_real_seconds()); if (S_ISREG(inode->i_mode)) { if (attr->ia_valid & ATTR_SIZE) @@ -1892,24 +1889,25 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); if (body->mbo_valid & OBD_MD_FLATIME) { - if (body->mbo_atime > LTIME_S(inode->i_atime)) - LTIME_S(inode->i_atime) = body->mbo_atime; + if (body->mbo_atime > inode->i_atime.tv_sec) + 
inode->i_atime.tv_sec = body->mbo_atime; lli->lli_atime = body->mbo_atime; } if (body->mbo_valid & OBD_MD_FLMTIME) { - if (body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, "setting ino %lu mtime from " LTIME_FMT - "to %llu\n", inode->i_ino, - LTIME_S(inode->i_mtime), body->mbo_mtime); - LTIME_S(inode->i_mtime) = body->mbo_mtime; + if (body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting ino %lu mtime from %lld to %llu\n", + inode->i_ino, (s64)inode->i_mtime.tv_sec, + body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; } lli->lli_mtime = body->mbo_mtime; } if (body->mbo_valid & OBD_MD_FLCTIME) { - if (body->mbo_ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->mbo_ctime; + if (body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; lli->lli_ctime = body->mbo_ctime; } @@ -1995,11 +1993,12 @@ int ll_read_inode2(struct inode *inode, void *opaque) /* Core attributes from the MDS first. This is a new inode, and * the VFS doesn't zero times in the core inode so we have to do * it ourselves. They will be overwritten by either MDS or OST - * attributes - we just need to make sure they aren't newer. */ - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - inode->i_rdev = 0; + * attributes - we just need to make sure they aren't newer. + */ + inode->i_mtime.tv_sec = 0; + inode->i_atime.tv_sec = 0; + inode->i_ctime.tv_sec = 0; + inode->i_rdev = 0; rc = ll_update_inode(inode, md); if (rc != 0) RETURN(rc); @@ -2038,6 +2037,8 @@ int ll_read_inode2(struct inode *inode, void *opaque) void ll_delete_inode(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); + struct address_space *mapping = &inode->i_data; + unsigned long nrpages; ENTRY; if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) @@ -2045,11 +2046,26 @@ void ll_delete_inode(struct inode *inode) * otherwise we may lose data while umount */ cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); - truncate_inode_pages_final(&inode->i_data); + truncate_inode_pages_final(mapping); - LASSERTF(inode->i_data.nrpages == 0, "inode="DFID"(%p) nrpages=%lu, " - "see https://jira.hpdd.intel.com/browse/LU-118\n", - PFID(ll_inode2fid(inode)), inode, inode->i_data.nrpages); + /* Workaround for LU-118: Note nrpages may not be totally updated when + * truncate_inode_pages() returns, as there can be a page in the process + * of deletion (inside __delete_from_page_cache()) in the specified + * range. Thus mapping->nrpages can be non-zero when this function + * returns even after truncation of the whole mapping. Only do this if + * npages isn't already zero. 
+ */ + nrpages = mapping->nrpages; + if (nrpages) { + xa_lock_irq(&mapping->i_pages); + nrpages = mapping->nrpages; + xa_unlock_irq(&mapping->i_pages); + } /* Workaround end */ + + LASSERTF(nrpages == 0, "%s: inode="DFID"(%p) nrpages=%lu, " + "see https://jira.whamcloud.com/browse/LU-118\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), inode, nrpages); #ifdef HAVE_SBOPS_EVICT_INODE ll_clear_inode(inode); @@ -2212,34 +2228,34 @@ void ll_umount_begin(struct super_block *sb) int ll_remount_fs(struct super_block *sb, int *flags, char *data) { - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); - int err; - __u32 read_only; - - if ((*flags & SB_RDONLY) != (sb->s_flags & SB_RDONLY)) { - read_only = *flags & SB_RDONLY; - err = obd_set_info_async(NULL, sbi->ll_md_exp, - sizeof(KEY_READ_ONLY), - KEY_READ_ONLY, sizeof(read_only), - &read_only, NULL); - if (err) { - LCONSOLE_WARN("Failed to remount %s %s (%d)\n", - profilenm, read_only ? - "read-only" : "read-write", err); - return err; - } + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & SB_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } - if (read_only) - sb->s_flags |= SB_RDONLY; - else - sb->s_flags &= ~SB_RDONLY; + if (read_only) + sb->s_flags |= SB_RDONLY; + else + sb->s_flags &= ~SB_RDONLY; - if (sbi->ll_flags & LL_SBI_VERBOSE) - LCONSOLE_WARN("Remounted %s %s\n", profilenm, - read_only ? "read-only" : "read-write"); - } - return 0; + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? 
"read-only" : "read-write"); + } + return 0; } /** diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index bf3c8636dff52..839b710764554 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -251,7 +251,7 @@ static inline int to_fault_error(int result) * \retval VM_FAULT_ERROR on general error * \retval NOPAGE_OOM not have memory for allocate new page */ -static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) { struct lu_env *env; struct cl_io *io; @@ -330,16 +330,16 @@ static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY -static VM_FAULT_T ll_fault(struct vm_fault *vmf) +static vm_fault_t ll_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #else -static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { #endif int count = 0; bool printed = false; - int result; + vm_fault_t result; sigset_t set; /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite @@ -380,17 +380,18 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef HAVE_VM_OPS_USE_VM_FAULT_ONLY -static VM_FAULT_T ll_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ll_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #else -static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { #endif int count = 0; bool printed = false; bool retry; - int result; + vm_fault_t result; ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), LPROC_LL_MKWRITE, 1); diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c index d794bf9cd5f6f..93c767207f3eb 100755 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -308,12 +308,12 @@ ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, pages_number >>= PAGE_SHIFT; - if (pages_number < 0 || pages_number > TOTALRAM_PAGES / 2) { + if (pages_number < 0 || pages_number > cfs_totalram_pages() / 2) { /* 1/2 of RAM */ CERROR("%s: can't set max_readahead_mb=%lu > %luMB\n", ll_get_fsname(sb, NULL, 0), (unsigned long)pages_number >> (20 - PAGE_SHIFT), - TOTALRAM_PAGES >> (20 - PAGE_SHIFT + 1)); + cfs_totalram_pages() >> (20 - PAGE_SHIFT + 1)); return -ERANGE; } @@ -479,10 +479,10 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, pages_number >>= PAGE_SHIFT; - if (pages_number < 0 || pages_number > TOTALRAM_PAGES) { + if (pages_number < 0 || pages_number > cfs_totalram_pages()) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - TOTALRAM_PAGES >> (20 - PAGE_SHIFT)); + cfs_totalram_pages() >> (20 - PAGE_SHIFT)); RETURN(-ERANGE); } /* Allow enough cache so clients can make well-formed RPCs */ diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index 268e11a161e02..622f9a44f407c 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -699,17 +699,33 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry 
*dentry, return de; } +#ifdef FMODE_CREATED /* added in Linux v4.18-rc1-20-g73a09dd */ +# define ll_is_opened(o, f) ((f)->f_mode & FMODE_OPENED) +# define ll_finish_open(f, d, o) finish_open((f), (d), NULL) +# define ll_last_arg +# define ll_set_created(o, f) \ +do { \ + (f)->f_mode |= FMODE_CREATED; \ +} while (0) + +#else +# define ll_is_opened(o, f) (*(o)) +# define ll_finish_open(f, d, o) finish_open((f), (d), NULL, (o)) +# define ll_last_arg , int *opened +# define ll_set_created(o, f) \ +do { \ + *(o) |= FILE_CREATED; \ +} while (0) + +#endif + /* * For cached negative dentry and new dentry, handle lookup/create/open * together. */ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, - umode_t mode -#ifndef HAVE_ATOMIC_OPEN_NO_OPENED - , int *opened -#endif - ) + umode_t mode ll_last_arg) { struct lookup_intent *it; struct dentry *de; @@ -719,17 +735,11 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, int rc = 0; ENTRY; -#ifdef HAVE_ATOMIC_OPEN_NO_OPENED - CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), file %p," - "open_flags %x, mode %x\n", - dentry->d_name.len, dentry->d_name.name, - PFID(ll_inode2fid(dir)), dir, file, open_flags, mode); -#else CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p), file %p," "open_flags %x, mode %x opened %d\n", dentry->d_name.len, dentry->d_name.name, - PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, *opened); -#endif + PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, + ll_is_opened(opened, file)); /* Only negative dentries enter here */ LASSERT(dentry->d_inode == NULL); @@ -782,11 +792,7 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, dput(de); goto out_release; } -#ifdef HAVE_ATOMIC_OPEN_NO_OPENED - file->f_mode |= FMODE_CREATED; -#else - *opened |= FILE_CREATED; -#endif + ll_set_created(opened, file); } if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) { /* Open dentry. */ @@ -797,11 +803,7 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, rc = finish_no_open(file, de); } else { file->private_data = it; -#ifdef HAVE_ATOMIC_OPEN_NO_OPENED - rc = finish_open(file, dentry, NULL); -#else - rc = finish_open(file, dentry, NULL, opened); -#endif + rc = ll_finish_open(file, dentry, opened); /* We dget in ll_splice_alias. finish_open takes * care of dget for fd open. 
*/ @@ -1022,16 +1024,17 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode) LASSERT(body); if (body->mbo_valid & OBD_MD_FLMTIME && - body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, "setting fid "DFID" mtime from " LTIME_FMT - " to %llu\n", PFID(ll_inode2fid(inode)), - LTIME_S(inode->i_mtime), body->mbo_mtime); - LTIME_S(inode->i_mtime) = body->mbo_mtime; + body->mbo_mtime > inode->i_mtime.tv_sec) { + CDEBUG(D_INODE, + "setting fid " DFID " mtime from %lld to %llu\n", + PFID(ll_inode2fid(inode)), + (s64)inode->i_mtime.tv_sec, body->mbo_mtime); + inode->i_mtime.tv_sec = body->mbo_mtime; } if (body->mbo_valid & OBD_MD_FLCTIME && - body->mbo_ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->mbo_ctime; + body->mbo_ctime > inode->i_ctime.tv_sec) + inode->i_ctime.tv_sec = body->mbo_ctime; } static int ll_new_node(struct inode *dir, struct dentry *dchild, diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index 9de5f9b40cf20..404cee02a4692 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -41,6 +41,7 @@ #include #include "llite_internal.h" #include "vvp_internal.h" +#include static struct vvp_io *cl2vvp_io(const struct lu_env *env, const struct cl_io_slice *slice) @@ -427,6 +428,8 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) union ldlm_policy_data policy; struct iovec iov; struct iov_iter i; + unsigned long addr; + ssize_t count; int result = 0; ENTRY; @@ -439,9 +442,15 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) if (mm == NULL) RETURN(0); - iov_for_each(iov, i, io->u.ci_rw.rw_iter) { - unsigned long addr = (unsigned long)iov.iov_base; - size_t count = iov.iov_len; + if (!iter_is_iovec(&io->u.ci_rw.rw_iter) && !iov_iter_is_kvec(&io->u.ci_rw.rw_iter)) + RETURN(0); + + for (i = io->u.ci_rw.rw_iter; + iov_iter_count(&i); + iov_iter_advance(&i, iov.iov_len)) { + iov = iov_iter_iovec(&i); + addr = (unsigned long)iov.iov_base; + count = iov.iov_len; if (count == 0) continue; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c index 8904e45918386..fd7211f60c61f 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -178,9 +178,9 @@ static int vvp_object_glimpse(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); ENTRY; - lvb->lvb_mtime = LTIME_S(inode->i_mtime); - lvb->lvb_atime = LTIME_S(inode->i_atime); - lvb->lvb_ctime = LTIME_S(inode->i_ctime); + lvb->lvb_mtime = inode->i_mtime.tv_sec; + lvb->lvb_atime = inode->i_atime.tv_sec; + lvb->lvb_ctime = inode->i_ctime.tv_sec; /* * LU-417: Add dirty pages block count lest i_blocks reports 0, some diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index e1e6e34dc0e02..e76e0130d6669 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -34,7 +34,9 @@ #include #include #include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED #include +#endif #define DEBUG_SUBSYSTEM S_LLITE @@ -52,6 +54,17 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } #endif +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +# define test_xattr_is_selinux_disabled(handler, name) \ + ((handler)->flags == XATTR_SECURITY_T && \ + !selinux_is_enabled() && \ + strcmp((name), 
"selinux") == 0) +#else +# define test_xattr_is_selinux_disabled(handler, name) \ + ((handler)->flags == XATTR_SECURITY_T && \ + strcmp((name), "selinux") == 0) +#endif + const struct xattr_handler *get_xattr_type(const char *name) { int i; @@ -136,8 +149,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(0); /* LU-549: Disable security.selinux when selinux is disabled */ - if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && - strcmp(name, "selinux") == 0) + if (test_xattr_is_selinux_disabled(handler, name)) RETURN(-EOPNOTSUPP); /* @@ -431,8 +443,7 @@ static int ll_xattr_get_common(const struct xattr_handler *handler, RETURN(rc); /* LU-549: Disable security.selinux when selinux is disabled */ - if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && - !strcmp(name, "selinux")) + if (test_xattr_is_selinux_disabled(handler, name)) RETURN(-EOPNOTSUPP); #ifdef CONFIG_FS_POSIX_ACL diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c index 82019cc8caef6..5a79702318717 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -33,7 +33,9 @@ #include #include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED #include +#endif #include #include "llite_internal.h" @@ -55,7 +57,8 @@ int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, #ifdef HAVE_SECURITY_DENTRY_INIT_SECURITY int rc; - /* security_dentry_init_security() is strange. Like + /* + * security_dentry_init_security() is strange. Like * security_inode_init_security() it may return a context (provided a * Linux security module is enabled) but unlike * security_inode_init_security() it does not return to us the name of @@ -65,13 +68,16 @@ int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, * SELinux is the only module that implements * security_dentry_init_security(). Note that the NFS client code just * calls it and assumes that if anything is returned then it must come - * from SELinux. */ + * from SELinux. 
+ */ if (!selinux_is_enabled()) return 0; rc = security_dentry_init_security(dentry, mode, name, secctx, secctx_size); + if (rc == -EOPNOTSUPP) + return 0; if (rc < 0) return rc; @@ -135,11 +141,17 @@ int ll_inode_init_security(struct dentry *dentry, struct inode *inode, struct inode *dir) { + int rc; + if (!selinux_is_enabled()) return 0; - return ll_security_inode_init_security(inode, dir, NULL, NULL, 0, - &ll_initxattrs, dentry); + rc = ll_security_inode_init_security(inode, dir, NULL, NULL, 0, + &ll_initxattrs, dentry); + if (rc == -EOPNOTSUPP) + return 0; + + return rc; } #else /* !HAVE_SECURITY_IINITSEC_CALLBACK */ /** diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c index 08a5a609e3fdb..bb792e751e94f 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -233,9 +233,9 @@ int lmv_revalidate_slaves(struct obd_export *exp, i_size_write(inode, body->mbo_size); inode->i_blocks = body->mbo_blocks; set_nlink(inode, body->mbo_nlink); - LTIME_S(inode->i_atime) = body->mbo_atime; - LTIME_S(inode->i_ctime) = body->mbo_ctime; - LTIME_S(inode->i_mtime) = body->mbo_mtime; + inode->i_atime.tv_sec = body->mbo_atime; + inode->i_ctime.tv_sec = body->mbo_ctime; + inode->i_mtime.tv_sec = body->mbo_mtime; } md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c index 15589c5b9c766..8b073a6d9846f 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -678,6 +678,7 @@ static int lmv_fid2path(struct obd_export *exp, int len, void *karg, if (remote_gf != NULL) { struct getinfo_fid2path *ori_gf; char *ptr; + int len; ori_gf = (struct getinfo_fid2path *)karg; if (strlen(ori_gf->gf_u.gf_path) + @@ -686,13 +687,12 @@ static int lmv_fid2path(struct obd_export *exp, int len, void *karg, ptr = ori_gf->gf_u.gf_path; - memmove(ptr + strlen(gf->gf_u.gf_path) + 1, ptr, - strlen(ori_gf->gf_u.gf_path)); - - strncpy(ptr, gf->gf_u.gf_path, - strlen(gf->gf_u.gf_path)); - ptr += strlen(gf->gf_u.gf_path); - *ptr = '/'; + len = strlen(gf->gf_u.gf_path); + /* move the current path to the right to release space + * for closer-to-root part */ + memmove(ptr + len + 1, ptr, strlen(ori_gf->gf_u.gf_path)); + memcpy(ptr, gf->gf_u.gf_path, len); + ptr[len] = '/'; } CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", @@ -3096,13 +3096,12 @@ static int lmv_merge_attr(struct obd_export *exp, for (i = 0; i < lsm->lsm_md_stripe_count; i++) { struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; - CDEBUG(D_INFO, ""DFID" size %llu, blocks %llu nlink %u," - " atime " LTIME_FMT " ctime " LTIME_FMT - ", mtime " LTIME_FMT ".\n", + CDEBUG(D_INFO, + "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", PFID(&lsm->lsm_md_oinfo[i].lmo_fid), i_size_read(inode), (unsigned long long)inode->i_blocks, - inode->i_nlink, LTIME_S(inode->i_atime), - LTIME_S(inode->i_ctime), LTIME_S(inode->i_mtime)); + inode->i_nlink, (s64)inode->i_atime.tv_sec, + (s64)inode->i_ctime.tv_sec, (s64)inode->i_mtime.tv_sec); /* for slave stripe, it needs to subtract nlink for . and .. 
*/ if (i != 0) @@ -3113,14 +3112,14 @@ static int lmv_merge_attr(struct obd_export *exp, attr->cat_size += i_size_read(inode); attr->cat_blocks += inode->i_blocks; - if (attr->cat_atime < LTIME_S(inode->i_atime)) - attr->cat_atime = LTIME_S(inode->i_atime); + if (attr->cat_atime < inode->i_atime.tv_sec) + attr->cat_atime = inode->i_atime.tv_sec; - if (attr->cat_ctime < LTIME_S(inode->i_ctime)) - attr->cat_ctime = LTIME_S(inode->i_ctime); + if (attr->cat_ctime < inode->i_ctime.tv_sec) + attr->cat_ctime = inode->i_ctime.tv_sec; - if (attr->cat_mtime < LTIME_S(inode->i_mtime)) - attr->cat_mtime = LTIME_S(inode->i_mtime); + if (attr->cat_mtime < inode->i_mtime.tv_sec) + attr->cat_mtime = inode->i_mtime.tv_sec; } return 0; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c index e494abbaedf88..8cdd60fc90171 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -377,25 +377,8 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, tgt = lov->lov_tgts[index]; if (!tgt) continue; - /* - * LU-642, initially inactive OSC could miss the obd_connect, - * we make up for it here. - */ - if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL && - obd_uuid_equals(uuid, &tgt->ltd_uuid)) { - struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; - - obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, - &lov_osc_uuid, &lov->lov_ocd, NULL); - } - if (!tgt->ltd_exp) - continue; - - CDEBUG(D_INFO, "lov idx %d is %s conn %#llx\n", - index, obd_uuid2str(&tgt->ltd_uuid), - tgt->ltd_exp->exp_handle.h_cookie); - if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) - break; + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; } if (index == lov->desc.ld_tgt_count) @@ -404,6 +387,27 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (activate && !tgt->ltd_exp) { + int rc; + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + rc = obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, NULL); + if (rc || tgt->ltd_exp == NULL) + GOTO(out, index = rc); + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, + sizeof(struct cl_client_cache), + lov->lov_cache, NULL); + if (rc < 0) + GOTO(out, index = rc); + } + if (lov->lov_tgts[index]->ltd_activate == activate) { CDEBUG(D_INFO, "OSC %s already %sactivate!\n", uuid->uuid, activate ? "" : "de"); @@ -438,6 +442,10 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, ev, uuid->uuid); } + if (tgt->ltd_exp) + CDEBUG(D_INFO, "%s: lov idx %d conn %llx\n", obd_uuid2str(uuid), + index, tgt->ltd_exp->exp_handle.h_cookie); + out: obd_putref(obd); RETURN(index); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c index 87d496d8a68e4..c1cf76367697e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -1062,8 +1062,13 @@ int lov_io_init(const struct lu_env *env, struct cl_object *obj, PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, io->ci_ignore_layout, io->ci_verify_layout); + /* IO type CIT_MISC with ci_ignore_layout set are usually invoked from + * the OSC layer. 
It shouldn't take lov layout conf lock in that case, + * because as long as the OSC object exists, the layout can't be + * reconfigured. */ return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, - !io->ci_ignore_layout, env, obj, io); + !(io->ci_ignore_layout && io->ci_type == CIT_MISC), + env, obj, io); } /** diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c index 940888afffdac..dd29ff51dcc1c 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -56,13 +56,13 @@ void lov_dump_lmm_common(int level, void *lmmp) struct ost_id oi; lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); - CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", - POSTID(&oi), le32_to_cpu(lmm->lmm_magic), - le32_to_cpu(lmm->lmm_pattern)); - CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", - le32_to_cpu(lmm->lmm_stripe_size), - le16_to_cpu(lmm->lmm_stripe_count), - le16_to_cpu(lmm->lmm_layout_gen)); + CDEBUG_LIMIT(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG_LIMIT(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); } static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, @@ -71,8 +71,9 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, int i; if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { - CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", - stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + CDEBUG_LIMIT(level, + "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); return; } @@ -80,8 +81,8 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, struct ost_id oi; ostid_le_to_cpu(&lod->l_ost_oi, &oi); - CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, - le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + CDEBUG_LIMIT(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); } } @@ -95,7 +96,7 @@ void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) { lov_dump_lmm_common(level, lmm); - CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + CDEBUG_LIMIT(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); lov_dump_lmm_objects(level, lmm->lmm_objects, le16_to_cpu(lmm->lmm_stripe_count)); } @@ -113,8 +114,8 @@ void lov_dump_lmm(int level, void *lmm) lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); break; default: - CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", - magic, LOV_MAGIC_V1); + CDEBUG_LIMIT(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); lov_dump_lmm_common(level, lmm); break; } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c index f02a8de80d4f9..c93ec985f6581 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -344,9 +344,9 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, rec->sa_projid = op_data->op_projid; rec->sa_size = op_data->op_attr.ia_size; rec->sa_blocks = op_data->op_attr_blocks; - rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime); - rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime); - rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime); + rec->sa_atime = 
op_data->op_attr.ia_atime.tv_sec; + rec->sa_mtime = op_data->op_attr.ia_mtime.tv_sec; + rec->sa_ctime = op_data->op_attr.ia_ctime.tv_sec; rec->sa_attr_flags = op_data->op_attr_flags; if ((op_data->op_attr.ia_valid & ATTR_GID) && in_group_p(op_data->op_attr.ia_gid)) diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c index 4a532f0a7b500..cb809c2ce4b89 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -613,14 +613,14 @@ static int mdc_finish_enqueue(struct obd_export *exp, * It's important that we do this first! Otherwise we might exit the * function without doing so, and try to replay a failed create * (bug 3440) */ - if (it->it_op & IT_OPEN && req->rq_replay && + if (it->it_op & IT_OPEN && req->rq_replay && (!it_disposition(it, DISP_OPEN_OPEN) || it->it_status != 0)) mdc_clear_replay_flag(req, it->it_status); - DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d", + DEBUG_REQ(D_RPCTRACE, req, "op: %x disposition: %x, status: %d", it->it_op, it->it_disposition, it->it_status); - /* We know what to expect, so we do any byte flipping required here */ + /* We know what to expect, so we do any byte flipping required here */ if (it_has_reply_body(it)) { struct mdt_body *body; @@ -683,6 +683,8 @@ static int mdc_finish_enqueue(struct obd_export *exp, /* maybe the lock was granted right away and layout * is packed into RMF_DLM_LVB of req */ lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + CDEBUG(D_INFO, "%s: layout return lvb %d transno %lld\n", + class_exp2obd(exp)->obd_name, lvb_len, req->rq_transno); if (lvb_len > 0) { lvb_data = req_capsule_server_sized_get(pill, &RMF_DLM_LVB, lvb_len); diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c index 5b7be3d8af586..db2e665658746 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -131,10 +131,9 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, } if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime " LTIME_FMT ", ctime " - LTIME_FMT "\n", - LTIME_S(op_data->op_attr.ia_mtime), - LTIME_S(op_data->op_attr.ia_ctime)); + CDEBUG(D_INODE, "setting mtime %lld, ctime %lld\n", + (s64)op_data->op_attr.ia_mtime.tv_sec, + (s64)op_data->op_attr.ia_ctime.tv_sec); mdc_setattr_pack(req, op_data, ea, ealen); req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c index 1530f5e8c342a..6c8da5866a8b9 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -990,34 +990,16 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, */ unsigned long offset = hash_x_index(*hash, hash64); struct page *page; -#ifdef HAVE_ADDRESS_SPACE_XARRAY - XA_STATE(xas, &mapping->i_pages, offset); - - xas_lock_irq(&xas); - page = xas_find(&xas, ULONG_MAX); - if (xa_is_value(page)) - page = NULL; - if (page) { -#else int found; - lock_mappings(mapping); -#ifdef HAVE_ADDRESS_SPACE_IPAGES - found = radix_tree_gang_lookup(&mapping->i_pages, -#else + xa_lock_irq(&mapping->i_pages); found = radix_tree_gang_lookup(&mapping->page_tree, -#endif (void **)&page, offset, 1); - if (found > 0 && !radix_tree_exceptional_entry(page)) { -#endif + if (found > 0 
&& !xa_is_value(page)) { struct lu_dirpage *dp; get_page(page); -#ifdef HAVE_ADDRESS_SPACE_XARRAY - xas_unlock_irq(&xas); -#else - unlock_mappings(mapping); -#endif + xa_unlock_irq(&mapping->i_pages); /* * In contrast to find_lock_page() we are sure that directory * page cannot be truncated (while DLM lock is held) and, @@ -1066,12 +1048,8 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, page = ERR_PTR(-EIO); } } else { -#ifdef HAVE_ADDRESS_SPACE_XARRAY - xas_unlock_irq(&xas); -#else - unlock_mappings(mapping); + xa_unlock_irq(&mapping->i_pages); page = NULL; -#endif } return page; } diff --git a/drivers/staging/lustrefsx/lustre/nodist b/drivers/staging/lustrefsx/lustre/nodist new file mode 100644 index 0000000000000..24f55bb96b97d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/nodist @@ -0,0 +1,9 @@ +obd-*/obd-* +CVS +*~ +make.rules +config.* +*.o +*.orig +*.backup +.depfiles diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c index 8e2d803fe3bc3..b6576eb9b52e0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -540,10 +540,10 @@ static int __init obdclass_init(void) /* Default the dirty page cache cap to 1/2 of system memory. * For clients with less memory, a larger fraction is needed * for other purposes (mostly for BGL). */ - if (TOTALRAM_PAGES <= 512 << (20 - PAGE_SHIFT)) - obd_max_dirty_pages = TOTALRAM_PAGES / 4; + if (cfs_totalram_pages() <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = cfs_totalram_pages() / 4; else - obd_max_dirty_pages = TOTALRAM_PAGES / 2; + obd_max_dirty_pages = cfs_totalram_pages() / 2; err = obd_init_caches(); if (err) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c index a8f144fd4c0e0..a48e7cbe7ec18 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -136,24 +136,6 @@ int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) } EXPORT_SYMBOL(dt_txn_hook_stop); -void dt_txn_hook_commit(struct thandle *th) -{ - struct dt_txn_callback *cb; - - if (th->th_local) - return; - - list_for_each_entry(cb, &th->th_dev->dd_txn_callbacks, - dtc_linkage) { - /* Right now, the bottom device (OSD) will use this hook - * commit to notify OSP, so we do not check and replace - * the thandle to top thandle now */ - if (cb->dtc_txn_commit) - cb->dtc_txn_commit(th, cb->dtc_cookie); - } -} -EXPORT_SYMBOL(dt_txn_hook_commit); - int dt_device_init(struct dt_device *dev, struct lu_device_type *t) { INIT_LIST_HEAD(&dev->dd_txn_callbacks); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c index c2b1e35fe86e1..ef84bfe45c930 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/genops.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -366,7 +366,7 @@ struct obd_device *class_newdev(const char *type_name, const char *name, newdev->obd_conn_inprogress = 0; - strncpy(newdev->obd_uuid.uuid, uuid, strlen(uuid)); + strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", newdev->obd_name, newdev); @@ -839,9 +839,9 @@ int obd_init_caches(void) ENTRY; LASSERT(obd_device_cachep == NULL); - obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", - sizeof(struct obd_device), - 0, 0, NULL); + obd_device_cachep = 
kmem_cache_create_usercopy("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, 0, sizeof(struct obd_device), NULL); if (!obd_device_cachep) GOTO(out, rc = -ENOMEM); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c index a531c970c3fb6..e8016c77c7506 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c @@ -111,7 +111,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - if (val > ((TOTALRAM_PAGES / 10) * 9)) { + if (val > ((cfs_totalram_pages() / 10) * 9)) { /* Somebody wants to assign too much memory to dirty pages */ return -EINVAL; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c index 058d87e7fb3d1..e85e08bbd10c6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -386,8 +386,7 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); llh = loghandle->lgh_hdr; - LASSERT(llh); - if (!llog_is_full(loghandle)) + if (llh == NULL || !llog_is_full(loghandle)) GOTO(out_unlock, loghandle); else up_write(&loghandle->lgh_lock); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c index 96ebb4cf9b015..21a137bad0bae 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -888,7 +888,7 @@ static unsigned long lu_htable_order(struct lu_device *top) * * Size of lu_object is (arbitrary) taken as 1K (together with inode). 
*/ - cache_size = TOTALRAM_PAGES; + cache_size = cfs_totalram_pages(); #if BITS_PER_LONG == 32 /* limit hashtable size for lowmem systems to low RAM */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 936f1db0d70be..46b2b941bae57 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -1752,7 +1752,7 @@ static struct lcfg_type_data { { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, - { LCFG_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, @@ -1766,6 +1766,7 @@ static struct lcfg_type_data { { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, { 0, NULL, { NULL, NULL, NULL, NULL } } }; @@ -1793,11 +1794,11 @@ static struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) */ int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size) { - struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); - char *ptr = buf; - char *end = buf + size; - int rc = 0, i; - struct lcfg_type_data *ldata; + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0, i; + struct lcfg_type_data *ldata; LASSERT(rec->lrh_type == OBD_CFG_REC); rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); @@ -1814,35 +1815,82 @@ int class_config_yaml_output(struct llog_rec_hdr *rec, char *buf, int size) /* form YAML entity */ ptr += snprintf(ptr, end - ptr, "- { index: %u, event: %s", rec->lrh_index, ldata->ltd_name); + if (end - ptr <= 0) + goto out_overflow; - if (lcfg->lcfg_flags) + if (lcfg->lcfg_flags) { ptr += snprintf(ptr, end - ptr, ", flags: %#08x", lcfg->lcfg_flags); - if (lcfg->lcfg_num) + if (end - ptr <= 0) + goto out_overflow; + } + if (lcfg->lcfg_num) { ptr += snprintf(ptr, end - ptr, ", num: %#08x", lcfg->lcfg_num); + if (end - ptr <= 0) + goto out_overflow; + } if (lcfg->lcfg_nid) { char nidstr[LNET_NIDSTR_SIZE]; libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); ptr += snprintf(ptr, end - ptr, ", nid: %s(%#llx)", nidstr, lcfg->lcfg_nid); + if (end - ptr <= 0) + goto out_overflow; } - if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) + if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { ptr += snprintf(ptr, end - ptr, ", device: %s", lustre_cfg_string(lcfg, 0)); + if (end - ptr <= 0) + goto out_overflow; + } + + if (lcfg->lcfg_command == LCFG_SET_PARAM) { + /* + * set_param -P parameters have param=val here, separate + * them through pointer magic and print them out in + * native yamlese + */ + char *cfg_str = lustre_cfg_string(lcfg, 1); + char *tmp = strchr(cfg_str, '='); + size_t len; + + if (tmp == NULL) + goto out_done; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[0]); + len = tmp - cfg_str + 1; + snprintf(ptr, len, "%s", cfg_str); + ptr += len - 1; + + ptr += snprintf(ptr, end - ptr, ", %s: ", ldata->ltd_bufs[1]); + ptr += snprintf(ptr, end - ptr, "%s", tmp + 1); + + goto out_done; + } for (i = 1; i < lcfg->lcfg_bufcount; i++) { - if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) + if 
(LUSTRE_CFG_BUFLEN(lcfg, i) > 0) { ptr += snprintf(ptr, end - ptr, ", %s: %s", ldata->ltd_bufs[i - 1], lustre_cfg_string(lcfg, i)); + if (end - ptr <= 0) + goto out_overflow; + } } +out_done: ptr += snprintf(ptr, end - ptr, " }\n"); - /* return consumed bytes */ +out_overflow: + /* Return consumed bytes. If the buffer overflowed, zero last byte */ rc = ptr - buf; + if (rc > size) { + rc = -EOVERFLOW; + *(end - 1) = '\0'; + } + return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c index 204fc889da45b..b1f59d8f6b303 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -43,7 +43,9 @@ #define PRINT_MASK (D_SUPER | D_CONFIG) #include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED #include +#endif #include #include #ifdef HAVE_KERNEL_LOCKED @@ -559,7 +561,7 @@ void lustre_notify_lwp_list(struct obd_export *exp) } EXPORT_SYMBOL(lustre_notify_lwp_list); -static int lustre_lwp_connect(struct obd_device *lwp) +static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt) { struct lu_env env; struct lu_context session_ctx; @@ -585,11 +587,14 @@ static int lustre_lwp_connect(struct obd_device *lwp) data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; data->ocd_version = LUSTRE_VERSION_CODE; - data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE | - OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LFSCK | - OBD_CONNECT_BULK_MBITS; + data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + + if (is_mdt) + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; + OBD_ALLOC_PTR(uuid); if (uuid == NULL) GOTO(out, rc = -ENOMEM); @@ -673,7 +678,7 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, obd = class_name2obd(lwpname); LASSERT(obd != NULL); - rc = lustre_lwp_connect(obd); + rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); if (rc == 0) { obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; spin_lock(&lsi->lsi_lwp_lock); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c index a7f7be3973222..7d14851f799f0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -69,21 +69,20 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) u64 newvalid = 0; if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) - CDEBUG(D_INODE, "valid %#llx, new time " - LTIME_FMT "/" LTIME_FMT "\n", - valid, LTIME_S(src->i_mtime), - LTIME_S(src->i_ctime)); + CDEBUG(D_INODE, "valid %#llx, new time %lld/%lld\n", + valid, (s64) src->i_mtime.tv_sec, + (s64) src->i_ctime.tv_sec); if (valid & OBD_MD_FLATIME) { - dst->o_atime = LTIME_S(src->i_atime); + dst->o_atime = src->i_atime.tv_sec; newvalid |= OBD_MD_FLATIME; } if (valid & OBD_MD_FLMTIME) { - dst->o_mtime = LTIME_S(src->i_mtime); + dst->o_mtime = src->i_mtime.tv_sec; newvalid |= OBD_MD_FLMTIME; } if (valid & OBD_MD_FLCTIME) { - dst->o_ctime = LTIME_S(src->i_ctime); + dst->o_ctime = src->i_ctime.tv_sec; newvalid |= OBD_MD_FLCTIME; } if (valid & OBD_MD_FLSIZE) { diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c 
b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c index 769d59ed21b7e..26065b110e592 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -1454,45 +1454,41 @@ static int echo_attr_get_complex(const struct lu_env *env, { struct echo_thread_info *info = echo_env_info(env); struct lu_buf *buf = &info->eti_buf; - umode_t mode = lu_object_attr(&next->mo_lu); - int need = ma->ma_need; + umode_t mode = lu_object_attr(&next->mo_lu); int rc = 0, rc2; ENTRY; ma->ma_valid = 0; - if (need & MA_INODE) { - ma->ma_need = MA_INODE; + if (ma->ma_need & MA_INODE) { rc = mo_attr_get(env, next, ma); if (rc) GOTO(out, rc); ma->ma_valid |= MA_INODE; } - if (need & MA_LOV) { - if (S_ISREG(mode) || S_ISDIR(mode)) { - LASSERT(ma->ma_lmm_size > 0); - buf->lb_buf = ma->ma_lmm; - buf->lb_len = ma->ma_lmm_size; - rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); - if (rc2 > 0) { - ma->ma_lmm_size = rc2; - ma->ma_valid |= MA_LOV; - } else if (rc2 == -ENODATA) { - /* no LOV EA */ - ma->ma_lmm_size = 0; - } else if (rc2 == -ERANGE) { - rc2 = echo_big_lmm_get(env, next, ma); - if (rc2 < 0) - GOTO(out, rc = rc2); - } else { + if ((ma->ma_need & MA_LOV) && (S_ISREG(mode) || S_ISDIR(mode))) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LOV; + } else if (rc2 == -ENODATA) { + /* no LOV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) GOTO(out, rc = rc2); - } + } else { + GOTO(out, rc = rc2); } } - if (need & MA_LMV && S_ISDIR(mode)) { + if ((ma->ma_need & MA_LMV) && S_ISDIR(mode)) { LASSERT(ma->ma_lmm_size > 0); buf->lb_buf = ma->ma_lmm; buf->lb_len = ma->ma_lmm_size; @@ -1513,7 +1509,7 @@ static int echo_attr_get_complex(const struct lu_env *env, } #ifdef CONFIG_FS_POSIX_ACL - if (need & MA_ACL_DEF && S_ISDIR(mode)) { + if ((ma->ma_need & MA_ACL_DEF) && S_ISDIR(mode)) { buf->lb_buf = ma->ma_acl; buf->lb_len = ma->ma_acl_size; rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT); @@ -1529,7 +1525,6 @@ static int echo_attr_get_complex(const struct lu_env *env, } #endif out: - ma->ma_need = need; CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = %#llx ma_lmm=%p\n", rc, ma->ma_valid, ma->ma_lmm); RETURN(rc); diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c index 6c71d4c6b19d7..5c9a8c6c2219c 100644 --- a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -160,7 +160,7 @@ static ssize_t osc_max_dirty_mb_seq_write(struct file *file, if (pages_number <= 0 || pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || - pages_number > TOTALRAM_PAGES / 4) /* 1/4 of RAM */ + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; spin_lock(&cli->cl_loi_list_lock); diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c index db6ca2da2e4db..b50f4d6ee5019 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_request.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -697,11 +697,12 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_undirty = 0; } else { unsigned long nrpages; + unsigned long undirty; nrpages = cli->cl_max_pages_per_rpc; nrpages *= cli->cl_max_rpcs_in_flight + 1; 
nrpages = max(nrpages, cli->cl_dirty_max_pages); - oa->o_undirty = nrpages << PAGE_SHIFT; + undirty = nrpages << PAGE_SHIFT; if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, GRANT_PARAM)) { int nrextents; @@ -710,8 +711,13 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, * grant space */ nrextents = (nrpages + cli->cl_max_extent_pages - 1) / cli->cl_max_extent_pages; - oa->o_undirty += nrextents * cli->cl_grant_extent_tax; + undirty += nrextents * cli->cl_grant_extent_tax; } + /* Do not ask for more than OBD_MAX_GRANT - a margin for server + * to add extent tax, etc. + */ + oa->o_undirty = min(undirty, OBD_MAX_GRANT - + (PTLRPC_MAX_BRW_PAGES << PAGE_SHIFT)*4UL); } oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; oa->o_dropped = cli->cl_lost_grant; @@ -1048,7 +1054,7 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) * safe to combine */ if (unlikely((p1->flag & mask) != (p2->flag & mask))) { CWARN("Saw flags 0x%x and 0x%x in the same brw, please " - "report this at https://jira.hpdd.intel.com/\n", + "report this at https://jira.whamcloud.com/\n", p1->flag, p2->flag); } return 0; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c index d127e5e63bfdb..9642a5644009f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -1896,10 +1896,6 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_lock(&req->rq_lock); req->rq_resend = 1; spin_unlock(&req->rq_lock); - - if (req->rq_bulk != NULL && - !ptlrpc_unregister_bulk(req, 1)) - continue; } /* * rq_wait_ctx is only touched by ptlrpcd, @@ -1926,6 +1922,12 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_unlock(&req->rq_lock); } + /* In any case, the previous bulk should be + * cleaned up to prepare for the new sending */ + if (req->rq_bulk != NULL && + !ptlrpc_unregister_bulk(req, 1)) + continue; + rc = ptl_send_rpc(req, 0); if (rc == -ENOMEM) { spin_lock(&imp->imp_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c index 8338095a43bb1..827a989f1e139 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -37,6 +37,7 @@ #define DEBUG_SUBSYSTEM S_RPC #include +#include #include #include #include @@ -292,6 +293,10 @@ void ptlrpc_invalidate_import(struct obd_import *imp) if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) ptlrpc_deactivate_import(imp); + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CONNECT_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + msleep(10 * MSEC_PER_SEC); + } CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); LASSERT(imp->imp_invalid); @@ -666,6 +671,7 @@ int ptlrpc_connect_import(struct obd_import *imp) CERROR("already connected\n"); RETURN(0); } else if (imp->imp_state == LUSTRE_IMP_CONNECTING || + imp->imp_state == LUSTRE_IMP_EVICTED || imp->imp_connected) { spin_unlock(&imp->imp_lock); CERROR("already connecting\n"); @@ -796,18 +802,6 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, static bool warned; struct client_obd *cli = &imp->imp_obd->u.cli; - if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && - !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { - LCONSOLE_WARN("%s: MDS %s does not support ibits " - "lock, either very old or invalid: " - "requested %#llx, replied %#llx\n", - 
imp->imp_obd->obd_name, - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_connect_flags_orig, - ocd->ocd_connect_flags); - return -EPROTO; - } - spin_lock(&imp->imp_lock); list_del(&imp->imp_conn_current->oic_item); list_add(&imp->imp_conn_current->oic_item, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index 93871b4c7d090..711c77650a569 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -321,7 +321,7 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, * far. */ bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > TOTALRAM_PAGES/(2 * bufpages)) + if (val > cfs_totalram_pages() / (2 * bufpages)) return -ERANGE; spin_lock(&svc->srv_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c index c526e9e5c65f7..aacb929beae23 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -346,6 +346,8 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) if (rc) GOTO(out, rc); + OBD_RACE(OBD_FAIL_PTLRPC_CONNECT_RACE); + rc = ptlrpc_connect_import(imp); if (rc) GOTO(out, rc); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c index 488b2398e4342..42841f0c0aaf1 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -155,7 +155,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "max waitqueue depth: %u\n" "max wait time: %ld/%lu\n" "out of mem: %lu\n", - TOTALRAM_PAGES, PAGES_PER_POOL, + cfs_totalram_pages(), PAGES_PER_POOL, page_pools.epp_max_pages, page_pools.epp_max_pools, page_pools.epp_total_pages, @@ -775,9 +775,9 @@ int sptlrpc_enc_pool_init(void) DEF_SHRINKER_VAR(shvar, enc_pools_shrink, enc_pools_shrink_count, enc_pools_shrink_scan); - page_pools.epp_max_pages = TOTALRAM_PAGES / 8; + page_pools.epp_max_pages = cfs_totalram_pages() / 8; if (enc_pool_max_memory_mb > 0 && - enc_pool_max_memory_mb <= (TOTALRAM_PAGES >> mult)) + enc_pool_max_memory_mb <= (cfs_totalram_pages() >> mult)) page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c new file mode 100644 index 0000000000000..3a9daf899c26e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#ifdef CONFIG_FS_POSIX_ACL +# include +# include +#endif /* CONFIG_FS_POSIX_ACL */ + +#include +#include +#include +#include +#include diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c index 8e60dfff9995e..c267ed20bf485 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_lib.c +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -657,10 +657,6 @@ int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, if (rc != 0) return rc; - if (attr->la_valid & LA_FLAGS && - attr->la_flags & LUSTRE_SET_SYNC_FL) - th->th_sync |= 1; - arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, file, line); if (IS_ERR(arg)) diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c index 78876235dcfd7..bedf54ee863d1 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -890,9 +890,10 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, tgd->tgd_grant_compat_disable) || left == 0 || exp->exp_failed) RETURN(0); - if (want > 0x7fffffff) { - CERROR("%s: client %s/%p requesting > 2GB grant %llu\n", - obd->obd_name, exp->exp_client_uuid.uuid, exp, want); + if (want > OBD_MAX_GRANT) { + CERROR("%s: client %s/%p requesting > max (%lu), %llu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + OBD_MAX_GRANT, want); RETURN(0); } @@ -927,6 +928,14 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, if ((grant > chunk) && conservative) grant = chunk; + /* + * Limit grant so that export' grant does not exceed what the + * client would like to have by more than grants for 2 full + * RPCs + */ + if (ted->ted_grant + grant > want + chunk) + grant = want + chunk - ted->ted_grant; + tgd->tgd_tot_granted += grant; ted->ted_grant += grant; diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c index 4d3923723b2f1..12f9fdc1c2138 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_main.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -218,7 +218,6 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, /* prepare transactions callbacks */ lut->lut_txn_cb.dtc_txn_start = tgt_txn_start_cb; lut->lut_txn_cb.dtc_txn_stop = tgt_txn_stop_cb; - lut->lut_txn_cb.dtc_txn_commit = NULL; lut->lut_txn_cb.dtc_cookie = lut; lut->lut_txn_cb.dtc_tag = LCT_DT_THREAD | LCT_MD_THREAD; INIT_LIST_HEAD(&lut->lut_txn_cb.dtc_linkage); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index c0f5f459ae9c1..64189a1ac2606 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -323,6 +323,9 @@ /* get_user_pages takes gup_flags in arguments */ #undef HAVE_GET_USER_PAGES_GUP_FLAGS +/* get_user_pages takes gup_flags in arguments with 7 args */ +#undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS + /* struct group_info has member 
gid */ #undef HAVE_GROUP_INFO_GID @@ -359,6 +362,9 @@ /* struct ib_device.attrs is defined */ #undef HAVE_IB_DEVICE_ATTRS +/* if struct ib_device_ops is defined */ +#undef HAVE_IB_DEVICE_OPS + /* ib_get_dma_mr is defined */ #undef HAVE_IB_GET_DMA_MR @@ -371,9 +377,15 @@ /* ib_map_mr_sg has 5 arguments */ #undef HAVE_IB_MAP_MR_SG_5ARGS +/* ib_post_send and ib_post_recv have const parameters */ +#undef HAVE_IB_POST_SEND_RECV_CONST + /* struct ib_rdma_wr is defined */ #undef HAVE_IB_RDMA_WR +/* if ib_sg_dma_address wrapper exists */ +#undef HAVE_IB_SG_DMA_ADDRESS + /* inode_operations .getattr member function can gather advance stats */ #undef HAVE_INODEOPS_ENHANCED_GETATTR @@ -428,6 +440,9 @@ /* inode_operations has {get,set,remove}xattr members */ #undef HAVE_IOP_XATTR +/* if iov_iter has member type */ +#undef HAVE_IOV_ITER_HAS_TYPE_MEMBER + /* iov_iter_init handles directional tag */ #undef HAVE_IOV_ITER_INIT_DIRECTION @@ -437,9 +452,15 @@ /* iov_iter_truncate exists */ #undef HAVE_IOV_ITER_TRUNCATE +/* if iov_iter_type exists */ +#undef HAVE_IOV_ITER_TYPE + /* is_sxid is defined */ #undef HAVE_IS_SXID +/* struct address_space has i_pages */ +#undef HAVE_I_PAGES + /* i_uid_read is present */ #undef HAVE_I_UID_READ @@ -449,6 +470,9 @@ /* 'struct sock' accept function requires bool argument */ #undef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG +/* 'getname' has two args */ +#undef HAVE_KERN_SOCK_GETNAME_2ARGS + /* struct key_match_data exist */ #undef HAVE_KEY_MATCH_DATA @@ -489,6 +513,9 @@ /* kernel has kstrtoul */ #undef HAVE_KSTRTOUL +/* kernel has ksys_close */ +#undef HAVE_KSYS_CLOSE + /* kthread_worker found */ #undef HAVE_KTHREAD_WORK @@ -546,6 +573,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_LINUX_RANDOM_H +/* if linux/selinux.h exists */ +#undef HAVE_LINUX_SELINUX_IS_ENABLED + /* Define to 1 if you have the header file. */ #undef HAVE_LINUX_TYPES_H @@ -555,6 +585,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_LINUX_VERSION_H +/* lock_manager_operations has lm_compare_owner */ +#undef HAVE_LM_COMPARE_OWNER + /* lock-manager ops renamed to lm_xxx */ #undef HAVE_LM_XXX_LOCK_MANAGER_OPS @@ -734,6 +767,9 @@ /* stacktrace_ops.warning is exist */ #undef HAVE_STACKTRACE_WARNING +/* stack_trace_print() exists */ +#undef HAVE_STACK_TRACE_PRINT + /* Define to 1 if you have the header file. 
*/ #undef HAVE_STDINT_H @@ -794,6 +830,9 @@ /* tcp_sendpage use socket as first parameter */ #undef HAVE_TCP_SENDPAGE_USE_SOCKET +/* timer_setup has replaced setup_timer */ +#undef HAVE_TIMER_SETUP + /* 'struct timespec64' is available */ #undef HAVE_TIMESPEC64 @@ -806,12 +845,18 @@ /* topology_sibling_cpumask is available */ #undef HAVE_TOPOLOGY_SIBLING_CPUMASK +/* if totalram_pages is a function */ +#undef HAVE_TOTALRAM_PAGES_AS_FUNC + /* kernel export truncate_complete_page */ #undef HAVE_TRUNCATE_COMPLETE_PAGE /* kernel has truncate_inode_pages_final */ #undef HAVE_TRUNCATE_INODE_PAGES_FINAL +/* if MS_RDONLY was moved to uapi/linux/mount.h */ +#undef HAVE_UAPI_LINUX_MOUNT_H + /* uidgid.h is present */ #undef HAVE_UIDGID_HEADER @@ -833,6 +878,9 @@ /* virtual_address has been replaced by address field */ #undef HAVE_VM_FAULT_ADDRESS +/* if vm_fault_t type exists */ +#undef HAVE_VM_FAULT_T + /* 'struct vm_operations' remove struct vm_area_struct argument */ #undef HAVE_VM_OPS_USE_VM_FAULT_ONLY @@ -845,9 +893,15 @@ /* needs inode parameter */ #undef HAVE_XATTR_HANDLER_INODE_PARAM +/* xattr_handler has a name member */ +#undef HAVE_XATTR_HANDLER_NAME + /* handler pointer is parameter */ #undef HAVE_XATTR_HANDLER_SIMPLIFIED +/* xa_is_value exist */ +#undef HAVE_XA_IS_VALUE + /* Have zap_add_by_dnode() in ZFS */ #undef HAVE_ZAP_ADD_BY_DNODE From eb5e5d3a2fce1d3c7cd731ed836a4c33568c12f3 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 13 Oct 2020 20:29:59 +0000 Subject: [PATCH 065/737] lustre: remove CRYPTO_TFM_RES_BAD_KEY_LEN LU-13344 libcfs: adler32: don't set CRYPTO_TFM_RES_BAD_KEY_LEN This flag was not being checked by most callers and has been removed since Commit 674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN") so remove from lustre too. 
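A minimal caller-side sketch of why the flag carried no information (the helper name and message below are illustrative, not taken from the tree): the only signal a caller ever acts on is the errno returned by crypto_shash_setkey().

#include <linux/printk.h>
#include <crypto/hash.h>

static int example_set_checksum_key(struct crypto_shash *tfm,
                                    const u8 *key, unsigned int keylen)
{
        /* A short key simply yields -EINVAL; no tfm flag is consulted. */
        int rc = crypto_shash_setkey(tfm, key, keylen);

        if (rc)
                pr_err("checksum setkey failed: rc = %d\n", rc);
        return rc;
}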
Change-Id: Ia616430a98d05c7bb3b22a0be543e83855272d2f Test-Parameters: trivial Signed-off-by: Mr NeilBrown Reviewed-on: https://review.whamcloud.com/37870 Reviewed-by: James Simmons Reviewed-by: Shaun Tancheff Tested-by: jenkins Tested-by: Maloo Reviewed-by: Sebastien Buisson Reviewed-by: Oleg Drokin [fllinden - do the same thing for the crc32 key code which is still in the lustre 2.10 branch that we use] Signed-off-by: Frank van der Linden --- .../lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c | 5 ++--- .../lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c | 5 ++--- .../libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c | 5 ++--- .../lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c index 90f502f35e580..b71d7f8bc9d68 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -51,10 +51,9 @@ static int adler32_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } + *mctx = *(u32 *)key; return 0; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c index 58e4691cfb3de..85fc287cb8847 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -63,10 +63,9 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } + *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c index fc55ad7969fab..40d9e7416068b 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -61,10 +61,9 @@ static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } + *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c index a238e4e39fce0..88e697897b15d 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -100,10 +100,9 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } + *mctx = le32_to_cpup((__le32 *)key); return 0; } From e0c83efc22eb4dd09970a5a7b7c03938498127a2 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 13 Oct 2020 21:36:13 
+0000 Subject: [PATCH 066/737] lustre: add time_t define time_t is gone in more recent kernels (> 5.5). However, it is used in the lustre code. The upstream lustre code has undergone a lot of time type changes since 2.10 (which is the branch we're still using), so a full backport of all those changes is not advisable. Instead, just add a time_t define in the lustre code. This will only work on 64bit kernels, so error out if __BITS_PER_LONG is not 64. Signed-off-by: Frank van der Linden --- .../lustrefsx/libcfs/include/libcfs/linux/linux-time.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index 64613de7bd6a8..cc0f42d6360d2 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -143,6 +143,12 @@ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts6 #endif /* HAVE_TIMESPEC64 */ +#if __BITS_PER_LONG == 64 +#define time_t long +#else +#error "lustre is not supported on 32bit" +#endif + #ifndef HAVE_KTIME_ADD # define ktime_add(lhs, rhs) ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) #endif /* !HAVE_KTIME_ADD */ From 49b67933c953aa8536627080410a9bc50a1e30f2 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 13 Oct 2020 22:04:24 +0000 Subject: [PATCH 067/737] lustre: stop using struct timeval LU-13344 lnet: stop using struct timeval The struct timeval is not 2038 safe so the Linux kernel is moving away from its use. The use of rpe_stamp hasn't been used since Lustre 2.2 so remove the userland use of this field. This frees use to change rpe_stamp to an equivalent struct timespec64 for future use. Greatly simplify lnet_sock_[read|write] by using jiffies values of sk_sndtimeo, sk_rcvtimeo cached in struct sock. 
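A minimal sketch of the simplified timeout handling described above (the function name is illustrative, and plain HZ arithmetic stands in for Lustre's cfs_time_seconds()): the jiffies values cached in struct sock are written directly under lock_sock(), replacing the kernel_setsockopt(SO_SNDTIMEO/SO_RCVTIMEO) calls that needed a struct timeval.

#include <net/sock.h>

static void example_set_sock_timeouts(struct socket *sock, int timeout_sec)
{
        struct sock *sk = sock->sk;
        long jiffies_left = timeout_sec * HZ;

        lock_sock(sk);
        sk->sk_sndtimeo = jiffies_left;   /* send timeout, in jiffies */
        sk->sk_rcvtimeo = jiffies_left;   /* receive timeout, in jiffies */
        release_sock(sk);
}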
Change-Id: Ib58193756ec4a526e55bc810c05abd3920b2b269 Signed-off-by: James Simmons Reviewed-on: https://review.whamcloud.com/38105 Tested-by: jenkins Reviewed-by: Andreas Dilger Reviewed-by: Shaun Tancheff Tested-by: Maloo Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 + .../libcfs/include/libcfs/linux/linux-time.h | 20 +++++ .../lustrefsx/lnet/include/lnet/lnetst.h | 8 +- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 76 ++++--------------- .../staging/lustrefsx/lnet/selftest/conrpc.c | 6 +- 5 files changed, 48 insertions(+), 65 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index d3295c5726e99..d040e2a786053 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -548,6 +548,9 @@ /* 'ktime_to_timespec64' is available */ #define HAVE_KTIME_TO_TIMESPEC64 1 +/* ns_to_timespec64 is available */ +#define HAVE_NS_TO_TIMESPEC64 + /* enable use of ldiskfsprogs package */ /* #undef HAVE_LDISKFSPROGS */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index cc0f42d6360d2..d22bda9895fe7 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -149,6 +149,26 @@ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts6 #error "lustre is not supported on 32bit" #endif +#ifndef HAVE_NS_TO_TIMESPEC64 +static inline struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +#endif + #ifndef HAVE_KTIME_ADD # define ktime_add(lhs, rhs) ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) #endif /* !HAVE_KTIME_ADD */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h index a43978ff592f4..7071039d9aa38 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h @@ -120,7 +120,13 @@ struct lstcon_test_batch_ent { struct lstcon_rpc_ent { struct list_head rpe_link; /* link chain */ struct lnet_process_id rpe_peer; /* peer's id */ - struct timeval rpe_stamp; /* time stamp of RPC */ + /* This has not been used since Lustre 2.2 so its safe to use. + * Update to allow future use of timespec64 + */ + struct { + __s64 tv_sec; + __s64 tv_nsec; + } rpe_stamp; /* time stamp of RPC */ int rpe_state; /* peer's state */ int rpe_rpc_errno; /* RPC errno */ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index c46ab84714768..fa96105097736 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -43,27 +43,6 @@ #include #include -/* - * kernel 5.1: commit 7f1bc6e95d7840d4305595b3e4025cddda88cee5 - * Y2038 64-bit time. - * SO_TIMESTAMP, SO_TIMESTAMPNS and SO_TIMESTAMPING options, the - * way they are currently defined, are not y2038 safe. - * Subsequent patches in the series add new y2038 safe versions - * of these options which provide 64 bit timestamps on all - * architectures uniformly. - * Hence, rename existing options with OLD tag suffixes. 
- * - * NOTE: When updating to timespec64 change change these to '_NEW'. - * - */ -#ifndef SO_SNDTIMEO -#define SO_SNDTIMEO SO_SNDTIMEO_OLD -#endif - -#ifndef SO_RCVTIMEO -#define SO_RCVTIMEO SO_RCVTIMEO_OLD -#endif - static int lnet_sock_create_kern(struct socket **sock, struct net *ns) { @@ -318,10 +297,9 @@ EXPORT_SYMBOL(lnet_ipif_enumerate); int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) { - int rc; - long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); - unsigned long then; - struct timeval tv; + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; LASSERT(nob > 0); /* Caller may pass a zero timeout if she thinks the socket buffer is @@ -337,24 +315,11 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) }; if (timeout != 0) { - /* Set send timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = jiffies_left / - msecs_to_jiffies(MSEC_PER_SEC), - .tv_usec = ((jiffies_left % - msecs_to_jiffies(MSEC_PER_SEC)) * - USEC_PER_SEC) / - msecs_to_jiffies(MSEC_PER_SEC) - }; - - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof(tv)); - if (rc != 0) { - CERROR("Can't set socket send timeout " - "%ld.%06d: %d\n", - (long)tv.tv_sec, (int)tv.tv_usec, rc); - return rc; - } + struct sock *sk = sock->sk; + + lock_sock(sk); + sk->sk_sndtimeo = jiffies_left; + release_sock(sk); } then = jiffies; @@ -385,10 +350,9 @@ EXPORT_SYMBOL(lnet_sock_write); int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) { - int rc; - long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); - unsigned long then; - struct timeval tv; + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; LASSERT(nob > 0); LASSERT(jiffies_left > 0); @@ -401,22 +365,12 @@ lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) struct msghdr msg = { .msg_flags = 0 }; + struct sock *sk = sock->sk; /* Set receive timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = jiffies_left / msecs_to_jiffies(MSEC_PER_SEC), - .tv_usec = ((jiffies_left % - msecs_to_jiffies(MSEC_PER_SEC)) * - USEC_PER_SEC) / - msecs_to_jiffies(MSEC_PER_SEC) - }; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - if (rc != 0) { - CERROR("Can't set socket recv timeout %ld.%06d: %d\n", - (long)tv.tv_sec, (int)tv.tv_usec, rc); - return rc; - } + lock_sock(sk); + sk->sk_rcvtimeo = jiffies_left; + release_sock(sk); then = jiffies; rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c index f9f6c71db2557..996acd87528eb 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -476,7 +476,7 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, srpc_msg_t *msg; lstcon_node_t *nd; cfs_duration_t dur; - struct timeval tv; + struct timespec64 ts; int error; LASSERT(head_up != NULL); @@ -503,11 +503,11 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, (cfs_time_t)console_session.ses_id.ses_stamp); - jiffies_to_timeval(dur, &tv); + jiffies_to_timespec64(dur, &ts); if (copy_to_user(&ent->rpe_peer, &nd->nd_id, sizeof(struct lnet_process_id)) || - copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) || + copy_to_user(&ent->rpe_stamp, &ts, sizeof(ts)) || copy_to_user(&ent->rpe_state, &nd->nd_state, sizeof(nd->nd_state)) || 
copy_to_user(&ent->rpe_rpc_errno, &error, From e7a6f8456c5dddb9a25a6dd3f63ee09f0dca1740 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 13 Oct 2020 22:21:47 +0000 Subject: [PATCH 068/737] lustre: handle removal of NR_UNSTABLE_NFS LU-13783 osc: handle removal of NR_UNSTABLE_NFS In Linux 5.8 the NR_UNSTABLE_NFS page counters are go. All pages that have been writen but are not yet safe are now counted in NR_WRITEBACK. So change osc_page to count in NR_WRITEBACK, but if NR_UNSTABLE_NFS still exists in the kernel, use a #define to direct the updates to that counter. Signed-off-by: Mr NeilBrown Change-Id: I49cbc267fafaee949f45b2e559511aedcf4d8fed Reviewed-on: https://review.whamcloud.com/39260 Tested-by: jenkins Reviewed-by: Shaun Tancheff Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Oleg Drokin Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 3 +++ drivers/staging/lustrefsx/lustre/osc/osc_page.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index d040e2a786053..4359ad014348f 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -922,6 +922,9 @@ /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ +/* NR_UNSTABLE_NFS is still in use */ +/* #undef HAVE_NR_UNSTABLE_NFS */ + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c index 20ed2d75db79f..c89d11333357d 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_page.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -879,6 +879,13 @@ void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) * In practice this can work pretty good because the pages in the same RPC * are likely from the same page zone. */ +#ifdef HAVE_NR_UNSTABLE_NFS +/* Old kernels use a separate counter for unstable pages, + * newer kernels treat them like any other writeback. + */ +#define NR_WRITEBACK NR_UNSTABLE_NFS +#endif + static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, int factor) { @@ -898,7 +905,7 @@ static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, } if (count > 0) { - mod_zone_page_state(zone, NR_UNSTABLE_NFS, + mod_zone_page_state(zone, NR_WRITEBACK, factor * count); count = 0; } @@ -906,7 +913,7 @@ static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, ++count; } if (count > 0) - mod_zone_page_state(zone, NR_UNSTABLE_NFS, factor * count); + mod_zone_page_state(zone, NR_WRITEBACK, factor * count); } static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) From e8a972ba0acd8932a050e37fb1fbcf2b7d3eda25 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 13 Oct 2020 22:24:46 +0000 Subject: [PATCH 069/737] lustre: Fix compilation with MOFED 5.1 LU-13761 o2ib: Fix compilation with MOFED 5.1 A new argument was added to rdma_reject() in MOFED 5.1 and Linux 5.8. Add a cofigure check and support both versions of rdma_reject(). 
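A minimal sketch of the kind of compile probe the configure check boils down to (the function name is illustrative): if this builds against the target RDMA headers, the four-argument rdma_reject() from Linux 5.8 / MOFED 5.1 is available and HAVE_RDMA_REJECT_4ARGS can be defined.

#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>

static int probe_rdma_reject_4args(struct rdma_cm_id *cmid,
                                   const void *rej, u8 len)
{
        /* Fails to compile against pre-5.8 headers, where rdma_reject()
         * takes only three arguments. */
        return rdma_reject(cmid, rej, len, IB_CM_REJ_CONSUMER_DEFINED);
}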
Test-Parameters: trivial Signed-off-by: Sergey Gorenko Change-Id: I2b28991f335658b651b21a09899b7b17ab2a9d57 Reviewed-on: https://review.whamcloud.com/39323 Reviewed-by: Neil Brown Reviewed-by: Alexey Lyashkov Reviewed-by: James Simmons Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 5 ++++- drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 4359ad014348f..99c60fb260a7e 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -922,9 +922,12 @@ /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ -/* NR_UNSTABLE_NFS is still in use */ +/* NR_UNSTABLE_NFS is still in use. */ /* #undef HAVE_NR_UNSTABLE_NFS */ +/* rdma_reject has 4 arguments */ +#define HAVE_RDMA_REJECT_4ARGS + /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index 751a4211c8356..a27a83748c37d 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -2217,7 +2217,11 @@ kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) { int rc; +#ifdef HAVE_RDMA_REJECT_4ARGS + rc = rdma_reject(cmid, rej, sizeof(*rej), IB_CM_REJ_CONSUMER_DEFINED); +#else rc = rdma_reject(cmid, rej, sizeof(*rej)); +#endif if (rc != 0) CWARN("Error %d sending reject\n", rc); From 9bc910aa0f7cdd4091f1623514132668cb5cb6c0 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 14 Oct 2020 19:31:21 +0000 Subject: [PATCH 070/737] lustre: seperate debugfs and procfs handling This first part is a partial backport of: LU-13344 all: Separate debugfs and procfs handling Linux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 proc: decouple proc from VFS with "struct proc_ops" Separate debugfs usage and procfs usage to prepare for the divergence of debugfs using file_operations and procfs using proc_ops HPE-bug-id: LUS-8589 Signed-off-by: Shaun Tancheff Change-Id: I1746e563b55a9e89f90ac01843c304fe6b690d8b Reviewed-on: https://review.whamcloud.com/37834 Reviewed-by: Petros Koutoupis Reviewed-by: Neil Brown Reviewed-by: James Simmons Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin The backport is partial, because the debugfs code isn't actually used in the Lustre 2.10 branch. But this is needed for the file_operations -> proc_ops change in Linux 5.6, which hasn't been completed yet in upstream Lustre. Get ready for that change by also changing file_operations to proc_ops in lprocfs_var. The second part os converting all remaining struct file_operations use for proc ops to struct proc_ops. 
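A minimal sketch of the split (entry name and show routine are illustrative): from Linux 5.6 on, proc_create_data() expects a struct proc_ops whose members carry a proc_ prefix, while debugfs_create_file() continues to take a struct file_operations, so the two interfaces need separate operation tables and separate variable containers.

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_seq_show(struct seq_file *m, void *v)
{
        seq_puts(m, "example\n");
        return 0;
}

static int example_seq_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_seq_show, PDE_DATA(inode));
}

static const struct proc_ops example_proc_ops = {
        .proc_open      = example_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = single_release,
};

/* registered with: proc_create_data("example", 0444, parent, &example_proc_ops, data); */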
Signed-off-by: Frank van der Linden --- .../lustrefsx/lustre/fid/fid_handler.c | 2 +- .../staging/lustrefsx/lustre/fid/lproc_fid.c | 11 ++-- .../lustrefsx/lustre/fld/fld_internal.h | 2 +- .../staging/lustrefsx/lustre/fld/lproc_fld.c | 7 ++- .../lustrefsx/lustre/include/lprocfs_status.h | 53 +++++++++++-------- .../lustrefsx/lustre/ldlm/ldlm_internal.h | 2 +- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 24 ++++----- .../lustrefsx/lustre/llite/lproc_llite.c | 6 +-- .../staging/lustrefsx/lustre/llite/vvp_dev.c | 11 ++-- .../lustrefsx/lustre/llite/vvp_internal.h | 2 +- .../lustrefsx/lustre/lmv/lmv_internal.h | 2 +- .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 11 ++-- .../lustrefsx/lustre/lov/lov_internal.h | 2 +- .../staging/lustrefsx/lustre/lov/lov_pool.c | 10 ++-- .../staging/lustrefsx/lustre/lov/lproc_lov.c | 11 ++-- .../lustre/obdclass/lprocfs_jobstats.c | 13 +++-- .../lustre/obdclass/lprocfs_status.c | 40 ++++++++------ .../lustrefsx/lustre/obdclass/lu_ref.c | 11 ++-- .../lustrefsx/lustre/obdclass/obd_config.c | 9 ++-- .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 11 ++-- 20 files changed, 126 insertions(+), 114 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c index ef61772f0dcb2..18ac0209737c7 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -458,7 +458,7 @@ LU_KEY_INIT_FINI(seq, struct seq_thread_info); /* context key: seq_thread_key */ LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); -extern const struct file_operations seq_fld_proc_seq_fops; +extern const struct proc_ops seq_fld_proc_seq_fops; static int seq_server_proc_init(struct lu_server_seq *seq) { diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c index 11ae1b6996532..e504ed7ff6ceb 100644 --- a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -496,12 +496,11 @@ static ssize_t fldb_seq_write(struct file *file, const char __user *buf, RETURN(rc < 0 ? 
rc : len); } -const struct file_operations seq_fld_proc_seq_fops = { - .owner = THIS_MODULE, - .open = fldb_seq_open, - .read = seq_read, - .write = fldb_seq_write, - .release = fldb_seq_release, +const struct proc_ops seq_fld_proc_seq_fops = { + .proc_open = fldb_seq_open, + .proc_read = seq_read, + .proc_write = fldb_seq_write, + .proc_release = fldb_seq_release, }; #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h index dcb24a3c2f22a..0be28746d6efc 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -179,7 +179,7 @@ void fld_server_mod_exit(void); int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len); #ifdef CONFIG_PROC_FS -extern const struct file_operations fld_proc_seq_fops; +extern const struct proc_ops fld_proc_seq_fops; extern struct lprocfs_vars fld_server_proc_list[]; #endif diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c index c7be5bf6ea97f..8fd39ef8160b0 100644 --- a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -356,10 +356,9 @@ static int fldb_seq_release(struct inode *inode, struct file *file) } const struct file_operations fld_proc_seq_fops = { - .owner = THIS_MODULE, - .open = fldb_seq_open, - .read = seq_read, - .release = fldb_seq_release, + .proc_open = fldb_seq_open, + .proc_read = seq_read, + .proc_release = fldb_seq_release, }; struct lprocfs_vars fld_server_proc_list[] = { diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index 646679d9aa45e..028d575e207d4 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -48,13 +48,25 @@ #include #include +/* + * Linux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 + * Now that proc and debugfs use separate operation vector types + * separate containers are also needed. + */ struct lprocfs_vars { + const char *name; + const struct proc_ops *fops; + void *data; + /** /proc file mode. */ + mode_t proc_mode; +}; + +/** Provide a debugfs container */ +struct ldebugfs_vars { const char *name; const struct file_operations *fops; void *data; - /** - * /proc file mode. - */ + /** debugfs file mode. 
*/ mode_t proc_mode; }; @@ -478,7 +490,7 @@ static inline int lprocfs_exp_cleanup(struct obd_export *exp) #endif extern struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, - void *data, const struct file_operations *fops); + void *data, const struct proc_ops *fops); extern struct proc_dir_entry * lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, const char *format, ...); @@ -495,14 +507,14 @@ extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, struct lprocfs_stats *stats); /* lprocfs_status.c */ -extern int ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *var, +extern int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, void *data); extern int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *var, void *data); extern struct dentry *ldebugfs_register(const char *name, struct dentry *parent, - struct lprocfs_vars *list, + struct ldebugfs_vars *list, void *data); extern struct proc_dir_entry * lprocfs_register(const char *name, struct proc_dir_entry *parent, @@ -537,7 +549,7 @@ static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) extern int lprocfs_obd_setup(struct obd_device *dev); extern int lprocfs_obd_cleanup(struct obd_device *obd); #ifdef HAVE_SERVER_SUPPORT -extern const struct file_operations lprocfs_evict_client_fops; +extern const struct proc_ops lprocfs_evict_client_fops; #endif extern int ldebugfs_seq_create(struct dentry *parent, const char *name, @@ -546,11 +558,11 @@ extern int ldebugfs_seq_create(struct dentry *parent, const char *name, void *data); extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, mode_t mode, - const struct file_operations *seq_fops, + const struct proc_ops *seq_fops, void *data); extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, mode_t mode, - const struct file_operations *seq_fops, + const struct proc_ops *seq_fops, void *data); /* Generic callbacks */ @@ -678,13 +690,12 @@ static int name##_single_open(struct inode *inode, struct file *file) \ return single_open(file, name##_seq_show, \ inode->i_private ? : PDE_DATA(inode)); \ } \ -static const struct file_operations name##_fops = { \ - .owner = THIS_MODULE, \ - .open = name##_single_open, \ - .read = seq_read, \ - .write = custom_seq_write, \ - .llseek = seq_lseek, \ - .release = lprocfs_single_release, \ +static const struct proc_ops name##_fops = { \ + .proc_open = name##_single_open, \ + .proc_read = seq_read, \ + .proc_write = custom_seq_write, \ + .proc_lseek = seq_lseek, \ + .proc_release = lprocfs_single_release, \ } #define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) @@ -724,10 +735,10 @@ static const struct file_operations name##_fops = { \ return single_open(file, NULL, \ inode->i_private ? 
: PDE_DATA(inode));\ } \ - static const struct file_operations name##_##type##_fops = { \ - .open = name##_##type##_open, \ - .write = name##_##type##_write, \ - .release = lprocfs_single_release, \ + static const struct proc_ops name##_##type##_fops = { \ + .proc_open = name##_##type##_open, \ + .proc_write = name##_##type##_write, \ + .proc_release = lprocfs_single_release, \ }; struct lustre_attr { @@ -866,7 +877,7 @@ static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } static inline struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, - void *data, const struct file_operations *fops) + void *data, const struct proc_ops *fops) {return 0; } static inline struct proc_dir_entry * lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h index 83cd89e5960fe..779dec55882e5 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -337,7 +337,7 @@ enum ldlm_policy_res { static inline void ldlm_add_var(struct lprocfs_vars *vars, struct proc_dir_entry *proc_dir, - const char *name, void *data, const struct file_operations *ops) + const char *name, void *data, const struct proc_ops *ops) { snprintf((char *)vars->name, MAX_STRING_SIZE, "%s", name); vars->data = data; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 8467a9b8abff4..2a55b2b242397 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -147,13 +147,12 @@ static int seq_watermark_open(struct inode *inode, struct file *file) return single_open(file, seq_watermark_show, PDE_DATA(inode)); } -static const struct file_operations ldlm_watermark_fops = { - .owner = THIS_MODULE, - .open = seq_watermark_open, - .read = seq_read, - .write = seq_watermark_write, - .llseek = seq_lseek, - .release = lprocfs_single_release, +static const struct proc_ops ldlm_watermark_fops = { + .proc_open = seq_watermark_open, + .proc_read = seq_read, + .proc_write = seq_watermark_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_single_release, }; static int seq_granted_show(struct seq_file *m, void *data) @@ -168,12 +167,11 @@ static int seq_granted_open(struct inode *inode, struct file *file) return single_open(file, seq_granted_show, PDE_DATA(inode)); } -static const struct file_operations ldlm_granted_fops = { - .owner = THIS_MODULE, - .open = seq_granted_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops ldlm_granted_fops = { + .proc_open = seq_granted_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c index 93c767207f3eb..3c3f5ddc4dfa4 100755 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -47,9 +47,9 @@ struct proc_dir_entry *proc_lustre_fs_root; #ifdef CONFIG_PROC_FS /* /proc/lustre/llite mount point registration */ -static const struct file_operations ll_rw_extents_stats_fops; -static const struct file_operations ll_rw_extents_stats_pp_fops; -static const struct file_operations ll_rw_offset_stats_fops; +static const struct 
proc_ops ll_rw_extents_stats_fops; +static const struct proc_ops ll_rw_extents_stats_pp_fops; +static const struct proc_ops ll_rw_offset_stats_fops; static __s64 ll_stats_pid_write(const char __user *buf, size_t len); static int ll_blksize_seq_show(struct seq_file *m, void *v) diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c index ab92d303fc1e9..15eb72a35245c 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -646,10 +646,9 @@ static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) return result; } -const struct file_operations vvp_dump_pgcache_file_ops = { - .owner = THIS_MODULE, - .open = vvp_dump_pgcache_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +const struct proc_ops vvp_dump_pgcache_file_ops = { + .proc_open = vvp_dump_pgcache_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h index 645b4b5cfca6b..9973d646ae703 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -328,6 +328,6 @@ struct lu_object *vvp_object_alloc(const struct lu_env *env, int vvp_global_init(void); void vvp_global_fini(void); -extern const struct file_operations vvp_dump_pgcache_file_ops; +extern const struct proc_ops vvp_dump_pgcache_file_ops; #endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h index a9dd6644a2697..2a0c324856fe7 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -156,6 +156,6 @@ struct lmv_tgt_desc #ifdef CONFIG_PROC_FS extern struct lprocfs_vars lprocfs_lmv_obd_vars[]; #endif -extern struct file_operations lmv_proc_target_fops; +extern struct proc_ops lmv_proc_target_fops; #endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c index cefa71d34a12d..e79781d444fb1 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -162,11 +162,10 @@ struct lprocfs_vars lprocfs_lmv_obd_vars[] = { { NULL } }; -struct file_operations lmv_proc_target_fops = { - .owner = THIS_MODULE, - .open = lmv_target_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +struct proc_ops lmv_proc_target_fops = { + .proc_open = lmv_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h index 4ced4d31f76b6..7ff0ffe81026e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -298,7 +298,7 @@ void lsm_free_plain(struct lov_stripe_md *lsm); void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); /* lproc_lov.c */ -extern struct file_operations lov_proc_target_fops; +extern struct proc_ops lov_proc_target_fops; #ifdef CONFIG_PROC_FS extern struct lprocfs_vars lprocfs_lov_obd_vars[]; #endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c index 
7a2b9ac32e92b..066b57df09482 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -286,11 +286,11 @@ static int pool_proc_open(struct inode *inode, struct file *file) return rc; } -static struct file_operations pool_proc_operations = { - .open = pool_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static struct proc_ops pool_proc_operations = { + .proc_open = pool_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c index c101c64b66c20..8610727f9b9e1 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -322,11 +322,10 @@ struct lprocfs_vars lprocfs_lov_obd_vars[] = { { NULL } }; -struct file_operations lov_proc_target_fops = { - .owner = THIS_MODULE, - .open = lov_target_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lprocfs_seq_release, +struct proc_ops lov_proc_target_fops = { + .proc_open = lov_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c index a341794dc4226..74351918dc5bd 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -564,13 +564,12 @@ static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file) return lprocfs_seq_release(inode, file); } -static const struct file_operations lprocfs_jobstats_seq_fops = { - .owner = THIS_MODULE, - .open = lprocfs_jobstats_seq_open, - .read = seq_read, - .write = lprocfs_jobstats_seq_write, - .llseek = seq_lseek, - .release = lprocfs_jobstats_seq_release, +static const struct proc_ops lprocfs_jobstats_seq_fops = { + .proc_open = lprocfs_jobstats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_jobstats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_jobstats_seq_release, }; int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index d2d72dcde41d5..07801fa617a9d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL(lprocfs_seq_release); struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, - void *data, const struct file_operations *fops) + void *data, const struct proc_ops *fops) { struct proc_dir_entry *proc; mode_t mode = 0; @@ -70,9 +70,9 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, if (root == NULL || name == NULL || fops == NULL) return ERR_PTR(-EINVAL); - if (fops->read) + if (fops->proc_read) mode = 0444; - if (fops->write) + if (fops->proc_write) mode |= 0200; proc = proc_create_data(name, mode, root, fops, data); if (!proc) { @@ -112,9 +112,9 @@ struct proc_dir_entry *lprocfs_add_symlink(const char *name, } EXPORT_SYMBOL(lprocfs_add_symlink); -static const struct file_operations lprocfs_generic_fops = { }; +static const struct file_operations ldebugfs_empty_ops = { }; -int ldebugfs_add_vars(struct dentry *parent, 
struct lprocfs_vars *list, +int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list, void *data) { if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) @@ -134,7 +134,7 @@ int ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *list, } entry = debugfs_create_file(list->name, mode, parent, list->data ? : data, - list->fops ? : &lprocfs_generic_fops); + list->fops ? : &ldebugfs_empty_ops); if (IS_ERR_OR_NULL(entry)) return entry ? PTR_ERR(entry) : -ENOMEM; list++; @@ -143,6 +143,8 @@ int ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *list, } EXPORT_SYMBOL_GPL(ldebugfs_add_vars); +static const struct proc_ops lprocfs_empty_ops = { }; + /** * Add /proc entries. * @@ -168,13 +170,13 @@ lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, if (list->proc_mode != 0000) { mode = list->proc_mode; } else if (list->fops) { - if (list->fops->read) + if (list->fops->proc_read) mode = 0444; - if (list->fops->write) + if (list->fops->proc_write) mode |= 0200; } proc = proc_create_data(list->name, mode, root, - list->fops ?: &lprocfs_generic_fops, + list->fops ?: &lprocfs_empty_ops, list->data ?: data); if (proc == NULL) return -ENOMEM; @@ -301,7 +303,7 @@ void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) EXPORT_SYMBOL(lprocfs_remove_proc_entry); struct dentry *ldebugfs_register(const char *name, struct dentry *parent, - struct lprocfs_vars *list, void *data) + struct ldebugfs_vars *list, void *data) { struct dentry *entry; @@ -1497,7 +1499,15 @@ static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) return 0; } -static const struct file_operations lprocfs_stats_seq_fops = { +static const struct proc_ops lprocfs_stats_seq_fops = { + .proc_open = lprocfs_stats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_stats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, +}; + +static const struct file_operations ldebugfs_stats_seq_fops = { .owner = THIS_MODULE, .open = lprocfs_stats_seq_open, .read = seq_read, @@ -1514,7 +1524,7 @@ int ldebugfs_register_stats(struct dentry *parent, const char *name, LASSERT(!IS_ERR_OR_NULL(parent)); entry = debugfs_create_file(name, 0644, parent, stats, - &lprocfs_stats_seq_fops); + &ldebugfs_stats_seq_fops); if (IS_ERR_OR_NULL(entry)) return entry ? PTR_ERR(entry) : -ENOMEM; @@ -2199,14 +2209,14 @@ EXPORT_SYMBOL_GPL(ldebugfs_seq_create); int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, mode_t mode, - const struct file_operations *seq_fops, + const struct proc_ops *seq_fops, void *data) { struct proc_dir_entry *entry; ENTRY; /* Disallow secretly (un)writable entries. 
*/ - LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + LASSERT((seq_fops->proc_write == NULL) == ((mode & 0222) == 0)); entry = proc_create_data(name, mode, parent, seq_fops, data); @@ -2220,7 +2230,7 @@ EXPORT_SYMBOL(lprocfs_seq_create); int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, mode_t mode, - const struct file_operations *seq_fops, + const struct proc_ops *seq_fops, void *data) { return (lprocfs_seq_create(dev->obd_proc_entry, name, diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c index bef29033f30ee..80d644f1092e0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -402,12 +402,11 @@ static int lu_ref_seq_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations lu_ref_dump_fops = { - .owner = THIS_MODULE, - .open = lu_ref_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lu_ref_seq_release +static struct proc_ops lu_ref_dump_fops = { + .proc_open = lu_ref_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lu_ref_seq_release }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 46b2b941bae57..8182d872a8cec 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -1304,7 +1304,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, RETURN(-EINVAL); } - /* fake a seq file so that var->fops->write can work... */ + /* fake a seq file so that var->fops->proc_write can work... */ fakefile.private_data = &fake_seqfile; fake_seqfile.private = data; /* e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt @@ -1338,12 +1338,13 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, matched++; rc = -EROFS; - if (var->fops && var->fops->write) { + if (var->fops && var->fops->proc_write) { mm_segment_t oldfs; oldfs = get_fs(); set_fs(KERNEL_DS); - rc = (var->fops->write)(&fakefile, sval, - vallen, NULL); + rc = (var->fops->proc_write)(&fakefile, + sval, vallen, + NULL); set_fs(oldfs); } break; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index 711c77650a569..e13b9e2e4c8f5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -1129,12 +1129,11 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, .data = svc }, { NULL } }; - static struct file_operations req_history_fops = { - .owner = THIS_MODULE, - .open = ptlrpc_lprocfs_svc_req_history_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lprocfs_seq_release, + static struct proc_ops req_history_fops = { + .proc_open = ptlrpc_lprocfs_svc_req_history_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, }; int rc; From 5c45a049d3935a311a1c7af42ee0eab807dbb74f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 14 Oct 2020 22:21:01 +0000 Subject: [PATCH 071/737] lustre: fix fiemap.h include It shouldn't be inside ifndef __KERNEL__ Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h index cee135bf3c74f..9ec06c7fb8049 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -43,8 +43,8 @@ #ifndef __KERNEL__ #include -#include #endif +#include /* XXX: We use fiemap_extent::fe_reserved[0] */ #define fe_device fe_reserved[0] From 7679540c304211cb348e62b7277fafcaccca03f4 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 14 Oct 2020 21:29:09 +0000 Subject: [PATCH 072/737] lustre: fixup for kernel_{get,set}sockopt removal The kernel_getsockopt and kernel_setsockopt functions were removed from upstream Linux. Replace the one kernel_getsockopt call with a direct call to the getsockopt socket op. There are a number of kernel_setsockopt calls, so copy kernel_setsockopt from 5.4, with a sockptr_t mod. 
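As a caller-side illustration only (this helper function and the SOL_TCP/TCP_NODELAY choice are not part of the patch, just a plausible example), existing call sites keep working unchanged because the compat wrapper added below preserves the old 5.4-style signature:

    /* Hypothetical caller, for illustration: kernel_setsockopt() here is
     * the compat wrapper added in lib-socket.c, not the removed upstream
     * helper. */
    static int example_disable_nagle(struct socket *sock)
    {
            int one = 1;

            return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
                                     (char *)&one, sizeof(one));
    }
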
Signed-off-by: Frank van der Linden --- .../lustrefsx/lnet/include/lnet/lib-lnet.h | 5 +++++ .../lustrefsx/lnet/klnds/socklnd/socklnd_lib.c | 4 ++-- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h index 59386c0fdee2f..15b89a9f85042 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -83,6 +83,11 @@ extern struct lnet the_lnet; /* THE network */ kernel_getsockname(sock, addr, addrlen) #endif +#ifndef HAVE_KERNEL_SETSOCKOPT +int kernel_setsockopt(struct socket *sock, int level, int optname, + char *optval, unsigned int optlen); +#endif + static inline int lnet_is_route_alive(struct lnet_route *route) { if (!route->lr_gateway->lpni_alive) diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c index 1215488b89d62..b72bf19541308 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -435,8 +435,8 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int rc = lnet_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { len = sizeof(*nagle); - rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)nagle, &len); + rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); } ksocknal_connsock_decref(conn); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index fa96105097736..f19a3a3d4b61c 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -43,6 +43,23 @@ #include #include +#ifndef HAVE_KERNEL_SETSOCKOPT +int kernel_setsockopt(struct socket *sock, int level, int optname, + char *val, unsigned int optlen) +{ + sockptr_t optval = KERNEL_SOCKPTR(val); + int err; + + if (level == SOL_SOCKET) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else + err = sock->ops->setsockopt(sock, level, optname, optval, + optlen); + return err; +} +EXPORT_SYMBOL(kernel_setsockopt); +#endif + static int lnet_sock_create_kern(struct socket **sock, struct net *ns) { From 94b3316c101ecafcd03254df73367fc7295cbe0d Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 16 Oct 2020 15:29:59 +0000 Subject: [PATCH 073/737] lustre: mmap_sem -> mmap_lock Commit da1c55f1b272 ("mmap locking API: rename mmap_sem to mmap_lock") changed the name of mmap_sem, so follow suit. Signed-off-by: Frank van der Linden --- .../staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c | 8 ++++---- drivers/staging/lustrefsx/lustre/llite/llite_mmap.c | 4 ++-- drivers/staging/lustrefsx/lustre/llite/vvp_io.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c index 38ca4bc97be98..36a4fdef2dc24 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -137,11 +137,11 @@ static int cfs_access_process_vm(struct task_struct *tsk, struct page *page; void *old_buf = buf; - /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(), - * which is already holding mmap_sem for writes. 
If some other + /* Avoid deadlocks on mmap_lock if called from sys_mmap_pgoff(), + * which is already holding mmap_lock for writes. If some other * thread gets the write lock in the meantime, this thread will * block, but at least it won't deadlock on itself. LU-1735 */ - if (down_read_trylock(&mm->mmap_sem) == 0) + if (down_read_trylock(&mm->mmap_lock) == 0) return -EDEADLK; /* ignore errors, just check how much was successfully transferred */ @@ -181,7 +181,7 @@ static int cfs_access_process_vm(struct task_struct *tsk, buf += bytes; addr += bytes; } - up_read(&mm->mmap_sem); + up_read(&mm->mmap_lock); return buf - old_buf; } diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 839b710764554..30cf21b778811 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -57,8 +57,8 @@ struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma, *ret = NULL; ENTRY; - /* mmap_sem must have been held by caller. */ - LASSERT(!down_write_trylock(&mm->mmap_sem)); + /* mmap_lock must have been held by caller. */ + LASSERT(!down_write_trylock(&mm->mmap_lock)); for(vma = find_vma(mm, addr); vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index 404cee02a4692..a71b7b60d90f5 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -458,7 +458,7 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) count += addr & ~PAGE_MASK; addr &= PAGE_MASK; - down_read(&mm->mmap_sem); + down_read(&mm->mmap_lock); while((vma = our_vma(mm, addr, count)) != NULL) { struct dentry *de = file_dentry(vma->vm_file); struct inode *inode = de->d_inode; @@ -500,7 +500,7 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) count -= vma->vm_end - addr; addr = vma->vm_end; } - up_read(&mm->mmap_sem); + up_read(&mm->mmap_lock); if (result < 0) break; } From 64cff60b595148dc4158a4c939363fd4711e42f7 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 16 Oct 2020 15:51:29 +0000 Subject: [PATCH 074/737] lustre: remove the pgprot argument to __vmalloc Commit 88dca4ca5a93 ("mm: remove the pgprot argument to __vmalloc") removed the pgprot argument to __vmalloc, since it's always PAGE_KERNEL now. Adapt the call in the lustre code to do the same. Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/include/obd_support.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h index 59f3ab530c9ed..f8abcb83f0301 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_support.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -762,8 +762,7 @@ do { \ #define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ do { \ (ptr) = cptab == NULL ? 
\ - __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, \ - PAGE_KERNEL) : \ + __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ cfs_cpt_vzalloc(cptab, cpt, size); \ if (unlikely((ptr) == NULL)) { \ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ From f21227e1c56d28ce8084d88d4f489f5b082ae117 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 16 Oct 2020 16:08:39 +0000 Subject: [PATCH 075/737] lustre: use uaccess_kernel instead of segment_eq Commit 428e2976a5bf ("uaccess: remove segment_eq") removed the segment_eq macro, replacing it with uaccess_kernel for the cases where it was still used. Those cases match the use case in the lustre code, so convert it in the same manner. Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c index 62e03bde77221..a703b0e5b4562 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -37,9 +37,8 @@ /* Debugging check only needed during development */ #ifdef OBD_CTXT_DEBUG # define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) -# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), KERNEL_DS),\ - msg) -# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), KERNEL_DS), msg) +# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!uaccess_kernel(), msg) +# define ASSERT_KERNEL_CTXT(msg) LASSERTF(uaccess_kernel(), msg) #else # define ASSERT_CTXT_MAGIC(magic) do {} while(0) # define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) From 19724b84a91575cc854dbfd4f57dc329b5a5c17b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 2 Dec 2020 21:51:18 +0000 Subject: [PATCH 076/737] Disable HAVE_LINUX_SELINUX_IS_ENABLED, as it's gone for recent kernels. Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 99c60fb260a7e..0d000a1f58fb3 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -579,7 +579,7 @@ #define HAVE_LINUX_RANDOM_H 1 /* if linux/selinux.h exists */ -#define HAVE_LINUX_SELINUX_IS_ENABLED 1 +#undef HAVE_LINUX_SELINUX_IS_ENABLED /* Define to 1 if you have the header file. */ #define HAVE_LINUX_TYPES_H 1 From 518d90cbd45a8608aa65cad8eef8adcb01d4c52b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 22 Dec 2020 17:26:17 +0000 Subject: [PATCH 077/737] lustre: disable compiling sec_ctx.c The exported functions (push_ctxt, pop_ctxt) in this file are not used anywhere. 
Signed-off-by: Frank van der Linden --- drivers/staging/lustrefsx/lustre/ptlrpc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile index f192313597822..bf464b8d7b53b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile @@ -15,7 +15,7 @@ ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o -ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_crr.o nrs_orr.o ptlrpc_objs += nrs_tbf.o nrs_delay.o errno.o From 46d12a2fecd2e2f63d9e9da5cd9c652aa2da7cab Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 22 Dec 2020 17:56:32 +0000 Subject: [PATCH 078/737] lustre: remove get/set_fs from ptask code Setting USER_DS is hidden inside kthread_[un]use_mm now, and get/set_fs no longer exist. So, don't do it explicitly anymore. Signed-off-by: Frank van der Linden --- .../lustrefsx/libcfs/include/libcfs/libcfs_ptask.h | 1 - drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h index 85925492dd5df..ca40551dfc678 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h @@ -46,7 +46,6 @@ typedef int (*cfs_ptask_cb_t)(struct cfs_ptask *); struct cfs_ptask { struct padata_priv pt_padata; struct completion pt_completion; - mm_segment_t pt_fs; struct mm_struct *pt_mm; unsigned int pt_flags; int pt_cbcpu; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c index 275c01b74ad4e..9786288cbad50 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c @@ -87,7 +87,6 @@ static void cfs_ptask_complete(struct padata_priv *padata) static void cfs_ptask_execute(struct padata_priv *padata) { struct cfs_ptask *ptask = cfs_padata2ptask(padata); - mm_segment_t old_fs = get_fs(); bool bh_enabled = false; if (!cfs_ptask_is_atomic(ptask)) { @@ -96,8 +95,7 @@ static void cfs_ptask_execute(struct padata_priv *padata) } if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - use_mm(ptask->pt_mm); - set_fs(ptask->pt_fs); + kthread_use_mm(ptask->pt_mm); } if (ptask->pt_cbfunc != NULL) @@ -106,8 +104,7 @@ static void cfs_ptask_execute(struct padata_priv *padata) ptask->pt_result = -ENOSYS; if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - set_fs(old_fs); - unuse_mm(ptask->pt_mm); + kthread_unuse_mm(ptask->pt_mm); mmput(ptask->pt_mm); ptask->pt_mm = NULL; } @@ -132,7 +129,6 @@ static int cfs_do_parallel(struct cfs_ptask_engine *engine, if (cfs_ptask_use_user_mm(ptask)) { ptask->pt_mm = get_task_mm(current); - ptask->pt_fs = get_fs(); } ptask->pt_result = -EINPROGRESS; @@ -230,7 +226,6 @@ int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, void *cbdata, ptask->pt_flags = flags; ptask->pt_cbcpu = cpu; ptask->pt_mm = NULL; /* will be 
set in cfs_do_parallel() */ - ptask->pt_fs = get_fs(); ptask->pt_cbfunc = cbfunc; ptask->pt_cbdata = cbdata; ptask->pt_result = -EAGAIN; From 20ce88e25bf5baad85b718b4909d36cba0cf5093 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 22 Dec 2020 23:43:56 +0000 Subject: [PATCH 079/737] lustre: get network interface configs directly The Lustre code uses in-kernel ioctl calls to retrieve network interface names/masks/addresses. Since these calls expect to be used from userspace, the code uses set_fs(KERNEL_DS) to trick copy_{from,to}_user to copy to/from kernel space. This no longer works since Linux 5.10. So, using the knowledge that Lustre only supports IPv4, open code these calls by looking up the netdev and walking the IPv4 interfaces directly. Signed-off-by: Frank van der Linden --- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 316 +++++++----------- 1 file changed, 117 insertions(+), 199 deletions(-) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index f19a3a3d4b61c..b01cdd55193aa 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -43,6 +43,8 @@ #include #include +#include + #ifndef HAVE_KERNEL_SETSOCKOPT int kernel_setsockopt(struct socket *sock, int level, int optname, char *val, unsigned int optlen) @@ -74,240 +76,156 @@ lnet_sock_create_kern(struct socket **sock, struct net *ns) return rc; } -static int -kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) -{ - mm_segment_t oldfs = get_fs(); - int err; - - set_fs(KERNEL_DS); - err = filp->f_op->unlocked_ioctl(filp, cmd, arg); - set_fs(oldfs); - - return err; -} - -static int -lnet_sock_ioctl(int cmd, unsigned long arg, struct net *ns) -{ - struct file *sock_filp; - struct socket *sock; - int fd = -1; - int rc; - - rc = lnet_sock_create_kern(&sock, ns); - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - -#if !defined(HAVE_SOCK_ALLOC_FILE) && !defined(HAVE_SOCK_ALLOC_FILE_3ARGS) - fd = sock_map_fd(sock, 0); - if (fd < 0) { - rc = fd; - sock_release(sock); - goto out; - } - sock_filp = fget(fd); -#else -# ifdef HAVE_SOCK_ALLOC_FILE_3ARGS - sock_filp = sock_alloc_file(sock, 0, NULL); -# else - sock_filp = sock_alloc_file(sock, 0); -# endif -#endif - if (IS_ERR(sock_filp)) { - rc = PTR_ERR(sock_filp); - sock_release(sock); - goto out; - } - - rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg); - - fput(sock_filp); -out: - if (fd >= 0) -#ifdef HAVE_KSYS_CLOSE - ksys_close(fd); -#else - sys_close(fd); -#endif - return rc; -} - int lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask, struct net *ns) { - struct ifreq ifr; - int nob; - int rc; - __u32 val; - - nob = strnlen(name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - CERROR("Interface name %s too long\n", name); - return -EINVAL; - } - - CLASSERT(sizeof(ifr.ifr_name) >= IFNAMSIZ); - - if (strlen(name) > sizeof(ifr.ifr_name)-1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr, ns); - if (rc != 0) { - CERROR("Can't get flags for interface %s\n", name); - return rc; + struct net_device *dev; + struct in_device *in_dev; + const struct in_ifaddr *ifa; + unsigned int flags; + char *colon, *ifname; + int ret; + size_t slen; + + /* + * Copy the interface name, since we may be about to modify it. 
+ */ + slen = strlen(name) + 1; + ifname = kzalloc(slen, GFP_KERNEL); + if (ifname == NULL) + return -ENOMEM; + + memcpy(ifname, name, slen); + colon = strchr(ifname, ':'); + if (colon) + *colon = 0; + + dev_load(ns, ifname); + ret = -ENODEV; + + rtnl_lock(); + + dev = __dev_get_by_name(ns, ifname); + + if (colon) + *colon = ':'; + + if (dev == NULL) { + CERROR("Can't find interface %s\n", name); + goto out; } - if ((ifr.ifr_flags & IFF_UP) == 0) { + flags = dev_get_flags(dev); + if ((flags & IFF_UP) == 0) { CDEBUG(D_NET, "Interface %s down\n", name); *up = 0; *ip = *mask = 0; - return 0; + ret = 0; + goto out; } - *up = 1; - - if (strlen(name) > sizeof(ifr.ifr_name)-1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr, ns); + /* + * Only support IPv4, so just walk the list of IPv4 assigned + * addresses to a device. + */ + in_dev = __in_dev_get_rtnl(dev); - if (rc != 0) { - CERROR("Can't get IP address for interface %s\n", name); - return rc; + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (!strcmp(ifa->ifa_label, ifname)) + break; } - val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; - *ip = ntohl(val); - - if (strlen(name) > sizeof(ifr.ifr_name)-1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr, ns); - if (rc != 0) { - CERROR("Can't get netmask for interface %s\n", name); - return rc; + if (ifa != NULL) { + *up = 1; + *mask = ntohl(ifa->ifa_mask); + *ip = ntohl(ifa->ifa_local); + ret = 0; + } else { + CERROR("Can't get mask/ip for interface %s\n", name); } - val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; - *mask = ntohl(val); - - return 0; +out: + rtnl_unlock(); + kfree(ifname); + return ret; } EXPORT_SYMBOL(lnet_ipif_query); void lnet_ipif_free_enumeration(char **names, int n) { - int i; - - LASSERT(n > 0); - - for (i = 0; i < n && names[i] != NULL; i++) - LIBCFS_FREE(names[i], IFNAMSIZ); - - LIBCFS_FREE(names, n * sizeof(*names)); + LIBCFS_FREE(names, PAGE_SIZE / IFNAMSIZ); + LIBCFS_FREE(names[0], PAGE_SIZE); } EXPORT_SYMBOL(lnet_ipif_free_enumeration); int lnet_ipif_enumerate(char ***namesp, struct net *ns) { - /* Allocate and fill in 'names', returning # interfaces/error */ - char **names; - int toobig; - int nalloc; - int nfound; - struct ifreq *ifr; - struct ifconf ifc; - int rc; - int nob; - int i; - - nalloc = 16; /* first guess at max interfaces */ + char **names; + char *space; + const struct in_ifaddr *ifa; + struct net_device *dev; + struct in_device *in_dev; + int maxifs, nifs, toobig; + size_t used, slen; + + maxifs = PAGE_SIZE / IFNAMSIZ; + nifs = 0; + used = 0; toobig = 0; - for (;;) { - if (nalloc * sizeof(*ifr) > PAGE_SIZE) { - toobig = 1; - nalloc = PAGE_SIZE / sizeof(*ifr); - CWARN("Too many interfaces: only enumerating " - "first %d\n", nalloc); - } - - LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); - if (ifr == NULL) { - CERROR("ENOMEM enumerating up to %d interfaces\n", - nalloc); - rc = -ENOMEM; - goto out0; - } - - ifc.ifc_buf = (char *)ifr; - ifc.ifc_len = nalloc * sizeof(*ifr); - - rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc, ns); - if (rc < 0) { - CERROR("Error %d enumerating interfaces\n", rc); - goto out1; - } - - LASSERT(rc == 0); - - nfound = ifc.ifc_len/sizeof(*ifr); - LASSERT(nfound <= nalloc); - - if (nfound < nalloc || toobig) - break; - LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); - nalloc *= 
2; + /* + * For simplicity, just allocate the maximum number of names + * that can be dealt with. The free function will ignore the + * arg + */ + LIBCFS_ALLOC(names, maxifs * sizeof (*names)); + if (names == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(space, PAGE_SIZE); + if (space == NULL) { + LIBCFS_FREE(names, maxifs * sizeof (*names)); + return -ENOMEM; } - if (nfound == 0) - goto out1; - - LIBCFS_ALLOC(names, nfound * sizeof(*names)); - if (names == NULL) { - rc = -ENOMEM; - goto out1; - } - - for (i = 0; i < nfound; i++) { - nob = strnlen(ifr[i].ifr_name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - /* no space for terminating NULL */ - CERROR("interface name %.*s too long (%d max)\n", - nob, ifr[i].ifr_name, IFNAMSIZ); - rc = -ENAMETOOLONG; - goto out2; + /* + * Only IPv4 is supported, so just loop all network + * devices, and loop the IPv4 interfaces (addresses) + * assigned to each device. + */ + rtnl_lock(); + for_each_netdev(ns, dev) { + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + nifs++; + if (toobig) + continue; + + if (nifs > maxifs) { + toobig = 1; + continue; + } + + slen = strlen(ifa->ifa_label) + 1; + if (used + slen > PAGE_SIZE) { + toobig = 1; + continue; + } + memcpy(space + used, ifa->ifa_label, slen); + names[nifs - 1] = space + used; + used += slen; } - - LIBCFS_ALLOC(names[i], IFNAMSIZ); - if (names[i] == NULL) { - rc = -ENOMEM; - goto out2; - } - - memcpy(names[i], ifr[i].ifr_name, nob); - names[i][nob] = 0; } + rtnl_unlock(); *namesp = names; - rc = nfound; - - out2: - if (rc < 0) - lnet_ipif_free_enumeration(names, nfound); - out1: - LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); - out0: - return rc; + + return nifs; } EXPORT_SYMBOL(lnet_ipif_enumerate); From c090654e320c05d44827f60bd0aba17c55be6bb2 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 23 Dec 2020 22:21:43 +0000 Subject: [PATCH 080/737] lustre: lprocfs: work around set_fs removal The lustre procfs code has some unfortunate code that internally calls its procfs hooks to extract data (class_process_proc_param). This function uses set/get_fs to divert copy_from_user from user to kernel space, as called from the procfs proc_write entries. However, set/get_fs are no more, as they are ugly and hackish. Use a horrible hack to make this work again: since class_process_proc_param() uses a fake, local file structure to pass to the lustre procfs proc_write entry point. The only thing the code cares about in that structure is the private_data field, since that's where it extracts the seq_file struct from. So, abuse one of the other fields, f_op. Set it to a dummy file_operations structure, and have the lustre procfs proc_write entry points recognize this as meaning that it's dealing with a kernel space buffer, not user space. Pass a file pointer down to where it's needed, and code a private version of copy_from_user, lprocfs_copy_from_user, which takes a file structure so that it can figure out if it's dealing with kernel or user space. Yes, this is awful, but it's the easiest that can be done for Lustre 2.10. For Lustre 2.13, this particular set_fs use has been removed, but the code churn is too big to backport that, as lustre procfs was moved to sysfs. So, we're stuck with this for now. 
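In outline, the marker and the kernel-aware copy helper work like this (a condensed sketch of the code added below in lprocfs_status.c, not a separate implementation):

    /* Condensed sketch; see the real hunks below for the full version. */
    static const struct file_operations lprocfs_kernel_dummy = { };

    void lprocfs_file_set_kernel(struct file *file)
    {
            file->f_op = &lprocfs_kernel_dummy;     /* tag the fake file */
    }

    bool lprocfs_file_is_kernel(struct file *file)
    {
            return file->f_op == &lprocfs_kernel_dummy;
    }

    unsigned long lprocfs_copy_from_user(struct file *file, void *to,
                                         const void __user *from,
                                         unsigned long n)
    {
            if (lprocfs_file_is_kernel(file)) {
                    memcpy(to, from, n);            /* buffer is kernel memory */
                    return 0;
            }
            return copy_from_user(to, from, n);     /* genuine user buffer */
    }
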
Signed-off-by: Frank van der Linden --- .../staging/lustrefsx/lustre/fid/lproc_fid.c | 14 +-- .../staging/lustrefsx/lustre/fld/lproc_fld.c | 2 +- .../lustrefsx/lustre/include/lprocfs_status.h | 26 ++++-- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 2 +- .../lustrefsx/lustre/llite/lproc_llite.c | 72 +++++++++------- .../staging/lustrefsx/lustre/lov/lproc_lov.c | 8 +- .../staging/lustrefsx/lustre/mdc/lproc_mdc.c | 6 +- .../lustre/obdclass/lprocfs_jobstats.c | 4 +- .../lustre/obdclass/lprocfs_status.c | 85 +++++++++++++++---- .../lustre/obdclass/lprocfs_status_server.c | 10 +-- .../lustrefsx/lustre/obdclass/obd_config.c | 7 +- .../staging/lustrefsx/lustre/osc/lproc_osc.c | 28 +++--- .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 14 +-- .../staging/lustrefsx/lustre/ptlrpc/nrs_crr.c | 2 +- .../lustrefsx/lustre/ptlrpc/nrs_delay.c | 10 +-- .../staging/lustrefsx/lustre/ptlrpc/nrs_orr.c | 6 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c | 2 +- 17 files changed, 187 insertions(+), 111 deletions(-) diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c index e504ed7ff6ceb..6e5df75b37c9d 100644 --- a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -60,8 +60,8 @@ * safe for production use. */ static int -lprocfs_fid_write_common(const char __user *buffer, size_t count, - struct lu_seq_range *range) +lprocfs_fid_write_common(struct file *file, const char __user *buffer, + size_t count, struct lu_seq_range *range) { struct lu_seq_range tmp = { .lsr_start = 0, @@ -74,7 +74,7 @@ lprocfs_fid_write_common(const char __user *buffer, size_t count, if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; @@ -110,7 +110,7 @@ lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, LASSERT(seq != NULL); mutex_lock(&seq->lss_mutex); - rc = lprocfs_fid_write_common(buffer, count, &seq->lss_space); + rc = lprocfs_fid_write_common(file, buffer, count, &seq->lss_space); if (rc == 0) { CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", seq->lss_name, PRANGE(&seq->lss_space)); @@ -171,7 +171,7 @@ lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, mutex_lock(&seq->lss_mutex); - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) { CERROR("%s: invalid FID sequence width: rc = %d\n", seq->lss_name, rc); @@ -517,7 +517,7 @@ lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, LASSERT(seq != NULL); mutex_lock(&seq->lcs_mutex); - rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space); + rc = lprocfs_fid_write_common(file, buffer, count, &seq->lcs_space); if (rc == 0) { CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", seq->lcs_name, PRANGE(&seq->lcs_space)); @@ -558,7 +558,7 @@ lprocfs_client_fid_width_seq_write(struct file *file, const char __user *buffer, mutex_lock(&seq->lcs_mutex); - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) { GOTO(out_unlock, count = rc); } diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c index 8fd39ef8160b0..269d8d3976065 100644 --- a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -91,7 +91,7 @@ fld_proc_hash_seq_write(struct file *file, const char 
__user *buffer, if (count > sizeof(fh_name)) return -ENAMETOOLONG; - if (copy_from_user(fh_name, buffer, count) != 0) + if (lprocfs_copy_from_user(file, fh_name, buffer, count) != 0) return -EFAULT; fld = ((struct seq_file *)file->private_data)->private; diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index 028d575e207d4..6a58c7129b033 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -621,9 +621,10 @@ extern int lprocfs_filesfree_seq_show(struct seq_file *m, void *data); extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mult); -extern int lprocfs_str_to_s64(const char __user *buffer, unsigned long count, - __s64 *val); -extern int lprocfs_str_with_units_to_s64(const char __user *buffer, +extern int lprocfs_str_to_s64(struct file *, const char __user *buffer, + unsigned long count, __s64 *val); +extern int lprocfs_str_with_units_to_s64(struct file *, + const char __user *buffer, unsigned long count, __s64 *val, char defunit); @@ -749,6 +750,19 @@ struct lustre_attr { const char *buf, size_t len); }; +/* + * Hacks to get around set_fs removal. + */ +void lprocfs_file_set_kernel(struct file *file); +bool lprocfs_file_is_kernel(struct file *file); + +/* + * Version of copy_from_user() that uses the above hacks to determine + * whether it's dealing with user or kernel space. + */ +unsigned long lprocfs_copy_from_user(struct file *file, void *to, + const void __user *from, unsigned long n); + #define LUSTRE_ATTR(name, mode, show, store) \ static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) @@ -795,9 +809,11 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, size_t count, loff_t *off); struct root_squash_info; -int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, +int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, char *name); -int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, +int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, char *name); #else /* !CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 2a55b2b242397..6b8734adeb851 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -97,7 +97,7 @@ static ssize_t seq_watermark_write(struct file *file, bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; int rc; - rc = lprocfs_str_with_units_to_s64(buffer, count, &value, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, &value, 'M'); if (rc) { CERROR("Failed to set %s, rc = %d.\n", wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb", diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c index 3c3f5ddc4dfa4..ee696ef0a4c79 100755 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -50,7 +50,8 @@ struct proc_dir_entry *proc_lustre_fs_root; static const struct proc_ops ll_rw_extents_stats_fops; static const struct proc_ops ll_rw_extents_stats_pp_fops; static const struct proc_ops ll_rw_offset_stats_fops; -static __s64 ll_stats_pid_write(const char __user *buf, size_t len); +static __s64 ll_stats_pid_write(struct file *file, + const char __user *buf, size_t len); static int ll_blksize_seq_show(struct seq_file *m, void *v) { @@ -86,7 +87,7 @@ static ssize_t ll_stat_blksize_seq_write(struct file *file, __s64 val; int rc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -248,7 +249,7 @@ static ssize_t ll_xattr_cache_seq_write(struct file *file, __s64 val; int rc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -302,7 +303,8 @@ ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, __s64 pages_number; int rc; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; @@ -350,7 +352,8 @@ ll_max_readahead_per_file_mb_seq_write(struct file *file, int rc; __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; @@ -397,7 +400,8 @@ ll_max_read_ahead_whole_mb_seq_write(struct file *file, int rc; __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; @@ -467,13 +471,14 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) RETURN(rc); @@ -577,7 +582,7 @@ static ssize_t ll_checksum_seq_write(struct file *file, /* Not set up yet */ return -EAGAIN; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val) @@ -609,14 +614,15 @@ static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) return 0; } -static int ll_wr_track_id(const char __user *buffer, unsigned long count, +static int ll_wr_track_id(struct file *file, + const char __user *buffer, unsigned long count, void *data, enum stats_track_type type) { struct super_block *sb = data; int rc; __s64 pid; - rc = lprocfs_str_to_s64(buffer, count, &pid); + rc = lprocfs_str_to_s64(file, buffer, count, &pid); if (rc) return rc; if (pid > INT_MAX || pid < 0) @@ -641,7 +647,8 @@ static ssize_t ll_track_pid_seq_write(struct file *file, size_t count, loff_t *off) { struct seq_file *seq = file->private_data; - return 
ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PID); + return ll_wr_track_id(file, buffer, count, seq->private, + STATS_TRACK_PID); } LPROC_SEQ_FOPS(ll_track_pid); @@ -655,7 +662,8 @@ static ssize_t ll_track_ppid_seq_write(struct file *file, size_t count, loff_t *off) { struct seq_file *seq = file->private_data; - return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PPID); + return ll_wr_track_id(file, buffer, count, seq->private, + STATS_TRACK_PPID); } LPROC_SEQ_FOPS(ll_track_ppid); @@ -669,7 +677,8 @@ static ssize_t ll_track_gid_seq_write(struct file *file, size_t count, loff_t *off) { struct seq_file *seq = file->private_data; - return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_GID); + return ll_wr_track_id(file, buffer, count, seq->private, + STATS_TRACK_GID); } LPROC_SEQ_FOPS(ll_track_gid); @@ -692,7 +701,7 @@ static ssize_t ll_statahead_running_max_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -726,7 +735,7 @@ static ssize_t ll_statahead_max_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -760,7 +769,7 @@ static ssize_t ll_statahead_agl_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -807,7 +816,7 @@ static ssize_t ll_lazystatfs_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -891,7 +900,7 @@ static ssize_t ll_default_easize_seq_write(struct file *file, if (count == 0) return 0; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > INT_MAX) @@ -948,7 +957,7 @@ ll_fast_read_seq_write(struct file *file, const char __user *buffer, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -981,7 +990,7 @@ static ssize_t ll_pio_seq_write(struct file *file, const char __user *buffer, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -1029,13 +1038,13 @@ static ssize_t ll_unstable_stats_seq_write(struct file *file, if (count >= sizeof(kernbuf)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - kernbuf; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -1067,7 +1076,7 @@ static ssize_t ll_root_squash_seq_write(struct file *file, struct ll_sb_info *sbi = ll_s2sbi(sb); struct root_squash_info *squash = &sbi->ll_squash; - return lprocfs_wr_root_squash(buffer, count, squash, + return lprocfs_wr_root_squash(file, buffer, count, squash, ll_get_fsname(sb, NULL, 0)); } LPROC_SEQ_FOPS(ll_root_squash); @@ -1103,7 +1112,7 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, struct root_squash_info *squash = &sbi->ll_squash; int rc; - rc = lprocfs_wr_nosquash_nids(buffer, count, squash, + rc = lprocfs_wr_nosquash_nids(file, buffer, count, squash, 
ll_get_fsname(sb, NULL, 0)); if (rc < 0) return rc; @@ -1525,7 +1534,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(buf, len); + value = ll_stats_pid_write(file, buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1583,7 +1592,7 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(buf, len); + value = ll_stats_pid_write(file, buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1772,7 +1781,7 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(buf, len); + value = ll_stats_pid_write(file, buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1804,17 +1813,18 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, * equivalent of a number is written, that number is returned. Otherwise, * 1 is returned. Non-zero return values indicate collection should be enabled. */ -static __s64 ll_stats_pid_write(const char __user *buf, size_t len) +static __s64 ll_stats_pid_write(struct file *file, const char __user *buf, + size_t len) { __s64 value = 1; int rc; char kernbuf[16]; - rc = lprocfs_str_to_s64(buf, len, &value); + rc = lprocfs_str_to_s64(file, buf, len, &value); if (rc < 0 && len < sizeof(kernbuf)) { - if (copy_from_user(kernbuf, buf, len)) + if (lprocfs_copy_from_user(file, kernbuf, buf, len)) return -EFAULT; kernbuf[len] = 0; diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c index 8610727f9b9e1..e8b9b88302055 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -62,7 +62,7 @@ static ssize_t lov_stripesize_seq_write(struct file *file, LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0) @@ -97,7 +97,7 @@ static ssize_t lov_stripeoffset_seq_write(struct file *file, LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < -1) @@ -131,7 +131,7 @@ static ssize_t lov_stripetype_seq_write(struct file *file, LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < INT_MIN || val > INT_MAX) @@ -169,7 +169,7 @@ static ssize_t lov_stripecount_seq_write(struct file *file, LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < -1) diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c index 2ede98f67846d..57cd679138950 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -58,7 +58,7 @@ static ssize_t mdc_active_seq_write(struct file *file, __s64 val; dev = ((struct seq_file *)file->private_data)->private; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > 1) @@ -95,7 +95,7 @@ static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, int rc; dev = ((struct seq_file *)file->private_data)->private; - rc = 
lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -130,7 +130,7 @@ static ssize_t mdc_max_mod_rpcs_in_flight_seq_write(struct file *file, __s64 val; int rc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c index 74351918dc5bd..a78e4e7f6b316 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -515,7 +515,7 @@ static ssize_t lprocfs_jobstats_seq_write(struct file *file, if (stats->ojs_hash == NULL) return -ENODEV; - if (copy_from_user(jobid, buf, len)) + if (lprocfs_copy_from_user(file, jobid, buf, len)) return -EFAULT; jobid[len] = 0; @@ -655,7 +655,7 @@ lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, stats = &obd->u.obt.obt_jobstats; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > UINT_MAX) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index 07801fa617a9d..0fcf859bdbb05 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -48,6 +48,43 @@ MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs #define MAX_STRING_SIZE 128 +static const struct file_operations lprocfs_kernel_dummy = {}; + +/* + * Awful hacks to mark procfs seq writes as going to kernel space. Used + * to be done with set_fs(KERNEL_DS), but that function is no more. + * This should only be called from class_process_proc_param(), which passes + * in a fake file structure. It should never, ever be used for anything else. 
+ */ +void lprocfs_file_set_kernel(struct file *file) +{ + LASSERT(file->f_op == NULL); + file->f_op = &lprocfs_kernel_dummy; +} +EXPORT_SYMBOL(lprocfs_file_set_kernel); + +bool lprocfs_file_is_kernel(struct file *file) +{ + return (file->f_op == &lprocfs_kernel_dummy); +} +EXPORT_SYMBOL(lprocfs_file_is_kernel); + +unsigned long +lprocfs_copy_from_user(struct file *file, void *to, + const void __user *from, unsigned long n) +{ + unsigned long res; + + if (lprocfs_file_is_kernel(file)) { + memcpy(to, from, n); + res = 0; + } else + res = copy_from_user(to, from, n); + + return res; +} +EXPORT_SYMBOL(lprocfs_copy_from_user); + int lprocfs_single_release(struct inode *inode, struct file *file) { return single_release(inode, file); @@ -370,7 +407,7 @@ int lprocfs_wr_uint(struct file *file, const char __user *buffer, if (count == 0) return 0; - if (copy_from_user(dummy, buffer, count)) + if (lprocfs_copy_from_user(file, dummy, buffer, count)) return -EFAULT; dummy[count] = 0; @@ -391,7 +428,7 @@ ssize_t lprocfs_uint_seq_write(struct file *file, const char __user *buffer, int rc; __s64 val = 0; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -424,7 +461,7 @@ lprocfs_atomic_seq_write(struct file *file, const char __user *buffer, __s64 val = 0; int rc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -2051,7 +2088,8 @@ static int str_to_u64_parse(char *buffer, unsigned long count, * of the signed integer. */ static int str_to_s64_internal(const char __user *buffer, unsigned long count, - __s64 *val, __u64 def_mult, bool allow_units) + __s64 *val, __u64 def_mult, bool allow_units, + bool kernel_space) { char kernbuf[22]; __u64 tmp; @@ -2063,8 +2101,12 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; + if (kernel_space) { + memcpy(kernbuf, buffer, count); + } else { + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + } kernbuf[count] = '\0'; @@ -2103,10 +2145,13 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, * \retval 0 on success * \retval negative number on error */ -int lprocfs_str_to_s64(const char __user *buffer, unsigned long count, - __s64 *val) +int lprocfs_str_to_s64(struct file *file, const char __user *buffer, + unsigned long count, __s64 *val) { - return str_to_s64_internal(buffer, count, val, 1, false); + bool kernel_space; + + kernel_space = lprocfs_file_is_kernel(file); + return str_to_s64_internal(buffer, count, val, 1, false, kernel_space); } EXPORT_SYMBOL(lprocfs_str_to_s64); @@ -2127,11 +2172,12 @@ EXPORT_SYMBOL(lprocfs_str_to_s64); * \retval 0 on success * \retval negative number on error */ -int lprocfs_str_with_units_to_s64(const char __user *buffer, +int lprocfs_str_with_units_to_s64(struct file *file, const char __user *buffer, unsigned long count, __s64 *val, char defunit) { __u64 mult = 1; int rc; + bool kernel_space; if (defunit != '1') { rc = get_mult(defunit, &mult); @@ -2139,7 +2185,10 @@ int lprocfs_str_with_units_to_s64(const char __user *buffer, return rc; } - return str_to_s64_internal(buffer, count, val, mult, true); + kernel_space = lprocfs_file_is_kernel(file); + + return str_to_s64_internal(buffer, count, val, mult, true, + kernel_space); } EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); @@ -2326,7 +2375,7 @@ 
ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, int chunk_mask, rc; __s64 val; - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -2356,8 +2405,9 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, } EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); -int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, - struct root_squash_info *squash, char *name) +int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, + char *name) { int rc; char kernbuf[64], *tmp, *errmsg; @@ -2368,7 +2418,7 @@ int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, errmsg = "string too long"; GOTO(failed_noprint, rc = -EINVAL); } - if (copy_from_user(kernbuf, buffer, count)) { + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed_noprint, rc = -EFAULT); } @@ -2418,7 +2468,8 @@ int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, EXPORT_SYMBOL(lprocfs_wr_root_squash); -int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, +int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, char *name) { int rc; @@ -2438,7 +2489,7 @@ int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, errmsg = "no memory"; GOTO(failed, rc = -ENOMEM); } - if (copy_from_user(kernbuf, buffer, count)) { + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed, rc = -EFAULT); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c index 46ad92df952f2..6d78831dd37fe 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -79,7 +79,7 @@ lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, * bytes into kbuf, to ensure that the string is NUL-terminated. * UUID_MAX should include a trailing NUL already. 
*/ - if (copy_from_user(kbuf, buffer, + if (lprocfs_copy_from_user(file, kbuf, buffer, min_t(unsigned long, BUFLEN - 1, count))) { count = -EFAULT; goto out; @@ -683,7 +683,7 @@ lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, __s64 val; LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -715,7 +715,7 @@ lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, __s64 val; LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -745,7 +745,7 @@ lprocfs_recovery_time_soft_seq_write(struct file *file, __s64 val; LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > INT_MAX) @@ -777,7 +777,7 @@ lprocfs_recovery_time_hard_seq_write(struct file *file, __s64 val; LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > INT_MAX) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 8182d872a8cec..dbd4fdcbd996c 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -1290,7 +1290,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, struct lustre_cfg *lcfg, void *data) { struct lprocfs_vars *var; - struct file fakefile; + struct file fakefile = {}; struct seq_file fake_seqfile; char *key, *sval; int i, keylen, vallen; @@ -1305,6 +1305,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, } /* fake a seq file so that var->fops->proc_write can work... */ + lprocfs_file_set_kernel(&fakefile); fakefile.private_data = &fake_seqfile; fake_seqfile.private = data; /* e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt @@ -1339,13 +1340,9 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, rc = -EROFS; if (var->fops && var->fops->proc_write) { - mm_segment_t oldfs; - oldfs = get_fs(); - set_fs(KERNEL_DS); rc = (var->fops->proc_write)(&fakefile, sval, vallen, NULL); - set_fs(oldfs); } break; } diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c index 5c9a8c6c2219c..d6123c61af113 100644 --- a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -58,7 +58,7 @@ static ssize_t osc_active_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > 1) @@ -96,7 +96,7 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, int adding, added, req_count; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 1 || val > OSC_MAX_RIF_MAX) @@ -152,7 +152,8 @@ static ssize_t osc_max_dirty_mb_seq_write(struct file *file, int rc; __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; @@ -203,13 +204,14 @@ osc_cached_mb_seq_write(struct file *file, const char __user *buffer, if (count >= sizeof(kernbuf)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; @@ -269,7 +271,7 @@ static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, if (obd == NULL) return 0; - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -340,7 +342,7 @@ static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, if (obd == NULL) return 0; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -375,7 +377,7 @@ static ssize_t osc_checksum_seq_write(struct file *file, if (obd == NULL) return 0; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -420,7 +422,7 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, if (count > sizeof(kernbuf) - 1) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; if (count > 0 && kernbuf[count - 1] == '\n') kernbuf[count - 1] = '\0'; @@ -455,7 +457,7 @@ static ssize_t osc_resend_count_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -491,7 +493,7 @@ static ssize_t osc_checksum_dump_seq_write(struct file *file, if (obd == NULL) return 0; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -519,7 +521,7 @@ static ssize_t osc_contention_seconds_seq_write(struct file 
*file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0 || val > INT_MAX) @@ -549,7 +551,7 @@ static ssize_t osc_lockless_truncate_seq_write(struct file *file, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index e13b9e2e4c8f5..5b5412e506317 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -309,7 +309,7 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, __s64 val; int rc; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -356,7 +356,7 @@ ptlrpc_lprocfs_threads_min_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; __s64 val; - int rc = lprocfs_str_to_s64(buffer, count, &val); + int rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -412,7 +412,7 @@ ptlrpc_lprocfs_threads_max_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; __s64 val; - int rc = lprocfs_str_to_s64(buffer, count, &val); + int rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -692,7 +692,7 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, */ cmd_copy = cmd; - if (copy_from_user(cmd, buffer, count)) + if (lprocfs_copy_from_user(file, cmd, buffer, count)) GOTO(out, rc = -EFAULT); cmd[count] = '\0'; @@ -1084,7 +1084,7 @@ ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, const char __user *buffer, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -1278,7 +1278,7 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, if (kbuf == NULL) return -ENOMEM; - if (copy_from_user(kbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kbuf, buffer, count)) GOTO(out, count = -EFAULT); kbuf[count] = 0; @@ -1344,7 +1344,7 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, int rc; __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c index 6a54fa0f775a5..7423e981d9e37 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -718,7 +718,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c index 77da1c1bfacf0..403b74efe6415 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -419,7 +419,7 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, * Helper for delay's seq_write functions. 
*/ static ssize_t -lprocfs_nrs_delay_seq_write_common(const char __user *buffer, +lprocfs_nrs_delay_seq_write_common(struct file *file, const char __user *buffer, unsigned int bufsize, size_t count, const char *var_name, unsigned int min_val, unsigned int max_val, @@ -443,7 +443,7 @@ lprocfs_nrs_delay_seq_write_common(const char __user *buffer, if (kernbuf == NULL) return -ENOMEM; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) GOTO(free_kernbuf, rc = -EFAULT); tmpsize = strlen("reg_") + strlen(var_name) + 1; @@ -598,7 +598,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(buffer, + return lprocfs_nrs_delay_seq_write_common(file, buffer, LPROCFS_NRS_DELAY_MIN_SIZE, count, LPROCFS_NRS_DELAY_MIN_NAME, @@ -681,7 +681,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(buffer, + return lprocfs_nrs_delay_seq_write_common(file, buffer, LPROCFS_NRS_DELAY_MAX_SIZE, count, LPROCFS_NRS_DELAY_MAX_NAME, @@ -765,7 +765,7 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(buffer, + return lprocfs_nrs_delay_seq_write_common(file, buffer, LPROCFS_NRS_DELAY_PCT_SIZE, count, LPROCFS_NRS_DELAY_PCT_NAME, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c index 857b333a50b01..96c3a6593d2dd 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -1297,7 +1297,7 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1512,7 +1512,7 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1772,7 +1772,7 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c index 6a042feb143e7..a81485554013b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -2965,7 +2965,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) GOTO(out_free_kernbuff, rc = -EINVAL); - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) GOTO(out_free_kernbuff, rc = -EFAULT); val = kernbuf; From 56a344bfd14ef713882de27c7d7f5c1274312d1b Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 1 Dec 2020 12:04:04 -0800 Subject: [PATCH 081/737] ena: Update to 2.4.0 This is the cumulative update from 2.2.10 to 2.4.0. 
Sourced from upstream git repo: https://github.com/amzn/amzn-drivers/

Change Log from Upstream:

2.2.10 -> 2.2.11
**New Features**
* Add stats printing to XDP queues
* Add queue counters for xdp actions (number of XDP_PASS/XDP_TX etc.)
* Add support for kernel v5.8
* Add interrupts unmask statistics to xdp queues
* Allow configuring RSS function and key

**Bug Fixes**
* Drop incorrect and irrelevant llq - device id mappings.
* Avoid remapping the llq mem bar every reset
* Prevent reset after device destruction
* Make DMA un-mapping of RX pages BIDIRECTIONAL to match their mapping. This is needed for the Traffic Mirroring feature.

**Minor Changes**
* Change license string to SPDX format
* Fix some spelling mistakes
* Change variables and macros names to something more informative
* Add masking and null checking in code that requires it
* Align all log messages from the driver to have the driver's name at the beginning
* Remove code duplications
* Removed unnecessary LLQ acceleration mode negotiation
* Switch to using regular bool values instead of atomics for unmask_interrupt (whose name was changed to masked_interrupts).

2.2.11 -> 2.3.0
**New Features**
* Introduce XDP redirect implementation
* Add support for new power management API for kernels >= 5.8
* Provide interface and device information in logging messages

**Bug Fixes**
* Performance: set initial DMA width to avoid Intel IOMMU issue.
* Fixed wrong expression in WARN_ON macro.
* Fix Sparse static checker errors in xdp code.
* Move napi declaration inside the loop to avoid Sparse static check warning.
* Don't init DIM work (dim.work) in case queue creation fails
* Make missed_tx stat incremental instead of reassigning it.
* Fix packet's addresses where rx_offset wasn't taken into account.
* Validate req_id in ena_com_rx_pkt().
* Make sure timer and reset routine won't be called after freeing device resources.
* Fix compilation error in RHEL 8.3

**Minor Changes**
* Initialize net_device earlier to allow its usage in netif_* and netdev_* prints. For more details see [https://www.spinics.net/lists/netdev/msg683250.html]
* Add function to increase stats to reduce code duplication.
* Ethtool: convert stat_offset to 8 bytes resolution to remove complex casts in the code.
* XDP: queue count check: Fix coding style nits.
* Cosmetic changes that fix alignment issues.
* Change ena license SPDX comment style in headers.
* Remove code duplication related to interrupt unmask stat.
* Fix spelling mistake in XDP stat query code.
* Move XDP_QUERY handling to the kernel for kernels >= 5.8.
* Conversion of README from markdown to rst format.
2.3.0 -> 2.4.0: **New Features** * Implement local page cache (LPC) system Signed-off-by: Suraj Jitindar Singh Reviewed-by: Frank van der Linden Reviewed-by: Samuel Mendoza-Jonas --- drivers/amazon/net/ena/Makefile | 2 +- drivers/amazon/net/ena/ena_admin_defs.h | 116 +-- drivers/amazon/net/ena/ena_com.c | 624 +++++++------ drivers/amazon/net/ena/ena_com.h | 71 +- drivers/amazon/net/ena/ena_common_defs.h | 31 +- drivers/amazon/net/ena/ena_eth_com.c | 173 ++-- drivers/amazon/net/ena/ena_eth_com.h | 54 +- drivers/amazon/net/ena/ena_eth_io_defs.h | 31 +- drivers/amazon/net/ena/ena_ethtool.c | 121 ++- drivers/amazon/net/ena/ena_netdev.c | 1066 +++++++++++++++------- drivers/amazon/net/ena/ena_netdev.h | 93 +- drivers/amazon/net/ena/ena_pci_id_tbl.h | 31 +- drivers/amazon/net/ena/ena_regs_defs.h | 31 +- drivers/amazon/net/ena/ena_sysfs.c | 33 +- drivers/amazon/net/ena/ena_sysfs.h | 31 +- drivers/amazon/net/ena/kcompat.h | 68 +- 16 files changed, 1428 insertions(+), 1148 deletions(-) diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 33b4a08d38f4a..375448827df60 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -1,7 +1,7 @@ # # Makefile for the Elastic Network Adapter (ENA) device drivers. # ENA Source is: https://github.com/amzn/amzn-drivers. -# Current ENA source is based on ena_linux_1.5.0 tag. +# Current ENA source is based on ena_linux_2.4.0 tag. # obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index bad88b822773f..c89c501895e46 100755 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _ENA_ADMIN_H_ #define _ENA_ADMIN_H_ @@ -35,6 +8,8 @@ #define ENA_ADMIN_EXTRA_PROPERTIES_STRING_LEN 32 #define ENA_ADMIN_EXTRA_PROPERTIES_COUNT 32 +#define ENA_ADMIN_RSS_KEY_PARTS 10 + enum ena_admin_aq_opcode { ENA_ADMIN_CREATE_SQ = 1, ENA_ADMIN_DESTROY_SQ = 2, @@ -57,6 +32,7 @@ enum ena_admin_aq_completion_status { ENA_ADMIN_RESOURCE_BUSY = 7, }; +/* subcommands for the set/get feature admin commands */ enum ena_admin_aq_feature_id { ENA_ADMIN_DEVICE_ATTRIBUTES = 1, ENA_ADMIN_MAX_QUEUES_NUM = 2, @@ -67,7 +43,7 @@ enum ena_admin_aq_feature_id { ENA_ADMIN_MAX_QUEUES_EXT = 7, ENA_ADMIN_RSS_HASH_FUNCTION = 10, ENA_ADMIN_STATELESS_OFFLOAD_CONFIG = 11, - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG = 12, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG = 12, ENA_ADMIN_MTU = 14, ENA_ADMIN_RSS_HASH_INPUT = 18, ENA_ADMIN_INTERRUPT_MODERATION = 20, @@ -199,7 +175,7 @@ struct ena_admin_acq_common_desc { u16 extended_status; /* indicates to the driver which AQ entry has been consumed by the - * device and could be reused + * device and could be reused */ u16 sq_head_indx; }; @@ -244,8 +220,8 @@ struct ena_admin_aq_create_sq_cmd { */ u8 sq_caps_3; - /* associated completion queue id. This CQ must be created prior to - * SQ creation + /* associated completion queue id. This CQ must be created prior to SQ + * creation */ u16 cq_idx; @@ -384,7 +360,7 @@ struct ena_admin_aq_get_stats_cmd { u16 queue_idx; /* device id, value 0xFFFF means mine. only privileged device can get - * stats of other device + * stats of other device */ u16 device_id; }; @@ -466,8 +442,8 @@ struct ena_admin_get_set_feature_common_desc { u8 feature_id; /* The driver specifies the max feature version it supports and the - * device responds with the currently supported feature version. The - * field is zero based + * device responds with the currently supported feature version. The + * field is zero based */ u8 feature_version; @@ -479,7 +455,9 @@ struct ena_admin_device_attr_feature_desc { u32 device_version; - /* bitmap of ena_admin_aq_feature_id */ + /* bitmap of ena_admin_aq_feature_id, which represents supported + * subcommands for the set/get feature admin commands. + */ u32 supported_features; u32 reserved3; @@ -565,32 +543,30 @@ struct ena_admin_feature_llq_desc { u32 max_llq_depth; - /* specify the header locations the device supports. bitfield of - * enum ena_admin_llq_header_location. + /* specify the header locations the device supports. bitfield of enum + * ena_admin_llq_header_location. */ u16 header_location_ctrl_supported; /* the header location the driver selected to use. */ u16 header_location_ctrl_enabled; - /* if inline header is specified - this is the size of descriptor - * list entry. If header in a separate ring is specified - this is - * the size of header ring entry. bitfield of enum - * ena_admin_llq_ring_entry_size. specify the entry sizes the device - * supports + /* if inline header is specified - this is the size of descriptor list + * entry. If header in a separate ring is specified - this is the size + * of header ring entry. bitfield of enum ena_admin_llq_ring_entry_size. + * specify the entry sizes the device supports */ u16 entry_size_ctrl_supported; /* the entry size the driver selected to use. */ u16 entry_size_ctrl_enabled; - /* valid only if inline header is specified. First entry associated - * with the packet includes descriptors and header. Rest of the - * entries occupied by descriptors. This parameter defines the max - * number of descriptors precedding the header in the first entry. 
- * The field is bitfield of enum - * ena_admin_llq_num_descs_before_header and specify the values the - * device supports + /* valid only if inline header is specified. First entry associated with + * the packet includes descriptors and header. Rest of the entries + * occupied by descriptors. This parameter defines the max number of + * descriptors precedding the header in the first entry. The field is + * bitfield of enum ena_admin_llq_num_descs_before_header and specify + * the values the device supports */ u16 desc_num_before_header_supported; @@ -598,7 +574,7 @@ struct ena_admin_feature_llq_desc { u16 desc_num_before_header_enabled; /* valid only if inline was chosen. bitfield of enum - * ena_admin_llq_stride_ctrl + * ena_admin_llq_stride_ctrl */ u16 descriptors_stride_ctrl_supported; @@ -608,8 +584,8 @@ struct ena_admin_feature_llq_desc { /* reserved */ u32 reserved1; - /* accelerated low latency queues requirment. driver needs to - * support those requirments in order to use accelerated llq + /* accelerated low latency queues requirement. driver needs to + * support those requirements in order to use accelerated llq */ struct ena_admin_accel_mode_req accel_mode; }; @@ -633,8 +609,8 @@ struct ena_admin_queue_ext_feature_fields { u32 max_tx_header_size; - /* Maximum Descriptors number, including meta descriptor, allowed for - * a single Tx packet + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet */ u16 max_per_packet_tx_descs; @@ -657,8 +633,8 @@ struct ena_admin_queue_feature_desc { u32 max_header_size; - /* Maximum Descriptors number, including meta descriptor, allowed for - * a single Tx packet + /* Maximum Descriptors number, including meta descriptor, allowed for a + * single Tx packet */ u16 max_packet_tx_descs; @@ -754,11 +730,11 @@ enum ena_admin_hash_functions { }; struct ena_admin_feature_rss_flow_hash_control { - u32 keys_num; + u32 key_parts; u32 reserved; - u32 key[10]; + u32 key[ENA_ADMIN_RSS_KEY_PARTS]; }; struct ena_admin_feature_rss_flow_hash_function { @@ -893,11 +869,12 @@ struct ena_admin_host_info { u16 reserved; - /* 0 : mutable_rss_table_size + /* 0 : reserved * 1 : rx_offset * 2 : interrupt_moderation - * 3 : map_rx_buf_bidirectional - * 31:4 : reserved + * 3 : rx_buf_mirroring + * 4 : rss_configurable_function_key + * 31:5 : reserved */ u32 driver_supported_features; }; @@ -979,7 +956,7 @@ struct ena_admin_queue_ext_feature_desc { struct ena_admin_queue_ext_feature_fields max_queue_ext; u32 raw[10]; - } ; + }; }; struct ena_admin_get_feat_resp { @@ -1062,7 +1039,7 @@ struct ena_admin_set_feat_resp { struct ena_admin_aenq_common_desc { u16 group; - u16 syndrom; + u16 syndrome; /* 0 : phase * 7:1 : reserved - MBZ @@ -1086,7 +1063,7 @@ enum ena_admin_aenq_group { ENA_ADMIN_AENQ_GROUPS_NUM = 5, }; -enum ena_admin_aenq_notification_syndrom { +enum ena_admin_aenq_notification_syndrome { ENA_ADMIN_SUSPEND = 0, ENA_ADMIN_RESUME = 1, ENA_ADMIN_UPDATE_HINTS = 2, @@ -1215,13 +1192,14 @@ struct ena_admin_ena_mmio_req_read_less_resp { #define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) #define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 #define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) -#define ENA_ADMIN_HOST_INFO_MUTABLE_RSS_TABLE_SIZE_MASK BIT(0) #define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1 #define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1) #define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 #define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) -#define ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_SHIFT 3 -#define 
ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_MASK BIT(3) +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_SHIFT 3 +#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3) +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4 +#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) /* feature_rss_ind_table */ #define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 9eb3a2fcad1b9..7a87dfb2a2a56 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -1,33 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "ena_com.h" @@ -62,9 +35,9 @@ #define ENA_REGS_ADMIN_INTR_MASK 1 -#define ENA_MIN_POLL_US 100 +#define ENA_MIN_ADMIN_POLL_US 100 -#define ENA_MAX_POLL_US 5000 +#define ENA_MAX_ADMIN_POLL_US 5000 /*****************************************************************************/ /*****************************************************************************/ @@ -98,7 +71,8 @@ static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, dma_addr_t addr) { if ((addr & GENMASK_ULL(ena_dev->dma_addr_bits - 1, 0)) != addr) { - pr_err("dma address has more bits that the device supports\n"); + netdev_err(ena_dev->net_device, + "DMA address has more bits that the device supports\n"); return -EINVAL; } @@ -108,16 +82,17 @@ static int ena_com_mem_addr_set(struct ena_com_dev *ena_dev, return 0; } -static int ena_com_admin_init_sq(struct ena_com_admin_queue *queue) +static int ena_com_admin_init_sq(struct ena_com_admin_queue *admin_queue) { - struct ena_com_admin_sq *sq = &queue->sq; - u16 size = ADMIN_SQ_SIZE(queue->q_depth); + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + struct ena_com_admin_sq *sq = &admin_queue->sq; + u16 size = ADMIN_SQ_SIZE(admin_queue->q_depth); - sq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &sq->dma_addr, - GFP_KERNEL); + sq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, + &sq->dma_addr, GFP_KERNEL); if (!sq->entries) { - pr_err("memory allocation failed\n"); + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -130,16 +105,17 @@ static int ena_com_admin_init_sq(struct ena_com_admin_queue *queue) return 0; } -static int ena_com_admin_init_cq(struct ena_com_admin_queue *queue) +static int ena_com_admin_init_cq(struct ena_com_admin_queue *admin_queue) { - struct ena_com_admin_cq *cq = &queue->cq; - u16 size = ADMIN_CQ_SIZE(queue->q_depth); + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + struct ena_com_admin_cq *cq = &admin_queue->cq; + u16 size = ADMIN_CQ_SIZE(admin_queue->q_depth); - cq->entries = dma_zalloc_coherent(queue->q_dmadev, size, &cq->dma_addr, - GFP_KERNEL); + cq->entries = dma_zalloc_coherent(admin_queue->q_dmadev, size, + &cq->dma_addr, GFP_KERNEL); if (!cq->entries) { - pr_err("memory allocation failed\n"); + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -149,20 +125,20 @@ static int ena_com_admin_init_cq(struct ena_com_admin_queue *queue) return 0; } -static int ena_com_admin_init_aenq(struct ena_com_dev *dev, +static int ena_com_admin_init_aenq(struct ena_com_dev *ena_dev, struct ena_aenq_handlers *aenq_handlers) { - struct ena_com_aenq *aenq = &dev->aenq; + struct ena_com_aenq *aenq = &ena_dev->aenq; u32 addr_low, addr_high, aenq_caps; u16 size; - dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; + ena_dev->aenq.q_depth = ENA_ASYNC_QUEUE_DEPTH; size = ADMIN_AENQ_SIZE(ENA_ASYNC_QUEUE_DEPTH); - aenq->entries = dma_zalloc_coherent(dev->dmadev, size, &aenq->dma_addr, - GFP_KERNEL); + aenq->entries = dma_zalloc_coherent(ena_dev->dmadev, size, + &aenq->dma_addr, GFP_KERNEL); if (!aenq->entries) { - pr_err("memory allocation failed\n"); + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -172,18 +148,19 @@ static int ena_com_admin_init_aenq(struct ena_com_dev *dev, addr_low = ENA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); addr_high = ENA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); - writel(addr_low, dev->reg_bar + ENA_REGS_AENQ_BASE_LO_OFF); - writel(addr_high, dev->reg_bar + ENA_REGS_AENQ_BASE_HI_OFF); + writel(addr_low, 
ena_dev->reg_bar + ENA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, ena_dev->reg_bar + ENA_REGS_AENQ_BASE_HI_OFF); aenq_caps = 0; - aenq_caps |= dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; + aenq_caps |= ena_dev->aenq.q_depth & ENA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; aenq_caps |= (sizeof(struct ena_admin_aenq_entry) << ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & ENA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; - writel(aenq_caps, dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); + writel(aenq_caps, ena_dev->reg_bar + ENA_REGS_AENQ_CAPS_OFF); if (unlikely(!aenq_handlers)) { - pr_err("aenq handlers pointer is NULL\n"); + netdev_err(ena_dev->net_device, + "AENQ handlers pointer is NULL\n"); return -EINVAL; } @@ -199,31 +176,34 @@ static void comp_ctxt_release(struct ena_com_admin_queue *queue, atomic_dec(&queue->outstanding_cmds); } -static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue, +static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *admin_queue, u16 command_id, bool capture) { - if (unlikely(command_id >= queue->q_depth)) { - pr_err("command id is larger than the queue size. cmd_id: %u queue size %d\n", - command_id, queue->q_depth); + if (unlikely(command_id >= admin_queue->q_depth)) { + netdev_err(admin_queue->ena_dev->net_device, + "Command id is larger than the queue size. cmd_id: %u queue size %d\n", + command_id, admin_queue->q_depth); return NULL; } - if (unlikely(!queue->comp_ctx)) { - pr_err("Completion context is NULL\n"); + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(admin_queue->ena_dev->net_device, + "Completion context is NULL\n"); return NULL; } - if (unlikely(queue->comp_ctx[command_id].occupied && capture)) { - pr_err("Completion context is occupied\n"); + if (unlikely(admin_queue->comp_ctx[command_id].occupied && capture)) { + netdev_err(admin_queue->ena_dev->net_device, + "Completion context is occupied\n"); return NULL; } if (capture) { - atomic_inc(&queue->outstanding_cmds); - queue->comp_ctx[command_id].occupied = true; + atomic_inc(&admin_queue->outstanding_cmds); + admin_queue->comp_ctx[command_id].occupied = true; } - return &queue->comp_ctx[command_id]; + return &admin_queue->comp_ctx[command_id]; } static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queue *admin_queue, @@ -244,7 +224,8 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu /* In case of queue FULL */ cnt = (u16)atomic_read(&admin_queue->outstanding_cmds); if (cnt >= admin_queue->q_depth) { - pr_debug("admin queue is full.\n"); + netdev_dbg(admin_queue->ena_dev->net_device, + "Admin queue is full.\n"); admin_queue->stats.out_of_space++; return ERR_PTR(-ENOSPC); } @@ -284,20 +265,22 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu return comp_ctx; } -static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *queue) +static int ena_com_init_comp_ctxt(struct ena_com_admin_queue *admin_queue) { - size_t size = queue->q_depth * sizeof(struct ena_comp_ctx); + struct ena_com_dev *ena_dev = admin_queue->ena_dev; + size_t size = admin_queue->q_depth * sizeof(struct ena_comp_ctx); struct ena_comp_ctx *comp_ctx; u16 i; - queue->comp_ctx = devm_kzalloc(queue->q_dmadev, size, GFP_KERNEL); - if (unlikely(!queue->comp_ctx)) { - pr_err("memory allocation failed\n"); + admin_queue->comp_ctx = + devm_kzalloc(admin_queue->q_dmadev, size, GFP_KERNEL); + if (unlikely(!admin_queue->comp_ctx)) { + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } - for (i = 
0; i < queue->q_depth; i++) { - comp_ctx = get_comp_ctxt(queue, i, false); + for (i = 0; i < admin_queue->q_depth; i++) { + comp_ctx = get_comp_ctxt(admin_queue, i, false); if (comp_ctx) init_completion(&comp_ctx->wait_event); } @@ -364,7 +347,8 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, } if (!io_sq->desc_addr.virt_addr) { - pr_err("memory allocation failed\n"); + netdev_err(ena_dev->net_device, + "Memory allocation failed\n"); return -ENOMEM; } } @@ -390,7 +374,8 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); if (!io_sq->bounce_buf_ctrl.base_buffer) { - pr_err("bounce buffer memory allocation failed\n"); + netdev_err(ena_dev->net_device, + "Bounce buffer memory allocation failed\n"); return -ENOMEM; } @@ -451,7 +436,7 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, } if (!io_cq->cdesc_addr.virt_addr) { - pr_err("memory allocation failed\n"); + netdev_err(ena_dev->net_device, "Memory allocation failed\n"); return -ENOMEM; } @@ -472,7 +457,8 @@ static void ena_com_handle_single_admin_completion(struct ena_com_admin_queue *a comp_ctx = get_comp_ctxt(admin_queue, cmd_id, false); if (unlikely(!comp_ctx)) { - pr_err("comp_ctx is NULL. Changing the admin queue running state\n"); + netdev_err(admin_queue->ena_dev->net_device, + "comp_ctx is NULL. Changing the admin queue running state\n"); admin_queue->running_state = false; return; } @@ -524,10 +510,12 @@ static void ena_com_handle_admin_completion(struct ena_com_admin_queue *admin_qu admin_queue->stats.completed_cmd += comp_num; } -static int ena_com_comp_status_to_errno(u8 comp_status) +static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, + u8 comp_status) { if (unlikely(comp_status != 0)) - pr_err("admin command failed[%u]\n", comp_status); + netdev_err(admin_queue->ena_dev->net_device, + "Admin command failed[%u]\n", comp_status); switch (comp_status) { case ENA_ADMIN_SUCCESS: @@ -541,15 +529,17 @@ static int ena_com_comp_status_to_errno(u8 comp_status) case ENA_ADMIN_ILLEGAL_PARAMETER: case ENA_ADMIN_UNKNOWN_ERROR: return -EINVAL; + case ENA_ADMIN_RESOURCE_BUSY: + return -EAGAIN; } return -EINVAL; } -static inline void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) +static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) { - delay_us = max_t(u32, ENA_MIN_POLL_US, delay_us); - delay_us = min_t(u32, delay_us * (1 << exp), ENA_MAX_POLL_US); + delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); + delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US); usleep_range(delay_us, 2 * delay_us); } @@ -572,7 +562,8 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c break; if (time_is_before_jiffies(timeout)) { - pr_err("Wait for completion (polling) timeout\n"); + netdev_err(admin_queue->ena_dev->net_device, + "Wait for completion (polling) timeout\n"); /* ENA didn't have any completion */ spin_lock_irqsave(&admin_queue->q_lock, flags); admin_queue->stats.no_completion++; @@ -583,11 +574,13 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c goto err; } - ena_delay_exponential_backoff_us(exp++, admin_queue->ena_dev->ena_min_poll_delay_us); + ena_delay_exponential_backoff_us(exp++, + admin_queue->ena_dev->ena_min_poll_delay_us); } if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { - pr_err("Command was aborted\n"); + netdev_err(admin_queue->ena_dev->net_device, + "Command was aborted\n"); 
spin_lock_irqsave(&admin_queue->q_lock, flags); admin_queue->stats.aborted_cmd++; spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -598,13 +591,13 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c WARN(comp_ctx->status != ENA_CMD_COMPLETED, "Invalid comp status %d\n", comp_ctx->status); - ret = ena_com_comp_status_to_errno(comp_ctx->comp_status); + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); err: comp_ctxt_release(admin_queue, comp_ctx); return ret; } -/** +/* * Set the LLQ configurations of the firmware * * The driver provides only the enabled feature values to the device, @@ -629,13 +622,9 @@ static int ena_com_set_llq(struct ena_com_dev *ena_dev) cmd.u.llq.desc_num_before_header_enabled = llq_info->descs_num_before_header; cmd.u.llq.descriptors_stride_ctrl_enabled = llq_info->desc_stride_ctrl; - if (llq_info->disable_meta_caching) - cmd.u.llq.accel_mode.u.set.enabled_flags |= - BIT(ENA_ADMIN_DISABLE_META_CACHING); - - if (llq_info->max_entries_in_tx_burst) - cmd.u.llq.accel_mode.u.set.enabled_flags |= - BIT(ENA_ADMIN_LIMIT_TX_BURST); + cmd.u.llq.accel_mode.u.set.enabled_flags = + BIT(ENA_ADMIN_DISABLE_META_CACHING) | + BIT(ENA_ADMIN_LIMIT_TX_BURST); ret = ena_com_execute_admin_command(admin_queue, (struct ena_admin_aq_entry *)&cmd, @@ -644,7 +633,8 @@ static int ena_com_set_llq(struct ena_com_dev *ena_dev) sizeof(resp)); if (unlikely(ret)) - pr_err("Failed to set LLQ configurations: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to set LLQ configurations: %d\n", ret); return ret; } @@ -654,6 +644,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, struct ena_llq_configurations *llq_default_cfg) { struct ena_com_llq_info *llq_info = &ena_dev->llq_info; + struct ena_admin_accel_mode_get llq_accel_mode_get; u16 supported_feat; int rc; @@ -665,8 +656,9 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, llq_info->header_location_ctrl = llq_default_cfg->llq_header_location; } else { - pr_err("Invalid header location control, supported: 0x%x\n", - supported_feat); + netdev_err(ena_dev->net_device, + "Invalid header location control, supported: 0x%x\n", + supported_feat); return -EINVAL; } @@ -680,14 +672,16 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, } else if (supported_feat & ENA_ADMIN_SINGLE_DESC_PER_ENTRY) { llq_info->desc_stride_ctrl = ENA_ADMIN_SINGLE_DESC_PER_ENTRY; } else { - pr_err("Invalid desc_stride_ctrl, supported: 0x%x\n", - supported_feat); + netdev_err(ena_dev->net_device, + "Invalid desc_stride_ctrl, supported: 0x%x\n", + supported_feat); return -EINVAL; } - pr_err("Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", - llq_default_cfg->llq_stride_ctrl, supported_feat, - llq_info->desc_stride_ctrl); + netdev_err(ena_dev->net_device, + "Default llq stride ctrl is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_stride_ctrl, + supported_feat, llq_info->desc_stride_ctrl); } } else { llq_info->desc_stride_ctrl = 0; @@ -708,20 +702,23 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, llq_info->desc_list_entry_size_ctrl = ENA_ADMIN_LIST_ENTRY_SIZE_256B; llq_info->desc_list_entry_size = 256; } else { - pr_err("Invalid entry_size_ctrl, supported: 0x%x\n", - supported_feat); + netdev_err(ena_dev->net_device, + "Invalid entry_size_ctrl, supported: 0x%x\n", + supported_feat); return -EINVAL; } - pr_err("Default llq ring entry 
size is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", - llq_default_cfg->llq_ring_entry_size, supported_feat, - llq_info->desc_list_entry_size); + netdev_err(ena_dev->net_device, + "Default llq ring entry size is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_ring_entry_size, supported_feat, + llq_info->desc_list_entry_size); } if (unlikely(llq_info->desc_list_entry_size & 0x7)) { /* The desc list entry size should be whole multiply of 8 * This requirement comes from __iowrite64_copy() */ - pr_err("illegal entry size %d\n", llq_info->desc_list_entry_size); + netdev_err(ena_dev->net_device, "Illegal entry size %d\n", + llq_info->desc_list_entry_size); return -EINVAL; } @@ -744,28 +741,33 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, } else if (supported_feat & ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8) { llq_info->descs_num_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_8; } else { - pr_err("Invalid descs_num_before_header, supported: 0x%x\n", - supported_feat); + netdev_err(ena_dev->net_device, + "Invalid descs_num_before_header, supported: 0x%x\n", + supported_feat); return -EINVAL; } - pr_err("Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", - llq_default_cfg->llq_num_decs_before_header, - supported_feat, llq_info->descs_num_before_header); + netdev_err(ena_dev->net_device, + "Default llq num descs before header is not supported, performing fallback, default: 0x%x, supported: 0x%x, used: 0x%x\n", + llq_default_cfg->llq_num_decs_before_header, + supported_feat, llq_info->descs_num_before_header); } /* Check for accelerated queue supported */ + llq_accel_mode_get = llq_features->accel_mode.u.get; + llq_info->disable_meta_caching = - llq_features->accel_mode.u.get.supported_flags & - BIT(ENA_ADMIN_DISABLE_META_CACHING); + !!(llq_accel_mode_get.supported_flags & + BIT(ENA_ADMIN_DISABLE_META_CACHING)); - if (llq_features->accel_mode.u.get.supported_flags & BIT(ENA_ADMIN_LIMIT_TX_BURST)) + if (llq_accel_mode_get.supported_flags & BIT(ENA_ADMIN_LIMIT_TX_BURST)) llq_info->max_entries_in_tx_burst = - llq_features->accel_mode.u.get.max_tx_burst_size / + llq_accel_mode_get.max_tx_burst_size / llq_default_cfg->llq_ring_entry_size_value; rc = ena_com_set_llq(ena_dev); if (rc) - pr_err("Cannot set LLQ configuration: %d\n", rc); + netdev_err(ena_dev->net_device, + "Cannot set LLQ configuration: %d\n", rc); return rc; } @@ -792,15 +794,17 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com spin_unlock_irqrestore(&admin_queue->q_lock, flags); if (comp_ctx->status == ENA_CMD_COMPLETED) { - pr_err("The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", - comp_ctx->cmd_opcode, - admin_queue->auto_polling ? "ON" : "OFF"); + netdev_err(admin_queue->ena_dev->net_device, + "The ena device sent a completion but the driver didn't receive a MSI-X interrupt (cmd %d), autopolling mode is %s\n", + comp_ctx->cmd_opcode, + admin_queue->auto_polling ? 
"ON" : "OFF"); /* Check if fallback to polling is enabled */ if (admin_queue->auto_polling) admin_queue->polling = true; } else { - pr_err("The ena device didn't send a completion for the admin cmd %d status %d\n", - comp_ctx->cmd_opcode, comp_ctx->status); + netdev_err(admin_queue->ena_dev->net_device, + "The ena device didn't send a completion for the admin cmd %d status %d\n", + comp_ctx->cmd_opcode, comp_ctx->status); } /* Check if shifted to polling mode. * This will happen if there is a completion without an interrupt @@ -813,7 +817,7 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com } } - ret = ena_com_comp_status_to_errno(comp_ctx->comp_status); + ret = ena_com_comp_status_to_errno(admin_queue, comp_ctx->comp_status); err: comp_ctxt_release(admin_queue, comp_ctx); return ret; @@ -860,15 +864,17 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) } if (unlikely(i == timeout)) { - pr_err("reading reg failed for timeout. expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n", - mmio_read->seq_num, offset, read_resp->req_id, - read_resp->reg_off); + netdev_err(ena_dev->net_device, + "Reading reg failed for timeout. expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); ret = ENA_MMIO_READ_TIMEOUT; goto err; } if (read_resp->reg_off != offset) { - pr_err("Read failure: wrong offset provided\n"); + netdev_err(ena_dev->net_device, + "Read failure: wrong offset provided\n"); ret = ENA_MMIO_READ_TIMEOUT; } else { ret = read_resp->reg_val; @@ -927,7 +933,8 @@ static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev, sizeof(destroy_resp)); if (unlikely(ret && (ret != -ENODEV))) - pr_err("failed to destroy io sq error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to destroy io sq error: %d\n", ret); return ret; } @@ -977,7 +984,8 @@ static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { - pr_err("Reg read timeout occurred\n"); + netdev_err(ena_dev->net_device, + "Reg read timeout occurred\n"); return -ETIME; } @@ -1017,7 +1025,8 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, int ret; if (!ena_com_check_supported_feature_id(ena_dev, feature_id)) { - pr_debug("Feature %d isn't supported\n", feature_id); + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + feature_id); return -EOPNOTSUPP; } @@ -1036,7 +1045,7 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, &get_cmd.control_buffer.address, control_buf_dma_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } @@ -1053,8 +1062,9 @@ static int ena_com_get_feature_ex(struct ena_com_dev *ena_dev, sizeof(*get_resp)); if (unlikely(ret)) - pr_err("Failed to submit get_feature command %d error: %d\n", - feature_id, ret); + netdev_err(ena_dev->net_device, + "Failed to submit get_feature command %d error: %d\n", + feature_id, ret); return ret; } @@ -1083,11 +1093,10 @@ static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) (ena_dev->rss).hash_key; netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key)); - /* The key is stored in the device in u32 array - * as well as the API requires the key to be passed in this - * format. 
Thus the size of our array should be divided by 4 + /* The key buffer is stored in the device in an array of + * uint32 elements. */ - hash_key->keys_num = sizeof(hash_key->key) / sizeof(u32); + hash_key->key_parts = ENA_ADMIN_RSS_KEY_PARTS; } static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) @@ -1151,15 +1160,16 @@ static int ena_com_indirect_table_allocate(struct ena_com_dev *ena_dev, int ret; ret = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, 0); + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, 0); if (unlikely(ret)) return ret; if ((get_resp.u.ind_table.min_size > log_size) || (get_resp.u.ind_table.max_size < log_size)) { - pr_err("indirect table size doesn't fit. requested size: %d while min is:%d and max %d\n", - 1 << log_size, 1 << get_resp.u.ind_table.min_size, - 1 << get_resp.u.ind_table.max_size); + netdev_err(ena_dev->net_device, + "Indirect table size doesn't fit. requested size: %d while min is:%d and max %d\n", + 1 << log_size, 1 << get_resp.u.ind_table.min_size, + 1 << get_resp.u.ind_table.max_size); return -EINVAL; } @@ -1250,7 +1260,8 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, &create_cmd.sq_ba, io_sq->desc_addr.phys_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, + "Memory address set failed\n"); return ret; } } @@ -1261,7 +1272,8 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, (struct ena_admin_acq_entry *)&cmd_completion, sizeof(cmd_completion)); if (unlikely(ret)) { - pr_err("Failed to create IO SQ. error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to create IO SQ. error: %d\n", ret); return ret; } @@ -1279,7 +1291,8 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, cmd_completion.llq_descriptors_offset); } - pr_debug("created sq[%u], depth[%u]\n", io_sq->idx, io_sq->q_depth); + netdev_dbg(ena_dev->net_device, "Created sq[%u], depth[%u]\n", + io_sq->idx, io_sq->q_depth); return ret; } @@ -1313,7 +1326,8 @@ static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution; if (unlikely(!intr_delay_resolution)) { - pr_err("Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n"); + netdev_err(ena_dev->net_device, + "Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n"); intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; } @@ -1349,21 +1363,25 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, comp, comp_size); if (IS_ERR(comp_ctx)) { if (comp_ctx == ERR_PTR(-ENODEV)) - pr_debug("Failed to submit command [%ld]\n", - PTR_ERR(comp_ctx)); + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to submit command [%ld]\n", + PTR_ERR(comp_ctx)); else - pr_err("Failed to submit command [%ld]\n", - PTR_ERR(comp_ctx)); + netdev_err(admin_queue->ena_dev->net_device, + "Failed to submit command [%ld]\n", + PTR_ERR(comp_ctx)); - return PTR_ERR(comp_ctx); + return (int)PTR_ERR(comp_ctx); } ret = ena_com_wait_and_process_admin_cq(comp_ctx, admin_queue); if (unlikely(ret)) { if (admin_queue->running_state) - pr_err("Failed to process command. ret = %d\n", ret); + netdev_err(admin_queue->ena_dev->net_device, + "Failed to process command. ret = %d\n", ret); else - pr_debug("Failed to process command. ret = %d\n", ret); + netdev_dbg(admin_queue->ena_dev->net_device, + "Failed to process command. 
ret = %d\n", ret); } return ret; } @@ -1392,7 +1410,7 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev, &create_cmd.cq_ba, io_cq->cdesc_addr.phys_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } @@ -1402,7 +1420,8 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev, (struct ena_admin_acq_entry *)&cmd_completion, sizeof(cmd_completion)); if (unlikely(ret)) { - pr_err("Failed to create IO CQ. error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to create IO CQ. error: %d\n", ret); return ret; } @@ -1421,7 +1440,8 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev, (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + cmd_completion.numa_node_register_offset); - pr_debug("created cq[%u], depth[%u]\n", io_cq->idx, io_cq->q_depth); + netdev_dbg(ena_dev->net_device, "Created cq[%u], depth[%u]\n", + io_cq->idx, io_cq->q_depth); return ret; } @@ -1431,8 +1451,9 @@ int ena_com_get_io_handlers(struct ena_com_dev *ena_dev, u16 qid, struct ena_com_io_cq **io_cq) { if (qid >= ENA_TOTAL_NUM_QUEUES) { - pr_err("Invalid queue number %d but the max is %d\n", qid, - ENA_TOTAL_NUM_QUEUES); + netdev_err(ena_dev->net_device, + "Invalid queue number %d but the max is %d\n", qid, + ENA_TOTAL_NUM_QUEUES); return -EINVAL; } @@ -1498,7 +1519,8 @@ int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, sizeof(destroy_resp)); if (unlikely(ret && (ret != -ENODEV))) - pr_err("Failed to destroy IO CQ. error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to destroy IO CQ. error: %d\n", ret); return ret; } @@ -1540,13 +1562,14 @@ int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) ret = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_AENQ_CONFIG, 0); if (ret) { - pr_info("Can't get aenq configuration\n"); + dev_info(ena_dev->dmadev, "Can't get aenq configuration\n"); return ret; } if ((get_resp.u.aenq.supported_groups & groups_flag) != groups_flag) { - pr_warn("Trying to set unsupported aenq events. supported flag: 0x%x asked flag: 0x%x\n", - get_resp.u.aenq.supported_groups, groups_flag); + netdev_warn(ena_dev->net_device, + "Trying to set unsupported aenq events. 
supported flag: 0x%x asked flag: 0x%x\n", + get_resp.u.aenq.supported_groups, groups_flag); return -EOPNOTSUPP; } @@ -1565,7 +1588,8 @@ int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) sizeof(resp)); if (unlikely(ret)) - pr_err("Failed to config AENQ ret: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to config AENQ ret: %d\n", ret); return ret; } @@ -1573,20 +1597,21 @@ int ena_com_set_aenq_config(struct ena_com_dev *ena_dev, u32 groups_flag) int ena_com_get_dma_width(struct ena_com_dev *ena_dev) { u32 caps = ena_com_reg_bar_read32(ena_dev, ENA_REGS_CAPS_OFF); - int width; + u32 width; if (unlikely(caps == ENA_MMIO_READ_TIMEOUT)) { - pr_err("Reg read timeout occurred\n"); + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); return -ETIME; } width = (caps & ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >> ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT; - pr_debug("ENA dma width: %d\n", width); + netdev_dbg(ena_dev->net_device, "ENA dma width: %d\n", width); if ((width < 32) || width > ENA_MAX_PHYS_ADDR_SIZE_BITS) { - pr_err("DMA width illegal value: %d\n", width); + netdev_err(ena_dev->net_device, "DMA width illegal value: %d\n", + width); return -EINVAL; } @@ -1610,23 +1635,24 @@ int ena_com_validate_version(struct ena_com_dev *ena_dev) if (unlikely((ver == ENA_MMIO_READ_TIMEOUT) || (ctrl_ver == ENA_MMIO_READ_TIMEOUT))) { - pr_err("Reg read timeout occurred\n"); + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); return -ETIME; } - pr_info("ena device version: %d.%d\n", - (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> - ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, - ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); + dev_info(ena_dev->dmadev, "ENA device version: %d.%d\n", + (ver & ENA_REGS_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_VERSION_MAJOR_VERSION_SHIFT, + ver & ENA_REGS_VERSION_MINOR_VERSION_MASK); - pr_info("ena controller version: %d.%d.%d implementation version %d\n", - (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> - ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, - (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> - ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, - (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), - (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> - ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); + dev_info(ena_dev->dmadev, + "ENA controller version: %d.%d.%d implementation version %d\n", + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >> + ENA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), + (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> + ENA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); ctrl_ver_masked = (ctrl_ver & ENA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) | @@ -1635,13 +1661,27 @@ int ena_com_validate_version(struct ena_com_dev *ena_dev) /* Validate the ctrl version without the implementation ID */ if (ctrl_ver_masked < MIN_ENA_CTRL_VER) { - pr_err("ENA ctrl version is lower than the minimal ctrl version the driver supports\n"); + netdev_err(ena_dev->net_device, + "ENA ctrl version is lower than the minimal ctrl version the driver supports\n"); return -1; } return 0; } +static void +ena_com_free_ena_admin_queue_comp_ctx(struct ena_com_dev *ena_dev, + struct ena_com_admin_queue *admin_queue) + +{ + if (!admin_queue->comp_ctx) + return; + + devm_kfree(ena_dev->dmadev, 
admin_queue->comp_ctx); + + admin_queue->comp_ctx = NULL; +} + void ena_com_admin_destroy(struct ena_com_dev *ena_dev) { struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; @@ -1650,9 +1690,8 @@ void ena_com_admin_destroy(struct ena_com_dev *ena_dev) struct ena_com_aenq *aenq = &ena_dev->aenq; u16 size; - if (admin_queue->comp_ctx) - devm_kfree(ena_dev->dmadev, admin_queue->comp_ctx); - admin_queue->comp_ctx = NULL; + ena_com_free_ena_admin_queue_comp_ctx(ena_dev, admin_queue); + size = ADMIN_SQ_SIZE(admin_queue->q_depth); if (sq->entries) dma_free_coherent(ena_dev->dmadev, size, sq->entries, @@ -1683,7 +1722,7 @@ void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling) ena_dev->admin_queue.polling = polling; } -bool ena_com_get_admin_polling_mode(struct ena_com_dev * ena_dev) +bool ena_com_get_admin_polling_mode(struct ena_com_dev *ena_dev) { return ena_dev->admin_queue.polling; } @@ -1761,12 +1800,13 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, dev_sts = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); if (unlikely(dev_sts == ENA_MMIO_READ_TIMEOUT)) { - pr_err("Reg read timeout occurred\n"); + netdev_err(ena_dev->net_device, "Reg read timeout occurred\n"); return -ETIME; } if (!(dev_sts & ENA_REGS_DEV_STS_READY_MASK)) { - pr_err("Device isn't ready, abort com init\n"); + netdev_err(ena_dev->net_device, + "Device isn't ready, abort com init\n"); return -ENODEV; } @@ -1844,8 +1884,9 @@ int ena_com_create_io_queue(struct ena_com_dev *ena_dev, int ret; if (ctx->qid >= ENA_TOTAL_NUM_QUEUES) { - pr_err("Qid (%d) is bigger than max num of queues (%d)\n", - ctx->qid, ENA_TOTAL_NUM_QUEUES); + netdev_err(ena_dev->net_device, + "Qid (%d) is bigger than max num of queues (%d)\n", + ctx->qid, ENA_TOTAL_NUM_QUEUES); return -EINVAL; } @@ -1903,8 +1944,9 @@ void ena_com_destroy_io_queue(struct ena_com_dev *ena_dev, u16 qid) struct ena_com_io_cq *io_cq; if (qid >= ENA_TOTAL_NUM_QUEUES) { - pr_err("Qid (%d) is bigger than max num of queues (%d)\n", qid, - ENA_TOTAL_NUM_QUEUES); + netdev_err(ena_dev->net_device, + "Qid (%d) is bigger than max num of queues (%d)\n", + qid, ENA_TOTAL_NUM_QUEUES); return; } @@ -1936,6 +1978,7 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, memcpy(&get_feat_ctx->dev_attr, &get_resp.u.dev_attr, sizeof(get_resp.u.dev_attr)); + ena_dev->supported_features = get_resp.u.dev_attr.supported_features; if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { @@ -2003,17 +2046,6 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, else return rc; - rc = ena_com_get_feature(ena_dev, &get_resp, - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, 0); - if (!rc) - memcpy(&get_feat_ctx->ind_table, &get_resp.u.ind_table, - sizeof(get_resp.u.ind_table)); - else if (rc == -EOPNOTSUPP) - memset(&get_feat_ctx->ind_table, 0x0, - sizeof(get_feat_ctx->ind_table)); - else - return rc; - return 0; } @@ -2025,10 +2057,10 @@ void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev) /* ena_handle_specific_aenq_event: * return the handler that is relevant to the specific event group */ -static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *dev, +static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *ena_dev, u16 group) { - struct ena_aenq_handlers *aenq_handlers = dev->aenq.aenq_handlers; + struct ena_aenq_handlers *aenq_handlers = ena_dev->aenq.aenq_handlers; if ((group < ENA_MAX_HANDLERS) && aenq_handlers->handlers[group]) return aenq_handlers->handlers[group]; @@ -2040,11 +2072,11 @@ 
static ena_aenq_handler ena_com_get_specific_aenq_cb(struct ena_com_dev *dev, * handles the aenq incoming events. * pop events from the queue and apply the specific handler */ -void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data) { struct ena_admin_aenq_entry *aenq_e; struct ena_admin_aenq_common_desc *aenq_common; - struct ena_com_aenq *aenq = &dev->aenq; + struct ena_com_aenq *aenq = &ena_dev->aenq; u64 timestamp; ena_aenq_handler handler_cb; u16 masked_head, processed = 0; @@ -2064,12 +2096,14 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) dma_rmb(); timestamp = (u64)aenq_common->timestamp_low | - ((u64)aenq_common->timestamp_high << 32); - pr_debug("AENQ! Group[%x] Syndrom[%x] timestamp: [%llus]\n", - aenq_common->group, aenq_common->syndrom, timestamp); + ((u64)aenq_common->timestamp_high << 32); + + netdev_dbg(ena_dev->net_device, + "AENQ! Group[%x] Syndrome[%x] timestamp: [%llus]\n", + aenq_common->group, aenq_common->syndrome, timestamp); /* Handle specific event*/ - handler_cb = ena_com_get_specific_aenq_cb(dev, + handler_cb = ena_com_get_specific_aenq_cb(ena_dev, aenq_common->group); handler_cb(data, aenq_e); /* call the actual event handler*/ @@ -2094,7 +2128,8 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) /* write the aenq doorbell after all AENQ descriptors were read */ mb(); - writel_relaxed((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); + writel_relaxed((u32)aenq->head, + ena_dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); #ifndef MMIOWB_NOT_DEFINED mmiowb(); #endif @@ -2111,19 +2146,20 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, if (unlikely((stat == ENA_MMIO_READ_TIMEOUT) || (cap == ENA_MMIO_READ_TIMEOUT))) { - pr_err("Reg read32 timeout occurred\n"); + netdev_err(ena_dev->net_device, "Reg read32 timeout occurred\n"); return -ETIME; } if ((stat & ENA_REGS_DEV_STS_READY_MASK) == 0) { - pr_err("Device isn't ready, can't reset device\n"); + netdev_err(ena_dev->net_device, + "Device isn't ready, can't reset device\n"); return -EINVAL; } timeout = (cap & ENA_REGS_CAPS_RESET_TIMEOUT_MASK) >> ENA_REGS_CAPS_RESET_TIMEOUT_SHIFT; if (timeout == 0) { - pr_err("Invalid timeout value\n"); + netdev_err(ena_dev->net_device, "Invalid timeout value\n"); return -EINVAL; } @@ -2139,7 +2175,8 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, rc = wait_for_reset_state(ena_dev, timeout, ENA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); if (rc != 0) { - pr_err("Reset indication didn't turn on\n"); + netdev_err(ena_dev->net_device, + "Reset indication didn't turn on\n"); return rc; } @@ -2147,7 +2184,8 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, writel(0, ena_dev->reg_bar + ENA_REGS_DEV_CTL_OFF); rc = wait_for_reset_state(ena_dev, timeout, 0); if (rc != 0) { - pr_err("Reset indication didn't turn off\n"); + netdev_err(ena_dev->net_device, + "Reset indication didn't turn off\n"); return rc; } @@ -2184,7 +2222,8 @@ static int ena_get_dev_stats(struct ena_com_dev *ena_dev, sizeof(*get_resp)); if (unlikely(ret)) - pr_err("Failed to get stats. error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to get stats. 
error: %d\n", ret); return ret; } @@ -2227,7 +2266,8 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) int ret; if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_MTU)) { - pr_debug("Feature %d isn't supported\n", ENA_ADMIN_MTU); + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_MTU); return -EOPNOTSUPP; } @@ -2237,7 +2277,7 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; cmd.aq_common_descriptor.flags = 0; cmd.feat_common.feature_id = ENA_ADMIN_MTU; - cmd.u.mtu.mtu = mtu; + cmd.u.mtu.mtu = (u32)mtu; ret = ena_com_execute_admin_command(admin_queue, (struct ena_admin_aq_entry *)&cmd, @@ -2246,7 +2286,8 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) sizeof(resp)); if (unlikely(ret)) - pr_err("Failed to set mtu %d. error: %d\n", mtu, ret); + netdev_err(ena_dev->net_device, + "Failed to set mtu %d. error: %d\n", mtu, ret); return ret; } @@ -2260,7 +2301,8 @@ int ena_com_get_offload_settings(struct ena_com_dev *ena_dev, ret = ena_com_get_feature(ena_dev, &resp, ENA_ADMIN_STATELESS_OFFLOAD_CONFIG, 0); if (unlikely(ret)) { - pr_err("Failed to get offload capabilities %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to get offload capabilities %d\n", ret); return ret; } @@ -2280,8 +2322,8 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_FUNCTION)) { - pr_debug("Feature %d isn't supported\n", - ENA_ADMIN_RSS_HASH_FUNCTION); + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_FUNCTION); return -EOPNOTSUPP; } @@ -2292,8 +2334,9 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) return ret; if (!(get_resp.u.flow_hash_func.supported_func & BIT(rss->hash_func))) { - pr_err("Func hash %d isn't supported by device, abort\n", - rss->hash_func); + netdev_err(ena_dev->net_device, + "Func hash %d isn't supported by device, abort\n", + rss->hash_func); return -EOPNOTSUPP; } @@ -2310,7 +2353,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) &cmd.control_buffer.address, rss->hash_key_dma_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } @@ -2322,8 +2365,9 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev) (struct ena_admin_acq_entry *)&resp, sizeof(resp)); if (unlikely(ret)) { - pr_err("Failed to set hash function %d. error: %d\n", - rss->hash_func, ret); + netdev_err(ena_dev->net_device, + "Failed to set hash function %d. 
error: %d\n", + rss->hash_func, ret); return -EINVAL; } @@ -2354,7 +2398,8 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, return rc; if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { - pr_err("Flow hash function %d isn't supported\n", func); + netdev_err(ena_dev->net_device, + "Flow hash function %d isn't supported\n", func); return -EOPNOTSUPP; } @@ -2362,20 +2407,22 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, case ENA_ADMIN_TOEPLITZ: if (key) { if (key_len != sizeof(hash_key->key)) { - pr_err("key len (%hu) doesn't equal the supported size (%zu)\n", - key_len, sizeof(hash_key->key)); + netdev_err(ena_dev->net_device, + "key len (%hu) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); return -EINVAL; } memcpy(hash_key->key, key, key_len); rss->hash_init_val = init_val; - hash_key->keys_num = key_len >> 2; + hash_key->key_parts = key_len / sizeof(hash_key->key[0]); } break; case ENA_ADMIN_CRC32: rss->hash_init_val = init_val; break; default: - pr_err("Invalid hash function (%d)\n", func); + netdev_err(ena_dev->net_device, "Invalid hash function (%d)\n", + func); return -EINVAL; } @@ -2397,21 +2444,22 @@ int ena_com_get_hash_function(struct ena_com_dev *ena_dev, struct ena_admin_get_feat_resp get_resp; int rc; - if (func) { - rc = ena_com_get_feature_ex(ena_dev, &get_resp, - ENA_ADMIN_RSS_HASH_FUNCTION, - rss->hash_key_dma_addr, - sizeof(*rss->hash_key), 0); - if (unlikely(rc)) - return rc; + if (unlikely(!func)) + return -EINVAL; - /* ffs returns 1 in case the lsb is set */ - rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); - if (rss->hash_func) - rss->hash_func--; + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + rss->hash_key_dma_addr, + sizeof(*rss->hash_key), 0); + if (unlikely(rc)) + return rc; - *func = rss->hash_func; - } + /* ffs() returns 1 in case the lsb is set */ + rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); + if (rss->hash_func) + rss->hash_func--; + + *func = rss->hash_func; return 0; } @@ -2422,7 +2470,8 @@ int ena_com_get_hash_key(struct ena_com_dev *ena_dev, u8 *key) ena_dev->rss.hash_key; if (key) - memcpy(key, hash_key->key, (size_t)(hash_key->keys_num) << 2); + memcpy(key, hash_key->key, + (size_t)(hash_key->key_parts) * sizeof(hash_key->key[0])); return 0; } @@ -2459,8 +2508,8 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) if (!ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_RSS_HASH_INPUT)) { - pr_debug("Feature %d isn't supported\n", - ENA_ADMIN_RSS_HASH_INPUT); + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_HASH_INPUT); return -EOPNOTSUPP; } @@ -2478,7 +2527,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) &cmd.control_buffer.address, rss->hash_ctrl_dma_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } cmd.control_buffer.length = sizeof(*hash_ctrl); @@ -2489,7 +2538,8 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev) (struct ena_admin_acq_entry *)&resp, sizeof(resp)); if (unlikely(ret)) - pr_err("Failed to set hash input. error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to set hash input. 
error: %d\n", ret); return ret; } @@ -2539,9 +2589,10 @@ int ena_com_set_default_hash_ctrl(struct ena_com_dev *ena_dev) available_fields = hash_ctrl->selected_fields[i].fields & hash_ctrl->supported_fields[i].fields; if (available_fields != hash_ctrl->selected_fields[i].fields) { - pr_err("hash control doesn't support all the desire configuration. proto %x supported %x selected %x\n", - i, hash_ctrl->supported_fields[i].fields, - hash_ctrl->selected_fields[i].fields); + netdev_err(ena_dev->net_device, + "Hash control doesn't support all the desire configuration. proto %x supported %x selected %x\n", + i, hash_ctrl->supported_fields[i].fields, + hash_ctrl->selected_fields[i].fields); return -EOPNOTSUPP; } } @@ -2565,7 +2616,8 @@ int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, int rc; if (proto >= ENA_ADMIN_RSS_PROTO_NUM) { - pr_err("Invalid proto num (%u)\n", proto); + netdev_err(ena_dev->net_device, "Invalid proto num (%u)\n", + proto); return -EINVAL; } @@ -2577,8 +2629,9 @@ int ena_com_fill_hash_ctrl(struct ena_com_dev *ena_dev, /* Make sure all the fields are supported */ supported_fields = hash_ctrl->supported_fields[proto].fields; if ((hash_fields & supported_fields) != hash_fields) { - pr_err("proto %d doesn't support the required fields %x. supports only: %x\n", - proto, hash_fields, supported_fields); + netdev_err(ena_dev->net_device, + "Proto %d doesn't support the required fields %x. supports only: %x\n", + proto, hash_fields, supported_fields); } hash_ctrl->selected_fields[proto].fields = hash_fields; @@ -2617,15 +2670,16 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) int ret; if (!ena_com_check_supported_feature_id( - ena_dev, ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG)) { - pr_debug("Feature %d isn't supported\n", - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG); + ena_dev, ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG)) { + netdev_dbg(ena_dev->net_device, "Feature %d isn't supported\n", + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG); return -EOPNOTSUPP; } ret = ena_com_ind_tbl_convert_to_device(ena_dev); if (ret) { - pr_err("Failed to convert host indirection table to device table\n"); + netdev_err(ena_dev->net_device, + "Failed to convert host indirection table to device table\n"); return ret; } @@ -2634,7 +2688,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; cmd.aq_common_descriptor.flags = ENA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK; - cmd.feat_common.feature_id = ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG; + cmd.feat_common.feature_id = ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG; cmd.u.ind_table.size = rss->tbl_log_size; cmd.u.ind_table.inline_index = 0xFFFFFFFF; @@ -2642,11 +2696,11 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) &cmd.control_buffer.address, rss->rss_ind_tbl_dma_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } - cmd.control_buffer.length = (1ULL << rss->tbl_log_size) * + cmd.control_buffer.length = (u32)(1ULL << rss->tbl_log_size) * sizeof(struct ena_admin_rss_ind_table_entry); ret = ena_com_execute_admin_command(admin_queue, @@ -2656,7 +2710,8 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) sizeof(resp)); if (unlikely(ret)) - pr_err("Failed to set indirect table. error: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to set indirect table. 
error: %d\n", ret); return ret; } @@ -2668,11 +2723,11 @@ int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) u32 tbl_size; int i, rc; - tbl_size = (1ULL << rss->tbl_log_size) * + tbl_size = (u32)(1ULL << rss->tbl_log_size) * sizeof(struct ena_admin_rss_ind_table_entry); rc = ena_com_get_feature_ex(ena_dev, &get_resp, - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG, + ENA_ADMIN_RSS_INDIRECTION_TABLE_CONFIG, rss->rss_ind_tbl_dma_addr, tbl_size, 0); if (unlikely(rc)) @@ -2812,7 +2867,7 @@ int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) &cmd.u.host_attr.debug_ba, host_attr->debug_area_dma_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } @@ -2820,7 +2875,7 @@ int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) &cmd.u.host_attr.os_info_ba, host_attr->host_info_dma_addr); if (unlikely(ret)) { - pr_err("memory address set failed\n"); + netdev_err(ena_dev->net_device, "Memory address set failed\n"); return ret; } @@ -2833,7 +2888,8 @@ int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) sizeof(resp)); if (unlikely(ret)) - pr_err("Failed to set host attributes: %d\n", ret); + netdev_err(ena_dev->net_device, + "Failed to set host attributes: %d\n", ret); return ret; } @@ -2845,12 +2901,14 @@ bool ena_com_interrupt_moderation_supported(struct ena_com_dev *ena_dev) ENA_ADMIN_INTERRUPT_MODERATION); } -static int ena_com_update_nonadaptive_moderation_interval(u32 coalesce_usecs, +static int ena_com_update_nonadaptive_moderation_interval(struct ena_com_dev *ena_dev, + u32 coalesce_usecs, u32 intr_delay_resolution, u32 *intr_moder_interval) { if (!intr_delay_resolution) { - pr_err("Illegal interrupt delay granularity value\n"); + netdev_err(ena_dev->net_device, + "Illegal interrupt delay granularity value\n"); return -EFAULT; } @@ -2859,11 +2917,11 @@ static int ena_com_update_nonadaptive_moderation_interval(u32 coalesce_usecs, return 0; } - int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_dev, u32 tx_coalesce_usecs) { - return ena_com_update_nonadaptive_moderation_interval(tx_coalesce_usecs, + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + tx_coalesce_usecs, ena_dev->intr_delay_resolution, &ena_dev->intr_moder_tx_interval); } @@ -2871,7 +2929,8 @@ int ena_com_update_nonadaptive_moderation_interval_tx(struct ena_com_dev *ena_de int ena_com_update_nonadaptive_moderation_interval_rx(struct ena_com_dev *ena_dev, u32 rx_coalesce_usecs) { - return ena_com_update_nonadaptive_moderation_interval(rx_coalesce_usecs, + return ena_com_update_nonadaptive_moderation_interval(ena_dev, + rx_coalesce_usecs, ena_dev->intr_delay_resolution, &ena_dev->intr_moder_rx_interval); } @@ -2887,12 +2946,14 @@ int ena_com_init_interrupt_moderation(struct ena_com_dev *ena_dev) if (rc) { if (rc == -EOPNOTSUPP) { - pr_debug("Feature %d isn't supported\n", - ENA_ADMIN_INTERRUPT_MODERATION); + netdev_dbg(ena_dev->net_device, + "Feature %d isn't supported\n", + ENA_ADMIN_INTERRUPT_MODERATION); rc = 0; } else { - pr_err("Failed to get interrupt moderation admin cmd. rc: %d\n", - rc); + netdev_err(ena_dev->net_device, + "Failed to get interrupt moderation admin cmd. 
rc: %d\n", + rc); } /* no moderation supported, disable adaptive support */ @@ -2940,7 +3001,8 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, (llq_info->descs_num_before_header * sizeof(struct ena_eth_io_tx_desc)); if (unlikely(ena_dev->tx_max_header_size == 0)) { - pr_err("the size of the LLQ entry is smaller than needed\n"); + netdev_err(ena_dev->net_device, + "The size of the LLQ entry is smaller than needed\n"); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 8ee8c4864f221..336c535f4ec94 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef ENA_COM @@ -235,11 +208,11 @@ struct ena_com_admin_sq { }; struct ena_com_stats_admin { - u32 aborted_cmd; - u32 submitted_cmd; - u32 completed_cmd; - u32 out_of_space; - u32 no_completion; + u64 aborted_cmd; + u64 submitted_cmd; + u64 completed_cmd; + u64 out_of_space; + u64 no_completion; }; struct ena_com_admin_queue { @@ -334,6 +307,7 @@ struct ena_com_dev { void __iomem *mem_bar; void *dmadev; void *bus; + struct net_device *net_device; enum ena_admin_placement_policy_type tx_mem_queue_type; u32 tx_max_header_size; @@ -371,7 +345,6 @@ struct ena_com_dev_get_features_ctx { struct ena_admin_feature_offload_desc offload; struct ena_admin_ena_hw_hints hw_hints; struct ena_admin_feature_llq_desc llq; - struct ena_admin_feature_rss_ind_table ind_table; }; struct ena_com_create_io_ctx { @@ -553,7 +526,7 @@ void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); * This method goes over the async event notification queue and calls the proper * aenq handler. */ -void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data); +void ena_com_aenq_intr_handler(struct ena_com_dev *ena_dev, void *data); /* ena_com_abort_admin_commands - Abort all the outstanding admin commands. 
* @ena_dev: ENA communication layer struct @@ -723,7 +696,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev); * * Retrieve the hash function from the device. * - * @note: If the caller called ena_com_fill_hash_function but didn't flash + * @note: If the caller called ena_com_fill_hash_function but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -737,7 +710,7 @@ int ena_com_get_hash_function(struct ena_com_dev *ena_dev, * * Retrieve the hash key. * - * @note: If the caller called ena_com_fill_hash_key but didn't flash + * @note: If the caller called ena_com_fill_hash_key but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -777,7 +750,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev); * * Retrieve the hash control from the device. * - * @note: If the caller called ena_com_fill_hash_ctrl but didn't flash + * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -975,6 +948,26 @@ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, struct ena_admin_feature_llq_desc *llq_features, struct ena_llq_configurations *llq_default_config); +/* ena_com_io_sq_to_ena_dev - Extract ena_com_dev using contained field io_sq. + * @io_sq: IO submit queue struct + * + * @return - ena_com_dev struct extracted from io_sq + */ +static inline struct ena_com_dev *ena_com_io_sq_to_ena_dev(struct ena_com_io_sq *io_sq) +{ + return container_of(io_sq, struct ena_com_dev, io_sq_queues[io_sq->qid]); +} + +/* ena_com_io_cq_to_ena_dev - Extract ena_com_dev using contained field io_cq. + * @io_sq: IO submit queue struct + * + * @return - ena_com_dev struct extracted from io_sq + */ +static inline struct ena_com_dev *ena_com_io_cq_to_ena_dev(struct ena_com_io_cq *io_cq) +{ + return container_of(io_cq, struct ena_com_dev, io_cq_queues[io_cq->qid]); +} + static inline bool ena_com_get_adaptive_moderation_enabled(struct ena_com_dev *ena_dev) { return ena_dev->adaptive_coalescing; diff --git a/drivers/amazon/net/ena/ena_common_defs.h b/drivers/amazon/net/ena/ena_common_defs.h index 77ab0c1f8c73f..e210c8a81fc0e 100755 --- a/drivers/amazon/net/ena/ena_common_defs.h +++ b/drivers/amazon/net/ena/ena_common_defs.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
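
The two static inline helpers introduced above (ena_com_io_sq_to_ena_dev / ena_com_io_cq_to_ena_dev) are what let the com-layer code log through netdev_err()/netdev_dbg(): because every ena_com_io_sq and ena_com_io_cq is embedded by value in the io_sq_queues[]/io_cq_queues[] arrays of struct ena_com_dev, container_of() can walk back from a queue pointer to the owning device and its new net_device field. A minimal standalone sketch of the same idiom, using made-up structure names rather than the real ENA definitions:

    #include <stddef.h>
    #include <stdio.h>

    struct io_queue {
            int qid;
    };

    struct device_ctx {
            void *net_device;              /* what netdev_* logging needs */
            struct io_queue queues[8];     /* queues embedded by value    */
    };

    /* Recover the enclosing device_ctx from a pointer to one of its
     * embedded queues, mirroring container_of(io_sq, struct ena_com_dev,
     * io_sq_queues[io_sq->qid]) in the patch above.
     */
    static struct device_ctx *queue_to_dev(struct io_queue *q)
    {
            struct io_queue *first = q - q->qid;    /* back to queues[0] */

            return (struct device_ctx *)((char *)first -
                                         offsetof(struct device_ctx, queues));
    }

    int main(void)
    {
            struct device_ctx dev = { .net_device = 0 };
            int i;

            for (i = 0; i < 8; i++)
                    dev.queues[i].qid = i;

            printf("%d\n", queue_to_dev(&dev.queues[3]) == &dev);  /* prints 1 */
            return 0;
    }

The pattern only works because the queues are embedded by value rather than allocated separately, which is exactly what the container_of() in the new helpers assumes.
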
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _ENA_COMMON_H_ #define _ENA_COMMON_H_ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index 51c0df9a857e5..c3be751e7379f 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -1,33 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "ena_eth_com.h" @@ -85,13 +58,15 @@ static int ena_com_write_bounce_buffer_to_dev(struct ena_com_io_sq *io_sq, if (is_llq_max_tx_burst_exists(io_sq)) { if (unlikely(!io_sq->entries_in_tx_burst_left)) { - pr_err("Error: trying to send more packets than tx burst allows\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Error: trying to send more packets than tx burst allows\n"); return -ENOSPC; } io_sq->entries_in_tx_burst_left--; - pr_debug("decreasing entries_in_tx_burst_left of queue %d to %d\n", - io_sq->qid, io_sq->entries_in_tx_burst_left); + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Decreasing entries_in_tx_burst_left of queue %d to %d\n", + io_sq->qid, io_sq->entries_in_tx_burst_left); } /* Make sure everything was written into the bounce buffer before @@ -129,12 +104,14 @@ static int ena_com_write_header_to_bounce(struct ena_com_io_sq *io_sq, if (unlikely((header_offset + header_len) > llq_info->desc_list_entry_size)) { - pr_err("trying to write header larger than llq entry can accommodate\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Trying to write header larger than llq entry can accommodate\n"); return -EFAULT; } if (unlikely(!bounce_buffer)) { - pr_err("bounce buffer is NULL\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Bounce buffer is NULL\n"); return -EFAULT; } @@ -152,7 +129,8 @@ static void *get_sq_desc_llq(struct ena_com_io_sq *io_sq) bounce_buffer = pkt_ctrl->curr_bounce_buf; if (unlikely(!bounce_buffer)) { - pr_err("bounce buffer is NULL\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Bounce buffer is NULL\n"); return NULL; } @@ -176,10 +154,8 @@ static int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) if (pkt_ctrl->idx) { rc = ena_com_write_bounce_buffer_to_dev(io_sq, pkt_ctrl->curr_bounce_buf); - if (unlikely(rc)) { - pr_err("failed to write bounce buffer to device\n"); + if (unlikely(rc)) return rc; - } pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); @@ -209,10 +185,8 @@ static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) if (!pkt_ctrl->descs_left_in_line) { rc = ena_com_write_bounce_buffer_to_dev(io_sq, pkt_ctrl->curr_bounce_buf); - if (unlikely(rc)) { - pr_err("failed to write bounce buffer to device\n"); + if (unlikely(rc)) return rc; - } pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); @@ -281,8 +255,9 @@ static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, io_cq->cur_rx_pkt_cdesc_count = 0; io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; - pr_debug("ena q_id: %d packets were completed. first desc idx %u descs# %d\n", - io_cq->qid, *first_cdesc_idx, count); + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "ENA q_id: %d packets were completed. 
first desc idx %u descs# %d\n", + io_cq->qid, *first_cdesc_idx, count); } else { io_cq->cur_rx_pkt_cdesc_count += count; count = 0; @@ -297,6 +272,9 @@ static int ena_com_create_meta(struct ena_com_io_sq *io_sq, struct ena_eth_io_tx_meta_desc *meta_desc = NULL; meta_desc = get_sq_desc(io_sq); + if (unlikely(!meta_desc)) + return -EFAULT; + memset(meta_desc, 0x0, sizeof(struct ena_eth_io_tx_meta_desc)); meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_META_DESC_MASK; @@ -304,7 +282,7 @@ static int ena_com_create_meta(struct ena_com_io_sq *io_sq, meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_EXT_VALID_MASK; /* bits 0-9 of the mss */ - meta_desc->word2 |= (ena_meta->mss << + meta_desc->word2 |= ((u32)ena_meta->mss << ENA_ETH_IO_TX_META_DESC_MSS_LO_SHIFT) & ENA_ETH_IO_TX_META_DESC_MSS_LO_MASK; /* bits 10-13 of the mss */ @@ -314,7 +292,7 @@ static int ena_com_create_meta(struct ena_com_io_sq *io_sq, /* Extended meta desc */ meta_desc->len_ctrl |= ENA_ETH_IO_TX_META_DESC_ETH_META_TYPE_MASK; - meta_desc->len_ctrl |= (io_sq->phase << + meta_desc->len_ctrl |= ((u32)io_sq->phase << ENA_ETH_IO_TX_META_DESC_PHASE_SHIFT) & ENA_ETH_IO_TX_META_DESC_PHASE_MASK; @@ -327,7 +305,7 @@ static int ena_com_create_meta(struct ena_com_io_sq *io_sq, ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_SHIFT) & ENA_ETH_IO_TX_META_DESC_L3_HDR_OFF_MASK; - meta_desc->word2 |= (ena_meta->l4_hdr_len << + meta_desc->word2 |= ((u32)ena_meta->l4_hdr_len << ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_SHIFT) & ENA_ETH_IO_TX_META_DESC_L4_HDR_LEN_IN_WORDS_MASK; @@ -349,20 +327,23 @@ static int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, *have_meta = true; return ena_com_create_meta(io_sq, ena_meta); - } else if (ena_com_meta_desc_changed(io_sq, ena_tx_ctx)) { + } + + if (ena_com_meta_desc_changed(io_sq, ena_tx_ctx)) { *have_meta = true; /* Cache the meta desc */ memcpy(&io_sq->cached_tx_meta, ena_meta, sizeof(struct ena_com_tx_meta)); return ena_com_create_meta(io_sq, ena_meta); - } else { - *have_meta = false; - return 0; } + + *have_meta = false; + return 0; } -static void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, - struct ena_eth_io_rx_cdesc_base *cdesc) +static void ena_com_rx_set_flags(struct ena_com_io_cq *io_cq, + struct ena_com_rx_ctx *ena_rx_ctx, + struct ena_eth_io_rx_cdesc_base *cdesc) { ena_rx_ctx->l3_proto = cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_L3_PROTO_IDX_MASK; @@ -383,10 +364,11 @@ static void ena_com_rx_set_flags(struct ena_com_rx_ctx *ena_rx_ctx, (cdesc->status & ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_MASK) >> ENA_ETH_IO_RX_CDESC_BASE_IPV4_FRAG_SHIFT; - pr_debug("ena_rx_ctx->l3_proto %d ena_rx_ctx->l4_proto %d\nena_rx_ctx->l3_csum_err %d ena_rx_ctx->l4_csum_err %d\nhash frag %d frag: %d cdesc_status: %x\n", - ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, - ena_rx_ctx->l3_csum_err, ena_rx_ctx->l4_csum_err, - ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "l3_proto %d l4_proto %d l3_csum_err %d l4_csum_err %d hash %d frag %d cdesc_status %x\n", + ena_rx_ctx->l3_proto, ena_rx_ctx->l4_proto, + ena_rx_ctx->l3_csum_err, ena_rx_ctx->l4_csum_err, + ena_rx_ctx->hash, ena_rx_ctx->frag, cdesc->status); } /*****************************************************************************/ @@ -411,21 +393,21 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* num_bufs +1 for potential meta desc */ if (unlikely(!ena_com_sq_have_enough_space(io_sq, num_bufs + 1))) { - pr_debug("Not enough space in the tx queue\n"); + 
netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Not enough space in the tx queue\n"); return -ENOMEM; } if (unlikely(header_len > io_sq->tx_max_header_size)) { - pr_err("header size is too large %d max header: %d\n", - header_len, io_sq->tx_max_header_size); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Header size is too large %d max header: %d\n", + header_len, io_sq->tx_max_header_size); return -EINVAL; } if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && - !buffer_to_push)) { - pr_err("push header wasn't provided on LLQ mode\n"); + !buffer_to_push)) return -EINVAL; - } rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); if (unlikely(rc)) @@ -433,15 +415,14 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, rc = ena_com_create_and_store_tx_meta_desc(io_sq, ena_tx_ctx, &have_meta); if (unlikely(rc)) { - pr_err("failed to create and store tx meta desc\n"); + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to create and store tx meta desc\n"); return rc; } /* If the caller doesn't want to send packets */ if (unlikely(!num_bufs && !header_len)) { rc = ena_com_close_bounce_buffer(io_sq); - if (rc) - pr_err("failed to write buffers to LLQ\n"); *nb_hw_desc = io_sq->tail - start_tail; return rc; } @@ -455,16 +436,16 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, if (!have_meta) desc->len_ctrl |= ENA_ETH_IO_TX_DESC_FIRST_MASK; - desc->buff_addr_hi_hdr_sz |= (header_len << + desc->buff_addr_hi_hdr_sz |= ((u32)header_len << ENA_ETH_IO_TX_DESC_HEADER_LENGTH_SHIFT) & ENA_ETH_IO_TX_DESC_HEADER_LENGTH_MASK; - desc->len_ctrl |= (io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & + desc->len_ctrl |= ((u32)io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & ENA_ETH_IO_TX_DESC_PHASE_MASK; desc->len_ctrl |= ENA_ETH_IO_TX_DESC_COMP_REQ_MASK; /* Bits 0-9 */ - desc->meta_ctrl |= (ena_tx_ctx->req_id << + desc->meta_ctrl |= ((u32)ena_tx_ctx->req_id << ENA_ETH_IO_TX_DESC_REQ_ID_LO_SHIFT) & ENA_ETH_IO_TX_DESC_REQ_ID_LO_MASK; @@ -501,10 +482,8 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* The first desc share the same desc as the header */ if (likely(i != 0)) { rc = ena_com_sq_update_tail(io_sq); - if (unlikely(rc)) { - pr_err("failed to update sq tail\n"); + if (unlikely(rc)) return rc; - } desc = get_sq_desc(io_sq); if (unlikely(!desc)) @@ -512,7 +491,7 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, memset(desc, 0x0, sizeof(struct ena_eth_io_tx_desc)); - desc->len_ctrl |= (io_sq->phase << + desc->len_ctrl |= ((u32)io_sq->phase << ENA_ETH_IO_TX_DESC_PHASE_SHIFT) & ENA_ETH_IO_TX_DESC_PHASE_MASK; } @@ -533,14 +512,10 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; rc = ena_com_sq_update_tail(io_sq); - if (unlikely(rc)) { - pr_err("failed to update sq tail of the last descriptor\n"); + if (unlikely(rc)) return rc; - } rc = ena_com_close_bounce_buffer(io_sq); - if (rc) - pr_err("failed when closing bounce buffer\n"); *nb_hw_desc = io_sq->tail - start_tail; return rc; @@ -552,6 +527,7 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, { struct ena_com_rx_buf_info *ena_buf = &ena_rx_ctx->ena_bufs[0]; struct ena_eth_io_rx_cdesc_base *cdesc = NULL; + u16 q_depth = io_cq->q_depth; u16 cdesc_idx = 0; u16 nb_hw_desc; u16 i = 0; @@ -564,12 +540,14 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, return 0; } - pr_debug("fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, - nb_hw_desc); + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + 
"Fetch rx packet: queue %d completed desc: %d\n", io_cq->qid, + nb_hw_desc); if (unlikely(nb_hw_desc > ena_rx_ctx->max_bufs)) { - pr_err("Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, - ena_rx_ctx->max_bufs); + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Too many RX cdescs (%d) > MAX(%d)\n", nb_hw_desc, + ena_rx_ctx->max_bufs); return -ENOSPC; } @@ -577,21 +555,30 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, ena_rx_ctx->pkt_offset = cdesc->offset; do { - ena_buf->len = cdesc->length; - ena_buf->req_id = cdesc->req_id; - ena_buf++; - } while ((++i < nb_hw_desc) && (cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i))); + ena_buf[i].len = cdesc->length; + ena_buf[i].req_id = cdesc->req_id; + if (unlikely(ena_buf[i].req_id >= q_depth)) + return -EIO; + + if (++i >= nb_hw_desc) + break; + + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); + + } while (1); /* Update SQ head ptr */ io_sq->next_to_comp += nb_hw_desc; - pr_debug("[%s][QID#%d] Updating SQ head to: %d\n", __func__, io_sq->qid, - io_sq->next_to_comp); + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "[%s][QID#%d] Updating SQ head to: %d\n", __func__, + io_sq->qid, io_sq->next_to_comp); /* Get rx flags from the last pkt */ - ena_com_rx_set_flags(ena_rx_ctx, cdesc); + ena_com_rx_set_flags(io_cq, ena_rx_ctx, cdesc); ena_rx_ctx->descs = nb_hw_desc; + return 0; } @@ -615,12 +602,16 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, desc->length = ena_buf->len; desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK | - ENA_ETH_IO_RX_DESC_LAST_MASK | - (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK) | - ENA_ETH_IO_RX_DESC_COMP_REQ_MASK; + ENA_ETH_IO_RX_DESC_LAST_MASK | + ENA_ETH_IO_RX_DESC_COMP_REQ_MASK | + (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK); desc->req_id = req_id; + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "[%s] Adding single RX desc, Queue: %u, req_id: %u\n", + __func__, io_sq->qid, req_id); + desc->buff_addr_lo = (u32)ena_buf->paddr; desc->buff_addr_hi = ((ena_buf->paddr & GENMASK_ULL(io_sq->dma_addr_bits - 1, 32)) >> 32); diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index b6592cb93b045..689313ee25a80 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef ENA_ETH_COM_H_ @@ -167,8 +140,9 @@ static inline bool ena_com_is_doorbell_needed(struct ena_com_io_sq *io_sq, llq_info->descs_per_entry); } - pr_debug("queue: %d num_descs: %d num_entries_needed: %d\n", io_sq->qid, - num_descs, num_entries_needed); + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Queue: %d num_descs: %d num_entries_needed: %d\n", + io_sq->qid, num_descs, num_entries_needed); return num_entries_needed > io_sq->entries_in_tx_burst_left; } @@ -178,14 +152,16 @@ static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) u16 max_entries_in_tx_burst = io_sq->llq_info.max_entries_in_tx_burst; u16 tail = io_sq->tail; - pr_debug("write submission queue doorbell for queue: %d tail: %d\n", - io_sq->qid, tail); + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Write submission queue doorbell for queue: %d tail: %d\n", + io_sq->qid, tail); writel(tail, io_sq->db_addr); if (is_llq_max_tx_burst_exists(io_sq)) { - pr_debug("reset available entries in tx burst for queue %d to %d\n", - io_sq->qid, max_entries_in_tx_burst); + netdev_dbg(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Reset available entries in tx burst for queue %d to %d\n", + io_sq->qid, max_entries_in_tx_burst); io_sq->entries_in_tx_burst_left = max_entries_in_tx_burst; } @@ -203,8 +179,9 @@ static inline int ena_com_update_dev_comp_head(struct ena_com_io_cq *io_cq) need_update = unreported_comp > (io_cq->q_depth / ENA_COMP_HEAD_THRESH); if (unlikely(need_update)) { - pr_debug("Write completion queue doorbell for queue %d: head: %d\n", - io_cq->qid, head); + netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Write completion queue doorbell for queue %d: head: %d\n", + io_cq->qid, head); writel(head, io_cq->cq_head_db_reg); io_cq->last_head_update = head; } @@ -267,7 +244,8 @@ static inline int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, *req_id = READ_ONCE(cdesc->req_id); if (unlikely(*req_id >= io_cq->q_depth)) { - pr_err("Invalid req id %d\n", cdesc->req_id); + netdev_err(ena_com_io_cq_to_ena_dev(io_cq)->net_device, + "Invalid req id %d\n", cdesc->req_id); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h index 4dd382e15ed34..332ac0d28ac7a 100755 --- a/drivers/amazon/net/ena/ena_eth_io_defs.h +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
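
ena_com_update_dev_comp_head() above batches completion-head doorbells: the head register is written only once the number of completions not yet reported to the device exceeds a fraction of the queue depth (q_depth / ENA_COMP_HEAD_THRESH), rather than after every packet. A rough sketch of that batching, with illustrative names, a simplified unreported-completions calculation, and an assumed threshold divisor of 4:

    struct cq_state {
            unsigned short head;             /* completions consumed so far */
            unsigned short last_head_update; /* head value last written     */
            unsigned short q_depth;
            volatile unsigned int *head_db;  /* completion-head doorbell    */
    };

    #define COMP_HEAD_THRESH 4               /* assumed divisor, for illustration */

    static void maybe_update_comp_head(struct cq_state *cq)
    {
            unsigned short unreported =
                    (unsigned short)(cq->head - cq->last_head_update);

            /* Only ring the doorbell once enough completions accumulated */
            if (unreported > cq->q_depth / COMP_HEAD_THRESH) {
                    *cq->head_db = cq->head;     /* writel() in the real driver */
                    cq->last_head_update = cq->head;
            }
    }
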
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _ENA_ETH_IO_H_ #define _ENA_ETH_IO_H_ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index dfce4a2fe73fb..82334c247016c 100755 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -1,33 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include @@ -41,17 +14,17 @@ struct ena_stats { #define ENA_STAT_ENA_COM_ENTRY(stat) { \ .name = #stat, \ - .stat_offset = offsetof(struct ena_com_stats_admin, stat) \ + .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \ } #define ENA_STAT_ENTRY(stat, stat_type) { \ .name = #stat, \ - .stat_offset = offsetof(struct ena_stats_##stat_type, stat) \ + .stat_offset = offsetof(struct ena_stats_##stat_type, stat) / sizeof(u64) \ } #define ENA_STAT_HW_ENTRY(stat, stat_type) { \ .name = #stat, \ - .stat_offset = offsetof(struct ena_admin_##stat_type, stat) \ + .stat_offset = offsetof(struct ena_admin_##stat_type, stat) / sizeof(u64) \ } #define ENA_STAT_RX_ENTRY(stat) \ @@ -121,6 +94,17 @@ static const struct ena_stats ena_stats_rx_strings[] = { ENA_STAT_RX_ENTRY(bad_req_id), ENA_STAT_RX_ENTRY(empty_rx_ring), ENA_STAT_RX_ENTRY(csum_unchecked), +#ifdef ENA_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xdp_aborted), + ENA_STAT_RX_ENTRY(xdp_drop), + ENA_STAT_RX_ENTRY(xdp_pass), + ENA_STAT_RX_ENTRY(xdp_tx), + ENA_STAT_RX_ENTRY(xdp_invalid), + ENA_STAT_RX_ENTRY(xdp_redirect), +#endif + ENA_STAT_RX_ENTRY(lpc_warm_up), + ENA_STAT_RX_ENTRY(lpc_full), + ENA_STAT_RX_ENTRY(lpc_wrong_numa), }; static const struct ena_stats ena_stats_ena_com_strings[] = { @@ -136,7 +120,7 @@ static const struct ena_stats ena_stats_ena_com_strings[] = { #define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) #define ENA_STATS_ARRAY_ENA_COM ARRAY_SIZE(ena_stats_ena_com_strings) #define ENA_STATS_ARRAY_ENI(adapter) \ - (ARRAY_SIZE(ena_stats_eni_strings) * adapter->eni_stats_supported) + (ARRAY_SIZE(ena_stats_eni_strings) * (adapter)->eni_stats_supported) static void ena_safe_update_stat(u64 *src, u64 *dst, struct u64_stats_sync *syncp) @@ -157,29 +141,30 @@ static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) u64 *ptr; int i, j; - for (i = 0; i < adapter->num_io_queues; i++) { + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { /* Tx stats */ ring = &adapter->tx_ring[i]; for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { ena_stats = &ena_stats_tx_strings[j]; - ptr = (u64 *)((uintptr_t)&ring->tx_stats + - (uintptr_t)ena_stats->stat_offset); + ptr = (u64 *)&ring->tx_stats + ena_stats->stat_offset; ena_safe_update_stat(ptr, (*data)++, &ring->syncp); } + /* XDP TX queues don't have a RX queue counterpart */ + if (!ENA_IS_XDP_INDEX(adapter, i)) { + /* Rx stats */ + ring = &adapter->rx_ring[i]; - /* Rx stats */ - ring = &adapter->rx_ring[i]; - - for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { - ena_stats = &ena_stats_rx_strings[j]; + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; - ptr = (u64 *)((uintptr_t)&ring->rx_stats + - (uintptr_t)ena_stats->stat_offset); + ptr = (u64 *)&ring->rx_stats + + ena_stats->stat_offset; - ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + ena_safe_update_stat(ptr, (*data)++, &ring->syncp); + } } } } @@ -187,14 +172,14 @@ static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) static void ena_dev_admin_queue_stats(struct ena_adapter *adapter, u64 **data) { const struct ena_stats *ena_stats; - u32 *ptr; + u64 *ptr; int i; for (i = 0; i < ENA_STATS_ARRAY_ENA_COM; i++) { ena_stats = &ena_stats_ena_com_strings[i]; - ptr = (u32 *)((uintptr_t)&adapter->ena_dev->admin_queue.stats + - (uintptr_t)ena_stats->stat_offset); + ptr = (u64 *)&adapter->ena_dev->admin_queue.stats + + ena_stats->stat_offset; *(*data)++ = *ptr; } @@ -211,8 +196,7 @@ static void ena_get_stats(struct ena_adapter *adapter, for (i = 0; i < 
ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; - ptr = (u64 *)((uintptr_t)&adapter->dev_stats + - (uintptr_t)ena_stats->stat_offset); + ptr = (u64 *)&adapter->dev_stats + ena_stats->stat_offset; ena_safe_update_stat(ptr, data++, &adapter->syncp); } @@ -222,8 +206,8 @@ static void ena_get_stats(struct ena_adapter *adapter, for (i = 0; i < ENA_STATS_ARRAY_ENI(adapter); i++) { ena_stats = &ena_stats_eni_strings[i]; - ptr = (u64 *)((uintptr_t)&adapter->eni_stats + - (uintptr_t)ena_stats->stat_offset); + ptr = (u64 *)&adapter->eni_stats + + ena_stats->stat_offset; ena_safe_update_stat(ptr, data++, &adapter->syncp); } @@ -245,6 +229,7 @@ static void ena_get_ethtool_stats(struct net_device *netdev, static int ena_get_sw_stats_count(struct ena_adapter *adapter) { return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; } @@ -266,24 +251,32 @@ int ena_get_sset_count(struct net_device *netdev, int sset) static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) { const struct ena_stats *ena_stats; + bool is_xdp; int i, j; - for (i = 0; i < adapter->num_io_queues; i++) { + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { + is_xdp = ENA_IS_XDP_INDEX(adapter, i); /* Tx stats */ for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { ena_stats = &ena_stats_tx_strings[j]; snprintf(*data, ETH_GSTRING_LEN, - "queue_%u_tx_%s", i, ena_stats->name); - (*data) += ETH_GSTRING_LEN; + "queue_%u_%s_%s", i, + is_xdp ? "xdp_tx" : "tx", ena_stats->name); + (*data) += ETH_GSTRING_LEN; } - /* Rx stats */ - for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { - ena_stats = &ena_stats_rx_strings[j]; - snprintf(*data, ETH_GSTRING_LEN, - "queue_%u_rx_%s", i, ena_stats->name); - (*data) += ETH_GSTRING_LEN; + if (!is_xdp) { + /* RX stats, in XDP there isn't a RX queue + * counterpart + */ + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + snprintf(*data, ETH_GSTRING_LEN, + "queue_%u_rx_%s", i, ena_stats->name); + (*data) += ETH_GSTRING_LEN; + } } } } @@ -965,15 +958,15 @@ static int ena_set_channels(struct net_device *netdev, #ifdef ENA_XDP_SUPPORT if (count < ENA_MIN_NUM_IO_QUEUES || (ena_xdp_present(adapter) && - !ena_xdp_legal_queue_count(adapter, channels->combined_count))) + !ena_xdp_legal_queue_count(adapter, count))) #else if (count < ENA_MIN_NUM_IO_QUEUES) #endif /* ENA_XDP_SUPPORT */ return -EINVAL; + if (count > adapter->max_num_io_queues) return -EINVAL; - return ena_update_queue_count(adapter, count); } #endif /* ETHTOOL_SCHANNELS */ @@ -1099,7 +1092,7 @@ static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) GFP_ATOMIC); if (!strings_buf) { netif_err(adapter, drv, netdev, - "failed to alloc strings_buf\n"); + "Failed to allocate strings_buf\n"); return; } @@ -1108,7 +1101,7 @@ static void ena_dump_stats_ex(struct ena_adapter *adapter, u8 *buf) GFP_ATOMIC); if (!data_buf) { netif_err(adapter, drv, netdev, - "failed to allocate data buf\n"); + "Failed to allocate data buf\n"); devm_kfree(&adapter->pdev->dev, strings_buf); return; } diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 1d16d33ead6d9..9fe9515be80c3 100755 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -1,33 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. 
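
The ethtool changes above switch the ENA_STAT_*_ENTRY macros from raw byte offsets to u64 element indices (offsetof(...) / sizeof(u64)) and replace the uintptr_t arithmetic in the readers with plain u64 pointer arithmetic, which lines up with the admin-queue counters being widened to u64. A small illustration of that indexing scheme, using a made-up stats struct:

    #include <stddef.h>
    #include <stdint.h>

    struct demo_stats {
            uint64_t tx_packets;
            uint64_t tx_bytes;
            uint64_t tx_drops;
    };

    /* offsetof() gives a byte offset; dividing by sizeof(u64) turns it
     * into an element index into a struct of u64-only counters.
     */
    #define DEMO_STAT_INDEX(field) \
            (offsetof(struct demo_stats, field) / sizeof(uint64_t))

    static uint64_t read_stat(const struct demo_stats *stats, size_t index)
    {
            /* u64 pointer arithmetic replaces the old uintptr_t byte math */
            return *((const uint64_t *)stats + index);
    }

    /* read_stat(&s, DEMO_STAT_INDEX(tx_bytes)) == s.tx_bytes */
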
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -66,6 +39,8 @@ MODULE_VERSION(DRV_MODULE_GENERATION); /* Time in jiffies before concluding the transmitter is hung. */ #define TX_TIMEOUT (5 * HZ) +#define ENA_MAX_RINGS min_t(unsigned int, ENA_MAX_NUM_IO_QUEUES, num_possible_cpus()) + #define ENA_NAPI_BUDGET 64 #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ @@ -86,6 +61,10 @@ static int num_io_queues = ENA_MAX_NUM_IO_QUEUES; module_param(num_io_queues, int, 0444); MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device. The maximum value depends on the device and number of online CPUs.\n"); +static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; +module_param(lpc_size, uint, 0); +MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. 
Max: 32\n"); + static struct ena_aenq_handlers aenq_handlers; static struct workqueue_struct *ena_wq; @@ -96,6 +75,7 @@ static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); static void ena_destroy_device(struct ena_adapter *adapter, bool graceful); static int ena_restore_device(struct ena_adapter *adapter); +static int ena_create_page_caches(struct ena_adapter *adapter); #ifdef ENA_XDP_SUPPORT static void ena_init_io_rings(struct ena_adapter *adapter, @@ -129,6 +109,15 @@ static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, int first_index, int count); #endif /* ENA_XDP_SUPPORT */ +/* Increase a stat by cnt while holding syncp seqlock */ +static void ena_increase_stat_atomic(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + #ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) #else @@ -145,9 +134,8 @@ static void ena_tx_timeout(struct net_device *dev) return; adapter->reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; - u64_stats_update_begin(&adapter->syncp); - adapter->dev_stats.tx_timeout++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.tx_timeout, 1, + &adapter->syncp); netif_err(adapter, tx_err, dev, "Transmit time out\n"); } @@ -175,7 +163,7 @@ static int ena_change_mtu(struct net_device *dev, int new_mtu) #endif ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); if (!ret) { - netif_dbg(adapter, drv, dev, "set MTU to %d\n", new_mtu); + netif_dbg(adapter, drv, dev, "Set MTU to %d\n", new_mtu); update_rx_ring_mtu(adapter, new_mtu); dev->mtu = new_mtu; } else { @@ -214,10 +202,9 @@ static int ena_xmit_common(struct net_device *dev, */ if (unlikely(rc)) { netif_err(adapter, tx_queued, dev, - "failed to prepare tx bufs\n"); - u64_stats_update_begin(&ring->syncp); - ring->tx_stats.prepare_ctx_err++; - u64_stats_update_end(&ring->syncp); + "Failed to prepare tx bufs\n"); + ena_increase_stat_atomic(&ring->tx_stats.prepare_ctx_err, 1, + &ring->syncp); if (rc != -ENOMEM) { adapter->reset_reason = ENA_REGS_RESET_DRIVER_INVALID_STATE; @@ -289,18 +276,18 @@ static int ena_xdp_io_poll(struct napi_struct *napi, int budget) return ret; } -static int ena_xdp_tx_map_buff(struct ena_ring *xdp_ring, - struct ena_tx_buffer *tx_info, - struct xdp_buff *xdp, - void **push_hdr, - u32 *push_len) +static int ena_xdp_tx_map_frame(struct ena_ring *xdp_ring, + struct ena_tx_buffer *tx_info, + struct xdp_frame *xdpf, + void **push_hdr, + u32 *push_len) { struct ena_adapter *adapter = xdp_ring->adapter; struct ena_com_buf *ena_buf; dma_addr_t dma = 0; u32 size; - tx_info->xdpf = convert_to_xdp_frame(xdp); + tx_info->xdpf = xdpf; size = tx_info->xdpf->len; ena_buf = tx_info->bufs; @@ -326,10 +313,9 @@ static int ena_xdp_tx_map_buff(struct ena_ring *xdp_ring, return 0; error_report_dma_error: - u64_stats_update_begin(&xdp_ring->syncp); - xdp_ring->tx_stats.dma_mapping_err++; - u64_stats_update_end(&xdp_ring->syncp); - netdev_warn(adapter->netdev, "failed to map xdp buff\n"); + ena_increase_stat_atomic(&xdp_ring->tx_stats.dma_mapping_err, 1, + &xdp_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); xdp_return_frame_rx_napi(tx_info->xdpf); tx_info->xdpf = NULL; @@ -338,29 +324,24 @@ static int ena_xdp_tx_map_buff(struct ena_ring *xdp_ring, return -EINVAL; } -static int 
ena_xdp_xmit_buff(struct net_device *dev, - struct xdp_buff *xdp, - int qid, - struct ena_rx_buffer *rx_info) +static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, + struct net_device *dev, + struct xdp_frame *xdpf, + int flags) { - struct ena_adapter *adapter = netdev_priv(dev); - struct ena_com_tx_ctx ena_tx_ctx = {0}; + struct ena_com_tx_ctx ena_tx_ctx = {}; struct ena_tx_buffer *tx_info; - struct ena_ring *xdp_ring; u16 next_to_use, req_id; - int rc; void *push_hdr; u32 push_len; + int rc; - xdp_ring = &adapter->tx_ring[qid]; next_to_use = xdp_ring->next_to_use; req_id = xdp_ring->free_ids[next_to_use]; tx_info = &xdp_ring->tx_buffer_info[req_id]; tx_info->num_of_bufs = 0; - page_ref_inc(rx_info->page); - tx_info->xdp_rx_page = rx_info->page; - rc = ena_xdp_tx_map_buff(xdp_ring, tx_info, xdp, &push_hdr, &push_len); + rc = ena_xdp_tx_map_frame(xdp_ring, tx_info, xdpf, &push_hdr, &push_len); if (unlikely(rc)) goto error_drop_packet; @@ -375,33 +356,82 @@ static int ena_xdp_xmit_buff(struct net_device *dev, tx_info, &ena_tx_ctx, next_to_use, - xdp->data_end - xdp->data); + xdpf->len); if (rc) goto error_unmap_dma; /* trigger the dma engine. ena_com_write_sq_doorbell() * has a mb */ - ena_com_write_sq_doorbell(xdp_ring->ena_com_io_sq); - u64_stats_update_begin(&xdp_ring->syncp); - xdp_ring->tx_stats.doorbells++; - u64_stats_update_end(&xdp_ring->syncp); + if (flags & XDP_XMIT_FLUSH) { + ena_com_write_sq_doorbell(xdp_ring->ena_com_io_sq); + ena_increase_stat_atomic(&xdp_ring->tx_stats.doorbells, 1, + &xdp_ring->syncp); + } - return NETDEV_TX_OK; + return rc; error_unmap_dma: ena_unmap_tx_buff(xdp_ring, tx_info); tx_info->xdpf = NULL; error_drop_packet: - __free_page(tx_info->xdp_rx_page); - return NETDEV_TX_OK; + xdp_return_frame(xdpf); + return rc; } -static int ena_xdp_execute(struct ena_ring *rx_ring, - struct xdp_buff *xdp, - struct ena_rx_buffer *rx_info) +static int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(dev); + int qid, i, err, drops = 0; + struct ena_ring *xdp_ring; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + /* We assume that all rings have the same XDP program */ + if (!READ_ONCE(adapter->rx_ring->xdp_bpf_prog)) + return -ENXIO; + + qid = smp_processor_id() % adapter->xdp_num_queues; + qid += adapter->xdp_first_ring; + xdp_ring = &adapter->tx_ring[qid]; + + /* Other CPU ids might try to send through this queue */ + spin_lock(&xdp_ring->xdp_tx_lock); + + for (i = 0; i < n; i++) { + err = ena_xdp_xmit_frame(xdp_ring, dev, frames[i], 0); + /* The descriptor is freed by ena_xdp_xmit_frame in case + * of an error.
+ */ + if (err) + drops++; + } + + /* Ring doorbell to make device aware of the packets */ + if (flags & XDP_XMIT_FLUSH) { + ena_com_write_sq_doorbell(xdp_ring->ena_com_io_sq); + ena_increase_stat_atomic(&xdp_ring->tx_stats.doorbells, 1, + &xdp_ring->syncp); + } + + spin_unlock(&xdp_ring->xdp_tx_lock); + + /* Return number of packets sent */ + return n - drops; +} + +static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) { struct bpf_prog *xdp_prog; + struct ena_ring *xdp_ring; + struct xdp_frame *xdpf; + int qid; u32 verdict = XDP_PASS; + u64 *xdp_stat; rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); @@ -411,17 +441,48 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, verdict = bpf_prog_run_xdp(xdp_prog, xdp); - if (verdict == XDP_TX) - ena_xdp_xmit_buff(rx_ring->netdev, - xdp, - rx_ring->qid + rx_ring->adapter->num_io_queues, - rx_info); - else if (unlikely(verdict == XDP_ABORTED)) + switch (verdict) { + case XDP_TX: +#ifdef XDP_CONVERT_TO_FRAME_NAME_CHANGED + xdpf = xdp_convert_buff_to_frame(xdp); +#else + xdpf = convert_to_xdp_frame(xdp); +#endif + /* Find xmit queue */ + qid = rx_ring->qid + rx_ring->adapter->num_io_queues; + xdp_ring = &rx_ring->adapter->tx_ring[qid]; + + /* The XDP queues are shared between XDP_TX and XDP_REDIRECT */ + spin_lock(&xdp_ring->xdp_tx_lock); + + ena_xdp_xmit_frame(xdp_ring, rx_ring->netdev, xdpf, XDP_XMIT_FLUSH); + + spin_unlock(&xdp_ring->xdp_tx_lock); + xdp_stat = &rx_ring->rx_stats.xdp_tx; + break; + case XDP_REDIRECT: + xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + xdp_stat = &rx_ring->rx_stats.xdp_redirect; + break; + case XDP_ABORTED: trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); - else if (unlikely(verdict > XDP_TX)) + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + break; + case XDP_DROP: + xdp_stat = &rx_ring->rx_stats.xdp_drop; + break; + case XDP_PASS: + xdp_stat = &rx_ring->rx_stats.xdp_pass; + break; + default: bpf_warn_invalid_xdp_action(verdict); + xdp_stat = &rx_ring->rx_stats.xdp_invalid; + } + + ena_increase_stat_atomic(xdp_stat, 1, &rx_ring->syncp); out: rcu_read_unlock(); + return verdict; } @@ -494,10 +555,9 @@ static void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); } -void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, - struct bpf_prog *prog, - int first, - int count) +static void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count) { struct ena_ring *rx_ring; int i = 0; @@ -515,8 +575,8 @@ void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, } } -void ena_xdp_exchange_program(struct ena_adapter *adapter, - struct bpf_prog *prog) +static void ena_xdp_exchange_program(struct ena_adapter *adapter, + struct bpf_prog *prog) { struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); @@ -587,7 +647,7 @@ static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) if (!old_bpf_prog) netif_info(adapter, drv, adapter->netdev, - "xdp program set, changing the max_mtu from %d to %d", + "XDP program is set, changing the max_mtu from %d to %d", prev_mtu, netdev->max_mtu); } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) { @@ -614,15 +674,19 @@ static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) */ static int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) { +#ifndef ENA_XDP_QUERY_IN_KERNEL struct ena_adapter *adapter = netdev_priv(netdev); +#endif /* ENA_XDP_QUERY_IN_KERNEL */ switch 
(bpf->command) { case XDP_SETUP_PROG: return ena_xdp_set(netdev, bpf); +#ifndef ENA_XDP_QUERY_IN_KERNEL case XDP_QUERY_PROG: bpf->prog_id = adapter->xdp_bpf_prog ? adapter->xdp_bpf_prog->aux->id : 0; break; +#endif default: return -EINVAL; } @@ -699,6 +763,9 @@ static void ena_init_io_rings(struct ena_adapter *adapter, txr->smoothed_interval = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); txr->disable_meta_caching = adapter->disable_meta_caching; +#ifdef ENA_XDP_SUPPORT + spin_lock_init(&txr->xdp_tx_lock); +#endif /* Don't init RX queues for xdp queues */ if (!ENA_IS_XDP_INDEX(adapter, i)) { @@ -850,24 +917,6 @@ static void ena_free_all_io_tx_resources(struct ena_adapter *adapter) adapter->num_io_queues); } -static int validate_rx_req_id(struct ena_ring *rx_ring, u16 req_id) -{ - if (likely(req_id < rx_ring->ring_size)) - return 0; - - netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, - "Invalid rx req_id: %hu\n", req_id); - - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.bad_req_id++; - u64_stats_update_end(&rx_ring->syncp); - - /* Trigger device reset */ - rx_ring->adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; - set_bit(ENA_FLAG_TRIGGER_RESET, &rx_ring->adapter->flags); - return -EFAULT; -} - /* ena_setup_rx_resources - allocate I/O Rx resources (Descriptors) * @adapter: network interface device structure * @qid: queue index @@ -987,52 +1036,196 @@ static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) ena_free_rx_resources(adapter, i); } +static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page) +{ + dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + put_page(ena_page->page); +} + +static struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma) +{ + struct page *page; + + /* This would allocate the page on the same NUMA node the executing code + * is running on. + */ + page = dev_alloc_page(); + if (!page) + return NULL; + + /* To enable NIC-side port-mirroring, AKA SPAN port, + * we make the buffer readable from the nic as well + */ + *dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) { + __free_page(page); + return NULL; + } + + return page; +} + +/* Removes a page from page cache and allocate a new one instead. If an + * allocation of a new page fails, the cache entry isn't changed + */ +static void ena_replace_cache_page(struct ena_ring *rx_ring, + struct ena_page *ena_page) +{ + struct page *new_page; + dma_addr_t dma; + + new_page = ena_alloc_map_page(rx_ring, &dma); + + if (likely(new_page)) { + ena_put_unmap_cache_page(rx_ring, ena_page); + + ena_page->page = new_page; + ena_page->dma_addr = dma; + } +} + +/* Mark the cache page as used and return it. If the page belongs to a different + * NUMA than the current one, free the cache page and allocate another one + * instead. 
+ */ +static struct page *ena_return_cache_page(struct ena_ring *rx_ring, + struct ena_page *ena_page, + dma_addr_t *dma, + int current_nid) +{ + /* Remove pages belonging to different node than current_nid from cache */ + if (unlikely(page_to_nid(ena_page->page) != current_nid)) { + ena_increase_stat_atomic(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp); + ena_replace_cache_page(rx_ring, ena_page); + } + + /* Make sure no writes are pending for this page */ + dma_sync_single_for_device(rx_ring->dev, ena_page->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + /* Increase refcount to 2 so that the page is returned to the + * cache after being freed + */ + page_ref_inc(ena_page->page); + + *dma = ena_page->dma_addr; + + return ena_page->page; +} + +static struct page *ena_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, int current_nid) +{ + struct ena_page_cache *page_cache = rx_ring->page_cache; + u32 head, cache_current_size; + struct ena_page *ena_page; + + /* Cache size of zero indicates disabled cache */ + if (!page_cache) + return ena_alloc_map_page(rx_ring, dma); + + cache_current_size = page_cache->current_size; + head = page_cache->head; + + ena_page = &page_cache->cache[head]; + /* Warm up phase. We fill the pages for the first time. The + * phase is done in the napi context to improve the chances we + * allocate on the correct NUMA node + */ + if (unlikely(cache_current_size < page_cache->max_size)) { + /* Check if oldest allocated page is free */ + if (ena_page->page && page_ref_count(ena_page->page) == 1) { + page_cache->head = (head + 1) % cache_current_size; + return ena_return_cache_page(rx_ring, ena_page, dma, current_nid); + } + + ena_page = &page_cache->cache[cache_current_size]; + + /* Add a new page to the cache */ + ena_page->page = ena_alloc_map_page(rx_ring, dma); + if (!ena_page->page) + return NULL; + + ena_page->dma_addr = *dma; + + /* Increase refcount to 2 so that the page is returned to the + * cache after being freed + */ + page_ref_inc(ena_page->page); + + page_cache->current_size++; + + ena_increase_stat_atomic(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp); + + return ena_page->page; + } + + /* Next page is still in use, so we allocate outside the cache */ + if (unlikely(page_ref_count(ena_page->page) != 1)) { + ena_increase_stat_atomic(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp); + return ena_alloc_map_page(rx_ring, dma); + } + + page_cache->head = (head + 1) & (page_cache->max_size - 1); + + return ena_return_cache_page(rx_ring, ena_page, dma, current_nid); +} + static int ena_alloc_rx_page(struct ena_ring *rx_ring, - struct ena_rx_buffer *rx_info, gfp_t gfp) + struct ena_rx_buffer *rx_info, int current_nid) { + int headroom = rx_ring->rx_headroom; struct ena_com_buf *ena_buf; struct page *page; dma_addr_t dma; + /* restore page offset value in case it has been changed by device */ + rx_info->page_offset = headroom; + /* if previous allocated page is not used */ if (unlikely(rx_info->page)) return 0; - page = alloc_page(gfp); + /* We handle DMA here */ + page = ena_get_page(rx_ring, &dma, current_nid); if (unlikely(!page)) { - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.page_alloc_fail++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.page_alloc_fail, 1, + &rx_ring->syncp); return -ENOMEM; } - dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); - if (unlikely(dma_mapping_error(rx_ring->dev, dma))) { - u64_stats_update_begin(&rx_ring->syncp); - 
rx_ring->rx_stats.dma_mapping_err++; - u64_stats_update_end(&rx_ring->syncp); - - __free_page(page); - return -EIO; - } netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, - "alloc page %p, rx_info %p\n", page, rx_info); + "Allocate page %p, rx_info %p\n", page, rx_info); rx_info->page = page; - rx_info->page_offset = 0; ena_buf = &rx_info->ena_buf; - ena_buf->paddr = dma + rx_ring->rx_headroom; - ena_buf->len = ENA_PAGE_SIZE - rx_ring->rx_headroom; + ena_buf->paddr = dma + headroom; + ena_buf->len = ENA_PAGE_SIZE - headroom; return 0; } +static void ena_unmap_rx_buff(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + struct ena_com_buf *ena_buf = &rx_info->ena_buf; + + /* If the ref count of the page is 2, then it belongs to the page cache, + * and it is up to it to unmap it. + */ + if (page_ref_count(rx_info->page) == 1) + dma_unmap_page(rx_ring->dev, ena_buf->paddr - rx_ring->rx_headroom, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); +} + static void ena_free_rx_page(struct ena_ring *rx_ring, struct ena_rx_buffer *rx_info) { struct page *page = rx_info->page; - struct ena_com_buf *ena_buf = &rx_info->ena_buf; if (unlikely(!page)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, @@ -1040,9 +1233,7 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, return; } - dma_unmap_page(rx_ring->dev, ena_buf->paddr - rx_ring->rx_headroom, - ENA_PAGE_SIZE, - DMA_FROM_DEVICE); + ena_unmap_rx_buff(rx_ring, rx_info); __free_page(page); rx_info->page = NULL; @@ -1051,9 +1242,13 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) { u16 next_to_use, req_id; + int current_nid; u32 i; int rc; + /* Prefer pages to be allocated on the same NUMA node as the CPU */ + current_nid = numa_mem_id(); + next_to_use = rx_ring->next_to_use; for (i = 0; i < num; i++) { @@ -1063,15 +1258,10 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rx_info = &rx_ring->rx_buffer_info[req_id]; - rc = ena_alloc_rx_page(rx_ring, rx_info, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) - GFP_ATOMIC | __GFP_COMP); -#else - __GFP_COLD | GFP_ATOMIC | __GFP_COMP); -#endif + rc = ena_alloc_rx_page(rx_ring, rx_info, current_nid); if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, - "failed to alloc buffer for rx queue %d\n", + "Failed to allocate buffer for rx queue %d\n", rx_ring->qid); break; } @@ -1080,7 +1270,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) req_id); if (unlikely(rc)) { netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, - "failed to add buffer for rx queue %d\n", + "Failed to add buffer for rx queue %d\n", rx_ring->qid); break; } @@ -1089,12 +1279,11 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) } if (unlikely(i < num)) { - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.refil_partial++; - u64_stats_update_end(&rx_ring->syncp); - netdev_warn(rx_ring->netdev, - "refilled rx qid %d with only %d buffers (from %d)\n", - rx_ring->qid, i, num); + ena_increase_stat_atomic(&rx_ring->rx_stats.refil_partial, 1, + &rx_ring->syncp); + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); } /* ena_com_write_sq_doorbell issues a wmb() */ @@ -1135,17 +1324,57 @@ static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) if (unlikely(rc != bufs_num)) netif_warn(rx_ring->adapter, rx_status, rx_ring->netdev, - "refilling Queue %d failed.
allocated %d buffers from: %d\n", + "Refilling Queue %d failed. allocated %d buffers from: %d\n", i, rc, bufs_num); } } +/* Release all pages from the page cache */ +static void ena_free_ring_cache_pages(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_page_cache *page_cache; + int i; + + /* Page cache is disabled */ + if (!rx_ring->page_cache) + return; + + page_cache = rx_ring->page_cache; + + /* We check size value to make sure we don't + * free pages that weren't allocated. + */ + for (i = 0; i < page_cache->current_size; i++) { + struct ena_page *ena_page = &page_cache->cache[i]; + + /* The cache pages can be at most held by two entities */ + WARN_ON(!ena_page->page || page_ref_count(ena_page->page) > 2); + + dma_unmap_page(rx_ring->dev, ena_page->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + /* If the page is also in the rx buffer, then this operation + * would only decrease its reference count + */ + __free_page(ena_page->page); + } + + page_cache->head = page_cache->current_size = 0; +} + static void ena_free_all_rx_bufs(struct ena_adapter *adapter) { int i; - for (i = 0; i < adapter->num_io_queues; i++) + for (i = 0; i < adapter->num_io_queues; i++) { + /* The RX SQ's packet should be freed first, since they don't + * unmap pages that belong to the page_cache. + */ ena_free_rx_bufs(adapter, i); + ena_free_ring_cache_pages(adapter, i); + } } static void ena_unmap_tx_buff(struct ena_ring *tx_ring, @@ -1193,14 +1422,14 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) continue; if (print_once) { - netdev_notice(tx_ring->netdev, - "free uncompleted tx skb qid %d idx 0x%x\n", - tx_ring->qid, i); + netif_notice(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); print_once = false; } else { - netdev_dbg(tx_ring->netdev, - "free uncompleted tx skb qid %d idx 0x%x\n", - tx_ring->qid, i); + netif_dbg(tx_ring->adapter, ifdown, tx_ring->netdev, + "Free uncompleted tx skb qid %d idx 0x%x\n", + tx_ring->qid, i); } ena_unmap_tx_buff(tx_ring, tx_info); @@ -1267,9 +1496,7 @@ static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, "Invalid req_id: %hu\n", req_id); - u64_stats_update_begin(&ring->syncp); - ring->tx_stats.bad_req_id++; - u64_stats_update_end(&ring->syncp); + ena_increase_stat_atomic(&ring->tx_stats.bad_req_id, 1, &ring->syncp); /* Trigger device reset */ ring->adapter->reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID; @@ -1380,9 +1607,8 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) if (netif_tx_queue_stopped(txq) && above_thresh && test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) { netif_tx_wake_queue(txq); - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.queue_wakeup++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); } __netif_tx_unlock(txq); } @@ -1401,9 +1627,8 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, bool frags) rx_ring->rx_copybreak); if (unlikely(!skb)) { - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.skb_alloc_fail++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "Failed to allocate skb. 
frags: %d\n", frags); return NULL; @@ -1424,15 +1649,10 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, bool polling; #endif void *va; - int rc; len = ena_bufs[buf].len; req_id = ena_bufs[buf].req_id; - rc = validate_rx_req_id(rx_ring, req_id); - if (unlikely(rc < 0)) - return NULL; - rx_info = &rx_ring->rx_buffer_info[req_id]; if (unlikely(!rx_info->page)) { @@ -1447,7 +1667,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, /* save virt address of first buffer */ va = page_address(rx_info->page) + rx_info->page_offset; - prefetch(va + NET_IP_ALIGN); + + prefetch(va); if (len <= rx_ring->rx_copybreak) { skb = ena_alloc_skb(rx_ring, false); @@ -1455,7 +1676,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, return NULL; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, - "rx allocated small packet. len %d. data_len %d\n", + "RX allocated small packet. len %d. data_len %d\n", skb->len, skb->data_len); /* sync this buffer for CPU use */ @@ -1491,17 +1712,13 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, return NULL; do { - dma_unmap_page(rx_ring->dev, - dma_unmap_addr(&rx_info->ena_buf, paddr), - ENA_PAGE_SIZE, DMA_FROM_DEVICE); + ena_unmap_rx_buff(rx_ring, rx_info); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, rx_info->page_offset, len, ENA_PAGE_SIZE); - /* The offset is non zero only for the first buffer */ - rx_info->page_offset = 0; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, - "rx skb updated. len %d. data_len %d\n", + "RX skb updated. len %d. data_len %d\n", skb->len, skb->data_len); rx_info->page = NULL; @@ -1517,10 +1734,6 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, len = ena_bufs[buf].len; req_id = ena_bufs[buf].req_id; - rc = validate_rx_req_id(rx_ring, req_id); - if (unlikely(rc < 0)) - return NULL; - rx_info = &rx_ring->rx_buffer_info[req_id]; } while (1); @@ -1570,9 +1783,8 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, (ena_rx_ctx->l3_csum_err))) { /* ipv4 checksum error */ skb->ip_summed = CHECKSUM_NONE; - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.bad_csum++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.bad_csum, 1, + &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX IPv4 header checksum error\n"); return; @@ -1583,9 +1795,8 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { if (unlikely(ena_rx_ctx->l4_csum_err)) { /* TCP/UDP checksum error */ - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.bad_csum++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.bad_csum, 1, + &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX L4 checksum error\n"); skb->ip_summed = CHECKSUM_NONE; @@ -1594,13 +1805,11 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, if (likely(ena_rx_ctx->l4_csum_checked)) { skb->ip_summed = CHECKSUM_UNNECESSARY; - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.csum_good++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.csum_good, 1, + &rx_ring->syncp); } else { - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.csum_unchecked++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.csum_unchecked, 1, + &rx_ring->syncp); skb->ip_summed = CHECKSUM_NONE; } } else { @@ -1635,14 +1844,13 @@ static void ena_set_rx_hash(struct 
ena_ring *rx_ring, } #ifdef ENA_XDP_SUPPORT -int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) +static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) { struct ena_rx_buffer *rx_info; int ret; rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - xdp->data = page_address(rx_info->page) + - rx_info->page_offset + rx_ring->rx_headroom; + xdp->data = page_address(rx_info->page) + rx_info->page_offset; xdp_set_data_meta_invalid(xdp); xdp->data_hard_start = page_address(rx_info->page); xdp->data_end = xdp->data + rx_ring->ena_bufs[0].len; @@ -1652,7 +1860,7 @@ int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) if (unlikely(rx_ring->ena_bufs[0].len > ENA_XDP_MAX_MTU)) return XDP_DROP; - ret = ena_xdp_execute(rx_ring, xdp, rx_info); + ret = ena_xdp_execute(rx_ring, xdp); /* The xdp program might expand the headers */ if (ret == XDP_PASS) { @@ -1684,6 +1892,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, int refill_required; #ifdef ENA_XDP_SUPPORT struct xdp_buff xdp; + int xdp_flags = 0; #endif /* ENA_XDP_SUPPORT */ int total_len = 0; #ifdef ENA_XDP_SUPPORT @@ -1697,6 +1906,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, res_budget = budget; #ifdef ENA_XDP_SUPPORT xdp.rxq = &rx_ring->xdp_rxq; +#ifdef XDP_HAS_FRAME_SZ + xdp.frame_sz = ENA_PAGE_SIZE; +#endif #endif /* ENA_XDP_SUPPORT */ do { @@ -1717,8 +1929,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, if (unlikely(ena_rx_ctx.descs == 0)) break; + /* First descriptor might have an offset set by the device */ rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - rx_info->page_offset = ena_rx_ctx.pkt_offset; + rx_info->page_offset += ena_rx_ctx.pkt_offset; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", @@ -1741,25 +1954,28 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, #endif /* ENA_XDP_SUPPORT */ if (unlikely(!skb)) { -#ifdef ENA_XDP_SUPPORT - /* The page might not actually be freed here since the - * page reference count is incremented in - * ena_xdp_xmit_buff(), and it will be decreased only - * when send completion was received from the device - */ - if (xdp_verdict == XDP_TX) - ena_free_rx_page(rx_ring, - &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]); -#endif /* ENA_XDP_SUPPORT */ for (i = 0; i < ena_rx_ctx.descs; i++) { - rx_ring->free_ids[next_to_clean] = - rx_ring->ena_bufs[i].req_id; + int req_id = rx_ring->ena_bufs[i].req_id; + + rx_ring->free_ids[next_to_clean] = req_id; next_to_clean = ENA_RX_RING_IDX_NEXT(next_to_clean, rx_ring->ring_size); + +#ifdef ENA_XDP_SUPPORT + /* Packets was passed for transmission, unmap it + * from RX side. 
+ */ + if (xdp_verdict == XDP_TX || xdp_verdict == XDP_REDIRECT) { + ena_unmap_rx_buff(rx_ring, + &rx_ring->rx_buffer_info[req_id]); + rx_ring->rx_buffer_info[req_id].page = NULL; + } +#endif /* ENA_XDP_SUPPORT */ } #ifdef ENA_XDP_SUPPORT - if (xdp_verdict != XDP_PASS){ + if (xdp_verdict != XDP_PASS) { + xdp_flags |= xdp_verdict; res_budget--; continue; } @@ -1820,17 +2036,26 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ena_refill_rx_bufs(rx_ring, refill_required); } +#ifdef ENA_XDP_SUPPORT + if (xdp_flags & XDP_REDIRECT) + xdp_do_flush_map(); +#endif + return work_done; error: adapter = netdev_priv(rx_ring->netdev); - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.bad_desc_num++; - u64_stats_update_end(&rx_ring->syncp); + if (rc == -ENOSPC) { + ena_increase_stat_atomic(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); + adapter->reset_reason = ENA_REGS_RESET_TOO_MANY_RX_DESCS; + } else { + ena_increase_stat_atomic(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; + } - /* Too many desc from the device. Trigger reset */ - adapter->reset_reason = ENA_REGS_RESET_TOO_MANY_RX_DESCS; set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); return 0; @@ -1888,6 +2113,9 @@ static void ena_unmask_interrupt(struct ena_ring *tx_ring, tx_ring->smoothed_interval, true); + ena_increase_stat_atomic(&tx_ring->tx_stats.unmask_interrupt, 1, + &tx_ring->syncp); + /* It is a shared MSI-X. * Tx and Rx CQ have pointer to it. * So we use one of them to reach the intr reg @@ -1967,7 +2195,7 @@ static int ena_clean_xdp_irq(struct ena_ring *xdp_ring, u32 budget) tx_pkts++; total_done += tx_info->tx_descs; - __free_page(tx_info->xdp_rx_page); + xdp_return_frame(xdpf); xdp_ring->free_ids[next_to_clean] = req_id; next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, xdp_ring->ring_size); @@ -2036,20 +2264,19 @@ static int ena_io_poll(struct napi_struct *napi, int budget) */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) if (napi_complete_done(napi, rx_work_done) && - atomic_cmpxchg(&ena_napi->unmask_interrupt, 1, 0)) { + READ_ONCE(ena_napi->interrupts_masked)) { #else napi_complete_done(napi, rx_work_done); - if (atomic_cmpxchg(&ena_napi->unmask_interrupt, 1, 0)) { + if (READ_ONCE(ena_napi->interrupts_masked)) { #endif + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); /* We apply adaptive moderation on Rx path only. * Tx uses static interrupt moderation. 
*/ if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) ena_adjust_adaptive_rx_intr_moderation(ena_napi); - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.unmask_interrupt++; - u64_stats_update_end(&tx_ring->syncp); ena_unmask_interrupt(tx_ring, rx_ring); } @@ -2094,8 +2321,9 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data) ena_napi->first_interrupt = true; - smp_mb__before_atomic(); - atomic_set(&ena_napi->unmask_interrupt, 1); + WRITE_ONCE(ena_napi->interrupts_masked, true); + smp_wmb(); /* write interrupts_masked before calling napi */ + napi_schedule_irqoff(&ena_napi->napi); return IRQ_HANDLED; @@ -2122,7 +2350,7 @@ static int ena_enable_msix(struct ena_adapter *adapter) /* Reserved the max msix vectors we might need */ msix_vecs = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues); netif_dbg(adapter, probe, adapter->netdev, - "trying to enable MSI-X, vectors %d\n", msix_vecs); + "Trying to enable MSI-X, vectors %d\n", msix_vecs); #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) adapter->msix_entries = vzalloc(msix_vecs * sizeof(struct msix_entry)); @@ -2151,7 +2379,7 @@ static int ena_enable_msix(struct ena_adapter *adapter) if (irq_cnt != msix_vecs) { netif_notice(adapter, probe, adapter->netdev, - "enable only %d MSI-X (out of %d), reduce the number of queues\n", + "Enable only %d MSI-X (out of %d), reduce the number of queues\n", irq_cnt, msix_vecs); adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; } @@ -2229,12 +2457,12 @@ static int ena_request_mgmnt_irq(struct ena_adapter *adapter) irq->data); if (rc) { netif_err(adapter, probe, adapter->netdev, - "failed to request admin irq\n"); + "Failed to request admin irq\n"); return rc; } netif_dbg(adapter, probe, adapter->netdev, - "set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", + "Set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n", irq->affinity_hint_mask.bits[0], irq->vector); return rc; @@ -2265,7 +2493,7 @@ static int ena_request_io_irq(struct ena_adapter *adapter) } netif_dbg(adapter, ifup, adapter->netdev, - "set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n", + "Set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n", i, irq->affinity_hint_mask.bits[0], irq->vector); } @@ -2344,21 +2572,12 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, int i; for (i = first_index; i < first_index + count; i++) { -#ifdef ENA_XDP_SUPPORT - /* Check if napi was initialized before */ - if (!ENA_IS_XDP_INDEX(adapter, i) || - adapter->ena_napi[i].xdp_ring) { - napi_hash_del(&adapter->ena_napi[i].napi); - netif_napi_del(&adapter->ena_napi[i].napi); - } -#else napi_hash_del(&adapter->ena_napi[i].napi); netif_napi_del(&adapter->ena_napi[i].napi); -#endif /* ENA_XDP_SUPPORT */ + #ifdef ENA_XDP_SUPPORT - else - WARN_ON(ENA_IS_XDP_INDEX(adapter, i) && - adapter->ena_napi[i].xdp_ring); + WARN_ON(!ENA_IS_XDP_INDEX(adapter, i) && + adapter->ena_napi[i].xdp_ring); #endif /* ENA_XDP_SUPPORT */ } } @@ -2366,14 +2585,13 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, static void ena_init_napi_in_range(struct ena_adapter *adapter, int first_index, int count) { - struct ena_napi *napi = {0}; int i; for (i = first_index; i < first_index + count; i++) { - napi = &adapter->ena_napi[i]; + struct ena_napi *napi = &adapter->ena_napi[i]; netif_napi_add(adapter->netdev, - &adapter->ena_napi[i].napi, + &napi->napi, #ifdef ENA_XDP_SUPPORT ENA_IS_XDP_INDEX(adapter, i) ? 
ena_xdp_io_poll : ena_io_poll, #else @@ -2626,9 +2844,9 @@ static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues; i++) { rc = ena_create_io_rx_queue(adapter, i); - INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); if (rc) goto create_err; + INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); } return 0; @@ -2709,6 +2927,10 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter) if (rc) goto err_create_rx_queues; + rc = ena_create_page_caches(adapter); + if (rc) /* Cache memory is freed in case of failure */ + goto err_create_rx_queues; + return 0; err_create_rx_queues: @@ -2761,11 +2983,114 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter) } } +static void ena_free_ring_page_cache(struct ena_ring *rx_ring) +{ + if (!rx_ring->page_cache) + return; + + vfree(rx_ring->page_cache); + rx_ring->page_cache = NULL; +} + +/* Calculate the size of the Local Page Cache. If LPC should be disabled, return + * a size of 0. + */ +static u32 ena_calculate_cache_size(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues; + u32 page_cache_size; + + /* lpc_size == 0 means disabled cache */ + if (lpc_size == 0) + return 0; + + /* LPC is disabled below min number of queues */ + if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) { + netif_info(adapter, ifup, adapter->netdev, + "Local page cache is disabled for less than %d channels\n", + ENA_LPC_MIN_NUM_OF_CHANNELS); + return 0; + } + + /* Clamp the lpc_size to its maximum value */ + if (lpc_size > ENA_LPC_MAX_MULTIPLIER) { + netif_info(adapter, ifup, adapter->netdev, + "Provided lpc_size %d is too large, reducing to %d (max)\n", + lpc_size, ENA_LPC_MAX_MULTIPLIER); + /* Override module param value to avoid printing this message + * every up/down operation + */ + lpc_size = ENA_LPC_MAX_MULTIPLIER; + } + +#ifdef ENA_XDP_SUPPORT + /* We currently don't support page caches under XDP */ + if (ena_xdp_present_ring(rx_ring)) { + netif_info(adapter, ifup, adapter->netdev, + "Local page cache is disabled when using XDP\n"); + return 0; + } +#endif /* ENA_XDP_SUPPORT */ + + page_cache_size = lpc_size * ENA_LPC_MULTIPLIER_UNIT; + page_cache_size = roundup_pow_of_two(page_cache_size); + + return page_cache_size; +} + +static int ena_create_page_caches(struct ena_adapter *adapter) +{ + struct ena_page_cache *cache; + u32 page_cache_size; + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + page_cache_size = ena_calculate_cache_size(adapter, rx_ring); + + if (!page_cache_size) + return 0; + + cache = vzalloc(sizeof(struct ena_page_cache) + + sizeof(struct ena_page) * page_cache_size); + if (!cache) + goto err_cache_alloc; + + cache->max_size = page_cache_size; + rx_ring->page_cache = cache; + } + + return 0; +err_cache_alloc: + netif_err(adapter, ifup, adapter->netdev, + "Failed to initialize local page caches (LPCs)\n"); + while (--i >= 0) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + ena_free_ring_page_cache(rx_ring); + } + + return -ENOMEM; +} + +static void ena_free_page_caches(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + ena_free_ring_page_cache(rx_ring); + } +} + static int ena_up(struct ena_adapter *adapter) { int io_queue_count, rc, i; - netdev_dbg(adapter->netdev, "%s\n", __func__); + netif_dbg(adapter, ifup,
adapter->netdev, "%s\n", __func__); io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; ena_setup_io_intr(adapter); @@ -2783,8 +3108,8 @@ static int ena_up(struct ena_adapter *adapter) */ if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) ena_com_enable_adaptive_moderation(adapter->ena_dev); -#endif +#endif rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq; @@ -2800,9 +3125,8 @@ static int ena_up(struct ena_adapter *adapter) if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) netif_carrier_on(adapter->netdev); - u64_stats_update_begin(&adapter->syncp); - adapter->dev_stats.interface_up++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.interface_up, 1, + &adapter->syncp); set_bit(ENA_FLAG_DEV_UP, &adapter->flags); @@ -2820,6 +3144,7 @@ static int ena_up(struct ena_adapter *adapter) return rc; err_up: + ena_free_page_caches(adapter); ena_destroy_all_tx_queues(adapter); ena_free_all_io_tx_resources(adapter); ena_destroy_all_rx_queues(adapter); @@ -2840,9 +3165,8 @@ static void ena_down(struct ena_adapter *adapter) clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); - u64_stats_update_begin(&adapter->syncp); - adapter->dev_stats.interface_down++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.interface_down, 1, + &adapter->syncp); netif_carrier_off(adapter->netdev); netif_tx_disable(adapter->netdev); @@ -2857,7 +3181,8 @@ static void ena_down(struct ena_adapter *adapter) rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); if (rc) - dev_err(&adapter->pdev->dev, "Device reset failed\n"); + netif_err(adapter, ifdown, adapter->netdev, + "Device reset failed\n"); /* stop submitting admin commands on a device that was reset */ ena_com_set_admin_running_state(adapter->ena_dev, false); } @@ -2870,6 +3195,7 @@ static void ena_down(struct ena_adapter *adapter) ena_free_all_tx_bufs(adapter); ena_free_all_rx_bufs(adapter); + ena_free_page_caches(adapter); ena_free_all_io_tx_resources(adapter); ena_free_all_io_rx_resources(adapter); } @@ -3075,15 +3401,13 @@ static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, (header_len < tx_ring->tx_max_header_size)) return 0; - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.linearize++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.linearize, 1, + &tx_ring->syncp); rc = skb_linearize(skb); if (unlikely(rc)) { - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.linearize_failed++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.linearize_failed, 1, + &tx_ring->syncp); } return rc; @@ -3123,9 +3447,8 @@ static int ena_tx_map_skb(struct ena_ring *tx_ring, tx_ring->push_buf_intermediate_buf); *header_len = push_len; if (unlikely(skb->data != *push_hdr)) { - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.llq_buffer_copy++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.llq_buffer_copy, 1, + &tx_ring->syncp); delta = push_len - skb_head_len; } @@ -3182,10 +3505,9 @@ static int ena_tx_map_skb(struct ena_ring *tx_ring, return 0; error_report_dma_error: - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.dma_mapping_err++; - u64_stats_update_end(&tx_ring->syncp); - netdev_warn(adapter->netdev, "failed to map skb\n"); + ena_increase_stat_atomic(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map skb\n"); 
tx_info->skb = NULL; @@ -3259,9 +3581,8 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) __func__, qid); netif_tx_stop_queue(txq); - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.queue_stop++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.queue_stop, 1, + &tx_ring->syncp); /* There is a rare condition where this function decide to * stop the queue but meanwhile clean_tx_irq updates @@ -3276,9 +3597,8 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, ENA_TX_WAKEUP_THRESH)) { netif_tx_wake_queue(txq); - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.queue_wakeup++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); } } @@ -3293,9 +3613,8 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) * has a mb */ ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.doorbells++; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.doorbells, 1, + &tx_ring->syncp); #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) } #endif @@ -3377,13 +3696,14 @@ static void ena_set_rx_mode(struct net_device *netdev) static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev) { + struct device *dev = &pdev->dev; struct ena_admin_host_info *host_info; int rc; /* Allocate only the host info */ rc = ena_com_allocate_host_info(ena_dev); if (rc) { - pr_err("Cannot allocate host info\n"); + dev_err(dev, "Cannot allocate host info\n"); return; } @@ -3407,14 +3727,15 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd host_info->driver_supported_features = ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | - ENA_ADMIN_HOST_INFO_MAP_RX_BUF_BIDIRECTIONAL_MASK; + ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK | + ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK; rc = ena_com_set_host_attributes(ena_dev); if (rc) { if (rc == -EOPNOTSUPP) - pr_warn("Cannot set host attributes\n"); + dev_warn(dev, "Cannot set host attributes\n"); else - pr_err("Cannot set host attributes\n"); + dev_err(dev, "Cannot set host attributes\n"); goto err; } @@ -3442,7 +3763,8 @@ static void ena_config_debug_area(struct ena_adapter *adapter) rc = ena_com_allocate_debug_area(adapter->ena_dev, debug_area_size); if (rc) { - pr_err("Cannot allocate debug area\n"); + netif_err(adapter, drv, adapter->netdev, + "Cannot allocate debug area\n"); return; } @@ -3658,6 +3980,7 @@ static const struct net_device_ops ena_netdev_ops = { #endif #ifdef ENA_XDP_SUPPORT .ndo_bpf = ena_xdp, + .ndo_xdp_xmit = ena_xdp_xmit, #endif /* ENA_XDP_SUPPORT */ }; @@ -3705,7 +4028,6 @@ static int ena_set_queues_placement_policy(struct pci_dev *pdev, struct ena_admin_feature_llq_desc *llq, struct ena_llq_configurations *llq_default_configurations) { - bool has_mem_bar; int rc; u32 llq_feature_mask; @@ -3717,24 +4039,28 @@ static int ena_set_queues_placement_policy(struct pci_dev *pdev, return 0; } - has_mem_bar = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(ENA_MEM_BAR); - rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); if (unlikely(rc)) { dev_err(&pdev->dev, "Failed to configure the device mode. 
Fallback to host mode policy.\n"); ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; - return 0; } - /* Nothing to config, exit */ - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - return 0; + return 0; +} + +static int ena_map_llq_mem_bar(struct pci_dev *pdev, struct ena_com_dev *ena_dev, + int bars) +{ + bool has_mem_bar = !!(bars & BIT(ENA_MEM_BAR)); if (!has_mem_bar) { - dev_err(&pdev->dev, - "ENA device does not expose LLQ bar. Fallback to host mode policy.\n"); - ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + dev_err(&pdev->dev, + "ENA device does not expose LLQ bar. Fallback to host mode policy.\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + } + return 0; } @@ -3761,7 +4087,7 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, rc = ena_com_mmio_reg_read_request_init(ena_dev); if (rc) { - dev_err(dev, "failed to init mmio read less\n"); + dev_err(dev, "Failed to init mmio read less\n"); return rc; } @@ -3779,7 +4105,7 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, rc = ena_com_validate_version(ena_dev); if (rc) { - dev_err(dev, "device version is too low\n"); + dev_err(dev, "Device version is too low\n"); goto err_mmio_read_less; } @@ -3846,9 +4172,9 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, set_default_llq_configurations(&llq_config, &get_feat_ctx->llq); rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, - &llq_config); + &llq_config); if (rc) { - dev_err(&pdev->dev, "ena device init failed\n"); + dev_err(dev, "ENA device init failed\n"); goto err_admin_init; } @@ -4021,16 +4347,14 @@ static void ena_fw_reset_device(struct work_struct *work) { struct ena_adapter *adapter = container_of(work, struct ena_adapter, reset_task); - struct pci_dev *pdev = adapter->pdev; - if (unlikely(!test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { - dev_err(&pdev->dev, - "device reset schedule while reset bit is off\n"); - return; - } rtnl_lock(); - ena_destroy_device(adapter, false); - ena_restore_device(adapter); + + if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + ena_destroy_device(adapter, false); + ena_restore_device(adapter); + } + rtnl_unlock(); } @@ -4111,9 +4435,8 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, rc = -EIO; } - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->tx_stats.missed_tx = missed_tx; - u64_stats_update_end(&tx_ring->syncp); + ena_increase_stat_atomic(&tx_ring->tx_stats.missed_tx , missed_tx, + &tx_ring->syncp); return rc; } @@ -4196,12 +4519,11 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) rx_ring->empty_rx_queue++; if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) { - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->rx_stats.empty_rx_ring++; - u64_stats_update_end(&rx_ring->syncp); + ena_increase_stat_atomic(&rx_ring->rx_stats.empty_rx_ring, 1, + &rx_ring->syncp); netif_err(adapter, drv, adapter->netdev, - "trigger refill for ring %d\n", i); + "Trigger refill for ring %d\n", i); napi_schedule(rx_ring->napi); rx_ring->empty_rx_queue = 0; @@ -4228,9 +4550,8 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) if (unlikely(time_is_before_jiffies(keep_alive_expired))) { netif_err(adapter, drv, adapter->netdev, "Keep alive watchdog timeout.\n"); - u64_stats_update_begin(&adapter->syncp); - 
adapter->dev_stats.wd_expired++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.wd_expired, 1, + &adapter->syncp); adapter->reset_reason = ENA_REGS_RESET_KEEP_ALIVE_TO; set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } @@ -4241,9 +4562,8 @@ static void check_for_admin_com_state(struct ena_adapter *adapter) if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { netif_err(adapter, drv, adapter->netdev, "ENA admin queue is not in running state!\n"); - u64_stats_update_begin(&adapter->syncp); - adapter->dev_stats.admin_q_pause++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.admin_q_pause, 1, + &adapter->syncp); adapter->reset_reason = ENA_REGS_RESET_ADMIN_TO; set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } @@ -4597,7 +4917,7 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) */ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 }; + struct ena_calc_queue_size_ctx calc_queue_ctx = {}; struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = NULL; struct ena_adapter *adapter; @@ -4617,6 +4937,19 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return rc; } + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", rc); + goto err_disable_device; + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "err_pci_set_consistent_dma_mask failed %d\n", + rc); + goto err_disable_device; + } + pci_set_master(pdev); ena_dev = vzalloc(sizeof(*ena_dev)); @@ -4637,20 +4970,45 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_resource_start(pdev, ENA_REG_BAR), pci_resource_len(pdev, ENA_REG_BAR)); if (!ena_dev->reg_bar) { - dev_err(&pdev->dev, "failed to remap regs bar\n"); + dev_err(&pdev->dev, "Failed to remap regs bar\n"); rc = -EFAULT; goto err_free_region; } - ena_dev->ena_min_poll_delay_us = ENA_POLL_DELAY_US; + ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US; + ena_dev->dmadev = &pdev->dev; + netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), ENA_MAX_RINGS); + if (!netdev) { + dev_err(&pdev->dev, "alloc_etherdev_mq failed\n"); + rc = -ENOMEM; + goto err_free_region; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + adapter = netdev_priv(netdev); + adapter->ena_dev = ena_dev; + adapter->netdev = netdev; + adapter->pdev = pdev; + adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + + ena_dev->net_device = netdev; + + pci_set_drvdata(pdev, adapter); + rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state); if (rc) { - dev_err(&pdev->dev, "ena device init failed\n"); + dev_err(&pdev->dev, "ENA device init failed\n"); if (rc == -ETIME) rc = -EPROBE_DEFER; - goto err_free_region; + goto err_netdev_destroy; + } + + rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); + if (rc) { + dev_err(&pdev->dev, "ENA llq bar mapping failed\n"); + goto err_device_destroy; } calc_queue_ctx.ena_dev = ena_dev; @@ -4670,26 +5028,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_device_destroy; } - /* dev zeroed in init_etherdev */ - netdev = alloc_etherdev_mq(sizeof(struct ena_adapter), max_num_io_queues); - if (!netdev) { - dev_err(&pdev->dev, "alloc_etherdev_mq failed\n"); - rc = -ENOMEM; - goto err_device_destroy; - } - - 
SET_NETDEV_DEV(netdev, &pdev->dev); - - adapter = netdev_priv(netdev); - pci_set_drvdata(pdev, adapter); - - adapter->ena_dev = ena_dev; - adapter->netdev = netdev; - adapter->pdev = pdev; - ena_set_conf_feat_params(adapter, &get_feat_ctx); - adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); adapter->reset_reason = ENA_REGS_RESET_NORMAL; adapter->requested_tx_ring_size = calc_queue_ctx.tx_queue_size; @@ -4721,7 +5061,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) { dev_err(&pdev->dev, "Failed to query interrupt moderation feature\n"); - goto err_netdev_destroy; + goto err_device_destroy; } ena_init_io_rings(adapter, 0, @@ -4816,11 +5156,11 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_disable_msix(adapter); err_worker_destroy: del_timer(&adapter->timer_service); -err_netdev_destroy: - free_netdev(netdev); err_device_destroy: ena_com_delete_host_info(ena_dev); ena_com_admin_destroy(ena_dev); +err_netdev_destroy: + free_netdev(netdev); err_free_region: ena_release_bars(ena_dev, pdev); err_free_ena_dev: @@ -4855,8 +5195,11 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) netdev->rx_cpu_rmap = NULL; } #endif /* CONFIG_RFS_ACCEL */ - del_timer_sync(&adapter->timer_service); + /* Make sure timer and reset routine won't be called after + * freeing device resources. + */ + del_timer_sync(&adapter->timer_service); cancel_work_sync(&adapter->reset_task); rtnl_lock(); /* lock released inside the below if-else block */ @@ -4910,22 +5253,30 @@ static void ena_shutdown(struct pci_dev *pdev) } #ifdef CONFIG_PM +#ifdef ENA_GENERIC_PM_OPS +/* ena_suspend - PM suspend callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_suspend(struct device *dev_d) +{ + struct pci_dev *pdev = to_pci_dev(dev_d); +#else /* ENA_GENERIC_PM_OPS */ /* ena_suspend - PM suspend callback * @pdev: PCI device information struct * @state:power state */ static int ena_suspend(struct pci_dev *pdev, pm_message_t state) { +#endif /* ENA_GENERIC_PM_OPS */ struct ena_adapter *adapter = pci_get_drvdata(pdev); - u64_stats_update_begin(&adapter->syncp); - adapter->dev_stats.suspend++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.suspend, 1, + &adapter->syncp); rtnl_lock(); if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { dev_err(&pdev->dev, - "ignoring device reset request as the device is being suspended\n"); + "Ignoring device reset request as the device is being suspended\n"); clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } ena_destroy_device(adapter, true); @@ -4933,6 +5284,14 @@ static int ena_suspend(struct pci_dev *pdev, pm_message_t state) return 0; } +#ifdef ENA_GENERIC_PM_OPS +/* ena_resume - PM resume callback + * @dev_d: Device information struct + */ +static int __maybe_unused ena_resume(struct device *dev_d) +{ + struct ena_adapter *adapter = dev_get_drvdata(dev_d); +#else /* ENA_GENERIC_PM_OPS */ /* ena_resume - PM resume callback * @pdev: PCI device information struct * @@ -4940,19 +5299,25 @@ static int ena_suspend(struct pci_dev *pdev, pm_message_t state) static int ena_resume(struct pci_dev *pdev) { struct ena_adapter *adapter = pci_get_drvdata(pdev); +#endif /* ENA_GENERIC_PM_OPS */ int rc; - u64_stats_update_begin(&adapter->syncp); - adapter->dev_stats.resume++; - u64_stats_update_end(&adapter->syncp); + ena_increase_stat_atomic(&adapter->dev_stats.resume, 1, + &adapter->syncp); rtnl_lock(); +#if LINUX_VERSION_CODE < 
KERNEL_VERSION(5,5,0) pci_set_power_state(pdev, PCI_D0); +#endif rc = ena_restore_device(adapter); rtnl_unlock(); return rc; } -#endif +#endif /* CONFIG_PM */ +#ifdef ENA_GENERIC_PM_OPS + +static SIMPLE_DEV_PM_OPS(ena_pm_ops, ena_suspend, ena_resume); +#endif /* ENA_GENERIC_PM_OPS */ static struct pci_driver ena_pci_driver = { .name = DRV_MODULE_NAME, @@ -4960,10 +5325,14 @@ static struct pci_driver ena_pci_driver = { .probe = ena_probe, .remove = ena_remove, .shutdown = ena_shutdown, +#ifdef ENA_GENERIC_PM_OPS + .driver.pm = &ena_pm_ops, +#else /* ENA_GENERIC_PM_OPS */ #ifdef CONFIG_PM .suspend = ena_suspend, .resume = ena_resume, -#endif +#endif /* CONFIG_PM */ +#endif /* ENA_GENERIC_PM_OPS */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) .sriov_configure = pci_sriov_configure_simple, #endif @@ -5006,7 +5375,7 @@ static void ena_update_on_link_change(void *adapter_data, ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK; if (status) { - netdev_dbg(adapter->netdev, "%s\n", __func__); + netif_dbg(adapter, ifup, adapter->netdev, "%s\n", __func__); set_bit(ENA_FLAG_LINK_UP, &adapter->flags); if (!test_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags)) netif_carrier_on(adapter->netdev); @@ -5031,6 +5400,9 @@ static void ena_keep_alive_wd(void *adapter_data, tx_drops = ((u64)desc->tx_drops_high << 32) | desc->tx_drops_low; u64_stats_update_begin(&adapter->syncp); + /* These stats are accumulated by the device, so the counters indicate + * all drops since last reset. + */ adapter->dev_stats.rx_drops = rx_drops; adapter->dev_stats.tx_drops = tx_drops; u64_stats_update_end(&adapter->syncp); @@ -5047,7 +5419,7 @@ static void ena_notification(void *adapter_data, aenq_e->aenq_common_desc.group, ENA_ADMIN_NOTIFICATION); - switch (aenq_e->aenq_common_desc.syndrom) { + switch (aenq_e->aenq_common_desc.syndrome) { case ENA_ADMIN_UPDATE_HINTS: hints = (struct ena_admin_ena_hw_hints *) (&aenq_e->inline_data_w4); @@ -5056,7 +5428,7 @@ static void ena_notification(void *adapter_data, default: netif_err(adapter, drv, adapter->netdev, "Invalid aenq notification link state %d\n", - aenq_e->aenq_common_desc.syndrom); + aenq_e->aenq_common_desc.syndrome); } } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 038fd6c873a0a..48f648ef85f0b 100755 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef ENA_H @@ -52,8 +25,8 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 2 -#define DRV_MODULE_GEN_SUBMINOR 10 +#define DRV_MODULE_GEN_MINOR 4 +#define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -75,7 +48,7 @@ * 16kB. */ #if PAGE_SIZE > SZ_16K -#define ENA_PAGE_SIZE SZ_16K +#define ENA_PAGE_SIZE (_AC(SZ_16K, UL)) #else #define ENA_PAGE_SIZE PAGE_SIZE #endif @@ -141,7 +114,7 @@ #define ENA_IO_IRQ_FIRST_IDX 1 #define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) -#define ENA_POLL_DELAY_US 5000 +#define ENA_ADMIN_POLL_DELAY_US 5000 /* ENA device should send keep alive msg every 1 sec. * We wait for 6 sec just to be on the safe side. @@ -158,8 +131,14 @@ */ #ifdef ENA_XDP_SUPPORT +#ifdef XDP_HAS_FRAME_SZ +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#else #define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ VLAN_HLEN - XDP_PACKET_HEADROOM) +#endif #define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) @@ -184,7 +163,7 @@ struct ena_napi { struct ena_ring *xdp_ring; #endif /* ENA_XDP_SUPPORT */ bool first_interrupt; - atomic_t unmask_interrupt; + bool interrupts_masked; u32 qid; struct dim dim; }; @@ -215,12 +194,6 @@ struct ena_tx_buffer { * the xdp queues */ struct xdp_frame *xdpf; - /* The rx page for the rx buffer that was received in rx and - * re transmitted on xdp tx queues as a result of XDP_TX action. - * We need to free the page once we finished cleaning the buffer in - * clean_xdp_irq() - */ - struct page *xdp_rx_page; #endif /* ENA_XDP_SUPPORT */ /* Indicate if bufs[0] map the linear data of the skb. 
*/ @@ -285,8 +258,42 @@ struct ena_stats_rx { u64 bad_req_id; u64 empty_rx_ring; u64 csum_unchecked; +#ifdef ENA_XDP_SUPPORT + u64 xdp_aborted; + u64 xdp_drop; + u64 xdp_pass; + u64 xdp_tx; + u64 xdp_invalid; + u64 xdp_redirect; +#endif + u64 lpc_warm_up; + u64 lpc_full; + u64 lpc_wrong_numa; +}; + +/* LPC definitions */ +#define ENA_LPC_DEFAULT_MULTIPLIER 2 +#define ENA_LPC_MAX_MULTIPLIER 32 +#define ENA_LPC_MULTIPLIER_UNIT 1024 +#define ENA_LPC_MIN_NUM_OF_CHANNELS 16 + +/* Store DMA address along with the page */ +struct ena_page { + struct page *page; + dma_addr_t dma_addr; }; +struct ena_page_cache { + /* How many pages are produced */ + u32 head; + /* How many of the entries were initialized */ + u32 current_size; + /* Maximum number of pages the cache can hold */ + u32 max_size; + + struct ena_page cache[0]; +} ____cacheline_aligned; + struct ena_ring { /* Holds the empty requests for TX/RX * out of order completions @@ -303,6 +310,7 @@ struct ena_ring { struct pci_dev *pdev; struct napi_struct *napi; struct net_device *netdev; + struct ena_page_cache *page_cache; struct ena_com_dev *ena_dev; struct ena_adapter *adapter; struct ena_com_io_cq *ena_com_io_cq; @@ -310,6 +318,7 @@ struct ena_ring { #ifdef ENA_XDP_SUPPORT struct bpf_prog *xdp_bpf_prog; struct xdp_rxq_info xdp_rxq; + spinlock_t xdp_tx_lock; /* synchronize XDP TX/Redirect traffic */ #endif u16 next_to_use; @@ -601,8 +610,8 @@ static inline bool ena_xdp_present_ring(struct ena_ring *ring) return !!ring->xdp_bpf_prog; } -static inline int ena_xdp_legal_queue_count(struct ena_adapter *adapter, - u32 queues) +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) { return 2 * queues <= adapter->max_num_io_queues; } diff --git a/drivers/amazon/net/ena/ena_pci_id_tbl.h b/drivers/amazon/net/ena/ena_pci_id_tbl.h index 426e57e10a7f0..3ecdf29160ca7 100755 --- a/drivers/amazon/net/ena/ena_pci_id_tbl.h +++ b/drivers/amazon/net/ena/ena_pci_id_tbl.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef ENA_PCI_ID_TBL_H_ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index b9e8e48660e82..568b26185fe9d 100755 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 - 2018 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _ENA_REGS_H_ #define _ENA_REGS_H_ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c index 26850a7f31fba..53b8d84ddcc36 100755 --- a/drivers/amazon/net/ena/ena_sysfs.c +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -1,33 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. 
All rights reserved. */ #include @@ -86,7 +59,7 @@ int ena_sysfs_init(struct device *dev) { if (device_create_file(dev, &dev_attr_rx_copybreak)) - dev_err(dev, "failed to create rx_copybreak sysfs entry"); + dev_err(dev, "Failed to create rx_copybreak sysfs entry"); return 0; } diff --git a/drivers/amazon/net/ena/ena_sysfs.h b/drivers/amazon/net/ena/ena_sysfs.h index dc0d4c90cd327..8c572eee268f3 100755 --- a/drivers/amazon/net/ena/ena_sysfs.h +++ b/drivers/amazon/net/ena/ena_sysfs.h @@ -1,33 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef __ENA_SYSFS_H__ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index f155f2c65b59f..729a13eeabf98 100755 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -76,6 +76,9 @@ Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 #include #endif +/* For ACCESS_ONCE, WRITE_ONCE and READ_ONCE macros */ +#include + #ifndef SZ_256 #define SZ_256 0x0000100 #endif @@ -696,9 +699,19 @@ do { \ #if defined(CONFIG_BPF) && LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) #define ENA_XDP_SUPPORT +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0) +#define XDP_HAS_FRAME_SZ +#define XDP_CONVERT_TO_FRAME_NAME_CHANGED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) +#define ENA_XDP_QUERY_IN_KERNEL +#endif + #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) || \ + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 3))) #define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER #endif @@ -735,4 +748,57 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) } #endif +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) (ACCESS_ONCE(x) = val) +#endif +#ifndef READ_ONCE +#define READ_ONCE(x) ACCESS_ONCE(x) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9 ,0) +#define ENA_GENERIC_PM_OPS +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0) +static inline int page_ref_count(struct page *page) +{ + return atomic_read(&page->_count); +} + +static inline void page_ref_inc(struct page *page) +{ + atomic_inc(&page->_count); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +static inline struct page *dev_alloc_page() +{ + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; + + gfp_mask |= __GFP_COLD | __GFP_COMP; + + return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0); +} +#endif + +/* This entry might seem strange because of the #ifndef numa_mem_id(), + * but these defines were taken from the Linux kernel + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) +#ifndef numa_mem_id +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +static inline int numa_mem_id(void) +{ + return __this_cpu_read(_numa_mem_); +} +#else /* CONFIG_HAVE_MEMORYLESS_NODES */ +static inline int numa_mem_id(void) +{ + return numa_node_id(); +} +#endif /* CONFIG_HAVE_MEMORYLESS_NODES */ +#endif /* numa_mem_id */ +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) */ + #endif /* _KCOMPAT_H_ */ From 7f623fa07ea1c5a5a2a59caf8ecb69d9932eb28b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 31 Dec 2020 20:56:28 +0000 Subject: [PATCH 082/737] lustre: don't try fault fast path if we can't retry Under certain circumstances, the lustre fault handler can loop endlessly, see this commit in the lustre tree: bb50c62c6f ("LU-13182 llite: Avoid eternel retry loops with MAP_POPULATE") However, this commit isn't enough to avoid hangs while doing a kernel build on a lustre fs. Do something that is more clean for 5.10, that does work: use the fault_flag_allow_retry_first() function to see if the fast path should be taken. If either retries are not allowed, or this isn't the first retry, then don't take the fast path at all - the fast path should only be taken if we're prepared to deal with a retry. Of course, this still avoids the question as to why filemap_fault() keeps returning a retry. Upstream also has not addressed this question. This should be figured out upstream, we're mainly concerned with getting this working right now. A kernel compile shows that the slow path is only taken, as part of a retry, about 4 out of 100,000 faults. 
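For reference, the fast-path gate added below reduces to two flag checks on the fault. A simplified sketch of the helper as it appears in mainline <linux/mm.h> is shown here for illustration only (it is not part of this patch):

    /* Allow the fast path only while a retry is still possible:
     * the core MM permits retries for this fault and this is not
     * already the retried attempt.
     */
    static inline bool fault_flag_allow_retry_first(unsigned int flags)
    {
            return (flags & FAULT_FLAG_ALLOW_RETRY) &&
                   !(flags & FAULT_FLAG_TRIED);
    }

If either condition fails, ll_fault0() now takes the slow path directly instead of relying on another round trip through filemap_fault().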
Signed-off-by: Fran van der Linden --- drivers/staging/lustrefsx/lustre/llite/llite_mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 30cf21b778811..7807f45396c94 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -267,7 +267,8 @@ static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(env)) RETURN(PTR_ERR(env)); - if (ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) { + if (fault_flag_allow_retry_first(vmf->flags) && + ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) { /* do fast fault */ ll_cl_add(vma->vm_file, env, NULL, LCC_MMAP); fault_ret = ll_filemap_fault(vma, vmf); From 6a91fe6be3a39bb214f97d72549dcff8ebbeebf0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 09:56:18 -0500 Subject: [PATCH 083/737] NFS: Ensure contents of struct nfs_open_dir_context are consistent Ensure that the contents of struct nfs_open_dir_context are consistent by setting them under the file->f_lock from a private copy (that is known to be consistent). Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 2e7a46417952ae480cb5091ed5ade73078630b40) --- fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9f88ca7b20015..64f8803d2ddcb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -144,20 +144,23 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[]; }; -typedef struct { +typedef struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; unsigned long page_index; - u64 *dir_cookie; + u64 dir_cookie; u64 last_cookie; + u64 dup_cookie; loff_t current_index; loff_t prev_index; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; + unsigned long attr_gencount; unsigned int cache_entry_index; + signed char duped; bool plus; bool eof; } nfs_readdir_descriptor_t; @@ -273,7 +276,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri } index = (unsigned int)diff; - *desc->dir_cookie = array->array[index].cookie; + desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: @@ -298,33 +301,32 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int status = -EAGAIN; for (i = 0; i < array->size; i++) { - if (array->array[i].cookie == *desc->dir_cookie) { + if (array->array[i].cookie == desc->dir_cookie) { struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount || + if (desc->attr_gencount != nfsi->attr_gencount || !nfs_readdir_inode_mapping_valid(nfsi)) { - ctx->duped = 0; - ctx->attr_gencount = nfsi->attr_gencount; + desc->duped = 0; + desc->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->prev_index) { - if (ctx->duped > 0 - && ctx->dup_cookie == *desc->dir_cookie) { + if (desc->duped > 0 + && desc->dup_cookie == desc->dir_cookie) { if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. 
" "The file: %.*s has duplicate cookie %llu\n", desc->file, array->array[i].string.len, - array->array[i].string.name, *desc->dir_cookie); + array->array[i].string.name, desc->dir_cookie); } status = -ELOOP; goto out; } - ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = -1; + desc->dup_cookie = desc->dir_cookie; + desc->duped = -1; } if (nfs_readdir_use_cookie(desc->file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = new_pos; desc->prev_index = new_pos; @@ -334,7 +336,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des } if (array->eof_index >= 0) { status = -EBADCOOKIE; - if (*desc->dir_cookie == array->last_cookie) + if (desc->dir_cookie == array->last_cookie) desc->eof = true; } out: @@ -349,7 +351,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) array = kmap(desc->page); - if (*desc->dir_cookie == 0) + if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); @@ -801,7 +803,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) int i = 0; int res = 0; struct nfs_cache_array *array = NULL; - struct nfs_open_dir_context *ctx = file->private_data; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -814,22 +815,22 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) break; } if (i < (array->size-1)) - *desc->dir_cookie = array->array[i+1].cookie; + desc->dir_cookie = array->array[i+1].cookie; else - *desc->dir_cookie = array->last_cookie; + desc->dir_cookie = array->last_cookie; if (nfs_readdir_use_cookie(file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (ctx->duped != 0) - ctx->duped = 1; + if (desc->duped != 0) + desc->duped = 1; } if (array->eof_index >= 0) desc->eof = true; kunmap(desc->page); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)*desc->dir_cookie, res); + (unsigned long long)desc->dir_cookie, res); return res; } @@ -851,10 +852,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) struct page *page = NULL; int status; struct inode *inode = file_inode(desc->file); - struct nfs_open_dir_context *ctx = desc->file->private_data; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + (unsigned long long)desc->dir_cookie); page = alloc_page(GFP_HIGHUSER); if (!page) { @@ -863,9 +863,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) } desc->page_index = 0; - desc->last_cookie = *desc->dir_cookie; + desc->last_cookie = desc->dir_cookie; desc->page = page; - ctx->duped = 0; + desc->duped = 0; status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) @@ -894,7 +894,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_readdir_descriptor_t my_desc = { .file = file, .ctx = ctx, - .dir_cookie = &dir_ctx->dir_cookie, .plus = nfs_use_readdirplus(inode, ctx), }, *desc = &my_desc; @@ -915,13 +914,20 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) goto out; + spin_lock(&file->f_lock); + desc->dir_cookie = dir_ctx->dir_cookie; + desc->dup_cookie = dir_ctx->dup_cookie; + desc->duped = dir_ctx->duped; + desc->attr_gencount = dir_ctx->attr_gencount; + spin_unlock(&file->f_lock); + do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if 
(*desc->dir_cookie && !desc->eof) { + if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) @@ -946,6 +952,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; } while (!desc->eof); + + spin_lock(&file->f_lock); + dir_ctx->dir_cookie = desc->dir_cookie; + dir_ctx->dup_cookie = desc->dup_cookie; + dir_ctx->duped = desc->duped; + dir_ctx->attr_gencount = desc->attr_gencount; + spin_unlock(&file->f_lock); + out: if (res > 0) res = 0; From 940cd12123108cb104a6e0cf32a1925d52c57466 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:45:55 -0500 Subject: [PATCH 084/737] NFS: Clean up readdir struct nfs_cache_array Since the 'eof_index' is only ever used as a flag, make it so. Also add a flag to detect if the page has been completely filled. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit b1e21c97437f64d0a00f8fea1f9e64e77e0e4242) --- fs/nfs/dir.c | 66 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 64f8803d2ddcb..41042b3877a47 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -138,9 +138,10 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - int size; - int eof_index; u64 last_cookie; + unsigned int size; + unsigned char page_full : 1, + page_is_eof : 1; struct nfs_cache_array_entry array[]; }; @@ -172,7 +173,6 @@ void nfs_readdir_init_array(struct page *page) array = kmap_atomic(page); memset(array, 0, sizeof(struct nfs_cache_array)); - array->eof_index = -1; kunmap_atomic(array); } @@ -192,6 +192,17 @@ void nfs_readdir_clear_array(struct page *page) kunmap_atomic(array); } +static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) +{ + array->page_is_eof = 1; + array->page_full = 1; +} + +static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) +{ + return array->page_full; +} + /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in @@ -213,6 +224,23 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le return 0; } +/* + * Check that the next array entry lies entirely within the page bounds + */ +static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) +{ + struct nfs_cache_array_entry *cache_entry; + + if (array->page_full) + return -ENOSPC; + cache_entry = &array->array[array->size + 1]; + if ((char *)cache_entry - (char *)array > PAGE_SIZE) { + array->page_full = 1; + return -ENOSPC; + } + return 0; +} + static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { @@ -220,13 +248,11 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) struct nfs_cache_array_entry *cache_entry; int ret; - cache_entry = &array->array[array->size]; - - /* Check that this entry lies within the page bounds */ - ret = -ENOSPC; - if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + ret = nfs_readdir_array_can_expand(array); + if (ret) goto out; + cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; @@ -236,12 +262,21 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) array->last_cookie = entry->cookie; array->size++; if (entry->eof != 0) - array->eof_index = 
array->size; + nfs_readdir_array_set_eof(array); out: kunmap(page); return ret; } +static void nfs_readdir_page_set_eof(struct page *page) +{ + struct nfs_cache_array *array; + + array = kmap_atomic(page); + nfs_readdir_array_set_eof(array); + kunmap_atomic(array); +} + static inline int is_32bit_api(void) { @@ -270,7 +305,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index >= 0) + if (array->page_is_eof) goto out_eof; return -EAGAIN; } @@ -334,7 +369,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des return 0; } } - if (array->eof_index >= 0) { + if (array->page_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; @@ -566,7 +601,6 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; - struct nfs_cache_array *array; unsigned int count = 0; int status; @@ -604,10 +638,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); - array->eof_index = array->size; + nfs_readdir_page_set_eof(page); status = 0; - kunmap(page); } put_page(scratch); @@ -689,7 +721,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, status = 0; break; } - } while (array->eof_index < 0); + } while (!nfs_readdir_array_is_full(array)); nfs_readdir_free_pages(pages, array_size); out_release_array: @@ -825,7 +857,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) if (desc->duped != 0) desc->duped = 1; } - if (array->eof_index >= 0) + if (array->page_is_eof) desc->eof = true; kunmap(desc->page); From fa544fcf702843673a37ecfe18fc43ea334e5ed4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 17:15:43 -0500 Subject: [PATCH 085/737] NFS: Clean up nfs_readdir_page_filler() Clean up handling of the case where there are no entries in the readdir reply. 
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 972bcdf233096d36b2f3e02f34a80d0f073b6b05) --- fs/nfs/dir.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 41042b3877a47..e08943048ec7d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -601,16 +601,12 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; - unsigned int count = 0; int status; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; - if (buflen == 0) - goto out_nopages; - xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -619,27 +615,27 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en entry->label->len = NFS4_MAXLABELLEN; status = xdr_decode(desc, entry, &stream); - if (status != 0) { - if (status == -EAGAIN) - status = 0; + if (status != 0) break; - } - - count++; if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); - if (status != 0) - break; - } while (!entry->eof); + } while (!status && !entry->eof); -out_nopages: - if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - nfs_readdir_page_set_eof(page); + switch (status) { + case -EBADCOOKIE: + if (entry->eof) { + nfs_readdir_page_set_eof(page); + status = 0; + } + break; + case -ENOSPC: + case -EAGAIN: status = 0; + break; } put_page(scratch); @@ -714,14 +710,15 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (status < 0) break; + pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - if (status < 0) { - if (status == -ENOSPC) - status = 0; + if (pglen == 0) { + nfs_readdir_page_set_eof(page); break; } - } while (!nfs_readdir_array_is_full(array)); + + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + } while (!status && !nfs_readdir_array_is_full(array)); nfs_readdir_free_pages(pages, array_size); out_release_array: From 06608bce64d654d272c3bcdfcd7d5dbff5f00ea5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 12:34:43 -0500 Subject: [PATCH 086/737] NFS: Clean up directory array handling Refactor to use pagecache_get_page() so that we can fill the page in multiple stages. 
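The refactored nfs_readdir_page_get_locked() calls grab_cache_page(); on kernels of this vintage that is a thin wrapper around pagecache_get_page(), roughly equivalent to the following sketch (illustration only, not part of this patch):

    /* Find or create the page at @index and return it locked. */
    static inline struct page *grab_cache_page(struct address_space *mapping,
                                               pgoff_t index)
    {
            return pagecache_get_page(mapping, index,
                                      FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                      mapping_gfp_mask(mapping));
    }

Because SetPageUptodate() is only called after the embedded array has been initialized with its starting cookie, a page that is not yet full can be looked up again and filled further on a later pass.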
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 1f1d4aa4e4bcb4721d3c51f4c07dda790b6accd9) --- fs/nfs/dir.c | 138 ++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 61 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e08943048ec7d..a98eab31d92ac 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -149,7 +149,7 @@ typedef struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; - unsigned long page_index; + pgoff_t page_index; u64 dir_cookie; u64 last_cookie; u64 dup_cookie; @@ -166,13 +166,18 @@ typedef struct nfs_readdir_descriptor { bool eof; } nfs_readdir_descriptor_t; -static -void nfs_readdir_init_array(struct page *page) +static void nfs_readdir_array_init(struct nfs_cache_array *array) +{ + memset(array, 0, sizeof(struct nfs_cache_array)); +} + +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) { struct nfs_cache_array *array; array = kmap_atomic(page); - memset(array, 0, sizeof(struct nfs_cache_array)); + nfs_readdir_array_init(array); + array->last_cookie = last_cookie; kunmap_atomic(array); } @@ -188,7 +193,7 @@ void nfs_readdir_clear_array(struct page *page) array = kmap_atomic(page); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); - array->size = 0; + nfs_readdir_array_init(array); kunmap_atomic(array); } @@ -268,6 +273,44 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) return ret; } +static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, + pgoff_t index, u64 last_cookie) +{ + struct page *page; + + page = grab_cache_page(mapping, index); + if (page && !PageUptodate(page)) { + nfs_readdir_page_init_array(page, last_cookie); + if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) + nfs_zap_mapping(mapping->host, mapping); + SetPageUptodate(page); + } + + return page; +} + +static u64 nfs_readdir_page_last_cookie(struct page *page) +{ + struct nfs_cache_array *array; + u64 ret; + + array = kmap_atomic(page); + ret = array->last_cookie; + kunmap_atomic(array); + return ret; +} + +static bool nfs_readdir_page_needs_filling(struct page *page) +{ + struct nfs_cache_array *array; + bool ret; + + array = kmap_atomic(page); + ret = !nfs_readdir_array_is_full(array); + kunmap_atomic(array); + return ret; +} + static void nfs_readdir_page_set_eof(struct page *page) { struct nfs_cache_array *array; @@ -682,10 +725,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); - nfs_readdir_init_array(page); - entry.prev_cookie = 0; - entry.cookie = desc->last_cookie; + entry.cookie = nfs_readdir_page_last_cookie(page); entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); @@ -730,48 +771,25 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, return status; } -/* - * Now we cache directories properly, by converting xdr information - * to an array that can be used for lookups later. This results in - * fewer cache pages, since we can store more information on each page. 
- * We only need to convert from xdr once so future lookups are much simpler - */ -static -int nfs_readdir_filler(void *data, struct page* page) +static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc) { - nfs_readdir_descriptor_t *desc = data; - struct inode *inode = file_inode(desc->file); - int ret; - - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; - SetPageUptodate(page); - - if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); - return 0; - error: - nfs_readdir_clear_array(page); - unlock_page(page); - return ret; + put_page(desc->page); + desc->page = NULL; } -static -void cache_page_release(nfs_readdir_descriptor_t *desc) +static void +nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - put_page(desc->page); - desc->page = NULL; + unlock_page(desc->page); + nfs_readdir_page_put(desc); } -static -struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +static struct page * +nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) { - return read_cache_page(desc->file->f_mapping, desc->page_index, - nfs_readdir_filler, desc); + return nfs_readdir_page_get_locked(desc->file->f_mapping, + desc->page_index, + desc->last_cookie); } /* @@ -785,23 +803,21 @@ int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) struct nfs_inode *nfsi = NFS_I(inode); int res; - desc->page = get_cache_page(desc); - if (IS_ERR(desc->page)) - return PTR_ERR(desc->page); - res = lock_page_killable(desc->page); - if (res != 0) - goto error; - res = -EAGAIN; - if (desc->page->mapping != NULL) { - res = nfs_readdir_search_array(desc); - if (res == 0) { - nfsi->page_index = desc->page_index; - return 0; - } + desc->page = nfs_readdir_page_get_cached(desc); + if (!desc->page) + return -ENOMEM; + if (nfs_readdir_page_needs_filling(desc->page)) { + res = nfs_readdir_xdr_to_array(desc, desc->page, inode); + if (res < 0) + goto error; + } + res = nfs_readdir_search_array(desc); + if (res == 0) { + nfsi->page_index = desc->page_index; + return 0; } - unlock_page(desc->page); error: - cache_page_release(desc); + nfs_readdir_page_unlock_and_put_cached(desc); return res; } @@ -896,6 +912,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) desc->page = page; desc->duped = 0; + nfs_readdir_page_init_array(page, desc->dir_cookie); status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) goto out_release; @@ -904,7 +921,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) out_release: nfs_readdir_clear_array(desc->page); - cache_page_release(desc); + nfs_readdir_page_put(desc); out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); @@ -976,8 +993,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; res = nfs_do_filldir(desc); - unlock_page(desc->page); - cache_page_release(desc); + nfs_readdir_page_unlock_and_put_cached(desc); if (res < 0) break; } while (!desc->eof); From 648d0807eaa8091ed2d76d935e19027e62972285 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:14:10 -0500 Subject: [PATCH 087/737] NFS: Don't discard readdir results If a readdir call returns more data than we can fit into one page cache page, then allocate a new one for that data rather than discarding the data. 
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 3b2a09f127e025674945e82c1ec0c88d6740280e) --- fs/nfs/dir.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a98eab31d92ac..abf43fb6c0e80 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -320,6 +320,26 @@ static void nfs_readdir_page_set_eof(struct page *page) kunmap_atomic(array); } +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + +static struct page *nfs_readdir_page_get_next(struct address_space *mapping, + pgoff_t index, u64 cookie) +{ + struct page *page; + + page = nfs_readdir_page_get_locked(mapping, index, cookie); + if (page) { + if (nfs_readdir_page_last_cookie(page) == cookie) + return page; + nfs_readdir_page_unlock_and_put(page); + } + return NULL; +} + static inline int is_32bit_api(void) { @@ -637,13 +657,15 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, } /* Perform conversion from xdr to cache array */ -static -int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page **xdr_pages, struct page *page, unsigned int buflen) +static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, + struct page *fillme, unsigned int buflen) { + struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct page *scratch, *new, *page = fillme; int status; scratch = alloc_page(GFP_KERNEL); @@ -666,6 +688,19 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); + if (status != -ENOSPC) + continue; + + if (page->mapping != mapping) + break; + new = nfs_readdir_page_get_next(mapping, page->index + 1, + entry->prev_cookie); + if (!new) + break; + if (page != fillme) + nfs_readdir_page_unlock_and_put(page); + page = new; + status = nfs_readdir_add_to_array(entry, page); } while (!status && !entry->eof); switch (status) { @@ -681,6 +716,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } + if (page != fillme) + nfs_readdir_page_unlock_and_put(page); + put_page(scratch); return status; } From 6373087b2ad3c2cdee3b20cb84347ee5682b85ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:31:20 -0500 Subject: [PATCH 088/737] NFS: Remove unnecessary kmap in nfs_readdir_xdr_to_array() The kmapped pointer is only used once per loop to check if we need to exit. 
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit e762a639816015a70bb1af8ea4baf54f4facb591) --- fs/nfs/dir.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index abf43fb6c0e80..1c2fe400344b0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -759,7 +759,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct page *pages[NFS_MAX_READDIR_PAGES]; struct nfs_entry entry; struct file *file = desc->file; - struct nfs_cache_array *array; int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); @@ -778,11 +777,9 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - array = kmap(page); - status = nfs_readdir_alloc_pages(pages, array_size); if (status < 0) - goto out_release_array; + goto out_release_label; do { unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); @@ -797,11 +794,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, } status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - } while (!status && !nfs_readdir_array_is_full(array)); + } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); -out_release_array: - kunmap(page); +out_release_label: nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); From d8d68ff1d2d6a21d58b5452fd7dcb8e5d70ce92f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:34:32 -0500 Subject: [PATCH 089/737] NFS: Replace kmap() with kmap_atomic() in nfs_readdir_search_array() Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit ed09222d651dbd30e707f96180628229146b885c) --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1c2fe400344b0..8df86c8fe2490 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -447,7 +447,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array; int status; - array = kmap(desc->page); + array = kmap_atomic(desc->page); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -459,7 +459,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - kunmap(desc->page); + kunmap_atomic(array); return status; } From c85df5c4177cfa428fcda1903e196bc73de7afa6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 19:17:29 -0500 Subject: [PATCH 090/737] NFS: Simplify struct nfs_cache_array_entry We don't need to store a hash, so replace struct qstr with a simple const char pointer and length. 
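Rough arithmetic behind the new cap, assuming 4 KiB pages (the effective dtsize is still clamped by the server's dtpref and by rsize):

    old cap: NFS_MAX_READDIR_PAGES * PAGE_SIZE = 8 * 4 KiB = 32 KiB per READDIR
    new cap: dtsize <= NFS_MAX_FILE_IO_SIZE = 1 MiB
             array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT = 256 pages

So a single READDIR round trip can now return up to 32 times as much directory data, which is what cuts the number of RPC calls needed for large directories.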
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit a52a8a6adad99e1162c27f70cd6495626a48064d) --- fs/nfs/dir.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8df86c8fe2490..483de9c6064af 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,7 +133,8 @@ nfs_closedir(struct inode *inode, struct file *filp) struct nfs_cache_array_entry { u64 cookie; u64 ino; - struct qstr string; + const char *name; + unsigned int name_len; unsigned char d_type; }; @@ -192,7 +193,7 @@ void nfs_readdir_clear_array(struct page *page) array = kmap_atomic(page); for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); + kfree(array->array[i].name); nfs_readdir_array_init(array); kunmap_atomic(array); } @@ -213,20 +214,17 @@ static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ -static -int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { - string->len = len; - string->name = kmemdup_nul(name, len, GFP_KERNEL); - if (string->name == NULL) - return -ENOMEM; + const char *ret = kmemdup_nul(name, len, GFP_KERNEL); + /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ - kmemleak_not_leak(string->name); - string->hash = full_name_hash(NULL, name, len); - return 0; + if (ret != NULL) + kmemleak_not_leak(ret); + return ret; } /* @@ -249,27 +247,34 @@ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = kmap(page); + struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; + const char *name; int ret; + name = nfs_readdir_copy_name(entry->name, entry->len); + if (!name) + return -ENOMEM; + + array = kmap_atomic(page); ret = nfs_readdir_array_can_expand(array); - if (ret) + if (ret) { + kfree(name); goto out; + } cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; - ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); - if (ret) - goto out; + cache_entry->name_len = entry->len; + cache_entry->name = name; array->last_cookie = entry->cookie; array->size++; if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: - kunmap(page); + kunmap_atomic(array); return ret; } @@ -413,9 +418,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. 
" - "The file: %.*s has duplicate cookie %llu\n", - desc->file, array->array[i].string.len, - array->array[i].string.name, desc->dir_cookie); + "The file: %s has duplicate cookie %llu\n", + desc->file, array->array[i].name, desc->dir_cookie); } status = -ELOOP; goto out; @@ -888,7 +892,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) struct nfs_cache_array_entry *ent; ent = &array->array[i]; - if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eof = true; break; From 4fd75410760d537b3541ede4d8b932e9c1d06480 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 14:26:47 -0500 Subject: [PATCH 091/737] NFS: Support larger readdir buffers Support readdir buffers of up to 1MB in size so that we can read large directories using few RPC calls. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 1a34c8c9a49ee10ccaf91091ddd98c25e4d567dd) --- fs/nfs/client.c | 4 ++-- fs/nfs/dir.c | 33 +++++++++++++++++++-------------- fs/nfs/internal.h | 6 ------ 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 818ff8b1b99da..2203ea6cf2684 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -781,8 +781,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES) - server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > NFS_MAX_FILE_IO_SIZE) + server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 483de9c6064af..1470cc7384e42 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -727,44 +727,47 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, return status; } -static -void nfs_readdir_free_pages(struct page **pages, unsigned int npages) +static void nfs_readdir_free_pages(struct page **pages, size_t npages) { - unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + while (npages--) + put_page(pages[npages]); + kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call * to nfs_readdir_free_pages() */ -static -int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) +static struct page **nfs_readdir_alloc_pages(size_t npages) { - unsigned int i; + struct page **pages; + size_t i; + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } - return 0; + return pages; out_freepages: nfs_readdir_free_pages(pages, i); - return -ENOMEM; + return NULL; } static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { - struct page *pages[NFS_MAX_READDIR_PAGES]; + struct page **pages; struct nfs_entry entry; struct file *file = desc->file; + size_t array_size; + size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; - unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; entry.cookie = nfs_readdir_page_last_cookie(page); @@ -781,9 +784,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - 
status = nfs_readdir_alloc_pages(pages, array_size); - if (status < 0) + array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = nfs_readdir_alloc_pages(array_size); + if (!pages) goto out_release_label; + do { unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a7e0970b5bfe1..597adbfe15476 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -66,12 +66,6 @@ static inline fmode_t flags_to_mode(int flags) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) -/* - * Maximum number of pages that readdir can use for creating - * a vmapped array of pages. - */ -#define NFS_MAX_READDIR_PAGES 8 - struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ From 3459cb6cbd0b24fbdc980cd74974c82a4e411c10 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 15:24:41 -0500 Subject: [PATCH 092/737] NFS: More readdir cleanups Remove the redundant caching of the credential in struct nfs_open_dir_context. Pass the buffer size as an argument to nfs_readdir_xdr_filler(). Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 93b8959a0a8cf1b1a493efee9e8328681e111862) --- fs/nfs/dir.c | 25 +++++++++++-------------- include/linux/nfs_fs.h | 1 - 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1470cc7384e42..e121b421b6e01 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; @@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; - ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont spin_lock(&dir->i_lock); list_del(&ctx->list); spin_unlock(&dir->i_lock); - put_cred(ctx->cred); kfree(ctx); } @@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - ctx = alloc_nfs_open_dir_context(inode, current_cred()); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -468,12 +466,12 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + u64 cookie, struct page **pages, + size_t bufsize) { - struct nfs_open_dir_context *ctx = file->private_data; - const struct cred *cred = ctx->cred; + struct file *file = desc->file; + struct inode *inode = file_inode(file); unsigned long timestamp, gencount; int error; @@ -481,8 +479,8 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, timestamp = jiffies; 
gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, + cookie, pages, bufsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { @@ -764,7 +762,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, { struct page **pages; struct nfs_entry entry; - struct file *file = desc->file; size_t array_size; size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -791,8 +788,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); - + status = nfs_readdir_xdr_filler(desc, entry.cookie, + pages, dtsize); if (status < 0) break; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e39342945a80b..79a6c3018051a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -89,7 +89,6 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; - const struct cred *cred; unsigned long attr_gencount; __u64 dir_cookie; __u64 dup_cookie; From 1a960a88fbbdd4418a8ed8f055ed6d207683abf4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 18:20:03 -0500 Subject: [PATCH 093/737] NFS: nfs_do_filldir() does not return a value Clean up nfs_do_filldir(). Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit dbeaf8c984ca689c2c0966c41bd78dee178b5dfe) --- fs/nfs/dir.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e121b421b6e01..ca5c81094e588 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -881,13 +881,11 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ -static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; - int i = 0; - int res = 0; - struct nfs_cache_array *array = NULL; + struct nfs_cache_array *array; + unsigned int i = 0; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -914,9 +912,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) desc->eof = true; kunmap(desc->page); - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)desc->dir_cookie, res); - return res; + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", + (unsigned long long)desc->dir_cookie); } /* @@ -957,7 +954,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) if (status < 0) goto out_release; - status = nfs_do_filldir(desc); + nfs_do_filldir(desc); out_release: nfs_readdir_clear_array(desc->page); @@ -1032,10 +1029,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - res = nfs_do_filldir(desc); + nfs_do_filldir(desc); nfs_readdir_page_unlock_and_put_cached(desc); - if (res < 0) - break; } while (!desc->eof); spin_lock(&file->f_lock); @@ -1046,8 +1041,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) spin_unlock(&file->f_lock); out: - if (res > 0) - res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } From 32a6e14edbd6033611d04194207085679f17e972 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 08:55:03 -0500 Subject: [PATCH 094/737] NFS: Reduce readdir stack usage The descriptor and the struct nfs_entry are both large structures, so don't allocate them from the stack. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 6b75cf9e309d18664f964889ac026096ba0d1919) --- fs/nfs/dir.c | 58 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ca5c81094e588..f001b61678730 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -761,23 +761,24 @@ static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { struct page **pages; - struct nfs_entry entry; + struct nfs_entry *entry; size_t array_size; size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; - entry.prev_cookie = 0; - entry.cookie = nfs_readdir_page_last_cookie(page); - entry.eof = 0; - entry.fh = nfs_alloc_fhandle(); - entry.fattr = nfs_alloc_fattr(); - entry.server = NFS_SERVER(inode); - if (entry.fh == NULL || entry.fattr == NULL) + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->cookie = nfs_readdir_page_last_cookie(page); + entry->fh = nfs_alloc_fhandle(); + entry->fattr = nfs_alloc_fattr(); + entry->server = NFS_SERVER(inode); + if (entry->fh == NULL || entry->fattr == NULL) goto out; - entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(entry.label)) { - status = PTR_ERR(entry.label); + entry->label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry->label)) { + status = PTR_ERR(entry->label); goto out; } @@ -788,7 +789,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, entry.cookie, + status = nfs_readdir_xdr_filler(desc, entry->cookie, pages, 
dtsize); if (status < 0) break; @@ -799,15 +800,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, break; } - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + status = nfs_readdir_page_filler(desc, entry, pages, page, pglen); } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); out_release_label: - nfs4_label_free(entry.label); + nfs4_label_free(entry->label); out: - nfs_free_fattr(entry.fattr); - nfs_free_fhandle(entry.fh); + nfs_free_fattr(entry->fattr); + nfs_free_fhandle(entry->fh); + kfree(entry); return status; } @@ -974,13 +976,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct nfs_open_dir_context *dir_ctx = file->private_data; - nfs_readdir_descriptor_t my_desc = { - .file = file, - .ctx = ctx, - .plus = nfs_use_readdirplus(inode, ctx), - }, - *desc = &my_desc; - int res = 0; + struct nfs_readdir_descriptor *desc; + int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -992,10 +989,19 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) + if (res < 0) + goto out; + } + + res = -ENOMEM; + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) goto out; + desc->file = file; + desc->ctx = ctx; + desc->plus = nfs_use_readdirplus(inode, ctx); spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; @@ -1040,6 +1046,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->attr_gencount = desc->attr_gencount; spin_unlock(&file->f_lock); + kfree(desc); + out: dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; From e4a845fe9022d070c1b7f3fab34c8fb8c5d65ce3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Nov 2020 07:42:04 -0500 Subject: [PATCH 095/737] NFS: Cleanup to remove nfs_readdir_descriptor_t typedef Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 6c981eff23b894ce429281dc45a5589359eef2c1) --- fs/nfs/dir.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f001b61678730..ce84ddd7dc1e1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -144,7 +144,7 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[]; }; -typedef struct nfs_readdir_descriptor { +struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; @@ -163,7 +163,7 @@ typedef struct nfs_readdir_descriptor { signed char duped; bool plus; bool eof; -} nfs_readdir_descriptor_t; +}; static void nfs_readdir_array_init(struct nfs_cache_array *array) { @@ -362,8 +362,8 @@ bool nfs_readdir_use_cookie(const struct file *filp) return true; } -static -int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; @@ -394,8 +394,8 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) return 
!test_bit(NFS_INO_INVALIDATING, &nfsi->flags); } -static -int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { int i; loff_t new_pos; @@ -443,8 +443,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des return status; } -static -int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; @@ -497,7 +496,7 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, return error; } -static int xdr_decode(nfs_readdir_descriptor_t *desc, +static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); @@ -757,8 +756,8 @@ static struct page **nfs_readdir_alloc_pages(size_t npages) return NULL; } -static -int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, + struct page *page, struct inode *inode) { struct page **pages; struct nfs_entry *entry; @@ -838,8 +837,7 @@ nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. */ -static -int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) +static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); @@ -864,8 +862,7 @@ int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) } /* Search for desc->dir_cookie from the beginning of the page cache */ -static inline -int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; @@ -930,8 +927,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) * we should already have a complete representation of the * directory in the page cache by the time we get here. */ -static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc) +static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct page *page = NULL; int status; From b38a61b31cccf305fe2cfd4f9527fd3a8a31f936 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 17:34:23 -0500 Subject: [PATCH 096/737] NFS: Allow the NFS generic code to pass in a verifier to readdir If we're ever going to allow support for servers that use the readdir verifier, then that use needs to be managed by the middle layers as those need to be able to reject cookies from other verifiers. 
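To make the new calling convention concrete: instead of a long positional argument list, the caller fills an argument struct and reads the cookie verifier back out of a result struct. A minimal user-space sketch of that pattern follows; every name in it is illustrative and stands in for the kernel types touched by this patch, it is not the patch's code.

/* Illustrative sketch only: models the arg/result-struct readdir call,
 * including carrying the returned verifier into the next call. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_readdir_arg {
	uint64_t cookie;	/* resume point supplied by the caller */
	uint32_t verf[2];	/* verifier from a previous call */
	int plus;		/* ask for readdirplus-style attributes */
};

struct demo_readdir_res {
	uint32_t verf[2];	/* verifier returned by the "server" */
};

/* Stand-in for the ->readdir() method: rejects stale verifiers. */
static int demo_readdir(const struct demo_readdir_arg *arg,
			struct demo_readdir_res *res)
{
	static const uint32_t server_verf[2] = { 0xabcd, 0x1234 };

	/* A non-zero cookie is only valid with the matching verifier. */
	if (arg->cookie && memcmp(arg->verf, server_verf, sizeof(server_verf)))
		return -1;	/* caller should restart from cookie 0 */

	memcpy(res->verf, server_verf, sizeof(server_verf));
	return 0;
}

int main(void)
{
	struct demo_readdir_arg arg = { .cookie = 0, .plus = 1 };
	struct demo_readdir_res res;

	if (demo_readdir(&arg, &res) == 0) {
		/* Carry the returned verifier into the next call. */
		memcpy(arg.verf, res.verf, sizeof(arg.verf));
		arg.cookie = 42;
		printf("second call: %d\n", demo_readdir(&arg, &res));
	}
	return 0;
}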
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 82e22a5e6245873779db1607d3b0fec6f9ca07d0) --- fs/nfs/dir.c | 23 ++++++++++++++++++----- fs/nfs/nfs3proc.c | 35 +++++++++++++++++------------------ fs/nfs/nfs4proc.c | 36 +++++++++++++++++------------------- fs/nfs/proc.c | 18 +++++++++--------- include/linux/nfs_xdr.h | 17 +++++++++++++++-- 5 files changed, 76 insertions(+), 53 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce84ddd7dc1e1..7eeedd55560e4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -469,8 +469,20 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, u64 cookie, struct page **pages, size_t bufsize) { - struct file *file = desc->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(desc->file); + __be32 verf_res[2]; + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = NFS_I(inode)->cookieverf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; @@ -478,20 +490,21 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, - cookie, pages, bufsize, desc->plus); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; + memcpy(NFS_I(inode)->cookieverf, res.verf, + sizeof(NFS_I(inode)->cookieverf)); error: return error; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e1491def7124f..b915fe3abf355 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -637,37 +637,36 @@ nfs3_proc_rmdir(struct inode *dir, const struct qstr *name) * Also note that this implementation handles both plain readdir and * readdirplus. */ -static int -nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - plus? 
"plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -680,8 +679,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "", + status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b9567cc8698ed..76aa978d2f9b1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4985,41 +4985,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); if (!(server->caps & NFS_CAP_SECURITY_LABEL)) args.bitmask = server->attr_bitmask_nl; else args.bitmask = server->attr_bitmask; - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -5029,19 +5028,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, return status; } -static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 15c865cc837fa..73ab7c59d3a76 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. 
*/ -static int -nfs_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 33442fd018a06..05cd8f3875681 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -755,6 +755,20 @@ struct nfs_entry { struct nfs_server * server; }; +struct nfs_readdir_arg { + struct dentry *dentry; + const struct cred *cred; + __be32 *verf; + u64 cookie; + struct page **pages; + unsigned int page_len; + bool plus; +}; + +struct nfs_readdir_res { + __be32 *verf; +}; + /* * The following types are for NFSv2 only. */ @@ -1749,8 +1763,7 @@ struct nfs_rpc_ops { unsigned int, struct iattr *); int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); - int (*readdir) (struct dentry *, const struct cred *, - u64, struct page **, unsigned int, bool); + int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, From 9386c7c9640f09c3b46f95752c67a4862064c1e5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 20:11:32 -0500 Subject: [PATCH 097/737] NFS: Handle NFS4ERR_NOT_SAME and NFSERR_BADCOOKIE from readdir calls If the server returns NFS4ERR_NOT_SAME or tells us that the cookie is bad in response to a READDIR call, then we should empty the page cache so that we can fill it from scratch again. 
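In other words, both errors are treated as "this cookie no longer matches what the server has", and the right response is to drop the cached pages and retry rather than fail the listing. A hedged user-space sketch of that retry-after-invalidate loop, using made-up error values and helper names rather than the kernel's:

/* Illustrative sketch only: models the invalidate-and-retry flow. */
#include <stdio.h>

#define DEMO_EBADCOOKIE	523
#define DEMO_ENOTSYNC	522
#define DEMO_EAGAIN	11

static int attempts;

/* Stand-in for filling one directory page; fails once with a stale cookie. */
static int demo_fill_page(void)
{
	return attempts++ ? 0 : -DEMO_EBADCOOKIE;
}

static void demo_invalidate_cache(void)
{
	printf("dropping cached directory pages, restarting at page 0\n");
}

static int demo_find_page(void)
{
	int res = demo_fill_page();

	if (res == -DEMO_EBADCOOKIE || res == -DEMO_ENOTSYNC) {
		demo_invalidate_cache();
		return -DEMO_EAGAIN;	/* ask the caller to try again */
	}
	return res;
}

int main(void)
{
	int res;

	do {
		res = demo_find_page();
	} while (res == -DEMO_EAGAIN);
	printf("final result: %d\n", res);
	return 0;
}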
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 9fff59ed4c4d239125f8529a9971c46defd2e2b0) --- fs/nfs/dir.c | 24 ++++++++++++++++-------- fs/nfs/nfs4proc.c | 2 ++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7eeedd55560e4..45e3f3620bc1d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -861,15 +861,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { res = nfs_readdir_xdr_to_array(desc, desc->page, inode); - if (res < 0) - goto error; + if (res < 0) { + nfs_readdir_page_unlock_and_put_cached(desc); + if (res == -EBADCOOKIE || res == -ENOTSYNC) { + invalidate_inode_pages2(desc->file->f_mapping); + desc->page_index = 0; + return -EAGAIN; + } + return res; + } } res = nfs_readdir_search_array(desc); if (res == 0) { nfsi->page_index = desc->page_index; return 0; } -error: nfs_readdir_page_unlock_and_put_cached(desc); return res; } @@ -879,12 +885,12 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } do { + if (desc->page_index == 0) { + desc->current_index = 0; + desc->prev_index = 0; + desc->last_cookie = 0; + } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -1030,6 +1036,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) res = uncached_readdir(desc); if (res == 0) continue; + if (res == -EBADCOOKIE || res == -ENOTSYNC) + res = 0; } break; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 76aa978d2f9b1..bb2ecba49937b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -190,6 +190,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_FILE_OPEN: return -EBUSY; + case -NFS4ERR_NOT_SAME: + return -ENOTSYNC; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); From a50a6c0bc894e7d406ea8fad16b8b182ba8ed535 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 20:06:12 -0500 Subject: [PATCH 098/737] NFS: Improve handling of directory verifiers If the server insists on using the readdir verifiers in order to allow cookies to expire, then we should ensure that we cache the verifier with the cookie, so that we can return an error if the application tries to use the expired cookie. 
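The idea is simply that a saved cookie is only meaningful together with the verifier that was current when it was handed out. A small illustrative sketch of that pairing, with hypothetical names rather than the kernel structures:

/* Illustrative sketch only: pairing each saved cookie with the verifier that
 * produced it, so a later call with a stale verifier can be refused. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_dir_ctx {
	uint64_t dir_cookie;
	uint32_t verf[2];	/* verifier captured when the cookie was issued */
};

static int demo_cookie_still_valid(const struct demo_dir_ctx *ctx,
				   const uint32_t current_verf[2])
{
	/* Cookie 0 (start of directory) never needs a verifier. */
	if (ctx->dir_cookie == 0)
		return 1;
	return memcmp(ctx->verf, current_verf, sizeof(ctx->verf)) == 0;
}

int main(void)
{
	struct demo_dir_ctx ctx = { .dir_cookie = 7, .verf = { 1, 2 } };
	const uint32_t new_verf[2] = { 3, 4 };	/* server issued a new one */

	printf("valid with old verifier: %d\n",
	       demo_cookie_still_valid(&ctx, ctx.verf));
	printf("valid after verifier change: %d\n",
	       demo_cookie_still_valid(&ctx, new_verf));
	return 0;
}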
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit b593c09f83a2732a0f0298c8f3468236a83cdd9f) --- fs/nfs/dir.c | 35 +++++++++++++++++++++++------------ fs/nfs/inode.c | 7 ------- include/linux/nfs_fs.h | 8 +++++++- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 45e3f3620bc1d..a6c7362c0e73d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -155,6 +155,7 @@ struct nfs_readdir_descriptor { loff_t current_index; loff_t prev_index; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; @@ -466,15 +467,15 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, - u64 cookie, struct page **pages, - size_t bufsize) + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { struct inode *inode = file_inode(desc->file); - __be32 verf_res[2]; struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, - .verf = NFS_I(inode)->cookieverf, + .verf = verf, .cookie = cookie, .pages = pages, .page_len = bufsize, @@ -503,8 +504,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, } desc->timestamp = timestamp; desc->gencount = gencount; - memcpy(NFS_I(inode)->cookieverf, res.verf, - sizeof(NFS_I(inode)->cookieverf)); error: return error; } @@ -770,11 +769,13 @@ static struct page **nfs_readdir_alloc_pages(size_t npages) } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, - struct page *page, struct inode *inode) + struct page *page, __be32 *verf_arg, + __be32 *verf_res) { struct page **pages; struct nfs_entry *entry; size_t array_size; + struct inode *inode = file_inode(desc->file); size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -801,8 +802,9 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, entry->cookie, - pages, dtsize); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, + pages, dtsize, + verf_res); if (status < 0) break; @@ -854,13 +856,15 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; desc->page = nfs_readdir_page_get_cached(desc); if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { - res = nfs_readdir_xdr_to_array(desc, desc->page, inode); + res = nfs_readdir_xdr_to_array(desc, desc->page, + nfsi->cookieverf, verf); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); if (res == -EBADCOOKIE || res == -ENOTSYNC) { @@ -870,6 +874,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } + memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { @@ -902,6 +907,7 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; + struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -915,6 +921,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) 
desc->eof = true; break; } + memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -949,8 +956,8 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct page *page = NULL; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status; - struct inode *inode = file_inode(desc->file); dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)desc->dir_cookie); @@ -967,7 +974,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->duped = 0; nfs_readdir_page_init_array(page, desc->dir_cookie); - status = nfs_readdir_xdr_to_array(desc, page, inode); + status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf); if (status < 0) goto out_release; @@ -1023,6 +1030,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->dup_cookie = dir_ctx->dup_cookie; desc->duped = dir_ctx->duped; desc->attr_gencount = dir_ctx->attr_gencount; + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); do { @@ -1061,6 +1069,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->dup_cookie = desc->dup_cookie; dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); kfree(desc); @@ -1101,6 +1110,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = offset; else dir_ctx->dir_cookie = 0; + if (offset == 0) + memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; } spin_unlock(&filp->f_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 36f415278c042..16745122ba2c1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -244,7 +244,6 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA @@ -1252,7 +1251,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { @@ -1265,11 +1263,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 79a6c3018051a..dcd1f99e92e22 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,6 +45,11 @@ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* + * Size of the NFS directory verifier + */ +#define NFS_DIR_VERIFIER_SIZE 2 + /* * NFSv3/v4 Access mode cache entry */ @@ -90,6 +95,7 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; unsigned long attr_gencount; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; signed char duped; @@ -157,7 +163,7 @@ struct nfs_inode { * This is the cookie verifier used for NFSv3 readdir * operations */ - __be32 cookieverf[2]; + __be32 
cookieverf[NFS_DIR_VERIFIER_SIZE]; atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; From d074a68519bab9448a6ae95f9a0813305c3238de Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Nov 2020 08:32:19 -0500 Subject: [PATCH 099/737] NFS: Optimisations for monotonically increasing readdir cookies If the server is handing out monotonically increasing readdir cookie values, then we can optimise away searches through pages that contain cookies that lie outside our search range. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 762567b7c798afd08c22811ecfc66885a2b50f91) --- fs/nfs/dir.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a6c7362c0e73d..3c2c9bb3e81f4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -140,7 +140,8 @@ struct nfs_cache_array { u64 last_cookie; unsigned int size; unsigned char page_full : 1, - page_is_eof : 1; + page_is_eof : 1, + cookies_are_ordered : 1; struct nfs_cache_array_entry array[]; }; @@ -178,6 +179,7 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) array = kmap_atomic(page); nfs_readdir_array_init(array); array->last_cookie = last_cookie; + array->cookies_are_ordered = 1; kunmap_atomic(array); } @@ -269,6 +271,8 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) cache_entry->name_len = entry->len; cache_entry->name = name; array->last_cookie = entry->cookie; + if (array->last_cookie <= cache_entry->cookie) + array->cookies_are_ordered = 0; array->size++; if (entry->eof != 0) nfs_readdir_array_set_eof(array); @@ -395,6 +399,19 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); } +static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, + u64 cookie) +{ + if (!array->cookies_are_ordered) + return true; + /* Optimisation for monotonically increasing cookies */ + if (cookie >= array->last_cookie) + return false; + if (array->size && cookie < array->array[0].cookie) + return false; + return true; +} + static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { @@ -402,6 +419,9 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, loff_t new_pos; int status = -EAGAIN; + if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) + goto check_eof; + for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); @@ -435,6 +455,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, return 0; } } +check_eof: if (array->page_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) From 8eff98d6b5587312a7b7cee7216165e48e32287b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 20:38:47 -0500 Subject: [PATCH 100/737] NFS: Reduce number of RPC calls when doing uncached readdir If we're doing uncached readdir, allocate multiple pages in order to try to avoid duplicate RPC calls for the same getdents() call. 
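Roughly, the decoded entries from one batch of replies are spilled across several pre-allocated arrays instead of being fetched again when one array fills up. A toy user-space sketch of that spill logic, with invented sizes and names:

/* Illustrative sketch only: spilling decoded entries across several
 * pre-allocated arrays so one batch of replies can serve one getdents() call. */
#include <stdio.h>

#define DEMO_NARRAYS		4
#define DEMO_ENTRIES_PER_ARRAY	3

int main(void)
{
	int arrays[DEMO_NARRAYS][DEMO_ENTRIES_PER_ARRAY];
	int entry, a = 0, slot = 0;

	/* Pretend one readdir "RPC" returned 10 entries. */
	for (entry = 0; entry < 10; entry++) {
		if (slot == DEMO_ENTRIES_PER_ARRAY) {
			/* Current array is full: move to the next one instead
			 * of asking the server for the same entries again. */
			if (++a == DEMO_NARRAYS)
				break;
			slot = 0;
		}
		arrays[a][slot++] = entry;
	}
	printf("stored %d entries in %d array(s); last entry stored: %d\n",
	       entry, a + 1, arrays[a][slot - 1]);
	return 0;
}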
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 35df59d3ef693292840a61cdb04b39d8c9412f4e) --- fs/nfs/dir.c | 105 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3c2c9bb3e81f4..89d1f82aa1300 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -199,6 +199,23 @@ void nfs_readdir_clear_array(struct page *page) kunmap_atomic(array); } +static struct page * +nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) +{ + struct page *page = alloc_page(gfp_flags); + if (page) + nfs_readdir_page_init_array(page, last_cookie); + return page; +} + +static void nfs_readdir_page_array_free(struct page *page) +{ + if (page) { + nfs_readdir_clear_array(page); + put_page(page); + } +} + static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->page_is_eof = 1; @@ -694,12 +711,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct page **xdr_pages, - struct page *fillme, unsigned int buflen) + unsigned int buflen, + struct page **arrays, + size_t narrays) { struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch, *new, *page = fillme; + struct page *scratch, *new, *page = *arrays; int status; scratch = alloc_page(GFP_KERNEL); @@ -725,15 +744,25 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, if (status != -ENOSPC) continue; - if (page->mapping != mapping) - break; - new = nfs_readdir_page_get_next(mapping, page->index + 1, - entry->prev_cookie); - if (!new) - break; - if (page != fillme) - nfs_readdir_page_unlock_and_put(page); - page = new; + if (page->mapping != mapping) { + if (!--narrays) + break; + new = nfs_readdir_page_array_alloc(entry->prev_cookie, + GFP_KERNEL); + if (!new) + break; + arrays++; + *arrays = page = new; + } else { + new = nfs_readdir_page_get_next(mapping, + page->index + 1, + entry->prev_cookie); + if (!new) + break; + if (page != *arrays) + nfs_readdir_page_unlock_and_put(page); + page = new; + } status = nfs_readdir_add_to_array(entry, page); } while (!status && !entry->eof); @@ -750,7 +779,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, break; } - if (page != fillme) + if (page != *arrays) nfs_readdir_page_unlock_and_put(page); put_page(scratch); @@ -790,10 +819,11 @@ static struct page **nfs_readdir_alloc_pages(size_t npages) } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, - struct page *page, __be32 *verf_arg, - __be32 *verf_res) + __be32 *verf_arg, __be32 *verf_res, + struct page **arrays, size_t narrays) { struct page **pages; + struct page *page = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); @@ -835,7 +865,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, break; } - status = nfs_readdir_page_filler(desc, entry, pages, page, pglen); + status = nfs_readdir_page_filler(desc, entry, pages, pglen, + arrays, narrays); } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); @@ -884,8 +915,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { - res = 
nfs_readdir_xdr_to_array(desc, desc->page, - nfsi->cookieverf, verf); + res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, + &desc->page, 1); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); if (res == -EBADCOOKIE || res == -ENOTSYNC) { @@ -976,37 +1007,39 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) */ static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page *page = NULL; + struct page **arrays; + size_t i, sz = 512; __be32 verf[NFS_DIR_VERIFIER_SIZE]; - int status; + int status = -ENOMEM; - dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", (unsigned long long)desc->dir_cookie); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - status = -ENOMEM; + arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); + if (!arrays) + goto out; + arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL); + if (!arrays[0]) goto out; - } desc->page_index = 0; desc->last_cookie = desc->dir_cookie; - desc->page = page; desc->duped = 0; - nfs_readdir_page_init_array(page, desc->dir_cookie); - status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf); - if (status < 0) - goto out_release; + status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - nfs_do_filldir(desc); + for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + desc->page = arrays[i]; + nfs_do_filldir(desc); + } + desc->page = NULL; - out_release: - nfs_readdir_clear_array(desc->page); - nfs_readdir_page_put(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __func__, status); + + for (i = 0; i < sz && arrays[i]; i++) + nfs_readdir_page_array_free(arrays[i]); +out: + kfree(arrays); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } From 38009397d7cee57558c404f15094a80bd534978a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 20:47:05 -0500 Subject: [PATCH 101/737] NFS: Do uncached readdir when we're seeking a cookie in an empty page cache If the directory is changing, causing the page cache to get invalidated while we are listing the contents, then the NFS client is currently forced to read in the entire directory contents from scratch, because it needs to perform a linear search for the readdir cookie. While this is not an issue for small directories, it does not scale to directories with millions of entries. In order to be able to deal with large directories that are changing, add a heuristic to ensure that if the page cache is empty, and we are searching for a cookie that is not the zero cookie, we just default to performing uncached readdir. 
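The heuristic itself is a simple predicate over three inputs: the cookie being resumed from, how many directory pages are cached, and how the directory size compares to one readdir transfer (dtsize). A standalone sketch of that shape, with hypothetical parameters standing in for the real inode and mapping state:

/* Illustrative sketch only: the shape of the "skip the page cache" decision. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool demo_dont_search_cache(uint64_t dir_cookie,
				   unsigned long cached_pages,
				   long long dir_size, unsigned int dtsize)
{
	/* Resuming mid-directory, nothing cached, and the directory is larger
	 * than one readdir transfer: refilling the whole cache would be wasteful. */
	return dir_cookie != 0 && cached_pages == 0 && dir_size > dtsize;
}

int main(void)
{
	printf("small dir, cold cache: %d\n",
	       demo_dont_search_cache(123, 0, 4096, 65536));
	printf("large dir, cold cache: %d\n",
	       demo_dont_search_cache(123, 0, 10 * 1024 * 1024, 65536));
	printf("large dir, warm cache: %d\n",
	       demo_dont_search_cache(123, 8, 10 * 1024 * 1024, 65536));
	return 0;
}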
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski (cherry picked from commit 794092c57f89c2c833da00f82f38a0afcb4033bc) --- fs/nfs/dir.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 89d1f82aa1300..800ea6a74fffa 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -937,11 +937,28 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) return res; } +static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc) +{ + struct address_space *mapping = desc->file->f_mapping; + struct inode *dir = file_inode(desc->file); + unsigned int dtsize = NFS_SERVER(dir)->dtsize; + loff_t size = i_size_read(dir); + + /* + * Default to uncached readdir if the page cache is empty, and + * we're looking for a non-zero cookie in a large directory. + */ + return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize; +} + /* Search for desc->dir_cookie from the beginning of the page cache */ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; + if (nfs_readdir_dont_search_cache(desc)) + return -EBADCOOKIE; + do { if (desc->page_index == 0) { desc->current_index = 0; From f537ee3e9d8f64d70e949f0e1449896c9f635fe5 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Thu, 4 Feb 2021 06:30:35 +0000 Subject: [PATCH 102/737] ena: update to 2.4.1 Update the ENA driver to version 2.4.1. Changelog since 2.4.0: **Bug Fixes** * Fix compilation error in kernels >= 5.10 **Minor Changes** * Make all module parameters readable Signed-off-by: Anchal Agarwal --- drivers/amazon/net/ena/ena_netdev.c | 11 +++++------ drivers/amazon/net/ena/ena_netdev.h | 2 +- drivers/amazon/net/ena/kcompat.h | 8 +++++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 9fe9515be80c3..590762a05b7b5 100755 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -46,15 +46,15 @@ MODULE_VERSION(DRV_MODULE_GENERATION); #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ NETIF_MSG_TX_DONE | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) static int debug = -1; -module_param(debug, int, 0); -MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); +module_param(debug, int, 0444); +MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); static int rx_queue_size = ENA_DEFAULT_RING_SIZE; -module_param(rx_queue_size, int, S_IRUGO); +module_param(rx_queue_size, int, 0444); MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Max value is 8K\n"); static int force_large_llq_header; -module_param(force_large_llq_header, int, S_IRUGO); +module_param(force_large_llq_header, int, 0444); MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n"); static int num_io_queues = ENA_MAX_NUM_IO_QUEUES; @@ -62,7 +62,7 @@ module_param(num_io_queues, int, 0444); MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device. The maximum value depends on the device and number of online CPUs.\n"); static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; -module_param(lpc_size, uint, 0); +module_param(lpc_size, uint, 0444); MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. 
If zero, the page cache is disabled. Max: 32\n"); static struct ena_aenq_handlers aenq_handlers; @@ -2572,7 +2572,6 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, int i; for (i = first_index; i < first_index + count; i++) { - napi_hash_del(&adapter->ena_napi[i].napi); netif_napi_del(&adapter->ena_napi[i].napi); #ifdef ENA_XDP_SUPPORT diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 48f648ef85f0b..144bfb5378c04 100755 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -26,7 +26,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 4 -#define DRV_MODULE_GEN_SUBMINOR 0 +#define DRV_MODULE_GEN_SUBMINOR 1 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 729a13eeabf98..35ab08cada988 100755 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -759,7 +759,8 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) #define ENA_GENERIC_PM_OPS #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0) +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) static inline int page_ref_count(struct page *page) { return atomic_read(&page->_count); @@ -771,8 +772,9 @@ static inline void page_ref_inc(struct page *page) } #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) -static inline struct page *dev_alloc_page() +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +static inline struct page *dev_alloc_page(void) { gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; From 25e51b3e7e8306891eb2385635aded3c5f9d8dd6 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Thu, 5 Sep 2019 16:25:03 +0000 Subject: [PATCH 103/737] Add Amazon EFA driver version 1.11.1 Add the EFA driver to the 5.10 tree. This squashes three cherry-picked commits from amazon-5.4.y/master: b85486ce "Add Amazon EFA driver version 1.4" 499ab763 "drivers/amazon: efa: update to 1.9.0" 2ba3701e "drivers/amazon: efa: update to 1.10.1" Then the driver is updated to 1.11.1. This is done because an update to config.h is necessary to solve a redefinition of structs in include/rdma/ib_verbs.h, so we may as well update while we're here. 
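For background on why config.h matters here: the out-of-tree driver builds one source tree against many kernels by testing HAVE_* feature macros that a configure step generates. A minimal illustration of that pattern, with a made-up macro and function rather than the EFA driver's actual code:

/* Illustrative sketch only: compile-time selection between two API shapes
 * driven by a generated HAVE_* macro. Names here are hypothetical. */
#include <stdio.h>

#define HAVE_NEW_CORE_API 1	/* would come from the generated config.h */

#ifdef HAVE_NEW_CORE_API
static int demo_register(const char *name, int flags)
{
	return printf("new-style register: %s (flags %d)\n", name, flags) < 0;
}
#else
static int demo_register(const char *name)
{
	return printf("old-style register: %s\n", name) < 0;
}
#endif

int main(void)
{
#ifdef HAVE_NEW_CORE_API
	return demo_register("efa-demo", 0);
#else
	return demo_register("efa-demo");
#endif
}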
Signed-off-by: Samuel Mendoza-Jonas --- drivers/amazon/Kconfig | 9 + drivers/amazon/net/Makefile | 1 + drivers/amazon/net/efa/Makefile | 11 + drivers/amazon/net/efa/config.h | 218 ++ drivers/amazon/net/efa/efa-abi.h | 163 + drivers/amazon/net/efa/efa.h | 343 ++ drivers/amazon/net/efa/efa_admin_cmds_defs.h | 932 ++++++ drivers/amazon/net/efa/efa_admin_defs.h | 134 + drivers/amazon/net/efa/efa_com.c | 1094 ++++++ drivers/amazon/net/efa/efa_com.h | 145 + drivers/amazon/net/efa/efa_com_cmd.c | 781 +++++ drivers/amazon/net/efa/efa_com_cmd.h | 319 ++ drivers/amazon/net/efa/efa_common_defs.h | 31 + drivers/amazon/net/efa/efa_gdr.c | 225 ++ drivers/amazon/net/efa/efa_gdr.h | 31 + drivers/amazon/net/efa/efa_main.c | 951 ++++++ drivers/amazon/net/efa/efa_regs_defs.h | 96 + drivers/amazon/net/efa/efa_sysfs.c | 38 + drivers/amazon/net/efa/efa_sysfs.h | 15 + drivers/amazon/net/efa/efa_verbs.c | 3140 ++++++++++++++++++ drivers/amazon/net/efa/kcompat.h | 189 ++ 21 files changed, 8866 insertions(+) create mode 100644 drivers/amazon/net/efa/Makefile create mode 100644 drivers/amazon/net/efa/config.h create mode 100644 drivers/amazon/net/efa/efa-abi.h create mode 100644 drivers/amazon/net/efa/efa.h create mode 100644 drivers/amazon/net/efa/efa_admin_cmds_defs.h create mode 100644 drivers/amazon/net/efa/efa_admin_defs.h create mode 100644 drivers/amazon/net/efa/efa_com.c create mode 100644 drivers/amazon/net/efa/efa_com.h create mode 100644 drivers/amazon/net/efa/efa_com_cmd.c create mode 100644 drivers/amazon/net/efa/efa_com_cmd.h create mode 100644 drivers/amazon/net/efa/efa_common_defs.h create mode 100644 drivers/amazon/net/efa/efa_gdr.c create mode 100644 drivers/amazon/net/efa/efa_gdr.h create mode 100644 drivers/amazon/net/efa/efa_main.c create mode 100644 drivers/amazon/net/efa/efa_regs_defs.h create mode 100644 drivers/amazon/net/efa/efa_sysfs.c create mode 100644 drivers/amazon/net/efa/efa_sysfs.h create mode 100644 drivers/amazon/net/efa/efa_verbs.c create mode 100644 drivers/amazon/net/efa/kcompat.h diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig index a51d1cfea64b4..68b807d53c462 100644 --- a/drivers/amazon/Kconfig +++ b/drivers/amazon/Kconfig @@ -21,4 +21,13 @@ config AMAZON_ENA_ETHERNET To compile this driver as a module, choose M here. The module will be called ena. +config AMAZON_EFA_INFINIBAND + tristate "Elastic Fabric Adapter (EFA) support" + depends on INFINIBAND_USER_ACCESS && AMAZON_ENA_ETHERNET + help + This driver support Elastic Fabric Adapter (EFA) + + To compile this driver as a module, choose M here. + The module will be called efa + endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile index d59ce86b1311d..197c9a061fdf0 100644 --- a/drivers/amazon/net/Makefile +++ b/drivers/amazon/net/Makefile @@ -2,3 +2,4 @@ # Amazon Driver Updates # obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena/ +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa/ diff --git a/drivers/amazon/net/efa/Makefile b/drivers/amazon/net/efa/Makefile new file mode 100644 index 0000000000000..9c4acbe2942a4 --- /dev/null +++ b/drivers/amazon/net/efa/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Elastic Fabric Adapter (EFA) device drivers. +# EFA Source is: https://github.com/amzn/amzn-drivers. 
+ +obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o + +efa-y := efa_com.o efa_com_cmd.o efa_main.o efa_verbs.o + +efa-$(CONFIG_SYSFS) += efa_sysfs.o + +ccflags-y += -include $(srctree)/drivers/amazon/net/efa/config.h diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h new file mode 100644 index 0000000000000..127e3a9f218e3 --- /dev/null +++ b/drivers/amazon/net/efa/config.h @@ -0,0 +1,218 @@ +/* src/config.h. Generated from config.h.in by configure. */ +/* src/config.h.in. Generated from configure.ac by autoheader. */ + +/* have ah core allocation */ +#define HAVE_AH_CORE_ALLOCATION 1 + +/* destroy_ah has return code again */ +#define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 + +/* have device ops alloc_pd without ucontext */ +#define HAVE_ALLOC_PD_NO_UCONTEXT 1 + +/* atomic64_fetch_inc exists */ +#define HAVE_ATOMIC64_FETCH_INC 1 + +/* have bitfield.h */ +#define HAVE_BITFIELD_H 1 + +/* have core mmap xarray */ +#define HAVE_CORE_MMAP_XA 1 + +/* have cq core allocation */ +#define HAVE_CQ_CORE_ALLOCATION 1 + +/* rdma_ah_init_attr exists */ +#define HAVE_CREATE_AH_INIT_ATTR 1 + +/* create_ah doesn't have udata */ +/* #undef HAVE_CREATE_AH_NO_UDATA */ + +/* create_ah has rdma_attr */ +#define HAVE_CREATE_AH_RDMA_ATTR 1 + +/* create_ah has udata */ +/* #undef HAVE_CREATE_AH_UDATA */ + +/* create_cq has attr param */ +#define HAVE_CREATE_CQ_ATTR 1 + +/* have device ops create_cq without ucontext */ +/* #undef HAVE_CREATE_CQ_NO_UCONTEXT */ + +/* create/destroy_ah has flags */ +/* #undef HAVE_CREATE_DESTROY_AH_FLAGS */ + +/* have device ops dealloc pd has udata */ +/* #undef HAVE_DEALLOC_PD_UDATA */ + +/* dealloc_pd has udata and return code */ +#define HAVE_DEALLOC_PD_UDATA_RC 1 + +/* have device ops dereg mr udata */ +#define HAVE_DEREG_MR_UDATA 1 + +/* have device ops destroy cq udata */ +#define HAVE_DESTROY_CQ_UDATA 1 + +/* have device ops destroy qp udata */ +#define HAVE_DESTROY_QP_UDATA 1 + +/* dev has parent field */ +#define HAVE_DEV_PARENT 1 + +/* driver_id field exists */ +/* #undef HAVE_DRIVER_ID */ + +/* efa gdr enabled */ +/* #undef HAVE_EFA_GDR */ + +/* get_port_immutable exists */ +#define HAVE_GET_PORT_IMMUTABLE 1 + +/* have hw_stats */ +#define HAVE_HW_STATS 1 + +/* have ibdev print */ +#define HAVE_IBDEV_PRINT 1 + +/* have ibdev ratelimited print */ +#define HAVE_IBDEV_PRINT_RATELIMITED 1 + +/* IB_ACCESS_OPTIONAL exists */ +#define HAVE_IB_ACCESS_OPTIONAL 1 + +/* ib_device_ops has common fields */ +#define HAVE_IB_DEVICE_OPS_COMMON 1 + +/* struct ib_device_ops exists */ +#define HAVE_IB_DEV_OPS 1 + +/* destroy_cq has return code again */ +#define HAVE_IB_INT_DESTROY_CQ 1 + +/* have ib_is_udata_cleared */ +#define HAVE_IB_IS_UDATA_CLEARED 1 + +/* ib_modify_qp_is_ok has four params */ +#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 + +/* ib_mr has length field */ +#define HAVE_IB_MR_LENGTH 1 + +/* ib_mtu_int_to_enum exists */ +#define HAVE_IB_MTU_INT_TO_ENUM 1 + +/* have ib port phys state link up */ +#define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 + +/* have driver qpt */ +#define HAVE_IB_QPT_DRIVER 1 + +/* query_device has udata */ +#define HAVE_IB_QUERY_DEVICE_UDATA 1 + +/* ib_register_device has dma_device param */ +#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 + +/* ib_register_device has name param */ +/* #undef HAVE_IB_REGISTER_DEVICE_NAME_PARAM */ + +/* ib_register_device has two params */ +/* #undef HAVE_IB_REGISTER_DEVICE_TWO_PARAMS */ + +/* ib_umem_find_single_pg_size exists */ +#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 + +/* have ib_umem_get device 
param */ +#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 + +/* ib_umem_get has no dmasync parameter */ +#define HAVE_IB_UMEM_GET_NO_DMASYNC 1 + +/* ib_umem_get has udata */ +/* #undef HAVE_IB_UMEM_GET_UDATA */ + +/* ib_umem_num_dma_blocks exists */ +#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 + +/* have void destroy cq */ +/* #undef HAVE_IB_VOID_DESTROY_CQ */ + +/* have kvzalloc */ +#define HAVE_KVZALLOC 1 + +/* ib_device_attr has max_send_recv_sge */ +#define HAVE_MAX_SEND_RCV_SGE 1 + +/* have no kverbs drivers */ +#define HAVE_NO_KVERBS_DRIVERS 1 + +/* have pci_irq_vector */ +#define HAVE_PCI_IRQ_VECTOR 1 + +/* have amazon pci id */ +#define HAVE_PCI_VENDOR_ID_AMAZON 1 + +/* have pd core allocation */ +#define HAVE_PD_CORE_ALLOCATION 1 + +/* have const wr in post verbs */ +#define HAVE_POST_CONST_WR 1 + +/* have unspecified node type */ +#define HAVE_RDMA_NODE_UNSPECIFIED 1 + +/* rdma_umem_for_each_dma_block exists */ +#define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 + +/* rdma_user_mmap_io exists */ +/* #undef HAVE_RDMA_USER_MMAP_IO */ + +/* safe ib_alloc_device exists */ +#define HAVE_SAFE_IB_ALLOC_DEVICE 1 + +/* for_each_sg_dma_page exists */ +#define HAVE_SG_DMA_PAGE_ITER 1 + +/* have ucontext core allocation */ +#define HAVE_UCONTEXT_CORE_ALLOCATION 1 + +/* rdma_udata_to_drv_context exists */ +#define HAVE_UDATA_TO_DRV_CONTEXT 1 + +/* ib umem scatterlist exists */ +#define HAVE_UMEM_SCATTERLIST_IF 1 + +/* have upstream efa */ +#define HAVE_UPSTREAM_EFA 1 + +/* have uverbs command header fix */ +/* #undef HAVE_UVERBS_CMD_HDR_FIX */ + +/* uverbs_cmd_mask is not needed */ +/* #undef HAVE_UVERBS_CMD_MASK_NOT_NEEDED */ + +/* Name of package */ +#define PACKAGE "efa" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "efa" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "efa 1.11.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "efa" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.11.1" + +/* Version number of package */ +#define VERSION "1.11.1" diff --git a/drivers/amazon/net/efa/efa-abi.h b/drivers/amazon/net/efa/efa-abi.h new file mode 100644 index 0000000000000..fee906bc28bb6 --- /dev/null +++ b/drivers/amazon/net/efa/efa-abi.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef EFA_ABI_USER_H +#define EFA_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define EFA_UVERBS_ABI_VERSION 1 + +/* + * Keep structs aligned to 8 bytes. + * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the + * hex bit offset of the field. 
+ */ + +enum { + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_cmd { + __u32 comp_mask; + __u8 reserved_20[4]; +}; + +enum efa_ibv_user_cmds_supp_udata { + EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, + EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_resp { + __u32 comp_mask; + __u32 cmds_supp_udata_mask; + __u16 sub_cqs_per_cq; + __u16 inline_buf_size; + __u32 max_llq_size; /* bytes */ + __u16 max_tx_batch; /* units of 64 bytes */ + __u16 min_sq_wr; + __u8 reserved_a0[4]; +}; + +struct efa_ibv_alloc_pd_resp { + __u32 comp_mask; + __u16 pdn; + __u8 reserved_30[2]; +}; + +struct efa_ibv_create_cq { + __u32 comp_mask; + __u32 cq_entry_size; + __u16 num_sub_cqs; + __u8 reserved_50[6]; +}; + +struct efa_ibv_create_cq_resp { + __u32 comp_mask; + __u8 reserved_20[4]; + __aligned_u64 q_mmap_key; + __aligned_u64 q_mmap_size; + __u16 cq_idx; + __u8 reserved_d0[6]; +}; + +enum { + EFA_QP_DRIVER_TYPE_SRD = 0, +}; + +struct efa_ibv_create_qp { + __u32 comp_mask; + __u32 rq_ring_size; /* bytes */ + __u32 sq_ring_size; /* bytes */ + __u32 driver_qp_type; +}; + +struct efa_ibv_create_qp_resp { + __u32 comp_mask; + /* the offset inside the page of the rq db */ + __u32 rq_db_offset; + /* the offset inside the page of the sq db */ + __u32 sq_db_offset; + /* the offset inside the page of descriptors buffer */ + __u32 llq_desc_offset; + __aligned_u64 rq_mmap_key; + __aligned_u64 rq_mmap_size; + __aligned_u64 rq_db_mmap_key; + __aligned_u64 sq_db_mmap_key; + __aligned_u64 llq_desc_mmap_key; + __u16 send_sub_cq_idx; + __u16 recv_sub_cq_idx; + __u8 reserved_1e0[4]; +}; + +struct efa_ibv_create_ah_resp { + __u32 comp_mask; + __u16 efa_address_handle; + __u8 reserved_30[2]; +}; + +enum { + EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, + EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, +}; + +struct efa_ibv_ex_query_device_resp { + __u32 comp_mask; + __u32 max_sq_wr; + __u32 max_rq_wr; + __u16 max_sq_sge; + __u16 max_rq_sge; + __u32 max_rdma_size; + __u32 device_caps; +}; + +#ifdef HAVE_CUSTOM_COMMANDS +/******************************************************************************/ +/* EFA CUSTOM COMMANDS */ +/******************************************************************************/ +#include + +enum efa_everbs_commands { + EFA_EVERBS_CMD_GET_AH = 1, + EFA_EVERBS_CMD_GET_EX_DEV_ATTRS, + EFA_EVERBS_CMD_MAX, +}; + +struct efa_everbs_get_ah { + __u32 comp_mask; + __u16 pdn; + __u8 reserved_30[2]; + __aligned_u64 response; + __aligned_u64 user_handle; + __u8 gid[16]; +}; + +struct efa_everbs_get_ah_resp { + __u32 comp_mask; + __u16 efa_address_handle; + __u8 reserved_30[2]; +}; + +struct efa_everbs_get_ex_dev_attrs { + __u32 comp_mask; + __u8 reserved_20[4]; + __aligned_u64 response; +}; + +struct efa_everbs_get_ex_dev_attrs_resp { + __u32 comp_mask; + __u32 max_sq_wr; + __u32 max_rq_wr; + __u16 max_sq_sge; + __u16 max_rq_sge; +}; +#endif /* HAVE_CUSTOM_COMMANDS */ + +#endif /* EFA_ABI_USER_H */ diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h new file mode 100644 index 0000000000000..ec19c69a8b81c --- /dev/null +++ b/drivers/amazon/net/efa/efa.h @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_H_ +#define _EFA_H_ + +#include "kcompat.h" +#include +#ifdef HAVE_CUSTOM_COMMANDS +#include +#include +#endif +#include +#include +#include + +#include + +#include "efa-abi.h" +#include "efa_com_cmd.h" + +#define DRV_MODULE_NAME "efa" +#define DEVICE_NAME "Elastic Fabric Adapter (EFA)" + +#define EFA_IRQNAME_SIZE 40 + +/* 1 for AENQ + ADMIN */ +#define EFA_NUM_MSIX_VEC 1 +#define EFA_MGMNT_MSIX_VEC_IDX 0 + +struct efa_irq { + irq_handler_t handler; + void *data; + int cpu; + u32 vector; + cpumask_t affinity_hint_mask; + char name[EFA_IRQNAME_SIZE]; +}; + +/* Don't use anything other than atomic64 */ +struct efa_stats { + atomic64_t alloc_pd_err; + atomic64_t create_qp_err; + atomic64_t create_cq_err; + atomic64_t reg_mr_err; + atomic64_t alloc_ucontext_err; + atomic64_t create_ah_err; + atomic64_t mmap_err; + atomic64_t keep_alive_rcvd; +}; + +struct efa_dev { + struct ib_device ibdev; + struct efa_com_dev edev; + struct pci_dev *pdev; + struct efa_com_get_device_attr_result dev_attr; + + u64 reg_bar_addr; + u64 reg_bar_len; + u64 mem_bar_addr; + u64 mem_bar_len; + u64 db_bar_addr; + u64 db_bar_len; + +#ifndef HAVE_PCI_IRQ_VECTOR + struct msix_entry admin_msix_entry; +#else + int admin_msix_vector_idx; +#endif + struct efa_irq admin_irq; + +#ifndef HAVE_CREATE_AH_UDATA + struct list_head efa_ah_list; + /* Protects efa_ah_list */ + struct mutex ah_list_lock; +#endif +#ifdef HAVE_CUSTOM_COMMANDS + struct device *everbs_dev; + struct cdev cdev; +#endif + + struct efa_stats stats; +}; + +struct efa_ucontext { + struct ib_ucontext ibucontext; + u16 uarn; +#ifndef HAVE_CORE_MMAP_XA + /* Protects ucontext state */ + struct mutex lock; + struct list_head pending_mmaps; + u32 mmap_page; +#endif /* !defined(HAVE_CORE_MMAP_XA) */ +}; + +struct efa_pd { + struct ib_pd ibpd; + u16 pdn; +}; + +struct efa_mr { + struct ib_mr ibmr; + struct ib_umem *umem; +#ifdef HAVE_EFA_GDR + struct efa_nvmem *nvmem; + u64 nvmem_ticket; +#endif +}; + +struct efa_cq { + struct ib_cq ibcq; + struct efa_ucontext *ucontext; + dma_addr_t dma_addr; + void *cpu_addr; + struct rdma_user_mmap_entry *mmap_entry; + size_t size; + u16 cq_idx; +}; + +struct efa_qp { + struct ib_qp ibqp; + dma_addr_t rq_dma_addr; + void *rq_cpu_addr; + size_t rq_size; + enum ib_qp_state state; + + /* Used for saving mmap_xa entries */ + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *llq_desc_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_mmap_entry; + + u32 qp_handle; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +struct efa_ah { + struct ib_ah ibah; + u16 ah; + /* dest_addr */ + u8 id[EFA_GID_SIZE]; +}; + +#ifdef HAVE_IB_QUERY_DEVICE_UDATA +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata); +#else +#warning deprecated api +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props); +#endif +int efa_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int efa_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#else +int efa_alloc_pd(struct ib_pd *ibpd, + struct ib_ucontext 
*ibucontext, + struct ib_udata *udata); +#endif +#ifdef HAVE_DEALLOC_PD_UDATA_RC +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#elif defined(HAVE_DEALLOC_PD_UDATA) +void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +#elif defined(HAVE_PD_CORE_ALLOCATION) +void efa_dealloc_pd(struct ib_pd *ibpd); +#else +int efa_dealloc_pd(struct ib_pd *ibpd); +struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#ifdef HAVE_DESTROY_QP_UDATA +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +#else +int efa_destroy_qp(struct ib_qp *ibqp); +#endif +struct ib_qp *efa_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#ifdef HAVE_IB_INT_DESTROY_CQ +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#elif defined(HAVE_IB_VOID_DESTROY_CQ) +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#elif defined(HAVE_DESTROY_CQ_UDATA) +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +#else +int efa_destroy_cq(struct ib_cq *ibcq); +#endif +#ifdef HAVE_CREATE_CQ_ATTR +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +#else +#warning deprecated api +int efa_create_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +#endif +#ifndef HAVE_CQ_CORE_ALLOCATION +#ifdef HAVE_CREATE_CQ_NO_UCONTEXT +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_CQ_ATTR) +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#else +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, int entries, + int vector, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); +#endif +#endif +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +#ifdef HAVE_DEREG_MR_UDATA +int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); +#else +int efa_dereg_mr(struct ib_mr *ibmr); +#endif +#ifdef HAVE_GET_PORT_IMMUTABLE +int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable); +#endif +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata); +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +#else +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext); +struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +#endif +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma); +#ifdef HAVE_CORE_MMAP_XA +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +#endif +int efa_create_ah(struct ib_ah *ibah, +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_init_attr *init_attr, +#else +#ifdef HAVE_CREATE_AH_RDMA_ATTR + struct rdma_ah_attr *ah_attr, +#else + struct ib_ah_attr *ah_attr, +#endif + u32 flags, +#endif + struct ib_udata *udata); +#ifndef HAVE_AH_CORE_ALLOCATION +#ifdef HAVE_CREATE_DESTROY_AH_FLAGS +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_AH_RDMA_ATTR) +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +#elif defined(HAVE_CREATE_AH_UDATA) +struct ib_ah *efa_kzalloc_ah(struct 
ib_pd *ibpd, + struct ib_ah_attr *ah_attr, + struct ib_udata *udata); +#else +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct ib_ah_attr *ah_attr); +#endif +#endif +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#elif defined(HAVE_AH_CORE_ALLOCATION) +void efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) +int efa_destroy_ah(struct ib_ah *ibah, u32 flags); +#else +int efa_destroy_ah(struct ib_ah *ibah); +#endif +#ifndef HAVE_NO_KVERBS_DRIVERS +#ifdef HAVE_POST_CONST_WR +int efa_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +#else +int efa_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +#endif +#ifdef HAVE_POST_CONST_WR +int efa_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +#else +int efa_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +#endif +int efa_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int efa_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags); +struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc); +#endif +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + u8 port_num); +#ifdef HAVE_HW_STATS +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num); +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u8 port_num, int index); +#endif + +#ifdef HAVE_CUSTOM_COMMANDS +#ifndef HAVE_CREATE_AH_UDATA +ssize_t efa_everbs_cmd_get_ah(struct efa_dev *dev, + const char __user *buf, + int in_len, + int out_len); +#endif +#ifndef HAVE_IB_QUERY_DEVICE_UDATA +ssize_t efa_everbs_cmd_get_ex_dev_attrs(struct efa_dev *dev, + const char __user *buf, + int in_len, + int out_len); +#endif +#endif + +#endif /* _EFA_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h new file mode 100644 index 0000000000000..b199e4ac6cf9e --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -0,0 +1,932 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_ADMIN_CMDS_H_ +#define _EFA_ADMIN_CMDS_H_ + +#define EFA_ADMIN_API_VERSION_MAJOR 0 +#define EFA_ADMIN_API_VERSION_MINOR 1 + +/* EFA admin queue opcodes */ +enum efa_admin_aq_opcode { + EFA_ADMIN_CREATE_QP = 1, + EFA_ADMIN_MODIFY_QP = 2, + EFA_ADMIN_QUERY_QP = 3, + EFA_ADMIN_DESTROY_QP = 4, + EFA_ADMIN_CREATE_AH = 5, + EFA_ADMIN_DESTROY_AH = 6, + EFA_ADMIN_REG_MR = 7, + EFA_ADMIN_DEREG_MR = 8, + EFA_ADMIN_CREATE_CQ = 9, + EFA_ADMIN_DESTROY_CQ = 10, + EFA_ADMIN_GET_FEATURE = 11, + EFA_ADMIN_SET_FEATURE = 12, + EFA_ADMIN_GET_STATS = 13, + EFA_ADMIN_ALLOC_PD = 14, + EFA_ADMIN_DEALLOC_PD = 15, + EFA_ADMIN_ALLOC_UAR = 16, + EFA_ADMIN_DEALLOC_UAR = 17, + EFA_ADMIN_MAX_OPCODE = 17, +}; + +enum efa_admin_aq_feature_id { + EFA_ADMIN_DEVICE_ATTR = 1, + EFA_ADMIN_AENQ_CONFIG = 2, + EFA_ADMIN_NETWORK_ATTR = 3, + EFA_ADMIN_QUEUE_ATTR = 4, + EFA_ADMIN_HW_HINTS = 5, + EFA_ADMIN_HOST_INFO = 6, +}; + +/* QP transport type */ +enum efa_admin_qp_type { + /* Unreliable Datagram */ + EFA_ADMIN_QP_TYPE_UD = 1, + /* Scalable Reliable Datagram */ + EFA_ADMIN_QP_TYPE_SRD = 2, +}; + +/* QP state */ +enum efa_admin_qp_state { + EFA_ADMIN_QP_STATE_RESET = 0, + EFA_ADMIN_QP_STATE_INIT = 1, + EFA_ADMIN_QP_STATE_RTR = 2, + EFA_ADMIN_QP_STATE_RTS = 3, + EFA_ADMIN_QP_STATE_SQD = 4, + EFA_ADMIN_QP_STATE_SQE = 5, + EFA_ADMIN_QP_STATE_ERR = 6, +}; + +enum efa_admin_get_stats_type { + EFA_ADMIN_GET_STATS_TYPE_BASIC = 0, + EFA_ADMIN_GET_STATS_TYPE_MESSAGES = 1, + EFA_ADMIN_GET_STATS_TYPE_RDMA_READ = 2, +}; + +enum efa_admin_get_stats_scope { + EFA_ADMIN_GET_STATS_SCOPE_ALL = 0, + EFA_ADMIN_GET_STATS_SCOPE_QUEUE = 1, +}; + +/* + * QP allocation sizes, converted by fabric QueuePair (QP) create command + * from QP capabilities. + */ +struct efa_admin_qp_alloc_size { + /* Send descriptor ring size in bytes */ + u32 send_queue_ring_size; + + /* Max number of WQEs that can be outstanding on send queue. */ + u32 send_queue_depth; + + /* + * Recv descriptor ring size in bytes, sufficient for user-provided + * number of WQEs + */ + u32 recv_queue_ring_size; + + /* Max number of WQEs that can be outstanding on recv queue */ + u32 recv_queue_depth; +}; + +struct efa_admin_create_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain associated with this QP */ + u16 pd; + + /* QP type */ + u8 qp_type; + + /* + * 0 : sq_virt - If set, SQ ring base address is + * virtual (IOVA returned by MR registration) + * 1 : rq_virt - If set, RQ ring base address is + * virtual (IOVA returned by MR registration) + * 7:2 : reserved - MBZ + */ + u8 flags; + + /* + * Send queue (SQ) ring base physical address. This field is not + * used if this is a Low Latency Queue(LLQ). + */ + u64 sq_base_addr; + + /* Receive queue (RQ) ring base address. 
*/ + u64 rq_base_addr; + + /* Index of CQ to be associated with Send Queue completions */ + u32 send_cq_idx; + + /* Index of CQ to be associated with Recv Queue completions */ + u32 recv_cq_idx; + + /* + * Memory registration key for the SQ ring, used only when not in + * LLQ mode and base address is virtual + */ + u32 sq_l_key; + + /* + * Memory registration key for the RQ ring, used only when base + * address is virtual + */ + u32 rq_l_key; + + /* Requested QP allocation sizes */ + struct efa_admin_qp_alloc_size qp_alloc_size; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +struct efa_admin_create_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * Opaque handle to be used for consequent admin operations on the + * QP + */ + u32 qp_handle; + + /* + * QP number in the given EFA virtual device. Least-significant bits + * (as needed according to max_qp) carry unique QP ID + */ + u16 qp_num; + + /* MBZ */ + u16 reserved; + + /* Index of sub-CQ for Send Queue completions */ + u16 send_sub_cq_idx; + + /* Index of sub-CQ for Receive Queue completions */ + u16 recv_sub_cq_idx; + + /* SQ doorbell address, as offset to PCIe DB BAR */ + u32 sq_db_offset; + + /* RQ doorbell address, as offset to PCIe DB BAR */ + u32 rq_db_offset; + + /* + * low latency send queue ring base address as an offset to PCIe + * MMIO LLQ_MEM BAR + */ + u32 llq_descriptors_offset; +}; + +struct efa_admin_modify_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * Mask indicating which fields should be updated + * 0 : qp_state + * 1 : cur_qp_state + * 2 : qkey + * 3 : sq_psn + * 4 : sq_drained_async_notify + * 5 : rnr_retry + * 31:6 : reserved + */ + u32 modify_mask; + + /* QP handle returned by create_qp command */ + u32 qp_handle; + + /* QP state */ + u32 qp_state; + + /* Override current QP state (before applying the transition) */ + u32 cur_qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Enable async notification when SQ is drained */ + u8 sq_drained_async_notify; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_modify_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_query_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_query_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* QP state */ + u32 qp_state; + + /* QKey */ + u32 qkey; + + /* SQ PSN */ + u32 sq_psn; + + /* Indicates that draining is in progress */ + u8 sq_draining; + + /* Number of RNR retries (valid only for SRD QPs) */ + u8 rnr_retry; + + /* MBZ */ + u16 reserved2; +}; + +struct efa_admin_destroy_qp_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* QP handle returned by create_qp command */ + u32 qp_handle; +}; + +struct efa_admin_destroy_qp_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Create Address Handle command parameters. 
Must not be called more than + * once for the same destination + */ +struct efa_admin_create_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Destination address in network byte order */ + u8 dest_addr[16]; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_create_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_ah_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Target interface address handle (opaque) */ + u16 ah; + + /* PD number */ + u16 pd; +}; + +struct efa_admin_destroy_ah_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * Registration of MemoryRegion, required for QP working with Virtual + * Addresses. In standard verbs semantics, region length is limited to 2GB + * space, but EFA offers larger MR support for large memory space, to ease + * on users working with very large datasets (i.e. full GPU memory mapping). + */ +struct efa_admin_reg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved16_w1; + + /* Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of guest-physical page addresses of user + * memory pages (optimization for short region + * registrations) + */ + u64 inline_pbl_array[4]; + + /* points to PBL (direct or indirect, chained if needed) */ + struct efa_admin_ctrl_buff_info pbl; + } pbl; + + /* Memory region length, in bytes. */ + u64 mr_length; + + /* + * flags and page size + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift). Page size is used for + * building the Virtual to Physical address mapping + * 6:5 : reserved - MBZ + * 7 : mem_addr_phy_mode_en - Enable bit for physical + * memory registration (no translation), can be used + * only by privileged clients. If set, PBL must + * contain a single entry. + */ + u8 flags; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : reserved1 - MBZ + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* MBZ */ + u16 reserved16_w5; + + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + + /* + * IO Virtual Address associated with this MR. If + * mem_addr_phy_mode_en is set, contains the physical address of + * the region. 
+ */ + u64 iova; +}; + +struct efa_admin_reg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; +}; + +struct efa_admin_dereg_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* L_Key, memory region's l_key */ + u32 l_key; +}; + +struct efa_admin_dereg_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_create_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + /* + * 4:0 : reserved5 - MBZ + * 5 : interrupt_mode_enabled - if set, cq operates + * in interrupt mode (i.e. CQ events and MSI-X are + * generated), otherwise - polling + * 6 : virt - If set, ring base address is virtual + * (IOVA returned by MR registration) + * 7 : reserved6 - MBZ + */ + u8 cq_caps_1; + + /* + * 4:0 : cq_entry_size_words - size of CQ entry in + * 32-bit words, valid values: 4, 8. + * 7:5 : reserved7 - MBZ + */ + u8 cq_caps_2; + + /* completion queue depth in # of entries. must be power of 2 */ + u16 cq_depth; + + /* msix vector assigned to this cq */ + u32 msix_vector_idx; + + /* + * CQ ring base address, virtual or physical depending on 'virt' + * flag + */ + struct efa_common_mem_addr cq_ba; + + /* + * Memory registration key for the ring, used only when base + * address is virtual + */ + u32 l_key; + + /* + * number of sub cqs - must be equal to sub_cqs_per_cq of queue + * attributes. + */ + u16 num_sub_cqs; + + /* UAR number */ + u16 uar; +}; + +struct efa_admin_create_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + u16 cq_idx; + + /* actual cq depth in number of entries */ + u16 cq_actual_depth; +}; + +struct efa_admin_destroy_cq_cmd { + struct efa_admin_aq_common_desc aq_common_desc; + + u16 cq_idx; + + /* MBZ */ + u16 reserved1; +}; + +struct efa_admin_destroy_cq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* + * EFA AQ Get Statistics command. 
Extended statistics are placed in control + * buffer pointed by AQ entry + */ +struct efa_admin_aq_get_stats_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + /* command specific inline data */ + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + /* stats type as defined in enum efa_admin_get_stats_type */ + u8 type; + + /* stats scope defined in enum efa_admin_get_stats_scope */ + u8 scope; + + u16 scope_modifier; +}; + +struct efa_admin_basic_stats { + u64 tx_bytes; + + u64 tx_pkts; + + u64 rx_bytes; + + u64 rx_pkts; + + u64 rx_drops; +}; + +struct efa_admin_messages_stats { + u64 send_bytes; + + u64 send_wrs; + + u64 recv_bytes; + + u64 recv_wrs; +}; + +struct efa_admin_rdma_read_stats { + u64 read_wrs; + + u64 read_bytes; + + u64 read_wr_err; + + u64 read_resp_bytes; +}; + +struct efa_admin_acq_get_stats_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + struct efa_admin_basic_stats basic_stats; + + struct efa_admin_messages_stats messages_stats; + + struct efa_admin_rdma_read_stats rdma_read_stats; + } u; +}; + +struct efa_admin_get_set_feature_common_desc { + /* + * 1:0 : select - 0x1 - current value; 0x3 - default + * value + * 7:3 : reserved3 - MBZ + */ + u8 flags; + + /* as appears in efa_admin_aq_feature_id */ + u8 feature_id; + + /* MBZ */ + u16 reserved16; +}; + +struct efa_admin_feature_device_attr_desc { + /* Bitmap of efa_admin_aq_feature_id */ + u64 supported_features; + + /* Bitmap of supported page sizes in MR registrations */ + u64 page_size_cap; + + u32 fw_version; + + u32 admin_api_version; + + u32 device_version; + + /* Bar used for SQ and RQ doorbells */ + u16 db_bar; + + /* Indicates how many bits are used on physical address access */ + u8 phys_addr_width; + + /* Indicates how many bits are used on virtual address access */ + u8 virt_addr_width; + + /* + * 0 : rdma_read - If set, RDMA Read is supported on + * TX queues + * 1 : rnr_retry - If set, RNR retry is supported on + * modify QP command + * 31:2 : reserved - MBZ + */ + u32 device_caps; + + /* Max RDMA transfer size in bytes */ + u32 max_rdma_size; +}; + +struct efa_admin_feature_queue_attr_desc { + /* The maximum number of queue pairs supported */ + u32 max_qp; + + /* Maximum number of WQEs per Send Queue */ + u32 max_sq_depth; + + /* Maximum size of data that can be sent inline in a Send WQE */ + u32 inline_buf_size; + + /* Maximum number of buffer descriptors per Recv Queue */ + u32 max_rq_depth; + + /* The maximum number of completion queues supported per VF */ + u32 max_cq; + + /* Maximum number of CQEs per Completion Queue */ + u32 max_cq_depth; + + /* Number of sub-CQs to be created for each CQ */ + u16 sub_cqs_per_cq; + + /* Minimum number of WQEs per SQ */ + u16 min_sq_depth; + + /* Maximum number of SGEs (buffers) allowed for a single send WQE */ + u16 max_wr_send_sges; + + /* Maximum number of SGEs allowed for a single recv WQE */ + u16 max_wr_recv_sges; + + /* The maximum number of memory regions supported */ + u32 max_mr; + + /* The maximum number of pages can be registered */ + u32 max_mr_pages; + + /* The maximum number of protection domains supported */ + u32 max_pd; + + /* The maximum number of address handles supported */ + u32 max_ah; + + /* The maximum size of LLQ in bytes */ + u32 max_llq_size; + + /* Maximum number of SGEs for a single RDMA read WQE */ + u16 max_wr_rdma_sges; + + /* + * Maximum number of bytes that can be written to SQ between two + * consecutive doorbells (in units of 64B). 
Driver must ensure that only + * complete WQEs are written to queue before issuing a doorbell. + * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can + * be written to SQ between two consecutive doorbells. max_tx_batch=11 + * and WQE size = 128B, means up to 5 WQEs can be written to SQ between + * two consecutive doorbells. Zero means unlimited. + */ + u16 max_tx_batch; +}; + +struct efa_admin_feature_aenq_desc { + /* bitmask for AENQ groups the device can report */ + u32 supported_groups; + + /* bitmask for AENQ groups to report */ + u32 enabled_groups; +}; + +struct efa_admin_feature_network_attr_desc { + /* Raw address data in network byte order */ + u8 addr[16]; + + /* max packet payload size in bytes */ + u32 mtu; +}; + +/* + * When hint value is 0, hints capabilities are not supported or driver + * should use its own predefined value + */ +struct efa_admin_hw_hints { + /* value in ms */ + u16 mmio_read_timeout; + + /* value in ms */ + u16 driver_watchdog_timeout; + + /* value in ms */ + u16 admin_completion_timeout; + + /* poll interval in ms */ + u16 poll_interval; +}; + +struct efa_admin_get_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + u32 raw[11]; +}; + +struct efa_admin_get_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + + struct efa_admin_feature_device_attr_desc device_attr; + + struct efa_admin_feature_aenq_desc aenq; + + struct efa_admin_feature_network_attr_desc network_attr; + + struct efa_admin_feature_queue_attr_desc queue_attr; + + struct efa_admin_hw_hints hw_hints; + } u; +}; + +struct efa_admin_set_feature_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + struct efa_admin_ctrl_buff_info control_buffer; + + struct efa_admin_get_set_feature_common_desc feature_common; + + union { + u32 raw[11]; + + /* AENQ configuration */ + struct efa_admin_feature_aenq_desc aenq; + } u; +}; + +struct efa_admin_set_feature_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + union { + u32 raw[14]; + } u; +}; + +struct efa_admin_alloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* PD number */ + u16 pd; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_pd_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +struct efa_admin_alloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; +}; + +struct efa_admin_alloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* UAR number */ + u16 uar; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_dealloc_uar_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + +/* asynchronous event notification groups */ +enum efa_admin_aenq_group { + EFA_ADMIN_FATAL_ERROR = 1, + EFA_ADMIN_WARNING = 2, + EFA_ADMIN_NOTIFICATION = 3, + EFA_ADMIN_KEEP_ALIVE = 4, + EFA_ADMIN_AENQ_GROUPS_NUM = 5, +}; + +enum efa_admin_aenq_notification_syndrom { + EFA_ADMIN_SUSPEND = 0, + EFA_ADMIN_RESUME = 1, + EFA_ADMIN_UPDATE_HINTS = 2, +}; + +struct 
efa_admin_mmio_req_read_less_resp { + u16 req_id; + + u16 reg_off; + + /* value is valid when poll is cleared */ + u32 reg_val; +}; + +enum efa_admin_os_type { + EFA_ADMIN_OS_LINUX = 0, +}; + +struct efa_admin_host_info { + /* OS distribution string format */ + u8 os_dist_str[128]; + + /* Defined in enum efa_admin_os_type */ + u32 os_type; + + /* Kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* + * 7:0 : driver_module_type + * 15:8 : driver_sub_minor + * 23:16 : driver_minor + * 31:24 : driver_major + */ + u32 driver_ver; + + /* + * Device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* + * Spec version + * 7:0 : spec_minor + * 15:8 : spec_major + */ + u16 spec_ver; + + /* + * 0 : intree - Intree driver + * 1 : gdr - GPUDirect RDMA supported + * 31:2 : reserved2 + */ + u32 flags; +}; + +/* create_qp_cmd */ +#define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) +#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) + +/* modify_qp_cmd */ +#define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) +#define EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE_MASK BIT(1) +#define EFA_ADMIN_MODIFY_QP_CMD_QKEY_MASK BIT(2) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN_MASK BIT(3) +#define EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY_MASK BIT(4) +#define EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY_MASK BIT(5) + +/* reg_mr_cmd */ +#define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK BIT(7) +#define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK BIT(2) + +/* create_cq_cmd */ +#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) +#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) + +/* get_set_feature_common_desc */ +#define EFA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) + +/* feature_device_attr_desc */ +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) + +/* host_info */ +#define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_DRIVER_MINOR_MASK GENMASK(23, 16) +#define EFA_ADMIN_HOST_INFO_DRIVER_MAJOR_MASK GENMASK(31, 24) +#define EFA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define EFA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define EFA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_SPEC_MINOR_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_SPEC_MAJOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_INTREE_MASK BIT(0) +#define EFA_ADMIN_HOST_INFO_GDR_MASK BIT(1) + +#endif /* _EFA_ADMIN_CMDS_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_defs.h b/drivers/amazon/net/efa/efa_admin_defs.h new file mode 100644 index 0000000000000..29d53ed63b3ed --- /dev/null +++ b/drivers/amazon/net/efa/efa_admin_defs.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_ADMIN_H_ +#define _EFA_ADMIN_H_ + +enum efa_admin_aq_completion_status { + EFA_ADMIN_SUCCESS = 0, + EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE = 1, + EFA_ADMIN_BAD_OPCODE = 2, + EFA_ADMIN_UNSUPPORTED_OPCODE = 3, + EFA_ADMIN_MALFORMED_REQUEST = 4, + /* Additional status is provided in ACQ entry extended_status */ + EFA_ADMIN_ILLEGAL_PARAMETER = 5, + EFA_ADMIN_UNKNOWN_ERROR = 6, + EFA_ADMIN_RESOURCE_BUSY = 7, +}; + +struct efa_admin_aq_common_desc { + /* + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command_id; + + /* as appears in efa_admin_aq_opcode */ + u8 opcode; + + /* + * 0 : phase + * 1 : ctrl_data - control buffer address valid + * 2 : ctrl_data_indirect - control buffer address + * points to list of pages with addresses of control + * buffers + * 7:3 : reserved3 + */ + u8 flags; +}; + +/* + * used in efa_admin_aq_entry. Can point directly to control data, or to a + * page list chunk. Used also at the end of indirect mode page list chunks, + * for chaining. + */ +struct efa_admin_ctrl_buff_info { + u32 length; + + struct efa_common_mem_addr address; +}; + +struct efa_admin_aq_entry { + struct efa_admin_aq_common_desc aq_common_descriptor; + + union { + u32 inline_data_w1[3]; + + struct efa_admin_ctrl_buff_info control_buffer; + } u; + + u32 inline_data_w4[12]; +}; + +struct efa_admin_acq_common_desc { + /* + * command identifier to associate it with the aq descriptor + * 11:0 : command_id + * 15:12 : reserved12 + */ + u16 command; + + u8 status; + + /* + * 0 : phase + * 7:1 : reserved1 + */ + u8 flags; + + u16 extended_status; + + /* + * indicates to the driver which AQ entry has been consumed by the + * device and could be reused + */ + u16 sq_head_indx; +}; + +struct efa_admin_acq_entry { + struct efa_admin_acq_common_desc acq_common_descriptor; + + u32 response_specific_data[14]; +}; + +struct efa_admin_aenq_common_desc { + u16 group; + + u16 syndrom; + + /* + * 0 : phase + * 7:1 : reserved - MBZ + */ + u8 flags; + + u8 reserved1[3]; + + u32 timestamp_low; + + u32 timestamp_high; +}; + +struct efa_admin_aenq_entry { + struct efa_admin_aenq_common_desc aenq_common_desc; + + /* command specific inline data */ + u32 inline_data_w4[12]; +}; + +/* aq_common_desc */ +#define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK BIT(1) +#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK BIT(2) + +/* acq_common_desc */ +#define EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) +#define EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK BIT(0) + +/* aenq_common_desc */ +#define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) + +#endif /* _EFA_ADMIN_H_ */ diff --git a/drivers/amazon/net/efa/efa_com.c b/drivers/amazon/net/efa/efa_com.c new file mode 100644 index 0000000000000..e8a0a0c3a90dc --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.c @@ -0,0 +1,1094 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "efa_com.h" +#include "efa_regs_defs.h" + +#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */ + +#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */ +#define EFA_MMIO_READ_INVALID 0xffffffff + +#define EFA_POLL_INTERVAL_MS 100 /* msecs */ + +#define EFA_ASYNC_QUEUE_DEPTH 16 +#define EFA_ADMIN_QUEUE_DEPTH 32 + +#define EFA_CTRL_MAJOR 0 +#define EFA_CTRL_MINOR 0 +#define EFA_CTRL_SUB_MINOR 1 + +#define EFA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x))) +#define EFA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32)) + +enum efa_cmd_status { + EFA_CMD_SUBMITTED, + EFA_CMD_COMPLETED, +}; + +struct efa_comp_ctx { + struct completion wait_event; + struct efa_admin_acq_entry *user_cqe; + u32 comp_size; + enum efa_cmd_status status; + /* status from the device */ + u8 comp_status; + u8 cmd_opcode; + u8 occupied; +}; + +static const char *efa_com_cmd_str(u8 cmd) +{ +#define EFA_CMD_STR_CASE(_cmd) case EFA_ADMIN_##_cmd: return #_cmd + + switch (cmd) { + EFA_CMD_STR_CASE(CREATE_QP); + EFA_CMD_STR_CASE(MODIFY_QP); + EFA_CMD_STR_CASE(QUERY_QP); + EFA_CMD_STR_CASE(DESTROY_QP); + EFA_CMD_STR_CASE(CREATE_AH); + EFA_CMD_STR_CASE(DESTROY_AH); + EFA_CMD_STR_CASE(REG_MR); + EFA_CMD_STR_CASE(DEREG_MR); + EFA_CMD_STR_CASE(CREATE_CQ); + EFA_CMD_STR_CASE(DESTROY_CQ); + EFA_CMD_STR_CASE(GET_FEATURE); + EFA_CMD_STR_CASE(SET_FEATURE); + EFA_CMD_STR_CASE(GET_STATS); + EFA_CMD_STR_CASE(ALLOC_PD); + EFA_CMD_STR_CASE(DEALLOC_PD); + EFA_CMD_STR_CASE(ALLOC_UAR); + EFA_CMD_STR_CASE(DEALLOC_UAR); + default: return "unknown command opcode"; + } +#undef EFA_CMD_STR_CASE +} + +static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + struct efa_admin_mmio_req_read_less_resp *read_resp; + unsigned long exp_time; + u32 mmio_read_reg = 0; + u32 err; + + read_resp = mmio_read->read_resp; + + spin_lock(&mmio_read->lock); + mmio_read->seq_num++; + + /* trash DMA req_id to identify when hardware is done */ + read_resp->req_id = mmio_read->seq_num + 0x9aL; + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REG_OFF, offset); + EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REQ_ID, + mmio_read->seq_num); + + writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF); + + exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout); + do { + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) + break; + udelay(1); + } while (time_is_after_jiffies(exp_time)); + + if (read_resp->req_id != mmio_read->seq_num) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register timed out. 
expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n", + mmio_read->seq_num, offset, read_resp->req_id, + read_resp->reg_off); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + if (read_resp->reg_off != offset) { + ibdev_err_ratelimited( + edev->efa_dev, + "Reading register failed: wrong offset provided\n"); + err = EFA_MMIO_READ_INVALID; + goto out; + } + + err = read_resp->reg_val; +out: + spin_unlock(&mmio_read->lock); + return err; +} + +static int efa_com_admin_init_sq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size = aq->depth * sizeof(*sq->entries); + u32 aq_caps = 0; + u32 addr_high; + u32 addr_low; + + sq->entries = + dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL); + if (!sq->entries) + return -ENOMEM; + + spin_lock_init(&sq->lock); + + sq->cc = 0; + sq->pc = 0; + sq->phase = 1; + + sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF); + + addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(sq->dma_addr); + addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(sq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF); + + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, aq->depth); + EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE, + sizeof(struct efa_admin_aq_entry)); + + writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_cq(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_admin_cq *cq = &aq->cq; + u16 size = aq->depth * sizeof(*cq->entries); + u32 acq_caps = 0; + u32 addr_high; + u32 addr_low; + + cq->entries = + dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL); + if (!cq->entries) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->cc = 0; + cq->phase = 1; + + addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(cq->dma_addr); + addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(cq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF); + + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_DEPTH, aq->depth); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE, + sizeof(struct efa_admin_acq_entry)); + EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR, + aq->msix_vector_idx); + + writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF); + + return 0; +} + +static int efa_com_admin_init_aenq(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_aenq *aenq = &edev->aenq; + u32 addr_low, addr_high; + u32 aenq_caps = 0; + u16 size; + + if (!aenq_handlers) { + ibdev_err(edev->efa_dev, "aenq handlers pointer is NULL\n"); + return -EINVAL; + } + + size = EFA_ASYNC_QUEUE_DEPTH * sizeof(*aenq->entries); + aenq->entries = dma_alloc_coherent(edev->dmadev, size, &aenq->dma_addr, + GFP_KERNEL); + if (!aenq->entries) + return -ENOMEM; + + aenq->aenq_handlers = aenq_handlers; + aenq->depth = EFA_ASYNC_QUEUE_DEPTH; + aenq->cc = 0; + aenq->phase = 1; + + addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); + addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); + + writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF); + writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF); + + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_DEPTH, aenq->depth); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE, + sizeof(struct efa_admin_aenq_entry)); + EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR, + 
aenq->msix_vector_idx); + writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF); + + /* + * Init cons_db to mark that all entries in the queue + * are initially available + */ + writel(edev->aenq.cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); + + return 0; +} + +/* ID to be used with efa_com_get_comp_ctx */ +static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) +{ + u16 ctx_id; + + spin_lock(&aq->comp_ctx_lock); + ctx_id = aq->comp_ctx_pool[aq->comp_ctx_pool_next]; + aq->comp_ctx_pool_next++; + spin_unlock(&aq->comp_ctx_lock); + + return ctx_id; +} + +static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, + u16 ctx_id) +{ + spin_lock(&aq->comp_ctx_lock); + aq->comp_ctx_pool_next--; + aq->comp_ctx_pool[aq->comp_ctx_pool_next] = ctx_id; + spin_unlock(&aq->comp_ctx_lock); +} + +static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + u16 ctx_id = cmd_id & (aq->depth - 1); + + ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); + comp_ctx->occupied = 0; + efa_com_dealloc_ctx_id(aq, ctx_id); +} + +static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, + u16 cmd_id, bool capture) +{ + u16 ctx_id = cmd_id & (aq->depth - 1); + + if (aq->comp_ctx[ctx_id].occupied && capture) { + ibdev_err_ratelimited( + aq->efa_dev, + "Completion context for command_id %#x is occupied\n", + cmd_id); + return NULL; + } + + if (capture) { + aq->comp_ctx[ctx_id].occupied = 1; + ibdev_dbg(aq->efa_dev, + "Take completion ctxt for command_id %#x\n", cmd_id); + } + + return &aq->comp_ctx[ctx_id]; +} + +static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_admin_aq_entry *aqe; + struct efa_comp_ctx *comp_ctx; + u16 queue_size_mask; + u16 cmd_id; + u16 ctx_id; + u16 pi; + + queue_size_mask = aq->depth - 1; + pi = aq->sq.pc & queue_size_mask; + + ctx_id = efa_com_alloc_ctx_id(aq); + + /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ + cmd_id = ctx_id & queue_size_mask; + cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + cmd->aq_common_descriptor.command_id = cmd_id; + EFA_SET(&cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); + if (!comp_ctx) { + efa_com_dealloc_ctx_id(aq, ctx_id); + return ERR_PTR(-EINVAL); + } + + comp_ctx->status = EFA_CMD_SUBMITTED; + comp_ctx->comp_size = comp_size_in_bytes; + comp_ctx->user_cqe = comp; + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + + reinit_completion(&comp_ctx->wait_event); + + aqe = &aq->sq.entries[pi]; + memset(aqe, 0, sizeof(*aqe)); + memcpy(aqe, cmd, cmd_size_in_bytes); + + aq->sq.pc++; + atomic64_inc(&aq->stats.submitted_cmd); + + if ((aq->sq.pc & queue_size_mask) == 0) + aq->sq.phase = !aq->sq.phase; + + /* barrier not needed in case of writel */ + writel(aq->sq.pc, aq->sq.db_addr); + + return comp_ctx; +} + +static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) +{ + size_t pool_size = aq->depth * sizeof(*aq->comp_ctx_pool); + size_t size = aq->depth * sizeof(struct efa_comp_ctx); + struct efa_comp_ctx *comp_ctx; + u16 i; + + aq->comp_ctx = devm_kzalloc(aq->dmadev, size, GFP_KERNEL); + 
aq->comp_ctx_pool = devm_kzalloc(aq->dmadev, pool_size, GFP_KERNEL); + if (!aq->comp_ctx || !aq->comp_ctx_pool) { + devm_kfree(aq->dmadev, aq->comp_ctx_pool); + devm_kfree(aq->dmadev, aq->comp_ctx); + return -ENOMEM; + } + + for (i = 0; i < aq->depth; i++) { + comp_ctx = efa_com_get_comp_ctx(aq, i, false); + if (comp_ctx) + init_completion(&comp_ctx->wait_event); + + aq->comp_ctx_pool[i] = i; + } + + spin_lock_init(&aq->comp_ctx_lock); + + aq->comp_ctx_pool_next = 0; + + return 0; +} + +static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size_in_bytes, + struct efa_admin_acq_entry *comp, + size_t comp_size_in_bytes) +{ + struct efa_comp_ctx *comp_ctx; + + spin_lock(&aq->sq.lock); + if (!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) { + ibdev_err_ratelimited(aq->efa_dev, "Admin queue is closed\n"); + spin_unlock(&aq->sq.lock); + return ERR_PTR(-ENODEV); + } + + comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp, + comp_size_in_bytes); + spin_unlock(&aq->sq.lock); + if (IS_ERR(comp_ctx)) + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return comp_ctx; +} + +static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) +{ + struct efa_comp_ctx *comp_ctx; + u16 cmd_id; + + cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); + + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); + if (!comp_ctx) { + ibdev_err( + aq->efa_dev, + "comp_ctx is NULL. Changing the admin queue running state\n"); + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + return; + } + + comp_ctx->status = EFA_CMD_COMPLETED; + comp_ctx->comp_status = cqe->acq_common_descriptor.status; + if (comp_ctx->user_cqe) + memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); + + if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + complete(&comp_ctx->wait_event); +} + +static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) +{ + struct efa_admin_acq_entry *cqe; + u16 queue_size_mask; + u16 comp_num = 0; + u8 phase; + u16 ci; + + queue_size_mask = aq->depth - 1; + + ci = aq->cq.cc & queue_size_mask; + phase = aq->cq.phase; + + cqe = &aq->cq.entries[ci]; + + /* Go over all the completions */ + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & + EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + efa_com_handle_single_admin_completion(aq, cqe); + + ci++; + comp_num++; + if (ci == aq->depth) { + ci = 0; + phase = !phase; + } + + cqe = &aq->cq.entries[ci]; + } + + aq->cq.cc += comp_num; + aq->cq.phase = phase; + aq->sq.cc += comp_num; + atomic64_add(comp_num, &aq->stats.completed_cmd); +} + +static int efa_com_comp_status_to_errno(u8 comp_status) +{ + switch (comp_status) { + case EFA_ADMIN_SUCCESS: + return 0; + case EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE: + return -ENOMEM; + case EFA_ADMIN_UNSUPPORTED_OPCODE: + return -EOPNOTSUPP; + case EFA_ADMIN_BAD_OPCODE: + case EFA_ADMIN_MALFORMED_REQUEST: + case EFA_ADMIN_ILLEGAL_PARAMETER: + case EFA_ADMIN_UNKNOWN_ERROR: + return -EINVAL; + default: + return -EINVAL; + } +} + +static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long timeout; + unsigned long flags; + int err; + + timeout = jiffies + usecs_to_jiffies(aq->completion_timeout); + + while (1) { + 
spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + if (comp_ctx->status != EFA_CMD_SUBMITTED) + break; + + if (time_is_before_jiffies(timeout)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Wait for completion (polling) timeout\n"); + /* EFA didn't have any completion */ + atomic64_inc(&aq->stats.no_completion); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + msleep(aq->poll_interval); + } + + err = efa_com_comp_status_to_errno(comp_ctx->comp_status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + unsigned long flags; + int err; + + wait_for_completion_timeout(&comp_ctx->wait_event, + usecs_to_jiffies(aq->completion_timeout)); + + /* + * In case the command wasn't completed find out the root cause. + * There might be 2 kinds of errors + * 1) No completion (timeout reached) + * 2) There is completion but the device didn't get any msi-x interrupt. + */ + if (comp_ctx->status == EFA_CMD_SUBMITTED) { + spin_lock_irqsave(&aq->cq.lock, flags); + efa_com_handle_admin_completion(aq); + spin_unlock_irqrestore(&aq->cq.lock, flags); + + atomic64_inc(&aq->stats.no_completion); + + if (comp_ctx->status == EFA_CMD_COMPLETED) + ibdev_err_ratelimited( + aq->efa_dev, + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + else + ibdev_err_ratelimited( + aq->efa_dev, + "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + efa_com_cmd_str(comp_ctx->cmd_opcode), + comp_ctx->cmd_opcode, comp_ctx->status, + comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + err = -ETIME; + goto out; + } + + err = efa_com_comp_status_to_errno(comp_ctx->comp_status); +out: + efa_com_put_comp_ctx(aq, comp_ctx); + return err; +} + +/* + * There are two types to wait for completion. + * Polling mode - wait until the completion is available. + * Async mode - wait on wait queue until the completion is ready + * (or the timeout expired). + * It is expected that the IRQ called efa_com_handle_admin_completion + * to mark the completions. + */ +static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx, + struct efa_com_admin_queue *aq) +{ + if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) + return efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq); + + return efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq); +} + +/** + * efa_com_cmd_exec - Execute admin command + * @aq: admin queue. + * @cmd: the admin command to execute. + * @cmd_size: the command size. + * @comp: command completion return entry. + * @comp_size: command completion size. + * Submit an admin command and then wait until the device will return a + * completion. + * The completion will be copied into comp. + * + * @return - 0 on success, negative value on failure. 
+ */ +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size) +{ + struct efa_comp_ctx *comp_ctx; + int err; + + might_sleep(); + + /* In case of queue FULL */ + down(&aq->avail_cmds); + + ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode); + comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size); + if (IS_ERR(comp_ctx)) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to submit command %s (opcode %u) err %ld\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); + + up(&aq->avail_cmds); + atomic64_inc(&aq->stats.cmd_err); + return PTR_ERR(comp_ctx); + } + + err = efa_com_wait_and_process_admin_cq(comp_ctx, aq); + if (err) { + ibdev_err_ratelimited( + aq->efa_dev, + "Failed to process command %s (opcode %u) comp_status %d err %d\n", + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), + cmd->aq_common_descriptor.opcode, comp_ctx->comp_status, + err); + atomic64_inc(&aq->stats.cmd_err); + } + + up(&aq->avail_cmds); + + return err; +} + +/** + * efa_com_admin_destroy - Destroy the admin and the async events queues. + * @edev: EFA communication layer struct + */ +void efa_com_admin_destroy(struct efa_com_dev *edev) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_com_admin_cq *cq = &aq->cq; + struct efa_com_admin_sq *sq = &aq->sq; + u16 size; + + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + devm_kfree(edev->dmadev, aq->comp_ctx_pool); + devm_kfree(edev->dmadev, aq->comp_ctx); + + size = aq->depth * sizeof(*sq->entries); + dma_free_coherent(edev->dmadev, size, sq->entries, sq->dma_addr); + + size = aq->depth * sizeof(*cq->entries); + dma_free_coherent(edev->dmadev, size, cq->entries, cq->dma_addr); + + size = aenq->depth * sizeof(*aenq->entries); + dma_free_coherent(edev->dmadev, size, aenq->entries, aenq->dma_addr); +} + +/** + * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode + * @edev: EFA communication layer struct + * @polling: Enable/Disable polling mode + * + * Set the admin completion mode. + */ +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling) +{ + u32 mask_value = 0; + + if (polling) + EFA_SET(&mask_value, EFA_REGS_INTR_MASK_EN, 1); + + writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF); + if (polling) + set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); + else + clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); +} + +static void efa_com_stats_init(struct efa_com_dev *edev) +{ + atomic64_t *s = (atomic64_t *)&edev->aq.stats; + int i; + + for (i = 0; i < sizeof(edev->aq.stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +/** + * efa_com_admin_init - Init the admin and the async queues + * @edev: EFA communication layer struct + * @aenq_handlers: Those handlers to be called upon event. + * + * Initialize the admin submission and completion queues. + * Initialize the asynchronous events notification queues. + * + * @return - 0 on success, negative value on failure. 
+ */ +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers) +{ + struct efa_com_admin_queue *aq = &edev->aq; + u32 timeout; + u32 dev_sts; + u32 cap; + int err; + + dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + if (!EFA_GET(&dev_sts, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, abort com init %#x\n", dev_sts); + return -ENODEV; + } + + aq->depth = EFA_ADMIN_QUEUE_DEPTH; + + aq->dmadev = edev->dmadev; + aq->efa_dev = edev->efa_dev; + set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state); + + sema_init(&aq->avail_cmds, aq->depth); + + efa_com_stats_init(edev); + + err = efa_com_init_comp_ctxt(aq); + if (err) + return err; + + err = efa_com_admin_init_sq(edev); + if (err) + goto err_destroy_comp_ctxt; + + err = efa_com_admin_init_cq(edev); + if (err) + goto err_destroy_sq; + + efa_com_set_admin_polling_mode(edev, false); + + err = efa_com_admin_init_aenq(edev, aenq_handlers); + if (err) + goto err_destroy_cq; + + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + aq->completion_timeout = timeout * 100000; + else + aq->completion_timeout = ADMIN_CMD_TIMEOUT_US; + + aq->poll_interval = EFA_POLL_INTERVAL_MS; + + set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); + + return 0; + +err_destroy_cq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->cq.entries), + aq->cq.entries, aq->cq.dma_addr); +err_destroy_sq: + dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->sq.entries), + aq->sq.entries, aq->sq.dma_addr); +err_destroy_comp_ctxt: + devm_kfree(edev->dmadev, aq->comp_ctx); + + return err; +} + +/** + * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler + * @edev: EFA communication layer struct + * + * This method goes over the admin completion queue and wakes up + * all the pending threads that wait on the commands wait event. + * + * @note: Should be called after MSI-X interrupt. + */ +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev) +{ + unsigned long flags; + + spin_lock_irqsave(&edev->aq.cq.lock, flags); + efa_com_handle_admin_completion(&edev->aq); + spin_unlock_irqrestore(&edev->aq.cq.lock, flags); +} + +/* + * efa_handle_specific_aenq_event: + * return the handler that is relevant to the specific event group + */ +static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev, + u16 group) +{ + struct efa_aenq_handlers *aenq_handlers = edev->aenq.aenq_handlers; + + if (group < EFA_MAX_HANDLERS && aenq_handlers->handlers[group]) + return aenq_handlers->handlers[group]; + + return aenq_handlers->unimplemented_handler; +} + +/** + * efa_com_aenq_intr_handler - AENQ interrupt handler + * @edev: EFA communication layer struct + * @data: Data of interrupt handler. + * + * Go over the async event notification queue and call the proper aenq handler. 
+ */ +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data) +{ + struct efa_admin_aenq_common_desc *aenq_common; + struct efa_com_aenq *aenq = &edev->aenq; + struct efa_admin_aenq_entry *aenq_e; + efa_aenq_handler handler_cb; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = aenq->cc & (aenq->depth - 1); + phase = aenq->phase; + aenq_e = &aenq->entries[ci]; /* Get first entry */ + aenq_common = &aenq_e->aenq_common_desc; + + /* Go over all the events */ + while ((READ_ONCE(aenq_common->flags) & + EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); + + /* Handle specific event*/ + handler_cb = efa_com_get_specific_aenq_cb(edev, + aenq_common->group); + handler_cb(data, aenq_e); /* call the actual event handler*/ + + /* Get next event entry */ + ci++; + processed++; + + if (ci == aenq->depth) { + ci = 0; + phase = !phase; + } + aenq_e = &aenq->entries[ci]; + aenq_common = &aenq_e->aenq_common_desc; + } + + aenq->cc += processed; + aenq->phase = phase; + + /* Don't update aenq doorbell if there weren't any processed events */ + if (!processed) + return; + + /* barrier not needed in case of writel */ + writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); +} + +static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + u32 addr_high; + u32 addr_low; + + /* dma_addr_bits is unknown at this point */ + addr_high = (mmio_read->read_resp_dma_addr >> 32) & GENMASK(31, 0); + addr_low = mmio_read->read_resp_dma_addr & GENMASK(31, 0); + + writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF); + writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF); +} + +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + spin_lock_init(&mmio_read->lock); + mmio_read->read_resp = + dma_alloc_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + &mmio_read->read_resp_dma_addr, GFP_KERNEL); + if (!mmio_read->read_resp) + return -ENOMEM; + + efa_com_mmio_reg_read_resp_addr_init(edev); + + mmio_read->read_resp->req_id = 0; + mmio_read->seq_num = 0; + mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US; + + return 0; +} + +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev) +{ + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; + + dma_free_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), + mmio_read->read_resp, mmio_read->read_resp_dma_addr); +} + +int efa_com_validate_version(struct efa_com_dev *edev) +{ + u32 min_ctrl_ver = 0; + u32 ctrl_ver_masked; + u32 min_ver = 0; + u32 ctrl_ver; + u32 ver; + + /* + * Make sure the EFA version and the controller version are at least + * as the driver expects + */ + ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF); + ctrl_ver = efa_com_reg_read32(edev, + EFA_REGS_CONTROLLER_VERSION_OFF); + + ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n", + EFA_GET(&ver, EFA_REGS_VERSION_MAJOR_VERSION), + EFA_GET(&ver, EFA_REGS_VERSION_MINOR_VERSION)); + + EFA_SET(&min_ver, EFA_REGS_VERSION_MAJOR_VERSION, + EFA_ADMIN_API_VERSION_MAJOR); + EFA_SET(&min_ver, EFA_REGS_VERSION_MINOR_VERSION, + EFA_ADMIN_API_VERSION_MINOR); + if (ver < min_ver) { + ibdev_err( + edev->efa_dev, + "EFA version is lower than the minimal version the driver supports\n"); + return -EOPNOTSUPP; + } + + ibdev_dbg( + edev->efa_dev, + "efa controller version: %d.%d.%d implementation version %d\n", + 
EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION), + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION), + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_IMPL_ID)); + + ctrl_ver_masked = + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION) | + EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION) | + EFA_GET(&ctrl_ver, + EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION); + + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION, + EFA_CTRL_MAJOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION, + EFA_CTRL_MINOR); + EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION, + EFA_CTRL_SUB_MINOR); + /* Validate the ctrl version without the implementation ID */ + if (ctrl_ver_masked < min_ctrl_ver) { + ibdev_err( + edev->efa_dev, + "EFA ctrl version is lower than the minimal ctrl version the driver supports\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * efa_com_get_dma_width - Retrieve physical dma address width the device + * supports. + * @edev: EFA communication layer struct + * + * Retrieve the maximum physical address bits the device can handle. + * + * @return: > 0 on Success and negative value otherwise. + */ +int efa_com_get_dma_width(struct efa_com_dev *edev) +{ + u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + int width; + + width = EFA_GET(&caps, EFA_REGS_CAPS_DMA_ADDR_WIDTH); + + ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width); + + if (width < 32 || width > 64) { + ibdev_err(edev->efa_dev, "DMA width illegal value: %d\n", + width); + return -EINVAL; + } + + edev->dma_addr_bits = width; + + return width; +} + +static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, int on) +{ + u32 val, i; + + for (i = 0; i < timeout; i++) { + val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + + if (EFA_GET(&val, EFA_REGS_DEV_STS_RESET_IN_PROGRESS) == on) + return 0; + + ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val); + msleep(EFA_POLL_INTERVAL_MS); + } + + return -ETIME; +} + +/** + * efa_com_dev_reset - Perform device FLR to the device. + * @edev: EFA communication layer struct + * @reset_reason: Specify what is the trigger for the reset in case of an error. + * + * @return - 0 on success, negative value on failure. 
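 *
 * [Editor's note, not part of the original patch] After the reset below, the
 * admin completion timeout is re-read from the CAPS register. The register
 * field is expressed in units of 100 ms while aq.completion_timeout is kept
 * in microseconds, hence the "timeout * 100000" conversion in the code; for
 * example, a CAPS field of 30 yields 30 * 100 ms = 3 s = 3,000,000 us, and a
 * field of 0 falls back to ADMIN_CMD_TIMEOUT_US.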
+ */ +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason) +{ + u32 stat, timeout, cap; + u32 reset_val = 0; + int err; + + stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); + + if (!EFA_GET(&stat, EFA_REGS_DEV_STS_READY)) { + ibdev_err(edev->efa_dev, + "Device isn't ready, can't reset device\n"); + return -EINVAL; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_RESET_TIMEOUT); + if (!timeout) { + ibdev_err(edev->efa_dev, "Invalid timeout value\n"); + return -EINVAL; + } + + /* start reset */ + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_DEV_RESET, 1); + EFA_SET(&reset_val, EFA_REGS_DEV_CTL_RESET_REASON, reset_reason); + writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + + /* reset clears the mmio readless address, restore it */ + efa_com_mmio_reg_read_resp_addr_init(edev); + + err = wait_for_reset_state(edev, timeout, 1); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n"); + return err; + } + + /* reset done */ + writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); + err = wait_for_reset_state(edev, timeout, 0); + if (err) { + ibdev_err(edev->efa_dev, "Reset indication didn't turn off\n"); + return err; + } + + timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO); + if (timeout) + /* the resolution of timeout reg is 100ms */ + edev->aq.completion_timeout = timeout * 100000; + else + edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US; + + return 0; +} diff --git a/drivers/amazon/net/efa/efa_com.h b/drivers/amazon/net/efa/efa_com.h new file mode 100644 index 0000000000000..3857ec3359f0d --- /dev/null +++ b/drivers/amazon/net/efa/efa_com.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_COM_H_ +#define _EFA_COM_H_ + +#include +#include +#include +#include +#include + +#include +#include "kcompat.h" + +#include "efa_common_defs.h" +#include "efa_admin_defs.h" +#include "efa_admin_cmds_defs.h" +#include "efa_regs_defs.h" + +#define EFA_MAX_HANDLERS 256 + +struct efa_com_admin_cq { + struct efa_admin_acq_entry *entries; + dma_addr_t dma_addr; + spinlock_t lock; /* Protects ACQ */ + + u16 cc; /* consumer counter */ + u8 phase; +}; + +struct efa_com_admin_sq { + struct efa_admin_aq_entry *entries; + dma_addr_t dma_addr; + spinlock_t lock; /* Protects ASQ */ + + u32 __iomem *db_addr; + + u16 cc; /* consumer counter */ + u16 pc; /* producer counter */ + u8 phase; + +}; + +/* Don't use anything other than atomic64 */ +struct efa_com_stats_admin { + atomic64_t submitted_cmd; + atomic64_t completed_cmd; + atomic64_t cmd_err; + atomic64_t no_completion; +}; + +enum { + EFA_AQ_STATE_RUNNING_BIT = 0, + EFA_AQ_STATE_POLLING_BIT = 1, +}; + +struct efa_com_admin_queue { + void *dmadev; + void *efa_dev; + struct efa_comp_ctx *comp_ctx; + u32 completion_timeout; /* usecs */ + u16 poll_interval; /* msecs */ + u16 depth; + struct efa_com_admin_cq cq; + struct efa_com_admin_sq sq; + u16 msix_vector_idx; + + unsigned long state; + + /* Count the number of available admin commands */ + struct semaphore avail_cmds; + + struct efa_com_stats_admin stats; + + spinlock_t comp_ctx_lock; /* Protects completion context pool */ + u32 *comp_ctx_pool; + u16 comp_ctx_pool_next; +}; + +struct efa_aenq_handlers; + +struct efa_com_aenq { + struct efa_admin_aenq_entry *entries; + struct efa_aenq_handlers *aenq_handlers; + dma_addr_t dma_addr; + u32 cc; /* consumer counter */ + u16 msix_vector_idx; + u16 depth; + u8 phase; +}; + +struct efa_com_mmio_read { + struct efa_admin_mmio_req_read_less_resp *read_resp; + dma_addr_t read_resp_dma_addr; + u16 seq_num; + u16 mmio_read_timeout; /* usecs */ + /* serializes mmio reads */ + spinlock_t lock; +}; + +struct efa_com_dev { + struct efa_com_admin_queue aq; + struct efa_com_aenq aenq; + u8 __iomem *reg_bar; + void *dmadev; + void *efa_dev; + u32 supported_features; + u32 dma_addr_bits; + + struct efa_com_mmio_read mmio_read; +}; + +typedef void (*efa_aenq_handler)(void *data, + struct efa_admin_aenq_entry *aenq_e); + +/* Holds aenq handlers. 
Indexed by AENQ event group */ +struct efa_aenq_handlers { + efa_aenq_handler handlers[EFA_MAX_HANDLERS]; + efa_aenq_handler unimplemented_handler; +}; + +int efa_com_admin_init(struct efa_com_dev *edev, + struct efa_aenq_handlers *aenq_handlers); +void efa_com_admin_destroy(struct efa_com_dev *edev); +int efa_com_dev_reset(struct efa_com_dev *edev, + enum efa_regs_reset_reason_types reset_reason); +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling); +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev); +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev); +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev); + +int efa_com_validate_version(struct efa_com_dev *edev); +int efa_com_get_dma_width(struct efa_com_dev *edev); + +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, + struct efa_admin_aq_entry *cmd, + size_t cmd_size, + struct efa_admin_acq_entry *comp, + size_t comp_size); +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data); + +#endif /* _EFA_COM_H_ */ diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c new file mode 100644 index 0000000000000..d2727cddf9703 --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -0,0 +1,781 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_com.h" +#include "efa_com_cmd.h" + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low) +{ + *addr_low = lower_32_bits(addr); + *addr_high = upper_32_bits(addr); +} + +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res) +{ + struct efa_admin_create_qp_cmd create_qp_cmd = {}; + struct efa_admin_create_qp_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_qp_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_QP; + + create_qp_cmd.pd = params->pd; + create_qp_cmd.qp_type = params->qp_type; + create_qp_cmd.rq_base_addr = params->rq_base_addr; + create_qp_cmd.send_cq_idx = params->send_cq_idx; + create_qp_cmd.recv_cq_idx = params->recv_cq_idx; + create_qp_cmd.qp_alloc_size.send_queue_ring_size = + params->sq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.send_queue_depth = + params->sq_depth; + create_qp_cmd.qp_alloc_size.recv_queue_ring_size = + params->rq_ring_size_in_bytes; + create_qp_cmd.qp_alloc_size.recv_queue_depth = + params->rq_depth; + create_qp_cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_qp_cmd, + sizeof(create_qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create qp [%d]\n", err); + return err; + } + + res->qp_handle = cmd_completion.qp_handle; + res->qp_num = cmd_completion.qp_num; + res->sq_db_offset = cmd_completion.sq_db_offset; + res->rq_db_offset = cmd_completion.rq_db_offset; + res->llq_descriptors_offset = cmd_completion.llq_descriptors_offset; + res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx; + res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx; + + return 0; +} + +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_modify_qp_cmd cmd = {}; + struct efa_admin_modify_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_MODIFY_QP; + cmd.modify_mask = 
params->modify_mask; + cmd.qp_handle = params->qp_handle; + cmd.qp_state = params->qp_state; + cmd.cur_qp_state = params->cur_qp_state; + cmd.qkey = params->qkey; + cmd.sq_psn = params->sq_psn; + cmd.sq_drained_async_notify = params->sq_drained_async_notify; + cmd.rnr_retry = params->rnr_retry; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to modify qp-%u modify_mask[%#x] [%d]\n", + cmd.qp_handle, cmd.modify_mask, err); + return err; + } + + return 0; +} + +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_query_qp_cmd cmd = {}; + struct efa_admin_query_qp_resp resp; + int err; + + cmd.aq_common_desc.opcode = EFA_ADMIN_QUERY_QP; + cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to query qp-%u [%d]\n", + cmd.qp_handle, err); + return err; + } + + result->qp_state = resp.qp_state; + result->qkey = resp.qkey; + result->sq_draining = resp.sq_draining; + result->sq_psn = resp.sq_psn; + result->rnr_retry = resp.rnr_retry; + + return 0; +} + +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params) +{ + struct efa_admin_destroy_qp_resp cmd_completion; + struct efa_admin_destroy_qp_cmd qp_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + qp_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP; + qp_cmd.qp_handle = params->qp_handle; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&qp_cmd, + sizeof(qp_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy qp-%u [%d]\n", + qp_cmd.qp_handle, err); + return err; + } + + return 0; +} + +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result) +{ + struct efa_admin_create_cq_resp cmd_completion; + struct efa_admin_create_cq_cmd create_cmd = {}; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ; + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + create_cmd.cq_depth = params->cq_depth; + create_cmd.num_sub_cqs = params->num_sub_cqs; + create_cmd.uar = params->uarn; + + efa_com_set_dma_addr(params->dma_addr, + &create_cmd.cq_ba.mem_addr_high, + &create_cmd.cq_ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&create_cmd, + sizeof(create_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create cq[%d]\n", err); + return err; + } + + result->cq_idx = cmd_completion.cq_idx; + result->actual_depth = params->cq_depth; + + return 0; +} + +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params) +{ + struct efa_admin_destroy_cq_cmd destroy_cmd = {}; + struct efa_admin_destroy_cq_resp destroy_resp; + struct efa_com_admin_queue *aq = &edev->aq; + int err; + + destroy_cmd.cq_idx = params->cq_idx; + destroy_cmd.aq_common_desc.opcode = 
EFA_ADMIN_DESTROY_CQ; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&destroy_cmd, + sizeof(destroy_cmd), + (struct efa_admin_acq_entry *)&destroy_resp, + sizeof(destroy_resp)); + + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy CQ-%u [%d]\n", + params->cq_idx, err); + return err; + } + + return 0; +} + +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result) +{ + struct efa_admin_reg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_reg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR; + mr_cmd.pd = params->pd; + mr_cmd.mr_length = params->mr_length_in_bytes; + EFA_SET(&mr_cmd.flags, EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT, + params->page_shift); + mr_cmd.iova = params->iova; + mr_cmd.permissions = params->permissions; + + if (params->inline_pbl) { + memcpy(mr_cmd.pbl.inline_pbl_array, + params->pbl.inline_pbl_array, + sizeof(mr_cmd.pbl.inline_pbl_array)); + } else { + mr_cmd.pbl.pbl.length = params->pbl.pbl.length; + mr_cmd.pbl.pbl.address.mem_addr_low = + params->pbl.pbl.address.mem_addr_low; + mr_cmd.pbl.pbl.address.mem_addr_high = + params->pbl.pbl.address.mem_addr_high; + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + if (params->indirect) + EFA_SET(&mr_cmd.aq_common_desc.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1); + } + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to register mr [%d]\n", err); + return err; + } + + result->l_key = cmd_completion.l_key; + result->r_key = cmd_completion.r_key; + + return 0; +} + +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params) +{ + struct efa_admin_dereg_mr_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dereg_mr_cmd mr_cmd = {}; + int err; + + mr_cmd.aq_common_desc.opcode = EFA_ADMIN_DEREG_MR; + mr_cmd.l_key = params->l_key; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&mr_cmd, + sizeof(mr_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to de-register mr(lkey-%u) [%d]\n", + mr_cmd.l_key, err); + return err; + } + + return 0; +} + +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result) +{ + struct efa_admin_create_ah_resp cmd_completion; + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_ah_cmd ah_cmd = {}; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_AH; + + memcpy(ah_cmd.dest_addr, params->dest_addr, sizeof(ah_cmd.dest_addr)); + ah_cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create ah for %pI6 [%d]\n", + ah_cmd.dest_addr, err); + return err; + } + + result->ah = cmd_completion.ah; + + return 0; +} + +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params) +{ + struct efa_admin_destroy_ah_resp cmd_completion; + struct efa_admin_destroy_ah_cmd ah_cmd = {}; + struct efa_com_admin_queue *aq = 
&edev->aq; + int err; + + ah_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_AH; + ah_cmd.ah = params->ah; + ah_cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&ah_cmd, + sizeof(ah_cmd), + (struct efa_admin_acq_entry *)&cmd_completion, + sizeof(cmd_completion)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy ah-%d pd-%d [%d]\n", + ah_cmd.ah, ah_cmd.pd, err); + return err; + } + + return 0; +} + +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id) +{ + u32 feature_mask = 1 << feature_id; + + /* Device attributes is always supported */ + if (feature_id != EFA_ADMIN_DEVICE_ATTR && + !(edev->supported_features & feature_mask)) + return false; + + return true; +} + +static int efa_com_get_feature_ex(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_admin_get_feature_cmd get_cmd = {}; + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + aq = &edev->aq; + + get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE; + + if (control_buff_size) + EFA_SET(&get_cmd.aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + + efa_com_set_dma_addr(control_buf_dma_addr, + &get_cmd.control_buffer.address.mem_addr_high, + &get_cmd.control_buffer.address.mem_addr_low); + + get_cmd.control_buffer.length = control_buff_size; + get_cmd.feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *) + &get_cmd, + sizeof(get_cmd), + (struct efa_admin_acq_entry *) + get_resp, + sizeof(*get_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit get_feature command %d [%d]\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_get_feature(struct efa_com_dev *edev, + struct efa_admin_get_feature_resp *get_resp, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_get_feature_ex(edev, get_resp, feature_id, 0, 0); +} + +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_DEVICE_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get device attributes %d\n", + err); + return err; + } + + result->page_size_cap = resp.u.device_attr.page_size_cap; + result->fw_version = resp.u.device_attr.fw_version; + result->admin_api_version = resp.u.device_attr.admin_api_version; + result->device_version = resp.u.device_attr.device_version; + result->supported_features = resp.u.device_attr.supported_features; + result->phys_addr_width = resp.u.device_attr.phys_addr_width; + result->virt_addr_width = resp.u.device_attr.virt_addr_width; + result->db_bar = resp.u.device_attr.db_bar; + result->max_rdma_size = resp.u.device_attr.max_rdma_size; + result->device_caps = resp.u.device_attr.device_caps; + + if (result->admin_api_version < 1) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get device attr api version [%u < 1]\n", + result->admin_api_version); + return -EINVAL; + } + + edev->supported_features = resp.u.device_attr.supported_features; + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_QUEUE_ATTR); + 
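	/*
	 * [Editor's note, not part of the original patch]
	 * edev->supported_features was cached just above from the DEVICE_ATTR
	 * response; efa_com_check_supported_feature_id() gates every further
	 * get/set feature call (including this EFA_ADMIN_QUEUE_ATTR query) on
	 * BIT(feature_id) being set in that bitmap, with EFA_ADMIN_DEVICE_ATTR
	 * itself being the one feature that is always allowed.
	 */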
if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get queue attributes %d\n", + err); + return err; + } + + result->max_qp = resp.u.queue_attr.max_qp; + result->max_sq_depth = resp.u.queue_attr.max_sq_depth; + result->max_rq_depth = resp.u.queue_attr.max_rq_depth; + result->max_cq = resp.u.queue_attr.max_cq; + result->max_cq_depth = resp.u.queue_attr.max_cq_depth; + result->inline_buf_size = resp.u.queue_attr.inline_buf_size; + result->max_sq_sge = resp.u.queue_attr.max_wr_send_sges; + result->max_rq_sge = resp.u.queue_attr.max_wr_recv_sges; + result->max_mr = resp.u.queue_attr.max_mr; + result->max_mr_pages = resp.u.queue_attr.max_mr_pages; + result->max_pd = resp.u.queue_attr.max_pd; + result->max_ah = resp.u.queue_attr.max_ah; + result->max_llq_size = resp.u.queue_attr.max_llq_size; + result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; + result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; + result->max_tx_batch = resp.u.queue_attr.max_tx_batch; + result->min_sq_depth = resp.u.queue_attr.min_sq_depth; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get network attributes %d\n", + err); + return err; + } + + memcpy(result->addr, resp.u.network_attr.addr, + sizeof(resp.u.network_attr.addr)); + result->mtu = resp.u.network_attr.mtu; + + return 0; +} + +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result) +{ + struct efa_admin_get_feature_resp resp; + int err; + + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_HW_HINTS); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get hw hints %d\n", err); + return err; + } + + result->admin_completion_timeout = resp.u.hw_hints.admin_completion_timeout; + result->driver_watchdog_timeout = resp.u.hw_hints.driver_watchdog_timeout; + result->mmio_read_timeout = resp.u.hw_hints.mmio_read_timeout; + result->poll_interval = resp.u.hw_hints.poll_interval; + + return 0; +} + +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) +{ + struct efa_com_admin_queue *aq; + int err; + + if (!efa_com_check_supported_feature_id(edev, feature_id)) { + ibdev_err_ratelimited(edev->efa_dev, + "Feature %d isn't supported\n", + feature_id); + return -EOPNOTSUPP; + } + + aq = &edev->aq; + + set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE; + if (control_buff_size) { + set_cmd->aq_common_descriptor.flags = 0; + EFA_SET(&set_cmd->aq_common_descriptor.flags, + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); + efa_com_set_dma_addr(control_buf_dma_addr, + &set_cmd->control_buffer.address.mem_addr_high, + &set_cmd->control_buffer.address.mem_addr_low); + } + + set_cmd->control_buffer.length = control_buff_size; + set_cmd->feature_common.feature_id = feature_id; + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)set_cmd, + sizeof(*set_cmd), + (struct efa_admin_acq_entry *)set_resp, + sizeof(*set_resp)); + + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to submit set_feature command %d error: %d\n", + feature_id, err); + return err; + } + + return 0; +} + +static int efa_com_set_feature(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id) +{ + return efa_com_set_feature_ex(edev, 
set_resp, set_cmd, feature_id, + 0, 0); +} + +int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups) +{ + struct efa_admin_get_feature_resp get_resp; + struct efa_admin_set_feature_resp set_resp; + struct efa_admin_set_feature_cmd cmd = {}; + int err; + + ibdev_dbg(edev->efa_dev, "Configuring aenq with groups[%#x]\n", groups); + + err = efa_com_get_feature(edev, &get_resp, EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to get aenq attributes: %d\n", + err); + return err; + } + + ibdev_dbg(edev->efa_dev, + "Get aenq groups: supported[%#x] enabled[%#x]\n", + get_resp.u.aenq.supported_groups, + get_resp.u.aenq.enabled_groups); + + if ((get_resp.u.aenq.supported_groups & groups) != groups) { + ibdev_err_ratelimited( + edev->efa_dev, + "Trying to set unsupported aenq groups[%#x] supported[%#x]\n", + groups, get_resp.u.aenq.supported_groups); + return -EOPNOTSUPP; + } + + cmd.u.aenq.enabled_groups = groups; + err = efa_com_set_feature(edev, &set_resp, &cmd, + EFA_ADMIN_AENQ_CONFIG); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to set aenq attributes: %d\n", + err); + return err; + } + + return 0; +} + +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_pd_cmd cmd = {}; + struct efa_admin_alloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_PD; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate pd[%d]\n", err); + return err; + } + + result->pdn = resp.pd; + + return 0; +} + +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_pd_cmd cmd = {}; + struct efa_admin_dealloc_pd_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_PD; + cmd.pd = params->pdn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate pd-%u [%d]\n", + cmd.pd, err); + return err; + } + + return 0; +} + +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_alloc_uar_cmd cmd = {}; + struct efa_admin_alloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_UAR; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to allocate uar[%d]\n", err); + return err; + } + + result->uarn = resp.uar; + + return 0; +} + +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_dealloc_uar_cmd cmd = {}; + struct efa_admin_dealloc_uar_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_UAR; + cmd.uar = params->uarn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to deallocate uar-%u [%d]\n", + cmd.uar, err); + return err; + 
} + + return 0; +} + +#ifdef HAVE_HW_STATS +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_aq_get_stats_cmd cmd = {}; + struct efa_admin_acq_get_stats_resp resp; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_STATS; + cmd.type = params->type; + cmd.scope = params->scope; + cmd.scope_modifier = params->scope_modifier; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get stats type-%u scope-%u.%u [%d]\n", + cmd.type, cmd.scope, cmd.scope_modifier, err); + return err; + } + + switch (cmd.type) { + case EFA_ADMIN_GET_STATS_TYPE_BASIC: + result->basic_stats.tx_bytes = resp.u.basic_stats.tx_bytes; + result->basic_stats.tx_pkts = resp.u.basic_stats.tx_pkts; + result->basic_stats.rx_bytes = resp.u.basic_stats.rx_bytes; + result->basic_stats.rx_pkts = resp.u.basic_stats.rx_pkts; + result->basic_stats.rx_drops = resp.u.basic_stats.rx_drops; + break; + case EFA_ADMIN_GET_STATS_TYPE_MESSAGES: + result->messages_stats.send_bytes = resp.u.messages_stats.send_bytes; + result->messages_stats.send_wrs = resp.u.messages_stats.send_wrs; + result->messages_stats.recv_bytes = resp.u.messages_stats.recv_bytes; + result->messages_stats.recv_wrs = resp.u.messages_stats.recv_wrs; + break; + case EFA_ADMIN_GET_STATS_TYPE_RDMA_READ: + result->rdma_read_stats.read_wrs = resp.u.rdma_read_stats.read_wrs; + result->rdma_read_stats.read_bytes = resp.u.rdma_read_stats.read_bytes; + result->rdma_read_stats.read_wr_err = resp.u.rdma_read_stats.read_wr_err; + result->rdma_read_stats.read_resp_bytes = resp.u.rdma_read_stats.read_resp_bytes; + break; + } + + return 0; +} +#endif diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h new file mode 100644 index 0000000000000..e572146af876e --- /dev/null +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_COM_CMD_H_ +#define _EFA_COM_CMD_H_ + +#include "efa_com.h" + +#define EFA_GID_SIZE 16 + +struct efa_com_create_qp_params { + u64 rq_base_addr; + u32 send_cq_idx; + u32 recv_cq_idx; + /* + * Send descriptor ring size in bytes, + * sufficient for user-provided number of WQEs and SGL size + */ + u32 sq_ring_size_in_bytes; + /* Max number of WQEs that will be posted on send queue */ + u32 sq_depth; + /* Recv descriptor ring size in bytes */ + u32 rq_ring_size_in_bytes; + u32 rq_depth; + u16 pd; + u16 uarn; + u8 qp_type; +}; + +struct efa_com_create_qp_result { + u32 qp_handle; + u32 qp_num; + u32 sq_db_offset; + u32 rq_db_offset; + u32 llq_descriptors_offset; + u16 send_sub_cq_idx; + u16 recv_sub_cq_idx; +}; + +struct efa_com_modify_qp_params { + u32 modify_mask; + u32 qp_handle; + u32 qp_state; + u32 cur_qp_state; + u32 qkey; + u32 sq_psn; + u8 sq_drained_async_notify; + u8 rnr_retry; +}; + +struct efa_com_query_qp_params { + u32 qp_handle; +}; + +struct efa_com_query_qp_result { + u32 qp_state; + u32 qkey; + u32 sq_draining; + u32 sq_psn; + u8 rnr_retry; +}; + +struct efa_com_destroy_qp_params { + u32 qp_handle; +}; + +struct efa_com_create_cq_params { + /* cq physical base address in OS memory */ + dma_addr_t dma_addr; + /* completion queue depth in # of entries */ + u16 cq_depth; + u16 num_sub_cqs; + u16 uarn; + u8 entry_size_in_bytes; +}; + +struct efa_com_create_cq_result { + /* cq identifier */ + u16 cq_idx; + /* actual cq depth in # of entries */ + u16 actual_depth; +}; + +struct efa_com_destroy_cq_params { + u16 cq_idx; +}; + +struct efa_com_create_ah_params { + u16 pdn; + /* Destination address in network byte order */ + u8 dest_addr[EFA_GID_SIZE]; +}; + +struct efa_com_create_ah_result { + u16 ah; +}; + +struct efa_com_destroy_ah_params { + u16 ah; + u16 pdn; +}; + +struct efa_com_get_device_attr_result { + u8 addr[EFA_GID_SIZE]; + u64 page_size_cap; + u64 max_mr_pages; + u32 mtu; + u32 fw_version; + u32 admin_api_version; + u32 device_version; + u32 supported_features; + u32 phys_addr_width; + u32 virt_addr_width; + u32 max_qp; + u32 max_sq_depth; /* wqes */ + u32 max_rq_depth; /* wqes */ + u32 max_cq; + u32 max_cq_depth; /* cqes */ + u32 inline_buf_size; + u32 max_mr; + u32 max_pd; + u32 max_ah; + u32 max_llq_size; + u32 max_rdma_size; + u32 device_caps; + u16 sub_cqs_per_cq; + u16 max_sq_sge; + u16 max_rq_sge; + u16 max_wr_rdma_sge; + u16 max_tx_batch; + u16 min_sq_depth; + u8 db_bar; +}; + +struct efa_com_get_hw_hints_result { + u16 mmio_read_timeout; + u16 driver_watchdog_timeout; + u16 admin_completion_timeout; + u16 poll_interval; + u32 reserved[4]; +}; + +struct efa_com_mem_addr { + u32 mem_addr_low; + u32 mem_addr_high; +}; + +/* Used at indirect mode page list chunks for chaining */ +struct efa_com_ctrl_buff_info { + /* indicates length of the buffer pointed by control_buffer_address. */ + u32 length; + /* points to control buffer (direct or indirect) */ + struct efa_com_mem_addr address; +}; + +struct efa_com_reg_mr_params { + /* Memory region length, in bytes. */ + u64 mr_length_in_bytes; + /* IO Virtual Address associated with this MR. */ + u64 iova; + /* words 8:15: Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of physical addresses of app pages + * (optimization for short region reservations) + */ + u64 inline_pbl_array[4]; + /* + * Describes the next physically contiguous chunk of indirect + * page list. A page list contains physical addresses of command + * data pages. 
Data pages are 4KB; page list chunks are + * variable-sized. + */ + struct efa_com_ctrl_buff_info pbl; + } pbl; + /* number of pages in PBL (redundant, could be calculated) */ + u32 page_num; + /* Protection Domain */ + u16 pd; + /* + * phys_page_size_shift - page size is (1 << phys_page_size_shift) + * Page size is used for building the Virtual to Physical + * address mapping + */ + u8 page_shift; + /* see permissions field of struct efa_admin_reg_mr_cmd */ + u8 permissions; + u8 inline_pbl; + u8 indirect; +}; + +struct efa_com_reg_mr_result { + /* + * To be used in conjunction with local buffers references in SQ and + * RQ WQE + */ + u32 l_key; + /* + * To be used in incoming RDMA semantics messages to refer to remotely + * accessed memory region + */ + u32 r_key; +}; + +struct efa_com_dereg_mr_params { + u32 l_key; +}; + +struct efa_com_alloc_pd_result { + u16 pdn; +}; + +struct efa_com_dealloc_pd_params { + u16 pdn; +}; + +struct efa_com_alloc_uar_result { + u16 uarn; +}; + +struct efa_com_dealloc_uar_params { + u16 uarn; +}; + +#ifdef HAVE_HW_STATS +struct efa_com_get_stats_params { + /* see enum efa_admin_get_stats_type */ + u8 type; + /* see enum efa_admin_get_stats_scope */ + u8 scope; + u16 scope_modifier; +}; + +struct efa_com_basic_stats { + u64 tx_bytes; + u64 tx_pkts; + u64 rx_bytes; + u64 rx_pkts; + u64 rx_drops; +}; + +struct efa_com_messages_stats { + u64 send_bytes; + u64 send_wrs; + u64 recv_bytes; + u64 recv_wrs; +}; + +struct efa_com_rdma_read_stats { + u64 read_wrs; + u64 read_bytes; + u64 read_wr_err; + u64 read_resp_bytes; +}; + +union efa_com_get_stats_result { + struct efa_com_basic_stats basic_stats; + struct efa_com_messages_stats messages_stats; + struct efa_com_rdma_read_stats rdma_read_stats; +}; +#endif + +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); +int efa_com_create_qp(struct efa_com_dev *edev, + struct efa_com_create_qp_params *params, + struct efa_com_create_qp_result *res); +int efa_com_modify_qp(struct efa_com_dev *edev, + struct efa_com_modify_qp_params *params); +int efa_com_query_qp(struct efa_com_dev *edev, + struct efa_com_query_qp_params *params, + struct efa_com_query_qp_result *result); +int efa_com_destroy_qp(struct efa_com_dev *edev, + struct efa_com_destroy_qp_params *params); +int efa_com_create_cq(struct efa_com_dev *edev, + struct efa_com_create_cq_params *params, + struct efa_com_create_cq_result *result); +int efa_com_destroy_cq(struct efa_com_dev *edev, + struct efa_com_destroy_cq_params *params); +int efa_com_register_mr(struct efa_com_dev *edev, + struct efa_com_reg_mr_params *params, + struct efa_com_reg_mr_result *result); +int efa_com_dereg_mr(struct efa_com_dev *edev, + struct efa_com_dereg_mr_params *params); +int efa_com_create_ah(struct efa_com_dev *edev, + struct efa_com_create_ah_params *params, + struct efa_com_create_ah_result *result); +int efa_com_destroy_ah(struct efa_com_dev *edev, + struct efa_com_destroy_ah_params *params); +int efa_com_get_device_attr(struct efa_com_dev *edev, + struct efa_com_get_device_attr_result *result); +int efa_com_get_hw_hints(struct efa_com_dev *edev, + struct efa_com_get_hw_hints_result *result); +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id); +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size); +int 
efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups); +int efa_com_alloc_pd(struct efa_com_dev *edev, + struct efa_com_alloc_pd_result *result); +int efa_com_dealloc_pd(struct efa_com_dev *edev, + struct efa_com_dealloc_pd_params *params); +int efa_com_alloc_uar(struct efa_com_dev *edev, + struct efa_com_alloc_uar_result *result); +int efa_com_dealloc_uar(struct efa_com_dev *edev, + struct efa_com_dealloc_uar_params *params); +#ifdef HAVE_HW_STATS +int efa_com_get_stats(struct efa_com_dev *edev, + struct efa_com_get_stats_params *params, + union efa_com_get_stats_result *result); +#endif + +#endif /* _EFA_COM_CMD_H_ */ diff --git a/drivers/amazon/net/efa/efa_common_defs.h b/drivers/amazon/net/efa/efa_common_defs.h new file mode 100644 index 0000000000000..bbcf48f0eaca4 --- /dev/null +++ b/drivers/amazon/net/efa/efa_common_defs.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_COMMON_H_ +#define _EFA_COMMON_H_ + +#ifdef HAVE_BITFIELD_H +#include +#endif + +#define EFA_COMMON_SPEC_VERSION_MAJOR 2 +#define EFA_COMMON_SPEC_VERSION_MINOR 0 + +#define EFA_GET(ptr, mask) FIELD_GET(mask##_MASK, *(ptr)) + +#define EFA_SET(ptr, mask, value) \ + ({ \ + typeof(ptr) _ptr = ptr; \ + *_ptr = (*_ptr & ~(mask##_MASK)) | \ + FIELD_PREP(mask##_MASK, value); \ + }) + +struct efa_common_mem_addr { + u32 mem_addr_low; + + u32 mem_addr_high; +}; + +#endif /* _EFA_COMMON_H_ */ diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c new file mode 100644 index 0000000000000..5ec34afb6571a --- /dev/null +++ b/drivers/amazon/net/efa/efa_gdr.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_gdr.h" + +#define GPU_PAGE_SHIFT 16 +#define GPU_PAGE_SIZE BIT_ULL(GPU_PAGE_SHIFT) + +static struct mutex nvmem_list_lock; +static struct list_head nvmem_list; +static atomic64_t next_nvmem_ticket; + +void nvmem_init(void) +{ + mutex_init(&nvmem_list_lock); + INIT_LIST_HEAD(&nvmem_list); + /* + * Ideally, first ticket would be zero, but that would make callback + * data NULL which is invalid. 
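 *
 * [Editor's illustrative note, not part of the original patch] The ticket is
 * the only state handed to the NVIDIA peer-to-peer free callback, so it must
 * survive a round trip through an opaque pointer. As the code further down
 * in this file shows, the registration side passes
 *
 *	nvidia_p2p_get_pages(0, 0, addr, size, &nvmem->pgtbl,
 *			     nvmem_free_cb, (void *)nvmem->ticket);
 *
 * and the callback recovers it with nvmem_put((u64)data, true). Starting the
 * counter at 1 keeps that pointer non-NULL for the first registration.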
+ */ + atomic64_set(&next_nvmem_ticket, 1); +} + +static int nvmem_pgsz(enum nvidia_p2p_page_size_type pgszt) +{ + switch (pgszt) { + case NVIDIA_P2P_PAGE_SIZE_4KB: + return SZ_4K; + case NVIDIA_P2P_PAGE_SIZE_64KB: + return SZ_64K; + case NVIDIA_P2P_PAGE_SIZE_128KB: + return SZ_128K; + default: + return 0; + } +} + +static struct efa_nvmem *ticket_to_nvmem(u64 ticket) +{ + struct efa_nvmem *nvmem; + + lockdep_assert_held(&nvmem_list_lock); + list_for_each_entry(nvmem, &nvmem_list, list) { + if (nvmem->ticket == ticket) + return nvmem; + } + + return NULL; +} + +int nvmem_put(u64 ticket, bool in_cb) +{ + struct efa_com_dereg_mr_params params = {}; + struct efa_nvmem *nvmem; + struct efa_dev *dev; + int err; + + mutex_lock(&nvmem_list_lock); + nvmem = ticket_to_nvmem(ticket); + if (!nvmem) { + pr_debug("Ticket %llu not found in the nvmem list\n", ticket); + mutex_unlock(&nvmem_list_lock); + return 0; + } + + dev = nvmem->dev; + if (nvmem->needs_dereg) { + params.l_key = nvmem->lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) { + mutex_unlock(&nvmem_list_lock); + return err; + } + nvmem->needs_dereg = false; + } + + nvmem_release(dev, nvmem, in_cb); + + /* Dereg is the last nvmem consumer, delete the ticket */ + if (!in_cb) { + list_del(&nvmem->list); + kfree(nvmem); + } + mutex_unlock(&nvmem_list_lock); + + return 0; +} + +static void nvmem_free_cb(void *data) +{ + pr_debug("Free callback ticket %llu\n", (u64)data); + nvmem_put((u64)data, true); +} + +static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem, + u64 addr, u64 size) +{ + int err; + + err = nvidia_p2p_get_pages(0, 0, addr, size, &nvmem->pgtbl, + nvmem_free_cb, (void *)nvmem->ticket); + if (err) { + ibdev_dbg(&dev->ibdev, "nvidia_p2p_get_pages failed %d\n", err); + return err; + } + + if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(nvmem->pgtbl)) { + ibdev_dbg(&dev->ibdev, "Incompatible page table version %#08x\n", + nvmem->pgtbl->version); + nvidia_p2p_put_pages(0, 0, addr, nvmem->pgtbl); + nvmem->pgtbl = NULL; + return -EINVAL; + } + + return 0; +} + +static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem) +{ + int err; + + err = nvidia_p2p_dma_map_pages(dev->pdev, nvmem->pgtbl, + &nvmem->dma_mapping); + if (err) { + ibdev_dbg(&dev->ibdev, "nvidia_p2p_dma_map_pages failed %d\n", + err); + return err; + } + + if (!NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(nvmem->dma_mapping)) { + ibdev_dbg(&dev->ibdev, "Incompatible DMA mapping version %#08x\n", + nvmem->dma_mapping->version); + nvidia_p2p_dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->dma_mapping = NULL; + return -EINVAL; + } + + return 0; +} + +struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length, unsigned int *pgsz) +{ + struct efa_nvmem *nvmem; + u64 virt_start; + u64 pinsz; + int err; + + nvmem = kzalloc(sizeof(*nvmem), GFP_KERNEL); + if (!nvmem) + return NULL; + + nvmem->ticket = atomic64_fetch_inc(&next_nvmem_ticket); + mr->nvmem_ticket = nvmem->ticket; + nvmem->dev = dev; + virt_start = ALIGN_DOWN(start, GPU_PAGE_SIZE); + pinsz = start + length - virt_start; + nvmem->virt_start = virt_start; + + err = nvmem_get_pages(dev, nvmem, virt_start, pinsz); + if (err) { + /* Most likely cpu pages */ + goto err_free; + } + + err = nvmem_dma_map(dev, nvmem); + if (err) + goto err_put; + + *pgsz = nvmem_pgsz(nvmem->pgtbl->page_size); + if (!*pgsz) + goto err_unmap; + + mutex_lock(&nvmem_list_lock); + list_add(&nvmem->list, &nvmem_list); + mutex_unlock(&nvmem_list_lock); + + return 
nvmem; + +err_unmap: + nvidia_p2p_dma_unmap_pages(dev->pdev, nvmem->pgtbl, nvmem->dma_mapping); +err_put: + nvidia_p2p_put_pages(0, 0, start, nvmem->pgtbl); +err_free: + kfree(nvmem); + return NULL; +} + +int nvmem_to_page_list(struct efa_dev *dev, struct efa_nvmem *nvmem, + u64 *page_list) +{ + struct nvidia_p2p_dma_mapping *dma_mapping = nvmem->dma_mapping; + int i; + + for (i = 0; i < dma_mapping->entries; i++) + page_list[i] = dma_mapping->dma_addresses[i]; + + return 0; +} + +void nvmem_release(struct efa_dev *dev, struct efa_nvmem *nvmem, bool in_cb) +{ + if (in_cb) { + if (nvmem->dma_mapping) { + nvidia_p2p_free_dma_mapping(nvmem->dma_mapping); + nvmem->dma_mapping = NULL; + } + + if (nvmem->pgtbl) { + nvidia_p2p_free_page_table(nvmem->pgtbl); + nvmem->pgtbl = NULL; + } + } else { + if (nvmem->dma_mapping) + nvidia_p2p_dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + + if (nvmem->pgtbl) + nvidia_p2p_put_pages(0, 0, nvmem->virt_start, + nvmem->pgtbl); + } +} diff --git a/drivers/amazon/net/efa/efa_gdr.h b/drivers/amazon/net/efa/efa_gdr.h new file mode 100644 index 0000000000000..497307c6305da --- /dev/null +++ b/drivers/amazon/net/efa/efa_gdr.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_GDR_H_ +#define _EFA_GDR_H_ + +#include "efa.h" +#include "nv-p2p.h" + +struct efa_nvmem { + struct efa_dev *dev; + struct nvidia_p2p_page_table *pgtbl; + struct nvidia_p2p_dma_mapping *dma_mapping; + u64 virt_start; + u64 ticket; + u32 lkey; + bool needs_dereg; + struct list_head list; /* member of nvmem_list */ +}; + +void nvmem_init(void); +struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length, unsigned int *pgsz); +int nvmem_to_page_list(struct efa_dev *dev, struct efa_nvmem *nvmem, + u64 *page_list); +int nvmem_put(u64 ticket, bool in_cb); +void nvmem_release(struct efa_dev *dev, struct efa_nvmem *nvmem, bool in_cb); + +#endif /* _EFA_GDR_H_ */ diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c new file mode 100644 index 0000000000000..2ca15287819d3 --- /dev/null +++ b/drivers/amazon/net/efa/efa_main.c @@ -0,0 +1,951 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include + +#include + +#include "efa.h" +#include "efa_sysfs.h" + +#ifdef HAVE_EFA_GDR +#include "efa_gdr.h" +#endif + +#ifndef HAVE_PCI_VENDOR_ID_AMAZON +#define PCI_VENDOR_ID_AMAZON 0x1d0f +#endif +#define PCI_DEV_ID_EFA0_VF 0xefa0 +#define PCI_DEV_ID_EFA1_VF 0xefa1 + +static const struct pci_device_id efa_pci_tbl[] = { + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, + { } +}; + +#define DRV_MODULE_VER_MAJOR 1 +#define DRV_MODULE_VER_MINOR 11 +#define DRV_MODULE_VER_SUBMINOR 1 + +#ifndef DRV_MODULE_VERSION +#define DRV_MODULE_VERSION \ + __stringify(DRV_MODULE_VER_MAJOR) "." \ + __stringify(DRV_MODULE_VER_MINOR) "." \ + __stringify(DRV_MODULE_VER_SUBMINOR) "g" +#endif + +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_SOFTDEP("pre: ib_uverbs"); + +static char version[] = DEVICE_NAME " v" DRV_MODULE_VERSION; + +MODULE_AUTHOR("Amazon.com, Inc. 
or its affiliates"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION(DEVICE_NAME); +MODULE_DEVICE_TABLE(pci, efa_pci_tbl); +#ifdef HAVE_EFA_GDR +MODULE_INFO(gdr, "Y"); +#endif + +#define EFA_REG_BAR 0 +#define EFA_MEM_BAR 2 +#define EFA_BASE_BAR_MASK (BIT(EFA_REG_BAR) | BIT(EFA_MEM_BAR)) + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +#ifdef HAVE_CUSTOM_COMMANDS +#define EFA_EVERBS_DEVICE_NAME "efa_everbs" +#define EFA_EVERBS_MAX_DEVICES 64 + +static struct class *efa_everbs_class; +static unsigned int efa_everbs_major; + +static int efa_everbs_dev_init(struct efa_dev *dev, int devnum); +static void efa_everbs_dev_destroy(struct efa_dev *dev); +#endif + +/* This handler will called for unknown event group or unimplemented handlers */ +static void unimplemented_aenq_handler(void *data, + struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + ibdev_err(&dev->ibdev, + "Unknown event was received or event with unimplemented handler\n"); +} + +static void efa_keep_alive(void *data, struct efa_admin_aenq_entry *aenq_e) +{ + struct efa_dev *dev = (struct efa_dev *)data; + + atomic64_inc(&dev->stats.keep_alive_rcvd); +} + +static struct efa_aenq_handlers aenq_handlers = { + .handlers = { + [EFA_ADMIN_KEEP_ALIVE] = efa_keep_alive, + }, + .unimplemented_handler = unimplemented_aenq_handler +}; + +static void efa_release_bars(struct efa_dev *dev, int bars_mask) +{ + struct pci_dev *pdev = dev->pdev; + int release_bars; + + release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & bars_mask; + pci_release_selected_regions(pdev, release_bars); +} + +static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data) +{ + struct efa_dev *dev = data; + + efa_com_admin_q_comp_intr_handler(&dev->edev); + efa_com_aenq_intr_handler(&dev->edev, data); + + return IRQ_HANDLED; +} + +static int efa_request_mgmnt_irq(struct efa_dev *dev) +{ + struct efa_irq *irq; + int err; + + irq = &dev->admin_irq; + err = request_irq(irq->vector, irq->handler, 0, irq->name, + irq->data); + if (err) { + dev_err(&dev->pdev->dev, "Failed to request admin irq (%d)\n", + err); + return err; + } + + dev_dbg(&dev->pdev->dev, "Set affinity hint of mgmnt irq to %*pbl (irq vector: %d)\n", + nr_cpumask_bits, &irq->affinity_hint_mask, irq->vector); + irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask); + + return 0; +} + +static void efa_setup_mgmnt_irq(struct efa_dev *dev) +{ + u32 cpu; + + snprintf(dev->admin_irq.name, EFA_IRQNAME_SIZE, + "efa-mgmnt@pci:%s", pci_name(dev->pdev)); + dev->admin_irq.handler = efa_intr_msix_mgmnt; + dev->admin_irq.data = dev; + dev->admin_irq.vector = +#ifndef HAVE_PCI_IRQ_VECTOR + dev->admin_msix_entry.vector; +#else + pci_irq_vector(dev->pdev, dev->admin_msix_vector_idx); +#endif + cpu = cpumask_first(cpu_online_mask); + dev->admin_irq.cpu = cpu; + cpumask_set_cpu(cpu, + &dev->admin_irq.affinity_hint_mask); + dev_info(&dev->pdev->dev, "Setup irq:0x%p vector:%d name:%s\n", + &dev->admin_irq, + dev->admin_irq.vector, + dev->admin_irq.name); +} + +static void efa_free_mgmnt_irq(struct efa_dev *dev) +{ + struct efa_irq *irq; + + irq = &dev->admin_irq; + irq_set_affinity_hint(irq->vector, NULL); + free_irq(irq->vector, irq->data); +} + +static int efa_set_mgmnt_irq(struct efa_dev *dev) +{ + efa_setup_mgmnt_irq(dev); + + return efa_request_mgmnt_irq(dev); +} + +static int efa_request_doorbell_bar(struct efa_dev *dev) +{ + u8 db_bar_idx = dev->dev_attr.db_bar; + 
struct pci_dev *pdev = dev->pdev; + int bars; + int err; + + if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { + bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&dev->pdev->dev, + "pci_request_selected_regions for bar %d failed %d\n", + db_bar_idx, err); + return err; + } + } + + dev->db_bar_addr = pci_resource_start(dev->pdev, db_bar_idx); + dev->db_bar_len = pci_resource_len(dev->pdev, db_bar_idx); + + return 0; +} + +static void efa_release_doorbell_bar(struct efa_dev *dev) +{ + if (!(BIT(dev->dev_attr.db_bar) & EFA_BASE_BAR_MASK)) + efa_release_bars(dev, BIT(dev->dev_attr.db_bar)); +} + +static void efa_update_hw_hints(struct efa_dev *dev, + struct efa_com_get_hw_hints_result *hw_hints) +{ + struct efa_com_dev *edev = &dev->edev; + + if (hw_hints->mmio_read_timeout) + edev->mmio_read.mmio_read_timeout = + hw_hints->mmio_read_timeout * 1000; + + if (hw_hints->poll_interval) + edev->aq.poll_interval = hw_hints->poll_interval; + + if (hw_hints->admin_completion_timeout) + edev->aq.completion_timeout = + hw_hints->admin_completion_timeout; +} + +static void efa_stats_init(struct efa_dev *dev) +{ + atomic64_t *s = (atomic64_t *)&dev->stats; + int i; + + for (i = 0; i < sizeof(dev->stats) / sizeof(*s); i++, s++) + atomic64_set(s, 0); +} + +static void efa_set_host_info(struct efa_dev *dev) +{ + struct efa_admin_set_feature_resp resp = {}; + struct efa_admin_set_feature_cmd cmd = {}; + struct efa_admin_host_info *hinf; + u32 bufsz = sizeof(*hinf); + dma_addr_t hinf_dma; + + if (!efa_com_check_supported_feature_id(&dev->edev, + EFA_ADMIN_HOST_INFO)) + return; + + /* Failures in host info set shall not disturb probe */ + hinf = dma_alloc_coherent(&dev->pdev->dev, bufsz, &hinf_dma, + GFP_KERNEL); + if (!hinf) + return; + + strlcpy(hinf->os_dist_str, utsname()->release, + min(sizeof(hinf->os_dist_str), sizeof(utsname()->release))); + hinf->os_type = EFA_ADMIN_OS_LINUX; + strlcpy(hinf->kernel_ver_str, utsname()->version, + min(sizeof(hinf->kernel_ver_str), sizeof(utsname()->version))); + hinf->kernel_ver = LINUX_VERSION_CODE; + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, + DRV_MODULE_VER_MAJOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, + DRV_MODULE_VER_MINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, + DRV_MODULE_VER_SUBMINOR); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, + "g"[0]); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE, + PCI_SLOT(dev->pdev->devfn)); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_FUNCTION, + PCI_FUNC(dev->pdev->devfn)); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MAJOR, + EFA_COMMON_SPEC_VERSION_MAJOR); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, + EFA_COMMON_SPEC_VERSION_MINOR); +#ifdef HAVE_EFA_GDR + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 1); +#endif + + efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO, + hinf_dma, bufsz); + + dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma); +} + +#ifdef HAVE_IB_DEV_OPS +static const struct ib_device_ops efa_dev_ops = { +#ifdef HAVE_IB_DEVICE_OPS_COMMON + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_EFA, + .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, +#endif + + .alloc_hw_stats = efa_alloc_hw_stats, +#ifdef HAVE_PD_CORE_ALLOCATION + .alloc_pd = efa_alloc_pd, +#else + .alloc_pd = efa_kzalloc_pd, +#endif +#ifdef 
HAVE_UCONTEXT_CORE_ALLOCATION + .alloc_ucontext = efa_alloc_ucontext, +#else + .alloc_ucontext = efa_kzalloc_ucontext, +#endif +#ifdef HAVE_AH_CORE_ALLOCATION + .create_ah = efa_create_ah, +#else + .create_ah = efa_kzalloc_ah, +#endif +#ifdef HAVE_CQ_CORE_ALLOCATION + .create_cq = efa_create_cq, +#else + .create_cq = efa_kzalloc_cq, +#endif + .create_qp = efa_create_qp, +#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + .create_user_ah = efa_create_ah, +#endif + .dealloc_pd = efa_dealloc_pd, + .dealloc_ucontext = efa_dealloc_ucontext, + .dereg_mr = efa_dereg_mr, + .destroy_ah = efa_destroy_ah, + .destroy_cq = efa_destroy_cq, + .destroy_qp = efa_destroy_qp, +#ifndef HAVE_NO_KVERBS_DRIVERS + .get_dma_mr = efa_get_dma_mr, +#endif + .get_hw_stats = efa_get_hw_stats, + .get_link_layer = efa_port_link_layer, + .get_port_immutable = efa_get_port_immutable, + .mmap = efa_mmap, +#ifdef HAVE_CORE_MMAP_XA + .mmap_free = efa_mmap_free, +#endif + .modify_qp = efa_modify_qp, +#ifndef HAVE_NO_KVERBS_DRIVERS + .poll_cq = efa_poll_cq, + .post_recv = efa_post_recv, + .post_send = efa_post_send, +#endif + .query_device = efa_query_device, + .query_gid = efa_query_gid, + .query_pkey = efa_query_pkey, + .query_port = efa_query_port, + .query_qp = efa_query_qp, + .reg_user_mr = efa_reg_mr, +#ifndef HAVE_NO_KVERBS_DRIVERS + .req_notify_cq = efa_req_notify_cq, +#endif + +#ifdef HAVE_AH_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), +#endif +#ifdef HAVE_CQ_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), +#endif +#ifdef HAVE_PD_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), +#endif +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), +#endif +}; +#endif + +static int efa_ib_device_add(struct efa_dev *dev) +{ + struct efa_com_get_hw_hints_result hw_hints; + struct pci_dev *pdev = dev->pdev; +#ifdef HAVE_CUSTOM_COMMANDS + int devnum; +#endif + int err; + +#ifdef HAVE_CREATE_AH_NO_UDATA + INIT_LIST_HEAD(&dev->efa_ah_list); + mutex_init(&dev->ah_list_lock); +#endif + + efa_stats_init(dev); + + err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr); + if (err) + return err; + + dev_dbg(&dev->pdev->dev, "Doorbells bar (%d)\n", dev->dev_attr.db_bar); + err = efa_request_doorbell_bar(dev); + if (err) + return err; + + err = efa_com_get_hw_hints(&dev->edev, &hw_hints); + if (err) + goto err_release_doorbell_bar; + + efa_update_hw_hints(dev, &hw_hints); + + /* Try to enable all the available aenq groups */ + err = efa_com_set_aenq_config(&dev->edev, EFA_AENQ_ENABLED_GROUPS); + if (err) + goto err_release_doorbell_bar; + + efa_set_host_info(dev); + + dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = 1; +#ifdef HAVE_DEV_PARENT + dev->ibdev.dev.parent = &pdev->dev; +#else + dev->ibdev.dma_device = &pdev->dev; +#endif + +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + dev->ibdev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << 
IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH); +#endif + +#if defined(HAVE_IB_QUERY_DEVICE_UDATA) && !defined(HAVE_UVERBS_CMD_MASK_NOT_NEEDED) + dev->ibdev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); +#endif + +#ifndef HAVE_IB_DEVICE_OPS_COMMON +#ifdef HAVE_DRIVER_ID + dev->ibdev.driver_id = RDMA_DRIVER_EFA; +#endif + dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION; + dev->ibdev.owner = THIS_MODULE; +#endif +#ifdef HAVE_IB_DEV_OPS + ib_set_device_ops(&dev->ibdev, &efa_dev_ops); +#else +#ifdef HAVE_HW_STATS + dev->ibdev.alloc_hw_stats = efa_alloc_hw_stats; +#endif + dev->ibdev.alloc_pd = efa_kzalloc_pd; + dev->ibdev.alloc_ucontext = efa_kzalloc_ucontext; + dev->ibdev.create_ah = efa_kzalloc_ah; + dev->ibdev.create_cq = efa_kzalloc_cq; + dev->ibdev.create_qp = efa_create_qp; + dev->ibdev.dealloc_pd = efa_dealloc_pd; + dev->ibdev.dealloc_ucontext = efa_dealloc_ucontext; + dev->ibdev.dereg_mr = efa_dereg_mr; + dev->ibdev.destroy_ah = efa_destroy_ah; + dev->ibdev.destroy_cq = efa_destroy_cq; + dev->ibdev.destroy_qp = efa_destroy_qp; + dev->ibdev.get_dma_mr = efa_get_dma_mr; +#ifdef HAVE_HW_STATS + dev->ibdev.get_hw_stats = efa_get_hw_stats; +#endif + dev->ibdev.get_link_layer = efa_port_link_layer; +#ifdef HAVE_GET_PORT_IMMUTABLE + dev->ibdev.get_port_immutable = efa_get_port_immutable; +#endif + dev->ibdev.mmap = efa_mmap; + dev->ibdev.modify_qp = efa_modify_qp; + dev->ibdev.poll_cq = efa_poll_cq; + dev->ibdev.post_recv = efa_post_recv; + dev->ibdev.post_send = efa_post_send; + dev->ibdev.query_device = efa_query_device; + dev->ibdev.query_gid = efa_query_gid; + dev->ibdev.query_pkey = efa_query_pkey; + dev->ibdev.query_port = efa_query_port; + dev->ibdev.query_qp = efa_query_qp; + dev->ibdev.reg_user_mr = efa_reg_mr; + dev->ibdev.req_notify_cq = efa_req_notify_cq; +#endif + +#ifdef HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM + err = ib_register_device(&dev->ibdev, "efa_%d", &pdev->dev); +#elif defined(HAVE_IB_REGISTER_DEVICE_TWO_PARAMS) + err = ib_register_device(&dev->ibdev, "efa_%d"); +#elif defined(HAVE_IB_REGISTER_DEVICE_NAME_PARAM) + err = ib_register_device(&dev->ibdev, "efa_%d", NULL); +#else + strlcpy(dev->ibdev.name, "efa_%d", + sizeof(dev->ibdev.name)); + + err = ib_register_device(&dev->ibdev, NULL); +#endif + if (err) + goto err_release_doorbell_bar; + + ibdev_info(&dev->ibdev, "IB device registered\n"); + +#ifdef HAVE_CUSTOM_COMMANDS + if (sscanf(dev_name(&dev->ibdev.dev), "efa_%d\n", &devnum) != 1) { + err = -EINVAL; + goto err_unregister_ibdev; + } + + err = efa_everbs_dev_init(dev, devnum); + if (err) + goto err_unregister_ibdev; + ibdev_info(&dev->ibdev, "Created everbs device %s%d\n", + EFA_EVERBS_DEVICE_NAME, devnum); +#endif + + return 0; + +#ifdef HAVE_CUSTOM_COMMANDS +err_unregister_ibdev: + ib_unregister_device(&dev->ibdev); +#endif +err_release_doorbell_bar: + efa_release_doorbell_bar(dev); + return err; +} + +static void efa_ib_device_remove(struct efa_dev *dev) +{ +#ifdef HAVE_CREATE_AH_NO_UDATA + WARN_ON(!list_empty(&dev->efa_ah_list)); +#endif + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); +#ifdef HAVE_CUSTOM_COMMANDS + efa_everbs_dev_destroy(dev); +#endif + ibdev_info(&dev->ibdev, "Unregister ib device\n"); + ib_unregister_device(&dev->ibdev); + efa_release_doorbell_bar(dev); +} + +static void efa_disable_msix(struct efa_dev *dev) +{ +#ifndef HAVE_PCI_IRQ_VECTOR + pci_disable_msix(dev->pdev); +#else + pci_free_irq_vectors(dev->pdev); +#endif +} + +static int efa_enable_msix(struct efa_dev *dev) +{ + int 
msix_vecs, irq_num; + + /* Reserve the max msix vectors we might need */ + msix_vecs = EFA_NUM_MSIX_VEC; + dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", + msix_vecs); + +#ifndef HAVE_PCI_IRQ_VECTOR + dev->admin_msix_entry.entry = EFA_MGMNT_MSIX_VEC_IDX; + irq_num = pci_enable_msix_range(dev->pdev, + &dev->admin_msix_entry, + msix_vecs, msix_vecs); +#else + dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX; + irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs, + msix_vecs, PCI_IRQ_MSIX); +#endif + + if (irq_num < 0) { + dev_err(&dev->pdev->dev, "Failed to enable MSI-X. irq_num %d\n", + irq_num); + return -ENOSPC; + } + + if (irq_num != msix_vecs) { + dev_err(&dev->pdev->dev, + "Allocated %d MSI-X (out of %d requested)\n", + irq_num, msix_vecs); + return -ENOSPC; + } + + return 0; +} + +static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev) +{ + int dma_width; + int err; + + err = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL); + if (err) + return err; + + err = efa_com_validate_version(edev); + if (err) + return err; + + dma_width = efa_com_get_dma_width(edev); + if (dma_width < 0) { + err = dma_width; + return err; + } + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(dma_width)); + if (err) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", err); + return err; + } + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + return 0; +} + +static struct efa_dev *efa_probe_device(struct pci_dev *pdev) +{ + struct efa_com_dev *edev; + struct efa_dev *dev; + int bars; + int err; + + err = pci_enable_device_mem(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n"); + return ERR_PTR(err); + } + + pci_set_master(pdev); + +#ifdef HAVE_SAFE_IB_ALLOC_DEVICE + dev = ib_alloc_device(efa_dev, ibdev); +#else + dev = (struct efa_dev *)ib_alloc_device(sizeof(*dev)); +#endif + if (!dev) { + dev_err(&pdev->dev, "Device alloc failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + edev = &dev->edev; + edev->efa_dev = dev; + edev->dmadev = &pdev->dev; + dev->pdev = pdev; + + bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (err) { + dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", + err); + goto err_ibdev_destroy; + } + + dev->reg_bar_addr = pci_resource_start(pdev, EFA_REG_BAR); + dev->reg_bar_len = pci_resource_len(pdev, EFA_REG_BAR); + dev->mem_bar_addr = pci_resource_start(pdev, EFA_MEM_BAR); + dev->mem_bar_len = pci_resource_len(pdev, EFA_MEM_BAR); + + edev->reg_bar = devm_ioremap(&pdev->dev, + dev->reg_bar_addr, + dev->reg_bar_len); + if (!edev->reg_bar) { + dev_err(&pdev->dev, "Failed to remap register bar\n"); + err = -EFAULT; + goto err_release_bars; + } + + err = efa_com_mmio_reg_read_init(edev); + if (err) { + dev_err(&pdev->dev, "Failed to init readless MMIO\n"); + goto err_iounmap; + } + + err = efa_device_init(edev, pdev); + if (err) { + dev_err(&pdev->dev, "EFA device init failed\n"); + if (err == -ETIME) + err = -EPROBE_DEFER; + goto err_reg_read_destroy; + } + + err = efa_enable_msix(dev); + if (err) + goto err_reg_read_destroy; + +#ifdef HAVE_PCI_IRQ_VECTOR + edev->aq.msix_vector_idx = dev->admin_msix_vector_idx; + edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx; +#else + edev->aq.msix_vector_idx = dev->admin_msix_entry.entry; + edev->aenq.msix_vector_idx = dev->admin_msix_entry.entry; +#endif + + err = efa_set_mgmnt_irq(dev); + if (err) + goto err_disable_msix; + 
+ err = efa_com_admin_init(edev, &aenq_handlers); + if (err) + goto err_free_mgmnt_irq; + + err = efa_sysfs_init(dev); + if (err) + goto err_admin_destroy; + + return dev; + +err_admin_destroy: + efa_com_admin_destroy(edev); +err_free_mgmnt_irq: + efa_free_mgmnt_irq(dev); +err_disable_msix: + efa_disable_msix(dev); +err_reg_read_destroy: + efa_com_mmio_reg_read_destroy(edev); +err_iounmap: + devm_iounmap(&pdev->dev, edev->reg_bar); +err_release_bars: + efa_release_bars(dev, EFA_BASE_BAR_MASK); +err_ibdev_destroy: + ib_dealloc_device(&dev->ibdev); +err_disable_device: + pci_disable_device(pdev); + return ERR_PTR(err); +} + +static void efa_remove_device(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + struct efa_com_dev *edev; + + edev = &dev->edev; + efa_sysfs_destroy(dev); + efa_com_admin_destroy(edev); + efa_free_mgmnt_irq(dev); + efa_disable_msix(dev); + efa_com_mmio_reg_read_destroy(edev); + devm_iounmap(&pdev->dev, edev->reg_bar); + efa_release_bars(dev, EFA_BASE_BAR_MASK); + ib_dealloc_device(&dev->ibdev); + pci_disable_device(pdev); +} + +static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct efa_dev *dev; + int err; + + dev = efa_probe_device(pdev); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = efa_ib_device_add(dev); + if (err) + goto err_remove_device; + + return 0; + +err_remove_device: + efa_remove_device(pdev); + return err; +} + +static void efa_remove(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_ib_device_remove(dev); + efa_remove_device(pdev); +} + +static struct pci_driver efa_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = efa_pci_tbl, + .probe = efa_probe, + .remove = efa_remove, +}; + +#ifdef HAVE_CUSTOM_COMMANDS +static ssize_t +(*efa_everbs_cmd_table[EFA_EVERBS_CMD_MAX])(struct efa_dev *dev, + const char __user *buf, int in_len, + int out_len) = { +#ifdef HAVE_CREATE_AH_NO_UDATA + [EFA_EVERBS_CMD_GET_AH] = efa_everbs_cmd_get_ah, +#endif +#ifndef HAVE_IB_QUERY_DEVICE_UDATA + [EFA_EVERBS_CMD_GET_EX_DEV_ATTRS] = efa_everbs_cmd_get_ex_dev_attrs, +#endif +}; + +static ssize_t efa_everbs_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *pos) +{ + struct efa_dev *dev = filp->private_data; + struct ib_uverbs_cmd_hdr hdr; + + if (count < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + if (hdr.in_words * 4 != count) + return -EINVAL; + + if (hdr.command >= ARRAY_SIZE(efa_everbs_cmd_table) || + !efa_everbs_cmd_table[hdr.command]) + return -EINVAL; + + return efa_everbs_cmd_table[hdr.command](dev, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); +} + +static int efa_everbs_open(struct inode *inode, struct file *filp) +{ + struct efa_dev *dev; + + dev = container_of(inode->i_cdev, struct efa_dev, cdev); + + filp->private_data = dev; + return nonseekable_open(inode, filp); +} + +static int efa_everbs_close(struct inode *inode, struct file *filp) +{ + return 0; +} + +static char *efa_everbs_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static const struct file_operations efa_everbs_fops = { + .owner = THIS_MODULE, + .write = efa_everbs_write, + .open = efa_everbs_open, + .release = efa_everbs_close, + .llseek = no_llseek, +}; + +static int efa_everbs_dev_init(struct efa_dev *dev, int devnum) +{ + dev_t devno = MKDEV(efa_everbs_major, devnum); + int err; + + WARN_ON(devnum >= 
EFA_EVERBS_MAX_DEVICES); + cdev_init(&dev->cdev, &efa_everbs_fops); + dev->cdev.owner = THIS_MODULE; + + err = cdev_add(&dev->cdev, devno, 1); + if (err) + return err; + + dev->everbs_dev = device_create(efa_everbs_class, + &dev->pdev->dev, + devno, + dev, + EFA_EVERBS_DEVICE_NAME "%d", + devnum); + if (IS_ERR(dev->everbs_dev)) { + err = PTR_ERR(dev->everbs_dev); + ibdev_err(&dev->ibdev, "Failed to create device: %s%d [%d]\n", + EFA_EVERBS_DEVICE_NAME, devnum, err); + goto err; + } + + return 0; + +err: + cdev_del(&dev->cdev); + return err; +} + +static void efa_everbs_dev_destroy(struct efa_dev *dev) +{ + if (!dev->everbs_dev) + return; + + device_destroy(efa_everbs_class, dev->cdev.dev); + cdev_del(&dev->cdev); + dev->everbs_dev = NULL; +} +#endif /* HAVE_CUSTOM_COMMANDS */ + +static int __init efa_init(void) +{ +#ifdef HAVE_CUSTOM_COMMANDS + dev_t dev; +#endif + int err; + + pr_info("%s\n", version); +#ifdef HAVE_CUSTOM_COMMANDS + err = alloc_chrdev_region(&dev, 0, EFA_EVERBS_MAX_DEVICES, + EFA_EVERBS_DEVICE_NAME); + if (err) { + pr_err("Couldn't allocate efa_everbs device numbers\n"); + goto out; + } + efa_everbs_major = MAJOR(dev); + + efa_everbs_class = class_create(THIS_MODULE, EFA_EVERBS_DEVICE_NAME); + if (IS_ERR(efa_everbs_class)) { + err = PTR_ERR(efa_everbs_class); + pr_err("Couldn't create efa_everbs class\n"); + goto err_class; + } + efa_everbs_class->devnode = efa_everbs_devnode; +#endif + + err = pci_register_driver(&efa_pci_driver); + if (err) { + pr_err("Couldn't register efa driver\n"); + goto err_register; + } + +#ifdef HAVE_EFA_GDR + nvmem_init(); +#endif + + return 0; + +err_register: +#ifdef HAVE_CUSTOM_COMMANDS + class_destroy(efa_everbs_class); +err_class: + unregister_chrdev_region(dev, EFA_EVERBS_MAX_DEVICES); +out: +#endif + return err; +} + +static void __exit efa_exit(void) +{ + pci_unregister_driver(&efa_pci_driver); +#ifdef HAVE_CUSTOM_COMMANDS + class_destroy(efa_everbs_class); + unregister_chrdev_region(MKDEV(efa_everbs_major, 0), + EFA_EVERBS_MAX_DEVICES); +#endif +} + +module_init(efa_init); +module_exit(efa_exit); diff --git a/drivers/amazon/net/efa/efa_regs_defs.h b/drivers/amazon/net/efa/efa_regs_defs.h new file mode 100644 index 0000000000000..4017982fe13b0 --- /dev/null +++ b/drivers/amazon/net/efa/efa_regs_defs.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_REGS_H_ +#define _EFA_REGS_H_ + +enum efa_regs_reset_reason_types { + EFA_REGS_RESET_NORMAL = 0, + /* Keep alive timeout */ + EFA_REGS_RESET_KEEP_ALIVE_TO = 1, + EFA_REGS_RESET_ADMIN_TO = 2, + EFA_REGS_RESET_INIT_ERR = 3, + EFA_REGS_RESET_DRIVER_INVALID_STATE = 4, + EFA_REGS_RESET_OS_TRIGGER = 5, + EFA_REGS_RESET_SHUTDOWN = 6, + EFA_REGS_RESET_USER_TRIGGER = 7, + EFA_REGS_RESET_GENERIC = 8, +}; + +/* efa_registers offsets */ + +/* 0 base */ +#define EFA_REGS_VERSION_OFF 0x0 +#define EFA_REGS_CONTROLLER_VERSION_OFF 0x4 +#define EFA_REGS_CAPS_OFF 0x8 +#define EFA_REGS_AQ_BASE_LO_OFF 0x10 +#define EFA_REGS_AQ_BASE_HI_OFF 0x14 +#define EFA_REGS_AQ_CAPS_OFF 0x18 +#define EFA_REGS_ACQ_BASE_LO_OFF 0x20 +#define EFA_REGS_ACQ_BASE_HI_OFF 0x24 +#define EFA_REGS_ACQ_CAPS_OFF 0x28 +#define EFA_REGS_AQ_PROD_DB_OFF 0x2c +#define EFA_REGS_AENQ_CAPS_OFF 0x34 +#define EFA_REGS_AENQ_BASE_LO_OFF 0x38 +#define EFA_REGS_AENQ_BASE_HI_OFF 0x3c +#define EFA_REGS_AENQ_CONS_DB_OFF 0x40 +#define EFA_REGS_INTR_MASK_OFF 0x4c +#define EFA_REGS_DEV_CTL_OFF 0x54 +#define EFA_REGS_DEV_STS_OFF 0x58 +#define EFA_REGS_MMIO_REG_READ_OFF 0x5c +#define EFA_REGS_MMIO_RESP_LO_OFF 0x60 +#define EFA_REGS_MMIO_RESP_HI_OFF 0x64 + +/* version register */ +#define EFA_REGS_VERSION_MINOR_VERSION_MASK 0xff +#define EFA_REGS_VERSION_MAJOR_VERSION_MASK 0xff00 + +/* controller_version register */ +#define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK 0xff +#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK 0xff00 +#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK 0xff0000 +#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK 0xff000000 + +/* caps register */ +#define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define EFA_REGS_CAPS_RESET_TIMEOUT_MASK 0x3e +#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK 0xf0000 + +/* aq_caps register */ +#define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK 0xffff +#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK 0xffff0000 + +/* acq_caps register */ +#define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK 0xffff +#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK 0xff000000 + +/* aenq_caps register */ +#define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK 0xffff +#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK 0xff0000 +#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK 0xff000000 + +/* intr_mask register */ +#define EFA_REGS_INTR_MASK_EN_MASK 0x1 + +/* dev_ctl register */ +#define EFA_REGS_DEV_CTL_DEV_RESET_MASK 0x1 +#define EFA_REGS_DEV_CTL_AQ_RESTART_MASK 0x2 +#define EFA_REGS_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* dev_sts register */ +#define EFA_REGS_DEV_STS_READY_MASK 0x1 +#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define EFA_REGS_DEV_STS_RESET_FINISHED_MASK 0x10 +#define EFA_REGS_DEV_STS_FATAL_ERROR_MASK 0x20 + +/* mmio_reg_read register */ +#define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff +#define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 + +#endif /* _EFA_REGS_H_ */ diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c new file mode 100644 index 0000000000000..67e3fe9e80ac2 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "efa_sysfs.h" +#include "kcompat.h" + +#include <linux/device.h> +#include <linux/sysfs.h> + +#ifdef HAVE_EFA_GDR +static ssize_t gdr_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "1\n"); +} + +static DEVICE_ATTR_RO(gdr); +#endif + +int efa_sysfs_init(struct efa_dev *dev) +{ +#ifdef HAVE_EFA_GDR + struct device *device = &dev->pdev->dev; + + if (device_create_file(device, &dev_attr_gdr)) + dev_err(device, "Failed to create GDR sysfs file\n"); +#endif + return 0; +} + +void efa_sysfs_destroy(struct efa_dev *dev) +{ +#ifdef HAVE_EFA_GDR + device_remove_file(&dev->pdev->dev, &dev_attr_gdr); +#endif +} diff --git a/drivers/amazon/net/efa/efa_sysfs.h b/drivers/amazon/net/efa/efa_sysfs.h new file mode 100644 index 0000000000000..c390aa547e5a6 --- /dev/null +++ b/drivers/amazon/net/efa/efa_sysfs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_SYSFS_H_ +#define _EFA_SYSFS_H_ + +#include "efa.h" + +int efa_sysfs_init(struct efa_dev *dev); + +void efa_sysfs_destroy(struct efa_dev *dev); + +#endif /* _EFA_SYSFS_H_ */ diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c new file mode 100644 index 0000000000000..f50a6736c96c8 --- /dev/null +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -0,0 +1,3140 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "kcompat.h" +#include <linux/vmalloc.h> +#include <linux/log2.h> + +#include <rdma/ib_addr.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#ifdef HAVE_UDATA_TO_DRV_CONTEXT +#include <rdma/uverbs_ioctl.h> +#endif + +#include "efa.h" + +#ifdef HAVE_EFA_GDR +#include "efa_gdr.h" +#endif + +enum { + EFA_MMAP_DMA_PAGE = 0, + EFA_MMAP_IO_WC, + EFA_MMAP_IO_NC, +}; + +#define EFA_AENQ_ENABLED_GROUPS \ + (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ + BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) + +struct efa_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; +#ifndef HAVE_CORE_MMAP_XA + struct list_head list; +#endif + u64 address; + u8 mmap_flag; +}; + +#ifdef HAVE_HW_STATS +#define EFA_DEFINE_STATS(op) \ + op(EFA_TX_BYTES, "tx_bytes") \ + op(EFA_TX_PKTS, "tx_pkts") \ + op(EFA_RX_BYTES, "rx_bytes") \ + op(EFA_RX_PKTS, "rx_pkts") \ + op(EFA_RX_DROPS, "rx_drops") \ + op(EFA_SEND_BYTES, "send_bytes") \ + op(EFA_SEND_WRS, "send_wrs") \ + op(EFA_RECV_BYTES, "recv_bytes") \ + op(EFA_RECV_WRS, "recv_wrs") \ + op(EFA_RDMA_READ_WRS, "rdma_read_wrs") \ + op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \ + op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \ + op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \ + op(EFA_SUBMITTED_CMDS, "submitted_cmds") \ + op(EFA_COMPLETED_CMDS, "completed_cmds") \ + op(EFA_CMDS_ERR, "cmds_err") \ + op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \ + op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ + op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ + op(EFA_CREATE_QP_ERR, "create_qp_err") \ + op(EFA_CREATE_CQ_ERR, "create_cq_err") \ + op(EFA_REG_MR_ERR, "reg_mr_err") \ + op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ + op(EFA_CREATE_AH_ERR, "create_ah_err") \ + op(EFA_MMAP_ERR, "mmap_err") + +#define EFA_STATS_ENUM(ename, name) ename, +#define EFA_STATS_STR(ename, name) [ename] = name, + +enum efa_hw_stats { + EFA_DEFINE_STATS(EFA_STATS_ENUM) +}; + +static const char *const efa_stats_names[] = { + EFA_DEFINE_STATS(EFA_STATS_STR) +}; +#endif + +#define EFA_CHUNK_PAYLOAD_SHIFT 12 +#define EFA_CHUNK_PAYLOAD_SIZE 
BIT(EFA_CHUNK_PAYLOAD_SHIFT) +#define EFA_CHUNK_PAYLOAD_PTR_SIZE 8 + +#define EFA_CHUNK_SHIFT 12 +#define EFA_CHUNK_SIZE BIT(EFA_CHUNK_SHIFT) +#define EFA_CHUNK_PTR_SIZE sizeof(struct efa_com_ctrl_buff_info) + +#define EFA_PTRS_PER_CHUNK \ + ((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE) + +#define EFA_CHUNK_USED_SIZE \ + ((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE) + +struct pbl_chunk { + dma_addr_t dma_addr; + u64 *buf; + u32 length; +}; + +struct pbl_chunk_list { + struct pbl_chunk *chunks; + unsigned int size; +}; + +struct pbl_context { + union { + struct { + dma_addr_t dma_addr; + } continuous; + struct { + u32 pbl_buf_size_in_pages; + struct scatterlist *sgl; + int sg_dma_cnt; + struct pbl_chunk_list chunk_list; + } indirect; + } phys; + u64 *pbl_buf; + u32 pbl_buf_size_in_bytes; + u8 physically_continuous; +}; + +static inline struct efa_dev *to_edev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct efa_dev, ibdev); +} + +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct efa_ucontext, ibucontext); +} + +static inline struct efa_pd *to_epd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct efa_pd, ibpd); +} + +static inline struct efa_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct efa_mr, ibmr); +} + +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct efa_qp, ibqp); +} + +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct efa_cq, ibcq); +} + +static inline struct efa_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct efa_ah, ibah); +} + +static inline struct efa_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *rdma_entry) +{ + return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry); +} + +#define EFA_DEV_CAP(dev, cap) \ + ((dev)->dev_attr.device_caps & \ + EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK) + +#define is_reserved_cleared(reserved) \ + !memchr_inv(reserved, 0, sizeof(reserved)) + +static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr, + size_t size, enum dma_data_direction dir) +{ + void *addr; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (!addr) + return NULL; + + *dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir); + if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) { + ibdev_err(&dev->ibdev, "Failed to map DMA address\n"); + free_pages_exact(addr, size); + return NULL; + } + + return addr; +} + +static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr, + dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) +{ + dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir); + free_pages_exact(cpu_addr, size); +} + +#ifndef HAVE_CORE_MMAP_XA +/* + * This is only called when the ucontext is destroyed and there can be no + * concurrent query via mmap or allocate on the database, thus we can be sure no + * other thread is using the entry pointer. We also know that all the BAR + * pages have either been zap'd or munmaped at this point. Normal pages are + * refcounted and will be freed at the proper time. 
+ */ +static void mmap_entries_remove_free(struct efa_dev *dev, + struct efa_ucontext *ucontext) +{ + struct efa_user_mmap_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + list_del(&entry->list); + ibdev_dbg( + &dev->ibdev, + "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", + rdma_user_mmap_get_offset(&entry->rdma_entry), + entry->address, entry->rdma_entry.npages * PAGE_SIZE); + kfree(entry); + } +} + +static int mmap_entry_validate(struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + size_t length = vma->vm_end - vma->vm_start; + + if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) { + ibdev_dbg(ucontext->ibucontext.device, + "length[%#zx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n", + length, PAGE_SIZE, vma->vm_flags); + return -EINVAL; + } + + if (vma->vm_flags & VM_EXEC) { + ibdev_dbg(ucontext->ibucontext.device, + "Mapping executable pages is not permitted\n"); + return -EPERM; + } + + return 0; +} + +struct rdma_user_mmap_entry * +rdma_user_mmap_entry_get(struct ib_ucontext *ibucontext, + struct vm_area_struct *vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + size_t length = vma->vm_end - vma->vm_start; + struct efa_user_mmap_entry *entry, *tmp; + u64 key = vma->vm_pgoff << PAGE_SHIFT; + int err; + + err = mmap_entry_validate(ucontext, vma); + if (err) + return NULL; + + mutex_lock(&ucontext->lock); + list_for_each_entry_safe(entry, tmp, &ucontext->pending_mmaps, list) { + if (rdma_user_mmap_get_offset(&entry->rdma_entry) == key && + entry->rdma_entry.npages * PAGE_SIZE == length) { + ibdev_dbg(ibucontext->device, + "mmap: key[%#llx] addr[%#llx] len[%#zx] removed\n", + key, entry->address, + entry->rdma_entry.npages * PAGE_SIZE); + mutex_unlock(&ucontext->lock); + return &entry->rdma_entry; + } + } + mutex_unlock(&ucontext->lock); + + return NULL; +} +#endif /* !defined (HAVE_CORE_MMAP_XA) */ + +#ifdef HAVE_IB_QUERY_DEVICE_UDATA +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata) +#else +int efa_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +#endif +{ + struct efa_com_get_device_attr_result *dev_attr; +#ifdef HAVE_IB_QUERY_DEVICE_UDATA + struct efa_ibv_ex_query_device_resp resp = {}; +#endif + struct efa_dev *dev = to_edev(ibdev); +#ifdef HAVE_IB_QUERY_DEVICE_UDATA + int err; + + if (udata && udata->inlen && + !ib_is_udata_cleared(udata, 0, udata->inlen)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } +#endif + + dev_attr = &dev->dev_attr; + + memset(props, 0, sizeof(*props)); + props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE; + props->page_size_cap = dev_attr->page_size_cap; + props->vendor_id = dev->pdev->vendor; + props->vendor_part_id = dev->pdev->device; + props->hw_ver = dev->pdev->subsystem_device; + props->max_qp = dev_attr->max_qp; + props->max_cq = dev_attr->max_cq; + props->max_pd = dev_attr->max_pd; + props->max_mr = dev_attr->max_mr; + props->max_ah = dev_attr->max_ah; + props->max_cqe = dev_attr->max_cq_depth; + props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth, + dev_attr->max_rq_depth); +#ifdef HAVE_MAX_SEND_RCV_SGE + props->max_send_sge = dev_attr->max_sq_sge; + props->max_recv_sge = dev_attr->max_rq_sge; +#else + props->max_sge = min_t(u16, dev_attr->max_sq_sge, + dev_attr->max_rq_sge); +#endif + props->max_sge_rd = dev_attr->max_wr_rdma_sge; + props->max_pkeys = 1; + +#ifdef HAVE_IB_QUERY_DEVICE_UDATA + if (udata && 
udata->outlen) { + resp.max_sq_sge = dev_attr->max_sq_sge; + resp.max_rq_sge = dev_attr->max_rq_sge; + resp.max_sq_wr = dev_attr->max_sq_depth; + resp.max_rq_wr = dev_attr->max_rq_depth; + resp.max_rdma_size = dev_attr->max_rdma_size; + + if (EFA_DEV_CAP(dev, RDMA_READ)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ; + + if (EFA_DEV_CAP(dev, RNR_RETRY)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for query_device\n"); + return err; + } + } +#endif + + return 0; +} + +int efa_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct efa_dev *dev = to_edev(ibdev); + + props->lmc = 1; + + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_speed = IB_SPEED_EDR; + props->active_width = IB_WIDTH_4X; +#ifdef HAVE_IB_MTU_INT_TO_ENUM + props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); + props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); +#else + props->max_mtu = IB_MTU_4096; + props->active_mtu = IB_MTU_4096; +#endif + props->max_msg_sz = dev->dev_attr.mtu; + props->max_vl_num = 1; + + return 0; +} + +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_query_qp_params params = {}; + struct efa_com_query_qp_result result; + struct efa_qp *qp = to_eqp(ibqp); + int err; + +#define EFA_QUERY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \ + IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, EFA_QUERY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + params.qp_handle = qp->qp_handle; + err = efa_com_query_qp(&dev->edev, ¶ms, &result); + if (err) + return err; + + qp_attr->qp_state = result.qp_state; + qp_attr->qkey = result.qkey; + qp_attr->sq_psn = result.sq_psn; + qp_attr->sq_draining = result.sq_draining; + qp_attr->port_num = 1; + qp_attr->rnr_retry = result.rnr_retry; + + qp_attr->cap.max_send_wr = qp->max_send_wr; + qp_attr->cap.max_recv_wr = qp->max_recv_wr; + qp_attr->cap.max_send_sge = qp->max_send_sge; + qp_attr->cap.max_recv_sge = qp->max_recv_sge; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->qp_context = ibqp->qp_context; + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int efa_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct efa_dev *dev = to_edev(ibdev); + + memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr)); + + return 0; +} + +int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 0) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn) +{ + struct efa_com_dealloc_pd_params params = { + .pdn = pdn, + }; + + return efa_com_dealloc_pd(&dev->edev, ¶ms); +} + +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT +int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#else +int efa_alloc_pd(struct ib_pd *ibpd, + struct 
ib_ucontext *ibucontext, + struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_alloc_pd_resp resp = {}; + struct efa_com_alloc_pd_result result; + struct efa_pd *pd = to_epd(ibpd); + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (udata->inlen && +#ifdef HAVE_UVERBS_CMD_HDR_FIX + !ib_is_udata_cleared(udata, 0, udata->inlen)) { +#else + /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ + !ib_is_udata_cleared(udata, 0, udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) { +#endif + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + err = efa_com_alloc_pd(&dev->edev, &result); + if (err) + goto err_out; + + pd->pdn = result.pdn; + resp.pdn = result.pdn; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for alloc_pd\n"); + goto err_dealloc_pd; + } + } + + ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn); + + return 0; + +err_dealloc_pd: + efa_pd_dealloc(dev, result.pdn); +err_out: + atomic64_inc(&dev->stats.alloc_pd_err); + return err; +} + +#ifndef HAVE_PD_CORE_ALLOCATION +struct ib_pd *efa_kzalloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_pd *pd; + int err; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + atomic64_inc(&dev->stats.alloc_pd_err); + return ERR_PTR(-ENOMEM); + } + + pd->ibpd.device = ibdev; + +#ifdef HAVE_ALLOC_PD_NO_UCONTEXT + err = efa_alloc_pd(&pd->ibpd, udata); +#else + err = efa_alloc_pd(&pd->ibpd, ibucontext, udata); +#endif + if (err) + goto err_free; + + return &pd->ibpd; + +err_free: + kfree(pd); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_DEALLOC_PD_UDATA_RC +int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#elif defined(HAVE_DEALLOC_PD_UDATA) +void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +#elif defined(HAVE_PD_CORE_ALLOCATION) +void efa_dealloc_pd(struct ib_pd *ibpd) +#else +int efa_dealloc_pd(struct ib_pd *ibpd) +#endif +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_pd *pd = to_epd(ibpd); + + ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn); + efa_pd_dealloc(dev, pd->pdn); +#ifndef HAVE_PD_CORE_ALLOCATION + kfree(pd); + + return 0; +#elif defined(HAVE_DEALLOC_PD_UDATA_RC) + return 0; +#endif +} + +static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle) +{ + struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle }; + + return efa_com_destroy_qp(&dev->edev, ¶ms); +} + +static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp) +{ + rdma_user_mmap_entry_remove(qp->rq_mmap_entry); + rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry); + rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry); +} + +#ifdef HAVE_DESTROY_QP_UDATA +int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +#else +int efa_destroy_qp(struct ib_qp *ibqp) +#endif +{ + struct efa_dev *dev = to_edev(ibqp->pd->device); + struct efa_qp *qp = to_eqp(ibqp); + int err; + + ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num); + + efa_qp_user_mmap_entries_remove(qp); + + err = efa_destroy_qp_handle(dev, qp->qp_handle); + if (err) + return err; + + if (qp->rq_cpu_addr) { + ibdev_dbg(&dev->ibdev, 
+ "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, + &qp->rq_dma_addr); + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + } + + kfree(qp); + return 0; +} + +#ifdef HAVE_CORE_MMAP_XA +static struct rdma_user_mmap_entry* +efa_user_mmap_entry_insert(struct ib_ucontext *ucontext, + u64 address, size_t length, + u8 mmap_flag, u64 *offset) +{ + struct efa_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL); + int err; + + if (!entry) + return NULL; + + entry->address = address; + entry->mmap_flag = mmap_flag; + + err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry, + length); + if (err) { + kfree(entry); + return NULL; + } + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} +#else +static struct rdma_user_mmap_entry * +efa_user_mmap_entry_insert(struct ib_ucontext *ibucontext, u64 address, + size_t length, u8 mmap_flag, u64 *offset) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_user_mmap_entry *entry; + u64 next_mmap_page; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->address = address; + entry->rdma_entry.npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE); + entry->mmap_flag = mmap_flag; + + mutex_lock(&ucontext->lock); + next_mmap_page = ucontext->mmap_page + (length >> PAGE_SHIFT); + if (next_mmap_page >= U32_MAX) { + ibdev_dbg(ucontext->ibucontext.device, "Too many mmap pages\n"); + mutex_unlock(&ucontext->lock); + kfree(entry); + return NULL; + } + + entry->rdma_entry.start_pgoff = ucontext->mmap_page; + ucontext->mmap_page = next_mmap_page; + list_add_tail(&entry->list, &ucontext->pending_mmaps); + mutex_unlock(&ucontext->lock); + + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + ibdev_dbg( + ucontext->ibucontext.device, + "mmap: addr[%#llx], len[%#zx], key[%#llx] inserted\n", + entry->address, entry->rdma_entry.npages * PAGE_SIZE, + rdma_user_mmap_get_offset(&entry->rdma_entry)); + + return &entry->rdma_entry; +} +#endif + +static int qp_mmap_entries_setup(struct efa_qp *qp, + struct efa_dev *dev, + struct efa_ucontext *ucontext, + struct efa_com_create_qp_params *params, + struct efa_ibv_create_qp_resp *resp) +{ + size_t length; + u64 address; + + address = dev->db_bar_addr + resp->sq_db_offset; + qp->sq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->sq_db_mmap_key); + if (!qp->sq_db_mmap_entry) + return -ENOMEM; + + resp->sq_db_offset &= ~PAGE_MASK; + + address = dev->mem_bar_addr + resp->llq_desc_offset; + length = PAGE_ALIGN(params->sq_ring_size_in_bytes + + (resp->llq_desc_offset & ~PAGE_MASK)); + + qp->llq_desc_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, length, + EFA_MMAP_IO_WC, + &resp->llq_desc_mmap_key); + if (!qp->llq_desc_mmap_entry) + goto err_remove_mmap; + + resp->llq_desc_offset &= ~PAGE_MASK; + + if (qp->rq_size) { + address = dev->db_bar_addr + resp->rq_db_offset; + + qp->rq_db_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, PAGE_SIZE, + EFA_MMAP_IO_NC, + &resp->rq_db_mmap_key); + if (!qp->rq_db_mmap_entry) + goto err_remove_mmap; + + resp->rq_db_offset &= ~PAGE_MASK; + + address = virt_to_phys(qp->rq_cpu_addr); + qp->rq_mmap_entry = + efa_user_mmap_entry_insert(&ucontext->ibucontext, + address, qp->rq_size, + EFA_MMAP_DMA_PAGE, + &resp->rq_mmap_key); + if (!qp->rq_mmap_entry) + goto err_remove_mmap; + + resp->rq_mmap_size = 
qp->rq_size; + } + + return 0; + +err_remove_mmap: + efa_qp_user_mmap_entries_remove(qp); + + return -ENOMEM; +} + +static int efa_qp_validate_cap(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested send wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_wr, + dev->dev_attr.max_sq_depth); + return -EINVAL; + } + if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) { + ibdev_dbg(&dev->ibdev, + "qp: requested receive wr[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_wr, + dev->dev_attr.max_rq_depth); + return -EINVAL; + } + if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge send[%u] exceeds the max[%u]\n", + init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge); + return -EINVAL; + } + if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) { + ibdev_dbg(&dev->ibdev, + "qp: requested sge recv[%u] exceeds the max[%u]\n", + init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge); + return -EINVAL; + } + if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) { + ibdev_dbg(&dev->ibdev, + "qp: requested inline data[%u] exceeds the max[%u]\n", + init_attr->cap.max_inline_data, + dev->dev_attr.inline_buf_size); + return -EINVAL; + } + + return 0; +} + +static int efa_qp_validate_attr(struct efa_dev *dev, + struct ib_qp_init_attr *init_attr) +{ + if (init_attr->qp_type != IB_QPT_DRIVER && + init_attr->qp_type != IB_QPT_UD) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d\n", init_attr->qp_type); + return -EOPNOTSUPP; + } + + if (init_attr->srq) { + ibdev_dbg(&dev->ibdev, "SRQ is not supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported create flags\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct ib_qp *efa_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_com_create_qp_params create_qp_params = {}; + struct efa_com_create_qp_result create_qp_resp; + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_ibv_create_qp_resp resp = {}; + struct efa_ibv_create_qp cmd = {}; + struct efa_ucontext *ucontext; + struct efa_qp *qp; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, + ibucontext); +#else + ucontext = ibpd->uobject ? 
to_eucontext(ibpd->uobject->context) : + NULL; +#endif + + err = efa_qp_validate_cap(dev, init_attr); + if (err) + goto err_out; + + err = efa_qp_validate_attr(dev, init_attr); + if (err) + goto err_out; + + if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && +#ifdef HAVE_UVERBS_CMD_HDR_FIX + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { +#else + /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr))) { +#endif + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for create_qp\n"); + goto err_out; + } + + if (cmd.comp_mask) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + err = -ENOMEM; + goto err_out; + } + + create_qp_params.uarn = ucontext->uarn; + create_qp_params.pd = to_epd(ibpd)->pdn; + + if (init_attr->qp_type == IB_QPT_UD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; + } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) { + create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD; + } else { + ibdev_dbg(&dev->ibdev, + "Unsupported qp type %d driver qp type %d\n", + init_attr->qp_type, cmd.driver_qp_type); + err = -EOPNOTSUPP; + goto err_free_qp; + } + + ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n", + init_attr->qp_type, cmd.driver_qp_type); + create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx; + create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx; + create_qp_params.sq_depth = init_attr->cap.max_send_wr; + create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size; + + create_qp_params.rq_depth = init_attr->cap.max_recv_wr; + create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size; + qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes); + if (qp->rq_size) { + qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); + if (!qp->rq_cpu_addr) { + err = -ENOMEM; + goto err_free_qp; + } + + ibdev_dbg(&dev->ibdev, + "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n", + qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr); + create_qp_params.rq_base_addr = qp->rq_dma_addr; + } + + err = efa_com_create_qp(&dev->edev, &create_qp_params, + &create_qp_resp); + if (err) + goto err_free_mapped; + + resp.sq_db_offset = create_qp_resp.sq_db_offset; + resp.rq_db_offset = create_qp_resp.rq_db_offset; + resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset; + resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx; + resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx; + + err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params, + &resp); + if (err) + goto err_destroy_qp; + + qp->qp_handle = create_qp_resp.qp_handle; + qp->ibqp.qp_num = create_qp_resp.qp_num; + qp->ibqp.qp_type = init_attr->qp_type; + qp->max_send_wr = init_attr->cap.max_send_wr; + qp->max_recv_wr = init_attr->cap.max_recv_wr; + qp->max_send_sge = init_attr->cap.max_send_sge; + qp->max_recv_sge = init_attr->cap.max_recv_sge; + qp->max_inline_data = init_attr->cap.max_inline_data; + 
+ if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for qp[%u]\n", + create_qp_resp.qp_num); + goto err_remove_mmap_entries; + } + } + + ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num); + + return &qp->ibqp; + +err_remove_mmap_entries: + efa_qp_user_mmap_entries_remove(qp); +err_destroy_qp: + efa_destroy_qp_handle(dev, create_qp_resp.qp_handle); +err_free_mapped: + if (qp->rq_size) + efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, + qp->rq_size, DMA_TO_DEVICE); +err_free_qp: + kfree(qp); +err_out: + atomic64_inc(&dev->stats.create_qp_err); + return ERR_PTR(err); +} + +static const struct { + int valid; + enum ib_qp_attr_mask req_param; + enum ib_qp_attr_mask opt_param; +} srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY, + }, + [IB_QPS_RTR] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = IB_QP_SQ_PSN, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY | + IB_QP_RNR_RETRY, + + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY, + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = IB_QP_PKEY_INDEX | + IB_QP_QKEY, + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = IB_QP_CUR_STATE | + IB_QP_QKEY, + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + } +}; + +static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state, + enum ib_qp_state next_state, + enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return false; + + if (!srd_qp_state_table[cur_state][next_state].valid) + return false; + + req_param = srd_qp_state_table[cur_state][next_state].req_param; + opt_param = srd_qp_state_table[cur_state][next_state].opt_param; + + if ((mask & req_param) != req_param) + return false; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return false; + + return true; +} + +static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + int err; + +#define EFA_MODIFY_QP_SUPP_MASK \ + (IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \ + IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \ + IB_QP_RNR_RETRY) + + if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) { + ibdev_dbg(&dev->ibdev, + "Unsupported qp_attr_mask[%#x] supported[%#x]\n", + qp_attr_mask, 
EFA_MODIFY_QP_SUPP_MASK); + return -EOPNOTSUPP; + } + + if (qp->ibqp.qp_type == IB_QPT_DRIVER) + err = !efa_modify_srd_qp_is_ok(cur_state, new_state, + qp_attr_mask); + else +#ifdef HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask); +#else + err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD, + qp_attr_mask, + IB_LINK_LAYER_UNSPECIFIED); +#endif + + if (err) { + ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n"); + return -EINVAL; + } + + if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) { + ibdev_dbg(&dev->ibdev, "Can't change port num\n"); + return -EOPNOTSUPP; + } + + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) { + ibdev_dbg(&dev->ibdev, "Can't change pkey index\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibqp->device); + struct efa_com_modify_qp_params params = {}; + struct efa_qp *qp = to_eqp(ibqp); + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + return -EOPNOTSUPP; + } +#endif + +#ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED + if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; +#endif + + if (udata->inlen && +#ifdef HAVE_UVERBS_CMD_HDR_FIX + !ib_is_udata_cleared(udata, 0, udata->inlen)) { +#else + /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ + !ib_is_udata_cleared(udata, 0, udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) { +#endif + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + return -EINVAL; + } + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state : + qp->state; + new_state = qp_attr_mask & IB_QP_STATE ? 
qp_attr->qp_state : cur_state; + + err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state, + new_state); + if (err) + return err; + + params.qp_handle = qp->qp_handle; + + if (qp_attr_mask & IB_QP_STATE) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE, + 1); + EFA_SET(¶ms.modify_mask, + EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1); + params.cur_qp_state = cur_state; + params.qp_state = new_state; + } + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + EFA_SET(¶ms.modify_mask, + EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1); + params.sq_drained_async_notify = qp_attr->en_sqd_async_notify; + } + + if (qp_attr_mask & IB_QP_QKEY) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1); + params.qkey = qp_attr->qkey; + } + + if (qp_attr_mask & IB_QP_SQ_PSN) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1); + params.sq_psn = qp_attr->sq_psn; + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + EFA_SET(¶ms.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY, + 1); + params.rnr_retry = qp_attr->rnr_retry; + } + + err = efa_com_modify_qp(&dev->edev, ¶ms); + if (err) + return err; + + qp->state = new_state; + + return 0; +} + +static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) +{ + struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx }; + + return efa_com_destroy_cq(&dev->edev, ¶ms); +} + +#if defined(HAVE_IB_VOID_DESTROY_CQ) || defined(HAVE_IB_INT_DESTROY_CQ) +#ifdef HAVE_IB_INT_DESTROY_CQ +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +#else +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibcq->device); + struct efa_cq *cq = to_ecq(ibcq); + + ibdev_dbg(&dev->ibdev, + "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); + + rdma_user_mmap_entry_remove(cq->mmap_entry); + efa_destroy_cq_idx(dev, cq->cq_idx); + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); +#ifndef HAVE_CQ_CORE_ALLOCATION + kfree(cq); +#endif +#ifdef HAVE_IB_INT_DESTROY_CQ + return 0; +#endif +} +#else +#ifdef HAVE_DESTROY_CQ_UDATA +int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +#else +int efa_destroy_cq(struct ib_cq *ibcq) +#endif +{ + struct efa_dev *dev = to_edev(ibcq->device); + struct efa_cq *cq = to_ecq(ibcq); + int err; + + ibdev_dbg(&dev->ibdev, + "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", + cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); + + rdma_user_mmap_entry_remove(cq->mmap_entry); + err = efa_destroy_cq_idx(dev, cq->cq_idx); + if (err) + return err; + + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + + kfree(cq); + return 0; +} +#endif + +static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, + struct efa_ibv_create_cq_resp *resp) +{ + resp->q_mmap_size = cq->size; + cq->mmap_entry = efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + virt_to_phys(cq->cpu_addr), + cq->size, EFA_MMAP_DMA_PAGE, + &resp->q_mmap_key); + if (!cq->mmap_entry) + return -ENOMEM; + + return 0; +} + +#ifdef HAVE_CREATE_CQ_ATTR +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +#else +int efa_create_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +#endif +{ +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + struct efa_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct efa_ucontext, ibucontext); +#else + struct efa_ucontext *ucontext = to_ecq(ibcq)->ucontext; +#endif + 
struct efa_ibv_create_cq_resp resp = {}; + struct efa_com_create_cq_params params; + struct efa_com_create_cq_result result; + struct ib_device *ibdev = ibcq->device; + struct efa_dev *dev = to_edev(ibdev); + struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq = to_ecq(ibcq); +#ifdef HAVE_CREATE_CQ_ATTR + int entries = attr->cqe; +#endif + int err; + + ibdev_dbg(ibdev, "create_cq entries %d\n", entries); + +#ifdef HAVE_CREATE_CQ_ATTR + if (attr->flags) + return -EOPNOTSUPP; +#endif + + if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { + ibdev_dbg(ibdev, + "cq: requested entries[%u] non-positive or greater than max[%u]\n", + entries, dev->dev_attr.max_cq_depth); + err = -EINVAL; + goto err_out; + } + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) { + ibdev_dbg(ibdev, + "Incompatible ABI params, no input udata\n"); + err = -EINVAL; + goto err_out; + } + + if (udata->inlen > sizeof(cmd) && +#ifdef HAVE_UVERBS_CMD_HDR_FIX + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd))) { +#else + /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ + !ib_is_udata_cleared(udata, sizeof(cmd), + udata->inlen - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr))) { +#endif + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n"); + goto err_out; + } + + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) { + ibdev_dbg(ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + err = -EINVAL; + goto err_out; + } + + if (!cmd.cq_entry_size) { + ibdev_dbg(ibdev, + "Invalid entry size [%u]\n", cmd.cq_entry_size); + err = -EINVAL; + goto err_out; + } + + if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) { + ibdev_dbg(ibdev, + "Invalid number of sub cqs[%u] expected[%u]\n", + cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq); + err = -EINVAL; + goto err_out; + } + + cq->ucontext = ucontext; + cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + if (!cq->cpu_addr) { + err = -ENOMEM; + goto err_out; + } + + params.uarn = cq->ucontext->uarn; + params.cq_depth = entries; + params.dma_addr = cq->dma_addr; + params.entry_size_in_bytes = cmd.cq_entry_size; + params.num_sub_cqs = cmd.num_sub_cqs; + err = efa_com_create_cq(&dev->edev, ¶ms, &result); + if (err) + goto err_free_mapped; + + resp.cq_idx = result.cq_idx; + cq->cq_idx = result.cq_idx; + cq->ibcq.cqe = result.actual_depth; + WARN_ON_ONCE(entries != result.actual_depth); + + err = cq_mmap_entries_setup(dev, cq, &resp); + if (err) { + ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", + cq->cq_idx); + goto err_destroy_cq; + } + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy udata for create_cq\n"); + goto err_remove_mmap; + } + } + + ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. 
dma[%pad] virt[0x%p]\n", + cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); + + return 0; + +err_remove_mmap: + rdma_user_mmap_entry_remove(cq->mmap_entry); +err_destroy_cq: + efa_destroy_cq_idx(dev, cq->cq_idx); +err_free_mapped: + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + +err_out: + atomic64_inc(&dev->stats.create_cq_err); + return err; +} + +#ifndef HAVE_CQ_CORE_ALLOCATION +#ifdef HAVE_CREATE_CQ_NO_UCONTEXT +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +#elif defined(HAVE_CREATE_CQ_ATTR) +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +#else +struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, int entries, + int vector, + struct ib_ucontext *ibucontext, + struct ib_udata *udata) +#endif +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_cq *cq; + int err; + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + atomic64_inc(&dev->stats.create_cq_err); + return ERR_PTR(-ENOMEM); + } + +#ifdef HAVE_UDATA_TO_DRV_CONTEXT + cq->ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, + ibucontext); +#else + cq->ucontext = to_eucontext(ibucontext); +#endif + + cq->ibcq.device = ibdev; +#ifdef HAVE_CREATE_CQ_ATTR + err = efa_create_cq(&cq->ibcq, attr, udata); +#else + err = efa_create_cq(&cq->ibcq, entries, udata); +#endif + if (err) + goto err_free_cq; + + return &cq->ibcq; + +err_free_cq: + kfree(cq); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + struct ib_block_iter biter; + unsigned int hp_idx = 0; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift)) + page_list[hp_idx++] = rdma_block_iter_dma_address(&biter); + + return 0; +} +#elif defined(HAVE_SG_DMA_PAGE_ITER) +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + struct sg_dma_page_iter sg_iter; + unsigned int page_idx = 0; + unsigned int hp_idx = 0; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter); + hp_idx++; + } + + page_idx++; + } + + return 0; +} +#elif defined(HAVE_UMEM_SCATTERLIST_IF) +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + unsigned int page_idx = 0; + unsigned int pages_in_sg; + unsigned int hp_idx = 0; + struct scatterlist *sg; + unsigned int entry; + unsigned int i; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + if (sg_dma_len(sg) & ~PAGE_MASK) { + ibdev_dbg(&dev->ibdev, + "sg_dma_len[%u] does not divide by PAGE_SIZE[%lu]\n", + sg_dma_len(sg), PAGE_SIZE); + return -EINVAL; + } + + pages_in_sg = sg_dma_len(sg) >> PAGE_SHIFT; + for (i = 0; i < pages_in_sg; i++) { + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = sg_dma_address(sg) + + i * 
PAGE_SIZE; + hp_idx++; + } + + page_idx++; + } + } + + return 0; +} +#else +#warning deprecated api +static int umem_to_page_list(struct efa_dev *dev, + struct ib_umem *umem, + u64 *page_list, + u32 hp_cnt, + u8 hp_shift) +{ + u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); + struct ib_umem_chunk *chunk; + unsigned int page_idx = 0; + unsigned int pages_in_sg; + unsigned int hp_idx = 0; + unsigned int entry; + unsigned int i; + + ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", + hp_cnt, pages_in_hp); + + list_for_each_entry(chunk, &umem->chunk_list, list) { + for (entry = 0; entry < chunk->nents; entry++) { + if (sg_dma_len(&chunk->page_list[entry]) & ~PAGE_MASK) { + ibdev_dbg(&dev->ibdev, + "sg_dma_len[%u] does not divide by PAGE_SIZE[%lu]\n", + sg_dma_len(&chunk->page_list[entry]), + PAGE_SIZE); + return -EINVAL; + } + + pages_in_sg = sg_dma_len(&chunk->page_list[entry]) + >> PAGE_SHIFT; + for (i = 0; i < pages_in_sg; i++) { + if (page_idx % pages_in_hp == 0) { + page_list[hp_idx] = + sg_dma_address(&chunk->page_list[entry]) + + i * PAGE_SIZE; + hp_idx++; + } + + page_idx++; + } + } + } + + return 0; +} +#endif + +static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) +{ + struct scatterlist *sglist; + struct page *pg; + int i; + + sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL); + if (!sglist) + return NULL; + sg_init_table(sglist, page_cnt); + for (i = 0; i < page_cnt; i++) { + pg = vmalloc_to_page(buf); + if (!pg) + goto err; + sg_set_page(&sglist[i], pg, PAGE_SIZE, 0); + buf += PAGE_SIZE / sizeof(*buf); + } + return sglist; + +err: + kfree(sglist); + return NULL; +} + +/* + * create a chunk list of physical pages dma addresses from the supplied + * scatter gather list + */ +static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; + struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; + unsigned int chunk_list_size, chunk_idx, payload_idx; + int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; + struct efa_com_ctrl_buff_info *ctrl_buf; + u64 *cur_chunk_buf, *prev_chunk_buf; +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + struct ib_block_iter biter; +#else + struct scatterlist *sg; + unsigned int entry, payloads_in_sg; +#endif + dma_addr_t dma_addr; + int i; + + /* allocate a chunk list that consists of 4KB chunks */ + chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK); + + chunk_list->size = chunk_list_size; + chunk_list->chunks = kcalloc(chunk_list_size, + sizeof(*chunk_list->chunks), + GFP_KERNEL); + if (!chunk_list->chunks) + return -ENOMEM; + + ibdev_dbg(&dev->ibdev, + "chunk_list_size[%u] - pages[%u]\n", chunk_list_size, + page_cnt); + + /* allocate chunk buffers: */ + for (i = 0; i < chunk_list_size; i++) { + chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL); + if (!chunk_list->chunks[i].buf) + goto chunk_list_dealloc; + + chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE; + } + chunk_list->chunks[chunk_list_size - 1].length = + ((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) + + EFA_CHUNK_PTR_SIZE; + + /* fill the dma addresses of sg list pages to chunks: */ + chunk_idx = 0; + payload_idx = 0; + cur_chunk_buf = chunk_list->chunks[0].buf; +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt, + EFA_CHUNK_PAYLOAD_SIZE) { + cur_chunk_buf[payload_idx++] = + rdma_block_iter_dma_address(&biter); + + if (payload_idx == 
EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } +#else + for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { + payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT; + for (i = 0; i < payloads_in_sg; i++) { + cur_chunk_buf[payload_idx++] = + (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) + + (EFA_CHUNK_PAYLOAD_SIZE * i); + + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; + } + } + } +#endif + + /* map chunks to dma and fill chunks next ptrs */ + for (i = chunk_list_size - 1; i >= 0; i--) { + dma_addr = dma_map_single(&dev->pdev->dev, + chunk_list->chunks[i].buf, + chunk_list->chunks[i].length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, + "chunk[%u] dma_map_failed\n", i); + goto chunk_list_unmap; + } + + chunk_list->chunks[i].dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "chunk[%u] mapped at [%pad]\n", i, &dma_addr); + + if (!i) + break; + + prev_chunk_buf = chunk_list->chunks[i - 1].buf; + + ctrl_buf = (struct efa_com_ctrl_buff_info *) + &prev_chunk_buf[EFA_PTRS_PER_CHUNK]; + ctrl_buf->length = chunk_list->chunks[i].length; + + efa_com_set_dma_addr(dma_addr, + &ctrl_buf->address.mem_addr_high, + &ctrl_buf->address.mem_addr_low); + } + + return 0; + +chunk_list_unmap: + for (; i < chunk_list_size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + } +chunk_list_dealloc: + for (i = 0; i < chunk_list_size; i++) + kfree(chunk_list->chunks[i].buf); + + kfree(chunk_list->chunks); + return -ENOMEM; +} + +static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; + int i; + + for (i = 0; i < chunk_list->size; i++) { + dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr, + chunk_list->chunks[i].length, DMA_TO_DEVICE); + kfree(chunk_list->chunks[i].buf); + } + + kfree(chunk_list->chunks); +} + +/* initialize pbl continuous mode: map pbl buffer to a dma address. */ +static int pbl_continuous_initialize(struct efa_dev *dev, + struct pbl_context *pbl) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, dma_addr)) { + ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n"); + return -ENOMEM; + } + + pbl->phys.continuous.dma_addr = dma_addr; + ibdev_dbg(&dev->ibdev, + "pbl continuous - dma_addr = %pad, size[%u]\n", + &dma_addr, pbl->pbl_buf_size_in_bytes); + + return 0; +} + +/* + * initialize pbl indirect mode: + * create a chunk list out of the dma addresses of the physical pages of + * pbl buffer. 
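+ * Indirect mode is used when the pbl buffer itself was allocated with
+ * vmalloc and is therefore not guaranteed to be physically contiguous
+ * (see pbl_create()).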
+ */ +static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) +{ + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + struct scatterlist *sgl; + int sg_dma_cnt, err; + + BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE); + sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages); + if (!sgl) + return -ENOMEM; + + sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); + if (!sg_dma_cnt) { + err = -EINVAL; + goto err_map; + } + + pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages; + pbl->phys.indirect.sgl = sgl; + pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt; + err = pbl_chunk_list_create(dev, pbl); + if (err) { + ibdev_dbg(&dev->ibdev, + "chunk_list creation failed[%d]\n", err); + goto err_chunk; + } + + ibdev_dbg(&dev->ibdev, + "pbl indirect - size[%u], chunks[%u]\n", + pbl->pbl_buf_size_in_bytes, + pbl->phys.indirect.chunk_list.size); + + return 0; + +err_chunk: + dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE); +err_map: + kfree(sgl); + return err; +} + +static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) +{ + pbl_chunk_list_destroy(dev, pbl); + dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl, + pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE); + kfree(pbl->phys.indirect.sgl); +} + +/* create a page buffer list from a mapped user memory region */ +static int pbl_create(struct efa_dev *dev, + struct pbl_context *pbl, +#ifdef HAVE_EFA_GDR + struct efa_mr *mr, +#else + struct ib_umem *umem, +#endif + int hp_cnt, + u8 hp_shift) +{ + int err; + + pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE; + pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL); + if (!pbl->pbl_buf) + return -ENOMEM; + + if (is_vmalloc_addr(pbl->pbl_buf)) { + pbl->physically_continuous = 0; +#ifdef HAVE_EFA_GDR + if (mr->umem) + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); + else + err = nvmem_to_page_list(dev, mr->nvmem, pbl->pbl_buf); +#else + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#endif + if (err) + goto err_free; + + err = pbl_indirect_initialize(dev, pbl); + if (err) + goto err_free; + } else { + pbl->physically_continuous = 1; +#ifdef HAVE_EFA_GDR + if (mr->umem) + err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, + hp_shift); + else + err = nvmem_to_page_list(dev, mr->nvmem, pbl->pbl_buf); +#else + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); +#endif + if (err) + goto err_free; + + err = pbl_continuous_initialize(dev, pbl); + if (err) + goto err_free; + } + + ibdev_dbg(&dev->ibdev, + "user_pbl_created: user_pages[%u], continuous[%u]\n", + hp_cnt, pbl->physically_continuous); + + return 0; + +err_free: + kvfree(pbl->pbl_buf); + return err; +} + +static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) +{ + if (pbl->physically_continuous) + dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, + pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); + else + pbl_indirect_terminate(dev, pbl); + + kvfree(pbl->pbl_buf); +} + +static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + + params->inline_pbl = 1; +#ifdef HAVE_EFA_GDR + if (mr->umem) + err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, + params->page_num, params->page_shift); + else + err = nvmem_to_page_list(dev, mr->nvmem, + params->pbl.inline_pbl_array); +#else + err = umem_to_page_list(dev, mr->umem, 
params->pbl.inline_pbl_array, + params->page_num, params->page_shift); +#endif + if (err) + return err; + + ibdev_dbg(&dev->ibdev, + "inline_pbl_array - pages[%u]\n", params->page_num); + + return 0; +} + +static int efa_create_pbl(struct efa_dev *dev, + struct pbl_context *pbl, + struct efa_mr *mr, + struct efa_com_reg_mr_params *params) +{ + int err; + +#ifdef HAVE_EFA_GDR + err = pbl_create(dev, pbl, mr, params->page_num, + params->page_shift); +#else + err = pbl_create(dev, pbl, mr->umem, params->page_num, + params->page_shift); +#endif + if (err) { + ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err); + return err; + } + + params->inline_pbl = 0; + params->indirect = !pbl->physically_continuous; + if (pbl->physically_continuous) { + params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes; + + efa_com_set_dma_addr(pbl->phys.continuous.dma_addr, + ¶ms->pbl.pbl.address.mem_addr_high, + ¶ms->pbl.pbl.address.mem_addr_low); + } else { + params->pbl.pbl.length = + pbl->phys.indirect.chunk_list.chunks[0].length; + + efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr, + ¶ms->pbl.pbl.address.mem_addr_high, + ¶ms->pbl.pbl.address.mem_addr_low); + } + + return 0; +} + +#ifndef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE +static unsigned long efa_cont_pages(struct ib_umem *umem, + unsigned long page_size_cap, + u64 addr) +{ + unsigned long max_page_shift = fls64(page_size_cap); +#ifndef HAVE_UMEM_SCATTERLIST_IF + struct ib_umem_chunk *chunk; +#else + struct scatterlist *sg; +#endif + u64 base = ~0, p = 0; + unsigned long tmp; + unsigned long m; + u64 len, pfn; + int i = 0; + int entry; + + addr = addr >> PAGE_SHIFT; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, BITS_PER_LONG); + m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); + +#ifndef HAVE_UMEM_SCATTERLIST_IF + list_for_each_entry(chunk, &umem->chunk_list, list) { + for (entry = 0; entry < chunk->nents; entry++) { + len = DIV_ROUND_UP(sg_dma_len(&chunk->page_list[entry]), + PAGE_SIZE); + pfn = sg_dma_address(&chunk->page_list[entry]) >> PAGE_SHIFT; + if (base + p != pfn) { + /* + * If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; + } + + p += len; + i += len; + } + } +#else + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE); + pfn = sg_dma_address(sg) >> PAGE_SHIFT; + if (base + p != pfn) { + /* + * If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; + } + + p += len; + i += len; + } +#endif + + if (i) + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + else + m = 0; + + return BIT(PAGE_SHIFT + m); +} +#endif + +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_com_reg_mr_params params = {}; + struct efa_com_reg_mr_result result = {}; + struct pbl_context pbl; + int supp_access_flags; + unsigned int pg_sz; + struct efa_mr *mr; + int inline_size; + int err; + +#ifndef HAVE_NO_KVERBS_DRIVERS + if (!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (udata && udata->inlen && +#ifdef HAVE_UVERBS_CMD_HDR_FIX + !ib_is_udata_cleared(udata, 0, 
sizeof(udata->inlen))) { +#else + /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ + !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen) - sizeof(struct ib_uverbs_cmd_hdr))) { +#endif + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, udata not cleared\n"); + err = -EINVAL; + goto err_out; + } + + supp_access_flags = + IB_ACCESS_LOCAL_WRITE | + (EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0); + +#ifdef HAVE_IB_ACCESS_OPTIONAL + access_flags &= ~IB_ACCESS_OPTIONAL; +#endif + if (access_flags & ~supp_access_flags) { + ibdev_dbg(&dev->ibdev, + "Unsupported access flags[%#x], supported[%#x]\n", + access_flags, supp_access_flags); + err = -EOPNOTSUPP; + goto err_out; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + +#ifdef HAVE_EFA_GDR + mr->nvmem = nvmem_get(dev, mr, start, length, &pg_sz); + if (!mr->nvmem) { +#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM + mr->umem = ib_umem_get(ibpd->device, start, length, + access_flags); +#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) + mr->umem = ib_umem_get(udata, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_UDATA) + mr->umem = ib_umem_get(udata, start, length, access_flags, 0); +#else + mr->umem = ib_umem_get(ibpd->uobject->context, start, length, + access_flags, 0); +#endif + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + ibdev_dbg(&dev->ibdev, + "Failed to pin and map user space memory[%d]\n", + err); + goto err_free; + } + +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->dev_attr.page_size_cap, + virt_addr); + if (!pg_sz) { + err = -EOPNOTSUPP; + ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", + dev->dev_attr.page_size_cap); + goto err_unmap; + } +#else + pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap, + virt_addr); +#endif + } +#else /* !defined(HAVE_EFA_GDR) */ +#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM + mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) + mr->umem = ib_umem_get(udata, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_UDATA) + mr->umem = ib_umem_get(udata, start, length, access_flags, 0); +#else + mr->umem = ib_umem_get(ibpd->uobject->context, start, length, + access_flags, 0); +#endif + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + ibdev_dbg(&dev->ibdev, + "Failed to pin and map user space memory[%d]\n", err); + goto err_free; + } +#endif /* defined(HAVE_EFA_GDR) */ + + params.pd = to_epd(ibpd)->pdn; + params.iova = virt_addr; + params.mr_length_in_bytes = length; + params.permissions = access_flags; + +#ifndef HAVE_EFA_GDR +#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->dev_attr.page_size_cap, + virt_addr); + if (!pg_sz) { + err = -EOPNOTSUPP; + ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", + dev->dev_attr.page_size_cap); + goto err_unmap; + } +#else + pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap, + virt_addr); +#endif /* defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) */ +#endif /* !defined(HAVE_EFA_GDR) */ + + params.page_shift = order_base_2(pg_sz); +#ifdef HAVE_IB_UMEM_NUM_DMA_BLOCKS + params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); +#else + params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)), + pg_sz); +#endif + + ibdev_dbg(&dev->ibdev, + "start %#llx length %#llx params.page_shift %u params.page_num %u\n", + start, length, params.page_shift, 
params.page_num); + + inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array); + if (params.page_num <= inline_size) { + err = efa_create_inline_pbl(dev, mr, ¶ms); + if (err) + goto err_unmap; + + err = efa_com_register_mr(&dev->edev, ¶ms, &result); + if (err) + goto err_unmap; + } else { + err = efa_create_pbl(dev, &pbl, mr, ¶ms); + if (err) + goto err_unmap; + + err = efa_com_register_mr(&dev->edev, ¶ms, &result); + pbl_destroy(dev, &pbl); + + if (err) + goto err_unmap; + } + + mr->ibmr.lkey = result.l_key; + mr->ibmr.rkey = result.r_key; +#ifdef HAVE_IB_MR_LENGTH + mr->ibmr.length = length; +#endif +#ifdef HAVE_EFA_GDR + if (mr->nvmem) { + mr->nvmem->lkey = result.l_key; + mr->nvmem->needs_dereg = true; + } +#endif + ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey); + + return &mr->ibmr; + +err_unmap: +#ifdef HAVE_EFA_GDR + if (mr->nvmem) + nvmem_release(dev, mr->nvmem, false); + else + ib_umem_release(mr->umem); +#else + ib_umem_release(mr->umem); +#endif +err_free: + kfree(mr); +err_out: + atomic64_inc(&dev->stats.reg_mr_err); + return ERR_PTR(err); +} + +#ifdef HAVE_DEREG_MR_UDATA +int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +#else +int efa_dereg_mr(struct ib_mr *ibmr) +#endif +{ + struct efa_dev *dev = to_edev(ibmr->device); + struct efa_com_dereg_mr_params params; + struct efa_mr *mr = to_emr(ibmr); + int err; + + ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey); + +#ifdef HAVE_EFA_GDR + if (mr->nvmem){ + err = nvmem_put(mr->nvmem_ticket, false); + if (err) + return err; + + kfree(mr); + return 0; + } +#endif + params.l_key = mr->ibmr.lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) + return err; + + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +#ifdef HAVE_GET_PORT_IMMUTABLE +int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) { + ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err); + return err; + } + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} +#endif + +static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) +{ + struct efa_com_dealloc_uar_params params = { + .uarn = uarn, + }; + + return efa_com_dealloc_uar(&dev->edev, ¶ms); +} + +#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \ + (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? 
\ + NULL : #_attr) + +static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext, + const struct efa_ibv_alloc_ucontext_cmd *cmd) +{ + struct efa_dev *dev = to_edev(ibucontext->device); + char *attr_str; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch, + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str)) + goto err; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR, + attr_str)) + goto err; + + return 0; + +err: + ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n", + attr_str); + return -EOPNOTSUPP; +} + +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + struct efa_ibv_alloc_ucontext_resp resp = {}; + struct efa_ibv_alloc_ucontext_cmd cmd = {}; + struct efa_com_alloc_uar_result result; + int err; + + /* + * it's fine if the driver does not know all request fields, + * we will ack input fields in our response. + */ + + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for alloc_ucontext\n"); + goto err_out; + } + + err = efa_user_comp_handshake(ibucontext, &cmd); + if (err) + goto err_out; + + err = efa_com_alloc_uar(&dev->edev, &result); + if (err) + goto err_out; + + ucontext->uarn = result.uarn; +#ifndef HAVE_CORE_MMAP_XA + mutex_init(&ucontext->lock); + INIT_LIST_HEAD(&ucontext->pending_mmaps); +#endif /* !defined(HAVE_CORE_MMAP_XA) */ + +#ifdef HAVE_IB_QUERY_DEVICE_UDATA + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; +#endif +#ifndef HAVE_CREATE_AH_NO_UDATA + resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; +#endif + resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; + resp.inline_buf_size = dev->dev_attr.inline_buf_size; + resp.max_llq_size = dev->dev_attr.max_llq_size; + resp.max_tx_batch = dev->dev_attr.max_tx_batch; + resp.min_sq_wr = dev->dev_attr.min_sq_depth; + + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) + goto err_dealloc_uar; + + return 0; + +err_dealloc_uar: + efa_dealloc_uar(dev, result.uarn); +err_out: + atomic64_inc(&dev->stats.alloc_ucontext_err); + return err; +} + +#ifndef HAVE_UCONTEXT_CORE_ALLOCATION +struct ib_ucontext *efa_kzalloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibdev); + struct efa_ucontext *ucontext; + int err; + + /* + * it's fine if the driver does not know all request fields, + * we will ack input fields in our response. 
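+	 * (The request copy and the compatibility handshake themselves are
+	 * done in efa_alloc_ucontext(), which is called below.)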
+ */ + + ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); + if (!ucontext) { + atomic64_inc(&dev->stats.alloc_ucontext_err); + return ERR_PTR(-ENOMEM); + } + + ucontext->ibucontext.device = ibdev; + err = efa_alloc_ucontext(&ucontext->ibucontext, udata); + if (err) + goto err_free_ucontext; + + return &ucontext->ibucontext; + +err_free_ucontext: + kfree(ucontext); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_UCONTEXT_CORE_ALLOCATION +void efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +#else +int efa_dealloc_ucontext(struct ib_ucontext *ibucontext) +#endif +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + +#ifndef HAVE_CORE_MMAP_XA + mmap_entries_remove_free(dev, ucontext); +#endif + efa_dealloc_uar(dev, ucontext->uarn); +#ifndef HAVE_UCONTEXT_CORE_ALLOCATION + kfree(ucontext); + + return 0; +#endif +} + +#ifdef HAVE_CORE_MMAP_XA +void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct efa_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} +#endif + +static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, + struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct efa_user_mmap_entry *entry; + unsigned long va; + int err = 0; + u64 pfn; + + rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, + "pgoff[%#lx] does not have valid entry\n", + vma->vm_pgoff); + atomic64_inc(&dev->stats.mmap_err); + return -EINVAL; + } + entry = to_emmap(rdma_entry); + + ibdev_dbg(&dev->ibdev, + "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag); + + pfn = entry->address >> PAGE_SHIFT; + switch (entry->mmap_flag) { + case EFA_MMAP_IO_NC: +#ifdef HAVE_CORE_MMAP_XA + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + rdma_entry); +#elif defined(HAVE_RDMA_USER_MMAP_IO) + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); +#else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + vma->vm_page_prot); +#endif + break; + case EFA_MMAP_IO_WC: +#ifdef HAVE_CORE_MMAP_XA + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot), + rdma_entry); +#elif defined(HAVE_RDMA_USER_MMAP_IO) + err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot)); +#else + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + entry->rdma_entry.npages * PAGE_SIZE, + vma->vm_page_prot); +#endif + break; + case EFA_MMAP_DMA_PAGE: + for (va = vma->vm_start; va < vma->vm_end; + va += PAGE_SIZE, pfn++) { + err = vm_insert_page(vma, va, pfn_to_page(pfn)); + if (err) + break; + } + break; + default: + err = -EINVAL; + } + + if (err) { + ibdev_dbg( + &dev->ibdev, + "Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n", + entry->address, rdma_entry->npages * PAGE_SIZE, + entry->mmap_flag, err); + atomic64_inc(&dev->stats.mmap_err); + } + + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +int efa_mmap(struct ib_ucontext *ibucontext, + struct vm_area_struct 
*vma) +{ + struct efa_ucontext *ucontext = to_eucontext(ibucontext); + struct efa_dev *dev = to_edev(ibucontext->device); + size_t length = vma->vm_end - vma->vm_start; + + ibdev_dbg(&dev->ibdev, + "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n", + vma->vm_start, vma->vm_end, length, vma->vm_pgoff); + + return __efa_mmap(dev, ucontext, vma); +} + +#ifdef HAVE_CREATE_AH_NO_UDATA +struct efa_ah_id { + struct list_head list; + /* dest_addr */ + u8 id[EFA_GID_SIZE]; + unsigned int ref_count; + u16 ah; +}; + +static inline bool efa_ah_id_equal(u8 *id1, u8 *id2) +{ + return !memcmp(id1, id2, EFA_GID_SIZE); +} + +/* Must be called with dev->ah_list_lock held */ +static int efa_get_ah_id(struct efa_dev *dev, u8 *id, bool ref_update, u16 *ah) +{ + struct efa_ah_id *ah_id; + + list_for_each_entry(ah_id, &dev->efa_ah_list, list) { + if (efa_ah_id_equal(ah_id->id, id)) { + if (ref_update) + ah_id->ref_count++; + if (ah) + *ah = ah_id->ah; + return 0; + } + } + + return -EINVAL; +} + +static void efa_put_ah_id(struct efa_dev *dev, u8 *id) +{ + struct efa_ah_id *ah_id, *tmp; + + mutex_lock(&dev->ah_list_lock); + list_for_each_entry_safe(ah_id, tmp, &dev->efa_ah_list, list) { + if (efa_ah_id_equal(ah_id->id, id)) { + ah_id->ref_count--; + if (!ah_id->ref_count) { + list_del(&ah_id->list); + kfree(ah_id); + mutex_unlock(&dev->ah_list_lock); + return; + } + } + } + mutex_unlock(&dev->ah_list_lock); +} + +/* Must be called with dev->ah_list_lock held */ +static struct efa_ah_id *efa_create_ah_id(struct efa_dev *dev, u8 *id, u16 ah) +{ + struct efa_ah_id *ah_id; + + ah_id = kzalloc(sizeof(*ah_id), GFP_KERNEL); + if (!ah_id) + return NULL; + + memcpy(ah_id->id, id, sizeof(ah_id->id)); + ah_id->ref_count = 1; + ah_id->ah = ah; + + return ah_id; +} + +static int efa_add_ah_id(struct efa_dev *dev, u8 *id, u16 ah) +{ + struct efa_ah_id *ah_id; + int err; + + mutex_lock(&dev->ah_list_lock); + err = efa_get_ah_id(dev, id, true, NULL); + if (err) { + ah_id = efa_create_ah_id(dev, id, ah); + if (!ah_id) { + err = -ENOMEM; + goto err_unlock; + } + + list_add_tail(&ah_id->list, &dev->efa_ah_list); + } + mutex_unlock(&dev->ah_list_lock); + + return 0; + +err_unlock: + mutex_unlock(&dev->ah_list_lock); + return err; +} +#endif + +static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) +{ + struct efa_com_destroy_ah_params params = { + .ah = ah->ah, + .pdn = to_epd(ah->ibah.pd)->pdn, + }; + + return efa_com_destroy_ah(&dev->edev, ¶ms); +} + +int efa_create_ah(struct ib_ah *ibah, +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_init_attr *init_attr, +#else +#ifdef HAVE_CREATE_AH_RDMA_ATTR + struct rdma_ah_attr *ah_attr, +#else + struct ib_ah_attr *ah_attr, +#endif + u32 flags, +#endif + struct ib_udata *udata) +{ +#ifdef HAVE_CREATE_AH_INIT_ATTR + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; +#endif + struct efa_dev *dev = to_edev(ibah->device); + struct efa_com_create_ah_params params = {}; +#ifndef HAVE_CREATE_AH_NO_UDATA + struct efa_ibv_create_ah_resp resp = {}; +#endif + struct efa_com_create_ah_result result; + struct efa_ah *ah = to_eah(ibah); + int err; + +#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) || defined(HAVE_CREATE_AH_INIT_ATTR) +#ifdef HAVE_CREATE_AH_INIT_ATTR + if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) { +#else + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) { +#endif + ibdev_dbg(&dev->ibdev, + "Create address handle is not supported in atomic context\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + +#ifndef HAVE_CREATE_AH_NO_UDATA +#ifndef HAVE_NO_KVERBS_DRIVERS + if 
(!udata) { + ibdev_dbg(&dev->ibdev, "udata is NULL\n"); + err = -EOPNOTSUPP; + goto err_out; + } +#endif + + if (udata->inlen && +#ifdef HAVE_UVERBS_CMD_HDR_FIX + !ib_is_udata_cleared(udata, 0, udata->inlen)) { +#else + /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ + !ib_is_udata_cleared(udata, 0, udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) { +#endif + ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); + err = -EINVAL; + goto err_out; + } +#endif + + memcpy(params.dest_addr, ah_attr->grh.dgid.raw, + sizeof(params.dest_addr)); + params.pdn = to_epd(ibah->pd)->pdn; + err = efa_com_create_ah(&dev->edev, ¶ms, &result); + if (err) + goto err_out; + + memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id)); + ah->ah = result.ah; + +#ifndef HAVE_CREATE_AH_NO_UDATA + resp.efa_address_handle = result.ah; + + if (udata->outlen) { + err = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Failed to copy udata for create_ah response\n"); + goto err_destroy_ah; + } + } +#else + err = efa_add_ah_id(dev, ah_attr->grh.dgid.raw, result.ah); + if (err) { + ibdev_dbg(&dev->ibdev, "Failed to add AH id\n"); + goto err_destroy_ah; + } +#endif + ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah); + + return 0; + +err_destroy_ah: + efa_ah_destroy(dev, ah); +err_out: + atomic64_inc(&dev->stats.create_ah_err); + return err; +} + +#ifndef HAVE_AH_CORE_ALLOCATION +#ifdef HAVE_CREATE_DESTROY_AH_FLAGS +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata) +#elif defined(HAVE_CREATE_AH_RDMA_ATTR) +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +#elif defined(HAVE_CREATE_AH_UDATA) +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct ib_ah_attr *ah_attr, + struct ib_udata *udata) +#else +struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, + struct ib_ah_attr *ah_attr) +#endif +{ + struct efa_ah *ah; + int err; +#ifndef HAVE_CREATE_DESTROY_AH_FLAGS + u32 flags = 0; +#endif +#ifdef HAVE_CREATE_AH_NO_UDATA + void *udata = NULL; +#endif + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->ibah.device = ibpd->device; + ah->ibah.pd = ibpd; + err = efa_create_ah(&ah->ibah, ah_attr, flags, udata); + if (err) + goto err_free; + + return &ah->ibah; + +err_free: + kfree(ah); + return ERR_PTR(err); +} +#endif + +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) +#elif defined(HAVE_AH_CORE_ALLOCATION) +void efa_destroy_ah(struct ib_ah *ibah, u32 flags) +#elif defined(HAVE_CREATE_DESTROY_AH_FLAGS) +int efa_destroy_ah(struct ib_ah *ibah, u32 flags) +#else +int efa_destroy_ah(struct ib_ah *ibah) +#endif +{ + struct efa_dev *dev = to_edev(ibah->pd->device); + struct efa_ah *ah = to_eah(ibah); +#ifndef HAVE_AH_CORE_ALLOCATION + int err; +#endif + + ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah); + +#if defined(HAVE_CREATE_DESTROY_AH_FLAGS) + if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) { + ibdev_dbg(&dev->ibdev, + "Destroy address handle is not supported in atomic context\n"); +#if defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) + return; +#else + return -EOPNOTSUPP; +#endif + } +#endif + +#if defined(HAVE_AH_CORE_ALLOCATION) || defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) + efa_ah_destroy(dev, ah); +#ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC + return 0; +#endif +#else + err = efa_ah_destroy(dev, ah); + if (err) 
+ return err; +#ifdef HAVE_CREATE_AH_NO_UDATA + efa_put_ah_id(dev, ah->id); +#endif + kfree(ah); + return 0; +#endif +} + +#ifdef HAVE_HW_STATS +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) +{ + return rdma_alloc_hw_stats_struct(efa_stats_names, + ARRAY_SIZE(efa_stats_names), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + struct efa_com_get_stats_params params = {}; + union efa_com_get_stats_result result; + struct efa_dev *dev = to_edev(ibdev); + struct efa_com_rdma_read_stats *rrs; + struct efa_com_messages_stats *ms; + struct efa_com_basic_stats *bs; + struct efa_com_stats_admin *as; + struct efa_stats *s; + int err; + + params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL; + params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC; + + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + bs = &result.basic_stats; + stats->value[EFA_TX_BYTES] = bs->tx_bytes; + stats->value[EFA_TX_PKTS] = bs->tx_pkts; + stats->value[EFA_RX_BYTES] = bs->rx_bytes; + stats->value[EFA_RX_PKTS] = bs->rx_pkts; + stats->value[EFA_RX_DROPS] = bs->rx_drops; + + params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES; + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + ms = &result.messages_stats; + stats->value[EFA_SEND_BYTES] = ms->send_bytes; + stats->value[EFA_SEND_WRS] = ms->send_wrs; + stats->value[EFA_RECV_BYTES] = ms->recv_bytes; + stats->value[EFA_RECV_WRS] = ms->recv_wrs; + + params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ; + err = efa_com_get_stats(&dev->edev, ¶ms, &result); + if (err) + return err; + + rrs = &result.rdma_read_stats; + stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs; + stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes; + stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err; + stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes; + + as = &dev->edev.aq.stats; + stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd); + stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd); + stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err); + stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion); + + s = &dev->stats; + stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd); + stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err); + stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err); + stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err); + stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err); + stats->value[EFA_ALLOC_UCONTEXT_ERR] = + atomic64_read(&s->alloc_ucontext_err); + stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err); + stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err); + + return ARRAY_SIZE(efa_stats_names); +} +#endif + +#ifndef HAVE_NO_KVERBS_DRIVERS +#ifdef HAVE_POST_CONST_WR +int efa_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +#else +int efa_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +#endif +{ + struct efa_dev *dev = to_edev(ibqp->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +#ifdef HAVE_POST_CONST_WR +int efa_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +#else +int efa_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +#endif +{ + struct efa_dev 
*dev = to_edev(ibqp->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +int efa_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + struct efa_dev *dev = to_edev(ibcq->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +int efa_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags) +{ + struct efa_dev *dev = to_edev(ibcq->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return -EOPNOTSUPP; +} + +struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + struct efa_dev *dev = to_edev(ibpd->device); + + ibdev_warn(&dev->ibdev.dev, "Function not supported\n"); + return ERR_PTR(-EOPNOTSUPP); +} +#endif + +enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, + u8 port_num) +{ + return IB_LINK_LAYER_UNSPECIFIED; +} + +#ifdef HAVE_CUSTOM_COMMANDS +#ifdef HAVE_CREATE_AH_NO_UDATA +ssize_t efa_everbs_cmd_get_ah(struct efa_dev *dev, + const char __user *buf, + int in_len, + int out_len) +{ + struct efa_everbs_get_ah_resp resp = {}; + struct efa_everbs_get_ah cmd = {}; + int err; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + if (cmd.comp_mask) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + return -EINVAL; + } + + mutex_lock(&dev->ah_list_lock); + err = efa_get_ah_id(dev, cmd.gid, false, &resp.efa_address_handle); + mutex_unlock(&dev->ah_list_lock); + if (err) { + ibdev_dbg(&dev->ibdev, + "Couldn't find AH with specified GID\n"); + return err; + } + + if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, + sizeof(resp))) + return -EFAULT; + + return in_len; +} +#endif + +#ifndef HAVE_IB_QUERY_DEVICE_UDATA +ssize_t efa_everbs_cmd_get_ex_dev_attrs(struct efa_dev *dev, + const char __user *buf, + int in_len, + int out_len) +{ + struct efa_com_get_device_attr_result *dev_attr = &dev->dev_attr; + struct efa_everbs_get_ex_dev_attrs_resp resp = {}; + struct efa_everbs_get_ex_dev_attrs cmd = {}; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_20)) { + ibdev_dbg(&dev->ibdev, + "Incompatible ABI params, unknown fields in udata\n"); + return -EINVAL; + } + + resp.max_sq_sge = dev_attr->max_sq_sge; + resp.max_rq_sge = dev_attr->max_rq_sge; + resp.max_sq_wr = dev_attr->max_sq_depth; + resp.max_rq_wr = dev_attr->max_rq_depth; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + return -EFAULT; + + return in_len; +} +#endif +#endif diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h new file mode 100644 index 0000000000000..60575885f33a2 --- /dev/null +++ b/drivers/amazon/net/efa/kcompat.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include "config.h" + +#if defined(HAVE_CREATE_AH_NO_UDATA) || !defined(HAVE_IB_QUERY_DEVICE_UDATA) +#define HAVE_CUSTOM_COMMANDS +#endif + +#ifndef ALIGN_DOWN +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#endif + +#ifndef HAVE_IB_IS_UDATA_CLEARED +#include +#include +#include + +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = udata->inbuf + offset; + bool ret = false; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + + if (copy_from_user(buf, p, len)) + goto free; + + ret = !memchr_inv(buf, 0, len); + +free: + kfree(buf); + return ret; +} +#endif + +#ifndef HAVE_IB_QPT_DRIVER +#define IB_QPT_DRIVER 0xFF +#endif + +#if defined(HAVE_DRIVER_ID) && !defined(HAVE_UPSTREAM_EFA) +#define RDMA_DRIVER_EFA 17 +#endif + +#ifndef HAVE_IBDEV_PRINT +#define ibdev_err(_ibdev, format, arg...) \ + dev_err(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_dbg(_ibdev, format, arg...) \ + dev_dbg(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_warn(_ibdev, format, arg...) \ + dev_warn(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_info(_ibdev, format, arg...) \ + dev_info(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#endif + +#ifndef HAVE_IBDEV_PRINT_RATELIMITED +#define ibdev_err_ratelimited(_ibdev, format, arg...) \ + dev_err_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_dbg_ratelimited(_ibdev, format, arg...) \ + dev_dbg_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_warn_ratelimited(_ibdev, format, arg...) \ + dev_warn_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#define ibdev_info_ratelimited(_ibdev, format, arg...) \ + dev_info_ratelimited(&((struct ib_device *)(_ibdev))->dev, format, ##arg) +#endif + +#ifndef HAVE_KVZALLOC +#include +#include + +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + void *addr; + + addr = kzalloc(size, flags | __GFP_NOWARN); + if (addr) + return addr; + + return vzalloc(size); +} +#endif + +#ifndef HAVE_IB_PORT_PHYS_STATE_LINK_UP +#define IB_PORT_PHYS_STATE_LINK_UP 5 +#endif + +#ifndef HAVE_CORE_MMAP_XA +#include +#include + +struct rdma_user_mmap_entry { + struct ib_ucontext *ucontext; + unsigned long start_pgoff; + size_t npages; +}; + +/* Return the offset (in bytes) the user should pass to libc's mmap() */ +static inline u64 +rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) +{ + return (u64)entry->start_pgoff << PAGE_SHIFT; +} + +/* + * Backported kernels don't keep refcnt on entries, hence they should not + * be removed. 
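+ * The rdma_user_mmap_entry_remove() and rdma_user_mmap_entry_put() stubs
+ * below are therefore intentional no-ops.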
+ */ +static inline void +rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) +{ +} + +static inline void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry) +{ +} +#endif + +#ifndef sizeof_field +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#endif + +#ifndef HAVE_BITFIELD_H +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define FIELD_PREP(_mask, _val) \ + ({ \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +#define FIELD_GET(_mask, _reg) \ + ({ \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) +#endif + +#ifndef HAVE_RDMA_NODE_UNSPECIFIED +enum { + RDMA_NODE_UNSPECIFIED = 7, +}; +#endif + +#ifndef HAVE_ATOMIC64_FETCH_INC +static __always_inline s64 +atomic64_fetch_inc(atomic64_t *v) +{ + return atomic64_inc_return(v) - 1; +} +#endif + +#if !defined(HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK) && defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) +#include + +static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, + struct ib_umem *umem, + unsigned long pgsz) +{ + __rdma_block_iter_start(biter, umem->sg_head.sgl, umem->nmap, pgsz); +} + +/** + * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem + * @umem: umem to iterate over + * @pgsz: Page size to split the list into + * + * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The + * returned DMA blocks will be aligned to pgsz and span the range: + * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) + * + * Performs exactly ib_umem_num_dma_blocks() iterations. + */ +#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ + for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ + __rdma_block_iter_next(biter);) +#endif + +#endif /* _KCOMPAT_H_ */ From ff287ae2831aa14d48f22b8a5b761434e268fdb7 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Tue, 30 Mar 2021 14:49:39 +0000 Subject: [PATCH 104/737] Revert "vmlinux.lds.h: Add PGO and AutoFDO input sections" This reverts commit eff8728fe69880d3f7983bec3fb6cea4c306261f. 
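In the TEXT_TEXT macro this restores the single combined input-section match, *(.text.hot TEXT_MAIN .text.fixup .text.unlikely), in place of the separate .text.hot.*, .text.unlikely.* and .text.unknown.* entries added by that commit.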
Signed-off-by: Anchal Agarwal --- include/asm-generic/vmlinux.lds.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 44103f9487c9a..de322620f5431 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -593,10 +593,7 @@ */ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ - *(.text.hot .text.hot.*) \ - *(TEXT_MAIN .text.fixup) \ - *(.text.unlikely .text.unlikely.*) \ - *(.text.unknown .text.unknown.*) \ + *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ NOINSTR_TEXT \ *(.text..refcount) \ *(.ref.text) \ From 53467e13a0505482bb3aa9d96435331f0d4ac49f Mon Sep 17 00:00:00 2001 From: Vladimir Aerov Date: Mon, 22 Feb 2021 15:56:51 -0800 Subject: [PATCH 105/737] arm64: Export acpi_psci_use_hvc() symbol Signed-off-by: Samuel Mendoza-Jonas --- arch/arm64/kernel/acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index cada0b816c8a3..765070aff31d2 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -117,6 +117,7 @@ bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } +EXPORT_SYMBOL_GPL(acpi_psci_use_hvc); /* * acpi_fadt_sanity_check() - Check FADT presence and carry out sanity From eba30fd148856113bae8ffac99b0d4de3617165f Mon Sep 17 00:00:00 2001 From: Vladimir Aerov Date: Mon, 22 Feb 2021 16:01:09 -0800 Subject: [PATCH 106/737] hwrng: Add Gravition RNG driver Signed-off-by: Samuel Mendoza-Jonas --- drivers/char/hw_random/Kconfig | 13 ++ drivers/char/hw_random/Makefile | 1 + drivers/char/hw_random/graviton-rng.c | 175 ++++++++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 drivers/char/hw_random/graviton-rng.c diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index a7d9e4600d40e..5ed8ef408cc09 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -536,6 +536,19 @@ config HW_RANDOM_XIPHERA To compile this driver as a module, choose M here: the module will be called xiphera-trng. +config HW_RANDOM_GRAVITON + tristate "AWS Graviton Random Number Generator support" + depends on HW_RANDOM && ACPI && (ARM64 || COMPILE_TEST) + default HW_RANDOM + help + This driver provides kernel-side support for the Random Number + Generator SMC found on AWS Graviton systems. + + To compile this driver as a module, choose M here: the + module will be called graviton-rng. + + If unsure, say Y. + endif # HW_RANDOM config UML_RANDOM diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index 5da344509a4df..9ca4a225b0bca 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_HW_RANDOM_OPTEE) += optee-rng.o obj-$(CONFIG_HW_RANDOM_NPCM) += npcm-rng.o obj-$(CONFIG_HW_RANDOM_CCTRNG) += cctrng.o obj-$(CONFIG_HW_RANDOM_XIPHERA) += xiphera-trng.o +obj-$(CONFIG_HW_RANDOM_GRAVITON) += graviton-rng.o diff --git a/drivers/char/hw_random/graviton-rng.c b/drivers/char/hw_random/graviton-rng.c new file mode 100644 index 0000000000000..3a8f3fe35359b --- /dev/null +++ b/drivers/char/hw_random/graviton-rng.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AWS Graviton TRNG driver + * + * Copyright (C) 2019 Amazon Corp. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x00ff) +#define AWS_GRAVITON_UUID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF01) +#define AWS_GRAVITON_GET_VER \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0xFF03) + +#define AWS_GRAVITON_GET_RND \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, 0x60) +#define AWS_GRAVITON_GET_RND_LEGACY \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, 0x60) + +/** + * UID of the Graviton TRNG API: eb4af8a0-89d4-49c9-bc8c5b38dc54308e + */ +#define GRVTN_TRNG_UUID_0 0xa0f83aeb +#define GRVTN_TRNG_UUID_1 0xc949d489 +#define GRVTN_TRNG_UUID_2 0x385b8cbc +#define GRVTN_TRNG_UUID_3 0x8e3054dc + +struct grvtn_rng { + u64 call_id; + struct hwrng rng; +}; + +static void grvtn_smccc_conduit(u64 call_id, struct arm_smccc_res *res) +{ + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(call_id, res); + else + arm_smccc_1_1_smc(call_id, res); +} + +static int grvtn_probe_sip_feature(unsigned long feature) +{ + struct arm_smccc_res res = {}; + + if (acpi_psci_use_hvc()) + arm_smccc_1_1_hvc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + else + arm_smccc_1_1_smc(ARM_SMCCC_SIP_GRAVITON_FEATURE_PROBE, + feature, 0, &res); + + return res.a0; +} + +static int grvtn_trng_read(struct hwrng *rng, void *buf, size_t max, bool wait) +{ + struct grvtn_rng *priv = (struct grvtn_rng *)rng->priv; + struct arm_smccc_res res; + int err = 0; + /* timeout after one waiting period */ + int iter_remain = 2; + size_t count = max > sizeof(ulong) * 2 ? 
sizeof(ulong) * 2 : max; + size_t total = count; + + do { + if (err && wait) + /* Nominal wait is 5us */ + udelay(err); + + grvtn_smccc_conduit(priv->call_id, &res); + + /* In the unlikely event of rolling back to legacy after probe was issued */ + if (unlikely((res.a0 == SMCCC_RET_NOT_SUPPORTED) && (priv->call_id != AWS_GRAVITON_GET_RND_LEGACY))) { + grvtn_smccc_conduit(AWS_GRAVITON_GET_RND_LEGACY, &res); + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; + } + + err = (int) res.a0; + + if (err < 0) + return err; + + iter_remain--; + } while (iter_remain && err && wait); + + if (err) + return 0; + + if (count > sizeof(ulong)) { + memcpy(buf, &res.a1, sizeof(ulong)); + count -= sizeof(ulong); + buf += sizeof(ulong); + } + memcpy(buf, &res.a2, count); + return total; +} + +static int grvtn_trng_probe(struct platform_device *pdev) +{ + int version; + int err; + struct arm_smccc_res res; + struct grvtn_rng *priv; + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->rng.name = "graviton"; + priv->rng.read = grvtn_trng_read; + priv->rng.priv = (unsigned long)priv; + priv->rng.quality = 1024; /* all bits are sourced from a HW TRNG */ + priv->call_id = AWS_GRAVITON_GET_RND_LEGACY; /* default mode is legacy */ + + grvtn_smccc_conduit(AWS_GRAVITON_UUID, &res); + + if (res.a0 != GRVTN_TRNG_UUID_0 || res.a1 != GRVTN_TRNG_UUID_1 || + res.a2 != GRVTN_TRNG_UUID_2 || res.a3 != GRVTN_TRNG_UUID_3) { + dev_err(&pdev->dev, "failed to match UUID\n"); + return -ENXIO; + } + + grvtn_smccc_conduit(AWS_GRAVITON_GET_VER, &res); + dev_info(&pdev->dev, "Graviton TRNG, SMC version %d.%d\n", + (u32)res.a0, (u32)res.a1); + + version = grvtn_probe_sip_feature(AWS_GRAVITON_GET_RND); + if (version > 0) + priv->call_id = AWS_GRAVITON_GET_RND; + + platform_set_drvdata(pdev, priv); + err = devm_hwrng_register(&pdev->dev, &priv->rng); + if (err) + dev_err(&pdev->dev, "failed to register hwrng"); + return err; +} + +static const struct acpi_device_id grvtn_trng_acpi_match[] = { + { "AMZN0010", }, + {} +}; + +MODULE_DEVICE_TABLE(acpi, grvtn_trng_acpi_match); + +static struct platform_driver grvtn_trng_driver = { + .probe = grvtn_trng_probe, + .driver = { + .name = "graviton-rng", + .owner = THIS_MODULE, + .acpi_match_table = ACPI_PTR(grvtn_trng_acpi_match), + }, +}; + +module_platform_driver(grvtn_trng_driver); + +MODULE_AUTHOR("Amazon.com, Inc. 
or it's affiliates"); +MODULE_DESCRIPTION("Graviton TRNG driver"); +MODULE_LICENSE("GPL v2"); From e49cb24e1505d5182df461d924694be6eaf47a11 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Mon, 26 Apr 2021 21:18:28 +0000 Subject: [PATCH 107/737] lustre: update to AmazonFSxLustreClient v2.10.8-7 Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/config.h | 65 +++-- .../lustrefsx/libcfs/include/libcfs/libcfs.h | 3 + .../libcfs/include/libcfs/libcfs_private.h | 5 +- .../include/libcfs/linux/linux-crypto.h | 5 + .../libcfs/include/libcfs/linux/linux-fs.h | 18 ++ .../libcfs/include/libcfs/linux/linux-mem.h | 38 +++ .../libcfs/include/libcfs/linux/linux-misc.h | 4 + .../libcfs/include/libcfs/linux/linux-net.h | 72 +++++ .../libcfs/include/libcfs/linux/linux-time.h | 33 +-- .../lustrefsx/libcfs/libcfs/libcfs_mem.c | 60 +++++ .../libcfs/libcfs/linux/linux-crypto-adler.c | 3 +- .../libcfs/libcfs/linux/linux-crypto-crc32.c | 4 +- .../libcfs/linux/linux-crypto-crc32c-pclmul.c | 4 +- .../libcfs/linux/linux-crypto-crc32pclmul.c | 3 +- .../libcfs/libcfs/linux/linux-curproc.c | 4 +- .../libcfs/libcfs/linux/linux-prim.c | 12 + .../staging/lustrefsx/libcfs/libcfs/module.c | 3 +- .../lustrefsx/libcfs/libcfs/tracefile.c | 12 + .../lustrefsx/lnet/include/lnet/lib-lnet.h | 26 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 189 ++++++------- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 10 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 6 +- .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 158 ++++------- .../lustrefsx/lnet/klnds/socklnd/socklnd.h | 2 + .../lnet/klnds/socklnd/socklnd_lib.c | 251 ++++++++---------- drivers/staging/lustrefsx/lnet/lnet/config.c | 177 ++++++------ .../staging/lustrefsx/lnet/lnet/lib-socket.c | 239 +++-------------- .../staging/lustrefsx/lnet/lnet/router_proc.c | 19 ++ .../staging/lustrefsx/lnet/selftest/conctl.c | 1 - .../staging/lustrefsx/lnet/selftest/conrpc.c | 6 +- .../staging/lustrefsx/lnet/selftest/conrpc.h | 1 - .../staging/lustrefsx/lnet/selftest/console.h | 1 - .../lustrefsx/lustre/fid/fid_handler.c | 2 +- .../staging/lustrefsx/lustre/fid/lproc_fid.c | 11 +- .../lustrefsx/lustre/fld/fld_internal.h | 2 +- .../staging/lustrefsx/lustre/fld/lproc_fld.c | 7 +- .../lustrefsx/lustre/include/lprocfs_status.h | 41 ++- .../lustre/include/lustre/ll_fiemap.h | 2 - .../staging/lustrefsx/lustre/include/lvfs.h | 1 - .../lustrefsx/lustre/include/obd_support.h | 4 +- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 24 +- .../lustrefsx/lustre/llite/llite_mmap.c | 27 +- drivers/staging/lustrefsx/lustre/llite/rw26.c | 4 +- .../staging/lustrefsx/lustre/llite/vvp_dev.c | 1 + .../staging/lustrefsx/lustre/llite/vvp_io.c | 48 ++-- .../staging/lustrefsx/lustre/llite/xattr.c | 17 +- .../lustrefsx/lustre/lmv/lmv_internal.h | 2 +- .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 11 +- .../lustrefsx/lustre/lov/lov_internal.h | 2 +- .../staging/lustrefsx/lustre/lov/lov_pool.c | 9 +- .../staging/lustrefsx/lustre/lov/lproc_lov.c | 11 +- .../lustre/obdclass/lprocfs_jobstats.c | 11 +- .../lustre/obdclass/lprocfs_status.c | 56 ++-- .../lustrefsx/lustre/obdclass/lu_ref.c | 11 +- .../lustrefsx/lustre/obdclass/obd_config.c | 5 +- .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 9 +- .../staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 9 - drivers/staging/lustrefsx/undef.h | 36 ++- 58 files changed, 910 insertions(+), 887 deletions(-) create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 
0d000a1f58fb3..2ecd0c99d3809 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -116,7 +116,7 @@ /* #undef HAVE_CANCEL_DIRTY_PAGE */ /* kernel has clean_bdev_aliases */ -#define HAVE_CLEAN_BDEV_ALIASES 1 +/* #undef HAVE_CLEAN_BDEV_ALIASES */ /* have clear_inode */ #define HAVE_CLEAR_INODE 1 @@ -292,8 +292,11 @@ /* filldir_t needs struct dir_context as argument */ #define HAVE_FILLDIR_USE_CTX 1 +/* FMR pool API is available */ +/* #undef HAVE_FMR_POOL_API */ + /* fpu/api.h is present */ -/* #undef HAVE_FPU_API_HEADER */ +#define HAVE_FPU_API_HEADER 1 /* struct file_system_type has mount field */ #define HAVE_FSTYPE_MOUNT 1 @@ -302,7 +305,7 @@ /* #undef HAVE_FS_STRUCT_RWLOCK */ /* fs_struct use seqcount */ -#define HAVE_FS_STRUCT_SEQCOUNT 1 +/* #undef HAVE_FS_STRUCT_SEQCOUNT */ /* full_name_hash need 3 arguments */ #define HAVE_FULL_NAME_HASH_3ARGS 1 @@ -365,7 +368,7 @@ #define HAVE_IB_DEVICE_ATTRS 1 /* if struct ib_device_ops is defined */ -#define HAVE_IB_DEVICE_OPS 1 +/* #undef HAVE_IB_DEVICE_OPS */ /* ib_get_dma_mr is defined */ /* #undef HAVE_IB_GET_DMA_MR */ @@ -466,9 +469,15 @@ /* i_uid_read is present */ #define HAVE_I_UID_READ 1 +/* jiffies_to_timespec64() is available */ +#define HAVE_JIFFIES_TO_TIMESPEC64 1 + /* kernel_locked is defined */ /* #undef HAVE_KERNEL_LOCKED */ +/* kernel_setsockopt still in use */ +/* #undef HAVE_KERNEL_SETSOCKOPT */ + /* 'struct sock' accept function requires bool argument */ #define HAVE_KERN_SOCK_ACCEPT_FLAG_ARG 1 @@ -548,9 +557,6 @@ /* 'ktime_to_timespec64' is available */ #define HAVE_KTIME_TO_TIMESPEC64 1 -/* ns_to_timespec64 is available */ -#define HAVE_NS_TO_TIMESPEC64 - /* enable use of ldiskfsprogs package */ /* #undef HAVE_LDISKFSPROGS */ @@ -579,7 +585,7 @@ #define HAVE_LINUX_RANDOM_H 1 /* if linux/selinux.h exists */ -#undef HAVE_LINUX_SELINUX_IS_ENABLED +/* #undef HAVE_LINUX_SELINUX_IS_ENABLED */ /* Define to 1 if you have the header file. */ #define HAVE_LINUX_TYPES_H 1 @@ -621,6 +627,9 @@ /* kernel has include/linux/migrate_mode.h */ /* #undef HAVE_MIGRATE_MODE_H */ +/* mmap_lock API is available. */ +#define HAVE_MMAP_LOCK 1 + /* kernel module loading is possible */ #define HAVE_MODULE_LOADING_SUPPORT 1 @@ -636,6 +645,9 @@ /* 'kernel_write' aligns with read/write helpers */ #define HAVE_NEW_KERNEL_WRITE 1 +/* NR_UNSTABLE_NFS is still in use. */ +/* #undef HAVE_NR_UNSTABLE_NFS */ + /* with oldsize */ /* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ @@ -646,7 +658,7 @@ #define HAVE_PAGEVEC_INIT_ONE_PARAM 1 /* have PCLMULQDQ instruction */ -/* #undef HAVE_PCLMULQDQ */ +#define HAVE_PCLMULQDQ 1 /* percpu_counter_init uses GFP_* flag */ #define HAVE_PERCPU_COUNTER_INIT_GFP_FLAG 1 @@ -663,6 +675,9 @@ /* posix_acl_valid takes struct user_namespace */ #define HAVE_POSIX_ACL_VALID_USER_NS 1 +/* struct proc_ops exists */ +#define HAVE_PROC_OPS 1 + /* proc_remove is defined */ #define HAVE_PROC_REMOVE 1 @@ -684,6 +699,9 @@ /* rdma_create_id wants 5 args */ #define HAVE_RDMA_CREATE_ID_5ARG 1 +/* rdma_reject has 4 arguments */ +#define HAVE_RDMA_REJECT_4ARGS 1 + /* reinit_completion is exist */ #define HAVE_REINIT_COMPLETION 1 @@ -718,10 +736,10 @@ /* #undef HAVE_SECURITY_IINITSEC_QSTR */ /* support for selinux */ -/* #undef HAVE_SELINUX */ +#define HAVE_SELINUX 1 /* Define to 1 if you have the header file. 
*/ -/* #undef HAVE_SELINUX_SELINUX_H */ +#define HAVE_SELINUX_SELINUX_H 1 /* support server */ /* #undef HAVE_SERVER_SUPPORT */ @@ -751,12 +769,6 @@ /* kernel has sk_sleep */ #define HAVE_SK_SLEEP 1 -/* sock_alloc_file is exported */ -/* #undef HAVE_SOCK_ALLOC_FILE */ - -/* sock_alloc_file takes 3 arguments */ -#define HAVE_SOCK_ALLOC_FILE_3ARGS 1 - /* sock_create_kern use net as first parameter */ #define HAVE_SOCK_CREATE_KERN_USE_NET 1 @@ -847,6 +859,9 @@ /* 'timespec64_to_ktime' is available */ #define HAVE_TIMESPEC64_TO_KTIME 1 +/* have_time_t */ +/* #undef HAVE_TIME_T */ + /* topology_sibling_cpumask is available */ #define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 @@ -880,6 +895,9 @@ /* kernel has vfs_unlink with 3 args */ #define HAVE_VFS_UNLINK_3ARGS 1 +/* __vmalloc only takes 2 args. */ +#define HAVE_VMALLOC_2ARGS 1 + /* virtual_address has been replaced by address field */ #define HAVE_VM_FAULT_ADDRESS 1 @@ -922,12 +940,6 @@ /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ -/* NR_UNSTABLE_NFS is still in use. */ -/* #undef HAVE_NR_UNSTABLE_NFS */ - -/* rdma_reject has 4 arguments */ -#define HAVE_RDMA_REJECT_4ARGS - /* ext4_journal_start takes 3 arguments */ /* #undef JOURNAL_START_HAS_3ARGS */ @@ -969,10 +981,10 @@ #define MKE2FS "mke2fs" /* need pclmulqdq based crc32c */ -#define NEED_CRC32C_ACCEL 1 +/* #undef NEED_CRC32C_ACCEL */ /* need pclmulqdq based crc32 */ -#define NEED_CRC32_ACCEL 1 +/* #undef NEED_CRC32_ACCEL */ /* 'ktime_get_real_ns' is not available */ /* #undef NEED_KTIME_GET_REAL_NS */ @@ -1004,6 +1016,9 @@ /* name of parallel fsck program */ #define PFSCK "fsck" +/* proc handler methods use __user */ +/* #undef PROC_HANDLER_USE_USER_ATTR */ + /* enable randomly alloc failure */ #define RANDOM_FAIL_ALLOC 1 diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h index 8055d37510921..f01170c6e1d97 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -144,4 +144,7 @@ static inline void *__container_of(const void *ptr, unsigned long shift) #endif /* __KERNEL__ */ +/* atomic-context safe vfree */ +void libcfs_vfree_atomic(const void *addr); + #endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h index bcd7d56b65a94..ebcdc990203b2 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -213,6 +213,9 @@ do { \ #define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) +void init_libcfs_vfree_atomic(void); +void exit_libcfs_vfree_atomic(void); + #define LIBCFS_FREE(ptr, size) \ do { \ int s = (size); \ @@ -225,7 +228,7 @@ do { \ CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ s, (ptr), libcfs_kmem_read()); \ if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - vfree(ptr); \ + libcfs_vfree_atomic(ptr); \ else \ kfree(ptr); \ } while (0) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h index 6346c59e516e7..a9c15a66ab207 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h @@ -26,6 +26,11 @@ * Copyright 2012 Xyratex Technology 
Limited */ +/* Added in v4.15-rc4 (commit a208fa8f3303) */ +#ifndef CRYPTO_ALG_OPTIONAL_KEY +#define CRYPTO_ALG_OPTIONAL_KEY 0x00004000 +#endif + /** * Linux crypto hash specific functions. */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h index 59d9874bbf978..dbc84de172146 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -99,4 +99,22 @@ static inline struct dentry *file_dentry(const struct file *file) #ifndef HAVE_POSIX_ACL_VALID_USER_NS #define posix_acl_valid(a,b) posix_acl_valid(b) #endif + +#ifdef HAVE_PROC_OPS +#define PROC_OWNER(_fn) +#else +#define proc_ops file_operations +#define PROC_OWNER(_owner) .owner = (_owner), +#define proc_open open +#define proc_read read +#define proc_write write +#define proc_lseek llseek +#define proc_release release +#define proc_poll poll +#define proc_ioctl unlocked_ioctl +#define proc_compat_ioctl compat_ioctl +#define proc_mmap mmap +#define proc_get_unmapped_area get_unmapped_area +#endif + #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h index 086d16baeaf13..f08d623bd8a84 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -129,4 +129,42 @@ void remove_shrinker(struct shrinker *shrinker) kfree(shrinker); } +#ifndef HAVE_MMAP_LOCK +static inline void mmap_write_lock(struct mm_struct *mm) +{ + down_write(&mm->mmap_sem); +} + +static inline bool mmap_write_trylock(struct mm_struct *mm) +{ + return down_write_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ + up_write(&mm->mmap_sem); +} + +static inline void mmap_read_lock(struct mm_struct *mm) +{ + down_read(&mm->mmap_sem); +} + +static inline bool mmap_read_trylock(struct mm_struct *mm) +{ + return down_read_trylock(&mm->mmap_sem) != 0; +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ + up_read(&mm->mmap_sem); +} +#endif + +#ifdef HAVE_VMALLOC_2ARGS +#define __ll_vmalloc(size, flags) __vmalloc(size, flags) +#else +#define __ll_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL) +#endif + #endif /* __LINUX_CFS_MEM_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h index 8b3d398459c74..754f183050485 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -60,6 +60,10 @@ #endif #endif /* HAVE_IOV_ITER_TYPE */ +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +bool selinux_is_enabled(void); +#endif + #ifndef HAVE_UIDGID_HEADER #ifndef _LINUX_UIDGID_H diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h new file mode 100644 index 0000000000000..41484bd3b44a4 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
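/*
 * Minimal usage sketch for the proc_ops compatibility macros defined in
 * linux-fs.h above (needs <linux/proc_fs.h> and <linux/seq_file.h>).  The
 * example_* names are hypothetical and only illustrate the pattern: on
 * kernels with struct proc_ops (v5.6+) this builds a real proc_ops, while
 * on older kernels the macros rewrite it into a struct file_operations
 * with an .owner field.
 */
static int example_proc_show(struct seq_file *m, void *data)
{
        seq_puts(m, "example\n");
        return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_proc_show, NULL);
}

static const struct proc_ops example_proc_fops = {
        PROC_OWNER(THIS_MODULE)
        .proc_open      = example_proc_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = single_release,
};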
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_NET_H__ +#define __LIBCFS_LINUX_NET_H__ + +#ifdef HAVE_KERNEL_SETSOCKOPT + +#include + +static inline void tcp_sock_set_quickack(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} + +static inline void tcp_sock_set_nodelay(struct sock *sk) +{ + int opt = 1; + struct socket *sock = sk->sk_socket; + + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&opt, sizeof(opt)); +} + +static inline int tcp_sock_set_keepidle(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&opt, sizeof(opt)); +} + +static inline int tcp_sock_set_keepintvl(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&opt, sizeof(opt)); +} + +static inline int tcp_sock_set_keepcnt(struct sock *sk, int opt) +{ + struct socket *sock = sk->sk_socket; + + return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&opt, sizeof(opt)); +} +#endif /* HAVE_KERNEL_SETSOCKOPT */ + +#endif /* __LIBCFS_LINUX_NET_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index d22bda9895fe7..07dd2e05a6083 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -143,29 +143,22 @@ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts6 #endif /* HAVE_TIMESPEC64 */ -#if __BITS_PER_LONG == 64 -#define time_t long -#else -#error "lustre is not supported on 32bit" +#ifndef HAVE_TIME_T +typedef __kernel_old_time_t time_t; #endif -#ifndef HAVE_NS_TO_TIMESPEC64 -static inline struct timespec64 ns_to_timespec64(const s64 nsec) +#ifndef HAVE_JIFFIES_TO_TIMESPEC64 +static inline void +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { - struct timespec64 ts; - s32 rem; - - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; + /* + * Convert jiffies to nanoseconds and separate with + * one divide. 
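/*
 * Usage sketch for the tcp_sock_set_*() emulation in linux-net.h above
 * (hypothetical caller, illustrative tunable values).  With these
 * wrappers the LND code can use one set of names everywhere: kernels
 * that still export kernel_setsockopt() compile the inline emulations,
 * while v5.8+ kernels provide the real helpers.
 */
static int example_tune_tcp(struct socket *sock)
{
        int rc;

        tcp_sock_set_nodelay(sock->sk);         /* disable Nagle */
        tcp_sock_set_quickack(sock->sk, 1);     /* ACK eagerly */

        rc = tcp_sock_set_keepidle(sock->sk, 30);
        if (rc)
                return rc;
        rc = tcp_sock_set_keepintvl(sock->sk, 5);
        if (rc)
                return rc;
        return tcp_sock_set_keepcnt(sock->sk, 3);
}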
+ */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; } #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c index 3e83f50579913..2f401e74a7dd7 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -33,6 +33,7 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include struct cfs_var_array { @@ -170,3 +171,62 @@ cfs_array_alloc(int count, unsigned int size) return (void *)&arr->va_ptrs[0]; } EXPORT_SYMBOL(cfs_array_alloc); + +/* + * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with + * minimum changes needed to work on older kernels too. + */ + +#ifndef raw_cpu_ptr +#define raw_cpu_ptr(p) __this_cpu_ptr(p) +#endif + +#ifndef llist_for_each_safe +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) +#endif + +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + vfree((void *)llnode); +} + +void libcfs_vfree_atomic(const void *addr) +{ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (!addr) + return; + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} +EXPORT_SYMBOL(libcfs_vfree_atomic); + +void __init init_libcfs_vfree_atomic(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vfree_deferred *p; + + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); + } +} + +void __exit exit_libcfs_vfree_atomic(void) +{ + flush_scheduled_work(); +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c index b71d7f8bc9d68..0f507d555e603 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -116,9 +116,10 @@ static struct shash_alg alg = { .cra_name = "adler32", .cra_driver_name = "adler32-zlib", .cra_priority = 100, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, + .cra_module = NULL, .cra_init = adler32_cra_init, } }; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c index 85fc287cb8847..c20e5e9a8194b 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -65,7 +65,6 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -129,9 +128,10 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-table", .cra_priority = 100, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, + .cra_module = NULL, .cra_init = crc32_cra_init, } }; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c 
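/*
 * Usage sketch for the deferred vfree added in libcfs_mem.c above
 * (hypothetical caller).  LIBCFS_FREE() now routes buffers larger than
 * LIBCFS_VMALLOC_SIZE through libcfs_vfree_atomic(), so they may be
 * released while holding a spinlock; the actual vfree() runs later from
 * a workqueue.
 */
static void example_drop_buffer(spinlock_t *lock, void *vmalloced_buf)
{
        spin_lock(lock);
        /* ... unlink vmalloced_buf from whatever structure owned it ... */
        libcfs_vfree_atomic(vmalloced_buf); /* llist_add + schedule_work, atomic-safe */
        spin_unlock(lock);
}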
b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c index 40d9e7416068b..5262f071b8a7a 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -63,7 +63,6 @@ static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -132,9 +131,10 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-pclmul", .cra_priority = 150, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, + .cra_module = NULL, .cra_init = crc32c_pclmul_cra_init, } }; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c index 88e697897b15d..4ad3b7c310037 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -102,7 +102,6 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -170,7 +169,7 @@ static struct shash_alg alg = { .cra_priority = 200, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, + .cra_module = NULL, .cra_init = crc32_pclmul_cra_init, } }; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c index 36a4fdef2dc24..cd00d0ae5717f 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -141,7 +141,7 @@ static int cfs_access_process_vm(struct task_struct *tsk, * which is already holding mmap_lock for writes. If some other * thread gets the write lock in the meantime, this thread will * block, but at least it won't deadlock on itself. 
LU-1735 */ - if (down_read_trylock(&mm->mmap_lock) == 0) + if (!mmap_read_trylock(mm)) return -EDEADLK; /* ignore errors, just check how much was successfully transferred */ @@ -181,7 +181,7 @@ static int cfs_access_process_vm(struct task_struct *tsk, buf += bytes; addr += bytes; } - up_read(&mm->mmap_lock); + mmap_read_unlock(mm); return buf - old_buf; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c index e63f7317485d9..4b73ed6e79a93 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -97,6 +97,18 @@ time64_t ktime_get_seconds(void) EXPORT_SYMBOL(ktime_get_seconds); #endif /* HAVE_KTIME_GET_SECONDS */ +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +static char **cfs_lsm_names; + +bool selinux_is_enabled(void) +{ + if (cfs_lsm_names) + return !!strstr("selinux", *cfs_lsm_names); + return false; +} +EXPORT_SYMBOL(selinux_is_enabled); +#endif + int cfs_kernel_write(struct file *filp, const void *buf, size_t count, loff_t *pos) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c index 910a44bc97f48..f832a6fd02bce 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -727,7 +727,7 @@ static void remove_proc(void) static int __init libcfs_init(void) { int rc; - + init_libcfs_vfree_atomic(); rc = libcfs_debug_init(5 * 1024 * 1024); if (rc < 0) { printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); @@ -816,6 +816,7 @@ static void __exit libcfs_exit(void) if (rc) printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", rc); + exit_libcfs_vfree_atomic(); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c index 798471bb694d9..ac762726fa5ce 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -737,8 +737,12 @@ int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, if (usr_buffer_nob > knl_buffer_nob) return -EOVERFLOW; +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_from_user(knl_buffer, usr_buffer, usr_buffer_nob)) return -EFAULT; +#else + memcpy(knl_buffer, usr_buffer, usr_buffer_nob); +#endif nob = strnlen(knl_buffer, usr_buffer_nob); while (nob-- >= 0) /* strip trailing whitespace */ @@ -767,12 +771,20 @@ int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, if (nob > usr_buffer_nob) nob = usr_buffer_nob; +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer, knl_buffer, nob)) return -EFAULT; +#else + memcpy(usr_buffer, knl_buffer, nob); +#endif if (append != NULL && nob < usr_buffer_nob) { +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer + nob, append, 1)) return -EFAULT; +#else + memcpy(usr_buffer + nob, append, 1); +#endif nob++; } diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h index 15b89a9f85042..c905eda43b5b8 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -37,9 +37,7 @@ #ifndef __LNET_LIB_LNET_H__ #define __LNET_LIB_LNET_H__ -#ifndef __KERNEL__ -# error This include is only for kernel use. 
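/*
 * Sketch of the buffer-copy pattern used in tracefile.c and
 * router_proc.c above (hypothetical helper).  Around v5.8 the
 * sysctl/proc handlers started receiving kernel buffers rather than
 * __user pointers, so copy_from_user()/copy_to_user() only remain
 * behind PROC_HANDLER_USE_USER_ATTR.
 */
static int example_copy_in(void *knl, const void *usr, size_t nob)
{
#ifdef PROC_HANDLER_USE_USER_ATTR
        if (copy_from_user(knl, (const void __user *)usr, nob))
                return -EFAULT;
#else
        memcpy(knl, usr, nob);
#endif
        return 0;
}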
-#endif +#include #include #include @@ -83,11 +81,6 @@ extern struct lnet the_lnet; /* THE network */ kernel_getsockname(sock, addr, addrlen) #endif -#ifndef HAVE_KERNEL_SETSOCKOPT -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *optval, unsigned int optlen); -#endif - static inline int lnet_is_route_alive(struct lnet_route *route) { if (!route->lr_gateway->lpni_alive) @@ -783,12 +776,17 @@ int lnet_acceptor_port(void); int lnet_acceptor_start(void); void lnet_acceptor_stop(void); -int lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask, - struct net *ns); -int lnet_ipif_enumerate(char ***names, struct net *ns); -void lnet_ipif_free_enumeration(char **names, int n); -int lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); -int lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); +struct lnet_inetdev { + u32 li_cpt; + u32 li_flags; + u32 li_ipaddr; + u32 li_netmask; + char li_name[IFNAMSIZ]; +}; + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns); +void lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); +void lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); int lnet_sock_getaddr(struct socket *socket, bool remote, __u32 *ip, int *port); int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c index ba4090556550f..90645f6388ea6 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -1472,10 +1472,13 @@ kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) { LASSERT(fpo->fpo_map_count == 0); +#ifdef HAVE_FMR_POOL_API if (fpo->fpo_is_fmr) { if (fpo->fmr.fpo_fmr_pool) ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); - } else { + } else +#endif /* HAVE_FMR_POOL_API */ + { struct kib_fast_reg_descriptor *frd, *tmp; int i = 0; @@ -1529,6 +1532,7 @@ kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, return max(IBLND_FMR_POOL_FLUSH, size); } +#ifdef HAVE_FMR_POOL_API static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) { struct ib_fmr_pool_param param = { @@ -1555,6 +1559,7 @@ static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) return rc; } +#endif /* HAVE_FMR_POOL_API */ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) { @@ -1667,6 +1672,7 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) } #endif +#ifdef HAVE_FMR_POOL_API /* Check for FMR or FastReg support */ fpo->fpo_is_fmr = 0; #ifdef HAVE_IB_DEVICE_OPS @@ -1682,7 +1688,9 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) #endif LCONSOLE_INFO("Using FMR for registration\n"); fpo->fpo_is_fmr = 1; - } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { LCONSOLE_INFO("Using FastReg for registration\n"); } else { rc = -ENOSYS; @@ -1690,9 +1698,11 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) goto out_dev_attr; } +#ifdef HAVE_FMR_POOL_API if (fpo->fpo_is_fmr) rc = kiblnd_alloc_fmr_pool(fps, fpo); else +#endif /* HAVE_FMR_POOL_API */ rc = kiblnd_alloc_freg_pool(fps, fpo); if (rc) goto out_fpo; @@ -1787,6 +1797,7 @@ 
kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now) return cfs_time_aftereq(now, fpo->fpo_deadline); } +#if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) static int kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) { @@ -1808,6 +1819,7 @@ kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) return npages; } +#endif void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) @@ -1817,12 +1829,13 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) kib_fmr_poolset_t *fps; cfs_time_t now = cfs_time_current(); kib_fmr_pool_t *tmp; - int rc; if (!fpo) return; fps = fpo->fpo_owner; + +#ifdef HAVE_FMR_POOL_API if (fpo->fpo_is_fmr) { if (fmr->fmr_pfmr) { ib_fmr_pool_unmap(fmr->fmr_pfmr); @@ -1830,10 +1843,12 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) } if (status) { - rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); + int rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); LASSERT(!rc); } - } else { + } else +#endif /* HAVE_FMR_POOL_API */ + { struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; if (frd) { @@ -1870,11 +1885,13 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob, __u64 iov, kib_fmr_t *fmr, bool *is_fastreg) { kib_fmr_pool_t *fpo; - __u64 *pages = tx->tx_pages; __u64 version; bool is_rx = (rd != tx->tx_rd); +#ifdef HAVE_FMR_POOL_API + __u64 *pages = tx->tx_pages; bool tx_pages_mapped = 0; int npages = 0; +#endif int rc; again: @@ -1884,6 +1901,8 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); fpo->fpo_map_count++; +#ifdef HAVE_FMR_POOL_API + fmr->fmr_pfmr = NULL; if (fpo->fpo_is_fmr) { struct ib_pool_fmr *pfmr; @@ -1906,7 +1925,9 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, return 0; } rc = PTR_ERR(pfmr); - } else { + } else +#endif /* HAVE_FMR_POOL_API */ + { *is_fastreg = 1; if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { struct kib_fast_reg_descriptor *frd; @@ -1953,7 +1974,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, #else n = ib_map_mr_sg(mr, tx->tx_frags, tx->tx_nfrags, PAGE_SIZE); -#endif +#endif /* HAVE_IB_MAP_MR_SG_5ARGS */ if (unlikely(n != tx->tx_nfrags)) { CERROR("Failed to map mr %d/%d " "elements\n", n, tx->tx_nfrags); @@ -1971,7 +1992,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, wr->key = is_rx ? mr->rkey : mr->lkey; wr->access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); -#else +#else /* HAVE_IB_MAP_MR_SG */ if (!tx_pages_mapped) { npages = kiblnd_map_tx_pages(tx, rd); tx_pages_mapped = 1; @@ -1998,11 +2019,10 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, wr->wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); -#endif +#endif /* HAVE_IB_MAP_MR_SG */ fmr->fmr_key = is_rx ? 
mr->rkey : mr->lkey; fmr->fmr_frd = frd; - fmr->fmr_pfmr = NULL; fmr->fmr_pool = fpo; return 0; } @@ -2875,59 +2895,6 @@ kiblnd_destroy_dev (kib_dev_t *dev) LIBCFS_FREE(dev, sizeof(*dev)); } -static kib_dev_t * -kiblnd_create_dev(char *ifname, struct net *ns) -{ - struct net_device *netdev; - kib_dev_t *dev; - __u32 netmask; - __u32 ip; - int up; - int rc; - - rc = lnet_ipif_query(ifname, &up, &ip, &netmask, ns); - if (rc != 0) { - CERROR("Can't query IPoIB interface %s: %d\n", - ifname, rc); - return NULL; - } - - if (!up) { - CERROR("Can't query IPoIB interface %s: it's down\n", ifname); - return NULL; - } - - LIBCFS_ALLOC(dev, sizeof(*dev)); - if (dev == NULL) - return NULL; - - netdev = dev_get_by_name(ns, ifname); - if (netdev == NULL) { - dev->ibd_can_failover = 0; - } else { - dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); - dev_put(netdev); - } - - INIT_LIST_HEAD(&dev->ibd_nets); - INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ - INIT_LIST_HEAD(&dev->ibd_fail_list); - dev->ibd_ifip = ip; - strcpy(&dev->ibd_ifname[0], ifname); - - /* initialize the device */ - rc = kiblnd_dev_failover(dev, ns); - if (rc != 0) { - CERROR("Can't initialize device: %d\n", rc); - LIBCFS_FREE(dev, sizeof(*dev)); - return NULL; - } - - list_add_tail(&dev->ibd_list, - &kiblnd_data.kib_devs); - return dev; -} - static void kiblnd_base_shutdown(void) { @@ -3208,8 +3175,7 @@ kiblnd_start_schedulers(struct kib_sched_info *sched) return rc; } -static int -kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts) +static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) { int cpt; int rc; @@ -3221,7 +3187,7 @@ kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts) cpt = (cpts == NULL) ? i : cpts[i]; sched = kiblnd_data.kib_scheds[cpt]; - if (!newdev && sched->ibs_nthreads > 0) + if (sched->ibs_nthreads > 0) continue; rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); @@ -3234,49 +3200,16 @@ kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts) return 0; } -static kib_dev_t * -kiblnd_dev_search(char *ifname) -{ - kib_dev_t *alias = NULL; - kib_dev_t *dev; - char *colon; - char *colon2; - - colon = strchr(ifname, ':'); - list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { - if (strcmp(&dev->ibd_ifname[0], ifname) == 0) - return dev; - - if (alias != NULL) - continue; - - colon2 = strchr(dev->ibd_ifname, ':'); - if (colon != NULL) - *colon = 0; - if (colon2 != NULL) - *colon2 = 0; - - if (strcmp(&dev->ibd_ifname[0], ifname) == 0) - alias = dev; - - if (colon != NULL) - *colon = ':'; - if (colon2 != NULL) - *colon2 = ':'; - } - return alias; -} - static int kiblnd_startup(struct lnet_ni *ni) { char *ifname; + struct lnet_inetdev *ifaces = NULL; kib_dev_t *ibdev = NULL; kib_net_t *net; unsigned long flags; int rc; - int newdev; - int node_id; + int i; LASSERT (ni->ni_net->net_lnd == &the_o2iblnd); @@ -3297,10 +3230,8 @@ kiblnd_startup(struct lnet_ni *ni) if (ni->ni_interfaces[0] != NULL) { /* Use the IPoIB interface specified in 'networks=' */ - - CLASSERT(LNET_NUM_INTERFACES > 1); if (ni->ni_interfaces[1] != NULL) { - CERROR("Multiple interfaces not supported\n"); + CERROR("ko2iblnd: Multiple interfaces not supported\n"); goto failed; } @@ -3314,24 +3245,51 @@ kiblnd_startup(struct lnet_ni *ni) goto failed; } - ibdev = kiblnd_dev_search(ifname); + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto failed; + + for (i = 0; i < rc; i++) { + if (strcmp(ifname, ifaces[i].li_name) == 
0) + break; + } + + if (i == rc) { + CERROR("ko2iblnd: No matching interfaces\n"); + rc = -ENOENT; + goto failed; + } + + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } + + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); - newdev = ibdev == NULL; - /* hmm...create kib_dev even for alias */ - if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) - ibdev = kiblnd_create_dev(ifname, ni->ni_net_ns); + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); - if (ibdev == NULL) + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); goto failed; + } - node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device); - ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); net->ibn_dev = ibdev; ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); - rc = kiblnd_dev_start_threads(ibdev, newdev, - ni->ni_cpts, ni->ni_ncpts); + ni->ni_dev_cpt = ifaces[i].li_cpt; + + rc = kiblnd_dev_start_threads(ibdev, ni->ni_cpts, ni->ni_ncpts); if (rc != 0) goto failed; @@ -3354,6 +3312,7 @@ kiblnd_startup(struct lnet_ni *ni) if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) kiblnd_destroy_dev(ibdev); + kfree(ifaces); kiblnd_shutdown(ni); CDEBUG(D_NET, "kiblnd_startup failed\n"); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h index c7dabdf6b98b4..7a9a1c3de16a4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -71,7 +71,9 @@ #include #include #include +#ifdef HAVE_FMR_POOL_API #include +#endif #define DEBUG_SUBSYSTEM S_LND @@ -334,24 +336,30 @@ typedef struct struct list_head fpo_list; /* chain on pool list */ struct kib_hca_dev *fpo_hdev; /* device for this pool */ kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ +#ifdef HAVE_FMR_POOL_API union { struct { struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ } fmr; +#endif struct { /* For fast registration */ struct list_head fpo_pool_list; int fpo_pool_size; } fast_reg; +#ifdef HAVE_FMR_POOL_API }; + int fpo_is_fmr; +#endif cfs_time_t fpo_deadline; /* deadline of this pool */ int fpo_failed; /* fmr pool is failed */ int fpo_map_count; /* # of mapped FMR */ - int fpo_is_fmr; } kib_fmr_pool_t; typedef struct { kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +#ifdef HAVE_FMR_POOL_API struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ +#endif /* HAVE_FMR_POOL_API */ struct kib_fast_reg_descriptor *fmr_frd; u32 fmr_key; } kib_fmr_t; diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index a27a83748c37d..707cb1510455d 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -580,7 +580,11 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob) static void kiblnd_unmap_tx(kib_tx_t *tx) { - if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd) + if ( +#ifdef HAVE_FMR_POOL_API + tx->fmr.fmr_pfmr || +#endif + tx->fmr.fmr_frd) kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); if (tx->tx_nfrags != 0) { diff 
--git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c index 541504ba88d1b..d0b8756143580 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -37,8 +37,8 @@ * Author: Eric Barton */ -#include #include "socklnd.h" +#include static struct lnet_lnd the_ksocklnd; ksock_nal_data_t ksocknal_data; @@ -2620,63 +2620,6 @@ ksocknal_shutdown(struct lnet_ni *ni) ksocknal_base_shutdown(); } -static int -ksocknal_enumerate_interfaces(ksock_net_t *net, struct net *ns) -{ - char **names; - int i; - int j; - int rc; - int n; - - n = lnet_ipif_enumerate(&names, ns); - if (n <= 0) { - CERROR("Can't enumerate interfaces: %d\n", n); - return n; - } - - for (i = j = 0; i < n; i++) { - int up; - __u32 ip; - __u32 mask; - - if (!strcmp(names[i], "lo")) /* skip the loopback IF */ - continue; - - rc = lnet_ipif_query(names[i], &up, &ip, &mask, ns); - if (rc != 0) { - CWARN("Can't get interface %s info: %d\n", - names[i], rc); - continue; - } - - if (!up) { - CWARN("Ignoring interface %s (down)\n", - names[i]); - continue; - } - - if (j == LNET_NUM_INTERFACES) { - CWARN("Ignoring interface %s (too many interfaces)\n", - names[i]); - continue; - } - - net->ksnn_interfaces[j].ksni_ipaddr = ip; - net->ksnn_interfaces[j].ksni_netmask = mask; - strlcpy(net->ksnn_interfaces[j].ksni_name, - names[i], sizeof(net->ksnn_interfaces[j].ksni_name)); - j++; - } - - lnet_ipif_free_enumeration(names, n); - - if (j == 0) - CERROR("Can't find any usable interfaces\n"); - - return j; -} - static int ksocknal_search_new_ipif(ksock_net_t *net) { @@ -2796,10 +2739,10 @@ int ksocknal_startup(struct lnet_ni *ni) { ksock_net_t *net; - int rc; - int i; - struct net_device *net_dev; - int node_id; + ksock_interface_t *ksi = NULL; + struct lnet_inetdev *ifaces = NULL; + int i = 0; + int rc; LASSERT (ni->ni_net->net_lnd == &the_ksocklnd); @@ -2829,52 +2772,69 @@ ksocknal_startup(struct lnet_ni *ni) } - if (ni->ni_interfaces[0] == NULL) { - rc = ksocknal_enumerate_interfaces(net, ni->ni_net_ns); - if (rc <= 0) - goto fail_1; + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); + if (rc < 0) + goto fail_1; + if (!ni->ni_interfaces[0]) { + ksi = &net->ksnn_interfaces[0]; + + /* Use the first discovered interface */ net->ksnn_ninterfaces = 1; + ni->ni_dev_cpt = ifaces[0].li_cpt; + ksi->ksni_ipaddr = ifaces[0].li_ipaddr; + ksi->ksni_netmask = ifaces[0].li_netmask; + strlcpy(ksi->ksni_name, ifaces[0].li_name, + sizeof(ksi->ksni_name)); } else { + /* Before Multi-Rail ksocklnd would manage + * multiple interfaces with its own tcp bonding. + * If we encounter an old configuration using + * this tcp bonding approach then we need to + * handle more than one ni_interfaces. + * + * In Multi-Rail configuration only ONE ni_interface + * should exist. Each IP alias should be mapped to + * each 'struct net_ni'. 
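/*
 * Sketch of the enumeration pattern both ksocklnd and ko2iblnd now
 * follow (hypothetical caller).  lnet_inet_enumerate() returns the
 * number of usable IPv4 interfaces and a kmalloc'd array of
 * struct lnet_inetdev; the caller selects one (here by name) and must
 * kfree() the array when done.
 */
static int example_find_iface(struct net *ns, const char *name, __u32 *ipaddr)
{
        struct lnet_inetdev *ifaces = NULL;
        int rc, i;

        rc = lnet_inet_enumerate(&ifaces, ns);
        if (rc < 0)
                return rc;      /* -ENOENT when nothing usable was found */

        for (i = 0; i < rc; i++) {
                if (strcmp(ifaces[i].li_name, name) == 0) {
                        *ipaddr = ifaces[i].li_ipaddr;
                        kfree(ifaces);
                        return 0;
                }
        }

        kfree(ifaces);
        return -ENOENT;
}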
+ */ for (i = 0; i < LNET_NUM_INTERFACES; i++) { - int up; + int j; - if (ni->ni_interfaces[i] == NULL) + if (!ni->ni_interfaces[i]) break; - rc = lnet_ipif_query(ni->ni_interfaces[i], &up, - &net->ksnn_interfaces[i].ksni_ipaddr, - &net->ksnn_interfaces[i].ksni_netmask, - ni->ni_net_ns); - - if (rc != 0) { - CERROR("Can't get interface %s info: %d\n", - ni->ni_interfaces[i], rc); - goto fail_1; - } - - if (!up) { - CERROR("Interface %s is down\n", - ni->ni_interfaces[i]); - goto fail_1; + for (j = 0; j < LNET_NUM_INTERFACES; j++) { + if (i != j && ni->ni_interfaces[j] && + strcmp(ni->ni_interfaces[i], + ni->ni_interfaces[j]) == 0) { + rc = -EEXIST; + CERROR("ksocklnd: found duplicate %s at %d and %d, rc = %d\n", + ni->ni_interfaces[i], i, j, rc); + goto fail_1; + } } - strlcpy(net->ksnn_interfaces[i].ksni_name, - ni->ni_interfaces[i], - sizeof(net->ksnn_interfaces[i].ksni_name)); + for (j = 0; j < rc; j++) { + if (strcmp(ifaces[j].li_name, + ni->ni_interfaces[i]) != 0) + continue; + ksi = &net->ksnn_interfaces[j]; + ni->ni_dev_cpt = ifaces[j].li_cpt; + ksi->ksni_ipaddr = ifaces[j].li_ipaddr; + ksi->ksni_netmask = ifaces[j].li_netmask; + strlcpy(ksi->ksni_name, ifaces[j].li_name, + sizeof(ksi->ksni_name)); + net->ksnn_ninterfaces++; + break; + } + } + /* ni_interfaces don't map to all network interfaces */ + if (!ksi || net->ksnn_ninterfaces != i) { + CERROR("ksocklnd: requested %d but only %d interfaces found\n", + i, net->ksnn_ninterfaces); + goto fail_1; } - net->ksnn_ninterfaces = i; - } - - net_dev = dev_get_by_name(ni->ni_net_ns, - net->ksnn_interfaces[0].ksni_name); - if (net_dev != NULL) { - node_id = dev_to_node(&net_dev->dev); - ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); - dev_put(net_dev); - } else { - ni->ni_dev_cpt = CFS_CPT_ANY; } /* call it before add it to ksocknal_data.ksnd_nets */ @@ -2882,8 +2842,8 @@ ksocknal_startup(struct lnet_ni *ni) if (rc != 0) goto fail_1; - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), - net->ksnn_interfaces[0].ksni_ipaddr); + LASSERT(ksi); + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ksi->ksni_ipaddr); list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); ksocknal_data.ksnd_nnets++; diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h index 4668fc162ba34..12d6cb83ef4ac 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -55,6 +55,8 @@ #include #include +#include + #ifdef HAVE_TCP_SENDPAGE_USE_SOCKET # define cfs_tcp_sendpage(sk, page, offset, size, flags) \ tcp_sendpage((sk)->sk_socket, page, offset, size, flags) diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c index b72bf19541308..91a9cf05e2ad8 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -183,16 +183,14 @@ ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) void ksocknal_lib_eager_ack (ksock_conn_t *conn) { - int opt = 1; - struct socket *sock = conn->ksnc_sock; + struct socket *sock = conn->ksnc_sock; - /* Remind the socket to ACK eagerly. If I don't, the socket might - * think I'm about to send something it could piggy-back the ACK - * on, introducing delay in completing zero-copy sends in my - * peer_ni. */ + /* Remind the socket to ACK eagerly. 
If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK on, + * introducing delay in completing zero-copy sends in my peer_ni. + */ - kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char *)&opt, sizeof(opt)); + tcp_sock_set_quickack(sock->sk, 1); } int @@ -421,162 +419,132 @@ ksocknal_lib_csum_tx(ksock_tx_t *tx) int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) { - struct socket *sock = conn->ksnc_sock; - int len; - int rc; + struct socket *sock = conn->ksnc_sock; + struct tcp_sock *tp = tcp_sk(sock->sk); + + if (ksocknal_connsock_addref(conn) < 0) { + LASSERT(conn->ksnc_closing); + *txmem = 0; + *rxmem = 0; + *nagle = 0; + return -ESHUTDOWN; + } - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - *txmem = *rxmem = *nagle = 0; - return (-ESHUTDOWN); - } + lnet_sock_getbuf(sock, txmem, rxmem); - rc = lnet_sock_getbuf(sock, txmem, rxmem); - if (rc == 0) { - len = sizeof(*nagle); - rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)nagle, &len); - } + *nagle = !(tp->nonagle & TCP_NAGLE_OFF); - ksocknal_connsock_decref(conn); + ksocknal_connsock_decref(conn); - if (rc == 0) - *nagle = !*nagle; - else - *txmem = *rxmem = *nagle = 0; - return (rc); + return 0; } int ksocknal_lib_setup_sock (struct socket *sock) { - int rc; - int option; - int keep_idle; - int keep_intvl; - int keep_count; - int do_keepalive; - struct linger linger; + int rc; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct tcp_sock *tp = tcp_sk(sock->sk); - sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_allocation = GFP_NOFS; - /* Ensure this socket aborts active sends immediately when we close - * it. */ + /* Ensure this socket aborts active sends immediately when closed. 
*/ + sock_reset_flag(sock->sk, SOCK_LINGER); - linger.l_onoff = 0; - linger.l_linger = 0; + tp->linger2 = -1; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&linger, sizeof(linger)); - if (rc != 0) { - CERROR ("Can't set SO_LINGER: %d\n", rc); - return (rc); - } + if (!*ksocknal_tunables.ksnd_nagle) + tcp_sock_set_nodelay(sock->sk); - option = -1; - rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR ("Can't set SO_LINGER2: %d\n", rc); - return (rc); - } - - if (!*ksocknal_tunables.ksnd_nagle) { - option = 1; - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR ("Can't disable nagle: %d\n", rc); - return (rc); - } - } - - rc = lnet_sock_setbuf(sock, - *ksocknal_tunables.ksnd_tx_buffer_size, - *ksocknal_tunables.ksnd_rx_buffer_size); - if (rc != 0) { - CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", - *ksocknal_tunables.ksnd_tx_buffer_size, - *ksocknal_tunables.ksnd_rx_buffer_size, rc); - return (rc); - } + lnet_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ #ifdef SOCKNAL_BACKOFF - if (*ksocknal_tunables.ksnd_backoff_init > 0) { - option = *ksocknal_tunables.ksnd_backoff_init; + if (*ksocknal_tunables.ksnd_backoff_init > 0) { + int option = *ksocknal_tunables.ksnd_backoff_init; #ifdef SOCKNAL_BACKOFF_MS - option *= 1000; + option *= 1000; #endif rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_INIT, (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR ("Can't set initial tcp backoff %d: %d\n", - option, rc); - return (rc); - } - } + if (rc != 0) { + CERROR("Can't set initial tcp backoff %d: %d\n", + option, rc); + return rc; + } + } - if (*ksocknal_tunables.ksnd_backoff_max > 0) { - option = *ksocknal_tunables.ksnd_backoff_max; + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + int option = *ksocknal_tunables.ksnd_backoff_max; #ifdef SOCKNAL_BACKOFF_MS - option *= 1000; + option *= 1000; #endif rc = kernel_setsockopt(sock, SOL_TCP, TCP_BACKOFF_MAX, (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR ("Can't set maximum tcp backoff %d: %d\n", - option, rc); - return (rc); - } - } + if (rc != 0) { + CERROR("Can't set maximum tcp backoff %d: %d\n", + option, rc); + return rc; + } + } #endif - /* snapshot tunables */ - keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; - keep_count = *ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; - do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); - option = (do_keepalive ? 1 : 0); - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); - return (rc); - } - - if (!do_keepalive) - return (0); - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keep_idle, sizeof(keep_idle)); - if (rc != 0) { - CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); - return (rc); - } +#ifdef HAVE_KERNEL_SETSOCKOPT + /* open-coded version doesn't work in all kernels, and + * there is no helper function, so call kernel_setsockopt() + * directly. 
+ */ + { + int option = (do_keepalive ? 1 : 0); + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof(option)); + } +#else + if (sock->sk->sk_prot->keepalive) + sock->sk->sk_prot->keepalive(sock->sk, do_keepalive); + if (do_keepalive) + sock_set_flag(sock->sk, SOCK_KEEPOPEN); + else + sock_reset_flag(sock->sk, SOCK_KEEPOPEN); +#endif /* HAVE_KERNEL_SETSOCKOPT */ + + if (!do_keepalive) + return (0); + + rc = tcp_sock_set_keepidle(sock->sk, keep_idle); + if (rc != 0) { + CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); + return rc; + } - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keep_intvl, sizeof(keep_intvl)); - if (rc != 0) { - CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); - return (rc); - } + rc = tcp_sock_set_keepintvl(sock->sk, keep_intvl); + if (rc != 0) { + CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); + return rc; + } - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keep_count, sizeof(keep_count)); - if (rc != 0) { - CERROR ("Can't set TCP_KEEPCNT: %d\n", rc); - return (rc); - } + rc = tcp_sock_set_keepcnt(sock->sk, keep_count); + if (rc != 0) { + CERROR("Can't set TCP_KEEPCNT: %d\n", rc); + return rc; + } - return (0); + return (0); } void @@ -585,30 +553,27 @@ ksocknal_lib_push_conn (ksock_conn_t *conn) struct sock *sk; struct tcp_sock *tp; int nonagle; - int val = 1; int rc; - rc = ksocknal_connsock_addref(conn); - if (rc != 0) /* being shut down */ - return; + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; sk = conn->ksnc_sock->sk; tp = tcp_sk(sk); - lock_sock (sk); - nonagle = tp->nonagle; - tp->nonagle = 1; - release_sock (sk); + lock_sock(sk); + nonagle = tp->nonagle; + tp->nonagle = TCP_NAGLE_OFF; + release_sock(sk); - rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - LASSERT (rc == 0); + tcp_sock_set_nodelay(conn->ksnc_sock->sk); - lock_sock (sk); - tp->nonagle = nonagle; - release_sock (sk); + lock_sock(sk); + tp->nonagle = nonagle; + release_sock(sk); - ksocknal_connsock_decref(conn); + ksocknal_connsock_decref(conn); } extern void ksocknal_read_callback (ksock_conn_t *conn); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c index 8b790353c60a5..2f90e90849ac3 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/config.c +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -31,6 +31,8 @@ */ #define DEBUG_SUBSYSTEM S_LNET + +#include #include #include #include @@ -1599,113 +1601,136 @@ lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) *networksp = networks; return count; } - -static void -lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) -{ - LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); -} - -static int -lnet_ipaddr_enumerate(__u32 **ipaddrsp, struct net *ns) +/* + * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 + * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu + * and removed for_ifa and endfor_ifa. + * Use the _rntl variant as the current locking is rtnl. 
+ */ +#ifdef in_dev_for_each_ifa_rtnl +#define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa +#define endfor_ifa(in_dev) +#else +#define DECLARE_CONST_IN_IFADDR(ifa) +#define in_dev_for_each_ifa_rtnl(ifa, in_dev) for_ifa((in_dev)) +#endif + +int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) { - int up; - __u32 netmask; - __u32 *ipaddrs; - __u32 *ipaddrs2; - int nip; - char **ifnames; - int nif = lnet_ipif_enumerate(&ifnames, ns); - int i; - int rc; - - if (nif <= 0) - return nif; - - LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); - if (ipaddrs == NULL) { - CERROR("Can't allocate ipaddrs[%d]\n", nif); - lnet_ipif_free_enumeration(ifnames, nif); - return -ENOMEM; - } - - for (i = nip = 0; i < nif; i++) { - if (!strcmp(ifnames[i], "lo")) + struct lnet_inetdev *ifaces = NULL; + struct net_device *dev; + int nalloc = 0; + int nip = 0; + DECLARE_CONST_IN_IFADDR(ifa); + + rtnl_lock(); + for_each_netdev(ns, dev) { + int flags = dev_get_flags(dev); + struct in_device *in_dev; + int node_id; + int cpt; + + if (flags & IFF_LOOPBACK) /* skip the loopback IF */ continue; - rc = lnet_ipif_query(ifnames[i], &up, - &ipaddrs[nip], &netmask, ns); - if (rc != 0) { - CWARN("Can't query interface %s: %d\n", - ifnames[i], rc); + if (!(flags & IFF_UP)) { + CWARN("lnet: Ignoring interface %s: it's down\n", + dev->name); continue; } - if (!up) { - CWARN("Ignoring interface %s: it's down\n", - ifnames[i]); + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + CWARN("lnet: Interface %s has no IPv4 status.\n", + dev->name); continue; } - nip++; - } - - lnet_ipif_free_enumeration(ifnames, nif); - - if (nip == nif) { - *ipaddrsp = ipaddrs; - } else { - if (nip > 0) { - LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); - if (ipaddrs2 == NULL) { - CERROR("Can't allocate ipaddrs[%d]\n", nip); - nip = -ENOMEM; - } else { - memcpy(ipaddrs2, ipaddrs, - nip * sizeof(*ipaddrs)); - *ipaddrsp = ipaddrs2; - rc = nip; + node_id = dev_to_node(&dev->dev); + cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id); + + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (nip >= nalloc) { + struct lnet_inetdev *tmp; + + nalloc += LNET_NUM_INTERFACES; + tmp = krealloc(ifaces, nalloc * sizeof(*tmp), + GFP_KERNEL); + if (!tmp) { + kfree(ifaces); + ifaces = NULL; + nip = -ENOMEM; + goto unlock_rtnl; + } + ifaces = tmp; } + + ifaces[nip].li_cpt = cpt; + ifaces[nip].li_flags = flags; + ifaces[nip].li_ipaddr = ntohl(ifa->ifa_local); + ifaces[nip].li_netmask = ntohl(ifa->ifa_mask); + strlcpy(ifaces[nip].li_name, ifa->ifa_label, + sizeof(ifaces[nip].li_name)); + nip++; } - lnet_ipaddr_free_enumeration(ipaddrs, nif); + endfor_ifa(in_dev); + } +unlock_rtnl: + rtnl_unlock(); + + if (nip == 0) { + CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n"); + nip = -ENOENT; } + + *dev_list = ifaces; return nip; } +EXPORT_SYMBOL(lnet_inet_enumerate); int lnet_parse_ip2nets (char **networksp, char *ip2nets) { + struct lnet_inetdev *ifaces = NULL; __u32 *ipaddrs = NULL; - int nip; + int nip; int rc; + int i; - nip = lnet_ipaddr_enumerate(&ipaddrs, current->nsproxy->net_ns); - + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); if (nip < 0) { - LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP " - "interfaces for ip2nets to match\n", nip); + if (nip != -ENOENT) { + LCONSOLE_ERROR_MSG(0x117, + "Error %d enumerating local IP interfaces for ip2nets to match\n", + nip); + } else { + LCONSOLE_ERROR_MSG(0x118, + "No local IP interfaces for ip2nets to match\n"); + } return nip; } - if (nip == 0) { - 
LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces " - "for ip2nets to match\n"); - return -ENOENT; + LIBCFS_ALLOC(ipaddrs, nip * sizeof(*ipaddrs)); + if (!ipaddrs) { + rc = -ENOMEM; + CERROR("lnet: Can't allocate ipaddrs[%d], rc = %d\n", + nip, rc); + goto out_free_addrs; } - rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); - lnet_ipaddr_free_enumeration(ipaddrs, nip); + for (i = 0; i < nip; i++) + ipaddrs[i] = ifaces[i].li_ipaddr; + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); if (rc < 0) { LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); - return rc; - } - - if (rc == 0) { + } else if (rc == 0) { LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " "any local IP interfaces\n"); - return -ENOENT; + rc = -ENOENT; } - - return 0; + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +out_free_addrs: + kfree(ifaces); + return rc > 0 ? 0 : rc; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index b01cdd55193aa..973587a2a1dc5 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -40,26 +40,30 @@ #include #include +#include #include +#include #include -#include - -#ifndef HAVE_KERNEL_SETSOCKOPT -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *val, unsigned int optlen) -{ - sockptr_t optval = KERNEL_SOCKPTR(val); - int err; +/* + * kernel 5.1: commit 7f1bc6e95d7840d4305595b3e4025cddda88cee5 + * Y2038 64-bit time. + * SO_TIMESTAMP, SO_TIMESTAMPNS and SO_TIMESTAMPING options, the + * way they are currently defined, are not y2038 safe. + * Subsequent patches in the series add new y2038 safe versions + * of these options which provide 64 bit timestamps on all + * architectures uniformly. + * Hence, rename existing options with OLD tag suffixes. + * + * NOTE: When updating to timespec64 change change these to '_NEW'. + * + */ +#ifndef SO_SNDTIMEO +#define SO_SNDTIMEO SO_SNDTIMEO_OLD +#endif - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, optval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, optval, - optlen); - return err; -} -EXPORT_SYMBOL(kernel_setsockopt); +#ifndef SO_RCVTIMEO +#define SO_RCVTIMEO SO_RCVTIMEO_OLD #endif static int @@ -76,164 +80,11 @@ lnet_sock_create_kern(struct socket **sock, struct net *ns) return rc; } -int -lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask, struct net *ns) -{ - struct net_device *dev; - struct in_device *in_dev; - const struct in_ifaddr *ifa; - unsigned int flags; - char *colon, *ifname; - int ret; - size_t slen; - - /* - * Copy the interface name, since we may be about to modify it. - */ - slen = strlen(name) + 1; - ifname = kzalloc(slen, GFP_KERNEL); - if (ifname == NULL) - return -ENOMEM; - - memcpy(ifname, name, slen); - colon = strchr(ifname, ':'); - if (colon) - *colon = 0; - - dev_load(ns, ifname); - ret = -ENODEV; - - rtnl_lock(); - - dev = __dev_get_by_name(ns, ifname); - - if (colon) - *colon = ':'; - - if (dev == NULL) { - CERROR("Can't find interface %s\n", name); - goto out; - } - - flags = dev_get_flags(dev); - if ((flags & IFF_UP) == 0) { - CDEBUG(D_NET, "Interface %s down\n", name); - *up = 0; - *ip = *mask = 0; - ret = 0; - goto out; - } - - /* - * Only support IPv4, so just walk the list of IPv4 assigned - * addresses to a device. 
- */ - in_dev = __in_dev_get_rtnl(dev); - - in_dev_for_each_ifa_rtnl(ifa, in_dev) { - if (!strcmp(ifa->ifa_label, ifname)) - break; - } - - if (ifa != NULL) { - *up = 1; - *mask = ntohl(ifa->ifa_mask); - *ip = ntohl(ifa->ifa_local); - ret = 0; - } else { - CERROR("Can't get mask/ip for interface %s\n", name); - } - -out: - rtnl_unlock(); - kfree(ifname); - return ret; -} -EXPORT_SYMBOL(lnet_ipif_query); - -void -lnet_ipif_free_enumeration(char **names, int n) -{ - LIBCFS_FREE(names, PAGE_SIZE / IFNAMSIZ); - LIBCFS_FREE(names[0], PAGE_SIZE); -} -EXPORT_SYMBOL(lnet_ipif_free_enumeration); - -int -lnet_ipif_enumerate(char ***namesp, struct net *ns) -{ - char **names; - char *space; - const struct in_ifaddr *ifa; - struct net_device *dev; - struct in_device *in_dev; - int maxifs, nifs, toobig; - size_t used, slen; - - maxifs = PAGE_SIZE / IFNAMSIZ; - nifs = 0; - used = 0; - toobig = 0; - - /* - * For simplicity, just allocate the maximum number of names - * that can be dealt with. The free function will ignore the - * arg - */ - LIBCFS_ALLOC(names, maxifs * sizeof (*names)); - if (names == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(space, PAGE_SIZE); - if (space == NULL) { - LIBCFS_FREE(names, maxifs * sizeof (*names)); - return -ENOMEM; - } - - /* - * Only IPv4 is supported, so just loop all network - * devices, and loop the IPv4 interfaces (addresses) - * assigned to each device. - */ - rtnl_lock(); - for_each_netdev(ns, dev) { - in_dev = __in_dev_get_rtnl(dev); - if (!in_dev) - continue; - - in_dev_for_each_ifa_rtnl(ifa, in_dev) { - nifs++; - if (toobig) - continue; - - if (nifs > maxifs) { - toobig = 1; - continue; - } - - slen = strlen(ifa->ifa_label) + 1; - if (used + slen > PAGE_SIZE) { - toobig = 1; - continue; - } - memcpy(space + used, ifa->ifa_label, slen); - names[nifs - 1] = space + used; - used += slen; - } - } - rtnl_unlock(); - - *namesp = names; - - return nifs; -} -EXPORT_SYMBOL(lnet_ipif_enumerate); - int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) { int rc; - long jiffies_left = cfs_time_seconds(timeout); + long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); unsigned long then; LASSERT(nob > 0); @@ -252,6 +103,7 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) if (timeout != 0) { struct sock *sk = sock->sk; + /* Set send timeout to remaining time */ lock_sock(sk); sk->sk_sndtimeo = jiffies_left; release_sock(sk); @@ -286,7 +138,7 @@ int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) { int rc; - long jiffies_left = cfs_time_seconds(timeout); + long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); unsigned long then; LASSERT(nob > 0); @@ -334,9 +186,8 @@ lnet_sock_create(struct socket **sockp, int *fatal, __u32 local_ip, int local_port, struct net *ns) { struct sockaddr_in locaddr; - struct socket *sock; - int rc; - int option; + struct socket *sock; + int rc; /* All errors are fatal except bind failure if the port is in use */ *fatal = 1; @@ -348,13 +199,7 @@ lnet_sock_create(struct socket **sockp, int *fatal, return rc; } - option = 1; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; - } + sock->sk->sk_reuseport = 1; if (local_ip != 0 || local_port != 0) { memset(&locaddr, 0, sizeof(locaddr)); @@ -383,34 +228,21 @@ lnet_sock_create(struct socket **sockp, int *fatal, return rc; } -int +void lnet_sock_setbuf(struct socket *sock, int txbufsize, int 
rxbufsize) { - int option; - int rc; + struct sock *sk = sock->sk; if (txbufsize != 0) { - option = txbufsize; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR("Can't set send buffer %d: %d\n", - option, rc); - return rc; - } + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = txbufsize; + sk->sk_write_space(sk); } if (rxbufsize != 0) { - option = rxbufsize; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR("Can't set receive buffer %d: %d\n", - option, rc); - return rc; - } + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_sndbuf = rxbufsize; } - return 0; } EXPORT_SYMBOL(lnet_sock_setbuf); @@ -445,16 +277,13 @@ lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) } EXPORT_SYMBOL(lnet_sock_getaddr); -int -lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) +void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) { if (txbufsize != NULL) *txbufsize = sock->sk->sk_sndbuf; if (rxbufsize != NULL) *rxbufsize = sock->sk->sk_rcvbuf; - - return 0; } EXPORT_SYMBOL(lnet_sock_getbuf); diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c index da73b32ea9371..b7d513521b433 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -244,9 +244,14 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { +#else + memcpy(buffer, tmpstr, len); + { +#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -381,9 +386,14 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { +#else + memcpy(buffer, tmpstr, len); + { +#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -577,9 +587,13 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else +#else + memcpy(buffer, tmpstr, len); +#endif *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); } @@ -784,9 +798,14 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ + +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else +#else + memcpy(buffer, tmpstr, len); +#endif *ppos += 1; } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c index e7b9d05d8cd32..9e60d0d671df2 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conctl.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -38,7 +38,6 @@ #include #include -#include #include "console.h" static int diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c index 996acd87528eb..a1ef9ada96804 100644 --- 
a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -476,7 +476,7 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, srpc_msg_t *msg; lstcon_node_t *nd; cfs_duration_t dur; - struct timespec64 ts; + struct timespec64 ts; int error; LASSERT(head_up != NULL); @@ -501,8 +501,8 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, nd = crpc->crp_node; - dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, - (cfs_time_t)console_session.ses_id.ses_stamp); + dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, + (cfs_time_t)console_session.ses_id.ses_stamp); jiffies_to_timespec64(dur, &ts); if (copy_to_user(&ent->rpe_peer, diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h index 3ac70050b29a0..fd56e648491ce 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -42,7 +42,6 @@ #include #include #include -#include #include "rpc.h" #include "selftest.h" diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h index 0d597c45cb469..ae76a50b4d173 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.h +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -43,7 +43,6 @@ #include #include #include -#include #include "selftest.h" #include "conrpc.h" diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c index 18ac0209737c7..ef61772f0dcb2 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -458,7 +458,7 @@ LU_KEY_INIT_FINI(seq, struct seq_thread_info); /* context key: seq_thread_key */ LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); -extern const struct proc_ops seq_fld_proc_seq_fops; +extern const struct file_operations seq_fld_proc_seq_fops; static int seq_server_proc_init(struct lu_server_seq *seq) { diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c index 6e5df75b37c9d..d95888f15cfcb 100644 --- a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -496,11 +496,12 @@ static ssize_t fldb_seq_write(struct file *file, const char __user *buf, RETURN(rc < 0 ? 
rc : len); } -const struct proc_ops seq_fld_proc_seq_fops = { - .proc_open = fldb_seq_open, - .proc_read = seq_read, - .proc_write = fldb_seq_write, - .proc_release = fldb_seq_release, +const struct file_operations seq_fld_proc_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .write = fldb_seq_write, + .release = fldb_seq_release, }; #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h index 0be28746d6efc..dcb24a3c2f22a 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -179,7 +179,7 @@ void fld_server_mod_exit(void); int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len); #ifdef CONFIG_PROC_FS -extern const struct proc_ops fld_proc_seq_fops; +extern const struct file_operations fld_proc_seq_fops; extern struct lprocfs_vars fld_server_proc_list[]; #endif diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c index 269d8d3976065..926ed5598052b 100644 --- a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -356,9 +356,10 @@ static int fldb_seq_release(struct inode *inode, struct file *file) } const struct file_operations fld_proc_seq_fops = { - .proc_open = fldb_seq_open, - .proc_read = seq_read, - .proc_release = fldb_seq_release, + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .release = fldb_seq_release, }; struct lprocfs_vars fld_server_proc_list[] = { diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index 6a58c7129b033..a9d6342f1b6c3 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -46,10 +46,11 @@ #include #include +#include #include /* - * Linux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 + * Liuux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 * Now that proc and debugfs use separate operation vector types * separate containers are also needed. */ @@ -57,16 +58,15 @@ struct lprocfs_vars { const char *name; const struct proc_ops *fops; void *data; - /** /proc file mode. */ + /* /proc file mode. */ mode_t proc_mode; }; -/** Provide a debugfs container */ struct ldebugfs_vars { const char *name; const struct file_operations *fops; void *data; - /** debugfs file mode. */ + /* debugfs file mode. 
*/ mode_t proc_mode; }; @@ -490,7 +490,7 @@ static inline int lprocfs_exp_cleanup(struct obd_export *exp) #endif extern struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, - void *data, const struct proc_ops *fops); + void *data, const struct proc_ops *ops); extern struct proc_dir_entry * lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, const char *format, ...); @@ -549,7 +549,7 @@ static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) extern int lprocfs_obd_setup(struct obd_device *dev); extern int lprocfs_obd_cleanup(struct obd_device *obd); #ifdef HAVE_SERVER_SUPPORT -extern const struct proc_ops lprocfs_evict_client_fops; +extern const struct file_operations lprocfs_evict_client_fops; #endif extern int ldebugfs_seq_create(struct dentry *parent, const char *name, @@ -557,12 +557,10 @@ extern int ldebugfs_seq_create(struct dentry *parent, const char *name, const struct file_operations *seq_fops, void *data); extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, - mode_t mode, - const struct proc_ops *seq_fops, + mode_t mode, const struct proc_ops *seq_fops, void *data); extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, - mode_t mode, - const struct proc_ops *seq_fops, + mode_t mode, const struct proc_ops *seq_fops, void *data); /* Generic callbacks */ @@ -691,12 +689,13 @@ static int name##_single_open(struct inode *inode, struct file *file) \ return single_open(file, name##_seq_show, \ inode->i_private ? : PDE_DATA(inode)); \ } \ -static const struct proc_ops name##_fops = { \ - .proc_open = name##_single_open, \ - .proc_read = seq_read, \ - .proc_write = custom_seq_write, \ - .proc_lseek = seq_lseek, \ - .proc_release = lprocfs_single_release, \ +static const struct proc_ops name##_fops = { \ + PROC_OWNER(THIS_MODULE) \ + .proc_open = name##_single_open, \ + .proc_read = seq_read, \ + .proc_write = custom_seq_write, \ + .proc_lseek = seq_lseek, \ + .proc_release = lprocfs_single_release, \ } #define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) @@ -736,10 +735,10 @@ static const struct proc_ops name##_fops = { \ return single_open(file, NULL, \ inode->i_private ? 
: PDE_DATA(inode));\ } \ - static const struct proc_ops name##_##type##_fops = { \ - .proc_open = name##_##type##_open, \ - .proc_write = name##_##type##_write, \ - .proc_release = lprocfs_single_release, \ + static const struct proc_ops name##_##type##_fops = { \ + .proc_open = name##_##type##_open, \ + .proc_write = name##_##type##_write, \ + .proc_release = lprocfs_single_release, \ }; struct lustre_attr { @@ -893,7 +892,7 @@ static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } static inline struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, - void *data, const struct proc_ops *fops) + void *data, const struct file_operations *fops) {return 0; } static inline struct proc_dir_entry * lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h index 9ec06c7fb8049..6f57a20a6a8ab 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -41,9 +41,7 @@ #ifndef _LUSTRE_FIEMAP_H #define _LUSTRE_FIEMAP_H -#ifndef __KERNEL__ #include -#endif #include /* XXX: We use fiemap_extent::fe_reserved[0] */ diff --git a/drivers/staging/lustrefsx/lustre/include/lvfs.h b/drivers/staging/lustrefsx/lustre/include/lvfs.h index 856ee1972aa06..f24aff819f668 100644 --- a/drivers/staging/lustrefsx/lustre/include/lvfs.h +++ b/drivers/staging/lustrefsx/lustre/include/lvfs.h @@ -52,7 +52,6 @@ struct dt_device; struct lvfs_run_ctxt { struct vfsmount *pwdmnt; struct dentry *pwd; - mm_segment_t fs; int umask; struct dt_device *dt; #ifdef OBD_CTXT_DEBUG diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h index f8abcb83f0301..c22e08fe8cdb2 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_support.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -762,7 +762,7 @@ do { \ #define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ do { \ (ptr) = cptab == NULL ? 
\ - __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO): \ cfs_cpt_vzalloc(cptab, cpt, size); \ if (unlikely((ptr) == NULL)) { \ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ @@ -823,7 +823,7 @@ do { \ do { \ if (is_vmalloc_addr(ptr)) { \ OBD_FREE_PRE(ptr, size, "vfreed"); \ - vfree(ptr); \ + libcfs_vfree_atomic(ptr); \ POISON_PTR(ptr); \ } else { \ OBD_FREE(ptr, size); \ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 6b8734adeb851..042633867837b 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -147,12 +147,13 @@ static int seq_watermark_open(struct inode *inode, struct file *file) return single_open(file, seq_watermark_show, PDE_DATA(inode)); } -static const struct proc_ops ldlm_watermark_fops = { - .proc_open = seq_watermark_open, - .proc_read = seq_read, - .proc_write = seq_watermark_write, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_single_release, +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + .release = lprocfs_single_release, }; static int seq_granted_show(struct seq_file *m, void *data) @@ -167,11 +168,12 @@ static int seq_granted_open(struct inode *inode, struct file *file) return single_open(file, seq_granted_show, PDE_DATA(inode)); } -static const struct proc_ops ldlm_granted_fops = { - .proc_open = seq_granted_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, +static const struct file_operations ldlm_granted_fops = { + .owner = THIS_MODULE, + .open = seq_granted_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 7807f45396c94..2c6c54f47af61 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -54,21 +54,22 @@ void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, size_t count) { - struct vm_area_struct *vma, *ret = NULL; - ENTRY; + struct vm_area_struct *vma, *ret = NULL; + ENTRY; - /* mmap_lock must have been held by caller. */ - LASSERT(!down_write_trylock(&mm->mmap_lock)); + /* mmap_lock must have been held by caller. 
*/ + LASSERT(!mmap_write_trylock(mm)); - for(vma = find_vma(mm, addr); - vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { - if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && - vma->vm_flags & VM_SHARED) { - ret = vma; - break; - } - } - RETURN(ret); + for (vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); + vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + RETURN(ret); } /** diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c index 528f2892e3b40..9cba2d0b5e8e3 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw26.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -453,11 +453,11 @@ static inline int ll_get_user_pages(int rw, unsigned long user_addr, OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages)); if (*pages) { - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); result = get_user_pages(current, current->mm, user_addr, *max_pages, (rw == READ), 0, *pages, NULL); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (unlikely(result <= 0)) OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages)); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c index 15eb72a35245c..2f640635afea2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -647,6 +647,7 @@ static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) } const struct proc_ops vvp_dump_pgcache_file_ops = { + PROC_OWNER(THIS_MODULE) .proc_open = vvp_dump_pgcache_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index a71b7b60d90f5..a1280f9bff131 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -452,17 +452,17 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) addr = (unsigned long)iov.iov_base; count = iov.iov_len; - if (count == 0) - continue; + if (count == 0) + continue; count += addr & ~PAGE_MASK; addr &= PAGE_MASK; - down_read(&mm->mmap_lock); - while((vma = our_vma(mm, addr, count)) != NULL) { + mmap_read_lock(mm); + while ((vma = our_vma(mm, addr, count)) != NULL) { struct dentry *de = file_dentry(vma->vm_file); struct inode *inode = de->d_inode; - int flags = CEF_MUST; + int flags = CEF_MUST; if (ll_file_nolock(vma->vm_file)) { /* @@ -472,24 +472,24 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) break; } - /* - * XXX: Required lock mode can be weakened: CIT_WRITE - * io only ever reads user level buffer, and CIT_READ - * only writes on it. - */ - policy_from_vma(&policy, vma, addr, count); - descr->cld_mode = vvp_mode_from_vma(vma); - descr->cld_obj = ll_i2info(inode)->lli_clob; - descr->cld_start = cl_index(descr->cld_obj, - policy.l_extent.start); - descr->cld_end = cl_index(descr->cld_obj, - policy.l_extent.end); - descr->cld_enq_flags = flags; - result = cl_io_lock_alloc_add(env, io, descr); - - CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", - descr->cld_mode, descr->cld_start, - descr->cld_end); + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. 
+ */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); if (result < 0) break; @@ -500,7 +500,7 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) count -= vma->vm_end - addr; addr = vma->vm_end; } - up_read(&mm->mmap_lock); + mmap_read_unlock(mm); if (result < 0) break; } diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index e76e0130d6669..78c774ef738c4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -54,17 +54,6 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } #endif -#ifdef HAVE_LINUX_SELINUX_IS_ENABLED -# define test_xattr_is_selinux_disabled(handler, name) \ - ((handler)->flags == XATTR_SECURITY_T && \ - !selinux_is_enabled() && \ - strcmp((name), "selinux") == 0) -#else -# define test_xattr_is_selinux_disabled(handler, name) \ - ((handler)->flags == XATTR_SECURITY_T && \ - strcmp((name), "selinux") == 0) -#endif - const struct xattr_handler *get_xattr_type(const char *name) { int i; @@ -149,7 +138,8 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(0); /* LU-549: Disable security.selinux when selinux is disabled */ - if (test_xattr_is_selinux_disabled(handler, name)) + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "selinux") == 0) RETURN(-EOPNOTSUPP); /* @@ -443,7 +433,8 @@ static int ll_xattr_get_common(const struct xattr_handler *handler, RETURN(rc); /* LU-549: Disable security.selinux when selinux is disabled */ - if (test_xattr_is_selinux_disabled(handler, name)) + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + !strcmp(name, "selinux")) RETURN(-EOPNOTSUPP); #ifdef CONFIG_FS_POSIX_ACL diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h index 2a0c324856fe7..8ef0631f3301a 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -156,6 +156,6 @@ struct lmv_tgt_desc #ifdef CONFIG_PROC_FS extern struct lprocfs_vars lprocfs_lmv_obd_vars[]; #endif -extern struct proc_ops lmv_proc_target_fops; +extern const struct proc_ops lmv_proc_target_fops; #endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c index e79781d444fb1..37c22a92de716 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -162,10 +162,11 @@ struct lprocfs_vars lprocfs_lmv_obd_vars[] = { { NULL } }; -struct proc_ops lmv_proc_target_fops = { - .proc_open = lmv_target_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, +const struct proc_ops lmv_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lmv_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h 
index 7ff0ffe81026e..524b0a4eac681 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -298,7 +298,7 @@ void lsm_free_plain(struct lov_stripe_md *lsm); void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); /* lproc_lov.c */ -extern struct proc_ops lov_proc_target_fops; +extern const struct proc_ops lov_proc_target_fops; #ifdef CONFIG_PROC_FS extern struct lprocfs_vars lprocfs_lov_obd_vars[]; #endif diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c index 066b57df09482..02b8899cb1b68 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -41,6 +41,7 @@ #define DEBUG_SUBSYSTEM S_LOV #include +#include #include #include "lov_internal.h" @@ -287,10 +288,10 @@ static int pool_proc_open(struct inode *inode, struct file *file) } static struct proc_ops pool_proc_operations = { - .proc_open = pool_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, + .proc_open = pool_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c index e8b9b88302055..41215c11998ef 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -322,10 +322,11 @@ struct lprocfs_vars lprocfs_lov_obd_vars[] = { { NULL } }; -struct proc_ops lov_proc_target_fops = { - .proc_open = lov_target_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_seq_release, +const struct proc_ops lov_proc_target_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = lov_target_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c index a78e4e7f6b316..00395af273593 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -565,11 +565,12 @@ static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file) } static const struct proc_ops lprocfs_jobstats_seq_fops = { - .proc_open = lprocfs_jobstats_seq_open, - .proc_read = seq_read, - .proc_write = lprocfs_jobstats_seq_write, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_jobstats_seq_release, + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_jobstats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_jobstats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_jobstats_seq_release, }; int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index 0fcf859bdbb05..f3d2efc8403ba 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -97,20 +97,29 @@ int lprocfs_seq_release(struct inode *inode, struct file *file) } EXPORT_SYMBOL(lprocfs_seq_release); +static umode_t default_mode(const struct proc_ops *ops) +{ + umode_t mode = 0; + + if (ops->proc_read) + mode = 0444; + if (ops->proc_write) + mode |= 0200; + + return mode; +} + struct 
proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, void *data, const struct proc_ops *fops) { struct proc_dir_entry *proc; - mode_t mode = 0; + umode_t mode; if (root == NULL || name == NULL || fops == NULL) return ERR_PTR(-EINVAL); - if (fops->proc_read) - mode = 0444; - if (fops->proc_write) - mode |= 0200; + mode = default_mode(fops); proc = proc_create_data(name, mode, root, fops, data); if (!proc) { CERROR("LprocFS: No memory to create /proc entry %s\n", @@ -202,16 +211,12 @@ lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, while (list->name != NULL) { struct proc_dir_entry *proc; - mode_t mode = 0; + umode_t mode = 0; - if (list->proc_mode != 0000) { + if (list->proc_mode) mode = list->proc_mode; - } else if (list->fops) { - if (list->fops->proc_read) - mode = 0444; - if (list->fops->proc_write) - mode |= 0200; - } + else if (list->fops) + mode = default_mode(list->fops); proc = proc_create_data(list->name, mode, root, list->fops ?: &lprocfs_empty_ops, list->data ?: data); @@ -1537,20 +1542,21 @@ static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) } static const struct proc_ops lprocfs_stats_seq_fops = { - .proc_open = lprocfs_stats_seq_open, - .proc_read = seq_read, - .proc_write = lprocfs_stats_seq_write, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_seq_release, + PROC_OWNER(THIS_MODULE) + .proc_open = lprocfs_stats_seq_open, + .proc_read = seq_read, + .proc_write = lprocfs_stats_seq_write, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, }; static const struct file_operations ldebugfs_stats_seq_fops = { - .owner = THIS_MODULE, - .open = lprocfs_stats_seq_open, - .read = seq_read, - .write = lprocfs_stats_seq_write, - .llseek = seq_lseek, - .release = lprocfs_seq_release, + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, }; int ldebugfs_register_stats(struct dentry *parent, const char *name, @@ -2265,7 +2271,7 @@ int lprocfs_seq_create(struct proc_dir_entry *parent, ENTRY; /* Disallow secretly (un)writable entries. 
*/ - LASSERT((seq_fops->proc_write == NULL) == ((mode & 0222) == 0)); + LASSERT(!seq_fops->proc_write == !(mode & 0222)); entry = proc_create_data(name, mode, parent, seq_fops, data); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c index 80d644f1092e0..bef29033f30ee 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -402,11 +402,12 @@ static int lu_ref_seq_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct proc_ops lu_ref_dump_fops = { - .proc_open = lu_ref_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = lu_ref_seq_release +static struct file_operations lu_ref_dump_fops = { + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release }; #endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index dbd4fdcbd996c..8068de9ebea64 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -1341,8 +1341,9 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, if (var->fops && var->fops->proc_write) { rc = (var->fops->proc_write)(&fakefile, - sval, vallen, - NULL); + sval, + vallen, + NULL); } break; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index 5b5412e506317..933183a83dbb3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -1130,10 +1130,11 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, { NULL } }; static struct proc_ops req_history_fops = { - .proc_open = ptlrpc_lprocfs_svc_req_history_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_seq_release, + PROC_OWNER(THIS_MODULE) + .proc_open = ptlrpc_lprocfs_svc_req_history_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, }; int rc; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c index a703b0e5b4562..766b21d10c20c 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -37,12 +37,8 @@ /* Debugging check only needed during development */ #ifdef OBD_CTXT_DEBUG # define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) -# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!uaccess_kernel(), msg) -# define ASSERT_KERNEL_CTXT(msg) LASSERTF(uaccess_kernel(), msg) #else # define ASSERT_CTXT_MAGIC(magic) do {} while(0) -# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) -# define ASSERT_KERNEL_CTXT(msg) do {} while(0) #endif /* push / pop to root of obd store */ @@ -52,11 +48,9 @@ void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx) if (new_ctx->dt != NULL) return; - //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n"); ASSERT_CTXT_MAGIC(new_ctx->magic); OBD_SET_CTXT_MAGIC(save); - save->fs = get_fs(); LASSERT(ll_d_count(current->fs->pwd.dentry)); LASSERT(ll_d_count(new_ctx->pwd)); save->pwd = dget(current->fs->pwd.dentry); @@ -69,7 +63,6 @@ void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx) LASSERT(new_ctx->pwdmnt); current->fs->umask = 
0; /* umask already applied on client */ - set_fs(new_ctx->fs); ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd); } EXPORT_SYMBOL(push_ctxt); @@ -81,14 +74,12 @@ void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx) return; ASSERT_CTXT_MAGIC(saved->magic); - ASSERT_KERNEL_CTXT("popping non-kernel context!\n"); LASSERTF(current->fs->pwd.dentry == new_ctx->pwd, "%p != %p\n", current->fs->pwd.dentry, new_ctx->pwd); LASSERTF(current->fs->pwd.mnt == new_ctx->pwdmnt, "%p != %p\n", current->fs->pwd.mnt, new_ctx->pwdmnt); - set_fs(saved->fs); ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd); dput(saved->pwd); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index 64189a1ac2606..b1ea346eb8f4a 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -290,6 +290,9 @@ /* filldir_t needs struct dir_context as argument */ #undef HAVE_FILLDIR_USE_CTX +/* FMR pool API is available */ +#undef HAVE_FMR_POOL_API + /* fpu/api.h is present */ #undef HAVE_FPU_API_HEADER @@ -464,9 +467,15 @@ /* i_uid_read is present */ #undef HAVE_I_UID_READ +/* jiffies_to_timespec64() is available */ +#undef HAVE_JIFFIES_TO_TIMESPEC64 + /* kernel_locked is defined */ #undef HAVE_KERNEL_LOCKED +/* kernel_setsockopt still in use */ +#undef HAVE_KERNEL_SETSOCKOPT + /* 'struct sock' accept function requires bool argument */ #undef HAVE_KERN_SOCK_ACCEPT_FLAG_ARG @@ -616,6 +625,9 @@ /* kernel has include/linux/migrate_mode.h */ #undef HAVE_MIGRATE_MODE_H +/* mmap_lock API is available. */ +#undef HAVE_MMAP_LOCK + /* kernel module loading is possible */ #undef HAVE_MODULE_LOADING_SUPPORT @@ -631,6 +643,9 @@ /* 'kernel_write' aligns with read/write helpers */ #undef HAVE_NEW_KERNEL_WRITE +/* NR_UNSTABLE_NFS is still in use. */ +#undef HAVE_NR_UNSTABLE_NFS + /* with oldsize */ #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE @@ -658,6 +673,9 @@ /* posix_acl_valid takes struct user_namespace */ #undef HAVE_POSIX_ACL_VALID_USER_NS +/* struct proc_ops exists */ +#undef HAVE_PROC_OPS + /* proc_remove is defined */ #undef HAVE_PROC_REMOVE @@ -679,6 +697,9 @@ /* rdma_create_id wants 5 args */ #undef HAVE_RDMA_CREATE_ID_5ARG +/* rdma_reject has 4 arguments */ +#undef HAVE_RDMA_REJECT_4ARGS + /* reinit_completion is exist */ #undef HAVE_REINIT_COMPLETION @@ -746,12 +767,6 @@ /* kernel has sk_sleep */ #undef HAVE_SK_SLEEP -/* sock_alloc_file is exported */ -#undef HAVE_SOCK_ALLOC_FILE - -/* sock_alloc_file takes 3 arguments */ -#undef HAVE_SOCK_ALLOC_FILE_3ARGS - /* sock_create_kern use net as first parameter */ #undef HAVE_SOCK_CREATE_KERN_USE_NET @@ -842,6 +857,9 @@ /* 'timespec64_to_ktime' is available */ #undef HAVE_TIMESPEC64_TO_KTIME +/* have_time_t */ +#undef HAVE_TIME_T + /* topology_sibling_cpumask is available */ #undef HAVE_TOPOLOGY_SIBLING_CPUMASK @@ -875,6 +893,9 @@ /* kernel has vfs_unlink with 3 args */ #undef HAVE_VFS_UNLINK_3ARGS +/* __vmalloc only takes 2 args. 
*/ +#undef HAVE_VMALLOC_2ARGS + /* virtual_address has been replaced by address field */ #undef HAVE_VM_FAULT_ADDRESS @@ -993,6 +1014,9 @@ /* name of parallel fsck program */ #undef PFSCK +/* proc handler methods use __user */ +#undef PROC_HANDLER_USE_USER_ATTR + /* enable randomly alloc failure */ #undef RANDOM_FAIL_ALLOC From 90ffedb51fdba8ac4617f566ce41507fd6e7f014 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 12 May 2021 13:26:18 +1000 Subject: [PATCH 108/737] x86: Disable KASLR when Xen is detected There's currently an issue with Xen and KASLR causing hibernation to break (and possibly kexec/kdump too). Until we have got to the bottom of this and fixed the root cause, let's disable KASLR at runtime when running on Xen instances so we can enable it for Nitro. This also adds a boot message to match ARM and help detect whether this test worked as expected. Signed-off-by: Benjamin Herrenschmidt --- arch/x86/boot/compressed/kaslr.c | 8 ++++++++ arch/x86/kernel/setup.c | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b92fffbe761fd..8c27630a774d0 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0 /* * kaslr.c @@ -32,6 +33,9 @@ #include #include +/* xen_cpuid_base/hypervisor_cpuid_base inlines */ +#include + /* Macros used by the included decompressor code below. */ #define STATIC #include @@ -839,6 +843,10 @@ void choose_random_location(unsigned long input, warn("KASLR disabled: 'nokaslr' on cmdline."); return; } + if (xen_cpuid_base() != 0) { + warn("KASLR disabled: Xen hypervisor detected."); + return; + } boot_params->hdr.loadflags |= KASLR_FLAG; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 065152d9265e4..784cb75bb998d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include /* @@ -839,6 +840,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_RANDOMIZE_BASE + printk(KERN_INFO "KASLR %s\n", kaslr_enabled() ? "enabled" : "disabled"); +#endif /* * If we have OLPC OFW, we might end up relocating the fixmap due to From 1e24cacfe6da85d05b03189738559ced6927601e Mon Sep 17 00:00:00 2001 From: Ethan Chen Date: Mon, 7 Jun 2021 18:20:46 +0000 Subject: [PATCH 109/737] ena: Update to 2.5.0 Sourced from upstream git repo: https://github.com/amzn/amzn-drivers/ Change Log from Upstream: 2.4.1 -> 2.5.0 **New Features** * Unify skb allocation path and use build_skb() * Support ethtool priv-flags and LPC state change **Bug Fixes** * Fix mapping function issues in XDP * Fix XDP redirection related failures * Fix page_ref_count() related checks to support older kernels correctly. * Don't define UBUNTU_VERSION_CODE when not in Ubuntu. * Drop unnecessary "#ifdef " checks from kcompat.h. * Bug fixes and code improvements in legacy poll code **Minor Changes** * Add debug prints to failed commands * Minor performance improvements * Replace pci_set_dma_mask/_coherent with dma_set_mask_and_coherent * Change ena_increase_stat_atomic() function name * Change variable casting in ena_com.c * Add explicit include of ethtool.h to linux/ethtool.c * Change LLQ fallback print from error to warning * Remove unused ENA_DEFAULT_MIN_RX_BUFF_ALLOC_SIZE define * Remove unused SUSPEND/RESUME defines * Add mac OS defines. 
* Use WRITE/READ_ONCE macros for first_interrupt variable * Propagate upstream support for AF XDP busypoll * Add Jiffies of last napi call to stats * Add ena_ring_tx_doorbell() function * Cosmetic changes to LPC * Add is_lpc_page indication to help with page mapping * Back-propagate xdp helpers from upstream kernel * Fix RST format in README file Signed-off-by: Ethan Chen --- drivers/amazon/net/ena/ena_admin_defs.h | 5 +- drivers/amazon/net/ena/ena_com.c | 19 +- drivers/amazon/net/ena/ena_com.h | 2 +- drivers/amazon/net/ena/ena_eth_com.c | 30 +- drivers/amazon/net/ena/ena_ethtool.c | 56 ++- drivers/amazon/net/ena/ena_netdev.c | 617 ++++++++++++++---------- drivers/amazon/net/ena/ena_netdev.h | 70 +-- drivers/amazon/net/ena/kcompat.h | 108 +++-- 8 files changed, 542 insertions(+), 365 deletions(-) mode change 100755 => 100644 drivers/amazon/net/ena/ena_admin_defs.h mode change 100755 => 100644 drivers/amazon/net/ena/ena_ethtool.c mode change 100755 => 100644 drivers/amazon/net/ena/ena_netdev.c mode change 100755 => 100644 drivers/amazon/net/ena/ena_netdev.h mode change 100755 => 100644 drivers/amazon/net/ena/kcompat.h diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h old mode 100755 new mode 100644 index c89c501895e46..090198b02b888 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -825,7 +825,8 @@ enum ena_admin_os_type { ENA_ADMIN_OS_FREEBSD = 4, ENA_ADMIN_OS_IPXE = 5, ENA_ADMIN_OS_ESXI = 6, - ENA_ADMIN_OS_GROUPS_NUM = 6, + ENA_ADMIN_OS_MACOS = 7, + ENA_ADMIN_OS_GROUPS_NUM = 7, }; struct ena_admin_host_info { @@ -1064,8 +1065,6 @@ enum ena_admin_aenq_group { }; enum ena_admin_aenq_notification_syndrome { - ENA_ADMIN_SUSPEND = 0, - ENA_ADMIN_RESUME = 1, ENA_ADMIN_UPDATE_HINTS = 2, }; diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 7a87dfb2a2a56..5ce5d49800896 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -1362,16 +1362,15 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, comp, comp_size); if (IS_ERR(comp_ctx)) { - if (comp_ctx == ERR_PTR(-ENODEV)) + ret = PTR_ERR(comp_ctx); + if (ret == -ENODEV) netdev_dbg(admin_queue->ena_dev->net_device, - "Failed to submit command [%ld]\n", - PTR_ERR(comp_ctx)); + "Failed to submit command [%d]\n", ret); else netdev_err(admin_queue->ena_dev->net_device, - "Failed to submit command [%ld]\n", - PTR_ERR(comp_ctx)); + "Failed to submit command [%d]\n", ret); - return (int)PTR_ERR(comp_ctx); + return ret; } ret = ena_com_wait_and_process_admin_cq(comp_ctx, admin_queue); @@ -2258,7 +2257,7 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, return ret; } -int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) { struct ena_com_admin_queue *admin_queue; struct ena_admin_set_feat_cmd cmd; @@ -2277,7 +2276,7 @@ int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu) cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; cmd.aq_common_descriptor.flags = 0; cmd.feat_common.feature_id = ENA_ADMIN_MTU; - cmd.u.mtu.mtu = (u32)mtu; + cmd.u.mtu.mtu = mtu; ret = ena_com_execute_admin_command(admin_queue, (struct ena_admin_aq_entry *)&cmd, @@ -2700,7 +2699,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev) return ret; } - cmd.control_buffer.length = (u32)(1ULL << rss->tbl_log_size) * + cmd.control_buffer.length 
= (1ULL << rss->tbl_log_size) * sizeof(struct ena_admin_rss_ind_table_entry); ret = ena_com_execute_admin_command(admin_queue, @@ -2723,7 +2722,7 @@ int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) u32 tbl_size; int i, rc; - tbl_size = (u32)(1ULL << rss->tbl_log_size) * + tbl_size = (1ULL << rss->tbl_log_size) * sizeof(struct ena_admin_rss_ind_table_entry); rc = ena_com_get_feature_ex(ena_dev, &get_resp, diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 336c535f4ec94..6ac41ca6f956b 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -621,7 +621,7 @@ int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, * * @return: 0 on Success and negative value otherwise. */ -int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, int mtu); +int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu); /* ena_com_get_offload_settings - Retrieve the device offloads capabilities * @ena_dev: ENA communication layer struct diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index c3be751e7379f..3d6f0a466a9ed 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -151,11 +151,14 @@ static int ena_com_close_bounce_buffer(struct ena_com_io_sq *io_sq) return 0; /* bounce buffer was used, so write it and get a new one */ - if (pkt_ctrl->idx) { + if (likely(pkt_ctrl->idx)) { rc = ena_com_write_bounce_buffer_to_dev(io_sq, pkt_ctrl->curr_bounce_buf); - if (unlikely(rc)) + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); return rc; + } pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); @@ -185,8 +188,11 @@ static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) if (!pkt_ctrl->descs_left_in_line) { rc = ena_com_write_bounce_buffer_to_dev(io_sq, pkt_ctrl->curr_bounce_buf); - if (unlikely(rc)) + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write bounce buffer to device\n"); return rc; + } pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); @@ -406,8 +412,11 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, } if (unlikely(io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV && - !buffer_to_push)) + !buffer_to_push)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Push header wasn't provided in LLQ mode\n"); return -EINVAL; + } rc = ena_com_write_header_to_bounce(io_sq, buffer_to_push, header_len); if (unlikely(rc)) @@ -423,6 +432,9 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* If the caller doesn't want to send packets */ if (unlikely(!num_bufs && !header_len)) { rc = ena_com_close_bounce_buffer(io_sq); + if (rc) + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to write buffers to LLQ\n"); *nb_hw_desc = io_sq->tail - start_tail; return rc; } @@ -482,8 +494,11 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, /* The first desc share the same desc as the header */ if (likely(i != 0)) { rc = ena_com_sq_update_tail(io_sq); - if (unlikely(rc)) + if (unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail\n"); return rc; + } desc = get_sq_desc(io_sq); if (unlikely(!desc)) @@ -512,8 +527,11 @@ int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, desc->len_ctrl |= ENA_ETH_IO_TX_DESC_LAST_MASK; rc = ena_com_sq_update_tail(io_sq); - if (unlikely(rc)) + if 
(unlikely(rc)) { + netdev_err(ena_com_io_sq_to_ena_dev(io_sq)->net_device, + "Failed to update sq tail of the last descriptor\n"); return rc; + } rc = ena_com_close_bounce_buffer(io_sq); diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c old mode 100755 new mode 100644 index 82334c247016c..031274399b022 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -3,6 +3,7 @@ * Copyright 2015-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include #include #include "ena_netdev.h" @@ -86,7 +87,7 @@ static const struct ena_stats ena_stats_rx_strings[] = { ENA_STAT_RX_ENTRY(skb_alloc_fail), ENA_STAT_RX_ENTRY(dma_mapping_err), ENA_STAT_RX_ENTRY(bad_desc_num), -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT ENA_STAT_RX_ENTRY(bp_yield), ENA_STAT_RX_ENTRY(bp_missed), ENA_STAT_RX_ENTRY(bp_cleaned), @@ -122,6 +123,13 @@ static const struct ena_stats ena_stats_ena_com_strings[] = { #define ENA_STATS_ARRAY_ENI(adapter) \ (ARRAY_SIZE(ena_stats_eni_strings) * (adapter)->eni_stats_supported) +static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { +#define ENA_PRIV_FLAGS_LPC BIT(0) + "local_page_cache", +}; + +#define ENA_PRIV_FLAGS_NR ARRAY_SIZE(ena_priv_flags_strings) + static void ena_safe_update_stat(u64 *src, u64 *dst, struct u64_stats_sync *syncp) { @@ -242,10 +250,15 @@ int ena_get_sset_count(struct net_device *netdev, int sset) { struct ena_adapter *adapter = netdev_priv(netdev); - if (sset != ETH_SS_STATS) - return -EOPNOTSUPP; + switch (sset) { + case ETH_SS_STATS: + return ena_get_sw_stats_count(adapter) + + ena_get_hw_stats_count(adapter); + case ETH_SS_PRIV_FLAGS: + return ENA_PRIV_FLAGS_NR; + } - return ena_get_sw_stats_count(adapter) + ena_get_hw_stats_count(adapter); + return -EOPNOTSUPP; } static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) @@ -326,10 +339,14 @@ static void ena_get_ethtool_strings(struct net_device *netdev, { struct ena_adapter *adapter = netdev_priv(netdev); - if (sset != ETH_SS_STATS) - return; - - ena_get_strings(adapter, data, adapter->eni_stats_supported); + switch (sset) { + case ETH_SS_STATS: + ena_get_strings(adapter, data, adapter->eni_stats_supported); + break; + case ETH_SS_PRIV_FLAGS: + memcpy(data, ena_priv_flags_strings, sizeof(ena_priv_flags_strings)); + break; + } } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) @@ -500,6 +517,8 @@ static void ena_get_drvinfo(struct net_device *dev, strlcpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); strlcpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); + + info->n_priv_flags = ENA_PRIV_FLAGS_NR; } static void ena_get_ringparam(struct net_device *netdev, @@ -1017,6 +1036,25 @@ static int ena_set_tunable(struct net_device *netdev, } #endif /* 3.18.0 */ +static u32 ena_get_priv_flags(struct net_device *netdev) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + u32 priv_flags = 0; + + if (adapter->rx_ring->page_cache) + priv_flags |= ENA_PRIV_FLAGS_LPC; + + return priv_flags; +} + +static int ena_set_priv_flags(struct net_device *netdev, u32 priv_flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + /* LPC is the only supported private flag for now */ + return ena_set_lpc_state(adapter, !!(priv_flags & ENA_PRIV_FLAGS_LPC)); +} + static const struct ethtool_ops ena_ethtool_ops = { #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) .supported_coalesce_params = ETHTOOL_COALESCE_USECS | @@ -1066,6 +1104,8 @@ static const struct ethtool_ops 
ena_ethtool_ops = { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) .get_ts_info = ethtool_op_get_ts_info, #endif + .get_priv_flags = ena_get_priv_flags, + .set_priv_flags = ena_set_priv_flags, }; void ena_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c old mode 100755 new mode 100644 index 590762a05b7b5..4cccbba579dd4 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -45,6 +45,10 @@ MODULE_VERSION(DRV_MODULE_GENERATION); #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ NETIF_MSG_TX_DONE | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) +#ifndef ENA_LINEAR_FRAG_SUPPORTED + +#define ENA_SKB_PULL_MIN_LEN 64 +#endif static int debug = -1; module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); @@ -109,13 +113,19 @@ static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, int first_index, int count); #endif /* ENA_XDP_SUPPORT */ -/* Increase a stat by cnt while holding syncp seqlock */ -static void ena_increase_stat_atomic(u64 *statp, u64 cnt, - struct u64_stats_sync *syncp) +/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ +static void ena_increase_stat(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + +static void ena_ring_tx_doorbell(struct ena_ring *tx_ring) { - u64_stats_update_begin(syncp); - (*statp) += cnt; - u64_stats_update_end(syncp); + ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); + ena_increase_stat(&tx_ring->tx_stats.doorbells, 1, &tx_ring->syncp); } #ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER @@ -134,8 +144,7 @@ static void ena_tx_timeout(struct net_device *dev) return; adapter->reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; - ena_increase_stat_atomic(&adapter->dev_stats.tx_timeout, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); netif_err(adapter, tx_err, dev, "Transmit time out\n"); } @@ -189,7 +198,7 @@ static int ena_xmit_common(struct net_device *dev, netif_dbg(adapter, tx_queued, dev, "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", ring->qid); - ena_com_write_sq_doorbell(ring->ena_com_io_sq); + ena_ring_tx_doorbell(ring); } /* prepare the packet's descriptors to dma engine */ @@ -203,8 +212,8 @@ static int ena_xmit_common(struct net_device *dev, if (unlikely(rc)) { netif_err(adapter, tx_queued, dev, "Failed to prepare tx bufs\n"); - ena_increase_stat_atomic(&ring->tx_stats.prepare_ctx_err, 1, - &ring->syncp); + ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, + &ring->syncp); if (rc != -ENOMEM) { adapter->reset_reason = ENA_REGS_RESET_DRIVER_INVALID_STATE; @@ -240,7 +249,6 @@ static int ena_xdp_io_poll(struct napi_struct *napi, int budget) int ret; xdp_ring = ena_napi->xdp_ring; - xdp_ring->first_interrupt = ena_napi->first_interrupt; xdp_budget = budget; @@ -272,6 +280,7 @@ static int ena_xdp_io_poll(struct napi_struct *napi, int budget) xdp_ring->tx_stats.napi_comp += napi_comp_call; xdp_ring->tx_stats.tx_poll++; u64_stats_update_end(&xdp_ring->syncp); + xdp_ring->tx_stats.last_napi_jiffies = jiffies; return ret; } @@ -284,43 +293,48 @@ static int ena_xdp_tx_map_frame(struct ena_ring *xdp_ring, { struct ena_adapter *adapter = xdp_ring->adapter; struct ena_com_buf *ena_buf; - dma_addr_t dma = 0; + dma_addr_t dma; + void *data; u32 size; tx_info->xdpf = xdpf; + 
data = tx_info->xdpf->data; size = tx_info->xdpf->len; - ena_buf = tx_info->bufs; - /* llq push buffer */ - *push_len = min_t(u32, size, xdp_ring->tx_max_header_size); - *push_hdr = tx_info->xdpf->data; + *push_len = 0; + + if (xdp_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* LLQ push buffer */ + *push_len = min_t(u32, size, xdp_ring->tx_max_header_size); + *push_hdr = data; + + size -= *push_len; + } else { + *push_hdr = NULL; + } - if (size - *push_len > 0) { + if (size > 0) { dma = dma_map_single(xdp_ring->dev, - *push_hdr + *push_len, - size - *push_len, + data + *push_len, + size, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(xdp_ring->dev, dma))) goto error_report_dma_error; - tx_info->map_linear_data = 1; + tx_info->map_linear_data = 0; tx_info->num_of_bufs = 1; + ena_buf = tx_info->bufs; + ena_buf->paddr = dma; + ena_buf->len = size; } - ena_buf->paddr = dma; - ena_buf->len = size; - return 0; error_report_dma_error: - ena_increase_stat_atomic(&xdp_ring->tx_stats.dma_mapping_err, 1, - &xdp_ring->syncp); + ena_increase_stat(&xdp_ring->tx_stats.dma_mapping_err, 1, + &xdp_ring->syncp); netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); - xdp_return_frame_rx_napi(tx_info->xdpf); - tx_info->xdpf = NULL; - tx_info->num_of_bufs = 0; - return -EINVAL; } @@ -359,14 +373,11 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, xdpf->len); if (rc) goto error_unmap_dma; - /* trigger the dma engine. ena_com_write_sq_doorbell() - * has a mb + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. */ - if (flags & XDP_XMIT_FLUSH) { - ena_com_write_sq_doorbell(xdp_ring->ena_com_io_sq); - ena_increase_stat_atomic(&xdp_ring->tx_stats.doorbells, 1, - &xdp_ring->syncp); - } + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(xdp_ring); return rc; @@ -412,11 +423,8 @@ static int ena_xdp_xmit(struct net_device *dev, int n, } /* Ring doorbell to make device aware of the packets */ - if (flags & XDP_XMIT_FLUSH) { - ena_com_write_sq_doorbell(xdp_ring->ena_com_io_sq); - ena_increase_stat_atomic(&xdp_ring->tx_stats.doorbells, 1, - &xdp_ring->syncp); - } + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(xdp_ring); spin_unlock(&xdp_ring->xdp_tx_lock); @@ -428,9 +436,8 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) { struct bpf_prog *xdp_prog; struct ena_ring *xdp_ring; - struct xdp_frame *xdpf; - int qid; u32 verdict = XDP_PASS; + struct xdp_frame *xdpf; u64 *xdp_stat; rcu_read_lock(); @@ -448,9 +455,15 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) #else xdpf = convert_to_xdp_frame(xdp); #endif + if (unlikely(!xdpf)) { + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = XDP_ABORTED; + break; + } + /* Find xmit queue */ - qid = rx_ring->qid + rx_ring->adapter->num_io_queues; - xdp_ring = &rx_ring->adapter->tx_ring[qid]; + xdp_ring = rx_ring->xdp_ring; /* The XDP queues are shared between XDP_TX and XDP_REDIRECT */ spin_lock(&xdp_ring->xdp_tx_lock); @@ -461,8 +474,13 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) xdp_stat = &rx_ring->rx_stats.xdp_tx; break; case XDP_REDIRECT: - xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); - xdp_stat = &rx_ring->rx_stats.xdp_redirect; + if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) { + xdp_stat = &rx_ring->rx_stats.xdp_redirect; + break; + } + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = 
&rx_ring->rx_stats.xdp_aborted; + verdict = XDP_ABORTED; break; case XDP_ABORTED: trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); @@ -479,7 +497,7 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) xdp_stat = &rx_ring->rx_stats.xdp_invalid; } - ena_increase_stat_atomic(xdp_stat, 1, &rx_ring->syncp); + ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); out: rcu_read_unlock(); @@ -526,7 +544,11 @@ static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) { int rc; +#ifdef AF_XDP_BUSY_POLL_SUPPORTED + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0); +#else rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); +#endif if (rc) { netif_err(rx_ring->adapter, ifup, rx_ring->netdev, @@ -570,7 +592,7 @@ static void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, rx_ring->rx_headroom = XDP_PACKET_HEADROOM; } else { ena_xdp_unregister_rxq_info(rx_ring); - rx_ring->rx_headroom = 0; + rx_ring->rx_headroom = NET_SKB_PAD; } } } @@ -734,7 +756,6 @@ static void ena_init_io_rings_common(struct ena_adapter *adapter, ring->ena_dev = adapter->ena_dev; ring->per_napi_packets = 0; ring->cpu = 0; - ring->first_interrupt = false; ring->no_interrupt_event_cnt = 0; u64_stats_init(&ring->syncp); } @@ -779,7 +800,11 @@ static void ena_init_io_rings(struct ena_adapter *adapter, rxr->smoothed_interval = ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); rxr->empty_rx_queue = 0; + rxr->rx_headroom = NET_SKB_PAD; adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; +#ifdef ENA_XDP_SUPPORT + rxr->xdp_ring = &adapter->tx_ring[i + adapter->num_io_queues]; +#endif } } } @@ -967,7 +992,7 @@ static int ena_setup_rx_resources(struct ena_adapter *adapter, /* Reset rx statistics */ memset(&rx_ring->rx_stats, 0x0, sizeof(rx_ring->rx_stats)); -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT ena_bp_init_lock(rx_ring); #endif rx_ring->next_to_clean = 0; @@ -1098,7 +1123,7 @@ static struct page *ena_return_cache_page(struct ena_ring *rx_ring, { /* Remove pages belonging to different node than current_nid from cache */ if (unlikely(page_to_nid(ena_page->page) != current_nid)) { - ena_increase_stat_atomic(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp); ena_replace_cache_page(rx_ring, ena_page); } @@ -1117,15 +1142,20 @@ static struct page *ena_return_cache_page(struct ena_ring *rx_ring, return ena_page->page; } -static struct page *ena_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, int current_nid) +static struct page *ena_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, + int current_nid, bool *is_lpc_page) { struct ena_page_cache *page_cache = rx_ring->page_cache; u32 head, cache_current_size; struct ena_page *ena_page; /* Cache size of zero indicates disabled cache */ - if (!page_cache) + if (!page_cache) { + *is_lpc_page = false; return ena_alloc_map_page(rx_ring, dma); + } + + *is_lpc_page = true; cache_current_size = page_cache->current_size; head = page_cache->head; @@ -1146,7 +1176,7 @@ static struct page *ena_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, int /* Add a new page to the cache */ ena_page->page = ena_alloc_map_page(rx_ring, dma); - if (!ena_page->page) + if (unlikely(!ena_page->page)) return NULL; ena_page->dma_addr = *dma; @@ -1158,14 +1188,15 @@ static struct page *ena_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, int page_cache->current_size++; - 
ena_increase_stat_atomic(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp); return ena_page->page; } /* Next page is still in use, so we allocate outside the cache */ if (unlikely(page_ref_count(ena_page->page) != 1)) { - ena_increase_stat_atomic(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp); + *is_lpc_page = false; return ena_alloc_map_page(rx_ring, dma); } @@ -1180,7 +1211,9 @@ static int ena_alloc_rx_page(struct ena_ring *rx_ring, int headroom = rx_ring->rx_headroom; struct ena_com_buf *ena_buf; struct page *page; + bool is_lpc_page; dma_addr_t dma; + int tailroom; /* restore page offset value in case it has been changed by device */ rx_info->page_offset = headroom; @@ -1190,20 +1223,23 @@ static int ena_alloc_rx_page(struct ena_ring *rx_ring, return 0; /* We handle DMA here */ - page = ena_get_page(rx_ring, &dma, current_nid); + page = ena_get_page(rx_ring, &dma, current_nid, &is_lpc_page); if (unlikely(!page)) { - ena_increase_stat_atomic(&rx_ring->rx_stats.page_alloc_fail, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, + &rx_ring->syncp); return -ENOMEM; } netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "Allocate page %p, rx_info %p\n", page, rx_info); + tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + rx_info->page = page; + rx_info->is_lpc_page = is_lpc_page; ena_buf = &rx_info->ena_buf; ena_buf->paddr = dma + headroom; - ena_buf->len = ENA_PAGE_SIZE - headroom; + ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; return 0; } @@ -1213,10 +1249,8 @@ static void ena_unmap_rx_buff(struct ena_ring *rx_ring, { struct ena_com_buf *ena_buf = &rx_info->ena_buf; - /* If the ref count of the page is 2, then it belong to the page cache, - * and it is up to it to unmap it. 
- */ - if (page_ref_count(rx_info->page) == 1) + /* LPC pages are unmapped at cache destruction */ + if (!rx_info->is_lpc_page) dma_unmap_page(rx_ring->dev, ena_buf->paddr - rx_ring->rx_headroom, ENA_PAGE_SIZE, DMA_BIDIRECTIONAL); @@ -1279,8 +1313,8 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) } if (unlikely(i < num)) { - ena_increase_stat_atomic(&rx_ring->rx_stats.refil_partial, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.refil_partial, 1, + &rx_ring->syncp); netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "Refilled rx qid %d with only %d buffers (from %d)\n", rx_ring->qid, i, num); @@ -1496,7 +1530,7 @@ static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, "Invalid req_id: %hu\n", req_id); - ena_increase_stat_atomic(&ring->tx_stats.bad_req_id, 1, &ring->syncp); + ena_increase_stat(&ring->tx_stats.bad_req_id, 1, &ring->syncp); /* Trigger device reset */ ring->adapter->reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID; @@ -1607,8 +1641,8 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) if (netif_tx_queue_stopped(txq) && above_thresh && test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) { netif_tx_wake_queue(txq); - ena_increase_stat_atomic(&tx_ring->tx_stats.queue_wakeup, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); } __netif_tx_unlock(txq); } @@ -1616,21 +1650,30 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) return tx_pkts; } -static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, bool frags) +static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag) { struct sk_buff *skb; +#ifdef ENA_LINEAR_FRAG_SUPPORTED - if (frags) - skb = napi_get_frags(rx_ring->napi); - else + if (!first_frag) skb = netdev_alloc_skb_ip_align(rx_ring->netdev, rx_ring->rx_copybreak); + else + skb = build_skb(first_frag, ENA_PAGE_SIZE); +#else + u32 linear_size = max_t(u32, ENA_SKB_PULL_MIN_LEN, rx_ring->rx_copybreak); + + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, + linear_size); +#endif if (unlikely(!skb)) { - ena_increase_stat_atomic(&rx_ring->rx_stats.skb_alloc_fail, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, - "Failed to allocate skb. frags: %d\n", frags); + "Failed to allocate skb. first_frag %s\n", + first_frag ? 
"provided" : "not provided"); return NULL; } @@ -1642,13 +1685,15 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, u32 descs, u16 *next_to_clean) { - struct sk_buff *skb; struct ena_rx_buffer *rx_info; u16 len, req_id, buf = 0; -#if ENA_BUSY_POLL_SUPPORT - bool polling; + struct sk_buff *skb; + void *page_addr; + u32 page_offset; + void *data_addr; +#ifndef ENA_LINEAR_FRAG_SUPPORTED + u16 hlen; #endif - void *va; len = ena_bufs[buf].len; req_id = ena_bufs[buf].req_id; @@ -1666,12 +1711,14 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, rx_info, rx_info->page); /* save virt address of first buffer */ - va = page_address(rx_info->page) + rx_info->page_offset; + page_addr = page_address(rx_info->page); + page_offset = rx_info->page_offset; + data_addr = page_addr + page_offset; - prefetch(va); + prefetch(data_addr); if (len <= rx_ring->rx_copybreak) { - skb = ena_alloc_skb(rx_ring, false); + skb = ena_alloc_skb(rx_ring, NULL); if (unlikely(!skb)) return NULL; @@ -1684,14 +1731,14 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, dma_unmap_addr(&rx_info->ena_buf, paddr), len, DMA_FROM_DEVICE); - skb_copy_to_linear_data(skb, va, len); + skb_copy_to_linear_data(skb, data_addr, len); dma_sync_single_for_device(rx_ring->dev, dma_unmap_addr(&rx_info->ena_buf, paddr), len, DMA_FROM_DEVICE); skb_put(skb, len); -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) +#ifdef ENA_BUSY_POLL_SUPPORT skb_mark_napi_id(skb, rx_ring->napi); #endif skb->protocol = eth_type_trans(skb, rx_ring->netdev); @@ -1701,22 +1748,30 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, return skb; } -#if ENA_BUSY_POLL_SUPPORT - polling = ena_bp_busy_polling(rx_ring); - /* For busy poll don't allocate frag */ - skb = ena_alloc_skb(rx_ring, !polling); -#else - skb = ena_alloc_skb(rx_ring, true); -#endif + ena_unmap_rx_buff(rx_ring, rx_info); + + skb = ena_alloc_skb(rx_ring, page_addr); if (unlikely(!skb)) return NULL; - do { - ena_unmap_rx_buff(rx_ring, rx_info); - - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, - rx_info->page_offset, len, ENA_PAGE_SIZE); +#ifdef ENA_LINEAR_FRAG_SUPPORTED + /* Populate skb's linear part */ + skb_reserve(skb, page_offset); + skb_put(skb, len); +#else + /* GRO expects us to have the ethernet header in the linear part. + * Copy the first ENA_SKB_PULL_MIN_LEN bytes because it is more + * efficient. + */ + hlen = min_t(u16, len, ENA_SKB_PULL_MIN_LEN); + memcpy(__skb_put(skb, hlen), data_addr, hlen); + if (hlen < len) + skb_add_rx_frag(skb, 0, rx_info->page, + page_offset + hlen, len - hlen, ENA_PAGE_SIZE); +#endif + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + do { netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "RX skb updated. len %d. 
data_len %d\n", skb->len, skb->data_len); @@ -1735,24 +1790,17 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, req_id = ena_bufs[buf].req_id; rx_info = &rx_ring->rx_buffer_info[req_id]; - } while (1); -#if ENA_BUSY_POLL_SUPPORT - if (polling) { - int hlen; + ena_unmap_rx_buff(rx_ring, rx_info); + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, + rx_info->page_offset, len, ENA_PAGE_SIZE); + + } while (1); - /* copy header into the skb linear data */ - hlen = rx_ring->rx_copybreak; - skb_copy_to_linear_data(skb, va, hlen); +#ifdef ENA_BUSY_POLL_SUPPORT + skb_mark_napi_id(skb, rx_ring->napi); - /* adjust the first segment and skb len */ - skb_shinfo(skb)->frags[0].page_offset += hlen; - skb_shinfo(skb)->frags[0].size -= hlen; - skb->data_len -= hlen; - skb->tail += hlen; - skb->protocol = eth_type_trans(skb, rx_ring->netdev); - skb_mark_napi_id(skb, rx_ring->napi); - } #endif return skb; } @@ -1783,8 +1831,8 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, (ena_rx_ctx->l3_csum_err))) { /* ipv4 checksum error */ skb->ip_summed = CHECKSUM_NONE; - ena_increase_stat_atomic(&rx_ring->rx_stats.bad_csum, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.bad_csum, 1, + &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX IPv4 header checksum error\n"); return; @@ -1795,8 +1843,8 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { if (unlikely(ena_rx_ctx->l4_csum_err)) { /* TCP/UDP checksum error */ - ena_increase_stat_atomic(&rx_ring->rx_stats.bad_csum, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.bad_csum, 1, + &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX L4 checksum error\n"); skb->ip_summed = CHECKSUM_NONE; @@ -1805,11 +1853,11 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, if (likely(ena_rx_ctx->l4_csum_checked)) { skb->ip_summed = CHECKSUM_UNNECESSARY; - ena_increase_stat_atomic(&rx_ring->rx_stats.csum_good, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.csum_good, 1, + &rx_ring->syncp); } else { - ena_increase_stat_atomic(&rx_ring->rx_stats.csum_unchecked, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.csum_unchecked, 1, + &rx_ring->syncp); skb->ip_summed = CHECKSUM_NONE; } } else { @@ -1850,10 +1898,9 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) int ret; rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - xdp->data = page_address(rx_info->page) + rx_info->page_offset; - xdp_set_data_meta_invalid(xdp); - xdp->data_hard_start = page_address(rx_info->page); - xdp->data_end = xdp->data + rx_ring->ena_bufs[0].len; + xdp_prepare_buff(xdp, page_address(rx_info->page), + rx_info->page_offset, + rx_ring->ena_bufs[0].len, false); /* If for some reason we received a bigger packet than * we expect, then we simply drop it */ @@ -1905,10 +1952,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; #ifdef ENA_XDP_SUPPORT - xdp.rxq = &rx_ring->xdp_rxq; -#ifdef XDP_HAS_FRAME_SZ - xdp.frame_sz = ENA_PAGE_SIZE; -#endif + xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); #endif /* ENA_XDP_SUPPORT */ do { @@ -1989,28 +2033,19 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, skb_record_rx_queue(skb, rx_ring->qid); - if (rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) { - total_len += rx_ring->ena_bufs[0].len; + if 
(rx_ring->ena_bufs[0].len <= rx_ring->rx_copybreak) rx_copybreak_pkt++; -#if ENA_BUSY_POLL_SUPPORT - if (ena_bp_busy_polling(rx_ring)) - netif_receive_skb(skb); - else - napi_gro_receive(napi, skb); -#else + + total_len += skb->len; + +#ifdef ENA_BUSY_POLL_SUPPORT + if (ena_bp_busy_polling(rx_ring)) + netif_receive_skb(skb); + else napi_gro_receive(napi, skb); -#endif - } else { - total_len += skb->len; -#if ENA_BUSY_POLL_SUPPORT - if (ena_bp_busy_polling(rx_ring)) - netif_receive_skb(skb); - else - napi_gro_frags(napi); #else - napi_gro_frags(napi); -#endif - } + napi_gro_receive(napi, skb); +#endif /* ENA_BUSY_POLL_SUPPORT */ res_budget--; } while (likely(res_budget)); @@ -2047,12 +2082,12 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, adapter = netdev_priv(rx_ring->netdev); if (rc == -ENOSPC) { - ena_increase_stat_atomic(&rx_ring->rx_stats.bad_desc_num, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); adapter->reset_reason = ENA_REGS_RESET_TOO_MANY_RX_DESCS; } else { - ena_increase_stat_atomic(&rx_ring->rx_stats.bad_req_id, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; } @@ -2113,8 +2148,8 @@ static void ena_unmask_interrupt(struct ena_ring *tx_ring, tx_ring->smoothed_interval, true); - ena_increase_stat_atomic(&tx_ring->tx_stats.unmask_interrupt, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.unmask_interrupt, 1, + &tx_ring->syncp); /* It is a shared MSI-X. * Tx and Rx CQ have pointer to it. @@ -2226,9 +2261,6 @@ static int ena_io_poll(struct napi_struct *napi, int budget) tx_ring = ena_napi->tx_ring; rx_ring = ena_napi->rx_ring; - tx_ring->first_interrupt = ena_napi->first_interrupt; - rx_ring->first_interrupt = ena_napi->first_interrupt; - tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER; if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || @@ -2236,7 +2268,7 @@ static int ena_io_poll(struct napi_struct *napi, int budget) napi_complete_done(napi, 0); return 0; } -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT if (!ena_bp_lock_napi(rx_ring)) return budget; #endif @@ -2292,9 +2324,11 @@ static int ena_io_poll(struct napi_struct *napi, int budget) tx_ring->tx_stats.tx_poll++; u64_stats_update_end(&tx_ring->syncp); -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT ena_bp_unlock_napi(rx_ring); #endif + tx_ring->tx_stats.last_napi_jiffies = jiffies; + return ret; } @@ -2319,7 +2353,8 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data) { struct ena_napi *ena_napi = data; - ena_napi->first_interrupt = true; + /* Used to check HW health */ + WRITE_ONCE(ena_napi->first_interrupt, true); WRITE_ONCE(ena_napi->interrupts_masked, true); smp_wmb(); /* write interrupts_masked before calling napi */ @@ -2572,6 +2607,9 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, int i; for (i = first_index; i < first_index + count; i++) { +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_del(&adapter->ena_napi[i].napi); +#endif /* ENA_BUSY_POLL_SUPPORT */ netif_napi_del(&adapter->ena_napi[i].napi); #ifdef ENA_XDP_SUPPORT @@ -2579,6 +2617,11 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, adapter->ena_napi[i].xdp_ring); #endif /* ENA_XDP_SUPPORT */ } +#ifdef ENA_BUSY_POLL_SUPPORT + + /* Wait until all uses of napi struct complete */ + synchronize_net(); +#endif /* ENA_BUSY_POLL_SUPPORT */ } static void ena_init_napi_in_range(struct ena_adapter 
*adapter, @@ -2598,6 +2641,10 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, #endif /* ENA_XDP_SUPPORT */ ENA_NAPI_BUDGET); +#ifdef ENA_BUSY_POLL_SUPPORT + napi_hash_add(&adapter->ena_napi[i].napi); + +#endif /* ENA_BUSY_POLL_SUPPORT */ if (!ENA_IS_XDP_INDEX(adapter, i)) { napi->rx_ring = &adapter->rx_ring[i]; napi->tx_ring = &adapter->tx_ring[i]; @@ -2610,7 +2657,7 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, } } -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT static void ena_napi_disable_in_range(struct ena_adapter *adapter, int first_index, int count) @@ -2621,11 +2668,8 @@ static void ena_napi_disable_in_range(struct ena_adapter *adapter, for (i = first_index; i < first_index + count; i++) { napi_disable(&adapter->ena_napi[i].napi); - /* XDP doesn't have rx_ring */ - if (ENA_IS_XDP_INDEX(adapter, i)) - continue; rx_ring = &adapter->rx_ring[i]; - timeout = 100; + timeout = 1000; while (!ena_bp_disable(rx_ring)) { netif_info(adapter, ifdown, adapter->netdev, "Rx queue %d locked\n", i); @@ -2633,9 +2677,9 @@ static void ena_napi_disable_in_range(struct ena_adapter *adapter, timeout--; if (!timeout) { - netif_err(adapter, ifdown, adapter->netdev, - "Tx queue is stuck\n"); - continue; + WARN(!ena_bp_disable(rx_ring), + "Unable to disable busy poll at ring %d\n", i); + break; } } } @@ -2991,48 +3035,72 @@ static void ena_free_ring_page_cache(struct ena_ring *rx_ring) rx_ring->page_cache = NULL; } +static bool ena_is_lpc_supported(struct ena_adapter *adapter, + struct ena_ring *rx_ring, + bool error_print) +{ +#ifdef ENA_NETDEV_LOGS_WITHOUT_RV + void (*print_log)(const struct net_device *dev, const char *format, ...); +#else + int (*print_log)(const struct net_device *dev, const char *format, ...); +#endif + int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues; + + print_log = (error_print) ? netdev_err : netdev_info; + + /* LPC is disabled below min number of channels */ + if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) { + print_log(adapter->netdev, + "Local page cache is disabled for less than %d channels\n", + ENA_LPC_MIN_NUM_OF_CHANNELS); + + /* Disable LPC for such case. It can enabled again through + * ethtool private-flag. + */ + adapter->lpc_size = 0; + + return false; + } +#ifdef ENA_XDP_SUPPORT + + /* The driver doesn't support page caches under XDP */ + if (ena_xdp_present_ring(rx_ring)) { + print_log(adapter->netdev, + "Local page cache is disabled when using XDP\n"); + return false; + } +#endif /* ENA_XDP_SUPPORT */ + + return true; +} + /* Calculate the size of the Local Page Cache. If LPC should be disabled, return * a size of 0. 
*/ static u32 ena_calculate_cache_size(struct ena_adapter *adapter, struct ena_ring *rx_ring) { - int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues; - u32 page_cache_size; + u32 page_cache_size = adapter->lpc_size; - /* lpc_size == 0 means disabled cache */ - if (lpc_size == 0) + /* LPC cache size of 0 means disabled cache */ + if (page_cache_size == 0) return 0; - /* LPC is disabled below min number of queues */ - if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) { - netif_info(adapter, ifup, adapter->netdev, - "Local page cache is disabled for less than %d channels\n", - ENA_LPC_MIN_NUM_OF_CHANNELS); + if (!ena_is_lpc_supported(adapter, rx_ring, false)) return 0; - } - /* Clap the lpc_size to its maximum value */ - if (lpc_size > ENA_LPC_MAX_MULTIPLIER) { - netif_info(adapter, ifup, adapter->netdev, - "Provided lpc_size %d is too large, reducing to %d (max)\n", - lpc_size, ENA_LPC_MAX_MULTIPLIER); - /* Override module param value to avoid printing this message + /* Clap the LPC size to its maximum value */ + if (page_cache_size > ENA_LPC_MAX_MULTIPLIER) { + netdev_info(adapter->netdev, + "Provided lpc_size %d is too large, reducing to %d (max)\n", + lpc_size, ENA_LPC_MAX_MULTIPLIER); + /* Override LPC size to avoid printing this message * every up/down operation */ - lpc_size = ENA_LPC_MAX_MULTIPLIER; + adapter->lpc_size = page_cache_size = lpc_size = ENA_LPC_MAX_MULTIPLIER; } -#ifdef ENA_XDP_SUPPORT - /* We currently don't support page caches under XDP */ - if (ena_xdp_present_ring(rx_ring)) { - netif_info(adapter, ifup, adapter->netdev, - "Local page cache is disabled when using XDP\n"); - return 0; - } -#endif /* ENA_XDP_SUPPORT */ - - page_cache_size = lpc_size * ENA_LPC_MULTIPLIER_UNIT; + page_cache_size = page_cache_size * ENA_LPC_MULTIPLIER_UNIT; page_cache_size = roundup_pow_of_two(page_cache_size); return page_cache_size; @@ -3124,8 +3192,8 @@ static int ena_up(struct ena_adapter *adapter) if (test_bit(ENA_FLAG_LINK_UP, &adapter->flags)) netif_carrier_on(adapter->netdev); - ena_increase_stat_atomic(&adapter->dev_stats.interface_up, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.interface_up, 1, + &adapter->syncp); set_bit(ENA_FLAG_DEV_UP, &adapter->flags); @@ -3164,8 +3232,8 @@ static void ena_down(struct ena_adapter *adapter) clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); - ena_increase_stat_atomic(&adapter->dev_stats.interface_down, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.interface_down, 1, + &adapter->syncp); netif_carrier_off(adapter->netdev); netif_tx_disable(adapter->netdev); @@ -3271,6 +3339,39 @@ static int ena_close(struct net_device *netdev) return 0; } +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled) +{ + /* In XDP, lpc_size might be positive even with LPC disabled, use cache + * pointer instead. + */ + struct ena_page_cache *page_cache = adapter->rx_ring->page_cache; + + /* Exit early if LPC state doesn't change */ + if (enabled == !!page_cache) + return 0; + + if (enabled && !ena_is_lpc_supported(adapter, adapter->rx_ring, true)) + return -EOPNOTSUPP; + + /* Prevent a case in which disabling LPC on startup, prevents it from + * being enabled afterwards. + */ + if (!lpc_size) + lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; + + adapter->lpc_size = enabled ? lpc_size : 0; + + /* rtnl lock is already obtained in dev_ioctl() layer, so it's safe to + * re-initialize IO resources. 
+ */ + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + ena_close(adapter->netdev); + ena_up(adapter); + } + + return 0; +} + int ena_update_queue_sizes(struct ena_adapter *adapter, u32 new_tx_size, u32 new_rx_size) @@ -3400,13 +3501,12 @@ static int ena_check_and_linearize_skb(struct ena_ring *tx_ring, (header_len < tx_ring->tx_max_header_size)) return 0; - ena_increase_stat_atomic(&tx_ring->tx_stats.linearize, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.linearize, 1, &tx_ring->syncp); rc = skb_linearize(skb); if (unlikely(rc)) { - ena_increase_stat_atomic(&tx_ring->tx_stats.linearize_failed, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.linearize_failed, 1, + &tx_ring->syncp); } return rc; @@ -3446,8 +3546,8 @@ static int ena_tx_map_skb(struct ena_ring *tx_ring, tx_ring->push_buf_intermediate_buf); *header_len = push_len; if (unlikely(skb->data != *push_hdr)) { - ena_increase_stat_atomic(&tx_ring->tx_stats.llq_buffer_copy, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.llq_buffer_copy, 1, + &tx_ring->syncp); delta = push_len - skb_head_len; } @@ -3504,8 +3604,8 @@ static int ena_tx_map_skb(struct ena_ring *tx_ring, return 0; error_report_dma_error: - ena_increase_stat_atomic(&tx_ring->tx_stats.dma_mapping_err, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map skb\n"); tx_info->skb = NULL; @@ -3580,8 +3680,8 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) __func__, qid); netif_tx_stop_queue(txq); - ena_increase_stat_atomic(&tx_ring->tx_stats.queue_stop, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.queue_stop, 1, + &tx_ring->syncp); /* There is a rare condition where this function decide to * stop the queue but meanwhile clean_tx_irq updates @@ -3596,27 +3696,22 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) if (ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, ENA_TX_WAKEUP_THRESH)) { netif_tx_wake_queue(txq); - ena_increase_stat_atomic(&tx_ring->tx_stats.queue_wakeup, 1, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.queue_wakeup, 1, + &tx_ring->syncp); } } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) #ifdef HAVE_NETDEV_XMIT_MORE - if (netif_xmit_stopped(txq) || !netdev_xmit_more()) { + if (netif_xmit_stopped(txq) || !netdev_xmit_more()) #else - if (netif_xmit_stopped(txq) || !skb->xmit_more) { + if (netif_xmit_stopped(txq) || !skb->xmit_more) #endif /* HAVE_NETDEV_XMIT_MORE */ #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) */ - /* trigger the dma engine. ena_com_write_sq_doorbell() - * has a mb + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. 
*/ - ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); - ena_increase_stat_atomic(&tx_ring->tx_stats.doorbells, 1, - &tx_ring->syncp); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) - } -#endif + ena_ring_tx_doorbell(tx_ring); return NETDEV_TX_OK; @@ -3929,7 +4024,7 @@ static struct net_device_stats *ena_get_stats(struct net_device *netdev) return stats; } #endif -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT #define ENA_BP_NAPI_BUDGET 8 static int ena_busy_poll(struct napi_struct *napi) @@ -3974,7 +4069,7 @@ static const struct net_device_ops ena_netdev_ops = { .ndo_set_rx_mode = ena_set_rx_mode, #endif .ndo_validate_addr = eth_validate_addr, -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT .ndo_busy_poll = ena_busy_poll, #endif #ifdef ENA_XDP_SUPPORT @@ -4032,7 +4127,7 @@ static int ena_set_queues_placement_policy(struct pci_dev *pdev, llq_feature_mask = 1 << ENA_ADMIN_LLQ; if (!(ena_dev->supported_features & llq_feature_mask)) { - dev_err(&pdev->dev, + dev_warn(&pdev->dev, "LLQ is not supported Fallback to host mode policy.\n"); ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; return 0; @@ -4115,18 +4210,26 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, goto err_mmio_read_less; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(dma_width)); + if (rc) { + dev_err(dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_mmio_read_less; + } +#else rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_width)); if (rc) { - dev_err(dev, "pci_set_dma_mask failed 0x%x\n", rc); + dev_err(dev, "pci_set_dma_mask failed %d\n", rc); goto err_mmio_read_less; } rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_width)); if (rc) { - dev_err(dev, "err_pci_set_consistent_dma_mask failed 0x%x\n", + dev_err(dev, "err_pci_set_consistent_dma_mask failed %d\n", rc); goto err_mmio_read_less; } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ /* ENA admin level init */ rc = ena_com_admin_init(ena_dev, &aenq_handlers); @@ -4360,7 +4463,9 @@ static void ena_fw_reset_device(struct work_struct *work) static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, struct ena_ring *rx_ring) { - if (likely(rx_ring->first_interrupt)) + struct ena_napi *ena_napi = container_of(rx_ring->napi, struct ena_napi, napi); + + if (likely(READ_ONCE(ena_napi->first_interrupt))) return 0; if (ena_com_cq_empty(rx_ring->ena_com_io_cq)) @@ -4384,6 +4489,10 @@ static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct ena_ring *tx_ring) { + struct ena_napi *ena_napi = container_of(tx_ring->napi, struct ena_napi, napi); + unsigned int time_since_last_napi; + unsigned int missing_tx_comp_to; + bool is_tx_comp_time_expired; struct ena_tx_buffer *tx_buf; unsigned long last_jiffies; u32 missed_tx = 0; @@ -4397,8 +4506,10 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, /* no pending Tx at this location */ continue; - if (unlikely(!tx_ring->first_interrupt && time_is_before_jiffies(last_jiffies + - 2 * adapter->missing_tx_completion_to))) { + is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to); + + if (unlikely(!READ_ONCE(ena_napi->first_interrupt) && is_tx_comp_time_expired)) { /* If after graceful period interrupt is still not * received, we schedule a reset */ @@ -4411,12 +4522,17 @@ static int check_missing_comp_in_tx_queue(struct 
ena_adapter *adapter, return -EIO; } - if (unlikely(time_is_before_jiffies(last_jiffies + - adapter->missing_tx_completion_to))) { - if (!tx_buf->print_once) + is_tx_comp_time_expired = time_is_before_jiffies(last_jiffies + + adapter->missing_tx_completion_to); + + if (unlikely(is_tx_comp_time_expired)) { + if (!tx_buf->print_once) { + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + missing_tx_comp_to = jiffies_to_msecs(adapter->missing_tx_completion_to); netif_notice(adapter, tx_err, adapter->netdev, - "Found a Tx that wasn't completed on time, qid %d, index %d.\n", - tx_ring->qid, i); + "Found a Tx that wasn't completed on time, qid %d, index %d. %u usecs have passed since last napi execution. Missing Tx timeout value %u msecs\n", + tx_ring->qid, i, time_since_last_napi, missing_tx_comp_to); + } tx_buf->print_once = 1; missed_tx++; @@ -4434,8 +4550,8 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, rc = -EIO; } - ena_increase_stat_atomic(&tx_ring->tx_stats.missed_tx , missed_tx, - &tx_ring->syncp); + ena_increase_stat(&tx_ring->tx_stats.missed_tx, missed_tx, + &tx_ring->syncp); return rc; } @@ -4518,8 +4634,8 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) rx_ring->empty_rx_queue++; if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) { - ena_increase_stat_atomic(&rx_ring->rx_stats.empty_rx_ring, 1, - &rx_ring->syncp); + ena_increase_stat(&rx_ring->rx_stats.empty_rx_ring, 1, + &rx_ring->syncp); netif_err(adapter, drv, adapter->netdev, "Trigger refill for ring %d\n", i); @@ -4549,8 +4665,8 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) if (unlikely(time_is_before_jiffies(keep_alive_expired))) { netif_err(adapter, drv, adapter->netdev, "Keep alive watchdog timeout.\n"); - ena_increase_stat_atomic(&adapter->dev_stats.wd_expired, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.wd_expired, 1, + &adapter->syncp); adapter->reset_reason = ENA_REGS_RESET_KEEP_ALIVE_TO; set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } @@ -4561,8 +4677,8 @@ static void check_for_admin_com_state(struct ena_adapter *adapter) if (unlikely(!ena_com_get_admin_running_state(adapter->ena_dev))) { netif_err(adapter, drv, adapter->netdev, "ENA admin queue is not in running state!\n"); - ena_increase_stat_atomic(&adapter->dev_stats.admin_q_pause, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, + &adapter->syncp); adapter->reset_reason = ENA_REGS_RESET_ADMIN_TO; set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } @@ -4936,6 +5052,13 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return rc; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); + if (rc) { + dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", rc); + goto err_disable_device; + } +#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(ENA_MAX_PHYS_ADDR_SIZE_BITS)); if (rc) { dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", rc); @@ -4948,6 +5071,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) rc); goto err_disable_device; } +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ pci_set_master(pdev); @@ -5040,6 +5164,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->num_io_queues = clamp_val(num_io_queues, ENA_MIN_NUM_IO_QUEUES, max_num_io_queues); + 
adapter->lpc_size = lpc_size; adapter->max_num_io_queues = max_num_io_queues; adapter->last_monitored_tx_qid = 0; @@ -5269,8 +5394,7 @@ static int ena_suspend(struct pci_dev *pdev, pm_message_t state) #endif /* ENA_GENERIC_PM_OPS */ struct ena_adapter *adapter = pci_get_drvdata(pdev); - ena_increase_stat_atomic(&adapter->dev_stats.suspend, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.suspend, 1, &adapter->syncp); rtnl_lock(); if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { @@ -5301,8 +5425,7 @@ static int ena_resume(struct pci_dev *pdev) #endif /* ENA_GENERIC_PM_OPS */ int rc; - ena_increase_stat_atomic(&adapter->dev_stats.resume, 1, - &adapter->syncp); + ena_increase_stat(&adapter->dev_stats.resume, 1, &adapter->syncp); rtnl_lock(); #if LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0) diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h old mode 100755 new mode 100644 index 144bfb5378c04..1061a0a3d1499 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -25,8 +25,8 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 4 -#define DRV_MODULE_GEN_SUBMINOR 1 +#define DRV_MODULE_GEN_MINOR 5 +#define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -67,12 +67,6 @@ #define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) #define ENA_DEFAULT_RX_COPYBREAK (256 - NET_IP_ALIGN) -/* limit the buffer size to 600 bytes to handle MTU changes from very - * small to very large, in which case the number of buffers per packet - * could exceed ENA_PKT_MAX_BUFS - */ -#define ENA_DEFAULT_MIN_RX_BUFF_ALLOC_SIZE 600 - #define ENA_MIN_MTU 128 #define ENA_NAME_MAX_LEN 20 @@ -156,14 +150,14 @@ struct ena_irq { }; struct ena_napi { - struct napi_struct napi ____cacheline_aligned; + bool first_interrupt ____cacheline_aligned; + bool interrupts_masked; + struct napi_struct napi; struct ena_ring *tx_ring; struct ena_ring *rx_ring; #ifdef ENA_XDP_SUPPORT struct ena_ring *xdp_ring; #endif /* ENA_XDP_SUPPORT */ - bool first_interrupt; - bool interrupts_masked; u32 qid; struct dim dim; }; @@ -218,6 +212,7 @@ struct ena_rx_buffer { struct sk_buff *skb; struct page *page; u32 page_offset; + bool is_lpc_page; struct ena_com_buf ena_buf; } ____cacheline_aligned; @@ -237,6 +232,7 @@ struct ena_stats_tx { u64 llq_buffer_copy; u64 missed_tx; u64 unmask_interrupt; + u64 last_napi_jiffies; }; struct ena_stats_rx { @@ -250,7 +246,7 @@ struct ena_stats_rx { u64 skb_alloc_fail; u64 dma_mapping_err; u64 bad_desc_num; -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT u64 bp_yield; u64 bp_missed; u64 bp_cleaned; @@ -319,6 +315,10 @@ struct ena_ring { struct bpf_prog *xdp_bpf_prog; struct xdp_rxq_info xdp_rxq; spinlock_t xdp_tx_lock; /* synchronize XDP TX/Redirect traffic */ + /* Used for rx queues only to point to the xdp tx ring, to + * which traffic should be redirected from this rx ring. 
+ */ + struct ena_ring *xdp_ring; #endif u16 next_to_use; @@ -332,7 +332,6 @@ struct ena_ring { /* The maximum header length the device can handle */ u8 tx_max_header_size; - bool first_interrupt; bool disable_meta_caching; u16 no_interrupt_event_cnt; @@ -355,12 +354,12 @@ struct ena_ring { u8 *push_buf_intermediate_buf; int empty_rx_queue; -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT atomic_t bp_state; #endif } ____cacheline_aligned; -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT enum ena_busy_poll_state_t { ENA_BP_STATE_IDLE = 0, ENA_BP_STATE_NAPI, @@ -405,6 +404,9 @@ struct ena_adapter { u32 num_io_queues; u32 max_num_io_queues; + /* Local page cache size */ + u32 lpc_size; + #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) struct msix_entry *msix_entries; #endif @@ -477,6 +479,8 @@ void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); int ena_update_hw_stats(struct ena_adapter *adapter); +int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled); + int ena_update_queue_sizes(struct ena_adapter *adapter, u32 new_tx_size, u32 new_rx_size); @@ -484,8 +488,7 @@ int ena_update_queue_sizes(struct ena_adapter *adapter, int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); int ena_get_sset_count(struct net_device *netdev, int sset); - -#if ENA_BUSY_POLL_SUPPORT +#ifdef ENA_BUSY_POLL_SUPPORT static inline void ena_bp_init_lock(struct ena_ring *rx_ring) { /* reset state to idle */ @@ -553,41 +556,8 @@ static inline bool ena_bp_disable(struct ena_ring *rx_ring) return rc == ENA_BP_STATE_IDLE; } -#else -static inline void ena_bp_init_lock(struct ena_ring *rx_ring) -{ -} - -static inline bool ena_bp_lock_napi(struct ena_ring *rx_ring) -{ - return true; -} - -static inline void ena_bp_unlock_napi(struct ena_ring *rx_ring) -{ -} - -static inline bool ena_bp_lock_poll(struct ena_ring *rx_ring) -{ - return false; -} - -static inline void ena_bp_unlock_poll(struct ena_ring *rx_ring) -{ -} - -static inline bool ena_bp_busy_polling(struct ena_ring *rx_ring) -{ - return false; -} - -static inline bool ena_bp_disable(struct ena_ring *rx_ring) -{ - return true; -} #endif /* ENA_BUSY_POLL_SUPPORT */ - #ifdef ENA_XDP_SUPPORT enum ena_xdp_errors_t { ENA_XDP_ALLOWED = 0, diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h old mode 100755 new mode 100644 index 35ab08cada988..824128cd8dc61 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -38,7 +38,9 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #ifndef LINUX_VERSION_CODE #include -#else +#endif + +#ifndef KERNEL_VERSION #define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) #endif @@ -99,10 +101,11 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #define __GFP_COLD 0 #endif -#define ENA_BUSY_POLL_SUPPORT defined(CONFIG_NET_RX_BUSY_POLL) && \ +#if defined(CONFIG_NET_RX_BUSY_POLL) && \ LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) - +#define ENA_BUSY_POLL_SUPPORT +#endif /******************************************************************************/ /************************** Ubuntu macros *************************************/ /******************************************************************************/ @@ -125,10 +128,7 @@ Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 * 3.16.0-23-generic * ABI is 23 */ -#ifndef UTS_UBUNTU_RELEASE_ABI -#define UTS_UBUNTU_RELEASE_ABI 0 -#define UBUNTU_VERSION_CODE 0 -#else +#ifdef UTS_UBUNTU_RELEASE_ABI #if UTS_UBUNTU_RELEASE_ABI > 255 #undef UTS_UBUNTU_RELEASE_ABI @@ -238,7 +238,9 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 /*****************************************************************************/ #if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ) -#ifndef netif_set_real_num_tx_queues +/* The function netif_set_real_num_tx_queues() doesn't return value for + * kernels < 2.6.37 + */ static inline int _kc_netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { @@ -247,16 +249,7 @@ static inline int _kc_netif_set_real_num_tx_queues(struct net_device *dev, } #define netif_set_real_num_tx_queues(dev, txq) \ _kc_netif_set_real_num_tx_queues(dev, txq) -#endif -#ifndef netif_set_real_num_rx_queues -static inline int __kc_netif_set_real_num_rx_queues(struct net_device __always_unused *dev, - unsigned int __always_unused rxq) -{ - return 0; -} -#define netif_set_real_num_rx_queues(dev, rxq) \ - __kc_netif_set_real_num_rx_queues((dev), (rxq)) -#endif + #endif /* < 2.6.37 */ #if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ) @@ -278,18 +271,6 @@ typedef u32 netdev_features_t; /******************************************************************************/ #if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) ) -#ifndef skb_add_rx_frag -#define skb_add_rx_frag _kc_skb_add_rx_frag -static inline void _kc_skb_add_rx_frag(struct sk_buff *skb, int i, - struct page *page, int off, int size, - unsigned int truesize) -{ - skb_fill_page_desc(skb, i, page, off, size); - skb->len += size; - skb->data_len += size; - skb->truesize += truesize; -} -#endif #ifdef NET_ADDR_RANDOM #define eth_hw_addr_random(N) do { \ eth_random_addr(N->dev_addr); \ @@ -383,7 +364,7 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) -#if UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24) +#if defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24) #define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK_V1 #else #define HAVE_NDO_SELECT_QUEUE_ACCEL @@ -416,7 +397,7 @@ static inline void reinit_completion(struct completion *x) RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0))) \ && !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))&& \ !defined(UEK3_RELEASE))) || \ - (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) static inline int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, @@ -456,7 +437,7 @@ static inline void *devm_kcalloc(struct device *dev, #if (( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,8) ) && \ !RHEL_RELEASE_CODE && \ !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) || \ - (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,13,0,30)) enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ @@ -476,7 +457,7 @@ static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, #if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) #if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) && \ RHEL_RELEASE_CODE != 
RHEL_RELEASE_VERSION(6,6)) \ - && !(UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) + && !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,105)) static inline int pci_msix_vec_count(struct pci_dev *dev) { int pos; @@ -504,7 +485,7 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) #endif #if ( LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) || \ - (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) || \ (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) || \ (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,0) \ && RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7,1)) @@ -581,7 +562,7 @@ static inline void __napi_schedule_irqoff(struct napi_struct *n) || (RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) \ || RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) \ - || (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,19,0,51)) + || (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,19,0,51)) #else static inline void napi_complete_done(struct napi_struct *n, int work_done) { @@ -590,8 +571,8 @@ static inline void napi_complete_done(struct napi_struct *n, int work_done) #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) \ - || (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,126)) && \ - (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0)) \ + || (defined(UBUNTU_VERSION_CODE) && \ + (UBUNTU_VERSION(3,13,0,126) <= UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(3,14,0,0))) \ || (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) #else @@ -718,7 +699,7 @@ do { \ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) && \ !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ - !(UBUNTU_VERSION_CODE) && \ + !defined(UBUNTU_VERSION_CODE) && \ !defined(UEK3_RELEASE) #define DO_ONCE(func, ...) \ @@ -759,12 +740,19 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) #define ENA_GENERIC_PM_OPS #endif -#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) && \ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 6 ,0)) && \ !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated page_ref_count + * function from kernel 4.6. 
To make things more difficult, Ubuntu didn't add + * these changes to its 4.4.* kernels + */ +#if !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) ||\ + defined(UBUNTU_VERSION_CODE) static inline int page_ref_count(struct page *page) { return atomic_read(&page->_count); } +#endif /* !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) */ static inline void page_ref_inc(struct page *page) { @@ -803,4 +791,44 @@ static inline int numa_mem_id(void) #endif /* numa_mem_id */ #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) */ +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) +#define AF_XDP_BUSY_POLL_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +#define ENA_LINEAR_FRAG_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#define ENA_NETDEV_LOGS_WITHOUT_RV +#endif + +#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->rxq = rxq; +#ifdef XDP_HAS_FRAME_SZ + xdp->frame_sz = frame_sz; +#endif +} + +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + +#endif /* defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) */ + #endif /* _KCOMPAT_H_ */ From 675c6ecc3197d71be4f1b5f0fc9d7ad0064c7d2b Mon Sep 17 00:00:00 2001 From: Hailey Mothershead Date: Tue, 8 Jun 2021 21:31:07 +0000 Subject: [PATCH 110/737] Revert: crypto: jitterentropy - change back to module_init() The original patch assumes that self tests are not run jitterentropy. That is not true on this kernel. When self tests are called on jitterentropy the kernel panics because the module is not yet loaded. Reverting this patch causes jitternetropy to loaded with subsys_initcall(). This, in turn, causes the module to be loaded in time for the self tests. --- crypto/jitterentropy-kcapi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index e8a4165a18742..b1d7b5a6e61c1 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -214,7 +214,7 @@ static void __exit jent_mod_exit(void) crypto_unregister_rng(&jent_alg); } -module_init(jent_mod_init); +subsys_initcall(jent_mod_init); module_exit(jent_mod_exit); MODULE_LICENSE("Dual BSD/GPL"); From b4b2b430d5ad67254dc610bd5bc009b82ba7f6f2 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 24 Jun 2021 19:55:31 +0000 Subject: [PATCH 111/737] igb_uio: add Add the igb_uio driver from DPDK, used by some of our customers. Imported from: http://git.dpdk.org/dpdk-kmods/ @ e13d7af77a1bf98757f85c3c4083f6ee6d0d2372 Kbuild file replaced by a regular kernel module Makefile. 
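For reviewers unfamiliar with the driver, the sketch below is illustrative only and is not part of the imported DPDK sources. It assumes a NIC has already been bound to igb_uio (typically by writing its vendor/device id to /sys/bus/pci/drivers/igb_uio/new_id and its PCI address to the driver's bind file, or with DPDK's dpdk-devbind.py) and that it shows up as /dev/uio0 with its first memory BAR exposed as UIO mapping 0 and at least one page long. The device node name, BAR index, and mapping length are assumptions made for the example, not guarantees of this driver.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            /* Assumed: device already bound to igb_uio and enumerated as uio0 */
            int fd = open("/dev/uio0", O_RDWR);
            if (fd < 0)
                    return 1;

            /* UIO convention: mmap offset N * page_size selects mapping N.
             * igb_uio registers each memory BAR as a UIO_MEM_PHYS mapping in
             * igbuio_setup_bars(), so mapping 0 is the first memory BAR found.
             */
            void *bar0 = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                              MAP_SHARED, fd, 0 * getpagesize());
            if (bar0 == MAP_FAILED)
                    return 1;

            /* Writing a 4-byte value ends up in igbuio_pci_irqcontrol():
             * 1 re-enables the interrupt, 0 masks it.
             */
            uint32_t enable = 1;
            if (write(fd, &enable, sizeof(enable)) != sizeof(enable))
                    perror("irq enable");

            /* A blocking read returns the interrupt event count. */
            uint32_t count;
            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("interrupt events so far: %u\n", count);

            munmap(bar0, 4096);
            close(fd);
            return 0;
    }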
Signed-off-by: Frank van der Linden --- drivers/amazon/Kconfig | 9 + drivers/amazon/net/Makefile | 1 + drivers/amazon/net/igb_uio/Makefile | 1 + drivers/amazon/net/igb_uio/compat.h | 154 ++++++ drivers/amazon/net/igb_uio/igb_uio.c | 674 +++++++++++++++++++++++++++ 5 files changed, 839 insertions(+) create mode 100644 drivers/amazon/net/igb_uio/Makefile create mode 100644 drivers/amazon/net/igb_uio/compat.h create mode 100644 drivers/amazon/net/igb_uio/igb_uio.c diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig index 68b807d53c462..2012cb50eb2a1 100644 --- a/drivers/amazon/Kconfig +++ b/drivers/amazon/Kconfig @@ -30,4 +30,13 @@ config AMAZON_EFA_INFINIBAND To compile this driver as a module, choose M here. The module will be called efa +config AMAZON_IGB_UIO + tristate "DPDK igb_uio driver" + help + This is the direct PCI access driver for igb and + other PCI network devices, for DPDK. + + To compile this driver as a module, choose M here. + The module will be called igb_uio. + endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/net/Makefile b/drivers/amazon/net/Makefile index 197c9a061fdf0..7eb6f214798ee 100644 --- a/drivers/amazon/net/Makefile +++ b/drivers/amazon/net/Makefile @@ -3,3 +3,4 @@ # obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena/ obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa/ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio/ diff --git a/drivers/amazon/net/igb_uio/Makefile b/drivers/amazon/net/igb_uio/Makefile new file mode 100644 index 0000000000000..ebced2786f7c8 --- /dev/null +++ b/drivers/amazon/net/igb_uio/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_AMAZON_IGB_UIO) += igb_uio.o diff --git a/drivers/amazon/net/igb_uio/compat.h b/drivers/amazon/net/igb_uio/compat.h new file mode 100644 index 0000000000000..8dbb896ae1185 --- /dev/null +++ b/drivers/amazon/net/igb_uio/compat.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Minimal wrappers to allow compiling igb_uio on older kernels. 
+ */ + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b)) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +#define pci_cfg_access_lock pci_block_user_cfg_access +#define pci_cfg_access_unlock pci_unblock_user_cfg_access +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0) +#define HAVE_PTE_MASK_PAGE_IOMAP +#endif + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +/* + * for kernels < 2.6.38 and backported patch that moves MSI-X entry definition + * to pci_regs.h Those kernels has PCI_MSIX_ENTRY_SIZE defined but not + * PCI_MSIX_ENTRY_CTRL_MASKBIT + */ +#ifndef PCI_MSIX_ENTRY_CTRL_MASKBIT +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 34) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(5, 9))) + +static int pci_num_vf(struct pci_dev *dev) +{ + struct iov { + int pos; + int nres; + u32 cap; + u16 ctrl; + u16 total; + u16 initial; + u16 nr_virtfn; + } *iov = (struct iov *)dev->sriov; + + if (!dev->is_physfn) + return 0; + + return iov->nr_virtfn; +} + +#endif /* < 2.6.34 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 39) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) + +#define kstrtoul strict_strtoul + +#endif /* < 2.6.39 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) && \ + (!(defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 3))) + +/* Check if INTX works to control irq's. + * Set's INTX_DISABLE flag and reads it back + */ +static bool pci_intx_mask_supported(struct pci_dev *pdev) +{ + bool mask_supported = false; + uint16_t orig, new; + + pci_block_user_cfg_access(pdev); + pci_read_config_word(pdev, PCI_COMMAND, &orig); + pci_write_config_word(pdev, PCI_COMMAND, + orig ^ PCI_COMMAND_INTX_DISABLE); + pci_read_config_word(pdev, PCI_COMMAND, &new); + + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { + dev_err(&pdev->dev, "Command register changed from " + "0x%x to 0x%x: driver or hardware bug?\n", orig, new); + } else if ((new ^ orig) & PCI_COMMAND_INTX_DISABLE) { + mask_supported = true; + pci_write_config_word(pdev, PCI_COMMAND, orig); + } + pci_unblock_user_cfg_access(pdev); + + return mask_supported; +} + +static bool pci_check_and_mask_intx(struct pci_dev *pdev) +{ + bool pending; + uint32_t status; + + pci_block_user_cfg_access(pdev); + pci_read_config_dword(pdev, PCI_COMMAND, &status); + + /* interrupt is not ours, goes to out */ + pending = (((status >> 16) & PCI_STATUS_INTERRUPT) != 0); + if (pending) { + uint16_t old, new; + + old = status; + if (status != 0) + new = old & (~PCI_COMMAND_INTX_DISABLE); + else + new = old | PCI_COMMAND_INTX_DISABLE; + + if (old != new) + pci_write_config_word(pdev, PCI_COMMAND, new); + } + pci_unblock_user_cfg_access(pdev); + + return pending; +} + +#endif /* < 3.3.0 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) +#define HAVE_PCI_IS_BRIDGE_API 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +#define HAVE_MSI_LIST_IN_GENERIC_DEVICE 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define HAVE_PCI_MSI_MASK_IRQ 1 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) +#define HAVE_ALLOC_IRQ_VECTORS 1 +#endif + +static inline bool igbuio_kernel_is_locked_down(void) +{ +#ifdef CONFIG_LOCK_DOWN_KERNEL +#ifdef CONFIG_LOCK_DOWN_IN_EFI_SECURE_BOOT + return kernel_is_locked_down(NULL); 
+#elif defined(CONFIG_EFI_SECURE_BOOT_LOCK_DOWN) + return kernel_is_locked_down(); +#else + return false; +#endif +#else + return false; +#endif +} diff --git a/drivers/amazon/net/igb_uio/igb_uio.c b/drivers/amazon/net/igb_uio/igb_uio.c new file mode 100644 index 0000000000000..ea439d131de1a --- /dev/null +++ b/drivers/amazon/net/igb_uio/igb_uio.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0 +/*- + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * These enum and macro definitions are copied from the + * file rte_pci_dev_features.h + */ +enum rte_intr_mode { + RTE_INTR_MODE_NONE = 0, + RTE_INTR_MODE_LEGACY, + RTE_INTR_MODE_MSI, + RTE_INTR_MODE_MSIX +}; +#define RTE_INTR_MODE_NONE_NAME "none" +#define RTE_INTR_MODE_LEGACY_NAME "legacy" +#define RTE_INTR_MODE_MSI_NAME "msi" +#define RTE_INTR_MODE_MSIX_NAME "msix" + + +#include "compat.h" + +/** + * A structure describing the private information for a uio device. + */ +struct rte_uio_pci_dev { + struct uio_info info; + struct pci_dev *pdev; + enum rte_intr_mode mode; + atomic_t refcnt; +}; + +static int wc_activate; +static char *intr_mode; +static enum rte_intr_mode igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; +/* sriov sysfs */ +static ssize_t +show_max_vfs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 10, "%u\n", dev_num_vf(dev)); +} + +static ssize_t +store_max_vfs(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int err = 0; + unsigned long max_vfs; + struct pci_dev *pdev = to_pci_dev(dev); + + if (0 != kstrtoul(buf, 0, &max_vfs)) + return -EINVAL; + + if (0 == max_vfs) + pci_disable_sriov(pdev); + else if (0 == pci_num_vf(pdev)) + err = pci_enable_sriov(pdev, max_vfs); + else /* do nothing if change max_vfs number */ + err = -EINVAL; + + return err ? err : count; +} + +static DEVICE_ATTR(max_vfs, S_IRUGO | S_IWUSR, show_max_vfs, store_max_vfs); + +static struct attribute *dev_attrs[] = { + &dev_attr_max_vfs.attr, + NULL, +}; + +static const struct attribute_group dev_attr_grp = { + .attrs = dev_attrs, +}; + +#ifndef HAVE_PCI_MSI_MASK_IRQ +/* + * It masks the msix on/off of generating MSI-X messages. + */ +static void +igbuio_msix_mask_irq(struct msi_desc *desc, s32 state) +{ + u32 mask_bits = desc->masked; + unsigned int offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL; + + if (state != 0) + mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + else + mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + + if (mask_bits != desc->masked) { + writel(mask_bits, desc->mask_base + offset); + readl(desc->mask_base); + desc->masked = mask_bits; + } +} + +/* + * It masks the msi on/off of generating MSI messages. 
+ */ +static void +igbuio_msi_mask_irq(struct pci_dev *pdev, struct msi_desc *desc, int32_t state) +{ + u32 mask_bits = desc->masked; + u32 offset = desc->irq - pdev->irq; + u32 mask = 1 << offset; + + if (!desc->msi_attrib.maskbit) + return; + + if (state != 0) + mask_bits &= ~mask; + else + mask_bits |= mask; + + if (mask_bits != desc->masked) { + pci_write_config_dword(pdev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; + } +} + +static void +igbuio_mask_irq(struct pci_dev *pdev, enum rte_intr_mode mode, s32 irq_state) +{ + struct msi_desc *desc; + struct list_head *msi_list; + +#ifdef HAVE_MSI_LIST_IN_GENERIC_DEVICE + msi_list = &pdev->dev.msi_list; +#else + msi_list = &pdev->msi_list; +#endif + + if (mode == RTE_INTR_MODE_MSIX) { + list_for_each_entry(desc, msi_list, list) + igbuio_msix_mask_irq(desc, irq_state); + } else if (mode == RTE_INTR_MODE_MSI) { + list_for_each_entry(desc, msi_list, list) + igbuio_msi_mask_irq(pdev, desc, irq_state); + } +} +#endif + +/** + * This is the irqcontrol callback to be registered to uio_info. + * It can be used to disable/enable interrupt from user space processes. + * + * @param info + * pointer to uio_info. + * @param irq_state + * state value. 1 to enable interrupt, 0 to disable interrupt. + * + * @return + * - On success, 0. + * - On failure, a negative value. + */ +static int +igbuio_pci_irqcontrol(struct uio_info *info, s32 irq_state) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *pdev = udev->pdev; + +#ifdef HAVE_PCI_MSI_MASK_IRQ + struct irq_data *irq = irq_get_irq_data(udev->info.irq); +#endif + + pci_cfg_access_lock(pdev); + + if (udev->mode == RTE_INTR_MODE_MSIX || udev->mode == RTE_INTR_MODE_MSI) { +#ifdef HAVE_PCI_MSI_MASK_IRQ + if (irq_state == 1) + pci_msi_unmask_irq(irq); + else + pci_msi_mask_irq(irq); +#else + igbuio_mask_irq(pdev, udev->mode, irq_state); +#endif + } + + if (udev->mode == RTE_INTR_MODE_LEGACY) + pci_intx(pdev, !!irq_state); + + pci_cfg_access_unlock(pdev); + + return 0; +} + +/** + * This is interrupt handler which will check if the interrupt is for the right device. + * If yes, disable it here and will be enable later. 
+ */ +static irqreturn_t +igbuio_pci_irqhandler(int irq, void *dev_id) +{ + struct rte_uio_pci_dev *udev = (struct rte_uio_pci_dev *)dev_id; + struct uio_info *info = &udev->info; + + /* Legacy mode need to mask in hardware */ + if (udev->mode == RTE_INTR_MODE_LEGACY && + !pci_check_and_mask_intx(udev->pdev)) + return IRQ_NONE; + + uio_event_notify(info); + + /* Message signal mode, no share IRQ and automasked */ + return IRQ_HANDLED; +} + +static int +igbuio_pci_enable_interrupts(struct rte_uio_pci_dev *udev) +{ + int err = 0; +#ifndef HAVE_ALLOC_IRQ_VECTORS + struct msix_entry msix_entry; +#endif + + switch (igbuio_intr_mode_preferred) { + case RTE_INTR_MODE_MSIX: + /* Only 1 msi-x vector needed */ +#ifndef HAVE_ALLOC_IRQ_VECTORS + msix_entry.entry = 0; + if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = msix_entry.vector; + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI-X"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSIX; + break; + } +#endif + + /* falls through - to MSI */ + case RTE_INTR_MODE_MSI: +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (pci_enable_msi(udev->pdev) == 0) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#else + if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSI) == 1) { + dev_dbg(&udev->pdev->dev, "using MSI"); + udev->info.irq_flags = IRQF_NO_THREAD; + udev->info.irq = pci_irq_vector(udev->pdev, 0); + udev->mode = RTE_INTR_MODE_MSI; + break; + } +#endif + /* falls through - to INTX */ + case RTE_INTR_MODE_LEGACY: + if (pci_intx_mask_supported(udev->pdev)) { + dev_dbg(&udev->pdev->dev, "using INTX"); + udev->info.irq_flags = IRQF_SHARED | IRQF_NO_THREAD; + udev->info.irq = udev->pdev->irq; + udev->mode = RTE_INTR_MODE_LEGACY; + break; + } + dev_notice(&udev->pdev->dev, "PCI INTX mask not supported\n"); + /* falls through - to no IRQ */ + case RTE_INTR_MODE_NONE: + udev->mode = RTE_INTR_MODE_NONE; + udev->info.irq = UIO_IRQ_NONE; + break; + + default: + dev_err(&udev->pdev->dev, "invalid IRQ mode %u", + igbuio_intr_mode_preferred); + udev->info.irq = UIO_IRQ_NONE; + err = -EINVAL; + } + + if (udev->info.irq != UIO_IRQ_NONE) + err = request_irq(udev->info.irq, igbuio_pci_irqhandler, + udev->info.irq_flags, udev->info.name, + udev); + dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n", + udev->info.irq); + + return err; +} + +static void +igbuio_pci_disable_interrupts(struct rte_uio_pci_dev *udev) +{ + if (udev->info.irq) { + free_irq(udev->info.irq, udev); + udev->info.irq = 0; + } + +#ifndef HAVE_ALLOC_IRQ_VECTORS + if (udev->mode == RTE_INTR_MODE_MSIX) + pci_disable_msix(udev->pdev); + if (udev->mode == RTE_INTR_MODE_MSI) + pci_disable_msi(udev->pdev); +#else + if (udev->mode == RTE_INTR_MODE_MSIX || + udev->mode == RTE_INTR_MODE_MSI) + pci_free_irq_vectors(udev->pdev); +#endif +} + + +/** + * This gets called while opening uio device file. 
+ */ +static int +igbuio_pci_open(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + int err; + + if (atomic_inc_return(&udev->refcnt) != 1) + return 0; + + /* set bus master, which was cleared by the reset function */ + pci_set_master(dev); + + /* enable interrupts */ + err = igbuio_pci_enable_interrupts(udev); + if (err) { + atomic_dec(&udev->refcnt); + dev_err(&dev->dev, "Enable interrupt fails\n"); + } + return err; +} + +static int +igbuio_pci_release(struct uio_info *info, struct inode *inode) +{ + struct rte_uio_pci_dev *udev = info->priv; + struct pci_dev *dev = udev->pdev; + + if (atomic_dec_and_test(&udev->refcnt)) { + /* disable interrupts */ + igbuio_pci_disable_interrupts(udev); + + /* stop the device from further DMA */ + pci_clear_master(dev); + } + + return 0; +} + +/* Remap pci resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + void *internal_addr; + + if (n >= ARRAY_SIZE(info->mem)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -1; + if (wc_activate == 0) { + internal_addr = ioremap(addr, len); + if (internal_addr == NULL) + return -1; + } else { + internal_addr = NULL; + } + info->mem[n].name = name; + info->mem[n].addr = addr; + info->mem[n].internal_addr = internal_addr; + info->mem[n].size = len; + info->mem[n].memtype = UIO_MEM_PHYS; + return 0; +} + +/* Get pci port io resources described by bar #pci_bar in uio resource n. */ +static int +igbuio_pci_setup_ioport(struct pci_dev *dev, struct uio_info *info, + int n, int pci_bar, const char *name) +{ + unsigned long addr, len; + + if (n >= ARRAY_SIZE(info->port)) + return -EINVAL; + + addr = pci_resource_start(dev, pci_bar); + len = pci_resource_len(dev, pci_bar); + if (addr == 0 || len == 0) + return -EINVAL; + + info->port[n].name = name; + info->port[n].start = addr; + info->port[n].size = len; + info->port[n].porttype = UIO_PORT_X86; + + return 0; +} + +/* Unmap previously ioremap'd resources */ +static void +igbuio_pci_release_iomem(struct uio_info *info) +{ + int i; + + for (i = 0; i < MAX_UIO_MAPS; i++) { + if (info->mem[i].internal_addr) + iounmap(info->mem[i].internal_addr); + } +} + +static int +igbuio_setup_bars(struct pci_dev *dev, struct uio_info *info) +{ + int i, iom, iop, ret; + unsigned long flags; + static const char *bar_names[PCI_STD_RESOURCE_END + 1] = { + "BAR0", + "BAR1", + "BAR2", + "BAR3", + "BAR4", + "BAR5", + }; + + iom = 0; + iop = 0; + + for (i = 0; i < ARRAY_SIZE(bar_names); i++) { + if (pci_resource_len(dev, i) != 0 && + pci_resource_start(dev, i) != 0) { + flags = pci_resource_flags(dev, i); + if (flags & IORESOURCE_MEM) { + ret = igbuio_pci_setup_iomem(dev, info, iom, + i, bar_names[i]); + if (ret != 0) + return ret; + iom++; + } else if (flags & IORESOURCE_IO) { + ret = igbuio_pci_setup_ioport(dev, info, iop, + i, bar_names[i]); + if (ret != 0) + return ret; + iop++; + } + } + } + + return (iom != 0 || iop != 0) ? 
ret : -ENOENT; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0) +static int __devinit +#else +static int +#endif +igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct rte_uio_pci_dev *udev; + dma_addr_t map_dma_addr; + void *map_addr; + int err; + +#ifdef HAVE_PCI_IS_BRIDGE_API + if (pci_is_bridge(dev)) { + dev_warn(&dev->dev, "Ignoring PCI bridge device\n"); + return -ENODEV; + } +#endif + + udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL); + if (!udev) + return -ENOMEM; + + /* + * enable device: ask low-level code to enable I/O and + * memory + */ + err = pci_enable_device(dev); + if (err != 0) { + dev_err(&dev->dev, "Cannot enable PCI device\n"); + goto fail_free; + } + + /* enable bus mastering on the device */ + pci_set_master(dev); + + /* remap IO memory */ + err = igbuio_setup_bars(dev, &udev->info); + if (err != 0) + goto fail_release_iomem; + + /* set 64-bit DMA mask */ + err = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set DMA mask\n"); + goto fail_release_iomem; + } + + err = pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64)); + if (err != 0) { + dev_err(&dev->dev, "Cannot set consistent DMA mask\n"); + goto fail_release_iomem; + } + + /* fill uio infos */ + udev->info.name = "igb_uio"; + udev->info.version = "0.1"; + udev->info.irqcontrol = igbuio_pci_irqcontrol; + udev->info.open = igbuio_pci_open; + udev->info.release = igbuio_pci_release; + udev->info.priv = udev; + udev->pdev = dev; + atomic_set(&udev->refcnt, 0); + + err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp); + if (err != 0) + goto fail_release_iomem; + + /* register uio driver */ + err = uio_register_device(&dev->dev, &udev->info); + if (err != 0) + goto fail_remove_group; + + pci_set_drvdata(dev, udev); + + /* + * Doing a harmless dma mapping for attaching the device to + * the iommu identity mapping if kernel boots with iommu=pt. + * Note this is not a problem if no IOMMU at all. 
+ */ + map_addr = dma_alloc_coherent(&dev->dev, 1024, &map_dma_addr, + GFP_KERNEL); + if (map_addr) + memset(map_addr, 0, 1024); + + if (!map_addr) + dev_info(&dev->dev, "dma mapping failed\n"); + else { + dev_info(&dev->dev, "mapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + + dma_free_coherent(&dev->dev, 1024, map_addr, map_dma_addr); + dev_info(&dev->dev, "unmapping 1K dma=%#llx host=%p\n", + (unsigned long long)map_dma_addr, map_addr); + } + + return 0; + +fail_remove_group: + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); +fail_release_iomem: + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); +fail_free: + kfree(udev); + + return err; +} + +static void +igbuio_pci_remove(struct pci_dev *dev) +{ + struct rte_uio_pci_dev *udev = pci_get_drvdata(dev); + + igbuio_pci_release(&udev->info, NULL); + + sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp); + uio_unregister_device(&udev->info); + igbuio_pci_release_iomem(&udev->info); + pci_disable_device(dev); + pci_set_drvdata(dev, NULL); + kfree(udev); +} + +static int +igbuio_config_intr_mode(char *intr_str) +{ + if (!intr_str) { + pr_info("Use MSIX interrupt by default\n"); + return 0; + } + + if (!strcmp(intr_str, RTE_INTR_MODE_MSIX_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSIX; + pr_info("Use MSIX interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_MSI_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_MSI; + pr_info("Use MSI interrupt\n"); + } else if (!strcmp(intr_str, RTE_INTR_MODE_LEGACY_NAME)) { + igbuio_intr_mode_preferred = RTE_INTR_MODE_LEGACY; + pr_info("Use legacy interrupt\n"); + } else { + pr_info("Error: bad parameter - %s\n", intr_str); + return -EINVAL; + } + + return 0; +} + +static struct pci_driver igbuio_pci_driver = { + .name = "igb_uio", + .id_table = NULL, + .probe = igbuio_pci_probe, + .remove = igbuio_pci_remove, +}; + +static int __init +igbuio_pci_init_module(void) +{ + int ret; + + if (igbuio_kernel_is_locked_down()) { + pr_err("Not able to use module, kernel lock down is enabled\n"); + return -EINVAL; + } + + if (wc_activate != 0) + pr_info("wc_activate is set\n"); + + ret = igbuio_config_intr_mode(intr_mode); + if (ret < 0) + return ret; + + return pci_register_driver(&igbuio_pci_driver); +} + +static void __exit +igbuio_pci_exit_module(void) +{ + pci_unregister_driver(&igbuio_pci_driver); +} + +module_init(igbuio_pci_init_module); +module_exit(igbuio_pci_exit_module); + +module_param(intr_mode, charp, S_IRUGO); +MODULE_PARM_DESC(intr_mode, +"igb_uio interrupt mode (default=msix):\n" +" " RTE_INTR_MODE_MSIX_NAME " Use MSIX interrupt\n" +" " RTE_INTR_MODE_MSI_NAME " Use MSI interrupt\n" +" " RTE_INTR_MODE_LEGACY_NAME " Use Legacy interrupt\n" +"\n"); + +module_param(wc_activate, int, 0); +MODULE_PARM_DESC(wc_activate, +"Activate support for write combining (WC) (default=0)\n" +" 0 - disable\n" +" other - enable\n"); + +MODULE_DESCRIPTION("UIO driver for Intel IGB PCI cards"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); From 15bac25831308e4165b8b43a221f5db5a06540a1 Mon Sep 17 00:00:00 2001 From: Nagendra S Tomar Date: Tue, 16 Mar 2021 10:25:14 +0000 Subject: [PATCH 112/737] nfs: Subsequent READDIR calls should carry non-zero cookieverifier If the loop in nfs_readdir_xdr_to_array() runs more than once, subsequent READDIR RPCs may wrongly carry a zero cookie verifier and non-zero cookie. Make sure subsequent calls to READDIR carry the cookie verifier returned by the first call. 
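To make the on-the-wire rule concrete, here is a hedged pseudo-C sketch of the request
sequence the fix restores (the helper names are illustrative, not the actual fs/nfs
functions):

  /* Illustrative only: READDIR paging with the server's verifier echoed back. */
  u64    cookie  = 0;
  __be32 verf[2] = { 0, 0 };            /* first call: zero cookie, zero verifier */

  do {
          status = READDIR(dir, cookie, verf, &entries, verf_res);
          if (status)
                  break;
          cookie = last_entry_cookie(entries);    /* resume point for the next call */
          memcpy(verf, verf_res, sizeof(verf));   /* carry the returned verifier    */
  } while (!all_entries_returned(entries));

Without the one-line assignment added below, later iterations kept sending a zero
verifier together with a non-zero cookie, which a server may treat as a bad cookie.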
Signed-off-by: Nagendra S Tomar Fixes: b593c09f83a2 ("NFS: Improve handling of directory verifiers") Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 800ea6a74fffa..bcf2ec2e1d14a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -865,6 +865,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, break; } + verf_arg = verf_res; + status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); } while (!status && nfs_readdir_page_needs_filling(page)); From c468edb2a891bc92c29e44afda72de22b552ae21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 16 Mar 2021 07:57:40 -0400 Subject: [PATCH 113/737] NFS: Fix handling of cookie verifier in uncached_readdir() If we're doing uncached readdir(), then the readdir cookie could be different from the one cached in the nfs_inode. We should therefore ensure that we save that one in the struct nfs_open_dir_context. Fixes: 35df59d3ef69 ("NFS: Reduce number of RPC calls when doing uncached readdir") Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bcf2ec2e1d14a..21c48c94b2d26 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -975,10 +975,10 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, + const __be32 *verf) { struct file *file = desc->file; - struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -992,7 +992,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) desc->eof = true; break; } - memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); + memcpy(desc->verf, verf, sizeof(desc->verf)); if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -1049,7 +1049,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { desc->page = arrays[i]; - nfs_do_filldir(desc); + nfs_do_filldir(desc, verf); } desc->page = NULL; @@ -1070,6 +1070,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; int res; @@ -1123,7 +1124,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); desc->page_index = 0; desc->plus = false; @@ -1133,7 +1134,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - nfs_do_filldir(desc); + nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); } while (!desc->eof); From 97cd31f0d054d5baf21db00475f51f96eedab4a7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 17 Mar 2021 08:46:19 -0400 Subject: [PATCH 114/737] NFS: Only change the cookie verifier if the directory page cache is empty The cached NFSv3/v4 readdir cookies are associated with a verifier, which is checked by the server on subsequent calls to readdir, and is only 
expected to change when the cookies (and hence also the page cache contents) are considered invalid. We therefore do have to store the verifier, but only when the page cache is empty. Fixes: b593c09f83a2 ("NFS: Improve handling of directory verifiers") Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 21c48c94b2d26..616add1720538 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -928,7 +928,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } - memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); + /* + * Set the cookie verifier if the page cache was empty + */ + if (desc->page_index == 0) + memcpy(nfsi->cookieverf, verf, + sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { From 47c95cac6aa2e11ece1984d9077e737da37aec04 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Apr 2019 10:00:01 +0530 Subject: [PATCH 115/737] Sysfs memory probe interface (/sys/devices/system/memory/probe) can accept starting physical address of an entire memory block to be hot added into the kernel. This is in addition to the existing ACPI based interface. This just enables it with the required config CONFIG_ARCH_MEMORY_PROBE. Signed-off-by: Anshuman Khandual --- arch/arm64/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 34bd4cba81e66..cd08651ca92bb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -302,6 +302,15 @@ config ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y +config ARCH_MEMORY_PROBE + bool "Enable /sys/devices/system/memory/probe interface" + depends on MEMORY_HOTPLUG + help + This option enables a sysfs /sys/devices/system/memory/probe + interface for testing. See Documentation/memory-hotplug.txt + for more information. If you are unsure how to answer this + question, answer N. + config SMP def_bool y From 94750c21f3b2f92c0486146109f5d717f826a77d Mon Sep 17 00:00:00 2001 From: Rohit Wali Date: Wed, 14 Jul 2021 17:30:08 +0000 Subject: [PATCH 116/737] arm64/mm: Enable sysfs based memory hot remove probe Issue: Offlining non-boot memory on arm64 via /sys/devices/system/memory//state doesnt eliminate the struct page memory associated with the offlined memory. As memory is offlined, total and free memory reduce but the memory associated with struct page isnt given back and is reported as 'used' memory instead. This is because offlining via the sysfs 'state' probe doesnt remove the memmap associated with the memory to be offlined. Fix: Expose a sysfs probe that also removes memmap associated with the memory block after offlining it. Probe exposed accepts the physical address of a memory block to be removed. Signed-off-by: Rohit Wali --- arch/arm64/Kconfig | 9 +++++++++ drivers/base/memory.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cd08651ca92bb..63fc7f80ad918 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -311,6 +311,15 @@ config ARCH_MEMORY_PROBE for more information. If you are unsure how to answer this question, answer N. +config ARCH_MEMORY_REMOVE + bool "Enable /sys/devices/system/memory/remove interface" + depends on MEMORY_HOTREMOVE + help + This option enables a sysfs /sys/devices/system/memory/remove + interface for testing. See Documentation/memory-hotplug.txt + for more information. 
If you are unsure how to answer this + question, answer N. + config SMP def_bool y diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 49eb14271f287..4f039436ac1ef 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -447,6 +447,34 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_WO(probe); #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 phys_addr; + int nid, ret; + unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; + + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) + return -EINVAL; + + nid = memory_add_physaddr_to_nid(phys_addr); + ret = offline_and_remove_memory(nid, phys_addr, MIN_MEMORY_BLOCK_SIZE * sections_per_block); + + if (ret) + return ret; + + return count; +} + +static DEVICE_ATTR_WO(remove); +#endif + + #ifdef CONFIG_MEMORY_FAILURE /* * Support for offlining pages of memory @@ -691,6 +719,9 @@ static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, #endif +#ifdef CONFIG_ARCH_MEMORY_REMOVE + &dev_attr_remove.attr, +#endif #ifdef CONFIG_MEMORY_FAILURE &dev_attr_soft_offline_page.attr, From 34a12ce0ff860e64bdd786752961db6f4236e783 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 27 Jul 2021 21:30:32 +0000 Subject: [PATCH 117/737] lustre: update to AmazonFSxLustreClient v2.10.8-8 Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/config.h | 12 ++++++++++++ .../libcfs/include/libcfs/linux/linux-net.h | 4 ++++ .../lustrefsx/lustre/include/lustre_compat.h | 13 +++++++++++++ drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c | 4 ++++ .../staging/lustrefsx/lustre/llite/llite_internal.h | 2 +- drivers/staging/lustrefsx/lustre/llite/llite_lib.c | 5 ++--- drivers/staging/lustrefsx/lustre/llite/llite_mmap.c | 13 +++++++++---- drivers/staging/lustrefsx/lustre/llite/namei.c | 3 +-- drivers/staging/lustrefsx/lustre/llite/vvp_io.c | 9 ++++++++- drivers/staging/lustrefsx/lustre/lov/lov_io.c | 4 +++- drivers/staging/lustrefsx/undef.h | 12 ++++++++++++ 11 files changed, 69 insertions(+), 12 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 2ecd0c99d3809..cea872bd120de 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -735,6 +735,9 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ /* #undef HAVE_SECURITY_IINITSEC_QSTR */ +/* security_release_secctx has 1 arg. 
*/ +/* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ + /* support for selinux */ #define HAVE_SELINUX 1 @@ -847,6 +850,12 @@ /* tcp_sendpage use socket as first parameter */ /* #undef HAVE_TCP_SENDPAGE_USE_SOCKET */ +/* 'tcp_sock_set_keepidle()' exists */ +#define HAVE_TCP_SOCK_SET_KEEPIDLE 1 + +/* 'tcp_sock_set_nodelay()' exists */ +#define HAVE_TCP_SOCK_SET_NODELAY 1 + /* timer_setup has replaced setup_timer */ #define HAVE_TIMER_SETUP 1 @@ -901,6 +910,9 @@ /* virtual_address has been replaced by address field */ #define HAVE_VM_FAULT_ADDRESS 1 +/* if VM_FAULT_RETRY is defined */ +#define HAVE_VM_FAULT_RETRY 1 + /* if vm_fault_t type exists */ #define HAVE_VM_FAULT_T 1 diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h index 41484bd3b44a4..98951f7a5d4bb 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-net.h @@ -35,6 +35,7 @@ static inline void tcp_sock_set_quickack(struct sock *sk, int opt) (char *)&opt, sizeof(opt)); } +#if !defined(HAVE_TCP_SOCK_SET_NODELAY) static inline void tcp_sock_set_nodelay(struct sock *sk) { int opt = 1; @@ -43,7 +44,9 @@ static inline void tcp_sock_set_nodelay(struct sock *sk) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt)); } +#endif /* HAVE_TCP_SOCK_SET_NODELAY */ +#if !defined(HAVE_TCP_SOCK_SET_KEEPIDLE) static inline int tcp_sock_set_keepidle(struct sock *sk, int opt) { struct socket *sock = sk->sk_socket; @@ -51,6 +54,7 @@ static inline int tcp_sock_set_keepidle(struct sock *sk, int opt) return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&opt, sizeof(opt)); } +#endif /* HAVE_TCP_SOCK_SET_KEEPIDLE */ static inline int tcp_sock_set_keepintvl(struct sock *sk, int opt) { diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 408efb8953cbc..441f737170daa 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -710,4 +711,16 @@ static inline struct timespec current_time(struct inode *inode) kmem_cache_create(name, size, align, flags, ctor) #endif +static inline void ll_security_release_secctx(char *secdata, u32 seclen) +{ +#ifdef HAVE_SEC_RELEASE_SECCTX_1ARG + struct lsmcontext context = { }; + + lsmcontext_init(&context, secdata, seclen, 0); + return security_release_secctx(&context); +#else + return security_release_secctx(secdata, seclen); +#endif +} + #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c index f6e429ba182c3..a5fe1978c66a2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -181,6 +181,10 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md) } } else { result = cl_conf_set(env, lli->lli_clob, &conf); + if (result == -EBUSY) { + /* ignore the error since I/O will handle it later */ + result = 0; + } } if (result != 0) diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h index 4acb7cdcf2aff..ce05c17a2231f 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -54,7 
+54,7 @@ #define FMODE_EXEC 0 #endif -#ifndef VM_FAULT_RETRY +#ifndef HAVE_VM_FAULT_RETRY #define VM_FAULT_RETRY 0 #endif diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 297622d3d88f6..644b1c4e26d47 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -46,7 +46,6 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include #include #ifdef HAVE_UAPI_LINUX_MOUNT_H @@ -2530,8 +2529,8 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, void ll_finish_md_op_data(struct md_op_data *op_data) { - security_release_secctx(op_data->op_file_secctx, - op_data->op_file_secctx_size); + ll_security_release_secctx(op_data->op_file_secctx, + op_data->op_file_secctx_size); OBD_FREE_PTR(op_data); } diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 2c6c54f47af61..e286c559c1f67 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -268,19 +268,24 @@ static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(env)) RETURN(PTR_ERR(env)); - if (fault_flag_allow_retry_first(vmf->flags) && - ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) { + if (ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) { /* do fast fault */ + bool has_retry = vmf->flags & FAULT_FLAG_RETRY_NOWAIT; + + /* To avoid loops, instruct downstream to not drop mmap_sem */ + vmf->flags |= FAULT_FLAG_RETRY_NOWAIT; ll_cl_add(vma->vm_file, env, NULL, LCC_MMAP); fault_ret = ll_filemap_fault(vma, vmf); ll_cl_remove(vma->vm_file, env); + if (!has_retry) + vmf->flags &= ~FAULT_FLAG_RETRY_NOWAIT; /* - If there is no error, then the page was found in cache and * uptodate; * - If VM_FAULT_RETRY is set, the page existed but failed to - * lock. It will return to kernel and retry; + * lock. We will try slow path to avoid loops. * - Otherwise, it should try normal fault under DLM lock. */ - if ((fault_ret & VM_FAULT_RETRY) || + if (!(fault_ret & VM_FAULT_RETRY) && !(fault_ret & VM_FAULT_ERROR)) GOTO(out, result = 0); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index 622f9a44f407c..ae7101b1885f2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #ifdef HAVE_UIDGID_HEADER # include @@ -785,7 +784,7 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, if (it_disposition(it, DISP_OPEN_CREATE)) { /* Dentry instantiated in ll_create_it. */ rc = ll_create_it(dir, dentry, it, secctx, secctxlen); - security_release_secctx(secctx, secctxlen); + ll_security_release_secctx(secctx, secctxlen); if (rc) { /* We dget in ll_splice_alias. 
*/ if (de != NULL) diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index a1280f9bff131..1bcadeb7cf0da 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -354,7 +354,14 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) end = start + io->u.ci_rw.rw_range.cir_count; } } else if (cl_io_is_trunc(io)) { - end = io->u.ci_setattr.sa_attr.lvb_size; + /* for writes, e_end is endpos, the location of the file + * pointer after the write is completed, so it is not accessed. + * For truncate, 'end' is the size, and *is* acccessed. + * In other words, writes are [start, end), but truncate is + * [start, size], where both are included. So add 1 to the + * size when creating the write intent to account for this. + */ + end = io->u.ci_setattr.sa_attr.lvb_size + 1; } else { /* mkwrite */ pgoff_t index = io->u.ci_fault.ft_index; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c index f40dfa274c356..5544a9744b73e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_io.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -569,7 +569,9 @@ static int lov_io_setattr_iter_init(const struct lu_env *env, ENTRY; if (cl_io_is_trunc(io) && lio->lis_pos > 0) { - index = lov_lsm_entry(lsm, lio->lis_pos - 1); + index = lov_lsm_entry(lsm, lio->lis_pos); + CDEBUG(D_VFSTRACE, "component[%d] flags %#x pos %llu\n", + index, lsm->lsm_entries[index]->lsme_flags, lio->lis_pos); if (index > 0 && !lsm_entry_inited(lsm, index)) { io->ci_need_write_intent = 1; RETURN(io->ci_result = -ENODATA); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index b1ea346eb8f4a..aa1343bf5a36d 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -733,6 +733,9 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ #undef HAVE_SECURITY_IINITSEC_QSTR +/* security_release_secctx has 1 arg. */ +#undef HAVE_SEC_RELEASE_SECCTX_1ARG + /* support for selinux */ #undef HAVE_SELINUX @@ -845,6 +848,12 @@ /* tcp_sendpage use socket as first parameter */ #undef HAVE_TCP_SENDPAGE_USE_SOCKET +/* 'tcp_sock_set_keepidle()' exists */ +#undef HAVE_TCP_SOCK_SET_KEEPIDLE + +/* 'tcp_sock_set_nodelay()' exists */ +#undef HAVE_TCP_SOCK_SET_NODELAY + /* timer_setup has replaced setup_timer */ #undef HAVE_TIMER_SETUP @@ -899,6 +908,9 @@ /* virtual_address has been replaced by address field */ #undef HAVE_VM_FAULT_ADDRESS +/* if VM_FAULT_RETRY is defined */ +#undef HAVE_VM_FAULT_RETRY + /* if vm_fault_t type exists */ #undef HAVE_VM_FAULT_T From 73c9d9ce47cff292444b164e8ae071eebe1ffa0d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 26 Jul 2021 16:51:55 +0200 Subject: [PATCH 118/737] efi/libstub: arm64: Warn when efi_random_alloc() fails Randomization of the physical load address of the kernel image relies on efi_random_alloc() returning successfully, and currently, we ignore any failures and just carry on, using the ordinary, non-randomized page allocator routine. This means we never find out if a failure occurs, which could harm security, so let's at least warn about this condition. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Benjamin Herrenschmidt --- drivers/firmware/efi/libstub/arm64-stub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 7f4bafcd9d335..58943f9290b40 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -134,6 +134,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, */ status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed); + if (status != EFI_SUCCESS) + efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { status = EFI_OUT_OF_RESOURCES; } From d81184b5d92838c419160107c085bd1099f5c768 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 30 Aug 2021 17:46:02 +0530 Subject: [PATCH 119/737] mm/page_alloc: Print node fallback order Print information message about the allocation fallback order for each NUMA node during boot. No functional changes here. This makes it easier to illustrate the problem in the node fallback list generation, which the next patch fixes. Signed-off-by: Bharata B Rao --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d85435db35f37..cc37faa3852c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5886,6 +5886,10 @@ static void build_zonelists(pg_data_t *pgdat) build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + pr_info("Fallback order for Node %d: ", local_node); + for (node = 0; node < nr_nodes; node++) + pr_cont("%d ", node_order[node]); + pr_cont("\n"); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES From 1c3db2b9a74dc64fcf8c5339a32c30554e0c73ea Mon Sep 17 00:00:00 2001 From: Krupa Ramakrishnan Date: Mon, 30 Aug 2021 17:46:03 +0530 Subject: [PATCH 120/737] mm/page_alloc: Use accumulated load when building node fallback list In build_zonelists(), when the fallback list is built for the nodes, the node load gets reinitialized during each iteration. This results in nodes with same distances occupying the same slot in different node fallback lists rather than appearing in the intended round- robin manner. This results in one node getting picked for allocation more compared to other nodes with the same distance. As an example, consider a 4 node system with the following distance matrix. Node 0 1 2 3 ---------------- 0 10 12 32 32 1 12 10 32 32 2 32 32 10 12 3 32 32 12 10 For this case, the node fallback list gets built like this: Node Fallback list --------------------- 0 0 1 2 3 1 1 0 3 2 2 2 3 0 1 3 3 2 0 1 <-- Unexpected fallback order In the fallback list for nodes 2 and 3, the nodes 0 and 1 appear in the same order which results in more allocations getting satisfied from node 0 compared to node 1. The effect of this on remote memory bandwidth as seen by stream benchmark is shown below: Case 1: Bandwidth from cores on nodes 2 & 3 to memory on nodes 0 & 1 (numactl -m 0,1 ./stream_lowOverhead ... --cores ) Case 2: Bandwidth from cores on nodes 0 & 1 to memory on nodes 2 & 3 (numactl -m 2,3 ./stream_lowOverhead ... --cores ) ---------------------------------------- BANDWIDTH (MB/s) TEST Case 1 Case 2 ---------------------------------------- COPY 57479.6 110791.8 SCALE 55372.9 105685.9 ADD 50460.6 96734.2 TRIADD 50397.6 97119.1 ---------------------------------------- The bandwidth drop in Case 1 occurs because most of the allocations get satisfied by node 0 as it appears first in the fallback order for both nodes 2 and 3. 
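To make the failure mode concrete, here is a simplified sketch of the per-node loop in
the current build_zonelists() (abridged, other bookkeeping omitted; the actual one-line
change is in the diff below):

  while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
          /*
           * Penalise the first node of each distance group so that
           * equal-distance nodes are intended to rotate round-robin.
           */
          if (node_distance(local_node, node) !=
              node_distance(local_node, prev_node))
                  node_load[node] = load;   /* '=' resets the accumulated weight */

          node_order[nr_nodes++] = node;
          prev_node = node;
  }

Because the load is assigned rather than accumulated across iterations, nodes at the
same distance land in the same slot of different nodes' fallback lists instead of
rotating, which is exactly the repeated "0 1" tail shown above for nodes 2 and 3.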
This can be fixed by accumulating the node load in build_zonelists() rather than reinitializing it during each iteration. With this the nodes with the same distance rightly get assigned in the round robin manner. In fact this was how it was originally until the commit f0c0b2b808f2 ("change zonelist order: zonelist order selection logic") dropped the load accumulation and resorted to initializing the load during each iteration. While zonelist ordering was removed by commit c9bff3eebc09 ("mm, page_alloc: rip out ZONELIST_ORDER_ZONE"), the change to the node load accumulation in build_zonelists() remained. So essentially this patch reverts back to the accumulated node load logic. After this fix, the fallback order gets built like this: Node Fallback list ------------------ 0 0 1 2 3 1 1 0 3 2 2 2 3 0 1 3 3 2 1 0 <-- Note the change here The bandwidth in Case 1 improves and matches Case 2 as shown below. ---------------------------------------- BANDWIDTH (MB/s) TEST Case 1 Case 2 ---------------------------------------- COPY 110438.9 110107.2 SCALE 105930.5 105817.5 ADD 97005.1 96159.8 TRIADD 97441.5 96757.1 ---------------------------------------- The correctness of the fallback list generation has been verified for the above node configuration where the node 3 starts as memory-less node and comes up online only during memory hotplug. [bharata@amd.com: Added changelog, review, test validation] Fixes: f0c0b2b808f2 ("change zonelist order: zonelist order selection logic") Signed-off-by: Krupa Ramakrishnan Co-developed-by: Sadagopan Srinivasan Signed-off-by: Sadagopan Srinivasan Signed-off-by: Bharata B Rao --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc37faa3852c3..1606d6d726b92 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5877,7 +5877,7 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] = load; + node_load[node] += load; node_order[nr_nodes++] = node; prev_node = node; From 47baad5aac70158453d870e1519bac8c4db2abf1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2020 14:09:24 +0800 Subject: [PATCH 121/737] arm/arm64: Probe for the presence of KVM hypervisor Although the SMCCC specification provides some limited functionality for describing the presence of hypervisor and firmware services, this is generally applicable only to functions designated as "Arm Architecture Service Functions" and no portable discovery mechanism is provided for standard hypervisor services, despite having a designated range of function identifiers reserved by the specification. In an attempt to avoid the need for additional firmware changes every time a new function is added, introduce a UID to identify the service provider as being compatible with KVM. Once this has been established, additional services can be discovered via a feature bitmap. 
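As a guest-side usage sketch (hypothetical caller; the only identifier assumed beyond
this patch is the function bit, for which the PTP bit added later in this series is
used as an example):

  #include <linux/arm-smccc.h>
  #include <asm/hypervisor.h>

  static int __init my_pv_service_init(void)   /* hypothetical driver init */
  {
          /* kvm_init_hyp_services() has already run from psci_probe() */
          if (!kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP))
                  return -ENODEV;   /* host does not advertise the service */

          /* ... probe the paravirtualised service ... */
          return 0;
  }

This keeps feature discovery out of individual drivers: the bitmap is read once at
boot and later queries are a simple bitmap test.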
Reviewed-by: Steven Price Signed-off-by: Will Deacon Signed-off-by: Jianyong Wu [maz: move code to its own file, plug it into PSCI] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-2-jianyong.wu@arm.com (cherry picked from commit 6e085e0ac9cf16298b5fefe0b1893f98ef765812) --- arch/arm/include/asm/hypervisor.h | 3 ++ arch/arm64/include/asm/hypervisor.h | 3 ++ drivers/firmware/psci/psci.c | 2 ++ drivers/firmware/smccc/Makefile | 2 +- drivers/firmware/smccc/kvm_guest.c | 50 +++++++++++++++++++++++++++++ include/linux/arm-smccc.h | 25 +++++++++++++++ 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/smccc/kvm_guest.c diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index df8524365637a..bd61502b97153 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -4,4 +4,7 @@ #include +void kvm_init_hyp_services(void); +bool kvm_arm_hyp_service_available(u32 func_id); + #endif diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index f9cc1d0217915..0ae427f352c8c 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -4,4 +4,7 @@ #include +void kvm_init_hyp_services(void); +bool kvm_arm_hyp_service_available(u32 func_id); + #endif diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 00af99b6f97c1..ffde5feb728d7 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -470,6 +471,7 @@ static int __init psci_probe(void) psci_init_cpu_suspend(); psci_init_system_suspend(); psci_init_system_reset2(); + kvm_init_hyp_services(); } return 0; diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile index 72ab840428324..40d19144a8607 100644 --- a/drivers/firmware/smccc/Makefile +++ b/drivers/firmware/smccc/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 # -obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o +obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o kvm_guest.o obj-$(CONFIG_ARM_SMCCC_SOC_ID) += soc_id.o diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c new file mode 100644 index 0000000000000..2d3e866decaa6 --- /dev/null +++ b/drivers/firmware/smccc/kvm_guest.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "smccc: KVM: " fmt + +#include +#include +#include +#include + +#include + +static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { }; + +void __init kvm_init_hyp_services(void) +{ + struct arm_smccc_res res; + u32 val[4]; + + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC) + return; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res); + if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 || + res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 || + res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 || + res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3) + return; + + memset(&res, 0, sizeof(res)); + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res); + + val[0] = lower_32_bits(res.a0); + val[1] = lower_32_bits(res.a1); + val[2] = lower_32_bits(res.a2); + val[3] = lower_32_bits(res.a3); + + bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS); + + pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", + res.a3, res.a2, res.a1, res.a0); +} + +bool kvm_arm_hyp_service_available(u32 
func_id) +{ + if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS) + return false; + + return test_bit(func_id, __kvm_arm_hyp_services); +} +EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available); diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index ff38737475ecb..b789cb58bbd89 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -55,6 +55,8 @@ #define ARM_SMCCC_OWNER_TRUSTED_OS 50 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63 +#define ARM_SMCCC_FUNC_QUERY_CALL_UID 0xff01 + #define ARM_SMCCC_QUIRK_NONE 0 #define ARM_SMCCC_QUIRK_QCOM_A6 1 /* Save/restore register a6 */ @@ -92,6 +94,29 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_FUNC_QUERY_CALL_UID) + +/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */ +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU + +/* KVM "vendor specific" services */ +#define ARM_SMCCC_KVM_FUNC_FEATURES 0 +#define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 +#define ARM_SMCCC_KVM_NUM_FUNCS 128 + +#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_FEATURES) + #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 /* Paravirtualised time calls (defined by ARM DEN0057A) */ From c30528f7ea9197cae7ec8a56ca8a74538232321c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2020 14:09:25 +0800 Subject: [PATCH 122/737] KVM: arm64: Advertise KVM UID to guests via SMCCC We can advertise ourselves to guests as KVM and provide a basic features bitmap for discoverability of future hypervisor services. 
Cc: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-3-jianyong.wu@arm.com (cherry picked from commit 923961a7ff2e94d3d824d9ea7047178a5a123245) --- arch/arm64/kvm/hypercalls.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index bc111a1aff032..8e17242ef2c83 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -12,13 +12,13 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { u32 func_id = smccc_get_function(vcpu); - long val = SMCCC_RET_NOT_SUPPORTED; + u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; gpa_t gpa; switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; + val[0] = ARM_SMCCC_VERSION_1_1; break; case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: feature = smccc_get_arg1(vcpu); @@ -28,10 +28,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case SPECTRE_VULNERABLE: break; case SPECTRE_MITIGATED: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; case SPECTRE_UNAFFECTED: - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; break; } break; @@ -54,7 +54,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; fallthrough; case SPECTRE_UNAFFECTED: - val = SMCCC_RET_NOT_REQUIRED; + val[0] = SMCCC_RET_NOT_REQUIRED; break; } break; @@ -63,30 +63,39 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case SPECTRE_VULNERABLE: break; case SPECTRE_MITIGATED: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; case SPECTRE_UNAFFECTED: - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = kvm_hypercall_pv_features(vcpu); + val[0] = kvm_hypercall_pv_features(vcpu); break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); if (gpa != GPA_INVALID) - val = gpa; + val[0] = gpa; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); break; default: return kvm_psci_call(vcpu); } - smccc_set_retval(vcpu, val, 0, 0, 0); + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return 1; } From c9c790fbc33d90bcd32ee181a8cd4715b02af3e7 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:26 +0800 Subject: [PATCH 123/737] ptp: Reorganize ptp_kvm.c to make it arch-independent Currently, the ptp_kvm module contains a lot of x86-specific code. Let's move this code into a new arch-specific file in the same directory, and rename the arch-independent file to ptp_kvm_common.c. 
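The split boils down to a small arch contract consumed by ptp_kvm_common.c. For quick
reference, these are the three hooks (taken from the include/linux/ptp_kvm.h header
added below):

  int kvm_arch_ptp_init(void);
  int kvm_arch_ptp_get_clock(struct timespec64 *ts);
  int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
                                   struct clocksource **cs);

x86 keeps its pvclock/KVM_HC_CLOCK_PAIRING implementation in ptp_kvm_x86.c; an arm64
implementation based on the SMCCC PTP hypercall arrives later in this series.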
Acked-by: Richard Cochran Reviewed-by: Andre Przywara Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-4-jianyong.wu@arm.com (cherry picked from commit a8cf291bdac5d415eadb55e79df1fca8c3f0dfef) --- drivers/ptp/Makefile | 1 + drivers/ptp/{ptp_kvm.c => ptp_kvm_common.c} | 84 +++++------------- drivers/ptp/ptp_kvm_x86.c | 97 +++++++++++++++++++++ include/linux/ptp_kvm.h | 19 ++++ 4 files changed, 139 insertions(+), 62 deletions(-) rename drivers/ptp/{ptp_kvm.c => ptp_kvm_common.c} (60%) create mode 100644 drivers/ptp/ptp_kvm_x86.c create mode 100644 include/linux/ptp_kvm.h diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 7aff75f745dca..699a4e4d19c2d 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -4,6 +4,7 @@ # ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o +ptp_kvm-$(CONFIG_X86) := ptp_kvm_x86.o ptp_kvm_common.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm_common.c similarity index 60% rename from drivers/ptp/ptp_kvm.c rename to drivers/ptp/ptp_kvm_common.c index 658d33fc31952..721ddcede5e19 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm_common.c @@ -8,11 +8,11 @@ #include #include #include +#include #include +#include #include #include -#include -#include #include #include @@ -24,56 +24,29 @@ struct kvm_ptp_clock { static DEFINE_SPINLOCK(kvm_ptp_lock); -static struct pvclock_vsyscall_time_info *hv_clock; - -static struct kvm_clock_pairing clock_pair; -static phys_addr_t clock_pair_gpa; - static int ptp_kvm_get_time_fn(ktime_t *device_time, struct system_counterval_t *system_counter, void *ctx) { - unsigned long ret; + long ret; + u64 cycle; struct timespec64 tspec; - unsigned version; - int cpu; - struct pvclock_vcpu_time_info *src; + struct clocksource *cs; spin_lock(&kvm_ptp_lock); preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - - do { - /* - * We are using a TSC value read in the hosts - * kvm_hc_clock_pairing handling. - * So any changes to tsc_to_system_mul - * and tsc_shift or any other pvclock - * data invalidate that measurement. 
- */ - version = pvclock_read_begin(src); - - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, - clock_pair_gpa, - KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret != 0) { - pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); - spin_unlock(&kvm_ptp_lock); - preempt_enable_notrace(); - return -EOPNOTSUPP; - } - - tspec.tv_sec = clock_pair.sec; - tspec.tv_nsec = clock_pair.nsec; - ret = __pvclock_read_cycles(src, clock_pair.tsc); - } while (pvclock_read_retry(src, version)); + ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs); + if (ret) { + spin_unlock(&kvm_ptp_lock); + preempt_enable_notrace(); + return ret; + } preempt_enable_notrace(); - system_counter->cycles = ret; - system_counter->cs = &kvm_clock; + system_counter->cycles = cycle; + system_counter->cs = cs; *device_time = timespec64_to_ktime(tspec); @@ -111,22 +84,17 @@ static int ptp_kvm_settime(struct ptp_clock_info *ptp, static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { - unsigned long ret; + long ret; struct timespec64 tspec; spin_lock(&kvm_ptp_lock); - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, - clock_pair_gpa, - KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret != 0) { - pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); + ret = kvm_arch_ptp_get_clock(&tspec); + if (ret) { spin_unlock(&kvm_ptp_lock); - return -EOPNOTSUPP; + return ret; } - tspec.tv_sec = clock_pair.sec; - tspec.tv_nsec = clock_pair.nsec; spin_unlock(&kvm_ptp_lock); memcpy(ts, &tspec, sizeof(struct timespec64)); @@ -168,19 +136,11 @@ static int __init ptp_kvm_init(void) { long ret; - if (!kvm_para_available()) - return -ENODEV; - - clock_pair_gpa = slow_virt_to_phys(&clock_pair); - hv_clock = pvclock_get_pvti_cpu0_va(); - - if (!hv_clock) - return -ENODEV; - - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, - KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) - return -ENODEV; + ret = kvm_arch_ptp_init(); + if (ret) { + pr_err("fail to initialize ptp_kvm"); + return ret; + } kvm_ptp_clock.caps = ptp_kvm_caps; diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c new file mode 100644 index 0000000000000..3dd519dfc473c --- /dev/null +++ b/drivers/ptp/ptp_kvm_x86.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pvclock_vsyscall_time_info *hv_clock; + +static phys_addr_t clock_pair_gpa; +static struct kvm_clock_pairing clock_pair; + +int kvm_arch_ptp_init(void) +{ + long ret; + + if (!kvm_para_available()) + return -ENODEV; + + clock_pair_gpa = slow_virt_to_phys(&clock_pair); + hv_clock = pvclock_get_pvti_cpu0_va(); + if (!hv_clock) + return -ENODEV; + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) + return -ENODEV; + + return 0; +} + +int kvm_arch_ptp_get_clock(struct timespec64 *ts) +{ + long ret; + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); + return -EOPNOTSUPP; + } + + ts->tv_sec = clock_pair.sec; + ts->tv_nsec = clock_pair.nsec; + + return 0; +} + +int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, + struct clocksource **cs) +{ + struct pvclock_vcpu_time_info *src; + unsigned int version; + long ret; + int cpu; + + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + + do { + /* + * We are using a TSC value read in the hosts + * kvm_hc_clock_pairing handling. + * So any changes to tsc_to_system_mul + * and tsc_shift or any other pvclock + * data invalidate that measurement. + */ + version = pvclock_read_begin(src); + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); + return -EOPNOTSUPP; + } + tspec->tv_sec = clock_pair.sec; + tspec->tv_nsec = clock_pair.nsec; + *cycle = __pvclock_read_cycles(src, clock_pair.tsc); + } while (pvclock_read_retry(src, version)); + + *cs = &kvm_clock; + + return 0; +} diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h new file mode 100644 index 0000000000000..f960a719f0d54 --- /dev/null +++ b/include/linux/ptp_kvm.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. + */ + +#ifndef _PTP_KVM_H_ +#define _PTP_KVM_H_ + +struct timespec64; +struct clocksource; + +int kvm_arch_ptp_init(void); +int kvm_arch_ptp_get_clock(struct timespec64 *ts); +int kvm_arch_ptp_get_crosststamp(u64 *cycle, + struct timespec64 *tspec, struct clocksource **cs); + +#endif /* _PTP_KVM_H_ */ From 59312da84610c47eae807bd8e40bb54f527e8c20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2020 14:09:27 +0800 Subject: [PATCH 124/737] time: Add mechanism to recognize clocksource in time_get_snapshot System time snapshots are not conveying information about the current clocksource which was used, but callers like the PTP KVM guest implementation have the requirement to evaluate the clocksource type to select the appropriate mechanism. Introduce a clocksource id field in struct clocksource which is by default set to CSID_GENERIC (0). Clocksource implementations can set that field to a value which allows to identify the clocksource. Store the clocksource id of the current clocksource in the system_time_snapshot so callers can evaluate which clocksource was used to take the snapshot and act accordingly. 
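As a consumer-side sketch, this is roughly how the new field is meant to be used
(mirroring the arm64 PTP KVM code added later in this series; CSID_ARM_ARCH_COUNTER
itself is only introduced two patches further on):

  struct system_time_snapshot snap;

  ktime_get_snapshot(&snap);
  if (snap.cs_id != CSID_ARM_ARCH_COUNTER)
          return;   /* snapshot was not taken from the arch counter, bail out */
  /* snap.cycles and snap.real are now known to come from the arch counter */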
Signed-off-by: Thomas Gleixner Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-5-jianyong.wu@arm.com (cherry picked from commit b2c67cbe9f447312f5cdd7c6641b463f2349aec0) --- include/linux/clocksource.h | 6 ++++++ include/linux/clocksource_ids.h | 11 +++++++++++ include/linux/timekeeping.h | 12 +++++++----- kernel/time/clocksource.c | 2 ++ kernel/time/timekeeping.c | 1 + 5 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 include/linux/clocksource_ids.h diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 8f87c1a6f3231..65783d0db2d59 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -64,6 +65,10 @@ struct module; * 400-499: Perfect * The ideal clocksource. A must-use where * available. + * @id: Defaults to CSID_GENERIC. The id value is captured + * in certain snapshot functions to allow callers to + * validate the clocksource from which the snapshot was + * taken. * @flags: Flags describing special properties * @enable: Optional function to enable the clocksource * @disable: Optional function to disable the clocksource @@ -103,6 +108,7 @@ struct clocksource { const char *name; struct list_head list; int rating; + enum clocksource_ids id; enum vdso_clock_mode vdso_clock_mode; unsigned long flags; diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h new file mode 100644 index 0000000000000..4d8e19e05328f --- /dev/null +++ b/include/linux/clocksource_ids.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOCKSOURCE_IDS_H +#define _LINUX_CLOCKSOURCE_IDS_H + +/* Enum to give clocksources a unique identifier */ +enum clocksource_ids { + CSID_GENERIC = 0, + CSID_MAX, +}; + +#endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7f7e4a3f4394a..2ee05355333f6 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -3,6 +3,7 @@ #define _LINUX_TIMEKEEPING_H #include +#include /* Included from linux/ktime.h */ @@ -244,11 +245,12 @@ struct ktime_timestamps { * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { - u64 cycles; - ktime_t real; - ktime_t raw; - unsigned int clock_was_set_seq; - u8 cs_was_changed_seq; + u64 cycles; + ktime_t real; + ktime_t raw; + enum clocksource_ids cs_id; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 86e0fbe583f2b..7fd99cb7c22fe 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1108,6 +1108,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); + if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) + cs->id = CSID_GENERIC; if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. 
Disabling VDSO support.\n", diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d9b48f7a35e0d..630d00fe7ee3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1055,6 +1055,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); + systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, From 71af5907041399c7e8af72430b002258f225dbb9 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:28 +0800 Subject: [PATCH 125/737] clocksource: Add clocksource id for arm arch counter Add clocksource id to the ARM generic counter so that it can be easily identified from callers such as ptp_kvm. Cc: Mark Rutland Reviewed-by: Andre Przywara Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-6-jianyong.wu@arm.com (cherry picked from commit 100148d0fc7dcf8672fe0ac83f44dc5749b4da5c) --- drivers/clocksource/arm_arch_timer.c | 2 ++ include/linux/clocksource_ids.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index f4881764bf8f4..6c211e04b7452 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -191,6 +192,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter *cc) static struct clocksource clocksource_counter = { .name = "arch_sys_counter", + .id = CSID_ARM_ARCH_COUNTER, .rating = 400, .read = arch_counter_read, .mask = CLOCKSOURCE_MASK(56), diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index 4d8e19e05328f..16775d7d8f8d6 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -5,6 +5,7 @@ /* Enum to give clocksources a unique identifier */ enum clocksource_ids { CSID_GENERIC = 0, + CSID_ARM_ARCH_COUNTER, CSID_MAX, }; From 198c8f105c892efd5d5c33e4dbbd17fc220b75de Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:29 +0800 Subject: [PATCH 126/737] KVM: arm64: Add support for the KVM PTP service Implement the hypervisor side of the KVM PTP interface. The service offers wall time and cycle count from host to guest. The caller must specify whether they want the host's view of either the virtual or physical counter. Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-7-jianyong.wu@arm.com (cherry picked from commit 3bf725699bf62494b3e179f1795f08c7d749f061) --- Documentation/virt/kvm/api.rst | 9 +++++ Documentation/virt/kvm/arm/index.rst | 1 + Documentation/virt/kvm/arm/ptp_kvm.rst | 25 ++++++++++++ arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hypercalls.c | 53 ++++++++++++++++++++++++++ include/linux/arm-smccc.h | 16 ++++++++ include/uapi/linux/kvm.h | 1 + 7 files changed, 106 insertions(+) create mode 100644 Documentation/virt/kvm/arm/ptp_kvm.rst diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 08295f488d057..02f821ca63c66 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6405,6 +6405,15 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). 
Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. +8.29 KVM_CAP_PTP_KVM +-------------------- + +:Architectures: arm64 + +This capability indicates that the KVM virtual PTP service is +supported in the host. A VMM can check whether the service is +available to the guest on migration. + 9. Known KVM API problems ========================= diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 3e2b2aba90fcc..78a9b670aafee 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -10,3 +10,4 @@ ARM hyp-abi psci pvtime + ptp_kvm diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst new file mode 100644 index 0000000000000..68cffb50d8bf0 --- /dev/null +++ b/Documentation/virt/kvm/arm/ptp_kvm.rst @@ -0,0 +1,25 @@ +.. SPDX-License-Identifier: GPL-2.0 + +PTP_KVM support for arm/arm64 +============================= + +PTP_KVM is used for high precision time sync between host and guests. +It relies on transferring the wall clock and counter value from the +host to the guest using a KVM-specific hypercall. + +* ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: 0x86000001 + +This hypercall uses the SMC32/HVC32 calling convention: + +ARM_SMCCC_HYP_KVM_PTP_FUNC_ID + ============= ========== ========== + Function ID: (uint32) 0x86000001 + Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0) + KVM_PTP_PHYS_COUNTER(1) + Return Values: (int32) NOT_SUPPORTED(-1) on error, or + (uint32) Upper 32 bits of wall clock time (r0) + (uint32) Lower 32 bits of wall clock time (r1) + (uint32) Upper 32 bits of counter (r2) + (uint32) Lower 32 bits of counter (r3) + Endianness: No Restrictions. + ============= ========== ========== diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4d63fcd7574b2..67b8d2271d61f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -198,6 +198,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: + case KVM_CAP_PTP_KVM: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 8e17242ef2c83..39e34d88acf60 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -9,6 +9,55 @@ #include #include +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) +{ + struct system_time_snapshot systime_snapshot; + u64 cycles = ~0UL; + u32 feature; + + /* + * system time and counter value must captured at the same + * time to keep consistency and precision. + */ + ktime_get_snapshot(&systime_snapshot); + + /* + * This is only valid if the current clocksource is the + * architected counter, as this is the only one the guest + * can see. + */ + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) + return; + + /* + * The guest selects one of the two reference counters + * (virtual or physical) with the first argument of the SMCCC + * call. In case the identifier is not supported, error out. 
+ */ + feature = smccc_get_arg1(vcpu); + switch (feature) { + case KVM_PTP_VIRT_COUNTER: + cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); + break; + case KVM_PTP_PHYS_COUNTER: + cycles = systime_snapshot.cycles; + break; + default: + return; + } + + /* + * This relies on the top bit of val[0] never being set for + * valid values of system time, because that is *really* far + * in the future (about 292 years from 1970, and at that stage + * nobody will give a damn about it). + */ + val[0] = upper_32_bits(systime_snapshot.real); + val[1] = lower_32_bits(systime_snapshot.real); + val[2] = upper_32_bits(cycles); + val[3] = lower_32_bits(cycles); +} + int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { u32 func_id = smccc_get_function(vcpu); @@ -91,6 +140,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + kvm_ptp_get_time(vcpu, val); break; default: return kvm_psci_call(vcpu); diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index b789cb58bbd89..e25795bdde862 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -108,6 +108,7 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 +#define ARM_SMCCC_KVM_FUNC_PTP 1 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -119,6 +120,21 @@ #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 +/* + * ptp_kvm is a feature used for time sync between vm and host. + * ptp_kvm module in guest kernel will get service from host using + * this hypercall ID. + */ +#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_PTP) + +/* ptp_kvm counter type ID */ +#define KVM_PTP_VIRT_COUNTER 0 +#define KVM_PTP_PHYS_COUNTER 1 + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ca41220b40b8b..797c40bbc31fa 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1053,6 +1053,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_PTP_KVM 191 #ifdef KVM_CAP_IRQ_ROUTING From bd3778cd4611c87e133bb5290ccc92e3b339c2eb Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:30 +0800 Subject: [PATCH 127/737] ptp: arm/arm64: Enable ptp_kvm for arm/arm64 Currently, there is no mechanism to keep time sync between guest and host in arm/arm64 virtualization environment. Time in guest will drift compared with host after boot up as they may both use third party time sources to correct their time respectively. The time deviation will be in order of milliseconds. But in some scenarios,like in cloud environment, we ask for higher time precision. kvm ptp clock, which chooses the host clock source as a reference clock to sync time between guest and host, has been adopted by x86 which takes the time sync order from milliseconds to nanoseconds. This patch enables kvm ptp clock for arm/arm64 and improves clock sync precision significantly. Test result comparisons between with kvm ptp clock and without it in arm/arm64 are as follows. 
This test derived from the result of command 'chronyc sources'. we should take more care of the last sample column which shows the offset between the local clock and the source at the last measurement. no kvm ptp in guest: MS Name/IP address Stratum Poll Reach LastRx Last sample ======================================================================== ^* dns1.synet.edu.cn 2 6 377 13 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 21 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 29 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 37 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 45 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 53 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 61 +1040us[+1581us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 4 -130us[ +796us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 12 -130us[ +796us] +/- 21ms ^* dns1.synet.edu.cn 2 6 377 20 -130us[ +796us] +/- 21ms in host: MS Name/IP address Stratum Poll Reach LastRx Last sample ======================================================================== ^* 120.25.115.20 2 7 377 72 -470us[ -603us] +/- 18ms ^* 120.25.115.20 2 7 377 92 -470us[ -603us] +/- 18ms ^* 120.25.115.20 2 7 377 112 -470us[ -603us] +/- 18ms ^* 120.25.115.20 2 7 377 2 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 22 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 43 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 63 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 83 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 103 +872ns[-6808ns] +/- 17ms ^* 120.25.115.20 2 7 377 123 +872ns[-6808ns] +/- 17ms The dns1.synet.edu.cn is the network reference clock for guest and 120.25.115.20 is the network reference clock for host. we can't get the clock error between guest and host directly, but a roughly estimated value will be in order of hundreds of us to ms. with kvm ptp in guest: chrony has been disabled in host to remove the disturb by network clock. MS Name/IP address Stratum Poll Reach LastRx Last sample ======================================================================== * PHC0 0 3 377 8 -7ns[ +1ns] +/- 3ns * PHC0 0 3 377 8 +1ns[ +16ns] +/- 3ns * PHC0 0 3 377 6 -4ns[ -0ns] +/- 6ns * PHC0 0 3 377 6 -8ns[ -12ns] +/- 5ns * PHC0 0 3 377 5 +2ns[ +4ns] +/- 4ns * PHC0 0 3 377 13 +2ns[ +4ns] +/- 4ns * PHC0 0 3 377 12 -4ns[ -6ns] +/- 4ns * PHC0 0 3 377 11 -8ns[ -11ns] +/- 6ns * PHC0 0 3 377 10 -14ns[ -20ns] +/- 4ns * PHC0 0 3 377 8 +4ns[ +5ns] +/- 4ns The PHC0 is the ptp clock which choose the host clock as its source clock. So we can see that the clock difference between host and guest is in order of ns. 
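For reference, a guest application can sample this PHC directly through the standard PTP character device. The sketch below is illustrative only and not part of the patch; the /dev/ptp0 path is an assumption (the correct node should be located via /sys/class/ptp/*/clock_name, which for this driver should read "KVM virtual PTP"), and error handling is minimal.

/*
 * Hypothetical guest-side check of the KVM virtual PTP clock.
 * Assumes /dev/ptp0 is the KVM PHC; discover it via sysfs in real code.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise cts = { 0 };
	int fd = open("/dev/ptp0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/ptp0");
		return 1;
	}
	/*
	 * Cross-timestamp: host wall clock plus the guest system time
	 * derived from the same counter read (one hypercall round trip).
	 */
	if (ioctl(fd, PTP_SYS_OFFSET_PRECISE, &cts) < 0) {
		perror("PTP_SYS_OFFSET_PRECISE");
		close(fd);
		return 1;
	}
	printf("host realtime:  %lld.%09u\n",
	       (long long)cts.device.sec, cts.device.nsec);
	printf("guest realtime: %lld.%09u\n",
	       (long long)cts.sys_realtime.sec, cts.sys_realtime.nsec);
	close(fd);
	return 0;
}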
Cc: Mark Rutland Acked-by: Richard Cochran Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-8-jianyong.wu@arm.com (cherry picked from commit 300bb1fe767183a1ca1dadf691409c53c4ecff4b) --- drivers/clocksource/arm_arch_timer.c | 34 ++++++++++++++++++++++++++++ drivers/ptp/Kconfig | 2 +- drivers/ptp/Makefile | 1 + drivers/ptp/ptp_kvm_arm.c | 28 +++++++++++++++++++++++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 drivers/ptp/ptp_kvm_arm.c diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 6c211e04b7452..5aeab5445f0b4 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include @@ -1659,3 +1661,35 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); #endif + +int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts, + struct clocksource **cs) +{ + struct arm_smccc_res hvc_res; + u32 ptp_counter; + ktime_t ktime; + + if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)) + return -EOPNOTSUPP; + + if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) + ptp_counter = KVM_PTP_VIRT_COUNTER; + else + ptp_counter = KVM_PTP_PHYS_COUNTER; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, + ptp_counter, &hvc_res); + + if ((int)(hvc_res.a0) < 0) + return -EOPNOTSUPP; + + ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1; + *ts = ktime_to_timespec64(ktime); + if (cycle) + *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3; + if (cs) + *cs = &clocksource_counter; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp); diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 3e377f3c69e5d..1b5834ac6ca0e 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -109,7 +109,7 @@ config PTP_1588_CLOCK_PCH config PTP_1588_CLOCK_KVM tristate "KVM virtual PTP clock" depends on PTP_1588_CLOCK - depends on KVM_GUEST && X86 + depends on (KVM_GUEST && X86) || (HAVE_ARM_SMCCC_DISCOVERY && ARM_ARCH_TIMER) default y help This driver adds support for using kvm infrastructure as a PTP diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 699a4e4d19c2d..9fa5ede44b2b2 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -5,6 +5,7 @@ ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o ptp_kvm-$(CONFIG_X86) := ptp_kvm_x86.o ptp_kvm_common.o +ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC) := ptp_kvm_arm.o ptp_kvm_common.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c new file mode 100644 index 0000000000000..b7d28c8dfb84e --- /dev/null +++ b/drivers/ptp/ptp_kvm_arm.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Virtual PTP 1588 clock for use with KVM guests + * Copyright (C) 2019 ARM Ltd. 
+ * All Rights Reserved + */ + +#include +#include + +#include +#include + +int kvm_arch_ptp_init(void) +{ + int ret; + + ret = kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP); + if (ret <= 0) + return -EOPNOTSUPP; + + return 0; +} + +int kvm_arch_ptp_get_clock(struct timespec64 *ts) +{ + return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL); +} From 0a075dafc51f23fbecf80422ec6cc68f2609fadf Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 20 Apr 2021 14:24:19 +0100 Subject: [PATCH 128/737] ptp: Don't print an error if ptp_kvm is not supported Commit 300bb1fe7671 ("ptp: arm/arm64: Enable ptp_kvm for arm/arm64") enable ptp_kvm support for ARM platforms and for any ARM platform that does not support this, the following error message is displayed ... ERR KERN fail to initialize ptp_kvm For platforms that do not support ptp_kvm this error is a bit misleading and so fix this by only printing this message if the error returned by kvm_arch_ptp_init() is not -EOPNOTSUPP. Note that -EOPNOTSUPP is only returned by ARM platforms today if ptp_kvm is not supported. Fixes: 300bb1fe7671 ("ptp: arm/arm64: Enable ptp_kvm for arm/arm64") Signed-off-by: Jon Hunter Acked-by: Richard Cochran Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210420132419.1318148-1-jonathanh@nvidia.com (cherry picked from commit a86ed2cfa13c5175eb082c50a644f6bf29ac65cc) --- drivers/ptp/ptp_kvm_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_kvm_common.c b/drivers/ptp/ptp_kvm_common.c index 721ddcede5e19..fcae32f56f25a 100644 --- a/drivers/ptp/ptp_kvm_common.c +++ b/drivers/ptp/ptp_kvm_common.c @@ -138,7 +138,8 @@ static int __init ptp_kvm_init(void) ret = kvm_arch_ptp_init(); if (ret) { - pr_err("fail to initialize ptp_kvm"); + if (ret != -EOPNOTSUPP) + pr_err("fail to initialize ptp_kvm"); return ret; } From 219737df824be22c9ed06e46de1d1e35a121d596 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: [PATCH 129/737] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 15fb7de1a7f5af0d ("KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command") 3bf725699bf62494 ("KVM: arm64: Add support for the KVM PTP service") 4cfdd47d6d95aca4 ("KVM: SVM: Add KVM_SEV SEND_START command") 54526d1fd59338fd ("KVM: x86: Support KVM VMs sharing SEV context") 5569e2e7a650dfff ("KVM: SVM: Add support for KVM_SEV_SEND_CANCEL command") 8b13c36493d8cb56 ("KVM: introduce KVM_CAP_SET_GUEST_DEBUG2") af43cbbf954b50ca ("KVM: SVM: Add support for KVM_SEV_RECEIVE_START command") d3d1af85e2c75bb5 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command") fe7e948837f312d8 ("KVM: x86: Add capability to grant VM access to privileged SGX attribute") That don't cause any change in tooling as it doesn't introduce any new ioctl. 
$ grep kvm tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/kvm_ioctl.sh:printf "static const char *kvm_ioctl_cmds[] = {\n" tools/perf/trace/beauty/kvm_ioctl.sh:egrep $regex ${header_dir}/kvm.h | \ $ $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after $ This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Brijesh Singh Cc: Jianyong Wu Cc: Marc Zyngier Cc: Nathan Tempelman Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Steve Rutherford Signed-off-by: Arnaldo Carvalho de Melo (cherry picked from commit 0d943d5fde6070c2661a99618ea95b99655589ad) --- tools/include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index ca41220b40b8b..797c40bbc31fa 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1053,6 +1053,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_PTP_KVM 191 #ifdef KVM_CAP_IRQ_ROUTING From b3074e6c1e67353bd2e1ee39089faccf2f97aa89 Mon Sep 17 00:00:00 2001 From: Adrian Catangiu Date: Wed, 24 Feb 2021 10:47:31 +0200 Subject: [PATCH 130/737] drivers/misc: sysgenid: add system generation id driver - Background and problem The System Generation ID feature is required in virtualized or containerized environments by applications that work with local copies or caches of world-unique data such as random values, uuids, monotonically increasing counters, etc. Such applications can be negatively affected by VM or container snapshotting when the VM or container is either cloned or returned to an earlier point in time. Furthermore, simply finding out about a system generation change is only the starting point of a process to renew internal states of possibly multiple applications across the system. This process requires a standard interface that applications can rely on and through which orchestration can be easily done. - Solution The System Generation ID is meant to help in these scenarios by providing a monotonically increasing u32 counter that changes each time the VM or container is restored from a snapshot. The `sysgenid` driver exposes a monotonic incremental System Generation u32 counter via a char-dev filesystem interface accessible through `/dev/sysgenid`. It provides synchronous and asynchronous SysGen counter update notifications, as well as counter retrieval and confirmation mechanisms. The counter starts from zero when the driver is initialized and monotonically increments every time the system generation changes. Userspace applications or libraries can (a)synchronously consume the system generation counter through the provided filesystem interface, to make any necessary internal adjustments following a system generation update. The provided filesystem interface operations can be used to build a system level safe workflow that guest software can follow to protect itself from negative system snapshot effects. The `sysgenid` driver exports the `void sysgenid_bump_generation()` symbol which can be used by backend drivers to drive system generation changes based on hardware events. 
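To make the consumption model concrete, a minimal tracked watcher built only on the interface described here (a blocking read for the new counter, a write to confirm it, an ioctl to opt in to tracking) could look like the sketch below. It is illustrative, not part of the patch, and assumes the uapi header added by this series is installed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sysgenid.h>

int main(void)
{
	unsigned int gen;
	int fd = open("/dev/sysgenid", O_RDWR);

	if (fd < 0)
		return 1;
	/* Opt in to tracking so the overseer waits for our confirmation. */
	ioctl(fd, SYSGENID_SET_WATCHER_TRACKING, 1UL);

	for (;;) {
		/* Blocks until the generation counter changes. */
		if (read(fd, &gen, sizeof(gen)) != sizeof(gen))
			break;
		fprintf(stderr, "new system generation %u, readjusting\n", gen);
		/* ... regenerate UUIDs, reseed PRNGs, drop caches ... */
		/* Confirm the new counter so this fd is no longer outdated. */
		write(fd, &gen, sizeof(gen));
	}
	close(fd);
	return 0;
}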
System generation changes can also be driven by userspace software through a dedicated driver ioctl. **Please note**, SysGenID alone does not guarantee complete snapshot safety to applications using it. A certain workflow needs to be followed at the system level, in order to make the system snapshot-resilient. Please see the "Snapshot Safety Prerequisites" section in the included documentation. Signed-off-by: Adrian Catangiu --- Documentation/misc-devices/sysgenid.rst | 229 +++++++++++++ .../userspace-api/ioctl/ioctl-number.rst | 1 + MAINTAINERS | 8 + drivers/misc/Kconfig | 15 + drivers/misc/Makefile | 1 + drivers/misc/sysgenid.c | 322 ++++++++++++++++++ include/uapi/linux/sysgenid.h | 18 + 7 files changed, 594 insertions(+) create mode 100644 Documentation/misc-devices/sysgenid.rst create mode 100644 drivers/misc/sysgenid.c create mode 100644 include/uapi/linux/sysgenid.h diff --git a/Documentation/misc-devices/sysgenid.rst b/Documentation/misc-devices/sysgenid.rst new file mode 100644 index 0000000000000..0b8199b8d5163 --- /dev/null +++ b/Documentation/misc-devices/sysgenid.rst @@ -0,0 +1,229 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======== +SYSGENID +======== + +The System Generation ID feature is required in virtualized or +containerized environments by applications that work with local copies +or caches of world-unique data such as random values, UUIDs, +monotonically increasing counters, etc. +Such applications can be negatively affected by VM or container +snapshotting when the VM or container is either cloned or returned to +an earlier point in time. + +The System Generation ID is meant to help in these scenarios by +providing a monotonically increasing counter that changes each time the +VM or container is restored from a snapshot. The driver for it lives at +``drivers/misc/sysgenid.c``. + +The ``sysgenid`` driver exposes a monotonic incremental System +Generation u32 counter via a char-dev filesystem interface accessible +through ``/dev/sysgenid`` that provides sync and async SysGen counter +update notifications. It also provides SysGen counter retrieval and +confirmation mechanisms. + +The counter starts from zero when the driver is initialized and +monotonically increments every time the system generation changes. + +The ``sysgenid`` driver exports the ``void sysgenid_bump_generation()`` +symbol which can be used by backend drivers to drive system generation +changes based on hardware events. +System generation changes can also be driven by userspace software +through a dedicated driver ioctl. + +Userspace applications or libraries can (a)synchronously consume the +system generation counter through the provided filesystem interface, to +make any necessary internal adjustments following a system generation +update. + +**Please note**, SysGenID alone does not guarantee complete snapshot +safety to applications using it. A certain workflow needs to be +followed at the system level, in order to make the system +snapshot-resilient. Please see the "Snapshot Safety Prerequisites" +section below. + +Driver filesystem interface +=========================== + +``open()``: + When the device is opened, a copy of the current SysGenID (counter) + is associated with the open file descriptor. Every open file + descriptor will have readable data available (EPOLLIN) while its + current copy of the SysGenID is outdated. Reading from the fd will + provide the latest SysGenID, while writing to the fd will update the + fd-local copy of the SysGenID and is used as a confirmation + mechanism. 
+ +``read()``: + Read is meant to provide the *new* system generation counter when a + generation change takes place. The read operation blocks until the + associated counter is no longer up to date, at which point the new + counter is provided/returned. Nonblocking ``read()`` returns + ``EAGAIN`` to signal that there is no *new* counter value available. + The generation counter is considered *new* for each open file + descriptor that hasn't confirmed the new value following a generation + change. Therefore, once a generation change takes place, all + ``read()`` calls will immediately return the new generation counter + and will continue to do so until the new value is confirmed back to + the driver through ``write()``. + Partial reads are not allowed - read buffer needs to be at least + 32 bits in size. + +``write()``: + Write is used to confirm the up-to-date SysGenID counter back to the + driver. + Following a VM generation change, all existing watchers are marked + as *outdated*. Each file descriptor will maintain the *outdated* + status until a ``write()`` containing the new up-to-date generation + counter is used as an update confirmation mechanism. + Partial writes are not allowed - write buffer should be exactly + 32 bits in size. + +``poll()``: + Poll is implemented to allow polling for generation counter updates. + Such updates result in ``EPOLLIN`` polling status until the new + up-to-date counter is confirmed back to the driver through a + ``write()``. + +``ioctl()``: + The driver also adds support for waiting on open file descriptors + that haven't acknowledged a generation counter update, as well as a + mechanism for userspace to *trigger* a generation update: + + - SYSGENID_SET_WATCHER_TRACKING: takes a bool argument to set tracking + status for current file descriptor. When watcher tracking is + enabled, the driver tracks this file descriptor as an independent + *watcher*. The driver keeps accounting of how many watchers have + confirmed the latest Sys-Gen-Id counter and how many of them are + *outdated*; an outdated watcher is a *tracked* open file descriptor + that has lived through a Sys-Gen-Id change but has not yet confirmed + the new generation counter. + Software that wants to be waited on by the system while it adjusts + to generation changes, should turn tracking on. The sysgenid driver + then keeps track of it and can block system-level adjustment process + until the software has finished adjusting and confirmed it through a + ``write()``. + Tracking is disabled by default and file descriptors need to + explicitly opt-in using this IOCTL. + - SYSGENID_WAIT_WATCHERS: blocks until there are no more *outdated* + tracked watchers or, if a ``timeout`` argument is provided, until + the timeout expires. + If the current caller is *outdated* or a generation change happens + while waiting (thus making current caller *outdated*), the ioctl + returns ``-EINTR`` to signal the user to handle event and retry. + - SYSGENID_TRIGGER_GEN_UPDATE: triggers a generation counter increment. + It takes a ``minimum-generation`` argument which represents the + minimum value the generation counter will be set to. For example if + current generation is ``5`` and ``SYSGENID_TRIGGER_GEN_UPDATE(8)`` + is called, the generation counter will increment to ``8``. + This IOCTL can only be used by processes with CAP_CHECKPOINT_RESTORE + or CAP_SYS_ADMIN capabilities. + +``mmap()``: + The driver supports ``PROT_READ, MAP_SHARED`` mmaps of a single page + in size. 
The first 4 bytes of the mapped page will contain an + up-to-date u32 copy of the system generation counter. + The mapped memory can be used as a low-latency generation counter + probe mechanism in critical sections. + The mmap() interface is targeted at libraries or code that needs to + check for generation changes in-line, where an event loop is not + available or read()/write() syscalls are too expensive. + In such cases, logic can be added in-line with the sensitive code to + check and trigger on-demand/just-in-time readjustments when changes + are detected on the memory mapped generation counter. + Users of this interface that plan to lazily adjust should not enable + watcher tracking, since waiting on them doesn't make sense. + +``close()``: + Removes the file descriptor as a system generation counter *watcher*. + +Snapshot Safety Prerequisites +============================= + +If VM, container or other system-level snapshots happen asynchronously, +at arbitrary times during an active workload there is no practical way +to ensure that in-flight local copies or caches of world-unique data +such as random values, secrets, UUIDs, etc are properly scrubbed and +regenerated. +The challenge stems from the fact that the categorization of data as +snapshot-sensitive is only known to the software working with it, and +this software has no logical control over the moment in time when an +external system snapshot occurs. + +Let's take an OpenSSL session token for example. Even if the library +code is made 100% snapshot-safe, meaning the library guarantees that +the session token is unique (any snapshot that happened during the +library call did not duplicate or leak the token), the token is still +vulnerable to snapshot events while it transits the various layers of +the library caller, then the various layers of the OS before leaving +the system. + +To catch a secret while it's in-flight, we'd have to validate system +generation at every layer, every step of the way. Even if that would +be deemed the right solution, it would be a long road and a whole +universe to patch before we get there. + +Bottom line is we don't have a way to track all of these in-flight +secrets and dynamically scrub them from existence with snapshot +events happening arbitrarily. + +Simplifyng assumption - safety prerequisite +------------------------------------------- + +**Control the snapshot flow**, disallow snapshots coming at arbitrary +moments in the workload lifetime. + +Use a system-level overseer entity that quiesces the system before +snapshot, and post-snapshot-resume oversees that software components +have readjusted to new environment, to the new generation. Only after, +will the overseer un-quiesce the system and allow active workloads. + +Software components can choose whether they want to be tracked and +waited on by the overseer by using the ``SYSGENID_SET_WATCHER_TRACKING`` +IOCTL. + +The sysgenid framework standardizes the API for system software to +find out about needing to readjust and at the same time provides a +mechanism for the overseer entity to wait for everyone to be done, the +system to have readjusted, so it can un-quiesce. + +Example snapshot-safe workflow +------------------------------ + +1) Before taking a snapshot, quiesce the VM/container/system. Exactly + how this is achieved is very workload-specific, but the general + description is to get all software to an expected state where their + event loops dry up and they are effectively quiesced. +2) Take snapshot. 
+3) Resume the VM/container/system from said snapshot. +4) SysGenID counter will either automatically increment if there is + a vmgenid backend (hw-driven), or overseer will trigger generation + bump using ``SYSGENID_TRIGGER_GEN_UPDATE`` IOCLT (sw-driven). +5) Software components which have ``/dev/sysgenid`` in their event + loops (either using ``poll()`` or ``read()``) are notified of the + generation change. + They do their specific internal adjustments. Some may have requested + to be tracked and waited on by the overseer, others might choose to + do their adjustments out of band and not block the overseer. + Tracked ones *must* signal when they are done/ready with a ``write()`` + while the rest *should* also do so for cleanliness, but it's not + mandatory. +6) Overseer will block and wait for all tracked watchers by using the + ``SYSGENID_WAIT_WATCHERS`` IOCTL. Once all tracked watchers are done + in step 5, this overseer will return from this blocking ioctl knowing + that the system has readjusted and is ready for active workload. +7) Overseer un-quiesces system. +8) There is a class of software, usually libraries, most notably PRNGs + or SSLs, that don't fit the event-loop model and also have strict + latency requirements. These can take advantage of the ``mmap()`` + interface and lazily adjust on-demand whenever they are called after + un-quiesce. + For a well-designed service stack, these libraries should not be + called while system is quiesced. When workload is resumed by the + overseer, on the first call into these libs, they will safely JIT + readjust. + Users of this lazy on-demand readjustment model should not enable + watcher tracking since doing so would introduce a logical deadlock: + lazy adjustments happen only after un-quiesce, but un-quiesce is + blocked until all tracked watchers are up-to-date. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index a7373d4e3984c..16efa0199c8df 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -354,6 +354,7 @@ Code Seq# Include File Comments 0xDB 00-0F drivers/char/mwave/mwavepub.h 0xDD 00-3F ZFCP device driver see drivers/s390/scsi/ +0xE4 01-03 uapi/linux/sysgenid.h SysGenID misc driver 0xE5 00-3F linux/fuse.h 0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver 0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development) diff --git a/MAINTAINERS b/MAINTAINERS index cdb5f1f22f4c4..b47c8af49faef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16935,6 +16935,14 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-pci-dwc-mshc.c +SYSGENID +M: Adrian Catangiu +L: linux-kernel@vger.kernel.org +S: Supported +F: Documentation/misc-devices/sysgenid.rst +F: drivers/misc/sysgenid.c +F: include/uapi/linux/sysgenid.h + SYSTEM CONFIGURATION (SYSCON) M: Lee Jones M: Arnd Bergmann diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index fafa8b0d80996..a2b7cae7a3595 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -456,6 +456,21 @@ config PVPANIC a paravirtualized device provided by QEMU; it lets a virtual machine (guest) communicate panic events to the host. +config SYSGENID + tristate "System Generation ID driver" + help + This is a System Generation ID driver which provides a system + generation counter. 
The driver exposes FS ops on /dev/sysgenid + through which it can provide information and notifications on system + generation changes that happen because of VM or container snapshots + or cloning. + This enables applications and libraries that store or cache + sensitive information, to know that they need to regenerate it + after process memory has been exposed to potential copying. + + To compile this driver as a module, choose M here: the + module will be called sysgenid. + config HISI_HIKEY_USB tristate "USB GPIO Hub on HiSilicon Hikey 960/970 Platform" depends on (OF && GPIOLIB) || COMPILE_TEST diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index d23231e733303..4b4933d0619dc 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -57,3 +57,4 @@ obj-$(CONFIG_HABANA_AI) += habanalabs/ obj-$(CONFIG_UACCE) += uacce/ obj-$(CONFIG_XILINX_SDFEC) += xilinx_sdfec.o obj-$(CONFIG_HISI_HIKEY_USB) += hisi_hikey_usb.o +obj-$(CONFIG_SYSGENID) += sysgenid.o diff --git a/drivers/misc/sysgenid.c b/drivers/misc/sysgenid.c new file mode 100644 index 0000000000000..ace292b83be4a --- /dev/null +++ b/drivers/misc/sysgenid.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * System Generation ID driver + * + * Copyright (C) 2020 Amazon. All rights reserved. + * + * Authors: + * Adrian Catangiu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sysgenid_data { + unsigned long map_buf; + wait_queue_head_t read_waitq; + atomic_t generation_counter; + + unsigned int watchers; + atomic_t outdated_watchers; + wait_queue_head_t outdated_waitq; + spinlock_t lock; +}; +static struct sysgenid_data sysgenid_data; + +struct file_data { + bool tracked_watcher; + int acked_gen_counter; +}; + +static int equals_gen_counter(unsigned int counter) +{ + return counter == atomic_read(&sysgenid_data.generation_counter); +} + +static void _bump_generation(int min_gen) +{ + unsigned long flags; + int counter; + + spin_lock_irqsave(&sysgenid_data.lock, flags); + counter = max(min_gen, 1 + atomic_read(&sysgenid_data.generation_counter)); + atomic_set(&sysgenid_data.generation_counter, counter); + *((int *) sysgenid_data.map_buf) = counter; + atomic_set(&sysgenid_data.outdated_watchers, sysgenid_data.watchers); + + wake_up_interruptible(&sysgenid_data.read_waitq); + wake_up_interruptible(&sysgenid_data.outdated_waitq); + spin_unlock_irqrestore(&sysgenid_data.lock, flags); +} + +void sysgenid_bump_generation(void) +{ + _bump_generation(0); +} +EXPORT_SYMBOL_GPL(sysgenid_bump_generation); + +static void put_outdated_watchers(void) +{ + if (atomic_dec_and_test(&sysgenid_data.outdated_watchers)) + wake_up_interruptible(&sysgenid_data.outdated_waitq); +} + +static void start_fd_tracking(struct file_data *fdata) +{ + unsigned long flags; + + if (!fdata->tracked_watcher) { + /* enable tracking this fd as a watcher */ + spin_lock_irqsave(&sysgenid_data.lock, flags); + fdata->tracked_watcher = 1; + ++sysgenid_data.watchers; + if (!equals_gen_counter(fdata->acked_gen_counter)) + atomic_inc(&sysgenid_data.outdated_watchers); + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + } +} + +static void stop_fd_tracking(struct file_data *fdata) +{ + unsigned long flags; + + if (fdata->tracked_watcher) { + /* stop tracking this fd as a watcher */ + spin_lock_irqsave(&sysgenid_data.lock, flags); + if (!equals_gen_counter(fdata->acked_gen_counter)) + put_outdated_watchers(); + --sysgenid_data.watchers; + fdata->tracked_watcher = 0; + 
spin_unlock_irqrestore(&sysgenid_data.lock, flags); + } +} + +static int sysgenid_open(struct inode *inode, struct file *file) +{ + struct file_data *fdata = kzalloc(sizeof(struct file_data), GFP_KERNEL); + + if (!fdata) + return -ENOMEM; + fdata->tracked_watcher = 0; + fdata->acked_gen_counter = atomic_read(&sysgenid_data.generation_counter); + file->private_data = fdata; + + return 0; +} + +static int sysgenid_close(struct inode *inode, struct file *file) +{ + struct file_data *fdata = file->private_data; + + stop_fd_tracking(fdata); + kfree(fdata); + + return 0; +} + +static ssize_t sysgenid_read(struct file *file, char __user *ubuf, + size_t nbytes, loff_t *ppos) +{ + struct file_data *fdata = file->private_data; + ssize_t ret; + int gen_counter; + + if (nbytes == 0) + return 0; + /* disallow partial reads */ + if (nbytes < sizeof(gen_counter)) + return -EINVAL; + + if (equals_gen_counter(fdata->acked_gen_counter)) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + ret = wait_event_interruptible( + sysgenid_data.read_waitq, + !equals_gen_counter(fdata->acked_gen_counter) + ); + if (ret) + return ret; + } + + gen_counter = atomic_read(&sysgenid_data.generation_counter); + ret = copy_to_user(ubuf, &gen_counter, sizeof(gen_counter)); + if (ret) + return -EFAULT; + + return sizeof(gen_counter); +} + +static ssize_t sysgenid_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct file_data *fdata = file->private_data; + unsigned int new_acked_gen; + unsigned long flags; + + /* disallow partial writes */ + if (count != sizeof(new_acked_gen)) + return -ENOBUFS; + if (copy_from_user(&new_acked_gen, ubuf, count)) + return -EFAULT; + + spin_lock_irqsave(&sysgenid_data.lock, flags); + /* wrong gen-counter acknowledged */ + if (!equals_gen_counter(new_acked_gen)) { + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + return -EINVAL; + } + /* update acked gen-counter if necessary */ + if (!equals_gen_counter(fdata->acked_gen_counter)) { + fdata->acked_gen_counter = new_acked_gen; + if (fdata->tracked_watcher) + put_outdated_watchers(); + } + spin_unlock_irqrestore(&sysgenid_data.lock, flags); + + return (ssize_t)count; +} + +static __poll_t sysgenid_poll(struct file *file, poll_table *wait) +{ + __poll_t mask = 0; + struct file_data *fdata = file->private_data; + + if (!equals_gen_counter(fdata->acked_gen_counter)) + return EPOLLIN | EPOLLRDNORM; + + poll_wait(file, &sysgenid_data.read_waitq, wait); + + if (!equals_gen_counter(fdata->acked_gen_counter)) + mask = EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static long sysgenid_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct file_data *fdata = file->private_data; + bool tracking = !!arg; + unsigned long timeout_ns, min_gen; + ktime_t until; + int ret = 0; + + switch (cmd) { + case SYSGENID_SET_WATCHER_TRACKING: + if (tracking) + start_fd_tracking(fdata); + else + stop_fd_tracking(fdata); + break; + case SYSGENID_WAIT_WATCHERS: + timeout_ns = arg * NSEC_PER_MSEC; + until = timeout_ns ? 
ktime_set(0, timeout_ns) : KTIME_MAX; + + ret = wait_event_interruptible_hrtimeout( + sysgenid_data.outdated_waitq, + (!atomic_read(&sysgenid_data.outdated_watchers) || + !equals_gen_counter(fdata->acked_gen_counter)), + until + ); + if (!equals_gen_counter(fdata->acked_gen_counter)) + ret = -EINTR; + break; + case SYSGENID_TRIGGER_GEN_UPDATE: + if (!checkpoint_restore_ns_capable(current_user_ns())) + return -EACCES; + min_gen = arg; + _bump_generation(min_gen); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int sysgenid_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file_data *fdata = file->private_data; + + if (vma->vm_pgoff != 0 || vma_pages(vma) > 1) + return -EINVAL; + + if ((vma->vm_flags & VM_WRITE) != 0) + return -EPERM; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_private_data = fdata; + + return vm_insert_page(vma, vma->vm_start, + virt_to_page(sysgenid_data.map_buf)); +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .mmap = sysgenid_mmap, + .open = sysgenid_open, + .release = sysgenid_close, + .read = sysgenid_read, + .write = sysgenid_write, + .poll = sysgenid_poll, + .unlocked_ioctl = sysgenid_ioctl, +}; + +static struct miscdevice sysgenid_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sysgenid", + .fops = &fops, +}; + +static int __init sysgenid_init(void) +{ + int ret; + + sysgenid_data.map_buf = get_zeroed_page(GFP_KERNEL); + if (!sysgenid_data.map_buf) + return -ENOMEM; + + atomic_set(&sysgenid_data.generation_counter, 0); + atomic_set(&sysgenid_data.outdated_watchers, 0); + init_waitqueue_head(&sysgenid_data.read_waitq); + init_waitqueue_head(&sysgenid_data.outdated_waitq); + spin_lock_init(&sysgenid_data.lock); + + ret = misc_register(&sysgenid_misc); + if (ret < 0) { + pr_err("misc_register() failed for sysgenid\n"); + goto err; + } + + return 0; + +err: + free_pages(sysgenid_data.map_buf, 0); + sysgenid_data.map_buf = 0; + + return ret; +} + +static void __exit sysgenid_exit(void) +{ + misc_deregister(&sysgenid_misc); + free_pages(sysgenid_data.map_buf, 0); + sysgenid_data.map_buf = 0; +} + +module_init(sysgenid_init); +module_exit(sysgenid_exit); + +MODULE_AUTHOR("Adrian Catangiu"); +MODULE_DESCRIPTION("System Generation ID"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); diff --git a/include/uapi/linux/sysgenid.h b/include/uapi/linux/sysgenid.h new file mode 100644 index 0000000000000..7279df61bd84b --- /dev/null +++ b/include/uapi/linux/sysgenid.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_SYSGENID_H +#define _UAPI_LINUX_SYSGENID_H + +#include + +#define SYSGENID_IOCTL 0xE4 +#define SYSGENID_SET_WATCHER_TRACKING _IO(SYSGENID_IOCTL, 1) +#define SYSGENID_WAIT_WATCHERS _IO(SYSGENID_IOCTL, 2) +#define SYSGENID_TRIGGER_GEN_UPDATE _IO(SYSGENID_IOCTL, 3) + +#ifdef __KERNEL__ +void sysgenid_bump_generation(void); +#endif /* __KERNEL__ */ + +#endif /* _UAPI_LINUX_SYSGENID_H */ + From 944ff83c535d426a2e13190d2d3cc967263cadd4 Mon Sep 17 00:00:00 2001 From: Adrian Catangiu Date: Wed, 24 Feb 2021 10:47:32 +0200 Subject: [PATCH 131/737] drivers/virt: vmgenid: add vm generation id driver The VM Generation ID is a feature defined by Microsoft (paper: http://go.microsoft.com/fwlink/?LinkId=260709) and supported by multiple hypervisor vendors. 
The feature can be used to drive the `sysgenid` mechanism required in virtualized environments by software that works with local copies and caches of world-unique data such as random values, uuids, monotonically increasing counters, etc. The VM Generation ID is a hypervisor/hardware provided 128-bit unique ID that changes each time the VM is restored from a snapshot. It can be used to differentiate between VMs or different generations of the same VM. This VM Generation ID is exposed through an ACPI device by multiple hypervisor vendors. The `vmgenid` driver acts as a backend for the `sysgenid` kernel module (`drivers/misc/sysgenid.c`, `Documentation/misc-devices/sysgenid.rst`) to drive changes to the "System Generation Id" which is further exposed to userspace as a monotonically increasing counter. The driver uses ACPI events to be notified by hardware of changes to the 128-bit Vm Gen Id UUID. Since the actual UUID value is not directly exposed to userspace, but only used to drive the System Generation Counter, the driver also adds it as device randomness to improve kernel entropy following VM snapshot events. This patch builds on top of Or Idgar 's proposal https://lkml.org/lkml/2018/3/1/498 Signed-off-by: Adrian Catangiu --- Documentation/virt/vmgenid.rst | 36 ++++++++ MAINTAINERS | 7 ++ drivers/virt/Kconfig | 13 +++ drivers/virt/Makefile | 1 + drivers/virt/vmgenid.c | 153 +++++++++++++++++++++++++++++++++ 5 files changed, 210 insertions(+) create mode 100644 Documentation/virt/vmgenid.rst create mode 100644 drivers/virt/vmgenid.c diff --git a/Documentation/virt/vmgenid.rst b/Documentation/virt/vmgenid.rst new file mode 100644 index 0000000000000..a429c2a347ef3 --- /dev/null +++ b/Documentation/virt/vmgenid.rst @@ -0,0 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +VMGENID +======= + +The VM Generation ID is a feature defined by Microsoft (paper: +http://go.microsoft.com/fwlink/?LinkId=260709) and supported by +multiple hypervisor vendors. + +The feature is required in virtualized environments by applications +that work with local copies/caches of world-unique data such as random +values, UUIDs, monotonically increasing counters, etc. +Such applications can be negatively affected by VM snapshotting when +the VM is either cloned or returned to an earlier point in time. + +The VM Generation ID is a simple concept through which a hypevisor +notifies its guest that a snapshot has taken place. The vmgenid device +provides a unique ID that changes each time the VM is restored from a +snapshot. The hardware provided UUID value can be used to differentiate +between VMs or different generations of the same VM. + +The VM Generation ID is exposed through an ACPI device by multiple +hypervisor vendors. The driver for it lives at +``drivers/virt/vmgenid.c`` + +The ``vmgenid`` driver acts as a backend for the ``sysgenid`` kernel module +(``drivers/misc/sysgenid.c``, ``Documentation/misc-devices/sysgenid.rst``) +to drive changes to the "System Generation Id" which is further exposed +to userspace as a monotonically increasing counter. + +The driver uses ACPI events to be notified by hardware of changes to the +128-bit Vm Gen Id UUID. Since the actual UUID value is not directly exposed +to userspace, but only used to drive the System Generation Counter, the +driver also adds it as device randomness to improve kernel entropy +following VM snapshot events. 
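Because vmgenid only bumps the counter, the overseer role described in the sysgenid documentation still applies after a snapshot resume. A hypothetical overseer wait loop, built only on the sysgenid uapi introduced earlier in this series (the retry policy and lack of timeout are assumptions), might look like:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sysgenid.h>

int main(void)
{
	unsigned int gen;
	/* O_NONBLOCK so the acknowledgement read below never blocks. */
	int fd = open("/dev/sysgenid", O_RDWR | O_NONBLOCK);

	if (fd < 0)
		return 1;

	for (;;) {
		/* Argument 0: wait without timeout (a ms value may be passed). */
		if (ioctl(fd, SYSGENID_WAIT_WATCHERS, 0UL) == 0)
			break;		/* every tracked watcher has confirmed */
		if (errno != EINTR)
			return 1;
		/*
		 * EINTR: a generation change made this caller outdated (or a
		 * signal arrived); acknowledge the latest counter and retry.
		 */
		if (read(fd, &gen, sizeof(gen)) == sizeof(gen))
			write(fd, &gen, sizeof(gen));
	}
	puts("system readjusted, un-quiescing workload");
	close(fd);
	return 0;
}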
diff --git a/MAINTAINERS b/MAINTAINERS index b47c8af49faef..621cdab9fe585 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18715,6 +18715,13 @@ F: drivers/staging/vme/ F: drivers/vme/ F: include/linux/vme* +VMGENID +M: Adrian Catangiu +L: linux-kernel@vger.kernel.org +S: Supported +F: Documentation/virt/vmgenid.rst +F: drivers/virt/vmgenid.c + VMWARE BALLOON DRIVER M: Nadav Amit M: "VMware, Inc." diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index 80c5f9c16ec17..95d82c97fdc4d 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -13,6 +13,19 @@ menuconfig VIRT_DRIVERS if VIRT_DRIVERS +config VMGENID + tristate "Virtual Machine Generation ID driver" + depends on ACPI && SYSGENID + help + The driver uses the hypervisor provided Virtual Machine Generation ID + to drive the system generation counter mechanism exposed by sysgenid. + The vmgenid changes on VM snapshots or VM cloning. The hypervisor + provided 128-bit vmgenid is also used as device randomness to improve + kernel entropy following VM snapshot events. + + To compile this driver as a module, choose M here: the + module will be called vmgenid. + config FSL_HV_MANAGER tristate "Freescale hypervisor management driver" depends on FSL_SOC diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index f28425ce4b39b..889be010884b4 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -4,6 +4,7 @@ # obj-$(CONFIG_FSL_HV_MANAGER) += fsl_hypervisor.o +obj-$(CONFIG_VMGENID) += vmgenid.o obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ diff --git a/drivers/virt/vmgenid.c b/drivers/virt/vmgenid.c new file mode 100644 index 0000000000000..d9d089a6c4a59 --- /dev/null +++ b/drivers/virt/vmgenid.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtual Machine Generation ID driver + * + * Copyright (C) 2018 Red Hat Inc. All rights reserved. + * + * Copyright (C) 2020 Amazon. All rights reserved. 
+ * + * Authors: + * Adrian Catangiu + * Or Idgar + * Gal Hammer + * + */ +#include +#include +#include +#include +#include +#include + +#define DEV_NAME "vmgenid" +ACPI_MODULE_NAME(DEV_NAME); + +struct vmgenid_data { + uuid_t uuid; + void *uuid_iomap; +}; +static struct vmgenid_data vmgenid_data; + +static int vmgenid_acpi_map(struct vmgenid_data *priv, acpi_handle handle) +{ + int i; + phys_addr_t phys_addr; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + acpi_status status; + union acpi_object *pss; + union acpi_object *element; + + status = acpi_evaluate_object(handle, "ADDR", NULL, &buffer); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, "Evaluating ADDR")); + return -ENODEV; + } + pss = buffer.pointer; + if (!pss || pss->type != ACPI_TYPE_PACKAGE || pss->package.count != 2) + return -EINVAL; + + phys_addr = 0; + for (i = 0; i < pss->package.count; i++) { + element = &(pss->package.elements[i]); + if (element->type != ACPI_TYPE_INTEGER) + return -EINVAL; + phys_addr |= element->integer.value << i * 32; + } + + priv->uuid_iomap = acpi_os_map_memory(phys_addr, sizeof(uuid_t)); + if (!priv->uuid_iomap) { + pr_err("Could not map memory at 0x%llx, size %u\n", + phys_addr, + (u32) sizeof(uuid_t)); + return -ENOMEM; + } + + memcpy_fromio(&priv->uuid, priv->uuid_iomap, sizeof(uuid_t)); + + return 0; +} + +static int vmgenid_acpi_add(struct acpi_device *device) +{ + int ret; + + if (!device) + return -EINVAL; + device->driver_data = &vmgenid_data; + + ret = vmgenid_acpi_map(device->driver_data, device->handle); + if (ret < 0) { + pr_err("vmgenid: failed to map acpi device\n"); + device->driver_data = NULL; + } + + return ret; +} + +static int vmgenid_acpi_remove(struct acpi_device *device) +{ + if (!device || acpi_driver_data(device) != &vmgenid_data) + return -EINVAL; + device->driver_data = NULL; + + if (vmgenid_data.uuid_iomap) + acpi_os_unmap_memory(vmgenid_data.uuid_iomap, sizeof(uuid_t)); + vmgenid_data.uuid_iomap = NULL; + + return 0; +} + +static void vmgenid_acpi_notify(struct acpi_device *device, u32 event) +{ + uuid_t old_uuid; + + if (!device || acpi_driver_data(device) != &vmgenid_data) { + pr_err("VMGENID notify with unexpected driver private data\n"); + return; + } + + /* update VM Generation UUID */ + old_uuid = vmgenid_data.uuid; + memcpy_fromio(&vmgenid_data.uuid, vmgenid_data.uuid_iomap, sizeof(uuid_t)); + + if (memcmp(&old_uuid, &vmgenid_data.uuid, sizeof(uuid_t))) { + /* HW uuid updated */ + sysgenid_bump_generation(); + add_device_randomness(&vmgenid_data.uuid, sizeof(uuid_t)); + } +} + +static const struct acpi_device_id vmgenid_ids[] = { + {"VMGENID", 0}, + {"QEMUVGID", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_vmgenid_driver = { + .name = "vm_generation_id", + .ids = vmgenid_ids, + .owner = THIS_MODULE, + .ops = { + .add = vmgenid_acpi_add, + .remove = vmgenid_acpi_remove, + .notify = vmgenid_acpi_notify, + } +}; + +static int __init vmgenid_init(void) +{ + return acpi_bus_register_driver(&acpi_vmgenid_driver); +} + +static void __exit vmgenid_exit(void) +{ + acpi_bus_unregister_driver(&acpi_vmgenid_driver); +} + +module_init(vmgenid_init); +module_exit(vmgenid_exit); + +MODULE_AUTHOR("Adrian Catangiu"); +MODULE_DESCRIPTION("Virtual Machine Generation ID"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); From eaa20ad1afe62f7d9c859b0bea648e74ae025e9b Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Wed, 15 Sep 2021 23:41:15 +0000 Subject: [PATCH 132/737] mm, memcg: throttle the memory reclaim given dirty/writeback pages to 
avoid early OOMs This is the improved workaround to avoid early OOMs within cgroup v1 by throttling the memory reclaim given dirty/writeback pages under the GFP_NOFS allocations. Increment sleeping time exponentialy until a limit after half the number of maximum retries when writeback+dirty pages goes beyond a certain threshold before next retry occurs. This solution can not only help to prevent early OOMs on some extreme workload but also avoid unnecessary throttling on general cases. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207273 Suggested-by: Michal Hocko Signed-off-by: Shaoying Xu --- mm/memcontrol.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 751e3670d7b0c..504f9210df1b0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2679,6 +2679,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MAX_RECLAIM_RETRIES; + int timeout = 1; struct mem_cgroup *mem_over_limit; struct page_counter *counter; enum oom_status oom_status; @@ -2770,7 +2771,25 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; - + /* + * Legacy memcg relies on dirty data throttling during the reclaim + * but this cannot be done for GFP_NOFS requests so we might trigger + * the oom way too early. Throttle here if we have way too many + * dirty/writeback pages. + */ + if ((nr_retries < MAX_RECLAIM_RETRIES/2) && + !cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !(gfp_mask & __GFP_FS)) { + unsigned long dirty = memcg_page_state(memcg, NR_FILE_DIRTY); + unsigned long writeback = memcg_page_state(memcg, NR_WRITEBACK); + + if (4*(dirty + writeback) > + 3*page_counter_read(&memcg->memory)) { + schedule_timeout_interruptible(timeout); + if (timeout < 32) + timeout *= 2; + } + } if (nr_retries--) goto retry; @@ -2794,6 +2813,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (oom_status == OOM_SUCCESS) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; + timeout = 1; goto retry; } nomem: From 283eb987d8a0cf08b0ef88bff0948a56708fca7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Mar 2021 11:08:15 -0700 Subject: [PATCH 133/737] ipv4: convert ip_forward_update_priority sysctl to u8 This sysctl uses ip_fwd_update_priority() helper, so the conversion needs to change it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller (cherry picked from commit 1c69dedc8fa7c9684d48dc89994b4e0aceeae588) --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9b0d8649ae5b8..03d8c6c42db15 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,7 +98,7 @@ struct netns_ipv4 { u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; u8 sysctl_ip_fwd_use_pmtu; - int sysctl_ip_fwd_update_priority; + u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? 
*/ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 59ba518a85b9c..92f9786e8f553 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -209,7 +209,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); @@ -688,7 +688,7 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, From 60c4c491eac9622355ca90c87c5b11c83525cc20 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Nov 2022 00:58:37 +0000 Subject: [PATCH 134/737] inet: convert tcp_early_demux and udp_early_demux to u8 For these sysctls, their dedicated helpers have to use proc_dou8vec_minmax(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller (cherry picked from commit 2932bcda070d9a02548e57119b1ada8f018c40b5) Signed-off-by: SeongJae Park --- include/net/netns/ipv4.h | 4 ++-- net/ipv4/sysctl_net_ipv4.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 03d8c6c42db15..0bdfdb1ac8ac0 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -107,8 +107,8 @@ struct netns_ipv4 { #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - int sysctl_tcp_early_demux; - int sysctl_udp_early_demux; + u8 sysctl_tcp_early_demux; + u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 92f9786e8f553..2afa5434c0421 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -628,16 +628,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_douintvec_minmax, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_douintvec_minmax, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "nexthop_compat_mode", From adbc80c03ed580a9bebfd14b70850969d8dc89c1 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 10 Feb 2021 12:14:03 +0100 Subject: [PATCH 135/737] bpf: Expose bpf_get_socket_cookie to tracing programs This needs a new helper that: - can work in a sleepable context (using sock_gen_cookie) - takes a struct sock pointer and checks that it's not NULL Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210111406.785541-2-revest@chromium.org (cherry picked from commit c5dbb89fc2ac013afe67b9e4fcb3743c02b567cd) --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 12 ++++++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 31 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b010d45a1ecd5..791e936fc5971 100644 --- a/include/linux/bpf.h +++ 
b/include/linux/bpf.h @@ -1899,6 +1899,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a234023821e3..0e3582d59c7b4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1661,6 +1661,14 @@ union bpf_attr { * Return * A 8-byte long non-decreasing number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1de9a6bf84711..46e7d3b67eb5b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1733,6 +1733,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index b9c954182b375..d105b30ecb463 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4640,6 +4640,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7943e748916d4..d07ceba1b6b5c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1661,6 +1661,14 @@ union bpf_attr { * Return * A 8-byte long non-decreasing number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket From bb948c42a7d4f1acd8204bfe05ec0929cf8009d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 1 Jul 2021 16:12:56 +0000 Subject: [PATCH 136/737] bpf: Add ASSERT_NEQ(), ASSERT_FALSE(), and ASSERT_GE() for selftest. ASSERT_NEQ() is copied from 197389da2fbf ("selftests/bpf: Add split BTF basic test"), ASSERT_FALSE() and ASSERT_GE() are from 7a2fa70aaffc ("selftests/bpf: Add remaining ASSERT_xxx() variants"). 
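For readability, here is a usage sketch of the new variants (illustrative only, not part of this patch; it assumes the usual test_progs.h and socket headers are in scope). Like ASSERT_EQ(), each macro evaluates to a bool, so a failed assertion can gate an early return:

static void test_assert_variants(void)
{
	int fd, ret;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (!ASSERT_GE(fd, 0, "socket"))	/* false (and logged) if fd < 0 */
		return;

	ret = listen(fd, 1);
	ASSERT_NEQ(ret, -1, "listen");		/* fails if listen() returned -1 */
	ASSERT_FALSE(ret < 0, "listen error");	/* fails if (ret < 0) is true */

	close(fd);
}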
Signed-off-by: Kuniyuki Iwashima --- tools/testing/selftests/bpf/test_progs.h | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 1d429d67f8ddc..e673d4936e42d 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -129,6 +129,13 @@ extern int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_FALSE(actual, name) ({ \ + static int duration = 0; \ + bool ___ok = !(actual); \ + CHECK(!___ok, (name), "unexpected %s: got TRUE\n", (name)); \ + ___ok; \ +}) + #define ASSERT_EQ(actual, expected, name) ({ \ static int duration = 0; \ typeof(actual) ___act = (actual); \ @@ -140,6 +147,28 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_NEQ(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act != ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld == expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act >= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld < expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ From bdc20d31068db337c250f4b6148eb87965ff4393 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:15 +0900 Subject: [PATCH 137/737] tcp: Add num_closed_socks to struct sock_reuseport. As noted in the following commit, a closed listener has to hold the reference to the reuseport group for socket migration. This patch adds a field (num_closed_socks) to struct sock_reuseport to manage closed sockets within the same reuseport group. Moreover, this and the following commits introduce some helper functions to split socks[] into two sections and keep TCP_LISTEN and TCP_CLOSE sockets in each section. Like a double-ended queue, we will place TCP_LISTEN sockets from the front and TCP_CLOSE sockets from the end. TCP_LISTEN----------> <-------TCP_CLOSE +---+---+ --- +---+ --- +---+ --- +---+ | 0 | 1 | ... | i | ... | j | ... | k | +---+---+ --- +---+ --- +---+ --- +---+ i = num_socks - 1 j = max_socks - num_closed_socks k = max_socks - 1 This patch also extends reuseport_add_sock() and reuseport_grow() to support num_closed_socks. 
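To make the indices in the diagram concrete, the following standalone sketch (illustrative values only; nothing here comes from the patch itself) prints the two sections for a group with max_socks = 8, num_socks = 3 and num_closed_socks = 2:

#include <stdio.h>

int main(void)
{
	unsigned int max_socks = 8, num_socks = 3, num_closed_socks = 2;

	/* TCP_LISTEN sockets fill socks[] from the front: [0, num_socks) */
	printf("listen section: [0, %u), i = %u\n",
	       num_socks, num_socks - 1);
	/* TCP_CLOSE sockets fill socks[] from the end:
	 * [max_socks - num_closed_socks, max_socks)
	 */
	printf("closed section: [%u, %u), j = %u, k = %u\n",
	       max_socks - num_closed_socks, max_socks,
	       max_socks - num_closed_socks, max_socks - 1);
	return 0;
}

With these values the listen section is [0, 3) and the closed section is [6, 8), matching i = 2, j = 6, k = 7 in the diagram above.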
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-3-kuniyu@amazon.co.jp (cherry picked from commit 5c040eaf5d1753aafe12989ca712175df0b9c436) --- include/net/sock_reuseport.h | 5 ++- net/core/sock_reuseport.c | 75 +++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 3eac185ae2e8a..6348c6f26903e 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; - u16 max_socks; /* length of socks */ - u16 num_socks; /* elements in socks */ + u16 max_socks; /* length of socks */ + u16 num_socks; /* elements in socks */ + u16 num_closed_socks; /* closed elements in socks */ /* The last synq overflow event timestamp of this * reuse->socks[] group. */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 49f9c2c4ffd5a..364cf6c6912bb 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -34,6 +34,49 @@ void reuseport_has_conns_set(struct sock *sk) } EXPORT_SYMBOL(reuseport_has_conns_set); +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_select_sock() */ + smp_wmb(); + reuse->num_socks++; +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + + return true; +} + static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { unsigned int size = sizeof(struct sock_reuseport) + @@ -88,9 +131,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) } reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; - reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -114,6 +157,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) return NULL; more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; @@ -121,9 +165,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); - for (i = 0; i < reuse->num_socks; ++i) + for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); @@ -174,7 +222,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool 
bind_inany) return -EBUSY; } - if (reuse->num_socks == reuse->max_socks) { + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -182,10 +230,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } } - reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ - smp_wmb(); - reuse->num_socks++; + __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); @@ -199,7 +244,6 @@ EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; - int i; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, @@ -216,16 +260,11 @@ void reuseport_detach_sock(struct sock *sk) bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); - for (i = 0; i < reuse->num_socks; i++) { - if (reuse->socks[i] == sk) { - reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; - reuse->num_socks--; - if (reuse->num_socks == 0) - call_rcu(&reuse->rcu, reuseport_free_rcu); - break; - } - } spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); @@ -290,7 +329,7 @@ struct sock *reuseport_select_sock(struct sock *sk, prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { - /* paired with smp_wmb() in reuseport_add_sock() */ + /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) From 1a65c6605d100b4b877137666b341af1b8f870fd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:16 +0900 Subject: [PATCH 138/737] tcp: Keep TCP_CLOSE sockets in the reuseport group. When we close a listening socket, to migrate its connections to another listener in the same reuseport group, we have to handle two kinds of child sockets. One is that a listening socket has a reference to, and the other is not. The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the accept queue of their listening socket. So we can pop them out and push them into another listener's queue at close() or shutdown() syscalls. On the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the three-way handshake and not in the accept queue. Thus, we cannot access such sockets at close() or shutdown() syscalls. Accordingly, we have to migrate immature sockets after their listening socket has been closed. Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At that time, if we could select a new listener from the same reuseport group, no connection would be aborted. However, we cannot do that because reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to the reuseport group from closed sockets. This patch allows TCP_CLOSE sockets to remain in the reuseport group and access it while any child socket references them. The point is that reuseport_detach_sock() was called twice from inet_unhash() and sk_destruct(). This patch replaces the first reuseport_detach_sock() with reuseport_stop_listen_sock(), which checks if the reuseport group is capable of migration. If capable, it decrements num_socks, moves the socket backwards in socks[] and increments num_closed_socks. 
When all connections are migrated, sk_destruct() calls reuseport_detach_sock() to remove the socket from socks[], decrement num_closed_socks, and set NULL to sk_reuseport_cb. By this change, closed or shutdowned sockets can keep sk_reuseport_cb. Consequently, calling listen() after shutdown() can cause EADDRINUSE or EBUSY in inet_csk_bind_conflict() or reuseport_add_sock() which expects such sockets not to have the reuseport group. Therefore, this patch also loosens such validation rules so that a socket can listen again if it has a reuseport group with num_closed_socks more than 0. When such sockets listen again, we handle them in reuseport_resurrect(). If there is an existing reuseport group (reuseport_add_sock() path), we move the socket from the old group to the new one and free the old one if necessary. If there is no existing group (reuseport_alloc() path), we allocate a new reuseport group, detach sk from the old one, and free it if necessary, not to break the current shutdown behaviour: - we cannot carry over the eBPF prog of shutdowned sockets - we cannot attach/detach an eBPF prog to/from listening sockets via shutdowned sockets Note that when the number of sockets gets over U16_MAX, we try to detach a closed socket randomly to make room for the new listening socket in reuseport_grow(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20210612123224.12525-4-kuniyu@amazon.co.jp (cherry picked from commit 333bb73f620e1a5f2e0b8df2c0d25300fab36d89) --- include/net/sock_reuseport.h | 1 + net/core/sock_reuseport.c | 182 ++++++++++++++++++++++++++++++-- net/ipv4/inet_connection_sock.c | 12 ++- net/ipv4/inet_hashtables.c | 2 +- 4 files changed, 186 insertions(+), 11 deletions(-) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 6348c6f26903e..7e8a4ed0e8eac 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -32,6 +32,7 @@ extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); +void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 364cf6c6912bb..3a9415648be64 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -17,6 +17,8 @@ DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); void reuseport_has_conns_set(struct sock *sk) { @@ -77,6 +79,29 @@ static bool __reuseport_detach_sock(struct sock *sk, return true; } +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, 
reuse->num_closed_socks - 1); + + return true; +} + static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { unsigned int size = sizeof(struct sock_reuseport) + @@ -108,6 +133,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. @@ -149,8 +180,23 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; - if (more_socks_size > U16_MAX) + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. + */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + return NULL; + } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) @@ -216,7 +262,15 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)); + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; @@ -241,6 +295,65 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } EXPORT_SYMBOL(reuseport_add_sock); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. + */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. 
+ */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; @@ -249,6 +362,10 @@ void reuseport_detach_sock(struct sock *sk) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * @@ -260,15 +377,49 @@ void reuseport_detach_sock(struct sock *sk) bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); - __reuseport_detach_sock(sk, reuse); + + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); if (reuse->num_socks + reuse->num_closed_socks == 0) call_rcu(&reuse->rcu, reuseport_free_rcu); +out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) { + /* Migration capable, move sk from the listening section + * to the closed section. + */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) @@ -368,9 +519,13 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (sk_unhashed(sk) && sk->sk_reuseport) { - int err = reuseport_alloc(sk, false); + if (sk_unhashed(sk)) { + int err; + if (!sk->sk_reuseport) + return -EINVAL; + + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -396,13 +551,24 @@ int reuseport_detach_prog(struct sock *sk) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return sk->sk_reuseport ? -ENOENT : -EINVAL; - old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? 
-ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5f71a1c74e7e0..473e8a1e4105b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk, bool relax, bool reuseport_ok) { struct sock *sk2; + bool reuseport_cb_ok; bool reuse = sk->sk_reuse; bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + rcu_read_lock(); + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); + rcu_read_unlock(); + /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed @@ -160,14 +168,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, if ((!relax || (!reuseport_ok && reuseport && sk2->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && + reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2))))) && inet_rcv_saddr_equal(sk, sk2, true)) break; } else if (!reuseport_ok || !reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || + !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2)))) { if (inet_rcv_saddr_equal(sk, sk2, true)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ad050f8476b8e..196acc78d8a2e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -679,7 +679,7 @@ static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) return; if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); + reuseport_stop_listen_sock(sk); if (ilb) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; From 435dba56ac4bee2b4bbd038567998c7e7f35e93b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:17 +0900 Subject: [PATCH 139/737] tcp: Add reuseport_migrate_sock() to select a new listener. reuseport_migrate_sock() does the same check done in reuseport_listen_stop_sock(). If the reuseport group is capable of migration, reuseport_migrate_sock() selects a new listener by the child socket hash and increments the listener's sk_refcnt beforehand. Thus, if we fail in the migration, we have to decrement it later. We will support migration by eBPF in the later commits. 
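For background, the user-space side of the scenario these patches address is a plain SO_REUSEPORT listener group. The sketch below is illustrative only (the port, backlog and error handling are arbitrary choices, not taken from this series): it opens one member of such a group, and closing one of these sockets, with the net.ipv4.tcp_migrate_req sysctl referenced in these patches enabled, is what lets reuseport_migrate_sock() pick a new listener for its queued children.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create one member of an SO_REUSEPORT listener group on port 8080. */
static int open_reuseport_listener(void)
{
	struct sockaddr_in addr;
	int one = 1, fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
	    listen(fd, 128)) {
		close(fd);
		return -1;
	}

	return fd;
}

/* Calling this several times from different processes or threads yields a
 * reuseport group; closing one returned fd is the point where the kernel
 * can migrate that listener's pending children to a surviving one.
 */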
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20210612123224.12525-5-kuniyu@amazon.co.jp (cherry picked from commit 1cd62c21572c1df6e7090ea4cabf4cf509616dbb) --- include/net/sock_reuseport.h | 3 ++ net/core/sock_reuseport.c | 78 +++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 7e8a4ed0e8eac..efc9085c68927 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -37,6 +37,9 @@ extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3a9415648be64..3bdabad5f1dd8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -60,7 +60,7 @@ static void __reuseport_add_sock(struct sock *sk, struct sock_reuseport *reuse) { reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; } @@ -450,6 +450,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, return reuse->socks[index]; } +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= num_socks) + i = 0; + if (i == j) + return NULL; + } + + return reuse->socks[i]; +} + /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. @@ -493,19 +510,8 @@ struct sock *reuseport_select_sock(struct sock *sk, select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) { - int i, j; - - i = j = reciprocal_scale(hash, socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { - i++; - if (i >= socks) - i = 0; - if (i == j) - goto out; - } - sk2 = reuse->socks[i]; - } + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: @@ -514,6 +520,50 @@ struct sock *reuseport_select_sock(struct sock *sk, } EXPORT_SYMBOL(reuseport_select_sock); +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). 
+ */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto out; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + nsk = NULL; + +out: + rcu_read_unlock(); + return nsk; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; From ed4347422e04ec07e842a939260bac61e54a3118 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:18 +0900 Subject: [PATCH 140/737] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues. When we call close() or shutdown() for listening sockets, each child socket in the accept queue are freed at inet_csk_listen_stop(). If we can get a new listener by reuseport_migrate_sock() and clone the request by inet_reqsk_clone(), we try to add it into the new listener's accept queue by inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free() to call sock_put() for its listener and free the cloned request. After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets NULL to ireq_opt/pktopts in struct inet_request_sock, but ipv6_opt can be non-NULL. So, we have to set NULL to ipv6_opt of the old request to avoid double free. Note that we do not update req->rsk_listener and instead clone the req to migrate because another path may reference the original request. If we protected it by RCU, we would need to add rcu_read_lock() in many places. 
Suggested-by: Martin KaFai Lau Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-6-kuniyu@amazon.co.jp (cherry picked from commit 54b92e84193749c9968aff2dd46e3b0f42643e18) --- net/ipv4/inet_connection_sock.c | 70 ++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 473e8a1e4105b..e1a46ae6501e5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -699,6 +699,52 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock *inet_reqsk_clone(struct request_sock *req, + struct sock *sk) +{ + struct sock *req_sk, *nreq_sk; + struct request_sock *nreq; + + nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!nreq) { + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ + sock_put(sk); + return NULL; + } + + req_sk = req_to_sk(req); + nreq_sk = req_to_sk(nreq); + + memcpy(nreq_sk, req_sk, + offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + + sk_node_init(&nreq_sk->sk_node); + nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; +#ifdef CONFIG_XPS + nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; +#endif + nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; + + nreq->rsk_listener = sk; + + /* We need not acquire fastopenq->lock + * because the child socket is locked in inet_csk_listen_stop(). + */ + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) + rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); + + return nreq; +} + +static void reqsk_migrate_reset(struct request_sock *req) +{ +#if IS_ENABLED(CONFIG_IPV6) + inet_rsk(req)->ipv6_opt = NULL; +#endif +} + /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { @@ -1057,14 +1103,36 @@ void inet_csk_listen_stop(struct sock *sk) * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { - struct sock *child = req->sk; + struct sock *child = req->sk, *nsk; + struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); + nsk = reuseport_migrate_sock(sk, child, NULL); + if (nsk) { + nreq = inet_reqsk_clone(req, nsk); + if (nreq) { + refcount_set(&nreq->rsk_refcnt, 1); + + if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + reqsk_migrate_reset(req); + } else { + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } + + /* inet_csk_reqsk_queue_add() has already + * called inet_child_forget() on failure case. + */ + goto skip_child_forget; + } + } + inet_child_forget(sk, req, child); +skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); From 13c8d8fcc80ecaa3deec4673d6292eaa3e2dedb4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:19 +0900 Subject: [PATCH 141/737] tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs. As with the preceding patch, this patch changes reqsk_timer_handler() to call reuseport_migrate_sock() and inet_reqsk_clone() to migrate in-flight requests at retransmitting SYN+ACKs. 
If we can select a new listener and clone the request, we resume setting the SYN+ACK timer for the new req. If we can set the timer, we call inet_ehash_insert() to unhash the old req and put the new req into ehash. The noteworthy point here is that by unhashing the old req, another CPU processing it may lose the "own_req" race in tcp_v[46]_syn_recv_sock() and drop the final ACK packet. However, the new timer will recover this situation. [ Hailmo: resolved conflict with upstream patches 7b00849 and 9168bd8 when rebasing onto 5.10.190 ] Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-7-kuniyu@amazon.co.jp (cherry picked from commit c905dee62232db583b50fe214080b98db623151e) --- net/ipv4/inet_connection_sock.c | 75 ++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e1a46ae6501e5..08685631d5631 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -738,10 +738,22 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req, return nreq; } +static void reqsk_queue_migrated(struct request_sock_queue *queue, + const struct request_sock *req) +{ + if (req->num_timeout == 0) + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); +} + static void reqsk_migrate_reset(struct request_sock *req) { + req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; + inet_rsk(req)->pktopts = NULL; +#else + inet_rsk(req)->ireq_opt = NULL; #endif } @@ -785,15 +797,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; - struct net *net = sock_net(sk_listener); - struct inet_connection_sock *icsk = inet_csk(sk_listener); - struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) - goto drop; + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { + struct sock *nsk; + + nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); + if (!nsk) + goto drop; + + nreq = inet_reqsk_clone(req, nsk); + if (!nreq) + goto drop; + + /* The new timer for the cloned req can decrease the 2 + * by calling inet_csk_reqsk_queue_drop_and_put(), so + * hold another count to prevent use-after-free and + * call reqsk_put() just before return. + */ + refcount_set(&nreq->rsk_refcnt, 2 + 1); + timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); + + req = nreq; + sk_listener = nsk; + } + icsk = inet_csk(sk_listener); + net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature @@ -813,6 +849,7 @@ static void reqsk_timer_handler(struct timer_list *t) * embrions; and abort old ones without pity, if old * ones are about to clog our table. 
*/ + queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; @@ -837,10 +874,36 @@ static void reqsk_timer_handler(struct timer_list *t) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer(&req->rsk_timer, jiffies + timeo); + + if (!nreq) + return; + + if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { + /* delete timer */ + inet_csk_reqsk_queue_drop(sk_listener, nreq); + goto drop; + } + + reqsk_migrate_reset(oreq); + reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); + reqsk_put(oreq); + + reqsk_put(nreq); return; } + drop: - inet_csk_reqsk_queue_drop_and_put(sk_listener, req); + /* Even if we can clone the req, we may need not retransmit any more + * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another + * CPU may win the "own_req" race so that inet_ehash_insert() fails. + */ + if (nreq) { + reqsk_migrate_reset(nreq); + reqsk_queue_removed(queue, nreq); + __reqsk_free(nreq); + } + + inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, From 4f932e779186b7277479c1646e2da675679b7703 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:20 +0900 Subject: [PATCH 142/737] tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK. This patch also changes the code to call reuseport_migrate_sock() and inet_reqsk_clone(), but unlike the other cases, we do not call inet_reqsk_clone() right after reuseport_migrate_sock(). Currently, in the receive path for TCP_NEW_SYN_RECV sockets, its listener has three kinds of refcnt: (A) for listener itself (B) carried by reuqest_sock (C) sock_hold() in tcp_v[46]_rcv() While processing the req, (A) may disappear by close(listener). Also, (B) can disappear by accept(listener) once we put the req into the accept queue. So, we have to hold another refcnt (C) for the listener to prevent use-after-free. For socket migration, we call reuseport_migrate_sock() to select a listener with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv(). This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv(). Thus we have to take another refcnt (B) for the newly cloned request_sock. In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and try to put the new req into the accept queue. By migrating req after winning the "own_req" race, we can avoid such a worst situation: CPU 1 looks up req1 CPU 2 looks up req1, unhashes it, then CPU 1 loses the race CPU 3 looks up req2, unhashes it, then CPU 2 loses the race ... 
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-8-kuniyu@amazon.co.jp (cherry picked from commit d4f2c86b2b7e2e606e0868b38c8c6c49cc193a8e) --- net/ipv4/inet_connection_sock.c | 34 ++++++++++++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 20 +++++++++++++------ net/ipv4/tcp_minisocks.c | 4 ++-- net/ipv6/tcp_ipv6.c | 14 +++++++++++--- 4 files changed, 58 insertions(+), 14 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 08685631d5631..4939dd81a6e7e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1135,12 +1135,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { - inet_csk_reqsk_queue_drop(sk, req); - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - if (inet_csk_reqsk_queue_add(sk, req, child)) + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + + if (sk != req->rsk_listener) { + /* another listening sk has been selected, + * migrate the req to it. + */ + struct request_sock *nreq; + + /* hold a refcnt for the nreq->rsk_listener + * which is assigned in inet_reqsk_clone() + */ + sock_hold(sk); + nreq = inet_reqsk_clone(req, sk); + if (!nreq) { + inet_child_forget(sk, req, child); + goto child_put; + } + + refcount_set(&nreq->rsk_refcnt, 1); + if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + reqsk_migrate_reset(req); + reqsk_put(req); + return child; + } + + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; + } } /* Too bad, another child took ownership of the request, undo. */ +child_put: bh_unlock_sock(child); sock_put(child); return NULL; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b40780fde7915..33d89d89baeec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2006,13 +2006,21 @@ int tcp_v4_rcv(struct sk_buff *skb) goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ + sock_hold(sk); } - /* We own a reference on the listener, increase it again - * as we might lose it too soon. 
- */ - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 01e27620b7ee5..43d47046d3700 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -781,8 +781,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, goto listen_overflow; if (own_req && rsk_drop_req(req)) { - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_drop_and_put(sk, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 79d6f6ea3c546..23253155e726a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1663,10 +1663,18 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + sock_hold(sk); } - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { From 3a6aa36e08f63a3da0997ae0fa46c9b7ca900119 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:21 +0900 Subject: [PATCH 143/737] bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT. We will call sock_reuseport.prog for socket migration in the next commit, so the eBPF program has to know which listener is closing to select a new listener. We can currently get a unique ID of each listener in the userspace by calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map. This patch makes the pointer of sk available in sk_reuseport_md so that we can get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program. Suggested-by: Martin KaFai Lau Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f7zc@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-9-kuniyu@amazon.co.jp (cherry picked from commit e061047684af63f2d4f1338ec73140f6e29eb59f) --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 10 ++++++++++ tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 12 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e3582d59c7b4..d83e1d2f05145 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4364,6 +4364,7 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? 
*/ __u32 hash; /* A hash of the packet 4 tuples */ + __bpf_md_ptr(struct bpf_sock *, sk); }; #define BPF_TAG_SIZE 8 diff --git a/net/core/filter.c b/net/core/filter.c index d105b30ecb463..64ab40ed95c79 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10070,6 +10070,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -10099,6 +10101,10 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, hash): return size == size_default; + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10171,6 +10177,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d07ceba1b6b5c..7b7bd15fe78f6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4364,6 +4364,7 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + __bpf_md_ptr(struct bpf_sock *, sk); }; #define BPF_TAG_SIZE 8 From adbcbce21d53bb19eb0e101975925e8817f8b56f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:22 +0900 Subject: [PATCH 144/737] bpf: Support socket migration by eBPF. This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to check if the attached eBPF program is capable of migrating sockets. When the eBPF program is attached, we run it for socket migration if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or net.ipv4.tcp_migrate_req is enabled. Currently, the expected_attach_type is not enforced for the BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the earlier idea in the commit aac3fc320d94 ("bpf: Post-hooks for sys_bind") to fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type(). Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to select a new listener based on the child socket. migrating_sk varies depending on if it is migrating a request in the accept queue or during 3WHS. - accept_queue : sock (ESTABLISHED/SYN_RECV) - 3WHS : request_sock (NEW_SYN_RECV) In the eBPF program, we can select a new listener by BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning SK_DROP. This feature is useful when listeners have different settings at the socket API level or when we want to free resources as soon as possible. - SK_PASS with selected_sk, select it as a new listener - SK_PASS with selected_sk NULL, fallbacks to the random selection - SK_DROP, cancel the migration. There is a noteworthy point. We select a listening socket in three places, but we do not have struct skb at closing a listener or retransmitting a SYN+ACK. On the other hand, some helper functions do not expect skb is NULL (e.g. 
skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer() in BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb temporarily before running the eBPF program. Suggested-by: Martin KaFai Lau Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp (cherry picked from commit d5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e) --- include/linux/bpf.h | 1 + include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/syscall.c | 13 +++++++++++++ net/core/filter.c | 13 ++++++++++++- net/core/sock_reuseport.c | 34 ++++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 15 +++++++++++++++ 7 files changed, 88 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 791e936fc5971..5fdc7ec87961e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1951,6 +1951,7 @@ struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; + struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; diff --git a/include/linux/filter.h b/include/linux/filter.h index bc6ce4b202a80..cce9f97e4ead8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -982,11 +982,13 @@ void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { return NULL; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83e1d2f05145..20a6a9282a571 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -240,6 +240,8 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE }; @@ -4364,7 +4366,20 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. 
+ */ __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aaad2dce2be6f..c8777e574cbaf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1980,6 +1980,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; + case BPF_PROG_TYPE_SK_REUSEPORT: + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_SK_REUSEPORT_SELECT; + break; } } @@ -2056,6 +2061,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; + case BPF_PROG_TYPE_SK_REUSEPORT: + switch (expected_attach_type) { + case BPF_SK_REUSEPORT_SELECT: + case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 64ab40ed95c79..a887814660681 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9942,11 +9942,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; @@ -9955,12 +9957,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; - bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = BPF_PROG_RUN(prog, &reuse_kern); if (action == SK_PASS) @@ -10105,6 +10108,10 @@ sk_reuseport_is_valid_access(int off, int size, info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10181,6 +10188,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; } return insn - insn_buf; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3bdabad5f1dd8..71cdaa3777695 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -393,13 +393,17 @@ void reuseport_stop_listen_sock(struct sock *sk) { if (sk->sk_protocol == IPPROTO_TCP) { struct sock_reuseport *reuse; + struct bpf_prog *prog; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + (prog && 
prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. */ @@ -504,7 +508,7 @@ struct sock *reuseport_select_sock(struct sock *sk, goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); @@ -535,6 +539,8 @@ struct sock *reuseport_migrate_sock(struct sock *sk, { struct sock_reuseport *reuse; struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; u16 socks; u32 hash; @@ -552,10 +558,30 @@ struct sock *reuseport_migrate_sock(struct sock *sk, smp_rmb(); hash = migrating_sk->sk_hash; - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + goto select_by_hash; + goto out; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) nsk = reuseport_select_sock_by_hash(reuse, hash, socks); - if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) nsk = NULL; out: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7b7bd15fe78f6..c38cb426ee15c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -240,6 +240,8 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE }; @@ -4364,7 +4366,20 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 From 799226bc930fc8d349dfd9ecec9fc4921165c0ed Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:23 +0900 Subject: [PATCH 145/737] libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT. This commit introduces a new section (sk_reuseport/migrate) and sets expected_attach_type to two each section in BPF_PROG_TYPE_SK_REUSEPORT program. 
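As an illustration of the new section name (a minimal sketch, not the selftest added in the next patch; the map name, its single slot, and the pass-through policy are assumptions made for this example), a program that only steers migrations could look like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* One slot holding the listener that should inherit migrated children. */
struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} migrate_target SEC(".maps");

SEC("sk_reuseport/migrate")
int prog_migrate(struct sk_reuseport_md *md)
{
	__u32 zero = 0;

	/* migrating_sk is NULL for a plain incoming SYN; only steer
	 * actual migrations here and leave normal selection alone.
	 */
	if (!md->migrating_sk)
		return SK_PASS;

	/* Try the configured target; if the lookup fails, no socket is
	 * selected and SK_PASS falls back to the kernel's hash-based pick.
	 */
	bpf_sk_select_reuseport(md, &migrate_target, &zero, 0);
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";

Loading this object with libbpf after this patch sets expected_attach_type to BPF_SK_REUSEPORT_SELECT_OR_MIGRATE automatically from the "sk_reuseport/migrate" section name.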
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-11-kuniyu@amazon.co.jp (cherry picked from commit 50501271e773c51afe602918915c6beb62ac369f) --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 015ed8253f739..1eeb779e3728c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8321,7 +8321,10 @@ static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), - BPF_PROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT), + BPF_EAPROG_SEC("sk_reuseport/migrate", BPF_PROG_TYPE_SK_REUSEPORT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE), + BPF_EAPROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT, + BPF_SK_REUSEPORT_SELECT), SEC_DEF("kprobe/", KPROBE, .attach_fn = attach_kprobe), BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE), From 67a38b32056cc8518a28635fa71d2b496b93c7b6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:24 +0900 Subject: [PATCH 146/737] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE. This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE and removes 'static' from settimeo() in network_helpers.c. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-12-kuniyu@amazon.co.jp (cherry picked from commit c9d0bdef89a6c943e98c851e8cc10c9c534329e6) --- tools/testing/selftests/bpf/network_helpers.c | 2 +- tools/testing/selftests/bpf/network_helpers.h | 1 + .../bpf/prog_tests/migrate_reuseport.c | 555 ++++++++++++++++++ .../bpf/progs/test_migrate_reuseport.c | 135 +++++ 4 files changed, 692 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c create mode 100644 tools/testing/selftests/bpf/progs/test_migrate_reuseport.c diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 12ee40284da02..2060bc122c530 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; -static int settimeo(int fd, int timeout_ms) +int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 7205f8afdba11..5e0d51c07b632 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -33,6 +33,7 @@ struct ipv6_packet { } __packed; extern struct ipv6_packet pkt_v6; +int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); int connect_to_fd(int server_fd, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c new file mode 100644 index 0000000000000..0fa3f750567de --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check if we can migrate child sockets. + * + * 1. call listen() for 4 server sockets. + * 2. call connect() for 25 client sockets. + * 3. call listen() for 1 server socket. 
(migration target) + * 4. update a map to migrate all child sockets + * to the last server socket (migrate_map[cookie] = 4) + * 5. call shutdown() for first 4 server sockets + * and migrate the requests in the accept queue + * to the last server socket. + * 6. call listen() for the second server socket. + * 7. call shutdown() for the last server + * and migrate the requests in the accept queue + * to the second server socket. + * 8. call listen() for the last server. + * 9. call shutdown() for the second server + * and migrate the requests in the accept queue + * to the last server socket. + * 10. call accept() for the last server socket. + * + * Author: Kuniyuki Iwashima + */ + +#include +#include + +#include "test_progs.h" +#include "test_migrate_reuseport.skel.h" +#include "network_helpers.h" + +#define IFINDEX_LO 1 + +#define NR_SERVERS 5 +#define NR_CLIENTS (NR_SERVERS * 5) +#define MIGRATED_TO (NR_SERVERS - 1) + +/* fastopenq->max_qlen and sk->sk_max_ack_backlog */ +#define QLEN (NR_CLIENTS * 5) + +#define MSG "Hello World\0" +#define MSGLEN 12 + +static struct migrate_reuseport_test_case { + const char *name; + __s64 servers[NR_SERVERS]; + __s64 clients[NR_CLIENTS]; + struct sockaddr_storage addr; + socklen_t addrlen; + int family; + int state; + bool drop_ack; + bool expire_synack_timer; + bool fastopen; + struct bpf_link *link; +} test_cases[] = { + { + .name = "IPv4 TCP_ESTABLISHED inet_csk_listen_stop", + .family = AF_INET, + .state = BPF_TCP_ESTABLISHED, + .drop_ack = false, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv4 TCP_SYN_RECV inet_csk_listen_stop", + .family = AF_INET, + .state = BPF_TCP_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = true, + }, + { + .name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler", + .family = AF_INET, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = true, + .fastopen = false, + }, + { + .name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", + .family = AF_INET, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv6 TCP_ESTABLISHED inet_csk_listen_stop", + .family = AF_INET6, + .state = BPF_TCP_ESTABLISHED, + .drop_ack = false, + .expire_synack_timer = false, + .fastopen = false, + }, + { + .name = "IPv6 TCP_SYN_RECV inet_csk_listen_stop", + .family = AF_INET6, + .state = BPF_TCP_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = true, + }, + { + .name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler", + .family = AF_INET6, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = true, + .fastopen = false, + }, + { + .name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", + .family = AF_INET6, + .state = BPF_TCP_NEW_SYN_RECV, + .drop_ack = true, + .expire_synack_timer = false, + .fastopen = false, + } +}; + +static void init_fds(__s64 fds[], int len) +{ + int i; + + for (i = 0; i < len; i++) + fds[i] = -1; +} + +static void close_fds(__s64 fds[], int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (fds[i] != -1) { + close(fds[i]); + fds[i] = -1; + } + } +} + +static int setup_fastopen(char *buf, int size, int *saved_len, bool restore) +{ + int err = 0, fd, len; + + fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR); + if (!ASSERT_NEQ(fd, -1, "open")) + return -1; + + if (restore) { + len = write(fd, buf, *saved_len); + if (!ASSERT_EQ(len, *saved_len, "write - restore")) + err = -1; + } else { + *saved_len = read(fd, buf, size); 
+ if (!ASSERT_GE(*saved_len, 1, "read")) { + err = -1; + goto close; + } + + err = lseek(fd, 0, SEEK_SET); + if (!ASSERT_OK(err, "lseek")) + goto close; + + /* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE | + * TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD) + */ + len = write(fd, "519", 3); + if (!ASSERT_EQ(len, 3, "write - setup")) + err = -1; + } + +close: + close(fd); + + return err; +} + +static int drop_ack(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + if (test_case->family == AF_INET) + skel->bss->server_port = ((struct sockaddr_in *) + &test_case->addr)->sin_port; + else + skel->bss->server_port = ((struct sockaddr_in6 *) + &test_case->addr)->sin6_port; + + test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack, + IFINDEX_LO); + if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp")) + return -1; + + return 0; +} + +static int pass_ack(struct migrate_reuseport_test_case *test_case) +{ + int err; + + err = bpf_link__detach(test_case->link); + if (!ASSERT_OK(err, "bpf_link__detach")) + return -1; + + test_case->link = NULL; + + return 0; +} + +static int start_servers(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int i, err, prog_fd, reuseport = 1, qlen = QLEN; + + prog_fd = bpf_program__fd(skel->progs.migrate_reuseport); + + make_sockaddr(test_case->family, + test_case->family == AF_INET ? "127.0.0.1" : "::1", 0, + &test_case->addr, &test_case->addrlen); + + for (i = 0; i < NR_SERVERS; i++) { + test_case->servers[i] = socket(test_case->family, SOCK_STREAM, + IPPROTO_TCP); + if (!ASSERT_NEQ(test_case->servers[i], -1, "socket")) + return -1; + + err = setsockopt(test_case->servers[i], SOL_SOCKET, + SO_REUSEPORT, &reuseport, sizeof(reuseport)); + if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT")) + return -1; + + err = bind(test_case->servers[i], + (struct sockaddr *)&test_case->addr, + test_case->addrlen); + if (!ASSERT_OK(err, "bind")) + return -1; + + if (i == 0) { + err = setsockopt(test_case->servers[i], SOL_SOCKET, + SO_ATTACH_REUSEPORT_EBPF, + &prog_fd, sizeof(prog_fd)); + if (!ASSERT_OK(err, + "setsockopt - SO_ATTACH_REUSEPORT_EBPF")) + return -1; + + err = getsockname(test_case->servers[i], + (struct sockaddr *)&test_case->addr, + &test_case->addrlen); + if (!ASSERT_OK(err, "getsockname")) + return -1; + } + + if (test_case->fastopen) { + err = setsockopt(test_case->servers[i], + SOL_TCP, TCP_FASTOPEN, + &qlen, sizeof(qlen)); + if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN")) + return -1; + } + + /* All requests will be tied to the first four listeners */ + if (i != MIGRATED_TO) { + err = listen(test_case->servers[i], qlen); + if (!ASSERT_OK(err, "listen")) + return -1; + } + } + + return 0; +} + +static int start_clients(struct migrate_reuseport_test_case *test_case) +{ + char buf[MSGLEN] = MSG; + int i, err; + + for (i = 0; i < NR_CLIENTS; i++) { + test_case->clients[i] = socket(test_case->family, SOCK_STREAM, + IPPROTO_TCP); + if (!ASSERT_NEQ(test_case->clients[i], -1, "socket")) + return -1; + + /* The attached XDP program drops only the final ACK, so + * clients will transition to TCP_ESTABLISHED immediately. 
+ */ + err = settimeo(test_case->clients[i], 100); + if (!ASSERT_OK(err, "settimeo")) + return -1; + + if (test_case->fastopen) { + int fastopen = 1; + + err = setsockopt(test_case->clients[i], IPPROTO_TCP, + TCP_FASTOPEN_CONNECT, &fastopen, + sizeof(fastopen)); + if (!ASSERT_OK(err, + "setsockopt - TCP_FASTOPEN_CONNECT")) + return -1; + } + + err = connect(test_case->clients[i], + (struct sockaddr *)&test_case->addr, + test_case->addrlen); + if (!ASSERT_OK(err, "connect")) + return -1; + + err = write(test_case->clients[i], buf, MSGLEN); + if (!ASSERT_EQ(err, MSGLEN, "write")) + return -1; + } + + return 0; +} + +static int update_maps(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int i, err, migrated_to = MIGRATED_TO; + int reuseport_map_fd, migrate_map_fd; + __u64 value; + + reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map); + migrate_map_fd = bpf_map__fd(skel->maps.migrate_map); + + for (i = 0; i < NR_SERVERS; i++) { + value = (__u64)test_case->servers[i]; + err = bpf_map_update_elem(reuseport_map_fd, &i, &value, + BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map")) + return -1; + + err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map")) + return -1; + + err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, + BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map")) + return -1; + } + + return 0; +} + +static int migrate_dance(struct migrate_reuseport_test_case *test_case) +{ + int i, err; + + /* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests + * to the last listener based on eBPF. + */ + for (i = 0; i < MIGRATED_TO; i++) { + err = shutdown(test_case->servers[i], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + } + + /* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */ + if (test_case->state == BPF_TCP_NEW_SYN_RECV) + return 0; + + /* Note that we use the second listener instead of the + * first one here. + * + * The fist listener is bind()ed with port 0 and, + * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so + * calling listen() again will bind() the first listener + * on a new ephemeral port and detach it from the existing + * reuseport group. (See: __inet_bind(), tcp_set_state()) + * + * OTOH, the second one is bind()ed with a specific port, + * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will + * resurrect the listener on the existing reuseport group. + */ + err = listen(test_case->servers[1], QLEN); + if (!ASSERT_OK(err, "listen")) + return -1; + + /* Migrate from the last listener to the second one. + * + * All listeners were detached out of the reuseport_map, + * so migration will be done by kernel random pick from here. 
+ */ + err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + + /* Back to the existing reuseport group */ + err = listen(test_case->servers[MIGRATED_TO], QLEN); + if (!ASSERT_OK(err, "listen")) + return -1; + + /* Migrate back to the last one from the second one */ + err = shutdown(test_case->servers[1], SHUT_RDWR); + if (!ASSERT_OK(err, "shutdown")) + return -1; + + return 0; +} + +static void count_requests(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err, cnt = 0, client; + char buf[MSGLEN]; + + err = settimeo(test_case->servers[MIGRATED_TO], 4000); + if (!ASSERT_OK(err, "settimeo")) + goto out; + + for (; cnt < NR_CLIENTS; cnt++) { + client = accept(test_case->servers[MIGRATED_TO], + (struct sockaddr *)&addr, &len); + if (!ASSERT_NEQ(client, -1, "accept")) + goto out; + + memset(buf, 0, MSGLEN); + read(client, &buf, MSGLEN); + close(client); + + if (!ASSERT_STREQ(buf, MSG, "read")) + goto out; + } + +out: + ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace"); + + switch (test_case->state) { + case BPF_TCP_ESTABLISHED: + cnt = skel->bss->migrated_at_close; + break; + case BPF_TCP_SYN_RECV: + cnt = skel->bss->migrated_at_close_fastopen; + break; + case BPF_TCP_NEW_SYN_RECV: + if (test_case->expire_synack_timer) + cnt = skel->bss->migrated_at_send_synack; + else + cnt = skel->bss->migrated_at_recv_ack; + break; + default: + cnt = 0; + } + + ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog"); +} + +static void run_test(struct migrate_reuseport_test_case *test_case, + struct test_migrate_reuseport *skel) +{ + int err, saved_len; + char buf[16]; + + skel->bss->migrated_at_close = 0; + skel->bss->migrated_at_close_fastopen = 0; + skel->bss->migrated_at_send_synack = 0; + skel->bss->migrated_at_recv_ack = 0; + + init_fds(test_case->servers, NR_SERVERS); + init_fds(test_case->clients, NR_CLIENTS); + + if (test_case->fastopen) { + memset(buf, 0, sizeof(buf)); + + err = setup_fastopen(buf, sizeof(buf), &saved_len, false); + if (!ASSERT_OK(err, "setup_fastopen - setup")) + return; + } + + err = start_servers(test_case, skel); + if (!ASSERT_OK(err, "start_servers")) + goto close_servers; + + if (test_case->drop_ack) { + /* Drop the final ACK of the 3-way handshake and stick the + * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV. + */ + err = drop_ack(test_case, skel); + if (!ASSERT_OK(err, "drop_ack")) + goto close_servers; + } + + /* Tie requests to the first four listners */ + err = start_clients(test_case); + if (!ASSERT_OK(err, "start_clients")) + goto close_clients; + + err = listen(test_case->servers[MIGRATED_TO], QLEN); + if (!ASSERT_OK(err, "listen")) + goto close_clients; + + err = update_maps(test_case, skel); + if (!ASSERT_OK(err, "fill_maps")) + goto close_clients; + + /* Migrate the requests in the accept queue only. + * TCP_NEW_SYN_RECV requests are not migrated at this point. + */ + err = migrate_dance(test_case); + if (!ASSERT_OK(err, "migrate_dance")) + goto close_clients; + + if (test_case->expire_synack_timer) { + /* Wait for SYN+ACK timers to expire so that + * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests. 
+ */ + sleep(1); + } + + if (test_case->link) { + /* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */ + err = pass_ack(test_case); + if (!ASSERT_OK(err, "pass_ack")) + goto close_clients; + } + + count_requests(test_case, skel); + +close_clients: + close_fds(test_case->clients, NR_CLIENTS); + + if (test_case->link) { + err = pass_ack(test_case); + ASSERT_OK(err, "pass_ack - clean up"); + } + +close_servers: + close_fds(test_case->servers, NR_SERVERS); + + if (test_case->fastopen) { + err = setup_fastopen(buf, sizeof(buf), &saved_len, true); + ASSERT_OK(err, "setup_fastopen - restore"); + } +} + +void test_migrate_reuseport(void) +{ + struct test_migrate_reuseport *skel; + int i; + + skel = test_migrate_reuseport__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + test__start_subtest(test_cases[i].name); + run_test(&test_cases[i], skel); + } + + test_migrate_reuseport__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c new file mode 100644 index 0000000000000..27df571abf5b5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_migrate_reuseport.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check if we can migrate child sockets. + * + * 1. If reuse_md->migrating_sk is NULL (SYN packet), + * return SK_PASS without selecting a listener. + * 2. If reuse_md->migrating_sk is not NULL (socket migration), + * select a listener (reuseport_map[migrate_map[cookie]]) + * + * Author: Kuniyuki Iwashima + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); + __uint(max_entries, 256); + __type(key, int); + __type(value, __u64); +} reuseport_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 256); + __type(key, __u64); + __type(value, int); +} migrate_map SEC(".maps"); + +int migrated_at_close = 0; +int migrated_at_close_fastopen = 0; +int migrated_at_send_synack = 0; +int migrated_at_recv_ack = 0; +__be16 server_port; + +SEC("xdp") +int drop_ack(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + struct tcphdr *tcp = NULL; + + if (eth + 1 > data_end) + goto pass; + + switch (bpf_ntohs(eth->h_proto)) { + case ETH_P_IP: { + struct iphdr *ip = (struct iphdr *)(eth + 1); + + if (ip + 1 > data_end) + goto pass; + + if (ip->protocol != IPPROTO_TCP) + goto pass; + + tcp = (struct tcphdr *)((void *)ip + ip->ihl * 4); + break; + } + case ETH_P_IPV6: { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)(eth + 1); + + if (ipv6 + 1 > data_end) + goto pass; + + if (ipv6->nexthdr != IPPROTO_TCP) + goto pass; + + tcp = (struct tcphdr *)(ipv6 + 1); + break; + } + default: + goto pass; + } + + if (tcp + 1 > data_end) + goto pass; + + if (tcp->dest != server_port) + goto pass; + + if (!tcp->syn && tcp->ack) + return XDP_DROP; + +pass: + return XDP_PASS; +} + +SEC("sk_reuseport/migrate") +int migrate_reuseport(struct sk_reuseport_md *reuse_md) +{ + int *key, flags = 0, state, err; + __u64 cookie; + + if (!reuse_md->migrating_sk) + return SK_PASS; + + state = reuse_md->migrating_sk->state; + cookie = bpf_get_socket_cookie(reuse_md->sk); + + key = bpf_map_lookup_elem(&migrate_map, &cookie); + if (!key) + return SK_DROP; + + err = bpf_sk_select_reuseport(reuse_md, &reuseport_map, key, flags); + if 
(err) + return SK_PASS; + + switch (state) { + case BPF_TCP_ESTABLISHED: + __sync_fetch_and_add(&migrated_at_close, 1); + break; + case BPF_TCP_SYN_RECV: + __sync_fetch_and_add(&migrated_at_close_fastopen, 1); + break; + case BPF_TCP_NEW_SYN_RECV: + if (!reuse_md->len) + __sync_fetch_and_add(&migrated_at_send_synack, 1); + else + __sync_fetch_and_add(&migrated_at_recv_ack, 1); + break; + } + + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; From 5b3d3dab70255e52081555b35b21247129e32e43 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 23 Jun 2021 08:35:29 +0900 Subject: [PATCH 147/737] tcp: Add stats for socket migration. This commit adds two stats for the socket migration feature to evaluate the effectiveness: LINUX_MIB_TCPMIGRATEREQ(SUCCESS|FAILURE). If the migration fails because of the own_req race in receiving ACK and sending SYN+ACK paths, we do not increment the failure stat. Then another CPU is responsible for the req. Link: https://lore.kernel.org/bpf/CAK6E8=cgFKuGecTzSCSQ8z3YJ_163C0uwO9yRvfDSE7vOe9mJA@mail.gmail.com/ Suggested-by: Yuchung Cheng Signed-off-by: Kuniyuki Iwashima Acked-by: Yuchung Cheng Signed-off-by: David S. Miller (cherry picked from commit 55d444b310c64b084dcc62ba3e4dc3862269fb96) --- include/uapi/linux/snmp.h | 2 ++ net/core/sock_reuseport.c | 15 +++++++++++---- net/ipv4/inet_connection_sock.c | 15 +++++++++++++-- net/ipv4/proc.c | 2 ++ net/ipv4/tcp_minisocks.c | 3 +++ 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f84e7bcad6deb..4a16fbe247596 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -289,6 +289,8 @@ enum LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ + LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ + LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ __LINUX_MIB_MAX }; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 71cdaa3777695..bf4ef0d8ca0d2 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -6,6 +6,7 @@ * selecting the socket index from the array of available sockets. 
*/ +#include #include #include #include @@ -552,7 +553,7 @@ struct sock *reuseport_migrate_sock(struct sock *sk, socks = READ_ONCE(reuse->num_socks); if (unlikely(!socks)) - goto out; + goto failure; /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); @@ -562,13 +563,13 @@ struct sock *reuseport_migrate_sock(struct sock *sk, if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) goto select_by_hash; - goto out; + goto failure; } if (!skb) { skb = alloc_skb(0, GFP_ATOMIC); if (!skb) - goto out; + goto failure; allocated = true; } @@ -581,12 +582,18 @@ struct sock *reuseport_migrate_sock(struct sock *sk, if (!nsk) nsk = reuseport_select_sock_by_hash(reuse, hash, socks); - if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { nsk = NULL; + goto failure; + } out: rcu_read_unlock(); return nsk; + +failure: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + goto out; } EXPORT_SYMBOL(reuseport_migrate_sock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4939dd81a6e7e..1e7a011626ca7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -707,6 +707,8 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req, nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; @@ -881,9 +883,10 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ inet_csk_reqsk_queue_drop(sk_listener, nreq); - goto drop; + goto no_ownership; } + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); @@ -892,17 +895,19 @@ static void reqsk_timer_handler(struct timer_list *t) return; } -drop: /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. 
*/ if (nreq) { + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); +no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } +drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } @@ -1156,11 +1161,13 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { @@ -1209,8 +1216,12 @@ void inet_csk_listen_stop(struct sock *sk) refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 80d13d8f982dc..c1dbc41088a33 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -294,6 +294,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), + SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), + SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 43d47046d3700..6dababd60215c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -792,6 +792,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; From c9189cf524edb448b0a5fef639441465dfe8f81c Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:08 +0100 Subject: [PATCH 148/737] math64.h: Add mul_s64_u64_shr() This function is needed for KVM's nested virtualization. The nested TSC scaling implementation requires multiplying the signed TSC offset with the unsigned TSC multiplier. Note: Backported to 5.10 Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-2-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- include/linux/math64.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/math64.h b/include/linux/math64.h index 66deb1fdc2ef6..302f380b535a7 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -3,6 +3,7 @@ #define _LINUX_MATH64_H #include +#include #include #include @@ -234,6 +235,24 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) #endif +#ifndef mul_s64_u64_shr +static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift) +{ + u64 ret; + + /* + * Extract the sign before the multiplication and put it back + * afterwards if needed. 
+ */ + ret = mul_u64_u64_shr(abs(a), b, shift); + + if (a < 0) + ret = -((s64) ret); + + return ret; +} +#endif /* mul_s64_u64_shr */ + #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { From 9a5fd05c7e8651388b45417fbd8337890ee521de Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:09 +0100 Subject: [PATCH 149/737] KVM: X86: Store L1's TSC scaling ratio in 'struct kvm_vcpu_arch' Store L1's scaling ratio in the kvm_vcpu_arch struct like we already do for L1's TSC offset. This allows for easy save/restore when we enter and then exit the nested guest. Note: Backported to 5.10 Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-3-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 6 ++++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e9f1c820edbf..98bcb121bfefd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -668,7 +668,7 @@ struct kvm_vcpu_arch { } st; u64 l1_tsc_offset; - u64 tsc_offset; + u64 tsc_offset; /* current tsc offset */ u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; @@ -682,7 +682,8 @@ struct kvm_vcpu_arch { u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; u64 msr_ia32_power_ctl; - u64 tsc_scaling_ratio; + u64 l1_tsc_scaling_ratio; + u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2445c61038954..9a00e67da4316 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7622,10 +7622,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, &delta_tsc)) + vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf47392005663..a8abb0caaf23e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2120,6 +2120,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? 
*/ if (!scale) { + vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return 0; } @@ -2146,7 +2147,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return -1; } - vcpu->arch.tsc_scaling_ratio = ratio; + vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio; return 0; } @@ -2158,6 +2159,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ + vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return -1; } @@ -2394,7 +2396,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); adjust_tsc_offset_guest(vcpu, adjustment); From 904337d510333b45a04fd4e544afe5c8a806ed2f Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:10 +0100 Subject: [PATCH 150/737] KVM: X86: Rename kvm_compute_tsc_offset() to kvm_compute_l1_tsc_offset() All existing code uses kvm_compute_tsc_offset() passing L1 TSC values to it. Let's document this by renaming it to kvm_compute_l1_tsc_offset(). Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-4-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a8abb0caaf23e..08aa86968827c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2254,7 +2254,7 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) } EXPORT_SYMBOL_GPL(kvm_scale_tsc); -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; @@ -2298,7 +2298,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -2337,7 +2337,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -3242,7 +3242,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, data); } else { - u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } @@ -4080,7 +4080,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mark_tsc_unstable("KVM discovered backwards TSC"); if (kvm_check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, + u64 offset = kvm_compute_l1_tsc_offset(vcpu, 
vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; From ae74c70e0d6139471f6991096a706c003064a14d Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:11 +0100 Subject: [PATCH 151/737] KVM: X86: Add a ratio parameter to kvm_scale_tsc() Sometimes kvm_scale_tsc() needs to use the current scaling ratio and other times (like when reading the TSC from user space) it needs to use L1's scaling ratio. Have the caller specify this by passing the ratio as a parameter. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-5-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 98bcb121bfefd..3efab8406e75e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1683,7 +1683,7 @@ void kvm_define_user_return_msr(unsigned index, u32 msr); int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08aa86968827c..266c785b33574 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2242,10 +2242,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc) return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) { u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; if (ratio != kvm_default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); @@ -2258,14 +2257,15 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + + kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -2398,7 +2398,8 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -2775,7 +2776,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, @@ -3540,10 +3542,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ - u64 tsc_offset = msr_info->host_initiated ? 
vcpu->arch.l1_tsc_offset : - vcpu->arch.tsc_offset; + u64 offset, ratio; - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; + if (msr_info->host_initiated) { + offset = vcpu->arch.l1_tsc_offset; + ratio = vcpu->arch.l1_tsc_scaling_ratio; + } else { + offset = vcpu->arch.tsc_offset; + ratio = vcpu->arch.tsc_scaling_ratio; + } + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: From 33dfc13156b40684cea795ec9a254a7711dcf448 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:12 +0100 Subject: [PATCH 152/737] KVM: nVMX: Add a TSC multiplier field in VMCS12 This is required for supporting nested TSC scaling. Note: Backported to 5.10 Signed-off-by: Ilias Stamatis Reviewed-by: Jim Mattson Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-6-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c8e51c004f782..989e867e4056f 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -37,6 +37,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), FIELD64(PML_ADDRESS, pml_address), FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(TSC_MULTIPLIER, tsc_multiplier), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 80232daf00ff1..2bfb9a4466b49 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 tsc_multiplier; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(tsc_multiplier, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); From aebdc8e6cf5d8e19a40c61aa28092c52c2ec5b64 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:13 +0100 Subject: [PATCH 153/737] KVM: X86: Add functions for retrieving L2 TSC fields from common code In order to implement as much of the nested TSC scaling logic as possible in common code, we need these vendor callbacks for retrieving the TSC offset and the TSC multiplier that L1 has set for L2. 
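As a rough sketch of where this is heading (assumed helper name; the real code lands in the follow-up patches that add kvm_calc_nested_tsc_offset() and kvm_calc_nested_tsc_multiplier()), common code can then derive the current "02" values without knowing about vmcs12 or the SVM nested control block:

static void sketch_refresh_l2_tsc_state(struct kvm_vcpu *vcpu)
{
	u64 off12, mult12;

	if (!is_guest_mode(vcpu))
		return;

	/* Combine L1 ("01") state with the L1-to-L2 ("12") values that
	 * the new callbacks expose, yielding the "02" values hardware
	 * should use while L2 runs.
	 */
	off12 = kvm_x86_ops.get_l2_tsc_offset(vcpu);
	mult12 = kvm_x86_ops.get_l2_tsc_multiplier(vcpu);

	vcpu->arch.tsc_offset =
		kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset,
					   off12, mult12);
	vcpu->arch.tsc_scaling_ratio =
		kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
					       mult12);
}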
Note: Backported to 5.10 Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-7-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 3 +++ 4 files changed, 42 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3efab8406e75e..1a0ab4a517b21 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1196,6 +1196,8 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); + u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8544bca6b3356..7e310b66f3e5e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1061,6 +1061,18 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.ctl.tsc_offset; +} + +static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_default_tsc_scaling_ratio; +} + static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4294,6 +4306,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .get_l2_tsc_offset = svm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, .write_l1_tsc_offset = svm_write_l1_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9a00e67da4316..de5c2afd2235a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1826,6 +1826,27 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) + return vmcs12->tsc_offset; + + return 0; +} + +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + return vmcs12->tsc_multiplier; + + return kvm_default_tsc_scaling_ratio; +} + static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -7877,6 +7898,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ed4b6da83aa87..70779eaf0736f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -373,6 +373,9 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); + static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; From 
2b8acc9785d3dfcff587f27c347e6b2efcb4a939 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:14 +0100 Subject: [PATCH 154/737] KVM: X86: Add functions that calculate the nested TSC fields When L2 is entered we need to "merge" the TSC multiplier and TSC offset values of 01 and 12 together. The merging is done using the following equations: offset_02 = ((offset_01 * mult_12) >> shift_bits) + offset_12 mult_02 = (mult_01 * mult_12) >> shift_bits Where shift_bits is kvm_tsc_scaling_ratio_frac_bits. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-8-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1a0ab4a517b21..271173063b492 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1687,6 +1687,8 @@ int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 266c785b33574..d1203ecdb58a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2269,6 +2269,31 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) +{ + u64 nested_offset; + + if (l2_multiplier == kvm_default_tsc_scaling_ratio) + nested_offset = l1_offset; + else + nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + nested_offset += l2_offset; + return nested_offset; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); + +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) +{ + if (l2_multiplier != kvm_default_tsc_scaling_ratio) + return mul_u64_u64_shr(l1_multiplier, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + return l1_multiplier; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); + static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vcpu->arch.l1_tsc_offset = offset; From 755cb8eaa34286f1ba915a673ee33543968d7e13 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:15 +0100 Subject: [PATCH 155/737] KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it The write_l1_tsc_offset() callback has a misleading name. It does not set L1's TSC offset, it rather updates the current TSC offset which might be different if a nested guest is executing. Additionally, both the vmx and svm implementations use the same logic for calculating the current TSC before writing it to hardware. Rename the function and move the common logic to the caller. The vmx/svm specific code now merely sets the given offset to the corresponding hardware structure. 
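As a quick sanity check of the math being centralized here (using the helpers from the previous patch): when L1 does not scale L2's TSC, i.e. mult_12 equals the default scaling ratio, the merged values degenerate to

  offset_02 = offset_01 + offset_12
  mult_02   = mult_01

which is the purely additive behaviour the old vmx/svm write_l1_tsc_offset() implementations computed by hand.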
Note: Backported to 5.10 Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-9-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/svm.c | 21 ++++----------------- arch/x86/kvm/vmx/vmx.c | 23 +++-------------------- arch/x86/kvm/x86.c | 24 +++++++++++++++++++++--- 4 files changed, 29 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 271173063b492..9d67d42164767 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1198,8 +1198,7 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - /* Returns actual tsc_offset set in active VMCS */ - u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); /* * Retrieve somewhat arbitrary exit information. Intended to be used diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7e310b66f3e5e..d5d66703c85dc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1073,26 +1073,13 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_default_tsc_scaling_ratio; } -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - - if (is_guest_mode(vcpu)) { - /* Write L1's TSC offset. */ - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = offset; - } - - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - g_tsc_offset, - offset); - - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; + svm->nested.hsave->control.tsc_offset = vcpu->arch.l1_tsc_offset; + svm->vmcb->control.tsc_offset = offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - return svm->vmcb->control.tsc_offset; } static void svm_check_invpcid(struct vcpu_svm *svm) @@ -4308,7 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_l2_tsc_offset = svm_get_l2_tsc_offset, .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, - .write_l1_tsc_offset = svm_write_l1_tsc_offset, + .write_tsc_offset = svm_write_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index de5c2afd2235a..3b802e32752ac 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1847,26 +1847,9 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_default_tsc_scaling_ratio; } -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u64 g_tsc_offset = 0; - - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. 
- */ - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - g_tsc_offset = vmcs12->tsc_offset; - - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vcpu->arch.tsc_offset - g_tsc_offset, - offset); - vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); - return offset + g_tsc_offset; + vmcs_write64(TSC_OFFSET, offset); } /* @@ -7900,7 +7883,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_l2_tsc_offset = vmx_get_l2_tsc_offset, .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + .write_tsc_offset = vmx_write_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d1203ecdb58a9..79bcf8eb820db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2294,10 +2294,28 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) } EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { - vcpu->arch.l1_tsc_offset = offset; - vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.l1_tsc_offset, + l1_offset); + + vcpu->arch.l1_tsc_offset = l1_offset; + + /* + * If we are here because L1 chose not to trap WRMSR to TSC then + * according to the spec this should set L1's TSC (as opposed to + * setting L1's offset for L2). + */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + l1_offset, + kvm_x86_ops.get_l2_tsc_offset(vcpu), + kvm_x86_ops.get_l2_tsc_multiplier(vcpu)); + else + vcpu->arch.tsc_offset = l1_offset; + + kvm_x86_ops.write_tsc_offset(vcpu, vcpu->arch.tsc_offset); } static inline bool kvm_check_tsc_unstable(void) From 9618a180aede6b1edac51ce0855385f0f35a9d0d Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Mon, 7 Jun 2021 11:54:38 +0100 Subject: [PATCH 156/737] KVM: X86: Add vendor callbacks for writing the TSC multiplier Currently vmx_vcpu_load_vmcs() writes the TSC_MULTIPLIER field of the VMCS every time the VMCS is loaded. Instead of doing this, set this field from common code on initialization and whenever the scaling ratio changes. Additionally remove vmx->current_tsc_ratio. This field is redundant as vcpu->arch.tsc_scaling_ratio already tracks the current TSC scaling ratio. The vmx->current_tsc_ratio field is only used for avoiding unnecessary writes but it is no longer needed after removing the code from the VMCS load path. 
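For reference, the value handed to the new callback is a fixed-point ratio with kvm_tsc_scaling_ratio_frac_bits fractional bits. A sketch (assumed helper name, not code from this series) of how a guest/host frequency pair maps to that ratio, roughly mirroring how set_tsc_khz() derives it:

static u64 khz_to_tsc_ratio(u32 guest_tsc_khz, u32 host_tsc_khz, u8 frac_bits)
{
	/* Sketch only: e.g. a guest clocked at half the host TSC
	 * frequency yields 1ULL << (frac_bits - 1); a 1:1 guest yields
	 * 1ULL << frac_bits, the default scaling ratio.
	 */
	return mul_u64_u32_div(1ULL << frac_bits, guest_tsc_khz, host_tsc_khz);
}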
Note: Backported to 5.10 Suggested-by: Sean Christopherson Signed-off-by: Ilias Stamatis Message-Id: <20210607105438.16541-1-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/nested.c | 9 ++++----- arch/x86/kvm/vmx/vmx.c | 11 ++++++----- arch/x86/kvm/vmx/vmx.h | 8 -------- arch/x86/kvm/x86.c | 30 +++++++++++++++++++++++------- 6 files changed, 40 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9d67d42164767..64cd6fb22b325 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1199,6 +1199,7 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); /* * Retrieve somewhat arbitrary exit information. Intended to be used diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d66703c85dc..0c1e265409eba 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1082,6 +1082,11 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); +} + static void svm_check_invpcid(struct vcpu_svm *svm) { /* @@ -4296,6 +4301,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_l2_tsc_offset = svm_get_l2_tsc_offset, .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, .write_tsc_offset = svm_write_tsc_offset, + .write_tsc_multiplier = svm_write_tsc_multiplier, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c165ddbb672fe..72925e1c0827c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2570,9 +2570,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -4577,12 +4576,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); + if (vmx->nested.l1_tpr_threshold != -1) vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3b802e32752ac..c1f433afe83d6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1462,11 +1462,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, vmx->loaded_vmcs->cpu = cpu; } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); } /* @@ -1852,6 +1847,11 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcs_write64(TSC_OFFSET, offset); } +static void vmx_write_tsc_multiplier(struct kvm_vcpu 
*vcpu, u64 multiplier) +{ + vmcs_write64(TSC_MULTIPLIER, multiplier); +} + /* * nested_vmx_allowed() checks whether a guest should be allowed to use VMX * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for @@ -7884,6 +7884,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_l2_tsc_offset = vmx_get_l2_tsc_offset, .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 70779eaf0736f..6dbb13725a11e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -289,8 +289,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - u64 current_tsc_ratio; - unsigned long host_debugctlmsr; /* @@ -494,12 +492,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { return secondary_exec_controls_get(vmx) & diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 79bcf8eb820db..dc03f6ebae306 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2114,14 +2114,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); + static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; /* Guest TSC same frequency as host TSC? */ if (!scale) { - vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return 0; } @@ -2147,7 +2148,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return -1; } - vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, ratio); return 0; } @@ -2159,8 +2160,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return -1; } @@ -2318,6 +2318,23 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) kvm_x86_ops.write_tsc_offset(vcpu, vcpu->arch.tsc_offset); } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) +{ + vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; + + /* Userspace is changing the multiplier while L2 is active */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + l1_multiplier, + kvm_x86_ops.get_l2_tsc_multiplier(vcpu)); + else + vcpu->arch.tsc_scaling_ratio = l1_multiplier; + + if (kvm_has_tsc_control) + kvm_x86_ops.write_tsc_multiplier( + vcpu, vcpu->arch.tsc_scaling_ratio); +} + static inline bool kvm_check_tsc_unstable(void) { #ifdef CONFIG_X86_64 @@ -10254,8 +10271,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); - r = kvm_mmu_create(vcpu); if (r < 0) return r; @@ -10325,6 
+10340,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); + kvm_set_tsc_khz(vcpu, max_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu, false); vcpu_put(vcpu); From 613e8848146348bec3c4abcd208eb0d35bfc65cc Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:17 +0100 Subject: [PATCH 157/737] KVM: nVMX: Enable nested TSC scaling Calculate the TSC offset and multiplier on nested transitions and expose the TSC scaling feature to L1. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-11-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 72925e1c0827c..96e7d3b8b7181 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2308,8 +2308,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_DESC); - + SECONDARY_EXEC_DESC | + SECONDARY_EXEC_TSC_SCALING); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & @@ -2569,6 +2569,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + vmx_get_l2_tsc_offset(vcpu), + vmx_get_l2_tsc_multiplier(vcpu)); + + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + vcpu->arch.l1_tsc_scaling_ratio, + vmx_get_l2_tsc_multiplier(vcpu)); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); @@ -3409,8 +3418,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; @@ -4516,8 +4523,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { + vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + } if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); @@ -6546,7 +6556,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_TSC_SCALING; /* * We can emulate "VMCS shadowing," even if the hardware From 809327c63f6d91dcde0fcb269f970e8e3161046b Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:18 +0100 Subject: [PATCH 158/737] KVM: selftests: x86: Add vmx_nested_tsc_scaling_test Test that nested TSC scaling works as expected with both L1 and L2 
scaled. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-12-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 242 ++++++++++++++++++ 3 files changed, 244 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 7a2c242b7152e..dee90d1a69c8b 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -23,6 +23,7 @@ /x86_64/vmx_dirty_log_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test +/x86_64/vmx_nested_tsc_scaling_test /x86_64/xss_msr_test /clear_dirty_log_test /demand_paging_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 3d14ef77755e5..6c20dcefa5657 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -55,6 +55,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c new file mode 100644 index 0000000000000..280c01fd24126 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_nested_tsc_scaling_test + * + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This test case verifies that nested TSC scaling behaves as expected when + * both L1 and L2 are scaled using different ratios. For this test we scale + * L1 down and scale L2 up. + */ + +#include + +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + + +#define VCPU_ID 0 + +/* L2 is scaled up (from L1's perspective) by this factor */ +#define L2_SCALE_FACTOR 4ULL + +#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) + +#define L2_GUEST_STACK_SIZE 64 + +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) + + +/* + * This function checks whether the "actual" TSC frequency of a guest matches + * its expected frequency. In order to account for delays in taking the TSC + * measurements, a difference of 1% between the actual and the expected value + * is tolerated. 
+ */ +static void compare_tsc_freq(uint64_t actual, uint64_t expected) +{ + uint64_t tolerance, thresh_low, thresh_high; + + tolerance = expected / 100; + thresh_low = expected - tolerance; + thresh_high = expected + tolerance; + + TEST_ASSERT(thresh_low < actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); + TEST_ASSERT(thresh_high > actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); +} + +static void check_tsc_freq(int level) +{ + uint64_t tsc_start, tsc_end, tsc_freq; + + /* + * Reading the TSC twice with about a second's difference should give + * us an approximation of the TSC frequency from the guest's + * perspective. Now, this won't be completely accurate, but it should + * be good enough for the purposes of this test. + */ + tsc_start = rdmsr(MSR_IA32_TSC); + GUEST_SLEEP(1); + tsc_end = rdmsr(MSR_IA32_TSC); + + tsc_freq = tsc_end - tsc_start; + + GUEST_CHECK(level, tsc_freq); +} + +static void l2_guest_code(void) +{ + check_tsc_freq(UCHECK_L2); + + /* exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* prepare the VMCS for L2 execution */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC offsetting and TSC scaling for L2 */ + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_TSC_SCALING; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); + + /* launch L2 */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void tsc_scaling_check_supported(void) +{ + if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) { + print_skip("TSC scaling not supported by the HW"); + exit(KSFT_SKIP); + } +} + +static void stable_tsc_check_supported(void) +{ + FILE *fp; + char buf[4]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp == NULL) + goto skip_test; + + if (fgets(buf, sizeof(buf), fp) == NULL) + goto skip_test; + + if (strncmp(buf, "tsc", sizeof(buf))) + goto skip_test; + + return; +skip_test: + print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable"); + exit(KSFT_SKIP); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + vm_vaddr_t vmx_pages_gva; + + uint64_t tsc_start, tsc_end; + uint64_t tsc_khz; + uint64_t l1_scale_factor; + uint64_t l0_tsc_freq = 0; + uint64_t l1_tsc_freq = 0; + uint64_t l2_tsc_freq = 0; + + nested_vmx_check_supported(); + tsc_scaling_check_supported(); + stable_tsc_check_supported(); + + /* + * We set L1's scale factor to be a random number from 2 to 10. 
+ * Ideally we would do the same for L2's factor but that one is + * referenced by both main() and l1_guest_code() and using a global + * variable does not work. + */ + srand(time(NULL)); + l1_scale_factor = (rand() % 9) + 2; + printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); + printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); + + tsc_start = rdtsc(); + sleep(1); + tsc_end = rdtsc(); + + l0_tsc_freq = tsc_end - tsc_start; + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); + + /* scale down L1's TSC frequency */ + vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ, + (void *) (tsc_khz / l1_scale_factor)); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *) uc.args[0]); + case UCALL_SYNC: + switch (uc.args[0]) { + case USLEEP: + sleep(uc.args[1]); + break; + case UCHECK_L1: + l1_tsc_freq = uc.args[1]; + printf("L1's TSC frequency is around: %"PRIu64 + "\n", l1_tsc_freq); + + compare_tsc_freq(l1_tsc_freq, + l0_tsc_freq / l1_scale_factor); + break; + case UCHECK_L2: + l2_tsc_freq = uc.args[1]; + printf("L2's TSC frequency is around: %"PRIu64 + "\n", l2_tsc_freq); + + compare_tsc_freq(l2_tsc_freq, + l1_tsc_freq * L2_SCALE_FACTOR); + break; + } + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} From 7fa2c70ecddd777ed5d174db4c1fc2c2b5600ae8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 18 Jun 2021 14:46:58 -0700 Subject: [PATCH 159/737] KVM: nVMX: Dynamically compute max VMCS index for vmcs12 Calculate the max VMCS index for vmcs12 by walking the array to find the actual max index. Hardcoding the index is prone to bitrot, and the calculation is only done on KVM bringup (albeit on every CPU, but there aren't _that_ many null entries in the array). Fixes: 3c0f99366e34 ("KVM: nVMX: Add a TSC multiplier field in VMCS12") Signed-off-by: Sean Christopherson Message-Id: <20210618214658.2700765-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 37 +++++++++++++++++++++++++++++++++++-- arch/x86/kvm/vmx/vmcs.h | 8 ++++++++ arch/x86/kvm/vmx/vmcs12.h | 6 ------ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 96e7d3b8b7181..f545e62acc04f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6420,6 +6420,40 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) } } +/* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo + * that madness to get the encoding for comparison. + */ +#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) + +static u64 nested_vmx_calc_vmcs_enum_msr(void) +{ + /* + * Note these are the so called "index" of the VMCS field encoding, not + * the index into vmcs12. 
+ */ + unsigned int max_idx, idx; + int i; + + /* + * For better or worse, KVM allows VMREAD/VMWRITE to all fields in + * vmcs12, regardless of whether or not the associated feature is + * exposed to L1. Simply find the field with the highest index. + */ + max_idx = 0; + for (i = 0; i < nr_vmcs12_fields; i++) { + /* The vmcs12 table is very, very sparsely populated. */ + if (!vmcs_field_to_offset_table[i]) + continue; + + idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); + if (idx > max_idx) + max_idx = idx; + } + + return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; +} + /* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. @@ -6661,8 +6695,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } void nested_vmx_hardware_unsetup(void) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 69c147df957fd..4b0601a82f7fa 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -174,4 +174,12 @@ static inline int vmcs_field_readonly(unsigned long field) return (((field >> 10) & 0x3) == 1); } +#define VMCS_FIELD_INDEX_SHIFT (1) +#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1) + +static inline unsigned int vmcs_field_index(unsigned long field) +{ + return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT; +} + #endif /* __KVM_X86_VMX_VMCS_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 2bfb9a4466b49..d87a6f828a2b1 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -204,12 +204,6 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - /* * For save/restore compatibility, the vmcs12 field offsets must not change. */ From 1000444cf15a0fc543e26670987755ec9df0fe8e Mon Sep 17 00:00:00 2001 From: James Gowans Date: Fri, 17 Sep 2021 00:45:10 +0200 Subject: [PATCH 160/737] Introduce page touching DMA ops binding Allows enabling page touching via a kernel command line parameter. When enabled, devices which don't have an IOMMU assigned to them will be assigned the page touching DMA map ops which ensures that any memory mapped for DMA by that devices will be accessed by the CPU to make it resident. 
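As a boot-time usage sketch (the parameter name is the one added by the module_param in this patch), the page touching ops are only bound when the feature is explicitly enabled on the kernel command line:

    page_touching.dma_page_touching_enable=y

Devices that already have DMA ops assigned, for example via an IOMMU, keep them; only devices left without ops fall back to the page touching implementation.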
Signed-off-by: James Gowans Cc-Team: kaos-brimstone Cc-Team: ec2-memo --- MAINTAINERS | 7 ++ arch/arm64/mm/dma-mapping.c | 6 ++ include/linux/dma-page-touching.h | 39 +++++++++ kernel/dma/Kconfig | 10 +++ kernel/dma/Makefile | 1 + kernel/dma/page_touching.c | 134 ++++++++++++++++++++++++++++++ 6 files changed, 197 insertions(+) create mode 100644 include/linux/dma-page-touching.h create mode 100644 kernel/dma/page_touching.c diff --git a/MAINTAINERS b/MAINTAINERS index 621cdab9fe585..67f55f85d2044 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13184,6 +13184,13 @@ F: include/net/page_pool.h F: include/trace/events/page_pool.h F: net/core/page_pool.c +PAGE TOUCHING DMA +M: James Gowans +L: ec2-memo@amazon.com +S: Supported +F: include/linux/dma-page-touching.h +F: kernel/dma/page_touching.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Harald Welte L: platform-driver-x86@vger.kernel.org diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 93e87b2875567..e0f70ae4d34ed 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -56,4 +57,9 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, if (xen_initial_domain()) dev->dma_ops = &xen_swiotlb_dma_ops; #endif + +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif } diff --git a/include/linux/dma-page-touching.h b/include/linux/dma-page-touching.h new file mode 100644 index 0000000000000..8ff9856e994c9 --- /dev/null +++ b/include/linux/dma-page-touching.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Sets the supplied device's DMA ops to the page toucing DMA ops if + * page touching is enabled and the device does not already have + * DMA ops assigned. 
+ */ +void setup_dma_page_touching_ops(struct device *dev); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a214588..7ee3515d29be8 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,16 @@ config DMA_OPS config DMA_OPS_BYPASS bool +config DMA_PAGE_TOUCHING + bool "Support touching pages when allocated for DMA" + help + Builds in support for binding page touching DMA ops to devices which + don't have an IOMMU. Memory mapped for DMA by those devices will be + access by the CPU via the page touching dma_map_ops to ensure that + the memory is resident when running on a memory overcommit host. + The capacility must still be set up at boot time via the + page_touching.dma_page_touching_enable kernel command line param. + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index dc755ab68aabf..242d75defc736 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o +obj-$(CONFIG_DMA_PAGE_TOUCHING) += page_touching.o diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c new file mode 100644 index 0000000000000..a9bb7901d769e --- /dev/null +++ b/kernel/dma/page_touching.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "direct.h" +#include + +/* + * A wrapper around dma_direct which does a readl on the memory being mapped + * for DMA to ensure that it becomes resident. + * Useful when running in a memory overcommit environment with lazy allocation + * and free page reporting. 
+ */ + +/* + * Set with kernel cmd line param: + * page_touching.dma_page_touching_enable=y + */ +static bool dma_page_touching_enable __ro_after_init; +module_param_named(dma_page_touching_enable, dma_page_touching_enable, bool, 0400); +MODULE_PARM_DESC(dma_page_touching_enable, + "Touch pages allocated for DMA to ensure they are resident"); + +static void touch_each_page(void *start_addr, size_t size) +{ + int addr_offset; + + for (addr_offset = 0; addr_offset < size; addr_offset += PAGE_SIZE) + __raw_readl((char *)start_addr + addr_offset); +} + +static void *page_touching_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + char *kaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + + if (!kaddr) + return NULL; + touch_each_page(kaddr, size); + return kaddr; + +} + +static dma_addr_t page_touching_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_handle = dma_direct_map_page(dev, page, offset, size, dir, attrs); + + if (!(dma_mapping_error(dev, dma_handle))) + touch_each_page(page_to_virt(page) + offset, size); + return dma_handle; +} + +static int page_touching_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i, ret = dma_direct_map_sg(dev, sglist, nents, dir, attrs); + + if (!ret) + goto out; + + for_each_sg(sglist, sg, nents, i) + touch_each_page(page_to_virt(sg_page(sg)) + sg->offset, sg->length); + +out: + return ret; + +} + +/* + * Only a portion of the dma_map_ops interface is implemented here; enough for + * the EC2 ENA / NVMe drivers to work. + * Notibly missing is alloc_pages. 
+ */ +const static struct dma_map_ops page_touching_dma_ops = { + .alloc = page_touching_dma_alloc, + .free = dma_direct_free, + .mmap = dma_common_mmap, + .map_page = page_touching_dma_map_page, + .unmap_page = dma_direct_unmap_page, + .map_sg = page_touching_dma_map_sg, + .unmap_sg = dma_direct_unmap_sg, + .dma_supported = dma_direct_supported, + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, + .max_mapping_size = dma_direct_max_mapping_size, +}; + +void setup_dma_page_touching_ops(struct device *dev) +{ + if (!dma_page_touching_enable || dev->dma_ops) + return; + + dev_info(dev, "binding to page touching DMA ops\n"); + dev->dma_ops = &page_touching_dma_ops; +} From e0b038833fa8c2b91e6ac9eb338308fa322110ea Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Mon, 21 Jun 2021 08:40:46 +0800 Subject: [PATCH 161/737] nitro_enclaves: Set Bus Master for the NE PCI device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable Bus Master for the NE PCI device, according to the PCI spec for submitting memory or I/O requests: Master Enable – Controls the ability of a PCI Express Endpoint to issue Memory and I/O Read/Write Requests, and the ability of a Root or Switch Port to forward Memory and I/O Read/Write Requests in the Upstream direction Cc: Andra Paraschiv Cc: Alexandru Vasile Cc: Alexandru Ciobotaru Reviewed-by: Andra Paraschiv Signed-off-by: Longpeng(Mike) Link: https://lore.kernel.org/r/20210621004046.1419-1-longpeng2@huawei.com Signed-off-by: Greg Kroah-Hartman (cherry picked from commit d874742f6a734c73c22235f9d56b8f10bcf17c5f) --- drivers/virt/nitro_enclaves/ne_pci_dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.c b/drivers/virt/nitro_enclaves/ne_pci_dev.c index b9c1de41e300c..143207e9b9698 100644 --- a/drivers/virt/nitro_enclaves/ne_pci_dev.c +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.c @@ -480,6 +480,8 @@ static int ne_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto free_ne_pci_dev; } + pci_set_master(pdev); + rc = pci_request_regions_exclusive(pdev, "nitro_enclaves"); if (rc < 0) { dev_err(&pdev->dev, "Error in pci request regions [rc=%d]\n", rc); From 3d586c314146ebab24bd8b6164ed8cfdae06a20b Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:24 +0300 Subject: [PATCH 162/737] nitro_enclaves: Enable Arm64 support Update the kernel config to enable the Nitro Enclaves kernel driver for Arm64 support. Signed-off-by: Andra Paraschiv Acked-by: Stefano Garzarella --- drivers/virt/nitro_enclaves/Kconfig | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/virt/nitro_enclaves/Kconfig b/drivers/virt/nitro_enclaves/Kconfig index 8c9387a232df8..f53740b941c0f 100644 --- a/drivers/virt/nitro_enclaves/Kconfig +++ b/drivers/virt/nitro_enclaves/Kconfig @@ -1,17 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 # -# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. # Amazon Nitro Enclaves (NE) support. # Nitro is a hypervisor that has been developed by Amazon. -# TODO: Add dependency for ARM64 once NE is supported on Arm platforms. For now, -# the NE kernel driver can be built for aarch64 arch. 
-# depends on (ARM64 || X86) && HOTPLUG_CPU && PCI && SMP - config NITRO_ENCLAVES tristate "Nitro Enclaves Support" - depends on X86 && HOTPLUG_CPU && PCI && SMP + depends on (ARM64 || X86) && HOTPLUG_CPU && PCI && SMP help This driver consists of support for enclave lifetime management for Nitro Enclaves (NE). From 3939be4ba907046d61b4c27cdaf66ddf389b0b8b Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:25 +0300 Subject: [PATCH 163/737] nitro_enclaves: Update documentation for Arm64 support Add references for hugepages and booting steps for Arm64. Include info about the current supported architectures for the NE kernel driver. Signed-off-by: Andra Paraschiv --- Documentation/virt/ne_overview.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Documentation/virt/ne_overview.rst b/Documentation/virt/ne_overview.rst index 39b0c8fe2654a..74c2f5919c886 100644 --- a/Documentation/virt/ne_overview.rst +++ b/Documentation/virt/ne_overview.rst @@ -14,12 +14,15 @@ instances [1]. For example, an application that processes sensitive data and runs in a VM, can be separated from other applications running in the same VM. This application then runs in a separate VM than the primary VM, namely an enclave. +It runs alongside the VM that spawned it. This setup matches low latency +applications needs. -An enclave runs alongside the VM that spawned it. This setup matches low latency -applications needs. The resources that are allocated for the enclave, such as -memory and CPUs, are carved out of the primary VM. Each enclave is mapped to a -process running in the primary VM, that communicates with the NE driver via an -ioctl interface. +The current supported architectures for the NE kernel driver, available in the +upstream Linux kernel, are x86 and ARM64. + +The resources that are allocated for the enclave, such as memory and CPUs, are +carved out of the primary VM. Each enclave is mapped to a process running in the +primary VM, that communicates with the NE kernel driver via an ioctl interface. In this sense, there are two components: @@ -43,8 +46,8 @@ for the enclave VM. An enclave does not have persistent storage attached. The memory regions carved out of the primary VM and given to an enclave need to be aligned 2 MiB / 1 GiB physically contiguous memory regions (or multiple of this size e.g. 8 MiB). The memory can be allocated e.g. by using hugetlbfs from -user space [2][3]. The memory size for an enclave needs to be at least 64 MiB. -The enclave memory and CPUs need to be from the same NUMA node. +user space [2][3][7]. The memory size for an enclave needs to be at least +64 MiB. The enclave memory and CPUs need to be from the same NUMA node. An enclave runs on dedicated cores. CPU 0 and its CPU siblings need to remain available for the primary VM. A CPU pool has to be set for NE purposes by an @@ -61,7 +64,7 @@ device is placed in memory below the typical 4 GiB. The application that runs in the enclave needs to be packaged in an enclave image together with the OS ( e.g. kernel, ramdisk, init ) that will run in the enclave VM. The enclave VM has its own kernel and follows the standard Linux -boot protocol [6]. +boot protocol [6][8]. The kernel bzImage, the kernel command line, the ramdisk(s) are part of the Enclave Image Format (EIF); plus an EIF header including metadata such as magic @@ -93,3 +96,5 @@ enclave process can exit. 
[4] https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html [5] https://man7.org/linux/man-pages/man7/vsock.7.html [6] https://www.kernel.org/doc/html/latest/x86/boot.html +[7] https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html +[8] https://www.kernel.org/doc/html/latest/arm64/booting.html From b14aac52b6001129ebc26bf2c9fd1efe3eb8988c Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:26 +0300 Subject: [PATCH 164/737] nitro_enclaves: Add fix for the kernel-doc report Fix the reported issue from the kernel-doc script, to have a comment per identifier. Signed-off-by: Andra Paraschiv --- drivers/virt/nitro_enclaves/ne_pci_dev.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.h b/drivers/virt/nitro_enclaves/ne_pci_dev.h index 8bfbc66078185..6e9f28971a4e0 100644 --- a/drivers/virt/nitro_enclaves/ne_pci_dev.h +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ #ifndef _NE_PCI_DEV_H_ @@ -84,9 +84,13 @@ */ /** - * NE_SEND_DATA_SIZE / NE_RECV_DATA_SIZE - 240 bytes for send / recv buffer. + * NE_SEND_DATA_SIZE - Size of the send buffer, in bytes. */ #define NE_SEND_DATA_SIZE (240) + +/** + * NE_RECV_DATA_SIZE - Size of the receive buffer, in bytes. + */ #define NE_RECV_DATA_SIZE (240) /** From 65ba2256e49097d02793c0eb266d3cd12c2c12e1 Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:27 +0300 Subject: [PATCH 165/737] nitro_enclaves: Update copyright statement to include 2021 Update the copyright statement to include 2021, as a change has been made over this year. Check commit d874742f6a73 ("nitro_enclaves: Set Bus Master for the NE PCI device") for the codebase update from this file (ne_pci_dev.c). Signed-off-by: Andra Paraschiv --- drivers/virt/nitro_enclaves/ne_pci_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virt/nitro_enclaves/ne_pci_dev.c b/drivers/virt/nitro_enclaves/ne_pci_dev.c index 143207e9b9698..40b49ec8e30b1 100644 --- a/drivers/virt/nitro_enclaves/ne_pci_dev.c +++ b/drivers/virt/nitro_enclaves/ne_pci_dev.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ /** From 93512ac1c54e827e863170b7fd91e063e18e8a04 Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:28 +0300 Subject: [PATCH 166/737] nitro_enclaves: Add fixes for checkpatch match open parenthesis reports Update the codebase formatting to fix the reports from the checkpatch script, to match the open parenthesis. Signed-off-by: Andra Paraschiv --- drivers/virt/nitro_enclaves/ne_misc_dev.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c index e21e1e86ad15f..8939612ee0e08 100644 --- a/drivers/virt/nitro_enclaves/ne_misc_dev.c +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
*/ /** @@ -284,8 +284,8 @@ static int ne_setup_cpu_pool(const char *ne_cpu_list) ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core; ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores, - sizeof(*ne_cpu_pool.avail_threads_per_core), - GFP_KERNEL); + sizeof(*ne_cpu_pool.avail_threads_per_core), + GFP_KERNEL); if (!ne_cpu_pool.avail_threads_per_core) { rc = -ENOMEM; @@ -735,7 +735,7 @@ static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id) * * Negative return value on failure. */ static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave, - struct ne_user_memory_region mem_region) + struct ne_user_memory_region mem_region) { struct ne_mem_region *ne_mem_region = NULL; @@ -771,7 +771,7 @@ static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave, u64 userspace_addr = ne_mem_region->userspace_addr; if ((userspace_addr <= mem_region.userspace_addr && - mem_region.userspace_addr < (userspace_addr + memory_size)) || + mem_region.userspace_addr < (userspace_addr + memory_size)) || (mem_region.userspace_addr <= userspace_addr && (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) { dev_err_ratelimited(ne_misc_dev.this_device, @@ -836,7 +836,7 @@ static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave, * * Negative return value on failure. */ static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave, - struct ne_user_memory_region mem_region) + struct ne_user_memory_region mem_region) { long gup_rc = 0; unsigned long i = 0; @@ -1014,7 +1014,7 @@ static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave, * * Negative return value on failure. */ static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave, - struct ne_enclave_start_info *enclave_start_info) + struct ne_enclave_start_info *enclave_start_info) { struct ne_pci_dev_cmd_reply cmd_reply = {}; unsigned int cpu = 0; @@ -1574,7 +1574,8 @@ static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_ui mutex_unlock(&ne_cpu_pool.mutex); ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores, - sizeof(*ne_enclave->threads_per_core), GFP_KERNEL); + sizeof(*ne_enclave->threads_per_core), + GFP_KERNEL); if (!ne_enclave->threads_per_core) { rc = -ENOMEM; From a959a5acd745e64d89ebffe54371018c7905fdfc Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:29 +0300 Subject: [PATCH 167/737] nitro_enclaves: Add fixes for checkpatch spell check reports Fix the typos in the words spelling as per the checkpatch script reports. Signed-off-by: Andra Paraschiv --- include/uapi/linux/nitro_enclaves.h | 10 +++++----- samples/nitro_enclaves/ne_ioctl_sample.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/nitro_enclaves.h b/include/uapi/linux/nitro_enclaves.h index b945073fe544d..e808f5ba124d4 100644 --- a/include/uapi/linux/nitro_enclaves.h +++ b/include/uapi/linux/nitro_enclaves.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ #ifndef _UAPI_LINUX_NITRO_ENCLAVES_H_ @@ -60,7 +60,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. 
* On failure, errno is set to: * * EFAULT - copy_from_user() / copy_to_user() failure. @@ -95,7 +95,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. * On failure, errno is set to: * * EFAULT - copy_from_user() / copy_to_user() failure. @@ -118,7 +118,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. * On failure, errno is set to: * * EFAULT - copy_from_user() failure. @@ -161,7 +161,7 @@ * * Context: Process context. * Return: - * * 0 - Logic succesfully completed. + * * 0 - Logic successfully completed. * * -1 - There was a failure in the ioctl logic. * On failure, errno is set to: * * EFAULT - copy_from_user() / copy_to_user() failure. diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c index 480b763142b34..6a60990b2e202 100644 --- a/samples/nitro_enclaves/ne_ioctl_sample.c +++ b/samples/nitro_enclaves/ne_ioctl_sample.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. */ /** @@ -638,7 +638,7 @@ static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *encla } /** - * ne_start_enclave_check_booted() - Start the enclave and wait for a hearbeat + * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat * from it, on a newly created vsock channel, * to check it has booted. * @enclave_fd : The file descriptor associated with the enclave. From dfff9f97bf74462b077308e18c1e2cde579088e6 Mon Sep 17 00:00:00 2001 From: Andra Paraschiv Date: Fri, 27 Aug 2021 18:49:30 +0300 Subject: [PATCH 168/737] nitro_enclaves: Add fixes for checkpatch blank line reports Remove blank lines that are not necessary, fixing the checkpatch script reports. While at it, add a blank line after the switch default block, similar to the other parts of the codebase. Signed-off-by: Andra Paraschiv --- samples/nitro_enclaves/ne_ioctl_sample.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c index 6a60990b2e202..765b131c73190 100644 --- a/samples/nitro_enclaves/ne_ioctl_sample.c +++ b/samples/nitro_enclaves/ne_ioctl_sample.c @@ -185,7 +185,6 @@ static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd) return 0; } - /** * ne_poll_enclave_fd() - Thread function for polling the enclave fd. * @data: Argument provided for the polling function. @@ -560,8 +559,8 @@ static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id) default: printf("Error in add vcpu [%m]\n"); - } + return rc; } From 5ee5ac962bacdaa6909fd30647e5a5d2d7ce3d73 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:28 -0700 Subject: [PATCH 169/737] mm: introduce Data Access MONitor (DAMON) Patch series "Introduce Data Access MONitor (DAMON)", v34. Introduction ============ DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON called 'region based sampling' and 'adaptive regions adjustment' (refer to 'mechanisms.rst' in the 11th patch of this patchset for the detail) make it - accurate (The monitored information is useful for DRAM level memory management. 
It might not appropriate for Cache-level accuracy, though.), - light-weight (The monitoring overhead is low enough to be applied online while making no impact on the performance of the target workloads.), and - scalable (the upper-bound of the instrumentation overhead is controllable regardless of the size of target workloads.). Using this framework, therefore, several memory management mechanisms such as reclamation and THP can be optimized to aware real data access patterns. Experimental access pattern aware memory management optimization works that incurring high instrumentation overhead will be able to have another try. Though DAMON is for kernel subsystems, it can be easily exposed to the user space by writing a DAMON-wrapper kernel subsystem. Then, user space users who have some special workloads will be able to write personalized tools or applications for deeper understanding and specialized optimizations of their systems. DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon The userspace tool[1] is available, released under GPLv2, and actively being maintained. I am also planning to implement another basic user interface in perf[2]. Also, the basic test suite for DAMON is available under GPLv2[3]. [1] https://github.com/awslabs/damo [2] https://lore.kernel.org/linux-mm/20210107120729.22328-1-sjpark@amazon.com/ [3] https://github.com/awslabs/damon-tests Long-term Plan -------------- DAMON is a part of a project called Data Access-aware Operating System (DAOS). As the name implies, I want to improve the performance and efficiency of systems using fine-grained data access patterns. The optimizations are for both kernel and user spaces. I will therefore modify or create kernel subsystems, export some of those to user space and implement user space library / tools. Below shows the layers and components for the project. --------------------------------------------------------------------------- Primitives: PTE Accessed bit, PG_idle, rmap, (Intel CMT), ... Framework: DAMON Features: DAMOS, virtual addr, physical addr, ... Applications: DAMON-debugfs, (DARC), ... ^^^^^^^^^^^^^^^^^^^^^^^ KERNEL SPACE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Raw Interface: debugfs, (sysfs), (damonfs), tracepoints, (sys_damon), ... vvvvvvvvvvvvvvvvvvvvvvv USER SPACE vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv Library: (libdamon), ... Tools: DAMO, (perf), ... --------------------------------------------------------------------------- The components in parentheses or marked as '...' are not implemented yet but in the future plan. IOW, those are the TODO tasks of DAOS project. For more detail, please refer to the plans: https://lore.kernel.org/linux-mm/20201202082731.24828-1-sjpark@amazon.com/ Evaluations =========== We evaluated DAMON's overhead, monitoring quality and usefulness using 24 realistic workloads on my QEMU/KVM based virtual machine running a kernel that v24 DAMON patchset is applied. DAMON is lightweight. It increases system memory usage by 0.39% and slows target workloads down by 1.16%. DAMON is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, namely 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. 
Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of residential sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concepts. Please refer to the official document[1] or "Documentation/admin-guide/mm: Add a document for DAMON" patch in this patchset for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/latest-damon/admin-guide/mm/damon/eval.html Real-world User Story ===================== In summary, DAMON has used on production systems and proved its usefulness. DAMON as a profiler ------------------- We analyzed characteristics of a large scale production systems of our customers using DAMON. The systems utilize 70GB DRAM and 36 CPUs. From this, we were able to find interesting things below. There were obviously different access pattern under idle workload and active workload. Under the idle workload, it accessed large memory regions with low frequency, while the active workload accessed small memory regions with high freuqnecy. DAMON found a 7GB memory region that showing obviously high access frequency under the active workload. We believe this is the performance-effective working set and need to be protected. There was a 4KB memory region that showing highest access frequency under not only active but also idle workloads. We think this must be a hottest code section like thing that should never be paged out. For this analysis, DAMON used only 0.3-1% of single CPU time. Because we used recording-based analysis, it consumed about 3-12 MB of disk space per 20 minutes. This is only small amount of disk space, but we can further reduce the disk usage by using non-recording-based DAMON features. I'd like to argue that only DAMON can do such detailed analysis (finding 4KB highest region in 70GB memory) with the light overhead. DAMON as a system optimization tool ----------------------------------- We also found below potential performance problems on the systems and made DAMON-based solutions. The system doesn't want to make the workload suffer from the page reclamation and thus it utilizes enough DRAM but no swap device. However, we found the system is actively reclaiming file-backed pages, because the system has intensive file IO. The file IO turned out to be not performance critical for the workload, but the customer wanted to ensure performance critical file-backed pages like code section to not mistakenly be evicted. Using direct IO should or `mlock()` would be a straightforward solution, but modifying the user space code is not easy for the customer. Alternatively, we could use DAMON-based operation scheme[1]. By using it, we can ask DAMON to track access frequency of each region and make 'process_madvise(MADV_WILLNEED)[2]' call for regions having specific size and access frequency for a time interval. We also found the system is having high number of TLB misses. We tried 'always' THP enabled policy and it greatly reduced TLB misses, but the page reclamation also been more frequent due to the THP internal fragmentation caused memory bloat. We could try another DAMON-based operation scheme that applies 'MADV_HUGEPAGE' to memory regions having >=2MB size and high access frequency, while applying 'MADV_NOHUGEPAGE' to regions having <2MB size and low access frequency. 
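As a rough userspace illustration of that rule (this is not the DAMON-based operation scheme itself; the region struct and the thresholds below are assumptions made only for the example), the policy boils down to a size/frequency check followed by madvise(2) on the process's own mappings:

    #include <stddef.h>
    #include <sys/mman.h>

    /* Hypothetical per-region monitoring result. */
    struct region {
            void *addr;
            size_t len;
            unsigned int nr_accesses;   /* accesses seen in the last aggregation window */
    };

    /* Apply the THP rule described above; error handling omitted for brevity. */
    static void apply_thp_policy(const struct region *r,
                                 unsigned int hot, unsigned int cold)
    {
            if (r->len >= (2UL << 20) && r->nr_accesses >= hot)
                    madvise(r->addr, r->len, MADV_HUGEPAGE);
            else if (r->len < (2UL << 20) && r->nr_accesses <= cold)
                    madvise(r->addr, r->len, MADV_NOHUGEPAGE);
    }

A DAMON-based operation scheme expresses the same kind of size/frequency rule declaratively and applies it from the kernel side, without modifying the workload.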
We do not own the systems so we only reported the analysis results and possible optimization solutions to the customers. The customers satisfied about the analysis results and promised to try the optimization guides. [1] https://lore.kernel.org/linux-mm/20201006123931.5847-1-sjpark@amazon.com/ [2] https://lore.kernel.org/linux-api/20200622192900.22757-4-minchan@kernel.org/ Comparison with Idle Page Tracking ================================== Idle Page Tracking allows users to set and read idleness of pages using a bitmap file which represents each page with each bit of the file. One recommended usage of it is working set size detection. Users can do that by 1. find PFN of each page for workloads in interest, 2. set all the pages as idle by doing writes to the bitmap file, 3. wait until the workload accesses its working set, and 4. read the idleness of the pages again and count pages became not idle. NOTE: While Idle Page Tracking is for user space users, DAMON is primarily designed for kernel subsystems though it can easily exposed to the user space. Hence, this section only assumes such user space use of DAMON. For what use cases Idle Page Tracking would be better? ------------------------------------------------------ 1. Flexible usecases other than hotness monitoring. Because Idle Page Tracking allows users to control the primitive (Page idleness) by themselves, Idle Page Tracking users can do anything they want. Meanwhile, DAMON is primarily designed to monitor the hotness of each memory region. For this, DAMON asks users to provide sampling interval and aggregation interval. For the reason, there could be some use case that using Idle Page Tracking is simpler. 2. Physical memory monitoring. Idle Page Tracking receives PFN range as input, so natively supports physical memory monitoring. DAMON is designed to be extensible for multiple address spaces and use cases by implementing and using primitives for the given use case. Therefore, by theory, DAMON has no limitation in the type of target address space as long as primitives for the given address space exists. However, the default primitives introduced by this patchset supports only virtual address spaces. Therefore, for physical memory monitoring, you should implement your own primitives and use it, or simply use Idle Page Tracking. Nonetheless, RFC patchsets[1] for the physical memory address space primitives is already available. It also supports user memory same to Idle Page Tracking. [1] https://lore.kernel.org/linux-mm/20200831104730.28970-1-sjpark@amazon.com/ For what use cases DAMON is better? ----------------------------------- 1. Hotness Monitoring. Idle Page Tracking let users know only if a page frame is accessed or not. For hotness check, the user should write more code and use more memory. DAMON do that by itself. 2. Low Monitoring Overhead DAMON receives user's monitoring request with one step and then provide the results. So, roughly speaking, DAMON require only O(1) user/kernel context switches. In case of Idle Page Tracking, however, because the interface receives contiguous page frames, the number of user/kernel context switches increases as the monitoring target becomes complex and huge. As a result, the context switch overhead could be not negligible. Moreover, DAMON is born to handle with the monitoring overhead. 
Because the core mechanism is pure logical, Idle Page Tracking users might be able to implement the mechanism on their own, but it would be time consuming and the user/kernel context switching will still more frequent than that of DAMON. Also, the kernel subsystems cannot use the logic in this case. 3. Page granularity working set size detection. Until v22 of this patchset, this was categorized as the thing Idle Page Tracking could do better, because DAMON basically maintains additional metadata for each of the monitoring target regions. So, in the page granularity working set size detection use case, DAMON would incur (number of monitoring target pages * size of metadata) memory overhead. Size of the single metadata item is about 54 bytes, so assuming 4KB pages, about 1.3% of monitoring target pages will be additionally used. All essential metadata for Idle Page Tracking are embedded in 'struct page' and page table entries. Therefore, in this use case, only one counter variable for working set size accounting is required if Idle Page Tracking is used. There are more details to consider, but roughly speaking, this is true in most cases. However, the situation changed from v23. Now DAMON supports arbitrary types of monitoring targets, which don't use the metadata. Using that, DAMON can do the working set size detection with no additional space overhead but less user-kernel context switch. A first draft for the implementation of monitoring primitives for this usage is available in a DAMON development tree[1]. An RFC patchset for it based on this patchset will also be available soon. Since v24, the arbitrary type support is dropped from this patchset because this patchset doesn't introduce real use of the type. You can still get it from the DAMON development tree[2], though. [1] https://github.com/sjp38/linux/tree/damon/pgidle_hack [2] https://github.com/sjp38/linux/tree/damon/master 4. More future usecases While Idle Page Tracking has tight coupling with base primitives (PG_Idle and page table Accessed bits), DAMON is designed to be extensible for many use cases and address spaces. If you need some special address type or want to use special h/w access check primitives, you can write your own primitives for that and configure DAMON to use those. Therefore, if your use case could be changed a lot in future, using DAMON could be better. Can I use both Idle Page Tracking and DAMON? -------------------------------------------- Yes, though using them concurrently for overlapping memory regions could result in interference to each other. Nevertheless, such use case would be rare or makes no sense at all. Even in the case, the noise would bot be really significant. So, you can choose whatever you want depending on the characteristics of your use cases. More Information ================ We prepared a showcase web site[1] that you can get more information. There are - the official documentations[2], - the heatmap format dynamic access pattern of various realistic workloads for heap area[3], mmap()-ed area[4], and stack[5] area, - the dynamic working set size distribution[6] and chronological working set size changes[7], and - the latest performance test results[8]. 
[1] https://damonitor.github.io/_index [2] https://damonitor.github.io/doc/html/latest-damon [3] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.0.png.html [4] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html [5] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.2.png.html [6] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html [7] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html [8] https://damonitor.github.io/test/result/perf/latest/html/index.html Baseline and Complete Git Trees =============================== The patches are based on the latest -mm tree, specifically v5.14-rc1-mmots-2021-07-15-18-47 of https://github.com/hnaz/linux-mm. You can also clone the complete git tree: $ git clone git://github.com/sjp38/linux -b damon/patches/v34 The web is also available: https://github.com/sjp38/linux/releases/tag/damon/patches/v34 Development Trees ----------------- There are a couple of trees for entire DAMON patchset series and features for future release. - For latest release: https://github.com/sjp38/linux/tree/damon/master - For next release: https://github.com/sjp38/linux/tree/damon/next Long-term Support Trees ----------------------- For people who want to test DAMON but using LTS kernels, there are another couple of trees based on two latest LTS kernels respectively and containing the 'damon/master' backports. - For v5.4.y: https://github.com/sjp38/linux/tree/damon/for-v5.4.y - For v5.10.y: https://github.com/sjp38/linux/tree/damon/for-v5.10.y Amazon Linux Kernel Trees ------------------------- DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon Git Tree for Diff of Patches ============================ For easy review of diff between different versions of each patch, I prepared a git tree containing all versions of the DAMON patchset series: https://github.com/sjp38/damon-patches You can clone it and use 'diff' for easy review of changes between different versions of the patchset. For example: $ git clone https://github.com/sjp38/damon-patches && cd damon-patches $ diff -u damon/v33 damon/v34 Sequence Of Patches =================== First three patches implement the core logics of DAMON. The 1st patch introduces basic sampling based hotness monitoring for arbitrary types of targets. Following two patches implement the core mechanisms for control of overhead and accuracy, namely regions based sampling (patch 2) and adaptive regions adjustment (patch 3). Now the essential parts of DAMON is complete, but it cannot work unless someone provides monitoring primitives for a specific use case. The following two patches make it just work for virtual address spaces monitoring. The 4th patch makes 'PG_idle' can be used by DAMON and the 5th patch implements the virtual memory address space specific monitoring primitives using page table Accessed bits and the 'PG_idle' page flag. Now DAMON just works for virtual address space monitoring via the kernel space api. To let the user space users can use DAMON, following four patches add interfaces for them. The 6th patch adds a tracepoint for monitoring results. The 7th patch implements a DAMON application kernel module, namely damon-dbgfs, that simply wraps DAMON and exposes DAMON interface to the user space via the debugfs interface. 
The 8th patch further exports the pid of the monitoring thread (kdamond) to user space for easier CPU usage accounting, and the 9th patch makes the debugfs interface support multiple contexts.

Three patches for maintainability follow. The 10th patch adds documentation for both the user space and the kernel space. The 11th patch provides unit tests (based on kunit), while the 12th patch adds user space tests (based on kselftest). Finally, the last patch (13th) updates the MAINTAINERS file.

This patch (of 13):

DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON make it

 - accurate (the monitoring output is useful enough for DRAM level performance-centric memory management; it might be inappropriate for CPU cache levels, though),
 - light-weight (the monitoring overhead is normally low enough to be applied online), and
 - scalable (the upper-bound of the overhead is in a constant range regardless of the size of target workloads).

Hence, using this framework, we can easily write efficient kernel space data access monitoring applications. For example, the kernel's memory management mechanisms can make advanced decisions using this. Experimental data access aware optimization works that incurred high access monitoring overhead could be implemented again on top of this.

Due to its simple and flexible interface, providing a user space interface would also be easy. Then, user space users who have some special workloads can write personalized applications for better understanding and optimization of their workloads and systems.

===

Nevertheless, this commit defines and implements only the basic access check part, without the overhead-accuracy handling core logic. The basic access check is as below.

The output of DAMON shows which memory regions are how frequently accessed for a given duration. The resolution of the access frequency is controlled by setting the ``sampling interval`` and the ``aggregation interval``. In detail, DAMON checks access to each page per ``sampling interval`` and aggregates the results. In other words, it counts the number of accesses to each region. After each ``aggregation interval`` passes, DAMON calls the callback functions that users previously registered so that users can read the aggregated results, and then clears the results. This can be described by the simple pseudo-code below::

    init()
    while monitoring_on:
        for page in monitoring_target:
            if accessed(page):
                nr_accesses[page] += 1
        if time() % aggregation_interval == 0:
            for callback in user_registered_callbacks:
                callback(monitoring_target, nr_accesses)
            for page in monitoring_target:
                nr_accesses[page] = 0
        if time() % update_interval == 0:
            update()
        sleep(sampling interval)

The target regions are constructed at the beginning of the monitoring and updated after each ``regions_update_interval``, because the target regions could be dynamically changed (e.g., by mmap() or memory hotplug). The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows.

The basic monitoring primitives for the actual access check and the dynamic target regions construction aren't in the core part of DAMON. Instead, it allows users to implement their own primitives that are optimized for their use case and configure DAMON to use those. In other words, users cannot use the current version of DAMON without some additional work. The following commits will implement the core mechanisms for the overhead-accuracy control and the default primitives implementations.
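As an illustration of the interface added here, a kernel-space user could plug in its own primitives and callbacks roughly as in the following sketch. This is only a minimal, hypothetical example and not part of the patch: the 'example_*' functions are stubs, and a real user would implement actual access checks (the virtual address space implementation arrives in the following commits)::

    #include <linux/damon.h>
    #include <linux/module.h>

    /* Stub primitives; a real user would do actual access checks here. */
    static void example_prepare_access_checks(struct damon_ctx *ctx) { }
    static void example_check_accesses(struct damon_ctx *ctx) { }
    static bool example_target_valid(void *target) { return true; }

    /*
     * Called after every aggregation interval; aggregated results can be
     * read here.  Returning non-zero stops the monitoring.
     */
    static int example_after_aggregation(struct damon_ctx *ctx)
    {
            return 0;
    }

    static struct damon_ctx *example_ctx;

    static int __init example_init(void)
    {
            example_ctx = damon_new_ctx();
            if (!example_ctx)
                    return -ENOMEM;

            example_ctx->primitive.prepare_access_checks =
                    example_prepare_access_checks;
            example_ctx->primitive.check_accesses = example_check_accesses;
            example_ctx->primitive.target_valid = example_target_valid;
            example_ctx->callback.after_aggregation =
                    example_after_aggregation;

            /*
             * 5 ms sampling, 100 ms aggregation, 60 s primitive update
             * (all intervals are in microseconds).
             */
            damon_set_attrs(example_ctx, 5000, 100000, 60 * 1000 * 1000);

            return damon_start(&example_ctx, 1);
    }

    static void __exit example_exit(void)
    {
            damon_stop(&example_ctx, 1);
            damon_destroy_ctx(example_ctx);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");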
Link: https://lkml.kernel.org/r/20210716081449.22187-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-2-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Jonathan Cameron Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Joe Perches Cc: Mel Gorman Cc: Maximilian Heyne Cc: Minchan Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: David Rientjes Cc: Steven Rostedt (VMware) Cc: Shuah Khan Cc: Vlastimil Babka Cc: Vladimir Davydov Cc: Brendan Higgins Cc: Markus Boehme Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 167 ++++++++++++++++++++++ mm/Kconfig | 2 + mm/Makefile | 1 + mm/damon/Kconfig | 15 ++ mm/damon/Makefile | 3 + mm/damon/core.c | 320 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 508 insertions(+) create mode 100644 include/linux/damon.h create mode 100644 mm/damon/Kconfig create mode 100644 mm/damon/Makefile create mode 100644 mm/damon/core.c diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 0000000000000..2f652602b1eaa --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include +#include +#include + +struct damon_ctx; + +/** + * struct damon_primitive Monitoring primitives for given use cases. + * + * @init: Initialize primitive-internal data structures. + * @update: Update primitive-internal data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. + * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level primitives for their target address + * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each + * &damon_ctx.aggr_interval. + * + * @init should initialize primitive-internal data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.target. + * @update should update the primitive-internal data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses. + * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. 
+ */ +struct damon_primitive { + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + void (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + bool (*target_valid)(void *target); + void (*cleanup)(struct damon_ctx *context); +}; + +/* + * struct damon_callback Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. + * @primitive_update_interval: The time between monitoring primitive updates. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @primitive_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_primitive and &struct damon_callback for more + * detail. + * + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_stop: Notifies whether kdamond should stop. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_primitive.target_valid of @primitive. The + * termination can also be explicitly requested by writing non-zero to + * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. + * Therefore, users can know whether the monitoring is ongoing or terminated by + * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from + * outside of the monitoring thread must be protected by @kdamond_lock. 
+ * + * Note that the monitoring thread protects only @kdamond and @kdamond_stop via + * @kdamond_lock. Accesses to other fields must be protected by themselves. + * + * @primitive: Set of monitoring primitives for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @target: Pointer to the user-defined monitoring target. + */ +struct damon_ctx { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long primitive_update_interval; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_primitive_update; + +/* public: */ + struct task_struct *kdamond; + bool kdamond_stop; + struct mutex kdamond_lock; + + struct damon_primitive primitive; + struct damon_callback callback; + + void *target; +}; + +#ifdef CONFIG_DAMON + +struct damon_ctx *damon_new_ctx(void); +void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int); + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 390165ffbb0fc..b97f2e8ab83f5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -859,4 +859,6 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +source "mm/damon/Kconfig" + endmenu diff --git a/mm/Makefile b/mm/Makefile index d73aed0fc99c1..6fd576b8ba8ec 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -112,6 +112,7 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 0000000000000..d00e99ac1a154 --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. 
+ +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 0000000000000..4fd2edb4becfb --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DAMON) := core.o diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 0000000000000..651590bf49b17 --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include +#include +#include +#include + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->sample_interval = 5 * 1000; + ctx->aggr_interval = 100 * 1000; + ctx->primitive_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_primitive_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->target = NULL; + + return ctx; +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + kfree(ctx); +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @sample_int: time interval between samplings + * @aggr_int: time interval between aggregations + * @primitive_upd_int: time interval between monitoring primitive updates + * + * This function should not be called while the kdamond is running. + * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int) +{ + ctx->sample_interval = sample_int; + ctx->aggr_interval = aggr_int; + ctx->primitive_update_interval = primitive_upd_int; + + return 0; +} + +static bool damon_kdamond_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + + return running; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + ctx->kdamond_stop = false; + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = 0; + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If a + * group of threads that created by other 'damon_start()' call is currently + * running, this function does nothing but returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if (nr_running_ctxs) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); + while (damon_kdamond_running(ctx)) + usleep_range(ctx->sample_interval, + ctx->sample_interval * 2); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + return err; + } + + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. + */ +static bool damon_check_reset_time_interval(struct timespec64 *baseline, + unsigned long interval) +{ + struct timespec64 now; + + ktime_get_coarse_ts64(&now); + if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < + interval * 1000) + return false; + *baseline = now; + return true; +} + +/* + * Check whether it is time to flush the aggregated information + */ +static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_aggregation, + ctx->aggr_interval); +} + +/* + * Check whether it is time to check and apply the target monitoring regions + * + * Returns true if it is. + */ +static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_primitive_update, + ctx->primitive_update_interval); +} + +/* + * Check whether current monitoring should be stopped + * + * The monitoring is stopped when either the user requested to stop, or all + * monitoring targets are invalid. + * + * Returns true if need to stop current monitoring. 
+ */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + bool stop; + + mutex_lock(&ctx->kdamond_lock); + stop = ctx->kdamond_stop; + mutex_unlock(&ctx->kdamond_lock); + if (stop) + return true; + + if (!ctx->primitive.target_valid) + return false; + + return !ctx->primitive.target_valid(ctx->target); +} + +static void set_kdamond_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = (struct damon_ctx *)data; + + mutex_lock(&ctx->kdamond_lock); + pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + mutex_unlock(&ctx->kdamond_lock); + + if (ctx->primitive.init) + ctx->primitive.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + set_kdamond_stop(ctx); + + while (!kdamond_need_stop(ctx)) { + if (ctx->primitive.prepare_access_checks) + ctx->primitive.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + set_kdamond_stop(ctx); + + usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + + if (ctx->primitive.check_accesses) + ctx->primitive.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + if (ctx->callback.after_aggregation && + ctx->callback.after_aggregation(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.reset_aggregated) + ctx->primitive.reset_aggregated(ctx); + } + + if (kdamond_need_update_primitive(ctx)) { + if (ctx->primitive.update) + ctx->primitive.update(ctx); + } + } + + if (ctx->callback.before_terminate && + ctx->callback.before_terminate(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + mutex_unlock(&damon_lock); + + do_exit(0); +} From 1e391c8f9a6fb3eed927a480266418fdb66567d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:32 -0700 Subject: [PATCH 170/737] mm/damon/core: implement region-based sampling To avoid the unbounded increase of the overhead, DAMON groups adjacent pages that are assumed to have the same access frequencies into a region. As long as the assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``, 1. the 'prepare_access_checks' primitive picks one page in each region, 2. waits for one ``sampling interval``, 3. checks whether the page is accessed meanwhile, and 4. increases the access count of the region if so. Therefore, the monitoring overhead is controllable by adjusting the number of regions. DAMON allows both the underlying primitives and user callbacks to adjust regions for the trade-off. In other words, this commit makes DAMON to use not only time-based sampling but also space-based sampling. This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. Next commit will address this problem. 
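To make the per-region sampling described above concrete, a primitive built on the region abstraction introduced here could look roughly like the sketch below. It is illustrative only and not part of this patch; 'pick_sampling_addr()', 'clear_accessed()' and 'was_accessed()' are hypothetical stand-ins for an address-space specific page selection and Accessed-bit check::

    /* Arm one sampling address per region before a sampling interval. */
    static void example_prepare_access_checks(struct damon_ctx *ctx)
    {
            struct damon_target *t;
            struct damon_region *r;

            damon_for_each_target(t, ctx) {
                    damon_for_each_region(r, t) {
                            /* one page represents the whole region */
                            r->sampling_addr = pick_sampling_addr(r);
                            clear_accessed(t, r->sampling_addr);
                    }
            }
    }

    /*
     * After the sampling interval, account an access to the whole region
     * if its representative page was accessed meanwhile.
     */
    static void example_check_accesses(struct damon_ctx *ctx)
    {
            struct damon_target *t;
            struct damon_region *r;

            damon_for_each_target(t, ctx) {
                    damon_for_each_region(r, t) {
                            if (was_accessed(t, r->sampling_addr))
                                    r->nr_accesses++;
                    }
            }
    }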
Link: https://lkml.kernel.org/r/20210716081449.22187-3-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 77 ++++++++++++++++++++++- mm/damon/core.c | 143 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2f652602b1eaa..67db309ad61be 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,48 @@ #include #include +/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @id: Unique identifier for this target. + * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @id of each target should be unique among the targets of the context. For + * example, in the virtual address monitoring context, it could be a pidfd or + * an address of an mm_struct. + */ +struct damon_target { + unsigned long id; + struct list_head regions_list; + struct list_head list; +}; + struct damon_ctx; /** @@ -36,7 +78,7 @@ struct damon_ctx; * * @init should initialize primitive-internal data structures. For example, * this could be used to construct proper monitoring target regions and link - * those to @damon_ctx.target. + * those to @damon_ctx.adaptive_targets. * @update should update the primitive-internal data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be @@ -130,7 +172,7 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @target: Pointer to the user-defined monitoring target. + * @region_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -149,11 +191,40 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - void *target; + struct list_head region_targets; }; +#define damon_next_region(r) \ + (container_of(r->list.next, struct damon_region, list)) + +#define damon_prev_region(r) \ + (container_of(r->list.prev, struct damon_region, list)) + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->region_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + #ifdef CONFIG_DAMON +struct damon_region *damon_new_region(unsigned long start, unsigned long end); +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next); +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r); + +struct damon_target *damon_new_target(unsigned long id); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); + struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, diff --git a/mm/damon/core.c b/mm/damon/core.c index 651590bf49b17..947486a150ce2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -15,6 +15,101 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + region = kmalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(®ion->list); + + return region; +} + +/* + * Add a region between two other regions + */ +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next) +{ + __list_add(&r->list, &prev->list, &next->list); +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); +} + +static void damon_del_region(struct damon_region *r) +{ + list_del(&r->list); +} + +static void damon_free_region(struct damon_region *r) +{ + kfree(r); +} + +void damon_destroy_region(struct damon_region *r) +{ + damon_del_region(r); + damon_free_region(r); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(unsigned long id) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->id = id; + INIT_LIST_HEAD(&t->regions_list); + + return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->region_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void 
damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -32,15 +127,27 @@ struct damon_ctx *damon_new_ctx(void) mutex_init(&ctx->kdamond_lock); - ctx->target = NULL; + INIT_LIST_HEAD(&ctx->region_targets); return ctx; } -void damon_destroy_ctx(struct damon_ctx *ctx) +static void damon_destroy_targets(struct damon_ctx *ctx) { - if (ctx->primitive.cleanup) + struct damon_target *t, *next_t; + + if (ctx->primitive.cleanup) { ctx->primitive.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_targets(ctx); kfree(ctx); } @@ -217,6 +324,21 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) ctx->aggr_interval); } +/* + * Reset the aggregated monitoring results ('nr_accesses' of each region). + */ +static void kdamond_reset_aggregated(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + + damon_for_each_region(r, t) + r->nr_accesses = 0; + } +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -238,6 +360,7 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) */ static bool kdamond_need_stop(struct damon_ctx *ctx) { + struct damon_target *t; bool stop; mutex_lock(&ctx->kdamond_lock); @@ -249,7 +372,12 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (!ctx->primitive.target_valid) return false; - return !ctx->primitive.target_valid(ctx->target); + damon_for_each_target(t, ctx) { + if (ctx->primitive.target_valid(t)) + return false; + } + + return true; } static void set_kdamond_stop(struct damon_ctx *ctx) @@ -265,6 +393,8 @@ static void set_kdamond_stop(struct damon_ctx *ctx) static int kdamond_fn(void *data) { struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_target *t; + struct damon_region *r, *next; mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -291,6 +421,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); + kdamond_reset_aggregated(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -300,6 +431,10 @@ static int kdamond_fn(void *data) ctx->primitive.update(ctx); } } + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r); + } if (ctx->callback.before_terminate && ctx->callback.before_terminate(ctx)) From 4d581af6d2fe1e77007b79f276e4e91e8ec443eb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:36 -0700 Subject: [PATCH 171/737] mm/damon: adaptively adjust regions Even somehow the initial monitoring target regions are well constructed to fulfill the assumption (pages in same region have similar access frequencies), the data access pattern can be dynamically changed. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency. For each ``aggregation interval``, it compares the access frequencies of adjacent regions and merges those if the frequency difference is small. 
Then, after it reports and clears the aggregated access frequency of each region, it splits each region into two or three regions if the total number of regions will not exceed the user-specified maximum number of regions after the split. In this way, DAMON provides its best-effort quality and minimal overhead while keeping the upper-bound overhead that users set. Link: https://lkml.kernel.org/r/20210716081449.22187-4-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 30 ++++-- mm/damon/core.c | 224 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 237 insertions(+), 17 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 67db309ad61be..ce2a84b26cd74 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,9 @@ #include #include +/* Minimal region size. Every damon_region is aligned by this. */ +#define DAMON_MIN_REGION PAGE_SIZE + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -39,6 +42,7 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. * @id: Unique identifier for this target. + * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * @@ -50,6 +54,7 @@ struct damon_region { */ struct damon_target { unsigned long id; + unsigned int nr_regions; struct list_head regions_list; struct list_head list; }; @@ -85,6 +90,8 @@ struct damon_ctx; * prepared for the next access check. * @check_accesses should check the accesses to each region that made after the * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. * @target_valid should check whether the target is still valid for the @@ -95,7 +102,7 @@ struct damon_primitive { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); - void (*check_accesses)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); @@ -172,7 +179,9 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @region_targets: Head of monitoring targets (&damon_target) list. + * @min_nr_regions: The minimum number of adaptive monitoring regions. + * @max_nr_regions: The maximum number of adaptive monitoring regions. 
+ * @adaptive_targets: Head of monitoring targets (&damon_target) list. */ struct damon_ctx { unsigned long sample_interval; @@ -191,7 +200,9 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - struct list_head region_targets; + unsigned long min_nr_regions; + unsigned long max_nr_regions; + struct list_head adaptive_targets; }; #define damon_next_region(r) \ @@ -207,28 +218,31 @@ struct damon_ctx { list_for_each_entry_safe(r, next, &t->regions_list, list) #define damon_for_each_target(t, ctx) \ - list_for_each_entry(t, &(ctx)->region_targets, list) + list_for_each_entry(t, &(ctx)->adaptive_targets, list) #define damon_for_each_target_safe(t, next, ctx) \ - list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next); + struct damon_region *prev, struct damon_region *next, + struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t); -void damon_destroy_region(struct damon_region *r); +void damon_destroy_region(struct damon_region *r, struct damon_target *t); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int); + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index 947486a150ce2..28a2c78914faa 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,8 +10,12 @@ #include #include #include +#include #include +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; @@ -40,19 +44,23 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) * Add a region between two other regions */ inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next) + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) { __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; } void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; } -static void damon_del_region(struct damon_region *r) +static void damon_del_region(struct damon_region *r, struct damon_target *t) { list_del(&r->list); + t->nr_regions--; } static void damon_free_region(struct damon_region *r) @@ -60,9 +68,9 @@ static void damon_free_region(struct damon_region *r) kfree(r); } -void damon_destroy_region(struct damon_region *r) +void damon_destroy_region(struct damon_region *r, struct damon_target *t) { - damon_del_region(r); + damon_del_region(r, t); damon_free_region(r); } @@ -80,6 +88,7 @@ struct damon_target *damon_new_target(unsigned long id) return NULL; t->id = 
id; + t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); return t; @@ -87,7 +96,7 @@ struct damon_target *damon_new_target(unsigned long id) void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) { - list_add_tail(&t->list, &ctx->region_targets); + list_add_tail(&t->list, &ctx->adaptive_targets); } static void damon_del_target(struct damon_target *t) @@ -110,6 +119,11 @@ void damon_destroy_target(struct damon_target *t) damon_free_target(t); } +unsigned int damon_nr_regions(struct damon_target *t) +{ + return t->nr_regions; +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -127,7 +141,10 @@ struct damon_ctx *damon_new_ctx(void) mutex_init(&ctx->kdamond_lock); - INIT_LIST_HEAD(&ctx->region_targets); + ctx->min_nr_regions = 10; + ctx->max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets); return ctx; } @@ -157,6 +174,8 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations * @primitive_upd_int: time interval between monitoring primitive updates + * @min_nr_reg: minimal number of regions + * @max_nr_reg: maximum number of regions * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. @@ -164,15 +183,49 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * Return: 0 on success, negative error code otherwise. */ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int) + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg) { + if (min_nr_reg < 3) { + pr_err("min_nr_regions (%lu) must be at least 3\n", + min_nr_reg); + return -EINVAL; + } + if (min_nr_reg > max_nr_reg) { + pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", + min_nr_reg, max_nr_reg); + return -EINVAL; + } + ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; ctx->primitive_update_interval = primitive_upd_int; + ctx->min_nr_regions = min_nr_reg; + ctx->max_nr_regions = max_nr_reg; return 0; } +/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += r->ar.end - r->ar.start; + } + + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + static bool damon_kdamond_running(struct damon_ctx *ctx) { bool running; @@ -339,6 +392,150 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +#define sz_damon_region(r) (r->ar.end - r->ar.start) + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +#define diff_of(a, b) (a > b ? 
a - b : b - a) + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (prev && prev->ar.end == r->ar.start && + diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. + */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = r->ar.end - r->ar.start; + + for (i = 0; i < nr_subs - 1 && + sz_region > 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(ctx, t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. 
+ */ +static void kdamond_split_regions(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_regions = 0; + static unsigned int last_nr_regions; + int nr_subregions = 2; + + damon_for_each_target(t, ctx) + nr_regions += damon_nr_regions(t); + + if (nr_regions > ctx->max_nr_regions / 2) + return; + + /* Maybe the middle of the region has different access frequency */ + if (last_nr_regions == nr_regions && + nr_regions < ctx->max_nr_regions / 3) + nr_subregions = 3; + + damon_for_each_target(t, ctx) + damon_split_regions_of(ctx, t, nr_subregions); + + last_nr_regions = nr_regions; +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -395,6 +592,8 @@ static int kdamond_fn(void *data) struct damon_ctx *ctx = (struct damon_ctx *)data; struct damon_target *t; struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0; mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -405,6 +604,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_start && ctx->callback.before_start(ctx)) set_kdamond_stop(ctx); + sz_limit = damon_region_sz_limit(ctx); + while (!kdamond_need_stop(ctx)) { if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); @@ -415,13 +616,17 @@ static int kdamond_fn(void *data) usleep_range(ctx->sample_interval, ctx->sample_interval + 1); if (ctx->primitive.check_accesses) - ctx->primitive.check_accesses(ctx); + max_nr_accesses = ctx->primitive.check_accesses(ctx); if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -429,11 +634,12 @@ static int kdamond_fn(void *data) if (kdamond_need_update_primitive(ctx)) { if (ctx->primitive.update) ctx->primitive.update(ctx); + sz_limit = damon_region_sz_limit(ctx); } } damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) - damon_destroy_region(r); + damon_destroy_region(r, t); } if (ctx->callback.before_terminate && From 31b1872b2f417830a56aa9b35df2d2df213ec957 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:40 -0700 Subject: [PATCH 172/737] mm/idle_page_tracking: make PG_idle reusable PG_idle and PG_young allow the two PTE Accessed bit users, Idle Page Tracking and the reclaim logic concurrently work while not interfering with each other. That is, when they need to clear the Accessed bit, they set PG_young to represent the previous state of the bit, respectively. And when they need to read the bit, if the bit is cleared, they further read the PG_young to know whether the other has cleared the bit meanwhile or not. For yet another user of the PTE Accessed bit, we could add another page flag, or extend the mechanism to use the flags. For the DAMON usecase, however, we don't need to do that just yet. IDLE_PAGE_TRACKING and DAMON are mutually exclusive, so there's only ever going to be one user of the current set of flags. In this commit, we split out the CONFIG options to allow for the use of PG_young and PG_idle outside of idle page tracking. In the next commit, DAMON's reference implementation of the virtual memory address space monitoring primitives will use it. 
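To illustrate the cooperation protocol described above, an Accessed-bit consumer using the PG_young helpers could behave roughly as below. This is only a sketch of the handshake as described in this message, not code from this patch; the 'consumer_*' functions are hypothetical, and locking and notifier details are omitted::

    #include <linux/pgtable.h>
    #include <linux/page_idle.h>

    /* Clearing side: remember that the bit was set before clearing it. */
    static void consumer_clear_accessed(struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *pte)
    {
            if (ptep_test_and_clear_young(vma, addr, pte))
                    set_page_young(pte_page(*pte));
    }

    /*
     * Reading side: the page counts as accessed if the hardware bit is
     * set again, or if another Accessed-bit user cleared it meanwhile
     * and recorded the previous state in PG_young.
     */
    static bool consumer_was_accessed(struct vm_area_struct *vma,
                                      unsigned long addr, pte_t *pte)
    {
            return pte_young(*pte) ||
                   test_and_clear_page_young(pte_page(*pte));
    }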
[sjpark@amazon.de: set PAGE_EXTENSION for non-64BIT] Link: https://lkml.kernel.org/r/20210806095153.6444-1-sj38.park@gmail.com [akpm@linux-foundation.org: tweak Kconfig text] [sjpark@amazon.de: hide PAGE_IDLE_FLAG from users] Link: https://lkml.kernel.org/r/20210813081238.34705-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-5-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- include/linux/page_ext.h | 2 +- include/linux/page_idle.h | 6 +++--- include/trace/events/mmflags.h | 2 +- mm/Kconfig | 10 +++++++++- mm/page_ext.c | 12 +++++++++++- mm/page_idle.c | 10 ---------- 7 files changed, 27 insertions(+), 19 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba93791121..0f010fc7f1c4d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -132,7 +132,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -437,7 +437,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e0..c9cbc97560116 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdceb..d8a6aecf99cb9 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include #include -#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */ -#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */ static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { } -#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */ #endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f4..ebee94c397a67 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -73,7 +73,7 @@ #define IF_HAVE_PG_HWPOISON(flag,string) #endif 
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) #define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string} #else #define IF_HAVE_PG_IDLE(flag,string) diff --git a/mm/Kconfig b/mm/Kconfig index b97f2e8ab83f5..55460ca97fd8d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -749,10 +749,18 @@ config DEFERRED_STRUCT_PAGE_INIT lifetime of the system until these kthreads finish the initialisation. +config PAGE_IDLE_FLAG + bool + select PAGE_EXTENSION if !64BIT + help + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. + config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU - select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have not been touched during a given period of time. This information can diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e9..f9a6ff65ac0a9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -58,11 +58,21 @@ * can utilize this callback to initialize the state of it correctly. */ +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif }; diff --git a/mm/page_idle.c b/mm/page_idle.c index 057c61df12dba..144fb4ed961d7 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -211,16 +211,6 @@ static const struct attribute_group page_idle_attr_group = { .name = "page_idle", }; -#ifndef CONFIG_64BIT -static bool need_page_idle(void) -{ - return true; -} -struct page_ext_operations page_idle_ops = { - .need = need_page_idle, -}; -#endif - static int __init page_idle_init(void) { int err; From 06ef136005c3d6bb4cccaf8dc39865a3e9d288dd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:44 -0700 Subject: [PATCH 173/737] mm/damon: implement primitives for the virtual memory address spaces This commit introduces a reference implementation of the address space specific low level primitives for the virtual address space, so that users of DAMON can easily monitor the data accesses on virtual address spaces of specific processes by simply configuring the implementation to be used by DAMON. The low level primitives for the fundamental access monitoring are defined in two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. The reference implementation for the virtual address space does the works as below. PTE Accessed-bit Based Access Check ----------------------------------- The implementation uses PTE Accessed-bit for basic access checks. That is, it clears the bit for the next sampling target page and checks whether it is set again after one sampling period. This could disturb the reclaim logic. DAMON uses ``PG_idle`` and ``PG_young`` page flags to solve the conflict, as Idle page tracking does. 
VMA-based Target Address Range Construction ------------------------------------------- Only small parts in the super-huge virtual address space of the processes are mapped to physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some level of noise using the adaptive regions adjustment mechanism, tracking every mapping is not strictly required but could even incur a high overhead in some cases. That said, too huge unmapped areas inside the monitoring target should be removed to not take the time for the adaptive mechanism. For the reason, this implementation converts the complex mappings to three distinct regions that cover every mapped area of the address space. Also, the two gaps between the three regions are the two biggest unmapped areas in the given address space. The two biggest unmapped areas would be the gap between the heap and the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed region and the stack in most of the cases. Because these gaps are exceptionally huge in usual address spaces, excluding these will be sufficient to make a reasonable trade-off. Below shows this in detail:: (small mmap()-ed regions and munmap()-ed regions) [akpm@linux-foundation.org: mm/damon/vaddr.c needs highmem.h for kunmap_atomic()] [sjpark@amazon.de: remove unnecessary PAGE_EXTENSION setup] Link: https://lkml.kernel.org/r/20210806095153.6444-2-sj38.park@gmail.com [sjpark@amazon.de: safely walk page table] Link: https://lkml.kernel.org/r/20210831161800.29419-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-6-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 + mm/damon/Kconfig | 8 + mm/damon/Makefile | 1 + mm/damon/vaddr.c | 665 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 687 insertions(+) create mode 100644 mm/damon/vaddr.c diff --git a/include/linux/damon.h b/include/linux/damon.h index ce2a84b26cd74..edb350e52b934 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -249,4 +249,17 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ +#ifdef CONFIG_DAMON_VADDR + +/* Monitoring primitives for virtual memory address spaces */ +void damon_va_init(struct damon_ctx *ctx); +void damon_va_update(struct damon_ctx *ctx); +void damon_va_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_va_check_accesses(struct damon_ctx *ctx); +bool damon_va_target_valid(void *t); +void damon_va_cleanup(struct damon_ctx *ctx); +void damon_va_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_VADDR */ + #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d00e99ac1a154..5cbb5db541587 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,4 +12,12 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more 
information. +config DAMON_VADDR + bool "Data access monitoring primitives for virtual address spaces" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for virtual address spaces. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 4fd2edb4becfb..6ebbd08aed673 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o +obj-$(CONFIG_DAMON_VADDR) += vaddr.o diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 0000000000000..897aa8cf96c83 --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +/* + * 't->id' should be the pointer to the relevant 'struct pid' having reference + * count. Caller must put the returned task, unless it is NULL. + */ +#define damon_get_task_struct(t) \ + (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. + * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = r->ar.end - r->ar.start; + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +static void swap_ranges(struct damon_addr_range *r1, + struct damon_addr_range *r2) +{ + struct damon_addr_range tmp; + + tmp = *r1; + *r1 = *r2; + *r2 = tmp; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address ranges that results will be saved + * + * This function receives an address space and finds three regions in it which + * separated by the two biggest unmapped regions in the space. Please refer to + * below comments of '__damon_va_init_regions()' function to know why this is + * necessary. 
+ * + * Returns 0 if success, or negative error code otherwise. + */ +static int __damon_va_three_regions(struct vm_area_struct *vma, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; + struct vm_area_struct *last_vma = NULL; + unsigned long start = 0; + struct rb_root rbroot; + + /* Find two biggest gaps so that first_gap > second_gap > others */ + for (; vma; vma = vma->vm_next) { + if (!last_vma) { + start = vma->vm_start; + goto next; + } + + if (vma->rb_subtree_gap <= sz_range(&second_gap)) { + rbroot.rb_node = &vma->vm_rb; + vma = rb_entry(rb_last(&rbroot), + struct vm_area_struct, vm_rb); + goto next; + } + + gap.start = last_vma->vm_end; + gap.end = vma->vm_start; + if (sz_range(&gap) > sz_range(&second_gap)) { + swap_ranges(&gap, &second_gap); + if (sz_range(&second_gap) > sz_range(&first_gap)) + swap_ranges(&second_gap, &first_gap); + } +next: + last_vma = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap_ranges(&first_gap, &second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise. + */ +static int damon_va_three_regions(struct damon_target *t, + struct damon_addr_range regions[3]) +{ + struct mm_struct *mm; + int rc; + + mm = damon_get_mm(t); + if (!mm) + return -EINVAL; + + mmap_read_lock(mm); + rc = __damon_va_three_regions(mm->mmap, regions); + mmap_read_unlock(mm); + + mmput(mm); + return rc; +} + +/* + * Initialize the monitoring target regions for the given target (task) + * + * t the given target + * + * Because only a number of small portions of the entire address space + * is actually mapped to the memory and accessed, monitoring the unmapped + * regions is wasteful. That said, because we can deal with small noises, + * tracking every mapping is not strictly required but could even incur a high + * overhead if the mapping frequently changes or the number of mappings is + * high. The adaptive regions adjustment mechanism will further help to deal + * with the noise by simply identifying the unmapped areas as a region that + * has no access. Moreover, applying the real mappings that would have many + * unmapped areas inside will make the adaptive mechanism quite complex. That + * said, too huge unmapped areas inside the monitoring target should be removed + * to not take the time for the adaptive mechanism. + * + * For the reason, we convert the complex mappings to three distinct regions + * that cover every mapped area of the address space. Also the two gaps + * between the three regions are the two biggest unmapped areas in the given + * address space. In detail, this function first identifies the start and the + * end of the mappings and the two biggest unmapped areas of the address space. 
+ * Then, it constructs the three regions as below: + * + * [mappings[0]->start, big_two_unmapped_areas[0]->start) + * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) + * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) + * + * As usual memory map of processes is as below, the gap between the heap and + * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed + * region and the stack will be two biggest unmapped regions. Because these + * gaps are exceptionally huge areas in usual address space, excluding these + * two biggest unmapped regions will be sufficient to make a trade-off. + * + * + * + * + * (other mmap()-ed regions and small unmapped regions) + * + * + * + */ +static void __damon_va_init_regions(struct damon_ctx *ctx, + struct damon_target *t) +{ + struct damon_region *r; + struct damon_addr_range regions[3]; + unsigned long sz = 0, nr_pieces; + int i; + + if (damon_va_three_regions(t, regions)) { + pr_err("Failed to get three regions of target %lu\n", t->id); + return; + } + + for (i = 0; i < 3; i++) + sz += regions[i].end - regions[i].start; + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + /* Set the initial three regions of the target */ + for (i = 0; i < 3; i++) { + r = damon_new_region(regions[i].start, regions[i].end); + if (!r) { + pr_err("%d'th init region creation failed\n", i); + return; + } + damon_add_region(r, t); + + nr_pieces = (regions[i].end - regions[i].start) / sz; + damon_va_evenly_split_region(t, r, nr_pieces); + } +} + +/* Initialize '->regions_list' of every target (task) */ +void damon_va_init(struct damon_ctx *ctx) +{ + struct damon_target *t; + + damon_for_each_target(t, ctx) { + /* the user may set the target regions as they want */ + if (!damon_nr_regions(t)) + __damon_va_init_regions(ctx, t); + } +} + +/* + * Functions for the dynamic monitoring target regions update + */ + +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. 
+ */ +static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + struct damon_region *r, *next; + unsigned int i = 0; + + /* Remove regions which are not in the three big regions now */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < 3; i++) { + if (damon_intersect(r, &bregions[i])) + break; + } + if (i == 3) + damon_destroy_region(r, t); + } + + /* Adjust intersecting regions to fit with the three big regions */ + for (i = 0; i < 3; i++) { + struct damon_region *first = NULL, *last; + struct damon_region *newr; + struct damon_addr_range *br; + + br = &bregions[i]; + /* Get the first and last regions which intersects with br */ + damon_for_each_region(r, t) { + if (damon_intersect(r, br)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= br->end) + break; + } + if (!first) { + /* no damon_region intersects with this big region */ + newr = damon_new_region( + ALIGN_DOWN(br->start, + DAMON_MIN_REGION), + ALIGN(br->end, DAMON_MIN_REGION)); + if (!newr) + continue; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + first->ar.start = ALIGN_DOWN(br->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + } + } +} + +/* + * Update regions for current memory mappings + */ +void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if (damon_va_three_regions(t, three_regions)) + continue; + damon_va_apply_three_regions(t, three_regions); + } +} + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. 
+ */ +static struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, + unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, + unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_huge(*pmd)) { + damon_pmdp_mkold(pmd, walk->mm, addr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + damon_ptep_mkold(pte, walk->mm, addr); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, +}; + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); + mmap_read_unlock(mm); +} + +/* + * Functions for the access checking of the regions + */ + +static void damon_va_prepare_access_check(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + damon_va_prepare_access_check(ctx, mm, r); + mmput(mm); + } +} + +struct damon_young_walk_private { + unsigned long *page_sz; + bool young; +}; + +static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + struct page *page; + struct damon_young_walk_private *priv = walk->private; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_huge(*pmd)) { + spin_unlock(ptl); + goto regular_page; + } + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto huge_out; + if (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, + addr)) { + *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + 
priv->young = true; + } + put_page(page); +huge_out: + spin_unlock(ptl); + return 0; + } + +regular_page: +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return -EINVAL; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + page = damon_get_page(pte_pfn(*pte)); + if (!page) + goto out; + if (pte_young(*pte) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = PAGE_SIZE; + priv->young = true; + } + put_page(page); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, +}; + +static bool damon_va_young(struct mm_struct *mm, unsigned long addr, + unsigned long *page_sz) +{ + struct damon_young_walk_private arg = { + .page_sz = page_sz, + .young = false, + }; + + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); + mmap_read_unlock(mm); + return arg.young; +} + +/* + * Check whether the region was accessed after the last preparation + * + * mm 'mm_struct' for the given virtual address space + * r the region to be checked + */ +static void damon_va_check_access(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + static struct mm_struct *last_mm; + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_mm = mm; + last_addr = r->sampling_addr; +} + +unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) { + damon_va_check_access(ctx, mm, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +bool damon_va_target_valid(void *target) +{ + struct damon_target *t = target; + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +void damon_va_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = damon_va_init; + ctx->primitive.update = damon_va_update; + ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; + ctx->primitive.check_accesses = damon_va_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_va_target_valid; + ctx->primitive.cleanup = NULL; +} From ff066b2165f1b357b682549180631c956508e7f0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:48 -0700 Subject: [PATCH 174/737] mm/damon: add a tracepoint This commit adds a tracepoint for DAMON. It traces the monitoring results of each region for each aggregation interval. Using this, DAMON can easily integrated with tracepoints supporting tools such as perf. 
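For example, assuming tracefs is mounted at its usual location, the new
event can be used like any other tracepoint (the paths and commands
below are illustrative)::

    # echo 1 > /sys/kernel/debug/tracing/events/damon/damon_aggregated/enable
    # cat /sys/kernel/debug/tracing/trace_pipe
    # perf record -e damon:damon_aggregated -a sleep 5
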
Link: https://lkml.kernel.org/r/20210716081449.22187-7-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/damon.h | 43 ++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 7 +++++- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/damon.h diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h new file mode 100644 index 0000000000000..2f422f4f1fb9e --- /dev/null +++ b/include/trace/events/damon.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM damon + +#if !defined(_TRACE_DAMON_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DAMON_H + +#include +#include +#include + +TRACE_EVENT(damon_aggregated, + + TP_PROTO(struct damon_target *t, struct damon_region *r, + unsigned int nr_regions), + + TP_ARGS(t, r, nr_regions), + + TP_STRUCT__entry( + __field(unsigned long, target_id) + __field(unsigned int, nr_regions) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_accesses) + ), + + TP_fast_assign( + __entry->target_id = t->id; + __entry->nr_regions = nr_regions; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_accesses = r->nr_accesses; + ), + + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + __entry->target_id, __entry->nr_regions, + __entry->start, __entry->end, __entry->nr_accesses) +); + +#endif /* _TRACE_DAMON_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/damon/core.c b/mm/damon/core.c index 28a2c78914faa..ee24d64e8019e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -13,6 +13,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l)) @@ -387,8 +390,10 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_target(t, c) { struct damon_region *r; - damon_for_each_region(r, t) + damon_for_each_region(r, t) { + trace_damon_aggregated(t, r, damon_nr_regions(t)); r->nr_accesses = 0; + } } } From 5aa369e48664ed2ac8f8a3fc5fea223793159a16 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:53 -0700 Subject: [PATCH 175/737] mm/damon: implement a debugfs-based user space interface DAMON is designed to be used by kernel space code such as the memory management subsystems, and therefore it provides only kernel space API. That said, letting the user space control DAMON could provide some benefits to them. For example, it will allow user space to analyze their specific workloads and make their own special optimizations. For such cases, this commit implements a simple DAMON application kernel module, namely 'damon-dbgfs', which merely wraps the DAMON api and exports those to the user space via the debugfs. 
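For reference, the in-kernel API being wrapped is used roughly as below
(a sketch only: error handling is omitted and 'target_pid' is an
illustrative variable)::

    struct damon_ctx *ctx = damon_new_ctx();
    unsigned long id = (unsigned long)find_get_pid(target_pid);

    damon_va_set_primitives(ctx);
    damon_set_attrs(ctx, 5000, 100000, 1000000, 10, 1000);
    damon_set_targets(ctx, &id, 1);
    damon_start(&ctx, 1);
    /* ... monitoring runs in a kdamond thread ... */
    damon_stop(&ctx, 1);
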
'damon-dbgfs' exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under its debugfs directory, ``/damon/``. Attributes ---------- Users can read and write the ``sampling interval``, ``aggregation interval``, ``regions update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10, 1000 and check it again:: # cd /damon # echo 5000 100000 1000000 10 1000 > attrs # cat attrs 5000 100000 1000000 10 1000 Target IDs ---------- Some types of address spaces supports multiple monitoring target. For example, the virtual memory address spaces monitoring can have multiple processes as the monitoring targets. Users can set the targets by writing relevant id values of the targets to, and get the ids of the current targets by reading from the ``target_ids`` file. In case of the virtual address spaces monitoring, the values should be pids of the monitoring target processes. For example, below commands set processes having pids 42 and 4242 as the monitoring targets and check it again:: # cd /damon # echo 42 4242 > target_ids # cat target_ids 42 4242 Note that setting the target ids doesn't start the monitoring. Turning On/Off -------------- Setting the files as described above doesn't incur effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on`` file. Writing ``on`` to the file starts the monitoring of the targets with the attributes. Writing ``off`` to the file stops those. DAMON also stops if every targets are invalidated (in case of the virtual memory monitoring, target processes are invalidated when terminated). Below example commands turn on, off, and check the status of DAMON:: # cd /damon # echo on > monitor_on # echo off > monitor_on # cat monitor_on off Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. 
[akpm@linux-foundation.org: remove unneeded "alloc failed" printks] [akpm@linux-foundation.org: replace macro with static inline] Link: https://lkml.kernel.org/r/20210716081449.22187-8-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 3 + mm/damon/Kconfig | 9 + mm/damon/Makefile | 1 + mm/damon/core.c | 47 +++++ mm/damon/dbgfs.c | 397 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 457 insertions(+) create mode 100644 mm/damon/dbgfs.c diff --git a/include/linux/damon.h b/include/linux/damon.h index edb350e52b934..d68b67b8d458d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -240,9 +240,12 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5cbb5db541587..c8e3dba6fb4cf 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -20,4 +20,13 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces. +config DAMON_DBGFS + bool "DAMON debugfs interface" + depends on DAMON_VADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 6ebbd08aed673..fed4be3bace3e 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/core.c b/mm/damon/core.c index ee24d64e8019e..59033488402e8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -171,6 +171,39 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +/** + * damon_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_destroy_targets(ctx); + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + pr_err("Failed to alloc damon_target\n"); + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -209,6 +242,20 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_nr_running_ctxs() - Return number of currently running contexts. + */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + /* Returns the size upper limit for each monitoring region */ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) { diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 0000000000000..d2e0a547eb3f4 --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include +#include +#include +#include +#include +#include +#include + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; + +/* + * Returns non-empty string on success, negative error code otherwise. + */ +static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + + /* We do not accept continuous write */ + if (*ppos) + return ERR_PTR(-EINVAL); + + kbuf = kmalloc(count + 1, GFP_KERNEL); + if (!kbuf) + return ERR_PTR(-ENOMEM); + + ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); + if (ret != count) { + kfree(kbuf); + return ERR_PTR(-EIO); + } + kbuf[ret] = '\0'; + + return kbuf; +} + +static ssize_t dbgfs_attrs_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char kbuf[128]; + int ret; + + mutex_lock(&ctx->kdamond_lock); + ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", + ctx->sample_interval, ctx->aggr_interval, + ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->max_nr_regions); + mutex_unlock(&ctx->kdamond_lock); + + return simple_read_from_buffer(buf, count, ppos, kbuf, ret); +} + +static ssize_t dbgfs_attrs_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + unsigned long s, a, r, minr, maxr; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (sscanf(kbuf, "%lu %lu %lu %lu %lu", + &s, &a, &r, &minr, &maxr) != 5) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (err) + ret = err; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +out: + kfree(kbuf); + return ret; +} + +static inline bool targetid_is_pid(const struct damon_ctx *ctx) +{ + return ctx->primitive.target_valid == damon_va_target_valid; +} + +static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) +{ + struct damon_target *t; + unsigned 
long id; + int written = 0; + int rc; + + damon_for_each_target(t, ctx) { + id = t->id; + if (targetid_is_pid(ctx)) + /* Show pid numbers to debugfs users */ + id = (unsigned long)pid_vnr((struct pid *)id); + + rc = scnprintf(&buf[written], len - written, "%lu ", id); + if (!rc) + return -ENOMEM; + written += rc; + } + if (written) + written -= 1; + written += scnprintf(&buf[written], len - written, "\n"); + return written; +} + +static ssize_t dbgfs_target_ids_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + ssize_t len; + char ids_buf[320]; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_target_ids(ctx, ids_buf, 320); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + return len; + + return simple_read_from_buffer(buf, count, ppos, ids_buf, len); +} + +/* + * Converts a string into an array of unsigned long integers + * + * Returns an array of unsigned long integers if the conversion success, or + * NULL otherwise. + */ +static unsigned long *str_to_target_ids(const char *str, ssize_t len, + ssize_t *nr_ids) +{ + unsigned long *ids; + const int max_nr_ids = 32; + unsigned long id; + int pos = 0, parsed, ret; + + *nr_ids = 0; + ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); + if (!ids) + return NULL; + while (*nr_ids < max_nr_ids && pos < len) { + ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + pos += parsed; + if (ret != 1) + break; + ids[*nr_ids] = id; + *nr_ids += 1; + } + + return ids; +} + +static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +{ + int i; + + for (i = 0; i < nr_ids; i++) + put_pid((struct pid *)ids[i]); +} + +static ssize_t dbgfs_target_ids_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf, *nrs; + unsigned long *targets; + ssize_t nr_targets; + ssize_t ret = count; + int i; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + nrs = kbuf; + + targets = str_to_target_ids(nrs, ret, &nr_targets); + if (!targets) { + ret = -ENOMEM; + goto out; + } + + if (targetid_is_pid(ctx)) { + for (i = 0; i < nr_targets; i++) { + targets[i] = (unsigned long)find_get_pid( + (int)targets[i]); + if (!targets[i]) { + dbgfs_put_pids(targets, i); + ret = -EINVAL; + goto free_targets_out; + } + } + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_targets(ctx, targets, nr_targets); + if (err) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = err; + } + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +free_targets_out: + kfree(targets); +out: + kfree(kbuf); + return ret; +} + +static int damon_dbgfs_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + + return nonseekable_open(inode, file); +} + +static const struct file_operations attrs_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_attrs_read, + .write = dbgfs_attrs_write, +}; + +static const struct file_operations target_ids_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_target_ids_read, + .write = dbgfs_target_ids_write, +}; + +static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) +{ + const char * const file_names[] = {"attrs", "target_ids"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + int i; + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + 
debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); +} + +static int dbgfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!targetid_is_pid(ctx)) + return 0; + + damon_for_each_target_safe(t, next, ctx) { + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + return 0; +} + +static struct damon_ctx *dbgfs_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + + damon_va_set_primitives(ctx); + ctx->callback.before_terminate = dbgfs_before_terminate; + return ctx; +} + +static ssize_t dbgfs_monitor_on_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char monitor_on_buf[5]; + bool monitor_on = damon_nr_running_ctxs() != 0; + int len; + + len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n"); + + return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); +} + +static ssize_t dbgfs_monitor_on_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t ret = count; + char *kbuf; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Remove white space */ + if (sscanf(kbuf, "%s", kbuf) != 1) { + kfree(kbuf); + return -EINVAL; + } + + if (!strncmp(kbuf, "on", count)) + err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + else if (!strncmp(kbuf, "off", count)) + err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + else + err = -EINVAL; + + if (err) + ret = err; + kfree(kbuf); + return ret; +} + +static const struct file_operations monitor_on_fops = { + .read = dbgfs_monitor_on_read, + .write = dbgfs_monitor_on_write, +}; + +static int __init __damon_dbgfs_init(void) +{ + struct dentry *dbgfs_root; + const char * const file_names[] = {"monitor_on"}; + const struct file_operations *fops[] = {&monitor_on_fops}; + int i; + + dbgfs_root = debugfs_create_dir("damon", NULL); + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, + fops[i]); + dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); + + dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + if (!dbgfs_dirs) { + debugfs_remove(dbgfs_root); + return -ENOMEM; + } + dbgfs_dirs[0] = dbgfs_root; + + return 0; +} + +/* + * Functions for the initialization + */ + +static int __init damon_dbgfs_init(void) +{ + int rc; + + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) + return -ENOMEM; + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); + return -ENOMEM; + } + dbgfs_nr_ctxs = 1; + + rc = __damon_dbgfs_init(); + if (rc) { + kfree(dbgfs_ctxs[0]); + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs init failed\n", __func__); + } + + return rc; +} + +module_init(damon_dbgfs_init); From c199cab8e7e829e1f06a755fafa361f4cfeb590b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:57 -0700 Subject: [PATCH 176/737] mm/damon/dbgfs: export kdamond pid to the user space For CPU usage accounting, knowing pid of the monitoring thread could be helpful. For example, users could use cpuaccount cgroups with the pid. This commit therefore exports the pid of currently running monitoring thread to the user space via 'kdamond_pid' file in the debugfs directory. 
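For example, while the monitoring is turned on, and assuming debugfs is
mounted at /sys/kernel/debug with a cgroup v1 cpuacct hierarchy
available (the paths below are illustrative)::

    # mkdir /sys/fs/cgroup/cpuacct/damon
    # cat /sys/kernel/debug/damon/kdamond_pid > /sys/fs/cgroup/cpuacct/damon/tasks
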
Link: https://lkml.kernel.org/r/20210716081449.22187-9-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index d2e0a547eb3f4..e850be4077f55 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -239,6 +239,32 @@ static ssize_t dbgfs_target_ids_write(struct file *file, return ret; } +static ssize_t dbgfs_kdamond_pid_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); + else + len = scnprintf(kbuf, count, "none\n"); + mutex_unlock(&ctx->kdamond_lock); + if (!len) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + static int damon_dbgfs_open(struct inode *inode, struct file *file) { file->private_data = inode->i_private; @@ -258,10 +284,17 @@ static const struct file_operations target_ids_fops = { .write = dbgfs_target_ids_write, }; +static const struct file_operations kdamond_pid_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_kdamond_pid_read, +}; + static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + const char * const file_names[] = {"attrs", "target_ids", + "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, + &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) From d3080038d070e3c8cf2b4c9f22e6556b1f64b002 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:57:01 -0700 Subject: [PATCH 177/737] mm/damon/dbgfs: support multiple contexts In some use cases, users would want to run multiple monitoring context. For example, if a user wants a high precision monitoring and dedicating multiple CPUs for the job is ok, because DAMON creates one monitoring thread per one context, the user can split the monitoring target regions into multiple small regions and create one context for each region. Or, someone might want to simultaneously monitor different address spaces, e.g., both virtual address space and physical address space. The DAMON's API allows such usage, but 'damon-dbgfs' does not. Therefore, only kernel space DAMON users can do multiple contexts monitoring. This commit allows the user space DAMON users to use multiple contexts monitoring by introducing two new 'damon-dbgfs' debugfs files, 'mk_context' and 'rm_context'. Users can create a new monitoring context by writing the desired name of the new context to 'mk_context'. 
Then, a new directory with the name and having the files for setting of the context ('attrs', 'target_ids' and 'record') will be created under the debugfs directory. Writing the name of the context to remove to 'rm_context' will remove the related context and directory. Link: https://lkml.kernel.org/r/20210716081449.22187-10-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index e850be4077f55..31ad550ecba2d 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -18,6 +18,7 @@ static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; +static DEFINE_MUTEX(damon_dbgfs_lock); /* * Returns non-empty string on success, negative error code otherwise. @@ -328,6 +329,186 @@ static struct damon_ctx *dbgfs_new_ctx(void) return ctx; } +static void dbgfs_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_ctx(ctx); +} + +/* + * Make a context of @name and create a debugfs directory for it. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Returns 0 on success, negative error code otherwise. + */ +static int dbgfs_mk_context(char *name) +{ + struct dentry *root, **new_dirs, *new_dir; + struct damon_ctx **new_ctxs, *new_ctx; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_ctxs) + return -ENOMEM; + dbgfs_ctxs = new_ctxs; + + new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + dbgfs_dirs = new_dirs; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + new_dir = debugfs_create_dir(name, root); + dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; + + new_ctx = dbgfs_new_ctx(); + if (!new_ctx) { + debugfs_remove(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = NULL; + return -ENOMEM; + } + + dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; + dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], + dbgfs_ctxs[dbgfs_nr_ctxs]); + dbgfs_nr_ctxs++; + + return 0; +} + +static ssize_t dbgfs_mk_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + char *ctx_name; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + err = dbgfs_mk_context(ctx_name); + if (err) + ret = err; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +/* + * Remove a context of @name and its debugfs directory. 
+ * + * This function should be called while holding damon_dbgfs_lock. + * + * Return 0 on success, negative error code otherwise. + */ +static int dbgfs_rm_context(char *name) +{ + struct dentry *root, *dir, **new_dirs; + struct damon_ctx **new_ctxs; + int i, j; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + dir = debugfs_lookup(name, root); + if (!dir) + return -ENOENT; + + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), + GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + + new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), + GFP_KERNEL); + if (!new_ctxs) { + kfree(new_dirs); + return -ENOMEM; + } + + for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { + if (dbgfs_dirs[i] == dir) { + debugfs_remove(dbgfs_dirs[i]); + dbgfs_destroy_ctx(dbgfs_ctxs[i]); + continue; + } + new_dirs[j] = dbgfs_dirs[i]; + new_ctxs[j++] = dbgfs_ctxs[i]; + } + + kfree(dbgfs_dirs); + kfree(dbgfs_ctxs); + + dbgfs_dirs = new_dirs; + dbgfs_ctxs = new_ctxs; + dbgfs_nr_ctxs--; + + return 0; +} + +static ssize_t dbgfs_rm_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret = count; + int err; + char *ctx_name; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + err = dbgfs_rm_context(ctx_name); + if (err) + ret = err; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + static ssize_t dbgfs_monitor_on_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -370,6 +551,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return ret; } +static const struct file_operations mk_contexts_fops = { + .write = dbgfs_mk_context_write, +}; + +static const struct file_operations rm_contexts_fops = { + .write = dbgfs_rm_context_write, +}; + static const struct file_operations monitor_on_fops = { .read = dbgfs_monitor_on_read, .write = dbgfs_monitor_on_write, @@ -378,8 +567,10 @@ static const struct file_operations monitor_on_fops = { static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; - const char * const file_names[] = {"monitor_on"}; - const struct file_operations *fops[] = {&monitor_on_fops}; + const char * const file_names[] = {"mk_contexts", "rm_contexts", + "monitor_on"}; + const struct file_operations *fops[] = {&mk_contexts_fops, + &rm_contexts_fops, &monitor_on_fops}; int i; dbgfs_root = debugfs_create_dir("damon", NULL); From 5fe381676c1f13e3358a9fe4e8215e7c0252beca Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 14:02:48 +0100 Subject: [PATCH 178/737] arm64: lto: Strengthen READ_ONCE() to acquire when CONFIG_LTO=y When building with LTO, there is an increased risk of the compiler converting an address dependency headed by a READ_ONCE() invocation into a control dependency and consequently allowing for harmful reordering by the CPU. Ensure that such transformations are harmless by overriding the generic READ_ONCE() definition with one that provides acquire semantics when building with LTO. 
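As a contrived illustration of the transformation being defended
against (the names below are made up)::

    struct foo *p = READ_ONCE(gp);
    x = p->data;    /* ordered after the load of gp by address dependency */

If LTO gives the compiler enough visibility to prove that 'gp' can only
point to, say, 'a' or 'b', it may rewrite the second load as
'if (p == &a) x = a.data; else x = b.data;'. That turns the address
dependency into a control dependency, and control dependencies do not
order subsequent loads, so the CPU may satisfy the load of 'x' before
the load of 'gp'. Promoting READ_ONCE() to an RCpc acquire restores the
ordering even after such a rewrite.
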
Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon (cherry picked from commit e35123d83ee35c31f64ecfbdfabbe5142d3025b8) --- arch/arm64/include/asm/rwonce.h | 73 +++++++++++++++++++++++++++++++ arch/arm64/kernel/vdso/Makefile | 2 +- arch/arm64/kernel/vdso32/Makefile | 2 +- arch/arm64/kernel/vmlinux.lds.S | 2 +- 4 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/include/asm/rwonce.h diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h new file mode 100644 index 0000000000000..1bce62fa908a3 --- /dev/null +++ b/arch/arm64/include/asm/rwonce.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + */ +#ifndef __ASM_RWONCE_H +#define __ASM_RWONCE_H + +#ifdef CONFIG_LTO + +#include +#include + +#ifndef BUILD_VDSO + +#ifdef CONFIG_AS_HAS_LDAPR +#define __LOAD_RCPC(sfx, regs...) \ + ALTERNATIVE( \ + "ldar" #sfx "\t" #regs, \ + ".arch_extension rcpc\n" \ + "ldapr" #sfx "\t" #regs, \ + ARM64_HAS_LDAPR) +#else +#define __LOAD_RCPC(sfx, regs...) "ldar" #sfx "\t" #regs +#endif /* CONFIG_AS_HAS_LDAPR */ + +/* + * When building with LTO, there is an increased risk of the compiler + * converting an address dependency headed by a READ_ONCE() invocation + * into a control dependency and consequently allowing for harmful + * reordering by the CPU. + * + * Ensure that such transformations are harmless by overriding the generic + * READ_ONCE() definition with one that provides RCpc acquire semantics + * when building with LTO. + */ +#define __READ_ONCE(x) \ +({ \ + typeof(&(x)) __x = &(x); \ + int atomic = 1; \ + union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u; \ + switch (sizeof(x)) { \ + case 1: \ + asm volatile(__LOAD_RCPC(b, %w0, %1) \ + : "=r" (*(__u8 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 2: \ + asm volatile(__LOAD_RCPC(h, %w0, %1) \ + : "=r" (*(__u16 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 4: \ + asm volatile(__LOAD_RCPC(, %w0, %1) \ + : "=r" (*(__u32 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 8: \ + asm volatile(__LOAD_RCPC(, %0, %1) \ + : "=r" (*(__u64 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + default: \ + atomic = 0; \ + } \ + atomic ? (typeof(*__x))__u.__val : (*(volatile typeof(__x))__x);\ +}) + +#endif /* !BUILD_VDSO */ +#endif /* CONFIG_LTO */ + +#include + +#endif /* __ASM_RWONCE_H */ diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index d65f52264abae..a8f8e409e2bfb 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -28,7 +28,7 @@ ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 -ccflags-y += -DDISABLE_BRANCH_PROFILING +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) KASAN_SANITIZE := n diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index abad38c576e1d..57b28f1e5d97c 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -37,7 +37,7 @@ cc32-as-instr = $(call try-run,\ # As a result we set our own flags here. 
# KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile -VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc VDSO_CPPFLAGS += -isystem $(shell $(CC_COMPAT) -print-file-name=include 2>/dev/null) VDSO_CPPFLAGS += $(LINUXINCLUDE) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71f4b5f24d15f..31fdb55cd4e22 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -199,7 +199,7 @@ SECTIONS INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.rodata.* .init.bss) /* from the EFI stub */ } .exit.data : { EXIT_DATA From 2289d282c90f55f862f20e7479b2efaf5ea0d16a Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 19 Nov 2020 09:45:40 +0800 Subject: [PATCH 179/737] arm64: vmlinux.lds.S: Drop redundant *.init.rodata.* We currently try to emit *.init.rodata.* twice, once in INIT_DATA, and once in the line immediately following it. As the two section definitions are identical, the latter is redundant and can be dropped. This patch drops the redundant *.init.rodata.* section definition. Signed-off-by: Youling Tang Acked-by: Will Deacon Link: https://lore.kernel.org/r/1605750340-910-1-git-send-email-tangyouling@loongson.cn Signed-off-by: Catalin Marinas (cherry picked from commit 344f2db2a18af45faafce13133c84c4f076876a6) --- arch/arm64/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 31fdb55cd4e22..b079c1b1259f4 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -199,7 +199,7 @@ SECTIONS INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.altinstructions .init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.bss) /* from the EFI stub */ } .exit.data : { EXIT_DATA From 3856d77eab56e88e191793586ecee40090d0e29f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 11 Dec 2020 10:46:32 -0800 Subject: [PATCH 180/737] arm64: disable recordmcount with DYNAMIC_FTRACE_WITH_REGS DYNAMIC_FTRACE_WITH_REGS uses -fpatchable-function-entry, which makes running recordmcount unnecessary as there are no mcount calls in object files, and __mcount_loc doesn't need to be generated. While there's normally no harm in running recordmcount even when it's not strictly needed, this won't work with LTO as we have LLVM bitcode instead of ELF objects. This change selects FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY, which disables recordmcount when patchable function entries are used instead. 
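For illustration, with -fpatchable-function-entry=2 the compiler
reserves two NOPs at each function entry instead of emitting an mcount
call, roughly::

    foo:
            nop             // patched by ftrace at runtime, e.g. to save
            nop             // the link register and branch to the
                            // ftrace trampoline
            ...             // function body

so there are no mcount call sites for recordmcount to collect and no
__mcount_loc section to generate from the object files.
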
Signed-off-by: Sami Tolvanen Acked-by: Will Deacon Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201211184633.3213045-16-samitolvanen@google.com (cherry picked from commit a31d793dbabd9251b5f46fb885a307b042bc79fe) --- arch/arm64/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 63fc7f80ad918..9bb7463c09404 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -159,6 +159,8 @@ config ARM64 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS \ if $(cc-option,-fpatchable-function-entry=2) + select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ + if DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD From 6fa290b9c69891658a1a47215f18424f57417af3 Mon Sep 17 00:00:00 2001 From: Jianlin Lv Date: Tue, 12 Jan 2021 09:58:13 +0800 Subject: [PATCH 181/737] arm64: rename S_FRAME_SIZE to PT_REGS_SIZE S_FRAME_SIZE is the size of the pt_regs structure, no longer the size of the kernel stack frame, the name is misleading. In keeping with arm32, rename S_FRAME_SIZE to PT_REGS_SIZE. Signed-off-by: Jianlin Lv Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210112015813.2340969-1-Jianlin.Lv@arm.com Signed-off-by: Catalin Marinas (cherry picked from commit 71e70184f1d1314ad56e834d1befc07daa2af8e6) --- arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/entry-ftrace.S | 12 ++++++------ arch/arm64/kernel/entry.S | 18 +++++++++--------- arch/arm64/kernel/probes/kprobes_trampoline.S | 6 +++--- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7d32fc959b1a0..c10c4d8313aa7 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -73,7 +73,7 @@ int main(void) DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); + DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_COMPAT DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 67f68c9ef94c4..8cf970d219f5d 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -35,7 +35,7 @@ */ .macro ftrace_regs_entry, allregs=0 /* Make room for pt_regs, plus a callee frame */ - sub sp, sp, #(S_FRAME_SIZE + 16) + sub sp, sp, #(PT_REGS_SIZE + 16) /* Save function arguments (and x9 for simplicity) */ stp x0, x1, [sp, #S_X0] @@ -61,15 +61,15 @@ .endif /* Save the callsite's SP and LR */ - add x10, sp, #(S_FRAME_SIZE + 16) + add x10, sp, #(PT_REGS_SIZE + 16) stp x9, x10, [sp, #S_LR] /* Save the PC after the ftrace callsite */ str x30, [sp, #S_PC] /* Create a frame record for the callsite above pt_regs */ - stp x29, x9, [sp, #S_FRAME_SIZE] - add x29, sp, #S_FRAME_SIZE + stp x29, x9, [sp, #PT_REGS_SIZE] + add x29, sp, #PT_REGS_SIZE /* Create our frame record within pt_regs. 
*/ stp x29, x30, [sp, #S_STACKFRAME] @@ -126,7 +126,7 @@ ftrace_common_return: ldr x9, [sp, #S_PC] /* Restore the callsite's SP */ - add sp, sp, #S_FRAME_SIZE + 16 + add sp, sp, #PT_REGS_SIZE + 16 ret x9 SYM_CODE_END(ftrace_common) @@ -136,7 +136,7 @@ SYM_CODE_START(ftrace_graph_caller) ldr x0, [sp, #S_PC] sub x0, x0, #AARCH64_INSN_SIZE // ip (callsite's BL insn) add x1, sp, #S_LR // parent_ip (callsite's LR) - ldr x2, [sp, #S_FRAME_SIZE] // parent fp (callsite's FP) + ldr x2, [sp, #PT_REGS_SIZE] // parent fp (callsite's FP) bl prepare_ftrace_return b ftrace_common_return SYM_CODE_END(ftrace_graph_caller) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 55e477f73158d..f0d938540a6ef 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -78,7 +78,7 @@ .Lskip_tramp_vectors_cleanup\@: .endif - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. @@ -99,7 +99,7 @@ * userspace, and can clobber EL0 registers to free up GPRs. */ - /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ @@ -223,7 +223,7 @@ alternative_else_nop_endif scs_load_current .else - add x21, sp, #S_FRAME_SIZE + add x21, sp, #PT_REGS_SIZE get_current_task tsk /* Save the task's original addr_limit and set USER_DS */ ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] @@ -362,7 +362,7 @@ alternative_else_nop_endif .if \el == 0 alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp eret alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 @@ -376,7 +376,7 @@ alternative_else_nop_endif #endif .else ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp /* Ensure any device/NC reads complete */ alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 @@ -591,12 +591,12 @@ __bad_stack: /* * Store the original GPRs to the new stack. The orginal SP (minus - * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + * PT_REGS_SIZE) was stashed in tpidr_el0 by kernel_ventry. 
*/ - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE kernel_entry 1 mrs x0, tpidr_el0 - add x0, x0, #S_FRAME_SIZE + add x0, x0, #PT_REGS_SIZE str x0, [sp, #S_SP] /* Stash the regs for handle_bad_stack */ @@ -892,7 +892,7 @@ alternative_else_nop_endif .if \regsize == 64 mrs x29, far_el1 .endif - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp eret sb .endm diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 890ca72c5a514..288a84e253ccb 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -25,7 +25,7 @@ stp x24, x25, [sp, #S_X24] stp x26, x27, [sp, #S_X26] stp x28, x29, [sp, #S_X28] - add x0, sp, #S_FRAME_SIZE + add x0, sp, #PT_REGS_SIZE stp lr, x0, [sp, #S_LR] /* * Construct a useful saved PSTATE @@ -62,7 +62,7 @@ .endm SYM_CODE_START(kretprobe_trampoline) - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE save_all_base_regs @@ -76,7 +76,7 @@ SYM_CODE_START(kretprobe_trampoline) restore_all_base_regs - add sp, sp, #S_FRAME_SIZE + add sp, sp, #PT_REGS_SIZE ret SYM_CODE_END(kretprobe_trampoline) From 942a32539d2eedbdbdec193472e8860a40dedf6a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 Jan 2021 17:31:55 +0000 Subject: [PATCH 182/737] arm64: remove EL0 exception frame record When entering an exception from EL0, the entry code creates a synthetic frame record with a NULL PC. This was used by the code introduced in commit: 7326749801396105 ("arm64: unwind: reference pt_regs via embedded stack frame") ... to discover exception entries on the stack and dump the associated pt_regs. Since the NULL PC was undesirable for the stacktrace, we added a special case to unwind_frame() to prevent the NULL PC from being logged. Since commit: a25ffd3a6302a678 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") ... we no longer try to dump the pt_regs as part of a stacktrace, and hence no longer need the synthetic exception record. This patch removes the synthetic exception record and the associated special case in unwind_frame(). Instead, EL0 exceptions set the FP to NULL, as is the case for other terminal records (e.g. when a kernel thread starts). The synthetic record for exceptions from EL1 is retrained as this has useful unwind information for the interrupted context. To make the terminal case a bit clearer, an explicit check is added to the start of unwind_frame(). This would otherwise be caught implicitly by the on_accessible_stack() checks. Reported-by: Mark Brown Signed-off-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210113173155.43063-1-broonie@kernel.org Signed-off-by: Will Deacon (cherry picked from commit 6106e1112cc69a367f495da2e66f13e2bca369fb) --- arch/arm64/kernel/entry.S | 10 +++++----- arch/arm64/kernel/stacktrace.c | 13 ++++--------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f0d938540a6ef..3e699df094814 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -237,16 +237,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * In order to be able to dump the contents of struct pt_regs at the - * time the exception was taken (in case we attempt to walk the call - * stack later), chain it together with the stack frames. + * For exceptions from EL0, terminate the callchain here. 
+ * For exceptions from EL1, create a synthetic frame record so the + * interrupted code shows up in the backtrace. */ .if \el == 0 - stp xzr, xzr, [sp, #S_STACKFRAME] + mov x29, xzr .else stp x29, x22, [sp, #S_STACKFRAME] - .endif add x29, sp, #S_STACKFRAME + .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN alternative_if_not ARM64_HAS_PAN diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index c445828ecc3aa..7aeb6d863c67a 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,6 +44,10 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; + /* Terminal record; nothing to unwind */ + if (!fp) + return -EINVAL; + if (fp & 0xf) return -EINVAL; @@ -104,15 +108,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); - /* - * Frames created upon entry from EL0 have NULL FP and PC values, so - * don't bother reporting these. Frames created by __noreturn functions - * might have a valid FP even if PC is bogus, so only terminate where - * both are NULL. - */ - if (!frame->fp && !frame->pc) - return -EINVAL; - return 0; } NOKPROBE_SYMBOL(unwind_frame); From 8d45c26caaa41ead36947d0e5bf8b544fa17fbb9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Feb 2021 16:50:37 +0000 Subject: [PATCH 183/737] arm64: stacktrace: Report when we reach the end of the stack Currently the arm64 unwinder code returns -EINVAL whenever it can't find the next stack frame, not distinguishing between cases where the stack has been corrupted or is otherwise in a state it shouldn't be and cases where we have reached the end of the stack. At the minute none of the callers care what error code is returned but this will be important for reliable stack trace which needs to be sure that the stack is intact. Change to return -ENOENT in the case where we reach the bottom of the stack. The error codes from this function are only used in kernel, this particular code is chosen as we are indicating that we know there is no frame there. Signed-off-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210224165037.24138-1-broonie@kernel.org Signed-off-by: Will Deacon (cherry picked from commit 3c02600144bdb0a1280a9090d3a7e37e2f9fdcc8) --- arch/arm64/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7aeb6d863c67a..9b3f5efa71190 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -46,7 +46,7 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) /* Terminal record; nothing to unwind */ if (!fp) - return -EINVAL; + return -ENOENT; if (fp & 0xf) return -EINVAL; From fe278a21e0f4fc332ed0202a34fac40426bfa4c4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 29 Apr 2021 11:20:04 +0100 Subject: [PATCH 184/737] arm64: stacktrace: restore terminal records We removed the terminal frame records in commit: 6106e1112cc69a36 ("arm64: remove EL0 exception frame record") ... on the assumption that as we no longer used them to find the pt_regs at exception boundaries, they were no longer necessary. However, Leo reports that as an unintended side-effect, this causes traces which cross secondary_start_kernel to terminate one entry too late, with a spurious "0" entry. 
There are a few ways we could sovle this, but as we're planning to use terminal records for RELIABLE_STACKTRACE, let's revert the logic change for now, keeping the update comments and accounting for the changes in commit: 3c02600144bdb0a1 ("arm64: stacktrace: Report when we reach the end of the stack") This is effectively a partial revert of commit: 6106e1112cc69a36 ("arm64: remove EL0 exception frame record") Signed-off-by: Mark Rutland Fixes: 6106e1112cc6 ("arm64: remove EL0 exception frame record") Reported-by: Leo Yan Tested-by: Leo Yan Cc: Will Deacon Cc: Mark Brown Cc: "Madhavan T. Venkataraman" Link: https://lore.kernel.org/r/20210429104813.GA33550@C02TD0UTHF1T.local Signed-off-by: Catalin Marinas (cherry picked from commit 8533d5bfad41e74b7dd80d292fd484913cdfb374) --- arch/arm64/kernel/entry.S | 6 +++--- arch/arm64/kernel/stacktrace.c | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3e699df094814..b2338888901e5 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -237,16 +237,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, terminate the callchain here. + * For exceptions from EL0, create a terminal frame record. * For exceptions from EL1, create a synthetic frame record so the * interrupted code shows up in the backtrace. */ .if \el == 0 - mov x29, xzr + stp xzr, xzr, [sp, #S_STACKFRAME] .else stp x29, x22, [sp, #S_STACKFRAME] - add x29, sp, #S_STACKFRAME .endif + add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN alternative_if_not ARM64_HAS_PAN diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 9b3f5efa71190..b1518ce7c9543 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,10 +44,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; - /* Terminal record; nothing to unwind */ - if (!fp) - return -ENOENT; - if (fp & 0xf) return -EINVAL; @@ -108,6 +104,12 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); + /* + * This is a terminal record, so we have finished unwinding. + */ + if (!frame->fp && !frame->pc) + return -ENOENT; + return 0; } NOKPROBE_SYMBOL(unwind_frame); From 97e56fbd40802783bf80c68a586a99781426d8b3 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Mon, 10 May 2021 12:00:26 +0100 Subject: [PATCH 185/737] arm64: Implement stack trace termination record Reliable stacktracing requires that we identify when a stacktrace is terminated early. We can do this by ensuring all tasks have a final frame record at a known location on their task stack, and checking that this is the final frame record in the chain. We'd like to use task_pt_regs(task)->stackframe as the final frame record, as this is already setup upon exception entry from EL0. For kernel tasks we need to consistently reserve the pt_regs and point x29 at this, which we can do with small changes to __primary_switched, __secondary_switched, and copy_process(). Since the final frame record must be at a specific location, we must create the final frame record in __primary_switched and __secondary_switched rather than leaving this to start_kernel and secondary_start_kernel. Thus, __primary_switched and __secondary_switched will now show up in stacktraces for the idle tasks. 
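In essence, the unwinder can then terminate purely by address comparison, before dereferencing anything. A minimal sketch of that check follows (is_final_frame() is a hypothetical helper used only for illustration; the real change is the unwind_frame() hunk below):

	/* Sketch: the final frame record lives at a known location inside the
	 * task's reserved pt_regs, so compare addresses, not contents. */
	static bool is_final_frame(struct task_struct *tsk, unsigned long fp)
	{
		return fp == (unsigned long)task_pt_regs(tsk)->stackframe;
	}

	/* At the top of unwind_frame(): */
	if (is_final_frame(tsk, frame->fp))
		return -ENOENT;	/* end of stack; nothing left to unwind */
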
Since the final frame record is now identified by its location rather than by its contents, we identify it at the start of unwind_frame(), before we read any values from it. External debuggers may terminate the stack trace when FP == 0. In the pt_regs->stackframe, the PC is 0 as well. So, stack traces taken in the debugger may print an extra record 0x0 at the end. While this is not pretty, this does not do any harm. This is a small price to pay for having reliable stack trace termination in the kernel. That said, gdb does not show the extra record probably because it uses DWARF and not frame pointers for stack traces. Signed-off-by: Madhavan T. Venkataraman Reviewed-by: Mark Brown [Mark: rebase, use ASM_BUG(), update comments, update commit message] Signed-off-by: Mark Rutland Link: https://lore.kernel.org/r/20210510110026.18061-1-mark.rutland@arm.com Signed-off-by: Will Deacon (cherry picked from commit 7d7b720a4b8049446cffce870b1dd3ffa89d4b40) --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/head.S | 25 +++++++++++++++++++------ arch/arm64/kernel/process.c | 5 +++++ arch/arm64/kernel/stacktrace.c | 16 +++++++--------- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b2338888901e5..45c3d259d03c2 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -237,7 +237,7 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a terminal frame record. + * For exceptions from EL0, create a final frame record. * For exceptions from EL1, create a synthetic frame record so the * interrupted code shows up in the backtrace. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 351ee64c7deb4..f969e0b3cf969 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -409,6 +410,18 @@ SYM_FUNC_START_LOCAL(__create_page_tables) ret x28 SYM_FUNC_END(__create_page_tables) + /* + * Create a final frame record at task_pt_regs(current)->stackframe, so + * that the unwinder can identify the final frame record of any task by + * its location in the task stack. We reserve the entire pt_regs space + * for consistency with user tasks and kthreads. + */ + .macro setup_final_frame + sub sp, sp, #PT_REGS_SIZE + stp xzr, xzr, [sp, #S_STACKFRAME] + add x29, sp, #S_STACKFRAME + .endm + /* * The following fragment of code is executed with the MMU enabled. 
* @@ -464,9 +477,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) 0: #endif add sp, sp, #16 - mov x29, #0 - mov x30, #0 - b start_kernel + setup_final_frame + bl start_kernel + ASM_BUG() SYM_FUNC_END(__primary_switched) .pushsection ".rodata", "a" @@ -748,14 +761,14 @@ SYM_FUNC_START_LOCAL(__secondary_switched) cbz x2, __secondary_too_slow msr sp_el0, x2 scs_load_current - mov x29, #0 - mov x30, #0 + setup_final_frame #ifdef CONFIG_ARM64_PTR_AUTH ptrauth_keys_init_cpu x2, x3, x4, x5 #endif - b secondary_start_kernel + bl secondary_start_kernel + ASM_BUG() SYM_FUNC_END(__secondary_switched) SYM_FUNC_START_LOCAL(__secondary_too_slow) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 3696dbcbfa80c..f80bc9dc43df8 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -438,6 +438,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; + /* + * For the benefit of the unwinder, set up childregs->stackframe + * as the final frame for the new task. + */ + p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; ptrace_hw_copy_thread(p); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b1518ce7c9543..fcbe659d1aaee 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,12 +44,16 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; - if (fp & 0xf) - return -EINVAL; - if (!tsk) tsk = current; + /* Final frame; nothing to unwind */ + if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + if (fp & 0xf) + return -EINVAL; + if (!on_accessible_stack(tsk, fp, &info)) return -EINVAL; @@ -104,12 +108,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); - /* - * This is a terminal record, so we have finished unwinding. - */ - if (!frame->fp && !frame->pc) - return -ENOENT; - return 0; } NOKPROBE_SYMBOL(unwind_frame); From ecad26179955284579e31d87ff93d7bd50683ce3 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Wed, 26 May 2021 16:49:16 -0500 Subject: [PATCH 186/737] arm64: Introduce stack trace reliability checks in the unwinder The unwinder should check for the presence of various features and conditions that can render the stack trace unreliable and mark the the stack trace as unreliable for the benefit of the caller. Introduce the first reliability check - If a return PC is not a valid kernel text address, consider the stack trace unreliable. It could be some generated code. Other reliability checks will be added in the future. Signed-off-by: Madhavan T. Venkataraman --- arch/arm64/include/asm/stacktrace.h | 9 +++++++ arch/arm64/kernel/stacktrace.c | 38 +++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index eb29b1fe8255e..4c822ef7f5885 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -49,6 +49,13 @@ struct stack_info { * * @graph: When FUNCTION_GRAPH_TRACER is selected, holds the index of a * replacement lr value in the ftrace graph stack. + * + * @reliable: Is this stack frame reliable? There are several checks that + * need to be performed in unwind_frame() before a stack frame + * is truly reliable. 
Until all the checks are present, this flag + * is just a place holder. Once all the checks are implemented, + * this comment will be updated and the flag can be used by the + * caller of unwind_frame(). */ struct stackframe { unsigned long fp; @@ -59,6 +66,7 @@ struct stackframe { #ifdef CONFIG_FUNCTION_GRAPH_TRACER int graph; #endif + bool reliable; }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); @@ -169,6 +177,7 @@ static inline void start_backtrace(struct stackframe *frame, bitmap_zero(frame->stacks_done, __NR_STACK_TYPES); frame->prev_fp = 0; frame->prev_type = STACK_TYPE_UNKNOWN; + frame->reliable = true; } #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index fcbe659d1aaee..b213c3ab7c281 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,6 +44,8 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; + frame->reliable = true; + if (!tsk) tsk = current; @@ -51,14 +53,20 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - if (fp & 0xf) + if (fp & 0xf) { + frame->reliable = false; return -EINVAL; + } - if (!on_accessible_stack(tsk, fp, &info)) + if (!on_accessible_stack(tsk, fp, &info)) { + frame->reliable = false; return -EINVAL; + } - if (test_bit(info.type, frame->stacks_done)) + if (test_bit(info.type, frame->stacks_done)) { + frame->reliable = false; return -EINVAL; + } /* * As stacks grow downward, any valid record on the same stack must be @@ -74,8 +82,10 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) * stack. */ if (info.type == frame->prev_type) { - if (fp <= frame->prev_fp) + if (fp <= frame->prev_fp) { + frame->reliable = false; return -EINVAL; + } } else { set_bit(frame->prev_type, frame->stacks_done); } @@ -100,14 +110,32 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) * So replace it to an original value. */ ret_stack = ftrace_graph_get_ret_stack(tsk, frame->graph++); - if (WARN_ON_ONCE(!ret_stack)) + if (WARN_ON_ONCE(!ret_stack)) { + frame->reliable = false; return -EINVAL; + } frame->pc = ret_stack->ret; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ frame->pc = ptrauth_strip_insn_pac(frame->pc); + /* + * Check the return PC for conditions that make unwinding unreliable. + * In each case, mark the stack trace as such. + */ + + /* + * Make sure that the return address is a proper kernel text address. + * A NULL or invalid return address could mean: + * + * - generated code such as eBPF and optprobe trampolines + * - Foreign code (e.g. EFI runtime services) + * - Procedure Linkage Table (PLT) entries and veneer functions + */ + if (!__kernel_text_address(frame->pc)) + frame->reliable = false; + return 0; } NOKPROBE_SYMBOL(unwind_frame); From 9b08f52ca6b815721cb19f81271f2128c90d8a26 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Wed, 26 May 2021 16:49:17 -0500 Subject: [PATCH 187/737] arm64: Create a list of SYM_CODE functions, check return PC against list The unwinder should check if the return PC falls in any function that is considered unreliable from an unwinding perspective. If it does, mark the stack trace unreliable. Function types ============== The compiler generates code for C functions and assigns the type STT_FUNC to them. 
Assembly functions are manually assigned a type: - STT_FUNC for functions defined with SYM_FUNC*() macros - STT_NONE for functions defined with SYM_CODE*() macros In the future, STT_FUNC functions will be analyzed by objtool and "fixed" as necessary. So, they are not "interesting" to the reliable unwinder in the kernel. That leaves SYM_CODE*() functions. These contain low-level code that is difficult or impossible for objtool to analyze. So, objtool ignores them leaving them to the reliable unwinder. These functions must be considered unreliable from an unwinding perspective. Define a special section for unreliable functions ================================================= Define a SYM_CODE_END() macro for arm64 that adds the function address range to a new section called "sym_code_functions". Linker file =========== Include the "sym_code_functions" section under initdata in vmlinux.lds.S. Initialization ============== Define an early_initcall() to copy the function address ranges from the "sym_code_functions" section to an array by the same name. Unwinder check ============== Define a function called unwinder_is_unreliable() that compares a return PC with sym_code_functions[]. If there is a match, then mark the stack trace as unreliable. Call unwinder_is_unreliable() from unwind_frame(). Signed-off-by: Madhavan T. Venkataraman [Move final frame check; if a SYM_CODE function occurs in the very last frame in the stack trace then it is not considered unreliable because there is no more unwinding to do] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/linkage.h | 12 +++ arch/arm64/include/asm/sections.h | 1 + arch/arm64/kernel/stacktrace.c | 127 ++++++++++++++++++++++++++++-- arch/arm64/kernel/vmlinux.lds.S | 7 ++ 4 files changed, 142 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index ba89a9af820ab..3b5f1fd332b01 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -60,4 +60,16 @@ SYM_FUNC_END(x); \ SYM_FUNC_END_ALIAS(__pi_##x) +/* + * Record the address range of each SYM_CODE function in a struct code_range + * in a special section. 
+ */ +#define SYM_CODE_END(name) \ + SYM_END(name, SYM_T_NONE) ;\ + 99: ;\ + .pushsection "sym_code_functions", "aw" ;\ + .quad name ;\ + .quad 99b ;\ + .popsection + #endif diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 6a45c26da46e3..f17dbece80bb2 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -18,6 +18,7 @@ extern char __exittext_begin[], __exittext_end[]; extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; +extern char __sym_code_functions_start[], __sym_code_functions_end[]; static inline size_t entry_tramp_text_size(void) { diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b213c3ab7c281..500925251ef70 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -12,12 +12,116 @@ #include #include #include +#include #include #include #include #include +struct code_range { + unsigned long start; + unsigned long end; +}; + +static struct code_range *sym_code_functions; +static int num_sym_code_functions; + +int __init init_sym_code_functions(void) +{ + size_t size; + + size = (unsigned long)__sym_code_functions_end - + (unsigned long)__sym_code_functions_start; + + sym_code_functions = kmalloc(size, GFP_KERNEL); + if (!sym_code_functions) + return -ENOMEM; + + memcpy(sym_code_functions, __sym_code_functions_start, size); + /* Update num_sym_code_functions after copying sym_code_functions. */ + smp_mb(); + num_sym_code_functions = size / sizeof(struct code_range); + + return 0; +} +early_initcall(init_sym_code_functions); + +/* + * Check the return PC against sym_code_functions[]. If there is a match, then + * the consider the stack frame unreliable. These functions contain low-level + * code where the frame pointer and/or the return address register cannot be + * relied upon. This addresses the following situations: + * + * - Exception handlers and entry assembly + * - Trampoline assembly (e.g., ftrace, kprobes) + * - Hypervisor-related assembly + * - Hibernation-related assembly + * - CPU start-stop, suspend-resume assembly + * - Kernel relocation assembly + * + * Some special cases covered by sym_code_functions[] deserve a mention here: + * + * - All EL1 interrupt and exception stack traces will be considered + * unreliable. This is the correct behavior as interrupts and exceptions + * can happen on any instruction including ones in the frame pointer + * prolog and epilog. Unless stack metadata is available so the unwinder + * can unwind through these special cases, such stack traces will be + * considered unreliable. + * + * - A task can get preempted at the end of an interrupt. Stack traces + * of preempted tasks will show the interrupt frame in the stack trace + * and will be considered unreliable. + * + * - Breakpoints are exceptions. So, all stack traces in the break point + * handler (including probes) will be considered unreliable. + * + * - All of the ftrace entry trampolines are considered unreliable. So, + * all stack traces taken from tracer functions will be considered + * unreliable. + * + * - The Function Graph Tracer return trampoline (return_to_handler) + * and the Kretprobe return trampoline (kretprobe_trampoline) are + * also considered unreliable. + * + * Some of the special cases above can be unwound through using special logic + * in unwind_frame(). 
+ * + * - return_to_handler() is handled by the unwinder by attempting to + * retrieve the original return address from the per-task return + * address stack. + * + * - kretprobe_trampoline() can be handled in a similar fashion by + * attempting to retrieve the original return address from the per-task + * kretprobe instance list. + * + * - I reckon optprobes can be handled in a similar fashion in the future? + * + * - Stack traces taken from the FTrace tracer functions can be handled + * as well. ftrace_call is an inner label defined in the Ftrace entry + * trampoline. This is the location where the call to a tracer function + * is patched. So, if the return PC equals ftrace_call+4, it is + * reliable. At that point, proper stack frames have already been set + * up for the traced function and its caller. + */ +static bool unwinder_is_unreliable(unsigned long pc) +{ + const struct code_range *range; + int i; + + /* + * If sym_code_functions[] were sorted, a binary search could be + * done to make this more performant. + */ + for (i = 0; i < num_sym_code_functions; i++) { + range = &sym_code_functions[i]; + if (pc >= range->start && pc < range->end) + return true; + } + + return false; +} + /* * AArch64 PCS assigns the frame pointer to x29. * @@ -49,10 +153,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) if (!tsk) tsk = current; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; - if (fp & 0xf) { frame->reliable = false; return -EINVAL; @@ -133,7 +233,24 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) * - Foreign code (e.g. EFI runtime services) * - Procedure Linkage Table (PLT) entries and veneer functions */ - if (!__kernel_text_address(frame->pc)) + if (!__kernel_text_address(frame->pc)) { + frame->reliable = false; + return 0; + } + + /* + * If the final frame has been reached, there is no more unwinding + * to do. There is no need to check if the return PC is considered + * unreliable by the unwinder. + */ + if (!frame->fp) + return 0; + + /* Final frame; nothing to unwind */ + if (frame->fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + if (unwinder_is_unreliable(frame->pc)) frame->reliable = false; return 0; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index b079c1b1259f4..df578971e373a 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -85,6 +85,12 @@ jiffies = jiffies_64; #define TRAMP_TEXT #endif +#define SYM_CODE_FUNCTIONS \ + . = ALIGN(16); \ + __sym_code_functions_start = .; \ + KEEP(*(sym_code_functions)) \ + __sym_code_functions_end = .; + /* * The size of the PE/COFF section that covers the kernel image, which * runs from _stext to _edata, must be a round multiple of the PE/COFF @@ -200,6 +206,7 @@ SECTIONS CON_INITCALL INIT_RAM_FS *(.init.altinstructions .init.bss) /* from the EFI stub */ + SYM_CODE_FUNCTIONS } .exit.data : { EXIT_DATA From 7019b0788290831c96b81792a1c2ac5e39d5d1e0 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Mon, 15 Mar 2021 11:58:00 -0500 Subject: [PATCH 188/737] arm64: Implement arch_stack_walk_reliable() unwind_frame() already sets the reliable flag in the stack frame during a stack walk to indicate whether the stack trace is reliable or not. Implement arch_stack_walk_reliable() like arch_stack_walk() but abort the stack walk as soon as the reliable flag is set to false for any reason. Signed-off-by: Madhavan T. 
Venkataraman --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/stacktrace.c | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9bb7463c09404..398068e28a173 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -161,6 +161,7 @@ config ARM64 if $(cc-option,-fpatchable-function-entry=2) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_REGS + select HAVE_RELIABLE_STACKTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 500925251ef70..4fd04613ebc94 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -358,4 +358,39 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, walk_stackframe(task, &frame, consume_entry, cookie); } +/* + * Walk the stack like arch_stack_walk() but stop the walk as soon as + * some unreliability is detected in the stack. + */ +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct stackframe frame; + int ret = 0; + + if (task == current) { + start_backtrace(&frame, + (unsigned long)__builtin_frame_address(0), + (unsigned long)arch_stack_walk_reliable); + } else { + /* + * The task must not be running anywhere for the duration of + * arch_stack_walk_reliable(). The caller must guarantee + * this. + */ + start_backtrace(&frame, thread_saved_fp(task), + thread_saved_pc(task)); + } + + while (!ret) { + if (!frame.reliable) + return -EINVAL; + if (!consume_entry(cookie, frame.pc)) + return -EINVAL; + ret = unwind_frame(task, &frame); + } + + return ret == -ENOENT ? 0 : -EINVAL; +} + #endif From 623b38261c7c95d8168251ff521cc7638fe0e33d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:03:22 +0200 Subject: [PATCH 189/737] rbtree: Add generic add and find helpers I've always been bothered by the endless (fragile) boilerplate for rbtree, and I recently wrote some rbtree helpers for objtool and figured I should lift them into the kernel and use them more widely. Provide: partial-order; less() based: - rb_add(): add a new entry to the rbtree - rb_add_cached(): like rb_add(), but for a rb_root_cached total-order; cmp() based: - rb_find(): find an entry in an rbtree - rb_find_add(): find an entry, and add if not found - rb_find_first(): find the first (leftmost) matching entry - rb_next_match(): continue from rb_find_first() - rb_for_each(): iterate a sub-tree using the previous two Inlining and constant propagation should see the compiler inline the whole thing, including the various compare functions. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Michel Lespinasse Acked-by: Davidlohr Bueso (cherry picked from commit 2d24dd5798d0474d9bf705bfca8725e7d20f9d54) --- include/linux/rbtree.h | 190 ++++++++++++++++++++++++++++++++++ tools/include/linux/rbtree.h | 192 ++++++++++++++++++++++++++++++++++- tools/objtool/elf.c | 73 ++----------- 3 files changed, 392 insertions(+), 63 deletions(-) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d7db179963221..e0b300de8f3fa 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -158,4 +158,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); } +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. 
The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. + */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. 
+ */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + #endif /* _LINUX_RBTREE_H */ diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index 30dd21f976c30..2680f2edb837a 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -152,4 +152,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); } -#endif /* __TOOLS_LINUX_PERF_RBTREE_H */ +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. 
+ */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. 
+ */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + +#endif /* __TOOLS_LINUX_PERF_RBTREE_H */ diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index a2ea3931e01d5..41f26913f5455 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -43,75 +43,24 @@ static void elf_hash_init(struct hlist_head *table) #define elf_hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member) -static void rb_add(struct rb_root *tree, struct rb_node *node, - int (*cmp)(struct rb_node *, const struct rb_node *)) -{ - struct rb_node **link = &tree->rb_node; - struct rb_node *parent = NULL; - - while (*link) { - parent = *link; - if (cmp(node, parent) < 0) - link = &parent->rb_left; - else - link = &parent->rb_right; - } - - rb_link_node(node, parent, link); - rb_insert_color(node, tree); -} - -static struct rb_node *rb_find_first(const struct rb_root *tree, const void *key, - int (*cmp)(const void *key, const struct rb_node *)) -{ - struct rb_node *node = tree->rb_node; - struct rb_node *match = NULL; - - while (node) { - int c = cmp(key, node); - if (c <= 0) { - if (!c) - match = node; - node = node->rb_left; - } else if (c > 0) { - node = node->rb_right; - } - } - - return match; -} - -static struct rb_node *rb_next_match(struct rb_node *node, const void *key, - int (*cmp)(const void *key, const struct rb_node *)) -{ - node = rb_next(node); - if (node && cmp(key, node)) - node = NULL; - return node; -} - -#define rb_for_each(tree, node, key, cmp) \ - for ((node) = rb_find_first((tree), (key), (cmp)); \ - (node); (node) = rb_next_match((node), (key), (cmp))) - -static int symbol_to_offset(struct rb_node *a, const struct rb_node *b) +static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b) { struct symbol *sa = rb_entry(a, struct symbol, node); struct symbol *sb = rb_entry(b, struct symbol, node); if (sa->offset < sb->offset) - return -1; + return true; if (sa->offset > sb->offset) - return 1; + return false; if (sa->len < sb->len) - return -1; + return true; if (sa->len > sb->len) - return 1; + return false; sa->alias = sb; - return 0; + return false; } static int symbol_by_offset(const void *key, const struct rb_node *node) @@ -165,7 +114,7 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->offset == offset && s->type != STT_SECTION) @@ -179,7 +128,7 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->offset == offset && s->type == STT_FUNC) @@ -193,7 +142,7 @@ struct symbol *find_symbol_containing(const struct 
section *sec, unsigned long o { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->type != STT_SECTION) @@ -207,7 +156,7 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) { struct rb_node *node; - rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node); if (s->type == STT_FUNC) @@ -354,7 +303,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; - rb_add(&sym->sec->symbol_tree, &sym->node, symbol_to_offset); + rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); pnode = rb_prev(&sym->node); if (pnode) entry = &rb_entry(pnode, struct symbol, node)->list; From d93abe10dc916cc20ebe9cfd43a99c558c1567d8 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 14 Oct 2020 08:38:00 +0100 Subject: [PATCH 190/737] objtool: Fully validate the stack frame A valid stack frame should contain both the return address and the previous frame pointer value. On x86, the return value is placed on the stack by the calling instructions. On other architectures, the callee needs to explicitly save the return address on the stack. Add the necessary checks to verify a function properly sets up all the elements of the stack frame. Signed-off-by: Julien Thierry Acked-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf (cherry picked from commit fb084fde0c8106bc86df243411751c3421c07c08) --- tools/objtool/check.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 9a0a54194636c..1b7eb66c0593a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2112,12 +2112,20 @@ static bool has_modified_stack_frame(struct instruction *insn, struct insn_state return false; } +static bool check_reg_frame_pos(const struct cfi_reg *reg, + int expected_offset) +{ + return reg->base == CFI_CFA && + reg->offset == expected_offset; +} + static bool has_valid_stack_frame(struct insn_state *state) { struct cfi_state *cfi = &state->cfi; - if (cfi->cfa.base == CFI_BP && cfi->regs[CFI_BP].base == CFI_CFA && - cfi->regs[CFI_BP].offset == -16) + if (cfi->cfa.base == CFI_BP && + check_reg_frame_pos(&cfi->regs[CFI_BP], -cfi->cfa.offset) && + check_reg_frame_pos(&cfi->regs[CFI_RA], -cfi->cfa.offset + 8)) return true; if (cfi->drap && cfi->regs[CFI_BP].base == CFI_BP) @@ -2246,8 +2254,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, case OP_SRC_REG: if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP && cfa->base == CFI_SP && - regs[CFI_BP].base == CFI_CFA && - regs[CFI_BP].offset == -cfa->offset) { + check_reg_frame_pos(®s[CFI_BP], -cfa->offset)) { /* mov %rsp, %rbp */ cfa->base = op->dest.reg; From 9422fdd0e0e0f3c21838ee3564878c0acdfeb753 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 14 Oct 2020 08:38:01 +0100 Subject: [PATCH 191/737] objtool: Support addition to set CFA base On arm64, the compiler can set the frame pointer either with a move operation or with and add operation like: add (SP + constant), BP For a simple move operation, the CFA base is changed from SP to BP. Handle also changing the CFA base when the frame pointer is set with an addition instruction. 
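Condensed, the new case in update_cfi_state() looks like this (lifted from the hunk below; the prologue shown in the comment is a typical arm64 pattern and is only illustrative):

	if (!cfi->drap && op->src.reg == CFI_SP && op->dest.reg == CFI_BP &&
	    cfa->base == CFI_SP &&
	    check_reg_frame_pos(&regs[CFI_BP], -cfa->offset + op->src.offset)) {
		/*
		 * e.g.  sub  sp, sp, #48
		 *       stp  x29, x30, [sp, #16]
		 *       add  x29, sp, #16       // FP = SP + constant
		 * (x86 equivalent: lea disp(%rsp), %rbp)
		 */
		cfa->base = CFI_BP;
		cfa->offset -= op->src.offset;
		cfi->bp_scratch = false;
	}
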
Signed-off-by: Julien Thierry Acked-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf (cherry picked from commit 468af56a7bbaa626da5a4578bedc930d731fba13) --- tools/objtool/check.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 1b7eb66c0593a..8f6c414bf6709 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2339,6 +2339,17 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } + if (!cfi->drap && op->src.reg == CFI_SP && + op->dest.reg == CFI_BP && cfa->base == CFI_SP && + check_reg_frame_pos(®s[CFI_BP], -cfa->offset + op->src.offset)) { + + /* lea disp(%rsp), %rbp */ + cfa->base = CFI_BP; + cfa->offset -= op->src.offset; + cfi->bp_scratch = false; + break; + } + if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { /* drap: lea disp(%rsp), %drap */ From 9ae7856480c51139e4cbdf524cc57e064da7bc64 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 14 Oct 2020 08:38:02 +0100 Subject: [PATCH 192/737] objtool: Make SP memory operation match PUSH/POP semantics Architectures without PUSH/POP instructions will always access the stack though memory operations (SRC/DEST_INDIRECT). Make those operations have the same effect on the CFA as PUSH/POP, with no stack pointer modification. Signed-off-by: Julien Thierry Acked-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf (cherry picked from commit 201ef5a974e24112953b74cc9f33dcfc4cbcc1cb) --- tools/objtool/check.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8f6c414bf6709..f94a6a5756cab 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2444,6 +2444,14 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; case OP_SRC_REG_INDIRECT: + if (!cfi->drap && op->dest.reg == cfa->base && + op->dest.reg == CFI_BP) { + + /* mov disp(%rsp), %rbp */ + cfa->base = CFI_SP; + cfa->offset = cfi->stack_size; + } + if (cfi->drap && op->src.reg == CFI_BP && op->src.offset == cfi->drap_offset) { @@ -2465,6 +2473,12 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov disp(%rbp), %reg */ /* mov disp(%rsp), %reg */ restore_reg(cfi, op->dest.reg); + + } else if (op->src.reg == CFI_SP && + op->src.offset == regs[op->dest.reg].offset + cfi->stack_size) { + + /* mov disp(%rsp), %reg */ + restore_reg(cfi, op->dest.reg); } break; @@ -2542,6 +2556,12 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov reg, disp(%rsp) */ save_reg(cfi, op->src.reg, CFI_CFA, op->dest.offset - cfi->cfa.offset); + + } else if (op->dest.reg == CFI_SP) { + + /* mov reg, disp(%rsp) */ + save_reg(cfi, op->src.reg, CFI_CFA, + op->dest.offset - cfi->stack_size); } break; From 58684a75081b08a0654e545a5ad54113431ba7a2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 13 Nov 2020 00:03:26 +0100 Subject: [PATCH 193/737] objtool: Fix reloc generation on big endian cross-compiles Relocations generated in elf_rebuild_rel[a]_reloc_section() are broken if objtool is built and run on a big endian system. The following errors pop up during x86 cross-compilation: x86_64-9.1.0-ld: fs/efivarfs/inode.o: bad reloc symbol index (0x2000000 >= 0x22) for offset 0 in section `.orc_unwind_ip' x86_64-9.1.0-ld: final link failed: bad value Convert those functions to use gelf_update_rel[a](), similar to what elf_write_reloc() does. 
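Condensed, the rebuilt RELA path becomes roughly the following (allocation and error handling elided; see the full hunk below):

	/* Describe the buffer to libelf and let gelf_update_rela() store
	 * each entry in the target's byte order, instead of memcpy()ing
	 * host-endian GElf_Rela structs into the section data. */
	sec->data->d_buf  = buf;
	sec->data->d_size = nr * sizeof(GElf_Rela);
	sec->data->d_type = ELF_T_RELA;

	idx = 0;
	list_for_each_entry(reloc, &sec->reloc_list, list) {
		reloc->rela.r_offset = reloc->offset;
		reloc->rela.r_addend = reloc->addend;
		reloc->rela.r_info   = GELF_R_INFO(reloc->sym->idx, reloc->type);
		gelf_update_rela(sec->data, idx, &reloc->rela);
		idx++;
	}
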
Signed-off-by: Martin Schwidefsky Co-developed-by: Vasily Gorbik Signed-off-by: Vasily Gorbik Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Signed-off-by: Josh Poimboeuf (cherry picked from commit a1a664ece586457e9f7652b0bc5b08386259e358) --- tools/objtool/elf.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 41f26913f5455..8a876b417d5b6 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -1082,25 +1082,27 @@ static int elf_rebuild_rel_reloc_section(struct section *sec, int nr) { struct reloc *reloc; int idx = 0, size; - GElf_Rel *relocs; + void *buf; /* Allocate a buffer for relocations */ - size = nr * sizeof(*relocs); - relocs = malloc(size); - if (!relocs) { + size = nr * sizeof(GElf_Rel); + buf = malloc(size); + if (!buf) { perror("malloc"); return -1; } - sec->data->d_buf = relocs; + sec->data->d_buf = buf; sec->data->d_size = size; + sec->data->d_type = ELF_T_REL; sec->sh.sh_size = size; idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { - relocs[idx].r_offset = reloc->offset; - relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rel.r_offset = reloc->offset; + reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + gelf_update_rel(sec->data, idx, &reloc->rel); idx++; } @@ -1111,26 +1113,28 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) { struct reloc *reloc; int idx = 0, size; - GElf_Rela *relocs; + void *buf; /* Allocate a buffer for relocations with addends */ - size = nr * sizeof(*relocs); - relocs = malloc(size); - if (!relocs) { + size = nr * sizeof(GElf_Rela); + buf = malloc(size); + if (!buf) { perror("malloc"); return -1; } - sec->data->d_buf = relocs; + sec->data->d_buf = buf; sec->data->d_size = size; + sec->data->d_type = ELF_T_RELA; sec->sh.sh_size = size; idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { - relocs[idx].r_offset = reloc->offset; - relocs[idx].r_addend = reloc->addend; - relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rela.r_offset = reloc->offset; + reloc->rela.r_addend = reloc->addend; + reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + gelf_update_rela(sec->data, idx, &reloc->rela); idx++; } From 15d299a751cb5cce23abc8ba40047258c70ac02c Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 13 Nov 2020 00:03:29 +0100 Subject: [PATCH 194/737] objtool: Fix x86 orc generation on big endian cross-compiles Correct objtool orc generation endianness problems to enable fully functional x86 cross-compiles on big endian hardware. Introduce bswap_if_needed() macro, which does a byte swap if target endianness doesn't match the host, i.e. cross-compilation for little endian on big endian and vice versa. The macro is used for conversion of multi-byte values which are read from / about to be written to a target native endianness ELF file. 
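A short usage sketch (both call sites appear in the hunks below; only the 16-bit sp_offset/bp_offset fields go through the macro, while the sub-byte bitfields are handled by the __BIG_ENDIAN_BITFIELD layout added to orc_types.h):

	/* Writing an ORC entry into the target ELF: */
	orc->sp_offset = bswap_if_needed(orc->sp_offset);
	orc->bp_offset = bswap_if_needed(orc->bp_offset);

	/* Reading one back in orc_dump: */
	print_reg(orc[i].sp_reg, bswap_if_needed(orc[i].sp_offset));
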
Signed-off-by: Vasily Gorbik Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Signed-off-by: Josh Poimboeuf (cherry picked from commit 8bfe273238d77d3cee18e4c03b2f26ae360b5661) --- arch/x86/include/asm/orc_types.h | 10 +++++ tools/arch/x86/include/asm/orc_types.h | 10 +++++ .../arch/x86/include/arch_endianness.h | 9 +++++ tools/objtool/check.c | 3 +- tools/objtool/endianness.h | 38 +++++++++++++++++++ tools/objtool/orc_dump.c | 5 ++- tools/objtool/orc_gen.c | 3 ++ tools/objtool/special.c | 6 ++- 8 files changed, 79 insertions(+), 5 deletions(-) create mode 100644 tools/objtool/arch/x86/include/arch_endianness.h create mode 100644 tools/objtool/endianness.h diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index fdbffec4cfdea..5a2baf28a1dcd 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -40,6 +40,8 @@ #define ORC_REG_MAX 15 #ifndef __ASSEMBLY__ +#include + /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. It contains only the necessary parts of DWARF @@ -51,10 +53,18 @@ struct orc_entry { s16 sp_offset; s16 bp_offset; +#if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; unsigned end:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned bp_reg:4; + unsigned sp_reg:4; + unsigned unused:5; + unsigned end:1; + unsigned type:2; +#endif } __packed; #endif /* __ASSEMBLY__ */ diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index fdbffec4cfdea..5a2baf28a1dcd 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -40,6 +40,8 @@ #define ORC_REG_MAX 15 #ifndef __ASSEMBLY__ +#include + /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. 
It contains only the necessary parts of DWARF @@ -51,10 +53,18 @@ struct orc_entry { s16 sp_offset; s16 bp_offset; +#if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; unsigned end:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned bp_reg:4; + unsigned sp_reg:4; + unsigned unused:5; + unsigned end:1; + unsigned type:2; +#endif } __packed; #endif /* __ASSEMBLY__ */ diff --git a/tools/objtool/arch/x86/include/arch_endianness.h b/tools/objtool/arch/x86/include/arch_endianness.h new file mode 100644 index 0000000000000..7c362527da205 --- /dev/null +++ b/tools/objtool/arch/x86/include/arch_endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f94a6a5756cab..1b0da45e4ce2f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -15,6 +15,7 @@ #include "special.h" #include "warn.h" #include "arch_elf.h" +#include "endianness.h" #include #include @@ -1806,7 +1807,7 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - cfi.cfa.offset = hint->sp_offset; + cfi.cfa.offset = bswap_if_needed(hint->sp_offset); cfi.type = hint->type; cfi.end = hint->end; diff --git a/tools/objtool/endianness.h b/tools/objtool/endianness.h new file mode 100644 index 0000000000000..ebece3191b581 --- /dev/null +++ b/tools/objtool/endianness.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ENDIANNESS_H +#define _OBJTOOL_ENDIANNESS_H + +#include +#include +#include "arch_endianness.h" + +#ifndef __TARGET_BYTE_ORDER +#error undefined arch __TARGET_BYTE_ORDER +#endif + +#if __BYTE_ORDER != __TARGET_BYTE_ORDER +#define __NEED_BSWAP 1 +#else +#define __NEED_BSWAP 0 +#endif + +/* + * Does a byte swap if target endianness doesn't match the host, i.e. cross + * compilation for little endian on big endian and vice versa. + * To be used for multi-byte values conversion, which are read from / about + * to be written to a target native endianness ELF file. + */ +#define bswap_if_needed(val) \ +({ \ + __typeof__(val) __ret; \ + switch (sizeof(val)) { \ + case 8: __ret = __NEED_BSWAP ? bswap_64(val) : (val); break; \ + case 4: __ret = __NEED_BSWAP ? bswap_32(val) : (val); break; \ + case 2: __ret = __NEED_BSWAP ? 
bswap_16(val) : (val); break; \ + default: \ + BUILD_BUG(); break; \ + } \ + __ret; \ +}) + +#endif /* _OBJTOOL_ENDIANNESS_H */ diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 5e6a95368d351..4e818a22e44b9 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -8,6 +8,7 @@ #include #include "objtool.h" #include "warn.h" +#include "endianness.h" static const char *reg_name(unsigned int reg) { @@ -197,11 +198,11 @@ int orc_dump(const char *_objname) printf(" sp:"); - print_reg(orc[i].sp_reg, orc[i].sp_offset); + print_reg(orc[i].sp_reg, bswap_if_needed(orc[i].sp_offset)); printf(" bp:"); - print_reg(orc[i].bp_reg, orc[i].bp_offset); + print_reg(orc[i].bp_reg, bswap_if_needed(orc[i].bp_offset)); printf(" type:%s end:%d\n", orc_type_name(orc[i].type), orc[i].end); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 812b33ed9f652..a1e7b5f8d4340 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -11,6 +11,7 @@ #include "check.h" #include "warn.h" +#include "endianness.h" static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) @@ -96,6 +97,8 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, /* populate ORC data */ orc = (struct orc_entry *)orc_sec->data->d_buf + idx; memcpy(orc, o, sizeof(*orc)); + orc->sp_offset = bswap_if_needed(orc->sp_offset); + orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, diff --git a/tools/objtool/special.c b/tools/objtool/special.c index aff0cee7bac17..82ae9492682a7 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -15,6 +15,7 @@ #include "special.h" #include "warn.h" #include "arch_special.h" +#include "endianness.h" struct special_entry { const char *sec; @@ -84,8 +85,9 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, if (entry->feature) { unsigned short feature; - feature = *(unsigned short *)(sec->data->d_buf + offset + - entry->feature); + feature = bswap_if_needed(*(unsigned short *)(sec->data->d_buf + + offset + + entry->feature)); arch_handle_alternative(feature, alt); } From 920d99207588624548401b5e83b1c5d54b583088 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 13 Nov 2020 00:03:32 +0100 Subject: [PATCH 195/737] objtool: Rework header include paths Currently objtool headers are being included either by their base name or included via ../ from a parent directory. In case of a base name usage: #include "warn.h" #include "arch_elf.h" it does not make it apparent from which directory the file comes from. To make it slightly better, and actually to avoid name clashes some arch specific files have "arch_" suffix. And files from an arch folder have to revert to including via ../ e.g: #include "../../elf.h" With additional architectures support and the code base growth there is a need for clearer headers naming scheme for multiple reasons: 1. to make it instantly obvious where these files come from (objtool itself / objtool arch|generic folders / some other external files), 2. to avoid name clashes of objtool arch specific headers, potential obtool arch generic headers and the system header files (there is /usr/include/elf.h already), 3. to avoid ../ includes and improve code readability. 4. to give a warm fuzzy feeling to developers who are mostly kernel developers and are accustomed to linux kernel headers arranging scheme. 
Doesn't this make it instantly obvious where these files come from? #include <objtool/warn.h> #include <arch/elf.h> And doesn't it look nicer to avoid ugly ../ includes? Which also guarantees this is elf.h from the objtool and not /usr/include/elf.h. #include <objtool/elf.h> This patch defines and implements a new objtool headers arranging scheme. Which is: - all generic headers go to include/objtool (similar to include/linux) - all arch headers go to arch/$(SRCARCH)/include/arch (to get the arch prefix). This is similar to linux arch specific "asm/*" headers, but we are not abusing the "asm" name and are calling it what it is. This also helps to prevent name clashes (arch is not used in system headers or kernel exports). To bring objtool to this state the following things are done: 1. current top level tools/objtool/ headers are moved into the include/objtool/ subdirectory, 2. arch specific headers, currently only arch/x86/include/, are moved into arch/x86/include/arch/ and stripped of the "arch_" suffix, 3. a new -I$(srctree)/tools/objtool/include include path is added to make includes like <objtool/warn.h> possible, 4. file includes are rewritten, 5. git is made to not ignore the include/objtool/ subdirectory. Signed-off-by: Vasily Gorbik Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Signed-off-by: Josh Poimboeuf (cherry picked from commit 7786032e52cb02982a7154993b5d88c9c7a31ba5) [ v5.10.133: - Fix header include path for "arch_elf.h" in tools/objtool/arch/x86/decode.c ] Signed-off-by: Suraj Jitindar Singh --- tools/objtool/.gitignore | 2 +- tools/objtool/Makefile | 1 + tools/objtool/arch/x86/decode.c | 10 +++++----- .../arch/x86/include/{ => arch}/cfi_regs.h | 0 .../arch/x86/include/{arch_elf.h => arch/elf.h} | 0 .../{arch_endianness.h => arch/endianness.h} | 0 .../include/{arch_special.h => arch/special.h} | 0 tools/objtool/arch/x86/special.c | 4 ++-- tools/objtool/builtin-check.c | 4 ++-- tools/objtool/builtin-orc.c | 4 ++-- tools/objtool/check.c | 16 ++++++++-------- tools/objtool/elf.c | 6 +++--- tools/objtool/{ => include/objtool}/arch.h | 4 ++-- tools/objtool/{ => include/objtool}/builtin.h | 0 tools/objtool/{ => include/objtool}/cfi.h | 2 +- tools/objtool/{ => include/objtool}/check.h | 4 ++-- tools/objtool/{ => include/objtool}/elf.h | 0 tools/objtool/{ => include/objtool}/endianness.h | 2 +- tools/objtool/{ => include/objtool}/objtool.h | 2 +- tools/objtool/{ => include/objtool}/special.h | 4 ++-- tools/objtool/{ => include/objtool}/warn.h | 2 +- tools/objtool/objtool.c | 6 +++--- tools/objtool/orc_dump.c | 6 +++--- tools/objtool/orc_gen.c | 6 +++--- tools/objtool/special.c | 10 +++++----- tools/objtool/weak.c | 2 +- 26 files changed, 49 insertions(+), 48 deletions(-) rename tools/objtool/arch/x86/include/{ => arch}/cfi_regs.h (100%) rename tools/objtool/arch/x86/include/{arch_elf.h => arch/elf.h} (100%) rename tools/objtool/arch/x86/include/{arch_endianness.h => arch/endianness.h} (100%) rename tools/objtool/arch/x86/include/{arch_special.h => arch/special.h} (100%) rename tools/objtool/{ => include/objtool}/arch.h (97%) rename tools/objtool/{ => include/objtool}/builtin.h (100%) rename tools/objtool/{ => include/objtool}/cfi.h (96%) rename tools/objtool/{ => include/objtool}/check.h (97%) rename tools/objtool/{ => include/objtool}/elf.h (100%) rename tools/objtool/{ => include/objtool}/endianness.h (97%) rename tools/objtool/{ => include/objtool}/objtool.h (96%) rename tools/objtool/{ => include/objtool}/special.h (94%) rename tools/objtool/{ => include/objtool}/warn.h (98%) diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index
45cefda24c7b1..14236db3677f6 100644 --- a/tools/objtool/.gitignore +++ b/tools/objtool/.gitignore @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only arch/x86/lib/inat-tables.c -objtool +/objtool fixdep diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index a43096f713c7b..92ce4fce7bc73 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -27,6 +27,7 @@ all: $(OBJTOOL) INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ -I$(srctree)/tools/arch/$(SRCARCH)/include \ + -I$(srctree)/tools/objtool/include \ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 5b915ebb61163..a5ef77bd6e5f2 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -11,12 +11,12 @@ #include "../../../arch/x86/lib/inat.c" #include "../../../arch/x86/lib/insn.c" -#include "../../check.h" -#include "../../elf.h" -#include "../../arch.h" -#include "../../warn.h" #include -#include "arch_elf.h" +#include "arch/elf.h" +#include +#include +#include +#include static unsigned char op_to_cfi_reg[][2] = { {CFI_AX, CFI_R8}, diff --git a/tools/objtool/arch/x86/include/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h similarity index 100% rename from tools/objtool/arch/x86/include/cfi_regs.h rename to tools/objtool/arch/x86/include/arch/cfi_regs.h diff --git a/tools/objtool/arch/x86/include/arch_elf.h b/tools/objtool/arch/x86/include/arch/elf.h similarity index 100% rename from tools/objtool/arch/x86/include/arch_elf.h rename to tools/objtool/arch/x86/include/arch/elf.h diff --git a/tools/objtool/arch/x86/include/arch_endianness.h b/tools/objtool/arch/x86/include/arch/endianness.h similarity index 100% rename from tools/objtool/arch/x86/include/arch_endianness.h rename to tools/objtool/arch/x86/include/arch/endianness.h diff --git a/tools/objtool/arch/x86/include/arch_special.h b/tools/objtool/arch/x86/include/arch/special.h similarity index 100% rename from tools/objtool/arch/x86/include/arch_special.h rename to tools/objtool/arch/x86/include/arch/special.h diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 151b13d0a2676..e707d9bcd1616 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include "../../special.h" -#include "../../builtin.h" +#include +#include #define X86_FEATURE_POPCNT (4 * 32 + 23) #define X86_FEATURE_SMAP (9 * 32 + 20) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 447a49c03abb3..21033efc91e6d 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -15,8 +15,8 @@ #include #include -#include "builtin.h" -#include "objtool.h" +#include +#include bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, sls, unret, rethunk; diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 508bdf6ae8dc6..8273bbf7cebb1 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -13,8 +13,8 @@ */ #include -#include "builtin.h" -#include "objtool.h" +#include +#include static const char *orc_usage[] = { "objtool orc generate [] file.o", diff --git a/tools/objtool/check.c 
b/tools/objtool/check.c index 1b0da45e4ce2f..b5c1a2f70a188 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -8,14 +8,14 @@ #include #include -#include "builtin.h" -#include "cfi.h" -#include "arch.h" -#include "check.h" -#include "special.h" -#include "warn.h" -#include "arch_elf.h" -#include "endianness.h" +#include +#include +#include +#include +#include +#include +#include +#include #include #include diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 8a876b417d5b6..1a77492021a62 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -15,10 +15,10 @@ #include #include #include -#include "builtin.h" +#include -#include "elf.h" -#include "warn.h" +#include +#include #define MAX_NAME_LEN 128 diff --git a/tools/objtool/arch.h b/tools/objtool/include/objtool/arch.h similarity index 97% rename from tools/objtool/arch.h rename to tools/objtool/include/objtool/arch.h index 580ce18575857..9be79f05c779c 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -8,8 +8,8 @@ #include #include -#include "objtool.h" -#include "cfi.h" +#include +#include enum insn_type { INSN_JUMP_CONDITIONAL, diff --git a/tools/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h similarity index 100% rename from tools/objtool/builtin.h rename to tools/objtool/include/objtool/builtin.h diff --git a/tools/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h similarity index 96% rename from tools/objtool/cfi.h rename to tools/objtool/include/objtool/cfi.h index f579802d7ec24..f11d1ac1dadf1 100644 --- a/tools/objtool/cfi.h +++ b/tools/objtool/include/objtool/cfi.h @@ -6,7 +6,7 @@ #ifndef _OBJTOOL_CFI_H #define _OBJTOOL_CFI_H -#include "cfi_regs.h" +#include #include #define CFI_UNDEFINED -1 diff --git a/tools/objtool/check.h b/tools/objtool/include/objtool/check.h similarity index 97% rename from tools/objtool/check.h rename to tools/objtool/include/objtool/check.h index 7f34a7f9ca523..4ba041db304f9 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -7,8 +7,8 @@ #define _CHECK_H #include -#include "cfi.h" -#include "arch.h" +#include +#include struct insn_state { struct cfi_state cfi; diff --git a/tools/objtool/elf.h b/tools/objtool/include/objtool/elf.h similarity index 100% rename from tools/objtool/elf.h rename to tools/objtool/include/objtool/elf.h diff --git a/tools/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h similarity index 97% rename from tools/objtool/endianness.h rename to tools/objtool/include/objtool/endianness.h index ebece3191b581..10241341eff35 100644 --- a/tools/objtool/endianness.h +++ b/tools/objtool/include/objtool/endianness.h @@ -2,9 +2,9 @@ #ifndef _OBJTOOL_ENDIANNESS_H #define _OBJTOOL_ENDIANNESS_H +#include #include #include -#include "arch_endianness.h" #ifndef __TARGET_BYTE_ORDER #error undefined arch __TARGET_BYTE_ORDER diff --git a/tools/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h similarity index 96% rename from tools/objtool/objtool.h rename to tools/objtool/include/objtool/objtool.h index bf64946e749bc..372225fa1259d 100644 --- a/tools/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -10,7 +10,7 @@ #include #include -#include "elf.h" +#include #define __weak __attribute__((weak)) diff --git a/tools/objtool/special.h b/tools/objtool/include/objtool/special.h similarity index 94% rename from tools/objtool/special.h rename to tools/objtool/include/objtool/special.h index abddf38ef3346..8a09f4e9d480e 100644 --- a/tools/objtool/special.h +++ 
b/tools/objtool/include/objtool/special.h @@ -7,8 +7,8 @@ #define _SPECIAL_H #include -#include "check.h" -#include "elf.h" +#include +#include #define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" diff --git a/tools/objtool/warn.h b/tools/objtool/include/objtool/warn.h similarity index 98% rename from tools/objtool/warn.h rename to tools/objtool/include/objtool/warn.h index 7799f60de80af..d99c4675e4a5f 100644 --- a/tools/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -11,7 +11,7 @@ #include #include #include -#include "elf.h" +#include extern const char *objname; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index cb2c6acd9667f..f46077e484f84 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -21,9 +21,9 @@ #include #include -#include "builtin.h" -#include "objtool.h" -#include "warn.h" +#include +#include +#include struct cmd_struct { const char *name; diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 4e818a22e44b9..c53fae9dbe93b 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -6,9 +6,9 @@ #include #include #include -#include "objtool.h" -#include "warn.h" -#include "endianness.h" +#include +#include +#include static const char *reg_name(unsigned int reg) { diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index a1e7b5f8d4340..ddacb42157485 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -9,9 +9,9 @@ #include #include -#include "check.h" -#include "warn.h" -#include "endianness.h" +#include +#include +#include static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) diff --git a/tools/objtool/special.c b/tools/objtool/special.c index 82ae9492682a7..603ce9f2c4b10 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -11,11 +11,11 @@ #include #include -#include "builtin.h" -#include "special.h" -#include "warn.h" -#include "arch_special.h" -#include "endianness.h" +#include +#include +#include +#include +#include struct special_entry { const char *sec; diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c index 553ec9ce51ba8..8314e824db4ae 100644 --- a/tools/objtool/weak.c +++ b/tools/objtool/weak.c @@ -7,7 +7,7 @@ #include #include -#include "objtool.h" +#include #define UNSUPPORTED(name) \ ({ \ From 8bbe0c9ed185436fd8f3f0bc51b8027a86668804 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2021 15:29:23 -0600 Subject: [PATCH 196/737] objtool: Add asm version of STACK_FRAME_NON_STANDARD To be used for adding asm functions to the ignore list. The "aw" is needed to help the ELF section metadata match GCC-created sections. Otherwise the linker creates duplicate sections instead of combining them. Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/8faa476f9a5ac89af27944ec184c89f95f3c6c49.1611263462.git.jpoimboe@redhat.com (cherry picked from commit 081df94301e317e84c3413686043987da2c3e39d) --- include/linux/objtool.h | 8 ++++++++ tools/include/linux/objtool.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 662f19374bd98..a2042c4186864 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -117,6 +117,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD func:req + .pushsection .discard.func_stack_frame_non_standard, "aw" + .long \func - . 
+ .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_STACK_VALIDATION */ @@ -130,6 +136,8 @@ struct unwind_hint { #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm +.macro STACK_FRAME_NON_STANDARD func:req +.endm #endif #endif /* CONFIG_STACK_VALIDATION */ diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 662f19374bd98..a2042c4186864 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -117,6 +117,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD func:req + .pushsection .discard.func_stack_frame_non_standard, "aw" + .long \func - . + .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_STACK_VALIDATION */ @@ -130,6 +136,8 @@ struct unwind_hint { #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm +.macro STACK_FRAME_NON_STANDARD func:req +.endm #endif #endif /* CONFIG_STACK_VALIDATION */ From 46f8a9547d6bcd717980071e48badfd369040492 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2021 15:29:31 -0600 Subject: [PATCH 197/737] x86/ftrace: Support objtool vmlinux.o validation in ftrace_64.S With objtool vmlinux.o validation of return_to_handler(), now that objtool has visibility inside the retpoline, jumping from EMPTY state to a proper function state results in a stack state mismatch. return_to_handler() is actually quite normal despite the underlying magic. Just annotate it as a normal function. Acked-by: Steven Rostedt (VMware) Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/14f48e623f61dbdcd84cf27a56ed8ccae73199ef.1611263462.git.jpoimboe@redhat.com (cherry picked from commit 7cae4b1cf1cc42f490422e20662169e8656c915a) --- arch/x86/kernel/ftrace_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 5b2dabedcf664..b3b7ecbed0a32 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -322,8 +322,7 @@ SYM_FUNC_START(ftrace_graph_caller) RET SYM_FUNC_END(ftrace_graph_caller) -SYM_CODE_START(return_to_handler) - UNWIND_HINT_EMPTY +SYM_FUNC_START(return_to_handler) subq $16, %rsp /* Save the return values */ @@ -350,5 +349,5 @@ SYM_CODE_START(return_to_handler) mov %rdi, (%rsp) UNWIND_HINT_FUNC RET -SYM_CODE_END(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif From e7b10eda0084c7f2337e400ccf51bc560662c2d4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2021 15:29:33 -0600 Subject: [PATCH 198/737] x86/acpi: Support objtool validation in wakeup_64.S The OBJECT_FILES_NON_STANDARD annotation is used to tell objtool to ignore a file. File-level ignores won't work when validating vmlinux.o. Instead, tell objtool to ignore do_suspend_lowlevel() directly with the STACK_FRAME_NON_STANDARD annotation. Cc: "Rafael J. 
Wysocki" Cc: Len Brown Cc: Pavel Machek Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/269eda576c53bc9ecc8167c211989111013a67aa.1611263462.git.jpoimboe@redhat.com (cherry picked from commit aeb818fcc94071e44203b3a36392562e5b88d9ec) --- arch/x86/kernel/acpi/Makefile | 1 - arch/x86/kernel/acpi/wakeup_64.S | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index f1bb57b0e41ea..cf340d85946a8 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dcd..b57333f567bcf 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ .text #include +#include #include #include #include @@ -126,6 +127,7 @@ SYM_FUNC_START(do_suspend_lowlevel) FRAME_END jmp restore_processor_state SYM_FUNC_END(do_suspend_lowlevel) +STACK_FRAME_NON_STANDARD do_suspend_lowlevel .data saved_rbp: .quad 0 From 25d889272a3076ac70e8803fc53a2a1fabda15fa Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2021 15:29:35 -0600 Subject: [PATCH 199/737] x86/power: Move restore_registers() to top of the file Because restore_registers() is page-aligned, the assembler inexplicably adds an unreachable jump from after the end of the previous function to the beginning of restore_registers(). That confuses objtool, understandably. It also creates significant text fragmentation. As a result, most of the object file is wasted text (nops). Move restore_registers() to the beginning of the file to both prevent the text fragmentation and avoid the dead jump instruction. $ size /tmp/hibernate_asm_64.before.o /tmp/hibernate_asm_64.after.o text data bss dec hex filename 4415 0 0 4415 113f /tmp/hibernate_asm_64.before.o 524 0 0 524 20c /tmp/hibernate_asm_64.after.o Cc: "Rafael J. 
Wysocki" Cc: Pavel Machek Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/8c7f634201d26453d73fe55032cbbdc05d004387.1611263462.git.jpoimboe@redhat.com (cherry picked from commit 125f0b7d24216f37a9683b3899fa45101090f098) --- arch/x86/power/hibernate_asm_64.S | 123 +++++++++++------------------- 1 file changed, 46 insertions(+), 77 deletions(-) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 3ae7a3d7d61e5..c388eba2940da 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -22,6 +22,52 @@ #include #include + /* code below belongs to the image kernel */ + .align PAGE_SIZE +SYM_FUNC_START(restore_registers) + /* go back to the original page tables */ + movq %r9, %cr3 + + /* Flush TLB, including "global" things (vmalloc) */ + movq mmu_cr4_features(%rip), %rax + movq %rax, %rdx + andq $~(X86_CR4_PGE), %rdx + movq %rdx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3 + movq %rax, %cr4; # turn PGE back on + + /* We don't restore %rax, it must be 0 anyway */ + movq $saved_context, %rax + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx + mov pt_regs_r8(%rax), %r8 + movq pt_regs_r9(%rax), %r9 + movq pt_regs_r10(%rax), %r10 + movq pt_regs_r11(%rax), %r11 + movq pt_regs_r12(%rax), %r12 + movq pt_regs_r13(%rax), %r13 + movq pt_regs_r14(%rax), %r14 + movq pt_regs_r15(%rax), %r15 + pushq pt_regs_flags(%rax) + popfq + + /* Saved in save_processor_state. */ + lgdt saved_context_gdt_desc(%rax) + + xorl %eax, %eax + + /* tell the hibernation core that we've just restored the memory */ + movq %rax, in_suspend(%rip) + + RET +SYM_FUNC_END(restore_registers) + SYM_FUNC_START(swsusp_arch_suspend) movq $saved_context, %rax movq %rsp, pt_regs_sp(%rax) @@ -68,80 +114,3 @@ SYM_CODE_START(restore_image) movq relocated_restore_code(%rip), %rcx jmpq *%rcx SYM_CODE_END(restore_image) - - /* code below has been relocated to a safe page */ -SYM_CODE_START(core_restore_code) - /* switch to temporary page tables */ - movq %rax, %cr3 - /* flush TLB */ - movq %rbx, %rcx - andq $~(X86_CR4_PGE), %rcx - movq %rcx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3; - movq %rbx, %cr4; # turn PGE back on -.Lloop: - testq %rdx, %rdx - jz .Ldone - - /* get addresses from the pbe and copy the page */ - movq pbe_address(%rdx), %rsi - movq pbe_orig_address(%rdx), %rdi - movq $(PAGE_SIZE >> 3), %rcx - rep - movsq - - /* progress to the next pbe */ - movq pbe_next(%rdx), %rdx - jmp .Lloop - -.Ldone: - /* jump to the restore_registers address from the image header */ - jmpq *%r8 -SYM_CODE_END(core_restore_code) - - /* code below belongs to the image kernel */ - .align PAGE_SIZE -SYM_FUNC_START(restore_registers) - /* go back to the original page tables */ - movq %r9, %cr3 - - /* Flush TLB, including "global" things (vmalloc) */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(X86_CR4_PGE), %rdx - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3 - movq %rax, %cr4; # turn PGE back on - - /* We don't restore %rax, it must be 0 anyway */ - movq $saved_context, %rax - movq pt_regs_sp(%rax), %rsp - movq pt_regs_bp(%rax), %rbp - movq pt_regs_si(%rax), %rsi - movq pt_regs_di(%rax), %rdi - movq pt_regs_bx(%rax), %rbx - movq pt_regs_cx(%rax), %rcx - movq pt_regs_dx(%rax), %rdx - movq pt_regs_r8(%rax), %r8 - movq 
pt_regs_r9(%rax), %r9 - movq pt_regs_r10(%rax), %r10 - movq pt_regs_r11(%rax), %r11 - movq pt_regs_r12(%rax), %r12 - movq pt_regs_r13(%rax), %r13 - movq pt_regs_r14(%rax), %r14 - movq pt_regs_r15(%rax), %r15 - pushq pt_regs_flags(%rax) - popfq - - /* Saved in save_processor_state. */ - lgdt saved_context_gdt_desc(%rax) - - xorl %eax, %eax - - /* tell the hibernation core that we've just restored the memory */ - movq %rax, in_suspend(%rip) - - RET -SYM_FUNC_END(restore_registers) From a6228a52bd6f55f5bd9cd31c1d9bec120e954582 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2021 15:29:36 -0600 Subject: [PATCH 200/737] x86/power: Support objtool validation in hibernate_asm_64.S The OBJECT_FILES_NON_STANDARD annotation is used to tell objtool to ignore a file. File-level ignores won't work when validating vmlinux.o. Instead, convert restore_image() and core_restore_code() to be ELF functions. Their code is conventional enough for objtool to be able to understand them. Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/974f8ceb5385e470f72e93974c70ab5c894bb0dc.1611263462.git.jpoimboe@redhat.com (cherry picked from commit 9077c016a39c78054f03e0354ad8409b47af68dc) --- arch/x86/power/Makefile | 1 - arch/x86/power/hibernate_asm_64.S | 35 +++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 6907b523e856b..3ff80156f21a6 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index c388eba2940da..186d884e29836 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -98,7 +98,7 @@ SYM_FUNC_START(swsusp_arch_suspend) RET SYM_FUNC_END(swsusp_arch_suspend) -SYM_CODE_START(restore_image) +SYM_FUNC_START(restore_image) /* prepare to jump to the image kernel */ movq restore_jump_address(%rip), %r8 movq restore_cr3(%rip), %r9 @@ -113,4 +113,35 @@ SYM_CODE_START(restore_image) /* jump to relocated restore code */ movq relocated_restore_code(%rip), %rcx jmpq *%rcx -SYM_CODE_END(restore_image) +SYM_FUNC_END(restore_image) + + /* code below has been relocated to a safe page */ +SYM_FUNC_START(core_restore_code) + /* switch to temporary page tables */ + movq %rax, %cr3 + /* flush TLB */ + movq %rbx, %rcx + andq $~(X86_CR4_PGE), %rcx + movq %rcx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3; + movq %rbx, %cr4; # turn PGE back on +.Lloop: + testq %rdx, %rdx + jz .Ldone + + /* get addresses from the pbe and copy the page */ + movq pbe_address(%rdx), %rsi + movq pbe_orig_address(%rdx), %rdi + movq $(PAGE_SIZE >> 3), %rcx + rep + movsq + + /* progress to the next pbe */ + movq pbe_next(%rdx), %rdx + jmp .Lloop + +.Ldone: + /* jump to the restore_registers address from the image header */ + jmpq *%r8 +SYM_FUNC_END(core_restore_code) From ebf6aaad087c6c9baf8614aada25268be091e089 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Feb 2021 12:02:18 +0100 Subject: [PATCH 201/737] objtool,x86: Additionally decode: mov %rsp, (%reg) Where we already decode: mov %rsp, %reg, also decode mov %rsp, (%reg). Nothing should match for this new stack-op. 
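[ Editor's note: not from the patch; a tiny standalone example of the two instruction forms the decoder now distinguishes, assuming x86-64 and GCC/Clang inline asm. ]

/*
 * "mov %rsp, %reg" was already decoded as a register-to-register stack
 * op; the hunk below additionally decodes "mov %rsp, disp(%reg)", i.e.
 * storing the stack pointer to memory through another register (the
 * pattern used by the stack swizzle handled in the next patch).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t copy, slot;

	asm volatile("mov %%rsp, %0" : "=r"(copy));                 /* mov %rsp, %reg   */
	asm volatile("mov %%rsp, (%0)" : : "r"(&slot) : "memory");  /* mov %rsp, (%reg) */

	printf("rsp via register: %#lx, via memory: %#lx\n",
	       (unsigned long)copy, (unsigned long)slot);
	return 0;
}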
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Acked-by: Josh Poimboeuf (cherry picked from commit 2a512829840eb97a8b52eca7058e56d484468f2d) --- tools/objtool/arch/x86/decode.c | 42 ++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index a5ef77bd6e5f2..7b6b089978d12 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -223,15 +223,38 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x89: - if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) { + if (rex_w && !rex_r && modrm_reg == 4) { - /* mov %rsp, reg */ - ADD_OP(op) { - op->src.type = OP_SRC_REG; - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + if (modrm_mod == 3) { + /* mov %rsp, reg */ + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + } + break; + + } else { + /* skip nontrivial SIB */ + if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + break; + + /* skip RIP relative displacement */ + if (modrm_rm == 5 && modrm_mod == 0) + break; + + /* mov %rsp, disp(%reg) */ + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.offset = insn.displacement.value; + } + break; } + break; } @@ -260,8 +283,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_BP; op->dest.offset = insn.displacement.value; } + break; + } - } else if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { + if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { /* mov reg, disp(%rsp) */ ADD_OP(op) { @@ -271,6 +296,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_SP; op->dest.offset = insn.displacement.value; } + break; } break; From 72943f2093fa34e529c987e68c405c10571aed95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Feb 2021 12:02:17 +0100 Subject: [PATCH 202/737] objtool: Support stack-swizzle Natively support the stack swizzle pattern: mov %rsp, (%[tos]) mov %[tos], %rsp ... pop %rsp It uses the vals[] array to link the first two stack-ops, and detect the SP to SP_INDIRECT swizzle. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Acked-by: Josh Poimboeuf (cherry picked from commit aafeb14e9da29e323b0605f8f1bae0d45d5f3acf) --- tools/objtool/check.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b5c1a2f70a188..0dcf63ea19b74 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2315,6 +2315,38 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, cfa->offset = -cfi->vals[op->src.reg].offset; cfi->stack_size = cfa->offset; + } else if (cfa->base == CFI_SP && + cfi->vals[op->src.reg].base == CFI_SP_INDIRECT && + cfi->vals[op->src.reg].offset == cfa->offset) { + + /* + * Stack swizzle: + * + * 1: mov %rsp, (%[tos]) + * 2: mov %[tos], %rsp + * ... + * 3: pop %rsp + * + * Where: + * + * 1 - places a pointer to the previous + * stack at the Top-of-Stack of the + * new stack. + * + * 2 - switches to the new stack. + * + * 3 - pops the Top-of-Stack to restore + * the original stack. 
+ * + * Note: we set base to SP_INDIRECT + * here and preserve offset. Therefore + * when the unwinder reaches ToS it + * will dereference SP and then add the + * offset to find the next frame, IOW: + * (%rsp) + offset. + */ + cfa->base = CFI_SP_INDIRECT; + } else { cfa->base = CFI_UNDEFINED; cfa->offset = 0; @@ -2417,6 +2449,13 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, case OP_SRC_POP: case OP_SRC_POPF: + if (op->dest.reg == CFI_SP && cfa->base == CFI_SP_INDIRECT) { + + /* pop %rsp; # restore from a stack swizzle */ + cfa->base = CFI_SP; + break; + } + if (!cfi->drap && op->dest.reg == cfa->base) { /* pop %rbp */ @@ -2563,6 +2602,12 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov reg, disp(%rsp) */ save_reg(cfi, op->src.reg, CFI_CFA, op->dest.offset - cfi->stack_size); + + } else if (op->src.reg == CFI_SP && op->dest.offset == 0) { + + /* mov %rsp, (%reg); # setup a stack swizzle. */ + cfi->vals[op->dest.reg].base = CFI_SP_INDIRECT; + cfi->vals[op->dest.reg].offset = cfa->offset; } break; From 3ed05afb950b38d626ec19f6c09acfd0be895c8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Feb 2021 17:14:10 +0100 Subject: [PATCH 203/737] objtool: Fix stack-swizzle for FRAME_POINTER=y When objtool encounters the stack-swizzle: mov %rsp, (%[tos]) mov %[tos], %rsp ... pop %rsp Inside a FRAME_POINTER=y build, things go a little screwy because clearly we're not adjusting the cfa->base. This then results in the pop %rsp not being detected as a restore of cfa->base so it will turn into a regular POP and offset the stack, resulting in: kernel/softirq.o: warning: objtool: do_softirq()+0xdb: return with modified stack frame Therefore, have "mov %[tos], %rsp" act like a PUSH (it sorta is anyway) to balance the things out. We're not too concerned with the actual stack_size for frame-pointer builds, since we don't generate ORC data for them anyway. Fixes: aafeb14e9da2 ("objtool: Support stack-swizzle") Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/YC6UC+rc9KKmQrkd@hirez.programming.kicks-ass.net (cherry picked from commit 724c8a23d589d8a002d2e39633c2f9a5a429616f) --- tools/objtool/check.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0dcf63ea19b74..e2c21e638f3a9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2353,6 +2353,20 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, } } + else if (op->dest.reg == CFI_SP && + cfi->vals[op->src.reg].base == CFI_SP_INDIRECT && + cfi->vals[op->src.reg].offset == cfa->offset) { + + /* + * The same stack swizzle case 2) as above. But + * because we can't change cfa->base, case 3) + * will become a regular POP. Pretend we're a + * PUSH so things don't go unbalanced. + */ + cfi->stack_size += 8; + } + + break; case OP_SRC_ADD: From cd91e43e6b933508eca80b72fb5c1468547fbd60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2020 15:14:09 -0700 Subject: [PATCH 204/737] objtool: Add a pass for generating __mcount_loc Add the --mcount option for generating __mcount_loc sections needed for dynamic ftrace. Using this pass requires the kernel to be compiled with -mfentry and CC_USING_NOP_MCOUNT to be defined in Makefile. 
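[ Editor's note: the sketch below is not part of the patch. It uses libelf to list the __mcount_loc entries objtool emits; the file name dump_mcount.c is hypothetical and linking with -lelf is assumed. In a relocatable .o the table itself is written as zeroes and the call-site addresses are carried by the accompanying relocations, which is how the new create_mcount_loc_sections() pass emits them. ]

/*
 * Dump the __mcount_loc section of an object file.
 * Build with: gcc dump_mcount.c -lelf
 */
#include <fcntl.h>
#include <gelf.h>
#include <libelf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 2 || elf_version(EV_CURRENT) == EV_NONE)
		return 1;

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	Elf *elf = elf_begin(fd, ELF_C_READ, NULL);
	size_t shstrndx;

	if (!elf || elf_getshdrstrndx(elf, &shstrndx))
		return 1;

	Elf_Scn *scn = NULL;
	while ((scn = elf_nextscn(elf, scn))) {
		GElf_Shdr shdr;
		const char *name;
		Elf_Data *data;

		if (!gelf_getshdr(scn, &shdr))
			continue;
		name = elf_strptr(elf, shstrndx, shdr.sh_name);
		if (!name || strcmp(name, "__mcount_loc"))
			continue;

		data = elf_getdata(scn, NULL);
		if (!data || !data->d_buf)
			continue;

		unsigned long *loc = data->d_buf;
		size_t n = data->d_size / sizeof(*loc);

		printf("%zu __mcount_loc entries\n", n);
		for (size_t i = 0; i < n; i++)
			printf("  [%zu] %#lx\n", i, loc[i]);
	}

	elf_end(elf);
	close(fd);
	return 0;
}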
Link: https://lore.kernel.org/lkml/20200625200235.GQ4781@hirez.programming.kicks-ass.net/ Signed-off-by: Peter Zijlstra [Sami: rebased, dropped config changes, fixed to actually use --mcount, and wrote a commit message.] Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook (cherry picked from commit 99d0021569c71c325f41a7dd0a08a380010ce95c) [ Rebase to 5.10.133: - Un-static elf_create_reloc_section() - Adapt to modified elf_add_reloc() function signature ] Signed-off-by: Suraj Jitindar Singh --- tools/objtool/builtin-check.c | 3 +- tools/objtool/check.c | 85 +++++++++++++++++++++++++ tools/objtool/elf.c | 6 +- tools/objtool/include/objtool/builtin.h | 2 +- tools/objtool/include/objtool/check.h | 1 + tools/objtool/include/objtool/elf.h | 1 + tools/objtool/include/objtool/objtool.h | 1 + tools/objtool/objtool.c | 1 + 8 files changed, 93 insertions(+), 7 deletions(-) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 21033efc91e6d..01e6071fda761 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -19,7 +19,7 @@ #include bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, sls, unret, rethunk; + validate_dup, vmlinux, mcount, sls, unret, rethunk; static const char * const check_usage[] = { "objtool check [] file.o", @@ -38,6 +38,7 @@ const struct option check_options[] = { OPT_BOOLEAN('s', "stats", &stats, "print statistics"), OPT_BOOLEAN('d', "duplicate", &validate_dup, "duplicate validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), + OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), OPT_BOOLEAN('S', "sls", &sls, "validate straight-line-speculation"), OPT_END(), }; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e2c21e638f3a9..18d7090952358 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -705,6 +705,68 @@ static int create_return_sites_sections(struct objtool_file *file) return 0; } +static int create_mcount_loc_sections(struct objtool_file *file) +{ + struct section *sec, *reloc_sec; + unsigned long *loc; + struct instruction *insn; + int idx; + + sec = find_section_by_name(file->elf, "__mcount_loc"); + if (sec) { + INIT_LIST_HEAD(&file->mcount_loc_list); + WARN("file already has __mcount_loc section, skipping"); + return 0; + } + + if (list_empty(&file->mcount_loc_list)) + return 0; + + idx = 0; + list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) + idx++; + + sec = elf_create_section(file->elf, "__mcount_loc", 0, sizeof(unsigned long), idx); + if (!sec) + return -1; + + reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); + if (!reloc_sec) + return -1; + + idx = 0; + list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { + struct symbol *sym; + int addend; + + loc = (unsigned long *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned long)); + + if (insn->sec->sym) { + sym = insn->sec->sym; + addend = insn->offset; + } else { + sym = find_symbol_containing(insn->sec, insn->offset); + + if (!sym) { + WARN("missing symbol for insn at offset 0x%lx\n", + insn->offset); + return -1; + } + + addend = insn->offset - sym->offset; + } + + if (elf_add_reloc(file->elf, sec, idx * sizeof(unsigned long), + R_X86_64_64, sym, addend)) + return -1; + + idx++; + } + + return 0; +} + /* * Warnings shouldn't be reported for ignored functions. 
*/ @@ -1285,6 +1347,22 @@ static int add_call_destinations(struct objtool_file *file) } else add_call_dest(file, insn, reloc->sym, false); + + if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + + insn->type = INSN_NOP; + + list_add_tail(&insn->mcount_loc_node, + &file->mcount_loc_list); + } } return 0; @@ -3652,6 +3730,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (mcount) { + ret = create_mcount_loc_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (rethunk) { ret = create_return_sites_sections(file); if (ret < 0) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 1a77492021a62..9aa65bd9ab12a 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -454,10 +454,6 @@ static int read_symbols(struct elf *elf) return -1; } -static struct section *elf_create_reloc_section(struct elf *elf, - struct section *base, - int reltype); - int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, s64 addend) { @@ -1067,7 +1063,7 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec return sec; } -static struct section *elf_create_reloc_section(struct elf *elf, +struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype) { diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 61d8d49dbc657..c3c19e81bb865 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -9,7 +9,7 @@ extern const struct option check_options[]; extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, sls, unret, rethunk; + validate_dup, vmlinux, mcount, sls, unret, rethunk; extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 4ba041db304f9..f704b3760dfbb 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -40,6 +40,7 @@ struct instruction { struct list_head list; struct hlist_node hash; struct list_head call_node; + struct list_head mcount_loc_node; struct section *sec; unsigned long offset; unsigned int len; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index a1863eb35fbbc..d2c5ed827ed18 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -126,6 +126,7 @@ static inline u32 reloc_hash(struct reloc *reloc) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); +struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, s64 addend); diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index 372225fa1259d..1e1147c0eaffd 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -21,6 +21,7 @@ struct objtool_file { struct list_head retpoline_call_list; struct list_head return_thunk_list; struct list_head 
static_call_list; + struct list_head mcount_loc_list; bool ignore_unreachables, c_file, hints, rodata; }; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index f46077e484f84..ee86437ab814f 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -64,6 +64,7 @@ struct objtool_file *objtool_open_read(const char *_objname) INIT_LIST_HEAD(&file.retpoline_call_list); INIT_LIST_HEAD(&file.return_thunk_list); INIT_LIST_HEAD(&file.static_call_list); + INIT_LIST_HEAD(&file.mcount_loc_list); file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); file.ignore_unreachables = no_unreachable; file.hints = false; From cafc8e65d98b365222592e603c3753ed308a253a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jul 2020 12:04:27 -0700 Subject: [PATCH 205/737] objtool: Don't autodetect vmlinux.o With LTO, we run objtool on vmlinux.o, but don't want noinstr validation. This change requires --vmlinux to be passed to objtool explicitly. Suggested-by: Peter Zijlstra Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook (cherry picked from commit 0e731dbc18241d68318e0a7d2c2c0087c9073fb9) --- scripts/link-vmlinux.sh | 2 +- tools/objtool/builtin-check.c | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index acd07a70a2f4e..01fc9d8b264a2 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -64,7 +64,7 @@ objtool_link() local objtoolopt; if [ -n "${CONFIG_VMLINUX_VALIDATION}" ]; then - objtoolopt="check" + objtoolopt="check --vmlinux" if [ -n "${CONFIG_CPU_UNRET_ENTRY}" ]; then objtoolopt="${objtoolopt} --unret" fi diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 01e6071fda761..26ad6d8819718 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -45,7 +45,7 @@ const struct option check_options[] = { int cmd_check(int argc, const char **argv) { - const char *objname, *s; + const char *objname; struct objtool_file *file; int ret; @@ -56,10 +56,6 @@ int cmd_check(int argc, const char **argv) objname = argv[0]; - s = strstr(objname, "vmlinux.o"); - if (s && !s[9]) - vmlinux = true; - file = objtool_open_read(objname); if (!file) return 1; From b52c9de43889ce29db519b6174390905dcb9c508 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 30 Sep 2020 14:36:59 -0700 Subject: [PATCH 206/737] objtool: Split noinstr validation from --vmlinux This change adds a --noinstr flag to objtool to allow us to specify that we're processing vmlinux.o without also enabling noinstr validation. This is needed to avoid false positives with LTO when we run objtool on vmlinux.o without CONFIG_DEBUG_ENTRY. 
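[ Editor's note: a trivial sketch, not kernel code, of the resulting option semantics: noinstr validation is now opt-in via the new --noinstr flag rather than implied by --vmlinux alone, matching the init_insn_state() hunk below. ]

/* Simplified model of the gating objtool applies after this patch. */
#include <stdbool.h>
#include <stdio.h>

static bool vmlinux, noinstr;	/* as set by objtool's option parser */

static bool noinstr_validation_enabled(void)
{
	return vmlinux && noinstr;
}

int main(void)
{
	vmlinux = true;		/* "objtool check --vmlinux"           */
	printf("--vmlinux only:      %d\n", noinstr_validation_enabled());

	noinstr = true;		/* "objtool check --vmlinux --noinstr" */
	printf("--vmlinux --noinstr: %d\n", noinstr_validation_enabled());
	return 0;
}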
Signed-off-by: Sami Tolvanen (cherry picked from commit 41425ebe20245c99b44d6ba0f017be9bfc28414f) --- scripts/link-vmlinux.sh | 2 +- tools/objtool/builtin-check.c | 3 ++- tools/objtool/check.c | 2 +- tools/objtool/include/objtool/builtin.h | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 01fc9d8b264a2..7f6d67b02984f 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -64,7 +64,7 @@ objtool_link() local objtoolopt; if [ -n "${CONFIG_VMLINUX_VALIDATION}" ]; then - objtoolopt="check --vmlinux" + objtoolopt="check --vmlinux --noinstr" if [ -n "${CONFIG_CPU_UNRET_ENTRY}" ]; then objtoolopt="${objtoolopt} --unret" fi diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 26ad6d8819718..faefd9922a54d 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -19,7 +19,7 @@ #include bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, mcount, sls, unret, rethunk; + validate_dup, vmlinux, mcount, noinstr, sls, unret, rethunk; static const char * const check_usage[] = { "objtool check [] file.o", @@ -37,6 +37,7 @@ const struct option check_options[] = { OPT_BOOLEAN('a', "uaccess", &uaccess, "enable uaccess checking"), OPT_BOOLEAN('s', "stats", &stats, "print statistics"), OPT_BOOLEAN('d', "duplicate", &validate_dup, "duplicate validation for vmlinux.o"), + OPT_BOOLEAN('n', "noinstr", &noinstr, "noinstr validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), OPT_BOOLEAN('S', "sls", &sls, "validate straight-line-speculation"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 18d7090952358..1ef2a091d5fc6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -270,7 +270,7 @@ static void init_insn_state(struct insn_state *state, struct section *sec) * not correctly determine insn->call_dest->sec (external symbols do * not have a section). */ - if (vmlinux && sec) + if (vmlinux && noinstr && sec) state->noinstr = sec->noinstr; } diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index c3c19e81bb865..94e653d4d230c 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -9,7 +9,7 @@ extern const struct option check_options[]; extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, mcount, sls, unret, rethunk; + validate_dup, vmlinux, mcount, noinstr, sls, unret, rethunk; extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); From ce242bc8cdb4f3c3695d5a094eb56c2b7c627421 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Mar 2021 15:46:04 +0100 Subject: [PATCH 207/737] objtool,x86: Fix uaccess PUSHF/POPF validation Commit ab234a260b1f ("x86/pv: Rework arch_local_irq_restore() to not use popf") replaced "push %reg; popf" with something like: "test $0x200, %reg; jz 1f; sti; 1:", which breaks the pushf/popf symmetry that commit ea24213d8088 ("objtool: Add UACCESS validation") relies on. 
The result is: drivers/gpu/drm/amd/amdgpu/si.o: warning: objtool: si_common_hw_init()+0xf36: PUSHF stack exhausted Meanwhile, commit c9c324dc22aa ("objtool: Support stack layout changes in alternatives") makes that we can actually use stack-ops in alternatives, which means we can revert 1ff865e343c2 ("x86,smap: Fix smap_{save,restore}() alternatives"). That in turn means we can limit the PUSHF/POPF handling of ea24213d8088 to those instructions that are in alternatives. Fixes: ab234a260b1f ("x86/pv: Rework arch_local_irq_restore() to not use popf") Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/YEY4rIbQYa5fnnEp@hirez.programming.kicks-ass.net (cherry picked from commit ba08abca66d46381df60842f64f70099d5482b92) --- arch/x86/include/asm/smap.h | 10 ++++------ tools/objtool/check.c | 3 +++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ea1d8eb644cb7..d17b39893b797 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -55,9 +55,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -66,9 +65,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 1ef2a091d5fc6..2a6be378aa9c1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2794,6 +2794,9 @@ static int handle_insn_ops(struct instruction *insn, struct insn_state *state) if (update_cfi_state(insn, &state->cfi, op)) return 1; + if (!insn->alt_group) + continue; + if (op->dest.type == OP_DEST_PUSHF) { if (!state->uaccess_stack) { state->uaccess_stack = 1; From e9bd8f21e7dd8174b620cd6f3f7022e8438a5ab8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 26 Oct 2020 13:31:47 +0000 Subject: [PATCH 208/737] arm64: uaccess: move uao_* alternatives to asm-uaccess.h The uao_* alternative asm macros are only used by the uaccess assembly routines in arch/arm64/lib/, where they are included indirectly via asm-uaccess.h. Since they're specific to the uaccess assembly (and will lose the alternatives in subsequent patches), let's move them into asm-uaccess.h. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon [will: update #include in mte.S to pull in uao asm macros] Signed-off-by: Will Deacon (cherry picked from commit e2a2190a80ca0ebddd52c766caf08908d71fb949) --- arch/arm64/include/asm/alternative.h | 59 ---------------------------- arch/arm64/include/asm/asm-uaccess.h | 59 ++++++++++++++++++++++++++++ arch/arm64/lib/mte.S | 2 +- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 3cb3c4ab3ea56..451ce45e6c0ed 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -224,65 +224,6 @@ alternative_endif _asm_extable 9999b, \label .endm -/* - * Generate the assembly for UAO alternatives with exception table entries. - * This is complicated as there is no post-increment or pair versions of the - * unprivileged instructions, and USER() only works for single instructions. - */ -#ifdef CONFIG_ARM64_UAO - .macro uao_ldp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: ldp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - ldtr \reg1, [\addr]; - ldtr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_stp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: stp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - sttr \reg1, [\addr]; - sttr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: \inst \reg, [\addr], \post_inc; - nop; - alternative_else - \alt_inst \reg, [\addr]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - .endm -#else - .macro uao_ldp l, reg1, reg2, addr, post_inc - USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_stp l, reg1, reg2, addr, post_inc - USER(\l, stp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - USER(\l, \inst \reg, [\addr], \post_inc) - .endm -#endif - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 5ef624fef44a2..d2c8e312b0191 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -58,4 +58,63 @@ alternative_else_nop_endif .endm #endif +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there is no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions. 
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + _asm_extable 8888b,\l; + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 03ca6d8b86706..cceed41bba153 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include From a9685b1c238c774c62d03e595df51775af07c41b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 13:55:59 +0100 Subject: [PATCH 209/737] arm64: alternatives: Split up alternative.h asm/alternative.h contains both the macros needed to use alternatives, as well the type definitions and function prototypes for applying them. Split the header in two, so that alternatives can be used from core header files such as linux/compiler.h without the risk of circular includes Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon (cherry picked from commit 7cda23da52ad793a578d290e7fcc9cdc1698bba8) --- arch/arm64/include/asm/alternative-macros.h | 217 ++++++++++++++++++++ arch/arm64/include/asm/alternative.h | 208 +------------------ arch/arm64/include/asm/asm-uaccess.h | 2 +- arch/arm64/include/asm/insn.h | 3 +- 4 files changed, 221 insertions(+), 209 deletions(-) create mode 100644 arch/arm64/include/asm/alternative-macros.h diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h new file mode 100644 index 0000000000000..5df500dcc627a --- /dev/null +++ b/arch/arm64/include/asm/alternative-macros.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ALTERNATIVE_MACROS_H +#define __ASM_ALTERNATIVE_MACROS_H + +#include + +#define ARM64_CB_PATCH ARM64_NCAPS + +/* A64 instructions are always 32 bits. 
*/ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ + +#include + +#define ALTINSTR_ENTRY(feature) \ + " .word 661b - .\n" /* label */ \ + " .word 663f - .\n" /* new instruction */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +#define ALTINSTR_ENTRY_CB(feature, cb) \ + " .word 661b - .\n" /* label */ \ + " .word " __stringify(cb) "- .\n" /* callback */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +/* + * alternative assembly primitive: + * + * If any of these .org directive fail, it means that insn1 and insn2 + * don't have the same length. This used to be written as + * + * .if ((664b-663b) != (662b-661b)) + * .error "Alternatives instruction length mismatch" + * .endif + * + * but most assemblers die if insn1 or insn2 have a .inst. This should + * be fixed in a binutils release posterior to 2.25.51.0.2 (anything + * containing commit 4e4d08cf7399b606 or c1baaddf8861). + * + * Alternatives with callbacks do not generate replacement instructions. + */ +#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature) \ + ".popsection\n" \ + ".subsection 1\n" \ + "663:\n\t" \ + newinstr "\n" \ + "664:\n\t" \ + ".org . - (664b-663b) + (662b-661b)\n\t" \ + ".org . - (662b-661b) + (664b-663b)\n\t" \ + ".previous\n" \ + ".endif\n" + +#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY_CB(feature, cb) \ + ".popsection\n" \ + "663:\n\t" \ + "664:\n\t" \ + ".endif\n" + +#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ + __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) + +#define ALTERNATIVE_CB(oldinstr, cb) \ + __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) +#else + +#include + +.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len + .word \orig_offset - . + .word \alt_offset - . + .hword \feature + .byte \orig_len + .byte \alt_len +.endm + +.macro alternative_insn insn1, insn2, cap, enable = 1 + .if \enable +661: \insn1 +662: .pushsection .altinstructions, "a" + altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f + .popsection + .subsection 1 +663: \insn2 +664: .previous + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) + .endif +.endm + +/* + * Alternative sequences + * + * The code for the case where the capability is not present will be + * assembled and linked as normal. There are no restrictions on this + * code. + * + * The code for the case where the capability is present will be + * assembled into a special section to be used for dynamic patching. + * Code for that case must: + * + * 1. Be exactly the same length (in bytes) as the default code + * sequence. + * + * 2. Not contain a branch target that is used outside of the + * alternative sequence it is defined in (branches into an + * alternative sequence are not fixed up). + */ + +/* + * Begin an alternative code sequence. 
+ */ +.macro alternative_if_not cap + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f + .popsection +661: +.endm + +.macro alternative_if cap + .set .Lasm_alt_mode, 1 + .pushsection .altinstructions, "a" + altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f + .popsection + .subsection 1 + .align 2 /* So GAS knows label 661 is suitably aligned */ +661: +.endm + +.macro alternative_cb cb + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 + .popsection +661: +.endm + +/* + * Provide the other half of the alternative code sequence. + */ +.macro alternative_else +662: + .if .Lasm_alt_mode==0 + .subsection 1 + .else + .previous + .endif +663: +.endm + +/* + * Complete an alternative code sequence. + */ +.macro alternative_endif +664: + .if .Lasm_alt_mode==0 + .previous + .endif + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) +.endm + +/* + * Callback-based alternative epilogue + */ +.macro alternative_cb_end +662: +.endm + +/* + * Provides a trivial alternative or default sequence consisting solely + * of NOPs. The number of NOPs is chosen automatically to match the + * previous case. + */ +.macro alternative_else_nop_endif +alternative_else + nops (662b-661b) / AARCH64_INSN_SIZE +alternative_endif +.endm + +#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ + alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) + +.macro user_alt, label, oldinstr, newinstr, cond +9999: alternative_insn "\oldinstr", "\newinstr", \cond + _asm_extable 9999b, \label +.endm + +#endif /* __ASSEMBLY__ */ + +/* + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); + * + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); + * N.B. If CONFIG_FOO is specified, but not selected, the whole block + * will be omitted, including oldinstr. + */ +#define ALTERNATIVE(oldinstr, newinstr, ...) \ + _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) + +#endif /* __ASM_ALTERNATIVE_MACROS_H */ diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 451ce45e6c0ed..a38b92e11811e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -2,17 +2,13 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H -#include -#include - -#define ARM64_CB_PATCH ARM64_NCAPS +#include #ifndef __ASSEMBLY__ #include #include #include -#include struct alt_instr { s32 orig_offset; /* offset to original instruction */ @@ -35,205 +31,5 @@ void apply_alternatives_module(void *start, size_t length); static inline void apply_alternatives_module(void *start, size_t length) { } #endif -#define ALTINSTR_ENTRY(feature) \ - " .word 661b - .\n" /* label */ \ - " .word 663f - .\n" /* new instruction */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -#define ALTINSTR_ENTRY_CB(feature, cb) \ - " .word 661b - .\n" /* label */ \ - " .word " __stringify(cb) "- .\n" /* callback */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -/* - * alternative assembly primitive: - * - * If any of these .org directive fail, it means that insn1 and insn2 - * don't have the same length. 
This used to be written as - * - * .if ((664b-663b) != (662b-661b)) - * .error "Alternatives instruction length mismatch" - * .endif - * - * but most assemblers die if insn1 or insn2 have a .inst. This should - * be fixed in a binutils release posterior to 2.25.51.0.2 (anything - * containing commit 4e4d08cf7399b606 or c1baaddf8861). - * - * Alternatives with callbacks do not generate replacement instructions. - */ -#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feature) \ - ".popsection\n" \ - ".subsection 1\n" \ - "663:\n\t" \ - newinstr "\n" \ - "664:\n\t" \ - ".org . - (664b-663b) + (662b-661b)\n\t" \ - ".org . - (662b-661b) + (664b-663b)\n\t" \ - ".previous\n" \ - ".endif\n" - -#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY_CB(feature, cb) \ - ".popsection\n" \ - "663:\n\t" \ - "664:\n\t" \ - ".endif\n" - -#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ - __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) - -#define ALTERNATIVE_CB(oldinstr, cb) \ - __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) -#else - -#include - -.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len - .word \orig_offset - . - .word \alt_offset - . - .hword \feature - .byte \orig_len - .byte \alt_len -.endm - -.macro alternative_insn insn1, insn2, cap, enable = 1 - .if \enable -661: \insn1 -662: .pushsection .altinstructions, "a" - altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f - .popsection - .subsection 1 -663: \insn2 -664: .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) - .previous - .endif -.endm - -/* - * Alternative sequences - * - * The code for the case where the capability is not present will be - * assembled and linked as normal. There are no restrictions on this - * code. - * - * The code for the case where the capability is present will be - * assembled into a special section to be used for dynamic patching. - * Code for that case must: - * - * 1. Be exactly the same length (in bytes) as the default code - * sequence. - * - * 2. Not contain a branch target that is used outside of the - * alternative sequence it is defined in (branches into an - * alternative sequence are not fixed up). - */ - -/* - * Begin an alternative code sequence. - */ -.macro alternative_if_not cap - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f - .popsection -661: -.endm - -.macro alternative_if cap - .set .Lasm_alt_mode, 1 - .pushsection .altinstructions, "a" - altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f - .popsection - .subsection 1 - .align 2 /* So GAS knows label 661 is suitably aligned */ -661: -.endm - -.macro alternative_cb cb - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 - .popsection -661: -.endm - -/* - * Provide the other half of the alternative code sequence. - */ -.macro alternative_else -662: - .if .Lasm_alt_mode==0 - .subsection 1 - .else - .previous - .endif -663: -.endm - -/* - * Complete an alternative code sequence. - */ -.macro alternative_endif -664: - .org . - (664b-663b) + (662b-661b) - .org . 
- (662b-661b) + (664b-663b) - .if .Lasm_alt_mode==0 - .previous - .endif -.endm - -/* - * Callback-based alternative epilogue - */ -.macro alternative_cb_end -662: -.endm - -/* - * Provides a trivial alternative or default sequence consisting solely - * of NOPs. The number of NOPs is chosen automatically to match the - * previous case. - */ -.macro alternative_else_nop_endif -alternative_else - nops (662b-661b) / AARCH64_INSN_SIZE -alternative_endif -.endm - -#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ - alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) - -.macro user_alt, label, oldinstr, newinstr, cond -9999: alternative_insn "\oldinstr", "\newinstr", \cond - _asm_extable 9999b, \label -.endm - -#endif /* __ASSEMBLY__ */ - -/* - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); - * - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); - * N.B. If CONFIG_FOO is specified, but not selected, the whole block - * will be omitted, including oldinstr. - */ -#define ALTERNATIVE(oldinstr, newinstr, ...) \ - _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) - +#endif /* __ASSEMBLY__ */ #endif /* __ASM_ALTERNATIVE_H */ diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index d2c8e312b0191..54611cebfca77 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -2,7 +2,7 @@ #ifndef __ASM_ASM_UACCESS_H #define __ASM_ASM_UACCESS_H -#include +#include #include #include #include diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index d45b42295254d..c0973345e6e1d 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -10,8 +10,7 @@ #include #include -/* A64 instructions are always 32 bits. */ -#define AARCH64_INSN_SIZE 4 +#include #ifndef __ASSEMBLY__ /* From a31f9785b59813618000339264ff9297124685fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Feb 2021 13:03:28 +0100 Subject: [PATCH 210/737] objtool: Allow UNWIND_HINT to suppress dodgy stack modifications rewind_stack_do_exit() UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS call do_exit Does unspeakable things to the stack, which objtool currently fails to detect due to a limitation in instruction decoding. This will be rectified after which the above will result in: arch/x86/entry/entry_64.o: warning: objtool: .text+0xab: unsupported stack register modification Allow the UNWIND_HINT on the next instruction to suppress this, it will overwrite the state anyway. 
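Put differently, the relaxed rule boils down to the sketch below (a simplified illustration, not objtool's actual code; the helper name is made up):

	/*
	 * A write to the register currently serving as the CFA base is
	 * still rejected, unless the very next instruction carries an
	 * unwind hint -- the hint re-seeds the CFI state, so whatever
	 * the clobber did no longer matters.
	 */
	static bool unsupported_stack_modification(int dest_reg, int cfa_base,
						   bool next_insn_has_hint)
	{
		return dest_reg == cfa_base && !next_insn_has_hint;
	}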
Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173626.918498579@infradead.org (cherry picked from commit d54dba41999498b38a40940e1123019d50b26496) --- tools/objtool/check.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2a6be378aa9c1..605efaedc79f0 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2306,8 +2306,9 @@ static void restore_reg(struct cfi_state *cfi, unsigned char reg) * 41 5d pop %r13 * c3 retq */ -static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, - struct stack_op *op) +static int update_cfi_state(struct instruction *insn, + struct instruction *next_insn, + struct cfi_state *cfi, struct stack_op *op) { struct cfi_reg *cfa = &cfi->cfa; struct cfi_reg *regs = cfi->regs; @@ -2508,7 +2509,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } - if (op->dest.reg == cfi->cfa.base) { + if (op->dest.reg == cfi->cfa.base && !(next_insn && next_insn->hint)) { WARN_FUNC("unsupported stack register modification", insn->sec, insn->offset); return -1; @@ -2785,13 +2786,15 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn return 0; } -static int handle_insn_ops(struct instruction *insn, struct insn_state *state) +static int handle_insn_ops(struct instruction *insn, + struct instruction *next_insn, + struct insn_state *state) { struct stack_op *op; list_for_each_entry(op, &insn->stack_ops, list) { - if (update_cfi_state(insn, &state->cfi, op)) + if (update_cfi_state(insn, next_insn, &state->cfi, op)) return 1; if (!insn->alt_group) @@ -3118,7 +3121,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; } - if (handle_insn_ops(insn, &state)) + if (handle_insn_ops(insn, next_insn, &state)) return 1; switch (insn->type) { From 623540c2aa35873d2f85334b66a01ae4d4f6ebfd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 20:18:21 +0100 Subject: [PATCH 211/737] objtool,x86: Renumber CFI_reg Make them match the instruction encoding numbering. 
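For context, the new numbering follows the architectural register encoding used by ModRM.reg/ModRM.rm and their REX-extended forms, so a decoded field can be used as a CFI index directly. A minimal sketch of that mapping (the enum name is illustrative and does not exist in the tree):

	enum x86_reg_encoding {
		ENC_AX = 0, ENC_CX = 1, ENC_DX = 2, ENC_BX = 3,
		ENC_SP = 4, ENC_BP = 5, ENC_SI = 6, ENC_DI = 7,
		ENC_R8 = 8,	/* R9..R15 follow as 9..15 via REX.b/REX.r */
	};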
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.033720313@infradead.org (cherry picked from commit d473b18b2ef62563fb874f9cae6e123f99129e3f) --- tools/objtool/arch/x86/include/arch/cfi_regs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/objtool/arch/x86/include/arch/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h index 79bc517efba85..0579d22c433cd 100644 --- a/tools/objtool/arch/x86/include/arch/cfi_regs.h +++ b/tools/objtool/arch/x86/include/arch/cfi_regs.h @@ -4,13 +4,13 @@ #define _OBJTOOL_CFI_REGS_H #define CFI_AX 0 -#define CFI_DX 1 -#define CFI_CX 2 +#define CFI_CX 1 +#define CFI_DX 2 #define CFI_BX 3 -#define CFI_SI 4 -#define CFI_DI 5 -#define CFI_BP 6 -#define CFI_SP 7 +#define CFI_SP 4 +#define CFI_BP 5 +#define CFI_SI 6 +#define CFI_DI 7 #define CFI_R8 8 #define CFI_R9 9 #define CFI_R10 10 From 221ec91cc3e10a33d66602c107b3ce7eca236e97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 21:29:16 +0100 Subject: [PATCH 212/737] objtool,x86: Rewrite LEA decode Current LEA decoding is a bunch of special cases, properly decode the instruction, with exception of full SIB and RIP-relative modes. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.143250641@infradead.org (cherry picked from commit 2ee0c363492f1acc1082125218e6a80c0d7d502b) --- tools/objtool/arch/x86/decode.c | 86 +++++++++++---------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 7b6b089978d12..f4ca459193ffe 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -92,9 +92,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, { struct insn insn; int x86_64, sign; - unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, - rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, - modrm_reg = 0, sib = 0; + unsigned char op1, op2, + rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, + modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, + sib = 0; struct stack_op *op = NULL; struct symbol *sym; @@ -329,68 +330,37 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8d: - if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - - ADD_OP(op) { - if (!insn.displacement.value) { - /* lea (%rsp), reg */ - op->src.type = OP_SRC_REG; - } else { - /* lea disp(%rsp), reg */ - op->src.type = OP_SRC_ADD; - op->src.offset = insn.displacement.value; - } - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; - } - - } else if (rex == 0x48 && modrm == 0x65) { - - /* lea disp(%rbp), %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_BP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + if (modrm_mod == 3) { + WARN("invalid LEA encoding at %s:0x%lx", sec->name, offset); + break; + } - } else if (rex == 0x49 && modrm == 0x62 && - insn.displacement.value == -8) { + /* skip non 64bit ops */ + if (!rex_w) + break; - /* - * lea -0x8(%r10), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. 
- */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R10; - op->src.offset = -8; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + /* skip nontrivial SIB */ + if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + break; - } else if (rex == 0x49 && modrm == 0x65 && - insn.displacement.value == -16) { + /* skip RIP relative displacement */ + if (modrm_rm == 5 && modrm_mod == 0) + break; - /* - * lea -0x10(%r13), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. - */ - ADD_OP(op) { + /* lea disp(%src), %dst */ + ADD_OP(op) { + op->src.offset = insn.displacement.value; + if (!op->src.offset) { + /* lea (%src), %dst */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%src), %dst */ op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R13; - op->src.offset = -16; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + op->src.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; } - break; case 0x8f: From 317862b3905fec1ce59241078dd7bbd5d83d10a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 19:59:43 +0100 Subject: [PATCH 213/737] objtool,x86: Simplify register decode Since the CFI_reg number now matches the instruction encoding order do away with the op_to_cfi_reg[] and use direct assignment. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.362004522@infradead.org (cherry picked from commit 16ef7f159c503c7befec7018ee0e82fdc311721e) --- tools/objtool/arch/x86/decode.c | 79 ++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index f4ca459193ffe..fbecd2e417974 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -18,17 +18,6 @@ #include #include -static unsigned char op_to_cfi_reg[][2] = { - {CFI_AX, CFI_R8}, - {CFI_CX, CFI_R9}, - {CFI_DX, CFI_R10}, - {CFI_BX, CFI_R11}, - {CFI_SP, CFI_R12}, - {CFI_BP, CFI_R13}, - {CFI_SI, CFI_R14}, - {CFI_DI, CFI_R15}, -}; - static int is_x86_64(const struct elf *elf) { switch (elf->ehdr.e_machine) { @@ -95,7 +84,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, - sib = 0; + sib = 0 /* , sib_scale = 0, sib_index = 0, sib_base = 0 */; struct stack_op *op = NULL; struct symbol *sym; @@ -131,23 +120,29 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (insn.modrm.nbytes) { modrm = insn.modrm.bytes[0]; modrm_mod = X86_MODRM_MOD(modrm); - modrm_reg = X86_MODRM_REG(modrm); - modrm_rm = X86_MODRM_RM(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r; + modrm_rm = X86_MODRM_RM(modrm) + 8*rex_b; } - if (insn.sib.nbytes) + if (insn.sib.nbytes) { sib = insn.sib.bytes[0]; + /* + sib_scale = X86_SIB_SCALE(sib); + sib_index = X86_SIB_INDEX(sib) + 8*rex_x; + sib_base = X86_SIB_BASE(sib) + 8*rex_b; + */ + } switch (op1) { case 0x1: case 0x29: - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { /* add/sub reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_ADD; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -159,7 +154,7 @@ int 
arch_decode_instruction(const struct elf *elf, const struct section *sec, /* push reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->src.reg = (op1 & 0x7) + 8*rex_b; op->dest.type = OP_DEST_PUSH; } @@ -171,7 +166,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, ADD_OP(op) { op->src.type = OP_SRC_POP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->dest.reg = (op1 & 0x7) + 8*rex_b; } break; @@ -224,7 +219,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x89: - if (rex_w && !rex_r && modrm_reg == 4) { + if (rex_w && modrm_reg == CFI_SP) { if (modrm_mod == 3) { /* mov %rsp, reg */ @@ -232,17 +227,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.reg = modrm_rm; } break; } else { /* skip nontrivial SIB */ - if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) break; /* skip RIP relative displacement */ - if (modrm_rm == 5 && modrm_mod == 0) + if ((modrm_rm & 7) == 5 && modrm_mod == 0) break; /* mov %rsp, disp(%reg) */ @@ -250,7 +245,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG_INDIRECT; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.reg = modrm_rm; op->dest.offset = insn.displacement.value; } break; @@ -259,12 +254,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { /* mov reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -273,13 +268,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* fallthrough */ case 0x88: - if (!rex_b && - (modrm_mod == 1 || modrm_mod == 2) && modrm_rm == 5) { + if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { /* mov reg, disp(%rbp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_BP; op->dest.offset = insn.displacement.value; @@ -287,12 +281,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { + if (rex_w && modrm_rm == CFI_SP && sib == 0x24) { /* mov reg, disp(%rsp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_SP; op->dest.offset = insn.displacement.value; @@ -303,7 +297,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8b: - if (rex_w && !rex_b && modrm_mod == 1 && modrm_rm == 5) { + if (!rex_w) + break; + + if (modrm_mod == 1 && modrm_rm == CFI_BP) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -311,11 +308,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_BP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = 
op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; + } - } else if (rex_w && !rex_b && sib == 0x24 && - modrm_mod != 3 && modrm_rm == 4) { + if (modrm_mod != 3 && modrm_rm == CFI_SP && sib == 0x24) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -323,8 +321,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_SP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; } break; @@ -340,11 +339,11 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; /* skip nontrivial SIB */ - if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) break; /* skip RIP relative displacement */ - if (modrm_rm == 5 && modrm_mod == 0) + if ((modrm_rm & 7) == 5 && modrm_mod == 0) break; /* lea disp(%src), %dst */ @@ -357,9 +356,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* lea disp(%src), %dst */ op->src.type = OP_SRC_ADD; } - op->src.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->src.reg = modrm_rm; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } break; From 49b63570849c3b38f36ebc4b5d09c276b37e56e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Feb 2021 11:47:35 +0100 Subject: [PATCH 214/737] objtool,x86: Support %riz encodings When there's a SIB byte, the register otherwise denoted by r/m will then be denoted by SIB.base REX.b will now extend this. SIB.index == SP is magic and notes an index value zero. This means that there's a bunch of alternative (longer) encodings for the same thing. Eg. 'ModRM.mod != 3, ModRM.r/m = AX' can be encoded as 'ModRM.mod != 3, ModRM.r/m = SP, SIB.base = AX, SIB.index = SP' which is actually 4 different encodings because the value of SIB.scale is irrelevant, giving rise to 5 different but equal encodings. Support these encodings and clean up the SIB handling in general. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.472967498@infradead.org (cherry picked from commit 78df6245c3c82484200b9f8e306dc86fb19e9c02) --- tools/objtool/arch/x86/decode.c | 67 +++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index fbecd2e417974..ff78829bb5fac 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -73,6 +73,25 @@ unsigned long arch_jump_destination(struct instruction *insn) return -1; \ else for (list_add_tail(&op->list, ops_list); op; op = NULL) +/* + * Helpers to decode ModRM/SIB: + * + * r/m| AX CX DX BX | SP | BP | SI DI | + * | R8 R9 R10 R11 | R12 | R13 | R14 R15 | + * Mod+----------------+-----+-----+---------+ + * 00 | [r/m] |[SIB]|[IP+]| [r/m] | + * 01 | [r/m + d8] |[S+d]| [r/m + d8] | + * 10 | [r/m + d32] |[S+D]| [r/m + d32] | + * 11 | r/ m | + * + */ +#define is_RIP() ((modrm_rm & 7) == CFI_BP && modrm_mod == 0) +#define have_SIB() ((modrm_rm & 7) == CFI_SP && modrm_mod != 3) + +#define rm_is(reg) (have_SIB() ? 
\ + sib_base == (reg) && sib_index == CFI_SP : \ + modrm_rm == (reg)) + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -84,7 +103,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, - sib = 0 /* , sib_scale = 0, sib_index = 0, sib_base = 0 */; + sib = 0, /* sib_scale = 0, */ sib_index = 0, sib_base = 0; struct stack_op *op = NULL; struct symbol *sym; @@ -126,11 +145,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (insn.sib.nbytes) { sib = insn.sib.bytes[0]; - /* - sib_scale = X86_SIB_SCALE(sib); + /* sib_scale = X86_SIB_SCALE(sib); */ sib_index = X86_SIB_INDEX(sib) + 8*rex_x; sib_base = X86_SIB_BASE(sib) + 8*rex_b; - */ } switch (op1) { @@ -219,7 +236,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x89: - if (rex_w && modrm_reg == CFI_SP) { + if (!rex_w) + break; + + if (modrm_reg == CFI_SP) { if (modrm_mod == 3) { /* mov %rsp, reg */ @@ -232,14 +252,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } else { - /* skip nontrivial SIB */ - if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) - break; - /* skip RIP relative displacement */ - if ((modrm_rm & 7) == 5 && modrm_mod == 0) + if (is_RIP()) break; + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } + /* mov %rsp, disp(%reg) */ ADD_OP(op) { op->src.type = OP_SRC_REG; @@ -254,7 +277,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { + if (modrm_mod == 3 && modrm_rm == CFI_SP) { /* mov reg, %rsp */ ADD_OP(op) { @@ -268,6 +291,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* fallthrough */ case 0x88: + if (!rex_w) + break; + if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { /* mov reg, disp(%rbp) */ @@ -281,7 +307,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && modrm_rm == CFI_SP && sib == 0x24) { + if (modrm_mod != 3 && rm_is(CFI_SP)) { /* mov reg, disp(%rsp) */ ADD_OP(op) { @@ -300,7 +326,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - if (modrm_mod == 1 && modrm_rm == CFI_BP) { + if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -313,7 +339,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod != 3 && modrm_rm == CFI_SP && sib == 0x24) { + if (modrm_mod != 3 && rm_is(CFI_SP)) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -338,14 +364,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - /* skip nontrivial SIB */ - if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) - break; - /* skip RIP relative displacement */ - if ((modrm_rm & 7) == 5 && modrm_mod == 0) + if (is_RIP()) break; + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } + /* lea disp(%src), %dst */ ADD_OP(op) { op->src.offset = insn.displacement.value; From f3ab6c82bf7758cb55407aa27e789d6ccd04c343 Mon Sep 17 00:00:00 2001 From: 
Peter Zijlstra Date: Wed, 10 Feb 2021 14:11:30 +0100 Subject: [PATCH 215/737] objtool,x86: Rewrite ADD/SUB/AND Support sign extending and imm8 forms. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.588366777@infradead.org (cherry picked from commit 961d83b9073b1ce5834af50d3c69e5e2461c6fd3) --- tools/objtool/arch/x86/decode.c | 70 ++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index ff78829bb5fac..31a078747d1c5 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -99,13 +99,14 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, struct list_head *ops_list) { struct insn insn; - int x86_64, sign; + int x86_64; unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, sib = 0, /* sib_scale = 0, */ sib_index = 0, sib_base = 0; struct stack_op *op = NULL; struct symbol *sym; + u64 imm; x86_64 = is_x86_64(elf); if (x86_64 == -1) @@ -201,12 +202,54 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, *type = INSN_JUMP_CONDITIONAL; break; - case 0x81: - case 0x83: - if (rex != 0x48) + case 0x80 ... 0x83: + /* + * 1000 00sw : mod OP r/m : immediate + * + * s - sign extend immediate + * w - imm8 / imm32 + * + * OP: 000 ADD 100 AND + * 001 OR 101 SUB + * 010 ADC 110 XOR + * 011 SBB 111 CMP + */ + + /* 64bit only */ + if (!rex_w) + break; + + /* %rsp target only */ + if (!(modrm_mod == 3 && modrm_rm == CFI_SP)) + break; + + imm = insn.immediate.value; + if (op1 & 2) { /* sign extend */ + if (op1 & 1) { /* imm32 */ + imm <<= 32; + imm = (s64)imm >> 32; + } else { /* imm8 */ + imm <<= 56; + imm = (s64)imm >> 56; + } + } + + switch (modrm_reg & 7) { + case 5: + imm = -imm; + /* fallthrough */ + case 0: + /* add/sub imm, %rsp */ + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = imm; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } break; - if (modrm == 0xe4) { + case 4: /* and imm, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_AND; @@ -216,23 +259,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_SP; } break; - } - if (modrm == 0xc4) - sign = 1; - else if (modrm == 0xec) - sign = -1; - else + default: + /* WARN ? */ break; - - /* add/sub imm, %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_SP; - op->src.offset = insn.immediate.value * sign; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + break; case 0x89: From 22991115cca84f11ea2f381b67a1c7c18a565fa4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Feb 2021 09:13:00 +0100 Subject: [PATCH 216/737] objtool,x86: More ModRM sugar Better helpers to decode ModRM. 
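As background for these helpers, the ModRM byte splits into three fields: bits 7-6 (mod), 5-3 (reg) and 2-0 (rm). A self-contained sketch of that split, assuming nothing beyond the architectural layout (the function name is illustrative, not part of objtool):

	/*
	 * mod == 3 selects a register operand, any other value a memory
	 * operand; rm == 4 with a memory mod pulls in a SIB byte, and
	 * REX.R/REX.B extend reg/rm to 4 bits each.
	 */
	static void modrm_split(unsigned char modrm, unsigned char *mod,
				unsigned char *reg, unsigned char *rm)
	{
		*mod = (modrm >> 6) & 0x3;
		*reg = (modrm >> 3) & 0x7;
		*rm  = modrm & 0x7;
	}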
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/YCZB/ljatFXqQbm8@hirez.programming.kicks-ass.net (cherry picked from commit 36d92e43d01cbeeec99abdf405362243051d6b3f) --- tools/objtool/arch/x86/decode.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 31a078747d1c5..bee22d4e672e2 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -83,15 +83,21 @@ unsigned long arch_jump_destination(struct instruction *insn) * 01 | [r/m + d8] |[S+d]| [r/m + d8] | * 10 | [r/m + d32] |[S+D]| [r/m + d32] | * 11 | r/ m | - * */ + +#define mod_is_mem() (modrm_mod != 3) +#define mod_is_reg() (modrm_mod == 3) + #define is_RIP() ((modrm_rm & 7) == CFI_BP && modrm_mod == 0) -#define have_SIB() ((modrm_rm & 7) == CFI_SP && modrm_mod != 3) +#define have_SIB() ((modrm_rm & 7) == CFI_SP && mod_is_mem()) #define rm_is(reg) (have_SIB() ? \ sib_base == (reg) && sib_index == CFI_SP : \ modrm_rm == (reg)) +#define rm_is_mem(reg) (mod_is_mem() && !is_RIP() && rm_is(reg)) +#define rm_is_reg(reg) (mod_is_reg() && modrm_rm == (reg)) + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -155,7 +161,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, case 0x1: case 0x29: - if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { + if (rex_w && rm_is_reg(CFI_SP)) { /* add/sub reg, %rsp */ ADD_OP(op) { @@ -220,7 +226,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; /* %rsp target only */ - if (!(modrm_mod == 3 && modrm_rm == CFI_SP)) + if (!rm_is_reg(CFI_SP)) break; imm = insn.immediate.value; @@ -273,7 +279,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (modrm_reg == CFI_SP) { - if (modrm_mod == 3) { + if (mod_is_reg()) { /* mov %rsp, reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; @@ -309,7 +315,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod == 3 && modrm_rm == CFI_SP) { + if (rm_is_reg(CFI_SP)) { /* mov reg, %rsp */ ADD_OP(op) { @@ -326,7 +332,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { + if (rm_is_mem(CFI_BP)) { /* mov reg, disp(%rbp) */ ADD_OP(op) { @@ -339,7 +345,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod != 3 && rm_is(CFI_SP)) { + if (rm_is_mem(CFI_SP)) { /* mov reg, disp(%rsp) */ ADD_OP(op) { @@ -358,7 +364,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { + if (rm_is_mem(CFI_BP)) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -371,7 +377,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod != 3 && rm_is(CFI_SP)) { + if (rm_is_mem(CFI_SP)) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -387,7 +393,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8d: - if (modrm_mod == 3) { + if (mod_is_reg()) { WARN("invalid LEA encoding at %s:0x%lx", sec->name, offset); break; } From 209fa24e7ce0a66c321a8f0e8fed9abdf3edec4c Mon Sep 17 00:00:00 
2001 From: Peter Zijlstra Date: Fri, 26 Feb 2021 10:59:59 +0100 Subject: [PATCH 217/737] objtool: Add --backup Teach objtool to write backups files, such that it becomes easier to see what objtool did to the object file. Backup files will be ${name}.orig. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/YD4obT3aoXPWl7Ax@hirez.programming.kicks-ass.net (cherry picked from commit 8ad15c6900840e8a2163012f4581c52127622e02) --- tools/objtool/builtin-check.c | 3 +- tools/objtool/include/objtool/builtin.h | 2 +- tools/objtool/objtool.c | 64 +++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index faefd9922a54d..091b65269239e 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -19,7 +19,7 @@ #include bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, mcount, noinstr, sls, unret, rethunk; + validate_dup, vmlinux, mcount, noinstr, backup, sls, unret, rethunk; static const char * const check_usage[] = { "objtool check [] file.o", @@ -40,6 +40,7 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &noinstr, "noinstr validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), + OPT_BOOLEAN('B', "backup", &backup, "create .orig files before modification"), OPT_BOOLEAN('S', "sls", &sls, "validate straight-line-speculation"), OPT_END(), }; diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 94e653d4d230c..d4bfee522ae8b 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -9,7 +9,7 @@ extern const struct option check_options[]; extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, - validate_dup, vmlinux, mcount, noinstr, sls, unret, rethunk; + validate_dup, vmlinux, mcount, noinstr, backup, sls, unret, rethunk; extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index ee86437ab814f..24650d533d85c 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,64 @@ bool help; const char *objname; static struct objtool_file file; +static bool objtool_create_backup(const char *_objname) +{ + int len = strlen(_objname); + char *buf, *base, *name = malloc(len+6); + int s, d, l, t; + + if (!name) { + perror("failed backup name malloc"); + return false; + } + + strcpy(name, _objname); + strcpy(name + len, ".orig"); + + d = open(name, O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (d < 0) { + perror("failed to create backup file"); + return false; + } + + s = open(_objname, O_RDONLY); + if (s < 0) { + perror("failed to open orig file"); + return false; + } + + buf = malloc(4096); + if (!buf) { + perror("failed backup data malloc"); + return false; + } + + while ((l = read(s, buf, 4096)) > 0) { + base = buf; + do { + t = write(d, base, l); + if (t < 0) { + perror("failed backup write"); + return false; + } + base += t; + l -= t; + } while (l); + } + + if (l < 0) { + perror("failed backup read"); + return false; + } + + free(name); + free(buf); + close(d); + close(s); + + return true; 
+} + struct objtool_file *objtool_open_read(const char *_objname) { if (objname) { @@ -59,6 +118,11 @@ struct objtool_file *objtool_open_read(const char *_objname) if (!file.elf) return NULL; + if (backup && !objtool_create_backup(objname)) { + WARN("can't create backup file"); + return NULL; + } + INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); INIT_LIST_HEAD(&file.retpoline_call_list); From 5170540619c1cfe3b02f6e78c72933475be11898 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2021 11:18:24 +0100 Subject: [PATCH 218/737] objtool: Collate parse_options() users Ensure there's a single place that parses check_options, in preparation for extending where to get options from. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20210226110004.193108106@infradead.org (cherry picked from commit a2f605f9ff57397d05a8e2f282b78a69f574d305) --- tools/objtool/builtin-check.c | 14 +++++++++----- tools/objtool/builtin-orc.c | 5 +---- tools/objtool/include/objtool/builtin.h | 2 ++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 091b65269239e..e3b674759d560 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -45,17 +45,21 @@ const struct option check_options[] = { OPT_END(), }; +int cmd_parse_options(int argc, const char **argv, const char * const usage[]) +{ + argc = parse_options(argc, argv, check_options, usage, 0); + if (argc != 1) + usage_with_options(usage, check_options); + return argc; +} + int cmd_check(int argc, const char **argv) { const char *objname; struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, check_usage, 0); - - if (argc != 1) - usage_with_options(check_usage, check_options); - + argc = cmd_parse_options(argc, argv, check_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 8273bbf7cebb1..17f8b93077381 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -34,10 +34,7 @@ int cmd_orc(int argc, const char **argv) struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, orc_usage, 0); - if (argc != 1) - usage_with_options(orc_usage, check_options); - + argc = cmd_parse_options(argc, argv, orc_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index d4bfee522ae8b..66ad30ec58182 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -11,6 +11,8 @@ extern const struct option check_options[]; extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr, backup, sls, unret, rethunk; +extern int cmd_parse_options(int argc, const char **argv, const char * const usage[]); + extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); From 377f86b90960063d97267931549543e8dbc33686 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2021 11:32:30 +0100 Subject: [PATCH 219/737] objtool: Parse options from OBJTOOL_ARGS Teach objtool to parse options from the OBJTOOL_ARGS environment variable. 
This enables things like: $ OBJTOOL_ARGS="--backup" make O=defconfig-build/ kernel/ponies.o to obtain both defconfig-build/kernel/ponies.o{,.orig} and easily inspect what objtool actually did. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20210226110004.252553847@infradead.org (cherry picked from commit 900b4df347bbac4874149a226143a556909faba8) --- tools/objtool/builtin-check.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index e3b674759d560..35081fe373203 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -26,6 +27,11 @@ static const char * const check_usage[] = { NULL, }; +static const char * const env_usage[] = { + "OBJTOOL_ARGS=\"\"", + NULL, +}; + const struct option check_options[] = { OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"), OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"), @@ -47,6 +53,25 @@ const struct option check_options[] = { int cmd_parse_options(int argc, const char **argv, const char * const usage[]) { + const char *envv[16] = { }; + char *env; + int envc; + + env = getenv("OBJTOOL_ARGS"); + if (env) { + envv[0] = "OBJTOOL_ARGS"; + for (envc = 1; envc < ARRAY_SIZE(envv); ) { + envv[envc++] = env; + env = strchr(env, ' '); + if (!env) + break; + *env = '\0'; + env++; + } + + parse_options(envc, envv, check_options, env_usage, 0); + } + argc = parse_options(argc, argv, check_options, usage, 0); if (argc != 1) usage_with_options(usage, check_options); From 3c33377e83bd77d2cc8ac9cc9198354103eb7134 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 10 Feb 2021 11:34:08 +0100 Subject: [PATCH 220/737] arm64: Move patching utilities out of instruction encoding/decoding Files insn.[c|h] containt some functions used for instruction patching. In order to reuse the instruction encoder/decoder, move the patching utilities to their own file. 
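The interface itself is unchanged by the move; purely as an illustration of how it is consumed elsewhere, a hedged sketch of a call site (the wrapper is hypothetical and the instruction choice is arbitrary, both made up for this example):

	#include <asm/patching.h>

	/* Patch one kernel-text instruction to an A64 NOP (0xd503201f).
	 * Real callers such as jump label or ftrace code supply addr. */
	static int patch_one_insn_to_nop(void *addr)
	{
		return aarch64_insn_patch_text_nosync(addr, 0xd503201f);
	}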
Signed-off-by: Julien Thierry [ Add missing #include to some files ] Signed-off-by: Suraj Jitindar Singh --- arch/arm64/include/asm/insn.h | 5 - arch/arm64/include/asm/patching.h | 13 +++ arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/ftrace.c | 1 + arch/arm64/kernel/insn.c | 149 +---------------------------- arch/arm64/kernel/jump_label.c | 1 + arch/arm64/kernel/kgdb.c | 1 + arch/arm64/kernel/patching.c | 148 ++++++++++++++++++++++++++++ arch/arm64/kernel/probes/kprobes.c | 1 + arch/arm64/kernel/traps.c | 1 + 10 files changed, 170 insertions(+), 152 deletions(-) create mode 100644 arch/arm64/include/asm/patching.h create mode 100644 arch/arm64/kernel/patching.c diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index c0973345e6e1d..f67db3d763999 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -380,8 +380,6 @@ static inline bool aarch64_insn_is_adr_adrp(u32 insn) return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); } -int aarch64_insn_read(void *addr, u32 *insnp); -int aarch64_insn_write(void *addr, u32 insn); enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); @@ -488,9 +486,6 @@ u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, s32 aarch64_get_branch_offset(u32 insn); u32 aarch64_set_branch_offset(u32 insn, s32 offset); -int aarch64_insn_patch_text_nosync(void *addr, u32 insn); -int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); - s32 aarch64_insn_adrp_get_offset(u32 insn); u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset); diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h new file mode 100644 index 0000000000000..6bf5adc562950 --- /dev/null +++ b/arch/arm64/include/asm/patching.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_PATCHING_H +#define __ASM_PATCHING_H + +#include + +int aarch64_insn_read(void *addr, u32 *insnp); +int aarch64_insn_write(void *addr, u32 insn); + +int aarch64_insn_patch_text_nosync(void *addr, u32 insn); +int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); + +#endif /* __ASM_PATCHING_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index bbaf0bc4ad609..a22556ef7652c 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -17,7 +17,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o + syscall.o proton-pack.o patching.o targets += efi-entry.o diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 402a24f845b9e..cf49256e007af 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE /* diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 7d4fdf9745428..952e7d6fe60e2 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -7,21 +7,14 @@ */ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include #include -#include +#include #include #include -#include #define AARCH64_INSN_SF_BIT BIT(31) #define AARCH64_INSN_N_BIT BIT(22) @@ -83,81 +76,6 @@ bool aarch64_insn_is_branch_imm(u32 insn) aarch64_insn_is_bcond(insn)); } -static DEFINE_RAW_SPINLOCK(patch_lock); - -static bool 
is_exit_text(unsigned long addr) -{ - /* discarded with init text/data */ - return system_state < SYSTEM_RUNNING && - addr >= (unsigned long)__exittext_begin && - addr < (unsigned long)__exittext_end; -} - -static bool is_image_text(unsigned long addr) -{ - return core_kernel_text(addr) || is_exit_text(addr); -} - -static void __kprobes *patch_map(void *addr, int fixmap) -{ - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; - - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else - return addr; - - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); -} - -static void __kprobes patch_unmap(int fixmap) -{ - clear_fixmap(fixmap); -} -/* - * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always - * little-endian. - */ -int __kprobes aarch64_insn_read(void *addr, u32 *insnp) -{ - int ret; - __le32 val; - - ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); - if (!ret) - *insnp = le32_to_cpu(val); - - return ret; -} - -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) -{ - void *waddr = addr; - unsigned long flags = 0; - int ret; - - raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); - - ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); - - patch_unmap(FIX_TEXT_POKE0); - raw_spin_unlock_irqrestore(&patch_lock, flags); - - return ret; -} - -int __kprobes aarch64_insn_write(void *addr, u32 insn) -{ - return __aarch64_insn_write(addr, cpu_to_le32(insn)); -} - bool __kprobes aarch64_insn_uses_literal(u32 insn) { /* ldr/ldrsw (literal), prfm */ @@ -187,67 +105,6 @@ bool __kprobes aarch64_insn_is_branch(u32 insn) aarch64_insn_is_bcond(insn); } -int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) -{ - u32 *tp = addr; - int ret; - - /* A64 instructions must be word aligned */ - if ((uintptr_t)tp & 0x3) - return -EINVAL; - - ret = aarch64_insn_write(tp, insn); - if (ret == 0) - __flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); - - return ret; -} - -struct aarch64_insn_patch { - void **text_addrs; - u32 *new_insns; - int insn_cnt; - atomic_t cpu_count; -}; - -static int __kprobes aarch64_insn_patch_text_cb(void *arg) -{ - int i, ret = 0; - struct aarch64_insn_patch *pp = arg; - - /* The last CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { - for (i = 0; ret == 0 && i < pp->insn_cnt; i++) - ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], - pp->new_insns[i]); - /* Notify other processors with an additional increment. 
*/ - atomic_inc(&pp->cpu_count); - } else { - while (atomic_read(&pp->cpu_count) <= num_online_cpus()) - cpu_relax(); - isb(); - } - - return ret; -} - -int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) -{ - struct aarch64_insn_patch patch = { - .text_addrs = addrs, - .new_insns = insns, - .insn_cnt = cnt, - .cpu_count = ATOMIC_INIT(0), - }; - - if (cnt <= 0) - return -EINVAL; - - return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); -} - static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, u32 *maskp, int *shiftp) { diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index 9a8a0ae1e75f8..fc98037e12205 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -8,6 +8,7 @@ #include #include #include +#include void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index e4e95821b1f6c..1a25c912572df 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -18,6 +18,7 @@ #include #include #include +#include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { { "x0", 8, offsetof(struct pt_regs, regs[0])}, diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c new file mode 100644 index 0000000000000..65942fa5dc48c --- /dev/null +++ b/arch/arm64/kernel/patching.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static DEFINE_RAW_SPINLOCK(patch_lock); + +static bool is_exit_text(unsigned long addr) +{ + /* discarded with init text/data */ + return system_state < SYSTEM_RUNNING && + addr >= (unsigned long)__exittext_begin && + addr < (unsigned long)__exittext_end; +} + +static bool is_image_text(unsigned long addr) +{ + return core_kernel_text(addr) || is_exit_text(addr); +} + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + unsigned long uintaddr = (uintptr_t) addr; + bool image = is_image_text(uintaddr); + struct page *page; + + if (image) + page = phys_to_page(__pa_symbol(addr)); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else + return addr; + + BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. 
+ */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ + int ret; + __le32 val; + + ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); + if (!ret) + *insnp = le32_to_cpu(val); + + return ret; +} + +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ + return __aarch64_insn_write(addr, cpu_to_le32(insn)); +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ + u32 *tp = addr; + int ret; + + /* A64 instructions must be word aligned */ + if ((uintptr_t)tp & 0x3) + return -EINVAL; + + ret = aarch64_insn_write(tp, insn); + if (ret == 0) + __flush_icache_range((uintptr_t)tp, + (uintptr_t)tp + AARCH64_INSN_SIZE); + + return ret; +} + +struct aarch64_insn_patch { + void **text_addrs; + u32 *new_insns; + int insn_cnt; + atomic_t cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ + int i, ret = 0; + struct aarch64_insn_patch *pp = arg; + + /* The last CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { + for (i = 0; ret == 0 && i < pp->insn_cnt; i++) + ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], + pp->new_insns[i]); + /* Notify other processors with an additional increment. */ + atomic_inc(&pp->cpu_count); + } else { + while (atomic_read(&pp->cpu_count) <= num_online_cpus()) + cpu_relax(); + isb(); + } + + return ret; +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ + struct aarch64_insn_patch patch = { + .text_addrs = addrs, + .new_insns = insns, + .insn_cnt = cnt, + .cpu_count = ATOMIC_INIT(0), + }; + + if (cnt <= 0) + return -EINVAL; + + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); +} diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 798c3e78b84bb..155daf11e56df 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "decode-insn.h" diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 2cdd53425509d..1e010cf19e307 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include From 6249ca8651de10026ae92fb10717c88a1fde6e0d Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 10 Feb 2021 14:36:12 +0100 Subject: [PATCH 221/737] arm64: insn: Reduce header dependencies of instruction decoder The instruction encoder/decoder depends on alternative headers only for single macro definitions that could be part of the instruction decoder. Signed-off-by: Julien Thierry --- arch/arm64/include/asm/alternative-macros.h | 3 --- arch/arm64/include/asm/insn.h | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 5df500dcc627a..c01edf4d988db 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -6,9 +6,6 @@ #define ARM64_CB_PATCH ARM64_NCAPS -/* A64 instructions are always 32 bits. 
*/ -#define AARCH64_INSN_SIZE 4 - #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index f67db3d763999..747d733ecd565 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -10,7 +10,8 @@ #include #include -#include +/* A64 instructions are always 32 bits. */ +#define AARCH64_INSN_SIZE 4 #ifndef __ASSEMBLY__ /* From 9e550863e7b8e48f533749b3053a34f9a1626e1e Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 10 Feb 2021 17:03:26 +0100 Subject: [PATCH 222/737] arm64: Move instruction encoder/decoder under lib/ Aarch64 instruction set encoding and decoding logic can prove useful for some features/tools both part of the kernel and outside the kernel. Isolate the function dealing only with encoding/decoding instructions, with minimal dependency on kernel utilities in order to be able to reuse that code. Code was only moved, no code should have been added, removed nor modifier. Signed-off-by: Julien Thierry --- arch/arm64/kernel/Makefile | 2 +- arch/arm64/lib/Makefile | 6 +++--- arch/arm64/{kernel => lib}/insn.c | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename arch/arm64/{kernel => lib}/insn.c (100%) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index a22556ef7652c..643acbc605be6 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ setup.o signal.o sys.o stacktrace.o time.o traps.o \ - io.o vdso.o hyp-stub.o psci.o cpu_ops.o insn.o \ + io.o vdso.o hyp-stub.o psci.o cpu_ops.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index d31e1169d9b8e..9cd83908717da 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ - clear_page.o csum.o memchr.o memcpy.o memmove.o \ - memset.o memcmp.o strcmp.o strncmp.o strlen.o \ - strnlen.o strchr.o strrchr.o tishift.o + clear_page.o csum.o insn.o memchr.o memcpy.o \ + memmove.o memset.o memcmp.o strcmp.o strncmp.o \ + strlen.o strnlen.o strchr.o strrchr.o tishift.o ifeq ($(CONFIG_KERNEL_MODE_NEON), y) obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/lib/insn.c similarity index 100% rename from arch/arm64/kernel/insn.c rename to arch/arm64/lib/insn.c From de1740fe8e9213e79aacbddc2582ab3e40a5ca73 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 10 Feb 2021 17:09:07 +0100 Subject: [PATCH 223/737] arm64: insn: Add SVE instruction class SVE has been public for some time now. Let the decoder acknowledge its existence. 
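For readers following along, the one-line table change below boils down to a four-bit lookup: bits [28:25] of an A64 word index aarch64_insn_encoding_class[], and slot 2 (0b0010) is the group this patch re-labels as SVE. The following minimal user-space sketch is not kernel code; the opcode 0x04e100a1 (an SVE integer add, "add z1.d, z5.d, z1.d") is only an illustrative input.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Same index computation as aarch64_get_insn_class(): bits [28:25]. */
  static unsigned int insn_class_index(uint32_t insn)
  {
          return (insn >> 25) & 0xf;
  }

  static bool insn_is_sve_group(uint32_t insn)
  {
          /* Slot 2 is the entry this patch changes to AARCH64_INSN_CLS_SVE. */
          return insn_class_index(insn) == 0x2;
  }

  int main(void)
  {
          printf("0x04e100a1 -> SVE group? %d\n", insn_is_sve_group(0x04e100a1));
          printf("0xd503201f (NOP) -> class index 0x%x\n", insn_class_index(0xd503201f));
          return 0;
  }

Running it prints 1 for the SVE opcode and class index 0xa (the branch/system group) for the NOP, matching the table layout in the decoder.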
Signed-off-by: Julien Thierry --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/lib/insn.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 747d733ecd565..94d3abbe1b794 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -31,6 +31,7 @@ */ enum aarch64_insn_encoding_class { AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 952e7d6fe60e2..bfa3a6ec0b13a 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -23,7 +23,7 @@ static const int aarch64_insn_encoding_class[] = { AARCH64_INSN_CLS_UNKNOWN, AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, AARCH64_INSN_CLS_UNKNOWN, AARCH64_INSN_CLS_LDST, AARCH64_INSN_CLS_DP_REG, From 8711d0198ef383234b755c69253207d81d175b4b Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 19 Feb 2020 17:06:09 +0000 Subject: [PATCH 224/737] arm64: insn: Add barrier encodings Create necessary functions to encode/decode aarch64 barrier instructions. DSB needs special case handling as it has multiple encodings. Signed-off-by: Julien Thierry --- arch/arm64/include/asm/insn.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 94d3abbe1b794..63a78bab0fd26 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -371,6 +371,14 @@ __AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) __AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) __AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) #undef __AARCH64_INSN_FUNCS @@ -382,6 +390,20 @@ static inline bool aarch64_insn_is_adr_adrp(u32 insn) return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); } +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return (aarch64_insn_is_dsb_base(insn) && (insn & 0xb00)) || + aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); From 32ece194991f68c98a373fae604377097a9388dc Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 14 Feb 2020 17:22:34 +0000 Subject: [PATCH 225/737] arm64: insn: Add some opcodes to instruction decoder Add decoding capability for some instructions that objtool will need to decode. 
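Each __AARCH64_INSN_FUNCS() entry added below expands to nothing more than a mask/value compare on the 32-bit opcode. As a rough stand-alone illustration (not kernel code; the encodings 0xf9400020 for "ldr x0, [x1]" and 0xf9000020 for "str x0, [x1]" are example inputs), the load_imm and store_imm entries behave like:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Mask/value pairs copied from the load_imm/store_imm lines in this patch. */
  static bool insn_is_load_imm(uint32_t code)
  {
          return (code & 0x3FC00000) == 0x39400000;
  }

  static bool insn_is_store_imm(uint32_t code)
  {
          return (code & 0x3FC00000) == 0x39000000;
  }

  int main(void)
  {
          printf("ldr x0, [x1] -> load_imm=%d store_imm=%d\n",
                 insn_is_load_imm(0xf9400020), insn_is_store_imm(0xf9400020));
          printf("str x0, [x1] -> load_imm=%d store_imm=%d\n",
                 insn_is_load_imm(0xf9000020), insn_is_store_imm(0xf9000020));
          return 0;
  }

The masks deliberately leave out the size, immediate and register fields, so a single predicate covers every width of the unsigned-immediate load/store form.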
Signed-off-by: Julien Thierry --- arch/arm64/include/asm/insn.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 63a78bab0fd26..4177a744d2633 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -297,6 +297,12 @@ __AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) __AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) __AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) __AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) @@ -305,6 +311,8 @@ __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) __AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) __AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) +__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) __AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) __AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) @@ -337,6 +345,7 @@ __AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) __AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) __AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) __AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) __AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) __AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) __AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) From 6193ae016df821f41beaeb5ce759d7a259510451 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 14 Feb 2020 17:38:19 +0000 Subject: [PATCH 226/737] arm64: insn: Add load/store decoding helpers Provide some function to group different load/store instructions. 
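The grouping helpers below simply OR together the signed-offset, pre-index and post-index tests, so a caller such as objtool can ask "is this any kind of load/store pair?" without enumerating addressing modes. A minimal stand-alone sketch of the store-pair case, not kernel code, with 0xa9be7bfd ("stp x29, x30, [sp, #-32]!", the usual frame push) as an example input:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Mask/value pairs copied from the stp, stp_pre and stp_post entries. */
  static bool is_stp(uint32_t c)      { return (c & 0x7FC00000) == 0x29000000; }
  static bool is_stp_pre(uint32_t c)  { return (c & 0x7FC00000) == 0x29800000; }
  static bool is_stp_post(uint32_t c) { return (c & 0x7FC00000) == 0x28800000; }

  /* Mirrors aarch64_insn_is_store_pair() from the hunk below. */
  static bool is_store_pair(uint32_t c)
  {
          return is_stp(c) || is_stp_pre(c) || is_stp_post(c);
  }

  int main(void)
  {
          printf("0xa9be7bfd -> store pair? %d (pre-index form: %d)\n",
                 is_store_pair(0xa9be7bfd), is_stp_pre(0xa9be7bfd));
          return 0;
  }

Only the pre-index predicate matches this particular encoding, but the grouped helper answers yes for all three forms.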
Signed-off-by: Julien Thierry --- arch/arm64/include/asm/insn.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 4177a744d2633..c9e95848042b9 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -413,6 +413,34 @@ static inline bool aarch64_insn_is_barrier(u32 insn) aarch64_insn_is_pssbb(insn); } +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); bool aarch64_insn_uses_literal(u32 insn); bool aarch64_insn_is_branch(u32 insn); From b96ebbb5e51a01d4579b28f57d04deef4dc98c80 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Sat, 15 Feb 2020 09:26:26 +0000 Subject: [PATCH 227/737] tools: Add some generic functions and headers These will be needed to be able to use arm64 instruction decoder in userland tools. Signed-off-by: Julien Thierry --- tools/include/asm-generic/bitops/__ffs.h | 11 +++++++ tools/include/linux/kernel.h | 21 +++++++++++++ tools/include/linux/printk.h | 40 ++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 tools/include/linux/printk.h diff --git a/tools/include/asm-generic/bitops/__ffs.h b/tools/include/asm-generic/bitops/__ffs.h index 9d13105194970..963f8a22212fd 100644 --- a/tools/include/asm-generic/bitops/__ffs.h +++ b/tools/include/asm-generic/bitops/__ffs.h @@ -42,4 +42,15 @@ static __always_inline unsigned long __ffs(unsigned long word) return num; } +static inline unsigned long __ffs64(u64 word) +{ +#if BITS_PER_LONG == 32 + if (((u32)word) == 0UL) + return __ffs((u32)(word >> 32)) + 32; +#elif BITS_PER_LONG != 64 +#error BITS_PER_LONG not 32 or 64 +#endif + return __ffs((unsigned long)word); +} + #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS___FFS_H_ */ diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index a7e54a08fb54c..e748982ed5c1a 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -114,6 +114,27 @@ int scnprintf_pad(char * buf, size_t size, const char * fmt, ...); #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) #define round_down(x, y) ((x) & ~__round_mask(x, y)) +/** + * upper_32_bits - return bits 32-63 of a number + * @n: the number we're accessing + * + * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress + * the "right shift count >= width of type" warning when that quantity is + * 32-bits. 
+ */ +#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) + +/** + * lower_32_bits - return bits 0-31 of a number + * @n: the number we're accessing + */ +#define lower_32_bits(n) ((u32)(n)) + +/* Inspired from ALIGN_*_KERNEL */ +#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a) - 1) +#define ALIGN_DOWN(x, a) __ALIGN((x) - ((a) - 1), (a)) + #define current_gfp_context(k) 0 #define synchronize_rcu() diff --git a/tools/include/linux/printk.h b/tools/include/linux/printk.h new file mode 100644 index 0000000000000..515ebdc47e6e1 --- /dev/null +++ b/tools/include/linux/printk.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_KERNEL_PRINTK_H_ +#define _TOOLS_LINUX_KERNEL_PRINTK_H_ + +#include +#include +#include + +#define printk(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) +#define pr_info printk +#define pr_notice printk +#define pr_cont printk + +#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#define pr_err pr_warn +#define pr_alert pr_warn +#define pr_emerg pr_warn +#define pr_crit pr_warn + +/* + * Dummy printk for disabled debugging statements to use whilst maintaining + * gcc's format checking. + */ +#define no_printk(fmt, ...) \ +({ \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ + 0; \ +}) + +/* pr_devel() should produce zero code unless DEBUG is defined */ +#ifdef DEBUG +#define pr_devel(fmt, ...) printk +#else +#define pr_devel(fmt, ...) no_printk +#endif + +#define pr_debug pr_devel + +#endif /* _TOOLS_LINUX_KERNEL_PRINTK_H_ */ From d853d7db274491581fdc2821b4b9c2d9b109e87b Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Sat, 15 Feb 2020 09:32:12 +0000 Subject: [PATCH 228/737] tools: arm64: Make aarch64 instruction decoder available to tools Add aarch64 encoder/decoder implementation under tools/ as well as the necessary arm64 headers. Signed-off-by: Julien Thierry --- tools/arch/arm64/include/asm/insn.h | 565 +++++++++++ tools/arch/arm64/lib/insn.c | 1456 +++++++++++++++++++++++++++ 2 files changed, 2021 insertions(+) create mode 100644 tools/arch/arm64/include/asm/insn.h create mode 100644 tools/arch/arm64/lib/insn.c diff --git a/tools/arch/arm64/include/asm/insn.h b/tools/arch/arm64/include/asm/insn.h new file mode 100644 index 0000000000000..71de52d1532ff --- /dev/null +++ b/tools/arch/arm64/include/asm/insn.h @@ -0,0 +1,565 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu + * + * Copyright (C) 2014 Zi Shen Lim + */ +#ifndef __ASM_INSN_H +#define __ASM_INSN_H +#include +#include + +/* A64 instructions are always 32 bits. 
*/ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ +/* + * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a + * Section C3.1 "A64 instruction index by encoding": + * AArch64 main encoding table + * Bit position + * 28 27 26 25 Encoding Group + * 0 0 - - Unallocated + * 1 0 0 - Data processing, immediate + * 1 0 1 - Branch, exception generation and system instructions + * - 1 - 0 Loads and stores + * - 1 0 1 Data processing - register + * 0 1 1 1 Data processing - SIMD and floating point + * 1 1 1 1 Data processing - SIMD and floating point + * "-" means "don't care" + */ +enum aarch64_insn_encoding_class { + AARCH64_INSN_CLS_UNKNOWN, /* UNALLOCATED */ + AARCH64_INSN_CLS_SVE, /* SVE instructions */ + AARCH64_INSN_CLS_DP_IMM, /* Data processing - immediate */ + AARCH64_INSN_CLS_DP_REG, /* Data processing - register */ + AARCH64_INSN_CLS_DP_FPSIMD, /* Data processing - SIMD and FP */ + AARCH64_INSN_CLS_LDST, /* Loads and stores */ + AARCH64_INSN_CLS_BR_SYS, /* Branch, exception generation and + * system instructions */ +}; + +enum aarch64_insn_hint_cr_op { + AARCH64_INSN_HINT_NOP = 0x0 << 5, + AARCH64_INSN_HINT_YIELD = 0x1 << 5, + AARCH64_INSN_HINT_WFE = 0x2 << 5, + AARCH64_INSN_HINT_WFI = 0x3 << 5, + AARCH64_INSN_HINT_SEV = 0x4 << 5, + AARCH64_INSN_HINT_SEVL = 0x5 << 5, + + AARCH64_INSN_HINT_XPACLRI = 0x07 << 5, + AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5, + AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5, + AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5, + AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5, + AARCH64_INSN_HINT_PACIAZ = 0x18 << 5, + AARCH64_INSN_HINT_PACIASP = 0x19 << 5, + AARCH64_INSN_HINT_PACIBZ = 0x1A << 5, + AARCH64_INSN_HINT_PACIBSP = 0x1B << 5, + AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5, + AARCH64_INSN_HINT_AUTIASP = 0x1D << 5, + AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5, + AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5, + + AARCH64_INSN_HINT_ESB = 0x10 << 5, + AARCH64_INSN_HINT_PSB = 0x11 << 5, + AARCH64_INSN_HINT_TSB = 0x12 << 5, + AARCH64_INSN_HINT_CSDB = 0x14 << 5, + + AARCH64_INSN_HINT_BTI = 0x20 << 5, + AARCH64_INSN_HINT_BTIC = 0x22 << 5, + AARCH64_INSN_HINT_BTIJ = 0x24 << 5, + AARCH64_INSN_HINT_BTIJC = 0x26 << 5, +}; + +enum aarch64_insn_imm_type { + AARCH64_INSN_IMM_ADR, + AARCH64_INSN_IMM_26, + AARCH64_INSN_IMM_19, + AARCH64_INSN_IMM_16, + AARCH64_INSN_IMM_14, + AARCH64_INSN_IMM_12, + AARCH64_INSN_IMM_9, + AARCH64_INSN_IMM_7, + AARCH64_INSN_IMM_6, + AARCH64_INSN_IMM_S, + AARCH64_INSN_IMM_R, + AARCH64_INSN_IMM_N, + AARCH64_INSN_IMM_MAX +}; + +enum aarch64_insn_register_type { + AARCH64_INSN_REGTYPE_RT, + AARCH64_INSN_REGTYPE_RN, + AARCH64_INSN_REGTYPE_RT2, + AARCH64_INSN_REGTYPE_RM, + AARCH64_INSN_REGTYPE_RD, + AARCH64_INSN_REGTYPE_RA, + AARCH64_INSN_REGTYPE_RS, +}; + +enum aarch64_insn_register { + AARCH64_INSN_REG_0 = 0, + AARCH64_INSN_REG_1 = 1, + AARCH64_INSN_REG_2 = 2, + AARCH64_INSN_REG_3 = 3, + AARCH64_INSN_REG_4 = 4, + AARCH64_INSN_REG_5 = 5, + AARCH64_INSN_REG_6 = 6, + AARCH64_INSN_REG_7 = 7, + AARCH64_INSN_REG_8 = 8, + AARCH64_INSN_REG_9 = 9, + AARCH64_INSN_REG_10 = 10, + AARCH64_INSN_REG_11 = 11, + AARCH64_INSN_REG_12 = 12, + AARCH64_INSN_REG_13 = 13, + AARCH64_INSN_REG_14 = 14, + AARCH64_INSN_REG_15 = 15, + AARCH64_INSN_REG_16 = 16, + AARCH64_INSN_REG_17 = 17, + AARCH64_INSN_REG_18 = 18, + AARCH64_INSN_REG_19 = 19, + AARCH64_INSN_REG_20 = 20, + AARCH64_INSN_REG_21 = 21, + AARCH64_INSN_REG_22 = 22, + AARCH64_INSN_REG_23 = 23, + AARCH64_INSN_REG_24 = 24, + AARCH64_INSN_REG_25 = 25, + AARCH64_INSN_REG_26 = 26, + AARCH64_INSN_REG_27 = 27, + AARCH64_INSN_REG_28 = 28, + 
AARCH64_INSN_REG_29 = 29, + AARCH64_INSN_REG_FP = 29, /* Frame pointer */ + AARCH64_INSN_REG_30 = 30, + AARCH64_INSN_REG_LR = 30, /* Link register */ + AARCH64_INSN_REG_ZR = 31, /* Zero: as source register */ + AARCH64_INSN_REG_SP = 31 /* Stack pointer: as load/store base reg */ +}; + +enum aarch64_insn_special_register { + AARCH64_INSN_SPCLREG_SPSR_EL1 = 0xC200, + AARCH64_INSN_SPCLREG_ELR_EL1 = 0xC201, + AARCH64_INSN_SPCLREG_SP_EL0 = 0xC208, + AARCH64_INSN_SPCLREG_SPSEL = 0xC210, + AARCH64_INSN_SPCLREG_CURRENTEL = 0xC212, + AARCH64_INSN_SPCLREG_DAIF = 0xDA11, + AARCH64_INSN_SPCLREG_NZCV = 0xDA10, + AARCH64_INSN_SPCLREG_FPCR = 0xDA20, + AARCH64_INSN_SPCLREG_DSPSR_EL0 = 0xDA28, + AARCH64_INSN_SPCLREG_DLR_EL0 = 0xDA29, + AARCH64_INSN_SPCLREG_SPSR_EL2 = 0xE200, + AARCH64_INSN_SPCLREG_ELR_EL2 = 0xE201, + AARCH64_INSN_SPCLREG_SP_EL1 = 0xE208, + AARCH64_INSN_SPCLREG_SPSR_INQ = 0xE218, + AARCH64_INSN_SPCLREG_SPSR_ABT = 0xE219, + AARCH64_INSN_SPCLREG_SPSR_UND = 0xE21A, + AARCH64_INSN_SPCLREG_SPSR_FIQ = 0xE21B, + AARCH64_INSN_SPCLREG_SPSR_EL3 = 0xF200, + AARCH64_INSN_SPCLREG_ELR_EL3 = 0xF201, + AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 +}; + +enum aarch64_insn_variant { + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_VARIANT_64BIT +}; + +enum aarch64_insn_condition { + AARCH64_INSN_COND_EQ = 0x0, /* == */ + AARCH64_INSN_COND_NE = 0x1, /* != */ + AARCH64_INSN_COND_CS = 0x2, /* unsigned >= */ + AARCH64_INSN_COND_CC = 0x3, /* unsigned < */ + AARCH64_INSN_COND_MI = 0x4, /* < 0 */ + AARCH64_INSN_COND_PL = 0x5, /* >= 0 */ + AARCH64_INSN_COND_VS = 0x6, /* overflow */ + AARCH64_INSN_COND_VC = 0x7, /* no overflow */ + AARCH64_INSN_COND_HI = 0x8, /* unsigned > */ + AARCH64_INSN_COND_LS = 0x9, /* unsigned <= */ + AARCH64_INSN_COND_GE = 0xa, /* signed >= */ + AARCH64_INSN_COND_LT = 0xb, /* signed < */ + AARCH64_INSN_COND_GT = 0xc, /* signed > */ + AARCH64_INSN_COND_LE = 0xd, /* signed <= */ + AARCH64_INSN_COND_AL = 0xe, /* always */ +}; + +enum aarch64_insn_branch_type { + AARCH64_INSN_BRANCH_NOLINK, + AARCH64_INSN_BRANCH_LINK, + AARCH64_INSN_BRANCH_RETURN, + AARCH64_INSN_BRANCH_COMP_ZERO, + AARCH64_INSN_BRANCH_COMP_NONZERO, +}; + +enum aarch64_insn_size_type { + AARCH64_INSN_SIZE_8, + AARCH64_INSN_SIZE_16, + AARCH64_INSN_SIZE_32, + AARCH64_INSN_SIZE_64, +}; + +enum aarch64_insn_ldst_type { + AARCH64_INSN_LDST_LOAD_REG_OFFSET, + AARCH64_INSN_LDST_STORE_REG_OFFSET, + AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, + AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, + AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX, + AARCH64_INSN_LDST_LOAD_EX, + AARCH64_INSN_LDST_STORE_EX, +}; + +enum aarch64_insn_adsb_type { + AARCH64_INSN_ADSB_ADD, + AARCH64_INSN_ADSB_SUB, + AARCH64_INSN_ADSB_ADD_SETFLAGS, + AARCH64_INSN_ADSB_SUB_SETFLAGS +}; + +enum aarch64_insn_movewide_type { + AARCH64_INSN_MOVEWIDE_ZERO, + AARCH64_INSN_MOVEWIDE_KEEP, + AARCH64_INSN_MOVEWIDE_INVERSE +}; + +enum aarch64_insn_bitfield_type { + AARCH64_INSN_BITFIELD_MOVE, + AARCH64_INSN_BITFIELD_MOVE_UNSIGNED, + AARCH64_INSN_BITFIELD_MOVE_SIGNED +}; + +enum aarch64_insn_data1_type { + AARCH64_INSN_DATA1_REVERSE_16, + AARCH64_INSN_DATA1_REVERSE_32, + AARCH64_INSN_DATA1_REVERSE_64, +}; + +enum aarch64_insn_data2_type { + AARCH64_INSN_DATA2_UDIV, + AARCH64_INSN_DATA2_SDIV, + AARCH64_INSN_DATA2_LSLV, + AARCH64_INSN_DATA2_LSRV, + AARCH64_INSN_DATA2_ASRV, + AARCH64_INSN_DATA2_RORV, +}; + +enum aarch64_insn_data3_type { + AARCH64_INSN_DATA3_MADD, + AARCH64_INSN_DATA3_MSUB, +}; + +enum aarch64_insn_logic_type { + AARCH64_INSN_LOGIC_AND, + 
AARCH64_INSN_LOGIC_BIC, + AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_LOGIC_ORN, + AARCH64_INSN_LOGIC_EOR, + AARCH64_INSN_LOGIC_EON, + AARCH64_INSN_LOGIC_AND_SETFLAGS, + AARCH64_INSN_LOGIC_BIC_SETFLAGS +}; + +enum aarch64_insn_prfm_type { + AARCH64_INSN_PRFM_TYPE_PLD, + AARCH64_INSN_PRFM_TYPE_PLI, + AARCH64_INSN_PRFM_TYPE_PST, +}; + +enum aarch64_insn_prfm_target { + AARCH64_INSN_PRFM_TARGET_L1, + AARCH64_INSN_PRFM_TARGET_L2, + AARCH64_INSN_PRFM_TARGET_L3, +}; + +enum aarch64_insn_prfm_policy { + AARCH64_INSN_PRFM_POLICY_KEEP, + AARCH64_INSN_PRFM_POLICY_STRM, +}; + +enum aarch64_insn_adr_type { + AARCH64_INSN_ADR_TYPE_ADRP, + AARCH64_INSN_ADR_TYPE_ADR, +}; + +#define __AARCH64_INSN_FUNCS(abbr, mask, val) \ +static __always_inline bool aarch64_insn_is_##abbr(u32 code) \ +{ \ + BUILD_BUG_ON(~(mask) & (val)); \ + return (code & (mask)) == (val); \ +} \ +static __always_inline u32 aarch64_insn_get_##abbr##_value(void) \ +{ \ + return (val); \ +} + +__AARCH64_INSN_FUNCS(adr, 0x9F000000, 0x10000000) +__AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) +__AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) +__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) +__AARCH64_INSN_FUNCS(store_imm, 0x3FC00000, 0x39000000) +__AARCH64_INSN_FUNCS(load_imm, 0x3FC00000, 0x39400000) +__AARCH64_INSN_FUNCS(store_pre, 0x3FE00C00, 0x38000C00) +__AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) +__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) +__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) +__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) +__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) +__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) +__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) +__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) +__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) +__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) +__AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) +__AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) +__AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) +__AARCH64_INSN_FUNCS(ldp_pre, 0x7FC00000, 0x29C00000) +__AARCH64_INSN_FUNCS(add_imm, 0x7F000000, 0x11000000) +__AARCH64_INSN_FUNCS(adds_imm, 0x7F000000, 0x31000000) +__AARCH64_INSN_FUNCS(sub_imm, 0x7F000000, 0x51000000) +__AARCH64_INSN_FUNCS(subs_imm, 0x7F000000, 0x71000000) +__AARCH64_INSN_FUNCS(movn, 0x7F800000, 0x12800000) +__AARCH64_INSN_FUNCS(sbfm, 0x7F800000, 0x13000000) +__AARCH64_INSN_FUNCS(bfm, 0x7F800000, 0x33000000) +__AARCH64_INSN_FUNCS(movz, 0x7F800000, 0x52800000) +__AARCH64_INSN_FUNCS(ubfm, 0x7F800000, 0x53000000) +__AARCH64_INSN_FUNCS(movk, 0x7F800000, 0x72800000) +__AARCH64_INSN_FUNCS(add, 0x7F200000, 0x0B000000) +__AARCH64_INSN_FUNCS(adds, 0x7F200000, 0x2B000000) +__AARCH64_INSN_FUNCS(sub, 0x7F200000, 0x4B000000) +__AARCH64_INSN_FUNCS(subs, 0x7F200000, 0x6B000000) +__AARCH64_INSN_FUNCS(madd, 0x7FE08000, 0x1B000000) +__AARCH64_INSN_FUNCS(msub, 0x7FE08000, 0x1B008000) +__AARCH64_INSN_FUNCS(udiv, 0x7FE0FC00, 0x1AC00800) +__AARCH64_INSN_FUNCS(sdiv, 0x7FE0FC00, 0x1AC00C00) +__AARCH64_INSN_FUNCS(lslv, 0x7FE0FC00, 0x1AC02000) +__AARCH64_INSN_FUNCS(lsrv, 0x7FE0FC00, 0x1AC02400) +__AARCH64_INSN_FUNCS(asrv, 0x7FE0FC00, 0x1AC02800) +__AARCH64_INSN_FUNCS(rorv, 0x7FE0FC00, 0x1AC02C00) +__AARCH64_INSN_FUNCS(rev16, 0x7FFFFC00, 0x5AC00400) +__AARCH64_INSN_FUNCS(rev32, 0x7FFFFC00, 0x5AC00800) 
+__AARCH64_INSN_FUNCS(rev64, 0x7FFFFC00, 0x5AC00C00) +__AARCH64_INSN_FUNCS(and, 0x7F200000, 0x0A000000) +__AARCH64_INSN_FUNCS(bic, 0x7F200000, 0x0A200000) +__AARCH64_INSN_FUNCS(orr, 0x7F200000, 0x2A000000) +__AARCH64_INSN_FUNCS(mov_reg, 0x7FE0FFE0, 0x2A0003E0) +__AARCH64_INSN_FUNCS(orn, 0x7F200000, 0x2A200000) +__AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000) +__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) +__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000) +__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000) +__AARCH64_INSN_FUNCS(and_imm, 0x7F800000, 0x12000000) +__AARCH64_INSN_FUNCS(orr_imm, 0x7F800000, 0x32000000) +__AARCH64_INSN_FUNCS(eor_imm, 0x7F800000, 0x52000000) +__AARCH64_INSN_FUNCS(ands_imm, 0x7F800000, 0x72000000) +__AARCH64_INSN_FUNCS(extr, 0x7FA00000, 0x13800000) +__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000) +__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000) +__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000) +__AARCH64_INSN_FUNCS(cbnz, 0x7F000000, 0x35000000) +__AARCH64_INSN_FUNCS(tbz, 0x7F000000, 0x36000000) +__AARCH64_INSN_FUNCS(tbnz, 0x7F000000, 0x37000000) +__AARCH64_INSN_FUNCS(bcond, 0xFF000010, 0x54000000) +__AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001) +__AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002) +__AARCH64_INSN_FUNCS(smc, 0xFFE0001F, 0xD4000003) +__AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000) +__AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000) +__AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F) +__AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000) +__AARCH64_INSN_FUNCS(br_auth, 0xFEFFF800, 0xD61F0800) +__AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000) +__AARCH64_INSN_FUNCS(blr_auth, 0xFEFFF800, 0xD63F0800) +__AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000) +__AARCH64_INSN_FUNCS(ret_auth, 0xFFFFFBFF, 0xD65F0BFF) +__AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0) +__AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) +__AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) +__AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) +__AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) +__AARCH64_INSN_FUNCS(dmb, 0xFFFFF0FF, 0xD50330BF) +__AARCH64_INSN_FUNCS(dsb_base, 0xFFFFF0FF, 0xD503309F) +__AARCH64_INSN_FUNCS(dsb_nxs, 0xFFFFF3FF, 0xD503323F) +__AARCH64_INSN_FUNCS(isb, 0xFFFFF0FF, 0xD50330DF) +__AARCH64_INSN_FUNCS(sb, 0xFFFFFFFF, 0xD50330FF) +__AARCH64_INSN_FUNCS(clrex, 0xFFFFF0FF, 0xD503305F) +__AARCH64_INSN_FUNCS(ssbb, 0xFFFFFFFF, 0xD503309F) +__AARCH64_INSN_FUNCS(pssbb, 0xFFFFFFFF, 0xD503349F) + +#undef __AARCH64_INSN_FUNCS + +bool aarch64_insn_is_steppable_hint(u32 insn); +bool aarch64_insn_is_branch_imm(u32 insn); + +static inline bool aarch64_insn_is_adr_adrp(u32 insn) +{ + return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn); +} + +static inline bool aarch64_insn_is_dsb(u32 insn) +{ + return (aarch64_insn_is_dsb_base(insn) && (insn & 0xb00)) || + aarch64_insn_is_dsb_nxs(insn); +} + +static inline bool aarch64_insn_is_barrier(u32 insn) +{ + return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) || + aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) || + aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) || + aarch64_insn_is_pssbb(insn); +} + +static inline bool aarch64_insn_is_store_single(u32 insn) +{ + return aarch64_insn_is_store_imm(insn) || + aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_store_post(insn); +} + +static inline bool aarch64_insn_is_store_pair(u32 insn) +{ + return aarch64_insn_is_stp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_stp_post(insn); +} + +static inline bool 
aarch64_insn_is_load_single(u32 insn) +{ + return aarch64_insn_is_load_imm(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_load_post(insn); +} + +static inline bool aarch64_insn_is_load_pair(u32 insn) +{ + return aarch64_insn_is_ldp(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_ldp_post(insn); +} + +enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn); +bool aarch64_insn_uses_literal(u32 insn); +bool aarch64_insn_is_branch(u32 insn); +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn); +u32 aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, + u32 insn, u64 imm); +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn); +u32 aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_branch_type type); +u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_branch_type type); +u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_condition cond); +u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op); +u32 aarch64_insn_gen_nop(void); +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type); +u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register offset, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_register base, + int offset, + enum aarch64_insn_variant variant, + enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size); +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size); +u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type); +u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_adr_type type); +u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int immr, int imms, + enum aarch64_insn_variant variant, + enum aarch64_insn_bitfield_type type); +u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, + int imm, int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_movewide_type type); +u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type); +u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant, + enum aarch64_insn_data1_type type); +u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + 
enum aarch64_insn_data2_type type); +u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_variant variant, + enum aarch64_insn_data3_type type); +u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_logic_type type); +u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant); +u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, + enum aarch64_insn_variant variant, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u64 imm); +u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, + enum aarch64_insn_register Rm, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u8 lsb); +u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, + enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy); +s32 aarch64_get_branch_offset(u32 insn); +u32 aarch64_set_branch_offset(u32 insn, s32 offset); + +s32 aarch64_insn_adrp_get_offset(u32 insn); +u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset); + +bool aarch32_insn_is_wide(u32 insn); + +#define A32_RN_OFFSET 16 +#define A32_RT_OFFSET 12 +#define A32_RT2_OFFSET 0 + +u32 aarch64_insn_extract_system_reg(u32 insn); +u32 aarch32_insn_extract_reg_num(u32 insn, int offset); +u32 aarch32_insn_mcr_extract_opc2(u32 insn); +u32 aarch32_insn_mcr_extract_crm(u32 insn); + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_INSN_H */ diff --git a/tools/arch/arm64/lib/insn.c b/tools/arch/arm64/lib/insn.c new file mode 100644 index 0000000000000..b24407ed03982 --- /dev/null +++ b/tools/arch/arm64/lib/insn.c @@ -0,0 +1,1456 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Huawei Ltd. 
+ * Author: Jiang Liu + * + * Copyright (C) 2014-2016 Zi Shen Lim + */ +#include +#include +#include +#include +#include + +#include +#include + +#define AARCH64_INSN_SF_BIT BIT(31) +#define AARCH64_INSN_N_BIT BIT(22) +#define AARCH64_INSN_LSL_12 BIT(22) + +static const int aarch64_insn_encoding_class[] = { + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_SVE, + AARCH64_INSN_CLS_UNKNOWN, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_DP_IMM, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_BR_SYS, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_REG, + AARCH64_INSN_CLS_LDST, + AARCH64_INSN_CLS_DP_FPSIMD, +}; + +enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) +{ + return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; +} + +bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) +{ + if (!aarch64_insn_is_hint(insn)) + return false; + + switch (insn & 0xFE0) { + case AARCH64_INSN_HINT_XPACLRI: + case AARCH64_INSN_HINT_PACIA_1716: + case AARCH64_INSN_HINT_PACIB_1716: + case AARCH64_INSN_HINT_PACIAZ: + case AARCH64_INSN_HINT_PACIASP: + case AARCH64_INSN_HINT_PACIBZ: + case AARCH64_INSN_HINT_PACIBSP: + case AARCH64_INSN_HINT_BTI: + case AARCH64_INSN_HINT_BTIC: + case AARCH64_INSN_HINT_BTIJ: + case AARCH64_INSN_HINT_BTIJC: + case AARCH64_INSN_HINT_NOP: + return true; + default: + return false; + } +} + +bool aarch64_insn_is_branch_imm(u32 insn) +{ + return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || + aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)); +} + +bool __kprobes aarch64_insn_uses_literal(u32 insn) +{ + /* ldr/ldrsw (literal), prfm */ + + return aarch64_insn_is_ldr_lit(insn) || + aarch64_insn_is_ldrsw_lit(insn) || + aarch64_insn_is_adr_adrp(insn) || + aarch64_insn_is_prfm_lit(insn); +} + +bool __kprobes aarch64_insn_is_branch(u32 insn) +{ + /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ + + return aarch64_insn_is_b(insn) || + aarch64_insn_is_bl(insn) || + aarch64_insn_is_cbz(insn) || + aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_tbz(insn) || + aarch64_insn_is_tbnz(insn) || + aarch64_insn_is_ret(insn) || + aarch64_insn_is_ret_auth(insn) || + aarch64_insn_is_br(insn) || + aarch64_insn_is_br_auth(insn) || + aarch64_insn_is_blr(insn) || + aarch64_insn_is_blr_auth(insn) || + aarch64_insn_is_bcond(insn); +} + +static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, + u32 *maskp, int *shiftp) +{ + u32 mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_26: + mask = BIT(26) - 1; + shift = 0; + break; + case AARCH64_INSN_IMM_19: + mask = BIT(19) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_16: + mask = BIT(16) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_14: + mask = BIT(14) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_12: + mask = BIT(12) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_9: + mask = BIT(9) - 1; + shift = 12; + break; + case AARCH64_INSN_IMM_7: + mask = BIT(7) - 1; + shift = 15; + break; + case AARCH64_INSN_IMM_6: + case AARCH64_INSN_IMM_S: + mask = BIT(6) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_R: + mask = BIT(6) - 1; + shift = 16; + break; + case AARCH64_INSN_IMM_N: + mask = 1; + shift = 22; + break; + default: + return -EINVAL; + } + + *maskp = mask; + *shiftp = shift; + + return 0; +} + +#define ADR_IMM_HILOSPLIT 2 +#define ADR_IMM_SIZE 
SZ_2M +#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_LOSHIFT 29 +#define ADR_IMM_HISHIFT 5 + +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) +{ + u32 immlo, immhi, mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; + immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; + insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; + mask = ADR_IMM_SIZE - 1; + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", + type); + return 0; + } + } + + return (insn >> shift) & mask; +} + +u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, + u32 insn, u64 imm) +{ + u32 immlo, immhi, mask; + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; + imm >>= ADR_IMM_HILOSPLIT; + immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; + imm = immlo | immhi; + mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | + (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", + type); + return AARCH64_BREAK_FAULT; + } + } + + /* Update the immediate field. */ + insn &= ~(mask << shift); + insn |= (imm & mask) << shift; + + return insn; +} + +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn) +{ + int shift; + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return 0; + } + + return (insn >> shift) & GENMASK(4, 0); +} + +static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, + u32 insn, + enum aarch64_insn_register reg) +{ + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { + pr_err("%s: unknown register encoding %d\n", __func__, reg); + return AARCH64_BREAK_FAULT; + } + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + case AARCH64_INSN_REGTYPE_RS: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~(GENMASK(4, 0) << shift); + insn |= reg << shift; + + return insn; +} + +static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, + u32 insn) +{ + u32 size; + + switch (type) { + case AARCH64_INSN_SIZE_8: + size = 0; + break; + case AARCH64_INSN_SIZE_16: + size = 1; + break; + case AARCH64_INSN_SIZE_32: + size = 2; + break; + case AARCH64_INSN_SIZE_64: + size = 3; + break; + default: + pr_err("%s: unknown size encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~GENMASK(31, 30); + insn |= size << 30; + + 
return insn; +} + +static inline long branch_imm_common(unsigned long pc, unsigned long addr, + long range) +{ + long offset; + + if ((pc & 0x3) || (addr & 0x3)) { + pr_err("%s: A64 instructions must be word aligned\n", __func__); + return range; + } + + offset = ((long)addr - (long)pc); + + if (offset < -range || offset >= range) { + pr_err("%s: offset out of range\n", __func__); + return range; + } + + return offset; +} + +u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + /* + * B/BL support [-128M, 128M) offset + * ARM64 virtual address arrangement guarantees all kernel and module + * texts are within +/-128M. + */ + offset = branch_imm_common(pc, addr, SZ_128M); + if (offset >= SZ_128M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_bl_value(); + break; + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_b_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + offset = branch_imm_common(pc, addr, SZ_1M); + if (offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_COMP_ZERO: + insn = aarch64_insn_get_cbz_value(); + break; + case AARCH64_INSN_BRANCH_COMP_NONZERO: + insn = aarch64_insn_get_cbnz_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_condition cond) +{ + u32 insn; + long offset; + + offset = branch_imm_common(pc, addr, SZ_1M); + + insn = aarch64_insn_get_bcond_value(); + + if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { + pr_err("%s: unknown condition encoding %d\n", __func__, cond); + return AARCH64_BREAK_FAULT; + } + insn |= cond; + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op) +{ + return aarch64_insn_get_hint_value() | op; +} + +u32 __kprobes aarch64_insn_gen_nop(void) +{ + return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); +} + +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_br_value(); + break; + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_blr_value(); + break; + case AARCH64_INSN_BRANCH_RETURN: + insn = aarch64_insn_get_ret_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); +} 
+ +u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register offset, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_REG_OFFSET: + insn = aarch64_insn_get_ldr_reg_value(); + break; + case AARCH64_INSN_LDST_STORE_REG_OFFSET: + insn = aarch64_insn_get_str_reg_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + offset); +} + +u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_register base, + int offset, + enum aarch64_insn_variant variant, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + int shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: + insn = aarch64_insn_get_ldp_pre_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: + insn = aarch64_insn_get_stp_pre_value(); + break; + case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: + insn = aarch64_insn_get_ldp_post_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: + insn = aarch64_insn_get_stp_post_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if ((offset & 0x3) || (offset < -256) || (offset > 252)) { + pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 2; + break; + case AARCH64_INSN_VARIANT_64BIT: + if ((offset & 0x7) || (offset < -512) || (offset > 504)) { + pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 3; + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg1); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + reg2); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, + offset >> shift); +} + +u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_EX: + insn = aarch64_insn_get_load_ex_value(); + break; + case AARCH64_INSN_LDST_STORE_EX: + insn = aarch64_insn_get_store_ex_value(); + break; + default: + pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + AARCH64_INSN_REG_ZR); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, 
insn, + state); +} + +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + u32 insn = aarch64_insn_get_ldadd_value(); + + switch (size) { + case AARCH64_INSN_SIZE_32: + case AARCH64_INSN_SIZE_64: + break; + default: + pr_err("%s: unimplemented size encoding %d\n", __func__, size); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + result); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + address); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + value); +} + +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + /* + * STADD is simply encoded as an alias for LDADD with XZR as + * the destination register. + */ + return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, + value, size); +} + +static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy, + u32 insn) +{ + u32 imm_type = 0, imm_target = 0, imm_policy = 0; + + switch (type) { + case AARCH64_INSN_PRFM_TYPE_PLD: + break; + case AARCH64_INSN_PRFM_TYPE_PLI: + imm_type = BIT(0); + break; + case AARCH64_INSN_PRFM_TYPE_PST: + imm_type = BIT(1); + break; + default: + pr_err("%s: unknown prfm type encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (target) { + case AARCH64_INSN_PRFM_TARGET_L1: + break; + case AARCH64_INSN_PRFM_TARGET_L2: + imm_target = BIT(0); + break; + case AARCH64_INSN_PRFM_TARGET_L3: + imm_target = BIT(1); + break; + default: + pr_err("%s: unknown prfm target encoding %d\n", __func__, target); + return AARCH64_BREAK_FAULT; + } + + switch (policy) { + case AARCH64_INSN_PRFM_POLICY_KEEP: + break; + case AARCH64_INSN_PRFM_POLICY_STRM: + imm_policy = BIT(0); + break; + default: + pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); + return AARCH64_BREAK_FAULT; + } + + /* In this case, imm5 is encoded into Rt field. 
*/ + insn &= ~GENMASK(4, 0); + insn |= imm_policy | (imm_target << 1) | (imm_type << 3); + + return insn; +} + +u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, + enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy) +{ + u32 insn = aarch64_insn_get_prfm_value(); + + insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); + + insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); +} + +u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_imm_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_imm_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + /* We can't encode more than a 24bit value (12bit + 12bit shift) */ + if (imm & ~(BIT(24) - 1)) + goto out; + + /* If we have something in the top 12 bits... */ + if (imm & ~(SZ_4K - 1)) { + /* ... and in the low 12 bits -> error */ + if (imm & (SZ_4K - 1)) + goto out; + + imm >>= 12; + insn |= AARCH64_INSN_LSL_12; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); + +out: + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; +} + +u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int immr, int imms, + enum aarch64_insn_variant variant, + enum aarch64_insn_bitfield_type type) +{ + u32 insn; + u32 mask; + + switch (type) { + case AARCH64_INSN_BITFIELD_MOVE: + insn = aarch64_insn_get_bfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: + insn = aarch64_insn_get_ubfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_SIGNED: + insn = aarch64_insn_get_sbfm_value(); + break; + default: + pr_err("%s: unknown bitfield encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + mask = GENMASK(4, 0); + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; + mask = GENMASK(5, 0); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + if (immr & ~mask) { + pr_err("%s: invalid immr encoding %d\n", __func__, immr); + return AARCH64_BREAK_FAULT; + } + if (imms & ~mask) { + pr_err("%s: invalid imms encoding %d\n", __func__, imms); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + 
insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, + int imm, int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_movewide_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_MOVEWIDE_ZERO: + insn = aarch64_insn_get_movz_value(); + break; + case AARCH64_INSN_MOVEWIDE_KEEP: + insn = aarch64_insn_get_movk_value(); + break; + case AARCH64_INSN_MOVEWIDE_INVERSE: + insn = aarch64_insn_get_movn_value(); + break; + default: + pr_err("%s: unknown movewide encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (imm & ~(SZ_64K - 1)) { + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift != 0 && shift != 16) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn |= (shift >> 4) << 21; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); +} + +u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant, + enum aarch64_insn_data1_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA1_REVERSE_16: + insn = aarch64_insn_get_rev16_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_32: + insn = aarch64_insn_get_rev32_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_64: + if (variant != 
AARCH64_INSN_VARIANT_64BIT) { + pr_err("%s: invalid variant for reverse64 %d\n", + __func__, variant); + return AARCH64_BREAK_FAULT; + } + insn = aarch64_insn_get_rev64_value(); + break; + default: + pr_err("%s: unknown data1 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); +} + +u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_data2_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA2_UDIV: + insn = aarch64_insn_get_udiv_value(); + break; + case AARCH64_INSN_DATA2_SDIV: + insn = aarch64_insn_get_sdiv_value(); + break; + case AARCH64_INSN_DATA2_LSLV: + insn = aarch64_insn_get_lslv_value(); + break; + case AARCH64_INSN_DATA2_LSRV: + insn = aarch64_insn_get_lsrv_value(); + break; + case AARCH64_INSN_DATA2_ASRV: + insn = aarch64_insn_get_asrv_value(); + break; + case AARCH64_INSN_DATA2_RORV: + insn = aarch64_insn_get_rorv_value(); + break; + default: + pr_err("%s: unknown data2 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); +} + +u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_variant variant, + enum aarch64_insn_data3_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA3_MADD: + insn = aarch64_insn_get_madd_value(); + break; + case AARCH64_INSN_DATA3_MSUB: + insn = aarch64_insn_get_msub_value(); + break; + default: + pr_err("%s: unknown data3 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + reg1); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + reg2); +} + +u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_logic_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_value(); + break; + case AARCH64_INSN_LOGIC_BIC: + insn = aarch64_insn_get_bic_value(); + 
break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_value(); + break; + case AARCH64_INSN_LOGIC_ORN: + insn = aarch64_insn_get_orn_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_value(); + break; + case AARCH64_INSN_LOGIC_EON: + insn = aarch64_insn_get_eon_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_value(); + break; + case AARCH64_INSN_LOGIC_BIC_SETFLAGS: + insn = aarch64_insn_get_bics_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +/* + * MOV (register) is architecturally an alias of ORR (shifted register) where + * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m> + */ +u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant) +{ + return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR, + src, 0, variant, + AARCH64_INSN_LOGIC_ORR); +} + +u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_adr_type type) +{ + u32 insn; + s32 offset; + + switch (type) { + case AARCH64_INSN_ADR_TYPE_ADR: + insn = aarch64_insn_get_adr_value(); + offset = addr - pc; + break; + case AARCH64_INSN_ADR_TYPE_ADRP: + insn = aarch64_insn_get_adrp_value(); + offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; + break; + default: + pr_err("%s: unknown adr encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (offset < -SZ_1M || offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); +} + +/* + * Decode the imm field of a branch, and return the byte offset as a + * signed value (so it can be used when computing a new branch + * target). + */ +s32 aarch64_get_branch_offset(u32 insn) +{ + s32 imm; + + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); + return (imm << 6) >> 4; + } + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); + return (imm << 13) >> 11; + } + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); + return (imm << 18) >> 16; + } + + /* Unhandled instruction */ + BUG(); +} + +/* + * Encode the displacement of a branch in the imm field and return the + * updated instruction. 
+ */ +u32 aarch64_set_branch_offset(u32 insn, s32 offset) +{ + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, + offset >> 2); + + /* Unhandled instruction */ + BUG(); +} + +s32 aarch64_insn_adrp_get_offset(u32 insn) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; +} + +u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, + offset >> 12); +} + +/* + * Extract the Op/CR data from a msr/mrs instruction. + */ +u32 aarch64_insn_extract_system_reg(u32 insn) +{ + return (insn & 0x1FFFE0) >> 5; +} + +bool aarch32_insn_is_wide(u32 insn) +{ + return insn >= 0xe800; +} + +/* + * Macros/defines for extracting register numbers from instruction. + */ +u32 aarch32_insn_extract_reg_num(u32 insn, int offset) +{ + return (insn & (0xf << offset)) >> offset; +} + +#define OPC2_MASK 0x7 +#define OPC2_OFFSET 5 +u32 aarch32_insn_mcr_extract_opc2(u32 insn) +{ + return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; +} + +#define CRM_MASK 0xf +u32 aarch32_insn_mcr_extract_crm(u32 insn) +{ + return insn & CRM_MASK; +} + +static bool range_of_ones(u64 val) +{ + /* Doesn't handle full ones or full zeroes */ + u64 sval = val >> __ffs64(val); + + /* One of Sean Eron Anderson's bithack tricks */ + return ((sval + 1) & (sval)) == 0; +} + +static u32 aarch64_encode_immediate(u64 imm, + enum aarch64_insn_variant variant, + u32 insn) +{ + unsigned int immr, imms, n, ones, ror, esz, tmp; + u64 mask; + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + esz = 32; + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + esz = 64; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + mask = GENMASK(esz - 1, 0); + + /* Can't encode full zeroes, full ones, or value wider than the mask */ + if (!imm || imm == mask || imm & ~mask) + return AARCH64_BREAK_FAULT; + + /* + * Inverse of Replicate(). Try to spot a repeating pattern + * with a pow2 stride. + */ + for (tmp = esz / 2; tmp >= 2; tmp /= 2) { + u64 emask = BIT(tmp) - 1; + + if ((imm & emask) != ((imm >> tmp) & emask)) + break; + + esz = tmp; + mask = emask; + } + + /* N is only set if we're encoding a 64bit value */ + n = esz == 64; + + /* Trim imm to the element size */ + imm &= mask; + + /* That's how many ones we need to encode */ + ones = hweight64(imm); + + /* + * imms is set to (ones - 1), prefixed with a string of ones + * and a zero if they fit. Cap it to 6 bits. + */ + imms = ones - 1; + imms |= 0xf << ffs(esz); + imms &= BIT(6) - 1; + + /* Compute the rotation */ + if (range_of_ones(imm)) { + /* + * Pattern: 0..01..10..0 + * + * Compute how many rotate we need to align it right + */ + ror = __ffs64(imm); + } else { + /* + * Pattern: 0..01..10..01..1 + * + * Fill the unused top bits with ones, and check if + * the result is a valid immediate (all ones with a + * contiguous ranges of zeroes). 
+ */ + imm |= ~mask; + if (!range_of_ones(~imm)) + return AARCH64_BREAK_FAULT; + + /* + * Compute the rotation to get a continuous set of + * ones, with the first bit set at position 0 + */ + ror = fls(~imm); + } + + /* + * immr is the number of bits we need to rotate back to the + * original set of ones. Note that this is relative to the + * element size... + */ + immr = (esz - ror) % esz; + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, + enum aarch64_insn_variant variant, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u64 imm) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_imm_value(); + break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_imm_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_imm_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_imm_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_encode_immediate(imm, variant, insn); +} + +u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, + enum aarch64_insn_register Rm, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u8 lsb) +{ + u32 insn; + + insn = aarch64_insn_get_extr_value(); + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (lsb > 31) + return AARCH64_BREAK_FAULT; + break; + case AARCH64_INSN_VARIANT_64BIT: + if (lsb > 63) + return AARCH64_BREAK_FAULT; + insn |= AARCH64_INSN_SF_BIT; + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); +} From 07b03cb6b22fb776ddf9a693c48b9300a57507e4 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 14 Sep 2020 16:36:11 +0100 Subject: [PATCH 229/737] tools: bug: Remove duplicate definition Under tools, bug.h only defines BUILD_BUG_ON_ZERO() which is already defined in build_bug.h. This prevents a file to include both headers at the same time. Have bug.h include build_bug.h instead. Signed-off-by: Julien Thierry --- tools/include/linux/bug.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/include/linux/bug.h b/tools/include/linux/bug.h index 85f80258a15f6..548be7cffa8e7 100644 --- a/tools/include/linux/bug.h +++ b/tools/include/linux/bug.h @@ -2,10 +2,6 @@ #ifndef _TOOLS_PERF_LINUX_BUG_H #define _TOOLS_PERF_LINUX_BUG_H -/* Force a compilation error if condition is true, but also produce a - result (of value 0 and type size_t), so the expression can be used - e.g. in a structure initializer (or where-ever else comma expressions - aren't permitted). 
*/ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#include #endif /* _TOOLS_PERF_LINUX_BUG_H */ From 2a5eaf2709d7e67c637b6c597a5b6dddc4f1649f Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 9 Sep 2020 09:39:07 +0100 Subject: [PATCH 230/737] objtool: arm64: Add base definition for arm64 backend Provide needed definitions for a new architecture instruction decoder. No proper decoding is done yet. Signed-off-by: Julien Thierry --- arch/arm64/lib/insn.c | 4 +- tools/objtool/Makefile | 5 + tools/objtool/arch/arm64/Build | 8 + tools/objtool/arch/arm64/decode.c | 141 ++++++++++++++++++ .../arch/arm64/include/arch/cfi_regs.h | 14 ++ tools/objtool/arch/arm64/include/arch/elf.h | 6 + .../arch/arm64/include/arch/endianness.h | 9 ++ .../objtool/arch/arm64/include/arch/special.h | 21 +++ tools/objtool/arch/arm64/special.c | 21 +++ tools/objtool/sync-check.sh | 10 +- 10 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 tools/objtool/arch/arm64/Build create mode 100644 tools/objtool/arch/arm64/decode.c create mode 100644 tools/objtool/arch/arm64/include/arch/cfi_regs.h create mode 100644 tools/objtool/arch/arm64/include/arch/elf.h create mode 100644 tools/objtool/arch/arm64/include/arch/endianness.h create mode 100644 tools/objtool/arch/arm64/include/arch/special.h create mode 100644 tools/objtool/arch/arm64/special.c diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index bfa3a6ec0b13a..c2374373a70e1 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -11,10 +11,10 @@ #include #include -#include +#include /* __ignore_sync_check__ */ #include #include -#include +#include /* __ignore_sync_check__ */ #define AARCH64_INSN_SF_BIT BIT(31) #define AARCH64_INSN_N_BIT BIT(22) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 92ce4fce7bc73..d5cfbec87c022 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -47,6 +47,11 @@ ifeq ($(SRCARCH),x86) SUBCMD_ORC := y endif +ifeq ($(SRCARCH),arm64) + SUBCMD_CHECK := y + CFLAGS += -Wno-nested-externs +endif + export SUBCMD_CHECK SUBCMD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include diff --git a/tools/objtool/arch/arm64/Build b/tools/objtool/arch/arm64/Build new file mode 100644 index 0000000000000..f3de3a50d5411 --- /dev/null +++ b/tools/objtool/arch/arm64/Build @@ -0,0 +1,8 @@ +objtool-y += special.o +objtool-y += decode.o + +objtool-y += libhweight.o + +$(OUTPUT)arch/arm64/libhweight.o: ../lib/hweight.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c new file mode 100644 index 0000000000000..05452ad25a476 --- /dev/null +++ b/tools/objtool/arch/arm64/decode.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +/* Hack needed to avoid depending on debug-monitors.h */ +#define AARCH64_BREAK_FAULT 0xBAD + +/* Hack needed to avoid depending on kprobes.h */ +#ifndef __kprobes +#define __kprobes +#endif + +#include "../../../arch/arm64/lib/insn.c" + +bool arch_callee_saved_reg(unsigned char reg) +{ + switch (reg) { + case AARCH64_INSN_REG_19: + case AARCH64_INSN_REG_20: + case AARCH64_INSN_REG_21: + case AARCH64_INSN_REG_22: + case AARCH64_INSN_REG_23: + case AARCH64_INSN_REG_24: + case AARCH64_INSN_REG_25: + case AARCH64_INSN_REG_26: + case AARCH64_INSN_REG_27: + case AARCH64_INSN_REG_28: + case AARCH64_INSN_REG_FP: + case 
AARCH64_INSN_REG_LR: + return true; + default: + return false; + } +} + +void arch_initial_func_cfi_state(struct cfi_init_state *state) +{ + int i; + + for (i = 0; i < CFI_NUM_REGS; i++) { + state->regs[i].base = CFI_UNDEFINED; + state->regs[i].offset = 0; + } + + /* initial CFA (call frame address) */ + state->cfa.base = CFI_SP; + state->cfa.offset = 0; +} + +unsigned long arch_dest_reloc_offset(int addend) +{ + return addend; +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return insn->offset + insn->immediate; +} + +const char *arch_nop_insn(int len) +{ + static u32 nop = 0; + + if (len != AARCH64_INSN_SIZE) + WARN("invalid NOP size: %d\n", len); + + if (!nop) + nop = aarch64_insn_gen_nop(); + + return (const char*)&nop; +} + +const char *arch_ret_insn(int len) +{ + return arch_nop_insn(len); +} + +static int is_arm64(const struct elf *elf) +{ + switch (elf->ehdr.e_machine) { + case EM_AARCH64: //0xB7 + return 1; + default: + WARN("unexpected ELF machine type %x", + elf->ehdr.e_machine); + return 0; + } +} + +int arch_decode_hint_reg(u8 sp_reg, int *base) +{ + if (sp_reg == UNWIND_HINT_REG_UNDEFINED) + *base = CFI_UNDEFINED; + else + *base = sp_reg; + + return 0; +} +} + +int arch_decode_instruction(const struct elf *elf, const struct section *sec, + unsigned long offset, unsigned int maxlen, + unsigned int *len, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + u32 insn; + + if (!is_arm64(elf)) + return -1; + + if (maxlen < AARCH64_INSN_SIZE) + return 0; + + *len = AARCH64_INSN_SIZE; + *immediate = 0; + + insn = *(u32 *)(sec->data->d_buf + offset); + + switch (aarch64_get_insn_class(insn)) { + case AARCH64_INSN_CLS_UNKNOWN: + WARN("can't decode instruction at %s:0x%lx", sec->name, offset); + return -1; + default: + *type = INSN_OTHER; + break; + } + + return 0; +} diff --git a/tools/objtool/arch/arm64/include/arch/cfi_regs.h b/tools/objtool/arch/arm64/include/arch/cfi_regs.h new file mode 100644 index 0000000000000..a5185649686b7 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/cfi_regs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_CFI_REGS_H +#define _OBJTOOL_CFI_REGS_H + +#include + +#define CFI_BP AARCH64_INSN_REG_FP +#define CFI_RA AARCH64_INSN_REG_LR +#define CFI_SP AARCH64_INSN_REG_SP + +#define CFI_NUM_REGS 32 + +#endif /* _OBJTOOL_CFI_REGS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/elf.h b/tools/objtool/arch/arm64/include/arch/elf.h new file mode 100644 index 0000000000000..a31a29b1a3867 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/elf.h @@ -0,0 +1,6 @@ +#ifndef _OBJTOOL_ARCH_ELF +#define _OBJTOOL_ARCH_ELF + +#define R_NONE R_AARCH64_NONE + +#endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/arm64/include/arch/endianness.h b/tools/objtool/arch/arm64/include/arch/endianness.h new file mode 100644 index 0000000000000..7c362527da205 --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/arch/arm64/include/arch/special.h b/tools/objtool/arch/arm64/include/arch/special.h new file mode 100644 index 0000000000000..a82a9b3e51dfd --- /dev/null +++ b/tools/objtool/arch/arm64/include/arch/special.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef 
_ARM64_ARCH_SPECIAL_H +#define _ARM64_ARCH_SPECIAL_H + +#define EX_ENTRY_SIZE 8 +#define EX_ORIG_OFFSET 0 +#define EX_NEW_OFFSET 4 + +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 + +#define ALT_ENTRY_SIZE 12 +#define ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _ARM64_ARCH_SPECIAL_H */ diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c new file mode 100644 index 0000000000000..45f283283091f --- /dev/null +++ b/tools/objtool/arch/arm64/special.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +void arch_handle_alternative(unsigned short feature, struct special_alt *alt) +{ +} + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + return false; +} + + +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + return NULL; +} diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 4bbabaecab14e..bdcc443ea0942 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -24,6 +24,14 @@ arch/x86/include/asm/insn.h arch/x86/lib/inat.c arch/x86/lib/insn.c ' +elif [ "$SRCARCH" = "arm64" ]; then +FILES="$FILES +arch/arm64/include/asm/insn.h +" + +SYNC_CHECK_FILES=' +arch/arm64/lib/insn.c +' fi check_2 () { @@ -67,7 +75,7 @@ done < Date: Wed, 9 Sep 2020 13:24:12 +0100 Subject: [PATCH 231/737] objtool: arm64: Decode add/sub instructions Decode aarch64 additions and substractions and create stack_ops for instructions interacting with SP or FP. Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/decode.c | 94 +++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 05452ad25a476..f3e6d33d17df5 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -23,6 +23,13 @@ #include "../../../arch/arm64/lib/insn.c" +static unsigned long sign_extend(unsigned long x, int nbits) +{ + unsigned long sign_bit = (x >> (nbits - 1)) & 1; + + return ((~0UL + (sign_bit ^ 1)) << nbits) | x; +} + bool arch_callee_saved_reg(unsigned char reg) { switch (reg) { @@ -109,6 +116,61 @@ int arch_decode_hint_reg(u8 sp_reg, int *base) } } +static struct stack_op *arm_make_add_op(enum aarch64_insn_register dest, + enum aarch64_insn_register src, + int val) +{ + struct stack_op *op; + + op = calloc(1, sizeof(*op)); + if (!op) { + WARN("calloc failed"); + return NULL; + } + op->dest.type = OP_DEST_REG; + op->dest.reg = dest; + op->src.reg = src; + op->src.type = val != 0 ? 
OP_SRC_ADD : OP_SRC_REG; + op->src.offset = val; + + return op; +} + +static int arm_decode_add_sub_imm(u32 instr, bool set_flags, + enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + u32 rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, instr); + u32 rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, instr); + + *type = INSN_OTHER; + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, instr); + + if (instr & AARCH64_INSN_LSL_12) + *immediate <<= 12; + + if ((!set_flags && rd == AARCH64_INSN_REG_SP) || + rd == AARCH64_INSN_REG_FP || + rn == AARCH64_INSN_REG_FP || + rn == AARCH64_INSN_REG_SP) { + struct stack_op *op; + int value; + + if (aarch64_insn_is_subs_imm(instr) || aarch64_insn_is_sub_imm(instr)) + value = -*immediate; + else + value = *immediate; + + op = arm_make_add_op(rd, rn, value); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + } + + return 0; +} + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -132,6 +194,38 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, case AARCH64_INSN_CLS_UNKNOWN: WARN("can't decode instruction at %s:0x%lx", sec->name, offset); return -1; + case AARCH64_INSN_CLS_DP_IMM: + /* Mov register to and from SP are aliases of add_imm */ + if (aarch64_insn_is_add_imm(insn) || + aarch64_insn_is_sub_imm(insn)) + return arm_decode_add_sub_imm(insn, false, type, immediate, + ops_list); + else if (aarch64_insn_is_adds_imm(insn) || + aarch64_insn_is_subs_imm(insn)) + return arm_decode_add_sub_imm(insn, true, type, immediate, + ops_list); + else + *type = INSN_OTHER; + break; + case AARCH64_INSN_CLS_DP_REG: + if (aarch64_insn_is_mov_reg(insn)) { + enum aarch64_insn_register rd; + enum aarch64_insn_register rm; + + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + rm = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + if (rd == AARCH64_INSN_REG_FP || rm == AARCH64_INSN_REG_FP) { + struct stack_op *op; + + op = arm_make_add_op(rd, rm, 0); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + break; + } + } + *type = INSN_OTHER; + break; default: *type = INSN_OTHER; break; From 8408cce34dac89a51a06c91ba5f02b96d47a34c5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 9 Sep 2020 14:27:17 +0100 Subject: [PATCH 232/737] objtool: arm64: Decode jump and call related instructions Decode branch, branch and link (aarch64's call) and return instructions. 
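For illustration only (not part of this change): the decoder relies on aarch64_get_branch_offset() to turn the imm26 field of a B/BL encoding into a signed byte offset. A stand-alone sketch of that extraction, with invented helper names:

  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical helper: byte offset encoded in the imm26 field of B/BL. */
  static long demo_b_imm26_offset(uint32_t insn)
  {
          long imm = insn & 0x03ffffff;   /* imm26 field */

          if (imm & 0x02000000)           /* sign bit of imm26 */
                  imm -= 0x04000000;      /* sign-extend 26 bits */
          return imm * 4;                 /* words -> bytes */
  }

  int main(void)
  {
          /* 0x17ffffff encodes "b . - 4": imm26 is all ones, i.e. -1 word */
          printf("%ld\n", demo_b_imm26_offset(0x17ffffffu));      /* -4 */
          return 0;
  }

The decoded offset is stored as the instruction's immediate so that the branch target can later be resolved relative to the instruction's own offset.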
Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/decode.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index f3e6d33d17df5..df9cca1122557 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -226,6 +226,28 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, } *type = INSN_OTHER; break; + case AARCH64_INSN_CLS_BR_SYS: + if (aarch64_insn_is_ret(insn) && + aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn) == AARCH64_INSN_REG_LR) { + *type = INSN_RETURN; + } else if (aarch64_insn_is_bl(insn)) { + *type = INSN_CALL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_blr(insn)) { + *type = INSN_CALL_DYNAMIC; + } else if (aarch64_insn_is_b(insn)) { + *type = INSN_JUMP_UNCONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_br(insn)) { + *type = INSN_JUMP_DYNAMIC; + } else if (aarch64_insn_is_branch_imm(insn)) { + /* Remaining branch opcodes are conditional */ + *type = INSN_JUMP_CONDITIONAL; + *immediate = aarch64_get_branch_offset(insn); + } else { + *type = INSN_OTHER; + } + break; default: *type = INSN_OTHER; break; From cdd8b48dd486483d29046792e7ff900e18b6df1e Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 9 Sep 2020 15:33:51 +0100 Subject: [PATCH 233/737] objtool: arm64: Decode other system instructions Decode ERET, BRK and NOPs Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/decode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index df9cca1122557..a8ad78be4e48b 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -244,6 +244,13 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* Remaining branch opcodes are conditional */ *type = INSN_JUMP_CONDITIONAL; *immediate = aarch64_get_branch_offset(insn); + } else if (aarch64_insn_is_eret(insn)) { + *type = INSN_CONTEXT_SWITCH; + } else if (aarch64_insn_is_steppable_hint(insn)) { + *type = INSN_NOP; + } else if (aarch64_insn_is_brk(insn)) { + *immediate = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn); + *type = INSN_BUG; } else { *type = INSN_OTHER; } From ddfd9239c34c6b9e96ac8e9af246ca1f6b0c31c1 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 9 Sep 2020 15:56:17 +0100 Subject: [PATCH 234/737] objtool: arm64: Decode load/store instructions Decode load/store operations and create corresponding stack_ops for operations targetting SP or FP. Operations storing/loading multiple registers are split into separate stack_ops storing single registers. Operations modifying the base register get an additional stack_op for the register update. Since the atomic register(s) load/store + base register update gets split into multiple operations, to make sure objtool always sees a valid stack, consider store instruction to perform stack allocations (i.e. modifying the base pointer before the storing) and loads de-allocations (i.e. modifying the base pointer after the load). 
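As a simplified illustration of the splitting described above (not part of the patch; the structure and wording below are invented for this sketch), a pre-indexed store pair such as "stp x29, x30, [sp, #-16]!" is modelled as a base-register update followed by two single-register stores into the newly allocated area:

  #include <stdio.h>

  /* Simplified stand-in for objtool's stack_op, for demonstration only. */
  struct demo_op { const char *effect; };

  int main(void)
  {
          /* Order matters: allocate first, then store at fixed offsets. */
          struct demo_op ops[] = {
                  { "sp := sp - 16   (base update, stack allocation)" },
                  { "[sp + 0] := x29 (first register of the pair)" },
                  { "[sp + 8] := x30 (second register of the pair)" },
          };

          for (unsigned int i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
                  printf("%s\n", ops[i].effect);
          return 0;
  }

A post-indexed load pair is handled the other way around: the loads are emitted first and the base update last, so the tracked stack area always covers the slots being accessed.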
Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/decode.c | 148 ++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index a8ad78be4e48b..afdaec346ab0b 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -116,6 +116,48 @@ int arch_decode_hint_reg(u8 sp_reg, int *base) } } +static struct stack_op *arm_make_store_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset) +{ + struct stack_op *op; + + op = calloc(1, sizeof(*op)); + if (!op) { + WARN("calloc failed"); + return NULL; + } + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = base; + op->dest.offset = offset; + op->src.type = OP_SRC_REG; + op->src.reg = reg; + op->src.offset = 0; + + return op; +} + +static struct stack_op *arm_make_load_op(enum aarch64_insn_register base, + enum aarch64_insn_register reg, + int offset) +{ + struct stack_op *op; + + op = calloc(1, sizeof(*op)); + if (!op) { + WARN("calloc failed"); + return NULL; + } + op->dest.type = OP_DEST_REG; + op->dest.reg = reg; + op->dest.offset = 0; + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = base; + op->src.offset = offset; + + return op; +} + static struct stack_op *arm_make_add_op(enum aarch64_insn_register dest, enum aarch64_insn_register src, int val) @@ -136,6 +178,101 @@ static struct stack_op *arm_make_add_op(enum aarch64_insn_register dest, return op; } +static int arm_decode_load_store(u32 insn, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + enum aarch64_insn_register base; + enum aarch64_insn_register rt; + struct stack_op *op; + int size; + int offset; + + *type = INSN_OTHER; + + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_load_single(insn)) + size = 1 << ((insn & GENMASK(31, 30)) >> 30); + else + size = 4 << ((insn >> 31) & 1); + + if (aarch64_insn_is_store_imm(insn) || aarch64_insn_is_load_imm(insn)) + *immediate = size * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, + insn); + else if (aarch64_insn_is_store_pre(insn) || + aarch64_insn_is_load_pre(insn) || + aarch64_insn_is_store_post(insn) || + aarch64_insn_is_load_post(insn)) + *immediate = sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_9, + insn), + 9); + else if (aarch64_insn_is_stp(insn) || aarch64_insn_is_ldp(insn) || + aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_stp_post(insn) || + aarch64_insn_is_ldp_post(insn)) + *immediate = size * sign_extend(aarch64_insn_decode_immediate(AARCH64_INSN_IMM_7, + insn), + 7); + else + return 1; + + base = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + if (base != AARCH64_INSN_REG_FP && base != AARCH64_INSN_REG_SP) + return 0; + + offset = *immediate; + + if (aarch64_insn_is_store_pre(insn) || aarch64_insn_is_stp_pre(insn) || + aarch64_insn_is_store_post(insn) || aarch64_insn_is_stp_post(insn)) { + op = arm_make_add_op(base, base, *immediate); + list_add_tail(&op->list, ops_list); + + if (aarch64_insn_is_store_post(insn) || aarch64_insn_is_stp_post(insn)) + offset = -*immediate; + else + offset = 0; + } else if (aarch64_insn_is_load_post(insn) || aarch64_insn_is_ldp_post(insn)) { + offset = 0; + } + + /* First register */ + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + if (aarch64_insn_is_store_single(insn) || + aarch64_insn_is_store_pair(insn)) + op = arm_make_store_op(base, rt, offset); + else + op = arm_make_load_op(base, rt, offset); + + 
if (!op) + return -1; + list_add_tail(&op->list, ops_list); + + /* Second register (if present) */ + if (aarch64_insn_is_store_pair(insn) || + aarch64_insn_is_load_pair(insn)) { + rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT2, + insn); + if (aarch64_insn_is_store_pair(insn)) + op = arm_make_store_op(base, rt, offset + size); + else + op = arm_make_load_op(base, rt, offset + size); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + } + + if (aarch64_insn_is_load_pre(insn) || aarch64_insn_is_ldp_pre(insn) || + aarch64_insn_is_load_post(insn) || aarch64_insn_is_ldp_post(insn)) { + op = arm_make_add_op(base, base, *immediate); + if (!op) + return -1; + list_add_tail(&op->list, ops_list); + } + + return 0; +} + static int arm_decode_add_sub_imm(u32 instr, bool set_flags, enum insn_type *type, unsigned long *immediate, @@ -255,6 +392,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, *type = INSN_OTHER; } break; + case AARCH64_INSN_CLS_LDST: + { + int ret; + + ret = arm_decode_load_store(insn, type, immediate, ops_list); + if (ret <= 0) + return ret; + + *type = INSN_OTHER; + break; + } default: *type = INSN_OTHER; break; From ca85da456fb4beb7d385720e3a3241360083d855 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 19 Jan 2021 12:11:46 +0100 Subject: [PATCH 235/737] objtool: arm64: Decode LDR instructions Load literal instructions can generate constants inside code sections. Record the locations of the constants in order to be able to remove their corresponding "struct instruction". Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/decode.c | 86 ++++++++++++++++++++++++++++ tools/objtool/arch/x86/decode.c | 5 ++ tools/objtool/check.c | 3 + tools/objtool/include/objtool/arch.h | 3 + 4 files changed, 97 insertions(+) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index afdaec346ab0b..c0abf908f122c 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -30,6 +30,73 @@ static unsigned long sign_extend(unsigned long x, int nbits) return ((~0UL + (sign_bit ^ 1)) << nbits) | x; } +struct insn_loc { + const struct section *sec; + unsigned long offset; + struct hlist_node hnode; + bool ignorable; +}; + +DEFINE_HASHTABLE(invalid_insns, 16); + +static int record_invalid_insn(const struct section *sec, + unsigned long offset, + bool ignore) +{ + struct insn_loc *loc; + struct hlist_head *l; + + l = &invalid_insns[hash_min(offset, HASH_BITS(invalid_insns))]; + if (!hlist_empty(l)) { + loc = hlist_entry(l->first, struct insn_loc, hnode); + loc->ignorable |= ignore; + return 0; + } + + loc = malloc(sizeof(*loc)); + if (!loc) { + WARN("malloc failed"); + return -1; + } + + loc->sec = sec; + loc->offset = offset; + loc->ignorable = ignore; + + hash_add(invalid_insns, &loc->hnode, loc->offset); + + return 0; +} + +int arch_post_process_instructions(struct objtool_file *file) +{ + struct hlist_node *tmp; + struct insn_loc *loc; + unsigned int bkt; + int res = 0; + + hash_for_each_safe(invalid_insns, bkt, tmp, loc, hnode) { + struct instruction *insn; + + insn = find_insn(file, (struct section *) loc->sec, loc->offset); + if (insn) { + if (loc->ignorable) { + list_del(&insn->list); + hash_del(&insn->hash); + free(insn); + } else { + WARN_FUNC("can't decode instruction", insn->sec, insn->offset); + return -1; + } + } + + hash_del(&loc->hnode); + free(loc); + } + + return res; +} + bool arch_callee_saved_reg(unsigned char reg) { switch (reg) { @@ -400,6 +467,25 @@ int 
arch_decode_instruction(const struct elf *elf, const struct section *sec, if (ret <= 0) return ret; + if (aarch64_insn_is_ldr_lit(insn)) { + long pc_offset; + + pc_offset = insn & GENMASK(23, 5); + /* Sign extend and multiply by 4 */ + pc_offset = (pc_offset << (64 - 23)); + pc_offset = ((pc_offset >> (64 - 23)) >> 5) << 2; + + if (record_invalid_insn(sec, offset + pc_offset, true)) + return -1; + + /* 64-bit literal */ + if (insn & BIT(30)) { + if (record_invalid_insn(sec, + offset + pc_offset + 4, + true)) + return -1; + } + } *type = INSN_OTHER; break; } diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index bee22d4e672e2..d9d9d763e41dd 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -617,6 +617,11 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, return 0; } +int arch_post_process_instructions(struct objtool_file *file) +{ + return 0; +} + void arch_initial_func_cfi_state(struct cfi_init_state *state) { int i; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 605efaedc79f0..f9453cb1e1093 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -417,6 +417,9 @@ static int decode_instructions(struct objtool_file *file) if (stats) printf("nr_insns: %lu\n", nr_insns); + if (arch_post_process_instructions(file)) + return -1; + return 0; err: diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 9be79f05c779c..646944d2235ce 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -67,6 +67,7 @@ struct stack_op { struct list_head list; }; +struct objtool_file; struct instruction; void arch_initial_func_cfi_state(struct cfi_init_state *state); @@ -77,6 +78,8 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long *immediate, struct list_head *ops_list); +int arch_post_process_instructions(struct objtool_file *file); + bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); From 2db0c65253aef59fa6a65c5c1c6d26714f18d335 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 19 Jan 2021 12:15:37 +0100 Subject: [PATCH 236/737] objtool: arm64: Accept padding in code sections The compiler can introduce some '0' words in code sections to pad the end of functions. Similar to load literal functions, record these zero words to remove the "struct instruction" created for them. 
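For clarity (illustration only, not part of this change): undecodable words are recorded together with an "ignorable" flag, and only all-zero words, i.e. alignment padding, may be dropped silently. A minimal stand-alone sketch of that decision, mirroring the insn == 0x0 test added below:

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical predicate mirroring the insn == 0x0 check. */
  static bool demo_padding_is_ignorable(unsigned int insn)
  {
          return insn == 0x0;     /* zero words are compiler/assembler padding */
  }

  int main(void)
  {
          printf("0x00000000 -> %s\n",
                 demo_padding_is_ignorable(0x00000000) ? "drop silently" : "report error");
          printf("0xdeadbeef -> %s\n",
                 demo_padding_is_ignorable(0xdeadbeef) ? "drop silently" : "report error");
          return 0;
  }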
Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/decode.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index c0abf908f122c..01c470c3324ee 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -396,8 +396,23 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, switch (aarch64_get_insn_class(insn)) { case AARCH64_INSN_CLS_UNKNOWN: - WARN("can't decode instruction at %s:0x%lx", sec->name, offset); - return -1; + { + /* + * There are a few reasons we might have non-valid opcodes in + * code sections: + * - For load literal, assembler can generate the data to be + * loaded in the code section + * - Compiler/assembler can generate zeroes to pad function that + * do not end on 8-byte alignment + */ + /* Compiler might put zeroes as padding */ + if (record_invalid_insn(sec, offset, insn == 0x0)) + return -1; + + *type = INSN_OTHER; + + break; + } case AARCH64_INSN_CLS_DP_IMM: /* Mov register to and from SP are aliases of add_imm */ if (aarch64_insn_is_add_imm(insn) || From ad69e26903f7d3d9c1196cba290ea7c5cba18db7 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 31 Mar 2020 16:23:54 +0100 Subject: [PATCH 237/737] objtool: arm64: Handle supported relocations in alternatives Based on get_alt_insn() in arch/arm64/kernel/alternative.c, arm64 alternative code adapts offsets for static branches and adrp instructions. Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/special.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c index 45f283283091f..a70b91e8bd7de 100644 --- a/tools/objtool/arch/arm64/special.c +++ b/tools/objtool/arch/arm64/special.c @@ -10,7 +10,11 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, struct reloc *reloc) { - return false; + u32 opcode = *(u32 *)(insn->sec->data->d_buf + insn->offset); + + return aarch64_insn_is_branch_imm(opcode) || + aarch64_insn_is_adrp(opcode) || + !aarch64_insn_uses_literal(opcode); } From aa18604566434f17de7092dd1ac20bdaae4a2513 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 28 Aug 2020 17:04:12 +0100 Subject: [PATCH 238/737] objtool: arm64: Ignore replacement section for alternative callback ARM64_CB_PATCH doesn't have static replacement instructions. Skip trying to validate the alternative section. Signed-off-by: Julien Thierry --- tools/objtool/arch/arm64/special.c | 12 ++++++++++++ tools/objtool/check.c | 3 +++ 2 files changed, 15 insertions(+) diff --git a/tools/objtool/arch/arm64/special.c b/tools/objtool/arch/arm64/special.c index a70b91e8bd7de..ed642bd6f886c 100644 --- a/tools/objtool/arch/arm64/special.c +++ b/tools/objtool/arch/arm64/special.c @@ -4,6 +4,18 @@ void arch_handle_alternative(unsigned short feature, struct special_alt *alt) { + if (alt->orig_len && !alt->new_len) { + /* + * ARM64_CB_PATCH has no alternative instruction. + * a callback is called at alternative replacement time + * to dynamically change the original instructions. + * + * ARM64_CB_PATCH is the last ARM64 feature, it's value changes + * every time a new feature is added. 
So the orig/alt region + * length are used to detect those alternatives + */ + alt->skip_alt = true; + } } bool arch_support_alt_relocation(struct special_alt *special_alt, diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f9453cb1e1093..62ede7a6b1560 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1583,6 +1583,9 @@ static int add_special_section_alts(struct objtool_file *file) continue; } + if (special_alt->skip_alt && !special_alt->new_len) + continue; + ret = handle_group_alt(file, special_alt, orig_insn, &new_insn); if (ret) From b7fd95a79c2a474445cd1d31f240d2b2b2f21a11 Mon Sep 17 00:00:00 2001 From: Raphael Gault Date: Fri, 16 Aug 2019 13:24:03 +0100 Subject: [PATCH 239/737] objtool: arm64: Enable stack validation for arm64 Add a build option to run stack validation at compile time. When requiring stack validation, jump tables are disabled as it simplifies objtool analysis (without having to introduce unreliable artifacts). In local testing, this does not appear to significantly affect final binary size or system performance. Signed-off-by: Raphael Gault Signed-off-by: Julien Thierry --- arch/arm64/Kconfig | 1 + arch/arm64/Makefile | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 398068e28a173..e5313b2745de0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -182,6 +182,7 @@ config ARM64 select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ select HAVE_STACKPROTECTOR + select HAVE_STACK_VALIDATION select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES select HAVE_KRETPROBES diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 485b7dbd4f9e3..d1c4a4ad15f47 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -131,6 +131,10 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) CC_FLAGS_FTRACE := -fpatchable-function-entry=2 endif +ifeq ($(CONFIG_STACK_VALIDATION),y) +KBUILD_CFLAGS += -fno-jump-tables +endif + # Default value head-y := arch/arm64/kernel/head.o From 8a26fc755fa63ebcd97f054d90ccb0c03cd14328 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 18 Mar 2020 20:01:57 +0000 Subject: [PATCH 240/737] arm64: bug: Add reachable annotation to warning macros WARN* and BUG* both use brk #0x800 opcodes and the distinction is provided by the contents of the bug table. This table is not accessible to objtool, so add an annotation to WARN* macros to let objtool know that the brk handler will return and resume the execution of the instructions following the WARN's brk. Signed-off-by: Julien Thierry --- arch/arm64/include/asm/bug.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 28be048db3f63..9917429971d48 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -19,7 +19,11 @@ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(flags) \ +do { \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ + annotate_reachable(); \ +} while (0) #define HAVE_ARCH_BUG From b0e56f75a58da1f9562d43aaa2ebd5f7f62e2539 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 31 Mar 2020 12:34:17 +0100 Subject: [PATCH 241/737] arm64: kgdb: Mark code following kgdb brk as reachable In the general use case, the KGDB breakpoint handler should return normally to the instruction following the brk.
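Conceptually (an invented sketch, not kernel code): a debug-exception handler that resumes execution advances the saved program counter past the 4-byte brk before returning, which is why the instruction after the brk has to be treated as reachable:

  #include <stdio.h>

  /* Invented miniature of the saved exception state, for illustration. */
  struct demo_regs { unsigned long pc; };

  static void demo_kgdb_brk_handler(struct demo_regs *regs)
  {
          /* ... debugger interaction would happen here ... */
          regs->pc += 4;  /* step over the brk, resume at the next instruction */
  }

  int main(void)
  {
          struct demo_regs regs = { .pc = 0x1000 };

          demo_kgdb_brk_handler(&regs);
          printf("resume at 0x%lx\n", regs.pc);   /* 0x1004 */
          return 0;
  }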
Signed-off-by: Julien Thierry --- arch/arm64/include/asm/kgdb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h index 21fc85e9d2bed..a8cb91d8d59b3 100644 --- a/arch/arm64/include/asm/kgdb.h +++ b/arch/arm64/include/asm/kgdb.h @@ -19,6 +19,7 @@ static inline void arch_kgdb_breakpoint(void) { asm ("brk %0" : : "I" (KGDB_COMPILED_DBG_BRK_IMM)); + annotate_reachable(); } extern void kgdb_handle_bus_error(void); From 5d4b2d2befaf9e96b65947d2a0cf3e7c51e76a03 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 7 Sep 2020 13:51:59 +0100 Subject: [PATCH 242/737] arm64: Add intra-function call annotations Stack validation requires BL instructions to an address within the symbol containing the BL to be annotated as intra-function calls. Signed-off-by: Julien Thierry --- arch/arm64/crypto/crct10dif-ce-core.S | 1 + arch/arm64/kernel/entry.S | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 111d9c9abddd1..ec6f97180c747 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -207,6 +207,7 @@ SYM_FUNC_END(__pmull_p8_core) pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B .endif + ANNOTATE_INTRA_FUNCTION_CALL bl .L__pmull_p8_core\i eor \rq\().16b, \rq\().16b, t4.16b diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 45c3d259d03c2..83bdf38c5f4f8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -851,6 +851,7 @@ alternative_else_nop_endif * entry onto the return stack and using a RET instruction to * enter the full-fat kernel vectors. */ + ANNOTATE_INTRA_FUNCTION_CALL bl 2f b . 2: From 0fbb417056fc015aecdb960c41372dc615bae176 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 15 Jan 2021 17:39:16 +0100 Subject: [PATCH 243/737] arm64: Skip validation of qcom_link_stack_sanitization This workaround code is more akin to an ancient incantation than sensible code. And since the function does not call another function than itself, unwinding from instructions in it should be reliable. Signed-off-by: Julien Thierry --- arch/arm64/kernel/proton-pack.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index faa8a6bf2376e..013e430acb1fd 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -300,6 +301,7 @@ static void qcom_link_stack_sanitisation(void) "mov x30, %0 \n" : "=&r" (tmp)); } +STACK_FRAME_NON_STANDARD(qcom_link_stack_sanitisation); static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) { From d5c26ef5b53ce61594709d91d8be030a2bd150a4 Mon Sep 17 00:00:00 2001 From: Raphael Gault Date: Fri, 16 Aug 2019 13:24:00 +0100 Subject: [PATCH 244/737] arm64: kernel: Add exception on kuser32 to prevent stack analysis kuser32 being used for compatibility, it contains a32 instructions which are not recognised by objtool when trying to analyse arm64 object files. Thus, we add an exception to skip validation on this particular file. 
Signed-off-by: Raphael Gault Signed-off-by: Julien Thierry --- arch/arm64/kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 643acbc605be6..8f82dc5e1b9e4 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -29,6 +29,9 @@ obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o obj-$(CONFIG_COMPAT) += sigreturn32.o obj-$(CONFIG_KUSER_HELPERS) += kuser32.o + +OBJECT_FILES_NON_STANDARD_kuser32.o := y + obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o From 4e8609b86657e1fb8ce23ff0b6cf005d9618131a Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 2 Dec 2019 14:34:31 +0000 Subject: [PATCH 245/737] arm64: Mark sigreturn32.o as containing non standard code sigreturn32.o contains aarch32 getting copied to the VDSO for compat user tasks. This code shouldn't get validated by arm64 objtool. Signed-off-by: Julien Thierry --- arch/arm64/kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 8f82dc5e1b9e4..4a39d9525d788 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -28,6 +28,9 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o obj-$(CONFIG_COMPAT) += sigreturn32.o +ifeq ($(CONFIG_COMPAT), y) +OBJECT_FILES_NON_STANDARD_sigreturn32.o := y +endif obj-$(CONFIG_KUSER_HELPERS) += kuser32.o OBJECT_FILES_NON_STANDARD_kuser32.o := y From e49eb7cdbb4dc3b73e7486a04ba0369e7c82cf0c Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 27 Mar 2020 10:09:31 +0000 Subject: [PATCH 246/737] arm64: entry: Compile out unnecessary symbols Symbols el0_sync_invalid, el0_irq_invalid, el0_error_invalid are only used when kernel is built without compat support. Signed-off-by: Julien Thierry --- arch/arm64/kernel/entry.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 83bdf38c5f4f8..e476da007fca8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -619,6 +619,11 @@ __bad_stack: ASM_BUG() .endm +SYM_CODE_START_LOCAL(el0_fiq_invalid) + inv_entry 0, BAD_FIQ +SYM_CODE_END(el0_fiq_invalid) + +#ifndef CONFIG_COMPAT SYM_CODE_START_LOCAL(el0_sync_invalid) inv_entry 0, BAD_SYNC SYM_CODE_END(el0_sync_invalid) @@ -627,19 +632,16 @@ SYM_CODE_START_LOCAL(el0_irq_invalid) inv_entry 0, BAD_IRQ SYM_CODE_END(el0_irq_invalid) -SYM_CODE_START_LOCAL(el0_fiq_invalid) - inv_entry 0, BAD_FIQ -SYM_CODE_END(el0_fiq_invalid) - SYM_CODE_START_LOCAL(el0_error_invalid) inv_entry 0, BAD_ERROR SYM_CODE_END(el0_error_invalid) -#ifdef CONFIG_COMPAT +#else + SYM_CODE_START_LOCAL(el0_fiq_invalid_compat) inv_entry 0, BAD_FIQ, 32 SYM_CODE_END(el0_fiq_invalid_compat) -#endif +#endif /* CONFIG_COMPAT */ SYM_CODE_START_LOCAL(el1_sync_invalid) inv_entry 1, BAD_SYNC From d9be47acbe320aeb68f3a5fee64e081e1344a4db Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 17 Sep 2020 10:06:26 +0100 Subject: [PATCH 247/737] arm64: crypto: Remove unnecessary stackframe The way sha256_block_neon restore the stackframe confuses objtool. But it turns out this function is a leaf function and does not use FP nor LR as scratch register. Do not create a stackframe in this function as it is not necessary. 
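For background (a hedged model, not taken from the kernel sources): a non-leaf function's frame record is just the saved x29/x30 pair that the new x29 points at, which lets an unwinder walk frames as a linked list; a leaf function that never clobbers FP or LR can safely skip building one. A self-contained sketch:

  #include <stdint.h>
  #include <stdio.h>

  /* Model of an AArch64 frame record: saved x29 followed by saved x30. */
  struct demo_frame_record {
          struct demo_frame_record *prev_fp;      /* saved x29 */
          uint64_t lr;                            /* saved x30 */
  };

  static void demo_unwind(const struct demo_frame_record *fp)
  {
          while (fp) {
                  printf("return address: 0x%llx\n", (unsigned long long)fp->lr);
                  fp = fp->prev_fp;
          }
  }

  int main(void)
  {
          struct demo_frame_record caller = { NULL, 0x1111 };
          struct demo_frame_record callee = { &caller, 0x2222 };

          demo_unwind(&callee);   /* prints 0x2222, then 0x1111 */
          return 0;
  }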
Signed-off-by: Julien Thierry --- arch/arm64/crypto/sha256-core.S_shipped | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped index 7c7ce2e3bad6b..787a1fe466a70 100644 --- a/arch/arm64/crypto/sha256-core.S_shipped +++ b/arch/arm64/crypto/sha256-core.S_shipped @@ -1225,8 +1225,6 @@ sha256_block_armv8: .align 4 sha256_block_neon: .Lneon_entry: - stp x29, x30, [sp, #-16]! - mov x29, sp sub sp,sp,#16*4 adr x16,.LK256 @@ -2060,8 +2058,7 @@ sha256_block_neon: mov x17,sp b.ne .L_00_48 - ldr x29,[x29] - add sp,sp,#16*4+16 + add sp,sp,#16*4 ret .size sha256_block_neon,.-sha256_block_neon #ifndef __KERNEL__ From 5700db1c6010c3a92a3a29599fef797bb525d632 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 31 Aug 2020 10:19:03 +0100 Subject: [PATCH 248/737] arm64: sleep: Properly set frame pointer before call In __cpu_suspend_enter, the FP and LR are properly saved on the stack to form a stack frame, but the frame pointer is not set afterwards. Have the frame pointer point to the new frame. Signed-off-by: Julien Thierry --- arch/arm64/kernel/sleep.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index ba40d57757d63..a33701fb4c895 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -91,6 +91,7 @@ SYM_FUNC_START(__cpu_suspend_enter) str x0, [x1] add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS stp x29, lr, [sp, #-16]! + mov x29, sp bl cpu_do_suspend ldp x29, lr, [sp], #16 mov x0, #1 From e7b371f3ac526d13c7605cca1455eae1770b652e Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 2 Dec 2019 13:04:58 +0000 Subject: [PATCH 249/737] arm64: Move constant to rodata Constant arm64_relocate_new_kernel_size does not need to be in the same section as the new kernel code/data region. Move it to ".rodata" section. Signed-off-by: Julien Thierry --- arch/arm64/kernel/relocate_kernel.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 84eec95ec06cc..a4fbfd8d92550 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -106,10 +106,13 @@ SYM_CODE_END(arm64_relocate_new_kernel) .Lcopy_end: .org KEXEC_CONTROL_PAGE_SIZE +.pushsection ".rodata", "a" /* * arm64_relocate_new_kernel_size - Number of bytes to copy to the * control_code_page. */ .globl arm64_relocate_new_kernel_size +.align 8 arm64_relocate_new_kernel_size: .quad .Lcopy_end - arm64_relocate_new_kernel +.popsection From 33eff7f46a55779d25f687be9ff86bfd82260d9d Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 3 Apr 2020 13:44:00 +0100 Subject: [PATCH 250/737] arm64: entry: Mark tramp_exit as local symbols Symbols tramp_exit_native and tramp_exit_compat are not used outside of entry.S. Hence, assign them local binding. 
Signed-off-by: Julien Thierry --- arch/arm64/kernel/entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e476da007fca8..ae21bf576f1dc 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -929,11 +929,11 @@ SYM_CODE_START_NOALIGN(tramp_vectors) generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) -SYM_CODE_START(tramp_exit_native) +SYM_CODE_START_LOCAL(tramp_exit_native) tramp_exit SYM_CODE_END(tramp_exit_native) -SYM_CODE_START(tramp_exit_compat) +SYM_CODE_START_LOCAL(tramp_exit_compat) tramp_exit 32 SYM_CODE_END(tramp_exit_compat) From b5341a6f20a1eb679cce96498ceab12f3422a448 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Nov 2020 12:49:23 +0000 Subject: [PATCH 251/737] arm64: head.S: rename el2_setup -> init_kernel_el For a while now el2_setup has performed some basic initialization of EL1 even when the kernel is booted at EL1, so the name is a little misleading. Further, some comments are stale as with VHE it doesn't drop the CPU to EL1. To clarify things, rename el2_setup to init_kernel_el, and update comments to be clearer as to the function's purpose. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Christoph Hellwig Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201113124937.20574-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas (cherry picked from commit ecbb11ab3ebc02763ec53489c9b1f983be9dc882) --- arch/arm64/kernel/head.S | 15 ++++++++------- arch/arm64/kernel/sleep.S | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f969e0b3cf969..034fd8313991a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -105,7 +105,7 @@ pe_header: */ SYM_CODE_START(primary_entry) bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag @@ -496,13 +496,14 @@ EXPORT_SYMBOL(kimage_vaddr) .section ".idmap.text","awx" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. * * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. */ -SYM_FUNC_START(el2_setup) +SYM_FUNC_START(init_kernel_el) msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 @@ -663,7 +664,7 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 eret -SYM_FUNC_END(el2_setup) +SYM_FUNC_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed @@ -713,7 +714,7 @@ SYM_DATA_END(__early_cpu_boot_status) * cores are held until we're ready for them to initialise. */ SYM_FUNC_START(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -731,7 +732,7 @@ SYM_FUNC_END(secondary_holding_pen) * be used where CPUs are brought online dynamically by the kernel. 
*/ SYM_FUNC_START(secondary_entry) - bl el2_setup // Drop to EL1 + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag b secondary_startup SYM_FUNC_END(secondary_entry) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index a33701fb4c895..7adf0e69c3990 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -100,7 +100,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + bl init_kernel_el bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir From 7cfd2646b9d9d9d8da6f37dde4f1fa029b635ac5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Sun, 29 Mar 2020 13:35:21 +0100 Subject: [PATCH 252/737] arm64: Change symbol annotations Code symbols not following the aarch64 procedure call convention should be annotated with SYM_CODE_* instead of SYM_FUNC_* Mark relevant symbols as generic code symbols. Signed-off-by: Julien Thierry --- arch/arm64/kernel/head.S | 56 +++++++++++++++++++------------------- arch/arm64/kernel/sleep.S | 4 +-- arch/arm64/kvm/hyp/entry.S | 4 +-- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 034fd8313991a..e5188f895c7f9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -274,7 +274,7 @@ SYM_CODE_END(preserve_boot_args) * - first few MB of the kernel linear mapping to jump to once the MMU has * been enabled */ -SYM_FUNC_START_LOCAL(__create_page_tables) +SYM_CODE_START_LOCAL(__create_page_tables) mov x28, lr /* @@ -408,7 +408,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) bl __inval_dcache_area ret x28 -SYM_FUNC_END(__create_page_tables) +SYM_CODE_END(__create_page_tables) /* * Create a final frame record at task_pt_regs(current)->stackframe, so @@ -427,7 +427,7 @@ SYM_FUNC_END(__create_page_tables) * * x0 = __PHYS_OFFSET */ -SYM_FUNC_START_LOCAL(__primary_switched) +SYM_CODE_START_LOCAL(__primary_switched) adrp x4, init_thread_union add sp, x4, #THREAD_SIZE adr_l x5, init_task @@ -480,7 +480,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) setup_final_frame bl start_kernel ASM_BUG() -SYM_FUNC_END(__primary_switched) +SYM_CODE_END(__primary_switched) .pushsection ".rodata", "a" SYM_DATA_START(kimage_vaddr) @@ -503,7 +503,7 @@ EXPORT_SYMBOL(kimage_vaddr) * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. */ -SYM_FUNC_START(init_kernel_el) +SYM_CODE_START(init_kernel_el) msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 @@ -664,13 +664,13 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 eret -SYM_FUNC_END(init_kernel_el) +SYM_CODE_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in w0. See arch/arm64/include/asm/virt.h for more info. */ -SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) +SYM_CODE_START_LOCAL(set_cpu_boot_mode_flag) adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -679,7 +679,7 @@ SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) dmb sy dc ivac, x1 // Invalidate potentially stale cache line ret -SYM_FUNC_END(set_cpu_boot_mode_flag) +SYM_CODE_END(set_cpu_boot_mode_flag) /* * These values are written with the MMU off, but read with the MMU on. 
@@ -713,7 +713,7 @@ SYM_DATA_END(__early_cpu_boot_status) * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -SYM_FUNC_START(secondary_holding_pen) +SYM_CODE_START(secondary_holding_pen) bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 @@ -725,19 +725,19 @@ pen: ldr x4, [x3] b.eq secondary_startup wfe b pen -SYM_FUNC_END(secondary_holding_pen) +SYM_CODE_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. */ -SYM_FUNC_START(secondary_entry) +SYM_CODE_START(secondary_entry) bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag b secondary_startup -SYM_FUNC_END(secondary_entry) +SYM_CODE_END(secondary_entry) -SYM_FUNC_START_LOCAL(secondary_startup) +SYM_CODE_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ @@ -747,9 +747,9 @@ SYM_FUNC_START_LOCAL(secondary_startup) bl __enable_mmu ldr x8, =__secondary_switched br x8 -SYM_FUNC_END(secondary_startup) +SYM_CODE_END(secondary_startup) -SYM_FUNC_START_LOCAL(__secondary_switched) +SYM_CODE_START_LOCAL(__secondary_switched) adr_l x5, vectors msr vbar_el1, x5 isb @@ -770,13 +770,13 @@ SYM_FUNC_START_LOCAL(__secondary_switched) bl secondary_start_kernel ASM_BUG() -SYM_FUNC_END(__secondary_switched) +SYM_CODE_END(__secondary_switched) -SYM_FUNC_START_LOCAL(__secondary_too_slow) +SYM_CODE_START_LOCAL(__secondary_too_slow) wfe wfi b __secondary_too_slow -SYM_FUNC_END(__secondary_too_slow) +SYM_CODE_END(__secondary_too_slow) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -808,7 +808,7 @@ SYM_FUNC_END(__secondary_too_slow) * Checks if the selected granule size is supported by the CPU. * If it isn't, park the CPU */ -SYM_FUNC_START(__enable_mmu) +SYM_CODE_START(__enable_mmu) mrs x2, ID_AA64MMFR0_EL1 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN @@ -834,9 +834,9 @@ SYM_FUNC_START(__enable_mmu) dsb nsh isb ret -SYM_FUNC_END(__enable_mmu) +SYM_CODE_END(__enable_mmu) -SYM_FUNC_START(__cpu_secondary_check52bitva) +SYM_CODE_START_LOCAL(__cpu_secondary_check52bitva) #ifdef CONFIG_ARM64_VA_BITS_52 ldr_l x0, vabits_actual cmp x0, #52 @@ -854,9 +854,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) #endif 2: ret -SYM_FUNC_END(__cpu_secondary_check52bitva) +SYM_CODE_END(__cpu_secondary_check52bitva) -SYM_FUNC_START_LOCAL(__no_granule_support) +SYM_CODE_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 @@ -864,10 +864,10 @@ SYM_FUNC_START_LOCAL(__no_granule_support) wfe wfi b 1b -SYM_FUNC_END(__no_granule_support) +SYM_CODE_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) +SYM_CODE_START_LOCAL(__relocate_kernel) /* * Iterate over each entry in the relocation table, and apply the * relocations in place. 
@@ -969,10 +969,10 @@ SYM_FUNC_START_LOCAL(__relocate_kernel) #endif ret -SYM_FUNC_END(__relocate_kernel) +SYM_CODE_END(__relocate_kernel) #endif -SYM_FUNC_START_LOCAL(__primary_switch) +SYM_CODE_START_LOCAL(__primary_switch) #ifdef CONFIG_RANDOMIZE_BASE mov x19, x0 // preserve new SCTLR_EL1 value mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value @@ -1016,4 +1016,4 @@ SYM_FUNC_START_LOCAL(__primary_switch) ldr x8, =__primary_switched adrp x0, __PHYS_OFFSET br x8 -SYM_FUNC_END(__primary_switch) +SYM_CODE_END(__primary_switch) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 7adf0e69c3990..dc777ca4c0406 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -111,7 +111,7 @@ SYM_CODE_END(cpu_resume) .ltorg .popsection -SYM_FUNC_START(_cpu_resume) +SYM_CODE_START(_cpu_resume) mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -147,4 +147,4 @@ SYM_FUNC_START(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -SYM_FUNC_END(_cpu_resume) +SYM_CODE_END(_cpu_resume) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 0c66a1d408fd7..78334aea907af 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -20,7 +20,7 @@ /* * u64 __guest_enter(struct kvm_vcpu *vcpu); */ -SYM_FUNC_START(__guest_enter) +SYM_CODE_START(__guest_enter) // x0: vcpu // x1-x17: clobbered by macros // x29: guest context @@ -203,4 +203,4 @@ abort_guest_exit_end: msr spsr_el2, x4 orr x0, x0, x5 1: ret -SYM_FUNC_END(__guest_enter) +SYM_CODE_END(__guest_enter) From 53f01bbaca8e21dcf1e99af45f0539cdcc00bec5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 31 Aug 2020 09:49:14 +0100 Subject: [PATCH 253/737] objtool: check: Support data in text section Assembly code can mix code and data in text sections through the use of SYM_DATA_*() macros. Skip the content of these symbols when decoding instructions of text sections. 
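A minimal sketch of the pattern this handles (hypothetical symbol names, assuming the usual linux/linkage.h macros): data placed in a .text section via SYM_DATA_*() is described by an STT_OBJECT symbol, which decode_instructions() can now skip instead of trying to disassemble its bytes:

    SYM_CODE_START(some_helper)
            ldr     x0, some_constant       // code: decoded as usual
            ret
    SYM_CODE_END(some_helper)

    SYM_DATA_START_LOCAL(some_constant)     // STT_OBJECT inside .text: skipped
            .quad   0x1234
    SYM_DATA_END(some_constant)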
Signed-off-by: Julien Thierry --- tools/objtool/check.c | 13 +++++++++++-- tools/objtool/elf.c | 14 ++++++++++++++ tools/objtool/include/objtool/elf.h | 1 + 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 62ede7a6b1560..2a6c9dfc43491 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -354,7 +354,7 @@ static int decode_instructions(struct objtool_file *file) { struct section *sec; struct symbol *func; - unsigned long offset; + unsigned long offset, next_offset; struct instruction *insn; int ret; @@ -373,7 +373,14 @@ static int decode_instructions(struct objtool_file *file) !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; - for (offset = 0; offset < sec->len; offset += insn->len) { + for (offset = 0; offset < sec->len; offset = next_offset) { + struct symbol *obj_sym = find_object_containing(sec, offset); + if (obj_sym) { + /* This is data in the middle of text section, skip it */ + next_offset = obj_sym->offset + obj_sym->len; + continue; + } + insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -397,6 +404,8 @@ static int decode_instructions(struct objtool_file *file) hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset)); list_add_tail(&insn->list, &file->insn_list); nr_insns++; + + next_offset = offset + insn->len; } list_for_each_entry(func, &sec->symbol_list, list) { diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 9aa65bd9ab12a..c7326b1505c2b 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -166,6 +166,20 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) return NULL; } +struct symbol *find_object_containing(struct section *sec, unsigned long offset) +{ + struct rb_node *node; + + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->type == STT_OBJECT) + return s; + } + + return NULL; +} + struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) { struct symbol *sym; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index d2c5ed827ed18..c3b8e67a72b64 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -150,6 +150,7 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); +struct symbol *find_object_containing(struct section *sec, unsigned long offset); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) From 47559c85413c2422ce5d63a8e60eefee41dd126b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 17 Nov 2020 13:47:28 +0100 Subject: [PATCH 254/737] arm64/head: avoid symbol names pointing into first 64 KB of kernel image We no longer map the first 64 KB of the kernel image, as there is nothing there that we ever need to refer back to once the kernel has booted. Even though facilities like kallsyms are very careful to only refer to the region that starts at _stext when mapping virtual addresses to symbol names, let's avoid any confusion by switching to local .L prefixed symbol names for the EFI header, as none of them have any significance to the rest of the kernel. 
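For context (illustrative only): GNU as treats ".L"-prefixed names as assembler-local labels and never emits them into the ELF symbol table, so they cannot be picked up by kallsyms or symbolised in a backtrace:

    efi_header_end:         // regular symbol: lands in the symbol table
    .Lefi_header_end:       // assembler-local: resolved at assembly time, no symbol emitted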
Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Link: https://lore.kernel.org/r/20201117124729.12642-3-ardb@kernel.org Signed-off-by: Catalin Marinas (cherry picked from commit b50a3225cdffef43b76b294fa7fb3cd1f32f50d0) --- arch/arm64/kernel/efi-header.S | 46 ++++++++++++++++------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index a71844fb923ee..0a7267e9a594a 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -9,28 +9,26 @@ .macro __EFI_PE_HEADER .long PE_MAGIC -coff_header: .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: +.Loptional_header: .short PE_OPT_MAGIC_PE32PLUS // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long .Lefi_header_end - _head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment @@ -45,7 +43,7 @@ extra_header_fields: .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - _head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem .short 0 // DllCharacteristics @@ -54,7 +52,7 @@ extra_header_fields: .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) / 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -64,17 +62,17 @@ extra_header_fields: .quad 0 // BaseRelocationTable #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + .long .Lefi_debug_table - _head // DebugTable + .long .Lefi_debug_table_size #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - _head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - _head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -98,7 +96,7 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. 
- .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI /* @@ -114,21 +112,21 @@ section_table: __INITRODATA .align 2 -efi_debug_table: +.Lefi_debug_table: // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY .long 0 // Characteristics .long 0 // TimeDateStamp .short 0 // MajorVersion .short 0 // MinorVersion .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData + .long .Lefi_debug_entry_size // SizeOfData .long 0 // RVA - .long efi_debug_entry - _head // FileOffset + .long .Lefi_debug_entry - _head // FileOffset - .set efi_debug_table_size, . - efi_debug_table + .set .Lefi_debug_table_size, . - .Lefi_debug_table .previous -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -137,7 +135,7 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry #endif /* @@ -148,5 +146,5 @@ efi_debug_entry: * placed at a 4k boundary in the Image to begin with. */ .balign SEGMENT_ALIGN -efi_header_end: +.Lefi_header_end: .endm From 45b37e21b3254a079afd6dd83725dbc26ca3ceeb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 17 Nov 2020 13:47:29 +0100 Subject: [PATCH 255/737] arm64: head: tidy up the Image header definition Even though support for EFI boot remains entirely optional for arm64, it is unlikely that we will ever be able to repurpose the image header fields that the EFI loader relies on, i.e., the magic NOP at offset 0x0 and the PE header address at offset 0x3c. So let's factor out the differences into a 'efi_signature_nop' macro and a local symbol representing the PE header address, and move the conditional definitions into efi-header.S, taking into account whether CONFIG_EFI is enabled or not. While at it, switch to a signature NOP that behaves more like a NOP, i.e., one that only clobbers the flags. Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Link: https://lore.kernel.org/r/20201117124729.12642-4-ardb@kernel.org Signed-off-by: Catalin Marinas (cherry picked from commit 7919385b9fb3cefe495310e5c44ca8a6d9c446e8) --- arch/arm64/kernel/efi-header.S | 43 ++++++++++++++++++++++++++-------- arch/arm64/kernel/head.S | 19 ++------------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 0a7267e9a594a..f4414598102d9 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -7,7 +7,27 @@ #include #include + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . 
- .L_head .long PE_MAGIC .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections @@ -26,8 +46,8 @@ .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint - .long .Lefi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode .quad 0 // ImageBase .long SEGMENT_ALIGN // SectionAlignment @@ -40,10 +60,10 @@ .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long .Lefi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem .short 0 // DllCharacteristics @@ -62,7 +82,7 @@ .quad 0 // BaseRelocationTable #ifdef CONFIG_DEBUG_EFI - .long .Lefi_debug_table - _head // DebugTable + .long .Lefi_debug_table - .L_head // DebugTable .long .Lefi_debug_table_size #endif @@ -70,9 +90,9 @@ .Lsection_table: .ascii ".text\0\0\0" .long __initdata_begin - .Lefi_header_end // VirtualSize - .long .Lefi_header_end - _head // VirtualAddress + .long .Lefi_header_end - .L_head // VirtualAddress .long __initdata_begin - .Lefi_header_end // SizeOfRawData - .long .Lefi_header_end - _head // PointerToRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -84,9 +104,9 @@ .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -121,7 +141,7 @@ .long IMAGE_DEBUG_TYPE_CODEVIEW // Type .long .Lefi_debug_entry_size // SizeOfData .long 0 // RVA - .long .Lefi_debug_entry - _head // FileOffset + .long .Lefi_debug_entry - .L_head // FileOffset .set .Lefi_debug_table_size, . - .Lefi_debug_table .previous @@ -147,4 +167,7 @@ */ .balign SEGMENT_ALIGN .Lefi_header_end: +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e5188f895c7f9..e37ef5a232604 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -59,21 +59,11 @@ * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b primary_entry -#else + efi_signature_nop // special NOP to identity as PE/COFF executable b primary_entry // branch to kernel start, magic - .long 0 // reserved -#endif .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian @@ -81,14 +71,9 @@ _head: .quad 0 // reserved .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .long .Lpe_header_offset // Offset to the PE header. 
-pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif __INIT From 73b71dbcaccfede555846c85eb3a75f74a52b7db Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Sun, 29 Mar 2020 13:40:57 +0100 Subject: [PATCH 256/737] arm64: efi-header: Mark efi header as data This file only contains a set of constants forming the efi header. Make the constants part of a data symbol. Signed-off-by: Julien Thierry --- arch/arm64/kernel/efi-header.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index f4414598102d9..5fb744d8acf24 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -28,6 +28,7 @@ .macro __EFI_PE_HEADER #ifdef CONFIG_EFI .set .Lpe_header_offset, . - .L_head +SYM_DATA_START_LOCAL(arm64_efi_header) .long PE_MAGIC .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections @@ -167,6 +168,7 @@ */ .balign SEGMENT_ALIGN .Lefi_header_end: +SYM_DATA_END_LABEL(arm64_efi_header, SYM_L_LOCAL, efi_header_end) #else .set .Lpe_header_offset, 0x0 #endif From 7b097990c7efa59bfa0e6e19192c5955e6051ee8 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Sun, 29 Mar 2020 13:46:19 +0100 Subject: [PATCH 257/737] arm64: head: Mark constants as data Add data annotations to constants part of the image header. Signed-off-by: Julien Thierry --- arch/arm64/kernel/head.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e37ef5a232604..3882c2e881957 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -62,9 +62,10 @@ /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ - efi_signature_nop // special NOP to identity as PE/COFF executable +SYM_DATA_LOCAL(efi_nop, efi_signature_nop) // special NOP to identity as PE/COFF executable b primary_entry // branch to kernel start, magic - .quad 0 // Image load offset from start of RAM, little-endian +SYM_DATA_LOCAL(_zero_reserved, .quad 0) // Image load offset from start of RAM, little-endian +SYM_DATA_START_LOCAL(_arm64_common_header) le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved @@ -72,6 +73,7 @@ .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number .long .Lpe_header_offset // Offset to the PE header. +SYM_DATA_END(_arm64_common_header) __EFI_PE_HEADER From 62d4a950616b280287875e507bd6b13465ecf336 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Sun, 29 Mar 2020 16:19:37 +0100 Subject: [PATCH 258/737] arm64: proc: Mark constant as data Label __idmap_kpti_flag represents the location of a constant. Mark it as data symbol. Signed-off-by: Julien Thierry --- arch/arm64/mm/proc.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index aacc7eab9b2ff..72d2181ae38dd 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -222,8 +222,8 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. 
*/ -__idmap_kpti_flag: - .long 1 +SYM_DATA_LOCAL(__idmap_kpti_flag, .long 1) + SYM_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 num_cpus .req w1 From 447fb7f17fcb4913a87cb033e0543f65426b3065 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 17 Sep 2020 10:03:29 +0100 Subject: [PATCH 259/737] arm64: crypto: Mark data in code sections Use SYM_DATA_* macros to annotate data bytes in the middle of .text sections. For local symbols, ".L" prefix needs to be dropped as the assembler exclude the symbols from the .o symbol table, making objtool unable to see them. Signed-off-by: Julien Thierry --- arch/arm64/crypto/aes-neonbs-core.S | 14 +++++++------- arch/arm64/crypto/sha256-core.S_shipped | 24 ++++++++++++++---------- arch/arm64/crypto/sha512-core.S_shipped | 15 ++++++++++----- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index 63a52ad9a75c0..d5bca3b7d0ea5 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -367,15 +367,15 @@ .align 6 -M0: .octa 0x0004080c0105090d02060a0e03070b0f +SYM_DATA_LOCAL(M0, .octa 0x0004080c0105090d02060a0e03070b0f) -M0SR: .octa 0x0004080c05090d010a0e02060f03070b -SR: .octa 0x0f0e0d0c0a09080b0504070600030201 -SRM0: .octa 0x01060b0c0207080d0304090e00050a0f +SYM_DATA_LOCAL(M0SR, .octa 0x0004080c05090d010a0e02060f03070b) +SYM_DATA_LOCAL(SR, .octa 0x0f0e0d0c0a09080b0504070600030201) +SYM_DATA_LOCAL(SRM0, .octa 0x01060b0c0207080d0304090e00050a0f) -M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 -ISR: .octa 0x0f0e0d0c080b0a090504070602010003 -ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f +SYM_DATA_LOCAL(M0ISR, .octa 0x0004080c0d0105090a0e0206070b0f03) +SYM_DATA_LOCAL(ISR, .octa 0x0f0e0d0c080b0a090504070602010003) +SYM_DATA_LOCAL(ISRM0, .octa 0x0306090c00070a0d01040b0e0205080f) /* * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped index 787a1fe466a70..571323f6cca33 100644 --- a/arch/arm64/crypto/sha256-core.S_shipped +++ b/arch/arm64/crypto/sha256-core.S_shipped @@ -59,6 +59,8 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] +#include + #ifndef __KERNEL__ # include "arm_arch.h" #endif @@ -72,11 +74,11 @@ sha256_block_data_order: #ifndef __KERNEL__ # ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P + ldrsw x16,OPENSSL_armcap_P_rel # else - ldr x16,.LOPENSSL_armcap_P + ldr x16,OPENSSL_armcap_P_rel # endif - adr x17,.LOPENSSL_armcap_P + adr x17,OPENSSL_armcap_P_rel add x16,x16,x17 ldr w16,[x16] tst w16,#ARMV8_SHA256 @@ -99,7 +101,7 @@ sha256_block_data_order: ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] - adr x30,.LK256 + adr x30,K256 stp x0,x2,[x29,#96] .Loop: @@ -1047,8 +1049,7 @@ sha256_block_data_order: .size sha256_block_data_order,.-sha256_block_data_order .align 6 -.type .LK256,%object -.LK256: +SYM_DATA_START_LOCAL(K256) .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 @@ -1066,17 +1067,20 @@ sha256_block_data_order: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator -.size .LK256,.-.LK256 +SYM_DATA_END(K256) #ifndef __KERNEL__ .align 3 -.LOPENSSL_armcap_P: +SYM_DATA_START_LOCAL(OPENSSL_armcap_P_rel) # ifdef __ILP32__ .long OPENSSL_armcap_P-. 
# else .quad OPENSSL_armcap_P-. # endif +SYM_DATA_END(OPENSSL_armcap_P_rel) #endif +SYM_DATA_START_LOCAL(OPENSSL_str) .asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by " +SYM_DATA_END(OPENSSL_str) .align 2 #ifndef __KERNEL__ .type sha256_block_armv8,%function @@ -1087,7 +1091,7 @@ sha256_block_armv8: add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 + adr x3,K256 .Loop_hw: ld1 {v4.16b-v7.16b},[x1],#64 @@ -1227,7 +1231,7 @@ sha256_block_neon: .Lneon_entry: sub sp,sp,#16*4 - adr x16,.LK256 + adr x16,K256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[x1], #16 diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped index e063a61067201..8477c90cf4baa 100644 --- a/arch/arm64/crypto/sha512-core.S_shipped +++ b/arch/arm64/crypto/sha512-core.S_shipped @@ -59,6 +59,8 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] +#include + #ifndef __KERNEL__ # include "arm_arch.h" #endif @@ -85,7 +87,7 @@ sha512_block_data_order: ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] - adr x30,.LK512 + adr x30,K512 stp x0,x2,[x29,#96] .Loop: @@ -1033,8 +1035,7 @@ sha512_block_data_order: .size sha512_block_data_order,.-sha512_block_data_order .align 6 -.type .LK512,%object -.LK512: +SYM_DATA_START_LOCAL(K512) .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 @@ -1076,17 +1077,21 @@ sha512_block_data_order: .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator -.size .LK512,.-.LK512 +SYM_DATA_END(K512) + #ifndef __KERNEL__ .align 3 -.LOPENSSL_armcap_P: +SYM_DATA_START_LOCAL(OPENSSL_armcap_P_rel) # ifdef __ILP32__ .long OPENSSL_armcap_P-. # else .quad OPENSSL_armcap_P-. # endif +SYM_DATA_END(OPENSSL_armcap_P_rel) #endif +SYM_DATA_START_LOCAL(OPENSSL_str) .asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by " +SYM_DATA_END(OPENSSL_str) .align 2 #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 From ab42aac448cd575c5129703b81b0bcc78480ee42 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 3 Apr 2020 09:51:26 +0100 Subject: [PATCH 260/737] objtool: arm64: Add unwind_hint support Provide unwind hint defines for arm64 and objtool hint decoding. 
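A rough usage sketch (hypothetical code, not part of this patch) of how the hints are meant to be consumed; they simply record the stack state objtool should assume at that point:

    SYM_CODE_START(example_vector)
            UNWIND_HINT_EMPTY               // stack state unknown: don't unwind from here
            // ...
    SYM_CODE_END(example_vector)

    SYM_CODE_START(example_exit)
            stp     x0, x1, [sp, #-16]!
            UNWIND_HINT_FUNC sp_offset=16   // frame base sits at sp + 16 (x0/x1 pushed)
            // ...
    SYM_CODE_END(example_exit)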
Signed-off-by: Julien Thierry --- arch/arm64/include/asm/unwind_hints.h | 27 +++++++++++++++++++++ tools/arch/arm64/include/asm/unwind_hints.h | 27 +++++++++++++++++++++ tools/objtool/arch/arm64/decode.c | 4 ++- 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/unwind_hints.h create mode 100644 tools/arch/arm64/include/asm/unwind_hints.h diff --git a/arch/arm64/include/asm/unwind_hints.h b/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..60f866e4e12c8 --- /dev/null +++ b/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/tools/arch/arm64/include/asm/unwind_hints.h b/tools/arch/arm64/include/asm/unwind_hints.h new file mode 100644 index 0000000000000..60f866e4e12c8 --- /dev/null +++ b/tools/arch/arm64/include/asm/unwind_hints.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_UNWIND_HINTS_H +#define __ASM_UNWIND_HINTS_H + +#include + +#define UNWIND_HINT_REG_UNDEFINED 0xff +#define UNWIND_HINT_REG_SP 31 + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=UNWIND_HINT_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_FUNC sp_offset=0 + UNWIND_HINT sp_reg=UNWIND_HINT_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL +.endm + +.macro UNWIND_HINT_REGS base=UNWIND_HINT_REG_SP offset=0 + UNWIND_HINT sp_reg=\base sp_offset=\offset type=UNWIND_HINT_TYPE_REGS +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_UNWIND_HINTS_H */ diff --git a/tools/objtool/arch/arm64/decode.c b/tools/objtool/arch/arm64/decode.c index 01c470c3324ee..3e0193682eba6 100644 --- a/tools/objtool/arch/arm64/decode.c +++ b/tools/objtool/arch/arm64/decode.c @@ -4,7 +4,10 @@ #include #include +#include + #include +#include #include #include @@ -181,7 +184,6 @@ int arch_decode_hint_reg(u8 sp_reg, int *base) return 0; } -} static struct stack_op *arm_make_store_op(enum aarch64_insn_register base, enum aarch64_insn_register reg, From 8fd6e6b01852ca86addf80d83978b9ddbe00234a Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 20 Jan 2021 11:58:40 +0100 Subject: [PATCH 261/737] arm64: Annotate ASM symbols with unknown stack state Some assembly symbols contain code that might be executed with an unspecified stack state (e.g. invalid stack pointer, no stackframe, ...). Annotate those symbol with UNWIND_HINT_EMPTY to let objtool be aware of them. 
Signed-off-by: Julien Thierry --- arch/arm64/include/asm/assembler.h | 2 ++ arch/arm64/kernel/cpu-reset.S | 2 ++ arch/arm64/kernel/efi-entry.S | 2 ++ arch/arm64/kernel/entry.S | 5 +++++ arch/arm64/kernel/head.S | 13 +++++++++++++ arch/arm64/kernel/hibernate-asm.S | 4 ++++ arch/arm64/kernel/relocate_kernel.S | 3 ++- arch/arm64/kernel/sleep.S | 3 +++ arch/arm64/kvm/hyp/hyp-entry.S | 2 ++ 9 files changed, 35 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 011e681a23366..f507feb045e35 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -22,6 +22,7 @@ #include #include #include +#include .macro save_and_disable_daif, flags mrs \flags, daif @@ -150,6 +151,7 @@ lr .req x30 // link register */ .macro ventry label .align 7 + UNWIND_HINT_EMPTY b \label .endm diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 37721eb6f9a14..fcbf7bde6b2c9 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -10,6 +10,7 @@ #include #include #include +#include #include .text @@ -30,6 +31,7 @@ * flat identity mapping. */ SYM_CODE_START(__cpu_soft_restart) + UNWIND_HINT_EMPTY /* Clear sctlr_el1 flags. */ mrs x12, sctlr_el1 mov_q x13, SCTLR_ELx_FLAGS diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 0073b24b5d25e..e8f930a43820f 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -9,10 +9,12 @@ #include #include +#include __INIT SYM_CODE_START(efi_enter_kernel) + UNWIND_HINT_EMPTY /* * efi_pe_entry() will have copied the kernel image if necessary and we * end up here with device tree address in x1 and the kernel entry diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ae21bf576f1dc..89b25990638fc 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include /* * Context tracking and irqflag tracing need to instrument transitions between @@ -62,6 +63,7 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 + UNWIND_HINT_EMPTY .Lventry_start\@: .if \el == 0 /* @@ -829,6 +831,7 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 + UNWIND_HINT_EMPTY 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry @@ -930,10 +933,12 @@ SYM_CODE_START_NOALIGN(tramp_vectors) SYM_CODE_END(tramp_vectors) SYM_CODE_START_LOCAL(tramp_exit_native) + UNWIND_HINT_EMPTY tramp_exit SYM_CODE_END(tramp_exit_native) SYM_CODE_START_LOCAL(tramp_exit_compat) + UNWIND_HINT_EMPTY tramp_exit 32 SYM_CODE_END(tramp_exit_compat) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 3882c2e881957..e5c82863bee4a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "efi-header.S" @@ -63,6 +64,7 @@ * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ SYM_DATA_LOCAL(efi_nop, efi_signature_nop) // special NOP to identity as PE/COFF executable + UNWIND_HINT_EMPTY b primary_entry // branch to kernel start, magic SYM_DATA_LOCAL(_zero_reserved, .quad 0) // Image load offset from start of RAM, little-endian SYM_DATA_START_LOCAL(_arm64_common_header) @@ -111,6 +113,7 @@ SYM_CODE_END(primary_entry) * Preserve the arguments passed by the bootloader in x0 .. 
x3 */ SYM_CODE_START_LOCAL(preserve_boot_args) + UNWIND_HINT_EMPTY mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of @@ -262,6 +265,7 @@ SYM_CODE_END(preserve_boot_args) * been enabled */ SYM_CODE_START_LOCAL(__create_page_tables) + UNWIND_HINT_EMPTY mov x28, lr /* @@ -491,6 +495,7 @@ EXPORT_SYMBOL(kimage_vaddr) * booted in EL1 or EL2 respectively. */ SYM_CODE_START(init_kernel_el) + UNWIND_HINT_EMPTY msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 @@ -658,6 +663,7 @@ SYM_CODE_END(init_kernel_el) * in w0. See arch/arm64/include/asm/virt.h for more info. */ SYM_CODE_START_LOCAL(set_cpu_boot_mode_flag) + UNWIND_HINT_EMPTY adr_l x1, __boot_cpu_mode cmp w0, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -701,6 +707,7 @@ SYM_DATA_END(__early_cpu_boot_status) * cores are held until we're ready for them to initialise. */ SYM_CODE_START(secondary_holding_pen) + UNWIND_HINT_EMPTY bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 @@ -719,6 +726,7 @@ SYM_CODE_END(secondary_holding_pen) * be used where CPUs are brought online dynamically by the kernel. */ SYM_CODE_START(secondary_entry) + UNWIND_HINT_EMPTY bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag b secondary_startup @@ -737,6 +745,7 @@ SYM_CODE_START_LOCAL(secondary_startup) SYM_CODE_END(secondary_startup) SYM_CODE_START_LOCAL(__secondary_switched) + UNWIND_HINT_EMPTY adr_l x5, vectors msr vbar_el1, x5 isb @@ -760,6 +769,7 @@ SYM_CODE_START_LOCAL(__secondary_switched) SYM_CODE_END(__secondary_switched) SYM_CODE_START_LOCAL(__secondary_too_slow) + UNWIND_HINT_EMPTY wfe wfi b __secondary_too_slow @@ -796,6 +806,7 @@ SYM_CODE_END(__secondary_too_slow) * If it isn't, park the CPU */ SYM_CODE_START(__enable_mmu) + UNWIND_HINT_EMPTY mrs x2, ID_AA64MMFR0_EL1 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN @@ -824,6 +835,7 @@ SYM_CODE_START(__enable_mmu) SYM_CODE_END(__enable_mmu) SYM_CODE_START_LOCAL(__cpu_secondary_check52bitva) + UNWIND_HINT_EMPTY #ifdef CONFIG_ARM64_VA_BITS_52 ldr_l x0, vabits_actual cmp x0, #52 @@ -855,6 +867,7 @@ SYM_CODE_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE SYM_CODE_START_LOCAL(__relocate_kernel) + UNWIND_HINT_EMPTY /* * Iterate over each entry in the relocation table, and apply the * relocations in place. diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 8ccca660034e4..8ce1bfe86546b 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -66,6 +67,7 @@ */ .pushsection ".hibernate_exit.text", "ax" SYM_CODE_START(swsusp_arch_suspend_exit) + UNWIND_HINT_EMPTY /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page @@ -120,12 +122,14 @@ SYM_CODE_END(swsusp_arch_suspend_exit) * x24: The physical address of __hyp_stub_vectors */ SYM_CODE_START_LOCAL(el1_sync) + UNWIND_HINT_EMPTY msr vbar_el2, x24 eret SYM_CODE_END(el1_sync) .macro invalid_vector label SYM_CODE_START_LOCAL(\label) + UNWIND_HINT_EMPTY b \label SYM_CODE_END(\label) .endm diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index a4fbfd8d92550..fa4ddf37a24cf 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -13,6 +13,7 @@ #include #include #include +#include /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. 
@@ -27,7 +28,7 @@ * during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) - + UNWIND_HINT_EMPTY /* Setup the list loop variables. */ mov x18, x2 /* x18 = dtb address */ mov x17, x1 /* x17 = kimage_start */ diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index dc777ca4c0406..f0df495b68cdd 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -4,6 +4,7 @@ #include #include #include +#include .text /* @@ -100,6 +101,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) + UNWIND_HINT_EMPTY bl init_kernel_el bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ @@ -112,6 +114,7 @@ SYM_CODE_END(cpu_resume) .popsection SYM_CODE_START(_cpu_resume) + UNWIND_HINT_EMPTY mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index bc06243cf4225..497f69819395b 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -149,6 +149,7 @@ SYM_CODE_END(\label) .macro valid_vect target .align 7 + UNWIND_HINT_EMPTY 661: esb stp x0, x1, [sp, #-16]! @@ -160,6 +161,7 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 + UNWIND_HINT_EMPTY 661: nop stp x0, x1, [sp, #-16]! From 30fc7c06c230f6c0783c8726e533027e20fb20b1 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 20 Jan 2021 14:09:56 +0100 Subject: [PATCH 262/737] arm64: entry: Annotate valid stack in kernel entry When taking an exception/interrupt, add unwind hints to indicate from which point the stack pointer is known to be valid. Signed-off-by: Julien Thierry foo --- arch/arm64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 89b25990638fc..7190f1628b900 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -92,6 +92,7 @@ tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + UNWIND_HINT_FUNC sp_offset=PT_REGS_SIZE b el\()\el\()_\label 0: @@ -124,6 +125,7 @@ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif + UNWIND_HINT_FUNC sp_offset=PT_REGS_SIZE b el\()\el\()_\label .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm From 8cbf112477bddc97698cad70a1a49c51c9ca6552 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 20 Jan 2021 14:20:47 +0100 Subject: [PATCH 263/737] arm64: entry: Add annotation when switching to/from the irq stack Handling of interrupts is done on a separate stack. The stack switch can confuse objtool. When starting interrupt handling, the only valid assumption is that the stack pointer points to a valid address. After handling an interrupt, all that is known is that the stack pointer points to a pt_regs frame. Signed-off-by: Julien Thierry --- arch/arm64/kernel/entry.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7190f1628b900..11e7e297d7c60 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -460,6 +460,13 @@ SYM_CODE_END(__swpan_exit_el0) #endif 9998: + /* + * The irq stack might either have no content or already contain a + * pt_regs frame. Objtool currently does not support instructions that + * can have different stack states, so lets pretend we always have + * a clean stack. 
+ */ + UNWIND_HINT_FUNC .endm /* @@ -470,6 +477,8 @@ SYM_CODE_END(__swpan_exit_el0) .macro irq_stack_exit mov sp, x19 scs_load_current + /* Switch back to the stack that had the PT regs */ + UNWIND_HINT_REGS .endm /* GPRs used by entry code */ From 14c51c0ca59998e32c13dc820792413404768efd Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 20 Jan 2021 14:37:48 +0100 Subject: [PATCH 264/737] arm64: entry: Annotate code switching to tasks Whether returning to userland or creating a new task, sp is pointing to a pt_regs frame. Signed-off-by: Julien Thierry --- arch/arm64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 11e7e297d7c60..9a79384ead2c2 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -756,6 +756,7 @@ SYM_CODE_END(el0_error) * "slow" syscall return path. */ SYM_CODE_START_LOCAL(ret_to_user) + UNWIND_HINT_REGS disable_daif gic_prio_kentry_setup tmp=x3 #ifdef CONFIG_TRACE_IRQFLAGS @@ -1049,6 +1050,7 @@ NOKPROBE(cpu_switch_to) * This is how we return from a fork. */ SYM_CODE_START(ret_from_fork) + UNWIND_HINT_REGS bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20 From ce817dda03038e54eca050a990af712812c17578 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 20 Jan 2021 16:12:11 +0100 Subject: [PATCH 265/737] arm64: kvm: Annotate stack state for guest enter/exit code Symbol __guest_enter is called from C code, with a valid stack pointer. Symbol __guest_exit is reached when resuming EL2 execution, and the previous stack pointer gets restored. Add adequate unwind hints. Signed-off-by: Julien Thierry --- arch/arm64/kvm/hyp/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 78334aea907af..34c35069a01f9 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -14,6 +14,7 @@ #include #include #include +#include .text @@ -21,6 +22,7 @@ * u64 __guest_enter(struct kvm_vcpu *vcpu); */ SYM_CODE_START(__guest_enter) + UNWIND_HINT_FUNC // x0: vcpu // x1-x17: clobbered by macros // x29: guest context @@ -104,6 +106,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x1: vcpu // x2-x29,lr: vcpu regs // vcpu x0-x1 on the stack + UNWIND_HINT_FUNC sp_offset=16 add x1, x1, #VCPU_CONTEXT From 395a165d08de326ea0cefb3eda8334e6b152855b Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Mon, 3 May 2021 17:43:13 -0700 Subject: [PATCH 266/737] arm64: implement live patching It's my understanding that the two pieces of work required to enable live patching on arm are in flight upstream; - Reliable stack traces as implemented by Madhavan T. Venkataraman [1] - Objtool as implemented by Julien Thierry [2] This is the remaining part required to enable live patching on arm. Based on work by Torsten Duwe [3] Allocate a task flag used to represent the patch pending state for the task. Also implement generic functions klp_arch_set_pc() & klp_get_ftrace_location(). In klp_arch_set_pc() it is sufficient to set regs->pc as in ftrace_common_return() the return address is loaded from the stack. ldr x9, [sp, #S_PC] ret x9 In klp_get_ftrace_location() it is necessary to advance the address by AARCH64_INSN_SIZE (4) to point to the BL in the callsite as 2 nops were placed at the start of the function, one to be patched to save the LR and another to be patched to branch to the ftrace call, and klp_get_ftrace_location() is expected to return the address of the BL. 
It may also be necessary to advance the address by another AARCH64_INSN_SIZE if CONFIG_ARM64_BTI_KERNEL is enabled due to the instruction placed at the branch target to satisfy BTI, Signed-off-by: Suraj Jitindar Singh [1] https://lkml.org/lkml/2021/5/26/1212 [2] https://lkml.org/lkml/2021/3/3/1135 [3] https://lkml.org/lkml/2018/10/26/536 --- arch/arm64/Kconfig | 3 +++ arch/arm64/include/asm/livepatch.h | 40 ++++++++++++++++++++++++++++ arch/arm64/include/asm/thread_info.h | 4 ++- arch/arm64/kernel/entry.S | 3 ++- arch/arm64/kernel/signal.c | 4 +++ 5 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/livepatch.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e5313b2745de0..8177fb77a8948 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -171,6 +171,7 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_LIVEPATCH select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS @@ -2011,3 +2012,5 @@ source "arch/arm64/kvm/Kconfig" if CRYPTO source "arch/arm64/crypto/Kconfig" endif + +source "kernel/livepatch/Kconfig" diff --git a/arch/arm64/include/asm/livepatch.h b/arch/arm64/include/asm/livepatch.h new file mode 100644 index 0000000000000..9bbd18774680b --- /dev/null +++ b/arch/arm64/include/asm/livepatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * livepatch.h - arm64-specific Kernel Live Patching Core + */ +#ifndef _ASM_ARM64_LIVEPATCH_H +#define _ASM_ARM64_LIVEPATCH_H + +#include + +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->pc = ip; +} + +/* + * klp_get_ftrace_location is expected to return the address of the BL to the + * relevant ftrace handler in the callsite. The location of this can vary based + * on several compilation options. + * CONFIG_DYNAMIC_FTRACE_WITH_REGS + * - Inserts 2 nops on function entry the second of which is the BL + * referenced above. (See ftrace_init_nop() for the callsite sequence) + * (this is required by livepatch and must be selected) + * CONFIG_ARM64_BTI_KERNEL: + * - Inserts a hint(BTI C) on function entry if the function is called + * indirectly (to satisfy BTI requirements), which is inserted before + * the two nops from above. 
+ */ +#define klp_get_ftrace_location klp_get_ftrace_location +static inline unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + unsigned long addr = faddr + AARCH64_INSN_SIZE; + +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) + addr = ftrace_location_range(addr, addr + AARCH64_INSN_SIZE); +#endif + + return addr; +} + +#endif /* _ASM_ARM64_LIVEPATCH_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index cdcf307764aad..fda0458a98681 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -74,6 +74,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ #define TIF_SECCOMP 11 /* syscall secure computing */ #define TIF_SYSCALL_EMU 12 /* syscall emulation active */ +#define TIF_PATCH_PENDING 13 /* pending live patching update */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 #define TIF_RESTORE_SIGMASK 20 @@ -100,11 +101,12 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_PATCH_PENDING) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9a79384ead2c2..9b6a50ff7337f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -763,7 +763,8 @@ SYM_CODE_START_LOCAL(ret_to_user) bl trace_hardirqs_off #endif ldr x19, [tsk, #TSK_TI_FLAGS] - and x2, x19, #_TIF_WORK_MASK + movz x2, #_TIF_WORK_MASK + and x2, x19, x2 cbnz x2, work_pending finish_ret_to_user: user_enter_irqoff diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b6fbbd527dd79..7820d496d16b2 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -938,6 +939,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, (void __user *)NULL, current); } + if (thread_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); From 6e9c74d92fd06f3d65001efc52d8d1f820648253 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Sep 2021 17:17:27 +0200 Subject: [PATCH 267/737] sched: Simplify wake_up_*idle*() Simplify and make wake_up_if_idle() more robust, also don't iterate the whole machine with preempt_disable() in it's caller: wake_up_all_idle_cpus(). This prepares for another wake_up_if_idle() user that needs a full do_idle() cycle. 
Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/core.c | 14 +++++--------- kernel/smp.c | 6 +++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d6dd14cfd261..49f1ae8fd3236 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2639,15 +2639,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); diff --git a/kernel/smp.c b/kernel/smp.c index b0684b4c111e9..0e13d65e348db 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -952,14 +952,14 @@ void wake_up_all_idle_cpus(void) { int cpu; - preempt_disable(); + cpus_read_lock(); for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) continue; wake_up_if_idle(cpu); } - preempt_enable(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); From 20b31fb77caae68412fd78d5207b724e2dd27a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Sep 2021 17:17:28 +0200 Subject: [PATCH 268/737] sched,livepatch: Use wake_up_if_idle() Make sure to prod idle CPUs so they call klp_update_patch_state(). Signed-off-by: Peter Zijlstra (Intel) --- kernel/livepatch/transition.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index b04b87a4e0a7b..f4195d78db29d 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -416,8 +416,11 @@ void klp_try_complete_transition(void) for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { - if (!klp_try_switch_task(task)) + if (!klp_try_switch_task(task)) { complete = false; + /* Make idle task go through the main loop. */ + wake_up_if_idle(cpu); + } } else if (task->patch_state != klp_target_state) { /* offline idle tasks can be switched immediately */ clear_tsk_thread_flag(task, TIF_PATCH_PENDING); From b0d28f114a38213f875c0f5ccdc5577db3acae55 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 15 Oct 2021 13:56:40 -0700 Subject: [PATCH 269/737] ARM64: kvm: vgic-v3-sr: Bug when trying to read invalid APRs There are 4 interrupt controller active priorities group [0-1] registers which are read through __vgic_v3_read_ap0rn() and __vgic_v3_read_ap1rn(). When these functions are passed an argument which isn't 0-3 they fall through to what ever happens to be after them in memory. To avoid this BUG() in the case where an invalid argument is passed. 
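To spell out the hazard (simplified sketch only, assuming the vgic register helpers used in this file; not the literal driver code): unreachable() is a promise to the compiler, not a runtime check.

    static u32 read_ap0rn_sketch(int n)
    {
            u32 val = 0;

            switch (n) {
            case 0:
                    val = read_gicreg(ICH_AP0R0_EL2);
                    break;
            /* ... cases 1-3 elided ... */
            default:
                    /*
                     * unreachable() tells the compiler that n can never be
                     * outside 0-3, so it may emit no code here at all and a
                     * bad n falls through into whatever happens to follow in
                     * memory. BUG() always emits a trapping instruction, so a
                     * bad argument fails loudly and predictably instead.
                     */
                    BUG();
            }

            return val;
    }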
Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 452f4cacd6743..b47aac452fe18 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -164,7 +164,7 @@ static u32 __vgic_v3_read_ap0rn(int n) val = read_gicreg(ICH_AP0R3_EL2); break; default: - unreachable(); + BUG(); } return val; @@ -188,7 +188,7 @@ static u32 __vgic_v3_read_ap1rn(int n) val = read_gicreg(ICH_AP1R3_EL2); break; default: - unreachable(); + BUG(); } return val; From 22b73d2390b2866a11fa66ff1eafdfb4e534b482 Mon Sep 17 00:00:00 2001 From: Ethan Chen Date: Tue, 26 Oct 2021 21:53:17 +0000 Subject: [PATCH 270/737] efa: update to 1.14.1 Signed-off-by: Ethan Chen --- drivers/amazon/net/efa/config.h | 241 ++------- drivers/amazon/net/efa/efa-abi.h | 42 -- drivers/amazon/net/efa/efa.h | 88 +--- drivers/amazon/net/efa/efa_admin_cmds_defs.h | 25 +- drivers/amazon/net/efa/efa_admin_defs.h | 4 +- drivers/amazon/net/efa/efa_com.c | 33 +- drivers/amazon/net/efa/efa_com_cmd.c | 2 - drivers/amazon/net/efa/efa_com_cmd.h | 4 - drivers/amazon/net/efa/efa_gdr.c | 110 ++-- drivers/amazon/net/efa/efa_gdr.h | 20 +- drivers/amazon/net/efa/efa_main.c | 247 +-------- drivers/amazon/net/efa/efa_sysfs.c | 10 +- drivers/amazon/net/efa/efa_verbs.c | 505 ++++--------------- drivers/amazon/net/efa/kcompat.h | 16 +- drivers/amazon/net/efa/nv-p2p.h | 439 ++++++++++++++++ 15 files changed, 736 insertions(+), 1050 deletions(-) create mode 100644 drivers/amazon/net/efa/nv-p2p.h diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h index 127e3a9f218e3..53a5eb2eeaecb 100644 --- a/drivers/amazon/net/efa/config.h +++ b/drivers/amazon/net/efa/config.h @@ -1,218 +1,45 @@ -/* src/config.h. Generated from config.h.in by configure. */ -/* src/config.h.in. Generated from configure.ac by autoheader. 
*/ - -/* have ah core allocation */ +#define HAVE_UMEM_SCATTERLIST_IF 1 +#define HAVE_CREATE_CQ_ATTR 1 +#define HAVE_CREATE_AH_RDMA_ATTR 1 +#define HAVE_DEV_PARENT 1 +#define HAVE_POST_CONST_WR 1 +#define HAVE_MAX_SEND_RCV_SGE 1 +#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 +#define HAVE_IB_DEV_OPS 1 +#define HAVE_SG_DMA_PAGE_ITER 1 +#define HAVE_PD_CORE_ALLOCATION 1 +#define HAVE_UCONTEXT_CORE_ALLOCATION 1 +#define HAVE_NO_KVERBS_DRIVERS 1 +#define HAVE_UDATA_TO_DRV_CONTEXT 1 +#define HAVE_SAFE_IB_ALLOC_DEVICE 1 #define HAVE_AH_CORE_ALLOCATION 1 - -/* destroy_ah has return code again */ -#define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 - -/* have device ops alloc_pd without ucontext */ #define HAVE_ALLOC_PD_NO_UCONTEXT 1 - -/* atomic64_fetch_inc exists */ -#define HAVE_ATOMIC64_FETCH_INC 1 - -/* have bitfield.h */ -#define HAVE_BITFIELD_H 1 - -/* have core mmap xarray */ -#define HAVE_CORE_MMAP_XA 1 - -/* have cq core allocation */ -#define HAVE_CQ_CORE_ALLOCATION 1 - -/* rdma_ah_init_attr exists */ -#define HAVE_CREATE_AH_INIT_ATTR 1 - -/* create_ah doesn't have udata */ -/* #undef HAVE_CREATE_AH_NO_UDATA */ - -/* create_ah has rdma_attr */ -#define HAVE_CREATE_AH_RDMA_ATTR 1 - -/* create_ah has udata */ -/* #undef HAVE_CREATE_AH_UDATA */ - -/* create_cq has attr param */ -#define HAVE_CREATE_CQ_ATTR 1 - -/* have device ops create_cq without ucontext */ -/* #undef HAVE_CREATE_CQ_NO_UCONTEXT */ - -/* create/destroy_ah has flags */ -/* #undef HAVE_CREATE_DESTROY_AH_FLAGS */ - -/* have device ops dealloc pd has udata */ -/* #undef HAVE_DEALLOC_PD_UDATA */ - -/* dealloc_pd has udata and return code */ -#define HAVE_DEALLOC_PD_UDATA_RC 1 - -/* have device ops dereg mr udata */ #define HAVE_DEREG_MR_UDATA 1 - -/* have device ops destroy cq udata */ #define HAVE_DESTROY_CQ_UDATA 1 - -/* have device ops destroy qp udata */ #define HAVE_DESTROY_QP_UDATA 1 - -/* dev has parent field */ -#define HAVE_DEV_PARENT 1 - -/* driver_id field exists */ -/* #undef HAVE_DRIVER_ID */ - -/* efa gdr enabled */ -/* #undef HAVE_EFA_GDR */ - -/* get_port_immutable exists */ -#define HAVE_GET_PORT_IMMUTABLE 1 - -/* have hw_stats */ -#define HAVE_HW_STATS 1 - -/* have ibdev print */ -#define HAVE_IBDEV_PRINT 1 - -/* have ibdev ratelimited print */ -#define HAVE_IBDEV_PRINT_RATELIMITED 1 - -/* IB_ACCESS_OPTIONAL exists */ -#define HAVE_IB_ACCESS_OPTIONAL 1 - -/* ib_device_ops has common fields */ +#define HAVE_UPSTREAM_EFA 1 +#define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 #define HAVE_IB_DEVICE_OPS_COMMON 1 - -/* struct ib_device_ops exists */ -#define HAVE_IB_DEV_OPS 1 - -/* destroy_cq has return code again */ -#define HAVE_IB_INT_DESTROY_CQ 1 - -/* have ib_is_udata_cleared */ -#define HAVE_IB_IS_UDATA_CLEARED 1 - -/* ib_modify_qp_is_ok has four params */ -#define HAVE_IB_MODIFY_QP_IS_OK_FOUR_PARAMS 1 - -/* ib_mr has length field */ -#define HAVE_IB_MR_LENGTH 1 - -/* ib_mtu_int_to_enum exists */ -#define HAVE_IB_MTU_INT_TO_ENUM 1 - -/* have ib port phys state link up */ +#define HAVE_CQ_CORE_ALLOCATION 1 #define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 - -/* have driver qpt */ -#define HAVE_IB_QPT_DRIVER 1 - -/* query_device has udata */ -#define HAVE_IB_QUERY_DEVICE_UDATA 1 - -/* ib_register_device has dma_device param */ -#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 - -/* ib_register_device has name param */ -/* #undef HAVE_IB_REGISTER_DEVICE_NAME_PARAM */ - -/* ib_register_device has two params */ -/* #undef HAVE_IB_REGISTER_DEVICE_TWO_PARAMS */ - -/* ib_umem_find_single_pg_size exists */ -#define 
HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 - -/* have ib_umem_get device param */ -#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 - -/* ib_umem_get has no dmasync parameter */ -#define HAVE_IB_UMEM_GET_NO_DMASYNC 1 - -/* ib_umem_get has udata */ -/* #undef HAVE_IB_UMEM_GET_UDATA */ - -/* ib_umem_num_dma_blocks exists */ -#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 - -/* have void destroy cq */ -/* #undef HAVE_IB_VOID_DESTROY_CQ */ - -/* have kvzalloc */ #define HAVE_KVZALLOC 1 - -/* ib_device_attr has max_send_recv_sge */ -#define HAVE_MAX_SEND_RCV_SGE 1 - -/* have no kverbs drivers */ -#define HAVE_NO_KVERBS_DRIVERS 1 - -/* have pci_irq_vector */ -#define HAVE_PCI_IRQ_VECTOR 1 - -/* have amazon pci id */ +#define HAVE_IBDEV_PRINT_RATELIMITED 1 +#define HAVE_IBDEV_PRINT 1 +#define HAVE_IB_QPT_DRIVER 1 +#define HAVE_IB_IS_UDATA_CLEARED 1 +#define HAVE_IB_MR_LENGTH 1 #define HAVE_PCI_VENDOR_ID_AMAZON 1 - -/* have pd core allocation */ -#define HAVE_PD_CORE_ALLOCATION 1 - -/* have const wr in post verbs */ -#define HAVE_POST_CONST_WR 1 - -/* have unspecified node type */ +#define HAVE_IB_UMEM_GET_NO_DMASYNC 1 +#define HAVE_CORE_MMAP_XA 1 #define HAVE_RDMA_NODE_UNSPECIFIED 1 - -/* rdma_umem_for_each_dma_block exists */ +#define HAVE_BITFIELD_H 1 +#define HAVE_IB_UMEM_GET_DEVICE_PARAM 1 +#define HAVE_IB_ACCESS_OPTIONAL 1 +#define HAVE_CREATE_AH_INIT_ATTR 1 +#define HAVE_ATOMIC64_FETCH_INC 1 +#define HAVE_DEALLOC_PD_UDATA_RC 1 +#define HAVE_AH_CORE_ALLOCATION_DESTROY_RC 1 +#define HAVE_IB_INT_DESTROY_CQ 1 #define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 - -/* rdma_user_mmap_io exists */ -/* #undef HAVE_RDMA_USER_MMAP_IO */ - -/* safe ib_alloc_device exists */ -#define HAVE_SAFE_IB_ALLOC_DEVICE 1 - -/* for_each_sg_dma_page exists */ -#define HAVE_SG_DMA_PAGE_ITER 1 - -/* have ucontext core allocation */ -#define HAVE_UCONTEXT_CORE_ALLOCATION 1 - -/* rdma_udata_to_drv_context exists */ -#define HAVE_UDATA_TO_DRV_CONTEXT 1 - -/* ib umem scatterlist exists */ -#define HAVE_UMEM_SCATTERLIST_IF 1 - -/* have upstream efa */ -#define HAVE_UPSTREAM_EFA 1 - -/* have uverbs command header fix */ -/* #undef HAVE_UVERBS_CMD_HDR_FIX */ - -/* uverbs_cmd_mask is not needed */ -/* #undef HAVE_UVERBS_CMD_MASK_NOT_NEEDED */ - -/* Name of package */ -#define PACKAGE "efa" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "efa" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "efa 1.11.1" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "efa" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "1.11.1" - -/* Version number of package */ -#define VERSION "1.11.1" +#define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 +#define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 diff --git a/drivers/amazon/net/efa/efa-abi.h b/drivers/amazon/net/efa/efa-abi.h index fee906bc28bb6..f89fbb5b1e8d5 100644 --- a/drivers/amazon/net/efa/efa-abi.h +++ b/drivers/amazon/net/efa/efa-abi.h @@ -118,46 +118,4 @@ struct efa_ibv_ex_query_device_resp { __u32 device_caps; }; -#ifdef HAVE_CUSTOM_COMMANDS -/******************************************************************************/ -/* EFA CUSTOM COMMANDS */ -/******************************************************************************/ -#include - -enum efa_everbs_commands { - EFA_EVERBS_CMD_GET_AH = 1, - EFA_EVERBS_CMD_GET_EX_DEV_ATTRS, - EFA_EVERBS_CMD_MAX, -}; - -struct efa_everbs_get_ah { - __u32 comp_mask; - __u16 pdn; - __u8 reserved_30[2]; - __aligned_u64 response; - __aligned_u64 user_handle; - __u8 gid[16]; -}; - -struct efa_everbs_get_ah_resp { - __u32 comp_mask; - __u16 efa_address_handle; - __u8 reserved_30[2]; -}; - -struct efa_everbs_get_ex_dev_attrs { - __u32 comp_mask; - __u8 reserved_20[4]; - __aligned_u64 response; -}; - -struct efa_everbs_get_ex_dev_attrs_resp { - __u32 comp_mask; - __u32 max_sq_wr; - __u32 max_rq_wr; - __u16 max_sq_sge; - __u16 max_rq_sge; -}; -#endif /* HAVE_CUSTOM_COMMANDS */ - #endif /* EFA_ABI_USER_H */ diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h index ec19c69a8b81c..019cbd632710b 100644 --- a/drivers/amazon/net/efa/efa.h +++ b/drivers/amazon/net/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_H_ @@ -8,10 +8,6 @@ #include "kcompat.h" #include -#ifdef HAVE_CUSTOM_COMMANDS -#include -#include -#endif #include #include #include @@ -33,8 +29,7 @@ struct efa_irq { irq_handler_t handler; void *data; - int cpu; - u32 vector; + u32 irqn; cpumask_t affinity_hint_mask; char name[EFA_IRQNAME_SIZE]; }; @@ -64,23 +59,9 @@ struct efa_dev { u64 db_bar_addr; u64 db_bar_len; -#ifndef HAVE_PCI_IRQ_VECTOR - struct msix_entry admin_msix_entry; -#else int admin_msix_vector_idx; -#endif struct efa_irq admin_irq; -#ifndef HAVE_CREATE_AH_UDATA - struct list_head efa_ah_list; - /* Protects efa_ah_list */ - struct mutex ah_list_lock; -#endif -#ifdef HAVE_CUSTOM_COMMANDS - struct device *everbs_dev; - struct cdev cdev; -#endif - struct efa_stats stats; }; @@ -147,23 +128,17 @@ struct efa_ah { u8 id[EFA_GID_SIZE]; }; -#ifdef HAVE_IB_QUERY_DEVICE_UDATA int efa_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *udata); -#else -#warning deprecated api -int efa_query_device(struct ib_device *ibdev, - struct ib_device_attr *props); -#endif -int efa_query_port(struct ib_device *ibdev, u8 port, +int efa_query_port(struct ib_device *ibdev, port_t port, struct ib_port_attr *props); int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); -int efa_query_gid(struct ib_device *ibdev, u8 port, int index, +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, union ib_gid *gid); -int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index, +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, u16 *pkey); #ifdef HAVE_ALLOC_PD_NO_UCONTEXT int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); @@ -201,13 +176,8 @@ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); #else int efa_destroy_cq(struct ib_cq *ibcq); #endif -#ifdef HAVE_CREATE_CQ_ATTR int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -#else -#warning deprecated api -int efa_create_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); -#endif #ifndef HAVE_CQ_CORE_ALLOCATION #ifdef HAVE_CREATE_CQ_NO_UCONTEXT struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, @@ -218,11 +188,6 @@ struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *ibucontext, struct ib_udata *udata); -#else -struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, int entries, - int vector, - struct ib_ucontext *ibucontext, - struct ib_udata *udata); #endif #endif struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, @@ -233,10 +198,8 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); #else int efa_dereg_mr(struct ib_mr *ibmr); #endif -#ifdef HAVE_GET_PORT_IMMUTABLE -int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, struct ib_port_immutable *immutable); -#endif int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata); #ifdef HAVE_UCONTEXT_CORE_ALLOCATION void efa_dealloc_ucontext(struct ib_ucontext *ibucontext); @@ -254,11 +217,7 @@ int efa_create_ah(struct ib_ah *ibah, #ifdef HAVE_CREATE_AH_INIT_ATTR struct rdma_ah_init_attr *init_attr, #else -#ifdef HAVE_CREATE_AH_RDMA_ATTR struct rdma_ah_attr *ah_attr, -#else - struct ib_ah_attr *ah_attr, -#endif u32 flags, #endif struct ib_udata *udata); @@ -272,13 +231,6 @@ struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, struct ib_ah 
*efa_kzalloc_ah(struct ib_pd *ibpd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); -#elif defined(HAVE_CREATE_AH_UDATA) -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct ib_ah_attr *ah_attr, - struct ib_udata *udata); -#else -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct ib_ah_attr *ah_attr); #endif #endif #ifdef HAVE_AH_CORE_ALLOCATION_DESTROY_RC @@ -318,26 +270,14 @@ struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc); int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, - u8 port_num); -#ifdef HAVE_HW_STATS -struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num); -int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, - u8 port_num, int index); -#endif - -#ifdef HAVE_CUSTOM_COMMANDS -#ifndef HAVE_CREATE_AH_UDATA -ssize_t efa_everbs_cmd_get_ah(struct efa_dev *dev, - const char __user *buf, - int in_len, - int out_len); -#endif -#ifndef HAVE_IB_QUERY_DEVICE_UDATA -ssize_t efa_everbs_cmd_get_ex_dev_attrs(struct efa_dev *dev, - const char __user *buf, - int in_len, - int out_len); -#endif + port_t port_num); +#ifdef HAVE_SPLIT_STATS_ALLOC +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num); +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev); +#else +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num); #endif +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index); #endif /* _EFA_H_ */ diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h index b199e4ac6cf9e..fa38b34eddb88 100644 --- a/drivers/amazon/net/efa/efa_admin_cmds_defs.h +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_ADMIN_CMDS_H_ @@ -161,8 +161,8 @@ struct efa_admin_create_qp_resp { u32 qp_handle; /* - * QP number in the given EFA virtual device. Least-significant bits - * (as needed according to max_qp) carry unique QP ID + * QP number in the given EFA virtual device. Least-significant bits (as + * needed according to max_qp) carry unique QP ID */ u16 qp_num; @@ -465,7 +465,7 @@ struct efa_admin_create_cq_cmd { /* * number of sub cqs - must be equal to sub_cqs_per_cq of queue - * attributes. + * attributes. 
*/ u16 num_sub_cqs; @@ -563,12 +563,8 @@ struct efa_admin_acq_get_stats_resp { }; struct efa_admin_get_set_feature_common_desc { - /* - * 1:0 : select - 0x1 - current value; 0x3 - default - * value - * 7:3 : reserved3 - MBZ - */ - u8 flags; + /* MBZ */ + u8 reserved0; /* as appears in efa_admin_aq_feature_id */ u8 feature_id; @@ -823,12 +819,6 @@ enum efa_admin_aenq_group { EFA_ADMIN_AENQ_GROUPS_NUM = 5, }; -enum efa_admin_aenq_notification_syndrom { - EFA_ADMIN_SUSPEND = 0, - EFA_ADMIN_RESUME = 1, - EFA_ADMIN_UPDATE_HINTS = 2, -}; - struct efa_admin_mmio_req_read_less_resp { u16 req_id; @@ -909,9 +899,6 @@ struct efa_admin_host_info { #define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) #define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) -/* get_set_feature_common_desc */ -#define EFA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK GENMASK(1, 0) - /* feature_device_attr_desc */ #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) diff --git a/drivers/amazon/net/efa/efa_admin_defs.h b/drivers/amazon/net/efa/efa_admin_defs.h index 29d53ed63b3ed..78ff9389ae256 100644 --- a/drivers/amazon/net/efa/efa_admin_defs.h +++ b/drivers/amazon/net/efa/efa_admin_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_ADMIN_H_ @@ -82,7 +82,7 @@ struct efa_admin_acq_common_desc { /* * indicates to the driver which AQ entry has been consumed by the - * device and could be reused + * device and could be reused */ u16 sq_head_indx; }; diff --git a/drivers/amazon/net/efa/efa_com.c b/drivers/amazon/net/efa/efa_com.c index e8a0a0c3a90dc..22793b3959593 100644 --- a/drivers/amazon/net/efa/efa_com.c +++ b/drivers/amazon/net/efa/efa_com.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_com.h" @@ -20,9 +20,6 @@ #define EFA_CTRL_MINOR 0 #define EFA_CTRL_SUB_MINOR 1 -#define EFA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x))) -#define EFA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32)) - enum efa_cmd_status { EFA_CMD_SUBMITTED, EFA_CMD_COMPLETED, @@ -33,8 +30,6 @@ struct efa_comp_ctx { struct efa_admin_acq_entry *user_cqe; u32 comp_size; enum efa_cmd_status status; - /* status from the device */ - u8 comp_status; u8 cmd_opcode; u8 occupied; }; @@ -140,8 +135,8 @@ static int efa_com_admin_init_sq(struct efa_com_dev *edev) sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF); - addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(sq->dma_addr); - addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(sq->dma_addr); + addr_high = upper_32_bits(sq->dma_addr); + addr_low = lower_32_bits(sq->dma_addr); writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF); writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF); @@ -174,8 +169,8 @@ static int efa_com_admin_init_cq(struct efa_com_dev *edev) cq->cc = 0; cq->phase = 1; - addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(cq->dma_addr); - addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(cq->dma_addr); + addr_high = upper_32_bits(cq->dma_addr); + addr_low = lower_32_bits(cq->dma_addr); writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF); writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF); @@ -215,8 +210,8 @@ static int efa_com_admin_init_aenq(struct efa_com_dev *edev, aenq->cc = 0; aenq->phase = 1; - addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); - addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); + addr_low = lower_32_bits(aenq->dma_addr); + addr_high = upper_32_bits(aenq->dma_addr); writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF); writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF); @@ -422,9 +417,7 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a } comp_ctx->status = EFA_CMD_COMPLETED; - comp_ctx->comp_status = cqe->acq_common_descriptor.status; - if (comp_ctx->user_cqe) - memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); + memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) complete(&comp_ctx->wait_event); @@ -522,7 +515,7 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c msleep(aq->poll_interval); } - err = efa_com_comp_status_to_errno(comp_ctx->comp_status); + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); out: efa_com_put_comp_ctx(aq, comp_ctx); return err; @@ -570,7 +563,7 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *com goto out; } - err = efa_com_comp_status_to_errno(comp_ctx->comp_status); + err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); out: efa_com_put_comp_ctx(aq, comp_ctx); return err; @@ -642,8 +635,8 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq, aq->efa_dev, "Failed to process command %s (opcode %u) comp_status %d err %d\n", efa_com_cmd_str(cmd->aq_common_descriptor.opcode), - cmd->aq_common_descriptor.opcode, comp_ctx->comp_status, - err); + cmd->aq_common_descriptor.opcode, + comp_ctx->user_cqe->acq_common_descriptor.status, err); atomic64_inc(&aq->stats.cmd_err); } @@ -796,7 +789,7 @@ int efa_com_admin_init(struct efa_com_dev *edev, * This method goes over the admin completion queue and wakes up * all the pending threads that wait on the commands wait event. * - * @note: Should be called after MSI-X interrupt. 
+ * Note: Should be called after MSI-X interrupt. */ void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev) { diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c index d2727cddf9703..315ab45612ad3 100644 --- a/drivers/amazon/net/efa/efa_com_cmd.c +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -726,7 +726,6 @@ int efa_com_dealloc_uar(struct efa_com_dev *edev, return 0; } -#ifdef HAVE_HW_STATS int efa_com_get_stats(struct efa_com_dev *edev, struct efa_com_get_stats_params *params, union efa_com_get_stats_result *result) @@ -778,4 +777,3 @@ int efa_com_get_stats(struct efa_com_dev *edev, return 0; } -#endif diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h index e572146af876e..eea4ebfbe6ec3 100644 --- a/drivers/amazon/net/efa/efa_com_cmd.h +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -224,7 +224,6 @@ struct efa_com_dealloc_uar_params { u16 uarn; }; -#ifdef HAVE_HW_STATS struct efa_com_get_stats_params { /* see enum efa_admin_get_stats_type */ u8 type; @@ -260,7 +259,6 @@ union efa_com_get_stats_result { struct efa_com_messages_stats messages_stats; struct efa_com_rdma_read_stats rdma_read_stats; }; -#endif void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); int efa_com_create_qp(struct efa_com_dev *edev, @@ -310,10 +308,8 @@ int efa_com_alloc_uar(struct efa_com_dev *edev, struct efa_com_alloc_uar_result *result); int efa_com_dealloc_uar(struct efa_com_dev *edev, struct efa_com_dealloc_uar_params *params); -#ifdef HAVE_HW_STATS int efa_com_get_stats(struct efa_com_dev *edev, struct efa_com_get_stats_params *params, union efa_com_get_stats_result *result); -#endif #endif /* _EFA_COM_CMD_H_ */ diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c index 5ec34afb6571a..2bcd4bec66704 100644 --- a/drivers/amazon/net/efa/efa_gdr.c +++ b/drivers/amazon/net/efa/efa_gdr.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ +#include + #include "efa_gdr.h" #define GPU_PAGE_SHIFT 16 @@ -50,6 +52,54 @@ static struct efa_nvmem *ticket_to_nvmem(u64 ticket) return NULL; } +int nvmem_get_fp(struct efa_nvmem *nvmem) +{ + nvmem->ops.get_pages = symbol_get(nvidia_p2p_get_pages); + if (!nvmem->ops.get_pages) + goto err_out; + + nvmem->ops.put_pages = symbol_get(nvidia_p2p_put_pages); + if (!nvmem->ops.put_pages) + goto err_put_get_pages; + + nvmem->ops.dma_map_pages = symbol_get(nvidia_p2p_dma_map_pages); + if (!nvmem->ops.dma_map_pages) + goto err_put_put_pages; + + nvmem->ops.dma_unmap_pages = symbol_get(nvidia_p2p_dma_unmap_pages); + if (!nvmem->ops.dma_unmap_pages) + goto err_put_dma_map_pages; + + return 0; + +err_put_dma_map_pages: + symbol_put(nvidia_p2p_dma_map_pages); +err_put_put_pages: + symbol_put(nvidia_p2p_put_pages); +err_put_get_pages: + symbol_put(nvidia_p2p_get_pages); +err_out: + return -EINVAL; +} + +void nvmem_put_fp(void) +{ + symbol_put(nvidia_p2p_dma_unmap_pages); + symbol_put(nvidia_p2p_dma_map_pages); + symbol_put(nvidia_p2p_put_pages); + symbol_put(nvidia_p2p_get_pages); +} + +static void nvmem_release(struct efa_dev *dev, struct efa_nvmem *nvmem) +{ + if (nvmem->dma_mapping) + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + + if (nvmem->pgtbl) + nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); +} + int nvmem_put(u64 ticket, bool in_cb) { struct efa_com_dereg_mr_params params = {}; @@ -76,14 +126,16 @@ int nvmem_put(u64 ticket, bool in_cb) nvmem->needs_dereg = false; } - nvmem_release(dev, nvmem, in_cb); - - /* Dereg is the last nvmem consumer, delete the ticket */ - if (!in_cb) { - list_del(&nvmem->list); - kfree(nvmem); + if (in_cb) { + mutex_unlock(&nvmem_list_lock); + return 0; } + + list_del(&nvmem->list); mutex_unlock(&nvmem_list_lock); + nvmem_release(dev, nvmem); + nvmem_put_fp(); + kfree(nvmem); return 0; } @@ -99,7 +151,7 @@ static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem, { int err; - err = nvidia_p2p_get_pages(0, 0, addr, size, &nvmem->pgtbl, + err = nvmem->ops.get_pages(0, 0, addr, size, &nvmem->pgtbl, nvmem_free_cb, (void *)nvmem->ticket); if (err) { ibdev_dbg(&dev->ibdev, "nvidia_p2p_get_pages failed %d\n", err); @@ -109,7 +161,7 @@ static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem, if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(nvmem->pgtbl)) { ibdev_dbg(&dev->ibdev, "Incompatible page table version %#08x\n", nvmem->pgtbl->version); - nvidia_p2p_put_pages(0, 0, addr, nvmem->pgtbl); + nvmem->ops.put_pages(0, 0, addr, nvmem->pgtbl); nvmem->pgtbl = NULL; return -EINVAL; } @@ -121,7 +173,7 @@ static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem) { int err; - err = nvidia_p2p_dma_map_pages(dev->pdev, nvmem->pgtbl, + err = nvmem->ops.dma_map_pages(dev->pdev, nvmem->pgtbl, &nvmem->dma_mapping); if (err) { ibdev_dbg(&dev->ibdev, "nvidia_p2p_dma_map_pages failed %d\n", @@ -132,7 +184,7 @@ static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem) if (!NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(nvmem->dma_mapping)) { ibdev_dbg(&dev->ibdev, "Incompatible DMA mapping version %#08x\n", nvmem->dma_mapping->version); - nvidia_p2p_dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, nvmem->dma_mapping); nvmem->dma_mapping = NULL; return -EINVAL; @@ -160,10 +212,15 @@ struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, pinsz = start + length - virt_start; nvmem->virt_start = virt_start; + err = 
nvmem_get_fp(nvmem); + if (err) + /* Nvidia module is not loaded */ + goto err_free; + err = nvmem_get_pages(dev, nvmem, virt_start, pinsz); if (err) { /* Most likely cpu pages */ - goto err_free; + goto err_put_fp; } err = nvmem_dma_map(dev, nvmem); @@ -181,9 +238,11 @@ struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, return nvmem; err_unmap: - nvidia_p2p_dma_unmap_pages(dev->pdev, nvmem->pgtbl, nvmem->dma_mapping); + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, nvmem->dma_mapping); err_put: - nvidia_p2p_put_pages(0, 0, start, nvmem->pgtbl); + nvmem->ops.put_pages(0, 0, start, nvmem->pgtbl); +err_put_fp: + nvmem_put_fp(); err_free: kfree(nvmem); return NULL; @@ -200,26 +259,3 @@ int nvmem_to_page_list(struct efa_dev *dev, struct efa_nvmem *nvmem, return 0; } - -void nvmem_release(struct efa_dev *dev, struct efa_nvmem *nvmem, bool in_cb) -{ - if (in_cb) { - if (nvmem->dma_mapping) { - nvidia_p2p_free_dma_mapping(nvmem->dma_mapping); - nvmem->dma_mapping = NULL; - } - - if (nvmem->pgtbl) { - nvidia_p2p_free_page_table(nvmem->pgtbl); - nvmem->pgtbl = NULL; - } - } else { - if (nvmem->dma_mapping) - nvidia_p2p_dma_unmap_pages(dev->pdev, nvmem->pgtbl, - nvmem->dma_mapping); - - if (nvmem->pgtbl) - nvidia_p2p_put_pages(0, 0, nvmem->virt_start, - nvmem->pgtbl); - } -} diff --git a/drivers/amazon/net/efa/efa_gdr.h b/drivers/amazon/net/efa/efa_gdr.h index 497307c6305da..faa743c09c945 100644 --- a/drivers/amazon/net/efa/efa_gdr.h +++ b/drivers/amazon/net/efa/efa_gdr.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_GDR_H_ @@ -9,8 +9,23 @@ #include "efa.h" #include "nv-p2p.h" +struct efa_nvmem_ops { + int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + u64 length, struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + int (*dma_map_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + int (*put_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + int (*dma_unmap_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); +}; + struct efa_nvmem { struct efa_dev *dev; + struct efa_nvmem_ops ops; struct nvidia_p2p_page_table *pgtbl; struct nvidia_p2p_dma_mapping *dma_mapping; u64 virt_start; @@ -21,11 +36,12 @@ struct efa_nvmem { }; void nvmem_init(void); +int nvmem_get_fp(struct efa_nvmem *nvmem); +void nvmem_put_fp(void); struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, u64 length, unsigned int *pgsz); int nvmem_to_page_list(struct efa_dev *dev, struct efa_nvmem *nvmem, u64 *page_list); int nvmem_put(u64 ticket, bool in_cb); -void nvmem_release(struct efa_dev *dev, struct efa_nvmem *nvmem, bool in_cb); #endif /* _EFA_GDR_H_ */ diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c index 2ca15287819d3..1d1b94e800cd9 100644 --- a/drivers/amazon/net/efa/efa_main.c +++ b/drivers/amazon/net/efa/efa_main.c @@ -30,7 +30,7 @@ static const struct pci_device_id efa_pci_tbl[] = { }; #define DRV_MODULE_VER_MAJOR 1 -#define DRV_MODULE_VER_MINOR 11 +#define DRV_MODULE_VER_MINOR 14 #define DRV_MODULE_VER_SUBMINOR 1 #ifndef DRV_MODULE_VERSION @@ -61,17 +61,6 @@ MODULE_INFO(gdr, "Y"); 
(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) -#ifdef HAVE_CUSTOM_COMMANDS -#define EFA_EVERBS_DEVICE_NAME "efa_everbs" -#define EFA_EVERBS_MAX_DEVICES 64 - -static struct class *efa_everbs_class; -static unsigned int efa_everbs_major; - -static int efa_everbs_dev_init(struct efa_dev *dev, int devnum); -static void efa_everbs_dev_destroy(struct efa_dev *dev); -#endif - /* This handler will called for unknown event group or unimplemented handlers */ static void unimplemented_aenq_handler(void *data, struct efa_admin_aenq_entry *aenq_e) @@ -121,8 +110,7 @@ static int efa_request_mgmnt_irq(struct efa_dev *dev) int err; irq = &dev->admin_irq; - err = request_irq(irq->vector, irq->handler, 0, irq->name, - irq->data); + err = request_irq(irq->irqn, irq->handler, 0, irq->name, irq->data); if (err) { dev_err(&dev->pdev->dev, "Failed to request admin irq (%d)\n", err); @@ -130,8 +118,8 @@ static int efa_request_mgmnt_irq(struct efa_dev *dev) } dev_dbg(&dev->pdev->dev, "Set affinity hint of mgmnt irq to %*pbl (irq vector: %d)\n", - nr_cpumask_bits, &irq->affinity_hint_mask, irq->vector); - irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask); + nr_cpumask_bits, &irq->affinity_hint_mask, irq->irqn); + irq_set_affinity_hint(irq->irqn, &irq->affinity_hint_mask); return 0; } @@ -144,19 +132,13 @@ static void efa_setup_mgmnt_irq(struct efa_dev *dev) "efa-mgmnt@pci:%s", pci_name(dev->pdev)); dev->admin_irq.handler = efa_intr_msix_mgmnt; dev->admin_irq.data = dev; - dev->admin_irq.vector = -#ifndef HAVE_PCI_IRQ_VECTOR - dev->admin_msix_entry.vector; -#else + dev->admin_irq.irqn = pci_irq_vector(dev->pdev, dev->admin_msix_vector_idx); -#endif cpu = cpumask_first(cpu_online_mask); - dev->admin_irq.cpu = cpu; cpumask_set_cpu(cpu, &dev->admin_irq.affinity_hint_mask); - dev_info(&dev->pdev->dev, "Setup irq:0x%p vector:%d name:%s\n", - &dev->admin_irq, - dev->admin_irq.vector, + dev_info(&dev->pdev->dev, "Setup irq:%d name:%s\n", + dev->admin_irq.irqn, dev->admin_irq.name); } @@ -165,8 +147,8 @@ static void efa_free_mgmnt_irq(struct efa_dev *dev) struct efa_irq *irq; irq = &dev->admin_irq; - irq_set_affinity_hint(irq->vector, NULL); - free_irq(irq->vector, irq->data); + irq_set_affinity_hint(irq->irqn, NULL); + free_irq(irq->irqn, irq->data); } static int efa_set_mgmnt_irq(struct efa_dev *dev) @@ -292,7 +274,12 @@ static const struct ib_device_ops efa_dev_ops = { .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, #endif +#ifdef HAVE_SPLIT_STATS_ALLOC + .alloc_hw_port_stats = efa_alloc_hw_port_stats, + .alloc_hw_device_stats = efa_alloc_hw_device_stats, +#else .alloc_hw_stats = efa_alloc_hw_stats, +#endif #ifdef HAVE_PD_CORE_ALLOCATION .alloc_pd = efa_alloc_pd, #else @@ -368,16 +355,8 @@ static int efa_ib_device_add(struct efa_dev *dev) { struct efa_com_get_hw_hints_result hw_hints; struct pci_dev *pdev = dev->pdev; -#ifdef HAVE_CUSTOM_COMMANDS - int devnum; -#endif int err; -#ifdef HAVE_CREATE_AH_NO_UDATA - INIT_LIST_HEAD(&dev->efa_ah_list); - mutex_init(&dev->ah_list_lock); -#endif - efa_stats_init(dev); err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr); @@ -431,7 +410,7 @@ static int efa_ib_device_add(struct efa_dev *dev) (1ull << IB_USER_VERBS_CMD_DESTROY_AH); #endif -#if defined(HAVE_IB_QUERY_DEVICE_UDATA) && !defined(HAVE_UVERBS_CMD_MASK_NOT_NEEDED) +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED dev->ibdev.uverbs_ex_cmd_mask = (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); #endif @@ -446,9 +425,7 @@ static int efa_ib_device_add(struct 
efa_dev *dev) #ifdef HAVE_IB_DEV_OPS ib_set_device_ops(&dev->ibdev, &efa_dev_ops); #else -#ifdef HAVE_HW_STATS dev->ibdev.alloc_hw_stats = efa_alloc_hw_stats; -#endif dev->ibdev.alloc_pd = efa_kzalloc_pd; dev->ibdev.alloc_ucontext = efa_kzalloc_ucontext; dev->ibdev.create_ah = efa_kzalloc_ah; @@ -461,13 +438,9 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.destroy_cq = efa_destroy_cq; dev->ibdev.destroy_qp = efa_destroy_qp; dev->ibdev.get_dma_mr = efa_get_dma_mr; -#ifdef HAVE_HW_STATS dev->ibdev.get_hw_stats = efa_get_hw_stats; -#endif dev->ibdev.get_link_layer = efa_port_link_layer; -#ifdef HAVE_GET_PORT_IMMUTABLE dev->ibdev.get_port_immutable = efa_get_port_immutable; -#endif dev->ibdev.mmap = efa_mmap; dev->ibdev.modify_qp = efa_modify_qp; dev->ibdev.poll_cq = efa_poll_cq; @@ -499,25 +472,8 @@ static int efa_ib_device_add(struct efa_dev *dev) ibdev_info(&dev->ibdev, "IB device registered\n"); -#ifdef HAVE_CUSTOM_COMMANDS - if (sscanf(dev_name(&dev->ibdev.dev), "efa_%d\n", &devnum) != 1) { - err = -EINVAL; - goto err_unregister_ibdev; - } - - err = efa_everbs_dev_init(dev, devnum); - if (err) - goto err_unregister_ibdev; - ibdev_info(&dev->ibdev, "Created everbs device %s%d\n", - EFA_EVERBS_DEVICE_NAME, devnum); -#endif - return 0; -#ifdef HAVE_CUSTOM_COMMANDS -err_unregister_ibdev: - ib_unregister_device(&dev->ibdev); -#endif err_release_doorbell_bar: efa_release_doorbell_bar(dev); return err; @@ -525,13 +481,7 @@ static int efa_ib_device_add(struct efa_dev *dev) static void efa_ib_device_remove(struct efa_dev *dev) { -#ifdef HAVE_CREATE_AH_NO_UDATA - WARN_ON(!list_empty(&dev->efa_ah_list)); -#endif efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); -#ifdef HAVE_CUSTOM_COMMANDS - efa_everbs_dev_destroy(dev); -#endif ibdev_info(&dev->ibdev, "Unregister ib device\n"); ib_unregister_device(&dev->ibdev); efa_release_doorbell_bar(dev); @@ -539,11 +489,7 @@ static void efa_ib_device_remove(struct efa_dev *dev) static void efa_disable_msix(struct efa_dev *dev) { -#ifndef HAVE_PCI_IRQ_VECTOR - pci_disable_msix(dev->pdev); -#else pci_free_irq_vectors(dev->pdev); -#endif } static int efa_enable_msix(struct efa_dev *dev) @@ -555,16 +501,9 @@ static int efa_enable_msix(struct efa_dev *dev) dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", msix_vecs); -#ifndef HAVE_PCI_IRQ_VECTOR - dev->admin_msix_entry.entry = EFA_MGMNT_MSIX_VEC_IDX; - irq_num = pci_enable_msix_range(dev->pdev, - &dev->admin_msix_entry, - msix_vecs, msix_vecs); -#else dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX; irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs, msix_vecs, PCI_IRQ_MSIX); -#endif if (irq_num < 0) { dev_err(&dev->pdev->dev, "Failed to enable MSI-X. 
irq_num %d\n", @@ -573,6 +512,7 @@ static int efa_enable_msix(struct efa_dev *dev) } if (irq_num != msix_vecs) { + efa_disable_msix(dev); dev_err(&dev->pdev->dev, "Allocated %d MSI-X (out of %d requested)\n", irq_num, msix_vecs); @@ -682,13 +622,8 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) if (err) goto err_reg_read_destroy; -#ifdef HAVE_PCI_IRQ_VECTOR edev->aq.msix_vector_idx = dev->admin_msix_vector_idx; edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx; -#else - edev->aq.msix_vector_idx = dev->admin_msix_entry.entry; - edev->aenq.msix_vector_idx = dev->admin_msix_entry.entry; -#endif err = efa_set_mgmnt_irq(dev); if (err) @@ -775,150 +710,16 @@ static struct pci_driver efa_pci_driver = { .remove = efa_remove, }; -#ifdef HAVE_CUSTOM_COMMANDS -static ssize_t -(*efa_everbs_cmd_table[EFA_EVERBS_CMD_MAX])(struct efa_dev *dev, - const char __user *buf, int in_len, - int out_len) = { -#ifdef HAVE_CREATE_AH_NO_UDATA - [EFA_EVERBS_CMD_GET_AH] = efa_everbs_cmd_get_ah, -#endif -#ifndef HAVE_IB_QUERY_DEVICE_UDATA - [EFA_EVERBS_CMD_GET_EX_DEV_ATTRS] = efa_everbs_cmd_get_ex_dev_attrs, -#endif -}; - -static ssize_t efa_everbs_write(struct file *filp, - const char __user *buf, - size_t count, - loff_t *pos) -{ - struct efa_dev *dev = filp->private_data; - struct ib_uverbs_cmd_hdr hdr; - - if (count < sizeof(hdr)) - return -EINVAL; - - if (copy_from_user(&hdr, buf, sizeof(hdr))) - return -EFAULT; - - if (hdr.in_words * 4 != count) - return -EINVAL; - - if (hdr.command >= ARRAY_SIZE(efa_everbs_cmd_table) || - !efa_everbs_cmd_table[hdr.command]) - return -EINVAL; - - return efa_everbs_cmd_table[hdr.command](dev, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); -} - -static int efa_everbs_open(struct inode *inode, struct file *filp) -{ - struct efa_dev *dev; - - dev = container_of(inode->i_cdev, struct efa_dev, cdev); - - filp->private_data = dev; - return nonseekable_open(inode, filp); -} - -static int efa_everbs_close(struct inode *inode, struct file *filp) -{ - return 0; -} - -static char *efa_everbs_devnode(struct device *dev, umode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); -} - -static const struct file_operations efa_everbs_fops = { - .owner = THIS_MODULE, - .write = efa_everbs_write, - .open = efa_everbs_open, - .release = efa_everbs_close, - .llseek = no_llseek, -}; - -static int efa_everbs_dev_init(struct efa_dev *dev, int devnum) -{ - dev_t devno = MKDEV(efa_everbs_major, devnum); - int err; - - WARN_ON(devnum >= EFA_EVERBS_MAX_DEVICES); - cdev_init(&dev->cdev, &efa_everbs_fops); - dev->cdev.owner = THIS_MODULE; - - err = cdev_add(&dev->cdev, devno, 1); - if (err) - return err; - - dev->everbs_dev = device_create(efa_everbs_class, - &dev->pdev->dev, - devno, - dev, - EFA_EVERBS_DEVICE_NAME "%d", - devnum); - if (IS_ERR(dev->everbs_dev)) { - err = PTR_ERR(dev->everbs_dev); - ibdev_err(&dev->ibdev, "Failed to create device: %s%d [%d]\n", - EFA_EVERBS_DEVICE_NAME, devnum, err); - goto err; - } - - return 0; - -err: - cdev_del(&dev->cdev); - return err; -} - -static void efa_everbs_dev_destroy(struct efa_dev *dev) -{ - if (!dev->everbs_dev) - return; - - device_destroy(efa_everbs_class, dev->cdev.dev); - cdev_del(&dev->cdev); - dev->everbs_dev = NULL; -} -#endif /* HAVE_CUSTOM_COMMANDS */ - static int __init efa_init(void) { -#ifdef HAVE_CUSTOM_COMMANDS - dev_t dev; -#endif int err; pr_info("%s\n", version); -#ifdef HAVE_CUSTOM_COMMANDS - err = alloc_chrdev_region(&dev, 0, EFA_EVERBS_MAX_DEVICES, - 
EFA_EVERBS_DEVICE_NAME); - if (err) { - pr_err("Couldn't allocate efa_everbs device numbers\n"); - goto out; - } - efa_everbs_major = MAJOR(dev); - - efa_everbs_class = class_create(THIS_MODULE, EFA_EVERBS_DEVICE_NAME); - if (IS_ERR(efa_everbs_class)) { - err = PTR_ERR(efa_everbs_class); - pr_err("Couldn't create efa_everbs class\n"); - goto err_class; - } - efa_everbs_class->devnode = efa_everbs_devnode; -#endif err = pci_register_driver(&efa_pci_driver); if (err) { pr_err("Couldn't register efa driver\n"); - goto err_register; + return err; } #ifdef HAVE_EFA_GDR @@ -926,25 +727,11 @@ static int __init efa_init(void) #endif return 0; - -err_register: -#ifdef HAVE_CUSTOM_COMMANDS - class_destroy(efa_everbs_class); -err_class: - unregister_chrdev_region(dev, EFA_EVERBS_MAX_DEVICES); -out: -#endif - return err; } static void __exit efa_exit(void) { pci_unregister_driver(&efa_pci_driver); -#ifdef HAVE_CUSTOM_COMMANDS - class_destroy(efa_everbs_class); - unregister_chrdev_region(MKDEV(efa_everbs_major, 0), - EFA_EVERBS_MAX_DEVICES); -#endif } module_init(efa_init); diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c index 67e3fe9e80ac2..c9026c9cfff0f 100644 --- a/drivers/amazon/net/efa/efa_sysfs.c +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_sysfs.h" @@ -10,9 +10,17 @@ #include #ifdef HAVE_EFA_GDR +#include "efa_gdr.h" + static ssize_t gdr_show(struct device *dev, struct device_attribute *attr, char *buf) { + struct efa_nvmem dummynv = {}; + + if (nvmem_get_fp(&dummynv)) + return sprintf(buf, "0\n"); + nvmem_put_fp(); + return sprintf(buf, "1\n"); } diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c index f50a6736c96c8..b27c2f5b0fd2a 100644 --- a/drivers/amazon/net/efa/efa_verbs.c +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "kcompat.h" @@ -40,8 +40,21 @@ struct efa_user_mmap_entry { u8 mmap_flag; }; -#ifdef HAVE_HW_STATS -#define EFA_DEFINE_STATS(op) \ +#define EFA_DEFINE_DEVICE_STATS(op) \ + op(EFA_SUBMITTED_CMDS, "submitted_cmds") \ + op(EFA_COMPLETED_CMDS, "completed_cmds") \ + op(EFA_CMDS_ERR, "cmds_err") \ + op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \ + op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ + op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ + op(EFA_CREATE_QP_ERR, "create_qp_err") \ + op(EFA_CREATE_CQ_ERR, "create_cq_err") \ + op(EFA_REG_MR_ERR, "reg_mr_err") \ + op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ + op(EFA_CREATE_AH_ERR, "create_ah_err") \ + op(EFA_MMAP_ERR, "mmap_err") + +#define EFA_DEFINE_PORT_STATS(op) \ op(EFA_TX_BYTES, "tx_bytes") \ op(EFA_TX_PKTS, "tx_pkts") \ op(EFA_RX_BYTES, "rx_bytes") \ @@ -55,30 +68,25 @@ struct efa_user_mmap_entry { op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \ op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \ op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \ - op(EFA_SUBMITTED_CMDS, "submitted_cmds") \ - op(EFA_COMPLETED_CMDS, "completed_cmds") \ - op(EFA_CMDS_ERR, "cmds_err") \ - op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \ - op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ - op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ - op(EFA_CREATE_QP_ERR, "create_qp_err") \ - op(EFA_CREATE_CQ_ERR, "create_cq_err") \ - op(EFA_REG_MR_ERR, "reg_mr_err") \ - op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ - op(EFA_CREATE_AH_ERR, "create_ah_err") \ - op(EFA_MMAP_ERR, "mmap_err") #define EFA_STATS_ENUM(ename, name) ename, #define EFA_STATS_STR(ename, name) [ename] = name, -enum efa_hw_stats { - EFA_DEFINE_STATS(EFA_STATS_ENUM) +enum efa_hw_device_stats { + EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM) }; -static const char *const efa_stats_names[] = { - EFA_DEFINE_STATS(EFA_STATS_STR) +static const char *const efa_device_stats_names[] = { + EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR) +}; + +enum efa_hw_port_stats { + EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM) +}; + +static const char *const efa_port_stats_names[] = { + EFA_DEFINE_PORT_STATS(EFA_STATS_STR) }; -#endif #define EFA_CHUNK_PAYLOAD_SHIFT 12 #define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT) @@ -233,12 +241,6 @@ static int mmap_entry_validate(struct efa_ucontext *ucontext, return -EINVAL; } - if (vma->vm_flags & VM_EXEC) { - ibdev_dbg(ucontext->ibucontext.device, - "Mapping executable pages is not permitted\n"); - return -EPERM; - } - return 0; } @@ -274,21 +276,13 @@ rdma_user_mmap_entry_get(struct ib_ucontext *ibucontext, } #endif /* !defined (HAVE_CORE_MMAP_XA) */ -#ifdef HAVE_IB_QUERY_DEVICE_UDATA int efa_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *udata) -#else -int efa_query_device(struct ib_device *ibdev, - struct ib_device_attr *props) -#endif { struct efa_com_get_device_attr_result *dev_attr; -#ifdef HAVE_IB_QUERY_DEVICE_UDATA struct efa_ibv_ex_query_device_resp resp = {}; -#endif struct efa_dev *dev = to_edev(ibdev); -#ifdef HAVE_IB_QUERY_DEVICE_UDATA int err; if (udata && udata->inlen && @@ -297,7 +291,6 @@ int efa_query_device(struct ib_device *ibdev, "Incompatible ABI params, udata not cleared\n"); return -EINVAL; } -#endif dev_attr = &dev->dev_attr; @@ -325,7 +318,6 @@ int efa_query_device(struct ib_device *ibdev, props->max_sge_rd = dev_attr->max_wr_rdma_sge; props->max_pkeys = 1; -#ifdef HAVE_IB_QUERY_DEVICE_UDATA if (udata && udata->outlen) { resp.max_sq_sge = dev_attr->max_sq_sge; resp.max_rq_sge = dev_attr->max_rq_sge; @@ -347,12 +339,11 @@ int 
efa_query_device(struct ib_device *ibdev, return err; } } -#endif return 0; } -int efa_query_port(struct ib_device *ibdev, u8 port, +int efa_query_port(struct ib_device *ibdev, port_t port, struct ib_port_attr *props) { struct efa_dev *dev = to_edev(ibdev); @@ -365,13 +356,8 @@ int efa_query_port(struct ib_device *ibdev, u8 port, props->pkey_tbl_len = 1; props->active_speed = IB_SPEED_EDR; props->active_width = IB_WIDTH_4X; -#ifdef HAVE_IB_MTU_INT_TO_ENUM props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); -#else - props->max_mtu = IB_MTU_4096; - props->active_mtu = IB_MTU_4096; -#endif props->max_msg_sz = dev->dev_attr.mtu; props->max_vl_num = 1; @@ -429,7 +415,7 @@ int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, return 0; } -int efa_query_gid(struct ib_device *ibdev, u8 port, int index, +int efa_query_gid(struct ib_device *ibdev, port_t port, int index, union ib_gid *gid) { struct efa_dev *dev = to_edev(ibdev); @@ -439,7 +425,7 @@ int efa_query_gid(struct ib_device *ibdev, u8 port, int index, return 0; } -int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index, +int efa_query_pkey(struct ib_device *ibdev, port_t port, u16 index, u16 *pkey) { if (index > 0) @@ -481,12 +467,7 @@ int efa_alloc_pd(struct ib_pd *ibpd, #endif if (udata->inlen && -#ifdef HAVE_UVERBS_CMD_HDR_FIX !ib_is_udata_cleared(udata, 0, udata->inlen)) { -#else - /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ - !ib_is_udata_cleared(udata, 0, udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) { -#endif ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); err = -EINVAL; @@ -866,14 +847,8 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, } if (udata->inlen > sizeof(cmd) && -#ifdef HAVE_UVERBS_CMD_HDR_FIX !ib_is_udata_cleared(udata, sizeof(cmd), udata->inlen - sizeof(cmd))) { -#else - /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ - !ib_is_udata_cleared(udata, sizeof(cmd), - udata->inlen - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr))) { -#endif ibdev_dbg(&dev->ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; @@ -1177,12 +1152,7 @@ int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, #endif if (udata->inlen && -#ifdef HAVE_UVERBS_CMD_HDR_FIX !ib_is_udata_cleared(udata, 0, udata->inlen)) { -#else - /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ - !ib_is_udata_cleared(udata, 0, udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) { -#endif ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); return -EINVAL; @@ -1313,12 +1283,8 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, return 0; } -#ifdef HAVE_CREATE_CQ_ATTR int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata) -#else -int efa_create_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) -#endif { #ifdef HAVE_UDATA_TO_DRV_CONTEXT struct efa_ucontext *ucontext = rdma_udata_to_drv_context( @@ -1333,17 +1299,13 @@ int efa_create_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) struct efa_dev *dev = to_edev(ibdev); struct efa_ibv_create_cq cmd = {}; struct efa_cq *cq = to_ecq(ibcq); -#ifdef HAVE_CREATE_CQ_ATTR int entries = attr->cqe; -#endif int err; ibdev_dbg(ibdev, "create_cq entries %d\n", entries); -#ifdef HAVE_CREATE_CQ_ATTR if (attr->flags) return -EOPNOTSUPP; -#endif if (entries < 1 || entries > dev->dev_attr.max_cq_depth) { 
ibdev_dbg(ibdev, @@ -1369,14 +1331,8 @@ int efa_create_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) } if (udata->inlen > sizeof(cmd) && -#ifdef HAVE_UVERBS_CMD_HDR_FIX !ib_is_udata_cleared(udata, sizeof(cmd), udata->inlen - sizeof(cmd))) { -#else - /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ - !ib_is_udata_cleared(udata, sizeof(cmd), - udata->inlen - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr))) { -#endif ibdev_dbg(ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; @@ -1480,11 +1436,6 @@ struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *ibucontext, struct ib_udata *udata) -#else -struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, int entries, - int vector, - struct ib_ucontext *ibucontext, - struct ib_udata *udata) #endif { struct efa_dev *dev = to_edev(ibdev); @@ -1505,11 +1456,7 @@ struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, int entries, #endif cq->ibcq.device = ibdev; -#ifdef HAVE_CREATE_CQ_ATTR err = efa_create_cq(&cq->ibcq, attr, udata); -#else - err = efa_create_cq(&cq->ibcq, entries, udata); -#endif if (err) goto err_free_cq; @@ -1606,52 +1553,6 @@ static int umem_to_page_list(struct efa_dev *dev, return 0; } -#else -#warning deprecated api -static int umem_to_page_list(struct efa_dev *dev, - struct ib_umem *umem, - u64 *page_list, - u32 hp_cnt, - u8 hp_shift) -{ - u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); - struct ib_umem_chunk *chunk; - unsigned int page_idx = 0; - unsigned int pages_in_sg; - unsigned int hp_idx = 0; - unsigned int entry; - unsigned int i; - - ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", - hp_cnt, pages_in_hp); - - list_for_each_entry(chunk, &umem->chunk_list, list) { - for (entry = 0; entry < chunk->nents; entry++) { - if (sg_dma_len(&chunk->page_list[entry]) & ~PAGE_MASK) { - ibdev_dbg(&dev->ibdev, - "sg_dma_len[%u] does not divide by PAGE_SIZE[%lu]\n", - sg_dma_len(&chunk->page_list[entry]), - PAGE_SIZE); - return -EINVAL; - } - - pages_in_sg = sg_dma_len(&chunk->page_list[entry]) - >> PAGE_SHIFT; - for (i = 0; i < pages_in_sg; i++) { - if (page_idx % pages_in_hp == 0) { - page_list[hp_idx] = - sg_dma_address(&chunk->page_list[entry]) + - i * PAGE_SIZE; - hp_idx++; - } - - page_idx++; - } - } - } - - return 0; -} #endif static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt) @@ -2042,11 +1943,7 @@ static unsigned long efa_cont_pages(struct ib_umem *umem, u64 addr) { unsigned long max_page_shift = fls64(page_size_cap); -#ifndef HAVE_UMEM_SCATTERLIST_IF - struct ib_umem_chunk *chunk; -#else struct scatterlist *sg; -#endif u64 base = ~0, p = 0; unsigned long tmp; unsigned long m; @@ -2059,30 +1956,6 @@ static unsigned long efa_cont_pages(struct ib_umem *umem, m = find_first_bit(&tmp, BITS_PER_LONG); m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); -#ifndef HAVE_UMEM_SCATTERLIST_IF - list_for_each_entry(chunk, &umem->chunk_list, list) { - for (entry = 0; entry < chunk->nents; entry++) { - len = DIV_ROUND_UP(sg_dma_len(&chunk->page_list[entry]), - PAGE_SIZE); - pfn = sg_dma_address(&chunk->page_list[entry]) >> PAGE_SHIFT; - if (base + p != pfn) { - /* - * If either the offset or the new - * base are unaligned update m - */ - tmp = (unsigned long)(pfn | p); - if (!IS_ALIGNED(tmp, 1 << m)) - m = find_first_bit(&tmp, BITS_PER_LONG); - - base = pfn; - p = 0; - } - - p += len; - i += len; - } - } -#else for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = 
DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE); pfn = sg_dma_address(sg) >> PAGE_SHIFT; @@ -2102,7 +1975,6 @@ static unsigned long efa_cont_pages(struct ib_umem *umem, p += len; i += len; } -#endif if (i) m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); @@ -2136,12 +2008,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, #endif if (udata && udata->inlen && -#ifdef HAVE_UVERBS_CMD_HDR_FIX !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { -#else - /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ - !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen) - sizeof(struct ib_uverbs_cmd_hdr))) { -#endif ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); err = -EINVAL; @@ -2249,7 +2116,16 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, params.page_shift = order_base_2(pg_sz); #ifdef HAVE_IB_UMEM_NUM_DMA_BLOCKS +#ifdef HAVE_EFA_GDR + if (mr->umem) + params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); + else + params.page_num = DIV_ROUND_UP(length + + (virt_addr & (pg_sz - 1)), + pg_sz); +#else params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); +#endif #else params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)), pg_sz); @@ -2298,7 +2174,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, err_unmap: #ifdef HAVE_EFA_GDR if (mr->nvmem) - nvmem_release(dev, mr->nvmem, false); + nvmem_put(mr->nvmem->ticket, false); else ib_umem_release(mr->umem); #else @@ -2345,8 +2221,7 @@ int efa_dereg_mr(struct ib_mr *ibmr) return 0; } -#ifdef HAVE_GET_PORT_IMMUTABLE -int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, +int efa_get_port_immutable(struct ib_device *ibdev, port_t port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; @@ -2363,7 +2238,6 @@ int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -#endif static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) { @@ -2437,12 +2311,8 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) INIT_LIST_HEAD(&ucontext->pending_mmaps); #endif /* !defined(HAVE_CORE_MMAP_XA) */ -#ifdef HAVE_IB_QUERY_DEVICE_UDATA resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; -#endif -#ifndef HAVE_CREATE_AH_NO_UDATA resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; -#endif resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; resp.inline_buf_size = dev->dev_attr.inline_buf_size; resp.max_llq_size = dev->dev_attr.max_llq_size; @@ -2623,99 +2493,6 @@ int efa_mmap(struct ib_ucontext *ibucontext, return __efa_mmap(dev, ucontext, vma); } -#ifdef HAVE_CREATE_AH_NO_UDATA -struct efa_ah_id { - struct list_head list; - /* dest_addr */ - u8 id[EFA_GID_SIZE]; - unsigned int ref_count; - u16 ah; -}; - -static inline bool efa_ah_id_equal(u8 *id1, u8 *id2) -{ - return !memcmp(id1, id2, EFA_GID_SIZE); -} - -/* Must be called with dev->ah_list_lock held */ -static int efa_get_ah_id(struct efa_dev *dev, u8 *id, bool ref_update, u16 *ah) -{ - struct efa_ah_id *ah_id; - - list_for_each_entry(ah_id, &dev->efa_ah_list, list) { - if (efa_ah_id_equal(ah_id->id, id)) { - if (ref_update) - ah_id->ref_count++; - if (ah) - *ah = ah_id->ah; - return 0; - } - } - - return -EINVAL; -} - -static void efa_put_ah_id(struct efa_dev *dev, u8 *id) -{ - struct efa_ah_id *ah_id, *tmp; - - mutex_lock(&dev->ah_list_lock); - list_for_each_entry_safe(ah_id, tmp, &dev->efa_ah_list, list) { - if (efa_ah_id_equal(ah_id->id, id)) { - ah_id->ref_count--; - if 
(!ah_id->ref_count) { - list_del(&ah_id->list); - kfree(ah_id); - mutex_unlock(&dev->ah_list_lock); - return; - } - } - } - mutex_unlock(&dev->ah_list_lock); -} - -/* Must be called with dev->ah_list_lock held */ -static struct efa_ah_id *efa_create_ah_id(struct efa_dev *dev, u8 *id, u16 ah) -{ - struct efa_ah_id *ah_id; - - ah_id = kzalloc(sizeof(*ah_id), GFP_KERNEL); - if (!ah_id) - return NULL; - - memcpy(ah_id->id, id, sizeof(ah_id->id)); - ah_id->ref_count = 1; - ah_id->ah = ah; - - return ah_id; -} - -static int efa_add_ah_id(struct efa_dev *dev, u8 *id, u16 ah) -{ - struct efa_ah_id *ah_id; - int err; - - mutex_lock(&dev->ah_list_lock); - err = efa_get_ah_id(dev, id, true, NULL); - if (err) { - ah_id = efa_create_ah_id(dev, id, ah); - if (!ah_id) { - err = -ENOMEM; - goto err_unlock; - } - - list_add_tail(&ah_id->list, &dev->efa_ah_list); - } - mutex_unlock(&dev->ah_list_lock); - - return 0; - -err_unlock: - mutex_unlock(&dev->ah_list_lock); - return err; -} -#endif - static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) { struct efa_com_destroy_ah_params params = { @@ -2730,11 +2507,7 @@ int efa_create_ah(struct ib_ah *ibah, #ifdef HAVE_CREATE_AH_INIT_ATTR struct rdma_ah_init_attr *init_attr, #else -#ifdef HAVE_CREATE_AH_RDMA_ATTR struct rdma_ah_attr *ah_attr, -#else - struct ib_ah_attr *ah_attr, -#endif u32 flags, #endif struct ib_udata *udata) @@ -2744,9 +2517,7 @@ int efa_create_ah(struct ib_ah *ibah, #endif struct efa_dev *dev = to_edev(ibah->device); struct efa_com_create_ah_params params = {}; -#ifndef HAVE_CREATE_AH_NO_UDATA struct efa_ibv_create_ah_resp resp = {}; -#endif struct efa_com_create_ah_result result; struct efa_ah *ah = to_eah(ibah); int err; @@ -2764,7 +2535,6 @@ int efa_create_ah(struct ib_ah *ibah, } #endif -#ifndef HAVE_CREATE_AH_NO_UDATA #ifndef HAVE_NO_KVERBS_DRIVERS if (!udata) { ibdev_dbg(&dev->ibdev, "udata is NULL\n"); @@ -2774,17 +2544,11 @@ int efa_create_ah(struct ib_ah *ibah, #endif if (udata->inlen && -#ifdef HAVE_UVERBS_CMD_HDR_FIX !ib_is_udata_cleared(udata, 0, udata->inlen)) { -#else - /* WA for e093111ddb6c ("IB/core: Fix input len in multiple user verbs") */ - !ib_is_udata_cleared(udata, 0, udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) { -#endif ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); err = -EINVAL; goto err_out; } -#endif memcpy(params.dest_addr, ah_attr->grh.dgid.raw, sizeof(params.dest_addr)); @@ -2796,7 +2560,6 @@ int efa_create_ah(struct ib_ah *ibah, memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id)); ah->ah = result.ah; -#ifndef HAVE_CREATE_AH_NO_UDATA resp.efa_address_handle = result.ah; if (udata->outlen) { @@ -2808,13 +2571,6 @@ int efa_create_ah(struct ib_ah *ibah, goto err_destroy_ah; } } -#else - err = efa_add_ah_id(dev, ah_attr->grh.dgid.raw, result.ah); - if (err) { - ibdev_dbg(&dev->ibdev, "Failed to add AH id\n"); - goto err_destroy_ah; - } -#endif ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah); return 0; @@ -2836,13 +2592,6 @@ struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata) -#elif defined(HAVE_CREATE_AH_UDATA) -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct ib_ah_attr *ah_attr, - struct ib_udata *udata) -#else -struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, - struct ib_ah_attr *ah_attr) #endif { struct efa_ah *ah; @@ -2850,9 +2599,6 @@ struct ib_ah *efa_kzalloc_ah(struct ib_pd *ibpd, #ifndef HAVE_CREATE_DESTROY_AH_FLAGS u32 flags = 0; #endif -#ifdef HAVE_CREATE_AH_NO_UDATA - 
void *udata = NULL; -#endif ah = kzalloc(sizeof(*ah), GFP_KERNEL); if (!ah) @@ -2911,33 +2657,72 @@ int efa_destroy_ah(struct ib_ah *ibah) err = efa_ah_destroy(dev, ah); if (err) return err; -#ifdef HAVE_CREATE_AH_NO_UDATA - efa_put_ah_id(dev, ah->id); -#endif kfree(ah); return 0; #endif } -#ifdef HAVE_HW_STATS -struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) +#ifdef HAVE_SPLIT_STATS_ALLOC +struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, + port_t port_num) { - return rdma_alloc_hw_stats_struct(efa_stats_names, - ARRAY_SIZE(efa_stats_names), + return rdma_alloc_hw_stats_struct(efa_port_stats_names, + ARRAY_SIZE(efa_port_stats_names), RDMA_HW_STATS_DEFAULT_LIFESPAN); } -int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, - u8 port_num, int index) +struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev) +{ + return rdma_alloc_hw_stats_struct(efa_device_stats_names, + ARRAY_SIZE(efa_device_stats_names), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} +#else +struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num) +{ + if (port_num) + return rdma_alloc_hw_stats_struct(efa_port_stats_names, + ARRAY_SIZE(efa_port_stats_names), + RDMA_HW_STATS_DEFAULT_LIFESPAN); + else + return rdma_alloc_hw_stats_struct(efa_device_stats_names, + ARRAY_SIZE(efa_device_stats_names), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} +#endif + +static int efa_fill_device_stats(struct efa_dev *dev, + struct rdma_hw_stats *stats) +{ + struct efa_com_stats_admin *as = &dev->edev.aq.stats; + struct efa_stats *s = &dev->stats; + + stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd); + stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd); + stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err); + stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion); + + stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd); + stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err); + stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err); + stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err); + stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err); + stats->value[EFA_ALLOC_UCONTEXT_ERR] = + atomic64_read(&s->alloc_ucontext_err); + stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err); + stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err); + + return ARRAY_SIZE(efa_device_stats_names); +} + +static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, + port_t port_num) { struct efa_com_get_stats_params params = {}; union efa_com_get_stats_result result; - struct efa_dev *dev = to_edev(ibdev); struct efa_com_rdma_read_stats *rrs; struct efa_com_messages_stats *ms; struct efa_com_basic_stats *bs; - struct efa_com_stats_admin *as; - struct efa_stats *s; int err; params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL; @@ -2976,26 +2761,17 @@ int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err; stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes; - as = &dev->edev.aq.stats; - stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd); - stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd); - stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err); - stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion); - - s = &dev->stats; - stats->value[EFA_KEEP_ALIVE_RCVD] = 
atomic64_read(&s->keep_alive_rcvd); - stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err); - stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err); - stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err); - stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err); - stats->value[EFA_ALLOC_UCONTEXT_ERR] = - atomic64_read(&s->alloc_ucontext_err); - stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err); - stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err); + return ARRAY_SIZE(efa_port_stats_names); +} - return ARRAY_SIZE(efa_stats_names); +int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + port_t port_num, int index) +{ + if (port_num) + return efa_fill_port_stats(to_edev(ibdev), stats, port_num); + else + return efa_fill_device_stats(to_edev(ibdev), stats); } -#endif #ifndef HAVE_NO_KVERBS_DRIVERS #ifdef HAVE_POST_CONST_WR @@ -3058,83 +2834,8 @@ struct ib_mr *efa_get_dma_mr(struct ib_pd *ibpd, int acc) #endif enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev, - u8 port_num) + port_t port_num) { return IB_LINK_LAYER_UNSPECIFIED; } -#ifdef HAVE_CUSTOM_COMMANDS -#ifdef HAVE_CREATE_AH_NO_UDATA -ssize_t efa_everbs_cmd_get_ah(struct efa_dev *dev, - const char __user *buf, - int in_len, - int out_len) -{ - struct efa_everbs_get_ah_resp resp = {}; - struct efa_everbs_get_ah cmd = {}; - int err; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; - - if (cmd.comp_mask) { - ibdev_dbg(&dev->ibdev, - "Incompatible ABI params, unknown fields in udata\n"); - return -EINVAL; - } - - mutex_lock(&dev->ah_list_lock); - err = efa_get_ah_id(dev, cmd.gid, false, &resp.efa_address_handle); - mutex_unlock(&dev->ah_list_lock); - if (err) { - ibdev_dbg(&dev->ibdev, - "Couldn't find AH with specified GID\n"); - return err; - } - - if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, - sizeof(resp))) - return -EFAULT; - - return in_len; -} -#endif - -#ifndef HAVE_IB_QUERY_DEVICE_UDATA -ssize_t efa_everbs_cmd_get_ex_dev_attrs(struct efa_dev *dev, - const char __user *buf, - int in_len, - int out_len) -{ - struct efa_com_get_device_attr_result *dev_attr = &dev->dev_attr; - struct efa_everbs_get_ex_dev_attrs_resp resp = {}; - struct efa_everbs_get_ex_dev_attrs cmd = {}; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; - - if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_20)) { - ibdev_dbg(&dev->ibdev, - "Incompatible ABI params, unknown fields in udata\n"); - return -EINVAL; - } - - resp.max_sq_sge = dev_attr->max_sq_sge; - resp.max_rq_sge = dev_attr->max_rq_sge; - resp.max_sq_wr = dev_attr->max_sq_depth; - resp.max_rq_wr = dev_attr->max_rq_depth; - - if (copy_to_user((void __user *)(unsigned long)cmd.response, - &resp, sizeof(resp))) - return -EFAULT; - - return in_len; -} -#endif -#endif diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h index 60575885f33a2..d0887952d8c92 100644 --- a/drivers/amazon/net/efa/kcompat.h +++ b/drivers/amazon/net/efa/kcompat.h @@ -6,15 +6,9 @@ #ifndef _KCOMPAT_H_ #define _KCOMPAT_H_ -#include "config.h" - -#if defined(HAVE_CREATE_AH_NO_UDATA) || !defined(HAVE_IB_QUERY_DEVICE_UDATA) -#define HAVE_CUSTOM_COMMANDS -#endif +#include -#ifndef ALIGN_DOWN -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#endif +#include "config.h" #ifndef HAVE_IB_IS_UDATA_CLEARED #include @@ -186,4 +180,10 @@ static 
inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, __rdma_block_iter_next(biter);) #endif +#ifdef HAVE_U32_PORT +typedef u32 port_t; +#else +typedef u8 port_t; +#endif + #endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/efa/nv-p2p.h b/drivers/amazon/net/efa/nv-p2p.h new file mode 100644 index 0000000000000..d74e024963d5a --- /dev/null +++ b/drivers/amazon/net/efa/nv-p2p.h @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _NV_P2P_H_ +#define _NV_P2P_H_ + +/* + * NVIDIA P2P Structure Versioning + * + * For the nvidia_p2p_*_t structures allocated by the NVIDIA driver, it will + * set the version field of the structure according to the definition used by + * the NVIDIA driver. The "major" field of the version is defined as the upper + * 16 bits, and the "minor" field of the version is defined as the lower 16 + * bits. The version field will always be the first 4 bytes of the structure, + * and third-party drivers should check the value of this field in structures + * allocated by the NVIDIA driver to ensure runtime compatibility. + * + * In general, version numbers will be incremented as follows: + * - When a backwards-compatible change is made to the structure layout, the + * minor version for that structure will be incremented. Third-party drivers + * built against an older minor version will continue to work with the newer + * minor version used by the NVIDIA driver, without recompilation. + * - When a breaking change is made to the structure layout, the major version + * will be incremented. Third-party drivers built against an older major + * version require at least recompilation and potentially additional updates + * to use the new API. 
+ */ +#define NVIDIA_P2P_MAJOR_VERSION_MASK 0xffff0000 +#define NVIDIA_P2P_MINOR_VERSION_MASK 0x0000ffff + +#define NVIDIA_P2P_MAJOR_VERSION(v) \ + (((v) & NVIDIA_P2P_MAJOR_VERSION_MASK) >> 16) + +#define NVIDIA_P2P_MINOR_VERSION(v) \ + (((v) & NVIDIA_P2P_MINOR_VERSION_MASK)) + +#define NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) \ + (NVIDIA_P2P_MAJOR_VERSION((p)->version) == NVIDIA_P2P_MAJOR_VERSION(v)) + +#define NVIDIA_P2P_VERSION_COMPATIBLE(p, v) \ + (NVIDIA_P2P_MAJOR_VERSION_MATCHES(p, v) && \ + (NVIDIA_P2P_MINOR_VERSION((p)->version) >= (NVIDIA_P2P_MINOR_VERSION(v)))) + +enum { + NVIDIA_P2P_ARCHITECTURE_TESLA = 0, + NVIDIA_P2P_ARCHITECTURE_FERMI, + NVIDIA_P2P_ARCHITECTURE_CURRENT = NVIDIA_P2P_ARCHITECTURE_FERMI +}; + +#define NVIDIA_P2P_PARAMS_VERSION 0x00010001 + +enum { + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_GPU = 0, + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE, + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX = \ + NVIDIA_P2P_PARAMS_ADDRESS_INDEX_THIRD_PARTY_DEVICE +}; + +typedef +struct nvidia_p2p_params { + u32 version; + u32 architecture; + union nvidia_p2p_mailbox_addresses { + struct { + u64 wmb_addr; + u64 wmb_data; + u64 rreq_addr; + u64 rcomp_addr; + u64 reserved[2]; + } fermi; + } addresses[NVIDIA_P2P_PARAMS_ADDRESS_INDEX_MAX+1]; +} nvidia_p2p_params_t; + +/* + * @brief + * Initializes a third-party P2P mapping between an NVIDIA + * GPU and a third-party device. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in,out] params + * A pointer to a structure with P2P mapping parameters. + * @param[in] destroy_callback + * A pointer to the function to be invoked when the P2P mapping + * is destroyed implictly. + * @param[in] data + * An opaque pointer to private data to be passed to the + * callback function. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested configuration is not supported. + * -ENOMEM if the driver failed to allocate memory. + * -EBUSY if the mapping has already been initialized. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_init_mapping(u64 p2p_token, + struct nvidia_p2p_params *params, + void (*destroy_callback)(void *data), + void *data); + +/* + * @brief + * Tear down a previously initialized third-party P2P mapping. + * + * @param[in] p2p_token + * A token that uniquely identifies the mapping. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested configuration is not supported. + * -ENOMEM if the driver failed to allocate memory. + */ +int nvidia_p2p_destroy_mapping(u64 p2p_token); + +enum nvidia_p2p_page_size_type { + NVIDIA_P2P_PAGE_SIZE_4KB = 0, + NVIDIA_P2P_PAGE_SIZE_64KB, + NVIDIA_P2P_PAGE_SIZE_128KB, + NVIDIA_P2P_PAGE_SIZE_COUNT +}; + +typedef +struct nvidia_p2p_page { + u64 physical_address; + union nvidia_p2p_request_registers { + struct { + u32 wreqmb_h; + u32 rreqmb_h; + u32 rreqmb_0; + u32 reserved[3]; + } fermi; + } registers; +} nvidia_p2p_page_t; + +#define NVIDIA_P2P_PAGE_TABLE_VERSION 0x00010002 + +#define NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_PAGE_TABLE_VERSION) + +typedef +struct nvidia_p2p_page_table { + u32 version; + u32 page_size; /* enum nvidia_p2p_page_size_type */ + struct nvidia_p2p_page **pages; + u32 entries; + u8 *gpu_uuid; +} nvidia_p2p_page_table_t; + +/* + * @brief + * Make the pages underlying a range of GPU virtual memory + * accessible to a third-party device. 
+ * + * This API only supports pinned, GPU-resident memory, such as that provided + * by cudaMalloc(). + * + * This API may sleep. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * Address must be aligned to the 64KB boundary. + * @param[in] length + * The length of the requested P2P mapping. + * Length must be a multiple of 64KB. + * @param[out] page_table + * A pointer to an array of structures with P2P PTEs. + * @param[in] free_callback + * A non-NULL pointer to the function to be invoked when the pages + * underlying the virtual address range are freed + * implicitly. Must be non NULL. + * @param[in] data + * A non-NULL opaque pointer to private data to be passed to the + * callback function. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -ENOMEM if the driver failed to allocate memory or if + * insufficient resources were available to complete the operation. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_get_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, + u64 length, + struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), + void *data); + +#define NVIDIA_P2P_DMA_MAPPING_VERSION 0x00020003 + +#define NVIDIA_P2P_DMA_MAPPING_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_DMA_MAPPING_VERSION) + +struct pci_dev; + +typedef +struct nvidia_p2p_dma_mapping { + u32 version; + enum nvidia_p2p_page_size_type page_size_type; + u32 entries; + u64 *dma_addresses; + void *private; + struct pci_dev *pci_dev; +} nvidia_p2p_dma_mapping_t; + +/* + * @brief + * Make the physical pages retrieved using nvidia_p2p_get_pages accessible to + * a third-party device. + * + * @param[in] peer + * The struct pci_dev * of the peer device that needs to DMA to/from the + * mapping. + * @param[in] page_table + * The page table outlining the physical pages underlying the mapping, as + * retrieved with nvidia_p2p_get_pages(). + * @param[out] dma_mapping + * The DMA mapping containing the DMA addresses to use on the third-party + * device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -ENOTSUPP if the requested operation is not supported. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_dma_map_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + +/* + * @brief + * Unmap the physical pages previously mapped to the third-party device by + * nvidia_p2p_dma_map_pages(). + * + * @param[in] peer + * The struct pci_dev * of the peer device that the DMA mapping belongs to. + * @param[in] page_table + * The page table backing the DMA mapping to be unmapped. + * @param[in] dma_mapping + * The DMA mapping containing the DMA addresses used by the third-party + * device, as retrieved with nvidia_p2p_dma_map_pages(). After this call + * returns, neither this struct nor the addresses contained within will be + * valid for use by the third-party device. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. 
+ */ +int nvidia_p2p_dma_unmap_pages(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); + +/* + * @brief + * Release a set of pages previously made accessible to + * a third-party device. + * + * @param[in] p2p_token + * A token that uniquely identifies the P2P mapping. + * @param[in] va_space + * A GPU virtual address space qualifier. + * @param[in] virtual_address + * The start address in the specified virtual address space. + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + * -EIO if an unknown error occurred. + */ +int nvidia_p2p_put_pages(u64 p2p_token, u32 va_space, + u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P page table. (This function is a no-op.) + * + * @param[in] page_table + * A pointer to the array of structures with P2P PTEs. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table); + +/* + * @brief + * Free a third-party P2P DMA mapping. (This function is a no-op.) + * + * @param[in] dma_mapping + * A pointer to the DMA mapping structure. + * + * @return + * 0 upon successful completion. + * -EINVAL if an invalid argument was supplied. + */ +int nvidia_p2p_free_dma_mapping(struct nvidia_p2p_dma_mapping *dma_mapping); + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION 0x00010001 + +#define NVIDIA_P2P_RSYNC_DRIVER_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_DRIVER_VERSION) + +typedef +struct nvidia_p2p_rsync_driver { + u32 version; + int (*get_relaxed_ordering_mode)(int *mode, void *data); + void (*put_relaxed_ordering_mode)(int mode, void *data); + void (*wait_for_rsync)(struct pci_dev *gpu, void *data); +} nvidia_p2p_rsync_driver_t; + +/* + * @brief + * Registers the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. The NVIDIA driver would use, + * + * get_relaxed_ordering_mode to obtain a reference to the current relaxed + * ordering mode (treated as a boolean) from the rsync driver. + * + * put_relaxed_ordering_mode to release a reference to the current relaxed + * ordering mode back to the rsync driver. The NVIDIA driver will call this + * function once for each successful call to get_relaxed_ordering_mode, and + * the relaxed ordering mode must not change until the last reference is + * released. + * + * wait_for_rsync to call into the rsync module to issue RSYNC. This callback + * can't sleep or re-schedule as it may arrive under spinlocks. + * @param[in] data + * A pointer to the rsync driver's private data. + * + * @Returns + * 0 upon successful completion. + * -EINVAL parameters are incorrect. + * -EBUSY if a module is already registered or GPU devices are in use. + */ +int nvidia_p2p_register_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +/* + * @brief + * Unregisters the rsync driver. + * + * @param[in] driver + * A pointer to the rsync driver structure. + * @param[in] data + * A pointer to the rsync driver's private data. 
+ */ +void nvidia_p2p_unregister_rsync_driver(nvidia_p2p_rsync_driver_t *driver, + void *data); + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION 0x00020001 + +#define NVIDIA_P2P_RSYNC_REG_INFO_VERSION_COMPATIBLE(p) \ + NVIDIA_P2P_VERSION_COMPATIBLE(p, NVIDIA_P2P_RSYNC_REG_INFO_VERSION) + +typedef struct nvidia_p2p_rsync_reg { + void *ptr; + size_t size; + struct pci_dev *ibmnpu; + struct pci_dev *gpu; + u32 cluster_id; + u32 socket_id; +} nvidia_p2p_rsync_reg_t; + +typedef struct nvidia_p2p_rsync_reg_info { + u32 version; + nvidia_p2p_rsync_reg_t *regs; + size_t entries; +} nvidia_p2p_rsync_reg_info_t; + +/* + * @brief + * Gets rsync (GEN-ID) register information associated with the supported + * NPUs. + * + * The caller would use the returned information {GPU device, NPU device, + * socket-id, cluster-id} to pick the optimal generation registers to issue + * RSYNC (NVLink HW flush). + * + * The interface allocates structures to return the information, hence + * nvidia_p2p_put_rsync_registers() must be called to free the structures. + * + * Note, cluster-id is hardcoded to zero as early system configurations would + * only support cluster mode i.e. all devices would share the same cluster-id + * (0). In the future, appropriate kernel support would be needed to query + * cluster-ids. + * + * @param[out] reg_info + * A pointer to the rsync reg info structure. + * + * @Returns + * 0 Upon successful completion. Otherwise, returns negative value. + */ +int nvidia_p2p_get_rsync_registers(nvidia_p2p_rsync_reg_info_t **reg_info); + +/* + * @brief + * Frees the structures allocated by nvidia_p2p_get_rsync_registers(). + * + * @param[in] reg_info + * A pointer to the rsync reg info structure. + */ +void nvidia_p2p_put_rsync_registers(nvidia_p2p_rsync_reg_info_t *reg_info); + +#endif /* _NV_P2P_H_ */ From d79a9a608b91b56354d7b54a0f2112d8877e4e76 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Mon, 1 Nov 2021 16:49:06 -0700 Subject: [PATCH 271/737] linux/kvm.h: Fix KVM_CAP_PTP_KVM numbering to match upstream When this patch went upstream the KVM_CAP_PTP_KVM number was 195, it was backported as 191, and it finally ended up being 198 upstream. This will cause issues if a guest uses a different number to the host as incorrect functionality will result. Update it to be 198 to match upstream to avoid this issue. 
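To make the failure mode concrete, here is a minimal user-space sketch (editor's illustration, not part of this patch) of how the capability number is consumed. KVM_CHECK_EXTENSION takes the raw numeric capability value, so a binary built against headers defining KVM_CAP_PTP_KVM as 191 would probe a different capability than a kernel that defines it as 198; the ioctl and constant names below are standard uapi, everything else is illustrative.

    /*
     * Illustrative only: probing KVM_CAP_PTP_KVM from user space.
     * The ioctl takes the raw capability number, so the kernel and
     * the headers the caller was built against must agree on it.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);

            if (kvm < 0)
                    return 1;
            /* Returns > 0 only if the running kernel uses the same number. */
            if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PTP_KVM) > 0)
                    printf("KVM_CAP_PTP_KVM supported\n");
            return 0;
    }
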
Signed-off-by: Suraj Jitindar Singh --- include/uapi/linux/kvm.h | 2 +- tools/include/uapi/linux/kvm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 797c40bbc31fa..0d7350d1795bb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1053,7 +1053,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 -#define KVM_CAP_PTP_KVM 191 +#define KVM_CAP_PTP_KVM 198 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 797c40bbc31fa..0d7350d1795bb 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1053,7 +1053,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 -#define KVM_CAP_PTP_KVM 191 +#define KVM_CAP_PTP_KVM 198 #ifdef KVM_CAP_IRQ_ROUTING From 334a24961f5fe8af9456f748b14249a53a8212ba Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Nov 2021 18:23:07 -0700 Subject: [PATCH 272/737] arm64: module: Use aarch64_insn_write when updating relocations later on apply_relocate_add() is called in module init to apply the relocations which must be computed at module load time. This is normally called from apply_relocations() before the module text is mapped read-only in complete_formation(). However for live patching modules it is also called after the module text has been marked read-only causing it to fault. Avoid this fault by calling aarch64_insn_write() to update the instruction if the module text has already been marked read-only. Preserve the current behaviour if called before this has been done. Signed-off-by: Suraj Jitindar Singh --- arch/arm64/kernel/module.c | 81 ++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 2a1ad95d9b2cc..23b9914ef5de5 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include void *module_alloc(unsigned long size) { @@ -151,7 +152,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + bool early) { u64 imm; s64 sval; @@ -183,7 +185,10 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); if (imm > U16_MAX) return -ERANGE; @@ -192,7 +197,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + bool early) { u64 imm, imm_mask; s64 sval; @@ -208,7 +214,10 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. 
*/ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); /* * Extract the upper value bits (including the sign bit) and @@ -227,17 +236,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, bool early) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, early)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -249,7 +258,10 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + if (early) + *place = cpu_to_le32(insn); + else + aarch64_insn_write(place, insn); return 0; } @@ -266,6 +278,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + bool early = me->state == MODULE_STATE_UNFORMED; for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* loc corresponds to P in the AArch64 ELF document. */ @@ -318,88 +331,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, early); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, early); break; /* Immediate instruction relocations. 
*/ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, early); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, early); if (ovf && ovf != -ERANGE) return ovf; break; @@ -407,40 +420,40 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, early); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, early); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, early); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); + AARCH64_INSN_IMM_26, early); if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && ovf == -ERANGE) { @@ -448,7 +461,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, early); } break; From b0f7c76a10dff0b86fc423cf306bf52b4d1a0d13 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 7 Sep 2021 20:00:53 -0700 Subject: [PATCH 273/737] ipc: replace costly bailout check in sysvipc_find_ipc() sysvipc_find_ipc() was left with a costly way to check if the offset position fed to it is bigger than the total number of IPC IDs in use. So much so that the time it takes to iterate over /proc/sysvipc/* files grows exponentially for a custom benchmark that creates "N" SYSV shm segments and then times the read of /proc/sysvipc/shm (milliseconds): 12 msecs to read 1024 segs from /proc/sysvipc/shm 18 msecs to read 2048 segs from /proc/sysvipc/shm 65 msecs to read 4096 segs from /proc/sysvipc/shm 325 msecs to read 8192 segs from /proc/sysvipc/shm 1303 msecs to read 16384 segs from /proc/sysvipc/shm 5182 msecs to read 32768 segs from /proc/sysvipc/shm The root problem lies with the loop that computes the total amount of ids in use to check if the "pos" feeded to sysvipc_find_ipc() grew bigger than "ids->in_use". That is a quite inneficient way to get to the maximum index in the id lookup table, specially when that value is already provided by struct ipc_ids.max_idx. 
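For reference, a benchmark equivalent to the one described above can be sketched as follows (editor's illustration, not the author's actual test program; error handling is minimal and the created segments are left behind, so clean up afterwards, e.g. with ipcrm, and note that large N may require raising kernel.shmmni):

    /*
     * Illustrative benchmark: create N SysV shm segments, then time
     * one full read of /proc/sysvipc/shm and report milliseconds.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>

    int main(int argc, char **argv)
    {
            int n = argc > 1 ? atoi(argv[1]) : 1024;
            char buf[4096];
            struct timespec t0, t1;
            FILE *f;
            long ms;
            int i;

            for (i = 0; i < n; i++)
                    shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

            clock_gettime(CLOCK_MONOTONIC, &t0);
            f = fopen("/proc/sysvipc/shm", "r");
            if (!f)
                    return 1;
            while (fread(buf, 1, sizeof(buf), f) > 0)
                    ;       /* consume the whole file */
            clock_gettime(CLOCK_MONOTONIC, &t1);
            fclose(f);

            ms = (t1.tv_sec - t0.tv_sec) * 1000L +
                 (t1.tv_nsec - t0.tv_nsec) / 1000000L;
            printf("%ld msecs to read %d segs from /proc/sysvipc/shm\n",
                   ms, n);
            return 0;
    }
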
This patch follows up on the optimization introduced via commit 15df03c879836 ("sysvipc: make get_maxid O(1) again") and gets rid of the aforementioned costly loop replacing it by a simpler checkpoint based on ipc_get_maxidx() returned value, which allows for a smooth linear increase in time complexity for the same custom benchmark: 2 msecs to read 1024 segs from /proc/sysvipc/shm 2 msecs to read 2048 segs from /proc/sysvipc/shm 4 msecs to read 4096 segs from /proc/sysvipc/shm 9 msecs to read 8192 segs from /proc/sysvipc/shm 19 msecs to read 16384 segs from /proc/sysvipc/shm 39 msecs to read 32768 segs from /proc/sysvipc/shm Link: https://lkml.kernel.org/r/20210809203554.1562989-1-aquini@redhat.com Signed-off-by: Rafael Aquini Acked-by: Davidlohr Bueso Acked-by: Manfred Spraul Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index bbb5190af6d9f..7c3601dad9bd5 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -754,21 +754,13 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, loff_t *new_pos) { - struct kern_ipc_perm *ipc; - int total, id; - - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } + struct kern_ipc_perm *ipc = NULL; + int max_idx = ipc_get_maxidx(ids); - ipc = NULL; - if (total >= ids->in_use) + if (max_idx == -1 || pos > max_idx) goto out; - for (; pos < ipc_mni; pos++) { + for (; pos <= max_idx; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { rcu_read_lock(); From def3e52f82848cce4cbbdfb7805a56d13b07b55c Mon Sep 17 00:00:00 2001 From: Filippo Sironi Date: Wed, 10 Feb 2021 01:39:42 +0100 Subject: [PATCH 274/737] nvme: add 48-bit DMA address quirk for Amazon NVMe controllers Some Amazon NVMe controllers do not follow the NVMe specification and are limited to 48-bit DMA addresses. Add a quirk to force bounce buffering if needed and limit the IOVA allocation for these devices. This affects all current Amazon NVMe controllers that expose EBS volumes (0x0061, 0x0065, 0x8061) and local instance storage (0xcd00, 0xcd01, 0xcd02). Signed-off-by: Filippo Sironi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 6 ++++++ drivers/nvme/host/pci.c | 19 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c3e4d9b6f9c0d..54f97335dd416 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -145,6 +145,12 @@ enum nvme_quirks { */ NVME_QUIRK_NO_NS_DESC_LIST = (1 << 15), + /* + * The controller does not properly handle DMA addresses over + * 48 bits. + */ + NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16), + /* * The controller requires the command_id value be be limited, so skip * encoding the generation sequence number. 
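Before the pci.c hunk below, a quick illustration (editor's sketch in user-space C, not part of the patch) of what the 48-bit limit amounts to: DMA_BIT_MASK(48) caps bus addresses at 2^48 - 1, a 256 TiB window, which is why buffers mapped above that range need bounce buffering on the affected controllers.

    /*
     * Editor's illustration only: the value behind DMA_BIT_MASK(48).
     * Bus addresses above this mask cannot be handled by the affected
     * controllers, hence the quirk and the bounce-buffering fallback.
     */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t mask48 = (1ULL << 48) - 1;   /* == DMA_BIT_MASK(48) */

            printf("48-bit DMA mask: %#llx\n", (unsigned long long)mask48);
            printf("addressable range: %llu TiB\n",
                   (unsigned long long)((mask48 + 1) >> 40));
            return 0;
    }
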
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3aaead9b3a570..6ce3513eadb84 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2371,13 +2371,16 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); + int dma_address_bits = 64; if (pci_enable_device_mem(pdev)) return result; pci_set_master(pdev); - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) + if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) + dma_address_bits = 48; + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits))) goto disable; if (readl(dev->bar + NVME_REG_CSTS) == -1) { @@ -3268,6 +3271,20 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x1d97, 0x2263), /* SPCC */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), .driver_data = NVME_QUIRK_SINGLE_VECTOR }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, From b5ac102690537843af177de786ecccd891648b6b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 23 Dec 2021 19:28:09 +0000 Subject: [PATCH 275/737] Revert "PCI/MSI: Enforce that MSI-X table entry is masked for update" This reverts commit 0b2509d7a90c11666280acf9ada4add767c2b819. This commit, while looking perfectly correct, causes problems with interrupts on ec2 nvme devices. So, revert it for now. Signed-off-by: Frank van der Linden --- drivers/pci/msi.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 3da69b26e6743..cc0683b9312fd 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -318,29 +318,14 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) /* Don't touch the hardware now */ } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT); if (!base) goto skip; - /* - * The specification mandates that the entry is masked - * when the message is modified: - * - * "If software changes the Address or Data value of an - * entry while the entry is unmasked, the result is - * undefined." 
- */ - if (unmasked) - __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT); - writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); writel(msg->data, base + PCI_MSIX_ENTRY_DATA); - if (unmasked) - __pci_msix_desc_mask_irq(entry, 0); - /* Ensure that the writes are visible in the device */ readl(base + PCI_MSIX_ENTRY_DATA); } else { From b9c38421c6a2941ab3980d700beb16e54ab9cbc3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:57:05 -0700 Subject: [PATCH 276/737] Documentation: add documents for DAMON This commit adds documents for DAMON under `Documentation/admin-guide/mm/damon/` and `Documentation/vm/damon/`. Link: https://lkml.kernel.org/r/20210716081449.22187-11-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Fernand Sieber Reviewed-by: Markus Boehme Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/index.rst | 15 ++ Documentation/admin-guide/mm/damon/start.rst | 114 +++++++++++++ Documentation/admin-guide/mm/damon/usage.rst | 112 +++++++++++++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/vm/damon/api.rst | 20 +++ Documentation/vm/damon/design.rst | 166 +++++++++++++++++++ Documentation/vm/damon/faq.rst | 51 ++++++ Documentation/vm/damon/index.rst | 30 ++++ Documentation/vm/index.rst | 1 + 9 files changed, 510 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/index.rst create mode 100644 Documentation/admin-guide/mm/damon/start.rst create mode 100644 Documentation/admin-guide/mm/damon/usage.rst create mode 100644 Documentation/vm/damon/api.rst create mode 100644 Documentation/vm/damon/design.rst create mode 100644 Documentation/vm/damon/faq.rst create mode 100644 Documentation/vm/damon/index.rst diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst new file mode 100644 index 0000000000000..8c5dde3a57544 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -0,0 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +Monitoring Data Accesses +======================== + +:doc:`DAMON ` allows light-weight data access monitoring. +Using DAMON, users can analyze the memory access patterns of their systems and +optimize those. + +.. toctree:: + :maxdepth: 2 + + start + usage diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst new file mode 100644 index 0000000000000..d5eb89a8fc386 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Getting Started +=============== + +This document briefly describes how you can use DAMON by demonstrating its +default user space tool. Please note that this document describes only a part +of its features for brevity. Please refer to :doc:`usage` for more details. 
+ + +TL; DR +====== + +Follow the commands below to monitor and visualize the memory access pattern of +your workload. :: + + # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot + # mount -t debugfs none /sys/kernel/debug/ + # git clone https://github.com/awslabs/damo + # ./damo/damo record $(pidof ) + # ./damo/damo report heat --plot_ascii + +The final command draws the access heatmap of ````. The heatmap +shows which memory region (x-axis) is accessed when (y-axis) and how frequently +(number; the higher the more accesses have been observed). :: + + 111111111111111111111111111111111111111111111111111111110000 + 111121111111111111111111111111211111111111111111111111110000 + 000000000000000000000000000000000000000000000000001555552000 + 000000000000000000000000000000000000000000000222223555552000 + 000000000000000000000000000000000000000011111677775000000000 + 000000000000000000000000000000000000000488888000000000000000 + 000000000000000000000000000000000177888400000000000000000000 + 000000000000000000000000000046666522222100000000000000000000 + 000000000000000000000014444344444300000000000000000000000000 + 000000000000000002222245555510000000000000000000000000000000 + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (140286319947776-140286426374096: 101.496 MiB) + # y-axis: time (605442256436361-605479951866441: 37.695430s) + # resolution: 60x10 (1.692 MiB and 3.770s for each character) + + +Prerequisites +============= + +Kernel +------ + +You should first ensure your system is running on a kernel built with +``CONFIG_DAMON_*=y``. + + +User Space Tool +--------------- + +For the demonstration, we will use the default user space tool for DAMON, +called DAMON Operator (DAMO). It is available at +https://github.com/awslabs/damo. The examples below assume that ``damo`` is on +your ``$PATH``. It's not mandatory, though. + +Because DAMO is using the debugfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as +below:: + + # mount -t debugfs none /sys/kernel/debug/ + +or append the following line to your ``/etc/fstab`` file so that your system +can automatically mount debugfs upon booting:: + + debugfs /sys/kernel/debug debugfs defaults 0 0 + + +Recording Data Access Patterns +============================== + +The commands below record the memory access patterns of a program and save the +monitoring results to a file. :: + + $ git clone https://github.com/sjp38/masim + $ cd masim; make; ./masim ./configs/zigzag.cfg & + $ sudo damo record -o damon.data $(pidof masim) + +The first two lines of the commands download an artificial memory access +generator program and run it in the background. The generator will repeatedly +access two 100 MiB sized memory regions one by one. You can substitute this +with your real workload. The last line asks ``damo`` to record the access +pattern in the ``damon.data`` file. + + +Visualizing Recorded Patterns +============================= + +The following three commands visualize the recorded access patterns and save +the results as separate image files. :: + + $ damo report heats --heatmap access_pattern_heatmap.png + $ damo report wss --range 0 101 1 --plot wss_dist.png + $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + +- ``access_pattern_heatmap.png`` will visualize the data access pattern in a + heatmap, showing which memory region (y-axis) got accessed when (x-axis) + and how frequently (color). 
+- ``wss_dist.png`` will show the distribution of the working set size. +- ``wss_chron_change.png`` will show how the working set size has + chronologically changed. + +You can view the visualizations of this example workload at [1]_. +Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. + +.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns +.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html +.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html +.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst new file mode 100644 index 0000000000000..a72cda374abac --- /dev/null +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -0,0 +1,112 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Detailed Usages +=============== + +DAMON provides below three interfaces for different users. + +- *DAMON user space tool.* + This is for privileged people such as system administrators who want a + just-working human-friendly interface. Using this, users can use the DAMON’s + major features in a human-friendly way. It may not be highly tuned for + special cases, though. It supports only virtual address spaces monitoring. +- *debugfs interface.* + This is for privileged user space programmers who want more optimized use of + DAMON. Using this, users can use DAMON’s major features by reading + from and writing to special debugfs files. Therefore, you can write and use + your personalized DAMON debugfs wrapper programs that reads/writes the + debugfs files instead of you. The DAMON user space tool is also a reference + implementation of such programs. It supports only virtual address spaces + monitoring. +- *Kernel Space Programming Interface.* + This is for kernel space programmers. Using this, users can utilize every + feature of DAMON most flexibly and efficiently by writing kernel space + DAMON application programs for you. You can even extend DAMON for various + address spaces. + +Nevertheless, you could write your own user space tool using the debugfs +interface. A reference implementation is available at +https://github.com/awslabs/damo. If you are a kernel programmer, you could +refer to :doc:`/vm/damon/api` for the kernel space programming interface. For +the reason, this document describes only the debugfs interface + +debugfs Interface +================= + +DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under +its debugfs directory, ``/damon/``. + + +Attributes +---------- + +Users can get and set the ``sampling interval``, ``aggregation interval``, +``regions update interval``, and min/max number of monitoring target regions by +reading from and writing to the ``attrs`` file. To know about the monitoring +attributes in detail, please refer to the :doc:`/vm/damon/design`. For +example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and +1000, and then check it again:: + + # cd /damon + # echo 5000 100000 1000000 10 1000 > attrs + # cat attrs + 5000 100000 1000000 10 1000 + + +Target IDs +---------- + +Some types of address spaces supports multiple monitoring target. For example, +the virtual memory address spaces monitoring can have multiple processes as the +monitoring targets. 
Users can set the targets by writing relevant id values of +the targets to, and get the ids of the current targets by reading from the +``target_ids`` file. In case of the virtual address spaces monitoring, the +values should be pids of the monitoring target processes. For example, below +commands set processes having pids 42 and 4242 as the monitoring targets and +check it again:: + + # cd /damon + # echo 42 4242 > target_ids + # cat target_ids + 42 4242 + +Note that setting the target ids doesn't start the monitoring. + + +Turning On/Off +-------------- + +Setting the files as described above doesn't incur effect unless you explicitly +start the monitoring. You can start, stop, and check the current status of the +monitoring by writing to and reading from the ``monitor_on`` file. Writing +``on`` to the file starts the monitoring of the targets with the attributes. +Writing ``off`` to the file stops those. DAMON also stops if every target +process is terminated. Below example commands turn on, off, and check the +status of DAMON:: + + # cd /damon + # echo on > monitor_on + # echo off > monitor_on + # cat monitor_on + off + +Please note that you cannot write to the above-mentioned debugfs files while +the monitoring is turned on. If you write to the files while DAMON is running, +an error code such as ``-EBUSY`` will be returned. + + +Tracepoint for Monitoring Results +================================= + +DAMON provides the monitoring results via a tracepoint, +``damon:damon_aggregated``. While the monitoring is turned on, you could +record the tracepoint events and show results using tracepoint supporting tools +like ``perf``. For example:: + + # echo on > monitor_on + # perf record -e damon:damon_aggregated & + # sleep 5 + # kill 9 $(pidof perf) + # echo off > monitor_on + # perf script diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index cd727cfc1b040..32c27fbf1913c 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -27,6 +27,7 @@ the Linux memory management. concepts cma_debugfs + damon/index hugetlbpage idle_page_tracking ksm diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst new file mode 100644 index 0000000000000..08f34df45523a --- /dev/null +++ b/Documentation/vm/damon/api.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +API Reference +============= + +Kernel space programs can use every feature of DAMON using below APIs. All you +need to do is including ``damon.h``, which is located in ``include/linux/`` of +the source tree. + +Structures +========== + +.. kernel-doc:: include/linux/damon.h + + +Functions +========= + +.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst new file mode 100644 index 0000000000000..b05159c295f4d --- /dev/null +++ b/Documentation/vm/damon/design.rst @@ -0,0 +1,166 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +Design +====== + +Configurable Layers +=================== + +DAMON provides data access monitoring functionality while making the accuracy +and the overhead controllable. The fundamental access monitorings require +primitives that dependent on and optimized for the target address space. On +the other hand, the accuracy and overhead tradeoff mechanism, which is the core +of DAMON, is in the pure logic space. 
DAMON separates the two parts in +different layers and defines its interface to allow various low level +primitives implementations configurable with the core logic. + +Due to this separated design and the configurable interface, users can extend +DAMON for any address space by configuring the core logics with appropriate low +level primitive implementations. If appropriate one is not provided, users can +implement the primitives on their own. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +primitives, those will be easily configurable. + + +Reference Implementations of Address Space Specific Primitives +============================================================== + +The low level primitives for the fundamental access monitoring are defined in +two parts: + +1. Identification of the monitoring target address range for the address space. +2. Access check of specific address range in the target space. + +DAMON currently provides the implementation of the primitives for only the +virtual address spaces. Below two subsections describe how it works. + + +VMA-based Target Address Range Construction +------------------------------------------- + +Only small parts in the super-huge virtual address space of the processes are +mapped to the physical memory and accessed. Thus, tracking the unmapped +address regions is just wasteful. However, because DAMON can deal with some +level of noise using the adaptive regions adjustment mechanism, tracking every +mapping is not strictly required but could even incur a high overhead in some +cases. That said, too huge unmapped areas inside the monitoring target should +be removed to not take the time for the adaptive mechanism. + +For the reason, this implementation converts the complex mappings to three +distinct regions that cover every mapped area of the address space. The two +gaps between the three regions are the two biggest unmapped areas in the given +address space. The two biggest unmapped areas would be the gap between the +heap and the uppermost mmap()-ed region, and the gap between the lowermost +mmap()-ed region and the stack in most of the cases. Because these gaps are +exceptionally huge in usual address spaces, excluding these will be sufficient +to make a reasonable trade-off. Below shows this in detail:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +PTE Accessed-bit Based Access Check +----------------------------------- + +The implementation for the virtual address space uses PTE Accessed-bit for +basic access checks. It finds the relevant PTE Accessed bit from the address +by walking the page table for the target task of the address. In this way, the +implementation finds and clears the bit for next sampling target address and +checks whether the bit set again after one sampling period. This could disturb +other kernel subsystems using the Accessed bits, namely Idle page tracking and +the reclaim logic. To avoid such disturbances, DAMON makes it mutually +exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page +flags to solve the conflict with the reclaim logic, as Idle page tracking does. 
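In pseudo-C, the check described above reduces to a clear-then-recheck cycle per sampled address. The sketch below is an editor's illustration, not DAMON's source; mkold_pte(), young_pte() and sleep_sampling_interval() are hypothetical stand-ins for the page-table walk that clears and later re-reads the PTE Accessed bit, and for the sampling delay.

    /*
     * Editor's sketch of the Accessed-bit sampling described above.
     * The three helpers are hypothetical placeholders, not real APIs.
     */
    static bool sketch_check_access(struct mm_struct *mm, unsigned long addr)
    {
            mkold_pte(mm, addr);            /* clear the Accessed bit now      */
            sleep_sampling_interval();      /* wait one ``sampling interval``  */
            return young_pte(mm, addr);     /* set again => address accessed   */
    }
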
+ + +Address Space Independent Core Mechanisms +========================================= + +Below four sections describe each of the DAMON core mechanisms and the five +monitoring attributes, ``sampling interval``, ``aggregation interval``, +``regions update interval``, ``minimum number of regions``, and ``maximum +number of regions``. + + +Access Frequency Monitoring +--------------------------- + +The output of DAMON says what pages are how frequently accessed for a given +duration. The resolution of the access frequency is controlled by setting +``sampling interval`` and ``aggregation interval``. In detail, DAMON checks +access to each page per ``sampling interval`` and aggregates the results. In +other words, counts the number of the accesses to each page. After each +``aggregation interval`` passes, DAMON calls callback functions that previously +registered by users so that users can read the aggregated results and then +clears the results. This can be described in below simple pseudo-code:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +The monitoring overhead of this mechanism will arbitrarily increase as the +size of the target workload grows. + + +Region Based Sampling +--------------------- + +To avoid the unbounded increase of the overhead, DAMON groups adjacent pages +that assumed to have the same access frequencies into a region. As long as the +assumption (pages in a region have the same access frequencies) is kept, only +one page in the region is required to be checked. Thus, for each ``sampling +interval``, DAMON randomly picks one page in each region, waits for one +``sampling interval``, checks whether the page is accessed meanwhile, and +increases the access frequency of the region if so. Therefore, the monitoring +overhead is controllable by setting the number of regions. DAMON allows users +to set the minimum and the maximum number of regions for the trade-off. + +This scheme, however, cannot preserve the quality of the output if the +assumption is not guaranteed. + + +Adaptive Regions Adjustment +--------------------------- + +Even somehow the initial monitoring target regions are well constructed to +fulfill the assumption (pages in same region have similar access frequencies), +the data access pattern can be dynamically changed. This will result in low +monitoring quality. To keep the assumption as much as possible, DAMON +adaptively merges and splits each region based on their access frequency. + +For each ``aggregation interval``, it compares the access frequencies of +adjacent regions and merges those if the frequency difference is small. Then, +after it reports and clears the aggregated access frequency of each region, it +splits each region into two or three regions if the total number of regions +will not exceed the user-specified maximum number of regions after the split. + +In this way, DAMON provides its best-effort quality and minimal overhead while +keeping the bounds users set for their trade-off. + + +Dynamic Target Space Updates Handling +------------------------------------- + +The monitoring target address range could dynamically changed. For example, +virtual memory could be dynamically mapped and unmapped. Physical memory could +be hot-plugged. 
+ +As the changes could be quite frequent in some cases, DAMON checks the dynamic +memory mapping changes and applies it to the abstracted target area only for +each of a user-specified time interval (``regions update interval``). diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst new file mode 100644 index 0000000000000..cb3d8b585a8b3 --- /dev/null +++ b/Documentation/vm/damon/faq.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Frequently Asked Questions +========================== + +Why a new subsystem, instead of extending perf or other user space tools? +========================================================================= + +First, because it needs to be lightweight as much as possible so that it can be +used online, any unnecessary overhead such as kernel - user space context +switching cost should be avoided. Second, DAMON aims to be used by other +programs including the kernel. Therefore, having a dependency on specific +tools like perf is not desirable. These are the two biggest reasons why DAMON +is implemented in the kernel space. + + +Can 'idle pages tracking' or 'perf mem' substitute DAMON? +========================================================= + +Idle page tracking is a low level primitive for access check of the physical +address space. 'perf mem' is similar, though it can use sampling to minimize +the overhead. On the other hand, DAMON is a higher-level framework for the +monitoring of various address spaces. It is focused on memory management +optimization and provides sophisticated accuracy/overhead handling mechanisms. +Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of +DAMON's output, but cannot substitute DAMON. + + +Does DAMON support virtual memory only? +======================================= + +No. The core of the DAMON is address space independent. The address space +specific low level primitive parts including monitoring target regions +constructions and actual access checks can be implemented and configured on the +DAMON core by the users. In this way, DAMON users can monitor any address +space with any access check technique. + +Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based +implementations of the address space dependent functions for the virtual memory +by default, for a reference and convenient use. In near future, we will +provide those for physical memory address space. + + +Can I simply monitor page granularity? +====================================== + +Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the +working set size divided by the page size. Because the monitoring target +regions size is forced to be ``>=page size``, the region split will make no +effect. diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst new file mode 100644 index 0000000000000..a2858baf3bf1d --- /dev/null +++ b/Documentation/vm/damon/index.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +DAMON: Data Access MONitor +========================== + +DAMON is a data access monitoring framework subsystem for the Linux kernel. 
+The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it + + - *accurate* (the monitoring output is useful enough for DRAM level memory + management; It might not appropriate for CPU Cache levels, though), + - *light-weight* (the monitoring overhead is low enough to be applied online), + and + - *scalable* (the upper-bound of the overhead is in constant range regardless + of the size of target workloads). + +Using this framework, therefore, the kernel's memory management mechanisms can +make advanced decisions. Experimental memory management optimization works +that incurring high data accesses monitoring overhead could implemented again. +In user space, meanwhile, users who have some special workloads can write +personalized applications for better understanding and optimizations of their +workloads and systems. + +.. toctree:: + :maxdepth: 2 + + faq + design + api + plans diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index eff5fbd492d08..b51f0d8992f8f 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -32,6 +32,7 @@ descriptions of data structures and algorithms. arch_pgtable_helpers balance cleancache + damon/index free_page_reporting frontswap highmem From 0d479f3cdc30cfa53e88bca8f9cf752e4cde91b6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:57:09 -0700 Subject: [PATCH 277/737] mm/damon: add kunit tests This commit adds kunit based unit tests for the core and the virtual address spaces monitoring primitives of DAMON. Link: https://lkml.kernel.org/r/20210716081449.22187-12-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Fernand Sieber Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 36 +++++ mm/damon/core-test.h | 253 ++++++++++++++++++++++++++++++++ mm/damon/core.c | 7 + mm/damon/dbgfs-test.h | 126 ++++++++++++++++ mm/damon/dbgfs.c | 2 + mm/damon/vaddr-test.h | 329 ++++++++++++++++++++++++++++++++++++++++++ mm/damon/vaddr.c | 7 + 7 files changed, 760 insertions(+) create mode 100644 mm/damon/core-test.h create mode 100644 mm/damon/dbgfs-test.h create mode 100644 mm/damon/vaddr-test.h diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index c8e3dba6fb4cf..37024798a97ca 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,6 +12,18 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more information. +config DAMON_KUNIT_TEST + bool "Test for damon" if !KUNIT_ALL_TESTS + depends on DAMON && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_VADDR bool "Data access monitoring primitives for virtual address spaces" depends on DAMON && MMU @@ -20,6 +32,18 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces. 
+config DAMON_VADDR_KUNIT_TEST + bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + depends on DAMON_VADDR && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON virtual addresses primitives Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_DBGFS bool "DAMON debugfs interface" depends on DAMON_VADDR && DEBUG_FS @@ -29,4 +53,16 @@ config DAMON_DBGFS If unsure, say N. +config DAMON_DBGFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_DBGFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON debugfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + endmenu diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h new file mode 100644 index 0000000000000..c938a9c34e6c5 --- /dev/null +++ b/mm/damon/core-test.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_KUNIT_TEST + +#ifndef _DAMON_CORE_TEST_H +#define _DAMON_CORE_TEST_H + +#include + +static void damon_test_regions(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + + r = damon_new_region(1, 2); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); + + damon_del_region(r, t); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_free_target(t); +} + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static void damon_test_target(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 42ul, t->id); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_add_target(c, t); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); + + damon_destroy_target(t); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_destroy_ctx(c); +} + +/* + * Test kdamond_reset_aggregated() + * + * DAMON checks access to each region and aggregates this information as the + * access frequency of each region. In detail, it increases '->nr_accesses' of + * regions that an access has confirmed. 'kdamond_reset_aggregated()' flushes + * the aggregated information ('->nr_accesses' of each regions) to the result + * buffer. As a result of the flushing, the '->nr_accesses' of regions are + * initialized to zero. 
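+ *
+ * The test below builds three targets with three regions each, gives the
+ * regions nonzero '->nr_accesses', calls 'kdamond_reset_aggregated()', and
+ * then checks that the counters are zeroed while the regions and targets
+ * themselves are preserved.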
+ */ +static void damon_test_aggregate(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long target_ids[] = {1, 2, 3}; + unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; + unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; + unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; + struct damon_target *t; + struct damon_region *r; + int it, ir; + + damon_set_targets(ctx, target_ids, 3); + + it = 0; + damon_for_each_target(t, ctx) { + for (ir = 0; ir < 3; ir++) { + r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + r->nr_accesses = accesses[it][ir]; + damon_add_region(r, t); + } + it++; + } + kdamond_reset_aggregated(ctx); + it = 0; + damon_for_each_target(t, ctx) { + ir = 0; + /* '->nr_accesses' should be zeroed */ + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + ir++; + } + /* regions should be preserved */ + KUNIT_EXPECT_EQ(test, 3, ir); + it++; + } + /* targets also should be preserved */ + KUNIT_EXPECT_EQ(test, 3, it); + + damon_destroy_ctx(ctx); +} + +static void damon_test_split_at(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + damon_add_region(r, t); + damon_split_region_at(c, t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); + + r = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_merge_two(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2, *r3; + int i; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + r->nr_accesses = 10; + damon_add_region(r, t); + r2 = damon_new_region(100, 300); + r2->nr_accesses = 20; + damon_add_region(r2, t); + + damon_merge_two_regions(t, r, r2); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + + i = 0; + damon_for_each_region(r3, t) { + KUNIT_EXPECT_PTR_EQ(test, r, r3); + i++; + } + KUNIT_EXPECT_EQ(test, i, 1); + + damon_free_target(t); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +static void damon_test_merge_regions_of(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + + unsigned long saddrs[] = {0, 114, 130, 156, 170}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + int i; + + t = damon_new_target(42); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } + + damon_merge_regions_of(t, 9, 9999); + /* 0-112, 114-130, 130-156, 156-170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + for (i = 0; i < 5; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); + KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); + } + damon_free_target(t); +} + +static void damon_test_split_regions_of(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = 
damon_new_target(42); + r = damon_new_region(0, 22); + damon_add_region(r, t); + damon_split_regions_of(c, t, 2); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(0, 220); + damon_add_region(r, t); + damon_split_regions_of(c, t, 4); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u); + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_target), + KUNIT_CASE(damon_test_regions), + KUNIT_CASE(damon_test_aggregate), + KUNIT_CASE(damon_test_split_at), + KUNIT_CASE(damon_test_merge_two), + KUNIT_CASE(damon_test_merge_regions_of), + KUNIT_CASE(damon_test_split_regions_of), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_CORE_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 59033488402e8..30e9211f494a7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -16,6 +16,11 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_DAMON_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l)) @@ -711,3 +716,5 @@ static int kdamond_fn(void *data) do_exit(0); } + +#include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h new file mode 100644 index 0000000000000..930e83bceef03 --- /dev/null +++ b/mm/damon/dbgfs-test.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON Debugfs Interface Unit Tests + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST + +#ifndef _DAMON_DBGFS_TEST_H +#define _DAMON_DBGFS_TEST_H + +#include + +static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +{ + char *question; + unsigned long *answers; + unsigned long expected[] = {12, 35, 46}; + ssize_t nr_integers = 0, i; + + question = "123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "123abc"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "a123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "12 35"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 abc 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < 2; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = ""; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + 
kfree(answers); + + question = "\n"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); +} + +static void damon_dbgfs_test_set_targets(struct kunit *test) +{ + struct damon_ctx *ctx = dbgfs_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + char buf[64]; + + /* Make DAMON consider target id as plain number */ + ctx->primitive.target_valid = NULL; + ctx->primitive.cleanup = NULL; + + damon_set_targets(ctx, ids, 3); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); + + damon_set_targets(ctx, (unsigned long []){2}, 1); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_destroy_ctx(ctx); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_set_targets), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-dbgfs", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 31ad550ecba2d..faee070977d80 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -619,3 +619,5 @@ static int __init damon_dbgfs_init(void) } module_init(damon_dbgfs_init); + +#include "dbgfs-test.h" diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h new file mode 100644 index 0000000000000..1f5c13257dbaf --- /dev/null +++ b/mm/damon/vaddr-test.h @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST + +#ifndef _DAMON_VADDR_TEST_H +#define _DAMON_VADDR_TEST_H + +#include + +static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +{ + int i, j; + unsigned long largest_gap, gap; + + if (!nr_vmas) + return; + + for (i = 0; i < nr_vmas - 1; i++) { + vmas[i].vm_next = &vmas[i + 1]; + + vmas[i].vm_rb.rb_left = NULL; + vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; + + largest_gap = 0; + for (j = i; j < nr_vmas; j++) { + if (j == 0) + continue; + gap = vmas[j].vm_start - vmas[j - 1].vm_end; + if (gap > largest_gap) + largest_gap = gap; + } + vmas[i].rb_subtree_gap = largest_gap; + } + vmas[i].vm_next = NULL; + vmas[i].vm_rb.rb_right = NULL; + vmas[i].rb_subtree_gap = 0; +} + +/* + * Test __damon_va_three_regions() function + * + * In case of virtual memory address spaces monitoring, DAMON converts the + * complex and dynamic memory mappings of each target task to three + * discontiguous regions which cover every mapped areas. However, the three + * regions should not include the two biggest unmapped areas in the original + * mapping, because the two biggest areas are normally the areas between 1) + * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack. + * Because these two unmapped areas are very huge but obviously never accessed, + * covering the region is just a waste. 
+ * + * '__damon_va_three_regions() receives an address space of a process. It + * first identifies the start of mappings, end of mappings, and the two biggest + * unmapped areas. After that, based on the information, it constructs the + * three regions and returns. For more detail, refer to the comment of + * 'damon_init_regions_of()' function definition in 'mm/damon.c' file. + * + * For example, suppose virtual address ranges of 10-20, 20-25, 200-210, + * 210-220, 300-305, and 307-330 (Other comments represent this mappings in + * more short form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are + * mapped. To cover every mappings, the three regions should start with 10, + * and end with 305. The process also has three unmapped areas, 25-200, + * 220-300, and 305-307. Among those, 25-200 and 220-300 are the biggest two + * unmapped areas, and thus it should be converted to three regions of 10-25, + * 200-220, and 300-330. + */ +static void damon_test_three_regions_in_vmas(struct kunit *test) +{ + struct damon_addr_range regions[3] = {0,}; + /* 10-20-25, 200-210-220, 300-305, 307-330 */ + struct vm_area_struct vmas[] = { + (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, + (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, + (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, + (struct vm_area_struct) {.vm_start = 210, .vm_end = 220}, + (struct vm_area_struct) {.vm_start = 300, .vm_end = 305}, + (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, + }; + + __link_vmas(vmas, 6); + + __damon_va_three_regions(&vmas[0], regions); + + KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); + KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); + KUNIT_EXPECT_EQ(test, 200ul, regions[1].start); + KUNIT_EXPECT_EQ(test, 220ul, regions[1].end); + KUNIT_EXPECT_EQ(test, 300ul, regions[2].start); + KUNIT_EXPECT_EQ(test, 330ul, regions[2].end); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +/* + * Test 'damon_va_apply_three_regions()' + * + * test kunit object + * regions an array containing start/end addresses of current + * monitoring target regions + * nr_regions the number of the addresses in 'regions' + * three_regions The three regions that need to be applied now + * expected start/end addresses of monitoring target regions that + * 'three_regions' are applied + * nr_expected the number of addresses in 'expected' + * + * The memory mapping of the target processes changes dynamically. To follow + * the change, DAMON periodically reads the mappings, simplifies it to the + * three regions, and updates the monitoring target regions to fit in the three + * regions. The update of current target regions is the role of + * 'damon_va_apply_three_regions()'. + * + * This test passes the given target regions and the new three regions that + * need to be applied to the function and check whether it updates the regions + * as expected. 
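+ *
+ * Note that 'regions' and 'expected' are flat arrays of {start, end} pairs,
+ * so 'nr_regions' and 'nr_expected' are the numbers of array elements, i.e.,
+ * twice the numbers of regions they describe.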
+ */ +static void damon_do_test_apply_three_regions(struct kunit *test, + unsigned long *regions, int nr_regions, + struct damon_addr_range *three_regions, + unsigned long *expected, int nr_expected) +{ + struct damon_ctx *ctx = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(42); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + damon_add_region(r, t); + } + damon_add_target(ctx, t); + + damon_va_apply_three_regions(t, three_regions); + + for (i = 0; i < nr_expected / 2; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); + KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); + } + + damon_destroy_ctx(ctx); +} + +/* + * This function test most common case where the three big regions are only + * slightly changed. Target regions should adjust their boundary (10-20-30, + * 50-55, 70-80, 90-100) to fit with the new big regions or remove target + * regions (57-79) that now out of the three regions. + */ +static void damon_test_apply_three_regions1(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 45-55, 73-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 45, .end = 55}, + (struct damon_addr_range){.start = 73, .end = 104} }; + /* 5-20-27, 45-55, 73-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 45, 55, + 73, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test slightly bigger change. Similar to above, but the second big region + * now require two target regions (50-55, 57-59) to be removed. + */ +static void damon_test_apply_three_regions2(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 56-57, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 56, .end = 57}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 56-57, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 56, 57, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test a big change. The second big region has totally freed and mapped to + * different area (50-59 -> 61-63). The target regions which were in the old + * second big region (50-55-57-59) should be removed and new target region + * covering the second big region (61-63) should be created. 
+ */ +static void damon_test_apply_three_regions3(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 61-63, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 61, .end = 63}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 61-63, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 61, 63, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test another big change. Both of the second and third big regions (50-59 + * and 70-100) has totally freed and mapped to different area (30-32 and + * 65-68). The target regions which were in the old second and third big + * regions should now be removed and new target regions covering the new second + * and third big regions should be crated. + */ +static void damon_test_apply_three_regions4(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-7, 30-32, 65-68 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 7}, + (struct damon_addr_range){.start = 30, .end = 32}, + (struct damon_addr_range){.start = 65, .end = 68} }; + /* expect 5-7, 30-32, 65-68 */ + unsigned long expected[] = {5, 7, 30, 32, 65, 68}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + unsigned long i; + + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + t = damon_new_target(42); + r = damon_new_region(0, 100); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u); + + i = 0; + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10); + KUNIT_EXPECT_EQ(test, r->ar.end, i * 10); + } + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 59); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + + i = 0; + damon_for_each_region(r, t) { + if (i == 4) + break; + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++); + KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i); + } + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i); + KUNIT_EXPECT_EQ(test, r->ar.end, 59ul); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 6); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); + + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, 5ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 6ul); + } + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_three_regions_in_vmas), + KUNIT_CASE(damon_test_apply_three_regions1), + KUNIT_CASE(damon_test_apply_three_regions2), + KUNIT_CASE(damon_test_apply_three_regions3), + 
KUNIT_CASE(damon_test_apply_three_regions4), + KUNIT_CASE(damon_test_split_evenly), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-primitives", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_VADDR_TEST_H */ + +#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 897aa8cf96c83..58c1fb2aafa91 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -18,6 +18,11 @@ #include #include +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l)) @@ -663,3 +668,5 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; } + +#include "vaddr-test.h" From 027cf4593f2be196f013b4173b2623cd7b5850e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:57:13 -0700 Subject: [PATCH 278/737] mm/damon: add user space selftests This commit adds a simple user space tests for DAMON. The tests are using kselftest framework. Link: https://lkml.kernel.org/r/20210716081449.22187-13-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Markus Boehme Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Fernand Sieber Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/Makefile | 7 ++ .../selftests/damon/_chk_dependency.sh | 28 +++++++ .../testing/selftests/damon/debugfs_attrs.sh | 75 +++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 tools/testing/selftests/damon/Makefile create mode 100644 tools/testing/selftests/damon/_chk_dependency.sh create mode 100644 tools/testing/selftests/damon/debugfs_attrs.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile new file mode 100644 index 0000000000000..8a3f2cd9fec0c --- /dev/null +++ b/tools/testing/selftests/damon/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for damon selftests + +TEST_FILES = _chk_dependency.sh +TEST_PROGS = debugfs_attrs.sh + +include ../lib.mk diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh new file mode 100644 index 0000000000000..0189db81550be --- /dev/null +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +DBGFS=/sys/kernel/debug/damon + +if [ $EUID -ne 0 ]; +then + echo "Run as root" + exit $ksft_skip +fi + +if [ ! -d "$DBGFS" ] +then + echo "$DBGFS not found" + exit $ksft_skip +fi + +for f in attrs target_ids monitor_on +do + if [ ! 
-f "$DBGFS/$f" ] + then + echo "$f not found" + exit 1 + fi +done diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh new file mode 100644 index 0000000000000..bfabb19dc0d3d --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +test_write_result() { + file=$1 + content=$2 + orig_content=$3 + expect_reason=$4 + expected=$5 + + echo "$content" > "$file" + if [ $? -ne "$expected" ] + then + echo "writing $content to $file doesn't return $expected" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +test_write_succ() { + test_write_result "$1" "$2" "$3" "$4" 0 +} + +test_write_fail() { + test_write_result "$1" "$2" "$3" "$4" 1 +} + +test_content() { + file=$1 + orig_content=$2 + expected=$3 + expect_reason=$4 + + content=$(cat "$file") + if [ "$content" != "$expected" ] + then + echo "reading $file expected $expected but $content" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +source ./_chk_dependency.sh + +# Test attrs file +# =============== + +file="$DBGFS/attrs" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" +test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" +test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ + "min_nr_regions > max_nr_regions" +test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" +echo "$orig_content" > "$file" + +# Test target_ids file +# ==================== + +file="$DBGFS/target_ids" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" +test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" +test_content "$file" "$orig_content" "1 2" "non-integer was there" +test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" +test_content "$file" "$orig_content" "" "wrong input written" +test_write_succ "$file" "" "$orig_content" "empty input" +test_content "$file" "$orig_content" "" "empty input written" +echo "$orig_content" > "$file" + +echo "PASS" From 2115ba5150fb5a0a5869fae07d5d967a72eea9fa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:57:17 -0700 Subject: [PATCH 279/737] MAINTAINERS: update for DAMON This commit updates MAINTAINERS file for DAMON related files. 
Link: https://lkml.kernel.org/r/20210716081449.22187-14-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Markus Boehme Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Fernand Sieber Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 67f55f85d2044..e6e4093db8823 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4872,6 +4872,17 @@ F: net/ax25/ax25_out.c F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c +DATA ACCESS MONITOR +M: SeongJae Park +L: linux-mm@kvack.org +S: Maintained +F: Documentation/admin-guide/mm/damon/ +F: Documentation/vm/damon/ +F: include/linux/damon.h +F: include/trace/events/damon.h +F: mm/damon/ +F: tools/testing/selftests/damon/ + DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan From 117ec1c2271380e8e61c303dc5ca0d93c06891c1 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Fri, 24 Sep 2021 15:43:26 -0700 Subject: [PATCH 280/737] mm/damon: don't use strnlen() with known-bogus source length gcc knows the true length too, and rightfully complains. Link: https://lkml.kernel.org/r/20210912204447.10427-1-kilobyte@angband.pl Signed-off-by: Adam Borowski Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs-test.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 930e83bceef03..4eddcfa73996f 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -20,27 +20,27 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); KUNIT_EXPECT_EQ(test, 123ul, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); KUNIT_EXPECT_EQ(test, 123ul, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) @@ -48,7 +48,7 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) @@ -56,7 +56,7 @@ static void 
damon_dbgfs_test_str_to_target_ids(struct kunit *test) kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) @@ -64,13 +64,13 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) kfree(answers); question = ""; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strnlen(question, 128), + answers = str_to_target_ids(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); From be5c6bd6f999a5c9c8bb5e51c7724d2f40fd015a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Oct 2021 14:36:33 -0700 Subject: [PATCH 281/737] mm/damon/core-test: fix wrong expectations for 'damon_split_regions_of()' Kunit test cases for 'damon_split_regions_of()' expects the number of regions after calling the function will be same to their request ('nr_sub'). However, the requested number is just an upper-limit, because the function randomly decides the size of each sub-region. This fixes the wrong expectation. Link: https://lkml.kernel.org/r/20211028090628.14948-1-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core-test.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index c938a9c34e6c5..7008c3735e99f 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test) r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(42); r = damon_new_region(0, 220); damon_add_region(r, t); damon_split_regions_of(c, t, 4); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); } From 971f571e1628703bf8b5201f3ad14b974d9adc33 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 5 Nov 2021 13:45:52 -0700 Subject: [PATCH 282/737] mm/damon: grammar s/works/work/ Correct a singular versus plural grammar mistake in the help text for the DAMON_VADDR config symbol. Link: https://lkml.kernel.org/r/20210914073451.3883834-1-geert@linux-m68k.org Fixes: 3f49584b262cf8f4 ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: Geert Uytterhoeven Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 37024798a97ca..ba8898c7eb8eb 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -30,7 +30,7 @@ config DAMON_VADDR select PAGE_IDLE_FLAG help This builds the default data access monitoring primitives for DAMON - that works for virtual address spaces. + that work for virtual address spaces. 
config DAMON_VADDR_KUNIT_TEST bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS From 4e776356ac902021faa0dcbb37095b03807d45c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:04 -0700 Subject: [PATCH 283/737] include/linux/damon.h: fix kernel-doc comments for 'damon_callback' A few Kernel-doc comments in 'damon.h' are broken. This fixes them. Link: https://lkml.kernel.org/r/20210917123958.3819-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d68b67b8d458d..755d70804705b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -62,7 +62,7 @@ struct damon_target { struct damon_ctx; /** - * struct damon_primitive Monitoring primitives for given use cases. + * struct damon_primitive - Monitoring primitives for given use cases. * * @init: Initialize primitive-internal data structures. * @update: Update primitive-internal data structures. @@ -108,8 +108,8 @@ struct damon_primitive { void (*cleanup)(struct damon_ctx *context); }; -/* - * struct damon_callback Monitoring events notification callbacks. +/** + * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. * @after_sampling: Called after each sampling. From 65d93aa71c0e2947e6efc7423005e040890692b5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:06 -0700 Subject: [PATCH 284/737] mm/damon/core: print kdamond start log in debug mode only Logging of kdamond startup is using 'pr_info()' unnecessarily. This makes it to use 'pr_debug()' instead. Link: https://lkml.kernel.org/r/20210917123958.3819-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 30e9211f494a7..874558a790a0f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -653,7 +653,7 @@ static int kdamond_fn(void *data) unsigned long sz_limit = 0; mutex_lock(&ctx->kdamond_lock); - pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + pr_debug("kdamond (%d) starts\n", ctx->kdamond->pid); mutex_unlock(&ctx->kdamond_lock); if (ctx->primitive.init) From 7de10ea130f929b34cb4b15b3a042e8428d83fdb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:46:09 -0700 Subject: [PATCH 285/737] mm/damon: remove unnecessary do_exit() from kdamond Just return from the kthread function. Link: https://lkml.kernel.org/r/20210927232421.17694-1-changbin.du@gmail.com Signed-off-by: Changbin Du Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 874558a790a0f..61a9e3b37bc9d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -714,7 +714,7 @@ static int kdamond_fn(void *data) nr_running_ctxs--; mutex_unlock(&damon_lock); - do_exit(0); + return 0; } #include "core-test.h" From 3aa179ea5cae54853f51e9bc2a0eda97cc9b641f Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:46:12 -0700 Subject: [PATCH 286/737] mm/damon: needn't hold kdamond_lock to print pid of kdamond Just get the pid by 'current->pid'. 
Meanwhile, to be symmetrical make the 'starts' and 'finishes' logs both use debug level. Link: https://lkml.kernel.org/r/20210927232432.17750-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 61a9e3b37bc9d..8171e7dddc309 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -652,9 +652,7 @@ static int kdamond_fn(void *data) unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - mutex_lock(&ctx->kdamond_lock); - pr_debug("kdamond (%d) starts\n", ctx->kdamond->pid); - mutex_unlock(&ctx->kdamond_lock); + pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->primitive.init) ctx->primitive.init(ctx); @@ -705,7 +703,7 @@ static int kdamond_fn(void *data) if (ctx->primitive.cleanup) ctx->primitive.cleanup(ctx); - pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); From 06ec7cf96919c6c471d0295e4f7efc8ad43e30d8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:46:15 -0700 Subject: [PATCH 287/737] mm/damon/core: nullify pointer ctx->kdamond with a NULL Currently a plain integer is being used to nullify the pointer ctx->kdamond. Use NULL instead. Cleans up sparse warning: mm/damon/core.c:317:40: warning: Using plain integer as NULL pointer Link: https://lkml.kernel.org/r/20210925215908.181226-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8171e7dddc309..d993db50280cb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -314,7 +314,7 @@ static int __damon_start(struct damon_ctx *ctx) nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { err = PTR_ERR(ctx->kdamond); - ctx->kdamond = 0; + ctx->kdamond = NULL; } } mutex_unlock(&ctx->kdamond_lock); From a55d0e4ce4c5d198d3128b45c877106f88150e41 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:18 -0700 Subject: [PATCH 288/737] mm/damon/core: account age of target regions Patch series "Implement Data Access Monitoring-based Memory Operation Schemes". Introduction ============ DAMON[1] can be used as a primitive for data access aware memory management optimizations. For that, users who want such optimizations should run DAMON, read the monitoring results, analyze it, plan a new memory management scheme, and apply the new scheme by themselves. Such efforts will be inevitable for some complicated optimizations. However, in many other cases, the users would simply want the system to apply a memory management action to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB keeping only rare accesses more than 2 minutes", or "Do not use THP for a memory region larger than 2 MiB rarely accessed for more than 1 seconds". To make the works easier and non-redundant, this patchset implements a new feature of DAMON, which is called Data Access Monitoring-based Operation Schemes (DAMOS). Using the feature, users can describe the normal schemes in a simple way and ask DAMON to execute those on its own. 
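As a rough illustration only (using the 'struct damos' fields introduced later
in this series, assuming <linux/damon.h> is included, and leaving out how the
scheme is attached to a monitoring context), a kernel space user could describe
such a scheme as below:

    /*
     * Illustrative sketch: page out regions of at least 100 MiB that
     * stayed completely unaccessed for long enough.
     */
    struct damos cold_pageout = {
            .min_sz_region   = 100 * 1024 * 1024,
            .max_sz_region   = ULONG_MAX,
            .min_nr_accesses = 0,
            .max_nr_accesses = 0,
            .min_age_region  = 600,        /* illustrative threshold */
            .max_age_region  = UINT_MAX,
            .action          = DAMOS_PAGEOUT,
    };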
[1] https://damonitor.github.io Evaluations =========== DAMOS is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of residential sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concepts. Please refer to the showcase web site's evaluation document[1] for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/v34/vm/damon/eval.html Long-term Support Trees ----------------------- For people who want to test DAMON but using LTS kernels, there are another couple of trees based on two latest LTS kernels respectively and containing the 'damon/master' backports. - For v5.4.y: https://git.kernel.org/sj/h/damon/for-v5.4.y - For v5.10.y: https://git.kernel.org/sj/h/damon/for-v5.10.y Sequence Of Patches =================== The 1st patch accounts age of each region. The 2nd patch implements the core of the DAMON-based operation schemes feature. The 3rd patch makes the default monitoring primitives for virtual address spaces to support the schemes. From this point, the kernel space users can use DAMOS. The 4th patch exports the feature to the user space via the debugfs interface. The 5th patch implements schemes statistics feature for easier tuning of the schemes and runtime access pattern analysis, and the 6th patch adds selftests for these changes. Finally, the 7th patch documents this new feature. This patch (of 7): DAMON can be used for data access pattern aware memory management optimizations. For that, users should run DAMON, read the monitoring results, analyze it, plan a new memory management scheme, and apply the new scheme by themselves. It would not be too hard, but still require some level of effort. For complicated cases, this effort is inevitable. That said, in many cases, users would simply want to apply an actions to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB but having a low access frequency more than 10 minutes", or "Use THP for a memory region larger than 2 MiB having a high access frequency for more than 2 seconds". For such optimizations, users will need to first account the age of each region themselves. To reduce such efforts, this implements a simple age account of each region in DAMON. For each aggregation step, DAMON compares the access frequency with that from last aggregation and reset the age of the region if the change is significant. Else, the age is incremented. Also, in case of the merge of regions, the region size-weighted average of the ages is set as the age of merged new region. 
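For example (with illustrative numbers only), merging a 10 MiB region of age 4
with a 30 MiB region of age 8 gives the merged region an age of
(10 * 4 + 30 * 8) / (10 + 30) = 7, following the size-weighted averaging
described above.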
Link: https://lkml.kernel.org/r/20211001125604.29660-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211001125604.29660-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rienjes Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ++++++++++ mm/damon/core.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 755d70804705b..3e8215debbd47 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -31,12 +31,22 @@ struct damon_addr_range { * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. * @list: List head for siblings. + * @age: Age of this region. + * + * @age is initially zero, increased for each aggregation interval, and reset + * to zero again if the access frequency is significantly changed. If two + * regions are merged into a new region, both @nr_accesses and @age of the new + * region are set as region size-weighted average of those of the two regions. */ struct damon_region { struct damon_addr_range ar; unsigned long sampling_addr; unsigned int nr_accesses; struct list_head list; + + unsigned int age; +/* private: Internal value for age calculation. */ + unsigned int last_nr_accesses; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index d993db50280cb..3efbe80779db2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -45,6 +45,9 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->nr_accesses = 0; INIT_LIST_HEAD(®ion->list); + region->age = 0; + region->last_nr_accesses = 0; + return region; } @@ -444,6 +447,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(t, r, damon_nr_regions(t)); + r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } } @@ -461,6 +465,7 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); + l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; damon_destroy_region(r, t); } @@ -480,6 +485,11 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { + if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + r->age = 0; + else + r->age++; + if (prev && prev->ar.end == r->ar.start && diff_of(prev->nr_accesses, r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) @@ -527,6 +537,9 @@ static void damon_split_region_at(struct damon_ctx *ctx, r->ar.end = new->ar.start; + new->age = r->age; + new->last_nr_accesses = r->last_nr_accesses; + damon_insert_region(new, r, damon_next_region(r), t); } From 0d8b5fead65c8a586b3f8a4faf0c6fa753f8e91f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:22 -0700 Subject: [PATCH 289/737] mm/damon/core: implement DAMON-based Operation Schemes (DAMOS) In many cases, users might use DAMON for simple data access aware memory management optimizations such as applying an operation scheme to a memory region of a specific size having a specific access frequency for a specific time. 
For example, "page out a memory region larger than 100 MiB but having a low access frequency more than 10 minutes", or "Use THP for a memory region larger than 2 MiB having a high access frequency for more than 2 seconds". Most simple form of the solution would be doing offline data access pattern profiling using DAMON and modifying the application source code or system configuration based on the profiling results. Or, developing a daemon constructed with two modules (one for access monitoring and the other for applying memory management actions via mlock(), madvise(), sysctl, etc) is imaginable. To avoid users spending their time for implementation of such simple data access monitoring-based operation schemes, this makes DAMON to handle such schemes directly. With this change, users can simply specify their desired schemes to DAMON. Then, DAMON will automatically apply the schemes to the user-specified target processes. Each of the schemes is composed with conditions for filtering of the target memory regions and desired memory management action for the target. Specifically, the format is:: The filtering conditions are size of memory region, number of accesses to the region monitored by DAMON, and the age of the region. The age of region is incremented periodically but reset when its addresses or access frequency has significantly changed or the action of a scheme was applied. For the action, current implementation supports a few of madvise()-like hints, ``WILLNEED``, ``COLD``, ``PAGEOUT``, ``HUGEPAGE``, and ``NOHUGEPAGE``. Because DAMON supports various address spaces and application of the actions to a monitoring target region is dependent to the type of the target address space, the application code should be implemented by each primitives and registered to the framework. Note that this only implements the framework part. Following commit will implement the action applications for virtual address spaces primitives. Link: https://lkml.kernel.org/r/20211001125604.29660-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 66 +++++++++++++++++++++++++ mm/damon/core.c | 109 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e8215debbd47..dbe18b0fb795c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -69,6 +69,48 @@ struct damon_target { struct list_head list; }; +/** + * enum damos_action - Represents an action of a Data Access Monitoring-based + * Operation Scheme. + * + * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. + * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. + * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. + * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + */ +enum damos_action { + DAMOS_WILLNEED, + DAMOS_COLD, + DAMOS_PAGEOUT, + DAMOS_HUGEPAGE, + DAMOS_NOHUGEPAGE, +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @min_sz_region: Minimum size of target regions. + * @max_sz_region: Maximum size of target regions. 
+ * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. + * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. + * @min_age_region: Minimum age of target regions. + * @max_age_region: Maximum age of target regions. + * @action: &damo_action to be applied to the target regions. + * @list: List head for siblings. + * + * Note that both the minimums and the maximums are inclusive. + */ +struct damos { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; + enum damos_action action; + struct list_head list; +}; + struct damon_ctx; /** @@ -79,6 +121,7 @@ struct damon_ctx; * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. * @@ -104,6 +147,9 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @apply_scheme is called from @kdamond when a region for user provided + * DAMON-based operation scheme is found. It should apply the scheme's action + * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -114,6 +160,8 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; @@ -192,6 +240,7 @@ struct damon_callback { * @min_nr_regions: The minimum number of adaptive monitoring regions. * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. + * @schemes: Head of schemes (&damos) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -213,6 +262,7 @@ struct damon_ctx { unsigned long min_nr_regions; unsigned long max_nr_regions; struct list_head adaptive_targets; + struct list_head schemes; }; #define damon_next_region(r) \ @@ -233,6 +283,12 @@ struct damon_ctx { #define damon_for_each_target_safe(t, next, ctx) \ list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) +#define damon_for_each_scheme(s, ctx) \ + list_for_each_entry(s, &(ctx)->schemes, list) + +#define damon_for_each_scheme_safe(s, next, ctx) \ + list_for_each_entry_safe(s, next, &(ctx)->schemes, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -242,6 +298,14 @@ inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action); +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); +void damon_destroy_scheme(struct damos *s); + struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); @@ -255,6 +319,8 @@ int damon_set_targets(struct damon_ctx *ctx, int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_schemes(struct damon_ctx *ctx, + struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3efbe80779db2..0ed97b21cbb6e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -85,6 +85,50 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action) +{ + struct damos *scheme; + + scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); + if (!scheme) + return NULL; + scheme->min_sz_region = min_sz_region; + scheme->max_sz_region = max_sz_region; + scheme->min_nr_accesses = min_nr_accesses; + scheme->max_nr_accesses = max_nr_accesses; + scheme->min_age_region = min_age_region; + scheme->max_age_region = max_age_region; + scheme->action = action; + INIT_LIST_HEAD(&scheme->list); + + return scheme; +} + +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) +{ + list_add_tail(&s->list, &ctx->schemes); +} + +static void damon_del_scheme(struct damos *s) +{ + list_del(&s->list); +} + +static void damon_free_scheme(struct damos *s) +{ + kfree(s); +} + +void damon_destroy_scheme(struct damos *s) +{ + damon_del_scheme(s); + damon_free_scheme(s); +} + /* * Construct a damon_target struct * @@ -156,6 +200,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); + INIT_LIST_HEAD(&ctx->schemes); return ctx; } @@ -175,7 +220,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx) void damon_destroy_ctx(struct damon_ctx *ctx) { + struct damos *s, *next_s; + 
damon_destroy_targets(ctx); + + damon_for_each_scheme_safe(s, next_s, ctx) + damon_destroy_scheme(s); + kfree(ctx); } @@ -250,6 +301,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_set_schemes() - Set data access monitoring based operation schemes. + * @ctx: monitoring context + * @schemes: array of the schemes + * @nr_schemes: number of entries in @schemes + * + * This function should not be called while the kdamond of the context is + * running. + * + * Return: 0 if success, or negative error code otherwise. + */ +int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, + ssize_t nr_schemes) +{ + struct damos *s, *next; + ssize_t i; + + damon_for_each_scheme_safe(s, next, ctx) + damon_destroy_scheme(s); + for (i = 0; i < nr_schemes; i++) + damon_add_scheme(ctx, schemes[i]); + return 0; +} + /** * damon_nr_running_ctxs() - Return number of currently running contexts. */ @@ -453,6 +528,39 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +static void damon_do_apply_schemes(struct damon_ctx *c, + struct damon_target *t, + struct damon_region *r) +{ + struct damos *s; + unsigned long sz; + + damon_for_each_scheme(s, c) { + sz = r->ar.end - r->ar.start; + if (sz < s->min_sz_region || s->max_sz_region < sz) + continue; + if (r->nr_accesses < s->min_nr_accesses || + s->max_nr_accesses < r->nr_accesses) + continue; + if (r->age < s->min_age_region || s->max_age_region < r->age) + continue; + if (c->primitive.apply_scheme) + c->primitive.apply_scheme(c, t, r, s); + r->age = 0; + } +} + +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) + damon_do_apply_schemes(c, t, r); + } +} + #define sz_damon_region(r) (r->ar.end - r->ar.start) /* @@ -693,6 +801,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) From cc25636a9b79c92fcbdf8dafe3427b9490112a0c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:25 -0700 Subject: [PATCH 290/737] mm/damon/vaddr: support DAMON-based Operation Schemes This makes DAMON's default primitives for virtual address spaces to support DAMON-based Operation Schemes (DAMOS) by implementing actions application functions and registering it to the monitoring context. The implementation simply links 'madvise()' for related DAMOS actions. That is, 'madvise(MADV_WILLNEED)' is called for 'WILLNEED' DAMOS action and similar for other actions ('COLD', 'PAGEOUT', 'HUGEPAGE', 'NOHUGEPAGE'). So, the kernel space DAMON users can now use the DAMON-based optimizations with only small amount of code. 
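For reference, a kernel space user of these two patches would look roughly like the sketch below. This is an illustrative sketch only, not code from the series: the target ids are assumed to be already validated, the size and age thresholds are invented example values (6000 aggregation intervals is about 10 minutes assuming a 100 ms aggregation interval), and most error unwinding is omitted.

    #include <linux/damon.h>
    #include <linux/errno.h>
    #include <linux/limits.h>
    #include <linux/types.h>

    /*
     * Sketch: page out regions of at least 100 MiB that showed no
     * accesses for at least 6000 aggregation intervals.
     */
    static struct damon_ctx *example_ctx;

    static int damos_pageout_example(unsigned long *target_ids,
                                     ssize_t nr_targets)
    {
            struct damos *scheme;
            int err;

            example_ctx = damon_new_ctx();
            if (!example_ctx)
                    return -ENOMEM;
            damon_va_set_primitives(example_ctx);

            err = damon_set_targets(example_ctx, target_ids, nr_targets);
            if (err)
                    return err;

            /* min/max size, min/max nr_accesses, min/max age, action */
            scheme = damon_new_scheme(100 * 1024 * 1024, ULONG_MAX,
                                      0, 0, 6000, UINT_MAX, DAMOS_PAGEOUT);
            if (!scheme)
                    return -ENOMEM;
            err = damon_set_schemes(example_ctx, &scheme, 1);
            if (err)
                    return err;

            return damon_start(&example_ctx, 1);
    }

The debugfs interface added by the following patch drives the same damon_new_scheme() and damon_set_schemes() calls from user supplied strings.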
Link: https://lkml.kernel.org/r/20211001125604.29660-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/vaddr.c | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index dbe18b0fb795c..be6b6e81e8ee1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -337,6 +337,8 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); +int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 58c1fb2aafa91..3e1c74d36bab7 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "damon-va: " fmt +#include #include #include #include @@ -658,6 +659,60 @@ bool damon_va_target_valid(void *target) return false; } +#ifndef CONFIG_ADVISE_SYSCALLS +static int damos_madvise(struct damon_target *target, struct damon_region *r, + int behavior) +{ + return -EINVAL; +} +#else +static int damos_madvise(struct damon_target *target, struct damon_region *r, + int behavior) +{ + struct mm_struct *mm; + int ret = -ENOMEM; + + mm = damon_get_mm(target); + if (!mm) + goto out; + + ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), + PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + mmput(mm); +out: + return ret; +} +#endif /* CONFIG_ADVISE_SYSCALLS */ + +int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + int madv_action; + + switch (scheme->action) { + case DAMOS_WILLNEED: + madv_action = MADV_WILLNEED; + break; + case DAMOS_COLD: + madv_action = MADV_COLD; + break; + case DAMOS_PAGEOUT: + madv_action = MADV_PAGEOUT; + break; + case DAMOS_HUGEPAGE: + madv_action = MADV_HUGEPAGE; + break; + case DAMOS_NOHUGEPAGE: + madv_action = MADV_NOHUGEPAGE; + break; + default: + pr_warn("Wrong action %d\n", scheme->action); + return -EINVAL; + } + + return damos_madvise(t, r, madv_action); +} + void damon_va_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = damon_va_init; @@ -667,6 +722,7 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.reset_aggregated = NULL; ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = damon_va_apply_scheme; } #include "vaddr-test.h" From a95d4e182ef38ce706718428ceb4a59c740b8aeb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:29 -0700 Subject: [PATCH 291/737] mm/damon/dbgfs: support DAMON-based Operation Schemes This makes 'damon-dbgfs' to support the data access monitoring oriented memory management schemes. Users can read and update the schemes using ``/damon/schemes`` file. 
The format is:: Link: https://lkml.kernel.org/r/20211001125604.29660-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index faee070977d80..78b7a04490c57 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -98,6 +98,159 @@ static ssize_t dbgfs_attrs_write(struct file *file, return ret; } +static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damos *s; + int written = 0; + int rc; + + damon_for_each_scheme(s, c) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %u %u %u %u %d\n", + s->min_sz_region, s->max_sz_region, + s->min_nr_accesses, s->max_nr_accesses, + s->min_age_region, s->max_age_region, + s->action); + if (!rc) + return -ENOMEM; + + written += rc; + } + return written; +} + +static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_schemes(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) +{ + ssize_t i; + + for (i = 0; i < nr_schemes; i++) + kfree(schemes[i]); + kfree(schemes); +} + +static bool damos_action_valid(int action) +{ + switch (action) { + case DAMOS_WILLNEED: + case DAMOS_COLD: + case DAMOS_PAGEOUT: + case DAMOS_HUGEPAGE: + case DAMOS_NOHUGEPAGE: + return true; + default: + return false; + } +} + +/* + * Converts a string into an array of struct damos pointers + * + * Returns an array of struct damos pointers that converted if the conversion + * success, or NULL otherwise. 
+ */ +static struct damos **str_to_schemes(const char *str, ssize_t len, + ssize_t *nr_schemes) +{ + struct damos *scheme, **schemes; + const int max_nr_schemes = 256; + int pos = 0, parsed, ret; + unsigned long min_sz, max_sz; + unsigned int min_nr_a, max_nr_a, min_age, max_age; + unsigned int action; + + schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), + GFP_KERNEL); + if (!schemes) + return NULL; + + *nr_schemes = 0; + while (pos < len && *nr_schemes < max_nr_schemes) { + ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u%n", + &min_sz, &max_sz, &min_nr_a, &max_nr_a, + &min_age, &max_age, &action, &parsed); + if (ret != 7) + break; + if (!damos_action_valid(action)) { + pr_err("wrong action %d\n", action); + goto fail; + } + + pos += parsed; + scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, + min_age, max_age, action); + if (!scheme) + goto fail; + + schemes[*nr_schemes] = scheme; + *nr_schemes += 1; + } + return schemes; +fail: + free_schemes_arr(schemes, *nr_schemes); + return NULL; +} + +static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + struct damos **schemes; + ssize_t nr_schemes = 0, ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + schemes = str_to_schemes(kbuf, ret, &nr_schemes); + if (!schemes) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_schemes(ctx, schemes, nr_schemes); + if (err) + ret = err; + else + nr_schemes = 0; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + free_schemes_arr(schemes, nr_schemes); +out: + kfree(kbuf); + return ret; +} + static inline bool targetid_is_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid; @@ -279,6 +432,12 @@ static const struct file_operations attrs_fops = { .write = dbgfs_attrs_write, }; +static const struct file_operations schemes_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_schemes_read, + .write = dbgfs_schemes_write, +}; + static const struct file_operations target_ids_fops = { .open = damon_dbgfs_open, .read = dbgfs_target_ids_read, @@ -292,10 +451,10 @@ static const struct file_operations kdamond_pid_fops = { static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids", + const char * const file_names[] = {"attrs", "schemes", "target_ids", "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, - &kdamond_pid_fops}; + const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, + &target_ids_fops, &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) From 25e8e77c5eae07d2bd9543034ecf9412672e7c76 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:32 -0700 Subject: [PATCH 292/737] mm/damon/schemes: implement statistics feature To tune the DAMON-based operation schemes, knowing how many and how large regions are affected by each of the schemes will be helful. Those stats could be used for not only the tuning, but also monitoring of the working set size and the number of regions, if the scheme does not change the program behavior too much. For the reason, this implements the statistics for the schemes. 
The total number and size of the regions that each scheme is applied are exported to users via '->stat_count' and '->stat_sz' of 'struct damos'. Admins can also check the number by reading 'schemes' debugfs file. The last two integers now represents the stats. To allow collecting the stats without changing the program behavior, this also adds new scheme action, 'DAMOS_STAT'. Note that 'DAMOS_STAT' is not only making no memory operation actions, but also does not reset the age of regions. Link: https://lkml.kernel.org/r/20211001125604.29660-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 +++++++++- mm/damon/core.c | 7 ++++++- mm/damon/dbgfs.c | 5 +++-- mm/damon/vaddr.c | 2 ++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index be6b6e81e8ee1..f301bb53381c1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -78,6 +78,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_STAT: Do nothing but count the stat. */ enum damos_action { DAMOS_WILLNEED, @@ -85,6 +86,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_STAT, /* Do nothing but only record the stat */ }; /** @@ -96,9 +98,13 @@ enum damos_action { * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. + * @stat_count: Total number of regions that this scheme is applied. + * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. * - * Note that both the minimums and the maximums are inclusive. + * For each aggregation interval, DAMON applies @action to monitoring target + * regions fit in the condition and updates the statistics. Note that both + * the minimums and the maximums are inclusive. 
*/ struct damos { unsigned long min_sz_region; @@ -108,6 +114,8 @@ struct damos { unsigned int min_age_region; unsigned int max_age_region; enum damos_action action; + unsigned long stat_count; + unsigned long stat_sz; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 0ed97b21cbb6e..2f6785737902d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -103,6 +103,8 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; + scheme->stat_count = 0; + scheme->stat_sz = 0; INIT_LIST_HEAD(&scheme->list); return scheme; @@ -544,9 +546,12 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; if (r->age < s->min_age_region || s->max_age_region < r->age) continue; + s->stat_count++; + s->stat_sz += sz; if (c->primitive.apply_scheme) c->primitive.apply_scheme(c, t, r, s); - r->age = 0; + if (s->action != DAMOS_STAT) + r->age = 0; } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 78b7a04490c57..28d6abf277636 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -106,11 +106,11 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d\n", + "%lu %lu %u %u %u %u %d %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action); + s->action, s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -159,6 +159,7 @@ static bool damos_action_valid(int action) case DAMOS_PAGEOUT: case DAMOS_HUGEPAGE: case DAMOS_NOHUGEPAGE: + case DAMOS_STAT: return true; default: return false; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3e1c74d36bab7..953c145b4f08a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -705,6 +705,8 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_STAT: + return 0; default: pr_warn("Wrong action %d\n", scheme->action); return -EINVAL; From e05aa659e4ddddada00773c9708cc46d95c051fc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:36 -0700 Subject: [PATCH 293/737] selftests/damon: add 'schemes' debugfs tests This adds simple selftets for 'schemes' debugfs file of DAMON. 
Link: https://lkml.kernel.org/r/20211001125604.29660-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index bfabb19dc0d3d..639cfb6a1f651 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -57,6 +57,19 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" echo "$orig_content" > "$file" +# Test schemes file +# ================= + +file="$DBGFS/schemes" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5 6 4" \ + "$orig_content" "valid input" +test_write_fail "$file" "1 2 +3 4 5 6 3" "$orig_content" "multi lines" +test_write_succ "$file" "" "$orig_content" "disabling" +echo "$orig_content" > "$file" + # Test target_ids file # ==================== From ddc2dc7adbd23de88819ddaefce2f035bc91b1bd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:39 -0700 Subject: [PATCH 294/737] Docs/admin-guide/mm/damon: document DAMON-based Operation Schemes This adds the description of DAMON-based operation schemes in the DAMON documents. Link: https://lkml.kernel.org/r/20211001125604.29660-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 11 +++++ Documentation/admin-guide/mm/damon/usage.rst | 51 +++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index d5eb89a8fc386..51503cf90ca29 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -108,6 +108,17 @@ the results as separate image files. :: You can view the visualizations of this example workload at [1]_. Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. + +Data Access Pattern Aware Memory Management +=========================================== + +Below three commands make every memory region of size >=4K that doesn't +accessed for >=60 seconds in your workload to be swapped out. :: + + $ echo "#min-size max-size min-acc max-acc min-age max-age action" > scheme + $ echo "4K max 0 0 60s max pageout" >> scheme + $ damo schemes -c my_thp_scheme + .. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns .. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html .. 
[3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index a72cda374abac..c0296c14babff 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -34,8 +34,8 @@ the reason, this document describes only the debugfs interface debugfs Interface ================= -DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under -its debugfs directory, ``/damon/``. +DAMON exports four files, ``attrs``, ``target_ids``, ``schemes`` and +``monitor_on`` under its debugfs directory, ``/damon/``. Attributes @@ -74,6 +74,53 @@ check it again:: Note that setting the target ids doesn't start the monitoring. +Schemes +------- + +For usual DAMON-based data access aware memory management optimizations, users +would simply want the system to apply a memory management action to a memory +region of a specific size having a specific access frequency for a specific +time. DAMON receives such formalized operation schemes from the user and +applies those to the target processes. It also counts the total number and +size of regions that each scheme is applied. This statistics can be used for +online analysis or tuning of the schemes. + +Users can get and set the schemes by reading from and writing to ``schemes`` +debugfs file. Reading the file also shows the statistics of each scheme. To +the file, each of the schemes should be represented in each line in below form: + + min-size max-size min-acc max-acc min-age max-age action + +Note that the ranges are closed interval. Bytes for the size of regions +(``min-size`` and ``max-size``), number of monitored accesses per aggregate +interval for access frequency (``min-acc`` and ``max-acc``), number of +aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a +predefined integer for memory management actions should be used. The supported +numbers and their meanings are as below. + + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` + - 1: Call ``madvise()`` for the region with ``MADV_COLD`` + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 5: Do nothing but count the statistics + +You can disable schemes by simply writing an empty string to the file. For +example, below commands applies a scheme saying "If a memory region of size in +[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region", check the entered scheme again, and +finally remove the scheme. :: + + # cd /damon + # echo "4096 8192 0 5 10 20 2" > schemes + # cat schemes + 4096 8192 0 5 10 20 2 0 0 + # echo > schemes + +The last two integers in the 4th line of above example is the total number and +the total size of the regions that the scheme is applied. + + Turning On/Off -------------- From 36cec29f25a3cb7f35a43cb6cc611edf66837200 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:42 -0700 Subject: [PATCH 295/737] mm/damon/dbgfs: allow users to set initial monitoring target regions Patch series "DAMON: Support Physical Memory Address Space Monitoring:. DAMON currently supports only virtual address spaces monitoring. 
It can be easily extended for various use cases and address spaces by configuring its monitoring primitives layer to use appropriate primitives implementations, though. This patchset implements monitoring primitives for the physical address space monitoring using the structure. The first 3 patches allow the user space users manually set the monitoring regions. The 1st patch implements the feature in the 'damon-dbgfs'. Then, patches for adding a unit tests (the 2nd patch) and updating the documentation (the 3rd patch) follow. Following 4 patches implement the physical address space monitoring primitives. The 4th patch makes some primitive functions for the virtual address spaces primitives reusable. The 5th patch implements the physical address space monitoring primitives. The 6th patch links the primitives to the 'damon-dbgfs'. Finally, 7th patch documents this new features. This patch (of 7): Some 'damon-dbgfs' users would want to monitor only a part of the entire virtual memory address space. The program interface users in the kernel space could use '->before_start()' callback or set the regions inside the context struct as they want, but 'damon-dbgfs' users cannot. For that reason, this introduces a new debugfs file called 'init_region'. 'damon-dbgfs' users can specify which initial monitoring target address regions they want by writing special input to the file. The input should describe each region in each line in the below form: Note that the regions will be updated to cover entire memory mapped regions after a 'regions update interval' is passed. If you want the regions to not be updated after the initial setting, you could set the interval as a very long time, say, a few decades. Link: https://lkml.kernel.org/r/20211012205711.29216-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211012205711.29216-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rienjes Cc: Shakeel Butt Cc: Shuah Khan Cc: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 28d6abf277636..1cce53cd241d0 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -394,6 +394,152 @@ static ssize_t dbgfs_target_ids_write(struct file *file, return ret; } +static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r; + int written = 0; + int rc; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %lu\n", + t->id, r->ar.start, r->ar.end); + if (!rc) + return -ENOMEM; + written += rc; + } + } + return written; +} + +static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + mutex_unlock(&ctx->kdamond_lock); + len = -EBUSY; + goto out; + } + + len = sprint_init_regions(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + 
return len; +} + +static int add_init_region(struct damon_ctx *c, + unsigned long target_id, struct damon_addr_range *ar) +{ + struct damon_target *t; + struct damon_region *r, *prev; + unsigned long id; + int rc = -EINVAL; + + if (ar->start >= ar->end) + return -EINVAL; + + damon_for_each_target(t, c) { + id = t->id; + if (targetid_is_pid(c)) + id = (unsigned long)pid_vnr((struct pid *)id); + if (id == target_id) { + r = damon_new_region(ar->start, ar->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + rc = 0; + } + } + return rc; +} + +static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r, *next; + int pos = 0, parsed, ret; + unsigned long target_id; + struct damon_addr_range ar; + int err; + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + while (pos < len) { + ret = sscanf(&str[pos], "%lu %lu %lu%n", + &target_id, &ar.start, &ar.end, &parsed); + if (ret != 3) + break; + err = add_init_region(c, target_id, &ar); + if (err) + goto fail; + pos += parsed; + } + + return 0; + +fail: + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + return err; +} + +static ssize_t dbgfs_init_regions_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = set_init_regions(ctx, kbuf, ret); + if (err) + ret = err; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(kbuf); + return ret; +} + static ssize_t dbgfs_kdamond_pid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -445,6 +591,12 @@ static const struct file_operations target_ids_fops = { .write = dbgfs_target_ids_write, }; +static const struct file_operations init_regions_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_init_regions_read, + .write = dbgfs_init_regions_write, +}; + static const struct file_operations kdamond_pid_fops = { .open = damon_dbgfs_open, .read = dbgfs_kdamond_pid_read, @@ -453,9 +605,9 @@ static const struct file_operations kdamond_pid_fops = { static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { const char * const file_names[] = {"attrs", "schemes", "target_ids", - "kdamond_pid"}; + "init_regions", "kdamond_pid"}; const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, - &target_ids_fops, &kdamond_pid_fops}; + &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) From efc621b612b7736eb6ab822ac538a2681910ecd3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:46 -0700 Subject: [PATCH 296/737] mm/damon/dbgfs-test: add a unit test case for 'init_regions' This adds another test case for the new feature, 'init_regions'. 
Link: https://lkml.kernel.org/r/20211012205711.29216-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs-test.h | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 4eddcfa73996f..104b22957616b 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) dbgfs_destroy_ctx(ctx); } +static void damon_dbgfs_test_set_init_regions(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", + "2 10 20\n", + "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + ""}; + /* Reading the file again will show sorted, clean output */ + char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", + "2 10 20\n", + "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + ""}; + char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ + "2 10 20\n 2 14 26\n", /* regions overlap */ + "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char *input, *expect; + int i, rc; + char buf[256]; + + damon_set_targets(ctx, ids, 3); + + /* Put valid inputs and check the results */ + for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { + input = valid_inputs[i]; + expect = valid_expects[i]; + + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, 0); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, expect); + } + /* Put invlid inputs and check the return error code */ + for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { + input = invalid_inputs[i]; + pr_info("input: %s\n", input); + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, -EINVAL); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, ""); + } + + damon_set_targets(ctx, NULL, 0); + damon_destroy_ctx(ctx); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), KUNIT_CASE(damon_dbgfs_test_set_targets), + KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, }; From c4cfcdd966d18828ea12c35d3bc3ff9f107a3d7c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:49 -0700 Subject: [PATCH 297/737] Docs/admin-guide/mm/damon: document 'init_regions' feature This adds description of the 'init_regions' feature in the DAMON usage document. 
Link: https://lkml.kernel.org/r/20211012205711.29216-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 41 +++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index c0296c14babff..f7d5cfbb50c2d 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -34,8 +34,9 @@ the reason, this document describes only the debugfs interface debugfs Interface ================= -DAMON exports four files, ``attrs``, ``target_ids``, ``schemes`` and -``monitor_on`` under its debugfs directory, ``/damon/``. +DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes`` and ``monitor_on`` under its debugfs directory, +``/damon/``. Attributes @@ -74,6 +75,42 @@ check it again:: Note that setting the target ids doesn't start the monitoring. +Initial Monitoring Target Regions +--------------------------------- + +In case of the debugfs based monitoring, DAMON automatically sets and updates +the monitoring target regions so that entire memory mappings of target +processes can be covered. However, users can want to limit the monitoring +region to specific address ranges, such as the heap, the stack, or specific +file-mapped area. Or, some users can know the initial access pattern of their +workloads and therefore want to set optimal initial regions for the 'adaptive +regions adjustment'. + +In such cases, users can explicitly set the initial monitoring target regions +as they want, by writing proper values to the ``init_regions`` file. Each line +of the input should represent one region in below form.:: + + + +The ``target id`` should already in ``target_ids`` file, and the regions should +be passed in address order. For example, below commands will set a couple of +address ranges, ``1-100`` and ``100-200`` as the initial monitoring target +region of process 42, and another couple of address ranges, ``20-40`` and +``50-100`` as that of process 4242.:: + + # cd /damon + # echo "42 1 100 + 42 100 200 + 4242 20 40 + 4242 50 100" > init_regions + +Note that this sets the initial monitoring target regions only. In case of +virtual memory monitoring, DAMON will automatically updates the boundary of the +regions after one ``regions update interval``. Therefore, users should set the +``regions update interval`` large enough in this case, if they don't want the +update. + + Schemes ------- From fe441256ba75a4d49c0aa5b2b4554c3c4a1b5df1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:53 -0700 Subject: [PATCH 298/737] mm/damon/vaddr: separate commonly usable functions This moves functions in the default virtual address spaces monitoring primitives that commonly usable from other address spaces like physical address space into a header file. Those will be reused by the physical address space monitoring primitives which will be implemented by the following commit. 
[sj@kernel.org: include 'highmem.h' to fix a build failure] Link: https://lkml.kernel.org/r/20211014110848.5204-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211012205711.29216-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Makefile | 2 +- mm/damon/prmtv-common.c | 87 +++++++++++++++++++++++++++++++++++++++++ mm/damon/prmtv-common.h | 17 ++++++++ mm/damon/vaddr.c | 87 ++--------------------------------------- 4 files changed, 108 insertions(+), 85 deletions(-) create mode 100644 mm/damon/prmtv-common.c create mode 100644 mm/damon/prmtv-common.h diff --git a/mm/damon/Makefile b/mm/damon/Makefile index fed4be3bace3e..99b1bfe01ff51 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c new file mode 100644 index 0000000000000..7e62ee54fb543 --- /dev/null +++ b/mm/damon/prmtv-common.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include +#include +#include +#include + +#include "prmtv-common.h" + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. 
+ */ +struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h new file mode 100644 index 0000000000000..7093d19e5d428 --- /dev/null +++ b/mm/damon/prmtv-common.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include +#include + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +struct page *damon_get_page(unsigned long pfn); + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 953c145b4f08a..cdeca92a34dcc 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -8,25 +8,20 @@ #define pr_fmt(fmt) "damon-va: " fmt #include -#include +#include #include -#include #include -#include #include #include -#include #include -#include + +#include "prmtv-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - /* * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. @@ -373,82 +368,6 @@ void damon_va_update(struct damon_ctx *ctx) } } -/* - * Get an online page for a pfn if it's in the LRU list. Otherwise, returns - * NULL. - * - * The body of this function is stolen from the 'page_idle_get_page()'. We - * steal rather than reuse it because the code is quite simple. 
- */ -static struct page *damon_get_page(unsigned long pfn) -{ - struct page *page = pfn_to_online_page(pfn); - - if (!page || !PageLRU(page) || !get_page_unless_zero(page)) - return NULL; - - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; - } - return page; -} - -static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, - unsigned long addr) -{ - bool referenced = false; - struct page *page = damon_get_page(pte_pfn(*pte)); - - if (!page) - return; - - if (pte_young(*pte)) { - referenced = true; - *pte = pte_mkold(*pte); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -} - -static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, - unsigned long addr) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; - struct page *page = damon_get_page(pmd_pfn(*pmd)); - - if (!page) - return; - - if (pmd_young(*pmd)) { - referenced = true; - *pmd = pmd_mkold(*pmd); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, - addr + ((1UL) << HPAGE_PMD_SHIFT))) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -} - static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { From 09ee17c1ad455672368bfb4fc0b9ef71b24f42a5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:56 -0700 Subject: [PATCH 299/737] mm/damon: implement primitives for physical address space monitoring This implements the monitoring primitives for the physical memory address space. Internally, it uses the PTE Accessed bit, similar to that of the virtual address spaces monitoring primitives. It supports only user memory pages, as idle pages tracking does. If the monitoring target physical memory address range contains non-user memory pages, access check of the pages will do nothing but simply treat the pages as not accessed. 
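For reference, a kernel space caller could plug the new primitives in roughly as sketched below. This is an illustrative sketch only, not code from the patch: it assumes CONFIG_DAMON_PADDR is enabled, the physical address range is a placeholder chosen by the caller (the paddr primitives set ->init and ->update to NULL, so no target regions are created automatically), the target id 42 is arbitrary because these primitives do not interpret it, and error unwinding is omitted.

    #include <linux/damon.h>
    #include <linux/errno.h>

    /* Sketch: monitor one caller-chosen physical address range. */
    static int damon_pa_example(unsigned long start, unsigned long end)
    {
            struct damon_ctx *ctx;
            struct damon_target *t;
            struct damon_region *r;

            ctx = damon_new_ctx();
            if (!ctx)
                    return -ENOMEM;
            damon_pa_set_primitives(ctx);

            t = damon_new_target(42);       /* id is not used by paddr */
            if (!t)
                    return -ENOMEM;
            damon_add_target(ctx, t);

            /* no automatic region setting for paddr; add one by hand */
            r = damon_new_region(start, end);
            if (!r)
                    return -ENOMEM;
            damon_add_region(r, t);

            return damon_start(&ctx, 1);
    }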
Link: https://lkml.kernel.org/r/20211012205711.29216-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ++ mm/damon/Kconfig | 8 ++ mm/damon/Makefile | 1 + mm/damon/paddr.c | 224 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 243 insertions(+) create mode 100644 mm/damon/paddr.c diff --git a/include/linux/damon.h b/include/linux/damon.h index f301bb53381c1..715dadd21f7cd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -351,4 +351,14 @@ void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ +#ifdef CONFIG_DAMON_PADDR + +/* Monitoring primitives for the physical memory address space */ +void damon_pa_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); +bool damon_pa_target_valid(void *t); +void damon_pa_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_PADDR */ + #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index ba8898c7eb8eb..2a5923be631e2 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -32,6 +32,14 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that work for virtual address spaces. +config DAMON_PADDR + bool "Data access monitoring primitives for the physical address space" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for the physical address space. 
+ config DAMON_VADDR_KUNIT_TEST bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS depends on DAMON_VADDR && KUNIT=y diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 99b1bfe01ff51..8d9b0df797029 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -2,4 +2,5 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c new file mode 100644 index 0000000000000..d7a2ecd09ed02 --- /dev/null +++ b/mm/damon/paddr.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for The Physical Address Space + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-pa: " fmt + +#include +#include +#include +#include + +#include "prmtv-common.h" + +static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr); + } + return true; +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct rmap_walk_control rwc = { + .rmap_one = __damon_pa_mkold, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return; + + if (!page_mapped(page) || !page_rmapping(page)) { + set_page_idle(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + goto out; + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + +out: + put_page(page); +} + +static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, + struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_pa_mkold(r->sampling_addr); +} + +void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + __damon_pa_prepare_access_check(ctx, r); + } +} + +struct damon_pa_access_chk_result { + unsigned long page_sz; + bool accessed; +}; + +static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct damon_pa_access_chk_result *result = arg; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + result->accessed = false; + result->page_sz = PAGE_SIZE; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + result->accessed = pte_young(*pvmw.pte) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + result->accessed = pmd_young(*pvmw.pmd) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (result->accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return !result->accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct damon_pa_access_chk_result result = { + .page_sz = PAGE_SIZE, + .accessed = false, + }; + struct rmap_walk_control rwc = { + .arg = 
&result, + .rmap_one = __damon_pa_young, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return false; + + if (!page_mapped(page) || !page_rmapping(page)) { + if (page_is_idle(page)) + result.accessed = false; + else + result.accessed = true; + put_page(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) { + put_page(page); + return NULL; + } + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + put_page(page); + +out: + *page_sz = result.page_sz; + return result.accessed; +} + +static void __damon_pa_check_access(struct damon_ctx *ctx, + struct damon_region *r) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + __damon_pa_check_access(ctx, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + } + + return max_nr_accesses; +} + +bool damon_pa_target_valid(void *t) +{ + return true; +} + +void damon_pa_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = NULL; + ctx->primitive.update = NULL; + ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks; + ctx->primitive.check_accesses = damon_pa_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_pa_target_valid; + ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = NULL; +} From 6a6e511e5ca76b3ab569c7cd95f11bc2841de5c8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:00 -0700 Subject: [PATCH 300/737] mm/damon/dbgfs: support physical memory monitoring This makes the 'damon-dbgfs' to support the physical memory monitoring, in addition to the virtual memory monitoring. Users can do the physical memory monitoring by writing a special keyword, 'paddr' to the 'target_ids' debugfs file. Then, DAMON will check the special keyword and configure the monitoring context to run with the primitives for the physical address space. Unlike the virtual memory monitoring, the monitoring target region will not be automatically set. Therefore, users should also set the monitoring target address region using the 'init_regions' debugfs file. Also, note that the physical memory monitoring will not automatically terminated. The user should explicitly turn off the monitoring by writing 'off' to the 'monitor_on' debugfs file. 
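For illustration only (not part of this patch), a minimal user-space sketch of the sequence described above. The debugfs mount point, the "<target id> <start> <end>" init_regions line format, and the 4 GiB - 16 GiB range are assumptions for this sketch:

    #include <stdio.h>
    #include <stdlib.h>

    static void write_file(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    exit(1);
            }
            fputs(val, f);
            if (fclose(f))
                    perror(path);
    }

    int main(void)
    {
            /* Switch the context to the physical address space primitives. */
            write_file("/sys/kernel/debug/damon/target_ids", "paddr\n");
            /*
             * paddr has no automatic region construction, so set one region
             * explicitly (42 is the fake target id shown by the file).
             */
            write_file("/sys/kernel/debug/damon/init_regions",
                       "42 4294967296 17179869184\n");
            /* Start monitoring; write "off" later to stop it explicitly. */
            write_file("/sys/kernel/debug/damon/monitor_on", "on\n");
            return 0;
    }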
Link: https://lkml.kernel.org/r/20211012205711.29216-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 2 +- mm/damon/dbgfs.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 2a5923be631e2..ca33b289ebbe4 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -54,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST config DAMON_DBGFS bool "DAMON debugfs interface" - depends on DAMON_VADDR && DEBUG_FS + depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help This builds the debugfs interface for DAMON. The user space admins can use the interface for arbitrary data access monitoring. diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1cce53cd241d0..38188347d8abb 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -339,6 +339,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; + bool id_is_pid = true; char *kbuf, *nrs; unsigned long *targets; ssize_t nr_targets; @@ -351,6 +352,11 @@ static ssize_t dbgfs_target_ids_write(struct file *file, return PTR_ERR(kbuf); nrs = kbuf; + if (!strncmp(kbuf, "paddr\n", count)) { + id_is_pid = false; + /* target id is meaningless here, but we set it just for fun */ + scnprintf(kbuf, count, "42 "); + } targets = str_to_target_ids(nrs, ret, &nr_targets); if (!targets) { @@ -358,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, goto out; } - if (targetid_is_pid(ctx)) { + if (id_is_pid) { for (i = 0; i < nr_targets; i++) { targets[i] = (unsigned long)find_get_pid( (int)targets[i]); @@ -372,15 +378,24 @@ static ssize_t dbgfs_target_ids_write(struct file *file, mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { - if (targetid_is_pid(ctx)) + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); ret = -EBUSY; goto unlock_out; } + /* remove targets with previously-set primitive */ + damon_set_targets(ctx, NULL, 0); + + /* Configure the context for the address space type */ + if (id_is_pid) + damon_va_set_primitives(ctx); + else + damon_pa_set_primitives(ctx); + err = damon_set_targets(ctx, targets, nr_targets); if (err) { - if (targetid_is_pid(ctx)) + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); ret = err; } From f1665492c9bf0fd780bde1404f48699b1a725498 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:03 -0700 Subject: [PATCH 301/737] Docs/DAMON: document physical memory monitoring support This updates the DAMON documents for the physical memory address space monitoring support. 
Link: https://lkml.kernel.org/r/20211012205711.29216-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 25 +++++++++++++---- Documentation/vm/damon/design.rst | 29 ++++++++++++-------- Documentation/vm/damon/faq.rst | 5 ++-- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index f7d5cfbb50c2d..ed96bbf0daffc 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -10,15 +10,16 @@ DAMON provides below three interfaces for different users. This is for privileged people such as system administrators who want a just-working human-friendly interface. Using this, users can use the DAMON’s major features in a human-friendly way. It may not be highly tuned for - special cases, though. It supports only virtual address spaces monitoring. + special cases, though. It supports both virtual and physical address spaces + monitoring. - *debugfs interface.* This is for privileged user space programmers who want more optimized use of DAMON. Using this, users can use DAMON’s major features by reading from and writing to special debugfs files. Therefore, you can write and use your personalized DAMON debugfs wrapper programs that reads/writes the debugfs files instead of you. The DAMON user space tool is also a reference - implementation of such programs. It supports only virtual address spaces - monitoring. + implementation of such programs. It supports both virtual and physical + address spaces monitoring. - *Kernel Space Programming Interface.* This is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space @@ -72,20 +73,34 @@ check it again:: # cat target_ids 42 4242 +Users can also monitor the physical memory address space of the system by +writing a special keyword, "``paddr\n``" to the file. Because physical address +space monitoring doesn't support multiple targets, reading the file will show a +fake value, ``42``, as below:: + + # cd /damon + # echo paddr > target_ids + # cat target_ids + 42 + Note that setting the target ids doesn't start the monitoring. Initial Monitoring Target Regions --------------------------------- -In case of the debugfs based monitoring, DAMON automatically sets and updates -the monitoring target regions so that entire memory mappings of target +In case of the virtual address space monitoring, DAMON automatically sets and +updates the monitoring target regions so that entire memory mappings of target processes can be covered. However, users can want to limit the monitoring region to specific address ranges, such as the heap, the stack, or specific file-mapped area. Or, some users can know the initial access pattern of their workloads and therefore want to set optimal initial regions for the 'adaptive regions adjustment'. +In contrast, DAMON do not automatically sets and updates the monitoring target +regions in case of physical memory monitoring. Therefore, users should set the +monitoring target regions by themselves. 
+ In such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the ``init_regions`` file. Each line of the input should represent one region in below form.:: diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst index b05159c295f4d..210f0f50efd81 100644 --- a/Documentation/vm/damon/design.rst +++ b/Documentation/vm/damon/design.rst @@ -35,13 +35,17 @@ two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. -DAMON currently provides the implementation of the primitives for only the -virtual address spaces. Below two subsections describe how it works. +DAMON currently provides the implementations of the primitives for the physical +and virtual address spaces. Below two subsections describe how those work. VMA-based Target Address Range Construction ------------------------------------------- +This is only for the virtual address space primitives implementation. That for +the physical address space simply asks users to manually set the monitoring +target address ranges. + Only small parts in the super-huge virtual address space of the processes are mapped to the physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some @@ -71,15 +75,18 @@ to make a reasonable trade-off. Below shows this in detail:: PTE Accessed-bit Based Access Check ----------------------------------- -The implementation for the virtual address space uses PTE Accessed-bit for -basic access checks. It finds the relevant PTE Accessed bit from the address -by walking the page table for the target task of the address. In this way, the -implementation finds and clears the bit for next sampling target address and -checks whether the bit set again after one sampling period. This could disturb -other kernel subsystems using the Accessed bits, namely Idle page tracking and -the reclaim logic. To avoid such disturbances, DAMON makes it mutually -exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page -flags to solve the conflict with the reclaim logic, as Idle page tracking does. +Both of the implementations for physical and virtual address spaces use PTE +Accessed-bit for basic access checks. Only one difference is the way of +finding the relevant PTE Accessed bit(s) from the address. While the +implementation for the virtual address walks the page table for the target task +of the address, the implementation for the physical address walks every page +table having a mapping to the address. In this way, the implementations find +and clear the bit(s) for next sampling target address and checks whether the +bit(s) set again after one sampling period. This could disturb other kernel +subsystems using the Accessed bits, namely Idle page tracking and the reclaim +logic. To avoid such disturbances, DAMON makes it mutually exclusive with Idle +page tracking and uses ``PG_idle`` and ``PG_young`` page flags to solve the +conflict with the reclaim logic, as Idle page tracking does. Address Space Independent Core Mechanisms diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst index cb3d8b585a8b3..11aea40eb328c 100644 --- a/Documentation/vm/damon/faq.rst +++ b/Documentation/vm/damon/faq.rst @@ -36,10 +36,9 @@ constructions and actual access checks can be implemented and configured on the DAMON core by the users. 
In this way, DAMON users can monitor any address space with any access check technique. -Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based +Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based implementations of the address space dependent functions for the virtual memory -by default, for a reference and convenient use. In near future, we will -provide those for physical memory address space. +and the physical memory by default, for a reference and convenient use. Can I simply monitor page granularity? From 44c2eb6d004f12fe29a127b642c636f3484f1fe6 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 5 Nov 2021 13:47:07 -0700 Subject: [PATCH 302/737] mm/damon/vaddr: constify static mm_walk_ops The only usage of these structs is to pass their addresses to walk_page_range(), which takes a pointer to const mm_walk_ops as argument. Make them const to allow the compiler to put them in read-only memory. Link: https://lkml.kernel.org/r/20211014075042.17174-2-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Reviewed-by: SeongJae Park Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index cdeca92a34dcc..14768575f9066 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -395,7 +395,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } -static struct mm_walk_ops damon_mkold_ops = { +static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, }; @@ -491,7 +491,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } -static struct mm_walk_ops damon_young_ops = { +static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, }; From 391608bb59563cdeedb11efe5284fb529c046412 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Fri, 5 Nov 2021 13:47:09 -0700 Subject: [PATCH 303/737] mm/damon/dbgfs: remove unnecessary variables In some functions, it's unnecessary to declare 'err' and 'ret' variables at the same time. This patch mainly to simplify the issue of such declarations by reusing one variable. 
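To illustrate the pattern this cleanup applies, below is a minimal sketch with generic names (do_update() is a hypothetical helper standing in for damon_set_attrs() and friends, not code from the patch):

    #include <stddef.h>
    #include <sys/types.h>  /* ssize_t */

    /* Hypothetical helper; returns 0 on success or a negative error. */
    static int do_update(const char *buf, size_t count)
    {
            return (buf && count) ? 0 : -22 /* -EINVAL */;
    }

    /* Before: a separate 'err' next to 'ret' tracks one outcome twice. */
    static ssize_t example_write_old(const char *buf, size_t count)
    {
            ssize_t ret = count;
            int err;

            err = do_update(buf, count);
            if (err)
                    ret = err;
            return ret;
    }

    /* After: the single 'ret' carries either the error or the byte count. */
    static ssize_t example_write_new(const char *buf, size_t count)
    {
            ssize_t ret;

            ret = do_update(buf, count);
            if (!ret)
                    ret = count;
            return ret;
    }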
Link: https://lkml.kernel.org/r/20211014073014.35754-1-sj@kernel.org Signed-off-by: Rongwei Wang Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 66 +++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 38188347d8abb..c90988a20fa4f 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, struct damon_ctx *ctx = file->private_data; unsigned long s, a, r, minr, maxr; char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -88,9 +87,9 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - err = damon_set_attrs(ctx, s, a, r, minr, maxr); - if (err) - ret = err; + ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (!ret) + ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); out: @@ -220,14 +219,13 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, struct damon_ctx *ctx = file->private_data; char *kbuf; struct damos **schemes; - ssize_t nr_schemes = 0, ret = count; - int err; + ssize_t nr_schemes = 0, ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - schemes = str_to_schemes(kbuf, ret, &nr_schemes); + schemes = str_to_schemes(kbuf, count, &nr_schemes); if (!schemes) { ret = -EINVAL; goto out; @@ -239,11 +237,12 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - err = damon_set_schemes(ctx, schemes, nr_schemes); - if (err) - ret = err; - else + ret = damon_set_schemes(ctx, schemes, nr_schemes); + if (!ret) { + ret = count; nr_schemes = 0; + } + unlock_out: mutex_unlock(&ctx->kdamond_lock); free_schemes_arr(schemes, nr_schemes); @@ -343,9 +342,8 @@ static ssize_t dbgfs_target_ids_write(struct file *file, char *kbuf, *nrs; unsigned long *targets; ssize_t nr_targets; - ssize_t ret = count; + ssize_t ret; int i; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -358,7 +356,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, ret, &nr_targets); + targets = str_to_target_ids(nrs, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; @@ -393,11 +391,12 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - err = damon_set_targets(ctx, targets, nr_targets); - if (err) { + ret = damon_set_targets(ctx, targets, nr_targets); + if (ret) { if (id_is_pid) dbgfs_put_pids(targets, nr_targets); - ret = err; + } else { + ret = count; } unlock_out: @@ -715,8 +714,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file, { char *kbuf; char *ctx_name; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -734,9 +732,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_mk_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_mk_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -805,8 +803,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; char *ctx_name; kbuf = user_input_str(buf, count, ppos); @@ -825,9 +822,9 @@ static ssize_t 
dbgfs_rm_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_rm_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_rm_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -851,9 +848,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file, static ssize_t dbgfs_monitor_on_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - ssize_t ret = count; + ssize_t ret; char *kbuf; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -866,14 +862,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, } if (!strncmp(kbuf, "on", count)) - err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); else if (!strncmp(kbuf, "off", count)) - err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); else - err = -EINVAL; + ret = -EINVAL; - if (err) - ret = err; + if (!ret) + ret = count; kfree(kbuf); return ret; } From 7470534d4bf2c4f3ad4593a28f32338307eb56d7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:13 -0700 Subject: [PATCH 304/737] mm/damon/paddr: support the pageout scheme Introduction ============ This patchset 1) makes the engine for general data access pattern-oriented memory management (DAMOS) be more useful for production environments, and 2) implements a static kernel module for lightweight proactive reclamation using the engine. Proactive Reclamation --------------------- On general memory over-committed systems, proactively reclaiming cold pages helps saving memory and reducing latency spikes that incurred by the direct reclaim or the CPU consumption of kswapd, while incurring only minimal performance degradation[2]. A Free Pages Reporting[8] based memory over-commit virtualization system would be one more specific use case. In the system, the guest VMs reports their free memory to host, and the host reallocates the reported memory to other guests. As a result, the system's memory utilization can be maximized. However, the guests could be not so memory-frugal, because some kernel subsystems and user-space applications are designed to use as much memory as available. Then, guests would report only small amount of free memory to host, results in poor memory utilization. Running the proactive reclamation in such guests could help mitigating this problem. Google has also implemented this idea and using it in their data center. They further proposed upstreaming it in LSFMM'19, and "the general consensus was that, while this sort of proactive reclaim would be useful for a number of users, the cost of this particular solution was too high to consider merging it upstream"[3]. The cost mainly comes from the coldness tracking. Roughly speaking, the implementation periodically scans the 'Accessed' bit of each page. For the reason, the overhead linearly increases as the size of the memory and the scanning frequency grows. As a result, Google is known to dedicating one CPU for the work. That's a reasonable option to someone like Google, but it wouldn't be so to some others. DAMON and DAMOS: An engine for data access pattern-oriented memory management ----------------------------------------------------------------------------- DAMON[4] is a framework for general data access monitoring. Its adaptive monitoring overhead control feature minimizes its monitoring overhead. 
It also lets the upper-bound of the overhead be configurable by clients, regardless of the size of the monitoring target memory. While monitoring 70 GiB memory of a production system every 5 milliseconds, it consumes less than 1% of single CPU time. For this, it could sacrifice some of the quality of the monitoring results. Nevertheless, the lower-bound of the quality is configurable, and it uses a best-effort algorithm for better quality. Our test results[5] show the quality is practical enough. From the production system monitoring, we were able to find a 4 KiB region in the 70 GiB memory that shows the highest access frequency.

We normally don't monitor the data access pattern just for fun but to improve something like memory management. Proactive reclamation is one such usage. For such general cases, DAMON provides a feature called DAMON-based Operation Schemes (DAMOS)[6]. It makes DAMON an engine for general data access pattern oriented memory management. Using this, clients can ask DAMON to find memory regions of a specific data access pattern and apply some memory management action (e.g., page out, move to head of the LRU list, use huge page, ...). We call such a request a 'scheme'.

Proactive Reclamation on top of DAMON/DAMOS
-------------------------------------------

Therefore, by using DAMON for the cold pages detection, the proactive reclamation's monitoring overhead issue can be solved. Actually, we previously implemented a version of proactive reclamation using DAMOS and achieved noticeable improvements with our evaluation setup[5]. Nevertheless, it was more of a proof-of-concept than something for production use. It supports only virtual address spaces of processes, and requires additional tuning efforts for given workloads and hardware. For the tuning, we introduced a simple auto-tuning user space tool[8]. Google is also known to be using a similar ML-based approach for their fleets[2]. But making it just work with intuitive knobs in the kernel would be helpful for general users.

To this end, this patchset improves DAMOS to be ready for such production usages, and implements another version of the proactive reclamation, namely DAMON_RECLAIM, on top of it.

DAMOS Improvements: Aggressiveness Control, Prioritization, and Watermarks
--------------------------------------------------------------------------

First of all, the current version of DAMOS supports only virtual address spaces. This patchset makes it support the physical address space for the page out action.

The next major problem of the current version of DAMOS is the lack of aggressiveness control, which can result in arbitrary overhead. For example, if huge memory regions having the data access pattern of interest are found, applying the requested action to all of the regions could incur significant overhead. This can be controlled by tuning the target data access pattern with manual or automated approaches[2,7]. But some people would prefer the kernel to just work with only intuitive tuning or default values.

For such cases, this patchset implements a safeguard, namely the time/size quota. Using this, clients can specify up to how much time can be used for applying the action, and/or up to how much memory the action can be applied to, within a user-specified time duration.

A follow-up question is: to which memory regions should the action be applied within the limits? We implement a simple regions prioritization mechanism for each action and make DAMOS apply the action to higher priority regions first.
It also allows clients to tune the prioritization mechanism to use different weights for the size, access frequency, and age of memory regions. This means we could use not only LRU but also LFU or some fancy algorithms like CAR[9] with lightweight overhead.

Though DAMON is lightweight, some users might want to remove even the cold pages monitoring overhead when it is unnecessary. Currently, it has to be manually turned on and off by clients, but some clients would simply want to turn it on and off based on some metrics like the free memory ratio or memory fragmentation. For such cases, this patchset implements a watermarks-based automatic activation feature. It lets the clients configure the metric of their interest and three watermarks for that metric. If the metric is higher than the high watermark or lower than the low watermark, the scheme is deactivated. If the metric is lower than the mid watermark but higher than the low watermark, the scheme is activated.

DAMON-based Reclaim
-------------------

Using the improved version of DAMOS, this patchset implements a static kernel module called 'damon_reclaim'. It finds memory regions that haven't been accessed for a specific time duration and pages them out. Consuming too much CPU for the page-out operations, or doing the pageout too frequently, can be critical for systems that configure their swap devices with software-defined in-memory block devices like zram/zswap, or with devices limited in their total number of writes like SSDs, respectively. To avoid these problems, the time/size quotas can be configured. Under the quotas, it pages out the memory regions that have not been accessed for the longest time first.

Also, to remove the monitoring overhead in peaceful situations, and to fall back to the LRU-list based page granularity reclamation when it doesn't make progress, the three-watermarks based activation mechanism is used, with the free memory ratio as the watermark metric.

For convenient configuration, it provides several module parameters. Using these, sysadmins can enable/disable it, and tune its parameters including the coldness identification time threshold, the time/size quotas, and the three watermarks.

Evaluation
==========

In short, DAMON_RECLAIM with a 50ms/s time quota and regions prioritization, on a v5.15-rc5 Linux kernel with a ZRAM swap device, achieves 38.58% memory saving with only 1.94% runtime overhead. For this, DAMON_RECLAIM consumes only 4.97% of single CPU time.

Setup
-----

We evaluate DAMON_RECLAIM to show how each of the DAMOS improvements takes effect. For this, we measure DAMON_RECLAIM's CPU consumption, the entire system memory footprint, the total number of major page faults, and the runtime of 24 realistic workloads in the PARSEC3 and SPLASH-2X benchmark suites on my QEMU/KVM based virtual machine. The virtual machine runs on an i3.metal AWS instance, has 130 GiB memory, and runs a Linux kernel built on the latest -mm tree[1] plus this patchset. It also utilizes a 4 GiB ZRAM swap device. We repeat the measurement 5 times and use the averages.

[1] https://github.com/hnaz/linux-mm/tree/v5.15-rc5-mmots-2021-10-13-19-55

Detailed Results
----------------

The results are summarized in the table below. With a coldness identification threshold of 5 seconds, DAMON_RECLAIM without the time quota-based speed limit achieves 47.21% memory saving, but incurs a 4.59% runtime slowdown to the workloads on average. For this, DAMON_RECLAIM consumes about 11.28% of single CPU time.

Applying time quotas of 200ms/s, 50ms/s, and 10ms/s without the regions prioritization reduces the slowdown to 4.89%, 2.65%, and 1.5%, respectively.
A time quota of 200ms/s (20%) makes no real change compared to the version without the quota, because that version consumes only 11.28% CPU time anyway. DAMON_RECLAIM's CPU utilization is also similarly reduced: 11.24%, 5.51%, and 2.01% of single CPU time. That is, the overhead is proportional to the speed limit. Nevertheless, it also reduces the memory saving, because the reclamation becomes less aggressive. In detail, the three variants show 48.76%, 37.83%, and 7.85% memory saving, respectively.

Applying the regions prioritization (paging out, within the time quota, the regions that have not been accessed for the longest time first) further reduces the performance degradation. The runtime slowdowns and the total major page fault increases change from 4.89%/218,690% to 4.39%/166,136% (200ms/s), from 2.65%/111,886% to 1.94%/59,053% (50ms/s), and from 1.5%/34,973.40% to 2.08%/8,781.75% (10ms/s). The runtime under the 10ms/s time quota has increased with prioritization, but apparently that's within the margin of error.

    time quota  prioritization  memory_saving  cpu_util  slowdown  pgmajfaults overhead
    N           N               47.21%         11.28%    4.59%     194,802%
    200ms/s     N               48.76%         11.24%    4.89%     218,690%
    50ms/s      N               37.83%         5.51%     2.65%     111,886%
    10ms/s      N               7.85%          2.01%     1.5%      34,793.40%
    200ms/s     Y               50.08%         10.38%    4.39%     166,136%
    50ms/s      Y               38.58%         4.97%     1.94%     59,053%
    10ms/s      Y               3.63%          1.73%     2.08%     8,781.75%

Baseline and Complete Git Trees
===============================

The patches are based on the latest -mm tree (v5.15-rc5-mmots-2021-10-13-19-55). You can also clone the complete git tree from:

    $ git clone git://github.com/sjp38/linux -b damon_reclaim/patches/v1

The web is also available:
https://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git/tag/?h=damon_reclaim/patches/v1

Sequence Of Patches
===================

The first patch makes DAMOS support the physical address space for the page out action. The following five patches (patches 2-6) implement the time/size quotas. The next four patches (patches 7-10) implement the memory regions prioritization within the limit. Then, the three following patches (patches 11-13) implement the watermarks-based schemes activation. Finally, the last two patches (patches 14-15) implement and document the DAMON-based reclamation using the advanced DAMOS.

[1] https://www.kernel.org/doc/html/v5.15-rc1/vm/damon/index.html
[2] https://research.google/pubs/pub48551/
[3] https://lwn.net/Articles/787611/
[4] https://damonitor.github.io
[5] https://damonitor.github.io/doc/html/latest/vm/damon/eval.html
[6] https://lore.kernel.org/linux-mm/20211001125604.29660-1-sj@kernel.org/
[7] https://github.com/awslabs/damoos
[8] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
[9] https://www.usenix.org/conference/fast-04/car-clock-adaptive-replacement

This patch (of 15):

This makes the DAMON primitives for the physical address space support the pageout action for DAMON-based Operation Schemes. With this commit, users can hence easily implement system-level, data access-aware reclamation using DAMOS.
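As a rough sketch of what the new primitive enables for an in-kernel user (not code from this patchset; the target id, the 4 GiB - 16 GiB region, and the age threshold are placeholder assumptions, and error unwinding is trimmed for brevity):

    #include <linux/damon.h>
    #include <linux/errno.h>
    #include <linux/kernel.h>

    /* Hypothetical setup: page out physical regions idle for >= 60 intervals. */
    static int example_paddr_pageout_setup(void)
    {
            struct damon_ctx *ctx = damon_new_ctx();
            struct damon_target *t;
            struct damon_region *r;
            struct damos *scheme;

            if (!ctx)
                    return -ENOMEM;
            damon_pa_set_primitives(ctx);

            t = damon_new_target(42);   /* the id is meaningless for paddr */
            if (!t)
                    return -ENOMEM;
            damon_add_target(ctx, t);

            /* paddr has no automatic region construction; set one by hand. */
            r = damon_new_region(1UL << 32, 4UL << 32);  /* 4 GiB - 16 GiB */
            if (!r)
                    return -ENOMEM;
            damon_add_region(r, t);

            /* Any size, zero observed accesses, aged at least 60 intervals. */
            scheme = damon_new_scheme(PAGE_SIZE, ULONG_MAX, 0, 0, 60, UINT_MAX,
                                      DAMOS_PAGEOUT);
            if (!scheme)
                    return -ENOMEM;
            damon_add_scheme(ctx, scheme);

            return damon_start(&ctx, 1);
    }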
[sj@kernel.org: fix missing-prototype build warning] Link: https://lkml.kernel.org/r/20211025064220.13904-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211019150731.16699-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211019150731.16699-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rientjes Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/paddr.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 715dadd21f7cd..9a327bc787b52 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx); void damon_pa_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); +int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index d7a2ecd09ed02..957ada55de77b 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -11,7 +11,9 @@ #include #include #include +#include +#include "../internal.h" #include "prmtv-common.h" static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, @@ -211,6 +213,39 @@ bool damon_pa_target_valid(void *t) return true; } +int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + unsigned long addr; + LIST_HEAD(page_list); + + if (scheme->action != DAMOS_PAGEOUT) + return -EINVAL; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + if (PageUnevictable(page)) { + putback_lru_page(page); + } else { + list_add(&page->lru, &page_list); + put_page(page); + } + } + reclaim_pages(&page_list); + cond_resched(); + return 0; +} + void damon_pa_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = NULL; @@ -220,5 +255,5 @@ void damon_pa_set_primitives(struct damon_ctx *ctx) ctx->primitive.reset_aggregated = NULL; ctx->primitive.target_valid = damon_pa_target_valid; ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = NULL; + ctx->primitive.apply_scheme = damon_pa_apply_scheme; } From f6451217a110acaa4404ac12311cad91241dd411 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:16 -0700 Subject: [PATCH 305/737] mm/damon/schemes: implement size quota for schemes application speed control There could be arbitrarily large memory regions fulfilling the target data access pattern of a DAMON-based operation scheme. In the case, applying the action of the scheme could incur too high overhead. To provide an intuitive way for avoiding it, this implements a feature called size quota. If the quota is set, DAMON tries to apply the action only up to the given amount of memory regions within a given time window. 
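For illustration, a sketch of how an in-kernel caller might fill the new quota argument of damon_new_scheme() introduced below (the concrete numbers are arbitrary assumptions):

    #include <linux/damon.h>
    #include <linux/kernel.h>

    /* Sketch: page out cold regions, but at most 256 MiB per 1-second window. */
    static struct damos *example_pageout_scheme_with_quota(void)
    {
            struct damos_quota quota = {
                    .sz = 256 * 1024 * 1024,        /* size quota, in bytes */
                    .reset_interval = 1000,         /* charge window, in ms */
            };

            /* Any size, zero observed accesses, aged at least 60 intervals. */
            return damon_new_scheme(PAGE_SIZE, ULONG_MAX, 0, 0, 60, UINT_MAX,
                                    DAMOS_PAGEOUT, &quota);
    }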
Link: https://lkml.kernel.org/r/20211019150731.16699-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 36 +++++++++++++++++++++++--- mm/damon/core.c | 60 +++++++++++++++++++++++++++++++++++++------ mm/damon/dbgfs.c | 4 ++- 3 files changed, 87 insertions(+), 13 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 9a327bc787b52..3a1ce9d9921c8 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -89,6 +89,26 @@ enum damos_action { DAMOS_STAT, /* Do nothing but only record the stat */ }; +/** + * struct damos_quota - Controls the aggressiveness of the given scheme. + * @sz: Maximum bytes of memory that the action can be applied. + * @reset_interval: Charge reset interval in milliseconds. + * + * To avoid consuming too much CPU time or IO resources for applying the + * &struct damos->action to large memory, DAMON allows users to set a size + * quota. The quota can be set by writing non-zero values to &sz. If the size + * quota is set, DAMON tries to apply the action only up to &sz bytes within + * &reset_interval. + */ +struct damos_quota { + unsigned long sz; + unsigned long reset_interval; + +/* private: For charging the quota */ + unsigned long charged_sz; + unsigned long charged_from; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -98,13 +118,20 @@ enum damos_action { * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. + * @quota: Control the aggressiveness of this scheme. * @stat_count: Total number of regions that this scheme is applied. * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. * - * For each aggregation interval, DAMON applies @action to monitoring target - * regions fit in the condition and updates the statistics. Note that both - * the minimums and the maximums are inclusive. + * For each aggregation interval, DAMON finds regions which fit in the + * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, + * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to + * those. To avoid consuming too much CPU time or IO resources for the + * &action, "a is used. + * + * After applying the &action to each region, &stat_count and &stat_sz is + * updated to reflect the number of regions and total size of regions that the + * &action is applied. 
*/ struct damos { unsigned long min_sz_region; @@ -114,6 +141,7 @@ struct damos { unsigned int min_age_region; unsigned int max_age_region; enum damos_action action; + struct damos_quota quota; unsigned long stat_count; unsigned long stat_sz; struct list_head list; @@ -310,7 +338,7 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action); + enum damos_action action, struct damos_quota *quota); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 2f6785737902d..cce14a0d5c725 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -89,7 +89,7 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action) + enum damos_action action, struct damos_quota *quota) { struct damos *scheme; @@ -107,6 +107,11 @@ struct damos *damon_new_scheme( scheme->stat_sz = 0; INIT_LIST_HEAD(&scheme->list); + scheme->quota.sz = quota->sz; + scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.charged_sz = 0; + scheme->quota.charged_from = 0; + return scheme; } @@ -530,15 +535,25 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r); + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) { struct damos *s; - unsigned long sz; damon_for_each_scheme(s, c) { - sz = r->ar.end - r->ar.start; + struct damos_quota *quota = &s->quota; + unsigned long sz = r->ar.end - r->ar.start; + + /* Check the quota */ + if (quota->sz && quota->charged_sz >= quota->sz) + continue; + + /* Check the target regions condition */ if (sz < s->min_sz_region || s->max_sz_region < sz) continue; if (r->nr_accesses < s->min_nr_accesses || @@ -546,22 +561,51 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; if (r->age < s->min_age_region || s->max_age_region < r->age) continue; - s->stat_count++; - s->stat_sz += sz; - if (c->primitive.apply_scheme) + + /* Apply the scheme */ + if (c->primitive.apply_scheme) { + if (quota->sz && quota->charged_sz + sz > quota->sz) { + sz = ALIGN_DOWN(quota->sz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(c, t, r, sz); + } c->primitive.apply_scheme(c, t, r, s); + quota->charged_sz += sz; + } if (s->action != DAMOS_STAT) r->age = 0; + +update_stat: + s->stat_count++; + s->stat_sz += sz; } } static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; - struct damon_region *r; + struct damon_region *r, *next_r; + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + + if (!quota->sz) + continue; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies( + quota->reset_interval))) { + quota->charged_from = jiffies; + quota->charged_sz = 0; + } + } damon_for_each_target(t, c) { - damon_for_each_region(r, t) + damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index c90988a20fa4f..a04bd50cc4c4e 100644 --- 
a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -188,6 +188,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_quota quota = {}; + ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, &min_age, &max_age, &action, &parsed); @@ -200,7 +202,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action); + min_age, max_age, action, "a); if (!scheme) goto fail; From ae6f2868590257b95d6634132b936badbd8b40c4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:20 -0700 Subject: [PATCH 306/737] mm/damon/schemes: skip already charged targets and regions If DAMOS has stopped applying action in the middle of a group of memory regions due to its size quota, it starts the work again from the beginning of the address space in the next charge window. If there is a huge memory region at the beginning of the address space and it fulfills the scheme's target data access pattern always, the action will applied to only the region. This mitigates the case by skipping memory regions that charged in current charge window at the beginning of next charge window. Link: https://lkml.kernel.org/r/20211019150731.16699-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3a1ce9d9921c8..585d985768fd1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -107,6 +107,8 @@ struct damos_quota { /* private: For charging the quota */ unsigned long charged_sz; unsigned long charged_from; + struct damon_target *charge_target_from; + unsigned long charge_addr_from; }; /** @@ -307,6 +309,9 @@ struct damon_ctx { #define damon_prev_region(r) \ (container_of(r->list.prev, struct damon_region, list)) +#define damon_last_region(t) \ + (list_last_entry(&t->regions_list, struct damon_region, list)) + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index cce14a0d5c725..693b75bc34505 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -111,6 +111,8 @@ struct damos *damon_new_scheme( scheme->quota.reset_interval = quota->reset_interval; scheme->quota.charged_sz = 0; scheme->quota.charged_from = 0; + scheme->quota.charge_target_from = NULL; + scheme->quota.charge_addr_from = 0; return scheme; } @@ -553,6 +555,37 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->sz && quota->charged_sz >= quota->sz) continue; + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + continue; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + continue; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + continue; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, 
DAMON_MIN_REGION); + if (!sz) { + if (r->ar.end - r->ar.start <= + DAMON_MIN_REGION) + continue; + sz = DAMON_MIN_REGION; + } + damon_split_region_at(c, t, r, sz); + r = damon_next_region(r); + sz = r->ar.end - r->ar.start; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + /* Check the target regions condition */ if (sz < s->min_sz_region || s->max_sz_region < sz) continue; @@ -573,6 +606,10 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } c->primitive.apply_scheme(c, t, r, s); quota->charged_sz += sz; + if (quota->sz && quota->charged_sz >= quota->sz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } } if (s->action != DAMOS_STAT) r->age = 0; From e4a8bad8e008a5bfc3f986fa6e2b5b1d7e017c59 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:23 -0700 Subject: [PATCH 307/737] mm/damon/schemes: implement time quota The size quota feature of DAMOS is useful for IO resource-critical systems, but not so intuitive for CPU time-critical systems. Systems using zram or zswap-like swap device would be examples. To provide another intuitive ways for such systems, this implements time-based quota for DAMON-based Operation Schemes. If the quota is set, DAMOS tries to use only up to the user-defined quota of CPU time within a given time window. Link: https://lkml.kernel.org/r/20211019150731.16699-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 25 +++++++++++++++++++----- mm/damon/core.c | 45 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 585d985768fd1..1e7671bf3d23a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,20 +91,35 @@ enum damos_action { /** * struct damos_quota - Controls the aggressiveness of the given scheme. + * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @reset_interval: Charge reset interval in milliseconds. * * To avoid consuming too much CPU time or IO resources for applying the - * &struct damos->action to large memory, DAMON allows users to set a size - * quota. The quota can be set by writing non-zero values to &sz. If the size - * quota is set, DAMON tries to apply the action only up to &sz bytes within - * &reset_interval. + * &struct damos->action to large memory, DAMON allows users to set time and/or + * size quotas. The quotas can be set by writing non-zero values to &ms and + * &sz, respectively. If the time quota is set, DAMON tries to use only up to + * &ms milliseconds within &reset_interval for applying the action. If the + * size quota is set, DAMON tries to apply the action only up to &sz bytes + * within &reset_interval. + * + * Internally, the time quota is transformed to a size quota using estimated + * throughput of the scheme's action. DAMON then compares it against &sz and + * uses smaller one as the effective quota. 
*/ struct damos_quota { + unsigned long ms; unsigned long sz; unsigned long reset_interval; -/* private: For charging the quota */ +/* private: */ + /* For throughput estimation */ + unsigned long total_charged_sz; + unsigned long total_charged_ns; + + unsigned long esz; /* Effective size quota in bytes */ + + /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; struct damon_target *charge_target_from; diff --git a/mm/damon/core.c b/mm/damon/core.c index 693b75bc34505..d1da4bef96ede 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -107,8 +107,12 @@ struct damos *damon_new_scheme( scheme->stat_sz = 0; INIT_LIST_HEAD(&scheme->list); + scheme->quota.ms = quota->ms; scheme->quota.sz = quota->sz; scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.total_charged_sz = 0; + scheme->quota.total_charged_ns = 0; + scheme->quota.esz = 0; scheme->quota.charged_sz = 0; scheme->quota.charged_from = 0; scheme->quota.charge_target_from = NULL; @@ -550,9 +554,10 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; + struct timespec64 begin, end; /* Check the quota */ - if (quota->sz && quota->charged_sz >= quota->sz) + if (quota->esz && quota->charged_sz >= quota->esz) continue; /* Skip previously charged regions */ @@ -597,16 +602,21 @@ static void damon_do_apply_schemes(struct damon_ctx *c, /* Apply the scheme */ if (c->primitive.apply_scheme) { - if (quota->sz && quota->charged_sz + sz > quota->sz) { - sz = ALIGN_DOWN(quota->sz - quota->charged_sz, + if (quota->esz && + quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, DAMON_MIN_REGION); if (!sz) goto update_stat; damon_split_region_at(c, t, r, sz); } + ktime_get_coarse_ts64(&begin); c->primitive.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); quota->charged_sz += sz; - if (quota->sz && quota->charged_sz >= quota->sz) { + if (quota->esz && quota->charged_sz >= quota->esz) { quota->charge_target_from = t; quota->charge_addr_from = r->ar.end + 1; } @@ -620,6 +630,29 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } } +/* Shouldn't be called if quota->ms and quota->sz are zero */ +static void damos_set_effective_quota(struct damos_quota *quota) +{ + unsigned long throughput; + unsigned long esz; + + if (!quota->ms) { + quota->esz = quota->sz; + return; + } + + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + esz = throughput * quota->ms; + + if (quota->sz && quota->sz < esz) + esz = quota->sz; + quota->esz = esz; +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -629,15 +662,17 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - if (!quota->sz) + if (!quota->ms && !quota->sz) continue; /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; + damos_set_effective_quota(quota); } } From 18ea862310f0f43ee65f502d204f80c7367cfdab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:27 -0700 Subject: [PATCH 308/737] mm/damon/dbgfs: support quotas of schemes 
This makes the debugfs interface of DAMON support the scheme quotas by chaning the format of the input for the schemes file. Link: https://lkml.kernel.org/r/20211019150731.16699-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index a04bd50cc4c4e..097e6745ba75c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,11 +105,14 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action, s->stat_count, s->stat_sz); + s->action, + s->quota.ms, s->quota.sz, + s->quota.reset_interval, + s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -190,10 +193,11 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; - ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u%n", + ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action, &parsed); - if (ret != 7) + &min_age, &max_age, &action, "a.ms, + "a.sz, "a.reset_interval, &parsed); + if (ret != 10) break; if (!damos_action_valid(action)) { pr_err("wrong action %d\n", action); From c6d37e4c6b626812261a669e4f84bddd12e33957 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:30 -0700 Subject: [PATCH 309/737] mm/damon/selftests: support schemes quotas This updates DAMON selftests to support updated schemes debugfs file format for the quotas. 
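As a user-space illustration of the extended format parsed by the updated str_to_schemes() above (a sketch only; the debugfs path, the numeric action value, and the quota numbers are assumptions):

    #include <stdio.h>

    int main(void)
    {
            /*
             * One scheme per line:
             *   min_sz max_sz min_nr_accesses max_nr_accesses min_age max_age
             *   action quota_ms quota_sz quota_reset_interval_ms
             * Here: any region size, never accessed, aged >= 60 aggregation
             * intervals, action 2 (assumed to be DAMOS_PAGEOUT in this tree),
             * limited to 100 ms and 64 MiB per 1000 ms charge window.
             */
            FILE *f = fopen("/sys/kernel/debug/damon/schemes", "w");

            if (!f)
                    return 1;
            fprintf(f, "4096 18446744073709551615 0 0 60 4294967295 "
                       "2 100 67108864 1000\n");
            return fclose(f) ? 1 : 0;
    }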
Link: https://lkml.kernel.org/r/20211019150731.16699-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 639cfb6a1f651..8e33a7b584e70 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -63,10 +63,10 @@ echo "$orig_content" > "$file" file="$DBGFS/schemes" orig_content=$(cat "$file") -test_write_succ "$file" "1 2 3 4 5 6 4" \ +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0" \ "$orig_content" "valid input" test_write_fail "$file" "1 2 -3 4 5 6 3" "$orig_content" "multi lines" +3 4 5 6 3 0 0 0" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" echo "$orig_content" > "$file" From ba09c9d817ada7e10da9899bb33ab13530b15b13 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:33 -0700 Subject: [PATCH 310/737] mm/damon/schemes: prioritize regions within the quotas This makes DAMON apply schemes to regions having higher priority first, if it cannot apply schemes to all regions due to the quotas. The prioritization function should be implemented in the monitoring primitives. Those would commonly calculate the priority of the region using attributes of regions, namely 'size', 'nr_accesses', and 'age'. For example, some primitive would calculate the priority of each region using a weighted sum of 'nr_accesses' and 'age' of the region. The optimal weights would depend on give environments, so this makes those customizable. Nevertheless, the score calculation functions are only encouraged to respect the weights, not mandated. Link: https://lkml.kernel.org/r/20211019150731.16699-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 26 ++++++++++++++++++ mm/damon/core.c | 62 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1e7671bf3d23a..5d47ad9e3911b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -14,6 +14,8 @@ /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE +/* Max priority score for DAMON-based operation schemes */ +#define DAMOS_MAX_SCORE (99) /** * struct damon_addr_range - Represents an address region of [@start, @end). @@ -95,6 +97,10 @@ enum damos_action { * @sz: Maximum bytes of memory that the action can be applied. * @reset_interval: Charge reset interval in milliseconds. * + * @weight_sz: Weight of the region's size for prioritization. + * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. + * @weight_age: Weight of the region's age for prioritization. 
+ * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -106,12 +112,22 @@ enum damos_action { * Internally, the time quota is transformed to a size quota using estimated * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_primitive->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring primitives are + * encouraged to respect those. */ struct damos_quota { unsigned long ms; unsigned long sz; unsigned long reset_interval; + unsigned int weight_sz; + unsigned int weight_nr_accesses; + unsigned int weight_age; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -124,6 +140,10 @@ struct damos_quota { unsigned long charged_from; struct damon_target *charge_target_from; unsigned long charge_addr_from; + + /* For prioritization */ + unsigned long histogram[DAMOS_MAX_SCORE + 1]; + unsigned int min_score; }; /** @@ -174,6 +194,7 @@ struct damon_ctx; * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. @@ -200,6 +221,8 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @get_scheme_score should return the priority score of a region for a scheme + * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. 
@@ -213,6 +236,9 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*get_scheme_score)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); diff --git a/mm/damon/core.c b/mm/damon/core.c index d1da4bef96ede..fad25778e2ecf 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -12,6 +12,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -110,6 +111,9 @@ struct damos *damon_new_scheme( scheme->quota.ms = quota->ms; scheme->quota.sz = quota->sz; scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.weight_sz = quota->weight_sz; + scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; + scheme->quota.weight_age = quota->weight_age; scheme->quota.total_charged_sz = 0; scheme->quota.total_charged_ns = 0; scheme->quota.esz = 0; @@ -545,6 +549,28 @@ static void damon_split_region_at(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, unsigned long sz_r); +static bool __damos_valid_target(struct damon_region *r, struct damos *s) +{ + unsigned long sz; + + sz = r->ar.end - r->ar.start; + return s->min_sz_region <= sz && sz <= s->max_sz_region && + s->min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->max_nr_accesses && + s->min_age_region <= r->age && r->age <= s->max_age_region; +} + +static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + bool ret = __damos_valid_target(r, s); + + if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + return ret; + + return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -591,13 +617,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, quota->charge_addr_from = 0; } - /* Check the target regions condition */ - if (sz < s->min_sz_region || s->max_sz_region < sz) - continue; - if (r->nr_accesses < s->min_nr_accesses || - s->max_nr_accesses < r->nr_accesses) - continue; - if (r->age < s->min_age_region || s->max_age_region < r->age) + if (!damos_valid_target(c, t, r, s)) continue; /* Apply the scheme */ @@ -661,6 +681,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; if (!quota->ms && !quota->sz) continue; @@ -674,6 +696,32 @@ static void kdamond_apply_schemes(struct damon_ctx *c) quota->charged_sz = 0; damos_set_effective_quota(quota); } + + if (!c->primitive.get_scheme_score) + continue; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->primitive.get_scheme_score( + c, t, r, s); + quota->histogram[score] += + r->ar.end - r->ar.start; + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; } 
damon_for_each_target(t, c) { From 33c25c58d1dacb4c6349285b6d39bf120cfc5e0a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:37 -0700 Subject: [PATCH 311/737] mm/damon/vaddr,paddr: support pageout prioritization This makes the default monitoring primitives for virtual address spaces and the physical address sapce to support memory regions prioritization for 'PAGEOUT' DAMOS action. It calculates hotness of each region as weighted sum of 'nr_accesses' and 'age' of the region and get the priority score as reverse of the hotness, so that cold regions can be paged out first. Link: https://lkml.kernel.org/r/20211019150731.16699-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ mm/damon/paddr.c | 14 +++++++++++++ mm/damon/prmtv-common.c | 46 +++++++++++++++++++++++++++++++++++++++++ mm/damon/prmtv-common.h | 3 +++ mm/damon/vaddr.c | 15 ++++++++++++++ 5 files changed, 82 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5d47ad9e3911b..1217566a0ebcf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -421,6 +421,8 @@ bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ @@ -433,6 +435,8 @@ unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); +int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 957ada55de77b..a496d6f203d64 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -246,6 +246,19 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return 0; } +int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + void damon_pa_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = NULL; @@ -256,4 +269,5 @@ void damon_pa_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_pa_target_valid; ctx->primitive.cleanup = NULL; ctx->primitive.apply_scheme = damon_pa_apply_scheme; + ctx->primitive.get_scheme_score = damon_pa_scheme_score; } diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c index 7e62ee54fb543..92a04f5831d6b 100644 --- a/mm/damon/prmtv-common.c +++ b/mm/damon/prmtv-common.c @@ -85,3 +85,49 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) put_page(page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } + +#define DAMON_MAX_SUBSCORE (100) +#define 
DAMON_MAX_AGE_IN_LOG (32) + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +} diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 7093d19e5d428..61f27037603e1 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -15,3 +15,6 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 14768575f9066..0d8685d63b3f1 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -634,6 +634,20 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return damos_madvise(t, r, madv_action); } +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + void damon_va_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = damon_va_init; @@ -644,6 +658,7 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; ctx->primitive.apply_scheme = damon_va_apply_scheme; + ctx->primitive.get_scheme_score = damon_va_scheme_score; } #include "vaddr-test.h" From 216eb4b46720be1d4ea13f672b7ab4d91e99b4f4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:40 -0700 Subject: [PATCH 312/737] mm/damon/dbgfs: support prioritization weights This allows DAMON debugfs interface users set the prioritization weights by putting three more numbers to the 'schemes' file. 
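Concretely (the values below are only illustrative, and debugfs is assumed to be mounted at /sys/kernel/debug), a scheme line now takes thirteen values, the last three being the weights for the region's size, nr_accesses, and age:

    # echo "1 2 3 4 5 6 4 0 0 0 1 2 3" > /sys/kernel/debug/damon/schemes

Here the trailing "1 2 3" sets weight_sz=1, weight_nr_accesses=2, and weight_age=3.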
Link: https://lkml.kernel.org/r/20211019150731.16699-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 097e6745ba75c..20c4feb8b918c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,13 +105,16 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, s->action, s->quota.ms, s->quota.sz, s->quota.reset_interval, + s->quota.weight_sz, + s->quota.weight_nr_accesses, + s->quota.weight_age, s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -193,11 +196,14 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; - ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu%n", + ret = sscanf(&str[pos], + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, &min_age, &max_age, &action, &quota.ms, - &quota.sz, &quota.reset_interval, &parsed); - if (ret != 10) + &quota.sz, &quota.reset_interval, + &quota.weight_sz, &quota.weight_nr_accesses, + &quota.weight_age, &parsed); + if (ret != 13) break; if (!damos_action_valid(action)) { pr_err("wrong action %d\n", action); From 9d9c54ace35b5843c09c07bff7c02c242836a1fa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:44 -0700 Subject: [PATCH 313/737] tools/selftests/damon: update for regions prioritization of schemes This updates the DAMON selftests for 'schemes' debugfs file, as the file format is updated.
Link: https://lkml.kernel.org/r/20211019150731.16699-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 8e33a7b584e70..466dbeb37e31e 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -63,10 +63,10 @@ echo "$orig_content" > "$file" file="$DBGFS/schemes" orig_content=$(cat "$file") -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0" \ +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3" \ "$orig_content" "valid input" test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0" "$orig_content" "multi lines" +3 4 5 6 3 0 0 0 1 2 3" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" echo "$orig_content" > "$file" From 6729afbb8277e157ba2c3d024b1e57f39b0dca4c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:47 -0700 Subject: [PATCH 314/737] mm/damon/schemes: activate schemes based on a watermarks mechanism DAMON-based operation schemes need to be manually turned on and off. In some use cases, however, the condition for turning a scheme on and off would depend on the system's situation. For example, schemes for proactive pages reclamation would need to be turned on when some memory pressure is detected, and turned off when the system has enough free memory. For easier control of schemes activation based on the system situation, this introduces a watermarks-based mechanism. The client can describe the watermark metric (e.g., amount of free memory in the system), watermark check interval, and three watermarks, namely high, mid, and low. If the scheme is deactivated, it only gets the metric and compare that to the three watermarks for every check interval. If the metric is higher than the high watermark, the scheme is deactivated. If the metric is between the mid watermark and the low watermark, the scheme is activated. If the metric is lower than the low watermark, the scheme is deactivated again. This is to allow users fall back to traditional page-granularity mechanisms. Link: https://lkml.kernel.org/r/20211019150731.16699-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 52 ++++++++++++++++++++++- mm/damon/core.c | 97 ++++++++++++++++++++++++++++++++++++++++++- mm/damon/dbgfs.c | 5 ++- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1217566a0ebcf..c93325efddd7d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -146,6 +146,45 @@ struct damos_quota { unsigned int min_score; }; +/** + * enum damos_wmark_metric - Represents the watermark metric. + * + * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. 
+ * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + */ +enum damos_wmark_metric { + DAMOS_WMARK_NONE, + DAMOS_WMARK_FREE_MEM_RATE, +}; + +/** + * struct damos_watermarks - Controls when a given scheme should be activated. + * @metric: Metric for the watermarks. + * @interval: Watermarks check time interval in microseconds. + * @high: High watermark. + * @mid: Middle watermark. + * @low: Low watermark. + * + * If &metric is &DAMOS_WMARK_NONE, the scheme is always active. Being active + * means DAMON does monitoring and applying the action of the scheme to + * appropriate memory regions. Else, DAMON checks &metric of the system for at + * least every &interval microseconds and works as below. + * + * If &metric is higher than &high, the scheme is inactivated. If &metric is + * between &mid and &low, the scheme is activated. If &metric is lower than + * &low, the scheme is inactivated. + */ +struct damos_watermarks { + enum damos_wmark_metric metric; + unsigned long interval; + unsigned long high; + unsigned long mid; + unsigned long low; + +/* private: */ + bool activated; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -156,6 +195,7 @@ struct damos_quota { * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. + * @wmarks: Watermarks for automated (in)activation of this scheme. * @stat_count: Total number of regions that this scheme is applied. * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. @@ -166,6 +206,14 @@ struct damos_quota { * those. To avoid consuming too much CPU time or IO resources for the * &action, "a is used. * + * To do the work only when needed, schemes can be activated for specific + * system situations using &wmarks. If all schemes that registered to the + * monitoring context are inactive, DAMON stops monitoring either, and just + * repeatedly checks the watermarks. + * + * If all schemes that registered to a &struct damon_ctx are inactive, DAMON + * stops monitoring and just repeatedly checks the watermarks. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. 
@@ -179,6 +227,7 @@ struct damos { unsigned int max_age_region; enum damos_action action; struct damos_quota quota; + struct damos_watermarks wmarks; unsigned long stat_count; unsigned long stat_sz; struct list_head list; @@ -384,7 +433,8 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota); + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index fad25778e2ecf..6993c60ae31c4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,8 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota) + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; @@ -122,6 +124,13 @@ struct damos *damon_new_scheme( scheme->quota.charge_target_from = NULL; scheme->quota.charge_addr_from = 0; + scheme->wmarks.metric = wmarks->metric; + scheme->wmarks.interval = wmarks->interval; + scheme->wmarks.high = wmarks->high; + scheme->wmarks.mid = wmarks->mid; + scheme->wmarks.low = wmarks->low; + scheme->wmarks.activated = true; + return scheme; } @@ -582,6 +591,9 @@ static void damon_do_apply_schemes(struct damon_ctx *c, unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + if (!s->wmarks.activated) + continue; + /* Check the quota */ if (quota->esz && quota->charged_sz >= quota->esz) continue; @@ -684,6 +696,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c) unsigned long cumulated_sz; unsigned int score, max_score = 0; + if (!s->wmarks.activated) + continue; + if (!quota->ms && !quota->sz) continue; @@ -924,6 +939,83 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } +static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) +{ + struct sysinfo i; + + switch (metric) { + case DAMOS_WMARK_FREE_MEM_RATE: + si_meminfo(&i); + return i.freeram * 1000 / i.totalram; + default: + break; + } + return -EINVAL; +} + +/* + * Returns zero if the scheme is active. Else, returns time to wait for next + * watermark check in micro-seconds. + */ +static unsigned long damos_wmark_wait_us(struct damos *scheme) +{ + unsigned long metric; + + if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + return 0; + + metric = damos_wmark_metric_value(scheme->wmarks.metric); + /* higher than high watermark or lower than low watermark */ + if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { + if (scheme->wmarks.activated) + pr_debug("inactivate a scheme (%d) for %s wmark\n", + scheme->action, + metric > scheme->wmarks.high ? 
+ "high" : "low"); + scheme->wmarks.activated = false; + return scheme->wmarks.interval; + } + + /* inactive and higher than middle watermark */ + if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) && + !scheme->wmarks.activated) + return scheme->wmarks.interval; + + if (!scheme->wmarks.activated) + pr_debug("activate a scheme (%d)\n", scheme->action); + scheme->wmarks.activated = true; + return 0; +} + +static void kdamond_usleep(unsigned long usecs) +{ + if (usecs > 100 * 1000) + schedule_timeout_interruptible(usecs_to_jiffies(usecs)); + else + usleep_range(usecs, usecs + 1); +} + +/* Returns negative error code if it's not activated but should return */ +static int kdamond_wait_activation(struct damon_ctx *ctx) +{ + struct damos *s; + unsigned long wait_time; + unsigned long min_wait_time = 0; + + while (!kdamond_need_stop(ctx)) { + damon_for_each_scheme(s, ctx) { + wait_time = damos_wmark_wait_us(s); + if (!min_wait_time || wait_time < min_wait_time) + min_wait_time = wait_time; + } + if (!min_wait_time) + return 0; + + kdamond_usleep(min_wait_time); + } + return -EBUSY; +} + static void set_kdamond_stop(struct damon_ctx *ctx) { mutex_lock(&ctx->kdamond_lock); @@ -952,6 +1044,9 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + continue; + if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); if (ctx->callback.after_sampling && diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 20c4feb8b918c..9f13060d10585 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -195,6 +195,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_NONE, + }; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u%n", @@ -212,7 +215,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, "a); + min_age, max_age, action, "a, &wmarks); if (!scheme) goto fail; From d84584782fd2e072966e14e6d370186ceefb8ce8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:50 -0700 Subject: [PATCH 315/737] mm/damon/dbgfs: support watermarks This updates DAMON debugfs interface to support the watermarks based schemes activation. For this, now 'schemes' file receives five more values. 
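Concretely (the values below are only illustrative, and debugfs is assumed to be mounted at /sys/kernel/debug), a scheme line now takes eighteen values, the last five being the watermark metric, the check interval in microseconds, and the high, mid, and low thresholds:

    # echo "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" > /sys/kernel/debug/damon/schemes

The trailing "1 100 3 2 1" selects metric 1 (DAMOS_WMARK_FREE_MEM_RATE), a 100 microsecond check interval, and high/mid/low watermarks of 3, 2, and 1 (free memory rate per thousand).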
Link: https://lkml.kernel.org/r/20211019150731.16699-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 9f13060d10585..6828e463348b0 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -115,6 +115,8 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_sz, s->quota.weight_nr_accesses, s->quota.weight_age, + s->wmarks.metric, s->wmarks.interval, + s->wmarks.high, s->wmarks.mid, s->wmarks.low, s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -195,18 +197,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_NONE, - }; + struct damos_watermarks wmarks; ret = sscanf(&str[pos], - "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u%n", + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, &min_age, &max_age, &action, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, - "a.weight_age, &parsed); - if (ret != 13) + "a.weight_age, &wmarks.metric, + &wmarks.interval, &wmarks.high, &wmarks.mid, + &wmarks.low, &parsed); + if (ret != 18) break; if (!damos_action_valid(action)) { pr_err("wrong action %d\n", action); From 6dd1d461a6ea57afa4cfbaa63b8e93dea87a447e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:54 -0700 Subject: [PATCH 316/737] selftests/damon: support watermarks This updates DAMON selftests for 'schemes' debugfs file to reflect the changes in the format. 
Link: https://lkml.kernel.org/r/20211019150731.16699-14-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 466dbeb37e31e..196b6640bf378 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -63,10 +63,10 @@ echo "$orig_content" > "$file" file="$DBGFS/schemes" orig_content=$(cat "$file") -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3" \ +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ "$orig_content" "valid input" test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0 1 2 3" "$orig_content" "multi lines" +3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" echo "$orig_content" > "$file" From 7cd97dff1b852952026f3919964553fffc293877 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:57 -0700 Subject: [PATCH 317/737] mm/damon: introduce DAMON-based Reclamation (DAMON_RECLAIM) This implements a new kernel subsystem that finds cold memory regions using DAMON and reclaims those immediately. It is intended to be used as proactive lightweigh reclamation logic for light memory pressure. For heavy memory pressure, it could be inactivated and fall back to the traditional page-scanning based reclamation. It's implemented on top of DAMON framework to use the DAMON-based Operation Schemes (DAMOS) feature. It utilizes all the DAMOS features including speed limit, prioritization, and watermarks. It could be enabled and tuned in boot time via the kernel boot parameter, and in run time via its module parameters ('/sys/module/damon_reclaim/parameters/') interface. [yangyingliang@huawei.com: fix error return code in damon_reclaim_turn()] Link: https://lkml.kernel.org/r/20211025124500.2758060-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20211019150731.16699-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Yang Yingliang Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 12 ++ mm/damon/Makefile | 1 + mm/damon/reclaim.c | 356 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 369 insertions(+) create mode 100644 mm/damon/reclaim.c diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index ca33b289ebbe4..5bcf05851ad07 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -73,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST If unsure, say N. +config DAMON_RECLAIM + bool "Build DAMON-based reclaim (DAMON_RECLAIM)" + depends on DAMON_PADDR + help + This builds the DAMON-based reclamation subsystem. It finds pages + that not accessed for a long time (cold) using DAMON and reclaim + those. 
+ + This is suggested to be used as a proactive and lightweight + reclamation under light memory pressure, while the traditional page + scanning-based reclamation is used for heavy pressure. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 8d9b0df797029..f7d5ac377a2bb 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o +obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c new file mode 100644 index 0000000000000..dc1485044eaf7 --- /dev/null +++ b/mm/damon/reclaim.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based page reclamation + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-reclaim: " fmt + +#include +#include +#include +#include +#include + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_reclaim." + +/* + * Enable or disable DAMON_RECLAIM. + * + * You can enable DAMON_RCLAIM by setting the value of this parameter as ``Y``. + * Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could + * do no real monitoring and reclamation due to the watermarks-based activation + * condition. Refer to below descriptions for the watermarks parameter for + * this. + */ +static bool enabled __read_mostly; +module_param(enabled, bool, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_RECLAIM + * identifies the region as cold, and reclaims. 120 seconds by default. + */ +static unsigned long min_age __read_mostly = 120000000; +module_param(min_age, ulong, 0600); + +/* + * Limit of time for trying the reclamation in milliseconds. + * + * DAMON_RECLAIM tries to use only up to this time within a time window + * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be + * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, + * the limit is disabled. + * + * 10 ms by default. + */ +static unsigned long quota_ms __read_mostly = 10; +module_param(quota_ms, ulong, 0600); + +/* + * Limit of size of memory for the reclamation in bytes. + * + * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a + * time window (quota_reset_interval_ms) and makes no more than this limit is + * tried. This can be used for limiting consumption of CPU and IO. If this + * value is zero, the limit is disabled. + * + * 128 MiB by default. + */ +static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; +module_param(quota_sz, ulong, 0600); + +/* + * The time/size quota charge reset interval in milliseconds. + * + * The charge reset interval for the quota of time (quota_ms) and size + * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than + * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms + * milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is + * enabled but inactive due to its watermarks rule. 5 seconds by default. 
+ */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically + * checks the watermarks. 500 (50%) by default. + */ +static unsigned long wmarks_high __read_mostly = 500; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring + * and the reclaiming. 400 (40%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 400; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks + * the watermarks. In the case, the system falls back to the LRU-based page + * granularity reclamation logic. 200 (20%) by default. + */ +static unsigned long wmarks_low __read_mostly = 200; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the cold memory monitoring. Please refer + * to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. + * + * The aggregation interval of DAMON for the cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. + * + * The minimal number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set lower-bound of the monitoring quality. + * But, setting this too high could result in increased monitoring overhead. + * Please refer to the DAMON documentation for more detail. 10 by default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set upper-bound of the monitoring overhead. + * However, setting this too low could result in bad monitoring quality. + * Please refer to the DAMON documentation for more detail. 1000 by default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_RECLAIM will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_RECLAIM will do work + * against. By default, biggest System RAM is used as the region. 
+ */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +struct damon_reclaim_ram_walk_arg { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_reclaim_ram_walk_arg *a = arg; + + if (a->end - a->start < res->end - res->start) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool get_monitoring_region(unsigned long *start, unsigned long *end) +{ + struct damon_reclaim_ram_walk_arg arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +static struct damos *damon_reclaim_new_scheme(void) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try reclamation for more than quota_ms milliseconds + * or quota_sz bytes within quota_reset_interval_ms. + */ + .ms = quota_ms, + .sz = quota_sz, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and not accessed at all */ + 0, 0, + /* for min_age or more micro-seconds, and */ + min_age / aggr_interval, UINT_MAX, + /* page out those, as soon as found */ + DAMOS_PAGEOUT, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. 
*/ + &wmarks); + + return scheme; +} + +static int damon_reclaim_turn(bool on) +{ + struct damon_region *region; + struct damos *scheme; + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + /* DAMON will free this on its own when finish monitoring */ + region = damon_new_region(monitor_region_start, monitor_region_end); + if (!region) + return -ENOMEM; + damon_add_region(region, target); + + /* Will be freed by 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) { + err = -ENOMEM; + goto free_region_out; + } + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + goto free_scheme_out; + + err = damon_start(&ctx, 1); + if (!err) { + kdamond_pid = ctx->kdamond->pid; + return 0; + } + +free_scheme_out: + damon_destroy_scheme(scheme); +free_region_out: + damon_destroy_region(region, target); + return err; +} + +#define ENABLE_CHECK_INTERVAL_MS 1000 +static struct delayed_work damon_reclaim_timer; +static void damon_reclaim_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_reclaim_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } + + schedule_delayed_work(&damon_reclaim_timer, + msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); +} +static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); + +static int __init damon_reclaim_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + damon_pa_set_primitives(ctx); + + /* 4242 means nothing but fun */ + target = damon_new_target(4242); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_reclaim_timer, 0); + return 0; +} + +module_init(damon_reclaim_init); From d1f4dda6ba8359d3877d49fed9af8eeff382517b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:01 -0700 Subject: [PATCH 318/737] Documentation/admin-guide/mm/damon: add a document for DAMON_RECLAIM This adds an admin-guide document for DAMON-based Reclamation. Link: https://lkml.kernel.org/r/20211019150731.16699-16-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/index.rst | 1 + .../admin-guide/mm/damon/reclaim.rst | 235 ++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/reclaim.rst diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 8c5dde3a57544..61aff88347f3c 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -13,3 +13,4 @@ optimize those. 
start usage + reclaim diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst new file mode 100644 index 0000000000000..fb9def3a73559 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -0,0 +1,235 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +DAMON-based Reclamation +======================= + +DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module that aimed to +be used for proactive and lightweight reclamation under light memory pressure. +It doesn't aim to replace the LRU-list based page_granularity reclamation, but +to be selectively used for different level of memory pressure and requirements. + +Where Proactive Reclamation is Required? +======================================== + +On general memory over-committed systems, proactively reclaiming cold pages +helps saving memory and reducing latency spikes that incurred by the direct +reclaim of the process or CPU consumption of kswapd, while incurring only +minimal performance degradation [1]_ [2]_ . + +Free Pages Reporting [3]_ based memory over-commit virtualization systems are +good example of the cases. In such systems, the guest VMs reports their free +memory to host, and the host reallocates the reported memory to other guests. +As a result, the memory of the systems are fully utilized. However, the +guests could be not so memory-frugal, mainly because some kernel subsystems and +user-space applications are designed to use as much memory as available. Then, +guests could report only small amount of memory as free to host, results in +memory utilization drop of the systems. Running the proactive reclamation in +guests could mitigate this problem. + +How It Works? +============= + +DAMON_RECLAIM finds memory regions that didn't accessed for specific time +duration and page out. To avoid it consuming too much CPU for the paging out +operation, a speed limit can be configured. Under the speed limit, it pages +out memory regions that didn't accessed longer time first. System +administrators can also configure under what situation this scheme should +automatically activated and deactivated with three memory pressure watermarks. + +Interface: Module Parameters +============================ + +To use this feature, you should first ensure your system is running on a kernel +that is built with ``CONFIG_DAMON_RECLAIM=y``. + +To let sysadmins enable or disable it and tune for the given system, +DAMON_RECLAIM utilizes module parameters. That is, you can put +``damon_reclaim.=`` on the kernel boot command line or write +proper values to ``/sys/modules/damon_reclaim/parameters/`` files. + +Note that the parameter values except ``enabled`` are applied only when +DAMON_RECLAIM starts. Therefore, if you want to apply new parameter values in +runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable +it via ``enabled`` parameter file. Writing of the new values to proper +parameter values should be done before the re-enablement. + +Below are the description of each parameter. + +enabled +------- + +Enable or disable DAMON_RECLAIM. + +You can enable DAMON_RCLAIM by setting the value of this parameter as ``Y``. +Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do +no real monitoring and reclamation due to the watermarks-based activation +condition. Refer to below descriptions for the watermarks parameter for this. 
+ +min_age +------- + +Time threshold for cold memory regions identification in microseconds. + +If a memory region is not accessed for this or longer time, DAMON_RECLAIM +identifies the region as cold, and reclaims it. + +120 seconds by default. + +quota_ms +-------- + +Limit of time for the reclamation in milliseconds. + +DAMON_RECLAIM tries to use only up to this time within a time window +(quota_reset_interval_ms) for trying reclamation of cold pages. This can be +used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, the +limit is disabled. + +10 ms by default. + +quota_sz +-------- + +Limit of size of memory for the reclamation in bytes. + +DAMON_RECLAIM charges amount of memory which it tried to reclaim within a time +window (quota_reset_interval_ms) and makes no more than this limit is tried. +This can be used for limiting consumption of CPU and IO. If this value is +zero, the limit is disabled. + +128 MiB by default. + +quota_reset_interval_ms +----------------------- + +The time/size quota charge reset interval in milliseconds. + +The charget reset interval for the quota of time (quota_ms) and size +(quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than +quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms +milliseconds. + +1 second by default. + +wmarks_interval +--------------- + +Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is +enabled but inactive due to its watermarks rule. + +wmarks_high +----------- + +Free memory rate (per thousand) for the high watermark. + +If free memory of the system in bytes per thousand bytes is higher than this, +DAMON_RECLAIM becomes inactive, so it does nothing but only periodically checks +the watermarks. + +wmarks_mid +---------- + +Free memory rate (per thousand) for the middle watermark. + +If free memory of the system in bytes per thousand bytes is between this and +the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring and +the reclaiming. + +wmarks_low +---------- + +Free memory rate (per thousand) for the low watermark. + +If free memory of the system in bytes per thousand bytes is lower than this, +DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks the +watermarks. In the case, the system falls back to the LRU-list based page +granularity reclamation logic. + +sample_interval +--------------- + +Sampling interval for the monitoring in microseconds. + +The sampling interval of DAMON for the cold memory monitoring. Please refer to +the DAMON documentation (:doc:`usage`) for more detail. + +aggr_interval +------------- + +Aggregation interval for the monitoring in microseconds. + +The aggregation interval of DAMON for the cold memory monitoring. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. + +min_nr_regions +-------------- + +Minimum number of monitoring regions. + +The minimal number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set lower-bound of the monitoring quality. +But, setting this too high could result in increased monitoring overhead. +Please refer to the DAMON documentation (:doc:`usage`) for more detail. + +max_nr_regions +-------------- + +Maximum number of monitoring regions. + +The maximum number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set upper-bound of the monitoring overhead. +However, setting this too low could result in bad monitoring quality. 
Please +refer to the DAMON documentation (:doc:`usage`) for more detail. + +monitor_region_start +-------------------- + +Start of target memory region in physical address. + +The start physical address of memory region that DAMON_RECLAIM will do work +against. That is, DAMON_RECLAIM will find cold memory regions in this region +and reclaims. By default, biggest System RAM is used as the region. + +monitor_region_end +------------------ + +End of target memory region in physical address. + +The end physical address of memory region that DAMON_RECLAIM will do work +against. That is, DAMON_RECLAIM will find cold memory regions in this region +and reclaims. By default, biggest System RAM is used as the region. + +kdamond_pid +----------- + +PID of the DAMON thread. + +If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, +-1. + +Example +======= + +Below runtime example commands make DAMON_RECLAIM to find memory regions that +not accessed for 30 seconds or more and pages out. The reclamation is limited +to be done only up to 1 GiB per second to avoid DAMON_RECLAIM consuming too +much CPU time for the paging out operation. It also asks DAMON_RECLAIM to do +nothing if the system's free memory rate is more than 50%, but start the real +works if it becomes lower than 40%. If DAMON_RECLAIM doesn't make progress and +therefore the free memory rate becomes lower than 20%, it asks DAMON_RECLAIM to +do nothing again, so that we can fall back to the LRU-list based page +granularity reclamation. :: + + # cd /sys/modules/damon_reclaim/parameters + # echo 30000000 > min_age + # echo $((1 * 1024 * 1024 * 1024)) > quota_sz + # echo 1000 > quota_reset_interval_ms + # echo 500 > wmarks_high + # echo 400 > wmarks_mid + # echo 200 > wmarks_low + # echo Y > enabled + +.. [1] https://research.google/pubs/pub48551/ +.. [2] https://lwn.net/Articles/787611/ +.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html From 43454a639a34254dff531771e3b7824f4924987c Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 5 Nov 2021 13:48:04 -0700 Subject: [PATCH 319/737] mm/damon: remove unnecessary variable initialization Patch series "mm/damon: Fix some small bugs", v4. This patch (of 2): In 'damon_va_apply_three_regions' there is no need to set variable 'i' to zero. Link: https://lkml.kernel.org/r/b7df8d3dad0943a37e01f60c441b1968b2b20354.1634720326.git.xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/cover.1634720326.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 0d8685d63b3f1..47f47f60440eb 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -307,7 +307,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, struct damon_addr_range bregions[3]) { struct damon_region *r, *next; - unsigned int i = 0; + unsigned int i; /* Remove regions which are not in the three big regions now */ damon_for_each_region_safe(r, next, t) { From afcc17fe54c70c0e684ab4444f1f86815f3a7530 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 5 Nov 2021 13:48:07 -0700 Subject: [PATCH 320/737] mm/damon/dbgfs: add adaptive_targets list check before enable monitor_on When the ctx->adaptive_targets list is empty, I did some test on monitor_on interface like this. 
# cat /sys/kernel/debug/damon/target_ids # # echo on > /sys/kernel/debug/damon/monitor_on # damon: kdamond (5390) starts Though the ctx->adaptive_targets list is empty, but the kthread_run still be called, and the kdamond.x thread still be created, this is meaningless. So there adds a judgment in 'dbgfs_monitor_on_write', if the ctx->adaptive_targets list is empty, return -EINVAL. Link: https://lkml.kernel.org/r/0a60a6e8ec9d71989e0848a4dc3311996ca3b5d4.1634720326.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 1 + mm/damon/core.c | 5 +++++ mm/damon/dbgfs.c | 15 ++++++++++++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c93325efddd7d..fa7f32614b65e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -440,6 +440,7 @@ void damon_destroy_scheme(struct damos *s); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); unsigned int damon_nr_regions(struct damon_target *t); diff --git a/mm/damon/core.c b/mm/damon/core.c index 6993c60ae31c4..46a6afea3030c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -180,6 +180,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) list_add_tail(&t->list, &ctx->adaptive_targets); } +bool damon_targets_empty(struct damon_ctx *ctx) +{ + return list_empty(&ctx->adaptive_targets); +} + static void damon_del_target(struct damon_target *t) { list_del(&t->list); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 6828e463348b0..befb27a29aabd 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -878,12 +878,21 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } - if (!strncmp(kbuf, "on", count)) + if (!strncmp(kbuf, "on", count)) { + int i; + + for (i = 0; i < dbgfs_nr_ctxs; i++) { + if (damon_targets_empty(dbgfs_ctxs[i])) { + kfree(kbuf); + return -EINVAL; + } + } ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); - else if (!strncmp(kbuf, "off", count)) + } else if (!strncmp(kbuf, "off", count)) { ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - else + } else { ret = -EINVAL; + } if (!ret) ret = count; From 4b061cf0f2d23dca4aeeeacb7ef8ab1941a3ea5a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:10 -0700 Subject: [PATCH 321/737] Docs/admin-guide/mm/damon/start: fix wrong example commands Patch series "Fix trivial nits in Documentation/admin-guide/mm". This patchset fixes trivial nits in admin guide documents for DAMON and pagemap. This patch (of 4): Some of the example commands in DAMON getting started guide are outdated, missing sudo, or just wrong. This fixes those. Link: https://lkml.kernel.org/r/20211022090311.3856-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 51503cf90ca29..3ad8bbed9b18b 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -19,7 +19,7 @@ your workload. 
:: # mount -t debugfs none /sys/kernel/debug/ # git clone https://github.com/awslabs/damo # ./damo/damo record $(pidof ) - # ./damo/damo report heat --plot_ascii + # ./damo/damo report heats --heatmap stdout The final command draws the access heatmap of ````. The heatmap shows which memory region (x-axis) is accessed when (y-axis) and how frequently @@ -94,9 +94,9 @@ Visualizing Recorded Patterns The following three commands visualize the recorded access patterns and save the results as separate image files. :: - $ damo report heats --heatmap access_pattern_heatmap.png - $ damo report wss --range 0 101 1 --plot wss_dist.png - $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + $ sudo damo report heats --heatmap access_pattern_heatmap.png + $ sudo damo report wss --range 0 101 1 --plot wss_dist.png + $ sudo damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png - ``access_pattern_heatmap.png`` will visualize the data access pattern in a heatmap, showing which memory region (y-axis) got accessed when (x-axis) @@ -115,9 +115,9 @@ Data Access Pattern Aware Memory Management Below three commands make every memory region of size >=4K that doesn't accessed for >=60 seconds in your workload to be swapped out. :: - $ echo "#min-size max-size min-acc max-acc min-age max-age action" > scheme - $ echo "4K max 0 0 60s max pageout" >> scheme - $ damo schemes -c my_thp_scheme + $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme + $ echo "4K max 0 0 60s max pageout" >> test_scheme + $ damo schemes -c test_scheme .. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns .. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html From b3e45aca24f21ebddad222f8f1169fd4ffe102d1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:13 -0700 Subject: [PATCH 322/737] Docs/admin-guide/mm/damon/start: fix a wrong link The 'Getting Started' of DAMON is providing a link to DAMON's user interface document while saying about its user space tool's detailed usages. This fixes the link. Link: https://lkml.kernel.org/r/20211022090311.3856-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 3ad8bbed9b18b..5f3b22cafc76f 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -6,7 +6,9 @@ Getting Started This document briefly describes how you can use DAMON by demonstrating its default user space tool. Please note that this document describes only a part -of its features for brevity. Please refer to :doc:`usage` for more details. +of its features for brevity. Please refer to the usage `doc +`_ of the tool for more +details. TL; DR From c8d8ae0cc3ba30583acf48d85ebc0187ab7b0d3a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:16 -0700 Subject: [PATCH 323/737] Docs/admin-guide/mm/damon/start: simplify the content Information in 'TL; DR' section of 'Getting Started' is duplicated in other parts of the doc. 
It is also asking readers to visit the access pattern visualizations gallery web site to show the results of example visualization commands, while the users of the commands can use terminal output. To make the doc simple, this removes the duplicated 'TL; DR' section and replaces the visualization example commands with versions using terminal outputs. Link: https://lkml.kernel.org/r/20211022090311.3856-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 113 ++++++++++--------- 1 file changed, 60 insertions(+), 53 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 5f3b22cafc76f..4d5ca2c46288a 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -11,38 +11,6 @@ of its features for brevity. Please refer to the usage `doc details. -TL; DR -====== - -Follow the commands below to monitor and visualize the memory access pattern of -your workload. :: - - # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot - # mount -t debugfs none /sys/kernel/debug/ - # git clone https://github.com/awslabs/damo - # ./damo/damo record $(pidof ) - # ./damo/damo report heats --heatmap stdout - -The final command draws the access heatmap of ````. The heatmap -shows which memory region (x-axis) is accessed when (y-axis) and how frequently -(number; the higher the more accesses have been observed). :: - - 111111111111111111111111111111111111111111111111111111110000 - 111121111111111111111111111111211111111111111111111111110000 - 000000000000000000000000000000000000000000000000001555552000 - 000000000000000000000000000000000000000000000222223555552000 - 000000000000000000000000000000000000000011111677775000000000 - 000000000000000000000000000000000000000488888000000000000000 - 000000000000000000000000000000000177888400000000000000000000 - 000000000000000000000000000046666522222100000000000000000000 - 000000000000000000000014444344444300000000000000000000000000 - 000000000000000002222245555510000000000000000000000000000000 - # access_frequency: 0 1 2 3 4 5 6 7 8 9 - # x-axis: space (140286319947776-140286426374096: 101.496 MiB) - # y-axis: time (605442256436361-605479951866441: 37.695430s) - # resolution: 60x10 (1.692 MiB and 3.770s for each character) - - Prerequisites ============= @@ -93,22 +61,66 @@ pattern in the ``damon.data`` file. Visualizing Recorded Patterns ============================= -The following three commands visualize the recorded access patterns and save -the results as separate image files. :: - - $ sudo damo report heats --heatmap access_pattern_heatmap.png - $ sudo damo report wss --range 0 101 1 --plot wss_dist.png - $ sudo damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png - -- ``access_pattern_heatmap.png`` will visualize the data access pattern in a - heatmap, showing which memory region (y-axis) got accessed when (x-axis) - and how frequently (color). -- ``wss_dist.png`` will show the distribution of the working set size. -- ``wss_chron_change.png`` will show how the working set size has - chronologically changed. - -You can view the visualizations of this example workload at [1]_. -Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. 
+You can visualize the pattern in a heatmap, showing which memory region +(x-axis) got accessed when (y-axis) and how frequently (number).:: + + $ sudo damo report heats --heatmap stdout + 22222222222222222222222222222222222222211111111111111111111111111111111111111100 + 44444444444444444444444444444444444444434444444444444444444444444444444444443200 + 44444444444444444444444444444444444444433444444444444444444444444444444444444200 + 33333333333333333333333333333333333333344555555555555555555555555555555555555200 + 33333333333333333333333333333333333344444444444444444444444444444444444444444200 + 22222222222222222222222222222222222223355555555555555555555555555555555555555200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 33333333333333333333333333333333333333355555555555555555555555555555555555555200 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 33333333333333333333333333333333333333444444444444444444444444444444444444443200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + [...] + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (139728247021568-139728453431248: 196.848 MiB) + # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s) + # resolution: 80x40 (2.461 MiB and 1.758 s for each character) + +You can also visualize the distribution of the working set size, sorted by the +size.:: + + $ sudo damo report wss --range 0 101 10 + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 0 B | | + 10 95.328 MiB |**************************** | + 20 95.332 MiB |**************************** | + 30 95.340 MiB |**************************** | + 40 95.387 MiB |**************************** | + 50 95.387 MiB |**************************** | + 60 95.398 MiB |**************************** | + 70 95.398 MiB |**************************** | + 80 95.504 MiB |**************************** | + 90 190.703 MiB |********************************************************* | + 100 196.875 MiB |***********************************************************| + +Using ``--sortby`` option with the above command, you can show how the working +set size has chronologically changed.:: + + $ sudo damo report wss --range 0 101 10 --sortby time + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 3.051 MiB | | + 10 190.703 MiB |***********************************************************| + 20 95.336 MiB |***************************** | + 30 95.328 MiB |***************************** | + 40 95.387 MiB |***************************** | + 50 95.332 MiB |***************************** | + 60 95.320 MiB |***************************** | + 70 95.398 MiB |***************************** | + 80 95.398 MiB |***************************** | + 90 95.340 MiB |***************************** | + 100 95.398 MiB |***************************** | Data Access Pattern Aware Memory Management @@ -120,8 +132,3 @@ accessed for >=60 seconds in your workload to be swapped out. :: $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme $ echo "4K max 0 0 60s max pageout" >> test_scheme $ damo schemes -c test_scheme - -.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns -.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html -.. 
[3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html -.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html From 25111266f8918e070a329ca5130b8e186889f5a6 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:48:22 -0700 Subject: [PATCH 324/737] mm/damon: simplify stop mechanism A kernel thread can exit gracefully with kthread_stop(). So we don't need a new flag 'kdamond_stop'. And to make sure the task struct is not freed when accessing it, get reference to it before termination. Link: https://lkml.kernel.org/r/20211027130517.4404-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 1 - mm/damon/core.c | 51 +++++++++++++------------------------------ 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index fa7f32614b65e..321de9d723600 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -381,7 +381,6 @@ struct damon_ctx { /* public: */ struct task_struct *kdamond; - bool kdamond_stop; struct mutex kdamond_lock; struct damon_primitive primitive; diff --git a/mm/damon/core.c b/mm/damon/core.c index 46a6afea3030c..f37c17b538148 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -390,17 +390,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) return sz; } -static bool damon_kdamond_running(struct damon_ctx *ctx) -{ - bool running; - - mutex_lock(&ctx->kdamond_lock); - running = ctx->kdamond != NULL; - mutex_unlock(&ctx->kdamond_lock); - - return running; -} - static int kdamond_fn(void *data); /* @@ -418,7 +407,6 @@ static int __damon_start(struct damon_ctx *ctx) mutex_lock(&ctx->kdamond_lock); if (!ctx->kdamond) { err = 0; - ctx->kdamond_stop = false; ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { @@ -474,13 +462,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs) */ static int __damon_stop(struct damon_ctx *ctx) { + struct task_struct *tsk; + mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ctx->kdamond_stop = true; + tsk = ctx->kdamond; + if (tsk) { + get_task_struct(tsk); mutex_unlock(&ctx->kdamond_lock); - while (damon_kdamond_running(ctx)) - usleep_range(ctx->sample_interval, - ctx->sample_interval * 2); + kthread_stop(tsk); + put_task_struct(tsk); return 0; } mutex_unlock(&ctx->kdamond_lock); @@ -925,12 +915,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) static bool kdamond_need_stop(struct damon_ctx *ctx) { struct damon_target *t; - bool stop; - mutex_lock(&ctx->kdamond_lock); - stop = ctx->kdamond_stop; - mutex_unlock(&ctx->kdamond_lock); - if (stop) + if (kthread_should_stop()) return true; if (!ctx->primitive.target_valid) @@ -1021,13 +1007,6 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) return -EBUSY; } -static void set_kdamond_stop(struct damon_ctx *ctx) -{ - mutex_lock(&ctx->kdamond_lock); - ctx->kdamond_stop = true; - mutex_unlock(&ctx->kdamond_lock); -} - /* * The monitoring daemon that runs as a kernel thread */ @@ -1038,17 +1017,18 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; + bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->primitive.init) ctx->primitive.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - set_kdamond_stop(ctx); + done = true; sz_limit = 
damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx)) { + while (!kdamond_need_stop(ctx) && !done) { if (kdamond_wait_activation(ctx)) continue; @@ -1056,7 +1036,7 @@ static int kdamond_fn(void *data) ctx->primitive.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) - set_kdamond_stop(ctx); + done = true; usleep_range(ctx->sample_interval, ctx->sample_interval + 1); @@ -1069,7 +1049,7 @@ static int kdamond_fn(void *data) sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) - set_kdamond_stop(ctx); + done = true; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1088,9 +1068,8 @@ static int kdamond_fn(void *data) damon_destroy_region(r, t); } - if (ctx->callback.before_terminate && - ctx->callback.before_terminate(ctx)) - set_kdamond_stop(ctx); + if (ctx->callback.before_terminate) + ctx->callback.before_terminate(ctx); if (ctx->primitive.cleanup) ctx->primitive.cleanup(ctx); From f524ab98d09a13bb7a16261a787e7ac57f554131 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:48:24 -0700 Subject: [PATCH 325/737] mm/damon: fix a few spelling mistakes in comments and a pr_debug message There are a few spelling mistakes in the code. Fix these. Link: https://lkml.kernel.org/r/20211028184157.614544-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- mm/damon/dbgfs-test.h | 2 +- mm/damon/vaddr-test.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index f37c17b538148..c381b3c525d0b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -959,7 +959,7 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) /* higher than high watermark or lower than low watermark */ if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { if (scheme->wmarks.activated) - pr_debug("inactivate a scheme (%d) for %s wmark\n", + pr_debug("deactivate a scheme (%d) for %s wmark\n", scheme->action, metric > scheme->wmarks.high ? "high" : "low"); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 104b22957616b..86b9f9528231e 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -145,7 +145,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, expect); } - /* Put invlid inputs and check the return error code */ + /* Put invalid inputs and check the return error code */ for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { input = invalid_inputs[i]; pr_info("input: %s\n", input); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1f5c13257dbaf..ecfd0b2ed222d 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -233,7 +233,7 @@ static void damon_test_apply_three_regions3(struct kunit *test) * and 70-100) has totally freed and mapped to different area (30-32 and * 65-68). The target regions which were in the old second and third big * regions should now be removed and new target regions covering the new second - * and third big regions should be crated. + * and third big regions should be created. 
*/ static void damon_test_apply_three_regions4(struct kunit *test) { From 705b674e9e1de3165e6c11f19bffafd0b038b4e9 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:48:27 -0700 Subject: [PATCH 326/737] mm/damon: remove return value from before_terminate callback Since the return value of 'before_terminate' callback is never used, we make it have no return value. Link: https://lkml.kernel.org/r/20211029005023.8895-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- mm/damon/dbgfs.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 321de9d723600..b4d4be3cc987f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -322,7 +322,7 @@ struct damon_callback { int (*before_start)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_terminate)(struct damon_ctx *context); + void (*before_terminate)(struct damon_ctx *context); }; /** diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index befb27a29aabd..eccc14b349013 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -645,18 +645,17 @@ static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); } -static int dbgfs_before_terminate(struct damon_ctx *ctx) +static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; if (!targetid_is_pid(ctx)) - return 0; + return; damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } - return 0; } static struct damon_ctx *dbgfs_new_ctx(void) From 1f4047cd1db6eebc814bf10676fb2948f02b0aef Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 18 Nov 2021 11:19:48 +0000 Subject: [PATCH 327/737] mm/damon/dbgfs: use '__GFP_NOWARN' for user-specified size buffer allocation Patch series "DAMON fixes". This patch (of 2): DAMON users can trigger below warning in '__alloc_pages()' by invoking write() to some DAMON debugfs files with arbitrarily high count argument, because DAMON debugfs interface allocates some buffers based on the user-specified 'count'. if (unlikely(order >= MAX_ORDER)) { WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } Because the DAMON debugfs interface code checks failure of the 'kmalloc()', this commit simply suppresses the warnings by adding '__GFP_NOWARN' flag. 
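For illustration only (a sketch, not part of the hunks below), the user-controlled
allocation pattern this commit describes ends up looking roughly like:

	/* 'count' is the user-supplied write size, so it can request an
	 * arbitrarily large (order >= MAX_ORDER) buffer. Allocation failure
	 * is already handled, so the allocator's warning adds nothing.
	 */
	kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN);
	if (!kbuf)
		return ERR_PTR(-ENOMEM);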
Link: https://lkml.kernel.org/r/20211110145758.16558-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211110145758.16558-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index eccc14b349013..8ce1311ac5338 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -32,7 +32,7 @@ static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) if (*ppos) return ERR_PTR(-EINVAL); - kbuf = kmalloc(count + 1, GFP_KERNEL); + kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return ERR_PTR(-ENOMEM); @@ -133,7 +133,7 @@ static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; @@ -452,7 +452,7 @@ static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; @@ -578,7 +578,7 @@ static ssize_t dbgfs_kdamond_pid_read(struct file *file, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; From 212fb17b1654ed7293354d8cd528dd2dace48e92 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 18 Nov 2021 11:20:14 +0000 Subject: [PATCH 328/737] mm/damon/dbgfs: fix missed use of damon_dbgfs_lock DAMON debugfs is supposed to protect dbgfs_ctxs, dbgfs_nr_ctxs, and dbgfs_dirs using damon_dbgfs_lock. However, some of the code is accessing the variables without the protection. This commit fixes it by protecting all such accesses. 
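As a simplified sketch of the resulting pattern in dbgfs_monitor_on_write()
(freeing of the input buffer on the early-exit path is omitted here; the exact
hunks follow below), every access to the shared dbgfs_ctxs/dbgfs_nr_ctxs state
happens under the lock, including the early return:

	mutex_lock(&damon_dbgfs_lock);
	for (i = 0; i < dbgfs_nr_ctxs; i++) {
		if (damon_targets_empty(dbgfs_ctxs[i])) {
			/* drop the lock on every early-exit path */
			mutex_unlock(&damon_dbgfs_lock);
			return -EINVAL;
		}
	}
	ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
	mutex_unlock(&damon_dbgfs_lock);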
Link: https://lkml.kernel.org/r/20211110145758.16558-3-sj@kernel.org Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 8ce1311ac5338..9b520bb4a3e70 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -877,12 +877,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } + mutex_lock(&damon_dbgfs_lock); if (!strncmp(kbuf, "on", count)) { int i; for (i = 0; i < dbgfs_nr_ctxs; i++) { if (damon_targets_empty(dbgfs_ctxs[i])) { kfree(kbuf); + mutex_unlock(&damon_dbgfs_lock); return -EINVAL; } } @@ -892,6 +894,7 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, } else { ret = -EINVAL; } + mutex_unlock(&damon_dbgfs_lock); if (!ret) ret = count; @@ -944,15 +947,16 @@ static int __init __damon_dbgfs_init(void) static int __init damon_dbgfs_init(void) { - int rc; + int rc = -ENOMEM; + mutex_lock(&damon_dbgfs_lock); dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); if (!dbgfs_ctxs) - return -ENOMEM; + goto out; dbgfs_ctxs[0] = dbgfs_new_ctx(); if (!dbgfs_ctxs[0]) { kfree(dbgfs_ctxs); - return -ENOMEM; + goto out; } dbgfs_nr_ctxs = 1; @@ -963,6 +967,8 @@ static int __init damon_dbgfs_init(void) pr_err("%s: dbgfs init failed\n", __func__); } +out: + mutex_unlock(&damon_dbgfs_lock); return rc; } From 2ab525488485048d6a21a70244af7d4f4d0c2e64 Mon Sep 17 00:00:00 2001 From: Ethan Chen Date: Tue, 7 Dec 2021 21:14:58 +0000 Subject: [PATCH 329/737] ena: Update to 2.6.0 Sourced from upstream git repo: https://github.com/amzn/amzn-drivers/ Change Log from Upstream: 2.5.0 -> 2.6.0 **New Features** * Add "capabilities" field to negotiate device capabilities * Add support for kernel 5.14 * Allow the device to signal the driver if features renogotiation is required **Bug Fixes** * Fix XDP packet fowarding on 6th generaion instances * Prevent device reset when device isn't responsive **Minor Changes** * Move Local Page Cache (LPC) code to a separate file * Reset device when receiving wrong request id on RX * Cosmetic changes and code restructuring * Fix typo in README * Remove redundant code Signed-off-by: Ethan Chen --- drivers/amazon/net/ena/Makefile | 2 +- drivers/amazon/net/ena/ena_admin_defs.h | 13 +- drivers/amazon/net/ena/ena_com.c | 15 +- drivers/amazon/net/ena/ena_com.h | 15 +- drivers/amazon/net/ena/ena_eth_com.c | 3 - drivers/amazon/net/ena/ena_ethtool.c | 38 +- drivers/amazon/net/ena/ena_lpc.c | 307 ++++++++++++ drivers/amazon/net/ena/ena_lpc.h | 38 ++ drivers/amazon/net/ena/ena_netdev.c | 639 ++++++++---------------- drivers/amazon/net/ena/ena_netdev.h | 70 +-- drivers/amazon/net/ena/kcompat.h | 12 + 11 files changed, 631 insertions(+), 521 deletions(-) create mode 100644 drivers/amazon/net/ena/ena_lpc.c create mode 100644 drivers/amazon/net/ena/ena_lpc.h diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 375448827df60..2595641267d20 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -7,6 +7,6 @@ obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o ena-y := ena_netdev.o ena_com.o ena_eth_com.o ena_ethtool.o net_dim.o \ - dim.o + dim.o ena_lpc.o ena-$(CONFIG_SYSFS) += ena_sysfs.o diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index 090198b02b888..be5ca30976279 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ 
b/drivers/amazon/net/ena/ena_admin_defs.h @@ -53,6 +53,11 @@ enum ena_admin_aq_feature_id { ENA_ADMIN_FEATURES_OPCODE_NUM = 32, }; +/* device capabilities */ +enum ena_admin_aq_caps_id { + ENA_ADMIN_ENI_STATS = 0, +}; + enum ena_admin_placement_policy_type { /* descriptors and headers are in host memory */ ENA_ADMIN_PLACEMENT_POLICY_HOST = 1, @@ -460,7 +465,10 @@ struct ena_admin_device_attr_feature_desc { */ u32 supported_features; - u32 reserved3; + /* bitmap of ena_admin_aq_caps_id, which represents device + * capabilities. + */ + u32 capabilities; /* Indicates how many bits are used physical address access. */ u32 phys_addr_width; @@ -1061,7 +1069,8 @@ enum ena_admin_aenq_group { ENA_ADMIN_WARNING = 2, ENA_ADMIN_NOTIFICATION = 3, ENA_ADMIN_KEEP_ALIVE = 4, - ENA_ADMIN_AENQ_GROUPS_NUM = 5, + ENA_ADMIN_REFRESH_CAPABILITIES = 5, + ENA_ADMIN_AENQ_GROUPS_NUM = 6, }; enum ena_admin_aenq_notification_syndrome { diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 5ce5d49800896..2a250dce55e2c 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -865,7 +865,7 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) if (unlikely(i == timeout)) { netdev_err(ena_dev->net_device, - "Reading reg failed for timeout. expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n", + "Reading reg failed for timeout. expected: req id[%u] offset[%u] actual: req id[%u] offset[%u]\n", mmio_read->seq_num, offset, read_resp->req_id, read_resp->reg_off); ret = ENA_MMIO_READ_TIMEOUT; @@ -1979,6 +1979,7 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, sizeof(get_resp.u.dev_attr)); ena_dev->supported_features = get_resp.u.dev_attr.supported_features; + ena_dev->capabilities = get_resp.u.dev_attr.capabilities; if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { rc = ena_com_get_feature(ena_dev, &get_resp, @@ -1987,7 +1988,8 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, if (rc) return rc; - if (get_resp.u.max_queue_ext.version != ENA_FEATURE_MAX_QUEUE_EXT_VER) + if (get_resp.u.max_queue_ext.version != + ENA_FEATURE_MAX_QUEUE_EXT_VER) return -EINVAL; memcpy(&get_feat_ctx->max_queue_ext, &get_resp.u.max_queue_ext, @@ -2233,6 +2235,13 @@ int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, struct ena_com_stats_ctx ctx; int ret; + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENI_STATS)) { + netdev_err(ena_dev->net_device, + "Capability %d isn't supported\n", + ENA_ADMIN_ENI_STATS); + return -EOPNOTSUPP; + } + memset(&ctx, 0x0, sizeof(ctx)); ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENI); if (likely(ret == 0)) @@ -2407,7 +2416,7 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, if (key) { if (key_len != sizeof(hash_key->key)) { netdev_err(ena_dev->net_device, - "key len (%hu) doesn't equal the supported size (%zu)\n", + "key len (%u) doesn't equal the supported size (%zu)\n", key_len, sizeof(hash_key->key)); return -EINVAL; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 6ac41ca6f956b..795bd714778d9 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -125,7 +125,7 @@ struct ena_com_io_cq { /* holds the number of cdesc of the current packet */ u16 cur_rx_pkt_cdesc_count; - /* save the firt cdesc idx of the current packet */ + /* save the first cdesc idx of the current packet */ u16 cur_rx_pkt_cdesc_start_idx; u16 q_depth; @@ -318,6 +318,7 @@ struct ena_com_dev { struct ena_rss 
rss; u32 supported_features; + u32 capabilities; u32 dma_addr_bits; struct ena_host_attribute host_attr; @@ -983,6 +984,18 @@ static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_d ena_dev->adaptive_coalescing = false; } +/* ena_com_get_cap - query whether device supports a capability. + * @ena_dev: ENA communication layer struct + * @cap_id: enum value representing the capability + * + * @return - true if capability is supported or false otherwise + */ +static inline bool ena_com_get_cap(struct ena_com_dev *ena_dev, + enum ena_admin_aq_caps_id cap_id) +{ + return !!(ena_dev->capabilities & BIT(cap_id)); +} + /* ena_com_update_intr_reg - Prepare interrupt register * @intr_reg: interrupt register to update. * @rx_delay_interval: Rx interval in usecs diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index 3d6f0a466a9ed..f9f886289b970 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -328,9 +328,6 @@ static int ena_com_create_and_store_tx_meta_desc(struct ena_com_io_sq *io_sq, * compare it to the stored version, just create the meta */ if (io_sq->disable_meta_caching) { - if (unlikely(!ena_tx_ctx->meta_valid)) - return -EINVAL; - *have_meta = true; return ena_com_create_meta(io_sq, ena_meta); } diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 031274399b022..a3ff6fca628ec 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -120,8 +120,7 @@ static const struct ena_stats ena_stats_ena_com_strings[] = { #define ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) #define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) #define ENA_STATS_ARRAY_ENA_COM ARRAY_SIZE(ena_stats_ena_com_strings) -#define ENA_STATS_ARRAY_ENI(adapter) \ - (ARRAY_SIZE(ena_stats_eni_strings) * (adapter)->eni_stats_supported) +#define ENA_STATS_ARRAY_ENI(adapter) ARRAY_SIZE(ena_stats_eni_strings) static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { #define ENA_PRIV_FLAGS_LPC BIT(0) @@ -230,8 +229,9 @@ static void ena_get_ethtool_stats(struct net_device *netdev, u64 *data) { struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *dev = adapter->ena_dev; - ena_get_stats(adapter, data, adapter->eni_stats_supported); + ena_get_stats(adapter, data, ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)); } static int ena_get_sw_stats_count(struct ena_adapter *adapter) @@ -243,7 +243,9 @@ static int ena_get_sw_stats_count(struct ena_adapter *adapter) static int ena_get_hw_stats_count(struct ena_adapter *adapter) { - return ENA_STATS_ARRAY_ENI(adapter); + bool supported = ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS); + + return ENA_STATS_ARRAY_ENI(adapter) * supported; } int ena_get_sset_count(struct net_device *netdev, int sset) @@ -273,10 +275,10 @@ static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) for (j = 0; j < ENA_STATS_ARRAY_TX; j++) { ena_stats = &ena_stats_tx_strings[j]; - snprintf(*data, ETH_GSTRING_LEN, - "queue_%u_%s_%s", i, - is_xdp ? "xdp_tx" : "tx", ena_stats->name); - (*data) += ETH_GSTRING_LEN; + ethtool_sprintf(data, + "queue_%u_%s_%s", i, + is_xdp ? 
"xdp_tx" : "tx", + ena_stats->name); } if (!is_xdp) { @@ -286,9 +288,9 @@ static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { ena_stats = &ena_stats_rx_strings[j]; - snprintf(*data, ETH_GSTRING_LEN, - "queue_%u_rx_%s", i, ena_stats->name); - (*data) += ETH_GSTRING_LEN; + ethtool_sprintf(data, + "queue_%u_rx_%s", i, + ena_stats->name); } } } @@ -302,9 +304,8 @@ static void ena_com_dev_strings(u8 **data) for (i = 0; i < ENA_STATS_ARRAY_ENA_COM; i++) { ena_stats = &ena_stats_ena_com_strings[i]; - snprintf(*data, ETH_GSTRING_LEN, - "ena_admin_q_%s", ena_stats->name); - (*data) += ETH_GSTRING_LEN; + ethtool_sprintf(data, + "ena_admin_q_%s", ena_stats->name); } } @@ -317,15 +318,13 @@ static void ena_get_strings(struct ena_adapter *adapter, for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; - memcpy(data, ena_stats->name, ETH_GSTRING_LEN); - data += ETH_GSTRING_LEN; + ethtool_sprintf(&data, ena_stats->name); } if (eni_stats_needed) { for (i = 0; i < ENA_STATS_ARRAY_ENI(adapter); i++) { ena_stats = &ena_stats_eni_strings[i]; - memcpy(data, ena_stats->name, ETH_GSTRING_LEN); - data += ETH_GSTRING_LEN; + ethtool_sprintf(&data, ena_stats->name); } } @@ -338,10 +337,11 @@ static void ena_get_ethtool_strings(struct net_device *netdev, u8 *data) { struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_com_dev *dev = adapter->ena_dev; switch (sset) { case ETH_SS_STATS: - ena_get_strings(adapter, data, adapter->eni_stats_supported); + ena_get_strings(adapter, data, ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)); break; case ETH_SS_PRIV_FLAGS: memcpy(data, ena_priv_flags_strings, sizeof(ena_priv_flags_strings)); diff --git a/drivers/amazon/net/ena/ena_lpc.c b/drivers/amazon/net/ena/ena_lpc.c new file mode 100644 index 0000000000000..7e9c9aa1166d9 --- /dev/null +++ b/drivers/amazon/net/ena/ena_lpc.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ +#include "ena_lpc.h" + +static void ena_free_ring_page_cache(struct ena_ring *rx_ring); + +/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ +static void ena_increase_stat(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + +static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page) +{ + dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + put_page(ena_page->page); +} + +/* Removes a page from page cache and allocate a new one instead. If an + * allocation of a new page fails, the cache entry isn't changed + */ +static void ena_replace_cache_page(struct ena_ring *rx_ring, + struct ena_page *ena_page) +{ + struct page *new_page; + dma_addr_t dma; + + new_page = ena_alloc_map_page(rx_ring, &dma); + + if (unlikely(IS_ERR(new_page))) + return; + + ena_put_unmap_cache_page(rx_ring, ena_page); + + ena_page->page = new_page; + ena_page->dma_addr = dma; +} + +/* Mark the cache page as used and return it. If the page belongs to a different + * NUMA than the current one, free the cache page and allocate another one + * instead. 
+ */ +static struct page *ena_return_cache_page(struct ena_ring *rx_ring, + struct ena_page *ena_page, + dma_addr_t *dma) +{ + /* Remove pages belonging to different node than the one the CPU runs on */ + if (unlikely(page_to_nid(ena_page->page) != numa_mem_id())) { + ena_increase_stat(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp); + ena_replace_cache_page(rx_ring, ena_page); + } + + /* Make sure no writes are pending for this page */ + dma_sync_single_for_device(rx_ring->dev, ena_page->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + /* Increase refcount to 2 so that the page is returned to the + * cache after being freed + */ + page_ref_inc(ena_page->page); + + *dma = ena_page->dma_addr; + + return ena_page->page; +} + +struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, + bool *is_lpc_page) +{ + struct ena_page_cache *page_cache = rx_ring->page_cache; + u32 head, cache_current_size; + struct ena_page *ena_page; + + /* Cache size of zero indicates disabled cache */ + if (!page_cache) { + *is_lpc_page = false; + return ena_alloc_map_page(rx_ring, dma); + } + + *is_lpc_page = true; + + cache_current_size = page_cache->current_size; + head = page_cache->head; + + ena_page = &page_cache->cache[head]; + /* Warm up phase. We fill the pages for the first time. The + * phase is done in the napi context to improve the chances we + * allocate on the correct NUMA node + */ + if (unlikely(cache_current_size < page_cache->max_size)) { + /* Check if oldest allocated page is free */ + if (ena_page->page && page_ref_count(ena_page->page) == 1) { + page_cache->head = (head + 1) % cache_current_size; + return ena_return_cache_page(rx_ring, ena_page, dma); + } + + ena_page = &page_cache->cache[cache_current_size]; + + /* Add a new page to the cache */ + ena_page->page = ena_alloc_map_page(rx_ring, dma); + if (unlikely(IS_ERR(ena_page->page))) + return ena_page->page; + + ena_page->dma_addr = *dma; + + /* Increase refcount to 2 so that the page is returned to the + * cache after being freed + */ + page_ref_inc(ena_page->page); + + page_cache->current_size++; + + ena_increase_stat(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp); + + return ena_page->page; + } + + /* Next page is still in use, so we allocate outside the cache */ + if (unlikely(page_ref_count(ena_page->page) != 1)) { + ena_increase_stat(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp); + *is_lpc_page = false; + return ena_alloc_map_page(rx_ring, dma); + } + + page_cache->head = (head + 1) & (page_cache->max_size - 1); + + return ena_return_cache_page(rx_ring, ena_page, dma); +} + +bool ena_is_lpc_supported(struct ena_adapter *adapter, + struct ena_ring *rx_ring, + bool error_print) +{ +#ifdef ENA_NETDEV_LOGS_WITHOUT_RV + void (*print_log)(const struct net_device *dev, const char *format, ...); +#else + int (*print_log)(const struct net_device *dev, const char *format, ...); +#endif + int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues; + + print_log = (error_print) ? netdev_err : netdev_info; + + /* LPC is disabled below min number of channels */ + if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) { + print_log(adapter->netdev, + "Local page cache is disabled for less than %d channels\n", + ENA_LPC_MIN_NUM_OF_CHANNELS); + + /* Disable LPC for such case. It can enabled again through + * ethtool private-flag. 
+ */ + adapter->used_lpc_size = 0; + + return false; + } +#ifdef ENA_XDP_SUPPORT + + /* The driver doesn't support page caches under XDP */ + if (ena_xdp_present_ring(rx_ring)) { + print_log(adapter->netdev, + "Local page cache is disabled when using XDP\n"); + return false; + } +#endif /* ENA_XDP_SUPPORT */ + + return true; +} + +/* Calculate the size of the Local Page Cache. If LPC should be disabled, return + * a size of 0. + */ +static u32 ena_calculate_cache_size(struct ena_adapter *adapter, + struct ena_ring *rx_ring) +{ + u32 page_cache_size = adapter->used_lpc_size; + + /* LPC cache size of 0 means disabled cache */ + if (page_cache_size == 0) + return 0; + + if (!ena_is_lpc_supported(adapter, rx_ring, false)) + return 0; + + /* Clap the LPC size to its maximum value */ + if (page_cache_size > ENA_LPC_MAX_MULTIPLIER) { + netdev_info(adapter->netdev, + "Configured LPC size %d is too large, reducing to %d (max)\n", + adapter->configured_lpc_size, ENA_LPC_MAX_MULTIPLIER); + + /* Override LPC size to avoid printing this message + * every up/down operation + */ + adapter->configured_lpc_size = ENA_LPC_MAX_MULTIPLIER; + adapter->used_lpc_size = page_cache_size = ENA_LPC_MAX_MULTIPLIER; + } + + page_cache_size = page_cache_size * ENA_LPC_MULTIPLIER_UNIT; + page_cache_size = roundup_pow_of_two(page_cache_size); + + return page_cache_size; +} + +int ena_create_page_caches(struct ena_adapter *adapter) +{ + struct ena_page_cache *cache; + u32 page_cache_size; + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + page_cache_size = ena_calculate_cache_size(adapter, rx_ring); + + if (!page_cache_size) + return 0; + + cache = vzalloc(sizeof(struct ena_page_cache) + + sizeof(struct ena_page) * page_cache_size); + if (!cache) + goto err_cache_alloc; + + cache->max_size = page_cache_size; + rx_ring->page_cache = cache; + } + + return 0; +err_cache_alloc: + netif_err(adapter, ifup, adapter->netdev, + "Failed to initialize local page caches (LPCs)\n"); + while (--i >= 0) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + ena_free_ring_page_cache(rx_ring); + } + + return -ENOMEM; +} + +/* Release all pages from the page cache */ +static void ena_free_ring_cache_pages(struct ena_adapter *adapter, int qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + struct ena_page_cache *page_cache; + int i; + + /* Page cache is disabled */ + if (!rx_ring->page_cache) + return; + + page_cache = rx_ring->page_cache; + + /* We check size value to make sure we don't + * free pages that weren't allocated. 
+ */ + for (i = 0; i < page_cache->current_size; i++) { + struct ena_page *ena_page = &page_cache->cache[i]; + + WARN_ON(!ena_page->page); + + dma_unmap_page(rx_ring->dev, ena_page->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); + + /* If the page is also in the rx buffer, then this operation + * would only decrease its reference count + */ + __free_page(ena_page->page); + } + + page_cache->head = page_cache->current_size = 0; +} + +void ena_free_all_cache_pages(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + ena_free_ring_cache_pages(adapter, i); +} + +static void ena_free_ring_page_cache(struct ena_ring *rx_ring) +{ + if(!rx_ring->page_cache) + return; + + vfree(rx_ring->page_cache); + rx_ring->page_cache = NULL; +} + +void ena_free_page_caches(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) { + struct ena_ring *rx_ring = &adapter->rx_ring[i]; + + ena_free_ring_page_cache(rx_ring); + } +} diff --git a/drivers/amazon/net/ena/ena_lpc.h b/drivers/amazon/net/ena/ena_lpc.h new file mode 100644 index 0000000000000..2953eb24ac4dd --- /dev/null +++ b/drivers/amazon/net/ena/ena_lpc.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "ena_netdev.h" + +/* LPC definitions */ +#define ENA_LPC_DEFAULT_MULTIPLIER 2 +#define ENA_LPC_MAX_MULTIPLIER 32 +#define ENA_LPC_MULTIPLIER_UNIT 1024 +#define ENA_LPC_MIN_NUM_OF_CHANNELS 16 + +/* Store DMA address along with the page */ +struct ena_page { + struct page *page; + dma_addr_t dma_addr; +}; + +struct ena_page_cache { + /* How many pages are produced */ + u32 head; + /* How many of the entries were initialized */ + u32 current_size; + /* Maximum number of pages the cache can hold */ + u32 max_size; + + struct ena_page cache[0]; +} ____cacheline_aligned; + +int ena_create_page_caches(struct ena_adapter *adapter); +void ena_free_page_caches(struct ena_adapter *adapter); +void ena_free_all_cache_pages(struct ena_adapter *adapter); +struct page *ena_lpc_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, + bool *is_lpc_page); +bool ena_is_lpc_supported(struct ena_adapter *adapter, + struct ena_ring *rx_ring, + bool error_print); diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 4cccbba579dd4..057b4de2ebad1 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -29,6 +29,8 @@ #include "ena_pci_id_tbl.h" #include "ena_sysfs.h" +#include "ena_lpc.h" + static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; MODULE_AUTHOR("Amazon.com, Inc. 
or its affiliates"); @@ -49,6 +51,7 @@ MODULE_VERSION(DRV_MODULE_GENERATION); #define ENA_SKB_PULL_MIN_LEN 64 #endif + static int debug = -1; module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); @@ -79,7 +82,10 @@ static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); static void ena_destroy_device(struct ena_adapter *adapter, bool graceful); static int ena_restore_device(struct ena_adapter *adapter); -static int ena_create_page_caches(struct ena_adapter *adapter); +static void ena_calc_io_queue_size(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx); +static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, + struct net_device *netdev); #ifdef ENA_XDP_SUPPORT static void ena_init_io_rings(struct ena_adapter *adapter, @@ -288,11 +294,11 @@ static int ena_xdp_io_poll(struct napi_struct *napi, int budget) static int ena_xdp_tx_map_frame(struct ena_ring *xdp_ring, struct ena_tx_buffer *tx_info, struct xdp_frame *xdpf, - void **push_hdr, - u32 *push_len) + struct ena_com_tx_ctx *ena_tx_ctx) { struct ena_adapter *adapter = xdp_ring->adapter; struct ena_com_buf *ena_buf; + int push_len = 0; dma_addr_t dma; void *data; u32 size; @@ -301,31 +307,34 @@ static int ena_xdp_tx_map_frame(struct ena_ring *xdp_ring, data = tx_info->xdpf->data; size = tx_info->xdpf->len; - *push_len = 0; - if (xdp_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - /* LLQ push buffer */ - *push_len = min_t(u32, size, xdp_ring->tx_max_header_size); - *push_hdr = data; + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, xdp_ring->tx_max_header_size); - size -= *push_len; - } else { - *push_hdr = NULL; + ena_tx_ctx->push_header = data; + + size -= push_len; + data += push_len; } + ena_tx_ctx->header_len = push_len; + if (size > 0) { dma = dma_map_single(xdp_ring->dev, - data + *push_len, + data, size, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(xdp_ring->dev, dma))) goto error_report_dma_error; tx_info->map_linear_data = 0; - tx_info->num_of_bufs = 1; + ena_buf = tx_info->bufs; ena_buf->paddr = dma; ena_buf->len = size; + + ena_tx_ctx->ena_bufs = ena_buf; + ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; } return 0; @@ -346,8 +355,6 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, struct ena_com_tx_ctx ena_tx_ctx = {}; struct ena_tx_buffer *tx_info; u16 next_to_use, req_id; - void *push_hdr; - u32 push_len; int rc; next_to_use = xdp_ring->next_to_use; @@ -355,15 +362,11 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, tx_info = &xdp_ring->tx_buffer_info[req_id]; tx_info->num_of_bufs = 0; - rc = ena_xdp_tx_map_frame(xdp_ring, tx_info, xdpf, &push_hdr, &push_len); + rc = ena_xdp_tx_map_frame(xdp_ring, tx_info, xdpf, &ena_tx_ctx); if (unlikely(rc)) - goto error_drop_packet; + return rc; - ena_tx_ctx.ena_bufs = tx_info->bufs; - ena_tx_ctx.push_header = push_hdr; - ena_tx_ctx.num_bufs = tx_info->num_of_bufs; ena_tx_ctx.req_id = req_id; - ena_tx_ctx.header_len = push_len; rc = ena_xmit_common(dev, xdp_ring, @@ -373,6 +376,7 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, xdpf->len); if (rc) goto error_unmap_dma; + /* trigger the dma engine. ena_ring_tx_doorbell() * calls a memory barrier inside it. 
*/ @@ -384,8 +388,6 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, error_unmap_dma: ena_unmap_tx_buff(xdp_ring, tx_info); tx_info->xdpf = NULL; -error_drop_packet: - xdp_return_frame(xdpf); return rc; } @@ -393,8 +395,8 @@ static int ena_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct ena_adapter *adapter = netdev_priv(dev); - int qid, i, err, drops = 0; struct ena_ring *xdp_ring; + int qid, i, nxmit = 0; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; @@ -414,12 +416,9 @@ static int ena_xdp_xmit(struct net_device *dev, int n, spin_lock(&xdp_ring->xdp_tx_lock); for (i = 0; i < n; i++) { - err = ena_xdp_xmit_frame(xdp_ring, dev, frames[i], 0); - /* The descriptor is freed by ena_xdp_xmit_frame in case - * of an error. - */ - if (err) - drops++; + if (ena_xdp_xmit_frame(xdp_ring, dev, frames[i], 0)) + break; + nxmit++; } /* Ring doorbell to make device aware of the packets */ @@ -428,8 +427,13 @@ static int ena_xdp_xmit(struct net_device *dev, int n, spin_unlock(&xdp_ring->xdp_tx_lock); +#ifndef ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY + for (i = nxmit; unlikely(i < n); i++) + xdp_return_frame(frames[i]); + +#endif /* Return number of packets sent */ - return n - drops; + return nxmit; } static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) @@ -468,7 +472,9 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) /* The XDP queues are shared between XDP_TX and XDP_REDIRECT */ spin_lock(&xdp_ring->xdp_tx_lock); - ena_xdp_xmit_frame(xdp_ring, rx_ring->netdev, xdpf, XDP_XMIT_FLUSH); + if (ena_xdp_xmit_frame(xdp_ring, rx_ring->netdev, xdpf, + XDP_XMIT_FLUSH)) + xdp_return_frame(xdpf); spin_unlock(&xdp_ring->xdp_tx_lock); xdp_stat = &rx_ring->rx_stats.xdp_tx; @@ -1061,15 +1067,7 @@ static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) ena_free_rx_resources(adapter, i); } -static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page) -{ - dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); - - put_page(ena_page->page); -} - -static struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma) +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma) { struct page *page; @@ -1077,8 +1075,11 @@ static struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma * is running on. */ page = dev_alloc_page(); - if (!page) - return NULL; + if (!page) { + ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, + &rx_ring->syncp); + return ERR_PTR(-ENOSPC); + } /* To enable NIC-side port-mirroring, AKA SPAN port, * we make the buffer readable from the nic as well @@ -1086,132 +1087,21 @@ static struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma *dma = dma_map_page(rx_ring->dev, page, 0, ENA_PAGE_SIZE, DMA_BIDIRECTIONAL); if (unlikely(dma_mapping_error(rx_ring->dev, *dma))) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); __free_page(page); - return NULL; + return ERR_PTR(-EIO); } return page; } -/* Removes a page from page cache and allocate a new one instead. 
If an - * allocation of a new page fails, the cache entry isn't changed - */ -static void ena_replace_cache_page(struct ena_ring *rx_ring, - struct ena_page *ena_page) -{ - struct page *new_page; - dma_addr_t dma; - - new_page = ena_alloc_map_page(rx_ring, &dma); - - if (likely(new_page)) { - ena_put_unmap_cache_page(rx_ring, ena_page); - - ena_page->page = new_page; - ena_page->dma_addr = dma; - } -} - -/* Mark the cache page as used and return it. If the page belongs to a different - * NUMA than the current one, free the cache page and allocate another one - * instead. - */ -static struct page *ena_return_cache_page(struct ena_ring *rx_ring, - struct ena_page *ena_page, - dma_addr_t *dma, - int current_nid) -{ - /* Remove pages belonging to different node than current_nid from cache */ - if (unlikely(page_to_nid(ena_page->page) != current_nid)) { - ena_increase_stat(&rx_ring->rx_stats.lpc_wrong_numa, 1, &rx_ring->syncp); - ena_replace_cache_page(rx_ring, ena_page); - } - - /* Make sure no writes are pending for this page */ - dma_sync_single_for_device(rx_ring->dev, ena_page->dma_addr, - ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); - - /* Increase refcount to 2 so that the page is returned to the - * cache after being freed - */ - page_ref_inc(ena_page->page); - - *dma = ena_page->dma_addr; - - return ena_page->page; -} - -static struct page *ena_get_page(struct ena_ring *rx_ring, dma_addr_t *dma, - int current_nid, bool *is_lpc_page) -{ - struct ena_page_cache *page_cache = rx_ring->page_cache; - u32 head, cache_current_size; - struct ena_page *ena_page; - - /* Cache size of zero indicates disabled cache */ - if (!page_cache) { - *is_lpc_page = false; - return ena_alloc_map_page(rx_ring, dma); - } - - *is_lpc_page = true; - - cache_current_size = page_cache->current_size; - head = page_cache->head; - - ena_page = &page_cache->cache[head]; - /* Warm up phase. We fill the pages for the first time. 
The - * phase is done in the napi context to improve the chances we - * allocate on the correct NUMA node - */ - if (unlikely(cache_current_size < page_cache->max_size)) { - /* Check if oldest allocated page is free */ - if (ena_page->page && page_ref_count(ena_page->page) == 1) { - page_cache->head = (head + 1) % cache_current_size; - return ena_return_cache_page(rx_ring, ena_page, dma, current_nid); - } - - ena_page = &page_cache->cache[cache_current_size]; - - /* Add a new page to the cache */ - ena_page->page = ena_alloc_map_page(rx_ring, dma); - if (unlikely(!ena_page->page)) - return NULL; - - ena_page->dma_addr = *dma; - - /* Increase refcount to 2 so that the page is returned to the - * cache after being freed - */ - page_ref_inc(ena_page->page); - - page_cache->current_size++; - - ena_increase_stat(&rx_ring->rx_stats.lpc_warm_up, 1, &rx_ring->syncp); - - return ena_page->page; - } - - /* Next page is still in use, so we allocate outside the cache */ - if (unlikely(page_ref_count(ena_page->page) != 1)) { - ena_increase_stat(&rx_ring->rx_stats.lpc_full, 1, &rx_ring->syncp); - *is_lpc_page = false; - return ena_alloc_map_page(rx_ring, dma); - } - - page_cache->head = (head + 1) & (page_cache->max_size - 1); - - return ena_return_cache_page(rx_ring, ena_page, dma, current_nid); -} - -static int ena_alloc_rx_page(struct ena_ring *rx_ring, - struct ena_rx_buffer *rx_info, int current_nid) +static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) { int headroom = rx_ring->rx_headroom; struct ena_com_buf *ena_buf; struct page *page; - bool is_lpc_page; dma_addr_t dma; int tailroom; @@ -1223,12 +1113,9 @@ static int ena_alloc_rx_page(struct ena_ring *rx_ring, return 0; /* We handle DMA here */ - page = ena_get_page(rx_ring, &dma, current_nid, &is_lpc_page); - if (unlikely(!page)) { - ena_increase_stat(&rx_ring->rx_stats.page_alloc_fail, 1, - &rx_ring->syncp); - return -ENOMEM; - } + page = ena_lpc_get_page(rx_ring, &dma, &rx_info->is_lpc_page); + if (unlikely(IS_ERR(page))) + return PTR_ERR(page); netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "Allocate page %p, rx_info %p\n", page, rx_info); @@ -1236,7 +1123,7 @@ static int ena_alloc_rx_page(struct ena_ring *rx_ring, tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rx_info->page = page; - rx_info->is_lpc_page = is_lpc_page; + rx_info->dma_addr = dma; ena_buf = &rx_info->ena_buf; ena_buf->paddr = dma + headroom; ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; @@ -1247,13 +1134,13 @@ static int ena_alloc_rx_page(struct ena_ring *rx_ring, static void ena_unmap_rx_buff(struct ena_ring *rx_ring, struct ena_rx_buffer *rx_info) { - struct ena_com_buf *ena_buf = &rx_info->ena_buf; - /* LPC pages are unmapped at cache destruction */ - if (!rx_info->is_lpc_page) - dma_unmap_page(rx_ring->dev, ena_buf->paddr - rx_ring->rx_headroom, - ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); + if (rx_info->is_lpc_page) + return; + + dma_unmap_page(rx_ring->dev, rx_info->dma_addr, + ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL); } static void ena_free_rx_page(struct ena_ring *rx_ring, @@ -1276,13 +1163,9 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) { u16 next_to_use, req_id; - int current_nid; u32 i; int rc; - /* Prefer pages to be allocate on the same NUMA as the CPU */ - current_nid = numa_mem_id(); - next_to_use = rx_ring->next_to_use; for (i = 0; i < num; i++) { @@ -1292,7 +1175,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 
num) rx_info = &rx_ring->rx_buffer_info[req_id]; - rc = ena_alloc_rx_page(rx_ring, rx_info, current_nid); + rc = ena_alloc_rx_buffer(rx_ring, rx_info); if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "Failed to allocate buffer for rx queue %d\n", @@ -1363,52 +1246,12 @@ static void ena_refill_all_rx_bufs(struct ena_adapter *adapter) } } -/* Release all pages from the page cache */ -static void ena_free_ring_cache_pages(struct ena_adapter *adapter, int qid) -{ - struct ena_ring *rx_ring = &adapter->rx_ring[qid]; - struct ena_page_cache *page_cache; - int i; - - /* Page cache is disabled */ - if (!rx_ring->page_cache) - return; - - page_cache = rx_ring->page_cache; - - /* We check size value to make sure we don't - * free pages that weren't allocated. - */ - for (i = 0; i < page_cache->current_size; i++) { - struct ena_page *ena_page = &page_cache->cache[i]; - - /* The cache pages can be at most held by two entities */ - WARN_ON(!ena_page->page || page_ref_count(ena_page->page) > 2); - - dma_unmap_page(rx_ring->dev, ena_page->dma_addr, - ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); - - /* If the page is also in the rx buffer, then this operation - * would only decrease its reference count - */ - __free_page(ena_page->page); - } - - page_cache->head = page_cache->current_size = 0; -} - static void ena_free_all_rx_bufs(struct ena_adapter *adapter) { int i; - for (i = 0; i < adapter->num_io_queues; i++) { - /* The RX SQ's packet should be freed first, since they don't - * unmap pages that belong to the page_cache. - */ + for (i = 0; i < adapter->num_io_queues; i++) ena_free_rx_bufs(adapter, i); - ena_free_ring_cache_pages(adapter, i); - } } static void ena_unmap_tx_buff(struct ena_ring *tx_ring, @@ -1521,14 +1364,14 @@ static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, netif_err(ring->adapter, tx_done, ring->netdev, - "tx_info doesn't have valid %s", - is_xdp ? "xdp frame" : "skb"); + "tx_info doesn't have valid %s. qid %u req_id %u", + is_xdp ? 
"xdp frame" : "skb", ring->qid, req_id); else netif_err(ring->adapter, tx_done, ring->netdev, - "Invalid req_id: %hu\n", - req_id); + "Invalid req_id %u in qid %u\n", + req_id, ring->qid); ena_increase_stat(&ring->tx_stats.bad_req_id, 1, &ring->syncp); @@ -1540,13 +1383,11 @@ static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) { - struct ena_tx_buffer *tx_info = NULL; + struct ena_tx_buffer *tx_info; - if (likely(req_id < tx_ring->ring_size)) { - tx_info = &tx_ring->tx_buffer_info[req_id]; - if (likely(tx_info->skb)) - return 0; - } + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->skb)) + return 0; return handle_invalid_req_id(tx_ring, req_id, tx_info, false); } @@ -1554,13 +1395,11 @@ static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) #ifdef ENA_XDP_SUPPORT static int validate_xdp_req_id(struct ena_ring *xdp_ring, u16 req_id) { - struct ena_tx_buffer *tx_info = NULL; + struct ena_tx_buffer *tx_info; - if (likely(req_id < xdp_ring->ring_size)) { - tx_info = &xdp_ring->tx_buffer_info[req_id]; - if (likely(tx_info->xdpf)) - return 0; - } + tx_info = &xdp_ring->tx_buffer_info[req_id]; + if (likely(tx_info->xdpf)) + return 0; return handle_invalid_req_id(xdp_ring, req_id, tx_info, true); } @@ -1586,9 +1425,14 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id); - if (rc) + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, + false); break; + } + /* validate that the request id points to a valid skb */ rc = validate_tx_req_id(tx_ring, req_id); if (rc) break; @@ -1686,6 +1530,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, u16 *next_to_clean) { struct ena_rx_buffer *rx_info; + struct ena_adapter *adapter; u16 len, req_id, buf = 0; struct sk_buff *skb; void *page_addr; @@ -1701,8 +1546,13 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, rx_info = &rx_ring->rx_buffer_info[req_id]; if (unlikely(!rx_info->page)) { - netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, - "Page is NULL\n"); + adapter = rx_ring->adapter; + netif_err(adapter, rx_err, rx_ring->netdev, + "Page is NULL. 
qid %u req_id %u\n", rx_ring->qid, req_id); + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); + adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; + smp_mb__before_atomic(); + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); return NULL; } @@ -2208,9 +2058,14 @@ static int ena_clean_xdp_irq(struct ena_ring *xdp_ring, u32 budget) rc = ena_com_tx_comp_req_id_get(xdp_ring->ena_com_io_cq, &req_id); - if (rc) + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(xdp_ring, req_id, NULL, + true); break; + } + /* validate that the request id points to a valid skb */ rc = validate_xdp_req_id(xdp_ring, req_id); if (rc) break; @@ -3026,133 +2881,6 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter) } } -static void ena_free_ring_page_cache(struct ena_ring *rx_ring) -{ - if(!rx_ring->page_cache) - return; - - vfree(rx_ring->page_cache); - rx_ring->page_cache = NULL; -} - -static bool ena_is_lpc_supported(struct ena_adapter *adapter, - struct ena_ring *rx_ring, - bool error_print) -{ -#ifdef ENA_NETDEV_LOGS_WITHOUT_RV - void (*print_log)(const struct net_device *dev, const char *format, ...); -#else - int (*print_log)(const struct net_device *dev, const char *format, ...); -#endif - int channels_nr = adapter->num_io_queues + adapter->xdp_num_queues; - - print_log = (error_print) ? netdev_err : netdev_info; - - /* LPC is disabled below min number of channels */ - if (channels_nr < ENA_LPC_MIN_NUM_OF_CHANNELS) { - print_log(adapter->netdev, - "Local page cache is disabled for less than %d channels\n", - ENA_LPC_MIN_NUM_OF_CHANNELS); - - /* Disable LPC for such case. It can enabled again through - * ethtool private-flag. - */ - adapter->lpc_size = 0; - - return false; - } -#ifdef ENA_XDP_SUPPORT - - /* The driver doesn't support page caches under XDP */ - if (ena_xdp_present_ring(rx_ring)) { - print_log(adapter->netdev, - "Local page cache is disabled when using XDP\n"); - return false; - } -#endif /* ENA_XDP_SUPPORT */ - - return true; -} - -/* Calculate the size of the Local Page Cache. If LPC should be disabled, return - * a size of 0. 
- */ -static u32 ena_calculate_cache_size(struct ena_adapter *adapter, - struct ena_ring *rx_ring) -{ - u32 page_cache_size = adapter->lpc_size; - - /* LPC cache size of 0 means disabled cache */ - if (page_cache_size == 0) - return 0; - - if (!ena_is_lpc_supported(adapter, rx_ring, false)) - return 0; - - /* Clap the LPC size to its maximum value */ - if (page_cache_size > ENA_LPC_MAX_MULTIPLIER) { - netdev_info(adapter->netdev, - "Provided lpc_size %d is too large, reducing to %d (max)\n", - lpc_size, ENA_LPC_MAX_MULTIPLIER); - /* Override LPC size to avoid printing this message - * every up/down operation - */ - adapter->lpc_size = page_cache_size = lpc_size = ENA_LPC_MAX_MULTIPLIER; - } - - page_cache_size = page_cache_size * ENA_LPC_MULTIPLIER_UNIT; - page_cache_size = roundup_pow_of_two(page_cache_size); - - return page_cache_size; -} - -static int ena_create_page_caches(struct ena_adapter *adapter) -{ - struct ena_page_cache *cache; - u32 page_cache_size; - int i; - - for (i = 0; i < adapter->num_io_queues; i++) { - struct ena_ring *rx_ring = &adapter->rx_ring[i]; - - page_cache_size = ena_calculate_cache_size(adapter, rx_ring); - - if (!page_cache_size) - return 0; - - cache = vzalloc(sizeof(struct ena_page_cache) + - sizeof(struct ena_page) * page_cache_size); - if (!cache) - goto err_cache_alloc; - - cache->max_size = page_cache_size; - rx_ring->page_cache = cache; - } - - return 0; -err_cache_alloc: - netif_err(adapter, ifup, adapter->netdev, - "Failed to initialize local page caches (LPCs)\n"); - while (--i >= 0) { - struct ena_ring *rx_ring = &adapter->rx_ring[i]; - - ena_free_ring_page_cache(rx_ring); - } - - return -ENOMEM; -} - -static void ena_free_page_caches(struct ena_adapter *adapter) -{ - int i; - - for (i = 0; i < adapter->num_io_queues; i++) { - struct ena_ring *rx_ring = &adapter->rx_ring[i]; - - ena_free_ring_page_cache(rx_ring); - } -} - static int ena_up(struct ena_adapter *adapter) { int io_queue_count, rc, i; @@ -3262,6 +2990,7 @@ static void ena_down(struct ena_adapter *adapter) ena_free_all_tx_bufs(adapter); ena_free_all_rx_bufs(adapter); + ena_free_all_cache_pages(adapter); ena_free_page_caches(adapter); ena_free_all_io_tx_resources(adapter); ena_free_all_io_rx_resources(adapter); @@ -3353,13 +3082,7 @@ int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled) if (enabled && !ena_is_lpc_supported(adapter, adapter->rx_ring, true)) return -EOPNOTSUPP; - /* Prevent a case in which disabling LPC on startup, prevents it from - * being enabled afterwards. - */ - if (!lpc_size) - lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; - - adapter->lpc_size = enabled ? lpc_size : 0; + adapter->used_lpc_size = enabled ? adapter->configured_lpc_size : 0; /* rtnl lock is already obtained in dev_ioctl() layer, so it's safe to * re-initialize IO resources. 
@@ -3880,11 +3603,11 @@ static void ena_config_debug_area(struct ena_adapter *adapter) int ena_update_hw_stats(struct ena_adapter *adapter) { - int rc = 0; + int rc; rc = ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); if (rc) { - dev_info_once(&adapter->pdev->dev, "Failed to get ENI stats\n"); + netdev_err(adapter->netdev, "Failed to get ENI stats\n"); return rc; } @@ -4101,14 +3824,16 @@ static int ena_device_validate_params(struct ena_adapter *adapter, return 0; } -static void set_default_llq_configurations(struct ena_llq_configurations *llq_config, - struct ena_admin_feature_llq_desc *llq) +static void set_default_llq_configurations(struct ena_adapter *adapter, + struct ena_llq_configurations *llq_config, + struct ena_admin_feature_llq_desc *llq) { llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && - force_large_llq_header) { + adapter->large_llq_header) { llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; llq_config->llq_ring_entry_size_value = 256; } else { @@ -4133,6 +3858,13 @@ static int ena_set_queues_placement_policy(struct pci_dev *pdev, return 0; } + if (!ena_dev->mem_bar) { + netdev_err(ena_dev->net_device, + "LLQ is advertised as supported but device doesn't expose mem bar\n"); + ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; + return 0; + } + rc = ena_com_config_dev_mode(ena_dev, llq, llq_default_configurations); if (unlikely(rc)) { dev_err(&pdev->dev, @@ -4148,15 +3880,8 @@ static int ena_map_llq_mem_bar(struct pci_dev *pdev, struct ena_com_dev *ena_dev { bool has_mem_bar = !!(bars & BIT(ENA_MEM_BAR)); - if (!has_mem_bar) { - if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - dev_err(&pdev->dev, - "ENA device does not expose LLQ bar. Fallback to host mode policy.\n"); - ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; - } - + if (!has_mem_bar) return 0; - } ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev, pci_resource_start(pdev, ENA_MEM_BAR), @@ -4168,11 +3893,13 @@ static int ena_map_llq_mem_bar(struct pci_dev *pdev, struct ena_com_dev *ena_dev return 0; } -static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, +static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, struct ena_com_dev_get_features_ctx *get_feat_ctx, bool *wd_state) { + struct ena_com_dev *ena_dev = adapter->ena_dev; struct ena_llq_configurations llq_config; + netdev_features_t prev_netdev_features; struct device *dev = &pdev->dev; bool readless_supported; u32 aenq_groups; @@ -4271,7 +3998,7 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, *wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE)); - set_default_llq_configurations(&llq_config, &get_feat_ctx->llq); + set_default_llq_configurations(adapter, &llq_config, &get_feat_ctx->llq); rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, &llq_config); @@ -4280,6 +4007,12 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev, goto err_admin_init; } + ena_calc_io_queue_size(adapter, get_feat_ctx); + + /* Turned on features shouldn't change due to reset. 
*/ + prev_netdev_features = adapter->netdev->features; + ena_set_dev_offloads(get_feat_ctx, adapter->netdev); + adapter->netdev->features = prev_netdev_features; return 0; err_admin_init: @@ -4379,7 +4112,7 @@ static int ena_restore_device(struct ena_adapter *adapter) int rc; set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); - rc = ena_device_init(ena_dev, adapter->pdev, &get_feat_ctx, &wd_state); + rc = ena_device_init(adapter, adapter->pdev, &get_feat_ctx, &wd_state); if (rc) { dev_err(&pdev->dev, "Can not initialize device\n"); goto err; @@ -4420,10 +4153,6 @@ static int ena_restore_device(struct ena_adapter *adapter) mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); adapter->last_keep_alive_jiffies = jiffies; - dev_err(&pdev->dev, - "Device reset completed successfully, Driver info: %s\n", - version); - return rc; err_sysfs_terminate: ena_sysfs_terminate(&pdev->dev); @@ -4455,6 +4184,10 @@ static void ena_fw_reset_device(struct work_struct *work) if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { ena_destroy_device(adapter, false); ena_restore_device(adapter); + + dev_err(&adapter->pdev->dev, + "Device reset completed successfully, Driver info: %s\n", + version); } rtnl_unlock(); @@ -4759,6 +4492,17 @@ static void ena_timer_service(unsigned long data) ena_update_host_info(host_info, adapter->netdev); if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + /* We don't destroy driver resources if we're not able to + * communicate with the device. Failure in validating the + * version implies unresponsive device. + */ + if (ena_com_validate_version(adapter->ena_dev) == -ETIME) { + netif_err(adapter, drv, adapter->netdev, + "FW isn't responsive, skipping reset routine\n"); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); + return; + } + netif_err(adapter, drv, adapter->netdev, "Trigger reset is on\n"); ena_dump_stats_to_dmesg(adapter); @@ -4800,12 +4544,8 @@ static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num); max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num); max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); - /* 1 IRQ for for mgmnt and 1 IRQs for each IO direction */ + /* 1 IRQ for mgmnt and 1 IRQs for each IO direction */ max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1); - if (unlikely(!max_num_io_queues)) { - dev_err(&pdev->dev, "The device doesn't have io queues\n"); - return -EFAULT; - } return max_num_io_queues; } @@ -4904,7 +4644,7 @@ static int ena_rss_init_default(struct ena_adapter *adapter) val = ethtool_rxfh_indir_default(i, adapter->num_io_queues); rc = ena_com_indirect_table_fill_entry(ena_dev, i, ENA_IO_RXQ_IDX(val)); - if (unlikely(rc && (rc != -EOPNOTSUPP))) { + if (unlikely(rc)) { dev_err(dev, "Cannot fill indirect table\n"); goto err_fill_indir; } @@ -4940,17 +4680,28 @@ static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) } -static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) +static void ena_calc_io_queue_size(struct ena_adapter *adapter, + struct ena_com_dev_get_features_ctx *get_feat_ctx) { - struct ena_admin_feature_llq_desc *llq = &ctx->get_feat_ctx->llq; - struct ena_com_dev *ena_dev = ctx->ena_dev; + struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; + struct ena_com_dev *ena_dev = adapter->ena_dev; u32 tx_queue_size = ENA_DEFAULT_RING_SIZE; + bool tx_configured, rx_configured; u32 max_tx_queue_size; u32 max_rx_queue_size; 
+ /* If this function is called after driver load, the ring sizes have + * already been configured. Take it into account when recalculating ring + * size. + */ + tx_configured = !!adapter->tx_ring[0].ring_size; + rx_configured = !!adapter->rx_ring[0].ring_size; + tx_queue_size = tx_configured ? adapter->tx_ring[0].ring_size : tx_queue_size; + rx_queue_size = rx_configured ? adapter->rx_ring[0].ring_size : rx_queue_size; + if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { struct ena_admin_queue_ext_feature_fields *max_queue_ext = - &ctx->get_feat_ctx->max_queue_ext.max_queue_ext; + &get_feat_ctx->max_queue_ext.max_queue_ext; max_rx_queue_size = min_t(u32, max_queue_ext->max_rx_cq_depth, max_queue_ext->max_rx_sq_depth); max_tx_queue_size = max_queue_ext->max_tx_cq_depth; @@ -4962,13 +4713,13 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) max_tx_queue_size = min_t(u32, max_tx_queue_size, max_queue_ext->max_tx_sq_depth); - ctx->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, max_queue_ext->max_per_packet_tx_descs); - ctx->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, max_queue_ext->max_per_packet_rx_descs); } else { struct ena_admin_queue_feature_desc *max_queues = - &ctx->get_feat_ctx->max_queues; + &get_feat_ctx->max_queues; max_rx_queue_size = min_t(u32, max_queues->max_cq_depth, max_queues->max_sq_depth); max_tx_queue_size = max_queues->max_cq_depth; @@ -4980,9 +4731,9 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) max_tx_queue_size = min_t(u32, max_tx_queue_size, max_queues->max_sq_depth); - ctx->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, max_queues->max_packet_tx_descs); - ctx->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, + adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, max_queues->max_packet_rx_descs); } @@ -4993,14 +4744,16 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) * and therefore divide the queue size by 2, leaving the amount * of memory used by the queues unchanged. 
*/ - if (force_large_llq_header) { + if (adapter->large_llq_header) { if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { max_tx_queue_size /= 2; - dev_info(&ctx->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n", + dev_info(&adapter->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n", max_tx_queue_size); } else { - dev_err(&ctx->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + dev_err(&adapter->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + + adapter->large_llq_header = false; } } @@ -5012,12 +4765,10 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) tx_queue_size = rounddown_pow_of_two(tx_queue_size); rx_queue_size = rounddown_pow_of_two(rx_queue_size); - ctx->max_tx_queue_size = max_tx_queue_size; - ctx->max_rx_queue_size = max_rx_queue_size; - ctx->tx_queue_size = tx_queue_size; - ctx->rx_queue_size = rx_queue_size; - - return 0; + adapter->max_tx_ring_size = max_tx_queue_size; + adapter->max_rx_ring_size = max_rx_queue_size; + adapter->requested_tx_ring_size = tx_queue_size; + adapter->requested_rx_ring_size = rx_queue_size; } /* ena_probe - Device Initialization Routine @@ -5032,7 +4783,6 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) */ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - struct ena_calc_queue_size_ctx calc_queue_ctx = {}; struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = NULL; struct ena_adapter *adapter; @@ -5120,24 +4870,22 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, adapter); - rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state); + adapter->large_llq_header = !!force_large_llq_header; + + rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); if (rc) { - dev_err(&pdev->dev, "ENA device init failed\n"); - if (rc == -ETIME) - rc = -EPROBE_DEFER; + dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n"); goto err_netdev_destroy; } - rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); + rc = ena_device_init(adapter, pdev, &get_feat_ctx, &wd_state); if (rc) { - dev_err(&pdev->dev, "ENA llq bar mapping failed\n"); - goto err_device_destroy; + dev_err(&pdev->dev, "ENA device init failed\n"); + if (rc == -ETIME) + rc = -EPROBE_DEFER; + goto err_netdev_destroy; } - calc_queue_ctx.ena_dev = ena_dev; - calc_queue_ctx.get_feat_ctx = &get_feat_ctx; - calc_queue_ctx.pdev = pdev; - /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. 
* Updated during device initialization with the real granularity */ @@ -5145,8 +4893,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_dev->intr_moder_rx_interval = ENA_INTR_INITIAL_RX_INTERVAL_USECS; ena_dev->intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; max_num_io_queues = ena_calc_max_io_queue_num(pdev, ena_dev, &get_feat_ctx); - rc = ena_calc_io_queue_size(&calc_queue_ctx); - if (rc || !max_num_io_queues) { + if (unlikely(!max_num_io_queues)) { rc = -EFAULT; goto err_device_destroy; } @@ -5155,16 +4902,14 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->reset_reason = ENA_REGS_RESET_NORMAL; - adapter->requested_tx_ring_size = calc_queue_ctx.tx_queue_size; - adapter->requested_rx_ring_size = calc_queue_ctx.rx_queue_size; - adapter->max_tx_ring_size = calc_queue_ctx.max_tx_queue_size; - adapter->max_rx_ring_size = calc_queue_ctx.max_rx_queue_size; - adapter->max_tx_sgl_size = calc_queue_ctx.max_tx_sgl_size; - adapter->max_rx_sgl_size = calc_queue_ctx.max_rx_sgl_size; - adapter->num_io_queues = clamp_val(num_io_queues, ENA_MIN_NUM_IO_QUEUES, max_num_io_queues); - adapter->lpc_size = lpc_size; + adapter->used_lpc_size = lpc_size; + /* When LPC is enabled after driver load, the configured_lpc_size is + * used. Leaving it as 0 wouldn't change the LPC state, so we set it + * to a different value. + */ + adapter->configured_lpc_size = lpc_size ? : ENA_LPC_DEFAULT_MULTIPLIER; adapter->max_num_io_queues = max_num_io_queues; adapter->last_monitored_tx_qid = 0; @@ -5187,6 +4932,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) "Failed to query interrupt moderation feature\n"); goto err_device_destroy; } + ena_init_io_rings(adapter, 0, adapter->xdp_num_queues + @@ -5224,11 +4970,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_config_debug_area(adapter); - if (!ena_update_hw_stats(adapter)) - adapter->eni_stats_supported = true; - else - adapter->eni_stats_supported = false; - memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); netif_carrier_off(netdev); @@ -5329,6 +5070,7 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) rtnl_lock(); /* lock released inside the below if-else block */ adapter->reset_reason = ENA_REGS_RESET_SHUTDOWN; ena_destroy_device(adapter, true); + if (shutdown) { netif_device_detach(netdev); dev_close(netdev); @@ -5554,6 +5296,16 @@ static void ena_notification(void *adapter_data, } } +static void ena_refresh_fw_capabilites(void *adapter_data, + struct ena_admin_aenq_entry *aenq_e) +{ + struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; + + netdev_info(adapter->netdev, "Received request to refresh capabilities\n"); + + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); +} + /* This handler will called for unknown event group or unimplemented handlers*/ static void unimplemented_aenq_handler(void *data, struct ena_admin_aenq_entry *aenq_e) @@ -5569,6 +5321,7 @@ static struct ena_aenq_handlers aenq_handlers = { [ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change, [ENA_ADMIN_NOTIFICATION] = ena_notification, [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd, + [ENA_ADMIN_REFRESH_CAPABILITIES] = ena_refresh_fw_capabilites, }, .unimplemented_handler = unimplemented_aenq_handler }; diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 1061a0a3d1499..48cb953e6d801 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ 
-25,7 +25,7 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 5 +#define DRV_MODULE_GEN_MINOR 6 #define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" @@ -140,6 +140,8 @@ #define ENA_IS_XDP_INDEX(adapter, index) (false) #endif /* ENA_XDP_SUPPORT */ +struct ena_page_cache; + struct ena_irq { irq_handler_t handler; void *data; @@ -150,8 +152,8 @@ struct ena_irq { }; struct ena_napi { - bool first_interrupt ____cacheline_aligned; - bool interrupts_masked; + u8 first_interrupt ____cacheline_aligned; + u8 interrupts_masked; struct napi_struct napi; struct ena_ring *tx_ring; struct ena_ring *rx_ring; @@ -162,18 +164,6 @@ struct ena_napi { struct dim dim; }; -struct ena_calc_queue_size_ctx { - struct ena_com_dev_get_features_ctx *get_feat_ctx; - struct ena_com_dev *ena_dev; - struct pci_dev *pdev; - u32 tx_queue_size; - u32 rx_queue_size; - u32 max_tx_queue_size; - u32 max_rx_queue_size; - u16 max_tx_sgl_size; - u16 max_rx_sgl_size; -}; - struct ena_tx_buffer { struct sk_buff *skb; /* num of ena desc for this specific skb @@ -211,9 +201,10 @@ struct ena_tx_buffer { struct ena_rx_buffer { struct sk_buff *skb; struct page *page; + dma_addr_t dma_addr; u32 page_offset; - bool is_lpc_page; struct ena_com_buf ena_buf; + bool is_lpc_page; } ____cacheline_aligned; struct ena_stats_tx { @@ -267,29 +258,6 @@ struct ena_stats_rx { u64 lpc_wrong_numa; }; -/* LPC definitions */ -#define ENA_LPC_DEFAULT_MULTIPLIER 2 -#define ENA_LPC_MAX_MULTIPLIER 32 -#define ENA_LPC_MULTIPLIER_UNIT 1024 -#define ENA_LPC_MIN_NUM_OF_CHANNELS 16 - -/* Store DMA address along with the page */ -struct ena_page { - struct page *page; - dma_addr_t dma_addr; -}; - -struct ena_page_cache { - /* How many pages are produced */ - u32 head; - /* How many of the entries were initialized */ - u32 current_size; - /* Maximum number of pages the cache can hold */ - u32 max_size; - - struct ena_page cache[0]; -} ____cacheline_aligned; - struct ena_ring { /* Holds the empty requests for TX/RX * out of order completions @@ -403,10 +371,10 @@ struct ena_adapter { u32 num_io_queues; u32 max_num_io_queues; - - /* Local page cache size */ - u32 lpc_size; - + /* Local page cache size when it's enabled */ + u32 configured_lpc_size; + /* Current Local page cache size */ + u32 used_lpc_size; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) struct msix_entry *msix_entries; #endif @@ -422,6 +390,8 @@ struct ena_adapter { u32 msg_enable; + bool large_llq_header; + u16 max_tx_sgl_size; u16 max_rx_sgl_size; @@ -457,7 +427,6 @@ struct ena_adapter { struct u64_stats_sync syncp; struct ena_stats_dev dev_stats; struct ena_admin_eni_stats eni_stats; - bool eni_stats_supported; /* last queue index that was checked for uncompleted tx packets */ u32 last_monitored_tx_qid; @@ -565,11 +534,6 @@ enum ena_xdp_errors_t { ENA_XDP_NO_ENOUGH_QUEUES, }; -static inline bool ena_xdp_queues_present(struct ena_adapter *adapter) -{ - return adapter->xdp_first_ring != 0; -} - static inline bool ena_xdp_present(struct ena_adapter *adapter) { return !!adapter->xdp_bpf_prog; @@ -599,4 +563,12 @@ static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) } #endif /* ENA_XDP_SUPPORT */ +/* Allocate a page and DMA map it + * @rx_ring: The IO queue pair which requests the allocation + * + * @return: the address of the allocated page, with its DMA address stored + * in @dma, on success, or NULL on failure + */ +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma); + #endif /* !(ENA_H) */ diff --git 
a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 824128cd8dc61..c82567e4529db 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -831,4 +831,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #endif /* defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) +#define ethtool_sprintf(data, fmt, args...) \ + do { \ + snprintf(*data, ETH_GSTRING_LEN, fmt, ##args); \ + (*data) += ETH_GSTRING_LEN; \ + } while(0) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 13, 0) +#define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY +#endif + #endif /* _KCOMPAT_H_ */ From 344bdf2a70c95b242f0495f62df9c3ef8dfd8e7d Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 11 Jan 2022 07:44:35 +0000 Subject: [PATCH 330/737] lustre: update to AmazonFSxLustreClient v2.10.8-10 Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/config.h | 8 +- .../libcfs/include/libcfs/linux/linux-time.h | 2 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 4 +- .../lustrefsx/lustre/include/obd_class.h | 10 +- .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 23 +- .../staging/lustrefsx/lustre/llite/super25.c | 9 +- .../lustrefsx/lustre/llite/xattr_security.c | 8 +- .../lustrefsx/lustre/obdclass/genops.c | 415 +++++------------- .../lustrefsx/lustre/obdclass/obd_config.c | 200 ++++++--- .../lustrefsx/lustre/obdclass/obd_mount.c | 7 +- .../lustre/ptlrpc/gss/gss_internal.h | 2 +- .../lustrefsx/lustre/target/tgt_grant.c | 15 +- 12 files changed, 296 insertions(+), 407 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index cea872bd120de..fce8b057480b6 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -981,7 +981,7 @@ #define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.10.8" +#define LUSTRE_VERSION_STRING "2.10.8-10" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -1014,7 +1014,7 @@ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Lustre 2.10.8" +#define PACKAGE_STRING "Lustre 2.10.8-10" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -1023,7 +1023,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.10.8" +#define PACKAGE_VERSION "2.10.8-10" /* name of parallel fsck program */ #define PFSCK "fsck" @@ -1067,7 +1067,7 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.10.8" +#define VERSION "2.10.8-10" /* zfs fix version */ /* #undef ZFS_FIX */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index 07dd2e05a6083..a805ff9aedf84 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -259,7 +259,7 @@ static inline cfs_time_t cfs_time_current(void) static inline time_t cfs_time_current_sec(void) { - return get_seconds(); + return ktime_get_real_seconds(); } static inline cfs_duration_t cfs_time_seconds(int seconds) diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index 707cb1510455d..4b896a52d3bb4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3375,8 +3375,8 @@ kiblnd_connd (void *arg) } while (reconn < KIB_RECONN_BREAK) { - if (kiblnd_data.kib_reconn_sec != get_seconds()) { - kiblnd_data.kib_reconn_sec = get_seconds(); + if (kiblnd_data.kib_reconn_sec != ktime_get_real_seconds()) { + kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); list_splice_init(&kiblnd_data.kib_reconn_wait, &kiblnd_data.kib_reconn_list); } diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index 5223eedaae96c..da40a4e38f91b 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -65,13 +65,9 @@ int class_register_type(struct obd_ops *, struct md_ops *, bool enable_proc, const char *nm, struct lu_device_type *ldt); int class_unregister_type(const char *nm); -struct obd_device *class_newdev(const char *type_name, const char *name, - const char *uuid); -int class_register_device(struct obd_device *obd); -void class_unregister_device(struct obd_device *obd); -void class_free_dev(struct obd_device *obd); +struct obd_device *class_newdev(const char *type_name, const char *name); +void class_release_dev(struct obd_device *obd); -struct obd_device *class_dev_by_str(const char *str); int class_name2dev(const char *name); struct obd_device *class_name2obd(const char *name); int class_uuid2dev(struct obd_uuid *uuid); @@ -319,8 +315,6 @@ struct obd_export *class_export_get(struct obd_export *exp); void class_export_put(struct obd_export *exp); struct obd_export *class_new_export(struct obd_device *obddev, struct obd_uuid *cluuid); -struct obd_export *class_new_export_self(struct obd_device *obd, - struct obd_uuid *uuid); void class_unlink_export(struct obd_export *exp); struct obd_import *class_import_get(struct obd_import *); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c index 9b84a0d0cd21e..33d871da4bdf6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -950,6 +950,7 @@ int target_handle_connect(struct ptlrpc_request *req) * reconnect case */ struct lustre_handle conn; struct lustre_handle *tmp; + struct obd_uuid tgtuuid; struct obd_uuid cluuid; char *str; int rc = 0; @@ -958,6 +959,7 @@ int target_handle_connect(struct 
ptlrpc_request *req) bool mds_conn = false, lw_client = false, initial_conn = false; bool mds_mds_conn = false; bool new_mds_mds_conn = false; + bool target_referenced = false; struct obd_connect_data *data, *tmpdata; int size, tmpsize; lnet_nid_t *client_nid = NULL; @@ -971,7 +973,11 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } - target = class_dev_by_str(str); + obd_str2uuid(&tgtuuid, str); + target = class_uuid2obd(&tgtuuid); + if (!target) + target = class_name2obd(str); + if (!target) { deuuidify(str, NULL, &target_start, &target_len); LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " @@ -983,9 +989,6 @@ int target_handle_connect(struct ptlrpc_request *req) } spin_lock(&target->obd_dev_lock); - - target->obd_conn_inprogress++; - if (target->obd_stopping || !target->obd_set_up) { spin_unlock(&target->obd_dev_lock); @@ -1007,6 +1010,13 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EAGAIN); } + /* Make sure the target isn't cleaned up while we're here. Yes, + * there's still a race between the above check and our incref here. + * Really, class_uuid2obd should take the ref. */ + class_incref(target, __func__, current); + target_referenced = true; + + target->obd_conn_inprogress++; spin_unlock(&target->obd_dev_lock); str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); @@ -1433,11 +1443,12 @@ int target_handle_connect(struct ptlrpc_request *req) class_export_put(export); } - if (target != NULL) { + if (target_referenced == true && target != NULL) { spin_lock(&target->obd_dev_lock); target->obd_conn_inprogress--; spin_unlock(&target->obd_dev_lock); - class_decref(target, "find", current); + + class_decref(target, __func__, current); } req->rq_status = rc; RETURN(rc); diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c index 0ce267546688c..7118cce98561b 100644 --- a/drivers/staging/lustrefsx/lustre/llite/super25.c +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -104,6 +104,7 @@ static int __init lustre_init(void) struct lnet_process_id lnet_id; struct timespec64 ts; int i, rc, seed[2]; + unsigned long lustre_inode_cache_flags; CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1); @@ -113,9 +114,15 @@ static int __init lustre_init(void) CDEBUG(D_INFO, "Lustre client module (%p).\n", &lustre_super_operations); + lustre_inode_cache_flags = SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD; +#ifdef SLAB_ACCOUNT + lustre_inode_cache_flags |= SLAB_ACCOUNT; +#endif + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", sizeof(struct ll_inode_info), - 0, SLAB_HWCACHE_ALIGN, NULL); + 0, lustre_inode_cache_flags, NULL); if (ll_inode_cachep == NULL) GOTO(out_cache, rc = -ENOMEM); diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c index 5a79702318717..8f2e2e5cc1fa0 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -76,7 +76,13 @@ int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, rc = security_dentry_init_security(dentry, mode, name, secctx, secctx_size); - if (rc == -EOPNOTSUPP) + /* Usually, security_dentry_init_security() returns -EOPNOTSUPP when + * SELinux is disabled. + * But on some kernels (e.g. rhel 8.5) it returns 0 when SELinux is + * disabled, and in this case the security context is empty. 
+ */ + if (rc == -EOPNOTSUPP || (rc == 0 && *secctx_size == 0)) + /* do nothing */ return 0; if (rc < 0) return rc; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c index ef84bfe45c930..2c8e4db905d01 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/genops.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -292,20 +291,22 @@ EXPORT_SYMBOL(class_unregister_type); /** * Create a new obd device. * - * Allocate the new obd_device and initialize it. + * Find an empty slot in ::obd_devs[], create a new obd device in it. * * \param[in] type_name obd device type string. * \param[in] name obd device name. - * \param[in] uuid obd device UUID * - * \retval newdev pointer to created obd_device - * \retval ERR_PTR(errno) on error + * \retval NULL if create fails, otherwise return the obd device + * pointer created. */ -struct obd_device *class_newdev(const char *type_name, const char *name, - const char *uuid) +struct obd_device *class_newdev(const char *type_name, const char *name) { + struct obd_device *result = NULL; struct obd_device *newdev; struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + bool retried = false; ENTRY; if (strlen(name) >= MAX_OBD_NAME) { @@ -320,197 +321,106 @@ struct obd_device *class_newdev(const char *type_name, const char *name, } newdev = obd_device_alloc(); - if (newdev == NULL) { - class_put_type(type); - RETURN(ERR_PTR(-ENOMEM)); - } + if (newdev == NULL) + GOTO(out_type, result = ERR_PTR(-ENOMEM)); + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); - strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); - newdev->obd_type = type; - newdev->obd_minor = -1; - - rwlock_init(&newdev->obd_pool_lock); - newdev->obd_pool_limit = 0; - newdev->obd_pool_slv = 0; - - INIT_LIST_HEAD(&newdev->obd_exports); - INIT_LIST_HEAD(&newdev->obd_unlinked_exports); - INIT_LIST_HEAD(&newdev->obd_delayed_exports); - INIT_LIST_HEAD(&newdev->obd_exports_timed); - INIT_LIST_HEAD(&newdev->obd_nid_stats); - spin_lock_init(&newdev->obd_nid_lock); - spin_lock_init(&newdev->obd_dev_lock); - mutex_init(&newdev->obd_dev_mutex); - spin_lock_init(&newdev->obd_osfs_lock); - /* newdev->obd_osfs_age must be set to a value in the distant - * past to guarantee a fresh statfs is fetched on mount. */ - newdev->obd_osfs_age = cfs_time_shift_64(-1000); - - /* XXX belongs in setup not attach */ - init_rwsem(&newdev->obd_observer_link_sem); - /* recovery data */ - spin_lock_init(&newdev->obd_recovery_task_lock); - init_waitqueue_head(&newdev->obd_next_transno_waitq); - init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); - INIT_LIST_HEAD(&newdev->obd_req_replay_queue); - INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); - INIT_LIST_HEAD(&newdev->obd_final_req_queue); - INIT_LIST_HEAD(&newdev->obd_evict_list); - INIT_LIST_HEAD(&newdev->obd_lwp_list); - - llog_group_init(&newdev->obd_olg); - /* Detach drops this */ - atomic_set(&newdev->obd_refcount, 1); - lu_ref_init(&newdev->obd_reference); - lu_ref_add(&newdev->obd_reference, "newdev", newdev); - - newdev->obd_conn_inprogress = 0; - - strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); - - CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", - newdev->obd_name, newdev); - - return newdev; -} -/** - * Free obd device. 
- * - * \param[in] obd obd_device to be freed - * - * \retval none - */ -void class_free_dev(struct obd_device *obd) -{ - struct obd_type *obd_type = obd->obd_type; + again: + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " - "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, - "obd %p != obd_devs[%d] %p\n", - obd, obd->obd_minor, obd_devs[obd->obd_minor]); - LASSERTF(atomic_read(&obd->obd_refcount) == 0, - "obd_refcount should be 0, not %d\n", - atomic_read(&obd->obd_refcount)); - LASSERT(obd_type != NULL); + if (obd && (strcmp(name, obd->obd_name) == 0)) { - CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", - obd->obd_name, obd->obd_type->typ_name); + if (!retried) { + write_unlock(&obd_dev_lock); - CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", - obd->obd_name, obd->obd_uuid.uuid); - if (obd->obd_stopping) { - int err; + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". + */ + obd_zombie_barrier(); + retried = true; + goto again; + } - /* If we're not stopping, we were never set up */ - err = obd_cleanup(obd); - if (err) - CERROR("Cleanup %s returned %d\n", - obd->obd_name, err); - } + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0]='\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); - obd_device_free(obd); + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + GOTO(out, result = ERR_PTR(-EOVERFLOW)); + } - class_put_type(obd_type); -} + if (IS_ERR(result)) + GOTO(out, result); -/** - * Unregister obd device. - * - * Free slot in obd_dev[] used by \a obd. - * - * \param[in] new_obd obd_device to be unregistered - * - * \retval none - */ -void class_unregister_device(struct obd_device *obd) -{ - write_lock(&obd_dev_lock); - if (obd->obd_minor >= 0) { - LASSERT(obd_devs[obd->obd_minor] == obd); - obd_devs[obd->obd_minor] = NULL; - obd->obd_minor = -1; - } - write_unlock(&obd_dev_lock); + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + RETURN(result); +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; } -/** - * Register obd device. - * - * Find free slot in obd_devs[], fills it with \a new_obd. 
- * - * \param[in] new_obd obd_device to be registered - * - * \retval 0 success - * \retval -EEXIST device with this name is registered - * \retval -EOVERFLOW obd_devs[] is full - */ -int class_register_device(struct obd_device *new_obd) +void class_release_dev(struct obd_device *obd) { - int ret = 0; - int i; - int new_obd_minor = 0; - bool minor_assign = false; - bool retried = false; + struct obd_type *obd_type = obd->obd_type; -again: - write_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd != NULL && - (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { - - if (!retried) { - write_unlock(&obd_dev_lock); - - /* the obd_device could be waited to be - * destroyed by the "obd_zombie_impexp_thread". - */ - obd_zombie_barrier(); - retried = true; - goto again; - } - - CERROR("%s: already exists, won't add\n", - obd->obd_name); - /* in case we found a free slot before duplicate */ - minor_assign = false; - ret = -EEXIST; - break; - } - if (!minor_assign && obd == NULL) { - new_obd_minor = i; - minor_assign = true; - } - } + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); - if (minor_assign) { - new_obd->obd_minor = new_obd_minor; - LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " - "%p\n", new_obd_minor, obd_devs[new_obd_minor]); - obd_devs[new_obd_minor] = new_obd; - } else { - if (ret == 0) { - ret = -EOVERFLOW; - CERROR("%s: all %u/%u devices used, increase " - "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, - i, class_devno_max(), ret); - } - } + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; write_unlock(&obd_dev_lock); + obd_device_free(obd); - RETURN(ret); + class_put_type(obd_type); } -static int class_name2dev_nolock(const char *name) +int class_name2dev(const char *name) { int i; if (!name) return -1; + read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); @@ -519,29 +429,16 @@ static int class_name2dev_nolock(const char *name) out any references */ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); if (obd->obd_attached) { + read_unlock(&obd_dev_lock); return i; } break; } } - - return -1; -} - -int class_name2dev(const char *name) -{ - int i; - - if (!name) - return -1; - - read_lock(&obd_dev_lock); - i = class_name2dev_nolock(name); read_unlock(&obd_dev_lock); - return i; + return -1; } -EXPORT_SYMBOL(class_name2dev); struct obd_device *class_name2obd(const char *name) { @@ -553,33 +450,24 @@ struct obd_device *class_name2obd(const char *name) } EXPORT_SYMBOL(class_name2obd); -int class_uuid2dev_nolock(struct obd_uuid *uuid) +int class_uuid2dev(struct obd_uuid *uuid) { int i; + read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + read_unlock(&obd_dev_lock); return i; } } - - return -1; -} - -int class_uuid2dev(struct obd_uuid *uuid) -{ - int i; - - read_lock(&obd_dev_lock); - i = class_uuid2dev_nolock(uuid); read_unlock(&obd_dev_lock); - return i; + return -1; } -EXPORT_SYMBOL(class_uuid2dev); struct obd_device 
*class_uuid2obd(struct obd_uuid *uuid) { @@ -618,40 +506,6 @@ struct obd_device *class_num2obd(int num) return obd; } -/** - * Find obd in obd_dev[] by name or uuid. - * - * Increment obd's refcount if found. - * - * \param[in] str obd name or uuid - * - * \retval NULL if not found - * \retval target pointer to found obd_device - */ -struct obd_device *class_dev_by_str(const char *str) -{ - struct obd_device *target = NULL; - struct obd_uuid tgtuuid; - int rc; - - obd_str2uuid(&tgtuuid, str); - - read_lock(&obd_dev_lock); - rc = class_uuid2dev_nolock(&tgtuuid); - if (rc < 0) - rc = class_name2dev_nolock(str); - - if (rc >= 0) - target = class_num2obd(rc); - - if (target != NULL) - class_incref(target, "find", current); - read_unlock(&obd_dev_lock); - - RETURN(target); -} -EXPORT_SYMBOL(class_dev_by_str); - /** * Get obd devices count. Device in any * state are counted @@ -944,10 +798,7 @@ static void class_export_destroy(struct obd_export *exp) LASSERT(list_empty(&exp->exp_req_replay_queue)); LASSERT(list_empty(&exp->exp_hp_rpcs)); obd_destroy_export(exp); - /* self export doesn't hold a reference to an obd, although it - * exists until freeing of the obd */ - if (exp != obd->obd_self_export) - class_decref(obd, "export", exp); + class_decref(obd, "export", exp); OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); EXIT; @@ -980,37 +831,24 @@ void class_export_put(struct obd_export *exp) atomic_read(&exp->exp_refcount) - 1); if (atomic_dec_and_test(&exp->exp_refcount)) { - struct obd_device *obd = exp->exp_obd; - + LASSERT(!list_empty(&exp->exp_obd_chain)); + LASSERT(list_empty(&exp->exp_stale_list)); CDEBUG(D_IOCTL, "final put %p/%s\n", exp, exp->exp_client_uuid.uuid); /* release nid stat refererence */ lprocfs_exp_cleanup(exp); - if (exp == obd->obd_self_export) { - /* self export should be destroyed without - * zombie thread as it doesn't hold a - * reference to obd and doesn't hold any - * resources */ - class_export_destroy(exp); - /* self export is destroyed, no class - * references exist and it is safe to free - * obd */ - class_free_dev(obd); - } else { - LASSERT(!list_empty(&exp->exp_obd_chain)); - obd_zombie_export_add(exp); - } - + obd_zombie_export_add(exp); } } EXPORT_SYMBOL(class_export_put); + /* Creates a new export, adds it to the hash table, and returns a * pointer to it. The refcount is 2: one for the hash reference, and * one for the pointer returned by this function. 
*/ -struct obd_export *__class_new_export(struct obd_device *obd, - struct obd_uuid *cluuid, bool is_self) +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) { struct obd_export *export; struct cfs_hash *hash = NULL; @@ -1024,7 +862,6 @@ struct obd_export *__class_new_export(struct obd_device *obd, export->exp_conn_cnt = 0; export->exp_lock_hash = NULL; export->exp_flock_hash = NULL; - /* 2 = class_handle_hash + last */ atomic_set(&export->exp_refcount, 2); atomic_set(&export->exp_rpc_count, 0); atomic_set(&export->exp_cb_count, 0); @@ -1058,17 +895,17 @@ struct obd_export *__class_new_export(struct obd_device *obd, export->exp_client_uuid = *cluuid; obd_init_export(export); - if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { - spin_lock(&obd->obd_dev_lock); - /* shouldn't happen, but might race */ - if (obd->obd_stopping) - GOTO(exit_unlock, rc = -ENODEV); + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); - hash = cfs_hash_getref(obd->obd_uuid_hash); - if (hash == NULL) - GOTO(exit_unlock, rc = -ENODEV); - spin_unlock(&obd->obd_dev_lock); + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); if (rc != 0) { LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", @@ -1080,24 +917,17 @@ struct obd_export *__class_new_export(struct obd_device *obd, at_init(&export->exp_bl_lock_at, obd_timeout, 0); spin_lock(&obd->obd_dev_lock); if (obd->obd_stopping) { - if (hash) - cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); - GOTO(exit_unlock, rc = -ESHUTDOWN); + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ENODEV); } - if (!is_self) { - class_incref(obd, "export", export); - list_add_tail(&export->exp_obd_chain_timed, - &obd->obd_exports_timed); - list_add(&export->exp_obd_chain, &obd->obd_exports); - obd->obd_num_exports++; - } else { - INIT_LIST_HEAD(&export->exp_obd_chain_timed); - INIT_LIST_HEAD(&export->exp_obd_chain); - } + class_incref(obd, "export", export); + list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); + list_add_tail(&export->exp_obd_chain_timed, + &export->exp_obd->obd_exports_timed); + export->exp_obd->obd_num_exports++; spin_unlock(&obd->obd_dev_lock); - if (hash) - cfs_hash_putref(hash); + cfs_hash_putref(hash); RETURN(export); exit_unlock: @@ -1111,29 +941,12 @@ struct obd_export *__class_new_export(struct obd_device *obd, OBD_FREE_PTR(export); return ERR_PTR(rc); } - -struct obd_export *class_new_export(struct obd_device *obd, - struct obd_uuid *uuid) -{ - return __class_new_export(obd, uuid, false); -} EXPORT_SYMBOL(class_new_export); -struct obd_export *class_new_export_self(struct obd_device *obd, - struct obd_uuid *uuid) -{ - return __class_new_export(obd, uuid, true); -} - void class_unlink_export(struct obd_export *exp) { class_handle_unhash(&exp->exp_handle); - if (exp->exp_obd->obd_self_export == exp) { - class_export_put(exp); - return; - } - spin_lock(&exp->exp_obd->obd_dev_lock); /* delete an uuid-export hashitem from hashtables */ if (!hlist_unhashed(&exp->exp_uuid_hash)) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 8068de9ebea64..924322ef86e8c 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ 
b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -365,7 +365,6 @@ EXPORT_SYMBOL(lustre_cfg_string); */ int class_attach(struct lustre_cfg *lcfg) { - struct obd_export *exp; struct obd_device *obd = NULL; char *typename, *name, *uuid; int rc, len; @@ -382,54 +381,90 @@ int class_attach(struct lustre_cfg *lcfg) RETURN(-EINVAL); } name = lustre_cfg_string(lcfg, 0); + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { CERROR("No UUID passed!\n"); RETURN(-EINVAL); } + uuid = lustre_cfg_string(lcfg, 2); - uuid = lustre_cfg_string(lcfg, 2); - len = strlen(uuid); - if (len >= sizeof(obd->obd_uuid)) { - CERROR("%s: uuid must be < %d bytes long\n", - name, (int)sizeof(obd->obd_uuid)); - RETURN(-EINVAL); - } + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); - obd = class_newdev(typename, name, uuid); - if (IS_ERR(obd)) { /* Already exists or out of obds */ - rc = PTR_ERR(obd); + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; CERROR("Cannot create device %s of type %s : %d\n", name, typename, rc); - RETURN(rc); + GOTO(out, rc); } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08X != %08X\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, "%p obd_name %s != %s\n", obd, obd->obd_name, name); - exp = class_new_export_self(obd, &obd->obd_uuid); - if (IS_ERR(exp)) { - rc = PTR_ERR(exp); - class_free_dev(obd); - RETURN(rc); - } - - obd->obd_self_export = exp; - list_del_init(&exp->exp_obd_chain_timed); - class_export_put(exp); + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. 
*/ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + INIT_LIST_HEAD(&obd->obd_lwp_list); + + llog_group_init(&obd->obd_olg); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + GOTO(out, rc = -EINVAL); + } + memcpy(obd->obd_uuid.uuid, uuid, len); - rc = class_register_device(obd); - if (rc != 0) { - class_decref(obd, "newdev", obd); - RETURN(rc); - } + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); - obd->obd_attached = 1; - CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); - - RETURN(0); + RETURN(0); + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; } EXPORT_SYMBOL(class_attach); @@ -439,6 +474,7 @@ EXPORT_SYMBOL(class_attach); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { int err = 0; + struct obd_export *exp; ENTRY; LASSERT(obd != NULL); @@ -487,7 +523,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &uuid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_uuid_hash) - GOTO(err_exit, err = -ENOMEM); + GOTO(err_hash, err = -ENOMEM); /* create a nid-export lustre hash */ obd->obd_nid_hash = cfs_hash_create("NID_HASH", @@ -498,7 +534,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &nid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_nid_hash) - GOTO(err_exit, err = -ENOMEM); + GOTO(err_hash, err = -ENOMEM); /* create a nid-stats lustre hash */ obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", @@ -508,8 +544,8 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, &nid_stat_hash_ops, CFS_HASH_DEFAULT); - if (!obd->obd_nid_stats_hash) - GOTO(err_exit, err = -ENOMEM); + if (!obd->obd_nid_stats_hash) + GOTO(err_hash, err = -ENOMEM); /* create a client_generation-export lustre hash */ obd->obd_gen_hash = cfs_hash_create("UUID_HASH", @@ -520,13 +556,21 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &gen_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_gen_hash) - GOTO(err_exit, err = -ENOMEM); + GOTO(err_hash, err = -ENOMEM); - err = obd_setup(obd, lcfg); - if (err) - GOTO(err_exit, err); + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) + GOTO(err_hash, err = PTR_ERR(exp)); + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exp, err); - obd->obd_set_up = 1; + obd->obd_set_up = 1; spin_lock(&obd->obd_dev_lock); /* cleanup drops this */ @@ -537,7 +581,12 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_name, obd->obd_uuid.uuid); RETURN(0); -err_exit: +err_exp: + if 
(obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: if (obd->obd_uuid_hash) { cfs_hash_putref(obd->obd_uuid_hash); obd->obd_uuid_hash = NULL; @@ -581,14 +630,10 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_attached = 0; spin_unlock(&obd->obd_dev_lock); - /* cleanup in progress. we don't like to find this device after now */ - class_unregister_device(obd); - CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", obd->obd_name, obd->obd_uuid.uuid); - class_decref(obd, "newdev", obd); - + class_decref(obd, "attach", obd); RETURN(0); } EXPORT_SYMBOL(class_detach); @@ -618,9 +663,6 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) } /* Leave this on forever */ obd->obd_stopping = 1; - /* function can't return error after that point, so clear setup flag - * as early as possible to avoid finding via obd_devs / hash */ - obd->obd_set_up = 0; spin_unlock(&obd->obd_dev_lock); /* wait for already-arrived-connections to finish. */ @@ -653,11 +695,17 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) LASSERT(obd->obd_self_export); - CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", - obd->obd_name, obd->obd_num_exports, - atomic_read(&obd->obd_refcount) - 2); - dump_exports(obd, 0, D_HA); - class_disconnect_exports(obd); + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcounf - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); + } /* Precleanup, we must make sure all exports get destroyed. 
*/ err = obd_precleanup(obd); @@ -709,27 +757,43 @@ EXPORT_SYMBOL(class_incref); void class_decref(struct obd_device *obd, const char *scope, const void *source) { - int last; - - CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, - atomic_read(&obd->obd_refcount), scope); + int err; + int refs; - LASSERT(obd->obd_num_exports >= 0); - last = atomic_dec_and_test(&obd->obd_refcount); + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); lu_ref_del(&obd->obd_reference, scope, source); - if (last) { - struct obd_export *exp; + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); - LASSERT(!obd->obd_attached); + if ((refs == 1) && obd->obd_stopping) { /* All exports have been destroyed; there should - * be no more in-progress ops by this point.*/ - exp = obd->obd_self_export; + be no more in-progress ops by this point.*/ - if (exp) { - exp->exp_flags |= exp_flags_from_obd(obd); - class_unlink_export(exp); + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); } + + class_release_dev(obd); } } EXPORT_SYMBOL(class_decref); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c index ed1a1d7eea343..e3390507d900e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -220,7 +220,7 @@ int lustre_start_mgc(struct super_block *sb) struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; - struct obd_uuid *uuid = NULL; + struct obd_uuid *uuid; class_uuid_t uuidc; lnet_nid_t nid; char nidstr[LNET_NIDSTR_SIZE]; @@ -409,6 +409,7 @@ int lustre_start_mgc(struct super_block *sb) rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, niduuid, NULL, NULL); + OBD_FREE_PTR(uuid); if (rc) GOTO(out_free, rc); @@ -469,7 +470,7 @@ int lustre_start_mgc(struct super_block *sb) lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); @@ -484,8 +485,6 @@ int lustre_start_mgc(struct super_block *sb) out_free: mutex_unlock(&mgc_start_lock); - if (uuid) - OBD_FREE_PTR(uuid); if (data) OBD_FREE_PTR(data); if (mgcname) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h index 95d00f5f7c1a7..eb86ba1627103 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -79,7 +79,7 @@ unsigned long gss_round_ctx_expiry(unsigned long expiry, if (sec_flags & PTLRPC_SEC_FL_REVERSE) return expiry; - if (get_seconds() + __TIMEOUT_DELTA <= expiry) + if 
(ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) return expiry - __TIMEOUT_DELTA; return expiry; diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c index bedf54ee863d1..083e40020f1fc 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -138,6 +138,11 @@ static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, struct tg_export_data *ted = &exp->exp_target_data; int level = D_CACHE; + if (exp->exp_obd->obd_self_export == exp) + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", exp->exp_obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) level = D_ERROR; CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", @@ -183,7 +188,6 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) struct lu_target *lut = obd->u.obt.obt_lut; struct tg_grants_data *tgd = &lut->lut_tgd; struct obd_export *exp; - struct tg_export_data *ted; u64 maxsize; u64 tot_dirty = 0; u64 tot_pending = 0; @@ -205,15 +209,6 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) spin_lock(&obd->obd_dev_lock); spin_lock(&tgd->tgd_grant_lock); - exp = obd->obd_self_export; - ted = &exp->exp_target_data; - CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " - "%ld\n", obd->obd_name, ted->ted_grant, - ted->ted_pending, ted->ted_dirty); - tot_granted += ted->ted_grant + ted->ted_pending; - tot_pending += ted->ted_pending; - tot_dirty += ted->ted_dirty; - list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, &tot_granted, maxsize); From 749ceb2901ca37fe4837aef1763fa1a6b72ce5a1 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:33 -0700 Subject: [PATCH 331/737] drivers/base/memory: introduce memory_block_{online,offline} Patch series "Allocate memmap from hotadded memory (per device)", v10. The primary goal of this patchset is to reduce memory overhead of the hot-added memory (at least for SPARSEMEM_VMEMMAP memory model). The current way we use to populate memmap (struct page array) has two main drawbacks: a) it consumes an additional memory until the hotadded memory itself is onlined and b) memmap might end up on a different numa node which is especially true for movable_node configuration. c) due to fragmentation we might end up populating memmap with base pages One way to mitigate all these issues is to simply allocate memmap array (which is the largest memory footprint of the physical memory hotplug) from the hot-added memory itself. SPARSEMEM_VMEMMAP memory model allows us to map any pfn range so the memory doesn't need to be online to be usable for the array. See patch 4 for more details. This feature is only usable when CONFIG_SPARSEMEM_VMEMMAP is set. [Overall design]: Implementation wise we reuse vmem_altmap infrastructure to override the default allocator used by vmemap_populate. memory_block structure gains a new field called nr_vmemmap_pages, which accounts for the number of vmemmap pages used by that memory_block. E.g: On x86_64, that is 512 vmemmap pages on small memory bloks and 4096 on large memory blocks (1GB) We also introduce new two functions: memory_block_{online,offline}. 
These functions take care of initializing/unitializing vmemmap pages prior to calling {online,offline}_pages, so the latter functions can remain totally untouched. More details can be found in the respective changelogs. This patch (of 8): This is a preparatory patch that introduces two new functions: memory_block_online() and memory_block_offline(). For now, these functions will only call online_pages() and offline_pages() respectively, but they will be later in charge of preparing the vmemmap pages, carrying out the initialization and proper accounting of such pages. Since memory_block struct contains all the information, pass this struct down the chain till the end functions. Link: https://lkml.kernel.org/r/20210421102701.25051-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20210421102701.25051-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 8736cc2d002f14e90d2b33bc5bef1740f6275ba4) --- drivers/base/memory.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 4f039436ac1ef..509344c10113c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -169,30 +169,41 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +static int memory_block_online(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + + return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid); +} + +static int memory_block_offline(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + + return offline_pages(start_pfn, nr_pages); +} + /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. */ static int -memory_block_action(unsigned long start_section_nr, unsigned long action, - int online_type, int nid) +memory_block_action(struct memory_block *mem, unsigned long action) { - unsigned long start_pfn; - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; int ret; - start_pfn = section_nr_to_pfn(start_section_nr); - switch (action) { case MEM_ONLINE: - ret = online_pages(start_pfn, nr_pages, online_type, nid); + ret = memory_block_online(mem); break; case MEM_OFFLINE: - ret = offline_pages(start_pfn, nr_pages); + ret = memory_block_offline(mem); break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, start_section_nr, action, action); + "%ld\n", __func__, mem->start_section_nr, action, action); ret = -EINVAL; } @@ -210,9 +221,7 @@ static int memory_block_change_state(struct memory_block *mem, if (to_state == MEM_OFFLINE) mem->state = MEM_GOING_OFFLINE; - ret = memory_block_action(mem->start_section_nr, to_state, - mem->online_type, mem->nid); - + ret = memory_block_action(mem, to_state); mem->state = ret ? 
from_state_req : to_state; return ret; From b437f11b97e94c608accf98de62117b1e8868f11 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:36 -0700 Subject: [PATCH 332/737] mm,memory_hotplug: relax fully spanned sections check We want {online,offline}_pages to operate on whole memblocks, but memmap_on_memory will poke pageblock_nr_pages aligned holes in the beginning, which is a special case we want to allow. Relax the check to account for that case. Link: https://lkml.kernel.org/r/20210421102701.25051-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry -picked from commit dd8e2f230d82ecd60504fba48bb10bf3760b674e) --- mm/memory_hotplug.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ec9e1e677051..ef40fb0fa4919 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -786,9 +786,16 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int ret; struct memory_notify arg; - /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(pfn, pageblock_nr_pages) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); @@ -1459,9 +1466,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) int ret, node; char *reason; - /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); From c097d947ffd16dba032d79786c8ac30c392999af Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 4 May 2021 18:39:39 -0700 Subject: [PATCH 333/737] mm,memory_hotplug: factor out adjusting present pages into adjust_present_page_count() Let's have a single place (inspired by adjust_managed_page_count()) where we adjust present pages. In contrast to adjust_managed_page_count(), only memory onlining or offlining is allowed to modify the number of present pages. 
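For illustration, a minimal sketch of how the two paths are expected to use the helper once it exists (the function name and signature are taken from the hunk below; the example_* callers are made up):

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

/* Illustrative callers only: both directions go through the single
 * helper, so zone->present_pages and node_present_pages can never be
 * adjusted in just one of the two places. */
static void example_account_online(struct zone *zone, unsigned long nr_pages)
{
	adjust_present_page_count(zone, nr_pages);
}

static void example_account_offline(struct zone *zone, unsigned long nr_pages)
{
	adjust_present_page_count(zone, -(long)nr_pages);
}

Keeping the delta signed also lets later patches in this series reuse the same helper for the vmemmap pages of a memory block.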
Link: https://lkml.kernel.org/r/20210421102701.25051-4-osalvador@suse.de Signed-off-by: David Hildenbrand Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f9901144e48f6a7ba186249add705d10e74738ec) --- mm/memory_hotplug.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ef40fb0fa4919..9d4295ef9b9e9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -777,6 +777,16 @@ struct zone *zone_for_pfn_range(int online_type, int nid, return default_zone_for_pfn(nid, start_pfn, nr_pages); } +static void adjust_present_page_count(struct zone *zone, long nr_pages) +{ + unsigned long flags; + + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid) { @@ -832,11 +842,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - zone->present_pages += nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1596,11 +1602,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - zone->present_pages -= nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, -nr_pages); init_per_zone_wmark_min(); From 862e0d7d603277520461af84a919403dcef2b6cf Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:42 -0700 Subject: [PATCH 334/737] mm,memory_hotplug: allocate memmap from the added memory range Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) an existing memory is consumed for that purpose (eg: ~2MB per 128MB memory section on x86_64) This can even lead to extreme cases where system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages which has performance drawbacks. c) It might be there are no PMD_ALIGNED chunks so memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obviously things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. 
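In practice that means code walking arbitrary pfn ranges has to tolerate a NULL result for those pfns; a hedged sketch of the defensive pattern (the walker itself is hypothetical):

#include <linux/memory_hotplug.h>
#include <linux/mm.h>

/* Hypothetical walker: pfns backing the self-hosted memmap are in use
 * but not online, so pfn_to_online_page() returns NULL for them. */
static void example_pfn_walk(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		struct page *page = pfn_to_online_page(pfn);

		if (!page)
			continue;	/* hole, offline, or vmemmap-backing pfn */
		/* ... operate on @page ... */
	}
}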
The current design expects that this should be OK as the hotplugged memory is considered a garbage until it is onlined. For example hibernation wouldn't save the content of those vmmemmaps into the image so it wouldn't be restored on resume but this should be OK as there no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special case them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from zhe zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succedeed. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages() before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on the fact whether the node or the zone become empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. 
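To keep the whole hot-add flow in one place, here is a hedged condensation of the add_memory_resource() changes below (example_add_self_hosted is a made-up name, error unwinding is omitted, and only the altmap wiring is shown):

#include <linux/memory_hotplug.h>
#include <linux/memremap.h>

/* Condensed sketch, not the real function: the head of the hot-added
 * range backs its own memmap via a vmem_altmap. */
static int example_add_self_hosted(int nid, u64 start, u64 size)
{
	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
	struct vmem_altmap mhp_altmap = {
		.base_pfn = PHYS_PFN(start),	/* memmap is carved from here */
		.free	  = PHYS_PFN(size),	/* pages the allocator may use */
	};
	int ret;

	if (!mhp_supports_memmap_on_memory(size))
		return -EINVAL;
	params.altmap = &mhp_altmap;

	ret = arch_add_memory(nid, start, size, &params);
	if (ret)
		return ret;

	/* record how many pages the memmap actually consumed */
	return create_memory_block_devices(start, size, mhp_altmap.alloc);
}

On removal the same bookkeeping is read back through the memory block's nr_vmemmap_pages, so free_hugepage_table() frees into the altmap rather than the page allocator.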
Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a08a2ae3461383c2d50d0997dcc6cd1dd1fefb08) --- drivers/base/memory.c | 72 +++++++++++++-- include/linux/memory.h | 8 +- include/linux/memory_hotplug.h | 15 ++- include/linux/memremap.h | 2 +- include/linux/mmzone.h | 7 +- mm/Kconfig | 5 + mm/memory_hotplug.c | 161 +++++++++++++++++++++++++++++++-- mm/sparse.c | 2 - 8 files changed, 250 insertions(+), 22 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 509344c10113c..f251d5207fdb2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -173,16 +173,73 @@ static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; + + zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages); + + /* + * Although vmemmap pages have a different lifecycle than the pages + * they describe (they remain until the memory is unplugged), doing + * their initialization and accounting at memory onlining/offlining + * stage helps to keep accounting easier to follow - e.g vmemmaps + * belong to the same zone as the memory they backed. + */ + if (nr_vmemmap_pages) { + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + if (ret) + return ret; + } + + ret = online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone); + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + return ret; + } + + /* + * Account once onlining succeeded. If the zone was unpopulated, it is + * now already properly populated. + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); - return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid); + return ret; } static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + + /* + * Unaccount before offlining, such that unpopulated zone and kthreads + * can properly be torn down in offline_pages(). + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, -nr_vmemmap_pages); - return offline_pages(start_pfn, nr_pages); + ret = offline_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages); + if (ret) { + /* offline_pages() failed. Account back. 
*/ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); + return ret; + } + + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + + return ret; } /* @@ -603,7 +660,8 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(unsigned long block_id, unsigned long state) +static int init_memory_block(unsigned long block_id, unsigned long state, + unsigned long nr_vmemmap_pages) { struct memory_block *mem; int ret = 0; @@ -620,6 +678,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state) mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; + mem->nr_vmemmap_pages = nr_vmemmap_pages; ret = register_memory(mem); @@ -639,7 +698,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE); + MEM_ONLINE, 0); } static void unregister_memory(struct memory_block *memory) @@ -661,7 +720,8 @@ static void unregister_memory(struct memory_block *memory) * * Called under device_hotplug_lock. */ -int create_memory_block_devices(unsigned long start, unsigned long size) +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); @@ -674,7 +734,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size) return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20f..97e92e8b556a3 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. + */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1dafc7c7f5cfe..7a49c61182163 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -70,6 +70,14 @@ typedef int __bitwise mhp_t; */ #define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +/* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. 
+ */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -111,9 +119,13 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +extern void adjust_present_page_count(struct zone *zone, long nr_pages); /* VM interface that may be used by firmware interface */ +extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone); +extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + struct zone *zone); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -361,6 +373,7 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca5..45a79da89c5fb 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2e4599b88832..caf9490ec4bf1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -406,6 +406,11 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. 
Allocators (like alloc_contig_range()) @@ -1331,10 +1336,8 @@ static inline int online_section_nr(unsigned long nr) #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); -#ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif -#endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { diff --git a/mm/Kconfig b/mm/Kconfig index 55460ca97fd8d..ecadb0fb6cd47 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -183,6 +183,11 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9d4295ef9b9e9..6f3ea78a567e0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,6 +42,8 @@ #include "internal.h" #include "shuffle.h" +static bool memmap_on_memory; + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -617,9 +619,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. */ - for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) - (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); @@ -777,7 +786,11 @@ struct zone *zone_for_pfn_range(int online_type, int nid, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -static void adjust_present_page_count(struct zone *zone, long nr_pages) +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct zone *zone, long nr_pages) { unsigned long flags; @@ -787,12 +800,54 @@ static void adjust_present_page_count(struct zone *zone, long nr_pages) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid) +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. 
+ */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long flags; - struct zone *zone; int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; @@ -811,7 +866,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, mem_hotplug_begin(); /* associate pfn range with the zone */ - zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; @@ -1025,6 +1079,45 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. + * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. + * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return memmap_on_memory && + IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). 
@@ -1034,6 +1127,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = {}; u64 start, size; bool new_node = false; int ret; @@ -1060,13 +1154,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; new_node = ret; + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; @@ -1655,6 +1762,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. + */ + return mem->nr_vmemmap_pages; +} + static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; @@ -1729,6 +1844,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1741,6 +1859,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) if (rc) return rc; + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. + */ + if (memmap_on_memory) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. 
+ */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1752,7 +1895,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(nid, start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/sparse.c b/mm/sparse.c index 33406ea2ecc44..d3fbed26e64ef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_MEMORY_HOTREMOVE /* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { @@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) ms->section_mem_map &= ~SECTION_IS_ONLINE; } } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, From 435dcca4facda9dff749ac7a48fca43e5c1dddc8 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:45 -0700 Subject: [PATCH 335/737] acpi,memhotplug: enable MHP_MEMMAP_ON_MEMORY when supported Let the caller check whether it can pass MHP_MEMMAP_ON_MEMORY by checking mhp_supports_memmap_on_memory(). MHP_MEMMAP_ON_MEMORY can only be set in case ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE is enabled, the architecture supports altmap, and the range to be added spans a single memory block. Link: https://lkml.kernel.org/r/20210421102701.25051-6-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 4a3e5de9c4ec41bb0684b0d4e0c16abc39617d88) --- drivers/acpi/acpi_memhotplug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index b02fd51e55896..8cc195c4c8619 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) acpi_handle handle = mem_device->device->handle; int result, num_enabled = 0; struct acpi_memory_info *info; + mhp_t mhp_flags = MHP_NONE; int node; node = acpi_get_node(handle); @@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); + if (mhp_supports_memmap_on_memory(info->length)) + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(node, info->start_addr, info->length, - MHP_NONE); + mhp_flags); /* * If the memory block has been used by the kernel, add_memory() From 698971612d9b94cff2e99769a2cd19ffe0786907 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:48 -0700 Subject: [PATCH 336/737] mm,memory_hotplug: add kernel boot option to enable memmap_on_memory Self stored memmap leads to a sparse memory situation which is unsuitable for workloads that requires large contiguous memory chunks, so make this an opt-in which needs to be explicitly enabled. To control this, let memory_hotplug have its own memory space, as suggested by David, so we can add memory_hotplug.memmap_on_memory parameter. 
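A minimal sketch of what that ends up looking like, assuming the Makefile change below gives memory_hotplug.o its own parameter namespace (so the flag is spelled memory_hotplug.memmap_on_memory on the command line and appears under /sys/module/memory_hotplug/parameters/):

#include <linux/module.h>
#include <linux/cache.h>

/* Sketch: a read-only, default-off boolean that gates
 * mhp_supports_memmap_on_memory(); 0444 makes it visible in sysfs
 * but not writable after boot. */
static bool memmap_on_memory __ro_after_init;
module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");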
Link: https://lkml.kernel.org/r/20210421102701.25051-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e3a9d9fcc3315993de2e9fcd7ea82fab84433815) --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ mm/Makefile | 5 ++++- mm/memory_hotplug.c | 10 +++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1f7c068cf65b..c94f7228032a7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2832,6 +2832,23 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. + memory_hotplug.memmap_on_memory + [KNL,X86,ARM] Boolean flag to enable this feature. + Format: {on | off (default)} + When enabled, runtime hotplugged memory will + allocate its internal metadata (struct pages) + from the hotadded memory which will allow to + hotadd a lot of memory without requiring + additional memory to do so. + This feature is disabled by default because it + has some implication on large (e.g. GB) + allocations in some configurations (e.g. small + memory blocks). + The state of the flag can be read in + /sys/module/memory_hotplug/parameters/memmap_on_memory. + Note that even when enabled, there are a few cases where + the feature is not effective. + memtest= [KNL,X86,ARM,PPC] Enable memtest Format: default : 0 diff --git a/mm/Makefile b/mm/Makefile index 6fd576b8ba8ec..0096744d090bd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o +obj-y += $(memory-hotplug-y) ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -82,7 +86,6 @@ obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6f3ea78a567e0..0b13a56dcc73d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,7 +42,15 @@ #include "internal.h" #include "shuffle.h" -static bool memmap_on_memory; + +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +#endif /* * online_page_callback contains pointer to current page onlining function. From 20b7e06981174d2feff5cacddea56051bed6d1e5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:51 -0700 Subject: [PATCH 337/737] x86/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE Enable x86_64 platform to use the MHP_MEMMAP_ON_MEMORY feature. 
Link: https://lkml.kernel.org/r/20210421102701.25051-8-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f91ef2223dc425e2e8759a625cffd48dce3503de) --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6dc670e363939..b451b3ff9c351 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2525,6 +2525,9 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA From fcbf2afc15f1648244f3633bb5333b2089145fba Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:54 -0700 Subject: [PATCH 338/737] arm64/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE Enable arm64 platform to use the MHP_MEMMAP_ON_MEMORY feature. Link: https://lkml.kernel.org/r/20210421102701.25051-9-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit ca6e51d592d20180374366e71bb0972de002d509) --- arch/arm64/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8177fb77a8948..b627678ee5160 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -325,6 +325,9 @@ config ARCH_MEMORY_REMOVE for more information. If you are unsure how to answer this question, answer N. +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config SMP def_bool y From 9f96515659e6318a566e80824868bbeb4bacf062 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 4 Jun 2021 20:01:24 -0700 Subject: [PATCH 339/737] drivers/base/memory: fix trying offlining memory blocks with memory holes on aarch64 offline_pages() properly checks for memory holes and bails out. However, we do a page_zone(pfn_to_page(start_pfn)) before calling offline_pages() when offlining a memory block. We should not unconditionally call page_zone(pfn_to_page(start_pfn)) on aarch64 in offlining code, otherwise we can trigger a BUG when hitting a memory hole: kernel BUG at include/linux/mm.h:1383! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: loop processor efivarfs ip_tables x_tables ext4 mbcache jbd2 dm_mod igb nvme i2c_algo_bit mlx5_core i2c_core nvme_core firmware_class CPU: 13 PID: 1694 Comm: ranbug Not tainted 5.12.0-next-20210524+ #4 Hardware name: MiTAC RAPTOR EV-883832-X3-0001/RAPTOR, BIOS 1.6 06/28/2020 pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--) pc : memory_subsys_offline+0x1f8/0x250 lr : memory_subsys_offline+0x1f8/0x250 Call trace: memory_subsys_offline+0x1f8/0x250 device_offline+0x154/0x1d8 online_store+0xa4/0x118 dev_attr_store+0x44/0x78 sysfs_kf_write+0xe8/0x138 kernfs_fop_write_iter+0x26c/0x3d0 new_sync_write+0x2bc/0x4f8 vfs_write+0x718/0xc88 ksys_write+0xf8/0x1e0 __arm64_sys_write+0x74/0xa8 invoke_syscall.constprop.0+0x78/0x1e8 do_el0_svc+0xe4/0x298 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x178/0x180 Kernel panic - not syncing: Oops - BUG: Fatal exception SMP: stopping secondary CPUs Kernel Offset: disabled CPU features: 0x00000251,20000846 Memory Limit: none If nr_vmemmap_pages is set, we know that we are dealing with hotplugged memory that doesn't have any holes. 
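Concretely, a hedged condensation of the memory_block_offline() hunk below (example_unaccount_vmemmap is a made-up wrapper):

#include <linux/memory_hotplug.h>
#include <linux/mm.h>

/* Sketch of the corrected ordering: the zone is looked up only when the
 * block carries vmemmap pages, i.e. it was hotplugged and cannot have
 * holes, so pfn_to_page(start_pfn) is safe to dereference. */
static void example_unaccount_vmemmap(unsigned long start_pfn,
				      unsigned long nr_vmemmap_pages)
{
	if (nr_vmemmap_pages) {
		struct zone *zone = page_zone(pfn_to_page(start_pfn));

		adjust_present_page_count(zone, -(long)nr_vmemmap_pages);
	}
}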
So call page_zone(pfn_to_page(start_pfn)) only when really necessary -- when nr_vmemmap_pages is set and we actually adjust the present pages. Link: https://lkml.kernel.org/r/20210526075226.5572-1-david@redhat.com Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range") Signed-off-by: David Hildenbrand Reported-by: Qian Cai (QUIC) Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 928130532e19f2f920840e41bd6b1cae742ea63b) --- drivers/base/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f251d5207fdb2..6819bd5a8d5de 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -218,14 +218,14 @@ static int memory_block_offline(struct memory_block *mem) struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - /* * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ - if (nr_vmemmap_pages) + if (nr_vmemmap_pages) { + zone = page_zone(pfn_to_page(start_pfn)); adjust_present_page_count(zone, -nr_vmemmap_pages); + } ret = offline_pages(start_pfn + nr_vmemmap_pages, nr_pages - nr_vmemmap_pages); From a572487d61a95b7ee41c3154e207536e49509e8a Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 9 Dec 2021 20:41:11 +0000 Subject: [PATCH 340/737] drivers/base/memory: use MHP_MEMMAP_ON_MEMORY from the probe interface If it is possible to use MHP_MEMMAP_ON_MEMORY from the probe interface, which should normally be the case, do so. Signed-off-by: Frank van der Linden --- drivers/base/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6819bd5a8d5de..486bff9521094 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -481,9 +481,10 @@ static DEVICE_ATTR_RW(auto_online_blocks); static ssize_t probe_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - u64 phys_addr; + u64 phys_addr, size; int nid, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; + mhp_t mhp_flags; ret = kstrtoull(buf, 0, &phys_addr); if (ret) @@ -496,10 +497,12 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, if (ret) return ret; + size = MIN_MEMORY_BLOCK_SIZE * sections_per_block; + mhp_flags = mhp_supports_memmap_on_memory(size) ? + MHP_MEMMAP_ON_MEMORY : MHP_NONE; + nid = memory_add_physaddr_to_nid(phys_addr); - ret = __add_memory(nid, phys_addr, - MIN_MEMORY_BLOCK_SIZE * sections_per_block, - MHP_NONE); + ret = __add_memory(nid, phys_addr, size, mhp_flags); if (ret) goto out; From 889e91b5efc1e5b6fad321af2ee3dacba1653158 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 6 Jan 2022 19:05:17 +0000 Subject: [PATCH 341/737] mm: add offline page reporting interface Add an interface to report offlined pages as free to the hypervisor. Define a new entry point for page reporting drivers, report_offline. If a driver sets it, it will be called after a range of memory has been offlined. This is done separately, and not with a memory notifier, since with memmap_on_memory, there are pages that are only freed outside of offline_pages, where the notifiers are called. Since this will be called asynchronously (e.g. 
not from the page reporting work queues), protect it with the page reporting mutex so that a driver can't be unloaded while calling the entry point. Signed-off-by: Frank van der Linden --- drivers/base/memory.c | 5 +++++ include/linux/page_reporting.h | 4 ++++ mm/page_reporting.c | 13 +++++++++++++ 3 files changed, 22 insertions(+) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 486bff9521094..770bbd3e9c205 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -239,6 +240,10 @@ static int memory_block_offline(struct memory_block *mem) if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); +#ifdef CONFIG_PAGE_REPORTING + page_report_offline(start_pfn, nr_pages); +#endif + return ret; } diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index 3b99e0ec24f22..197c1d9928361 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -12,6 +12,8 @@ struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ int (*report)(struct page_reporting_dev_info *prdev, struct scatterlist *sg, unsigned int nents); + int (*report_offline)(struct page_reporting_dev_info *prdev, + unsigned long start_pfn, unsigned int nr_pages); /* work struct for processing reports */ struct delayed_work work; @@ -20,6 +22,8 @@ struct page_reporting_dev_info { atomic_t state; }; +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages); + /* Tear-down and bring-up for page reporting devices */ void page_reporting_unregister(struct page_reporting_dev_info *prdev); int page_reporting_register(struct page_reporting_dev_info *prdev); diff --git a/mm/page_reporting.c b/mm/page_reporting.c index cd8e13d41df43..c47e07f2bbeb3 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -312,6 +312,19 @@ static void page_reporting_process(struct work_struct *work) static DEFINE_MUTEX(page_reporting_mutex); DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); +void page_report_offline(unsigned long start_pfn, unsigned int nr_pages) +{ + struct page_reporting_dev_info *prdev; + + mutex_lock(&page_reporting_mutex); + + prdev = rcu_access_pointer(pr_dev_info); + if (prdev && prdev->report_offline) + prdev->report_offline(prdev, start_pfn, nr_pages); + + mutex_unlock(&page_reporting_mutex); +} + int page_reporting_register(struct page_reporting_dev_info *prdev) { int err = 0; From 07b3fc011adc4c2f7fe5ac65f4e710e21ad9096d Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Thu, 6 Jan 2022 19:16:23 +0000 Subject: [PATCH 342/737] virtio: add hack to allow pre-mapped scatterlists When reporting offlined pages through free page reporting, and memmap_on_memory is active, we don't want to touch the page structures anymore, since that will lead to a reference to the range we just offlined, as the page structures themselves reside in the range. So, we can't use sg_phys to set the dma address. Instead, if sg_page is set to NULL, assume that sg_dma_address is set already, and use it. 
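For reference, a hedged sketch of what such a "pre-mapped" entry looks like on the caller's side (it mirrors the balloon change later in this series; the helper name is made up):

#include <linux/scatterlist.h>
#include <linux/pfn.h>

/* Sketch: describe a physical range without touching its struct pages;
 * with sg_page() == NULL the ring code uses sg_dma_address() as-is. */
static void example_premapped_sg(struct scatterlist *sgl,
				 unsigned long start_pfn, unsigned int len)
{
	sg_init_table(sgl, 1);
	sg_set_page(sgl, NULL, len, 0);		/* NULL page => pre-mapped */
	sgl->dma_address = PFN_PHYS(start_pfn);
}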
Signed-off-by: Frank van der Linden --- drivers/virtio/virtio_ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 3cc2a4ee7152c..03635431af428 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -327,7 +327,8 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, enum dma_data_direction direction) { if (!vq->use_dma_api) - return (dma_addr_t)sg_phys(sg); + return sg_page(sg) == NULL ? sg_dma_address(sg) : + (dma_addr_t)sg_phys(sg); /* * We can't use dma_map_sg, because we don't use scatterlists in From 45e370b32bd6b359c19801be9e3117fdc0b23f48 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 10 Dec 2021 19:07:04 +0000 Subject: [PATCH 343/737] virtio-balloon: optionally report offlined memory ranges A hack to report offlined memory ranges through virtio-balloon. Do this by registering a memory notifier callback for offlining, and then calling the normal free page reporting entry point to report the range that was just offlined. This is only active if the virtio_balloon.report_offline module parameter is set. Signed-off-by: Frank van der Linden --- drivers/virtio/virtio_balloon.c | 59 ++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 481611c09dae1..eea83444fa7d2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units. So each page is pointed to by @@ -46,6 +47,13 @@ static struct vfsmount *balloon_mnt; #endif +static bool report_offline = false; +module_param(report_offline, bool, 0444); +MODULE_PARM_DESC(report_offline, + "Report offlined pages to the hypervisor"); + +static DEFINE_MUTEX(vb_page_report_lock); + enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -173,6 +181,15 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i struct virtqueue *vq = vb->reporting_vq; unsigned int unused, err; + /* + * virtqueue callers must make sure that only one thread is + * using a queue. With offline page reporting enabled, multiple + * threads might be calling this function at the same time. + * + * So, make sure they don't get in each other's way. + */ + mutex_lock(&vb_page_report_lock); + /* We should always be able to add these buffers to an empty queue. */ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); @@ -181,17 +198,55 @@ static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_i * are able to trigger an error we will simply display a warning * and exit without actually processing the pages. */ - if (WARN_ON_ONCE(err)) + if (WARN_ON_ONCE(err)) { + mutex_unlock(&vb_page_report_lock); return err; + } virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + mutex_unlock(&vb_page_report_lock); + return 0; } +/* + * Callback for memory offline. Takes the offlined range and passes it + * to the normal free page reporting entry point. + * + * Assumptions that are currently all true: + * + * 1) We're in a safe context to sleep. + * 2) The offlined range is <= a memory section (128M on x86, 1G on arm64), + * and so the length will fit in a 32bit field. 
+ */ +static int virtioballoon_free_page_report_offline( + struct page_reporting_dev_info *pr_dev_info, + unsigned long start_pfn, unsigned int nr_pages) +{ + struct scatterlist sgl; + unsigned int len = nr_pages << PAGE_SHIFT; + int err; + + /* + * Set the page to NULL to signal a "pre-mapped" address, + * e.g. the virtio ring code will not touch the page + * structure and will just use the dma_address passed in. + */ + sg_init_table(&sgl, 1); + sg_set_page(&sgl, NULL, len, 0); + sgl.dma_address = PFN_PHYS(start_pfn); + + err = virtballoon_free_page_report(pr_dev_info, &sgl, 1); + if (err) + pr_err("virtio_balloon: offline reporting failed (%d)\n", err); + + return err; +} + static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { @@ -984,6 +1039,8 @@ static int virtballoon_probe(struct virtio_device *vdev) } vb->pr_dev_info.report = virtballoon_free_page_report; + if (report_offline) + vb->pr_dev_info.report_offline = virtioballoon_free_page_report_offline; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { unsigned int capacity; From b7a6357cf1cf02b1c5381ebae42f56b8bb690722 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Wed, 2 Feb 2022 12:05:43 -0800 Subject: [PATCH 344/737] ENA: Update to v2.6.1 Source: https://github.com/amzn/amzn-drivers/ Change Log: v2.6.0 -> v2.6.1 **New Features** * Add BQL support enabled by module parameter **Minor Changes** * Don't print stats on refresh capabilities reset Signed-off-by: Suraj Jitindar Singh Reviewed-by: Hazem Mohamed Abuelfotoh Reviewed-by: Frank van der Linden --- drivers/amazon/net/ena/ena_netdev.c | 18 ++++++++++++++++-- drivers/amazon/net/ena/ena_netdev.h | 3 ++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 057b4de2ebad1..5c96ec35a74fd 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -60,7 +60,7 @@ static int rx_queue_size = ENA_DEFAULT_RING_SIZE; module_param(rx_queue_size, int, 0444); MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Max value is 8K\n"); -static int force_large_llq_header; +static int force_large_llq_header = 0; module_param(force_large_llq_header, int, 0444); MODULE_PARM_DESC(force_large_llq_header, "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum TX queue size by half.\n"); @@ -68,6 +68,10 @@ static int num_io_queues = ENA_MAX_NUM_IO_QUEUES; module_param(num_io_queues, int, 0444); MODULE_PARM_DESC(num_io_queues, "Sets number of RX/TX queues to allocate to device. The maximum value depends on the device and number of online CPUs.\n"); +static int enable_bql = 0; +module_param(enable_bql, int, 0444); +MODULE_PARM_DESC(enable_bql, "Enable BQL.\n"); + static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; module_param(lpc_size, uint, 0444); MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. 
Max: 32\n"); @@ -787,6 +791,7 @@ static void ena_init_io_rings(struct ena_adapter *adapter, txr->tx_max_header_size = ena_dev->tx_max_header_size; txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; txr->sgl_size = adapter->max_tx_sgl_size; + txr->enable_bql = enable_bql; txr->smoothed_interval = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); txr->disable_meta_caching = adapter->disable_meta_caching; @@ -1466,6 +1471,9 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); ena_com_update_dev_comp_head(tx_ring->ena_com_io_cq); + if (tx_ring->enable_bql) + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, "tx_poll: q %d done. total pkts: %d\n", tx_ring->qid, tx_pkts); @@ -3393,6 +3401,9 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) if (rc) goto error_unmap_dma; + if (tx_ring->enable_bql) + netdev_tx_sent_queue(txq, skb->len); + /* stop the queue when no more space available, the packet can have up * to sgl_size + 2. one for the meta descriptor and one for header * (if the header is larger than tx_max_header_size). @@ -4505,7 +4516,10 @@ static void ena_timer_service(unsigned long data) netif_err(adapter, drv, adapter->netdev, "Trigger reset is on\n"); - ena_dump_stats_to_dmesg(adapter); + + if (adapter->reset_reason != ENA_REGS_RESET_NORMAL) + ena_dump_stats_to_dmesg(adapter); + queue_work(ena_wq, &adapter->reset_task); return; } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 48cb953e6d801..bdc8f9f07c79c 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -26,7 +26,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 6 -#define DRV_MODULE_GEN_SUBMINOR 0 +#define DRV_MODULE_GEN_SUBMINOR 1 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -296,6 +296,7 @@ struct ena_ring { u16 qid; u16 mtu; u16 sgl_size; + u8 enable_bql; /* The maximum header length the device can handle */ u8 tx_max_header_size; From e00fe2e5aeaa0a3af891d37e5fc59e73a26719b1 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 1 Mar 2022 23:09:51 +0000 Subject: [PATCH 345/737] lustre: update to AmazonFSxLustreClient v2.12.8-1 Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/Makefile.rules | 1 + drivers/staging/lustrefsx/config.h | 217 +- .../lustrefsx/libcfs/include/libcfs/bitmap.h | 3 + .../lustrefsx/libcfs/include/libcfs/curproc.h | 4 + .../lustrefsx/libcfs/include/libcfs/libcfs.h | 78 +- .../libcfs/include/libcfs/libcfs_cpu.h | 102 +- .../libcfs/include/libcfs/libcfs_crypto.h | 113 +- .../libcfs/include/libcfs/libcfs_debug.h | 171 +- .../libcfs/include/libcfs/libcfs_fail.h | 40 +- .../libcfs/include/libcfs/libcfs_prim.h | 1 + .../libcfs/include/libcfs/libcfs_private.h | 11 +- .../libcfs/include/libcfs/libcfs_ptask.h | 121 - .../libcfs/include/libcfs/libcfs_string.h | 2 +- .../libcfs/include/libcfs/libcfs_time.h | 81 - .../libcfs/include/libcfs/linux/libcfs.h | 150 - .../libcfs/include/libcfs/linux/linux-cpu.h | 57 +- .../include/libcfs/linux/linux-crypto.h | 5 - .../libcfs/include/libcfs/linux/linux-fs.h | 21 +- .../libcfs/include/libcfs/linux/linux-hash.h | 247 ++ .../libcfs/include/libcfs/linux/linux-mem.h | 8 - .../libcfs/include/libcfs/linux/linux-misc.h | 64 +- .../libcfs/include/libcfs/linux/linux-time.h | 164 +- .../libcfs/include/libcfs/linux/linux-wait.h | 568 +++ .../libcfs/include/libcfs/util/hash.h | 103 + 
.../libcfs/include/libcfs/util/ioctl.h | 4 +- .../libcfs/include/libcfs/util/parser.h | 4 +- .../libcfs/include/libcfs/util/string.h | 11 +- .../staging/lustrefsx/libcfs/libcfs/Makefile | 5 +- .../staging/lustrefsx/libcfs/libcfs/debug.c | 385 +- .../lustrefsx/libcfs/libcfs/libcfs_cpu.c | 1180 +++++- .../lustrefsx/libcfs/libcfs/libcfs_mem.c | 13 +- .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 478 --- .../lustrefsx/libcfs/libcfs/libcfs_string.c | 50 +- .../libcfs/libcfs/linux/linux-crypto-adler.c | 2 + .../libcfs/libcfs/linux/linux-crypto-crc32.c | 3 + .../libcfs/linux/linux-crypto-crc32c-pclmul.c | 3 + .../libcfs/linux/linux-crypto-crc32pclmul.c | 4 + .../libcfs/libcfs/linux/linux-crypto.c | 70 +- .../libcfs/libcfs/linux/linux-curproc.c | 23 +- .../libcfs/libcfs/linux/linux-debug.c | 54 +- .../libcfs/libcfs/linux/linux-hash.c | 57 + .../libcfs/libcfs/linux/linux-module.c | 4 +- .../libcfs/libcfs/linux/linux-prim.c | 60 +- .../libcfs/libcfs/linux/linux-tracefile.c | 1 + .../libcfs/libcfs/linux/linux-wait.c | 115 + .../staging/lustrefsx/libcfs/libcfs/module.c | 552 ++- .../lustrefsx/libcfs/libcfs/tracefile.c | 89 +- .../lustrefsx/libcfs/libcfs/tracefile.h | 3 +- .../lustrefsx/libcfs/libcfs/util/l_ioctl.c | 4 +- .../lustrefsx/libcfs/libcfs/util/nidstrings.c | 6 +- .../lustrefsx/libcfs/libcfs/util/param.c | 10 +- .../lustrefsx/libcfs/libcfs/util/parser.c | 67 +- .../lustrefsx/libcfs/libcfs/util/string.c | 122 +- .../lustrefsx/libcfs/libcfs/watchdog.c | 7 +- .../lustrefsx/libcfs/libcfs/workitem.c | 9 +- .../staging/lustrefsx/lnet/include/cyaml.h | 2 +- .../staging/lustrefsx/lnet/include/lnet/api.h | 8 +- .../lustrefsx/lnet/include/lnet/lib-lnet.h | 275 +- .../lustrefsx/lnet/include/lnet/lib-types.h | 582 ++- .../lustrefsx/lnet/include/lnet/socklnd.h | 14 +- .../include/uapi/linux/lnet/libcfs_debug.h | 151 + .../include/uapi/linux/lnet}/libcfs_ioctl.h | 24 +- .../lib-dlc.h => uapi/linux/lnet/lnet-dlc.h} | 77 +- .../types.h => uapi/linux/lnet/lnet-types.h} | 179 +- .../include/{ => uapi/linux}/lnet/lnetctl.h | 36 +- .../include/{ => uapi/linux}/lnet/lnetst.h | 10 +- .../include/{ => uapi/linux}/lnet/nidstr.h | 14 +- .../lnet.h => uapi/linux/lnet/socklnd.h} | 24 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 926 ++--- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 426 ++- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 1037 +++--- .../lnet/klnds/o2iblnd/o2iblnd_modparams.c | 84 +- .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 782 ++-- .../lustrefsx/lnet/klnds/socklnd/socklnd.h | 333 +- .../lustrefsx/lnet/klnds/socklnd/socklnd_cb.c | 1107 +++--- .../lnet/klnds/socklnd/socklnd_lib.c | 116 +- .../lnet/klnds/socklnd/socklnd_modparams.c | 4 +- .../lnet/klnds/socklnd/socklnd_proto.c | 87 +- .../staging/lustrefsx/lnet/lnet/acceptor.c | 10 +- drivers/staging/lustrefsx/lnet/lnet/api-ni.c | 1678 +++++++-- drivers/staging/lustrefsx/lnet/lnet/config.c | 66 +- drivers/staging/lustrefsx/lnet/lnet/lib-eq.c | 2 - drivers/staging/lustrefsx/lnet/lnet/lib-md.c | 2 +- .../staging/lustrefsx/lnet/lnet/lib-move.c | 3012 +++++++++++++--- drivers/staging/lustrefsx/lnet/lnet/lib-msg.c | 699 +++- drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c | 2 +- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 28 +- drivers/staging/lustrefsx/lnet/lnet/lo.c | 2 + drivers/staging/lustrefsx/lnet/lnet/module.c | 51 +- .../staging/lustrefsx/lnet/lnet/net_fault.c | 198 +- .../staging/lustrefsx/lnet/lnet/nidstrings.c | 4 +- drivers/staging/lustrefsx/lnet/lnet/peer.c | 3172 +++++++++++++++-- drivers/staging/lustrefsx/lnet/lnet/router.c | 436 ++- 
.../staging/lustrefsx/lnet/lnet/router_proc.c | 107 +- .../lustrefsx/lnet/selftest/brw_test.c | 107 +- .../staging/lustrefsx/lnet/selftest/conctl.c | 784 ++-- .../staging/lustrefsx/lnet/selftest/conrpc.c | 335 +- .../staging/lustrefsx/lnet/selftest/conrpc.h | 46 +- .../staging/lustrefsx/lnet/selftest/console.c | 444 +-- .../staging/lustrefsx/lnet/selftest/console.h | 69 +- .../lustrefsx/lnet/selftest/framework.c | 412 +-- .../staging/lustrefsx/lnet/selftest/module.c | 129 +- .../lustrefsx/lnet/selftest/ping_test.c | 52 +- drivers/staging/lustrefsx/lnet/selftest/rpc.c | 1218 +++---- drivers/staging/lustrefsx/lnet/selftest/rpc.h | 182 +- .../lustrefsx/lnet/selftest/selftest.h | 246 +- .../staging/lustrefsx/lnet/selftest/timer.c | 33 +- .../staging/lustrefsx/lnet/selftest/timer.h | 10 +- .../lustrefsx/lustre/fid/fid_handler.c | 64 +- .../lustrefsx/lustre/fid/fid_internal.h | 11 +- .../staging/lustrefsx/lustre/fid/fid_lib.c | 1 - .../lustrefsx/lustre/fid/fid_request.c | 114 +- .../staging/lustrefsx/lustre/fid/fid_store.c | 2 +- .../staging/lustrefsx/lustre/fid/lproc_fid.c | 212 +- .../staging/lustrefsx/lustre/fld/fld_cache.c | 23 +- .../lustrefsx/lustre/fld/fld_handler.c | 149 +- .../staging/lustrefsx/lustre/fld/fld_index.c | 133 +- .../lustrefsx/lustre/fld/fld_internal.h | 18 +- .../lustrefsx/lustre/fld/fld_request.c | 418 ++- .../staging/lustrefsx/lustre/fld/lproc_fld.c | 65 +- .../lustrefsx/lustre/include/cl_object.h | 214 +- .../lustrefsx/lustre/include/dt_object.h | 88 +- .../lustrefsx/lustre/include/llog_swab.h | 2 +- .../lustrefsx/lustre/include/lprocfs_status.h | 315 +- .../lustrefsx/lustre/include/lu_object.h | 194 +- .../lustrefsx/lustre/include/lu_target.h | 69 +- .../lustre/include/lustre/ll_fiemap.h | 38 +- .../include/lustre/lustre_barrier_user.h | 53 +- .../lustre/include/lustre/lustre_lfsck_user.h | 214 +- .../lustre/include/lustre/lustre_user.h | 1639 +-------- .../lustre/include/lustre/lustreapi.h | 616 ++-- .../lustrefsx/lustre/include/lustre_acl.h | 2 +- .../lustrefsx/lustre/include/lustre_barrier.h | 2 +- .../lustrefsx/lustre/include/lustre_compat.h | 205 +- .../lustrefsx/lustre/include/lustre_disk.h | 11 +- .../lustrefsx/lustre/include/lustre_dlm.h | 311 +- .../lustre/include/lustre_dlm_flags.h | 73 +- .../lustrefsx/lustre/include/lustre_eacl.h | 2 +- .../include/{lustre => }/lustre_errno.h | 0 .../lustrefsx/lustre/include/lustre_export.h | 148 +- .../lustrefsx/lustre/include/lustre_fid.h | 42 +- .../lustrefsx/lustre/include/lustre_fld.h | 19 +- .../lustrefsx/lustre/include/lustre_ha.h | 2 +- .../lustrefsx/lustre/include/lustre_idmap.h | 2 +- .../lustrefsx/lustre/include/lustre_import.h | 102 +- .../lustre/include/lustre_kernelcomm.h | 4 +- .../lustrefsx/lustre/include/lustre_lfsck.h | 10 +- .../lustrefsx/lustre/include/lustre_lib.h | 38 +- .../lustrefsx/lustre/include/lustre_linkea.h | 2 +- .../lustrefsx/lustre/include/lustre_lmv.h | 43 +- .../lustrefsx/lustre/include/lustre_log.h | 17 +- .../lustrefsx/lustre/include/lustre_mdc.h | 100 +- .../lustrefsx/lustre/include/lustre_mds.h | 29 +- .../lustrefsx/lustre/include/lustre_net.h | 209 +- .../lustrefsx/lustre/include/lustre_nodemap.h | 13 +- .../lustrefsx/lustre/include/lustre_nrs_tbf.h | 64 +- .../lustrefsx/lustre/include/lustre_obdo.h | 2 +- .../lustre_osc.h} | 598 +++- .../lustre/include/lustre_patchless_compat.h | 20 - .../lustrefsx/lustre/include/lustre_quota.h | 29 +- .../lustre/include/lustre_req_layout.h | 20 +- .../lustrefsx/lustre/include/lustre_scrub.h | 375 ++ .../lustrefsx/lustre/include/lustre_sec.h | 21 +- 
.../lustrefsx/lustre/include/lustre_swab.h | 8 +- .../lustrefsx/lustre/include/lustre_update.h | 5 +- .../lustrefsx/lustre/include/md_object.h | 287 +- .../staging/lustrefsx/lustre/include/obd.h | 211 +- .../lustrefsx/lustre/include/obd_cksum.h | 153 +- .../lustrefsx/lustre/include/obd_class.h | 1269 ++++--- .../lustrefsx/lustre/include/obd_support.h | 130 +- .../lustrefsx/lustre/include/obj_update.h | 2 +- .../lustrefsx/lustre/include/seq_range.h | 2 +- .../uapi/linux/lustre/lustre_barrier_user.h | 74 + .../uapi/linux/{ => lustre}/lustre_cfg.h | 67 +- .../uapi/linux/{ => lustre}/lustre_disk.h | 36 +- .../uapi/linux/{ => lustre}/lustre_fid.h | 7 +- .../include/uapi/linux/lustre/lustre_fiemap.h | 72 + .../{ => uapi/linux}/lustre/lustre_idl.h | 1017 +++--- .../uapi/linux/{ => lustre}/lustre_ioctl.h | 27 +- .../linux/lustre/lustre_kernelcomm.h} | 15 +- .../uapi/linux/lustre/lustre_lfsck_user.h | 238 ++ .../{ => uapi/linux/lustre}/lustre_log_user.h | 3 +- .../uapi/linux/{ => lustre}/lustre_ostid.h | 14 +- .../uapi/linux/{ => lustre}/lustre_param.h | 0 .../include/uapi/linux/lustre/lustre_user.h | 2366 ++++++++++++ .../{ => uapi/linux/lustre}/lustre_ver.h | 6 - .../lustrefsx/lustre/include/upcall_cache.h | 10 +- .../lustrefsx/lustre/ldlm/interval_tree.c | 7 +- .../lustrefsx/lustre/ldlm/ldlm_extent.c | 182 +- .../lustrefsx/lustre/ldlm/ldlm_flock.c | 39 +- .../lustrefsx/lustre/ldlm/ldlm_inodebits.c | 441 ++- .../lustrefsx/lustre/ldlm/ldlm_internal.h | 91 +- .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 715 ++-- .../staging/lustrefsx/lustre/ldlm/ldlm_lock.c | 893 +++-- .../lustrefsx/lustre/ldlm/ldlm_lockd.c | 1227 ++++--- .../lustrefsx/lustre/ldlm/ldlm_plain.c | 36 +- .../staging/lustrefsx/lustre/ldlm/ldlm_pool.c | 374 +- .../lustrefsx/lustre/ldlm/ldlm_request.c | 791 ++-- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 438 ++- .../staging/lustrefsx/lustre/llite/Makefile | 2 +- .../staging/lustrefsx/lustre/llite/dcache.c | 3 +- drivers/staging/lustrefsx/lustre/llite/dir.c | 826 +++-- drivers/staging/lustrefsx/lustre/llite/file.c | 2010 +++++++---- .../staging/lustrefsx/lustre/llite/glimpse.c | 84 +- .../lustrefsx/lustre/llite/lcommon_cl.c | 19 +- .../lustrefsx/lustre/llite/lcommon_misc.c | 5 +- .../lustrefsx/lustre/llite/llite_internal.h | 192 +- .../lustrefsx/lustre/llite/llite_lib.c | 915 ++--- .../lustrefsx/lustre/llite/llite_mmap.c | 23 +- .../lustrefsx/lustre/llite/llite_nfs.c | 2 +- .../lustrefsx/lustre/llite/lproc_llite.c | 1307 +++---- .../staging/lustrefsx/lustre/llite/namei.c | 603 +++- .../lustrefsx/lustre/llite/range_lock.c | 5 +- drivers/staging/lustrefsx/lustre/llite/rw.c | 100 +- drivers/staging/lustrefsx/lustre/llite/rw26.c | 162 +- .../lustrefsx/lustre/llite/statahead.c | 181 +- .../staging/lustrefsx/lustre/llite/super25.c | 41 +- .../staging/lustrefsx/lustre/llite/vvp_dev.c | 311 +- .../lustrefsx/lustre/llite/vvp_internal.h | 28 +- .../staging/lustrefsx/lustre/llite/vvp_io.c | 393 +- .../staging/lustrefsx/lustre/llite/vvp_lock.c | 86 - .../lustrefsx/lustre/llite/vvp_object.c | 18 +- .../staging/lustrefsx/lustre/llite/vvp_page.c | 39 +- .../staging/lustrefsx/lustre/llite/xattr.c | 86 +- .../staging/lustrefsx/lustre/llite/xattr26.c | 32 +- .../lustrefsx/lustre/llite/xattr_cache.c | 3 +- .../lustrefsx/lustre/llite/xattr_security.c | 33 + .../staging/lustrefsx/lustre/lmv/lmv_fld.c | 1 - .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 245 +- .../lustrefsx/lustre/lmv/lmv_internal.h | 89 +- .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 1622 +++++---- 
.../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 121 +- drivers/staging/lustrefsx/lustre/lov/Makefile | 4 +- .../lustrefsx/lustre/lov/lov_cl_internal.h | 296 +- .../staging/lustrefsx/lustre/lov/lov_dev.c | 581 ++- drivers/staging/lustrefsx/lustre/lov/lov_ea.c | 84 +- .../lustrefsx/lustre/lov/lov_internal.h | 53 +- drivers/staging/lustrefsx/lustre/lov/lov_io.c | 990 +++-- .../staging/lustrefsx/lustre/lov/lov_lock.c | 128 +- .../staging/lustrefsx/lustre/lov/lov_merge.c | 2 +- .../staging/lustrefsx/lustre/lov/lov_obd.c | 615 ++-- .../staging/lustrefsx/lustre/lov/lov_object.c | 948 +++-- .../staging/lustrefsx/lustre/lov/lov_offset.c | 153 +- .../staging/lustrefsx/lustre/lov/lov_pack.c | 106 +- .../staging/lustrefsx/lustre/lov/lov_page.c | 44 +- .../staging/lustrefsx/lustre/lov/lov_pool.c | 32 +- .../lustrefsx/lustre/lov/lov_request.c | 308 +- .../staging/lustrefsx/lustre/lov/lovsub_dev.c | 108 +- .../lustrefsx/lustre/lov/lovsub_lock.c | 82 - .../lustrefsx/lustre/lov/lovsub_object.c | 95 +- .../lustrefsx/lustre/lov/lovsub_page.c | 70 - .../staging/lustrefsx/lustre/lov/lproc_lov.c | 272 +- drivers/staging/lustrefsx/lustre/mdc/Makefile | 2 +- .../staging/lustrefsx/lustre/mdc/lproc_mdc.c | 511 ++- .../lustrefsx/lustre/mdc/mdc_changelog.c | 342 +- .../staging/lustrefsx/lustre/mdc/mdc_dev.c | 1564 ++++++++ .../lustrefsx/lustre/mdc/mdc_internal.h | 28 +- .../staging/lustrefsx/lustre/mdc/mdc_lib.c | 185 +- .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 353 +- .../staging/lustrefsx/lustre/mdc/mdc_reint.c | 183 +- .../lustrefsx/lustre/mdc/mdc_request.c | 770 ++-- .../staging/lustrefsx/lustre/mgc/lproc_mgc.c | 52 +- .../lustrefsx/lustre/mgc/mgc_internal.h | 7 +- .../lustrefsx/lustre/mgc/mgc_request.c | 175 +- .../lustrefsx/lustre/obdclass/Makefile | 14 +- .../staging/lustrefsx/lustre/obdclass/acl.c | 183 +- .../lustrefsx/lustre/obdclass/cl_internal.h | 2 +- .../staging/lustrefsx/lustre/obdclass/cl_io.c | 263 +- .../lustrefsx/lustre/obdclass/cl_lock.c | 11 +- .../lustrefsx/lustre/obdclass/cl_object.c | 105 +- .../lustrefsx/lustre/obdclass/cl_page.c | 114 +- .../lustrefsx/lustre/obdclass/class_obd.c | 301 +- .../lustrefsx/lustre/obdclass/dt_object.c | 671 ++-- .../lustrefsx/lustre/obdclass/genops.c | 931 ++--- .../staging/lustrefsx/lustre/obdclass/idmap.c | 26 +- .../lustrefsx/lustre/obdclass/integrity.c | 277 ++ .../staging/lustrefsx/lustre/obdclass/jobid.c | 575 +++ .../lustrefsx/lustre/obdclass/kernelcomm.c | 11 +- .../lustrefsx/lustre/obdclass/linkea.c | 12 +- .../lustre/obdclass/linux/linux-module.c | 582 --- .../lustre/obdclass/linux/linux-obdo.c | 157 - .../lustre/obdclass/linux/linux-sysctl.c | 190 - .../staging/lustrefsx/lustre/obdclass/llog.c | 311 +- .../lustrefsx/lustre/obdclass/llog_cat.c | 689 ++-- .../lustrefsx/lustre/obdclass/llog_internal.h | 11 +- .../lustrefsx/lustre/obdclass/llog_ioctl.c | 296 +- .../lustrefsx/lustre/obdclass/llog_obd.c | 182 +- .../lustrefsx/lustre/obdclass/llog_osd.c | 112 +- .../lustrefsx/lustre/obdclass/llog_swab.c | 306 +- .../lustrefsx/lustre/obdclass/llog_test.c | 630 ++-- .../lustrefsx/lustre/obdclass/local_storage.c | 10 +- .../lustre/obdclass/lprocfs_jobstats.c | 63 +- .../lustre/obdclass/lprocfs_status.c | 1099 +++--- .../lustre/obdclass/lprocfs_status_server.c | 405 ++- .../lustrefsx/lustre/obdclass/lu_object.c | 461 ++- .../lustrefsx/lustre/obdclass/lu_ref.c | 172 +- .../lustre/obdclass/lustre_handles.c | 85 +- .../lustrefsx/lustre/obdclass/lustre_peer.c | 105 +- .../lustrefsx/lustre/obdclass/md_attrs.c | 27 +- .../lustrefsx/lustre/obdclass/obd_cksum.c | 149 
+ .../lustrefsx/lustre/obdclass/obd_config.c | 583 ++- .../lustrefsx/lustre/obdclass/obd_mount.c | 100 +- .../lustre/obdclass/obd_mount_server.c | 150 +- .../lustrefsx/lustre/obdclass/obd_sysfs.c | 535 +++ .../staging/lustrefsx/lustre/obdclass/obdo.c | 172 +- .../lustrefsx/lustre/obdclass/obdo_server.c | 156 + .../staging/lustrefsx/lustre/obdclass/scrub.c | 1216 +++++++ .../lustrefsx/lustre/obdclass/statfs_pack.c | 36 +- .../lustrefsx/lustre/obdclass/upcall_cache.c | 21 +- .../staging/lustrefsx/lustre/obdclass/uuid.c | 78 - .../staging/lustrefsx/lustre/obdecho/echo.c | 614 +++- .../lustrefsx/lustre/obdecho/echo_client.c | 87 +- .../lustrefsx/lustre/obdecho/echo_internal.h | 1 + .../staging/lustrefsx/lustre/osc/lproc_osc.c | 676 ++-- .../staging/lustrefsx/lustre/osc/osc_cache.c | 366 +- .../staging/lustrefsx/lustre/osc/osc_dev.c | 58 +- .../lustrefsx/lustre/osc/osc_internal.h | 191 +- drivers/staging/lustrefsx/lustre/osc/osc_io.c | 290 +- .../staging/lustrefsx/lustre/osc/osc_lock.c | 401 ++- .../staging/lustrefsx/lustre/osc/osc_object.c | 167 +- .../staging/lustrefsx/lustre/osc/osc_page.c | 103 +- .../staging/lustrefsx/lustre/osc/osc_quota.c | 21 +- .../lustrefsx/lustre/osc/osc_request.c | 1248 ++++--- .../staging/lustrefsx/lustre/ptlrpc/client.c | 308 +- .../staging/lustrefsx/lustre/ptlrpc/errno.c | 33 +- .../staging/lustrefsx/lustre/ptlrpc/events.c | 46 +- .../lustrefsx/lustre/ptlrpc/gss/gss_api.h | 14 +- .../lustrefsx/lustre/ptlrpc/gss/gss_bulk.c | 1 - .../lustre/ptlrpc/gss/gss_cli_upcall.c | 148 +- .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.c | 177 +- .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.h | 88 +- .../lustre/ptlrpc/gss/gss_generic_token.c | 1 - .../lustre/ptlrpc/gss/gss_internal.h | 47 +- .../lustrefsx/lustre/ptlrpc/gss/gss_keyring.c | 249 +- .../lustrefsx/lustre/ptlrpc/gss/gss_krb5.h | 2 +- .../lustre/ptlrpc/gss/gss_krb5_mech.c | 1268 +++---- .../lustre/ptlrpc/gss/gss_mech_switch.c | 62 +- .../lustre/ptlrpc/gss/gss_null_mech.c | 4 +- .../lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c | 2 +- .../lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c | 309 +- .../lustre/ptlrpc/gss/gss_svc_upcall.c | 180 +- .../lustrefsx/lustre/ptlrpc/gss/lproc_gss.c | 41 +- .../lustrefsx/lustre/ptlrpc/gss/sec_gss.c | 45 +- .../staging/lustrefsx/lustre/ptlrpc/import.c | 735 ++-- .../staging/lustrefsx/lustre/ptlrpc/layout.c | 447 ++- .../lustrefsx/lustre/ptlrpc/llog_client.c | 36 - .../lustrefsx/lustre/ptlrpc/llog_server.c | 51 - .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 448 ++- .../staging/lustrefsx/lustre/ptlrpc/niobuf.c | 100 +- .../lustre/ptlrpc/nodemap_internal.h | 2 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_crr.c | 50 +- .../lustrefsx/lustre/ptlrpc/nrs_delay.c | 44 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_orr.c | 85 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c | 790 +++- .../lustrefsx/lustre/ptlrpc/pack_generic.c | 700 ++-- .../staging/lustrefsx/lustre/ptlrpc/pers.c | 29 +- .../staging/lustrefsx/lustre/ptlrpc/pinger.c | 447 +-- .../lustrefsx/lustre/ptlrpc/ptlrpc_internal.h | 18 +- .../staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c | 14 +- .../staging/lustrefsx/lustre/ptlrpc/recover.c | 73 +- drivers/staging/lustrefsx/lustre/ptlrpc/sec.c | 173 +- .../lustrefsx/lustre/ptlrpc/sec_bulk.c | 79 +- .../lustrefsx/lustre/ptlrpc/sec_config.c | 4 +- .../staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 2 +- .../staging/lustrefsx/lustre/ptlrpc/sec_gc.c | 107 +- .../lustrefsx/lustre/ptlrpc/sec_lproc.c | 115 +- .../lustrefsx/lustre/ptlrpc/sec_null.c | 17 +- .../lustrefsx/lustre/ptlrpc/sec_plain.c | 27 +- 
.../staging/lustrefsx/lustre/ptlrpc/service.c | 296 +- .../staging/lustrefsx/lustre/ptlrpc/wirehdr.c | 6 +- .../lustrefsx/lustre/ptlrpc/wiretest.c | 794 +++-- .../staging/lustrefsx/lustre/target/barrier.c | 24 +- .../lustrefsx/lustre/target/out_handler.c | 109 +- .../staging/lustrefsx/lustre/target/out_lib.c | 27 +- .../staging/lustrefsx/lustre/target/tgt_fmd.c | 363 ++ .../lustrefsx/lustre/target/tgt_grant.c | 257 +- .../lustrefsx/lustre/target/tgt_handler.c | 775 +++- .../lustrefsx/lustre/target/tgt_internal.h | 18 +- .../lustrefsx/lustre/target/tgt_lastrcvd.c | 31 +- .../lustrefsx/lustre/target/tgt_main.c | 350 +- .../lustrefsx/lustre/target/update_records.c | 2 +- .../lustrefsx/lustre/target/update_recovery.c | 2 +- .../lustrefsx/lustre/target/update_trans.c | 23 +- drivers/staging/lustrefsx/undef.h | 197 +- 384 files changed, 57903 insertions(+), 32508 deletions(-) delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h delete mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h rename drivers/staging/lustrefsx/{libcfs/include/libcfs => lnet/include/uapi/linux/lnet}/libcfs_ioctl.h (88%) rename drivers/staging/lustrefsx/lnet/include/{lnet/lib-dlc.h => uapi/linux/lnet/lnet-dlc.h} (76%) rename drivers/staging/lustrefsx/lnet/include/{lnet/types.h => uapi/linux/lnet/lnet-types.h} (85%) rename drivers/staging/lustrefsx/lnet/include/{ => uapi/linux}/lnet/lnetctl.h (76%) rename drivers/staging/lustrefsx/lnet/include/{ => uapi/linux}/lnet/lnetst.h (99%) rename drivers/staging/lustrefsx/lnet/include/{ => uapi/linux}/lnet/nidstr.h (93%) rename drivers/staging/lustrefsx/lnet/include/{lnet/lnet.h => uapi/linux/lnet/socklnd.h} (74%) rename drivers/staging/lustrefsx/lustre/include/{lustre => }/lustre_errno.h (100%) rename drivers/staging/lustrefsx/lustre/{osc/osc_cl_internal.h => include/lustre_osc.h} (52%) create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_scrub.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_cfg.h (77%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_disk.h (85%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_fid.h (97%) create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h rename drivers/staging/lustrefsx/lustre/include/{ => uapi/linux}/lustre/lustre_idl.h (81%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_ioctl.h (93%) rename drivers/staging/lustrefsx/lustre/include/{uapi_kernelcomm.h => uapi/linux/lustre/lustre_kernelcomm.h} (88%) create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h rename drivers/staging/lustrefsx/lustre/include/{ => 
uapi/linux/lustre}/lustre_log_user.h (97%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_ostid.h (95%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_param.h (100%) create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h rename drivers/staging/lustrefsx/lustre/include/{ => uapi/linux/lustre}/lustre_ver.h (83%) mode change 100755 => 100644 drivers/staging/lustrefsx/lustre/llite/lproc_llite.c delete mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_lock.c delete mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c delete mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_page.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/integrity.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/jobid.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/scrub.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/uuid.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_fmd.c diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules index a0d56e80f2ce7..ce56ffa5576a0 100644 --- a/drivers/staging/lustrefsx/Makefile.rules +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -3,4 +3,5 @@ ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include/uapi ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index fce8b057480b6..69580ddb7b9f3 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -25,17 +25,11 @@ /* extened attributes for ldiskfs */ /* #undef CONFIG_LDISKFS_FS_XATTR */ -/* Max LNET payload */ -#define CONFIG_LNET_MAX_PAYLOAD LNET_MTU - /* enable invariant checking */ /* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ -/* IOCTL Buffer Size */ -#define CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER 8192 - /* kernel has cpu affinity support */ -/* #undef CPU_AFFINITY */ +#define CPU_AFFINITY 1 /* both i_dentry/d_alias uses list */ /* #undef DATA_FOR_LLITE_IS_LIST */ @@ -58,9 +52,15 @@ /* do data checksums */ #define ENABLE_CHECKSUM 1 +/* enable flock by default */ +#define ENABLE_FLOCK 1 + /* Use the Pinger */ #define ENABLE_PINGER 1 +/* aes-sha2 is supported by krb5 */ +/* #undef HAVE_AES_SHA2_SUPPORT */ + /* Define to 1 if you have the header file. 
*/ #define HAVE_ASM_TYPES_H 1 @@ -79,6 +79,12 @@ /* 'bio_integrity_enabled' is available */ /* #undef HAVE_BIO_INTEGRITY_ENABLED */ +/* kernel has bio_integrity_prep_fn */ +/* #undef HAVE_BIO_INTEGRITY_PREP_FN */ + +/* bio_integrity_payload.bip_iter exist */ +#define HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD 1 + /* 'bi_bdev' is available */ /* #undef HAVE_BI_BDEV */ @@ -103,9 +109,18 @@ /* blk_queue_max_segments is defined */ #define HAVE_BLK_QUEUE_MAX_SEGMENTS 1 +/* kernel hash_64() is broken */ +/* #undef HAVE_BROKEN_HASH_64 */ + /* kernel has struct bvec_iter */ #define HAVE_BVEC_ITER 1 +/* struct cache_detail has writers */ +#define HAVE_CACHE_DETAIL_WRITERS 1 + +/* if cache_detail->hash_lock is a spinlock */ +#define HAVE_CACHE_HASH_SPINLOCK 1 + /* cache_head has hlist cache_list */ #define HAVE_CACHE_HEAD_HLIST 1 @@ -118,24 +133,24 @@ /* kernel has clean_bdev_aliases */ /* #undef HAVE_CLEAN_BDEV_ALIASES */ +/* 'clear_and_wake_up_bit' is available */ +#define HAVE_CLEAR_AND_WAKE_UP_BIT 1 + /* have clear_inode */ #define HAVE_CLEAR_INODE 1 /* compat rdma found */ /* #undef HAVE_COMPAT_RDMA */ -/* cpumap_print_to_pagebuf is available */ -#define HAVE_CPUMASK_PRINT_TO_PAGEBUF 1 - /* kernel compiled with CRC32 functions */ #define HAVE_CRC32 1 -/* struct cred has member tgcred */ -/* #undef HAVE_CRED_TGCRED */ - /* crypto hash helper functions are available */ #define HAVE_CRYPTO_HASH_HELPERS 1 +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#define HAVE_CRYPTO_MAX_ALG_NAME_128 1 + /* current_time() has replaced CURRENT_TIME */ #define HAVE_CURRENT_TIME 1 @@ -154,6 +169,9 @@ /* dentry_open uses struct path as first argument */ #define HAVE_DENTRY_OPEN_USE_PATH 1 +/* DES3 enctype is supported by krb5 */ +/* #undef HAVE_DES3_SUPPORT */ + /* direct_IO need 2 arguments */ #define HAVE_DIRECTIO_2ARGS 1 @@ -235,6 +253,9 @@ /* d_delete first parameter declared is not const */ #define HAVE_D_DELETE_CONST const +/* d_hash_and_lookup is exported by the kernel */ +#define HAVE_D_HASH_AND_LOOKUP 1 + /* have d_make_root */ #define HAVE_D_MAKE_ROOT 1 @@ -322,15 +343,15 @@ /* Define to 1 if you have the `gethostbyname' function. 
*/ #define HAVE_GETHOSTBYNAME 1 +/* get_request_key_auth() is available */ +#define HAVE_GET_REQUEST_KEY_AUTH 1 + /* get_user_pages takes 6 arguments */ /* #undef HAVE_GET_USER_PAGES_6ARG */ /* get_user_pages takes gup_flags in arguments */ #define HAVE_GET_USER_PAGES_GUP_FLAGS 1 -/* get_user_pages takes gup_flags in arguments with 7 args */ -/* #undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS */ - /* struct group_info has member gid */ #define HAVE_GROUP_INFO_GID 1 @@ -343,6 +364,9 @@ /* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ /* #undef HAVE_GSS_KRB5_CCACHE_NAME */ +/* '__rhashtable_insert_fast()' returns int */ +/* #undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT */ + /* Define this if you have Heimdal Kerberos libraries */ /* #undef HAVE_HEIMDAL */ @@ -391,6 +415,9 @@ /* if ib_sg_dma_address wrapper exists */ /* #undef HAVE_IB_SG_DMA_ADDRESS */ +/* INIT_LIST_HEAD_RCU exists */ +#define HAVE_INIT_LIST_HEAD_RCU 1 + /* inode_operations .getattr member function can gather advance stats */ #define HAVE_INODEOPS_ENHANCED_GETATTR 1 @@ -415,6 +442,15 @@ /* inode_operations->permission has two args */ #define HAVE_INODE_PERMISION_2ARGS 1 +/* inode times are using timespec64 */ +#define HAVE_INODE_TIMESPEC64 1 + +/* blk_integrity.interval exist */ +/* #undef HAVE_INTERVAL_BLK_INTEGRITY */ + +/* blk_integrity.interval_exp exist */ +#define HAVE_INTERVAL_EXP_BLK_INTEGRITY 1 + /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 @@ -424,6 +460,9 @@ /* have in_compat_syscall */ #define HAVE_IN_COMPAT_SYSCALL 1 +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#define HAVE_IN_DEV_FOR_EACH_IFA_RTNL 1 + /* inode_operations->rename need flags as argument */ #define HAVE_IOPS_RENAME_WITH_FLAGS 1 @@ -463,18 +502,27 @@ /* is_sxid is defined */ #define HAVE_IS_SXID 1 +/* 'iterate_shared' is available */ +#define HAVE_ITERATE_SHARED 1 + /* struct address_space has i_pages */ #define HAVE_I_PAGES 1 /* i_uid_read is present */ #define HAVE_I_UID_READ 1 -/* jiffies_to_timespec64() is available */ -#define HAVE_JIFFIES_TO_TIMESPEC64 1 +/* kallsyms_lookup_name is exported by kernel */ +/* #undef HAVE_KALLSYMS_LOOKUP_NAME */ /* kernel_locked is defined */ /* #undef HAVE_KERNEL_LOCKED */ +/* 'kernel_param_[un]lock' is available */ +#define HAVE_KERNEL_PARAM_LOCK 1 + +/* 'struct kernel_param_ops' is available */ +#define HAVE_KERNEL_PARAM_OPS 1 + /* kernel_setsockopt still in use */ /* #undef HAVE_KERNEL_SETSOCKOPT */ @@ -493,6 +541,9 @@ /* key_type->instantiate has two args */ #define HAVE_KEY_TYPE_INSTANTIATE_2ARGS 1 +/* key.usage is of type refcount_t */ +#define HAVE_KEY_USAGE_REFCOUNT 1 + /* ki_left exist */ /* #undef HAVE_KIOCB_KI_LEFT */ @@ -521,12 +572,15 @@ available */ /* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ +/* kset_find_obj is exported by the kernel */ +#define HAVE_KSET_FIND_OBJ 1 + +/* kernel has kstrtobool_from_user */ +#define HAVE_KSTRTOBOOL_FROM_USER 1 + /* kernel has kstrtoul */ #define HAVE_KSTRTOUL 1 -/* kernel has ksys_close */ -#define HAVE_KSYS_CLOSE 1 - /* kthread_worker found */ /* #undef HAVE_KTHREAD_WORK */ @@ -554,6 +608,9 @@ /* 'ktime_get_ts64' is available */ #define HAVE_KTIME_GET_TS64 1 +/* 'ktime_ms_delta' is available */ +#define HAVE_KTIME_MS_DELTA 1 + /* 'ktime_to_timespec64' is available */ #define HAVE_KTIME_TO_TIMESPEC64 1 @@ -581,21 +638,12 @@ /* readline library is available */ /* #undef HAVE_LIBREADLINE */ -/* Define to 1 if you have the header file. 
*/ -#define HAVE_LINUX_RANDOM_H 1 +/* linux/rhashtable.h is present */ +#define HAVE_LINUX_RHASHTABLE_H 1 /* if linux/selinux.h exists */ /* #undef HAVE_LINUX_SELINUX_IS_ENABLED */ -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_UNISTD_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_VERSION_H 1 - /* lock_manager_operations has lm_compare_owner */ /* #undef HAVE_LM_COMPARE_OWNER */ @@ -605,6 +653,9 @@ /* kernel has locks_lock_file_wait */ #define HAVE_LOCKS_LOCK_FILE_WAIT 1 +/* lookup_user_key() is available */ +#define HAVE_LOOKUP_USER_KEY 1 + /* kernel has LOOP_CTL_GET_FREE */ #define HAVE_LOOP_CTL_GET_FREE 1 @@ -633,6 +684,9 @@ /* kernel module loading is possible */ #define HAVE_MODULE_LOADING_SUPPORT 1 +/* locking module param is supported */ +/* #undef HAVE_MODULE_PARAM_LOCKING */ + /* Define to 1 if you have the `name_to_handle_at' function. */ #define HAVE_NAME_TO_HANDLE_AT 1 @@ -642,15 +696,24 @@ /* cancel_dirty_page with one arguement is available */ #define HAVE_NEW_CANCEL_DIRTY_PAGE 1 +/* DEFINE_TIMER uses only 2 arguements */ +#define HAVE_NEW_DEFINE_TIMER 1 + /* 'kernel_write' aligns with read/write helpers */ #define HAVE_NEW_KERNEL_WRITE 1 /* NR_UNSTABLE_NFS is still in use. */ /* #undef HAVE_NR_UNSTABLE_NFS */ +/* ns_to_timespec64() is available */ +#define HAVE_NS_TO_TIMESPEC64 1 + /* with oldsize */ /* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ +/* openssl-devel is present */ +/* #undef HAVE_OPENSSL_GETSEPOL */ + /* OpenSSL HMAC functions needed for SSK */ /* #undef HAVE_OPENSSL_SSK */ @@ -675,6 +738,9 @@ /* posix_acl_valid takes struct user_namespace */ #define HAVE_POSIX_ACL_VALID_USER_NS 1 +/* 'prepare_to_wait_event' is available */ +#define HAVE_PREPARE_TO_WAIT_EVENT 1 + /* struct proc_ops exists */ #define HAVE_PROC_OPS 1 @@ -687,12 +753,18 @@ /* inode->i_nlink is protected from direct modification */ #define HAVE_PROTECT_I_NLINK 1 +/* 'PTR_ERR_OR_ZERO' exist */ +#define HAVE_PTR_ERR_OR_ZERO 1 + /* have quota64 */ #define HAVE_QUOTA64 1 /* radix_tree_exceptional_entry exist */ /* #undef HAVE_RADIX_EXCEPTION_ENTRY */ +/* rdma_connect_locked is defined */ +#define HAVE_RDMA_CONNECT_LOCKED 1 + /* rdma_create_id wants 4 args */ /* #undef HAVE_RDMA_CREATE_ID_4ARG */ @@ -702,15 +774,24 @@ /* rdma_reject has 4 arguments */ #define HAVE_RDMA_REJECT_4ARGS 1 -/* reinit_completion is exist */ -#define HAVE_REINIT_COMPLETION 1 - /* kernel export remove_from_page_cache */ /* #undef HAVE_REMOVE_FROM_PAGE_CACHE */ /* remove_proc_subtree is defined */ #define HAVE_REMOVE_PROC_SUBTREE 1 +/* rhashtable_lookup() is available */ +#define HAVE_RHASHTABLE_LOOKUP 1 + +/* rhashtable_lookup_get_insert_fast() is available */ +#define HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST 1 + +/* struct rhltable exist */ +#define HAVE_RHLTABLE 1 + +/* save_stack_trace_tsk is exported */ +/* #undef HAVE_SAVE_STACK_TRACE_TSK */ + /* Have sa_spill_alloc in ZFS */ /* #undef HAVE_SA_SPILL_ALLOC */ @@ -735,6 +816,9 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ /* #undef HAVE_SECURITY_IINITSEC_QSTR */ +/* security_inode_listsecurity() is available/exported */ +#define HAVE_SECURITY_INODE_LISTSECURITY 1 + /* security_release_secctx has 1 arg. 
*/ /* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ @@ -778,36 +862,27 @@ /* Have spa_maxblocksize in ZFS */ /* #undef HAVE_SPA_MAXBLOCKSIZE */ -/* spinlock_t is defined */ -/* #undef HAVE_SPINLOCK_T */ - /* struct stacktrace_ops exists */ /* #undef HAVE_STACKTRACE_OPS */ /* stacktrace_ops.warning is exist */ /* #undef HAVE_STACKTRACE_WARNING */ -/* stack_trace_print() exists */ -#define HAVE_STACK_TRACE_PRINT 1 - /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 +/* stringhash.h is present */ +#define HAVE_STRINGHASH 1 + /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 -/* Define to 1 if you have the `strlcat' function. */ -/* #undef HAVE_STRLCAT */ - -/* Define to 1 if you have the `strlcpy' function. */ -/* #undef HAVE_STRLCPY */ - /* Define to 1 if you have the `strnlen' function. */ #define HAVE_STRNLEN 1 @@ -835,9 +910,6 @@ /* ctl_table has ctl_name field */ /* #undef HAVE_SYSCTL_CTLNAME */ -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IOCTL_H 1 - /* Define to 1 if you have . */ #define HAVE_SYS_QUOTA_H 1 @@ -868,9 +940,6 @@ /* 'timespec64_to_ktime' is available */ #define HAVE_TIMESPEC64_TO_KTIME 1 -/* have_time_t */ -/* #undef HAVE_TIME_T */ - /* topology_sibling_cpumask is available */ #define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 @@ -919,9 +988,18 @@ /* 'struct vm_operations' remove struct vm_area_struct argument */ #define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 +/* wait_bit.h is present */ +#define HAVE_WAIT_BIT_HEADER_H 1 + /* 'wait_queue_entry_t' is available */ #define HAVE_WAIT_QUEUE_ENTRY 1 +/* linux wait_queue_head_t list_head is name head */ +#define HAVE_WAIT_QUEUE_ENTRY_LIST 1 + +/* 'wait_var_event' is available */ +#define HAVE_WAIT_VAR_EVENT 1 + /* flags field exist */ #define HAVE_XATTR_HANDLER_FLAGS 1 @@ -946,9 +1024,18 @@ /* Have zap_remove_by_dnode() in ZFS */ /* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ +/* Have inode_timespec_t */ +/* #undef HAVE_ZFS_INODE_TIMESPEC */ + +/* Have multihost protection in ZFS */ +/* #undef HAVE_ZFS_MULTIHOST */ + /* Enable zfs osd */ /* #undef HAVE_ZFS_OSD */ +/* Have zfs_refcount_add */ +/* #undef HAVE_ZFS_REFCOUNT_ADD */ + /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ @@ -975,13 +1062,13 @@ #define LUSTRE_MAJOR 2 /* Second number in the Lustre version */ -#define LUSTRE_MINOR 10 +#define LUSTRE_MINOR 12 /* Third number in the Lustre version */ #define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.10.8-10" +#define LUSTRE_VERSION_STRING "2.12.8-1" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -998,6 +1085,9 @@ /* need pclmulqdq based crc32 */ /* #undef NEED_CRC32_ACCEL */ +/* 'ktime_get_ns' is not available */ +/* #undef NEED_KTIME_GET_NS */ + /* 'ktime_get_real_ns' is not available */ /* #undef NEED_KTIME_GET_REAL_NS */ @@ -1014,7 +1104,7 @@ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Lustre 2.10.8-10" +#define PACKAGE_STRING "Lustre 2.12.8-1" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -1023,14 +1113,11 @@ #define PACKAGE_URL "" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.10.8-10" +#define PACKAGE_VERSION "2.12.8-1" /* name of parallel fsck program */ #define PFSCK "fsck" -/* proc handler methods use __user */ -/* #undef PROC_HANDLER_USE_USER_ATTR */ - /* enable randomly alloc failure */ #define RANDOM_FAIL_ALLOC 1 @@ -1067,16 +1154,16 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.10.8-10" +#define VERSION "2.12.8-1" /* zfs fix version */ -/* #undef ZFS_FIX */ +#define ZFS_FIX 0 /* zfs major version */ -/* #undef ZFS_MAJOR */ +#define ZFS_MAJOR /* zfs minor version */ -/* #undef ZFS_MINOR */ +#define ZFS_MINOR /* zfs patch version */ -/* #undef ZFS_PATCH */ +#define ZFS_PATCH diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h index 28472601ed4df..1763da296244d 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -32,6 +32,9 @@ #ifndef _LIBCFS_BITMAP_H_ #define _LIBCFS_BITMAP_H_ +#include +#include + struct cfs_bitmap { unsigned int size; unsigned long data[0]; diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h index e9e0cc2109034..0f00c7219e75d 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h @@ -39,6 +39,10 @@ #ifndef __LIBCFS_CURPROC_H__ #define __LIBCFS_CURPROC_H__ +/* check if task is running in compat mode.*/ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + typedef __u32 cfs_cap_t; #define CFS_CAP_CHOWN 0 diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h index f01170c6e1d97..9ae7b8405a94b 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,28 @@ #ifndef __LIBCFS_LIBCFS_H__ #define __LIBCFS_LIBCFS_H__ -#ifdef __KERNEL__ -# include -# include "curproc.h" +#include +#include -#define LIBCFS_VERSION "0.5.0" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "curproc.h" + +#define LIBCFS_VERSION "0.7.1" #define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2))) #define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) @@ -82,15 +99,19 @@ void lc_watchdog_delete(struct lc_watchdog *lcw); #endif #endif +typedef s32 timeout_t; + /* need both kernel and user-land acceptor */ #define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 #define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 -/* - * Drop into debugger, if possible. Implementation is provided by platform. 
- */ - -void cfs_enter_debugger(void); +extern struct blocking_notifier_head libcfs_ioctl_list; +static inline int notifier_from_ioctl_errno(int err) +{ + if (err == -EINVAL) + return NOTIFY_OK; + return notifier_from_errno(err) | NOTIFY_STOP_MASK; +} /* * Defined by platform @@ -111,21 +132,6 @@ unsigned int cfs_rand(void); /* seed the generator */ void cfs_srand(unsigned int, unsigned int); void cfs_get_random_bytes(void *buf, int size); -#endif /* __KERNEL__ */ - -#include -#ifdef __KERNEL__ -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); int libcfs_ioctl(unsigned long cmd, void __user *uparam); @@ -139,12 +145,30 @@ static inline void *__container_of(const void *ptr, unsigned long shift) return (char *)ptr - shift; } -#define container_of0(ptr, type, member) \ +#define container_of0(ptr, type, member) \ ((type *)__container_of((ptr), offsetof(type, member))) -#endif /* __KERNEL__ */ +struct lnet_debugfs_symlink_def { + const char *name; + const char *target; +}; + +void lnet_insert_debugfs(struct ctl_table *table); +void lnet_remove_debugfs(struct ctl_table *table); + +/* helper for sysctl handlers */ +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)); +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); /* atomic-context safe vfree */ +#ifdef HAVE_LIBCFS_VFREE_ATOMIC void libcfs_vfree_atomic(const void *addr); +#else +#define libcfs_vfree_atomic(ptr) vfree(ptr) +#endif #endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h index 9fd28ce749cfe..4620dcc08cf80 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h @@ -13,17 +13,12 @@ * General Public License version 2 for more details (a copy is included * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA - * * GPL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,16 +42,16 @@ * * Example: if there are 8 cores on the system, while creating a CPT * with cpu_npartitions=4: - * core[0, 1] = partition[0], core[2, 3] = partition[1] - * core[4, 5] = partition[2], core[6, 7] = partition[3] + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] * * cpu_npartitions=1: - * core[0, 1, ... 7] = partition[0] + * core[0, 1, ... 7] = partition[0] * * . 
User can also specify CPU partitions by string pattern * * Examples: cpu_partitions="0[0,1], 1[2,3]" - * cpu_partitions="N 0[0-3], 1[4-8]" + * cpu_partitions="N 0[0-3], 1[4-8]" * * The first character "N" means following numbers are numa ID * @@ -76,21 +71,56 @@ #ifndef __LIBCFS_CPU_H__ #define __LIBCFS_CPU_H__ -#ifndef HAVE_LIBCFS_CPT +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SMP + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned int *cpt_distance; + /* spread rotor for NUMA allocator */ + unsigned int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; +#endif /* CONFIG_SMP */ +/** descriptor for CPU partitions */ struct cfs_cpt_table { +#ifdef CONFIG_SMP + /* spread rotor for NUMA allocator */ + unsigned int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned int ctb_distance; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; /* # of CPU partitions */ - int ctb_nparts; - /* cpu mask */ - cpumask_t ctb_mask; - /* node mask */ - nodemask_t ctb_nodemask; - /* version */ - __u64 ctb_version; + int ctb_nparts; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +#else + nodemask_t ctb_nodemask; +#endif /* CONFIG_SMP */ + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; }; -#endif /* !HAVE_LIBCFS_CPT */ - /* any CPU partition */ #define CFS_CPT_ANY (-1) @@ -117,7 +147,7 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len); */ int cfs_cpt_number(struct cfs_cpt_table *cptab); /** - * return number of HW cores or hypter-threadings in a CPU partition \a cpt + * return number of HW cores or hyper-threadings in a CPU partition \a cpt */ int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); /** @@ -147,13 +177,13 @@ int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node); /** * NUMA distance between \a cpt1 and \a cpt2 in \a cptab */ -unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); /** * bind current thread on a CPU-partition \a cpt of \a cptab */ int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); /** - * add \a cpu to CPU partion @cpt of \a cptab, return 1 for success, + * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, * otherwise 0 is returned */ int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); @@ -165,7 +195,6 @@ void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); * add all cpus in \a mask to CPU partition \a cpt * return 1 if successfully set all CPUs, otherwise return 0 */ - int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask); /** @@ -203,15 +232,15 @@ int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); /* * allocate per-cpu-partition data, returned value is an array of pointers, * variable can be indexed by CPU ID. 
- * cptab != NULL: size of array is number of CPU partitions - * cptab == NULL: size of array is number of HW cores + * cptab != NULL: size of array is number of CPU partitions + * cptab == NULL: size of array is number of HW cores */ void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); /* - * destory per-cpu-partition variable + * destroy per-cpu-partition variable */ -void cfs_percpt_free(void *vars); -int cfs_percpt_number(void *vars); +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); #define cfs_percpt_for_each(var, i, vars) \ for (i = 0; i < cfs_percpt_number(vars) && \ @@ -260,16 +289,17 @@ void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); /* lock private lock \a index of \a pcl */ void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); + /* unlock private lock \a index of \a pcl */ void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); -#define CFS_PERCPT_LOCK_KEYS 256 +#define CFS_PERCPT_LOCK_KEYS 256 /* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ #define cfs_percpt_lock_alloc(cptab) \ ({ \ - static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ - struct cfs_percpt_lock *___lk; \ + static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ + struct cfs_percpt_lock *___lk; \ \ if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ ___lk = cfs_percpt_lock_create(cptab, NULL); \ @@ -338,14 +368,6 @@ cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, #define cfs_cpt_for_each(i, cptab) \ for (i = 0; i < cfs_cpt_number(cptab); i++) -#ifndef __read_mostly -# define __read_mostly -#endif - -#ifndef ____cacheline_aligned -#define ____cacheline_aligned -#endif - int cfs_cpu_init(void); void cfs_cpu_fini(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h index ea9234abc7f76..8271306ce6019 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h @@ -38,6 +38,12 @@ struct cfs_crypto_hash_type { unsigned int cht_size; /**< hash digest size */ }; +struct cfs_crypto_crypt_type { + char *cct_name; /**< crypto algorithm name, equal to + * format name for crypto api */ + unsigned int cct_size; /**< crypto key size */ +}; + enum cfs_crypto_hash_alg { CFS_HASH_ALG_NULL = 0, CFS_HASH_ALG_ADLER32, @@ -54,6 +60,13 @@ enum cfs_crypto_hash_alg { CFS_HASH_ALG_UNKNOWN = 0xff }; +enum cfs_crypto_crypt_alg { + CFS_CRYPT_ALG_NULL = 0, + CFS_CRYPT_ALG_AES256_CTR, + CFS_CRYPT_ALG_MAX, + CFS_CRYPT_ALG_UNKNOWN = 0xff +}; + static struct cfs_crypto_hash_type hash_types[] = { [CFS_HASH_ALG_NULL] = { .cht_name = "null", @@ -107,6 +120,17 @@ static struct cfs_crypto_hash_type hash_types[] = { } }; +static struct cfs_crypto_crypt_type crypt_types[] = { + [CFS_CRYPT_ALG_NULL] = { + .cct_name = "null", + .cct_size = 0 + }, + [CFS_CRYPT_ALG_AES256_CTR] = { + .cct_name = "ctr(aes)", + .cct_size = 32 + } +}; + /* Maximum size of hash_types[].cht_size */ #define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 @@ -188,24 +212,103 @@ static inline unsigned char cfs_crypto_hash_alg(const char *algname) return CFS_HASH_ALG_UNKNOWN; } +/** + * Return crypt algorithm information for the specified algorithm identifier + * + * Crypt information includes algorithm name, key size. 
+ * + * \retval cfs_crypto_crupt_type for valid ID (CFS_CRYPT_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_crypt_type *cfs_crypto_crypt_type( + enum cfs_crypto_crypt_alg crypt_alg) +{ + struct cfs_crypto_crypt_type *ct; + + if (crypt_alg < CFS_CRYPT_ALG_MAX) { + ct = &crypt_types[crypt_alg]; + if (ct->cct_name != NULL) + return ct; + } + return NULL; +} + +/** + * Return crypt name for crypt algorithm identifier + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval string name of known crypt algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_crypt_name(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct) + return ct->cct_name; + + return "unknown"; +} + + +/** + * Return key size for crypto algorithm type + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval crypt algorithm key size in bytes + * \retval 0 if crypt algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_crypt_keysize(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct != NULL) + return ct->cct_size; + + return 0; +} + +/** + * Find crypto algorithm ID for the specified algorithm name + * + * \retval crypto algorithm ID for valid ID (CFS_CRYPT_ALG_*) + * \retval CFS_CRYPT_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_crypt_alg(const char *algname) +{ + enum cfs_crypto_crypt_alg crypt_alg; + + for (crypt_alg = 0; crypt_alg < CFS_CRYPT_ALG_MAX; crypt_alg++) + if (strcmp(crypt_types[crypt_alg].cct_name, algname) == 0) + return crypt_alg; + + return CFS_CRYPT_ALG_UNKNOWN; +} + int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, const void *buf, unsigned int buf_len, unsigned char *key, unsigned int key_len, unsigned char *hash, unsigned int *hash_len); /* cfs crypto hash descriptor */ -struct cfs_crypto_hash_desc; struct page; -struct cfs_crypto_hash_desc * +struct ahash_request * cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, unsigned char *key, unsigned int key_len); -int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, +int cfs_crypto_hash_update_page(struct ahash_request *req, struct page *page, unsigned int offset, unsigned int len); -int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, +int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, unsigned int buf_len); -int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, +int cfs_crypto_hash_final(struct ahash_request *req, unsigned char *hash, unsigned int *hash_len); int cfs_crypto_register(void); void cfs_crypto_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h index 2eb6b7aa57d9c..ac89d2cb60b55 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,6 +38,10 @@ #ifndef __LIBCFS_DEBUG_H__ #define __LIBCFS_DEBUG_H__ +#include +#include +#include + /* * Debugging */ @@ -60,112 +64,6 @@ int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); extern unsigned int libcfs_catastrophe; extern unsigned int libcfs_panic_on_lbug; -/** - * Format for debug message headers - */ -struct ptldebug_header { - __u32 ph_len; - __u32 ph_flags; - __u32 ph_subsys; - __u32 ph_mask; - __u16 ph_cpu_id; - __u16 ph_type; - /* time_t overflow in 2106 */ - __u32 ph_sec; - __u64 ph_usec; - __u32 ph_stack; - __u32 ph_pid; - __u32 ph_extern_pid; - __u32 ph_line_num; -} __attribute__((packed)); - - -#define PH_FLAG_FIRST_RECORD 1 - -/* Debugging subsystems (32 bits, non-overlapping) */ -#define S_UNDEFINED 0x00000001 -#define S_MDC 0x00000002 -#define S_MDS 0x00000004 -#define S_OSC 0x00000008 -#define S_OST 0x00000010 -#define S_CLASS 0x00000020 -#define S_LOG 0x00000040 -#define S_LLITE 0x00000080 -#define S_RPC 0x00000100 -#define S_MGMT 0x00000200 -#define S_LNET 0x00000400 -#define S_LND 0x00000800 /* ALL LNDs */ -#define S_PINGER 0x00001000 -#define S_FILTER 0x00002000 -/* unused */ -#define S_ECHO 0x00008000 -#define S_LDLM 0x00010000 -#define S_LOV 0x00020000 -#define S_LQUOTA 0x00040000 -#define S_OSD 0x00080000 -#define S_LFSCK 0x00100000 -#define S_SNAPSHOT 0x00200000 -/* unused */ -#define S_LMV 0x00800000 /* b_new_cmd */ -/* unused */ -#define S_SEC 0x02000000 /* upcall cache */ -#define S_GSS 0x04000000 /* b_new_cmd */ -/* unused */ -#define S_MGC 0x10000000 -#define S_MGS 0x20000000 -#define S_FID 0x40000000 /* b_new_cmd */ -#define S_FLD 0x80000000 /* b_new_cmd */ - -#define LIBCFS_DEBUG_SUBSYS_NAMES { \ - "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ - "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "", \ - "echo", "ldlm", "lov", "lquota", "osd", "lfsck", "snapshot", "",\ - "lmv", "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL } - -/* Debugging masks (32 bits, non-overlapping) */ -#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ -#define D_INODE 0x00000002 -#define D_SUPER 0x00000004 -#define D_EXT2 0x00000008 /* anything from ext2_debug */ -#define D_MALLOC 0x00000010 /* print malloc, free information */ -#define D_CACHE 0x00000020 /* cache-related items */ -#define D_INFO 0x00000040 /* general information */ -#define D_IOCTL 0x00000080 /* ioctl related information */ -#define D_NETERROR 0x00000100 /* network errors */ -#define D_NET 0x00000200 /* network communications */ -#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ -#define D_BUFFS 0x00000800 -#define D_OTHER 0x00001000 -#define D_DENTRY 0x00002000 -#define D_NETTRACE 0x00004000 -#define D_PAGE 0x00008000 /* bulk page handling */ -#define D_DLMTRACE 0x00010000 -#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ -#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ -#define D_HA 0x00080000 /* recovery and failover */ -#define D_RPCTRACE 0x00100000 /* for distributed debugging */ -#define D_VFSTRACE 0x00200000 -#define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 -#define D_CONFIG 0x01000000 -#define D_CONSOLE 0x02000000 -#define D_QUOTA 0x04000000 -#define D_SEC 0x08000000 -#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ -#define D_HSM 0x20000000 -#define D_SNAPSHOT 0x40000000 /* snapshot */ -#define D_LAYOUT 0x80000000 - -#define LIBCFS_DEBUG_MASKS_NAMES { \ - "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ - "ioctl", "neterror", "net", "warning", "buffs", "other", \ - "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ - "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ - "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ - NULL } - -#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) - #ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED #endif @@ -207,9 +105,38 @@ do { \ .msg_cdls = (cdls) }; \ dataname.msg_mask = (mask); -#ifdef __KERNEL__ +#ifdef CDEBUG_ENABLED -# ifdef CDEBUG_ENABLED +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ /** * Filters out logging messages based on mask and subsystem. @@ -251,22 +178,6 @@ static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) # warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" # endif /* CDEBUG_ENABLED */ -#else /* !__KERNEL__ */ -static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) -{ - return 0; -} -# define CDEBUG(mask, format, ...) \ -do { \ - if (((mask) & D_CANTMASK) != 0) \ - fprintf(stderr, "(%s:%d:%s()) " format, \ - __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__);\ -} while (0) - -# define CDEBUG_LIMIT CDEBUG - -#endif /* __KERNEL__ */ - /* * Lustre Error Checksum: calculates checksum * of Hex number by XORing each bit. @@ -288,7 +199,7 @@ do { \ #define LCONSOLE_EMERG(format, ...) 
CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) -#if defined(CDEBUG_ENTRY_EXIT) && defined(__KERNEL__) +#if defined(CDEBUG_ENTRY_EXIT) void libcfs_log_goto(struct libcfs_debug_msg_data *goto_data, const char *label, long rc); @@ -341,7 +252,7 @@ do { \ # define ENTRY CDEBUG(D_TRACE, "Process entered\n") # define EXIT CDEBUG(D_TRACE, "Process leaving\n") -#else /* !CDEBUG_ENTRY_EXIT || !__KERNEL__ */ +#else /* !CDEBUG_ENTRY_EXIT */ # define GOTO(label, rc) \ do { \ @@ -353,7 +264,7 @@ do { \ # define ENTRY do { } while (0) # define EXIT do { } while (0) -#endif /* CDEBUG_ENTRY_EXIT && __KERNEL__ */ +#endif /* CDEBUG_ENTRY_EXIT */ #define RETURN_EXIT \ do { \ @@ -370,15 +281,15 @@ extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, va_list args, const char *format2, ...) __attribute__ ((format (printf, 4, 5))); -#ifdef __KERNEL__ /* other external symbols that tracefile provides: */ extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, const char __user *usr_buffer, int usr_buffer_nob); extern int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, const char *knl_buffer, char *append); -#endif /* __KERNEL__ */ #define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" +void cfs_debug_init(void); + #endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h index 2af5149be8f69..203e470df88d0 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h @@ -165,7 +165,7 @@ static inline void cfs_race(__u32 id) CERROR("cfs_race id %x sleeping\n", id); rc = wait_event_interruptible(cfs_race_waitq, cfs_race_state != 0); - CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); } else { CERROR("cfs_fail_race id %x waking\n", id); cfs_race_state = 1; @@ -175,4 +175,42 @@ static inline void cfs_race(__u32 id) } #define CFS_RACE(id) cfs_race(id) +/** + * Wait on race. + * + * The first thread that calls this with a matching fail_loc is put to sleep, + * but subseqent callers of this won't sleep. Until another thread that calls + * cfs_race_wakeup(), the first thread will be woken up and continue. + */ +static inline void cfs_race_wait(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + } + } +} +#define CFS_RACE_WAIT(id) cfs_race_wait(id) + +/** + * Wake up the thread that is waiting on the matching fail_loc. 
+ */ +static inline void cfs_race_wakeup(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (likely(!__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE_WAKEUP(id) cfs_race_wakeup(id) + #endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h index 16bda0c460ebf..1001362e75cd0 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -36,6 +36,7 @@ #ifndef __LIBCFS_PRIM_H__ #define __LIBCFS_PRIM_H__ +#include #include /* diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h index ebcdc990203b2..9a242839fd843 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -42,6 +42,9 @@ # define DEBUG_SUBSYSTEM S_UNDEFINED #endif +#include +#include + #ifdef LIBCFS_DEBUG /* @@ -213,8 +216,14 @@ do { \ #define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) +#ifdef LLIST_HEAD void init_libcfs_vfree_atomic(void); void exit_libcfs_vfree_atomic(void); +#define HAVE_LIBCFS_VFREE_ATOMIC +#else +#define init_libcfs_vfree_atomic() do {} while(0) +#define exit_libcfs_vfree_atomic() do {} while(0) +#endif #define LIBCFS_FREE(ptr, size) \ do { \ @@ -228,7 +237,7 @@ do { \ CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ s, (ptr), libcfs_kmem_read()); \ if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - libcfs_vfree_atomic(ptr); \ + libcfs_vfree_atomic(ptr); \ else \ kfree(ptr); \ } while (0) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h deleted file mode 100644 index ca40551dfc678..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef __LIBCFS_PTASK_H__ -#define __LIBCFS_PTASK_H__ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Unconditionaly disable PADATA. - * - * Padata is needed for PIO client feature. This feature is disabled by default - * and was removed from Lustre code during 2.13 development (2b0a34fe43bf). - * Instead of adapting the code to Linux 5.4+ change, just disable it. 
- */ -#undef CONFIG_PADATA - -#ifdef CONFIG_PADATA -#include -#else -struct padata_priv {}; -struct padata_instance {}; -#endif - -#define PTF_COMPLETE BIT(0) -#define PTF_AUTOFREE BIT(1) -#define PTF_ORDERED BIT(2) -#define PTF_USER_MM BIT(3) -#define PTF_ATOMIC BIT(4) -#define PTF_RETRY BIT(5) - -struct cfs_ptask_engine { - struct padata_instance *pte_pinst; - struct workqueue_struct *pte_wq; - struct notifier_block pte_notifier; - int pte_weight; -}; - -struct cfs_ptask; -typedef int (*cfs_ptask_cb_t)(struct cfs_ptask *); - -struct cfs_ptask { - struct padata_priv pt_padata; - struct completion pt_completion; - struct mm_struct *pt_mm; - unsigned int pt_flags; - int pt_cbcpu; - cfs_ptask_cb_t pt_cbfunc; - void *pt_cbdata; - int pt_result; -}; - -static inline -struct padata_priv *cfs_ptask2padata(struct cfs_ptask *ptask) -{ - return &ptask->pt_padata; -} - -static inline -struct cfs_ptask *cfs_padata2ptask(struct padata_priv *padata) -{ - return container_of(padata, struct cfs_ptask, pt_padata); -} - -static inline -bool cfs_ptask_need_complete(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_COMPLETE; -} - -static inline -bool cfs_ptask_is_autofree(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_AUTOFREE; -} - -static inline -bool cfs_ptask_is_ordered(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_ORDERED; -} - -static inline -bool cfs_ptask_use_user_mm(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_USER_MM; -} - -static inline -bool cfs_ptask_is_atomic(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_ATOMIC; -} - -static inline -bool cfs_ptask_is_retry(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_RETRY; -} - -static inline -int cfs_ptask_result(struct cfs_ptask *ptask) -{ - return ptask->pt_result; -} - -struct cfs_ptask_engine *cfs_ptengine_init(const char *, const struct cpumask *); -void cfs_ptengine_fini(struct cfs_ptask_engine *); -int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *, const struct cpumask *); -int cfs_ptengine_weight(struct cfs_ptask_engine *); - -int cfs_ptask_submit(struct cfs_ptask *, struct cfs_ptask_engine *); -int cfs_ptask_wait_for(struct cfs_ptask *); -int cfs_ptask_init(struct cfs_ptask *, cfs_ptask_cb_t, void *, - unsigned int, int); - -#endif /* __LIBCFS_PTASK_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h index 3c34071d35774..4d9dbde91e8a0 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h deleted file mode 100644 index 68947c9792296..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_time.h - * - * Time functions. - * - */ - -#ifndef __LIBCFS_TIME_H__ -#define __LIBCFS_TIME_H__ - -/* - * generic time manipulation functions. - */ - -static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) -{ - return (cfs_time_t)(t + d); -} - -static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) -{ - return (cfs_time_t)(t1 - t2); -} - -static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2) -{ - return cfs_time_before(t2, t1); -} - -static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2) -{ - return cfs_time_beforeq(t2, t1); -} - -static inline cfs_time_t cfs_time_shift(int seconds) -{ - return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); -} - -#define CFS_TICK 1 - -/* - * return valid time-out based on user supplied one. Currently we only check - * that time-out is not shorted than allowed. - */ -static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout) -{ - if (timeout < CFS_TICK) - timeout = CFS_TICK; - return timeout; -} - -#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h deleted file mode 100644 index 0f67a87096c0a..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LIBCFS_LINUX_LIBCFS_H__ -#define __LIBCFS_LINUX_LIBCFS_H__ - -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
-#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_SCHED_HEADERS -#include -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#if !defined(__x86_64__) -# ifdef __ia64__ -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_dwarf_cfa() & \ - (THREAD_SIZE - 1))) -# else -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_frame_address(0) & \ - (THREAD_SIZE - 1))) -# endif /* __ia64__ */ - -#define __CHECK_STACK(msgdata, mask, cdls) \ -do { \ - if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ - LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ - libcfs_stack = CDEBUG_STACK(); \ - libcfs_debug_msg(msgdata, \ - "maximum lustre stack %lu\n", \ - CDEBUG_STACK()); \ - (msgdata)->msg_mask = mask; \ - (msgdata)->msg_cdls = cdls; \ - dump_stack(); \ - /*panic("LBUG");*/ \ - } \ -} while (0) -#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) -#else /* __x86_64__ */ -#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0) -#define CDEBUG_STACK() (0L) -#endif /* __x86_64__ */ - -/** - * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) - * - * Implementation is in linux-curproc.c - */ -#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm) - -/* helper for sysctl handlers */ -int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, - loff_t pos, void __user *buffer, int len)); - -#ifndef WITH_WATCHDOG -#define WITH_WATCHDOG -#endif - -/* - * Macros to access common characteristics of "current" UNIX process. - */ -#define current_pid() (current->pid) -#define current_comm() (current->comm) - -/* check if task is running in compat mode.*/ -int current_is_32bit(void); - -#endif /* _LINUX_LIBCFS_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h index a46e252466026..ab6b55e0586a6 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -23,7 +23,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,61 +39,6 @@ #ifndef __LIBCFS_LINUX_CPU_H__ #define __LIBCFS_LINUX_CPU_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
-#endif - -#include -#include -#include -#include - -#ifdef CONFIG_SMP - -#define HAVE_LIBCFS_CPT - -/** virtual processing unit */ -struct cfs_cpu_partition { - /* CPUs mask for this partition */ - cpumask_t *cpt_cpumask; - /* nodes mask for this partition */ - nodemask_t *cpt_nodemask; - /* NUMA distance between CPTs */ - unsigned *cpt_distance; - /* spread rotor for NUMA allocator */ - int cpt_spread_rotor; - /* NUMA node if cpt_nodemask is empty */ - int cpt_node; -}; - -/** descriptor for CPU partitions */ -struct cfs_cpt_table { - /* spread rotor for NUMA allocator */ - int ctb_spread_rotor; - /* maximum NUMA distance between all nodes in table */ - unsigned ctb_distance; - /* # of CPU partitions */ - int ctb_nparts; - /* partitions tables */ - struct cfs_cpu_partition *ctb_parts; - /* shadow HW CPU to CPU partition ID */ - int *ctb_cpu2cpt; - /* all cpus in this partition table */ - cpumask_t *ctb_cpumask; - /* shadow HW node to CPU partition ID */ - int *ctb_node2cpt; - /* all nodes in this partition table */ - nodemask_t *ctb_nodemask; -}; - -void cfs_cpu_core_siblings(int cpu, cpumask_t *mask); - -#endif /* CONFIG_SMP */ - #ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK # define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) #endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h index a9c15a66ab207..6346c59e516e7 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h @@ -26,11 +26,6 @@ * Copyright 2012 Xyratex Technology Limited */ -/* Added in v4.15-rc4 (commit a208fa8f3303) */ -#ifndef CRYPTO_ALG_OPTIONAL_KEY -#define CRYPTO_ALG_OPTIONAL_KEY 0x00004000 -#endif - /** * Linux crypto hash specific functions. */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h index dbc84de172146..dd86d1947466b 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -37,14 +37,6 @@ #ifndef __LIBCFS_LINUX_CFS_FS_H__ #define __LIBCFS_LINUX_CFS_FS_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
-#endif - #include #include #include @@ -58,6 +50,10 @@ static inline struct dentry *file_dentry(const struct file *file) } #endif +#ifndef QSTR_INIT +#define QSTR_INIT(n, l) { .len = l, .name = n } +#endif + #if defined(HAVE_FILE_FSYNC_4ARGS) || defined(HAVE_FILE_FSYNC_2ARGS) #define ll_vfs_fsync_range(fp, start, end, datasync) \ vfs_fsync_range(fp, start, end, datasync) @@ -66,15 +62,6 @@ static inline struct dentry *file_dentry(const struct file *file) vfs_fsync_range(fp, file_dentry(fp), start, end, datasync) #endif -#define flock_type(fl) ((fl)->fl_type) -#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0) -#define flock_pid(fl) ((fl)->fl_pid) -#define flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while (0) -#define flock_start(fl) ((fl)->fl_start) -#define flock_set_start(fl, st) do { (fl)->fl_start = (st); } while (0) -#define flock_end(fl) ((fl)->fl_end) -#define flock_set_end(fl, end) do { (fl)->fl_end = (end); } while (0) - #ifndef IFSHIFT #define IFSHIFT 12 #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h new file mode 100644 index 0000000000000..2721655306bbe --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_HASH_H__ +#define __LIBCFS_LINUX_HASH_H__ + +#include + +u64 cfs_hashlen_string(const void *salt, const char *name); + +#ifndef hashlen_hash +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#endif + +#ifndef HAVE_STRINGHASH +#ifndef hashlen_create +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +#endif +#endif /* !HAVE_STRINGHASH */ + +#ifdef HAVE_LINUX_RHASHTABLE_H +#include + +#ifndef HAVE_RHLTABLE +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct rhltable { + struct rhashtable ht; +}; + +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + +static inline int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params) +{ + return rhashtable_init(&hlt->ht, params); +} + +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable *ht = &hlt->ht; + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? 
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} + +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ +#ifdef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + return __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params); +#else + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params)); +#endif +} + +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return rhashtable_remove_fast(&hlt->ht, &list->rhead, params); +} + +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + rhltable_free_and_destroy(hlt, NULL, NULL); +} + +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + rhashtable_walk_init(&hlt->ht, iter); +} +#endif /* !HAVE_RHLTABLE */ + +#ifdef HAVE_BROKEN_HASH_64 + +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + +static inline u32 cfs_hash_32(u32 val, unsigned int bits) +{ + /* High bits are more random, so use them. */ + return (val * GOLDEN_RATIO_32) >> (32 - bits); +} + +static __always_inline u32 cfs_hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return cfs_hash_32(((u32)val ^ ((val >> 32) * GOLDEN_RATIO_32)), bits); +#endif +} +#else + +#define cfs_hash_32 hash_32 +#define cfs_hash_64 hash_64 + +#endif /* HAVE_BROKEN_HASH_64 */ + +#ifndef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key; + void *ret; + int rc; + + rc = rhashtable_lookup_insert_fast(ht, obj, params); + switch (rc) { + case -EEXIST: + key = rht_obj(ht, obj); + ret = rhashtable_lookup_fast(ht, key, params); + break; + case 0: + ret = NULL; + break; + default: + ret = ERR_PTR(rc); + break; + } + return ret; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST */ + +#ifndef HAVE_RHASHTABLE_LOOKUP +/* + * The function rhashtable_lookup() and rhashtable_lookup_fast() + * are almost the same except rhashtable_lookup() doesn't + * take the RCU read lock. Since this is the case and only + * SLES12 SP3 lacks rhashtable_lookup() just duplicate the + * SLES12 SP3 rhashtable_lookup_fast() minus the RCU read lock. 
+ */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + const struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return rht_obj(ht, he); + } + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP */ +#else +#define rhashtable_init(ht, param) 0 +#define rhashtable_destroy(ht) do {} while (0) +#endif /* HAVE_LINUX_RHASHTABLE_H */ + +#endif /* __LIBCFS_LINUX_HASH_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h index f08d623bd8a84..81e79dbf24852 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -37,14 +37,6 @@ #ifndef __LIBCFS_LINUX_CFS_MEM_H__ #define __LIBCFS_LINUX_CFS_MEM_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. -#endif - #include #include #include diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h index 754f183050485..2b07699f77284 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,7 +34,10 @@ #define __LIBCFS_LINUX_MISC_H__ #include +#include +#include #include +#include #ifdef HAVE_SYSCTL_CTLNAME #define INIT_CTL_NAME .ctl_name = CTL_UNNUMBERED, @@ -60,8 +63,8 @@ #endif #endif /* HAVE_IOV_ITER_TYPE */ -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -bool selinux_is_enabled(void); +#ifndef HAVE_MODULE_PARAM_LOCKING +static DEFINE_MUTEX(param_lock); #endif #ifndef HAVE_UIDGID_HEADER @@ -128,13 +131,41 @@ static inline bool gid_valid(kgid_t gid) int cfs_get_environ(const char *key, char *value, int *val_len); -#ifndef HAVE_WAIT_QUEUE_ENTRY -#define wait_queue_entry_t wait_queue_t -#endif - int cfs_kernel_write(struct file *filp, const void *buf, size_t count, loff_t *pos); +/* + * For RHEL6 struct kernel_parm_ops doesn't exist. Also + * the arguments for .set and .get take different + * parameters which is handled below + */ +#ifdef HAVE_KERNEL_PARAM_OPS +#define cfs_kernel_param_arg_t const struct kernel_param +#else +#define cfs_kernel_param_arg_t struct kernel_param_ops +#define kernel_param_ops kernel_param +#endif /* ! 
HAVE_KERNEL_PARAM_OPS */ + +#ifndef HAVE_KERNEL_PARAM_LOCK +static inline void kernel_param_unlock(struct module *mod) +{ +#ifndef HAVE_MODULE_PARAM_LOCKING + mutex_unlock(¶m_lock); +#else + __kernel_param_unlock(); +#endif +} + +static inline void kernel_param_lock(struct module *mod) +{ +#ifndef HAVE_MODULE_PARAM_LOCKING + mutex_lock(¶m_lock); +#else + __kernel_param_lock(); +#endif +} +#endif /* ! HAVE_KERNEL_PARAM_LOCK */ + #ifndef HAVE_KSTRTOUL static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) { @@ -147,4 +178,23 @@ static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) } #endif /* !HAVE_KSTRTOUL */ +#ifndef HAVE_KSTRTOBOOL_FROM_USER + +#define kstrtobool strtobool + +int kstrtobool_from_user(const char __user *s, size_t count, bool *res); +#endif + +#ifdef HAVE_KALLSYMS_LOOKUP_NAME +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return (void *)kallsyms_lookup_name(name); +} +#else +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return NULL; +} +#endif + #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index a805ff9aedf84..3934635dcd322 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -39,54 +39,13 @@ #ifndef __LIBCFS_LINUX_LINUX_TIME_H__ #define __LIBCFS_LINUX_LINUX_TIME_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. -#endif - /* Portable time API */ - -/* - * Platform provides three opaque data-types: - * - * cfs_time_t represents point in time. This is internal kernel - * time rather than "wall clock". This time bears no - * relation to gettimeofday(). - * - * cfs_duration_t represents time interval with resolution of internal - * platform clock - * - * cfs_time_t cfs_time_current(void); - * cfs_time_t cfs_time_add (cfs_time_t, cfs_duration_t); - * cfs_duration_t cfs_time_sub (cfs_time_t, cfs_time_t); - * int cfs_impl_time_before (cfs_time_t, cfs_time_t); - * int cfs_impl_time_before_eq(cfs_time_t, cfs_time_t); - * - * cfs_duration_t cfs_duration_build(int64_t); - * - * time_t cfs_duration_sec (cfs_duration_t); - * void cfs_duration_usec(cfs_duration_t, struct timeval *); - * void cfs_duration_nsec(cfs_duration_t, struct timespec *); - * - * CFS_TIME_FORMAT - * CFS_DURATION_FORMAT - * - */ - -#define ONE_BILLION ((u_int64_t)1000000000) -#define ONE_MILLION 1000000 - -#ifndef __KERNEL__ -#error This include is only for kernel use. -#endif - +#include #include #include #include #include +#include #include #include #include @@ -94,10 +53,6 @@ /* * Generic kernel stuff */ - -typedef unsigned long cfs_time_t; /* jiffies */ -typedef long cfs_duration_t; - #ifndef HAVE_TIMESPEC64 typedef __s64 time64_t; @@ -143,22 +98,23 @@ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts6 #endif /* HAVE_TIMESPEC64 */ -#ifndef HAVE_TIME_T -typedef __kernel_old_time_t time_t; -#endif - -#ifndef HAVE_JIFFIES_TO_TIMESPEC64 -static inline void -jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) +#ifndef HAVE_NS_TO_TIMESPEC64 +static inline struct timespec64 ns_to_timespec64(const s64 nsec) { - /* - * Convert jiffies to nanoseconds and separate with - * one divide. 
- */ - u32 rem; - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_nsec = rem; + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; } #endif @@ -207,6 +163,13 @@ time64_t ktime_get_real_seconds(void); time64_t ktime_get_seconds(void); #endif /* HAVE_KTIME_GET_SECONDS */ +#ifdef NEED_KTIME_GET_NS +static inline u64 ktime_get_ns(void) +{ + return ktime_to_ns(ktime_get()); +} +#endif /* NEED_KTIME_GET_NS */ + #ifdef NEED_KTIME_GET_REAL_NS static inline u64 ktime_get_real_ns(void) { @@ -214,6 +177,13 @@ static inline u64 ktime_get_real_ns(void) } #endif /* NEED_KTIME_GET_REAL_NS */ +#ifndef HAVE_KTIME_MS_DELTA +static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) +{ + return ktime_to_ms(ktime_sub(later, earlier)); +} +#endif /* HAVE_KTIME_MS_DELTA */ + #ifndef HAVE_KTIME_TO_TIMESPEC64 static inline struct timespec64 ktime_to_timespec64(ktime_t kt) { @@ -242,79 +212,39 @@ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) } #endif -static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) +static inline unsigned long cfs_time_seconds(time64_t seconds) { - return time_before(t1, t2); + return nsecs_to_jiffies(seconds * NSEC_PER_SEC); } -static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) -{ - return time_before_eq(t1, t2); -} +#ifdef HAVE_NEW_DEFINE_TIMER +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE struct timer_list * +# endif -static inline cfs_time_t cfs_time_current(void) -{ - return jiffies; -} - -static inline time_t cfs_time_current_sec(void) -{ - return ktime_get_real_seconds(); -} - -static inline cfs_duration_t cfs_time_seconds(int seconds) -{ - return ((cfs_duration_t)seconds) * msecs_to_jiffies(MSEC_PER_SEC); -} - -static inline time_t cfs_duration_sec(cfs_duration_t d) -{ - return d / msecs_to_jiffies(MSEC_PER_SEC); -} - -#define cfs_time_current_64 get_jiffies_64 - -static inline __u64 cfs_time_add_64(__u64 t, __u64 d) -{ - return t + d; -} - -static inline __u64 cfs_time_shift_64(int seconds) -{ - return cfs_time_add_64(cfs_time_current_64(), - cfs_time_seconds(seconds)); -} - -static inline int cfs_time_before_64(__u64 t1, __u64 t2) -{ - return (__s64)t2 - (__s64)t1 > 0; -} +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) +#else +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE unsigned long +# endif -static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) -{ - return (__s64)t2 - (__s64)t1 >= 0; -} +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function), (_expires), (_data)) +#endif -/* - * One jiffy - */ -#define CFS_DURATION_T "%ld" #ifdef HAVE_TIMER_SETUP #define cfs_timer_cb_arg_t struct timer_list * #define cfs_from_timer(var, callback_timer, timer_fieldname) \ from_timer(var, callback_timer, timer_fieldname) #define cfs_timer_setup(timer, callback, data, flags) \ timer_setup((timer), (callback), (flags)) -#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ - DEFINE_TIMER((_name), (_function)) #define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) #else #define cfs_timer_cb_arg_t unsigned long #define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) #define cfs_timer_setup(timer, callback, data, flags) \ setup_timer((timer), (callback), (data)) 
-#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ - DEFINE_TIMER((_name), (_function), (_expires), (_data)) #define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h new file mode 100644 index 0000000000000..fd154ba0f049f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h @@ -0,0 +1,568 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBCFS_LINUX_WAIT_BIT_H +#define __LIBCFS_LINUX_WAIT_BIT_H + +/* Make sure we can see if we have TASK_NOLOAD */ +#include +/* + * Linux wait-bit related types and methods: + */ +#ifdef HAVE_WAIT_BIT_HEADER_H +#include +#endif +#include + +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t +#endif + +#ifndef HAVE_WAIT_BIT_HEADER_H +struct wait_bit_queue_entry { + struct wait_bit_key key; + wait_queue_entry_t wq_entry; +}; + +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + +#endif /* ! HAVE_WAIT_BIT_HEADER_H */ + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT +extern long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state); +#endif + +/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce + * so let's define our own ___wait_cond_timeout1 + */ + +#define ___wait_cond_timeout1(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) + +#ifndef HAVE_CLEAR_AND_WAKE_UP_BIT +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. */ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} +#endif /* ! HAVE_CLEAR_AND_WAKE_UP_BIT */ + +#ifndef HAVE_WAIT_VAR_EVENT +extern void __init wait_bit_init(void); +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, + void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_head_t *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout1(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +/* + * prepare_to_wait_event() does not support an exclusive + * lifo wait. + * However it will not relink the wait_queue_entry if + * it is already linked. So we link to the head of the + * queue here, and it will stay there. + */ +static inline void prepare_to_wait_exclusive_head( + wait_queue_head_t *waitq, wait_queue_entry_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&(waitq->lock), flags); +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + if (list_empty(&link->entry)) +#else + if (list_empty(&link->task_list)) +#endif + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&((waitq)->lock), flags); +} + +#ifndef ___wait_event +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout1() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. 
+ */ + +#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_entry_ __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE \ + for (;;) { \ + long __int = prepare_to_wait_event(&wq_head, \ + &__wq_entry, state); \ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ +__out: __ret; \ +}) +#endif + +#ifndef TASK_NOLOAD + +#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __blocked; \ + \ + __blocked = cfs_block_sigsinv(0); \ + init_wait(&__wq_entry); \ + if (exclusive) \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_event(&wq_head, \ + &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + cfs_restore_sigs(__blocked); \ + __ret; \ +}) + +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 0, 0, schedule());\ +} while (0) + +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_idle(wq_head, condition, 1, 0, schedule());\ +} while (0) + +#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout( \ + wq_head, condition, timeout); \ + __ret; \ +}) + +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ + 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) + +#else /* TASK_IDLE */ +#ifndef wait_event_idle +/** + * 
wait_event_idle - wait for a condition without contributing to system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive +/** + * wait_event_idle_exclusive - wait for a condition without contributing to + * system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive_timeout +/** + * wait_event_idle_exclusive_timeout - sleep without load until a condition + * becomes true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
+ */ +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) +#endif +#ifndef wait_event_idle_exclusive_timeout_cmd +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) +#endif + +#ifndef wait_event_idle_timeout + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_idle_timeout - sleep without load until a condition becomes + * true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
+ */ +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) +#endif +#endif /* TASK_IDLE */ + +/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */ +#ifdef TASK_NOLOAD + +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\ + \ + if (condition) \ + break; \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#else +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __blocked; \ + \ + __blocked = cfs_block_sigsinv(0); \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* See justification in ___wait_event_idle */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + cfs_restore_sigs(__blocked); \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#endif + +#define wait_event_idle_exclusive_lifo(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_lifo(wq_head, condition, 0, schedule()); \ +} while (0) + +#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout) \ + ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition), \ + timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_lifo_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) + +/* l_wait_event_abortable() is a bit like wait_event_killable() + * except there is a fixed set of signals which will abort: + * LUSTRE_FATAL_SIGS + */ +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +#define l_wait_event_abortable(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_timeout(wq, condition, timeout) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_exclusive(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + 
sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_exclusive(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#endif /* __LIBCFS_LINUX_WAIT_BIT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h new file mode 100644 index 0000000000000..45818dddedd94 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H +/* Fast hashing routine for ints, longs and pointers. (C) 2002 Nadia Yvette Chambers, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + +#include + +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL + +#if __BITS_PER_LONG == 32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define hash_long(val, bits) hash_32(val, bits) +#elif __BITS_PER_LONG == 64 +#define hash_long(val, bits) hash_64(val, bits) +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#else +#error Wordsize not 32 or 64 +#endif + +static __always_inline __u64 hash_64(__u64 val, unsigned int bits) +{ + __u64 hash = val; + + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + __u64 n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + + /* High bits are more random, so use them. */ + return hash >> (64 - bits); +} + +static inline __u32 hash_32(__u32 val, unsigned int bits) +{ + /* On some cpus multiply is faster, on others gcc will do shifts */ + __u32 hash = val * GOLDEN_RATIO_PRIME_32; + + /* High bits are more random, so use them.
*/ + return hash >> (32 - bits); +} + +static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} + +static inline __u32 hash32_ptr(const void *ptr) +{ + unsigned long val = (unsigned long)ptr; + +#if __BITS_PER_LONG == 64 + val ^= (val >> 32); +#endif + return (__u32)val; +} + +#endif /* _LINUX_HASH_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h index 600bf27b607b4..a42e0c5fe4568 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,7 +43,7 @@ /* Sparse annotation. */ #define __user -#include +#include #define LIBCFS_IOC_INIT(data) \ do { \ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h index 2fb2db7c651dd..7bae8393a1916 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -107,7 +107,7 @@ char *Parser_strarg(char *inp, const char *prompt, const char *deft, int Parser_arg2int(const char *inp, long *result, int base); /* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size(int *sizep, char *str); +int Parser_size(unsigned long *sizep, char *str); /* Convert a string boolean to an int; "enable" -> 1 */ int Parser_bool(int *b, char *str); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h index 72414f0c8003a..065829b7161d6 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,14 +44,6 @@ #include #include -#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcpy(char *tgt, const char *src, size_t tgt_len); -#endif - -#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcat(char *tgt, const char *src, size_t tgt_len); -#endif - /** * Structure to represent NULL-less strings. 
*/ @@ -93,5 +85,6 @@ int cfs_ip_addr_parse(char *str, int len, struct list_head *list); int cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr); int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_abs_path(const char *request_path, char **resolved_path); #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile index a487ba0329342..a324f01fa2d77 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile +++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile @@ -1,16 +1,15 @@ obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o libcfs-linux-objs := linux-tracefile.o linux-debug.o linux-prim.o -libcfs-linux-objs += linux-cpu.o linux-curproc.o linux-module.o +libcfs-linux-objs += linux-curproc.o linux-module.o libcfs-linux-objs += linux-crypto.o linux-crypto-adler.o -libcfs-linux-objs += linux-crypto-crc32.o +libcfs-linux-objs += linux-crypto-crc32.o linux-hash.o linux-wait.o libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) libcfs-all-objs := debug.o fail.o module.o tracefile.o watchdog.o libcfs-all-objs += libcfs_string.o hash.o prng.o workitem.o libcfs-all-objs += libcfs_cpu.o libcfs_mem.o libcfs_lock.o heap.o -libcfs-all-objs += libcfs_ptask.o libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c index a4aede1e3be08..0f7d6194a68f8 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,6 +37,7 @@ # define DEBUG_SUBSYSTEM S_LNET +#include #include #include #include "tracefile.h" @@ -54,8 +55,63 @@ module_param(libcfs_debug, int, 0644); MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); EXPORT_SYMBOL(libcfs_debug); +static int libcfs_param_debug_mb_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + +/* + * RHEL6 does not support any kind of locking so we have to provide + * our own + */ +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_lock(THIS_MODULE); +#endif + if (!*((unsigned int *)kp->arg)) { + *((unsigned int *)kp->arg) = num; + +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_unlock(THIS_MODULE); +#endif + return 0; + } + + rc = cfs_trace_set_debug_mb(num); + + if (!rc) + *((unsigned int *)kp->arg) = cfs_trace_get_debug_mb(); + +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_unlock(THIS_MODULE); +#endif + return rc; +} + +/* + * While debug_mb setting look like unsigned int, in fact + * it needs quite a bunch of extra processing, so we define special + * debug_mb parameter type with corresponding methods to handle this case + */ +static struct kernel_param_ops param_ops_debug_mb = { + .set = libcfs_param_debug_mb_set, + .get = param_get_uint, +}; + +#define param_check_debug_mb(name, p) \ + __param_check(name, p, unsigned int) + static unsigned int libcfs_debug_mb; -module_param(libcfs_debug_mb, uint, 0644); +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_debug_mb, debug_mb, 0644); +#else +module_param_call(libcfs_debug_mb, libcfs_param_debug_mb_set, param_get_uint, + &param_ops_debug_mb, 0644); +#endif MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); unsigned int libcfs_printk = D_CANTMASK; @@ -66,16 +122,123 @@ unsigned int libcfs_console_ratelimit = 1; module_param(libcfs_console_ratelimit, uint, 0644); MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); +static int param_set_delay_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + long min, long max) +{ + long d; + int sec; + int rc; + + rc = kstrtoint(val, 0, &sec); + if (rc) + return -EINVAL; + + /* The sysfs setting is in centiseconds */ + d = cfs_time_seconds(sec) / 100; + if (d < min || d > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = d; + + return 0; +} + +static int param_get_delay(char *buffer, cfs_kernel_param_arg_t *kp) +{ + unsigned int d = *(unsigned int *)kp->arg; + + return sprintf(buffer, "%lu", jiffies_to_msecs(d * 10) / MSEC_PER_SEC); +} + unsigned int libcfs_console_max_delay; -module_param(libcfs_console_max_delay, uint, 0644); +unsigned int libcfs_console_min_delay; + +static int param_set_console_max_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + libcfs_console_min_delay, INT_MAX); +} + +static struct kernel_param_ops param_ops_console_max_delay = { + .set = param_set_console_max_delay, + .get = param_get_delay, +}; + +#define param_check_console_max_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_max_delay, console_max_delay, 0644); +#else +module_param_call(libcfs_console_max_delay, param_set_console_max_delay, + param_get_delay, &param_ops_console_max_delay, 0644); +#endif
MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); -unsigned int libcfs_console_min_delay; -module_param(libcfs_console_min_delay, uint, 0644); +static int param_set_console_min_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + 1, libcfs_console_max_delay); +} + +static struct kernel_param_ops param_ops_console_min_delay = { + .set = param_set_console_min_delay, + .get = param_get_delay, +}; + +#define param_check_console_min_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_min_delay, console_min_delay, 0644); +#else +module_param_call(libcfs_console_min_delay, param_set_console_min_delay, + param_get_delay, &param_ops_console_min_delay, 0644); +#endif MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); +static int param_set_uint_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + + ret = kstrtouint(val, 0, &num); + if (ret < 0 || num < min || num > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = num; + return 0; +} + +static int param_set_uintpos(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_uint_minmax(val, kp, 1, -1); +} + +static struct kernel_param_ops param_ops_uintpos = { + .set = param_set_uintpos, + .get = param_get_uint, +}; + +#define param_check_uintpos(name, p) \ + __param_check(name, p, unsigned int) + unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; -module_param(libcfs_console_backoff, uint, 0644); +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_backoff, uintpos, 0644); +#else +module_param_call(libcfs_console_backoff, param_set_uintpos, param_get_uint, + &param_ops_uintpos, 0644); +#endif MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); unsigned int libcfs_debug_binary = 1; @@ -101,15 +264,17 @@ char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; EXPORT_SYMBOL(libcfs_debug_file_path_arr); /* We need to pass a pointer here, but elsewhere this must be a const */ -static char *libcfs_debug_file_path; +static char *libcfs_debug_file_path = LIBCFS_DEBUG_FILE_PATH_DEFAULT; module_param(libcfs_debug_file_path, charp, 0644); MODULE_PARM_DESC(libcfs_debug_file_path, "Path for dumping debug logs, set 'NONE' to prevent log dumping"); int libcfs_panic_in_progress; -/* libcfs_debug_token2mask() expects the returned - * string in lower-case */ +/* + * libcfs_debug_token2mask() expects the returned + * string in lower-case + */ static const char *libcfs_debug_subsys2str(int subsys) { static const char *libcfs_debug_subsystems[] = LIBCFS_DEBUG_SUBSYS_NAMES; @@ -120,8 +285,10 @@ static const char *libcfs_debug_subsys2str(int subsys) return libcfs_debug_subsystems[subsys]; } -/* libcfs_debug_token2mask() expects the returned - * string in lower-case */ +/* + * libcfs_debug_token2mask() expects the returned + * string in lower-case + */ static const char *libcfs_debug_dbg2str(int debug) { static const char *libcfs_debug_masks[] = LIBCFS_DEBUG_MASKS_NAMES; @@ -135,79 +302,78 @@ static const char *libcfs_debug_dbg2str(int debug) int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) { - const char *(*fn)(int bit) = is_subsys ?
libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int len = 0; - const char *token; - int i; - - if (mask == 0) { /* "0" */ - if (size > 0) - str[0] = '0'; - len = 1; - } else { /* space-separated tokens */ - for (i = 0; i < 32; i++) { - if ((mask & (1 << i)) == 0) - continue; - - token = fn(i); - if (token == NULL) /* unused bit */ - continue; - - if (len > 0) { /* separator? */ - if (len < size) - str[len] = ' '; - len++; - } - - while (*token != 0) { - if (len < size) - str[len] = *token; - token++; - len++; - } - } - } - - /* terminate 'str' */ - if (len < size) - str[len] = 0; - else - str[size - 1] = 0; - - return len; + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; } int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) { - const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int m = 0; - int matched; - int n; - int t; - - /* Allow a number for backwards compatibility */ - - for (n = strlen(str); n > 0; n--) - if (!isspace(str[n-1])) - break; - matched = n; - - if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 && - matched == n) { - /* don't print warning for lctl set_param debug=0 or -1 */ - if (m != 0 && m != -1) - CWARN("You are trying to use a numerical value for the " - "mask - this will be deprecated in a future " - "release.\n"); - *mask = m; - return 0; - } - - return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, - 0xffffffff); + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the " + "mask - this will be deprecated in a future " + "release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 
0 : D_CANTMASK, + 0xffffffff); } /** @@ -248,11 +414,14 @@ void libcfs_debug_dumplog(void) { wait_queue_entry_t wait; struct task_struct *dumper; + ENTRY; - /* we're being careful to ensure that the kernel thread is + /* + * we're being careful to ensure that the kernel thread is * able to set our state to running as it exits before we - * get to schedule() */ + * get to schedule() + */ init_waitqueue_entry(&wait, current); set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&debug_ctlwq, &wait); @@ -274,7 +443,7 @@ EXPORT_SYMBOL(libcfs_debug_dumplog); int libcfs_debug_init(unsigned long bufsize) { - int rc = 0; + int rc = 0; unsigned int max = libcfs_debug_mb; init_waitqueue_head(&debug_ctlwq); @@ -292,55 +461,65 @@ int libcfs_debug_init(unsigned long bufsize) sizeof(libcfs_debug_file_path_arr)); } - /* If libcfs_debug_mb is set to an invalid value or uninitialized - * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ + /* + * If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES + */ if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { max = TCD_MAX_PAGES; } else { max = (max / num_possible_cpus()); max = (max << (20 - PAGE_SHIFT)); } - rc = cfs_tracefile_init(max); - - if (rc == 0) - libcfs_register_panic_notifier(); - return rc; + rc = cfs_tracefile_init(max); + if (rc) + return rc; + + libcfs_register_panic_notifier(); + kernel_param_lock(THIS_MODULE); + libcfs_debug_mb = cfs_trace_get_debug_mb(); + kernel_param_unlock(THIS_MODULE); + return rc; } int libcfs_debug_cleanup(void) { - libcfs_unregister_panic_notifier(); - cfs_tracefile_exit(); - return 0; + libcfs_unregister_panic_notifier(); + kernel_param_lock(THIS_MODULE); + cfs_tracefile_exit(); + kernel_param_unlock(THIS_MODULE); + return 0; } int libcfs_debug_clear_buffer(void) { - cfs_trace_flush_pages(); - return 0; + cfs_trace_flush_pages(); + return 0; } -/* Debug markers, although printed by S_LNET - * should not be be marked as such. */ +/* + * Debug markers, although printed by S_LNET + * should not be be marked as such. + */ #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_UNDEFINED int libcfs_debug_mark_buffer(const char *text) { - CDEBUG(D_TRACE,"***************************************************\n"); - LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); - CDEBUG(D_TRACE,"***************************************************\n"); + CDEBUG(D_TRACE, "**************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, "**************************************************\n"); - return 0; + return 0; } #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_LNET long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) { - libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", - rc, rc, rc); - return rc; + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; } EXPORT_SYMBOL(libcfs_log_return); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c index 209333edf6b5b..8757ad1f5c1e8 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -13,16 +13,11 @@ * General Public License version 2 for more details (a copy is included * in the LICENSE file that accompanied this code). 
* - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA - * * GPL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,42 +30,193 @@ #define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include #include /** Global CPU partition table */ -struct cfs_cpt_table *cfs_cpt_table __read_mostly = NULL; +struct cfs_cpt_table *cfs_cpt_table __read_mostly; EXPORT_SYMBOL(cfs_cpt_table); -#ifndef HAVE_LIBCFS_CPT - -#define CFS_CPU_VERSION_MAGIC 0xbabecafe +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); -#define CFS_CPT_DISTANCE 1 /* Arbitrary positive value */ +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. + * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = "N"; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); +#ifdef CONFIG_SMP struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) { struct cfs_cpt_table *cptab; + int i; - if (ncpt != 1) { - CERROR("Can't support cpu partition number %d\n", ncpt); + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) return NULL; - } - LIBCFS_ALLOC(cptab, sizeof(*cptab)); - if (cptab != NULL) { - cptab->ctb_version = CFS_CPU_VERSION_MAGIC; - cpu_set(0, cptab->ctb_cpumask); - node_set(0, cptab->ctb_nodemask); - cptab->ctb_nparts = ncpt; + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + if (!cptab->ctb_cpumask) + goto failed_alloc_cpumask; + + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (!cptab->ctb_nodemask) + goto failed_alloc_nodemask; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + if (!cptab->ctb_cpu2cpt) + goto failed_alloc_cpu2cpt; + + memset(cptab->ctb_cpu2cpt, -1, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + if (!cptab->ctb_node2cpt) + goto failed_alloc_node2cpt; + + memset(cptab->ctb_node2cpt, -1, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (!cptab->ctb_parts) + goto failed_alloc_ctb_parts; + + memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0])); + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + if (!part->cpt_cpumask) + goto failed_setting_ctb_parts; + + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (!part->cpt_nodemask) + goto failed_setting_ctb_parts; + + 
LIBCFS_ALLOC(part->cpt_distance, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + if (!part->cpt_distance) + goto failed_setting_ctb_parts; + + memset(part->cpt_distance, -1, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); } return cptab; + +failed_setting_ctb_parts: + while (i-- >= 0) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } +failed_alloc_ctb_parts: + if (cptab->ctb_node2cpt) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } +failed_alloc_node2cpt: + if (cptab->ctb_cpu2cpt) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } +failed_alloc_cpu2cpt: + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); +failed_alloc_nodemask: + if (cptab->ctb_cpumask) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); +failed_alloc_cpumask: + LIBCFS_FREE(cptab, sizeof(*cptab)); + return NULL; } EXPORT_SYMBOL(cfs_cpt_table_alloc); void cfs_cpt_table_free(struct cfs_cpt_table *cptab) { - LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); + int i; + + if (cptab->ctb_cpu2cpt) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } + + if (cptab->ctb_node2cpt) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } + + for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); LIBCFS_FREE(cptab, sizeof(*cptab)); } @@ -78,80 +224,346 @@ EXPORT_SYMBOL(cfs_cpt_table_free); int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) { - int rc = 0; + char *tmp = buf; + int rc; + int i; + int j; - rc = snprintf(buf, len, "%d\t: %d\n", 0, 0); - len -= rc; - if (len <= 0) - return -EFBIG; + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; - return rc; + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, " %d", j); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; } EXPORT_SYMBOL(cfs_cpt_table_print); int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) { - int rc = 0; + char *tmp = buf; + int rc; + int i; + int j; - rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, CFS_CPT_DISTANCE); - len -= rc; - if (len <= 0) - return -EFBIG; + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto 
err; - return rc; + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for (j = 0; j < cptab->ctb_nparts; j++) { + rc = snprintf(tmp, len, " %d:%d", j, + cptab->ctb_parts[i].cpt_distance[j]); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; } EXPORT_SYMBOL(cfs_cpt_distance_print); int cfs_cpt_number(struct cfs_cpt_table *cptab) { - return 1; + return cptab->ctb_nparts; } EXPORT_SYMBOL(cfs_cpt_number); int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) { - return 1; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); } EXPORT_SYMBOL(cfs_cpt_weight); int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) { - return 1; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; } EXPORT_SYMBOL(cfs_cpt_online); cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) { - return &cptab->ctb_mask; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; } EXPORT_SYMBOL(cfs_cpt_cpumask); nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) { - return &cptab->ctb_nodemask; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; } EXPORT_SYMBOL(cfs_cpt_nodemask); -unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) { - return CFS_CPT_DISTANCE; + LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); + LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); + + if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) + return cptab->ctb_distance; + + return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; } EXPORT_SYMBOL(cfs_cpt_distance); +/* + * Calculate the maximum NUMA distance between all nodes in the + * from_mask and all nodes in the to_mask. 
+ */ +static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, + nodemask_t *to_mask) +{ + unsigned int maximum; + unsigned int distance; + int from; + int to; + + maximum = 0; + for_each_node_mask(from, *from_mask) { + for_each_node_mask(to, *to_mask) { + distance = node_distance(from, to); + if (maximum < distance) + maximum = distance; + } + } + return maximum; +} + +static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cptab->ctb_cpu2cpt[cpu] = cpt; + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); +} + +static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + + cptab->ctb_cpu2cpt[cpu] = -1; +} + +static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part; + + if (!node_isset(node, *cptab->ctb_nodemask)) { + unsigned int dist; + + /* first time node is added to the CPT table */ + node_set(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = cpt; + + dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + cptab->ctb_distance = dist; + } + + part = &cptab->ctb_parts[cpt]; + if (!node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* first time node is added to this CPT */ + node_set(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } +} + +static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int cpu; + + for_each_cpu(cpu, part->cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* No more CPUs in the node for this CPT. */ + node_clear(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + if (node_isset(node, *part2->cpt_nodemask)) + cptab->ctb_node2cpt[node] = cpt2; + + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } + + for_each_cpu(cpu, cptab->ctb_cpumask) { + /* this CPT-table has other CPUs belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { + /* No more CPUs in the table for this node. 
*/ + node_clear(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = -1; + cptab->ctb_distance = + cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + } +} + int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + return 1; } EXPORT_SYMBOL(cfs_cpt_set_cpu); void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, + "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in CPU partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); } EXPORT_SYMBOL(cfs_cpt_unset_cpu); int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask) { + int cpu; + + if (!cpumask_weight(mask) || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, + "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(cpu, mask) { + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + } + return 1; } EXPORT_SYMBOL(cfs_cpt_set_cpumask); @@ -159,23 +571,65 @@ EXPORT_SYMBOL(cfs_cpt_set_cpumask); void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask) { + int cpu; + + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } } EXPORT_SYMBOL(cfs_cpt_unset_cpumask); int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) { + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_add_cpu(cptab, cpt, cpu); + + cfs_cpt_add_node(cptab, cpt, node); + return 1; } EXPORT_SYMBOL(cfs_cpt_set_node); void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) { + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); + + cfs_cpt_del_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_node); int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); + return 1; } EXPORT_SYMBOL(cfs_cpt_set_nodemask); @@ -183,42 +637,674 @@ EXPORT_SYMBOL(cfs_cpt_set_nodemask); void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_nodemask); int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) { - return 0; + nodemask_t *mask; + int weight; + unsigned int rotor; + int node = 0; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; + } + + weight = nodes_weight(*mask); + if (weight > 0) { + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (!rotor--) + return node; + } + } + + return node; } EXPORT_SYMBOL(cfs_cpt_spread_node); int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) { - return 0; + int cpu; + int cpt; + + preempt_disable(); + cpu = smp_processor_id(); + cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0 && remap) { + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID + */ + cpt = cpu % cptab->ctb_nparts; + } + preempt_enable(); + return cpt; } EXPORT_SYMBOL(cfs_cpt_current); int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) { - return 0; + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; } EXPORT_SYMBOL(cfs_cpt_of_cpu); int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) { - return 0; + if (node < 0 || node > nr_node_ids) + return CFS_CPT_ANY; + + return cptab->ctb_node2cpt[node]; } EXPORT_SYMBOL(cfs_cpt_of_node); int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) { + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (!cpumask_intersects(cpumask, cpu_online_mask)) { + CDEBUG(D_INFO, + "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (!rc) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ return 0; } EXPORT_SYMBOL(cfs_cpt_bind); +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. 
+ */ +static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node_mask, int number) +{ + cpumask_t *socket_mask = NULL; + cpumask_t *core_mask = NULL; + int rc = 0; + int cpu; + int i; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket_mask, cpumask_size()); + LIBCFS_ALLOC(core_mask, cpumask_size()); + if (!socket_mask || !core_mask) { + rc = -ENOMEM; + goto out; + } + + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + + /* get cpumask for cores in the same socket */ + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { + /* get cpumask for hts in the same core */ + cpumask_and(core_mask, topology_sibling_cpumask(cpu), + node_mask); + + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); + + if (!cpu_online(i)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (!--number) + goto out; + } + cpu = cpumask_first(socket_mask); + } + } + +out: + if (core_mask) + LIBCFS_FREE(core_mask, cpumask_size()); + if (socket_mask) + LIBCFS_FREE(socket_mask, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4 + +static int cfs_cpt_num_estimate(void) +{ + int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + int ncpu = num_online_cpus(); + int ncpt = 1; + + if (ncpu > CPT_WEIGHT_MIN) + for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++) + ; /* nothing */ + +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory + */ + ncpt = min(2, ncpt); +#endif + while (ncpu % ncpt) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *node_mask = NULL; + int cpt = 0; + int node; + int num; + int rem; + int rc = 0; + + num = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = num; + + if (ncpt > num_online_cpus()) { + rc = -EINVAL; + CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n", + ncpt, num_online_cpus(), rc); + goto failed; + } + + if (ncpt > 4 * num) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, num); + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + rc = -ENOMEM; + goto failed; + } + + LIBCFS_ALLOC(node_mask, cpumask_size()); + if (!node_mask) { + CERROR("Failed to allocate scratch cpumask\n"); + rc = -ENOMEM; + goto failed; + } + + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); + + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + (rem > 0) + num - ncpu); + if (rc < 0) { + rc = -EINVAL; + goto failed_mask; + } + + ncpu = cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { + cpt++; + rem--; + } + } + } + + LIBCFS_FREE(node_mask, cpumask_size()); + 
+ return cptab; + +failed_mask: + if (node_mask) + LIBCFS_FREE(node_mask, cpumask_size()); +failed: + CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); + + if (cptab) + cfs_cpt_table_free(cptab); + + return ERR_PTR(rc); +} + +static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) +{ + struct cfs_cpt_table *cptab; + char *pattern_dup; + char *bracket; + char *str; + int node = 0; + int ncpt = 0; + int cpt = 0; + int high; + int rc; + int c; + int i; + + pattern_dup = kstrdup(pattern, GFP_KERNEL); + if (!pattern_dup) { + CERROR("Failed to duplicate pattern '%s'\n", pattern); + return ERR_PTR(-ENOMEM); + } + + str = cfs_trimwhite(pattern_dup); + if (*str == 'n' || *str == 'N') { + str++; /* skip 'N' char */ + node = 1; /* NUMA pattern */ + if (*str == '\0') { + node = -1; + for_each_online_node(i) { + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } + if (ncpt == 1) { /* single NUMA node */ + kfree(pattern_dup); + return cfs_cpt_table_create(cpu_npartitions); + } + } + } + + if (!ncpt) { /* scanning bracket which is mark of partition */ + bracket = str; + while ((bracket = strchr(bracket, '['))) { + bracket++; + ncpt++; + } + } + + if (!ncpt || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU partition table\n"); + rc = -ENOMEM; + goto err_free_str; + } + + if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ + for_each_online_node(i) { + if (cpumask_empty(cpumask_of_node(i))) + continue; + + rc = cfs_cpt_set_node(cptab, cpt++, i); + if (!rc) { + rc = -EINVAL; + goto err_free_table; + } + } + kfree(pattern_dup); + return cptab; + } + + high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; + + for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + int n; + + bracket = strchr(str, '['); + if (!bracket) { + if (*str) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } else if (c != ncpt) { + CERROR("Expect %d partitions but found %d\n", + ncpt, c); + rc = -EINVAL; + goto err_free_table; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid CPU pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + rc = -EINVAL; + goto err_free_table; + } + + if (cfs_cpt_weight(cptab, cpt)) { + CERROR("Partition %d has already been set.\n", cpt); + rc = -EPERM; + goto err_free_table; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + bracket = strchr(str, ']'); + if (!bracket) { + CERROR("Missing right bracket for partition %d in '%s'\n", + cpt, str); + rc = -EINVAL; + goto err_free_table; + } + + rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, + &el); + if (rc) { + CERROR("Can't parse number range in '%s'\n", str); + rc = -ERANGE; + goto err_free_table; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride) + continue; + + rc = node ? 
cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + rc = -EINVAL; + goto err_free_table; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + rc = -ENODEV; + goto err_free_table; + } + + str = cfs_trimwhite(bracket + 1); + } + + kfree(pattern_dup); + return cptab; + +err_free_table: + cfs_cpt_table_free(cptab); +err_free_str: + kfree(pattern_dup); + return ERR_PTR(rc); +} + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE +static enum cpuhp_state lustre_cpu_online; + +static int cfs_cpu_online(unsigned int cpu) +{ + return 0; +} +#endif + +static int cfs_cpu_dead(unsigned int cpu) +{ + bool warn; + + /* if all HTs in a core are offline, it may break affinity */ + warn = cpumask_any_and(topology_sibling_cpumask(cpu), + cpu_online_mask) >= nr_cpu_ids; + CDEBUG(warn ? D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", + cpu); + return 0; +} + +#ifndef HAVE_HOTPLUG_STATE_MACHINE +static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + cfs_cpu_dead(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + +void cfs_cpu_fini(void) +{ + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ +} + +int cfs_cpu_init(void) +{ + int ret; + + LASSERT(!cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, + "fs/lustre/cfe:dead", NULL, + cfs_cpu_dead); + if (ret < 0) + goto failed_cpu_dead; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/lustre/cfe:online", + cfs_cpu_online, NULL); + if (ret < 0) + goto failed_cpu_online; + + lustre_cpu_online = ret; +#else + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + + get_online_cpus(); + if (*cpu_pattern) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab from pattern '%s'\n", + cpu_pattern); + ret = PTR_ERR(cfs_cpt_table); + goto failed_alloc_table; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab with npartitions %d\n", + cpu_npartitions); + ret = PTR_ERR(cfs_cpt_table); + goto failed_alloc_table; + } + } + + put_online_cpus(); + + LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", + num_online_nodes(), num_online_cpus(), + cfs_cpt_number(cfs_cpt_table)); + return 0; + +failed_alloc_table: + put_online_cpus(); + + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + 
cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); +failed_cpu_online: + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +failed_cpu_dead: +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + return ret; +} + +#else /* ! CONFIG_SMP */ + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) + return NULL; + + cpumask_set_cpu(0, cptab->ctb_cpumask); + node_set(0, cptab->ctb_nodemask); + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0:1\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + void cfs_cpu_fini(void) { - if (cfs_cpt_table != NULL) { + if (cfs_cpt_table) { cfs_cpt_table_free(cfs_cpt_table); cfs_cpt_table = NULL; } @@ -228,7 +1314,7 @@ int cfs_cpu_init(void) { cfs_cpt_table = cfs_cpt_table_alloc(1); - return cfs_cpt_table != NULL ? 0 : -1; + return cfs_cpt_table ? 0 : -1; } -#endif /* HAVE_LIBCFS_CPT */ +#endif /* !CONFIG_SMP */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c index 2f401e74a7dd7..5f85219101eb0 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -33,7 +33,6 @@ #define DEBUG_SUBSYSTEM S_LNET -#include #include struct cfs_var_array { @@ -172,9 +171,12 @@ cfs_array_alloc(int count, unsigned int size) } EXPORT_SYMBOL(cfs_array_alloc); +#ifdef HAVE_LIBCFS_VFREE_ATOMIC +#include /* * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with - * minimum changes needed to work on older kernels too. + * minimum changes needed to work on some older kernels too. + * For RHEL6, just use vfree() directly since it is missing too much code. 
*/ #ifndef raw_cpu_ptr @@ -183,12 +185,12 @@ EXPORT_SYMBOL(cfs_array_alloc); #ifndef llist_for_each_safe #define llist_for_each_safe(pos, n, node) \ - for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) #endif struct vfree_deferred { - struct llist_head list; - struct work_struct wq; + struct llist_head list; + struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); @@ -230,3 +232,4 @@ void __exit exit_libcfs_vfree_atomic(void) { flush_scheduled_work(); } +#endif /* HAVE_LIBCFS_VFREE_ATOMIC */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c deleted file mode 100644 index 9786288cbad50..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c +++ /dev/null @@ -1,478 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_UNDEFINED - -#include -#include - -/** - * This API based on Linux kernel padada API which is used to perform - * encryption and decryption on large numbers of packets without - * reordering those packets. - * - * It was adopted for general use in Lustre for parallelization of - * various functionality. - * - * The first step in using it is to set up a cfs_ptask structure to - * control of how this task are to be run: - * - * #include - * - * int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, - * void *cbdata, unsigned int flags, int cpu); - * - * The cbfunc function with cbdata argument will be called in the process - * of getting the task done. The cpu specifies which CPU will be used for - * the final callback when the task is done. - * - * The submission of task is done with: - * - * int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine); - * - * The task is submitted to the engine for execution. - * - * In order to wait for result of task execution you should call: - * - * int cfs_ptask_wait_for(struct cfs_ptask *ptask); - * - * The tasks with flag PTF_ORDERED are executed in parallel but complete - * into submission order. So, waiting for last ordered task you can be sure - * that all previous tasks were done before this task complete. - */ - -#ifndef HAVE_REINIT_COMPLETION -/** - * reinit_completion - reinitialize a completion structure - * @x: pointer to completion structure that is to be reinitialized - * - * This inline function should be used to reinitialize a completion - * structure so it can be reused. This is especially important after - * complete_all() is used. 
- */ -static inline void reinit_completion(struct completion *x) -{ - x->done = 0; -} -#endif - -#ifndef HAVE_CPUMASK_PRINT_TO_PAGEBUF -static inline void cpumap_print_to_pagebuf(bool unused, char *buf, - const struct cpumask *mask) -{ - cpulist_scnprintf(buf, PAGE_SIZE, mask); -} -#endif - -#ifdef CONFIG_PADATA -static void cfs_ptask_complete(struct padata_priv *padata) -{ - struct cfs_ptask *ptask = cfs_padata2ptask(padata); - - if (cfs_ptask_need_complete(ptask)) { - if (cfs_ptask_is_ordered(ptask)) - complete(&ptask->pt_completion); - } else if (cfs_ptask_is_autofree(ptask)) { - kfree(ptask); - } -} - -static void cfs_ptask_execute(struct padata_priv *padata) -{ - struct cfs_ptask *ptask = cfs_padata2ptask(padata); - bool bh_enabled = false; - - if (!cfs_ptask_is_atomic(ptask)) { - local_bh_enable(); - bh_enabled = true; - } - - if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - kthread_use_mm(ptask->pt_mm); - } - - if (ptask->pt_cbfunc != NULL) - ptask->pt_result = ptask->pt_cbfunc(ptask); - else - ptask->pt_result = -ENOSYS; - - if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - kthread_unuse_mm(ptask->pt_mm); - mmput(ptask->pt_mm); - ptask->pt_mm = NULL; - } - - if (cfs_ptask_need_complete(ptask) && !cfs_ptask_is_ordered(ptask)) - complete(&ptask->pt_completion); - - if (bh_enabled) - local_bh_disable(); - - padata_do_serial(padata); -} - -static int cfs_do_parallel(struct cfs_ptask_engine *engine, - struct padata_priv *padata) -{ - struct cfs_ptask *ptask = cfs_padata2ptask(padata); - int rc; - - if (cfs_ptask_need_complete(ptask)) - reinit_completion(&ptask->pt_completion); - - if (cfs_ptask_use_user_mm(ptask)) { - ptask->pt_mm = get_task_mm(current); - } - ptask->pt_result = -EINPROGRESS; - -retry: - rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); - if (rc == -EBUSY && cfs_ptask_is_retry(ptask)) { - /* too many tasks already in queue */ - schedule_timeout_uninterruptible(1); - goto retry; - } - - if (rc) { - if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - mmput(ptask->pt_mm); - ptask->pt_mm = NULL; - } - ptask->pt_result = rc; - } - - return rc; -} - -/** - * This function submit initialized task for async execution - * in engine with specified id. - */ -int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) -{ - struct padata_priv *padata = cfs_ptask2padata(ptask); - - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - memset(padata, 0, sizeof(*padata)); - - padata->parallel = cfs_ptask_execute; - padata->serial = cfs_ptask_complete; - - return cfs_do_parallel(engine, padata); -} - -#else /* !CONFIG_PADATA */ - -/** - * If CONFIG_PADATA is not defined this function just execute - * the initialized task in current thread. (emulate async execution) - */ -int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) -{ - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - if (ptask->pt_cbfunc != NULL) - ptask->pt_result = ptask->pt_cbfunc(ptask); - else - ptask->pt_result = -ENOSYS; - - if (cfs_ptask_need_complete(ptask)) - complete(&ptask->pt_completion); - else if (cfs_ptask_is_autofree(ptask)) - kfree(ptask); - - return 0; -} -#endif /* CONFIG_PADATA */ - -EXPORT_SYMBOL(cfs_ptask_submit); - -/** - * This function waits when task complete async execution. - * The tasks with flag PTF_ORDERED are executed in parallel but completes - * into submission order. So, waiting for last ordered task you can be sure - * that all previous tasks were done before this task complete. 
- */ -int cfs_ptask_wait_for(struct cfs_ptask *ptask) -{ - if (!cfs_ptask_need_complete(ptask)) - return -EINVAL; - - wait_for_completion(&ptask->pt_completion); - - return 0; -} -EXPORT_SYMBOL(cfs_ptask_wait_for); - -/** - * This function initialize internal members of task and prepare it for - * async execution. - */ -int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, void *cbdata, - unsigned int flags, int cpu) -{ - memset(ptask, 0, sizeof(*ptask)); - - ptask->pt_flags = flags; - ptask->pt_cbcpu = cpu; - ptask->pt_mm = NULL; /* will be set in cfs_do_parallel() */ - ptask->pt_cbfunc = cbfunc; - ptask->pt_cbdata = cbdata; - ptask->pt_result = -EAGAIN; - - if (cfs_ptask_need_complete(ptask)) { - if (cfs_ptask_is_autofree(ptask)) - return -EINVAL; - - init_completion(&ptask->pt_completion); - } - - if (cfs_ptask_is_atomic(ptask) && cfs_ptask_use_user_mm(ptask)) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL(cfs_ptask_init); - -/** - * This function set the mask of allowed CPUs for parallel execution - * for engine with specified id. - */ -int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *engine, - const struct cpumask *cpumask) -{ - int rc = 0; - -#ifdef CONFIG_PADATA - cpumask_var_t serial_mask; - cpumask_var_t parallel_mask; - - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - if (!alloc_cpumask_var(&serial_mask, GFP_KERNEL)) - return -ENOMEM; - - if (!alloc_cpumask_var(¶llel_mask, GFP_KERNEL)) { - free_cpumask_var(serial_mask); - return -ENOMEM; - } - - cpumask_copy(parallel_mask, cpumask); - cpumask_copy(serial_mask, cpu_online_mask); - - rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_PARALLEL, - parallel_mask); - free_cpumask_var(parallel_mask); - if (rc) - goto out_failed_mask; - - rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_SERIAL, - serial_mask); -out_failed_mask: - free_cpumask_var(serial_mask); -#endif /* CONFIG_PADATA */ - - return rc; -} -EXPORT_SYMBOL(cfs_ptengine_set_cpumask); - -/** - * This function returns the count of allowed CPUs for parallel execution - * for engine with specified id. 
- */ -int cfs_ptengine_weight(struct cfs_ptask_engine *engine) -{ - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - return engine->pte_weight; -} -EXPORT_SYMBOL(cfs_ptengine_weight); - -#ifdef CONFIG_PADATA -static int cfs_ptask_cpumask_change_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct padata_cpumask *padata_cpumask = data; - struct cfs_ptask_engine *engine; - - engine = container_of(self, struct cfs_ptask_engine, pte_notifier); - - if (val & PADATA_CPU_PARALLEL) - engine->pte_weight = cpumask_weight(padata_cpumask->pcpu); - - return 0; -} - -static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, - const char *name, - const struct cpumask *cpumask) -{ - cpumask_var_t all_mask; - cpumask_var_t par_mask; - unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE; - int rc; - - get_online_cpus(); - - engine->pte_wq = alloc_workqueue(name, wq_flags, 1); - if (engine->pte_wq == NULL) - GOTO(err, rc = -ENOMEM); - - if (!alloc_cpumask_var(&all_mask, GFP_KERNEL)) - GOTO(err_destroy_workqueue, rc = -ENOMEM); - - if (!alloc_cpumask_var(&par_mask, GFP_KERNEL)) - GOTO(err_free_all_mask, rc = -ENOMEM); - - cpumask_copy(par_mask, cpumask); - if (cpumask_empty(par_mask) || - cpumask_equal(par_mask, cpu_online_mask)) { - cpumask_copy(all_mask, cpu_online_mask); - cpumask_clear(par_mask); - while (!cpumask_empty(all_mask)) { - int cpu = cpumask_first(all_mask); - - cpumask_set_cpu(cpu, par_mask); - cpumask_andnot(all_mask, all_mask, - topology_sibling_cpumask(cpu)); - } - } - - cpumask_copy(all_mask, cpu_online_mask); - - { - char *pa_mask_buff, *cb_mask_buff; - - pa_mask_buff = (char *)__get_free_page(GFP_KERNEL); - if (pa_mask_buff == NULL) - GOTO(err_free_par_mask, rc = -ENOMEM); - - cb_mask_buff = (char *)__get_free_page(GFP_KERNEL); - if (cb_mask_buff == NULL) { - free_page((unsigned long)pa_mask_buff); - GOTO(err_free_par_mask, rc = -ENOMEM); - } - - cpumap_print_to_pagebuf(true, pa_mask_buff, par_mask); - pa_mask_buff[PAGE_SIZE - 1] = '\0'; - cpumap_print_to_pagebuf(true, cb_mask_buff, all_mask); - cb_mask_buff[PAGE_SIZE - 1] = '\0'; - - CDEBUG(D_INFO, "%s weight=%u plist='%s' cblist='%s'\n", - name, cpumask_weight(par_mask), - pa_mask_buff, cb_mask_buff); - - free_page((unsigned long)cb_mask_buff); - free_page((unsigned long)pa_mask_buff); - } - - engine->pte_weight = cpumask_weight(par_mask); - engine->pte_pinst = padata_alloc_possible(engine->pte_wq); - if (engine->pte_pinst == NULL) - GOTO(err_free_par_mask, rc = -ENOMEM); - - engine->pte_notifier.notifier_call = cfs_ptask_cpumask_change_notify; - rc = padata_register_cpumask_notifier(engine->pte_pinst, - &engine->pte_notifier); - if (rc) - GOTO(err_free_padata, rc); - - rc = cfs_ptengine_set_cpumask(engine, par_mask); - if (rc) - GOTO(err_unregister, rc); - - rc = padata_start(engine->pte_pinst); - if (rc) - GOTO(err_unregister, rc); - - free_cpumask_var(par_mask); - free_cpumask_var(all_mask); - - put_online_cpus(); - return 0; - -err_unregister: - padata_unregister_cpumask_notifier(engine->pte_pinst, - &engine->pte_notifier); -err_free_padata: - padata_free(engine->pte_pinst); -err_free_par_mask: - free_cpumask_var(par_mask); -err_free_all_mask: - free_cpumask_var(all_mask); -err_destroy_workqueue: - destroy_workqueue(engine->pte_wq); -err: - put_online_cpus(); - return rc; -} - -static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) -{ - padata_stop(engine->pte_pinst); - padata_unregister_cpumask_notifier(engine->pte_pinst, - &engine->pte_notifier); - 
padata_free(engine->pte_pinst); - destroy_workqueue(engine->pte_wq); -} - -#else /* !CONFIG_PADATA */ - -static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, - const char *name, - const struct cpumask *cpumask) -{ - engine->pte_weight = 1; - - return 0; -} - -static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) -{ -} -#endif /* CONFIG_PADATA */ - -struct cfs_ptask_engine *cfs_ptengine_init(const char *name, - const struct cpumask *cpumask) -{ - struct cfs_ptask_engine *engine; - int rc; - - engine = kzalloc(sizeof(*engine), GFP_KERNEL); - if (engine == NULL) - GOTO(err, rc = -ENOMEM); - - rc = cfs_ptengine_padata_init(engine, name, cpumask); - if (rc) - GOTO(err_free_engine, rc); - - return engine; - -err_free_engine: - kfree(engine); -err: - return ERR_PTR(rc); -} -EXPORT_SYMBOL(cfs_ptengine_init); - -void cfs_ptengine_fini(struct cfs_ptask_engine *engine) -{ - if (IS_ERR_OR_NULL(engine)) - return; - - cfs_ptengine_padata_fini(engine); - kfree(engine); -} -EXPORT_SYMBOL(cfs_ptengine_fini); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c index 04e1dd56dd430..b460df3c4d9bc 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,9 @@ * Author: Nathan Rutman */ +#include #include +#include char *cfs_strrstr(const char *haystack, const char *needle) { @@ -252,17 +254,47 @@ int cfs_str2num_check(char *str, int nob, unsigned *num, unsigned min, unsigned max) { - char *endp; - - *num = simple_strtoul(str, &endp, 0); - if (endp == str) - return 0; + bool all_numbers = true; + char *endp, cache; + int len; + int rc; + + endp = strim(str); + /** + * kstrouint can only handle strings composed + * of only numbers. We need to scan the string + * passed in for the first non-digit character + * and end the string at that location. If we + * don't find any non-digit character we still + * need to place a '\0' at position len since + * we are not interested in the rest of the + * string which is longer than len in size. + * After we are done the character at the + * position we placed '\0' must be restored. 
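The comment above (closed and implemented just below) describes the parsing strategy of the reworked cfs_str2num_check(): cut the string at the first non-numeric character, hand the prefix to a strict integer parser, then restore the byte that was overwritten. As an illustration only, not part of the patch, here is a standalone userspace sketch of that technique; str2num_check() is a hypothetical name and strtoul stands in for the kernel's kstrtouint:

#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse the leading decimal number in 'str' (at most 'nob' bytes) by
 * temporarily NUL-terminating at the first non-digit and restoring the
 * byte afterwards.  Returns true and stores the value if it lies in
 * [min, max]. */
static bool str2num_check(char *str, int nob, unsigned int *num,
			  unsigned int min, unsigned int max)
{
	char *end = str;
	char *parse_end;
	char saved;
	unsigned long val;
	int len = (int)strlen(str);

	if (len > nob)
		len = nob;

	while (end < str + len && isdigit((unsigned char)*end))
		end++;

	saved = *end;
	*end = '\0';

	errno = 0;
	val = strtoul(str, &parse_end, 10);
	*end = saved;			/* restore the clobbered byte */

	if (errno || parse_end == str || val < min || val > max)
		return false;

	*num = (unsigned int)val;
	return true;
}

int main(void)
{
	char buf[] = "4096k";
	unsigned int n = 0;

	if (str2num_check(buf, sizeof(buf) - 1, &n, 1, 65536))
		printf("parsed %u\n", n);	/* parses the leading 4096 */
	return 0;
}

The sketch only accepts decimal digits; the patched function additionally tolerates hex digits, signs and trailing whitespace, so treat this as the shape of the idea rather than the exact behaviour.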
+ */ + len = min((int)strlen(endp), nob); + for (; endp < str + len; endp++) { + if (!isxdigit(*endp) && *endp != '-' && + *endp != '+') { + all_numbers = false; + break; + } + } - for (; endp < str + nob; endp++) { - if (!isspace(*endp)) - return 0; + /* Eat trailing space */ + if (!all_numbers && isspace(*endp)) { + all_numbers = true; + endp--; } + cache = *endp; + *endp = '\0'; + + rc = kstrtouint(str, 0, num); + *endp = cache; + if (rc || !all_numbers) + return 0; + return (*num >= min && *num <= max); } EXPORT_SYMBOL(cfs_str2num_check); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c index 0f507d555e603..7a19a5803ee8c 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -116,7 +116,9 @@ static struct shash_alg alg = { .cra_name = "adler32", .cra_driver_name = "adler32-zlib", .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c index c20e5e9a8194b..c794e670ecfd9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -65,6 +65,7 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; + *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -128,7 +129,9 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-table", .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c index 5262f071b8a7a..566ba882ede82 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -63,6 +63,7 @@ static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; + *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -131,7 +132,9 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-pclmul", .cra_priority = 150, +#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c index 4ad3b7c310037..8d4cb640681f8 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -102,6 +102,7 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; + *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -167,6 +168,9 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-pclmul", .cra_priority = 200, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags 
= CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c index 1991a86a49598..dce1734a4d500 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -77,13 +78,27 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, int err = 0; *type = cfs_crypto_hash_type(hash_alg); - - if (*type == NULL) { + if (!*type) { CWARN("Unsupported hash algorithm id = %d, max id is %d\n", hash_alg, CFS_HASH_ALG_MAX); return -EINVAL; } - tfm = crypto_alloc_ahash((*type)->cht_name, 0, CRYPTO_ALG_ASYNC); + + /* Keys are only supported for the hmac version */ + if (key && key_len > 0) { + char *algo_name; + + algo_name = kasprintf(GFP_KERNEL, "hmac(%s)", + (*type)->cht_name); + if (!algo_name) + return -ENOMEM; + + tfm = crypto_alloc_ahash(algo_name, 0, CRYPTO_ALG_ASYNC); + kfree(algo_name); + } else { + tfm = crypto_alloc_ahash((*type)->cht_name, 0, + CRYPTO_ALG_ASYNC); + } if (IS_ERR(tfm)) { CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", (*type)->cht_name); @@ -94,8 +109,7 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, if (!*req) { CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", (*type)->cht_name); - crypto_free_ahash(tfm); - return -ENOMEM; + GOTO(out_free_tfm, err = -ENOMEM); } ahash_request_set_callback(*req, 0, NULL, NULL); @@ -106,12 +120,8 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_setkey(tfm, (unsigned char *)&((*type)->cht_key), (*type)->cht_size); - - if (err != 0) { - ahash_request_free(*req); - crypto_free_ahash(tfm); - return err; - } + if (err) + GOTO(out_free_req, err); CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), @@ -119,7 +129,9 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_init(*req); if (err) { +out_free_req: ahash_request_free(*req); +out_free_tfm: crypto_free_ahash(tfm); } return err; @@ -195,10 +207,10 @@ EXPORT_SYMBOL(cfs_crypto_hash_digest); * use default initial value * \param[in] key_len length of \a key in bytes * - * \retval pointer to descriptor of hash instance + * \retval pointer to ahash request * \retval ERR_PTR(errno) in case of error */ -struct cfs_crypto_hash_desc * +struct ahash_request * cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, unsigned char *key, unsigned int key_len) { @@ -209,14 +221,14 @@ struct cfs_crypto_hash_desc * err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); if (err) return ERR_PTR(err); - return (struct cfs_crypto_hash_desc *)req; + return req; } EXPORT_SYMBOL(cfs_crypto_hash_init); /** * Update hash digest computed on data within the given \a page * - * \param[in] hdesc hash state descriptor + * \param[in] req ahash request * \param[in] page data page on which to compute the hash * \param[in] offset offset within \a page at which to start hash * \param[in] len length of data on which to compute hash @@ -224,11 +236,10 @@ EXPORT_SYMBOL(cfs_crypto_hash_init); * \retval 0 for success * \retval negative errno on failure */ -int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, +int cfs_crypto_hash_update_page(struct ahash_request *req, struct page 
*page, unsigned int offset, unsigned int len) { - struct ahash_request *req = (void *)hdesc; struct scatterlist sl; sg_init_table(&sl, 1); @@ -242,17 +253,16 @@ EXPORT_SYMBOL(cfs_crypto_hash_update_page); /** * Update hash digest computed on the specified data * - * \param[in] hdesc hash state descriptor + * \param[in] req ahash request * \param[in] buf data buffer on which to compute the hash * \param[in] buf_len length of \buf on which to compute hash * * \retval 0 for success * \retval negative errno on failure */ -int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, +int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, unsigned int buf_len) { - struct ahash_request *req = (void *)hdesc; struct scatterlist sl; sg_init_one(&sl, (void *)buf, buf_len); @@ -265,7 +275,7 @@ EXPORT_SYMBOL(cfs_crypto_hash_update); /** * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor * - * \param[in] hdesc hash descriptor + * \param[in] req ahash request * \param[out] hash pointer to hash buffer to store hash digest * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL * or hash_len == NULL only free \a hdesc instead @@ -275,10 +285,9 @@ EXPORT_SYMBOL(cfs_crypto_hash_update); * \retval -EOVERFLOW if hash_len is too small for the hash digest * \retval negative errno for other errors from lower layers */ -int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, +int cfs_crypto_hash_final(struct ahash_request *req, unsigned char *hash, unsigned int *hash_len) { - struct ahash_request *req = (void *)hdesc; int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); int err; @@ -313,6 +322,9 @@ EXPORT_SYMBOL(cfs_crypto_hash_final); * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and * is available through the cfs_crypto_hash_speed() function. * + * This function needs to stay the same as obd_t10_performance_test() so that + * the speeds are comparable. + * * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) * \param[in] buf data buffer on which to compute the hash * \param[in] buf_len length of \buf on which to compute hash @@ -340,23 +352,23 @@ static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; int i; - hdesc = cfs_crypto_hash_init(hash_alg, NULL, 0); - if (IS_ERR(hdesc)) { - err = PTR_ERR(hdesc); + req = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); break; } for (i = 0; i < buf_len / PAGE_SIZE; i++) { - err = cfs_crypto_hash_update_page(hdesc, page, 0, + err = cfs_crypto_hash_update_page(req, page, 0, PAGE_SIZE); if (err != 0) break; } - err = cfs_crypto_hash_final(hdesc, hash, &hash_len); + err = cfs_crypto_hash_final(req, hash, &hash_len); if (err != 0) break; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c index cd00d0ae5717f..799c40ea638ec 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,8 +37,12 @@ */ #include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif #include - +#include #include #include @@ -149,9 +153,7 @@ static int cfs_access_process_vm(struct task_struct *tsk, int bytes, rc, offset; void *maddr; -#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS) - rc = get_user_pages(tsk, mm, addr, 1, write ? FOLL_WRITE : 0, &page, &vma); -#elif defined(HAVE_GET_USER_PAGES_GUP_FLAGS) +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page, &vma); #elif defined(HAVE_GET_USER_PAGES_6ARG) rc = get_user_pages(addr, 1, write, 1, &page, &vma); @@ -254,15 +256,22 @@ int cfs_get_environ(const char *key, char *value, int *val_len) entry = env_start; entry_len = env_end - env_start; + CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); /* Key length + length of '=' */ if (entry_len > key_len + 1 && + entry[key_len] == '=' && !memcmp(entry, key, key_len)) { entry += key_len + 1; entry_len -= key_len + 1; - /* The 'value' buffer passed in is too small.*/ - if (entry_len >= *val_len) + + /* The 'value' buffer passed in is too small. + * Copy what fits, but return -EOVERFLOW. */ + if (entry_len >= *val_len) { + memcpy(value, entry, *val_len); + value[*val_len - 1] = 0; GOTO(out, rc = -EOVERFLOW); + } memcpy(value, entry, entry_len); *val_len = entry_len; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c index 048b2f34df5ba..f1701f47d334a 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
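The cfs_get_environ() hunk above changes the too-small-buffer case from an outright failure into "copy what fits, NUL-terminate, and still return -EOVERFLOW", so callers get a usable truncated value plus an error they can act on. A small standalone sketch of that convention (plain userspace C with a hypothetical copy_env_value() helper, not the kernel code itself):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Copy 'entry' into 'value' (capacity 'val_len' bytes).  On truncation the
 * caller still gets a NUL-terminated prefix, but -EOVERFLOW signals that the
 * buffer was too small -- the convention the patched cfs_get_environ() uses. */
static int copy_env_value(char *value, size_t val_len, const char *entry)
{
	size_t entry_len = strlen(entry);

	if (val_len == 0)
		return -EOVERFLOW;

	if (entry_len >= val_len) {
		memcpy(value, entry, val_len - 1);
		value[val_len - 1] = '\0';
		return -EOVERFLOW;
	}

	memcpy(value, entry, entry_len + 1);
	return 0;
}

int main(void)
{
	char small[8];

	if (copy_env_value(small, sizeof(small), "/usr/local/lib") == -EOVERFLOW)
		printf("truncated to \"%s\"\n", small);	/* "/usr/lo" */
	return 0;
}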
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,6 @@ #include #include -#include #include #include #include @@ -115,6 +114,28 @@ EXPORT_SYMBOL(lbug_with_loc); #ifdef CONFIG_STACKTRACE +#ifndef HAVE_SAVE_STACK_TRACE_TSK +#define save_stack_trace_tsk(tsk, trace) \ +do { \ + if (tsk == current) \ + save_stack_trace(trace); \ + else \ + pr_info("No stack, save_stack_trace_tsk() not exported\n"); \ +} while (0) +#endif + +static void cfs_print_stack_trace(unsigned long *entries, unsigned int nr) +{ + unsigned int i; + + /* Prefer %pB for backtraced symbolic names since it was added in: + * Linux v2.6.38-6557-g0f77a8d37825 + * vsprintf: Introduce %pB format specifier + */ + for (i = 0; i < nr; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); +} + #define MAX_ST_ENTRIES 100 static DEFINE_SPINLOCK(st_lock); @@ -130,11 +151,20 @@ typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, static stack_trace_save_tsk_t *task_dump_stack; #endif -static void libcfs_call_trace(struct task_struct *tsk) +void __init cfs_debug_init(void) { #ifdef CONFIG_ARCH_STACKWALK + task_dump_stack = (void *) + cfs_kallsyms_lookup_name("stack_trace_save_tsk"); + +#endif +} + +static void libcfs_call_trace(struct task_struct *tsk) +{ static unsigned long entries[MAX_ST_ENTRIES]; - unsigned int i, nr_entries; +#ifdef CONFIG_ARCH_STACKWALK + unsigned int nr_entries; if (!task_dump_stack) task_dump_stack = (stack_trace_save_tsk_t *) @@ -146,13 +176,11 @@ static void libcfs_call_trace(struct task_struct *tsk) pr_info("Call Trace TBD:\n"); if (task_dump_stack) { nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); - for (i = 0; i < nr_entries; i++) - pr_info("[<0>] %pB\n", (void *)entries[i]); + cfs_print_stack_trace(entries, nr_entries); } spin_unlock(&st_lock); #else struct stack_trace trace; - static unsigned long entries[MAX_ST_ENTRIES]; trace.nr_entries = 0; trace.max_entries = MAX_ST_ENTRIES; @@ -164,11 +192,7 @@ static void libcfs_call_trace(struct task_struct *tsk) init_utsname()->release, init_utsname()->version); pr_info("Call Trace:\n"); save_stack_trace_tsk(tsk, &trace); -#ifdef HAVE_STACK_TRACE_PRINT - stack_trace_print(trace.entries, trace.nr_entries, 0); -#else - print_stack_trace(&trace, 0); -#endif + cfs_print_stack_trace(trace.entries, trace.nr_entries); spin_unlock(&st_lock); #endif } @@ -270,12 +294,6 @@ void libcfs_debug_dumpstack(struct task_struct *tsk) } EXPORT_SYMBOL(libcfs_debug_dumpstack); -struct task_struct *libcfs_current(void) -{ - CWARN("current task struct is %p\n", current); - return current; -} - static int panic_notifier(struct notifier_block *self, unsigned long unused1, void *unused2) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c new file mode 100644 index 0000000000000..e4e67c20cee5d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_STRINGHASH +#include +#else +#include +#endif +#include + +#include + +/* Return the "hash_len" (hash and length) of a null-terminated string */ +/* The kernel equivalent is in fs/namei.c but for some strange reason + * RHEL7.5 stuck it in dax/super.c instead. This placement never existed + * upstream so to make life easier we just have the equavilent + */ +u64 cfs_hashlen_string(const void *salt, const char *name) +{ +#ifdef HAVE_FULL_NAME_HASH_3ARGS + unsigned long hash = init_name_hash(salt); +#else + unsigned long hash = init_name_hash(); +#endif + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(cfs_hashlen_string); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c index 839f9324ac5ca..7300af8018c69 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,7 +32,9 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include +#include #include static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c index 4b73ed6e79a93..2ee18be5e59a6 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -36,13 +36,18 @@ #include #include #include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif #include -#include #if defined(CONFIG_KGDB) #include #endif +#include + #ifndef HAVE_KTIME_GET_TS64 void ktime_get_ts64(struct timespec64 *ts) { @@ -97,17 +102,17 @@ time64_t ktime_get_seconds(void) EXPORT_SYMBOL(ktime_get_seconds); #endif /* HAVE_KTIME_GET_SECONDS */ -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -static char **cfs_lsm_names; +static int (*cfs_apply_workqueue_attrs_t)(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); -bool selinux_is_enabled(void) +int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) { - if (cfs_lsm_names) - return !!strstr("selinux", *cfs_lsm_names); - return false; + if (cfs_apply_workqueue_attrs_t) + return cfs_apply_workqueue_attrs_t(wq, attrs); + return 0; } -EXPORT_SYMBOL(selinux_is_enabled); -#endif +EXPORT_SYMBOL_GPL(cfs_apply_workqueue_attrs); int cfs_kernel_write(struct file *filp, const void *buf, size_t count, loff_t *pos) @@ -127,6 +132,43 @@ int cfs_kernel_write(struct file *filp, const void *buf, size_t count, } EXPORT_SYMBOL(cfs_kernel_write); +#ifndef HAVE_KSET_FIND_OBJ +struct kobject *kset_find_obj(struct kset *kset, const char *name) +{ + struct kobject 
*ret = NULL; + struct kobject *k; + + spin_lock(&kset->list_lock); + + list_for_each_entry(k, &kset->list, entry) { + if (kobject_name(k) && !strcmp(kobject_name(k), name)) { + if (kref_get_unless_zero(&k->kref)) + ret = k; + break; + } + } + + spin_unlock(&kset->list_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kset_find_obj); +#endif + +#ifndef HAVE_KSTRTOBOOL_FROM_USER +int kstrtobool_from_user(const char __user *s, size_t count, bool *res) +{ + /* Longest string needed to differentiate, newline, terminator */ + char buf[4]; + + count = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, s, count)) + return -EFAULT; + buf[count] = '\0'; + return strtobool(buf, res); +} +EXPORT_SYMBOL(kstrtobool_from_user); +#endif /* !HAVE_KSTRTOBOOL_FROM_USER */ + sigset_t cfs_block_allsigs(void) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c index e0fd4c0de04f1..9685296266f04 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -33,6 +33,7 @@ #define DEBUG_SUBSYSTEM S_LNET #define LUSTRE_TRACEFILE_PRIVATE +#include #include #include "tracefile.h" diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c new file mode 100644 index 0000000000000..5843d808bc332 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c @@ -0,0 +1,115 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT + +#define __add_wait_queue_entry_tail __add_wait_queue_tail + +long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state) +{ + unsigned long flags; + long ret = 0; + + spin_lock_irqsave(&wq_head->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same wq_head->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. 
+ */ + list_del_init(&wq_entry->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); + else + __add_wait_queue(wq_head, wq_entry); + } + set_current_state(state); + } + spin_unlock_irqrestore(&wq_head->lock, flags); + + return ret; +} +EXPORT_SYMBOL(prepare_to_wait_event); +#endif /* !HAVE_PREPARE_TO_WAIT_EVENT */ + +#ifndef HAVE_WAIT_VAR_EVENT + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *__var_waitqueue(void *p) +{ + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(wait_queue_entry_t *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, + int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), +#else + .task_list = LIST_HEAD_INIT(wbq_entry->wq_entry.task_list), +#endif + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} +#endif /* ! HAVE_WAIT_VAR_EVENT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c index f832a6fd02bce..08f5a1c1a5655 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,52 +46,21 @@ #include #include -#include +#include #include #define DEBUG_SUBSYSTEM S_LNET #include #include +#include #include #include "tracefile.h" -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *lnet_table_header; -#endif - -static DECLARE_RWSEM(ioctl_list_sem); -static LIST_HEAD(ioctl_list); - -int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) -{ - int rc = 0; - - down_write(&ioctl_list_sem); - if (!list_empty(&hand->item)) - rc = -EBUSY; - else - list_add_tail(&hand->item, &ioctl_list); - up_write(&ioctl_list_sem); - - return rc; -} -EXPORT_SYMBOL(libcfs_register_ioctl); - -int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) -{ - int rc = 0; - - down_write(&ioctl_list_sem); - if (list_empty(&hand->item)) - rc = -ENOENT; - else - list_del_init(&hand->item); - up_write(&ioctl_list_sem); +static struct dentry *lnet_debugfs_root; - return rc; -} -EXPORT_SYMBOL(libcfs_deregister_ioctl); +BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); +EXPORT_SYMBOL(libcfs_ioctl_list); int libcfs_ioctl(unsigned long cmd, void __user *uparam) { @@ -133,35 +102,27 @@ int libcfs_ioctl(unsigned long cmd, void __user *uparam) libcfs_debug_mark_buffer(data->ioc_inlbuf1); break; - default: { - struct libcfs_ioctl_handler *hand; - - err = -EINVAL; - down_read(&ioctl_list_sem); - list_for_each_entry(hand, &ioctl_list, item) { - err = hand->handle_ioctl(cmd, hdr); - if (err == -EINVAL) - continue; - - if (err == 0) { - if (copy_to_user(uparam, hdr, hdr->ioc_len)) - err = -EFAULT; - } - break; - } - up_read(&ioctl_list_sem); - break; } + default: + err = blocking_notifier_call_chain(&libcfs_ioctl_list, + cmd, hdr); + if (!(err & NOTIFY_STOP_MASK)) + /* No-one claimed the ioctl */ + err = -EINVAL; + else + err = notifier_to_errno(err); + if (copy_to_user(uparam, hdr, hdr->ioc_len) && !err) + err = -EFAULT; + break; } out: LIBCFS_FREE(hdr, hdr->ioc_len); RETURN(err); } -int -lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, loff_t pos, - void __user *buffer, int len)) +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)) { int rc = handler(data, write, *ppos, buffer, *lenp); @@ -219,9 +180,8 @@ static int __proc_dobitmasks(void *data, int write, return rc; } -static int -proc_dobitmasks(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_dobitmasks); @@ -239,9 +199,8 @@ static int __proc_dump_kernel(void *data, int write, return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); } -static int -proc_dump_kernel(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_dump_kernel); @@ -263,156 +222,133 @@ static int __proc_daemon_file(void *data, int write, return cfs_trace_daemon_command_usrstr(buffer, nob); } -static int -proc_daemon_file(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_daemon_file(struct ctl_table *table, 
int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_daemon_file); } -static int __proc_debug_mb(void *data, int write, - loff_t pos, void __user *buffer, int nob) +static int libcfs_force_lbug(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { - if (!write) { - char tmpstr[32]; - int len = snprintf(tmpstr, sizeof(tmpstr), "%d", - cfs_trace_get_debug_mb()); - - if (pos >= len) - return 0; - - return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, - "\n"); - } - - return cfs_trace_set_debug_mb_usrstr(buffer, nob); -} - -static int -proc_debug_mb(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_debug_mb); + if (write) + LBUG(); + return 0; } -static int -proc_console_max_delay_cs(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int rc, max_delay_cs; - struct ctl_table dummy = *table; - cfs_duration_t d; - - dummy.data = &max_delay_cs; - dummy.proc_handler = &proc_dointvec; + int rc; + long old_fail_loc = cfs_fail_loc; - if (!write) { /* read */ - max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - return rc; + if (!*lenp || *ppos) { + *lenp = 0; + return 0; } - /* write */ - max_delay_cs = 0; - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - if (rc < 0) - return rc; - if (max_delay_cs <= 0) - return -EINVAL; - - d = cfs_time_seconds(max_delay_cs) / 100; - if (d == 0 || d < libcfs_console_min_delay) - return -EINVAL; - libcfs_console_max_delay = d; - - return rc; -} - -static int -proc_console_min_delay_cs(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int rc, min_delay_cs; - struct ctl_table dummy = *table; - cfs_duration_t d; - - dummy.data = &min_delay_cs; - dummy.proc_handler = &proc_dointvec; + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); - if (!write) { /* read */ - min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - return rc; + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + rc = kstrtoul(kbuf, 0, &cfs_fail_loc); + kfree(kbuf); + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%lu\n", cfs_fail_loc); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } } - /* write */ - min_delay_cs = 0; - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - if (rc < 0) - return rc; - if (min_delay_cs <= 0) - return -EINVAL; - - d = cfs_time_seconds(min_delay_cs) / 100; - if (d == 0 || d > libcfs_console_max_delay) - return -EINVAL; - libcfs_console_min_delay = d; - + if (old_fail_loc != cfs_fail_loc) { + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } return rc; } -static int -proc_console_backoff(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int rc, backoff; - struct ctl_table dummy = *table; - - dummy.data = &backoff; - dummy.proc_handler = &proc_dointvec; + int rc; - if (!write) { /* read */ - backoff = libcfs_console_backoff; - rc = proc_dointvec(&dummy, 
write, buffer, lenp, ppos); - return rc; + if (!*lenp || *ppos) { + *lenp = 0; + return 0; } - /* write */ - backoff = 0; - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - if (rc < 0) - return rc; - - if (backoff <= 0) - return -EINVAL; - - libcfs_console_backoff = backoff; + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + int val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoint(kbuf, 0, &val); + kfree(kbuf); + if (!rc) { + if (table->extra1 && val < *(int *)table->extra1) + val = *(int *)table->extra1; + if (table->extra2 && val > *(int *)table->extra2) + val = *(int *)table->extra2; + *(int *)table->data = val; + } + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%u\n", *(int *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } return rc; } +EXPORT_SYMBOL(debugfs_doint); -static int -libcfs_force_lbug(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int debugfs_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write) - LBUG(); - return 0; -} - -static int -proc_fail_loc(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int rc; - long old_fail_loc = cfs_fail_loc; + int len = *lenp; + char *kbuf = table->data; - rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (old_fail_loc != cfs_fail_loc) - wake_up(&cfs_race_waitq); - return rc; + if (!len || *ppos) { + *lenp = 0; + return 0; + } + if (len > table->maxlen) + len = table->maxlen; + if (write) { + if (copy_from_user(kbuf, buffer, len)) + return -EFAULT; + memset(kbuf+len, 0, table->maxlen - len); + *ppos = *lenp; + } else { + len = strnlen(kbuf, len); + if (copy_to_user(buffer, kbuf, len)) + return -EFAULT; + if (len < *lenp) { + if (copy_to_user(buffer+len, "\n", 1)) + return -EFAULT; + len += 1; + } + *ppos += len; + *lenp -= len; + } + return len; } static int __proc_cpt_table(void *data, int write, @@ -456,9 +392,8 @@ static int __proc_cpt_table(void *data, int write, return rc; } -static int -proc_cpt_table(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_cpt_table); @@ -505,19 +440,14 @@ static int __proc_cpt_distance(void *data, int write, return rc; } -static int -proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_cpt_distance(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_cpt_distance); } static struct ctl_table lnet_table[] = { - /* - * NB No .strategy entries have been provided since sysctl(8) prefers - * to go via /proc for portability. 
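The new debugfs_doint() handler above parses the written value and clamps it to the optional extra1/extra2 bounds before storing it, which is how entries such as watchdog_ratelimit keep their min/max limits after the move away from proc_dointvec_minmax. Purely as an illustration (store_clamped_int() is a hypothetical name and strtol stands in for kstrtoint), the clamp-and-store pattern in isolation:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse 'buf' as an int and store it in *slot, clamped to the optional
 * [lo, hi] bounds -- the same policy debugfs_doint() applies with
 * table->extra1/extra2.  Returns 0 on success, -1 on a parse error. */
static int store_clamped_int(const char *buf, int *slot,
			     const int *lo, const int *hi)
{
	char *end;
	long val;

	val = strtol(buf, &end, 0);
	if (end == buf || val < INT_MIN || val > INT_MAX)
		return -1;

	if (lo && val < *lo)
		val = *lo;
	if (hi && val > *hi)
		val = *hi;

	*slot = (int)val;
	return 0;
}

int main(void)
{
	int ratelimit = 0;
	int min = 0, max = 300;

	store_clamped_int("1000", &ratelimit, &min, &max);
	printf("%d\n", ratelimit);	/* clamped to 300 */
	return 0;
}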
- */ { INIT_CTL_NAME .procname = "debug", @@ -542,43 +472,6 @@ static struct ctl_table lnet_table[] = { .mode = 0644, .proc_handler = &proc_dobitmasks, }, - { - INIT_CTL_NAME - .procname = "console_ratelimit", - .data = &libcfs_console_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "console_max_delay_centisecs", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_console_max_delay_cs - }, - { - INIT_CTL_NAME - .procname = "console_min_delay_centisecs", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_console_min_delay_cs - }, - { - INIT_CTL_NAME - .procname = "console_backoff", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_console_backoff - }, - { - INIT_CTL_NAME - .procname = "debug_path", - .data = libcfs_debug_file_path_arr, - .maxlen = sizeof(libcfs_debug_file_path_arr), - .mode = 0644, - .proc_handler = &proc_dostring, - }, { INIT_CTL_NAME .procname = "cpu_partition_table", @@ -599,7 +492,7 @@ static struct ctl_table lnet_table[] = { .data = lnet_debug_log_upcall, .maxlen = sizeof(lnet_debug_log_upcall), .mode = 0644, - .proc_handler = &proc_dostring, + .proc_handler = &debugfs_dostring, }, { INIT_CTL_NAME @@ -607,7 +500,7 @@ static struct ctl_table lnet_table[] = { .data = (int *)&libcfs_kmemory.counter, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &debugfs_doint, }, { INIT_CTL_NAME @@ -615,15 +508,7 @@ static struct ctl_table lnet_table[] = { .data = &libcfs_catastrophe, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - INIT_CTL_NAME - .procname = "panic_on_lbug", - .data = &libcfs_panic_on_lbug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &debugfs_doint, }, { INIT_CTL_NAME @@ -639,19 +524,13 @@ static struct ctl_table lnet_table[] = { .maxlen = 256, .proc_handler = &proc_daemon_file, }, - { - INIT_CTL_NAME - .procname = "debug_mb", - .mode = 0644, - .proc_handler = &proc_debug_mb, - }, { INIT_CTL_NAME .procname = "watchdog_ratelimit", .data = &libcfs_watchdog_ratelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &debugfs_doint, .extra1 = &min_watchdog_ratelimit, .extra2 = &max_watchdog_ratelimit, }, @@ -677,7 +556,7 @@ static struct ctl_table lnet_table[] = { .data = &cfs_fail_val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = &debugfs_doint }, { INIT_CTL_NAME @@ -685,55 +564,154 @@ static struct ctl_table lnet_table[] = { .data = &cfs_fail_err, .maxlen = sizeof(cfs_fail_err), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &debugfs_doint, }, { } }; -#ifdef CONFIG_SYSCTL -static struct ctl_table top_table[] = { - { - INIT_CTL_NAME - .procname = "lnet", - .mode = 0555, - .data = NULL, - .maxlen = 0, - .child = lnet_table, - }, - { .procname = NULL } +static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { + { .name = "console_ratelimit", + .target = "../../../module/libcfs/parameters/libcfs_console_ratelimit" }, + { .name = "debug_path", + .target = "../../../module/libcfs/parameters/libcfs_debug_file_path" }, + { .name = "panic_on_lbug", + .target = "../../../module/libcfs/parameters/libcfs_panic_on_lbug" }, + { .name = "console_backoff", + .target = "../../../module/libcfs/parameters/libcfs_console_backoff" }, + { .name = "debug_mb", + .target = "../../../module/libcfs/parameters/libcfs_debug_mb" }, + { 
.name = "console_min_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_min_delay" }, + { .name = "console_max_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_max_delay" }, + { .name = NULL }, }; -#endif -static int insert_proc(void) +static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header == NULL) - lnet_table_header = register_sysctl_table(top_table); -#endif - return 0; + struct ctl_table *table = filp->private_data; + ssize_t rc = -EINVAL; + + if (table) { + rc = table->proc_handler(table, 0, buf, &count, ppos); + if (!rc) + rc = count; + } + + return rc; +} + +static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + ssize_t rc = -EINVAL; + + if (table) { + rc = table->proc_handler(table, 1, (void __user *)buf, &count, + ppos); + if (!rc) + rc = count; + } + + return rc; +} + +static const struct file_operations lnet_debugfs_file_operations_rw = { + .open = simple_open, + .read = lnet_debugfs_read, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_ro = { + .open = simple_open, + .read = lnet_debugfs_read, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_wo = { + .open = simple_open, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations *lnet_debugfs_fops_select(umode_t mode) +{ + if (!(mode & S_IWUGO)) + return &lnet_debugfs_file_operations_ro; + + if (!(mode & S_IRUGO)) + return &lnet_debugfs_file_operations_wo; + + return &lnet_debugfs_file_operations_rw; } -static void remove_proc(void) +void lnet_insert_debugfs(struct ctl_table *table) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header != NULL) - unregister_sysctl_table(lnet_table_header); + if (!lnet_debugfs_root) + lnet_debugfs_root = debugfs_create_dir("lnet", NULL); - lnet_table_header = NULL; + /* Even if we cannot create, just ignore it altogether) */ + if (IS_ERR_OR_NULL(lnet_debugfs_root)) + return; + + /* We don't save the dentry returned in next two calls, because + * we don't call debugfs_remove() but rather remove_recursive() + */ + for (; table && table->procname; table++) + debugfs_create_file(table->procname, table->mode, + lnet_debugfs_root, table, + lnet_debugfs_fops_select(table->mode)); +} +EXPORT_SYMBOL_GPL(lnet_insert_debugfs); + +static void lnet_insert_debugfs_links( + const struct lnet_debugfs_symlink_def *symlinks) +{ + for (; symlinks && symlinks->name; symlinks++) + debugfs_create_symlink(symlinks->name, lnet_debugfs_root, + symlinks->target); +} + +void lnet_remove_debugfs(struct ctl_table *table) +{ +#ifndef HAVE_D_HASH_AND_LOOKUP + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + return; #endif + + for (; table && table->procname; table++) { + struct qstr dname = QSTR_INIT(table->procname, + strlen(table->procname)); + struct dentry *dentry; + + dentry = d_hash_and_lookup(lnet_debugfs_root, &dname); + debugfs_remove(dentry); + } } +EXPORT_SYMBOL_GPL(lnet_remove_debugfs); static int __init libcfs_init(void) { int rc; + +#ifndef HAVE_WAIT_VAR_EVENT + wait_bit_init(); +#endif init_libcfs_vfree_atomic(); + rc = libcfs_debug_init(5 * 1024 * 1024); if (rc < 0) { printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); return (rc); } + cfs_debug_init(); + 
rc = cfs_cpu_init(); if (rc != 0) goto cleanup_debug; @@ -765,17 +743,12 @@ static int __init libcfs_init(void) goto cleanup_wi; } - - rc = insert_proc(); - if (rc) { - CERROR("insert_proc: error %d\n", rc); - goto cleanup_crypto; - } + lnet_insert_debugfs(lnet_table); + if (!IS_ERR_OR_NULL(lnet_debugfs_root)) + lnet_insert_debugfs_links(lnet_debugfs_symlinks); CDEBUG (D_OTHER, "portals setup OK\n"); return 0; -cleanup_crypto: - cfs_crypto_unregister(); cleanup_wi: cfs_wi_shutdown(); cleanup_deregister: @@ -791,7 +764,11 @@ static void __exit libcfs_exit(void) { int rc; - remove_proc(); + /* Remove everthing */ + if (lnet_debugfs_root) { + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + } CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", atomic_read(&libcfs_kmemory)); @@ -816,6 +793,7 @@ static void __exit libcfs_exit(void) if (rc) printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", rc); + exit_libcfs_vfree_atomic(); } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c index ac762726fa5ce..f9d96d12f2555 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,8 +40,12 @@ #define LUSTRE_TRACEFILE_PRIVATE #include "tracefile.h" +#include +#include #include -#include +#include +#include +#include #include /* XXX move things up to the top, comment */ @@ -390,34 +394,34 @@ int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, return 1; } - if (cdls != NULL) { - if (libcfs_console_ratelimit && - cdls->cdls_next != 0 && /* not first time ever */ - !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { - /* skipping a console message */ - cdls->cdls_count++; - if (tcd != NULL) - cfs_trace_put_tcd(tcd); - return 1; - } + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + time_before(jiffies, cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } - if (cfs_time_after(cfs_time_current(), cdls->cdls_next + - libcfs_console_max_delay - + cfs_time_seconds(10))) { - /* last timeout was a long time ago */ - cdls->cdls_delay /= libcfs_console_backoff * 4; - } else { - cdls->cdls_delay *= libcfs_console_backoff; - } + if (time_after(jiffies, cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } if (cdls->cdls_delay < libcfs_console_min_delay) cdls->cdls_delay = libcfs_console_min_delay; else if (cdls->cdls_delay > libcfs_console_max_delay) cdls->cdls_delay = libcfs_console_max_delay; - /* ensure cdls_next is never zero after it's been seen */ - cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; - } + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; + } if (tcd != NULL) { cfs_print_to_console(&header, mask, string_buf, needed, file, @@ -737,12 +741,8 @@ int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, if (usr_buffer_nob > knl_buffer_nob) 
return -EOVERFLOW; -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_from_user(knl_buffer, usr_buffer, usr_buffer_nob)) return -EFAULT; -#else - memcpy(knl_buffer, usr_buffer, usr_buffer_nob); -#endif nob = strnlen(knl_buffer, usr_buffer_nob); while (nob-- >= 0) /* strip trailing whitespace */ @@ -771,20 +771,12 @@ int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, if (nob > usr_buffer_nob) nob = usr_buffer_nob; -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer, knl_buffer, nob)) return -EFAULT; -#else - memcpy(usr_buffer, knl_buffer, nob); -#endif if (append != NULL && nob < usr_buffer_nob) { -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer + nob, append, 1)) return -EFAULT; -#else - memcpy(usr_buffer + nob, append, 1); -#endif nob++; } @@ -841,13 +833,16 @@ int cfs_trace_daemon_command(char *str) cfs_tracefile_write_lock(); memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); - } else if (strncmp(str, "size=", 5) == 0) { - cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); - if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) - cfs_tracefile_size = CFS_TRACEFILE_SIZE; - else - cfs_tracefile_size <<= 20; + } else if (strncmp(str, "size=", 5) == 0) { + unsigned long tmp; + rc = kstrtoul(str + 5, 10, &tmp); + if (!rc) { + if (tmp < 10 || tmp > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size = tmp << 20; + } } else if (strlen(str) >= sizeof(cfs_tracefile)) { rc = -ENAMETOOLONG; } else if (str[0] != '/') { @@ -920,18 +915,6 @@ int cfs_trace_set_debug_mb(int mb) return 0; } -int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob) -{ - char str[32]; - int rc; - - rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); - if (rc < 0) - return rc; - - return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); -} - int cfs_trace_get_debug_mb(void) { int i; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h index 2f5dc4f272783..c6ca34d4fb08e 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,7 +82,6 @@ int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_daemon_command(char *str); int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_set_debug_mb(int mb); -int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_get_debug_mb(void); extern void libcfs_debug_dumplog_internal(void *arg); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c index c3d5556ab1557..f1676aa8f7a4d 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001, 2002 Cluster File Systems, Inc. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
* * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -35,7 +35,7 @@ #include #include -#include +#include struct ioc_dev { const char *dev_name; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c index 04a33bdef4c4c..246d420354217 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,8 +44,8 @@ #include #include -#include -#include +#include +#include #ifdef HAVE_NETDB_H # include #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c index 9facce6bfa975..18fe84dc53f6a 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c @@ -64,10 +64,10 @@ int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) { - char path[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}/," - "/proc/{fs,sys}/{lnet,lustre}/}"; + char topdir[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}," + "/proc/{fs,sys}/{lnet,lustre}}"; static bool test_mounted = false; - size_t len = strlen(path); + char path[PATH_MAX]; char buf[PATH_MAX]; struct statfs statfsbuf; va_list args; @@ -127,9 +127,9 @@ cfs_get_param_paths(glob_t *paths, const char *pattern, ...) errno = EINVAL; return -1; } - len += rc; - if (strlcat(path, buf, sizeof(path)) != len) { + if (snprintf(path, sizeof(path), "%s/%s", topdir, buf) >= + sizeof(path)) { errno = E2BIG; return -1; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c index 9afdaa07f8883..861f97a3c51e6 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001 Cluster File Systems, Inc. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
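The param.c hunk above now composes the glob pattern with a single snprintf() and maps a return value >= sizeof(path) to E2BIG, instead of tracking lengths by hand with strlcat(). That overflow check is easy to get wrong, so here it is on its own as a runnable sketch; build_param_path() is a hypothetical helper and the brace pattern is a shortened example string, not the full Lustre/LNet list:

#include <errno.h>
#include <stdio.h>

/* Join 'topdir' and 'name' into 'path'.  snprintf() returns the length the
 * full string would have needed, so a result >= the buffer size means the
 * output was truncated -- the case the patched cfs_get_param_paths() maps
 * to E2BIG. */
static int build_param_path(char *path, size_t size,
			    const char *topdir, const char *name)
{
	int len = snprintf(path, size, "%s/%s", topdir, name);

	if (len < 0 || (size_t)len >= size) {
		errno = E2BIG;
		return -1;
	}
	return 0;
}

int main(void)
{
	char path[64];

	if (build_param_path(path, sizeof(path),
			     "{/sys/fs,/proc/sys}/{lnet,lustre}", "debug") == 0)
		printf("%s\n", path);
	return 0;
}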
* * This file is part of Lustre, http://www.sf.net/projects/lustre/ * @@ -36,7 +36,7 @@ #include #include -#include +#include static command_t * top_level; /* Top level of commands, initialized by * InitParser */ @@ -768,40 +768,41 @@ int Parser_arg2int(const char *inp, long *result, int base) } /* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size (int *sizep, char *str) { - int size; - char mod[32]; +int Parser_size(unsigned long *sizep, char *str) +{ + unsigned long size; + char mod[32]; - switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { - default: - return (-1); + switch (sscanf(str, "%lu%1[gGmMkK]", &size, mod)) { + default: + return -1; - case 1: - *sizep = size; - return (0); + case 1: + *sizep = size; + return 0; - case 2: - switch (*mod) { - case 'g': - case 'G': - *sizep = size << 30; - return (0); - - case 'm': - case 'M': - *sizep = size << 20; - return (0); - - case 'k': - case 'K': - *sizep = size << 10; - return (0); - - default: - *sizep = size; - return (0); - } - } + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return 0; + + case 'm': + case 'M': + *sizep = size << 20; + return 0; + + case 'k': + case 'K': + *sizep = size << 10; + return 0; + + default: + *sizep = size; + return 0; + } + } } /* Convert a string boolean to an int; "enable" -> 1 */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c index 9078500020bb9..2c1a24cacebb2 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,46 +41,10 @@ #include #include #include +#include +#include #include -/* - * According manual of strlcpy() and strlcat() the functions should return - * the total length of the string they tried to create. For strlcpy() that - * means the length of src. For strlcat() that means the initial length of - * dst plus the length of src. So, the function strnlen() cannot be used - * otherwise the return value will be wrong. - */ -#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcpy(char *dst, const char *src, size_t size) -{ - size_t ret = strlen(src); - - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - memcpy(dst, src, len); - dst[len] = '\0'; - } - return ret; -} -#endif - -#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcat(char *dst, const char *src, size_t size) -{ - size_t dsize = strlen(dst); - size_t len = strlen(src); - size_t ret = dsize + len; - - dst += dsize; - size -= dsize; - if (len >= size) - len = size-1; - memcpy(dst, src, len); - dst[len] = '\0'; - return ret; -} -#endif - /** * Extracts tokens from strings. * @@ -480,3 +444,83 @@ cfs_expr_list_free_list(struct list_head *list) cfs_expr_list_free(el); } } + +/** + * cfs_abs_path() - Get the absolute path of a relative path + * @request_path: The relative path to be resolved + * @resolved_path: Set to the resolved absolute path + * + * Returns the canonicalized absolute pathname. This function is a wrapper to + * realpath, but will work even if the target file does not exist. All + * directories in the path must exist. 
+ * + * Return: On success, 0 is returned and resolved_path points to an allocated + * string containing the absolute pathname. On error, errno is set + * appropriately, -errno is returned, and resolved_path points to NULL. + */ +int cfs_abs_path(const char *request_path, char **resolved_path) +{ + char buf[PATH_MAX + 1] = ""; + char *path; + char *ptr; + int len; + int rc = 0; + const char *fmt; + + path = malloc(sizeof(buf)); + if (path == NULL) + return -ENOMEM; + + if (request_path[0] != '/') { + if (getcwd(path, sizeof(buf) - 1) == NULL) { + rc = -errno; + goto out; + } + len = snprintf(buf, sizeof(buf), "%s/%s", path, request_path); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + /* skip duplicate leading '/' */ + len = snprintf(buf, sizeof(buf), "%s", + request_path + strspn(request_path, "/") - 1); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + + /* if filename not in root directory, call realpath for parent path */ + ptr = strrchr(buf, '/'); + if (ptr != buf) { + *ptr = '\0'; + if (path != realpath(buf, path)) { + rc = -errno; + goto out; + } + /* add the filename back */ + len = strlen(path); + fmt = (path[len - 1] == '/') ? "%s" : "/%s"; + len = snprintf(path + len, sizeof(buf) - len, fmt, ptr + 1); + if (len >= sizeof(buf) - len) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + len = snprintf(path, sizeof(buf), "%s", buf); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + +out: + if (rc == 0) { + *resolved_path = path; + } else { + *resolved_path = NULL; + free(path); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c index f9e4de58b8ed2..dd451dd807bc1 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,6 +40,10 @@ #include #include "tracefile.h" +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + struct lc_watchdog { spinlock_t lcw_lock; /* check or change lcw_list */ int lcw_refcount; /* must hold lcw_pending_timers_lock */ @@ -331,6 +335,7 @@ static void lcw_dispatch_stop(void) wake_up(&lcw_event_waitq); wait_for_completion(&lcw_stop_completion); + clear_bit(LCW_FLAG_STOP, &lcw_flags); CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c index fb4fd643ee0c0..f370ffab81677 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -313,10 +313,9 @@ cfs_wi_sched_destroy(struct cfs_wi_sched *sched) int i = 2; while (sched->ws_nthreads > 0) { - CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET, - "waiting for %d threads of WI sched[%s] to " - "terminate\n", sched->ws_nthreads, - sched->ws_name); + CDEBUG(is_power_of_2(++i / 20) ? 
D_WARNING : D_NET, + "waiting %us for %d %s worker threads to exit\n", + i / 20, sched->ws_nthreads, sched->ws_name); spin_unlock(&cfs_wi_data.wi_glock); set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/drivers/staging/lustrefsx/lnet/include/cyaml.h b/drivers/staging/lustrefsx/lnet/include/cyaml.h index c9c21c750a45d..1537dbd19ed0c 100644 --- a/drivers/staging/lustrefsx/lnet/include/cyaml.h +++ b/drivers/staging/lustrefsx/lnet/include/cyaml.h @@ -18,7 +18,7 @@ * * LGPL HEADER END * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. * * Author: * Amir Shehata diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h index 84c6bd0039632..1ce4a0056829d 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/api.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2016, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ # error This include is only for kernel use. #endif -#include +#include /** \defgroup lnet_init_fini Initialization and cleanup * The LNet must be properly initialized before any LNet calls can be made. @@ -198,7 +198,8 @@ int LNetGet(lnet_nid_t self, struct lnet_process_id target_in, unsigned int portal_in, __u64 match_bits_in, - unsigned int offset_in); + unsigned int offset_in, + bool recovery); /** @} lnet_data */ @@ -210,6 +211,7 @@ int LNetSetLazyPortal(int portal); int LNetClearLazyPortal(int portal); int LNetCtl(unsigned int cmd, void *arg); void LNetDebugPeer(struct lnet_process_id id); +int LNetGetPeerDiscoveryStatus(void); /** @} lnet_misc */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h index c905eda43b5b8..9ed1856453610 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,11 +39,13 @@ #include -#include #include #include -#include #include +#include +#include +#include +#include extern struct lnet the_lnet; /* THE network */ @@ -69,6 +71,10 @@ extern struct lnet the_lnet; /* THE network */ /** exclusive lock */ #define LNET_LOCK_EX CFS_PERCPT_LOCK_EX +/* default timeout */ +#define DEFAULT_PEER_TIMEOUT 180 +#define LNET_LND_DEFAULT_TIMEOUT 5 + #ifdef HAVE_KERN_SOCK_GETNAME_2ARGS #define lnet_kernel_getpeername(sock, addr, addrlen) \ kernel_getpeername(sock, addr) @@ -389,10 +395,40 @@ lnet_handle2me(struct lnet_handle_me *handle) return lh_entry(lh, struct lnet_me, me_lh); } +static inline void +lnet_peer_net_addref_locked(struct lnet_peer_net *lpn) +{ + atomic_inc(&lpn->lpn_refcount); +} + +extern void lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn); + +static inline void +lnet_peer_net_decref_locked(struct lnet_peer_net *lpn) +{ + if (atomic_dec_and_test(&lpn->lpn_refcount)) + lnet_destroy_peer_net_locked(lpn); +} + +static inline void +lnet_peer_addref_locked(struct lnet_peer *lp) +{ + atomic_inc(&lp->lp_refcount); +} + +extern void lnet_destroy_peer_locked(struct lnet_peer *lp); + +static inline void +lnet_peer_decref_locked(struct lnet_peer *lp) +{ + if (atomic_dec_and_test(&lp->lp_refcount)) + lnet_destroy_peer_locked(lp); +} + static inline void lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) { - LASSERT (atomic_read(&lp->lpni_refcount) > 0); + LASSERT(atomic_read(&lp->lpni_refcount) > 0); atomic_inc(&lp->lpni_refcount); } @@ -401,9 +437,8 @@ extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp); static inline void lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) { - LASSERT (atomic_read(&lp->lpni_refcount) > 0); - atomic_dec(&lp->lpni_refcount); - if (atomic_read(&lp->lpni_refcount) == 0) + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + if (atomic_dec_and_test(&lp->lpni_refcount)) lnet_destroy_peer_ni_locked(lp); } @@ -465,6 +500,26 @@ lnet_msg_free(struct lnet_msg *msg) LIBCFS_FREE(msg, sizeof(*msg)); } +static inline struct lnet_rsp_tracker * +lnet_rspt_alloc(int cpt) +{ + struct lnet_rsp_tracker *rspt; + LIBCFS_ALLOC(rspt, sizeof(*rspt)); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc++; + lnet_net_unlock(cpt); + return rspt; +} + +static inline void +lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) +{ + LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc--; + lnet_net_unlock(cpt); +} + void lnet_ni_free(struct lnet_ni *ni); void lnet_net_free(struct lnet_net *net); @@ -502,19 +557,26 @@ extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); extern struct lnet_ni *lnet_net2ni_addref(__u32 net); -bool lnet_is_ni_healthy_locked(struct lnet_ni *ni); struct lnet_net *lnet_get_net_locked(__u32 net_id); int lnet_lib_init(void); void lnet_lib_exit(void); +extern unsigned lnet_transaction_timeout; +extern unsigned lnet_retry_count; extern unsigned int lnet_numa_range; +extern unsigned int lnet_health_sensitivity; +extern unsigned int lnet_recovery_interval; +extern unsigned int lnet_peer_discovery_disabled; +extern unsigned int lnet_drop_asym_route; extern int portal_rotor; +void lnet_mt_event_handler(struct lnet_event *event); + int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive, - cfs_time_t when); + time64_t when); void 
lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - cfs_time_t when); + time64_t when); int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, unsigned int priority); int lnet_check_routes(void); @@ -527,24 +589,15 @@ struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev); struct lnet_ni *lnet_get_ni_idx_locked(int idx); -struct libcfs_ioctl_handler { - struct list_head item; - int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); -}; - -#define DECLARE_IOCTL_HANDLER(ident, func) \ - static struct libcfs_ioctl_handler ident = { \ - .item = LIST_HEAD_INIT(ident.item), \ - .handle_ioctl = func \ - } - -extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); -extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, struct libcfs_ioctl_hdr __user *uparam); +extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep, + struct lnet_process_id __user *ids); +extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all); +extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni); -void lnet_proc_init(void); -void lnet_proc_fini(void); +void lnet_router_debugfs_init(void); +void lnet_router_debugfs_fini(void); int lnet_rtrpools_alloc(int im_a_router); void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); int lnet_rtrpools_adjust(int tiny, int small, int large); @@ -564,7 +617,6 @@ int lnet_islocalnet(__u32 net); void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, unsigned int offset, unsigned int mlen); -void lnet_msg_detach_md(struct lnet_msg *msg, int status); void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); void lnet_msg_commit(struct lnet_msg *msg, int cpt); @@ -575,11 +627,15 @@ void lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, unsigned int offset, unsigned int len); int lnet_send(lnet_nid_t nid, struct lnet_msg *msg, lnet_nid_t rtr_nid); +int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, + void *user_ptr, struct lnet_handle_eq eqh, bool recovery); void lnet_return_tx_credits_locked(struct lnet_msg *msg); void lnet_return_rx_credits_locked(struct lnet_msg *msg); void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); +struct list_head **lnet_create_array_of_queues(void); + /* portals functions */ /* portals attributes */ static inline int @@ -644,16 +700,22 @@ void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen); +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg); struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *get_msg); void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, unsigned int len); +void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); +void lnet_clean_zombie_rstqs(void); void lnet_finalize(struct lnet_msg *msg, int rc); +bool lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus); +void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, - unsigned int nob); + unsigned int nob, __u32 msg_type); 
void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); void lnet_recv_delayed_msg_list(struct list_head *head); @@ -662,6 +724,7 @@ void lnet_msg_container_cleanup(struct lnet_msg_container *container); void lnet_msg_containers_destroy(void); int lnet_msg_containers_create(void); +char *lnet_health_error2str(enum lnet_msg_hstatus hstatus); char *lnet_msgtyp2str(int type); void lnet_print_hdr(struct lnet_hdr *hdr); int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); @@ -672,7 +735,7 @@ int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); int lnet_fault_init(void); void lnet_fault_fini(void); -bool lnet_drop_rule_match(struct lnet_hdr *hdr); +bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus); int lnet_delay_rule_add(struct lnet_fault_attr *attr); int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); @@ -684,6 +747,7 @@ bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); /** @} lnet_fault_simulation */ +void lnet_counters_get_common(struct lnet_counters_common *common); void lnet_counters_get(struct lnet_counters *counters); void lnet_counters_reset(void); @@ -763,6 +827,7 @@ void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); struct page *lnet_kvaddr_to_page(unsigned long vaddr); int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); +unsigned int lnet_get_lnd_timeout(void); void lnet_register_lnd(struct lnet_lnd *lnd); void lnet_unregister_lnd(struct lnet_lnd *lnd); @@ -801,10 +866,45 @@ int lnet_sock_connect(struct socket **sockp, int *fatal, int lnet_peers_start_down(void); int lnet_peer_buffer_credits(struct lnet_net *net); -int lnet_router_checker_start(void); -void lnet_router_checker_stop(void); +int lnet_monitor_thr_start(void); +void lnet_monitor_thr_stop(void); + +bool lnet_router_checker_active(void); +void lnet_check_routers(void); +int lnet_router_pre_mt_start(void); +void lnet_router_post_mt_start(void); +void lnet_prune_rc_data(int wait_unlink); +void lnet_router_cleanup(void); void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net); -void lnet_swap_pinginfo(struct lnet_ping_info *info); +void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); + +int lnet_ping_info_validate(struct lnet_ping_info *pinfo); +struct lnet_ping_buffer *lnet_ping_buffer_alloc(int nnis, gfp_t gfp); +void lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf); + +static inline void lnet_ping_buffer_addref(struct lnet_ping_buffer *pbuf) +{ + atomic_inc(&pbuf->pb_refcnt); +} + +static inline void lnet_ping_buffer_decref(struct lnet_ping_buffer *pbuf) +{ + if (atomic_dec_and_test(&pbuf->pb_refcnt)) + lnet_ping_buffer_free(pbuf); +} + +static inline int lnet_ping_buffer_numref(struct lnet_ping_buffer *pbuf) +{ + return atomic_read(&pbuf->pb_refcnt); +} + +static inline int lnet_push_target_resize_needed(void) +{ + return the_lnet.ln_push_target->pb_nnis < the_lnet.ln_push_target_nnis; +} + +int lnet_push_target_resize(void); +void lnet_peer_push_event(struct lnet_event *ev); int lnet_parse_ip2nets(char **networksp, char *ip2nets); int lnet_parse_routes(char *route_str, int *im_a_router); @@ -819,94 +919,115 @@ __u32 lnet_get_dlc_seq_locked(void); struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, struct lnet_peer_ni *prev); -struct lnet_peer *lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt); -struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); +struct lnet_peer_ni 
*lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, + int cpt); struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); +struct lnet_peer *lnet_find_peer(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid); +int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); +int lnet_peer_discovery_start(void); +void lnet_peer_discovery_stop(void); +void lnet_push_update_to_peers(int force); void lnet_peer_tables_cleanup(struct lnet_net *net); void lnet_peer_uninit(void); int lnet_peer_tables_create(void); void lnet_debug_peer(lnet_nid_t nid); struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id); -bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, - struct lnet_ni *ni); -int lnet_add_peer_ni_to_peer(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); -int lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid); -int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, - bool *mr, - struct lnet_peer_ni_credit_info __user *peer_ni_info, - struct lnet_ioctl_element_stats __user *peer_ni_stats); +bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); +int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, char alivness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, __u32 *peer_tx_qnob); +int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats); - -static inline __u32 -lnet_get_num_peer_nis(struct lnet_peer *peer) +static inline struct lnet_peer_net * +lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id) { - struct lnet_peer_net *lpn; - struct lnet_peer_ni *lpni; - __u32 count = 0; + struct lnet_peer_net *peer_net; - list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_on_peer_list) - list_for_each_entry(lpni, &lpn->lpn_peer_nis, - lpni_on_peer_net_list) - count++; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } - return count; + return NULL; } -static inline bool -lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni) +static inline void +lnet_peer_set_alive(struct lnet_peer_ni *lp) { - return lpni->lpni_healthy; + lp->lpni_last_alive = ktime_get_seconds(); + lp->lpni_last_query = lp->lpni_last_alive; + if (!lp->lpni_alive) + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); } -static inline void -lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health) +static inline bool +lnet_peer_is_multi_rail(struct lnet_peer *lp) { - lpni->lpni_healthy = health; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + return true; + return false; } static inline bool -lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net) +lnet_peer_ni_is_configured(struct lnet_peer_ni *lpni) { - struct lnet_peer_ni *lpni; - - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_on_peer_net_list) { - if (lnet_is_peer_ni_healthy_locked(lpni)) - return true; - } - + if (lpni->lpni_peer_net->lpn_peer->lp_state & LNET_PEER_CONFIGURED) + return true; return false; } static 
inline bool -lnet_is_peer_healthy_locked(struct lnet_peer *peer) +lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) { - struct lnet_peer_net *peer_net; + return lpni->lpni_nid == lpni->lpni_peer_net->lpn_peer->lp_primary_nid; +} - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { - if (lnet_is_peer_net_healthy_locked(peer_net)) - return true; - } +bool lnet_peer_is_uptodate(struct lnet_peer *lp); +bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); +bool lnet_is_discovery_disabled(struct lnet_peer *lp); +static inline bool +lnet_peer_needs_push(struct lnet_peer *lp) +{ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) + return false; + if (lp->lp_state & LNET_PEER_FORCE_PUSH) + return true; + if (lp->lp_state & LNET_PEER_NO_DISCOVERY) + return false; + /* if discovery is not enabled then no need to push */ + if (lnet_peer_discovery_disabled) + return false; + if (lp->lp_node_seqno < atomic_read(&the_lnet.ln_ping_target_seqno)) + return true; return false; } static inline void -lnet_peer_set_alive(struct lnet_peer_ni *lp) +lnet_inc_healthv(atomic_t *healthv) { - lp->lpni_last_alive = lp->lpni_last_query = cfs_time_current(); - if (!lp->lpni_alive) - lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); + atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE); } +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type); + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type); + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats); + #endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h index 9b8af0e45a4c8..496a1b0fe0f93 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,26 +44,56 @@ #include #include +#include #include -#include +#include +#include /* Max payload size */ -#ifndef CONFIG_LNET_MAX_PAYLOAD -# error "CONFIG_LNET_MAX_PAYLOAD must be defined in config.h" -#endif +#define LNET_MAX_PAYLOAD LNET_MTU -#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD -#if (LNET_MAX_PAYLOAD < LNET_MTU) -# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" -#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) -# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" -#endif +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. 
+ */ +#define LNET_MAX_HEALTH_VALUE 1000 /* forward refs */ struct lnet_libmd; -typedef struct lnet_msg { +enum lnet_msg_hstatus { + LNET_MSG_STATUS_OK = 0, + LNET_MSG_STATUS_LOCAL_INTERRUPT, + LNET_MSG_STATUS_LOCAL_DROPPED, + LNET_MSG_STATUS_LOCAL_ABORTED, + LNET_MSG_STATUS_LOCAL_NO_ROUTE, + LNET_MSG_STATUS_LOCAL_ERROR, + LNET_MSG_STATUS_LOCAL_TIMEOUT, + LNET_MSG_STATUS_REMOTE_ERROR, + LNET_MSG_STATUS_REMOTE_DROPPED, + LNET_MSG_STATUS_REMOTE_TIMEOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* nid of next hop */ + lnet_nid_t rspt_next_hop_nid; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; +}; + +struct lnet_msg { struct list_head msg_activelist; struct list_head msg_list; /* Q for credits/MD */ @@ -74,6 +104,28 @@ typedef struct lnet_msg { lnet_nid_t msg_from; __u32 msg_type; + /* + * hold parameters in case message is with held due + * to discovery + */ + lnet_nid_t msg_src_nid_param; + lnet_nid_t msg_rtr_nid_param; + + /* + * Deadline for the message after which it will be finalized if it + * has not completed. + */ + ktime_t msg_deadline; + + /* The message health status. */ + enum lnet_msg_hstatus msg_health_status; + /* This is a recovery message */ + bool msg_recovery; + /* the number of times a transmission has been retried */ + int msg_retry_count; + /* flag to indicate that we do not want to resend this message */ + bool msg_no_resend; + /* committed for sending */ unsigned int msg_tx_committed:1; /* CPT # this message committed for sending */ @@ -120,17 +172,17 @@ typedef struct lnet_msg { struct lnet_event msg_ev; struct lnet_hdr msg_hdr; -} lnet_msg_t; +}; -typedef struct lnet_libhandle { +struct lnet_libhandle { struct list_head lh_hash_chain; __u64 lh_cookie; -} lnet_libhandle_t; +}; #define lh_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) -typedef struct lnet_eq { +struct lnet_eq { struct list_head eq_list; struct lnet_libhandle eq_lh; unsigned long eq_enq_seq; @@ -139,9 +191,9 @@ typedef struct lnet_eq { lnet_eq_handler_t eq_callback; struct lnet_event *eq_events; int **eq_refs; /* percpt refcount for EQ */ -} lnet_eq_t; +}; -typedef struct lnet_me { +struct lnet_me { struct list_head me_list; struct lnet_libhandle me_lh; struct lnet_process_id me_match_id; @@ -151,40 +203,41 @@ typedef struct lnet_me { __u64 me_ignore_bits; enum lnet_unlink me_unlink; struct lnet_libmd *me_md; -} lnet_me_t; - -typedef struct lnet_libmd { - struct list_head md_list; - struct lnet_libhandle md_lh; - struct lnet_me *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - unsigned int md_niov; /* # frags at end of struct */ - void *md_user_ptr; - struct lnet_eq *md_eq; - struct lnet_handle_md md_bulk_handle; +}; + +struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_rsp_tracker *md_rspt_ptr; + struct lnet_eq *md_eq; + struct lnet_handle_md md_bulk_handle; union { - struct 
kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; } md_iov; -} lnet_libmd_t; +}; #define LNET_MD_FLAG_ZOMBIE (1 << 0) #define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) #define LNET_MD_FLAG_ABORTED (1 << 2) -typedef struct lnet_test_peer { +struct lnet_test_peer { /* info about peers we are trying to fail */ struct list_head tp_list; /* ln_test_peers */ lnet_nid_t tp_nid; /* matching nid */ unsigned int tp_threshold; /* # failures to simulate */ -} lnet_test_peer_t; +}; #define LNET_COOKIE_TYPE_MD 1 #define LNET_COOKIE_TYPE_ME 2 @@ -195,7 +248,7 @@ typedef struct lnet_test_peer { struct lnet_ni; /* forward ref */ struct socket; -typedef struct lnet_lnd { +struct lnet_lnd { /* fields managed by portals */ struct list_head lnd_list; /* stash in the LND table */ int lnd_refcount; /* # active instances */ @@ -249,17 +302,11 @@ typedef struct lnet_lnd { void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); /* query of peer aliveness */ - void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time64_t *when); /* accept a new connection */ int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); -} lnd_t; - -typedef struct lnet_ni_status { - lnet_nid_t ns_nid; - __u32 ns_status; - __u32 ns_unused; -} WIRE_ATTR lnet_ni_status_t; +}; struct lnet_tx_queue { int tq_credits; /* # tx credits free */ @@ -280,22 +327,51 @@ enum lnet_net_state { }; enum lnet_ni_state { - /* set when NI block is allocated */ + /* initial state when NI is created */ LNET_NI_STATE_INIT = 0, - /* set when NI is started successfully */ + /* set when NI is brought up */ LNET_NI_STATE_ACTIVE, - /* set when LND notifies NI failed */ - LNET_NI_STATE_FAILED, - /* set when LND notifies NI degraded */ - LNET_NI_STATE_DEGRADED, - /* set when shuttding down NI */ - LNET_NI_STATE_DELETING + /* set when NI is being shutdown */ + LNET_NI_STATE_DELETING, +}; + +#define LNET_NI_RECOVERY_PENDING BIT(0) +#define LNET_NI_RECOVERY_FAILED BIT(1) + +enum lnet_stats_type { + LNET_STATS_TYPE_SEND = 0, + LNET_STATS_TYPE_RECV, + LNET_STATS_TYPE_DROP +}; + +struct lnet_comm_count { + atomic_t co_get_count; + atomic_t co_put_count; + atomic_t co_reply_count; + atomic_t co_ack_count; + atomic_t co_hello_count; }; struct lnet_element_stats { - atomic_t send_count; - atomic_t recv_count; - atomic_t drop_count; + struct lnet_comm_count el_send_stats; + struct lnet_comm_count el_recv_stats; + struct lnet_comm_count el_drop_stats; +}; + +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; }; struct lnet_net { @@ -342,12 +418,15 @@ struct lnet_net { enum lnet_net_state net_state; }; -typedef struct lnet_ni { +struct lnet_ni { /* chain on the lnet_net structure */ struct list_head ni_netlist; - /* chain on net_ni_cpt */ - struct list_head ni_cptlist; + /* chain on the recovery queue */ + struct list_head ni_recovery; + + /* MD handle for recovery ping */ + struct lnet_handle_md ni_ping_mdh; spinlock_t ni_lock; @@ -373,7 +452,7 @@ typedef struct lnet_ni { int **ni_refs; /* when I was last alive */ - long ni_last_alive; + time64_t ni_last_alive; /* pointer to parent network */ struct 
lnet_net *ni_net; @@ -381,9 +460,12 @@ typedef struct lnet_ni { /* my health status */ struct lnet_ni_status *ni_status; - /* NI FSM */ + /* NI FSM. Protected by lnet_ni_lock() */ enum lnet_ni_state ni_state; + /* Recovery state. Protected by lnet_ni_lock() */ + __u32 ni_recovery_state; + /* per NI LND tunables */ struct lnet_lnd_tunables ni_lnd_tunables; @@ -392,6 +474,7 @@ typedef struct lnet_ni { /* NI statistics */ struct lnet_element_stats ni_stats; + struct lnet_health_local_stats ni_hstats; /* physical device CPT */ int ni_dev_cpt; @@ -399,50 +482,69 @@ typedef struct lnet_ni { /* sequence number used to round robin over nis within a net */ __u32 ni_seq; + /* + * health value + * initialized to LNET_MAX_HEALTH_VALUE + * Value is decremented every time we fail to send a message over + * this NI because of a NI specific failure. + * Value is incremented if we successfully send a message. + */ + atomic_t ni_healthv; + + /* + * Set to 1 by the LND when it receives an event telling it the device + * has gone into a fatal state. Set to 0 when the LND receives an + * even telling it the device is back online. + */ + atomic_t ni_fatal_error_on; + /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured */ - char *ni_interfaces[LNET_NUM_INTERFACES]; + char *ni_interfaces[LNET_INTERFACES_NUM]; struct net *ni_net_ns; /* original net namespace */ -} lnet_ni_t; +}; #define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL -/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x - * of old LNet, so there shouldn't be any compatibility issue */ -#define LNET_PING_FEAT_INVAL (0) /* no feature */ -#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ -#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ -#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +/* + * Descriptor of a ping info buffer: keep a separate indicator of the + * size and a reference count. The type is used both as a source and + * sink of data, so we need to keep some information outside of the + * area that may be overwritten by network data. 
+ */ +struct lnet_ping_buffer { + int pb_nnis; + atomic_t pb_refcnt; + struct lnet_ping_info pb_info; +}; -#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ - LNET_PING_FEAT_NI_STATUS) +#define LNET_PING_BUFFER_SIZE(NNIDS) \ + offsetof(struct lnet_ping_buffer, pb_info.pi_ni[NNIDS]) +#define LNET_PING_BUFFER_LONI(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_nid) +#define LNET_PING_BUFFER_SEQNO(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_status) -typedef struct lnet_ping_info { - __u32 pi_magic; - __u32 pi_features; - lnet_pid_t pi_pid; - __u32 pi_nnis; - struct lnet_ni_status pi_ni[0]; -} WIRE_ATTR lnet_ping_info_t; +#define LNET_PING_INFO_TO_BUFFER(PINFO) \ + container_of((PINFO), struct lnet_ping_buffer, pb_info) /* router checker data, per router */ -#define LNET_MAX_RTR_NIS 16 -#define LNET_PINGINFO_SIZE offsetof(struct lnet_ping_info, pi_ni[LNET_MAX_RTR_NIS]) -typedef struct lnet_rc_data { +struct lnet_rc_data { /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ struct list_head rcd_list; struct lnet_handle_md rcd_mdh; /* ping buffer MD */ struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ - struct lnet_ping_info *rcd_pinginfo; /* ping buffer */ -} lnet_rc_data_t; + struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */ + int rcd_nnis; /* desired size of buffer */ +}; struct lnet_peer_ni { - /* chain on peer_net */ - struct list_head lpni_on_peer_net_list; + /* chain on lpn_peer_nis */ + struct list_head lpni_peer_nis; /* chain on remote peer list */ struct list_head lpni_on_remote_peer_ni_list; + /* chain on recovery queue */ + struct list_head lpni_recovery; /* chain on peer hash */ struct list_head lpni_hashlist; /* messages blocking for tx credits */ @@ -455,6 +557,7 @@ struct lnet_peer_ni { struct lnet_peer_net *lpni_peer_net; /* statistics kept on each peer NI */ struct lnet_element_stats lpni_stats; + struct lnet_health_remote_stats lpni_hstats; /* spin lock protecting credits and lpni_txq / lpni_rtrq */ spinlock_t lpni_lock; /* # tx credits available */ @@ -480,23 +583,29 @@ struct lnet_peer_ni { /* # times router went dead<->alive. 
Protected with lpni_lock */ int lpni_alive_count; /* time of last aliveness news */ - cfs_time_t lpni_timestamp; + time64_t lpni_timestamp; /* time of last ping attempt */ - cfs_time_t lpni_ping_timestamp; + time64_t lpni_ping_timestamp; /* != 0 if ping reply expected */ - cfs_time_t lpni_ping_deadline; + time64_t lpni_ping_deadline; /* when I was last alive */ - cfs_time_t lpni_last_alive; + time64_t lpni_last_alive; /* when lpni_ni was queried last time */ - cfs_time_t lpni_last_query; + time64_t lpni_last_query; /* network peer is on */ struct lnet_net *lpni_net; /* peer's NID */ lnet_nid_t lpni_nid; /* # refs */ atomic_t lpni_refcount; + /* health value for the peer */ + atomic_t lpni_healthv; + /* recovery ping mdh */ + struct lnet_handle_md lpni_recovery_ping_mdh; /* CPT this peer attached on */ int lpni_cpt; + /* state flags -- protected by lpni_lock */ + unsigned lpni_state; /* # refs from lnet_route_t::lr_gateway */ int lpni_rtr_refcount; /* sequence number used to round robin over peer nis within a net */ @@ -509,31 +618,148 @@ struct lnet_peer_ni { unsigned int lpni_ping_feats; /* routes on this peer */ struct list_head lpni_routes; - /* array of preferred local nids */ - lnet_nid_t *lpni_pref_nids; + /* preferred local nids: if only one, use lpni_pref.nid */ + union lpni_pref { + lnet_nid_t nid; + lnet_nid_t *nids; + } lpni_pref; /* number of preferred NIDs in lnpi_pref_nids */ __u32 lpni_pref_nnids; /* router checker state */ struct lnet_rc_data *lpni_rcd; }; +/* Preferred path added due to traffic on non-MR peer_ni */ +#define LNET_PEER_NI_NON_MR_PREF (1 << 0) +/* peer is being recovered. */ +#define LNET_PEER_NI_RECOVERY_PENDING (1 << 1) +/* recovery ping failed */ +#define LNET_PEER_NI_RECOVERY_FAILED (1 << 2) +/* peer is being deleted */ +#define LNET_PEER_NI_DELETING (1 << 3) + struct lnet_peer { - /* chain on global peer list */ - struct list_head lp_on_lnet_peer_list; + /* chain on pt_peer_list */ + struct list_head lp_peer_list; /* list of peer nets */ struct list_head lp_peer_nets; + /* list of messages pending discovery*/ + struct list_head lp_dc_pendq; + /* primary NID of the peer */ lnet_nid_t lp_primary_nid; - /* peer is Multi-Rail enabled peer */ - bool lp_multi_rail; + /* source NID to use during discovery */ + lnet_nid_t lp_disc_src_nid; + + /* CPT of peer_table */ + int lp_cpt; + + /* number of NIDs on this peer */ + int lp_nnis; + + /* reference count */ + atomic_t lp_refcount; + + /* lock protecting peer state flags */ + spinlock_t lp_lock; + + /* peer state flags */ + unsigned lp_state; + + /* buffer for data pushed by peer */ + struct lnet_ping_buffer *lp_data; + + /* MD handle for ping in progress */ + struct lnet_handle_md lp_ping_mdh; + + /* MD handle for push in progress */ + struct lnet_handle_md lp_push_mdh; + + /* number of NIDs for sizing push data */ + int lp_data_nnis; + + /* NI config sequence number of peer */ + __u32 lp_peer_seqno; + + /* Local NI config sequence number acked by peer */ + __u32 lp_node_seqno; + + /* Local NI config sequence number sent to peer */ + __u32 lp_node_seqno_sent; + + /* Ping error encountered during discovery. */ + int lp_ping_error; + + /* Push error encountered during discovery. */ + int lp_push_error; + + /* Error encountered during discovery. 
*/ + int lp_dc_error; + + /* time it was put on the ln_dc_working queue */ + time64_t lp_last_queued; + + /* link on discovery-related lists */ + struct list_head lp_dc_list; + + /* tasks waiting on discovery of this peer */ + wait_queue_head_t lp_dc_waitq; }; +/* + * The status flags in lp_state. Their semantics have chosen so that + * lp_state can be zero-initialized. + * + * A peer is marked MULTI_RAIL in two cases: it was configured using DLC + * as multi-rail aware, or the LNET_PING_FEAT_MULTI_RAIL bit was set. + * + * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was + * NOT set when the peer was pinged by discovery. + */ +#define LNET_PEER_MULTI_RAIL (1 << 0) /* Multi-rail aware */ +#define LNET_PEER_NO_DISCOVERY (1 << 1) /* Peer disabled discovery */ +/* + * A peer is marked CONFIGURED if it was configured by DLC. + * + * In addition, a peer is marked DISCOVERED if it has fully passed + * through Peer Discovery. + * + * When Peer Discovery is disabled, the discovery thread will mark + * peers REDISCOVER to indicate that they should be re-examined if + * discovery is (re)enabled on the node. + * + * A peer that was created as the result of inbound traffic will not + * be marked at all. + */ +#define LNET_PEER_CONFIGURED (1 << 2) /* Configured via DLC */ +#define LNET_PEER_DISCOVERED (1 << 3) /* Peer was discovered */ +#define LNET_PEER_REDISCOVER (1 << 4) /* Discovery was disabled */ +/* + * A peer is marked DISCOVERING when discovery is in progress. + * The other flags below correspond to stages of discovery. + */ +#define LNET_PEER_DISCOVERING (1 << 5) /* Discovering */ +#define LNET_PEER_DATA_PRESENT (1 << 6) /* Remote peer data present */ +#define LNET_PEER_NIDS_UPTODATE (1 << 7) /* Remote peer info uptodate */ +#define LNET_PEER_PING_SENT (1 << 8) /* Waiting for REPLY to Ping */ +#define LNET_PEER_PUSH_SENT (1 << 9) /* Waiting for ACK of Push */ +#define LNET_PEER_PING_FAILED (1 << 10) /* Ping send failure */ +#define LNET_PEER_PUSH_FAILED (1 << 11) /* Push send failure */ +/* + * A ping can be forced as a way to fix up state, or as a manual + * intervention by an admin. + * A push can be forced in circumstances that would normally not + * allow for one to happen. + */ +#define LNET_PEER_FORCE_PING (1 << 12) /* Forced Ping */ +#define LNET_PEER_FORCE_PUSH (1 << 13) /* Forced Push */ + struct lnet_peer_net { - /* chain on peer block */ - struct list_head lpn_on_peer_list; + /* chain on lp_peer_nets */ + struct list_head lpn_peer_nets; /* list of peer_nis on this network */ struct list_head lpn_peer_nis; @@ -543,19 +769,38 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; + + /* reference count */ + atomic_t lpn_refcount; }; /* peer hash size */ #define LNET_PEER_HASH_BITS 9 #define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) -/* peer hash table */ +/* + * peer hash table - one per CPT + * + * protected by lnet_net_lock/EX for update + * pt_version + * pt_number + * pt_hash[...] 
+ * pt_peer_list + * pt_peers + * protected by pt_zombie_lock: + * pt_zombie_list + * pt_zombies + * + * pt_zombie lock nests inside lnet_net_lock + */ struct lnet_peer_table { int pt_version; /* /proc validity stamp */ - atomic_t pt_number; /* # peers extant */ + int pt_number; /* # peers_ni extant */ struct list_head *pt_hash; /* NID->peer hash */ - struct list_head pt_zombie_list; /* zombie peers */ - int pt_zombies; /* # zombie peers */ + struct list_head pt_peer_list; /* peers */ + int pt_peers; /* # peers */ + struct list_head pt_zombie_list; /* zombie peer_ni */ + int pt_zombies; /* # zombie peers_ni */ spinlock_t pt_zombie_lock; /* protect list and count */ }; @@ -566,7 +811,7 @@ struct lnet_peer_table { ((lp)->lpni_net) && \ (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) -typedef struct lnet_route { +struct lnet_route { struct list_head lr_list; /* chain on net */ struct list_head lr_gwlist; /* chain on gateway */ struct lnet_peer_ni *lr_gateway; /* router node */ @@ -575,27 +820,29 @@ typedef struct lnet_route { unsigned int lr_downis; /* number of down NIs */ __u32 lr_hops; /* how far I am */ unsigned int lr_priority; /* route priority */ -} lnet_route_t; +}; #define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) #define LNET_REMOTE_NETS_HASH_MAX (1U << 16) #define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) -typedef struct lnet_remotenet { +struct lnet_remotenet { /* chain on ln_remote_nets_hash */ struct list_head lrn_list; /* routes to me */ struct list_head lrn_routes; /* my net number */ __u32 lrn_net; -} lnet_remotenet_t; +}; /** lnet message has credit and can be submitted to lnd for send/receive */ #define LNET_CREDIT_OK 0 /** lnet message is waiting for credit */ #define LNET_CREDIT_WAIT 1 +/** lnet message is waiting for discovery */ +#define LNET_DC_WAIT 2 -typedef struct lnet_rtrbufpool { +struct lnet_rtrbufpool { /* my free buffer pool */ struct list_head rbp_bufs; /* messages blocking for a buffer */ @@ -610,13 +857,13 @@ typedef struct lnet_rtrbufpool { int rbp_credits; /* low water mark */ int rbp_mincredits; -} lnet_rtrbufpool_t; +}; -typedef struct lnet_rtrbuf { +struct lnet_rtrbuf { struct list_head rb_list; /* chain on rbp_bufs */ struct lnet_rtrbufpool *rb_pool; /* owning pool */ lnet_kiov_t rb_kiov[0]; /* the buffer space */ -} lnet_rtrbuf_t; +}; #define LNET_PEER_HASHSIZE 503 /* prime! */ @@ -686,7 +933,7 @@ struct lnet_match_table { /* dispatch routed PUT message by hashing source NID for wildcard portals */ #define LNET_PTL_ROTOR_HASH_RT 3 -typedef struct lnet_portal { +struct lnet_portal { spinlock_t ptl_lock; unsigned int ptl_index; /* portal ID, reserved */ /* flags on this portal: lazy, unique... 
*/ @@ -703,7 +950,7 @@ typedef struct lnet_portal { int ptl_mt_nmaps; /* array of active entries' cpu-partition-id */ int ptl_mt_maps[0]; -} lnet_portal_t; +}; #define LNET_LH_HASH_BITS 12 #define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) @@ -724,22 +971,31 @@ struct lnet_msg_container { int msc_nfinalizers; /* msgs waiting to complete finalizing */ struct list_head msc_finalizing; + /* msgs waiting to be resent */ + struct list_head msc_resending; struct list_head msc_active; /* active message list */ /* threads doing finalization */ void **msc_finalizers; + /* threads doing resends */ + void **msc_resenders; }; +/* Peer Discovery states */ +#define LNET_DC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_DC_STATE_RUNNING 1 /* started up OK */ +#define LNET_DC_STATE_STOPPING 2 /* telling thread to stop */ + /* Router Checker states */ -#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ -#define LNET_RC_STATE_RUNNING 1 /* started up OK */ -#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ +#define LNET_MT_STATE_SHUTDOWN 0 /* not started */ +#define LNET_MT_STATE_RUNNING 1 /* started up OK */ +#define LNET_MT_STATE_STOPPING 2 /* telling thread to stop */ /* LNet states */ #define LNET_STATE_SHUTDOWN 0 /* not started */ #define LNET_STATE_RUNNING 1 /* started up OK */ #define LNET_STATE_STOPPING 2 /* telling thread to stop */ -typedef struct lnet { +struct lnet { /* CPU partition table of LNet */ struct cfs_cpt_table *ln_cpt_table; /* number of CPTs in ln_cpt_table */ @@ -770,8 +1026,6 @@ typedef struct lnet { struct lnet_msg_container **ln_msg_containers; struct lnet_counters **ln_counters; struct lnet_peer_table **ln_peer_tables; - /* list of configured or discovered peers */ - struct list_head ln_peers; /* list of peer nis not on a local network */ struct list_head ln_remote_peer_ni_list; /* failure simulation */ @@ -784,6 +1038,10 @@ typedef struct lnet { struct lnet_ni *ln_loni; /* network zombie list */ struct list_head ln_net_zombie; + /* resend messages list */ + struct list_head ln_msg_resend; + /* spin lock to protect the msg resend list */ + spinlock_t ln_msg_resend_lock; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; @@ -796,12 +1054,46 @@ typedef struct lnet { /* percpt router buffer pools */ struct lnet_rtrbufpool **ln_rtrpools; + /* + * Ping target / Push source + * + * The ping target and push source share a single buffer. The + * ln_ping_target is protected against concurrent updates by + * ln_api_mutex. + */ struct lnet_handle_md ln_ping_target_md; struct lnet_handle_eq ln_ping_target_eq; - struct lnet_ping_info *ln_ping_info; + struct lnet_ping_buffer *ln_ping_target; + atomic_t ln_ping_target_seqno; - /* router checker startup/shutdown state */ - int ln_rc_state; + /* + * Push Target + * + * ln_push_nnis contains the desired size of the push target. + * The lnet_net_lock is used to handle update races. The old + * buffer may linger a while after it has been unlinked, in + * which case the event handler cleans up. 
+ */ + struct lnet_handle_eq ln_push_target_eq; + struct lnet_handle_md ln_push_target_md; + struct lnet_ping_buffer *ln_push_target; + int ln_push_target_nnis; + + /* discovery event queue handle */ + struct lnet_handle_eq ln_dc_eqh; + /* discovery requests */ + struct list_head ln_dc_request; + /* discovery working list */ + struct list_head ln_dc_working; + /* discovery expired list */ + struct list_head ln_dc_expired; + /* discovery thread wait queue */ + wait_queue_head_t ln_dc_waitq; + /* discovery startup/shutdown state */ + int ln_dc_state; + + /* monitor thread startup/shutdown state */ + int ln_mt_state; /* router checker's event queue */ struct lnet_handle_eq ln_rc_eqh; /* rcd still pending on net */ @@ -809,7 +1101,7 @@ typedef struct lnet { /* rcd ready for free */ struct list_head ln_rcd_zombie; /* serialise startup/shutdown */ - struct semaphore ln_rc_signal; + struct semaphore ln_mt_signal; struct mutex ln_api_mutex; struct mutex ln_lnd_mutex; @@ -837,10 +1129,36 @@ typedef struct lnet { */ bool ln_nis_from_mod_params; - /* waitq for router checker. As long as there are no routes in - * the list, the router checker will sleep on this queue. when - * routes are added the thread will wake up */ - wait_queue_head_t ln_rc_waitq; -} lnet_t; + /* + * waitq for the monitor thread. The monitor thread takes care of + * checking routes, timedout messages and resending messages. + */ + wait_queue_head_t ln_mt_waitq; + + /* per-cpt resend queues */ + struct list_head **ln_mt_resendqs; + /* local NIs to recover */ + struct list_head ln_mt_localNIRecovq; + /* local NIs to recover */ + struct list_head ln_mt_peerNIRecovq; + /* + * An array of queues for GET/PUT waiting for REPLY/ACK respectively. + * There are CPT number of queues. Since response trackers will be + * added on the fast path we can't afford to grab the exclusive + * net lock to protect these queues. The CPT will be calculated + * based on the mdh cookie. + */ + struct list_head **ln_mt_rstq; + /* + * A response tracker becomes a zombie when the associated MD is queued + * for unlink before the response tracker is detached from the MD. An + * entry on a zombie list can be freed when either the remaining + * operations on the MD complete or when LNet has shut down. + */ + struct list_head **ln_mt_zombie_rstqs; + /* recovery eq handler */ + struct lnet_handle_eq ln_mt_eqh; + +}; #endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h index 843d35c06105a..e2c19f2a4ed35 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -28,22 +28,12 @@ * Lustre is a trademark of Sun Microsystems, Inc. 
* * lnet/include/lnet/socklnd.h - * - * #defines shared between socknal implementation and utilities */ #ifndef __LNET_LNET_SOCKLND_H__ #define __LNET_LNET_SOCKLND_H__ -#include - -#define SOCKLND_CONN_NONE (-1) -#define SOCKLND_CONN_ANY 0 -#define SOCKLND_CONN_CONTROL 1 -#define SOCKLND_CONN_BULK_IN 2 -#define SOCKLND_CONN_BULK_OUT 3 -#define SOCKLND_CONN_NTYPES 4 - -#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN +#include +#include struct ksock_hello_msg { __u32 kshm_magic; /* magic number of socklnd message */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h new file mode 100644 index 0000000000000..2672fe7ae103d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h @@ -0,0 +1,151 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __UAPI_LIBCFS_DEBUG_H__ +#define __UAPI_LIBCFS_DEBUG_H__ + +#include + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +#define S_LIBCFS 0x00004000 +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +#define S_LFSCK 0x00100000 +#define S_SNAPSHOT 0x00200000 +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ + "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ + "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ + "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +#define D_HSM 0x20000000 +#define D_SNAPSHOT 0x40000000 /* snapshot */ +#define D_LAYOUT 0x80000000 + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h similarity index 88% rename from drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h index 6b79096f761a0..cdac10f572408 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h @@ -23,21 +23,19 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * libcfs/include/libcfs/libcfs_ioctl.h - * * Low-level ioctl data structures. Kernel ioctl functions declared here, * and user space functions are in libcfs/util/ioctl.h. 
* */ -#ifndef __LIBCFS_IOCTL_H__ -#define __LIBCFS_IOCTL_H__ +#ifndef __UAPI_LIBCFS_IOCTL_H__ +#define __UAPI_LIBCFS_IOCTL_H__ #include #include @@ -77,8 +75,7 @@ struct libcfs_ioctl_data { char ioc_bulk[0]; }; -struct libcfs_debug_ioctl_data -{ +struct libcfs_debug_ioctl_data { struct libcfs_ioctl_hdr hdr; unsigned int subs; unsigned int debug; @@ -105,7 +102,7 @@ struct libcfs_debug_ioctl_data #define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) -/* IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_PING_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) /* lnd ioctls */ @@ -116,7 +113,7 @@ struct libcfs_debug_ioctl_data #define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) -/* ioctl 77 is free for use */ +#define IOC_LIBCFS_DISCOVER _IOWR('e', 77, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) @@ -148,8 +145,13 @@ struct libcfs_debug_ioctl_data #define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) #define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) #define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_MAX_NR 99 +#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_MAX_NR 104 extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); -#endif /* __LIBCFS_IOCTL_H__ */ +#endif /* __UAPI_LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h similarity index 76% rename from drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h index 4141f7c492c22..f10cbc3309176 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h @@ -20,21 +20,32 @@ * */ /* - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * Author: Amir Shehata */ -#ifndef LNET_DLC_H -#define LNET_DLC_H +#ifndef __UAPI_LNET_DLC_H_ +#define __UAPI_LNET_DLC_H_ -#include -#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. 
+ */ +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif #define MAX_NUM_SHOW_ENTRIES 32 #define LNET_MAX_STR_LEN 128 #define LNET_MAX_SHOW_NUM_CPT 128 +#define LNET_MAX_SHOW_NUM_NID 128 #define LNET_UNDEFINED_HOPS ((__u32) -1) /* @@ -81,7 +92,7 @@ struct lnet_ioctl_config_lnd_tunables { }; struct lnet_ioctl_net_config { - char ni_interfaces[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; + char ni_interfaces[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; __u32 ni_status; __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; char cfg_bulk[0]; @@ -111,8 +122,8 @@ struct lnet_ioctl_ping_data { __u32 ping_count; __u32 ping_flags; bool mr_info; - lnet_process_id_t ping_id; - lnet_process_id_t __user *ping_buf; + struct lnet_process_id ping_id; + struct lnet_process_id __user *ping_buf; }; struct lnet_ioctl_config_data { @@ -163,6 +174,31 @@ struct lnet_ioctl_element_stats { __u32 iel_drop_count; }; +enum lnet_health_type { + LNET_HEALTH_TYPE_LOCAL_NI = 0, + LNET_HEALTH_TYPE_PEER_NI, +}; + +struct lnet_ioctl_local_ni_hstats { + struct libcfs_ioctl_hdr hlni_hdr; + lnet_nid_t hlni_nid; + __u32 hlni_local_interrupt; + __u32 hlni_local_dropped; + __u32 hlni_local_aborted; + __u32 hlni_local_no_route; + __u32 hlni_local_timeout; + __u32 hlni_local_error; + __s32 hlni_health_value; +}; + +struct lnet_ioctl_peer_ni_hstats { + __u32 hlpni_remote_dropped; + __u32 hlpni_remote_timeout; + __u32 hlpni_remote_error; + __u32 hlpni_network_timeout; + __s32 hlpni_health_value; +}; + struct lnet_ioctl_element_msg_stats { struct libcfs_ioctl_hdr im_hdr; __u32 im_idx; @@ -184,7 +220,7 @@ struct lnet_ioctl_element_msg_stats { struct lnet_ioctl_config_ni { struct libcfs_ioctl_hdr lic_cfg_hdr; lnet_nid_t lic_nid; - char lic_ni_intf[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; + char lic_ni_intf[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; __u32 lic_ncpts; @@ -230,9 +266,24 @@ struct lnet_ioctl_peer_cfg { void __user *prcfg_bulk; }; -struct lnet_ioctl_numa_range { - struct libcfs_ioctl_hdr nr_hdr; - __u32 nr_range; +struct lnet_ioctl_reset_health_cfg { + struct libcfs_ioctl_hdr rh_hdr; + enum lnet_health_type rh_type; + bool rh_all; + int rh_value; + lnet_nid_t rh_nid; +}; + +struct lnet_ioctl_recovery_list { + struct libcfs_ioctl_hdr rlst_hdr; + enum lnet_health_type rlst_type; + int rlst_num_nids; + lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID]; +}; + +struct lnet_ioctl_set_value { + struct libcfs_ioctl_hdr sv_hdr; + __u32 sv_value; }; struct lnet_ioctl_lnet_stats { @@ -240,4 +291,4 @@ struct lnet_ioctl_lnet_stats { struct lnet_counters st_cntrs; }; -#endif /* LNET_DLC_H */ +#endif /* _LNET_DLC_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/types.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h similarity index 85% rename from drivers/staging/lustrefsx/lnet/include/lnet/types.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h index e4bfe3d4951dd..1f7828c8c9c15 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/types.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h @@ -23,15 +23,15 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. 
*/ -#ifndef __LNET_TYPES_H__ -#define __LNET_TYPES_H__ +#ifndef __UAPI_LNET_TYPES_H__ +#define __UAPI_LNET_TYPES_H__ /** \addtogroup lnet * @{ */ @@ -107,30 +107,33 @@ static inline __u32 LNET_MKNET(__u32 type, __u32 num) return (type << 16) | num; } +/** The lolnd NID (i.e. myself) */ +#define LNET_NID_LO_0 LNET_MKNID(LNET_MKNET(LOLND, 0), 0) + #define WIRE_ATTR __attribute__((packed)) /* Packed version of struct lnet_process_id to transfer via network */ -typedef struct lnet_process_id_packed { +struct lnet_process_id_packed { lnet_nid_t nid; lnet_pid_t pid; /* node id / process id */ -} WIRE_ATTR lnet_process_id_packed; +} WIRE_ATTR; /* The wire handle's interface cookie only matches one network interface in * one epoch (i.e. new cookie when the interface restarts or the node * reboots). The object cookie only matches one object on that interface * during that object's lifetime (i.e. no cookie re-use). */ -typedef struct lnet_handle_wire { +struct lnet_handle_wire { __u64 wh_interface_cookie; __u64 wh_object_cookie; -} WIRE_ATTR lnet_handle_wire_t; +} WIRE_ATTR; -typedef enum lnet_msg_type { +enum lnet_msg_type { LNET_MSG_ACK = 0, LNET_MSG_PUT, LNET_MSG_GET, LNET_MSG_REPLY, LNET_MSG_HELLO, -} lnet_msg_type_t; +}; /* The variant fields of the portals message header are aligned on an 8 * byte boundary in the message header. Note that all types used in these @@ -167,7 +170,7 @@ struct lnet_hello { __u32 type; } WIRE_ATTR; -typedef struct lnet_hdr { +struct lnet_hdr { lnet_nid_t dest_nid; lnet_nid_t src_nid; lnet_pid_t dest_pid; @@ -182,7 +185,7 @@ typedef struct lnet_hdr { struct lnet_reply reply; struct lnet_hello hello; } msg; -} WIRE_ATTR lnet_hdr_t; +} WIRE_ATTR; /* A HELLO message contains a magic number and protocol version * code in the header's dest_nid, the peer's NID in the src_nid, and @@ -193,11 +196,11 @@ typedef struct lnet_hdr { * exchange HELLO messages when a connection is first established. Individual * LNDs can put whatever else they fancy in lnet_hdr::msg. 
*/ -typedef struct lnet_magicversion { +struct lnet_magicversion { __u32 magic; /* LNET_PROTO_TCP_MAGIC */ __u16 version_major; /* increment on incompatible change */ __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR lnet_magic_version_t; +} WIRE_ATTR; /* PROTO MAGIC for LNDs */ #define LNET_PROTO_IB_MAGIC 0x0be91b91 @@ -215,39 +218,109 @@ typedef struct lnet_magicversion { #define LNET_PROTO_TCP_VERSION_MINOR 0 /* Acceptor connection request */ -typedef struct lnet_acceptor_connreq { +struct lnet_acceptor_connreq { __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ __u32 acr_version; /* protocol version */ __u64 acr_nid; /* target NID */ -} WIRE_ATTR lnet_acceptor_connreq_t; +} WIRE_ATTR; #define LNET_PROTO_ACCEPTOR_VERSION 1 -typedef struct lnet_counters { - __u32 msgs_alloc; - __u32 msgs_max; - __u32 errors; - __u32 send_count; - __u32 recv_count; - __u32 route_count; - __u32 drop_count; - __u64 send_length; - __u64 recv_length; - __u64 route_length; - __u64 drop_length; -} WIRE_ATTR lnet_counters_t; +struct lnet_counters_common { + __u32 lcc_msgs_alloc; + __u32 lcc_msgs_max; + __u32 lcc_errors; + __u32 lcc_send_count; + __u32 lcc_recv_count; + __u32 lcc_route_count; + __u32 lcc_drop_count; + __u64 lcc_send_length; + __u64 lcc_recv_length; + __u64 lcc_route_length; + __u64 lcc_drop_length; +} WIRE_ATTR; + +struct lnet_counters_health { + __u32 lch_rst_alloc; + __u32 lch_resend_count; + __u32 lch_response_timeout_count; + __u32 lch_local_interrupt_count; + __u32 lch_local_dropped_count; + __u32 lch_local_aborted_count; + __u32 lch_local_no_route_count; + __u32 lch_local_timeout_count; + __u32 lch_local_error_count; + __u32 lch_remote_dropped_count; + __u32 lch_remote_error_count; + __u32 lch_remote_timeout_count; + __u32 lch_network_timeout_count; +}; + +struct lnet_counters { + struct lnet_counters_common lct_common; + struct lnet_counters_health lct_health; +}; #define LNET_NI_STATUS_UP 0x15aac0de #define LNET_NI_STATUS_DOWN 0xdeadface #define LNET_NI_STATUS_INVALID 0x00000000 +struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR; + +/* + * NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue + */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ +#define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ + +/* + * All ping feature bits fit to hit the wire. + * In lnet_assert_wire_constants() this is compared against its open-coded + * value, and in lnet_ping_target_update() it is used to verify that no + * unknown bits have been set. + * New feature bits can be added, just be aware that this does change the + * over-the-wire protocol. 
+ */ +#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS | \ + LNET_PING_FEAT_RTE_DISABLED | \ + LNET_PING_FEAT_MULTI_RAIL | \ + LNET_PING_FEAT_DISCOVERY) + +struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[0]; +} WIRE_ATTR; + +#define LNET_PING_INFO_SIZE(NNIDS) \ + offsetof(struct lnet_ping_info, pi_ni[NNIDS]) +#define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) +#define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) + /* * This is a hard-coded limit on the number of interfaces supported by * the interface bonding implemented by the ksocknal LND. It must be * defined here because it is used in LNet data structures that are * common to all LNDs. */ -#define LNET_NUM_INTERFACES 16 +#define LNET_INTERFACES_NUM 16 + +/* The minimum number of interfaces per node supported by LNet. */ +#define LNET_INTERFACES_MIN 16 +/* The default - arbitrary - value of the lnet_max_interfaces tunable. */ +#define LNET_INTERFACES_MAX_DEFAULT 200 /** * Objects maintained by the LNet are accessed through handles. Handle types @@ -258,9 +331,9 @@ typedef struct lnet_counters { */ #define LNET_WIRE_HANDLE_COOKIE_NONE (-1) -typedef struct lnet_handle_eq { +struct lnet_handle_eq { __u64 cookie; -} lnet_handle_eq_t; +}; /** * Invalidate eq handle \a h. @@ -280,9 +353,9 @@ static inline int LNetEQHandleIsInvalid(struct lnet_handle_eq h) return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); } -typedef struct lnet_handle_md { +struct lnet_handle_md { __u64 cookie; -} lnet_handle_md_t; +}; /** * Invalidate md handle \a h. @@ -302,19 +375,19 @@ static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); } -typedef struct lnet_handle_me { +struct lnet_handle_me { __u64 cookie; -} lnet_handle_me_t; +}; /** * Global process ID. */ -typedef struct lnet_process_id { +struct lnet_process_id { /** node id */ lnet_nid_t nid; /** process id */ lnet_pid_t pid; -} lnet_process_id_t; +}; /** @} lnet_addr */ /** \addtogroup lnet_me @@ -324,10 +397,10 @@ typedef struct lnet_process_id { * Specifies whether the match entry or memory descriptor should be unlinked * automatically (LNET_UNLINK) or not (LNET_RETAIN). */ -typedef enum lnet_unlink { +enum lnet_unlink { LNET_RETAIN = 0, LNET_UNLINK -} lnet_unlink_t; +}; /** * Values of the type enum lnet_ins_pos are used to control where a new match @@ -336,14 +409,14 @@ typedef enum lnet_unlink { * LNET_INS_AFTER is used to insert the new entry after the current entry * or after the last item in the list. */ -typedef enum lnet_ins_pos { +enum lnet_ins_pos { /** insert ME before current position or head of the list */ LNET_INS_BEFORE, /** insert ME after current position or tail of the list */ LNET_INS_AFTER, /** attach ME at tail of local CPU partition ME list */ LNET_INS_LOCAL -} lnet_ins_pos; +}; /** @} lnet_me */ @@ -354,7 +427,7 @@ typedef enum lnet_ins_pos { * Defines the visible parts of a memory descriptor. Values of this type * are used to initialize memory descriptors. */ -typedef struct lnet_md { +struct lnet_md { /** * Specify the memory region associated with the memory descriptor. * If the options field has: @@ -458,7 +531,7 @@ typedef struct lnet_md { * if the LNET_MD_BULK_HANDLE option is set. */ struct lnet_handle_md bulk_handle; -} lnet_md_t; +}; /* Max Transfer Unit (minimum supported everywhere). * CAVEAT EMPTOR, with multinet (i.e. 
routers forwarding between networks) @@ -466,9 +539,6 @@ typedef struct lnet_md { #define LNET_MTU_BITS 20 #define LNET_MTU (1 << LNET_MTU_BITS) -/** limit on the number of fragments in discontiguous MDs */ -#define LNET_MAX_IOV 256 - /** * Options for the MD structure. See struct lnet_md::options. */ @@ -520,7 +590,7 @@ typedef struct { /** * Six types of events can be logged in an event queue. */ -typedef enum lnet_event_kind { +enum lnet_event_kind { /** An incoming GET operation has completed on the MD. */ LNET_EVENT_GET = 1, /** @@ -556,14 +626,14 @@ typedef enum lnet_event_kind { * \see LNetMDUnlink */ LNET_EVENT_UNLINK, -} lnet_event_kind_t; +}; #define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) /** * Information about an event on a MD. */ -typedef struct lnet_event { +struct lnet_event { /** The identifier (nid, pid) of the target. */ struct lnet_process_id target; /** The identifier (nid, pid) of the initiator. */ @@ -608,6 +678,11 @@ typedef struct lnet_event { * \see LNetPut */ __u64 hdr_data; + /** + * The message type, to ensure a handler for LNET_EVENT_SEND can + * distinguish between LNET_MSG_GET and LNET_MSG_PUT. + */ + __u32 msg_type; /** * Indicates the completion status of the operation. It's 0 for * successful operations, otherwise it's an error code. @@ -632,7 +707,7 @@ typedef struct lnet_event { * to each event. */ volatile unsigned long sequence; -} lnet_event_t; +}; /** * Event queue handler function type. @@ -659,12 +734,12 @@ typedef void (*lnet_eq_handler_t)(struct lnet_event *event); * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE * by which acknowledgments can be disabled for a MD. */ -typedef enum lnet_ack_req { +enum lnet_ack_req { /** Request an acknowledgment */ LNET_ACK_REQ, /** Request that no acknowledgment should be generated. */ LNET_NOACK_REQ -} lnet_ack_req_t; +}; /** @} lnet_data */ /** @} lnet */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h similarity index 76% rename from drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h index 4328135c5ec72..cb4f153e377d1 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h @@ -17,12 +17,23 @@ * header for lnet ioctl */ /* - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ -#ifndef _LNETCTL_H_ -#define _LNETCTL_H_ +#ifndef __UAPI_LNETCTL_H_ +#define __UAPI_LNETCTL_H_ -#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. 
+ */ +#ifdef __KERNEL__ +# include +#else +# include +#endif + +#include /** \addtogroup lnet_fault_simulation * @{ */ @@ -43,6 +54,19 @@ enum { #define LNET_GET_BIT (1 << 2) #define LNET_REPLY_BIT (1 << 3) +#define HSTATUS_END 11 +#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) +#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) +#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) +#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) +#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) +#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) +#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) +#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) +#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) +#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) +#define HSTATUS_RANDOM 0xffffffff + /** ioctl parameter for LNet fault simulation */ struct lnet_fault_attr { /** @@ -80,6 +104,10 @@ struct lnet_fault_attr { * with da_rate */ __u32 da_interval; + /** error type mask */ + __u32 da_health_error_mask; + /** randomize error generation */ + bool da_random; } drop; /** message latency simulation */ struct { diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h similarity index 99% rename from drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h index 7071039d9aa38..ca871cac02b7b 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h @@ -29,13 +29,13 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * lnet/include/lnet/lnetst.h - * * Author: Liang Zhen */ -#ifndef __LNET_ST_H__ -#define __LNET_ST_H__ +#ifndef __UAPI_LNET_ST_H__ +#define __UAPI_LNET_ST_H__ + +#include #define LST_FEAT_NONE (0) #define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ @@ -67,7 +67,7 @@ struct lst_sid { lnet_nid_t ses_nid; /* nid of console node */ - __u64 ses_stamp; /* time stamp */ + __s64 ses_stamp; /* time stamp in milliseconds */ }; /*** session id */ extern struct lst_sid LST_INVALID_SID; diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h similarity index 93% rename from drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h index be14a1dfcf71d..c41b9158ecd7d 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h @@ -23,11 +23,21 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ #ifndef _LNET_NIDSTRINGS_H #define _LNET_NIDSTRINGS_H -#include + +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif /** * Lustre Network Driver types. 
diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h similarity index 74% rename from drivers/staging/lustrefsx/lnet/include/lnet/lnet.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h index 54061f593496e..6453e053fa99d 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h @@ -22,25 +22,23 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. - * - * Copyright (c) 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LNET_H__ -#define __LNET_H__ - -/* - * lnet.h * - * User application interface file + * #defines shared between socknal implementation and utilities */ +#ifndef __UAPI_LNET_SOCKLND_H__ +#define __UAPI_LNET_SOCKLND_H__ + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 -#include -#include -#include +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN #endif diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c index 90645f6388ea6..8a14b86f904c1 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,11 +35,13 @@ */ #include +#include + #include "o2iblnd.h" static struct lnet_lnd the_o2iblnd; -kib_data_t kiblnd_data; +struct kib_data kiblnd_data; static __u32 kiblnd_cksum (void *ptr, int nob) @@ -96,41 +98,40 @@ kiblnd_msgtype2str(int type) static int kiblnd_msgtype2size(int type) { - const int hdr_size = offsetof(kib_msg_t, ibm_u); + const int hdr_size = offsetof(struct kib_msg, ibm_u); switch (type) { case IBLND_MSG_CONNREQ: case IBLND_MSG_CONNACK: - return hdr_size + sizeof(kib_connparams_t); + return hdr_size + sizeof(struct kib_connparams); case IBLND_MSG_NOOP: return hdr_size; case IBLND_MSG_IMMEDIATE: - return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); + return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); case IBLND_MSG_PUT_REQ: - return hdr_size + sizeof(kib_putreq_msg_t); + return hdr_size + sizeof(struct kib_putreq_msg); case IBLND_MSG_PUT_ACK: - return hdr_size + sizeof(kib_putack_msg_t); + return hdr_size + sizeof(struct kib_putack_msg); case IBLND_MSG_GET_REQ: - return hdr_size + sizeof(kib_get_msg_t); + return hdr_size + sizeof(struct kib_get_msg); case IBLND_MSG_PUT_NAK: case IBLND_MSG_PUT_DONE: case IBLND_MSG_GET_DONE: - return hdr_size + sizeof(kib_completion_msg_t); + return hdr_size + sizeof(struct kib_completion_msg); default: return -1; } } -static int -kiblnd_unpack_rd(kib_msg_t *msg, int flip) +static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) { - kib_rdma_desc_t *rd; + struct kib_rdma_desc *rd; int nob; int n; int i; @@ -155,7 +156,7 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip) return 1; } - nob = offsetof (kib_msg_t, ibm_u) + + nob = offsetof(struct kib_msg, ibm_u) + kiblnd_rd_msg_size(rd, msg->ibm_type, n); if (msg->ibm_nob < nob) { @@ -175,11 +176,10 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip) return 0; } -void -kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp) +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) { - kib_net_t *net = ni->ni_data; + struct kib_net *net = ni->ni_data; /* CAVEAT EMPTOR! all message fields not set here should have been * initialised previously. 
*/ @@ -200,10 +200,9 @@ kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, } } -int -kiblnd_unpack_msg(kib_msg_t *msg, int nob) +int kiblnd_unpack_msg(struct kib_msg *msg, int nob) { - const int hdr_size = offsetof(kib_msg_t, ibm_u); + const int hdr_size = offsetof(struct kib_msg, ibm_u); __u32 msg_cksum; __u16 version; int msg_nob; @@ -313,12 +312,13 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob) } int -kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) +kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid) { - kib_peer_ni_t *peer_ni; - kib_net_t *net = ni->ni_data; - int cpt = lnet_cpt_of_nid(nid, ni); - unsigned long flags; + struct kib_peer_ni *peer_ni; + struct kib_net *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; LASSERT(net != NULL); LASSERT(nid != LNET_NID_ANY); @@ -333,7 +333,7 @@ kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) peer_ni->ibp_nid = nid; peer_ni->ibp_error = 0; peer_ni->ibp_last_alive = 0; - peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni); + peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */ @@ -356,9 +356,9 @@ kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) } void -kiblnd_destroy_peer (kib_peer_ni_t *peer_ni) +kiblnd_destroy_peer(struct kib_peer_ni *peer_ni) { - kib_net_t *net = peer_ni->ibp_ni->ni_data; + struct kib_net *net = peer_ni->ibp_ni->ni_data; LASSERT(net != NULL); LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0); @@ -375,18 +375,18 @@ kiblnd_destroy_peer (kib_peer_ni_t *peer_ni) atomic_dec(&net->ibn_npeers); } -kib_peer_ni_t * +struct kib_peer_ni * kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) { /* the caller is responsible for accounting the additional reference * that this creates */ struct list_head *peer_list = kiblnd_nid2peerlist(nid); struct list_head *tmp; - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; list_for_each(tmp, peer_list) { - peer_ni = list_entry(tmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(tmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); /* @@ -409,7 +409,7 @@ kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) } void -kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni) +kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni) { LASSERT(list_empty(&peer_ni->ibp_conns)); @@ -423,7 +423,7 @@ static int kiblnd_get_peer_info(struct lnet_ni *ni, int index, lnet_nid_t *nidp, int *count) { - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; struct list_head *ptmp; int i; unsigned long flags; @@ -434,7 +434,7 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -457,17 +457,17 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, } static void -kiblnd_del_peer_locked (kib_peer_ni_t *peer_ni) +kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) { - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + struct kib_conn *conn; if (list_empty(&peer_ni->ibp_conns)) { kiblnd_unlink_peer_locked(peer_ni); } else { list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = 
list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); kiblnd_close_conn_locked(conn, 0); } @@ -483,7 +483,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *ptmp; struct list_head *pnxt; - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; int lo; int hi; int i; @@ -501,7 +501,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -524,17 +524,17 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - kiblnd_txlist_done(&zombies, -EIO); + kiblnd_txlist_done(&zombies, -EIO, LNET_MSG_STATUS_LOCAL_ERROR); return rc; } -static kib_conn_t * +static struct kib_conn * kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) { - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; struct list_head *ptmp; - kib_conn_t *conn; + struct kib_conn *conn; struct list_head *ctmp; int i; unsigned long flags; @@ -544,7 +544,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -554,7 +554,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) if (index-- > 0) continue; - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); kiblnd_conn_addref(conn); read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -568,7 +568,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) } static void -kiblnd_debug_rx (kib_rx_t *rx) +kiblnd_debug_rx(struct kib_rx *rx) { CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", rx, rx->rx_status, rx->rx_msg->ibm_type, @@ -576,19 +576,19 @@ kiblnd_debug_rx (kib_rx_t *rx) } static void -kiblnd_debug_tx (kib_tx_t *tx) +kiblnd_debug_tx(struct kib_tx *tx) { - CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld " "cookie %#llx msg %s%s type %x cred %d\n", tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, - tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie, tx->tx_lntmsg[0] == NULL ? "-" : "!", tx->tx_lntmsg[1] == NULL ? 
"-" : "!", tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); } void -kiblnd_debug_conn (kib_conn_t *conn) +kiblnd_debug_conn(struct kib_conn *conn) { struct list_head *tmp; int i; @@ -606,27 +606,27 @@ kiblnd_debug_conn (kib_conn_t *conn) CDEBUG(D_CONSOLE, " early_rxs:\n"); list_for_each(tmp, &conn->ibc_early_rxs) - kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list)); CDEBUG(D_CONSOLE, " tx_noops:\n"); list_for_each(tmp, &conn->ibc_tx_noops) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); list_for_each(tmp, &conn->ibc_tx_queue_nocred) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " tx_queue:\n"); list_for_each(tmp, &conn->ibc_tx_queue) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " active_txs:\n"); list_for_each(tmp, &conn->ibc_active_txs) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " rxs:\n"); for (i = 0; i < IBLND_RX_MSGS(conn); i++) @@ -672,7 +672,7 @@ kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) } static int -kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) { cpumask_t *mask; int vectors; @@ -734,15 +734,32 @@ static unsigned int kiblnd_send_wrs(struct kib_conn *conn) * One WR for the LNet message * And ibc_max_frags for the transfer WRs */ - unsigned int ret = 1 + conn->ibc_max_frags; + int ret; + int multiplier = 1 + conn->ibc_max_frags; + enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps; + + /* FastReg needs two extra WRs for map and invalidate */ + if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) + multiplier += 2; /* account for a maximum of ibc_queue_depth in-flight transfers */ - ret *= conn->ibc_queue_depth; - return ret; + ret = multiplier * conn->ibc_queue_depth; + + if (ret > conn->ibc_hdev->ibh_max_qp_wr) { + CDEBUG(D_NET, "peer_credits %u will result in send work " + "request size %d larger than maximum %d device " + "can handle\n", conn->ibc_queue_depth, ret, + conn->ibc_hdev->ibh_max_qp_wr); + conn->ibc_queue_depth = + conn->ibc_hdev->ibh_max_qp_wr / multiplier; + } + + /* don't go beyond the maximum the device can handle */ + return min(ret, conn->ibc_hdev->ibh_max_qp_wr); } -kib_conn_t * -kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, +struct kib_conn * +kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, int state, int version) { /* CAVEAT EMPTOR: @@ -753,14 +770,14 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, * to destroy 'cmid' here since I'm called from the CM which still has * its ref on 'cmid'). 
*/ rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_net_t *net = peer_ni->ibp_ni->ni_data; - kib_dev_t *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; struct ib_qp_init_attr *init_qp_attr; struct kib_sched_info *sched; #ifdef HAVE_IB_CQ_INIT_ATTR struct ib_cq_init_attr cq_attr = {}; #endif - kib_conn_t *conn; + struct kib_conn *conn; struct ib_cq *cq; unsigned long flags; int cpt; @@ -815,6 +832,7 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD(&conn->ibc_active_txs); + INIT_LIST_HEAD(&conn->ibc_zombie_txs); spin_lock_init(&conn->ibc_lock); LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, @@ -853,7 +871,7 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, write_unlock_irqrestore(glock, flags); LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, - IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed_2; @@ -879,6 +897,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, kiblnd_get_completion_vector(conn, cpt)); #endif if (IS_ERR(cq)) { + /* + * on MLX-5 (possibly MLX-4 as well) this error could be + * hit if the concurrent_sends and/or peer_tx_credits is set + * too high. Or due to an MLX-5 bug which tries to + * allocate 256kb via kmalloc for WR cookie array + */ CERROR("Failed to create CQ with %d CQEs: %ld\n", IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); goto failed_2; @@ -900,20 +924,14 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, init_qp_attr->qp_type = IB_QPT_RC; init_qp_attr->send_cq = cq; init_qp_attr->recv_cq = cq; + /* + * kiblnd_send_wrs() can change the connection's queue depth if + * the maximum work requests for the device is maxed out + */ + init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - conn->ibc_sched = sched; - - do { - init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - - rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); - if (!rc || conn->ibc_queue_depth < 2) - break; - - conn->ibc_queue_depth--; - } while (rc); - + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); if (rc) { CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " "send_sge: %d, recv_sge: %d\n", @@ -924,6 +942,8 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, goto failed_2; } + conn->ibc_sched = sched; + if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth) CWARN("peer %s - queue depth reduced from %u to %u" " to allow for qp creation\n", @@ -976,7 +996,8 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, return conn; failed_2: - kiblnd_destroy_conn(conn, true); + kiblnd_destroy_conn(conn); + LIBCFS_FREE(conn, sizeof(*conn)); failed_1: LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); failed_0: @@ -984,10 +1005,10 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, } void -kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) +kiblnd_destroy_conn(struct kib_conn *conn) { struct rdma_cm_id *cmid = conn->ibc_cmid; - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_peer_ni *peer_ni = conn->ibc_peer; LASSERT (!in_interrupt()); LASSERT (atomic_read(&conn->ibc_refcount) == 0); @@ -1021,12 +1042,15 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool 
free_conn) if (conn->ibc_cq) ib_destroy_cq(conn->ibc_cq); + kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED, + LNET_MSG_STATUS_OK); + if (conn->ibc_rx_pages != NULL) kiblnd_unmap_rx_descs(conn); if (conn->ibc_rxs != NULL) { LIBCFS_FREE(conn->ibc_rxs, - IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); } if (conn->ibc_connvars != NULL) @@ -1037,27 +1061,24 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) /* See CAVEAT EMPTOR above in kiblnd_create_conn */ if (conn->ibc_state != IBLND_CONN_INIT) { - kib_net_t *net = peer_ni->ibp_ni->ni_data; + struct kib_net *net = peer_ni->ibp_ni->ni_data; kiblnd_peer_decref(peer_ni); rdma_destroy_id(cmid); atomic_dec(&net->ibn_nconns); } - - if (free_conn) - LIBCFS_FREE(conn, sizeof(*conn)); } int -kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) +kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) { - kib_conn_t *conn; + struct kib_conn *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); CDEBUG(D_NET, "Closing conn -> %s, " "version: %x, reason: %d\n", @@ -1072,16 +1093,16 @@ kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) } int -kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, +kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, int version, __u64 incarnation) { - kib_conn_t *conn; + struct kib_conn *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); if (conn->ibc_version == version && conn->ibc_incarnation == incarnation) @@ -1103,7 +1124,7 @@ kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) { - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -1124,7 +1145,7 @@ kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -1169,7 +1190,7 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) break; } case IOC_LIBCFS_GET_CONN: { - kib_conn_t *conn; + struct kib_conn *conn; rc = 0; conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); @@ -1201,13 +1222,13 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) } static void -kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) +kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) { - cfs_time_t last_alive = 0; - cfs_time_t now = cfs_time_current(); - rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_peer_ni_t *peer_ni; - unsigned long flags; + time64_t last_alive = 0; + time64_t now = ktime_get_seconds(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni; + unsigned long flags; read_lock_irqsave(glock, flags); @@ -1225,14 +1246,14 @@ kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) if (peer_ni == NULL) kiblnd_launch_tx(ni, NULL, nid); - CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago\n", + CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n", 
libcfs_nid2str(nid), peer_ni, - last_alive ? cfs_duration_sec(now - last_alive) : -1); + last_alive ? now - last_alive : -1); return; } static void -kiblnd_free_pages(kib_pages_t *p) +kiblnd_free_pages(struct kib_pages *p) { int npages = p->ibp_npages; int i; @@ -1242,23 +1263,23 @@ kiblnd_free_pages(kib_pages_t *p) __free_page(p->ibp_pages[i]); } - LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); } int -kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) +kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) { - kib_pages_t *p; - int i; + struct kib_pages *p; + int i; LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, - offsetof(kib_pages_t, ibp_pages[npages])); + offsetof(struct kib_pages, ibp_pages[npages])); if (p == NULL) { CERROR("Can't allocate descriptor for %d pages\n", npages); return -ENOMEM; } - memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); p->ibp_npages = npages; for (i = 0; i < npages; i++) { @@ -1276,9 +1297,9 @@ kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) } void -kiblnd_unmap_rx_descs(kib_conn_t *conn) +kiblnd_unmap_rx_descs(struct kib_conn *conn) { - kib_rx_t *rx; + struct kib_rx *rx; int i; LASSERT (conn->ibc_rxs != NULL); @@ -1301,9 +1322,9 @@ kiblnd_unmap_rx_descs(kib_conn_t *conn) } void -kiblnd_map_rx_descs(kib_conn_t *conn) +kiblnd_map_rx_descs(struct kib_conn *conn) { - kib_rx_t *rx; + struct kib_rx *rx; struct page *pg; int pg_off; int ipg; @@ -1314,7 +1335,7 @@ kiblnd_map_rx_descs(kib_conn_t *conn) rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, @@ -1340,11 +1361,11 @@ kiblnd_map_rx_descs(kib_conn_t *conn) } static void -kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) +kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) { - kib_hca_dev_t *hdev = tpo->tpo_hdev; - kib_tx_t *tx; - int i; + struct kib_hca_dev *hdev = tpo->tpo_hdev; + struct kib_tx *tx; + int i; LASSERT (tpo->tpo_pool.po_allocated == 0); @@ -1363,10 +1384,10 @@ kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) tpo->tpo_hdev = NULL; } -static kib_hca_dev_t * -kiblnd_current_hdev(kib_dev_t *dev) +static struct kib_hca_dev * +kiblnd_current_hdev(struct kib_dev *dev) { - kib_hca_dev_t *hdev; + struct kib_hca_dev *hdev; unsigned long flags; int i = 0; @@ -1391,14 +1412,14 @@ kiblnd_current_hdev(kib_dev_t *dev) } static void -kiblnd_map_tx_pool(kib_tx_pool_t *tpo) -{ - kib_pages_t *txpgs = tpo->tpo_tx_pages; - kib_pool_t *pool = &tpo->tpo_pool; - kib_net_t *net = pool->po_owner->ps_net; - kib_dev_t *dev; - struct page *page; - kib_tx_t *tx; +kiblnd_map_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_pages *txpgs = tpo->tpo_tx_pages; + struct kib_pool *pool = &tpo->tpo_pool; + struct kib_net *net = pool->po_owner->ps_net; + struct kib_dev *dev; + struct page *page; + struct kib_tx *tx; int page_offset; int ipage; int i; @@ -1419,8 +1440,8 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo) page = txpgs->ibp_pages[ipage]; tx = &tpo->tpo_tx_descs[i]; - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); + tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + + page_offset); tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, @@ -1443,39 +1464,14 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo) } } -#ifdef HAVE_IB_GET_DMA_MR -struct 
ib_mr * -kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, - int negotiated_nfrags) -{ - kib_net_t *net = ni->ni_data; - kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int mod; - __u16 nfrags; - - tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; - mod = tunables->lnd_map_on_demand; - nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod; - - LASSERT(hdev->ibh_mrs != NULL); - - if (mod > 0 && nfrags <= rd->rd_nfrags) - return NULL; - - return hdev->ibh_mrs; -} -#endif - static void -kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) +kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) { LASSERT(fpo->fpo_map_count == 0); #ifdef HAVE_FMR_POOL_API - if (fpo->fpo_is_fmr) { - if (fpo->fmr.fpo_fmr_pool) - ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) { + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); } else #endif /* HAVE_FMR_POOL_API */ { @@ -1506,7 +1502,7 @@ kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) static void kiblnd_destroy_fmr_pool_list(struct list_head *head) { - kib_fmr_pool_t *fpo, *tmp; + struct kib_fmr_pool *fpo, *tmp; list_for_each_entry_safe(fpo, tmp, head, fpo_list) { list_del(&fpo->fpo_list); @@ -1533,10 +1529,11 @@ kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, } #ifdef HAVE_FMR_POOL_API -static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) +static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo) { struct ib_fmr_pool_param param = { - .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .max_pages_per_fmr = LNET_MAX_IOV, .page_shift = PAGE_SHIFT, .access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE), @@ -1556,16 +1553,23 @@ static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) else CERROR("FMRs are not supported\n"); } + fpo->fpo_is_fmr = true; return rc; } #endif /* HAVE_FMR_POOL_API */ -static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) +static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo, + enum kib_dev_caps dev_caps) { struct kib_fast_reg_descriptor *frd, *tmp; int i, rc; +#ifdef HAVE_FMR_POOL_API + fpo->fpo_is_fmr = false; +#endif + INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); fpo->fast_reg.fpo_pool_size = 0; for (i = 0; i < fps->fps_pool_size; i++) { @@ -1580,7 +1584,7 @@ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) #ifndef HAVE_IB_MAP_MR_SG frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, - LNET_MAX_PAYLOAD/PAGE_SIZE); + LNET_MAX_IOV); if (IS_ERR(frd->frd_frpl)) { rc = PTR_ERR(frd->frd_frpl); CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", @@ -1592,11 +1596,28 @@ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) #ifdef HAVE_IB_ALLOC_FAST_REG_MR frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, - LNET_MAX_PAYLOAD/PAGE_SIZE); + LNET_MAX_IOV); #else + /* + * it is expected to get here if this is an MLX-5 card. + * MLX-4 cards will always use FMR and MLX-5 cards will + * always use fast_reg. It turns out that some MLX-5 cards + * (possibly due to older FW versions) do not natively support + * gaps. So we will need to track them here. + */ frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, - IB_MR_TYPE_MEM_REG, - LNET_MAX_PAYLOAD/PAGE_SIZE); +#ifdef IB_MR_TYPE_SG_GAPS + ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) ? 
+ IB_MR_TYPE_SG_GAPS : + IB_MR_TYPE_MEM_REG, +#else + IB_MR_TYPE_MEM_REG, +#endif + LNET_MAX_IOV); + if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) + CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n"); #endif if (IS_ERR(frd->frd_mr)) { rc = PTR_ERR(frd->frd_mr); @@ -1639,79 +1660,32 @@ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) return rc; } -static int -kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) +static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool **pp_fpo) { - struct ib_device_attr *dev_attr; - kib_dev_t *dev = fps->fps_net->ibn_dev; - kib_fmr_pool_t *fpo; + struct kib_dev *dev = fps->fps_net->ibn_dev; + struct kib_fmr_pool *fpo; int rc; -#ifndef HAVE_IB_DEVICE_ATTRS - dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL); - if (!dev_attr) - return -ENOMEM; -#endif - LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); if (!fpo) { - rc = -ENOMEM; - goto out_dev_attr; + return -ENOMEM; } + memset(fpo, 0, sizeof(*fpo)); fpo->fpo_hdev = kiblnd_current_hdev(dev); -#ifdef HAVE_IB_DEVICE_ATTRS - dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; -#else - rc = ib_query_device(fpo->fpo_hdev->ibh_ibdev, dev_attr); - if (rc) { - CERROR("Query device failed for %s: %d\n", - fpo->fpo_hdev->ibh_ibdev->name, rc); - goto out_dev_attr; - } -#endif - -#ifdef HAVE_FMR_POOL_API - /* Check for FMR or FastReg support */ - fpo->fpo_is_fmr = 0; -#ifdef HAVE_IB_DEVICE_OPS - if (fpo->fpo_hdev->ibh_ibdev->ops.alloc_fmr && - fpo->fpo_hdev->ibh_ibdev->ops.dealloc_fmr && - fpo->fpo_hdev->ibh_ibdev->ops.map_phys_fmr && - fpo->fpo_hdev->ibh_ibdev->ops.unmap_fmr) { -#else - if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && - fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && - fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && - fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { -#endif - LCONSOLE_INFO("Using FMR for registration\n"); - fpo->fpo_is_fmr = 1; - } else -#endif /* HAVE_FMR_POOL_API */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - LCONSOLE_INFO("Using FastReg for registration\n"); - } else { - rc = -ENOSYS; - LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); - goto out_dev_attr; - } - #ifdef HAVE_FMR_POOL_API - if (fpo->fpo_is_fmr) + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) rc = kiblnd_alloc_fmr_pool(fps, fpo); else #endif /* HAVE_FMR_POOL_API */ - rc = kiblnd_alloc_freg_pool(fps, fpo); + rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps); if (rc) goto out_fpo; -#ifndef HAVE_IB_DEVICE_ATTRS - kfree(dev_attr); -#endif - fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); - fpo->fpo_owner = fps; + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_owner = fps; *pp_fpo = fpo; return 0; @@ -1719,17 +1693,11 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) out_fpo: kiblnd_hdev_decref(fpo->fpo_hdev); LIBCFS_FREE(fpo, sizeof(*fpo)); - -out_dev_attr: -#ifndef HAVE_IB_DEVICE_ATTRS - kfree(dev_attr); -#endif - return rc; } static void -kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) +kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) { if (fps->fps_net == NULL) /* intialized? 
*/ return; @@ -1737,8 +1705,10 @@ kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) spin_lock(&fps->fps_lock); while (!list_empty(&fps->fps_pool_list)) { - kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, - kib_fmr_pool_t, fpo_list); + struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, + struct kib_fmr_pool, + fpo_list); + fpo->fpo_failed = 1; list_del(&fpo->fpo_list); if (fpo->fpo_map_count == 0) @@ -1751,7 +1721,7 @@ kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) } static void -kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) +kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) { if (fps->fps_net != NULL) { /* initialized? */ kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); @@ -1760,14 +1730,14 @@ kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) } static int -kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, - kib_net_t *net, +kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, + struct kib_net *net, struct lnet_ioctl_config_o2iblnd_tunables *tunables) { - kib_fmr_pool_t *fpo; - int rc; + struct kib_fmr_pool *fpo; + int rc; - memset(fps, 0, sizeof(kib_fmr_poolset_t)); + memset(fps, 0, sizeof(struct kib_fmr_poolset)); fps->fps_net = net; fps->fps_cpt = cpt; @@ -1788,20 +1758,20 @@ kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, } static int -kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now) +kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, time64_t now) { if (fpo->fpo_map_count != 0) /* still in use */ return 0; if (fpo->fpo_failed) return 1; - return cfs_time_aftereq(now, fpo->fpo_deadline); + return now >= fpo->fpo_deadline; } #if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) static int -kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) +kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) { - kib_hca_dev_t *hdev; + struct kib_hca_dev *hdev; __u64 *pages = tx->tx_pages; int npages; int size; @@ -1822,13 +1792,13 @@ kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) #endif void -kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) +kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) { - struct list_head zombies = LIST_HEAD_INIT(zombies); - kib_fmr_pool_t *fpo = fmr->fmr_pool; - kib_fmr_poolset_t *fps; - cfs_time_t now = cfs_time_current(); - kib_fmr_pool_t *tmp; + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct kib_fmr_pool *fpo = fmr->fmr_pool; + struct kib_fmr_poolset *fps; + time64_t now = ktime_get_seconds(); + struct kib_fmr_pool *tmp; if (!fpo) return; @@ -1853,10 +1823,11 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) if (frd) { frd->frd_valid = false; + frd->frd_posted = false; + fmr->fmr_frd = NULL; spin_lock(&fps->fps_lock); list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); spin_unlock(&fps->fps_lock); - fmr->fmr_frd = NULL; } } fmr->fmr_pool = NULL; @@ -1880,11 +1851,11 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) kiblnd_destroy_fmr_pool_list(&zombies); } -int -kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, - __u32 nob, __u64 iov, kib_fmr_t *fmr, bool *is_fastreg) +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr) { - kib_fmr_pool_t *fpo; + struct kib_fmr_pool *fpo; __u64 version; bool is_rx = (rd != tx->tx_rd); #ifdef HAVE_FMR_POOL_API @@ -1898,7 +1869,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, 
spin_lock(&fps->fps_lock); version = fps->fps_version; list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { - fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; fpo->fpo_map_count++; #ifdef HAVE_FMR_POOL_API @@ -1906,7 +1877,6 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, if (fpo->fpo_is_fmr) { struct ib_pool_fmr *pfmr; - *is_fastreg = 0; spin_unlock(&fps->fps_lock); if (!tx_pages_mapped) { @@ -1928,7 +1898,6 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, } else #endif /* HAVE_FMR_POOL_API */ { - *is_fastreg = 1; if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { struct kib_fast_reg_descriptor *frd; #ifdef HAVE_IB_MAP_MR_SG @@ -1970,14 +1939,14 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, #ifdef HAVE_IB_MAP_MR_SG #ifdef HAVE_IB_MAP_MR_SG_5ARGS n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, NULL, PAGE_SIZE); + rd->rd_nfrags, NULL, PAGE_SIZE); #else n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, PAGE_SIZE); + rd->rd_nfrags, PAGE_SIZE); #endif /* HAVE_IB_MAP_MR_SG_5ARGS */ - if (unlikely(n != tx->tx_nfrags)) { + if (unlikely(n != rd->rd_nfrags)) { CERROR("Failed to map mr %d/%d " - "elements\n", n, tx->tx_nfrags); + "elements\n", n, rd->rd_nfrags); return n < 0 ? n : -EINVAL; } @@ -2024,6 +1993,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; fmr->fmr_frd = frd; fmr->fmr_pool = fpo; + frd->frd_posted = false; return 0; } spin_unlock(&fps->fps_lock); @@ -2053,7 +2023,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, } - if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) { + if (ktime_get_seconds() < fps->fps_next_retry) { /* someone failed recently */ spin_unlock(&fps->fps_lock); return -EAGAIN; @@ -2070,7 +2040,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, fps->fps_version++; list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); } else { - fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + fps->fps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; } spin_unlock(&fps->fps_lock); @@ -2078,7 +2048,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, } static void -kiblnd_fini_pool(kib_pool_t *pool) +kiblnd_fini_pool(struct kib_pool *pool) { LASSERT(list_empty(&pool->po_free_list)); LASSERT(pool->po_allocated == 0); @@ -2087,24 +2057,24 @@ kiblnd_fini_pool(kib_pool_t *pool) } static void -kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) { CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); - memset(pool, 0, sizeof(kib_pool_t)); + memset(pool, 0, sizeof(struct kib_pool)); INIT_LIST_HEAD(&pool->po_free_list); - pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); - pool->po_owner = ps; - pool->po_size = size; + pool->po_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + pool->po_owner = ps; + pool->po_size = size; } static void kiblnd_destroy_pool_list(struct list_head *head) { - kib_pool_t *pool; + struct kib_pool *pool; while (!list_empty(head)) { - pool = list_entry(head->next, kib_pool_t, po_list); + pool = list_entry(head->next, struct kib_pool, po_list); list_del(&pool->po_list); LASSERT(pool->po_owner != NULL); @@ -2113,15 +2083,16 @@ kiblnd_destroy_pool_list(struct list_head *head) } static void 
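The hunks above drop the jiffies-based cfs_time helpers: pool deadlines and the pool-set retry window become plain second counts taken from ktime_get_seconds() and compared with ordinary integer operators. Below is a minimal userspace sketch of that idiom, with time(NULL) standing in for ktime_get_seconds() and illustrative constants, types and names rather than the driver's own.

#include <stdio.h>
#include <time.h>

#define POOL_DEADLINE 300   /* illustrative: seconds a pool stays alive */
#define POOL_RETRY      1   /* illustrative: back-off after a failed alloc */

struct pool {
	long long deadline;     /* absolute time, in seconds */
	int       map_count;    /* # of mappings still using the pool */
	int       failed;
};

/* Arm the deadline whenever the pool is (re)used: "now + lifetime". */
static void pool_touch(struct pool *p)
{
	p->deadline = (long long)time(NULL) + POOL_DEADLINE;
	p->map_count++;
}

/* Idle pools past their deadline (or failed ones) may be reclaimed. */
static int pool_is_idle(const struct pool *p, long long now)
{
	if (p->map_count != 0)
		return 0;               /* still in use */
	if (p->failed)
		return 1;
	return now >= p->deadline;      /* plain integer compare, no jiffies */
}

int main(void)
{
	struct pool p = { 0 };

	pool_touch(&p);
	p.map_count--;                  /* last user unmapped */
	printf("idle now?   %d\n", pool_is_idle(&p, (long long)time(NULL)));
	printf("idle later? %d\n",
	       pool_is_idle(&p, (long long)time(NULL) + POOL_DEADLINE + 1));
	return 0;
}

The same pattern covers fps_next_retry in the hunks above: a failed pool allocation records "now + IBLND_POOL_RETRY", and later callers simply return -EAGAIN while the current second count is still below that value.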
-kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) +kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) { if (ps->ps_net == NULL) /* intialized? */ return; spin_lock(&ps->ps_lock); while (!list_empty(&ps->ps_pool_list)) { - kib_pool_t *po = list_entry(ps->ps_pool_list.next, - kib_pool_t, po_list); + struct kib_pool *po = list_entry(ps->ps_pool_list.next, + struct kib_pool, po_list); + po->po_failed = 1; list_del(&po->po_list); if (po->po_allocated == 0) @@ -2133,7 +2104,7 @@ kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) } static void -kiblnd_fini_poolset(kib_poolset_t *ps) +kiblnd_fini_poolset(struct kib_poolset *ps) { if (ps->ps_net != NULL) { /* initialized? */ kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); @@ -2142,17 +2113,17 @@ kiblnd_fini_poolset(kib_poolset_t *ps) } static int -kiblnd_init_poolset(kib_poolset_t *ps, int cpt, - kib_net_t *net, char *name, int size, +kiblnd_init_poolset(struct kib_poolset *ps, int cpt, + struct kib_net *net, char *name, int size, kib_ps_pool_create_t po_create, kib_ps_pool_destroy_t po_destroy, kib_ps_node_init_t nd_init, kib_ps_node_fini_t nd_fini) { - kib_pool_t *pool; - int rc; + struct kib_pool *pool; + int rc; - memset(ps, 0, sizeof(kib_poolset_t)); + memset(ps, 0, sizeof(struct kib_poolset)); ps->ps_cpt = cpt; ps->ps_net = net; @@ -2178,22 +2149,22 @@ kiblnd_init_poolset(kib_poolset_t *ps, int cpt, } static int -kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now) +kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now) { if (pool->po_allocated != 0) /* still in use */ return 0; if (pool->po_failed) return 1; - return cfs_time_aftereq(now, pool->po_deadline); + return now >= pool->po_deadline; } void -kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) { struct list_head zombies = LIST_HEAD_INIT(zombies); - kib_poolset_t *ps = pool->po_owner; - kib_pool_t *tmp; - cfs_time_t now = cfs_time_current(); + struct kib_poolset *ps = pool->po_owner; + struct kib_pool *tmp; + time64_t now = ktime_get_seconds(); spin_lock(&ps->ps_lock); @@ -2219,14 +2190,14 @@ kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) } struct list_head * -kiblnd_pool_alloc_node(kib_poolset_t *ps) +kiblnd_pool_alloc_node(struct kib_poolset *ps) { struct list_head *node; - kib_pool_t *pool; + struct kib_pool *pool; int rc; unsigned int interval = 1; - cfs_time_t time_before; - unsigned int trips = 0; + ktime_t time_before; + unsigned int trips = 0; again: spin_lock(&ps->ps_lock); @@ -2235,7 +2206,8 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) continue; pool->po_allocated++; - pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_deadline = ktime_get_seconds() + + IBLND_POOL_DEADLINE; node = pool->po_free_list.next; list_del(node); @@ -2265,7 +2237,7 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) goto again; } - if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) { + if (ktime_get_seconds() < ps->ps_next_retry) { /* someone failed recently */ spin_unlock(&ps->ps_lock); return NULL; @@ -2275,17 +2247,17 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) spin_unlock(&ps->ps_lock); CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); - time_before = cfs_time_current(); + time_before = ktime_get(); rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); - CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete", - cfs_time_current() - time_before); + CDEBUG(D_NET, "ps_pool_create took %lld ms to 
complete", + ktime_ms_delta(ktime_get(), time_before)); spin_lock(&ps->ps_lock); ps->ps_increasing = 0; if (rc == 0) { list_add_tail(&pool->po_list, &ps->ps_pool_list); } else { - ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + ps->ps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; CERROR("Can't allocate new %s pool because out of memory\n", ps->ps_name); } @@ -2295,10 +2267,11 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) } static void -kiblnd_destroy_tx_pool(kib_pool_t *pool) +kiblnd_destroy_tx_pool(struct kib_pool *pool) { - kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); - int i; + struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, + tpo_pool); + int i; LASSERT (pool->po_allocated == 0); @@ -2311,7 +2284,7 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool) goto out; for (i = 0; i < pool->po_size; i++) { - kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; list_del(&tx->tx_list); @@ -2333,15 +2306,15 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool) sizeof(*tx->tx_sge)); if (tx->tx_rd != NULL) LIBCFS_FREE(tx->tx_rd, - offsetof(kib_rdma_desc_t, + offsetof(struct kib_rdma_desc, rd_frags[IBLND_MAX_RDMA_FRAGS])); } LIBCFS_FREE(tpo->tpo_tx_descs, - pool->po_size * sizeof(kib_tx_t)); + pool->po_size * sizeof(struct kib_tx)); out: kiblnd_fini_pool(pool); - LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); } static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) @@ -2356,12 +2329,12 @@ static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) } static int -kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) +kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) { int i; int npg; - kib_pool_t *pool; - kib_tx_pool_t *tpo; + struct kib_pool *pool; + struct kib_tx_pool *tpo; LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); if (tpo == NULL) { @@ -2377,22 +2350,22 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { CERROR("Can't allocate tx pages: %d\n", npg); - LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); return -ENOMEM; } LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, - size * sizeof(kib_tx_t)); + size * sizeof(struct kib_tx)); if (tpo->tpo_tx_descs == NULL) { CERROR("Can't allocate %d tx descriptors\n", size); ps->ps_pool_destroy(pool); return -ENOMEM; } - memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); for (i = 0; i < size; i++) { - kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; tx->tx_pool = tpo; @@ -2425,7 +2398,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) break; LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, - offsetof(kib_rdma_desc_t, + offsetof(struct kib_rdma_desc, rd_frags[IBLND_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) break; @@ -2442,23 +2415,24 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) } static void -kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) { - kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, - tps_poolset); - kib_tx_t *tx = list_entry(node, 
kib_tx_t, tx_list); + struct kib_tx_poolset *tps = container_of(pool->po_owner, + struct kib_tx_poolset, + tps_poolset); + struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); tx->tx_cookie = tps->tps_next_tx_cookie++; } static void -kiblnd_net_fini_pools(kib_net_t *net) +kiblnd_net_fini_pools(struct kib_net *net) { int i; cfs_cpt_for_each(i, lnet_cpt_table()) { - kib_tx_poolset_t *tps; - kib_fmr_poolset_t *fps; + struct kib_tx_poolset *tps; + struct kib_fmr_poolset *fps; if (net->ibn_tx_ps != NULL) { tps = net->ibn_tx_ps[i]; @@ -2483,7 +2457,7 @@ kiblnd_net_fini_pools(kib_net_t *net) } static int -kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, +kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, int ncpts) { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2498,7 +2472,12 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, #ifdef HAVE_IB_GET_DMA_MR read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - if (tunables->lnd_map_on_demand == 0) { + /* + * if lnd_map_on_demand is zero then we have effectively disabled + * FMR or FastReg and we're using global memory regions + * exclusively. + */ + if (!tunables->lnd_map_on_demand) { read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); goto create_tx_pool; @@ -2523,7 +2502,7 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, * FMR pool and map-on-demand if premapping failed */ net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(kib_fmr_poolset_t)); + sizeof(struct kib_fmr_poolset)); if (net->ibn_fmr_ps == NULL) { CERROR("Failed to allocate FMR pool array\n"); rc = -ENOMEM; @@ -2548,7 +2527,7 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, create_tx_pool: #endif net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(kib_tx_poolset_t)); + sizeof(struct kib_tx_poolset)); if (net->ibn_tx_ps == NULL) { CERROR("Failed to allocate tx pool array\n"); rc = -ENOMEM; @@ -2578,52 +2557,87 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, } static int -kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) +kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) { + struct ib_device_attr *dev_attr; + int rc = 0; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + #ifndef HAVE_IB_DEVICE_ATTRS - struct ib_device_attr *attr; - int rc; + LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr)); + if (dev_attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, dev_attr); + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + goto out_clean_attr; + } +#else + dev_attr = &hdev->ibh_ibdev->attrs; #endif - /* It's safe to assume a HCA can handle a page size - * matching that of the native system */ - hdev->ibh_page_shift = PAGE_SHIFT; - hdev->ibh_page_size = 1 << PAGE_SHIFT; - hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + hdev->ibh_mr_size = dev_attr->max_mr_size; + hdev->ibh_max_qp_wr = dev_attr->max_qp_wr; -#ifdef HAVE_IB_DEVICE_ATTRS - hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; + /* Setup device Memory Registration capabilities */ +#ifdef HAVE_FMR_POOL_API +#ifdef HAVE_IB_DEVICE_OPS + if (hdev->ibh_ibdev->ops.alloc_fmr && + hdev->ibh_ibdev->ops.dealloc_fmr && + hdev->ibh_ibdev->ops.map_phys_fmr && + hdev->ibh_ibdev->ops.unmap_fmr) { #else - 
LIBCFS_ALLOC(attr, sizeof(*attr)); - if (attr == NULL) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - rc = ib_query_device(hdev->ibh_ibdev, attr); - if (rc == 0) - hdev->ibh_mr_size = attr->max_mr_size; + if (hdev->ibh_ibdev->alloc_fmr && + hdev->ibh_ibdev->dealloc_fmr && + hdev->ibh_ibdev->map_phys_fmr && + hdev->ibh_ibdev->unmap_fmr) { +#endif + LCONSOLE_INFO("Using FMR for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED; + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED; +#ifndef HAVE_IB_ALLOC_FAST_REG_MR +#ifdef IB_DEVICE_SG_GAPS_REG + if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT; +#endif +#endif + } else { + rc = -ENOSYS; + } - LIBCFS_FREE(attr, sizeof(*attr)); + if (rc == 0 && hdev->ibh_mr_size == ~0ULL) + hdev->ibh_mr_shift = 64; + else if (rc != 0) + rc = -EINVAL; - if (rc != 0) { - CERROR("Failed to query IB device: %d\n", rc); - return rc; - } +#ifndef HAVE_IB_DEVICE_ATTRS +out_clean_attr: + LIBCFS_FREE(dev_attr, sizeof(*dev_attr)); #endif - if (hdev->ibh_mr_size == ~0ULL) { - hdev->ibh_mr_shift = 64; - return 0; - } - - CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); - return -EINVAL; + if (rc == -ENOSYS) + CERROR("IB device does not support FMRs nor FastRegs, can't " + "register memory: %d\n", rc); + else if (rc == -EINVAL) + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return rc; } #ifdef HAVE_IB_GET_DMA_MR static void -kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) +kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) { if (hdev->ibh_mrs == NULL) return; @@ -2635,7 +2649,7 @@ kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) #endif void -kiblnd_hdev_destroy(kib_hca_dev_t *hdev) +kiblnd_hdev_destroy(struct kib_hca_dev *hdev) { #ifdef HAVE_IB_GET_DMA_MR kiblnd_hdev_cleanup_mrs(hdev); @@ -2652,17 +2666,12 @@ kiblnd_hdev_destroy(kib_hca_dev_t *hdev) #ifdef HAVE_IB_GET_DMA_MR static int -kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) +kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) { struct ib_mr *mr; - int rc; int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; - rc = kiblnd_hdev_get_attr(hdev); - if (rc != 0) - return rc; - mr = ib_get_dma_mr(hdev->ibh_pd, acflags); if (IS_ERR(mr)) { CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); @@ -2683,7 +2692,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_dev_need_failover(kib_dev_t *dev, struct net *ns) +kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) { struct rdma_cm_id *cmid; struct sockaddr_in srcaddr; @@ -2735,16 +2744,16 @@ kiblnd_dev_need_failover(kib_dev_t *dev, struct net *ns) } int -kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) +kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) { struct list_head zombie_tpo = LIST_HEAD_INIT(zombie_tpo); struct list_head zombie_ppo = LIST_HEAD_INIT(zombie_ppo); struct list_head zombie_fpo = LIST_HEAD_INIT(zombie_fpo); struct rdma_cm_id *cmid = NULL; - kib_hca_dev_t *hdev = NULL; - kib_hca_dev_t *old; + struct kib_hca_dev *hdev = NULL; + struct kib_hca_dev *old; struct ib_pd *pd; - kib_net_t *net; + struct kib_net *net; struct sockaddr_in addr; unsigned long flags; int rc = 0; @@ -2776,7 +2785,7 @@ kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) } cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, 
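The rewritten kiblnd_hdev_get_attr() above centralises memory-registration detection: use the FMR pool API when the verbs provider implements it, otherwise require FastReg (IB_DEVICE_MEM_MGT_EXTENSIONS) and note SG-gaps support where available, and report -ENOSYS when neither is present. That decision tree can be sketched as below; the struct, flag values and helper name are illustrative stand-ins, not the kernel's types.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* illustrative capability bits, mirroring the enum kib_dev_caps idea */
#define CAP_FMR_ENABLED          (1 << 0)
#define CAP_FASTREG_ENABLED      (1 << 1)
#define CAP_FASTREG_GAPS_SUPPORT (1 << 2)

struct hca_model {                 /* stand-in for the IB device attributes */
	bool has_fmr_ops;          /* provider implements alloc/map/unmap FMR */
	bool mem_mgt_extensions;   /* FastReg supported */
	bool sg_gaps_reg;          /* FastReg can handle gapped SG lists */
};

static int detect_reg_caps(const struct hca_model *hca, unsigned int *caps)
{
	*caps = 0;

	if (hca->has_fmr_ops) {
		*caps |= CAP_FMR_ENABLED;          /* "Using FMR for registration" */
		return 0;
	}
	if (hca->mem_mgt_extensions) {
		*caps |= CAP_FASTREG_ENABLED;      /* "Using FastReg for registration" */
		if (hca->sg_gaps_reg)
			*caps |= CAP_FASTREG_GAPS_SUPPORT;
		return 0;
	}
	return -ENOSYS;  /* neither FMR nor FastReg: cannot register memory */
}

int main(void)
{
	struct hca_model hca = { .mem_mgt_extensions = true, .sg_gaps_reg = false };
	unsigned int caps;
	int rc = detect_reg_caps(&hca, &caps);

	printf("rc=%d caps=%#x\n", rc, caps);
	return 0;
}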
RDMA_PS_TCP, - IB_QPT_RC); + IB_QPT_RC); if (IS_ERR(cmid)) { rc = PTR_ERR(cmid); CERROR("Failed to create cmid for failover: %d\n", rc); @@ -2830,16 +2839,16 @@ kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) goto out; } -#ifdef HAVE_IB_GET_DMA_MR - rc = kiblnd_hdev_setup_mrs(hdev); + rc = kiblnd_hdev_get_attr(hdev); if (rc != 0) { - CERROR("Can't setup device: %d\n", rc); + CERROR("Can't get device attributes: %d\n", rc); goto out; } -#else - rc = kiblnd_hdev_get_attr(hdev); + +#ifdef HAVE_IB_GET_DMA_MR + rc = kiblnd_hdev_setup_mrs(hdev); if (rc != 0) { - CERROR("Can't get device attributes: %d\n", rc); + CERROR("Can't setup device: %d\n", rc); goto out; } #endif @@ -2881,9 +2890,9 @@ kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) } void -kiblnd_destroy_dev (kib_dev_t *dev) +kiblnd_destroy_dev(struct kib_dev *dev) { - LASSERT (dev->ibd_nnets == 0); + LASSERT(dev->ibd_nnets == 0); LASSERT(list_empty(&dev->ibd_nets)); list_del(&dev->ibd_fail_list); @@ -2969,7 +2978,7 @@ kiblnd_base_shutdown(void) static void kiblnd_shutdown(struct lnet_ni *ni) { - kib_net_t *net = ni->ni_data; + struct kib_net *net = ni->ni_data; rwlock_t *g_lock = &kiblnd_data.kib_global_lock; int i; unsigned long flags; @@ -3175,7 +3184,8 @@ kiblnd_start_schedulers(struct kib_sched_info *sched) return rc; } -static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) +static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, + int ncpts) { int cpt; int rc; @@ -3187,7 +3197,7 @@ static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) cpt = (cpts == NULL) ? i : cpts[i]; sched = kiblnd_data.kib_scheds[cpt]; - if (sched->ibs_nthreads > 0) + if (!newdev && sched->ibs_nthreads > 0) continue; rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); @@ -3200,38 +3210,80 @@ static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) return 0; } +static struct kib_dev * +kiblnd_dev_search(char *ifname) +{ + struct kib_dev *alias = NULL; + struct kib_dev *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + static int kiblnd_startup(struct lnet_ni *ni) { - char *ifname; + char *ifname = NULL; struct lnet_inetdev *ifaces = NULL; - kib_dev_t *ibdev = NULL; - kib_net_t *net; - unsigned long flags; - int rc; + struct kib_dev *ibdev = NULL; + struct kib_net *net = NULL; + unsigned long flags; + int rc; int i; + bool newdev; - LASSERT (ni->ni_net->net_lnd == &the_o2iblnd); + LASSERT(ni->ni_net->net_lnd == &the_o2iblnd); - if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { rc = kiblnd_base_startup(ni->ni_net_ns); - if (rc != 0) - return rc; - } + if (rc != 0) + return rc; + } - LIBCFS_ALLOC(net, sizeof(*net)); - ni->ni_data = net; - if (net == NULL) - goto failed; + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) { + rc = -ENOMEM; + goto failed; + } net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; kiblnd_tunables_setup(ni); + /* + * ni_interfaces is only to support legacy pre Multi-Rail + * tcp bonding for ksocklnd. 
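kiblnd_dev_search() above lets several NIs share one kib_dev by matching interface aliases: an exact name match wins, otherwise the names are compared with everything from the first ':' onward ignored, so "ib0:1" resolves to the existing "ib0" device. A self-contained sketch of that base-name comparison follows; it uses length-limited compares instead of the driver's temporary NUL-patching, and the function name is invented for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Return true if 'candidate' names the same base interface as 'wanted',
 * i.e. the parts before any ':' alias suffix match ("ib0" vs "ib0:1").
 */
static bool same_base_ifname(const char *wanted, const char *candidate)
{
	const char *colon;
	size_t wlen, clen;

	if (strcmp(wanted, candidate) == 0)
		return true;                    /* exact match */

	colon = strchr(wanted, ':');
	wlen = colon ? (size_t)(colon - wanted) : strlen(wanted);

	colon = strchr(candidate, ':');
	clen = colon ? (size_t)(colon - candidate) : strlen(candidate);

	return wlen == clen && strncmp(wanted, candidate, wlen) == 0;
}

int main(void)
{
	printf("%d\n", same_base_ifname("ib0:1", "ib0"));   /* 1: alias of ib0 */
	printf("%d\n", same_base_ifname("ib1", "ib0:2"));   /* 0: different base */
	return 0;
}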
Multi-Rail wants each secondary + * IP to be treated as an unique 'struct ni' interfaces instead. + */ if (ni->ni_interfaces[0] != NULL) { /* Use the IPoIB interface specified in 'networks=' */ if (ni->ni_interfaces[1] != NULL) { CERROR("ko2iblnd: Multiple interfaces not supported\n"); + rc = -EINVAL; goto failed; } @@ -3240,10 +3292,11 @@ kiblnd_startup(struct lnet_ni *ni) ifname = *kiblnd_tunables.kib_default_ipif; } - if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { - CERROR("IPoIB interface name too long: %s\n", ifname); - goto failed; - } + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + rc = -E2BIG; + goto failed; + } rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); if (rc < 0) @@ -3260,63 +3313,70 @@ kiblnd_startup(struct lnet_ni *ni) goto failed; } - LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); - if (!ibdev) { - rc = -ENOMEM; - goto failed; - } + ibdev = kiblnd_dev_search(ifname); + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) { + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } - ibdev->ibd_ifip = ifaces[i].li_ipaddr; - strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, - sizeof(ibdev->ibd_ifname)); - ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); - INIT_LIST_HEAD(&ibdev->ibd_nets); - INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ - INIT_LIST_HEAD(&ibdev->ibd_fail_list); + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); - /* initialize the device */ - rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); - if (rc) { - CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); - goto failed; - } + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); + goto failed; + } - list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + } net->ibn_dev = ibdev; ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); ni->ni_dev_cpt = ifaces[i].li_cpt; - rc = kiblnd_dev_start_threads(ibdev, ni->ni_cpts, ni->ni_ncpts); + rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts); if (rc != 0) goto failed; rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); - if (rc != 0) { - CERROR("Failed to initialize NI pools: %d\n", rc); - goto failed; - } + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); ibdev->ibd_nnets++; list_add_tail(&net->ibn_list, &ibdev->ibd_nets); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - net->ibn_init = IBLND_INIT_ALL; + net->ibn_init = IBLND_INIT_ALL; - return 0; + return 0; failed: if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) - kiblnd_destroy_dev(ibdev); + kiblnd_destroy_dev(ibdev); kfree(ifaces); - kiblnd_shutdown(ni); + kiblnd_shutdown(ni); - CDEBUG(D_NET, "kiblnd_startup failed\n"); - return -ENETDOWN; + CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n", + ifname ? 
ifname : "", rc); + + return -ENETDOWN; } static struct lnet_lnd the_o2iblnd = { @@ -3338,11 +3398,11 @@ static int __init ko2iblnd_init(void) { int rc; - CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); - CLASSERT(offsetof(kib_msg_t, + CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(struct kib_msg, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= IBLND_MSG_SIZE); - CLASSERT(offsetof(kib_msg_t, + CLASSERT(offsetof(struct kib_msg, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= IBLND_MSG_SIZE); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h index 7a9a1c3de16a4..3e24405c2c31e 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -78,7 +78,6 @@ #define DEBUG_SUBSYSTEM S_LND #include -#include #include #define IBLND_PEER_HASH_SIZE 101 /* # peer_ni lists */ @@ -88,13 +87,12 @@ #define IBLND_N_SCHED 2 #define IBLND_N_SCHED_HIGH 4 -typedef struct -{ +struct kib_tunables { int *kib_dev_failover; /* HCA failover */ unsigned int *kib_service; /* IB service number */ int *kib_min_reconnect_interval; /* first failed connection retry... */ int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ - int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_cksum; /* checksum struct kib_msg? */ int *kib_timeout; /* comms timeout (seconds) */ int *kib_keepalive; /* keepalive timeout (seconds) */ int *kib_ntx; /* # tx descs */ @@ -107,32 +105,32 @@ typedef struct /* # threads on each CPT */ int *kib_nscheds; int *kib_wrq_sge; /* # sg elements per wrq */ -} kib_tunables_t; + int *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */ +}; -extern kib_tunables_t kiblnd_tunables; +extern struct kib_tunables kiblnd_tunables; #define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ #define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ #define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ -#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ +#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ /* when eagerly to return credits */ -#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \ +#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn->ibc_version) == IBLND_MSG_VERSION_1 ? \ IBLND_CREDIT_HIGHWATER_V1 : \ - t->lnd_peercredits_hiw) + min(t->lnd_peercredits_hiw, (__u32)conn->ibc_queue_depth - 1)) #ifdef HAVE_RDMA_CREATE_ID_5ARG -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(ns, cb, \ - dev, ps, \ - qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id((ns) ? 
(ns) : &init_net, cb, dev, ps, qpt) #else # ifdef HAVE_RDMA_CREATE_ID_4ARG -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ - ps, qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps, qpt) # else -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ - ps) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps) # endif #endif @@ -162,7 +160,7 @@ extern kib_tunables_t kiblnd_tunables; #define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) /* 2 = LNet msg + Transfer chain */ -#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) struct kib_hca_dev; @@ -173,8 +171,15 @@ struct kib_hca_dev; #define KIB_IFNAME_SIZE 256 #endif -typedef struct -{ +enum kib_dev_caps { + IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0), + IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1), +#ifdef HAVE_FMR_POOL_API + IBLND_DEV_CAPS_FMR_ENABLED = BIT(2), +#endif +}; + +struct kib_dev { struct list_head ibd_list; /* chain on kib_devs */ struct list_head ibd_fail_list; /* chain on kib_failed_devs */ __u32 ibd_ifip; /* IPoIB interface IP */ @@ -182,7 +187,7 @@ typedef struct char ibd_ifname[KIB_IFNAME_SIZE]; int ibd_nnets; /* # nets extant */ - cfs_time_t ibd_next_failover; + time64_t ibd_next_failover; /* # failover failures */ int ibd_failed_failover; /* failover in progress */ @@ -191,10 +196,10 @@ typedef struct unsigned int ibd_can_failover; struct list_head ibd_nets; struct kib_hca_dev *ibd_hdev; -} kib_dev_t; + enum kib_dev_caps ibd_dev_caps; +}; -typedef struct kib_hca_dev -{ +struct kib_hca_dev { struct rdma_cm_id *ibh_cmid; /* listener cmid */ struct ib_device *ibh_ibdev; /* IB device */ int ibh_page_shift; /* page shift of current HCA */ @@ -202,24 +207,24 @@ typedef struct kib_hca_dev __u64 ibh_page_mask; /* page mask of current HCA */ int ibh_mr_shift; /* bits shift of max MR size */ __u64 ibh_mr_size; /* size of MR */ + int ibh_max_qp_wr; /* maximum work requests size */ #ifdef HAVE_IB_GET_DMA_MR struct ib_mr *ibh_mrs; /* global MR */ #endif struct ib_pd *ibh_pd; /* PD */ - kib_dev_t *ibh_dev; /* owner */ + struct kib_dev *ibh_dev; /* owner */ atomic_t ibh_ref; /* refcount */ -} kib_hca_dev_t; +}; /** # of seconds to keep pool alive */ #define IBLND_POOL_DEADLINE 300 /** # of seconds to retry if allocation failed */ #define IBLND_POOL_RETRY 1 -typedef struct -{ +struct kib_pages { int ibp_npages; /* # pages */ struct page *ibp_pages[0]; /* page array */ -} kib_pages_t; +}; struct kib_pool; struct kib_poolset; @@ -234,8 +239,7 @@ struct kib_net; #define IBLND_POOL_NAME_LEN 32 -typedef struct kib_poolset -{ +struct kib_poolset { /* serialize */ spinlock_t ps_lock; /* network it belongs to */ @@ -247,7 +251,7 @@ typedef struct kib_poolset /* failed pool list */ struct list_head ps_failed_pool_list; /* time stamp for retry if failed to allocate */ - cfs_time_t ps_next_retry; + time64_t ps_next_retry; /* is allocating new pool */ int ps_increasing; /* new pool size */ @@ -263,40 +267,38 @@ typedef struct kib_poolset kib_ps_node_init_t ps_node_init; /* finalize node */ kib_ps_node_fini_t ps_node_fini; -} kib_poolset_t; +}; -typedef struct kib_pool -{ +struct kib_pool { /* chain on pool list */ struct list_head po_list; /* pre-allocated node */ struct list_head po_free_list; /* pool_set of this pool */ - kib_poolset_t *po_owner; + struct kib_poolset *po_owner; /* deadline of this pool */ - cfs_time_t po_deadline; + time64_t 
po_deadline; /* # of elements in use */ int po_allocated; /* pool is created on failed HCA */ int po_failed; /* # of pre-allocated elements */ int po_size; -} kib_pool_t; +}; -typedef struct { - kib_poolset_t tps_poolset; /* pool-set */ +struct kib_tx_poolset { + struct kib_poolset tps_poolset; /* pool-set */ __u64 tps_next_tx_cookie; /* cookie of TX */ -} kib_tx_poolset_t; +}; -typedef struct { - kib_pool_t tpo_pool; /* pool */ +struct kib_tx_pool { + struct kib_pool tpo_pool; /* pool */ struct kib_hca_dev *tpo_hdev; /* device for this pool */ struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ - kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ -} kib_tx_pool_t; + struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ +}; -typedef struct -{ +struct kib_fmr_poolset { spinlock_t fps_lock; /* serialize */ struct kib_net *fps_net; /* IB network */ struct list_head fps_pool_list; /* FMR pool list */ @@ -309,8 +311,8 @@ typedef struct /* is allocating new pool */ int fps_increasing; /* time stamp for retry if failed to allocate */ - cfs_time_t fps_next_retry; -} kib_fmr_poolset_t; + time64_t fps_next_retry; +}; #ifndef HAVE_IB_RDMA_WR struct ib_rdma_wr { @@ -329,13 +331,13 @@ struct kib_fast_reg_descriptor { /* For fast registration */ #endif struct ib_mr *frd_mr; bool frd_valid; + bool frd_posted; }; -typedef struct -{ +struct kib_fmr_pool { struct list_head fpo_list; /* chain on pool list */ struct kib_hca_dev *fpo_hdev; /* device for this pool */ - kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ #ifdef HAVE_FMR_POOL_API union { struct { @@ -348,25 +350,24 @@ typedef struct } fast_reg; #ifdef HAVE_FMR_POOL_API }; - int fpo_is_fmr; + bool fpo_is_fmr; /* True if FMR pools allocated */ #endif - cfs_time_t fpo_deadline; /* deadline of this pool */ + time64_t fpo_deadline; /* deadline of this pool */ int fpo_failed; /* fmr pool is failed */ int fpo_map_count; /* # of mapped FMR */ -} kib_fmr_pool_t; +}; -typedef struct { - kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +struct kib_fmr { + struct kib_fmr_pool *fmr_pool; /* pool of FMR */ #ifdef HAVE_FMR_POOL_API struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ #endif /* HAVE_FMR_POOL_API */ struct kib_fast_reg_descriptor *fmr_frd; u32 fmr_key; -} kib_fmr_t; +}; -typedef struct kib_net -{ - /* chain on kib_dev_t::ibd_nets */ +struct kib_net { + /* chain on struct kib_dev::ibd_nets */ struct list_head ibn_list; __u64 ibn_incarnation;/* my epoch */ int ibn_init; /* initialisation state */ @@ -375,11 +376,11 @@ typedef struct kib_net atomic_t ibn_npeers; /* # peers extant */ atomic_t ibn_nconns; /* # connections extant */ - kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ - kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ + struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ - kib_dev_t *ibn_dev; /* underlying IB device */ -} kib_net_t; + struct kib_dev *ibn_dev; /* underlying IB device */ +}; #define KIB_THREAD_SHIFT 16 #define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) @@ -400,8 +401,7 @@ struct kib_sched_info { int ibs_cpt; /* CPT id */ }; -typedef struct -{ +struct kib_data { int kib_init; /* initialisation state */ int kib_shutdown; /* shut down? */ struct list_head kib_devs; /* IB devices extant */ @@ -430,14 +430,14 @@ typedef struct * The second that peers are pulled out from \a kib_reconn_wait * for reconnection. 
*/ - unsigned int kib_reconn_sec; + time64_t kib_reconn_sec; /* connection daemon sleeps here */ wait_queue_head_t kib_connd_waitq; spinlock_t kib_connd_lock; /* serialise */ struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ /* percpt data for schedulers */ struct kib_sched_info **kib_scheds; -} kib_data_t; +}; #define IBLND_INIT_NOTHING 0 #define IBLND_INIT_DATA 1 @@ -448,60 +448,51 @@ typedef struct * These are sent in sender's byte order (i.e. receiver flips). */ -typedef struct kib_connparams -{ +struct kib_connparams { __u16 ibcp_queue_depth; __u16 ibcp_max_frags; __u32 ibcp_max_msg_size; -} WIRE_ATTR kib_connparams_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_immediate_msg { struct lnet_hdr ibim_hdr; /* portals header */ char ibim_payload[0];/* piggy-backed payload */ -} WIRE_ATTR kib_immediate_msg_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_rdma_frag { __u32 rf_nob; /* # bytes this frag */ __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */ -} WIRE_ATTR kib_rdma_frag_t; +} WIRE_ATTR; -typedef struct -{ - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrags; /* # fragments */ - kib_rdma_frag_t rd_frags[0]; /* buffer frags */ -} WIRE_ATTR kib_rdma_desc_t; +struct kib_rdma_desc { + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + struct kib_rdma_frag rd_frags[0]; /* buffer frags */ +} WIRE_ATTR; -typedef struct -{ +struct kib_putreq_msg { struct lnet_hdr ibprm_hdr; /* portals header */ __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kib_putreq_msg_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_putack_msg { __u64 ibpam_src_cookie; /* reflected completion cookie */ __u64 ibpam_dst_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR kib_putack_msg_t; + struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR; -typedef struct -{ +struct kib_get_msg { struct lnet_hdr ibgm_hdr; /* portals header */ __u64 ibgm_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ -} WIRE_ATTR kib_get_msg_t; + struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR; -typedef struct -{ +struct kib_completion_msg { __u64 ibcm_cookie; /* opaque completion cookie */ __s32 ibcm_status; /* < 0 failure: >= 0 length */ -} WIRE_ATTR kib_completion_msg_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_msg { /* First 2 fields fixed FOR ALL TIME */ __u32 ibm_magic; /* I'm an ibnal message */ __u16 ibm_version; /* this is my version number */ @@ -516,14 +507,14 @@ typedef struct __u64 ibm_dststamp; /* destination's incarnation */ union { - kib_connparams_t connparams; - kib_immediate_msg_t immediate; - kib_putreq_msg_t putreq; - kib_putack_msg_t putack; - kib_get_msg_t get; - kib_completion_msg_t completion; + struct kib_connparams connparams; + struct kib_immediate_msg immediate; + struct kib_putreq_msg putreq; + struct kib_putack_msg putack; + struct kib_get_msg get; + struct kib_completion_msg completion; } WIRE_ATTR ibm_u; -} WIRE_ATTR kib_msg_t; +} WIRE_ATTR; #define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ @@ -542,14 +533,14 @@ typedef struct #define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ #define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ -typedef struct { +struct kib_rej { __u32 ibr_magic; /* sender's magic */ __u16 ibr_version; /* sender's version */ __u8 ibr_why; /* reject reason */ __u8 ibr_padding; /* padding */ __u64 ibr_incarnation; /* incarnation of peer_ni */ - kib_connparams_t ibr_cp; /* connection parameters 
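The wire structures above end in a zero-length rd_frags[] array, so a GET or PUT_ACK message is sized with offsetof() over the number of fragments it actually carries, and ko2iblnd_init() asserts that the worst case still fits the message buffer. A compilable sketch of that arithmetic is given below with simplified stand-in types; the fragment limit and buffer size are assumptions for illustration, not the driver's real values.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_RDMA_FRAGS 256          /* assumed fragment limit */
#define MSG_SIZE       (4 << 10)    /* assumed wire buffer size */

struct rdma_frag {                  /* models struct kib_rdma_frag */
	uint32_t rf_nob;
	uint64_t rf_addr;
} __attribute__((packed));

struct rdma_desc {                  /* models struct kib_rdma_desc */
	uint32_t rd_key;
	uint32_t rd_nfrags;
	struct rdma_frag rd_frags[0]; /* variable-length tail */
} __attribute__((packed));

struct get_msg {                    /* models struct kib_get_msg (header elided) */
	uint64_t ibgm_cookie;
	struct rdma_desc ibgm_rd;
} __attribute__((packed));

/*
 * Bytes carried for a GET with n fragments; the driver spells this as
 * offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]).
 */
static size_t get_msg_size(int n)
{
	return offsetof(struct get_msg, ibgm_rd.rd_frags) +
	       (size_t)n * sizeof(struct rdma_frag);
}

int main(void)
{
	/* runtime stand-in for the compile-time CLASSERTs in ko2iblnd_init() */
	assert(get_msg_size(MAX_RDMA_FRAGS) <= MSG_SIZE);

	printf("1 frag: %zu bytes, %d frags: %zu bytes\n",
	       get_msg_size(1), MAX_RDMA_FRAGS, get_msg_size(MAX_RDMA_FRAGS));
	return 0;
}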
*/ -} WIRE_ATTR kib_rej_t; + struct kib_connparams ibr_cp; /* connection parameters */ +} WIRE_ATTR; /* connection rejection reasons */ #define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ @@ -567,8 +558,7 @@ typedef struct { /***********************************************************************/ -typedef struct kib_rx /* receive message */ -{ +struct kib_rx { /* receive message */ /* queue for attention */ struct list_head rx_list; /* owning conn */ @@ -578,7 +568,7 @@ typedef struct kib_rx /* receive message */ /* completion status */ enum ib_wc_status rx_status; /* message buffer (host vaddr) */ - kib_msg_t *rx_msg; + struct kib_msg *rx_msg; /* message buffer (I/O addr) */ __u64 rx_msgaddr; /* for dma_unmap_single() */ @@ -587,19 +577,18 @@ typedef struct kib_rx /* receive message */ struct ib_recv_wr rx_wrq; /* ...and its memory */ struct ib_sge rx_sge; -} kib_rx_t; +}; #define IBLND_POSTRX_DONT_POST 0 /* don't post */ #define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ #define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ #define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ -typedef struct kib_tx /* transmit message */ -{ +struct kib_tx { /* transmit message */ /* queue on idle_txs ibc_tx_queue etc. */ struct list_head tx_list; /* pool I'm from */ - kib_tx_pool_t *tx_pool; + struct kib_tx_pool *tx_pool; /* owning conn */ struct kib_conn *tx_conn; /* # tx callbacks outstanding */ @@ -610,14 +599,16 @@ typedef struct kib_tx /* transmit message */ short tx_waiting; /* LNET completion status */ int tx_status; + /* health status of the transmit */ + enum lnet_msg_hstatus tx_hstatus; /* completion deadline */ - unsigned long tx_deadline; + ktime_t tx_deadline; /* completion cookie */ __u64 tx_cookie; /* lnet msgs to finalize on completion */ struct lnet_msg *tx_lntmsg[2]; /* message buffer (host vaddr) */ - kib_msg_t *tx_msg; + struct kib_msg *tx_msg; /* message buffer (I/O addr) */ __u64 tx_msgaddr; /* for dma_unmap_single() */ @@ -633,33 +624,33 @@ typedef struct kib_tx /* transmit message */ /* ...and their memory */ struct ib_sge *tx_sge; /* rdma descriptor */ - kib_rdma_desc_t *tx_rd; + struct kib_rdma_desc *tx_rd; /* # entries in... 
*/ int tx_nfrags; /* dma_map_sg descriptor */ struct scatterlist *tx_frags; /* rdma phys page addrs */ __u64 *tx_pages; + /* gaps in fragments */ + bool tx_gaps; /* FMR */ - kib_fmr_t fmr; + struct kib_fmr tx_fmr; /* dma direction */ int tx_dmadir; -} kib_tx_t; +}; -typedef struct kib_connvars -{ +struct kib_connvars { /* connection-in-progress variables */ - kib_msg_t cv_msg; -} kib_connvars_t; + struct kib_msg cv_msg; +}; -typedef struct kib_conn -{ +struct kib_conn { /* scheduler information */ struct kib_sched_info *ibc_sched; /* owning peer_ni */ - struct kib_peer *ibc_peer; + struct kib_peer_ni *ibc_peer; /* HCA bound on */ - kib_hca_dev_t *ibc_hdev; + struct kib_hca_dev *ibc_hdev; /* stash on peer_ni's conn list */ struct list_head ibc_list; /* schedule for attention */ @@ -697,7 +688,7 @@ typedef struct kib_conn /* CQ callback fired */ unsigned int ibc_ready:1; /* time of last send */ - unsigned long ibc_last_send; + ktime_t ibc_last_send; /** link chain for kiblnd_check_conns only */ struct list_head ibc_connd_list; /** rxs completed before ESTABLISHED */ @@ -712,12 +703,14 @@ typedef struct kib_conn struct list_head ibc_tx_queue_rsrvd; /* active tx awaiting completion */ struct list_head ibc_active_txs; + /* zombie tx awaiting done */ + struct list_head ibc_zombie_txs; /* serialise */ spinlock_t ibc_lock; /* the rx descs */ - kib_rx_t *ibc_rxs; + struct kib_rx *ibc_rxs; /* premapped rx msg pages */ - kib_pages_t *ibc_rx_pages; + struct kib_pages *ibc_rx_pages; /* CM id */ struct rdma_cm_id *ibc_cmid; @@ -725,8 +718,8 @@ typedef struct kib_conn struct ib_cq *ibc_cq; /* in-progress connection state */ - kib_connvars_t *ibc_connvars; -} kib_conn_t; + struct kib_connvars *ibc_connvars; +}; #define IBLND_CONN_INIT 0 /* being initialised */ #define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ @@ -735,8 +728,7 @@ typedef struct kib_conn #define IBLND_CONN_CLOSING 4 /* being closed */ #define IBLND_CONN_DISCONNECTED 5 /* disconnected */ -typedef struct kib_peer -{ +struct kib_peer_ni { /* stash on global peer_ni list */ struct list_head ibp_list; /* who's on the other end(s) */ @@ -751,8 +743,8 @@ typedef struct kib_peer struct list_head ibp_tx_queue; /* incarnation of peer_ni */ __u64 ibp_incarnation; - /* when (in jiffies) I was last alive */ - cfs_time_t ibp_last_alive; + /* when (in seconds) I was last alive */ + time64_t ibp_last_alive; /* # users */ atomic_t ibp_refcount; /* version of peer_ni */ @@ -767,13 +759,15 @@ typedef struct kib_peer unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; + /* number of total active retries */ + unsigned int ibp_retries; /* errno on closing this peer_ni */ int ibp_error; /* max map_on_demand */ __u16 ibp_max_frags; /* max_peer_credits */ __u16 ibp_queue_depth; -} kib_peer_ni_t; +}; #ifndef HAVE_IB_INC_RKEY /** @@ -788,32 +782,12 @@ static inline u32 ib_inc_rkey(u32 rkey) } #endif -extern kib_data_t kiblnd_data; +extern struct kib_data kiblnd_data; -extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); +extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); -/* max # of fragments configured by user */ -static inline int -kiblnd_cfg_rdma_frags(struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int mod; - - tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; - mod = tunables->lnd_map_on_demand; - return mod != 0 ? 
mod : IBLND_MAX_RDMA_FRAGS; -} - -static inline int -kiblnd_rdma_frags(int version, struct lnet_ni *ni) -{ - return version == IBLND_MSG_VERSION_1 ? - IBLND_MAX_RDMA_FRAGS : - kiblnd_cfg_rdma_frags(ni); -} - static inline int kiblnd_concurrent_sends(int version, struct lnet_ni *ni) { @@ -835,14 +809,14 @@ kiblnd_concurrent_sends(int version, struct lnet_ni *ni) } static inline void -kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) +kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) { LASSERT(atomic_read(&hdev->ibh_ref) > 0); atomic_inc(&hdev->ibh_ref); } static inline void -kiblnd_hdev_decref(kib_hca_dev_t *hdev) +kiblnd_hdev_decref(struct kib_hca_dev *hdev) { LASSERT(atomic_read(&hdev->ibh_ref) > 0); if (atomic_dec_and_test(&hdev->ibh_ref)) @@ -850,7 +824,7 @@ kiblnd_hdev_decref(kib_hca_dev_t *hdev) } static inline int -kiblnd_dev_can_failover(kib_dev_t *dev) +kiblnd_dev_can_failover(struct kib_dev *dev) { if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ return 0; @@ -906,7 +880,7 @@ do { \ } while (0) static inline bool -kiblnd_peer_connecting(kib_peer_ni_t *peer_ni) +kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) { return peer_ni->ibp_connecting != 0 || peer_ni->ibp_reconnecting != 0 || @@ -914,7 +888,7 @@ kiblnd_peer_connecting(kib_peer_ni_t *peer_ni) } static inline bool -kiblnd_peer_idle(kib_peer_ni_t *peer_ni) +kiblnd_peer_idle(struct kib_peer_ni *peer_ni) { return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); } @@ -929,14 +903,14 @@ kiblnd_nid2peerlist (lnet_nid_t nid) } static inline int -kiblnd_peer_active (kib_peer_ni_t *peer_ni) +kiblnd_peer_active(struct kib_peer_ni *peer_ni) { /* Am I in the peer_ni hash table? */ return !list_empty(&peer_ni->ibp_list); } static inline struct kib_conn * -kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) +kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) { struct list_head *next; @@ -954,16 +928,17 @@ kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) } static inline int -kiblnd_send_keepalive(kib_conn_t *conn) +kiblnd_send_keepalive(struct kib_conn *conn) { + s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC; + return (*kiblnd_tunables.kib_keepalive > 0) && - cfs_time_after(jiffies, conn->ibc_last_send + - msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * - MSEC_PER_SEC)); + ktime_after(ktime_get(), + ktime_add_ns(conn->ibc_last_send, keepalive_ns)); } static inline int -kiblnd_need_noop(kib_conn_t *conn) +kiblnd_need_noop(struct kib_conn *conn) { struct lnet_ni *ni = conn->ibc_peer->ibp_ni; struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -972,7 +947,7 @@ kiblnd_need_noop(kib_conn_t *conn) tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; if (conn->ibc_outstanding_credits < - IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && + IBLND_CREDITS_HIGHWATER(tunables, conn) && !kiblnd_send_keepalive(conn)) return 0; /* No need to send NOOP */ @@ -999,14 +974,14 @@ kiblnd_need_noop(kib_conn_t *conn) } static inline void -kiblnd_abort_receives(kib_conn_t *conn) +kiblnd_abort_receives(struct kib_conn *conn) { ib_modify_qp(conn->ibc_cmid->qp, &kiblnd_data.kib_error_qpa, IB_QP_STATE); } static inline const char * -kiblnd_queue2str(kib_conn_t *conn, struct list_head *q) +kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) { if (q == &conn->ibc_tx_queue) return "tx_queue"; @@ -1057,21 +1032,21 @@ kiblnd_wreqid2type (__u64 wreqid) } static inline void -kiblnd_set_conn_state (kib_conn_t *conn, int state) +kiblnd_set_conn_state(struct kib_conn *conn, int state) { conn->ibc_state = 
state; smp_mb(); } static inline void -kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) { msg->ibm_type = type; - msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; + msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; } static inline int -kiblnd_rd_size (kib_rdma_desc_t *rd) +kiblnd_rd_size(struct kib_rdma_desc *rd) { int i; int size; @@ -1083,25 +1058,25 @@ kiblnd_rd_size (kib_rdma_desc_t *rd) } static inline __u64 -kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) +kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) { return rd->rd_frags[index].rf_addr; } static inline __u32 -kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) +kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) { return rd->rd_frags[index].rf_nob; } static inline __u32 -kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) +kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) { return rd->rd_key; } static inline int -kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) +kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) { if (nob < rd->rd_frags[index].rf_nob) { rd->rd_frags[index].rf_addr += nob; @@ -1114,14 +1089,14 @@ kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) } static inline int -kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) +kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) { LASSERT (msgtype == IBLND_MSG_GET_REQ || msgtype == IBLND_MSG_PUT_ACK); return msgtype == IBLND_MSG_GET_REQ ? - offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : - offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : + offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); } static inline __u64 @@ -1179,6 +1154,10 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, return ib_sg_dma_len(dev, sg); } +#ifndef HAVE_RDMA_CONNECT_LOCKED +#define rdma_connect_locked(cmid, cpp) rdma_connect(cmid, cpp) +#endif + /* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly * right because OFED1.2 defines it as const, to use it we have to add * (void *) cast to overcome "const" */ @@ -1186,19 +1165,16 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, #define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) #define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) -#ifdef HAVE_IB_GET_DMA_MR -struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, - int negotiated_nfrags); -#endif -void kiblnd_map_rx_descs(kib_conn_t *conn); -void kiblnd_unmap_rx_descs(kib_conn_t *conn); -void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); -struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); +void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs); +void kiblnd_map_rx_descs(struct kib_conn *conn); +void kiblnd_unmap_rx_descs(struct kib_conn *conn); +void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); -int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, - kib_rdma_desc_t *rd, __u32 nob, __u64 iov, - kib_fmr_t *fmr, bool *is_fastreg); -void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr); +void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); int kiblnd_tunables_setup(struct lnet_ni 
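kiblnd_send_keepalive() above keeps ibc_last_send as a ktime_t and compares in nanoseconds rather than converting the keepalive tunable into jiffies. A userspace model of the same check, using CLOCK_MONOTONIC in place of ktime_get() and invented names, is shown below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

/* Monotonic "now" in nanoseconds, standing in for ktime_get(). */
static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/*
 * True when a keepalive is due: the interval is configured (> 0 seconds)
 * and more than that interval has elapsed since the last send.
 */
static bool keepalive_due(int keepalive_sec, int64_t last_send_ns)
{
	int64_t keepalive_ns = (int64_t)keepalive_sec * NSEC_PER_SEC;

	return keepalive_sec > 0 && now_ns() > last_send_ns + keepalive_ns;
}

int main(void)
{
	int64_t last_send = now_ns() - 5 * NSEC_PER_SEC;  /* sent 5s ago */

	printf("keepalive due (interval 2s)?  %d\n", keepalive_due(2, last_send));
	printf("keepalive due (interval 60s)? %d\n", keepalive_due(60, last_send));
	printf("keepalive disabled (0s)?      %d\n", keepalive_due(0, last_send));
	return 0;
}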
*ni); int kiblnd_tunables_init(void); @@ -1208,43 +1184,45 @@ int kiblnd_scheduler(void *arg); int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); int kiblnd_failover_thread (void *arg); -int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); +int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event); int kiblnd_translate_mtu(int value); -int kiblnd_dev_failover(kib_dev_t *dev, struct net *ns); -int kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, +int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns); +int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, lnet_nid_t nid); -void kiblnd_destroy_peer (kib_peer_ni_t *peer); -bool kiblnd_reconnect_peer(kib_peer_ni_t *peer); -void kiblnd_destroy_dev (kib_dev_t *dev); -void kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni); -kib_peer_ni_t *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); -int kiblnd_close_stale_conns_locked (kib_peer_ni_t *peer_ni, - int version, __u64 incarnation); -int kiblnd_close_peer_conns_locked (kib_peer_ni_t *peer_ni, int why); - -kib_conn_t *kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, - int state, int version); -void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn); -void kiblnd_close_conn (kib_conn_t *conn, int error); -void kiblnd_close_conn_locked (kib_conn_t *conn, int error); - -void kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid); -void kiblnd_txlist_done(struct list_head *txlist, int status); +void kiblnd_destroy_peer(struct kib_peer_ni *peer); +bool kiblnd_reconnect_peer(struct kib_peer_ni *peer); +void kiblnd_destroy_dev(struct kib_dev *dev); +void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni); +struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, u64 incarnation); +int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why); + +struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni, + struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(struct kib_conn *conn); +void kiblnd_close_conn(struct kib_conn *conn, int error); +void kiblnd_close_conn_locked(struct kib_conn *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus); void kiblnd_qp_event(struct ib_event *event, void *arg); void kiblnd_cq_event(struct ib_event *event, void *arg); void kiblnd_cq_completion(struct ib_cq *cq, void *arg); -void kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, int credits, lnet_nid_t dstnid, __u64 dststamp); -int kiblnd_unpack_msg(kib_msg_t *msg, int nob); -int kiblnd_post_rx (kib_rx_t *rx, int credit); +int kiblnd_unpack_msg(struct kib_msg *msg, int nob); +int kiblnd_post_rx(struct kib_rx *rx, int credit); -int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c 
b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index 4b896a52d3bb4..e2eb6c272114f 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,20 +38,21 @@ #define MAX_CONN_RACES_BEFORE_ABORT 20 -static void kiblnd_peer_alive(kib_peer_ni_t *peer_ni); -static void kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error); -static void kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, +static void kiblnd_peer_alive(struct kib_peer_ni *peer_ni); +static void kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error); +static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, int body_nob); -static int kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, - int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie); -static void kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn); -static void kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn); +static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie); +static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); +static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_unmap_tx(kib_tx_t *tx); -static void kiblnd_check_sends_locked(kib_conn_t *conn); +static void kiblnd_unmap_tx(struct kib_tx *tx); +static void kiblnd_check_sends_locked(struct kib_conn *conn); void -kiblnd_tx_done(kib_tx_t *tx) +kiblnd_tx_done(struct kib_tx *tx) { struct lnet_msg *lntmsg[2]; int rc; @@ -85,39 +86,46 @@ kiblnd_tx_done(kib_tx_t *tx) if (lntmsg[i] == NULL) continue; + /* propagate health status to LNet for requests */ + if (i == 0 && lntmsg[i]) + lntmsg[i]->msg_health_status = tx->tx_hstatus; + lnet_finalize(lntmsg[i], rc); } } void -kiblnd_txlist_done(struct list_head *txlist, int status) +kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus) { - kib_tx_t *tx; + struct kib_tx *tx; while (!list_empty(txlist)) { - tx = list_entry(txlist->next, kib_tx_t, tx_list); + tx = list_entry(txlist->next, struct kib_tx, tx_list); list_del(&tx->tx_list); /* complete now */ tx->tx_waiting = 0; tx->tx_status = status; + if (hstatus != LNET_MSG_STATUS_OK) + tx->tx_hstatus = hstatus; kiblnd_tx_done(tx); } } -static kib_tx_t * +static struct kib_tx * kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) { - kib_net_t *net = (kib_net_t *)ni->ni_data; - struct list_head *node; - kib_tx_t *tx; - kib_tx_poolset_t *tps; + struct kib_net *net = ni->ni_data; + struct list_head *node; + struct kib_tx *tx; + struct kib_tx_poolset *tps; tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; node = kiblnd_pool_alloc_node(&tps->tps_poolset); if (node == NULL) return NULL; - tx = container_of(node, kib_tx_t, tx_list); + tx = container_of(node, struct kib_tx, tx_list); LASSERT (tx->tx_nwrq == 0); LASSERT (!tx->tx_queued); @@ -129,15 +137,18 @@ kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) LASSERT (tx->tx_lntmsg[1] == NULL); LASSERT (tx->tx_nfrags == 0); + tx->tx_gaps = false; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + return tx; } static void -kiblnd_drop_rx(kib_rx_t *rx) 
+kiblnd_drop_rx(struct kib_rx *rx) { - kib_conn_t *conn = rx->rx_conn; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; + struct kib_conn *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; spin_lock_irqsave(&sched->ibs_lock, flags); LASSERT(conn->ibc_nrx > 0); @@ -148,15 +159,15 @@ kiblnd_drop_rx(kib_rx_t *rx) } int -kiblnd_post_rx (kib_rx_t *rx, int credit) +kiblnd_post_rx(struct kib_rx *rx, int credit) { - kib_conn_t *conn = rx->rx_conn; - kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; - struct ib_recv_wr *bad_wrq = NULL; + struct kib_conn *conn = rx->rx_conn; + struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; #ifdef HAVE_IB_GET_DMA_MR - struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; #endif - int rc; + int rc; LASSERT (net != NULL); LASSERT (!in_interrupt()); @@ -229,13 +240,13 @@ kiblnd_post_rx (kib_rx_t *rx, int credit) return rc; } -static kib_tx_t * -kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +static struct kib_tx * +kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) { struct list_head *tmp; list_for_each(tmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list); LASSERT(!tx->tx_queued); LASSERT(tx->tx_sending != 0 || tx->tx_waiting); @@ -255,11 +266,11 @@ kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) } static void -kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cookie) { - kib_tx_t *tx; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int idle; + struct kib_tx *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; spin_lock(&conn->ibc_lock); @@ -268,23 +279,24 @@ kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) spin_unlock(&conn->ibc_lock); CWARN("Unmatched completion type %x cookie %#llx from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_close_conn(conn, -EPROTO); - return; - } + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } - if (tx->tx_status == 0) { /* success so far */ - if (status < 0) { /* failed? */ - tx->tx_status = status; - } else if (txtype == IBLND_MSG_GET_REQ) { - lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); - } - } + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + tx->tx_status = status; + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } - tx->tx_waiting = 0; + tx->tx_waiting = 0; - idle = !tx->tx_queued && (tx->tx_sending == 0); - if (idle) + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) list_del(&tx->tx_list); spin_unlock(&conn->ibc_lock); @@ -294,10 +306,10 @@ kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) } static void -kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) +kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) { - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); if (tx == NULL) { CERROR("Can't get tx for completion %x for %s\n", @@ -307,19 +319,19 @@ kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) tx->tx_msg->ibm_u.completion.ibcm_status = status; tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); kiblnd_queue_tx(tx, conn); } static void -kiblnd_handle_rx (kib_rx_t *rx) +kiblnd_handle_rx(struct kib_rx *rx) { - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; int credits = msg->ibm_credits; - kib_tx_t *tx; + struct kib_tx *tx; int rc = 0; int rc2; int post_credit; @@ -474,14 +486,14 @@ kiblnd_handle_rx (kib_rx_t *rx) } static void -kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) +kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) { - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - kib_net_t *net = ni->ni_data; - int rc; - int err = -EIO; + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_net *net = ni->ni_data; + int rc; + int err = -EIO; LASSERT (net != NULL); LASSERT (rx->rx_nob < 0); /* was posted */ @@ -545,47 +557,112 @@ kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) } static int -kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob) +kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob) { - kib_hca_dev_t *hdev; - kib_fmr_poolset_t *fps; + struct kib_hca_dev *hdev; + struct kib_dev *dev; + struct kib_fmr_poolset *fps; int cpt; int rc; - bool is_fastreg = 0; + int i; LASSERT(tx->tx_pool != NULL); LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + dev = net->ibn_dev; hdev = tx->tx_pool->tpo_hdev; cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + /* + * If we're dealing with FastReg, but the device doesn't + * support GAPS and the tx has GAPS, then there is no real point + * in trying to map the memory, because it'll just fail. So + * preemptively fail with an appropriate message + */ + if ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) && + !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) && + tx->tx_gaps) { + CERROR("Using FastReg with no GAPS support, but tx has gaps. 
" + "Try setting use_fastreg_gaps to 1\n"); + return -EPROTONOSUPPORT; + } + +#ifdef HAVE_FMR_POOL_API + /* + * FMR does not support gaps but the tx has gaps then + * we should make sure that the number of fragments we'll be sending + * over fits within the number of fragments negotiated on the + * connection, otherwise, we won't be able to RDMA the data. + * We need to maintain the number of fragments negotiation on the + * connection for backwards compatibility. + */ + if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) { + if (tx->tx_conn && + tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) { + CERROR("TX number of frags (%d) is <= than connection" + " number of frags (%d). Consider setting peer's" + " map_on_demand to 256\n", tx->tx_nfrags, + tx->tx_conn->ibc_max_frags); + return -EFBIG; + } + } +#endif + fps = net->ibn_fmr_ps[cpt]; - rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr, &is_fastreg); + rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr); if (rc != 0) { - CERROR("Can't map %u pages: %d\n", nob, rc); + CERROR("Can't map %u bytes (%u/%u)s: %d\n", nob, + tx->tx_nfrags, rd->rd_nfrags, rc); return rc; } - /* If rd is not tx_rd, it's going to get sent to a peer_ni, who will need - * the rkey */ - rd->rd_key = tx->fmr.fmr_key; - if (!is_fastreg) - rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; - rd->rd_frags[0].rf_nob = nob; - rd->rd_nfrags = 1; + /* + * If rd is not tx_rd, it's going to get sent to a peer_ni, who will + * need the rkey + */ + rd->rd_key = tx->tx_fmr.fmr_key; + /* + * for FastReg or FMR with no gaps we can accumulate all + * the fragments in one FastReg or FMR fragment. + */ + if ( +#ifdef HAVE_FMR_POOL_API + ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + && !tx->tx_gaps) || +#endif + (dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)) { + /* FMR requires zero based address */ +#ifdef HAVE_FMR_POOL_API + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; +#endif + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + } else { + /* + * We're transmitting with gaps using FMR. + * We'll need to use multiple fragments and identify the + * zero based address of each fragment. + */ + for (i = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift; + } + } return 0; } static void -kiblnd_unmap_tx(kib_tx_t *tx) +kiblnd_unmap_tx(struct kib_tx *tx) { if ( #ifdef HAVE_FMR_POOL_API - tx->fmr.fmr_pfmr || + tx->tx_fmr.fmr_pfmr || #endif - tx->fmr.fmr_frd) - kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); + tx->tx_fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); if (tx->tx_nfrags != 0) { kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, @@ -594,13 +671,46 @@ kiblnd_unmap_tx(kib_tx_t *tx) } } -static int -kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) +#ifdef HAVE_IB_GET_DMA_MR +static struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) { - kib_net_t *net = ni->ni_data; - kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* + * if map-on-demand is turned on and the device supports + * either FMR or FastReg then use that. Otherwise use global + * memory regions. 
If that's not available either, then you're + * dead in the water and fail the operation. + */ + if (tunables->lnd_map_on_demand && + (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED +#ifdef HAVE_FMR_POOL_API + || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED +#endif + )) + return NULL; + + /* + * hdev->ibh_mrs can be NULL. This case is dealt with gracefully + * in the call chain. The mapping will fail with appropriate error + * message. + */ + return hdev->ibh_mrs; +} +#endif + +static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nfrags) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; #ifdef HAVE_IB_GET_DMA_MR - struct ib_mr *mr = NULL; + struct ib_mr *mr = NULL; #endif __u32 nob; int i; @@ -622,9 +732,7 @@ kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) } #ifdef HAVE_IB_GET_DMA_MR - mr = kiblnd_find_rd_dma_mr(ni, rd, - (tx->tx_conn != NULL) ? - tx->tx_conn->ibc_max_frags : -1); + mr = kiblnd_find_rd_dma_mr(ni, rd); if (mr != NULL) { /* found pre-mapping MR */ rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; @@ -638,17 +746,17 @@ kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) return -EINVAL; } - -static int -kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, - unsigned int niov, struct kvec *iov, int offset, int nob) +static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, unsigned int niov, + struct kvec *iov, int offset, int nob) { - kib_net_t *net = ni->ni_data; - struct page *page; + struct kib_net *net = ni->ni_data; + struct page *page; struct scatterlist *sg; unsigned long vaddr; int fragnob; int page_offset; + unsigned int max_niov; LASSERT (nob > 0); LASSERT (niov > 0); @@ -661,6 +769,8 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, LASSERT (niov > 0); } + max_niov = niov; + sg = tx->tx_frags; do { LASSERT(niov > 0); @@ -676,6 +786,20 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, fragnob = min((int)(iov->iov_len - offset), nob); fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. 
+ */ + if ((fragnob < (int)PAGE_SIZE - page_offset) && + (niov < max_niov) && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d iovs with %d nob left\n", + fragnob, (int)PAGE_SIZE - page_offset, niov, + nob); + tx->tx_gaps = true; + } + sg_set_page(sg, page, fragnob, page_offset); sg = sg_next(sg); if (!sg) { @@ -696,32 +820,49 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); } -static int -kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) +static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nkiov, + lnet_kiov_t *kiov, int offset, int nob) { - kib_net_t *net = ni->ni_data; - struct scatterlist *sg; - int fragnob; + struct kib_net *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + int max_nkiov; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + max_nkiov = nkiov; - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (net != NULL); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); - sg = tx->tx_frags; - do { - LASSERT (nkiov > 0); + fragnob = min((int)(kiov->kiov_len - offset), nob); - fragnob = min((int)(kiov->kiov_len - offset), nob); + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. + */ + if ((fragnob < (int)(kiov->kiov_len - offset)) && + nkiov < max_nkiov && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d kiovs with %d nob left\n", + fragnob, (int)(kiov->kiov_len - offset), + nkiov, nob); + tx->tx_gaps = true; + } sg_set_page(sg, kiov->kiov_page, fragnob, kiov->kiov_offset + offset); @@ -731,22 +872,23 @@ kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, return -EFAULT; } - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); } static int -kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit) +kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) __must_hold(&conn->ibc_lock) { - kib_msg_t *msg = tx->tx_msg; - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_msg *msg = tx->tx_msg; + struct kib_peer_ni *peer_ni = conn->ibc_peer; struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; int ver = conn->ibc_version; int rc; int done; @@ -764,11 +906,11 @@ __must_hold(&conn->ibc_lock) if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) { - /* tx completions outstanding... */ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(peer_ni->ibp_nid)); - return -EAGAIN; - } + /* tx completions outstanding... 
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ CDEBUG(D_NET, "%s: no credits\n", @@ -796,6 +938,7 @@ __must_hold(&conn->ibc_lock) * kiblnd_check_sends_locked will queue NOOP again when * posted NOOPs complete */ spin_unlock(&conn->ibc_lock); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); spin_lock(&conn->ibc_lock); CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", @@ -830,11 +973,10 @@ __must_hold(&conn->ibc_lock) /* close_conn will launch failover */ rc = -ENETDOWN; } else { - struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd; struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; struct ib_send_wr *wr = &tx->tx_wrq[0].wr; - if (frd != NULL) { + if (frd != NULL && !frd->frd_posted) { if (!frd->frd_valid) { wr = &frd->frd_inv_wr.wr; wr->next = &frd->frd_fastreg_wr.wr; @@ -850,18 +992,24 @@ __must_hold(&conn->ibc_lock) libcfs_nid2str(conn->ibc_peer->ibp_nid)); bad = NULL; + if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) + rc = -EINVAL; + else #ifdef HAVE_IB_POST_SEND_RECV_CONST - rc = ib_post_send(conn->ibc_cmid->qp, wr, - (const struct ib_send_wr **)&bad); + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); #else - rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); #endif } - conn->ibc_last_send = jiffies; + conn->ibc_last_send = ktime_get(); - if (rc == 0) - return 0; + if (rc == 0) { + if (frd != NULL) + frd->frd_posted = true; + return 0; + } /* NB credits are transferred in the actual * message, which can only be the last work item */ @@ -899,11 +1047,11 @@ __must_hold(&conn->ibc_lock) } static void -kiblnd_check_sends_locked(kib_conn_t *conn) +kiblnd_check_sends_locked(struct kib_conn *conn) { - int ver = conn->ibc_version; + int ver = conn->ibc_version; struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - kib_tx_t *tx; + struct kib_tx *tx; /* Don't send anything until after the connection is established */ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { @@ -921,7 +1069,7 @@ kiblnd_check_sends_locked(kib_conn_t *conn) while (conn->ibc_reserved_credits > 0 && !list_empty(&conn->ibc_tx_queue_rsrvd)) { tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); conn->ibc_reserved_credits--; @@ -945,16 +1093,16 @@ kiblnd_check_sends_locked(kib_conn_t *conn) if (!list_empty(&conn->ibc_tx_queue_nocred)) { credit = 0; tx = list_entry(conn->ibc_tx_queue_nocred.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); } else if (!list_empty(&conn->ibc_tx_noops)) { LASSERT (!IBLND_OOB_CAPABLE(ver)); credit = 1; tx = list_entry(conn->ibc_tx_noops.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); } else if (!list_empty(&conn->ibc_tx_queue)) { credit = 1; tx = list_entry(conn->ibc_tx_queue.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); } else break; @@ -964,26 +1112,30 @@ kiblnd_check_sends_locked(kib_conn_t *conn) } static void -kiblnd_tx_complete (kib_tx_t *tx, int status) +kiblnd_tx_complete(struct kib_tx *tx, int status) { - int failed = (status != IB_WC_SUCCESS); - kib_conn_t *conn = tx->tx_conn; - int idle; + int failed = (status != IB_WC_SUCCESS); + struct kib_conn *conn = tx->tx_conn; + int idle; - LASSERT (tx->tx_sending > 0); + if (tx->tx_sending <= 0) { + CERROR("Received an event on a freed tx: %p status %d\n", + tx, tx->tx_status); + 
return; + } - if (failed) { - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) CNETERR("Tx -> %s cookie %#llx" - " sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, - status); + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); - kiblnd_close_conn(conn, -EIO); - } else { - kiblnd_peer_alive(conn->ibc_peer); - } + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } spin_lock(&conn->ibc_lock); @@ -996,6 +1148,7 @@ kiblnd_tx_complete (kib_tx_t *tx, int status) conn->ibc_noops_posted--; if (failed) { + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; tx->tx_waiting = 0; /* don't wait for peer_ni */ tx->tx_status = -EIO; } @@ -1014,12 +1167,13 @@ kiblnd_tx_complete (kib_tx_t *tx, int status) } static void -kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob) +kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, + int body_nob) { - kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; struct ib_sge *sge = &tx->tx_msgsge; struct ib_rdma_wr *wrq; - int nob = offsetof(kib_msg_t, ibm_u) + body_nob; + int nob = offsetof(struct kib_msg, ibm_u) + body_nob; #ifdef HAVE_IB_GET_DMA_MR struct ib_mr *mr = hdev->ibh_mrs; #endif @@ -1055,11 +1209,11 @@ kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob) } static int -kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, - int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) +kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie) { - kib_msg_t *ibmsg = tx->tx_msg; - kib_rdma_desc_t *srcrd = tx->tx_rd; + struct kib_msg *ibmsg = tx->tx_msg; + struct kib_rdma_desc *srcrd = tx->tx_rd; struct ib_rdma_wr *wrq = NULL; struct ib_sge *sge; int rc = resid; @@ -1147,24 +1301,39 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, ibmsg->ibm_u.completion.ibcm_status = rc; ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, - type, sizeof (kib_completion_msg_t)); + type, sizeof(struct kib_completion_msg)); return rc; } static void -kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) +kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) { struct list_head *q; + s64 timeout_ns; LASSERT(tx->tx_nwrq > 0); /* work items set up */ LASSERT(!tx->tx_queued); /* not queued for sending already */ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) { + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + if (tx->tx_conn != NULL) { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + tx->tx_conn = NULL; + kiblnd_conn_decref(conn); + } + list_add(&tx->tx_list, &conn->ibc_zombie_txs); + + return; + } + + timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC; tx->tx_queued = 1; - tx->tx_deadline = jiffies + - msecs_to_jiffies(*kiblnd_tunables.kib_timeout * - MSEC_PER_SEC); + tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); if (tx->tx_conn == NULL) { kiblnd_conn_addref(conn); @@ -1208,7 +1377,7 @@ kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) } static void -kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +kiblnd_queue_tx(struct 
kib_tx *tx, struct kib_conn *conn) { spin_lock(&conn->ibc_lock); kiblnd_queue_tx_locked(tx, conn); @@ -1254,14 +1423,14 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, } static void -kiblnd_connect_peer (kib_peer_ni_t *peer_ni) +kiblnd_connect_peer(struct kib_peer_ni *peer_ni) { struct rdma_cm_id *cmid; - kib_dev_t *dev; - kib_net_t *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; struct sockaddr_in srcaddr; struct sockaddr_in dstaddr; - int rc; + int rc; LASSERT (net != NULL); LASSERT (peer_ni->ibp_connecting > 0); @@ -1289,21 +1458,21 @@ kiblnd_connect_peer (kib_peer_ni_t *peer_ni) kiblnd_peer_addref(peer_ni); /* cmid's ref */ - if (*kiblnd_tunables.kib_use_priv_port) { - rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } else { - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } - if (rc != 0) { - /* Can't initiate address resolution: */ - CERROR("Can't resolve addr for %s: %d\n", - libcfs_nid2str(peer_ni->ibp_nid), rc); - goto failed2; - } + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + lnet_get_lnd_timeout() * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + lnet_get_lnd_timeout() * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } return; @@ -1317,7 +1486,7 @@ kiblnd_connect_peer (kib_peer_ni_t *peer_ni) } bool -kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) +kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) { rwlock_t *glock = &kiblnd_data.kib_global_lock; char *reason = NULL; @@ -1363,17 +1532,18 @@ kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) CWARN("Abort reconnection of %s: %s\n", libcfs_nid2str(peer_ni->ibp_nid), reason); - kiblnd_txlist_done(&txs, -ECONNABORTED); + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ABORTED); return false; } void -kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) +kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) { - kib_peer_ni_t *peer_ni; - kib_peer_ni_t *peer2; - kib_conn_t *conn; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; unsigned long flags; int rc; int i; @@ -1438,6 +1608,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) if (tx != NULL) { tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); } return; @@ -1475,7 +1646,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0); if (tx != NULL) list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); @@ -1503,9 +1674,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; - kib_msg_t *ibmsg; - kib_rdma_desc_t *rd; - kib_tx_t *tx; + struct kib_msg *ibmsg; + struct 
kib_rdma_desc *rd; + struct kib_tx *tx; int nob; int rc; @@ -1536,7 +1707,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) break; /* send IMMEDIATE */ /* is the REPLY message too small for RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ @@ -1562,11 +1733,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (rc != 0) { CERROR("Can't setup GET sink for %s: %d\n", libcfs_nid2str(target.nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); return -EIO; } - nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[rd->rd_nfrags]); + nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; ibmsg->ibm_u.get.ibgm_hdr = *hdr; @@ -1588,7 +1760,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) case LNET_MSG_REPLY: case LNET_MSG_PUT: /* Is the payload small enough not to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ @@ -1618,7 +1790,8 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) ibmsg = tx->tx_msg; ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, + sizeof(struct kib_putreq_msg)); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ @@ -1626,10 +1799,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) return 0; } - /* send IMMEDIATE */ - - LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBLND_MSG_SIZE); + /* send IMMEDIATE */ + LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); tx = kiblnd_get_idle_tx(ni, target.nid); if (tx == NULL) { @@ -1643,16 +1815,16 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (payload_kiov != NULL) lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), payload_niov, payload_kiov, payload_offset, payload_nob); else lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), payload_niov, payload_iov, payload_offset, payload_nob); - nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ @@ -1661,7 +1833,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) } static void -kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) +kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) { struct lnet_process_id target = lntmsg->msg_target; unsigned int niov = lntmsg->msg_niov; @@ -1669,7 +1841,7 @@ kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) lnet_kiov_t *kiov = lntmsg->msg_kiov; unsigned int offset = lntmsg->msg_offset; 
unsigned int nob = lntmsg->msg_len; - kib_tx_t *tx; + struct kib_tx *tx; int rc; tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); @@ -1716,9 +1888,11 @@ kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) kiblnd_queue_tx(tx, rx->rx_conn); return; - failed_1: + +failed_1: + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); - failed_0: +failed_0: lnet_finalize(lntmsg, -EIO); } @@ -1727,10 +1901,10 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - kib_tx_t *tx; + struct kib_rx *rx = private; + struct kib_msg *rxmsg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct kib_tx *tx; __u64 ibprm_cookie; int nob; int post_credit = IBLND_POSTRX_PEER_CREDIT; @@ -1746,7 +1920,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, LBUG(); case IBLND_MSG_IMMEDIATE: - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); if (nob > rx->rx_nob) { CERROR ("Immediate message from %s too big: %d(%d)\n", libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), @@ -1758,19 +1932,19 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, if (kiov != NULL) lnet_copy_flat2kiov(niov, kiov, offset, IBLND_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), mlen); else lnet_copy_flat2iov(niov, iov, offset, IBLND_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), mlen); lnet_finalize(lntmsg, 0); break; case IBLND_MSG_PUT_REQ: { - kib_msg_t *txmsg; - kib_rdma_desc_t *rd; + struct kib_msg *txmsg; + struct kib_rdma_desc *rd; ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; if (mlen == 0) { @@ -1800,6 +1974,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, if (rc != 0) { CERROR("Can't setup PUT sink for %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); /* tell peer_ni it's over */ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, @@ -1807,7 +1982,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, break; } - nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[rd->rd_nfrags]); + nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; @@ -1858,18 +2033,18 @@ kiblnd_thread_fini (void) } static void -kiblnd_peer_alive (kib_peer_ni_t *peer_ni) +kiblnd_peer_alive(struct kib_peer_ni *peer_ni) { - /* This is racy, but everyone's only writing cfs_time_current() */ - peer_ni->ibp_last_alive = cfs_time_current(); + /* This is racy, but everyone's only writing ktime_get_seconds() */ + peer_ni->ibp_last_alive = ktime_get_seconds(); smp_mb(); } static void -kiblnd_peer_notify (kib_peer_ni_t *peer_ni) +kiblnd_peer_notify(struct kib_peer_ni *peer_ni) { int error = 0; - cfs_time_t last_alive = 0; + time64_t last_alive = 0; unsigned long flags; read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -1889,7 +2064,7 @@ kiblnd_peer_notify (kib_peer_ni_t *peer_ni) } void -kiblnd_close_conn_locked (kib_conn_t *conn, int 
error) +kiblnd_close_conn_locked(struct kib_conn *conn, int error) { /* This just does the immediate housekeeping. 'error' is zero for a * normal shutdown which can happen only after the connection has been @@ -1897,9 +2072,9 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) * connection to be finished off by the connd. Otherwise the connd is * already dealing with it (either to set it up or tear it down). * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_ni_t *peer_ni = conn->ibc_peer; - kib_dev_t *dev; - unsigned long flags; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_dev *dev; + unsigned long flags; LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -1929,7 +2104,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); } - dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev; + dev = ((struct kib_net *)peer_ni->ibp_ni->ni_data)->ibn_dev; if (peer_ni->ibp_next_conn == conn) /* clear next_conn so it won't be used */ peer_ni->ibp_next_conn = NULL; @@ -1962,7 +2137,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) } void -kiblnd_close_conn(kib_conn_t *conn, int error) +kiblnd_close_conn(struct kib_conn *conn, int error) { unsigned long flags; @@ -1974,10 +2149,10 @@ kiblnd_close_conn(kib_conn_t *conn, int error) } static void -kiblnd_handle_early_rxs(kib_conn_t *conn) +kiblnd_handle_early_rxs(struct kib_conn *conn) { - unsigned long flags; - kib_rx_t *rx; + unsigned long flags; + struct kib_rx *rx; LASSERT(!in_interrupt()); LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -1985,7 +2160,7 @@ kiblnd_handle_early_rxs(kib_conn_t *conn) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); while (!list_empty(&conn->ibc_early_rxs)) { rx = list_entry(conn->ibc_early_rxs.next, - kib_rx_t, rx_list); + struct kib_rx, rx_list); list_del(&rx->rx_list); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -1996,30 +2171,52 @@ kiblnd_handle_early_rxs(kib_conn_t *conn) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); } -static void -kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +void +kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) { struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *tmp; struct list_head *nxt; - kib_tx_t *tx; + struct kib_tx *tx; spin_lock(&conn->ibc_lock); list_for_each_safe(tmp, nxt, txs) { - tx = list_entry(tmp, kib_tx_t, tx_list); + tx = list_entry(tmp, struct kib_tx, tx_list); if (txs == &conn->ibc_active_txs) { LASSERT(!tx->tx_queued); LASSERT(tx->tx_waiting || tx->tx_sending != 0); + if (conn->ibc_comms_error == -ETIMEDOUT) { + if (tx->tx_waiting && !tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_REMOTE_TIMEOUT; + else if (tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_NETWORK_TIMEOUT; + } } else { LASSERT(tx->tx_queued); + if (conn->ibc_comms_error == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; } tx->tx_status = -ECONNABORTED; tx->tx_waiting = 0; + /* + * TODO: This makes an assumption that + * kiblnd_tx_complete() will be called for each tx. If + * that event is dropped we could end up with stale + * connections floating around. We'd like to deal with + * that in a better way. + * + * Also that means we can exceed the timeout by many + * seconds. 
+ */ if (tx->tx_sending == 0) { tx->tx_queued = 0; list_del(&tx->tx_list); @@ -2029,22 +2226,28 @@ kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) spin_unlock(&conn->ibc_lock); - kiblnd_txlist_done(&zombies, -ECONNABORTED); + /* + * aborting transmits occurs when finalizing the connection. + * The connection is finalized on error. + * Passing LNET_MSG_STATUS_OK to txlist_done() will not + * override the value already set in tx->tx_hstatus above. + */ + kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); } static void -kiblnd_finalise_conn (kib_conn_t *conn) +kiblnd_finalise_conn(struct kib_conn *conn) { LASSERT (!in_interrupt()); LASSERT (conn->ibc_state > IBLND_CONN_INIT); - kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); - /* abort_receives moves QP state to IB_QPS_ERR. This is only required * for connections that didn't get as far as being connected, because * rdma_disconnect() does this for free. */ kiblnd_abort_receives(conn); + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + /* Complete all tx descs not waiting for sends to complete. * NB we should be safe from RDMA now that the QP has changed state */ @@ -2058,7 +2261,8 @@ kiblnd_finalise_conn (kib_conn_t *conn) } static void -kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) +kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error) { struct list_head zombies = LIST_HEAD_INIT(zombies); unsigned long flags; @@ -2086,8 +2290,7 @@ kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) peer_ni->ibp_reconnected = 0; if (list_empty(&peer_ni->ibp_conns)) { /* Take peer_ni's blocked transmits to complete with error */ - list_add(&zombies, &peer_ni->ibp_tx_queue); - list_del_init(&peer_ni->ibp_tx_queue); + list_splice_init(&peer_ni->ibp_tx_queue, &zombies); if (kiblnd_peer_active(peer_ni)) kiblnd_unlink_peer_locked(peer_ni); @@ -2108,14 +2311,15 @@ kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) CNETERR("Deleting messages for %s: connection failed\n", libcfs_nid2str(peer_ni->ibp_nid)); - kiblnd_txlist_done(&zombies, -EHOSTUNREACH); + kiblnd_txlist_done(&zombies, error, + LNET_MSG_STATUS_LOCAL_DROPPED); } static void -kiblnd_connreq_done(kib_conn_t *conn, int status) +kiblnd_connreq_done(struct kib_conn *conn, int status) { - kib_peer_ni_t *peer_ni = conn->ibc_peer; - kib_tx_t *tx; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_tx *tx; struct list_head txs; unsigned long flags; int active; @@ -2132,20 +2336,23 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && peer_ni->ibp_accepting > 0)); - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status != 0) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer_ni, active, status); - kiblnd_finalise_conn(conn); - return; - } + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } - /* connection established */ + /* connection established */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - conn->ibc_last_send = jiffies; + /* reset retry count */ + peer_ni->ibp_retries = 0; + + conn->ibc_last_send = ktime_get(); kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); kiblnd_peer_alive(peer_ni); @@ -2183,7 +2390,8 @@ 
kiblnd_connreq_done(kib_conn_t *conn, int status) kiblnd_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - kiblnd_txlist_done(&txs, -ECONNABORTED); + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ERROR); return; } @@ -2203,7 +2411,7 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) */ spin_lock(&conn->ibc_lock); while (!list_empty(&txs)) { - tx = list_entry(txs.next, kib_tx_t, tx_list); + tx = list_entry(txs.next, struct kib_tx, tx_list); list_del(&tx->tx_list); kiblnd_queue_tx_locked(tx, conn); @@ -2217,7 +2425,7 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) } static void -kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) +kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) { int rc; @@ -2235,17 +2443,17 @@ static int kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) { rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - kib_msg_t *reqmsg = priv; - kib_msg_t *ackmsg; - kib_dev_t *ibdev; - kib_peer_ni_t *peer_ni; - kib_peer_ni_t *peer2; - kib_conn_t *conn; - struct lnet_ni *ni = NULL; - kib_net_t *net = NULL; + struct kib_msg *reqmsg = priv; + struct kib_msg *ackmsg; + struct kib_dev *ibdev; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + struct lnet_ni *ni = NULL; + struct kib_net *net = NULL; lnet_nid_t nid; struct rdma_conn_param cp; - kib_rej_t rej; + struct kib_rej rej; int version = IBLND_MSG_VERSION; unsigned long flags; int rc; @@ -2253,8 +2461,8 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) LASSERT (!in_interrupt()); /* cmid inherits 'context' from the corresponding listener id */ - ibdev = (kib_dev_t *)cmid->context; - LASSERT (ibdev != NULL); + ibdev = cmid->context; + LASSERT(ibdev); memset(&rej, 0, sizeof(rej)); rej.ibr_magic = IBLND_MSG_MAGIC; @@ -2270,7 +2478,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + if (priv_nob < offsetof(struct kib_msg, ibm_type)) { CERROR("Short connection request\n"); goto failed; } @@ -2303,7 +2511,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); if (ni != NULL) { - net = (kib_net_t *)ni->ni_data; + net = (struct kib_net *)ni->ni_data; rej.ibr_incarnation = net->ibn_incarnation; } @@ -2352,26 +2560,26 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } if (reqmsg->ibm_u.connparams.ibcp_max_frags > - kiblnd_rdma_frags(version, ni)) { + IBLND_MAX_RDMA_FRAGS) { CWARN("Can't accept conn from %s (version %x): " "max_frags %d too large (%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, - kiblnd_rdma_frags(version, ni)); + IBLND_MAX_RDMA_FRAGS); if (version >= IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; goto failed; } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < - kiblnd_rdma_frags(version, ni) && + IBLND_MAX_RDMA_FRAGS && net->ibn_fmr_ps == NULL) { CWARN("Can't accept conn from %s (version %x): " "max_frags %d incompatible without FMR pool " "(%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, - kiblnd_rdma_frags(version, ni)); + IBLND_MAX_RDMA_FRAGS); if (version == IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; @@ -2545,7 +2753,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (ni != NULL) { rej.ibr_cp.ibcp_queue_depth = 
kiblnd_msg_queue_size(version, ni); - rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); + rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; lnet_ni_decref(ni); } @@ -2556,11 +2764,11 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } static void -kiblnd_check_reconnect(kib_conn_t *conn, int version, - __u64 incarnation, int why, kib_connparams_t *cp) +kiblnd_check_reconnect(struct kib_conn *conn, int version, + u64 incarnation, int why, struct kib_connparams *cp) { rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_peer_ni *peer_ni = conn->ibc_peer; char *reason; int msg_size = IBLND_MSG_SIZE; int frag_num = -1; @@ -2592,10 +2800,15 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, goto out; } - switch (why) { - default: - reason = "Unknown"; - break; + if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { + reason = "retry count exceeded due to no listener"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; case IBLND_REJECT_RDMA_FRAGS: { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2605,10 +2818,16 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, goto out; } tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; +#ifdef HAVE_IB_GET_DMA_MR + /* + * This check only makes sense if the kernel supports global + * memory registration. Otherwise, map_on_demand will never == 0 + */ if (!tunables->lnd_map_on_demand) { reason = "map_on_demand must be enabled"; goto out; } +#endif if (conn->ibc_max_frags <= frag_num) { reason = "unsupported max frags"; goto out; @@ -2670,9 +2889,9 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, } static void -kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) +kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) { - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_peer_ni *peer_ni = conn->ibc_peer; LASSERT (!in_interrupt()); LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); @@ -2684,17 +2903,18 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) break; case IB_CM_REJ_INVALID_SERVICE_ID: + peer_ni->ibp_retries++; kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, IBLND_REJECT_INVALID_SRV_ID, NULL); - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - *kiblnd_tunables.kib_service); - break; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; case IB_CM_REJ_CONSUMER_DEFINED: - if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { - kib_rej_t *rej = priv; - kib_connparams_t *cp = NULL; + if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { + struct kib_rej *rej = priv; + struct kib_connparams *cp = NULL; int flip = 0; __u64 incarnation = -1; @@ -2707,7 +2927,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) * it rejected me then upgrade to V2, I have no idea * about the upgrading and try to reconnect with V1, * in this case upgraded V2 can find out I'm trying to - * talk to the old guy and reject me(incarnation is -1). + * talk to the old guy and reject me(incarnation is -1). 
*/ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || @@ -2717,7 +2937,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) flip = 1; } - if (priv_nob >= sizeof(kib_rej_t) && + if (priv_nob >= sizeof(struct kib_rej) && rej->ibr_version > IBLND_MSG_VERSION_1) { /* priv_nob is always 148 in current version * of OFED, so we still need to check version. @@ -2797,12 +3017,12 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) } static void -kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) +kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) { - kib_peer_ni_t *peer_ni = conn->ibc_peer; - struct lnet_ni *ni = peer_ni->ibp_ni; - kib_net_t *net = ni->ni_data; - kib_msg_t *msg = priv; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_net *net = ni->ni_data; + struct kib_msg *msg = priv; int ver = conn->ibc_version; int rc = kiblnd_unpack_msg(msg, priv_nob); unsigned long flags; @@ -2898,12 +3118,12 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) } static int -kiblnd_active_connect (struct rdma_cm_id *cmid) +kiblnd_active_connect(struct rdma_cm_id *cmid) { - kib_peer_ni_t *peer_ni = (kib_peer_ni_t *)cmid->context; - kib_conn_t *conn; - kib_msg_t *msg; - struct rdma_conn_param cp; + struct kib_peer_ni *peer_ni = cmid->context; + struct kib_conn *conn; + struct kib_msg *msg; + struct rdma_conn_param cp; int version; __u64 incarnation; unsigned long flags; @@ -2951,8 +3171,7 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) LASSERT(cmid->context == (void *)conn); LASSERT(conn->ibc_cmid == cmid); - - rc = rdma_connect(cmid, &cp); + rc = rdma_connect_locked(cmid, &cp); if (rc != 0) { CERROR("Can't connect to %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), rc); @@ -2966,9 +3185,9 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) { - kib_peer_ni_t *peer_ni; - kib_conn_t *conn; - int rc; + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int rc; switch (event->event) { default: @@ -2978,14 +3197,14 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) case RDMA_CM_EVENT_CONNECT_REQUEST: /* destroy cmid on failure */ - rc = kiblnd_passive_connect(cmid, + rc = kiblnd_passive_connect(cmid, (void *)KIBLND_CONN_PARAM(event), KIBLND_CONN_PARAM_LEN(event)); CDEBUG(D_NET, "connreq: %d\n", rc); return rc; - + case RDMA_CM_EVENT_ADDR_ERROR: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CNETERR("%s: ADDR ERROR %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); @@ -2993,7 +3212,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return -EHOSTUNREACH; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ADDR_RESOLVED: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CDEBUG(D_NET,"%s Addr resolved: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); @@ -3002,12 +3221,12 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) CNETERR("Can't resolve address for %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); rc = event->status; - } else { - rc = rdma_resolve_route( - cmid, *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_route( + cmid, lnet_get_lnd_timeout() * 1000); if (rc == 0) { - kib_net_t *net = peer_ni->ibp_ni->ni_data; - kib_dev_t *dev = net->ibn_dev; + struct kib_net *net 
= peer_ni->ibp_ni->ni_data; + struct kib_dev *dev = net->ibn_dev; CDEBUG(D_NET, "%s: connection bound to "\ "%s:%pI4h:%s\n", @@ -3027,7 +3246,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return rc; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ROUTE_ERROR: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CNETERR("%s: ROUTE ERROR %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); @@ -3035,7 +3254,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return -EHOSTUNREACH; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ROUTE_RESOLVED: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CDEBUG(D_NET,"%s Route resolved: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); @@ -3047,9 +3266,9 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) kiblnd_peer_connect_failed(peer_ni, 1, event->status); kiblnd_peer_decref(peer_ni); return event->status; /* rc != 0 destroys cmid */ - + case RDMA_CM_EVENT_UNREACHABLE: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); CNETERR("%s: UNREACHABLE %d\n", @@ -3059,7 +3278,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_CONNECT_ERROR: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); CNETERR("%s: CONNECT ERROR %d\n", @@ -3069,7 +3288,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_REJECTED: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; switch (conn->ibc_state) { default: LBUG(); @@ -3091,7 +3310,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_ESTABLISHED: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; switch (conn->ibc_state) { default: LBUG(); @@ -3118,7 +3337,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_DISCONNECTED: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { CERROR("%s DISCONNECTED\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); @@ -3145,13 +3364,13 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) +kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) { - kib_tx_t *tx; + struct kib_tx *tx; struct list_head *ttmp; list_for_each(ttmp, txs) { - tx = list_entry(ttmp, kib_tx_t, tx_list); + tx = list_entry(ttmp, struct kib_tx, tx_list); if (txs != &conn->ibc_active_txs) { LASSERT(tx->tx_queued); @@ -3160,10 +3379,11 @@ kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) LASSERT(tx->tx_waiting || tx->tx_sending != 0); } - if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { - CERROR("Timed out tx: %s, %lu seconds\n", + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CERROR("Timed out tx: %s, %lld seconds\n", kiblnd_queue2str(conn, txs), - cfs_duration_sec(jiffies - tx->tx_deadline)); + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); return 1; } } @@ -3172,7 +3392,7 @@ kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) } static int 
-kiblnd_conn_timed_out_locked(kib_conn_t *conn) +kiblnd_conn_timed_out_locked(struct kib_conn *conn) { return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || @@ -3189,9 +3409,9 @@ kiblnd_check_conns (int idx) struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs); struct list_head *peers = &kiblnd_data.kib_peers[idx]; struct list_head *ptmp; - kib_peer_ni_t *peer_ni; - kib_conn_t *conn; - kib_tx_t *tx, *tx_tmp; + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + struct kib_tx *tx, *tx_tmp; struct list_head *ctmp; unsigned long flags; @@ -3201,14 +3421,15 @@ kiblnd_check_conns (int idx) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); list_for_each(ptmp, peers) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); /* Check tx_deadline */ list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { - if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { - CWARN("Timed out tx for %s: %lu seconds\n", + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CWARN("Timed out tx for %s: %lld seconds\n", libcfs_nid2str(peer_ni->ibp_nid), - cfs_duration_sec(jiffies - tx->tx_deadline)); + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); list_move(&tx->tx_list, &timedout_txs); } } @@ -3217,7 +3438,7 @@ kiblnd_check_conns (int idx) int timedout; int sendnoop; - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); @@ -3231,11 +3452,10 @@ kiblnd_check_conns (int idx) } if (timedout) { - CERROR("Timed out RDMA with %s (%lu): " + CERROR("Timed out RDMA with %s (%lld): " "c: %u, oc: %u, rc: %u\n", libcfs_nid2str(peer_ni->ibp_nid), - cfs_duration_sec(cfs_time_current() - - peer_ni->ibp_last_alive), + ktime_get_seconds() - peer_ni->ibp_last_alive, conn->ibc_credits, conn->ibc_outstanding_credits, conn->ibc_reserved_credits); @@ -3253,14 +3473,15 @@ kiblnd_check_conns (int idx) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); if (!list_empty(&timedout_txs)) - kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT); + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT, + LNET_MSG_STATUS_LOCAL_TIMEOUT); /* Handle timeout by closing the whole * connection. We can only be sure RDMA activity * has ceased once the QP has been modified. */ while (!list_empty(&closes)) { conn = list_entry(closes.next, - kib_conn_t, ibc_connd_list); + struct kib_conn, ibc_connd_list); list_del(&conn->ibc_connd_list); kiblnd_close_conn(conn, -ETIMEDOUT); kiblnd_conn_decref(conn); @@ -3271,7 +3492,7 @@ kiblnd_check_conns (int idx) * free to do it last time... 
*/ while (!list_empty(&checksends)) { conn = list_entry(checksends.next, - kib_conn_t, ibc_connd_list); + struct kib_conn, ibc_connd_list); list_del(&conn->ibc_connd_list); spin_lock(&conn->ibc_lock); @@ -3283,7 +3504,7 @@ kiblnd_check_conns (int idx) } static void -kiblnd_disconnect_conn (kib_conn_t *conn) +kiblnd_disconnect_conn(struct kib_conn *conn) { LASSERT (!in_interrupt()); LASSERT (current == kiblnd_data.kib_connd); @@ -3312,7 +3533,7 @@ kiblnd_connd (void *arg) spinlock_t *lock= &kiblnd_data.kib_connd_lock; wait_queue_entry_t wait; unsigned long flags; - kib_conn_t *conn; + struct kib_conn *conn; int timeout; int i; int dropped_lock; @@ -3332,10 +3553,10 @@ kiblnd_connd (void *arg) dropped_lock = 0; if (!list_empty(&kiblnd_data.kib_connd_zombies)) { - kib_peer_ni_t *peer_ni = NULL; + struct kib_peer_ni *peer_ni = NULL; conn = list_entry(kiblnd_data.kib_connd_zombies.next, - kib_conn_t, ibc_list); + struct kib_conn, ibc_list); list_del(&conn->ibc_list); if (conn->ibc_reconnect) { peer_ni = conn->ibc_peer; @@ -3345,11 +3566,13 @@ kiblnd_connd (void *arg) spin_unlock_irqrestore(lock, flags); dropped_lock = 1; - kiblnd_destroy_conn(conn, !peer_ni); + kiblnd_destroy_conn(conn); spin_lock_irqsave(lock, flags); - if (!peer_ni) + if (!peer_ni) { + LIBCFS_FREE(conn, sizeof(*conn)); continue; + } conn->ibc_peer = peer_ni; if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) @@ -3362,7 +3585,7 @@ kiblnd_connd (void *arg) if (!list_empty(&kiblnd_data.kib_connd_conns)) { conn = list_entry(kiblnd_data.kib_connd_conns.next, - kib_conn_t, ibc_list); + struct kib_conn, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); @@ -3375,7 +3598,8 @@ kiblnd_connd (void *arg) } while (reconn < KIB_RECONN_BREAK) { - if (kiblnd_data.kib_reconn_sec != ktime_get_real_seconds()) { + if (kiblnd_data.kib_reconn_sec != + ktime_get_real_seconds()) { kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); list_splice_init(&kiblnd_data.kib_reconn_wait, &kiblnd_data.kib_reconn_list); @@ -3385,7 +3609,7 @@ kiblnd_connd (void *arg) break; conn = list_entry(kiblnd_data.kib_reconn_list.next, - kib_conn_t, ibc_list); + struct kib_conn, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); @@ -3404,6 +3628,7 @@ kiblnd_connd (void *arg) const int n = 4; const int p = 1; int chunk = kiblnd_data.kib_peer_hash_size; + unsigned int lnd_timeout; spin_unlock_irqrestore(lock, flags); dropped_lock = 1; @@ -3416,11 +3641,11 @@ kiblnd_connd (void *arg) * connection within (n+1)/n times the timeout * interval. 
*/ - if (*kiblnd_tunables.kib_timeout > n * p) - chunk = (chunk * n * p) / - *kiblnd_tunables.kib_timeout; - if (chunk == 0) - chunk = 1; + lnd_timeout = lnet_get_lnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; for (i = 0; i < chunk; i++) { kiblnd_check_conns(peer_index); @@ -3456,23 +3681,36 @@ kiblnd_connd (void *arg) void kiblnd_qp_event(struct ib_event *event, void *arg) { - kib_conn_t *conn = arg; + struct kib_conn *conn = arg; - switch (event->event) { - case IB_EVENT_COMM_EST: - CDEBUG(D_NET, "%s established\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* We received a packet but connection isn't established * probably handshake packet was lost, so free to * force make connection established */ rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); - return; + return; - default: - CERROR("%s: Async QP event type %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); - return; - } + case IB_EVENT_PORT_ERR: + case IB_EVENT_DEVICE_FATAL: + CERROR("Fatal device error for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); + return; + + case IB_EVENT_PORT_ACTIVE: + CERROR("Port reactivated for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } } static void @@ -3518,9 +3756,9 @@ kiblnd_cq_completion(struct ib_cq *cq, void *arg) * consuming my CQ I could be called after all completions have * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 * and this CQ is about to be destroyed so I NOOP. */ - kib_conn_t *conn = (kib_conn_t *)arg; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; + struct kib_conn *conn = arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; LASSERT(cq == conn->ibc_cq); @@ -3545,7 +3783,7 @@ kiblnd_cq_completion(struct ib_cq *cq, void *arg) void kiblnd_cq_event(struct ib_event *event, void *arg) { - kib_conn_t *conn = arg; + struct kib_conn *conn = arg; CERROR("%s: async CQ event type %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); @@ -3556,7 +3794,7 @@ kiblnd_scheduler(void *arg) { long id = (long)arg; struct kib_sched_info *sched; - kib_conn_t *conn; + struct kib_conn *conn; wait_queue_entry_t wait; unsigned long flags; struct ib_wc wc; @@ -3594,7 +3832,7 @@ kiblnd_scheduler(void *arg) if (!list_empty(&sched->ibs_conns)) { conn = list_entry(sched->ibs_conns.next, - kib_conn_t, ibc_sched_list); + struct kib_conn, ibc_sched_list); /* take over kib_sched_conns' ref on conn... 
*/ LASSERT(conn->ibc_scheduled); list_del(&conn->ibc_sched_list); @@ -3698,7 +3936,7 @@ int kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_dev_t *dev; + struct kib_dev *dev; struct net *ns = arg; wait_queue_entry_t wait; unsigned long flags; @@ -3717,8 +3955,7 @@ kiblnd_failover_thread(void *arg) list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, ibd_fail_list) { - if (cfs_time_before(cfs_time_current(), - dev->ibd_next_failover)) + if (ktime_get_seconds() < dev->ibd_next_failover) continue; do_failover = 1; break; @@ -3736,13 +3973,13 @@ kiblnd_failover_thread(void *arg) LASSERT (dev->ibd_failover); dev->ibd_failover = 0; if (rc >= 0) { /* Device is OK or failover succeed */ - dev->ibd_next_failover = cfs_time_shift(3); + dev->ibd_next_failover = ktime_get_seconds() + 3; continue; } /* failed to failover, retry later */ - dev->ibd_next_failover = - cfs_time_shift(min(dev->ibd_failed_failover, 10)); + dev->ibd_next_failover = ktime_get_seconds() + + min(dev->ibd_failed_failover, 10); if (kiblnd_dev_can_failover(dev)) { list_add_tail(&dev->ibd_fail_list, &kiblnd_data.kib_failed_devs); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 72cb50ecd14f5..39f9a620d04a4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,7 +82,7 @@ static int peer_buffer_credits = 0; module_param(peer_buffer_credits, int, 0444); MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); -static int peer_timeout = 180; +static int peer_timeout = DEFAULT_PEER_TIMEOUT; module_param(peer_timeout, int, 0444); MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); @@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); static int retry_count = 5; module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); static int rnr_retry_count = 6; module_param(rnr_retry_count, int, 0644); @@ -110,16 +110,46 @@ static int concurrent_sends; module_param(concurrent_sends, int, 0444); MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); +static int use_fastreg_gaps; +module_param(use_fastreg_gaps, int, 0444); +MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop"); + +/* + * map_on_demand is a flag used to determine if we can use FMR or FastReg. + * This is applicable for kernels which support global memory regions. For + * later kernels this flag is always enabled, since we will always either + * use FMR or FastReg + * For kernels which support global memory regions map_on_demand defaults + * to 0 which means we will be using global memory regions exclusively. + * If it is set to a value other than 0, then we will behave as follows: + * 1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. 
Attempt to transmit using global memory regions only if + * map-on-demand is not turned on, otherwise use FMR or FastReg + * 5. In case of transmitting tx with GAPS over FMR we will need to + * transmit it with multiple fragments. Look at the comments in + * kiblnd_fmr_map_tx() for an explanation of the behavior. + * + * For later kernels we default map_on_demand to 1 and not allow + * it to be set to 0, since there is no longer support for global memory + * regions. Behavior: + * 1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of + * the behavior when transmit with GAPS verses contiguous. + */ #ifdef HAVE_IB_GET_DMA_MR #define IBLND_DEFAULT_MAP_ON_DEMAND 0 -#define IBLND_MIN_MAP_ON_DEMAND 0 +#define MOD_STR "map on demand" #else -#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS -#define IBLND_MIN_MAP_ON_DEMAND 1 +#define IBLND_DEFAULT_MAP_ON_DEMAND 1 +#define MOD_STR "map on demand (obsolete)" #endif static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; module_param(map_on_demand, int, 0444); -MODULE_PARM_DESC(map_on_demand, "map on demand"); +MODULE_PARM_DESC(map_on_demand, MOD_STR); /* NB: this value is shared by all CPTs, it can grow at runtime */ static int fmr_pool_size = 512; @@ -156,7 +186,7 @@ static unsigned int wrq_sge = 2; module_param(wrq_sge, uint, 0444); MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); -kib_tunables_t kiblnd_tunables = { +struct kib_tunables kiblnd_tunables = { .kib_dev_failover = &dev_failover, .kib_service = &service, .kib_cksum = &cksum, @@ -170,6 +200,7 @@ kib_tunables_t kiblnd_tunables = { .kib_use_priv_port = &use_privileged_port, .kib_nscheds = &nscheds, .kib_wrq_sge = &wrq_sge, + .kib_use_fastreg_gaps = &use_fastreg_gaps, }; static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; @@ -236,6 +267,15 @@ kiblnd_tunables_setup(struct lnet_ni *ni) net_tunables->lct_peer_tx_credits = net_tunables->lct_max_tx_credits; +#ifndef HAVE_IB_GET_DMA_MR + /* + * For kernels which do not support global memory regions, always + * enable map_on_demand + */ + if (tunables->lnd_map_on_demand == 0) + tunables->lnd_map_on_demand = 1; +#endif + if (!tunables->lnd_peercredits_hiw) tunables->lnd_peercredits_hiw = peer_credits_hiw; @@ -245,30 +285,8 @@ kiblnd_tunables_setup(struct lnet_ni *ni) if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; - if (tunables->lnd_map_on_demand < IBLND_MIN_MAP_ON_DEMAND || - tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { - /* Use the default */ - CWARN("Invalid map_on_demand (%d), expects %d - %d. 
Using default of %d\n", - tunables->lnd_map_on_demand, IBLND_MIN_MAP_ON_DEMAND, - IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND); - tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; - } - - if (tunables->lnd_map_on_demand == 1) { - /* don't make sense to create map if only one fragment */ - tunables->lnd_map_on_demand = 2; - } - - if (tunables->lnd_concurrent_sends == 0) { - if (tunables->lnd_map_on_demand > 0 && - tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { - tunables->lnd_concurrent_sends = - net_tunables->lct_peer_tx_credits * 2; - } else { - tunables->lnd_concurrent_sends = - net_tunables->lct_peer_tx_credits; - } - } + if (tunables->lnd_concurrent_sends == 0) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits; if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; @@ -303,7 +321,7 @@ int kiblnd_tunables_init(void) { default_tunables.lnd_version = CURRENT_LND_VERSION; - default_tunables.lnd_peercredits_hiw = peer_credits_hiw, + default_tunables.lnd_peercredits_hiw = peer_credits_hiw; default_tunables.lnd_map_on_demand = map_on_demand; default_tunables.lnd_concurrent_sends = concurrent_sends; default_tunables.lnd_fmr_pool_size = fmr_pool_size; diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c index d0b8756143580..9b199e3ab541a 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,17 +41,17 @@ #include static struct lnet_lnd the_ksocklnd; -ksock_nal_data_t ksocknal_data; +struct ksock_nal_data ksocknal_data; -static ksock_interface_t * +static struct ksock_interface * ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) { - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; int i; - ksock_interface_t *iface; + struct ksock_interface *iface; for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(i < LNET_NUM_INTERFACES); + LASSERT(i < LNET_INTERFACES_NUM); iface = &net->ksnn_interfaces[i]; if (iface->ksni_ipaddr == ip) @@ -61,10 +61,10 @@ ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) return NULL; } -static ksock_route_t * -ksocknal_create_route (__u32 ipaddr, int port) +static struct ksock_route * +ksocknal_create_route(__u32 ipaddr, int port) { - ksock_route_t *route; + struct ksock_route *route; LIBCFS_ALLOC (route, sizeof (*route)); if (route == NULL) @@ -86,7 +86,7 @@ ksocknal_create_route (__u32 ipaddr, int port) } void -ksocknal_destroy_route (ksock_route_t *route) +ksocknal_destroy_route(struct ksock_route *route) { LASSERT (atomic_read(&route->ksnr_refcount) == 0); @@ -97,12 +97,12 @@ ksocknal_destroy_route (ksock_route_t *route) } static int -ksocknal_create_peer(ksock_peer_ni_t **peerp, struct lnet_ni *ni, +ksocknal_create_peer(struct ksock_peer_ni **peerp, struct lnet_ni *ni, struct lnet_process_id id) { - int cpt = lnet_cpt_of_nid(id.nid, ni); - ksock_net_t *net = ni->ni_data; - ksock_peer_ni_t *peer_ni; + int cpt = lnet_cpt_of_nid(id.nid, ni); + struct ksock_net *net = ni->ni_data; + struct ksock_peer_ni *peer_ni; LASSERT(id.nid != LNET_NID_ANY); LASSERT(id.pid != LNET_PID_ANY); @@ -146,9 +146,9 @@ ksocknal_create_peer(ksock_peer_ni_t **peerp, struct lnet_ni *ni, } void -ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni) +ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) { - ksock_net_t *net = peer_ni->ksnp_ni->ni_data; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; CDEBUG (D_NET, "peer_ni %s %p deleted\n", libcfs_id2str(peer_ni->ksnp_id), peer_ni); @@ -171,16 +171,15 @@ ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni) spin_unlock_bh(&net->ksnn_lock); } -ksock_peer_ni_t * +struct ksock_peer_ni * ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) { struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); struct list_head *tmp; - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; list_for_each(tmp, peer_list) { - - peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); LASSERT(!peer_ni->ksnp_closing); @@ -199,10 +198,10 @@ ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) return NULL; } -ksock_peer_ni_t * +struct ksock_peer_ni * ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) { - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; read_lock(&ksocknal_data.ksnd_global_lock); peer_ni = ksocknal_find_peer_locked(ni, id); @@ -214,14 +213,14 @@ ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) } static void -ksocknal_unlink_peer_locked(ksock_peer_ni_t *peer_ni) +ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni) { int i; __u32 ip; - ksock_interface_t *iface; + struct ksock_interface *iface; for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { - LASSERT(i < LNET_NUM_INTERFACES); + LASSERT(i < LNET_INTERFACES_NUM); ip = peer_ni->ksnp_passive_ips[i]; iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); @@ -250,19 
+249,19 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, int *port, int *conn_count, int *share_count) { - ksock_peer_ni_t *peer_ni; - struct list_head *ptmp; - ksock_route_t *route; - struct list_head *rtmp; - int i; - int j; - int rc = -ENOENT; + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct ksock_route *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -300,7 +299,7 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, if (index-- > 0) continue; - route = list_entry(rtmp, ksock_route_t, + route = list_entry(rtmp, struct ksock_route, ksnr_list); *id = peer_ni->ksnp_id; @@ -320,11 +319,11 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, } static void -ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +ksocknal_associate_route_conn_locked(struct ksock_route *route, struct ksock_conn *conn) { - ksock_peer_ni_t *peer_ni = route->ksnr_peer; - int type = conn->ksnc_type; - ksock_interface_t *iface; + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + int type = conn->ksnc_type; + struct ksock_interface *iface; conn->ksnc_route = route; ksocknal_route_addref(route); @@ -364,11 +363,11 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) } static void -ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) +ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *route) { struct list_head *tmp; - ksock_conn_t *conn; - ksock_route_t *route2; + struct ksock_conn *conn; + struct ksock_route *route2; LASSERT(!peer_ni->ksnp_closing); LASSERT(route->ksnr_peer == NULL); @@ -378,7 +377,7 @@ ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) /* LASSERT(unique) */ list_for_each(tmp, &peer_ni->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); + route2 = list_entry(tmp, struct ksock_route, ksnr_list); if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { CERROR("Duplicate route %s %pI4h\n", @@ -394,7 +393,7 @@ ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) list_add_tail(&route->ksnr_list, &peer_ni->ksnp_routes); list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn->ksnc_ipaddr != route->ksnr_ipaddr) continue; @@ -405,19 +404,19 @@ ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) } static void -ksocknal_del_route_locked (ksock_route_t *route) +ksocknal_del_route_locked(struct ksock_route *route) { - ksock_peer_ni_t *peer_ni = route->ksnr_peer; - ksock_interface_t *iface; - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + struct ksock_interface *iface; + struct ksock_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; LASSERT(!route->ksnr_deleted); /* Close associated conns */ list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); if (conn->ksnc_route != route) continue; @@ -449,11 +448,11 @@ 
ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, int port) { struct list_head *tmp; - ksock_peer_ni_t *peer_ni; - ksock_peer_ni_t *peer2; - ksock_route_t *route; - ksock_route_t *route2; - int rc; + struct ksock_peer_ni *peer_ni; + struct ksock_peer_ni *peer2; + struct ksock_route *route; + struct ksock_route *route2; + int rc; if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY) @@ -473,7 +472,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, write_lock_bh(&ksocknal_data.ksnd_global_lock); /* always called with a ref on ni, so shutdown can't have started */ - LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); peer2 = ksocknal_find_peer_locked(ni, id); if (peer2 != NULL) { @@ -487,7 +486,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, route2 = NULL; list_for_each(tmp, &peer_ni->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); + route2 = list_entry(tmp, struct ksock_route, ksnr_list); if (route2->ksnr_ipaddr == ipaddr) break; @@ -508,13 +507,13 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, } static void -ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) +ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) { - ksock_conn_t *conn; - ksock_route_t *route; + struct ksock_conn *conn; + struct ksock_route *route; struct list_head *tmp; struct list_head *nxt; - int nshared; + int nshared; LASSERT(!peer_ni->ksnp_closing); @@ -522,7 +521,7 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) ksocknal_peer_addref(peer_ni); list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); /* no match */ if (!(ip == 0 || route->ksnr_ipaddr == ip)) @@ -535,7 +534,7 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) nshared = 0; list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); nshared += route->ksnr_share_count; } @@ -544,7 +543,7 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) * left */ list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); /* we should only be removing auto-entries */ LASSERT(route->ksnr_share_count == 0); @@ -552,27 +551,27 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); ksocknal_close_conn_locked(conn, 0); } } ksocknal_peer_decref(peer_ni); - /* NB peer_ni unlinks itself when last conn/route is removed */ + /* NB peer_ni unlinks itself when last conn/route is removed */ } static int ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) { - struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *ptmp; struct list_head *pnxt; - ksock_peer_ni_t *peer_ni; - int lo; - int hi; - int i; - int rc = -ENOENT; + struct ksock_peer_ni *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -588,7 +587,7 @@ ksocknal_del_peer(struct lnet_ni *ni, struct 
lnet_process_id id, __u32 ip) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -625,20 +624,20 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) return rc; } -static ksock_conn_t * +static struct ksock_conn * ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) { - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; struct list_head *ptmp; - ksock_conn_t *conn; + struct ksock_conn *conn; struct list_head *ctmp; - int i; + int i; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); LASSERT(!peer_ni->ksnp_closing); @@ -649,7 +648,7 @@ ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) if (index-- > 0) continue; - conn = list_entry(ctmp, ksock_conn_t, + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); ksocknal_conn_addref(conn); read_unlock(&ksocknal_data. \ @@ -663,50 +662,37 @@ ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) return NULL; } -static ksock_sched_t * +static struct ksock_sched * ksocknal_choose_scheduler_locked(unsigned int cpt) { - struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; - ksock_sched_t *sched; - int i; + struct ksock_sched *sched = ksocknal_data.ksnd_schedulers[cpt]; + int i; - if (info->ksi_nthreads == 0) { - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - if (info->ksi_nthreads > 0) { + if (sched->kss_nthreads == 0) { + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + if (sched->kss_nthreads > 0) { CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n", - cpt, info->ksi_cpt); - goto select_sched; + cpt, sched->kss_cpt); + return sched; } } return NULL; } -select_sched: - sched = &info->ksi_scheds[0]; - /* - * NB: it's safe so far, but info->ksi_nthreads could be changed - * at runtime when we have dynamic LNet configuration, then we - * need to take care of this. 
- */ - for (i = 1; i < info->ksi_nthreads; i++) { - if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) - sched = &info->ksi_scheds[i]; - } - return sched; } static int ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) { - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; int i; int nip; read_lock(&ksocknal_data.ksnd_global_lock); nip = net->ksnn_ninterfaces; - LASSERT(nip <= LNET_NUM_INTERFACES); + LASSERT(nip <= LNET_INTERFACES_NUM); /* * Only offer interfaces for additional connections if I have @@ -727,14 +713,14 @@ ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) } static int -ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) +ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) { - int best_netmatch = 0; - int best_xor = 0; - int best = -1; - int this_xor; - int this_netmatch; - int i; + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; for (i = 0; i < nips; i++) { if (ips[i] == 0) @@ -759,21 +745,21 @@ ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) } static int -ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) +ksocknal_select_ips(struct ksock_peer_ni *peer_ni, __u32 *peerips, int n_peerips) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - ksock_net_t *net = peer_ni->ksnp_ni->ni_data; - ksock_interface_t *iface; - ksock_interface_t *best_iface; - int n_ips; - int i; - int j; - int k; - __u32 ip; - __u32 xor; - int this_netmatch; - int best_netmatch; - int best_npeers; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + struct ksock_interface *iface; + struct ksock_interface *best_iface; + int n_ips; + int i; + int j; + int k; + u32 ip; + u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; /* CAVEAT EMPTOR: We do all our interface matching with an * exclusive hold of global lock at IRQ priority. 
We're only @@ -785,8 +771,8 @@ ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) write_lock_bh(global_lock); - LASSERT(n_peerips <= LNET_NUM_INTERFACES); - LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); + LASSERT(n_peerips <= LNET_INTERFACES_NUM); + LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); /* Only match interfaces for additional connections * if I have > 1 interface */ @@ -865,17 +851,17 @@ ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) } static void -ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, +ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, __u32 *peer_ipaddrs, int npeer_ipaddrs) { - ksock_route_t *newroute = NULL; + struct ksock_route *newroute = NULL; rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; struct lnet_ni *ni = peer_ni->ksnp_ni; - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; struct list_head *rtmp; - ksock_route_t *route; - ksock_interface_t *iface; - ksock_interface_t *best_iface; + struct ksock_route *route; + struct ksock_interface *iface; + struct ksock_interface *best_iface; int best_netmatch; int this_netmatch; int best_nroutes; @@ -896,7 +882,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, return; } - LASSERT(npeer_ipaddrs <= LNET_NUM_INTERFACES); + LASSERT(npeer_ipaddrs <= LNET_INTERFACES_NUM); for (i = 0; i < npeer_ipaddrs; i++) { if (newroute != NULL) { @@ -919,7 +905,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, /* Already got a route? */ route = NULL; list_for_each(rtmp, &peer_ni->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); + route = list_entry(rtmp, struct ksock_route, ksnr_list); if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) break; @@ -933,7 +919,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, best_nroutes = 0; best_netmatch = 0; - LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); /* Select interface to connect from */ for (j = 0; j < net->ksnn_ninterfaces; j++) { @@ -941,7 +927,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, /* Using this interface already? 
*/ list_for_each(rtmp, &peer_ni->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, + route = list_entry(rtmp, struct ksock_route, ksnr_list); if (route->ksnr_myipaddr == iface->ksni_ipaddr) @@ -985,10 +971,10 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, int ksocknal_accept(struct lnet_ni *ni, struct socket *sock) { - ksock_connreq_t *cr; - int rc; - __u32 peer_ip; - int peer_port; + struct ksock_connreq *cr; + int rc; + u32 peer_ip; + int peer_port; rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); LASSERT(rc == 0); /* we succeeded before */ @@ -1014,9 +1000,9 @@ ksocknal_accept(struct lnet_ni *ni, struct socket *sock) } static int -ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr) +ksocknal_connecting(struct ksock_peer_ni *peer_ni, __u32 ipaddr) { - ksock_route_t *route; + struct ksock_route *route; list_for_each_entry(route, &peer_ni->ksnp_routes, ksnr_list) { if (route->ksnr_ipaddr == ipaddr) @@ -1026,27 +1012,27 @@ ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr) } int -ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, +ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, struct socket *sock, int type) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct list_head zombies = LIST_HEAD_INIT(zombies); + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct list_head zombies = LIST_HEAD_INIT(zombies); struct lnet_process_id peerid; - struct list_head *tmp; - __u64 incarnation; - ksock_conn_t *conn; - ksock_conn_t *conn2; - ksock_peer_ni_t *peer_ni = NULL; - ksock_peer_ni_t *peer2; - ksock_sched_t *sched; + struct list_head *tmp; + u64 incarnation; + struct ksock_conn *conn; + struct ksock_conn *conn2; + struct ksock_peer_ni *peer_ni = NULL; + struct ksock_peer_ni *peer2; + struct ksock_sched *sched; struct ksock_hello_msg *hello; - int cpt; - ksock_tx_t *tx; - ksock_tx_t *txtmp; - int rc; - int rc2; - int active; - char *warn = NULL; + int cpt; + struct ksock_tx *tx; + struct ksock_tx *txtmp; + int rc; + int rc2; + int active; + char *warn = NULL; active = (route != NULL); @@ -1078,7 +1064,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, atomic_set (&conn->ksnc_tx_nob, 0); LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_NUM_INTERFACES])); + kshm_ips[LNET_INTERFACES_NUM])); if (hello == NULL) { rc = -ENOMEM; goto failed_1; @@ -1148,7 +1134,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, write_lock_bh(global_lock); /* called with a ref on ni, so shutdown can't have started */ - LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); peer2 = ksocknal_find_peer_locked(ni, peerid); if (peer2 == NULL) { @@ -1224,7 +1210,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, * loopback connection */ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { list_for_each(tmp, &peer_ni->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || conn2->ksnc_myipaddr != conn->ksnc_myipaddr || @@ -1258,7 +1244,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, * by routes in my peer_ni to match my own route entries so I don't * continually create duplicate routes. 
*/ list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); if (route->ksnr_ipaddr != conn->ksnc_ipaddr) continue; @@ -1268,7 +1254,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, } conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ - peer_ni->ksnp_last_alive = cfs_time_current(); + peer_ni->ksnp_last_alive = ktime_get_seconds(); peer_ni->ksnp_send_keepalive = 0; peer_ni->ksnp_error = 0; @@ -1281,14 +1267,15 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, * The cpt might have changed if we ended up selecting a non cpt * native scheduler. So use the scheduler's cpt instead. */ - cpt = sched->kss_info->ksi_cpt; + cpt = sched->kss_cpt; sched->kss_nconns++; conn->ksnc_scheduler = sched; - conn->ksnc_tx_last_post = cfs_time_current(); + conn->ksnc_tx_last_post = ktime_get_seconds(); /* Set the deadline for the outgoing HELLO to drain */ conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); smp_mb(); /* order with adding to peer_ni's conn list */ list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); @@ -1319,11 +1306,10 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, */ CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d" - " incarnation:%lld sched[%d:%d]\n", + " incarnation:%lld sched[%d]\n", libcfs_id2str(peerid), conn->ksnc_proto->pro_version, &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, - conn->ksnc_port, incarnation, cpt, - (int)(sched - &sched->kss_info->ksi_scheds[0])); + conn->ksnc_port, incarnation, cpt); if (active) { /* additional routes after interface exchange? */ @@ -1336,7 +1322,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, } LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_NUM_INTERFACES])); + kshm_ips[LNET_INTERFACES_NUM])); /* setup the socket AFTER I've received hello (it disables * SO_LINGER). I might call back to the acceptor who may want @@ -1420,7 +1406,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, failed_1: if (hello != NULL) LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_NUM_INTERFACES])); + kshm_ips[LNET_INTERFACES_NUM])); LIBCFS_FREE(conn, sizeof(*conn)); @@ -1430,15 +1416,15 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, } void -ksocknal_close_conn_locked (ksock_conn_t *conn, int error) +ksocknal_close_conn_locked(struct ksock_conn *conn, int error) { /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. 
* Caller holds ksnd_global_lock exclusively in irq context */ - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_route_t *route; - ksock_conn_t *conn2; - struct list_head *tmp; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_route *route; + struct ksock_conn *conn2; + struct list_head *tmp; LASSERT(peer_ni->ksnp_error == 0); LASSERT(!conn->ksnc_closing); @@ -1455,7 +1441,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) conn2 = NULL; list_for_each(tmp, &peer_ni->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn2->ksnc_route == route && conn2->ksnc_type == conn->ksnc_type) @@ -1475,7 +1461,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) /* No more connections to this peer_ni */ if (!list_empty(&peer_ni->ksnp_tx_queue)) { - ksock_tx_t *tx; + struct ksock_tx *tx; LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); @@ -1513,10 +1499,10 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) } void -ksocknal_peer_failed (ksock_peer_ni_t *peer_ni) +ksocknal_peer_failed(struct ksock_peer_ni *peer_ni) { - int notify = 0; - cfs_time_t last_alive = 0; + int notify = 0; + time64_t last_alive = 0; /* There has been a connection failure or comms error; but I'll only * tell LNET I think the peer_ni is dead if it's to another kernel and @@ -1540,12 +1526,12 @@ ksocknal_peer_failed (ksock_peer_ni_t *peer_ni) } void -ksocknal_finalize_zcreq(ksock_conn_t *conn) +ksocknal_finalize_zcreq(struct ksock_conn *conn) { - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_tx_t *tx; - ksock_tx_t *tmp; - struct list_head zlist = LIST_HEAD_INIT(zlist); + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); /* NB safe to finalize TXs because closing of socket will * abort all buffered data */ @@ -1568,7 +1554,7 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn) spin_unlock(&peer_ni->ksnp_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); list_del(&tx->tx_zc_list); ksocknal_tx_decref(tx); @@ -1576,15 +1562,15 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn) } void -ksocknal_terminate_conn(ksock_conn_t *conn) +ksocknal_terminate_conn(struct ksock_conn *conn) { /* This gets called by the reaper (guaranteed thread context) to * disengage the socket from its callbacks and close it. * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. 
*/ - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_sched_t *sched = conn->ksnc_scheduler; - int failed = 0; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_sched *sched = conn->ksnc_scheduler; + int failed = 0; LASSERT(conn->ksnc_closing); @@ -1637,10 +1623,9 @@ ksocknal_terminate_conn(ksock_conn_t *conn) } void -ksocknal_queue_zombie_conn (ksock_conn_t *conn) +ksocknal_queue_zombie_conn(struct ksock_conn *conn) { /* Queue the conn for the reaper to destroy */ - LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -1651,9 +1636,9 @@ ksocknal_queue_zombie_conn (ksock_conn_t *conn) } void -ksocknal_destroy_conn (ksock_conn_t *conn) +ksocknal_destroy_conn(struct ksock_conn *conn) { - cfs_time_t last_rcv; + time64_t last_rcv; /* Final coup-de-grace of the reaper */ CDEBUG (D_NET, "connection %p\n", conn); @@ -1670,16 +1655,18 @@ ksocknal_destroy_conn (ksock_conn_t *conn) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_LNET_PAYLOAD: last_rcv = conn->ksnc_rx_deadline - - cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + lnet_get_lnd_timeout(); CERROR("Completing partial receive from %s[%d], " "ip %pI4h:%d, with error, wanted: %d, left: %d, " - "last alive is %ld secs ago\n", + "last alive is %lld secs ago\n", libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, &conn->ksnc_ipaddr, conn->ksnc_port, conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, - cfs_duration_sec(cfs_time_sub(cfs_time_current(), - last_rcv))); - lnet_finalize(conn->ksnc_cookie, -EIO); + ktime_get_seconds() - last_rcv); + if (conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, -EIO); break; case SOCKNAL_RX_LNET_HEADER: if (conn->ksnc_rx_started) @@ -1715,15 +1702,15 @@ ksocknal_destroy_conn (ksock_conn_t *conn) } int -ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int why) +ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr, int why) { - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; + struct ksock_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); if (ipaddr == 0 || conn->ksnc_ipaddr == ipaddr) { @@ -1736,11 +1723,11 @@ ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int wh } int -ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) +ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) { - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - __u32 ipaddr = conn->ksnc_ipaddr; - int count; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + u32 ipaddr = conn->ksnc_ipaddr; + int count; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1754,13 +1741,13 @@ ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) { - ksock_peer_ni_t *peer_ni; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1774,7 +1761,7 @@ ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, 
&ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); if (!((id.nid == LNET_NID_ANY || id.nid == peer_ni->ksnp_id.nid) && (id.pid == LNET_PID_ANY || id.pid == peer_ni->ksnp_id.pid))) @@ -1818,12 +1805,12 @@ ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive) } void -ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) +ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) { int connect = 1; - cfs_time_t last_alive = 0; - cfs_time_t now = cfs_time_current(); - ksock_peer_ni_t *peer_ni = NULL; + time64_t last_alive = 0; + time64_t now = ktime_get_seconds(); + struct ksock_peer_ni *peer_ni = NULL; rwlock_t *glock = &ksocknal_data.ksnd_global_lock; struct lnet_process_id id = { .nid = nid, @@ -1832,20 +1819,20 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) read_lock(glock); - peer_ni = ksocknal_find_peer_locked(ni, id); - if (peer_ni != NULL) { - struct list_head *tmp; - ksock_conn_t *conn; - int bufnob; + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + struct list_head *tmp; + struct ksock_conn *conn; + int bufnob; list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); bufnob = conn->ksnc_sock->sk->sk_wmem_queued; if (bufnob < conn->ksnc_tx_bufnob) { /* something got ACKed */ - conn->ksnc_tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); peer_ni->ksnp_last_alive = now; conn->ksnc_tx_bufnob = bufnob; } @@ -1861,9 +1848,9 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) if (last_alive != 0) *when = last_alive; - CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago, connect %d\n", + CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago, connect %d\n", libcfs_nid2str(nid), peer_ni, - last_alive ? cfs_duration_sec(now - last_alive) : -1, + last_alive ? 
now - last_alive : -1, connect); if (!connect) @@ -1882,12 +1869,12 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) } static void -ksocknal_push_peer (ksock_peer_ni_t *peer_ni) +ksocknal_push_peer(struct ksock_peer_ni *peer_ni) { - int index; - int i; - struct list_head *tmp; - ksock_conn_t *conn; + int index; + int i; + struct list_head *tmp; + struct ksock_conn *conn; for (index = 0; ; index++) { read_lock(&ksocknal_data.ksnd_global_lock); @@ -1897,8 +1884,8 @@ ksocknal_push_peer (ksock_peer_ni_t *peer_ni) list_for_each(tmp, &peer_ni->ksnp_conns) { if (i++ == index) { - conn = list_entry(tmp, ksock_conn_t, - ksnc_list); + conn = list_entry(tmp, struct ksock_conn, + ksnc_list); ksocknal_conn_addref(conn); break; } @@ -1934,7 +1921,7 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) int peer_off; /* searching offset in peer_ni hash table */ for (peer_off = 0; ; peer_off++) { - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; int i = 0; read_lock(&ksocknal_data.ksnd_global_lock); @@ -1966,15 +1953,15 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) static int ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) { - ksock_net_t *net = ni->ni_data; - ksock_interface_t *iface; + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; int rc; int i; int j; struct list_head *ptmp; - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; struct list_head *rtmp; - ksock_route_t *route; + struct ksock_route *route; if (ipaddress == 0 || netmask == 0) @@ -1986,7 +1973,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) if (iface != NULL) { /* silently ignore dups */ rc = 0; - } else if (net->ksnn_ninterfaces == LNET_NUM_INTERFACES) { + } else if (net->ksnn_ninterfaces == LNET_INTERFACES_NUM) { rc = -ENOSPC; } else { iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; @@ -1998,7 +1985,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) @@ -2007,7 +1994,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) list_for_each(rtmp, &peer_ni->ksnp_routes) { route = list_entry(rtmp, - ksock_route_t, + struct ksock_route, ksnr_list); if (route->ksnr_myipaddr == ipaddress) @@ -2026,14 +2013,14 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) } static void -ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) +ksocknal_peer_del_interface_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr) { - struct list_head *tmp; - struct list_head *nxt; - ksock_route_t *route; - ksock_conn_t *conn; - int i; - int j; + struct list_head *tmp; + struct list_head *nxt; + struct ksock_route *route; + struct ksock_conn *conn; + int i; + int j; for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) if (peer_ni->ksnp_passive_ips[i] == ipaddr) { @@ -2045,7 +2032,7 @@ ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); if (route->ksnr_myipaddr != ipaddr) continue; @@ -2059,7 +2046,7 @@ ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 
ipaddr) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn->ksnc_myipaddr == ipaddr) ksocknal_close_conn_locked (conn, 0); @@ -2069,14 +2056,14 @@ ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) static int ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) { - ksock_net_t *net = ni->ni_data; - int rc = -ENOENT; - struct list_head *tmp; - struct list_head *nxt; - ksock_peer_ni_t *peer_ni; - __u32 this_ip; - int i; - int j; + struct ksock_net *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + struct ksock_peer_ni *peer_ni; + u32 this_ip; + int i; + int j; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -2097,9 +2084,9 @@ ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { list_for_each_safe(tmp, nxt, - &ksocknal_data.ksnd_peers[j]) { - peer_ni = list_entry(tmp, ksock_peer_ni_t, - ksnp_list); + &ksocknal_data.ksnd_peers[j]) { + peer_ni = list_entry(tmp, struct ksock_peer_ni, + ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -2123,8 +2110,8 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) switch(cmd) { case IOC_LIBCFS_GET_INTERFACE: { - ksock_net_t *net = ni->ni_data; - ksock_interface_t *iface; + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; read_lock(&ksocknal_data.ksnd_global_lock); @@ -2193,7 +2180,7 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) int txmem; int rxmem; int nagle; - ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); + struct ksock_conn *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); if (conn == NULL) return -ENOENT; @@ -2207,7 +2194,7 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) data->ioc_u32[1] = conn->ksnc_port; data->ioc_u32[2] = conn->ksnc_myipaddr; data->ioc_u32[3] = conn->ksnc_type; - data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_cpt; data->ioc_u32[5] = rxmem; data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; ksocknal_conn_decref(conn); @@ -2246,19 +2233,8 @@ ksocknal_free_buffers (void) { LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); - if (ksocknal_data.ksnd_sched_info != NULL) { - struct ksock_sched_info *info; - int i; - - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - if (info->ksi_scheds != NULL) { - LIBCFS_FREE(info->ksi_scheds, - info->ksi_nthreads_max * - sizeof(info->ksi_scheds[0])); - } - } - cfs_percpt_free(ksocknal_data.ksnd_sched_info); - } + if (ksocknal_data.ksnd_schedulers != NULL) + cfs_percpt_free(ksocknal_data.ksnd_schedulers); LIBCFS_FREE (ksocknal_data.ksnd_peers, sizeof(struct list_head) * @@ -2267,15 +2243,15 @@ ksocknal_free_buffers (void) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - struct list_head zlist; - ksock_tx_t *tx; + struct list_head zlist; + struct ksock_tx *tx; list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); list_del_init(&ksocknal_data.ksnd_idle_noop_txs); spin_unlock(&ksocknal_data.ksnd_tx_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, ksock_tx_t, tx_list); + tx = list_entry(zlist.next, struct ksock_tx, tx_list); list_del(&tx->tx_list); LIBCFS_FREE(tx, tx->tx_desc_size); } @@ -2287,26 +2263,23 @@ ksocknal_free_buffers (void) static void ksocknal_base_shutdown(void) { - struct ksock_sched_info *info; 
- ksock_sched_t *sched; - int i; - int j; + struct ksock_sched *sched; + int i; CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&libcfs_kmemory)); LASSERT (ksocknal_data.ksnd_nnets == 0); - switch (ksocknal_data.ksnd_init) { - default: - LASSERT (0); - /* Fall through */ + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + /* fallthrough */ - case SOCKNAL_INIT_ALL: - case SOCKNAL_INIT_DATA: - LASSERT (ksocknal_data.ksnd_peers != NULL); - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT(ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); - } LASSERT(list_empty(&ksocknal_data.ksnd_nets)); LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); @@ -2314,23 +2287,14 @@ ksocknal_base_shutdown(void) LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); - if (ksocknal_data.ksnd_sched_info != NULL) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (info->ksi_scheds == NULL) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) { - sched = &info->ksi_scheds[j]; - LASSERT(list_empty(&sched->\ - kss_tx_conns)); - LASSERT(list_empty(&sched->\ - kss_rx_conns)); - LASSERT(list_empty(&sched-> \ - kss_zombie_noop_txs)); - LASSERT(sched->kss_nconns == 0); - } + LASSERT(list_empty(&sched->kss_tx_conns)); + LASSERT(list_empty(&sched->kss_rx_conns)); + LASSERT(list_empty(&sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); } } @@ -2339,17 +2303,10 @@ ksocknal_base_shutdown(void) wake_up_all(&ksocknal_data.ksnd_connd_waitq); wake_up_all(&ksocknal_data.ksnd_reaper_waitq); - if (ksocknal_data.ksnd_sched_info != NULL) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (info->ksi_scheds == NULL) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { - sched = &info->ksi_scheds[j]; + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) wake_up_all(&sched->kss_waitq); - } - } } i = 4; @@ -2382,9 +2339,9 @@ ksocknal_base_shutdown(void) static int ksocknal_base_startup(void) { - struct ksock_sched_info *info; - int rc; - int i; + struct ksock_sched *sched; + int rc; + int i; LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); LASSERT (ksocknal_data.ksnd_nnets == 0); @@ -2424,50 +2381,43 @@ ksocknal_base_startup(void) ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; try_module_get(THIS_MODULE); - ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*info)); - if (ksocknal_data.ksnd_sched_info == NULL) + /* Create a scheduler block per available CPT */ + ksocknal_data.ksnd_schedulers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (ksocknal_data.ksnd_schedulers == NULL) goto failed; - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - ksock_sched_t *sched; - int nthrs; + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + int nthrs; + /* + * make sure not to allocate more threads than there are + * cores/CPUs in teh CPT + */ nthrs = cfs_cpt_weight(lnet_cpt_table(), i); if (*ksocknal_tunables.ksnd_nscheds > 0) { nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); } else { - /* max to half of CPUs, assume another half should be - * reserved for upper layer modules */ + /* + * 
max to half of CPUs, assume another half should be + * reserved for upper layer modules + */ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); } - info->ksi_nthreads_max = nthrs; - info->ksi_cpt = i; - - if (nthrs != 0) { - LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, - info->ksi_nthreads_max * - sizeof(*sched)); - if (info->ksi_scheds == NULL) - goto failed; - - for (; nthrs > 0; nthrs--) { - sched = &info->ksi_scheds[nthrs - 1]; - - sched->kss_info = info; - spin_lock_init(&sched->kss_lock); - INIT_LIST_HEAD(&sched->kss_rx_conns); - INIT_LIST_HEAD(&sched->kss_tx_conns); - INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); - init_waitqueue_head(&sched->kss_waitq); - } - } + sched->kss_nthreads_max = nthrs; + sched->kss_cpt = i; + + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); } ksocknal_data.ksnd_connd_starting = 0; ksocknal_data.ksnd_connd_failed_stamp = 0; - ksocknal_data.ksnd_connd_starting_stamp = cfs_time_current_sec(); + ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); /* must have at least 2 connds to remain responsive to accepts while * connecting */ if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) @@ -2517,15 +2467,15 @@ ksocknal_base_startup(void) static void ksocknal_debug_peerhash(struct lnet_ni *ni) { - ksock_peer_ni_t *peer_ni = NULL; - struct list_head *tmp; - int i; + struct ksock_peer_ni *peer_ni = NULL; + struct list_head *tmp; + int i; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); if (peer_ni->ksnp_ni == ni) break; @@ -2534,8 +2484,8 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) } if (peer_ni != NULL) { - ksock_route_t *route; - ksock_conn_t *conn; + struct ksock_route *route; + struct ksock_conn *conn; CWARN ("Active peer_ni on shutdown: %s, ref %d, scnt %d, " "closing %d, accepting %d, err %d, zcookie %llu, " @@ -2548,7 +2498,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) !list_empty(&peer_ni->ksnp_zc_req_list)); list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " "del %d\n", atomic_read(&route->ksnr_refcount), route->ksnr_scheduled, route->ksnr_connecting, @@ -2556,7 +2506,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) } list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", atomic_read(&conn->ksnc_conn_refcount), atomic_read(&conn->ksnc_sock_refcount), @@ -2571,7 +2521,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) void ksocknal_shutdown(struct lnet_ni *ni) { - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; struct lnet_process_id anyid = { .nid = LNET_NID_ANY, .pid = LNET_PID_ANY, @@ -2621,17 +2571,17 @@ ksocknal_shutdown(struct lnet_ni *ni) } static int -ksocknal_search_new_ipif(ksock_net_t *net) +ksocknal_search_new_ipif(struct ksock_net *net) { - int new_ipif = 0; - int i; + int new_ipif = 0; + int i; for (i = 0; i < net->ksnn_ninterfaces; i++) { - char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; - char *colon = 
strchr(ifnam, ':'); - int found = 0; - ksock_net_t *tmp; - int j; + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + struct ksock_net *tmp; + int j; if (colon != NULL) /* ignore alias device */ *colon = 0; @@ -2663,36 +2613,35 @@ ksocknal_search_new_ipif(ksock_net_t *net) } static int -ksocknal_start_schedulers(struct ksock_sched_info *info) +ksocknal_start_schedulers(struct ksock_sched *sched) { int nthrs; int rc = 0; int i; - if (info->ksi_nthreads == 0) { + if (sched->kss_nthreads == 0) { if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = info->ksi_nthreads_max; + nthrs = sched->kss_nthreads_max; } else { nthrs = cfs_cpt_weight(lnet_cpt_table(), - info->ksi_cpt); + sched->kss_cpt); nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); } - nthrs = min(nthrs, info->ksi_nthreads_max); + nthrs = min(nthrs, sched->kss_nthreads_max); } else { - LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + LASSERT(sched->kss_nthreads <= sched->kss_nthreads_max); /* increase two threads if there is new interface */ - nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + nthrs = min(2, sched->kss_nthreads_max - sched->kss_nthreads); } for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - ksock_sched_t *sched; - id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + long id; + char name[20]; + + id = KSOCK_THREAD_ID(sched->kss_cpt, sched->kss_nthreads + i); snprintf(name, sizeof(name), "socknal_sd%02d_%02d", - info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + sched->kss_cpt, (int)KSOCK_THREAD_SID(id)); rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id, name); @@ -2700,35 +2649,35 @@ ksocknal_start_schedulers(struct ksock_sched_info *info) continue; CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - info->ksi_cpt, info->ksi_nthreads + i, rc); + sched->kss_cpt, (int) KSOCK_THREAD_SID(id), rc); break; } - info->ksi_nthreads += i; + sched->kss_nthreads += i; return rc; } static int -ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) { - int newif = ksocknal_search_new_ipif(net); - int rc; - int i; + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) return -EINVAL; for (i = 0; i < ncpts; i++) { - struct ksock_sched_info *info; + struct ksock_sched *sched; int cpt = (cpts == NULL) ? 
i : cpts[i]; LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); - info = ksocknal_data.ksnd_sched_info[cpt]; + sched = ksocknal_data.ksnd_schedulers[cpt]; - if (!newif && info->ksi_nthreads > 0) + if (!newif && sched->kss_nthreads > 0) continue; - rc = ksocknal_start_schedulers(info); + rc = ksocknal_start_schedulers(sched); if (rc != 0) return rc; } @@ -2738,8 +2687,9 @@ ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) int ksocknal_startup(struct lnet_ni *ni) { - ksock_net_t *net; - ksock_interface_t *ksi = NULL; + struct ksock_net *net; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + struct ksock_interface *ksi = NULL; struct lnet_inetdev *ifaces = NULL; int i = 0; int rc; @@ -2759,18 +2709,28 @@ ksocknal_startup(struct lnet_ni *ni) spin_lock_init(&net->ksnn_lock); net->ksnn_incarnation = ktime_get_real_ns(); ni->ni_data = net; - if (!ni->ni_net->net_tunables_set) { - ni->ni_net->net_tunables.lct_peer_timeout = + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = *ksocknal_tunables.ksnd_peertimeout; - ni->ni_net->net_tunables.lct_max_tx_credits = + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = *ksocknal_tunables.ksnd_credits; - ni->ni_net->net_tunables.lct_peer_tx_credits = + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = *ksocknal_tunables.ksnd_peertxcredits; - ni->ni_net->net_tunables.lct_peer_rtr_credits = - *ksocknal_tunables.ksnd_peerrtrcredits; - ni->ni_net->net_tunables_set = true; - } + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = + *ksocknal_tunables.ksnd_peerrtrcredits; rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); if (rc < 0) @@ -2797,13 +2757,13 @@ ksocknal_startup(struct lnet_ni *ni) * should exist. Each IP alias should be mapped to * each 'struct net_ni'. */ - for (i = 0; i < LNET_NUM_INTERFACES; i++) { + for (i = 0; i < LNET_INTERFACES_NUM; i++) { int j; if (!ni->ni_interfaces[i]) break; - for (j = 0; j < LNET_NUM_INTERFACES; j++) { + for (j = 0; j < LNET_INTERFACES_NUM; j++) { if (i != j && ni->ni_interfaces[j] && strcmp(ni->ni_interfaces[i], ni->ni_interfaces[j]) == 0) { diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h index 12d6cb83ef4ac..cbc40f7347d4d 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Author: Zach Brown * Author: Peter J. 
Braam @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -50,13 +51,9 @@ #include #include -#include -#include #include #include -#include - #ifdef HAVE_TCP_SENDPAGE_USE_SOCKET # define cfs_tcp_sendpage(sk, page, offset, size, flags) \ tcp_sendpage((sk)->sk_socket, page, offset, size, flags) @@ -65,6 +62,8 @@ tcp_sendpage(sk, page, offset, size, flags) #endif /* HAVE_TCP_SENDPAGE_USE_SOCKET */ +#include + #ifndef NETIF_F_CSUM_MASK # define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM #endif @@ -76,7 +75,7 @@ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer_ni lists */ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ #define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ -#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ +#define SOCKNAL_ENOMEM_RETRY 1 /* seconds between retries */ #define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ #define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ @@ -91,33 +90,25 @@ # define SOCKNAL_RISK_KMAP_DEADLOCK 1 #endif -struct ksock_sched_info; - -typedef struct /* per scheduler state */ -{ - spinlock_t kss_lock; /* serialise */ - struct list_head kss_rx_conns; /* conn waiting to be read */ +/* per scheduler state */ +struct ksock_sched { + /* serialise */ + spinlock_t kss_lock; /* conn waiting to be written */ - struct list_head kss_tx_conns; + struct list_head kss_rx_conns; + struct list_head kss_tx_conns; /* zombie noop tx list */ - struct list_head kss_zombie_noop_txs; - wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + struct list_head kss_zombie_noop_txs; + /* where scheduler sleeps */ + wait_queue_head_t kss_waitq; /* # connections assigned to this scheduler */ - int kss_nconns; - struct ksock_sched_info *kss_info; /* owner of it */ -#if !SOCKNAL_SINGLE_FRAG_RX - struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; -#endif -#if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX - struct kvec kss_scratch_iov[LNET_MAX_IOV]; -#endif -} ksock_sched_t; - -struct ksock_sched_info { - int ksi_nthreads_max; /* max allowed threads */ - int ksi_nthreads; /* number of threads */ - int ksi_cpt; /* CPT id */ - ksock_sched_t *ksi_scheds; /* array of schedulers */ + int kss_nconns; + /* max allowed threads */ + int kss_nthreads_max; + /* number of threads */ + int kss_nthreads; + /* CPT id */ + int kss_cpt; }; #define KSOCK_CPT_SHIFT 16 @@ -125,17 +116,15 @@ struct ksock_sched_info { #define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) #define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) -typedef struct /* in-use interface */ -{ +struct ksock_interface { /* in-use interface */ __u32 ksni_ipaddr; /* interface's IP address */ __u32 ksni_netmask; /* interface's network mask */ int ksni_nroutes; /* # routes using (active) */ int ksni_npeers; /* # peers using (passive) */ char ksni_name[IFNAMSIZ]; /* interface name */ -} ksock_interface_t; +}; -typedef struct -{ +struct ksock_tunables { /* "stuck" socket timeout (seconds) */ int *ksnd_timeout; /* # scheduler threads in each pool while starting */ @@ -175,26 +164,24 @@ typedef struct #if SOCKNAL_VERSION_DEBUG int *ksnd_protocol; /* protocol version */ #endif -} ksock_tunables_t; +}; -typedef struct -{ +struct ksock_net { __u64 ksnn_incarnation; /* my epoch */ spinlock_t ksnn_lock; /* serialise */ struct list_head ksnn_list; /* chain on global list */ int ksnn_npeers; /* # peers */ int ksnn_shutdown; /* shutting down? 
*/ int ksnn_ninterfaces; /* IP interfaces */ - ksock_interface_t ksnn_interfaces[LNET_NUM_INTERFACES]; -} ksock_net_t; + struct ksock_interface ksnn_interfaces[LNET_INTERFACES_NUM]; +}; /** connd timeout */ #define SOCKNAL_CONND_TIMEOUT 120 /** reserved thread for accepting & creating new connd */ #define SOCKNAL_CONND_RESV 1 -typedef struct -{ +struct ksock_nal_data { int ksnd_init; /* initialisation state */ int ksnd_nnets; /* # networks set up */ struct list_head ksnd_nets; /* list of nets */ @@ -207,7 +194,7 @@ typedef struct int ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ /* schedulers information */ - struct ksock_sched_info **ksnd_sched_info; + struct ksock_sched **ksnd_schedulers; atomic_t ksnd_nactive_txs; /* #active txs */ @@ -220,7 +207,7 @@ typedef struct /* reaper sleeps here */ wait_queue_head_t ksnd_reaper_waitq; /* when reaper will wake */ - cfs_time_t ksnd_reaper_waketime; + time64_t ksnd_reaper_waketime; /* serialise */ spinlock_t ksnd_reaper_lock; @@ -237,11 +224,11 @@ typedef struct /* # connds connecting */ int ksnd_connd_connecting; /** time stamp of the last failed connecting attempt */ - long ksnd_connd_failed_stamp; + time64_t ksnd_connd_failed_stamp; /** # starting connd */ unsigned ksnd_connd_starting; /** time stamp of the last starting connd */ - long ksnd_connd_starting_stamp; + time64_t ksnd_connd_starting_stamp; /** # running connd */ unsigned ksnd_connd_running; /* serialise */ @@ -251,8 +238,7 @@ typedef struct struct list_head ksnd_idle_noop_txs; /* serialise, g_lock unsafe */ spinlock_t ksnd_tx_lock; - -} ksock_nal_data_t; +}; #define SOCKNAL_INIT_NOTHING 0 #define SOCKNAL_INIT_DATA 1 @@ -272,8 +258,7 @@ struct ksock_peer; /* forward ref */ struct ksock_route; /* forward ref */ struct ksock_proto; /* forward ref */ -typedef struct /* transmit packet */ -{ +struct ksock_tx { /* transmit packet */ struct list_head tx_list; /* queue on conn for transmission etc */ struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ atomic_t tx_refcount; /* tx reference count */ @@ -289,9 +274,10 @@ typedef struct /* transmit packet */ lnet_kiov_t *tx_kiov; /* packet page frags */ struct ksock_conn *tx_conn; /* owning conn */ struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ - cfs_time_t tx_deadline; /* when (in jiffies) tx times out */ - struct ksock_msg tx_msg; /* socklnd message buffer */ + time64_t tx_deadline; /* when (in secs) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ int tx_desc_size; /* size of this descriptor */ + enum lnet_msg_hstatus tx_hstatus; /* health status of tx */ union { struct { struct kvec iov; /* virt hdr */ @@ -301,18 +287,18 @@ typedef struct /* transmit packet */ struct kvec iov[1]; /* virt hdr + payload */ } virt; } tx_frags; -} ksock_tx_t; +}; -#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(struct ksock_tx, tx_frags.paged.kiov[0])) -/* network zero copy callback descriptor embedded in ksock_tx_t */ +/* network zero copy callback descriptor embedded in struct ksock_tx */ /* space for the rx frag descriptors; we either read a single contiguous * header, or up to LNET_MAX_IOV frags of payload of either type. 
*/ -typedef union { - struct kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; -} ksock_rxiovspace_t; +union ksock_rxiovspace { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +}; #define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ #define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ @@ -321,17 +307,16 @@ typedef union { #define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ #define SOCKNAL_RX_SLOP 6 /* skipping body */ -typedef struct ksock_conn -{ - struct ksock_peer *ksnc_peer; /* owning peer_ni */ - struct ksock_route *ksnc_route; /* owning route */ +struct ksock_conn { + struct ksock_peer_ni *ksnc_peer; /* owning peer_ni */ + struct ksock_route *ksnc_route; /* owning route */ struct list_head ksnc_list; /* stash on peer_ni's conn list */ struct socket *ksnc_sock; /* actual socket */ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ void *ksnc_saved_write_space; /* socket's original write_space() callback */ atomic_t ksnc_conn_refcount; /* conn refcount */ atomic_t ksnc_sock_refcount; /* sock refcount */ - ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + struct ksock_sched *ksnc_scheduler; /* who schedules this connection */ __u32 ksnc_myipaddr; /* my IP */ __u32 ksnc_ipaddr; /* peer_ni's IP */ int ksnc_port; /* peer_ni's port */ @@ -346,7 +331,7 @@ typedef struct ksock_conn /* where I enq waiting input or a forwarding descriptor */ struct list_head ksnc_rx_list; - cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */ + time64_t ksnc_rx_deadline; /* when (in seconds) receive times out */ __u8 ksnc_rx_started; /* started receiving a message */ __u8 ksnc_rx_ready; /* data ready to read */ __u8 ksnc_rx_scheduled;/* being progressed */ @@ -357,9 +342,9 @@ typedef struct ksock_conn struct kvec *ksnc_rx_iov; /* the kvec frags */ int ksnc_rx_nkiov; /* # page frags */ lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ - ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + union ksock_rxiovspace ksnc_rx_iov_space;/* space for frag descriptors */ __u32 ksnc_rx_csum; /* partial checksum for incoming data */ - void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + struct lnet_msg *ksnc_lnet_msg; /* rx lnet_finalize arg*/ struct ksock_msg ksnc_msg; /* incoming message buffer: * V2.x message takes the * whole struct @@ -373,9 +358,9 @@ typedef struct ksock_conn /* packets waiting to be sent */ struct list_head ksnc_tx_queue; /* next TX that can carry a LNet message or ZC-ACK */ - ksock_tx_t *ksnc_tx_carrier; - /* when (in jiffies) tx times out */ - cfs_time_t ksnc_tx_deadline; + struct ksock_tx *ksnc_tx_carrier; + /* when (in seconds) tx times out */ + time64_t ksnc_tx_deadline; /* send buffer marker */ int ksnc_tx_bufnob; /* # bytes queued */ @@ -385,17 +370,16 @@ typedef struct ksock_conn /* being progressed */ int ksnc_tx_scheduled; /* time stamp of the last posted TX */ - cfs_time_t ksnc_tx_last_post; -} ksock_conn_t; + time64_t ksnc_tx_last_post; +}; -typedef struct ksock_route -{ +struct ksock_route { struct list_head ksnr_list; /* chain on peer_ni route list */ struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ - struct ksock_peer *ksnr_peer; /* owning peer_ni */ + struct ksock_peer_ni *ksnr_peer; /* owning peer_ni */ atomic_t ksnr_refcount; /* # users */ - cfs_time_t ksnr_timeout; /* when (in jiffies) reconnection can happen next */ - cfs_duration_t ksnr_retry_interval; /* how long between retries */ + time64_t 
ksnr_timeout; /* when (in secs) reconnection can happen next */ + time64_t ksnr_retry_interval; /* how long between retries */ __u32 ksnr_myipaddr; /* my IP */ __u32 ksnr_ipaddr; /* IP address to connect to */ int ksnr_port; /* port to connect to */ @@ -405,14 +389,13 @@ typedef struct ksock_route unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ unsigned int ksnr_share_count; /* created explicitly? */ int ksnr_conn_count; /* # conns established by this route */ -} ksock_route_t; +}; #define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ -typedef struct ksock_peer -{ +struct ksock_peer_ni { struct list_head ksnp_list; /* stash on global peer_ni list */ - cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ + time64_t ksnp_last_alive;/* when (in seconds) I was last alive */ struct lnet_process_id ksnp_id; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_sharecount; /* lconf usage counter */ @@ -428,50 +411,48 @@ typedef struct ksock_peer spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ /* zero copy requests wait for ACK */ struct list_head ksnp_zc_req_list; - cfs_time_t ksnp_send_keepalive; /* time to send keepalive */ + time64_t ksnp_send_keepalive; /* time to send keepalive */ struct lnet_ni *ksnp_ni; /* which network */ int ksnp_n_passive_ips; /* # of... */ - __u32 ksnp_passive_ips[LNET_NUM_INTERFACES]; /* preferred local interfaces */ -} ksock_peer_ni_t; + __u32 ksnp_passive_ips[LNET_INTERFACES_NUM]; /* preferred local interfaces */ +}; -typedef struct ksock_connreq -{ +struct ksock_connreq { /* stash on ksnd_connd_connreqs */ struct list_head ksncr_list; /* chosen NI */ struct lnet_ni *ksncr_ni; /* accepted socket */ struct socket *ksncr_sock; -} ksock_connreq_t; +}; -extern ksock_nal_data_t ksocknal_data; -extern ksock_tunables_t ksocknal_tunables; +extern struct ksock_nal_data ksocknal_data; +extern struct ksock_tunables ksocknal_tunables; #define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ #define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ #define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ -typedef struct ksock_proto -{ +struct ksock_proto { int pro_version; /* version number of protocol */ - int (*pro_send_hello)(ksock_conn_t *, struct ksock_hello_msg *); /* handshake function */ - int (*pro_recv_hello)(ksock_conn_t *, struct ksock_hello_msg *, int);/* handshake function */ - void (*pro_pack)(ksock_tx_t *); /* message pack */ + int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(struct ksock_tx *); /* message pack */ void (*pro_unpack)(struct ksock_msg *); /* message unpack */ - ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ - int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ - int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ - int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ - int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(struct ksock_conn 
*, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); /* msg type matches the connection type: * return value: * return MATCH_NO : no * return MATCH_YES : matching type * return MATCH_MAY : can be backup */ -} ksock_proto_t; +}; -extern ksock_proto_t ksocknal_protocol_v1x; -extern ksock_proto_t ksocknal_protocol_v2x; -extern ksock_proto_t ksocknal_protocol_v3x; +extern struct ksock_proto ksocknal_protocol_v1x; +extern struct ksock_proto ksocknal_protocol_v2x; +extern struct ksock_proto ksocknal_protocol_v3x; #define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR #define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR @@ -513,27 +494,27 @@ ksocknal_nid2peerlist (lnet_nid_t nid) } static inline void -ksocknal_conn_addref (ksock_conn_t *conn) +ksocknal_conn_addref(struct ksock_conn *conn) { - LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); atomic_inc(&conn->ksnc_conn_refcount); } -extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn); -extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); +extern void ksocknal_queue_zombie_conn(struct ksock_conn *conn); +extern void ksocknal_finalize_zcreq(struct ksock_conn *conn); static inline void -ksocknal_conn_decref (ksock_conn_t *conn) +ksocknal_conn_decref(struct ksock_conn *conn) { - LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) ksocknal_queue_zombie_conn(conn); } static inline int -ksocknal_connsock_addref (ksock_conn_t *conn) +ksocknal_connsock_addref(struct ksock_conn *conn) { - int rc = -ESHUTDOWN; + int rc = -ESHUTDOWN; read_lock(&ksocknal_data.ksnd_global_lock); if (!conn->ksnc_closing) { @@ -547,9 +528,9 @@ ksocknal_connsock_addref (ksock_conn_t *conn) } static inline void -ksocknal_connsock_decref (ksock_conn_t *conn) +ksocknal_connsock_decref(struct ksock_conn *conn) { - LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { LASSERT (conn->ksnc_closing); sock_release(conn->ksnc_sock); @@ -559,55 +540,55 @@ ksocknal_connsock_decref (ksock_conn_t *conn) } static inline void -ksocknal_tx_addref (ksock_tx_t *tx) +ksocknal_tx_addref(struct ksock_tx *tx) { - LASSERT (atomic_read(&tx->tx_refcount) > 0); + LASSERT(atomic_read(&tx->tx_refcount) > 0); atomic_inc(&tx->tx_refcount); } -extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx); -extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int error); +extern void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error); static inline void -ksocknal_tx_decref (ksock_tx_t *tx) +ksocknal_tx_decref(struct ksock_tx *tx) { - LASSERT (atomic_read(&tx->tx_refcount) > 0); + LASSERT(atomic_read(&tx->tx_refcount) > 0); if (atomic_dec_and_test(&tx->tx_refcount)) ksocknal_tx_done(NULL, tx, 0); } static inline void -ksocknal_route_addref (ksock_route_t *route) +ksocknal_route_addref(struct ksock_route *route) { - LASSERT (atomic_read(&route->ksnr_refcount) > 0); + LASSERT(atomic_read(&route->ksnr_refcount) > 0); atomic_inc(&route->ksnr_refcount); } -extern void ksocknal_destroy_route (ksock_route_t *route); +extern void ksocknal_destroy_route(struct ksock_route *route); static inline 
void -ksocknal_route_decref (ksock_route_t *route) +ksocknal_route_decref(struct ksock_route *route) { - LASSERT (atomic_read (&route->ksnr_refcount) > 0); + LASSERT(atomic_read(&route->ksnr_refcount) > 0); if (atomic_dec_and_test(&route->ksnr_refcount)) ksocknal_destroy_route (route); } static inline void -ksocknal_peer_addref (ksock_peer_ni_t *peer_ni) +ksocknal_peer_addref(struct ksock_peer_ni *peer_ni) { - LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); + LASSERT(atomic_read(&peer_ni->ksnp_refcount) > 0); atomic_inc(&peer_ni->ksnp_refcount); } -extern void ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni); +extern void ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni); static inline void -ksocknal_peer_decref (ksock_peer_ni_t *peer_ni) +ksocknal_peer_decref(struct ksock_peer_ni *peer_ni) { LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); if (atomic_dec_and_test(&peer_ni->ksnp_refcount)) - ksocknal_destroy_peer (peer_ni); + ksocknal_destroy_peer(peer_ni); } int ksocknal_startup(struct lnet_ni *ni); @@ -622,73 +603,77 @@ int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, int port); -ksock_peer_ni_t *ksocknal_find_peer_locked(struct lnet_ni *ni, +struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id); -ksock_peer_ni_t *ksocknal_find_peer(struct lnet_ni *ni, +struct ksock_peer_ni *ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id); -extern void ksocknal_peer_failed (ksock_peer_ni_t *peer_ni); -extern int ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, +extern void ksocknal_peer_failed(struct ksock_peer_ni *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, struct socket *sock, int type); -extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); -extern void ksocknal_terminate_conn (ksock_conn_t *conn); -extern void ksocknal_destroy_conn (ksock_conn_t *conn); -extern int ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, - __u32 ipaddr, int why); -extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); +extern void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); +extern void ksocknal_terminate_conn(struct ksock_conn *conn); +extern void ksocknal_destroy_conn(struct ksock_conn *conn); +extern int ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); -extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, - ksock_tx_t *tx, int nonblk); +extern struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, + struct ksock_tx *tx, int nonblk); -extern int ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, +extern int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, struct lnet_process_id id); -extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); -extern void ksocknal_free_tx (ksock_tx_t *tx); -extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); -extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); -extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); +extern struct ksock_tx *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(struct ksock_tx *tx); +extern struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern 
void ksocknal_next_tx_carrier(struct ksock_conn *conn); +extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn); extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error); extern void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); -extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); +extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when); extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); -extern void ksocknal_thread_fini (void); -extern void ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni); -extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni); -extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni); -extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); -extern int ksocknal_scheduler (void *arg); -extern int ksocknal_connd (void *arg); -extern int ksocknal_reaper (void *arg); -int ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, +extern void ksocknal_thread_fini(void); +extern void ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni); +extern int ksocknal_new_packet(struct ksock_conn *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, lnet_nid_t peer_nid, struct ksock_hello_msg *hello); -int ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, +int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, struct ksock_hello_msg *hello, struct lnet_process_id *id, __u64 *incarnation); -extern void ksocknal_read_callback(ksock_conn_t *conn); -extern void ksocknal_write_callback(ksock_conn_t *conn); +extern void ksocknal_read_callback(struct ksock_conn *conn); +extern void ksocknal_write_callback(struct ksock_conn *conn); -extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); -extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); -extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); +extern int ksocknal_lib_zc_capable(struct ksock_conn *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); extern void ksocknal_lib_reset_callback(struct socket *sock, - ksock_conn_t *conn); -extern void ksocknal_lib_push_conn(ksock_conn_t *conn); -extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); + struct ksock_conn *conn); +extern void ksocknal_lib_push_conn(struct ksock_conn *conn); +extern int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); extern int ksocknal_lib_setup_sock(struct socket *so); -extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); -extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); -extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); -extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); -extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); -extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, +extern int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec 
*scratch_iov); +extern int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern void ksocknal_lib_eager_ack(struct ksock_conn *conn); +extern int ksocknal_lib_recv_iov(struct ksock_conn *conn, + struct kvec *scratchiov); +extern int ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov); +extern int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle); extern int ksocknal_tunables_init(void); -extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); +extern void ksocknal_lib_csum_tx(struct ksock_tx *tx); -extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_memory_pressure(struct ksock_conn *conn); extern int ksocknal_lib_bind_thread_to_cpu(int id); #endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c index 83c6a2da2f4ae..1da3fe51398ca 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Author: Zach Brown * Author: Peter J. Braam @@ -26,10 +26,10 @@ #include "socklnd.h" -ksock_tx_t * +struct ksock_tx * ksocknal_alloc_tx(int type, int size) { - ksock_tx_t *tx = NULL; + struct ksock_tx *tx = NULL; if (type == KSOCK_MSG_NOOP) { LASSERT(size == KSOCK_NOOP_TX_SIZE); @@ -38,8 +38,8 @@ ksocknal_alloc_tx(int type, int size) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ - next, ksock_tx_t, tx_list); + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, + struct ksock_tx, tx_list); LASSERT(tx->tx_desc_size == size); list_del(&tx->tx_list); } @@ -57,6 +57,7 @@ ksocknal_alloc_tx(int type, int size) tx->tx_zc_aborted = 0; tx->tx_zc_capable = 0; tx->tx_zc_checked = 0; + tx->tx_hstatus = LNET_MSG_STATUS_OK; tx->tx_desc_size = size; atomic_inc(&ksocknal_data.ksnd_nactive_txs); @@ -64,10 +65,10 @@ ksocknal_alloc_tx(int type, int size) return tx; } -ksock_tx_t * +struct ksock_tx * ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) { - ksock_tx_t *tx; + struct ksock_tx *tx; tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); if (tx == NULL) { @@ -93,7 +94,7 @@ ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) void -ksocknal_free_tx (ksock_tx_t *tx) +ksocknal_free_tx(struct ksock_tx *tx) { atomic_dec(&ksocknal_data.ksnd_nactive_txs); @@ -110,82 +111,85 @@ ksocknal_free_tx (ksock_tx_t *tx) } static int -ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { struct kvec *iov = tx->tx_iov; - int nob; - int rc; + int nob; + int rc; - LASSERT (tx->tx_niov > 0); + LASSERT(tx->tx_niov > 0); - /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ - rc = ksocknal_lib_send_iov(conn, tx); + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx, scratch_iov); - if (rc <= 0) /* sent nothing? */ - return (rc); + if (rc <= 0) /* sent nothing? 
*/ + return rc; - nob = rc; - LASSERT (nob <= tx->tx_resid); - tx->tx_resid -= nob; + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; - /* "consume" iov */ - do { - LASSERT (tx->tx_niov > 0); + /* "consume" iov */ + do { + LASSERT(tx->tx_niov > 0); - if (nob < (int) iov->iov_len) { + if (nob < (int) iov->iov_len) { iov->iov_base += nob; - iov->iov_len -= nob; - return (rc); - } + iov->iov_len -= nob; + return rc; + } - nob -= iov->iov_len; - tx->tx_iov = ++iov; - tx->tx_niov--; - } while (nob != 0); + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); - return (rc); + return rc; } static int -ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { - lnet_kiov_t *kiov = tx->tx_kiov; - int nob; - int rc; + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; - LASSERT (tx->tx_niov == 0); - LASSERT (tx->tx_nkiov > 0); + LASSERT(tx->tx_niov == 0); + LASSERT(tx->tx_nkiov > 0); - /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ - rc = ksocknal_lib_send_kiov(conn, tx); + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx, scratch_iov); - if (rc <= 0) /* sent nothing? */ - return (rc); + if (rc <= 0) /* sent nothing? */ + return rc; - nob = rc; - LASSERT (nob <= tx->tx_resid); - tx->tx_resid -= nob; + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; - /* "consume" kiov */ - do { - LASSERT(tx->tx_nkiov > 0); + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); - if (nob < (int)kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return rc; - } + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } - nob -= (int)kiov->kiov_len; - tx->tx_kiov = ++kiov; - tx->tx_nkiov--; - } while (nob != 0); + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); - return (rc); + return rc; } static int -ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { int rc; int bufnob; @@ -197,214 +201,223 @@ ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx) LASSERT(tx->tx_resid != 0); - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - return (-ESHUTDOWN); - } + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } - do { - if (ksocknal_data.ksnd_enomem_tx > 0) { - /* testing... */ - ksocknal_data.ksnd_enomem_tx--; - rc = -EAGAIN; - } else if (tx->tx_niov != 0) { - rc = ksocknal_send_iov (conn, tx); - } else { - rc = ksocknal_send_kiov (conn, tx); - } + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov(conn, tx, scratch_iov); + } else { + rc = ksocknal_send_kiov(conn, tx, scratch_iov); + } bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - if (rc > 0) /* sent something? */ - conn->ksnc_tx_bufnob += rc; /* account it */ + if (rc > 0) /* sent something? 
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ if (bufnob < conn->ksnc_tx_bufnob) { /* allocated send buffer bytes < computed; infer * something got ACKed */ - conn->ksnc_tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); conn->ksnc_tx_bufnob = bufnob; smp_mb(); } if (rc <= 0) { /* Didn't write anything? */ + /* some stacks return 0 instead of -EAGAIN */ + if (rc == 0) + rc = -EAGAIN; - if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ - rc = -EAGAIN; - - /* Check if EAGAIN is due to memory pressure */ - if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) - rc = -ENOMEM; + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; - break; - } + break; + } - /* socket's wmem_queued now includes 'rc' bytes */ + /* socket's wmem_queued now includes 'rc' bytes */ atomic_sub (rc, &conn->ksnc_tx_nob); - rc = 0; + rc = 0; - } while (tx->tx_resid != 0); + } while (tx->tx_resid != 0); - ksocknal_connsock_decref(conn); - return (rc); + ksocknal_connsock_decref(conn); + return rc; } static int -ksocknal_recv_iov (ksock_conn_t *conn) +ksocknal_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) { struct kvec *iov = conn->ksnc_rx_iov; - int nob; - int rc; + int nob; + int rc; - LASSERT (conn->ksnc_rx_niov > 0); + LASSERT(conn->ksnc_rx_niov > 0); /* Never touch conn->ksnc_rx_iov or change connection - * status inside ksocknal_lib_recv_iov */ - rc = ksocknal_lib_recv_iov(conn); + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn, scratchiov); - if (rc <= 0) - return (rc); + if (rc <= 0) + return rc; - /* received something... */ - nob = rc; + /* received something... */ + nob = rc; - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); - conn->ksnc_rx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); smp_mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - do { - LASSERT (conn->ksnc_rx_niov > 0); + do { + LASSERT(conn->ksnc_rx_niov > 0); - if (nob < (int)iov->iov_len) { - iov->iov_len -= nob; + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; iov->iov_base += nob; - return (-EAGAIN); - } + return -EAGAIN; + } - nob -= iov->iov_len; - conn->ksnc_rx_iov = ++iov; - conn->ksnc_rx_niov--; - } while (nob != 0); + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); - return (rc); + return rc; } static int -ksocknal_recv_kiov (ksock_conn_t *conn) +ksocknal_recv_kiov(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) { - lnet_kiov_t *kiov = conn->ksnc_rx_kiov; - int nob; - int rc; - LASSERT (conn->ksnc_rx_nkiov > 0); + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT(conn->ksnc_rx_nkiov > 0); /* Never touch conn->ksnc_rx_kiov or change connection - * status inside ksocknal_lib_recv_iov */ - rc = ksocknal_lib_recv_kiov(conn); + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn, rx_scratch_pgs, scratch_iov); - if (rc <= 0) - return (rc); + if (rc <= 0) + return rc; - /* received something... */ - nob = rc; + /* received something... 
*/ + nob = rc; - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); - conn->ksnc_rx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); smp_mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - do { - LASSERT (conn->ksnc_rx_nkiov > 0); + do { + LASSERT(conn->ksnc_rx_nkiov > 0); - if (nob < (int) kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return -EAGAIN; - } + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } - nob -= kiov->kiov_len; - conn->ksnc_rx_kiov = ++kiov; - conn->ksnc_rx_nkiov--; - } while (nob != 0); + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); - return 1; + return 1; } static int -ksocknal_receive (ksock_conn_t *conn) +ksocknal_receive(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) { - /* Return 1 on success, 0 on EOF, < 0 on error. - * Caller checks ksnc_rx_nob_wanted to determine - * progress/completion. */ - int rc; - ENTRY; + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; if (ksocknal_data.ksnd_stall_rx != 0) { set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); } - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - return (-ESHUTDOWN); - } + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } - for (;;) { - if (conn->ksnc_rx_niov != 0) - rc = ksocknal_recv_iov (conn); - else - rc = ksocknal_recv_kiov (conn); - - if (rc <= 0) { - /* error/EOF or partial receive */ - if (rc == -EAGAIN) { - rc = 1; - } else if (rc == 0 && conn->ksnc_rx_started) { - /* EOF in the middle of a message */ - rc = -EPROTO; - } - break; - } + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov(conn, scratch_iov); + else + rc = ksocknal_recv_kiov(conn, rx_scratch_pgs, + scratch_iov); - /* Completed a fragment */ + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } - if (conn->ksnc_rx_nob_wanted == 0) { - rc = 1; - break; - } - } + /* Completed a fragment */ - ksocknal_connsock_decref(conn); - RETURN (rc); + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN(rc); } void -ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) +ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) { struct lnet_msg *lnetmsg = tx->tx_lnetmsg; + enum lnet_msg_hstatus hstatus = tx->tx_hstatus; ENTRY; LASSERT(ni != NULL || tx->tx_conn != NULL); - if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { rc = -EIO; + if (hstatus == LNET_MSG_STATUS_OK) + hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } if (tx->tx_conn != NULL) ksocknal_conn_decref(tx->tx_conn); ksocknal_free_tx(tx); - if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + if (lnetmsg != NULL) { /* KSOCK_MSG_NOOP go without lnetmsg */ + lnetmsg->msg_health_status = hstatus; lnet_finalize(lnetmsg, rc); + } EXIT; } @@ 
-412,10 +425,10 @@ ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) { - ksock_tx_t *tx; + struct ksock_tx *tx; while (!list_empty(txlist)) { - tx = list_entry(txlist->next, ksock_tx_t, tx_list); + tx = list_entry(txlist->next, struct ksock_tx, tx_list); if (error && tx->tx_lnetmsg != NULL) { CNETERR("Deleting packet type %d len %d %s->%s\n", @@ -429,16 +442,34 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) list_del(&tx->tx_list); + if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { + if (error == -ETIMEDOUT) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; + else if (error == -ENETDOWN || + error == -EHOSTUNREACH || + error == -ENETUNREACH || + error == -ECONNREFUSED || + error == -ECONNRESET) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + /* + * for all other errors we don't want to + * retransmit + */ + else if (error) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + LASSERT(atomic_read(&tx->tx_refcount) == 1); ksocknal_tx_done(ni, tx, error); } } static void -ksocknal_check_zc_req(ksock_tx_t *tx) +ksocknal_check_zc_req(struct ksock_tx *tx) { - ksock_conn_t *conn = tx->tx_conn; - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + struct ksock_conn *conn = tx->tx_conn; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx * to ksnp_zc_req_list if some fragment of this message should be sent @@ -463,8 +494,8 @@ ksocknal_check_zc_req(ksock_tx_t *tx) spin_lock(&peer_ni->ksnp_lock); /* ZC_REQ is going to be pinned to the peer_ni */ - tx->tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + tx->tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); @@ -479,9 +510,9 @@ ksocknal_check_zc_req(ksock_tx_t *tx) } static void -ksocknal_uncheck_zc_req(ksock_tx_t *tx) +ksocknal_uncheck_zc_req(struct ksock_tx *tx) { - ksock_peer_ni_t *peer_ni = tx->tx_conn->ksnc_peer; + struct ksock_peer_ni *peer_ni = tx->tx_conn->ksnc_peer; LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); LASSERT(tx->tx_zc_capable); @@ -505,85 +536,111 @@ ksocknal_uncheck_zc_req(ksock_tx_t *tx) } static int -ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { - int rc; + int rc; + bool error_sim = false; + + if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { + error_sim = true; + rc = -EINVAL; + goto simulate_error; + } - if (tx->tx_zc_capable && !tx->tx_zc_checked) - ksocknal_check_zc_req(tx); + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); - rc = ksocknal_transmit (conn, tx); + rc = ksocknal_transmit(conn, tx, scratch_iov); - CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); - if (tx->tx_resid == 0) { - /* Sent everything OK */ - LASSERT (rc == 0); + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT(rc == 0); - return (0); - } + return 0; + } - if (rc == -EAGAIN) - return (rc); + if (rc == -EAGAIN) + return rc; - if (rc == -ENOMEM) { - static int counter; + if (rc == -ENOMEM) { + static int counter; - counter++; /* exponential backoff warnings */ - if ((counter & (-counter)) == counter) - CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u 
allocated)\n", counter, conn, atomic_read(&libcfs_kmemory)); - /* Queue on ksnd_enomem_conns for retry after a timeout */ + /* Queue on ksnd_enomem_conns for retry after a timeout */ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - /* enomem list takes over scheduler's ref... */ - LASSERT (conn->ksnc_tx_scheduled); + /* enomem list takes over scheduler's ref... */ + LASSERT(conn->ksnc_tx_scheduled); list_add_tail(&conn->ksnc_tx_list, - &ksocknal_data.ksnd_enomem_conns); - if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), - SOCKNAL_ENOMEM_RETRY), - ksocknal_data.ksnd_reaper_waketime)) + &ksocknal_data.ksnd_enomem_conns); + if (ktime_get_seconds() + SOCKNAL_ENOMEM_RETRY < + ksocknal_data.ksnd_reaper_waketime) wake_up(&ksocknal_data.ksnd_reaper_waitq); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; return (rc); } - /* Actual error */ - LASSERT (rc < 0); +simulate_error: - if (!conn->ksnc_closing) { - switch (rc) { - case -ECONNRESET: + /* Actual error */ + LASSERT(rc < 0); + + if (!error_sim) { + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + if (rc == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: LCONSOLE_WARN("Host %pI4h reset our connection " - "while we were sending data; it may have " - "rebooted.\n", + "while we were sending data; it may have " + "rebooted.\n", &conn->ksnc_ipaddr); - break; - default: - LCONSOLE_WARN("There was an unexpected network error " + break; + default: + LCONSOLE_WARN("There was an unexpected network error " "while writing to %pI4h: %d.\n", &conn->ksnc_ipaddr, rc); - break; - } + break; + } CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", conn, rc, libcfs_id2str(conn->ksnc_peer->ksnp_id), &conn->ksnc_ipaddr, conn->ksnc_port); - } + } - if (tx->tx_zc_checked) - ksocknal_uncheck_zc_req(tx); + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings (conn, - (conn->ksnc_closing) ? 0 : rc); + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings(conn, + (conn->ksnc_closing) ? 
0 : rc); - return (rc); + return rc; } static void -ksocknal_launch_connection_locked (ksock_route_t *route) +ksocknal_launch_connection_locked(struct ksock_route *route) { /* called holding write lock on ksnd_global_lock */ @@ -605,9 +662,9 @@ ksocknal_launch_connection_locked (ksock_route_t *route) } void -ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni) +ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) { - ksock_route_t *route; + struct ksock_route *route; /* called holding write lock on ksnd_global_lock */ for (;;) { @@ -620,21 +677,22 @@ ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni) } } -ksock_conn_t * -ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) +struct ksock_conn * +ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, int nonblk) { struct list_head *tmp; - ksock_conn_t *conn; - ksock_conn_t *typed = NULL; - ksock_conn_t *fallback = NULL; - int tnob = 0; - int fnob = 0; + struct ksock_conn *conn; + struct ksock_conn *typed = NULL; + struct ksock_conn *fallback = NULL; + int tnob = 0; + int fnob = 0; list_for_each(tmp, &peer_ni->ksnp_conns) { - ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); - int nob = atomic_read(&c->ksnc_tx_nob) + - c->ksnc_sock->sk->sk_wmem_queued; - int rc; + struct ksock_conn *c = list_entry(tmp, struct ksock_conn, + ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; LASSERT (!c->ksnc_closing); LASSERT (c->ksnc_proto != NULL && @@ -651,7 +709,7 @@ ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) case SOCKNAL_MATCH_YES: /* typed connection */ if (typed == NULL || tnob > nob || (tnob == nob && *ksocknal_tunables.ksnd_round_robin && - cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed->ksnc_tx_last_post > c->ksnc_tx_last_post)) { typed = c; tnob = nob; } @@ -660,7 +718,7 @@ ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) case SOCKNAL_MATCH_MAY: /* fallback connection */ if (fallback == NULL || fnob > nob || (fnob == nob && *ksocknal_tunables.ksnd_round_robin && - cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback->ksnc_tx_last_post > c->ksnc_tx_last_post)) { fallback = c; fnob = nob; } @@ -672,13 +730,13 @@ ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) conn = (typed != NULL) ? 
typed : fallback; if (conn != NULL) - conn->ksnc_tx_last_post = cfs_time_current(); + conn->ksnc_tx_last_post = ktime_get_seconds(); return conn; } void -ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) { conn->ksnc_proto->pro_pack(tx); @@ -688,12 +746,12 @@ ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) } void -ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) { - ksock_sched_t *sched = conn->ksnc_scheduler; - struct ksock_msg *msg = &tx->tx_msg; - ksock_tx_t *ztx = NULL; - int bufnob = 0; + struct ksock_sched *sched = conn->ksnc_scheduler; + struct ksock_msg *msg = &tx->tx_msg; + struct ksock_tx *ztx = NULL; + int bufnob = 0; /* called holding global lock (read or irq-write) and caller may * not have dropped this lock between finding conn and calling me, @@ -729,10 +787,10 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { /* First packet starts the timeout */ - conn->ksnc_tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); conn->ksnc_tx_bufnob = 0; smp_mb(); /* order with adding to tx_queue */ } @@ -775,15 +833,15 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) } -ksock_route_t * -ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) +struct ksock_route * +ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni) { - cfs_time_t now = cfs_time_current(); - struct list_head *tmp; - ksock_route_t *route; + time64_t now = ktime_get_seconds(); + struct list_head *tmp; + struct ksock_route *route; list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); @@ -795,14 +853,14 @@ ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) continue; if (!(route->ksnr_retry_interval == 0 || /* first attempt */ - cfs_time_aftereq(now, route->ksnr_timeout))) { + now >= route->ksnr_timeout)) { CDEBUG(D_NET, "Too soon to retry route %pI4h " - "(cnted %d, interval %ld, %ld secs later)\n", + "(cnted %d, interval %lld, %lld secs later)\n", &route->ksnr_ipaddr, route->ksnr_connected, route->ksnr_retry_interval, - cfs_duration_sec(route->ksnr_timeout - now)); + route->ksnr_timeout - now); continue; } @@ -812,14 +870,14 @@ ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) return (NULL); } -ksock_route_t * -ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni) +struct ksock_route * +ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni) { - struct list_head *tmp; - ksock_route_t *route; + struct list_head *tmp; + struct ksock_route *route; list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); @@ -831,14 +889,14 @@ ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni) } int -ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, +ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, struct lnet_process_id id) { 
- ksock_peer_ni_t *peer_ni; - ksock_conn_t *conn; - rwlock_t *g_lock; - int retry; - int rc; + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + rwlock_t *g_lock; + int retry; + int rc; LASSERT (tx->tx_conn == NULL); @@ -906,8 +964,8 @@ ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, if (peer_ni->ksnp_accepting > 0 || ksocknal_find_connecting_route_locked (peer_ni) != NULL) { /* the message is going to be pinned to the peer_ni */ - tx->tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + tx->tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); /* Queue the message until a connection is established */ list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); @@ -919,6 +977,7 @@ ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, /* NB Routes may be ignored if connections to them failed recently */ CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; return (-EHOSTUNREACH); } @@ -933,7 +992,7 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; - ksock_tx_t *tx; + struct ksock_tx *tx; int desc_size; int rc; @@ -950,10 +1009,10 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) LASSERT (!in_interrupt ()); if (payload_iov != NULL) - desc_size = offsetof(ksock_tx_t, + desc_size = offsetof(struct ksock_tx, tx_frags.virt.iov[1 + payload_niov]); else - desc_size = offsetof(ksock_tx_t, + desc_size = offsetof(struct ksock_tx, tx_frags.paged.kiov[payload_niov]); if (lntmsg->msg_vmflush) @@ -1003,6 +1062,7 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (rc == 0) return (0); + lntmsg->msg_health_status = tx->tx_hstatus; ksocknal_free_tx(tx); return (-EIO); } @@ -1030,13 +1090,12 @@ ksocknal_thread_fini (void) } int -ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) { static char ksocknal_slop_buffer[4096]; - - int nob; - unsigned int niov; - int skipped; + int nob; + unsigned int niov; + int skipped; LASSERT(conn->ksnc_proto != NULL); @@ -1112,7 +1171,9 @@ ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) } static int -ksocknal_process_receive (ksock_conn_t *conn) +ksocknal_process_receive(struct ksock_conn *conn, + struct page **rx_scratch_pgs, + struct kvec *scratch_iov) { struct lnet_hdr *lhdr; struct lnet_process_id *id; @@ -1122,13 +1183,14 @@ ksocknal_process_receive (ksock_conn_t *conn) /* NB: sched lock NOT held */ /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); again: - if (conn->ksnc_rx_nob_wanted != 0) { - rc = ksocknal_receive(conn); + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn, rx_scratch_pgs, + scratch_iov); if (rc <= 0) { struct lnet_process_id ksnp_id; @@ -1294,7 +1356,10 @@ ksocknal_process_receive (ksock_conn_t *conn) le64_to_cpu(lhdr->src_nid) != id->nid); } - lnet_finalize(conn->ksnc_cookie, rc); + if (rc && conn->ksnc_lnet_msg) + 
conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, rc); if (rc != 0) { ksocknal_new_packet(conn, 0); @@ -1324,15 +1389,15 @@ ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { - ksock_conn_t *conn = (ksock_conn_t *)private; - ksock_sched_t *sched = conn->ksnc_scheduler; + struct ksock_conn *conn = private; + struct ksock_sched *sched = conn->ksnc_scheduler; LASSERT (mlen <= rlen); LASSERT (niov <= LNET_MAX_IOV); - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_wanted = mlen; - conn->ksnc_rx_nob_left = rlen; + conn->ksnc_lnet_msg = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; if (mlen == 0 || iov != NULL) { conn->ksnc_rx_nkiov = 0; @@ -1378,7 +1443,7 @@ ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } static inline int -ksocknal_sched_cansleep(ksock_sched_t *sched) +ksocknal_sched_cansleep(struct ksock_sched *sched) { int rc; @@ -1394,154 +1459,169 @@ ksocknal_sched_cansleep(ksock_sched_t *sched) int ksocknal_scheduler(void *arg) { - struct ksock_sched_info *info; - ksock_sched_t *sched; - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; - int nloops = 0; - long id = (long)arg; + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + int nloops = 0; + long id = (long)arg; + struct page **rx_scratch_pgs; + struct kvec *scratch_iov; + + sched = ksocknal_data.ksnd_schedulers[KSOCK_THREAD_CPT(id)]; + + LIBCFS_CPT_ALLOC(rx_scratch_pgs, lnet_cpt_table(), sched->kss_cpt, + sizeof(*rx_scratch_pgs) * LNET_MAX_IOV); + if (!rx_scratch_pgs) { + CERROR("Unable to allocate scratch pages\n"); + return -ENOMEM; + } - info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + LIBCFS_CPT_ALLOC(scratch_iov, lnet_cpt_table(), sched->kss_cpt, + sizeof(*scratch_iov) * LNET_MAX_IOV); + if (!scratch_iov) { + CERROR("Unable to allocate scratch iov\n"); + return -ENOMEM; + } cfs_block_allsigs(); - rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + rc = cfs_cpt_bind(lnet_cpt_table(), sched->kss_cpt); if (rc != 0) { CWARN("Can't set CPU partition affinity to %d: %d\n", - info->ksi_cpt, rc); + sched->kss_cpt, rc); } spin_lock_bh(&sched->kss_lock); - while (!ksocknal_data.ksnd_shuttingdown) { - int did_something = 0; + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; - /* Ensure I progress everything semi-fairly */ + /* Ensure I progress everything semi-fairly */ if (!list_empty(&sched->kss_rx_conns)) { conn = list_entry(sched->kss_rx_conns.next, - ksock_conn_t, ksnc_rx_list); + struct ksock_conn, ksnc_rx_list); list_del(&conn->ksnc_rx_list); - LASSERT(conn->ksnc_rx_scheduled); - LASSERT(conn->ksnc_rx_ready); + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); - /* clear rx_ready in case receive isn't complete. - * Do it BEFORE we call process_recv, since - * data_ready can set it any time after we release - * kss_lock. */ - conn->ksnc_rx_ready = 0; + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. 
*/ + conn->ksnc_rx_ready = 0; spin_unlock_bh(&sched->kss_lock); - rc = ksocknal_process_receive(conn); + rc = ksocknal_process_receive(conn, rx_scratch_pgs, + scratch_iov); spin_lock_bh(&sched->kss_lock); - /* I'm the only one that can clear this flag */ - LASSERT(conn->ksnc_rx_scheduled); + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); - /* Did process_receive get everything it wanted? */ - if (rc == 0) - conn->ksnc_rx_ready = 1; - - if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { - /* Conn blocked waiting for ksocknal_recv() - * I change its state (under lock) to signal - * it can be rescheduled */ - conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; - } else if (conn->ksnc_rx_ready) { - /* reschedule for rx */ + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ list_add_tail(&conn->ksnc_rx_list, - &sched->kss_rx_conns); - } else { - conn->ksnc_rx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } - did_something = 1; - } + did_something = 1; + } if (!list_empty(&sched->kss_tx_conns)) { struct list_head zlist = LIST_HEAD_INIT(zlist); if (!list_empty(&sched->kss_zombie_noop_txs)) { list_add(&zlist, - &sched->kss_zombie_noop_txs); + &sched->kss_zombie_noop_txs); list_del_init(&sched->kss_zombie_noop_txs); - } + } conn = list_entry(sched->kss_tx_conns.next, - ksock_conn_t, ksnc_tx_list); + struct ksock_conn, ksnc_tx_list); list_del(&conn->ksnc_tx_list); - LASSERT(conn->ksnc_tx_scheduled); - LASSERT(conn->ksnc_tx_ready); + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); LASSERT(!list_empty(&conn->ksnc_tx_queue)); tx = list_entry(conn->ksnc_tx_queue.next, - ksock_tx_t, tx_list); + struct ksock_tx, tx_list); - if (conn->ksnc_tx_carrier == tx) - ksocknal_next_tx_carrier(conn); + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); - /* dequeue now so empty list => more to send */ + /* dequeue now so empty list => more to send */ list_del(&tx->tx_list); - /* Clear tx_ready in case send isn't complete. Do - * it BEFORE we call process_transmit, since - * write_space can set it any time after we release - * kss_lock. */ - conn->ksnc_tx_ready = 0; + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. 
*/ + conn->ksnc_tx_ready = 0; spin_unlock_bh(&sched->kss_lock); if (!list_empty(&zlist)) { /* free zombie noop txs, it's fast because - * noop txs are just put in freelist */ - ksocknal_txlist_done(NULL, &zlist, 0); - } + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } - rc = ksocknal_process_transmit(conn, tx); + rc = ksocknal_process_transmit(conn, tx, scratch_iov); - if (rc == -ENOMEM || rc == -EAGAIN) { - /* Incomplete send: replace tx on HEAD of tx_queue */ + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ spin_lock_bh(&sched->kss_lock); list_add(&tx->tx_list, - &conn->ksnc_tx_queue); + &conn->ksnc_tx_queue); } else { /* Complete send; tx -ref */ ksocknal_tx_decref(tx); spin_lock_bh(&sched->kss_lock); - /* assume space for more */ - conn->ksnc_tx_ready = 1; - } + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } - if (rc == -ENOMEM) { - /* Do nothing; after a short timeout, this - * conn will be reposted on kss_tx_conns. */ - } else if (conn->ksnc_tx_ready && + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && !list_empty(&conn->ksnc_tx_queue)) { - /* reschedule for tx */ + /* reschedule for tx */ list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - } else { - conn->ksnc_tx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } - did_something = 1; - } - if (!did_something || /* nothing to do */ - ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ spin_unlock_bh(&sched->kss_lock); - nloops = 0; + nloops = 0; - if (!did_something) { /* wait for something to do */ + if (!did_something) { /* wait for something to do */ rc = wait_event_interruptible_exclusive( sched->kss_waitq, !ksocknal_sched_cansleep(sched)); @@ -1555,6 +1635,10 @@ int ksocknal_scheduler(void *arg) } spin_unlock_bh(&sched->kss_lock); + LIBCFS_FREE(rx_scratch_pgs, sizeof(*rx_scratch_pgs) * + LNET_MAX_IOV); + LIBCFS_FREE(scratch_iov, sizeof(*scratch_iov) * + LNET_MAX_IOV); ksocknal_thread_fini(); return 0; } @@ -1563,9 +1647,9 @@ int ksocknal_scheduler(void *arg) * Add connection to kss_rx_conns of scheduler * and wakeup the scheduler. */ -void ksocknal_read_callback (ksock_conn_t *conn) +void ksocknal_read_callback(struct ksock_conn *conn) { - ksock_sched_t *sched; + struct ksock_sched *sched; ENTRY; sched = conn->ksnc_scheduler; @@ -1592,9 +1676,9 @@ void ksocknal_read_callback (ksock_conn_t *conn) * Add connection to kss_tx_conns of scheduler * and wakeup the scheduler. 
*/ -void ksocknal_write_callback(ksock_conn_t *conn) +void ksocknal_write_callback(struct ksock_conn *conn) { - ksock_sched_t *sched; + struct ksock_sched *sched; ENTRY; sched = conn->ksnc_scheduler; @@ -1618,7 +1702,7 @@ void ksocknal_write_callback(ksock_conn_t *conn) EXIT; } -static ksock_proto_t * +static struct ksock_proto * ksocknal_parse_proto_version (struct ksock_hello_msg *hello) { __u32 version = 0; @@ -1663,13 +1747,13 @@ ksocknal_parse_proto_version (struct ksock_hello_msg *hello) } int -ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, +ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, lnet_nid_t peer_nid, struct ksock_hello_msg *hello) { /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - ksock_net_t *net = (ksock_net_t *)ni->ni_data; + struct ksock_net *net = (struct ksock_net *)ni->ni_data; - LASSERT(hello->kshm_nips <= LNET_NUM_INTERFACES); + LASSERT(hello->kshm_nips <= LNET_INTERFACES_NUM); /* rely on caller to hold a ref on socket so it wouldn't disappear */ LASSERT(conn->ksnc_proto != NULL); @@ -1702,7 +1786,7 @@ ksocknal_invert_type(int type) } int -ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, +ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, struct ksock_hello_msg *hello, struct lnet_process_id *peerid, __u64 *incarnation) @@ -1717,13 +1801,13 @@ ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, int timeout; int proto_match; int rc; - ksock_proto_t *proto; - struct lnet_process_id recv_id; + struct ksock_proto *proto; + struct lnet_process_id recv_id; /* socket type set on active connections - not set on passive */ LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); - timeout = active ? *ksocknal_tunables.ksnd_timeout : + timeout = active ? lnet_get_lnd_timeout() : lnet_acceptor_timeout(); rc = lnet_sock_read(sock, &hello->kshm_magic, @@ -1847,19 +1931,18 @@ ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, } static int -ksocknal_connect (ksock_route_t *route) +ksocknal_connect(struct ksock_route *route) { - struct list_head zombies = LIST_HEAD_INIT(zombies); - ksock_peer_ni_t *peer_ni = route->ksnr_peer; + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct ksock_peer_ni *peer_ni = route->ksnr_peer; int type; int wanted; struct socket *sock; - cfs_time_t deadline; + time64_t deadline; int retry_later = 0; int rc = 0; - deadline = cfs_time_add(cfs_time_current(), - cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1903,7 +1986,7 @@ ksocknal_connect (ksock_route_t *route) write_unlock_bh(&ksocknal_data.ksnd_global_lock); - if (cfs_time_aftereq(cfs_time_current(), deadline)) { + if (ktime_get_seconds() >= deadline) { rc = -ETIMEDOUT; lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, route->ksnr_ipaddr, @@ -1911,12 +1994,12 @@ ksocknal_connect (ksock_route_t *route) goto failed; } - rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, - route->ksnr_myipaddr, + rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, + route->ksnr_myipaddr, route->ksnr_ipaddr, route->ksnr_port, peer_ni->ksnp_ni->ni_net_ns); - if (rc != 0) - goto failed; + if (rc != 0) + goto failed; rc = ksocknal_create_conn(peer_ni->ksnp_ni, route, sock, type); if (rc < 0) { @@ -1949,10 +2032,9 @@ ksocknal_connect (ksock_route_t *route) * attempt to connect if we lost conn race, * but the race is resolved quickly usually, * so min_reconnectms should be good heuristic */ - route->ksnr_retry_interval = - 
cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; - route->ksnr_timeout = cfs_time_add(cfs_time_current(), - route->ksnr_retry_interval); + route->ksnr_retry_interval = *ksocknal_tunables.ksnd_min_reconnectms / 1000; + route->ksnr_timeout = ktime_get_seconds() + + route->ksnr_retry_interval; } ksocknal_launch_connection_locked(route); @@ -1970,26 +2052,25 @@ ksocknal_connect (ksock_route_t *route) /* This is a retry rather than a new connection */ route->ksnr_retry_interval *= 2; route->ksnr_retry_interval = - MAX(route->ksnr_retry_interval, - cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + max_t(time64_t, route->ksnr_retry_interval, + *ksocknal_tunables.ksnd_min_reconnectms / 1000); route->ksnr_retry_interval = - MIN(route->ksnr_retry_interval, - cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + min_t(time64_t, route->ksnr_retry_interval, + *ksocknal_tunables.ksnd_max_reconnectms / 1000); - LASSERT (route->ksnr_retry_interval != 0); - route->ksnr_timeout = cfs_time_add(cfs_time_current(), - route->ksnr_retry_interval); + LASSERT(route->ksnr_retry_interval); + route->ksnr_timeout = ktime_get_seconds() + route->ksnr_retry_interval; if (!list_empty(&peer_ni->ksnp_tx_queue) && peer_ni->ksnp_accepting == 0 && ksocknal_find_connecting_route_locked(peer_ni) == NULL) { - ksock_conn_t *conn; + struct ksock_conn *conn; /* ksnp_tx_queue is queued on a conn on successful * connection for V1.x and V2.x */ if (!list_empty(&peer_ni->ksnp_conns)) { conn = list_entry(peer_ni->ksnp_conns.next, - ksock_conn_t, ksnc_list); + struct ksock_conn, ksnc_list); LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); } @@ -2012,7 +2093,7 @@ ksocknal_connect (ksock_route_t *route) * running out of resource. */ static int -ksocknal_connd_check_start(long sec, long *timeout) +ksocknal_connd_check_start(time64_t sec, long *timeout) { char name[16]; int rc; @@ -2062,7 +2143,7 @@ ksocknal_connd_check_start(long sec, long *timeout) /* we tried ... */ LASSERT(ksocknal_data.ksnd_connd_starting > 0); ksocknal_data.ksnd_connd_starting--; - ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec(); + ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); return 1; } @@ -2074,7 +2155,7 @@ ksocknal_connd_check_start(long sec, long *timeout) * again to recheck these conditions. 
*/ static int -ksocknal_connd_check_stop(long sec, long *timeout) +ksocknal_connd_check_stop(time64_t sec, long *timeout) { int val; @@ -2110,38 +2191,36 @@ ksocknal_connd_check_stop(long sec, long *timeout) /* Go through connd_routes queue looking for a route that we can process * right now, @timeout_p can be updated if we need to come back later */ -static ksock_route_t * +static struct ksock_route * ksocknal_connd_get_route_locked(signed long *timeout_p) { - ksock_route_t *route; - cfs_time_t now; - - now = cfs_time_current(); + time64_t now = ktime_get_seconds(); + struct ksock_route *route; /* connd_routes can contain both pending and ordinary routes */ list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, ksnr_connd_list) { if (route->ksnr_retry_interval == 0 || - cfs_time_aftereq(now, route->ksnr_timeout)) + now >= route->ksnr_timeout) return route; if (*timeout_p == MAX_SCHEDULE_TIMEOUT || - (int)*timeout_p > (int)(route->ksnr_timeout - now)) - *timeout_p = (int)(route->ksnr_timeout - now); + *timeout_p > cfs_time_seconds(route->ksnr_timeout - now)) + *timeout_p = cfs_time_seconds(route->ksnr_timeout - now); } return NULL; } int -ksocknal_connd (void *arg) +ksocknal_connd(void *arg) { - spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; - ksock_connreq_t *cr; + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + struct ksock_connreq *cr; wait_queue_entry_t wait; - int nloops = 0; - int cons_retry = 0; + int nloops = 0; + int cons_retry = 0; cfs_block_allsigs(); @@ -2154,8 +2233,8 @@ ksocknal_connd (void *arg) ksocknal_data.ksnd_connd_running++; while (!ksocknal_data.ksnd_shuttingdown) { - ksock_route_t *route = NULL; - long sec = cfs_time_current_sec(); + struct ksock_route *route = NULL; + time64_t sec = ktime_get_real_seconds(); long timeout = MAX_SCHEDULE_TIMEOUT; int dropped_lock = 0; @@ -2172,8 +2251,8 @@ ksocknal_connd (void *arg) if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { /* Connection accepted by the listener */ - cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\ - next, ksock_connreq_t, ksncr_list); + cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, + struct ksock_connreq, ksncr_list); list_del(&cr->ksncr_list); spin_unlock_bh(connd_lock); @@ -2247,16 +2326,18 @@ ksocknal_connd (void *arg) return 0; } -static ksock_conn_t * -ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) +static struct ksock_conn * +ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) { /* We're called with a shared lock on ksnd_global_lock */ - ksock_conn_t *conn; - struct list_head *ctmp; + struct ksock_conn *conn; + struct list_head *ctmp; + struct ksock_tx *tx; list_for_each(ctmp, &peer_ni->ksnp_conns) { - int error; - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + int error; + + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); /* Don't need the {get,put}connsock dance to deref ksnc_sock */ LASSERT (!conn->ksnc_closing); @@ -2296,8 +2377,7 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) } if (conn->ksnc_rx_started && - cfs_time_aftereq(cfs_time_current(), - conn->ksnc_rx_deadline)) { + ktime_get_seconds() >= conn->ksnc_rx_deadline) { /* Timed out incomplete incoming message */ ksocknal_conn_addref(conn); CNETERR("Timeout receiving from %s (%pI4h:%d), " @@ -2313,11 +2393,14 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) if ((!list_empty(&conn->ksnc_tx_queue) || conn->ksnc_sock->sk->sk_wmem_queued != 0) && - cfs_time_aftereq(cfs_time_current(), - conn->ksnc_tx_deadline)) { + ktime_get_seconds() >= conn->ksnc_tx_deadline) { /* Timed out messages queued for sending or * buffered in the socket's send buffer */ ksocknal_conn_addref(conn); + list_for_each_entry(tx, &conn->ksnc_tx_queue, + tx_list) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; CNETERR("Timeout sending data to %s (%pI4h:%d) " "the network or that node may be down.\n", libcfs_id2str(peer_ni->ksnp_id), @@ -2330,21 +2413,22 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) } static inline void -ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) +ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) { - ksock_tx_t *tx; - struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); + struct ksock_tx *tx; + struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); write_lock_bh(&ksocknal_data.ksnd_global_lock); while (!list_empty(&peer_ni->ksnp_tx_queue)) { tx = list_entry(peer_ni->ksnp_tx_queue.next, - ksock_tx_t, tx_list); + struct ksock_tx, tx_list); - if (!cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) + if (ktime_get_seconds() < tx->tx_deadline) break; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &stale_txs); } @@ -2355,12 +2439,12 @@ ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) } static int -ksocknal_send_keepalive_locked(ksock_peer_ni_t *peer_ni) +ksocknal_send_keepalive_locked(struct ksock_peer_ni *peer_ni) __must_hold(&ksocknal_data.ksnd_global_lock) { - ksock_sched_t *sched; - ksock_conn_t *conn; - ksock_tx_t *tx; + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; /* last_alive will be updated by create_conn */ if (list_empty(&peer_ni->ksnp_conns)) @@ -2370,18 +2454,16 @@ __must_hold(&ksocknal_data.ksnd_global_lock) return 0; if (*ksocknal_tunables.ksnd_keepalive <= 0 || - cfs_time_before(cfs_time_current(), - cfs_time_add(peer_ni->ksnp_last_alive, - cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) + ktime_get_seconds() < peer_ni->ksnp_last_alive + + *ksocknal_tunables.ksnd_keepalive) return 0; - if (cfs_time_before(cfs_time_current(), - 
peer_ni->ksnp_send_keepalive)) + if (ktime_get_seconds() < peer_ni->ksnp_send_keepalive) return 0; /* retry 10 secs later, so we wouldn't put pressure * on this peer_ni if we failed to send keepalive this time */ - peer_ni->ksnp_send_keepalive = cfs_time_shift(10); + peer_ni->ksnp_send_keepalive = ktime_get_seconds() + 10; conn = ksocknal_find_conn_locked(peer_ni, NULL, 1); if (conn != NULL) { @@ -2419,12 +2501,12 @@ __must_hold(&ksocknal_data.ksnd_global_lock) static void -ksocknal_check_peer_timeouts (int idx) +ksocknal_check_peer_timeouts(int idx) { - struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; - ksock_peer_ni_t *peer_ni; - ksock_conn_t *conn; - ksock_tx_t *tx; + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + struct ksock_tx *tx; again: /* NB. We expect to have a look at all the peers and not find any @@ -2433,10 +2515,10 @@ ksocknal_check_peer_timeouts (int idx) read_lock(&ksocknal_data.ksnd_global_lock); list_for_each_entry(peer_ni, peers, ksnp_list) { - ksock_tx_t *tx_stale; - cfs_time_t deadline = 0; - int resid = 0; - int n = 0; + struct ksock_tx *tx_stale; + time64_t deadline = 0; + int resid = 0; + int n = 0; if (ksocknal_send_keepalive_locked(peer_ni) != 0) { read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2460,13 +2542,11 @@ ksocknal_check_peer_timeouts (int idx) /* we can't process stale txs right here because we're * holding only shared lock */ if (!list_empty(&peer_ni->ksnp_tx_queue)) { - ksock_tx_t *tx = - list_entry(peer_ni->ksnp_tx_queue.next, - ksock_tx_t, tx_list); - - if (cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) { + struct ksock_tx *tx; + tx = list_entry(peer_ni->ksnp_tx_queue.next, + struct ksock_tx, tx_list); + if (ktime_get_seconds() >= tx->tx_deadline) { ksocknal_peer_addref(peer_ni); read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2483,8 +2563,7 @@ ksocknal_check_peer_timeouts (int idx) tx_stale = NULL; spin_lock(&peer_ni->ksnp_lock); list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { - if (!cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) + if (ktime_get_seconds() < tx->tx_deadline) break; /* ignore the TX if connection is being closed */ if (tx->tx_conn->ksnc_closing) @@ -2508,10 +2587,10 @@ ksocknal_check_peer_timeouts (int idx) read_unlock(&ksocknal_data.ksnd_global_lock); CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " - "oldest(%p) timed out %ld secs ago, " + "oldest(%p) timed out %lld secs ago, " "resid: %d, wmem: %d\n", n, libcfs_nid2str(peer_ni->ksnp_id.nid), tx_stale, - cfs_duration_sec(cfs_time_current() - deadline), + ktime_get_seconds() - deadline, resid, conn->ksnc_sock->sk->sk_wmem_queued); ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); @@ -2525,14 +2604,14 @@ ksocknal_check_peer_timeouts (int idx) int ksocknal_reaper(void *arg) { wait_queue_entry_t wait; - ksock_conn_t *conn; - ksock_sched_t *sched; - struct list_head enomem_conns; - int nenomem_conns; - cfs_duration_t timeout; - int i; - int peer_index = 0; - cfs_time_t deadline = cfs_time_current(); + struct ksock_conn *conn; + struct ksock_sched *sched; + struct list_head enomem_conns; + int nenomem_conns; + time64_t timeout; + int i; + int peer_index = 0; + time64_t deadline = ktime_get_seconds(); cfs_block_allsigs (); @@ -2542,11 +2621,9 @@ int ksocknal_reaper(void *arg) spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); while (!ksocknal_data.ksnd_shuttingdown) { - if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { - conn = 
list_entry(ksocknal_data. \ - ksnd_deathrow_conns.next, - ksock_conn_t, ksnc_list); + conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, + struct ksock_conn, ksnc_list); list_del(&conn->ksnc_list); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -2559,8 +2636,8 @@ int ksocknal_reaper(void *arg) } if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { - conn = list_entry(ksocknal_data.ksnd_zombie_conns.\ - next, ksock_conn_t, ksnc_list); + conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, + struct ksock_conn, ksnc_list); list_del(&conn->ksnc_list); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -2583,7 +2660,7 @@ int ksocknal_reaper(void *arg) nenomem_conns = 0; while (!list_empty(&enomem_conns)) { conn = list_entry(enomem_conns.next, - ksock_conn_t, ksnc_tx_list); + struct ksock_conn, ksnc_tx_list); list_del(&conn->ksnc_tx_list); sched = conn->ksnc_scheduler; @@ -2601,11 +2678,11 @@ int ksocknal_reaper(void *arg) } /* careful with the jiffy wrap... */ - while ((timeout = cfs_time_sub(deadline, - cfs_time_current())) <= 0) { + while ((timeout = deadline - ktime_get_seconds()) <= 0) { const int n = 4; const int p = 1; int chunk = ksocknal_data.ksnd_peer_hash_size; + unsigned int lnd_timeout; /* Time to check for timeouts on a few more peers: I do * checks every 'p' seconds on a proportion of the peer_ni @@ -2614,11 +2691,11 @@ int ksocknal_reaper(void *arg) * timeout on any connection within (n+1)/n times the * timeout interval. */ - if (*ksocknal_tunables.ksnd_timeout > n * p) - chunk = (chunk * n * p) / - *ksocknal_tunables.ksnd_timeout; - if (chunk == 0) - chunk = 1; + lnd_timeout = lnet_get_lnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; for (i = 0; i < chunk; i++) { ksocknal_check_peer_timeouts (peer_index); @@ -2626,7 +2703,7 @@ int ksocknal_reaper(void *arg) ksocknal_data.ksnd_peer_hash_size; } - deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + deadline += p; } if (nenomem_conns != 0) { @@ -2635,16 +2712,16 @@ int ksocknal_reaper(void *arg) * if any go back on my enomem list. */ timeout = SOCKNAL_ENOMEM_RETRY; } - ksocknal_data.ksnd_reaper_waketime = - cfs_time_add(cfs_time_current(), timeout); + ksocknal_data.ksnd_reaper_waketime = ktime_get_seconds() + + timeout; - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); if (!ksocknal_data.ksnd_shuttingdown && list_empty(&ksocknal_data.ksnd_deathrow_conns) && list_empty(&ksocknal_data.ksnd_zombie_conns)) - schedule_timeout(timeout); + schedule_timeout(cfs_time_seconds(timeout)); set_current_state(TASK_RUNNING); remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c index 91a9cf05e2ad8..72f2bd526613e 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,11 @@ #include "socklnd.h" int -ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) +ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) { int rc = lnet_sock_getaddr(conn->ksnc_sock, true, - &conn->ksnc_ipaddr, - &conn->ksnc_port); + &conn->ksnc_ipaddr, + &conn->ksnc_port); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ LASSERT (!conn->ksnc_closing); @@ -58,7 +58,7 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) } int -ksocknal_lib_zc_capable(ksock_conn_t *conn) +ksocknal_lib_zc_capable(struct ksock_conn *conn) { int caps = conn->ksnc_sock->sk->sk_route_caps; @@ -71,7 +71,8 @@ ksocknal_lib_zc_capable(ksock_conn_t *conn) } int -ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) { struct socket *sock = conn->ksnc_sock; int nob; @@ -92,7 +93,6 @@ ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) struct kvec *scratchiov = &scratch; unsigned int niov = 1; #else - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -113,41 +113,42 @@ ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) } int -ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) { - struct socket *sock = conn->ksnc_sock; - lnet_kiov_t *kiov = tx->tx_kiov; - int rc; - int nob; + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; - /* Not NOOP message */ - LASSERT (tx->tx_lnetmsg != NULL); + /* Not NOOP message */ + LASSERT(tx->tx_lnetmsg != NULL); - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. */ - if (tx->tx_msg.ksm_zc_cookies[0] != 0) { - /* Zero copy is enabled */ - struct sock *sk = sock->sk; - struct page *page = kiov->kiov_page; - int offset = kiov->kiov_offset; - int fragsize = kiov->kiov_len; - int msgflg = MSG_DONTWAIT; + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; - CDEBUG(D_NET, "page %p + offset %x for %d\n", - page, offset, kiov->kiov_len); + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); if (!list_empty(&conn->ksnc_tx_queue) || - fragsize < tx->tx_resid) - msgflg |= MSG_MORE; - - if (sk->sk_prot->sendpage != NULL) { - rc = sk->sk_prot->sendpage(sk, page, - offset, fragsize, msgflg); - } else { - rc = cfs_tcp_sendpage(sk, page, offset, fragsize, - msgflg); - } - } else { + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct kvec scratch; struct kvec *scratchiov = &scratch; @@ -156,7 +157,6 @@ ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." 
#endif - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -181,7 +181,7 @@ ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) } void -ksocknal_lib_eager_ack (ksock_conn_t *conn) +ksocknal_lib_eager_ack(struct ksock_conn *conn) { struct socket *sock = conn->ksnc_sock; @@ -194,14 +194,13 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn) } int -ksocknal_lib_recv_iov (ksock_conn_t *conn) +ksocknal_lib_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) { #if SOCKNAL_SINGLE_FRAG_RX struct kvec scratch; struct kvec *scratchiov = &scratch; unsigned int niov = 1; #else - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct kvec *iov = conn->ksnc_rx_iov; @@ -299,7 +298,8 @@ ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, } int -ksocknal_lib_recv_kiov (ksock_conn_t *conn) +ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov) { #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK struct kvec scratch; @@ -310,8 +310,6 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; - struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; @@ -378,7 +376,7 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) } void -ksocknal_lib_csum_tx(ksock_tx_t *tx) +ksocknal_lib_csum_tx(struct ksock_tx *tx) { int i; __u32 csum; @@ -417,7 +415,7 @@ ksocknal_lib_csum_tx(ksock_tx_t *tx) } int -ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle) { struct socket *sock = conn->ksnc_sock; struct tcp_sock *tp = tcp_sk(sock->sk); @@ -548,12 +546,12 @@ ksocknal_lib_setup_sock (struct socket *sock) } void -ksocknal_lib_push_conn (ksock_conn_t *conn) +ksocknal_lib_push_conn(struct ksock_conn *conn) { - struct sock *sk; - struct tcp_sock *tp; - int nonagle; - int rc; + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int rc; rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ @@ -576,8 +574,8 @@ ksocknal_lib_push_conn (ksock_conn_t *conn) ksocknal_connsock_decref(conn); } -extern void ksocknal_read_callback (ksock_conn_t *conn); -extern void ksocknal_write_callback (ksock_conn_t *conn); +void ksocknal_read_callback(struct ksock_conn *conn); +void ksocknal_write_callback(struct ksock_conn *conn); /* * socket call back in Linux */ @@ -588,7 +586,7 @@ ksocknal_data_ready(struct sock *sk) ksocknal_data_ready(struct sock *sk, int n) #endif { - ksock_conn_t *conn; + struct ksock_conn *conn; ENTRY; /* interleave correctly with closing sockets... 
*/ @@ -614,7 +612,7 @@ ksocknal_data_ready(struct sock *sk, int n) static void ksocknal_write_space (struct sock *sk) { - ksock_conn_t *conn; + struct ksock_conn *conn; int wspace; int min_wpace; @@ -657,14 +655,14 @@ ksocknal_write_space (struct sock *sk) } void -ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) { conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; conn->ksnc_saved_write_space = sock->sk->sk_write_space; } void -ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) { sock->sk->sk_user_data = conn; sock->sk->sk_data_ready = ksocknal_data_ready; @@ -673,7 +671,7 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) } void -ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) { /* Remove conn's network callbacks. * NB I _have_ to restore the callback, rather than storing a noop, @@ -690,10 +688,10 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) } int -ksocknal_lib_memory_pressure(ksock_conn_t *conn) +ksocknal_lib_memory_pressure(struct ksock_conn *conn) { int rc = 0; - ksock_sched_t *sched; + struct ksock_sched *sched; sched = conn->ksnc_scheduler; spin_lock_bh(&sched->kss_lock); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c index 6495703626094..df9d96e6e4cfc 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -37,7 +37,7 @@ static int peer_buffer_credits; module_param(peer_buffer_credits, int, 0444); MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); -static int peer_timeout = 180; +static int peer_timeout = DEFAULT_PEER_TIMEOUT; module_param(peer_timeout, int, 0444); MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); @@ -155,7 +155,7 @@ module_param(protocol, int, 0644); MODULE_PARM_DESC(protocol, "protocol version"); #endif -ksock_tunables_t ksocknal_tunables; +struct ksock_tunables ksocknal_tunables; int ksocknal_tunables_init(void) { diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c index 42dff10fdb563..6dd648a2299cc 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * * Author: Zach Brown * Author: Peter J. 
Braam @@ -41,8 +41,8 @@ * pro_match_tx() : Called holding glock */ -static ksock_tx_t * -ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) +static struct ksock_tx * +ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) { /* V1.x, just enqueue it */ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); @@ -50,9 +50,9 @@ ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) } void -ksocknal_next_tx_carrier(ksock_conn_t *conn) +ksocknal_next_tx_carrier(struct ksock_conn *conn) { - ksock_tx_t *tx = conn->ksnc_tx_carrier; + struct ksock_tx *tx = conn->ksnc_tx_carrier; /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ LASSERT(!list_empty(&conn->ksnc_tx_queue)); @@ -64,17 +64,17 @@ ksocknal_next_tx_carrier(ksock_conn_t *conn) conn->ksnc_tx_carrier = NULL; } else { conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, - ksock_tx_t, tx_list); + struct ksock_tx, tx_list); LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); } } static int -ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, - ksock_tx_t *tx_ack, __u64 cookie) +ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) { - ksock_tx_t *tx = conn->ksnc_tx_carrier; + struct ksock_tx *tx = conn->ksnc_tx_carrier; LASSERT (tx_ack == NULL || tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); @@ -117,10 +117,10 @@ ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, return 1; } -static ksock_tx_t * -ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) +static struct ksock_tx * +ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) { - ksock_tx_t *tx = conn->ksnc_tx_carrier; + struct ksock_tx *tx = conn->ksnc_tx_carrier; /* * Enqueue tx_msg: @@ -154,10 +154,10 @@ ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) } static int -ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, - ksock_tx_t *tx_ack, __u64 cookie) +ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) { - ksock_tx_t *tx; + struct ksock_tx *tx; if (conn->ksnc_type != SOCKLND_CONN_ACK) return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); @@ -271,7 +271,7 @@ ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, } static int -ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) { int nob; @@ -315,7 +315,7 @@ ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) } static int -ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) { int nob; @@ -359,18 +359,18 @@ ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) /* (Sink) handle incoming ZC request from sender */ static int -ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) { - ksock_peer_ni_t *peer_ni = c->ksnc_peer; - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; + struct ksock_peer_ni *peer_ni = c->ksnc_peer; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; read_lock(&ksocknal_data.ksnd_global_lock); conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); if (conn != NULL) { - ksock_sched_t *sched = conn->ksnc_scheduler; + struct ksock_sched *sched = conn->ksnc_scheduler; LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); @@ -402,13 +402,13 @@ ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) /* (Sender) handle ZC_ACK from sink */ 
static int -ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) { - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_tx_t *tx; - ksock_tx_t *tmp; - struct list_head zlist = LIST_HEAD_INIT(zlist); - int count; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + int count; if (cookie1 == 0) cookie1 = cookie2; @@ -440,7 +440,7 @@ ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) spin_unlock(&peer_ni->ksnp_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); list_del(&tx->tx_zc_list); ksocknal_tx_decref(tx); } @@ -449,7 +449,7 @@ ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) } static int -ksocknal_send_hello_v1 (ksock_conn_t *conn, struct ksock_hello_msg *hello) +ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) { struct socket *sock = conn->ksnc_sock; struct lnet_hdr *hdr; @@ -524,10 +524,10 @@ ksocknal_send_hello_v1 (ksock_conn_t *conn, struct ksock_hello_msg *hello) } static int -ksocknal_send_hello_v2 (ksock_conn_t *conn, struct ksock_hello_msg *hello) +ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) { - struct socket *sock = conn->ksnc_sock; - int rc; + struct socket *sock = conn->ksnc_sock; + int rc; hello->kshm_magic = LNET_PROTO_MAGIC; hello->kshm_version = conn->ksnc_proto->pro_version; @@ -567,7 +567,8 @@ ksocknal_send_hello_v2 (ksock_conn_t *conn, struct ksock_hello_msg *hello) } static int -ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int timeout) +ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) { struct socket *sock = conn->ksnc_sock; struct lnet_hdr *hdr; @@ -607,7 +608,7 @@ ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int tim hello->kshm_nips = le32_to_cpu (hdr->payload_length) / sizeof (__u32); - if (hello->kshm_nips > LNET_NUM_INTERFACES) { + if (hello->kshm_nips > LNET_INTERFACES_NUM) { CERROR("Bad nips %d from ip %pI4h\n", hello->kshm_nips, &conn->ksnc_ipaddr); rc = -EPROTO; @@ -643,7 +644,7 @@ ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int tim } static int -ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, +ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, int timeout) { struct socket *sock = conn->ksnc_sock; @@ -677,7 +678,7 @@ ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, __swab32s(&hello->kshm_nips); } - if (hello->kshm_nips > LNET_NUM_INTERFACES) { + if (hello->kshm_nips > LNET_INTERFACES_NUM) { CERROR("Bad nips %d from ip %pI4h\n", hello->kshm_nips, &conn->ksnc_ipaddr); return -EPROTO; @@ -710,7 +711,7 @@ ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, } static void -ksocknal_pack_msg_v1(ksock_tx_t *tx) +ksocknal_pack_msg_v1(struct ksock_tx *tx) { /* V1.x has no KSOCK_MSG_NOOP */ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); @@ -724,7 +725,7 @@ ksocknal_pack_msg_v1(ksock_tx_t *tx) } static void -ksocknal_pack_msg_v2(ksock_tx_t *tx) +ksocknal_pack_msg_v2(struct ksock_tx *tx) { tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; @@ -757,7 +758,7 @@ ksocknal_unpack_msg_v2(struct ksock_msg *msg) return; /* Do nothing */ } -ksock_proto_t 
ksocknal_protocol_v1x = +struct ksock_proto ksocknal_protocol_v1x = { .pro_version = KSOCK_PROTO_V1, .pro_send_hello = ksocknal_send_hello_v1, @@ -771,7 +772,7 @@ ksock_proto_t ksocknal_protocol_v1x = .pro_match_tx = ksocknal_match_tx }; -ksock_proto_t ksocknal_protocol_v2x = +struct ksock_proto ksocknal_protocol_v2x = { .pro_version = KSOCK_PROTO_V2, .pro_send_hello = ksocknal_send_hello_v2, @@ -785,7 +786,7 @@ ksock_proto_t ksocknal_protocol_v2x = .pro_match_tx = ksocknal_match_tx }; -ksock_proto_t ksocknal_protocol_v3x = +struct ksock_proto ksocknal_protocol_v3x = { .pro_version = KSOCK_PROTO_V3, .pro_send_hello = ksocknal_send_hello_v2, diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c index 8d3d6030d7d31..5be1dd88a6b2f 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,7 +32,6 @@ #define DEBUG_SUBSYSTEM S_LNET -#include #include #include #include @@ -481,14 +480,15 @@ lnet_acceptor_start(void) if (lnet_count_acceptor_nets() == 0) /* not required */ return 0; - - lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + if (current->nsproxy && current->nsproxy->net_ns) + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + else + lnet_acceptor_state.pta_ns = &init_net; task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, "acceptor_%03ld", secure); if (IS_ERR(task)) { rc2 = PTR_ERR(task); CERROR("Can't start acceptor thread: %ld\n", rc2); - return -ESRCH; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c index c70e26680b447..24e7d7aa59cd0 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c +++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,14 +31,25 @@ */ #define DEBUG_SUBSYSTEM S_LNET + +#include #include #include +#include +#include #include #define D_LNI D_CONSOLE -struct lnet the_lnet; /* THE state of the network */ +/* + * initialize ln_api_mutex statically, since it needs to be used in + * discovery_set callback. That module parameter callback can be called + * before module init completes. The mutex needs to be ready for use then. + */ +struct lnet the_lnet = { + .ln_api_mutex = __MUTEX_INITIALIZER(the_lnet.ln_api_mutex), +}; /* THE state of the network */ EXPORT_SYMBOL(the_lnet); static char *ip2nets = ""; @@ -60,13 +71,157 @@ MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); static int use_tcp_bonding = false; module_param(use_tcp_bonding, int, 0444); MODULE_PARM_DESC(use_tcp_bonding, - "Set to 1 to use socklnd bonding. 0 to use Multi-Rail"); + "use_tcp_bonding parameter has been deprecated"); unsigned int lnet_numa_range = 0; module_param(lnet_numa_range, uint, 0444); MODULE_PARM_DESC(lnet_numa_range, "NUMA range to consider during Multi-Rail selection"); +/* + * lnet_health_sensitivity determines by how much we decrement the health + * value on sending error. 
The value defaults to 100, which means health + * interface health is decremented by 100 points every failure. + */ +unsigned int lnet_health_sensitivity = 100; +static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_health_sensitivity = { + .set = sensitivity_set, + .get = param_get_int, +}; +#define param_check_health_sensitivity(name, p) \ + __param_check(name, p, int) +module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int, + &lnet_health_sensitivity, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_health_sensitivity, + "Value to decrement the health value by on error"); + +/* + * lnet_recovery_interval determines how often we should perform recovery + * on unhealthy interfaces. + */ +unsigned int lnet_recovery_interval = 1; +static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_recovery_interval = { + .set = recovery_interval_set, + .get = param_get_int, +}; +#define param_check_recovery_interval(name, p) \ + __param_check(name, p, int) +module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int, + &lnet_recovery_interval, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_recovery_interval, + "Interval to recover unhealthy interfaces in seconds"); + +static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT; +static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_interfaces_max = { + .set = intf_max_set, + .get = param_get_int, +}; + +#define param_check_interfaces_max(name, p) \ + __param_check(name, p, int) + +module_param(lnet_interfaces_max, interfaces_max, 0644); +#else +module_param_call(lnet_interfaces_max, intf_max_set, param_get_int, + &lnet_interfaces_max, 0644); +#endif +MODULE_PARM_DESC(lnet_interfaces_max, + "Maximum number of interfaces in a node."); + +unsigned lnet_peer_discovery_disabled = 0; +static int discovery_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_discovery_disabled = { + .set = discovery_set, + .get = param_get_int, +}; + +#define param_check_discovery_disabled(name, p) \ + __param_check(name, p, int) +module_param(lnet_peer_discovery_disabled, discovery_disabled, 0644); +#else +module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int, + &lnet_peer_discovery_disabled, 0644); +#endif +MODULE_PARM_DESC(lnet_peer_discovery_disabled, + "Set to 1 to disable peer discovery on this node."); + +unsigned int lnet_drop_asym_route; +static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_drop_asym_route = { + .set = drop_asym_route_set, + .get = param_get_int, +}; + +#define param_check_drop_asym_route(name, p) \ + __param_check(name, p, int) +module_param(lnet_drop_asym_route, drop_asym_route, 0644); +#else +module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, + &lnet_drop_asym_route, 0644); +#endif +MODULE_PARM_DESC(lnet_drop_asym_route, + "Set to 1 to drop asymmetrical route messages."); + +#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 +#define 
LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 50 + +unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; +static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_transaction_timeout = { + .set = transaction_to_set, + .get = param_get_int, +}; + +#define param_check_transaction_timeout(name, p) \ + __param_check(name, p, int) +module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, + &lnet_transaction_timeout, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_transaction_timeout, + "Maximum number of seconds to wait for a peer response."); + +#define LNET_RETRY_COUNT_HEALTH_DEFAULT 2 +unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; +static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_retry_count = { + .set = retry_count_set, + .get = param_get_int, +}; + +#define param_check_retry_count(name, p) \ + __param_check(name, p, int) +module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_retry_count, retry_count_set, param_get_int, + &lnet_retry_count, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_retry_count, + "Maximum number of times to retry transmitting a message"); + + +unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT; + /* * This sequence number keeps track of how many times DLC was used to * update the local NIs. It is incremented when a NI is added or @@ -79,6 +234,282 @@ static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); static int lnet_ping(struct lnet_process_id id, signed long timeout, struct lnet_process_id __user *ids, int n_ids); +static int lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids); + +static int +sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *sensitivity = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value > LNET_MAX_HEALTH_VALUE) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid health value. Maximum: %d value = %lu\n", + LNET_MAX_HEALTH_VALUE, value); + return -EINVAL; + } + + /* + * if we're turning on health then use the health timeout + * defaults. + */ + if (*sensitivity == 0 && value != 0) { + lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; + lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; + /* + * if we're turning off health then use the no health timeout + * default. 
+ */ + } else if (*sensitivity != 0 && value == 0) { + lnet_transaction_timeout = + LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; + lnet_retry_count = 0; + } + + *sensitivity = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *interval = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n"); + return rc; + } + + if (value < 1) { + CERROR("lnet_recovery_interval must be at least 1 second\n"); + return -EINVAL; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + *interval = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +discovery_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *discovery = (unsigned *)kp->arg; + unsigned long value; + struct lnet_ping_buffer *pbuf; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_peer_discovery_disabled'\n"); + return rc; + } + + value = (value) ? 1 : 0; + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *discovery) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *discovery = value; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* tell peers that discovery setting has changed */ + lnet_net_lock(LNET_LOCK_EX); + pbuf = the_lnet.ln_ping_target; + if (value) + pbuf->pb_info.pi_features &= ~LNET_PING_FEAT_DISCOVERY; + else + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + lnet_net_unlock(LNET_LOCK_EX); + + lnet_push_update_to_peers(1); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int *drop_asym_route = (unsigned int *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for " + "'lnet_drop_asym_route'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *drop_asym_route) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *drop_asym_route = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *transaction_to = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value < lnet_retry_count || value == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_transaction_timeout (%lu). 
" + "Has to be greater than lnet_retry_count (%u)\n", + value, lnet_retry_count); + return -EINVAL; + } + + if (value == *transaction_to) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *transaction_to = value; + if (lnet_retry_count == 0) + lnet_lnd_timeout = value; + else + lnet_lnd_timeout = value / lnet_retry_count; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *retry_count = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_retry_count'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (lnet_health_sensitivity == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Can not set retry_count when health feature is turned off\n"); + return -EINVAL; + } + + if (value > lnet_transaction_timeout) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_retry_count (%lu). " + "Has to be smaller than lnet_transaction_timeout (%u)\n", + value, lnet_transaction_timeout); + return -EINVAL; + } + + *retry_count = value; + + if (value == 0) + lnet_lnd_timeout = lnet_transaction_timeout; + else + lnet_lnd_timeout = lnet_transaction_timeout / value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +intf_max_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int value, rc; + + rc = kstrtoint(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_interfaces_max'\n"); + return rc; + } + + if (value < LNET_INTERFACES_MIN) { + CWARN("max interfaces provided are too small, setting to %d\n", + LNET_INTERFACES_MAX_DEFAULT); + value = LNET_INTERFACES_MAX_DEFAULT; + } + + *(int *)kp->arg = value; + + return 0; +} + static char * lnet_get_routes(void) { @@ -112,10 +543,10 @@ static void lnet_init_locks(void) { spin_lock_init(&the_lnet.ln_eq_wait_lock); + spin_lock_init(&the_lnet.ln_msg_resend_lock); init_waitqueue_head(&the_lnet.ln_eq_waitq); - init_waitqueue_head(&the_lnet.ln_rc_waitq); + init_waitqueue_head(&the_lnet.ln_mt_waitq); mutex_init(&the_lnet.ln_lnd_mutex); - mutex_init(&the_lnet.ln_api_mutex); } static void @@ -326,6 +757,43 @@ static void lnet_assert_wire_constants(void) CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.incarnation) == 8); CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.type) == 40); CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.type) == 4); + + /* Checks for struct lnet_ni_status and related constants */ + CLASSERT(LNET_NI_STATUS_INVALID == 0x00000000); + CLASSERT(LNET_NI_STATUS_UP == 0x15aac0de); + CLASSERT(LNET_NI_STATUS_DOWN == 0xdeadface); + + /* Checks for struct lnet_ni_status */ + CLASSERT((int)sizeof(struct lnet_ni_status) == 16); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_nid) == 0); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) == 8); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_status) == 8); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_status) == 4); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_unused) == 12); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) == 4); + + /* Checks for struct lnet_ping_info and related constants */ + CLASSERT(LNET_PROTO_PING_MAGIC == 0x70696E67); + CLASSERT(LNET_PING_FEAT_INVAL == 0); + CLASSERT(LNET_PING_FEAT_BASE == 1); + 
CLASSERT(LNET_PING_FEAT_NI_STATUS == 2); + CLASSERT(LNET_PING_FEAT_RTE_DISABLED == 4); + CLASSERT(LNET_PING_FEAT_MULTI_RAIL == 8); + CLASSERT(LNET_PING_FEAT_DISCOVERY == 16); + CLASSERT(LNET_PING_FEAT_BITS == 31); + + /* Checks for struct lnet_ping_info */ + CLASSERT((int)sizeof(struct lnet_ping_info) == 16); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_magic) == 0); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_magic) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_features) == 4); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_features) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_pid) == 8); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_pid) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_nnis) == 12); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_nnis) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_ni) == 16); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_ni) == 0); } static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) @@ -343,6 +811,13 @@ static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) return NULL; } +unsigned int +lnet_get_lnd_timeout(void) +{ + return lnet_lnd_timeout; +} +EXPORT_SYMBOL(lnet_get_lnd_timeout); + void lnet_register_lnd(struct lnet_lnd *lnd) { @@ -375,29 +850,71 @@ lnet_unregister_lnd(struct lnet_lnd *lnd) } EXPORT_SYMBOL(lnet_unregister_lnd); +void +lnet_counters_get_common(struct lnet_counters_common *common) +{ + struct lnet_counters *ctr; + int i; + + memset(common, 0, sizeof(*common)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; + common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; + common->lcc_errors += ctr->lct_common.lcc_errors; + common->lcc_send_count += ctr->lct_common.lcc_send_count; + common->lcc_recv_count += ctr->lct_common.lcc_recv_count; + common->lcc_route_count += ctr->lct_common.lcc_route_count; + common->lcc_drop_count += ctr->lct_common.lcc_drop_count; + common->lcc_send_length += ctr->lct_common.lcc_send_length; + common->lcc_recv_length += ctr->lct_common.lcc_recv_length; + common->lcc_route_length += ctr->lct_common.lcc_route_length; + common->lcc_drop_length += ctr->lct_common.lcc_drop_length; + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get_common); + void lnet_counters_get(struct lnet_counters *counters) { struct lnet_counters *ctr; + struct lnet_counters_health *health = &counters->lct_health; int i; memset(counters, 0, sizeof(*counters)); + lnet_counters_get_common(&counters->lct_common); + lnet_net_lock(LNET_LOCK_EX); cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { - counters->msgs_max += ctr->msgs_max; - counters->msgs_alloc += ctr->msgs_alloc; - counters->errors += ctr->errors; - counters->send_count += ctr->send_count; - counters->recv_count += ctr->recv_count; - counters->route_count += ctr->route_count; - counters->drop_count += ctr->drop_count; - counters->send_length += ctr->send_length; - counters->recv_length += ctr->recv_length; - counters->route_length += ctr->route_length; - counters->drop_length += ctr->drop_length; - + health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; + health->lch_resend_count += ctr->lct_health.lch_resend_count; + health->lch_response_timeout_count += + ctr->lct_health.lch_response_timeout_count; + health->lch_local_interrupt_count += + ctr->lct_health.lch_local_interrupt_count; + health->lch_local_dropped_count += + 
ctr->lct_health.lch_local_dropped_count; + health->lch_local_aborted_count += + ctr->lct_health.lch_local_aborted_count; + health->lch_local_no_route_count += + ctr->lct_health.lch_local_no_route_count; + health->lch_local_timeout_count += + ctr->lct_health.lch_local_timeout_count; + health->lch_local_error_count += + ctr->lct_health.lch_local_error_count; + health->lch_remote_dropped_count += + ctr->lct_health.lch_remote_dropped_count; + health->lch_remote_error_count += + ctr->lct_health.lch_remote_error_count; + health->lch_remote_timeout_count += + ctr->lct_health.lch_remote_timeout_count; + health->lch_network_timeout_count += + ctr->lct_health.lch_network_timeout_count; } lnet_net_unlock(LNET_LOCK_EX); } @@ -582,6 +1099,26 @@ lnet_res_lh_initialize(struct lnet_res_container *rec, list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); } +struct list_head ** +lnet_create_array_of_queues(void) +{ + struct list_head **qs; + struct list_head *q; + int i; + + qs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct list_head)); + if (!qs) { + CERROR("Failed to allocate queues\n"); + return NULL; + } + + cfs_percpt_for_each(q, i, qs) + INIT_LIST_HEAD(q); + + return qs; +} + static int lnet_unprepare(void); static int @@ -604,12 +1141,18 @@ lnet_prepare(lnet_pid_t requested_pid) the_lnet.ln_pid = requested_pid; INIT_LIST_HEAD(&the_lnet.ln_test_peers); - INIT_LIST_HEAD(&the_lnet.ln_peers); INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); INIT_LIST_HEAD(&the_lnet.ln_nets); INIT_LIST_HEAD(&the_lnet.ln_routers); INIT_LIST_HEAD(&the_lnet.ln_drop_rules); INIT_LIST_HEAD(&the_lnet.ln_delay_rules); + INIT_LIST_HEAD(&the_lnet.ln_dc_request); + INIT_LIST_HEAD(&the_lnet.ln_dc_working); + INIT_LIST_HEAD(&the_lnet.ln_dc_expired); + INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); + init_waitqueue_head(&the_lnet.ln_dc_waitq); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); rc = lnet_descriptor_setup(); if (rc != 0) @@ -668,6 +1211,12 @@ lnet_prepare(lnet_pid_t requested_pid) goto failed; } + the_lnet.ln_mt_zombie_rstqs = lnet_create_array_of_queues(); + if (!the_lnet.ln_mt_zombie_rstqs) { + rc = -ENOMEM; + goto failed; + } + return 0; failed: @@ -678,6 +1227,8 @@ lnet_prepare(lnet_pid_t requested_pid) static int lnet_unprepare (void) { + int rc; + /* NB no LNET_LOCK since this is the last reference. 
All LND instances * have shut down already, so it is safe to unlink and free all * descriptors, even those that appear committed to a network op (eg MD @@ -689,6 +1240,17 @@ lnet_unprepare (void) LASSERT(list_empty(&the_lnet.ln_test_peers)); LASSERT(list_empty(&the_lnet.ln_nets)); + if (the_lnet.ln_mt_zombie_rstqs) { + lnet_clean_zombie_rstqs(); + the_lnet.ln_mt_zombie_rstqs = NULL; + } + + if (!LNetEQHandleIsInvalid(the_lnet.ln_mt_eqh)) { + rc = LNetEQFree(the_lnet.ln_mt_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + LASSERT(rc == 0); + } + lnet_portals_destroy(); if (the_lnet.ln_md_containers != NULL) { @@ -854,16 +1416,6 @@ lnet_islocalnet(__u32 net_id) return local; } -bool -lnet_is_ni_healthy_locked(struct lnet_ni *ni) -{ - if (ni->ni_state == LNET_NI_STATE_ACTIVE || - ni->ni_state == LNET_NI_STATE_DEGRADED) - return true; - - return false; -} - struct lnet_ni * lnet_nid2ni_locked(lnet_nid_t nid, int cpt) { @@ -931,25 +1483,45 @@ lnet_count_acceptor_nets(void) return count; } -static struct lnet_ping_info * -lnet_ping_info_create(int num_ni) +struct lnet_ping_buffer * +lnet_ping_buffer_alloc(int nnis, gfp_t gfp) +{ + struct lnet_ping_buffer *pbuf; + + LIBCFS_ALLOC_GFP(pbuf, LNET_PING_BUFFER_SIZE(nnis), gfp); + if (pbuf) { + pbuf->pb_nnis = nnis; + atomic_set(&pbuf->pb_refcnt, 1); + } + + return pbuf; +} + +void +lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf) { - struct lnet_ping_info *ping_info; - unsigned int infosz; + LASSERT(lnet_ping_buffer_numref(pbuf) == 0); + LIBCFS_FREE(pbuf, LNET_PING_BUFFER_SIZE(pbuf->pb_nnis)); +} - infosz = offsetof(struct lnet_ping_info, pi_ni[num_ni]); - LIBCFS_ALLOC(ping_info, infosz); - if (ping_info == NULL) { - CERROR("Can't allocate ping info[%d]\n", num_ni); +static struct lnet_ping_buffer * +lnet_ping_target_create(int nnis) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) { + CERROR("Can't allocate ping source [%d]\n", nnis); return NULL; } - ping_info->pi_nnis = num_ni; - ping_info->pi_pid = the_lnet.ln_pid; - ping_info->pi_magic = LNET_PROTO_PING_MAGIC; - ping_info->pi_features = LNET_PING_FEAT_NI_STATUS; + pbuf->pb_info.pi_nnis = nnis; + pbuf->pb_info.pi_pid = the_lnet.ln_pid; + pbuf->pb_info.pi_magic = LNET_PROTO_PING_MAGIC; + pbuf->pb_info.pi_features = + LNET_PING_FEAT_NI_STATUS | LNET_PING_FEAT_MULTI_RAIL; - return ping_info; + return pbuf; } static inline int @@ -995,16 +1567,25 @@ lnet_get_ni_count(void) return count; } -static inline void -lnet_ping_info_free(struct lnet_ping_info *pinfo) +int +lnet_ping_info_validate(struct lnet_ping_info *pinfo) { - LIBCFS_FREE(pinfo, - offsetof(struct lnet_ping_info, - pi_ni[pinfo->pi_nnis])); + if (!pinfo) + return -EINVAL; + if (pinfo->pi_magic != LNET_PROTO_PING_MAGIC) + return -EPROTO; + if (!(pinfo->pi_features & LNET_PING_FEAT_NI_STATUS)) + return -EPROTO; + /* Loopback is guaranteed to be present */ + if (pinfo->pi_nnis < 1 || pinfo->pi_nnis > lnet_interfaces_max) + return -ERANGE; + if (LNET_PING_INFO_LONI(pinfo) != LNET_NID_LO_0) + return -EPROTO; + return 0; } static void -lnet_ping_info_destroy(void) +lnet_ping_target_destroy(void) { struct lnet_net *net; struct lnet_ni *ni; @@ -1019,25 +1600,25 @@ lnet_ping_info_destroy(void) } } - lnet_ping_info_free(the_lnet.ln_ping_info); - the_lnet.ln_ping_info = NULL; + lnet_ping_buffer_decref(the_lnet.ln_ping_target); + the_lnet.ln_ping_target = NULL; lnet_net_unlock(LNET_LOCK_EX); } static void -lnet_ping_event_handler(struct lnet_event *event) 
+lnet_ping_target_event_handler(struct lnet_event *event) { - struct lnet_ping_info *pinfo = event->md.user_ptr; + struct lnet_ping_buffer *pbuf = event->md.user_ptr; if (event->unlinked) - pinfo->pi_features = LNET_PING_FEAT_INVAL; + lnet_ping_buffer_decref(pbuf); } static int -lnet_ping_info_setup(struct lnet_ping_info **ppinfo, - struct lnet_handle_md *md_handle, - int ni_count, bool set_eq) +lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, + struct lnet_handle_md *ping_mdh, + int ni_count, bool set_eq) { struct lnet_process_id id = { .nid = LNET_NID_ANY, @@ -1048,72 +1629,76 @@ lnet_ping_info_setup(struct lnet_ping_info **ppinfo, int rc, rc2; if (set_eq) { - rc = LNetEQAlloc(0, lnet_ping_event_handler, + rc = LNetEQAlloc(0, lnet_ping_target_event_handler, &the_lnet.ln_ping_target_eq); if (rc != 0) { - CERROR("Can't allocate ping EQ: %d\n", rc); + CERROR("Can't allocate ping buffer EQ: %d\n", rc); return rc; } } - *ppinfo = lnet_ping_info_create(ni_count); - if (*ppinfo == NULL) { + *ppbuf = lnet_ping_target_create(ni_count); + if (*ppbuf == NULL) { rc = -ENOMEM; - goto failed_0; + goto fail_free_eq; } + /* Ping target ME/MD */ rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, LNET_PROTO_PING_MATCHBITS, 0, LNET_UNLINK, LNET_INS_AFTER, &me_handle); if (rc != 0) { - CERROR("Can't create ping ME: %d\n", rc); - goto failed_1; + CERROR("Can't create ping target ME: %d\n", rc); + goto fail_decref_ping_buffer; } /* initialize md content */ - md.start = *ppinfo; - md.length = offsetof(struct lnet_ping_info, - pi_ni[(*ppinfo)->pi_nnis]); + md.start = &(*ppbuf)->pb_info; + md.length = LNET_PING_INFO_SIZE((*ppbuf)->pb_nnis); md.threshold = LNET_MD_THRESH_INF; md.max_size = 0; md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | LNET_MD_MANAGE_REMOTE; - md.user_ptr = NULL; md.eq_handle = the_lnet.ln_ping_target_eq; - md.user_ptr = *ppinfo; + md.user_ptr = *ppbuf; - rc = LNetMDAttach(me_handle, md, LNET_RETAIN, md_handle); + rc = LNetMDAttach(me_handle, md, LNET_RETAIN, ping_mdh); if (rc != 0) { - CERROR("Can't attach ping MD: %d\n", rc); - goto failed_2; + CERROR("Can't attach ping target MD: %d\n", rc); + goto fail_unlink_ping_me; } + lnet_ping_buffer_addref(*ppbuf); return 0; -failed_2: +fail_unlink_ping_me: rc2 = LNetMEUnlink(me_handle); LASSERT(rc2 == 0); -failed_1: - lnet_ping_info_free(*ppinfo); - *ppinfo = NULL; -failed_0: - if (set_eq) - LNetEQFree(the_lnet.ln_ping_target_eq); +fail_decref_ping_buffer: + LASSERT(lnet_ping_buffer_numref(*ppbuf) == 1); + lnet_ping_buffer_decref(*ppbuf); + *ppbuf = NULL; +fail_free_eq: + if (set_eq) { + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc2 == 0); + } return rc; } static void -lnet_ping_md_unlink(struct lnet_ping_info *pinfo, struct lnet_handle_md *md_handle) +lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *ping_mdh) { sigset_t blocked = cfs_block_allsigs(); - LNetMDUnlink(*md_handle); - LNetInvalidateMDHandle(md_handle); + LNetMDUnlink(*ping_mdh); + LNetInvalidateMDHandle(ping_mdh); - /* NB md could be busy; this just starts the unlink */ - while (pinfo->pi_features != LNET_PING_FEAT_INVAL) { - CDEBUG(D_NET, "Still waiting for ping MD to unlink\n"); + /* NB the MD could be busy; this just starts the unlink */ + while (lnet_ping_buffer_numref(pbuf) > 1) { + CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); } @@ -1122,77 +1707,241 @@ lnet_ping_md_unlink(struct lnet_ping_info *pinfo, struct lnet_handle_md *md_hand } static 
void -lnet_ping_info_install_locked(struct lnet_ping_info *ping_info) +lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) { - int i; struct lnet_ni *ni; struct lnet_net *net; struct lnet_ni_status *ns; + int i; + int rc; i = 0; list_for_each_entry(net, &the_lnet.ln_nets, net_list) { list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { - LASSERT(i < ping_info->pi_nnis); + LASSERT(i < pbuf->pb_nnis); - ns = &ping_info->pi_ni[i]; + ns = &pbuf->pb_info.pi_ni[i]; ns->ns_nid = ni->ni_nid; lnet_ni_lock(ni); ns->ns_status = (ni->ni_status != NULL) ? - ni->ni_status->ns_status : + ni->ni_status->ns_status : LNET_NI_STATUS_UP; ni->ni_status = ns; lnet_ni_unlock(ni); i++; } - } + /* + * We (ab)use the ns_status of the loopback interface to + * transmit the sequence number. The first interface listed + * must be the loopback interface. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + LCONSOLE_EMERG("Invalid ping target: %d\n", rc); + LBUG(); + } + LNET_PING_BUFFER_SEQNO(pbuf) = + atomic_inc_return(&the_lnet.ln_ping_target_seqno); } static void -lnet_ping_target_update(struct lnet_ping_info *pinfo, - struct lnet_handle_md md_handle) +lnet_ping_target_update(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md ping_mdh) { - struct lnet_ping_info *old_pinfo = NULL; - struct lnet_handle_md old_md; + struct lnet_ping_buffer *old_pbuf = NULL; + struct lnet_handle_md old_ping_md; /* switch the NIs to point to the new ping info created */ lnet_net_lock(LNET_LOCK_EX); if (!the_lnet.ln_routing) - pinfo->pi_features |= LNET_PING_FEAT_RTE_DISABLED; - lnet_ping_info_install_locked(pinfo); + pbuf->pb_info.pi_features |= LNET_PING_FEAT_RTE_DISABLED; + if (!lnet_peer_discovery_disabled) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + + /* Ensure only known feature bits have been set. */ + LASSERT(pbuf->pb_info.pi_features & LNET_PING_FEAT_BITS); + LASSERT(!(pbuf->pb_info.pi_features & ~LNET_PING_FEAT_BITS)); + + lnet_ping_target_install_locked(pbuf); + + if (the_lnet.ln_ping_target) { + old_pbuf = the_lnet.ln_ping_target; + old_ping_md = the_lnet.ln_ping_target_md; + } + the_lnet.ln_ping_target_md = ping_mdh; + the_lnet.ln_ping_target = pbuf; + + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + /* unlink and free the old ping info */ + lnet_ping_md_unlink(old_pbuf, &old_ping_md); + lnet_ping_buffer_decref(old_pbuf); + } + + lnet_push_update_to_peers(0); +} + +static void +lnet_ping_target_fini(void) +{ + int rc; + + lnet_ping_md_unlink(the_lnet.ln_ping_target, + &the_lnet.ln_ping_target_md); + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); + + lnet_ping_target_destroy(); +} + +/* Resize the push target. 
*/ +int lnet_push_target_resize(void) +{ + struct lnet_process_id id = { LNET_NID_ANY, LNET_PID_ANY }; + struct lnet_md md = { NULL }; + struct lnet_handle_me meh; + struct lnet_handle_md mdh; + struct lnet_handle_md old_mdh; + struct lnet_ping_buffer *pbuf; + struct lnet_ping_buffer *old_pbuf; + int nnis = the_lnet.ln_push_target_nnis; + int rc; + + if (nnis <= 0) { + rc = -EINVAL; + goto fail_return; + } +again: + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = -ENOMEM; + goto fail_return; + } + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc) { + CERROR("Can't create push target ME: %d\n", rc); + goto fail_decref_pbuf; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_PUT | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = pbuf; + md.eq_handle = the_lnet.ln_push_target_eq; - if (the_lnet.ln_ping_info != NULL) { - old_pinfo = the_lnet.ln_ping_info; - old_md = the_lnet.ln_ping_target_md; + rc = LNetMDAttach(meh, md, LNET_RETAIN, &mdh); + if (rc) { + CERROR("Can't attach push MD: %d\n", rc); + goto fail_unlink_meh; } - the_lnet.ln_ping_target_md = md_handle; - the_lnet.ln_ping_info = pinfo; + lnet_ping_buffer_addref(pbuf); + lnet_net_lock(LNET_LOCK_EX); + old_pbuf = the_lnet.ln_push_target; + old_mdh = the_lnet.ln_push_target_md; + the_lnet.ln_push_target = pbuf; + the_lnet.ln_push_target_md = mdh; lnet_net_unlock(LNET_LOCK_EX); - if (old_pinfo != NULL) { - /* unlink the old ping info */ - lnet_ping_md_unlink(old_pinfo, &old_md); - lnet_ping_info_free(old_pinfo); + if (old_pbuf) { + LNetMDUnlink(old_mdh); + lnet_ping_buffer_decref(old_pbuf); + } + + if (nnis < the_lnet.ln_push_target_nnis) + goto again; + + CDEBUG(D_NET, "nnis %d success\n", nnis); + + return 0; + +fail_unlink_meh: + LNetMEUnlink(meh); +fail_decref_pbuf: + lnet_ping_buffer_decref(pbuf); +fail_return: + CDEBUG(D_NET, "nnis %d error %d\n", nnis, rc); + return rc; +} + +static void lnet_push_target_event_handler(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + lnet_peer_push_event(ev); + if (ev->unlinked) + lnet_ping_buffer_decref(pbuf); +} + +/* Initialize the push target. */ +static int lnet_push_target_init(void) +{ + int rc; + + if (the_lnet.ln_push_target) + return -EALREADY; + + rc = LNetEQAlloc(0, lnet_push_target_event_handler, + &the_lnet.ln_push_target_eq); + if (rc) { + CERROR("Can't allocated push target EQ: %d\n", rc); + return rc; + } + + /* Start at the required minimum, we'll enlarge if required. */ + the_lnet.ln_push_target_nnis = LNET_INTERFACES_MIN; + + rc = lnet_push_target_resize(); + + if (rc) { + LNetEQFree(the_lnet.ln_push_target_eq); + LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); } + + return rc; } -static void -lnet_ping_target_fini(void) +/* Clean up the push target. */ +static void lnet_push_target_fini(void) { - int rc; + if (!the_lnet.ln_push_target) + return; - lnet_ping_md_unlink(the_lnet.ln_ping_info, - &the_lnet.ln_ping_target_md); + /* Unlink and invalidate to prevent new references. */ + LNetMDUnlink(the_lnet.ln_push_target_md); + LNetInvalidateMDHandle(&the_lnet.ln_push_target_md); - rc = LNetEQFree(the_lnet.ln_ping_target_eq); - LASSERT(rc == 0); + /* Wait for the unlink to complete. 
*/ + while (lnet_ping_buffer_numref(the_lnet.ln_push_target) > 1) { + CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } - lnet_ping_info_destroy(); + lnet_ping_buffer_decref(the_lnet.ln_push_target); + the_lnet.ln_push_target = NULL; + the_lnet.ln_push_target_nnis = 0; + + LNetEQFree(the_lnet.ln_push_target_eq); + LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); } static int @@ -1215,11 +1964,6 @@ lnet_ni_tq_credits(struct lnet_ni *ni) static void lnet_ni_unlink_locked(struct lnet_ni *ni) { - if (!list_empty(&ni->ni_cptlist)) { - list_del_init(&ni->ni_cptlist); - lnet_ni_decref_locked(ni, 0); - } - /* move it to zombie list and nobody can find it anymore */ LASSERT(!list_empty(&ni->ni_netlist)); list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); @@ -1258,7 +2002,13 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) } if (!list_empty(&ni->ni_netlist)) { + /* Unlock mutex while waiting to allow other + * threads to read the LNet state and fall through + * to avoid deadlock + */ lnet_net_unlock(LNET_LOCK_EX); + mutex_unlock(&the_lnet.ln_api_mutex); + ++i; if ((i & (-i)) == i) { CDEBUG(D_WARNING, @@ -1267,6 +2017,8 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) } set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&the_lnet.ln_api_mutex); lnet_net_lock(LNET_LOCK_EX); continue; } @@ -1296,7 +2048,9 @@ lnet_shutdown_lndni(struct lnet_ni *ni) struct lnet_net *net = ni->ni_net; lnet_net_lock(LNET_LOCK_EX); + lnet_ni_lock(ni); ni->ni_state = LNET_NI_STATE_DELETING; + lnet_ni_unlock(ni); lnet_ni_unlink_locked(ni); lnet_incr_dlc_seq(); lnet_net_unlock(LNET_LOCK_EX); @@ -1350,6 +2104,10 @@ static void lnet_shutdown_lndnets(void) { struct lnet_net *net; + struct list_head resend; + struct lnet_msg *msg, *tmp; + + INIT_LIST_HEAD(&resend); /* NB called holding the global mutex */ @@ -1385,6 +2143,16 @@ lnet_shutdown_lndnets(void) lnet_shutdown_lndnet(net); } + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ECANCELED); + } + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_state = LNET_STATE_SHUTDOWN; lnet_net_unlock(LNET_LOCK_EX); @@ -1418,7 +2186,9 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) goto failed0; } + lnet_ni_lock(ni); ni->ni_state = LNET_NI_STATE_ACTIVE; + lnet_ni_unlock(ni); /* We keep a reference on the loopback net through the loopback NI */ if (net->net_lnd->lnd_type == LOLND) { @@ -1453,6 +2223,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) atomic_set(&ni->ni_tx_credits, lnet_ni_tq_credits(ni) * ni->ni_ncpts); + atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", libcfs_nid2str(ni->ni_nid), @@ -1496,8 +2267,6 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) { lnd_type = LNET_NETTYP(net->net_id); - LASSERT(libcfs_isknown_lnd(lnd_type)); - mutex_lock(&the_lnet.ln_lnd_mutex); lnd = lnet_find_lnd_by_type(lnd_type); @@ -1576,7 +2345,7 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) * up is actually unique. if it's not fail. 
*/ if (!lnet_ni_unique_net(&net_l->net_ni_list, ni->ni_interfaces[0])) { - rc = -EINVAL; + rc = -EEXIST; goto failed1; } @@ -1701,8 +2470,6 @@ int lnet_lib_init(void) lnet_assert_wire_constants(); - memset(&the_lnet, 0, sizeof(the_lnet)); - /* refer to global cfs_cpt_table for now */ the_lnet.ln_cpt_table = cfs_cpt_table; the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); @@ -1730,6 +2497,7 @@ int lnet_lib_init(void) INIT_LIST_HEAD(&the_lnet.ln_lnds); INIT_LIST_HEAD(&the_lnet.ln_net_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_msg_resend); INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); /* The hash table size is the number of bits it takes to express the set @@ -1786,8 +2554,8 @@ LNetNIInit(lnet_pid_t requested_pid) int im_a_router = 0; int rc; int ni_count; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; struct list_head net_head; struct lnet_net *net; @@ -1822,6 +2590,9 @@ LNetNIInit(lnet_pid_t requested_pid) goto err_empty_list; } + if (use_tcp_bonding) + CWARN("'use_tcp_bonding' option has been deprecated. See LU-13641\n"); + /* If LNet is being initialized via DLC it is possible * that the user requests not to load module parameters (ones which * are supported by DLC) on initialization. Therefore, make sure not @@ -1862,23 +2633,41 @@ LNetNIInit(lnet_pid_t requested_pid) the_lnet.ln_refcount = 1; /* Now I may use my own API functions... */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, ni_count, true); + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, ni_count, true); if (rc != 0) goto err_acceptor_stop; - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); + + rc = LNetEQAlloc(0, lnet_mt_event_handler, &the_lnet.ln_mt_eqh); + if (rc != 0) { + CERROR("Can't allocate monitor thread EQ: %d\n", rc); + goto err_stop_ping; + } - rc = lnet_router_checker_start(); + rc = lnet_monitor_thr_start(); if (rc != 0) goto err_stop_ping; + rc = lnet_push_target_init(); + if (rc != 0) + goto err_stop_monitor_thr; + + rc = lnet_peer_discovery_start(); + if (rc != 0) + goto err_destroy_push_target; + lnet_fault_init(); - lnet_proc_init(); + lnet_router_debugfs_init(); mutex_unlock(&the_lnet.ln_api_mutex); return 0; +err_destroy_push_target: + lnet_push_target_fini(); +err_stop_monitor_thr: + lnet_monitor_thr_stop(); err_stop_ping: lnet_ping_target_fini(); err_acceptor_stop: @@ -1927,8 +2716,10 @@ LNetNIFini() lnet_fault_fini(); - lnet_proc_fini(); - lnet_router_checker_stop(); + lnet_router_debugfs_fini(); + lnet_peer_discovery_stop(); + lnet_push_target_fini(); + lnet_monitor_thr_stop(); lnet_ping_target_fini(); /* Teardown fns that use my own API functions BEFORE here */ @@ -1976,15 +2767,22 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, } cfg_ni->lic_nid = ni->ni_nid; - cfg_ni->lic_status = ni->ni_status->ns_status; + if (ni->ni_nid == LNET_NID_LO_0) + cfg_ni->lic_status = LNET_NI_STATUS_UP; + else + cfg_ni->lic_status = ni->ni_status->ns_status; cfg_ni->lic_tcp_bonding = use_tcp_bonding; cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); if (stats) { - stats->iel_send_count = atomic_read(&ni->ni_stats.send_count); - stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count); + stats->iel_send_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_SEND); + stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_RECV); + stats->iel_drop_count = 
lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_DROP); } /* @@ -2061,7 +2859,10 @@ lnet_fill_ni_info_legacy(struct lnet_ni *ni, config->cfg_config_u.cfg_net.net_peer_rtr_credits = ni->ni_net->net_tunables.lct_peer_rtr_credits; - net_config->ni_status = ni->ni_status->ns_status; + if (ni->ni_nid == LNET_NID_LO_0) + net_config->ni_status = LNET_NI_STATUS_UP; + else + net_config->ni_status = ni->ni_status->ns_status; if (ni->ni_cpts) { int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); @@ -2119,10 +2920,17 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) struct lnet_ni *ni; struct lnet_net *net = mynet; + /* + * It is possible that the net has been cleaned out while there is + * a message being sent. This function accessed the net without + * checking if the list is empty + */ if (prev == NULL) { if (net == NULL) net = list_entry(the_lnet.ln_nets.next, struct lnet_net, net_list); + if (list_empty(&net->net_ni_list)) + return NULL; ni = list_entry(net->net_ni_list.next, struct lnet_ni, ni_netlist); @@ -2144,6 +2952,8 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) /* get the next net */ net = list_entry(prev->ni_net->net_list.next, struct lnet_net, net_list); + if (list_empty(&net->net_ni_list)) + return NULL; /* get the ni on it */ ni = list_entry(net->net_ni_list.next, struct lnet_ni, ni_netlist); @@ -2151,6 +2961,9 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) return ni; } + if (list_empty(&prev->ni_netlist)) + return NULL; + /* there are more nis left */ ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist); @@ -2208,12 +3021,35 @@ lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni, return rc; } +int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!msg_stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(msg_stats->im_idx); + + if (ni) { + lnet_usr_translate_stats(msg_stats, &ni->ni_stats); + rc = 0; + } + + lnet_net_unlock(cpt); + + return rc; +} + static int lnet_add_net_common(struct lnet_net *net, struct lnet_ioctl_config_lnd_tunables *tun) { __u32 net_id; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; int rc; struct lnet_remotenet *rnet; int net_ni_count; @@ -2235,7 +3071,7 @@ static int lnet_add_net_common(struct lnet_net *net, /* * make sure you calculate the correct number of slots in the ping - * info. Since the ping info is a flattened list of all the NIs, + * buffer. Since the ping info is a flattened list of all the NIs, * we should allocate enough slots to accomodate the number of NIs * which will be added. 
* @@ -2244,9 +3080,9 @@ static int lnet_add_net_common(struct lnet_net *net, */ net_ni_count = lnet_get_net_ni_count_pre(net); - rc = lnet_ping_info_setup(&pinfo, &md_handle, - net_ni_count + lnet_get_ni_count(), - false); + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + net_ni_count + lnet_get_ni_count(), + false); if (rc < 0) { lnet_net_free(net); return rc; @@ -2297,13 +3133,13 @@ static int lnet_add_net_common(struct lnet_net *net, lnet_peer_net_added(net); lnet_net_unlock(LNET_LOCK_EX); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); return 0; failed: - lnet_ping_md_unlink(pinfo, &md_handle); - lnet_ping_info_free(pinfo); + lnet_ping_md_unlink(pbuf, &ping_mdh); + lnet_ping_buffer_decref(pbuf); return rc; } @@ -2351,7 +3187,7 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) struct lnet_ni *ni; struct lnet_ioctl_config_lnd_tunables *tun = NULL; int rc, i; - __u32 net_id; + __u32 net_id, lnd_type; /* get the tunables if they are available */ if (conf->lic_cfg_hdr.ioc_len >= @@ -2365,6 +3201,12 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) tun); net_id = LNET_NIDNET(conf->lic_nid); + lnd_type = LNET_NETTYP(net_id); + + if (!libcfs_isknown_lnd(lnd_type)) { + CERROR("No valid net and lnd information provided\n"); + return -EINVAL; + } net = lnet_net_alloc(net_id, NULL); if (!net) @@ -2394,8 +3236,8 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) struct lnet_net *net; struct lnet_ni *ni; __u32 net_id = LNET_NIDNET(conf->lic_nid); - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; int rc; int net_count; __u32 addr; @@ -2413,7 +3255,7 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) CERROR("net %s not found\n", libcfs_net2str(net_id)); rc = -ENOENT; - goto net_unlock; + goto unlock_net; } addr = LNET_NIDADDR(conf->lic_nid); @@ -2424,28 +3266,28 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, lnet_get_ni_count() - net_count, false); if (rc != 0) - goto out; + goto unlock_api_mutex; lnet_shutdown_lndnet(net); if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); - goto out; + goto unlock_api_mutex; } ni = lnet_nid2ni_locked(conf->lic_nid, 0); if (!ni) { - CERROR("nid %s not found \n", + CERROR("nid %s not found\n", libcfs_nid2str(conf->lic_nid)); rc = -ENOENT; - goto net_unlock; + goto unlock_net; } net_count = lnet_get_net_ni_count_locked(net); @@ -2453,27 +3295,27 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, lnet_get_ni_count() - 1, false); if (rc != 0) - goto out; + goto unlock_api_mutex; lnet_shutdown_lndni(ni); if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); /* check if the net is empty and remove it if it is */ if (net_count == 1) lnet_shutdown_lndnet(net); - goto out; + goto unlock_api_mutex; -net_unlock: +unlock_net: lnet_net_unlock(0); -out: +unlock_api_mutex: mutex_unlock(&the_lnet.ln_api_mutex); return rc; @@ -2541,8 +3383,8 @@ int lnet_dyn_del_net(__u32 
net_id) { struct lnet_net *net; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; int rc; int net_ni_count; @@ -2556,6 +3398,7 @@ lnet_dyn_del_net(__u32 net_id) net = lnet_get_net_locked(net_id); if (net == NULL) { + lnet_net_unlock(0); rc = -EINVAL; goto out; } @@ -2565,8 +3408,8 @@ lnet_dyn_del_net(__u32 net_id) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, - lnet_get_ni_count() - net_ni_count, false); + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_ni_count, false); if (rc != 0) goto out; @@ -2575,7 +3418,7 @@ lnet_dyn_del_net(__u32 net_id) if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); out: mutex_unlock(&the_lnet.ln_api_mutex); @@ -2593,6 +3436,102 @@ __u32 lnet_get_dlc_seq_locked(void) return atomic_read(&lnet_dlc_seq_no); } +static void +lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid || all) { + atomic_set(&ni->ni_healthv, value); + if (list_empty(&ni->ni_recovery) && + value < LNET_MAX_HEALTH_VALUE) { + CERROR("manually adding local NI %s to recovery\n", + libcfs_nid2str(ni->ni_nid)); + list_add_tail(&ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(ni, 0); + } + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_get_local_ni_hstats(struct lnet_ioctl_local_ni_hstats *stats) +{ + int cpt, rc = 0; + struct lnet_ni *ni; + lnet_nid_t nid = stats->hlni_nid; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + + if (!ni) { + rc = -ENOENT; + goto unlock; + } + + stats->hlni_local_interrupt = atomic_read(&ni->ni_hstats.hlt_local_interrupt); + stats->hlni_local_dropped = atomic_read(&ni->ni_hstats.hlt_local_dropped); + stats->hlni_local_aborted = atomic_read(&ni->ni_hstats.hlt_local_aborted); + stats->hlni_local_no_route = atomic_read(&ni->ni_hstats.hlt_local_no_route); + stats->hlni_local_timeout = atomic_read(&ni->ni_hstats.hlt_local_timeout); + stats->hlni_local_error = atomic_read(&ni->ni_hstats.hlt_local_error); + stats->hlni_health_value = atomic_read(&ni->ni_healthv); + +unlock: + lnet_net_unlock(cpt); + + return rc; +} + +static int +lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_ni *ni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) { + list->rlst_nid_array[i] = ni->ni_nid; + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +static int +lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_peer_ni *lpni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) { + list->rlst_nid_array[i] = lpni->lpni_nid; + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + /** * LNet ioctl handler. 
* @@ -2674,9 +3613,10 @@ LNetCtl(unsigned int cmd, void *arg) __u32 tun_size; cfg_ni = arg; + /* get the tunables if they are available */ if (cfg_ni->lic_cfg_hdr.ioc_len < - sizeof(*cfg_ni) + sizeof(*stats)+ sizeof(*tun)) + sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun)) return -EINVAL; stats = (struct lnet_ioctl_element_stats *) @@ -2693,6 +3633,19 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } + case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: { + struct lnet_ioctl_element_msg_stats *msg_stats = arg; + + if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_stats(msg_stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + case IOC_LIBCFS_GET_NET: { size_t total = sizeof(*config) + sizeof(struct lnet_ioctl_net_config); @@ -2753,22 +3706,22 @@ LNetCtl(unsigned int cmd, void *arg) return rc; case IOC_LIBCFS_SET_NUMA_RANGE: { - struct lnet_ioctl_numa_range *numa; + struct lnet_ioctl_set_value *numa; numa = arg; - if (numa->nr_hdr.ioc_len != sizeof(*numa)) + if (numa->sv_hdr.ioc_len != sizeof(*numa)) return -EINVAL; - mutex_lock(&the_lnet.ln_api_mutex); - lnet_numa_range = numa->nr_range; - mutex_unlock(&the_lnet.ln_api_mutex); + lnet_net_lock(LNET_LOCK_EX); + lnet_numa_range = numa->sv_value; + lnet_net_unlock(LNET_LOCK_EX); return 0; } case IOC_LIBCFS_GET_NUMA_RANGE: { - struct lnet_ioctl_numa_range *numa; + struct lnet_ioctl_set_value *numa; numa = arg; - if (numa->nr_hdr.ioc_len != sizeof(*numa)) + if (numa->sv_hdr.ioc_len != sizeof(*numa)) return -EINVAL; - numa->nr_range = lnet_numa_range; + numa->sv_value = lnet_numa_range; return 0; } @@ -2789,6 +3742,33 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } + case IOC_LIBCFS_GET_LOCAL_HSTATS: { + struct lnet_ioctl_local_ni_hstats *stats = arg; + + if (stats->hlni_hdr.ioc_len < sizeof(*stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_local_ni_hstats(stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_RECOVERY_QUEUE: { + struct lnet_ioctl_recovery_list *list = arg; + if (list->rlst_hdr.ioc_len < sizeof(*list)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) + rc = lnet_get_local_ni_recovery_list(list); + else + rc = lnet_get_peer_ni_recovery_list(list); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + case IOC_LIBCFS_ADD_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; @@ -2796,9 +3776,9 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_add_peer_ni_to_peer(cfg->prcfg_prim_nid, - cfg->prcfg_cfg_nid, - cfg->prcfg_mr); + rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } @@ -2810,8 +3790,8 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_del_peer_ni_from_peer(cfg->prcfg_prim_nid, - cfg->prcfg_cfg_nid); + rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } @@ -2840,30 +3820,65 @@ LNetCtl(unsigned int cmd, void *arg) case IOC_LIBCFS_GET_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; - struct lnet_peer_ni_credit_info __user *lpni_cri; - struct lnet_ioctl_element_stats __user *lpni_stats; - size_t usr_size = sizeof(*lpni_cri) + sizeof(*lpni_stats); - if ((cfg->prcfg_hdr.ioc_len != sizeof(*cfg)) || - (cfg->prcfg_size != usr_size)) + if (cfg->prcfg_hdr.ioc_len < 
sizeof(*cfg)) return -EINVAL; - lpni_cri = cfg->prcfg_bulk; - lpni_stats = cfg->prcfg_bulk + sizeof(*lpni_cri); + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_info(cfg, + (void __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_LIST: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_get_peer_info(cfg->prcfg_count, &cfg->prcfg_prim_nid, - &cfg->prcfg_cfg_nid, &cfg->prcfg_mr, - lpni_cri, lpni_stats); + rc = lnet_get_peer_list(&cfg->prcfg_count, &cfg->prcfg_size, + (struct lnet_process_id __user *)cfg->prcfg_bulk); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } - case IOC_LIBCFS_NOTIFY_ROUTER: + case IOC_LIBCFS_SET_HEALHV: { + struct lnet_ioctl_reset_health_cfg *cfg = arg; + int value; + if (cfg->rh_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rh_value < 0 || + cfg->rh_value > LNET_MAX_HEALTH_VALUE) + value = LNET_MAX_HEALTH_VALUE; + else + value = cfg->rh_value; + CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n", + value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ? + "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all); + mutex_lock(&the_lnet.ln_api_mutex); + if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) + lnet_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + else + lnet_peer_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_NOTIFY_ROUTER: { + time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0]; + + /* The deadline passed in by the user should be some time in + * seconds in the future since the UNIX epoch. We have to map + * that deadline to the wall clock. 
+ */ + deadline += ktime_get_seconds(); return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, - cfs_time_current() - - cfs_time_seconds(cfs_time_current_sec() - - (time_t)data->ioc_u64[0])); + deadline); + } case IOC_LIBCFS_LNET_DIST: rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); @@ -2888,24 +3903,77 @@ LNetCtl(unsigned int cmd, void *arg) id.nid = data->ioc_nid; id.pid = data->ioc_u32[0]; - /* Don't block longer than 2 minutes */ - if (data->ioc_u32[1] > 120 * MSEC_PER_SEC) - return -EINVAL; - - /* If timestamp is negative then disable timeout */ - if ((s32)data->ioc_u32[1] < 0) - timeout = MAX_SCHEDULE_TIMEOUT; + /* If timeout is negative then set default of 3 minutes */ + if (((s32)data->ioc_u32[1] <= 0) || + data->ioc_u32[1] > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); else timeout = msecs_to_jiffies(data->ioc_u32[1]); rc = lnet_ping(id, timeout, data->ioc_pbuf1, data->ioc_plen1 / sizeof(struct lnet_process_id)); + if (rc < 0) return rc; + data->ioc_count = rc; return 0; } + case IOC_LIBCFS_PING_PEER: { + struct lnet_ioctl_ping_data *ping = arg; + struct lnet_peer *lp; + signed long timeout; + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)ping->op_param) <= 0 || + ping->op_param > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); + else + timeout = msecs_to_jiffies(ping->op_param); + + rc = lnet_ping(ping->ping_id, timeout, + ping->ping_buf, + ping->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer(ping->ping_id.nid); + if (lp) { + ping->ping_id.nid = lp->lp_primary_nid; + ping->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + ping->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_DISCOVER: { + struct lnet_ioctl_ping_data *discover = arg; + struct lnet_peer *lp; + + rc = lnet_discover(discover->ping_id, discover->op_param, + discover->ping_buf, + discover->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer(discover->ping_id.nid); + if (lp) { + discover->ping_id.nid = lp->lp_primary_nid; + discover->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + discover->ping_count = rc; + return 0; + } + default: ni = lnet_net2ni_addref(data->ioc_net); if (ni == NULL) @@ -3005,43 +4073,47 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, struct lnet_handle_md mdh; struct lnet_event event; struct lnet_md md = { NULL }; - int which; - int unlinked = 0; - int replied = 0; + int which; + int unlinked = 0; + int replied = 0; const signed long a_long_time = msecs_to_jiffies(60 * MSEC_PER_SEC); - int infosz; - struct lnet_ping_info *info; + struct lnet_ping_buffer *pbuf; struct lnet_process_id tmpid; - int i; - int nob; - int rc; - int rc2; - sigset_t blocked; - - infosz = offsetof(struct lnet_ping_info, pi_ni[n_ids]); + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; /* n_ids limit is arbitrary */ - if (n_ids <= 0 || n_ids > 20 || id.nid == LNET_NID_ANY) + if (n_ids <= 0 || id.nid == LNET_NID_ANY) return -EINVAL; + /* + * if the user buffer has more space than the lnet_interfaces_max + * then only fill it up to lnet_interfaces_max + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + if (id.pid == LNET_PID_ANY) id.pid = LNET_PID_LUSTRE; - 
LIBCFS_ALLOC(info, infosz); - if (info == NULL) + pbuf = lnet_ping_buffer_alloc(n_ids, GFP_NOFS); + if (!pbuf) return -ENOMEM; /* NB 2 events max (including any unlink event) */ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); if (rc != 0) { CERROR("Can't allocate EQ: %d\n", rc); - goto out_0; + goto fail_ping_buffer_decref; } /* initialize md content */ - md.start = info; - md.length = infosz; - md.threshold = 2; /*GET/REPLY*/ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(n_ids); + md.threshold = 2; /* GET/REPLY */ md.max_size = 0; md.options = LNET_MD_TRUNCATE; md.user_ptr = NULL; @@ -3050,16 +4122,15 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, rc = LNetMDBind(md, LNET_UNLINK, &mdh); if (rc != 0) { CERROR("Can't bind MD: %d\n", rc); - goto out_1; + goto fail_free_eq; } rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); + LNET_PROTO_PING_MATCHBITS, 0, false); if (rc != 0) { /* Don't CERROR; this could be deliberate! */ - rc2 = LNetMDUnlink(mdh); LASSERT(rc2 == 0); @@ -3107,7 +4178,6 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, replied = 1; rc = event.mlength; } - } while (rc2 <= 0 || !event.unlinked); if (!replied) { @@ -3115,68 +4185,170 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, CWARN("%s: Unexpected rc >= 0 but no reply!\n", libcfs_id2str(id)); rc = -EIO; - goto out_1; + goto fail_free_eq; } nob = rc; - LASSERT(nob >= 0 && nob <= infosz); + LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids)); - rc = -EPROTO; /* if I can't parse... */ + rc = -EPROTO; /* if I can't parse... */ if (nob < 8) { - /* can't check magic/version */ CERROR("%s: ping info too short %d\n", libcfs_id2str(id), nob); - goto out_1; + goto fail_free_eq; } - if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { - lnet_swap_pinginfo(info); - } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(pbuf); + } else if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { CERROR("%s: Unexpected magic %08x\n", - libcfs_id2str(id), info->pi_magic); - goto out_1; + libcfs_id2str(id), pbuf->pb_info.pi_magic); + goto fail_free_eq; } - if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { CERROR("%s: ping w/o NI status: 0x%x\n", - libcfs_id2str(id), info->pi_features); - goto out_1; + libcfs_id2str(id), pbuf->pb_info.pi_features); + goto fail_free_eq; } - if (nob < offsetof(struct lnet_ping_info, pi_ni[0])) { - CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), - nob, (int)offsetof(struct lnet_ping_info, pi_ni[0])); - goto out_1; + if (nob < LNET_PING_INFO_SIZE(0)) { + CERROR("%s: Short reply %d(%d min)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(0)); + goto fail_free_eq; } - if (info->pi_nnis < n_ids) - n_ids = info->pi_nnis; + if (pbuf->pb_info.pi_nnis < n_ids) + n_ids = pbuf->pb_info.pi_nnis; - if (nob < offsetof(struct lnet_ping_info, pi_ni[n_ids])) { - CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), - nob, (int)offsetof(struct lnet_ping_info, pi_ni[n_ids])); - goto out_1; + if (nob < LNET_PING_INFO_SIZE(n_ids)) { + CERROR("%s: Short reply %d(%d expected)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(n_ids)); + goto fail_free_eq; } - rc = -EFAULT; /* If I SEGV... */ + rc = -EFAULT; /* if I segv in copy_to_user()... 
*/ memset(&tmpid, 0, sizeof(tmpid)); for (i = 0; i < n_ids; i++) { - tmpid.pid = info->pi_pid; - tmpid.nid = info->pi_ni[i].ns_nid; + tmpid.pid = pbuf->pb_info.pi_pid; + tmpid.nid = pbuf->pb_info.pi_ni[i].ns_nid; if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) - goto out_1; + goto fail_free_eq; } - rc = info->pi_nnis; + rc = pbuf->pb_info.pi_nnis; - out_1: + fail_free_eq: rc2 = LNetEQFree(eqh); if (rc2 != 0) CERROR("rc2 %d\n", rc2); LASSERT(rc2 == 0); - out_0: - LIBCFS_FREE(info, infosz); + fail_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static int +lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *p; + struct lnet_peer *lp; + struct lnet_process_id *buf; + int cpt; + int i; + int rc; + int max_intf = lnet_interfaces_max; + size_t buf_size; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY) + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + /* + * if the user buffer has more space than the max_intf + * then only fill it up to max_intf + */ + if (n_ids > max_intf) + n_ids = max_intf; + + buf_size = n_ids * sizeof(*buf); + + LIBCFS_ALLOC(buf, buf_size); + if (!buf) + return -ENOMEM; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(id.nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out; + } + + /* + * Clearing the NIDS_UPTODATE flag ensures the peer will + * be discovered, provided discovery has not been disabled. + */ + lp = lpni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + /* If the force flag is set, force a PING and PUSH as well. */ + if (force) + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + + /* Peer may have changed. */ + lp = lpni->lpni_peer_net->lpn_peer; + if (lp->lp_nnis < n_ids) + n_ids = lp->lp_nnis; + + i = 0; + p = NULL; + while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { + buf[i].pid = id.pid; + buf[i].nid = p->lpni_nid; + if (++i >= n_ids) + break; + } + + lnet_net_unlock(cpt); + + rc = -EFAULT; + if (copy_to_user(ids, buf, n_ids * sizeof(*buf))) + goto out_relock; + rc = n_ids; +out_relock: + lnet_net_lock(cpt); +out_decref: + lnet_peer_ni_decref_locked(lpni); +out: + lnet_net_unlock(cpt); + + LIBCFS_FREE(buf, buf_size); + return rc; } + +/** + * Retrieve peer discovery status. + * + * \retval 1 if lnet_peer_discovery_disabled is 0 + * \retval 0 if lnet_peer_discovery_disabled is 1 + */ +int +LNetGetPeerDiscoveryStatus(void) +{ + return !lnet_peer_discovery_disabled; +} +EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c index 2f90e90849ac3..741711af0813f 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/config.c +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,6 +32,8 @@ #define DEBUG_SUBSYSTEM S_LNET +#include +#include #include #include #include @@ -123,10 +125,10 @@ lnet_ni_unique_net(struct list_head *nilist, char *iface) /* check that the NI is unique to the interfaces with in the same NI. * This is only a consideration if use_tcp_bonding is set */ static bool -lnet_ni_unique_ni(char *iface_list[LNET_NUM_INTERFACES], char *iface) +lnet_ni_unique_ni(char *iface_list[LNET_INTERFACES_NUM], char *iface) { int i; - for (i = 0; i < LNET_NUM_INTERFACES; i++) { + for (i = 0; i < LNET_INTERFACES_NUM; i++) { if (iface_list[i] != NULL && strncmp(iface_list[i], iface, strlen(iface)) == 0) return false; @@ -309,7 +311,7 @@ lnet_ni_free(struct lnet_ni *ni) if (ni->ni_cpts != NULL) cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); - for (i = 0; i < LNET_NUM_INTERFACES && + for (i = 0; i < LNET_INTERFACES_NUM && ni->ni_interfaces[i] != NULL; i++) { LIBCFS_FREE(ni->ni_interfaces[i], strlen(ni->ni_interfaces[i]) + 1); @@ -409,11 +411,11 @@ lnet_ni_add_interface(struct lnet_ni *ni, char *iface) * can free the tokens at the end of the function. * The newly allocated ni_interfaces[] can be * freed when freeing the NI */ - while (niface < LNET_NUM_INTERFACES && + while (niface < LNET_INTERFACES_NUM && ni->ni_interfaces[niface] != NULL) niface++; - if (niface >= LNET_NUM_INTERFACES) { + if (niface >= LNET_INTERFACES_NUM) { LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " "for net %s\n", libcfs_net2str(LNET_NIDNET(ni->ni_nid))); @@ -456,8 +458,9 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface) } spin_lock_init(&ni->ni_lock); - INIT_LIST_HEAD(&ni->ni_cptlist); INIT_LIST_HEAD(&ni->ni_netlist); + INIT_LIST_HEAD(&ni->ni_recovery); + LNetInvalidateMDHandle(&ni->ni_ping_mdh); ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*ni->ni_refs[0])); if (ni->ni_refs == NULL) @@ -476,12 +479,12 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface) ni->ni_nid = LNET_MKNID(net->net_id, 0); /* Store net namespace in which current ni is being created */ - if (current->nsproxy->net_ns != NULL) + if (current->nsproxy && current->nsproxy->net_ns) ni->ni_net_ns = get_net(current->nsproxy->net_ns); else - ni->ni_net_ns = NULL; + ni->ni_net_ns = get_net(&init_net); - ni->ni_last_alive = cfs_time_current_sec(); + ni->ni_last_alive = ktime_get_real_seconds(); ni->ni_state = LNET_NI_STATE_INIT; list_add_tail(&ni->ni_netlist, &net->net_ni_added); @@ -1121,26 +1124,26 @@ lnet_parse_priority(char *str, unsigned int *priority, char **token) } static int -lnet_parse_route (char *str, int *im_a_router) +lnet_parse_route(char *str, int *im_a_router) { /* static scratch buffer OK (single threaded) */ - static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; - struct list_head nets; - struct list_head gateways; + struct list_head nets; + struct list_head gateways; struct list_head *tmp1; struct list_head *tmp2; - __u32 net; - lnet_nid_t nid; - struct lnet_text_buf *ltb; - int rc; - char *sep; - char *token = str; - int ntokens = 0; - int myrc = -1; - __u32 hops; - int got_hops = 0; - unsigned int priority = 0; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + __u32 hops; + int got_hops = 0; + unsigned int priority = 0; INIT_LIST_HEAD(&gateways); INIT_LIST_HEAD(&nets); @@ -1214,8 +1217,7 @@ lnet_parse_route (char *str, int *im_a_router) goto token_error; nid = libcfs_str2nid(ltb->ltb_text); 
- if (nid == LNET_NID_ANY || - LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + if (nid == LNET_NID_ANY || nid == LNET_NID_LO_0) goto token_error; } } @@ -1603,11 +1605,12 @@ lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) } /* * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 + * kernel 4.18.0-193.el8: * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu * and removed for_ifa and endfor_ifa. * Use the _rntl variant as the current locking is rtnl. */ -#ifdef in_dev_for_each_ifa_rtnl +#ifdef HAVE_IN_DEV_FOR_EACH_IFA_RTNL #define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa #define endfor_ifa(in_dev) #else @@ -1653,7 +1656,7 @@ int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) if (nip >= nalloc) { struct lnet_inetdev *tmp; - nalloc += LNET_NUM_INTERFACES; + nalloc += LNET_INTERFACES_NUM; tmp = krealloc(ifaces, nalloc * sizeof(*tmp), GFP_KERNEL); if (!tmp) { @@ -1697,7 +1700,10 @@ lnet_parse_ip2nets (char **networksp, char *ip2nets) int rc; int i; - nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + if (current->nsproxy && current->nsproxy->net_ns) + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + else + nip = lnet_inet_enumerate(&ifaces, &init_net); if (nip < 0) { if (nip != -ENOENT) { LCONSOLE_ERROR_MSG(0x117, diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c index 3bca6b77539a6..354c9768a3a1d 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c @@ -159,8 +159,6 @@ LNetEQFree(struct lnet_handle_eq eqh) int size = 0; int i; - LASSERT(the_lnet.ln_refcount > 0); - lnet_res_lock(LNET_LOCK_EX); /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do * both EQ lookup and poll event with only lnet_eq_wait_lock */ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c index a3d0487063cbd..9bf890c9477b6 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c index b60106f949b69..b48d4af51b739 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,6 +36,8 @@ #define DEBUG_SUBSYSTEM S_LNET +#include + #include #include #include @@ -44,6 +46,119 @@ static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); +struct lnet_send_data { + struct lnet_ni *sd_best_ni; + struct lnet_peer_ni *sd_best_lpni; + struct lnet_peer_ni *sd_final_dst_lpni; + struct lnet_peer *sd_peer; + struct lnet_peer *sd_gw_peer; + struct lnet_peer_ni *sd_gw_lpni; + struct lnet_peer_net *sd_peer_net; + struct lnet_msg *sd_msg; + lnet_nid_t sd_dst_nid; + lnet_nid_t sd_src_nid; + lnet_nid_t sd_rtr_nid; + int sd_cpt; + int sd_md_cpt; + __u32 sd_send_case; +}; + +static inline struct lnet_comm_count * +get_stats_counts(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + switch (stats_type) { + case LNET_STATS_TYPE_SEND: + return &stats->el_send_stats; + case LNET_STATS_TYPE_RECV: + return &stats->el_recv_stats; + case LNET_STATS_TYPE_DROP: + return &stats->el_drop_stats; + default: + CERROR("Unknown stats type\n"); + } + + return NULL; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return; + + switch (msg_type) { + case LNET_MSG_ACK: + atomic_inc(&counts->co_ack_count); + break; + case LNET_MSG_PUT: + atomic_inc(&counts->co_put_count); + break; + case LNET_MSG_GET: + atomic_inc(&counts->co_get_count); + break; + case LNET_MSG_REPLY: + atomic_inc(&counts->co_reply_count); + break; + case LNET_MSG_HELLO: + atomic_inc(&counts->co_hello_count); + break; + default: + CERROR("There is a BUG in the code. Unknown message type\n"); + break; + } +} + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return 0; + + return (atomic_read(&counts->co_ack_count) + + atomic_read(&counts->co_put_count) + + atomic_read(&counts->co_get_count) + + atomic_read(&counts->co_reply_count) + + atomic_read(&counts->co_hello_count)); +} + +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, + struct lnet_comm_count *counts) +{ + msg_stats->ico_get_count = atomic_read(&counts->co_get_count); + msg_stats->ico_put_count = atomic_read(&counts->co_put_count); + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); +} + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats) +{ + struct lnet_comm_count *counts; + + LASSERT(msg_stats); + LASSERT(stats); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); + if (!counts) + return; + assign_stats(&msg_stats->im_send_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); + if (!counts) + return; + assign_stats(&msg_stats->im_recv_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); + if (!counts) + return; + assign_stats(&msg_stats->im_drop_stats, counts); +} + int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) { @@ -630,25 +745,29 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); msg->msg_hdr.type = cpu_to_le32(type); + /* dest_nid will be 
overwritten by lnet_select_pathway() */ + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); /* src_nid will be set later */ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); msg->msg_hdr.payload_length = cpu_to_le32(len); } -static void +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; - int rc; + void *priv = msg->msg_private; + int rc; - LASSERT (!in_interrupt ()); - LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT(!in_interrupt()); + LASSERT(ni->ni_nid == LNET_NID_LO_0 || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); - if (rc < 0) + if (rc < 0) { + msg->msg_no_resend = true; lnet_finalize(msg, rc); + } } static int @@ -686,7 +805,7 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) static void lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) { - cfs_time_t last_alive = 0; + time64_t last_alive = 0; int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); LASSERT(lnet_peer_aliveness_enabled(lp)); @@ -696,7 +815,7 @@ lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive); lnet_net_lock(cpt); - lp->lpni_last_query = cfs_time_current(); + lp->lpni_last_query = ktime_get_seconds(); if (last_alive != 0) /* NI has updated timestamp */ lp->lpni_last_alive = last_alive; @@ -704,10 +823,10 @@ lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) /* NB: always called with lnet_net_lock held */ static inline int -lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) +lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now) { - int alive; - cfs_time_t deadline; + int alive; + time64_t deadline; LASSERT (lnet_peer_aliveness_enabled(lp)); @@ -717,16 +836,14 @@ lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) */ spin_lock(&lp->lpni_lock); if (!lp->lpni_alive && lp->lpni_alive_count > 0 && - cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) { + lp->lpni_timestamp >= lp->lpni_last_alive) { spin_unlock(&lp->lpni_lock); return 0; } - deadline = - cfs_time_add(lp->lpni_last_alive, - cfs_time_seconds(lp->lpni_net->net_tunables. - lct_peer_timeout)); - alive = cfs_time_after(deadline, now); + deadline = lp->lpni_last_alive + + lp->lpni_net->net_tunables.lct_peer_timeout; + alive = deadline > now; /* * Update obsolete lp_alive except for routers assumed to be dead @@ -748,9 +865,10 @@ lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the lnet_net_lock */ static int -lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) +lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp, + struct lnet_msg *msg) { - cfs_time_t now = cfs_time_current(); + time64_t now = ktime_get_seconds(); if (!lnet_peer_aliveness_enabled(lp)) return -ENODEV; @@ -758,23 +876,29 @@ lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) if (lnet_peer_is_alive(lp, now)) return 1; + /* + * If we're resending a message, let's attempt to send it even if + * the peer is down to fulfill our resend quota on the message + */ + if (msg->msg_retry_count > 0) + return 1; + /* * Peer appears dead, but we should avoid frequent NI queries (at * most once per lnet_queryinterval seconds). 
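 * A rough worked example with made-up numbers: in lnet_peer_is_alive()
 * above, lct_peer_timeout = 180 and lpni_last_alive = 1000 give a
 * deadline of 1180, so the peer still counts as alive at now == 1150
 * but not at now == 1200. Here, with lnet_queryinterval == 1 and
 * lpni_last_query == 500, next_query is 501: a caller arriving at
 * now == 500 is throttled, while one arriving at now == 501 may ask
 * the LND for a fresh timestamp.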
*/ if (lp->lpni_last_query != 0) { static const int lnet_queryinterval = 1; + time64_t next_query; - cfs_time_t next_query = - cfs_time_add(lp->lpni_last_query, - cfs_time_seconds(lnet_queryinterval)); + next_query = lp->lpni_last_query + lnet_queryinterval; - if (cfs_time_before(now, next_query)) { + if (now < next_query) { if (lp->lpni_alive) CWARN("Unexpected aliveness of peer %s: " - "%d < %d (%d/%d)\n", + "%lld < %lld (%d/%d)\n", libcfs_nid2str(lp->lpni_nid), - (int)now, (int)next_query, + now, next_query, lnet_queryinterval, lp->lpni_net->net_tunables.lct_peer_timeout); return 0; @@ -814,20 +938,28 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); + /* can't get here if we're sending to the loopback interface */ + LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && - lnet_peer_alive_locked(ni, lp) == 0) { - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_peer_alive_locked(ni, lp, msg) == 0) { + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + msg->msg_len; lnet_net_unlock(cpt); if (msg->msg_txpeer) - atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count); + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); if (msg->msg_txni) - atomic_inc(&msg->msg_txni->ni_stats.drop_count); + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; if (do_send) lnet_finalize(msg, -EHOSTUNREACH); @@ -842,8 +974,12 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " "called on the MD/ME.\n", libcfs_id2str(msg->msg_target)); - if (do_send) + if (do_send) { + msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_id2str(msg->msg_target)); lnet_finalize(msg, -ECANCELED); + } lnet_net_lock(cpt); return -ECANCELED; @@ -888,6 +1024,15 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) } } + if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && + lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { + msg->msg_tx_delayed = 1; + return LNET_CREDIT_WAIT; + } + + /* unset the tx_delay flag as we're going to send it now */ + msg->msg_tx_delayed = 0; + if (do_send) { lnet_net_unlock(cpt); lnet_ni_send(ni, msg); @@ -983,6 +1128,9 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) msg->msg_niov = rbp->rbp_npages; msg->msg_kiov = &rb->rb_kiov[0]; + /* unset the msg-rx_delayed flag since we're receiving the message */ + msg->msg_rx_delayed = 0; + if (do_recv) { int cpt = msg->msg_rx_cpt; @@ -1082,15 +1230,6 @@ lnet_return_tx_credits_locked(struct lnet_msg *msg) } if (txpeer != NULL) { - /* - * TODO: - * Once the patch for the health comes in we need to set - * the health of the peer ni to bad when we fail to send - * a message. 
- * int status = msg->msg_ev.status; - * if (status != 0) - * lnet_set_peer_ni_health_locked(txpeer, false) - */ msg->msg_txpeer = NULL; lnet_peer_ni_decref_locked(txpeer); } @@ -1122,6 +1261,8 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; lnet_finalize(msg, -ECANCELED); } @@ -1268,7 +1409,7 @@ lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) } static struct lnet_peer_ni * -lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, +lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, lnet_nid_t rtr_nid) { struct lnet_remotenet *rnet; @@ -1282,7 +1423,7 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, /* If @rtr_nid is not LNET_NID_ANY, return the gateway with * rtr_nid nid, otherwise find the best gateway I can use */ - rnet = lnet_find_rnet_locked(LNET_NIDNET(target)); + rnet = lnet_find_rnet_locked(remote_net); if (rnet == NULL) return NULL; @@ -1327,30 +1468,42 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, } static struct lnet_ni * -lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, + struct lnet_peer *peer, struct lnet_peer_net *peer_net, int md_cpt) { - struct lnet_ni *ni = NULL, *best_ni = cur_ni; + struct lnet_ni *ni = NULL; unsigned int shortest_distance; int best_credits; + int best_healthv; + + /* + * If there is no peer_ni that we can send to on this network, + * then there is no point in looking for a new best_ni here. + */ + if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL)) + return best_ni; if (best_ni == NULL) { shortest_distance = UINT_MAX; best_credits = INT_MIN; + best_healthv = 0; } else { shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); + best_healthv = atomic_read(&best_ni->ni_healthv); } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { unsigned int distance; int ni_credits; - - if (!lnet_is_ni_healthy_locked(ni)) - continue; + int ni_healthv; + int ni_fatal; ni_credits = atomic_read(&ni->ni_tx_credits); + ni_healthv = atomic_read(&ni->ni_healthv); + ni_fatal = atomic_read(&ni->ni_fatal_error_on); /* * calculate the distance from the CPT on which @@ -1361,6 +1514,12 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, md_cpt, ni->ni_dev_cpt); + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", + libcfs_nid2str(ni->ni_nid), ni_credits, distance, + ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) + : "not seleced", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0); + /* * All distances smaller than the NUMA range * are treated equally. @@ -1369,383 +1528,242 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, distance = lnet_numa_range; /* - * Select on shorter distance, then available + * Select on health, shorter distance, available * credits, then round-robin. 
*/ - if (distance > shortest_distance) { + if (ni_fatal) { + continue; + } else if (ni_healthv < best_healthv) { + continue; + } else if (ni_healthv > best_healthv) { + best_healthv = ni_healthv; + /* + * If we're going to prefer this ni because it's + * the healthiest, then we should set the + * shortest_distance in the algorithm in case + * there are multiple NIs with the same health but + * different distances. + */ + if (distance < shortest_distance) + shortest_distance = distance; + } else if (distance > shortest_distance) { continue; } else if (distance < shortest_distance) { shortest_distance = distance; } else if (ni_credits < best_credits) { continue; } else if (ni_credits == best_credits) { - if (best_ni && (best_ni)->ni_seq <= ni->ni_seq) + if (best_ni && best_ni->ni_seq <= ni->ni_seq) continue; } best_ni = ni; best_credits = ni_credits; } + CDEBUG(D_NET, "selected best_ni %s\n", + (best_ni) ? libcfs_nid2str(best_ni->ni_nid) : "no selection"); + return best_ni; } -static int -lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, - struct lnet_msg *msg, lnet_nid_t rtr_nid) +/* + * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, + * because such traffic is required to perform discovery. We therefore + * exclude all GET and PUT on that portal. We also exclude all ACK and + * REPLY traffic, but that is because the portal is not tracked in the + * message structure for these message types. We could restrict this + * further by also checking for LNET_PROTO_PING_MATCHBITS. + */ +static bool +lnet_msg_discovery(struct lnet_msg *msg) { - struct lnet_ni *best_ni; - struct lnet_peer_ni *best_lpni; - struct lnet_peer_ni *best_gw; - struct lnet_peer_ni *lpni; - struct lnet_peer_ni *final_dst; - struct lnet_peer *peer; - struct lnet_peer_net *peer_net; - struct lnet_net *local_net; - __u32 seq; - int cpt, cpt2, rc; - bool routing; - bool routing2; - bool ni_is_pref; - bool preferred; - bool local_found; - int best_lpni_credits; - int md_cpt; + if (msg->msg_type == LNET_MSG_PUT) { + if (msg->msg_hdr.msg.put.ptl_index != LNET_RESERVED_PORTAL) + return true; + } else if (msg->msg_type == LNET_MSG_GET) { + if (msg->msg_hdr.msg.get.ptl_index != LNET_RESERVED_PORTAL) + return true; + } + return false; +} - /* - * get an initial CPT to use for locking. The idea here is not to - * serialize the calls to select_pathway, so that as many - * operations can run concurrently as possible. To do that we use - * the CPT where this call is being executed. Later on when we - * determine the CPT to use in lnet_message_commit, we switch the - * lock and check if there was any configuration change. If none, - * then we proceed, if there is, then we restart the operation. 
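
As a minimal standalone sketch (simplified types and values, not the real LNet structures) of the cascade lnet_get_best_ni() applies in the hunk above: skip NIs with a fatal error, then prefer higher health, then shorter NUMA distance clamped to the configured range, then more tx credits, then the least recently used NI.

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for a local NI, for illustration only. */
struct model_ni {
	int fatal;	/* interface reported a fatal error */
	int healthv;	/* health value, higher is better */
	int distance;	/* CPT distance from the MD's CPT */
	int credits;	/* available tx credits */
	int seq;	/* round-robin sequence, lower = used less recently */
};

static struct model_ni *
model_pick_ni(struct model_ni *nis, int n, int numa_range)
{
	struct model_ni *best = NULL;
	int best_healthv = 0;
	int best_credits = INT_MIN;
	int shortest = INT_MAX;
	int i;

	for (i = 0; i < n; i++) {
		struct model_ni *ni = &nis[i];
		int distance = ni->distance;

		/* distances inside the NUMA range are treated as equal */
		if (distance < numa_range)
			distance = numa_range;

		if (ni->fatal) {
			continue;
		} else if (ni->healthv < best_healthv) {
			continue;
		} else if (ni->healthv > best_healthv) {
			best_healthv = ni->healthv;
			if (distance < shortest)
				shortest = distance;
		} else if (distance > shortest) {
			continue;
		} else if (distance < shortest) {
			shortest = distance;
		} else if (ni->credits < best_credits) {
			continue;
		} else if (ni->credits == best_credits) {
			/* round-robin tie break on the sequence number */
			if (best && best->seq <= ni->seq)
				continue;
		}

		best = ni;
		best_credits = ni->credits;
	}
	return best;
}

int main(void)
{
	struct model_ni nis[] = {
		{ 0, 1000, 1,  8, 3 },	/* healthy, used recently */
		{ 0, 1000, 1,  8, 1 },	/* same health/credits, less used */
		{ 1, 1000, 0, 64, 0 },	/* most credits, but fatal error */
	};
	struct model_ni *best = model_pick_ni(nis, 3, 1);

	printf("picked ni #%d\n", (int)(best - nis));	/* picks #1 */
	return 0;
}
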
- */ - cpt = lnet_net_lock_current(); +#define SRC_SPEC 0x0001 +#define SRC_ANY 0x0002 +#define LOCAL_DST 0x0004 +#define REMOTE_DST 0x0008 +#define MR_DST 0x0010 +#define NMR_DST 0x0020 +#define SND_RESP 0x0040 + +/* The following to defines are used for return codes */ +#define REPEAT_SEND 0x1000 +#define PASS_THROUGH 0x2000 + +/* The different cases lnet_select pathway needs to handle */ +#define SRC_SPEC_LOCAL_MR_DST (SRC_SPEC | LOCAL_DST | MR_DST) +#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST) +#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST) +#define SRC_SPEC_ROUTER_NMR_DST (SRC_SPEC | REMOTE_DST | NMR_DST) +#define SRC_ANY_LOCAL_MR_DST (SRC_ANY | LOCAL_DST | MR_DST) +#define SRC_ANY_ROUTER_MR_DST (SRC_ANY | REMOTE_DST | MR_DST) +#define SRC_ANY_LOCAL_NMR_DST (SRC_ANY | LOCAL_DST | NMR_DST) +#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST) - md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); - if (md_cpt == CFS_CPT_ANY) - md_cpt = cpt; +static int +lnet_handle_lo_send(struct lnet_send_data *sd) +{ + struct lnet_msg *msg = sd->sd_msg; + int cpt = sd->sd_cpt; -again: - best_ni = NULL; - best_lpni = NULL; - best_gw = NULL; - final_dst = NULL; - local_net = NULL; - routing = false; - routing2 = false; - local_found = false; - - seq = lnet_get_dlc_seq_locked(); - - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - lnet_net_unlock(cpt); - return -ESHUTDOWN; - } + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(the_lnet.ln_loni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = + cpu_to_le64(the_lnet.ln_loni->ni_nid); + msg->msg_target.nid = the_lnet.ln_loni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = the_lnet.ln_loni; - peer = lnet_find_or_create_peer_locked(dst_nid, cpt); - if (IS_ERR(peer)) { - lnet_net_unlock(cpt); - return PTR_ERR(peer); - } + return LNET_CREDIT_OK; +} - /* If peer is not healthy then can not send anything to it */ - if (!lnet_is_peer_healthy_locked(peer)) { - lnet_net_unlock(cpt); - return -EHOSTUNREACH; - } +static int +lnet_handle_send(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = sd->sd_best_ni; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni; + struct lnet_msg *msg = sd->sd_msg; + int cpt2; + __u32 send_case = sd->sd_send_case; + int rc; + __u32 routing = send_case & REMOTE_DST; + struct lnet_rsp_tracker *rspt; - if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) { - lnet_net_unlock(cpt); - CERROR("peer %s is declared to be non MR capable, " - "yet configured with more than one NID\n", - libcfs_nid2str(dst_nid)); - return -EINVAL; - } + /* + * Increment sequence number of the selected peer so that we + * pick the next one in Round Robin. + */ + best_lpni->lpni_seq++; /* - * STEP 1: first jab at determining best_ni - * if src_nid is explicitly specified, then best_ni is already - * pre-determiend for us. Otherwise we need to select the best - * one to use later on + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. 
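
The cpt2 / lnet_get_dlc_seq_locked() / REPEAT_SEND sequence in the hunk just below is an optimistic retry: remember the configuration generation, drop the per-CPT lock, re-take the lock for the CPT the message actually commits on, and redo the selection if the configuration moved underneath us. A minimal userspace sketch of that pattern follows; all names here are toy stand-ins, not LNet APIs.

#include <stdio.h>

static unsigned int dlc_seq;			/* toy config generation counter */
static void net_lock(int cpt)   { (void)cpt; }	/* toy per-CPT lock */
static void net_unlock(int cpt) { (void)cpt; }

/* Pretend another thread reconfigures the net during the unlocked window. */
static void maybe_reconfigure(int attempt)
{
	if (attempt == 0)
		dlc_seq++;
}

/* Commit a message, switching to target_cpt if it differs from start_cpt. */
static int commit_msg(int start_cpt, int target_cpt)
{
	int cpt = start_cpt;
	int attempt = 0;

again:
	net_lock(cpt);
	/* ... NI/peer selection happens under the lock ... */
	if (cpt != target_cpt) {
		unsigned int seq = dlc_seq;	/* remember the generation */

		net_unlock(cpt);
		cpt = target_cpt;
		maybe_reconfigure(attempt++);	/* unlocked window */
		net_lock(cpt);
		if (seq != dlc_seq) {
			/* config changed while unlocked: this is the
			 * situation REPEAT_SEND reports in the patch */
			net_unlock(cpt);
			goto again;
		}
	}
	/* ... the lnet_msg_commit()-equivalent would run here ... */
	net_unlock(cpt);
	return cpt;
}

int main(void)
{
	printf("committed on cpt %d\n", commit_msg(0, 2));
	return 0;
}
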
*/ - if (src_nid != LNET_NID_ANY) { - best_ni = lnet_nid2ni_locked(src_nid, cpt); - if (!best_ni) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("Can't send to %s: src %s is not a " - "local nid\n", libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EINVAL; - } - } + lnet_peer_ni_addref_locked(best_lpni); - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK || - !peer->lp_multi_rail || - best_ni) { - /* - * for replies we want to respond on the same peer_ni we - * received the message on if possible. If not, then pick - * a peer_ni to send to - * - * if the peer is non-multi-rail then you want to send to - * the dst_nid provided as well. - * - * If the best_ni has already been determined, IE the - * src_nid has been specified, then use the - * destination_nid provided as well, since we're - * continuing a series of related messages for the same - * RPC. - * - * It is expected to find the lpni using dst_nid, since we - * created it earlier. - */ - best_lpni = lnet_find_peer_ni_locked(dst_nid); - if (best_lpni) + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. + */ + cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + if (sd->sd_cpt != cpt2) { + __u32 seq = lnet_get_dlc_seq_locked(); + lnet_net_unlock(sd->sd_cpt); + sd->sd_cpt = cpt2; + lnet_net_lock(sd->sd_cpt); + if (seq != lnet_get_dlc_seq_locked()) { lnet_peer_ni_decref_locked(best_lpni); - - if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) { - /* - * this lpni is not on a local network so we need - * to route this reply. - */ - best_gw = lnet_find_route_locked(NULL, - best_lpni->lpni_nid, - rtr_nid); - if (best_gw) { - /* - * RULE: Each node considers only the next-hop - * - * We're going to route the message, so change the peer to - * the router. - */ - LASSERT(best_gw->lpni_peer_net); - LASSERT(best_gw->lpni_peer_net->lpn_peer); - peer = best_gw->lpni_peer_net->lpn_peer; - - /* - * if the router is not multi-rail then use the best_gw - * found to send the message to - */ - if (!peer->lp_multi_rail) - best_lpni = best_gw; - else - best_lpni = NULL; - - routing = true; - } else { - best_lpni = NULL; - } - } else if (!best_lpni) { - lnet_net_unlock(cpt); - CERROR("unable to send msg_type %d to " - "originating %s. Destination NID not in DB\n", - msg->msg_type, libcfs_nid2str(dst_nid)); - return -EINVAL; + return REPEAT_SEND; } } /* - * if the peer is not MR capable, then we should always send to it - * using the first NI in the NET we determined. 
+ * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions */ - if (!peer->lp_multi_rail) { - if (!best_lpni) { - lnet_net_unlock(cpt); - CERROR("no route to %s\n", - libcfs_nid2str(dst_nid)); - return -EHOSTUNREACH; - } - - /* best ni could be set because src_nid was provided */ - if (!best_ni) { - best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt); - if (!best_ni) { - lnet_net_unlock(cpt); - CERROR("no path to %s from net %s\n", - libcfs_nid2str(best_lpni->lpni_nid), - libcfs_net2str(best_lpni->lpni_net->net_id)); - return -EHOSTUNREACH; - } - } - } + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; /* - * if we already found a best_ni because src_nid is specified and - * best_lpni because we are replying to a message then just send - * the message + * grab a reference for the best_ni since now it's in use in this + * send. The reference will be dropped in lnet_finalize() */ - if (best_ni && best_lpni) - goto send; + lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt); /* - * If we already found a best_ni because src_nid is specified then - * pick the peer then send the message + * Always set the target.nid to the best peer picked. Either the + * NID will be one of the peer NIDs selected, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed */ - if (best_ni) - goto pick_peer; + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; /* - * pick the best_ni by going through all the possible networks of - * that peer and see which local NI is best suited to talk to that - * peer. - * - * Locally connected networks will always be preferred over - * a routed network. If there are only routed paths to the peer, - * then the best route is chosen. If all routes are equal then - * they are used in round robin. + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { - if (!lnet_is_peer_net_healthy_locked(peer_net)) - continue; - - local_net = lnet_get_net_locked(peer_net->lpn_net_id); - if (!local_net && !routing && !local_found) { - struct lnet_peer_ni *net_gw; - - lpni = list_entry(peer_net->lpn_peer_nis.next, - struct lnet_peer_ni, - lpni_on_peer_net_list); - - net_gw = lnet_find_route_locked(NULL, - lpni->lpni_nid, - rtr_nid); - if (!net_gw) - continue; - - if (best_gw) { - /* - * lnet_find_route_locked() call - * will return the best_Gw on the - * lpni->lpni_nid network. - * However, best_gw and net_gw can - * be on different networks. - * Therefore need to compare them - * to pick the better of either. - */ - if (lnet_compare_peers(best_gw, net_gw) > 0) - continue; - if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq) - continue; - } - best_gw = net_gw; - final_dst = lpni; - - routing2 = true; - } else { - best_gw = NULL; - final_dst = NULL; - routing2 = false; - local_found = true; - } - - /* - * a gw on this network is found, but there could be - * other better gateways on other networks. So don't pick - * the best_ni until we determine the best_gw. - */ - if (best_gw) - continue; + lnet_msg_commit(msg, sd->sd_cpt); - /* if no local_net found continue */ - if (!local_net) - continue; + /* + * If we are routing the message then we keep the src_nid that was + * set by the originator. If we are not routing then we are the + * originator and set it here. 
+ */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; /* - * Iterate through the NIs in this local Net and select - * the NI to send from. The selection is determined by - * these 3 criterion in the following priority: - * 1. NUMA - * 2. NI available credits - * 3. Round Robin + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + * + * final_dst_lpni is set at the beginning of the + * lnet_select_pathway() function and is never changed. + * It's safe to use it here. */ - best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt); - } - - if (!best_ni && !best_gw) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("No local ni found to send from to %s\n", - libcfs_nid2str(dst_nid)); - return -EINVAL; - } - - if (!best_ni) { - best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt); - LASSERT(best_gw && best_ni); - + msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid); + } else { /* - * We're going to route the message, so change the peer to - * the router. + * if we're not routing set the dest_nid to the best peer + * ni NID that we picked earlier in the algorithm. */ - LASSERT(best_gw->lpni_peer_net); - LASSERT(best_gw->lpni_peer_net->lpn_peer); - best_gw->lpni_gw_seq++; - peer = best_gw->lpni_peer_net->lpn_peer; + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); } /* - * Now that we selected the NI to use increment its sequence - * number so the Round Robin algorithm will detect that it has - * been used and pick the next NI. - */ - best_ni->ni_seq++; - -pick_peer: - /* - * At this point the best_ni is on a local network on which - * the peer has a peer_ni as well + * if we have response tracker block update it with the next hop + * nid */ - peer_net = lnet_peer_get_net_locked(peer, - best_ni->ni_net->net_id); - /* - * peer_net is not available or the src_nid is explicitly defined - * and the peer_net for that src_nid is unhealthy. find a route to - * the destination nid. - */ - if (!peer_net || - (src_nid != LNET_NID_ANY && - !lnet_is_peer_net_healthy_locked(peer_net))) { - best_gw = lnet_find_route_locked(best_ni->ni_net, - dst_nid, - rtr_nid); - /* - * if no route is found for that network then - * move onto the next peer_ni in the peer - */ - if (!best_gw) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("No route to peer from %s\n", - libcfs_nid2str(best_ni->ni_nid)); - return -EHOSTUNREACH; + if (msg->msg_md) { + rspt = msg->msg_md->md_rspt_ptr; + if (rspt) { + rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid; + CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", + libcfs_nid2str(rspt->rspt_next_hop_nid)); } + } - CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(best_gw->lpni_nid), - lnet_msgtyp2str(msg->msg_type), msg->msg_len); + rc = lnet_post_send_locked(msg, 0); - routing2 = true; - /* - * RULE: Each node considers only the next-hop - * - * We're going to route the message, so change the peer to - * the router. 
- */ - LASSERT(best_gw->lpni_peer_net); - LASSERT(best_gw->lpni_peer_net->lpn_peer); - peer = best_gw->lpni_peer_net->lpn_peer; - } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { - /* - * this peer_net is unhealthy but we still have an opportunity - * to find another peer_net that we can use - */ - __u32 net_id = peer_net->lpn_net_id; - LCONSOLE_WARN("peer net %s unhealthy\n", - libcfs_net2str(net_id)); - goto again; - } + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nid2str(sd->sd_src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); + + return rc; +} +static struct lnet_peer_ni * +lnet_select_peer_ni(struct lnet_send_data *sd, struct lnet_peer *peer, + struct lnet_peer_net *peer_net) +{ /* * Look at the peer NIs for the destination peer that connect * to the chosen net. If a peer_ni is preferred when using the @@ -1754,24 +1772,45 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, * the available transmit credits are used. If the transmit * credits are equal, we round-robin over the peer_ni. */ - lpni = NULL; - best_lpni_credits = INT_MIN; - preferred = false; - best_lpni = NULL; + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_ni *best_ni = sd->sd_best_ni; + lnet_nid_t dst_nid = sd->sd_dst_nid; + int best_lpni_credits = INT_MIN; + bool preferred = false; + bool ni_is_pref; + int best_lpni_healthv = 0; + int lpni_healthv; + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* - * if this peer ni is not healthy just skip it, no point in - * examining it further + * if the best_ni we've chosen aleady has this lpni + * preferred, then let's use it */ - if (!lnet_is_peer_ni_healthy_locked(lpni)) - continue; - ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + best_ni->ni_nid); - /* if this is a preferred peer use it */ - if (!preferred && ni_is_pref) { - preferred = true; - } else if (preferred && !ni_is_pref) { - /* + lpni_healthv = atomic_read(&lpni->lpni_healthv); + + CDEBUG(D_NET, "%s ni_is_pref = %d\n", + libcfs_nid2str(best_ni->ni_nid), ni_is_pref); + + if (best_lpni) + CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", + libcfs_nid2str(lpni->lpni_nid), + lpni->lpni_txcredits, best_lpni_credits, + lpni->lpni_seq, best_lpni->lpni_seq); + + /* pick the healthiest peer ni */ + if (lpni_healthv < best_lpni_healthv) { + continue; + } else if (lpni_healthv > best_lpni_healthv) { + best_lpni_healthv = lpni_healthv; + /* if this is a preferred peer use it */ + } else if (!preferred && ni_is_pref) { + preferred = true; + } else if (preferred && !ni_is_pref) { + /* * this is not the preferred peer so let's ignore * it. */ @@ -1804,174 +1843,1924 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, if (!best_lpni) { __u32 net_id = (peer_net) ? 
peer_net->lpn_net_id : LNET_NIDNET(dst_nid); - lnet_net_unlock(cpt); - LCONSOLE_WARN("no peer_ni found on peer net %s\n", + CDEBUG(D_NET, "no peer_ni found on peer net %s\n", libcfs_net2str(net_id)); + return NULL; + } + + CDEBUG(D_NET, "sd_best_lpni = %s\n", + libcfs_nid2str(best_lpni->lpni_nid)); + + return best_lpni; +} + +/* + * Prerequisite: the best_ni should already be set in the sd + */ +static inline struct lnet_peer_ni * +lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, + __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + /* + * The gateway is Multi-Rail capable so now we must select the + * proper peer_ni + */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + + if (!peer_net) { + CERROR("gateway peer %s has no NI on net %s\n", + libcfs_nid2str(peer->lp_primary_nid), + libcfs_net2str(net_id)); + return NULL; + } + + return lnet_select_peer_ni(sd, peer, peer_net); +} + +static inline void +lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +{ + if (sd->sd_send_case & NMR_DST && + sd->sd_msg->msg_type != LNET_MSG_REPLY && + sd->sd_msg->msg_type != LNET_MSG_ACK && + sd->sd_best_lpni->lpni_pref_nnids == 0) { + CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", + libcfs_nid2str(sd->sd_best_ni->ni_nid), + libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, + sd->sd_best_ni->ni_nid); + } +} + +/* + * Source Specified + * Local Destination + * non-mr peer + * + * use the source and destination NIDs as the pathway + */ +static int +lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) +{ + /* the destination lpni is set before we get here. */ + + /* find local NI */ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + /* + * the preferred NID will only be set for NMR peers + */ + lnet_set_non_mr_pref_nid(sd); + + return lnet_handle_send(sd); +} + +/* + * Source Specified + * Local Destination + * MR Peer + * + * Don't run the selection algorithm on the peer NIs. By specifying the + * local NID, we're also saying that we should always use the destination NID + * provided. This handles the case where we should be using the same + * destination NID for the all the messages which belong to the same RPC + * request. + */ +static int +lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) +{ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + if (sd->sd_best_lpni && + sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) + return lnet_handle_lo_send(sd); + else if (sd->sd_best_lpni) + return lnet_handle_send(sd); + + CERROR("can't send to %s. no NI on %s\n", + libcfs_nid2str(sd->sd_dst_nid), + libcfs_net2str(sd->sd_best_ni->ni_net->net_id)); + + return -EHOSTUNREACH; +} + +struct lnet_ni * +lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, + struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + int cpt, + bool incr_seq) +{ + struct lnet_net *local_net; + struct lnet_ni *best_ni; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) + return NULL; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. 
The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. Round Robin + */ + best_ni = lnet_get_best_ni(local_net, cur_best_ni, + peer, peer_net, cpt); + + if (incr_seq && best_ni) + best_ni->ni_seq++; + + return best_ni; +} + +static int +lnet_handle_find_routed_path(struct lnet_send_data *sd, + lnet_nid_t dst_nid, + struct lnet_peer_ni **gw_lpni, + struct lnet_peer **gw_peer) +{ + struct lnet_peer_ni *gw; + lnet_nid_t src_nid = sd->sd_src_nid; + + gw = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid), + sd->sd_rtr_nid); + if (!gw) { + CERROR("no route to %s from %s\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); return -EHOSTUNREACH; } + /* get the peer of the gw_ni */ + LASSERT(gw->lpni_peer_net); + LASSERT(gw->lpni_peer_net->lpn_peer); + + *gw_peer = gw->lpni_peer_net->lpn_peer; + + if (!sd->sd_best_ni) + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, *gw_peer, + gw->lpni_peer_net, + sd->sd_md_cpt, + true); + + if (!sd->sd_best_ni) { + CERROR("Internal Error. Expected local ni on %s " + "but non found :%s\n", + libcfs_net2str(gw->lpni_peer_net->lpn_net_id), + libcfs_nid2str(sd->sd_src_nid)); + return -EFAULT; + } + + /* + * if gw is MR let's find its best peer_ni + */ + if (lnet_peer_is_multi_rail(*gw_peer)) { + gw = lnet_find_best_lpni_on_net(sd, *gw_peer, + sd->sd_best_ni->ni_net->net_id); + /* + * We've already verified that the gw has an NI on that + * desired net, but we're not finding it. Something is + * wrong. + */ + if (!gw) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EFAULT; + } + } + + *gw_lpni = gw; + + return 0; +} + +/* + * Handle two cases: + * + * Case 1: + * Source specified + * Remote destination + * Non-MR destination + * + * Case 2: + * Source specified + * Remote destination + * MR destination + * + * The handling of these two cases is similar. Even though the destination + * can be MR or non-MR, we'll deal directly with the router. + */ +static int +lnet_handle_spec_router_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* find local NI */ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + if (sd->sd_send_case & NMR_DST) + /* + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd); + + /* + * We're going to send to the gw found so let's set its + * info + */ + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +struct lnet_ni * +lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) +{ + struct lnet_peer_net *peer_net = NULL; + struct lnet_ni *best_ni = NULL; + + /* + * The peer can have multiple interfaces, some of them can be on + * the local network and others on a routed network. We should + * prefer the local network. However if the local network is not + * available then we need to try the routed network + */ + + /* go through all the peer nets and find the best_ni */ + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + /* + * The peer's list of nets can contain non-local nets. 
We + * want to only examine the local ones. + */ + if (!lnet_get_net_locked(peer_net->lpn_net_id)) + continue; + best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, + peer_net, md_cpt, false); + } + + if (best_ni) + /* increment sequence number so we can round robin */ + best_ni->ni_seq++; + + return best_ni; +} + +static struct lnet_ni * +lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_net *peer_net; + struct lnet_peer *peer = sd->sd_peer; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *lpni; + int cpt = sd->sd_cpt; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + + /* Get the target peer_ni */ + peer_net = lnet_peer_get_net_locked(peer, + LNET_NIDNET(best_lpni->lpni_nid)); + LASSERT(peer_net != NULL); + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni->lpni_pref_nnids == 0) + continue; + LASSERT(lpni->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked( + lpni->lpni_pref.nid, cpt); + break; + } + + return best_ni; +} + +/* Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set */ +static int +lnet_select_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + + best_ni = lnet_find_existing_preferred_best_ni(sd); + + /* if best_ni is still not set just pick one */ + if (!best_ni) { + best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt, true); + /* If there is no best_ni we don't have a route */ + if (!best_ni) { + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + } + + sd->sd_best_ni = best_ni; + + /* Set preferred NI if necessary. */ + lnet_set_non_mr_pref_nid(sd); + + return 0; +} + + +/* + * Source not specified + * Local destination + * Non-MR Peer + * + * always use the same source NID for NMR peers + * If we've talked to that peer before then we already have a preferred + * source NI associated with it. Otherwise, we select a preferred local NI + * and store it in the peer + */ +static int +lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + + /* sd->sd_best_lpni is already set to the final destination */ + + /* + * At this point we should've created the peer ni and peer. If we + * can't find it, then something went wrong. Instead of assert + * output a relevant message and fail the send + */ + if (!sd->sd_best_lpni) { + CERROR("Internal fault. Unable to send msg %s to %s. 
" + "NID not known\n", + lnet_msgtyp2str(sd->sd_msg->msg_type), + libcfs_nid2str(sd->sd_dst_nid)); + return -EFAULT; + } + + rc = lnet_select_preferred_best_ni(sd); + if (!rc) + rc = lnet_handle_send(sd); + + return rc; +} + +static int +lnet_handle_any_mr_dsta(struct lnet_send_data *sd) +{ + /* + * NOTE we've already handled the remote peer case. So we only + * need to worry about the local case here. + * + * if we're sending a response, ACK or reply, we need to send it + * to the destination NID given to us. At this point we already + * have the peer_ni we're suppose to send to, so just find the + * best_ni on the peer net and use that. Since we're sending to an + * MR peer then we can just run the selection algorithm on our + * local NIs and pick the best one. + */ + if (sd->sd_send_case & SND_RESP) { + sd->sd_best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt, true); + + if (!sd->sd_best_ni) { + /* + * We're not going to deal with not able to send + * a response to the provided final destination + */ + CERROR("Can't send response to %s. " + "No local NI available\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + return lnet_handle_send(sd); + } + + /* + * If we get here that means we're sending a fresh request, PUT or + * GET, so we need to run our standard selection algorithm. + * First find the best local interface that's on any of the peer's + * networks. + */ + sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, + sd->sd_md_cpt); + if (sd->sd_best_ni) { + sd->sd_best_lpni = + lnet_find_best_lpni_on_net(sd, sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); + + /* + * if we're successful in selecting a peer_ni on the local + * network, then send to it. Otherwise fall through and + * try and see if we can reach it over another routed + * network + */ + if (sd->sd_best_lpni && + sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_lo_send(sd); + } else if (sd->sd_best_lpni) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_send(sd); + } + + CERROR("Internal Error. Expected to have a best_lpni: " + "%s -> %s\n", + libcfs_nid2str(sd->sd_src_nid), + libcfs_nid2str(sd->sd_dst_nid)); + + return -EFAULT; + } + + /* + * Peer doesn't have a local network. Let's see if there is + * a remote network we can reach it on. + */ + return PASS_THROUGH; +} + +/* + * Case 1: + * Source NID not specified + * Local destination + * MR peer + * + * Case 2: + * Source NID not speified + * Remote destination + * MR peer + * + * In both of these cases if we're sending a response, ACK or REPLY, then + * we need to send to the destination NID provided. + * + * In the remote case let's deal with MR routers. 
+ * + */ + +static int +lnet_handle_any_mr_dst(struct lnet_send_data *sd) +{ + int rc = 0; + struct lnet_peer *gw_peer = NULL; + struct lnet_peer_ni *gw_lpni = NULL; + + /* + * handle sending a response to a remote peer here so we don't + * have to worry about it if we hit lnet_handle_any_mr_dsta() + */ + if (sd->sd_send_case & REMOTE_DST && + sd->sd_send_case & SND_RESP) { + struct lnet_peer_ni *gw; + struct lnet_peer *gw_peer; + + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw, + &gw_peer); + if (rc < 0) { + CERROR("Can't send response to %s. " + "No route available\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + sd->sd_best_lpni = gw; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); + } + + /* + * Even though the NID for the peer might not be on a local network, + * since the peer is MR there could be other interfaces on the + * local network. In that case we'd still like to prefer the local + * network over the routed network. If we're unable to do that + * then we select the best router among the different routed networks, + * and if the router is MR then we can deal with it as such. + */ + rc = lnet_handle_any_mr_dsta(sd); + if (rc != PASS_THROUGH) + return rc; + + /* + * TODO; One possible enhancement is to run the selection + * algorithm on the peer. However for remote peers the credits are + * not decremented, so we'll be basically going over the peer NIs + * in round robin. An MR router will run the selection algorithm + * on the next-hop interfaces. + */ + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + sd->sd_send_case &= ~LOCAL_DST; + sd->sd_send_case |= REMOTE_DST; + + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +/* + * Source not specified + * Remote destination + * Non-MR peer + * + * Must send to the specified peer NID using the same source NID that + * we've used before. If it's the first time to talk to that peer then + * find the source NI and assign it as preferred to that peer + */ +static int +lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* + * Let's set if we have a preferred NI to talk to this NMR peer + */ + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + + /* + * find the router and that'll find the best NI if we didn't find + * it already. + */ + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + /* + * set the best_ni we've chosen as the preferred one for + * this peer + */ + lnet_set_non_mr_pref_nid(sd); + + /* we'll be sending to the gw */ + sd->sd_best_lpni = gw_lpni; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); +} + +static int +lnet_handle_send_case_locked(struct lnet_send_data *sd) +{ + /* + * turn off the SND_RESP bit. + * It will be checked in the case handling + */ + __u32 send_case = sd->sd_send_case &= ~SND_RESP ; + + CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n", + (send_case & SRC_SPEC) ? "Specified: " : "ANY", + (send_case & SRC_SPEC) ? libcfs_nid2str(sd->sd_src_nid) : "", + (send_case & MR_DST) ? "MR: " : "NMR: ", + libcfs_nid2str(sd->sd_dst_nid), + (send_case & LOCAL_DST) ? 
"local" : "routed"); + + switch (send_case) { + /* + * For all cases where the source is specified, we should always + * use the destination NID, whether it's an MR destination or not, + * since we're continuing a series of related messages for the + * same RPC + */ + case SRC_SPEC_LOCAL_NMR_DST: + return lnet_handle_spec_local_nmr_dst(sd); + case SRC_SPEC_LOCAL_MR_DST: + return lnet_handle_spec_local_mr_dst(sd); + case SRC_SPEC_ROUTER_NMR_DST: + case SRC_SPEC_ROUTER_MR_DST: + return lnet_handle_spec_router_dst(sd); + case SRC_ANY_LOCAL_NMR_DST: + return lnet_handle_any_local_nmr_dst(sd); + case SRC_ANY_LOCAL_MR_DST: + case SRC_ANY_ROUTER_MR_DST: + return lnet_handle_any_mr_dst(sd); + case SRC_ANY_ROUTER_NMR_DST: + return lnet_handle_any_router_nmr_dst(sd); + default: + CERROR("Unknown send case\n"); + return -1; + } +} + +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + + memset(&send_data, 0, sizeof(send_data)); + + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. If none, + * then we proceed, if there is, then we restart the operation. + */ + cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + +again: + + /* + * If we're being asked to send to the loopback interface, there + * is no need to go through any selection. We can just shortcut + * the entire process and send over lolnd + */ + send_data.sd_msg = msg; + send_data.sd_cpt = cpt; + if (dst_nid == LNET_NID_LO_0) { + rc = lnet_handle_lo_send(&send_data); + lnet_net_unlock(cpt); + return rc; + } + + /* + * find an existing peer_ni, or create one and mark it as having been + * created due to network traffic. This call will create the + * peer->peer_net->peer_ni tree. + */ + lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + return PTR_ERR(lpni); + } + + /* + * Cache the original src_nid. If we need to resend the message + * then we'll need to know whether the src_nid was originally + * specified for this message. If it was originally specified, + * then we need to keep using the same src_nid since it's + * continuing the same sequence of messages. + */ + msg->msg_src_nid_param = src_nid; + + /* + * Now that we have a peer_ni, check if we want to discover + * the peer. Traffic to the LNET_RESERVED_PORTAL should not + * trigger discovery. + */ + peer = lpni->lpni_peer_net->lpn_peer; + if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) { + lnet_nid_t primary_nid; + rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + return rc; + } + /* The peer may have changed. 
*/ + peer = lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lnet_peer_is_uptodate_locked(peer)) { + spin_unlock(&peer->lp_lock); + } else { + /* queue message and return */ + msg->msg_rtr_nid_param = rtr_nid; + msg->msg_sending = 0; + list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); + primary_nid = peer->lp_primary_nid; + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "%s pending discovery\n", + libcfs_nid2str(primary_nid)); + + return LNET_DC_WAIT; + } + } + lnet_peer_ni_decref_locked(lpni); + peer = lpni->lpni_peer_net->lpn_peer; + + /* + * Identify the different send cases + */ + if (src_nid == LNET_NID_ANY) + send_case |= SRC_ANY; + else + send_case |= SRC_SPEC; + + if (lnet_get_net_locked(LNET_NIDNET(dst_nid))) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + + /* + * if this is a non-MR peer or if we're recovering a peer ni then + * let's consider this an NMR case so we can hit the destination + * NID. + */ + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + send_case |= NMR_DST; + else + send_case |= MR_DST; + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK) + send_case |= SND_RESP; + + /* assign parameters to the send_data */ + send_data.sd_rtr_nid = rtr_nid; + send_data.sd_src_nid = src_nid; + send_data.sd_dst_nid = dst_nid; + send_data.sd_best_lpni = lpni; + /* + * keep a pointer to the final destination in case we're going to + * route, so we'll need to access it later + */ + send_data.sd_final_dst_lpni = lpni; + send_data.sd_peer = peer; + send_data.sd_md_cpt = md_cpt; + send_data.sd_send_case = send_case; + + rc = lnet_handle_send_case_locked(&send_data); + + /* + * Update the local cpt since send_data.sd_cpt might've been + * updated as a result of calling lnet_handle_send_case_locked(). + */ + cpt = send_data.sd_cpt; + + if (rc == REPEAT_SEND) + goto again; + + lnet_net_unlock(cpt); + + return rc; +} + +int +lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; + + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(msg->msg_txni == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) + return rc; + + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ + return 0; +} + +enum lnet_mt_event_type { + MT_TYPE_LOCAL_NI = 0, + MT_TYPE_PEER_NI +}; + +struct lnet_mt_event_info { + enum lnet_mt_event_type mt_type; + lnet_nid_t mt_nid; +}; + +/* called with res_lock held */ +void +lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) +{ + struct lnet_rsp_tracker *rspt; + + /* + * msg has a refcount on the MD so the MD is not going away. + * The rspt queue for the cpt is protected by + * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie. 
+ */ + if (!md->md_rspt_ptr) + return; + + rspt = md->md_rspt_ptr; + + /* debug code */ + LASSERT(rspt->rspt_cpt == cpt); + + md->md_rspt_ptr = NULL; + + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + /* + * The monitor thread has invalidated this handle because the + * response timed out, but it failed to lookup the MD. That + * means this response tracker is on the zombie list. We can + * safely remove it under the resource lock (held by caller) and + * free the response tracker block. + */ + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, cpt); + } else { + /* + * invalidate the handle to indicate that a response has been + * received, which will then lead the monitor thread to clean up + * the rspt block. + */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + } +} + +void +lnet_clean_zombie_rstqs(void) +{ + struct lnet_rsp_tracker *rspt, *tmp; + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + list_for_each_entry_safe(rspt, tmp, + the_lnet.ln_mt_zombie_rstqs[i], + rspt_on_list) { + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + } + } + + cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); +} + +static void +lnet_finalize_expired_responses(void) +{ + struct lnet_libmd *md; + struct list_head local_queue; + struct lnet_rsp_tracker *rspt, *tmp; + ktime_t now; + int i; + + if (the_lnet.ln_mt_rstq == NULL) + return; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + INIT_LIST_HEAD(&local_queue); + + lnet_net_lock(i); + if (!the_lnet.ln_mt_rstq[i]) { + lnet_net_unlock(i); + continue; + } + list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); + lnet_net_unlock(i); + + now = ktime_get(); + + list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { + /* + * The rspt mdh will be invalidated when a response + * is received or whenever we want to discard the + * block the monitor thread will walk the queue + * and clean up any rsts with an invalid mdh. + * The monitor thread will walk the queue until + * the first unexpired rspt block. This means that + * some rspt blocks which received their + * corresponding responses will linger in the + * queue until they are cleaned up eventually. + */ + lnet_res_lock(i); + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + lnet_res_unlock(i); + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + continue; + } + + if (ktime_compare(now, rspt->rspt_deadline) >= 0 || + the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { + struct lnet_peer_ni *lpni; + lnet_nid_t nid; + + md = lnet_handle2md(&rspt->rspt_mdh); + if (!md) { + /* MD has been queued for unlink, but + * rspt hasn't been detached (Note we've + * checked above that the rspt_mdh is + * valid). Since we cannot lookup the MD + * we're unable to detach the rspt + * ourselves. Thus, move the rspt to the + * zombie list where we'll wait for + * either: + * 1. The remaining operations on the + * MD to complete. In this case the + * final operation will result in + * lnet_msg_detach_md()-> + * lnet_detach_rsp_tracker() where + * we will clean up this response + * tracker. + * 2. LNet to shutdown. In this case + * we'll wait until after all LND Nets + * have shutdown and then we can + * safely free any remaining response + * tracker blocks on the zombie list. + * Note: We need to hold the resource + * lock when adding to the zombie list + * because we may have concurrent access + * with lnet_detach_rsp_tracker(). 
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + list_move(&rspt->rspt_on_list, + the_lnet.ln_mt_zombie_rstqs[i]); + lnet_res_unlock(i); + continue; + } + LASSERT(md->md_rspt_ptr == rspt); + md->md_rspt_ptr = NULL; + lnet_res_unlock(i); + + LNetMDUnlink(rspt->rspt_mdh); + + nid = rspt->rspt_next_hop_nid; + + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + + /* If we're shutting down we just want to clean + * up the rspt blocks + */ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + continue; + + lnet_net_lock(i); + the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; + lnet_net_unlock(i); + + CDEBUG(D_NET, + "Response timeout: md = %p: nid = %s\n", + md, libcfs_nid2str(nid)); + + /* + * If there is a timeout on the response + * from the next hop decrement its health + * value so that we don't use it + */ + lnet_net_lock(0); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lnet_handle_remote_failure_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(0); + } else { + lnet_res_unlock(i); + break; + } + } + + if (!list_empty(&local_queue)) { + lnet_net_lock(i); + list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); + lnet_net_unlock(i); + } + } +} + +static void +lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) +{ + struct lnet_msg *msg; + + while (!list_empty(resendq)) { + struct lnet_peer_ni *lpni; + + msg = list_entry(resendq->next, struct lnet_msg, + msg_list); + + list_del_init(&msg->msg_list); + + lpni = lnet_find_peer_ni_locked(msg->msg_hdr.dest_nid); + if (!lpni) { + lnet_net_unlock(cpt); + CERROR("Expected that a peer is already created for %s\n", + libcfs_nid2str(msg->msg_hdr.dest_nid)); + msg->msg_no_resend = true; + lnet_finalize(msg, -EFAULT); + lnet_net_lock(cpt); + } else { + struct lnet_peer *peer; + int rc; + lnet_nid_t src_nid = LNET_NID_ANY; + + /* + * if this message is not being routed and the + * peer is non-MR then we must use the same + * src_nid that was used in the original send. + * Otherwise if we're routing the message (IE + * we're a router) then we can use any of our + * local interfaces. It doesn't matter to the + * final destination. + */ + peer = lpni->lpni_peer_net->lpn_peer; + if (!msg->msg_routing && + !lnet_peer_is_multi_rail(peer)) + src_nid = le64_to_cpu(msg->msg_hdr.src_nid); + + /* + * If we originally specified a src NID, then we + * must attempt to reuse it in the resend as well. 
+ */ + if (msg->msg_src_nid_param != LNET_NID_ANY) + src_nid = msg->msg_src_nid_param; + lnet_peer_ni_decref_locked(lpni); + + lnet_net_unlock(cpt); + CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", + libcfs_nid2str(src_nid), + libcfs_id2str(msg->msg_target), + lnet_msgtyp2str(msg->msg_type), + msg->msg_recovery, + msg->msg_retry_count); + rc = lnet_send(src_nid, msg, LNET_NID_ANY); + if (rc) { + CERROR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + lnet_net_lock(cpt); + if (!rc) + the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++; + } + } +} + +static void +lnet_resend_pending_msgs(void) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + lnet_resend_pending_msgs_locked(the_lnet.ln_mt_resendqs[i], i); + lnet_net_unlock(i); + } +} + +/* called with cpt and ni_lock held */ +static void +lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING || + force) { + recovery_mdh = ni->ni_ping_mdh; + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + } + lnet_ni_unlock(ni); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + lnet_ni_lock(ni); +} + +static void +lnet_recover_local_nis(void) +{ + struct lnet_mt_event_info *ev_info; + struct list_head processed_list; + struct list_head local_queue; + struct lnet_handle_md mdh; + struct lnet_ni *tmp; + struct lnet_ni *ni; + lnet_nid_t nid; + int healthv; + int rc; + + INIT_LIST_HEAD(&local_queue); + INIT_LIST_HEAD(&processed_list); + + /* + * splice the recovery queue on a local queue. We will iterate + * through the local queue and update it as needed. Once we're + * done with the traversal, we'll splice the local queue back on + * the head of the ln_mt_localNIRecovq. Any newly added local NIs + * will be traversed in the next iteration. + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_localNIRecovq, + &local_queue); + lnet_net_unlock(0); + + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { + /* + * if an NI is being deleted or it is now healthy, there + * is no need to keep it around in the recovery queue. + * The monitor thread is the only thread responsible for + * removing the NI from the recovery queue. + * Multiple threads can be adding NIs to the recovery + * queue. + */ + healthv = atomic_read(&ni->ni_healthv); + + lnet_net_lock(0); + lnet_ni_lock(ni); + if (ni->ni_state != LNET_NI_STATE_ACTIVE || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&ni->ni_recovery); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + continue; + } + + /* + * if the local NI failed recovery we must unlink the md. + * But we want to keep the local_ni on the recovery queue + * so we can continue the attempts to recover it. 
+ */ + if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) { + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; + } + + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + + CDEBUG(D_NET, "attempting to recover local ni: %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_lock(ni); + if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { + ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nid2str(ni->ni_nid)); + lnet_ni_lock(ni); + ni->ni_recovery_state &= + ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + continue; + } + + mdh = ni->ni_ping_mdh; + /* + * Invalidate the ni mdh in case it's deleted. + * We'll unlink the mdh in this case below. + */ + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + nid = ni->ni_nid; + + /* + * remove the NI from the local queue and drop the + * reference count to it while we're recovering + * it. The reason for that, is that the NI could + * be deleted, and the way the code is structured + * is if we don't drop the NI, then the deletion + * code will enter a loop waiting for the + * reference count to be removed while holding the + * ln_mutex_lock(). When we look up the peer to + * send to in lnet_select_pathway() we will try to + * lock the ln_mutex_lock() as well, leading to + * a deadlock. By dropping the refcount and + * removing it from the list, we allow for the NI + * to be removed, then we use the cached NID to + * look it up again. If it's gone, then we just + * continue examining the rest of the queue. + */ + lnet_net_lock(0); + list_del_init(&ni->ni_recovery); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_LOCAL_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_eqh, true); + /* lookup the nid again */ + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (!ni) { + /* + * the NI has been deleted when we dropped + * the ref count + */ + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + /* + * Same note as in lnet_recover_peer_nis(). When + * we're sending the ping, the NI is free to be + * deleted or manipulated. By this point it + * could've been added back on the recovery queue, + * and a refcount taken on it. + * So we can't just add it blindly again or we'll + * corrupt the queue. We must check under lock if + * it's not on any list and if not then add it + * to the processed list, which will eventually be + * spliced back on to the recovery queue. + */ + ni->ni_ping_mdh = mdh; + if (list_empty(&ni->ni_recovery)) { + list_add_tail(&ni->ni_recovery, &processed_list); + lnet_ni_addref_locked(ni, 0); + } + lnet_net_unlock(0); + + lnet_ni_lock(ni); + if (rc) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + } + lnet_ni_unlock(ni); + } + + /* + * put back the remaining NIs on the ln_mt_localNIRecovq to be + * reexamined in the next iteration. 
+ */ + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_resendqs_create(void) +{ + struct list_head **resendqs; + resendqs = lnet_create_array_of_queues(); + + if (!resendqs) + return -ENOMEM; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_resendqs = resendqs; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +static void +lnet_clean_local_ni_recoveryq(void) +{ + struct lnet_ni *ni; + + /* This is only called when the monitor thread has stopped */ + lnet_net_lock(0); + + while (!list_empty(&the_lnet.ln_mt_localNIRecovq)) { + ni = list_entry(the_lnet.ln_mt_localNIRecovq.next, + struct lnet_ni, ni_recovery); + list_del_init(&ni->ni_recovery); + lnet_ni_lock(ni); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + } + + lnet_net_unlock(0); +} + +static void +lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, + bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { + recovery_mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + } + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + spin_lock(&lpni->lpni_lock); +} + +static void +lnet_clean_peer_ni_recoveryq(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_mt_peerNIRecovq, + lpni_recovery) { + list_del_init(&lpni->lpni_recovery); + spin_lock(&lpni->lpni_lock); + lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_clean_resendqs(void) +{ + struct lnet_msg *msg, *tmp; + struct list_head msgs; + int i; + + INIT_LIST_HEAD(&msgs); + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); + lnet_net_unlock(i); + list_for_each_entry_safe(msg, tmp, &msgs, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ESHUTDOWN); + } + } + + cfs_percpt_free(the_lnet.ln_mt_resendqs); +} + +static void +lnet_recover_peer_nis(void) +{ + struct lnet_mt_event_info *ev_info; + struct list_head processed_list; + struct list_head local_queue; + struct lnet_handle_md mdh; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *tmp; + lnet_nid_t nid; + int healthv; + int rc; + + INIT_LIST_HEAD(&local_queue); + INIT_LIST_HEAD(&processed_list); + + /* + * Always use cpt 0 for locking across all interactions with + * ln_mt_peerNIRecovq + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_peerNIRecovq, + &local_queue); + lnet_net_unlock(0); + + list_for_each_entry_safe(lpni, tmp, &local_queue, + lpni_recovery) { + /* + * The same protection strategy is used here as is in the + * local recovery case. 
+ */ + lnet_net_lock(0); + healthv = atomic_read(&lpni->lpni_healthv); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_DELETING || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&lpni->lpni_recovery); + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + continue; + } + + /* + * If the peer NI has failed recovery we must unlink the + * md. But we want to keep the peer ni on the recovery + * queue so we can try to continue recovering it + */ + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; + } + + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(0); -send: - /* Shortcut for loopback. */ - if (best_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_ni_addref_locked(best_ni, cpt); - msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); - msg->msg_target.nid = best_ni->ni_nid; - lnet_msg_commit(msg, cpt); - msg->msg_txni = best_ni; - lnet_net_unlock(cpt); + /* + * NOTE: we're racing with peer deletion from user space. + * It's possible that a peer is deleted after we check its + * state. In this case the recovery can create a new peer + */ + spin_lock(&lpni->lpni_lock); + if (!(lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) && + !(lpni->lpni_state & LNET_PEER_NI_DELETING)) { + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nid2str(lpni->lpni_nid)); + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + continue; + } + + /* look at the comments in lnet_recover_local_nis() */ + mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + nid = lpni->lpni_nid; + lnet_net_lock(0); + list_del_init(&lpni->lpni_recovery); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_PEER_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_eqh, true); + lnet_net_lock(0); + /* + * lnet_find_peer_ni_locked() grabs a refcount for + * us. No need to take it explicitly. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } - return LNET_CREDIT_OK; + lpni->lpni_recovery_ping_mdh = mdh; + /* + * While we're unlocked the lpni could've been + * readded on the recovery queue. In this case we + * don't need to add it to the local queue, since + * it's already on there and the thread that added + * it would've incremented the refcount on the + * peer, which means we need to decref the refcount + * that was implicitly grabbed by find_peer_ni_locked. + * Otherwise, if the lpni is still not on + * the recovery queue, then we'll add it to the + * processed list. 
+ */ + if (list_empty(&lpni->lpni_recovery)) + list_add_tail(&lpni->lpni_recovery, &processed_list); + else + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + spin_lock(&lpni->lpni_lock); + if (rc) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + } + spin_unlock(&lpni->lpni_lock); } - routing = routing || routing2; + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq); + lnet_net_unlock(0); +} - /* - * Increment sequence number of the peer selected so that we - * pick the next one in Round Robin. - */ - best_lpni->lpni_seq++; +static int +lnet_monitor_thread(void *arg) +{ + time64_t recovery_timeout = 0; + time64_t rsp_timeout = 0; + int interval; + time64_t now; /* - * grab a reference on the peer_ni so it sticks around even if - * we need to drop and relock the lnet_net_lock below. + * The monitor thread takes care of the following: + * 1. Checks the aliveness of routers + * 2. Checks if there are messages on the resend queue to resend + * them. + * 3. Check if there are any NIs on the local recovery queue and + * pings them + * 4. Checks if there are any NIs on the remote recovery queue + * and pings them. */ - lnet_peer_ni_addref_locked(best_lpni); + cfs_block_allsigs(); - /* - * Use lnet_cpt_of_nid() to determine the CPT used to commit the - * message. This ensures that we get a CPT that is correct for - * the NI when the NI has been restricted to a subset of all CPTs. - * If the selected CPT differs from the one currently locked, we - * must unlock and relock the lnet_net_lock(), and then check whether - * the configuration has changed. We don't have a hold on the best_ni - * yet, and it may have vanished. - */ - cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); - if (cpt != cpt2) { - lnet_net_unlock(cpt); - cpt = cpt2; - lnet_net_lock(cpt); - if (seq != lnet_get_dlc_seq_locked()) { - lnet_peer_ni_decref_locked(best_lpni); - goto again; + while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { + now = ktime_get_real_seconds(); + + if (lnet_router_checker_active()) + lnet_check_routers(); + + lnet_resend_pending_msgs(); + + if (now >= rsp_timeout) { + lnet_finalize_expired_responses(); + rsp_timeout = now + (lnet_transaction_timeout / 2); } + + if (now >= recovery_timeout) { + lnet_recover_local_nis(); + lnet_recover_peer_nis(); + recovery_timeout = now + lnet_recovery_interval; + } + + /* + * TODO do we need to check if we should sleep without + * timeout? Technically, an active system will always + * have messages in flight so this check will always + * evaluate to false. And on an idle system do we care + * if we wake up every 1 second? Although, we've seen + * cases where we get a complaint that an idle thread + * is waking up unnecessarily. + */ + interval = min(lnet_recovery_interval, + lnet_transaction_timeout / 2); + wait_event_interruptible_timeout(the_lnet.ln_mt_waitq, + false, + cfs_time_seconds(interval)); } - /* - * store the best_lpni in the message right away to avoid having - * to do the same operation under different conditions - */ - msg->msg_txpeer = best_lpni; - msg->msg_txni = best_ni; + /* clean up the router checker */ + lnet_prune_rc_data(1); - /* - * grab a reference for the best_ni since now it's in use in this - * send. 
the reference will need to be dropped when the message is - * finished in lnet_finalize() - */ - lnet_ni_addref_locked(msg->msg_txni, cpt); + /* Shutting down */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); - /* - * Always set the target.nid to the best peer picked. Either the - * nid will be one of the preconfigured NIDs, or the same NID as - * what was originally set in the target or it will be the NID of - * a router if this message should be routed - */ - msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + /* signal that the monitor thread is exiting */ + up(&the_lnet.ln_mt_signal); - /* - * lnet_msg_commit assigns the correct cpt to the message, which - * is used to decrement the correct refcount on the ni when it's - * time to return the credits - */ - lnet_msg_commit(msg, cpt); + return 0; +} - /* - * If we are routing the message then we don't need to overwrite - * the src_nid since it would've been set at the origin. Otherwise - * we are the originator so we need to set it. - */ - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); +/* + * lnet_send_ping + * Sends a ping. + * Returns == 0 if success + * Returns > 0 if LNetMDBind or prior fails + * Returns < 0 if LNetGet fails + */ +int +lnet_send_ping(lnet_nid_t dest_nid, + struct lnet_handle_md *mdh, int nnis, + void *user_data, struct lnet_handle_eq eqh, bool recovery) +{ + struct lnet_md md = { NULL }; + struct lnet_process_id id; + struct lnet_ping_buffer *pbuf; + int rc; - if (routing) { - msg->msg_target_is_router = 1; - msg->msg_target.pid = LNET_PID_LUSTRE; + if (dest_nid == LNET_NID_ANY) { + rc = -EHOSTUNREACH; + goto fail_error; + } + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = ENOMEM; + goto fail_error; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = user_data; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind MD: %d\n", rc); + rc = -rc; /* change the rc to positive */ + goto fail_error; + } + id.pid = LNET_PID_LUSTRE; + id.nid = dest_nid; + + rc = LNetGet(LNET_NID_ANY, *mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, recovery); + + if (rc) + goto fail_unlink_md; + + return 0; + +fail_unlink_md: + LNetMDUnlink(*mdh); + LNetInvalidateMDHandle(mdh); +fail_error: + return rc; +} + +static void +lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, + int status, bool unlink_event) +{ + lnet_nid_t nid = ev_info->mt_nid; + + if (ev_info->mt_type == MT_TYPE_LOCAL_NI) { + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (!ni) { + lnet_net_unlock(0); + return; + } + lnet_ni_lock(ni); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (status) + ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + if (status != 0) { + CERROR("local NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); + return; + } /* - * since we're routing we want to ensure that the - * msg_hdr.dest_nid is set to the final destination. When - * the router receives this message it knows how to route - * it. + * need to increment healthv for the ni here, because in + * the lnet_finalize() path we don't have access to this + * NI. 
And in order to get access to it, we'll need to + * carry forward too much information. + * In the peer case, it'll naturally be incremented */ - msg->msg_hdr.dest_nid = - cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid); + if (!unlink_event) + lnet_inc_healthv(&ni->ni_healthv); } else { - /* - * if we're not routing set the dest_nid to the best peer - * ni that we picked earlier in the algorithm. - */ - msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + struct lnet_peer_ni *lpni; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(cpt); + return; + } + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (status) + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + if (status != 0) + CERROR("peer NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); } +} - rc = lnet_post_send_locked(msg, 0); +void +lnet_mt_event_handler(struct lnet_event *event) +{ + struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_ping_buffer *pbuf; + + /* TODO: remove assert */ + LASSERT(event->type == LNET_EVENT_REPLY || + event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_UNLINK); + + CDEBUG(D_NET, "Received event: %d status: %d\n", event->type, + event->status); + + switch (event->type) { + case LNET_EVENT_UNLINK: + CDEBUG(D_NET, "%s recovery ping unlinked\n", + libcfs_nid2str(ev_info->mt_nid)); + /* fallthrough */ + case LNET_EVENT_REPLY: + lnet_handle_recovery_reply(ev_info, event->status, + event->type == LNET_EVENT_UNLINK); + break; + case LNET_EVENT_SEND: + CDEBUG(D_NET, "%s recovery message sent %s:%d\n", + libcfs_nid2str(ev_info->mt_nid), + (event->status) ? 
"unsuccessfully" : + "successfully", event->status); + break; + default: + CERROR("Unexpected event: %d\n", event->type); + break; + } + if (event->unlinked) { + LIBCFS_FREE(ev_info, sizeof(*ev_info)); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + lnet_ping_buffer_decref(pbuf); + } +} - if (!rc) - CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n", - libcfs_nid2str(msg->msg_hdr.src_nid), - libcfs_nid2str(msg->msg_txni->ni_nid), - libcfs_nid2str(src_nid), - libcfs_nid2str(msg->msg_hdr.dest_nid), - libcfs_nid2str(dst_nid), - libcfs_nid2str(msg->msg_txpeer->lpni_nid), - lnet_msgtyp2str(msg->msg_type)); +static int +lnet_rsp_tracker_create(void) +{ + struct list_head **rstqs; + rstqs = lnet_create_array_of_queues(); - lnet_net_unlock(cpt); + if (!rstqs) + return -ENOMEM; - return rc; + the_lnet.ln_mt_rstq = rstqs; + + return 0; } -int -lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) +static void +lnet_rsp_tracker_clean(void) { - lnet_nid_t dst_nid = msg->msg_target.nid; - int rc; + lnet_finalize_expired_responses(); - /* - * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, - * but we might want to use pre-determined router for ACK/REPLY - * in the future - */ - /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ - LASSERT (msg->msg_txpeer == NULL); - LASSERT (!msg->msg_sending); - LASSERT (!msg->msg_target_is_router); - LASSERT (!msg->msg_receiving); + cfs_percpt_free(the_lnet.ln_mt_rstq); + the_lnet.ln_mt_rstq = NULL; +} - msg->msg_sending = 1; +int lnet_monitor_thr_start(void) +{ + int rc = 0; + struct task_struct *task; - LASSERT(!msg->msg_tx_committed); + if (the_lnet.ln_mt_state != LNET_MT_STATE_SHUTDOWN) + return -EALREADY; - rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); - if (rc < 0) + rc = lnet_resendqs_create(); + if (rc) return rc; - if (rc == LNET_CREDIT_OK) - lnet_ni_send(msg->msg_txni, msg); + rc = lnet_rsp_tracker_create(); + if (rc) + goto clean_queues; + + /* Pre monitor thread start processing */ + rc = lnet_router_pre_mt_start(); + if (rc) + goto free_mem; + + sema_init(&the_lnet.ln_mt_signal, 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start monitor thread: %d\n", rc); + goto clean_thread; + } + + /* post monitor thread start processing */ + lnet_router_post_mt_start(); - /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ return 0; + +clean_thread: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + /* block until event callback signals exit */ + down(&the_lnet.ln_mt_signal); + /* clean up */ + lnet_router_cleanup(); +free_mem: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + return rc; +clean_queues: + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + return rc; +} + +void lnet_monitor_thr_stop(void) +{ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + /* tell the 
monitor thread that we're shutting down */ + wake_up(&the_lnet.ln_mt_waitq); + + /* block until monitor thread signals that it's done */ + down(&the_lnet.ln_mt_signal); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); + + /* perform cleanup tasks */ + lnet_router_cleanup(); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + + return; } void -lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob) +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, + __u32 msg_type) { lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob; lnet_net_unlock(cpt); lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); @@ -2128,13 +3917,13 @@ lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) static int lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) { - void *private = msg->msg_private; - struct lnet_hdr *hdr = &msg->msg_hdr; + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int rlength; - int mlength; - int cpt; + struct lnet_libmd *md; + int rlength; + int mlength; + int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); lnet_res_lock(cpt); @@ -2195,10 +3984,10 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) static int lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) { - struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int cpt; + struct lnet_libmd *md; + int cpt; src.nid = hdr->src_nid; src.pid = hdr->src_pid; @@ -2405,11 +4194,12 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, for_me = (ni->ni_nid == dest_nid); cpt = lnet_cpt_of_nid(from_nid, ni); - CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s\n", + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", libcfs_nid2str(dest_nid), libcfs_nid2str(ni->ni_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type)); + lnet_msgtyp2str(type), + (for_me) ? 
"for me" : "routed"); switch (type) { case LNET_MSG_ACK: @@ -2446,10 +4236,10 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (the_lnet.ln_routing && - ni->ni_last_alive != cfs_time_current_sec()) { + ni->ni_last_alive != ktime_get_real_seconds()) { /* NB: so far here is the only place to set NI status to "up */ lnet_ni_lock(ni); - ni->ni_last_alive = cfs_time_current_sec(); + ni->ni_last_alive = ktime_get_real_seconds(); if (ni->ni_status != NULL && ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) ni->ni_status->ns_status = LNET_NI_STATUS_UP; @@ -2513,7 +4303,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (!list_empty(&the_lnet.ln_drop_rules) && - lnet_drop_rule_match(hdr)) { + lnet_drop_rule_match(hdr, NULL)) { CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" "silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), @@ -2521,6 +4311,52 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } + if (lnet_drop_asym_route && for_me && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + struct lnet_net *net; + struct lnet_remotenet *rnet; + bool found = true; + + /* we are dealing with a routed message, + * so see if route to reach src_nid goes through from_nid + */ + lnet_net_lock(cpt); + net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); + if (!net) { + lnet_net_unlock(cpt); + CERROR("net %s not found\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + return -EPROTO; + } + + rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); + if (rnet) { + struct lnet_peer_ni *gw = NULL; + struct lnet_route *route; + + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + found = false; + gw = route->lr_gateway; + if (gw->lpni_net != net) + continue; + if (gw->lpni_nid == from_nid) { + found = true; + break; + } + } + } + lnet_net_unlock(cpt); + if (!found) { + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); + goto drop; + } + } msg = lnet_msg_alloc(); if (msg == NULL) { @@ -2558,7 +4394,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } lnet_net_lock(cpt); - lpni = lnet_nid2peerni_locked(from_nid, cpt); + lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); CERROR("%s, src %s: Dropping %s " @@ -2625,7 +4461,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, lnet_finalize(msg, rc); drop: - lnet_drop_message(ni, cpt, private, payload_length); + lnet_drop_message(ni, cpt, private, payload_length, type); return 0; } EXPORT_SYMBOL(lnet_parse); @@ -2661,7 +4497,10 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason) * until that's done */ lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, - msg->msg_private, msg->msg_len); + msg->msg_private, msg->msg_len, + msg->msg_type); + + msg->msg_no_resend = true; /* * NB: message will not generate event because w/o attached MD, * but we still should give error code so lnet_msg_decommit() @@ -2704,6 +4543,54 @@ lnet_recv_delayed_msg_list(struct list_head *head) } } +static void +lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, + struct lnet_libmd *md, struct lnet_handle_md mdh) +{ + s64 timeout_ns; + bool new_entry = true; + struct lnet_rsp_tracker *local_rspt; + + /* + * MD has a refcount taken by 
message so it's not going away. + * The MD however can be looked up. We need to secure the access + * to the md_rspt_ptr by taking the res_lock. + * The rspt can be accessed without protection up to when it gets + * added to the list. + */ + + lnet_res_lock(cpt); + local_rspt = md->md_rspt_ptr; + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + if (local_rspt != NULL) { + /* + * we already have an rspt attached to the md, so we'll + * update the deadline on that one. + */ + LIBCFS_FREE(rspt, sizeof(*rspt)); + new_entry = false; + } else { + /* new md */ + rspt->rspt_mdh = mdh; + rspt->rspt_cpt = cpt; + /* store the rspt so we can access it when we get the REPLY */ + md->md_rspt_ptr = rspt; + local_rspt = rspt; + } + local_rspt->rspt_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* + * add to the list of tracked responses. It's added to tail of the + * list in order to expire all the older entries first. + */ + lnet_net_lock(cpt); + if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) + list_del_init(&local_rspt->rspt_on_list); + list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + lnet_net_unlock(cpt); + lnet_res_unlock(cpt); +} + /** * Initiate an asynchronous PUT operation. * @@ -2754,10 +4641,11 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, __u64 match_bits, unsigned int offset, __u64 hdr_data) { - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + struct lnet_rsp_tracker *rspt = NULL; LASSERT(the_lnet.ln_refcount > 0); @@ -2777,6 +4665,17 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, msg->msg_vmflush = !!memory_pressure_get(); cpt = lnet_cpt_of_cookie(mdh.cookie); + + if (ack == LNET_ACK_REQ) { + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping PUT to %s: ENOMEM on response tracker\n", + libcfs_id2str(target)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + } + lnet_res_lock(cpt); md = lnet_handle2md(&mdh); @@ -2789,6 +4688,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); + LIBCFS_FREE(rspt, sizeof(*rspt)); lnet_msg_free(msg); return -ENOENT; } @@ -2821,10 +4721,14 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, lnet_build_msg_event(msg, LNET_EVENT_SEND); + if (ack == LNET_ACK_REQ) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + rc = lnet_send(self, msg, LNET_NID_ANY); if (rc != 0) { CNETERR("Error sending PUT to %s: %d\n", libcfs_id2str(target), rc); + msg->msg_no_resend = true; lnet_finalize(msg, rc); } @@ -2901,8 +4805,10 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) cpt = lnet_cpt_of_nid(peer_id.nid, ni); lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + getmd->md_length; lnet_net_unlock(cpt); if (msg != NULL) @@ -2953,12 +4859,13 @@ EXPORT_SYMBOL(lnet_set_reply_msg_len); int LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, struct lnet_process_id target, unsigned int portal, - __u64 match_bits, unsigned int offset) + __u64 match_bits, unsigned int offset, bool recovery) { - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; + struct lnet_msg *msg; + struct 
lnet_libmd *md; + struct lnet_rsp_tracker *rspt; + int cpt; + int rc; LASSERT(the_lnet.ln_refcount > 0); @@ -2971,13 +4878,24 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, } msg = lnet_msg_alloc(); - if (msg == NULL) { + if (!msg) { CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", libcfs_id2str(target)); return -ENOMEM; } cpt = lnet_cpt_of_cookie(mdh.cookie); + + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping GET to %s: ENOMEM on response tracker\n", + libcfs_id2str(target)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + + msg->msg_recovery = recovery; + lnet_res_lock(cpt); md = lnet_handle2md(&mdh); @@ -2992,6 +4910,7 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_res_unlock(cpt); lnet_msg_free(msg); + LIBCFS_FREE(rspt, sizeof(*rspt)); return -ENOENT; } @@ -3016,10 +4935,13 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + rc = lnet_send(self, msg, LNET_NID_ANY); if (rc < 0) { CNETERR("Error sending GET to %s: %d\n", libcfs_id2str(target), rc); + msg->msg_no_resend = true; lnet_finalize(msg, rc); } @@ -3045,14 +4967,14 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -3068,7 +4990,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + if (dstnid == LNET_NID_LO_0) *orderp = 0; else *orderp = 1; @@ -3083,9 +5005,9 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) * current net namespace. * If not, assign order above 0xffff0000, * to make this ni not a priority. */ - if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) - order += 0xffff0000; - + if (current->nsproxy && + !net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; if (srcnidp != NULL) *srcnidp = ni->ni_nid; if (orderp != NULL) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c index 1b90855375a20..959c370d2d4da 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -65,6 +65,7 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) LASSERT(!msg->msg_routing); ev->type = ev_type; + ev->msg_type = msg->msg_type; if (ev_type == LNET_EVENT_SEND) { /* event for active message */ @@ -75,7 +76,6 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) ev->source.nid = LNET_NID_ANY; ev->source.pid = the_lnet.ln_pid; ev->sender = LNET_NID_ANY; - } else { /* event for passive message */ ev->target.pid = hdr->dest_pid; @@ -142,14 +142,18 @@ void lnet_msg_commit(struct lnet_msg *msg, int cpt) { struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; - struct lnet_counters *counters = the_lnet.ln_counters[cpt]; + struct lnet_counters_common *common; + s64 timeout_ns; + + /* set the message deadline */ + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); /* routed message can be committed for both receiving and sending */ LASSERT(!msg->msg_tx_committed); if (msg->msg_sending) { LASSERT(!msg->msg_receiving); - msg->msg_tx_cpt = cpt; msg->msg_tx_committed = 1; if (msg->msg_rx_committed) { /* routed message REPLY */ @@ -163,33 +167,35 @@ lnet_msg_commit(struct lnet_msg *msg, int cpt) } LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; - list_add(&msg->msg_activelist, &container->msc_active); + list_add_tail(&msg->msg_activelist, &container->msc_active); - counters->msgs_alloc++; - if (counters->msgs_alloc > counters->msgs_max) - counters->msgs_max = counters->msgs_alloc; + common = &the_lnet.ln_counters[cpt]->lct_common; + common->lcc_msgs_alloc++; + if (common->lcc_msgs_alloc > common->lcc_msgs_max) + common->lcc_msgs_max = common->lcc_msgs_alloc; } static void lnet_msg_decommit_tx(struct lnet_msg *msg, int status) { - struct lnet_counters *counters; + struct lnet_counters_common *common; struct lnet_event *ev = &msg->msg_ev; LASSERT(msg->msg_tx_committed); if (status != 0) goto out; - counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common); switch (ev->type) { default: /* routed message */ LASSERT(msg->msg_routing); LASSERT(msg->msg_rx_committed); LASSERT(ev->type == 0); - counters->route_length += msg->msg_len; - counters->route_count++; + common->lcc_route_length += msg->msg_len; + common->lcc_route_count++; goto incr_stats; case LNET_EVENT_PUT: @@ -203,7 +209,7 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) case LNET_EVENT_SEND: LASSERT(!msg->msg_rx_committed); if (msg->msg_type == LNET_MSG_PUT) - counters->send_length += msg->msg_len; + common->lcc_send_length += msg->msg_len; break; case LNET_EVENT_GET: @@ -215,13 +221,17 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) break; } - counters->send_count++; + common->lcc_send_count++; incr_stats: if (msg->msg_txpeer) - atomic_inc(&msg->msg_txpeer->lpni_stats.send_count); + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); if (msg->msg_txni) - atomic_inc(&msg->msg_txni->ni_stats.send_count); + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); out: lnet_return_tx_credits_locked(msg); msg->msg_tx_committed = 0; @@ -230,7 +240,7 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) static void lnet_msg_decommit_rx(struct lnet_msg *msg, int status) { - struct lnet_counters *counters; + struct lnet_counters_common *common; struct lnet_event *ev = &msg->msg_ev; 
LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ @@ -239,7 +249,7 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) if (status != 0) goto out; - counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common); switch (ev->type) { default: LASSERT(ev->type == 0); @@ -257,7 +267,7 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ LASSERT(msg->msg_type == LNET_MSG_REPLY || msg->msg_type == LNET_MSG_GET); - counters->send_length += msg->msg_wanted; + common->lcc_send_length += msg->msg_wanted; break; case LNET_EVENT_PUT: @@ -272,15 +282,19 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) break; } - counters->recv_count++; + common->lcc_recv_count++; incr_stats: if (msg->msg_rxpeer) - atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count); + lnet_incr_stats(&msg->msg_rxpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); if (msg->msg_rxni) - atomic_inc(&msg->msg_rxni->ni_stats.recv_count); + lnet_incr_stats(&msg->msg_rxni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) - counters->recv_length += msg->msg_wanted; + common->lcc_recv_length += msg->msg_wanted; out: lnet_return_rx_credits_locked(msg); @@ -313,7 +327,7 @@ lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) list_del(&msg->msg_activelist); msg->msg_onactivelist = 0; - the_lnet.ln_counters[cpt2]->msgs_alloc--; + the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--; if (cpt2 != cpt) { lnet_net_unlock(cpt2); @@ -349,29 +363,6 @@ lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, lnet_md_deconstruct(md, &msg->msg_ev.md); } -void -lnet_msg_detach_md(struct lnet_msg *msg, int status) -{ - struct lnet_libmd *md = msg->msg_md; - int unlink; - - /* Now it's safe to drop my caller's ref */ - md->md_refcount--; - LASSERT(md->md_refcount >= 0); - - unlink = lnet_md_unlinkable(md); - if (md->md_eq != NULL) { - msg->msg_ev.status = status; - msg->msg_ev.unlinked = unlink; - lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); - } - - if (unlink) - lnet_md_unlink(md); - - msg->msg_md = NULL; -} - static int lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) { @@ -448,14 +439,549 @@ lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) return 0; } +static void +lnet_dec_healthv_locked(atomic_t *healthv) +{ + int h = atomic_read(healthv); + + if (h < lnet_health_sensitivity) { + atomic_set(healthv, 0); + } else { + h -= lnet_health_sensitivity; + atomic_set(healthv, h); + } +} + +static void +lnet_handle_local_failure(struct lnet_msg *msg) +{ + struct lnet_ni *local_ni; + + local_ni = msg->msg_txni; + + /* + * the lnet_net_lock(0) is used to protect the addref on the ni + * and the recovery queue. + */ + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + + lnet_dec_healthv_locked(&local_ni->ni_healthv); + /* + * add the NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. In this case, there is no reason to + * invoke recovery + */ + if (list_empty(&local_ni->ni_recovery) && + atomic_read(&local_ni->ni_healthv) < LNET_MAX_HEALTH_VALUE) { + CDEBUG(D_NET, "ni %s added to recovery queue. 
Health = %d\n", + libcfs_nid2str(local_ni->ni_nid), + atomic_read(&local_ni->ni_healthv)); + list_add_tail(&local_ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(local_ni, 0); + } + lnet_net_unlock(0); +} + +void +lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_dec_healthv_locked(&lpni->lpni_healthv); + /* + * add the peer NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. In this case, there is no reason to + * invoke recovery + */ + lnet_peer_ni_add_to_recoveryq_locked(lpni); +} + +static void +lnet_handle_remote_failure(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + lnet_handle_remote_failure_locked(lpni); + lnet_net_unlock(0); +} + +static void +lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus) +{ + struct lnet_ni *ni = msg->msg_txni; + struct lnet_peer_ni *lpni = msg->msg_txpeer; + struct lnet_counters_health *health; + + health = &the_lnet.ln_counters[0]->lct_health; + + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + atomic_inc(&ni->ni_hstats.hlt_local_interrupt); + health->lch_local_interrupt_count++; + break; + case LNET_MSG_STATUS_LOCAL_DROPPED: + atomic_inc(&ni->ni_hstats.hlt_local_dropped); + health->lch_local_dropped_count++; + break; + case LNET_MSG_STATUS_LOCAL_ABORTED: + atomic_inc(&ni->ni_hstats.hlt_local_aborted); + health->lch_local_aborted_count++; + break; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + atomic_inc(&ni->ni_hstats.hlt_local_no_route); + health->lch_local_no_route_count++; + break; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + atomic_inc(&ni->ni_hstats.hlt_local_timeout); + health->lch_local_timeout_count++; + break; + case LNET_MSG_STATUS_LOCAL_ERROR: + atomic_inc(&ni->ni_hstats.hlt_local_error); + health->lch_local_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_DROPPED: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped); + health->lch_remote_dropped_count++; + break; + case LNET_MSG_STATUS_REMOTE_ERROR: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_error); + health->lch_remote_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout); + health->lch_remote_timeout_count++; + break; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); + health->lch_network_timeout_count++; + break; + case LNET_MSG_STATUS_OK: + break; + default: + LBUG(); + } +} + +static void +lnet_resend_msg_locked(struct lnet_msg *msg) +{ + msg->msg_retry_count++; + + /* + * remove message from the active list and reset it to prepare + * for a resend. Two exceptions to this + * + * 1. the router case. When a message is being routed it is + * committed for rx when received and committed for tx when + * forwarded. We don't want to remove it from the active list, since + * code which handles receiving expects it to remain on the active + * list. + * + * 2. The REPLY case. Reply messages use the same message + * structure for the GET that was received. 
+ */ + if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) { + list_del_init(&msg->msg_activelist); + msg->msg_onactivelist = 0; + } + /* + * The msg_target.nid which was originally set + * when calling LNetGet() or LNetPut() might've + * been overwritten if we're routing this message. + * Call lnet_msg_decommit_tx() to return the credit + * this message consumed. The message will + * consume another credit when it gets resent. + */ + msg->msg_target.nid = msg->msg_hdr.dest_nid; + lnet_msg_decommit_tx(msg, -EAGAIN); + msg->msg_sending = 0; + msg->msg_receiving = 0; + msg->msg_target_is_router = 0; + + CDEBUG(D_NET, "%s->%s:%s:%s - queuing msg (%p) for resend\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(msg->msg_health_status), msg); + + list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]); + + wake_up(&the_lnet.ln_mt_waitq); +} + +int +lnet_check_finalize_recursion_locked(struct lnet_msg *msg, + struct list_head *containerq, + int nworkers, void **workers) +{ + int my_slot = -1; + int i; + + list_add_tail(&msg->msg_list, containerq); + + for (i = 0; i < nworkers; i++) { + if (workers[i] == current) + break; + + if (my_slot < 0 && workers[i] == NULL) + my_slot = i; + } + + if (i < nworkers || my_slot < 0) + return -1; + + workers[my_slot] = current; + + return my_slot; +} + +int +lnet_attempt_msg_resend(struct lnet_msg *msg) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + + /* we can only resend tx_committed messages */ + LASSERT(msg->msg_tx_committed); + + /* don't resend recovery messages */ + if (msg->msg_recovery) { + CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* + * if we explicitly indicated we don't want to resend then just + * return + */ + if (msg->msg_no_resend) { + CDEBUG(D_NET, "msg %s->%s requested no resend. retry# %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* check if the message has exceeded the number of retries */ + if (msg->msg_retry_count >= lnet_retry_count) { + CNETERR("msg %s->%s exceeded retry count %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + cpt = msg->msg_tx_cpt; + lnet_net_lock(cpt); + + /* check again under lock */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + container = the_lnet.ln_msg_containers[cpt]; + my_slot = + lnet_check_finalize_recursion_locked(msg, + &container->msc_resending, + container->msc_nfinalizers, + container->msc_resenders); + + /* enough threads are resending */ + if (my_slot == -1) { + lnet_net_unlock(cpt); + return 0; + } + + while (!list_empty(&container->msc_resending)) { + msg = list_entry(container->msc_resending.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* + * resending the message will require us to call + * lnet_msg_decommit_tx() which will return the credit + * which this message holds. This could trigger another + * queued message to be sent. If that message fails and + * requires a resend we will recurse. + * But since at this point the slot is taken, the message + * will be queued in the container and dealt with + * later. This breaks the recursion. 
+ */ + lnet_resend_msg_locked(msg); + } + + /* + * msc_resenders is an array of process pointers. Each entry holds + * a pointer to the current process operating on the message. An + * array entry is created per CPT. If the array slot is already + * set, then it means that there is a thread on the CPT currently + * resending a message. + * Once the thread finishes clear the slot to enable the thread to + * take on more resend work. + */ + container->msc_resenders[my_slot] = NULL; + lnet_net_unlock(cpt); + + return 0; +} + +/* + * Do a health check on the message: + * return -1 if we're not going to handle the error or + * if we've reached the maximum number of retries. + * success case will return -1 as well + * return 0 if it the message is requeued for send + */ +static int +lnet_health_check(struct lnet_msg *msg) +{ + enum lnet_msg_hstatus hstatus = msg->msg_health_status; + bool lo = false; + + /* if we're shutting down no point in handling health. */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return -1; + + LASSERT(msg->msg_txni); + + /* + * if we're sending to the LOLND then the msg_txpeer will not be + * set. So no need to sanity check it. + */ + if (msg->msg_txni->ni_nid != LNET_NID_LO_0) + LASSERT(msg->msg_txpeer); + else + lo = true; + + if (hstatus != LNET_MSG_STATUS_OK && + ktime_compare(ktime_get(), msg->msg_deadline) >= 0) + return -1; + + /* + * stats are only incremented for errors so avoid wasting time + * incrementing statistics if there is no error. + */ + if (hstatus != LNET_MSG_STATUS_OK) { + lnet_net_lock(0); + lnet_incr_hstats(msg, hstatus); + lnet_net_unlock(0); + } + + CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", + libcfs_nid2str(msg->msg_txni->ni_nid), + (lo) ? "self" : libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(hstatus)); + + switch (hstatus) { + case LNET_MSG_STATUS_OK: + lnet_inc_healthv(&msg->msg_txni->ni_healthv); + /* + * It's possible msg_txpeer is NULL in the LOLND + * case. + */ + if (msg->msg_txpeer) + lnet_inc_healthv(&msg->msg_txpeer->lpni_healthv); + + /* we can finalize this message */ + return -1; + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + case LNET_MSG_STATUS_LOCAL_DROPPED: + case LNET_MSG_STATUS_LOCAL_ABORTED: + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + lnet_handle_local_failure(msg); + /* add to the re-send queue */ + return lnet_attempt_msg_resend(msg); + + /* + * These errors will not trigger a resend so simply + * finalize the message + */ + case LNET_MSG_STATUS_LOCAL_ERROR: + lnet_handle_local_failure(msg); + return -1; + + /* + * TODO: since the remote dropped the message we can + * attempt a resend safely. 
+ */ + case LNET_MSG_STATUS_REMOTE_DROPPED: + lnet_handle_remote_failure(msg->msg_txpeer); + return lnet_attempt_msg_resend(msg); + + case LNET_MSG_STATUS_REMOTE_ERROR: + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + lnet_handle_remote_failure(msg->msg_txpeer); + return -1; + default: + LBUG(); + } + + /* no resend is needed */ + return -1; +} + +static void +lnet_msg_detach_md(struct lnet_msg *msg, int cpt, int status) +{ + struct lnet_libmd *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink || (md->md_refcount == 0 && + md->md_threshold == LNET_MD_THRESH_INF)) + lnet_detach_rsp_tracker(md, cpt); + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static bool +lnet_is_health_check(struct lnet_msg *msg) +{ + bool hc; + int status = msg->msg_ev.status; + + if ((!msg->msg_tx_committed && !msg->msg_rx_committed) || + !msg->msg_onactivelist) { + CDEBUG(D_NET, "msg %p not committed for send or receive\n", + msg); + return false; + } + + if ((msg->msg_tx_committed && !msg->msg_txpeer) || + (msg->msg_rx_committed && !msg->msg_rxpeer)) { + CDEBUG(D_NET, "msg %p failed too early to retry and send\n", + msg); + return false; + } + + /* + * perform a health check for any message committed for transmit + */ + hc = msg->msg_tx_committed; + + /* Check for status inconsistencies */ + if (hc && + ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) || + (status && msg->msg_health_status == LNET_MSG_STATUS_OK))) { + CDEBUG(D_NET, "Msg %p is in inconsistent state, don't perform health " + "checking (%d, %d)\n", msg, status, + msg->msg_health_status); + hc = false; + } + + CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n", + hc, status, msg->msg_health_status); + + return hc; +} + +char * +lnet_health_error2str(enum lnet_msg_hstatus hstatus) +{ + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + return "LOCAL_INTERRUPT"; + case LNET_MSG_STATUS_LOCAL_DROPPED: + return "LOCAL_DROPPED"; + case LNET_MSG_STATUS_LOCAL_ABORTED: + return "LOCAL_ABORTED"; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + return "LOCAL_NO_ROUTE"; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + return "LOCAL_TIMEOUT"; + case LNET_MSG_STATUS_LOCAL_ERROR: + return "LOCAL_ERROR"; + case LNET_MSG_STATUS_REMOTE_DROPPED: + return "REMOTE_DROPPED"; + case LNET_MSG_STATUS_REMOTE_ERROR: + return "REMOTE_ERROR"; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + return "REMOTE_TIMEOUT"; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + return "NETWORK_TIMEOUT"; + case LNET_MSG_STATUS_OK: + return "OK"; + default: + return ""; + } +} + +bool +lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus) +{ + if (!msg) + return false; + + if (list_empty(&the_lnet.ln_drop_rules)) + return false; + + /* match only health rules */ + if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus)) + return false; + + CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(*hstatus)); + + return true; +} +EXPORT_SYMBOL(lnet_send_error_simulation); + void lnet_finalize(struct lnet_msg *msg, int status) { - struct lnet_msg_container *container; - int my_slot; - int cpt; - int rc; - 
int i; + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; LASSERT(!in_interrupt()); @@ -464,16 +990,35 @@ lnet_finalize(struct lnet_msg *msg, int status) msg->msg_ev.status = status; + if (lnet_is_health_check(msg)) { + /* + * Check the health status of the message. If it has one + * of the errors that we're supposed to handle, and it has + * not timed out, then + * 1. Decrement the appropriate health_value + * 2. queue the message on the resend queue + + * if the message send is success, timed out or failed in the + * health check for any reason then we'll just finalize the + * message. Otherwise just return since the message has been + * put on the resend queue. + */ + if (!lnet_health_check(msg)) + return; + } + + /* + * We're not going to resend this message so detach its MD and invoke + * the appropriate callbacks + */ if (msg->msg_md != NULL) { cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); - lnet_res_lock(cpt); - lnet_msg_detach_md(msg, status); + lnet_msg_detach_md(msg, cpt, status); lnet_res_unlock(cpt); } - again: - rc = 0; +again: if (!msg->msg_tx_committed && !msg->msg_rx_committed) { /* not committed to network yet */ LASSERT(!msg->msg_onactivelist); @@ -490,32 +1035,26 @@ lnet_finalize(struct lnet_msg *msg, int status) lnet_net_lock(cpt); container = the_lnet.ln_msg_containers[cpt]; - list_add_tail(&msg->msg_list, &container->msc_finalizing); /* Recursion breaker. Don't complete the message here if I am (or * enough other threads are) already completing messages */ + my_slot = lnet_check_finalize_recursion_locked(msg, + &container->msc_finalizing, + container->msc_nfinalizers, + container->msc_finalizers); - my_slot = -1; - for (i = 0; i < container->msc_nfinalizers; i++) { - if (container->msc_finalizers[i] == current) - break; - - if (my_slot < 0 && container->msc_finalizers[i] == NULL) - my_slot = i; - } - - if (i < container->msc_nfinalizers || my_slot < 0) { + /* enough threads are resending */ + if (my_slot == -1) { lnet_net_unlock(cpt); return; } - container->msc_finalizers[my_slot] = current; - + rc = 0; while (!list_empty(&container->msc_finalizing)) { msg = list_entry(container->msc_finalizing.next, struct lnet_msg, msg_list); - list_del(&msg->msg_list); + list_del_init(&msg->msg_list); /* NB drops and regains the lnet lock if it actually does * anything, so my finalizing friends can chomp along too */ @@ -553,7 +1092,7 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) struct lnet_msg, msg_activelist); LASSERT(msg->msg_onactivelist); msg->msg_onactivelist = 0; - list_del(&msg->msg_activelist); + list_del_init(&msg->msg_activelist); lnet_msg_free(msg); count++; } @@ -567,6 +1106,13 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) sizeof(*container->msc_finalizers)); container->msc_finalizers = NULL; } + + if (container->msc_resenders != NULL) { + LIBCFS_FREE(container->msc_resenders, + container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + container->msc_resenders = NULL; + } container->msc_init = 0; } @@ -579,6 +1125,7 @@ lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) INIT_LIST_HEAD(&container->msc_active); INIT_LIST_HEAD(&container->msc_finalizing); + INIT_LIST_HEAD(&container->msc_resending); /* number of CPUs */ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); @@ -595,6 +1142,16 @@ lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) return -ENOMEM; } + LIBCFS_CPT_ALLOC(container->msc_resenders, lnet_cpt_table(), cpt, 
+ container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + + if (container->msc_resenders == NULL) { + CERROR("Failed to allocate message resenders\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + return rc; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c index 3773ed9e2436c..75a352dec6ff8 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index 973587a2a1dc5..ba330c6d2af1c 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, 2016, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,9 +40,9 @@ #include #include +#include #include #include -#include #include /* @@ -66,20 +66,6 @@ #define SO_RCVTIMEO SO_RCVTIMEO_OLD #endif -static int -lnet_sock_create_kern(struct socket **sock, struct net *ns) -{ - int rc; - -#ifdef HAVE_SOCK_CREATE_KERN_USE_NET - rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, sock); -#else - rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, sock); -#endif - - return rc; -} - int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) { @@ -186,13 +172,17 @@ lnet_sock_create(struct socket **sockp, int *fatal, __u32 local_ip, int local_port, struct net *ns) { struct sockaddr_in locaddr; - struct socket *sock; - int rc; + struct socket *sock; + int rc; /* All errors are fatal except bind failure if the port is in use */ *fatal = 1; - rc = lnet_sock_create_kern(&sock, ns); +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); +#endif *sockp = sock; if (rc != 0) { CERROR("Can't create socket: %d\n", rc); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c index eaa06fb41631d..a11ecddb08349 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lo.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -22,6 +22,8 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c index a7190dd79d002..676f7345ca576 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/module.c +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,8 +31,9 @@ */ #define DEBUG_SUBSYSTEM S_LNET + #include -#include +#include static int config_on_load = 0; module_param(config_on_load, int, 0444); @@ -171,36 +172,45 @@ lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) } static int -lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) +lnet_ioctl(struct notifier_block *nb, + unsigned long cmd, void *vdata) { - int rc; + struct libcfs_ioctl_hdr *hdr = vdata; + int rc; switch (cmd) { case IOC_LIBCFS_CONFIGURE: { struct libcfs_ioctl_data *data = (struct libcfs_ioctl_data *)hdr; - if (data->ioc_hdr.ioc_len < sizeof(*data)) - return -EINVAL; - - the_lnet.ln_nis_from_mod_params = data->ioc_flags; - return lnet_configure(NULL); + if (data->ioc_hdr.ioc_len < sizeof(*data)) { + rc = -EINVAL; + } else { + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + rc = lnet_configure(NULL); + } + break; } case IOC_LIBCFS_UNCONFIGURE: - return lnet_unconfigure(); + rc = lnet_unconfigure(); + break; case IOC_LIBCFS_ADD_NET: - return lnet_dyn_configure_net(hdr); + rc = lnet_dyn_configure_net(hdr); + break; case IOC_LIBCFS_DEL_NET: - return lnet_dyn_unconfigure_net(hdr); + rc = lnet_dyn_unconfigure_net(hdr); + break; case IOC_LIBCFS_ADD_LOCAL_NI: - return lnet_dyn_configure_ni(hdr); + rc = lnet_dyn_configure_ni(hdr); + break; case IOC_LIBCFS_DEL_LOCAL_NI: - return lnet_dyn_unconfigure_ni(hdr); + rc = lnet_dyn_unconfigure_ni(hdr); + break; default: /* Passing LNET_PID_ANY only gives me a ref if the net is up @@ -211,11 +221,14 @@ lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) rc = LNetCtl(cmd, hdr); LNetNIFini(); } - return rc; + break; } + return notifier_from_ioctl_errno(rc); } -DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); +static struct notifier_block lnet_ioctl_handler = { + .notifier_call = lnet_ioctl, +}; static int __init lnet_init(void) { @@ -230,7 +243,8 @@ static int __init lnet_init(void) RETURN(rc); } - rc = libcfs_register_ioctl(&lnet_ioctl_handler); + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lnet_ioctl_handler); LASSERT(rc == 0); if (config_on_load) { @@ -246,7 +260,8 @@ static void __exit lnet_exit(void) { int rc; - rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lnet_ioctl_handler); LASSERT(rc == 0); lnet_lib_exit(); diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c index b3d5b907a827b..e2172da009db5 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,7 @@ #define DEBUG_SUBSYSTEM S_LNET #include -#include +#include #define LNET_MSG_MASK (LNET_PUT_BIT | LNET_ACK_BIT | \ LNET_GET_BIT | LNET_REPLY_BIT) @@ -57,9 +57,9 @@ struct lnet_drop_rule { /** * seconds to drop the next message, it's exclusive with dr_drop_at */ - cfs_time_t dr_drop_time; + time64_t dr_drop_time; /** baseline to caculate dr_drop_time */ - cfs_time_t dr_time_base; + time64_t dr_time_base; /** statistic of dropped messages */ struct lnet_fault_stat dr_stat; }; @@ -170,9 +170,9 @@ lnet_drop_rule_add(struct lnet_fault_attr *attr) rule->dr_attr = *attr; if (attr->u.drop.da_interval != 0) { - rule->dr_time_base = cfs_time_shift(attr->u.drop.da_interval); - rule->dr_drop_time = cfs_time_shift(cfs_rand() % - attr->u.drop.da_interval); + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + rule->dr_drop_time = ktime_get_seconds() + + cfs_rand() % attr->u.drop.da_interval; } else { rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; } @@ -283,10 +283,9 @@ lnet_drop_rule_reset(void) if (attr->u.drop.da_rate != 0) { rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; } else { - rule->dr_drop_time = cfs_time_shift(cfs_rand() % - attr->u.drop.da_interval); - rule->dr_time_base = cfs_time_shift(attr->u.drop. - da_interval); + rule->dr_drop_time = ktime_get_seconds() + + cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; } spin_unlock(&rule->dr_lock); } @@ -295,13 +294,58 @@ lnet_drop_rule_reset(void) EXIT; } +static void +lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) +{ + unsigned int random; + int choice; + int delta; + int best_delta; + int i; + + /* assign a random failure */ + random = cfs_rand(); + choice = random % (LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); + if (choice == 0) + choice++; + + if (mask == HSTATUS_RANDOM) { + *hstatus = choice; + return; + } + + if (mask & (1 << choice)) { + *hstatus = choice; + return; + } + + /* round to the closest ON bit */ + i = HSTATUS_END; + best_delta = HSTATUS_END; + while (i > 0) { + if (mask & (1 << i)) { + delta = choice - i; + if (delta < 0) + delta *= -1; + if (delta < best_delta) { + best_delta = delta; + choice = i; + } + } + i--; + } + + *hstatus = choice; +} + /** * check source/destination NID, portal, message type and drop rate, * decide whether should drop this message or not */ static bool drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, - lnet_nid_t dst, unsigned int type, unsigned int portal) + lnet_nid_t dst, unsigned int type, unsigned int portal, + enum lnet_msg_hstatus *hstatus) { struct lnet_fault_attr *attr = &rule->dr_attr; bool drop; @@ -309,24 +353,36 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; + /* + * if we're trying to match a health status error but it hasn't + * been set in the rule, then don't match + */ + if ((hstatus && !attr->u.drop.da_health_error_mask) || + (!hstatus && attr->u.drop.da_health_error_mask)) + return false; + /* match this rule, check drop rate now */ spin_lock(&rule->dr_lock); - if (rule->dr_drop_time != 0) { /* time based drop */ - cfs_time_t now = cfs_time_current(); + if (attr->u.drop.da_random) { + int value = cfs_rand() % attr->u.drop.da_interval; + if (value >= (attr->u.drop.da_interval / 2)) + drop = true; + else + drop = false; + } else if (rule->dr_drop_time != 0) { /* time based drop */ + time64_t now = 
ktime_get_seconds(); rule->dr_stat.fs_count++; - drop = cfs_time_aftereq(now, rule->dr_drop_time); + drop = now >= rule->dr_drop_time; if (drop) { - if (cfs_time_after(now, rule->dr_time_base)) + if (now > rule->dr_time_base) rule->dr_time_base = now; rule->dr_drop_time = rule->dr_time_base + - cfs_time_seconds(cfs_rand() % - attr->u.drop.da_interval); - rule->dr_time_base += cfs_time_seconds(attr->u.drop. - da_interval); + cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base += attr->u.drop.da_interval; - CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %ld\n", + CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dr_drop_time); @@ -347,6 +403,9 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, } if (drop) { /* drop this message, update counters */ + if (hstatus) + lnet_fault_match_health(hstatus, + attr->u.drop.da_health_error_mask); lnet_fault_stat_inc(&rule->dr_stat, type); rule->dr_stat.u.drop.ds_dropped++; } @@ -359,15 +418,15 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, * Check if message from \a src to \a dst can match any existed drop rule */ bool -lnet_drop_rule_match(struct lnet_hdr *hdr) +lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus) { - struct lnet_drop_rule *rule; - lnet_nid_t src = le64_to_cpu(hdr->src_nid); - lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); - unsigned int typ = le32_to_cpu(hdr->type); - unsigned int ptl = -1; - bool drop = false; - int cpt; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + struct lnet_drop_rule *rule; + unsigned int ptl = -1; + bool drop = false; + int cpt; /* NB: if Portal is specified, then only PUT and GET will be * filtered by drop rule */ @@ -378,12 +437,13 @@ lnet_drop_rule_match(struct lnet_hdr *hdr) cpt = lnet_net_lock_current(); list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { - drop = drop_rule_match(rule, src, dst, typ, ptl); + drop = drop_rule_match(rule, src, dst, typ, ptl, + hstatus); if (drop) break; } - lnet_net_unlock(cpt); + return drop; } @@ -412,9 +472,9 @@ struct lnet_delay_rule { /** * seconds to delay the next message, it's exclusive with dl_delay_at */ - cfs_time_t dl_delay_time; + time64_t dl_delay_time; /** baseline to caculate dl_delay_time */ - cfs_time_t dl_time_base; + time64_t dl_time_base; /** jiffies to send the next delayed message */ unsigned long dl_msg_send; /** delayed message list */ @@ -444,13 +504,6 @@ struct delay_daemon_data { static struct delay_daemon_data delay_dd; -static cfs_time_t -round_timeout(cfs_time_t timeout) -{ - return cfs_time_seconds((unsigned int) - cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); -} - static void delay_rule_decref(struct lnet_delay_rule *rule) { @@ -472,8 +525,9 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, lnet_nid_t dst, unsigned int type, unsigned int portal, struct lnet_msg *msg) { - struct lnet_fault_attr *attr = &rule->dl_attr; - bool delay; + struct lnet_fault_attr *attr = &rule->dl_attr; + bool delay; + time64_t now = ktime_get_seconds(); if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; @@ -481,21 +535,17 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, /* match this rule, check delay rate now */ spin_lock(&rule->dl_lock); if (rule->dl_delay_time != 0) { /* time based delay */ - cfs_time_t now = cfs_time_current(); - rule->dl_stat.fs_count++; - delay = 
cfs_time_aftereq(now, rule->dl_delay_time); + delay = now >= rule->dl_delay_time; if (delay) { - if (cfs_time_after(now, rule->dl_time_base)) + if (now > rule->dl_time_base) rule->dl_time_base = now; rule->dl_delay_time = rule->dl_time_base + - cfs_time_seconds(cfs_rand() % - attr->u.delay.la_interval); - rule->dl_time_base += cfs_time_seconds(attr->u.delay. - la_interval); + cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base += attr->u.delay.la_interval; - CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %ld\n", + CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dl_delay_time); @@ -526,11 +576,11 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, rule->dl_stat.u.delay.ls_delayed++; list_add_tail(&msg->msg_list, &rule->dl_msg_list); - msg->msg_delay_send = round_timeout( - cfs_time_shift(attr->u.delay.la_latency)); + msg->msg_delay_send = now + attr->u.delay.la_latency; if (rule->dl_msg_send == -1) { rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); + mod_timer(&rule->dl_timer, + jiffies + cfs_time_seconds(attr->u.delay.la_latency)); } spin_unlock(&rule->dl_lock); @@ -574,7 +624,7 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, { struct lnet_msg *msg; struct lnet_msg *tmp; - unsigned long now = cfs_time_current(); + time64_t now = ktime_get_seconds(); if (!all && rule->dl_msg_send > now) return; @@ -598,7 +648,9 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, msg = list_entry(rule->dl_msg_list.next, struct lnet_msg, msg_list); rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); + mod_timer(&rule->dl_timer, + jiffies + + cfs_time_seconds(msg->msg_delay_send - now)); } spin_unlock(&rule->dl_lock); } @@ -614,6 +666,20 @@ delayed_msg_process(struct list_head *msg_list, bool drop) int rc; msg = list_entry(msg_list->next, struct lnet_msg, msg_list); + + if (msg->msg_sending) { + /* Delayed send */ + list_del_init(&msg->msg_list); + ni = msg->msg_txni; + CDEBUG(D_NET, "TRACE: msg %p %s -> %s : %s\n", msg, + libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); + lnet_ni_send(ni, msg); + continue; + } + + /* Delayed receive */ LASSERT(msg->msg_rxpeer != NULL); LASSERT(msg->msg_rxni != NULL); @@ -638,7 +704,7 @@ delayed_msg_process(struct list_head *msg_list, bool drop) case LNET_CREDIT_OK: lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, msg->msg_len, msg->msg_len); - /* Fall through */ + /* fallthrough */ case LNET_CREDIT_WAIT: continue; default: /* failures */ @@ -646,7 +712,8 @@ delayed_msg_process(struct list_head *msg_list, bool drop) } } - lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len); + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len, + msg->msg_type); lnet_finalize(msg, rc); } } @@ -782,9 +849,10 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr) rule->dl_attr = *attr; if (attr->u.delay.la_interval != 0) { - rule->dl_time_base = cfs_time_shift(attr->u.delay.la_interval); - rule->dl_delay_time = cfs_time_shift(cfs_rand() % - attr->u.delay.la_interval); + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + rule->dl_delay_time = ktime_get_seconds() + + cfs_rand() % attr->u.delay.la_interval; } else { rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; } @@ -935,10 +1003,10 @@ lnet_delay_rule_reset(void) if (attr->u.delay.la_rate != 0) { rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; } 
else { - rule->dl_delay_time = cfs_time_shift(cfs_rand() % - attr->u.delay.la_interval); - rule->dl_time_base = cfs_time_shift(attr->u.delay. - la_interval); + rule->dl_delay_time = ktime_get_seconds() + + cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; } spin_unlock(&rule->dl_lock); } diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c index 5122a2e6b5d81..fe3add7b9701c 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,7 @@ #define DEBUG_SUBSYSTEM S_LNET #include -#include +#include /* max value for numeric network address */ #define MAX_NUMERIC_VALUE 0xffffffff diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c index 612af87d47692..c2d64d140702e 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/peer.c +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,8 +34,19 @@ #define DEBUG_SUBSYSTEM S_LNET +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + #include -#include +#include + +/* Value indicating that recovery needs to re-check a peer immediately. */ +#define LNET_REDISCOVER_PEER (1) + +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); static void lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) @@ -127,6 +138,8 @@ lnet_peer_tables_create(void) spin_lock_init(&ptable->pt_zombie_lock); INIT_LIST_HEAD(&ptable->pt_zombie_list); + INIT_LIST_HEAD(&ptable->pt_peer_list); + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) INIT_LIST_HEAD(&hash[j]); ptable->pt_hash = hash; /* sign of initialization */ @@ -152,17 +165,19 @@ lnet_peer_ni_alloc(lnet_nid_t nid) INIT_LIST_HEAD(&lpni->lpni_rtrq); INIT_LIST_HEAD(&lpni->lpni_routes); INIT_LIST_HEAD(&lpni->lpni_hashlist); - INIT_LIST_HEAD(&lpni->lpni_on_peer_net_list); + INIT_LIST_HEAD(&lpni->lpni_peer_nis); + INIT_LIST_HEAD(&lpni->lpni_recovery); INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); spin_lock_init(&lpni->lpni_lock); lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ - lpni->lpni_last_alive = cfs_time_current(); /* assumes alive */ + lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */ lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; lpni->lpni_nid = nid; lpni->lpni_cpt = cpt; - lnet_set_peer_ni_health_locked(lpni, true); + atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); net = lnet_get_net_locked(LNET_NIDNET(nid)); lpni->lpni_net = net; @@ -184,7 +199,7 @@ lnet_peer_ni_alloc(lnet_nid_t nid) &the_lnet.ln_remote_peer_ni_list); } - /* TODO: update flags */ + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); return lpni; } @@ -198,13 +213,32 @@ lnet_peer_net_alloc(__u32 net_id) if (!lpn) return NULL; - INIT_LIST_HEAD(&lpn->lpn_on_peer_list); + INIT_LIST_HEAD(&lpn->lpn_peer_nets); INIT_LIST_HEAD(&lpn->lpn_peer_nis); lpn->lpn_net_id = net_id; + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + return lpn; } +void +lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn) +{ + struct lnet_peer *lp; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + LASSERT(atomic_read(&lpn->lpn_refcount) == 0); + LASSERT(list_empty(&lpn->lpn_peer_nis)); + LASSERT(list_empty(&lpn->lpn_peer_nets)); + lp = lpn->lpn_peer; + lpn->lpn_peer = NULL; + LIBCFS_FREE(lpn, sizeof(*lpn)); + + lnet_peer_decref_locked(lp); +} + static struct lnet_peer * lnet_peer_alloc(lnet_nid_t nid) { @@ -214,47 +248,118 @@ lnet_peer_alloc(lnet_nid_t nid) if (!lp) return NULL; - INIT_LIST_HEAD(&lp->lp_on_lnet_peer_list); + INIT_LIST_HEAD(&lp->lp_peer_list); INIT_LIST_HEAD(&lp->lp_peer_nets); + INIT_LIST_HEAD(&lp->lp_dc_list); + INIT_LIST_HEAD(&lp->lp_dc_pendq); + init_waitqueue_head(&lp->lp_dc_waitq); + spin_lock_init(&lp->lp_lock); lp->lp_primary_nid = nid; + lp->lp_disc_src_nid = LNET_NID_ANY; + + /* + * Turn off discovery for loopback peer. If you're creating a peer + * for the loopback interface then that was initiated when we + * attempted to send a message over the loopback. There is no need + * to ever use a different interface when sending messages to + * myself. + */ + if (nid == LNET_NID_LO_0) + lp->lp_state = LNET_PEER_NO_DISCOVERY; + lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); - /* TODO: update flags */ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); return lp; } +void +lnet_destroy_peer_locked(struct lnet_peer *lp) +{ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); + + LASSERT(atomic_read(&lp->lp_refcount) == 0); + LASSERT(list_empty(&lp->lp_peer_nets)); + LASSERT(list_empty(&lp->lp_peer_list)); + LASSERT(list_empty(&lp->lp_dc_list)); + + if (lp->lp_data) + lnet_ping_buffer_decref(lp->lp_data); + + /* + * if there are messages still on the pending queue, then make + * sure to queue them on the ln_msg_resend list so they can be + * resent at a later point if the discovery thread is still + * running. + * If the discovery thread has stopped, then the wakeup will be a + * no-op, and it is expected the lnet_shutdown_lndnets() will + * eventually be called, which will traverse this list and + * finalize the messages on the list. + * We can not resend them now because we're holding the cpt lock. 
+ * Releasing the lock can cause an inconsistent state + */ + spin_lock(&the_lnet.ln_msg_resend_lock); + spin_lock(&lp->lp_lock); + list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); + spin_unlock(&lp->lp_lock); + spin_unlock(&the_lnet.ln_msg_resend_lock); + wake_up(&the_lnet.ln_dc_waitq); + + LIBCFS_FREE(lp, sizeof(*lp)); +} +/* + * Detach a peer_ni from its peer_net. If this was the last peer_ni on + * that peer_net, detach the peer_net from the peer. + * + * Call with lnet_net_lock/EX held + */ static void -lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) +lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) { - struct lnet_peer_net *peer_net; - struct lnet_peer *peer; + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer *lp; - /* TODO: could the below situation happen? accessing an already - * destroyed peer? */ - if (lpni->lpni_peer_net == NULL || - lpni->lpni_peer_net->lpn_peer == NULL) - return; + /* + * Belts and suspenders: gracefully handle teardown of a + * partially connected peer_ni. + */ + lpn = lpni->lpni_peer_net; - peer_net = lpni->lpni_peer_net; - peer = lpni->lpni_peer_net->lpn_peer; + list_del_init(&lpni->lpni_peer_nis); + /* + * If there are no lpni's left, we detach lpn from + * lp_peer_nets, so it cannot be found anymore. + */ + if (list_empty(&lpn->lpn_peer_nis)) + list_del_init(&lpn->lpn_peer_nets); - list_del_init(&lpni->lpni_on_peer_net_list); - lpni->lpni_peer_net = NULL; + /* Update peer NID count. */ + lp = lpn->lpn_peer; + lp->lp_nnis--; - /* if peer_net is empty, then remove it from the peer */ - if (list_empty(&peer_net->lpn_peer_nis)) { - list_del_init(&peer_net->lpn_on_peer_list); - peer_net->lpn_peer = NULL; - LIBCFS_FREE(peer_net, sizeof(*peer_net)); - - /* if the peer is empty then remove it from the - * the_lnet.ln_peers */ - if (list_empty(&peer->lp_peer_nets)) { - list_del_init(&peer->lp_on_lnet_peer_list); - LIBCFS_FREE(peer, sizeof(*peer)); - } + /* + * If there are no more peer nets, make the peer unfindable + * via the peer_tables. + * + * Otherwise, if the peer is DISCOVERED, tell discovery to + * take another look at it. This is a no-op if discovery for + * this peer did the detaching. + */ + if (list_empty(&lp->lp_peer_nets)) { + list_del_init(&lp->lp_peer_list); + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + ptable->pt_peers--; + } else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + /* Discovery isn't running, nothing to do here. */ + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + lnet_peer_queue_for_discovery(lp); + wake_up(&the_lnet.ln_dc_waitq); } + CDEBUG(D_NET, "peer %s NID %s\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(lpni->lpni_nid)); } /* called with lnet_net_lock LNET_LOCK_EX held */ @@ -275,10 +380,18 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) /* remove peer ni from the hash list. */ list_del_init(&lpni->lpni_hashlist); + /* + * indicate the peer is being deleted so the monitor thread can + * remove it from the recovery queue. + */ + spin_lock(&lpni->lpni_lock); + lpni->lpni_state |= LNET_PEER_NI_DELETING; + spin_unlock(&lpni->lpni_lock); + /* decrement the ref count on the peer table */ ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - LASSERT(atomic_read(&ptable->pt_number) > 0); - atomic_dec(&ptable->pt_number); + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; /* * The peer_ni can no longer be found with a lookup. 
But there @@ -287,7 +400,7 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) * * The last reference may be lost in a place where the * lnet_net_lock locks only a single cpt, and that cpt may not - * be lpni->lpni_cpt. So the zombie list of this peer_table + * be lpni->lpni_cpt. So the zombie list of lnet_peer_table * has its own lock. */ spin_lock(&ptable->pt_zombie_lock); @@ -295,10 +408,10 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies++; spin_unlock(&ptable->pt_zombie_lock); - /* no need to keep this peer on the hierarchy anymore */ - lnet_try_destroy_peer_hierarchy_locked(lpni); + /* no need to keep this peer_ni on the hierarchy anymore */ + lnet_peer_detach_peer_ni_locked(lpni); - /* decrement reference on peer */ + /* remove hashlist reference on peer_ni */ lnet_peer_ni_decref_locked(lpni); return 0; @@ -326,6 +439,8 @@ lnet_peer_del_locked(struct lnet_peer *peer) struct lnet_peer_ni *lpni = NULL, *lpni2; int rc = 0, rc2 = 0; + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid)); + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); while (lpni != NULL) { lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); @@ -338,6 +453,71 @@ lnet_peer_del_locked(struct lnet_peer *peer) return rc2; } +static int +lnet_peer_del(struct lnet_peer *peer) +{ + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +/* + * Delete a NID from a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC deletion from DLC-configured peer. + * -ENOENT: No lnet_peer_ni corresponding to the nid. + * -ECHILD: The lnet_peer_ni isn't connected to the peer. + * -EBUSY: The lnet_peer_ni is the primary, and not the only peer_ni. + */ +static int +lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = lp->lp_primary_nid; + int rc = 0; + + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lnet_peer_ni_decref_locked(lpni); + if (lp != lpni->lpni_peer_net->lpn_peer) { + rc = -ECHILD; + goto out; + } + + /* + * This function only allows deletion of the primary NID if it + * is the only NID. 
+ */ + if (nid == lp->lp_primary_nid && lp->lp_nnis != 1) { + rc = -EBUSY; + goto out; + } + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_peer_ni_del_locked(lpni); + + lnet_net_unlock(LNET_LOCK_EX); + +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nid2str(primary_nid), libcfs_nid2str(nid), flags, rc); + + return rc; +} + static void lnet_peer_table_cleanup_locked(struct lnet_net *net, struct lnet_peer_table *ptable) @@ -424,8 +604,8 @@ lnet_peer_table_del_rtrs_locked(struct lnet_net *net, void lnet_peer_tables_cleanup(struct lnet_net *net) { - int i; - struct lnet_peer_table *ptable; + int i; + struct lnet_peer_table *ptable; LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); /* If just deleting the peers for a NI, get rid of any routes these @@ -482,42 +662,24 @@ lnet_find_peer_ni_locked(lnet_nid_t nid) } struct lnet_peer * -lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt) +lnet_find_peer(lnet_nid_t nid) { struct lnet_peer_ni *lpni; - struct lnet_peer *lp; + struct lnet_peer *lp = NULL; + int cpt; - lpni = lnet_find_peer_ni_locked(dst_nid); - if (!lpni) { - lpni = lnet_nid2peerni_locked(dst_nid, cpt); - if (IS_ERR(lpni)) - return ERR_CAST(lpni); + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); } - - lp = lpni->lpni_peer_net->lpn_peer; - lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); return lp; } -struct lnet_peer_ni * -lnet_get_peer_ni_idx_locked(int idx, struct lnet_peer_net **lpn, - struct lnet_peer **lp) -{ - struct lnet_peer_ni *lpni; - - list_for_each_entry((*lp), &the_lnet.ln_peers, lp_on_lnet_peer_list) { - list_for_each_entry((*lpn), &((*lp)->lp_peer_nets), lpn_on_peer_list) { - list_for_each_entry(lpni, &((*lpn)->lpn_peer_nis), - lpni_on_peer_net_list) - if (idx-- == 0) - return lpni; - } - } - - return NULL; -} - struct lnet_peer_ni * lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, @@ -527,18 +689,21 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *net = peer_net; if (!prev) { - if (!net) + if (!net) { + if (list_empty(&peer->lp_peer_nets)) + return NULL; + net = list_entry(peer->lp_peer_nets.next, struct lnet_peer_net, - lpn_on_peer_list); + lpn_peer_nets); + } lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, - lpni_on_peer_net_list); + lpni_peer_nis); return lpni; } - if (prev->lpni_on_peer_net_list.next == - &prev->lpni_peer_net->lpn_peer_nis) { + if (prev->lpni_peer_nis.next == &prev->lpni_peer_net->lpn_peer_nis) { /* * if you reached the end of the peer ni list and the peer * net is specified then there are no more peer nis in that @@ -551,428 +716,915 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, * we reached the end of this net ni list. move to the * next net */ - if (prev->lpni_peer_net->lpn_on_peer_list.next == + if (prev->lpni_peer_net->lpn_peer_nets.next == &peer->lp_peer_nets) /* no more nets and no more NIs. 
*/
 return NULL;
 /* get the next net */
- net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next,
+ net = list_entry(prev->lpni_peer_net->lpn_peer_nets.next,
 struct lnet_peer_net,
- lpn_on_peer_list);
+ lpn_peer_nets);
 /* get the ni on it */
 lpni = list_entry(net->lpn_peer_nis.next,
 struct lnet_peer_ni,
- lpni_on_peer_net_list);
+ lpni_peer_nis);
 return lpni;
 }
 /* there are more nis left */
- lpni = list_entry(prev->lpni_on_peer_net_list.next,
- struct lnet_peer_ni, lpni_on_peer_net_list);
+ lpni = list_entry(prev->lpni_peer_nis.next,
+ struct lnet_peer_ni, lpni_peer_nis);
 return lpni;
 }
+/* Call with the ln_api_mutex held */
+int lnet_get_peer_list(u32 *countp, u32 *sizep, struct lnet_process_id __user *ids)
+{
+ struct lnet_process_id id;
+ struct lnet_peer_table *ptable;
+ struct lnet_peer *lp;
+ __u32 count = 0;
+ __u32 size = 0;
+ int lncpt;
+ int cpt;
+ __u32 i;
+ int rc;
+
+ rc = -ESHUTDOWN;
+ if (the_lnet.ln_state != LNET_STATE_RUNNING)
+ goto done;
+
+ lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
+
+ /*
+ * Count the number of peers, and return E2BIG if the buffer
+ * is too small. We'll also return the desired size.
+ */
+ rc = -E2BIG;
+ for (cpt = 0; cpt < lncpt; cpt++) {
+ ptable = the_lnet.ln_peer_tables[cpt];
+ count += ptable->pt_peers;
+ }
+ size = count * sizeof(*ids);
+ if (size > *sizep)
+ goto done;
+
+ /*
+ * Walk the peer lists and copy out the primary nids.
+ * This is safe because the peer lists are only modified
+ * while the ln_api_mutex is held. So we don't need to
+ * hold the lnet_net_lock as well, and can therefore
+ * directly call copy_to_user().
+ */
+ rc = -EFAULT;
+ memset(&id, 0, sizeof(id));
+ id.pid = LNET_PID_LUSTRE;
+ i = 0;
+ for (cpt = 0; cpt < lncpt; cpt++) {
+ ptable = the_lnet.ln_peer_tables[cpt];
+ list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+ if (i >= count)
+ goto done;
+ id.nid = lp->lp_primary_nid;
+ if (copy_to_user(&ids[i], &id, sizeof(id)))
+ goto done;
+ i++;
+ }
+ }
+ rc = 0;
+done:
+ *countp = count;
+ *sizep = size;
+ return rc;
+}
+
+/*
+ * Start pushes to peers that need to be updated for a configuration
+ * change on this node.
+ */
+void
+lnet_push_update_to_peers(int force)
+{
+ struct lnet_peer_table *ptable;
+ struct lnet_peer *lp;
+ int lncpt;
+ int cpt;
+
+ lnet_net_lock(LNET_LOCK_EX);
+ lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
+ for (cpt = 0; cpt < lncpt; cpt++) {
+ ptable = the_lnet.ln_peer_tables[cpt];
+ list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+ if (force) {
+ spin_lock(&lp->lp_lock);
+ if (lp->lp_state & LNET_PEER_MULTI_RAIL)
+ lp->lp_state |= LNET_PEER_FORCE_PUSH;
+ spin_unlock(&lp->lp_lock);
+ }
+ if (lnet_peer_needs_push(lp))
+ lnet_peer_queue_for_discovery(lp);
+ }
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ wake_up(&the_lnet.ln_dc_waitq);
+}
+
+/*
+ * Test whether a ni is a preferred ni for this peer_ni, e.g., whether
+ * this is a preferred point-to-point path. Call with lnet_net_lock in
+ * shared mode.
+ */ bool -lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) +lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid) { int i; + if (lpni->lpni_pref_nnids == 0) + return false; + if (lpni->lpni_pref_nnids == 1) + return lpni->lpni_pref.nid == nid; for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref_nids[i] == ni->ni_nid) + if (lpni->lpni_pref.nids[i] == nid) return true; } return false; } -lnet_nid_t -lnet_peer_primary_nid_locked(lnet_nid_t nid) +/* + * Set a single ni as preferred, provided no preferred ni is already + * defined. Only to be used for non-multi-rail peer_ni. + */ +int +lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = nid; + int rc = 0; - lpni = lnet_find_peer_ni_locked(nid); - if (lpni) { - primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; - lnet_peer_ni_decref_locked(lpni); + spin_lock(&lpni->lpni_lock); + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + } else if (lpni->lpni_pref_nnids > 0) { + rc = -EPERM; + } else if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + lpni->lpni_pref_nnids = 1; + lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; } + spin_unlock(&lpni->lpni_lock); - return primary_nid; + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), libcfs_nid2str(nid), rc); + return rc; } -lnet_nid_t -LNetPrimaryNID(lnet_nid_t nid) +/* + * Clear the preferred NID from a non-multi-rail peer_ni, provided + * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). + */ +int +lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) { - struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = nid; int rc = 0; - int cpt; - cpt = lnet_net_lock_current(); - lpni = lnet_nid2peerni_locked(nid, cpt); - if (IS_ERR(lpni)) { - rc = PTR_ERR(lpni); - goto out_unlock; + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { + lpni->lpni_pref_nnids = 0; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + } else if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + } else { + rc = -EPERM; } - primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; - lnet_peer_ni_decref_locked(lpni); -out_unlock: - lnet_net_unlock(cpt); + spin_unlock(&lpni->lpni_lock); - CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), - libcfs_nid2str(primary_nid), rc); - return primary_nid; + CDEBUG(D_NET, "peer %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), rc); + return rc; } -EXPORT_SYMBOL(LNetPrimaryNID); -struct lnet_peer_net * -lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +/* + * Clear the preferred NIDs from a non-multi-rail peer. 
+ */ +void +lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) { - struct lnet_peer_net *peer_net; - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { - if (peer_net->lpn_net_id == net_id) - return peer_net; - } - return NULL; + struct lnet_peer_ni *lpni = NULL; + + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lnet_peer_ni_clr_non_mr_pref_nid(lpni); } -static int -lnet_peer_setup_hierarchy(struct lnet_peer *lp, struct lnet_peer_ni *lpni, - lnet_nid_t nid) +int +lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - struct lnet_peer_net *lpn = NULL; - struct lnet_peer_table *ptable; - __u32 net_id = LNET_NIDNET(nid); + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i; + int rc = 0; - /* - * Create the peer_ni, peer_net, and peer if they don't exist - * yet. - */ - if (lp) { - lpn = lnet_peer_get_net_locked(lp, net_id); - } else { - lp = lnet_peer_alloc(nid); - if (!lp) - goto out_enomem; + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; } - if (!lpn) { - lpn = lnet_peer_net_alloc(net_id); - if (!lpn) - goto out_maybe_free_lp; + if (lpni->lpni_pref_nnids == 1 && lpni->lpni_pref.nid == nid) { + rc = -EEXIST; + goto out; } - if (!lpni) { - lpni = lnet_peer_ni_alloc(nid); - if (!lpni) - goto out_maybe_free_lpn; + /* A non-MR node may have only one preferred NI per peer_ni */ + if (lpni->lpni_pref_nnids > 0) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; + } + } + + if (lpni->lpni_pref_nnids != 0) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] == nid) { + LIBCFS_FREE(nids, size); + rc = -EEXIST; + goto out; + } + nids[i] = lpni->lpni_pref.nids[i]; + } + nids[i] = nid; } - /* Install the new peer_ni */ lnet_net_lock(LNET_LOCK_EX); - /* Add peer_ni to global peer table hash, if necessary. */ - if (list_empty(&lpni->lpni_hashlist)) { - ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - list_add_tail(&lpni->lpni_hashlist, - &ptable->pt_hash[lnet_nid2peerhash(nid)]); - ptable->pt_version++; - atomic_inc(&ptable->pt_number); - atomic_inc(&lpni->lpni_refcount); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + } else { + oldnids = lpni->lpni_pref.nids; + lpni->lpni_pref.nids = nids; } + lpni->lpni_pref_nnids++; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); - /* Detach the peer_ni from an existing peer, if necessary. 
*/ - if (lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer != lp) - lnet_try_destroy_peer_hierarchy_locked(lpni); + if (oldnids) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); + } +out: + if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + } + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); + return rc; +} - /* Add peer_ni to peer_net */ - lpni->lpni_peer_net = lpn; - list_add_tail(&lpni->lpni_on_peer_net_list, &lpn->lpn_peer_nis); +int +lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i, j; + int rc = 0; - /* Add peer_net to peer */ - if (!lpn->lpn_peer) { - lpn->lpn_peer = lp; - list_add_tail(&lpn->lpn_on_peer_list, &lp->lp_peer_nets); + if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + goto out; } - /* Add peer to global peer list */ - if (list_empty(&lp->lp_on_lnet_peer_list)) - list_add_tail(&lp->lp_on_lnet_peer_list, &the_lnet.ln_peers); - lnet_net_unlock(LNET_LOCK_EX); + if (lpni->lpni_pref_nnids == 1) { + if (lpni->lpni_pref.nid != nid) { + rc = -ENOENT; + goto out; + } + } else if (lpni->lpni_pref_nnids == 2) { + if (lpni->lpni_pref.nids[0] != nid && + lpni->lpni_pref.nids[1] != nid) { + rc = -ENOENT; + goto out; + } + } else { + size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] != nid) + continue; + nids[j++] = lpni->lpni_pref.nids[i]; + } + /* Check if we actually removed a nid. */ + if (j == lpni->lpni_pref_nnids) { + LIBCFS_FREE(nids, size); + rc = -ENOENT; + goto out; + } + } - return 0; + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 1) { + lpni->lpni_pref.nid = LNET_NID_ANY; + } else if (lpni->lpni_pref_nnids == 2) { + oldnids = lpni->lpni_pref.nids; + if (oldnids[0] == nid) + lpni->lpni_pref.nid = oldnids[1]; + else + lpni->lpni_pref.nid = oldnids[2]; + } else { + oldnids = lpni->lpni_pref.nids; + lpni->lpni_pref.nids = nids; + } + lpni->lpni_pref_nnids--; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); -out_maybe_free_lpn: - if (list_empty(&lpn->lpn_on_peer_list)) - LIBCFS_FREE(lpn, sizeof(*lpn)); -out_maybe_free_lp: - if (list_empty(&lp->lp_on_lnet_peer_list)) - LIBCFS_FREE(lp, sizeof(*lp)); -out_enomem: - return -ENOMEM; + if (oldnids) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); + } +out: + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); + return rc; } -static int -lnet_add_prim_lpni(lnet_nid_t nid) +lnet_nid_t +lnet_peer_primary_nid_locked(lnet_nid_t nid) { - int rc = 0; - struct lnet_peer *peer; struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; - LASSERT(nid != LNET_NID_ANY); - - /* - * lookup the NID and its peer - * if the peer doesn't exist, create it. - * if this is a non-MR peer then change its state to MR and exit. - * if this is an MR peer and it's a primary NI: NO-OP. - * if this is an MR peer and it's not a primary NI. 
Operation not - * allowed. - * - * The adding and deleting of peer nis is being serialized through - * the api_mutex. So we can look up peers with the mutex locked - * safely. Only when we need to change the ptable, do we need to - * exclusively lock the lnet_net_lock() - */ lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); - if (rc != 0) - return rc; - lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); } - LASSERT(lpni); + return primary_nid; +} - lnet_peer_ni_decref_locked(lpni); +bool +lnet_is_discovery_disabled_locked(struct lnet_peer *lp) +{ + if (lnet_peer_discovery_disabled) + return true; - peer = lpni->lpni_peer_net->lpn_peer; + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) || + (lp->lp_state & LNET_PEER_NO_DISCOVERY)) { + return true; + } - /* - * If we found a lpni with the same nid as the NID we're trying to - * create, then we're trying to create an already existing lpni - * that belongs to a different peer - */ - if (peer->lp_primary_nid != nid) - return -EEXIST; + return false; +} - /* - * if we found an lpni that is not a multi-rail, which could occur - * if lpni is already created as a non-mr lpni or we just created - * it, then make sure you indicate that this lpni is a primary mr - * capable peer. - * - * TODO: update flags if necessary - */ - if (!peer->lp_multi_rail && peer->lp_primary_nid == nid) - peer->lp_multi_rail = true; +/* + * Peer Discovery + */ +bool +lnet_is_discovery_disabled(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + rc = lnet_is_discovery_disabled_locked(lp); + spin_unlock(&lp->lp_lock); return rc; } +lnet_nid_t +LNetPrimaryNID(lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + int rc = 0; + int cpt; + + if (nid == LNET_NID_LO_0) + return LNET_NID_LO_0; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; + + while (!lnet_peer_is_uptodate(lp)) { + spin_lock(&lp->lp_lock); + /* force a full discovery cycle */ + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + lp = lpni->lpni_peer_net->lpn_peer; + + /* Only try once if discovery is disabled */ + if (lnet_is_discovery_disabled(lp)) + break; + } + primary_nid = lp->lp_primary_nid; +out_decref: + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; +} +EXPORT_SYMBOL(LNetPrimaryNID); + +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + +/* + * Attach a peer_ni to a peer_net and peer. This function assumes + * peer_ni is not already attached to the peer_net/peer. The peer_ni + * may be attached to a different peer, in which case it will be + * properly detached first. The whole operation is done atomically. + * + * Always returns 0. 
This is the last function called from functions + * that do return an int, so returning 0 here allows the compiler to + * do a tail call. + */ static int -lnet_add_peer_ni_to_prim_lpni(lnet_nid_t prim_nid, lnet_nid_t nid) +lnet_peer_attach_peer_ni(struct lnet_peer *lp, + struct lnet_peer_net *lpn, + struct lnet_peer_ni *lpni, + unsigned flags) { - struct lnet_peer *peer, *primary_peer; - struct lnet_peer_ni *lpni = NULL, *klpni = NULL; + struct lnet_peer_table *ptable; + + /* Install the new peer_ni */ + lnet_net_lock(LNET_LOCK_EX); + /* Add peer_ni to global peer table hash, if necessary. */ + if (list_empty(&lpni->lpni_hashlist)) { + int hash = lnet_nid2peerhash(lpni->lpni_nid); - LASSERT(prim_nid != LNET_NID_ANY && nid != LNET_NID_ANY); + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); + ptable->pt_version++; + ptable->pt_number++; + /* This is the 1st refcount on lpni. */ + atomic_inc(&lpni->lpni_refcount); + } - /* - * key nid must be created by this point. If not then this - * operation is not permitted - */ - klpni = lnet_find_peer_ni_locked(prim_nid); - if (!klpni) - return -ENOENT; + /* Detach the peer_ni from an existing peer, if necessary. */ + if (lpni->lpni_peer_net) { + LASSERT(lpni->lpni_peer_net != lpn); + LASSERT(lpni->lpni_peer_net->lpn_peer != lp); + lnet_peer_detach_peer_ni_locked(lpni); + lnet_peer_net_decref_locked(lpni->lpni_peer_net); + lpni->lpni_peer_net = NULL; + } + + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_peer_net_addref_locked(lpn); + + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + lpn->lpn_peer = lp; + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + lnet_peer_addref_locked(lp); + } + + /* Add peer to global peer list, if necessary */ + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + if (list_empty(&lp->lp_peer_list)) { + list_add_tail(&lp->lp_peer_list, &ptable->pt_peer_list); + ptable->pt_peers++; + } + + + /* Update peer state */ + spin_lock(&lp->lp_lock); + if (flags & LNET_PEER_CONFIGURED) { + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) + lp->lp_state |= LNET_PEER_CONFIGURED; + } + if (flags & LNET_PEER_MULTI_RAIL) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } + spin_unlock(&lp->lp_lock); + + lp->lp_nnis++; + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "peer %s NID %s flags %#x\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(lpni->lpni_nid), flags); - lnet_peer_ni_decref_locked(klpni); + return 0; +} + +/* + * Create a new peer, with nid as its primary nid. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_add(lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int rc = 0; - primary_peer = klpni->lpni_peer_net->lpn_peer; + LASSERT(nid != LNET_NID_ANY); + /* + * No need for the lnet_net_lock here, because the + * lnet_api_mutex is held. + */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { + /* A peer with this NID already exists. */ + lp = lpni->lpni_peer_net->lpn_peer; lnet_peer_ni_decref_locked(lpni); - - peer = lpni->lpni_peer_net->lpn_peer; /* - * lpni already exists in the system but it belongs to - * a different peer. We can't re-added it + * This is an error if the peer was configured and the + * primary NID differs or an attempt is made to change + * the Multi-Rail flag. 
Otherwise the assumption is + * that an existing peer is being modified. */ - if (peer->lp_primary_nid != prim_nid && peer->lp_multi_rail) { - CERROR("Cannot add NID %s owned by peer %s to peer %s\n", - libcfs_nid2str(lpni->lpni_nid), - libcfs_nid2str(peer->lp_primary_nid), - libcfs_nid2str(prim_nid)); - return -EEXIST; - } else if (peer->lp_primary_nid == prim_nid) { - /* - * found a peer_ni that is already part of the - * peer. This is a no-op operation. - */ - return 0; + if (lp->lp_state & LNET_PEER_CONFIGURED) { + if (lp->lp_primary_nid != nid) + rc = -EEXIST; + else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) + rc = -EPERM; + goto out; } + /* Delete and recreate as a configured peer. */ + lnet_peer_del(lp); + } - /* - * TODO: else if (peer->lp_primary_nid != prim_nid && - * !peer->lp_multi_rail) - * peer is not an MR peer and it will be moved in the next - * step to klpni, so update its flags accordingly. - * lnet_move_peer_ni() - */ - - /* - * TODO: call lnet_update_peer() from here to update the - * flags. This is the case when the lpni you're trying to - * add is already part of the peer. This could've been - * added by the DD previously, so go ahead and do any - * updates to the state if necessary - */ + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; - } + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); - /* - * When we get here we either have found an existing lpni, which - * we can switch to the new peer. Or we need to create one and - * add it to the new peer - */ - return lnet_peer_setup_hierarchy(primary_peer, lpni, nid); +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s NID flags %#x: %d\n", + libcfs_nid2str(nid), flags, rc); + return rc; } /* - * lpni creation initiated due to traffic either sending or receiving. + * Add a NID to a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC addition to a DLC-configured peer. + * -EEXIST: The NID was configured by DLC for a different peer. + * -ENOMEM: Out of memory. + * -ENOTUNIQ: Adding a second peer NID on a single network on a + * non-multi-rail peer. */ static int -lnet_peer_ni_traffic_add(lnet_nid_t nid) +lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) { + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; int rc = 0; - if (nid == LNET_NID_ANY) - return -EINVAL; + LASSERT(lp); + LASSERT(nid != LNET_NID_ANY); + + /* A configured peer can only be updated through configuration. */ + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + /* + * The MULTI_RAIL flag can be set but not cleared, because + * that would leave the peer struct in an invalid state. 
+ */ + if (flags & LNET_PEER_MULTI_RAIL) { + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + spin_unlock(&lp->lp_lock); + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + rc = -EPERM; + goto out; + } - /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { /* - * TODO: lnet_update_primary_nid() but not all of it - * only indicate if we're converting this to MR capable - * Can happen due to DD + * A peer_ni already exists. This is only a problem if + * it is not connected to this peer and was configured + * by DLC. */ lnet_peer_ni_decref_locked(lpni); + if (lpni->lpni_peer_net->lpn_peer == lp) + goto out; + if (lnet_peer_ni_is_configured(lpni)) { + rc = -EEXIST; + goto out; + } + /* If this is the primary NID, destroy the peer. */ + if (lnet_peer_ni_is_primary(lpni)) { + lnet_peer_del(lpni->lpni_peer_net->lpn_peer); + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) { + rc = -ENOMEM; + goto out; + } + } } else { - rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) { + rc = -ENOMEM; + goto out; + } + } + + /* + * Get the peer_net. Check that we're not adding a second + * peer_ni on a peer_net of a non-multi-rail peer. + */ + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); + if (!lpn) { + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) { + rc = -ENOMEM; + goto out_free_lpni; + } + } else if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -ENOTUNIQ; + goto out_free_lpni; } + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpni: + /* If the peer_ni was allocated above its peer_net pointer is NULL */ + if (!lpni->lpni_peer_net) + LIBCFS_FREE(lpni, sizeof(*lpni)); +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), + flags, rc); return rc; +} + +/* + * Update the primary NID of a peer, if possible. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + lnet_nid_t old = lp->lp_primary_nid; + int rc = 0; + if (lp->lp_primary_nid == nid) + goto out; + rc = lnet_peer_add_nid(lp, nid, flags); + if (rc) + goto out; + lp->lp_primary_nid = nid; +out: + CDEBUG(D_NET, "peer %s NID %s: %d\n", + libcfs_nid2str(old), libcfs_nid2str(nid), rc); + return rc; } +/* + * lpni creation initiated due to traffic either sending or receiving. + */ static int -lnet_peer_ni_add_non_mr(lnet_nid_t nid) +lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) { + struct lnet_peer *lp; + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; + unsigned flags = 0; + int rc = 0; + + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; + } + /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { - CERROR("Cannot add %s as non-mr when it already exists\n", - libcfs_nid2str(nid)); + /* + * We must have raced with another thread. Since we + * know next to nothing about a peer_ni created by + * traffic, we just assume everything is ok and + * return. + */ lnet_peer_ni_decref_locked(lpni); - return -EEXIST; + goto out; } - return lnet_peer_setup_hierarchy(NULL, NULL, nid); + /* Create peer, peer_net, and peer_ni. 
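+	 * The three objects are allocated first and only published under
+	 * LNET_LOCK_EX inside lnet_peer_attach_peer_ni(); on an allocation
+	 * failure the pieces already obtained are freed in reverse order
+	 * below. A peer created here carries no LNET_PEER_CONFIGURED flag
+	 * (flags stays 0), so DLC may later reconfigure or delete it.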
*/ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; + if (pref != LNET_NID_ANY) + lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(nid), rc); + return rc; } /* + * Implementation of IOC_LIBCFS_ADD_PEER_NI. + * * This API handles the following combinations: - * Create a primary NI if only the prim_nid is provided - * Create or add an lpni to a primary NI. Primary NI must've already - * been created - * Create a non-MR peer. + * Create a peer with its primary NI if only the prim_nid is provided + * Add a NID to a peer identified by the prim_nid. The peer identified + * by the prim_nid must already exist. + * The peer being created may be non-MR. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being created/modified/deleted by a different thread. */ int -lnet_add_peer_ni_to_peer(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) +lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) { + struct lnet_peer *lp = NULL; + struct lnet_peer_ni *lpni; + unsigned flags; + + /* The prim_nid must always be specified */ + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + flags = LNET_PEER_CONFIGURED; + if (mr) + flags |= LNET_PEER_MULTI_RAIL; + /* - * Caller trying to setup an MR like peer hierarchy but - * specifying it to be non-MR. This is not allowed. + * If nid isn't specified, we must create a new peer with + * prim_nid as its primary nid. */ - if (prim_nid != LNET_NID_ANY && - nid != LNET_NID_ANY && !mr) - return -EPERM; + if (nid == LNET_NID_ANY) + return lnet_peer_add(prim_nid, flags); + + /* Look up the prim_nid, which must exist. */ + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; - /* Add the primary NID of a peer */ - if (prim_nid != LNET_NID_ANY && - nid == LNET_NID_ANY && mr) - return lnet_add_prim_lpni(prim_nid); + /* Peer must have been configured. */ + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) { + CDEBUG(D_NET, "peer %s was not configured\n", + libcfs_nid2str(prim_nid)); + return -ENOENT; + } - /* Add a NID to an existing peer */ - if (prim_nid != LNET_NID_ANY && - nid != LNET_NID_ANY && mr) - return lnet_add_peer_ni_to_prim_lpni(prim_nid, nid); + /* Primary NID must match */ + if (lp->lp_primary_nid != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nid2str(lp->lp_primary_nid)); + return -ENODEV; + } - /* Add a non-MR peer NI */ - if (((prim_nid != LNET_NID_ANY && - nid == LNET_NID_ANY) || - (prim_nid == LNET_NID_ANY && - nid != LNET_NID_ANY)) && !mr) - return lnet_peer_ni_add_non_mr(prim_nid != LNET_NID_ANY ? - prim_nid : nid); + /* Multi-Rail flag must match. */ + if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "multi-rail state mismatch for peer %s\n", + libcfs_nid2str(prim_nid)); + return -EPERM; + } - return 0; + return lnet_peer_add_nid(lp, nid, flags); } +/* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. + * + * This API handles the following combinations: + * Delete a NI from a peer if both prim_nid and nid are provided. + * Delete a peer if only prim_nid is provided. 
+ * Delete a peer if its primary nid is provided. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being modified/deleted by a different thread. + */ int -lnet_del_peer_ni_from_peer(lnet_nid_t prim_nid, lnet_nid_t nid) +lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) { - lnet_nid_t local_nid; - struct lnet_peer *peer; + struct lnet_peer *lp; struct lnet_peer_ni *lpni; - int rc; + unsigned flags; if (prim_nid == LNET_NID_ANY) return -EINVAL; - local_nid = (nid != LNET_NID_ANY) ? nid : prim_nid; - - lpni = lnet_find_peer_ni_locked(local_nid); + lpni = lnet_find_peer_ni_locked(prim_nid); if (!lpni) - return -EINVAL; + return -ENOENT; lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; - peer = lpni->lpni_peer_net->lpn_peer; - LASSERT(peer != NULL); - - if (peer->lp_primary_nid == lpni->lpni_nid) { - /* - * deleting the primary ni is equivalent to deleting the - * entire peer - */ - lnet_net_lock(LNET_LOCK_EX); - rc = lnet_peer_del_locked(peer); - lnet_net_unlock(LNET_LOCK_EX); - - return rc; + if (prim_nid != lp->lp_primary_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nid2str(lp->lp_primary_nid)); + return -ENODEV; } - lnet_net_lock(LNET_LOCK_EX); - rc = lnet_peer_ni_del_locked(lpni); - lnet_net_unlock(LNET_LOCK_EX); + if (nid == LNET_NID_ANY || nid == lp->lp_primary_nid) + return lnet_peer_del(lp); - return rc; + flags = LNET_PEER_CONFIGURED; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + return lnet_peer_del_nid(lp, nid, flags); } void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) { struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); LASSERT(atomic_read(&lpni->lpni_refcount) == 0); LASSERT(lpni->lpni_rtr_refcount == 0); LASSERT(list_empty(&lpni->lpni_txq)); LASSERT(lpni->lpni_txqnob == 0); + LASSERT(list_empty(&lpni->lpni_peer_nis)); + LASSERT(list_empty(&lpni->lpni_on_remote_peer_ni_list)); + lpn = lpni->lpni_peer_net; + lpni->lpni_peer_net = NULL; lpni->lpni_net = NULL; /* remove the peer ni from the zombie list */ @@ -982,7 +1634,13 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies--; spin_unlock(&ptable->pt_zombie_lock); + if (lpni->lpni_pref_nnids > 1) { + LIBCFS_FREE(lpni->lpni_pref.nids, + sizeof(*lpni->lpni_pref.nids) * lpni->lpni_pref_nnids); + } LIBCFS_FREE(lpni, sizeof(*lpni)); + + lnet_peer_net_decref_locked(lpn); } struct lnet_peer_ni * @@ -1004,7 +1662,7 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) lnet_net_unlock(cpt); - rc = lnet_peer_ni_traffic_add(nid); + rc = lnet_peer_ni_traffic_add(nid, LNET_NID_ANY); if (rc) { lpni = ERR_PTR(rc); goto out_net_relock; @@ -1019,8 +1677,12 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) return lpni; } +/* + * Get a peer_ni for the given nid, create it if necessary. Takes a + * hold on the peer_ni. 
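+ *
+ * A minimal usage sketch (the same pattern lnet_debug_peer() uses;
+ * error handling trimmed):
+ *
+ *	cpt = lnet_cpt_of_nid(nid, NULL);
+ *	lnet_net_lock(cpt);
+ *	lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt);
+ *	if (!IS_ERR(lpni)) {
+ *		... use lpni ...
+ *		lnet_peer_ni_decref_locked(lpni);
+ *	}
+ *	lnet_net_unlock(cpt);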
+ */ struct lnet_peer_ni * -lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) +lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) { struct lnet_peer_ni *lpni = NULL; int rc; @@ -1059,7 +1721,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) goto out_mutex_unlock; } - rc = lnet_peer_ni_traffic_add(nid); + rc = lnet_peer_ni_traffic_add(nid, pref); if (rc) { lpni = ERR_PTR(rc); goto out_mutex_unlock; @@ -1072,20 +1734,1615 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) mutex_unlock(&the_lnet.ln_api_mutex); lnet_net_lock(cpt); + /* Lock has been dropped, check again for shutdown. */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (!IS_ERR(lpni)) + lnet_peer_ni_decref_locked(lpni); + lpni = ERR_PTR(-ESHUTDOWN); + } + return lpni; } -void -lnet_debug_peer(lnet_nid_t nid) +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) { - char *aliveness = "NA"; - struct lnet_peer_ni *lp; - int cpt; - - cpt = lnet_cpt_of_nid(nid, NULL); - lnet_net_lock(cpt); + bool rc; - lp = lnet_nid2peerni_locked(nid, cpt); + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + +/* + * Is a peer uptodate from the point of view of discovery? + * + * If it is currently being processed, obviously not. + * A forced Ping or Push is also handled by the discovery thread. + * + * Otherwise look at whether the peer needs rediscovering. + */ +bool +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + bool rc; + + if (lp->lp_state & (LNET_PEER_DISCOVERING | + LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_REDISCOVER) { + rc = false; + } else if (lnet_peer_needs_push(lp)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) + rc = true; + else + rc = false; + } else { + rc = false; + } + + return rc; +} + +/* + * Queue a peer for the attention of the discovery thread. Call with + * lnet_net_lock/EX held. Returns 0 if the peer was queued, and + * -EALREADY if the peer was already queued. + */ +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp) +{ + int rc; + + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_state |= LNET_PEER_DISCOVERING; + spin_unlock(&lp->lp_lock); + if (list_empty(&lp->lp_dc_list)) { + lnet_peer_addref_locked(lp); + list_add_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + rc = 0; + } else { + rc = -EALREADY; + } + + CDEBUG(D_NET, "Queue peer %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + + return rc; +} + +/* + * Discovery of a peer is complete. Wake all waiters on the peer. + * Call with lnet_net_lock/EX held. + */ +static void lnet_peer_discovery_complete(struct lnet_peer *lp) +{ + struct lnet_msg *msg, *tmp; + int rc = 0; + struct list_head pending_msgs; + + INIT_LIST_HEAD(&pending_msgs); + + CDEBUG(D_NET, "Discovery complete. 
Dequeue peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + + list_del_init(&lp->lp_dc_list); + spin_lock(&lp->lp_lock); + list_splice_init(&lp->lp_dc_pendq, &pending_msgs); + spin_unlock(&lp->lp_lock); + wake_up_all(&lp->lp_dc_waitq); + + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through all pending messages and send them again */ + list_for_each_entry_safe(msg, tmp, &pending_msgs, msg_list) { + list_del_init(&msg->msg_list); + if (lp->lp_dc_error) { + lnet_finalize(msg, lp->lp_dc_error); + continue; + } + + CDEBUG(D_NET, "sending pending message %s to target %s\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target)); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); +} + +/* + * Handle inbound push. + * Like any event handler, called with lnet_res_lock/CPT held. + */ +void lnet_peer_push_event(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + struct lnet_peer *lp; + + /* lnet_find_peer() adds a refcount */ + lp = lnet_find_peer(ev->source.nid); + if (!lp) { + CDEBUG(D_NET, "Push Put from unknown %s (source %s). Ignoring...\n", + libcfs_nid2str(ev->initiator.nid), + libcfs_nid2str(ev->source.nid)); + return; + } + + /* Ensure peer state remains consistent while we modify it. */ + spin_lock(&lp->lp_lock); + + /* + * If some kind of error happened the contents of the message + * cannot be used. Clear the NIDS_UPTODATE and set the + * FORCE_PING flag to trigger a ping. + */ + if (ev->status) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Push Put error %d from %s (source %s)\n", + ev->status, + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(ev->source.nid)); + goto out; + } + + /* + * A push with invalid or corrupted info. Clear the UPTODATE + * flag to trigger a ping. + */ + if (lnet_ping_info_validate(&pbuf->pb_info)) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Corrupted Push from %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + goto out; + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * A non-Multi-Rail peer is not supposed to be capable of + * sending a push. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)) { + CERROR("Push from non-Multi-Rail peer %s dropped\n", + libcfs_nid2str(lp->lp_primary_nid)); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the push. If the peer + * was configured with DLC then the setting should match what + * DLC put in. 
+ * NB: We verified above that the MR feature bit is set in pi_features + */ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Push says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + + /* + * Check for truncation of the Put message. Clear the + * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, + * and tell discovery to allocate a bigger buffer. + */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Truncated Push from %s (%d nids)\n", + libcfs_nid2str(lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + + /* + * If there is data present that hasn't been processed yet, + * we'll replace it if the Put contained newer data and it + * fits. We're racing with a Ping or earlier Push in this + * case. + */ + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + if (LNET_PING_BUFFER_SEQNO(pbuf) > + LNET_PING_BUFFER_SEQNO(lp->lp_data) && + pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + LNET_PING_BUFFER_SEQNO(lp->lp_data)); + } + goto out; + } + + /* + * Allocate a buffer to copy the data. On a failure we drop + * the Push and set FORCE_PING to force the discovery + * thread to fix the problem by pinging the peer. + */ + lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); + if (!lp->lp_data) { + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + goto out; + } + + /* Success */ + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + lp->lp_state |= LNET_PEER_DATA_PRESENT; + CDEBUG(D_NET, "Received Push %s %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + +out: + /* + * Queue the peer for discovery if not done, force it on the request + * queue and wake the discovery thread if the peer was already queued, + * because its status changed. 
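+	 * Note that lnet_peer_queue_for_discovery() returns 0 when it has
+	 * queued the peer itself and -EALREADY when the peer was already
+	 * queued, so the branch below only needs to re-arm peers that are
+	 * already sitting on a discovery queue.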
+ */ + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) { + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + /* Drop refcount from lookup */ + lnet_peer_decref_locked(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Clear the discovery error state, unless we're already discovering + * this peer, in which case the error is current. + */ +static void lnet_peer_clear_discovery_error(struct lnet_peer *lp) +{ + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_dc_error = 0; + spin_unlock(&lp->lp_lock); +} + +/* + * Peer discovery slow path. The ln_api_mutex is held on entry, and + * dropped/retaken within this function. An lnet_peer_ni is passed in + * because discovery could tear down an lnet_peer. + */ +int +lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) +{ + DEFINE_WAIT(wait); + struct lnet_peer *lp; + int rc = 0; + int count = 0; + +again: + lnet_net_unlock(cpt); + lnet_net_lock(LNET_LOCK_EX); + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_clear_discovery_error(lp); + + /* + * We're willing to be interrupted. The lpni can become a + * zombie if we race with DLC, so we must check for that. + */ + for (;;) { + /* Keep lp alive when the lnet_net_lock is unlocked */ + lnet_peer_addref_locked(lp); + prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + break; + /* + * Don't repeat discovery if discovery is disabled. This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; + if (lp->lp_dc_error) + break; + if (lnet_peer_is_uptodate(lp)) + break; + lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); + + /* + * If caller requested a non-blocking operation then + * return immediately. Once discovery is complete any + * pending messages that were stopped due to discovery + * will be transmitted. + */ + if (!block) + break; + + lnet_net_unlock(LNET_LOCK_EX); + schedule(); + finish_wait(&lp->lp_dc_waitq, &wait); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); + /* Peer may have changed */ + lp = lpni->lpni_peer_net->lpn_peer; + } + finish_wait(&lp->lp_dc_waitq, &wait); + + lnet_net_unlock(LNET_LOCK_EX); + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + /* + * The peer may have changed, so re-check and rediscover if that turns + * out to have been the case. The reference count on lp ensured that + * even if it was unlinked from lpni the memory could not be recycled. + * Thus the check below is sufficient to determine whether the peer + * changed. If the peer changed, then lp must not be dereferenced. + */ + if (lp != lpni->lpni_peer_net->lpn_peer) + goto again; + + if (signal_pending(current)) + rc = -EINTR; + else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + rc = -ESHUTDOWN; + else if (lp->lp_dc_error) + rc = lp->lp_dc_error; + else if (!block) + CDEBUG(D_NET, "non-blocking discovery\n"); + else if (!lnet_peer_is_uptodate(lp) && !lnet_is_discovery_disabled(lp)) + goto again; + + CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", + (lp ? libcfs_nid2str(lp->lp_primary_nid) : "(none)"), + libcfs_nid2str(lpni->lpni_nid), rc, + (!block) ? 
"pending discovery" : "discovery complete"); + + return rc; +} + +/* Handle an incoming ack for a push. */ +static void +lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_push_error = ev->status; + if (ev->status) + lp->lp_state |= LNET_PEER_PUSH_FAILED; + else + lp->lp_node_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + spin_unlock(&lp->lp_lock); + + CDEBUG(D_NET, "peer %s ev->status %d\n", + libcfs_nid2str(lp->lp_primary_nid), ev->status); +} + +/* Handle a Reply message. This is the reply to a Ping message. */ +static void +lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + int rc; + + spin_lock(&lp->lp_lock); + + lp->lp_disc_src_nid = ev->target.nid; + + /* + * If some kind of error happened the contents of message + * cannot be used. Set PING_FAILED to trigger a retry. + */ + if (ev->status) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + CDEBUG(D_NET, "Ping Reply error %d from %s (source %s)\n", + ev->status, + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(ev->source.nid)); + goto out; + } + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + /* + * A reply with invalid or corrupted info. Set PING_FAILED to + * trigger a retry. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Corrupted Ping Reply from %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + goto out; + } + + /* The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) && + !lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the reply. If the peer + * was configured with DLC then the setting should match what + * DLC put in. 
+ */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Reply says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("DLC says %s is Multi-Rail, Reply says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else { + CERROR("Multi-Rail state vanished from %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_MULTI_RAIL; + } + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * Check for truncation of the Reply. Clear PING_SENT and set + * PING_FAILED to trigger a retry. + */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Truncated Reply from %s (%d nids)\n", + libcfs_nid2str(lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* + * Check the sequence numbers in the reply. These are only + * available if the reply came from a Multi-Rail peer. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && + pbuf->pb_info.pi_nnis > 1 && + lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid) { + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + lp->lp_peer_seqno); + + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } + + /* We're happy with the state of the data in the buffer. */ + CDEBUG(D_NET, "peer %s data present %u\n", + libcfs_nid2str(lp->lp_primary_nid), lp->lp_peer_seqno); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + lnet_ping_buffer_decref(lp->lp_data); + else + lp->lp_state |= LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_addref(pbuf); + lp->lp_data = pbuf; +out: + lp->lp_state &= ~LNET_PEER_PING_SENT; + spin_unlock(&lp->lp_lock); +} + +/* + * Send event handling. Only matters for error cases, where we clean + * up state on the peer and peer_ni that would otherwise be updated in + * the REPLY event handler for a successful Ping, and the ACK event + * handler for a successful Push. 
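+ *
+ * On failure the SENT flag is cleared, the matching FAILED flag and
+ * error are recorded, and LNET_REDISCOVER_PEER is returned so the
+ * event handler puts the peer back on the request queue.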
+ */ +static int +lnet_discovery_event_send(struct lnet_peer *lp, struct lnet_event *ev) +{ + int rc = 0; + + if (!ev->status) + goto out; + + spin_lock(&lp->lp_lock); + if (ev->msg_type == LNET_MSG_GET) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + } else { /* ev->msg_type == LNET_MSG_PUT */ + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = ev->status; + } + spin_unlock(&lp->lp_lock); + rc = LNET_REDISCOVER_PEER; +out: + CDEBUG(D_NET, "%s Send to %s: %d\n", + (ev->msg_type == LNET_MSG_GET ? "Ping" : "Push"), + libcfs_nid2str(ev->target.nid), rc); + return rc; +} + +/* + * Unlink event handling. This event is only seen if a call to + * LNetMDUnlink() caused the event to be unlinked. If this call was + * made after the event was set up in LNetGet() or LNetPut() then we + * assume the Ping or Push timed out. + */ +static void +lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev) +{ + spin_lock(&lp->lp_lock); + /* We've passed through LNetGet() */ + if (lp->lp_state & LNET_PEER_PING_SENT) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = -ETIMEDOUT; + CDEBUG(D_NET, "Ping Unlink for message to peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + } + /* We've passed through LNetPut() */ + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = -ETIMEDOUT; + CDEBUG(D_NET, "Push Unlink for message to peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + } + spin_unlock(&lp->lp_lock); +} + +/* + * Event handler for the discovery EQ. + * + * Called with lnet_res_lock(cpt) held. The cpt is the + * lnet_cpt_of_cookie() of the md handle cookie. + */ +static void lnet_discovery_event_handler(struct lnet_event *event) +{ + struct lnet_peer *lp = event->md.user_ptr; + struct lnet_ping_buffer *pbuf; + int rc; + + /* discovery needs to take another look */ + rc = LNET_REDISCOVER_PEER; + + CDEBUG(D_NET, "Received event: %d\n", event->type); + + switch (event->type) { + case LNET_EVENT_ACK: + lnet_discovery_event_ack(lp, event); + break; + case LNET_EVENT_REPLY: + lnet_discovery_event_reply(lp, event); + break; + case LNET_EVENT_SEND: + /* Only send failure triggers a retry. */ + rc = lnet_discovery_event_send(lp, event); + break; + case LNET_EVENT_UNLINK: + /* LNetMDUnlink() was called */ + lnet_discovery_event_unlink(lp, event); + break; + default: + /* Invalid events. */ + LBUG(); + } + lnet_net_lock(LNET_LOCK_EX); + if (event->unlinked) { + pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + lnet_ping_buffer_decref(pbuf); + lnet_peer_decref_locked(lp); + } + + /* put peer back at end of request queue, if discovery not already + * done */ + if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) && + lnet_peer_queue_for_discovery(lp)) { + list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Build a peer from incoming data. + * + * The NIDs in the incoming data are supposed to be structured as follows: + * - loopback + * - primary NID + * - other NIDs in same net + * - NIDs in second net + * - NIDs in third net + * - ... + * This due to the way the list of NIDs in the data is created. + * + * Note that this function will mark the peer uptodate unless an + * ENOMEM is encontered. 
All other errors are due to a conflict + * between the DLC configuration and what discovery sees. We treat DLC + * as binding, and therefore set the NIDS_UPTODATE flag to prevent the + * peer from becoming stuck in discovery. + */ +static int lnet_peer_merge_data(struct lnet_peer *lp, + struct lnet_ping_buffer *pbuf) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t *curnis = NULL; + lnet_nid_t *addnis = NULL; + lnet_nid_t *delnis = NULL; + unsigned flags; + int ncurnis; + int naddnis; + int ndelnis; + int nnis = 0; + int i; + int j; + int rc; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + nnis = MAX(lp->lp_nnis, pbuf->pb_info.pi_nnis); + LIBCFS_ALLOC(curnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_ALLOC(addnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_ALLOC(delnis, nnis * sizeof(lnet_nid_t)); + if (!curnis || !addnis || !delnis) { + rc = -ENOMEM; + goto out; + } + ncurnis = 0; + naddnis = 0; + ndelnis = 0; + + /* Construct the list of NIDs present in peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + curnis[ncurnis++] = lpni->lpni_nid; + + /* + * Check for NIDs in pbuf not present in curnis[]. + * The loop starts at 1 to skip the loopback NID. + */ + for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { + for (j = 0; j < ncurnis; j++) + if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) + break; + if (j == ncurnis) + addnis[naddnis++] = pbuf->pb_info.pi_ni[i].ns_nid; + } + /* + * Check for NIDs in curnis[] not present in pbuf. + * The nested loop starts at 1 to skip the loopback NID. + * + * But never add the loopback NID to delnis[]: if it is + * present in curnis[] then this peer is for this node. + */ + for (i = 0; i < ncurnis; i++) { + if (curnis[i] == LNET_NID_LO_0) + continue; + for (j = 1; j < pbuf->pb_info.pi_nnis; j++) + if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) + break; + if (j == pbuf->pb_info.pi_nnis) + delnis[ndelnis++] = curnis[i]; + } + + rc = 0; + if (lnet_is_discovery_disabled(lp)) + goto out; + + for (i = 0; i < naddnis; i++) { + rc = lnet_peer_add_nid(lp, addnis[i], flags); + if (rc) { + CERROR("Error adding NID %s to peer %s: %d\n", + libcfs_nid2str(addnis[i]), + libcfs_nid2str(lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + for (i = 0; i < ndelnis; i++) { + rc = lnet_peer_del_nid(lp, delnis[i], flags); + if (rc) { + CERROR("Error deleting NID %s from peer %s: %d\n", + libcfs_nid2str(delnis[i]), + libcfs_nid2str(lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + /* + * Errors other than -ENOMEM are due to peers having been + * configured with DLC. Ignore these because DLC overrides + * Discovery. + */ + rc = 0; +out: + LIBCFS_FREE(curnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_FREE(addnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_FREE(delnis, nnis * sizeof(lnet_nid_t)); + lnet_ping_buffer_decref(pbuf); + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + + if (rc) { + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + } + return rc; +} + +/* + * The data in pbuf says lp is its primary peer, but the data was + * received by a different peer. Try to update lp with the data. + */ +static int +lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) +{ + struct lnet_handle_md mdh; + + /* Queue lp for discovery, and force it on the request queue. 
*/ + lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_queue_for_discovery(lp)) + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + lnet_net_unlock(LNET_LOCK_EX); + + LNetInvalidateMDHandle(&mdh); + + /* + * Decide whether we can move the peer to the DATA_PRESENT state. + * + * We replace stale data for a multi-rail peer, repair PING_FAILED + * status, and preempt FORCE_PING. + * + * If after that we have DATA_PRESENT, we merge it into this peer. + */ + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_peer_seqno < LNET_PING_BUFFER_SEQNO(pbuf)) { + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } else if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_decref(pbuf); + pbuf = lp->lp_data; + lp->lp_data = NULL; + } + } + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lnet_ping_buffer_decref(lp->lp_data); + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + } + if (lp->lp_state & LNET_PEER_PING_FAILED) { + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + } + if (lp->lp_state & LNET_PEER_FORCE_PING) + lp->lp_state &= ~LNET_PEER_FORCE_PING; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + if (pbuf) + return lnet_peer_merge_data(lp, pbuf); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + return 0; +} + +static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) +{ + int i; + + for (i = 0; i < pinfo->pi_nnis; i++) { + if (pinfo->pi_ni[i].ns_nid == nid) + return true; + } + + return false; +} + +/* + * Update a peer using the data received. + */ +static int lnet_peer_data_present(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer_ni *lpni; + lnet_nid_t nid = LNET_NID_ANY; + unsigned flags; + int rc = 0; + + pbuf = lp->lp_data; + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + /* + * Modifications of peer structures are done while holding the + * ln_api_mutex. A global lock is required because we may be + * modifying multiple peer structures, and a mutex greatly + * simplifies memory management. + * + * The actual changes to the data structures must also protect + * against concurrent lookups, for which the lnet_net_lock in + * LNET_LOCK_EX mode is used. + */ + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + /* + * If this peer is not on the peer list then it is being torn + * down, and our reference count may be all that is keeping it + * alive. Don't do any work on it. + */ + if (list_empty(&lp->lp_peer_list)) + goto out; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * Check whether the primary NID in the message matches the + * primary NID of the peer. If it does, update the peer, if + * it it does not, check whether there is already a peer with + * that primary NID. If no such peer exists, try to update + * the primary NID of the current peer (allowed if it was + * created due to message traffic) and complete the update. + * If the peer did exist, hand off the data to it. 
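+	 * In outline, with nid taken from pi_ni[1] of the ping buffer
+	 * (the loopback case aside, see below):
+	 *
+	 *	primary == nid, or discovery disabled and primary is
+	 *	    listed in the buffer	-> merge the data into lp
+	 *	no peer_ni exists for nid	-> make nid lp's primary, merge
+	 *	otherwise			-> hand the buffer to nid's peer
+	 *					   via lnet_peer_set_primary_data()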
+ * + * The peer for the loopback interface is a special case: this + * is the peer for the local node, and we want to set its + * primary NID to the correct value here. Moreover, this peer + * can show up with only the loopback NID in the ping buffer. + */ + if (pbuf->pb_info.pi_nnis <= 1) + goto out; + nid = pbuf->pb_info.pi_ni[1].ns_nid; + if (lp->lp_primary_nid == LNET_NID_LO_0) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (!rc) + rc = lnet_peer_merge_data(lp, pbuf); + } else if (lp->lp_primary_nid == nid || + (lnet_is_nid_in_ping_info(lp->lp_primary_nid, &pbuf->pb_info) && + lnet_is_discovery_disabled(lp))) { + rc = lnet_peer_merge_data(lp, pbuf); + } else { + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (rc) { + CERROR("Primary NID error %s versus %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(nid), rc); + } else { + rc = lnet_peer_merge_data(lp, pbuf); + } + } else { + struct lnet_peer *new_lp; + + new_lp = lpni->lpni_peer_net->lpn_peer; + /* if lp has discovery/MR enabled that means new_lp + * should have discovery/MR enabled as well, since + * it's the same peer, which we're about to merge + */ + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + rc = lnet_peer_set_primary_data( + lpni->lpni_peer_net->lpn_peer, pbuf); + lnet_peer_ni_decref_locked(lpni); + } + } +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + /* Tell discovery to re-check the peer immediately. */ + if (!rc) + rc = LNET_REDISCOVER_PEER; + return rc; +} + +/* + * A ping failed. Clear the PING_FAILED state and set the + * FORCE_PING state, to ensure a retry even if discovery is + * disabled. This avoids being left with incorrect state. + */ +static int lnet_peer_ping_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_state |= LNET_PEER_FORCE_PING; + rc = lp->lp_ping_error; + lp->lp_ping_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s:%d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Select NID to send a Ping or Push to. + */ +static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni; + + /* Look for a direct-connected NID for this peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + break; + } + if (lpni) + return lpni->lpni_nid; + + /* Look for a routed-connected NID for this peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!lnet_find_rnet_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + break; + } + if (lpni) + return lpni->lpni_nid; + + return LNET_NID_ANY; +} + +/* Active side of ping. 
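+ * Called with lp->lp_lock held and returns with it held; the lock is
+ * dropped around the actual send. PING_SENT is set before sending, and
+ * the reply, send-failure and unlink handlers clear it again.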
*/ +static int lnet_peer_send_ping(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lnet_nid_t pnid; + int nnis; + int rc; + int cpt; + + lp->lp_state |= LNET_PEER_PING_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + pnid = lnet_peer_select_nid(lp); + lnet_net_unlock(cpt); + + nnis = MAX(lp->lp_data_nnis, LNET_INTERFACES_MIN); + + rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, + the_lnet.ln_dc_eqh, false); + + /* + * if LNetMDBind in lnet_send_ping fails we need to decrement the + * refcount on the peer, otherwise LNetMDUnlink will be called + * which will eventually do that. + */ + if (rc > 0) { + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + lnet_net_unlock(cpt); + rc = -rc; /* change the rc to negative value */ + goto fail_error; + } else if (rc < 0) { + goto fail_error; + } + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PING_SENT, but do + * not set either PING_FAILED or FORCE_PING. In fact we need + * to clear PING_FAILED, because the unlink event handler will + * have set it if we called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PING_SENT | LNET_PEER_PING_FAILED); + return rc; +} + +/* + * This function exists because you cannot call LNetMDUnlink() from an + * event handler. + */ +static int lnet_peer_push_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + lp->lp_state &= ~LNET_PEER_PUSH_FAILED; + rc = lp->lp_push_error; + lp->lp_push_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + return 0; +} + +/* Active side of push. */ +static int lnet_peer_send_push(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_process_id id; + struct lnet_md md; + int cpt; + int rc; + + /* Don't push to a non-multi-rail peer. 
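+	 * A push advertises this node's ping buffer (see below) and is
+	 * only useful to a Multi-Rail peer. For anyone else the
+	 * FORCE_PUSH flag is simply dropped, and the peer is marked
+	 * discovered once its NIDs are known to be up to date.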
*/ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + + return 0; + } + + lp->lp_state |= LNET_PEER_PUSH_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + pbuf = the_lnet.ln_ping_target; + lnet_ping_buffer_addref(pbuf); + lnet_net_unlock(cpt); + + /* Push source MD */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 2; /* Put/Ack */ + md.max_size = 0; + md.options = 0; + md.eq_handle = the_lnet.ln_dc_eqh; + md.user_ptr = lp; + + rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_push_mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind push source MD: %d\n", rc); + goto fail_error; + } + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + id.pid = LNET_PID_LUSTRE; + id.nid = lnet_peer_select_nid(lp); + lnet_net_unlock(cpt); + + if (id.nid == LNET_NID_ANY) { + rc = -EHOSTUNREACH; + goto fail_unlink; + } + + rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, + LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, 0); + + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_NID_ANY; + + if (rc) + goto fail_unlink; + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_unlink: + LNetMDUnlink(lp->lp_push_mdh); + LNetInvalidateMDHandle(&lp->lp_push_mdh); +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PUSH_SENT, but do + * not set PUSH_FAILED. In fact we need to clear PUSH_FAILED, + * because the unlink event handler will have set it if we + * called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); + return rc; +} + +/* + * An unrecoverable error was encountered during discovery. + * Set error status in peer and abort discovery. + */ +static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) +{ + CDEBUG(D_NET, "Discovery error %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), error); + + spin_lock(&lp->lp_lock); + lp->lp_dc_error = error; + lp->lp_state &= ~LNET_PEER_DISCOVERING; + lp->lp_state |= LNET_PEER_REDISCOVER; + spin_unlock(&lp->lp_lock); +} + +/* + * Discovering this peer is taking too long. Cancel any Ping or Push + * that discovery is waiting on by unlinking the relevant MDs. The + * lnet_discovery_event_handler() will proceed from here and complete + * the cleanup. 
+ */ +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) +{ + struct lnet_handle_md ping_mdh; + struct lnet_handle_md push_mdh; + + LNetInvalidateMDHandle(&ping_mdh); + LNetInvalidateMDHandle(&push_mdh); + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_PING_SENT) { + ping_mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + } + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + push_mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + } + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(ping_mdh)) + LNetMDUnlink(ping_mdh); + if (!LNetMDHandleIsInvalid(push_mdh)) + LNetMDUnlink(push_mdh); +} + +/* + * Wait for work to be queued or some other change that must be + * attended to. Returns non-zero if the discovery thread should shut + * down. + */ +static int lnet_peer_discovery_wait_for_work(void) +{ + int cpt; + int rc = 0; + + DEFINE_WAIT(wait); + + cpt = lnet_net_lock_current(); + for (;;) { + prepare_to_wait(&the_lnet.ln_dc_waitq, &wait, + TASK_INTERRUPTIBLE); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + if (lnet_push_target_resize_needed()) + break; + if (!list_empty(&the_lnet.ln_dc_request)) + break; + if (!list_empty(&the_lnet.ln_msg_resend)) + break; + lnet_net_unlock(cpt); + + /* + * wakeup max every second to check if there are peers that + * have been stuck on the working queue for greater than + * the peer timeout. + */ + schedule_timeout(cfs_time_seconds(1)); + finish_wait(&the_lnet.ln_dc_waitq, &wait); + cpt = lnet_net_lock_current(); + } + finish_wait(&the_lnet.ln_dc_waitq, &wait); + + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + rc = -ESHUTDOWN; + + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "woken: %d\n", rc); + + return rc; +} + +/* + * Messages that were pending on a destroyed peer will be put on a global + * resend list. The message resend list will be checked by + * the discovery thread when it wakes up, and will resend messages. These + * messages can still be sendable in the case the lpni which was the initial + * cause of the message re-queue was transfered to another peer. + * + * It is possible that LNet could be shutdown while we're iterating + * through the list. lnet_shudown_lndnets() will attempt to access the + * resend list, but will have to wait until the spinlock is released, by + * which time there shouldn't be any more messages on the resend list. + * During shutdown lnet_send() will fail and lnet_finalize() will be called + * for the messages so they can be released. The other case is that + * lnet_shudown_lndnets() can finalize all the messages before this + * function can visit the resend list, in which case this function will be + * a no-op. + */ +static void lnet_resend_msgs(void) +{ + struct lnet_msg *msg, *tmp; + struct list_head resend; + int rc; + + INIT_LIST_HEAD(&resend); + + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } +} + +/* The discovery thread. 
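+ * Main loop: wait for work, resend messages queued on destroyed
+ * peers, resize the push target if needed, then drain ln_dc_request,
+ * moving each peer to ln_dc_working and dispatching on its state
+ * flags. A return of LNET_REDISCOVER_PEER puts the peer straight back
+ * on the request queue; any other error terminates discovery of that
+ * peer. On shutdown the working, expired and request queues are
+ * drained in turn before the thread signals LNET_DC_STATE_SHUTDOWN.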
*/ +static int lnet_peer_discovery(void *arg) +{ + struct lnet_peer *lp; + int rc; + + CDEBUG(D_NET, "started\n"); + cfs_block_allsigs(); + + for (;;) { + if (lnet_peer_discovery_wait_for_work()) + break; + + lnet_resend_msgs(); + + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + + lnet_net_lock(LNET_LOCK_EX); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) { + lnet_net_unlock(LNET_LOCK_EX); + break; + } + + /* + * Process all incoming discovery work requests. When + * discovery must wait on a peer to change state, it + * is added to the tail of the ln_dc_working queue. A + * timestamp keeps track of when the peer was added, + * so we can time out discovery requests that take too + * long. + */ + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_working); + /* + * set the time the peer was put on the dc_working + * queue. It shouldn't remain on the queue + * forever, in case the GET message (for ping) + * doesn't get a REPLY or the PUT message (for + * push) doesn't get an ACK. + */ + lp->lp_last_queued = ktime_get_real_seconds(); + lnet_net_unlock(LNET_LOCK_EX); + + /* + * Select an action depending on the state of + * the peer and whether discovery is disabled. + * The check whether discovery is disabled is + * done after the code that handles processing + * for arrived data, cleanup for failures, and + * forcing a Ping or Push. + */ + spin_lock(&lp->lp_lock); + CDEBUG(D_NET, "peer %s state %#x\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + rc = lnet_peer_data_present(lp); + else if (lp->lp_state & LNET_PEER_PING_FAILED) + rc = lnet_peer_ping_failed(lp); + else if (lp->lp_state & LNET_PEER_PUSH_FAILED) + rc = lnet_peer_push_failed(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PING) + rc = lnet_peer_send_ping(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PUSH) + rc = lnet_peer_send_push(lp); + else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) + rc = lnet_peer_send_ping(lp); + else if (lnet_peer_needs_push(lp)) + rc = lnet_peer_send_push(lp); + else + rc = lnet_peer_discovered(lp); + CDEBUG(D_NET, "peer %s state %#x rc %d\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state, rc); + spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + if (rc == LNET_REDISCOVER_PEER) { + list_move(&lp->lp_dc_list, + &the_lnet.ln_dc_request); + } else if (rc) { + lnet_peer_discovery_error(lp, rc); + } + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lnet_peer_discovery_complete(lp); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + } + + lnet_net_unlock(LNET_LOCK_EX); + } + + CDEBUG(D_NET, "stopping\n"); + /* + * Clean up before telling lnet_peer_discovery_stop() that + * we're done. Use wake_up() below to somewhat reduce the + * size of the thundering herd if there are multiple threads + * waiting on discovery of a single peer. + */ + + /* Queue cleanup 1: stop all pending pings and pushes. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_working)) { + lp = list_first_entry(&the_lnet.ln_dc_working, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); + lnet_net_unlock(LNET_LOCK_EX); + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* Queue cleanup 2: wait for the expired queue to clear. 
*/ + while (!list_empty(&the_lnet.ln_dc_expired)) + schedule_timeout(cfs_time_seconds(1)); + + /* Queue cleanup 3: clear the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + lnet_peer_discovery_error(lp, -ESHUTDOWN); + lnet_peer_discovery_complete(lp); + } + lnet_net_unlock(LNET_LOCK_EX); + + LNetEQFree(the_lnet.ln_dc_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + wake_up(&the_lnet.ln_dc_waitq); + + CDEBUG(D_NET, "stopped\n"); + + return 0; +} + +/* ln_api_mutex is held on entry. */ +int lnet_peer_discovery_start(void) +{ + struct task_struct *task; + int rc; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) + return -EALREADY; + + rc = LNetEQAlloc(0, lnet_discovery_event_handler, &the_lnet.ln_dc_eqh); + if (rc != 0) { + CERROR("Can't allocate discovery EQ: %d\n", rc); + return rc; + } + + the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; + task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start peer discovery thread: %d\n", rc); + + LNetEQFree(the_lnet.ln_dc_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + } + + CDEBUG(D_NET, "discovery start: %d\n", rc); + + return rc; +} + +/* ln_api_mutex is held on entry. */ +void lnet_peer_discovery_stop(void) +{ + if (the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); + the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; + wake_up(&the_lnet.ln_dc_waitq); + + wait_event(the_lnet.ln_dc_waitq, + the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + + LASSERT(list_empty(&the_lnet.ln_dc_request)); + LASSERT(list_empty(&the_lnet.ln_dc_working)); + LASSERT(list_empty(&the_lnet.ln_dc_expired)); + + CDEBUG(D_NET, "discovery stopped\n"); +} + +/* Debugging */ + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; + + cpt = lnet_cpt_of_nid(nid, NULL); + lnet_net_lock(cpt); + + lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); if (IS_ERR(lp)) { lnet_net_unlock(cpt); CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); @@ -1106,6 +3363,8 @@ lnet_debug_peer(lnet_nid_t nid) lnet_net_unlock(cpt); } +/* Gathering information for userspace. */ + int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, char aliveness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, @@ -1169,56 +3428,193 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, return found ? 
0 : -ENOENT; } -int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, - bool *mr, - struct lnet_peer_ni_credit_info __user *peer_ni_info, - struct lnet_ioctl_element_stats __user *peer_ni_stats) +/* ln_api_mutex is held, which keeps the peer list stable */ +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) { - struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_net *lpn = NULL; - struct lnet_peer *lp = NULL; - struct lnet_peer_ni_credit_info ni_info; - struct lnet_ioctl_element_stats ni_stats; + struct lnet_ioctl_element_stats *lpni_stats; + struct lnet_ioctl_element_msg_stats *lpni_msg_stats; + struct lnet_ioctl_peer_ni_hstats *lpni_hstats; + struct lnet_peer_ni_credit_info *lpni_info; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + lnet_nid_t nid; + __u32 size; int rc; - lpni = lnet_get_peer_ni_idx_locked(idx, &lpn, &lp); - - if (!lpni) - return -ENOENT; + lp = lnet_find_peer(cfg->prcfg_prim_nid); - *primary_nid = lp->lp_primary_nid; - *mr = lp->lp_multi_rail; - *nid = lpni->lpni_nid; - snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, "NA"); - if (lnet_isrouter(lpni) || - lnet_peer_aliveness_enabled(lpni)) - snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, - lpni->lpni_alive ? "up" : "down"); - - ni_info.cr_refcount = atomic_read(&lpni->lpni_refcount); - ni_info.cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? - lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; - ni_info.cr_peer_tx_credits = lpni->lpni_txcredits; - ni_info.cr_peer_rtr_credits = lpni->lpni_rtrcredits; - ni_info.cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; - ni_info.cr_peer_min_tx_credits = lpni->lpni_mintxcredits; - ni_info.cr_peer_tx_qnob = lpni->lpni_txqnob; - ni_info.cr_ncpt = lpni->lpni_cpt; - - ni_stats.iel_send_count = atomic_read(&lpni->lpni_stats.send_count); - ni_stats.iel_recv_count = atomic_read(&lpni->lpni_stats.recv_count); - ni_stats.iel_drop_count = atomic_read(&lpni->lpni_stats.drop_count); - - /* If copy_to_user fails */ - rc = -EFAULT; - if (copy_to_user(peer_ni_info, &ni_info, sizeof(ni_info))) - goto copy_failed; + if (!lp) { + rc = -ENOENT; + goto out; + } - if (copy_to_user(peer_ni_stats, &ni_stats, sizeof(ni_stats))) - goto copy_failed; + size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) + + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); + size *= lp->lp_nnis; + if (size > cfg->prcfg_size) { + cfg->prcfg_size = size; + rc = -E2BIG; + goto out_lp_decref; + } + cfg->prcfg_prim_nid = lp->lp_primary_nid; + cfg->prcfg_mr = lnet_peer_is_multi_rail(lp); + cfg->prcfg_cfg_nid = lp->lp_primary_nid; + cfg->prcfg_count = lp->lp_nnis; + cfg->prcfg_size = size; + cfg->prcfg_state = lp->lp_state; + + /* Allocate helper buffers. 
*/ + rc = -ENOMEM; + LIBCFS_ALLOC(lpni_info, sizeof(*lpni_info)); + if (!lpni_info) + goto out_lp_decref; + LIBCFS_ALLOC(lpni_stats, sizeof(*lpni_stats)); + if (!lpni_stats) + goto out_free_info; + LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); + if (!lpni_msg_stats) + goto out_free_stats; + LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); + if (!lpni_hstats) + goto out_free_msg_stats; + + + lpni = NULL; + rc = -EFAULT; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + nid = lpni->lpni_nid; + if (copy_to_user(bulk, &nid, sizeof(nid))) + goto out_free_hstats; + bulk += sizeof(nid); + + memset(lpni_info, 0, sizeof(*lpni_info)); + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, + lpni->lpni_alive ? "up" : "down"); + + lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount); + lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; + lpni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits; + lpni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; + if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) + goto out_free_hstats; + bulk += sizeof(*lpni_info); + + memset(lpni_stats, 0, sizeof(*lpni_stats)); + lpni_stats->iel_send_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_SEND); + lpni_stats->iel_recv_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_RECV); + lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_DROP); + if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_stats); + lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); + if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_msg_stats); + lpni_hstats->hlpni_network_timeout = + atomic_read(&lpni->lpni_hstats.hlt_network_timeout); + lpni_hstats->hlpni_remote_dropped = + atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); + lpni_hstats->hlpni_remote_timeout = + atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); + lpni_hstats->hlpni_remote_error = + atomic_read(&lpni->lpni_hstats.hlt_remote_error); + lpni_hstats->hlpni_health_value = + atomic_read(&lpni->lpni_healthv); + if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) + goto out_free_hstats; + bulk += sizeof(*lpni_hstats); + } rc = 0; -copy_failed: +out_free_hstats: + LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); +out_free_msg_stats: + LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); +out_free_stats: + LIBCFS_FREE(lpni_stats, sizeof(*lpni_stats)); +out_free_info: + LIBCFS_FREE(lpni_info, sizeof(*lpni_info)); +out_lp_decref: + lnet_peer_decref_locked(lp); +out: return rc; } + +void +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni) +{ + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return; + + if (list_empty(&lpni->lpni_recovery) && + atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) { + CDEBUG(D_NET, "lpni %s added to recovery queue. 
Health = %d\n",
+		       libcfs_nid2str(lpni->lpni_nid),
+		       atomic_read(&lpni->lpni_healthv));
+		list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
+		lnet_peer_ni_addref_locked(lpni);
+	}
+}
+
+/* Call with the ln_api_mutex held */
+void
+lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
+{
+	struct lnet_peer_table *ptable;
+	struct lnet_peer *lp;
+	struct lnet_peer_net *lpn;
+	struct lnet_peer_ni *lpni;
+	int lncpt;
+	int cpt;
+
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		return;
+
+	if (!all) {
+		lnet_net_lock(LNET_LOCK_EX);
+		lpni = lnet_find_peer_ni_locked(nid);
+		if (!lpni) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			return;
+		}
+		atomic_set(&lpni->lpni_healthv, value);
+		lnet_peer_ni_add_to_recoveryq_locked(lpni);
+		lnet_peer_ni_decref_locked(lpni);
+		lnet_net_unlock(LNET_LOCK_EX);
+		return;
+	}
+
+	lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
+
+	/*
+	 * Walk all the peers and reset the healthv for each one to the
+	 * maximum value.
+	 */
+	lnet_net_lock(LNET_LOCK_EX);
+	for (cpt = 0; cpt < lncpt; cpt++) {
+		ptable = the_lnet.ln_peer_tables[cpt];
+		list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+			list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) {
+				list_for_each_entry(lpni, &lpn->lpn_peer_nis,
+						    lpni_peer_nis) {
+					atomic_set(&lpni->lpni_healthv, value);
+					lnet_peer_ni_add_to_recoveryq_locked(lpni);
+				}
+			}
+		}
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c
index bd30963a960d1..e2966cf77c561 100644
--- a/drivers/staging/lustrefsx/lnet/lnet/router.c
+++ b/drivers/staging/lustrefsx/lnet/lnet/router.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  *
- * Copyright (c) 2011, 2016, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
* * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -68,9 +68,6 @@ lnet_peer_buffer_credits(struct lnet_net *net) return net->net_tunables.lct_peer_tx_credits; } -/* forward ref's */ -static int lnet_router_checker(void *); - static int check_routers_before_use; module_param(check_routers_before_use, int, 0444); MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); @@ -99,9 +96,9 @@ lnet_peers_start_down(void) void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - cfs_time_t when) + time64_t when) { - if (cfs_time_before(when, lp->lpni_timestamp)) { /* out of date information */ + if (lp->lpni_timestamp > when) { /* out of date information */ CDEBUG(D_NET, "Out of date\n"); return; } @@ -114,7 +111,7 @@ lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, */ spin_lock(&lp->lpni_lock); - lp->lpni_timestamp = when; /* update timestamp */ + lp->lpni_timestamp = when; /* update timestamp */ lp->lpni_ping_deadline = 0; /* disable ping timeout */ if (lp->lpni_alive_count != 0 && /* got old news */ @@ -334,7 +331,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); if (gateway == LNET_NID_ANY || - LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + gateway == LNET_NID_LO_0 || net == LNET_NIDNET(LNET_NID_ANY) || LNET_NETTYP(net) == LOLND || LNET_NIDNET(gateway) == net || @@ -344,6 +341,13 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, if (lnet_islocalnet(net)) /* it's a local network */ return -EEXIST; + if (!lnet_islocalnet(LNET_NIDNET(gateway))) { + CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n", + libcfs_nid2str(gateway), + libcfs_net2str(LNET_NIDNET(gateway))); + return -EHOSTUNREACH; + } + /* Assume net, route, all new */ LIBCFS_ALLOC(route, sizeof(*route)); LIBCFS_ALLOC(rnet, sizeof(*rnet)); @@ -433,8 +437,8 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, if (rnet != rnet2) LIBCFS_FREE(rnet, sizeof(*rnet)); - /* indicate to startup the router checker if configured */ - wake_up(&the_lnet.ln_rc_waitq); + /* kick start the monitor thread to handle the added route */ + wake_up(&the_lnet.ln_mt_waitq); return rc; } @@ -577,29 +581,29 @@ lnet_destroy_routes (void) lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); } -int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg) +int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg) { + struct lnet_rtrbufpool *rbp; int i, rc = -ENOENT, j; if (the_lnet.ln_rtrpools == NULL) return rc; - for (i = 0; i < LNET_NRBPOOLS; i++) { - struct lnet_rtrbufpool *rbp; - lnet_net_lock(LNET_LOCK_EX); - cfs_percpt_for_each(rbp, j, the_lnet.ln_rtrpools) { - if (i++ != idx) - continue; + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + if (i != cpt) + continue; - pool_cfg->pl_pools[i].pl_npages = rbp[i].rbp_npages; - pool_cfg->pl_pools[i].pl_nbuffers = rbp[i].rbp_nbuffers; - pool_cfg->pl_pools[i].pl_credits = rbp[i].rbp_credits; - pool_cfg->pl_pools[i].pl_mincredits = rbp[i].rbp_mincredits; - rc = 0; - break; + lnet_net_lock(i); + for (j = 0; j < LNET_NRBPOOLS; j++) { + pool_cfg->pl_pools[j].pl_npages = rbp[j].rbp_npages; + pool_cfg->pl_pools[j].pl_nbuffers = rbp[j].rbp_nbuffers; + pool_cfg->pl_pools[j].pl_credits = rbp[j].rbp_credits; + pool_cfg->pl_pools[j].pl_mincredits = rbp[j].rbp_mincredits; } - lnet_net_unlock(LNET_LOCK_EX); + lnet_net_unlock(i); + rc = 0; + break; } 
lnet_net_lock(LNET_LOCK_EX); @@ -650,17 +654,21 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops, } void -lnet_swap_pinginfo(struct lnet_ping_info *info) +lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) { - int i; struct lnet_ni_status *stat; + int nnis; + int i; - __swab32s(&info->pi_magic); - __swab32s(&info->pi_features); - __swab32s(&info->pi_pid); - __swab32s(&info->pi_nnis); - for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { - stat = &info->pi_ni[i]; + __swab32s(&pbuf->pb_info.pi_magic); + __swab32s(&pbuf->pb_info.pi_features); + __swab32s(&pbuf->pb_info.pi_pid); + __swab32s(&pbuf->pb_info.pi_nnis); + nnis = pbuf->pb_info.pi_nnis; + if (nnis > pbuf->pb_nnis) + nnis = pbuf->pb_nnis; + for (i = 0; i < nnis; i++) { + stat = &pbuf->pb_info.pi_ni[i]; __swab64s(&stat->ns_nid); __swab32s(&stat->ns_status); } @@ -674,11 +682,12 @@ lnet_swap_pinginfo(struct lnet_ping_info *info) static void lnet_parse_rc_info(struct lnet_rc_data *rcd) { - struct lnet_ping_info *info = rcd->rcd_pinginfo; + struct lnet_ping_buffer *pbuf = rcd->rcd_pingbuffer; struct lnet_peer_ni *gw = rcd->rcd_gateway; struct lnet_route *rte; + int nnis; - if (!gw->lpni_alive) + if (!gw->lpni_alive || !pbuf) return; /* @@ -687,29 +696,29 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) */ spin_lock(&gw->lpni_lock); - if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) - lnet_swap_pinginfo(info); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); /* NB always racing with network! */ - if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { CDEBUG(D_NET, "%s: Unexpected magic %08x\n", - libcfs_nid2str(gw->lpni_nid), info->pi_magic); + libcfs_nid2str(gw->lpni_nid), pbuf->pb_info.pi_magic); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - spin_unlock(&gw->lpni_lock); - return; + goto out; } - gw->lpni_ping_feats = info->pi_features; - if ((gw->lpni_ping_feats & LNET_PING_FEAT_MASK) == 0) { - CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", - libcfs_nid2str(gw->lpni_nid), gw->lpni_ping_feats); - spin_unlock(&gw->lpni_lock); - return; /* nothing I can understand */ - } + gw->lpni_ping_feats = pbuf->pb_info.pi_features; + + /* Without NI status info there's nothing more to do. */ + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + goto out; - if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) { - spin_unlock(&gw->lpni_lock); - return; /* can't carry NI status info */ + /* Determine the number of NIs for which there is data. */ + nnis = pbuf->pb_info.pi_nnis; + if (pbuf->pb_nnis < nnis) { + if (rcd->rcd_nnis < nnis) + rcd->rcd_nnis = nnis; + nnis = pbuf->pb_nnis; } list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { @@ -717,24 +726,24 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) int up = 0; int i; + /* If routing disabled then the route is down. 
*/ if ((gw->lpni_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) { rte->lr_downis = 1; continue; } - for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { - struct lnet_ni_status *stat = &info->pi_ni[i]; + for (i = 0; i < nnis; i++) { + struct lnet_ni_status *stat = &pbuf->pb_info.pi_ni[i]; lnet_nid_t nid = stat->ns_nid; if (nid == LNET_NID_ANY) { CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", libcfs_nid2str(gw->lpni_nid)); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - spin_unlock(&gw->lpni_lock); - return; + goto out; } - if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + if (nid == LNET_NID_LO_0) continue; if (stat->ns_status == LNET_NI_STATUS_DOWN) { @@ -753,8 +762,7 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", libcfs_nid2str(gw->lpni_nid), stat->ns_status); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - spin_unlock(&gw->lpni_lock); - return; + goto out; } if (up) { /* ignore downed NIs if NI for dest network is up */ @@ -768,7 +776,7 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) rte->lr_downis = down; } - +out: spin_unlock(&gw->lpni_lock); } @@ -812,7 +820,7 @@ lnet_router_checker_event(struct lnet_event *event) * we ping alive routers to try to detect router death before * apps get burned). */ - lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + lnet_notify_locked(lp, 1, !event->status, ktime_get_seconds()); /* The router checker will wake up very shortly and do the * actual notification. * XXX If 'lp' stops being a router before then, it will still @@ -832,8 +840,9 @@ lnet_wait_known_routerstate(void) struct list_head *entry; int all_known; - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + /* the_lnet.ln_api_mutex must be locked */ for (;;) { int cpt = lnet_net_lock_current(); @@ -857,8 +866,10 @@ lnet_wait_known_routerstate(void) if (all_known) return; + mutex_unlock(&the_lnet.ln_api_mutex); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); + mutex_lock(&the_lnet.ln_api_mutex); } } @@ -881,15 +892,15 @@ static void lnet_update_ni_status_locked(void) { struct lnet_ni *ni = NULL; - long now; - int timeout; + time64_t now; + time64_t timeout; LASSERT(the_lnet.ln_routing); timeout = router_ping_timeout + MAX(live_router_check_interval, dead_router_check_interval); - now = cfs_time_current_sec(); + now = ktime_get_real_seconds(); while ((ni = lnet_get_next_ni_locked(NULL, ni))) { if (ni->ni_net->net_lnd->lnd_type == LOLND) continue; @@ -907,7 +918,7 @@ lnet_update_ni_status_locked(void) LASSERT(ni->ni_status != NULL); if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { - CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + CDEBUG(D_NET, "NI(%s:%lld) status changed to down\n", libcfs_nid2str(ni->ni_nid), timeout); /* NB: so far, this is the only place to set * NI status to "down" */ @@ -932,43 +943,62 @@ lnet_destroy_rc_data(struct lnet_rc_data *rcd) lnet_net_unlock(cpt); } - if (rcd->rcd_pinginfo != NULL) - LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + if (rcd->rcd_pingbuffer != NULL) + lnet_ping_buffer_decref(rcd->rcd_pingbuffer); LIBCFS_FREE(rcd, sizeof(*rcd)); } static struct lnet_rc_data * -lnet_create_rc_data_locked(struct lnet_peer_ni *gateway) +lnet_update_rc_data_locked(struct lnet_peer_ni *gateway) { - struct lnet_rc_data *rcd = NULL; - struct lnet_ping_info *pi; - int rc; - int i; + struct lnet_handle_md mdh; + struct lnet_rc_data *rcd; + struct lnet_ping_buffer *pbuf = NULL; + int nnis = 
LNET_INTERFACES_MIN; + int rc; + int i; + + rcd = gateway->lpni_rcd; + if (rcd) { + nnis = rcd->rcd_nnis; + mdh = rcd->rcd_mdh; + LNetInvalidateMDHandle(&rcd->rcd_mdh); + pbuf = rcd->rcd_pingbuffer; + rcd->rcd_pingbuffer = NULL; + } else { + LNetInvalidateMDHandle(&mdh); + } lnet_net_unlock(gateway->lpni_cpt); - LIBCFS_ALLOC(rcd, sizeof(*rcd)); - if (rcd == NULL) - goto out; + if (rcd) { + LNetMDUnlink(mdh); + lnet_ping_buffer_decref(pbuf); + } else { + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; - LNetInvalidateMDHandle(&rcd->rcd_mdh); - INIT_LIST_HEAD(&rcd->rcd_list); + LNetInvalidateMDHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + rcd->rcd_nnis = nnis; + } - LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); - if (pi == NULL) + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) goto out; - for (i = 0; i < LNET_MAX_RTR_NIS; i++) { - pi->pi_ni[i].ns_nid = LNET_NID_ANY; - pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + for (i = 0; i < nnis; i++) { + pbuf->pb_info.pi_ni[i].ns_nid = LNET_NID_ANY; + pbuf->pb_info.pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; } - rcd->rcd_pinginfo = pi; + rcd->rcd_pingbuffer = pbuf; LASSERT(!LNetEQHandleIsInvalid(the_lnet.ln_rc_eqh)); - rc = LNetMDBind((struct lnet_md){.start = pi, + rc = LNetMDBind((struct lnet_md){.start = &pbuf->pb_info, .user_ptr = rcd, - .length = LNET_PINGINFO_SIZE, + .length = LNET_PING_INFO_SIZE(nnis), .threshold = LNET_MD_THRESH_INF, .options = LNET_MD_TRUNCATE, .eq_handle = the_lnet.ln_rc_eqh}, @@ -976,33 +1006,37 @@ lnet_create_rc_data_locked(struct lnet_peer_ni *gateway) &rcd->rcd_mdh); if (rc < 0) { CERROR("Can't bind MD: %d\n", rc); - goto out; + goto out_ping_buffer_decref; } LASSERT(rc == 0); lnet_net_lock(gateway->lpni_cpt); - /* router table changed or someone has created rcd for this gateway */ - if (!lnet_isrouter(gateway) || gateway->lpni_rcd != NULL) { - lnet_net_unlock(gateway->lpni_cpt); - goto out; + /* Check if this is still a router. */ + if (!lnet_isrouter(gateway)) + goto out_unlock; + /* Check if someone else installed router data. */ + if (gateway->lpni_rcd && gateway->lpni_rcd != rcd) + goto out_unlock; + + /* Install and/or update the router data. */ + if (!gateway->lpni_rcd) { + lnet_peer_ni_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lpni_rcd = rcd; } - - lnet_peer_ni_addref_locked(gateway); - rcd->rcd_gateway = gateway; - gateway->lpni_rcd = rcd; gateway->lpni_ping_notsent = 0; return rcd; +out_unlock: + lnet_net_unlock(gateway->lpni_cpt); + rc = LNetMDUnlink(mdh); + LASSERT(rc == 0); +out_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); out: - if (rcd != NULL) { - if (!LNetMDHandleIsInvalid(rcd->rcd_mdh)) { - rc = LNetMDUnlink(rcd->rcd_mdh); - LASSERT(rc == 0); - } + if (rcd && rcd != gateway->lpni_rcd) lnet_destroy_rc_data(rcd); - } - lnet_net_lock(gateway->lpni_cpt); return gateway->lpni_rcd; } @@ -1024,14 +1058,14 @@ static void lnet_ping_router_locked(struct lnet_peer_ni *rtr) { struct lnet_rc_data *rcd = NULL; - cfs_time_t now = cfs_time_current(); - int secs; - struct lnet_ni *ni; + time64_t now = ktime_get_seconds(); + time64_t secs; + struct lnet_ni *ni; lnet_peer_ni_addref_locked(rtr); if (rtr->lpni_ping_deadline != 0 && /* ping timed out? 
*/ - cfs_time_after(now, rtr->lpni_ping_deadline)) + now > rtr->lpni_ping_deadline) lnet_notify_locked(rtr, 1, 0, now); /* Run any outstanding notifications */ @@ -1039,30 +1073,36 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) lnet_ni_notify_locked(ni, rtr); if (!lnet_isrouter(rtr) || - the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { /* router table changed or router checker is shutting down */ lnet_peer_ni_decref_locked(rtr); return; } - rcd = rtr->lpni_rcd != NULL ? - rtr->lpni_rcd : lnet_create_rc_data_locked(rtr); + rcd = rtr->lpni_rcd; + /* + * The response to the router checker ping could've timed out and + * the mdh might've been invalidated, so we need to update it + * again. + */ + if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis || + LNetMDHandleIsInvalid(rcd->rcd_mdh)) + rcd = lnet_update_rc_data_locked(rtr); if (rcd == NULL) return; secs = lnet_router_check_interval(rtr); CDEBUG(D_NET, - "rtr %s %d: deadline %lu ping_notsent %d alive %d " - "alive_count %d lpni_ping_timestamp %lu\n", + "rtr %s %lld: deadline %lld ping_notsent %d alive %d " + "alive_count %d lpni_ping_timestamp %lld\n", libcfs_nid2str(rtr->lpni_nid), secs, rtr->lpni_ping_deadline, rtr->lpni_ping_notsent, rtr->lpni_alive, rtr->lpni_alive_count, rtr->lpni_ping_timestamp); if (secs != 0 && !rtr->lpni_ping_notsent && - cfs_time_after(now, cfs_time_add(rtr->lpni_ping_timestamp, - cfs_time_seconds(secs)))) { + now > rtr->lpni_ping_timestamp + secs) { int rc; struct lnet_process_id id; struct lnet_handle_md mdh; @@ -1077,14 +1117,14 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) mdh = rcd->rcd_mdh; if (rtr->lpni_ping_deadline == 0) { - rtr->lpni_ping_deadline = - cfs_time_shift(router_ping_timeout); + rtr->lpni_ping_deadline = ktime_get_seconds() + + router_ping_timeout; } lnet_net_unlock(rtr->lpni_cpt); rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); + LNET_PROTO_PING_MATCHBITS, 0, false); lnet_net_lock(rtr->lpni_cpt); if (rc != 0) @@ -1095,14 +1135,9 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) return; } -int -lnet_router_checker_start(void) +int lnet_router_pre_mt_start(void) { - int rc; - int eqsz = 0; - struct task_struct *task; - - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + int rc; if (check_routers_before_use && dead_router_check_interval <= 0) { @@ -1112,60 +1147,36 @@ lnet_router_checker_start(void) return -EINVAL; } - sema_init(&the_lnet.ln_rc_signal, 0); - rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); if (rc != 0) { - CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + CERROR("Can't allocate EQ(0): %d\n", rc); return -ENOMEM; } - the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; - task = kthread_run(lnet_router_checker, NULL, "router_checker"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("Can't start router checker thread: %d\n", rc); - /* block until event callback signals exit */ - down(&the_lnet.ln_rc_signal); - rc = LNetEQFree(the_lnet.ln_rc_eqh); - LASSERT(rc == 0); - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - return -ENOMEM; - } + return 0; +} +void lnet_router_post_mt_start(void) +{ if (check_routers_before_use) { /* Note that a helpful side-effect of pinging all known routers * at startup is that it makes them drop stale connections they * may have to a previous instance of me. 
*/ lnet_wait_known_routerstate(); } - - return 0; } void -lnet_router_checker_stop (void) +lnet_router_cleanup(void) { int rc; - if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) - return; - - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; - /* wakeup the RC thread if it's sleeping */ - wake_up(&the_lnet.ln_rc_waitq); - - /* block until event callback signals exit */ - down(&the_lnet.ln_rc_signal); - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); - rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT(rc == 0); return; } -static void +void lnet_prune_rc_data(int wait_unlink) { struct lnet_rc_data *rcd; @@ -1174,7 +1185,7 @@ lnet_prune_rc_data(int wait_unlink) struct list_head head; int i = 2; - if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + if (likely(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING && list_empty(&the_lnet.ln_rcd_deathrow) && list_empty(&the_lnet.ln_rcd_zombie))) return; @@ -1183,7 +1194,7 @@ lnet_prune_rc_data(int wait_unlink) lnet_net_lock(LNET_LOCK_EX); - if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { /* router checker is stopping, prune all */ list_for_each_entry(lp, &the_lnet.ln_routers, lpni_rtr_list) { @@ -1247,18 +1258,13 @@ lnet_prune_rc_data(int wait_unlink) } /* - * This function is called to check if the RC should block indefinitely. - * It's called from lnet_router_checker() as well as being passed to - * wait_event_interruptible() to avoid the lost wake_up problem. - * - * When it's called from wait_event_interruptible() it is necessary to - * also not sleep if the rc state is not running to avoid a deadlock - * when the system is shutting down + * This function is called from the monitor thread to check if there are + * any active routers that need to be checked. 
*/ -static inline bool +inline bool lnet_router_checker_active(void) { - if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) return true; /* Router Checker thread needs to run when routing is enabled in @@ -1266,79 +1272,58 @@ lnet_router_checker_active(void) if (the_lnet.ln_routing) return true; + /* if there are routers that need to be cleaned up then do so */ + if (!list_empty(&the_lnet.ln_rcd_deathrow) || + !list_empty(&the_lnet.ln_rcd_zombie)) + return true; + return !list_empty(&the_lnet.ln_routers) && (live_router_check_interval > 0 || dead_router_check_interval > 0); } -static int -lnet_router_checker(void *arg) +void +lnet_check_routers(void) { struct lnet_peer_ni *rtr; struct list_head *entry; + __u64 version; + int cpt; + int cpt2; - cfs_block_allsigs(); - - while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { - __u64 version; - int cpt; - int cpt2; - - cpt = lnet_net_lock_current(); + cpt = lnet_net_lock_current(); rescan: - version = the_lnet.ln_routers_version; - - list_for_each(entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, struct lnet_peer_ni, - lpni_rtr_list); - - cpt2 = rtr->lpni_cpt; - if (cpt != cpt2) { - lnet_net_unlock(cpt); - cpt = cpt2; - lnet_net_lock(cpt); - /* the routers list has changed */ - if (version != the_lnet.ln_routers_version) - goto rescan; - } + version = the_lnet.ln_routers_version; - lnet_ping_router_locked(rtr); + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); - /* NB dropped lock */ - if (version != the_lnet.ln_routers_version) { - /* the routers list has changed */ + cpt2 = rtr->lpni_cpt; + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) goto rescan; - } } - if (the_lnet.ln_routing) - lnet_update_ni_status_locked(); - - lnet_net_unlock(cpt); - - lnet_prune_rc_data(0); /* don't wait for UNLINK */ + lnet_ping_router_locked(rtr); - /* Call schedule_timeout() here always adds 1 to load average - * because kernel counts # active tasks as nr_running - * + nr_uninterruptible. */ - /* if there are any routes then wakeup every second. 
If - * there are no routes then sleep indefinitely until woken - * up by a user adding a route */ - if (!lnet_router_checker_active()) - wait_event_interruptible(the_lnet.ln_rc_waitq, - lnet_router_checker_active()); - else - wait_event_interruptible_timeout(the_lnet.ln_rc_waitq, - false, - cfs_time_seconds(1)); + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } } - lnet_prune_rc_data(1); /* wait for UNLINK */ + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - up(&the_lnet.ln_rc_signal); - /* The unlink event callback will signal final completion */ - return 0; + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ } void @@ -1741,7 +1726,8 @@ lnet_rtrpools_enable(void) lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_routing = 1; - the_lnet.ln_ping_info->pi_features &= ~LNET_PING_FEAT_RTE_DISABLED; + the_lnet.ln_ping_target->pb_info.pi_features &= + ~LNET_PING_FEAT_RTE_DISABLED; lnet_net_unlock(LNET_LOCK_EX); return rc; @@ -1755,7 +1741,8 @@ lnet_rtrpools_disable(void) lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_routing = 0; - the_lnet.ln_ping_info->pi_features |= LNET_PING_FEAT_RTE_DISABLED; + the_lnet.ln_ping_target->pb_info.pi_features |= + LNET_PING_FEAT_RTE_DISABLED; tiny_router_buffers = 0; small_router_buffers = 0; @@ -1765,10 +1752,10 @@ lnet_rtrpools_disable(void) } int -lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when) { struct lnet_peer_ni *lp = NULL; - cfs_time_t now = cfs_time_current(); + time64_t now = ktime_get_seconds(); int cpt = lnet_cpt_of_nid(nid, ni); LASSERT (!in_interrupt ()); @@ -1787,12 +1774,11 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) } /* can't do predictions... */ - if (cfs_time_after(when, now)) { + if (when > now) { CWARN("Ignoring prediction from %s of %s %s " - "%ld seconds in the future\n", + "%lld seconds in the future\n", (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), alive ? "up" : "down", - cfs_duration_sec(cfs_time_sub(when, now))); + libcfs_nid2str(nid), alive ? "up" : "down", when - now); return -EINVAL; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c index b7d513521b433..2e60609ee229d 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -21,14 +21,15 @@ */ #define DEBUG_SUBSYSTEM S_LNET + +#include + #include #include /* This is really lnet_proc.c. You might need to update sanity test 215 * if any file format is changed. 
*/ -static struct ctl_table_header *lnet_table_header = NULL; - #define LNET_LOFFT_BITS (sizeof(loff_t) * 8) /* * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system @@ -81,6 +82,7 @@ static int __proc_lnet_stats(void *data, int write, { int rc; struct lnet_counters *ctrs; + struct lnet_counters_common common; int len; char *tmpstr; const int tmpsiz = 256; /* 7 %u and 4 __u64 */ @@ -103,16 +105,17 @@ static int __proc_lnet_stats(void *data, int write, } lnet_counters_get(ctrs); + common = ctrs->lct_common; len = snprintf(tmpstr, tmpsiz, "%u %u %u %u %u %u %u %llu %llu " "%llu %llu", - ctrs->msgs_alloc, ctrs->msgs_max, - ctrs->errors, - ctrs->send_count, ctrs->recv_count, - ctrs->route_count, ctrs->drop_count, - ctrs->send_length, ctrs->recv_length, - ctrs->route_length, ctrs->drop_length); + common.lcc_msgs_alloc, common.lcc_msgs_max, + common.lcc_errors, + common.lcc_send_count, common.lcc_recv_count, + common.lcc_route_count, common.lcc_drop_count, + common.lcc_send_length, common.lcc_recv_length, + common.lcc_route_length, common.lcc_drop_length); if (pos >= min_t(int, len, strlen(tmpstr))) rc = 0; @@ -244,14 +247,9 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { -#else - memcpy(buffer, tmpstr, len); - { -#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -335,15 +333,14 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lpni_nid; - cfs_time_t now = cfs_time_current(); - cfs_time_t deadline = peer->lpni_ping_deadline; + time64_t now = ktime_get_seconds(); + time64_t deadline = peer->lpni_ping_deadline; int nrefs = atomic_read(&peer->lpni_refcount); int nrtrrefs = peer->lpni_rtr_refcount; int alive_cnt = peer->lpni_alive_count; int alive = peer->lpni_alive; int pingsent = !peer->lpni_ping_notsent; - int last_ping = cfs_duration_sec(cfs_time_sub(now, - peer->lpni_ping_timestamp)); + time64_t last_ping = now - peer->lpni_ping_timestamp; int down_ni = 0; struct lnet_route *rtr; @@ -362,18 +359,18 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (deadline == 0) s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + "%-4d %7d %9d %6s %12llu %9d %8s %7d %s\n", nrefs, nrtrrefs, alive_cnt, alive ? "up" : "down", last_ping, pingsent, "NA", down_ni, libcfs_nid2str(nid)); else s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + "%-4d %7d %9d %6s %12llu %9d %8llu %7d %s\n", nrefs, nrtrrefs, alive_cnt, alive ? 
"up" : "down", last_ping, pingsent, - cfs_duration_sec(cfs_time_sub(deadline, now)), + deadline - now, down_ni, libcfs_nid2str(nid)); LASSERT(tmpstr + tmpsiz - s > 0); } @@ -386,14 +383,9 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { -#else - memcpy(buffer, tmpstr, len); - { -#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -531,7 +523,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lpni_nid; int nrefs = atomic_read(&peer->lpni_refcount); - int lastalive = -1; + time64_t lastalive = -1; char *aliveness = "NA"; int maxcr = (peer->lpni_net) ? peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; @@ -546,11 +538,9 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, aliveness = peer->lpni_alive ? "up" : "down"; if (lnet_peer_aliveness_enabled(peer)) { - cfs_time_t now = cfs_time_current(); - cfs_duration_t delta; + time64_t now = ktime_get_seconds(); - delta = cfs_time_sub(now, peer->lpni_last_alive); - lastalive = cfs_duration_sec(delta); + lastalive = now - peer->lpni_last_alive; /* No need to mess up peers contents with * arbitrarily long integers - it suffices to @@ -563,7 +553,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, lnet_net_unlock(cpt); s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + "%-24s %4d %5s %5lld %5d %5d %5d %5d %5d %d\n", libcfs_nid2str(nid), nrefs, aliveness, lastalive, maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); @@ -587,13 +577,9 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else -#else - memcpy(buffer, tmpstr, len); -#endif *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); } @@ -741,12 +727,12 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, ni = lnet_get_ni_idx_locked(skip); if (ni != NULL) { - struct lnet_tx_queue *tq; - char *stat; - long now = cfs_time_current_sec(); - int last_alive = -1; - int i; - int j; + struct lnet_tx_queue *tq; + char *stat; + time64_t now = ktime_get_real_seconds(); + time64_t last_alive = -1; + int i; + int j; if (the_lnet.ln_routing) last_alive = now - ni->ni_last_alive; @@ -777,7 +763,7 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, lnet_net_lock(i); s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + "%-24s %6s %5lld %4d %4d %4d %5d %5d %5d\n", libcfs_nid2str(ni->ni_nid), stat, last_alive, *ni->ni_refs[i], ni->ni_net->net_tunables.lct_peer_tx_credits, @@ -798,14 +784,9 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ - -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else -#else - memcpy(buffer, tmpstr, len); -#endif *ppos += 1; } @@ -974,34 +955,12 @@ static struct ctl_table lnet_table[] = { { .procname = NULL } }; -static struct ctl_table top_table[] = { - { - INIT_CTL_NAME - .procname = "lnet", - .mode = 0555, - 
.data = NULL, - .maxlen = 0, - .child = lnet_table, - }, - { .procname = NULL } -}; - -void -lnet_proc_init(void) +void lnet_router_debugfs_init(void) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header == NULL) - lnet_table_header = register_sysctl_table(top_table); -#endif + lnet_insert_debugfs(lnet_table); } -void -lnet_proc_fini(void) +void lnet_router_debugfs_fini(void) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header != NULL) - unregister_sysctl_table(lnet_table_header); - - lnet_table_header = NULL; -#endif + lnet_remove_debugfs(lnet_table); } diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c index 512dbb5b8a2f1..a03f6078c0589 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,10 +49,10 @@ MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by d #define BRW_MSIZE sizeof(__u64) static void -brw_client_fini (sfw_test_instance_t *tsi) +brw_client_fini(struct sfw_test_instance *tsi) { - srpc_bulk_t *bulk; - sfw_test_unit_t *tsu; + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; LASSERT(tsi->tsi_is_client); @@ -67,22 +67,22 @@ brw_client_fini (sfw_test_instance_t *tsi) } static int -brw_client_init (sfw_test_instance_t *tsi) +brw_client_init(struct sfw_test_instance *tsi) { - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct sfw_session *sn = tsi->tsi_batch->bat_session; int flags; int off; int npg; int len; int opc; - srpc_bulk_t *bulk; - sfw_test_unit_t *tsu; + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; LASSERT(sn != NULL); LASSERT(tsi->tsi_is_client); if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { - test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; opc = breq->blk_opc; flags = breq->blk_flags; @@ -93,7 +93,7 @@ brw_client_init (sfw_test_instance_t *tsi) off = 0; } else { - test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; /* I should never get this step if it's unknown feature * because make_session will reject unknown feature */ @@ -137,7 +137,7 @@ brw_client_init (sfw_test_instance_t *tsi) #define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL #define BRW_MSIZE sizeof(__u64) -int brw_inject_one_error(void) +static int brw_inject_one_error(void) { struct timespec64 ts; @@ -228,7 +228,7 @@ brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) } static void -brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) { int i; struct page *pg; @@ -245,7 +245,7 @@ brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) } static int -brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) { int i; struct page *pg; @@ -268,25 +268,25 @@ brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) } static int -brw_client_prep_rpc(sfw_test_unit_t *tsu, - struct lnet_process_id dest, srpc_client_rpc_t **rpcpp) +brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpcpp) { - srpc_bulk_t *bulk = tsu->tsu_private; - sfw_test_instance_t *tsi = 
tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; - srpc_client_rpc_t *rpc; - srpc_brw_reqst_t *req; - int flags; - int npg; - int len; - int opc; - int rc; + struct srpc_bulk *bulk = tsu->tsu_private; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_client_rpc *rpc; + struct srpc_brw_reqst *req; + int flags; + int npg; + int len; + int opc; + int rc; LASSERT(sn != NULL); LASSERT(bulk != NULL); if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { - test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; opc = breq->blk_opc; flags = breq->blk_flags; @@ -294,8 +294,8 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu, len = npg * PAGE_SIZE; } else { - test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; - int off; + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + int off; /* I should never get this step if it's unknown feature * because make_session will reject unknown feature */ @@ -312,7 +312,7 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu, if (rc != 0) return rc; - memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); + memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); if (opc == LST_BRW_WRITE) brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); else @@ -328,14 +328,14 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu, } static void -brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) { - __u64 magic = BRW_MAGIC; - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; - srpc_msg_t *msg = &rpc->crpc_replymsg; - srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; - srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + __u64 magic = BRW_MAGIC; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_msg *msg = &rpc->crpc_replymsg; + struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; LASSERT(sn != NULL); @@ -376,9 +376,9 @@ brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) } static void -brw_server_rpc_done(srpc_server_rpc_t *rpc) +brw_server_rpc_done(struct srpc_server_rpc *rpc) { - srpc_bulk_t *blk = rpc->srpc_bulk; + struct srpc_bulk *blk = rpc->srpc_bulk; if (blk == NULL) return; @@ -396,12 +396,12 @@ brw_server_rpc_done(srpc_server_rpc_t *rpc) } static int -brw_bulk_ready(srpc_server_rpc_t *rpc, int status) +brw_bulk_ready(struct srpc_server_rpc *rpc, int status) { - __u64 magic = BRW_MAGIC; - srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; - srpc_brw_reqst_t *reqst; - srpc_msg_t *reqstmsg; + __u64 magic = BRW_MAGIC; + struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + struct srpc_brw_reqst *reqst; + struct srpc_msg *reqstmsg; LASSERT (rpc->srpc_bulk != NULL); LASSERT (rpc->srpc_reqstbuf != NULL); @@ -434,13 +434,13 @@ brw_bulk_ready(srpc_server_rpc_t *rpc, int status) static int brw_server_handle(struct srpc_server_rpc *rpc) { - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - srpc_msg_t *replymsg = &rpc->srpc_replymsg; - srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; - srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; - int npg; - int rc; + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *replymsg = 
&rpc->srpc_replymsg; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; LASSERT (sv->sv_id == SRPC_SERVICE_BRW); @@ -505,7 +505,8 @@ brw_server_handle(struct srpc_server_rpc *rpc) return 0; } -sfw_test_client_ops_t brw_test_client; +struct sfw_test_client_ops brw_test_client; + void brw_init_test_client(void) { brw_test_client.tso_init = brw_client_init; @@ -514,10 +515,10 @@ void brw_init_test_client(void) brw_test_client.tso_done_rpc = brw_client_done_rpc; }; -srpc_service_t brw_test_service; +struct srpc_service brw_test_service; + void brw_init_test_service(void) { - brw_test_service.sv_id = SRPC_SERVICE_BRW; brw_test_service.sv_name = "brw_test"; brw_test_service.sv_handler = brw_server_handle; diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c index 9e60d0d671df2..7ce53bbabff32 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conctl.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -43,27 +43,27 @@ static int lst_session_new_ioctl(struct lstio_session_new_args *args) { - char *name; - int rc; - - if (args->lstio_ses_idp == NULL || /* address for output sid */ - args->lstio_ses_key == 0 || /* no key is specified */ - args->lstio_ses_namep == NULL || /* session name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_ses_namep, args->lstio_ses_nmlen)) { - LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } - name[args->lstio_ses_nmlen] = 0; + name[args->lstio_ses_nmlen] = 0; rc = lstcon_session_new(name, args->lstio_ses_key, @@ -79,272 +79,272 @@ lst_session_new_ioctl(struct lstio_session_new_args *args) static int lst_session_end_ioctl(struct lstio_session_end_args *args) { - if (args->lstio_ses_key != console_session.ses_key) - return -EACCES; + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; - return lstcon_session_end(); + return lstcon_session_end(); } static int lst_session_info_ioctl(struct lstio_session_info_args *args) { - /* no checking of key */ - - if (args->lstio_ses_idp == NULL || /* address for ouput sid */ - args->lstio_ses_keyp == NULL || /* address for ouput key */ - args->lstio_ses_featp == NULL || /* address for ouput features */ - args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ - args->lstio_ses_namep == NULL || /* address for ouput name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_session_info(args->lstio_ses_idp, - args->lstio_ses_keyp, + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for 
output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, args->lstio_ses_featp, - args->lstio_ses_ndinfo, - args->lstio_ses_namep, - args->lstio_ses_nmlen); + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); } static int lst_debug_ioctl(struct lstio_debug_args *args) { - char *name = NULL; - int client = 1; - int rc; + char *name = NULL; + int client = 1; + int rc; - if (args->lstio_dbg_key != console_session.ses_key) - return -EACCES; + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; - if (args->lstio_dbg_resultp == NULL) - return -EINVAL; + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; - if (args->lstio_dbg_namep != NULL && /* name of batch/group */ - (args->lstio_dbg_nmlen <= 0 || - args->lstio_dbg_nmlen > LST_NAME_SIZE)) - return -EINVAL; + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; - if (args->lstio_dbg_namep != NULL) { - LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); - if (name == NULL) - return -ENOMEM; + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_dbg_namep, - args->lstio_dbg_nmlen)) { - LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); - return -EFAULT; - } + return -EFAULT; + } - name[args->lstio_dbg_nmlen] = 0; - } + name[args->lstio_dbg_nmlen] = 0; + } - rc = -EINVAL; + rc = -EINVAL; - switch (args->lstio_dbg_type) { - case LST_OPC_SESSION: - rc = lstcon_session_debug(args->lstio_dbg_timeout, - args->lstio_dbg_resultp); - break; + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; - case LST_OPC_BATCHSRV: - client = 0; - /* Fall through */ - case LST_OPC_BATCHCLI: - if (name == NULL) - goto out; + case LST_OPC_BATCHSRV: + client = 0; + /* fallthrough */ + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; - rc = lstcon_batch_debug(args->lstio_dbg_timeout, - name, client, args->lstio_dbg_resultp); - break; + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; - case LST_OPC_GROUP: - if (name == NULL) - goto out; + case LST_OPC_GROUP: + if (name == NULL) + goto out; - rc = lstcon_group_debug(args->lstio_dbg_timeout, - name, args->lstio_dbg_resultp); - break; + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; - case LST_OPC_NODES: - if (args->lstio_dbg_count <= 0 || - args->lstio_dbg_idsp == NULL) - goto out; + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; - rc = lstcon_nodes_debug(args->lstio_dbg_timeout, - args->lstio_dbg_count, - args->lstio_dbg_idsp, - args->lstio_dbg_resultp); - break; + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; - default: - break; - } + default: + break; + } out: - if (name != NULL) - LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + if (name != NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); - return rc; + return rc; } static int lst_group_add_ioctl(struct lstio_group_add_args *args) 
{ - char *name; - int rc; + char *name; + int rc; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL|| + if (args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_add(name); + rc = lstcon_group_add(name); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return rc; + return rc; } static int lst_group_del_ioctl(struct lstio_group_del_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_del(name); + rc = lstcon_group_del(name); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return rc; + return rc; } static int lst_group_update_ioctl(struct lstio_group_update_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_resultp == NULL || - args->lstio_grp_namep == NULL || + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } - - name[args->lstio_grp_nmlen] = 0; - - switch (args->lstio_grp_opc) { - case LST_GROUP_CLEAN: - rc = lstcon_group_clean(name, args->lstio_grp_args); - break; - - case LST_GROUP_REFRESH: - rc = lstcon_group_refresh(name, args->lstio_grp_resultp); - break; - - case LST_GROUP_RMND: - if (args->lstio_grp_count <= 0 || - args->lstio_grp_idsp == NULL) { - rc = -EINVAL; - break; - } - rc = lstcon_nodes_remove(name, args->lstio_grp_count, - 
args->lstio_grp_idsp, - args->lstio_grp_resultp); - break; - - default: - rc = -EINVAL; - break; - } - - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - - return rc; + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; } static int lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) { - unsigned feats; - int rc; - char *name; + unsigned int feats; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_idsp == NULL || /* array of ids */ - args->lstio_grp_count <= 0 || - args->lstio_grp_resultp == NULL || + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || args->lstio_grp_featp == NULL || args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_nodes_add(name, args->lstio_grp_count, + rc = lstcon_nodes_add(name, args->lstio_grp_count, args->lstio_grp_idsp, &feats, args->lstio_grp_resultp); @@ -354,50 +354,50 @@ lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) return -EINVAL; } - return rc; + return rc; } static int lst_group_list_ioctl(struct lstio_group_list_args *args) { if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + return -EACCES; - if (args->lstio_grp_idx < 0 || - args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - return lstcon_group_list(args->lstio_grp_idx, - args->lstio_grp_nmlen, - args->lstio_grp_namep); + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); } static int lst_group_info_ioctl(struct lstio_group_info_args *args) { - char *name; - int ndent; - int index; - int rc; + char *name; + int ndent; + int index; + int rc; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_namep == NULL || + 
args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_grp_entp == NULL && /* output: group entry */ - args->lstio_grp_dentsp == NULL) /* output: node entry */ - return -EINVAL; + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; - if (args->lstio_grp_dentsp != NULL) { /* have node entry */ - if (args->lstio_grp_idxp == NULL || /* node index */ - args->lstio_grp_ndentp == NULL) /* # of node entry */ - return -EINVAL; + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; if (copy_from_user(&ndent, args->lstio_grp_ndentp, sizeof(ndent)) || @@ -415,19 +415,19 @@ lst_group_info_ioctl(struct lstio_group_info_args *args) if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_info(name, args->lstio_grp_entp, - &index, &ndent, args->lstio_grp_dentsp); + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); if (rc != 0) - return rc; + return rc; if (args->lstio_grp_dentsp != NULL && (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || @@ -440,20 +440,20 @@ lst_group_info_ioctl(struct lstio_group_info_args *args) static int lst_batch_add_ioctl(struct lstio_batch_add_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -461,32 +461,32 @@ lst_batch_add_ioctl(struct lstio_batch_add_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_add(name); + rc = lstcon_batch_add(name); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_run_ioctl(struct lstio_batch_run_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, 
args->lstio_bat_nmlen)) { @@ -494,34 +494,34 @@ lst_batch_run_ioctl(struct lstio_batch_run_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_run(name, args->lstio_bat_timeout, - args->lstio_bat_resultp); + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_resultp == NULL || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -529,37 +529,37 @@ lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_stop(name, args->lstio_bat_force, - args->lstio_bat_resultp); + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_query_ioctl(struct lstio_batch_query_args *args) { - char *name; - int rc; + char *name; + int rc; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_resultp == NULL || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_bat_testidx < 0) - return -EINVAL; + if (args->lstio_bat_testidx < 0) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -567,92 +567,92 @@ lst_batch_query_ioctl(struct lstio_batch_query_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_test_batch_query(name, - args->lstio_bat_testidx, - args->lstio_bat_client, - args->lstio_bat_timeout, - args->lstio_bat_resultp); + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_list_ioctl(struct lstio_batch_list_args *args) { - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (args->lstio_bat_idx < 0 || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 
0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_batch_list(args->lstio_bat_idx, - args->lstio_bat_nmlen, - args->lstio_bat_namep); + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); } static int lst_batch_info_ioctl(struct lstio_batch_info_args *args) { - char *name; - int rc; - int index; - int ndent; + char *name; + int rc; + int index; + int ndent; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || /* batch name */ - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_bat_entp == NULL && /* output: batch entry */ - args->lstio_bat_dentsp == NULL) /* output: node entry */ - return -EINVAL; + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; - if (args->lstio_bat_dentsp != NULL) { /* have node entry */ - if (args->lstio_bat_idxp == NULL || /* node index */ - args->lstio_bat_ndentp == NULL) /* # of node entry */ - return -EINVAL; + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; if (copy_from_user(&index, args->lstio_bat_idxp, - sizeof(index)) || + sizeof(index)) || copy_from_user(&ndent, args->lstio_bat_ndentp, - sizeof(ndent))) - return -EFAULT; + sizeof(ndent))) + return -EFAULT; - if (ndent <= 0 || index < 0) - return -EINVAL; - } + if (ndent <= 0 || index < 0) + return -EINVAL; + } - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_info(name, - args->lstio_bat_entp, args->lstio_bat_server, - args->lstio_bat_testidx, &index, &ndent, - args->lstio_bat_dentsp); + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - if (rc != 0) - return rc; + if (rc != 0) + return rc; if (args->lstio_bat_dentsp != NULL && (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || @@ -665,12 +665,12 @@ lst_batch_info_ioctl(struct lstio_batch_info_args *args) static int lst_stat_query_ioctl(struct lstio_stat_args *args) { - int rc; - char *name = NULL; + int rc; + char *name = NULL; - /* TODO: not finished */ - if (args->lstio_sta_key != console_session.ses_key) - return -EACCES; + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; if (args->lstio_sta_resultp == NULL) return -EINVAL; @@ 
-680,9 +680,9 @@ lst_stat_query_ioctl(struct lstio_stat_args *args) return -EINVAL; rc = lstcon_nodes_stat(args->lstio_sta_count, - args->lstio_sta_idsp, - args->lstio_sta_timeout, - args->lstio_sta_resultp); + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); } else if (args->lstio_sta_namep != NULL) { if (args->lstio_sta_nmlen <= 0 || args->lstio_sta_nmlen > LST_NAME_SIZE) @@ -711,12 +711,12 @@ lst_stat_query_ioctl(struct lstio_stat_args *args) static int lst_test_add_ioctl(struct lstio_test_args *args) { - char *batch_name; - char *src_name = NULL; - char *dst_name = NULL; - void *param = NULL; - int ret = 0; - int rc = -ENOMEM; + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; if (args->lstio_tes_resultp == NULL || args->lstio_tes_retp == NULL || @@ -737,12 +737,12 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) args->lstio_tes_span <= 0) return -EINVAL; - /* have parameter, check if parameter length is valid */ - if (args->lstio_tes_param != NULL && - (args->lstio_tes_param_len <= 0 || + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || args->lstio_tes_param_len > - PAGE_SIZE - sizeof(lstcon_test_t))) - return -EINVAL; + PAGE_SIZE - sizeof(struct lstcon_test))) + return -EINVAL; LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); if (batch_name == NULL) @@ -777,17 +777,17 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) goto out; rc = lstcon_test_add(batch_name, - args->lstio_tes_type, - args->lstio_tes_loop, - args->lstio_tes_concur, - args->lstio_tes_dist, args->lstio_tes_span, - src_name, dst_name, param, - args->lstio_tes_param_len, - &ret, args->lstio_tes_resultp); - - if (ret != 0) + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) rc = (copy_to_user(args->lstio_tes_retp, &ret, - sizeof(ret))) ? -EFAULT : 0; + sizeof(ret))) ? 
-EFAULT : 0; out: if (batch_name != NULL) LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); @@ -805,36 +805,40 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) } int -lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) +lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata) { - char *buf; + struct libcfs_ioctl_hdr *hdr = vdata; struct libcfs_ioctl_data *data; - int opc; - int rc; + char *buf = NULL; + int rc = -EINVAL; + int opc; if (cmd != IOC_LIBCFS_LNETST) - return -EINVAL; + goto err; data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); opc = data->ioc_u32[0]; if (data->ioc_plen1 > PAGE_SIZE) - return -EINVAL; + goto err; LIBCFS_ALLOC(buf, data->ioc_plen1); - if (buf == NULL) - return -ENOMEM; + if (buf == NULL) { + rc = -ENOMEM; + goto err; + } /* copy in parameter */ if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { - LIBCFS_FREE(buf, data->ioc_plen1); - return -EFAULT; + rc = -EFAULT; + goto out_free_buf; } mutex_lock(&console_session.ses_mutex); - console_session.ses_laststamp = cfs_time_current_sec(); + console_session.ses_laststamp = ktime_get_real_seconds(); if (console_session.ses_shutdown) { rc = -ESHUTDOWN; @@ -851,7 +855,8 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) goto out; } - memset(&console_session.ses_trans_stat, 0, sizeof(struct lstcon_trans_stat)); + memset(&console_session.ses_trans_stat, 0, + sizeof(struct lstcon_trans_stat)); switch (opc) { case LSTIO_SESSION_NEW: @@ -910,6 +915,7 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) break; default: rc = -EINVAL; + goto out; } if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, @@ -917,8 +923,8 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) rc = -EFAULT; out: mutex_unlock(&console_session.ses_mutex); - +out_free_buf: LIBCFS_FREE(buf, data->ioc_plen1); - - return rc; +err: + return notifier_from_ioctl_errno(rc); } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c index a1ef9ada96804..b39756f724a2a 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -43,13 +43,13 @@ #include "conrpc.h" #include "console.h" -void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, - lstcon_node_t *, struct lstcon_trans_stat *); +void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *, + struct lstcon_node *, struct lstcon_trans_stat *); static void -lstcon_rpc_done(srpc_client_rpc_t *rpc) +lstcon_rpc_done(struct srpc_client_rpc *rpc) { - lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; + struct lstcon_rpc *crpc = rpc->crpc_priv; LASSERT(crpc != NULL && rpc == crpc->crp_rpc); LASSERT(crpc->crp_posted && !crpc->crp_finished); @@ -69,11 +69,11 @@ lstcon_rpc_done(srpc_client_rpc_t *rpc) /* not an orphan RPC */ crpc->crp_finished = 1; - if (crpc->crp_stamp == 0) { + if (crpc->crp_stamp_ns == 0) { /* not aborted */ - LASSERT (crpc->crp_status == 0); + LASSERT(crpc->crp_status == 0); - crpc->crp_stamp = cfs_time_current(); + crpc->crp_stamp_ns = ktime_get_ns(); crpc->crp_status = rpc->crpc_status; } @@ -85,22 +85,19 @@ lstcon_rpc_done(srpc_client_rpc_t *rpc) } static int -lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, - int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) +lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, int embedded, + struct 
lstcon_rpc *crpc) { + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, feats, bulk_npg, bulk_len, lstcon_rpc_done, (void *)crpc); if (crpc->crp_rpc == NULL) return -ENOMEM; - crpc->crp_trans = NULL; crpc->crp_node = nd; - crpc->crp_posted = 0; - crpc->crp_finished = 0; - crpc->crp_unpacked = 0; - crpc->crp_status = 0; - crpc->crp_stamp = 0; crpc->crp_embedded = embedded; INIT_LIST_HEAD(&crpc->crp_link); @@ -110,17 +107,17 @@ lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, } static int -lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, - int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) +lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, struct lstcon_rpc **crpcpp) { - lstcon_rpc_t *crpc = NULL; - int rc; + struct lstcon_rpc *crpc = NULL; + int rc; spin_lock(&console_session.ses_rpc_lock); if (!list_empty(&console_session.ses_rpc_freelist)) { crpc = list_entry(console_session.ses_rpc_freelist.next, - lstcon_rpc_t, crp_link); + struct lstcon_rpc, crp_link); list_del_init(&crpc->crp_link); } @@ -144,10 +141,10 @@ lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, } void -lstcon_rpc_put(lstcon_rpc_t *crpc) +lstcon_rpc_put(struct lstcon_rpc *crpc) { - srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; - int i; + struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk; + int i; LASSERT(list_empty(&crpc->crp_link)); @@ -179,9 +176,9 @@ lstcon_rpc_put(lstcon_rpc_t *crpc) } static void -lstcon_rpc_post(lstcon_rpc_t *crpc) +lstcon_rpc_post(struct lstcon_rpc *crpc) { - lstcon_rpc_trans_t *trans = crpc->crp_trans; + struct lstcon_rpc_trans *trans = crpc->crp_trans; LASSERT (trans != NULL); @@ -232,9 +229,9 @@ lstcon_rpc_trans_name(int transop) int lstcon_rpc_trans_prep(struct list_head *translist, int transop, - lstcon_rpc_trans_t **transpp) + struct lstcon_rpc_trans **transpp) { - lstcon_rpc_trans_t *trans; + struct lstcon_rpc_trans *trans; if (translist != NULL) { list_for_each_entry(trans, translist, tas_link) { @@ -272,18 +269,18 @@ lstcon_rpc_trans_prep(struct list_head *translist, int transop, } void -lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) +lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) { list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); crpc->crp_trans = trans; } void -lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) +lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) { - srpc_client_rpc_t *rpc; - lstcon_rpc_t *crpc; - lstcon_node_t *nd; + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_node *nd; list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { rpc = crpc->crp_rpc; @@ -291,16 +288,16 @@ lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) spin_lock(&rpc->crpc_lock); if (!crpc->crp_posted || /* not posted */ - crpc->crp_stamp != 0) { /* rpc done or aborted already */ - if (crpc->crp_stamp == 0) { - crpc->crp_stamp = cfs_time_current(); + crpc->crp_stamp_ns != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp_ns == 0) { + crpc->crp_stamp_ns = ktime_get_ns(); crpc->crp_status = -EINTR; } spin_unlock(&rpc->crpc_lock); continue; } - crpc->crp_stamp = cfs_time_current(); + crpc->crp_stamp_ns = ktime_get_ns(); crpc->crp_status = error; spin_unlock(&rpc->crpc_lock); @@ -311,16 +308,16 @@ lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) continue; nd = crpc->crp_node; - if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + 
if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) continue; - nd->nd_stamp = crpc->crp_stamp; + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); nd->nd_state = LST_NODE_DOWN; } } static int -lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) +lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) { if (console_session.ses_shutdown && !list_empty(&trans->tas_olink)) /* Not an end session RPC */ @@ -330,10 +327,10 @@ lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) } int -lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) +lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) { - lstcon_rpc_t *crpc; - int rc; + struct lstcon_rpc *crpc; + int rc; if (list_empty(&trans->tas_rpcs_list)) return 0; @@ -381,14 +378,14 @@ lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) } static int -lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) +lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) { - lstcon_node_t *nd = crpc->crp_node; - srpc_client_rpc_t *rpc = crpc->crp_rpc; - srpc_generic_reply_t *rep; + struct lstcon_node *nd = crpc->crp_node; + struct srpc_client_rpc *rpc = crpc->crp_rpc; + struct srpc_generic_reply *rep; - LASSERT (nd != NULL && rpc != NULL); - LASSERT (crpc->crp_stamp != 0); + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp_ns != 0); if (crpc->crp_status != 0) { *msgpp = NULL; @@ -401,11 +398,11 @@ lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) crpc->crp_unpacked = 1; } - if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) - return 0; + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + return 0; - nd->nd_stamp = crpc->crp_stamp; - rep = &(*msgpp)->msg_body.reply; + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + rep = &(*msgpp)->msg_body.reply; if (rep->sid.ses_nid == LNET_NID_ANY) nd->nd_state = LST_NODE_UNKNOWN; @@ -418,11 +415,12 @@ lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) } void -lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) +lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat) { - lstcon_rpc_t *crpc; - srpc_msg_t *rep; - int error; + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + int error; LASSERT(stat != NULL); @@ -431,7 +429,7 @@ lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { lstcon_rpc_stat_total(stat, 1); - LASSERT(crpc->crp_stamp != 0); + LASSERT(crpc->crp_stamp_ns != 0); error = lstcon_rpc_get_reply(crpc, &rep); if (error != 0) { @@ -464,20 +462,20 @@ lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) } int -lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, +lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, struct list_head __user *head_up, lstcon_rpc_readent_func_t readent) { - struct list_head tmp; - struct list_head __user *next; - struct lstcon_rpc_ent *ent; - srpc_generic_reply_t *rep; - lstcon_rpc_t *crpc; - srpc_msg_t *msg; - lstcon_node_t *nd; - cfs_duration_t dur; + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + struct srpc_generic_reply *rep; + struct lstcon_rpc *crpc; + struct srpc_msg *msg; + struct lstcon_node *nd; struct timespec64 ts; - int error; + int error; + s64 dur; LASSERT(head_up != NULL); @@ -495,15 +493,15 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); - LASSERT(crpc->crp_stamp != 0); + 
LASSERT(crpc->crp_stamp_ns != 0); - error = lstcon_rpc_get_reply(crpc, &msg); + error = lstcon_rpc_get_reply(crpc, &msg); - nd = crpc->crp_node; + nd = crpc->crp_node; - dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, - (cfs_time_t)console_session.ses_id.ses_stamp); - jiffies_to_timespec64(dur, &ts); + dur = crpc->crp_stamp_ns - + console_session.ses_id.ses_stamp * NSEC_PER_MSEC; + ts = ns_to_timespec64(dur); if (copy_to_user(&ent->rpe_peer, &nd->nd_id, sizeof(struct lnet_process_id)) || @@ -518,7 +516,7 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, continue; /* RPC is done */ - rep = (srpc_generic_reply_t *)&msg->msg_body.reply; + rep = (struct srpc_generic_reply *)&msg->msg_body.reply; if (copy_to_user(&ent->rpe_sid, &rep->sid, sizeof(rep->sid)) || @@ -538,12 +536,12 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, } void -lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) +lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) { - srpc_client_rpc_t *rpc; - lstcon_rpc_t *crpc; - lstcon_rpc_t *tmp; - int count = 0; + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_rpc *tmp; + int count = 0; list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { rpc = crpc->crp_rpc; @@ -592,12 +590,12 @@ lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) } int -lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, - unsigned feats, lstcon_rpc_t **crpc) +lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int feats, struct lstcon_rpc **crpc) { - srpc_mksn_reqst_t *msrq; - srpc_rmsn_reqst_t *rsrq; - int rc; + struct srpc_mksn_reqst *msrq; + struct srpc_rmsn_reqst *rsrq; + int rc; switch (transop) { case LST_TRANS_SESNEW: @@ -631,10 +629,11 @@ lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, } int -lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) { - srpc_debug_reqst_t *drq; - int rc; + struct srpc_debug_reqst *drq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); if (rc != 0) @@ -649,12 +648,12 @@ lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) } int -lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, - lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) +lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) { - lstcon_batch_t *batch; - srpc_batch_reqst_t *brq; - int rc; + struct lstcon_batch *batch; + struct srpc_batch_reqst *brq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); if (rc != 0) @@ -675,17 +674,18 @@ lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, LASSERT (tsb->tsb_index == 0); - batch = (lstcon_batch_t *)tsb; + batch = (struct lstcon_batch *)tsb; brq->bar_arg = batch->bat_arg; return 0; } int -lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) { - srpc_stat_reqst_t *srq; - int rc; + struct srpc_stat_reqst *srq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); if (rc != 0) @@ -715,15 +715,15 @@ lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) } static int -lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, +lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, int dist, int span, int nkiov, lnet_kiov_t *kiov) { struct lnet_process_id_packed *pid; - 
lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - int start; - int end; - int i = 0; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int start; + int end; + int i = 0; LASSERT (dist >= 1); LASSERT (span >= 1); @@ -769,9 +769,10 @@ lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, } static int -lstcon_pingrpc_prep(struct lst_test_ping_param *param, srpc_test_reqst_t *req) +lstcon_pingrpc_prep(struct lst_test_ping_param *param, + struct srpc_test_reqst *req) { - test_ping_req_t *prq = &req->tsr_u.ping; + struct test_ping_req *prq = &req->tsr_u.ping; prq->png_size = param->png_size; prq->png_flags = param->png_flags; @@ -780,9 +781,10 @@ lstcon_pingrpc_prep(struct lst_test_ping_param *param, srpc_test_reqst_t *req) } static int -lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, srpc_test_reqst_t *req) +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, + struct srpc_test_reqst *req) { - test_bulk_req_t *brq = &req->tsr_u.bulk_v0; + struct test_bulk_req *brq = &req->tsr_u.bulk_v0; brq->blk_opc = param->blk_opc; brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / @@ -794,9 +796,9 @@ lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, srpc_test_reqst_t *req static int lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, - srpc_test_reqst_t *req) + struct srpc_test_reqst *req) { - test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; + struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; brq->blk_opc = param->blk_opc; brq->blk_flags = param->blk_flags; @@ -807,17 +809,17 @@ lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, } int -lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, - lstcon_test_t *test, lstcon_rpc_t **crpc) +lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_test *test, struct lstcon_rpc **crpc) { - lstcon_group_t *sgrp = test->tes_src_grp; - lstcon_group_t *dgrp = test->tes_dst_grp; - srpc_test_reqst_t *trq; - srpc_bulk_t *bulk; - int i; - int npg = 0; - int nob = 0; - int rc = 0; + struct lstcon_group *sgrp = test->tes_src_grp; + struct lstcon_group *dgrp = test->tes_dst_grp; + struct srpc_test_reqst *trq; + struct srpc_bulk *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; if (transop == LST_TRANS_TSBCLIADD) { npg = sfw_id_pages(test->tes_span); @@ -915,11 +917,11 @@ lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, } static int -lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, - lstcon_node_t *nd, srpc_msg_t *reply) +lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, + struct lstcon_node *nd, struct srpc_msg *reply) { - srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; - int status = mksn_rep->mksn_status; + struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; if (status == 0 && (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { @@ -962,15 +964,15 @@ lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, } void -lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, - lstcon_node_t *nd, struct lstcon_trans_stat *stat) +lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, + struct lstcon_node *nd, struct lstcon_trans_stat *stat) { - srpc_rmsn_reply_t *rmsn_rep; - srpc_debug_reply_t *dbg_rep; - srpc_batch_reply_t *bat_rep; - srpc_test_reply_t *test_rep; - srpc_stat_reply_t *stat_rep; - int rc = 0; + struct srpc_rmsn_reply *rmsn_rep; + struct srpc_debug_reply *dbg_rep; + struct srpc_batch_reply *bat_rep; + struct srpc_test_reply *test_rep; 
+ struct srpc_stat_reply *stat_rep; + int rc = 0; switch (trans->tas_opc) { case LST_TRANS_SESNEW: @@ -1085,14 +1087,14 @@ int lstcon_rpc_trans_ndlist(struct list_head *ndlist, struct list_head *translist, int transop, void *arg, lstcon_rpc_cond_func_t condition, - lstcon_rpc_trans_t **transpp) + struct lstcon_rpc_trans **transpp) { - lstcon_rpc_trans_t *trans; - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - lstcon_rpc_t *rpc; - unsigned feats; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + struct lstcon_rpc *rpc; + unsigned int feats; + int rc; /* Creating session RPG for list of nodes */ @@ -1130,14 +1132,16 @@ lstcon_rpc_trans_ndlist(struct list_head *ndlist, case LST_TRANS_TSBCLIADD: case LST_TRANS_TSBSRVADD: rc = lstcon_testrpc_prep(nd, transop, feats, - (lstcon_test_t *)arg, &rpc); + (struct lstcon_test *)arg, + &rpc); break; case LST_TRANS_TSBRUN: case LST_TRANS_TSBSTOP: case LST_TRANS_TSBCLIQRY: case LST_TRANS_TSBSRVQRY: rc = lstcon_batrpc_prep(nd, transop, feats, - (lstcon_tsb_hdr_t *)arg, &rpc); + (struct lstcon_tsb_hdr *)arg, + &rpc); break; case LST_TRANS_STATQRY: rc = lstcon_statrpc_prep(nd, feats, &rpc); @@ -1169,16 +1173,16 @@ lstcon_rpc_trans_ndlist(struct list_head *ndlist, static void lstcon_rpc_pinger(void *arg) { - stt_timer_t *ptimer = (stt_timer_t *)arg; - lstcon_rpc_trans_t *trans; - lstcon_rpc_t *crpc; - srpc_msg_t *rep; - srpc_debug_reqst_t *drq; - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; + struct stt_timer *ptimer = arg; + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + struct srpc_debug_reqst *drq; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; int intv; - int count = 0; - int rc; + int count = 0; + int rc; /* RPC pinger is a special case of transaction, * it's called by timer at 8 seconds interval. 
@@ -1191,8 +1195,8 @@ lstcon_rpc_pinger(void *arg) } if (!console_session.ses_expired && - cfs_time_current_sec() - console_session.ses_laststamp > - (time_t)console_session.ses_timeout) + ktime_get_real_seconds() - console_session.ses_laststamp > + (time64_t)console_session.ses_timeout) console_session.ses_expired = 1; trans = console_session.ses_ping; @@ -1245,12 +1249,13 @@ lstcon_rpc_pinger(void *arg) lstcon_rpc_put(crpc); } - if (nd->nd_state != LST_NODE_ACTIVE) - continue; + if (nd->nd_state != LST_NODE_ACTIVE) + continue; - intv = cfs_duration_sec(jiffies - nd->nd_stamp); + intv = div_u64(ktime_ms_delta(ktime_get(), nd->nd_stamp), + MSEC_PER_SEC); if (intv < nd->nd_timeout / 2) - continue; + continue; rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, trans->tas_features, 0, 0, 1, crpc); @@ -1277,7 +1282,7 @@ lstcon_rpc_pinger(void *arg) CDEBUG(D_NET, "Ping %d nodes in session\n", count); - ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; stt_add_timer(ptimer); mutex_unlock(&console_session.ses_mutex); @@ -1286,8 +1291,8 @@ lstcon_rpc_pinger(void *arg) int lstcon_rpc_pinger_start(void) { - stt_timer_t *ptimer; - int rc; + struct stt_timer *ptimer; + int rc; LASSERT(list_empty(&console_session.ses_rpc_freelist)); LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); @@ -1300,7 +1305,7 @@ lstcon_rpc_pinger_start(void) } ptimer = &console_session.ses_ping_timer; - ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; stt_add_timer(ptimer); @@ -1326,10 +1331,10 @@ lstcon_rpc_pinger_stop(void) void lstcon_rpc_cleanup_wait(void) { - lstcon_rpc_trans_t *trans; - lstcon_rpc_t *crpc; - struct list_head *pacer; - struct list_head zlist; + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct list_head *pacer; + struct list_head zlist; /* Called with hold of global mutex */ @@ -1337,7 +1342,7 @@ lstcon_rpc_cleanup_wait(void) while (!list_empty(&console_session.ses_trans_list)) { list_for_each(pacer, &console_session.ses_trans_list) { - trans = list_entry(pacer, lstcon_rpc_trans_t, + trans = list_entry(pacer, struct lstcon_rpc_trans, tas_link); CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", @@ -1370,10 +1375,10 @@ lstcon_rpc_cleanup_wait(void) spin_unlock(&console_session.ses_rpc_lock); while (!list_empty(&zlist)) { - crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); + crpc = list_entry(zlist.next, struct lstcon_rpc, crp_link); list_del(&crpc->crp_link); - LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); + LIBCFS_FREE(crpc, sizeof(*crpc)); } } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h index fd56e648491ce..51d4ee90e07cc 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -40,7 +40,6 @@ #define __LST_CONRPC_H__ #include -#include #include #include "rpc.h" #include "selftest.h" @@ -58,9 +57,9 @@ struct lstcon_tsb_hdr; struct lstcon_test; struct lstcon_node; -typedef struct lstcon_rpc { +struct lstcon_rpc { struct list_head crp_link; /* chain on rpc transaction */ - srpc_client_rpc_t *crp_rpc; /* client rpc */ + struct srpc_client_rpc *crp_rpc; /* client rpc */ struct lstcon_node *crp_node; /* destination node */ struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ @@ -70,10 +69,10 @@ typedef struct lstcon_rpc { /** RPC is embedded in 
other structure and can't free it */ unsigned int crp_embedded:1; int crp_status; /* console rpc errors */ - cfs_time_t crp_stamp; /* replied time stamp */ -} lstcon_rpc_t; + s64 crp_stamp_ns; /* replied time stamp */ +}; -typedef struct lstcon_rpc_trans { +struct lstcon_rpc_trans { /* link chain on owner list */ struct list_head tas_olink; /* link chain on global list */ @@ -87,7 +86,7 @@ typedef struct lstcon_rpc_trans { wait_queue_head_t tas_waitq; /* wait queue head */ atomic_t tas_remaining; /* # of un-scheduled rpcs */ struct list_head tas_rpcs_list; /* queued requests */ -} lstcon_rpc_trans_t; +}; #define LST_TRANS_PRIVATE 0x1000 @@ -105,36 +104,37 @@ typedef struct lstcon_rpc_trans { #define LST_TRANS_STATQRY 0x21 -typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); -typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, +typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, struct lstcon_rpc_ent __user *); int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, - unsigned version, lstcon_rpc_t **crpc); + unsigned int version, struct lstcon_rpc **crpc); int lstcon_dbgrpc_prep(struct lstcon_node *nd, - unsigned version, lstcon_rpc_t **crpc); + unsigned int version, struct lstcon_rpc **crpc); int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, - struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc); int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, - struct lstcon_test *test, lstcon_rpc_t **crpc); + struct lstcon_test *test, struct lstcon_rpc **crpc); int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, - lstcon_rpc_t **crpc); -void lstcon_rpc_put(lstcon_rpc_t *crpc); + struct lstcon_rpc **crpc); +void lstcon_rpc_put(struct lstcon_rpc *crpc); int lstcon_rpc_trans_prep(struct list_head *translist, - int transop, lstcon_rpc_trans_t **transpp); + int transop, struct lstcon_rpc_trans **transpp); int lstcon_rpc_trans_ndlist(struct list_head *ndlist, struct list_head *translist, int transop, void *arg, lstcon_rpc_cond_func_t condition, - lstcon_rpc_trans_t **transpp); -void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, + struct lstcon_rpc_trans **transpp); +void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, struct lstcon_trans_stat *stat); -int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, +int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, struct list_head __user *head_up, lstcon_rpc_readent_func_t readent); -void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); -void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); -void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); -int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); +void lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error); +void lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans); +void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, + struct lstcon_rpc *req); +int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout); int lstcon_rpc_pinger_start(void); void lstcon_rpc_pinger_stop(void); void lstcon_rpc_cleanup_wait(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c index a9fe8a85a2dd1..1e37454732cd1 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.c +++ 
b/drivers/staging/lustrefsx/lnet/selftest/console.c @@ -36,7 +36,6 @@ * Author: Liang Zhen */ - #include #include #include "console.h" @@ -55,10 +54,10 @@ do { \ (p)->nle_nnode ++; \ } while (0) -lstcon_session_t console_session; +struct lstcon_session console_session; static void -lstcon_node_get(lstcon_node_t *nd) +lstcon_node_get(struct lstcon_node *nd) { LASSERT (nd->nd_ref >= 1); @@ -66,10 +65,11 @@ lstcon_node_get(lstcon_node_t *nd) } static int -lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) +lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, + int create) { - lstcon_ndlink_t *ndl; - unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + struct lstcon_ndlink *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; LASSERT(id.nid != LNET_NID_ANY); @@ -87,20 +87,20 @@ lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) if (!create) return -ENOENT; - LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); - if (*ndpp == NULL) - return -ENOMEM; + LIBCFS_ALLOC(*ndpp, sizeof(**ndpp) + sizeof(*ndl)); + if (*ndpp == NULL) + return -ENOMEM; - ndl = (lstcon_ndlink_t *)(*ndpp + 1); + ndl = (struct lstcon_ndlink *)(*ndpp + 1); - ndl->ndl_node = *ndpp; + ndl->ndl_node = *ndpp; - ndl->ndl_node->nd_ref = 1; - ndl->ndl_node->nd_id = id; - ndl->ndl_node->nd_stamp = cfs_time_current(); - ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; - ndl->ndl_node->nd_timeout = 0; - memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = ktime_get(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(ndl->ndl_node->nd_ping)); /* queued in global hash & list, no refcount is taken by * global hash & list, if caller release his refcount, @@ -112,16 +112,16 @@ lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) } static void -lstcon_node_put(lstcon_node_t *nd) +lstcon_node_put(struct lstcon_node *nd) { - lstcon_ndlink_t *ndl; + struct lstcon_ndlink *ndl; LASSERT(nd->nd_ref > 0); if (--nd->nd_ref > 0) return; - ndl = (lstcon_ndlink_t *)(nd + 1); + ndl = (struct lstcon_ndlink *)(nd + 1); LASSERT(!list_empty(&ndl->ndl_link)); LASSERT(!list_empty(&ndl->ndl_hlink)); @@ -130,17 +130,17 @@ lstcon_node_put(lstcon_node_t *nd) list_del(&ndl->ndl_link); list_del(&ndl->ndl_hlink); - LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + LIBCFS_FREE(nd, sizeof(*nd) + sizeof(*ndl)); } static int lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, - lstcon_ndlink_t **ndlpp, int create) + struct lstcon_ndlink **ndlpp, int create) { - unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - int rc; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int rc; if (id.nid == LNET_NID_ANY) return -EINVAL; @@ -163,7 +163,7 @@ lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, if (rc != 0) return rc; - LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); + LIBCFS_ALLOC(ndl, sizeof(*ndl)); if (ndl == NULL) { lstcon_node_put(nd); return -ENOMEM; @@ -179,7 +179,7 @@ lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, } static void -lstcon_ndlink_release(lstcon_ndlink_t *ndl) +lstcon_ndlink_release(struct lstcon_ndlink *ndl) { LASSERT(list_empty(&ndl->ndl_link)); 
LASSERT(!list_empty(&ndl->ndl_hlink)); @@ -191,12 +191,12 @@ lstcon_ndlink_release(lstcon_ndlink_t *ndl) } static int -lstcon_group_alloc(char *name, lstcon_group_t **grpp) +lstcon_group_alloc(char *name, struct lstcon_group **grpp) { - lstcon_group_t *grp; - int i; + struct lstcon_group *grp; + int i; - LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, + LIBCFS_ALLOC(grp, offsetof(struct lstcon_group, grp_ndl_hash[LST_NODE_HASHSIZE])); if (grp == NULL) return -ENOMEM; @@ -204,7 +204,7 @@ lstcon_group_alloc(char *name, lstcon_group_t **grpp) grp->grp_ref = 1; if (name != NULL) { if (strlen(name) > sizeof(grp->grp_name)-1) { - LIBCFS_FREE(grp, offsetof(lstcon_group_t, + LIBCFS_FREE(grp, offsetof(struct lstcon_group, grp_ndl_hash[LST_NODE_HASHSIZE])); return -E2BIG; } @@ -224,18 +224,19 @@ lstcon_group_alloc(char *name, lstcon_group_t **grpp) } static void -lstcon_group_addref(lstcon_group_t *grp) +lstcon_group_addref(struct lstcon_group *grp) { grp->grp_ref++; } -static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); +static void lstcon_group_ndlink_release(struct lstcon_group *, + struct lstcon_ndlink *); static void -lstcon_group_drain(lstcon_group_t *grp, int keep) +lstcon_group_drain(struct lstcon_group *grp, int keep) { - lstcon_ndlink_t *ndl; - lstcon_ndlink_t *tmp; + struct lstcon_ndlink *ndl; + struct lstcon_ndlink *tmp; list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { if ((ndl->ndl_node->nd_state & keep) == 0) @@ -244,7 +245,7 @@ lstcon_group_drain(lstcon_group_t *grp, int keep) } static void -lstcon_group_decref(lstcon_group_t *grp) +lstcon_group_decref(struct lstcon_group *grp) { int i; @@ -259,14 +260,14 @@ lstcon_group_decref(lstcon_group_t *grp) for (i = 0; i < LST_NODE_HASHSIZE; i++) LASSERT(list_empty(&grp->grp_ndl_hash[i])); - LIBCFS_FREE(grp, offsetof(lstcon_group_t, + LIBCFS_FREE(grp, offsetof(struct lstcon_group, grp_ndl_hash[LST_NODE_HASHSIZE])); } static int -lstcon_group_find(const char *name, lstcon_group_t **grpp) +lstcon_group_find(const char *name, struct lstcon_group **grpp) { - lstcon_group_t *grp; + struct lstcon_group *grp; list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) @@ -281,8 +282,8 @@ lstcon_group_find(const char *name, lstcon_group_t **grpp) } static int -lstcon_group_ndlink_find(lstcon_group_t *grp, struct lnet_process_id id, - lstcon_ndlink_t **ndlpp, int create) +lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) { int rc; @@ -300,7 +301,7 @@ lstcon_group_ndlink_find(lstcon_group_t *grp, struct lnet_process_id id, } static void -lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) +lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) { list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -308,8 +309,8 @@ lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) } static void -lstcon_group_ndlink_move(lstcon_group_t *old, - lstcon_group_t *new, lstcon_ndlink_t *ndl) +lstcon_group_ndlink_move(struct lstcon_group *old, + struct lstcon_group *new, struct lstcon_ndlink *ndl) { unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % LST_NODE_HASHSIZE; @@ -326,21 +327,21 @@ lstcon_group_ndlink_move(lstcon_group_t *old, } static void -lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) +lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) { - lstcon_ndlink_t *ndl; + struct lstcon_ndlink 
*ndl; while (!list_empty(&old->grp_ndl_list)) { ndl = list_entry(old->grp_ndl_list.next, - lstcon_ndlink_t, ndl_link); + struct lstcon_ndlink, ndl_link); lstcon_group_ndlink_move(old, new, ndl); } } static int -lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) +lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) { - lstcon_group_t *grp = (lstcon_group_t *)arg; + struct lstcon_group *grp = arg; switch (transop) { case LST_TRANS_SESNEW: @@ -367,10 +368,10 @@ lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) } static int -lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, +lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, struct lstcon_rpc_ent __user *ent_up) { - srpc_debug_reply_t *rep; + struct srpc_debug_reply *rep; switch (transop) { case LST_TRANS_SESNEW: @@ -396,16 +397,17 @@ lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, } static int -lstcon_group_nodes_add(lstcon_group_t *grp, +lstcon_group_nodes_add(struct lstcon_group *grp, int count, struct lnet_process_id __user *ids_up, - unsigned *featp, struct list_head __user *result_up) + unsigned int *featp, + struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_ndlink_t *ndl; - lstcon_group_t *tmp; - struct lnet_process_id id; - int i; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; rc = lstcon_group_alloc(NULL, &tmp); if (rc != 0) { @@ -463,16 +465,16 @@ lstcon_group_nodes_add(lstcon_group_t *grp, } static int -lstcon_group_nodes_remove(lstcon_group_t *grp, +lstcon_group_nodes_remove(struct lstcon_group *grp, int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_ndlink_t *ndl; - lstcon_group_t *tmp; - struct lnet_process_id id; - int rc; - int i; + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int rc; + int i; /* End session and remove node from the group */ @@ -520,8 +522,8 @@ lstcon_group_nodes_remove(lstcon_group_t *grp, int lstcon_group_add(char *name) { - lstcon_group_t *grp; - int rc; + struct lstcon_group *grp; + int rc; rc = (lstcon_group_find(name, &grp) == 0)? 
-EEXIST: 0; if (rc != 0) { @@ -545,7 +547,7 @@ int lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, unsigned *featp, struct list_head __user *result_up) { - lstcon_group_t *grp; + struct lstcon_group *grp; int rc; LASSERT (count > 0); @@ -575,9 +577,9 @@ lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, int lstcon_group_del(char *name) { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -616,8 +618,8 @@ lstcon_group_del(char *name) int lstcon_group_clean(char *name, int args) { - lstcon_group_t *grp = NULL; - int rc; + struct lstcon_group *grp = NULL; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -650,8 +652,8 @@ lstcon_nodes_remove(char *name, int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - lstcon_group_t *grp = NULL; - int rc; + struct lstcon_group *grp = NULL; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -679,9 +681,9 @@ lstcon_nodes_remove(char *name, int count, int lstcon_group_refresh(char *name, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -721,7 +723,7 @@ lstcon_group_refresh(char *name, struct list_head __user *result_up) int lstcon_group_list(int index, int len, char __user *name_up) { - lstcon_group_t *grp; + struct lstcon_group *grp; LASSERT(index >= 0); LASSERT(name_up != NULL); @@ -740,10 +742,10 @@ static int lstcon_nodes_getent(struct list_head *head, int *index_p, int *count_p, struct lstcon_node_ent __user *dents_up) { - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - int count = 0; - int index = 0; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int count = 0; + int index = 0; LASSERT(index_p != NULL && count_p != NULL); LASSERT(dents_up != NULL); @@ -782,9 +784,9 @@ lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, struct lstcon_node_ent __user *dents_up) { struct lstcon_ndlist_ent *gentp; - lstcon_group_t *grp; - lstcon_ndlink_t *ndl; - int rc; + struct lstcon_group *grp; + struct lstcon_ndlink *ndl; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -824,9 +826,9 @@ lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, } static int -lstcon_batch_find(const char *name, lstcon_batch_t **batpp) +lstcon_batch_find(const char *name, struct lstcon_batch **batpp) { - lstcon_batch_t *bat; + struct lstcon_batch *bat; list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { @@ -841,9 +843,9 @@ lstcon_batch_find(const char *name, lstcon_batch_t **batpp) int lstcon_batch_add(char *name) { - lstcon_batch_t *bat; - int i; - int rc; + struct lstcon_batch *bat; + int i; + int rc; rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; if (rc != 0) { @@ -851,17 +853,17 @@ lstcon_batch_add(char *name) return rc; } - LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); + LIBCFS_ALLOC(bat, sizeof(*bat)); if (bat == NULL) { CERROR("Can't allocate descriptor for batch %s\n", name); return -ENOMEM; } - LIBCFS_ALLOC(bat->bat_cli_hash, + LIBCFS_ALLOC(bat->bat_cli_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); - if (bat->bat_cli_hash == NULL) { - CERROR("Can't allocate hash for batch %s\n", name); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(*bat)); return -ENOMEM; } @@ -871,7 +873,7 @@ lstcon_batch_add(char *name) if (bat->bat_srv_hash == NULL) { CERROR("Can't allocate hash for batch %s\n", name); LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + LIBCFS_FREE(bat, sizeof(*bat)); return -ENOMEM; } @@ -879,7 +881,7 @@ lstcon_batch_add(char *name) if (strlen(name) > sizeof(bat->bat_name)-1) { LIBCFS_FREE(bat->bat_srv_hash, LST_NODE_HASHSIZE); LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + LIBCFS_FREE(bat, sizeof(*bat)); return -E2BIG; } strncpy(bat->bat_name, name, sizeof(bat->bat_name)); @@ -907,7 +909,7 @@ lstcon_batch_add(char *name) int lstcon_batch_list(int index, int len, char __user *name_up) { - lstcon_batch_t *bat; + struct lstcon_batch *bat; LASSERT(name_up != NULL); LASSERT(index >= 0); @@ -928,12 +930,12 @@ lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, struct lstcon_node_ent __user *dents_up) { struct lstcon_test_batch_ent *entp; - struct list_head *clilst; - struct list_head *srvlst; - lstcon_test_t *test = NULL; - lstcon_batch_t *bat; - lstcon_ndlink_t *ndl; - int rc; + struct list_head *clilst; + struct list_head *srvlst; + struct lstcon_test *test = NULL; + struct lstcon_batch *bat; + struct lstcon_ndlink *ndl; + int rc; rc = lstcon_batch_find(name, &bat); if (rc != 0) { @@ -996,7 +998,7 @@ lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, } static int -lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) +lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) { switch (transop) { case LST_TRANS_TSBRUN: @@ -1018,10 +1020,10 @@ lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) } static int -lstcon_batch_op(lstcon_batch_t *bat, int transop, +lstcon_batch_op(struct lstcon_batch *bat, int transop, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; + struct lstcon_rpc_trans *trans; int rc; rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, @@ -1044,8 +1046,8 @@ lstcon_batch_op(lstcon_batch_t *bat, int transop, int lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) { - lstcon_batch_t *bat; - int rc; + struct lstcon_batch *bat; + int rc; if (lstcon_batch_find(name, &bat) != 0) { CDEBUG(D_NET, "Can't find batch %s\n", name); @@ -1066,8 +1068,8 @@ lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) int lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) { - lstcon_batch_t *bat; - int rc; + struct lstcon_batch *bat; + int rc; if (lstcon_batch_find(name, &bat) != 0) { CDEBUG(D_NET, "Can't find batch %s\n", name); @@ -1086,17 +1088,17 @@ lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) } static void -lstcon_batch_destroy(lstcon_batch_t *bat) +lstcon_batch_destroy(struct lstcon_batch *bat) { - 
lstcon_ndlink_t *ndl; - lstcon_test_t *test; - int i; + struct lstcon_ndlink *ndl; + struct lstcon_test *test; + int i; list_del(&bat->bat_link); while (!list_empty(&bat->bat_test_list)) { test = list_entry(bat->bat_test_list.next, - lstcon_test_t, tes_link); + struct lstcon_test, tes_link); LASSERT(list_empty(&test->tes_trans_list)); list_del(&test->tes_link); @@ -1104,7 +1106,7 @@ lstcon_batch_destroy(lstcon_batch_t *bat) lstcon_group_decref(test->tes_src_grp); lstcon_group_decref(test->tes_dst_grp); - LIBCFS_FREE(test, offsetof(lstcon_test_t, + LIBCFS_FREE(test, offsetof(struct lstcon_test, tes_param[test->tes_paramlen])); } @@ -1112,7 +1114,7 @@ lstcon_batch_destroy(lstcon_batch_t *bat) while (!list_empty(&bat->bat_cli_list)) { ndl = list_entry(bat->bat_cli_list.next, - lstcon_ndlink_t, ndl_link); + struct lstcon_ndlink, ndl_link); list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -1120,7 +1122,7 @@ lstcon_batch_destroy(lstcon_batch_t *bat) while (!list_empty(&bat->bat_srv_list)) { ndl = list_entry(bat->bat_srv_list.next, - lstcon_ndlink_t, ndl_link); + struct lstcon_ndlink, ndl_link); list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -1135,19 +1137,18 @@ lstcon_batch_destroy(lstcon_batch_t *bat) sizeof(struct list_head) * LST_NODE_HASHSIZE); LIBCFS_FREE(bat->bat_srv_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + LIBCFS_FREE(bat, sizeof(*bat)); } static int -lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) +lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) { - lstcon_test_t *test; - lstcon_batch_t *batch; - lstcon_ndlink_t *ndl; + struct lstcon_test *test = arg; + struct lstcon_batch *batch; + struct lstcon_ndlink *ndl; struct list_head *hash; struct list_head *head; - test = (lstcon_test_t *)arg; LASSERT(test != NULL); batch = test->tes_batch; @@ -1183,12 +1184,13 @@ lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) } static int -lstcon_test_nodes_add(lstcon_test_t *test, struct list_head __user *result_up) +lstcon_test_nodes_add(struct lstcon_test *test, + struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - int transop; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int transop; + int rc; LASSERT (test->tes_src_grp != NULL); LASSERT (test->tes_dst_grp != NULL); @@ -1235,7 +1237,7 @@ lstcon_test_nodes_add(lstcon_test_t *test, struct list_head __user *result_up) } static int -lstcon_verify_batch(const char *name, lstcon_batch_t **batch) +lstcon_verify_batch(const char *name, struct lstcon_batch **batch) { int rc; @@ -1254,10 +1256,10 @@ lstcon_verify_batch(const char *name, lstcon_batch_t **batch) } static int -lstcon_verify_group(const char *name, lstcon_group_t **grp) +lstcon_verify_group(const char *name, struct lstcon_group **grp) { - int rc; - lstcon_ndlink_t *ndl; + int rc; + struct lstcon_ndlink *ndl; rc = lstcon_group_find(name, grp); if (rc != 0) { @@ -1283,11 +1285,11 @@ lstcon_test_add(char *batch_name, int type, int loop, void *param, int paramlen, int *retp, struct list_head __user *result_up) { - lstcon_test_t *test = NULL; - int rc; - lstcon_group_t *src_grp = NULL; - lstcon_group_t *dst_grp = NULL; - lstcon_batch_t *batch = NULL; + struct lstcon_test *test = NULL; + int rc; + struct lstcon_group *src_grp = NULL; + struct lstcon_group *dst_grp = NULL; + struct lstcon_batch *batch = NULL; /* * verify that a batch of the given name exists, and the groups @@ -1309,7 +1311,7 
@@ lstcon_test_add(char *batch_name, int type, int loop, if (dst_grp->grp_userland) *retp = 1; - LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); + LIBCFS_ALLOC(test, offsetof(struct lstcon_test, tes_param[paramlen])); if (!test) { CERROR("Can't allocate test descriptor\n"); rc = -ENOMEM; @@ -1356,7 +1358,8 @@ lstcon_test_add(char *batch_name, int type, int loop, return rc; out: if (test != NULL) - LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[paramlen])); if (dst_grp != NULL) lstcon_group_decref(dst_grp); @@ -1368,9 +1371,10 @@ lstcon_test_add(char *batch_name, int type, int loop, } static int -lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) +lstcon_test_find(struct lstcon_batch *batch, int idx, + struct lstcon_test **testpp) { - lstcon_test_t *test; + struct lstcon_test *test; list_for_each_entry(test, &batch->bat_test_list, tes_link) { if (idx == test->tes_hdr.tsb_index) { @@ -1383,10 +1387,10 @@ lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) } static int -lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, +lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, struct lstcon_rpc_ent __user *ent_up) { - srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; LASSERT (transop == LST_TRANS_TSBCLIQRY || transop == LST_TRANS_TSBSRVQRY); @@ -1403,14 +1407,14 @@ int lstcon_test_batch_query(char *name, int testidx, int client, int timeout, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - struct list_head *translist; - struct list_head *ndlist; - lstcon_tsb_hdr_t *hdr; - lstcon_batch_t *batch; - lstcon_test_t *test = NULL; - int transop; - int rc; + struct lstcon_rpc_trans *trans; + struct list_head *translist; + struct list_head *ndlist; + struct lstcon_tsb_hdr *hdr; + struct lstcon_batch *batch; + struct lstcon_test *test = NULL; + int transop; + int rc; rc = lstcon_batch_find(name, &batch); if (rc != 0) { @@ -1462,13 +1466,13 @@ lstcon_test_batch_query(char *name, int testidx, int client, } static int -lstcon_statrpc_readent(int transop, srpc_msg_t *msg, +lstcon_statrpc_readent(int transop, struct srpc_msg *msg, struct lstcon_rpc_ent __user *ent_up) { - srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; - struct sfw_counters __user *sfwk_stat; + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; struct srpc_counters __user *srpc_stat; - struct lnet_counters __user *lnet_stat; + struct lnet_counters_common __user *lnet_stat; if (rep->str_status != 0) return 0; @@ -1476,7 +1480,7 @@ lstcon_statrpc_readent(int transop, srpc_msg_t *msg, sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; srpc_stat = (struct srpc_counters __user *) ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); - lnet_stat = (struct lnet_counters __user *) + lnet_stat = (struct lnet_counters_common __user *) ((char __user *)srpc_stat + sizeof(*srpc_stat)); if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || @@ -1492,7 +1496,7 @@ lstcon_ndlist_stat(struct list_head *ndlist, int timeout, struct list_head __user *result_up) { struct list_head head; - lstcon_rpc_trans_t *trans; + struct lstcon_rpc_trans *trans; int rc; INIT_LIST_HEAD(&head); @@ -1517,8 +1521,8 @@ int lstcon_group_stat(char *grp_name, int timeout, struct list_head __user *result_up) { - lstcon_group_t *grp; - int rc; + struct lstcon_group *grp; + int rc; rc = 
lstcon_group_find(grp_name, &grp); if (rc != 0) { @@ -1537,11 +1541,11 @@ int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, int timeout, struct list_head __user *result_up) { - lstcon_ndlink_t *ndl; - lstcon_group_t *tmp; - struct lnet_process_id id; - int i; - int rc; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; rc = lstcon_group_alloc(NULL, &tmp); if (rc != 0) { @@ -1582,8 +1586,8 @@ lstcon_debug_ndlist(struct list_head *ndlist, struct list_head *translist, int timeout, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - int rc; + struct lstcon_rpc_trans *trans; + int rc; rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, NULL, lstcon_sesrpc_condition, &trans); @@ -1612,8 +1616,8 @@ int lstcon_batch_debug(int timeout, char *name, int client, struct list_head __user *result_up) { - lstcon_batch_t *bat; - int rc; + struct lstcon_batch *bat; + int rc; rc = lstcon_batch_find(name, &bat); if (rc != 0) @@ -1630,8 +1634,8 @@ int lstcon_group_debug(int timeout, char *name, struct list_head __user *result_up) { - lstcon_group_t *grp; - int rc; + struct lstcon_group *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) @@ -1645,15 +1649,15 @@ lstcon_group_debug(int timeout, char *name, } int -lstcon_nodes_debug(int timeout, - int count, struct lnet_process_id __user *ids_up, +lstcon_nodes_debug(int timeout, int count, + struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - struct lnet_process_id id; - lstcon_ndlink_t *ndl; - lstcon_group_t *grp; - int i; - int rc; + struct lnet_process_id id; + struct lstcon_ndlink *ndl; + struct lstcon_group *grp; + int i; + int rc; rc = lstcon_group_alloc(NULL, &grp); if (rc != 0) { @@ -1700,11 +1704,11 @@ lstcon_new_session_id(struct lst_sid *sid) { struct lnet_process_id id; - LASSERT (console_session.ses_state == LST_SESSION_NONE); + LASSERT(console_session.ses_state == LST_SESSION_NONE); - LNetGetId(1, &id); - sid->ses_nid = id.nid; - sid->ses_stamp = cfs_time_current(); + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = div_u64(ktime_get_ns(), NSEC_PER_MSEC); } int @@ -1759,7 +1763,7 @@ lstcon_session_new(char *name, int key, unsigned feats, rc = lstcon_rpc_pinger_start(); if (rc != 0) { - lstcon_batch_t *bat = NULL; + struct lstcon_batch *bat = NULL; lstcon_batch_find(LST_DEFAULT_BATCH, &bat); lstcon_batch_destroy(bat); @@ -1783,8 +1787,8 @@ lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, char __user *name_up, int len) { struct lstcon_ndlist_ent *entp; - lstcon_ndlink_t *ndl; - int rc = 0; + struct lstcon_ndlink *ndl; + int rc = 0; if (console_session.ses_state != LST_SESSION_ACTIVE) return -ESRCH; @@ -1814,10 +1818,10 @@ lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, int lstcon_session_end() { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - lstcon_batch_t *bat; - int rc = 0; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + struct lstcon_batch *bat; + int rc = 0; LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); @@ -1850,7 +1854,7 @@ lstcon_session_end() /* destroy all batches */ while (!list_empty(&console_session.ses_bat_list)) { bat = list_entry(console_session.ses_bat_list.next, - lstcon_batch_t, bat_link); + struct lstcon_batch, bat_link); lstcon_batch_destroy(bat); } @@ -1858,7 +1862,7 @@ lstcon_session_end() /* destroy all groups */ while (!list_empty(&console_session.ses_grp_list)) { grp = 
list_entry(console_session.ses_grp_list.next, - lstcon_group_t, grp_link); + struct lstcon_group, grp_link); LASSERT(grp->grp_ref == 1); lstcon_group_decref(grp); @@ -1906,15 +1910,15 @@ lstcon_session_feats_check(unsigned feats) } static int -lstcon_acceptor_handle (srpc_server_rpc_t *rpc) +lstcon_acceptor_handle(struct srpc_server_rpc *rpc) { - srpc_msg_t *rep = &rpc->srpc_replymsg; - srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; - srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; - srpc_join_reply_t *jrep = &rep->msg_body.join_reply; - lstcon_group_t *grp = NULL; - lstcon_ndlink_t *ndl; - int rc = 0; + struct srpc_msg *rep = &rpc->srpc_replymsg; + struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; + struct srpc_join_reply *jrep = &rep->msg_body.join_reply; + struct lstcon_group *grp = NULL; + struct lstcon_ndlink *ndl; + int rc = 0; sfw_unpack_message(req); @@ -1989,7 +1993,8 @@ lstcon_acceptor_handle (srpc_server_rpc_t *rpc) return rc; } -static srpc_service_t lstcon_acceptor_service; +static struct srpc_service lstcon_acceptor_service; + static void lstcon_init_acceptor_service(void) { /* initialize selftest console acceptor service table */ @@ -1999,9 +2004,9 @@ static void lstcon_init_acceptor_service(void) lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; } -int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); - -DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); +static struct notifier_block lstcon_ioctl_handler = { + .notifier_call = lstcon_ioctl_entry, +}; /* initialize console */ int @@ -2010,8 +2015,6 @@ lstcon_console_init(void) int i; int rc; - memset(&console_session, 0, sizeof(lstcon_session_t)); - console_session.ses_id = LST_INVALID_SID; console_session.ses_state = LST_SESSION_NONE; console_session.ses_timeout = 0; @@ -2019,7 +2022,7 @@ lstcon_console_init(void) console_session.ses_expired = 0; console_session.ses_feats_updated = 0; console_session.ses_features = LST_FEATS_MASK; - console_session.ses_laststamp = cfs_time_current_sec(); + console_session.ses_laststamp = ktime_get_real_seconds(); mutex_init(&console_session.ses_mutex); @@ -2055,12 +2058,12 @@ lstcon_console_init(void) goto out; } - rc = libcfs_register_ioctl(&lstcon_ioctl_handler); - - if (rc == 0) { - lstcon_rpc_module_init(); - return 0; - } + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } out: srpc_shutdown_service(&lstcon_acceptor_service); @@ -2077,9 +2080,10 @@ lstcon_console_init(void) int lstcon_console_fini(void) { - int i; + int i; - libcfs_deregister_ioctl(&lstcon_ioctl_handler); + blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lstcon_ioctl_handler); mutex_lock(&console_session.ses_mutex); diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h index ae76a50b4d173..02c76a89627e6 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.h +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -39,29 +39,32 @@ #ifndef __LST_CONSOLE_H__ #define __LST_CONSOLE_H__ +#include #include -#include #include #include "selftest.h" #include "conrpc.h" -typedef struct lstcon_node { - struct lnet_process_id nd_id; /* id of the node */ +/* node descriptor */ +struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ int nd_ref; /* reference count */ int nd_state; /* state of the node */ int nd_timeout; /* session timeout 
*/ - cfs_time_t nd_stamp; /* timestamp of last replied RPC */ - struct lstcon_rpc nd_ping; /* ping rpc */ -} lstcon_node_t; /*** node descriptor */ + ktime_t nd_stamp; /* last RPC reply timestamp */ + struct lstcon_rpc nd_ping; /* ping rpc */ +}; -typedef struct { +/* node link descriptor */ +struct lstcon_ndlink { struct list_head ndl_link; /* chain on list */ struct list_head ndl_hlink; /* chain on hash */ - lstcon_node_t *ndl_node; /* pointer to node */ -} lstcon_ndlink_t; /*** node link descriptor */ + struct lstcon_node *ndl_node; /* pointer to node */ +}; -typedef struct { +/* (alias of nodes) group descriptor */ +struct lstcon_group { struct list_head grp_link; /* chain on global group list */ int grp_ref; /* reference count */ int grp_userland; /* has userland nodes */ @@ -71,19 +74,20 @@ typedef struct { struct list_head grp_trans_list; /* transaction list */ struct list_head grp_ndl_list; /* nodes list */ struct list_head grp_ndl_hash[0];/* hash table for nodes */ -} lstcon_group_t; /*** (alias of nodes) group descriptor */ +}; #define LST_BATCH_IDLE 0xB0 /* idle batch */ #define LST_BATCH_RUNNING 0xB1 /* running batch */ -typedef struct lstcon_tsb_hdr { +struct lstcon_tsb_hdr { struct lst_bid tsb_id; /* batch ID */ int tsb_index; /* test index */ -} lstcon_tsb_hdr_t; +}; -typedef struct { +/* (tests ) batch descriptor */ +struct lstcon_batch { /* test_batch header */ - lstcon_tsb_hdr_t bat_hdr; + struct lstcon_tsb_hdr bat_hdr; /* chain on session's batches list */ struct list_head bat_link; /* # of test */ @@ -99,7 +103,7 @@ typedef struct { struct list_head bat_test_list; /* list head of transaction */ struct list_head bat_trans_list; - /* list head of client nodes (lstcon_node_t) */ + /* list head of client nodes (struct lstcon_node) */ struct list_head bat_cli_list; /* hash table of client nodes */ struct list_head *bat_cli_hash; @@ -107,15 +111,16 @@ typedef struct { struct list_head bat_srv_list; /* hash table of server nodes */ struct list_head *bat_srv_hash; -} lstcon_batch_t; /*** (tests ) batch descritptor */ +}; -typedef struct lstcon_test { +/* a single test descriptor */ +struct lstcon_test { /* test batch header */ - lstcon_tsb_hdr_t tes_hdr; + struct lstcon_tsb_hdr tes_hdr; /* chain on batch's tests list */ struct list_head tes_link; /* pointer to batch */ - lstcon_batch_t *tes_batch; + struct lstcon_batch *tes_batch; int tes_type; /* type of the test, i.e: bulk, ping */ int tes_stop_onerr; /* stop on error */ @@ -127,12 +132,12 @@ typedef struct lstcon_test { int tes_cliidx; /* client index, used for RPC creating */ struct list_head tes_trans_list; /* transaction list */ - lstcon_group_t *tes_src_grp; /* group run the test */ - lstcon_group_t *tes_dst_grp; /* target group */ + struct lstcon_group *tes_src_grp; /* group run the test */ + struct lstcon_group *tes_dst_grp; /* target group */ int tes_paramlen; /* test parameter length */ char tes_param[0]; /* test parameter */ -} lstcon_test_t; /*** a single test descriptor */ +}; #define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ #define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ @@ -142,13 +147,13 @@ typedef struct lstcon_test { #define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ -typedef struct { +struct lstcon_session { struct mutex ses_mutex; /* only 1 thread in session */ - struct lst_sid ses_id; /* global session id */ + struct lst_sid ses_id; /* global session id */ int ses_key; /* local session key */ int ses_state; /* state of session */ int ses_timeout; /* 
timeout in seconds */ - time_t ses_laststamp; /* last operation stamp (seconds) */ + time64_t ses_laststamp; /* last operation stamp (seconds) */ /** tests features of the session */ unsigned ses_features; /** features are synced with remote test nodes */ @@ -161,9 +166,9 @@ typedef struct { unsigned ses_expired:1; __u64 ses_id_cookie; /* batch id cookie */ char ses_name[LST_NAME_SIZE]; /* session name */ - lstcon_rpc_trans_t *ses_ping; /* session pinger */ - stt_timer_t ses_ping_timer; /* timer for pinger */ - struct lstcon_trans_stat ses_trans_stat; /* transaction stats */ + struct lstcon_rpc_trans *ses_ping; /* session pinger */ + struct stt_timer ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat;/* transaction stats */ struct list_head ses_trans_list; /* global list of transaction */ struct list_head ses_grp_list; /* global list of groups */ @@ -174,9 +179,9 @@ typedef struct { spinlock_t ses_rpc_lock; /* serialize */ atomic_t ses_rpc_counter;/* # of initialized RPCs */ struct list_head ses_rpc_freelist;/* idle console rpc */ -} lstcon_session_t; /*** session descriptor */ +}; /* session descriptor */ -extern lstcon_session_t console_session; +extern struct lstcon_session console_session; static inline struct lstcon_trans_stat * lstcon_trans_stat(void) @@ -250,6 +255,8 @@ extern int lstcon_test_add(char *batch_name, int type, int loop, void *param, int paramlen, int *retp, struct list_head __user *result_up); +int lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata); int lstcon_console_init(void); int lstcon_console_fini(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c index b5d430dde00d1..000fca9d34e33 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/framework.c +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -51,49 +51,49 @@ MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never #define sfw_unpack_id(id) \ do { \ - __swab64s(&(id).nid); \ - __swab32s(&(id).pid); \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ } while (0) #define sfw_unpack_sid(sid) \ do { \ - __swab64s(&(sid).ses_nid); \ - __swab64s(&(sid).ses_stamp); \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ } while (0) #define sfw_unpack_fw_counters(fc) \ do { \ - __swab32s(&(fc).running_ms); \ - __swab32s(&(fc).active_batches); \ - __swab32s(&(fc).zombie_sessions); \ - __swab32s(&(fc).brw_errors); \ - __swab32s(&(fc).ping_errors); \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ } while (0) #define sfw_unpack_rpc_counters(rc) \ do { \ - __swab32s(&(rc).errors); \ - __swab32s(&(rc).rpcs_sent); \ - __swab32s(&(rc).rpcs_rcvd); \ - __swab32s(&(rc).rpcs_dropped); \ - __swab32s(&(rc).rpcs_expired); \ - __swab64s(&(rc).bulk_get); \ - __swab64s(&(rc).bulk_put); \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ } while (0) #define sfw_unpack_lnet_counters(lc) \ do { \ - __swab32s(&(lc).errors); \ - __swab32s(&(lc).msgs_max); \ - __swab32s(&(lc).msgs_alloc); \ - __swab32s(&(lc).send_count); \ - __swab32s(&(lc).recv_count); \ - __swab32s(&(lc).drop_count); \ - __swab32s(&(lc).route_count); \ - __swab64s(&(lc).send_length); \ - __swab64s(&(lc).recv_length); \ - __swab64s(&(lc).drop_length); \ - __swab64s(&(lc).route_length); \ + __swab32s(&(lc).lcc_errors); \ + __swab32s(&(lc).lcc_msgs_max); \ + __swab32s(&(lc).lcc_msgs_alloc); \ + __swab32s(&(lc).lcc_send_count); \ + __swab32s(&(lc).lcc_recv_count); \ + __swab32s(&(lc).lcc_drop_count); \ + __swab32s(&(lc).lcc_route_count); \ + __swab64s(&(lc).lcc_send_length); \ + __swab64s(&(lc).lcc_recv_length); \ + __swab64s(&(lc).lcc_drop_length); \ + __swab64s(&(lc).lcc_route_length); \ } while (0) #define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) @@ -111,21 +111,21 @@ static struct smoketest_framework { /* serialise */ spinlock_t fw_lock; /* _the_ session */ - sfw_session_t *fw_session; + struct sfw_session *fw_session; /* shutdown in progress */ int fw_shuttingdown; /* running RPC */ - srpc_server_rpc_t *fw_active_srpc; + struct srpc_server_rpc *fw_active_srpc; } sfw_data; /* forward ref's */ -int sfw_stop_batch (sfw_batch_t *tsb, int force); -void sfw_destroy_session (sfw_session_t *sn); +static int sfw_stop_batch(struct sfw_batch *tsb, int force); +static void sfw_destroy_session(struct sfw_session *sn); -static inline sfw_test_case_t * +static inline struct sfw_test_case * sfw_find_test_case(int id) { - sfw_test_case_t *tsc; + struct sfw_test_case *tsc; LASSERT(id <= SRPC_SERVICE_MAX_ID); LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); @@ -139,9 +139,10 @@ sfw_find_test_case(int id) } static int -sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) +sfw_register_test(struct srpc_service *service, + struct sfw_test_client_ops *cliops) { - sfw_test_case_t *tsc; + struct sfw_test_case *tsc; if (sfw_find_test_case(service->sv_id) != NULL) { CERROR ("Failed to register test %s (%d)\n", @@ -149,7 +150,7 @@ sfw_register_test (srpc_service_t *service, 
sfw_test_client_ops_t *cliops) return -EEXIST; } - LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); + LIBCFS_ALLOC(tsc, sizeof(*tsc)); if (tsc == NULL) return -ENOMEM; @@ -163,8 +164,8 @@ sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) static void sfw_add_session_timer (void) { - sfw_session_t *sn = sfw_data.fw_session; - stt_timer_t *timer = &sn->sn_timer; + struct sfw_session *sn = sfw_data.fw_session; + struct stt_timer *timer = &sn->sn_timer; LASSERT (!sfw_data.fw_shuttingdown); @@ -174,8 +175,7 @@ sfw_add_session_timer (void) LASSERT (!sn->sn_timer_active); sn->sn_timer_active = 1; - timer->stt_expires = cfs_time_add(sn->sn_timeout, - cfs_time_current_sec()); + timer->stt_expires = ktime_get_real_seconds()+ sn->sn_timeout; stt_add_timer(timer); return; } @@ -183,7 +183,7 @@ sfw_add_session_timer (void) static int sfw_del_session_timer (void) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; if (sn == NULL || !sn->sn_timer_active) return 0; @@ -203,10 +203,10 @@ static void sfw_deactivate_session (void) __must_hold(&sfw_data.fw_lock) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; int nactive = 0; - sfw_batch_t *tsb; - sfw_test_case_t *tsc; + struct sfw_batch *tsb; + struct sfw_test_case *tsc; if (sn == NULL) return; @@ -246,7 +246,7 @@ __must_hold(&sfw_data.fw_lock) static void sfw_session_expired (void *data) { - sfw_session_t *sn = data; + struct sfw_session *sn = data; spin_lock(&sfw_data.fw_lock); @@ -264,12 +264,12 @@ sfw_session_expired (void *data) } static inline void -sfw_init_session(sfw_session_t *sn, struct lst_sid sid, +sfw_init_session(struct sfw_session *sn, struct lst_sid sid, unsigned features, const char *name) { - stt_timer_t *timer = &sn->sn_timer; + struct stt_timer *timer = &sn->sn_timer; - memset(sn, 0, sizeof(sfw_session_t)); + memset(sn, 0, sizeof(struct sfw_session)); INIT_LIST_HEAD(&sn->sn_list); INIT_LIST_HEAD(&sn->sn_batches); atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ @@ -277,14 +277,14 @@ sfw_init_session(sfw_session_t *sn, struct lst_sid sid, atomic_set(&sn->sn_ping_errors, 0); strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); - sn->sn_timer_active = 0; - sn->sn_id = sid; - sn->sn_features = features; - sn->sn_timeout = session_timeout; - sn->sn_started = cfs_time_current(); + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = ktime_get(); - timer->stt_data = sn; - timer->stt_func = sfw_session_expired; + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; INIT_LIST_HEAD(&timer->stt_list); } @@ -308,7 +308,7 @@ sfw_server_rpc_done(struct srpc_server_rpc *rpc) } static void -sfw_client_rpc_fini (srpc_client_rpc_t *rpc) +sfw_client_rpc_fini(struct srpc_client_rpc *rpc) { LASSERT(rpc->crpc_bulk.bk_niov == 0); LASSERT(list_empty(&rpc->crpc_list)); @@ -329,11 +329,11 @@ sfw_client_rpc_fini (srpc_client_rpc_t *rpc) spin_unlock(&sfw_data.fw_lock); } -static sfw_batch_t * +static struct sfw_batch * sfw_find_batch(struct lst_bid bid) { - sfw_session_t *sn = sfw_data.fw_session; - sfw_batch_t *bat; + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; LASSERT(sn != NULL); @@ -345,11 +345,11 @@ sfw_find_batch(struct lst_bid bid) return NULL; } -static sfw_batch_t * +static struct sfw_batch * sfw_bid2batch(struct lst_bid bid) { - sfw_session_t *sn = sfw_data.fw_session; - sfw_batch_t *bat; + struct sfw_session *sn = sfw_data.fw_session; + 
struct sfw_batch *bat; LASSERT (sn != NULL); @@ -357,7 +357,7 @@ sfw_bid2batch(struct lst_bid bid) if (bat != NULL) return bat; - LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); + LIBCFS_ALLOC(bat, sizeof(*bat)); if (bat == NULL) return NULL; @@ -372,11 +372,11 @@ sfw_bid2batch(struct lst_bid bid) } static int -sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) +sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; struct sfw_counters *cnt = &reply->str_fw; - sfw_batch_t *bat; + struct sfw_batch *bat; reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -390,14 +390,14 @@ sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) return 0; } - lnet_counters_get(&reply->str_lnet); + lnet_counters_get_common(&reply->str_lnet); srpc_get_counters(&reply->str_rpc); /* send over the msecs since the session was started - with 32 bits to send, this is ~49 days */ - cnt->running_ms = jiffies_to_msecs(jiffies - sn->sn_started); - cnt->brw_errors = atomic_read(&sn->sn_brw_errors); - cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->running_ms = ktime_ms_delta(ktime_get(), sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); cnt->active_batches = 0; @@ -411,12 +411,12 @@ sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) } int -sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) +sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; - srpc_msg_t *msg = container_of(request, srpc_msg_t, - msg_body.mksn_reqst); - int cplen = 0; + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_msg *msg = container_of(request, struct srpc_msg, + msg_body.mksn_reqst); + int cplen = 0; if (request->mksn_sid.ses_nid == LNET_NID_ANY) { reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -446,7 +446,7 @@ sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) /* reject the request if it requires unknown features * NB: old version will always accept all features because it's not - * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also + * aware of struct srpc_msg::msg_ses_feats, it's a defect but it's also * harmless because it will return zero feature to console, and it's * console's responsibility to make sure all nodes in a session have * same feature mask. */ @@ -456,7 +456,7 @@ sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) } /* brand new or create by force */ - LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); + LIBCFS_ALLOC(sn, sizeof(*sn)); if (sn == NULL) { CERROR("dropping RPC mksn under memory pressure\n"); return -ENOMEM; @@ -480,9 +480,10 @@ sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) } static int -sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) +sfw_remove_session(struct srpc_rmsn_reqst *request, + struct srpc_rmsn_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; reply->rmsn_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; @@ -512,9 +513,10 @@ sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) } static int -sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) +sfw_debug_session(struct srpc_debug_reqst *request, + struct srpc_debug_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; if (sn == NULL) { reply->dbg_status = ESRCH; @@ -533,10 +535,10 @@ sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) } static void -sfw_test_rpc_fini (srpc_client_rpc_t *rpc) +sfw_test_rpc_fini(struct srpc_client_rpc *rpc) { - sfw_test_unit_t *tsu = rpc->crpc_priv; - sfw_test_instance_t *tsi = tsu->tsu_instance; + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; /* Called with hold of tsi->tsi_lock */ LASSERT(list_empty(&rpc->crpc_list)); @@ -544,7 +546,7 @@ sfw_test_rpc_fini (srpc_client_rpc_t *rpc) } static inline int -sfw_test_buffers(sfw_test_instance_t *tsi) +sfw_test_buffers(struct sfw_test_instance *tsi) { struct sfw_test_case *tsc; struct srpc_service *svc; @@ -618,10 +620,10 @@ sfw_unload_test(struct sfw_test_instance *tsi) } static void -sfw_destroy_test_instance (sfw_test_instance_t *tsi) +sfw_destroy_test_instance(struct sfw_test_instance *tsi) { - srpc_client_rpc_t *rpc; - sfw_test_unit_t *tsu; + struct srpc_client_rpc *rpc; + struct sfw_test_unit *tsu; if (!tsi->tsi_is_client) goto clean; @@ -633,14 +635,14 @@ sfw_destroy_test_instance (sfw_test_instance_t *tsi) while (!list_empty(&tsi->tsi_units)) { tsu = list_entry(tsi->tsi_units.next, - sfw_test_unit_t, tsu_list); + struct sfw_test_unit, tsu_list); list_del(&tsu->tsu_list); LIBCFS_FREE(tsu, sizeof(*tsu)); } while (!list_empty(&tsi->tsi_free_rpcs)) { rpc = list_entry(tsi->tsi_free_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); list_del(&rpc->crpc_list); LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); } @@ -652,35 +654,35 @@ sfw_destroy_test_instance (sfw_test_instance_t *tsi) } static void -sfw_destroy_batch (sfw_batch_t *tsb) +sfw_destroy_batch(struct sfw_batch *tsb) { - sfw_test_instance_t *tsi; + struct sfw_test_instance *tsi; LASSERT(!sfw_batch_active(tsb)); LASSERT(list_empty(&tsb->bat_list)); while (!list_empty(&tsb->bat_tests)) { tsi = list_entry(tsb->bat_tests.next, - sfw_test_instance_t, tsi_list); + struct sfw_test_instance, tsi_list); list_del_init(&tsi->tsi_list); sfw_destroy_test_instance(tsi); } - LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); + LIBCFS_FREE(tsb, sizeof(*tsb)); return; } -void -sfw_destroy_session (sfw_session_t *sn) +static void +sfw_destroy_session(struct sfw_session *sn) { - sfw_batch_t *batch; + struct sfw_batch *batch; LASSERT(list_empty(&sn->sn_list)); LASSERT(sn != sfw_data.fw_session); while (!list_empty(&sn->sn_batches)) { batch = list_entry(sn->sn_batches.next, - sfw_batch_t, bat_list); + struct sfw_batch, bat_list); list_del_init(&batch->bat_list); sfw_destroy_batch(batch); } @@ -691,9 +693,9 @@ sfw_destroy_session (sfw_session_t *sn) } static void -sfw_unpack_addtest_req(srpc_msg_t *msg) +sfw_unpack_addtest_req(struct srpc_msg *msg) { - srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); LASSERT (req->tsr_is_client); @@ -705,14 +707,14 @@ sfw_unpack_addtest_req(srpc_msg_t *msg) if (req->tsr_service == SRPC_SERVICE_BRW) { if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { - test_bulk_req_t *bulk = 
&req->tsr_u.bulk_v0; + struct test_bulk_req *bulk = &req->tsr_u.bulk_v0; __swab32s(&bulk->blk_opc); __swab32s(&bulk->blk_npg); __swab32s(&bulk->blk_flags); } else { - test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; + struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; __swab16s(&bulk->blk_opc); __swab16s(&bulk->blk_flags); @@ -724,7 +726,7 @@ sfw_unpack_addtest_req(srpc_msg_t *msg) } if (req->tsr_service == SRPC_SERVICE_PING) { - test_ping_req_t *ping = &req->tsr_u.ping; + struct test_ping_req *ping = &req->tsr_u.ping; __swab32s(&ping->png_size); __swab32s(&ping->png_flags); @@ -736,16 +738,16 @@ sfw_unpack_addtest_req(srpc_msg_t *msg) } static int -sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) +sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) { - srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; - srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; - srpc_bulk_t *bk = rpc->srpc_bulk; - int ndest = req->tsr_ndest; - sfw_test_unit_t *tsu; - sfw_test_instance_t *tsi; - int i; - int rc; + struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + struct srpc_bulk *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + int i; + int rc; LIBCFS_ALLOC(tsi, sizeof(*tsi)); if (tsi == NULL) { @@ -802,7 +804,7 @@ sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) sfw_unpack_id(id); for (j = 0; j < tsi->tsi_concur; j++) { - LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); + LIBCFS_ALLOC(tsu, sizeof(*tsu)); if (tsu == NULL) { rc = -ENOMEM; CERROR ("Can't allocate tsu for %d\n", @@ -831,11 +833,11 @@ sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) } static void -sfw_test_unit_done (sfw_test_unit_t *tsu) +sfw_test_unit_done(struct sfw_test_unit *tsu) { - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_batch_t *tsb = tsi->tsi_batch; - sfw_session_t *sn = tsb->bat_session; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_batch *tsb = tsi->tsi_batch; + struct sfw_session *sn = tsb->bat_session; LASSERT (sfw_test_active(tsi)); @@ -874,10 +876,10 @@ sfw_test_unit_done (sfw_test_unit_t *tsu) } static void -sfw_test_rpc_done (srpc_client_rpc_t *rpc) +sfw_test_rpc_done(struct srpc_client_rpc *rpc) { - sfw_test_unit_t *tsu = rpc->crpc_priv; - sfw_test_instance_t *tsi = tsu->tsu_instance; + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; int done = 0; tsi->tsi_ops->tso_done_rpc(tsu, rpc); @@ -910,12 +912,12 @@ sfw_test_rpc_done (srpc_client_rpc_t *rpc) } int -sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, +sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, unsigned features, int nblk, int blklen, - srpc_client_rpc_t **rpcpp) + struct srpc_client_rpc **rpcpp) { - srpc_client_rpc_t *rpc = NULL; - sfw_test_instance_t *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; + struct sfw_test_instance *tsi = tsu->tsu_instance; spin_lock(&tsi->tsi_lock); @@ -924,7 +926,7 @@ sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, if (!list_empty(&tsi->tsi_free_rpcs)) { /* pick request from buffer */ rpc = list_entry(tsi->tsi_free_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); LASSERT(nblk == rpc->crpc_bulk.bk_niov); list_del_init(&rpc->crpc_list); } @@ -953,11 +955,11 @@ sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, } static int -sfw_run_test 
(swi_workitem_t *wi) +sfw_run_test(struct swi_workitem *wi) { - sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; - sfw_test_instance_t *tsi = tsu->tsu_instance; - srpc_client_rpc_t *rpc = NULL; + struct sfw_test_unit *tsu = wi->swi_workitem.wi_data; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; LASSERT (wi == &tsu->tsu_worker); @@ -1002,11 +1004,11 @@ sfw_run_test (swi_workitem_t *wi) } static int -sfw_run_batch (sfw_batch_t *tsb) +sfw_run_batch(struct sfw_batch *tsb) { - swi_workitem_t *wi; - sfw_test_unit_t *tsu; - sfw_test_instance_t *tsi; + struct swi_workitem *wi; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; if (sfw_batch_active(tsb)) { CDEBUG(D_NET, "Batch already active: %llu (%d)\n", @@ -1038,11 +1040,11 @@ sfw_run_batch (sfw_batch_t *tsb) return 0; } -int -sfw_stop_batch (sfw_batch_t *tsb, int force) +static int +sfw_stop_batch(struct sfw_batch *tsb, int force) { - sfw_test_instance_t *tsi; - srpc_client_rpc_t *rpc; + struct sfw_test_instance *tsi; + struct srpc_client_rpc *rpc; if (!sfw_batch_active(tsb)) { CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); @@ -1081,9 +1083,10 @@ sfw_stop_batch (sfw_batch_t *tsb, int force) } static int -sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) +sfw_query_batch(struct sfw_batch *tsb, int testidx, + struct srpc_batch_reply *reply) { - sfw_test_instance_t *tsi; + struct sfw_test_instance *tsi; if (testidx < 0) return -EINVAL; @@ -1105,7 +1108,7 @@ sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) } void -sfw_free_pages (srpc_server_rpc_t *rpc) +sfw_free_pages(struct srpc_server_rpc *rpc) { srpc_free_bulk(rpc->srpc_bulk); rpc->srpc_bulk = NULL; @@ -1126,13 +1129,13 @@ sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, } static int -sfw_add_test (srpc_server_rpc_t *rpc) +sfw_add_test(struct srpc_server_rpc *rpc) { - sfw_session_t *sn = sfw_data.fw_session; - srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; - srpc_test_reqst_t *request; - int rc; - sfw_batch_t *bat; + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + struct srpc_test_reqst *request; + int rc; + struct sfw_batch *bat; request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -1196,11 +1199,12 @@ sfw_add_test (srpc_server_rpc_t *rpc) } static int -sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) +sfw_control_batch(struct srpc_batch_reqst *request, + struct srpc_batch_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; int rc = 0; - sfw_batch_t *bat; + struct sfw_batch *bat; reply->bar_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; @@ -1240,8 +1244,8 @@ static int sfw_handle_server_rpc(struct srpc_server_rpc *rpc) { struct srpc_service *sv = rpc->srpc_scd->scd_svc; - srpc_msg_t *reply = &rpc->srpc_replymsg; - srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *reply = &rpc->srpc_replymsg; + struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; unsigned features = LST_FEATS_MASK; int rc = 0; @@ -1274,7 +1278,7 @@ sfw_handle_server_rpc(struct srpc_server_rpc *rpc) if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && sv->sv_id != SRPC_SERVICE_DEBUG) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; if (sn != NULL && sn->sn_features != request->msg_ses_feats) { @@ -1390,12 +1394,12 @@ sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) return rc; } -srpc_client_rpc_t * +struct srpc_client_rpc * sfw_create_rpc(struct lnet_process_id peer, int service, unsigned features, int nbulkiov, int bulklen, - void (*done)(srpc_client_rpc_t *), void *priv) + void (*done)(struct srpc_client_rpc *), void *priv) { - srpc_client_rpc_t *rpc = NULL; + struct srpc_client_rpc *rpc = NULL; spin_lock(&sfw_data.fw_lock); @@ -1404,7 +1408,7 @@ sfw_create_rpc(struct lnet_process_id peer, int service, if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); list_del(&rpc->crpc_list); srpc_init_client_rpc(rpc, peer, service, 0, 0, @@ -1428,7 +1432,7 @@ sfw_create_rpc(struct lnet_process_id peer, int service, } void -sfw_unpack_message (srpc_msg_t *msg) +sfw_unpack_message(struct srpc_msg *msg) { if (msg->msg_magic == SRPC_MSG_MAGIC) return; /* no flipping needed */ @@ -1437,7 +1441,7 @@ sfw_unpack_message (srpc_msg_t *msg) LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); if (msg->msg_type == SRPC_MSG_STAT_REQST) { - srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; + struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; __swab32s(&req->str_type); __swab64s(&req->str_rpyid); @@ -1446,7 +1450,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_STAT_REPLY) { - srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; __swab32s(&rep->str_status); sfw_unpack_sid(rep->str_sid); @@ -1457,7 +1461,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_MKSN_REQST) { - srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; + struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; __swab64s(&req->mksn_rpyid); __swab32s(&req->mksn_force); @@ -1466,7 +1470,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { - srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; + struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; __swab32s(&rep->mksn_status); __swab32s(&rep->mksn_timeout); @@ -1475,7 +1479,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_RMSN_REQST) { - srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; + struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; __swab64s(&req->rmsn_rpyid); sfw_unpack_sid(req->rmsn_sid); @@ -1483,7 +1487,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { - srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; + struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; __swab32s(&rep->rmsn_status); sfw_unpack_sid(rep->rmsn_sid); @@ -1491,7 +1495,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == 
SRPC_MSG_DEBUG_REQST) { - srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; + struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; __swab64s(&req->dbg_rpyid); __swab32s(&req->dbg_flags); @@ -1500,7 +1504,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { - srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; + struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; __swab32s(&rep->dbg_nbatch); __swab32s(&rep->dbg_timeout); @@ -1509,7 +1513,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_BATCH_REQST) { - srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; + struct srpc_batch_reqst *req = &msg->msg_body.bat_reqst; __swab32s(&req->bar_opc); __swab64s(&req->bar_rpyid); @@ -1521,7 +1525,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { - srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; __swab32s(&rep->bar_status); sfw_unpack_sid(rep->bar_sid); @@ -1529,7 +1533,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_TEST_REQST) { - srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; __swab64s(&req->tsr_rpyid); __swab64s(&req->tsr_bulkid); @@ -1543,7 +1547,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_TEST_REPLY) { - srpc_test_reply_t *rep = &msg->msg_body.tes_reply; + struct srpc_test_reply *rep = &msg->msg_body.tes_reply; __swab32s(&rep->tsr_status); sfw_unpack_sid(rep->tsr_sid); @@ -1551,7 +1555,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_JOIN_REQST) { - srpc_join_reqst_t *req = &msg->msg_body.join_reqst; + struct srpc_join_reqst *req = &msg->msg_body.join_reqst; __swab64s(&req->join_rpyid); sfw_unpack_sid(req->join_sid); @@ -1559,7 +1563,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { - srpc_join_reply_t *rep = &msg->msg_body.join_reply; + struct srpc_join_reply *rep = &msg->msg_body.join_reply; __swab32s(&rep->join_status); __swab32s(&rep->join_timeout); @@ -1572,7 +1576,7 @@ sfw_unpack_message (srpc_msg_t *msg) } void -sfw_abort_rpc (srpc_client_rpc_t *rpc) +sfw_abort_rpc(struct srpc_client_rpc *rpc) { LASSERT(atomic_read(&rpc->crpc_refcount) > 0); LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); @@ -1584,7 +1588,7 @@ sfw_abort_rpc (srpc_client_rpc_t *rpc) } void -sfw_post_rpc (srpc_client_rpc_t *rpc) +sfw_post_rpc(struct srpc_client_rpc *rpc) { spin_lock(&rpc->crpc_lock); @@ -1600,44 +1604,14 @@ sfw_post_rpc (srpc_client_rpc_t *rpc) return; } -static srpc_service_t sfw_services[] = -{ - { - /* sv_id */ SRPC_SERVICE_DEBUG, - /* sv_name */ "debug", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_QUERY_STAT, - /* sv_name */ "query stats", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_MAKE_SESSION, - /* sv_name */ "make session", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, - /* sv_name */ "remove session", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_BATCH, - /* sv_name */ "batch service", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_TEST, - /* sv_name */ "test service", - 0 - }, - { - /* sv_id */ 0, - /* sv_name */ NULL, - 0 - } -}; +static struct srpc_service sfw_services[] = { + { .sv_id = SRPC_SERVICE_DEBUG, .sv_name = "debug", }, + { .sv_id = SRPC_SERVICE_QUERY_STAT, .sv_name = "query stats", }, + { .sv_id = SRPC_SERVICE_MAKE_SESSION, .sv_name = "make session", }, + { .sv_id = SRPC_SERVICE_REMOVE_SESSION, .sv_name = "remove session", }, + { .sv_id = 
SRPC_SERVICE_BATCH, .sv_name = "batch service", }, + { .sv_id = SRPC_SERVICE_TEST, .sv_name = "test service", }, + { .sv_id = 0, } }; int sfw_startup (void) @@ -1645,8 +1619,8 @@ sfw_startup (void) int i; int rc; int error; - srpc_service_t *sv; - sfw_test_case_t *tsc; + struct srpc_service *sv; + struct sfw_test_case *tsc; if (session_timeout < 0) { @@ -1740,8 +1714,8 @@ sfw_startup (void) void sfw_shutdown (void) { - srpc_service_t *sv; - sfw_test_case_t *tsc; + struct srpc_service *sv; + struct sfw_test_case *tsc; int i; spin_lock(&sfw_data.fw_lock); @@ -1778,10 +1752,10 @@ sfw_shutdown (void) } while (!list_empty(&sfw_data.fw_zombie_rpcs)) { - srpc_client_rpc_t *rpc; + struct srpc_client_rpc *rpc; rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); list_del(&rpc->crpc_list); LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); @@ -1797,7 +1771,7 @@ sfw_shutdown (void) while (!list_empty(&sfw_data.fw_tests)) { tsc = list_entry(sfw_data.fw_tests.next, - sfw_test_case_t, tsc_list); + struct sfw_test_case, tsc_list); srpc_wait_service_shutdown(tsc->tsc_srv_service); diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c index 56212a840dcc4..5324957500940 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/module.c +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -52,61 +52,58 @@ struct cfs_wi_sched **lst_sched_test; static void lnet_selftest_exit(void) { - int i; - - switch (lst_init_step) { - case LST_INIT_CONSOLE: - lstcon_console_fini(); - /* Fall through */ - case LST_INIT_FW: - sfw_shutdown(); - /* Fall through */ - case LST_INIT_RPC: - srpc_shutdown(); - /* Fall through */ - case LST_INIT_WI_TEST: - for (i = 0; - i < cfs_cpt_number(lnet_cpt_table()); i++) { - if (lst_sched_test[i] == NULL) - continue; - cfs_wi_sched_destroy(lst_sched_test[i]); - } - LIBCFS_FREE(lst_sched_test, - sizeof(lst_sched_test[0]) * - cfs_cpt_number(lnet_cpt_table())); - lst_sched_test = NULL; - /* Fall through */ - - case LST_INIT_WI_SERIAL: - cfs_wi_sched_destroy(lst_sched_serial); - lst_sched_serial = NULL; - /* Fall through */ - case LST_INIT_NONE: - break; - /* Fall through */ - default: - LBUG(); - } - return; + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + /* fallthrough */ + case LST_INIT_FW: + sfw_shutdown(); + /* fallthrough */ + case LST_INIT_RPC: + srpc_shutdown(); + /* fallthrough */ + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + /* fallthrough */ + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + /* fallthrough */ + case LST_INIT_NONE: + break; + default: + LBUG(); + } } void lnet_selftest_structure_assertion(void) { - CLASSERT(sizeof(srpc_msg_t) == 160); - CLASSERT(sizeof(srpc_test_reqst_t) == 70); - CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72); - CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78); - CLASSERT(sizeof(srpc_stat_reply_t) == 136); - CLASSERT(sizeof(srpc_stat_reqst_t) == 28); + CLASSERT(sizeof(struct srpc_msg) == 160); + CLASSERT(sizeof(struct srpc_test_reqst) == 70); + CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) == 72); + CLASSERT(offsetof(struct srpc_msg, 
msg_body.tes_reqst.tsr_ndest) == 78); + CLASSERT(sizeof(struct srpc_stat_reply) == 136); + CLASSERT(sizeof(struct srpc_stat_reqst) == 28); } static int __init lnet_selftest_init(void) { - int nscheds; - int rc; - int i; + int nscheds; + int rc; + int i; rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, 1, &lst_sched_serial); @@ -130,31 +127,31 @@ lnet_selftest_init(void) rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, nthrs, &lst_sched_test[i]); if (rc != 0) { - CERROR("Failed to create CPU partition affinity WI " - "scheduler %d for LST\n", i); + CERROR("Failed to create CPU partition affinity WI scheduler %d for LST\n", + i); goto error; } } - rc = srpc_startup(); - if (rc != 0) { - CERROR("LST can't startup rpc\n"); - goto error; - } - lst_init_step = LST_INIT_RPC; - - rc = sfw_startup(); - if (rc != 0) { - CERROR("LST can't startup framework\n"); - goto error; - } - lst_init_step = LST_INIT_FW; - - rc = lstcon_console_init(); - if (rc != 0) { - CERROR("LST can't startup console\n"); - goto error; - } + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } lst_init_step = LST_INIT_CONSOLE; return 0; error: diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c index ea2076103c756..2d1403b34c7bc 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -44,17 +44,17 @@ static int ping_srv_workitems = SFW_TEST_WI_MAX; module_param(ping_srv_workitems, int, 0644); MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); -typedef struct { +struct lst_ping_data { spinlock_t pnd_lock; /* serialize */ int pnd_counter; /* sequence counter */ -} lst_ping_data_t; +}; -static lst_ping_data_t lst_ping_data; +static struct lst_ping_data lst_ping_data; static int -ping_client_init(sfw_test_instance_t *tsi) +ping_client_init(struct sfw_test_instance *tsi) { - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct sfw_session *sn = tsi->tsi_batch->bat_session; LASSERT(tsi->tsi_is_client); LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); @@ -66,9 +66,9 @@ ping_client_init(sfw_test_instance_t *tsi) } static void -ping_client_fini (sfw_test_instance_t *tsi) +ping_client_fini(struct sfw_test_instance *tsi) { - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct sfw_session *sn = tsi->tsi_batch->bat_session; int errors; LASSERT (sn != NULL); @@ -82,14 +82,14 @@ ping_client_fini (sfw_test_instance_t *tsi) } static int -ping_client_prep_rpc(sfw_test_unit_t *tsu, - struct lnet_process_id dest, srpc_client_rpc_t **rpc) +ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpc) { - srpc_ping_reqst_t *req; - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *req; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; struct timespec64 ts; - int rc; + int rc; LASSERT(sn != NULL); LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); @@ -114,12 +114,12 @@ ping_client_prep_rpc(sfw_test_unit_t *tsu, } static void -ping_client_done_rpc (sfw_test_unit_t *tsu, 
srpc_client_rpc_t *rpc) +ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) { - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; - srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; - srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; struct timespec64 ts; LASSERT(sn != NULL); @@ -167,11 +167,11 @@ ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) static int ping_server_handle(struct srpc_server_rpc *rpc) { - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - srpc_msg_t *replymsg = &rpc->srpc_replymsg; - srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; - srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; + struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; LASSERT (sv->sv_id == SRPC_SERVICE_PING); @@ -207,7 +207,8 @@ ping_server_handle(struct srpc_server_rpc *rpc) return 0; } -sfw_test_client_ops_t ping_test_client; +struct sfw_test_client_ops ping_test_client; + void ping_init_test_client(void) { ping_test_client.tso_init = ping_client_init; @@ -216,7 +217,8 @@ void ping_init_test_client(void) ping_test_client.tso_done_rpc = ping_client_done_rpc; } -srpc_service_t ping_test_service; +struct srpc_service ping_test_service; + void ping_init_test_service(void) { ping_test_service.sv_id = SRPC_SERVICE_PING; diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c index ed88dfeac7085..b1cc58926acb3 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,21 +42,21 @@ #include "selftest.h" -typedef enum { - SRPC_STATE_NONE, - SRPC_STATE_NI_INIT, - SRPC_STATE_EQ_INIT, - SRPC_STATE_RUNNING, - SRPC_STATE_STOPPING, -} srpc_state_t; +enum srpc_state { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +}; static struct smoketest_rpc { spinlock_t rpc_glock; /* global lock */ - srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; - struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ - srpc_state_t rpc_state; + struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ + enum srpc_state rpc_state; struct srpc_counters rpc_counters; - __u64 rpc_matchbits; /* matchbits counter */ + __u64 rpc_matchbits; /* matchbits counter */ } srpc_data; static inline int @@ -67,7 +67,7 @@ srpc_serv_portal(int svc_id) } /* forward ref's */ -int srpc_handle_rpc(swi_workitem_t *wi); +static int srpc_handle_rpc(struct swi_workitem *wi); void srpc_get_counters(struct srpc_counters *cnt) { @@ -84,7 +84,8 @@ void srpc_set_counters(const struct srpc_counters *cnt) } static int -srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int off, int nob) +srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, + int nob) { LASSERT(off < PAGE_SIZE); LASSERT(nob > 0 && nob <= PAGE_SIZE); @@ -96,48 +97,49 @@ srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int off, int nob) } void -srpc_free_bulk (srpc_bulk_t *bk) +srpc_free_bulk(struct srpc_bulk *bk) { - int i; + int i; struct page *pg; - LASSERT (bk != NULL); + LASSERT(bk != NULL); - for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].kiov_page; - if (pg == NULL) break; + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) + break; __free_page(pg); - } + } - LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); - return; + LIBCFS_FREE(bk, offsetof(struct srpc_bulk, bk_iovs[bk->bk_niov])); + return; } -srpc_bulk_t * +struct srpc_bulk * srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, unsigned bulk_len, int sink) { - srpc_bulk_t *bk; - int i; + struct srpc_bulk *bk; + int i; LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, - offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); if (bk == NULL) { CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); return NULL; } - memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); bk->bk_sink = sink; bk->bk_len = bulk_len; bk->bk_niov = bulk_npg; for (i = 0; i < bulk_npg; i++) { struct page *pg; - int nob; + int nob; pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); if (pg == NULL) { @@ -190,11 +192,11 @@ srpc_init_server_rpc(struct srpc_server_rpc *rpc, static void srpc_service_fini(struct srpc_service *svc) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - struct srpc_buffer *buf; - struct list_head *q; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; if (svc->sv_cpt_data == NULL) return; @@ -210,8 +212,8 @@ srpc_service_fini(struct srpc_service *svc) while (!list_empty(q)) { buf = list_entry(q->next, - struct srpc_buffer, - buf_list); + struct srpc_buffer, + buf_list); list_del(&buf->buf_list); LIBCFS_FREE(buf, sizeof(*buf)); } @@ 
-221,8 +223,8 @@ srpc_service_fini(struct srpc_service *svc) while (!list_empty(&scd->scd_rpc_free)) { rpc = list_entry(scd->scd_rpc_free.next, - struct srpc_server_rpc, - srpc_list); + struct srpc_server_rpc, + srpc_list); list_del(&rpc->srpc_list); LIBCFS_FREE(rpc, sizeof(*rpc)); } @@ -246,11 +248,11 @@ int srpc_add_buffer(struct swi_workitem *wi); static int srpc_service_init(struct srpc_service *svc) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int nrpcs; - int i; - int j; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; svc->sv_shuttingdown = 0; @@ -327,13 +329,13 @@ srpc_add_service(struct srpc_service *sv) CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); return 0; - failed: +failed: srpc_service_fini(sv); return -EBUSY; } int -srpc_remove_service (srpc_service_t *sv) +srpc_remove_service(struct srpc_service *sv) { int id = sv->sv_id; @@ -352,98 +354,100 @@ srpc_remove_service (srpc_service_t *sv) static int srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, int len, int options, struct lnet_process_id peer, - struct lnet_handle_md *mdh, srpc_event_t *ev) + struct lnet_handle_md *mdh, struct srpc_event *ev) { - int rc; - struct lnet_md md; + int rc; + struct lnet_md md; struct lnet_handle_me meh; rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); - if (rc != 0) { - CERROR ("LNetMEAttach failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - return -ENOMEM; - } - - md.threshold = 1; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.options = options; - md.eq_handle = srpc_data.rpc_lnet_eq; - - rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); - if (rc != 0) { - CERROR ("LNetMDAttach failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - - rc = LNetMEUnlink(meh); - LASSERT (rc == 0); - return -ENOMEM; - } - - CDEBUG (D_NET, - "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - return 0; + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT(rc == 0); + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; } static int srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, - int options, struct lnet_process_id peer, lnet_nid_t self, - struct lnet_handle_md *mdh, srpc_event_t *ev) + int options, struct lnet_process_id peer, + lnet_nid_t self, struct lnet_handle_md *mdh, + struct srpc_event *ev) { int rc; struct lnet_md md; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.eq_handle = srpc_data.rpc_lnet_eq; - md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; - md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); - - rc = LNetMDBind(md, LNET_UNLINK, mdh); - if (rc != 0) { - CERROR ("LNetMDBind failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - return -ENOMEM; - } - - /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. - * they're only meaningful for MDs attached to an ME (i.e. passive - * buffers... 
*/ - if ((options & LNET_MD_OP_PUT) != 0) { - rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, - portal, matchbits, 0, 0); - } else { - LASSERT ((options & LNET_MD_OP_GET) != 0); - - rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); - } - - if (rc != 0) { - CERROR ("LNet%s(%s, %d, %lld) failed: %d\n", - ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", - libcfs_id2str(peer), portal, matchbits, rc); - - /* The forthcoming unlink event will complete this operation - * with failure, so fall through and return success here. - */ - rc = LNetMDUnlink(*mdh); - LASSERT (rc == 0); - } else { - CDEBUG (D_NET, - "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - } - return 0; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... + */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0, false); + } + + if (rc != 0) { + CERROR("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. 
+ */ + rc = LNetMDUnlink(*mdh); + LASSERT(rc == 0); + } else { + CDEBUG(D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; } static int srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, - struct lnet_handle_md *mdh, srpc_event_t *ev) + struct lnet_handle_md *mdh, struct srpc_event *ev) { struct lnet_process_id any = {0}; @@ -459,9 +463,9 @@ static int srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) __must_hold(&scd->scd_lock) { - struct srpc_service *sv = scd->scd_svc; - struct srpc_msg *msg = &buf->buf_msg; - int rc; + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; LNetInvalidateMDHandle(&buf->buf_mdh); list_add(&buf->buf_list, &scd->scd_buf_posted); @@ -507,9 +511,10 @@ __must_hold(&scd->scd_lock) int srpc_add_buffer(struct swi_workitem *wi) { - struct srpc_service_cd *scd = wi->swi_workitem.wi_data; - struct srpc_buffer *buf; - int rc = 0; + struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, + scd_buf_wi); + struct srpc_buffer *buf; + int rc = 0; /* it's called by workitem scheduler threads, these threads * should have been set CPT affinity, so buffers will be posted @@ -553,7 +558,7 @@ srpc_add_buffer(struct swi_workitem *wi) } if (rc != 0) { - scd->scd_buf_err_stamp = cfs_time_current_sec(); + scd->scd_buf_err_stamp = ktime_get_real_seconds(); scd->scd_buf_err = rc; LASSERT(scd->scd_buf_posting > 0); @@ -567,9 +572,9 @@ srpc_add_buffer(struct swi_workitem *wi) int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) { - struct srpc_service_cd *scd; - int rc = 0; - int i; + struct srpc_service_cd *scd; + int rc = 0; + int i; LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); @@ -621,9 +626,9 @@ srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) { - struct srpc_service_cd *scd; - int num; - int i; + struct srpc_service_cd *scd; + int num; + int i; LASSERT(!sv->sv_shuttingdown); @@ -641,9 +646,9 @@ srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) int srpc_finish_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ @@ -655,8 +660,8 @@ srpc_finish_service(struct srpc_service *sv) } if (scd->scd_buf_nposted > 0) { - CDEBUG(D_NET, "waiting for %d posted buffers to " - "unlink\n", scd->scd_buf_nposted); + CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", + scd->scd_buf_nposted); spin_unlock(&scd->scd_lock); return 0; } @@ -667,10 +672,8 @@ srpc_finish_service(struct srpc_service *sv) } rpc = list_entry(scd->scd_rpc_active.next, - struct srpc_server_rpc, srpc_list); - CNETERR("Active RPC %p on shutdown: sv %s, peer %s, " - "wi %s scheduled %d running %d, " - "ev fired %d type %d status %d lnet %d\n", + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), swi_state2str(rpc->srpc_wi.swi_state), rpc->srpc_wi.swi_workitem.wi_scheduled, @@ -688,7 +691,8 @@ srpc_finish_service(struct srpc_service *sv) /* called with sv->sv_lock held */ static void -srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) 
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, + struct srpc_buffer *buf) __must_hold(&scd->scd_lock) { if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { @@ -721,9 +725,9 @@ __must_hold(&scd->scd_lock) void srpc_abort_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; CDEBUG(D_NET, "Aborting service: id %d, name %s\n", sv->sv_id, sv->sv_name); @@ -733,7 +737,8 @@ srpc_abort_service(struct srpc_service *sv) /* schedule in-flight RPCs to notice the abort, NB: * racing with incoming RPCs; complete fix should make test - * RPCs carry session ID in its headers */ + * RPCs carry session ID in its headers + */ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { rpc->srpc_aborted = 1; swi_schedule_workitem(&rpc->srpc_wi); @@ -744,12 +749,12 @@ srpc_abort_service(struct srpc_service *sv) } void -srpc_shutdown_service(srpc_service_t *sv) +srpc_shutdown_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - srpc_buffer_t *buf; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + int i; CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", sv->sv_id, sv->sv_name); @@ -772,135 +777,139 @@ srpc_shutdown_service(srpc_service_t *sv) spin_unlock(&scd->scd_lock); /* OK to traverse scd_buf_posted without lock, since no one - * touches scd_buf_posted now */ + * touches scd_buf_posted now + */ list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) LNetMDUnlink(buf->buf_mdh); } } static int -srpc_send_request (srpc_client_rpc_t *rpc) +srpc_send_request(struct srpc_client_rpc *rpc) { - srpc_event_t *ev = &rpc->crpc_reqstev; - int rc; + struct srpc_event *ev = &rpc->crpc_reqstev; + int rc; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REQUEST_SENT; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), rpc->crpc_service, &rpc->crpc_reqstmsg, - sizeof(srpc_msg_t), LNET_MD_OP_PUT, + sizeof(struct srpc_msg), LNET_MD_OP_PUT, rpc->crpc_dest, LNET_NID_ANY, &rpc->crpc_reqstmdh, ev); - if (rc != 0) { - LASSERT (rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_prepare_reply (srpc_client_rpc_t *rpc) +srpc_prepare_reply(struct srpc_client_rpc *rpc) { - srpc_event_t *ev = &rpc->crpc_replyev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; - int rc; + struct srpc_event *ev = &rpc->crpc_replyev; + u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_RCVD; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; - *id = srpc_next_id(); + *id = srpc_next_id(); rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &rpc->crpc_replymsg, sizeof(srpc_msg_t), - LNET_MD_OP_PUT, rpc->crpc_dest, - &rpc->crpc_replymdh, ev); - if (rc != 0) { - LASSERT (rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + &rpc->crpc_replymsg, + sizeof(struct srpc_msg), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_prepare_bulk (srpc_client_rpc_t *rpc) 
+srpc_prepare_bulk(struct srpc_client_rpc *rpc) { - srpc_bulk_t *bk = &rpc->crpc_bulk; - srpc_event_t *ev = &rpc->crpc_bulkev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; - int rc; - int opt; + struct srpc_bulk *bk = &rpc->crpc_bulk; + struct srpc_event *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; - LASSERT (bk->bk_niov <= LNET_MAX_IOV); + LASSERT(bk->bk_niov <= LNET_MAX_IOV); - if (bk->bk_niov == 0) return 0; /* nothing to do */ + /* nothing to do */ + if (bk->bk_niov == 0) + return 0; - opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; - opt |= LNET_MD_KIOV; + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_BULK_REQ_RCVD; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; - *id = srpc_next_id(); + *id = srpc_next_id(); rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->crpc_dest, &bk->bk_mdh, ev); - if (rc != 0) { - LASSERT (rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_do_bulk (srpc_server_rpc_t *rpc) +srpc_do_bulk(struct srpc_server_rpc *rpc) { - srpc_event_t *ev = &rpc->srpc_ev; - srpc_bulk_t *bk = rpc->srpc_bulk; - __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; - int rc; - int opt; - - LASSERT (bk != NULL); - - opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; - opt |= LNET_MD_KIOV; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->srpc_peer, rpc->srpc_self, - &bk->bk_mdh, ev); - if (rc != 0) - ev->ev_fired = 1; /* no more event expected */ - return rc; + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_bulk *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; } /* only called from srpc_handle_rpc */ static void -srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) +srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) { struct srpc_service_cd *scd = rpc->srpc_scd; struct srpc_service *sv = scd->scd_svc; - srpc_buffer_t *buffer; + struct srpc_buffer *buffer; - LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); - rpc->srpc_status = status; + rpc->srpc_status = status; - CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR, - "Server RPC %p done: service %s, peer %s, status %s:%d\n", - rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), - swi_state2str(rpc->srpc_wi.swi_state), status); + CDEBUG_LIMIT(status == 0 ? 
D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); - if (status != 0) { + if (status != 0) { spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_dropped++; spin_unlock(&srpc_data.rpc_glock); @@ -914,7 +923,8 @@ srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) if (rpc->srpc_reqstbuf != NULL) { /* NB might drop sv_lock in srpc_service_recycle_buffer, but - * sv won't go away for scd_rpc_active must not be empty */ + * sv won't go away for scd_rpc_active must not be empty + */ srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); rpc->srpc_reqstbuf = NULL; } @@ -932,7 +942,7 @@ srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { buffer = list_entry(scd->scd_buf_blocked.next, - srpc_buffer_t, buf_list); + struct srpc_buffer, buf_list); list_del(&buffer->buf_list); srpc_init_server_rpc(rpc, scd, buffer); @@ -947,14 +957,14 @@ srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) } /* handles an incoming RPC */ -int -srpc_handle_rpc(swi_workitem_t *wi) +static int srpc_handle_rpc(struct swi_workitem *wi) { - struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - srpc_event_t *ev = &rpc->srpc_ev; - int rc = 0; + struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, + srpc_wi); + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + struct srpc_event *ev = &rpc->srpc_ev; + int rc = 0; LASSERT(wi == &rpc->srpc_wi); @@ -963,31 +973,32 @@ srpc_handle_rpc(swi_workitem_t *wi) if (sv->sv_shuttingdown || rpc->srpc_aborted) { spin_unlock(&scd->scd_lock); - if (rpc->srpc_bulk != NULL) - LNetMDUnlink(rpc->srpc_bulk->bk_mdh); - LNetMDUnlink(rpc->srpc_replymdh); + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); - if (ev->ev_fired) { /* no more event, OK to finish */ - srpc_server_rpc_done(rpc, -ESHUTDOWN); - return 1; - } - return 0; - } + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } spin_unlock(&scd->scd_lock); - switch (wi->swi_state) { - default: - LBUG (); - case SWI_STATE_NEWBORN: { - srpc_msg_t *msg; - srpc_generic_reply_t *reply; + switch (wi->swi_state) { + default: + LBUG(); + /* fallthrough */ + case SWI_STATE_NEWBORN: { + struct srpc_msg *msg; + struct srpc_generic_reply *reply; - msg = &rpc->srpc_reqstbuf->buf_msg; - reply = &rpc->srpc_replymsg.msg_body.reply; + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; - if (msg->msg_magic == 0) { - /* moaned already in srpc_lnet_ev_handler */ + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ srpc_server_rpc_done(rpc, EBADMSG); return 1; } @@ -1007,67 +1018,67 @@ srpc_handle_rpc(swi_workitem_t *wi) srpc_server_rpc_done(rpc, rc); return 1; } - } - - wi->swi_state = SWI_STATE_BULK_STARTED; - - if (rpc->srpc_bulk != NULL) { - rc = srpc_do_bulk(rpc); - if (rc == 0) - return 0; /* wait for bulk */ - - LASSERT (ev->ev_fired); - ev->ev_status = rc; - } - } - /* Fall through */ - case SWI_STATE_BULK_STARTED: - LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); - - if (rpc->srpc_bulk != NULL) { - rc = ev->ev_status; - - if (sv->sv_bulk_ready != NULL) - rc = (*sv->sv_bulk_ready) (rpc, rc); - - if (rc != 0) { - 
srpc_server_rpc_done(rpc, rc); - return 1; - } - } - - wi->swi_state = SWI_STATE_REPLY_SUBMITTED; - rc = srpc_send_reply(rpc); - if (rc == 0) - return 0; /* wait for reply */ - srpc_server_rpc_done(rpc, rc); - return 1; - - case SWI_STATE_REPLY_SUBMITTED: - if (!ev->ev_fired) { - CERROR("RPC %p: bulk %p, service %d\n", + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + /* fallthrough */ + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", rpc, rpc->srpc_bulk, sv->sv_id); - CERROR("Event: status %d, type %d, lnet %d\n", - ev->ev_status, ev->ev_type, ev->ev_lnet); - LASSERT (ev->ev_fired); - } + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } - wi->swi_state = SWI_STATE_DONE; - srpc_server_rpc_done(rpc, ev->ev_status); - return 1; - } + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } - return 0; + return 0; } static void srpc_client_rpc_expired (void *data) { - srpc_client_rpc_t *rpc = data; + struct srpc_client_rpc *rpc = data; - CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - rpc->crpc_timeout); + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); spin_lock(&rpc->crpc_lock); @@ -1082,9 +1093,9 @@ srpc_client_rpc_expired (void *data) } static void -srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) +srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) { - stt_timer_t *timer = &rpc->crpc_timer; + struct stt_timer *timer = &rpc->crpc_timer; if (rpc->crpc_timeout == 0) return; @@ -1092,8 +1103,7 @@ srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) INIT_LIST_HEAD(&timer->stt_list); timer->stt_data = rpc; timer->stt_func = srpc_client_rpc_expired; - timer->stt_expires = cfs_time_add(rpc->crpc_timeout, - cfs_time_current_sec()); + timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; stt_add_timer(timer); return; } @@ -1102,9 +1112,10 @@ srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) * Called with rpc->crpc_lock held. * * Upon exit the RPC expiry timer is not queued and the handler is not - * running on any CPU. */ + * running on any CPU. 
+ */ static void -srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) +srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) { /* timer not planted or already exploded */ if (rpc->crpc_timeout == 0) @@ -1125,34 +1136,34 @@ srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) } static void -srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) +srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) { - swi_workitem_t *wi = &rpc->crpc_wi; + struct swi_workitem *wi = &rpc->crpc_wi; LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); spin_lock(&rpc->crpc_lock); - rpc->crpc_closed = 1; - if (rpc->crpc_status == 0) - rpc->crpc_status = status; - - srpc_del_client_rpc_timer(rpc); - - CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR, - "Client RPC done: service %d, peer %s, status %s:%d:%d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(wi->swi_state), rpc->crpc_aborted, status); - - /* - * No one can schedule me now since: - * - RPC timer has been defused. - * - all LNet events have been fired. - * - crpc_closed has been set, preventing srpc_abort_rpc from - * scheduling me. - * Cancel pending schedules and prevent future schedule attempts: - */ - LASSERT (!srpc_event_pending(rpc)); + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); swi_exit_workitem(wi); spin_unlock(&rpc->crpc_lock); @@ -1163,19 +1174,19 @@ srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) /* sends an outgoing RPC */ int -srpc_send_rpc (swi_workitem_t *wi) +srpc_send_rpc(struct swi_workitem *wi) { - int rc = 0; - srpc_client_rpc_t *rpc; - srpc_msg_t *reply; - int do_bulk; + int rc = 0; + struct srpc_client_rpc *rpc; + struct srpc_msg *reply; + int do_bulk; LASSERT(wi != NULL); rpc = wi->swi_workitem.wi_data; - LASSERT (rpc != NULL); - LASSERT (wi == &rpc->crpc_wi); + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); reply = &rpc->crpc_replymsg; do_bulk = rpc->crpc_bulk.bk_niov > 0; @@ -1189,86 +1200,93 @@ srpc_send_rpc (swi_workitem_t *wi) spin_unlock(&rpc->crpc_lock); - switch (wi->swi_state) { - default: - LBUG (); - case SWI_STATE_NEWBORN: - LASSERT (!srpc_event_pending(rpc)); + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } - rc = srpc_prepare_reply(rpc); - if (rc != 0) { - srpc_client_rpc_done(rpc, rc); - return 1; - } + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; - rc = srpc_prepare_bulk(rpc); - if (rc != 0) break; + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; - wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; - rc = srpc_send_request(rpc); - break; + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. 
+ */ + if (!rpc->crpc_reqstev.ev_fired) + break; - case SWI_STATE_REQUEST_SUBMITTED: - /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any - * order; however, they're processed in a strict order: - * rqt, rpy, and bulk. */ - if (!rpc->crpc_reqstev.ev_fired) break; + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; - rc = rpc->crpc_reqstev.ev_status; - if (rc != 0) break; + wi->swi_state = SWI_STATE_REQUEST_SENT; + /* fallthrough */ + case SWI_STATE_REQUEST_SENT: { + enum srpc_msg_type type; - wi->swi_state = SWI_STATE_REQUEST_SENT; - /* perhaps more events, fall thru */ - /* Fall through */ - case SWI_STATE_REQUEST_SENT: { - srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); + type = srpc_service2reply(rpc->crpc_service); - if (!rpc->crpc_replyev.ev_fired) break; + if (!rpc->crpc_replyev.ev_fired) + break; - rc = rpc->crpc_replyev.ev_status; - if (rc != 0) break; + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; srpc_unpack_msg_hdr(reply); if (reply->msg_type != type || (reply->msg_magic != SRPC_MSG_MAGIC && reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CWARN ("Bad message from %s: type %u (%d expected)," - " magic %u (%d expected).\n", - libcfs_id2str(rpc->crpc_dest), - reply->msg_type, type, - reply->msg_magic, SRPC_MSG_MAGIC); - rc = -EBADMSG; - break; - } - - if (do_bulk && reply->msg_body.reply.status != 0) { - CWARN ("Remote error %d at %s, unlink bulk buffer in " - "case peer didn't initiate bulk transfer\n", - reply->msg_body.reply.status, - libcfs_id2str(rpc->crpc_dest)); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - } - - wi->swi_state = SWI_STATE_REPLY_RECEIVED; - } - /* Fall through */ - case SWI_STATE_REPLY_RECEIVED: - if (do_bulk && !rpc->crpc_bulkev.ev_fired) break; - - rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; - - /* Bulk buffer was unlinked due to remote error. Clear error - * since reply buffer still contains valid data. - * NB rpc->crpc_done shouldn't look into bulk data in case of - * remote error. */ - if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && - rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) - rc = 0; - - wi->swi_state = SWI_STATE_DONE; - srpc_client_rpc_done(rpc, rc); - return 1; - } + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + /* fallthrough */ + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
+ */ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } if (rc != 0) { spin_lock(&rpc->crpc_lock); @@ -1277,85 +1295,85 @@ srpc_send_rpc (swi_workitem_t *wi) } abort: - if (rpc->crpc_aborted) { - LNetMDUnlink(rpc->crpc_reqstmdh); - LNetMDUnlink(rpc->crpc_replymdh); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - - if (!srpc_event_pending(rpc)) { - srpc_client_rpc_done(rpc, -EINTR); - return 1; - } - } - return 0; + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; } -srpc_client_rpc_t * +struct srpc_client_rpc * srpc_create_client_rpc(struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(srpc_client_rpc_t *), - void (*rpc_fini)(srpc_client_rpc_t *), void *priv) + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) { - srpc_client_rpc_t *rpc; + struct srpc_client_rpc *rpc; - LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, - crpc_bulk.bk_iovs[nbulkiov])); - if (rpc == NULL) - return NULL; + LIBCFS_ALLOC(rpc, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; - srpc_init_client_rpc(rpc, peer, service, nbulkiov, - bulklen, rpc_done, rpc_fini, priv); - return rpc; + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; } /* called with rpc->crpc_lock held */ void -srpc_abort_rpc (srpc_client_rpc_t *rpc, int why) +srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) { - LASSERT (why != 0); + LASSERT(why != 0); - if (rpc->crpc_aborted || /* already aborted */ - rpc->crpc_closed) /* callback imminent */ - return; + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; - CDEBUG (D_NET, - "Aborting RPC: service %d, peer %s, state %s, why %d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(rpc->crpc_wi.swi_state), why); + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); - rpc->crpc_aborted = 1; - rpc->crpc_status = why; - swi_schedule_workitem(&rpc->crpc_wi); - return; + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; } /* called with rpc->crpc_lock held */ void -srpc_post_rpc (srpc_client_rpc_t *rpc) +srpc_post_rpc(struct srpc_client_rpc *rpc) { - LASSERT (!rpc->crpc_aborted); - LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING); + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); - CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", - libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, - rpc->crpc_timeout); + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); - srpc_add_client_rpc_timer(rpc); - swi_schedule_workitem(&rpc->crpc_wi); - return; + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; } int srpc_send_reply(struct srpc_server_rpc *rpc) { - srpc_event_t *ev = &rpc->srpc_ev; - struct srpc_msg *msg = &rpc->srpc_replymsg; - struct srpc_buffer *buffer = 
rpc->srpc_reqstbuf; - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - __u64 rpyid; - int rc; + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; LASSERT(buffer != NULL); rpyid = buffer->buf_msg.msg_body.reqst.rpyid; @@ -1364,7 +1382,8 @@ srpc_send_reply(struct srpc_server_rpc *rpc) if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { /* Repost buffer before replying since test client - * might send me another RPC once it gets the reply */ + * might send me another RPC once it gets the reply + */ if (srpc_service_post_buffer(scd, buffer) != 0) CWARN("Failed to repost %s buffer\n", sv->sv_name); rpc->srpc_reqstbuf = NULL; @@ -1372,37 +1391,37 @@ srpc_send_reply(struct srpc_server_rpc *rpc) spin_unlock(&scd->scd_lock); - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_SENT; - - msg->msg_magic = SRPC_MSG_MAGIC; - msg->msg_version = SRPC_MSG_VERSION; - msg->msg_type = srpc_service2reply(sv->sv_id); - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, - sizeof(*msg), LNET_MD_OP_PUT, - rpc->srpc_peer, rpc->srpc_self, - &rpc->srpc_replymdh, ev); - if (rc != 0) - ev->ev_fired = 1; /* no more event expected */ - return rc; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; } /* when in kernel always called with LNET_LOCK() held, and in thread context */ static void srpc_lnet_ev_handler(struct lnet_event *ev) { - struct srpc_service_cd *scd; - srpc_event_t *rpcev = ev->md.user_ptr; - srpc_client_rpc_t *crpc; - srpc_server_rpc_t *srpc; - srpc_buffer_t *buffer; - srpc_service_t *sv; - srpc_msg_t *msg; - srpc_msg_type_t type; + struct srpc_service_cd *scd; + struct srpc_event *rpcev = ev->md.user_ptr; + struct srpc_client_rpc *crpc; + struct srpc_server_rpc *srpc; + struct srpc_buffer *buffer; + struct srpc_service *sv; + struct srpc_msg *msg; + enum srpc_msg_type type; - LASSERT (!in_interrupt()); + LASSERT(!in_interrupt()); if (ev->status != 0) { __u32 errors; @@ -1417,41 +1436,43 @@ srpc_lnet_ev_handler(struct lnet_event *ev) ev->status, ev->type, errors); } - rpcev->ev_lnet = ev->type; + rpcev->ev_lnet = ev->type; - switch (rpcev->ev_type) { - default: - CERROR("Unknown event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG (); - case SRPC_REQUEST_SENT: - if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + /* fallthrough */ + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_sent++; spin_unlock(&srpc_data.rpc_glock); - } - /* Fall through */ - case SRPC_REPLY_RCVD: - case SRPC_BULK_REQ_RCVD: - crpc = rpcev->ev_data; - - if (rpcev != &crpc->crpc_reqstev && - rpcev != &crpc->crpc_replyev && - rpcev != &crpc->crpc_bulkev) { - CERROR("rpcev %p, crpc %p, reqstev %p, 
replyev %p, bulkev %p\n", - rpcev, crpc, &crpc->crpc_reqstev, - &crpc->crpc_replyev, &crpc->crpc_bulkev); - CERROR("Bad event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG (); - } + } + /* fallthrough */ + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, + rpcev->ev_lnet); + LBUG(); + } spin_lock(&crpc->crpc_lock); LASSERT(rpcev->ev_fired == 0); rpcev->ev_fired = 1; rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? - -EINTR : ev->status; + -EINTR : ev->status; swi_schedule_workitem(&crpc->crpc_wi); spin_unlock(&crpc->crpc_lock); @@ -1465,28 +1486,30 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_lock(&scd->scd_lock); - LASSERT (ev->unlinked); - LASSERT (ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT (ev->type != LNET_EVENT_UNLINK || - sv->sv_shuttingdown); + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); - buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); + buffer = container_of(ev->md.start, struct srpc_buffer, + buf_msg); buffer->buf_peer = ev->source; - buffer->buf_self = ev->target.nid; + buffer->buf_self = ev->target.nid; LASSERT(scd->scd_buf_nposted > 0); scd->scd_buf_nposted--; if (sv->sv_shuttingdown) { /* Leave buffer on scd->scd_buf_nposted since - * srpc_finish_service needs to traverse it. */ + * srpc_finish_service needs to traverse it. + */ spin_unlock(&scd->scd_lock); break; } if (scd->scd_buf_err_stamp != 0 && - scd->scd_buf_err_stamp < cfs_time_current_sec()) { + scd->scd_buf_err_stamp < ktime_get_real_seconds()) { /* re-enable adding buffer */ scd->scd_buf_err_stamp = 0; scd->scd_buf_err = 0; @@ -1504,22 +1527,22 @@ srpc_lnet_ev_handler(struct lnet_event *ev) msg = &buffer->buf_msg; type = srpc_service2request(sv->sv_id); - if (ev->status != 0 || ev->mlength != sizeof(*msg) || - (msg->msg_type != type && - msg->msg_type != __swab32(type)) || - (msg->msg_magic != SRPC_MSG_MAGIC && - msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CERROR ("Dropping RPC (%s) from %s: " - "status %d mlength %d type %u magic %u.\n", - sv->sv_name, libcfs_id2str(ev->initiator), - ev->status, ev->mlength, - msg->msg_type, msg->msg_magic); - - /* NB can't call srpc_service_recycle_buffer here since - * it may call LNetM[DE]Attach. The invalid magic tells - * srpc_handle_rpc to drop this RPC */ - msg->msg_magic = 0; - } + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC + */ + msg->msg_magic = 0; + } if (!list_empty(&scd->scd_rpc_free)) { srpc = list_entry(scd->scd_rpc_free.next, @@ -1541,19 +1564,18 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_rcvd++; spin_unlock(&srpc_data.rpc_glock); - break; - - case SRPC_BULK_GET_RPLD: - LASSERT (ev->type == LNET_EVENT_SEND || - ev->type == LNET_EVENT_REPLY || - ev->type == LNET_EVENT_UNLINK); + break; - if (!ev->unlinked) - break; /* wait for final event */ - /* Fall through */ + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); - case SRPC_BULK_PUT_SENT: - if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + if (!ev->unlinked) + break; /* wait for final event */ + /* fallthrough */ + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); if (rpcev->ev_type == SRPC_BULK_GET_RPLD) @@ -1563,7 +1585,7 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_unlock(&srpc_data.rpc_glock); } - /* Fall through */ + /* fallthrough */ case SRPC_REPLY_SENT: srpc = rpcev->ev_data; scd = srpc->srpc_scd; @@ -1594,84 +1616,84 @@ srpc_startup (void) /* 1 second pause to avoid timestamp reuse */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); - srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48; + srpc_data.rpc_matchbits = ((__u64) ktime_get_real_seconds()) << 48; srpc_data.rpc_state = SRPC_STATE_NONE; rc = LNetNIInit(LNET_PID_LUSTRE); - if (rc < 0) { - CERROR ("LNetNIInit() has failed: %d\n", rc); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); return rc; - } + } - srpc_data.rpc_state = SRPC_STATE_NI_INIT; + srpc_data.rpc_state = SRPC_STATE_NI_INIT; LNetInvalidateEQHandle(&srpc_data.rpc_lnet_eq); rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); - if (rc != 0) { - CERROR("LNetEQAlloc() has failed: %d\n", rc); - goto bail; - } + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); LASSERT(rc == 0); rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); LASSERT(rc == 0); - srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; - rc = stt_startup(); + rc = stt_startup(); bail: - if (rc != 0) - srpc_shutdown(); - else - srpc_data.rpc_state = SRPC_STATE_RUNNING; + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; - return rc; + return rc; } void srpc_shutdown (void) { - int i; - int rc; - int state; + int i; + int rc; + int state; - state = srpc_data.rpc_state; - srpc_data.rpc_state = SRPC_STATE_STOPPING; + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; - switch (state) { - default: - LBUG (); - case SRPC_STATE_RUNNING: + switch (state) { + default: + LBUG(); + /* fallthrough */ + case SRPC_STATE_RUNNING: spin_lock(&srpc_data.rpc_glock); - for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { - srpc_service_t *sv = srpc_data.rpc_services[i]; + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + struct srpc_service *sv = srpc_data.rpc_services[i]; - LASSERTF (sv == NULL, - "service not empty: id %d, name %s\n", - i, sv->sv_name); - } + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } spin_unlock(&srpc_data.rpc_glock); - stt_shutdown(); - /* Fall through */ + stt_shutdown(); + /* fallthrough */ - case 
SRPC_STATE_EQ_INIT: - rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); - LASSERT (rc == 0); - rc = LNetEQFree(srpc_data.rpc_lnet_eq); - LASSERT (rc == 0); /* the EQ should have no user by now */ - /* Fall through */ + LASSERT(rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT(rc == 0); /* the EQ should have no user by now */ + /* fallthrough */ - case SRPC_STATE_NI_INIT: - LNetNIFini(); - /* Fall through */ - } + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } - return; + return; } diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h b/drivers/staging/lustrefsx/lnet/selftest/rpc.h index aab2629e7ba1d..8cc8c434645d5 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -33,14 +33,14 @@ #ifndef __SELFTEST_RPC_H__ #define __SELFTEST_RPC_H__ -#include +#include /* * LST wired structures - * + * * XXX: *REPLY == *REQST + 1 */ -typedef enum { +enum srpc_msg_type { SRPC_MSG_MKSN_REQST = 0, SRPC_MSG_MKSN_REPLY = 1, SRPC_MSG_RMSN_REQST = 2, @@ -59,118 +59,118 @@ typedef enum { SRPC_MSG_PING_REPLY = 15, SRPC_MSG_JOIN_REQST = 16, SRPC_MSG_JOIN_REPLY = 17, -} srpc_msg_type_t; +}; /* CAVEAT EMPTOR: - * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, + * All struct srpc_*_reqst's 1st field must be matchbits of reply buffer, * and 2nd field matchbits of bulk buffer if any. * - * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field + * All struct srpc_*_reply's 1st field must be a __u32 status, and 2nd field * session id if needed. */ -typedef struct { +struct srpc_generic_reqst { __u64 rpyid; /* reply buffer matchbits */ __u64 bulkid; /* bulk buffer matchbits */ -} WIRE_ATTR srpc_generic_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_generic_reply { __u32 status; struct lst_sid sid; -} WIRE_ATTR srpc_generic_reply_t; +} WIRE_ATTR; /* FRAMEWORK RPCs */ -typedef struct { +struct srpc_mksn_reqst { __u64 mksn_rpyid; /* reply buffer matchbits */ struct lst_sid mksn_sid; /* session id */ __u32 mksn_force; /* use brute force */ char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ +} WIRE_ATTR; /* make session request */ -typedef struct { +struct srpc_mksn_reply { __u32 mksn_status; /* session status */ struct lst_sid mksn_sid; /* session id */ __u32 mksn_timeout; /* session timeout */ char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ +} WIRE_ATTR; /* make session reply */ -typedef struct { - __u64 rmsn_rpyid; /* reply buffer matchbits */ - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ +struct srpc_rmsn_reqst { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR; /* remove session request */ -typedef struct { +struct srpc_rmsn_reply { __u32 rmsn_status; - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR; /* remove session reply */ -typedef struct { +struct srpc_join_reqst { __u64 join_rpyid; /* reply buffer matchbits */ struct lst_sid join_sid; /* session id to join */ char join_group[LST_NAME_SIZE]; /* group name */ -} WIRE_ATTR srpc_join_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_join_reply { __u32 join_status; /* returned status */ struct lst_sid join_sid; 
/* session id */ __u32 join_timeout; /* # seconds' inactivity to expire */ char join_session[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR srpc_join_reply_t; +} WIRE_ATTR; -typedef struct { +struct srpc_debug_reqst { __u64 dbg_rpyid; /* reply buffer matchbits */ struct lst_sid dbg_sid; /* session id */ __u32 dbg_flags; /* bitmap of debug */ -} WIRE_ATTR srpc_debug_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_debug_reply { __u32 dbg_status; /* returned code */ struct lst_sid dbg_sid; /* session id */ __u32 dbg_timeout; /* session timeout */ __u32 dbg_nbatch; /* # of batches in the node */ char dbg_name[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR srpc_debug_reply_t; +} WIRE_ATTR; #define SRPC_BATCH_OPC_RUN 1 #define SRPC_BATCH_OPC_STOP 2 #define SRPC_BATCH_OPC_QUERY 3 -typedef struct { +struct srpc_batch_reqst { __u64 bar_rpyid; /* reply buffer matchbits */ struct lst_sid bar_sid; /* session id */ struct lst_bid bar_bid; /* batch id */ __u32 bar_opc; /* create/start/stop batch */ __u32 bar_testidx; /* index of test */ __u32 bar_arg; /* parameters */ -} WIRE_ATTR srpc_batch_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_batch_reply { __u32 bar_status; /* status of request */ struct lst_sid bar_sid; /* session id */ __u32 bar_active; /* # of active tests in batch/test */ __u32 bar_time; /* remained time */ -} WIRE_ATTR srpc_batch_reply_t; +} WIRE_ATTR; -typedef struct { +struct srpc_stat_reqst { __u64 str_rpyid; /* reply buffer matchbits */ struct lst_sid str_sid; /* session id */ __u32 str_type; /* type of stat */ -} WIRE_ATTR srpc_stat_reqst_t; +} WIRE_ATTR; -typedef struct { - __u32 str_status; - struct lst_sid str_sid; - struct sfw_counters str_fw; - struct srpc_counters str_rpc; - struct lnet_counters str_lnet; -} WIRE_ATTR srpc_stat_reply_t; +struct srpc_stat_reply { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters_common str_lnet; +} WIRE_ATTR; -typedef struct { +struct test_bulk_req { __u32 blk_opc; /* bulk operation code */ __u32 blk_npg; /* # of pages */ __u32 blk_flags; /* reserved flags */ -} WIRE_ATTR test_bulk_req_t; +} WIRE_ATTR; -typedef struct { +struct test_bulk_req_v1 { /** bulk operation code */ __u16 blk_opc; /** data check flags */ @@ -179,14 +179,14 @@ typedef struct { __u32 blk_len; /** bulk offset */ __u32 blk_offset; -} WIRE_ATTR test_bulk_req_v1_t; +} WIRE_ATTR; -typedef struct { +struct test_ping_req { __u32 png_size; /* size of ping message */ __u32 png_flags; /* reserved flags */ -} WIRE_ATTR test_ping_req_t; +} WIRE_ATTR; -typedef struct { +struct srpc_test_reqst { __u64 tsr_rpyid; /* reply buffer matchbits */ __u64 tsr_bulkid; /* bulk buffer matchbits */ struct lst_sid tsr_sid; /* session id */ @@ -200,86 +200,86 @@ typedef struct { __u32 tsr_ndest; /* # of dest nodes */ union { - test_ping_req_t ping; - test_bulk_req_t bulk_v0; - test_bulk_req_v1_t bulk_v1; - } tsr_u; -} WIRE_ATTR srpc_test_reqst_t; + struct test_ping_req ping; + struct test_bulk_req bulk_v0; + struct test_bulk_req_v1 bulk_v1; + } tsr_u; +} WIRE_ATTR; -typedef struct { +struct srpc_test_reply { __u32 tsr_status; /* returned code */ struct lst_sid tsr_sid; -} WIRE_ATTR srpc_test_reply_t; +} WIRE_ATTR; /* TEST RPCs */ -typedef struct { +struct srpc_ping_reqst { __u64 pnr_rpyid; __u32 pnr_magic; __u32 pnr_seq; __u64 pnr_time_sec; __u64 pnr_time_nsec; -} WIRE_ATTR srpc_ping_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_ping_reply { __u32 pnr_status; __u32 pnr_magic; __u32 pnr_seq; -} 
WIRE_ATTR srpc_ping_reply_t; +} WIRE_ATTR; -typedef struct { +struct srpc_brw_reqst { __u64 brw_rpyid; /* reply buffer matchbits */ __u64 brw_bulkid; /* bulk buffer matchbits */ __u32 brw_rw; /* read or write */ __u32 brw_len; /* bulk data len */ __u32 brw_flags; /* bulk data patterns */ -} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ +} WIRE_ATTR; /* bulk r/w request */ -typedef struct { +struct srpc_brw_reply { __u32 brw_status; -} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ +} WIRE_ATTR; /* bulk r/w reply */ #define SRPC_MSG_MAGIC 0xeeb0f00d #define SRPC_MSG_VERSION 1 -typedef struct srpc_msg { +struct srpc_msg { /** magic number */ __u32 msg_magic; /** message version number */ __u32 msg_version; - /** type of message body: srpc_msg_type_t */ + /** type of message body: enum srpc_msg_type */ __u32 msg_type; __u32 msg_reserved0; __u32 msg_reserved1; /** test session features */ __u32 msg_ses_feats; union { - srpc_generic_reqst_t reqst; - srpc_generic_reply_t reply; - - srpc_mksn_reqst_t mksn_reqst; - srpc_mksn_reply_t mksn_reply; - srpc_rmsn_reqst_t rmsn_reqst; - srpc_rmsn_reply_t rmsn_reply; - srpc_debug_reqst_t dbg_reqst; - srpc_debug_reply_t dbg_reply; - srpc_batch_reqst_t bat_reqst; - srpc_batch_reply_t bat_reply; - srpc_stat_reqst_t stat_reqst; - srpc_stat_reply_t stat_reply; - srpc_test_reqst_t tes_reqst; - srpc_test_reply_t tes_reply; - srpc_join_reqst_t join_reqst; - srpc_join_reply_t join_reply; - - srpc_ping_reqst_t ping_reqst; - srpc_ping_reply_t ping_reply; - srpc_brw_reqst_t brw_reqst; - srpc_brw_reply_t brw_reply; - } msg_body; -} WIRE_ATTR srpc_msg_t; + struct srpc_generic_reqst reqst; + struct srpc_generic_reply reply; + + struct srpc_mksn_reqst mksn_reqst; + struct srpc_mksn_reply mksn_reply; + struct srpc_rmsn_reqst rmsn_reqst; + struct srpc_rmsn_reply rmsn_reply; + struct srpc_debug_reqst dbg_reqst; + struct srpc_debug_reply dbg_reply; + struct srpc_batch_reqst bat_reqst; + struct srpc_batch_reply bat_reply; + struct srpc_stat_reqst stat_reqst; + struct srpc_stat_reply stat_reply; + struct srpc_test_reqst tes_reqst; + struct srpc_test_reply tes_reply; + struct srpc_join_reqst join_reqst; + struct srpc_join_reply join_reply; + + struct srpc_ping_reqst ping_reqst; + struct srpc_ping_reply ping_reply; + struct srpc_brw_reqst brw_reqst; + struct srpc_brw_reply brw_reply; + } msg_body; +} WIRE_ATTR; static inline void -srpc_unpack_msg_hdr(srpc_msg_t *msg) +srpc_unpack_msg_hdr(struct srpc_msg *msg) { if (msg->msg_magic == SRPC_MSG_MAGIC) return; /* no flipping needed */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h index 2a29161cd4802..3f7c295e9a90c 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/selftest.h +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include "rpc.h" #include "timer.h" @@ -89,7 +89,7 @@ struct sfw_test_instance; /* all reply/bulk RDMAs go to this portal */ #define SRPC_RDMA_PORTAL 52 -static inline srpc_msg_type_t +static inline enum srpc_msg_type srpc_service2request (int service) { switch (service) { @@ -124,13 +124,13 @@ srpc_service2request (int service) } } -static inline srpc_msg_type_t +static inline enum srpc_msg_type srpc_service2reply (int service) { return srpc_service2request(service) + 1; } -typedef enum { +enum srpc_event_type { SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ @@ -138,73 +138,74 @@ typedef enum { SRPC_REPLY_SENT = 5, /* outgoing reply sent */ SRPC_REQUEST_RCVD = 6, /* incoming request received */ SRPC_REQUEST_SENT = 7, /* outgoing request sent */ -} srpc_event_type_t; +}; /* RPC event */ -typedef struct { - srpc_event_type_t ev_type; /* what's up */ - enum lnet_event_kind ev_lnet; /* LNet event type */ +struct srpc_event { + enum srpc_event_type ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ int ev_fired; /* LNet event fired? */ int ev_status; /* LNet event status */ void *ev_data; /* owning server/client RPC */ -} srpc_event_t; +}; -typedef struct { +/* bulk descriptor */ +struct srpc_bulk { int bk_len; /* len of bulk data */ struct lnet_handle_md bk_mdh; int bk_sink; /* sink/source */ int bk_niov; /* # iov in bk_iovs */ lnet_kiov_t bk_iovs[0]; -} srpc_bulk_t; /* bulk descriptor */ +}; /* message buffer descriptor */ -typedef struct srpc_buffer { +struct srpc_buffer { struct list_head buf_list; /* chain on srpc_service::*_msgq */ - srpc_msg_t buf_msg; + struct srpc_msg buf_msg; struct lnet_handle_md buf_mdh; lnet_nid_t buf_self; struct lnet_process_id buf_peer; -} srpc_buffer_t; +}; struct swi_workitem; -typedef int (*swi_action_t) (struct swi_workitem *); +typedef int (*swi_action_t)(struct swi_workitem *); -typedef struct swi_workitem { +struct swi_workitem { struct cfs_wi_sched *swi_sched; - struct cfs_workitem swi_workitem; + struct cfs_workitem swi_workitem; swi_action_t swi_action; int swi_state; -} swi_workitem_t; +}; /* server-side state of a RPC */ -typedef struct srpc_server_rpc { +struct srpc_server_rpc { /* chain on srpc_service::*_rpcq */ struct list_head srpc_list; struct srpc_service_cd *srpc_scd; - swi_workitem_t srpc_wi; - srpc_event_t srpc_ev; /* bulk/reply event */ + struct swi_workitem srpc_wi; + struct srpc_event srpc_ev; /* bulk/reply event */ lnet_nid_t srpc_self; struct lnet_process_id srpc_peer; - srpc_msg_t srpc_replymsg; + struct srpc_msg srpc_replymsg; struct lnet_handle_md srpc_replymdh; - srpc_buffer_t *srpc_reqstbuf; - srpc_bulk_t *srpc_bulk; + struct srpc_buffer *srpc_reqstbuf; + struct srpc_bulk *srpc_bulk; unsigned int srpc_aborted; /* being given up */ int srpc_status; void (*srpc_done)(struct srpc_server_rpc *); -} srpc_server_rpc_t; +}; /* client-side state of a RPC */ -typedef struct srpc_client_rpc { +struct srpc_client_rpc { struct list_head crpc_list; /* chain on user's lists */ spinlock_t crpc_lock; /* serialize */ int crpc_service; atomic_t crpc_refcount; /* # seconds to wait for reply */ int crpc_timeout; - stt_timer_t crpc_timer; - swi_workitem_t crpc_wi; + struct stt_timer crpc_timer; + struct swi_workitem crpc_wi; struct lnet_process_id 
crpc_dest; void (*crpc_done)(struct srpc_client_rpc *); @@ -216,21 +217,21 @@ typedef struct srpc_client_rpc { unsigned int crpc_aborted:1; /* being given up */ unsigned int crpc_closed:1; /* completed */ - /* RPC events */ - srpc_event_t crpc_bulkev; /* bulk event */ - srpc_event_t crpc_reqstev; /* request event */ - srpc_event_t crpc_replyev; /* reply event */ + /* RPC events */ + struct srpc_event crpc_bulkev; /* bulk event */ + struct srpc_event crpc_reqstev; /* request event */ + struct srpc_event crpc_replyev; /* reply event */ - /* bulk, request(reqst), and reply exchanged on wire */ - srpc_msg_t crpc_reqstmsg; - srpc_msg_t crpc_replymsg; + /* bulk, request(reqst), and reply exchanged on wire */ + struct srpc_msg crpc_reqstmsg; + struct srpc_msg crpc_replymsg; struct lnet_handle_md crpc_reqstmdh; struct lnet_handle_md crpc_replymdh; - srpc_bulk_t crpc_bulk; -} srpc_client_rpc_t; + struct srpc_bulk crpc_bulk; +}; #define srpc_client_rpc_size(rpc) \ -offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) +offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) #define srpc_client_rpc_addref(rpc) \ do { \ @@ -262,19 +263,19 @@ struct srpc_service_cd { /** backref to service */ struct srpc_service *scd_svc; /** event buffer */ - srpc_event_t scd_ev; + struct srpc_event scd_ev; /** free RPC descriptors */ struct list_head scd_rpc_free; /** in-flight RPCs */ struct list_head scd_rpc_active; /** workitem for posting buffer */ - swi_workitem_t scd_buf_wi; + struct swi_workitem scd_buf_wi; /** CPT id */ int scd_cpt; /** error code for scd_buf_wi */ int scd_buf_err; /** timestamp for scd_buf_err */ - unsigned long scd_buf_err_stamp; + time64_t scd_buf_err_stamp; /** total # request buffers */ int scd_buf_total; /** # posted request buffers */ @@ -302,7 +303,7 @@ struct srpc_service_cd { #define SFW_FRWK_WI_MIN 16 #define SFW_FRWK_WI_MAX 256 -typedef struct srpc_service { +struct srpc_service { int sv_id; /* service id */ const char *sv_name; /* human readable name */ int sv_wi_total; /* total server workitems */ @@ -314,11 +315,11 @@ typedef struct srpc_service { * - sv_handler: process incoming RPC request * - sv_bulk_ready: notify bulk data */ - int (*sv_handler) (srpc_server_rpc_t *); - int (*sv_bulk_ready) (srpc_server_rpc_t *, int); -} srpc_service_t; + int (*sv_handler)(struct srpc_server_rpc *); + int (*sv_bulk_ready)(struct srpc_server_rpc *, int); +}; -typedef struct { +struct sfw_session { /* chain on fw_zombie_sessions */ struct list_head sn_list; struct lst_sid sn_id; /* unique identifier */ @@ -326,42 +327,42 @@ typedef struct { unsigned int sn_timeout; int sn_timer_active; unsigned int sn_features; - stt_timer_t sn_timer; + struct stt_timer sn_timer; struct list_head sn_batches; /* list of batches */ char sn_name[LST_NAME_SIZE]; atomic_t sn_refcount; atomic_t sn_brw_errors; atomic_t sn_ping_errors; - cfs_time_t sn_started; -} sfw_session_t; + ktime_t sn_started; +}; #define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ (sid0).ses_stamp == (sid1).ses_stamp) -typedef struct { +struct sfw_batch { struct list_head bat_list; /* chain on sn_batches */ struct lst_bid bat_id; /* batch id */ int bat_error; /* error code of batch */ - sfw_session_t *bat_session; /* batch's session */ + struct sfw_session *bat_session; /* batch's session */ atomic_t bat_nactive; /* # of active tests */ struct list_head bat_tests; /* test instances */ -} sfw_batch_t; +}; -typedef struct { - int (*tso_init)(struct sfw_test_instance *tsi); /* 
intialize test client */ - void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ - int (*tso_prep_rpc)(struct sfw_test_unit *tsu, +struct sfw_test_client_ops { + int (*tso_init)(struct sfw_test_instance *tsi); /* intailize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, struct lnet_process_id dest, - srpc_client_rpc_t **rpc); /* prep a tests rpc */ + struct srpc_client_rpc **rpc); /* prep a tests rpc */ void (*tso_done_rpc)(struct sfw_test_unit *tsu, - srpc_client_rpc_t *rpc); /* done a test rpc */ -} sfw_test_client_ops_t; + struct srpc_client_rpc *rpc); /* done a test rpc */ +}; -typedef struct sfw_test_instance { +struct sfw_test_instance { struct list_head tsi_list; /* chain on batch */ int tsi_service; /* test type */ - sfw_batch_t *tsi_batch; /* batch */ - sfw_test_client_ops_t *tsi_ops; /* test client operations */ + struct sfw_batch *tsi_batch; /* batch */ + struct sfw_test_client_ops *tsi_ops; /* test client operations */ /* public parameter for all test units */ unsigned int tsi_is_client:1; /* is test client */ @@ -378,11 +379,11 @@ typedef struct sfw_test_instance { struct list_head tsi_active_rpcs;/* active rpcs */ union { - test_ping_req_t ping; /* ping parameter */ - test_bulk_req_t bulk_v0; /* bulk parameter */ - test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ + struct test_ping_req ping; /* ping parameter */ + struct test_bulk_req bulk_v0; /* bulk parameter */ + struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ } tsi_u; -} sfw_test_instance_t; +}; /* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at * the end of pages are not used */ @@ -391,57 +392,59 @@ typedef struct sfw_test_instance { #define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) #define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) -typedef struct sfw_test_unit { +struct sfw_test_unit { struct list_head tsu_list; /* chain on lst_test_instance */ struct lnet_process_id tsu_dest; /* id of dest node */ int tsu_loop; /* loop count of the test */ - sfw_test_instance_t *tsu_instance; /* pointer to test instance */ + struct sfw_test_instance *tsu_instance; /* pointer to test instance */ void *tsu_private; /* private data */ - swi_workitem_t tsu_worker; /* workitem of the test unit */ -} sfw_test_unit_t; + struct swi_workitem tsu_worker; /* workitem of the test unit */ +}; -typedef struct sfw_test_case { - struct list_head tsc_list; /* chain on fw_tests */ - srpc_service_t *tsc_srv_service; /* test service */ - sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ -} sfw_test_case_t; +struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + struct srpc_service *tsc_srv_service; /* test service */ + struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ +}; -srpc_client_rpc_t * +struct srpc_client_rpc * sfw_create_rpc(struct lnet_process_id peer, int service, unsigned features, int nbulkiov, int bulklen, - void (*done) (srpc_client_rpc_t *), void *priv); -int sfw_create_test_rpc(sfw_test_unit_t *tsu, + void (*done)(struct srpc_client_rpc *), void *priv); +int sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, unsigned int features, - int nblk, int blklen, srpc_client_rpc_t **rpc); -void sfw_abort_rpc(srpc_client_rpc_t *rpc); -void sfw_post_rpc(srpc_client_rpc_t *rpc); -void sfw_client_rpc_done(srpc_client_rpc_t *rpc); -void sfw_unpack_message(srpc_msg_t *msg); -void 
sfw_free_pages(srpc_server_rpc_t *rpc); -void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); -int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, + int nblk, int blklen, struct srpc_client_rpc **rpc); +void sfw_abort_rpc(struct srpc_client_rpc *rpc); +void sfw_post_rpc(struct srpc_client_rpc *rpc); +void sfw_client_rpc_done(struct srpc_client_rpc *rpc); +void sfw_unpack_message(struct srpc_msg *msg); +void sfw_free_pages(struct srpc_server_rpc *rpc); +void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); +int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, int sink); -int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); +int sfw_make_session(struct srpc_mksn_reqst *request, + struct srpc_mksn_reply *reply); -srpc_client_rpc_t * +struct srpc_client_rpc * srpc_create_client_rpc(struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(srpc_client_rpc_t *), - void (*rpc_fini)(srpc_client_rpc_t *), void *priv); -void srpc_post_rpc(srpc_client_rpc_t *rpc); -void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); -void srpc_free_bulk(srpc_bulk_t *bk); -srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned off, unsigned bulk_npg, - unsigned bulk_len, int sink); -int srpc_send_rpc(swi_workitem_t *wi); -int srpc_send_reply(srpc_server_rpc_t *rpc); -int srpc_add_service(srpc_service_t *sv); -int srpc_remove_service(srpc_service_t *sv); -void srpc_shutdown_service(srpc_service_t *sv); -void srpc_abort_service(srpc_service_t *sv); -int srpc_finish_service(srpc_service_t *sv); -int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); -void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv); +void srpc_post_rpc(struct srpc_client_rpc *rpc); +void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); +void srpc_free_bulk(struct srpc_bulk *bk); +struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, + unsigned int bulk_npg, unsigned int bulk_len, + int sink); +int srpc_send_rpc(struct swi_workitem *wi); +int srpc_send_reply(struct srpc_server_rpc *rpc); +int srpc_add_service(struct srpc_service *sv); +int srpc_remove_service(struct srpc_service *sv); +void srpc_shutdown_service(struct srpc_service *sv); +void srpc_abort_service(struct srpc_service *sv); +int srpc_finish_service(struct srpc_service *sv); +int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); +void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); void srpc_get_counters(struct srpc_counters *cnt); void srpc_set_counters(const struct srpc_counters *cnt); @@ -457,13 +460,14 @@ srpc_serv_is_framework(struct srpc_service *svc) static inline int swi_wi_action(struct cfs_workitem *wi) { - swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); + struct swi_workitem *swi; - return swi->swi_action(swi); + swi = container_of(wi, struct swi_workitem, swi_workitem); + return swi->swi_action(swi); } static inline void -swi_init_workitem(swi_workitem_t *swi, void *data, +swi_init_workitem(struct swi_workitem *swi, void *data, swi_action_t action, struct cfs_wi_sched *sched) { swi->swi_sched = sched; @@ -473,19 +477,19 @@ swi_init_workitem(swi_workitem_t *swi, void *data, } static inline void -swi_schedule_workitem(swi_workitem_t *wi) +swi_schedule_workitem(struct swi_workitem *wi) { cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); } static inline void 
-swi_exit_workitem(swi_workitem_t *swi) +swi_exit_workitem(struct swi_workitem *swi) { cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); } static inline int -swi_deschedule_workitem(swi_workitem_t *swi) +swi_deschedule_workitem(struct swi_workitem *swi) { return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); } @@ -496,7 +500,7 @@ void sfw_shutdown(void); void srpc_shutdown(void); static inline void -srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) +srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) { LASSERT (rpc != NULL); LASSERT (!srpc_event_pending(rpc)); @@ -512,14 +516,14 @@ srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) } static inline void -srpc_init_client_rpc(srpc_client_rpc_t *rpc, struct lnet_process_id peer, +srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(srpc_client_rpc_t *), - void (*rpc_fini)(srpc_client_rpc_t *), void *priv) + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) { LASSERT(nbulkiov <= LNET_MAX_IOV); - memset(rpc, 0, offsetof(srpc_client_rpc_t, + memset(rpc, 0, offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[nbulkiov])); INIT_LIST_HEAD(&rpc->crpc_list); @@ -585,7 +589,7 @@ do { \ } while (0) static inline void -srpc_wait_service_shutdown(srpc_service_t *sv) +srpc_wait_service_shutdown(struct srpc_service *sv) { int i = 2; @@ -601,13 +605,13 @@ srpc_wait_service_shutdown(srpc_service_t *sv) } } -extern sfw_test_client_ops_t ping_test_client; -extern srpc_service_t ping_test_service; +extern struct sfw_test_client_ops ping_test_client; +extern struct srpc_service ping_test_service; void ping_init_test_client(void); void ping_init_test_service(void); -extern sfw_test_client_ops_t brw_test_client; -extern srpc_service_t brw_test_service; +extern struct sfw_test_client_ops brw_test_client; +extern struct srpc_service brw_test_service; void brw_init_test_client(void); void brw_init_test_service(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c index 7e09e6672b3ef..3ceec81bf1b08 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/timer.c +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -56,7 +56,7 @@ static struct st_timer_data { spinlock_t stt_lock; /* start time of the slot processed previously */ - cfs_time_t stt_prev_slot; + time64_t stt_prev_slot; struct list_head stt_hash[STTIMER_NSLOTS]; int stt_shuttingdown; wait_queue_head_t stt_waitq; @@ -64,7 +64,7 @@ static struct st_timer_data { } stt_data; void -stt_add_timer(stt_timer_t *timer) +stt_add_timer(struct stt_timer *timer) { struct list_head *pos; @@ -74,11 +74,12 @@ stt_add_timer(stt_timer_t *timer) LASSERT(!stt_data.stt_shuttingdown); LASSERT(timer->stt_func != NULL); LASSERT(list_empty(&timer->stt_list)); - LASSERT(cfs_time_after(timer->stt_expires, cfs_time_current_sec())); + LASSERT(timer->stt_expires > ktime_get_real_seconds()); /* a simple insertion sort */ list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { - stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); + struct stt_timer *old = list_entry(pos, struct stt_timer, + stt_list); if (timer->stt_expires >= old->stt_expires) break; @@ -98,7 +99,7 @@ stt_add_timer(stt_timer_t *timer) * another CPU. 
*/ int -stt_del_timer(stt_timer_t *timer) +stt_del_timer(struct stt_timer *timer) { int ret = 0; @@ -118,13 +119,13 @@ stt_del_timer(stt_timer_t *timer) /* called with stt_data.stt_lock held */ static int -stt_expire_list(struct list_head *slot, cfs_time_t now) +stt_expire_list(struct list_head *slot, time64_t now) { int expired = 0; - stt_timer_t *timer; + struct stt_timer *timer; while (!list_empty(slot)) { - timer = list_entry(slot->next, stt_timer_t, stt_list); + timer = list_entry(slot->next, struct stt_timer, stt_list); if (timer->stt_expires > now) break; @@ -142,20 +143,20 @@ stt_expire_list(struct list_head *slot, cfs_time_t now) } static int -stt_check_timers(cfs_time_t *last) +stt_check_timers(time64_t *last) { int expired = 0; - cfs_time_t now; - cfs_time_t this_slot; + time64_t now; + time64_t this_slot; - now = cfs_time_current_sec(); - this_slot = now & STTIMER_SLOTTIMEMASK; + now = ktime_get_real_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; spin_lock(&stt_data.stt_lock); - while (cfs_time_aftereq(this_slot, *last)) { + while (this_slot >= *last) { expired += stt_expire_list(STTIMER_SLOT(this_slot), now); - this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); + this_slot = this_slot - STTIMER_SLOTTIME; } *last = now & STTIMER_SLOTTIMEMASK; @@ -210,7 +211,7 @@ stt_startup (void) int i; stt_data.stt_shuttingdown = 0; - stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK; + stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; spin_lock_init(&stt_data.stt_lock); for (i = 0; i < STTIMER_NSLOTS; i++) diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h index 71c3de2736b15..e769c4cc9ebd7 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/timer.h +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -34,15 +34,15 @@ #ifndef __SELFTEST_TIMER_H__ #define __SELFTEST_TIMER_H__ -typedef struct { +struct stt_timer { struct list_head stt_list; - cfs_time_t stt_expires; + time64_t stt_expires; void (*stt_func)(void *); void *stt_data; -} stt_timer_t; +}; -void stt_add_timer(stt_timer_t *timer); -int stt_del_timer(stt_timer_t *timer); +void stt_add_timer(struct stt_timer *timer); +int stt_del_timer(struct stt_timer *timer); int stt_startup(void); void stt_shutdown(void); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c index ef61772f0dcb2..8676ec223548d 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -48,8 +48,6 @@ #include #include "fid_internal.h" -static void seq_server_proc_fini(struct lu_server_seq *seq); - /* Assigns client to sequence controller node. 
*/ int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq, struct lu_client_seq *cli) @@ -458,35 +456,43 @@ LU_KEY_INIT_FINI(seq, struct seq_thread_info); /* context key: seq_thread_key */ LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); -extern const struct file_operations seq_fld_proc_seq_fops; +extern const struct file_operations seq_fld_debugfs_seq_fops; + +static void seq_server_debugfs_fini(struct lu_server_seq *seq) +{ + if (!IS_ERR_OR_NULL(seq->lss_debugfs_entry)) + ldebugfs_remove(&seq->lss_debugfs_entry); +} -static int seq_server_proc_init(struct lu_server_seq *seq) +static int seq_server_debugfs_init(struct lu_server_seq *seq) { -#ifdef CONFIG_PROC_FS int rc; ENTRY; - seq->lss_proc_dir = lprocfs_register(seq->lss_name, - seq_type_proc_dir, - NULL, NULL); - if (IS_ERR(seq->lss_proc_dir)) { - rc = PTR_ERR(seq->lss_proc_dir); + seq->lss_debugfs_entry = ldebugfs_register(seq->lss_name, + seq_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(seq->lss_debugfs_entry)) { + rc = seq->lss_debugfs_entry ? PTR_ERR(seq->lss_debugfs_entry) + : -ENOMEM; + seq->lss_debugfs_entry = NULL; RETURN(rc); } - rc = lprocfs_add_vars(seq->lss_proc_dir, seq_server_proc_list, seq); + rc = ldebugfs_add_vars(seq->lss_debugfs_entry, + seq_server_debugfs_list, seq); if (rc) { - CERROR("%s: Can't init sequence manager " - "proc, rc %d\n", seq->lss_name, rc); + CERROR("%s: Can't init sequence manager debugfs, rc %d\n", + seq->lss_name, rc); GOTO(out_cleanup, rc); } if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) { - rc = lprocfs_seq_create(seq->lss_proc_dir, "fldb", 0644, - &seq_fld_proc_seq_fops, seq); + rc = ldebugfs_seq_create(seq->lss_debugfs_entry, "fldb", 0644, + &seq_fld_debugfs_seq_fops, seq); if (rc) { - CERROR("%s: Can't create fldb for sequence manager " - "proc: rc = %d\n", seq->lss_name, rc); + CERROR("%s: Can't create fldb for sequence manager debugfs: rc = %d\n", + seq->lss_name, rc); GOTO(out_cleanup, rc); } } @@ -494,24 +500,8 @@ static int seq_server_proc_init(struct lu_server_seq *seq) RETURN(0); out_cleanup: - seq_server_proc_fini(seq); + seq_server_debugfs_fini(seq); return rc; -#else /* !CONFIG_PROC_FS */ - return 0; -#endif /* CONFIG_PROC_FS */ -} - -static void seq_server_proc_fini(struct lu_server_seq *seq) -{ -#ifdef CONFIG_PROC_FS - ENTRY; - if (seq->lss_proc_dir != NULL) { - if (!IS_ERR(seq->lss_proc_dir)) - lprocfs_remove(&seq->lss_proc_dir); - seq->lss_proc_dir = NULL; - } - EXIT; -#endif /* CONFIG_PROC_FS */ } int seq_server_init(const struct lu_env *env, @@ -592,7 +582,7 @@ int seq_server_init(const struct lu_env *env, lu_seq_range_is_sane(&seq->lss_space)); } - rc = seq_server_proc_init(seq); + rc = seq_server_debugfs_init(seq); if (rc) GOTO(out, rc); @@ -609,7 +599,7 @@ void seq_server_fini(struct lu_server_seq *seq, { ENTRY; - seq_server_proc_fini(seq); + seq_server_debugfs_fini(seq); seq_store_fini(seq, env); EXIT; diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h index 9ad1420e1812e..1c6587d43b52b 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -36,7 +36,6 @@ #ifndef __FID_INTERNAL_H #define __FID_INTERNAL_H -#include #include #ifdef HAVE_SERVER_SUPPORT @@ -56,9 +55,7 @@ enum { extern struct lu_context_key seq_thread_key; -# ifdef CONFIG_PROC_FS -extern struct lprocfs_vars seq_server_proc_list[]; -# endif +extern struct ldebugfs_vars seq_server_debugfs_list[]; /* Store API functions. 
*/ struct dt_device; @@ -90,10 +87,8 @@ void fid_server_mod_exit(void); int seq_client_alloc_super(struct lu_client_seq *seq, const struct lu_env *env); -# ifdef CONFIG_PROC_FS -extern struct lprocfs_vars seq_client_proc_list[]; -# endif +extern struct dentry *seq_debugfs_dir; -extern struct proc_dir_entry *seq_type_proc_dir; +extern struct ldebugfs_vars seq_client_debugfs_list[]; #endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c index 7c5477c044351..ab3a59820abc7 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c @@ -41,7 +41,6 @@ #include #include -#include #include /** diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c index ab1cca59bc916..93f6402a12232 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_request.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,8 +38,8 @@ #define DEBUG_SUBSYSTEM S_FID +#include #include -#include #include #include #include @@ -48,6 +48,8 @@ #include #include "fid_internal.h" +struct dentry *seq_debugfs_dir; + static int seq_client_rpc(struct lu_client_seq *seq, struct lu_seq_range *output, __u32 opc, const char *opcname) @@ -438,51 +440,57 @@ void seq_client_flush(struct lu_client_seq *seq) } EXPORT_SYMBOL(seq_client_flush); -static void seq_client_proc_fini(struct lu_client_seq *seq) +static void seq_client_debugfs_fini(struct lu_client_seq *seq) { -#ifdef CONFIG_PROC_FS - ENTRY; - if (seq->lcs_proc_dir) { - if (!IS_ERR(seq->lcs_proc_dir)) - lprocfs_remove(&seq->lcs_proc_dir); - seq->lcs_proc_dir = NULL; - } - EXIT; -#endif /* CONFIG_PROC_FS */ + if (!IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) + ldebugfs_remove(&seq->lcs_debugfs_entry); } -static int seq_client_proc_init(struct lu_client_seq *seq) +static int seq_client_debugfs_init(struct lu_client_seq *seq) { -#ifdef CONFIG_PROC_FS int rc; - ENTRY; - seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, seq_type_proc_dir, - NULL, NULL); - if (IS_ERR(seq->lcs_proc_dir)) { - CERROR("%s: LProcFS failed in seq-init\n", - seq->lcs_name); - rc = PTR_ERR(seq->lcs_proc_dir); - RETURN(rc); + seq->lcs_debugfs_entry = ldebugfs_register(seq->lcs_name, + seq_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) { + CERROR("%s: LdebugFS failed in seq-init\n", seq->lcs_name); + rc = seq->lcs_debugfs_entry ? 
PTR_ERR(seq->lcs_debugfs_entry) + : -ENOMEM; + seq->lcs_debugfs_entry = NULL; + RETURN(rc); } - rc = lprocfs_add_vars(seq->lcs_proc_dir, seq_client_proc_list, seq); - if (rc) { - CERROR("%s: Can't init sequence manager " - "proc, rc %d\n", seq->lcs_name, rc); + rc = ldebugfs_add_vars(seq->lcs_debugfs_entry, + seq_client_debugfs_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager debugfs, rc %d\n", + seq->lcs_name, rc); GOTO(out_cleanup, rc); } RETURN(0); out_cleanup: - seq_client_proc_fini(seq); + seq_client_debugfs_fini(seq); return rc; +} -#else /* !CONFIG_PROC_FS */ - return 0; -#endif /* CONFIG_PROC_FS */ +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_debugfs_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; } +EXPORT_SYMBOL(seq_client_fini); int seq_client_init(struct lu_client_seq *seq, struct obd_export *exp, @@ -515,29 +523,13 @@ int seq_client_init(struct lu_client_seq *seq, snprintf(seq->lcs_name, sizeof(seq->lcs_name), "cli-%s", prefix); - rc = seq_client_proc_init(seq); + rc = seq_client_debugfs_init(seq); if (rc) seq_client_fini(seq); RETURN(rc); } EXPORT_SYMBOL(seq_client_init); -void seq_client_fini(struct lu_client_seq *seq) -{ - ENTRY; - - seq_client_proc_fini(seq); - - if (seq->lcs_exp != NULL) { - class_export_put(seq->lcs_exp); - seq->lcs_exp = NULL; - } - - seq->lcs_srv = NULL; - EXIT; -} -EXPORT_SYMBOL(seq_client_fini); - int client_fid_init(struct obd_device *obd, struct obd_export *exp, enum lu_cli_type type) { @@ -591,21 +583,18 @@ int client_fid_fini(struct obd_device *obd) } EXPORT_SYMBOL(client_fid_fini); -struct proc_dir_entry *seq_type_proc_dir; - static int __init fid_init(void) { - seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, - proc_lustre_root, - NULL, NULL); - if (IS_ERR(seq_type_proc_dir)) - return PTR_ERR(seq_type_proc_dir); +#ifdef HAVE_SERVER_SUPPORT + int rc = fid_server_mod_init(); -# ifdef HAVE_SERVER_SUPPORT - fid_server_mod_init(); -# endif - - return 0; + if (rc) + return rc; +#endif + seq_debugfs_dir = ldebugfs_register(LUSTRE_SEQ_NAME, + debugfs_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(seq_debugfs_dir); } static void __exit fid_exit(void) @@ -613,11 +602,8 @@ static void __exit fid_exit(void) # ifdef HAVE_SERVER_SUPPORT fid_server_mod_exit(); # endif - - if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { - lprocfs_remove(&seq_type_proc_dir); - seq_type_proc_dir = NULL; - } + if (!IS_ERR_OR_NULL(seq_debugfs_dir)) + ldebugfs_remove(&seq_debugfs_dir); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c index 225ddfad6f634..1565d80811d29 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_store.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c index d95888f15cfcb..5ac2b883d0861 100644 --- a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
* Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,8 +47,6 @@ #include #include "fid_internal.h" -#ifdef CONFIG_PROC_FS - /* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ #define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) /** @@ -60,34 +58,37 @@ * safe for production use. */ static int -lprocfs_fid_write_common(struct file *file, const char __user *buffer, - size_t count, struct lu_seq_range *range) +ldebugfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) { + char kernbuf[MAX_FID_RANGE_STRLEN]; struct lu_seq_range tmp = { .lsr_start = 0, }; - char kernbuf[MAX_FID_RANGE_STRLEN]; - ENTRY; + int rc; - LASSERT(range != NULL); + ENTRY; + LASSERT(range); if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; if (count == 5 && strcmp(kernbuf, "clear") == 0) { memset(range, 0, sizeof(*range)); - RETURN(0); + RETURN(count); } /* of the form "[0x0000000240000400 - 0x000000028000400]" */ - sscanf(kernbuf, "[%llx - %llx]\n", - (long long unsigned *)&tmp.lsr_start, - (long long unsigned *)&tmp.lsr_end); + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (rc != 2) + RETURN(-EINVAL); if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) RETURN(-EINVAL); @@ -97,23 +98,24 @@ lprocfs_fid_write_common(struct file *file, const char __user *buffer, #ifdef HAVE_SERVER_SUPPORT /* - * Server side procfs stuff. + * Server side debugfs stuff. 
*/ static ssize_t -lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ldebugfs_server_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; + struct lu_server_seq *seq; int rc; - ENTRY; - LASSERT(seq != NULL); + ENTRY; + seq = ((struct seq_file *)file->private_data)->private; mutex_lock(&seq->lss_mutex); - rc = lprocfs_fid_write_common(file, buffer, count, &seq->lss_space); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lss_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", - seq->lss_name, PRANGE(&seq->lss_space)); + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lss_name, PRANGE(&seq->lss_space)); } mutex_unlock(&seq->lss_mutex); @@ -121,13 +123,11 @@ lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, } static int -lprocfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; ENTRY; - LASSERT(seq != NULL); - mutex_lock(&seq->lss_mutex); seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); mutex_unlock(&seq->lss_mutex); @@ -136,14 +136,12 @@ lprocfs_server_fid_space_seq_show(struct seq_file *m, void *unused) } static int -lprocfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; struct client_obd *cli; ENTRY; - LASSERT(seq != NULL); - if (seq->lss_cli) { if (seq->lss_cli->lcs_exp != NULL) { cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; @@ -158,34 +156,24 @@ lprocfs_server_fid_server_seq_show(struct seq_file *m, void *unused) RETURN(0); } -static ssize_t -lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; int rc; - __s64 val; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lss_mutex); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoull_from_user(buffer, count, 0, &seq->lss_width); if (rc) { CERROR("%s: invalid FID sequence width: rc = %d\n", seq->lss_name, rc); GOTO(out_unlock, count = rc); } - if (val < 0) { - CERROR("%s: invalid FID sequence width: rc = %d\n", - seq->lss_name, -ERANGE); - GOTO(out_unlock, count = -ERANGE); - } - - seq->lss_width = val; - CDEBUG(D_INFO, "%s: Width: %llu\n", seq->lss_name, seq->lss_width); out_unlock: @@ -195,13 +183,11 @@ lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, } static int -lprocfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lss_mutex); seq_printf(m, "%llu\n", seq->lss_width); mutex_unlock(&seq->lss_mutex); @@ -209,17 +195,17 @@ lprocfs_server_fid_width_seq_show(struct seq_file *m, void *unused) RETURN(0); } -LPROC_SEQ_FOPS(lprocfs_server_fid_space); -LPROC_SEQ_FOPS(lprocfs_server_fid_width); 
-LPROC_SEQ_FOPS_RO(lprocfs_server_fid_server); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_server_fid_server); -struct lprocfs_vars seq_server_proc_list[] = { +struct ldebugfs_vars seq_server_debugfs_list[] = { { .name = "space", - .fops = &lprocfs_server_fid_space_fops }, + .fops = &ldebugfs_server_fid_space_fops }, { .name = "width", - .fops = &lprocfs_server_fid_width_fops }, + .fops = &ldebugfs_server_fid_width_fops }, { .name = "server", - .fops = &lprocfs_server_fid_server_fops }, + .fops = &ldebugfs_server_fid_server_fops}, { NULL } }; @@ -350,7 +336,7 @@ struct seq_operations fldb_sops = { static int fldb_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; - struct lu_server_seq *ss = (struct lu_server_seq *) PDE_DATA(inode); + struct lu_server_seq *ss = inode->i_private; struct lu_server_fld *fld; struct dt_object *obj; const struct dt_it_ops *iops; @@ -361,10 +347,6 @@ static int fldb_seq_open(struct inode *inode, struct file *file) fld = ss->lss_site->ss_server_fld; LASSERT(fld != NULL); - rc = LPROCFS_ENTRY_CHECK(inode); - if (rc < 0) - return rc; - rc = seq_open(file, &fldb_sops); if (rc) return rc; @@ -416,7 +398,7 @@ static int fldb_seq_release(struct inode *inode, struct file *file) param = seq->private; if (param == NULL) { - lprocfs_seq_release(inode, file); + seq_release(inode, file); return 0; } @@ -430,7 +412,7 @@ static int fldb_seq_release(struct inode *inode, struct file *file) iops->fini(¶m->fsp_env, param->fsp_it); lu_env_fini(¶m->fsp_env); OBD_FREE_PTR(param); - lprocfs_seq_release(inode, file); + seq_release(inode, file); return 0; } @@ -496,7 +478,7 @@ static ssize_t fldb_seq_write(struct file *file, const char __user *buf, RETURN(rc < 0 ? 
rc : len); } -const struct file_operations seq_fld_proc_seq_fops = { +const struct file_operations seq_fld_debugfs_seq_fops = { .owner = THIS_MODULE, .open = fldb_seq_open, .read = seq_read, @@ -506,21 +488,22 @@ const struct file_operations seq_fld_proc_seq_fops = { #endif /* HAVE_SERVER_SUPPORT */ -/* Client side procfs stuff */ +/* Client side debugfs stuff */ static ssize_t -lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ldebugfs_client_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; + struct lu_client_seq *seq; int rc; - ENTRY; - LASSERT(seq != NULL); + ENTRY; + seq = ((struct seq_file *)file->private_data)->private; mutex_lock(&seq->lcs_mutex); - rc = lprocfs_fid_write_common(file, buffer, count, &seq->lcs_space); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", seq->lcs_name, PRANGE(&seq->lcs_space)); } @@ -529,68 +512,58 @@ lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, RETURN(count); } -static int -lprocfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +static int ldebugfs_client_fid_space_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lcs_mutex); - seq_printf(m, "[%#llx - %#llx]:%x:%s\n", - PRANGE(&seq->lcs_space)); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); mutex_unlock(&seq->lcs_mutex); RETURN(0); } -static ssize_t -lprocfs_client_fid_width_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; - __u64 max; + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + u64 val; + u64 max; int rc; - __s64 val; - ENTRY; - LASSERT(seq != NULL); + ENTRY; + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc) + return rc; mutex_lock(&seq->lcs_mutex); - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc) { - GOTO(out_unlock, count = rc); - } - if (seq->lcs_type == LUSTRE_SEQ_DATA) max = LUSTRE_DATA_SEQ_MAX_WIDTH; else max = LUSTRE_METADATA_SEQ_MAX_WIDTH; - if (val <= max && val > 0) { + if (val <= max) { seq->lcs_width = val; - CDEBUG(D_INFO, "%s: Sequence size: %llu\n", - seq->lcs_name, seq->lcs_width); + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, + seq->lcs_width); } else { - GOTO(out_unlock, count = -ERANGE); + count = -ERANGE; } -out_unlock: mutex_unlock(&seq->lcs_mutex); RETURN(count); } static int -lprocfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lcs_mutex); seq_printf(m, "%llu\n", seq->lcs_width); mutex_unlock(&seq->lcs_mutex); @@ -599,13 +572,11 @@ lprocfs_client_fid_width_seq_show(struct seq_file *m, void *unused) } static int -lprocfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) { struct 
lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lcs_mutex); seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); mutex_unlock(&seq->lcs_mutex); @@ -614,38 +585,37 @@ lprocfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) } static int -lprocfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +ldebugfs_client_fid_server_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; struct client_obd *cli; ENTRY; - LASSERT(seq != NULL); - - if (seq->lcs_exp != NULL) { + if (seq->lcs_exp) { cli = &seq->lcs_exp->exp_obd->u.cli; seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); +#ifdef HAVE_SERVER_SUPPORT } else { seq_printf(m, "%s\n", seq->lcs_srv->lss_name); +#endif /* HAVE_SERVER_SUPPORT */ } + RETURN(0); } -LPROC_SEQ_FOPS(lprocfs_client_fid_space); -LPROC_SEQ_FOPS(lprocfs_client_fid_width); -LPROC_SEQ_FOPS_RO(lprocfs_client_fid_server); -LPROC_SEQ_FOPS_RO(lprocfs_client_fid_fid); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_server); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_fid); -struct lprocfs_vars seq_client_proc_list[] = { +struct ldebugfs_vars seq_client_debugfs_list[] = { { .name = "space", - .fops = &lprocfs_client_fid_space_fops }, + .fops = &ldebugfs_client_fid_space_fops }, { .name = "width", - .fops = &lprocfs_client_fid_width_fops }, + .fops = &ldebugfs_client_fid_width_fops }, { .name = "server", - .fops = &lprocfs_client_fid_server_fops }, + .fops = &ldebugfs_client_fid_server_fops}, { .name = "fid", - .fops = &lprocfs_client_fid_fid_fops }, + .fops = &ldebugfs_client_fid_fid_fops }, { NULL } }; - -#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c index 9b46feed04e72..f638e0dcd1ea4 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -88,27 +88,14 @@ struct fld_cache *fld_cache_init(const char *name, */ void fld_cache_fini(struct fld_cache *cache) { - __u64 pct; - ENTRY; - - LASSERT(cache != NULL); - fld_cache_flush(cache); - - if (cache->fci_stat.fst_count > 0) { - pct = cache->fci_stat.fst_cache * 100; - do_div(pct, cache->fci_stat.fst_count); - } else { - pct = 0; - } + LASSERT(cache != NULL); + fld_cache_flush(cache); - CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); - CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); - CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); - - OBD_FREE_PTR(cache); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); - EXIT; + OBD_FREE_PTR(cache); } /** diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c index 375070464cd85..42f00da7f1363 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -103,15 +103,16 @@ EXPORT_SYMBOL(fld_server_create); /** * Extract index information from fld name like srv-fsname-MDT0000 **/ -int fld_name_to_index(const char *name, __u32 *index) +int fld_name_to_index(const char *name, u32 *index) { char *dash; int rc; + ENTRY; CDEBUG(D_INFO, "get index from %s\n", name); dash = strrchr(name, '-'); - if (dash == NULL) + if (!dash) 
RETURN(-EINVAL); dash++; rc = target_name2index(dash, index, NULL); @@ -124,17 +125,20 @@ int fld_name_to_index(const char *name, __u32 *index) int fld_update_from_controller(const struct lu_env *env, struct lu_server_fld *fld) { - struct fld_thread_info *info; - struct lu_seq_range *range; + struct fld_thread_info *info; + struct lu_seq_range *range; struct lu_seq_range_array *lsra; - __u32 index; - struct ptlrpc_request *req; - int rc; - int i; + u32 index; + struct ptlrpc_request *req; + int rc; + int i; + ENTRY; - /* Update only happens during initalization, i.e. local FLDB - * does not exist yet */ + /* + * Update only happens during initalization, i.e. local FLDB + * does not exist yet + */ if (!fld->lsf_new) RETURN(0); @@ -162,7 +166,7 @@ int fld_update_from_controller(const struct lu_env *env, LASSERT(req != NULL); lsra = (struct lu_seq_range_array *)req_capsule_server_get( &req->rq_pill, &RMF_GENERIC_DATA); - if (lsra == NULL) + if (!lsra) GOTO(out, rc = -EPROTO); range_array_le_to_cpu(lsra, lsra); @@ -188,7 +192,7 @@ int fld_update_from_controller(const struct lu_env *env, fld->lsf_new = 1; out: - if (req != NULL) + if (req) ptlrpc_req_finished(req); RETURN(rc); @@ -204,6 +208,7 @@ int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *erange; struct fld_thread_info *info; int rc; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -215,9 +220,9 @@ int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, if (rc == 0) { if (unlikely(fld_range_type(erange) != fld_range_type(range) && !fld_range_is_any(range))) { - CERROR("%s: FLD cache range "DRANGE" does not match" - "requested flag %x: rc = %d\n", fld->lsf_name, - PRANGE(erange), range->lsr_flags, -EIO); + CERROR("%s: FLD cache range "DRANGE" does not match requested flag %x: rc = %d\n", + fld->lsf_name, PRANGE(erange), range->lsr_flags, + -EIO); RETURN(-EIO); } *range = *erange; @@ -237,8 +242,9 @@ EXPORT_SYMBOL(fld_local_lookup); int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range) { - __u32 index; + u32 index; int rc; + ENTRY; rc = fld_local_lookup(env, fld, seq, range); @@ -250,18 +256,21 @@ int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, RETURN(rc); if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { - /* On server side, all entries should be in cache. - * If we can not find it in cache, just return error */ + /* + * On server side, all entries should be in cache. + * If we can not find it in cache, just return error + */ CERROR("%s: Cannot find sequence %#llx: rc = %d\n", fld->lsf_name, seq, -ENOENT); RETURN(-ENOENT); } else { - if (fld->lsf_control_exp == NULL) { - CERROR("%s: lookup %#llx, but not connects to MDT0" - "yet: rc = %d.\n", fld->lsf_name, seq, -EIO); + if (!fld->lsf_control_exp) { + CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", + fld->lsf_name, seq, -EIO); RETURN(-EIO); } - /* send request to mdt0 i.e. super seq. controller. + /* + * send request to mdt0 i.e. super seq. controller. * This is temporary solution, long term solution is fld * replication on all mdt servers. 
*/ @@ -281,17 +290,17 @@ EXPORT_SYMBOL(fld_server_lookup); */ static int fld_handle_lookup(struct tgt_session_info *tsi) { - struct obd_export *exp = tsi->tsi_exp; - struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; - struct lu_server_fld *fld; - struct lu_seq_range *in; - struct lu_seq_range *out; - int rc; + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; ENTRY; in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (in == NULL) + if (!in) RETURN(err_serious(-EPROTO)); rc = req_capsule_server_pack(tsi->tsi_pill); @@ -299,7 +308,7 @@ static int fld_handle_lookup(struct tgt_session_info *tsi) RETURN(err_serious(rc)); out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (out == NULL) + if (!out) RETURN(err_serious(-EPROTO)); *out = *in; @@ -315,18 +324,18 @@ static int fld_handle_lookup(struct tgt_session_info *tsi) static int fld_handle_read(struct tgt_session_info *tsi) { - struct obd_export *exp = tsi->tsi_exp; - struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; - struct lu_seq_range *in; - void *data; - int rc; + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; ENTRY; req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (in == NULL) + if (!in) RETURN(err_serious(-EPROTO)); req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, @@ -365,12 +374,13 @@ static int fld_handle_query(struct tgt_session_info *tsi) * fid_is_local() is supposed to be used in assertion checks only. */ int fid_is_local(const struct lu_env *env, - struct lu_site *site, const struct lu_fid *fid) + struct lu_site *site, const struct lu_fid *fid) { int result; struct seq_server_site *ss_site; struct lu_seq_range *range; struct fld_thread_info *info; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -378,7 +388,7 @@ int fid_is_local(const struct lu_env *env, result = 1; /* conservatively assume fid is local */ ss_site = lu_site2seq(site); - if (ss_site->ss_client_fld != NULL) { + if (ss_site->ss_client_fld) { int rc; rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, @@ -388,54 +398,37 @@ int fid_is_local(const struct lu_env *env, } return result; } +EXPORT_SYMBOL(fid_is_local); -static void fld_server_proc_fini(struct lu_server_fld *fld); +static void fld_server_debugfs_fini(struct lu_server_fld *fld) +{ + if (!IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) + ldebugfs_remove(&fld->lsf_debugfs_entry); +} -#ifdef CONFIG_PROC_FS -static int fld_server_proc_init(struct lu_server_fld *fld) +static int fld_server_debugfs_init(struct lu_server_fld *fld) { - int rc = 0; - ENTRY; + int rc = 0; - fld->lsf_proc_dir = lprocfs_register(fld->lsf_name, fld_type_proc_dir, - fld_server_proc_list, fld); - if (IS_ERR(fld->lsf_proc_dir)) { - rc = PTR_ERR(fld->lsf_proc_dir); + ENTRY; + fld->lsf_debugfs_entry = ldebugfs_register(fld->lsf_name, + fld_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) { + rc = fld->lsf_debugfs_entry ? 
PTR_ERR(fld->lsf_debugfs_entry) + : -ENOMEM; + fld->lsf_debugfs_entry = NULL; RETURN(rc); } - rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444, - &fld_proc_seq_fops, fld); - if (rc) { - lprocfs_remove(&fld->lsf_proc_dir); - fld->lsf_proc_dir = NULL; - } + rc = ldebugfs_seq_create(fld->lsf_debugfs_entry, "fldb", 0444, + &fld_debugfs_seq_fops, fld); + if (rc) + ldebugfs_remove(&fld->lsf_debugfs_entry); RETURN(rc); } -static void fld_server_proc_fini(struct lu_server_fld *fld) -{ - ENTRY; - if (fld->lsf_proc_dir != NULL) { - if (!IS_ERR(fld->lsf_proc_dir)) - lprocfs_remove(&fld->lsf_proc_dir); - fld->lsf_proc_dir = NULL; - } - EXIT; -} -#else -static int fld_server_proc_init(struct lu_server_fld *fld) -{ - return 0; -} - -static void fld_server_proc_fini(struct lu_server_fld *fld) -{ - return; -} -#endif - int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, struct dt_device *dt, const char *prefix, int type) { @@ -463,7 +456,7 @@ int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, if (rc) GOTO(out_cache, rc); - rc = fld_server_proc_init(fld); + rc = fld_server_debugfs_init(fld); if (rc) GOTO(out_index, rc); @@ -484,10 +477,10 @@ void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) { ENTRY; - fld_server_proc_fini(fld); + fld_server_debugfs_fini(fld); fld_index_fini(env, fld); - if (fld->lsf_cache != NULL) { + if (fld->lsf_cache) { if (!IS_ERR(fld->lsf_cache)) fld_cache_fini(fld->lsf_cache); fld->lsf_cache = NULL; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c index fa9ca9427f22f..f2079cb5b1f49 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_index.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -84,10 +84,10 @@ int fld_declare_index_create(const struct lu_env *env, const struct lu_seq_range *new_range, struct thandle *th) { - struct lu_seq_range *tmp; - struct lu_seq_range *range; - struct fld_thread_info *info; - int rc = 0; + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; ENTRY; @@ -109,8 +109,10 @@ int fld_declare_index_create(const struct lu_env *env, GOTO(out, rc); } - /* Check for merge case, since the fld entry can only be increamental, - * so we will only check whether it can be merged from the left. */ + /* + * Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. 
+ */ if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && lu_seq_range_compare_loc(new_range, range) == 0) { range_cpu_to_be(tmp, range); @@ -156,12 +158,13 @@ int fld_declare_index_create(const struct lu_env *env, int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, const struct lu_seq_range *new_range, struct thandle *th) { - struct lu_seq_range *range; - struct lu_seq_range *tmp; - struct fld_thread_info *info; - int rc = 0; - int deleted = 0; - struct fld_cache_entry *flde; + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -193,7 +196,7 @@ int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, range_cpu_to_be(tmp, tmp); rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, - (struct dt_key *)&tmp->lsr_start, th, 1); + (struct dt_key *)&tmp->lsr_start, th); if (rc != 0) { CERROR("%s: insert range "DRANGE" failed: rc = %d\n", fld->lsf_name, PRANGE(new_range), rc); @@ -229,11 +232,11 @@ int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range) { - struct lu_seq_range *fld_rec; - struct fld_thread_info *info; - int rc; + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; - ENTRY; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); fld_rec = &info->fti_rec; @@ -245,12 +248,12 @@ int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, rc = 0; else rc = -ENOENT; - } + } CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n", - fld->lsf_name, seq, PRANGE(range), rc); + fld->lsf_name, seq, PRANGE(range), rc); - RETURN(rc); + RETURN(rc); } /** @@ -273,6 +276,7 @@ int fld_insert_entry(const struct lu_env *env, struct thandle *th; struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); int rc; + ENTRY; LASSERT(mutex_is_locked(&fld->lsf_lock)); @@ -325,16 +329,18 @@ static int fld_insert_special_entries(const struct lu_env *env, int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, struct dt_device *dt, int type) { - struct dt_object *dt_obj = NULL; - struct lu_fid fid; - struct lu_attr *attr = NULL; - struct lu_seq_range *range = NULL; - struct fld_thread_info *info; - struct dt_object_format dof; - struct dt_it *it; - const struct dt_it_ops *iops; - int rc; - __u32 index; + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + u32 index; + int range_count = 0; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -342,7 +348,7 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, lu_local_obj_fid(&fid, FLD_INDEX_OID); OBD_ALLOC_PTR(attr); - if (attr == NULL) + if (!attr) RETURN(-ENOMEM); memset(attr, 0, sizeof(*attr)); @@ -388,26 +394,41 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, GOTO(out, rc = PTR_ERR(it)); rc = iops->load(env, it, 0); + if (rc > 0) + rc = 0; + else if (rc == 0) + rc = iops->next(env, it); + if (rc < 0) GOTO(out_it_fini, rc); - if (rc > 0) { - /* Load FLD entry into server cache */ - do { - rc = iops->rec(env, it, (struct dt_rec *)range, 0); - if (rc != 0) - GOTO(out_it_put, 
rc); - LASSERT(range != NULL); - range_be_to_cpu(range, range); + while (rc == 0) { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + range_be_to_cpu(range, range); + + /* + * Newly created ldiskfs IAM indexes may include a + * zeroed-out key and record. Ignore it here. + */ + if (range->lsr_start < range->lsr_end) { rc = fld_cache_insert(fld->lsf_cache, range); if (rc != 0) GOTO(out_it_put, rc); - rc = iops->next(env, it); - } while (rc == 0); - } else { - fld->lsf_new = 1; + + range_count++; + } + + rc = iops->next(env, it); + if (rc < 0) + GOTO(out_it_fini, rc); } + if (range_count == 0) + fld->lsf_new = 1; + rc = fld_name_to_index(fld->lsf_name, &index); if (rc < 0) GOTO(out_it_put, rc); @@ -415,8 +436,10 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, rc = 0; if (index == 0 && type == LU_SEQ_RANGE_MDT) { - /* Note: fld_insert_entry will detect whether these - * special entries already exist inside FLDB */ + /* + * Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB + */ mutex_lock(&fld->lsf_lock); rc = fld_insert_special_entries(env, fld); mutex_unlock(&fld->lsf_lock); @@ -431,11 +454,11 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, out_it_fini: iops->fini(env, it); out: - if (attr != NULL) + if (attr) OBD_FREE_PTR(attr); if (rc < 0) { - if (dt_obj != NULL) + if (dt_obj) dt_object_put(env, dt_obj); fld->lsf_obj = NULL; } @@ -445,7 +468,7 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) { ENTRY; - if (fld->lsf_obj != NULL) { + if (fld->lsf_obj) { if (!IS_ERR(fld->lsf_obj)) dt_object_put(env, fld->lsf_obj); fld->lsf_obj = NULL; @@ -457,12 +480,12 @@ int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len) { struct lu_seq_range_array *lsra = data; - struct fld_thread_info *info; - struct dt_object *dt_obj = fld->lsf_obj; - struct lu_seq_range *entry; - struct dt_it *it; - const struct dt_it_ops *iops; - int rc; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h index dcb24a3c2f22a..48337e0b6839b 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -56,7 +56,6 @@ #define __FLD_INTERNAL_H #include -#include #include #include @@ -139,12 +138,6 @@ enum { extern struct lu_fld_hash fld_hash[]; - -#ifdef CONFIG_PROC_FS -extern struct proc_dir_entry *fld_type_proc_dir; -extern struct lprocfs_vars fld_client_proc_list[]; -#endif - # ifdef HAVE_SERVER_SUPPORT struct fld_thread_info { struct lu_seq_range fti_rec; @@ -172,16 +165,15 @@ int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range); int fld_name_to_index(const char *name, __u32 *index); -int fld_server_mod_init(void); +int fld_server_mod_init(void); void fld_server_mod_exit(void); int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len); -#ifdef CONFIG_PROC_FS -extern const struct file_operations fld_proc_seq_fops; -extern struct lprocfs_vars fld_server_proc_list[]; -#endif + +extern const struct 
file_operations fld_debugfs_seq_fops; +extern struct dentry *fld_debugfs_dir; # endif /* HAVE_SERVER_SUPPORT */ @@ -189,6 +181,8 @@ int fld_client_rpc(struct obd_export *exp, struct lu_seq_range *range, __u32 fld_op, struct ptlrpc_request **reqp); +extern struct ldebugfs_vars fld_client_debugfs_list[]; + struct fld_cache *fld_cache_init(const char *name, int cache_size, int cache_threshold); diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c index 19b5789c19851..3dd616e0a6e94 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_request.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -60,15 +60,18 @@ static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) static struct lu_fld_target * fld_rrb_scan(struct lu_client_fld *fld, u64 seq) { - struct lu_fld_target *target; - int hash; - ENTRY; + struct lu_fld_target *target; + int hash; + + ENTRY; - /* Because almost all of special sequence located in MDT0, + /* + * Because almost all of special sequence located in MDT0, * it should go to index 0 directly, instead of calculating * hash again, and also if other MDTs is not being connected, * the fld lookup requests(for seq on MDT0) should not be - * blocked because of other MDTs */ + * blocked because of other MDTs + */ if (fid_seq_is_norm(seq)) hash = fld_rrb_hash(fld, seq); else @@ -76,57 +79,59 @@ fld_rrb_scan(struct lu_client_fld *fld, u64 seq) again: list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - if (target->ft_idx == hash) - RETURN(target); - } + if (target->ft_idx == hash) + RETURN(target); + } if (hash != 0) { - /* It is possible the remote target(MDT) are not connected to + /* + * It is possible the remote target(MDT) are not connected to * with client yet, so we will refer this to MDT0, which should - * be connected during mount */ + * be connected during mount + */ hash = 0; goto again; } - CERROR("%s: Can't find target by hash %d (seq %#llx). " - "Targets (%d):\n", fld->lcf_name, hash, seq, - fld->lcf_count); + CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", + fld->lcf_name, hash, seq, fld->lcf_count); list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - const char *srv_name = target->ft_srv != NULL ? - target->ft_srv->lsf_name : ""; - const char *exp_name = target->ft_exp != NULL ? - (char *)target->ft_exp->exp_obd->obd_uuid.uuid : - ""; + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", - target->ft_exp, exp_name, target->ft_srv, - srv_name, target->ft_idx); - } - - /* - * If target is not found, there is logical error anyway, so here is - * LBUG() to catch this situation. - */ - LBUG(); - RETURN(NULL); + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. 
+ */ + LBUG(); + RETURN(NULL); } struct lu_fld_hash fld_hash[] = { - { - .fh_name = "RRB", - .fh_hash_func = fld_rrb_hash, - .fh_scan_func = fld_rrb_scan - }, - { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { NULL, - } + } }; static struct lu_fld_target * fld_client_get_target(struct lu_client_fld *fld, u64 seq) { struct lu_fld_target *target; + ENTRY; LASSERT(fld->lcf_hash != NULL); @@ -135,13 +140,12 @@ fld_client_get_target(struct lu_client_fld *fld, u64 seq) target = fld->lcf_hash->fh_scan_func(fld, seq); spin_unlock(&fld->lcf_lock); - if (target != NULL) { - CDEBUG(D_INFO, "%s: Found target (idx %llu" - ") by seq %#llx\n", fld->lcf_name, - target->ft_idx, seq); - } + if (target) { + CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", + fld->lcf_name, target->ft_idx, seq); + } - RETURN(target); + RETURN(target); } /* @@ -149,44 +153,45 @@ fld_client_get_target(struct lu_client_fld *fld, u64 seq) * of FLD module. */ int fld_client_add_target(struct lu_client_fld *fld, - struct lu_fld_target *tar) + struct lu_fld_target *tar) { const char *name; - struct lu_fld_target *target, *tmp; - ENTRY; + struct lu_fld_target *target, *tmp; - LASSERT(tar != NULL); + ENTRY; + + LASSERT(tar != NULL); name = fld_target_name(tar); - LASSERT(name != NULL); - LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, name, tar->ft_idx); - OBD_ALLOC_PTR(target); - if (target == NULL) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(target); + if (!target) + RETURN(-ENOMEM); spin_lock(&fld->lcf_lock); list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { if (tmp->ft_idx == tar->ft_idx) { spin_unlock(&fld->lcf_lock); - OBD_FREE_PTR(target); + OBD_FREE_PTR(target); CERROR("Target %s exists in FLD and known as %s:#%llu\n", - name, fld_target_name(tmp), tmp->ft_idx); - RETURN(-EEXIST); - } - } + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } - target->ft_exp = tar->ft_exp; - if (target->ft_exp != NULL) - class_export_get(target->ft_exp); - target->ft_srv = tar->ft_srv; - target->ft_idx = tar->ft_idx; + target->ft_exp = tar->ft_exp; + if (target->ft_exp) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; list_add_tail(&target->ft_chain, &fld->lcf_targets); - fld->lcf_count++; + fld->lcf_count++; spin_unlock(&fld->lcf_lock); RETURN(0); @@ -194,9 +199,10 @@ int fld_client_add_target(struct lu_client_fld *fld, EXPORT_SYMBOL(fld_client_add_target); /* Remove export from FLD */ -int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) +int fld_client_del_target(struct lu_client_fld *fld, u64 idx) { struct lu_fld_target *target, *tmp; + ENTRY; spin_lock(&fld->lcf_lock); @@ -206,182 +212,161 @@ int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) list_del(&target->ft_chain); spin_unlock(&fld->lcf_lock); - if (target->ft_exp != NULL) - class_export_put(target->ft_exp); + if (target->ft_exp) + class_export_put(target->ft_exp); - OBD_FREE_PTR(target); - RETURN(0); - } - } + OBD_FREE_PTR(target); + RETURN(0); + } + } spin_unlock(&fld->lcf_lock); RETURN(-ENOENT); } -#ifdef CONFIG_PROC_FS -static int fld_client_proc_init(struct lu_client_fld *fld) +struct dentry *fld_debugfs_dir; + +static int fld_client_debugfs_init(struct lu_client_fld *fld) { int rc; - ENTRY; - fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, fld_type_proc_dir, - NULL, NULL); - if 
(IS_ERR(fld->lcf_proc_dir)) { - CERROR("%s: LProcFS failed in fld-init\n", - fld->lcf_name); - rc = PTR_ERR(fld->lcf_proc_dir); + ENTRY; + fld->lcf_debugfs_entry = ldebugfs_register(fld->lcf_name, + fld_debugfs_dir, + fld_client_debugfs_list, + fld); + if (IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) { + CERROR("%s: LdebugFS failed in fld-init\n", fld->lcf_name); + rc = fld->lcf_debugfs_entry ? PTR_ERR(fld->lcf_debugfs_entry) + : -ENOMEM; + fld->lcf_debugfs_entry = NULL; RETURN(rc); } - rc = lprocfs_add_vars(fld->lcf_proc_dir, fld_client_proc_list, fld); - if (rc) { - CERROR("%s: Can't init FLD proc, rc %d\n", - fld->lcf_name, rc); - GOTO(out_cleanup, rc); - } - - RETURN(0); - -out_cleanup: - fld_client_proc_fini(fld); - return rc; -} - -void fld_client_proc_fini(struct lu_client_fld *fld) -{ - ENTRY; - if (fld->lcf_proc_dir) { - if (!IS_ERR(fld->lcf_proc_dir)) - lprocfs_remove(&fld->lcf_proc_dir); - fld->lcf_proc_dir = NULL; - } - EXIT; -} -#else /* !CONFIG_PROC_FS */ -static int fld_client_proc_init(struct lu_client_fld *fld) -{ - return 0; + return 0; } -void fld_client_proc_fini(struct lu_client_fld *fld) +void fld_client_debugfs_fini(struct lu_client_fld *fld) { - return; + if (!IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) + ldebugfs_remove(&fld->lcf_debugfs_entry); } -#endif /* CONFIG_PROC_FS */ - -EXPORT_SYMBOL(fld_client_proc_fini); +EXPORT_SYMBOL(fld_client_debugfs_fini); static inline int hash_is_sane(int hash) { - return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); } int fld_client_init(struct lu_client_fld *fld, - const char *prefix, int hash) + const char *prefix, int hash) { - int cache_size, cache_threshold; - int rc; - ENTRY; - - LASSERT(fld != NULL); + int cache_size, cache_threshold; + int rc; - snprintf(fld->lcf_name, sizeof(fld->lcf_name), - "cli-%s", prefix); + ENTRY; + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); - if (!hash_is_sane(hash)) { - CERROR("%s: Wrong hash function %#x\n", - fld->lcf_name, hash); - RETURN(-EINVAL); - } + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } fld->lcf_count = 0; spin_lock_init(&fld->lcf_lock); fld->lcf_hash = &fld_hash[hash]; INIT_LIST_HEAD(&fld->lcf_targets); - cache_size = FLD_CLIENT_CACHE_SIZE / - sizeof(struct fld_cache_entry); + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); - cache_threshold = cache_size * - FLD_CLIENT_CACHE_THRESHOLD / 100; + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; - fld->lcf_cache = fld_cache_init(fld->lcf_name, - cache_size, cache_threshold); - if (IS_ERR(fld->lcf_cache)) { - rc = PTR_ERR(fld->lcf_cache); - fld->lcf_cache = NULL; - GOTO(out, rc); - } + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } - rc = fld_client_proc_init(fld); - if (rc) - GOTO(out, rc); - EXIT; + rc = fld_client_debugfs_init(fld); + if (rc) + GOTO(out, rc); + EXIT; out: - if (rc) - fld_client_fini(fld); - else - CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", - fld->lcf_name, fld->lcf_hash->fh_name); - return rc; + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; } EXPORT_SYMBOL(fld_client_init); void fld_client_fini(struct lu_client_fld *fld) { struct lu_fld_target *target, *tmp; + ENTRY; spin_lock(&fld->lcf_lock); 
list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { - fld->lcf_count--; + fld->lcf_count--; list_del(&target->ft_chain); - if (target->ft_exp != NULL) - class_export_put(target->ft_exp); - OBD_FREE_PTR(target); - } + if (target->ft_exp) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } spin_unlock(&fld->lcf_lock); - if (fld->lcf_cache != NULL) { - if (!IS_ERR(fld->lcf_cache)) - fld_cache_fini(fld->lcf_cache); - fld->lcf_cache = NULL; - } + if (fld->lcf_cache) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } - EXIT; + EXIT; } EXPORT_SYMBOL(fld_client_fini); int fld_client_rpc(struct obd_export *exp, - struct lu_seq_range *range, __u32 fld_op, + struct lu_seq_range *range, u32 fld_op, struct ptlrpc_request **reqp) { struct ptlrpc_request *req = NULL; - struct lu_seq_range *prange; - __u32 *op; - int rc = 0; - struct obd_import *imp; + struct lu_seq_range *prange; + u32 *op; + int rc = 0; + struct obd_import *imp; + ENTRY; LASSERT(exp != NULL); -again: imp = class_exp2cliimp(exp); switch (fld_op) { case FLD_QUERY: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION, FLD_QUERY); - if (req == NULL) + if (!req) RETURN(-ENOMEM); - /* XXX: only needed when talking to old server(< 2.6), it should - * be removed when < 2.6 server is not supported */ + /* + * XXX: only needed when talking to old server(< 2.6), it should + * be removed when < 2.6 server is not supported + */ op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); *op = FLD_LOOKUP; - /* For MDS_MDS seq lookup, it will always use LWP connection, + /* + * For MDS_MDS seq lookup, it will always use LWP connection, * but LWP will be evicted after restart, so cause the error. * so we will set no_delay for seq lookup request, once the - * request fails because of the eviction. always retry here */ + * request fails because of the eviction. always retry here + */ if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { req->rq_allow_replay = 1; req->rq_no_delay = 1; @@ -390,7 +375,7 @@ int fld_client_rpc(struct obd_export *exp, case FLD_READ: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, LUSTRE_MDS_VERSION, FLD_READ); - if (req == NULL) + if (!req) RETURN(-ENOMEM); req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, @@ -407,13 +392,19 @@ int fld_client_rpc(struct obd_export *exp, prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); *prange = *range; ptlrpc_request_set_replen(req); - req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_request_portal = FLD_REQUEST_PORTAL; req->rq_reply_portal = MDC_REPLY_PORTAL; - ptlrpc_at_set_req_timeout(req); - - obd_get_request_slot(&exp->exp_obd->u.cli); - rc = ptlrpc_queue_wait(req); - obd_put_request_slot(&exp->exp_obd->u.cli); + ptlrpc_at_set_req_timeout(req); + + if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) { + /* the same error returned by ptlrpc_import_delay_req */ + rc = -EWOULDBLOCK; + req->rq_status = rc; + } else { + obd_get_request_slot(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(&exp->exp_obd->u.cli); + } if (rc == -ENOENT) { /* Don't loop forever on non-existing FID sequences. 
*/ @@ -426,14 +417,11 @@ int fld_client_rpc(struct obd_export *exp, imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && rc != -ENOTSUPP) { - /* Since LWP is not replayable, so it will keep - * trying unless umount happens or the remote - * target does not support the operation, otherwise - * it would cause unecessary failure of the - * application. */ - ptlrpc_req_finished(req); - rc = 0; - goto again; + /* + * Since LWP is not replayable, so notify the caller + * to retry if needed after a while. + */ + rc = -EAGAIN; } GOTO(out_req, rc); } @@ -441,31 +429,32 @@ int fld_client_rpc(struct obd_export *exp, if (fld_op == FLD_QUERY) { prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); - if (prange == NULL) + if (!prange) GOTO(out_req, rc = -EFAULT); *range = *prange; } EXIT; out_req: - if (rc != 0 || reqp == NULL) { + if (rc != 0 || !reqp) { ptlrpc_req_finished(req); req = NULL; } - if (reqp != NULL) + if (reqp) *reqp = req; return rc; } int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, - __u32 flags, const struct lu_env *env) + u32 flags, const struct lu_env *env) { struct lu_seq_range res = { 0 }; struct lu_fld_target *target; struct lu_fld_target *origin; int rc; + ENTRY; rc = fld_cache_lookup(fld->lcf_cache, seq, &res); @@ -474,20 +463,19 @@ int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, RETURN(0); } - /* Can not find it in the cache */ - target = fld_client_get_target(fld, seq); - LASSERT(target != NULL); + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); origin = target; again: - CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on " - "target %s (idx %llu)\n", fld->lcf_name, seq, - fld_target_name(target), target->ft_idx); + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", + fld->lcf_name, seq, fld_target_name(target), target->ft_idx); res.lsr_start = seq; fld_range_set_type(&res, flags); #ifdef HAVE_SERVER_SUPPORT - if (target->ft_srv != NULL) { + if (target->ft_srv) { LASSERT(env != NULL); rc = fld_server_lookup(env, target->ft_srv, seq, &res); } else @@ -497,15 +485,17 @@ int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, } if (rc == -ESHUTDOWN) { - /* If fld lookup failed because the target has been shutdown, + /* + * If fld lookup failed because the target has been shutdown, * then try next target in the list, until trying all targets - * or fld lookup succeeds */ + * or fld lookup succeeds + */ spin_lock(&fld->lcf_lock); - - /* If the next entry in the list is the head of the list, + /* + * If the next entry in the list is the head of the list, * move to the next entry after the head and retrieve - * the target. Else retreive the next target entry. */ - + * the target. Else retreive the next target entry. 
+ */ if (target->ft_chain.next == &fld->lcf_targets) target = list_entry(target->ft_chain.next->next, struct lu_fld_target, ft_chain); @@ -528,25 +518,23 @@ EXPORT_SYMBOL(fld_client_lookup); void fld_client_flush(struct lu_client_fld *fld) { - fld_cache_flush(fld->lcf_cache); + fld_cache_flush(fld->lcf_cache); } - -struct proc_dir_entry *fld_type_proc_dir; - static int __init fld_init(void) { - fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, - proc_lustre_root, - NULL, NULL); - if (IS_ERR(fld_type_proc_dir)) - return PTR_ERR(fld_type_proc_dir); - #ifdef HAVE_SERVER_SUPPORT - fld_server_mod_init(); + int rc; + + rc = fld_server_mod_init(); + if (rc) + return rc; #endif /* HAVE_SERVER_SUPPORT */ - return 0; + fld_debugfs_dir = ldebugfs_register(LUSTRE_FLD_NAME, + debugfs_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(fld_debugfs_dir); } static void __exit fld_exit(void) @@ -555,10 +543,8 @@ static void __exit fld_exit(void) fld_server_mod_exit(); #endif /* HAVE_SERVER_SUPPORT */ - if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { - lprocfs_remove(&fld_type_proc_dir); - fld_type_proc_dir = NULL; - } + if (!IS_ERR_OR_NULL(fld_debugfs_dir)) + ldebugfs_remove(&fld_debugfs_dir); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c index 926ed5598052b..a555889f57730 100644 --- a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -41,37 +41,36 @@ #include #include + +#ifdef HAVE_SERVER_SUPPORT #include +#endif #include #include #include #include "fld_internal.h" -#ifdef CONFIG_PROC_FS static int -fld_proc_targets_seq_show(struct seq_file *m, void *unused) +fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) { struct lu_client_fld *fld = (struct lu_client_fld *)m->private; struct lu_fld_target *target; - ENTRY; - - LASSERT(fld != NULL); + ENTRY; spin_lock(&fld->lcf_lock); list_for_each_entry(target, &fld->lcf_targets, ft_chain) seq_printf(m, "%s\n", fld_target_name(target)); spin_unlock(&fld->lcf_lock); + RETURN(0); } static int -fld_proc_hash_seq_show(struct seq_file *m, void *unused) +fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) { struct lu_client_fld *fld = (struct lu_client_fld *)m->private; - ENTRY; - - LASSERT(fld != NULL); + ENTRY; spin_lock(&fld->lcf_lock); seq_printf(m, "%s\n", fld->lcf_hash->fh_name); spin_unlock(&fld->lcf_lock); @@ -80,7 +79,7 @@ fld_proc_hash_seq_show(struct seq_file *m, void *unused) } static ssize_t -fld_proc_hash_seq_write(struct file *file, const char __user *buffer, +fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { struct lu_client_fld *fld; @@ -91,13 +90,12 @@ fld_proc_hash_seq_write(struct file *file, const char __user *buffer, if (count > sizeof(fh_name)) return -ENAMETOOLONG; - if (lprocfs_copy_from_user(file, fh_name, buffer, count) != 0) + if (copy_from_user(fh_name, buffer, count) != 0) return -EFAULT; fld = ((struct seq_file *)file->private_data)->private; - LASSERT(fld != NULL); - for (i = 0; fld_hash[i].fh_name != NULL; i++) { + for (i = 0; fld_hash[i].fh_name; i++) { if (count != strlen(fld_hash[i].fh_name)) continue; @@ -107,7 +105,7 @@ fld_proc_hash_seq_write(struct file *file, const char __user *buffer, } } - if (hash != NULL) { + if (hash) { spin_lock(&fld->lcf_lock); fld->lcf_hash = hash; spin_unlock(&fld->lcf_lock); @@ -119,15 +117,14 @@ fld_proc_hash_seq_write(struct file *file, const char __user 
*buffer, return count; } -static ssize_t -lprocfs_cache_flush_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *pos) +static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *pos) { - struct lu_client_fld *fld = ((struct seq_file *)file->private_data)->private; - ENTRY; - - LASSERT(fld != NULL); + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + ENTRY; fld_cache_flush(fld->lcf_cache); CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); @@ -135,15 +132,15 @@ lprocfs_cache_flush_seq_write(struct file *file, const char __user *buffer, RETURN(count); } -LPROC_SEQ_FOPS_RO(fld_proc_targets); -LPROC_SEQ_FOPS(fld_proc_hash); -LPROC_SEQ_FOPS_WO_TYPE(fld, cache_flush); +LDEBUGFS_SEQ_FOPS_RO(fld_debugfs_targets); +LDEBUGFS_SEQ_FOPS(fld_debugfs_hash); +LDEBUGFS_FOPS_WR_ONLY(fld, cache_flush); -struct lprocfs_vars fld_client_proc_list[] = { +struct ldebugfs_vars fld_client_debugfs_list[] = { { .name = "targets", - .fops = &fld_proc_targets_fops }, + .fops = &fld_debugfs_targets_fops }, { .name = "hash", - .fops = &fld_proc_hash_fops }, + .fops = &fld_debugfs_hash_fops }, { .name = "cache_flush", .fops = &fld_cache_flush_fops }, { NULL } @@ -275,17 +272,13 @@ struct seq_operations fldb_sops = { static int fldb_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; - struct lu_server_fld *fld = (struct lu_server_fld *)PDE_DATA(inode); + struct lu_server_fld *fld = inode->i_private; struct dt_object *obj; const struct dt_it_ops *iops; struct fld_seq_param *param = NULL; int env_init = 0; int rc; - rc = LPROCFS_ENTRY_CHECK(inode); - if (rc < 0) - return rc; - rc = seq_open(file, &fldb_sops); if (rc) GOTO(out, rc); @@ -355,17 +348,11 @@ static int fldb_seq_release(struct inode *inode, struct file *file) return 0; } -const struct file_operations fld_proc_seq_fops = { +const struct file_operations fld_debugfs_seq_fops = { .owner = THIS_MODULE, .open = fldb_seq_open, .read = seq_read, .release = fldb_seq_release, }; -struct lprocfs_vars fld_server_proc_list[] = { - { NULL } -}; - # endif /* HAVE_SERVER_SUPPORT */ - -#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h index 78d09269a33c9..f0c8a5b4bfda0 100644 --- a/drivers/staging/lustrefsx/lustre/include/cl_object.h +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -88,14 +88,17 @@ /* * super-class definitions. */ +#include +#include + #include -#include #include #include #include #include #include #include +#include #include struct obd_info; @@ -118,8 +121,6 @@ struct cl_io_slice; struct cl_req_attr; -extern struct cfs_ptask_engine *cl_io_engine; - /** * Device in the client stack. * @@ -415,6 +416,13 @@ struct cl_object_operations { void (*coo_req_attr_set)(const struct lu_env *env, struct cl_object *obj, struct cl_req_attr *attr); + /** + * Flush \a obj data corresponding to \a lock. Used for DoM + * locks in llite's cancelling blocking ast callback. 
+ */ + int (*coo_object_flush)(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock); }; /** @@ -703,7 +711,7 @@ enum cl_page_type { /** Transient page, the transient cl_page is used to bind a cl_page * to vmpage which is not belonging to the same object of cl_page. - * it is used in DirectIO, lockless IO and liblustre. */ + * it is used in DirectIO and lockless IO. */ CPT_TRANSIENT, }; @@ -864,6 +872,13 @@ struct cl_page_operations { */ int (*cpo_is_vmlocked)(const struct lu_env *env, const struct cl_page_slice *slice); + + /** + * Update file attributes when all we have is this page. Used for tiny + * writes to update attributes when we don't have a full cl_io. + */ + void (*cpo_page_touch)(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to); /** * Page destruction. */ @@ -888,7 +903,8 @@ struct cl_page_operations { const struct cl_page_slice *slice); /** Destructor. Frees resources and slice itself. */ void (*cpo_fini)(const struct lu_env *env, - struct cl_page_slice *slice); + struct cl_page_slice *slice, + struct pagevec *pvec); /** * Optional debugging helper. Prints given page slice. * @@ -1071,15 +1087,13 @@ static inline bool __page_in_use(const struct cl_page *page, int refc) * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. * - * Typical cl_lock consists of the two layers: + * Typical cl_lock consists of one layer: * - * - vvp_lock (vvp specific data), and * - lov_lock (lov specific data). * * lov_lock contains an array of sub-locks. Each of these sub-locks is a * normal cl_lock: it has a header (struct cl_lock) and a list of layers: * - * - lovsub_lock, and * - osc_lock * * Each sub-lock is associated with a cl_object (representing stripe @@ -1199,7 +1213,7 @@ struct cl_lock { /** * Per-layer part of cl_lock * - * \see vvp_lock, lov_lock, lovsub_lock, osc_lock + * \see lov_lock, osc_lock */ struct cl_lock_slice { struct cl_lock *cls_lock; @@ -1213,7 +1227,7 @@ struct cl_lock_slice { /** * - * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + * \see lov_lock_ops, osc_lock_ops */ struct cl_lock_operations { /** @{ */ @@ -1225,8 +1239,7 @@ struct cl_lock_operations { * @anchor for resources * \retval -ve failure * - * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), - * \see osc_lock_enqueue() + * \see lov_lock_enqueue(), osc_lock_enqueue() */ int (*clo_enqueue)(const struct lu_env *env, const struct cl_lock_slice *slice, @@ -1241,8 +1254,7 @@ struct cl_lock_operations { /** * Destructor. Frees resources and the slice. * - * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), - * \see osc_lock_fini() + * \see lov_lock_fini(), osc_lock_fini() */ void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); /** @@ -1297,7 +1309,7 @@ struct cl_page_list { struct task_struct *pl_owner; }; -/** +/** * A 2-queue of pages. A convenience data-type for common use case, 2-queue * contains an incoming page list and an outgoing page list. */ @@ -1378,6 +1390,10 @@ enum cl_io_type { * To write out a range of file */ CIT_FSYNC, + /** + * glimpse. An io context to acquire glimpse lock. + */ + CIT_GLIMPSE, /** * Miscellaneous io. This is used for occasional io activity that * doesn't fit into other types. Currently this is used for: @@ -1389,8 +1405,6 @@ enum cl_io_type { * - VM induced page write-out. An io context for writing page out * for memory cleansing; * - * - glimpse. An io context to acquire glimpse lock. 
- * * - grouplock. An io context to acquire group lock. * * CIT_MISC io is used simply as a context in which locks and pages @@ -1607,25 +1621,30 @@ enum cl_enq_flags { * -EWOULDBLOCK is returned immediately. */ CEF_NONBLOCK = 0x00000001, - /** - * take lock asynchronously (out of order), as it cannot - * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. - */ - CEF_ASYNC = 0x00000002, + /** + * Tell lower layers this is a glimpse request, translated to + * LDLM_FL_HAS_INTENT at LDLM layer. + * + * Also, because glimpse locks never block other locks, we count this + * as automatically compatible with other osc locks. + * (see osc_lock_compatible) + */ + CEF_GLIMPSE = 0x00000002, /** * tell the server to instruct (though a flag in the blocking ast) an * owner of the conflicting lock, that it can drop dirty pages * protected by this lock, without sending them to the server. */ CEF_DISCARD_DATA = 0x00000004, - /** - * tell the sub layers that it must be a `real' lock. This is used for - * mmapped-buffer locks and glimpse locks that must be never converted - * into lockless mode. - * - * \see vvp_mmap_locks(), cl_glimpse_lock(). - */ - CEF_MUST = 0x00000008, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks, glimpse locks, manually requested locks + * (LU_LADVISE_LOCKAHEAD) that must never be converted into lockless + * mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock, cl_request_lock(). + */ + CEF_MUST = 0x00000008, /** * tell the sub layers that never request a `real' lock. This flag is * not used currently. @@ -1638,9 +1657,16 @@ enum cl_enq_flags { */ CEF_NEVER = 0x00000010, /** - * for async glimpse lock. + * tell the dlm layer this is a speculative lock request + * speculative lock requests are locks which are not requested as part + * of an I/O operation. Instead, they are requested because we expect + * to use them in the future. They are requested asynchronously at the + * ptlrpc layer. + * + * Currently used for asynchronous glimpse locks and manually requested + * locks (LU_LADVISE_LOCKAHEAD). */ - CEF_AGL = 0x00000020, + CEF_SPECULATIVE = 0x00000020, /** * enqueue a lock to test DLM lock existence. */ @@ -1650,10 +1676,14 @@ enum cl_enq_flags { * is known to exist. */ CEF_LOCK_MATCH = 0x00000080, + /** + * tell the DLM layer to lock only the requested range + */ + CEF_LOCK_NO_EXPAND = 0x00000100, /** * mask of enq_flags. */ - CEF_MASK = 0x000000ff, + CEF_MASK = 0x000001ff, }; /** @@ -1731,21 +1761,10 @@ enum cl_fsync_mode { CL_FSYNC_ALL = 3 }; -struct cl_io_range { - loff_t cir_pos; - size_t cir_count; -}; - -struct cl_io_pt { - struct cl_io_pt *cip_next; - struct cfs_ptask cip_task; - struct kiocb cip_iocb; - struct iov_iter cip_iter; - struct file *cip_file; - enum cl_io_type cip_iot; - loff_t cip_pos; - size_t cip_count; - ssize_t cip_result; +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; }; /** @@ -1775,27 +1794,30 @@ struct cl_io { struct cl_lockset ci_lockset; /** lock requirements, this is just a help info for sublayers. 
*/ enum cl_io_lock_dmd ci_lockreq; - union { - struct cl_rw_io { - struct iov_iter rw_iter; - struct kiocb rw_iocb; - struct cl_io_range rw_range; - struct file *rw_file; - unsigned int rw_nonblock:1, - rw_append:1, - rw_sync:1; - int (*rw_ptask)(struct cfs_ptask *ptask); - } ci_rw; + /** layout version when this IO occurs */ + __u32 ci_layout_version; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; struct cl_setattr_io { struct ost_lvb sa_attr; unsigned int sa_attr_flags; - unsigned int sa_valid; + unsigned int sa_avalid; /* ATTR_* */ + unsigned int sa_xvalid; /* OP_XVALID */ int sa_stripe_index; struct ost_layout sa_layout; const struct lu_fid *sa_parent_fid; } ci_setattr; struct cl_data_version_io { u64 dv_data_version; + u32 dv_layout_version; int dv_flags; } ci_data_version; struct cl_fault_io { @@ -1850,8 +1872,10 @@ struct cl_io { */ ci_ignore_layout:1, /** - * Need MDS intervention to complete a write. This usually means the - * corresponding component is not initialized for the writing extent. + * Need MDS intervention to complete a write. + * Write intent is required for the following cases: + * 1. component being written is not initialized, or + * 2. the mirrored files are NOT in WRITE_PENDING state. */ ci_need_write_intent:1, /** @@ -1870,12 +1894,43 @@ struct cl_io { * O_NOATIME */ ci_noatime:1, - /** Set to 1 if parallel execution is allowed for current I/O? */ - ci_pio:1; + /* Tell sublayers not to expand LDLM locks requested for this IO */ + ci_lock_no_expand:1, + /** + * Set if non-delay RPC should be used for this IO. + * + * If this file has multiple mirrors, and if the OSTs of the current + * mirror is inaccessible, non-delay RPC would error out quickly so + * that the upper layer can try to access the next mirror. + */ + ci_ndelay:1, + /** + * Set if we've tried all mirrors for this read IO, if it's not set, + * the read IO will check to-be-read OSCs' status, and make fast-switch + * another mirror if some of the OSTs are not healthy. + */ + ci_tried_all_mirrors:1; + /** + * Bypass quota check + */ + unsigned ci_noquota:1; + /** + * How many times the read has retried before this one. + * Set by the top level and consumed by the LOV. + */ + unsigned ci_ndelay_tried; + /** + * Designated mirror index for this I/O. + */ + unsigned ci_designated_mirror; /** * Number of pages owned by this IO. For invariant checking. */ unsigned ci_owned_nr; + /** + * Range of write intent. Valid if ci_need_write_intent is set. + */ + struct lu_extent ci_write_intent; }; /** @} cl_io */ @@ -2058,6 +2113,9 @@ int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, struct cl_layout *cl); loff_t cl_object_maxbytes(struct cl_object *obj); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); + /** * Returns true, iff \a o0 and \a o1 are slices of the same object. 
@@ -2112,6 +2170,9 @@ struct cl_page *cl_page_alloc (const struct lu_env *env, void cl_page_get (struct cl_page *page); void cl_page_put (const struct lu_env *env, struct cl_page *page); +void cl_pagevec_put (const struct lu_env *env, + struct cl_page *page, + struct pagevec *pvec); void cl_page_print (const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_page *pg); @@ -2179,6 +2240,8 @@ void cl_page_discard(const struct lu_env *env, struct cl_io *io, void cl_page_delete(const struct lu_env *env, struct cl_page *pg); int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to); void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate); loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); @@ -2306,12 +2369,12 @@ int cl_io_cancel (const struct lu_env *env, struct cl_io *io, */ static inline int cl_io_is_append(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_append; + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; } static inline int cl_io_is_sync_write(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_sync; + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; } static inline int cl_io_is_mkwrite(const struct cl_io *io) @@ -2324,8 +2387,8 @@ static inline int cl_io_is_mkwrite(const struct cl_io *io) */ static inline int cl_io_is_trunc(const struct cl_io *io) { - return io->ci_type == CIT_SETATTR && - (io->u.ci_setattr.sa_valid & ATTR_SIZE); + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_avalid & ATTR_SIZE); } struct cl_io *cl_io_top(struct cl_io *io); @@ -2333,13 +2396,12 @@ struct cl_io *cl_io_top(struct cl_io *io); void cl_io_print(const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_io *io); -#define CL_IO_SLICE_CLEAN(foo_io, base) \ -do { \ - typeof(foo_io) __foo_io = (foo_io); \ - \ - CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ - memset(&__foo_io->base + 1, 0, \ - (sizeof *__foo_io) - sizeof __foo_io->base); \ +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + memset(&__foo_io->base, 0, \ + sizeof(*__foo_io) - offsetof(typeof(*__foo_io), base)); \ } while (0) /** @} cl_io */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h index e872981b5284e..f16895ddafba6 100644 --- a/drivers/staging/lustrefsx/lustre/include/dt_object.h +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -86,6 +86,8 @@ struct dt_device_param { * calculation */ unsigned int ddp_extent_tax; unsigned int ddp_brw_size; /* optimal RPC size */ + /* T10PI checksum type, zero if not supported */ + enum cksum_types ddp_t10_cksum_type; }; /** @@ -251,6 +253,13 @@ struct dt_device_operations { const struct dt_device *dev, struct dt_device_param *param); + /** + * Return device's super block. + * + * \param[in] dev dt device + */ + struct super_block *(*dt_mnt_sb_get)(const struct dt_device *dev); + /** * Sync the device. 
* @@ -369,6 +378,9 @@ struct dt_allocation_hint { const void *dah_eadata; int dah_eadata_len; __u32 dah_mode; + int dah_append_stripes; + bool dah_can_block; + char *dah_append_pool; }; /** @@ -416,6 +428,8 @@ typedef __u64 dt_obj_version_t; union ldlm_policy_data; +struct md_layout_change; + /** * A dt_object provides common operations to create and destroy * objects and to manage regular and extended attributes. @@ -1040,8 +1054,7 @@ struct dt_object_operations { */ int (*do_declare_layout_change)(const struct lu_env *env, struct dt_object *dt, - struct layout_intent *layout, - const struct lu_buf *buf, + struct md_layout_change *mlc, struct thandle *th); /** @@ -1057,8 +1070,8 @@ struct dt_object_operations { * \retval -ne error code */ int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt, - struct layout_intent *layout, - const struct lu_buf *buf, struct thandle *th); + struct md_layout_change *mlc, + struct thandle *th); }; enum dt_bufs_type { @@ -1136,7 +1149,6 @@ struct dt_body_operations { * \param[in] pos position in the object to start * \param[out] pos \a pos + bytes written * \param[in] th transaction handle - * \param[in] ignore unused (was used to request quota ignorance) * * \retval positive bytes written on success * \retval negative negated errno on error @@ -1145,8 +1157,7 @@ struct dt_body_operations { struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *th, - int ignore); + struct thandle *th); /** * Return buffers for data. @@ -1175,6 +1186,7 @@ struct dt_body_operations { * \param[in] pos position in the object to start * \param[in] len size of region in bytes * \param[out] lb array of descriptors to fill + * \param[in] maxlnb max slots in @lnb array * \param[in] rw 0 if used to read, 1 if used for write * * \retval positive number of descriptors on success @@ -1185,6 +1197,7 @@ struct dt_body_operations { loff_t pos, ssize_t len, struct niobuf_local *lb, + int maxlnb, enum dt_bufs_type rw); /** @@ -1479,7 +1492,6 @@ struct dt_index_operations { * \param[in] rec buffer storing value * \param[in] key key * \param[in] th transaction handle - * \param[in] ignore unused (was used to request quota ignorance) * * \retval 0 on success * \retval negative negated errno on error @@ -1488,8 +1500,7 @@ struct dt_index_operations { struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *th, - int ignore); + struct thandle *th); /** * Declare intention to delete a key/value from an index. 
@@ -1782,6 +1793,14 @@ struct dt_device { struct list_head dd_txn_callbacks; unsigned int dd_record_fid_accessed:1, dd_rdonly:1; + + /* sysfs and debugfs handling */ + struct dentry *dd_debugfs_entry; + + const struct attribute **dd_def_attrs; + struct kobject dd_kobj; + struct kobj_type dd_ktype; + struct completion dd_kobj_unregister; }; int dt_device_init(struct dt_device *dev, struct lu_device_type *t); @@ -1900,7 +1919,9 @@ struct thandle { th_wait_submit:1, /* complex transaction which will track updates on all targets, * including OSTs */ - th_complex:1; + th_complex:1, + /* whether ignore quota */ + th_ignore_quota:1; }; /** @@ -2380,13 +2401,14 @@ static inline int dt_ref_del(const struct lu_env *env, static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, struct niobuf_remote *rnb, - struct niobuf_local *lnb, enum dt_bufs_type rw) + struct niobuf_local *lnb, int maxlnb, + enum dt_bufs_type rw) { LASSERT(d); LASSERT(d->do_body_ops); LASSERT(d->do_body_ops->dbo_bufs_get); return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, - rnb->rnb_len, lnb, rw); + rnb->rnb_len, lnb, maxlnb, rw); } static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, @@ -2450,12 +2472,12 @@ static inline int dt_declare_write(const struct lu_env *env, static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *th, int rq) + struct thandle *th) { LASSERT(dt); LASSERT(dt->do_body_ops); LASSERT(dt->do_body_ops->dbo_write); - return dt->do_body_ops->dbo_write(env, dt, buf, pos, th, rq); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th); } static inline int dt_declare_punch(const struct lu_env *env, @@ -2525,6 +2547,16 @@ static inline void dt_conf_get(const struct lu_env *env, return dev->dd_ops->dt_conf_get(env, dev, param); } +static inline struct super_block *dt_mnt_sb_get(const struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + if (dev->dd_ops->dt_mnt_sb_get) + return dev->dd_ops->dt_mnt_sb_get(dev); + + return ERR_PTR(-EOPNOTSUPP); +} + static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) { LASSERT(dev); @@ -2558,11 +2590,10 @@ static inline int dt_declare_insert(const struct lu_env *env, } static inline int dt_insert(const struct lu_env *env, - struct dt_object *dt, - const struct dt_rec *rec, - const struct dt_key *key, - struct thandle *th, - int noquota) + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) { LASSERT(dt); LASSERT(dt->do_index_ops); @@ -2571,7 +2602,7 @@ static inline int dt_insert(const struct lu_env *env, if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) return cfs_fail_err; - return dt->do_index_ops->dio_insert(env, dt, rec, key, th, noquota); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th); } static inline int dt_declare_xattr_del(const struct lu_env *env, @@ -2747,26 +2778,24 @@ static inline int dt_lookup(const struct lu_env *env, static inline int dt_declare_layout_change(const struct lu_env *env, struct dt_object *o, - struct layout_intent *layout, - const struct lu_buf *buf, + struct md_layout_change *mlc, struct thandle *th) { LASSERT(o); LASSERT(o->do_ops); LASSERT(o->do_ops->do_declare_layout_change); - return o->do_ops->do_declare_layout_change(env, o, layout, buf, th); + return o->do_ops->do_declare_layout_change(env, o, mlc, th); } static inline int dt_layout_change(const struct lu_env *env, struct dt_object *o, - struct layout_intent *layout, - const 
struct lu_buf *buf, + struct md_layout_change *mlc, struct thandle *th) { LASSERT(o); LASSERT(o->do_ops); LASSERT(o->do_ops->do_layout_change); - return o->do_ops->do_layout_change(env, o, layout, buf, th); + return o->do_ops->do_layout_change(env, o, mlc, th); } struct dt_find_hint { @@ -2815,6 +2844,9 @@ static inline struct dt_thread_info *dt_info(const struct lu_env *env) int dt_global_init(void); void dt_global_fini(void); +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list); +int dt_tunables_fini(struct dt_device *dt); # ifdef CONFIG_PROC_FS int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h index a0b8d022c1a5b..6fe62bce3bcb3 100644 --- a/drivers/staging/lustrefsx/lustre/include/llog_swab.h +++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h @@ -48,7 +48,7 @@ #ifndef _LLOG_SWAB_H_ #define _LLOG_SWAB_H_ -#include +#include struct lustre_cfg; void lustre_swab_lu_fid(struct lu_fid *fid); diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index a9d6342f1b6c3..85b66b3af7126 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ #include #include -#include +#include /* * Liuux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 @@ -58,18 +58,43 @@ struct lprocfs_vars { const char *name; const struct proc_ops *fops; void *data; - /* /proc file mode. */ + /** /proc file mode. */ mode_t proc_mode; }; +/** Provide a debugfs container */ struct ldebugfs_vars { const char *name; const struct file_operations *fops; void *data; - /* debugfs file mode. */ + /** debugfs file mode. */ mode_t proc_mode; }; +static inline unsigned int pct(unsigned long a, unsigned long b) +{ + return b ? a * 100 / b : 0; +} + +#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(port, flag) \ + do { \ + if ((port)->port##_##flag) { \ + seq_printf(m, "%s" #flag, first ? 
"" : ", "); \ + first = false; \ + } \ + } while (0) + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep); +void obd_connect_data_seqprint(struct seq_file *m, + struct obd_connect_data *ocd); + /* if we find more consumers this could be generalized */ #define OBD_HIST_MAX 32 struct obd_histogram { @@ -349,28 +374,29 @@ enum { #define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR enum lprocfs_extra_opc { - LDLM_GLIMPSE_ENQUEUE = 0, - LDLM_PLAIN_ENQUEUE, - LDLM_EXTENT_ENQUEUE, - LDLM_FLOCK_ENQUEUE, - LDLM_IBITS_ENQUEUE, - MDS_REINT_SETATTR, - MDS_REINT_CREATE, - MDS_REINT_LINK, - MDS_REINT_UNLINK, - MDS_REINT_RENAME, - MDS_REINT_OPEN, - MDS_REINT_SETXATTR, - BRW_READ_BYTES, - BRW_WRITE_BYTES, - EXTRA_LAST_OPC + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + MDS_REINT_RESYNC, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC }; #define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE /* class_obd.c */ extern struct proc_dir_entry *proc_lustre_root; extern struct dentry *debugfs_lustre_root; -extern struct kobject *lustre_kobj; +extern struct kset *lustre_kset; struct obd_device; struct obd_histogram; @@ -387,7 +413,7 @@ struct obd_job_stats { struct list_head ojs_list; /* list of job_stat structs */ rwlock_t ojs_lock; /* protect ojs_list/js_list */ unsigned int ojs_cleanup_interval;/* seconds before expiry */ - time_t ojs_last_cleanup; /* previous cleanup time */ + time64_t ojs_last_cleanup; /* previous cleanup time */ cntr_init_callback ojs_cntr_init_fn;/* lprocfs_stats initializer */ unsigned short ojs_cntr_num; /* number of stats in struct */ bool ojs_cleaning; /* currently expiring stats */ @@ -463,13 +489,9 @@ extern struct lprocfs_stats * lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); extern void lprocfs_clear_stats(struct lprocfs_stats *stats); extern void lprocfs_free_stats(struct lprocfs_stats **stats); -extern void lprocfs_init_ops_stats(int num_private_stats, - struct lprocfs_stats *stats); -extern void lprocfs_init_mps_stats(int num_private_stats, - struct lprocfs_stats *stats); extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_private_stats); + unsigned int num_stats); extern int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats); extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, @@ -484,10 +506,14 @@ extern int lprocfs_add_clear_entry(struct obd_device * obd, #ifdef HAVE_SERVER_SUPPORT extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); extern int lprocfs_exp_cleanup(struct obd_export *exp); +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...); #else static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } #endif +struct dentry *ldebugfs_add_simple(struct dentry *root, char *name, void *data, + const struct file_operations *fops); extern struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, void *data, const struct proc_ops *ops); @@ -504,11 +530,12 @@ extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); extern int ldebugfs_register_stats(struct dentry *parent, const char *name, struct lprocfs_stats *stats); extern int 
lprocfs_register_stats(struct proc_dir_entry *root, const char *name, - struct lprocfs_stats *stats); + struct lprocfs_stats *stats); +extern const struct file_operations ldebugfs_stats_seq_fops; /* lprocfs_status.c */ extern int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, - void *data); + void *data); extern int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *var, void *data); @@ -546,44 +573,32 @@ static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) { return 0; } #endif -extern int lprocfs_obd_setup(struct obd_device *dev); + +extern int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only); extern int lprocfs_obd_cleanup(struct obd_device *obd); #ifdef HAVE_SERVER_SUPPORT extern const struct file_operations lprocfs_evict_client_fops; #endif -extern int ldebugfs_seq_create(struct dentry *parent, const char *name, - umode_t mode, - const struct file_operations *seq_fops, - void *data); +int ldebugfs_seq_create(struct dentry *parent, const char *name, umode_t mode, + const struct file_operations *seq_fops, void *data); extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, mode_t mode, const struct proc_ops *seq_fops, void *data); -extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, +extern int lprocfs_obd_seq_create(struct obd_device *obd, const char *name, mode_t mode, const struct proc_ops *seq_fops, void *data); /* Generic callbacks */ -extern int lprocfs_u64_seq_show(struct seq_file *m, void *data); -extern int lprocfs_atomic_seq_show(struct seq_file *m, void *data); -extern ssize_t lprocfs_atomic_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -extern int lprocfs_uint_seq_show(struct seq_file *m, void *data); -extern ssize_t lprocfs_uint_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -extern int lprocfs_wr_uint(struct file *file, const char __user *buffer, - unsigned long count, void *data); extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); -extern int lprocfs_name_seq_show(struct seq_file *m, void *data); extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); -extern int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data); +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf); extern int lprocfs_import_seq_show(struct seq_file *m, void *data); extern int lprocfs_state_seq_show(struct seq_file *m, void *data); extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); #ifdef HAVE_SERVER_SUPPORT -extern int lprocfs_num_exports_seq_show(struct seq_file *m, void *data); +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf); #endif struct adaptive_timeout; extern int lprocfs_at_hist_helper(struct seq_file *m, @@ -597,32 +612,27 @@ extern ssize_t lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); #endif +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer); + extern ssize_t -lprocfs_ping_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -extern ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +static inline ssize_t lprocfs_import_seq_write(struct file *file, 
const char __user *buffer, - size_t count, loff_t *off); + size_t count, loff_t *off) +{ + return ldebugfs_import_seq_write(file, buffer, count, off); +} + extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); extern ssize_t lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); -/* Statfs helpers */ -extern int lprocfs_blksize_seq_show(struct seq_file *m, void *data); -extern int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data); -extern int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data); -extern int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data); -extern int lprocfs_filestotal_seq_show(struct seq_file *m, void *data); -extern int lprocfs_filesfree_seq_show(struct seq_file *m, void *data); - -extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); -extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, - long val, int mult); -extern int lprocfs_str_to_s64(struct file *, const char __user *buffer, - unsigned long count, __s64 *val); -extern int lprocfs_str_with_units_to_s64(struct file *, - const char __user *buffer, +extern int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit); @@ -645,10 +655,10 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); int lprocfs_hash_seq_show(struct seq_file *m, void *data); /* lprocfs_status.c: IR factor */ -int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data); -ssize_t -lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); #endif /* lprocfs_status.c: dump pages on cksum error */ @@ -673,10 +683,75 @@ extern int lprocfs_seq_release(struct inode *, struct file *); #define LPROCFS_CLIMP_EXIT(obd) \ up_read(&(obd)->u.cli.cl_sem); +/* write the name##_seq_show function, call LDEBUGFS_SEQ_FOPS_RO for read-only + * debugfs entries; otherwise, you will define name##_seq_write function also + * for a read-write debugfs entry, and then call LDEBUGFS_SEQ_FOPS instead. 
+ * Finally, call ldebugfs_seq_create(obd, filename, 0444, &name#_fops, data); + */ +#define __LDEBUGFS_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, inode->i_private); \ +} \ +static const struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +#define LDEBUGFS_SEQ_FOPS_RO(name) __LDEBUGFS_SEQ_FOPS(name, NULL) +#define LDEBUGFS_SEQ_FOPS(name) __LDEBUGFS_SEQ_FOPS(name, \ + name##_seq_write) + +#define LDEBUGFS_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LDEBUGFS_SEQ_FOPS_RO(name##_##type) + +#define LDEBUGFS_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + seq->private); \ + } \ + LDEBUGFS_SEQ_FOPS(name##_##type); + +#define LDEBUGFS_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + off); \ + } \ + static int name##_##type##_open(struct inode *inode, \ + struct file *file) \ + { \ + return single_open(file, NULL, inode->i_private); \ + } \ + static const struct file_operations name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = single_release, \ + }; + /* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only - proc entries; otherwise, you will define name##_seq_write function also for - a read-write proc entry, and then call LPROC_SEQ_SEQ instead. Finally, - call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ + * proc entries; otherwise, you will define name##_seq_write function also for + * a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally, + * call ldebugfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); + */ #define __LPROC_SEQ_FOPS(name, custom_seq_write) \ static int name##_single_open(struct inode *inode, struct file *file) \ { \ @@ -687,7 +762,8 @@ static int name##_single_open(struct inode *inode, struct file *file) \ return rc; \ \ return single_open(file, name##_seq_show, \ - inode->i_private ? : PDE_DATA(inode)); \ + inode->i_private ? 
inode->i_private : \ + PDE_DATA(inode)); \ } \ static const struct proc_ops name##_fops = { \ PROC_OWNER(THIS_MODULE) \ @@ -719,11 +795,11 @@ static const struct proc_ops name##_fops = { \ { \ struct seq_file *seq = file->private_data; \ return lprocfs_##type##_seq_write(file, buffer, \ - count, seq->private); \ + count, seq->private); \ } \ LPROC_SEQ_FOPS(name##_##type); -#define LPROC_SEQ_FOPS_WO_TYPE(name, type) \ +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ static ssize_t name##_##type##_write(struct file *file, \ const char __user *buffer, size_t count, \ loff_t *off) \ @@ -733,7 +809,8 @@ static const struct proc_ops name##_fops = { \ static int name##_##type##_open(struct inode *inode, struct file *file)\ { \ return single_open(file, NULL, \ - inode->i_private ? : PDE_DATA(inode));\ + inode->i_private ? inode->i_private : \ + PDE_DATA(inode)); \ } \ static const struct proc_ops name##_##type##_fops = { \ .proc_open = name##_##type##_open, \ @@ -749,22 +826,10 @@ struct lustre_attr { const char *buf, size_t len); }; -/* - * Hacks to get around set_fs removal. - */ -void lprocfs_file_set_kernel(struct file *file); -bool lprocfs_file_is_kernel(struct file *file); - -/* - * Version of copy_from_user() that uses the above hacks to determine - * whether it's dealing with user or kernel space. - */ -unsigned long lprocfs_copy_from_user(struct file *file, void *to, - const void __user *from, unsigned long n); - #define LUSTRE_ATTR(name, mode, show, store) \ static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) +#define LUSTRE_WO_ATTR(name) LUSTRE_ATTR(name, 0200, NULL, name##_store) #define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) #define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) @@ -786,33 +851,43 @@ int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, void lprocfs_job_stats_fini(struct obd_device *obd); int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, cntr_init_callback fn); -int lprocfs_job_interval_seq_show(struct seq_file *m, void *data); -ssize_t -lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -/* lproc_status.c */ -int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data); -ssize_t lprocfs_recovery_time_soft_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data); -ssize_t -lprocfs_recovery_time_hard_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -int lprocfs_target_instance_seq_show(struct seq_file *m, void *data); +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +/* lproc_status_server.c */ +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf); #endif +/* lproc_status.c */ int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); ssize_t 
lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); +int lprocfs_obd_short_io_bytes_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_short_io_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); struct root_squash_info; -int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, - unsigned long count, +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, struct root_squash_info *squash, char *name); -int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, - unsigned long count, +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, struct root_squash_info *squash, char *name); #else /* !CONFIG_PROC_FS */ @@ -852,16 +927,10 @@ static inline int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, struct lprocfs_stats *stats) { return 0; } -static inline void lprocfs_init_ops_stats(int num_private_stats, - struct lprocfs_stats *stats) -{ return; } -static inline void lprocfs_init_mps_stats(int num_private_stats, - struct lprocfs_stats *stats) -{ return; } static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { return; } static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_private_stats) + unsigned int num_stats) { return 0; } static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats) @@ -910,18 +979,14 @@ static inline void lprocfs_remove(struct proc_dir_entry **root) static inline void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) { return; } -static inline int lprocfs_obd_setup(struct obd_device *dev) +static inline int lprocfs_obd_setup(struct obd_device *dev, bool uuid_only) { return 0; } static inline int lprocfs_obd_cleanup(struct obd_device *dev) { return 0; } static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) { return 0; } -static inline int lprocfs_name_seq_show(struct seq_file *m, void *data) -{ return 0; } static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) { return 0; } -static inline int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) -{ return 0; } static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) { return 0; } static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) @@ -953,6 +1018,10 @@ lprocfs_ping_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { return 0; } static inline ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t lprocfs_import_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { return 0; } @@ -1008,7 +1077,7 @@ u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, #define LPROC_SEQ_FOPS(name) #define LPROC_SEQ_FOPS_RO_TYPE(name, type) #define LPROC_SEQ_FOPS_RW_TYPE(name, type) -#define LPROC_SEQ_FOPS_WO_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) /* lprocfs_jobstats.c */ static inline diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h index ae5bb3dde4c82..c75d3115fdff5 100644 --- 
a/drivers/staging/lustrefsx/lustre/include/lu_object.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include @@ -426,26 +426,8 @@ struct lu_attr { __u32 la_rdev; /** project id */ __u32 la_projid; -}; - -/** Bit-mask of valid attributes */ -enum la_valid { - LA_ATIME = 1 << 0, - LA_MTIME = 1 << 1, - LA_CTIME = 1 << 2, - LA_SIZE = 1 << 3, - LA_MODE = 1 << 4, - LA_UID = 1 << 5, - LA_GID = 1 << 6, - LA_BLOCKS = 1 << 7, - LA_TYPE = 1 << 8, - LA_FLAGS = 1 << 9, - LA_NLINK = 1 << 10, - LA_RDEV = 1 << 11, - LA_BLKSIZE = 1 << 12, - LA_KILL_SUID = 1 << 13, - LA_KILL_SGID = 1 << 14, - LA_PROJID = 1 << 15, + /** set layout version to OST objects. */ + __u32 la_layout_version; }; /** @@ -484,17 +466,23 @@ enum lu_object_header_flags { /** * Mark this object has already been taken out of cache. */ - LU_OBJECT_UNHASHED = 1, + LU_OBJECT_UNHASHED = 1, + /** + * Object is initialized, when object is found in cache, it may not be + * intialized yet, the object allocator will initialize it. + */ + LU_OBJECT_INITED = 2 }; enum lu_object_header_attr { - LOHA_EXISTS = 1 << 0, - LOHA_REMOTE = 1 << 1, - /** - * UNIX file type is stored in S_IFMT bits. - */ - LOHA_FT_START = 001 << 12, /**< S_IFIFO */ - LOHA_FT_END = 017 << 12, /**< S_IFMT */ + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + LOHA_HAS_AGENT_ENTRY = 1 << 2, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ }; /** @@ -548,31 +536,6 @@ struct lu_object_header { struct fld; -struct lu_site_bkt_data { - /** - * number of object in this bucket on the lsb_lru list. - */ - long lsb_lru_len; - /** - * LRU list, updated on each access to object. Protected by - * bucket lock of lu_site::ls_obj_hash. - * - * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are - * moved to the lu_site::ls_lru.prev (this is due to the non-existence - * of list_for_each_entry_safe_reverse()). - */ - struct list_head lsb_lru; - /** - * Wait-queue signaled when an object in this site is ultimately - * destroyed (lu_object_free()). It is used by lu_object_find() to - * wait before re-trying when object in the process of destruction is - * found in the hash table. - * - * \see htable_lookup(). - */ - wait_queue_head_t lsb_marche_funebre; -}; - enum { LU_SS_CREATED = 0, LU_SS_CACHE_HIT, @@ -643,14 +606,8 @@ struct lu_site { struct percpu_counter ls_lru_len_counter; }; -static inline struct lu_site_bkt_data * -lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) -{ - struct cfs_hash_bd bd; - - cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); - return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); -} +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) { @@ -715,6 +672,14 @@ static inline int lu_object_is_dying(const struct lu_object_header *h) return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); } +/** + * Return true if object is initialized. 
+ */ +static inline int lu_object_is_inited(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_INITED, &h->loh_flags); +} + void lu_object_put(const struct lu_env *env, struct lu_object *o); void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); void lu_object_unhash(const struct lu_env *env, struct lu_object *o); @@ -844,6 +809,22 @@ int lu_object_invariant(const struct lu_object *o); */ #define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) +/** + * Check whether the object as agent entry on current target + */ +#define lu_object_has_agent_entry(o) \ + unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY) + +static inline void lu_object_set_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY; +} + +static inline void lu_object_clear_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY; +} + static inline int lu_object_assert_exists(const struct lu_object *o) { return lu_object_exists(o); @@ -860,7 +841,8 @@ static inline int lu_object_assert_not_exists(const struct lu_object *o) static inline __u32 lu_object_attr(const struct lu_object *o) { LASSERT(lu_object_exists(o) != 0); - return o->lo_header->loh_attr; + + return o->lo_header->loh_attr & S_IFMT; } static inline void lu_object_ref_add(struct lu_object *o, @@ -907,7 +889,9 @@ struct lu_rdpg { enum lu_xattr_flags { LU_XATTR_REPLACE = (1 << 0), - LU_XATTR_CREATE = (1 << 1) + LU_XATTR_CREATE = (1 << 1), + LU_XATTR_MERGE = (1 << 2), + LU_XATTR_SPLIT = (1 << 3), }; /** @} helpers */ @@ -1129,20 +1113,20 @@ struct lu_context_key { }; #define LU_KEY_INIT(mod, type) \ - static void* mod##_key_init(const struct lu_context *ctx, \ - struct lu_context_key *key) \ - { \ - type *value; \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ \ CLASSERT(PAGE_SIZE >= sizeof(*value)); \ \ - OBD_ALLOC_PTR(value); \ - if (value == NULL) \ - value = ERR_PTR(-ENOMEM); \ - \ - return value; \ - } \ - struct __##mod##__dummy_init {;} /* semicolon catcher */ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init { ; } /* semicolon catcher */ #define LU_KEY_FINI(mod, type) \ static void mod##_key_fini(const struct lu_context *ctx, \ @@ -1278,6 +1262,37 @@ void lu_env_fini (struct lu_env *env); int lu_env_refill(struct lu_env *env); int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); +static inline void* lu_env_info(const struct lu_env *env, + const struct lu_context_key *key) +{ + void *info; + info = lu_context_key_get(&env->le_ctx, key); + if (!info) { + if (!lu_env_refill((struct lu_env *)env)) + info = lu_context_key_get(&env->le_ctx, key); + } + LASSERT(info); + return info; +} + +#ifdef HAVE_SERVER_SUPPORT +struct lu_env *lu_env_find(void); +int lu_env_add(struct lu_env *env); +void lu_env_remove(struct lu_env *env); +#else +static inline struct lu_env *lu_env_find(void) +{ + return NULL; +} +static inline int lu_env_add(struct lu_env *env) +{ + return 0; +} +static inline void lu_env_remove(struct lu_env *env) +{ +} +#endif /* HAVE_SERVER_SUPPORT */ + /** @} lu_context */ /** @@ -1294,6 +1309,26 @@ struct lu_name { int ln_namelen; }; +static inline bool name_is_dot_or_dotdot(const char *name, int namelen) +{ + return name[0] == '.' 
&& + (namelen == 1 || (namelen == 2 && name[1] == '.')); +} + +static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) +{ + return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); +} + +static inline bool lu_name_is_valid_len(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + /** * Validate names (path components) * @@ -1305,12 +1340,7 @@ struct lu_name { */ static inline bool lu_name_is_valid_2(const char *name, size_t name_len) { - return name != NULL && - name_len > 0 && - name_len < INT_MAX && - name[name_len] == '\0' && - strlen(name) == name_len && - memchr(name, '/', name_len) == NULL; + return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; } static inline bool lu_name_is_valid(const struct lu_name *ln) diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h index 0d3ef968923ad..0810fbea8b55e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lu_target.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -126,14 +126,17 @@ struct tg_grants_data { u64 tgd_tot_granted; /* grant used by I/Os in progress (between prepare and commit) */ u64 tgd_tot_pending; + /* amount of available space in percentage that is never used for + * grants, used on MDT to always keep space for metadata. */ + u64 tgd_reserved_pcnt; /* number of clients using grants */ int tgd_tot_granted_clients; /* shall we grant space to clients not * supporting OBD_CONNECT_GRANT_PARAM? */ - int tgd_grant_compat_disable; + unsigned int tgd_grant_compat_disable:1; /* protect all statfs-related counters */ spinlock_t tgd_osfs_lock; - __u64 tgd_osfs_age; + time64_t tgd_osfs_age; int tgd_blockbits; /* counters used during statfs update, protected by ofd_osfs_lock. 
* record when some statfs refresh are in progress */ @@ -201,8 +204,18 @@ struct lu_target { /* target grants fields */ struct tg_grants_data lut_tgd; + + /* target tunables */ + const struct attribute **lut_attrs; + + /* FMD (file modification data) values */ + int lut_fmd_max_num; + time64_t lut_fmd_max_age; }; +#define LUT_FMD_MAX_NUM_DEFAULT 128 +#define LUT_FMD_MAX_AGE_DEFAULT (obd_timeout + 10) + /* number of slots in reply bitmap */ #define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) #define LUT_REPLY_SLOTS_MAX_CHUNKS 16 @@ -356,7 +369,7 @@ struct tgt_handler { /* Flags in enum tgt_handler_flags */ __u32 th_flags; /* Request version for this opcode */ - int th_version; + enum lustre_msg_version th_version; /* Handler function */ int (*th_act)(struct tgt_session_info *tsi); /* Handler function for high priority requests */ @@ -409,8 +422,6 @@ int tgt_convert(struct tgt_session_info *tsi); int tgt_bl_callback(struct tgt_session_info *tsi); int tgt_cp_callback(struct tgt_session_info *tsi); int tgt_llog_open(struct tgt_session_info *tsi); -int tgt_llog_close(struct tgt_session_info *tsi); -int tgt_llog_destroy(struct tgt_session_info *tsi); int tgt_llog_read_header(struct tgt_session_info *tsi); int tgt_llog_next_block(struct tgt_session_info *tsi); int tgt_llog_prev_block(struct tgt_session_info *tsi); @@ -426,15 +437,13 @@ int tgt_sync(const struct lu_env *env, struct lu_target *tgt, int tgt_io_thread_init(struct ptlrpc_thread *thread); void tgt_io_thread_done(struct ptlrpc_thread *thread); -int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - __u64 start, __u64 end, struct lustre_handle *lh, - int mode, __u64 *flags); +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags); +void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags); void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode); -int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - struct obd_ioobj *obj, struct niobuf_remote *nb, - struct lustre_handle *lh, enum ldlm_mode mode); -void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, - struct lustre_handle *lh, enum ldlm_mode mode); int tgt_brw_read(struct tgt_session_info *tsi); int tgt_brw_write(struct tgt_session_info *tsi); int tgt_hpreq_handler(struct ptlrpc_request *req); @@ -494,6 +503,8 @@ int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, struct thandle *th, bool update_lrd_file); struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, __u64 xid); +int tgt_tunables_init(struct lu_target *lut); +void tgt_tunables_fini(struct lu_target *lut); /* target/tgt_grant.c */ static inline int exp_grant_param_supp(struct obd_export *exp) @@ -521,8 +532,36 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr); int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, - struct obd_statfs *osfs, __u64 max_age, + struct obd_statfs *osfs, time64_t max_age, int *from_cache); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + 
const char *buffer, size_t count); +#endif +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); + +/* FMD */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +#ifdef DO_FMD_DROP +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid); +#else +#define tgt_fmd_drop(exp, fid) do {} while (0) +#endif /* target/update_trans.c */ int distribute_txn_init(const struct lu_env *env, diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h index 6f57a20a6a8ab..e5466c7886238 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -38,36 +38,12 @@ * Author: Andreas Dilger */ -#ifndef _LUSTRE_FIEMAP_H -#define _LUSTRE_FIEMAP_H - -#include -#include - -/* XXX: We use fiemap_extent::fe_reserved[0] */ -#define fe_device fe_reserved[0] - -static inline size_t fiemap_count_to_size(size_t extent_count) -{ - return sizeof(struct fiemap) + extent_count * - sizeof(struct fiemap_extent); -} - -static inline unsigned fiemap_size_to_count(size_t array_size) -{ - return (array_size - sizeof(struct fiemap)) / - sizeof(struct fiemap_extent); -} - -#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ - -#ifdef FIEMAP_FLAGS_COMPAT -#undef FIEMAP_FLAGS_COMPAT -#endif +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_fiemap.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ -/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ -#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ -#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. - * Sets NO_DIRECT flag */ +#include -#endif /* _LUSTRE_FIEMAP_H */ +#warning "Including ll_fiemap.h is deprecated. Include linux/lustre/lustre_fiemap.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h index e69bdc2795e56..f8489d55a3b44 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. 
- + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * lustre/include/lustre/lustre_barrier_user.h * @@ -28,46 +28,13 @@ * * Author: Fan, Yong */ -#ifndef _LUSTRE_BARRIER_USER_H -# define _LUSTRE_BARRIER_USER_H - -#include -#define BARRIER_VERSION_V1 1 -#define BARRIER_TIMEOUT_DEFAULT 30 - -enum barrier_commands { - BC_FREEZE = 1, - BC_THAW = 2, - BC_STAT = 3, - BC_RESCAN = 4, -}; - -enum barrier_status { - BS_INIT = 0, - BS_FREEZING_P1 = 1, - BS_FREEZING_P2 = 2, - BS_FROZEN = 3, - BS_THAWING = 4, - BS_THAWED = 5, - BS_FAILED = 6, - BS_EXPIRED = 7, - BS_RESCAN = 8, -}; +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_barrier_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ -struct barrier_ctl { - __u32 bc_version; - __u32 bc_cmd; - union { - __s32 bc_timeout; - __u32 bc_total; - }; - union { - __u32 bc_status; - __u32 bc_absence; - }; - char bc_name[12]; - __u32 bc_padding; -}; +#include -#endif /* _LUSTRE_BARRIER_USER_H */ +#warning "Including lustre_barrier_user.h is deprecated. Include linux/lustre/lustre_barrier_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h index a02f65fa08aef..7b84426fa2750 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. - + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * lustre/include/lustre/lustre_lfsck_user.h @@ -30,207 +30,11 @@ * Author: Fan, Yong */ -#ifndef _LUSTRE_LFSCK_USER_H -# define _LUSTRE_LFSCK_USER_H -# include - -/** - * state machine: - * - * LS_INIT - * | - * (lfsck|start) - * | - * v - * LS_SCANNING_PHASE1 - * | ^ - * | : - * | (lfsck:restart) - * | : - * v : - * ----------------------------------------------------------------- - * | |^ |^ |^ |^ |^ - * | |: |: |: |: |: - * v v: v: v: v: v: - * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL - * (CO_) (CO_) (CO_) - * | ^ ^: ^: ^: ^: ^: - * | : |: |: |: |: |: - * | (lfsck:restart) |: |: |: |: |: - * v : |v |v |v |v |v - * ----------------------------------------------------------------- - * | - * v - * LS_COMPLETED +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_lfsck_user.h + * directly instead of this file. 
This file will be removed from a + * future version of lustre! */ -enum lfsck_status { - /* The lfsck file is new created, for new MDT, upgrading from old disk, - * or re-creating the lfsck file manually. */ - LS_INIT = 0, - - /* The first-step system scanning. The checked items during the phase1 - * scanning depends on the LFSCK type. */ - LS_SCANNING_PHASE1 = 1, - - /* The second-step system scanning. The checked items during the phase2 - * scanning depends on the LFSCK type. */ - LS_SCANNING_PHASE2 = 2, - - /* The LFSCK processing has completed for all objects. */ - LS_COMPLETED = 3, - - /* The LFSCK exited automatically for failure, will not auto restart. */ - LS_FAILED = 4, - - /* The LFSCK is stopped manually, will not auto restart. */ - LS_STOPPED = 5, - - /* LFSCK is paused automatically when umount, - * will be restarted automatically when remount. */ - LS_PAUSED = 6, - - /* System crashed during the LFSCK, - * will be restarted automatically after recovery. */ - LS_CRASHED = 7, - - /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ - LS_PARTIAL = 8, - - /* The LFSCK is failed because its controller is failed. */ - LS_CO_FAILED = 9, - - /* The LFSCK is stopped because its controller is stopped. */ - LS_CO_STOPPED = 10, - - /* The LFSCK is paused because its controller is paused. */ - LS_CO_PAUSED = 11, - - LS_MAX -}; - -static inline const char *lfsck_status2name(int status) -{ - static const char * const lfsck_status_names[] = { - [LS_INIT] = "init", - [LS_SCANNING_PHASE1] = "scanning-phase1", - [LS_SCANNING_PHASE2] = "scanning-phase2", - [LS_COMPLETED] = "completed", - [LS_FAILED] = "failed", - [LS_STOPPED] = "stopped", - [LS_PAUSED] = "paused", - [LS_CRASHED] = "crashed", - [LS_PARTIAL] = "partial", - [LS_CO_FAILED] = "co-failed", - [LS_CO_STOPPED] = "co-stopped", - [LS_CO_PAUSED] = "co-paused" - }; - - if (status < 0 || status >= LS_MAX) - return "unknown"; - - return lfsck_status_names[status]; -} - -enum lfsck_param_flags { - /* Reset LFSCK iterator position to the device beginning. */ - LPF_RESET = 0x0001, - - /* Exit when fail. */ - LPF_FAILOUT = 0x0002, - - /* Dryrun mode, only check without modification */ - LPF_DRYRUN = 0x0004, - - /* LFSCK runs on all targets. */ - LPF_ALL_TGT = 0x0008, - - /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ - LPF_BROADCAST = 0x0010, - - /* Handle orphan OST-objects. */ - LPF_OST_ORPHAN = 0x0020, - - /* Create OST-object for dangling LOV EA. */ - LPF_CREATE_OSTOBJ = 0x0040, - - /* Create MDT-object for dangling name entry. */ - LPF_CREATE_MDTOBJ = 0x0080, - - /* Do not return until the LFSCK not running. */ - LPF_WAIT = 0x0100, - - /* Delay to create OST-object for dangling LOV EA. */ - LPF_DELAY_CREATE_OSTOBJ = 0x0200, -}; - -enum lfsck_type { - /* For MDT and OST internal OSD consistency check/repair. */ - LFSCK_TYPE_SCRUB = 0x0000, - - /* For MDT-OST (layout, object) consistency check/repair. */ - LFSCK_TYPE_LAYOUT = 0x0001, - - /* For MDT (FID-in-dirent, linkEA) consistency check/repair. 
*/ - LFSCK_TYPE_NAMESPACE = 0x0004, - LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | - LFSCK_TYPE_NAMESPACE), - LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, - LFSCK_TYPES_ALL = ((__u16)(~0)) -}; - -#define LFSCK_VERSION_V1 1 -#define LFSCK_VERSION_V2 2 - -#define LFSCK_SPEED_NO_LIMIT 0 -#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT -#define LFSCK_ASYNC_WIN_DEFAULT 1024 -#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) -#define LFSCK_TYPE_BITS 16 - -enum lfsck_start_valid { - LSV_SPEED_LIMIT = 0x00000001, - LSV_ERROR_HANDLE = 0x00000002, - LSV_DRYRUN = 0x00000004, - LSV_ASYNC_WINDOWS = 0x00000008, - LSV_CREATE_OSTOBJ = 0x00000010, - LSV_CREATE_MDTOBJ = 0x00000020, - LSV_DELAY_CREATE_OSTOBJ = 0x00000040, -}; - -/* Arguments for starting lfsck. */ -struct lfsck_start { - /* Which arguments are valid, see 'enum lfsck_start_valid'. */ - __u32 ls_valid; - - /* How many items can be scanned at most per second. */ - __u32 ls_speed_limit; - - /* For compatibility between user space tools and kernel service. */ - __u16 ls_version; - - /* Which LFSCK components to be (have been) started. */ - __u16 ls_active; - - /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ - __u16 ls_flags; - - /* The windows size for async requests pipeline. */ - __u16 ls_async_windows; -}; - -struct lfsck_stop { - __u32 ls_status; - __u16 ls_flags; - __u16 ls_padding_1; /* For 64-bits aligned. */ - __u64 ls_padding_2; -}; - -struct lfsck_query { - __u16 lu_types; - __u16 lu_flags; - __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; - __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; - __u64 lu_repaired[LFSCK_TYPE_BITS]; -}; -#endif /* _LUSTRE_LFSCK_USER_H */ +#include +#warning "Including lustre_lfsck_user.h is deprecated. Include linux/lustre/lustre_lfsck_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h index 67ed9768fcb2f..9d8f5ebefa569 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,1638 +34,15 @@ * Lustre public user-space interface definitions. */ -#ifndef _LUSTRE_USER_H -#define _LUSTRE_USER_H - -/** \defgroup lustreuser lustreuser - * - * @{ - */ - -#include - -#ifdef __KERNEL__ -# include -# include -# include /* snprintf() */ -# include -#else /* !__KERNEL__ */ -# include -# include -# include /* snprintf() */ -# include -# define NEED_QUOTA_DEFS -/* # include - this causes complaints about caddr_t */ -# include -#endif /* __KERNEL__ */ -#include - /* - * This is a temporary solution of adding quota type. - * Should be removed as soon as system header is updated. 
- */ -#undef LL_MAXQUOTAS -#define LL_MAXQUOTAS 3 -#undef INITQFNAMES -#define INITQFNAMES { \ - "user", /* USRQUOTA */ \ - "group", /* GRPQUOTA */ \ - "project", /* PRJQUOTA */ \ - "undefined", \ -}; -#ifndef USRQUOTA -#define USRQUOTA 0 -#endif -#ifndef GRPQUOTA -#define GRPQUOTA 1 -#endif -#ifndef PRJQUOTA -#define PRJQUOTA 2 -#endif - -#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ - defined(__craynv) || defined(__mips64__) || defined(__powerpc64__) || \ - defined(__aarch64__) -typedef struct stat lstat_t; -# define lstat_f lstat -# define fstat_f fstat -# define fstatat_f fstatat -# define HAVE_LOV_USER_MDS_DATA -#elif defined(__USE_LARGEFILE64) || defined(__KERNEL__) -typedef struct stat64 lstat_t; -# define lstat_f lstat64 -# define fstat_f fstat64 -# define fstatat_f fstatat64 -# define HAVE_LOV_USER_MDS_DATA -#endif - -#define LUSTRE_EOF 0xffffffffffffffffULL - -/* for statfs() */ -#define LL_SUPER_MAGIC 0x0BD00BD0 - -#ifndef FSFILT_IOC_GETFLAGS -#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) -#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) -#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) -#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) -#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) -#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) -#endif - -/* FIEMAP flags supported by Lustre */ -#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) - -enum obd_statfs_state { - OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ - OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ - OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ - OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ - OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ -}; - -struct obd_statfs { - __u64 os_type; - __u64 os_blocks; - __u64 os_bfree; - __u64 os_bavail; - __u64 os_files; - __u64 os_ffree; - __u8 os_fsid[40]; - __u32 os_bsize; - __u32 os_namelen; - __u64 os_maxbytes; - __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ - __u32 os_fprecreated; /* objs available now to the caller */ - /* used in QoS code to find preferred - * OSTs */ - __u32 os_spare2; - __u32 os_spare3; - __u32 os_spare4; - __u32 os_spare5; - __u32 os_spare6; - __u32 os_spare7; - __u32 os_spare8; - __u32 os_spare9; -}; - -/** - * File IDentifier. - * - * FID is a cluster-wide unique identifier of a file or an object (stripe). - * FIDs are never reused. - **/ -struct lu_fid { - /** - * FID sequence. Sequence is a unit of migration: all files (objects) - * with FIDs from a given sequence are stored on the same server. - * Lustre should support 2^64 objects, so even if each sequence - * has only a single object we can still enumerate 2^64 objects. - **/ - __u64 f_seq; - /* FID number within sequence. */ - __u32 f_oid; - /** - * FID version, used to distinguish different versions (in the sense - * of snapshots, etc.) of the same file system object. Not currently - * used. - **/ - __u32 f_ver; -}; - -static inline bool fid_is_zero(const struct lu_fid *fid) -{ - return fid->f_seq == 0 && fid->f_oid == 0; -} - -/* Currently, the filter_fid::ff_parent::f_ver is not the real parent - * MDT-object's FID::f_ver, instead it is the OST-object index in its - * parent MDT-object's layout EA. 
*/ -#define f_stripe_idx f_ver - -struct ost_layout { - __u32 ol_stripe_size; - __u32 ol_stripe_count; - __u64 ol_comp_start; - __u64 ol_comp_end; - __u32 ol_comp_id; -} __attribute__((packed)); - -/* The filter_fid structure has changed several times over its lifetime. - * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and - * stripe_index and the "self FID" (objid/seq) to be able to recover the - * OST objects in case of corruption. With the move to 2.4 and OSD-API for - * the OST, the "trusted.lma" xattr was added to the OST objects to store - * the "self FID" to be consistent with the MDT on-disk format, and the - * filter_fid only stored the MDT inode parent FID and stripe index. - * - * In 2.10, the addition of PFL composite layouts required more information - * to be stored into the filter_fid in order to be able to identify which - * component the OST object belonged. As well, the stripe size may vary - * between components, so it was no longer safe to assume the stripe size - * or stripe_count of a file. This is also more robust for plain layouts. - * - * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not - * enough space to store both the filter_fid and LMA in the inode, so they - * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid - * an extra seek for every OST object access. - * - * In 2.11, FLR mirror layouts also need to store the layout version and - * range so that writes to old versions of the layout are not allowed. - * That ensures that mirrored objects are not modified by evicted clients, - * and ensures that the components are correctly marked stale on the MDT. - */ -struct filter_fid_18_23 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - __u64 ff_objid; - __u64 ff_seq; -}; - -struct filter_fid_24_29 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ -}; - -struct filter_fid_210 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - struct ost_layout ff_layout; -}; - -struct filter_fid { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - struct ost_layout ff_layout; - __u32 ff_layout_version; - __u32 ff_range; /* range of layout version that - * write are allowed */ -} __attribute__((packed)); - -/* Userspace should treat lu_fid as opaque, and only use the following methods - * to print or parse them. Other functions (e.g. compare, swab) could be moved - * here from lustre_idl.h if needed. */ -typedef struct lu_fid lustre_fid; - -enum lma_compat { - LMAC_HSM = 0x00000001, -/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ - LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ - LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is - * under /O//d. */ - LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ - LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ -}; - -/** - * Masks for all features that should be supported by a Lustre version to - * access a specific file. - * This information is stored in lustre_mdt_attrs::lma_incompat. - */ -enum lma_incompat { - LMAI_RELEASED = 0x00000001, /* file is released */ - LMAI_AGENT = 0x00000002, /* agent inode */ - LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object - is on the remote MDT */ - LMAI_STRIPED = 0x00000008, /* striped directory inode */ - LMAI_ORPHAN = 0x00000010, /* inode is orphan */ - LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ - LMAI_STRIPED | LMAI_ORPHAN) -}; - - -/** - * Following struct for object attributes, that will be kept inode's EA. 
- * Introduced in 2.0 release (please see b15993, for details) - * Added to all objects since Lustre 2.4 as contains self FID - */ -struct lustre_mdt_attrs { - /** - * Bitfield for supported data in this structure. From enum lma_compat. - * lma_self_fid and lma_flags are always available. - */ - __u32 lma_compat; - /** - * Per-file incompat feature list. Lustre version should support all - * flags set in this field. The supported feature mask is available in - * LMA_INCOMPAT_SUPP. - */ - __u32 lma_incompat; - /** FID of this inode */ - struct lu_fid lma_self_fid; -}; - -struct lustre_ost_attrs { - /* Use lustre_mdt_attrs directly for now, need a common header - * structure if want to change lustre_mdt_attrs in future. */ - struct lustre_mdt_attrs loa_lma; - - /* Below five elements are for OST-object's PFID EA, the - * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) - * and the stripe_index (low 16 bits), the size should not exceed - * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag - * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size - * are valid; if the flag LMAC_COMP_INFO is set, then the next three - * loa_comp_* elements are valid. */ - struct lu_fid loa_parent_fid; - __u32 loa_stripe_size; - __u32 loa_comp_id; - __u64 loa_comp_start; - __u64 loa_comp_end; -}; - -/** - * Prior to 2.4, the LMA structure also included SOM attributes which has since - * been moved to a dedicated xattr - * lma_flags was also removed because of lma_compat/incompat fields. - */ -#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) - -/** - * OST object IDentifier. + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! 
*/ -struct ost_id { - union { - struct { - __u64 oi_id; - __u64 oi_seq; - } oi; - struct lu_fid oi_fid; - }; -}; - -#define DOSTID "%#llx:%llu" -#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ - ((unsigned long long)ostid_id(oi)) -struct ll_futimes_3 { - __u64 lfu_atime_sec; - __u64 lfu_atime_nsec; - __u64 lfu_mtime_sec; - __u64 lfu_mtime_nsec; - __u64 lfu_ctime_sec; - __u64 lfu_ctime_nsec; -}; +#include -/* - * The ioctl naming rules: - * LL_* - works on the currently opened filehandle instead of parent dir - * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) - * *_MDC_* - gets/sets data related to MDC - * *_LOV_* - gets/sets data related to OSC/LOV - * *FILE* - called on parent dir and passes in a filename - * *STRIPE* - set/get lov_user_md - * *INFO - set/get lov_user_mds_data - */ -/* lustre_ioctl.h 101-150 */ -#define LL_IOC_GETFLAGS _IOR ('f', 151, long) -#define LL_IOC_SETFLAGS _IOW ('f', 152, long) -#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) -#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) -#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) -#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) -#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) -#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) -/* LL_IOC_RECREATE_OBJ 157 obsolete */ -/* LL_IOC_RECREATE_FID 157 obsolete */ -#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) -#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) -/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ -/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ -/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ -#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) -/* IOC_LOV_GETINFO 165 obsolete */ -#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) -/* LL_IOC_RMTACL 167 obsolete */ -#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) -#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) -#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) -#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) -#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) -#define LL_IOC_PATH2FID _IOR ('f', 173, long) -#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) -#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) -#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) -/* lustre_ioctl.h 177-210 */ -#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) -#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) -#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) -#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) -#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) -#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) -#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) -#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) -#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ - struct lustre_swap_layouts) -#define LL_IOC_HSM_ACTION _IOR('f', 220, \ - struct hsm_current_action) -/* lustre_ioctl.h 221-232 */ -#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) -#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) -#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) -#define LL_IOC_SET_LEASE _IOWR('f', 243, long) -#define LL_IOC_GET_LEASE _IO('f', 244) -#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) -#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) -#define LL_IOC_MIGRATE _IOR('f', 247, int) -#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) -#define 
LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) -#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) - -#ifndef FS_IOC_FSGETXATTR -/* - * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +/* Disable warning until 2.16 or 3.0, until new header is widely available. + * This gives apps time to move to the new header without spurious warnings. +#warning "Including lustre/lustre_user.h is deprecated. Include linux/lustre/lustre_user.h instead." */ -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) -#endif -#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define LL_PROJINHERIT_FL 0x20000000 - - -/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ -enum ll_lease_type { - LL_LEASE_RDLCK = 0x1, - LL_LEASE_WRLCK = 0x2, - LL_LEASE_UNLCK = 0x4, -}; - -#define LL_STATFS_LMV 1 -#define LL_STATFS_LOV 2 -#define LL_STATFS_NODELAY 4 - -#define IOC_MDC_TYPE 'i' -#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) -#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) -#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) -#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) - -#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ - -/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular - * files, but are unlikely to be used in practice and are not harmful if - * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character - * devices and are safe for use on new files. See LU-4209. */ -/* To be compatible with old statically linked binary we keep the check for - * the older 0100000000 flag. This is already removed upstream. LU-812. 
*/ -#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ -#ifndef FASYNC -#define FASYNC 00020000 /* fcntl, for BSD compatibility */ -#endif -#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) -#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ - O_LOV_DELAY_CREATE_MASK) - -#define LL_FILE_IGNORE_LOCK 0x00000001 -#define LL_FILE_GROUP_LOCKED 0x00000002 -#define LL_FILE_READAHEA 0x00000004 -#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ -#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ - -#define LOV_USER_MAGIC_V1 0x0BD10BD0 -#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 -#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 -#define LOV_USER_MAGIC_V3 0x0BD30BD0 -/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ -#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ -#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 - -#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ -#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic */ - -#define LOV_PATTERN_NONE 0x000 -#define LOV_PATTERN_RAID0 0x001 -#define LOV_PATTERN_RAID1 0x002 -#define LOV_PATTERN_FIRST 0x100 -#define LOV_PATTERN_CMOBD 0x200 - -#define LOV_PATTERN_F_MASK 0xffff0000 -#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ -#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ -#define LOV_PATTERN_DEFAULT 0xffffffff - -static inline bool lov_pattern_supported(__u32 pattern) -{ - return pattern == LOV_PATTERN_RAID0 || - pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED); -} - -#define LOV_MAXPOOLNAME 15 -#define LOV_POOLNAMEF "%.15s" - -#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ -#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) -#define LOV_MAX_STRIPE_COUNT_OLD 160 -/* This calculation is crafted so that input of 4096 will result in 160 - * which in turn is equal to old maximal stripe count. - * XXX: In fact this is too simpified for now, what it also need is to get - * ea_type argument to clearly know how much space each stripe consumes. - * - * The limit of 12 pages is somewhat arbitrary, but is a reasonably large - * allocation that is sufficient for the current generation of systems. - * - * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ -#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ -#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ - -#define XATTR_LUSTRE_PREFIX "lustre." 
-#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" - -#define lov_user_ost_data lov_user_ost_data_v1 -struct lov_user_ost_data_v1 { /* per-stripe data structure */ - struct ost_id l_ost_oi; /* OST object ID */ - __u32 l_ost_gen; /* generation of this OST index */ - __u32 l_ost_idx; /* OST index in LOV */ -} __attribute__((packed)); - -#define lov_user_md lov_user_md_v1 -struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading */ - }; - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed, __may_alias__)); - -struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading */ - }; - char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed)); - -struct lu_extent { - __u64 e_start; - __u64 e_end; -}; - -#define DEXT "[ %#llx , %#llx )" -#define PEXT(ext) (ext)->e_start, (ext)->e_end - -static inline bool lu_extent_is_overlapped(struct lu_extent *e1, - struct lu_extent *e2) -{ - return e1->e_start < e2->e_end && e2->e_start < e1->e_end; -} - -enum lov_comp_md_entry_flags { - LCME_FL_PRIMARY = 0x00000001, /* Not used */ - LCME_FL_STALE = 0x00000002, /* Not used */ - LCME_FL_OFFLINE = 0x00000004, /* Not used */ - LCME_FL_PREFERRED = 0x00000008, /* Not used */ - LCME_FL_INIT = 0x00000010, /* instantiated */ - LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, - won't be stored on disk */ -}; - -#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT) - -/* lcme_id can be specified as certain flags, and the the first - * bit of lcme_id is used to indicate that the ID is representing - * certain LCME_FL_* but not a real ID. Which implies we can have - * at most 31 flags (see LCME_FL_XXX). 
*/ -enum lcme_id { - LCME_ID_INVAL = 0x0, - LCME_ID_MAX = 0x7FFFFFFF, - LCME_ID_ALL = 0xFFFFFFFF, - LCME_ID_NOT_ID = LCME_FL_NEG -}; - -#define LCME_ID_MASK LCME_ID_MAX - -struct lov_comp_md_entry_v1 { - __u32 lcme_id; /* unique id of component */ - __u32 lcme_flags; /* LCME_FL_XXX */ - struct lu_extent lcme_extent; /* file extent for component */ - __u32 lcme_offset; /* offset of component blob, - start from lov_comp_md_v1 */ - __u32 lcme_size; /* size of component blob */ - __u64 lcme_padding[2]; -} __attribute__((packed)); - -enum lov_comp_md_flags; - -struct lov_comp_md_v1 { - __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ - __u32 lcm_size; /* overall size including this struct */ - __u32 lcm_layout_gen; - __u16 lcm_flags; - __u16 lcm_entry_count; - __u64 lcm_padding1; - __u64 lcm_padding2; - struct lov_comp_md_entry_v1 lcm_entries[0]; -} __attribute__((packed)); - -static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) -{ - if (stripes == (__u16)-1) - stripes = 0; - - if (lmm_magic == LOV_USER_MAGIC_V1) - return sizeof(struct lov_user_md_v1) + - stripes * sizeof(struct lov_user_ost_data_v1); - return sizeof(struct lov_user_md_v3) + - stripes * sizeof(struct lov_user_ost_data_v1); -} - -/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to - * use this. It is unsafe to #define those values in this header as it - * is possible the application has already #included . */ -#ifdef HAVE_LOV_USER_MDS_DATA -#define lov_user_mds_data lov_user_mds_data_v1 -struct lov_user_mds_data_v1 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ -} __attribute__((packed)); - -struct lov_user_mds_data_v3 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ -} __attribute__((packed)); -#endif - -struct lmv_user_mds_data { - struct lu_fid lum_fid; - __u32 lum_padding; - __u32 lum_mds; -}; - -enum lmv_hash_type { - LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ - LMV_HASH_TYPE_ALL_CHARS = 1, - LMV_HASH_TYPE_FNV_1A_64 = 2, - LMV_HASH_TYPE_MAX, -}; - -#define LMV_HASH_NAME_ALL_CHARS "all_char" -#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" - -extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; - -/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, - * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ -#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define lmv_user_md lmv_user_md_v1 -struct lmv_user_md_v1 { - __u32 lum_magic; /* must be the first field */ - __u32 lum_stripe_count; /* dirstripe count */ - __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ - __u32 lum_hash_type; /* Dir stripe policy */ - __u32 lum_type; /* LMV type: default or normal */ - __u32 lum_padding1; - __u32 lum_padding2; - __u32 lum_padding3; - char lum_pool_name[LOV_MAXPOOLNAME + 1]; - struct lmv_user_mds_data lum_objects[0]; -} __attribute__((packed)); - -static inline int lmv_user_md_size(int stripes, int lmm_magic) -{ - return sizeof(struct lmv_user_md) + - stripes * sizeof(struct lmv_user_mds_data); -} - -struct ll_recreate_obj { - __u64 lrc_id; - __u32 lrc_ost_idx; -}; - -struct ll_fid { - __u64 id; /* holds object id */ - __u32 generation; /* holds object generation */ - __u32 f_type; /* holds object type or stripe idx when passing it to - * OST for saving into EA. 
*/ -}; - -#define UUID_MAX 40 -struct obd_uuid { - char uuid[UUID_MAX]; -}; - -static inline bool obd_uuid_equals(const struct obd_uuid *u1, - const struct obd_uuid *u2) -{ - return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; -} - -static inline int obd_uuid_empty(struct obd_uuid *uuid) -{ - return uuid->uuid[0] == '\0'; -} - -static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) -{ - strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); - uuid->uuid[sizeof(*uuid) - 1] = '\0'; -} - -/* For printf's only, make sure uuid is terminated */ -static inline char *obd_uuid2str(const struct obd_uuid *uuid) -{ - if (uuid == NULL) - return NULL; - - if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { - /* Obviously not safe, but for printfs, no real harm done... - we're always null-terminated, even in a race. */ - static char temp[sizeof(*uuid)]; - memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); - temp[sizeof(*uuid) - 1] = '\0'; - return temp; - } - return (char *)(uuid->uuid); -} - -#define LUSTRE_MAXFSNAME 8 - -/* Extract fsname from uuid (or target name) of a target - e.g. (myfs-OST0007_UUID -> myfs) - see also deuuidify. */ -static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) -{ - char *p; - - strncpy(buf, uuid, buflen - 1); - buf[buflen - 1] = '\0'; - p = strrchr(buf, '-'); - if (p != NULL) - *p = '\0'; -} - -/* printf display format for Lustre FIDs - * usage: printf("file FID is "DFID"\n", PFID(fid)); */ -#define FID_NOBRACE_LEN 40 -#define FID_LEN (FID_NOBRACE_LEN + 2) -#define DFID_NOBRACE "%#llx:0x%x:0x%x" -#define DFID "["DFID_NOBRACE"]" -#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver - -/* scanf input parse format for fids in DFID_NOBRACE format - * Need to strip '[' from DFID format first or use "["SFID"]" at caller. 
- * usage: sscanf(fidstr, SFID, RFID(&fid)); */ -#define SFID "0x%llx:0x%x:0x%x" -#define RFID(fid) &((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) - -/********* Quotas **********/ - -#define LUSTRE_QUOTABLOCK_BITS 10 -#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) - -static inline __u64 lustre_stoqb(size_t space) -{ - return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; -} - -#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ -#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ -#define Q_GETOINFO 0x800102 /* get obd quota info */ -#define Q_GETOQUOTA 0x800103 /* get obd quotas */ -#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ - -/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ -#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ -#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ -#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ -#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ -#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ -#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ -/* lustre-specific control commands */ -#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ -#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ - -#define ALLQUOTA 255 /* set all quota */ -static inline char *qtype_name(int qtype) -{ - switch (qtype) { - case USRQUOTA: - return "usr"; - case GRPQUOTA: - return "grp"; - case PRJQUOTA: - return "prj"; - } - return "unknown"; -} - -#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 - -/* permission */ -#define N_PERMS_MAX 64 - -struct perm_downcall_data { - __u64 pdd_nid; - __u32 pdd_perm; - __u32 pdd_padding; -}; - -struct identity_downcall_data { - __u32 idd_magic; - __u32 idd_err; - __u32 idd_uid; - __u32 idd_gid; - __u32 idd_nperms; - __u32 idd_ngroups; - struct perm_downcall_data idd_perms[N_PERMS_MAX]; - __u32 idd_groups[0]; -}; - -#ifdef NEED_QUOTA_DEFS -#ifndef QIF_BLIMITS -#define QIF_BLIMITS 1 -#define QIF_SPACE 2 -#define QIF_ILIMITS 4 -#define QIF_INODES 8 -#define QIF_BTIME 16 -#define QIF_ITIME 32 -#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) -#define QIF_USAGE (QIF_SPACE | QIF_INODES) -#define QIF_TIMES (QIF_BTIME | QIF_ITIME) -#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) -#endif - -#endif /* !__KERNEL__ */ - -/* lustre volatile file support - * file name header: .^L^S^T^R:volatile" - */ -#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" -#define LUSTRE_VOLATILE_HDR_LEN 14 - -typedef enum lustre_quota_version { - LUSTRE_QUOTA_V2 = 1 -} lustre_quota_version_t; - -/* XXX: same as if_dqinfo struct in kernel */ -struct obd_dqinfo { - __u64 dqi_bgrace; - __u64 dqi_igrace; - __u32 dqi_flags; - __u32 dqi_valid; -}; - -/* XXX: same as if_dqblk struct in kernel, plus one padding */ -struct obd_dqblk { - __u64 dqb_bhardlimit; - __u64 dqb_bsoftlimit; - __u64 dqb_curspace; - __u64 dqb_ihardlimit; - __u64 dqb_isoftlimit; - __u64 dqb_curinodes; - __u64 dqb_btime; - __u64 dqb_itime; - __u32 dqb_valid; - __u32 dqb_padding; -}; - -enum { - QC_GENERAL = 0, - QC_MDTIDX = 1, - QC_OSTIDX = 2, - QC_UUID = 3 -}; - -struct if_quotactl { - __u32 qc_cmd; - __u32 qc_type; - __u32 qc_id; - __u32 qc_stat; - __u32 qc_valid; - __u32 qc_idx; - struct obd_dqinfo qc_dqinfo; - struct obd_dqblk qc_dqblk; - char obd_type[16]; - struct obd_uuid obd_uuid; -}; - -/* swap layout flags */ -#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) -#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) -#define 
SWAP_LAYOUTS_KEEP_MTIME (1 << 2) -#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) -#define SWAP_LAYOUTS_CLOSE (1 << 4) - -/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ -#define SWAP_LAYOUTS_MDS_HSM (1 << 31) -struct lustre_swap_layouts { - __u64 sl_flags; - __u32 sl_fd; - __u32 sl_gid; - __u64 sl_dv1; - __u64 sl_dv2; -}; - - -/********* Changelogs **********/ -/** Changelog record types */ -enum changelog_rec_type { - CL_MARK = 0, - CL_CREATE = 1, /* namespace */ - CL_MKDIR = 2, /* namespace */ - CL_HARDLINK = 3, /* namespace */ - CL_SOFTLINK = 4, /* namespace */ - CL_MKNOD = 5, /* namespace */ - CL_UNLINK = 6, /* namespace */ - CL_RMDIR = 7, /* namespace */ - CL_RENAME = 8, /* namespace */ - CL_EXT = 9, /* namespace extended record (2nd half of rename) */ - CL_OPEN = 10, /* not currently used */ - CL_CLOSE = 11, /* may be written to log only with mtime change */ - CL_LAYOUT = 12, /* file layout/striping modified */ - CL_TRUNC = 13, - CL_SETATTR = 14, - CL_XATTR = 15, - CL_HSM = 16, /* HSM specific events, see flags */ - CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ - CL_CTIME = 18, - CL_ATIME = 19, - CL_MIGRATE = 20, - CL_LAST -}; - -static inline const char *changelog_type2str(int type) { - static const char *changelog_str[] = { - "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", - "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", - "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT" - }; - - if (type >= 0 && type < CL_LAST) - return changelog_str[type]; - return NULL; -} - -/* per-record flags */ -#define CLF_FLAGSHIFT 12 -#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) -#define CLF_VERMASK (~CLF_FLAGMASK) -enum changelog_rec_flags { - CLF_VERSION = 0x1000, - CLF_RENAME = 0x2000, - CLF_JOBID = 0x4000, - CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID -}; - - -/* Anything under the flagmask may be per-type (if desired) */ -/* Flags for unlink */ -#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ -#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ - /* HSM cleaning needed */ -/* Flags for rename */ -#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink - * of target */ -#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target - * has an archive in backend */ - -/* Flags for HSM */ -/* 12b used (from high weight to low weight): - * 2b for flags - * 3b for event - * 7b for error code - */ -#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ -#define CLF_HSM_ERR_H 6 -#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ -#define CLF_HSM_EVENT_H 9 -#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ -#define CLF_HSM_FLAG_H 11 -#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ -#define CLF_HSM_SPARE_H 15 -#define CLF_HSM_LAST 15 - -/* Remove bits higher than _h, then extract the value - * between _h and _l by shifting lower weigth to bit 0. 
*/ -#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ - >> (CLF_HSM_LAST - _h + _l)) - -#define CLF_HSM_SUCCESS 0x00 -#define CLF_HSM_MAXERROR 0x7E -#define CLF_HSM_ERROVERFLOW 0x7F - -#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ - -/* 3 bits field => 8 values allowed */ -enum hsm_event { - HE_ARCHIVE = 0, - HE_RESTORE = 1, - HE_CANCEL = 2, - HE_RELEASE = 3, - HE_REMOVE = 4, - HE_STATE = 5, - HE_SPARE1 = 6, - HE_SPARE2 = 7, -}; - -static inline enum hsm_event hsm_get_cl_event(__u16 flags) -{ - return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, - CLF_HSM_EVENT_L); -} - -static inline void hsm_set_cl_event(int *flags, enum hsm_event he) -{ - *flags |= (he << CLF_HSM_EVENT_L); -} - -static inline __u16 hsm_get_cl_flags(int flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); -} - -static inline void hsm_set_cl_flags(int *flags, int bits) -{ - *flags |= (bits << CLF_HSM_FLAG_L); -} - -static inline int hsm_get_cl_error(int flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); -} - -static inline void hsm_set_cl_error(int *flags, int error) -{ - *flags |= (error << CLF_HSM_ERR_L); -} - -enum changelog_send_flag { - /* Not yet implemented */ - CHANGELOG_FLAG_FOLLOW = 0x01, - /* Blocking IO makes sense in case of slow user parsing of the records, - * but it also prevents us from cleaning up if the records are not - * consumed. */ - CHANGELOG_FLAG_BLOCK = 0x02, - /* Pack jobid into the changelog records if available. */ - CHANGELOG_FLAG_JOBID = 0x04, -}; - -#define CR_MAXSIZE cfs_size_round(2 * NAME_MAX + 2 + \ - changelog_rec_offset(CLF_SUPPORTED)) - -/* 31 usable bytes string + null terminator. */ -#define LUSTRE_JOBID_SIZE 32 - -/* This is the minimal changelog record. It can contain extensions - * such as rename fields or process jobid. Its exact content is described - * by the cr_flags. - * - * Extensions are packed in the same order as their corresponding flags. - */ -struct changelog_rec { - __u16 cr_namelen; - __u16 cr_flags; /**< \a changelog_rec_flags */ - __u32 cr_type; /**< \a changelog_rec_type */ - __u64 cr_index; /**< changelog record number */ - __u64 cr_prev; /**< last index for this target fid */ - __u64 cr_time; - union { - lustre_fid cr_tfid; /**< target fid */ - __u32 cr_markerflags; /**< CL_MARK flags */ - }; - lustre_fid cr_pfid; /**< parent fid */ -}; - -/* Changelog extension for RENAME. */ -struct changelog_ext_rename { - lustre_fid cr_sfid; /**< source fid, or zero */ - lustre_fid cr_spfid; /**< source parent fid, or zero */ -}; - -/* Changelog extension to include JOBID. */ -struct changelog_ext_jobid { - char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. 
*/ -}; - - -static inline size_t changelog_rec_offset(enum changelog_rec_flags crf) -{ - size_t size = sizeof(struct changelog_rec); - - if (crf & CLF_RENAME) - size += sizeof(struct changelog_ext_rename); - - if (crf & CLF_JOBID) - size += sizeof(struct changelog_ext_jobid); - - return size; -} - -static inline size_t changelog_rec_size(const struct changelog_rec *rec) -{ - return changelog_rec_offset(rec->cr_flags); -} - -static inline size_t changelog_rec_varsize(const struct changelog_rec *rec) -{ - return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; -} - -static inline -struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; - - return (struct changelog_ext_rename *)((char *)rec + - changelog_rec_offset(crf)); -} - -/* The jobid follows the rename extension, if present */ -static inline -struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME); - - return (struct changelog_ext_jobid *)((char *)rec + - changelog_rec_offset(crf)); -} - -/* The name follows the rename and jobid extensions, if present */ -static inline char *changelog_rec_name(const struct changelog_rec *rec) -{ - return (char *)rec + changelog_rec_offset(rec->cr_flags & - CLF_SUPPORTED); -} - -static inline size_t changelog_rec_snamelen(const struct changelog_rec *rec) -{ - return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; -} - -static inline char *changelog_rec_sname(const struct changelog_rec *rec) -{ - char *cr_name = changelog_rec_name(rec); - - return cr_name + strlen(cr_name) + 1; -} - -/** - * Remap a record to the desired format as specified by the crf flags. - * The record must be big enough to contain the final remapped version. - * Superfluous extension fields are removed and missing ones are added - * and zeroed. The flags of the record are updated accordingly. - * - * The jobid and rename extensions can be added to a record, to match the - * format an application expects, typically. In this case, the newly added - * fields will be zeroed. - * The Jobid field can be removed, to guarantee compatibility with older - * clients that don't expect this field in the records they process. - * - * The following assumptions are being made: - * - CLF_RENAME will not be removed - * - CLF_JOBID will not be added without CLF_RENAME being added too - * - * @param[in,out] rec The record to remap. - * @param[in] crf_wanted Flags describing the desired extensions. 
- */ -static inline void changelog_remap_rec(struct changelog_rec *rec, - enum changelog_rec_flags crf_wanted) -{ - char *jid_mov; - char *rnm_mov; - - crf_wanted &= CLF_SUPPORTED; - - if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) - return; - - /* First move the variable-length name field */ - memmove((char *)rec + changelog_rec_offset(crf_wanted), - changelog_rec_name(rec), rec->cr_namelen); - - /* Locations of jobid and rename extensions in the remapped record */ - jid_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~CLF_JOBID); - rnm_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~(CLF_JOBID | CLF_RENAME)); - - /* Move the extension fields to the desired positions */ - if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) - memmove(jid_mov, changelog_rec_jobid(rec), - sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) - memmove(rnm_mov, changelog_rec_rename(rec), - sizeof(struct changelog_ext_rename)); - - /* Clear newly added fields */ - if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) - memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) - memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); - - /* Update the record's flags accordingly */ - rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; -} - -enum changelog_message_type { - CL_RECORD = 10, /* message is a changelog_rec */ - CL_EOF = 11, /* at end of current changelog */ -}; - -/********* Misc **********/ - -struct ioc_data_version { - __u64 idv_version; - __u64 idv_flags; /* See LL_DV_xxx */ -}; -#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */ -#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */ - -#ifndef offsetof -#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define dot_lustre_name ".lustre" - - -/********* HSM **********/ - -/** HSM per-file state - * See HSM_FLAGS below. - */ -enum hsm_states { - HS_NONE = 0x00000000, - HS_EXISTS = 0x00000001, - HS_DIRTY = 0x00000002, - HS_RELEASED = 0x00000004, - HS_ARCHIVED = 0x00000008, - HS_NORELEASE = 0x00000010, - HS_NOARCHIVE = 0x00000020, - HS_LOST = 0x00000040, -}; - -/* HSM user-setable flags. */ -#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) - -/* Other HSM flags. */ -#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) - -/* - * All HSM-related possible flags that could be applied to a file. - * This should be kept in sync with hsm_states. - */ -#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) - -/** - * HSM request progress state - */ -enum hsm_progress_states { - HPS_WAITING = 1, - HPS_RUNNING = 2, - HPS_DONE = 3, -}; -#define HPS_NONE 0 - -static inline const char *hsm_progress_state2name(enum hsm_progress_states s) -{ - switch (s) { - case HPS_WAITING: return "waiting"; - case HPS_RUNNING: return "running"; - case HPS_DONE: return "done"; - default: return "unknown"; - } -} - -struct hsm_extent { - __u64 offset; - __u64 length; -} __attribute__((packed)); - -/** - * Current HSM states of a Lustre file. - * - * This structure purpose is to be sent to user-space mainly. It describes the - * current HSM flags and in-progress action. - */ -struct hsm_user_state { - /** Current HSM states, from enum hsm_states. 
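/*
 * Illustrative sketch, not part of this patch: remapping a record so it
 * carries both the rename and jobid extensions before it is handed to a
 * parser that expects the full format. The destination buffer must be at
 * least CR_MAXSIZE bytes; the names below are hypothetical and memcpy()
 * is assumed from <string.h>.
 */
static inline struct changelog_rec *
example_expand_rec(const struct changelog_rec *src, char *buf)
{
	struct changelog_rec *rec = (struct changelog_rec *)buf;

	/* copy the whole record (fixed part, extensions and name) first */
	memcpy(rec, src, changelog_rec_size(src) + src->cr_namelen);
	changelog_remap_rec(rec, CLF_VERSION | CLF_RENAME | CLF_JOBID);

	return rec;
}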
*/ - __u32 hus_states; - __u32 hus_archive_id; - /** The current undergoing action, if there is one */ - __u32 hus_in_progress_state; - __u32 hus_in_progress_action; - struct hsm_extent hus_in_progress_location; - char hus_extended_info[]; -}; - -struct hsm_state_set_ioc { - struct lu_fid hssi_fid; - __u64 hssi_setmask; - __u64 hssi_clearmask; -}; - -/* - * This structure describes the current in-progress action for a file. - * it is retuned to user space and send over the wire - */ -struct hsm_current_action { - /** The current undergoing action, if there is one */ - /* state is one of hsm_progress_states */ - __u32 hca_state; - /* action is one of hsm_user_action */ - __u32 hca_action; - struct hsm_extent hca_location; -}; - -/***** HSM user requests ******/ -/* User-generated (lfs/ioctl) request types */ -enum hsm_user_action { - HUA_NONE = 1, /* no action (noop) */ - HUA_ARCHIVE = 10, /* copy to hsm */ - HUA_RESTORE = 11, /* prestage */ - HUA_RELEASE = 12, /* drop ost objects */ - HUA_REMOVE = 13, /* remove from archive */ - HUA_CANCEL = 14 /* cancel a request */ -}; - -static inline const char *hsm_user_action2name(enum hsm_user_action a) -{ - switch (a) { - case HUA_NONE: return "NOOP"; - case HUA_ARCHIVE: return "ARCHIVE"; - case HUA_RESTORE: return "RESTORE"; - case HUA_RELEASE: return "RELEASE"; - case HUA_REMOVE: return "REMOVE"; - case HUA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* - * List of hr_flags (bit field) - */ -#define HSM_FORCE_ACTION 0x0001 -/* used by CT, cannot be set by user */ -#define HSM_GHOST_COPY 0x0002 - -/** - * Contains all the fixed part of struct hsm_user_request. - * - */ -struct hsm_request { - __u32 hr_action; /* enum hsm_user_action */ - __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ - __u64 hr_flags; /* request flags */ - __u32 hr_itemcount; /* item count in hur_user_item vector */ - __u32 hr_data_len; -}; - -struct hsm_user_item { - lustre_fid hui_fid; - struct hsm_extent hui_extent; -} __attribute__((packed)); - -struct hsm_user_request { - struct hsm_request hur_request; - struct hsm_user_item hur_user_item[0]; - /* extra data blob at end of struct (after all - * hur_user_items), only use helpers to access it - */ -} __attribute__((packed)); - -/** Return pointer to data field in a hsm user request */ -static inline void *hur_data(struct hsm_user_request *hur) -{ - return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); -} - -/** - * Compute the current length of the provided hsm_user_request. This returns -1 - * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] - * - * return -1 on bounds check error. 
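/*
 * Illustrative sketch, not part of this patch: sizing and filling a
 * one-item HSM archive request by hand, as described by the hur_data()
 * and hur_len() helpers. The function name is hypothetical and calloc()
 * is assumed from <stdlib.h>.
 */
static inline struct hsm_user_request *
example_build_archive_request(const lustre_fid *fid, __u32 archive_id)
{
	struct hsm_user_request *hur;
	size_t len = sizeof(*hur) + sizeof(hur->hur_user_item[0]);

	hur = calloc(1, len);
	if (hur == NULL)
		return NULL;

	hur->hur_request.hr_action = HUA_ARCHIVE;
	hur->hur_request.hr_archive_id = archive_id;
	hur->hur_request.hr_itemcount = 1;
	hur->hur_request.hr_data_len = 0;
	hur->hur_user_item[0].hui_fid = *fid;
	/* hur_len(hur) now equals len; hur_data(hur) points past the items */

	return hur;
}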
- */ -static inline ssize_t hur_len(struct hsm_user_request *hur) -{ - __u64 size; - - /* can't overflow a __u64 since hr_itemcount is only __u32 */ - size = offsetof(struct hsm_user_request, hur_user_item[0]) + - (__u64)hur->hur_request.hr_itemcount * - sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; - - if (size != (ssize_t)size) - return -1; - - return size; -} - -/****** HSM RPCs to copytool *****/ -/* Message types the copytool may receive */ -enum hsm_message_type { - HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ -}; - -/* Actions the copytool may be instructed to take for a given action_item */ -enum hsm_copytool_action { - HSMA_NONE = 10, /* no action */ - HSMA_ARCHIVE = 20, /* arbitrary offset */ - HSMA_RESTORE = 21, - HSMA_REMOVE = 22, - HSMA_CANCEL = 23 -}; - -static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) -{ - switch (a) { - case HSMA_NONE: return "NOOP"; - case HSMA_ARCHIVE: return "ARCHIVE"; - case HSMA_RESTORE: return "RESTORE"; - case HSMA_REMOVE: return "REMOVE"; - case HSMA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* Copytool item action description */ -struct hsm_action_item { - __u32 hai_len; /* valid size of this struct */ - __u32 hai_action; /* hsm_copytool_action, but use known size */ - lustre_fid hai_fid; /* Lustre FID to operate on */ - lustre_fid hai_dfid; /* fid used for data access */ - struct hsm_extent hai_extent; /* byte range to operate on */ - __u64 hai_cookie; /* action cookie from coordinator */ - __u64 hai_gid; /* grouplock id */ - char hai_data[0]; /* variable length */ -} __attribute__((packed)); - -/** - * helper function which print in hexa the first bytes of - * hai opaque field - * - * \param hai [IN] record to print - * \param buffer [IN,OUT] buffer to write the hex string to - * \param len [IN] max buffer length - * - * \retval buffer - */ -static inline char *hai_dump_data_field(const struct hsm_action_item *hai, - char *buffer, size_t len) -{ - int i; - int data_len; - char *ptr; - - ptr = buffer; - data_len = hai->hai_len - sizeof(*hai); - for (i = 0; (i < data_len) && (len > 2); i++) { - snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); - ptr += 2; - len -= 2; - } - - *ptr = '\0'; - - return buffer; -} - -/* Copytool action list */ -#define HAL_VERSION 1 -#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ -struct hsm_action_list { - __u32 hal_version; - __u32 hal_count; /* number of hai's to follow */ - __u64 hal_compound_id; /* returned by coordinator */ - __u64 hal_flags; - __u32 hal_archive_id; /* which archive backend */ - __u32 padding1; - char hal_fsname[0]; /* null-terminated */ - /* struct hsm_action_item[hal_count] follows, aligned on 8-byte - boundaries. 
See hai_zero */ -} __attribute__((packed)); - -#ifndef HAVE_CFS_SIZE_ROUND -static inline int cfs_size_round (int val) -{ - return (val + 7) & (~0x7); -} -#define HAVE_CFS_SIZE_ROUND -#endif - -/* Return pointer to first hai in action list */ -static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) -{ - return (struct hsm_action_item *)(hal->hal_fsname + - cfs_size_round(strlen(hal-> \ - hal_fsname) - + 1)); -} -/* Return pointer to next hai */ -static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) -{ - return (struct hsm_action_item *)((char *)hai + - cfs_size_round(hai->hai_len)); -} - -/* Return size of an hsm_action_list */ -static inline size_t hal_size(struct hsm_action_list *hal) -{ - __u32 i; - size_t sz; - struct hsm_action_item *hai; - - sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); - hai = hai_first(hal); - for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) - sz += cfs_size_round(hai->hai_len); - - return sz; -} - -/* HSM file import - * describe the attributes to be set on imported file - */ -struct hsm_user_import { - __u64 hui_size; - __u64 hui_atime; - __u64 hui_mtime; - __u32 hui_atime_ns; - __u32 hui_mtime_ns; - __u32 hui_uid; - __u32 hui_gid; - __u32 hui_mode; - __u32 hui_archive_id; -}; - -/* Copytool progress reporting */ -#define HP_FLAG_COMPLETED 0x01 -#define HP_FLAG_RETRY 0x02 - -struct hsm_progress { - lustre_fid hp_fid; - __u64 hp_cookie; - struct hsm_extent hp_extent; - __u16 hp_flags; - __u16 hp_errval; /* positive val */ - __u32 padding; -}; - -struct hsm_copy { - __u64 hc_data_version; - __u16 hc_flags; - __u16 hc_errval; /* positive val */ - __u32 padding; - struct hsm_action_item hc_hai; -}; - -/* JSON objects */ -enum llapi_json_types { - LLAPI_JSON_INTEGER = 1, - LLAPI_JSON_BIGNUM, - LLAPI_JSON_REAL, - LLAPI_JSON_STRING -}; - -struct llapi_json_item { - char *lji_key; - __u32 lji_type; - union { - int lji_integer; - __u64 lji_u64; - double lji_real; - char *lji_string; - }; - struct llapi_json_item *lji_next; -}; - -struct llapi_json_item_list { - int ljil_item_count; - struct llapi_json_item *ljil_items; -}; - -enum lu_ladvise_type { - LU_LADVISE_INVALID = 0, - LU_LADVISE_WILLREAD = 1, - LU_LADVISE_DONTNEED = 2, -}; - -#define LU_LADVISE_NAMES { \ - [LU_LADVISE_WILLREAD] = "willread", \ - [LU_LADVISE_DONTNEED] = "dontneed", \ -} - -/* This is the userspace argument for ladvise. It is currently the same as - * what goes on the wire (struct lu_ladvise), but is defined separately as we - * may need info which is only used locally. */ -struct llapi_lu_ladvise { - __u16 lla_advice; /* advice type */ - __u16 lla_value1; /* values for different advice types */ - __u32 lla_value2; - __u64 lla_start; /* first byte of extent for advice */ - __u64 lla_end; /* last byte of extent for advice */ - __u32 lla_value3; - __u32 lla_value4; -}; - -enum ladvise_flag { - LF_ASYNC = 0x00000001, -}; - -#define LADVISE_MAGIC 0x1ADF1CE0 -#define LF_MASK LF_ASYNC - -/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which - * is used on the wire. It is defined separately as we may need info which is - * only used locally. 
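/*
 * Illustrative sketch, not part of this patch: how a copytool might walk
 * the items of a received hsm_action_list with hai_first()/hai_next().
 * The function and callback names are hypothetical.
 */
static inline void example_walk_hal(struct hsm_action_list *hal,
				    void (*cb)(const struct hsm_action_item *))
{
	struct hsm_action_item *hai = hai_first(hal);
	__u32 i;

	for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai))
		cb(hai);
}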
*/ -struct llapi_ladvise_hdr { - __u32 lah_magic; /* LADVISE_MAGIC */ - __u32 lah_count; /* number of advices */ - __u64 lah_flags; /* from enum ladvise_flag */ - __u32 lah_value1; /* unused */ - __u32 lah_value2; /* unused */ - __u64 lah_value3; /* unused */ - struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ -}; - -#define LAH_COUNT_MAX (1024) - -/* Shared key */ -enum sk_crypt_alg { - SK_CRYPT_INVALID = -1, - SK_CRYPT_EMPTY = 0, - SK_CRYPT_AES256_CTR = 1, - SK_CRYPT_MAX = 2, -}; - -enum sk_hmac_alg { - SK_HMAC_INVALID = -1, - SK_HMAC_EMPTY = 0, - SK_HMAC_SHA256 = 1, - SK_HMAC_SHA512 = 2, - SK_HMAC_MAX = 3, -}; - -struct sk_crypt_type { - char *sct_name; - size_t sct_bytes; -}; - -struct sk_hmac_type { - char *sht_name; - size_t sht_bytes; -}; - -/** @} lustreuser */ -#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h index 67df286a5c358..da1e166d9c39c 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,9 +38,17 @@ * @{ */ +#include #include #include -#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif #ifndef LL_MAXQUOTAS #define LL_MAXQUOTAS 3 @@ -50,8 +58,13 @@ #define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) #endif -extern bool liblustreapi_initialized; +#define lustre_fid struct lu_fid +/* Currently external applications can access this but in the + * future this will no longer be exposed for the user. Instead + * if you want to know if the library is initialized just call + * llapi_liblustreapi_initialized() which is now available. */ +extern bool liblustreapi_initialized; typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args); @@ -71,6 +84,10 @@ enum llapi_message_level { typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err, const char *fmt, va_list ap); +static inline bool llapi_liblustreapi_initialized(void) +{ + return liblustreapi_initialized; +} /* the bottom three bits reserved for llapi_message_level */ #define LLAPI_MSG_MASK 0x00000007 @@ -87,10 +104,11 @@ static inline const char *llapi_msg_level2str(enum llapi_message_level level) return levels[level]; } -extern void llapi_msg_set_level(int level); + +void llapi_msg_set_level(int level); int llapi_msg_get_level(void); -extern llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); -extern llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); +llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); +llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); @@ -110,53 +128,64 @@ struct llapi_stripe_param { __u32 lsp_osts[0]; }; -extern int llapi_file_open_param(const char *name, int flags, mode_t mode, - const struct llapi_stripe_param *param); -extern int llapi_file_create(const char *name, unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern); -extern int llapi_file_open(const char *name, int flags, int mode, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern); -extern int llapi_file_create_pool(const char *name, - unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern, char *pool_name); -extern int llapi_file_open_pool(const char *name, int flags, int mode, - unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern, char *pool_name); -extern int llapi_poollist(const char *name); -extern int llapi_get_poollist(const char *name, char **poollist, int list_size, - char *buffer, int buffer_size); -extern int llapi_get_poolmembers(const char *poolname, char **members, - int list_size, char *buffer, int buffer_size); -extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); -extern int llapi_file_lookup(int dirfd, const char *name); - -#define VERBOSE_COUNT 0x1 -#define VERBOSE_SIZE 0x2 -#define VERBOSE_OFFSET 0x4 -#define VERBOSE_POOL 0x8 -#define VERBOSE_DETAIL 0x10 -#define VERBOSE_OBJID 0x20 -#define VERBOSE_GENERATION 0x40 -#define VERBOSE_MDTINDEX 0x80 -#define VERBOSE_LAYOUT 0x100 -#define VERBOSE_COMP_COUNT 0x200 -#define VERBOSE_COMP_FLAGS 0x400 -#define VERBOSE_COMP_START 0x800 -#define VERBOSE_COMP_END 0x1000 -#define VERBOSE_COMP_ID 0x2000 -#define VERBOSE_DFID 0x4000 -#define VERBOSE_HASH_TYPE 0x8000 -#define VERBOSE_DEFAULT (VERBOSE_COUNT | VERBOSE_SIZE | \ - VERBOSE_OFFSET | VERBOSE_POOL | \ - VERBOSE_OBJID | VERBOSE_GENERATION | \ - VERBOSE_LAYOUT | VERBOSE_HASH_TYPE | \ - VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | \ - VERBOSE_COMP_START | VERBOSE_COMP_END | \ - VERBOSE_COMP_ID) +#define lsp_tgts lsp_osts + +int llapi_file_open_param(const char *name, int flags, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, int stripe_pattern); +int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +int llapi_file_create_pool(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); +int llapi_poollist(const char *name); +int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +int llapi_get_poolmembers(const char *poolname, char **members, int list_size, + char *buffer, int buffer_size); +int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +int llapi_file_lookup(int dirfd, const char *name); +void llapi_set_command_name(const char *cmd); +void llapi_clear_command_name(void); + +enum llapi_layout_verbose { + VERBOSE_STRIPE_COUNT = 0x1, + VERBOSE_STRIPE_SIZE = 0x2, + VERBOSE_STRIPE_OFFSET = 0x4, + VERBOSE_POOL = 0x8, + VERBOSE_DETAIL = 0x10, + VERBOSE_OBJID = 0x20, + VERBOSE_GENERATION = 0x40, + 
VERBOSE_MDTINDEX = 0x80, + VERBOSE_PATTERN = 0x100, + VERBOSE_COMP_COUNT = 0x200, + VERBOSE_COMP_FLAGS = 0x400, + VERBOSE_COMP_START = 0x800, + VERBOSE_COMP_END = 0x1000, + VERBOSE_COMP_ID = 0x2000, + VERBOSE_DFID = 0x4000, + VERBOSE_HASH_TYPE = 0x8000, + VERBOSE_MIRROR_COUNT = 0x10000, + VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | + VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | + VERBOSE_OBJID | VERBOSE_GENERATION | + VERBOSE_PATTERN | VERBOSE_HASH_TYPE | + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | + VERBOSE_COMP_START | VERBOSE_COMP_END | + VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | + VERBOSE_MIRROR_ID +}; +/* Compatibility with original names */ +#define VERBOSE_SIZE VERBOSE_STRIPE_SIZE +#define VERBOSE_COUNT VERBOSE_STRIPE_COUNT +#define VERBOSE_OFFSET VERBOSE_STRIPE_OFFSET +#define VERBOSE_LAYOUT VERBOSE_PATTERN struct find_param { unsigned int fp_max_depth; @@ -179,7 +208,11 @@ struct find_param { fp_comp_start_sign:2, fp_comp_end_sign:2, fp_comp_count_sign:2, - fp_mdt_count_sign:2; + fp_mirror_count_sign:2, + fp_mirror_index_sign:2, + fp_mirror_id_sign:2, + fp_mdt_count_sign:2, + fp_blocks_sign:2; unsigned long long fp_size; unsigned long long fp_size_units; @@ -214,21 +247,30 @@ struct find_param { fp_exclude_projid:1, fp_check_comp_count:1, fp_exclude_comp_count:1, + fp_check_mirror_count:1, + fp_exclude_mirror_count:1, fp_check_comp_flags:1, - fp_exclude_comp_flags:1, + fp_check_mirror_state:1, fp_check_comp_start:1, fp_exclude_comp_start:1, fp_check_comp_end:1, fp_exclude_comp_end:1, fp_check_comp_id:1, fp_exclude_comp_id:1, + fp_check_mirror_id:1, + fp_exclude_mirror_id:1, + fp_check_mirror_index:1, + fp_exclude_mirror_index:1, fp_check_mdt_count:1, fp_exclude_mdt_count:1, fp_check_hash_type:1, fp_exclude_hash_type:1, - fp_yaml:1; /* output layout in YAML */ + fp_yaml:1, /* output layout in YAML */ + fp_check_blocks:1, + fp_exclude_blocks:1, + fp_lazy:1; - int fp_verbose; + enum llapi_layout_verbose fp_verbose; int fp_quiet; /* regular expression */ @@ -261,14 +303,22 @@ struct find_param { __u32 fp_layout; __u32 fp_comp_count; + __u32 fp_mirror_count; __u32 fp_comp_flags; + __u32 fp_comp_neg_flags; + __u16 fp_mirror_state; + __u16 fp_mirror_neg_state; __u32 fp_comp_id; + __u16 fp_mirror_id; + __u16 fp_mirror_index; unsigned long long fp_comp_start; unsigned long long fp_comp_start_units; unsigned long long fp_comp_end; unsigned long long fp_comp_end_units; unsigned long long fp_mdt_count; unsigned fp_projid; + unsigned long long fp_blocks; + unsigned long long fp_blocks_units; /* In-process parameters. 
*/ unsigned long fp_got_uuids:1, @@ -277,104 +327,123 @@ struct find_param { unsigned int fp_hash_type; }; -extern int llapi_ostlist(char *path, struct find_param *param); -extern int llapi_uuid_match(char *real_uuid, char *search_uuid); -extern int llapi_getstripe(char *path, struct find_param *param); -extern int llapi_find(char *path, struct find_param *param); - -extern int llapi_file_fget_mdtidx(int fd, int *mdtidx); -extern int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, - int stripe_count, int stripe_pattern, - const char *pool_name); -extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, - int stripe_count, int stripe_pattern, - const char *poolname); +int llapi_ostlist(char *path, struct find_param *param); +int llapi_uuid_match(char *real_uuid, char *search_uuid); +int llapi_getstripe(char *path, struct find_param *param); +int llapi_find(char *path, struct find_param *param); + +int llapi_file_fget_mdtidx(int fd, int *mdtidx); +int llapi_dir_set_default_lmv(const char *name, + const struct llapi_stripe_param *param); +int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +int llapi_dir_create(const char *name, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); int llapi_direntry_remove(char *dname); int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); -extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, - struct obd_statfs *stat_buf, - struct obd_uuid *uuid_buf); -extern int llapi_ping(char *obd_type, char *obd_name); -extern int llapi_target_check(int num_types, char **obd_types, char *dir); -extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); -extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); -extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); -extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); -extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); -extern int llapi_is_lustre_mnttype(const char *type); -extern int llapi_search_ost(char *fsname, char *poolname, char *ostname); -extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); -extern int llapi_parse_size(const char *optarg, unsigned long long *size, - unsigned long long *size_units, int bytes_spec); -extern int llapi_search_mounts(const char *pathname, int index, - char *mntdir, char *fsname); -extern int llapi_search_fsname(const char *pathname, char *fsname); -extern int llapi_getname(const char *path, char *buf, size_t size); -extern int llapi_search_fileset(const char *pathname, char *fileset); - -extern int llapi_search_rootpath(char *pathname, const char *fsname); -extern int llapi_nodemap_exists(const char *name); -extern int llapi_migrate_mdt(char *path, struct find_param *param); -extern int llapi_mv(char *path, struct find_param *param); +int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_ping(char *obd_type, char *obd_name); +int llapi_target_check(int num_types, char **obd_types, char *dir); +int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); 
+int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +int llapi_is_lustre_mnttype(const char *type); +int llapi_search_tgt(char *fsname, char *poolname, char *tgtname, bool is_mdt); +int llapi_search_ost(char *fsname, char *poolname, char *ostname); +int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +int llapi_search_mounts(const char *pathname, int index, char *mntdir, + char *fsname); +int llapi_search_fsname(const char *pathname, char *fsname); +int llapi_getname(const char *path, char *buf, size_t size); +int llapi_search_fileset(const char *pathname, char *fileset); + +int llapi_search_rootpath(char *pathname, const char *fsname); +int llapi_nodemap_exists(const char *name); +int llapi_migrate_mdt(char *path, struct find_param *param); +int llapi_mv(char *path, struct find_param *param); struct mntent; + #define HAVE_LLAPI_IS_LUSTRE_MNT -extern int llapi_is_lustre_mnt(struct mntent *mnt); -extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl); -extern int llapi_target_iterate(int type_num, char **obd_type, void *args, - llapi_cb_t cb); -extern int llapi_get_connect_flags(const char *mnt, __u64 *flags); -extern int llapi_cp(int argc, char *argv[]); -extern int llapi_ls(int argc, char *argv[]); -extern int llapi_fid2path(const char *device, const char *fidstr, char *path, - int pathlen, long long *recno, int *linkno); -extern int llapi_path2fid(const char *path, lustre_fid *fid); -extern int llapi_get_mdt_index_by_fid(int fd, const lustre_fid *fid, - int *mdt_index); -extern int llapi_fd2fid(const int fd, lustre_fid *fid); +int llapi_is_lustre_mnt(struct mntent *mnt); +int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +int llapi_get_connect_flags(const char *mnt, __u64 *flags); +int llapi_cp(int argc, char *argv[]); +int llapi_ls(int argc, char *argv[]); +int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_path2fid(const char *path, struct lu_fid *fid); +int llapi_get_mdt_index_by_fid(int fd, const struct lu_fid *fid, + int *mdt_index); +int llapi_get_lum_file(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_dir(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_file_fd(int dir_fd, const char *fname, __u64 *valid, + lstatx_t *statx, struct lov_user_md *lum, + size_t lumsize); +int llapi_get_lum_dir_fd(int dir_fd, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); + +int llapi_fd2fid(int fd, struct lu_fid *fid); /* get FID of parent dir + the related name of entry in this parent dir */ -extern int llapi_path2parent(const char *path, unsigned int linkno, - lustre_fid *parent_fid, char *name, - size_t name_size); -extern int llapi_fd2parent(int fd, unsigned int linkno, - lustre_fid *parent_fid, char *name, - size_t name_size); -extern int llapi_chomp_string(char *buf); -extern int llapi_open_by_fid(const char *dir, const lustre_fid *fid, - int open_flags); - -extern int llapi_get_version_string(char *version, unsigned int version_size); +int llapi_path2parent(const char *path, 
unsigned int linkno, + struct lu_fid *parent_fid, char *name, size_t name_size); +int llapi_fd2parent(int fd, unsigned int linkno, struct lu_fid *parent_fid, + char *name, size_t name_size); +int llapi_rmfid(const char *path, struct fid_array *fa); +int llapi_chomp_string(char *buf); +int llapi_open_by_fid(const char *dir, const struct lu_fid *fid, + int open_flags); +int llapi_get_version_string(char *version, unsigned int version_size); /* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ -extern int llapi_get_version(char *buffer, int buffer_size, char **version) +int llapi_get_version(char *buffer, int buffer_size, char **version) __attribute__((deprecated)); -extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); -extern int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); -extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); -extern int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, - __u32 archive_id); -extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, - __u32 archive_id); -extern int llapi_hsm_register_event_fifo(const char *path); -extern int llapi_hsm_unregister_event_fifo(const char *path); -extern void llapi_hsm_log_error(enum llapi_message_level level, int _rc, - const char *fmt, va_list args); - -extern int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); -extern int llapi_create_volatile_idx(char *directory, int idx, int mode); -static inline int llapi_create_volatile(char *directory, int mode) +int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +int llapi_file_flush(int fd); +extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version); +int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_register_event_fifo(const char *path); +int llapi_hsm_unregister_event_fifo(const char *path); +void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +int llapi_create_volatile_idx(const char *directory, int mdt_idx, + int open_flags); +int llapi_create_volatile_param(const char *directory, int mdt_idx, + int open_flags, mode_t mode, + const struct llapi_stripe_param *stripe_param); + +static inline int llapi_create_volatile(char *directory, int open_flags) { - return llapi_create_volatile_idx(directory, -1, mode); + return llapi_create_volatile_idx(directory, -1, open_flags); } -extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, - int gid, __u64 flags); -extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, - __u64 flags); -extern int llapi_swap_layouts(const char *path1, const char *path2, - __u64 dv1, __u64 dv2, __u64 flags); +int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags); +int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, + __u64 dv2, __u64 flags); /* Changelog interface. 
priv is private state, managed internally by these * functions */ @@ -384,15 +453,18 @@ extern int llapi_swap_layouts(const char *path1, const char *path2, * converted to extended format in the lustre api to ease changelog analysis. */ #define HAVE_CHANGELOG_EXTEND_REC 1 -extern int llapi_changelog_start(void **priv, enum changelog_send_flag flags, - const char *mdtname, long long startrec); -extern int llapi_changelog_fini(void **priv); -extern int llapi_changelog_recv(void *priv, struct changelog_rec **rech); -extern int llapi_changelog_free(struct changelog_rec **rech); -extern int llapi_changelog_get_fd(void *priv); +int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +int llapi_changelog_fini(void **priv); +int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +int llapi_changelog_in_buf(void *priv); +int llapi_changelog_free(struct changelog_rec **rech); +int llapi_changelog_get_fd(void *priv); /* Allow records up to endrec to be destroyed; requires registered id. */ -extern int llapi_changelog_clear(const char *mdtname, const char *idstr, - long long endrec); +int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); +extern int llapi_changelog_set_xflags(void *priv, + enum changelog_send_extra_flag extra_flags); /* HSM copytool interface. * priv is private state, managed internally by these functions @@ -400,52 +472,51 @@ extern int llapi_changelog_clear(const char *mdtname, const char *idstr, struct hsm_copytool_private; struct hsm_copyaction_private; -extern int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, - const char *mnt, int archive_count, - int *archives, int rfd_flags); -extern int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); -extern int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); -extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, - struct hsm_action_list **hal, int *msgsize); -extern int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, - const struct hsm_copytool_private *ct, - const struct hsm_action_item *hai, - int restore_mdt_index, int restore_open_flags, - bool is_error); -extern int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, - const struct hsm_extent *he, - int hp_flags, int errval); -extern int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, - const struct hsm_extent *he, __u64 total, - int hp_flags); -extern int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, - lustre_fid *fid); -extern int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); -extern int llapi_hsm_import(const char *dst, int archive, const struct stat *st, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern, - char *pool_name, lustre_fid *newfid); +int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const 
struct hsm_extent *he, int hp_flags, int errval); +int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + struct lu_fid *fid); +int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name, + struct lu_fid *newfid); /* HSM user interface */ -extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, - int data_len); -extern int llapi_hsm_request(const char *path, - const struct hsm_user_request *request); -extern int llapi_hsm_current_action(const char *path, - struct hsm_current_action *hca); +struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +int llapi_hsm_request(const char *path, const struct hsm_user_request *request); +int llapi_hsm_current_action(const char *path, struct hsm_current_action *hca); /* JSON handling */ -extern int llapi_json_init_list(struct llapi_json_item_list **item_list); -extern int llapi_json_destroy_list(struct llapi_json_item_list **item_list); -extern int llapi_json_add_item(struct llapi_json_item_list **item_list, - char *key, __u32 type, void *val); -extern int llapi_json_write_list(struct llapi_json_item_list **item_list, - FILE *fp); +int llapi_json_init_list(struct llapi_json_item_list **item_list); +int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +int llapi_json_add_item(struct llapi_json_item_list **item_list, char *key, + __u32 type, void *val); +int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp); /* File lease */ -extern int llapi_lease_get(int fd, int mode); -extern int llapi_lease_check(int fd); -extern int llapi_lease_put(int fd); +int llapi_lease_acquire(int fd, enum ll_lease_mode mode); +int llapi_lease_release(int fd); +int llapi_lease_set(int fd, const struct ll_ioc_lease *data); +int llapi_lease_check(int fd); +int llapi_lease_get(int fd, int mode); /* obsoleted */ +int llapi_lease_put(int fd); /* obsoleted */ /* Group lock */ int llapi_group_lock(int fd, int gid); @@ -458,9 +529,33 @@ int llapi_ladvise(int fd, unsigned long long flags, int num_advise, /* llapi_layout user interface */ +/** + * An array element storing component info to be resynced during mirror + * resynchronization. + */ +struct llapi_resync_comp { + uint64_t lrc_start; + uint64_t lrc_end; + uint32_t lrc_mirror_id; + uint32_t lrc_id; /* component id */ + bool lrc_synced; +}; + /** Opaque data type abstracting the layout of a Lustre file. */ struct llapi_layout; +int llapi_mirror_truncate(int fd, unsigned int id, off_t length); +ssize_t llapi_mirror_write(int fd, unsigned int id, const void *buf, + size_t count, off_t pos); +uint32_t llapi_mirror_find(struct llapi_layout *layout, + uint64_t file_start, uint64_t file_end, + uint64_t *endp); +int llapi_mirror_find_stale(struct llapi_layout *layout, + struct llapi_resync_comp *comp, size_t comp_size, + __u16 *mirror_ids, int ids_nr); +int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, + struct llapi_resync_comp *comp_array, + int comp_size, uint64_t start, uint64_t end); /* * Flags to control how layouts are retrieved. 
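/*
 * Illustrative sketch, not part of this patch: requesting the restore of a
 * single released file through the HSM user interface declared above. The
 * function name is hypothetical; <errno.h>/<stdlib.h> are assumed, the
 * fields are filled explicitly rather than relying on zeroed allocation,
 * and the whole-file extent convention (length = -1) is an assumption.
 */
static inline int example_hsm_restore_one(const char *path,
					  const struct lu_fid *fid)
{
	struct hsm_user_request *hur;
	int rc;

	hur = llapi_hsm_user_request_alloc(1, 0);
	if (hur == NULL)
		return -ENOMEM;

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;	/* only used for HUA_ARCHIVE */
	hur->hur_request.hr_flags = 0;
	hur->hur_request.hr_itemcount = 1;
	hur->hur_request.hr_data_len = 0;
	hur->hur_user_item[0].hui_fid = *fid;
	hur->hur_user_item[0].hui_extent.offset = 0;
	hur->hur_user_item[0].hui_extent.length = (__u64)-1;

	rc = llapi_hsm_request(path, hur);
	free(hur);
	return rc;
}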
*/ @@ -487,8 +582,8 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); /** * Return a pointer to a newly-allocated opaque data type containing the - * layout for the file associated with Lustre file identifier string - * \a fidstr. The string \a path must name a path within the + * layout for the file associated with Lustre file identifier + * \a fid. The string \a path must name a path within the * filesystem that contains the file being looked up, such as the * filesystem root. The returned pointer should be freed with * llapi_layout_free() when it is no longer needed. Failure is @@ -496,9 +591,35 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); * stored in errno. */ struct llapi_layout *llapi_layout_get_by_fid(const char *path, - const lustre_fid *fid, + const struct lu_fid *fid, uint32_t flags); +enum llapi_layout_xattr_flags { + LLAPI_LXF_CHECK = 0x0001, + LLAPI_LXF_COPY = 0x0002, +}; + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with extended attribute \a lov_xattr. The + * length of the extended attribute is \a lov_xattr_size. The \a lov_xattr + * should be raw xattr without being swapped, since this function will swap it + * properly. Thus, \a lov_xattr will be modified during the process. If the + * \a LLAPI_LXF_CHECK flag of \a flags is set, this function will check whether + * the objects count in lum is consistent with the stripe count in lum. This + * check only apply to regular file, so \a LLAPI_LXF_CHECK flag should be + * cleared if the xattr belongs to a directory. If the \a LLAPI_LXF_COPY flag + * of \a flags is set, this function will use a temporary buffer for byte + * swapping when necessary, leaving \a lov_xattr untouched. Otherwise, the byte + * swapping will be done to the \a lov_xattr buffer directly. The returned + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is * indicated with a NULL return value and an appropriate + * error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr, + ssize_t lov_xattr_size, + uint32_t flags); + /** * Allocate a new layout. Use this when creating a new file with * llapi_layout_file_create(). @@ -510,6 +631,19 @@ struct llapi_layout *llapi_layout_alloc(void); */ void llapi_layout_free(struct llapi_layout *layout); +/** + * llapi_layout_merge() - Merge a composite layout into another one. + * @dst_layout: Destination composite layout. + * @src_layout: Source composite layout. + * + * This function copies all of the components from @src_layout and + * appends them to @dst_layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_merge(struct llapi_layout **dst_layout, + const struct llapi_layout *src_layout); + /** Not a valid stripe size, offset, or RAID pattern. */ #define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL @@ -531,7 +665,8 @@ void llapi_layout_free(struct llapi_layout *layout); * stored using RAID0. That is, data will be split evenly and without * redundancy across all OSTs in the layout. */ -#define LLAPI_LAYOUT_RAID0 0 +#define LLAPI_LAYOUT_RAID0 0ULL +#define LLAPI_LAYOUT_MDT 2ULL /** * The layout includes a specific set of OSTs on which to allocate. @@ -731,6 +866,39 @@ int llapi_layout_file_open(const char *path, int open_flags, mode_t mode, int llapi_layout_file_create(const char *path, int open_flags, int mode, const struct llapi_layout *layout); +/** + * Set flags to the header of component layout. 
+ */ +int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags); +int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags); +const char *llapi_layout_flags_string(uint32_t flags); +const __u16 llapi_layout_string_flags(char *string); + +/** + * llapi_layout_mirror_count_get() - Get mirror count from the header of + * a layout. + * @layout: Layout to get mirror count from. + * @count: Returned mirror count value. + * + * This function gets mirror count from the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_get(struct llapi_layout *layout, + uint16_t *count); + +/** + * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout. + * @layout: Layout to set mirror count in. + * @count: Mirror count value to be set. + * + * This function sets mirror count to the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_set(struct llapi_layout *layout, + uint16_t count); + /** * Fetch the start and end offset of the current layout component. */ @@ -748,12 +916,10 @@ static const struct comp_flag_name { const char *cfn_name; } comp_flags_table[] = { { LCME_FL_INIT, "init" }, - /* For now, only "init" is supported - { LCME_FL_PRIMARY, "primary" }, { LCME_FL_STALE, "stale" }, + { LCME_FL_PREF_RW, "prefer" }, { LCME_FL_OFFLINE, "offline" }, - { LCME_FL_PREFERRED, "preferred" } - */ + { LCME_FL_NOSYNC, "nosync" }, }; /** @@ -773,10 +939,18 @@ int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); * Fetches the file-unique component ID of the current layout component. */ int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Fetches the mirror ID of the current layout component. + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); /** * Adds one component to the existing composite or plain layout. */ int llapi_layout_comp_add(struct llapi_layout *layout); +/** + * Adds a first component of a mirror to the existing composite layout. + */ +int llapi_layout_add_first_comp(struct llapi_layout *layout); /** * Deletes the current layout component from the composite layout. */ @@ -813,10 +987,52 @@ int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); * attributes are passed in by @comp and @valid is used to specify which * attributes in the component are going to be changed. */ -int llapi_layout_file_comp_set(const char *path, - const struct llapi_layout *comp, - uint32_t valid); +int llapi_layout_file_comp_set(const char *path, uint32_t *ids, uint32_t *flags, + size_t count); +/** + * Check if the file layout is composite. + */ +bool llapi_layout_is_composite(struct llapi_layout *layout); + +enum { + LLAPI_LAYOUT_ITER_CONT = 0, + LLAPI_LAYOUT_ITER_STOP = 1, +}; + +/** + * Iteration callback function. 
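/*
 * Illustrative sketch, not part of this patch: querying the FLR mirror
 * count of an already-open file with the layout API declared above. A
 * flags value of 0 is assumed to request the default retrieval behaviour;
 * the function name is hypothetical.
 */
static inline int example_get_mirror_count(int fd, uint16_t *count)
{
	struct llapi_layout *layout;
	int rc;

	layout = llapi_layout_get_by_fd(fd, 0);
	if (layout == NULL)
		return -1;	/* error code is left in errno */

	rc = llapi_layout_mirror_count_get(layout, count);
	llapi_layout_free(layout);
	return rc;
}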
+ * + * \retval LLAPI_LAYOUT_ITER_CONT Iteration proceeds + * \retval LLAPI_LAYOUT_ITER_STOP Stop iteration + * \retval < 0 error code + */ +typedef int (*llapi_layout_iter_cb)(struct llapi_layout *layout, void *cbdata); + +/** + * Iterate all components in the corresponding layout + */ +int llapi_layout_comp_iterate(struct llapi_layout *layout, + llapi_layout_iter_cb cb, void *cbdata); + +/** + * FLR: mirror operation APIs + */ +int llapi_mirror_set(int fd, unsigned int id); +int llapi_mirror_clear(int fd); +ssize_t llapi_mirror_read(int fd, unsigned int id, + void *buf, size_t count, off_t pos); +ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, + off_t pos, size_t count); + +int llapi_param_get_paths(const char *pattern, glob_t *paths); +int llapi_param_get_value(const char *path, char **buf, size_t *buflen); +void llapi_param_paths_free(glob_t *paths); /** @} llapi */ +#if defined(__cplusplus) +} +#endif + #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h index beab4a225119f..933d09ab4ef1f 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h index 231eae97972ee..df6f78bb4b29b 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * lustre/include/lustre_barrier.h * diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 441f737170daa..e56f9abf7c8ec 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,15 +33,19 @@ #ifndef _LUSTRE_COMPAT_H #define _LUSTRE_COMPAT_H +#include +#include #include #include #include #include #include +#include +#include #include #include -#include +#include #include #include @@ -80,22 +84,6 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, path_put(&old_pwd); } -/* - * set ATTR_BLOCKS to a high value to avoid any risk of collision with other - * ATTR_* attributes (see bug 13828) - */ -#define ATTR_BLOCKS (1 << 27) - -/* - * In more recent kernels, this flag was removed because nobody was using it. - * But Lustre does. So define it if needed. It is safe to do so, since it's - * not been replaced with a different flag with the same value, and Lustre - * only uses it internally. 
- */ -#ifndef ATTR_ATTR_FLAG -#define ATTR_ATTR_FLAG (1 << 10) -#endif - #define current_ngroups current_cred()->group_info->ngroups #define current_groups current_cred()->group_info->small_block @@ -156,8 +144,12 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define simple_setattr(dentry, ops) inode_setattr((dentry)->d_inode, ops) #endif -#ifndef SLAB_DESTROY_BY_RCU -#define SLAB_DESTROY_BY_RCU 0 +#ifndef HAVE_INIT_LIST_HEAD_RCU +static inline void INIT_LIST_HEAD_RCU(struct list_head *list) +{ + WRITE_ONCE(list->next, list); + WRITE_ONCE(list->prev, list); +} #endif #ifndef HAVE_DQUOT_SUSPEND @@ -190,6 +182,12 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define bvl_to_page(bvl) (bvl->bv_page) #endif +#ifdef HAVE_BVEC_ITER +#define bio_start_sector(bio) (bio->bi_iter.bi_sector) +#else +#define bio_start_sector(bio) (bio->bi_sector) +#endif + #ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS #define blk_queue_max_segments(rq, seg) \ do { blk_queue_max_phys_segments(rq, seg); \ @@ -406,6 +404,16 @@ static inline void truncate_inode_pages_final(struct address_space *map) } #endif +#ifndef HAVE_PTR_ERR_OR_ZERO +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} +#endif + #ifndef SIZE_MAX #define SIZE_MAX (~(size_t)0) #endif @@ -436,9 +444,11 @@ static inline void truncate_inode_pages_final(struct address_space *map) #endif #ifdef HAVE_PID_NS_FOR_CHILDREN -# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns_for_children) +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns_for_children) : NULL) #else -# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns) +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? 
((task)->nsproxy->pid_ns) : NULL) #endif #ifdef HAVE_FULL_NAME_HASH_3ARGS @@ -472,37 +482,30 @@ int ll_removexattr(struct dentry *dentry, const char *name); #ifndef HAVE_VFS_SETXATTR const struct xattr_handler *get_xattr_type(const char *name); -#ifdef HAVE_XATTR_HANDLER_FLAGS static inline int __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { +# ifdef HAVE_XATTR_HANDLER_FLAGS const struct xattr_handler *handler; int rc; handler = get_xattr_type(name); if (!handler) - return -ENXIO; - -#if defined(HAVE_XATTR_HANDLER_INODE_PARAM) - rc = handler->set(handler, dentry, inode, name, value, size, - XATTR_CREATE); -#elif defined(HAVE_XATTR_HANDLER_SIMPLIFIED) - rc = handler->set(handler, dentry, name, value, size, XATTR_CREATE); -#else - rc = handler->set(dentry, name, value, size, XATTR_CREATE, - handler->flags); -#endif /* !HAVE_XATTR_HANDLER_INODE_PARAM */ + return -EOPNOTSUPP; + +# if defined(HAVE_XATTR_HANDLER_INODE_PARAM) + rc = handler->set(handler, dentry, inode, name, value, size, flags); +# elif defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + rc = handler->set(handler, dentry, name, value, size, flags); +# else + rc = handler->set(dentry, name, value, size, flags, handler->flags); +# endif /* !HAVE_XATTR_HANDLER_INODE_PARAM */ return rc; -} -#else /* !HAVE_XATTR_HANDLER_FLAGS */ -static inline int -__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ +# else /* !HAVE_XATTR_HANDLER_FLAGS */ return ll_setxattr(dentry, name, value, size, flags); +# endif /* HAVE_XATTR_HANDLER_FLAGS */ } -#endif /* HAVE_XATTR_HANDLER_FLAGS */ #endif /* HAVE_VFS_SETXATTR */ #ifdef HAVE_IOP_SET_ACL @@ -689,10 +692,122 @@ static inline struct timespec current_time(struct inode *inode) } #endif +#ifndef time_after32 +/** + * time_after32 - compare two 32-bit relative times + * @a: the time which may be after @b + * @b: the time which may be before @a + * + * time_after32(a, b) returns true if the time @a is after time @b. + * time_before32(b, a) returns true if the time @b is before time @a. + * + * Similar to time_after(), compare two 32-bit timestamps for relative + * times. This is useful for comparing 32-bit seconds values that can't + * be converted to 64-bit values (e.g. due to disk format or wire protocol + * issues) when it is known that the times are less than 68 years apart. + */ +#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) +#define time_before32(b, a) time_after32(a, b) + +#endif + #ifndef __GFP_COLD #define __GFP_COLD 0 #endif +#ifndef alloc_workqueue +#define alloc_workqueue(name, flags, max_active) create_workqueue(name) +#endif + +#ifndef READ_ONCE +#define READ_ONCE ACCESS_ONCE +#endif + +#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) +static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) +{ +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + return bi->interval_exp ? 
1 << bi->interval_exp : 0; +#elif defined(HAVE_INTERVAL_BLK_INTEGRITY) + return bi->interval; +#else + return bi->sector_size; +#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ +} + +static inline const char *blk_integrity_name(struct blk_integrity *bi) +{ +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + return bi->profile->name; +#else + return bi->name; +#endif +} + +static inline unsigned int bip_size(struct bio_integrity_payload *bip) +{ +#ifdef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + return bip->bip_iter.bi_size; +#else + return bip->bip_size; +#endif +} +#else /* !CONFIG_BLK_DEV_INTEGRITY */ +static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) +{ + return 0; +} +static inline const char *blk_integrity_name(struct blk_integrity *bi) +{ + /* gcc8 dislikes when strcmp() is called against NULL */ + return ""; +} +#endif /* !CONFIG_BLK_DEV_INTEGRITY */ + +#ifndef INTEGRITY_FLAG_READ +#define INTEGRITY_FLAG_READ BLK_INTEGRITY_VERIFY +#endif + +#ifndef INTEGRITY_FLAG_WRITE +#define INTEGRITY_FLAG_WRITE BLK_INTEGRITY_GENERATE +#endif + +static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) +{ +#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return false; + +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + if (rw == 0 && bi->profile->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (rw == 1 && bi->profile->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; +#else + if (rw == 0 && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (rw == 1 && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; +#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ +#endif /* !CONFIG_BLK_DEV_INTEGRITY */ + + return false; +} + +#ifdef HAVE_PAGEVEC_INIT_ONE_PARAM +#define ll_pagevec_init(pvec, n) pagevec_init(pvec) +#else +#define ll_pagevec_init(pvec, n) pagevec_init(pvec, n) +#endif + #ifdef HAVE_I_PAGES #define page_tree i_pages #else @@ -701,16 +816,16 @@ static inline struct timespec current_time(struct inode *inode) #define xa_unlock_irq(lockp) spin_unlock_irq(lockp) #endif -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -#define selinux_is_enabled() 1 -#endif - #ifndef KMEM_CACHE_USERCOPY #define kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ usersize, ctor) \ kmem_cache_create(name, size, align, flags, ctor) #endif +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +#define selinux_is_enabled() 1 +#endif + static inline void ll_security_release_secctx(char *secdata, u32 seclen) { #ifdef HAVE_SEC_RELEASE_SECCTX_1ARG diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h index 9b20b7ba8f09e..c121ab18420d2 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -48,8 +48,8 @@ #include #include #include -#include -#include +#include +#include #define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) #define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) @@ -111,6 +111,7 @@ struct lustre_mount_data { /****************** superblock additional info *********************/ struct ll_sb_info; +struct kobject; struct lustre_sb_info { int lsi_flags; @@ -119,6 +120,7 @@ struct lustre_sb_info { struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ atomic_t lsi_mounts; /* references to the srv_mnt */ + struct kobject *lsi_kobj; char lsi_svname[MTI_NAME_MAXLEN]; /* lsi_osd_obdname format = 'lsi->ls_svname'-osd */ char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; @@ -129,8 +131,9 @@ struct lustre_sb_info { char lsi_fstype[16]; struct backing_dev_info lsi_bdi; /* each client mountpoint needs own backing_dev_info */ + /* protect lsi_lwp_list */ + struct mutex lsi_lwp_mutex; struct list_head lsi_lwp_list; - spinlock_t lsi_lwp_lock; unsigned long lsi_lwp_started:1; }; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h index 3eed4226f85a7..c6291b62f4259 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -64,6 +64,9 @@ extern struct kset *ldlm_svc_kset; #define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) #define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */ #define LDLM_CTIME_AGE_LIMIT (10) +/* if client lock is unused for that time it can be cancelled if any other + * client shows interest in that lock, e.g. glimpse is occured. */ +#define LDLM_DIRTY_AGE_LIMIT (10) #define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 /** @@ -232,8 +235,8 @@ struct ldlm_pool_ops { * This feature is commonly referred to as lru_resize. */ struct ldlm_pool { - /** Pool proc directory. */ - struct proc_dir_entry *pl_proc_dir; + /** Pool debugfs directory. */ + struct dentry *pl_debugfs_entry; /** Pool name, must be long enough to hold compound proc entry name. */ char pl_name[100]; /** Lock for protecting SLV/CLV updates. */ @@ -269,9 +272,10 @@ struct ldlm_pool { struct completion pl_kobj_unregister; }; -typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, - void *req_cookie, enum ldlm_mode mode, - __u64 flags, void *data); +typedef int (*ldlm_res_policy)(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *req_cookie, + enum ldlm_mode mode, __u64 flags, void *data); typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); @@ -289,11 +293,10 @@ typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); * of ldlm_[res_]lvbo_[init,update,fill]() functions. 
*/ struct ldlm_valblock_ops { - int (*lvbo_init)(struct ldlm_resource *res); - int (*lvbo_update)(struct ldlm_resource *res, - struct ptlrpc_request *r, - int increase); - int (*lvbo_free)(struct ldlm_resource *res); + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock, + struct ptlrpc_request *r, int increase); + int (*lvbo_free)(struct ldlm_resource *res); /* Return size of lvb data appropriate RPC size can be reserved */ int (*lvbo_size)(struct ldlm_lock *lock); /* Called to fill in lvb data to RPC buffer @buf */ @@ -348,6 +351,14 @@ enum ldlm_ns_type { LDLM_NS_TYPE_MGT, /**< MGT namespace */ }; +enum ldlm_namespace_flags { + /** + * Flag to indicate the LRU cancel is in progress. + * Used to limit the process by 1 thread only. + */ + LDLM_LRU_CANCEL = 0 +}; + /** * LDLM Namespace. * @@ -376,6 +387,9 @@ struct ldlm_namespace { /** Flag indicating if namespace is on client instead of server */ enum ldlm_side ns_client; + /** name of this namespace */ + char *ns_name; + /** Resource hash table for namespace. */ struct cfs_hash *ns_rs_hash; @@ -394,8 +408,8 @@ struct ldlm_namespace { /** Client side original connect flags supported by server. */ __u64 ns_orig_connect_flags; - /* namespace proc dir entry */ - struct proc_dir_entry *ns_proc_dir_entry; + /* namespace debugfs dir entry */ + struct dentry *ns_debugfs_entry; /** * Position in global namespace list linking all namespaces on @@ -439,14 +453,20 @@ struct ldlm_namespace { * This allows the client to start caching negative dentries * for a directory and may save an RPC for a later stat. */ - unsigned int ns_ctime_age_limit; - + time64_t ns_ctime_age_limit; + /** + * Number of seconds since the lock was last used. The client may + * cancel the lock limited by this age and flush related data if + * any other client shows interest in it doing glimpse request. + * This allows to cache stat data locally for such files early. + */ + time64_t ns_dirty_age_limit; /** * Used to rate-limit ldlm_namespace_dump calls. * \see ldlm_namespace_dump. Increased by 10 seconds every time * it is called. */ - cfs_time_t ns_next_dump; + time64_t ns_next_dump; /** "policy" function that does actual lock conflict determination */ ldlm_res_policy ns_policy; @@ -484,7 +504,7 @@ struct ldlm_namespace { * The resources in this namespace remember contended state during * \a ns_contention_time, in seconds. */ - unsigned ns_contention_time; + time64_t ns_contention_time; /** * Limit size of contended extent locks, in bytes. @@ -519,6 +539,11 @@ struct ldlm_namespace { struct kobject ns_kobj; /* sysfs object */ struct completion ns_kobj_unregister; + + /** + * To avoid another ns_lock usage, a separate bitops field. 
+ */ + unsigned long ns_flags; }; /** @@ -527,8 +552,6 @@ struct ldlm_namespace { static inline int ns_is_client(struct ldlm_namespace *ns) { LASSERT(ns != NULL); - LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | - LDLM_NAMESPACE_SERVER))); LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || ns->ns_client == LDLM_NAMESPACE_SERVER); return ns->ns_client == LDLM_NAMESPACE_CLIENT; @@ -540,8 +563,6 @@ static inline int ns_is_client(struct ldlm_namespace *ns) static inline int ns_is_server(struct ldlm_namespace *ns) { LASSERT(ns != NULL); - LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | - LDLM_NAMESPACE_SERVER))); LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || ns->ns_client == LDLM_NAMESPACE_SERVER); return ns->ns_client == LDLM_NAMESPACE_SERVER; @@ -584,6 +605,9 @@ typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, /** Type for glimpse callback function of a lock. */ typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); +/** Type for created callback function of a lock. */ +typedef void (*ldlm_created_callback)(struct ldlm_lock *lock); + /** Work list for sending GL ASTs to multiple locks. */ struct ldlm_glimpse_work { struct ldlm_lock *gl_lock; /* lock to glimpse */ @@ -595,6 +619,11 @@ struct ldlm_glimpse_work { void *gl_interpret_data; }; +struct ldlm_bl_desc { + unsigned int bl_same_client:1, + bl_cos_incompat:1; +}; + struct ldlm_cb_set_arg { struct ptlrpc_request_set *set; int type; /* LDLM_{CP,BL,GL}_CALLBACK */ @@ -603,6 +632,7 @@ struct ldlm_cb_set_arg { union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ ptlrpc_interpterer_t gl_interpret_reply; void *gl_interpret_data; + struct ldlm_bl_desc *bl_desc; }; struct ldlm_cb_async_args { @@ -610,8 +640,8 @@ struct ldlm_cb_async_args { struct ldlm_lock *ca_lock; }; -/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ -#define LDLM_GL_WORK_NOFREE 0x1 +/** The ldlm_glimpse_work was slab allocated & must be freed accordingly.*/ +#define LDLM_GL_WORK_SLAB_ALLOCATED 0x1 /** Interval node data for each LDLM_EXTENT lock. */ struct ldlm_interval { @@ -634,6 +664,19 @@ struct ldlm_interval_tree { struct interval_node *lit_root; /* actual ldlm_interval */ }; +/** + * Lists of waiting locks for each inodebit type. + * A lock can be in several liq_waiting lists and it remains in lr_waiting. + */ +struct ldlm_ibits_queues { + struct list_head liq_waiting[MDS_INODELOCK_NUMBITS]; +}; + +struct ldlm_ibits_node { + struct list_head lin_link[MDS_INODELOCK_NUMBITS]; + struct ldlm_lock *lock; +}; + /** Whether to track references to exports by LDLM locks. */ #define LUSTRE_TRACKS_LOCK_EXP_REFS (0) @@ -724,14 +767,17 @@ struct ldlm_lock { struct list_head l_lru; /** * Linkage to resource's lock queues according to current lock state. - * (could be granted, waiting or converting) + * (could be granted or waiting) * Protected by lr_lock in struct ldlm_resource. */ struct list_head l_res_link; /** - * Tree node for ldlm_extent. + * Internal structures per lock type.. */ - struct ldlm_interval *l_tree_node; + union { + struct ldlm_interval *l_tree_node; + struct ldlm_ibits_node *l_ibits_node; + }; /** * Per export hash of locks. * Protected by per-bucket exp->exp_lock_hash locks. @@ -845,10 +891,13 @@ struct ldlm_lock { * the lock at client, e.g. enqueue the lock. For server it is the * time when blocking ast was sent. */ - time64_t l_activity; - time64_t l_blast_sent; + time64_t l_activity; + time64_t l_blast_sent; }; + /* separate ost_lvb used mostly by Data-on-MDT for now. 
+ * It is introduced to don't mix with layout lock data. */ + struct ost_lvb l_ost_lvb; /* * Server-side-only members. */ @@ -876,7 +925,7 @@ struct ldlm_lock { * under this lock. * \see ost_rw_prolong_locks */ - cfs_time_t l_callback_timeout; + time64_t l_callback_timeout; /** Local PID of process which created this lock. */ __u32 l_pid; @@ -928,6 +977,20 @@ struct ldlm_lock { struct list_head l_exp_list; }; +/** + * Describe the overlap between two locks. itree_overlap_cb data. + */ +struct ldlm_match_data { + struct ldlm_lock *lmd_old; + struct ldlm_lock *lmd_lock; + enum ldlm_mode *lmd_mode; + union ldlm_policy_data *lmd_policy; + __u64 lmd_flags; + __u64 lmd_skip_flags; + int lmd_unref; + bool lmd_has_ast_data; +}; + /** For uncommitted cross-MDT lock, store transno this lock belongs to */ #define l_transno l_client_cookie @@ -935,6 +998,15 @@ struct ldlm_lock { * which is for server. */ #define l_slc_link l_rk_ast +#define HANDLE_MAP_SIZE ((LMV_MAX_STRIPE_COUNT + 7) >> 3) + +struct lustre_handle_array { + unsigned int ha_count; + /* ha_map is used as bit flag to indicate handle is remote or local */ + char ha_map[HANDLE_MAP_SIZE]; + struct lustre_handle ha_handles[0]; +}; + /** * LDLM resource description. * Basically, resource is a representation for a single object. @@ -966,8 +1038,6 @@ struct ldlm_resource { * @{ */ /** List of locks in granted state */ struct list_head lr_granted; - /** List of locks waiting to change their granted mode (converted) */ - struct list_head lr_converting; /** * List of locks that could not be granted due to conflicts and * that are waiting for conflicts to go away */ @@ -977,16 +1047,21 @@ struct ldlm_resource { /** Resource name */ struct ldlm_res_id lr_name; - /** - * Interval trees (only for extent locks) for all modes of this resource - */ - struct ldlm_interval_tree *lr_itree; + union { + /** + * Interval trees (only for extent locks) for all modes of + * this resource + */ + struct ldlm_interval_tree *lr_itree; + struct ldlm_ibits_queues *lr_ibits_queues; + }; union { /** * When the resource was considered as contended, - * used only on server side. */ - cfs_time_t lr_contention_time; + * used only on server side. + */ + time64_t lr_contention_time; /** * Associated inode, used only on client side. */ @@ -1011,16 +1086,27 @@ struct ldlm_resource { struct lu_ref lr_reference; }; +static inline int ldlm_is_granted(struct ldlm_lock *lock) +{ + return lock->l_req_mode == lock->l_granted_mode; +} + static inline bool ldlm_has_layout(struct ldlm_lock *lock) { return lock->l_resource->lr_type == LDLM_IBITS && lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; } +static inline bool ldlm_has_dom(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM; +} + static inline char * ldlm_ns_name(struct ldlm_namespace *ns) { - return ns->ns_rs_hash->hs_name; + return ns->ns_name; } static inline struct ldlm_namespace * @@ -1127,10 +1213,11 @@ struct ldlm_enqueue_info { void *ei_cb_local_bl; /** blocking local lock callback */ void *ei_cb_cp; /** lock completion callback */ void *ei_cb_gl; /** lock glimpse callback */ + ldlm_created_callback ei_cb_created; /** lock created callback */ void *ei_cbdata; /** Data to be passed into callbacks. 
*/ void *ei_namespace; /** lock namespace **/ - unsigned int ei_enq_slave:1, /** whether enqueue slave stripes */ - ei_nonblock:1; /** non block enqueue */ + u64 ei_inodebits; /** lock inode bits **/ + unsigned int ei_enq_slave:1; /** whether enqueue slave stripes */ }; #define ei_res_id ei_cb_gl @@ -1203,21 +1290,21 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, * LDLM_PROCESS_RESCAN: * * It's used when policy functions are called from ldlm_reprocess_queue() to - * reprocess the wait & convert list and try to grant locks, blocking ASTs + * reprocess the wait list and try to grant locks, blocking ASTs * have already been sent in this situation, completion ASTs need be sent for * the locks being granted. * * LDLM_PROCESS_ENQUEUE: * * It's used when policy functions are called from ldlm_lock_enqueue() to - * process the wait & convert list for handling an enqueue request, blocking + * process the wait list for handling an enqueue request, blocking * ASTs have not been sent yet, so list of conflicting locks would be * collected and ASTs sent. * * LDLM_PROCESS_RECOVERY: * * It's used when policy functions are called from ldlm_reprocess_queue() to - * reprocess the wait & convert list when recovery done. In case of blocking + * reprocess the wait list when recovery done. In case of blocking * ASTs are lost before recovery, it needs not only to grant locks if * available, but also send blocking ASTs to the locks doesn't have AST sent * flag. Completion ASTs need be sent for the locks being granted. @@ -1233,6 +1320,12 @@ typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list); +typedef int (*ldlm_reprocessing_policy)(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); + /** * Return values for lock iterators. * Also used during deciding of lock grants and cancellations. 
@@ -1269,7 +1362,7 @@ struct ldlm_prolong_args { struct ldlm_res_id lpa_resid; struct ldlm_extent lpa_extent; enum ldlm_mode lpa_mode; - int lpa_timeout; + time64_t lpa_timeout; int lpa_locks_cnt; int lpa_blocks_cnt; }; @@ -1303,14 +1396,11 @@ int ldlm_glimpse_locks(struct ldlm_resource *res, * MDT or OST to pass through LDLM requests to LDLM for handling * @{ */ -int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback, - ldlm_blocking_callback, ldlm_glimpse_callback); int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, - const struct ldlm_request *dlm_req, - const struct ldlm_callback_suite *cbs); -int ldlm_handle_convert(struct ptlrpc_request *req); + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); int ldlm_handle_convert0(struct ptlrpc_request *req, - const struct ldlm_request *dlm_req); + const struct ldlm_request *dlm_req); int ldlm_handle_cancel(struct ptlrpc_request *req); int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, @@ -1318,10 +1408,10 @@ int ldlm_request_cancel(struct ptlrpc_request *req, /** @} ldlm_handlers */ void ldlm_revoke_export_locks(struct obd_export *exp); -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock); +time64_t ldlm_bl_timeout(struct ldlm_lock *lock); #endif int ldlm_del_waiting_lock(struct ldlm_lock *lock); -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout); int ldlm_get_ref(void); void ldlm_put_ref(void); int ldlm_init_export(struct obd_export *exp); @@ -1331,6 +1421,8 @@ struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); /* ldlm_lock.c */ #ifdef HAVE_SERVER_SUPPORT ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); +ldlm_reprocessing_policy +ldlm_get_reprocessing_policy(struct ldlm_resource *res); #endif void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); void ldlm_lock2handle(const struct ldlm_lock *lock, @@ -1366,9 +1458,11 @@ ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) * Update Lock Value Block Operations (LVBO) on a resource taking into account * data from request \a r */ -static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, - struct ptlrpc_request *req, int increase) +static inline int ldlm_lvbo_update(struct ldlm_resource *res, + struct ldlm_lock *lock, + struct ptlrpc_request *req, int increase) { + struct ldlm_namespace *ns = ldlm_res_to_ns(res); int rc; /* delayed lvb init may be required */ @@ -1378,14 +1472,21 @@ static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, return rc; } - if (ldlm_res_to_ns(res)->ns_lvbo && - ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { - return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req, - increase); - } + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update) + return ns->ns_lvbo->lvbo_update(res, lock, req, increase); + return 0; } +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, + int increase) +{ + return ldlm_lvbo_update(res, NULL, req, increase); +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock); + int ldlm_error2errno(enum ldlm_error error); enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this * confuses user-space. 
*/ @@ -1448,17 +1549,33 @@ void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); void ldlm_lock_fail_match(struct ldlm_lock *lock); void ldlm_lock_allow_match(struct ldlm_lock *lock); void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); -enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, - const struct ldlm_res_id *, enum ldlm_type type, - union ldlm_policy_data *, enum ldlm_mode mode, - struct lustre_handle *, int unref); +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + int unref); +static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, + __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + int unref) +{ + return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy, + mode, lh, unref); +} +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data); enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, __u64 *bits); -struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, - enum ldlm_mode new_mode, __u32 *flags); -void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); void ldlm_lock_cancel(struct ldlm_lock *lock); -void ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint); void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); @@ -1480,12 +1597,40 @@ void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client); void ldlm_namespace_get(struct ldlm_namespace *ns); void ldlm_namespace_put(struct ldlm_namespace *ns); -int ldlm_proc_setup(void); -#ifdef CONFIG_PROC_FS -void ldlm_proc_cleanup(void); -#else -static inline void ldlm_proc_cleanup(void) {} -#endif + +int ldlm_debugfs_setup(void); +void ldlm_debugfs_cleanup(void); + +static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + switch (lock_type) { + case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op != 0) + lprocfs_counter_incr(srv_stats, op); + + return; +} /* resource.c - internal */ struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, @@ -1555,7 +1700,8 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, enum ldlm_mode mode, __u64 *flags, void *lvb, __u32 lvb_len, const struct lustre_handle *lockh, int rc); -int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_type type, union ldlm_policy_data *policy, enum ldlm_mode mode, __u64 *flags, @@ -1565,8 +1711,9 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace 
*ns, void *data, __u32 lvb_len, enum lvb_type lvb_type, const __u64 *client_cookie, struct lustre_handle *lockh); -int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, - __u32 *flags); +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits); +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); int ldlm_cli_update_pool(struct ptlrpc_request *req); int ldlm_cli_cancel(const struct lustre_handle *lockh, enum ldlm_cancel_flags cancel_flags); @@ -1590,8 +1737,15 @@ int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, int ldlm_cli_cancel_list(struct list_head *head, int count, struct ptlrpc_request *req, enum ldlm_cancel_flags flags); + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); + /** @} ldlm_cli_api */ +extern unsigned int ldlm_enqueue_min; + /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. */ int intent_disposition(struct ldlm_reply *rep, int flag); @@ -1639,7 +1793,6 @@ void unlock_res_and_lock(struct ldlm_lock *lock); * There are not used outside of ldlm. * @{ */ -int ldlm_pools_recalc(enum ldlm_side client); int ldlm_pools_init(void); void ldlm_pools_fini(void); @@ -1648,7 +1801,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); void ldlm_pool_fini(struct ldlm_pool *pl); int ldlm_pool_setup(struct ldlm_pool *pl, int limit); -int ldlm_pool_recalc(struct ldlm_pool *pl); +time64_t ldlm_pool_recalc(struct ldlm_pool *pl); __u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); __u64 ldlm_pool_get_slv(struct ldlm_pool *pl); __u64 ldlm_pool_get_clv(struct ldlm_pool *pl); @@ -1673,5 +1826,7 @@ static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, return ex1->start <= ex2->start && ex1->end >= ex2->end; } +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); + #endif /** @} LDLM */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h index cab4e5f2f702a..9fdebcefe66a5 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -26,7 +26,7 @@ #ifndef LDLM_ALL_FLAGS_MASK /** l_flags bits marked as "all_flags" bits */ -#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC28F932FULL /** extent, mode, or resource changed */ #define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 @@ -44,7 +44,7 @@ /** * Server placed lock on conv list, or a recovering client wants the lock - * added to the conv list, no questions asked. */ + * added to the conv list, no questions asked. (obsoleted) */ #define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 #define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) #define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) @@ -58,6 +58,15 @@ #define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) #define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) +/** + * Lock request is speculative/asynchronous, and cannot wait for any reason. + * Fail the lock request if any blocking locks are encountered. 
+ * */ +#define LDLM_FL_SPECULATIVE 0x0000000000000010ULL /* bit 4 */ +#define ldlm_is_speculative(_l) LDLM_TEST_FLAG((_l), 1ULL << 4) +#define ldlm_set_speculative(_l) LDLM_SET_FLAG((_l), 1ULL << 4) +#define ldlm_clear_specualtive_(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 4) + /** blocking or cancel packet was queued for sending. */ #define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 #define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) @@ -138,6 +147,35 @@ #define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24) #define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24) +/* + * Flag indicates that lock is being converted (downgraded) during the blocking + * AST instead of cancelling. Used for IBITS locks now and drops conflicting + * bits only keepeing other. + */ +#define LDLM_FL_CONVERTING 0x0000000002000000ULL /* bit 25 */ +#define ldlm_is_converting(_l) LDLM_TEST_FLAG((_l), 1ULL << 25) +#define ldlm_set_converting(_l) LDLM_SET_FLAG((_l), 1ULL << 25) +#define ldlm_clear_converting(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 25) + +/** + * Part of original lockahead implementation, OBD_CONNECT_LOCKAHEAD_OLD. + * Reserved temporarily to allow those implementations to keep working. + * Will be removed after 2.12 release. + * */ +#define LDLM_FL_LOCKAHEAD_OLD_RESERVED 0x0000000010000000ULL /* bit 28 */ +#define ldlm_is_do_not_expand_io(_l) LDLM_TEST_FLAG((_l), 1ULL << 28) +#define ldlm_set_do_not_expand_io(_l) LDLM_SET_FLAG((_l), 1ULL << 28) +#define ldlm_clear_do_not_expand_io(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 28) + +/** + * Do not expand this lock. Grant it only on the extent requested. + * Used for manually requested locks from the client (LU_LADVISE_LOCKAHEAD). + * */ +#define LDLM_FL_NO_EXPANSION 0x0000000020000000ULL /* bit 29 */ +#define ldlm_is_do_not_expand(_l) LDLM_TEST_FLAG((_l), 1ULL << 29) +#define ldlm_set_do_not_expand(_l) LDLM_SET_FLAG((_l), 1ULL << 29) +#define ldlm_clear_do_not_expand(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 29) + /** * measure lock contention and return -EUSERS if locking contention is high */ #define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 @@ -354,26 +392,43 @@ #define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57) #define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57) +/** + * This flags means to use non-delay RPC to send dlm request RPC. + */ +#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */ +#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58) +#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58) + +/** + * LVB from this lock is cached in osc object + */ +#define LDLM_FL_LVB_CACHED 0x0800000000000000ULL /* bit 59 */ +#define ldlm_is_lvb_cached(_l) LDLM_TEST_FLAG((_l), 1ULL << 59) +#define ldlm_set_lvb_cached(_l) LDLM_SET_FLAG((_l), 1ULL << 59) +#define ldlm_clear_lvb_cached(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 59) + /** l_flags bits marked as "ast" bits */ #define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ - LDLM_FL_AST_DISCARD_DATA) + LDLM_FL_DISCARD_DATA) /** l_flags bits marked as "blocked" bits */ #define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ - LDLM_FL_BLOCK_CONV |\ LDLM_FL_BLOCK_WAIT) /** l_flags bits marked as "gone" bits */ #define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ LDLM_FL_FAILED) -/** l_flags bits marked as "inherit" bits */ -/* Flags inherited from wire on enqueue/reply between client/server. */ -/* NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. */ -/* TEST_LOCK flag to not let TEST lock to be granted. 
*/ +/** l_flags bits marked as "inherit" bits + * Flags inherited from wire on enqueue/reply between client/server. + * CANCEL_ON_BLOCK so server will not grant if a blocking lock is found + * NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. + * TEST_LOCK flag to not let TEST lock to be granted. + * NO_EXPANSION to tell server not to expand extent of lock request */ #define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ LDLM_FL_NO_TIMEOUT |\ - LDLM_FL_TEST_LOCK) + LDLM_FL_TEST_LOCK |\ + LDLM_FL_NO_EXPANSION) /** flags returned in @flags parameter on ldlm_lock_enqueue, * to be re-constructed on re-send */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h index 3061be1bc6124..03b9adc84897c 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h @@ -24,7 +24,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h similarity index 100% rename from drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h rename to drivers/staging/lustrefsx/lustre/include/lustre_errno.h diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h index 8552d3d1c00a7..5cf29e1a74d00 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_export.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,10 @@ * @{ */ +#include + #include -#include +#include #include struct mds_client_data; @@ -99,6 +101,13 @@ struct tg_export_data { long ted_grant; /* in bytes */ long ted_pending; /* bytes just being written */ __u8 ted_pagebits; /* log2 of client page size */ + + /** + * File Modification Data (FMD) tracking + */ + spinlock_t ted_fmd_lock; /* protects ted_fmd_list */ + struct list_head ted_fmd_list; /* FIDs being modified */ + int ted_fmd_count;/* items in ted_fmd_list */ }; /** @@ -119,13 +128,10 @@ struct ec_export_data { /* echo client */ /** Filter (oss-side) specific import data */ struct filter_export_data { struct tg_export_data fed_ted; - spinlock_t fed_lock; /**< protects fed_mod_list */ __u64 fed_lastid_gen; - struct list_head fed_mod_list; /* files being modified */ /* count of SOFT_SYNC RPCs, which will be reset after * ofd_soft_sync_limit number of RPCs, and trigger a sync. 
*/ atomic_t fed_soft_sync_count; - int fed_mod_count;/* items in fed_writing list */ __u32 fed_group; }; @@ -202,6 +208,8 @@ struct obd_export { struct obd_uuid exp_client_uuid; /** To link all exports on an obd device */ struct list_head exp_obd_chain; + /** work_struct for destruction of export */ + struct work_struct exp_zombie_work; /* Unlinked export list */ struct list_head exp_stale_list; struct hlist_node exp_uuid_hash; /** uuid-export hash*/ @@ -239,45 +247,44 @@ struct obd_export { /** Last committed transno for this export */ __u64 exp_last_committed; /** When was last request received */ - cfs_time_t exp_last_request_time; + time64_t exp_last_request_time; /** On replay all requests waiting for replay are linked here */ struct list_head exp_req_replay_queue; /** * protects exp_flags, exp_outstanding_replies and the change * of exp_imp_reverse */ - spinlock_t exp_lock; + spinlock_t exp_lock; /** Compatibility flags for this export are embedded into * exp_connect_data */ - struct obd_connect_data exp_connect_data; - enum obd_option exp_flags; - unsigned long exp_failed:1, - exp_in_recovery:1, - exp_disconnected:1, - exp_connecting:1, - /** VBR: export missed recovery */ - exp_delayed:1, - /** VBR: failed version checking */ - exp_vbr_failed:1, - exp_req_replay_needed:1, - exp_lock_replay_needed:1, - exp_need_sync:1, - exp_flvr_changed:1, - exp_flvr_adapt:1, - exp_libclient:1, /* liblustre client? */ - /* if to swap nidtbl entries for 2.2 clients. - * Only used by the MGS to fix LU-1644. */ - exp_need_mne_swab:1, - /* The export already got final replay ping - * request. */ - exp_replay_done:1; - /* also protected by exp_lock */ - enum lustre_sec_part exp_sp_peer; - struct sptlrpc_flavor exp_flvr; /* current */ - struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ - time64_t exp_flvr_expire[2]; /* seconds */ - - /** protects exp_hp_rpcs */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. 
*/ + exp_replay_done:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ spinlock_t exp_rpc_lock; struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ struct list_head exp_reg_rpcs; /* RPC being handled */ @@ -318,6 +325,18 @@ static inline __u64 exp_connect_flags(struct obd_export *exp) return *exp_connect_flags_ptr(exp); } +static inline __u64 *exp_connect_flags2_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags2; +} + +static inline __u64 exp_connect_flags2(struct obd_export *exp) +{ + if (exp_connect_flags(exp) & OBD_CONNECT_FLAGS2) + return *exp_connect_flags2_ptr(exp); + return 0; +} + static inline int exp_max_brw_size(struct obd_export *exp) { LASSERT(exp != NULL); @@ -332,13 +351,6 @@ static inline int exp_connect_multibulk(struct obd_export *exp) return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; } -static inline int exp_expired(struct obd_export *exp, cfs_duration_t age) -{ - LASSERT(exp->exp_delayed); - return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age), - cfs_time_current_sec()); -} - static inline int exp_connect_cancelset(struct obd_export *exp) { LASSERT(exp != NULL); @@ -407,6 +419,13 @@ static inline bool imp_connect_disp_stripe(struct obd_import *imp) return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; } +static inline bool imp_connect_shortio(struct obd_import *imp) +{ + struct obd_connect_data *ocd = &imp->imp_connect_data; + + return ocd->ocd_connect_flags & OBD_CONNECT_SHORTIO; +} + static inline __u64 exp_connect_ibits(struct obd_export *exp) { struct obd_connect_data *ocd; @@ -420,13 +439,50 @@ static inline int exp_connect_large_acl(struct obd_export *exp) return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); } +static inline int exp_connect_lockahead_old(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LOCKAHEAD_OLD); +} + +static inline int exp_connect_lockahead(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCKAHEAD); +} + +static inline int exp_connect_flr(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); +} + +static inline int exp_connect_lock_convert(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCK_CONVERT); +} + extern struct obd_export *class_conn2export(struct lustre_handle *conn); -extern struct obd_device *class_conn2obd(struct lustre_handle *conn); -#define KKUC_CT_DATA_MAGIC 0x092013cea +static inline int exp_connect_archive_id_array(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ARCHIVE_ID_ARRAY); +} + +static inline int exp_connect_sepol(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_SELINUX_POLICY); +} + +enum { + /* archive_ids in array format */ + KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, + /* archive_ids in bitmap format */ + KKUC_CT_DATA_BITMAP_MAGIC = 0x082018cea, +}; + + struct kkuc_ct_data { __u32 kcd_magic; - __u32 kcd_archive; + __u32 kcd_nr_archives; + __u32 kcd_archives[0]; }; /** @} export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h index 43d0c3419417d..ea6d743b1aaae 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -23,7 +23,7 
@@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -149,9 +149,9 @@ */ #include -#include -#include -#include +#include +#include +#include struct lu_env; struct lu_site; @@ -196,13 +196,6 @@ enum { LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) }; -enum { - /** 2^6 FIDs for OI containers */ - OSD_OI_FID_OID_BITS = 6, - /** reserve enough FIDs in case we want more in the future */ - OSD_OI_FID_OID_BITS_MAX = 10, -}; - /** special OID for local objects */ enum local_oid { /** \see fld_mod_init */ @@ -225,6 +218,7 @@ enum local_oid { OSD_LPF_OID = 19UL, REPLY_DATA_OID = 21UL, ACCT_PROJECT_OID = 22UL, + INDEX_BACKUP_OID = 4116UL, OFD_LAST_GROUP_OID = 4117UL, LLOG_CATALOGS_OID = 4118UL, MGS_CONFIGS_OID = 4119UL, @@ -350,10 +344,13 @@ static inline void filter_fid_cpu_to_le(struct filter_fid *dst, { fid_cpu_to_le(&dst->ff_parent, &src->ff_parent); - if (size < sizeof(struct filter_fid)) + if (size < sizeof(struct filter_fid)) { memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); - else + } else { ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = cpu_to_le32(src->ff_layout_version); + dst->ff_range = cpu_to_le32(src->ff_range); + } /* XXX: Add more if filter_fid is enlarged in the future. */ } @@ -363,10 +360,13 @@ static inline void filter_fid_le_to_cpu(struct filter_fid *dst, { fid_le_to_cpu(&dst->ff_parent, &src->ff_parent); - if (size < sizeof(struct filter_fid)) + if (size < sizeof(struct filter_fid)) { memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); - else + } else { ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = le32_to_cpu(src->ff_layout_version); + dst->ff_range = le32_to_cpu(src->ff_range); + } /* XXX: Add more if filter_fid is enlarged in the future. */ } @@ -416,8 +416,8 @@ struct lu_client_seq { */ struct lu_seq_range lcs_space; - /* Seq related proc */ - struct proc_dir_entry *lcs_proc_dir; + /* Seq related debugfs */ + struct dentry *lcs_debugfs_entry; /* This holds last allocated fid in last obtained seq */ struct lu_fid lcs_fid; @@ -427,7 +427,7 @@ struct lu_client_seq { /* * Service uuid, passed from MDT + seq name to form unique seq name to - * use it with procfs. + * use it with debugfs. */ char lcs_name[80]; @@ -463,8 +463,8 @@ struct lu_server_seq { /* /seq file object device */ struct dt_object *lss_obj; - /* Seq related proc */ - struct proc_dir_entry *lss_proc_dir; + /* Seq related debugfs */ + struct dentry *lss_debugfs_entry; /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ enum lu_mgr_type lss_type; @@ -477,7 +477,7 @@ struct lu_server_seq { /* * Service uuid, passed from MDT + seq name to form unique seq name to - * use it with procfs. + * use it with debugfs. */ char lss_name[80]; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h index 2f39962f8fb5e..102dcfac77480 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -38,7 +38,7 @@ * @{ */ -#include +#include #include #include @@ -67,9 +67,10 @@ struct lu_fld_target { }; struct lu_server_fld { - /** - * Fld dir proc entry. */ - struct proc_dir_entry *lsf_proc_dir; + /** + * Fld dir debugfs entry. 
+ */ + struct dentry *lsf_debugfs_entry; /** * /fld file object device */ @@ -108,8 +109,9 @@ struct lu_server_fld { struct lu_client_fld { /** - * Client side proc entry. */ - struct proc_dir_entry *lcf_proc_dir; + * Client side debugfs entry. + */ + struct dentry *lcf_debugfs_entry; /** * List of exports client FLD knows about. */ @@ -132,7 +134,8 @@ struct lu_client_fld { struct fld_cache *lcf_cache; /** - * Client fld proc entry name. */ + * Client fld debugfs entry name. + */ char lcf_name[80]; }; @@ -189,7 +192,7 @@ int fld_client_add_target(struct lu_client_fld *fld, int fld_client_del_target(struct lu_client_fld *fld, __u64 idx); -void fld_client_proc_fini(struct lu_client_fld *fld); +void fld_client_debugfs_fini(struct lu_client_fld *fld); /** @} fld */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h index 7c22d985af5a4..2cb4969b615bf 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -50,7 +50,7 @@ void ptlrpc_free_committed(struct obd_import *imp); void ptlrpc_wake_delayed(struct obd_import *imp); int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); int ptlrpc_set_import_active(struct obd_import *imp, int active); -void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full); void ptlrpc_deactivate_import(struct obd_import *imp); void ptlrpc_invalidate_import(struct obd_import *imp); void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h index 57a192359d118..a8c5a218b6c7d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h index 1b44d32393139..430fde2e92738 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_import.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -42,10 +42,15 @@ * * @{ */ - -#include -#include - +#include +#include +#include +#include +#include +#include +#include +#include +#include /** * Adaptive Timeout stuff @@ -101,19 +106,21 @@ enum lustre_imp_state { LUSTRE_IMP_RECOVER = 8, LUSTRE_IMP_FULL = 9, LUSTRE_IMP_EVICTED = 10, + LUSTRE_IMP_IDLE = 11, + LUSTRE_IMP_LAST }; /** Returns test string representation of numeric import state \a state */ static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) { - static char* import_state_names[] = { - "", "CLOSED", "NEW", "DISCONN", - "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", - "RECOVER", "FULL", "EVICTED", - }; - - LASSERT (state <= LUSTRE_IMP_EVICTED); - return import_state_names[state]; + static char *import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", "IDLE", + }; + + LASSERT(state < LUSTRE_IMP_LAST); + return import_state_names[state]; } /** @@ -140,9 +147,9 @@ struct obd_import_conn { /** uuid of remote side */ struct obd_uuid oic_uuid; /** - * Time (64 bit jiffies) of last connection attempt on this connection + * Time (64 bit seconds) of last connection attempt on this connection */ - __u64 oic_last_attempt; + time64_t oic_last_attempt; }; /* state history */ @@ -157,8 +164,6 @@ struct import_state_hist { * Imports are representing client-side view to remote target. */ struct obd_import { - /** Local handle (== id) for this import. */ - struct portals_handle imp_handle; /** Reference counter */ atomic_t imp_refcount; struct lustre_handle imp_dlm_handle; /* client's ldlm export */ @@ -168,8 +173,8 @@ struct obd_import { struct ptlrpc_client *imp_client; /** List element for linking into pinger chain */ struct list_head imp_pinger_chain; - /** List element for linking into chain for destruction */ - struct list_head imp_zombie_chain; + /** work struct for destruction of import */ + struct work_struct imp_zombie_work; /** * Lists of requests that are retained for replay, waiting for a reply, @@ -213,12 +218,17 @@ struct obd_import { /** Wait queue for those who need to wait for recovery completion */ wait_queue_head_t imp_recovery_waitq; + /** Number of requests allocated */ + atomic_t imp_reqs; /** Number of requests currently in-flight */ atomic_t imp_inflight; /** Number of requests currently unregistering */ atomic_t imp_unregistering; /** Number of replay requests inflight */ atomic_t imp_replay_inflight; + /** In-flight replays rate control */ + wait_queue_head_t imp_replay_waitq; + /** Number of currently happening import invalidations */ atomic_t imp_inval_count; /** Numbner of request timeouts */ @@ -232,6 +242,8 @@ struct obd_import { int imp_state_hist_idx; /** Current import generation. Incremented on every reconnect */ int imp_generation; + /** Idle connection initiated at this generation */ + int imp_initiated_at; /** Incremented every time we send reconnection request */ __u32 imp_conn_cnt; /** @@ -256,9 +268,9 @@ struct obd_import { */ struct lustre_handle imp_remote_handle; /** When to perform next ping. time in jiffies. 
*/ - cfs_time_t imp_next_ping; + time64_t imp_next_ping; /** When we last successfully connected. time in 64bit jiffies */ - __u64 imp_last_success_conn; + time64_t imp_last_success_conn; /** List of all possible connection for import. */ struct list_head imp_conn_list; @@ -283,9 +295,6 @@ struct obd_import { imp_server_timeout:1, /* VBR: imp in delayed recovery */ imp_delayed_recovery:1, - /* VBR: if gap was found then no lock replays - */ - imp_no_lock_replay:1, /* recovery by versions was failed */ imp_vbr_failed:1, /* force an immidiate ping */ @@ -298,30 +307,32 @@ struct obd_import { imp_resend_replay:1, /* disable normal recovery, for test only. */ imp_no_pinger_recover:1, -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* need IR MNE swab */ - imp_need_mne_swab:1, -#endif /* import must be reconnected instead of * chouse new connection */ imp_force_reconnect:1, /* import has tried to connect with server */ imp_connect_tried:1, /* connected but not FULL yet */ - imp_connected:1; - __u32 imp_connect_op; - struct obd_connect_data imp_connect_data; - __u64 imp_connect_flags_orig; - __u64 imp_connect_flags2_orig; - int imp_connect_error; - - __u32 imp_msg_magic; - /* adjusted based on server capability */ - __u32 imp_msghdr_flags; - - /* adaptive timeout data */ - struct imp_at imp_at; - time64_t imp_last_reply_time; /* for health check */ + imp_connected:1, + /* grant shrink disabled */ + imp_grant_shrink_disabled:1, + /* to supress LCONSOLE() at conn.restore */ + imp_was_idle:1; + u32 imp_connect_op; + u32 imp_idle_timeout; + u32 imp_idle_debug; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; + + enum lustre_msg_magic imp_msg_magic; + /* adjusted based on server capability */ + enum lustre_msghdr imp_msghdr_flags; + + /* adaptive timeout data */ + struct imp_at imp_at; + time64_t imp_last_reply_time; /* for health check */ }; /* import.c */ @@ -331,11 +342,11 @@ static inline unsigned int at_est2timeout(unsigned int val) return (val + (val >> 2) + 5); } -static inline unsigned int at_timeout2est(unsigned int val) +static inline timeout_t at_timeout2est(timeout_t timeout) { - /* restore estimate value from timeout: e=4/5(t-5) */ - LASSERT(val); - return (max((val << 2) / 5, 5U) - 4); + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(timeout > 0); + return max((timeout << 2) / 5, 5) - 4; } static inline void at_reset_nolock(struct adaptive_timeout *at, int val) @@ -381,7 +392,6 @@ extern unsigned int at_max; /* genops.c */ struct obd_export; extern struct obd_import *class_exp2cliimp(struct obd_export *); -extern struct obd_import *class_conn2cliimp(struct lustre_handle *); /** @} import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h index 4fc76566501ba..4af88af0edf87 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2015, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,7 @@ #define __LUSTRE_KERNELCOMM_H__ /* For declarations shared with userspace */ -#include +#include /* prototype for callback function on kuc groups */ typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h index 37f6ee1de49eb..11409b97e66c8 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * lustre/include/lustre_lfsck.h @@ -33,7 +33,7 @@ #ifndef _LUSTRE_LFSCK_H # define _LUSTRE_LFSCK_H -#include +#include #include #include #include @@ -101,10 +101,10 @@ int lfsck_query(const struct lu_env *env, struct dt_device *key, struct lfsck_request *req, struct lfsck_reply *rep, struct lfsck_query *que); -int lfsck_get_speed(struct seq_file *m, struct dt_device *key); +int lfsck_get_speed(struct seq_file *m, char *buf, struct dt_device *key); int lfsck_set_speed(struct dt_device *key, __u32 val); -int lfsck_get_windows(struct seq_file *m, struct dt_device *key); -int lfsck_set_windows(struct dt_device *key, int val); +int lfsck_get_windows(char *buf, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, unsigned int val); int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h index df1ca627aa4d0..f67791252056d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,11 +42,15 @@ * @{ */ -#include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif + #include -#include -#include -#include +#include +#include +#include /* target.c */ struct ptlrpc_request; @@ -69,7 +73,6 @@ int rev_import_init(struct obd_export *exp); int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); void target_destroy_export(struct obd_export *exp); -int target_handle_ping(struct ptlrpc_request *req); void target_committed_to_req(struct ptlrpc_request *req); void target_cancel_recovery_timer(struct obd_device *obd); void target_stop_recovery_thread(struct obd_device *obd); @@ -161,9 +164,9 @@ static inline int back_to_sleep(void *arg) #define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) struct l_wait_info { - cfs_duration_t lwi_timeout; - cfs_duration_t lwi_interval; - int lwi_allow_intr; + long lwi_timeout; + long lwi_interval; + int lwi_allow_intr; int (*lwi_on_timeout)(void *); void (*lwi_on_signal)(void *); void *lwi_cb_data; @@ -255,8 +258,8 @@ static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, #define __l_wait_event(wq, condition, info, ret, l_add_wait) \ do { \ wait_queue_entry_t __wait; \ - cfs_duration_t __timeout = info->lwi_timeout; \ - sigset_t __blocked; \ + long __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ int __allow_intr = info->lwi_allow_intr; \ \ ret = 0; \ @@ -305,13 +308,12 @@ do { \ if (__timeout == 0) { \ schedule(); \ } else { \ - cfs_duration_t interval = info->lwi_interval? \ - min_t(cfs_duration_t, \ - info->lwi_interval,__timeout):\ - __timeout; \ - cfs_duration_t remaining = schedule_timeout(interval); \ - __timeout = cfs_time_sub(__timeout, \ - cfs_time_sub(interval, remaining));\ + long interval = info->lwi_interval ? \ + min_t(long, info->lwi_interval,\ + __timeout) : __timeout; \ + long remaining = schedule_timeout(interval); \ + \ + __timeout -= interval - remaining; \ if (__timeout == 0) { \ if (info->lwi_on_timeout == NULL || \ info->lwi_on_timeout(info->lwi_cb_data)) { \ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h index 89a040f735d5d..3bf6e2b54fd9b 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2014, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * Use is subject to license terms. 
* * Author: di wang diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h index f936973801012..d5fb751524b0b 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -32,7 +32,7 @@ #ifndef _LUSTRE_LMV_H #define _LUSTRE_LMV_H -#include +#include struct lmv_oinfo { struct lu_fid lmo_fid; @@ -46,6 +46,8 @@ struct lmv_stripe_md { __u32 lsm_md_master_mdt_index; __u32 lsm_md_hash_type; __u32 lsm_md_layout_version; + __u32 lsm_md_migrate_offset; + __u32 lsm_md_migrate_hash; __u32 lsm_md_default_count; __u32 lsm_md_default_index; char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; @@ -64,6 +66,10 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || lsm1->lsm_md_layout_version != lsm2->lsm_md_layout_version || + lsm1->lsm_md_migrate_offset != + lsm2->lsm_md_migrate_offset || + lsm1->lsm_md_migrate_hash != + lsm2->lsm_md_migrate_hash || strcmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name) != 0) return false; @@ -76,14 +82,27 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) return true; } + +static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) +{ + int i; + + CDEBUG(mask, "magic %#x stripe count %d master mdt %d hash type %#x " + "version %d migrate offset %d migrate hash %#x pool %s\n", + lsm->lsm_md_magic, lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset, + lsm->lsm_md_migrate_hash, lsm->lsm_md_pool_name); + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + CDEBUG(mask, "stripe[%d] "DFID"\n", + i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); +} + union lmv_mds_md; void lmv_free_memmd(struct lmv_stripe_md *lsm); -int lmvea_load_shards(const struct lu_env *env, struct dt_object *obj, - struct lu_dirent *ent, struct lu_buf *buf, - bool resize); - static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, const struct lmv_mds_md_v1 *lmv_src) { @@ -141,18 +160,14 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, unsigned int stripe_count, const char *name, int namelen) { - int idx; - __u32 hash_type = lmv_hash_type & LMV_HASH_TYPE_MASK; + int idx; LASSERT(namelen > 0); - if (stripe_count <= 1) - return 0; - /* for migrating object, always start from 0 stripe */ - if (lmv_hash_type & LMV_HASH_FLAG_MIGRATION) + if (stripe_count <= 1) return 0; - switch (hash_type) { + switch (lmv_hash_type & LMV_HASH_TYPE_MASK) { case LMV_HASH_TYPE_ALL_CHARS: idx = lmv_hash_all_chars(stripe_count, name, namelen); break; @@ -164,8 +179,8 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, break; } - CDEBUG(D_INFO, "name %.*s hash_type %d idx %d\n", namelen, name, - hash_type, idx); + CDEBUG(D_INFO, "name %.*s hash_type %#x idx %d/%u\n", namelen, name, + lmv_hash_type, idx, stripe_count); return idx; } diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h index 237da21bf4210..f2522050f7337 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_log.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,9 +52,9 @@ */ #include -#include #include -#include +#include +#include #define LOG_NAME_LIMIT(logname, name) \ snprintf(logname, sizeof(logname), "LOGS/%s", name) @@ -160,6 +160,7 @@ int llog_cat_process_or_fork(const struct lu_env *env, int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data, int startcat, int startidx); __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); +__u32 llog_cat_free_space(struct llog_handle *cat_llh); int llog_cat_reverse_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data); @@ -170,8 +171,6 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); -int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags); /* llog_ioctl.c */ struct obd_ioctl_data; @@ -202,8 +201,6 @@ struct llog_operations { int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); - int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags); int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, struct llog_gen *gen, struct obd_uuid *uuid); /** @@ -271,8 +268,8 @@ struct llog_handle { * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx * will become its upper limit */ int lgh_last_idx; - int lgh_cur_idx; /* used during llog_process */ - __u64 lgh_cur_offset; /* used during llog_process */ + struct rw_semaphore lgh_last_sem; + __u64 lgh_cur_offset; /* used for test only */ struct llog_ctxt *lgh_ctxt; union { struct plain_handle_data phd; @@ -284,7 +281,7 @@ struct llog_handle { atomic_t lgh_refcount; int lgh_max_size; - __u32 lgh_stale:1; + bool lgh_destroyed; }; /* llog_osd.c */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h index be0eb7742e644..ac7b0d5f4a2f0 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include #include @@ -63,104 +63,6 @@ struct obd_export; struct ptlrpc_request; struct obd_device; -/** - * Serializes in-flight MDT-modifying RPC requests to preserve idempotency. - * - * This mutex is used to implement execute-once semantics on the MDT. - * The MDT stores the last transaction ID and result for every client in - * its last_rcvd file. If the client doesn't get a reply, it can safely - * resend the request and the MDT will reconstruct the reply being aware - * that the request has already been executed. Without this lock, - * execution status of concurrent in-flight requests would be - * overwritten. - * - * This design limits the extent to which we can keep a full pipeline of - * in-flight requests from a single client. This limitation could be - * overcome by allowing multiple slots per client in the last_rcvd file. - */ -struct mdc_rpc_lock { - /** Lock protecting in-flight RPC concurrency. */ - struct mutex rpcl_mutex; - /** Intent associated with currently executing request. 
*/ - struct lookup_intent *rpcl_it; - /** Used for MDS/RPC load testing purposes. */ - int rpcl_fakes; -}; - -#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) - -static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) -{ - mutex_init(&lck->rpcl_mutex); - lck->rpcl_it = NULL; -} - -static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) -{ - ENTRY; - - if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - return; - - /* This would normally block until the existing request finishes. - * If fail_loc is set it will block until the regular request is - * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set - * it will only be cleared when all fake requests are finished. - * Only when all fake requests are finished can normal requests - * be sent, to ensure they are recoverable again. */ - again: - mutex_lock(&lck->rpcl_mutex); - - if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { - lck->rpcl_it = MDC_FAKE_RPCL_IT; - lck->rpcl_fakes++; - mutex_unlock(&lck->rpcl_mutex); - return; - } - - /* This will only happen when the CFS_FAIL_CHECK() was - * just turned off but there are still requests in progress. - * Wait until they finish. It doesn't need to be efficient - * in this extremely rare case, just have low overhead in - * the common case when it isn't true. */ - while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { - mutex_unlock(&lck->rpcl_mutex); - schedule_timeout(cfs_time_seconds(1) / 4); - goto again; - } - - LASSERT(lck->rpcl_it == NULL); - lck->rpcl_it = it; -} - -static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) -{ - if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - goto out; - - if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ - mutex_lock(&lck->rpcl_mutex); - - LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); - lck->rpcl_fakes--; - - if (lck->rpcl_fakes == 0) - lck->rpcl_it = NULL; - - } else { - LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); - lck->rpcl_it = NULL; - } - - mutex_unlock(&lck->rpcl_mutex); - out: - EXIT; -} - static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req, struct lookup_intent *it) { diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h index c254c7f730f10..cb43281574890 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include @@ -60,13 +60,34 @@ struct mds_capa_info { struct lustre_capa_key *capa; }; +struct md_rejig_data { + struct md_object *mrd_obj; + __u16 mrd_mirror_id; +}; + #define MDD_OBD_NAME "mdd_obd" #define MDD_OBD_UUID "mdd_obd_uuid" -static inline int md_should_create(__u64 flags) +static inline int md_should_create(u64 open_flags) { - return !(flags & MDS_OPEN_DELAY_CREATE) && (flags & FMODE_WRITE) && - !(flags & MDS_OPEN_LEASE); + return !(open_flags & MDS_OPEN_DELAY_CREATE) && + (open_flags & MDS_FMODE_WRITE) && + !(open_flags & MDS_OPEN_LEASE); +} + +/* do NOT or the MAY_*'s, you'll get the weakest */ +static inline int mds_accmode(u64 open_flags) +{ + int res = 0; + + if (open_flags & MDS_FMODE_READ) + res |= MAY_READ; + if (open_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC | MDS_OPEN_APPEND)) + res |= MAY_WRITE; + if (open_flags & 
MDS_FMODE_EXEC) + res = MAY_EXEC; + + return res; } /** @} mds */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h index f6d67c832ed64..3a94a921e11de 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_net.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -50,12 +50,13 @@ * * @{ */ - +#include #include #include -#include #include -#include +#include +#include +#include #include #include #include @@ -63,7 +64,7 @@ #include #include #include -#include +#include /* MD flags we _always_ use */ #define PTLRPC_MD_OPTIONS 0 @@ -75,7 +76,7 @@ * value. The client is free to limit the actual RPC size for any bulk * transfer via cl_max_pages_per_rpc to some non-power-of-two value. * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ -#define PTLRPC_BULK_OPS_BITS 4 +#define PTLRPC_BULK_OPS_BITS 6 #if PTLRPC_BULK_OPS_BITS > 16 #error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." #endif @@ -472,19 +473,31 @@ * - single object with 16 pages is 512 bytes * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover * - Must be a multiple of 1024 - * - actual size is about 18K */ -#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \ - sizeof(struct ptlrpc_body) + \ - sizeof(struct obdo) + \ - sizeof(struct obd_ioobj) + \ - sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES) +#define _OST_MAXREQSIZE_BASE ((unsigned long)(sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote))) +#define _OST_MAXREQSIZE_SUM ((unsigned long)(_OST_MAXREQSIZE_BASE + \ + sizeof(struct niobuf_remote) * \ + (DT_MAX_BRW_PAGES - 1))) /** * FIEMAP request can be 4K+ for now */ -#define OST_MAXREQSIZE (16 * 1024) -#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \ - (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1)) +#define OST_MAXREQSIZE (16UL * 1024UL) +#define OST_IO_MAXREQSIZE max(OST_MAXREQSIZE, \ + ((_OST_MAXREQSIZE_SUM - 1) | \ + (1024UL - 1)) + 1) +/* Safe estimate of free space in standard RPC, provides upper limit for # of + * bytes of i/o to pack in RPC (skipping bulk transfer). */ +#define OST_SHORT_IO_SPACE (OST_IO_MAXREQSIZE - _OST_MAXREQSIZE_BASE) + +/* Actual size used for short i/o buffer. Calculation means this: + * At least one page (for large PAGE_SIZE), or 16 KiB, but not more + * than the available space aligned to a page boundary. */ +#define OBD_MAX_SHORT_IO_BYTES min(max(PAGE_SIZE, 16UL * 1024UL), \ + OST_SHORT_IO_SPACE & PAGE_MASK) #define OST_MAXREPSIZE (9 * 1024) #define OST_IO_MAXREPSIZE OST_MAXREPSIZE @@ -498,6 +511,7 @@ */ #define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + /* Macro to hide a typecast. 
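The OST_SHORT_IO_SPACE/OBD_MAX_SHORT_IO_BYTES macros above clamp the short-I/O payload to "at least a page or 16 KiB, but no more than the spare request space rounded down to a page boundary". A small sketch of that calculation with explicit arguments; the function name is made up, and the mask follows the usual ~(page_size - 1) convention.

/* space     : bytes left in the request after the fixed RPC overhead
 * page_size : PAGE_SIZE of the node
 * returns   : bytes usable for inline (short) I/O */
static unsigned long short_io_bytes(unsigned long space,
                                    unsigned long page_size)
{
        unsigned long want = page_size > 16384UL ? page_size : 16384UL;
        unsigned long aligned = space & ~(page_size - 1);

        return want < aligned ? want : aligned;
}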
*/ #define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) @@ -552,7 +566,6 @@ union ptlrpc_async_args { }; struct ptlrpc_request_set; -typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); /** @@ -574,19 +587,8 @@ struct ptlrpc_request_set { atomic_t set_remaining; /** wait queue to wait on for request events */ wait_queue_head_t set_waitq; - wait_queue_head_t *set_wakeup_ptr; /** List of requests in the set */ struct list_head set_requests; - /** - * List of completion callbacks to be called when the set is completed - * This is only used if \a set_interpret is NULL. - * Links struct ptlrpc_set_cbdata. - */ - struct list_head set_cblist; - /** Completion callback, if only one. */ - set_interpreter_func set_interpret; - /** opaq argument passed to completion \a set_interpret callback. */ - void *set_arg; /** * Lock for \a set_new_requests manipulations * locked so that any old caller can communicate requests to @@ -608,18 +610,6 @@ struct ptlrpc_request_set { unsigned int set_allow_intr:1; }; -/** - * Description of a single ptrlrpc_set callback - */ -struct ptlrpc_set_cbdata { - /** List linkage item */ - struct list_head psc_item; - /** Pointer to interpreting function */ - set_interpreter_func psc_interpret; - /** Opaq argument to pass to the callback */ - void *psc_data; -}; - struct ptlrpc_bulk_desc; struct ptlrpc_service_part; struct ptlrpc_service; @@ -784,9 +774,9 @@ struct ptlrpc_cli_req { /** For bulk requests on client only: bulk descriptor */ struct ptlrpc_bulk_desc *cr_bulk; /** optional time limit for send attempts */ - cfs_duration_t cr_delay_limit; + time64_t cr_delay_limit; /** time request was first queued */ - cfs_time_t cr_queued_time; + time64_t cr_queued_time; /** request sent in nanoseconds */ ktime_t cr_sent_ns; /** time for request really sent out */ @@ -1059,6 +1049,13 @@ struct ptlrpc_request { /** description of flavors for client & server */ struct sptlrpc_flavor rq_flvr; + /** + * SELinux policy info at the time of the request + * sepol string format is: + * ::: + */ + char rq_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; + /* client/server security flags */ unsigned int rq_ctx_init:1, /* context initiation */ @@ -1115,8 +1112,17 @@ struct ptlrpc_request { /** * service time estimate (secs) * If the request is not served by this time, it is marked as timed out. + * Do not change to time64_t since this is transmitted over the wire. + * + * The linux kernel handles timestamps with time64_t and timeouts + * are normally done with jiffies. Lustre shares the rq_timeout between + * nodes. Since jiffies can vary from node to node Lustre instead + * will express the timeout value in seconds. To avoid confusion with + * timestamps (time64_t) and jiffy timeouts (long) Lustre timeouts + * are expressed in s32 (timeout_t). Also what is transmitted over + * the wire is 32 bits. 
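As the rq_timeout comment above explains, the value carried on the wire is seconds in a signed 32-bit field (timeout_t), while each node keeps its own jiffies. A hedged sketch of the two conversions that implies, assuming only HZ from <linux/jiffies.h>; timeout_t is re-declared locally for the sketch and the helper names are illustrative, not Lustre API.

#include <linux/jiffies.h>
#include <linux/types.h>

typedef s32 timeout_t;  /* seconds; what actually crosses the wire */

static timeout_t timeout_to_wire(unsigned long jiffies_left)
{
        return (timeout_t)(jiffies_left / HZ);  /* local jiffies -> seconds */
}

static unsigned long timeout_from_wire(timeout_t seconds)
{
        return (unsigned long)seconds * HZ;     /* seconds -> this node's jiffies */
}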
*/ - int rq_timeout; + timeout_t rq_timeout; /** * when request/reply sent (secs), or time when request should be sent */ @@ -1173,37 +1179,37 @@ static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) /** @} nrs */ /** - * Returns 1 if request buffer at offset \a index was already swabbed + * Returns true if request buffer at offset \a index was already swabbed */ -static inline int lustre_req_swabbed(struct ptlrpc_request *req, size_t index) +static inline bool lustre_req_swabbed(struct ptlrpc_request *req, size_t index) { - LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); - return req->rq_req_swab_mask & (1 << index); + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); } /** - * Returns 1 if request reply buffer at offset \a index was already swabbed + * Returns true if request reply buffer at offset \a index was already swabbed */ -static inline int lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) +static inline bool lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) { - LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); - return req->rq_rep_swab_mask & (1 << index); + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); } /** - * Returns 1 if request needs to be swabbed into local cpu byteorder + * Returns true if request needs to be swabbed into local cpu byteorder */ -static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +static inline bool ptlrpc_req_need_swab(struct ptlrpc_request *req) { - return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); } /** - * Returns 1 if request reply needs to be swabbed into local cpu byteorder + * Returns true if request reply needs to be swabbed into local cpu byteorder */ -static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +static inline bool ptlrpc_rep_need_swab(struct ptlrpc_request *req) { - return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); } /** @@ -1438,6 +1444,8 @@ extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops; * Another user is readpage for MDT. 
*/ struct ptlrpc_bulk_desc { + /** number MD's assigned including zero-sends */ + unsigned int bd_refs; /** completed with failure */ unsigned long bd_failure:1; /** client side */ @@ -1462,6 +1470,7 @@ struct ptlrpc_bulk_desc { int bd_max_iov; /* allocated size of bd_iov */ int bd_nob; /* # bytes covered */ int bd_nob_transferred; /* # bytes GOT/PUT */ + unsigned int bd_nob_last; /* # bytes in last MD */ __u64 bd_last_mbits; @@ -1469,6 +1478,8 @@ struct ptlrpc_bulk_desc { lnet_nid_t bd_sender; /* stash event::sender */ int bd_md_count; /* # valid entries in bd_mds */ int bd_md_max_brw; /* max entries in bd_mds */ + /** array of offsets for each MD */ + unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; /** array of associated MDs */ struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; @@ -1693,8 +1704,8 @@ struct ptlrpc_service { int srv_nthrs_cpt_init; /** limit of threads number for each partition */ int srv_nthrs_cpt_limit; - /** Root of /proc dir tree for this service */ - struct proc_dir_entry *srv_procroot; + /** Root of debugfs dir tree for this service */ + struct dentry *srv_debugfs_entry; /** Pointer to statistic data for this service */ struct lprocfs_stats *srv_stats; /** # hp per lp reqs to handle */ @@ -1720,17 +1731,25 @@ struct ptlrpc_service { int srv_watchdog_factor; /** under unregister_service */ unsigned srv_is_stopping:1; + /** Whether or not to restrict service threads to CPUs in this CPT */ + unsigned srv_cpt_bind:1; + /** max # request buffers */ + int srv_nrqbds_max; /** max # request buffers in history per partition */ int srv_hist_nrqbds_cpt_max; - /** number of CPTs this service bound on */ + /** number of CPTs this service associated with */ int srv_ncpts; - /** CPTs array this service bound on */ + /** CPTs array this service associated with */ __u32 *srv_cpts; /** 2^srv_cptab_bits >= cfs_cpt_numbert(srv_cptable) */ int srv_cpt_bits; /** CPT table this service is running over */ struct cfs_cpt_table *srv_cptable; + + /* sysfs object */ + struct kobject srv_kobj; + struct completion srv_kobj_unregister; /** * partition data for ptlrpc service */ @@ -1777,6 +1796,8 @@ struct ptlrpc_service_part { * threads starting & stopping are also protected by this lock. */ spinlock_t scp_lock __cfs_cacheline_aligned; + /** userland serialization */ + struct mutex scp_mutex; /** total # req buffer descs allocated */ int scp_nrqbds_total; /** # posted request buffers for receiving */ @@ -1791,8 +1812,8 @@ struct ptlrpc_service_part { struct list_head scp_rqbd_posted; /** incoming reqs */ struct list_head scp_req_incoming; - /** timeout before re-posting reqs, in tick */ - cfs_duration_t scp_rqbd_timeout; + /** timeout before re-posting reqs, in jiffies */ + long scp_rqbd_timeout; /** * all threads sleep on this. This wait-queue is signalled when new * incoming request arrives and when difficult reply has to be handled. 
@@ -1843,7 +1864,7 @@ struct ptlrpc_service_part { /** early reply timer */ struct timer_list scp_at_timer; /** debug */ - cfs_time_t scp_at_checktime; + ktime_t scp_at_checktime; /** check early replies */ unsigned scp_at_check; /** @} */ @@ -2061,7 +2082,7 @@ static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); spin_lock(&desc->bd_lock); - rc = desc->bd_md_count; + rc = desc->bd_refs; spin_unlock(&desc->bd_lock); return rc; } @@ -2078,14 +2099,15 @@ static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) LASSERT(req != NULL); desc = req->rq_bulk; + if (!desc) + return 0; + if (req->rq_bulk_deadline > ktime_get_real_seconds()) return 1; - if (!desc) - return 0; spin_lock(&desc->bd_lock); - rc = desc->bd_md_count; + rc = desc->bd_refs; spin_unlock(&desc->bd_lock); return rc; } @@ -2125,10 +2147,8 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set); struct ptlrpc_request_set *ptlrpc_prep_set(void); struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, void *arg); -int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, - set_interpreter_func fn, void *data); int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); -int ptlrpc_set_wait(struct ptlrpc_request_set *); +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *); void ptlrpc_mark_interrupted(struct ptlrpc_request *req); void ptlrpc_set_destroy(struct ptlrpc_request_set *); void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); @@ -2245,8 +2265,8 @@ struct ptlrpc_service_thr_conf { /* user specified threads number, it will be validated due to * other members of this structure. */ unsigned int tc_nthrs_user; - /* set NUMA node affinity for service threads */ - unsigned int tc_cpu_affinity; + /* bind service threads to only CPUs in their associated CPT */ + unsigned int tc_cpu_bind; /* Tags for lu_context associated with service thread */ __u32 tc_ctx_tags; }; @@ -2255,6 +2275,8 @@ struct ptlrpc_service_cpt_conf { struct cfs_cpt_table *cc_cptable; /* string pattern to describe CPTs for a service */ char *cc_pattern; + /* whether or not to have per-CPT service partitions */ + bool cc_affinity; }; struct ptlrpc_service_conf { @@ -2287,18 +2309,18 @@ void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); int ptlrpc_hpreq_handler(struct ptlrpc_request *req); struct ptlrpc_service *ptlrpc_register_service( struct ptlrpc_service_conf *conf, - struct proc_dir_entry *proc_entry); + struct kset *parent, + struct dentry *debugfs_entry); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_threads(struct ptlrpc_service *svc); int ptlrpc_unregister_service(struct ptlrpc_service *service); -int liblustre_check_services(void *arg); -void ptlrpc_daemonize(char *name); int ptlrpc_service_health_check(struct ptlrpc_service *); void ptlrpc_server_drop_request(struct ptlrpc_request *req); void ptlrpc_request_change_export(struct ptlrpc_request *req, struct obd_export *export); -void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay); +void ptlrpc_update_export_timer(struct obd_export *exp, + time64_t extra_delay); int ptlrpc_hr_init(void); void ptlrpc_hr_fini(void); @@ -2311,8 +2333,10 @@ void ptlrpc_hr_fini(void); * @{ */ int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_connect_import_locked(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp); int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int 
ptlrpc_disconnect_and_idle_import(struct obd_import *imp); int ptlrpc_import_recovery_state_machine(struct obd_import *imp); void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len); @@ -2326,8 +2350,14 @@ int ptlrpc_reconnect_import(struct obd_import *imp); * * @{ */ -int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - __u32 index); +#define PTLRPC_MAX_BUFCOUNT \ + (sizeof(((struct ptlrpc_request *)0)->rq_req_swab_mask) * 8) +#define MD_MAX_BUFLEN (MDS_REG_MAXREQSIZE > OUT_MAXREQSIZE ? \ + MDS_REG_MAXREQSIZE : OUT_MAXREQSIZE) +#define PTLRPC_MAX_BUFLEN (OST_IO_MAXREQSIZE > MD_MAX_BUFLEN ? \ + OST_IO_MAXREQSIZE : MD_MAX_BUFLEN) +bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index); void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, __u32 index); int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); @@ -2370,7 +2400,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg); void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); __u32 lustre_msg_get_type(struct lustre_msg *msg); -__u32 lustre_msg_get_version(struct lustre_msg *msg); +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg); void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); __u32 lustre_msg_get_opc(struct lustre_msg *msg); __u64 lustre_msg_get_last_xid(struct lustre_msg *msg); @@ -2385,8 +2415,8 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); int lustre_msg_get_status(struct lustre_msg *msg); __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); __u32 lustre_msg_get_magic(struct lustre_msg *msg); -__u32 lustre_msg_get_timeout(struct lustre_msg *msg); -__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg); +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg); char *lustre_msg_get_jobid(struct lustre_msg *msg); __u32 lustre_msg_get_cksum(struct lustre_msg *msg); __u64 lustre_msg_get_mbits(struct lustre_msg *msg); @@ -2403,8 +2433,9 @@ void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); void ptlrpc_request_set_replen(struct ptlrpc_request *req); -void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); -void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout); +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout); void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); @@ -2588,11 +2619,8 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) { if (req->rq_delay_limit != 0 && - cfs_time_before(cfs_time_add(req->rq_queued_time, - cfs_time_seconds(req->rq_delay_limit)), - cfs_time_current())) { + req->rq_queued_time + req->rq_delay_limit < ktime_get_seconds()) return 1; - } return 0; } @@ -2659,11 +2687,6 @@ struct timeout_item; typedef int (*timeout_cb_t)(struct timeout_item *, void *); int ptlrpc_pinger_add_import(struct obd_import *imp); int ptlrpc_pinger_del_import(struct obd_import *imp); 
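For reference, the rewritten ptlrpc_send_limit_expired() above reduces to a plain seconds comparison once cr_queued_time and cr_delay_limit are kept as time64_t. A stripped-down model of just that check, with the field values passed in directly; ktime_get_seconds() comes from the standard timekeeping headers.

#include <linux/ktime.h>
#include <linux/types.h>

static int send_limit_expired(time64_t queued_time, time64_t delay_limit)
{
        /* 0 means "no limit"; otherwise expire once the deadline has passed. */
        return delay_limit != 0 &&
               queued_time + delay_limit < ktime_get_seconds();
}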
-int ptlrpc_add_timeout_client(int time, enum timeout_event event, - timeout_cb_t cb, void *data, - struct list_head *obd_list); -int ptlrpc_del_timeout_client(struct list_head *obd_list, - enum timeout_event event); struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); int ptlrpc_obd_ping(struct obd_device *obd); void ping_evictor_start(void); @@ -2702,11 +2725,9 @@ static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} /* ptlrpc/llog_server.c */ int llog_origin_handle_open(struct ptlrpc_request *req); -int llog_origin_handle_destroy(struct ptlrpc_request *req); int llog_origin_handle_prev_block(struct ptlrpc_request *req); int llog_origin_handle_next_block(struct ptlrpc_request *req); int llog_origin_handle_read_header(struct ptlrpc_request *req); -int llog_origin_handle_close(struct ptlrpc_request *req); /* ptlrpc/llog_client.c */ extern struct llog_operations llog_client_ops; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h index 7cabc6f2424d7..9d200bf651b64 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -21,13 +21,16 @@ */ /* * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2017, Intel Corporation. + * * Author: Joshua Walgenbach */ #ifndef _LUSTRE_NODEMAP_H #define _LUSTRE_NODEMAP_H -#include +#include #define LUSTRE_NODEMAP_NAME "nodemap" @@ -73,7 +76,8 @@ struct lu_nodemap { nmf_deny_unknown:1, nmf_allow_root_access:1, nmf_map_uid_only:1, - nmf_map_gid_only:1; + nmf_map_gid_only:1, + nmf_enable_audit:1; /* unique ID set by MGS */ unsigned int nm_id; /* nodemap ref counter */ @@ -102,6 +106,8 @@ struct lu_nodemap { struct nodemap_pde *nm_pde_data; /* fileset the nodes of this nodemap are restricted to */ char nm_fileset[PATH_MAX+1]; + /* information about the expected SELinux policy on the nodes */ + char nm_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; /* used when loading/unloading nodemaps */ struct list_head nm_list; @@ -132,6 +138,7 @@ int nodemap_set_deny_unknown(const char *name, bool deny_unknown); int nodemap_set_mapping_mode(const char *name, enum nodemap_mapping_modes mode); int nodemap_set_squash_uid(const char *name, uid_t uid); int nodemap_set_squash_gid(const char *name, gid_t gid); +int nodemap_set_audit_mode(const char *name, bool enable_audit); bool nodemap_can_setquota(const struct lu_nodemap *nodemap); int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, const __u32 map[2]); @@ -139,6 +146,8 @@ int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, const __u32 map[2]); int nodemap_set_fileset(const char *name, const char *fileset); char *nodemap_get_fileset(const struct lu_nodemap *nodemap); +int nodemap_set_sepol(const char *name, const char *sepol); +const char *nodemap_get_sepol(const struct lu_nodemap *nodemap); __u32 nodemap_map_id(struct lu_nodemap *nodemap, enum nodemap_id_type id_type, enum nodemap_tree_type tree_type, __u32 id); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h index 6e0c736ab8d87..0a407197c36f6 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h @@ -51,7 +51,31 @@ struct nrs_tbf_jobid { struct list_head tj_linkage; }; -#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + 3 + 2) +#define MAX_U32_STR_LEN 10 
+#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + \ + MAX_U32_STR_LEN + MAX_U32_STR_LEN + 3 + 2) + +enum nrs_tbf_flag { + NRS_TBF_FLAG_INVALID = 0x0000000, + NRS_TBF_FLAG_JOBID = 0x0000001, + NRS_TBF_FLAG_NID = 0x0000002, + NRS_TBF_FLAG_OPCODE = 0x0000004, + NRS_TBF_FLAG_GENERIC = 0x0000008, + NRS_TBF_FLAG_UID = 0x0000010, + NRS_TBF_FLAG_GID = 0x0000020, +}; + +struct tbf_id { + enum nrs_tbf_flag ti_type; + u32 ti_uid; + u32 ti_gid; +}; + +struct nrs_tbf_id { + struct tbf_id nti_id; + struct list_head nti_linkage; +}; + struct nrs_tbf_client { /** Resource object for policy instance. */ struct ptlrpc_nrs_resource tc_res; @@ -63,6 +87,8 @@ struct nrs_tbf_client { char tc_jobid[LUSTRE_JOBID_SIZE]; /** opcode of the client. */ __u32 tc_opcode; + /** gid or uid of the client. */ + struct tbf_id tc_id; /** Hash key of the client. */ char tc_key[NRS_TBF_KEY_LEN]; /** Reference number of the client. */ @@ -85,6 +111,13 @@ struct nrs_tbf_client { __u64 tc_depth; /** Time check-point. */ __u64 tc_check_time; + /** Deadline of a class */ + __u64 tc_deadline; + /** + * Time residue: the remainder of elapsed time + * divided by nsecs when dequeue a request. + */ + __u64 tc_nsecs_resid; /** List of queued requests. */ struct list_head tc_list; /** Node in binary heap. */ @@ -102,8 +135,11 @@ struct nrs_tbf_client { #define MAX_TBF_NAME (16) -#define NTRS_STOPPING 0x0000001 -#define NTRS_DEFAULT 0x0000002 +enum nrs_rule_flags { + NTRS_STOPPING = 0x00000001, + NTRS_DEFAULT = 0x00000002, + NTRS_REALTIME = 0x00000004, +}; struct nrs_tbf_rule { /** Name of the rule. */ @@ -120,6 +156,10 @@ struct nrs_tbf_rule { struct list_head tr_jobids; /** Jobid list string of the rule.*/ char *tr_jobids_str; + /** uid/gid list of the rule. */ + struct list_head tr_ids; + /** uid/gid list string of the rule. */ + char *tr_ids_str; /** Opcode bitmap of the rule. */ struct cfs_bitmap *tr_opcodes; /** Opcode list string of the rule.*/ @@ -139,7 +179,7 @@ struct nrs_tbf_rule { /** List of client. */ struct list_head tr_cli_list; /** Flags of the rule. */ - __u32 tr_flags; + enum nrs_rule_flags tr_flags; /** Usage Reference count taken on the rule. */ atomic_t tr_ref; /** Generation of the rule. 
*/ @@ -168,16 +208,10 @@ struct nrs_tbf_ops { #define NRS_TBF_TYPE_NID "nid" #define NRS_TBF_TYPE_OPCODE "opcode" #define NRS_TBF_TYPE_GENERIC "generic" +#define NRS_TBF_TYPE_UID "uid" +#define NRS_TBF_TYPE_GID "gid" #define NRS_TBF_TYPE_MAX_LEN 20 -enum nrs_tbf_flag { - NRS_TBF_FLAG_INVALID = 0x0000000, - NRS_TBF_FLAG_JOBID = 0x0000001, - NRS_TBF_FLAG_NID = 0x0000002, - NRS_TBF_FLAG_OPCODE = 0x0000004, - NRS_TBF_FLAG_GENERIC = 0x0000008, -}; - struct nrs_tbf_type { const char *ntt_name; enum nrs_tbf_flag ntt_flag; @@ -270,12 +304,14 @@ struct nrs_tbf_cmd { char *ts_nids_str; struct list_head ts_jobids; char *ts_jobids_str; + struct list_head ts_ids; + char *ts_ids_str; struct cfs_bitmap *ts_opcodes; char *ts_opcodes_str; struct list_head ts_conds; char *ts_conds_str; __u32 ts_valid_type; - __u32 ts_rule_flags; + enum nrs_rule_flags ts_rule_flags; char *ts_next_name; } tc_start; struct nrs_tbf_cmd_change { @@ -289,6 +325,8 @@ enum nrs_tbf_field { NRS_TBF_FIELD_NID, NRS_TBF_FIELD_JOBID, NRS_TBF_FIELD_OPCODE, + NRS_TBF_FIELD_UID, + NRS_TBF_FIELD_GID, NRS_TBF_FIELD_MAX }; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h index d3afac961b043..dd99eee5af714 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h @@ -35,7 +35,7 @@ #ifndef _LUSTRE_OBDO_H_ #define _LUSTRE_OBDO_H_ -#include +#include /** * Create an obdo to send over the wire diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h similarity index 52% rename from drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h rename to drivers/staging/lustrefsx/lustre/include/lustre_osc.h index 7e6cbc017dfde..f865036f897cf 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h @@ -23,35 +23,99 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. */ /* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. + * lustre/include/lustre_osc.h * - * Internal interfaces of OSC layer. + * OSC layer structures and methods common for both OSC and MDC. + * + * This file contains OSC interfaces used by OSC and MDC. Most of them + * were just moved from lustre/osc/osc_cl_internal.h for Data-on-MDT + * purposes. 
* * Author: Nikita Danilov * Author: Jinshan Xiong + * Author: Mikhail Pershin */ -#ifndef OSC_CL_INTERNAL_H -#define OSC_CL_INTERNAL_H +#ifndef LUSTRE_OSC_H +#define LUSTRE_OSC_H #include #include -/* osc_build_res_name() */ #include -#include "osc_internal.h" /** \defgroup osc osc * @{ */ +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + __u32 oqi_id; +}; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return container_of(pga, struct osc_async_page, oap_brw_page); +} + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + time64_t od_contention_time; + int od_lockless_truncate; +}; + struct osc_extent; /** @@ -63,7 +127,9 @@ struct osc_io { /** true if this io is lockless. */ unsigned int oi_lockless:1, /** true if this io is counted as active IO */ - oi_is_active:1; + oi_is_active:1, + /** true if this io has CAP_SYS_RESOURCE */ + oi_cap_sys_resource:1; /** how many LRU pages are reserved for this IO */ unsigned long oi_lru_reserved; @@ -78,8 +144,8 @@ struct osc_io { struct obdo oi_oa; struct osc_async_cbargs { bool opc_rpc_sent; - int opc_rc; - struct completion opc_sync; + int opc_rc; + struct completion opc_sync; } oi_cbarg; }; @@ -87,7 +153,7 @@ struct osc_io { * State maintained by osc layer for the duration of a system call. */ struct osc_session { - struct osc_io os_io; + struct osc_io os_io; }; #define OTI_PVEC_SIZE 256 @@ -99,6 +165,7 @@ struct osc_thread_info { struct lustre_handle oti_handle; struct cl_page_list oti_plist; struct cl_io oti_io; + struct pagevec oti_pagevec; void *oti_pvec[OTI_PVEC_SIZE]; /** * Fields used by cl_lock_discard_pages(). 
@@ -110,21 +177,88 @@ struct osc_thread_info { struct lu_buf oti_ladvise_buf; }; +static inline __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags); + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_GLIMPSE) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + if (enqflags & CEF_LOCK_NO_EXPAND) + result |= LDLM_FL_NO_EXPANSION; + if (enqflags & CEF_SPECULATIVE) + result |= LDLM_FL_SPECULATIVE; + return result; +} + +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); + +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + bool oa_speculative; +}; + +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = 1 << 0, + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = 1 << 1 +}; + +/* + * The set of operations which are different for MDC and OSC objects + */ +struct osc_object_operations { + void (*oto_build_res_name)(struct osc_object *osc, + struct ldlm_res_id *resname); + struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags); +}; + struct osc_object { - struct cl_object oo_cl; - struct lov_oinfo *oo_oinfo; - /** - * True if locking against this stripe got -EUSERS. - */ - int oo_contended; - cfs_time_t oo_contention_time; + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + ktime_t oo_contention_time; #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK - /** - * IO context used for invariant checks in osc_lock_has_pages(). - */ - struct cl_io oo_debug_io; - /** Serialization object for osc_object::oo_debug_io. */ - struct mutex oo_debug_mutex; + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; #endif /** * used by the osc to keep track of what objects to build into rpcs. @@ -138,7 +272,7 @@ struct osc_object { /** * extent is a red black tree to manage (async) dirty pages. */ - struct rb_root oo_root; + struct rb_root oo_root; /** * Manage write(dirty) extents. */ @@ -148,12 +282,12 @@ struct osc_object { struct list_head oo_reading_exts; - atomic_t oo_nr_reads; - atomic_t oo_nr_writes; + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; /** Protect extent tree. Will be used to protect * oo_{read|write}_pages soon. 
*/ - spinlock_t oo_lock; + spinlock_t oo_lock; /** * Radix tree for caching pages @@ -169,8 +303,25 @@ struct osc_object { /** number of active IOs of this object */ atomic_t oo_nr_ios; wait_queue_head_t oo_io_waitq; + + const struct osc_object_operations *oo_obj_ops; + bool oo_initialized; }; +static inline void osc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + return osc->oo_obj_ops->oto_build_res_name(osc, resname); +} + +static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags) +{ + return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags); +} + static inline void osc_object_lock(struct osc_object *obj) { spin_lock(&obj->oo_lock); @@ -200,15 +351,27 @@ static inline int osc_object_is_locked(struct osc_object *obj) #endif } +static inline void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = ktime_get(); + /* mb(); */ + obj->oo_contended = 1; +} + +static inline void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + /* * Lock "micro-states" for osc layer. */ enum osc_lock_state { - OLS_NEW, - OLS_ENQUEUED, - OLS_UPCALL_RECEIVED, - OLS_GRANTED, - OLS_CANCELLED + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_CANCELLED }; /** @@ -271,55 +434,68 @@ struct osc_lock { /** DLM flags with which osc_lock::ols_lock was enqueued */ __u64 ols_flags; /** osc_lock::ols_lock handle */ - struct lustre_handle ols_handle; + struct lustre_handle ols_handle; struct ldlm_enqueue_info ols_einfo; - enum osc_lock_state ols_state; + enum osc_lock_state ols_state; /** lock value block */ struct ost_lvb ols_lvb; - - /** - * true, if ldlm_lock_addref() was called against - * osc_lock::ols_lock. This is used for sanity checking. - * - * \see osc_lock::ols_has_ref - */ - unsigned ols_hold :1, - /** - * this is much like osc_lock::ols_hold, except that this bit is - * cleared _after_ reference in released in osc_lock_unuse(). This - * fine distinction is needed because: - * - * - if ldlm lock still has a reference, osc_ast_data_get() needs - * to return associated cl_lock (so that a flag is needed that is - * cleared after ldlm_lock_decref() returned), and - * - * - ldlm_lock_decref() can invoke blocking ast (for a - * LDLM_FL_CBPENDING lock), and osc_lock functions like - * osc_lock_cancel() called from there need to know whether to - * release lock reference (so that a flag is needed that is - * cleared before ldlm_lock_decref() is called). - */ - ols_has_ref:1, - /** - * inherit the lockless attribute from top level cl_io. - * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. - */ - ols_locklessable:1, - /** - * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat - * the EVAVAIL error as torerable, this will make upper logic happy - * to wait all glimpse locks to each OSTs to be completed. - * Glimpse lock converts to normal lock if the server lock is - * granted. - * Glimpse lock should be destroyed immediately after use. - */ - ols_glimpse:1, - /** - * For async glimpse lock. - */ - ols_agl:1; + /** Lockless operations to be used by lockless lock */ + const struct cl_lock_operations *ols_lockless_ops; + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. 
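osc_object_set_contended() above now stamps oo_contention_time with ktime_get(). The sketch below is a guess at how the matching expiry test could look with the new types, not the actual osc_object_is_contended() implementation; 'window_sec' stands in for the seconds-based od_contention_time knob from struct osc_device earlier in this header.

#include <linux/ktime.h>
#include <linux/types.h>

/* True while the contention marker set at 'marked' is still within the
 * configured 'window_sec' seconds. */
static bool contention_still_active(ktime_t marked, time64_t window_sec)
{
        ktime_t expiry = ktime_add(marked, ktime_set(window_sec, 0));

        return ktime_before(ktime_get(), expiry);
}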
+ * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as torerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1, + /** + * for speculative locks - asynchronous glimpse locks and ladvise + * lockahead manual lock requests + * + * Used to tell osc layer to not wait for the ldlm reply from the + * server, so the osc lock will be short lived - It only exists to + * create the ldlm request and is not updated on request completion. + */ + ols_speculative:1; }; +static inline int osc_lock_is_lockless(const struct osc_lock *ols) +{ + return (ols->ols_cl.cls_ops == ols->ols_lockless_ops); +} /** * Page state private for osc layer. @@ -348,7 +524,7 @@ struct osc_page { /** * in LRU? */ - ops_in_lru:1, + ops_in_lru:1, /** * Set if the page must be transferred with OBD_BRW_SRVLOCK. */ @@ -364,7 +540,19 @@ struct osc_page { /** * Submit time - the time when the page is starting RPC. For debugging. 
*/ - cfs_time_t ops_submit_time; + ktime_t ops_submit_time; +}; + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + s32 aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; }; extern struct kmem_cache *osc_lock_kmem; @@ -372,32 +560,27 @@ extern struct kmem_cache *osc_object_kmem; extern struct kmem_cache *osc_thread_kmem; extern struct kmem_cache *osc_session_kmem; extern struct kmem_cache *osc_extent_kmem; +extern struct kmem_cache *osc_quota_kmem; +extern struct kmem_cache *osc_obdo_kmem; -extern struct lu_device_type osc_device_type; extern struct lu_context_key osc_key; extern struct lu_context_key osc_session_key; #define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) -int osc_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io); -int osc_io_init (const struct lu_env *env, - struct cl_object *obj, struct cl_io *io); -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); +/* osc_page.c */ int osc_page_init(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t ind); - -void osc_index2policy(union ldlm_policy_data *policy, - const struct cl_object *obj, pgoff_t start, pgoff_t end); -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb); - +void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); void osc_page_submit(const struct lu_env *env, struct osc_page *opg, enum cl_req_type crt, int brw_flags); +int lru_queue_work(const struct lu_env *env, void *data); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); + +/* osc_cache.c */ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, u32 async_flags); @@ -411,8 +594,9 @@ int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, struct osc_page *ops); int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, struct osc_page *ops); -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags); +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags); int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, __u64 size, struct osc_extent **extp); void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); @@ -420,59 +604,161 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end, int hp, int discard); int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end); -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc); -int lru_queue_work(const struct lu_env *env, void *data); +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} -void osc_object_set_contended (struct osc_object *obj); -void osc_object_clear_contended(struct osc_object *obj); 
-int osc_object_is_contended (struct osc_object *obj); +static inline int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} -int osc_lock_is_lockless (const struct osc_lock *olck); +static inline void osc_io_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} + +typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + struct osc_page *, void *); +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata); +int osc_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata); + +/* osc_dev.c */ +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d); +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d); + +/* osc_object.c */ +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void osc_object_free(const struct lu_env *env, struct lu_object *obj); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj); +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb); +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); +int osc_object_is_contended(struct osc_object *obj); +int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj, + ldlm_iterator_t iter, void *data); +int osc_object_prune(const struct lu_env *env, struct cl_object *obj); + +/* osc_request.c */ +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd); +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg); +int osc_precleanup_common(struct obd_device *obd); +int osc_cleanup_common(struct obd_device *obd); +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set); +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg); +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata); +int osc_disconnect(struct obd_export *exp); +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie); + +/* osc_io.c */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue); +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb); +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_write_iter_init(const struct lu_env *env, + const 
struct cl_io_slice *ios); +void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice); +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio); +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_read_ahead_release(const struct lu_env *env, void *cbdata); + +/* osc_lock.c */ +void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols, + int force); +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl); +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl); +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl); +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice); +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice); +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); /***************************************************************************** * - * Accessors. + * Accessors and type conversions. * */ - static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) { - struct osc_thread_info *info; + struct osc_thread_info *info; - info = lu_context_key_get(&env->le_ctx, &osc_key); - LASSERT(info != NULL); - return info; + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; } static inline struct osc_session *osc_env_session(const struct lu_env *env) { - struct osc_session *ses; + struct osc_session *ses; - ses = lu_context_key_get(env->le_ses, &osc_session_key); - LASSERT(ses != NULL); - return ses; + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; } static inline struct osc_io *osc_env_io(const struct lu_env *env) { - return &osc_env_session(env)->os_io; -} - -static inline int osc_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &osc_device_type; + return &osc_env_session(env)->os_io; } static inline struct osc_device *lu2osc_dev(const struct lu_device *d) { - LINVRNT(d->ld_type == &osc_device_type); - return container_of0(d, struct osc_device, od_cl.cd_lu_dev); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); } static inline struct obd_export *osc_export(const struct osc_object *obj) { - return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; } static inline struct client_obd *osc_cli(const struct osc_object *obj) @@ -482,8 +768,7 @@ static inline struct client_obd *osc_cli(const struct osc_object *obj) static inline struct osc_object *cl2osc(const struct cl_object *obj) { - LINVRNT(osc_is_object(&obj->co_lu)); - return container_of0(obj, struct osc_object, oo_cl); + return container_of0(obj, struct osc_object, oo_cl); } static inline struct cl_object *osc2cl(const 
struct osc_object *obj) @@ -491,6 +776,36 @@ static inline struct cl_object *osc2cl(const struct osc_object *obj) return (struct cl_object *)&obj->oo_cl; } +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, + od_cl.cd_lu_dev); +} + +static inline struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +static inline struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static inline struct osc_object *lu2osc(const struct lu_object *obj) +{ + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +static inline struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + return oio; +} + static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) { LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); @@ -513,8 +828,7 @@ static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) { - LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct osc_page, ops_cl); + return container_of0(slice, struct osc_page, ops_cl); } static inline struct osc_page *oap2osc(struct osc_async_page *oap) @@ -549,18 +863,12 @@ osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) { - LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); - return container_of0(slice, struct osc_lock, ols_cl); -} - -static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) -{ - return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); + return container_of0(slice, struct osc_lock, ols_cl); } static inline int osc_io_srvlock(struct osc_io *oio) { - return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); } enum osc_extent_state { @@ -626,7 +934,9 @@ struct osc_extent { oe_hp:1, /** this extent should be written back asap. set if one of pages is * called by page WB daemon, or sync write or reading requests. */ - oe_urgent:1; + oe_urgent:1, + /** Non-delay RPC should be used for this extent. */ + oe_ndelay:1; /** how many grants allocated for this extent. * Grant allocated for this extent. There is no grant allocated * for reading extents and sync write extents. 
*/ @@ -660,20 +970,10 @@ struct osc_extent { int oe_rc; /** max pages per rpc when this extent was created */ unsigned int oe_mppr; + /** FLR: layout version when this osc_extent is publised */ + __u32 oe_layout_version; }; -int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, - int sent, int rc); -int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); - -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, bool discard_pages); - -typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, - struct osc_page *, void *); -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata); /** @} osc */ -#endif /* OSC_CL_INTERNAL_H */ +#endif /* LUSTRE_OSC_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h index 2ad8bce19ac53..b6070871e555c 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h @@ -111,26 +111,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) # define ll_d_count(d) ((d)->d_count) #endif /* HAVE_DCACHE_LOCK */ -#ifdef ATTR_OPEN -# define ATTR_FROM_OPEN ATTR_OPEN -#else -# ifndef ATTR_FROM_OPEN -# define ATTR_FROM_OPEN 0 -# endif -#endif /* ATTR_OPEN */ - -#ifndef ATTR_RAW -#define ATTR_RAW 0 -#endif - -#ifndef ATTR_CTIME_SET -/* - * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other - * ATTR_* attributes (see bug 13828) - */ -#define ATTR_CTIME_SET (1 << 28) -#endif - #ifndef HAVE_IN_COMPAT_SYSCALL #define in_compat_syscall is_compat_task #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h index 8cb25d2374322..17ff2da6240ca 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * Use is subject to license terms. */ @@ -175,13 +175,22 @@ struct qsd_instance; * Below are the function prototypes to be used by OSD layer to manage quota * enforcement. Arguments are documented where each function is defined. 
*/ +/* flags for quota local enforcement */ +enum osd_quota_local_flags { + QUOTA_FL_OVER_USRQUOTA = 1 << 0, + QUOTA_FL_OVER_GRPQUOTA = 1 << 1, + QUOTA_FL_SYNC = 1 << 2, + QUOTA_FL_OVER_PRJQUOTA = 1 << 3, +}; + struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, - struct proc_dir_entry *); + struct proc_dir_entry *, bool is_md); int qsd_prepare(const struct lu_env *, struct qsd_instance *); int qsd_start(const struct lu_env *, struct qsd_instance *); void qsd_fini(const struct lu_env *, struct qsd_instance *); int qsd_op_begin(const struct lu_env *, struct qsd_instance *, - struct lquota_trans *, struct lquota_id_info *, int *); + struct lquota_trans *, struct lquota_id_info *, + enum osd_quota_local_flags *); void qsd_op_end(const struct lu_env *, struct qsd_instance *, struct lquota_trans *); void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, @@ -212,13 +221,13 @@ struct lquota_id_info { bool lqi_is_blk; }; -/* Since we enforce only inode quota in meta pool (MDTs), and block quota in - * data pool (OSTs), there are at most 4 quota ids being enforced in a single - * transaction, which is chown transaction: +/* With the DoM, both inode quota in meta pool and block quota in data pool + * will be enforced at MDT, there are at most 4 quota ids being enforced in + * a single transaction for inode and block quota, which is chown transaction: * original uid and gid, new uid and gid. * * This value might need to be revised when directory quota is added. */ -#define QUOTA_MAX_TRANSIDS 4 +#define QUOTA_MAX_TRANSIDS 8 /* all qids involved in a single transaction */ struct lquota_trans { @@ -226,12 +235,6 @@ struct lquota_trans { struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; }; -/* flags for quota local enforcement */ -#define QUOTA_FL_OVER_USRQUOTA 0x01 -#define QUOTA_FL_OVER_GRPQUOTA 0x02 -#define QUOTA_FL_SYNC 0x04 -#define QUOTA_FL_OVER_PRJQUOTA 0x08 - #define IS_LQUOTA_RES(res) \ (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h index 46e6fa862f48e..7b6c03b195624 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -59,7 +59,7 @@ enum req_location { }; /* Maximal number of fields (buffers) in a request message. 
*/ -#define REQ_MAX_FIELD_NR 10 +#define REQ_MAX_FIELD_NR 11 struct req_capsule { struct ptlrpc_request *rc_req; @@ -128,6 +128,7 @@ int req_capsule_server_grow(struct req_capsule *pill, __u32 newlen); int req_layout_init(void); void req_layout_fini(void); +int req_check_sepol(struct req_capsule *pill); extern struct req_format RQF_OBD_PING; extern struct req_format RQF_OBD_SET_INFO; @@ -145,6 +146,7 @@ extern struct req_format RQF_FLD_READ; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_STATFS_NEW; extern struct req_format RQF_MDS_GET_ROOT; extern struct req_format RQF_MDS_SYNC; extern struct req_format RQF_MDS_GETXATTR; @@ -156,7 +158,7 @@ extern struct req_format RQF_OUT_UPDATE; */ extern struct req_format RQF_MDS_GETATTR_NAME; extern struct req_format RQF_MDS_CLOSE; -extern struct req_format RQF_MDS_INTENT_CLOSE; +extern struct req_format RQF_MDS_CLOSE_INTENT; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_GET_INFO; @@ -176,6 +178,8 @@ extern struct req_format RQF_MDS_QUOTACTL; extern struct req_format RQF_QUOTA_DQACQ; extern struct req_format RQF_MDS_SWAP_LAYOUTS; extern struct req_format RQF_MDS_REINT_MIGRATE; +extern struct req_format RQF_MDS_REINT_RESYNC; +extern struct req_format RQF_MDS_RMFID; /* MDS hsm formats */ extern struct req_format RQF_MDS_HSM_STATE_GET; extern struct req_format RQF_MDS_HSM_STATE_SET; @@ -215,7 +219,6 @@ extern struct req_format RQF_LDLM_INTENT_LAYOUT; extern struct req_format RQF_LDLM_INTENT_GETATTR; extern struct req_format RQF_LDLM_INTENT_OPEN; extern struct req_format RQF_LDLM_INTENT_CREATE; -extern struct req_format RQF_LDLM_INTENT_UNLINK; extern struct req_format RQF_LDLM_INTENT_GETXATTR; extern struct req_format RQF_LDLM_INTENT_QUOTA; extern struct req_format RQF_LDLM_CANCEL; @@ -223,15 +226,12 @@ extern struct req_format RQF_LDLM_CALLBACK; extern struct req_format RQF_LDLM_CP_CALLBACK; extern struct req_format RQF_LDLM_BL_CALLBACK; extern struct req_format RQF_LDLM_GL_CALLBACK; -extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK_DESC; /* LOG req_format */ -extern struct req_format RQF_LOG_CANCEL; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; -extern struct req_format RQF_LLOG_ORIGIN_CONNECT; extern struct req_format RQF_CONNECT; @@ -257,6 +257,7 @@ extern struct req_msg_field RMF_IDX_INFO; extern struct req_msg_field RMF_CLOSE_DATA; extern struct req_msg_field RMF_FILE_SECCTX_NAME; extern struct req_msg_field RMF_FILE_SECCTX; +extern struct req_msg_field RMF_FID_ARRAY; /* * connection handle received in MDS_CONNECT request. 
@@ -291,6 +292,7 @@ extern struct req_msg_field RMF_HSM_USER_STATE; extern struct req_msg_field RMF_HSM_STATE_SET; extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_SELINUX_POL; /* seq-mgr fields */ extern struct req_msg_field RMF_SEQ_OPC; @@ -313,10 +315,12 @@ extern struct req_msg_field RMF_OBD_IOOBJ; extern struct req_msg_field RMF_OBD_ID; extern struct req_msg_field RMF_FID; extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_NIOBUF_INLINE; extern struct req_msg_field RMF_RCS; extern struct req_msg_field RMF_FIEMAP_KEY; extern struct req_msg_field RMF_FIEMAP_VAL; extern struct req_msg_field RMF_OST_ID; +extern struct req_msg_field RMF_SHORT_IO; /* MGS config read message format */ extern struct req_msg_field RMF_MGS_CONFIG_BODY; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h new file mode 100644 index 0000000000000..3eba040fac690 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h @@ -0,0 +1,375 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_scrub.h + * + * Shared definitions and declarations for Lustre OI scrub. 
+ * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_SCRUB_H +# define _LUSTRE_SCRUB_H + +#include +#include + +#define OSD_OI_FID_OID_BITS_MAX 10 +#define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX) +#define SCRUB_OI_BITMAP_SIZE (OSD_OI_FID_NR_MAX >> 3) +#define PFID_STRIPE_IDX_BITS 16 +#define PFID_STRIPE_COUNT_MASK ((1 << PFID_STRIPE_IDX_BITS) - 1) + +#define SCRUB_MAGIC_V1 0x4C5FD252 +#define SCRUB_CHECKPOINT_INTERVAL 60 +#define SCRUB_WINDOW_SIZE 1024 + +enum scrub_next_status { + /* exit current loop and process next group */ + SCRUB_NEXT_BREAK = 1, + + /* skip current object and process next bit */ + SCRUB_NEXT_CONTINUE = 2, + + /* exit all the loops */ + SCRUB_NEXT_EXIT = 3, + + /* wait for free cache slot */ + SCRUB_NEXT_WAIT = 4, + + /* simulate system crash during OI scrub */ + SCRUB_NEXT_CRASH = 5, + + /* simulate failure during OI scrub */ + SCRUB_NEXT_FATAL = 6, + + /* new created object, no scrub on it */ + SCRUB_NEXT_NOSCRUB = 7, + + /* the object has no FID-in-LMA */ + SCRUB_NEXT_NOLMA = 8, + + /* for OST-object */ + SCRUB_NEXT_OSTOBJ = 9, + + /* old OST-object, no LMA or no FID-on-OST flags in LMA */ + SCRUB_NEXT_OSTOBJ_OLD = 10, +}; + +enum scrub_local_file_flags { + SLFF_SCAN_SUBITEMS = 0x0001, + SLFF_HIDE_FID = 0x0002, + SLFF_SHOW_NAME = 0x0004, + SLFF_NO_OI = 0x0008, + SLFF_IDX_IN_FID = 0x0010, +}; + +enum scrub_status { + /* The scrub file is new created, for new MDT, upgrading from old disk, + * or re-creating the scrub file manually. */ + SS_INIT = 0, + + /* The scrub is checking/repairing the OI files. */ + SS_SCANNING = 1, + + /* The scrub checked/repaired the OI files successfully. */ + SS_COMPLETED = 2, + + /* The scrub failed to check/repair the OI files. */ + SS_FAILED = 3, + + /* The scrub is stopped manually, the OI files may be inconsistent. */ + SS_STOPPED = 4, + + /* The scrub is paused automatically when umount. */ + SS_PAUSED = 5, + + /* The scrub crashed during the scanning, should be restarted. */ + SS_CRASHED = 6, +}; + +enum scrub_flags { + /* OI files have been recreated, OI mappings should be re-inserted. */ + SF_RECREATED = 0x0000000000000001ULL, + + /* OI files are invalid, should be rebuild ASAP */ + SF_INCONSISTENT = 0x0000000000000002ULL, + + /* OI scrub is triggered automatically. */ + SF_AUTO = 0x0000000000000004ULL, + + /* The device is upgraded from 1.8 format. */ + SF_UPGRADE = 0x0000000000000008ULL, +}; + +enum scrub_param { + /* Exit when fail. */ + SP_FAILOUT = 0x0001, + + /* Check only without repairing. */ + SP_DRYRUN = 0x0002, +}; + +enum scrub_start { + /* Set failout flag. */ + SS_SET_FAILOUT = 0x00000001, + + /* Clear failout flag. */ + SS_CLEAR_FAILOUT = 0x00000002, + + /* Reset scrub start position. */ + SS_RESET = 0x00000004, + + /* Trigger full scrub automatically. */ + SS_AUTO_FULL = 0x00000008, + + /* Trigger partial scrub automatically. */ + SS_AUTO_PARTIAL = 0x00000010, + + /* Set dryrun flag. */ + SS_SET_DRYRUN = 0x00000020, + + /* Clear dryrun flag. */ + SS_CLEAR_DRYRUN = 0x00000040, +}; + +enum osd_lf_flags { + OLF_SCAN_SUBITEMS = 0x0001, + OLF_HIDE_FID = 0x0002, + OLF_SHOW_NAME = 0x0004, + OLF_NO_OI = 0x0008, + OLF_IDX_IN_FID = 0x0010, + OLF_NOT_BACKUP = 0x0020, +}; + +/* There are some overhead to detect OI inconsistency automatically + * during normal RPC handling. We do not want to always auto detect + * OI inconsistency especailly when OI scrub just done recently. + * + * The 'auto_scrub' defines the time (united as second) interval to + * enable auto detect OI inconsistency since last OI scurb done. 
*/ +enum auto_scrub { + /* Disable auto scrub. */ + AS_NEVER = 0, + + /* 1 second is too short interval, it is almost equal to always auto + * detect inconsistent OI, usually used for test. */ + AS_ALWAYS = 1, + + /* Enable auto detect OI inconsistency one month (60 * 60 * 24 * 30) + * after last OI scrub. */ + AS_DEFAULT = 2592000LL, +}; + +struct scrub_file { + /* 128-bit uuid for volume. */ + __u8 sf_uuid[16]; + + /* See 'enum scrub_flags'. */ + __u64 sf_flags; + + /* The scrub magic. */ + __u32 sf_magic; + + /* See 'enum scrub_status'. */ + __u16 sf_status; + + /* See 'enum scrub_param'. */ + __u16 sf_param; + + /* The time for the last OI scrub completed. */ + time64_t sf_time_last_complete; + + /* The ttime for the latest OI scrub ran. */ + time64_t sf_time_latest_start; + + /* The time for the last OI scrub checkpoint. */ + time64_t sf_time_last_checkpoint; + + /* The position for the latest OI scrub started from. */ + __u64 sf_pos_latest_start; + + /* The position for the last OI scrub checkpoint. */ + __u64 sf_pos_last_checkpoint; + + /* The position for the first should be updated object. */ + __u64 sf_pos_first_inconsistent; + + /* How many objects have been checked. */ + __u64 sf_items_checked; + + /* How many objects have been updated. */ + __u64 sf_items_updated; + + /* How many objects failed to be processed. */ + __u64 sf_items_failed; + + /* How many prior objects have been updated during scanning. */ + __u64 sf_items_updated_prior; + + /* How many objects marked as LDISKFS_STATE_LUSTRE_NOSCRUB. */ + __u64 sf_items_noscrub; + + /* How many IGIF objects. */ + __u64 sf_items_igif; + + /* How long the OI scrub has run in seconds. Do NOT change + * to time64_t since this breaks backwards compatibility. + * It shouldn't take more than 136 years to complete :-) + */ + time_t sf_run_time; + + /* How many completed OI scrub ran on the device. */ + __u32 sf_success_count; + + /* How many OI files. */ + __u16 sf_oi_count; + + /* Keep the flags after scrub reset. See 'enum scrub_internal_flags' */ + __u16 sf_internal_flags; + + __u32 sf_reserved_1; + __u64 sf_reserved_2[16]; + + /* Bitmap for OI files recreated case. */ + __u8 sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE]; +}; + +struct lustre_scrub { + /* Object for the scrub file. */ + struct dt_object *os_obj; + + struct ptlrpc_thread os_thread; + struct list_head os_inconsistent_items; + + /* write lock for scrub prep/update/post/checkpoint, + * read lock for scrub dump. */ + struct rw_semaphore os_rwsem; + spinlock_t os_lock; + + /* Scrub file in memory. */ + struct scrub_file os_file; + + /* Buffer for scrub file load/store. */ + struct scrub_file os_file_disk; + + const char *os_name; + + /* The time for last checkpoint, seconds */ + time64_t os_time_last_checkpoint; + + /* The time for next checkpoint, seconds */ + time64_t os_time_next_checkpoint; + + /* How many objects have been checked since last checkpoint. */ + __u64 os_new_checked; + __u64 os_pos_current; + __u32 os_start_flags; + unsigned int os_in_prior:1, /* process inconsistent item + * found by RPC prior */ + os_waiting:1, /* Waiting for scan window. */ + os_full_speed:1, /* run w/o speed limit */ + os_paused:1, /* The scrub is paused. 
*/ + os_convert_igif:1, + os_partial_scan:1, + os_in_join:1, + os_full_scrub:1; +}; + +#define INDEX_BACKUP_MAGIC_V1 0x1E41F208 +#define INDEX_BACKUP_BUFSIZE (4096 * 4) + +enum lustre_index_backup_policy { + /* By default, do not backup the index */ + LIBP_NONE = 0, + + /* Backup the dirty index objects when umount */ + LIBP_AUTO = 1, +}; + +struct lustre_index_backup_header { + __u32 libh_magic; + __u32 libh_count; + __u32 libh_keysize; + __u32 libh_recsize; + struct lu_fid libh_owner; + __u64 libh_pad[60]; /* keep header 512 bytes aligned */ +}; + +struct lustre_index_backup_unit { + struct list_head libu_link; + struct lu_fid libu_fid; + __u32 libu_keysize; + __u32 libu_recsize; +}; + +struct lustre_index_restore_unit { + struct list_head liru_link; + struct lu_fid liru_pfid; + struct lu_fid liru_cfid; + __u64 liru_clid; + int liru_len; + char liru_name[0]; +}; + +void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid); +void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags); +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags); +void scrub_stop(struct lustre_scrub *scrub); +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize); + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup); +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize); + +static inline void lustre_fid2lbx(char *buf, const struct lu_fid *fid, int len) +{ + snprintf(buf, len, DFID_NOBRACE".lbx", PFID(fid)); +} + +static inline const char *osd_scrub2name(struct lustre_scrub *scrub) +{ + return scrub->os_name; +} +#endif /* _LUSTRE_SCRUB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h index 7e6f490854911..6a69d01150aa1 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -549,7 +549,7 @@ struct ptlrpc_cli_ctx { atomic_t cc_refcount; struct ptlrpc_sec *cc_sec; struct ptlrpc_ctx_ops *cc_ops; - cfs_time_t cc_expire; /* in seconds */ + time64_t cc_expire; /* in seconds */ unsigned int cc_early_expire:1; unsigned long cc_flags; struct vfs_cred cc_vcred; @@ -869,6 +869,17 @@ struct ptlrpc_sec { /** owning import */ struct obd_import *ps_import; spinlock_t ps_lock; + /** mtime of SELinux policy file */ + ktime_t ps_sepol_mtime; + /** next check time of SELinux policy file */ + ktime_t ps_sepol_checknext; + /** + * SELinux policy info + * sepol string format is: + * ::: + */ + char ps_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + + 1]; /* * garbage collection @@ -1092,6 +1103,7 @@ int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); void sptlrpc_request_out_callback(struct ptlrpc_request *req); +int sptlrpc_get_sepol(struct ptlrpc_request *req); /* * exported higher interface of import & request @@ -1109,6 +1121,7 @@ void sptlrpc_import_flush_all_ctx(struct obd_import *imp); int sptlrpc_req_get_ctx(struct ptlrpc_request *req); void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_export_update_ctx(struct obd_export *exp); int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); @@ -1193,10 +1206,6 @@ int sptlrpc_current_user_desc_size(void); int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); - -#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) -#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) - /** @} sptlrpc */ #endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h index 8f8b375e64c25..96dcd493f5f33 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * Copyright 2015 Cray Inc, all rights reserved. * Author: Ben Evans. 
@@ -48,10 +48,11 @@ #ifndef _LUSTRE_SWAB_H_ #define _LUSTRE_SWAB_H_ -#include +#include void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent); void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); void lustre_swab_connect(struct obd_connect_data *ocd); void lustre_swab_hsm_user_state(struct hsm_user_state *hus); @@ -92,11 +93,13 @@ void lustre_swab_obdo(struct obdo *o); void lustre_swab_ost_body(struct ost_body *b); void lustre_swab_ost_last_id(__u64 *id); void lustre_swab_fiemap(struct fiemap *fiemap); +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info); void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, int stripe_count); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size); void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); void lustre_swab_idx_info(struct idx_info *ii); void lustre_swab_lip_header(struct lu_idxpage *lip); @@ -118,6 +121,7 @@ void lustre_swab_object_update_result(struct object_update_result *our); void lustre_swab_object_update_reply(struct object_update_reply *our); void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); void lustre_swab_close_data(struct close_data *data); +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); void lustre_swab_lmv_user_md(struct lmv_user_md *lum); void lustre_swab_ladvise(struct lu_ladvise *ladvise); void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h index 968cc51028d86..78cd3d4bfdd51 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_update.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * lustre/include/lustre_update.h @@ -454,6 +454,9 @@ int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_size, const struct lu_fid *fid, const char *name, const int bufsize); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize); int out_read_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_length, const struct lu_fid *fid, size_t size, loff_t pos); diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h index d64d243ff8988..a5f994e36d50b 100644 --- a/drivers/staging/lustrefsx/lustre/include/md_object.h +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,6 +74,7 @@ enum ma_valid { MA_HSM = 1 << 6, MA_PFID = 1 << 7, MA_LMV_DEF = 1 << 8, + MA_SOM = 1 << 9, }; typedef enum { @@ -108,34 +109,47 @@ struct md_hsm { __u64 mh_arch_ver; }; + +/* memory structure for SOM attributes + * for fields description see the on disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som { + __u16 ms_valid; + __u64 ms_size; + __u64 ms_blocks; +}; + struct md_attr { - __u64 ma_valid; - __u64 ma_need; - __u64 ma_attr_flags; - struct lu_attr ma_attr; - struct lu_fid ma_pfid; - struct md_hsm ma_hsm; - struct lov_mds_md *ma_lmm; - union lmv_mds_md *ma_lmv; - void *ma_acl; - int ma_lmm_size; - int ma_lmv_size; - int ma_acl_size; + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct md_som ma_som; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_acl_size; + int ma_enable_chprojid_gid; }; /** Additional parameters for create */ struct md_op_spec { - union { - /** symlink target */ - const char *sp_symname; - /** eadata for regular files */ - struct md_spec_reg { - const void *eadata; - int eadatalen; - } sp_ea; - } u; - - /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ + union { + /** symlink target */ + const char *sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Open flags from client: such as MDS_OPEN_CREAT, and others. */ __u64 sp_cr_flags; /* File security context for creates. */ @@ -150,10 +164,30 @@ struct md_op_spec { sp_permitted:1, /* do not check permission */ sp_migrate_close:1; /* close the file during migrate */ /** Current lock mode for parent dir where create is performing. */ - mdl_mode_t sp_cr_mode; + mdl_mode_t sp_cr_mode; + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +enum md_layout_opc { + MD_LAYOUT_NOP = 0, + MD_LAYOUT_WRITE, /* FLR: write the file */ + MD_LAYOUT_RESYNC, /* FLR: resync starts */ + MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */ +}; - /** to create directory */ - const struct dt_index_features *sp_feat; +/** + * Parameters for layout change API. + */ +struct md_layout_change { + enum md_layout_opc mlc_opc; + __u16 mlc_mirror_id; + struct layout_intent *mlc_intent; + struct lu_buf mlc_buf; + struct lustre_som_attrs mlc_som; + size_t mlc_resync_count; + __u32 *mlc_resync_ids; }; union ldlm_policy_data; @@ -161,51 +195,53 @@ union ldlm_policy_data; * Operations implemented for each md object (both directory and leaf). 
*/ struct md_object_operations { - int (*moo_permission)(const struct lu_env *env, - struct md_object *pobj, struct md_object *cobj, - struct md_attr *attr, int mask); + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); - int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, - struct md_attr *attr); + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); - int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, - const struct md_attr *attr); + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); - int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf, const char *name); + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); - int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf); + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); - int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, - const struct lu_buf *buf, const char *name, - int fl); + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); - int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, - const char *name); + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); /** This method is used to swap the layouts between 2 objects */ int (*moo_swap_layouts)(const struct lu_env *env, struct md_object *obj1, struct md_object *obj2, __u64 flags); - /** \retval number of bytes actually read upon success */ - int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, - const struct lu_rdpg *rdpg); + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); - int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf); int (*moo_changelog)(const struct lu_env *env, - enum changelog_rec_type type, int flags, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, struct md_device *m, const struct lu_fid *fid); - int (*moo_open)(const struct lu_env *env, - struct md_object *obj, int flag); + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, u64 open_flags); - int (*moo_close)(const struct lu_env *env, struct md_object *obj, - struct md_attr *ma, int mode); + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, u64 open_flags); - int (*moo_object_sync)(const struct lu_env *, struct md_object *); + int (*moo_object_sync)(const struct lu_env *, struct md_object *); int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, struct lustre_handle *lh, @@ -222,55 +258,62 @@ struct md_object_operations { * * The caller should have held layout lock. * + * This API can be extended to support every other layout changing + * operations, such as component {add,del,change}, layout swap, + * layout merge, etc. One of the benefits by doing this is that the MDT + * no longer needs to understand layout. 
+ * + * However, layout creation, removal, and fetch should still use + * xattr_{get,set}() because they don't interpret layout on the + * MDT layer. + * * \param[in] env execution environment * \param[in] obj MD object * \param[in] layout data structure to describe the changes to * the MD object's layout - * \param[in] buf buffer containing the client's lovea * * \retval 0 success * \retval -ne error code */ int (*moo_layout_change)(const struct lu_env *env, struct md_object *obj, - struct layout_intent *layout, - const struct lu_buf *buf); + struct md_layout_change *layout); }; /** * Operations implemented for each directory object. */ struct md_dir_operations { - int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj, - const struct lu_fid *fid, struct lu_fid *sfid); + int (*mdo_is_subdir)(const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid); - int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, - const struct lu_name *lname, struct lu_fid *fid, - struct md_op_spec *spec); + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); - mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, - struct md_object *obj, - mdl_mode_t mode); + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); - int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, - const struct lu_name *lname, struct md_object *child, - struct md_op_spec *spec, - struct md_attr *ma); + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); - /** This method is used for creating data object for this meta object*/ - int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, - struct md_object *o, - const struct md_op_spec *spec, - struct md_attr *ma); + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); - int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, - struct md_object *tpobj, const struct lu_fid *lf, - const struct lu_name *lsname, struct md_object *tobj, - const struct lu_name *ltname, struct md_attr *ma); + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); - int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, - struct md_object *src_obj, const struct lu_name *lname, - struct md_attr *ma); + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, struct md_object *cobj, const struct lu_name *lname, @@ -278,7 +321,8 @@ struct md_dir_operations { int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, struct md_object *sobj, const struct lu_name *lname, - struct md_object *tobj, struct md_attr *ma); + struct md_object *tobj, struct md_op_spec *spec, + struct md_attr *ma); }; struct md_device_operations { @@ -286,8 +330,8 @@ struct md_device_operations { int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, struct lu_fid *f); - int 
(*mdo_maxeasize_get)(const struct lu_env *env, struct md_device *m, - int *easize); + const struct dt_device_param *(*mdo_dtconf_get)(const struct lu_env *e, + struct md_device *m); int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, struct obd_statfs *sfs); @@ -346,22 +390,19 @@ static inline struct md_object *md_object_find_slice(const struct lu_env *env, /** md operations */ -static inline int mo_permission(const struct lu_env *env, - struct md_object *p, - struct md_object *c, - struct md_attr *at, - int mask) +static inline int mo_permission(const struct lu_env *env, struct md_object *p, + struct md_object *c, struct md_attr *at, + int mask) { - LASSERT(c->mo_ops->moo_permission); - return c->mo_ops->moo_permission(env, p, c, at, mask); + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, mask); } -static inline int mo_attr_get(const struct lu_env *env, - struct md_object *m, - struct md_attr *at) +static inline int mo_attr_get(const struct lu_env *env, struct md_object *m, + struct md_attr *at) { - LASSERT(m->mo_ops->moo_attr_get); - return m->mo_ops->moo_attr_get(env, m, at); + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); } static inline int mo_readlink(const struct lu_env *env, @@ -374,8 +415,8 @@ static inline int mo_readlink(const struct lu_env *env, static inline int mo_changelog(const struct lu_env *env, enum changelog_rec_type type, - int flags, struct md_device *m, - const struct lu_fid *fid) + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid) { struct lu_fid rootfid; struct md_object *root; @@ -390,7 +431,7 @@ static inline int mo_changelog(const struct lu_env *env, RETURN(PTR_ERR(root)); LASSERT(root->mo_ops->moo_changelog); - rc = root->mo_ops->moo_changelog(env, type, flags, m, fid); + rc = root->mo_ops->moo_changelog(env, type, clf_flags, m, fid); lu_object_put(env, &root->mo_lu); @@ -448,12 +489,11 @@ static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) static inline int mo_layout_change(const struct lu_env *env, struct md_object *m, - struct layout_intent *layout, - const struct lu_buf *buf) + struct md_layout_change *layout) { /* need instantiate objects which in the access range */ LASSERT(m->mo_ops->moo_layout_change); - return m->mo_ops->moo_layout_change(env, m, layout, buf); + return m->mo_ops->moo_layout_change(env, m, layout); } static inline int mo_swap_layouts(const struct lu_env *env, @@ -467,21 +507,18 @@ static inline int mo_swap_layouts(const struct lu_env *env, return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); } -static inline int mo_open(const struct lu_env *env, - struct md_object *m, - int flags) +static inline int mo_open(const struct lu_env *env, struct md_object *m, + u64 open_flags) { - LASSERT(m->mo_ops->moo_open); - return m->mo_ops->moo_open(env, m, flags); + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, open_flags); } -static inline int mo_close(const struct lu_env *env, - struct md_object *m, - struct md_attr *ma, - int mode) +static inline int mo_close(const struct lu_env *env, struct md_object *m, + struct md_attr *ma, u64 open_flags) { - LASSERT(m->mo_ops->moo_close); - return m->mo_ops->moo_close(env, m, ma, mode); + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, open_flags); } static inline int mo_readpage(const struct lu_env *env, @@ -576,19 +613,20 @@ static inline int mdo_migrate(const struct lu_env *env, struct md_object *sobj, const 
struct lu_name *lname, struct md_object *tobj, + struct md_op_spec *spec, struct md_attr *ma) { LASSERT(pobj->mo_dir_ops->mdo_migrate); - return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, ma); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, spec, + ma); } static inline int mdo_is_subdir(const struct lu_env *env, - struct md_object *mo, - const struct lu_fid *fid, - struct lu_fid *sfid) + struct md_object *mo, + const struct lu_fid *fid) { - LASSERT(mo->mo_dir_ops->mdo_is_subdir); - return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid); + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid); } static inline int mdo_link(const struct lu_env *env, @@ -611,6 +649,14 @@ static inline int mdo_unlink(const struct lu_env *env, return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); } +static inline int mdo_statfs(const struct lu_env *env, + struct md_device *m, + struct obd_statfs *sfs) +{ + LASSERT(m->md_ops->mdo_statfs); + return m->md_ops->mdo_statfs(env, m, sfs); +} + /** * Used in MDD/OUT layer for object lock rule **/ @@ -624,6 +670,7 @@ enum mdd_object_role { struct dt_device; +void lustre_som_swab(struct lustre_som_attrs *attrs); int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); void lustre_hsm2buf(void *buf, const struct md_hsm *mh); @@ -650,6 +697,8 @@ struct lu_ucred { struct group_info *uc_ginfo; struct md_identity *uc_identity; char uc_jobid[LUSTRE_JOBID_SIZE]; + lnet_nid_t uc_nid; + bool uc_enable_audit; }; struct lu_ucred *lu_ucred(const struct lu_env *env); diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h index 9d49ce5a2a17a..7c00e69a20322 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd.h +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,9 +33,11 @@ #ifndef __OBD_H #define __OBD_H +#include #include +#include -#include +#include #include #include #ifdef HAVE_SERVER_SUPPORT @@ -100,11 +102,15 @@ struct obd_type { struct md_ops *typ_md_ops; struct proc_dir_entry *typ_procroot; struct proc_dir_entry *typ_procsym; - __u32 typ_sym_filter; + struct dentry *typ_debugfs_entry; +#ifdef HAVE_SERVER_SUPPORT + bool typ_sym_filter; +#endif char *typ_name; int typ_refcnt; struct lu_device_type *typ_lu; spinlock_t obd_type_lock; + struct kobject *typ_kobj; }; struct brw_page { @@ -116,7 +122,7 @@ struct brw_page { struct timeout_item { enum timeout_event ti_event; - cfs_time_t ti_timeout; + time64_t ti_timeout; timeout_cb_t ti_cb; void *ti_cb_data; struct list_head ti_obd_list; @@ -126,16 +132,15 @@ struct timeout_item { #define OBD_MAX_RIF_DEFAULT 8 #define OBD_MAX_RIF_MAX 512 #define OSC_MAX_RIF_MAX 256 -#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_DEFAULT 2000 /* Arbitrary large value */ #define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ #define OSC_DEFAULT_RESENDS 10 -/* possible values for fo_sync_lock_cancel */ -enum { - NEVER_SYNC_ON_CANCEL = 0, - BLOCKING_SYNC_ON_CANCEL = 1, - ALWAYS_SYNC_ON_CANCEL = 2, - NUM_SYNC_ON_CANCEL_STATES +/* possible values for lut_sync_lock_cancel */ +enum tgt_sync_lock_cancel { + SYNC_LOCK_CANCEL_NEVER = 0, + SYNC_LOCK_CANCEL_BLOCKING = 1, + SYNC_LOCK_CANCEL_ALWAYS = 2, }; /* @@ -181,6 +186,17 @@ struct client_obd { * run-time if a larger observed size is advertised by the MDT. */ __u32 cl_max_mds_easize; + /* Data-on-MDT specific value to set larger reply buffer for possible + * data read along with open/stat requests. By default it tries to use + * unused space in reply buffer. + * This value is used to ensure that reply buffer has at least as + * much free space as value indicates. That free space is gained from + * LOV EA buffer which is small for DoM files and on big systems can + * provide up to 32KB of extra space in reply buffer. + * Default value is 8K now. + */ + __u32 cl_dom_min_inline_repsize; + enum lustre_sec_part cl_sp_me; enum lustre_sec_part cl_sp_to; struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ @@ -188,7 +204,6 @@ struct client_obd { /* the grant values are protected by loi_list_lock below */ unsigned long cl_dirty_pages; /* all _dirty_ in pages */ unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ - unsigned long cl_dirty_transit; /* dirty synchronous */ unsigned long cl_avail_grant; /* bytes of credit for ost */ unsigned long cl_lost_grant; /* lost credits (trunc) */ /* grant consumed for dirty pages */ @@ -199,10 +214,10 @@ struct client_obd { * grant before trying to dirty a page and unreserve the rest. * See osc_{reserve|unreserve}_grant for details. */ long cl_reserved_grant; - struct list_head cl_cache_waiters; /* waiting for cache/grant */ - cfs_time_t cl_next_shrink_grant; /* jiffies */ - struct list_head cl_grant_shrink_list; /* Timeout event list */ - int cl_grant_shrink_interval; /* seconds */ + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ + time64_t cl_next_shrink_grant; /* seconds */ + struct list_head cl_grant_chain; + time64_t cl_grant_shrink_interval; /* seconds */ /* A chunk is an optimal size used by osc_extent to determine * the extent size. 
A chunk is max(PAGE_SIZE, OST block size) */ @@ -240,8 +255,9 @@ struct client_obd { /* just a sum of the loi/lop pending numbers to be exported by /proc */ atomic_t cl_pending_w_pages; atomic_t cl_pending_r_pages; - __u32 cl_max_pages_per_rpc; - __u32 cl_max_rpcs_in_flight; + u32 cl_max_pages_per_rpc; + u32 cl_max_rpcs_in_flight; + u32 cl_max_short_io_bytes; struct obd_histogram cl_read_rpc_hist; struct obd_histogram cl_write_rpc_hist; struct obd_histogram cl_read_page_hist; @@ -288,8 +304,6 @@ struct client_obd { atomic_t cl_destroy_in_flight; wait_queue_head_t cl_destroy_waitq; - struct mdc_rpc_lock *cl_rpc_lock; - /* modify rpcs in flight * currently used for metadata only */ spinlock_t cl_mod_rpcs_lock; @@ -304,8 +318,11 @@ struct client_obd { struct mutex cl_mgc_mutex; struct local_oid_storage *cl_mgc_los; struct dt_object *cl_mgc_configs_dir; - atomic_t cl_mgc_refcount; struct obd_export *cl_mgc_mgsexp; + atomic_t cl_mgc_refcount; + /* in-flight control list and total RPCs counter */ + struct list_head cl_flight_waiters; + __u32 cl_rpcs_in_flight; /* checksumming for data sent over the network */ unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ @@ -313,7 +330,7 @@ struct client_obd { /* supported checksum types that are worked out at connect time */ __u32 cl_supp_cksum_types; /* checksum algorithm to be used */ - cksum_type_t cl_cksum_type; + enum cksum_types cl_cksum_type; /* also protected by the poorly named _loi_list_lock lock above */ struct osc_async_rc cl_ar; @@ -327,8 +344,11 @@ struct client_obd { /* ptlrpc work for writeback in ptlrpcd context */ void *cl_writeback_work; void *cl_lru_work; + struct mutex cl_quota_mutex; /* hash tables for osc_quota_info */ struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* the xid of the request updating the hash tables */ + __u64 cl_quota_last_xid; /* Links to the global list of registered changelog devices */ struct list_head cl_chg_dev_linkage; }; @@ -358,6 +378,8 @@ struct ost_pool { /* allow statfs data caching for 1 second */ #define OBD_STATFS_CACHE_SECONDS 1 +/* arbitrary maximum. 
larger would be useless, allows catching bogus input */ +#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */ struct lov_tgt_desc { struct list_head ltd_kill; @@ -371,6 +393,11 @@ struct lov_tgt_desc { ltd_reap:1; /* should this target be deleted */ }; +struct lov_md_tgt_desc { + struct obd_device *lmtd_mdc; + __u32 lmtd_index; +}; + struct lov_obd { struct lov_desc desc; struct lov_tgt_desc **lov_tgts; /* sparse array */ @@ -393,10 +420,15 @@ struct lov_obd { struct cl_client_cache *lov_cache; struct rw_semaphore lov_notify_lock; + /* Data-on-MDT: MDC array */ + struct lov_md_tgt_desc *lov_mdc_tgts; + + struct kobject *lov_tgts_kobj; }; struct lmv_tgt_desc { struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; struct obd_export *ltd_exp; __u32 ltd_idx; struct mutex ltd_fid_mutex; @@ -407,19 +439,23 @@ struct lmv_obd { struct lu_client_fld lmv_fld; spinlock_t lmv_lock; struct lmv_desc desc; - struct proc_dir_entry *targets_proc_entry; struct mutex lmv_init_mutex; int connected; int max_easize; int max_def_easize; + u32 lmv_statfs_start; - __u32 tgts_size; /* size of tgts array */ + u32 tgts_size; /* size of tgts array */ struct lmv_tgt_desc **tgts; struct obd_connect_data conn_data; + struct kobject *lmv_tgts_kobj; }; +/* Minimum sector size is 512 */ +#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) + struct niobuf_local { __u64 lnb_file_offset; __u32 lnb_page_offset; @@ -428,6 +464,11 @@ struct niobuf_local { int lnb_rc; struct page *lnb_page; void *lnb_data; + __u16 lnb_guards[MAX_GUARD_NUMBER]; + __u16 lnb_guard_rpc:1; + __u16 lnb_guard_disk:1; + /* separate unlock for read path to allow shared access */ + __u16 lnb_locked:1; }; struct tgt_thread_big_cache { @@ -540,7 +581,7 @@ enum obd_notify_event { /* * Data structure used to pass obd_notify()-event to non-obd listeners (llite - * and liblustre being main examples). + * being main example). */ struct obd_notify_upcall { int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, @@ -595,7 +636,6 @@ struct obd_device { * (for /proc/status only!!) */ obd_no_ir:1, /* no imperative recovery. 
*/ obd_process_conf:1, /* device is processing mgs config */ - obd_uses_nid_stats:1, /* maintain per-client OBD stats */ obd_checksum_dump:1; /* dump pages upon cksum error */ /* use separate field as it is set in interrupt to don't mess with @@ -623,7 +663,7 @@ struct obd_device { spinlock_t obd_dev_lock; /* protect OBD bitfield above */ spinlock_t obd_osfs_lock; struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ - __u64 obd_osfs_age; + time64_t obd_osfs_age; __u64 obd_last_committed; struct mutex obd_dev_mutex; struct lvfs_run_ctxt obd_lvfs_ctxt; @@ -635,9 +675,9 @@ struct obd_device { struct obd_export *obd_lwp_export; /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ struct list_head obd_exports_timed; - time_t obd_eviction_timer; /* for ping evictor */ + time64_t obd_eviction_timer; /* for ping evictor */ - int obd_max_recoverable_clients; + atomic_t obd_max_recoverable_clients; atomic_t obd_connected_clients; int obd_stale_clients; /* this lock protects all recovery list_heads, timer and @@ -648,7 +688,7 @@ struct obd_device { int obd_requests_queued_for_recovery; wait_queue_head_t obd_next_transno_waitq; /* protected by obd_recovery_task_lock */ - struct timer_list obd_recovery_timer; + struct hrtimer obd_recovery_timer; /* seconds */ time64_t obd_recovery_start; /* seconds, for lprocfs_status */ @@ -683,16 +723,17 @@ struct obd_device { /* Fields used by LProcFS */ struct lprocfs_stats *obd_stats; - unsigned int obd_cntr_base; - unsigned int obd_md_cntr_base; struct lprocfs_stats *obd_md_stats; + struct dentry *obd_debugfs_entry; struct proc_dir_entry *obd_proc_entry; struct proc_dir_entry *obd_proc_exports_entry; - struct proc_dir_entry *obd_svc_procroot; + struct dentry *obd_svc_debugfs_entry; struct lprocfs_stats *obd_svc_stats; + const struct attribute **obd_attrs; struct lprocfs_vars *obd_vars; + struct ldebugfs_vars *obd_debugfs_vars; atomic_t obd_evict_inprogress; wait_queue_head_t obd_evict_inprogress_waitq; struct list_head obd_evict_list; /* protected with pet_lock */ @@ -709,6 +750,10 @@ struct obd_device { /** * List of outstanding class_incref()'s fo this OBD. For debugging. */ struct lu_ref obd_reference; + + struct kset obd_kset; /* sysfs object collection */ + struct kobj_type obd_ktype; + struct completion obd_kobj_unregister; }; /* get/set_info keys */ @@ -741,6 +786,17 @@ struct obd_device { #define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" #define KEY_OSP_CONNECTED "osp_connected" +/* Flags for op_xvalid */ +enum op_xvalid { + OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ + OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ + OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ + OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ + OP_XVALID_PROJID = BIT(4), /* 0x0010 */ + OP_XVALID_LAZYSIZE = BIT(5), /* 0x0020 */ + OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */ +}; + struct lu_context; static inline int it_to_lock_mode(struct lookup_intent *it) @@ -748,15 +804,14 @@ static inline int it_to_lock_mode(struct lookup_intent *it) /* CREAT needs to be tested before open (both could be set) */ if (it->it_op & IT_CREAT) return LCK_CW; - else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | - IT_LAYOUT)) + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP)) return LCK_CR; + else if (it->it_op & IT_LAYOUT) + return (it->it_flags & FMODE_WRITE) ? 
LCK_EX : LCK_CR; else if (it->it_op & IT_READDIR) return LCK_PR; else if (it->it_op & IT_GETXATTR) return LCK_PR; - else if (it->it_op & IT_SETXATTR) - return LCK_PW; LASSERTF(0, "Invalid it_op: %d\n", it->it_op); return -EINVAL; @@ -768,6 +823,7 @@ enum md_op_flags { MF_MDC_CANCEL_FID3 = 1 << 2, MF_MDC_CANCEL_FID4 = 1 << 3, MF_GET_MDT_IDX = 1 << 4, + MF_GETATTR_BY_FID = 1 << 5, }; enum md_cli_flags { @@ -785,7 +841,7 @@ enum md_cli_flags { */ static inline bool it_has_reply_body(const struct lookup_intent *it) { - return it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR); + return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR); } struct md_op_data { @@ -795,10 +851,12 @@ struct md_op_data { struct lu_fid op_fid4; /* to the operation locks. */ u32 op_mds; /* what mds server open will go to */ __u32 op_mode; - struct lustre_handle op_handle; + struct lustre_handle op_open_handle; s64 op_mod_time; const char *op_name; size_t op_namelen; + struct rw_semaphore *op_mea1_sem; + struct rw_semaphore *op_mea2_sem; struct lmv_stripe_md *op_mea1; struct lmv_stripe_md *op_mea2; __u32 op_suppgids[2]; @@ -810,9 +868,10 @@ struct md_op_data { /* iattr fields and blocks. */ struct iattr op_attr; + enum op_xvalid op_xvalid; /* eXtra validity flags */ loff_t op_attr_blocks; - __u64 op_valid; /* OBD_MD_* */ - unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ + u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ enum md_op_flags op_flags; @@ -827,8 +886,9 @@ struct md_op_data { __u64 op_data_version; struct lustre_handle op_lease_handle; - /* File security context, for creates. */ + /* File security context, for creates/metadata ops */ const char *op_file_secctx_name; + __u32 op_file_secctx_name_size; void *op_file_secctx; __u32 op_file_secctx_size; @@ -840,6 +900,19 @@ struct md_op_data { /* Used by readdir */ unsigned int op_max_pages; + __u16 op_mirror_id; + + /* + * used to access migrating dir: if it's set, assume migration is + * finished, use the new layout to access dir, otherwise use old layout. + * By default it's not set, because new files are created under new + * layout, if we can't find file with name under both old and new + * layout, we are sure file with name doesn't exist, but in reverse + * order there may be a race with creation by others. + */ + bool op_post_migrate; + /* used to access dir with bash hash */ + __u32 op_stripe_index; }; struct md_callback { @@ -911,9 +984,9 @@ struct obd_ops { * about this. 
*/ int (*o_statfs)(const struct lu_env *, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags); + struct obd_statfs *osfs, time64_t max_age, __u32 flags); int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *set); + time64_t max_age, struct ptlrpc_request_set *set); int (*o_create)(const struct lu_env *env, struct obd_export *exp, struct obdo *oa); int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, @@ -947,8 +1020,6 @@ struct obd_ops { int (*o_quotactl)(struct obd_device *, struct obd_export *, struct obd_quotactl *); - int (*o_ping)(const struct lu_env *, struct obd_export *exp); - /* pools methods */ int (*o_pool_new)(struct obd_device *obd, char *poolname); int (*o_pool_del)(struct obd_device *obd, char *poolname); @@ -956,12 +1027,6 @@ struct obd_ops { char *ostname); int (*o_pool_rem)(struct obd_device *obd, char *poolname, char *ostname); - void (*o_getref)(struct obd_device *obd); - void (*o_putref)(struct obd_device *obd); - /* - * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line - * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. - * Also, add a wrapper function in include/linux/obd_class.h. */ }; /* lmv structures */ @@ -983,7 +1048,7 @@ struct md_open_data { }; struct obd_client_handle { - struct lustre_handle och_fh; + struct lustre_handle och_open_handle; struct lu_fid och_fid; struct md_open_data *och_mod; struct lustre_handle och_lease_handle; /* open lock for lease */ @@ -997,18 +1062,6 @@ struct lookup_intent; struct cl_attr; struct md_ops { - /* Every operation from MD_STATS_FIRST_OP up to and including - * MD_STATS_LAST_OP will be counted by EXP_MD_OP_INCREMENT() - * and will appear in /proc/fs/lustre/{lmv,mdc}/.../md_stats. - * Operations after MD_STATS_LAST_OP are excluded from stats. - * There are a few reasons for doing this: we prune the 17 - * counters which will be of minimal use in understanding - * metadata utilization, we save memory by allocating 15 - * instead of 32 counters, we save cycles by not counting. - * - * MD_STATS_FIRST_OP must be the first member of md_ops. 
- */ -#define MD_STATS_FIRST_OP m_close int (*m_close)(struct obd_export *, struct md_op_data *, struct md_open_data *, struct ptlrpc_request **); @@ -1049,12 +1102,11 @@ struct md_ops { struct ptlrpc_request **); int (*m_setxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, const char *, int, int, int, u32, - struct ptlrpc_request **); + u64, const char *, const void *, size_t, unsigned int, + u32, struct ptlrpc_request **); int (*m_getxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, const char *, int, int, int, - struct ptlrpc_request **); + u64, const char *, size_t, struct ptlrpc_request **); int (*m_intent_getattr_async)(struct obd_export *, struct md_enqueue_info *); @@ -1062,7 +1114,7 @@ struct md_ops { int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, struct lu_fid *, __u64 *bits); -#define MD_STATS_LAST_OP m_revalidate_lock + int (*m_file_resync)(struct obd_export *, struct md_op_data *); int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); int (*m_null_inode)(struct obd_export *, const struct lu_fid *); @@ -1107,6 +1159,8 @@ struct md_ops { struct lu_fid *fid); int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, const union lmv_mds_md *lmv, size_t lmv_size); + int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs, + struct ptlrpc_request_set *set); }; static inline struct md_open_data *obd_mod_alloc(void) @@ -1201,7 +1255,8 @@ static inline int cli_brw_size(struct obd_device *obd) return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; } -/* when RPC size or the max RPCs in flight is increased, the max dirty pages +/* + * When RPC size or the max RPCs in flight is increased, the max dirty pages * of the client should be increased accordingly to avoid sending fragmented * RPCs over the network when the client runs out of the maximum dirty space * when so many RPCs are being generated. @@ -1209,10 +1264,10 @@ static inline int cli_brw_size(struct obd_device *obd) static inline void client_adjust_max_dirty(struct client_obd *cli) { /* initializing */ - if (cli->cl_dirty_max_pages <= 0) - cli->cl_dirty_max_pages = (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) - >> PAGE_SHIFT; - else { + if (cli->cl_dirty_max_pages <= 0) { + cli->cl_dirty_max_pages = + (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; + } else { unsigned long dirty_max = cli->cl_max_rpcs_in_flight * cli->cl_max_pages_per_rpc; @@ -1222,6 +1277,12 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8) cli->cl_dirty_max_pages = cfs_totalram_pages() / 8; + + /* This value is exported to userspace through the max_dirty_mb + * parameter. So we round up the number of pages to make it a round + * number of MBs. 
*/ + cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages, + 1 << (20 - PAGE_SHIFT)); } #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h index 6a0cfe8d72fc0..6e807d762c354 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h @@ -34,9 +34,12 @@ #define __OBD_CKSUM #include #include -#include +#include -static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type); + +static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type) { switch (cksum_type) { case OBD_CKSUM_CRC32: @@ -52,58 +55,23 @@ static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) return 0; } -/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can - * only be a single checksum type per RPC. - * - * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask - * since they need to represent the full range of checksum algorithms that - * both the client and server can understand. - * - * In case of an unsupported types/flags we fall back to ADLER - * because that is supported by all clients since 1.8 - * - * In case multiple algorithms are supported the best one is used. */ -static inline u32 cksum_type_pack(cksum_type_t cksum_type) -{ - unsigned int performance = 0, tmp; - u32 flag = OBD_FL_CKSUM_ADLER; - - if (cksum_type & OBD_CKSUM_CRC32) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32; - } - } - if (cksum_type & OBD_CKSUM_CRC32C) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32C; - } - } - if (cksum_type & OBD_CKSUM_ADLER) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_ADLER; - } - } - if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | - OBD_CKSUM_CRC32 | - OBD_CKSUM_ADLER)))) - CWARN("unknown cksum type %x\n", cksum_type); - - return flag; -} +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type); -static inline cksum_type_t cksum_type_unpack(u32 o_flags) +static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags) { switch (o_flags & OBD_FL_CKSUM_ALL) { case OBD_FL_CKSUM_CRC32C: return OBD_CKSUM_CRC32C; case OBD_FL_CKSUM_CRC32: return OBD_CKSUM_CRC32; + case OBD_FL_CKSUM_T10IP512: + return OBD_CKSUM_T10IP512; + case OBD_FL_CKSUM_T10IP4K: + return OBD_CKSUM_T10IP4K; + case OBD_FL_CKSUM_T10CRC512: + return OBD_CKSUM_T10CRC512; + case OBD_FL_CKSUM_T10CRC4K: + return OBD_CKSUM_T10CRC4K; default: break; } @@ -115,9 +83,9 @@ static inline cksum_type_t cksum_type_unpack(u32 o_flags) * 1.8 supported ADLER it is base and not depend on hw * Client uses all available local algos */ -static inline cksum_type_t cksum_types_supported_client(void) +static inline enum cksum_types obd_cksum_types_supported_client(void) { - cksum_type_t ret = OBD_CKSUM_ADLER; + enum cksum_types ret = OBD_CKSUM_ADLER; CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), @@ -129,32 +97,13 @@ static inline cksum_type_t cksum_types_supported_client(void) if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) ret |= OBD_CKSUM_CRC32; - return ret; -} - -/* Server uses 
algos that perform at 50% or better of the Adler */ -static inline enum cksum_types cksum_types_supported_server(void) -{ - enum cksum_types ret = OBD_CKSUM_ADLER; - int base_speed; - - CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); - - base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; - - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= - base_speed) - ret |= OBD_CKSUM_CRC32C; - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= - base_speed) - ret |= OBD_CKSUM_CRC32; + /* Client support all kinds of T10 checksum */ + ret |= OBD_CKSUM_T10_ALL; return ret; } +enum cksum_types obd_cksum_types_supported_server(const char *obd_name); /* Select the best checksum algorithm among those supplied in the cksum_types * input. @@ -163,13 +112,69 @@ static inline enum cksum_types cksum_types_supported_server(void) * checksum type due to its benchmarking at libcfs module load. * Caution is advised, however, since what is fastest on a single client may * not be the fastest or most efficient algorithm on the server. */ -static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +static inline enum cksum_types +obd_cksum_type_select(const char *obd_name, enum cksum_types cksum_types) { - return cksum_type_unpack(cksum_type_pack(cksum_types)); + u32 flag = obd_cksum_type_pack(obd_name, cksum_types); + + return obd_cksum_type_unpack(flag); } /* Checksum algorithm names. Must be defined in the same order as the * OBD_CKSUM_* flags. */ -#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} +#define DECLARE_CKSUM_NAME const char *cksum_name[] = {"crc32", "adler", \ + "crc32c", "reserved", "t10ip512", "t10ip4K", "t10crc512", "t10crc4K"} + +typedef __u16 (obd_dif_csum_fn) (void *, unsigned int); + +__u16 obd_dif_crc_fn(void *data, unsigned int len); +__u16 obd_dif_ip_fn(void *data, unsigned int len); +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __u16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn); +/* + * If checksum type is one T10 checksum types, init the csum_fn and sector + * size. Otherwise, init them to NULL/zero. 
+ */ +static inline void obd_t10_cksum2dif(enum cksum_types cksum_type, + obd_dif_csum_fn **fn, int *sector_size) +{ + *fn = NULL; + *sector_size = 0; + +#if IS_ENABLED(CONFIG_CRC_T10DIF) + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + *fn = obd_dif_ip_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + *fn = obd_dif_ip_fn; + *sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + *fn = obd_dif_crc_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + *fn = obd_dif_crc_fn; + *sector_size = 4096; + break; + default: + break; + } +#endif /* CONFIG_CRC_T10DIF */ +} + +enum obd_t10_cksum_type { + OBD_T10_CKSUM_UNKNOWN = 0, + OBD_T10_CKSUM_IP512, + OBD_T10_CKSUM_IP4K, + OBD_T10_CKSUM_CRC512, + OBD_T10_CKSUM_CRC4K, + OBD_T10_CKSUM_MAX +}; #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index da40a4e38f91b..ce46183f9d4da 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,13 +32,13 @@ #ifndef __CLASS_OBD_H #define __CLASS_OBD_H - +#include #include #include #include #include #include -#include +#include #include #define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay @@ -47,27 +47,36 @@ * obd_osfs_age */ #define OBD_STATFS_FOR_MDT0 0x0004 /* The statfs is only for retrieving * information from MDT0. */ +#define OBD_STATFS_SUM 0x0008 /* get aggregated statfs from MDT */ extern rwlock_t obd_dev_lock; /* OBD Operations Declarations */ -extern struct obd_device *class_conn2obd(struct lustre_handle *); extern struct obd_device *class_exp2obd(struct obd_export *); extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); -extern int lustre_get_jobid(char *jobid); +int lustre_get_jobid(char *jobid, size_t len); +void lustre_jobid_clear(const char *jobid); +void jobid_cache_fini(void); +int jobid_cache_init(void); struct lu_device_type; /* genops.c */ struct obd_export *class_conn2export(struct lustre_handle *); -int class_register_type(struct obd_ops *, struct md_ops *, bool enable_proc, - struct lprocfs_vars *module_vars, +struct kobject *class_setup_tunables(const char *name); +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, bool enable_proc, + struct ldebugfs_vars *module_vars, const char *nm, struct lu_device_type *ldt); int class_unregister_type(const char *nm); -struct obd_device *class_newdev(const char *type_name, const char *name); -void class_release_dev(struct obd_device *obd); +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid); +int class_register_device(struct obd_device *obd); +void class_unregister_device(struct obd_device *obd); +void class_free_dev(struct obd_device *obd); +struct obd_device *class_dev_by_str(const char *str); int class_name2dev(const char *name); struct obd_device *class_name2obd(const char *name); int class_uuid2dev(struct obd_uuid *uuid); @@ -83,7 +92,17 @@ int get_devices_count(void); int class_notify_sptlrpc_conf(const char *fsname, int namelen); -char *obd_export_nid2str(struct obd_export *exp); +static inline char *obd_export_nid2str(struct obd_export *exp) +{ + return 
exp->exp_connection == NULL ? + "" : libcfs_nid2str(exp->exp_connection->c_peer.nid); +} + +static inline char *obd_import_nid2str(struct obd_import *imp) +{ + return imp->imp_connection == NULL ? + "" : libcfs_nid2str(imp->imp_connection->c_peer.nid); +} int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); @@ -133,8 +152,9 @@ struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, const char *new_name); void print_lustre_cfg(struct lustre_cfg *lcfg); int class_process_config(struct lustre_cfg *lcfg); -int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, - struct lustre_cfg *lcfg, void *data); +ssize_t class_set_global(const char *param); +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj); int class_attach(struct lustre_cfg *lcfg); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); @@ -164,12 +184,11 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); #define CFG_F_START 0x01 /* Set when we start updating from a log */ #define CFG_F_MARKER 0x02 /* We are within a maker */ #define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ -#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ #define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { - void *cfg_instance; + unsigned long cfg_instance; struct super_block *cfg_sb; struct obd_uuid cfg_uuid; llog_cb_t cfg_callback; @@ -181,6 +200,19 @@ struct config_llog_instance { int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, char *name, struct config_llog_instance *cfg); +/** + * Generate a unique configuration instance for this mount + * + * Temporary hack to bypass ASLR in 4.15+ kernels, a better fix soon. + * For now, use the same value as before - the superblock pointer value. + * + * Using the client UUID would be an option, but it needs more testing. + */ +static inline unsigned long ll_get_cfg_instance(struct super_block *sb) +{ + return (unsigned long)sb; +} + #define CONFIG_SUB_SPTLRPC 0x01 #define CONFIG_SUB_RECOVER 0x02 #define CONFIG_SUB_PARAMS 0x04 @@ -215,7 +247,7 @@ static inline bool logname_is_barrier(const char *logname) struct config_llog_data { struct ldlm_res_id cld_resid; struct config_llog_instance cld_cfg; - struct list_head cld_list_chain; + struct list_head cld_list_chain;/* on config_llog_list */ atomic_t cld_refcount; struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ struct config_llog_data *cld_params; /* common parameters log */ @@ -315,6 +347,8 @@ struct obd_export *class_export_get(struct obd_export *exp); void class_export_put(struct obd_export *exp); struct obd_export *class_new_export(struct obd_device *obddev, struct obd_uuid *cluuid); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid); void class_unlink_export(struct obd_export *exp); struct obd_import *class_import_get(struct obd_import *); @@ -334,6 +368,7 @@ void class_disconnect_exports(struct obd_device *obddev); int class_manual_cleanup(struct obd_device *obd); void class_disconnect_stale_exports(struct obd_device *, int (*test_export)(struct obd_export *)); + static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) { return ((obd->obd_fail ? 
OBD_OPT_FAILOVER : 0) | @@ -368,25 +403,25 @@ void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid); void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid); void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); -#define OBT(dev) (dev)->obd_type #define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op #define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op -#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op -/* Ensure obd_setup: used for cleanup which must be called - while obd is stopping */ -#define OBD_CHECK_DEV(obd) \ -do { \ - if (!(obd)) { \ - CERROR("NULL device\n"); \ - RETURN(-ENODEV); \ - } \ -} while (0) +static inline int obd_check_dev(struct obd_device *obd) +{ + if (!obd) { + CERROR("NULL device\n"); + return -ENODEV; + } + return 0; +} /* ensure obd_setup and !obd_stopping */ #define OBD_CHECK_DEV_ACTIVE(obd) \ do { \ - OBD_CHECK_DEV(obd); \ + rc = obd_check_dev(obd); \ + if (rc) \ + return rc; \ + \ if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ CERROR("Device %d not setup\n", \ (obd)->obd_minor); \ @@ -395,61 +430,6 @@ do { \ } while (0) -#ifdef CONFIG_PROC_FS -#define OBD_COUNTER_OFFSET(op) \ - ((offsetof(struct obd_ops, o_ ## op) - \ - offsetof(struct obd_ops, o_iocontrol)) \ - / sizeof(((struct obd_ops *)NULL)->o_iocontrol)) - -/* The '- 1' below is for o_owner. */ -#define NUM_OBD_STATS \ - (sizeof(struct obd_ops) / \ - sizeof(((struct obd_ops *)NULL)->o_iocontrol) - 1) - -#define OBD_COUNTER_INCREMENT(obd, op) \ - lprocfs_counter_incr((obd)->obd_stats, \ - (obd)->obd_cntr_base + OBD_COUNTER_OFFSET(op)) - -#define EXP_COUNTER_INCREMENT(exp, op) \ - do { \ - unsigned int _off; \ - _off = (exp)->exp_obd->obd_cntr_base + OBD_COUNTER_OFFSET(op); \ - lprocfs_counter_incr((exp)->exp_obd->obd_stats, _off); \ - if ((exp)->exp_obd->obd_uses_nid_stats && \ - (exp)->exp_nid_stats != NULL) \ - lprocfs_counter_incr((exp)->exp_nid_stats->nid_stats, \ - _off); \ - } while (0) - -#define _MD_COUNTER_OFFSET(m_op) \ - ((offsetof(struct md_ops, m_op) - \ - offsetof(struct md_ops, MD_STATS_FIRST_OP)) / \ - sizeof(((struct md_ops *)NULL)->MD_STATS_FIRST_OP)) - -#define MD_COUNTER_OFFSET(op) _MD_COUNTER_OFFSET(m_ ## op) - -#define NUM_MD_STATS \ - (_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) - \ - _MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) + 1) - -/* Note that we only increment md counters for ops whose offset is less - * than NUM_MD_STATS. This is explained in a comment in the definition - * of struct md_ops. 
*/ -#define EXP_MD_COUNTER_INCREMENT(exp, op) \ - do { \ - if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) \ - lprocfs_counter_incr((exp)->exp_obd->obd_md_stats, \ - (exp)->exp_obd->obd_md_cntr_base + \ - MD_COUNTER_OFFSET(op)); \ - } while (0) - -#else -#define OBD_COUNTER_OFFSET(op) -#define OBD_COUNTER_INCREMENT(obd, op) -#define EXP_COUNTER_INCREMENT(exp, op) -#define EXP_MD_COUNTER_INCREMENT(exp, op) -#endif - static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) { /* Always add in ldlm_stats */ @@ -465,57 +445,16 @@ static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) tmp->nid_ldlm_stats); } -#define EXP_CHECK_MD_OP(exp, op) \ -do { \ - if ((exp) == NULL) { \ - CERROR("obd_" #op ": NULL export\n"); \ - RETURN(-ENODEV); \ - } \ - if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ - CERROR("obd_" #op ": cleaned up obd\n"); \ - RETURN(-EOPNOTSUPP); \ - } \ - if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ - CERROR("%s: obd_" #op ": dev %d no operation\n", \ - (exp)->exp_obd->obd_name, \ - (exp)->exp_obd->obd_minor); \ - RETURN(-EOPNOTSUPP); \ - } \ -} while (0) - - -#define OBD_CHECK_DT_OP(obd, op, err) \ -do { \ - if (!OBT(obd) || !OBP((obd), op)) { \ - if (err) \ - CERROR("%s: no obd_" #op " operation\n", \ - obd->obd_name); \ - RETURN(err); \ - } \ -} while (0) - -#define EXP_CHECK_DT_OP(exp, op) \ -do { \ - if ((exp) == NULL) { \ - CERROR("obd_" #op ": NULL export\n"); \ - RETURN(-ENODEV); \ - } \ - if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ - CERROR("obd_" #op ": cleaned up obd\n"); \ - RETURN(-EOPNOTSUPP); \ - } \ - OBD_CHECK_DT_OP((exp)->exp_obd, op, -EOPNOTSUPP); \ -} while (0) - -#define CTXT_CHECK_OP(ctxt, op, err) \ -do { \ - if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ - if (err) \ - CERROR("%s: no lop_" #op "operation\n", \ - ctxt->loc_obd->obd_name); \ - RETURN(err); \ - } \ -} while (0) +static inline int exp_check_ops(struct obd_export *exp) +{ + if (exp == NULL) { + RETURN(-ENODEV); + } + if (exp->exp_obd == NULL || !exp->exp_obd->obd_type) { + RETURN(-EOPNOTSUPP); + } + RETURN(0); +} static inline int class_devno_max(void) { @@ -529,8 +468,15 @@ static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, get_info); - EXP_COUNTER_INCREMENT(exp, get_info); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_get_info) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); RETURN(rc); @@ -545,8 +491,15 @@ static inline int obd_set_info_async(const struct lu_env *env, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, set_info_async); - EXP_COUNTER_INCREMENT(exp, set_info_async); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_set_info_async) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, val, set); @@ -570,18 +523,14 @@ static inline int obd_set_info_async(const struct lu_env *env, * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
*/ - -#define DECLARE_LU_VARS(ldt, d) \ - struct lu_device_type *ldt; \ - struct lu_device *d - static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) { int rc; - DECLARE_LU_VARS(ldt, d); + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d; + ENTRY; - ldt = obd->obd_type->typ_lu; if (ldt != NULL) { struct lu_context session_ctx; struct lu_env env; @@ -605,8 +554,11 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) lu_context_fini(&session_ctx); } else { - OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, setup); + if (!obd->obd_type->typ_dt_ops->o_setup) { + CERROR("%s: no %s operation\n", obd->obd_name, + __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, setup)(obd, cfg); } RETURN(rc); @@ -615,23 +567,30 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) static inline int obd_precleanup(struct obd_device *obd) { int rc; - DECLARE_LU_VARS(ldt, d); + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + ENTRY; - OBD_CHECK_DEV(obd); - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { - struct lu_env env; - - rc = lu_env_init(&env, ldt->ldt_ctx_tags); - if (rc == 0) { - ldt->ldt_ops->ldto_device_fini(&env, d); - lu_env_fini(&env); + struct lu_env *env = lu_env_find(); + struct lu_env _env; + + if (!env) { + env = &_env; + rc = lu_env_init(env, ldt->ldt_ctx_tags); + LASSERT(rc == 0); + lu_env_add(env); + } + ldt->ldt_ops->ldto_device_fini(env, d); + if (env == &_env) { + lu_env_remove(env); + lu_env_fini(env); } } - OBD_CHECK_DT_OP(obd, precleanup, 0); - OBD_COUNTER_INCREMENT(obd, precleanup); + + if (!obd->obd_type->typ_dt_ops->o_precleanup) + RETURN(0); rc = OBP(obd, precleanup)(obd); RETURN(rc); @@ -640,13 +599,10 @@ static inline int obd_precleanup(struct obd_device *obd) static inline int obd_cleanup(struct obd_device *obd) { int rc; - DECLARE_LU_VARS(ldt, d); - ENTRY; - - OBD_CHECK_DEV(obd); + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; + ENTRY; if (ldt != NULL && d != NULL) { struct lu_env env; @@ -657,8 +613,8 @@ static inline int obd_cleanup(struct obd_device *obd) obd->obd_lu_dev = NULL; } } - OBD_CHECK_DT_OP(obd, cleanup, 0); - OBD_COUNTER_INCREMENT(obd, cleanup); + if (!obd->obd_type->typ_dt_ops->o_cleanup) + RETURN(0); rc = OBP(obd, cleanup)(obd); RETURN(rc); @@ -685,18 +641,16 @@ static inline void obd_cleanup_client_import(struct obd_device *obd) EXIT; } -static inline int -obd_process_config(struct obd_device *obd, int datalen, void *data) +static inline int obd_process_config(struct obd_device *obd, int datalen, + void *data) { int rc; - DECLARE_LU_VARS(ldt, d); - ENTRY; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; - OBD_CHECK_DEV(obd); + ENTRY; obd->obd_process_conf = 1; - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { struct lu_env env; @@ -706,10 +660,14 @@ obd_process_config(struct obd_device *obd, int datalen, void *data) lu_env_fini(&env); } } else { - OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + if (!obd->obd_type->typ_dt_ops->o_process_config) { + CERROR("%s: no %s operation\n", + obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, process_config)(obd, datalen, data); } - OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; RETURN(rc); @@ 
-721,8 +679,15 @@ static inline int obd_create(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, create); - EXP_COUNTER_INCREMENT(exp, create); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_create) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, create)(env, exp, obdo); RETURN(rc); @@ -734,8 +699,15 @@ static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, destroy); - EXP_COUNTER_INCREMENT(exp, destroy); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_destroy) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); RETURN(rc); @@ -747,8 +719,16 @@ static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, getattr); - EXP_COUNTER_INCREMENT(exp, getattr); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_getattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, getattr)(env, exp, oa); RETURN(rc); @@ -760,8 +740,16 @@ static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, setattr); - EXP_COUNTER_INCREMENT(exp, setattr); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_setattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, setattr)(env, exp, oa); RETURN(rc); @@ -775,8 +763,10 @@ static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, add_conn); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_add_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, add_conn)(imp, uuid, priority); RETURN(rc); @@ -789,8 +779,10 @@ static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, del_conn); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_del_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, del_conn)(imp, uuid); RETURN(rc); @@ -801,8 +793,9 @@ static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) struct obd_uuid *uuid; ENTRY; - OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); - EXP_COUNTER_INCREMENT(exp, get_uuid); + if (!exp->exp_obd->obd_type || + !exp->exp_obd->obd_type->typ_dt_ops->o_get_uuid) + RETURN(NULL); uuid = OBP(exp->exp_obd, get_uuid)(exp); RETURN(uuid); @@ -825,8 +818,10 @@ static inline int obd_connect(const struct lu_env *env, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, connect); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_connect) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); /* check that only subset is granted */ @@ -849,8 +844,8 @@ static inline int obd_reconnect(const struct lu_env 
*env, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, reconnect, 0); - OBD_COUNTER_INCREMENT(obd, reconnect); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_reconnect) + RETURN(0); rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); /* check that only subset is granted */ @@ -863,9 +858,15 @@ static inline int obd_disconnect(struct obd_export *exp) { int rc; ENTRY; - - EXP_CHECK_DT_OP(exp, disconnect); - EXP_COUNTER_INCREMENT(exp, disconnect); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_disconnect) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, disconnect)(exp); RETURN(rc); @@ -877,8 +878,8 @@ static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, int rc; ENTRY; - OBD_CHECK_DT_OP(obd, fid_init, 0); - OBD_COUNTER_INCREMENT(obd, fid_init); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_init) + RETURN(0); rc = OBP(obd, fid_init)(obd, exp, type); RETURN(rc); @@ -888,9 +889,8 @@ static inline int obd_fid_fini(struct obd_device *obd) { int rc; ENTRY; - - OBD_CHECK_DT_OP(obd, fid_fini, 0); - OBD_COUNTER_INCREMENT(obd, fid_fini); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_fini) + RETURN(0); rc = OBP(obd, fid_fini)(obd); RETURN(rc); @@ -903,33 +903,29 @@ static inline int obd_fid_alloc(const struct lu_env *env, { int rc; ENTRY; - - EXP_CHECK_DT_OP(exp, fid_alloc); - EXP_COUNTER_INCREMENT(exp, fid_alloc); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_fid_alloc) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); RETURN(rc); } -static inline int obd_ping(const struct lu_env *env, struct obd_export *exp) -{ - int rc; - ENTRY; - - OBD_CHECK_DT_OP(exp->exp_obd, ping, 0); - EXP_COUNTER_INCREMENT(exp, ping); - - rc = OBP(exp->exp_obd, ping)(env, exp); - RETURN(rc); -} - static inline int obd_pool_new(struct obd_device *obd, char *poolname) { int rc; ENTRY; - OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_new); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_new) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, pool_new)(obd, poolname); RETURN(rc); @@ -939,173 +935,166 @@ static inline int obd_pool_del(struct obd_device *obd, char *poolname) { int rc; ENTRY; - - OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_del); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_del) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, pool_del)(obd, poolname); RETURN(rc); } -static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +static inline int obd_pool_add(struct obd_device *obd, char *poolname, + char *ostname) { int rc; ENTRY; - OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_add); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_add) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, pool_add)(obd, poolname, ostname); RETURN(rc); } -static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, + char *ostname) { - int rc; - ENTRY; - - OBD_CHECK_DT_OP(obd, 
pool_rem, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_rem); - - rc = OBP(obd, pool_rem)(obd, poolname, ostname); - RETURN(rc); -} + int rc; -static inline void obd_getref(struct obd_device *obd) -{ - ENTRY; - if (OBT(obd) && OBP(obd, getref)) { - OBD_COUNTER_INCREMENT(obd, getref); - OBP(obd, getref)(obd); - } - EXIT; -} + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_rem) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } -static inline void obd_putref(struct obd_device *obd) -{ - ENTRY; - if (OBT(obd) && OBP(obd, putref)) { - OBD_COUNTER_INCREMENT(obd, putref); - OBP(obd, putref)(obd); - } - EXIT; + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); } static inline int obd_init_export(struct obd_export *exp) { - int rc = 0; + int rc = 0; - ENTRY; - if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && - OBP((exp)->exp_obd, init_export)) - rc = OBP(exp->exp_obd, init_export)(exp); - RETURN(rc); + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); } static inline int obd_destroy_export(struct obd_export *exp) { - ENTRY; - if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && - OBP((exp)->exp_obd, destroy_export)) - OBP(exp->exp_obd, destroy_export)(exp); - RETURN(0); + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP(exp->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); } -/* @max_age is the oldest time in jiffies that we accept using a cached data. +/* @max_age is the oldest time in seconds that we accept using a cached data. * If the cache is older than @max_age we will get a new value from the - * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. 
+ */ static inline int obd_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, - __u64 max_age, - struct ptlrpc_request_set *rqset) + struct obd_info *oinfo, + time64_t max_age, + struct ptlrpc_request_set *rqset) { - int rc = 0; - struct obd_device *obd; - ENTRY; - - if (exp == NULL || exp->exp_obd == NULL) - RETURN(-EINVAL); - - obd = exp->exp_obd; - OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, statfs); - - CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", - obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); - if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { - rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); - } else { - CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, - obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); - spin_lock(&obd->obd_osfs_lock); - memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); - spin_unlock(&obd->obd_osfs_lock); - oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; - if (oinfo->oi_cb_up) - oinfo->oi_cb_up(oinfo, 0); - } - RETURN(rc); -} - -static inline int obd_statfs_rqset(struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, - __u32 flags) -{ - struct ptlrpc_request_set *set = NULL; - struct obd_info oinfo = { - .oi_osfs = osfs, - .oi_flags = flags, - }; + struct obd_device *obd; int rc = 0; ENTRY; - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - - rc = obd_statfs_async(exp, &oinfo, max_age, set); - if (rc == 0) - rc = ptlrpc_set_wait(set); + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); - ptlrpc_set_destroy(set); + obd = exp->exp_obd; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs) { + rc = -EOPNOTSUPP; + CERROR("%s: no statfs operation: rc = %d\n", obd->obd_name, rc); + RETURN(rc); + } + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + if (obd->obd_osfs_age < max_age) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } RETURN(rc); } -/* @max_age is the oldest time in jiffies that we accept using a cached data. +/* @max_age is the oldest time in seconds that we accept using a cached data. * If the cache is older than @max_age we will get a new value from the - * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. 
+ */ static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, - __u32 flags) + struct obd_statfs *osfs, time64_t max_age, + __u32 flags) { - int rc = 0; - struct obd_device *obd = exp->exp_obd; - ENTRY; + struct obd_device *obd = exp->exp_obd; + int rc = 0; - if (obd == NULL) - RETURN(-EINVAL); + ENTRY; + if (unlikely(obd == NULL)) + RETURN(-EINVAL); - OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); OBD_CHECK_DEV_ACTIVE(obd); - OBD_COUNTER_INCREMENT(obd, statfs); - CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", - obd->obd_osfs_age, max_age); - if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { - rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); - if (rc == 0) { + if (unlikely(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs)) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + /* ignore cache if aggregated isn't expected */ + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATE_SUM) && + !(flags & OBD_STATFS_SUM))) { + /* the RPC will block anyway, so avoid sending many at once */ + rc = mutex_lock_interruptible(&obd->obd_dev_mutex); + if (rc) + RETURN(rc); + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATE_SUM) && + !(flags & OBD_STATFS_SUM))) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + } else { + mutex_unlock(&obd->obd_dev_mutex); + GOTO(cached, rc = 0); + } + if (rc == 0) { + CDEBUG(D_SUPER, + "%s: update %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + osfs->os_bavail, osfs->os_blocks, + osfs->os_ffree, osfs->os_files); + spin_lock(&obd->obd_osfs_lock); memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); - obd->obd_osfs_age = cfs_time_current_64(); + obd->obd_osfs_age = ktime_get_seconds(); spin_unlock(&obd->obd_osfs_lock); } + mutex_unlock(&obd->obd_dev_mutex); } else { - CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu" - " objects %llu/%llu\n", +cached: + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", obd->obd_name, &obd->obd_osfs, obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); @@ -1125,8 +1114,17 @@ static inline int obd_preprw(const struct lu_env *env, int cmd, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, preprw); - EXP_COUNTER_INCREMENT(exp, preprw); + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_preprw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, pages, local); @@ -1137,14 +1135,23 @@ static inline int obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, struct niobuf_remote *rnb, int pages, - struct niobuf_local *local, int rc) + struct niobuf_local *local, const int orig_rc) { + int rc; ENTRY; - EXP_CHECK_DT_OP(exp, commitrw); - EXP_COUNTER_INCREMENT(exp, commitrw); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_commitrw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, - rnb, pages, local, rc); + rnb, pages, local, orig_rc); RETURN(rc); } @@ -1155,8 +1162,15 @@ static inline int 
obd_iocontrol(unsigned int cmd, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, iocontrol); - EXP_COUNTER_INCREMENT(exp, iocontrol); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_iocontrol) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); RETURN(rc); @@ -1172,10 +1186,10 @@ static inline void obd_import_event(struct obd_device *obd, EXIT; return; } - if (obd->obd_set_up && OBP(obd, import_event)) { - OBD_COUNTER_INCREMENT(obd, import_event); + + if (obd->obd_set_up && OBP(obd, import_event)) OBP(obd, import_event)(obd, imp, event); - } + EXIT; } @@ -1185,7 +1199,10 @@ static inline int obd_notify(struct obd_device *obd, { int rc; ENTRY; - OBD_CHECK_DEV(obd); + + rc = obd_check_dev(obd); + if (rc) + return rc; if (!obd->obd_set_up) { CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); @@ -1197,7 +1214,6 @@ static inline int obd_notify(struct obd_device *obd, RETURN(-ENOSYS); } - OBD_COUNTER_INCREMENT(obd, notify); rc = OBP(obd, notify)(obd, watched, ev); RETURN(rc); @@ -1230,45 +1246,58 @@ static inline int obd_quotactl(struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, quotactl); - EXP_COUNTER_INCREMENT(exp, quotactl); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_quotactl) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); RETURN(rc); } static inline int obd_health_check(const struct lu_env *env, - struct obd_device *obd) -{ - /* returns: 0 on healthy - * >0 on unhealthy + reason code/flag - * however the only suppored reason == 1 right now - * We'll need to define some better reasons - * or flags in the future. - * <0 on error - */ - int rc; - ENTRY; + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. 
+ * <0 on error + */ + int rc; - /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ - if (obd == NULL || !OBT(obd)) { - CERROR("cleaned up obd\n"); - RETURN(-EOPNOTSUPP); - } - if (!obd->obd_set_up || obd->obd_stopping) - RETURN(0); - if (!OBP(obd, health_check)) - RETURN(0); + ENTRY; - rc = OBP(obd, health_check)(env, obd); - RETURN(rc); + /* NULL method is normal here */ + if (obd == NULL || !obd->obd_type) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); } static inline int obd_register_observer(struct obd_device *obd, struct obd_device *observer) { + int rc; ENTRY; - OBD_CHECK_DEV(obd); + + rc = obd_check_dev(obd); + if (rc) + return rc; + down_write(&obd->obd_observer_link_sem); if (obd->obd_observer && observer) { up_write(&obd->obd_observer_link_sem); @@ -1280,51 +1309,79 @@ static inline int obd_register_observer(struct obd_device *obd, } /* metadata helpers */ +enum mps_stat_idx { + LPROC_MD_CLOSE, + LPROC_MD_CREATE, + LPROC_MD_ENQUEUE, + LPROC_MD_GETATTR, + LPROC_MD_INTENT_LOCK, + LPROC_MD_LINK, + LPROC_MD_RENAME, + LPROC_MD_SETATTR, + LPROC_MD_FSYNC, + LPROC_MD_READ_PAGE, + LPROC_MD_UNLINK, + LPROC_MD_SETXATTR, + LPROC_MD_GETXATTR, + LPROC_MD_INTENT_GETATTR_ASYNC, + LPROC_MD_REVALIDATE_LOCK, + LPROC_MD_LAST_OPC, +}; + static inline int md_get_root(struct obd_export *exp, const char *fileset, struct lu_fid *fid) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, get_root); - EXP_MD_COUNTER_INCREMENT(exp, get_root); - rc = MDP(exp->exp_obd, get_root)(exp, fileset, fid); + rc = exp_check_ops(exp); + if (rc) + return rc; - RETURN(rc); + return MDP(exp->exp_obd, get_root)(exp, fileset, fid); } -static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) +static inline int md_getattr(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, getattr); - EXP_MD_COUNTER_INCREMENT(exp, getattr); - rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETATTR); + + return MDP(exp->exp_obd, getattr)(exp, op_data, request); } static inline int md_null_inode(struct obd_export *exp, const struct lu_fid *fid) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, null_inode); - EXP_MD_COUNTER_INCREMENT(exp, null_inode); - rc = MDP(exp->exp_obd, null_inode)(exp, fid); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, null_inode)(exp, fid); } static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, struct md_open_data *mod, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, close); - EXP_MD_COUNTER_INCREMENT(exp, close); - rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CLOSE); + + return MDP(exp->exp_obd, close)(exp, op_data, mod, request); } static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1332,13 +1389,18 @@ static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, struct ptlrpc_request **request) { 
- int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, create); - EXP_MD_COUNTER_INCREMENT(exp, create); - rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, - uid, gid, cap_effective, rdev, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + + return MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, + request); } static inline int md_enqueue(struct obd_export *exp, @@ -1349,24 +1411,29 @@ static inline int md_enqueue(struct obd_export *exp, __u64 extra_lock_flags) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, enqueue); - EXP_MD_COUNTER_INCREMENT(exp, enqueue); - rc = MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, - extra_lock_flags); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_ENQUEUE); + + return MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); } static inline int md_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, getattr_name); - EXP_MD_COUNTER_INCREMENT(exp, getattr_name); - rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, getattr_name)(exp, op_data, request); } static inline int md_intent_lock(struct obd_export *exp, @@ -1377,36 +1444,49 @@ static inline int md_intent_lock(struct obd_export *exp, __u64 extra_lock_flags) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, intent_lock); - EXP_MD_COUNTER_INCREMENT(exp, intent_lock); - rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, cb_blocking, - extra_lock_flags); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_LOCK); + + return MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); } static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, link); - EXP_MD_COUNTER_INCREMENT(exp, link); - rc = MDP(exp->exp_obd, link)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_LINK); + + return MDP(exp->exp_obd, link)(exp, op_data, request); } static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, const char *new, - size_t newlen, struct ptlrpc_request **request) + const char *old_name, size_t oldlen, + const char *new_name, size_t newlen, + struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, rename); - EXP_MD_COUNTER_INCREMENT(exp, rename); - rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, - newlen, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_RENAME); + + return MDP(exp->exp_obd, rename)(exp, op_data, old_name, oldlen, + new_name, newlen, request); } static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, @@ -1414,11 +1494,15 @@ static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, setattr); - 
EXP_MD_COUNTER_INCREMENT(exp, setattr); - rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETATTR); + + return MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); } static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, @@ -1426,12 +1510,27 @@ static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, fsync); - EXP_MD_COUNTER_INCREMENT(exp, fsync); - rc = MDP(exp->exp_obd, fsync)(exp, fid, request); + rc = exp_check_ops(exp); + if (rc) + return rc; - RETURN(rc); + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_FSYNC); + + return MDP(exp->exp_obd, fsync)(exp, fid, request); +} + +/* FLR: resync mirrored files. */ +static inline int md_file_resync(struct obd_export *exp, + struct md_op_data *data) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, file_resync)(exp, data); } static inline int md_read_page(struct obd_export *exp, @@ -1441,23 +1540,31 @@ static inline int md_read_page(struct obd_export *exp, struct page **ppage) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, read_page); - EXP_MD_COUNTER_INCREMENT(exp, read_page); - rc = MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, - ppage); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_READ_PAGE); + + return MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, + ppage); } static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, unlink); - EXP_MD_COUNTER_INCREMENT(exp, unlink); - rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_UNLINK); + + return MDP(exp->exp_obd, unlink)(exp, op_data, request); } static inline int md_get_lustre_md(struct obd_export *exp, @@ -1466,19 +1573,25 @@ static inline int md_get_lustre_md(struct obd_export *exp, struct obd_export *md_exp, struct lustre_md *md) { - ENTRY; - EXP_CHECK_MD_OP(exp, get_lustre_md); - EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); - RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); } static inline int md_free_lustre_md(struct obd_export *exp, struct lustre_md *md) { - ENTRY; - EXP_CHECK_MD_OP(exp, free_lustre_md); - EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); - RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, free_lustre_md)(exp, md); } static inline int md_merge_attr(struct obd_export *exp, @@ -1486,67 +1599,88 @@ static inline int md_merge_attr(struct obd_export *exp, struct cl_attr *attr, ldlm_blocking_callback cb) { - ENTRY; - EXP_CHECK_MD_OP(exp, merge_attr); - EXP_MD_COUNTER_INCREMENT(exp, merge_attr); - RETURN(MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); } static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - 
const char *input, int input_size, - int output_size, int flags, __u32 suppgid, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) { - ENTRY; - EXP_CHECK_MD_OP(exp, setxattr); - EXP_MD_COUNTER_INCREMENT(exp, setxattr); - RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, valid, name, input, - input_size, output_size, flags, - suppgid, request)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETXATTR); + + return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, + suppgid, req); } static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - const char *input, int input_size, - int output_size, int flags, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + size_t buf_size, struct ptlrpc_request **req) { - ENTRY; - EXP_CHECK_MD_OP(exp, getxattr); - EXP_MD_COUNTER_INCREMENT(exp, getxattr); - RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, valid, name, input, - input_size, output_size, flags, - request)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETXATTR); + + return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, + buf_size, req); } static inline int md_set_open_replay_data(struct obd_export *exp, struct obd_client_handle *och, struct lookup_intent *it) { - ENTRY; - EXP_CHECK_MD_OP(exp, set_open_replay_data); - EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); - RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, it)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); } static inline int md_clear_open_replay_data(struct obd_export *exp, struct obd_client_handle *och) { - ENTRY; - EXP_CHECK_MD_OP(exp, clear_open_replay_data); - EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); - RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); } static inline int md_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, void *data, __u64 *bits) { - ENTRY; - EXP_CHECK_MD_OP(exp, set_lock_data); - EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); - RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); } static inline @@ -1555,14 +1689,13 @@ int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, enum ldlm_cancel_flags cancel_flags, void *opaque) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, cancel_unused); - EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + rc = exp_check_ops(exp); + if (rc) + return rc; - rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, - cancel_flags, opaque); - RETURN(rc); + return MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); } static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, @@ -1572,43 +1705,57 @@ static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, enum ldlm_mode mode, struct lustre_handle *lockh) { - ENTRY; - EXP_CHECK_MD_OP(exp, lock_match); - EXP_MD_COUNTER_INCREMENT(exp, lock_match); 
- RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, - policy, mode, lockh)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); } static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, __u32 def_ea_size) { - ENTRY; - EXP_CHECK_MD_OP(exp, init_ea_size); - EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); - RETURN(MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size); } static inline int md_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, intent_getattr_async); - EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); - rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_GETATTR_ASYNC); + + return MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); } static inline int md_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, struct lu_fid *fid, __u64 *bits) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, revalidate_lock); - EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); - rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_REVALIDATE_LOCK); + + return MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); } static inline int md_get_fid_from_lsm(struct obd_export *exp, @@ -1617,13 +1764,14 @@ static inline int md_get_fid_from_lsm(struct obd_export *exp, struct lu_fid *fid) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, get_fid_from_lsm); - EXP_MD_COUNTER_INCREMENT(exp, get_fid_from_lsm); - rc = MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, fid); - RETURN(rc); -} + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, + fid); +} /* Unpack an MD struct from disk to in-memory format. * Returns +ve size of unpacked MD (0 for free), or -ve error. 
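/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * Every md_* wrapper in this header now follows the same shape: validate
 * the export with exp_check_ops() (which returns -ENODEV for a NULL
 * export and -EOPNOTSUPP for a cleaned-up obd), bump the matching
 * LPROC_MD_* counter for the operations listed in enum mps_stat_idx,
 * then dispatch through MDP().  The names md_foo/m_foo/LPROC_MD_FOO
 * below are hypothetical and exist purely to show the pattern.
 */
static inline int md_foo(struct obd_export *exp, struct md_op_data *op_data,
			 struct ptlrpc_request **request)
{
	int rc;

	rc = exp_check_ops(exp);
	if (rc)
		return rc;

	/* only ops with an entry in enum mps_stat_idx are counted in md_stats */
	lprocfs_counter_incr(exp->exp_obd->obd_md_stats, LPROC_MD_FOO);

	return MDP(exp->exp_obd, foo)(exp, op_data, request);
}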
@@ -1636,31 +1784,30 @@ static inline int md_unpackmd(struct obd_export *exp, const union lmv_mds_md *lmm, size_t lmm_size) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, unpackmd); - EXP_MD_COUNTER_INCREMENT(exp, unpackmd); - rc = MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); - RETURN(rc); -} -/* OBD Metadata Support */ + rc = exp_check_ops(exp); + if (rc) + return rc; -extern int obd_init_caches(void); -extern void obd_cleanup_caches(void); + return MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); +} -/* support routines */ -extern struct kmem_cache *obdo_cachep; +static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + int rc; -#define OBDO_ALLOC(ptr) \ -do { \ - OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \ -} while(0) + rc = exp_check_ops(exp); + if (rc) + return rc; -#define OBDO_FREE(ptr) \ -do { \ - OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ -} while(0) + return MDP(exp->exp_obd, rmfid)(exp, fa, rcs, set); +} +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); typedef int (*register_lwp_cb)(void *data); @@ -1692,13 +1839,14 @@ int lustre_register_fs(void); int lustre_unregister_fs(void); int lustre_check_exclusion(struct super_block *sb, char *svname); -/* sysctl.c */ -extern int obd_sysctl_init(void); -extern void obd_sysctl_clean(void); - -/* uuid.c */ typedef __u8 class_uuid_t[16]; -void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); +static inline void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + snprintf(out->uuid, sizeof(out->uuid), "%02x%02x%02x%02x-%02x%02x-" + "%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uu[14], uu[15], uu[12], uu[13], uu[10], uu[11], uu[8], uu[9], + uu[6], uu[7], uu[4], uu[5], uu[2], uu[3], uu[0], uu[1]); +} /* lustre_peer.c */ int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); @@ -1707,7 +1855,7 @@ int class_del_uuid (const char *uuid); int class_check_uuid(struct obd_uuid *uuid, __u64 nid); /* class_obd.c */ -extern char obd_jobid_node[]; +extern char obd_jobid_name[]; /* prng.c */ #define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) @@ -1733,5 +1881,4 @@ extern struct miscdevice obd_psdev; int obd_ioctl_getdata(char **buf, int *len, void __user *arg); int class_procfs_init(void); int class_procfs_clean(void); - #endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h index c22e08fe8cdb2..356585d91932b 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_support.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,8 @@ #ifndef _OBD_SUPPORT #define _OBD_SUPPORT -#ifndef __KERNEL__ -# error Userspace should not include obd_support.h. 
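/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * The OBD_FAIL_* constants that follow are fault-injection points used by
 * the Lustre test suite; they are typically armed at run time with
 * "lctl set_param fail_loc=<value>" and checked in the code paths under
 * test via the OBD_FAIL_CHECK()/OBD_FAIL_TIMEOUT() macros declared in
 * this header.  The handler below is hypothetical and only shows the
 * usual check-and-bail pattern (the -EIO return is an arbitrary example).
 */
static int example_statfs_pack(void)
{
	/* simulate a statfs reply packing failure when fail_loc = 0x11d */
	if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STATFS_PACK))
		return -EIO;

	return 0;
}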
-#endif /* !__KERNEL__ */ - #include +#include #include #include #include @@ -56,6 +53,7 @@ enum { extern unsigned int obd_debug_peer_on_timeout; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_dump_on_eviction; +extern unsigned int obd_lbug_on_eviction; /* obd_timeout should only be used for recovery, not for networking / disk / timings affected by load (use Adaptive Timeouts) */ extern unsigned int obd_timeout; /* seconds */ @@ -70,7 +68,6 @@ extern int at_early_margin; extern int at_extra; extern unsigned long obd_max_dirty_pages; extern atomic_long_t obd_dirty_pages; -extern atomic_long_t obd_dirty_transit_pages; extern char obd_jobid_var[]; /* Some hash init argument constants */ @@ -182,7 +179,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_GET_ROOT_NET 0x11b #define OBD_FAIL_MDS_GET_ROOT_PACK 0x11c #define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_SUM_PACK 0x11d #define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_STATFS_SUM_NET 0x11e #define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f #define OBD_FAIL_MDS_PIN_NET 0x120 #define OBD_FAIL_MDS_UNPIN_NET 0x121 @@ -245,11 +244,16 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b #define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d #define OBD_FAIL_MDS_INTENT_DELAY 0x160 #define OBD_FAIL_MDS_XATTR_REP 0x161 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 #define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 #define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 +#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 +#define OBD_FAIL_MDS_RMFID_NET 0x166 +#define OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_MDS_REINT_OPEN2 0x16a /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 @@ -265,6 +269,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 #define OBD_FAIL_MDS_GET_INFO_NET 0x186 #define OBD_FAIL_MDS_DQACQ_NET 0x187 +#define OBD_FAIL_MDS_STRIPE_CREATE 0x188 +#define OBD_FAIL_MDS_STRIPE_FID 0x189 /* OI scrub */ #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 @@ -275,6 +281,12 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 #define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 #define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 +#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 +#define OBD_FAIL_OSD_INDEX_CRASH 0x199 + +#define OBD_FAIL_OSD_TXN_START 0x19a + +#define OBD_FAIL_OSD_DUPLICATE_MAP 0x19b #define OBD_FAIL_OFD_SET_OID 0x1e0 @@ -329,6 +341,14 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_PAUSE_PUNCH 0x236 #define OBD_FAIL_OST_LADVISE_PAUSE 0x237 #define OBD_FAIL_OST_FAKE_RW 0x238 +#define OBD_FAIL_OST_LIST_ASSERT 0x239 +#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 +#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 +#define OBD_FAIL_OST_STATFS_DELAY 0x242 +#define OBD_FAIL_OST_INTEGRITY_FAULT 0x243 +#define OBD_FAIL_OST_INTEGRITY_CMP 0x244 +#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 +#define OBD_FAIL_OST_2BIG_NIOBUF 0x248 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 @@ -371,9 +391,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a #define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b +#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 @@ -399,6 +421,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSC_DELAY_SETTIME 0x412 #define 
OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 #define OBD_FAIL_OSC_DELAY_IO 0x414 +#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415 #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 @@ -427,19 +450,21 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 #define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 +#define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 #define OBD_FAIL_OBD_PING_NET 0x600 -#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +/* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ #define OBD_FAIL_OBD_LOGD_NET 0x602 /* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ #define OBD_FAIL_OBD_DQACQ 0x604 #define OBD_FAIL_OBD_LLOG_SETUP 0x605 -#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +/* OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 obsolete since 1.5 */ #define OBD_FAIL_OBD_IDX_READ_NET 0x607 #define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 #define OBD_FAIL_OBD_NO_LRU 0x609 #define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a +#define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 @@ -462,14 +487,19 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_CLIENT_DEL 0x718 #define OBD_FAIL_TGT_SLUGGISH_NET 0x719 #define OBD_FAIL_TGT_RCVD_EIO 0x720 +#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 +#define OBD_FAIL_TGT_NO_GRANT 0x725 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 #define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 -#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 /* deprecated */ #define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 #define OBD_FAIL_MDC_CLOSE 0x806 +#define OBD_FAIL_MDC_MERGE 0x807 +#define OBD_FAIL_MDC_GLIMPSE_DDOS 0x808 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 @@ -501,6 +531,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_FLD 0x1100 #define OBD_FAIL_FLD_QUERY_NET 0x1101 #define OBD_FAIL_FLD_READ_NET 0x1102 +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 #define OBD_FAIL_SEC_CTX 0x1200 #define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 @@ -509,18 +540,25 @@ extern char obd_jobid_var[]; #define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 #define OBD_FAIL_LLOG 0x1300 -#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +/* was OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 until 2.4 */ #define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 until 2.11 */ #define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 -#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ +/* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ #define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 #define OBD_FAIL_SEQ_ALLOC 0x1311 #define OBD_FAIL_CAT_RECORDS 0x1312 +#define OBD_FAIL_CAT_FREE_RECORDS 0x1313 +#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 +#define CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315 +#define OBD_FAIL_FORCE_GC_THREAD 0x1316 +#define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 +#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 +#define OBD_FAIL_CATLIST 0x131b #define OBD_FAIL_LLITE 0x1400 #define 
OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 @@ -536,9 +574,10 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a #define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b #define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c -#define OBD_FAIL_LLITE_PTASK_IO_FAIL 0x140d #define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e #define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f +#define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 +#define OBD_FAIL_LLITE_SHORT_COMMIT 0x1415 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 @@ -587,9 +626,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c -#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d +#define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f +#define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630 +#define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631 #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 @@ -603,14 +644,17 @@ extern char obd_jobid_var[]; #define OBD_FAIL_INVALIDATE_UPDATE 0x1705 /* MIGRATE */ -#define OBD_FAIL_MIGRATE_NET_REP 0x1800 #define OBD_FAIL_MIGRATE_ENTRIES 0x1801 -#define OBD_FAIL_MIGRATE_LINKEA 0x1802 -#define OBD_FAIL_MIGRATE_DELAY 0x1803 /* LMV */ #define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 +/* FLR */ +#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00 +#define OBD_FAIL_FLR_LV_DELAY 0x1A01 +#define OBD_FAIL_FLR_LV_INC 0x1A02 +#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03 + /* DT */ #define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 #define OBD_FAIL_DT_ATTR_GET 0x2001 @@ -642,14 +686,19 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 #define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 #define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 +#define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 +#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 - /* barrier */ +/* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 #define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 #define OBD_FAIL_BARRIER_DELAY 0x2202 #define OBD_FAIL_BARRIER_FAILURE 0x2203 +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + /* Assign references to moved code to reduce code changes */ #define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) #define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) @@ -731,11 +780,13 @@ static inline void obd_memory_sub(long size) #define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ do { \ - (ptr) = (cptab) == NULL ? \ - kmalloc(size, (flags) | __GFP_ZERO) : \ - cfs_cpt_malloc(cptab, cpt, size, (flags) | __GFP_ZERO); \ - if (likely((ptr) != NULL)) \ - OBD_ALLOC_POST(ptr, size, "kmalloced"); \ + if (cptab) \ + ptr = cfs_cpt_malloc((cptab), (cpt), (size), \ + (flags) | __GFP_ZERO | __GFP_NOWARN); \ + if (!(cptab) || unlikely(!(ptr))) /* retry without CPT if failure */ \ + ptr = kmalloc(size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST((ptr), (size), "kmalloced"); \ } while (0) #define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ @@ -762,7 +813,7 @@ do { \ #define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ do { \ (ptr) = cptab == NULL ? 
\ - __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO): \ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ cfs_cpt_vzalloc(cptab, cpt, size); \ if (unlikely((ptr) == NULL)) { \ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ @@ -823,7 +874,7 @@ do { \ do { \ if (is_vmalloc_addr(ptr)) { \ OBD_FREE_PRE(ptr, size, "vfreed"); \ - libcfs_vfree_atomic(ptr); \ + libcfs_vfree_atomic(ptr); \ POISON_PTR(ptr); \ } else { \ OBD_FREE(ptr, size); \ @@ -911,4 +962,29 @@ static inline int lustre_to_lma_flags(__u32 la_flags) return (la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0; } +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. + */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); +} + #endif diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h index c381f77f0045e..8c88de86005ea 100644 --- a/drivers/staging/lustrefsx/lustre/include/obj_update.h +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -31,7 +31,7 @@ #ifndef _OBJ_UPDATE_H_ #define _OBJ_UPDATE_H_ -#include +#include static inline size_t object_update_param_size(const struct object_update_param *param) diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h index 616ee3a78e68b..374d1932f0bdf 100644 --- a/drivers/staging/lustrefsx/lustre/include/seq_range.h +++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h @@ -34,7 +34,7 @@ #ifndef _SEQ_RANGE_H_ #define _SEQ_RANGE_H_ -#include +#include /** * computes the sequence range type \a range diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..38084241d8998 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
+ + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. + * + * Author: Fan, Yong + */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H + +#include +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; + +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; + +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h similarity index 77% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h index b1f68d50b0242..30d5c7d614892 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h @@ -33,14 +33,10 @@ #ifndef _UAPI_LUSTRE_CFG_H #define _UAPI_LUSTRE_CFG_H +#include #include -#include - -/* Handle older distros */ -#ifndef __ALIGN_KERNEL -# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) -#endif +#include +#include /** \defgroup cfg cfg * @@ -139,6 +135,8 @@ enum lcfg_command_type { * users */ LCFG_NODEMAP_MAP_MODE = 0x00ce059, /**< set the mapping mode */ + LCFG_NODEMAP_AUDIT_MODE = 0x00ce05a, /**< set the audit mode */ + LCFG_NODEMAP_SET_SEPOL = 0x00ce05b, /**< set SELinux policy */ }; struct lustre_cfg_bufs { @@ -160,6 +158,57 @@ struct lustre_cfg { __u32 lcfg_buflens[0]; }; +struct lcfg_type_data { + __u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +}; + +static struct lcfg_type_data lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { 
"1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static inline struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + enum cfg_record_type { PORTALS_CFG_TYPE = 1, LUSTRE_CFG_TYPE = 123, @@ -201,7 +250,7 @@ static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) { __u32 i; - size_t offset; + __kernel_size_t offset; __u32 bufcount; if (!lcfg) @@ -261,7 +310,7 @@ static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, } } -static inline int lustre_cfg_sanity_check(void *buf, size_t len) +static inline int lustre_cfg_sanity_check(void *buf, __kernel_size_t len) { struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h similarity index 85% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h index 8887c82d3b8b9..e9cbf3066738a 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h @@ -29,8 +29,6 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * uapi/linux/lustre_disk.h - * * Lustre disk format definitions. * * Author: Nathan Rutman @@ -62,11 +60,16 @@ #define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" #define QMT_DIR "quota_master" #define QSD_DIR "quota_slave" +#define QSD_DIR_DT "quota_slave_dt" +#define QSD_DIR_MD "quota_slave_md" #define HSM_ACTIONS "hsm_actions" #define LFSCK_DIR "LFSCK" #define LFSCK_BOOKMARK "lfsck_bookmark" #define LFSCK_LAYOUT "lfsck_layout" #define LFSCK_NAMESPACE "lfsck_namespace" +#define REMOTE_PARENT_DIR "REMOTE_PARENT_DIR" +#define INDEX_BACKUP_DIR "index_backup" +#define MDT_ORPHAN_DIR "PENDING" /****************** persistent mount data *********************/ @@ -88,7 +91,7 @@ /** regenerate config logs for this fs or server */ #define LDD_F_WRITECONF 0x0100 /** COMPAT_14 */ -#define LDD_F_UPGRADE14 0x0200 +/*#define LDD_F_UPGRADE14 0x0200 deprecated since 1.8 */ /** process as lctl conf_param */ #define LDD_F_PARAM 0x0400 /** all nodes are specified as service nodes */ @@ -114,36 +117,9 @@ enum ldd_mount_type { LDD_MT_LAST }; -/* On-disk configuration file. In host-endian order. 
*/ -struct lustre_disk_data { - __u32 ldd_magic; - __u32 ldd_feature_compat; /* compatible feature flags */ - __u32 ldd_feature_rocompat; /* read-only compatible feature flags */ - __u32 ldd_feature_incompat; /* incompatible feature flags */ - - __u32 ldd_config_ver; /* config rewrite count - not used */ - __u32 ldd_flags; /* LDD_SV_TYPE */ - __u32 ldd_svindex; /* server index (0001), must match - * svname - */ - __u32 ldd_mount_type; /* target fs type LDD_MT_* */ - char ldd_fsname[64]; /* filesystem this server is part of, - * MTI_NAME_MAXLEN - */ - char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ - __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ - - char ldd_userdata[1024 - 200]; /* arbitrary user string '200' */ - __u8 ldd_padding[4096 - 1024]; /* 1024 */ - char ldd_mount_opts[4096]; /* target fs mount opts '4096' */ - char ldd_params[4096]; /* key=value pairs '8192' */ -}; - /****************** last_rcvd file *********************/ #define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ -#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ - #define LR_SERVER_SIZE 512 #define LR_CLIENT_START 8192 #define LR_CLIENT_SIZE 128 diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h similarity index 97% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h index 3e58dd5329c3f..f11ad3b3b2115 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Copyright 2016 Cray Inc, all rights reserved. * Author: Ben Evans. @@ -37,7 +37,8 @@ #ifndef _UAPI_LUSTRE_FID_H_ #define _UAPI_LUSTRE_FID_H_ -#include +#include +#include /** returns fid object sequence */ static inline __u64 fid_seq(const struct lu_fid *fid) @@ -277,7 +278,7 @@ static inline bool fid_is_last_id(const struct lu_fid *fid) * \param fid an igif to get inode number from. * \return inode number for the igif. */ -static inline ino_t lu_igif_ino(const struct lu_fid *fid) +static inline __kernel_ino_t lu_igif_ino(const struct lu_fid *fid) { return fid_seq(fid); } diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h new file mode 100644 index 0000000000000..8cdb05dedbd8c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#include +#include +#include + +/* XXX: We use fiemap_extent::fe_reserved[0] */ +#define fe_device fe_reserved[0] + +static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned int fiemap_size_to_count(__kernel_size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h similarity index 81% rename from drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h index f2c850c0f1848..fb26eaeceec28 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -23,14 +23,12 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * lustre/include/lustre/lustre_idl.h - * * Lustre wire protocol definitions. */ @@ -42,7 +40,7 @@ * that are used in interfaces with userspace should go in lustre_user.h. * * All structs being declared here should be built from simple fixed-size - * types (__u8, __u16, __u32, __u64) or be built from other types or + * types defined in linux/types.h or be built from other types or * structs also declared in this file. Similarly, all flags and magic * values in those structs should also be declared here. This ensures * that the Lustre wire protocol is not influenced by external dependencies. @@ -70,11 +68,24 @@ #define _LUSTRE_IDL_H_ #include +#include +#include #include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif +#include +#include -#include -#include /* Defn's shared with user-space. 
*/ -#include +#if defined(__cplusplus) +extern "C" { +#endif /* * GENERAL STUFF @@ -86,25 +97,25 @@ #define CONNMGR_REQUEST_PORTAL 1 #define CONNMGR_REPLY_PORTAL 2 -//#define OSC_REQUEST_PORTAL 3 +/* #define OSC_REQUEST_PORTAL 3*/ #define OSC_REPLY_PORTAL 4 -//#define OSC_BULK_PORTAL 5 +/*#define OSC_BULK_PORTAL 5*/ #define OST_IO_PORTAL 6 #define OST_CREATE_PORTAL 7 #define OST_BULK_PORTAL 8 -//#define MDC_REQUEST_PORTAL 9 +/*#define MDC_REQUEST_PORTAL 9*/ #define MDC_REPLY_PORTAL 10 -//#define MDC_BULK_PORTAL 11 +/*#define MDC_BULK_PORTAL 11*/ #define MDS_REQUEST_PORTAL 12 -//#define MDS_REPLY_PORTAL 13 +#define MDS_IO_PORTAL 13 #define MDS_BULK_PORTAL 14 #define LDLM_CB_REQUEST_PORTAL 15 #define LDLM_CB_REPLY_PORTAL 16 #define LDLM_CANCEL_REQUEST_PORTAL 17 #define LDLM_CANCEL_REPLY_PORTAL 18 -//#define PTLBD_REQUEST_PORTAL 19 -//#define PTLBD_REPLY_PORTAL 20 -//#define PTLBD_BULK_PORTAL 21 +/*#define PTLBD_REQUEST_PORTAL 19*/ +/*#define PTLBD_REPLY_PORTAL 20*/ +/*#define PTLBD_BULK_PORTAL 21*/ #define MDS_SETATTR_PORTAL 22 #define MDS_READPAGE_PORTAL 23 #define OUT_PORTAL 24 @@ -117,28 +128,8 @@ #define SEQ_DATA_PORTAL 31 #define SEQ_CONTROLLER_PORTAL 32 #define MGS_BULK_PORTAL 33 - -/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ - -/* packet types */ -#define PTL_RPC_MSG_REQUEST 4711 -#define PTL_RPC_MSG_ERR 4712 -#define PTL_RPC_MSG_REPLY 4713 - -/* DON'T use swabbed values of MAGIC as magic! */ -#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 -#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B - -#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 - -#define PTLRPC_MSG_VERSION 0x00000003 -#define LUSTRE_VERSION_MASK 0xffff0000 -#define LUSTRE_OBD_VERSION 0x00010000 -#define LUSTRE_MDS_VERSION 0x00020000 -#define LUSTRE_OST_VERSION 0x00030000 -#define LUSTRE_DLM_VERSION 0x00040000 -#define LUSTRE_LOG_VERSION 0x00050000 -#define LUSTRE_MGS_VERSION 0x00060000 +/* #define DVS_PORTAL 63 */ +/* reserved for Cray DVS - spitzcor@cray.com, roe@cray.com, n8851@cray.com */ /** * Describes a range of sequence, lsr_start is included but lsr_end is @@ -178,12 +169,14 @@ extern void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, __u32 compat, __u32 incompat); -/* copytool uses a 32b bitmask field to encode archive-Ids during register - * with MDT thru kuc. +/* copytool can use any nonnegative integer to represent archive-Ids during + * register with MDT thru kuc. * archive num = 0 => all - * archive num from 1 to 32 + * archive num from 1 to MAX_U32 */ -#define LL_HSM_MAX_ARCHIVE (sizeof(__u32) * 8) +#define LL_HSM_ORIGIN_MAX_ARCHIVE (sizeof(__u32) * 8) +/* the max count of archive ids that one agent can support */ +#define LL_HSM_MAX_ARCHIVES_PER_AGENT 1024 /** * HSM on-disk attributes stored in a separate xattr. 
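/*
 * Illustrative sketch, not part of the patch: archive IDs are now plain
 * integers (with 0 meaning "all archives") rather than bits in a 32-bit
 * mask, so an agent-side limit check only bounds how many IDs one agent
 * registers, instead of forcing each ID into the range 1..32.  The helper
 * name below is hypothetical.
 */
static inline int hsm_archive_count_valid(int archive_count)
{
        /* one agent may register at most LL_HSM_MAX_ARCHIVES_PER_AGENT IDs */
        return archive_count > 0 &&
               archive_count <= LL_HSM_MAX_ARCHIVES_PER_AGENT;
}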
@@ -389,6 +382,23 @@ struct lu_orphan_ent_v2 { struct lu_orphan_rec_v2 loe_rec; }; +struct lu_orphan_rec_v3 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + /* The OST-object declared layout version in PFID EA.*/ + __u32 lor_layout_version; + /* The OST-object declared layout range (of version) in PFID EA.*/ + __u32 lor_range; + __u32 lor_padding_1; + __u64 lor_padding_2; +}; + +struct lu_orphan_ent_v3 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v3 loe_rec; +}; + /** @} lu_fid */ /** \defgroup lu_dir lu_dir @@ -514,18 +524,21 @@ static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) return next; } -static inline size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +static inline __kernel_size_t lu_dirent_calc_size(size_t namelen, __u16 attr) { - size_t size; + __kernel_size_t size; if (attr & LUDA_TYPE) { - const size_t align = sizeof(struct luda_type) - 1; - size = (sizeof(struct lu_dirent) + namelen + align) & ~align; - size += sizeof(struct luda_type); - } else - size = sizeof(struct lu_dirent) + namelen; + const __kernel_size_t align = sizeof(struct luda_type) - 1; + + size = (sizeof(struct lu_dirent) + namelen + 1 + align) & + ~align; + size += sizeof(struct luda_type); + } else { + size = sizeof(struct lu_dirent) + namelen + 1; + } - return (size + 7) & ~7; + return (size + 7) & ~7; } #define MDS_DIR_END_OFF 0xfffffffffffffffeULL @@ -569,59 +582,109 @@ static inline void lustre_handle_copy(struct lustre_handle *tgt, tgt->cookie = src->cookie; } -struct lustre_handle_array { - unsigned int count; - struct lustre_handle handles[0]; +/* lustre_msg struct magic. DON'T use swabbed values of MAGIC as magic! */ +enum lustre_msg_magic { + LUSTRE_MSG_MAGIC_V2 = 0x0BD00BD3, + LUSTRE_MSG_MAGIC_V2_SWABBED = 0xD30BD00B, + LUSTRE_MSG_MAGIC = LUSTRE_MSG_MAGIC_V2 }; /* flags for lm_flags */ -#define MSGHDR_AT_SUPPORT 0x1 -#define MSGHDR_CKSUM_INCOMPAT18 0x2 +enum lustre_msghdr { + MSGHDR_AT_SUPPORT = 0x1, /* adaptive timeouts, lm_cksum valid + * in early reply messages */ + MSGHDR_CKSUM_INCOMPAT18 = 0x2, /* compat for 1.8, needs to be set well + * beyond 2.8.0 for compatibility */ +}; #define lustre_msg lustre_msg_v2 /* we depend on this structure to be 8-byte aligned */ /* this type is only endian-adjusted in lustre_unpack_msg() */ struct lustre_msg_v2 { - __u32 lm_bufcount; - __u32 lm_secflvr; - __u32 lm_magic; - __u32 lm_repsize; - __u32 lm_cksum; - __u32 lm_flags; - __u32 lm_padding_2; - __u32 lm_padding_3; - __u32 lm_buflens[0]; -}; - -/* without gss, ptlrpc_body is put at the first buffer. */ + __u32 lm_bufcount; /* number of buffers in lm_buflens[] */ + __u32 lm_secflvr; /* 0 = no crypto, or sptlrpc security flavour */ + __u32 lm_magic; /* RPC version magic = LUSTRE_MSG_MAGIC_V2 */ + __u32 lm_repsize; /* size of preallocated reply buffer */ + __u32 lm_cksum; /* CRC32 of ptlrpc_body early reply messages */ + __u32 lm_flags; /* enum lustre_msghdr MSGHDR_* flags */ + __u32 lm_padding_2; /* unused */ + __u32 lm_padding_3; /* unused */ + __u32 lm_buflens[0]; /* length of additional buffers in bytes, + * padded to a multiple of 8 bytes. */ + /* + * message buffers are packed after padded lm_buflens[] array, + * padded to a multiple of 8 bytes each to align contents. 
+ */ +}; + +/* ptlrpc_body packet pb_types */ +#define PTL_RPC_MSG_REQUEST 4711 /* normal RPC request message */ +#define PTL_RPC_MSG_ERR 4712 /* error reply if request unprocessed */ +#define PTL_RPC_MSG_REPLY 4713 /* normal RPC reply message */ + +/* ptlrpc_body pb_version ((target_version << 16) | rpc_version) */ +enum lustre_msg_version { + PTLRPC_MSG_VERSION = 0x00000003, + LUSTRE_VERSION_MASK = 0xffff0000, + LUSTRE_OBD_VERSION = 0x00010000, + LUSTRE_MDS_VERSION = 0x00020000, + LUSTRE_OST_VERSION = 0x00030000, + LUSTRE_DLM_VERSION = 0x00040000, + LUSTRE_LOG_VERSION = 0x00050000, + LUSTRE_MGS_VERSION = 0x00060000, +}; + +/* pb_flags that apply to all request messages */ +/* #define MSG_LAST_REPLAY 0x0001 obsolete 2.0 => {REQ,LOCK}_REPLAY_DONE */ +#define MSG_RESENT 0x0002 /* was previously sent, no reply seen */ +#define MSG_REPLAY 0x0004 /* was processed, got reply, recovery */ +/* #define MSG_AT_SUPPORT 0x0008 obsolete since 1.5, AT always enabled */ +/* #define MSG_DELAY_REPLAY 0x0010 obsolete since 2.0 */ +/* #define MSG_VERSION_REPLAY 0x0020 obsolete since 1.8.2, VBR always on */ +#define MSG_REQ_REPLAY_DONE 0x0040 /* request replay over, locks next */ +#define MSG_LOCK_REPLAY_DONE 0x0080 /* lock replay over, client done */ + +/* pb_op_flags for connect opcodes: MDS_CONNECT, OST_CONNECT, MGS_CONNECT */ +#define MSG_CONNECT_RECOVERING 0x00000001 /* target is in recovery */ +#define MSG_CONNECT_RECONNECT 0x00000002 /* tgt already has client import */ +#define MSG_CONNECT_REPLAYABLE 0x00000004 /* target supports RPC replay */ +/* #define MSG_CONNECT_PEER 0x00000008 obsolete since 1.2, removed in 1.5 */ +#define MSG_CONNECT_LIBCLIENT 0x00000010 /* obsolete since 2.3, removed 2.6 */ +#define MSG_CONNECT_INITIAL 0x00000020 /* first client connection attempt */ +/* #define MSG_CONNECT_ASYNC 0x00000040 obsolete since 1.5 */ +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* client sent transno in replay */ + +/* number of previous object versions in pb_pre_versions[] */ #define PTLRPC_NUM_VERSIONS 4 +/* without gss, ptlrpc_body is put at the first buffer. */ struct ptlrpc_body_v3 { struct lustre_handle pb_handle; - __u32 pb_type; - __u32 pb_version; - __u32 pb_opc; - __u32 pb_status; - __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ - __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u32 pb_type; /* request/reply/err type: PTL_RPC_MSG_* */ + __u32 pb_version; /* LUSTRE_*_VERSION | PTLRPC_MSG_VERSION */ + __u32 pb_opc; /* RPC opcodes: MDS_*, OST_*, LDLM_, ... 
*/ + __u32 pb_status; /* negative Linux x86 error number */ + __u64 pb_last_xid; /* highest replied XID w/o lower unreplied XID*/ + __u16 pb_tag; /* multiple modifying RPCs virtual slot index */ __u16 pb_padding0; __u32 pb_padding1; - __u64 pb_last_committed; - __u64 pb_transno; - __u32 pb_flags; - __u32 pb_op_flags; - __u32 pb_conn_cnt; - __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ - __u32 pb_service_time; /* for rep, actual service time */ - __u32 pb_limit; - __u64 pb_slv; - /* VBR: pre-versions */ + __u64 pb_last_committed;/* rep: highest pb_transno committed to disk */ + __u64 pb_transno; /* server-assigned transno for modifying RPCs */ + __u32 pb_flags; /* req: MSG_* flags */ + __u32 pb_op_flags; /* req: MSG_CONNECT_* flags */ + __u32 pb_conn_cnt; /* connect instance of this client on server */ + __u32 pb_timeout; /* req: max wait time; rep: service estimate */ + __u32 pb_service_time; /* rep: server arrival to reply in seconds */ + __u32 pb_limit; /* rep: dynamic DLM LRU lock count limit */ + __u64 pb_slv; /* rep: dynamic DLM LRU server lock volume */ + /* VBR: rep: previous pb_version(s) of objects modified by this RPC */ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; __u64 pb_mbits; /**< match bits for bulk request */ - /* padding for future needs */ + /* padding for future needs - fix lustre_swab_ptlrpc_body() also */ __u64 pb_padding64_0; __u64 pb_padding64_1; __u64 pb_padding64_2; - char pb_jobid[LUSTRE_JOBID_SIZE]; + char pb_jobid[LUSTRE_JOBID_SIZE]; /* req: ASCII jobid from env + NUL */ }; #define ptlrpc_body ptlrpc_body_v3 @@ -677,38 +740,6 @@ struct ptlrpc_body_v2 { /** only use in req->rq_{req,rep}_swab_mask */ #define MSG_PTLRPC_HEADER_OFF 31 -/* Flags that are operation-specific go in the top 16 bits. */ -#define MSG_OP_FLAG_MASK 0xffff0000 -#define MSG_OP_FLAG_SHIFT 16 - -/* Flags that apply to all requests are in the bottom 16 bits */ -#define MSG_GEN_FLAG_MASK 0x0000ffff -#define MSG_LAST_REPLAY 0x0001 -#define MSG_RESENT 0x0002 -#define MSG_REPLAY 0x0004 -/* #define MSG_AT_SUPPORT 0x0008 - * This was used in early prototypes of adaptive timeouts, and while there - * shouldn't be any users of that code there also isn't a need for using this - * bits. Defer usage until at least 1.10 to avoid potential conflict. 
*/ -#define MSG_DELAY_REPLAY 0x0010 -#define MSG_VERSION_REPLAY 0x0020 -#define MSG_REQ_REPLAY_DONE 0x0040 -#define MSG_LOCK_REPLAY_DONE 0x0080 - -/* - * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) - */ - -#define MSG_CONNECT_RECOVERING 0x00000001 -#define MSG_CONNECT_RECONNECT 0x00000002 -#define MSG_CONNECT_REPLAYABLE 0x00000004 -//#define MSG_CONNECT_PEER 0x8 -#define MSG_CONNECT_LIBCLIENT 0x00000010 -#define MSG_CONNECT_INITIAL 0x00000020 -#define MSG_CONNECT_ASYNC 0x00000040 -#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ -#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ - /* Connect flags */ #define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ #define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ @@ -783,14 +814,27 @@ struct ptlrpc_body_v2 { RPCs in parallel */ #define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ #define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ -#define OBD_CONNECT_LOCK_AHEAD 0x1000000000000000ULL /* lock ahead */ +#define OBD_CONNECT_LOCKAHEAD_OLD 0x1000000000000000ULL /* Old Cray lockahead */ + /** bulk matchbits is sent within ptlrpc_body */ #define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL #define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ #define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ /* ocd_connect_flags2 flags */ -#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ - +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ +#define OBD_CONNECT2_LOCKAHEAD 0x2ULL /* ladvise lockahead v2 */ +#define OBD_CONNECT2_DIR_MIGRATE 0x4ULL /* migrate striped dir */ +#define OBD_CONNECT2_SUM_STATFS 0x8ULL /* MDT return aggregated stats */ +#define OBD_CONNECT2_FLR 0x20ULL /* FLR support */ +#define OBD_CONNECT2_WBC_INTENTS 0x40ULL /* create/unlink/... intents for wbc, also operations under client-held parent locks */ +#define OBD_CONNECT2_LOCK_CONVERT 0x80ULL /* IBITS lock convert support */ +#define OBD_CONNECT2_ARCHIVE_ID_ARRAY 0x100ULL /* store HSM archive_id in array */ +#define OBD_CONNECT2_SELINUX_POLICY 0x400ULL /* has client SELinux policy */ +#define OBD_CONNECT2_LSOM 0x800ULL /* LSOM support */ +#define OBD_CONNECT2_ASYNC_DISCARD 0x4000ULL /* support async DoM data discard */ +#define OBD_CONNECT2_ENCRYPT 0x8000ULL /* client-to-disk encrypt */ +#define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ +#define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ /* XXX README XXX: * Please DO NOT add flag values here before first ensuring that this same * flag value is not in use on some other branch. 
Please clear any such @@ -832,13 +876,23 @@ struct ptlrpc_body_v2 { OBD_CONNECT_FLOCK_DEAD | \ OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ OBD_CONNECT_OPEN_BY_FID | \ - OBD_CONNECT_DIR_STRIPE | \ - OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \ OBD_CONNECT_MULTIMODRPCS | \ OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \ - OBD_CONNECT_FLAGS2) - -#define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 (OBD_CONNECT2_FILE_SECCTX | OBD_CONNECT2_FLR | \ + OBD_CONNECT2_SUM_STATFS | \ + OBD_CONNECT2_LOCK_CONVERT | \ + OBD_CONNECT2_DIR_MIGRATE | \ + OBD_CONNECT2_ARCHIVE_ID_ARRAY | \ + OBD_CONNECT2_SELINUX_POLICY | \ + OBD_CONNECT2_LSOM | \ + OBD_CONNECT2_ASYNC_DISCARD | \ + OBD_CONNECT2_GETATTR_PFID) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ @@ -856,10 +910,12 @@ struct ptlrpc_body_v2 { OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ OBD_CONNECT_BULK_MBITS | \ - OBD_CONNECT_GRANT_PARAM) -#define OST_CONNECT_SUPPORTED2 0 + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) -#define ECHO_CONNECT_SUPPORTED 0 +#define OST_CONNECT_SUPPORTED2 OBD_CONNECT2_LOCKAHEAD + +#define ECHO_CONNECT_SUPPORTED (OBD_CONNECT_FID) #define ECHO_CONNECT_SUPPORTED2 0 #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ @@ -871,6 +927,7 @@ struct ptlrpc_body_v2 { /* Features required for this version of the client to work with server */ #define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ + OBD_CONNECT_ATTRFID | \ OBD_CONNECT_FULL20) /* This structure is used for both request and reply. @@ -927,21 +984,43 @@ struct obd_connect_data { /* * Supported checksum algorithms. Up to 32 checksum types are supported. * (32-bit mask stored in obd_connect_data::ocd_cksum_types) - * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new - * algorithm and also the OBD_FL_CKSUM* flags. + * Please update DECLARE_CKSUM_NAME in obd_cksum.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags, OBD_CKSUM_ALL flag, + * OBD_FL_CKSUM_ALL flag and potentially OBD_CKSUM_T10_ALL flag. + */ +enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C = 0x00000004, + OBD_CKSUM_RESERVED = 0x00000008, + OBD_CKSUM_T10IP512 = 0x00000010, + OBD_CKSUM_T10IP4K = 0x00000020, + OBD_CKSUM_T10CRC512 = 0x00000040, + OBD_CKSUM_T10CRC4K = 0x00000080, +}; + +#define OBD_CKSUM_T10_ALL (OBD_CKSUM_T10IP512 | OBD_CKSUM_T10IP4K | \ + OBD_CKSUM_T10CRC512 | OBD_CKSUM_T10CRC4K) + +#define OBD_CKSUM_ALL (OBD_CKSUM_CRC32 | OBD_CKSUM_ADLER | OBD_CKSUM_CRC32C | \ + OBD_CKSUM_T10_ALL) + +/* + * The default checksum algorithm used on top of T10PI GRD tags for RPC. + * Considering that the checksum-of-checksums is only computing CRC32 on a + * 4KB chunk of GRD tags for a 1MB RPC for 512B sectors, or 16KB of GRD + * tags for 16MB of 4KB sectors, this is only 1/256 or 1/1024 of the + * total data being checksummed, so the checksum type used here should not + * affect overall system performance noticeably. 
*/ -typedef enum cksum_types { - OBD_CKSUM_CRC32 = 0x00000001, - OBD_CKSUM_ADLER = 0x00000002, - OBD_CKSUM_CRC32C= 0x00000004, -} cksum_type_t; +#define OBD_CKSUM_T10_TOP OBD_CKSUM_ADLER /* * OST requests: OBDO & OBD request records */ /* opcodes */ -typedef enum { +enum ost_cmd { OST_REPLY = 0, /* reply ? */ OST_GETATTR = 1, OST_SETATTR = 2, @@ -962,8 +1041,10 @@ typedef enum { OST_QUOTACTL = 19, OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ OST_LADVISE = 21, - OST_LAST_OPC /* must be < 33 to avoid MDS_GETATTR */ -} ost_cmd_t; + OST_LAST_OPC, /* must be < 33 to avoid MDS_GETATTR */ + OST_FALLOCATE = 22, + OST_SEEK = 23, +}; #define OST_FIRST_OPC OST_REPLY enum obdo_flags { @@ -980,13 +1061,16 @@ enum obdo_flags { OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ - OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ - OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ - OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ - OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ - OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ - OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ - OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_T10IP512 = 0x00005000, /* T10PI IP cksum, 512B sector */ + OBD_FL_CKSUM_T10IP4K = 0x00006000, /* T10PI IP cksum, 4KB sector */ + OBD_FL_CKSUM_T10CRC512 = 0x00007000, /* T10PI CRC cksum, 512B sector */ + OBD_FL_CKSUM_T10CRC4K = 0x00008000, /* T10PI CRC cksum, 4KB sector */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. * XXX: obsoleted - reserved for old * clients prior than 2.2 */ OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ @@ -995,10 +1079,18 @@ enum obdo_flags { OBD_FL_SHORT_IO = 0x00400000, /* short io request */ /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ - /* Note that while these checksum values are currently separate bits, - * in 2.x we can actually allow all values from 1-31 if we wanted. */ + /* + * Note that while the original checksum values were separate bits, + * in 2.x we can actually allow all values from 1-31. T10-PI checksum + * types already use values which are not separate bits. + */ OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | - OBD_FL_CKSUM_CRC32C, + OBD_FL_CKSUM_CRC32C | OBD_FL_CKSUM_T10IP512 | + OBD_FL_CKSUM_T10IP4K | OBD_FL_CKSUM_T10CRC512 | + OBD_FL_CKSUM_T10CRC4K, + + OBD_FL_NO_QUOTA_ALL = OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA | + OBD_FL_NO_PRJQUOTA, }; /* @@ -1035,10 +1127,10 @@ enum obdo_flags { * those *_DEF magics are only used on server side internally, they * won't be put on wire or disk. 
*/ -#define LOV_MAGIC_DEF 0x10000000 -#define LOV_MAGIC_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V1) -#define LOV_MAGIC_V3_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V3) -#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) +#define LOV_MAGIC_DEFINED 0x10000000 +#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1) +#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3) +#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) #define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) #define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) @@ -1081,6 +1173,7 @@ struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ #define XATTR_TRUSTED_PREFIX "trusted." #define XATTR_SECURITY_PREFIX "security." +#define XATTR_NAME_SOM "trusted.som" #define XATTR_NAME_LOV "trusted.lov" #define XATTR_NAME_LMA "trusted.lma" #define XATTR_NAME_LMV "trusted.lmv" @@ -1122,7 +1215,7 @@ static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) } static inline __u32 -lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) +lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) { switch (lmm_magic) { case LOV_MAGIC_V1: { @@ -1158,20 +1251,21 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLUID (0x00000200ULL) /* user ID */ #define OBD_MD_FLGID (0x00000400ULL) /* group ID */ #define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */ #define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ -#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ -/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */ +#define OBD_MD_FLPARENT (0x00004000ULL) /* parent FID */ +#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* OST object layout version */ #define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ #define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ #define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ #define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ #define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ -#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ -/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +/* OBD_MD_FLQOS (0x00200000ULL) has never been used */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ #define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ #define OBD_MD_FLGROUP (0x01000000ULL) /* group */ #define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ -#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ +/* OBD_MD_FLEPOCH (0x04000000ULL) obsolete 2.7.50 */ /* ->mds if epoch opens or closes */ #define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ #define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ @@ -1180,7 +1274,7 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ #define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ -#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +/* OBD_MD_REINT (0x0000000200000000ULL) obsolete 1.8 */ #define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ #define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ @@ -1188,10 +1282,10 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list 
*/ #define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ #define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ -/* OBD_MD_FLRMTPERM (0x0000010000000000ULL) remote perm, obsolete */ -#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ -#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ -#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLAGSTATFS (0x0000010000000000ULL) /* aggregated statfs */ +/* OBD_MD_FLMDSCAPA (0x0000020000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLOSSCAPA (0x0000040000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLCKSPLIT (0x0000080000000000ULL) obsolete 2.3.58*/ #define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ #define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes * under lock; for xattr @@ -1206,6 +1300,10 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ #define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ #define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ +#define OBD_MD_SECCTX (0x0200000000000000ULL) /* embed security xattr */ + +#define OBD_MD_FLLAZYSIZE (0x0400000000000000ULL) /* Lazy size */ +#define OBD_MD_FLLAZYBLOCKS (0x0800000000000000ULL) /* Lazy blocks */ #define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ OBD_MD_FLGRPQUOTA | \ @@ -1215,7 +1313,7 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ - OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLPARENT | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ OBD_MD_FLPROJID) #define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) @@ -1241,6 +1339,9 @@ struct hsm_state_set { #define OBD_BRW_READ 0x01 #define OBD_BRW_WRITE 0x02 #define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for + * this page. Non-delay RPCs have bit + * rq_no_delay set. */ #define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous * transfer and is not accounted in * the grant. 
*/ @@ -1495,11 +1596,11 @@ struct lquota_lvb { #define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ /* op codes */ -typedef enum { +enum quota_cmd { QUOTA_DQACQ = 601, QUOTA_DQREL = 602, QUOTA_LAST_OPC -} quota_cmd_t; +}; #define QUOTA_FIRST_OPC QUOTA_DQACQ /* @@ -1507,7 +1608,7 @@ typedef enum { */ /* opcodes */ -typedef enum { +enum mds_cmd { MDS_GETATTR = 33, MDS_GETATTR_NAME = 34, MDS_CLOSE = 35, @@ -1537,17 +1638,18 @@ typedef enum { MDS_HSM_CT_REGISTER = 59, MDS_HSM_CT_UNREGISTER = 60, MDS_SWAP_LAYOUTS = 61, + MDS_RMFID = 62, MDS_LAST_OPC -} mds_cmd_t; +}; #define MDS_FIRST_OPC MDS_GETATTR /* opcodes for object update */ -typedef enum { +enum update_cmd { OUT_UPDATE = 1000, OUT_UPDATE_LAST_OPC -} update_cmd_t; +}; #define OUT_UPDATE_FIRST_OPC OUT_UPDATE @@ -1555,7 +1657,7 @@ typedef enum { * Do not exceed 63 */ -typedef enum { +enum mds_reint_op { REINT_SETATTR = 1, REINT_CREATE = 2, REINT_LINK = 3, @@ -1565,8 +1667,9 @@ typedef enum { REINT_SETXATTR = 7, REINT_RMENTRY = 8, REINT_MIGRATE = 9, - REINT_MAX -} mds_reint_t, mdt_reint_t; + REINT_RESYNC = 10, + REINT_MAX +}; /* the disposition of the intent outlines what was executed */ #define DISP_IT_EXECD 0x00000001 @@ -1584,28 +1687,33 @@ typedef enum { #define DISP_OPEN_DENY 0x10000000 /* INODE LOCK PARTS */ -#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also - * was used to protect permission (mode, - * owner, group etc) before 2.4. */ -#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ -#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ -#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ - -/* The PERM bit is added int 2.4, and it is used to protect permission(mode, - * owner, group, acl etc), so to separate the permission from LOOKUP lock. - * Because for remote directories(in DNE), these locks will be granted by - * different MDTs(different ldlm namespace). - * - * For local directory, MDT will always grant UPDATE_LOCK|PERM_LOCK together. - * For Remote directory, the master MDT, where the remote directory is, will - * grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where the name entry is, - * will grant LOOKUP_LOCK. */ -#define MDS_INODELOCK_PERM 0x000010 -#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */ - -#define MDS_INODELOCK_MAXSHIFT 5 +enum mds_ibits_locks { + MDS_INODELOCK_LOOKUP = 0x000001, /* For namespace, dentry etc. Was + * used to protect permission (mode, + * owner, group, etc) before 2.4. */ + MDS_INODELOCK_UPDATE = 0x000002, /* size, links, timestamps */ + MDS_INODELOCK_OPEN = 0x000004, /* For opened files */ + MDS_INODELOCK_LAYOUT = 0x000008, /* for layout */ + + /* The PERM bit is added in 2.4, and is used to protect permission + * (mode, owner, group, ACL, etc.) separate from LOOKUP lock. + * For remote directories (in DNE) these locks will be granted by + * different MDTs (different LDLM namespace). + * + * For local directory, the MDT always grants UPDATE|PERM together. + * For remote directory, master MDT (where remote directory is) grants + * UPDATE|PERM, and remote MDT (where name entry is) grants LOOKUP_LOCK. 
+ */ + MDS_INODELOCK_PERM = 0x000010, + MDS_INODELOCK_XATTR = 0x000020, /* non-permission extended attrs */ + MDS_INODELOCK_DOM = 0x000040, /* Data for Data-on-MDT files */ + /* Do not forget to increase MDS_INODELOCK_NUMBITS when adding bits */ +}; +#define MDS_INODELOCK_NUMBITS 7 /* This FULL lock is useful to take on unlink sort of operations */ -#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) +#define MDS_INODELOCK_FULL ((1 << MDS_INODELOCK_NUMBITS) - 1) +/* DOM lock shouldn't be canceled early, use this macro for ELC */ +#define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM) /* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], * but was moved into name[1] along with the OID to avoid consuming the @@ -1625,17 +1733,17 @@ enum { enum { /* these should be identical to their EXT4_*_FL counterparts, they are * redefined here only to avoid dragging in fs/ext4/ext4.h */ - LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ - LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ - LUSTRE_APPEND_FL = 0x00000020, /* writes to file may only append */ - LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ - LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ - LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ - LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ - LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ - LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ - LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ - LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ + LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ + LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ + LUSTRE_APPEND_FL = 0x00000020, /* file writes may only append */ + LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ + LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ + LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ + LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ + LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ + LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ + LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ + LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ /* These flags will not be identical to any EXT4_*_FL counterparts, * and only reserved for lustre purpose. Note: these flags might @@ -1644,45 +1752,26 @@ enum { * wired by la_flags see osd_attr_get(). * 2. If these flags needs to be stored into inode, they will be * stored in LMA. see LMAI_XXXX */ - LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ - LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, + LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, }; -#ifndef FS_XFLAG_PROJINHERIT -#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#ifndef FS_XFLAG_SYNC +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ #endif - -#ifdef __KERNEL__ -/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values - * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire - * protocol equivalents of LDISKFS_*_FL values stored on disk, while - * the S_* flags are kernel-internal values that change between kernel - * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. - * See b=16526 for a full history. */ -static inline int ll_ext_to_inode_flags(int flags) -{ - return (((flags & LUSTRE_SYNC_FL) ? 
S_SYNC : 0) | - ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | - ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | -#if defined(S_DIRSYNC) - ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#ifndef FS_XFLAG_NOATIME +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ #endif - ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0) | - ((flags & LUSTRE_PROJINHERIT_FL) ? FS_XFLAG_PROJINHERIT : 0)); -} - -static inline int ll_inode_to_ext_flags(int iflags) -{ - return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | - ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | - ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | -#if defined(S_DIRSYNC) - ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#ifndef FS_XFLAG_IMMUTABLE +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ #endif - ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0) | - ((iflags & FS_XFLAG_PROJINHERIT) ? LUSTRE_PROJINHERIT_FL : 0)); -} +#ifndef FS_XFLAG_APPEND +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#endif +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ #endif /* 64 possible states */ @@ -1693,14 +1782,14 @@ enum md_transient_state { struct mdt_body { struct lu_fid mbo_fid1; struct lu_fid mbo_fid2; - struct lustre_handle mbo_handle; + struct lustre_handle mbo_open_handle; __u64 mbo_valid; __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ __s64 mbo_mtime; __s64 mbo_atime; __s64 mbo_ctime; __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ - __u64 mbo_ioepoch; + __u64 mbo_version; /* was mbo_ioepoch before 2.11 */ __u64 mbo_t_state; /* transient file state defined in * enum md_transient_state * was "ino" until 2.4.0 */ @@ -1713,7 +1802,7 @@ struct mdt_body { __u32 mbo_flags; /* LUSTRE_*_FL file attributes */ __u32 mbo_rdev; __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 mbo_unused2; /* was "generation" until 2.4.0 */ + __u32 mbo_layout_gen; /* was "generation" until 2.4.0 */ __u32 mbo_suppgid; __u32 mbo_eadatasize; __u32 mbo_aclsize; @@ -1722,15 +1811,15 @@ struct mdt_body { __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ __u32 mbo_projid; - __u64 mbo_padding_6; /* also fix lustre_swab_mdt_body */ - __u64 mbo_padding_7; - __u64 mbo_padding_8; + __u64 mbo_dom_size; /* size of DOM component */ + __u64 mbo_dom_blocks; /* blocks consumed by DOM component */ + __u64 mbo_padding_8; /* also fix lustre_swab_mdt_body */ __u64 mbo_padding_9; __u64 mbo_padding_10; }; /* 216 */ struct mdt_ioepoch { - struct lustre_handle mio_handle; + struct lustre_handle mio_open_handle; __u64 mio_unused1; /* was ioepoch */ __u32 mio_unused2; /* was flags */ __u32 mio_padding; @@ -1794,103 +1883,72 @@ struct mdt_rec_setattr { #define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ #define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ #define MDS_ATTR_PROJID 0x10000ULL /* = 65536 */ - -#ifndef FMODE_READ -#define FMODE_READ 00000001 -#define FMODE_WRITE 00000002 -#endif - -#define MDS_FMODE_CLOSED 00000000 -#define MDS_FMODE_EXEC 00000004 -/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ -/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ -/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ - -#define MDS_OPEN_CREATED 00000010 -#define MDS_OPEN_CROSS 00000020 - -#define MDS_OPEN_CREAT 00000100 -#define MDS_OPEN_EXCL 00000200 -#define MDS_OPEN_TRUNC 00001000 -#define MDS_OPEN_APPEND 00002000 -#define MDS_OPEN_SYNC 00010000 -#define 
MDS_OPEN_DIRECTORY 00200000 - -#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ -#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ -#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ -#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. - * We do not support JOIN FILE - * anymore, reserve this flags - * just for preventing such bit - * to be reused. */ - -#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ -#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ -#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ -#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ -#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or - * hsm restore) */ -#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created - unlinked */ -#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease - * delegation, succeed if it's not - * being opened with conflict mode. - */ -#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ - -/* lustre internal open flags, which should not be set from user space */ -#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ - MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ - MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ - MDS_OPEN_RELEASE) +#define MDS_ATTR_LSIZE 0x20000ULL /* = 131072 */ +#define MDS_ATTR_LBLOCKS 0x40000ULL /* = 262144 */ +#define MDS_ATTR_OVERRIDE 0x2000000ULL /* = 33554432 */ enum mds_op_bias { - MDS_CHECK_SPLIT = 1 << 0, +/* MDS_CHECK_SPLIT = 1 << 0, obsolete before 2.3.58 */ + /* used for remote object getattr/open by name: in the original + * getattr/open request, MDT found the object against name is on another + * MDT, then packed FID and LOOKUP lock in reply and returned -EREMOTE, + * and client knew it's a remote object, then set this flag in + * getattr/open request and sent to the corresponding MDT to finish + * getattr/open, which fetched attributes and UPDATE lock/opened file. 
+ */ MDS_CROSS_REF = 1 << 1, - MDS_VTX_BYPASS = 1 << 2, +/* MDS_VTX_BYPASS = 1 << 2, obsolete since 2.3.54 */ MDS_PERM_BYPASS = 1 << 3, /* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ MDS_QUOTA_IGNORE = 1 << 5, - /* Was MDS_CLOSE_CLEANUP (1 << 6), No more used */ +/* MDS_CLOSE_CLEANUP = 1 << 6, obsolete since 2.3.51 */ MDS_KEEP_ORPHAN = 1 << 7, MDS_RECOV_OPEN = 1 << 8, MDS_DATA_MODIFIED = 1 << 9, MDS_CREATE_VOLATILE = 1 << 10, MDS_OWNEROVERRIDE = 1 << 11, MDS_HSM_RELEASE = 1 << 12, - MDS_RENAME_MIGRATE = 1 << 13, + MDS_CLOSE_MIGRATE = 1 << 13, MDS_CLOSE_LAYOUT_SWAP = 1 << 14, + MDS_CLOSE_LAYOUT_MERGE = 1 << 15, + MDS_CLOSE_RESYNC_DONE = 1 << 16, + MDS_CLOSE_LAYOUT_SPLIT = 1 << 17, + MDS_TRUNC_KEEP_LEASE = 1 << 18, + MDS_CLOSE_UPDATE_TIMES = 1 << 20, }; +#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ + MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_LAYOUT_SPLIT | \ + MDS_CLOSE_RESYNC_DONE) + /* instance of mdt_reint_rec */ struct mdt_rec_create { - __u32 cr_opcode; - __u32 cr_cap; - __u32 cr_fsuid; - __u32 cr_fsuid_h; - __u32 cr_fsgid; - __u32 cr_fsgid_h; - __u32 cr_suppgid1; - __u32 cr_suppgid1_h; - __u32 cr_suppgid2; - __u32 cr_suppgid2_h; - struct lu_fid cr_fid1; - struct lu_fid cr_fid2; - struct lustre_handle cr_old_handle; /* handle in case of open replay */ + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_open_handle_old; /* in case of open replay */ __s64 cr_time; - __u64 cr_rdev; - __u64 cr_ioepoch; - __u64 cr_padding_1; /* rr_blocks */ - __u32 cr_mode; - __u32 cr_bias; - /* use of helpers set/get_mrc_cr_flags() is needed to access - * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to - * extend cr_flags size without breaking 1.8 compat */ - __u32 cr_flags_l; /* for use with open, low 32 bits */ - __u32 cr_flags_h; /* for use with open, high 32 bits */ - __u32 cr_umask; /* umask for create */ - __u32 cr_padding_4; /* rr_padding_4 */ + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ }; /* instance of mdt_reint_rec */ @@ -2003,6 +2061,35 @@ struct mdt_rec_setxattr { __u32 sx_padding_11; /* rr_padding_4 */ }; +/* instance of mdt_reint_rec + * FLR: for file resync MDS_REINT_RESYNC RPC. 
*/ +struct mdt_rec_resync { + __u32 rs_opcode; + __u32 rs_cap; + __u32 rs_fsuid; + __u32 rs_fsuid_h; + __u32 rs_fsgid; + __u32 rs_fsgid_h; + __u32 rs_suppgid1; + __u32 rs_suppgid1_h; + __u32 rs_suppgid2; + __u32 rs_suppgid2_h; + struct lu_fid rs_fid; + __u8 rs_padding0[sizeof(struct lu_fid)]; + struct lustre_handle rs_lease_handle; /* rr_mtime */ + __s64 rs_padding1; /* rr_atime */ + __s64 rs_padding2; /* rr_ctime */ + __u64 rs_padding3; /* rr_size */ + __u64 rs_padding4; /* rr_blocks */ + __u32 rs_bias; + __u32 rs_padding5; /* rr_mode */ + __u32 rs_padding6; /* rr_flags */ + __u32 rs_padding7; /* rr_flags_h */ + __u32 rs_padding8; /* rr_umask */ + __u16 rs_mirror_id; + __u16 rs_padding9; /* rr_padding_4 */ +}; + /* * mdt_rec_reint is the template for all mdt_reint_xxx structures. * Do NOT change the size of various members, otherwise the value @@ -2034,7 +2121,8 @@ struct mdt_rec_reint { __u32 rr_flags; __u32 rr_flags_h; __u32 rr_umask; - __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ + __u16 rr_mirror_id; + __u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ }; /* lmv structures */ @@ -2065,9 +2153,16 @@ struct lmv_mds_md_v1 { * used for now. Higher 16 bits will * be used to mark the object status, * for example migrating or dead. */ - __u32 lmv_layout_version; /* Used for directory restriping */ - __u32 lmv_padding1; - __u64 lmv_padding2; + __u32 lmv_layout_version; /* increased each time layout changed, + * by directory migration, restripe + * and LFSCK. */ + __u32 lmv_migrate_offset; /* once this is set, it means this + * directory is been migrated, stripes + * before this offset belong to target, + * from this to source. */ + __u32 lmv_migrate_hash; /* hash type of source stripes of + * migrating directory */ + __u32 lmv_padding2; __u64 lmv_padding3; char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ @@ -2087,7 +2182,7 @@ struct lmv_mds_md_v1 { #define LMV_HASH_FLAG_MIGRATION 0x80000000 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 53, 0) +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 53, 0) /* Since lustre 2.8, this flag will not be needed, instead this DEAD * and orphan flags will be stored in LMA (see LMAI_ORPHAN) * Keep this flag just for LFSCK, because it still might meet such @@ -2115,11 +2210,11 @@ struct lmv_mds_md_v1 { **/ #define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL #define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL -static inline __u64 lustre_hash_fnv_1a_64(const void *buf, size_t size) +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, __kernel_size_t size) { __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; const unsigned char *p = buf; - size_t i; + __kernel_size_t i; for (i = 0; i < size; i++) { hash ^= p[i]; @@ -2135,18 +2230,22 @@ union lmv_mds_md { struct lmv_user_md lmv_user_md; }; -static inline int lmv_mds_md_size(int stripe_count, unsigned int lmm_magic) +static inline __kernel_ssize_t lmv_mds_md_size(int stripe_count, + unsigned int lmm_magic) { + __kernel_ssize_t len = -EINVAL; + switch (lmm_magic) { - case LMV_MAGIC_V1:{ + case LMV_MAGIC_V1: { struct lmv_mds_md_v1 *lmm1; - return sizeof(*lmm1) + stripe_count * - sizeof(lmm1->lmv_stripe_fids[0]); - } + len = sizeof(*lmm1); + len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); + break; } default: - return -EINVAL; + break; } + return len; } static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) @@ -2198,12 +2297,12 @@ enum fld_op { }; /* LFSCK opcodes */ -typedef enum { +enum 
lfsck_cmd { LFSCK_NOTIFY = 1101, LFSCK_QUERY = 1102, LFSCK_LAST_OPC, - LFSCK_FIRST_OPC = LFSCK_NOTIFY -} lfsck_cmd_t; + LFSCK_FIRST_OPC = LFSCK_NOTIFY +}; /* * LOV data structures @@ -2239,7 +2338,7 @@ struct lov_desc { * LDLM requests: */ /* opcodes -- MUST be distinct from OST/MDS opcodes */ -typedef enum { +enum ldlm_cmd { LDLM_ENQUEUE = 101, LDLM_CONVERT = 102, LDLM_CANCEL = 103, @@ -2248,7 +2347,7 @@ typedef enum { LDLM_GL_CALLBACK = 106, LDLM_SET_INFO = 107, LDLM_LAST_OPC -} ldlm_cmd_t; +}; #define LDLM_FIRST_OPC LDLM_ENQUEUE #define RES_NAME_SIZE 4 @@ -2263,7 +2362,7 @@ struct ldlm_res_id { (unsigned long long)(res)->lr_name.name[3] /* lock types */ -typedef enum ldlm_mode { +enum ldlm_mode { LCK_MINMODE = 0, LCK_EX = 1, LCK_PW = 2, @@ -2274,17 +2373,17 @@ typedef enum ldlm_mode { LCK_GROUP = 64, LCK_COS = 128, LCK_MAXMODE -} ldlm_mode_t; +}; #define LCK_MODE_NUM 8 -typedef enum ldlm_type { +enum ldlm_type { LDLM_PLAIN = 10, LDLM_EXTENT = 11, LDLM_FLOCK = 12, LDLM_IBITS = 13, LDLM_MAX_TYPE -} ldlm_type_t; +}; #define LDLM_MIN_TYPE LDLM_PLAIN @@ -2294,8 +2393,18 @@ struct ldlm_extent { __u64 gid; }; +static inline bool ldlm_extent_equal(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start == ex2->start && ex1->end == ex2->end; +} + struct ldlm_inodebits { - __u64 bits; + __u64 bits; + union { + __u64 try_bits; /* optional bits to try */ + __u64 cancel_bits; /* for lock convert */ + }; }; struct ldlm_flock_wire { @@ -2312,11 +2421,11 @@ struct ldlm_flock_wire { * this ever changes we will need to swab the union differently based * on the resource type. */ -typedef union ldlm_wire_policy_data { +union ldlm_wire_policy_data { struct ldlm_extent l_extent; struct ldlm_flock_wire l_flock; struct ldlm_inodebits l_inodebits; -} ldlm_wire_policy_data_t; +}; struct barrier_lvb { __u32 lvb_status; @@ -2338,19 +2447,21 @@ union ldlm_gl_desc { enum ldlm_intent_flags { IT_OPEN = 0x00000001, IT_CREAT = 0x00000002, - IT_OPEN_CREAT = 0x00000003, - IT_READDIR = 0x00000004, + IT_OPEN_CREAT = IT_OPEN | IT_CREAT, /* To allow case label. */ + IT_READDIR = 0x00000004, /* Used by mdc, not put on the wire. */ IT_GETATTR = 0x00000008, IT_LOOKUP = 0x00000010, - IT_UNLINK = 0x00000020, - IT_TRUNC = 0x00000040, +/* IT_UNLINK = 0x00000020, Obsolete. */ +/* IT_TRUNC = 0x00000040, Obsolete. */ IT_GETXATTR = 0x00000080, - IT_EXEC = 0x00000100, - IT_PIN = 0x00000200, +/* IT_EXEC = 0x00000100, Obsolete. */ +/* IT_PIN = 0x00000200, Obsolete. */ IT_LAYOUT = 0x00000400, IT_QUOTA_DQACQ = 0x00000800, IT_QUOTA_CONN = 0x00001000, - IT_SETXATTR = 0x00002000, +/* IT_SETXATTR = 0x00002000, Obsolete. */ + IT_GLIMPSE = 0x00004000, + IT_BRW = 0x00008000, }; struct ldlm_intent { @@ -2374,10 +2485,10 @@ struct ldlm_lock_desc { #define LDLM_ENQUEUE_CANCEL_OFF 1 struct ldlm_request { - __u32 lock_flags; - __u32 lock_count; - struct ldlm_lock_desc lock_desc; - struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; + __u32 lock_flags; /* LDLM_FL_*, see lustre_dlm_flags.h */ + __u32 lock_count; /* number of locks in lock_handle[] */ + struct ldlm_lock_desc lock_desc;/* lock descriptor */ + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; }; struct ldlm_reply { @@ -2395,17 +2506,17 @@ struct ldlm_reply { /* * Opcodes for mountconf (mgs and mgc) */ -typedef enum { - MGS_CONNECT = 250, - MGS_DISCONNECT, - MGS_EXCEPTION, /* node died, etc. 
*/ - MGS_TARGET_REG, /* whenever target starts up */ - MGS_TARGET_DEL, - MGS_SET_INFO, - MGS_CONFIG_READ, - MGS_LAST_OPC -} mgs_cmd_t; -#define MGS_FIRST_OPC MGS_CONNECT +enum mgs_cmd { + MGS_CONNECT = 250, + MGS_DISCONNECT = 251, + MGS_EXCEPTION = 252, /* node died, etc. */ + MGS_TARGET_REG = 253, /* whenever target starts up */ + MGS_TARGET_DEL = 254, + MGS_SET_INFO = 255, + MGS_CONFIG_READ = 256, + MGS_LAST_OPC, + MGS_FIRST_OPC = MGS_CONNECT +}; #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) #define MGS_PARAM_MAXLEN 1024 @@ -2421,17 +2532,17 @@ struct mgs_send_param { #define MTI_PARAM_MAXLEN 4096 #define MTI_NIDS_MAX 32 struct mgs_target_info { - __u32 mti_lustre_ver; - __u32 mti_stripe_index; - __u32 mti_config_ver; - __u32 mti_flags; - __u32 mti_nid_count; - __u32 mti_instance; /* Running instance of target */ - char mti_fsname[MTI_NAME_MAXLEN]; - char mti_svname[MTI_NAME_MAXLEN]; - char mti_uuid[sizeof(struct obd_uuid)]; - __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ - char mti_params[MTI_PARAM_MAXLEN]; + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; /* LDD_F_* */ + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t) */ + char mti_params[MTI_PARAM_MAXLEN]; }; struct mgs_nidtbl_entry { @@ -2497,15 +2608,14 @@ struct cfg_marker { /* * Opcodes for multiple servers. */ - -typedef enum { - OBD_PING = 400, - OBD_LOG_CANCEL, - OBD_QC_CALLBACK, /* not used since 2.4 */ - OBD_IDX_READ, - OBD_LAST_OPC -} obd_cmd_t; -#define OBD_FIRST_OPC OBD_PING +enum obd_cmd { + OBD_PING = 400, +/* OBD_LOG_CANCEL = 401, obsolete since 1.5 */ +/* OBD_QC_CALLBACK = 402, obsolete since 2.4 */ + OBD_IDX_READ = 403, + OBD_LAST_OPC, + OBD_FIRST_OPC = OBD_PING +}; /** * llog contexts indices. @@ -2554,7 +2664,7 @@ struct llog_catid { #define LLOG_OP_MAGIC 0x10600000 #define LLOG_OP_MASK 0xfff00000 -typedef enum { +enum llog_op_type { LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ @@ -2571,11 +2681,12 @@ typedef enum { /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + CHANGELOG_USER_REC2 = LLOG_OP_MAGIC | 0x70002, HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, -} llog_op_type; +}; #define LLOG_REC_HDR_NEEDS_SWABBING(r) \ (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) @@ -2589,12 +2700,12 @@ struct llog_rec_hdr { __u32 lrh_index; __u32 lrh_type; __u32 lrh_id; -}; +} __attribute__((packed)); struct llog_rec_tail { __u32 lrt_len; __u32 lrt_index; -}; +} __attribute__((packed)); /* Where data follow just after header */ #define REC_DATA(ptr) \ @@ -2652,7 +2763,7 @@ struct llog_setattr64_rec_v2 { __u32 lsr_gid_h; __u64 lsr_valid; __u32 lsr_projid; - __u32 lsr_padding1; + __u32 lsr_layout_version; __u64 lsr_padding2; __u64 lsr_padding3; struct llog_rec_tail lsr_tail; @@ -2676,8 +2787,13 @@ struct llog_size_change_rec { #define CHANGELOG_ALLMASK 0XFFFFFFFF /** default \a changelog_rec_type mask. Allow all of them, except * CL_ATIME since it can really be time consuming, and not necessary - * under normal use. 
*/ -#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME)) + * under normal use. + * Remove also CL_OPEN, CL_GETXATTR and CL_DN_OPEN from default list as it can + * be costly and only necessary for audit purpose. + */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & \ + ~(1 << CL_ATIME | 1 << CL_OPEN | 1 << CL_GETXATTR | \ + 1 << CL_DN_OPEN)) /* changelog llog name, needed by client replicators */ #define CHANGELOG_CATALOG "changelog_catalog" @@ -2697,11 +2813,13 @@ struct llog_changelog_rec { #define CHANGELOG_USER_PREFIX "cl" struct llog_changelog_user_rec { - struct llog_rec_hdr cur_hdr; - __u32 cur_id; - __u32 cur_padding; - __u64 cur_endrec; - struct llog_rec_tail cur_tail; + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only intended to be used in relative time comparisons to + * detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; } __attribute__((packed)); enum agent_req_status { @@ -2737,7 +2855,7 @@ struct llog_agent_req_rec { * agent_req_status */ __u32 arr_archive_id; /**< backend archive number */ __u64 arr_flags; /**< req flags */ - __u64 arr_compound_id; /**< compound cookie */ + __u64 arr_compound_id; /**< compound cookie, ignored */ __u64 arr_req_create; /**< req. creation time */ __u64 arr_req_change; /**< req. status change time */ struct hsm_action_item arr_hai; /**< req. to the agent */ @@ -2766,12 +2884,25 @@ enum llog_flag { LLOG_F_IS_PLAIN = 0x4, LLOG_F_EXT_JOBID = 0x8, LLOG_F_IS_FIXSIZE = 0x10, + LLOG_F_EXT_EXTRA_FLAGS = 0x20, + LLOG_F_EXT_X_UIDGID = 0x40, + LLOG_F_EXT_X_NID = 0x80, + LLOG_F_EXT_X_OMODE = 0x100, + LLOG_F_EXT_X_XATTR = 0x200, + LLOG_F_RM_ON_ERR = 0x400, /* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here, * because the catlog record is usually fixed size, but its plain * log record can be variable */ - LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID, + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID | LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | LLOG_F_EXT_X_XATTR, +}; + +/* means first record of catalog */ +enum { + LLOG_CAT_FIRST = -1, }; /* On-disk header structure of each log object, stored in little endian order */ @@ -2817,9 +2948,13 @@ struct llog_log_hdr { llh->llh_hdr.lrh_len - \ sizeof(llh->llh_tail))) -/** log cookies are used to reference a specific log file and a record therein */ +/** log cookies are used to reference a specific log file and a record therein, + and pass record offset from llog_process_thread to llog_write */ struct llog_cookie { - struct llog_logid lgc_lgl; + union { + struct llog_logid lgc_lgl; + __u64 lgc_offset; + }; __u32 lgc_subsys; __u32 lgc_index; __u32 lgc_padding; @@ -2827,17 +2962,17 @@ struct llog_cookie { /** llog protocol */ enum llogd_rpc_ops { - LLOG_ORIGIN_HANDLE_CREATE = 501, - LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, - LLOG_ORIGIN_HANDLE_READ_HEADER = 503, - LLOG_ORIGIN_HANDLE_WRITE_REC = 504, - LLOG_ORIGIN_HANDLE_CLOSE = 505, - LLOG_ORIGIN_CONNECT = 506, - LLOG_CATINFO = 507, /* deprecated */ - LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, - LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ - LLOG_LAST_OPC, - LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, +/* LLOG_ORIGIN_HANDLE_WRITE_REC = 504, Obsolete by 2.1. */ +/* LLOG_ORIGIN_HANDLE_CLOSE = 505, Obsolete by 1.8. */ +/* LLOG_ORIGIN_CONNECT = 506, Obsolete by 2.4. 
*/ +/* LLOG_CATINFO = 507, Obsolete by 2.3. */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* Obsolete by 2.11. */ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE }; struct llogd_body { @@ -2891,7 +3026,7 @@ struct obdo { * * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */ struct ost_layout o_layout; - __u32 o_padding_3; + __u32 o_layout_version; __u32 o_uid_h; __u32 o_gid_h; @@ -3064,13 +3199,13 @@ union lu_page { }; /* security opcodes */ -typedef enum { +enum sec_cmd { SEC_CTX_INIT = 801, SEC_CTX_INIT_CONT = 802, SEC_CTX_FINI = 803, SEC_LAST_OPC, SEC_FIRST_OPC = SEC_CTX_INIT -} sec_cmd_t; +}; /* * capa related definitions @@ -3151,7 +3286,7 @@ struct link_ea_entry { unsigned char lee_reclen[2]; unsigned char lee_parent_fid[sizeof(struct lu_fid)]; char lee_name[0]; -}__attribute__((packed)); +} __attribute__((packed)); /** fid2path request/reply structure */ struct getinfo_fid2path { @@ -3173,7 +3308,7 @@ struct getparent { char gp_name[0]; /**< zero-terminated link name */ } __attribute__((packed)); -enum { +enum layout_intent_opc { LAYOUT_INTENT_ACCESS = 0, /** generic access */ LAYOUT_INTENT_READ = 1, /** not used */ LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */ @@ -3187,8 +3322,7 @@ enum { struct layout_intent { __u32 li_opc; /* intent operation for enqueue, read, write etc */ __u32 li_flags; - __u64 li_start; - __u64 li_end; + struct lu_extent li_extent; } __attribute__((packed)); /** @@ -3198,7 +3332,7 @@ struct layout_intent { */ struct hsm_progress_kernel { /* Field taken from struct hsm_progress */ - lustre_fid hpk_fid; + struct lu_fid hpk_fid; __u64 hpk_cookie; struct hsm_extent hpk_extent; __u16 hpk_flags; @@ -3263,6 +3397,7 @@ enum update_type { OUT_PUNCH = 14, OUT_READ = 15, OUT_NOOP = 16, + OUT_XATTR_LIST = 17, OUT_LAST }; @@ -3353,11 +3488,22 @@ struct mdc_swap_layouts { __u64 msl_flags; } __attribute__((packed)); +#define INLINE_RESYNC_ARRAY_SIZE 15 +struct close_data_resync_done { + __u32 resync_count; + __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE]; +}; + struct close_data { struct lustre_handle cd_handle; struct lu_fid cd_fid; __u64 cd_data_version; - __u64 cd_reserved[8]; + union { + __u64 cd_reserved[8]; + struct close_data_resync_done cd_resync; + /* split close */ + __u16 cd_mirror_id; + }; }; /* Update llog format */ @@ -3366,7 +3512,7 @@ struct update_op { __u16 uop_type; __u16 uop_param_count; __u16 uop_params_off[0]; -}; +} __attribute__((packed)); struct update_ops { struct update_op uops_op[0]; @@ -3417,6 +3563,19 @@ struct llog_update_record { */ }; +/* sepol string format is: + * <1-digit for SELinux status>::: + */ +/* Max length of the sepol string + * Should be large enough to contain a sha512sum of the policy + */ +#define SELINUX_MODE_LEN 1 +#define SELINUX_POLICY_VER_LEN 3 /* 3 chars to leave room for the future */ +#define SELINUX_POLICY_HASH_LEN 64 +#define LUSTRE_NODEMAP_SEPOL_LENGTH (SELINUX_MODE_LEN + NAME_MAX + \ + SELINUX_POLICY_VER_LEN + \ + SELINUX_POLICY_HASH_LEN + 3) + /* nodemap records, uses 32 byte record length */ #define LUSTRE_NODEMAP_NAME_LENGTH 16 struct nodemap_cluster_rec { @@ -3487,5 +3646,9 @@ struct ladvise_hdr { struct lu_ladvise lah_advise[0]; /* advices in this header */ }; +#if defined(__cplusplus) +} +#endif + #endif /** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h similarity index 93% rename from 
drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h index 9fddf2b1b9bd3..d0dc08bda5433 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ #ifndef _UAPI_LUSTRE_IOCTL_H #define _UAPI_LUSTRE_IOCTL_H @@ -31,20 +31,13 @@ #include #include #include -#include +#include -#ifndef __KERNEL__ -# define __user -#endif - -#if !defined(__KERNEL__) && !defined(LUSTRE_UTILS) -# error This file is for Lustre internal use only. -#endif - -/* Handle older distros */ -#ifndef __ALIGN_KERNEL -# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user #endif enum md_echo_cmd { @@ -64,7 +57,6 @@ enum md_echo_cmd { #define OBD_IOCTL_VERSION 0x00010004 #define OBD_DEV_BY_DEVNAME 0xffffd0de -#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER struct obd_ioctl_data { __u32 ioc_len; @@ -228,13 +220,14 @@ static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) #define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) #define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) -/* lustre/lustre_user.h 212-217 */ -#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) +/* lustre/lustre_user.h 211-220 */ +/* was #define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) until 2.11 */ #define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) #define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) #define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) #define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) #define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +#define OBD_IOC_CHLG_POLL _IOR('f', 233, long) /* lustre/lustre_user.h 240-249 */ /* LIBCFS_IOC_DEBUG_MASK 250 */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h similarity index 88% rename from drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h index e8119f5278c23..26819ff7995cf 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h @@ -73,17 +73,26 @@ enum kuc_generic_message_type { #define KUC_GRP_HSM 0x02 #define KUC_GRP_MAX KUC_GRP_HSM -#define LK_FLG_STOP 0x01 +enum lk_flags { + LK_FLG_STOP = 0x0001, + LK_FLG_DATANR = 0x0002, +}; #define LK_NOFD -1U -/* kernelcomm control structure, passed from userspace to kernel */ +/* kernelcomm control structure, passed from userspace to kernel. + * For compatibility with old copytools, users who pass ARCHIVE_IDs + * to kernel using lk_data_count and lk_data should fill lk_flags with + * LK_FLG_DATANR. Otherwise kernel will take lk_data_count as bitmap of + * ARCHIVE IDs. 
+ */ struct lustre_kernelcomm { __u32 lk_wfd; __u32 lk_rfd; __u32 lk_uid; __u32 lk_group; - __u32 lk_data; + __u32 lk_data_count; __u32 lk_flags; + __u32 lk_data[0]; } __attribute__((packed)); #endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..68c8d3a1009c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +#include +#include + +/** + * state machine: + * + * LS_INIT + * | + * (lfsck|start) + * | + * v + * LS_SCANNING_PHASE1 + * | ^ + * | : + * | (lfsck:restart) + * | : + * v : + * ----------------------------------------------------------------- + * | |^ |^ |^ |^ |^ + * | |: |: |: |: |: + * v v: v: v: v: v: + * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL + * (CO_) (CO_) (CO_) + * | ^ ^: ^: ^: ^: ^: + * | : |: |: |: |: |: + * | (lfsck:restart) |: |: |: |: |: + * v : |v |v |v |v |v + * ----------------------------------------------------------------- + * | + * v + * LS_COMPLETED + */ +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. The checked items during the phase1 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. The checked items during the phase2 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. */ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, + + /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ + LS_PARTIAL = 8, + + /* The LFSCK is failed because its controller is failed. */ + LS_CO_FAILED = 9, + + /* The LFSCK is stopped because its controller is stopped. 
*/ + LS_CO_STOPPED = 10, + + /* The LFSCK is paused because its controller is paused. */ + LS_CO_PAUSED = 11, + + LS_MAX +}; + +static inline const char *lfsck_status2name(int status) +{ + static const char * const lfsck_status_names[] = { + [LS_INIT] = "init", + [LS_SCANNING_PHASE1] = "scanning-phase1", + [LS_SCANNING_PHASE2] = "scanning-phase2", + [LS_COMPLETED] = "completed", + [LS_FAILED] = "failed", + [LS_STOPPED] = "stopped", + [LS_PAUSED] = "paused", + [LS_CRASHED] = "crashed", + [LS_PARTIAL] = "partial", + [LS_CO_FAILED] = "co-failed", + [LS_CO_STOPPED] = "co-stopped", + [LS_CO_PAUSED] = "co-paused" + }; + + if (status < 0 || status >= LS_MAX) + return "unknown"; + + return lfsck_status_names[status]; +} + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, + + /* LFSCK runs on all targets. */ + LPF_ALL_TGT = 0x0008, + + /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ + LPF_BROADCAST = 0x0010, + + /* Handle orphan OST-objects. */ + LPF_OST_ORPHAN = 0x0020, + + /* Create OST-object for dangling LOV EA. */ + LPF_CREATE_OSTOBJ = 0x0040, + + /* Create MDT-object for dangling name entry. */ + LPF_CREATE_MDTOBJ = 0x0080, + + /* Do not return until the LFSCK not running. */ + LPF_WAIT = 0x0100, + + /* Delay to create OST-object for dangling LOV EA. */ + LPF_DELAY_CREATE_OSTOBJ = 0x0200, +}; + +enum lfsck_type { + /* For MDT and OST internal OSD consistency check/repair. */ + LFSCK_TYPE_SCRUB = 0x0000, + + /* For MDT-OST (layout, object) consistency check/repair. */ + LFSCK_TYPE_LAYOUT = 0x0001, + + /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ + LFSCK_TYPE_NAMESPACE = 0x0004, + LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | + LFSCK_TYPE_NAMESPACE), + LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, + LFSCK_TYPES_ALL = ((__u16)(~0)) +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT +#define LFSCK_ASYNC_WIN_DEFAULT 1024 +#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) +#define LFSCK_TYPE_BITS 16 + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, + LSV_ASYNC_WINDOWS = 0x00000008, + LSV_CREATE_OSTOBJ = 0x00000010, + LSV_CREATE_MDTOBJ = 0x00000020, + LSV_DELAY_CREATE_OSTOBJ = 0x00000040, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* The windows size for async requests pipeline. */ + __u16 ls_async_windows; +}; + +struct lfsck_stop { + __u32 ls_status; + __u16 ls_flags; + __u16 ls_padding_1; /* For 64-bits aligned. 
*/ + __u64 ls_padding_2; +}; + +struct lfsck_query { + __u16 lu_types; + __u16 lu_flags; + __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u64 lu_repaired[LFSCK_TYPE_BITS]; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h similarity index 97% rename from drivers/staging/lustrefsx/lustre/include/lustre_log_user.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h index ee5f0f7385fa0..bcf46eb21e6c2 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h @@ -38,7 +38,8 @@ #ifndef _LUSTRE_LOG_USER_H #define _LUSTRE_LOG_USER_H -#include +#include +#include /* Lustre logs use FIDs constructed from oi_id and oi_seq directly, * without attempting to use the IGIF and IDIF ranges as is done diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h similarity index 95% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h index c0e662ae7b84f..90fa213f83e90 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Copyright 2015 Cray Inc, all rights reserved. * Author: Ben Evans. @@ -34,15 +34,9 @@ #ifndef _UAPI_LUSTRE_OSTID_H_ #define _UAPI_LUSTRE_OSTID_H_ -/* - * This is due to us being out of kernel and the way the OpenSFS branch - * handles CFLAGS. Upstream will just have linux/lustre_fid.h - */ -#ifdef __KERNEL__ -#include -#else -#include -#endif +#include +#include +#include static inline __u64 lmm_oi_id(const struct ost_id *oi) { diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h similarity index 100% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h new file mode 100644 index 0000000000000..b8d8bd71f19f9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h @@ -0,0 +1,2366 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include +#include +#include + +#ifdef __KERNEL__ +# define __USE_ISOC99 1 +# include +# include +# include /* snprintf() */ +# include +#else /* ! __KERNEL__ */ +# include +# include /* snprintf() */ +# include +# define NEED_QUOTA_DEFS +/* # include - this causes complaints about caddr_t */ +# include +# define FILEID_LUSTRE 0x97 /* for name_to_handle_at() (and llapi_fd2fid()) */ +#endif /* !__KERNEL__ */ + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#endif + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifdef __STRICT_ANSI__ +#define typeof __typeof__ +#endif + +/* + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. + */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA +#define PRJQUOTA 2 +#endif + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. + */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define fstat_f fstat64 +#define fstatat_f fstatat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#define fstat_f fstat +#define fstatat_f fstatat +#endif + +#ifndef STATX_BASIC_STATS +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. 
CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 __spare2[14]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. 
+ * + * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS + * semantically. Where possible, the numerical value is picked to correspond + * also. + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ + +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ + +#endif + +typedef struct statx lstatx_t; + +#define HAVE_LOV_USER_MDS_DATA + +#define LUSTRE_EOF 0xffffffffffffffffULL + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ + OS_STATE_UNUSED1 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_UNUSED2 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ + OS_STATE_SUM = 0x00000100, /**< aggregated for all tagrets */ + OS_STATE_NONROT = 0x00000200, /**< non-rotational device */ +}; + +/** filesystem statistics/attributes for target device */ +struct obd_statfs { + __u64 os_type; /* EXT4_SUPER_MAGIC, UBERBLOCK_MAGIC */ + __u64 os_blocks; /* total size in #os_bsize blocks */ + __u64 os_bfree; /* number of unused blocks */ + __u64 os_bavail; /* blocks available for allocation */ + __u64 os_files; /* total number of objects */ + __u64 os_ffree; /* # objects that could be created */ + __u8 os_fsid[40]; /* identifier for filesystem */ + __u32 os_bsize; /* block size in bytes for os_blocks */ + __u32 os_namelen; /* maximum length of filename in bytes*/ + __u64 os_maxbytes; /* maximum object size in bytes */ + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_granted; /* space granted for MDS */ + __u32 os_spare3; /* Unused padding fields. Remember */ + __u32 os_spare4; /* to fix lustre_swab_obd_statfs() */ + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. 
+ **/ + __u32 f_ver; +} __attribute__((packed)); + +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return fid->f_seq == 0 && fid->f_oid == 0; +} + +/* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ +#define f_stripe_idx f_ver + +struct ost_layout { + __u32 ol_stripe_size; + __u32 ol_stripe_count; + __u64 ol_comp_start; + __u64 ol_comp_end; + __u32 ol_comp_id; +} __attribute__((packed)); + +/* The filter_fid structure has changed several times over its lifetime. + * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and + * stripe_index and the "self FID" (objid/seq) to be able to recover the + * OST objects in case of corruption. With the move to 2.4 and OSD-API for + * the OST, the "trusted.lma" xattr was added to the OST objects to store + * the "self FID" to be consistent with the MDT on-disk format, and the + * filter_fid only stored the MDT inode parent FID and stripe index. + * + * In 2.10, the addition of PFL composite layouts required more information + * to be stored into the filter_fid in order to be able to identify which + * component the OST object belonged. As well, the stripe size may vary + * between components, so it was no longer safe to assume the stripe size + * or stripe_count of a file. This is also more robust for plain layouts. + * + * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not + * enough space to store both the filter_fid and LMA in the inode, so they + * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid + * an extra seek for every OST object access. + * + * In 2.11, FLR mirror layouts also need to store the layout version and + * range so that writes to old versions of the layout are not allowed. + * That ensures that mirrored objects are not modified by evicted clients, + * and ensures that the components are correctly marked stale on the MDT. + */ +struct filter_fid_18_23 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid_24_29 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ +}; + +struct filter_fid_210 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; + __u32 ff_layout_version; + __u32 ff_range; /* range of layout version that + * write are allowed */ +} __attribute__((packed)); + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +struct lu_fid; + +enum lma_compat { + LMAC_HSM = 0x00000001, +/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ + LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ + LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ + LMAC_IDX_BACKUP = 0x00000040, /* Has index backup. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. 
+ */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ + LMAI_STRIPED = 0x00000008, /* striped directory inode */ + LMAI_ORPHAN = 0x00000010, /* inode is orphan */ + LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ + LMAI_STRIPED | LMAI_ORPHAN) +}; + + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +struct lustre_ost_attrs { + /* Use lustre_mdt_attrs directly for now, need a common header + * structure if want to change lustre_mdt_attrs in future. */ + struct lustre_mdt_attrs loa_lma; + + /* Below five elements are for OST-object's PFID EA, the + * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) + * and the stripe_index (low 16 bits), the size should not exceed + * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag + * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size + * are valid; if the flag LMAC_COMP_INFO is set, then the next three + * loa_comp_* elements are valid. */ + struct lu_fid loa_parent_fid; + __u32 loa_stripe_size; + __u32 loa_comp_id; + __u64 loa_comp_start; + __u64 loa_comp_end; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +enum lustre_som_flags { + /* Unknow or no SoM data, must get size from OSTs. */ + SOM_FL_UNKNOWN = 0x0000, + /* Known strictly correct, FLR or DoM file (SoM guaranteed). */ + SOM_FL_STRICT = 0x0001, + /* Known stale - was right at some point in the past, but it is + * known (or likely) to be incorrect now (e.g. opened for write). */ + SOM_FL_STALE = 0x0002, + /* Approximate, may never have been strictly correct, + * need to sync SOM data to achieve eventual consistency. */ + SOM_FL_LAZY = 0x0004, +}; + +struct lustre_som_attrs { + __u16 lsa_valid; + __u16 lsa_reserved[3]; + __u64 lsa_size; + __u64 lsa_blocks; +}; + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +} __attribute__((packed)); + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) + +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; + +/* + * Maximum number of mirrors currently implemented. + */ +#define LUSTRE_MIRROR_COUNT_MAX 16 + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. 
*/ +enum ll_lease_mode { + LL_LEASE_RDLCK = 0x01, + LL_LEASE_WRLCK = 0x02, + LL_LEASE_UNLCK = 0x04, +}; + +enum ll_lease_flags { + LL_LEASE_RESYNC = 0x1, + LL_LEASE_RESYNC_DONE = 0x2, + LL_LEASE_LAYOUT_MERGE = 0x4, + LL_LEASE_LAYOUT_SPLIT = 0x8, +}; + +#define IOC_IDS_MAX 4096 +struct ll_ioc_lease { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u32 lil_ids[0]; +}; + +struct ll_ioc_lease_id { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u16 lil_mirror_id; + __u16 lil_padding1; + __u64 lil_padding2; + __u32 lil_ids[0]; +}; + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_RMFID _IOR('f', 242, struct fid_array) +#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease) +#define LL_IOC_SET_LEASE_OLD 
_IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 +#endif + + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#ifdef HAVE_LOV_USER_MDS_DATA +#define IOC_MDC_GETFILEINFO_OLD _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data_v1 *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data) +#define LL_IOC_MDC_GETINFO_OLD _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data_v1 *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data) +#endif + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. 
 */
+#define O_LOV_DELAY_CREATE_1_8	0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */
+#ifndef FASYNC
+#define FASYNC			00020000   /* fcntl, for BSD compatibility */
+#endif
+#define O_LOV_DELAY_CREATE_MASK	(O_NOCTTY | FASYNC)
+#define O_LOV_DELAY_CREATE	(O_LOV_DELAY_CREATE_1_8 | \
+				 O_LOV_DELAY_CREATE_MASK)
+
+#define LL_FILE_IGNORE_LOCK	0x00000001
+#define LL_FILE_GROUP_LOCKED	0x00000002
+#define LL_FILE_READAHEA	0x00000004
+#define LL_FILE_LOCKED_DIRECTIO	0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO	0x00000010 /* server-side locks with cio */
+
+#define LOV_USER_MAGIC_V1	0x0BD10BD0
+#define LOV_USER_MAGIC		LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1	0x0BD20BD0
+#define LOV_USER_MAGIC_V3	0x0BD30BD0
+/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */
+#define LOV_USER_MAGIC_SPECIFIC	0x0BD50BD0 /* for specific OSTs */
+#define LOV_USER_MAGIC_COMP_V1	0x0BD60BD0
+
+#define LMV_USER_MAGIC		0x0CD30CD0 /* default lmv magic */
+#define LMV_USER_MAGIC_V0	0x0CD20CD0 /* old default lmv magic */
+#define LMV_USER_MAGIC_SPECIFIC	0x0CD40CD0
+
+#define LOV_PATTERN_NONE	0x000
+#define LOV_PATTERN_RAID0	0x001
+#define LOV_PATTERN_RAID1	0x002
+#define LOV_PATTERN_MDT		0x100
+#define LOV_PATTERN_CMOBD	0x200
+
+#define LOV_PATTERN_F_MASK	0xffff0000
+#define LOV_PATTERN_F_HOLE	0x40000000 /* there is a hole in the LOV EA */
+#define LOV_PATTERN_F_RELEASED	0x80000000 /* HSM released file */
+#define LOV_PATTERN_DEFAULT	0xffffffff
+
+static inline bool lov_pattern_supported(__u32 pattern)
+{
+	return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 ||
+	       (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT;
+}
+
+#define LOV_MAXPOOLNAME 15
+#define LOV_POOLNAMEF "%.15s"
+
+#define LOV_MIN_STRIPE_BITS 16	/* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that an input of 4096 will result in 160,
+ * which in turn is equal to the old maximal stripe count.
+ * XXX: In fact this is too simplified for now; what it also needs is an
+ * ea_type argument to clearly know how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000  /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES       0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define XATTR_LUSTRE_PREFIX	"lustre."
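[Editorial aside, not part of the patch: the LOV_MAX_STRIPE_COUNT comment above packs its reasoning into a single formula. The minimal C sketch below simply replays that arithmetic; the helper name is hypothetical, and the 256-byte lov+rpc header and 24-byte sizeof(struct lov_ost_data_v1) are taken from the comment itself. A 4096-byte EA buffer yields 160 stripes (LOV_MAX_STRIPE_COUNT_OLD), and a 12-page buffer yields 2037, which the header rounds down to 2000.]

#include <stdio.h>

/* Hypothetical helper mirroring the formula quoted in the comment above:
 * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1). */
static unsigned int max_stripes_for_ea(unsigned int buf_bytes)
{
	const unsigned int header = 256;	/* approximate lov + rpc header */
	const unsigned int per_stripe = 24;	/* sizeof(struct lov_ost_data_v1) */

	return buf_bytes > header ? (buf_bytes - header) / per_stripe : 0;
}

int main(void)
{
	/* Prints "160 2037"; 2037 is rounded down to LOV_MAX_STRIPE_COUNT. */
	printf("%u %u\n", max_stripes_for_ea(4096),
	       max_stripes_for_ea(12 * 4096));
	return 0;
}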
+#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +/* Please update if XATTR_LUSTRE_LOV".set" groks more flags in the future */ +#define allowed_lustre_lov(att) (strcmp((att), XATTR_LUSTRE_LOV".add") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set.flags") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".del") == 0) + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lu_extent { + __u64 e_start; + __u64 e_end; +} __attribute__((packed)); + +#define DEXT "[%#llx, %#llx)" +#define PEXT(ext) (unsigned long long)(ext)->e_start, (unsigned long long)(ext)->e_end + +static inline bool lu_extent_is_overlapped(struct lu_extent *e1, + struct lu_extent *e2) +{ + return e1->e_start < e2->e_end && e2->e_start < e1->e_end; +} + +static inline bool lu_extent_is_whole(struct lu_extent *e) +{ + return e->e_start == 0 && e->e_end == LUSTRE_EOF; +} + +enum lov_comp_md_entry_flags { + LCME_FL_STALE = 0x00000001, /* FLR: stale data */ + LCME_FL_PREF_RD = 0x00000002, /* FLR: preferred for reading */ + LCME_FL_PREF_WR = 0x00000004, /* FLR: preferred for writing */ + LCME_FL_PREF_RW = LCME_FL_PREF_RD | LCME_FL_PREF_WR, + LCME_FL_OFFLINE = 0x00000008, /* Not used */ + LCME_FL_INIT = 0x00000010, /* instantiated */ + LCME_FL_NOSYNC = 0x00000020, /* FLR: no sync for the mirror */ + LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, + won't be stored on disk */ +}; + +#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT | LCME_FL_STALE | \ + LCME_FL_PREF_RW | LCME_FL_NOSYNC) +/* The flags can be set by users at mirror creation time. */ +#define LCME_USER_FLAGS (LCME_FL_PREF_RW) + +/* The flags are for mirrors */ +#define LCME_MIRROR_FLAGS (LCME_FL_NOSYNC) + +/* These flags have meaning when set in a default layout and will be inherited + * from the default/template layout set on a directory. 
+ */ +#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC) + +/* the highest bit in obdo::o_layout_version is used to mark if the file is + * being resynced. */ +#define LU_LAYOUT_RESYNC LCME_FL_NEG + +/* lcme_id can be specified as certain flags, and the the first + * bit of lcme_id is used to indicate that the ID is representing + * certain LCME_FL_* but not a real ID. Which implies we can have + * at most 31 flags (see LCME_FL_XXX). */ +enum lcme_id { + LCME_ID_INVAL = 0x0, + LCME_ID_MAX = 0x7FFFFFFF, + LCME_ID_ALL = 0xFFFFFFFF, + LCME_ID_NOT_ID = LCME_FL_NEG +}; + +#define LCME_ID_MASK LCME_ID_MAX + +struct lov_comp_md_entry_v1 { + __u32 lcme_id; /* unique id of component */ + __u32 lcme_flags; /* LCME_FL_XXX */ + struct lu_extent lcme_extent; /* file extent for component */ + __u32 lcme_offset; /* offset of component blob, + start from lov_comp_md_v1 */ + __u32 lcme_size; /* size of component blob */ + __u32 lcme_layout_gen; + __u64 lcme_timestamp; /* snapshot time if applicable*/ + __u32 lcme_padding_1; +} __attribute__((packed)); + +#define SEQ_ID_MAX 0x0000FFFF +#define SEQ_ID_MASK SEQ_ID_MAX +/* bit 30:16 of lcme_id is used to store mirror id */ +#define MIRROR_ID_MASK 0x7FFF0000 +#define MIRROR_ID_NEG 0x8000 +#define MIRROR_ID_SHIFT 16 + +static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid) +{ + return ((mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) | seqid; +} + +static inline __u16 mirror_id_of(__u32 id) +{ + return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT; +} + +/** + * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1. + */ +enum lov_comp_md_flags { + /* the least 2 bits are used by FLR to record file state */ + LCM_FL_NONE = 0, + LCM_FL_RDONLY = 1, + LCM_FL_WRITE_PENDING = 2, + LCM_FL_SYNC_PENDING = 3, + LCM_FL_FLR_MASK = 0x3, +}; + +struct lov_comp_md_v1 { + __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ + __u32 lcm_size; /* overall size including this struct */ + __u32 lcm_layout_gen; + __u16 lcm_flags; + __u16 lcm_entry_count; + /* lcm_mirror_count stores the number of actual mirrors minus 1, + * so that non-flr files will have value 0 meaning 1 mirror. */ + __u16 lcm_mirror_count; + __u16 lcm_padding1[3]; + __u64 lcm_padding2; + struct lov_comp_md_entry_v1 lcm_entries[0]; +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_USER_MAGIC_V1) + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . 
*/ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v2 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v2 { + struct lu_fid lmd_fid; /* Lustre FID */ + lstatx_t lmd_stx; /* MDS statx struct */ + __u64 lmd_flags; /* MDS stat flags */ + __u32 lmd_lmmsize; /* LOV EA size */ + __u32 lmd_padding; /* unused */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */ +} __attribute__((packed)); +#endif + +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +} __attribute__((packed, __may_alias__)); + +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, + LMV_HASH_TYPE_MAX, +}; + +#define LMV_HASH_NAME_ALL_CHARS "all_char" +#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" + +extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; + +/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_user_mds_data lum_objects[0]; +} __attribute__((packed)); + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + int size = sizeof(struct lmv_user_md); + + if (lmm_magic == LMV_USER_MAGIC_SPECIFIC) + size += stripes * sizeof(struct lmv_user_mds_data); + + return size; +} + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid == NULL) + return NULL; + + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +#define LUSTRE_MAXFSNAME 8 + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. 
 */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+	char *p;
+
+	strncpy(buf, uuid, buflen - 1);
+	buf[buflen - 1] = '\0';
+	p = strrchr(buf, '-');
+	if (p != NULL)
+		*p = '\0';
+}
+
+/* printf display format for Lustre FIDs
+ * usage: printf("file FID is "DFID"\n", PFID(fid)); */
+#define FID_NOBRACE_LEN 40
+#define FID_LEN (FID_NOBRACE_LEN + 2)
+#define DFID_NOBRACE "%#llx:0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver
+
+/* scanf input parse format for fids in DFID_NOBRACE format
+ * Need to strip '[' from DFID format first or use "["SFID"]" at caller.
+ * usage: sscanf(fidstr, SFID, RFID(&fid)); */
+#define SFID "0x%llx:0x%x:0x%x"
+#define RFID(fid) (unsigned long long *)&((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver)
+
+/********* Quotas **********/
+
+#define LUSTRE_QUOTABLOCK_BITS 10
+#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS)
+
+static inline __u64 lustre_stoqb(__kernel_size_t space)
+{
+	return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS;
+}
+
+#define Q_QUOTACHECK	0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA	0x800101 /* deprecated as of 2.4 */
+#define Q_GETOINFO	0x800102 /* get obd quota info */
+#define Q_GETOQUOTA	0x800103 /* get obd quotas */
+#define Q_FINVALIDATE	0x800104 /* deprecated as of 2.4 */
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON	0x800002 /* deprecated as of 2.4 */
+#define LUSTRE_Q_QUOTAOFF	0x800003 /* deprecated as of 2.4 */
+#define LUSTRE_Q_GETINFO	0x800005 /* get information about quota files */
+#define LUSTRE_Q_SETINFO	0x800006 /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA	0x800007 /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA	0x800008 /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE	0x80000b /* deprecated as of 2.4 */
+#define LUSTRE_Q_FINVALIDATE	0x80000c /* deprecated as of 2.4 */
+#define LUSTRE_Q_GETDEFAULT	0x80000d /* get default quota */
+#define LUSTRE_Q_SETDEFAULT	0x80000e /* set default quota */
+
+/* In the current Lustre implementation, the grace time is either the time
+ * or the timestamp to be used after some quota ID exceeds the soft limit.
+ * 48 bits should be enough, and its high 16 bits can be used as quota flags.
+ */
+#define LQUOTA_GRACE_BITS 48
+#define LQUOTA_GRACE_MASK ((1ULL << LQUOTA_GRACE_BITS) - 1)
+#define LQUOTA_GRACE_MAX LQUOTA_GRACE_MASK
+#define LQUOTA_GRACE(t) (t & LQUOTA_GRACE_MASK)
+#define LQUOTA_FLAG(t) (t >> LQUOTA_GRACE_BITS)
+#define LQUOTA_GRACE_FLAG(t, f) ((__u64)t | (__u64)f << LQUOTA_GRACE_BITS)
+
+/* different quota flags */
+
+/* The default quota flag: the corresponding quota ID will use the default
+ * quota setting, the hardlimit and softlimit of its quota record in the global
+ * quota file will be set to 0, the low 48 bits of the grace will be set to 0,
+ * and the high 16 bits will contain this flag (see the comment above).
+ * */ +#define LQUOTA_FLAG_DEFAULT 0x0001 + +#define ALLQUOTA 255 /* set all quota */ +static inline char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 +#define SEPOL_DOWNCALL_MAGIC 0x8b8bb842 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +struct sepol_downcall_data { + __u32 sdd_magic; + __s64 sdd_sepol_mtime; + __u16 sdd_sepol_len; + char sdd_sepol[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: ".^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +}; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + +/** Bit-mask of valid attributes */ +/* The LA_* flags are written to disk as part of the ChangeLog records + * so they are part of the on-disk and network protocol, and cannot be changed. + * Only the first 12 bits are currently saved. 
+ */ +enum la_valid { + LA_ATIME = 1 << 0, /* 0x00001 */ + LA_MTIME = 1 << 1, /* 0x00002 */ + LA_CTIME = 1 << 2, /* 0x00004 */ + LA_SIZE = 1 << 3, /* 0x00008 */ + LA_MODE = 1 << 4, /* 0x00010 */ + LA_UID = 1 << 5, /* 0x00020 */ + LA_GID = 1 << 6, /* 0x00040 */ + LA_BLOCKS = 1 << 7, /* 0x00080 */ + LA_TYPE = 1 << 8, /* 0x00100 */ + LA_FLAGS = 1 << 9, /* 0x00200 */ + LA_NLINK = 1 << 10, /* 0x00400 */ + LA_RDEV = 1 << 11, /* 0x00800 */ + LA_BLKSIZE = 1 << 12, /* 0x01000 */ + LA_KILL_SUID = 1 << 13, /* 0x02000 */ + LA_KILL_SGID = 1 << 14, /* 0x04000 */ + LA_PROJID = 1 << 15, /* 0x08000 */ + LA_LAYOUT_VERSION = 1 << 16, /* 0x10000 */ + LA_LSIZE = 1 << 17, /* 0x20000 */ + LA_LBLOCKS = 1 << 18, /* 0x40000 */ + /** + * Attributes must be transmitted to OST objects + */ + LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION) +}; + +#define MDS_FMODE_READ 00000001 +#define MDS_FMODE_WRITE 00000002 + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ +/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ +/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ + +#define MDS_OPEN_CREATED 00000010 +/* MDS_OPEN_CROSS 00000020 obsolete in 2.12, internal use only */ + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE | MDS_OPEN_RESYNC) + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_NONE = -1, + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_SETXATTR = 15, + CL_XATTR = CL_SETXATTR, /* Deprecated name */ + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_FLRW = 21, /* FLR: file was firstly written */ + CL_RESYNC = 22, /* FLR: file was resync-ed */ + CL_GETXATTR = 23, + CL_DN_OPEN = 24, /* denied open */ + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT", + "FLRW", "RESYNC","GXATR", "NOPEN", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* 12 bits of per-record data can be stored in the bottom of the flags */ +#define CLF_FLAGSHIFT 12 +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_EXTRA_FLAGS = 0x8000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID | + CLF_EXTRA_FLAGS, + CLF_FLAGMASK = (1U << CLF_FLAGSHIFT) - 1, + CLF_VERMASK = ~CLF_FLAGMASK, +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, + CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(enum changelog_rec_flags *clf_flags, + enum hsm_event he) +{ + *clf_flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(enum changelog_rec_flags *clf_flags, + unsigned int bits) +{ + *clf_flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(enum changelog_rec_flags *clf_flags, + unsigned int error) +{ + *clf_flags |= (error << CLF_HSM_ERR_L); +} + +enum changelog_rec_extra_flags { + CLFE_INVALID = 0, + CLFE_UIDGID = 0x0001, + CLFE_NID = 0x0002, + CLFE_OPEN = 0x0004, + CLFE_XATTR = 0x0008, + CLFE_SUPPORTED = CLFE_UIDGID | CLFE_NID | CLFE_OPEN | CLFE_XATTR +}; + +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = 0x01, + /* Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. */ + CHANGELOG_FLAG_BLOCK = 0x02, + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = 0x04, + /* Pack additional flag bits into the changelog record */ + CHANGELOG_FLAG_EXTRA_FLAGS = 0x08, +}; + +enum changelog_send_extra_flag { + /* Pack uid/gid into the changelog record */ + CHANGELOG_EXTRA_FLAG_UIDGID = 0x01, + /* Pack nid into the changelog record */ + CHANGELOG_EXTRA_FLAG_NID = 0x02, + /* Pack open mode into the changelog record */ + CHANGELOG_EXTRA_FLAG_OMODE = 0x04, + /* Pack xattr name into the changelog record */ + CHANGELOG_EXTRA_FLAG_XATTR = 0x08, +}; + +#define CR_MAXSIZE __ALIGN_KERNEL(2 * NAME_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED, \ + CLFE_SUPPORTED), 8) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 + +/* This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags and cr_extra_flags. + * + * Extensions are packed in the same order as their corresponding flags, + * then in the same order as their corresponding extra flags. + */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + struct lu_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + struct lu_fid cr_pfid; /**< parent fid */ +} __attribute__ ((packed)); + +/* Changelog extension for RENAME. 
*/ +struct changelog_ext_rename { + struct lu_fid cr_sfid; /**< source fid, or zero */ + struct lu_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ +}; + +/* Changelog extension to include additional flags. */ +struct changelog_ext_extra_flags { + __u64 cr_extra_flags; /* Additional CLFE_* flags */ +}; + +/* Changelog extra extension to include UID/GID. */ +struct changelog_ext_uidgid { + __u64 cr_uid; + __u64 cr_gid; +}; + +/* Changelog extra extension to include NID. */ +struct changelog_ext_nid { + /* have __u64 instead of lnet_nid_t type for use by client api */ + __u64 cr_nid; + /* for use when IPv6 support is added */ + __u64 extra; + __u32 padding; +}; + +/* Changelog extra extension to include low 32 bits of MDS_OPEN_* flags. */ +struct changelog_ext_openmode { + __u32 cr_openflags; +}; + +/* Changelog extra extension to include xattr */ +struct changelog_ext_xattr { + char cr_xattr[XATTR_NAME_MAX + 1]; /**< zero-terminated string. */ +}; + +static inline struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec); + +static inline __kernel_size_t changelog_rec_offset(enum changelog_rec_flags crf, + enum changelog_rec_extra_flags cref) +{ + __kernel_size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + if (crf & CLF_EXTRA_FLAGS) { + size += sizeof(struct changelog_ext_extra_flags); + if (cref & CLFE_UIDGID) + size += sizeof(struct changelog_ext_uidgid); + if (cref & CLFE_NID) + size += sizeof(struct changelog_ext_nid); + if (cref & CLFE_OPEN) + size += sizeof(struct changelog_ext_openmode); + if (cref & CLFE_XATTR) + size += sizeof(struct changelog_ext_xattr); + } + + return size; +} + +static inline __kernel_size_t changelog_rec_size(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + + return changelog_rec_offset(rec->cr_flags, cref); +} + +static inline __kernel_size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME); + + return (struct changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The additional flags follow the rename and jobid extensions, if present */ +static inline +struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID); + + return (struct changelog_ext_extra_flags *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The uid/gid is the first extra extension */ +static inline +struct changelog_ext_uidgid *changelog_rec_uidgid( + const struct 
changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + + return (struct changelog_ext_uidgid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The nid is the second extra extension */ +static inline +struct changelog_ext_nid *changelog_rec_nid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_UIDGID; + + return (struct changelog_ext_nid *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The OPEN mode is the third extra extension */ +static inline +struct changelog_ext_openmode *changelog_rec_openmode( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID); + + return (struct changelog_ext_openmode *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The xattr name is the fourth extra extension */ +static inline +struct changelog_ext_xattr *changelog_rec_xattr( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID | CLFE_OPEN); + + return (struct changelog_ext_xattr *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The name follows the rename, jobid and extra flags extns, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + + return (char *)rec + changelog_rec_offset(rec->cr_flags & CLF_SUPPORTED, + cref & CLFE_SUPPORTED); +} + +static inline __kernel_size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + char *cr_name = changelog_rec_name(rec); + + return cr_name + strlen(cr_name) + 1; +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * - CLF_EXTRA_FLAGS will not be added without CLF_JOBID being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. 
+ * @param[in] cref_want Flags describing the desired extra extensions. + */ +static inline void changelog_remap_rec(struct changelog_rec *rec, + enum changelog_rec_flags crf_wanted, + enum changelog_rec_extra_flags cref_want) +{ + char *xattr_mov = NULL; + char *omd_mov = NULL; + char *nid_mov = NULL; + char *uidgid_mov = NULL; + char *ef_mov; + char *jid_mov; + char *rnm_mov; + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + crf_wanted &= CLF_SUPPORTED; + cref_want &= CLFE_SUPPORTED; + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) { + if (!(rec->cr_flags & CLF_EXTRA_FLAGS) || + (rec->cr_flags & CLF_EXTRA_FLAGS && + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_SUPPORTED) == + cref_want)) + return; + } + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted, cref_want), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of extensions in the remapped record */ + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + xattr_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~CLFE_XATTR); + omd_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_OPEN | + CLFE_XATTR)); + nid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_NID | + CLFE_OPEN | + CLFE_XATTR)); + uidgid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_UIDGID | + CLFE_NID | + CLFE_OPEN | + CLFE_XATTR)); + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + } + + ef_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~CLF_EXTRA_FLAGS, + CLFE_INVALID); + jid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & + ~(CLF_EXTRA_FLAGS | CLF_JOBID), + CLFE_INVALID); + rnm_mov = (char *)rec + + changelog_rec_offset(crf_wanted & + ~(CLF_EXTRA_FLAGS | + CLF_JOBID | + CLF_RENAME), + CLFE_INVALID); + + /* Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_EXTRA_FLAGS) && + (rec->cr_flags & CLF_EXTRA_FLAGS)) { + if ((cref_want & CLFE_XATTR) && (cref & CLFE_XATTR)) + memmove(xattr_mov, changelog_rec_xattr(rec), + sizeof(struct changelog_ext_xattr)); + + if ((cref_want & CLFE_OPEN) && (cref & CLFE_OPEN)) + memmove(omd_mov, changelog_rec_openmode(rec), + sizeof(struct changelog_ext_openmode)); + + if ((cref_want & CLFE_NID) && (cref & CLFE_NID)) + memmove(nid_mov, changelog_rec_nid(rec), + sizeof(struct changelog_ext_nid)); + + if ((cref_want & CLFE_UIDGID) && (cref & CLFE_UIDGID)) + memmove(uidgid_mov, changelog_rec_uidgid(rec), + sizeof(struct changelog_ext_uidgid)); + + memmove(ef_mov, changelog_rec_extra_flags(rec), + sizeof(struct changelog_ext_extra_flags)); + } + + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if (xattr_mov && (cref_want & CLFE_XATTR) && + !(cref & CLFE_XATTR)) + memset(xattr_mov, 0, sizeof(struct changelog_ext_xattr)); + + if (omd_mov && (cref_want & CLFE_OPEN) && + !(cref & CLFE_OPEN)) + memset(omd_mov, 0, sizeof(struct changelog_ext_openmode)); + + if (nid_mov && (cref_want & CLFE_NID) && + !(cref & CLFE_NID)) + memset(nid_mov, 0, sizeof(struct changelog_ext_nid)); + + if (uidgid_mov && (cref_want & CLFE_UIDGID) && + !(cref & CLFE_UIDGID)) + memset(uidgid_mov, 0, 
sizeof(struct changelog_ext_uidgid)); + + if ((crf_wanted & CLF_EXTRA_FLAGS) && + !(rec->cr_flags & CLF_EXTRA_FLAGS)) + memset(ef_mov, 0, sizeof(struct changelog_ext_extra_flags)); + + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; + if (rec->cr_flags & CLF_EXTRA_FLAGS) + changelog_rec_extra_flags(rec)->cr_extra_flags = + changelog_rec_extra_flags(rec)->cr_extra_flags | + cref_want; +} + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u32 idv_layout_version; /* FLR: layout version for OST objects */ + __u32 idv_flags; /* enum ioc_data_version_flags */ +}; + +enum ioc_data_version_flags { + LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */ + LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */ +}; + +#ifndef offsetof +#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_NONE = 0x00000000, + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_NONE = 0, + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; + +static inline const char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. 
+ * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + +static inline const char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + struct lu_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because __kernel_ssize_t is defined to be only + * [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
+ */ +static inline __kernel_size_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if ((__kernel_ssize_t)size < 0) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + +static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + struct lu_fid hai_fid; /* Lustre FID to operate on */ + struct lu_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/** + * helper function which print in hexa the first bytes of + * hai opaque field + * + * \param hai [IN] record to print + * \param buffer [IN,OUT] buffer to write the hex string to + * \param len [IN] max buffer length + * + * \retval buffer + */ +static inline char *hai_dump_data_field(const struct hsm_action_item *hai, + char *buffer, __kernel_size_t len) +{ + int i; + int data_len; + char *ptr; + + ptr = buffer; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0; (i < data_len) && (len > 2); i++) { + snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); + ptr += 2; + len -= 2; + } + + *ptr = '\0'; + + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator, ignored */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
See hai_zero */ +} __attribute__((packed)); + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + __kernel_size_t offset = __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + + return (struct hsm_action_item *)(hal->hal_fsname + offset); +} + +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + __kernel_size_t offset = __ALIGN_KERNEL(hai->hai_len, 8); + + return (struct hsm_action_item *)((char *)hai + offset); +} + +/* Return size of an hsm_action_list */ +static inline __kernel_size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + __kernel_size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += __ALIGN_KERNEL(hai->hai_len, 8); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + struct lu_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/* JSON objects */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, + LU_LADVISE_LOCKNOEXPAND = 3, + LU_LADVISE_LOCKAHEAD = 4, + LU_LADVISE_MAX +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ + [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \ + [LU_LADVISE_LOCKAHEAD] = "lockahead", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. 
*/ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, + LF_UNSET = 0x00000002, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +/* Masks of valid flags for each advice */ +#define LF_LOCKNOEXPAND_MASK LF_UNSET +/* Flags valid for all advices not explicitly specified */ +#define LF_DEFAULT_MASK LF_ASYNC +/* All flags */ +#define LF_MASK (LF_ASYNC | LF_UNSET) + +#define lla_lockahead_mode lla_value1 +#define lla_peradvice_flags lla_value2 +#define lla_lockahead_result lla_value3 + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. It is defined separately as we may need info which is + * only used locally. */ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, +}; + +struct sk_crypt_type { + const char *sct_name; + int sct_type; +}; + +struct sk_hmac_type { + const char *sht_name; + int sht_type; +}; + +enum lock_mode_user { + MODE_READ_USER = 1, + MODE_WRITE_USER, + MODE_MAX_USER, +}; + +#define LOCK_MODE_NAMES { \ + [MODE_READ_USER] = "READ",\ + [MODE_WRITE_USER] = "WRITE"\ +} + +enum lockahead_results { + LLA_RESULT_SENT = 0, + LLA_RESULT_DIFFERENT, + LLA_RESULT_SAME, +}; + +struct fid_array { + __u32 fa_nr; + /* make header's size equal lu_fid */ + __u32 fa_padding0; + __u64 fa_padding1; + struct lu_fid fa_fids[0]; +}; +#define OBD_MAX_FIDS_IN_ARRAY 4096 + +#if defined(__cplusplus) +} +#endif + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h similarity index 83% rename from drivers/staging/lustrefsx/lustre/include/lustre_ver.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h index 0557c2dd554e5..90aa25d8aab8a 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_ver.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h @@ -23,15 +23,9 @@ #define LUSTRE_VERSION_CODE \ OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) -/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches - * by this amount (set in lustre/autoconf/lustre-version.ac). */ -#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) - -#ifdef __KERNEL__ /* If lustre version of client and servers it connects to differs by more * than this amount, client would issue a warning. 
* (set in lustre/autoconf/lustre-version.ac) */ #define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 50, 0) -#endif #endif diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h index accc4495d156e..1f02294b9660d 100644 --- a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -34,7 +34,7 @@ #define _UPCALL_CACHE_H #include -#include +#include /** \defgroup ucache ucache * @@ -85,8 +85,8 @@ struct upcall_cache_entry { atomic_t ue_refcount; int ue_flags; wait_queue_head_t ue_waitq; - cfs_time_t ue_acquire_expire; - cfs_time_t ue_expire; + time64_t ue_acquire_expire; + time64_t ue_expire; union { struct md_identity identity; } u; @@ -121,8 +121,8 @@ struct upcall_cache { char uc_name[40]; /* for upcall */ char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; - int uc_acquire_expire; /* seconds */ - int uc_entry_expire; /* seconds */ + time64_t uc_acquire_expire; /* seconds */ + time64_t uc_entry_expire; /* seconds */ struct upcall_cache_ops *uc_ops; }; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c index 7dd0c65332649..b39b105a894e6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c @@ -36,11 +36,8 @@ * Author: Huang Wei * Author: Jay Xiong */ -#ifdef __KERNEL__ -# include -#else -# include -#endif + +#include #include enum { diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c index 1088d583145e7..59d1302a36516 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2013, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -269,38 +269,49 @@ ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, static void ldlm_extent_policy(struct ldlm_resource *res, struct ldlm_lock *lock, __u64 *flags) { - struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; - - if (lock->l_export == NULL) - /* - * this is local lock taken by server (e.g., as a part of - * OST-side locking, or unlink handling). Expansion doesn't - * make a lot of sense for local locks, because they are - * dropped immediately on operation completion and would only - * conflict with other threads. - */ - return; + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is a local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. 
+ */ + return; - if (lock->l_policy_data.l_extent.start == 0 && - lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) - /* fast-path whole file locks */ - return; + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; - ldlm_extent_internal_policy_granted(lock, &new_ex); - ldlm_extent_internal_policy_waiting(lock, &new_ex); + /* Because reprocess_queue zeroes flags and uses it to return + * LDLM_FL_LOCK_CHANGED, we must check for the NO_EXPANSION flag + * in the lock flags rather than the 'flags' argument */ + if (likely(!(lock->l_flags & LDLM_FL_NO_EXPANSION))) { + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); + } else { + LDLM_DEBUG(lock, "Not expanding manually requested lock.\n"); + new_ex.start = lock->l_policy_data.l_extent.start; + new_ex.end = lock->l_policy_data.l_extent.end; + /* In case the request is not on correct boundaries, we call + * fixup. (normally called in ldlm_extent_internal_policy_*) */ + ldlm_extent_internal_policy_fixup(lock, &new_ex, 0); + } - if (new_ex.start != lock->l_policy_data.l_extent.start || - new_ex.end != lock->l_policy_data.l_extent.end) { - *flags |= LDLM_FL_LOCK_CHANGED; - lock->l_policy_data.l_extent.start = new_ex.start; - lock->l_policy_data.l_extent.end = new_ex.end; - } + if (!ldlm_extent_equal(&new_ex, &lock->l_policy_data.l_extent)) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } } static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) { struct ldlm_resource *res = lock->l_resource; - cfs_time_t now = cfs_time_current(); + time64_t now = ktime_get_seconds(); if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) return 1; @@ -308,8 +319,9 @@ static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) res->lr_contention_time = now; - return cfs_time_before(now, cfs_time_add(res->lr_contention_time, - cfs_time_seconds(ldlm_res_to_ns(res)->ns_contention_time))); + + return now < res->lr_contention_time + + ldlm_res_to_ns(res)->ns_contention_time; } struct ldlm_extent_compat_args { @@ -421,7 +433,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } if (tree->lit_mode == LCK_GROUP) { - if (*flags & LDLM_FL_BLOCK_NOWAIT) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT | + LDLM_FL_SPECULATIVE)) { compat = -EWOULDBLOCK; goto destroylock; } @@ -438,10 +451,24 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, continue; } - if (!work_list) { - rc = interval_is_overlapped(tree->lit_root,&ex); - if (rc) - RETURN(0); + /* We've found a potentially blocking lock, check + * compatibility. This handles locks other than GROUP + * locks, which are handled separately above. + * + * Locks with FL_SPECULATIVE are asynchronous requests + * which must never wait behind another lock, so they + * fail if any conflicting lock is found. 
*/ + if (!work_list || (*flags & LDLM_FL_SPECULATIVE)) { + rc = interval_is_overlapped(tree->lit_root, + &ex); + if (rc) { + if (!work_list) { + RETURN(0); + } else { + compat = -EWOULDBLOCK; + goto destroylock; + } + } } else { interval_search(tree->lit_root, &ex, ldlm_extent_compat_cb, &data); @@ -528,8 +555,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, lock->l_policy_data.l_extent.gid) { /* If existing lock with matched gid is granted, we grant new one too. */ - if (lock->l_req_mode == lock->l_granted_mode) - RETURN(2); + if (ldlm_is_granted(lock)) + RETURN(2); /* Otherwise we are scanning queue of waiting * locks and it means current request would @@ -537,7 +564,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, * already blocked. * If we are in nonblocking mode - return * immediately */ - if (*flags & LDLM_FL_BLOCK_NOWAIT) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { compat = -EWOULDBLOCK; goto destroylock; } @@ -556,8 +584,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } } - if (unlikely(req_mode == LCK_GROUP && - (lock->l_req_mode != lock->l_granted_mode))) { + if (unlikely(req_mode == LCK_GROUP && + !ldlm_is_granted(lock))) { scan = 1; compat = 0; if (lock->l_req_mode != LCK_GROUP) { @@ -580,10 +608,11 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } if (unlikely(lock->l_req_mode == LCK_GROUP)) { - /* If compared lock is GROUP, then requested is PR/PW/ - * so this is not compatible; extent range does not - * matter */ - if (*flags & LDLM_FL_BLOCK_NOWAIT) { + /* If compared lock is GROUP, then requested is + * PR/PW so this is not compatible; extent + * range does not matter */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { compat = -EWOULDBLOCK; goto destroylock; } else { @@ -602,6 +631,11 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, if (!work_list) RETURN(0); + if (*flags & LDLM_FL_SPECULATIVE) { + compat = -EWOULDBLOCK; + goto destroylock; + } + /* don't count conflicting glimpse locks */ if (lock->l_req_mode == LCK_PR && lock->l_policy_data.l_extent.start == 0 && @@ -642,7 +676,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, void ldlm_lock_prolong_one(struct ldlm_lock *lock, struct ldlm_prolong_args *arg) { - int timeout; + time64_t timeout; OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); @@ -662,7 +696,7 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock, */ timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); - LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout); + LDLM_DEBUG(lock, "refreshed to %llds.\n", timeout); arg->lpa_blocks_cnt++; @@ -752,25 +786,24 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head rpc_list; int rc, rc2; int contended_locks = 0; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? 
+ NULL : work_list; ENTRY; - LASSERT(lock->l_granted_mode != lock->l_req_mode); - LASSERT(list_empty(&res->lr_converting)); + LASSERT(!ldlm_is_granted(lock)); LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || !ldlm_is_ast_discard_data(lock)); - INIT_LIST_HEAD(&rpc_list); check_res_locked(res); *err = ELDLM_OK; if (intention == LDLM_PROCESS_RESCAN) { - /* Careful observers will note that we don't handle -EWOULDBLOCK - * here, but it's ok for a non-obvious reason -- compat_queue - * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT). - * flags should always be zero here, and if that ever stops - * being true, we want to find out. */ + /* Careful observers will note that we don't handle -EWOULDBLOCK + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT | + * SPECULATIVE). flags should always be zero here, and if that + * ever stops being true, we want to find out. */ LASSERT(*flags == 0); rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, NULL, &contended_locks); @@ -786,49 +819,38 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) ldlm_extent_policy(res, lock, flags); - ldlm_grant_lock(lock, work_list); + ldlm_grant_lock(lock, grant_work); RETURN(LDLM_ITER_CONTINUE); } - LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || - (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); - restart: contended_locks = 0; rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, - &rpc_list, &contended_locks); + work_list, &contended_locks); if (rc < 0) GOTO(out_rpc_list, rc); rc2 = 0; if (rc != 2) { rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, - flags, err, &rpc_list, + flags, err, work_list, &contended_locks); if (rc2 < 0) GOTO(out_rpc_list, rc = rc2); } - if (rc + rc2 != 2) { - /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to force - * client to wait for the lock endlessly once the lock is - * enqueued -bzzz */ - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, - LDLM_FL_NO_TIMEOUT); - if (rc == -ERESTART) - GOTO(restart, rc); - *err = rc; - } else { + if (rc + rc2 == 2) { ldlm_extent_policy(res, lock, flags); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); - rc = 0; + ldlm_grant_lock(lock, grant_work); + } else { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to + * force client to wait for the lock endlessly once + * the lock is enqueued -bzzz */ + *flags |= LDLM_FL_NO_TIMEOUT; } + rc = LDLM_ITER_CONTINUE; out_rpc_list: - if (!list_empty(&rpc_list)) { - LASSERT(!ldlm_is_ast_discard_data(lock)); - ldlm_discard_bl_list(&rpc_list); - } RETURN(rc); } #endif /* HAVE_SERVER_SUPPORT */ @@ -943,7 +965,7 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) EXPORT_SYMBOL(ldlm_extent_shift_kms); struct kmem_cache *ldlm_interval_slab; -struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +static struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) { struct ldlm_interval *node; ENTRY; @@ -1004,6 +1026,14 @@ static inline int ldlm_mode_to_index(enum ldlm_mode mode) return index; } +int ldlm_extent_alloc_lock(struct ldlm_lock *lock) +{ + lock->l_tree_node = NULL; + if (ldlm_interval_alloc(lock) == NULL) + return -ENOMEM; + return 0; +} + /** Add newly granted lock into interval tree for the resource. 
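 *
 * Editor's sketch of the expected flow (field and helper names below are
 * assumptions based on the surrounding code, not part of this patch):
 *
 *	idx  = ldlm_mode_to_index(lock->l_granted_mode);
 *	tree = &res->lr_itree[idx];
 *	rc   = interval_insert(&lock->l_tree_node->li_node, &tree->lit_root);
 *
 * i.e. granted extent locks are kept in one interval tree per lock mode,
 * so ldlm_extent_compat_queue() only has to scan the trees whose modes
 * conflict with the request.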
*/ void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock) @@ -1013,7 +1043,7 @@ void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_extent *extent; int idx, rc; - LASSERT(lock->l_granted_mode == lock->l_req_mode); + LASSERT(ldlm_is_granted(lock)); node = lock->l_tree_node; LASSERT(node != NULL); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c index b3d669799ceba..be849938cc6c6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -27,7 +27,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2014, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -289,6 +289,8 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int overlaps = 0; int splitted = 0; const struct ldlm_callback_suite null_cbs = { NULL }; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; ENTRY; CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " @@ -348,7 +350,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, reprocess_failed = 1; if (ldlm_flock_deadlock(req, lock)) { ldlm_flock_cancel_on_deadlock(req, - work_list); + grant_work); RETURN(LDLM_ITER_CONTINUE); } continue; @@ -579,7 +581,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, restart: ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, - LDLM_PROCESS_RESCAN); + LDLM_PROCESS_RESCAN, NULL); unlock_res_and_lock(req); rc = ldlm_run_ast_work(ns, &rpc_list, @@ -590,7 +592,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, } } else { LASSERT(req->l_completion_ast); - ldlm_add_ast_work_item(req, NULL, work_list); + ldlm_add_ast_work_item(req, NULL, grant_work); } #else /* !HAVE_SERVER_SUPPORT */ /* The only one possible case for client-side calls flock @@ -742,7 +744,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) RETURN(-EIO); } - /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ ldlm_resource_unlink_lock(lock); /* Import invalidation. We need to actually release the lock @@ -757,7 +759,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) LASSERT(ldlm_is_test_lock(lock)); if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) - mode = flock_type(getlk); + mode = getlk->fl_type; else mode = lock->l_granted_mode; @@ -780,27 +782,26 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) LDLM_DEBUG(lock, "client-side enqueue granted"); if (flags & LDLM_FL_TEST_LOCK) { - /* fcntl(F_GETLK) request */ - /* The old mode was saved in getlk->fl_type so that if the mode - * in the lock changes we can decref the appropriate refcount.*/ + /* + * fcntl(F_GETLK) request + * The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount. 
+ */ LASSERT(ldlm_is_test_lock(lock)); - ldlm_flock_destroy(lock, flock_type(getlk), - LDLM_FL_WAIT_NOREPROC); + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); switch (lock->l_granted_mode) { case LCK_PR: - flock_set_type(getlk, F_RDLCK); + getlk->fl_type = F_RDLCK; break; case LCK_PW: - flock_set_type(getlk, F_WRLCK); + getlk->fl_type = F_WRLCK; break; default: - flock_set_type(getlk, F_UNLCK); + getlk->fl_type = F_UNLCK; } - flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid); - flock_set_start(getlk, - (loff_t)lock->l_policy_data.l_flock.start); - flock_set_end(getlk, - (loff_t)lock->l_policy_data.l_flock.end); + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; } else { __u64 noreproc = LDLM_FL_WAIT_NOREPROC; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c index 90e34a612d7c8..c407cf676fba8 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,6 +57,89 @@ #include "ldlm_internal.h" #ifdef HAVE_SERVER_SUPPORT + +/** + * It should iterate through all waiting locks on a given resource queue and + * attempt to grant them. An optimization is to check only heads waitintg + * locks for each inodebit type. + * + * Must be called with resource lock held. + */ +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint) +{ + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); + struct ldlm_ibits_queues *queues = res->lr_ibits_queues; + int i; + + ENTRY; + + check_res_locked(res); + + LASSERT(res->lr_type == LDLM_IBITS); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + + if (intention == LDLM_PROCESS_RECOVERY) + return ldlm_reprocess_queue(res, queue, work_list, intention, + NULL); + +restart: + CDEBUG(D_DLMTRACE, "--- Reprocess resource "DLDLMRES" (%p)\n", + PLDLMRES(res), res); + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + struct list_head *head = &queues->liq_waiting[i]; + struct ldlm_lock *pending; + struct ldlm_ibits_node *node; + + if (list_empty(head)) + continue; + if (hint && !(hint->l_policy_data.l_inodebits.bits & (1 << i))) + continue; + + node = list_entry(head->next, struct ldlm_ibits_node, + lin_link[i]); + + pending = node->lock; + LDLM_DEBUG(pending, "Reprocessing lock from queue %d", i); + + flags = 0; + rc = ldlm_process_inodebits_lock(pending, &flags, intention, + &err, &rpc_list); + if (ldlm_is_granted(pending)) { + list_splice(&rpc_list, work_list); + /* Try to grant more locks from current queue */ + i--; + } else { + list_splice(&rpc_list, &bl_ast_list); + } + } + + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if 
(!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + + RETURN(rc); +} + /** * Determine if the lock is compatible with all locks on the queue. * @@ -79,12 +162,18 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, struct list_head *tmp; struct ldlm_lock *lock; __u64 req_bits = req->l_policy_data.l_inodebits.bits; + __u64 *try_bits = &req->l_policy_data.l_inodebits.try_bits; int compat = 1; + ENTRY; - /* There is no sense in lock with no bits set, I think. - * Also, such a lock would be compatible with any other bit lock */ - LASSERT(req_bits != 0); + /* There is no sense in lock with no bits set. Also such a lock + * would be compatible with any other bit lock. + * Meanwhile that can be true if there were just try_bits and all + * are failed, so just exit gracefully and let the caller to care. + */ + if ((req_bits | *try_bits) == 0) + RETURN(0); list_for_each(tmp, queue) { struct list_head *mode_tail; @@ -99,11 +188,10 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* last lock in mode group */ LASSERT(lock->l_sl_mode.prev != NULL); - mode_tail = &list_entry(lock->l_sl_mode.prev, - struct ldlm_lock, + mode_tail = &list_entry(lock->l_sl_mode.prev, struct ldlm_lock, l_sl_mode)->l_res_link; - /* if reqest lock is not COS_INCOMPAT and COS is disabled, + /* if request lock is not COS_INCOMPAT and COS is disabled, * they are compatible, IOW this request is from a local * transaction on a DNE system. */ if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) && @@ -125,8 +213,24 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* Advance loop cursor to last lock in policy group. */ tmp = &list_entry(lock->l_sl_policy.prev, - struct ldlm_lock, - l_sl_policy)->l_res_link; + struct ldlm_lock, + l_sl_policy)->l_res_link; + + /* New lock's try_bits are filtered out by ibits + * of all locks in both granted and waiting queues. + */ + *try_bits &= ~(lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits); + + if ((req_bits | *try_bits) == 0) + RETURN(0); + + /* The new lock ibits is more preferable than try_bits + * of waiting locks so drop conflicting try_bits in + * the waiting queue. + * Notice that try_bits of granted locks must be zero. + */ + lock->l_policy_data.l_inodebits.try_bits &= ~req_bits; /* Locks with overlapping bits conflict. */ if (lock->l_policy_data.l_inodebits.bits & req_bits) { @@ -138,6 +242,7 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, ldlm_is_cos_enabled(req) && lock->l_client_cookie == req->l_client_cookie) goto not_conflicting; + /* Found a conflicting policy group. 
*/ if (!work_list) RETURN(0); @@ -146,22 +251,21 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* Add locks of the policy group to @work_list * as blocking locks for @req */ - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, - work_list); - head = &lock->l_sl_policy; + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + head = &lock->l_sl_policy; list_for_each_entry(lock, head, l_sl_policy) - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, - work_list); - } - not_conflicting: - if (tmp == mode_tail) - break; - - tmp = tmp->next; - lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, + req, work_list); + } +not_conflicting: + if (tmp == mode_tail) + break; + + tmp = tmp->next; + lock = list_entry(tmp, struct ldlm_lock, l_res_link); } /* Loop over policy groups within one mode group. */ } /* Loop over mode groups within @queue. */ @@ -182,57 +286,95 @@ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head rpc_list; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; int rc; + ENTRY; - LASSERT(lock->l_granted_mode != lock->l_req_mode); - LASSERT(list_empty(&res->lr_converting)); - INIT_LIST_HEAD(&rpc_list); + LASSERT(!ldlm_is_granted(lock)); check_res_locked(res); - /* (*flags & LDLM_FL_BLOCK_NOWAIT) is for layout lock right now. */ - if (intention == LDLM_PROCESS_RESCAN || - (*flags & LDLM_FL_BLOCK_NOWAIT)) { - *err = ELDLM_LOCK_ABORTED; - if (*flags & LDLM_FL_BLOCK_NOWAIT) + if (intention == LDLM_PROCESS_RESCAN) { + struct list_head *bl_list; + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + bl_list = NULL; *err = ELDLM_LOCK_WOULDBLOCK; + } else { + bl_list = work_list; + *err = ELDLM_LOCK_ABORTED; + } - rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, NULL); - if (!rc) - RETURN(LDLM_ITER_STOP); - rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); - if (!rc) - RETURN(LDLM_ITER_STOP); + LASSERT(lock->l_policy_data.l_inodebits.bits != 0); - ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); + /* It is possible that some of granted locks was not canceled + * but converted and is kept in granted queue. So there is + * a window where lock with 'ast_sent' might become granted + * again. Meanwhile a new lock may appear in that window and + * conflicts with the converted lock so the following scenario + * is possible: + * + * 1) lock1 conflicts with lock2 + * 2) bl_ast was sent for lock2 + * 3) lock3 comes and conflicts with lock2 too + * 4) no bl_ast sent because lock2->l_bl_ast_sent is 1 + * 5) lock2 was converted for lock1 but not for lock3 + * 6) lock1 granted, lock3 still is waiting for lock2, but + * there will never be another bl_ast for that + * + * To avoid this scenario the work_list is used below to collect + * any blocked locks from granted queue during every reprocess + * and bl_ast will be sent if needed. 
+ */ + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, + bl_list); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + /* grant also try_bits if any */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *flags |= LDLM_FL_LOCK_CHANGED; + } + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); *err = ELDLM_OK; RETURN(LDLM_ITER_CONTINUE); } - LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || - (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); - restart: - rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, &rpc_list); - rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, &rpc_list); + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, work_list); - if (rc != 2) { - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); - if (rc == -ERESTART) - GOTO(restart, rc); - *err = rc; + if (rc != 2) { + /* if there were only bits to try and all are conflicting */ + if ((lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits) == 0) { + *err = ELDLM_LOCK_WOULDBLOCK; + } else { + *err = ELDLM_OK; + } } else { + /* grant also all remaining try_bits */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *flags |= LDLM_FL_LOCK_CHANGED; + } + LASSERT(lock->l_policy_data.l_inodebits.bits); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); - rc = 0; + ldlm_grant_lock(lock, grant_work); + *err = ELDLM_OK; } - if (!list_empty(&rpc_list)) - ldlm_discard_bl_list(&rpc_list); - - RETURN(rc); + RETURN(LDLM_ITER_CONTINUE); } #endif /* HAVE_SERVER_SUPPORT */ @@ -240,6 +382,10 @@ void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, union ldlm_policy_data *lpolicy) { lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; + /** + * try_bits are to be handled outside of generic write_to_local due + * to different behavior on a server and client. + */ } void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, @@ -247,4 +393,185 @@ void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, { memset(wpolicy, 0, sizeof(*wpolicy)); wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; + wpolicy->l_inodebits.try_bits = lpolicy->l_inodebits.try_bits; +} + +/** + * Attempt to convert already granted IBITS lock with several bits set to + * a lock with less bits (downgrade). + * + * Such lock conversion is used to keep lock with non-blocking bits instead of + * cancelling it, introduced for better support of DoM files. 
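+ *
+ * Editor's illustrative usage (hypothetical caller; the bit name used
+ * here is only an example and is not introduced by this patch):
+ *
+ *	lock_res_and_lock(lock);
+ *	ldlm_inodebits_drop(lock, MDS_INODELOCK_DOM);
+ *	unlock_res_and_lock(lock);
+ *
+ * The lock stays granted with the remaining bits and is re-inserted into
+ * the granted skiplist according to its new inodebits.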
+ */ +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop) +{ + ENTRY; + + check_res_locked(lock->l_resource); + + /* Just return if there are no conflicting bits */ + if ((lock->l_policy_data.l_inodebits.bits & to_drop) == 0) { + LDLM_WARN(lock, "try to drop unset bits %#llx/%#llx", + lock->l_policy_data.l_inodebits.bits, to_drop); + /* nothing to do */ + RETURN(0); + } + + /* remove lock from a skiplist and put in the new place + * according with new inodebits */ + ldlm_resource_unlink_lock(lock); + lock->l_policy_data.l_inodebits.bits &= ~to_drop; + ldlm_grant_lock_with_skiplist(lock); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_inodebits_drop); + +/* convert single lock */ +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct ldlm_lock_desc ld = { { 0 } }; + __u64 drop_bits, new_bits; + __u32 flags = 0; + int rc; + + ENTRY; + + check_res_locked(lock->l_resource); + + /* Lock is being converted already */ + if (ldlm_is_converting(lock)) { + if (!(cancel_flags & LCF_ASYNC)) { + struct l_wait_info lwi = { 0 }; + + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, + is_lock_converted(lock), &lwi); + lock_res_and_lock(lock); + } + RETURN(0); + } + + /* lru_cancel may happen in parallel and call ldlm_cli_cancel_list() + * independently. + */ + if (ldlm_is_canceling(lock)) + RETURN(-EINVAL); + + /* no need in only local convert */ + if (lock->l_flags & (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)) + RETURN(-EINVAL); + + drop_bits = lock->l_policy_data.l_inodebits.cancel_bits; + /* no cancel bits - means that caller needs full cancel */ + if (drop_bits == 0) + RETURN(-EINVAL); + + new_bits = lock->l_policy_data.l_inodebits.bits & ~drop_bits; + /* check if all lock bits are dropped, proceed with cancel */ + if (!new_bits) + RETURN(-EINVAL); + + /* check if no dropped bits, consider this as successful convert */ + if (lock->l_policy_data.l_inodebits.bits == new_bits) + RETURN(0); + + ldlm_set_converting(lock); + /* Finally call cancel callback for remaining bits only. + * It is important to have converting flag during that + * so blocking_ast callback can distinguish convert from + * cancels. + */ + ld.l_policy_data.l_inodebits.cancel_bits = drop_bits; + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, &ld, lock->l_ast_data, LDLM_CB_CANCELING); + /* now notify server about convert */ + rc = ldlm_cli_convert_req(lock, &flags, new_bits); + lock_res_and_lock(lock); + if (rc) + GOTO(full_cancel, rc); + + /* Finally clear these bits in lock ibits */ + ldlm_inodebits_drop(lock, drop_bits); + + /* Being locked again check if lock was canceled, it is important + * to do and don't drop cbpending below + */ + if (ldlm_is_canceling(lock)) + GOTO(full_cancel, rc = -EINVAL); + + /* also check again if more bits to be cancelled appeared */ + if (drop_bits != lock->l_policy_data.l_inodebits.cancel_bits) + GOTO(clear_converting, rc = -EAGAIN); + + /* clear cbpending flag early, it is safe to match lock right after + * client convert because it is downgrade always. + */ + ldlm_clear_cbpending(lock); + ldlm_clear_bl_ast(lock); + spin_lock(&ns->ns_lock); + if (list_empty(&lock->l_lru)) + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + /* the job is done, zero the cancel_bits. If more conflicts appear, + * it will result in another cycle of ldlm_cli_inodebits_convert(). 
+ */ +full_cancel: + lock->l_policy_data.l_inodebits.cancel_bits = 0; +clear_converting: + ldlm_clear_converting(lock); + RETURN(rc); +} + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock) +{ + if (ldlm_is_ns_srv(lock)) { + int i; + + OBD_SLAB_ALLOC_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + if (lock->l_ibits_node == NULL) + return -ENOMEM; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&lock->l_ibits_node->lin_link[i]); + lock->l_ibits_node->lock = lock; + } else { + lock->l_ibits_node = NULL; + } + return 0; +} + +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + int i; + + if (!ldlm_is_ns_srv(lock)) + return; + + if (head == &res->lr_waiting) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + if (lock->l_policy_data.l_inodebits.bits & (1 << i)) + list_add_tail(&lock->l_ibits_node->lin_link[i], + &res->lr_ibits_queues->liq_waiting[i]); + } + } else if (head == &res->lr_granted && lock->l_ibits_node != NULL) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + LASSERT(list_empty(&lock->l_ibits_node->lin_link[i])); + OBD_SLAB_FREE_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + lock->l_ibits_node = NULL; + } +} + +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock) +{ + int i; + + ldlm_unlink_lock_skiplist(lock); + if (!ldlm_is_ns_srv(lock)) + return; + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + list_del_init(&lock->l_ibits_node->lin_link[i]); } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h index 779dec55882e5..733773c50ed0c 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,6 +40,7 @@ extern struct mutex ldlm_cli_namespace_lock; extern struct list_head ldlm_cli_active_namespace_list; extern struct list_head ldlm_cli_inactive_namespace_list; extern unsigned int ldlm_cancel_unused_locks_before_replay; +extern struct kmem_cache *ldlm_glimpse_work_kmem; static inline int ldlm_namespace_nr_read(enum ldlm_side client) { @@ -97,30 +98,27 @@ struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); /* ldlm_request.c */ /* Cancel lru flag, it indicates we cancel aged locks. 
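 *
 * Editor's illustrative use of the reworked interface below (assumed
 * caller, not part of this patch): ask for at least 128 aged locks to be
 * cancelled without blocking on RPCs while walking the LRU:
 *
 *	ldlm_cancel_lru(ns, 128, LCF_ASYNC, LDLM_LRU_FLAG_NO_WAIT);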
*/ enum ldlm_lru_flags { - LDLM_LRU_FLAG_AGED = 0x01, /* Cancel aged locks (non LRU resize) */ - LDLM_LRU_FLAG_PASSED = 0x02, /* Cancel passed number of locks */ - LDLM_LRU_FLAG_SHRINK = 0x04, /* Cancel locks from shrinker */ - LDLM_LRU_FLAG_LRUR = 0x08, /* Cancel locks from lru resize */ - LDLM_LRU_FLAG_NO_WAIT = 0x10, /* Cancel locks w/o blocking (neither - * sending nor waiting for any RPCs) */ - LDLM_LRU_FLAG_CLEANUP = 0x20, /* Used when clearing lru, tells - * prepare_lru_list to set discard flag - * on PR extent locks so we don't waste - * time saving pages that will be - * discarded momentarily */ + LDLM_LRU_FLAG_NO_WAIT = 0x1, /* Cancel locks w/o blocking (neither + * sending nor waiting for any RPCs) */ + LDLM_LRU_FLAG_CLEANUP = 0x2, /* Used when clearing lru, tells + * prepare_lru_list to set discard flag + * on PR extent locks so we don't waste + * time saving pages that will be + * discarded momentarily */ }; -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, + struct list_head *cancels, int min, int max, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags); extern unsigned int ldlm_enqueue_min; /* ldlm_resource.c */ extern struct kmem_cache *ldlm_resource_slab; extern struct kmem_cache *ldlm_lock_slab; +extern struct kmem_cache *ldlm_inodebits_slab; extern struct kmem_cache *ldlm_interval_tree_slab; void ldlm_resource_insert_lock_after(struct ldlm_lock *original, @@ -135,6 +133,7 @@ typedef enum { LDLM_WORK_GL_AST } ldlm_desc_ast_t; +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, enum req_location loc, void *data, int size); @@ -143,7 +142,9 @@ ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, enum ldlm_type type, enum ldlm_mode mode, const struct ldlm_callback_suite *cbs, void *data, __u32 lvb_len, enum lvb_type lvb_type); -enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *cookie, __u64 *flags); void ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); @@ -154,13 +155,16 @@ void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, #ifdef HAVE_SERVER_SUPPORT int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, struct list_head *work_list, - enum ldlm_process_intention intention); + enum ldlm_process_intention intention, + struct ldlm_lock *hint); int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, - struct list_head *rpc_list, __u64 grant_flags); + struct list_head *rpc_list); void ldlm_discard_bl_list(struct list_head *bl_list); +void ldlm_clear_blocking_lock(struct ldlm_lock *lock); +void ldlm_clear_blocking_data(struct ldlm_lock *lock); #endif int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - ldlm_desc_ast_t ast_type); + ldlm_desc_ast_t ast_type); int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); #define 
ldlm_lock_remove_from_lru(lock) \ @@ -173,6 +177,7 @@ void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); int ldlm_export_cancel_blocked_locks(struct obd_export *exp); int ldlm_export_cancel_locks(struct obd_export *exp); +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); /* ldlm_lockd.c */ int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, @@ -185,6 +190,7 @@ int ldlm_bl_thread_wakeup(void); void ldlm_handle_bl_callback(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, struct ldlm_lock *lock); +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock); #ifdef HAVE_SERVER_SUPPORT /* ldlm_plain.c */ @@ -197,14 +203,25 @@ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_process_intention intention, enum ldlm_error *err, struct list_head *work_list); +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); /* ldlm_extent.c */ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_process_intention intention, enum ldlm_error *err, struct list_head *work_list); #endif +int ldlm_extent_alloc_lock(struct ldlm_lock *lock); void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); void ldlm_extent_unlink_lock(struct ldlm_lock *lock); +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock); +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock); +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock); + /* ldlm_flock.c */ int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, enum ldlm_process_intention intention, @@ -216,7 +233,7 @@ void ldlm_destroy_flock_export(struct obd_export *exp); void l_check_ns_lock(struct ldlm_namespace *ns); void l_check_no_ns_lock(struct ldlm_namespace *ns); -extern struct proc_dir_entry *ldlm_svc_proc_dir; +extern struct dentry *ldlm_svc_debugfs_dir; struct ldlm_state { struct ptlrpc_service *ldlm_cb_service; @@ -230,7 +247,6 @@ struct ldlm_state { extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); -extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); extern void ldlm_interval_free(struct ldlm_interval *node); /* this function must be called with res lock held */ static inline struct ldlm_extent * @@ -281,7 +297,7 @@ enum ldlm_policy_res { static ssize_t var##_store(struct kobject *kobj, \ struct attribute *attr, \ const char *buffer, \ - unsigned long count) \ + size_t count) \ { \ struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ pl_kobj); \ @@ -317,7 +333,7 @@ enum ldlm_policy_res { static ssize_t var##_store(struct kobject *kobj, \ struct attribute *attr, \ const char *buffer, \ - unsigned long count) \ + size_t count) \ { \ struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ pl_kobj); \ @@ -336,28 +352,24 @@ enum ldlm_policy_res { struct __##var##__dummy_write {; } /* semicolon catcher */ static inline void -ldlm_add_var(struct lprocfs_vars *vars, struct proc_dir_entry *proc_dir, - const char *name, void *data, const struct proc_ops *ops) +ldlm_add_var(struct ldebugfs_vars *vars, struct dentry *debugfs_entry, + const char *name, void *data, const struct file_operations *ops) { snprintf((char *)vars->name, 
MAX_STRING_SIZE, "%s", name); vars->data = data; vars->fops = ops; - lprocfs_add_vars(proc_dir, vars, NULL); + ldebugfs_add_vars(debugfs_entry, vars, NULL); } static inline int is_granted_or_cancelled(struct ldlm_lock *lock) { - int ret = 0; + int ret = 0; - lock_res_and_lock(lock); - if ((lock->l_req_mode == lock->l_granted_mode) && - !ldlm_is_cp_reqd(lock)) - ret = 1; - else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) - ret = 1; - unlock_res_and_lock(lock); + lock_res_and_lock(lock); + ret = is_granted_or_cancelled_nolock(lock); + unlock_res_and_lock(lock); - return ret; + return ret; } static inline bool is_bl_done(struct ldlm_lock *lock) @@ -373,6 +385,17 @@ static inline bool is_bl_done(struct ldlm_lock *lock) return bl_done; } +static inline bool is_lock_converted(struct ldlm_lock *lock) +{ + bool ret = 0; + + lock_res_and_lock(lock); + ret = (lock->l_policy_data.l_inodebits.cancel_bits == 0); + unlock_res_and_lock(lock); + + return ret; +} + typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, union ldlm_policy_data *); typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c index 33d871da4bdf6..41e655b6fc353 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,6 +39,8 @@ #define DEBUG_SUBSYSTEM S_LDLM +#include +#include #include #include #include @@ -358,12 +360,13 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) sizeof(server_uuid))); cli->cl_dirty_pages = 0; + cli->cl_dirty_max_pages = 0; cli->cl_avail_grant = 0; /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ /* cl_dirty_max_pages may be changed at connect time in * ptlrpc_connect_interpret(). */ client_adjust_max_dirty(cli); - INIT_LIST_HEAD(&cli->cl_cache_waiters); + init_waitqueue_head(&cli->cl_cache_waiters); INIT_LIST_HEAD(&cli->cl_loi_ready_list); INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); INIT_LIST_HEAD(&cli->cl_loi_write_list); @@ -390,9 +393,15 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) spin_lock_init(&cli->cl_lru_list_lock); atomic_long_set(&cli->cl_unstable_count, 0); INIT_LIST_HEAD(&cli->cl_shrink_list); + INIT_LIST_HEAD(&cli->cl_grant_chain); + + INIT_LIST_HEAD(&cli->cl_flight_waiters); + cli->cl_rpcs_in_flight = 0; init_waitqueue_head(&cli->cl_destroy_waitq); atomic_set(&cli->cl_destroy_in_flight, 0); + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; #ifdef ENABLE_CHECKSUM /* Turn on checksumming by default. */ cli->cl_checksum = 1; @@ -401,7 +410,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * Set cl_chksum* to CRC32 for now to avoid returning screwed info * through procfs. */ - cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + cli->cl_cksum_type = cli->cl_supp_cksum_types; #endif atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); @@ -409,6 +418,8 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * from OFD after connecting. 
*/ cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + cli->cl_max_short_io_bytes = OBD_MAX_SHORT_IO_BYTES; + /* set cl_chunkbits default value to PAGE_SHIFT, * it will be updated at OSC connection time. */ cli->cl_chunkbits = PAGE_SHIFT; @@ -426,7 +437,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; else cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } + } spin_lock_init(&cli->cl_mod_rpcs_lock); spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); @@ -599,6 +610,7 @@ int client_connect_import(const struct lu_env *env, ocd->ocd_connect_flags, "old %#llx, new %#llx\n", data->ocd_connect_flags, ocd->ocd_connect_flags); data->ocd_connect_flags = ocd->ocd_connect_flags; + data->ocd_connect_flags2 = ocd->ocd_connect_flags2; } ptlrpc_pinger_add_import(imp); @@ -731,6 +743,32 @@ int server_disconnect_export(struct obd_export *exp) } EXPORT_SYMBOL(server_disconnect_export); +static inline int target_check_recovery_timer(struct obd_device *target) +{ + ktime_t remaining; + s64 timeout; + + if (!target->obd_recovering || target->obd_recovery_start == 0) + return 0; + + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > -30) + return 0; + + /* the recovery timer should expire, but it isn't triggered, + * it's better to abort the recovery of this target to speed up + * the recovery of the whole cluster. */ + spin_lock(&target->obd_dev_lock); + if (target->obd_recovering) { + CERROR("%s: Aborting recovery\n", target->obd_name); + target->obd_abort_recovery = 1; + wake_up(&target->obd_next_transno_waitq); + } + spin_unlock(&target->obd_dev_lock); + return 0; +} + /* -------------------------------------------------------------------------- * from old lib/target.c * -------------------------------------------------------------------------- */ @@ -741,12 +779,11 @@ static int target_handle_reconnect(struct lustre_handle *conn, { struct obd_device *target; struct lustre_handle *hdl; - cfs_time_t now; - cfs_time_t deadline; - int timeout; + ktime_t remaining; + s64 timeout; int rc = 0; - ENTRY; + ENTRY; hdl = &exp->exp_imp_reverse->imp_remote_handle; if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { conn->cookie = exp->exp_handle.h_cookie; @@ -780,46 +817,45 @@ static int target_handle_reconnect(struct lustre_handle *conn, GOTO(out_already, rc); } - now = cfs_time_current(); - deadline = target->obd_recovery_timer.expires; - if (cfs_time_before(now, deadline)) { - struct target_distribute_txn_data *tdtd = - class_exp2tgt(exp)->lut_tdtd; + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + atomic_read(&target->obd_max_recoverable_clients), + timeout / 60, timeout % 60); + } else { + struct target_distribute_txn_data *tdtd; int size = 0; int count = 0; char *buf = NULL; - timeout = cfs_duration_sec(cfs_time_sub(deadline, now)); + target_check_recovery_timer(target); + + tdtd = class_exp2tgt(exp)->lut_tdtd; if (tdtd && tdtd->tdtd_show_update_logs_retrievers) buf = tdtd->tdtd_show_update_logs_retrievers( tdtd->tdtd_show_retrievers_cbdata, &size, &count); if (count > 0) - LCONSOLE_WARN("%s: Recovery already passed deadline " - "%d:%.02d. 
It is due to DNE recovery " - "failed/stuck on the %d MDT(s):%s. " - "Please wait until all MDTs recovered " - "or abort the recovery by force.\n", - target->obd_name, timeout / 60, - timeout % 60, count, - buf ? buf : "unknown (not enough RAM)"); + LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs recovered or you may force MDT evicition via 'lctl --device %s abort_recovery.\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), count, + buf ? buf : "unknown (not enough RAM)", + (abs(timeout) + target->obd_recovery_timeout) / 60, + (abs(timeout) + target->obd_recovery_timeout) % 60, + target->obd_name); else - LCONSOLE_WARN("%s: Recovery already passed deadline " - "%d:%.02d. If you do not want to wait " - "more, please abort the recovery by " - "force.\n", target->obd_name, - timeout / 60, timeout % 60); + LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force taget eviction via 'lctl --device %s abort_recovery.\n", + target->obd_name, abs(timeout) / 60, + abs(timeout) % 60, target->obd_name); if (buf != NULL) OBD_FREE(buf, size); - } else { - timeout = cfs_duration_sec(cfs_time_sub(now, deadline)); - LCONSOLE_WARN("%s: Recovery already passed deadline" - " %d:%.02d, It is most likely due to DNE" - " recovery is failed or stuck, please wait a" - " few more minutes or abort the recovery.\n", - target->obd_name, timeout / 60, timeout % 60); } out_already: @@ -950,7 +986,6 @@ int target_handle_connect(struct ptlrpc_request *req) * reconnect case */ struct lustre_handle conn; struct lustre_handle *tmp; - struct obd_uuid tgtuuid; struct obd_uuid cluuid; char *str; int rc = 0; @@ -959,7 +994,6 @@ int target_handle_connect(struct ptlrpc_request *req) bool mds_conn = false, lw_client = false, initial_conn = false; bool mds_mds_conn = false; bool new_mds_mds_conn = false; - bool target_referenced = false; struct obd_connect_data *data, *tmpdata; int size, tmpsize; lnet_nid_t *client_nid = NULL; @@ -973,11 +1007,7 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } - obd_str2uuid(&tgtuuid, str); - target = class_uuid2obd(&tgtuuid); - if (!target) - target = class_name2obd(str); - + target = class_dev_by_str(str); if (!target) { deuuidify(str, NULL, &target_start, &target_len); LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " @@ -989,6 +1019,9 @@ int target_handle_connect(struct ptlrpc_request *req) } spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + if (target->obd_stopping || !target->obd_set_up) { spin_unlock(&target->obd_dev_lock); @@ -1010,13 +1043,6 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EAGAIN); } - /* Make sure the target isn't cleaned up while we're here. Yes, - * there's still a race between the above check and our incref here. - * Really, class_uuid2obd should take the ref. 
*/ - class_incref(target, __func__, current); - target_referenced = true; - - target->obd_conn_inprogress++; spin_unlock(&target->obd_dev_lock); str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); @@ -1033,11 +1059,13 @@ int target_handle_connect(struct ptlrpc_request *req) conn = *tmp; - size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, - RCL_CLIENT); - data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); - if (!data) - GOTO(out, rc = -EPROTO); + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + if (size < 0 || size > 8 * sizeof(struct obd_connect_data)) + GOTO(out, rc = -EPROTO); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); rc = req_capsule_server_pack(&req->rq_pill); if (rc) @@ -1055,50 +1083,36 @@ int target_handle_connect(struct ptlrpc_request *req) */ if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) GOTO(out, rc = -EPROTO); -#endif + /* Don't allow liblustre clients to connect. + * - testing was disabled in v2_2_50_0-61-g6a75d65 + * - building was disabled in v2_5_58_0-28-g7277179 + * - client code was deleted in v2_6_50_0-101-gcdfbc72, + * - clients were refused connect for version difference > 0.0.1.32 */ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - if (data->ocd_version < LUSTRE_VERSION_CODE - - LUSTRE_VERSION_ALLOWED_OFFSET || - data->ocd_version > LUSTRE_VERSION_CODE + - LUSTRE_VERSION_ALLOWED_OFFSET) { - DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) " - "libclient connection attempt", - data->ocd_version < LUSTRE_VERSION_CODE ? - "old" : "new", - OBD_OCD_VERSION_MAJOR(data->ocd_version), - OBD_OCD_VERSION_MINOR(data->ocd_version), - OBD_OCD_VERSION_PATCH(data->ocd_version), - OBD_OCD_VERSION_FIX(data->ocd_version)); - data = req_capsule_server_sized_get(&req->rq_pill, - &RMF_CONNECT_DATA, - offsetof(typeof(*data), ocd_version) + - sizeof(data->ocd_version)); - if (data) { - data->ocd_connect_flags = OBD_CONNECT_VERSION; - data->ocd_version = LUSTRE_VERSION_CODE; - } - GOTO(out, rc = -EPROTO); - } + DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); + GOTO(out, rc = -EPROTO); } +#endif /* Note: lw_client is needed in MDS-MDS failover during update log * processing, so we needs to allow lw_client to be connected at - * anytime, instead of only the initial connection */ - lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0; + * anytime, instead of only the initial connection + */ + lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT); if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { initial_conn = true; - mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0; - mds_mds_conn = (data->ocd_connect_flags & - OBD_CONNECT_MDS_MDS) != 0; + mds_conn = OCD_HAS_FLAG(data, MDS); + mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS); /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS * for Imperative Recovery connection from MGC to MGS. * * Via check OBD_CONNECT_FID, we can distinguish whether * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from - * MGC or MDT. */ + * MGC or MDT, since MGC does not use OBD_CONNECT_FID. 
+ */ if (!lw_client && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && (data->ocd_connect_flags & OBD_CONNECT_FID) && @@ -1147,27 +1161,29 @@ int target_handle_connect(struct ptlrpc_request *req) export = NULL; rc = -EALREADY; } else if ((mds_conn || (lw_client && initial_conn) || - data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && - export->exp_connection != NULL) { + OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) { spin_unlock(&export->exp_lock); if (req->rq_peer.nid != export->exp_connection->c_peer.nid) { /* MDS or LWP reconnected after failover. */ - LCONSOLE_WARN("%s: Received %s connection from " - "%s, removing former export from %s\n", - target->obd_name, mds_conn ? "MDS" : "LWP", - libcfs_nid2str(req->rq_peer.nid), - libcfs_nid2str(export->exp_connection->c_peer.nid)); + LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); } else { - /* New MDS connection from the same NID. */ - LCONSOLE_WARN("%s: Received new %s connection from " - "%s, removing former export from same NID\n", - target->obd_name, mds_conn ? "MDS" : "LWP", - libcfs_nid2str(req->rq_peer.nid)); + /* New connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + OCD_HAS_FLAG(data, MDS_MDS) ? + "keep" : "remove"); } if (req->rq_peer.nid == export->exp_connection->c_peer.nid && - data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) { - /* Because exports between MDTs will always be + OCD_HAS_FLAG(data, MDS_MDS)) { + /* + * Because exports between MDTs will always be * kept, let's do not fail such export if they * come from the same NID, otherwise it might * cause eviction between MDTs, which might @@ -1234,11 +1250,11 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc); } - CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n", - target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - target->obd_recovering ? "recovering/" : "", data->ocd_transno, - export, (long)cfs_time_current_sec(), - export ? (long)export->exp_last_request_time : 0); + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, ktime_get_seconds(), + export ? export->exp_last_request_time : 0); /* If this is the first time a client connects, reset the recovery * timer. Discard lightweight connections which might be local. 
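 *
 * Editor's sketch of the call this comment refers to (helper name and
 * arguments are assumptions; the implementing code is not visible in
 * this hunk):
 *
 *	if (!lw_client && rc == 0 && target->obd_recovering)
 *		check_and_start_recovery_timer(target, req, new_client);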
*/ @@ -1264,27 +1280,37 @@ int target_handle_connect(struct ptlrpc_request *req) /* allow "new" MDT to be connected during recovery, since we * need retrieve recovery update records from it */ if (target->obd_recovering && !lw_client && !mds_mds_conn) { - cfs_time_t t; - int c; /* connected */ - int i; /* in progress */ - int k; /* known */ - int s; /* stale/evicted */ - - c = atomic_read(&target->obd_connected_clients); - i = atomic_read(&target->obd_lock_replay_clients); - k = target->obd_max_recoverable_clients; - s = target->obd_stale_clients; - t = target->obd_recovery_timer.expires; - t = cfs_time_sub(t, cfs_time_current()); - t = cfs_duration_sec(t); - LCONSOLE_WARN("%s: Denying connection for new client %s" - "(at %s), waiting for %d known clients " - "(%d recovered, %d in progress, and %d " - "evicted) to recover in %d:%.02d\n", + struct hrtimer *timer = &target->obd_recovery_timer; + ktime_t remaining; + s64 timeout, left; + int in_progress; + int connected; + int known; + int stale; + char *msg; + + connected = atomic_read(&target->obd_connected_clients); + in_progress = atomic_read(&target->obd_lock_replay_clients); + known = + atomic_read(&target->obd_max_recoverable_clients); + stale = target->obd_stale_clients; + remaining = hrtimer_expires_remaining(timer); + left = ktime_divns(remaining, NSEC_PER_SEC); + if (ktime_to_ns(remaining) > 0) { + msg = "to recover in"; + timeout = left; + } else { + msg = "already passed deadline"; + timeout = -left; + + target_check_recovery_timer(target); + } + + LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), k, - c - i, i, s, (int)t / 60, - (int)t % 60); + libcfs_nid2str(req->rq_peer.nid), known, + connected - in_progress, in_progress, + stale, msg, timeout / 60, timeout % 60); rc = -EBUSY; } else { dont_check_exports: @@ -1339,37 +1365,26 @@ int target_handle_connect(struct ptlrpc_request *req) spin_unlock(&export->exp_lock); CDEBUG(D_RPCTRACE, "%s: %s already connected at greater " "or equal conn_cnt: %d >= %d\n", - cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - export->exp_conn_cnt, - lustre_msg_get_conn_cnt(req->rq_reqmsg)); - - GOTO(out, rc = -EALREADY); - } - LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); - export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); - /* Don't evict liblustre clients for not pinging. */ - if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - export->exp_libclient = 1; - spin_unlock(&export->exp_lock); - - spin_lock(&target->obd_dev_lock); - list_del_init(&export->exp_obd_chain_timed); - spin_unlock(&target->obd_dev_lock); - } else { - spin_unlock(&export->exp_lock); + GOTO(out, rc = -EALREADY); } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + spin_unlock(&export->exp_lock); - if (export->exp_connection != NULL) { + if (export->exp_connection != NULL) { /* Check to see if connection came from another NID. 
*/ - if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && + if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && !hlist_unhashed(&export->exp_nid_hash)) - cfs_hash_del(export->exp_obd->obd_nid_hash, - &export->exp_connection->c_peer.nid, - &export->exp_nid_hash); + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); - ptlrpc_connection_put(export->exp_connection); - } + ptlrpc_connection_put(export->exp_connection); + } export->exp_connection = ptlrpc_connection_get(req->rq_peer, req->rq_self, @@ -1425,9 +1440,10 @@ int target_handle_connect(struct ptlrpc_request *req) * also needs to be increased to match other recovery checking * condition. */ if (new_mds_mds_conn) - target->obd_max_recoverable_clients++; + atomic_inc(&target->obd_max_recoverable_clients); + if (atomic_inc_return(&target->obd_connected_clients) == - target->obd_max_recoverable_clients) + atomic_read(&target->obd_max_recoverable_clients)) wake_up(&target->obd_next_transno_waitq); } @@ -1443,12 +1459,11 @@ int target_handle_connect(struct ptlrpc_request *req) class_export_put(export); } - if (target_referenced == true && target != NULL) { + if (target != NULL) { spin_lock(&target->obd_dev_lock); target->obd_conn_inprogress--; spin_unlock(&target->obd_dev_lock); - - class_decref(target, __func__, current); + class_decref(target, "find", current); } req->rq_status = rc; RETURN(rc); @@ -1460,11 +1475,23 @@ int target_handle_disconnect(struct ptlrpc_request *req) ENTRY; rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + if (rc) + RETURN(rc); + + /* In case of target disconnect, updating sec ctx immediately is + * required in order to record latest sequence number used. + * Sequence is normally updated on export destroy, but this event + * can occur too late, ie after a new target connect request has + * been processed. + * Maintaining correct sequence when client connection becomes idle + * ensures that GSS does not erroneously consider requests as replays. + */ + rc = sptlrpc_export_update_ctx(req->rq_export); + if (rc) + RETURN(rc); /* Keep the rq_export around so we can send the reply. */ - req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); RETURN(0); } @@ -1577,14 +1604,14 @@ static void target_finish_recovery(struct lu_target *lut) /* Only log a recovery message when recovery has occurred. */ if (obd->obd_recovery_start) { - time64_t now = ktime_get_real_seconds(); + time64_t now = ktime_get_seconds(); time64_t elapsed_time; elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, 1); LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients " "%d recovered and %d %s evicted.\n", obd->obd_name, (s64)elapsed_time / 60, (s64)elapsed_time % 60, - obd->obd_max_recoverable_clients, + atomic_read(&obd->obd_max_recoverable_clients), atomic_read(&obd->obd_connected_clients), obd->obd_stale_clients, obd->obd_stale_clients == 1 ? "was" : "were"); @@ -1607,15 +1634,16 @@ static void target_finish_recovery(struct lu_target *lut) } spin_unlock(&obd->obd_recovery_task_lock); - obd->obd_recovery_end = ktime_get_real_seconds(); + obd->obd_recovery_end = ktime_get_seconds(); /* When recovery finished, cleanup orphans on MDS and OST. 
*/ - if (OBT(obd) && OBP(obd, postrecov)) { - int rc = OBP(obd, postrecov)(obd); - if (rc < 0) - LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", - obd->obd_name, rc); - } + if (obd->obd_type && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } EXIT; } @@ -1712,12 +1740,14 @@ EXPORT_SYMBOL(target_cleanup_recovery); /* obd_recovery_task_lock should be held */ void target_cancel_recovery_timer(struct obd_device *obd) { - CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); - del_timer(&obd->obd_recovery_timer); + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + hrtimer_cancel(&obd->obd_recovery_timer); } static void target_start_recovery_timer(struct obd_device *obd) { + ktime_t delay; + if (obd->obd_recovery_start != 0) return; @@ -1734,33 +1764,36 @@ static void target_start_recovery_timer(struct obd_device *obd) return; } - mod_timer(&obd->obd_recovery_timer, - cfs_time_shift(obd->obd_recovery_timeout)); - obd->obd_recovery_start = ktime_get_real_seconds(); + obd->obd_recovery_start = ktime_get_seconds(); + delay = ktime_set(obd->obd_recovery_start + + obd->obd_recovery_timeout, 0); + hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS); spin_unlock(&obd->obd_dev_lock); - LCONSOLE_WARN("%s: Will be in recovery for at least %llu:%02llu, or until %d client%s reconnect%s\n", + LCONSOLE_WARN("%s: Will be in recovery for at least %lu:%02lu, or until %d client%s reconnect%s\n", obd->obd_name, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, - obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) ? "" : "s", - (obd->obd_max_recoverable_clients == 1) ? "s": ""); + atomic_read(&obd->obd_max_recoverable_clients), + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "" : "s", + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "s" : ""); } /** * extend recovery window. * - * if @extend is true, extend recovery window to have @drt remaining at least; - * otherwise, make sure the recovery timeout value is not less than @drt. + * if @extend is true, extend recovery window to have @dr_timeout remaining + * at least; otherwise, make sure the recovery timeout value is not less + * than @dr_timeout. */ -static void extend_recovery_timer(struct obd_device *obd, int drt, +static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, bool extend) { - time64_t now; - time64_t end; - time64_t left; - time64_t to; + ktime_t left_ns; + time_t timeout; + time_t left; spin_lock(&obd->obd_dev_lock); if (!obd->obd_recovering || obd->obd_abort_recovery) { @@ -1769,33 +1802,43 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, } LASSERT(obd->obd_recovery_start != 0); - now = ktime_get_real_seconds(); - to = obd->obd_recovery_timeout; - end = obd->obd_recovery_start + to; - left = end - now; + left_ns = hrtimer_expires_remaining(&obd->obd_recovery_timer); + left = ktime_divns(left_ns, NSEC_PER_SEC); - if (extend && (drt > left)) { - to += drt - left; - } else if (!extend && (drt > to)) { - to = drt; - } - - if (to > obd->obd_recovery_time_hard) { - to = obd->obd_recovery_time_hard; - CWARN("%s: extended recovery timer reaching hard limit: %lld, extend: %d\n", - obd->obd_name, to, extend); + if (extend) { + timeout = obd->obd_recovery_timeout; + /* dr_timeout will happen after the hrtimer has expired. 
+ * Add the excess time to the soft recovery timeout without + * exceeding the hard recovery timeout. + */ + if (dr_timeout > left) { + timeout += dr_timeout - left; + timeout = min_t(time_t, obd->obd_recovery_time_hard, + timeout); + } + } else { + timeout = clamp_t(time_t, dr_timeout, obd->obd_recovery_timeout, + obd->obd_recovery_time_hard); } - if (obd->obd_recovery_timeout < to) { - obd->obd_recovery_timeout = to; - end = obd->obd_recovery_start + to; - mod_timer(&obd->obd_recovery_timer, - cfs_time_shift(end - now)); - } + if (timeout == obd->obd_recovery_time_hard) + CWARN("%s: extended recovery timer reached hard limit: %ld, extend: %d\n", + obd->obd_name, timeout, extend); + + if (obd->obd_recovery_timeout < timeout) { + ktime_t end, now; + + obd->obd_recovery_timeout = timeout; + end = ktime_set(obd->obd_recovery_start + timeout, 0); + now = ktime_set(ktime_get_seconds(), 0); + left_ns = ktime_sub(end, now); + hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS); + left = ktime_divns(left_ns, NSEC_PER_SEC); + } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_HA, "%s: recovery timer will expire in %lld seconds\n", - obd->obd_name, (s64)(end - now)); + CDEBUG(D_HA, "%s: recovery timer will expire in %ld seconds\n", + obd->obd_name, left); } /* Reset the timer with each new client connection */ @@ -1808,40 +1851,45 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, * be extended to make sure the client could be reconnected, in the * process, the timeout from the new client should be ignored. */ - static void check_and_start_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req, - int new_client) + struct ptlrpc_request *req, + int new_client) { - int service_time = lustre_msg_get_service_time(req->rq_reqmsg); - struct obd_device_target *obt = &obd->u.obt; + timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; - if (!new_client && service_time) - /* Teach server about old server's estimates, as first guess - * at how long new requests will take. */ + if (!new_client && service_timeout) + /* + * Teach server about old server's estimates, as first guess + * at how long new requests will take. + */ at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, - service_time); + service_timeout); - target_start_recovery_timer(obd); + target_start_recovery_timer(obd); - /* Convert the service time to RPC timeout, - * and reuse service_time to limit stack usage. */ - service_time = at_est2timeout(service_time); + /* + * Convert the service time to RPC timeout, + * and reuse service_timeout to limit stack usage. + */ + service_timeout = at_est2timeout(service_timeout); if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && - service_time < at_extra) - service_time = at_extra; + service_timeout < at_extra) + service_timeout = at_extra; - /* We expect other clients to timeout within service_time, then try + /* + * We expect other clients to timeout within service_timeout, then try * to reconnect, then try the failover server. The max delay between - * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. */ - service_time += 2 * INITIAL_CONNECT_TIMEOUT; + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. 
+ */ + service_timeout += 2 * INITIAL_CONNECT_TIMEOUT; - LASSERT(obt->obt_magic == OBT_MAGIC); - service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); - if (service_time > obd->obd_recovery_timeout && !new_client) - extend_recovery_timer(obd, service_time, false); + LASSERT(obt->obt_magic == OBT_MAGIC); + service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_timeout > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_timeout, false); } /** Health checking routines */ @@ -1913,9 +1961,10 @@ static int check_for_next_transno(struct lu_target *lut) queue_len = obd->obd_requests_queued_for_recovery; next_transno = obd->obd_next_recovery_transno; - CDEBUG(D_HA, "max: %d, connected: %d, completed: %d, queue_len: %d, " - "req_transno: %llu, next_transno: %llu\n", - obd->obd_max_recoverable_clients, connected, completed, + CDEBUG(D_HA, + "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", + atomic_read(&obd->obd_max_recoverable_clients), + connected, completed, queue_len, req_transno, next_transno); if (obd->obd_abort_recovery) { @@ -1987,6 +2036,24 @@ static int check_for_next_lock(struct lu_target *lut) return wake_up; } +static int check_update_llog(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + return 1; + } + + if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { + CDEBUG(D_HA, "waking for completion of reading update log\n"); + return 1; + } + + return 0; +} + /** * wait for recovery events, * check its status with help of check_routine @@ -2010,7 +2077,7 @@ static int target_recovery_overseer(struct lu_target *lut, last = now; } } - if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= + if (obd->obd_recovery_start != 0 && ktime_get_seconds() >= (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { __u64 next_update_transno = 0; @@ -2026,16 +2093,16 @@ static int target_recovery_overseer(struct lu_target *lut, * updatelog retrieve threads did not get any records * yet, let's wait those threads stopped */ if (next_update_transno == 0) { - struct l_wait_info lwi = { 0 }; + spin_unlock(&obd->obd_recovery_task_lock); - l_wait_event(tdtd->tdtd_recovery_threads_waitq, - atomic_read( - &tdtd->tdtd_recovery_threads_count) == 0, - &lwi); + while (wait_event_timeout( + tdtd->tdtd_recovery_threads_waitq, + check_update_llog(lut), + cfs_time_seconds(60)) == 0); + spin_lock(&obd->obd_recovery_task_lock); next_update_transno = - distribute_txn_get_next_transno( - lut->lut_tdtd); + distribute_txn_get_next_transno(tdtd); } } @@ -2088,6 +2155,7 @@ static int target_recovery_overseer(struct lu_target *lut, return 1; } else if (obd->obd_recovery_expired) { obd->obd_recovery_expired = 0; + /** If some clients died being recovered, evict them */ LCONSOLE_WARN("%s: recovery is timed out, " "evict stale exports\n", obd->obd_name); @@ -2178,34 +2246,41 @@ static void handle_recovery_req(struct ptlrpc_thread *thread, (void)handler(req); lu_context_exit(&thread->t_env->le_ctx); - /* don't reset timer for final stage */ - if (!exp_finished(req->rq_export)) { - int to = obd_timeout; + req->rq_svc_thread->t_env->le_ses = NULL; + + /* don't reset timer for final stage */ + if (!exp_finished(req->rq_export)) { + timeout_t timeout = obd_timeout; - /** - * Add request timeout to the recovery time so next request from - * 
this client may come in recovery time - */ - if (!AT_OFF) { + /** + * Add request @timeout to the recovery time so next request from + * this client may come in recovery time + */ + if (!AT_OFF) { struct ptlrpc_service_part *svcpt; + timeout_t est_timeout; svcpt = req->rq_rqbd->rqbd_svcpt; /* If the server sent early reply for this request, * the client will recalculate the timeout according to * current server estimate service time, so we will * use the maxium timeout here for waiting the client - * sending the next req */ - to = max((int)at_est2timeout( - at_get(&svcpt->scp_at_estimate)), - (int)lustre_msg_get_timeout(req->rq_reqmsg)); - /* Add 2 net_latency, one for balance rq_deadline + * sending the next req + */ + est_timeout = at_get(&svcpt->scp_at_estimate); + timeout = max_t(timeout_t, at_est2timeout(est_timeout), + lustre_msg_get_timeout(req->rq_reqmsg)); + /* + * Add 2 net_latency, one for balance rq_deadline * (see ptl_send_rpc), one for resend the req to server, * Note: client will pack net_latency in replay req - * (see ptlrpc_replay_req) */ - to += 2 * lustre_msg_get_service_time(req->rq_reqmsg); - } - extend_recovery_timer(class_exp2obd(req->rq_export), to, true); - } + * (see ptlrpc_replay_req) + */ + timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), timeout, + true); + } EXIT; } @@ -2215,15 +2290,17 @@ static int check_for_recovery_ready(struct lu_target *lut) struct obd_device *obd = lut->lut_obd; unsigned int clnts = atomic_read(&obd->obd_connected_clients); - CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d" - " abort %d expired %d\n", clnts, obd->obd_stale_clients, - obd->obd_max_recoverable_clients, obd->obd_abort_recovery, - obd->obd_recovery_expired); + CDEBUG(D_HA, + "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n", + clnts, obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_abort_recovery, obd->obd_recovery_expired); if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { - LASSERT(clnts <= obd->obd_max_recoverable_clients); + LASSERT(clnts <= + atomic_read(&obd->obd_max_recoverable_clients)); if (clnts + obd->obd_stale_clients < - obd->obd_max_recoverable_clients) + atomic_read(&obd->obd_max_recoverable_clients)) return 0; } @@ -2234,7 +2311,8 @@ static int check_for_recovery_ready(struct lu_target *lut) * timer expired, and some clients got evicted */ extend_recovery_timer(obd, obd->obd_recovery_timeout, true); - CDEBUG(D_HA, "%s update recovery is not ready, extend recovery %llu\n", + CDEBUG(D_HA, + "%s update recovery is not ready, extend recovery %lu\n", obd->obd_name, obd->obd_recovery_timeout); return 0; } @@ -2327,6 +2405,8 @@ static void drop_duplicate_replay_req(struct lu_env *env, obd->obd_replayed_requests++; } +#define WATCHDOG_TIMEOUT (obd_timeout * 10) + static void replay_request_or_update(struct lu_env *env, struct lu_target *lut, struct target_recovery_data *trd, @@ -2397,8 +2477,13 @@ static void replay_request_or_update(struct lu_env *env, lustre_msg_get_transno(req->rq_reqmsg), libcfs_nid2str(req->rq_peer.nid)); + thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, + NULL, NULL); handle_recovery_req(thread, req, trd->trd_recovery_handler); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + /** * bz18031: increase next_recovery_transno before * target_request_copy_put() will drop exp_rpc reference @@ -2418,7 +2503,11 @@ static void replay_request_or_update(struct lu_env 
*env, LASSERT(tdtd != NULL); dtrq = distribute_txn_get_next_req(tdtd); lu_context_enter(&thread->t_env->le_ctx); + thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, + NULL, NULL); rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; lu_context_exit(&thread->t_env->le_ctx); extend_recovery_timer(obd, obd_timeout, true); @@ -2473,18 +2562,16 @@ static int target_recovery_thread(void *arg) if (thread == NULL) RETURN(-ENOMEM); - OBD_ALLOC_PTR(env); - if (env == NULL) { - OBD_FREE_PTR(thread); - RETURN(-ENOMEM); - } + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_thread, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); - if (rc) { - OBD_FREE_PTR(thread); - OBD_FREE_PTR(env); - RETURN(rc); - } + if (rc) + GOTO(out_env_remove, rc); thread->t_env = env; thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ @@ -2526,6 +2613,11 @@ static int target_recovery_thread(void *arg) LASSERT(trd->trd_processing_task == current_pid()); DEBUG_REQ(D_HA, req, "processing lock from %s: ", libcfs_nid2str(req->rq_peer.nid)); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) { + req->rq_status = -ENODEV; + target_request_copy_put(req); + continue; + } handle_recovery_req(thread, req, trd->trd_recovery_handler); target_request_copy_put(req); @@ -2576,8 +2668,12 @@ static int target_recovery_thread(void *arg) complete(&trd->trd_finishing); tgt_io_thread_done(thread); - OBD_FREE_PTR(thread); +out_env_remove: + lu_env_remove(env); +out_env: OBD_FREE_PTR(env); +out_thread: + OBD_FREE_PTR(thread); RETURN(rc); } @@ -2634,17 +2730,20 @@ void target_recovery_fini(struct obd_device *obd) } EXPORT_SYMBOL(target_recovery_fini); -static void target_recovery_expired(cfs_timer_cb_arg_t data) +static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) { - struct obd_device *obd = cfs_from_timer(obd, data, obd_recovery_timer); - CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery" - " after %llus (%d clients connected)\n", + struct obd_device *obd = container_of(timer, struct obd_device, + obd_recovery_timer); + + CDEBUG(D_HA, + "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), - (s64)(ktime_get_real_seconds() - obd->obd_recovery_start), + ktime_get_real_seconds() - obd->obd_recovery_start, atomic_read(&obd->obd_connected_clients)); obd->obd_recovery_expired = 1; wake_up(&obd->obd_next_transno_waitq); + return HRTIMER_NORESTART; } void target_recovery_init(struct lu_target *lut, svc_handler_t handler) @@ -2654,7 +2753,7 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) if (lut->lut_bottom->dd_rdonly) return; - if (obd->obd_max_recoverable_clients == 0) { + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { /** Update server last boot epoch */ tgt_boot_epoch_update(lut); return; @@ -2662,14 +2761,16 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " "last_transno %llu\n", obd->obd_name, - obd->obd_max_recoverable_clients, obd->obd_last_committed); - LASSERT(obd->obd_stopping == 0); - obd->obd_next_recovery_transno = obd->obd_last_committed + 1; - obd->obd_recovery_start = 0; - obd->obd_recovery_end = 0; - - cfs_timer_setup(&obd->obd_recovery_timer, target_recovery_expired, - (unsigned 
long)obd, 0); + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + obd->obd_recovery_timer.function = &target_recovery_expired; target_start_recovery_thread(lut, handler); } EXPORT_SYMBOL(target_recovery_init); @@ -2725,6 +2826,17 @@ int target_queue_recovery_request(struct ptlrpc_request *req, target_process_req_flags(obd, req); if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 1) { + cfs_race_state = 1; + cfs_fail_val = 0; + wake_up(&cfs_race_waitq); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + } + /* client declares he's ready to complete recovery * so, we put the request on th final queue */ target_request_copy_get(req); @@ -2875,12 +2987,6 @@ int target_queue_recovery_request(struct ptlrpc_request *req, RETURN(0); } -int target_handle_ping(struct ptlrpc_request *req) -{ - obd_ping(req->rq_svc_thread->t_env, req->rq_export); - return req_capsule_server_pack(&req->rq_pill); -} - void target_committed_to_req(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -3172,10 +3278,10 @@ static inline const char *bulk2type(struct ptlrpc_request *req) int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, struct l_wait_info *lwi) { - struct ptlrpc_request *req = desc->bd_req; - time_t start = cfs_time_current_sec(); - time_t deadline; - int rc = 0; + struct ptlrpc_request *req = desc->bd_req; + time64_t start = ktime_get_seconds(); + time64_t deadline; + int rc = 0; ENTRY; @@ -3222,12 +3328,13 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, deadline = req->rq_deadline; do { - long timeoutl = deadline - cfs_time_current_sec(); - cfs_duration_t timeout = timeoutl <= 0 ? - CFS_TICK : cfs_time_seconds(timeoutl); - time_t rq_deadline; + time64_t timeoutl = deadline - ktime_get_seconds(); + long timeout_jiffies = timeoutl <= 0 ? + 1 : cfs_time_seconds(timeoutl); + time64_t rq_deadline; - *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), + *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + cfs_time_seconds(1), target_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc) || @@ -3237,17 +3344,17 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed rq_deadline. 
*/ - rq_deadline = ACCESS_ONCE(req->rq_deadline); + rq_deadline = READ_ONCE(req->rq_deadline); deadline = start + bulk_timeout; if (deadline > rq_deadline) deadline = rq_deadline; - } while ((rc == -ETIMEDOUT) && - (deadline > cfs_time_current_sec())); + } while (rc == -ETIMEDOUT && + deadline > ktime_get_seconds()); if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds", + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", bulk2type(req), deadline - start, - cfs_time_current_sec() - deadline); + ktime_get_real_seconds() - deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c index df28b2d7b5131..42eccaf9cf861 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,9 @@ #include "ldlm_internal.h" +struct kmem_cache *ldlm_glimpse_work_kmem; +EXPORT_SYMBOL(ldlm_glimpse_work_kmem); + /* lock types */ char *ldlm_lockname[] = { [0] = "--", @@ -122,8 +125,6 @@ const char *ldlm_it2str(enum ldlm_intent_flags it) return "getattr"; case IT_LOOKUP: return "lookup"; - case IT_UNLINK: - return "unlink"; case IT_GETXATTR: return "getxattr"; case IT_LAYOUT: @@ -150,6 +151,19 @@ ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) return ldlm_processing_policy_table[res->lr_type]; } EXPORT_SYMBOL(ldlm_get_processing_policy); + +static ldlm_reprocessing_policy ldlm_reprocessing_policy_table[] = { + [LDLM_PLAIN] = ldlm_reprocess_queue, + [LDLM_EXTENT] = ldlm_reprocess_queue, + [LDLM_FLOCK] = ldlm_reprocess_queue, + [LDLM_IBITS] = ldlm_reprocess_inodebits_queue, +}; + +ldlm_reprocessing_policy ldlm_get_reprocessing_policy(struct ldlm_resource *res) +{ + return ldlm_reprocessing_policy_table[res->lr_type]; +} + #endif /* HAVE_SERVER_SUPPORT */ void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) @@ -204,8 +218,6 @@ void ldlm_lock_put(struct ldlm_lock *lock) lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, LDLM_NSS_LOCKS); lu_ref_del(&res->lr_reference, "lock", lock); - ldlm_resource_putref(res); - lock->l_resource = NULL; if (lock->l_export) { class_export_lock_put(lock->l_export, lock); lock->l_export = NULL; @@ -214,7 +226,15 @@ void ldlm_lock_put(struct ldlm_lock *lock) if (lock->l_lvb_data != NULL) OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); - ldlm_interval_free(ldlm_interval_detach(lock)); + if (res->lr_type == LDLM_EXTENT) { + ldlm_interval_free(ldlm_interval_detach(lock)); + } else if (res->lr_type == LDLM_IBITS) { + if (lock->l_ibits_node != NULL) + OBD_SLAB_FREE_PTR(lock->l_ibits_node, + ldlm_inodebits_slab); + } + ldlm_resource_putref(res); + lock->l_resource = NULL; lu_ref_fini(&lock->l_reference); OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); } @@ -477,7 +497,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, LDLM_NSS_LOCKS); - INIT_LIST_HEAD(&lock->l_handle.h_link); + INIT_LIST_HEAD_RCU(&lock->l_handle.h_link); class_handle_hash(&lock->l_handle, &lock_handle_ops); 
lu_ref_init(&lock->l_reference); @@ -664,12 +684,19 @@ static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, * discard dirty data, rather than writing back. */ if (ldlm_is_ast_discard_data(new)) ldlm_set_discard_data(lock); - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, work_list); - LDLM_LOCK_GET(lock); - LASSERT(lock->l_blocking_lock == NULL); - lock->l_blocking_lock = LDLM_LOCK_GET(new); - } + + /* Lock can be converted from a blocking state back to granted + * after lock convert or COS downgrade but still be in an + * older bl_list because it is controlled only by + * ldlm_work_bl_ast_lock(), let it be processed there. + */ + if (list_empty(&lock->l_bl_ast)) { + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + } + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } } /** @@ -867,7 +894,8 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) } else if (ns_is_client(ns) && !lock->l_readers && !lock->l_writers && !ldlm_is_no_lru(lock) && - !ldlm_is_bl_ast(lock)) { + !ldlm_is_bl_ast(lock) && + !ldlm_is_converting(lock)) { LDLM_DEBUG(lock, "add lock into lru list"); @@ -1071,16 +1099,14 @@ static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, * Add a lock to granted list on a resource maintaining skiplist * correctness. */ -static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) { - struct sl_insert_point prev; - ENTRY; + struct sl_insert_point prev; - LASSERT(lock->l_req_mode == lock->l_granted_mode); + LASSERT(ldlm_is_granted(lock)); - search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); - ldlm_granted_list_add_lock(lock, &prev); - EXIT; + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); } /** @@ -1090,7 +1116,6 @@ static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) * NOTE: called by * - ldlm_lock_enqueue * - ldlm_reprocess_queue - * - ldlm_lock_convert * * must be called with lr_lock held */ @@ -1131,18 +1156,6 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) EXIT; } -/** - * Describe the overlap between two locks. itree_overlap_cb data. - */ -struct lock_match_data { - struct ldlm_lock *lmd_old; - struct ldlm_lock *lmd_lock; - enum ldlm_mode *lmd_mode; - union ldlm_policy_data *lmd_policy; - __u64 lmd_flags; - int lmd_unref; -}; - /** * Check if the given @lock meets the criteria for a match. * A reference on the lock is taken if matched. @@ -1150,10 +1163,10 @@ struct lock_match_data { * \param lock test-against this lock * \param data parameters */ -static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) +static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) { union ldlm_policy_data *lpol = &lock->l_policy_data; - enum ldlm_mode match; + enum ldlm_mode match = LCK_MINMODE; if (lock == data->lmd_old) return INTERVAL_ITER_STOP; @@ -1178,6 +1191,17 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) if (!(lock->l_req_mode & *data->lmd_mode)) return INTERVAL_ITER_CONT; + + /* When we search for ast_data, we are not doing a traditional match, + * so we don't worry about IBITS or extent matching. 
+ */ + if (data->lmd_has_ast_data) { + if (!lock->l_ast_data) + return INTERVAL_ITER_CONT; + + goto matched; + } + match = lock->l_req_mode; switch (lock->l_resource->lr_type) { @@ -1211,6 +1235,11 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) return INTERVAL_ITER_CONT; + /* Filter locks by skipping flags */ + if (data->lmd_skip_flags & lock->l_flags) + return INTERVAL_ITER_CONT; + +matched: if (data->lmd_flags & LDLM_FL_TEST_LOCK) { LDLM_LOCK_GET(lock); ldlm_lock_touch_in_lru(lock); @@ -1227,7 +1256,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) static unsigned int itree_overlap_cb(struct interval_node *in, void *args) { struct ldlm_interval *node = to_ldlm_interval(in); - struct lock_match_data *data = args; + struct ldlm_match_data *data = args; struct ldlm_lock *lock; int rc; @@ -1247,8 +1276,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args) * * \retval a referenced lock or NULL. */ -static struct ldlm_lock *search_itree(struct ldlm_resource *res, - struct lock_match_data *data) +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data) { struct interval_node_extent ext = { .start = data->lmd_policy->l_extent.start, @@ -1256,6 +1285,8 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res, }; int idx; + data->lmd_lock = NULL; + for (idx = 0; idx < LCK_MODE_NUM; idx++) { struct ldlm_interval_tree *tree = &res->lr_itree[idx]; @@ -1267,9 +1298,13 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res, interval_search(tree->lit_root, &ext, itree_overlap_cb, data); + if (data->lmd_lock) + return data->lmd_lock; } - return data->lmd_lock; + + return NULL; } +EXPORT_SYMBOL(search_itree); /** @@ -1281,16 +1316,19 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res, * \retval a referenced lock or NULL. */ static struct ldlm_lock *search_queue(struct list_head *queue, - struct lock_match_data *data) + struct ldlm_match_data *data) { struct ldlm_lock *lock; int rc; + data->lmd_lock = NULL; + list_for_each_entry(lock, queue, l_res_link) { rc = lock_matches(lock, data); if (rc == INTERVAL_ITER_STOP) return data->lmd_lock; } + return NULL; } @@ -1366,24 +1404,28 @@ EXPORT_SYMBOL(ldlm_lock_allow_match); * keep caller code unchanged), the context failure will be discovered by * caller sometime later. 
*/ -enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh, int unref) -{ - struct lock_match_data data = { - .lmd_old = NULL, - .lmd_lock = NULL, - .lmd_mode = &mode, - .lmd_policy = policy, - .lmd_flags = flags, - .lmd_unref = unref, +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, int unref) +{ + struct ldlm_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_skip_flags = skip_flags, + .lmd_unref = unref, + .lmd_has_ast_data = false, }; struct ldlm_resource *res; struct ldlm_lock *lock; - int rc = 0; + int matched; + ENTRY; if (ns == NULL) { @@ -1404,101 +1446,78 @@ enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, LDLM_RESOURCE_ADDREF(res); lock_res(res); - if (res->lr_type == LDLM_EXTENT) lock = search_itree(res, &data); else lock = search_queue(&res->lr_granted, &data); - if (lock != NULL) - GOTO(out, rc = 1); - if (flags & LDLM_FL_BLOCK_GRANTED) - GOTO(out, rc = 0); - lock = search_queue(&res->lr_converting, &data); - if (lock != NULL) - GOTO(out, rc = 1); - lock = search_queue(&res->lr_waiting, &data); - if (lock != NULL) - GOTO(out, rc = 1); - - EXIT; - out: - unlock_res(res); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); + if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED)) + lock = search_queue(&res->lr_waiting, &data); + matched = lock ? mode : 0; + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); - if (lock) { - ldlm_lock2handle(lock, lockh); - if ((flags & LDLM_FL_LVB_READY) && + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && (!ldlm_is_lvb_ready(lock))) { __u64 wait_flags = LDLM_FL_LVB_READY | LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; - struct l_wait_info lwi; - if (lock->l_completion_ast) { - int err = lock->l_completion_ast(lock, - LDLM_FL_WAIT_NOREPROC, - NULL); - if (err) { - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else - ldlm_lock_decref_internal(lock, - mode); - rc = 0; - goto out2; - } - } + struct l_wait_info lwi; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) + GOTO(out_fail_match, matched = 0); + } - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), - NULL, LWI_ON_SIGNAL_NOOP, NULL); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ - l_wait_event(lock->l_waitq, - lock->l_flags & wait_flags, + l_wait_event(lock->l_waitq, lock->l_flags & wait_flags, &lwi); - if (!ldlm_is_lvb_ready(lock)) { - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else - ldlm_lock_decref_internal(lock, mode); - rc = 0; - } - } - } - out2: - if (rc) { - LDLM_DEBUG(lock, "matched (%llu %llu)", - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] : policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
- res_id->name[3] : policy->l_extent.end); - - /* check user's security context */ - if (lock->l_conn_export && - sptlrpc_import_check_ctx( - class_exp2cliimp(lock->l_conn_export))) { - if (!(flags & LDLM_FL_TEST_LOCK)) - ldlm_lock_decref_internal(lock, mode); - rc = 0; - } + if (!ldlm_is_lvb_ready(lock)) + GOTO(out_fail_match, matched = 0); + } - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) + GOTO(out_fail_match, matched = 0); - } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ - LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + +out_fail_match: + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else if (!matched) + ldlm_lock_decref_internal(lock, mode); + } + + /* less verbose for test-only */ + if (!matched && !(flags & LDLM_FL_TEST_LOCK)) { + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " "%llu/%llu (%llu %llu)", ns, - type, mode, res_id->name[0], res_id->name[1], - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] :policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[3] : policy->l_extent.end); - } + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } if (data.lmd_old != NULL) LDLM_LOCK_PUT(data.lmd_old); - return rc ? mode : 0; + return matched; } -EXPORT_SYMBOL(ldlm_lock_match); +EXPORT_SYMBOL(ldlm_lock_match_with_skip); enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, __u64 *bits) @@ -1669,11 +1688,18 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_glimpse_ast = cbs->lcs_glimpse; } - lock->l_tree_node = NULL; - /* if this is the extent lock, allocate the interval tree node */ - if (type == LDLM_EXTENT) - if (ldlm_interval_alloc(lock) == NULL) - GOTO(out, rc = -ENOMEM); + switch (type) { + case LDLM_EXTENT: + rc = ldlm_extent_alloc_lock(lock); + break; + case LDLM_IBITS: + rc = ldlm_inodebits_alloc_lock(lock); + break; + default: + rc = 0; + } + if (rc) + GOTO(out, rc); if (lvb_len) { lock->l_lvb_len = lvb_len; @@ -1694,6 +1720,30 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, RETURN(ERR_PTR(rc)); } +#ifdef HAVE_SERVER_SUPPORT +static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, + __u64 *flags) +{ + struct ldlm_resource *res = lock->l_resource; + enum ldlm_error rc = ELDLM_OK; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + ldlm_processing_policy policy; + + ENTRY; + + policy = ldlm_get_processing_policy(res); + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, &rpc_list); + if (rc == ELDLM_OK && lock->l_granted_mode != lock->l_req_mode && + res->lr_type != LDLM_FLOCK) + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list); + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif + /** * Enqueue (request) a lock. * @@ -1704,16 +1754,14 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, * set, skip all the enqueueing and delegate lock processing to intent policy * function. 
*/ -enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *ns, struct ldlm_lock **lockp, void *cookie, __u64 *flags) { struct ldlm_lock *lock = *lockp; struct ldlm_resource *res = lock->l_resource; int local = ns_is_client(ldlm_res_to_ns(res)); -#ifdef HAVE_SERVER_SUPPORT - ldlm_processing_policy policy; -#endif enum ldlm_error rc = ELDLM_OK; struct ldlm_interval *node = NULL; ENTRY; @@ -1721,8 +1769,8 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { - rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, - NULL); + rc = ns->ns_policy(env, ns, lockp, cookie, lock->l_req_mode, + *flags, NULL); if (rc == ELDLM_LOCK_REPLACED) { /* The lock that was returned has already been granted, * and placed into lockp. If it's not the same as the @@ -1735,7 +1783,7 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, *flags |= LDLM_FL_LOCK_CHANGED; RETURN(0); } else if (rc != ELDLM_OK && - lock->l_req_mode == lock->l_granted_mode) { + ldlm_is_granted(lock)) { LASSERT(*flags & LDLM_FL_RESENT); /* It may happen that ns_policy returns an error in * resend case, object may be unlinked or just some @@ -1758,7 +1806,7 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, * Take NO_TIMEOUT from the lock as it is inherited through * LDLM_FL_INHERIT_MASK */ *flags |= LDLM_FL_LOCK_CHANGED; - if (lock->l_req_mode != lock->l_granted_mode) + if (!ldlm_is_granted(lock)) *flags |= LDLM_FL_BLOCK_GRANTED; *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; RETURN(ELDLM_OK); @@ -1771,8 +1819,8 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); - lock_res_and_lock(lock); - if (local && lock->l_req_mode == lock->l_granted_mode) { + lock_res_and_lock(lock); + if (local && ldlm_is_granted(lock)) { /* The server returned a blocked lock, but it was granted * before we got a chance to actually enqueue it. We don't * need to do anything else. */ @@ -1813,33 +1861,27 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, * more or less trusting the clients not to lie. * * FIXME (bug 268): Detect obvious lies by checking compatibility in - * granted/converting queues. */ + * granted queue. 
*/ if (local) { - if (*flags & LDLM_FL_BLOCK_CONV) - ldlm_resource_add_lock(res, &res->lr_converting, lock); - else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) - ldlm_resource_add_lock(res, &res->lr_waiting, lock); - else - ldlm_grant_lock(lock, NULL); + if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); GOTO(out, rc = ELDLM_OK); #ifdef HAVE_SERVER_SUPPORT - } else if (*flags & LDLM_FL_REPLAY) { - if (*flags & LDLM_FL_BLOCK_CONV) { - ldlm_resource_add_lock(res, &res->lr_converting, lock); - GOTO(out, rc = ELDLM_OK); - } else if (*flags & LDLM_FL_BLOCK_WAIT) { - ldlm_resource_add_lock(res, &res->lr_waiting, lock); + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, &res->lr_waiting, lock); GOTO(out, rc = ELDLM_OK); - } else if (*flags & LDLM_FL_BLOCK_GRANTED) { - ldlm_grant_lock(lock, NULL); + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock, NULL); GOTO(out, rc = ELDLM_OK); - } - /* If no flags, fall through to normal enqueue path. */ - } + } + /* If no flags, fall through to normal enqueue path. */ + } - policy = ldlm_processing_policy_table[res->lr_type]; - policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, NULL); - GOTO(out, rc); + rc = ldlm_lock_enqueue_helper(lock, flags); + GOTO(out, rc); #else } else { CERROR("This is client-side-only module, cannot handle " @@ -1864,31 +1906,42 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, */ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, struct list_head *work_list, - enum ldlm_process_intention intention) + enum ldlm_process_intention intention, + struct ldlm_lock *hint) { struct list_head *tmp, *pos; ldlm_processing_policy policy; __u64 flags; int rc = LDLM_ITER_CONTINUE; enum ldlm_error err; + struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); + ENTRY; check_res_locked(res); - policy = ldlm_processing_policy_table[res->lr_type]; + policy = ldlm_get_processing_policy(res); LASSERT(policy); LASSERT(intention == LDLM_PROCESS_RESCAN || intention == LDLM_PROCESS_RECOVERY); +restart: list_for_each_safe(tmp, pos, queue) { struct ldlm_lock *pending; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); pending = list_entry(tmp, struct ldlm_lock, l_res_link); CDEBUG(D_INFO, "Reprocessing lock %p\n", pending); flags = 0; - rc = policy(pending, &flags, intention, &err, work_list); + rc = policy(pending, &flags, intention, &err, &rpc_list); + if (pending->l_granted_mode == pending->l_req_mode || + res->lr_type == LDLM_FLOCK) { + list_splice(&rpc_list, work_list); + } else { + list_splice(&rpc_list, &bl_ast_list); + } /* * When this is called from recovery done, we always want * to scan the whole list no matter what 'rc' is returned. @@ -1898,6 +1951,20 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, break; } + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if (!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE); } @@ -1908,7 +1975,6 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, * \param[in] lock The lock to be enqueued. * \param[out] flags Lock flags for the lock to be enqueued. 
* \param[in] rpc_list Conflicting locks list. - * \param[in] grant_flags extra flags when granting a lock. * * \retval -ERESTART: Some lock was instantly canceled while sending * blocking ASTs, caller needs to re-check conflicting @@ -1917,7 +1983,7 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, * \reval 0: Lock is successfully added in waiting list. */ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, - struct list_head *rpc_list, __u64 grant_flags) + struct list_head *rpc_list) { struct ldlm_resource *res = lock->l_resource; int rc; @@ -1942,6 +2008,9 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, !ns_is_client(ldlm_res_to_ns(res))) class_fail_export(lock->l_export); + if (rc == -ERESTART) + ldlm_reprocess_all(res, NULL); + lock_res(res); if (rc == -ERESTART) { /* 15715: The lock was granted and destroyed after @@ -1953,7 +2022,7 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, RETURN(-EAGAIN); /* lock was granted while resource was unlocked. */ - if (lock->l_granted_mode == lock->l_req_mode) { + if (ldlm_is_granted(lock)) { /* bug 11300: if the lock has been granted, * break earlier because otherwise, we will go * to restart and ldlm_resource_unlink will be @@ -1961,12 +2030,10 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, * freed. Then we will fail at * ldlm_extent_add_lock() */ *flags &= ~LDLM_FL_BLOCKED_MASK; - RETURN(0); } - RETURN(rc); } - *flags |= (LDLM_FL_BLOCK_GRANTED | grant_flags); + *flags |= LDLM_FL_BLOCK_GRANTED; RETURN(0); } @@ -1979,27 +2046,21 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, */ void ldlm_discard_bl_list(struct list_head *bl_list) { - struct list_head *tmp, *pos; - ENTRY; + struct ldlm_lock *lock, *tmp; - list_for_each_safe(pos, tmp, bl_list) { - struct ldlm_lock *lock = - list_entry(pos, struct ldlm_lock, l_bl_ast); + ENTRY; + list_for_each_entry_safe(lock, tmp, bl_list, l_bl_ast) { + LASSERT(!list_empty(&lock->l_bl_ast)); list_del_init(&lock->l_bl_ast); - LASSERT(ldlm_is_ast_sent(lock)); ldlm_clear_ast_sent(lock); LASSERT(lock->l_bl_ast_run == 0); - LASSERT(lock->l_blocking_lock); - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; + ldlm_clear_blocking_lock(lock); LDLM_LOCK_RELEASE(lock); } EXIT; } -#endif - /** * Process a call to blocking AST callback for a lock in ast_work list */ @@ -2007,9 +2068,11 @@ static int ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) { struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_lock_desc d; - int rc; - struct ldlm_lock *lock; + struct ldlm_lock *lock; + struct ldlm_lock_desc d; + struct ldlm_bl_desc bld; + int rc; + ENTRY; if (list_empty(arg->list)) @@ -2017,66 +2080,49 @@ ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); - /* nobody should touch l_bl_ast */ + /* nobody should touch l_bl_ast but some locks in the list may become + * granted after lock convert or COS downgrade, these locks should be + * just skipped here and removed from the list. + */ lock_res_and_lock(lock); list_del_init(&lock->l_bl_ast); + /* lock is not blocking lock anymore, but was kept in the list because + * it can managed only here. 
+ */ + if (!ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + LASSERT(lock->l_blocking_lock); + ldlm_lock2desc(lock->l_blocking_lock, &d); + /* copy blocking lock ibits in cancel_bits as well, + * new client may use them for lock convert and it is + * important to use new field to convert locks from + * new servers only + */ + d.l_policy_data.l_inodebits.cancel_bits = + lock->l_blocking_lock->l_policy_data.l_inodebits.bits; + + /* Blocking lock is being destroyed here but some information about it + * may be needed inside l_blocking_ast() function below, + * e.g. in mdt_blocking_ast(). So save needed data in bl_desc. + */ + bld.bl_same_client = lock->l_client_cookie == + lock->l_blocking_lock->l_client_cookie; + bld.bl_cos_incompat = ldlm_is_cos_incompat(lock->l_blocking_lock); + arg->bl_desc = &bld; + LASSERT(ldlm_is_ast_sent(lock)); LASSERT(lock->l_bl_ast_run == 0); - LASSERT(lock->l_blocking_lock); lock->l_bl_ast_run++; + ldlm_clear_blocking_lock(lock); unlock_res_and_lock(lock); - ldlm_lock2desc(lock->l_blocking_lock, &d); - rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; - LDLM_LOCK_RELEASE(lock); - RETURN(rc); -} - -/** - * Process a call to completion AST callback for a lock in ast_work list - */ -static int -ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - int rc = 0; - struct ldlm_lock *lock; - ldlm_completion_callback completion_callback; - ENTRY; - - if (list_empty(arg->list)) - RETURN(-ENOENT); - - lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); - - /* It's possible to receive a completion AST before we've set - * the l_completion_ast pointer: either because the AST arrived - * before the reply, or simply because there's a small race - * window between receiving the reply and finishing the local - * enqueue. (bug 842) - * - * This can't happen with the blocking_ast, however, because we - * will never call the local blocking_ast until we drop our - * reader/writer reference, which we won't do until we get the - * reply and finish enqueueing. 
*/ - - /* nobody should touch l_cp_ast */ - lock_res_and_lock(lock); - list_del_init(&lock->l_cp_ast); - LASSERT(ldlm_is_cp_reqd(lock)); - /* save l_completion_ast since it can be changed by - * mds_intent_policy(), see bug 14225 */ - completion_callback = lock->l_completion_ast; - ldlm_clear_cp_reqd(lock); - unlock_res_and_lock(lock); - - if (completion_callback != NULL) - rc = completion_callback(lock, 0, (void *)arg); LDLM_LOCK_RELEASE(lock); RETURN(rc); @@ -2141,12 +2187,60 @@ int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) rc = 1; LDLM_LOCK_RELEASE(lock); - - if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + if (gl_work->gl_flags & LDLM_GL_WORK_SLAB_ALLOCATED) + OBD_SLAB_FREE_PTR(gl_work, ldlm_glimpse_work_kmem); + else OBD_FREE_PTR(gl_work); RETURN(rc); } +#endif + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + int rc = 0; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. */ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} /** * Process list of locks in need of ASTs being sent. @@ -2155,11 +2249,11 @@ int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) * one. 
*/ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - ldlm_desc_ast_t ast_type) + ldlm_desc_ast_t ast_type) { struct ldlm_cb_set_arg *arg; - set_producer_func work_ast_lock; - int rc; + set_producer_func work_ast_lock; + int rc; if (list_empty(rpc_list)) RETURN(0); @@ -2172,24 +2266,26 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, arg->list = rpc_list; switch (ast_type) { - case LDLM_WORK_BL_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_bl_ast_lock; - break; - case LDLM_WORK_CP_AST: - arg->type = LDLM_CP_CALLBACK; - work_ast_lock = ldlm_work_cp_ast_lock; - break; - case LDLM_WORK_REVOKE_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_revoke_ast_lock; - break; - case LDLM_WORK_GL_AST: - arg->type = LDLM_GL_CALLBACK; - work_ast_lock = ldlm_work_gl_ast_lock; - break; - default: - LBUG(); + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; +#ifdef HAVE_SERVER_SUPPORT + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; +#endif + default: + LBUG(); } /* We create a ptlrpc request set with flow control extension. @@ -2201,7 +2297,7 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, if (arg->set == NULL) GOTO(out, rc = -ENOMEM); - ptlrpc_set_wait(arg->set); + ptlrpc_set_wait(NULL, arg->set); ptlrpc_set_destroy(arg->set); rc = atomic_read(&arg->restart) ? -ERESTART : 0; @@ -2214,26 +2310,29 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, /** * Try to grant all waiting locks on a resource. * - * Calls ldlm_reprocess_queue on converting and waiting queues. + * Calls ldlm_reprocess_queue on waiting queue. * * Typically called after some resource locks are cancelled to see * if anything could be granted as a result of the cancellation. */ static void __ldlm_reprocess_all(struct ldlm_resource *res, - enum ldlm_process_intention intention) + enum ldlm_process_intention intention, + struct ldlm_lock *hint) { struct list_head rpc_list; #ifdef HAVE_SERVER_SUPPORT + ldlm_reprocessing_policy reprocess; struct obd_device *obd; - int rc; - ENTRY; + int rc; + + ENTRY; INIT_LIST_HEAD(&rpc_list); - /* Local lock trees don't get reprocessed. */ - if (ns_is_client(ldlm_res_to_ns(res))) { - EXIT; - return; - } + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } /* Disable reprocess during lock replay stage but allow during * request replay stage. 
@@ -2244,35 +2343,32 @@ static void __ldlm_reprocess_all(struct ldlm_resource *res, RETURN_EXIT; restart: lock_res(res); - rc = ldlm_reprocess_queue(res, &res->lr_converting, &rpc_list, - intention); - if (rc == LDLM_ITER_CONTINUE) - ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, - intention); + reprocess = ldlm_get_reprocessing_policy(res); + reprocess(res, &res->lr_waiting, &rpc_list, intention, hint); unlock_res(res); - rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, - LDLM_WORK_CP_AST); - if (rc == -ERESTART) { + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { LASSERT(list_empty(&rpc_list)); - goto restart; - } + goto restart; + } #else - ENTRY; + ENTRY; INIT_LIST_HEAD(&rpc_list); - if (!ns_is_client(ldlm_res_to_ns(res))) { - CERROR("This is client-side-only module, cannot handle " - "LDLM_NAMESPACE_SERVER resource type lock.\n"); - LBUG(); - } + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } #endif - EXIT; + EXIT; } -void ldlm_reprocess_all(struct ldlm_resource *res) +void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint) { - __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN); + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN, hint); } EXPORT_SYMBOL(ldlm_reprocess_all); @@ -2282,7 +2378,7 @@ static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct ldlm_resource *res = cfs_hash_object(hs, hnode); /* This is only called once after recovery done. LU-8306. */ - __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY); + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY, NULL); return 0; } @@ -2364,6 +2460,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) * talking to me first. 
-phik */ if (lock->l_readers || lock->l_writers) { LDLM_ERROR(lock, "lock still has references"); + unlock_res_and_lock(lock); LBUG(); } @@ -2381,8 +2478,8 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) ldlm_resource_unlink_lock(lock); ldlm_lock_destroy_nolock(lock); - if (lock->l_granted_mode == lock->l_req_mode) - ldlm_pool_del(&ns->ns_pool, lock); + if (ldlm_is_granted(lock)) + ldlm_pool_del(&ns->ns_pool, lock); /* Make sure we will not be called again for same lock what is possible * if not to zero out lock->l_granted_mode */ @@ -2414,6 +2511,7 @@ int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) EXPORT_SYMBOL(ldlm_lock_set_data); struct export_cl_data { + const struct lu_env *ecl_env; struct obd_export *ecl_exp; int ecl_loop; }; @@ -2426,10 +2524,10 @@ static void ldlm_cancel_lock_for_export(struct obd_export *exp, res = ldlm_resource_getref(lock->l_resource); - ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lvbo_update(res, lock, NULL, 1); ldlm_lock_cancel(lock); if (!exp->exp_obd->obd_stopping) - ldlm_reprocess_all(res); + ldlm_reprocess_all(res, lock); ldlm_resource_putref(res); ecl->ecl_loop++; @@ -2466,10 +2564,17 @@ ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, */ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) { + struct lu_env env; struct export_cl_data ecl = { .ecl_exp = exp, .ecl_loop = 0, }; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; while (!list_empty(&exp->exp_bl_list)) { struct ldlm_lock *lock; @@ -2492,6 +2597,8 @@ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) LDLM_LOCK_RELEASE(lock); } + lu_env_fini(&env); + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " "left on hash table %d.\n", exp, ecl.ecl_loop, atomic_read(&exp->exp_lock_hash->hs_count)); @@ -2506,10 +2613,16 @@ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) */ int ldlm_export_cancel_locks(struct obd_export *exp) { - struct export_cl_data ecl = { - .ecl_exp = exp, - .ecl_loop = 0, - }; + struct export_cl_data ecl; + struct lu_env env; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + ecl.ecl_exp = exp; + ecl.ecl_loop = 0; cfs_hash_for_each_empty(exp->exp_lock_hash, ldlm_cancel_locks_for_export_cb, &ecl); @@ -2523,26 +2636,35 @@ int ldlm_export_cancel_locks(struct obd_export *exp) exp->exp_obd->obd_stopping) ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + lu_env_fini(&env); + return ecl.ecl_loop; } /** - * Downgrade an exclusive lock. + * Downgrade an PW/EX lock to COS | CR mode. * - * A fast variant of ldlm_lock_convert for convertion of exclusive locks. The + * A lock mode convertion from PW/EX mode to less conflict mode. The * convertion may fail if lock was canceled before downgrade, but it doesn't * indicate any problem, because such lock has no reader or writer, and will * be released soon. - * Used by Commit on Sharing (COS) code. + * + * Used by Commit on Sharing (COS) code to force object changes commit in case + * of conflict. Converted lock is considered as new lock and all blocking AST + * things are cleared, so any pending or new blocked lock on that lock will + * cause new call to blocking_ast and force resource object commit. + * + * Also used by layout_change to replace EX lock to CR lock. 
* * \param lock A lock to convert * \param new_mode new lock mode */ -void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) { +#ifdef HAVE_SERVER_SUPPORT ENTRY; - LASSERT(new_mode == LCK_COS); + LASSERT(new_mode == LCK_COS || new_mode == LCK_CR); lock_res_and_lock(lock); @@ -2560,146 +2682,22 @@ void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) * ldlm_grant_lock() called below. */ ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + /* Consider downgraded lock as a new lock and clear all states + * related to a previous blocking AST processing. + */ + ldlm_clear_blocking_data(lock); + lock->l_req_mode = new_mode; ldlm_grant_lock(lock, NULL); - unlock_res_and_lock(lock); - ldlm_reprocess_all(lock->l_resource); + ldlm_reprocess_all(lock->l_resource, lock); EXIT; -} -EXPORT_SYMBOL(ldlm_lock_downgrade); - -/** - * Attempt to convert already granted lock to a different mode. - * - * While lock conversion is not currently used, future client-side - * optimizations could take advantage of it to avoid discarding cached - * pages on a file. - */ -struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, - enum ldlm_mode new_mode, __u32 *flags) -{ - struct list_head rpc_list; - struct ldlm_resource *res; - struct ldlm_namespace *ns; - int granted = 0; -#ifdef HAVE_SERVER_SUPPORT - int old_mode; - struct sl_insert_point prev; -#endif - struct ldlm_interval *node; - ENTRY; - - INIT_LIST_HEAD(&rpc_list); - /* Just return if mode is unchanged. */ - if (new_mode == lock->l_granted_mode) { - *flags |= LDLM_FL_BLOCK_GRANTED; - RETURN(lock->l_resource); - } - - /* I can't check the type of lock here because the bitlock of lock - * is not held here, so do the allocation blindly. -jay */ - OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); - if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ - RETURN(NULL); - - LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), - "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); - - lock_res_and_lock(lock); - - res = lock->l_resource; - ns = ldlm_res_to_ns(res); - -#ifdef HAVE_SERVER_SUPPORT - old_mode = lock->l_req_mode; #endif - lock->l_req_mode = new_mode; - if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { -#ifdef HAVE_SERVER_SUPPORT - /* remember the lock position where the lock might be - * added back to the granted list later and also - * remember the join mode for skiplist fixing. */ - prev.res_link = lock->l_res_link.prev; - prev.mode_link = lock->l_sl_mode.prev; - prev.policy_link = lock->l_sl_policy.prev; -#endif - ldlm_resource_unlink_lock(lock); - } else { - ldlm_resource_unlink_lock(lock); - if (res->lr_type == LDLM_EXTENT) { - /* FIXME: ugly code, I have to attach the lock to a - * interval node again since perhaps it will be granted - * soon */ - INIT_LIST_HEAD(&node->li_group); - ldlm_interval_attach(node, lock); - node = NULL; - } - } - - /* - * Remove old lock from the pool before adding the lock with new - * mode below in ->policy() - */ - ldlm_pool_del(&ns->ns_pool, lock); - - /* If this is a local resource, put it on the appropriate list. */ - if (ns_is_client(ldlm_res_to_ns(res))) { - if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { - ldlm_resource_add_lock(res, &res->lr_converting, lock); - } else { - /* This should never happen, because of the way the - * server handles conversions. 
*/ - LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", - *flags); - LBUG(); - - ldlm_grant_lock(lock, &rpc_list); - granted = 1; - /* FIXME: completion handling not with lr_lock held ! */ - if (lock->l_completion_ast) - lock->l_completion_ast(lock, 0, NULL); - } -#ifdef HAVE_SERVER_SUPPORT - } else { - int rc; - enum ldlm_error err; - __u64 pflags = 0; - ldlm_processing_policy policy; - - policy = ldlm_processing_policy_table[res->lr_type]; - rc = policy(lock, &pflags, LDLM_PROCESS_RESCAN, &err, - &rpc_list); - if (rc == LDLM_ITER_STOP) { - lock->l_req_mode = old_mode; - if (res->lr_type == LDLM_EXTENT) - ldlm_extent_add_lock(res, lock); - else - ldlm_granted_list_add_lock(lock, &prev); - - res = NULL; - } else { - *flags |= LDLM_FL_BLOCK_GRANTED; - granted = 1; - } - } -#else - } else { - CERROR("This is client-side-only module, cannot handle " - "LDLM_NAMESPACE_SERVER resource type lock.\n"); - LBUG(); - } -#endif - unlock_res_and_lock(lock); - - if (granted) - ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); - if (node) - OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); - RETURN(res); } +EXPORT_SYMBOL(ldlm_lock_mode_downgrade); /** * Print lock with lock handle \a lockh description into debug log. @@ -2749,17 +2747,17 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, va_start(args, fmt); if (exp && exp->exp_connection) { - nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); + nid = obd_export_nid2str(exp); } else if (exp && exp->exp_obd != NULL) { struct obd_import *imp = exp->exp_obd->u.cli.cl_import; - nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); + nid = obd_import_nid2str(imp); } if (resource == NULL) { libcfs_debug_vmsg2(msgdata, fmt, args, " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: \?\? rrc=\?\? type: \?\?\? 
flags: %#llx nid: %s " - "remote: %#llx expref: %d pid: %u timeout: %lu " + "remote: %#llx expref: %d pid: %u timeout: %lld " "lvb_type: %d\n", lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), @@ -2779,7 +2777,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] " "(req %llu->%llu) flags: %#llx nid: %s remote: " - "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -2802,7 +2800,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s pid: %d " "[%llu->%llu] flags: %#llx nid: %s " - "remote: %#llx expref: %d pid: %u timeout: %lu\n", + "remote: %#llx expref: %d pid: %u timeout: %lld\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -2822,9 +2820,9 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, case LDLM_IBITS: libcfs_debug_vmsg2(msgdata, fmt, args, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " - "res: "DLDLMRES" bits %#llx rrc: %d type: %s " + "res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s " "flags: %#llx nid: %s remote: %#llx expref: %d " - "pid: %u timeout: %lu lvb_type: %d\n", + "pid: %u timeout: %lld lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), @@ -2833,6 +2831,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, ldlm_lockname[lock->l_req_mode], PLDLMRES(resource), lock->l_policy_data.l_inodebits.bits, + lock->l_policy_data.l_inodebits.try_bits, atomic_read(&resource->lr_refcount), ldlm_typename[resource->lr_type], lock->l_flags, nid, lock->l_remote_handle.cookie, @@ -2846,7 +2845,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s flags: %#llx " "nid: %s remote: %#llx expref: %d pid: %u " - "timeout: %lu lvb_type: %d\n", + "timeout: %lld lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c index 465ffda035dbe..ac7a9910e4d45 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include "ldlm_internal.h" @@ -49,6 +49,11 @@ static int ldlm_num_threads; module_param(ldlm_num_threads, int, 0444); MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); +static unsigned int ldlm_cpu_bind = 1; +module_param(ldlm_cpu_bind, uint, 0444); +MODULE_PARM_DESC(ldlm_cpu_bind, + "bind DLM service threads to particular CPU partitions"); + static char *ldlm_cpts; module_param(ldlm_cpts, charp, 0444); MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); @@ -64,18 +69,16 @@ struct kset *ldlm_svc_kset; static struct ldlm_state *ldlm_state; -static inline cfs_time_t round_timeout(cfs_time_t timeout) -{ - return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); -} - -/* timeout for initial callback (AST) reply (bz10399) */ -static inline unsigned int ldlm_get_rq_timeout(void) +/* timeout for initial callback (AST) reply (bz10399) + * Due to having to send a 32 bit time value over the + * wire return it as timeout_t instead of time64_t + */ +static inline timeout_t ldlm_get_rq_timeout(void) { - /* Non-AT value */ - unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); + /* Non-AT value */ + timeout_t timeout = min(ldlm_timeout, obd_timeout / 3); - return timeout < 1 ? 1 : timeout; + return timeout < 1 ? 1 : timeout; } struct ldlm_bl_pool { @@ -133,7 +136,7 @@ static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ * All access to it should be under waiting_locks_spinlock. */ static LIST_HEAD(waiting_locks_list); -static void waiting_locks_callback(cfs_timer_cb_arg_t unused); +static void waiting_locks_callback(TIMER_DATA_TYPE unused); static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); enum elt_state { @@ -147,6 +150,10 @@ static enum elt_state expired_lock_thread_state = ELT_STOPPED; static int expired_lock_dump; static LIST_HEAD(expired_lock_list); +static int ldlm_lock_busy(struct ldlm_lock *lock); +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds); + static inline int have_expired_locks(void) { int need_to_run; @@ -228,14 +235,30 @@ static int expired_lock_main(void *arg) export = class_export_lock_get(lock->l_export, lock); spin_unlock_bh(&waiting_locks_spinlock); - spin_lock_bh(&export->exp_bl_list_lock); - list_del_init(&lock->l_exp_list); - spin_unlock_bh(&export->exp_bl_list_lock); - - do_dump++; - class_fail_export(export); + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + lock->l_callback_timeout != 0 && /* not AST error */ + ldlm_lock_busy(lock)) { + LDLM_DEBUG(lock, "prolong the busy lock"); + lock_res_and_lock(lock); + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + unlock_res_and_lock(lock); + } else { + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + LDLM_ERROR(lock, + "lock callback timer expired after %llds: evicting client at %s ", + ktime_get_real_seconds() - + lock->l_blast_sent, + obd_export_nid2str(export)); + ldlm_lock_to_ns(lock)->ns_timeouts++; + do_dump++; + class_fail_export(export); + } class_export_lock_put(export, lock); - /* release extra ref grabbed by ldlm_add_waiting_lock() * or ldlm_failed_ast() */ LDLM_LOCK_RELEASE(lock); @@ -258,9 +281,6 @@ static int 
expired_lock_main(void *arg) RETURN(0); } -static int ldlm_add_waiting_lock(struct ldlm_lock *lock); -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds); - /** * Check if there is a request in the export request list * which prevents the lock canceling. @@ -274,7 +294,7 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) if (lock->l_export == NULL) return 0; - spin_lock_bh(&lock->l_export->exp_rpc_lock); + spin_lock(&lock->l_export->exp_rpc_lock); list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, rq_exp_list) { if (req->rq_ops->hpreq_lock_match) { @@ -283,12 +303,12 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) break; } } - spin_unlock_bh(&lock->l_export->exp_rpc_lock); + spin_unlock(&lock->l_export->exp_rpc_lock); RETURN(match); } /* This is called from within a timer interrupt and cannot schedule */ -static void waiting_locks_callback(cfs_timer_cb_arg_t unused) +static void waiting_locks_callback(TIMER_DATA_TYPE unused) { struct ldlm_lock *lock; int need_dump = 0; @@ -296,42 +316,10 @@ static void waiting_locks_callback(cfs_timer_cb_arg_t unused) spin_lock_bh(&waiting_locks_spinlock); while (!list_empty(&waiting_locks_list)) { lock = list_entry(waiting_locks_list.next, struct ldlm_lock, - l_pending_chain); - if (cfs_time_after(lock->l_callback_timeout, - cfs_time_current()) || - (lock->l_req_mode == LCK_GROUP)) - break; - - /* Check if we need to prolong timeout */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && - ldlm_lock_busy(lock)) { - int cont = 1; - - if (lock->l_pending_chain.next == &waiting_locks_list) - cont = 0; - - LDLM_LOCK_GET(lock); - - spin_unlock_bh(&waiting_locks_spinlock); - LDLM_DEBUG(lock, "prolong the busy lock"); - ldlm_refresh_waiting_lock(lock, - ldlm_bl_timeout(lock) >> 1); - spin_lock_bh(&waiting_locks_spinlock); - - if (!cont) { - LDLM_LOCK_RELEASE(lock); - break; - } - - LDLM_LOCK_RELEASE(lock); - continue; - } - ldlm_lock_to_ns(lock)->ns_timeouts++; - LDLM_ERROR(lock, "lock callback timer expired after %llds: " - "evicting client at %s ", - ktime_get_real_seconds() - lock->l_blast_sent, - libcfs_nid2str( - lock->l_export->exp_connection->c_peer.nid)); + l_pending_chain); + if (lock->l_callback_timeout > ktime_get_seconds() || + lock->l_req_mode == LCK_GROUP) + break; /* no needs to take an extra ref on the lock since it was in * the waiting_locks_list and ldlm_add_waiting_lock() @@ -348,17 +336,18 @@ static void waiting_locks_callback(cfs_timer_cb_arg_t unused) wake_up(&expired_lock_wait_queue); } - /* - * Make sure the timer will fire again if we have any locks - * left. - */ + /* + * Make sure the timer will fire again if we have any locks + * left. + */ if (!list_empty(&waiting_locks_list)) { - cfs_time_t timeout_rounded; + unsigned long timeout_jiffies; + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, - l_pending_chain); - timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout); - mod_timer(&waiting_locks_timer, timeout_rounded); - } + l_pending_chain); + timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); + mod_timer(&waiting_locks_timer, timeout_jiffies); + } spin_unlock_bh(&waiting_locks_spinlock); } @@ -374,10 +363,10 @@ static void waiting_locks_callback(cfs_timer_cb_arg_t unused) * * Called with the namespace lock held. 
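The reworked waiting_locks_callback() above walks the FIFO waiting_locks_list and stops at the first lock whose absolute l_callback_timeout (now plain seconds from ktime_get_seconds()) is still in the future, re-arming the timer from that head entry; the busy-lock prolong logic moved out of the timer path into expired_lock_main(). The following is a small user-space sketch of that scan over a simplified list of deadlines rather than struct ldlm_lock; it is illustrative only, not the kernel code.

/* Illustrative scan of a FIFO deadline list: expire everything whose
 * absolute deadline (in seconds) has passed, stop at the first future
 * deadline, and report it so the caller can re-arm a timer from it.
 * This mirrors the shape of waiting_locks_callback(), not its details. */
#include <stdio.h>
#include <time.h>

#define NLOCKS 4

int main(void)
{
	long long now = (long long)time(NULL); /* stand-in for ktime_get_seconds() */
	/* FIFO order: deadlines were appended as locks were added */
	long long deadline[NLOCKS] = { now - 20, now - 5, now + 30, now + 60 };
	int i;

	for (i = 0; i < NLOCKS; i++) {
		if (deadline[i] > now) {
			/* first unexpired entry: timer is re-armed from here */
			printf("re-arm timer for +%llds\n", deadline[i] - now);
			break;
		}
		printf("lock %d expired, handed off to the expired-lock thread\n", i);
	}
	return 0;
}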
*/ -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds) { - cfs_time_t timeout; - cfs_time_t timeout_rounded; + unsigned long timeout_jiffies; + time64_t timeout; if (!list_empty(&lock->l_pending_chain)) return 0; @@ -386,28 +375,29 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) seconds = 1; - timeout = cfs_time_shift(seconds); - if (likely(cfs_time_after(timeout, lock->l_callback_timeout))) + timeout = ktime_get_seconds() + seconds; + if (likely(timeout > lock->l_callback_timeout)) lock->l_callback_timeout = timeout; - timeout_rounded = round_timeout(lock->l_callback_timeout); + timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); - if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) || - !timer_pending(&waiting_locks_timer)) { - mod_timer(&waiting_locks_timer, timeout_rounded); - } - /* if the new lock has a shorter timeout than something earlier on - the list, we'll wait the longer amount of time; no big deal. */ - /* FIFO */ + if (time_before(timeout_jiffies, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) + mod_timer(&waiting_locks_timer, timeout_jiffies); + + /* if the new lock has a shorter timeout than something earlier on + * the list, we'll wait the longer amount of time; no big deal. + */ + /* FIFO */ list_add_tail(&lock->l_pending_chain, &waiting_locks_list); - return 1; + return 1; } static void ldlm_add_blocked_lock(struct ldlm_lock *lock) { spin_lock_bh(&lock->l_export->exp_bl_list_lock); if (list_empty(&lock->l_exp_list)) { - if (lock->l_granted_mode != lock->l_req_mode) + if (!ldlm_is_granted(lock)) list_add_tail(&lock->l_exp_list, &lock->l_export->exp_bl_list); else @@ -425,10 +415,9 @@ static void ldlm_add_blocked_lock(struct ldlm_lock *lock) obd_stale_export_adjust(lock->l_export); } -static int ldlm_add_waiting_lock(struct ldlm_lock *lock) +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout) { int ret; - int timeout = ldlm_bl_timeout(lock); /* NB: must be called with hold of lock_res_and_lock() */ LASSERT(ldlm_is_res_locked(lock)); @@ -447,12 +436,12 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) } if (ldlm_is_destroyed(lock)) { - static cfs_time_t next; + static time64_t next; spin_unlock_bh(&waiting_locks_spinlock); LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)"); - if (cfs_time_after(cfs_time_current(), next)) { - next = cfs_time_shift(14400); + if (ktime_get_seconds() > next) { + next = ktime_get_seconds() + 14400; libcfs_debug_dumpstack(NULL); } return 0; @@ -471,7 +460,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) if (ret) ldlm_add_blocked_lock(lock); - LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)", + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)", ret == 0 ? "not re-" : "", timeout, AT_OFF ? "off" : "on"); return ret; @@ -501,10 +490,11 @@ static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) del_timer(&waiting_locks_timer); } else { struct ldlm_lock *next; + next = list_entry(list_next, struct ldlm_lock, - l_pending_chain); + l_pending_chain); mod_timer(&waiting_locks_timer, - round_timeout(next->l_callback_timeout)); + cfs_time_seconds(next->l_callback_timeout)); } } list_del_init(&lock->l_pending_chain); @@ -547,7 +537,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) * * Called with namespace lock held. 
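With the cfs_time_t plumbing removed, __ldlm_add_waiting_lock() above stores an absolute deadline in seconds (ktime_get_seconds() + seconds) and only ever moves l_callback_timeout forward. Below is a plain user-space rendering of that extend-only update, using time(NULL) as a stand-in clock; it is a sketch of the arithmetic, not the kernel function.

/* Extend-only deadline update, mirroring the logic in
 * __ldlm_add_waiting_lock(): a refresh may push the deadline out,
 * but never pulls an already-armed deadline closer. */
#include <stdio.h>
#include <time.h>

static long long now_seconds(void)
{
	return (long long)time(NULL); /* stand-in for ktime_get_seconds() */
}

static long long add_waiting(long long current_deadline, long long seconds)
{
	long long timeout = now_seconds() + seconds;

	if (timeout > current_deadline) /* likely(): only move forward */
		current_deadline = timeout;
	return current_deadline;
}

int main(void)
{
	long long deadline = 0;

	deadline = add_waiting(deadline, 30); /* arms a 30s deadline      */
	deadline = add_waiting(deadline, 5);  /* shorter refresh ignored  */
	printf("deadline: %lld\n", deadline);
	return 0;
}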
*/ -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) { if (lock->l_export == NULL) { /* We don't have a "waiting locks list" on clients. */ @@ -587,7 +577,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) RETURN(0); } -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) { RETURN(0); } @@ -605,9 +595,9 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) * * \retval timeout in seconds to wait for the client reply */ -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock) +time64_t ldlm_bl_timeout(struct ldlm_lock *lock) { - unsigned int timeout; + time64_t timeout; if (AT_OFF) return obd_timeout / 2; @@ -617,7 +607,7 @@ unsigned int ldlm_bl_timeout(struct ldlm_lock *lock) * It would be nice to have some kind of "early reply" mechanism for * lock callbacks too... */ timeout = at_get(&lock->l_export->exp_bl_lock_at); - return max(timeout + (timeout >> 1), ldlm_enqueue_min); + return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min); } EXPORT_SYMBOL(ldlm_bl_timeout); @@ -639,6 +629,7 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, /* the lock was not in any list, grab an extra ref before adding * the lock to the expired list */ LDLM_LOCK_GET(lock); + lock->l_callback_timeout = 0; /* differentiate it from expired locks */ list_add(&lock->l_pending_chain, &expired_lock_list); wake_up(&expired_lock_wait_queue); spin_unlock_bh(&waiting_locks_spinlock); @@ -654,14 +645,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, struct lnet_process_id peer = req->rq_import->imp_connection->c_peer; if (!req->rq_replied || (rc && rc != -EINVAL)) { - if (lock->l_export && lock->l_export->exp_libclient) { - LDLM_DEBUG(lock, - "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock", - ast_type, req, req->rq_xid, - libcfs_nid2str(peer.nid)); - ldlm_lock_cancel(lock); - rc = -ERESTART; - } else if (ldlm_is_cancel(lock)) { + if (ldlm_is_cancel(lock)) { LDLM_DEBUG(lock, "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", ast_type, req, req->rq_xid, @@ -713,7 +697,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, /* update lvbo to return proper attributes. * see bug 23174 */ ldlm_resource_getref(res); - ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lvbo_update(res, lock, NULL, 1); ldlm_resource_putref(res); } ldlm_lock_cancel(lock); @@ -724,9 +708,9 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, } static int ldlm_cb_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc) + struct ptlrpc_request *req, void *args, int rc) { - struct ldlm_cb_async_args *ca = data; + struct ldlm_cb_async_args *ca = args; struct ldlm_lock *lock = ca->ca_lock; struct ldlm_cb_set_arg *arg = ca->ca_set_arg; ENTRY; @@ -744,15 +728,16 @@ static int ldlm_cb_interpret(const struct lu_env *env, * -ELDLM_NO_LOCK_DATA when inode is cleared. 
LU-274 */ if (unlikely(arg->gl_interpret_reply)) { - rc = arg->gl_interpret_reply(env, req, data, rc); + rc = arg->gl_interpret_reply(NULL, req, args, rc); } else if (rc == -ELDLM_NO_LOCK_DATA) { - LDLM_DEBUG(lock, "lost race - client has a lock but no " - "inode"); - ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + LDLM_DEBUG(lock, + "lost race - client has a lock but no inode"); + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); } else if (rc != 0) { rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); } else { - rc = ldlm_res_lvbo_update(lock->l_resource, req, 1); + rc = ldlm_lvbo_update(lock->l_resource, + lock, req, 1); } break; case LDLM_BL_CALLBACK: @@ -780,8 +765,8 @@ static int ldlm_cb_interpret(const struct lu_env *env, static void ldlm_update_resend(struct ptlrpc_request *req, void *data) { - struct ldlm_cb_async_args *ca = data; - struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); } @@ -821,7 +806,7 @@ static void ldlm_lock_reorder_req(struct ldlm_lock *lock) RETURN_EXIT; } - spin_lock_bh(&lock->l_export->exp_rpc_lock); + spin_lock(&lock->l_export->exp_rpc_lock); list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, rq_exp_list) { /* Do not process requests that were not yet added to there @@ -835,7 +820,7 @@ static void ldlm_lock_reorder_req(struct ldlm_lock *lock) req->rq_ops->hpreq_lock_match(req, lock)) ptlrpc_nrs_req_hp_move(req); } - spin_unlock_bh(&lock->l_export->exp_rpc_lock); + spin_unlock(&lock->l_export->exp_rpc_lock); EXIT; } @@ -874,18 +859,18 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, ldlm_lock_reorder_req(lock); - req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, - &RQF_LDLM_BL_CALLBACK, - LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); - if (req == NULL) - RETURN(-ENOMEM); + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); - CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); - ca = ptlrpc_req_async_args(req); - ca->ca_set_arg = arg; - ca->ca_lock = lock; + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; - req->rq_interpret_reply = ldlm_cb_interpret; + req->rq_interpret_reply = ldlm_cb_interpret; lock_res_and_lock(lock); if (ldlm_is_destroyed(lock)) { @@ -895,7 +880,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, RETURN(0); } - if (lock->l_granted_mode != lock->l_req_mode) { + if (!ldlm_is_granted(lock)) { /* this blocking AST will be communicated as part of the * completion AST instead */ ldlm_add_blocked_lock(lock); @@ -925,8 +910,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, req->rq_no_resend = 1; } else { - LASSERT(lock->l_granted_mode == lock->l_req_mode); - ldlm_add_waiting_lock(lock); + LASSERT(ldlm_is_granted(lock)); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); unlock_res_and_lock(lock); /* Do not resend after lock callback timeout */ @@ -990,26 +975,25 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) lvb_len = 0; req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); - rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } - 
CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); - ca = ptlrpc_req_async_args(req); - ca->ca_set_arg = arg; - ca->ca_lock = lock; + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; - req->rq_interpret_reply = ldlm_cb_interpret; - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[0] = lock->l_remote_handle; body->lock_flags = ldlm_flags_to_wire(flags); ldlm_lock2desc(lock, &body->lock_desc); if (lvb_len > 0) { void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); - lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); if (lvb_len < 0) { /* We still need to send the RPC to wake up the blocked @@ -1060,7 +1044,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) lock_res_and_lock(lock); } else { /* start the lock-timeout clock */ - ldlm_add_waiting_lock(lock); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); /* Do not resend after lock callback timeout */ req->rq_delay_limit = ldlm_bl_timeout(lock); req->rq_resend_cb = ldlm_update_resend; @@ -1098,7 +1082,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) if (arg->gl_desc != NULL) /* There is a glimpse descriptor to pack */ - req_fmt = &RQF_LDLM_GL_DESC_CALLBACK; + req_fmt = &RQF_LDLM_GL_CALLBACK_DESC; else req_fmt = &RQF_LDLM_GL_CALLBACK; @@ -1116,9 +1100,9 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) *desc = *arg->gl_desc; } - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; - ldlm_lock2desc(lock, &body->lock_desc); + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); ca = ptlrpc_req_async_args(req); @@ -1146,6 +1130,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) RETURN(rc); } +EXPORT_SYMBOL(ldlm_server_glimpse_ast); int ldlm_glimpse_locks(struct ldlm_resource *res, struct list_head *gl_work_list) @@ -1156,7 +1141,7 @@ int ldlm_glimpse_locks(struct ldlm_resource *res, rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, LDLM_WORK_GL_AST); if (rc == -ERESTART) - ldlm_reprocess_all(res); + ldlm_reprocess_all(res, NULL); RETURN(rc); } @@ -1178,40 +1163,6 @@ struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) } EXPORT_SYMBOL(ldlm_request_lock); -static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, - struct lprocfs_stats *srv_stats) -{ - int lock_type = 0, op = 0; - - lock_type = dlm_req->lock_desc.l_resource.lr_type; - - switch (lock_type) { - case LDLM_PLAIN: - op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; - break; - case LDLM_EXTENT: - if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) - op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE; - else - op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; - break; - case LDLM_FLOCK: - op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; - break; - case LDLM_IBITS: - op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; - break; - default: - op = 0; - break; - } - - if (op) - lprocfs_counter_incr(srv_stats, op); - - return; -} - /** * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc * service threads to carry out client lock enqueueing requests. 
@@ -1228,6 +1179,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, void *cookie = NULL; int rc = 0; struct ldlm_resource *res = NULL; + const struct lu_env *env = req->rq_svc_thread->t_env; ENTRY; LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); @@ -1237,7 +1189,9 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LASSERT(req->rq_export); - if (ptlrpc_req2svc(req)->srv_stats != NULL) + /* for intent enqueue the stat will be updated inside intent policy */ + if (ptlrpc_req2svc(req)->srv_stats != NULL && + !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT)) ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); if (req->rq_export && req->rq_export->exp_nid_stats && @@ -1341,9 +1295,11 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, &lock->l_policy_data); if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) lock->l_req_extent = lock->l_policy_data.l_extent; + else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) + lock->l_policy_data.l_inodebits.try_bits = + dlm_req->lock_desc.l_policy_data.l_inodebits.try_bits; existing_lock: - if (flags & LDLM_FL_HAS_INTENT) { /* In this case, the reply buffer is allocated deep in * local_lock_enqueue by the policy function. */ @@ -1355,25 +1311,25 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, ldlm_lvbo_size(lock)); - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) - GOTO(out, rc = -ENOMEM); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - GOTO(out, rc); - } + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + } - err = ldlm_lock_enqueue(ns, &lock, cookie, &flags); + err = ldlm_lock_enqueue(env, ns, &lock, cookie, &flags); if (err) { if ((int)err < 0) rc = (int)err; GOTO(out, err); } - dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - ldlm_lock2desc(lock, &dlm_rep->lock_desc); - ldlm_lock2handle(lock, &dlm_rep->lock_handle); + ldlm_lock2desc(lock, &dlm_rep->lock_desc); + ldlm_lock2handle(lock, &dlm_rep->lock_handle); if (lock && lock->l_resource->lr_type == LDLM_EXTENT) OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6); @@ -1395,8 +1351,24 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); rc = -ENOTCONN; } else if (ldlm_is_ast_sent(lock)) { + /* fill lock desc for possible lock convert */ + if (lock->l_blocking_lock && + lock->l_resource->lr_type == LDLM_IBITS) { + struct ldlm_lock *bl_lock = lock->l_blocking_lock; + struct ldlm_lock_desc *rep_desc = &dlm_rep->lock_desc; + + LDLM_DEBUG(lock, + "save blocking bits %llx in granted lock", + bl_lock->l_policy_data.l_inodebits.bits); + /* If lock is blocked then save blocking ibits + * in returned lock policy for the possible lock + * convert on a client. 
+ */ + rep_desc->l_policy_data.l_inodebits.cancel_bits = + bl_lock->l_policy_data.l_inodebits.bits; + } dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); - if (lock->l_granted_mode == lock->l_req_mode) { + if (ldlm_is_granted(lock)) { /* * Only cancel lock if it was granted, because it would * be destroyed immediately and would never be granted @@ -1408,38 +1380,15 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, unlock_res_and_lock(lock); ldlm_lock_cancel(lock); lock_res_and_lock(lock); - } else - ldlm_add_waiting_lock(lock); - } - } - /* Make sure we never ever grant usual metadata locks to liblustre - clients */ - if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN || - dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) && - req->rq_export->exp_libclient) { - if (unlikely(!ldlm_is_cancel_on_block(lock) || - !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){ - CERROR("Granting sync lock to libclient. " - "req fl %d, rep fl %d, lock fl %#llx\n", - dlm_req->lock_flags, dlm_rep->lock_flags, - lock->l_flags); - LDLM_ERROR(lock, "sync lock"); - if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) { - struct ldlm_intent *it; - - it = req_capsule_client_get(&req->rq_pill, - &RMF_LDLM_INTENT); - if (it != NULL) { - CERROR("This is intent %s (%llu)\n", - ldlm_it2str(it->opc), it->opc); - } + } else { + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock)); } } } + unlock_res_and_lock(lock); - unlock_res_and_lock(lock); - - EXIT; + EXIT; out: req->rq_status = rc ?: err; /* return either error - bug 11190 */ if (!req->rq_packed_final) { @@ -1522,114 +1471,126 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, } } - if (!err && dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) - ldlm_reprocess_all(lock->l_resource); + if (!err && !ldlm_is_cbpending(lock) && + dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource, lock); - LDLM_LOCK_RELEASE(lock); - } + LDLM_LOCK_RELEASE(lock); + } - LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", - lock, rc); + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); return rc; } -/** - * Old-style LDLM main entry point for server code enqueue. +/* Clear the blocking lock, the race is possible between ldlm_handle_convert0() + * and ldlm_work_bl_ast_lock(), so this is done under lock with check for NULL. */ -int ldlm_handle_enqueue(struct ptlrpc_request *req, - ldlm_completion_callback completion_callback, - ldlm_blocking_callback blocking_callback, - ldlm_glimpse_callback glimpse_callback) +void ldlm_clear_blocking_lock(struct ldlm_lock *lock) { - struct ldlm_request *dlm_req; - struct ldlm_callback_suite cbs = { - .lcs_completion = completion_callback, - .lcs_blocking = blocking_callback, - .lcs_glimpse = glimpse_callback - }; - int rc; + if (lock->l_blocking_lock) { + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + } +} - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req != NULL) { - rc = ldlm_handle_enqueue0(req->rq_export->exp_obd->obd_namespace, - req, dlm_req, &cbs); - } else { - rc = -EFAULT; - } - return rc; +/* A lock can be converted to new ibits or mode and should be considered + * as new lock. Clear all states related to a previous blocking AST + * processing so new conflicts will cause new blocking ASTs. + * + * This is used during lock convert below and lock downgrade to COS mode in + * ldlm_lock_mode_downgrade(). 
+ */ +void ldlm_clear_blocking_data(struct ldlm_lock *lock) +{ + ldlm_clear_ast_sent(lock); + lock->l_bl_ast_run = 0; + ldlm_clear_blocking_lock(lock); } /** * Main LDLM entry point for server code to process lock conversion requests. */ int ldlm_handle_convert0(struct ptlrpc_request *req, - const struct ldlm_request *dlm_req) + const struct ldlm_request *dlm_req) { - struct ldlm_reply *dlm_rep; - struct ldlm_lock *lock; - int rc; - ENTRY; + struct obd_export *exp = req->rq_export; + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + __u64 bits; + __u64 new_bits; + int rc; - if (req->rq_export && req->rq_export->exp_nid_stats && - req->rq_export->exp_nid_stats->nid_ldlm_stats) - lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC); + ENTRY; - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + if (exp && exp->exp_nid_stats && exp->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(exp->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); - dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - dlm_rep->lock_flags = dlm_req->lock_flags; + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); - lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); - if (!lock) { - req->rq_status = LUSTRE_EINVAL; - } else { - void *res = NULL; + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; - LDLM_DEBUG(lock, "server-side convert handler START"); + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server lock is canceled already"); + req->rq_status = ELDLM_NO_LOCK_DATA; + RETURN(0); + } - res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, - &dlm_rep->lock_flags); - if (res) { - if (ldlm_del_waiting_lock(lock)) - LDLM_DEBUG(lock, "converted waiting lock"); - req->rq_status = 0; - } else { - req->rq_status = LUSTRE_EDEADLK; - } - } + LDLM_DEBUG(lock, "server-side convert handler START"); - if (lock) { - if (!req->rq_status) - ldlm_reprocess_all(lock->l_resource); - LDLM_DEBUG(lock, "server-side convert handler END"); - LDLM_LOCK_PUT(lock); - } else - LDLM_DEBUG_NOLOCK("server-side convert handler END"); + lock_res_and_lock(lock); + bits = lock->l_policy_data.l_inodebits.bits; + new_bits = dlm_req->lock_desc.l_policy_data.l_inodebits.bits; - RETURN(0); -} + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, "convert on canceled lock!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = ELDLM_NO_LOCK_DATA); + } -/** - * Old-style main LDLM entry point for server code to process lock conversion - * requests. - */ -int ldlm_handle_convert(struct ptlrpc_request *req) -{ - int rc; - struct ldlm_request *dlm_req; + if (dlm_req->lock_desc.l_req_mode != lock->l_granted_mode) { + LDLM_ERROR(lock, "lock mode differs!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = -EPROTO); + } - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req != NULL) { - rc = ldlm_handle_convert0(req, dlm_req); - } else { - CERROR ("Can't unpack dlm_req\n"); - rc = -EFAULT; - } - return rc; + if (bits == new_bits) { + /* + * This can be valid situation if CONVERT RPCs are + * re-ordered. 
Just finish silently + */ + LDLM_DEBUG(lock, "lock is converted already!"); + unlock_res_and_lock(lock); + } else { + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_clear_cbpending(lock); + lock->l_policy_data.l_inodebits.cancel_bits = 0; + ldlm_inodebits_drop(lock, bits & ~new_bits); + + ldlm_clear_blocking_data(lock); + unlock_res_and_lock(lock); + + ldlm_reprocess_all(lock->l_resource, NULL); + } + + dlm_rep->lock_handle = lock->l_remote_handle; + ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, + &dlm_rep->lock_desc.l_policy_data); + rc = ELDLM_OK; + EXIT; +out_put: + LDLM_DEBUG(lock, "server-side convert handler END, rc = %d", rc); + LDLM_LOCK_PUT(lock); + req->rq_status = rc; + return 0; } /** @@ -1642,14 +1603,22 @@ int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, int first, enum lustre_at_flags flags) { - struct ldlm_resource *res, *pres = NULL; - struct ldlm_lock *lock; - int i, count, done = 0; - ENTRY; + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + unsigned int size; - count = dlm_req->lock_count ? dlm_req->lock_count : 1; - if (first >= count) - RETURN(0); + ENTRY; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(0); + + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); if (count == 1 && dlm_req->lock_handle[0].cookie == 0) RETURN(0); @@ -1676,20 +1645,24 @@ int ldlm_request_cancel(struct ptlrpc_request *req, /* This code is an optimization to only attempt lock * granting on the resource (that could be CPU-expensive) - * after we are done cancelling lock in that resource. */ - if (res != pres) { - if (pres != NULL) { - ldlm_reprocess_all(pres); - LDLM_RESOURCE_DELREF(pres); - ldlm_resource_putref(pres); - } - if (res != NULL) { - ldlm_resource_getref(res); - LDLM_RESOURCE_ADDREF(res); - ldlm_res_lvbo_update(res, NULL, 1); - } - pres = res; - } + * after we are done cancelling lock in that resource. 
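The rewritten ldlm_handle_convert0() above downconverts an IBITS lock by dropping only the bits the client gives up, ldlm_inodebits_drop(lock, bits & ~new_bits), instead of cancelling the whole lock. The bit arithmetic is restated below with hypothetical masks standing in for the real MDS_INODELOCK_* values; this is an illustration, not Lustre code.

/* bits & ~new_bits == "granted bits the client is giving up".
 * Masks below are hypothetical stand-ins for MDS_INODELOCK_* values. */
#include <stdio.h>

#define BIT_LOOKUP	0x01ULL
#define BIT_UPDATE	0x02ULL
#define BIT_OPEN	0x04ULL

int main(void)
{
	unsigned long long bits = BIT_LOOKUP | BIT_UPDATE | BIT_OPEN;
	unsigned long long new_bits = BIT_LOOKUP;   /* what the client keeps */
	unsigned long long drop = bits & ~new_bits; /* UPDATE | OPEN */

	printf("granted %#llx, keep %#llx, drop %#llx\n", bits, new_bits, drop);
	return 0;
}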
+ */ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres, NULL); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + + if (!ldlm_is_discard_data(lock)) + ldlm_lvbo_update(res, lock, + NULL, 1); + } + pres = res; + } if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && lock->l_blast_sent != 0) { @@ -1699,16 +1672,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req, (s64)delay); at_measured(&lock->l_export->exp_bl_lock_at, delay); } - ldlm_lock_cancel(lock); - LDLM_LOCK_PUT(lock); - } - if (pres != NULL) { - ldlm_reprocess_all(pres); - LDLM_RESOURCE_DELREF(pres); - ldlm_resource_putref(pres); - } - LDLM_DEBUG_NOLOCK("server-side cancel handler END"); - RETURN(done); + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres, NULL); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); } EXPORT_SYMBOL(ldlm_request_cancel); @@ -1729,14 +1702,18 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) RETURN(-EFAULT); } - if (req->rq_export && req->rq_export->exp_nid_stats && - req->rq_export->exp_nid_stats->nid_ldlm_stats) - lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC); + if (req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) < + offsetof(struct ldlm_request, lock_handle[1])) + RETURN(-EPROTO); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) req->rq_status = LUSTRE_ESTALE; @@ -1745,20 +1722,62 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) } #endif /* HAVE_SERVER_SUPPORT */ +/** + * Server may pass additional information about blocking lock. + * For IBITS locks it is conflicting bits which can be used for + * lock convert instead of cancel. + */ +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + check_res_locked(lock->l_resource); + if (ns_is_client(ns) && ld && + (lock->l_resource->lr_type == LDLM_IBITS)) { + /* + * Lock description contains policy of blocking lock, + * and its cancel_bits is used to pass conflicting bits. + * NOTE: ld can be NULL or can be not NULL but zeroed if + * passed from ldlm_bl_thread_blwi(), check below used bits + * in ld to make sure it is valid description. + */ + if (ld->l_policy_data.l_inodebits.cancel_bits && + ldlm_res_eq(&ld->l_resource.lr_name, + &lock->l_resource->lr_name) && + !(ldlm_is_cbpending(lock) && + lock->l_policy_data.l_inodebits.cancel_bits == 0)) { + /* always combine conflicting ibits */ + lock->l_policy_data.l_inodebits.cancel_bits |= + ld->l_policy_data.l_inodebits.cancel_bits; + } else { + /* If cancel_bits are not obtained or + * if the lock is already CBPENDING and + * has no cancel_bits set + * - the full lock is to be cancelled + */ + lock->l_policy_data.l_inodebits.cancel_bits = 0; + } + } +} + /** * Callback handler for receiving incoming blocking ASTs. * * This can only happen on client side. 
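ldlm_bl_desc2lock() above accumulates conflicting inodebits from blocking-AST descriptors by OR-ing the descriptor's cancel_bits into the lock, and degrades to cancel_bits = 0 (cancel the full lock) when no usable bits arrive or the lock is already CBPENDING with no bits recorded. The snippet below restates that accumulate-or-reset rule in standalone form; the resource-name equality check is omitted and the types are simplified stand-ins, so treat it as a sketch only.

/* Accumulate conflicting bits across blocking ASTs; a zero descriptor
 * (or an already-pending lock with no bits recorded) degrades to a
 * full cancel, signalled here by cancel_bits == 0. */
#include <stdio.h>

struct fake_lock {
	int cbpending;
	unsigned long long cancel_bits;
};

static void bl_desc2lock(struct fake_lock *lock, unsigned long long desc_bits)
{
	if (desc_bits && !(lock->cbpending && lock->cancel_bits == 0))
		lock->cancel_bits |= desc_bits; /* combine conflicts */
	else
		lock->cancel_bits = 0;          /* cancel the whole lock */
}

int main(void)
{
	struct fake_lock lock = { .cbpending = 0, .cancel_bits = 0 };

	bl_desc2lock(&lock, 0x2); /* first conflicting bit  */
	bl_desc2lock(&lock, 0x4); /* second one is OR-ed in */
	printf("cancel_bits %#llx\n", lock.cancel_bits); /* 0x6 */
	return 0;
}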
*/ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, struct ldlm_lock *lock) + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) { - int do_ast; - ENTRY; + int do_ast; - LDLM_DEBUG(lock, "client blocking AST callback handler"); + ENTRY; - lock_res_and_lock(lock); + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + + /* get extra information from desc if any */ + ldlm_bl_desc2lock(ld, lock); ldlm_set_cbpending(lock); if (ldlm_is_cancel_on_block(lock)) @@ -1783,12 +1802,26 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, EXIT; } +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + /** * Callback handler for receiving incoming completion ASTs. * * This only can happen on client side. */ -static void ldlm_handle_cp_callback(struct ptlrpc_request *req, +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, struct ldlm_namespace *ns, struct ldlm_request *dlm_req, struct ldlm_lock *lock) @@ -1802,11 +1835,14 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, INIT_LIST_HEAD(&ast_list); if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { - int to = cfs_time_seconds(1); + long to = cfs_time_seconds(1); + + ldlm_callback_reply(req, 0); + while (to > 0) { set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(to); - if (lock->l_granted_mode == lock->l_req_mode || + to = schedule_timeout(to); + if (ldlm_is_granted(lock) || ldlm_is_destroyed(lock)) break; } @@ -1832,8 +1868,29 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, } lock_res_and_lock(lock); + + if (!ldlm_res_eq(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + ldlm_resource_unlink_lock(lock); + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + lock_res_and_lock(lock); + } + + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + if (ldlm_is_destroyed(lock) || - lock->l_granted_mode == lock->l_req_mode) { + ldlm_is_granted(lock)) { /* bug 11300: the lock has already been granted */ unlock_res_and_lock(lock); LDLM_DEBUG(lock, "Double grant race happened"); @@ -1855,26 +1912,15 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, LDLM_DEBUG(lock, "completion AST, new policy data"); } - ldlm_resource_unlink_lock(lock); - if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, - &lock->l_resource->lr_name, - sizeof(lock->l_resource->lr_name)) != 0) { - unlock_res_and_lock(lock); - rc = ldlm_lock_change_resource(ns, lock, - &dlm_req->lock_desc.l_resource.lr_name); - if (rc < 0) { - LDLM_ERROR(lock, "Failed to allocate resource"); - GOTO(out, rc); - } - LDLM_DEBUG(lock, "completion AST, new resource"); - CERROR("change resource!\n"); - lock_res_and_lock(lock); - } + ldlm_resource_unlink_lock(lock); - if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { - /* BL_AST locks are not needed in LRU. - * Let ldlm_cancel_lru() be fast. */ - ldlm_lock_remove_from_lru(lock); + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. 
+ */ + ldlm_lock_remove_from_lru(lock); + ldlm_bl_desc2lock(&dlm_req->lock_desc, lock); lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; LDLM_DEBUG(lock, "completion AST includes blocking AST"); } @@ -1911,6 +1957,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, wake_up(&lock->l_waitq); } LDLM_LOCK_RELEASE(lock); + + return 0; } /** @@ -1925,10 +1973,12 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, struct ldlm_request *dlm_req, struct ldlm_lock *lock) { - int rc = -ENOSYS; - ENTRY; + struct ldlm_lock_desc *ld = &dlm_req->lock_desc; + int rc = -ENOSYS; - LDLM_DEBUG(lock, "client glimpse AST callback handler"); + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); if (lock->l_glimpse_ast != NULL) rc = lock->l_glimpse_ast(lock, req); @@ -1945,10 +1995,17 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, !lock->l_readers && !lock->l_writers && ktime_after(ktime_get(), ktime_add(lock->l_last_used, - ktime_set(10, 0)))) { - unlock_res_and_lock(lock); - if (ldlm_bl_to_thread_lock(ns, NULL, lock)) - ldlm_handle_bl_callback(ns, NULL, lock); + ktime_set(ns->ns_dirty_age_limit, 0)))) { + unlock_res_and_lock(lock); + + /* For MDS glimpse it is always DOM lock, set corresponding + * cancel_bits to perform lock convert if needed + */ + if (lock->l_resource->lr_type == LDLM_IBITS) + ld->l_policy_data.l_inodebits.cancel_bits = + MDS_INODELOCK_DOM; + if (ldlm_bl_to_thread_lock(ns, ld, lock)) + ldlm_handle_bl_callback(ns, ld, lock); EXIT; return; @@ -1958,20 +2015,6 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, EXIT; } -static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) -{ - if (req->rq_no_reply) - return 0; - - req->rq_status = rc; - if (!req->rq_packed_final) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - return ptlrpc_reply(req); -} - static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, enum ldlm_cancel_flags cancel_flags) { @@ -2194,35 +2237,6 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) rc = ldlm_handle_setinfo(req); ldlm_callback_reply(req, rc); RETURN(0); - case LLOG_ORIGIN_HANDLE_CREATE: - req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE); - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_open(req); - ldlm_callback_reply(req, rc); - RETURN(0); - case LLOG_ORIGIN_HANDLE_NEXT_BLOCK: - req_capsule_set(&req->rq_pill, - &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_next_block(req); - ldlm_callback_reply(req, rc); - RETURN(0); - case LLOG_ORIGIN_HANDLE_READ_HEADER: - req_capsule_set(&req->rq_pill, - &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_read_header(req); - ldlm_callback_reply(req, rc); - RETURN(0); - case LLOG_ORIGIN_HANDLE_CLOSE: - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_close(req); - ldlm_callback_reply(req, rc); - RETURN(0); default: CERROR("unknown opcode %u\n", lustre_msg_get_opc(req->rq_reqmsg)); @@ -2307,30 +2321,31 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) CDEBUG(D_INODE, "blocking ast\n"); req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); if (!ldlm_is_cancel_on_block(lock)) { - rc = ldlm_callback_reply(req, 0); - if (req->rq_no_reply || rc) - ldlm_callback_errmsg(req, "Normal process", rc, - &dlm_req->lock_handle[0]); - } - if 
(ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) - ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); - break; - case LDLM_CP_CALLBACK: - CDEBUG(D_INODE, "completion ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - ldlm_callback_reply(req, 0); - ldlm_handle_cp_callback(req, ns, dlm_req, lock); - break; - case LDLM_GL_CALLBACK: - CDEBUG(D_INODE, "glimpse ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); - ldlm_handle_gl_callback(req, ns, dlm_req, lock); - break; - default: - LBUG(); /* checked above */ - } + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } - RETURN(0); + RETURN(0); } #ifdef HAVE_SERVER_SUPPORT @@ -2341,145 +2356,169 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) */ static int ldlm_cancel_handler(struct ptlrpc_request *req) { - int rc; - ENTRY; - - /* Requests arrive in sender's byte order. The ptlrpc service - * handler has already checked and, if necessary, byte-swapped the - * incoming request message body, but I am responsible for the - * message buffers. */ - - req_capsule_init(&req->rq_pill, req, RCL_SERVER); + int rc; - if (req->rq_export == NULL) { - struct ldlm_request *dlm_req; - - CERROR("%s from %s arrived at %lu with bad export cookie " - "%llu\n", - ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), - libcfs_nid2str(req->rq_peer.nid), - req->rq_arrival_time.tv_sec, - lustre_msg_get_handle(req->rq_reqmsg)->cookie); - - if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - dlm_req = req_capsule_client_get(&req->rq_pill, - &RMF_DLM_REQ); - if (dlm_req != NULL) - ldlm_lock_dump_handle(D_ERROR, - &dlm_req->lock_handle[0]); - } - ldlm_callback_reply(req, -ENOTCONN); - RETURN(0); - } + ENTRY; - switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
*/ + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %llu with bad export cookie %llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + (unsigned long long)req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } - /* XXX FIXME move this back to mds/handler.c, bug 249 */ - case LDLM_CANCEL: - req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); - CDEBUG(D_INODE, "cancel\n"); + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* XXX FIXME move this back to mds/handler.c, bug 249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) RETURN(0); - rc = ldlm_handle_cancel(req); - if (rc) - break; - RETURN(0); - default: - CERROR("invalid opcode %d\n", - lustre_msg_get_opc(req->rq_reqmsg)); - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - ldlm_callback_reply(req, -EINVAL); - } + rc = ldlm_handle_cancel(req); + break; + case LDLM_CONVERT: + { + struct ldlm_request *dlm_req; - RETURN(0); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + CDEBUG(D_INODE, "convert\n"); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + rc = ldlm_callback_reply(req, -EPROTO); + } else { + req->rq_status = ldlm_handle_convert0(req, dlm_req); + rc = ptlrpc_reply(req); + } + break; + } + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + rc = ldlm_callback_reply(req, -EINVAL); + } + + RETURN(rc); } static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, - struct ldlm_lock *lock) + struct ldlm_lock *lock) { - struct ldlm_request *dlm_req; - struct lustre_handle lockh; - int rc = 0; - int i; - ENTRY; - - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) - RETURN(0); + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; - ldlm_lock2handle(lock, &lockh); - for (i = 0; i < dlm_req->lock_count; i++) { - if (lustre_handle_equal(&dlm_req->lock_handle[i], - &lockh)) { - DEBUG_REQ(D_RPCTRACE, req, - "Prio raised by lock %#llx.", lockh.cookie); + ENTRY; - rc = 1; - break; - } - } + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); - RETURN(rc); + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, + "Prio raised by lock %#llx.", lockh.cookie); + rc = 1; + break; + } + } + RETURN(rc); } static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) { - struct ldlm_request *dlm_req; - int rc = 0; - int i; - ENTRY; + struct ldlm_request *dlm_req; + int rc = 0; + int i; + unsigned int size; - /* no prolong in recovery */ - if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) - RETURN(0); + ENTRY; - dlm_req = 
req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) - RETURN(-EFAULT); + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); - for (i = 0; i < dlm_req->lock_count; i++) { - struct ldlm_lock *lock; + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); - lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); - if (lock == NULL) - continue; + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(-EPROTO); + + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; rc = ldlm_is_ast_sent(lock) ? 1 : 0; - if (rc) - LDLM_DEBUG(lock, "hpreq cancel lock"); - LDLM_LOCK_PUT(lock); + if (rc) + LDLM_DEBUG(lock, "hpreq cancel/convert lock"); + LDLM_LOCK_PUT(lock); - if (rc) - break; - } + if (rc) + break; + } - RETURN(rc); + RETURN(rc); } static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { - .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, .hpreq_check = ldlm_cancel_hpreq_check, .hpreq_fini = NULL, }; static int ldlm_hpreq_handler(struct ptlrpc_request *req) { - ENTRY; + ENTRY; - req_capsule_init(&req->rq_pill, req, RCL_SERVER); + req_capsule_init(&req->rq_pill, req, RCL_SERVER); - if (req->rq_export == NULL) - RETURN(0); + if (req->rq_export == NULL) + RETURN(0); - if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); - req->rq_ops = &ldlm_cancel_hpreq_ops; - } - RETURN(0); + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } else if (LDLM_CONVERT == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); } static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, @@ -2491,10 +2530,10 @@ static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, lock_res_and_lock(lock); - if (lock->l_req_mode != lock->l_granted_mode) { - unlock_res_and_lock(lock); - return 0; - } + if (!ldlm_is_granted(lock)) { + unlock_res_and_lock(lock); + return 0; + } LASSERT(lock->l_resource); if (lock->l_resource->lr_type != LDLM_IBITS && @@ -2726,9 +2765,22 @@ static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, */ static int ldlm_bl_thread_main(void *arg) { - struct ldlm_bl_pool *blp; + struct lu_env *env; + struct ldlm_bl_pool *blp; struct ldlm_bl_thread_data *bltd = arg; - ENTRY; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(env); + if (!env) + RETURN(-ENOMEM); + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_env, rc); + rc = lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc); blp = bltd->bltd_blp; @@ -2772,7 +2824,13 @@ static int ldlm_bl_thread_main(void *arg) atomic_dec(&blp->blp_num_threads); complete(&blp->blp_comp); - RETURN(0); + + lu_env_remove(env); +out_env_fini: + lu_env_fini(env); +out_env: + OBD_FREE_PTR(env); + RETURN(rc); } @@ -2973,7 +3031,7 @@ static int ldlm_setup(void) if (ldlm_state == NULL) RETURN(-ENOMEM); - ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj); + ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj); if (!ldlm_kobj) 
GOTO(out, -ENOMEM); @@ -2989,11 +3047,9 @@ static int ldlm_setup(void) if (!ldlm_svc_kset) GOTO(out, -ENOMEM); -#ifdef CONFIG_PROC_FS - rc = ldlm_proc_setup(); + rc = ldlm_debugfs_setup(); if (rc != 0) GOTO(out, rc); -#endif /* CONFIG_PROC_FS */ memset(&conf, 0, sizeof(conf)); conf = (typeof(conf)) { @@ -3014,18 +3070,20 @@ static int ldlm_setup(void) .tc_nthrs_base = LDLM_NTHRS_BASE, .tc_nthrs_max = LDLM_NTHRS_MAX, .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_affinity = 1, + .tc_cpu_bind = ldlm_cpu_bind, .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, }, .psc_cpt = { .cc_pattern = ldlm_cpts, + .cc_affinity = true, }, .psc_ops = { .so_req_handler = ldlm_callback_handler, }, }; ldlm_state->ldlm_cb_service = \ - ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); if (IS_ERR(ldlm_state->ldlm_cb_service)) { CERROR("failed to start service\n"); rc = PTR_ERR(ldlm_state->ldlm_cb_service); @@ -3054,13 +3112,14 @@ static int ldlm_setup(void) .tc_nthrs_base = LDLM_NTHRS_BASE, .tc_nthrs_max = LDLM_NTHRS_MAX, .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_affinity = 1, + .tc_cpu_bind = ldlm_cpu_bind, .tc_ctx_tags = LCT_MD_THREAD | \ LCT_DT_THREAD | \ LCT_CL_THREAD, }, .psc_cpt = { .cc_pattern = ldlm_cpts, + .cc_affinity = true, }, .psc_ops = { .so_req_handler = ldlm_cancel_handler, @@ -3068,7 +3127,8 @@ static int ldlm_setup(void) }, }; ldlm_state->ldlm_cancel_service = \ - ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); if (IS_ERR(ldlm_state->ldlm_cancel_service)) { CERROR("failed to start service\n"); rc = PTR_ERR(ldlm_state->ldlm_cancel_service); @@ -3179,10 +3239,12 @@ static int ldlm_cleanup(void) kset_unregister(ldlm_ns_kset); if (ldlm_svc_kset) kset_unregister(ldlm_svc_kset); - if (ldlm_kobj) + if (ldlm_kobj) { + sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); kobject_put(ldlm_kobj); + } - ldlm_proc_cleanup(); + ldlm_debugfs_cleanup(); #ifdef HAVE_SERVER_SUPPORT if (expired_lock_thread_state != ELT_STOPPED) { @@ -3209,7 +3271,7 @@ int ldlm_init(void) ldlm_lock_slab = kmem_cache_create("ldlm_locks", sizeof(struct ldlm_lock), 0, - SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (ldlm_lock_slab == NULL) goto out_resource; @@ -3225,11 +3287,30 @@ int ldlm_init(void) if (ldlm_interval_tree_slab == NULL) goto out_interval; +#ifdef HAVE_SERVER_SUPPORT + ldlm_inodebits_slab = kmem_cache_create("ldlm_ibits_node", + sizeof(struct ldlm_ibits_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_inodebits_slab == NULL) + goto out_interval_tree; + + ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem", + sizeof(struct ldlm_glimpse_work), + 0, 0, NULL); + if (ldlm_glimpse_work_kmem == NULL) + goto out_inodebits; +#endif + #if LUSTRE_TRACKS_LOCK_EXP_REFS class_export_dump_hook = ldlm_dump_export_locks; #endif return 0; - +#ifdef HAVE_SERVER_SUPPORT +out_inodebits: + kmem_cache_destroy(ldlm_inodebits_slab); +out_interval_tree: + kmem_cache_destroy(ldlm_interval_tree_slab); +#endif out_interval: kmem_cache_destroy(ldlm_interval_slab); out_lock: @@ -3245,11 +3326,17 @@ void ldlm_exit(void) if (ldlm_refcount) CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); kmem_cache_destroy(ldlm_resource_slab); - /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call - * synchronize_rcu() to wait a grace period elapsed, so that - * ldlm_lock_free() get a chance to be called. 
*/ - synchronize_rcu(); + /* + * ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * rcu_barrier() to wait all outstanding RCU callbacks to complete, + * so that ldlm_lock_free() get a chance to be called. + */ + rcu_barrier(); kmem_cache_destroy(ldlm_lock_slab); kmem_cache_destroy(ldlm_interval_slab); kmem_cache_destroy(ldlm_interval_tree_slab); +#ifdef HAVE_SERVER_SUPPORT + kmem_cache_destroy(ldlm_inodebits_slab); + kmem_cache_destroy(ldlm_glimpse_work_kmem); +#endif } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c index 6453cabf1921f..6407fd20884f8 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -129,14 +129,14 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head rpc_list; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; int rc; ENTRY; - LASSERT(lock->l_granted_mode != lock->l_req_mode); + LASSERT(!ldlm_is_granted(lock)); check_res_locked(res); - LASSERT(list_empty(&res->lr_converting)); - INIT_LIST_HEAD(&rpc_list); + *err = ELDLM_OK; if (intention == LDLM_PROCESS_RESCAN) { LASSERT(work_list != NULL); @@ -148,31 +148,19 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, RETURN(LDLM_ITER_STOP); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); + ldlm_grant_lock(lock, grant_work); RETURN(LDLM_ITER_CONTINUE); } - LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || - (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); - restart: - rc = ldlm_plain_compat_queue(&res->lr_granted, lock, &rpc_list); - rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, &rpc_list); - - if (rc != 2) { - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); - if (rc == -ERESTART) - GOTO(restart, rc); - *err = rc; - } else { + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, work_list); + + if (rc == 2) { ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); - rc = 0; + ldlm_grant_lock(lock, grant_work); } - if (!list_empty(&rpc_list)) - ldlm_discard_bl_list(&rpc_list); - - RETURN(rc); + RETURN(LDLM_ITER_CONTINUE); } #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c index 2afed77ea5f70..0a423d5615b5b 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -93,7 +93,8 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include +#include +#include #include #include #include @@ -497,22 +498,14 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) ldlm_cli_pool_pop_slv(pl); spin_unlock(&pl->pl_lock); - /* - * Do not cancel locks in case lru resize is disabled for this ns. - */ - if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) - GOTO(out, ret = 0); - /* * In the time of canceling locks on client we do not need to maintain * sharp timing, we only want to cancel locks asap according to new SLV. * It may be called when SLV has changed much, this is why we do not * take into account pl->pl_recalc_time here. */ - ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, - LDLM_LRU_FLAG_LRUR); + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, 0); -out: spin_lock(&pl->pl_lock); /* * Time of LRU resizing might be longer than period, @@ -556,7 +549,7 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, if (nr == 0) return (unused / 100) * sysctl_vfs_cache_pressure; else - return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_LRU_FLAG_SHRINK); + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, 0); } static struct ldlm_pool_ops ldlm_srv_pool_ops = { @@ -574,7 +567,7 @@ static struct ldlm_pool_ops ldlm_cli_pool_ops = { * Pool recalc wrapper. Will call either client or server pool recalc callback * depending what pool \a pl is used. */ -int ldlm_pool_recalc(struct ldlm_pool *pl) +time64_t ldlm_pool_recalc(struct ldlm_pool *pl) { time64_t recalc_interval_sec; int count; @@ -694,7 +687,8 @@ static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) granted, limit); return 0; } -LPROC_SEQ_FOPS_RO(lprocfs_pool_state); + +LDEBUGFS_SEQ_FOPS_RO(lprocfs_pool_state); static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -778,11 +772,11 @@ static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) return err; } -static int ldlm_pool_proc_init(struct ldlm_pool *pl) +static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) { struct ldlm_namespace *ns = ldlm_pl2ns(pl); - struct proc_dir_entry *parent_ns_proc; - struct lprocfs_vars pool_vars[2]; + struct dentry *debugfs_ns_parent; + struct ldebugfs_vars pool_vars[2]; char *var_name = NULL; int rc = 0; ENTRY; @@ -791,18 +785,18 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) if (!var_name) RETURN(-ENOMEM); - parent_ns_proc = ns->ns_proc_dir_entry; - if (parent_ns_proc == NULL) { - CERROR("%s: proc entry is not initialized\n", + debugfs_ns_parent = ns->ns_debugfs_entry; + if (IS_ERR_OR_NULL(debugfs_ns_parent)) { + CERROR("%s: debugfs entry is not initialized\n", ldlm_ns_name(ns)); GOTO(out_free_name, rc = -EINVAL); } - pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, - NULL, NULL); - if (IS_ERR(pl->pl_proc_dir)) { - rc = PTR_ERR(pl->pl_proc_dir); - pl->pl_proc_dir = NULL; - CERROR("%s: cannot create 'pool' proc entry: rc = %d\n", + pl->pl_debugfs_entry = ldebugfs_register("pool", debugfs_ns_parent, + NULL, NULL); + if (IS_ERR(pl->pl_debugfs_entry)) { + rc = PTR_ERR(pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; + CERROR("%s: cannot create 'pool' debugfs entry: rc = %d\n", ldlm_ns_name(ns), rc); GOTO(out_free_name, rc); } @@ -811,7 +805,7 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) memset(pool_vars, 0, sizeof(pool_vars)); pool_vars[0].name = var_name; - ldlm_add_var(&pool_vars[0], pl->pl_proc_dir, "state", pl, + ldlm_add_var(&pool_vars[0], pl->pl_debugfs_entry, "state", pl, &lprocfs_pool_state_fops); 
pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - @@ -852,7 +846,8 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "recalc_timing", "sec"); - rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); + rc = ldebugfs_register_stats(pl->pl_debugfs_entry, "stats", + pl->pl_stats); EXIT; out_free_name: @@ -866,15 +861,15 @@ static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) wait_for_completion(&pl->pl_kobj_unregister); } -static void ldlm_pool_proc_fini(struct ldlm_pool *pl) +static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) { if (pl->pl_stats != NULL) { lprocfs_free_stats(&pl->pl_stats); pl->pl_stats = NULL; } - if (pl->pl_proc_dir != NULL) { - lprocfs_remove(&pl->pl_proc_dir); - pl->pl_proc_dir = NULL; + if (pl->pl_debugfs_entry != NULL) { + ldebugfs_remove(&pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; } } @@ -908,7 +903,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; } pl->pl_client_lock_volume = 0; - rc = ldlm_pool_proc_init(pl); + rc = ldlm_pool_debugfs_init(pl); if (rc) RETURN(rc); @@ -925,7 +920,7 @@ void ldlm_pool_fini(struct ldlm_pool *pl) { ENTRY; ldlm_pool_sysfs_fini(pl); - ldlm_pool_proc_fini(pl); + ldlm_pool_debugfs_fini(pl); /* * Pool should not be used after this point. We can't free it here as @@ -1070,10 +1065,8 @@ __u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) return atomic_read(&pl->pl_lock_volume_factor); } -static struct ptlrpc_thread *ldlm_pools_thread; static struct shrinker *ldlm_pools_srv_shrinker; static struct shrinker *ldlm_pools_cli_shrinker; -static struct completion ldlm_pools_comp; /* * count locks from all namespaces (if possible). Returns number of @@ -1241,108 +1234,35 @@ static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) #endif /* HAVE_SHRINKER_COUNT */ -int ldlm_pools_recalc(enum ldlm_side client) +static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) { - unsigned long nr_l = 0, nr_p = 0, l; struct ldlm_namespace *ns; struct ldlm_namespace *ns_old = NULL; - int nr, equal = 0; /* seconds of sleep if no active namespaces */ - int time = client ? LDLM_POOL_CLI_DEF_RECALC_PERIOD : - LDLM_POOL_SRV_DEF_RECALC_PERIOD; - - /* - * No need to setup pool limit for client pools. - */ - if (client == LDLM_NAMESPACE_SERVER) { + time64_t delay = side == LDLM_NAMESPACE_SERVER ? + LDLM_POOL_SRV_DEF_RECALC_PERIOD : + LDLM_POOL_CLI_DEF_RECALC_PERIOD; + int nr; + + /* Recalc at least ldlm_namespace_nr(side) namespaces. */ + for (nr = ldlm_namespace_nr_read(side); nr > 0; nr--) { + int skip; /* - * Check all modest namespaces first. + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock, which is really good as we + * get rid of potential deadlock on side nodes when canceling + * locks synchronously. */ - mutex_lock(ldlm_namespace_lock(client)); - list_for_each_entry(ns, ldlm_namespace_list(client), - ns_list_chain) - { - if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) - continue; - - l = ldlm_pool_granted(&ns->ns_pool); - if (l == 0) - l = 1; - - /* - * Set the modest pools limit equal to their avg granted - * locks + ~6%. - */ - l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); - ldlm_pool_setup(&ns->ns_pool, l); - nr_l += l; - nr_p++; - } - - /* - * Make sure that modest namespaces did not eat more that 2/3 - * of limit. 
- */ - if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { - CWARN("\"Modest\" pools eat out 2/3 of server locks " - "limit (%lu of %lu). This means that you have too " - "many clients for this amount of server RAM. " - "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); - equal = 1; - } - - /* - * The rest is given to greedy namespaces. - */ - list_for_each_entry(ns, ldlm_namespace_list(client), - ns_list_chain) - { - if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) - continue; - - if (equal) { - /* - * In the case 2/3 locks are eaten out by - * modest pools, we re-setup equal limit - * for _all_ pools. - */ - l = LDLM_POOL_HOST_L / - ldlm_namespace_nr_read(client); - } else { - /* - * All the rest of greedy pools will have - * all locks in equal parts. - */ - l = (LDLM_POOL_HOST_L - nr_l) / - (ldlm_namespace_nr_read(client) - - nr_p); - } - ldlm_pool_setup(&ns->ns_pool, l); - } - mutex_unlock(ldlm_namespace_lock(client)); - } - - /* - * Recalc at least ldlm_namespace_nr(client) namespaces. - */ - for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) { - int skip; - /* - * Lock the list, get first @ns in the list, getref, move it - * to the tail, unlock and call pool recalc. This way we avoid - * calling recalc under @ns lock what is really good as we get - * rid of potential deadlock on client nodes when canceling - * locks synchronously. - */ - mutex_lock(ldlm_namespace_lock(client)); - if (list_empty(ldlm_namespace_list(client))) { - mutex_unlock(ldlm_namespace_lock(client)); + mutex_lock(ldlm_namespace_lock(side)); + if (list_empty(ldlm_namespace_list(side))) { + mutex_unlock(ldlm_namespace_lock(side)); break; } - ns = ldlm_namespace_first_locked(client); + ns = ldlm_namespace_first_locked(side); if (ns_old == ns) { /* Full pass complete */ - mutex_unlock(ldlm_namespace_lock(client)); + mutex_unlock(ldlm_namespace_lock(side)); break; } @@ -1357,8 +1277,8 @@ int ldlm_pools_recalc(enum ldlm_side client) * there). */ if (ldlm_ns_empty(ns)) { - ldlm_namespace_move_to_inactive_locked(ns, client); - mutex_unlock(ldlm_namespace_lock(client)); + ldlm_namespace_move_to_inactive_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); continue; } @@ -1378,144 +1298,118 @@ int ldlm_pools_recalc(enum ldlm_side client) } spin_unlock(&ns->ns_lock); - ldlm_namespace_move_to_active_locked(ns, client); - mutex_unlock(ldlm_namespace_lock(client)); + ldlm_namespace_move_to_active_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); /* * After setup is done - recalc the pool. */ if (!skip) { - int ttime = ldlm_pool_recalc(&ns->ns_pool); - - if (ttime < time) - time = ttime; - + delay = min(delay, ldlm_pool_recalc(&ns->ns_pool)); ldlm_namespace_put(ns); } - } - - /* Wake up the blocking threads from time to time. 
*/ - ldlm_bl_thread_wakeup(); + } - return time; + return delay; } -static int ldlm_pools_thread_main(void *arg) -{ - struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; - int s_time, c_time; - ENTRY; +static void ldlm_pools_recalc_task(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ldlm_pools_recalc_work, ldlm_pools_recalc_task); - thread_set_flags(thread, SVC_RUNNING); - wake_up(&thread->t_ctl_waitq); - - CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", - "ldlm_poold", current_pid()); +static void ldlm_pools_recalc_task(struct work_struct *ws) +{ + /* seconds of sleep if no active namespaces */ + time64_t delay; +#ifdef HAVE_SERVER_SUPPORT + struct ldlm_namespace *ns; + unsigned long nr_l = 0, nr_p = 0, l; + int equal = 0; - while (1) { - struct l_wait_info lwi; + /* Check all modest namespaces first. */ + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; - /* - * Recal all pools on this tick. - */ - s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); - c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; /* - * Wait until the next check time, or until we're - * stopped. + * Set the modest pools limit equal to their avg granted + * locks + ~6%. */ - lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopping(thread) || - thread_is_event(thread), - &lwi); - - if (thread_test_and_clear_flags(thread, SVC_STOPPING)) - break; - else - thread_test_and_clear_flags(thread, SVC_EVENT); - } - - thread_set_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - - CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", - "ldlm_poold", current_pid()); - - complete_and_exit(&ldlm_pools_comp, 0); -} - -static int ldlm_pools_thread_start(void) -{ - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - ENTRY; - - if (ldlm_pools_thread != NULL) - RETURN(-EALREADY); - - OBD_ALLOC_PTR(ldlm_pools_thread); - if (ldlm_pools_thread == NULL) - RETURN(-ENOMEM); - - init_completion(&ldlm_pools_comp); - init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } - task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, - "ldlm_poold"); - if (IS_ERR(task)) { - CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); - OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); - ldlm_pools_thread = NULL; - RETURN(PTR_ERR(task)); + /* + * Make sure than modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("'Modest' pools eat out 2/3 of server locks " + "limit (%lu of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; } - l_wait_event(ldlm_pools_thread->t_ctl_waitq, - thread_is_running(ldlm_pools_thread), &lwi); - RETURN(0); -} -static void ldlm_pools_thread_stop(void) -{ - ENTRY; + /* The rest is given to greedy namespaces. 
*/ + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; - if (ldlm_pools_thread == NULL) { - EXIT; - return; + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); } + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); - thread_set_flags(ldlm_pools_thread, SVC_STOPPING); - wake_up(&ldlm_pools_thread->t_ctl_waitq); + delay = min(ldlm_pools_recalc_delay(LDLM_NAMESPACE_SERVER), + ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT)); +#else /* !HAVE_SERVER_SUPPORT */ + delay = ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT); +#endif /* HAVE_SERVER_SUPPORT */ - /* - * Make sure that pools thread is finished before freeing @thread. - * This fixes possible race and oops due to accessing freed memory - * in pools thread. - */ - wait_for_completion(&ldlm_pools_comp); - OBD_FREE_PTR(ldlm_pools_thread); - ldlm_pools_thread = NULL; - EXIT; + /* Wake up the blocking threads from time to time. */ + ldlm_bl_thread_wakeup(); + + schedule_delayed_work(&ldlm_pools_recalc_work, cfs_time_seconds(delay)); } int ldlm_pools_init(void) { - int rc; DEF_SHRINKER_VAR(shsvar, ldlm_pools_srv_shrink, ldlm_pools_srv_count, ldlm_pools_srv_scan); DEF_SHRINKER_VAR(shcvar, ldlm_pools_cli_shrink, ldlm_pools_cli_count, ldlm_pools_cli_scan); - ENTRY; - rc = ldlm_pools_thread_start(); - if (rc == 0) { - ldlm_pools_srv_shrinker = - set_shrinker(DEFAULT_SEEKS, &shsvar); - ldlm_pools_cli_shrinker = - set_shrinker(DEFAULT_SEEKS, &shcvar); - } - RETURN(rc); + schedule_delayed_work(&ldlm_pools_recalc_work, + LDLM_POOL_CLI_DEF_RECALC_PERIOD); + ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS, &shsvar); + ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS, &shcvar); + + return 0; } void ldlm_pools_fini(void) @@ -1528,7 +1422,7 @@ void ldlm_pools_fini(void) remove_shrinker(ldlm_pools_cli_shrinker); ldlm_pools_cli_shrinker = NULL; } - ldlm_pools_thread_stop(); + cancel_delayed_work_sync(&ldlm_pools_recalc_work); } #else /* !HAVE_LRU_RESIZE_SUPPORT */ @@ -1537,7 +1431,7 @@ int ldlm_pool_setup(struct ldlm_pool *pl, int limit) return 0; } -int ldlm_pool_recalc(struct ldlm_pool *pl) +time64_t ldlm_pool_recalc(struct ldlm_pool *pl) { return 0; } @@ -1614,8 +1508,4 @@ void ldlm_pools_fini(void) return; } -int ldlm_pools_recalc(enum ldlm_side client) -{ - return 0; -} #endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c index d15cff5fb27b6..0bc4df685525c 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,8 +57,7 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include - +#include #include #include #include @@ -68,6 +67,7 @@ unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; module_param(ldlm_enqueue_min, uint, 0644); MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); /* in client side, whether the cached locks will be canceled before replay */ unsigned int ldlm_cancel_unused_locks_before_replay = 1; @@ -121,16 +121,16 @@ int ldlm_expired_completion_wait(void *data) ENTRY; if (lock->l_conn_export == NULL) { - static cfs_time_t next_dump = 0, last_dump = 0; + static time64_t next_dump, last_dump; LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); " "not entering recovery in server code, just going back to sleep", (s64)lock->l_activity, (s64)(ktime_get_real_seconds() - lock->l_activity)); - if (cfs_time_after(cfs_time_current(), next_dump)) { + if (ktime_get_seconds() > next_dump) { last_dump = next_dump; - next_dump = cfs_time_shift(300); + next_dump = ktime_get_seconds() + 300; ldlm_namespace_dump(D_DLMTRACE, ldlm_lock_to_ns(lock)); if (last_dump == 0) @@ -150,6 +150,19 @@ int ldlm_expired_completion_wait(void *data) RETURN(0); } +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock) +{ + int ret = 0; + + check_res_locked(lock->l_resource); + if (ldlm_is_granted(lock) && !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + return ret; +} +EXPORT_SYMBOL(is_granted_or_cancelled_nolock); + /** * Calculate the Completion timeout (covering enqueue, BL AST, data flush, * lock cancel, and their replies). Used for lock completion timeout on the @@ -162,9 +175,9 @@ int ldlm_expired_completion_wait(void *data) /* We use the same basis for both server side and client side functions from a single node. */ -static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) +static time64_t ldlm_cp_timeout(struct ldlm_lock *lock) { - unsigned int timeout; + time64_t timeout; if (AT_OFF) return obd_timeout; @@ -173,7 +186,7 @@ static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) * lock from another client. Server will evict the other client if it * doesn't respond reasonably, and then give us the lock. */ timeout = at_get(ldlm_lock_to_ns_at(lock)); - return max(3 * timeout, ldlm_enqueue_min); + return max(3 * timeout, (time64_t) ldlm_enqueue_min); } /** @@ -221,9 +234,9 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) RETURN(ldlm_completion_tail(lock, data)); } - LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " - "going forward"); - ldlm_reprocess_all(lock->l_resource); + LDLM_DEBUG(lock, + "client-side enqueue returned a blocked lock, going forward"); + ldlm_reprocess_all(lock->l_resource, NULL); RETURN(0); } EXPORT_SYMBOL(ldlm_completion_ast_async); @@ -243,8 +256,6 @@ EXPORT_SYMBOL(ldlm_completion_ast_async); * * - to force all locks when resource is destroyed (cleanup_resource()); * - * - during lock conversion (not used currently). - * * If lock is not granted in the first case, this function waits until second * or penultimate cases happen in some other thread. 
* @@ -256,7 +267,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) struct obd_device *obd; struct obd_import *imp = NULL; struct l_wait_info lwi; - __u32 timeout; + time64_t timeout; int rc = 0; ENTRY; @@ -285,7 +296,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) timeout = ldlm_cp_timeout(lock); lwd.lwd_lock = lock; - lock->l_activity = cfs_time_current_sec(); + lock->l_activity = ktime_get_real_seconds(); if (ldlm_is_no_timeout(lock)) { LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); @@ -434,7 +445,8 @@ int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) /** * Enqueue a local lock (typically on a server). */ -int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_type type, union ldlm_policy_data *policy, enum ldlm_mode mode, __u64 *flags, @@ -467,6 +479,7 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, err = ldlm_lvbo_init(lock->l_resource); if (err < 0) { LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); + ldlm_lock_destroy_nolock(lock); GOTO(out, err); } @@ -491,15 +504,15 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, lock->l_req_extent = policy->l_extent; } - err = ldlm_lock_enqueue(ns, &lock, policy, flags); - if (unlikely(err != ELDLM_OK)) - GOTO(out, err); + err = ldlm_lock_enqueue(env, ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); - if (policy != NULL) - *policy = lock->l_policy_data; + if (policy != NULL) + *policy = lock->l_policy_data; - if (lock->l_completion_ast) - lock->l_completion_ast(lock, *flags, NULL); + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); EXIT; @@ -517,9 +530,8 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns, /* Set a flag to prevent us from sending a CANCEL (bug 407) */ lock_res_and_lock(lock); - /* Check that lock is not granted or failed, we might race. */ - if ((lock->l_req_mode != lock->l_granted_mode) && - !ldlm_is_failed(lock)) { + /* Check that lock is not granted or failed, we might race. */ + if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) { /* Make sure that this lock will not be found by raced * bl_ast and -EINVAL reply is sent to server anyways. * b=17645*/ @@ -566,12 +578,16 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, __u32 lvb_len, const struct lustre_handle *lockh, int rc) { - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - int is_replay = *flags & LDLM_FL_REPLAY; - struct ldlm_lock *lock; - struct ldlm_reply *reply; - int cleanup_phase = 1; - ENTRY; + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + const struct lu_env *env = NULL; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + ENTRY; + + if (req && req->rq_svc_thread) + env = req->rq_svc_thread->t_env; lock = ldlm_handle2lock(lockh); /* ldlm_cli_enqueue is holding a reference on this lock. 
*/ @@ -680,26 +696,27 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, &lock->l_policy_data); } - if (type != LDLM_PLAIN) - LDLM_DEBUG(lock,"client-side enqueue, new policy data"); - } + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); + } if ((*flags) & LDLM_FL_AST_SENT) { - lock_res_and_lock(lock); + lock_res_and_lock(lock); + ldlm_bl_desc2lock(&reply->lock_desc, lock); lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; - unlock_res_and_lock(lock); - LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); - } + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } - /* If the lock has already been granted by a completion AST, don't - * clobber the LVB with an older one. */ + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ if (lvb_len > 0) { /* We must lock or a racing completion might update lvb without * letting us know and we'll clobber the correct value. * Cannot unlock after the check either, a that still leaves * a tiny window for completion to get in */ lock_res_and_lock(lock); - if (lock->l_req_mode != lock->l_granted_mode) + if (!ldlm_is_granted(lock)) rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, lock->l_lvb_data, lvb_len); unlock_res_and_lock(lock); @@ -709,16 +726,16 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, } } - if (!is_replay) { - rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); - if (lock->l_completion_ast != NULL) { - int err = lock->l_completion_ast(lock, *flags, NULL); - if (!rc) - rc = err; - if (rc) - cleanup_phase = 1; - } - } + if (!is_replay) { + rc = ldlm_lock_enqueue(env, ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } if (lvb_len > 0 && lvb != NULL) { /* Copy the LVB here, and not earlier, because the completion @@ -790,8 +807,7 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, struct req_capsule *pill = &req->rq_pill; struct ldlm_request *dlm = NULL; struct list_head head = LIST_HEAD_INIT(head); - enum ldlm_lru_flags lru_flags; - int avail, to_free, pack = 0; + int avail, to_free = 0, pack = 0; int rc; ENTRY; @@ -802,10 +818,10 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, req_capsule_filled_sizes(pill, RCL_CLIENT); avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); - lru_flags = LDLM_LRU_FLAG_NO_WAIT | (ns_connect_lru_resize(ns) ? - LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED); - to_free = !ns_connect_lru_resize(ns) && - opc == LDLM_ENQUEUE ? 1 : 0; + /* If we have reached the limit, free +1 slot for the new one */ + if (!ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE && + ns->ns_nr_unused >= ns->ns_max_unused) + to_free = 1; /* Cancel LRU locks here _only_ if the server supports * EARLY_CANCEL. 
Otherwise we have to send extra CANCEL @@ -813,7 +829,7 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, if (avail > count) count += ldlm_cancel_lru_local(ns, cancels, to_free, avail - count, 0, - lru_flags); + LDLM_LRU_FLAG_NO_WAIT); if (avail > count) pack = count; else @@ -927,6 +943,10 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lvb_len, lvb_type); if (IS_ERR(lock)) RETURN(PTR_ERR(lock)); + + if (einfo->ei_cb_created) + einfo->ei_cb_created(lock); + /* for the local lock, add the reference */ ldlm_lock_addref_internal(lock, einfo->ei_mode); ldlm_lock2handle(lock, lockh); @@ -948,7 +968,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lock->l_export = NULL; lock->l_blocking_ast = einfo->ei_cb_bl; lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); - lock->l_activity = cfs_time_current_sec(); + lock->l_activity = ktime_get_real_seconds(); /* lock not sent to server yet */ if (reqp == NULL || *reqp == NULL) { @@ -972,12 +992,42 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); } + if (*flags & LDLM_FL_NDELAY) { + DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n"); + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value and handle ETIMEDOUT + * in osc_lock_upcall() correctly */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + /* Dump lock data into the request buffer */ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); ldlm_lock2desc(lock, &body->lock_desc); body->lock_flags = ldlm_flags_to_wire(*flags); body->lock_handle[0] = *lockh; + /* extended LDLM opcodes in client stats */ + if (exp->exp_obd->obd_svc_stats != NULL) { + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + + /* OST glimpse has no intent buffer */ + if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT, + RCL_CLIENT)) { + struct ldlm_intent *it; + + it = req_capsule_client_get(&req->rq_pill, + &RMF_LDLM_INTENT); + glimpse = (it && (it->opc == IT_GLIMPSE)); + } + + if (!glimpse) + ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats); + else + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + PTLRPC_LAST_CNTR + + LDLM_GLIMPSE_ENQUEUE); + } + if (async) { LASSERT(reqp != NULL); RETURN(0); @@ -1008,103 +1058,78 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, } EXPORT_SYMBOL(ldlm_cli_enqueue); -static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, - __u32 *flags) +/** + * Client-side IBITS lock convert. + * + * Inform server that lock has been converted instead of canceling. + * Server finishes convert on own side and does reprocess to grant + * all related waiting locks. + * + * Since convert means only ibits downgrading, client doesn't need to + * wait for server reply to finish local converting process so this request + * is made asynchronous. 
+ * + */ +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits) { - struct ldlm_resource *res; - int rc; - ENTRY; - if (ns_is_client(ldlm_lock_to_ns(lock))) { - CERROR("Trying to cancel local lock\n"); - LBUG(); - } - LDLM_DEBUG(lock, "client-side local convert"); + struct ldlm_request *body; + struct ptlrpc_request *req; + struct obd_export *exp = lock->l_conn_export; - res = ldlm_lock_convert(lock, new_mode, flags); - if (res) { - ldlm_reprocess_all(res); - rc = 0; - } else { - rc = LUSTRE_EDEADLK; - } - LDLM_DEBUG(lock, "client-side local convert handler END"); - LDLM_LOCK_PUT(lock); - RETURN(rc); -} + ENTRY; -/* FIXME: one of ldlm_cli_convert or the server side should reject attempted - * conversion of locks which are on the waiting or converting queue */ -/* Caller of this code is supposed to take care of lock readers/writers - accounting */ -int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, - __u32 *flags) -{ - struct ldlm_request *body; - struct ldlm_reply *reply; - struct ldlm_lock *lock; - struct ldlm_resource *res; - struct ptlrpc_request *req; - int rc; - ENTRY; + LASSERT(exp != NULL); - lock = ldlm_handle2lock(lockh); - if (!lock) { - LBUG(); - RETURN(-EINVAL); - } - *flags = 0; + /* this is better to check earlier and it is done so already, + * but this check is kept too as final one to issue an error + * if any new code will miss such check. + */ + if (!exp_connect_lock_convert(exp)) { + LDLM_ERROR(lock, "server doesn't support lock convert\n"); + RETURN(-EPROTO); + } - if (lock->l_conn_export == NULL) - RETURN(ldlm_cli_convert_local(lock, new_mode, flags)); + if (lock->l_resource->lr_type != LDLM_IBITS) { + LDLM_ERROR(lock, "convert works with IBITS locks only."); + RETURN(-EINVAL); + } - LDLM_DEBUG(lock, "client-side convert"); + LDLM_DEBUG(lock, "client-side convert"); - req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), - &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, - LDLM_CONVERT); - if (req == NULL) { - LDLM_LOCK_PUT(lock); - RETURN(-ENOMEM); - } + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) + RETURN(-ENOMEM); - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = lock->l_req_mode; + body->lock_desc.l_granted_mode = lock->l_granted_mode; + + body->lock_desc.l_policy_data.l_inodebits.bits = new_bits; + body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0; - body->lock_desc.l_req_mode = new_mode; body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_count = 1; + ptlrpc_request_set_replen(req); - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc != ELDLM_OK) - GOTO(out, rc); + /* + * Use cancel portals for convert as well as high-priority handling. + */ + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; - reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - if (reply == NULL) - GOTO(out, rc = -EPROTO); + ptlrpc_at_set_req_timeout(req); - if (req->rq_status) - GOTO(out, rc = req->rq_status); - - res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); - if (res != NULL) { - ldlm_reprocess_all(res); - /* Go to sleep until the lock is granted. */ - /* FIXME: or cancelled. 
*/ - if (lock->l_completion_ast) { - rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, - NULL); - if (rc) - GOTO(out, rc); - } - } else { - rc = LUSTRE_EDEADLK; - } - EXIT; - out: - LDLM_LOCK_PUT(lock); - ptlrpc_req_finished(req); - return rc; + if (exp->exp_obd->obd_svc_stats != NULL) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + ptlrpcd_add_req(req); + RETURN(0); } /** @@ -1122,9 +1147,12 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) if (lock->l_conn_export) { bool local_only; - LDLM_DEBUG(lock, "client-side cancel"); - /* Set this flag to prevent others from getting new references*/ - lock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); ldlm_set_cbpending(lock); local_only = !!(lock->l_flags & (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); @@ -1133,23 +1161,23 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) LDLM_FL_BL_AST : LDLM_FL_CANCELING; unlock_res_and_lock(lock); - if (local_only) { - CDEBUG(D_DLMTRACE, "not sending request (at caller's " - "instruction)\n"); - rc = LDLM_FL_LOCAL_ONLY; - } - ldlm_lock_cancel(lock); - } else { - if (ns_is_client(ldlm_lock_to_ns(lock))) { - LDLM_ERROR(lock, "Trying to cancel local lock"); - LBUG(); - } - LDLM_DEBUG(lock, "server-side local cancel"); - ldlm_lock_cancel(lock); - ldlm_reprocess_all(lock->l_resource); - } + if (local_only) { + CDEBUG(D_DLMTRACE, + "not sending request (at caller's instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource, lock); + } - RETURN(rc); + RETURN(rc); } /** @@ -1347,6 +1375,27 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) RETURN(0); } +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + int rc = -EINVAL; + + LASSERT(!lock->l_readers && !lock->l_writers); + LDLM_DEBUG(lock, "client lock convert START"); + + if (lock->l_resource->lr_type == LDLM_IBITS) { + lock_res_and_lock(lock); + do { + rc = ldlm_cli_inodebits_convert(lock, cancel_flags); + } while (rc == -EAGAIN); + unlock_res_and_lock(lock); + } + + LDLM_DEBUG(lock, "client lock convert END"); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_convert); + /** * Client side lock cancel. * @@ -1356,12 +1405,12 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, enum ldlm_cancel_flags cancel_flags) { struct obd_export *exp; - enum ldlm_lru_flags lru_flags; int avail, count = 1; __u64 rc = 0; struct ldlm_namespace *ns; struct ldlm_lock *lock; struct list_head cancels = LIST_HEAD_INIT(cancels); + ENTRY; lock = ldlm_handle2lock_long(lockh, 0); @@ -1371,6 +1420,8 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, } lock_res_and_lock(lock); + LASSERT(!ldlm_is_converting(lock)); + /* Lock is being canceled and the caller doesn't want to wait */ if (ldlm_is_canceling(lock)) { if (cancel_flags & LCF_ASYNC) { @@ -1407,10 +1458,8 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, LASSERT(avail > 0); ns = ldlm_lock_to_ns(lock); - lru_flags = ns_connect_lru_resize(ns) ? 
- LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED; count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, - LCF_BL_AST, lru_flags); + LCF_BL_AST, 0); } ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); RETURN(0); @@ -1473,11 +1522,11 @@ int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, */ static enum ldlm_policy_res ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count) + int added, int min) { enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK; - /* don't check added & count since we want to process all locks + /* don't check @added & @min since we want to process all locks * from unused list. * It's fine to not take lock to access lock->l_resource since * the lock has already been granted so it won't change. */ @@ -1486,7 +1535,7 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, case LDLM_IBITS: if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) break; - /* Fall through */ + /* fallthrough */ default: result = LDLM_POLICY_SKIP_LOCK; break; @@ -1497,8 +1546,8 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, /** * Callback function for LRU-resize policy. Decides whether to keep - * \a lock in LRU for current \a LRU size \a unused, added in current - * scan \a added and number of locks to be preferably canceled \a count. + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. * * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning * @@ -1506,32 +1555,29 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, */ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, - int count) + int added, int min) { ktime_t cur = ktime_get(); struct ldlm_pool *pl = &ns->ns_pool; u64 slv, lvf, lv; s64 la; - /* Stop LRU processing when we reach past @count or have checked all - * locks in LRU. */ - if (count && added >= count) - return LDLM_POLICY_KEEP_LOCK; + if (added < min) + return LDLM_POLICY_CANCEL_LOCK; /* Despite of the LV, It doesn't make sense to keep the lock which * is unused for ns_max_age time. */ - if (ktime_after(ktime_get(), - ktime_add(lock->l_last_used, ns->ns_max_age))) + if (ktime_after(cur, ktime_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_CANCEL_LOCK; slv = ldlm_pool_get_slv(pl); lvf = ldlm_pool_get_lvf(pl); - la = ktime_to_ns(ktime_sub(cur, lock->l_last_used)) / NSEC_PER_SEC; - lv = lvf * la * unused; + la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)), + NSEC_PER_SEC); + lv = lvf * la * ns->ns_nr_unused; - /* Inform pool about current CLV to see it via proc. */ + /* Inform pool about current CLV to see it via debugfs. */ ldlm_pool_set_clv(pl, lv); /* Stop when SLV is not yet come from server or lv is smaller than @@ -1545,42 +1591,21 @@ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, static enum ldlm_policy_res ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, - int count) + int added, int min) { enum ldlm_policy_res result; - result = ldlm_cancel_lrur_policy(ns, lock, unused, added, count); + result = ldlm_cancel_lrur_policy(ns, lock, added, min); if (result == LDLM_POLICY_KEEP_LOCK) return result; - return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); -} - -/** - * Callback function for proc used policy. 
Makes decision whether to keep - * \a lock in LRU for current \a LRU size \a unused, added in current scan \a - * added and number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - /* Stop LRU processing when we reach past @count or have checked all - * locks in LRU. */ - return (added >= count) ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; + return ldlm_cancel_no_wait_policy(ns, lock, added, min); } /** - * Callback function for aged policy. Makes decision whether to keep \a lock in - * LRU for current LRU size \a unused, added in current scan \a added and - * number of locks to be preferably canceled \a count. + * Callback function for aged policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. * * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning * @@ -1588,10 +1613,9 @@ static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, */ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, - int count) + int added, int min) { - if ((added >= count) && + if ((added >= min) && ktime_before(ktime_get(), ktime_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_KEEP_LOCK; @@ -1602,76 +1626,43 @@ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, static enum ldlm_policy_res ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count) + int added, int min) { enum ldlm_policy_res result; - result = ldlm_cancel_aged_policy(ns, lock, unused, added, count); + result = ldlm_cancel_aged_policy(ns, lock, added, min); if (result == LDLM_POLICY_KEEP_LOCK) return result; - return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); -} - -/** - * Callback function for default policy. Makes decision whether to keep \a lock - * in LRU for current LRU size \a unused, added in current scan \a added and - * number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static -enum ldlm_policy_res ldlm_cancel_default_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - /* Stop LRU processing when we reach past count or have checked all - * locks in LRU. */ - return (added >= count) ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; + return ldlm_cancel_no_wait_policy(ns, lock, added, min); } typedef enum ldlm_policy_res (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count); + int added, int min); static ldlm_cancel_lru_policy_t ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) { if (ns_connect_lru_resize(ns)) { - if (lru_flags & LDLM_LRU_FLAG_SHRINK) - /* We kill passed number of old locks. 
*/ - return ldlm_cancel_passed_policy; - if (lru_flags & LDLM_LRU_FLAG_LRUR) { - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_lrur_no_wait_policy; - else - return ldlm_cancel_lrur_policy; - } - if (lru_flags & LDLM_LRU_FLAG_PASSED) - return ldlm_cancel_passed_policy; + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; } else { - if (lru_flags & LDLM_LRU_FLAG_AGED) { - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_aged_no_wait_policy; - else - return ldlm_cancel_aged_policy; - } + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; } - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_no_wait_policy; - - return ldlm_cancel_default_policy; } /** - * - Free space in LRU for \a count new locks, + * - Free space in LRU for \a min new locks, * redundant unused locks are canceled locally; * - also cancel locally unused aged locks; * - do not cancel more than \a max locks; + * - if some locks are cancelled, try to cancel at least \a batch locks * - GET the found locks and add them into the \a cancels list. * * A client lock can be added to the l_bl_ast list only when it is @@ -1682,30 +1673,22 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed * later without any special locking. * - * Calling policies for enabled LRU resize: - * ---------------------------------------- - * flags & LDLM_LRU_FLAG_LRUR - use LRU resize policy (SLV from server) to - * cancel not more than \a count locks; - * - * flags & LDLM_LRU_FLAG_PASSED - cancel \a count number of old locks (located - * at the beginning of LRU list); + * Locks are cancelled according to the LRU resize policy (SLV from server) + * if LRU resize is enabled; otherwise, the "aged policy" is used; * - * flags & LDLM_LRU_FLAG_SHRINK - cancel not more than \a count locks according - * to memory pressre policy function; - * - * flags & LDLM_LRU_FLAG_AGED - cancel \a count locks according to "aged policy" + * LRU flags: + * ---------------------------------------- * - * flags & LDLM_LRU_FLAG_NO_WAIT - cancel as many unused locks as possible - * (typically before replaying locks) w/o - * sending any RPCs or waiting for any - * outstanding RPC to complete. + * flags & LDLM_LRU_FLAG_NO_WAIT - cancel locks w/o sending any RPCs or waiting + * for any outstanding RPC to complete. * * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for - * other read locks covering the same pages, just - * discard those pages. + * other read locks covering the same pages, just + * discard those pages. */ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, + struct list_head *cancels, + int min, int max, int batch, enum ldlm_lru_flags lru_flags) { ldlm_cancel_lru_policy_t pf; @@ -1714,8 +1697,26 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, ENTRY; + /* + * Let only 1 thread to proceed. However, not for those which have the + * @max limit given (ELC), as LRU may be left not cleaned up in full. 
+ */ + if (max == 0) { + if (test_and_set_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + } else if (test_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + + LASSERT(ergo(max, min <= max)); + /* No sense to give @batch for ELC */ + LASSERT(ergo(max, batch == 0)); + if (!ns_connect_lru_resize(ns)) - count += ns->ns_nr_unused - ns->ns_max_unused; + min = max_t(int, min, ns->ns_nr_unused - ns->ns_max_unused); + + /* If at least 1 lock is to be cancelled, cancel at least @batch locks */ + if (min && min < batch) + min = batch; pf = ldlm_cancel_lru_policy(ns, lru_flags); LASSERT(pf != NULL); @@ -1768,7 +1769,7 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, * old locks, but additionally choose them by * their weight. Big extent locks will stay in * the cache. */ - result = pf(ns, lock, ns->ns_nr_unused, added, count); + result = pf(ns, lock, added, min); if (result == LDLM_POLICY_KEEP_LOCK) { lu_ref_del(&lock->l_reference, __func__, current); LDLM_LOCK_RELEASE(lock); @@ -1777,7 +1778,6 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, if (result == LDLM_POLICY_SKIP_LOCK) { lu_ref_del(&lock->l_reference, __func__, current); - LDLM_LOCK_RELEASE(lock); if (no_wait) { spin_lock(&ns->ns_lock); if (!list_empty(&lock->l_lru) && @@ -1785,6 +1785,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, ns->ns_last_pos = &lock->l_lru; spin_unlock(&ns->ns_lock); } + + LDLM_LOCK_RELEASE(lock); continue; } @@ -1821,8 +1823,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) && - lock->l_resource->lr_type == LDLM_EXTENT && - lock->l_granted_mode == LCK_PR) + (lock->l_resource->lr_type == LDLM_EXTENT || + ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR) ldlm_set_discard_data(lock); /* We can't re-add to l_lru as it confuses the @@ -1836,18 +1838,25 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, unlock_res_and_lock(lock); lu_ref_del(&lock->l_reference, __FUNCTION__, current); added++; + /* Once a lock added, batch the requested amount */ + if (min == 0) + min = batch; } + + if (max == 0) + clear_bit(LDLM_LRU_CANCEL, &ns->ns_flags); + RETURN(added); } int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, - int count, int max, + int min, int max, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags) { int added; - added = ldlm_prepare_lru_list(ns, cancels, count, max, lru_flags); + added = ldlm_prepare_lru_list(ns, cancels, min, max, 0, lru_flags); if (added <= 0) return added; @@ -1855,14 +1864,14 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, } /** - * Cancel at least \a nr locks from given namespace LRU. + * Cancel at least \a min locks from given namespace LRU. * * When called with LCF_ASYNC the blocking callback will be handled * in a thread and this function will return after the thread has been * asked to call the callback. When called with LCF_ASYNC the blocking * callback will be performed in this function. */ -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags) { @@ -1872,7 +1881,7 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, /* Just prepare the list of locks, do not actually cancel them yet. * Locks are cancelled later in a separate thread. 
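ldlm_prepare_lru_list now works in terms of a minimum, a maximum and a batch size rather than a single count, and the hunk above normalizes those before the scan: without LRU resize the minimum is widened to drain everything above ns_max_unused, and a non-zero minimum is rounded up to the batch size so that once cancellation starts at least one batch goes out. Here is a standalone sketch of that normalization, with the namespace reduced to two integers and assumed names throughout.

#include <assert.h>
#include <stdbool.h>

struct ns_counters {
        int nr_unused;          /* stand-in for ns_nr_unused  */
        int max_unused;         /* stand-in for ns_max_unused */
};

static int max_int(int a, int b) { return a > b ? a : b; }

/* Mirror the adjustments made before the LRU walk: drain the overflow when
 * LRU resize is off, and never cancel fewer than one batch once we cancel
 * anything at all.  ELC callers pass a max and therefore no batch. */
static int normalize_min(const struct ns_counters *ns, bool lru_resize,
                         int min, int max, int batch)
{
        assert(max == 0 || min <= max);
        assert(max == 0 || batch == 0);

        if (!lru_resize)
                min = max_int(min, ns->nr_unused - ns->max_unused);
        if (min && min < batch)
                min = batch;
        return min;
}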
*/ - count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, lru_flags); + count = ldlm_prepare_lru_list(ns, &cancels, min, 0, 0, lru_flags); rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); if (rc == 0) RETURN(count); @@ -1894,47 +1903,50 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res, { struct ldlm_lock *lock; int count = 0; + ENTRY; lock_res(res); list_for_each_entry(lock, &res->lr_granted, l_res_link) { - if (opaque != NULL && lock->l_ast_data != opaque) { - LDLM_ERROR(lock, "data %p doesn't match opaque %p", - lock->l_ast_data, opaque); - //LBUG(); - continue; - } + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } - if (lock->l_readers || lock->l_writers) - continue; + if (lock->l_readers || lock->l_writers) + continue; - /* If somebody is already doing CANCEL, or blocking AST came, - * skip this lock. */ + /* + * If somebody is already doing CANCEL, or blocking AST came + * then skip this lock. + */ if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) continue; - if (lockmode_compat(lock->l_granted_mode, mode)) - continue; + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; - /* If policy is given and this is IBITS lock, add to list only - * those locks that match by policy. */ - if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && - !(lock->l_policy_data.l_inodebits.bits & - policy->l_inodebits.bits)) - continue; + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. + * Skip locks with DoM bit always to don't flush data. + */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + (!(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits) || ldlm_has_dom(lock))) + continue; /* See CBPENDING comment in ldlm_cancel_lru */ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | lock_flags; - LASSERT(list_empty(&lock->l_bl_ast)); list_add(&lock->l_bl_ast, cancels); - LDLM_LOCK_GET(lock); - count++; - } - unlock_res(res); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); - RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); } EXPORT_SYMBOL(ldlm_cancel_resource_local); @@ -2088,41 +2100,34 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, /* Lock iterators. 
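The rewritten ldlm_cancel_resource_local skips a granted lock when it is still referenced, already being cancelled, compatible with the requested mode, or, for an inodebits resource, when it either falls outside the requested bit mask or carries the Data-on-MDT bit. A hedged sketch of that filter as a pure predicate follows; the field names below are invented for the example and do not match the kernel structures.

#include <stdbool.h>
#include <stdint.h>

struct lock_view {
        int      readers, writers;       /* active references           */
        bool     cancel_in_progress;     /* BL AST seen or canceling    */
        bool     mode_compatible;        /* compatible with caller mode */
        bool     is_ibits;               /* inodebits resource          */
        uint64_t ibits;                  /* granted inodebits           */
        bool     has_dom;                /* Data-on-MDT bit granted     */
};

/* Return true when the lock should be left alone rather than queued for
 * local cancellation. */
static bool skip_for_local_cancel(const struct lock_view *lk,
                                  bool have_policy, uint64_t policy_bits)
{
        if (lk->readers || lk->writers)
                return true;
        if (lk->cancel_in_progress)
                return true;
        if (lk->mode_compatible)
                return true;
        if (have_policy && lk->is_ibits &&
            (!(lk->ibits & policy_bits) || lk->has_dom))
                return true;
        return false;
}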
*/ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, - void *closure) + void *closure) { struct list_head *tmp, *next; - struct ldlm_lock *lock; - int rc = LDLM_ITER_CONTINUE; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; - ENTRY; + ENTRY; - if (!res) - RETURN(LDLM_ITER_CONTINUE); + if (!res) + RETURN(LDLM_ITER_CONTINUE); - lock_res(res); + lock_res(res); list_for_each_safe(tmp, next, &res->lr_granted) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } - - list_for_each_safe(tmp, next, &res->lr_converting) { - lock = list_entry(tmp, struct ldlm_lock, l_res_link); - - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } list_for_each_safe(tmp, next, &res->lr_waiting) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } - out: - unlock_res(res); - RETURN(rc); + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } +out: + unlock_res(res); + RETURN(rc); } struct iter_helper_data { @@ -2216,6 +2221,8 @@ static int replay_lock_interpret(const struct lu_env *env, ENTRY; atomic_dec(&req->rq_import->imp_replay_inflight); + wake_up(&req->rq_import->imp_replay_waitq); + if (rc != ELDLM_OK) GOTO(out, rc); @@ -2281,28 +2288,23 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) RETURN(0); } - /* - * If granted mode matches the requested mode, this lock is granted. - * - * If they differ, but we have a granted mode, then we were granted - * one mode and now want another: ergo, converting. - * - * If we haven't been granted anything and are on a resource list, - * then we're blocked/waiting. - * - * If we haven't been granted anything and we're NOT on a resource list, - * then we haven't got a reply yet and don't have a known disposition. - * This happens whenever a lock enqueue is the request that triggers - * recovery. - */ - if (lock->l_granted_mode == lock->l_req_mode) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; - else if (lock->l_granted_mode) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + /* + * If granted mode matches the requested mode, this lock is granted. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (ldlm_is_granted(lock)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; else if (!list_empty(&lock->l_res_link)) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; - else - flags = LDLM_FL_REPLAY; + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, LDLM_ENQUEUE); @@ -2311,6 +2313,8 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) /* We're part of recovery, so don't wait for it. 
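replay_one_lock now derives the replay disposition purely from ldlm_is_granted() and resource-list membership: granted locks replay as granted, queued but ungranted locks replay as blocked, and a lock with no reply yet replays with no disposition. A compact model of that decision follows; the flag values and parameters are placeholders, not the wire constants.

#define FL_REPLAY        0x1u
#define FL_BLOCK_GRANTED 0x2u
#define FL_BLOCK_WAIT    0x4u

/* Granted -> replay as granted; not granted but linked on a resource ->
 * replay as blocked/waiting; otherwise the original enqueue never got a
 * reply, so replay carries no disposition at all. */
static unsigned int replay_flags(int granted, int on_resource_list)
{
        if (granted)
                return FL_REPLAY | FL_BLOCK_GRANTED;
        if (on_resource_list)
                return FL_REPLAY | FL_BLOCK_WAIT;
        return FL_REPLAY;
}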
*/ req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + /* If the state changed while we were prepared, don't wait */ + req->rq_no_delay = 1; body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); ldlm_lock2desc(lock, &body->lock_desc); @@ -2369,7 +2373,20 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) canceled, ldlm_ns_name(ns)); } -int ldlm_replay_locks(struct obd_import *imp) +static int lock_can_replay(struct obd_import *imp) +{ + struct client_obd *cli = &imp->imp_obd->u.cli; + + CDEBUG(D_HA, "check lock replay limit, inflights = %u(%u)\n", + atomic_read(&imp->imp_replay_inflight) - 1, + cli->cl_max_rpcs_in_flight); + + /* +1 due to ldlm_lock_replay() increment */ + return atomic_read(&imp->imp_replay_inflight) < + 1 + min_t(u32, cli->cl_max_rpcs_in_flight, 8); +} + +int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) { struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; struct list_head list = LIST_HEAD_INIT(list); @@ -2378,15 +2395,12 @@ int ldlm_replay_locks(struct obd_import *imp) ENTRY; - LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + LASSERT(atomic_read(&imp->imp_replay_inflight) == 1); /* don't replay locks if import failed recovery */ if (imp->imp_vbr_failed) RETURN(0); - /* ensure this doesn't fall to 0 before all have been queued */ - atomic_inc(&imp->imp_replay_inflight); - if (ldlm_cancel_unused_locks_before_replay) ldlm_cancel_unused_locks_for_replay(ns); @@ -2394,15 +2408,64 @@ int ldlm_replay_locks(struct obd_import *imp) list_for_each_entry_safe(lock, next, &list, l_pending_chain) { list_del_init(&lock->l_pending_chain); - if (rc) { + /* If we disconnected in the middle - cleanup and let + * reconnection to happen again. LU-14027 */ + if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) { LDLM_LOCK_RELEASE(lock); - continue; /* or try to do the rest? */ + continue; } rc = replay_one_lock(imp, lock); LDLM_LOCK_RELEASE(lock); + + if (rate_limit) + wait_event_idle_exclusive(imp->imp_replay_waitq, + lock_can_replay(imp)); } + RETURN(rc); +} + +/** + * Lock replay uses rate control and can sleep waiting so + * must be in separate thread from ptlrpcd itself + */ +static int ldlm_lock_replay_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + + CDEBUG(D_HA, "lock replay thread %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + __ldlm_replay_locks(imp, true); atomic_dec(&imp->imp_replay_inflight); + ptlrpc_import_recovery_state_machine(imp); + class_import_put(imp); - RETURN(rc); + return 0; +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct task_struct *task; + int rc = 0; + + class_import_get(imp); + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + + task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CDEBUG(D_HA, "can't start lock replay thread: rc = %d\n", rc); + + /* run lock replay without rate control */ + rc = __ldlm_replay_locks(imp, false); + atomic_dec(&imp->imp_replay_inflight); + class_import_put(imp); + } + + return rc; } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 042633867837b..8b36f70af7f56 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. 
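The patch moves lock replay out of ptlrpcd into its own kthread precisely so it can sleep: between replays the thread waits on imp_replay_waitq until the number of in-flight replay RPCs drops below min(cl_max_rpcs_in_flight, 8) plus the reference the replay path itself holds, and if the thread cannot be started the old unthrottled path is used as a fallback. A sketch of just the throttle test, with plain integers standing in for the import fields:

/* The "+ 1" accounts for the extra imp_replay_inflight reference taken by
 * the replay path before any RPC is sent. */
static int can_replay_more(unsigned int inflight, unsigned int max_rpcs)
{
        unsigned int limit = max_rpcs < 8 ? max_rpcs : 8;

        return inflight < 1 + limit;
}

In the kernel this is the condition the replay thread re-evaluates each time replay_lock_interpret() wakes the waitqueue.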
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +43,7 @@ struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; struct kmem_cache *ldlm_interval_tree_slab; +struct kmem_cache *ldlm_inodebits_slab; int ldlm_srv_namespace_nr = 0; int ldlm_cli_namespace_nr = 0; @@ -58,26 +59,45 @@ LIST_HEAD(ldlm_cli_active_namespace_list); /* Client namespaces that don't have any locks in them */ LIST_HEAD(ldlm_cli_inactive_namespace_list); -static struct proc_dir_entry *ldlm_type_proc_dir; -static struct proc_dir_entry *ldlm_ns_proc_dir; -struct proc_dir_entry *ldlm_svc_proc_dir; +static struct dentry *ldlm_debugfs_dir; +static struct dentry *ldlm_ns_debugfs_dir; +struct dentry *ldlm_svc_debugfs_dir; /* during debug dump certain amount of granted locks for one resource to avoid * DDOS. */ static unsigned int ldlm_dump_granted_max = 256; -#ifdef CONFIG_PROC_FS -static ssize_t -lprocfs_dump_ns_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ldebugfs_dump_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); RETURN(count); } -LPROC_SEQ_FOPS_WO_TYPE(ldlm, dump_ns); -LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); +LDEBUGFS_FOPS_WR_ONLY(ldlm, dump_ns); + +static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", *(unsigned int *)m->private); + return 0; +} + +static ssize_t +ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + if (!count) + return 0; + + return kstrtouint_from_user(buffer, count, 0, + (unsigned int *)seq->private); +} + +LDEBUGFS_SEQ_FOPS(ldlm_rw_uint); #ifdef HAVE_SERVER_SUPPORT @@ -97,7 +117,7 @@ static ssize_t seq_watermark_write(struct file *file, bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; int rc; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, &value, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &value, 'M'); if (rc) { CERROR("Failed to set %s, rc = %d.\n", wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb", @@ -144,7 +164,7 @@ static ssize_t seq_watermark_write(struct file *file, static int seq_watermark_open(struct inode *inode, struct file *file) { - return single_open(file, seq_watermark_show, PDE_DATA(inode)); + return single_open(file, seq_watermark_show, inode->i_private); } static const struct file_operations ldlm_watermark_fops = { @@ -165,7 +185,7 @@ static int seq_granted_show(struct seq_file *m, void *data) static int seq_granted_open(struct inode *inode, struct file *file) { - return single_open(file, seq_granted_show, PDE_DATA(inode)); + return single_open(file, seq_granted_show, inode->i_private); } static const struct file_operations ldlm_granted_fops = { @@ -178,59 +198,62 @@ static const struct file_operations ldlm_granted_fops = { #endif /* HAVE_SERVER_SUPPORT */ -int ldlm_proc_setup(void) -{ - int rc; - struct lprocfs_vars list[] = { - { .name = "dump_namespaces", - .fops = &ldlm_dump_ns_fops, - .proc_mode = 0222 }, - { .name = "dump_granted_max", - .fops = &ldlm_rw_uint_fops, - .data = &ldlm_dump_granted_max }, +static struct ldebugfs_vars ldlm_debugfs_list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, #ifdef HAVE_SERVER_SUPPORT - { .name = "lock_reclaim_threshold_mb", - .fops = &ldlm_watermark_fops, - .data = &ldlm_reclaim_threshold_mb }, - { .name = "lock_limit_mb", - .fops = &ldlm_watermark_fops, - .data = &ldlm_lock_limit_mb }, - { .name = "lock_granted_count", - .fops = &ldlm_granted_fops, - .data = &ldlm_granted_total }, + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, #endif - { NULL }}; - ENTRY; - LASSERT(ldlm_ns_proc_dir == NULL); + { NULL } +}; - ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, - proc_lustre_root, - NULL, NULL); - if (IS_ERR(ldlm_type_proc_dir)) { - CERROR("LProcFS failed in ldlm-init\n"); - rc = PTR_ERR(ldlm_type_proc_dir); +int ldlm_debugfs_setup(void) +{ + int rc; + + ENTRY; + ldlm_debugfs_dir = ldebugfs_register(OBD_LDLM_DEVICENAME, + debugfs_lustre_root, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_debugfs_dir)) { + CERROR("LDebugFS failed in ldlm-init\n"); + rc = ldlm_debugfs_dir ? PTR_ERR(ldlm_debugfs_dir) : -ENOMEM; GOTO(err, rc); } - ldlm_ns_proc_dir = lprocfs_register("namespaces", - ldlm_type_proc_dir, - NULL, NULL); - if (IS_ERR(ldlm_ns_proc_dir)) { + ldlm_ns_debugfs_dir = ldebugfs_register("namespaces", + ldlm_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) { CERROR("LProcFS failed in ldlm-init\n"); - rc = PTR_ERR(ldlm_ns_proc_dir); + rc = ldlm_ns_debugfs_dir ? PTR_ERR(ldlm_ns_debugfs_dir) + : -ENOMEM; GOTO(err_type, rc); } - ldlm_svc_proc_dir = lprocfs_register("services", - ldlm_type_proc_dir, - NULL, NULL); - if (IS_ERR(ldlm_svc_proc_dir)) { + ldlm_svc_debugfs_dir = ldebugfs_register("services", + ldlm_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) { CERROR("LProcFS failed in ldlm-init\n"); - rc = PTR_ERR(ldlm_svc_proc_dir); + rc = ldlm_svc_debugfs_dir ? 
PTR_ERR(ldlm_svc_debugfs_dir) + : -ENOMEM; GOTO(err_ns, rc); } - rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); + rc = ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); if (rc != 0) { CERROR("LProcFS failed in ldlm-init\n"); GOTO(err_svc, rc); @@ -239,26 +262,32 @@ int ldlm_proc_setup(void) RETURN(0); err_svc: - lprocfs_remove(&ldlm_svc_proc_dir); + ldebugfs_remove(&ldlm_svc_debugfs_dir); err_ns: - lprocfs_remove(&ldlm_ns_proc_dir); + ldebugfs_remove(&ldlm_ns_debugfs_dir); err_type: - lprocfs_remove(&ldlm_type_proc_dir); + ldebugfs_remove(&ldlm_debugfs_dir); err: - ldlm_svc_proc_dir = NULL; - RETURN(rc); + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; + RETURN(rc); } -void ldlm_proc_cleanup(void) +void ldlm_debugfs_cleanup(void) { - if (ldlm_svc_proc_dir) - lprocfs_remove(&ldlm_svc_proc_dir); + if (!IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) + ldebugfs_remove(&ldlm_svc_debugfs_dir); - if (ldlm_ns_proc_dir) - lprocfs_remove(&ldlm_ns_proc_dir); + if (!IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) + ldebugfs_remove(&ldlm_ns_debugfs_dir); - if (ldlm_type_proc_dir) - lprocfs_remove(&ldlm_type_proc_dir); + if (!IS_ERR_OR_NULL(ldlm_debugfs_dir)) + ldebugfs_remove(&ldlm_debugfs_dir); + + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; } static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, @@ -326,18 +355,8 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, CDEBUG(D_DLMTRACE, "dropping all unused locks from namespace %s\n", ldlm_ns_name(ns)); - if (ns_connect_lru_resize(ns)) { - /* Try to cancel all @ns_nr_unused locks. */ - ldlm_cancel_lru(ns, ns->ns_nr_unused, 0, - LDLM_LRU_FLAG_PASSED | - LDLM_LRU_FLAG_CLEANUP); - } else { - tmp = ns->ns_max_unused; - ns->ns_max_unused = 0; - ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED | - LDLM_LRU_FLAG_CLEANUP); - ns->ns_max_unused = tmp; - } + /* Try to cancel all @ns_nr_unused locks. */ + ldlm_cancel_lru(ns, INT_MAX, 0, LDLM_LRU_FLAG_CLEANUP); return count; } @@ -360,7 +379,6 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, "changing namespace %s unused locks from %u to %u\n", ldlm_ns_name(ns), ns->ns_nr_unused, (unsigned int)tmp); - ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); if (!lru_resize) { CDEBUG(D_DLMTRACE, @@ -368,13 +386,12 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, ldlm_ns_name(ns)); ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; } + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, 0); } else { CDEBUG(D_DLMTRACE, "changing namespace %s max_unused from %u to %u\n", ldlm_ns_name(ns), ns->ns_max_unused, (unsigned int)tmp); - ns->ns_max_unused = (unsigned int)tmp; - ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); /* Make sure that LRU resize was originally supported before * turning it on here. @@ -386,6 +403,8 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, ldlm_ns_name(ns)); ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; } + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); } return count; @@ -409,7 +428,6 @@ static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, int scale = NSEC_PER_MSEC; unsigned long long tmp; char *buf; - int err; /* Did the user ask in seconds or milliseconds. 
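The procfs-to-debugfs conversion keeps the staged setup shape: every registration is checked with IS_ERR_OR_NULL(), a failure unwinds through goto labels in reverse creation order, and the cleanup function NULLs the saved entries so it is safe even after a partial setup. Below is a userspace sketch of that discipline with malloc standing in for directory creation; all names here are made up.

#include <stdlib.h>

static void *dir_root, *dir_ns, *dir_svc;

static void *fake_mkdir(const char *name) { (void)name; return malloc(1); }
static void  fake_rmdir(void **d)         { free(*d); *d = NULL; }

/* Create three nested "directories"; on failure tear down only what was
 * already created, in reverse order, and leave every pointer NULL. */
static int setup(void)
{
        dir_root = fake_mkdir("ldlm");
        if (!dir_root)
                goto err;
        dir_ns = fake_mkdir("namespaces");
        if (!dir_ns)
                goto err_root;
        dir_svc = fake_mkdir("services");
        if (!dir_svc)
                goto err_ns;
        return 0;

err_ns:
        fake_rmdir(&dir_ns);
err_root:
        fake_rmdir(&dir_root);
err:
        return -1;
}

/* NULL-safe teardown that can run after either a full or a failed setup. */
static void cleanup(void)
{
        fake_rmdir(&dir_svc);
        fake_rmdir(&dir_ns);
        fake_rmdir(&dir_root);
}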
Default is in ms */ buf = strstr(buffer, "ms"); @@ -422,8 +440,7 @@ static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, if (buf) *buf = '\0'; - err = kstrtoull(buffer, 10, &tmp); - if (err != 0) + if (kstrtoull(buffer, 10, &tmp)) return -EINVAL; ns->ns_max_age = ktime_set(0, tmp * scale); @@ -464,6 +481,32 @@ static ssize_t early_lock_cancel_store(struct kobject *kobj, } LUSTRE_RW_ATTR(early_lock_cancel); +static ssize_t dirty_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%llu\n", ns->ns_dirty_age_limit); +} + +static ssize_t dirty_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_dirty_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(dirty_age_limit); + #ifdef HAVE_SERVER_SUPPORT static ssize_t ctime_age_limit_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -471,7 +514,7 @@ static ssize_t ctime_age_limit_show(struct kobject *kobj, struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - return sprintf(buf, "%u\n", ns->ns_ctime_age_limit); + return sprintf(buf, "%llu\n", ns->ns_ctime_age_limit); } static ssize_t ctime_age_limit_store(struct kobject *kobj, @@ -480,11 +523,9 @@ static ssize_t ctime_age_limit_store(struct kobject *kobj, { struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - unsigned long tmp; - int err; + unsigned long long tmp; - err = kstrtoul(buffer, 10, &tmp); - if (err != 0) + if (kstrtoull(buffer, 10, &tmp)) return -EINVAL; ns->ns_ctime_age_limit = tmp; @@ -537,7 +578,7 @@ static ssize_t contention_seconds_show(struct kobject *kobj, struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - return sprintf(buf, "%u\n", ns->ns_contention_time); + return sprintf(buf, "%llu\n", ns->ns_contention_time); } static ssize_t contention_seconds_store(struct kobject *kobj, @@ -546,11 +587,9 @@ static ssize_t contention_seconds_store(struct kobject *kobj, { struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - unsigned long tmp; - int err; + unsigned long long tmp; - err = kstrtoul(buffer, 10, &tmp); - if (err != 0) + if (kstrtoull(buffer, 10, &tmp)) return -EINVAL; ns->ns_contention_time = tmp; @@ -625,6 +664,7 @@ static struct attribute *ldlm_ns_attrs[] = { &lustre_attr_lru_size.attr, &lustre_attr_lru_max_age.attr, &lustre_attr_early_lock_cancel.attr, + &lustre_attr_dirty_age_limit.attr, #ifdef HAVE_SERVER_SUPPORT &lustre_attr_ctime_age_limit.attr, &lustre_attr_lock_timeouts.attr, @@ -649,13 +689,13 @@ static struct kobj_type ldlm_ns_ktype = { .release = ldlm_ns_release, }; -static void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) +static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) { - if (ns->ns_proc_dir_entry == NULL) + if (IS_ERR_OR_NULL(ns->ns_debugfs_entry)) CERROR("dlm namespace %s has no procfs dir?\n", ldlm_ns_name(ns)); else - lprocfs_remove(&ns->ns_proc_dir_entry); + ldebugfs_remove(&ns->ns_debugfs_entry); if (ns->ns_stats != NULL) lprocfs_free_stats(&ns->ns_stats); @@ -688,31 +728,23 @@ int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) return err; } -static int ldlm_namespace_proc_register(struct ldlm_namespace 
*ns) +static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) { - struct proc_dir_entry *ns_pde; + struct dentry *ns_entry; - LASSERT(ns != NULL); - LASSERT(ns->ns_rs_hash != NULL); - - if (ns->ns_proc_dir_entry != NULL) { - ns_pde = ns->ns_proc_dir_entry; + if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { + ns_entry = ns->ns_debugfs_entry; } else { - ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir); - if (ns_pde == NULL) + ns_entry = debugfs_create_dir(ldlm_ns_name(ns), + ldlm_ns_debugfs_dir); + if (!ns_entry) return -ENOMEM; - ns->ns_proc_dir_entry = ns_pde; + ns->ns_debugfs_entry = ns_entry; } return 0; } #undef MAX_STRING_SIZE -#else /* CONFIG_PROC_FS */ - -#define ldlm_namespace_proc_unregister(ns) ({;}) -#define ldlm_namespace_proc_register(ns) ({0;}) - -#endif /* CONFIG_PROC_FS */ static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) @@ -927,9 +959,12 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, nsb->nsb_reclaim_start = 0; } - ns->ns_obd = obd; - ns->ns_appetite = apt; - ns->ns_client = client; + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + ns->ns_name = kstrdup(name, GFP_KERNEL); + if (!ns->ns_name) + goto out_hash; INIT_LIST_HEAD(&ns->ns_list_chain); INIT_LIST_HEAD(&ns->ns_unused_list); @@ -946,12 +981,14 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_dirty_age_limit = LDLM_DIRTY_AGE_LIMIT; ns->ns_timeouts = 0; ns->ns_orig_connect_flags = 0; ns->ns_connect_flags = 0; ns->ns_stopping = 0; ns->ns_reclaim_start = 0; ns->ns_last_pos = &ns->ns_unused_list; + ns->ns_flags = 0; rc = ldlm_namespace_sysfs_register(ns); if (rc) { @@ -959,7 +996,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, GOTO(out_hash, rc); } - rc = ldlm_namespace_proc_register(ns); + rc = ldlm_namespace_debugfs_register(ns); if (rc) { CERROR("Can't initialize ns proc, rc %d\n", rc); GOTO(out_sysfs, rc); @@ -975,12 +1012,13 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ldlm_namespace_register(ns, client); RETURN(ns); out_proc: - ldlm_namespace_proc_unregister(ns); + ldlm_namespace_debugfs_unregister(ns); out_sysfs: ldlm_namespace_sysfs_unregister(ns); ldlm_namespace_cleanup(ns, 0); out_hash: - cfs_hash_putref(ns->ns_rs_hash); + kfree(ns->ns_name); + cfs_hash_putref(ns->ns_rs_hash); out_ns: OBD_FREE_PTR(ns); out_ref: @@ -1079,14 +1117,13 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) { - struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_resource *res = cfs_hash_object(hs, hnode); __u64 flags = *(__u64 *)arg; - cleanup_resource(res, &res->lr_granted, flags); - cleanup_resource(res, &res->lr_converting, flags); - cleanup_resource(res, &res->lr_waiting, flags); + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_waiting, flags); - return 0; + return 0; } static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, @@ -1100,7 +1137,8 @@ static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, atomic_read(&res->lr_refcount) - 1); - ldlm_resource_dump(D_ERROR, res); + /* Use 
D_NETERROR since it is in the default mask */ + ldlm_resource_dump(D_NETERROR, res); unlock_res(res); return 0; } @@ -1242,12 +1280,14 @@ void ldlm_namespace_free_post(struct ldlm_namespace *ns) * Removing it after @dir may cause oops. */ ldlm_pool_fini(&ns->ns_pool); - ldlm_namespace_proc_unregister(ns); + ldlm_namespace_debugfs_unregister(ns); ldlm_namespace_sysfs_unregister(ns); cfs_hash_putref(ns->ns_rs_hash); + kfree(ns->ns_name); /* Namespace \a ns should be not on list at this time, otherwise * this will cause issues related to using freed \a ns in poold - * thread. */ + * thread. + */ LASSERT(list_empty(&ns->ns_list_chain)); OBD_FREE_PTR(ns); ldlm_put_ref(); @@ -1352,33 +1392,62 @@ struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) struct ldlm_namespace, ns_list_chain); } +static bool ldlm_resource_extent_new(struct ldlm_resource *res) +{ + int idx; + + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) + return false; + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + return true; +} + +static bool ldlm_resource_inodebits_new(struct ldlm_resource *res) +{ + int i; + + OBD_ALLOC_PTR(res->lr_ibits_queues); + if (res->lr_ibits_queues == NULL) + return false; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&res->lr_ibits_queues->liq_waiting[i]); + return true; +} + /** Create and initialize new resource. */ static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) { struct ldlm_resource *res; - int idx; + bool rc; OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); if (res == NULL) return NULL; - if (ldlm_type == LDLM_EXTENT) { - OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - if (res->lr_itree == NULL) { - OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); - return NULL; - } - /* Initialize interval trees for each lock mode. */ - for (idx = 0; idx < LCK_MODE_NUM; idx++) { - res->lr_itree[idx].lit_size = 0; - res->lr_itree[idx].lit_mode = 1 << idx; - res->lr_itree[idx].lit_root = NULL; - } + switch (ldlm_type) { + case LDLM_EXTENT: + rc = ldlm_resource_extent_new(res); + break; + case LDLM_IBITS: + rc = ldlm_resource_inodebits_new(res); + break; + default: + rc = true; + break; + } + if (!rc) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; } INIT_LIST_HEAD(&res->lr_granted); - INIT_LIST_HEAD(&res->lr_converting); INIT_LIST_HEAD(&res->lr_waiting); atomic_set(&res->lr_refcount, 1); @@ -1393,6 +1462,20 @@ static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) return res; } +static void ldlm_resource_free(struct ldlm_resource *res) +{ + if (res->lr_type == LDLM_EXTENT) { + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + } else if (res->lr_type == LDLM_IBITS) { + if (res->lr_ibits_queues != NULL) + OBD_FREE_PTR(res->lr_ibits_queues); + } + + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); +} + /** * Return a reference to resource with given name, creating it if necessary. * Args: namespace with ns_lock unlocked @@ -1447,10 +1530,7 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); /* Clean lu_ref for failed resource. 
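ldlm_resource_new is reorganized so each lock type gets its own allocator: extent resources get the per-mode interval trees, inodebits resources get the per-bit waiting queues, and ldlm_resource_free releases exactly what was set up. A self-contained sketch of that pattern using calloc/free and toy payload types follows; the names, counts and sizes are placeholders.

#include <stdlib.h>

enum res_type { RES_PLAIN, RES_EXTENT, RES_IBITS };

#define MODE_NUM   8            /* stand-in for LCK_MODE_NUM          */
#define IBITS_NUM 16            /* stand-in for MDS_INODELOCK_NUMBITS */

struct itree  { int size; };
struct iqueue { int waiting; };

struct resource {
        enum res_type  type;
        struct itree  *itree;   /* RES_EXTENT only */
        struct iqueue *queues;  /* RES_IBITS only  */
};

static struct resource *resource_new(enum res_type type)
{
        struct resource *res = calloc(1, sizeof(*res));

        if (!res)
                return NULL;
        res->type = type;
        switch (type) {
        case RES_EXTENT:
                res->itree = calloc(MODE_NUM, sizeof(*res->itree));
                if (!res->itree)
                        goto fail;
                break;
        case RES_IBITS:
                res->queues = calloc(IBITS_NUM, sizeof(*res->queues));
                if (!res->queues)
                        goto fail;
                break;
        default:
                break;
        }
        return res;
fail:
        free(res);
        return NULL;
}

/* free(NULL) is a no-op, so one teardown covers every resource type. */
static void resource_free(struct resource *res)
{
        if (!res)
                return;
        free(res->itree);
        free(res->queues);
        free(res);
}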
*/ lu_ref_fini(&res->lr_reference); - if (res->lr_itree != NULL) - OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + ldlm_resource_free(res); found: res = hlist_entry(hnode, struct ldlm_resource, lr_hash); return res; @@ -1491,28 +1571,23 @@ struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, struct ldlm_resource *res) { - struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; if (!list_empty(&res->lr_granted)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } - - if (!list_empty(&res->lr_converting)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } if (!list_empty(&res->lr_waiting)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } - cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, - bd, &res->lr_hash); - lu_ref_fini(&res->lr_reference); - if (cfs_hash_bd_count_get(bd) == 0) - ldlm_namespace_put(nsb->nsb_namespace); + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); } /* Returns 1 if the resource was freed, 0 if it remains. */ @@ -1531,10 +1606,7 @@ int ldlm_resource_putref(struct ldlm_resource *res) cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) ns->ns_lvbo->lvbo_free(res); - if (res->lr_itree != NULL) - OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + ldlm_resource_free(res); return 1; } return 0; @@ -1559,6 +1631,9 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, LASSERT(list_empty(&lock->l_res_link)); list_add_tail(&lock->l_res_link, head); + + if (res->lr_type == LDLM_IBITS) + ldlm_inodebits_add_lock(res, head, lock); } /** @@ -1591,11 +1666,18 @@ void ldlm_resource_unlink_lock(struct ldlm_lock *lock) { int type = lock->l_resource->lr_type; - check_res_locked(lock->l_resource); - if (type == LDLM_IBITS || type == LDLM_PLAIN) - ldlm_unlink_lock_skiplist(lock); - else if (type == LDLM_EXTENT) - ldlm_extent_unlink_lock(lock); + check_res_locked(lock->l_resource); + switch (type) { + case LDLM_PLAIN: + ldlm_unlink_lock_skiplist(lock); + break; + case LDLM_EXTENT: + ldlm_extent_unlink_lock(lock); + break; + case LDLM_IBITS: + ldlm_inodebits_unlink_lock(lock); + break; + } list_del_init(&lock->l_res_link); } EXPORT_SYMBOL(ldlm_resource_unlink_lock); @@ -1655,14 +1737,14 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) ldlm_ns_name(ns), atomic_read(&ns->ns_bref), ns_is_client(ns) ? 
"client" : "server"); - if (cfs_time_before(cfs_time_current(), ns->ns_next_dump)) + if (ktime_get_seconds() < ns->ns_next_dump) return; cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_res_hash_dump, (void *)(unsigned long)level, 0); spin_lock(&ns->ns_lock); - ns->ns_next_dump = cfs_time_shift(10); + ns->ns_next_dump = ktime_get_seconds() + 10; spin_unlock(&ns->ns_lock); } @@ -1695,15 +1777,11 @@ void ldlm_resource_dump(int level, struct ldlm_resource *res) } } } - if (!list_empty(&res->lr_converting)) { - CDEBUG(level, "Converting locks:\n"); - list_for_each_entry(lock, &res->lr_converting, l_res_link) - LDLM_DEBUG_LIMIT(level, lock, "###"); - } + if (!list_empty(&res->lr_waiting)) { - CDEBUG(level, "Waiting locks:\n"); + CDEBUG(level, "Waiting locks:\n"); list_for_each_entry(lock, &res->lr_waiting, l_res_link) - LDLM_DEBUG_LIMIT(level, lock, "###"); - } + LDLM_DEBUG_LIMIT(level, lock, "###"); + } } EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile index 96430e764665b..19f415face716 100644 --- a/drivers/staging/lustrefsx/lustre/llite/Makefile +++ b/drivers/staging/lustrefsx/lustre/llite/Makefile @@ -7,7 +7,7 @@ lustre-y += rw26.o super25.o statahead.o xattr_security.o lustre-y += glimpse.o lustre-y += lcommon_cl.o lustre-y += lcommon_misc.o -lustre-y += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o +lustre-y += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o lustre-y += range_lock.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c index 6da6b5956ab4e..6a026f0f176e5 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dcache.c +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include -#include #include #include "llite_internal.h" diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c index 6e987fe2f7387..dd2452a3459a6 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dir.c +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,21 +38,20 @@ #include #include #include +#include #include #ifdef HAVE_UIDGID_HEADER # include #endif -#include +#include #include // for wait_on_buffer #include #define DEBUG_SUBSYSTEM S_LLITE -#include - #include #include -#include +#include #include #include #include @@ -322,6 +321,7 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; int api32 = ll_need_32bit_api(sbi); struct md_op_data *op_data; + struct lu_fid pfid = { 0 }; __u64 pos; int rc; ENTRY; @@ -341,34 +341,36 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) */ GOTO(out, rc = 0); - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, - LUSTRE_OPC_ANY, inode); - if (IS_ERR(op_data)) - GOTO(out, rc = PTR_ERR(op_data)); - - if (unlikely(op_data->op_mea1 != NULL)) { - /* This is only needed for striped dir to fill .., - * see lmv_read_entry */ + if (unlikely(ll_i2info(inode)->lli_lsm_md != NULL)) { + /* + * This is only needed for striped dir to fill .., + * see lmv_read_page() + */ if (file_dentry(filp)->d_parent != NULL && file_dentry(filp)->d_parent->d_inode != NULL) { - __u64 ibits = MDS_INODELOCK_UPDATE; + __u64 ibits = MDS_INODELOCK_LOOKUP; struct inode *parent = file_dentry(filp)->d_parent->d_inode; if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) - op_data->op_fid3 = *ll_inode2fid(parent); + pfid = *ll_inode2fid(parent); } /* If it can not find in cache, do lookup .. on the master * object */ - if (fid_is_zero(&op_data->op_fid3)) { - rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); - if (rc != 0) { - ll_finish_md_op_data(op_data); + if (fid_is_zero(&pfid)) { + rc = ll_dir_get_parent_fid(inode, &pfid); + if (rc != 0) RETURN(rc); - } } } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + op_data->op_fid3 = pfid; + #ifdef HAVE_DIR_CONTEXT ctx->pos = pos; rc = ll_dir_read(inode, &pos, op_data, ctx); @@ -435,7 +437,7 @@ static int ll_send_mgc_param(struct obd_export *mgc, char *string) * <0 if the creation is failed. 
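For a striped directory, ll_readdir now resolves the ".." FID up front: if the parent inode still holds the relevant MD lock the cached FID is used, otherwise a lookup on the master object fills it in, and only then is op_data built. A small sketch of that try-the-cache-then-look-up flow; the types and callbacks are stand-ins, not the llite API.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct fid { uint64_t seq, oid, ver; };

static bool fid_is_zero(const struct fid *f)
{
        return f->seq == 0 && f->oid == 0 && f->ver == 0;
}

/* cache_lookup() fills *out from a cached lock when one is held;
 * remote_lookup() is the slow path against the master object. */
static int resolve_parent_fid(bool (*cache_lookup)(struct fid *out),
                              int (*remote_lookup)(struct fid *out),
                              struct fid *pfid)
{
        memset(pfid, 0, sizeof(*pfid));

        if (cache_lookup(pfid) && !fid_is_zero(pfid))
                return 0;               /* served from the lock cache */

        return remote_lookup(pfid);     /* ask the master object      */
}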
*/ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, - const char *dirname, umode_t mode) + size_t len, const char *dirname, umode_t mode) { struct inode *parent = dparent->d_inode; struct ptlrpc_request *request = NULL; @@ -454,7 +456,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, int err; ENTRY; - if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) + if (unlikely(lump->lum_magic != LMV_USER_MAGIC && + lump->lum_magic != LMV_USER_MAGIC_SPECIFIC)) RETURN(-EINVAL); CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s " @@ -470,7 +473,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) RETURN(-ENOENT); - if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) + if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) lustre_swab_lmv_user_md(lump); if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) @@ -495,7 +499,7 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, } op_data->op_cli_flags |= CLI_SET_MEA; - err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, + err = md_create(sbi->ll_md_exp, op_data, lump, len, mode, from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid()), cfs_curproc_cap_pack(), 0, &request); @@ -536,69 +540,67 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) - struct lustre_sb_info *lsi = s2lsi(inode->i_sb); - struct obd_device *mgc = lsi->lsi_mgc; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; #endif - int lum_size; + int lum_size; ENTRY; - if (lump != NULL) { - /* - * This is coming from userspace, so should be in - * local endian. But the MDS would like it in little - * endian, so we swab it before we send it. 
- */ - switch (lump->lmm_magic) { - case LOV_USER_MAGIC_V1: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) - lustre_swab_lov_user_md_v1(lump); - lum_size = sizeof(struct lov_user_md_v1); - break; - } - case LOV_USER_MAGIC_V3: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) - lustre_swab_lov_user_md_v3( - (struct lov_user_md_v3 *)lump); - lum_size = sizeof(struct lov_user_md_v3); - break; - } - case LOV_USER_MAGIC_COMP_V1: { - if (lump->lmm_magic != - cpu_to_le32(LOV_USER_MAGIC_COMP_V1)) - lustre_swab_lov_comp_md_v1( - (struct lov_comp_md_v1 *)lump); - lum_size = le32_to_cpu( - ((struct lov_comp_md_v1 *)lump)->lcm_size); + if (lump != NULL) { + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: + lum_size = sizeof(struct lov_user_md_v1); break; - } - case LMV_USER_MAGIC: { + case LOV_USER_MAGIC_V3: + lum_size = sizeof(struct lov_user_md_v3); + break; + case LOV_USER_MAGIC_COMP_V1: + lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size; + break; + case LMV_USER_MAGIC: if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) lustre_swab_lmv_user_md( (struct lmv_user_md *)lump); lum_size = sizeof(struct lmv_user_md); break; + case LOV_USER_MAGIC_SPECIFIC: { + struct lov_user_md_v3 *v3 = + (struct lov_user_md_v3 *)lump; + if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + RETURN(-EINVAL); + lum_size = lov_user_md_size(v3->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + break; } - default: { - CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" - " %#08x != %#08x nor %#08x\n", - lump->lmm_magic, LOV_USER_MAGIC_V1, - LOV_USER_MAGIC_V3); - RETURN(-EINVAL); - } - } - } else { - lum_size = sizeof(struct lov_user_md_v1); - } + default: + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md(lump, 0); + } else { + lum_size = sizeof(struct lov_user_md_v1); + } - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); /* swabbing is done in lov_setstripe() on server side */ rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); @@ -661,16 +663,10 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, RETURN(rc); } -/** - * This function will be used to get default LOV/LMV/Default LMV - * @valid will be used to indicate which stripe it will retrieve - * OBD_MD_MEA LMV stripe EA - * OBD_MD_DEFAULT_MEA Default LMV stripe EA - * otherwise Default LOV EA. 
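ll_dir_setstripe now detects a foreign-endian layout in one place: if byte-swapping the magic makes its family bits match the expected LOV magic family, the buffer came from a peer of the other endianness and the whole descriptor is swabbed before use. A minimal sketch of that test follows; the mask and family constants below are invented, the real values live in the Lustre headers.

#include <stdbool.h>
#include <stdint.h>

#define MAGIC_MASK   0xffff0000u        /* placeholder family mask  */
#define MAGIC_FAMILY 0x0bd00000u        /* placeholder family value */

static uint32_t swab32(uint32_t v)
{
        return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
               ((v << 8) & 0x00ff0000u) | (v << 24);
}

/* The descriptor needs swabbing when its magic matches the expected
 * family only after a byte swap. */
static bool needs_swab(uint32_t magic)
{
        return (swab32(magic) & MAGIC_MASK) == MAGIC_FAMILY;
}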
- * Each time, it can only retrieve 1 stripe EA - **/ -int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, u64 valid) +static int ll_dir_get_default_layout(struct inode *inode, void **plmm, + int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct mdt_body *body; @@ -678,6 +674,7 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, struct ptlrpc_request *req = NULL; int rc, lmm_size; struct md_op_data *op_data; + struct lu_fid fid; ENTRY; rc = ll_get_default_mdsize(sbi, &lmm_size); @@ -691,11 +688,19 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, RETURN(PTR_ERR(op_data)); op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + + if (type == GET_DEFAULT_LAYOUT_ROOT) { + lu_root_fid(&op_data->op_fid1); + fid = op_data->op_fid1; + } else { + fid = *ll_inode2fid(inode); + } + rc = md_getattr(sbi->ll_md_exp, op_data, &req); ll_finish_md_op_data(op_data); if (rc < 0) { - CDEBUG(D_INFO, "md_getattr failed on inode " - DFID": rc %d\n", PFID(ll_inode2fid(inode)), rc); + CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n", + PFID(&fid), rc); GOTO(out, rc); } @@ -721,17 +726,11 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, /* We don't swab objects for directories */ switch (le32_to_cpu(lmm->lmm_magic)) { case LOV_MAGIC_V1: - if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); - break; case LOV_MAGIC_V3: - if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); - break; case LOV_MAGIC_COMP_V1: + case LOV_USER_MAGIC_SPECIFIC: if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_comp_md_v1( - (struct lov_comp_md_v1 *)lmm); + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); break; case LMV_MAGIC_V1: if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) @@ -752,6 +751,75 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, return rc; } +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve. + * If the directory does not have its own default layout, then the + * function will request the default layout from root FID. + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. 
+ * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, + struct ptlrpc_request **root_request, + u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && + !(valid & (OBD_MD_MEA|OBD_MD_DEFAULT_MEA)) && root_request != NULL){ + int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, + &lmm_size, &root_req, valid, + GET_DEFAULT_LAYOUT_ROOT); + if (rc2 == 0) + rc = 0; + } + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + if (root_request != NULL) + *root_request = root_req; + + RETURN(rc); +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + + RETURN(rc); +} + int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) { struct md_op_data *op_data; @@ -960,25 +1028,110 @@ static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) } -static int copy_and_ioctl(int cmd, struct obd_export *exp, - const void __user *data, size_t size) +static int copy_and_ct_start(int cmd, struct obd_export *exp, + const struct lustre_kernelcomm __user *data) { - void *copy; + struct lustre_kernelcomm *lk; + struct lustre_kernelcomm *tmp; + size_t size = sizeof(*lk); + size_t new_size; + int i; int rc; - OBD_ALLOC(copy, size); - if (copy == NULL) + /* copy data from userspace to get numbers of archive_id */ + OBD_ALLOC(lk, size); + if (lk == NULL) return -ENOMEM; - if (copy_from_user(copy, data, size)) { - rc = -EFAULT; - goto out; + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + + if (lk->lk_flags & LK_FLG_STOP) + goto do_ioctl; + + if (!(lk->lk_flags & LK_FLG_DATANR)) { + __u32 archive_mask = lk->lk_data_count; + int count; + + /* old hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) + goto do_ioctl; + + /* old hsm agent to new MDS */ + lk->lk_flags |= LK_FLG_DATANR; + + if (archive_mask == 0) + goto do_ioctl; + + count = hweight32(archive_mask); + new_size = offsetof(struct lustre_kernelcomm, lk_data[count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + memcpy(tmp, lk, size); + tmp->lk_data_count = count; + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + count = 0; + for (i = 0; i < sizeof(archive_mask) * 8; i++) { + if ((1 << i) & archive_mask) { + lk->lk_data[count] = i + 1; + count++; + } + } + goto do_ioctl; + } + + /* new hsm agent to new mds */ + if (lk->lk_data_count > 0) { + new_size = offsetof(struct lustre_kernelcomm, + lk_data[lk->lk_data_count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + if 
(copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); } - rc = obd_iocontrol(cmd, exp, size, copy, NULL); -out: - OBD_FREE(copy, size); + /* new hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) { + __u32 archives = 0; + + if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE) + GOTO(out_lk, rc = -EINVAL); + + for (i = 0; i < lk->lk_data_count; i++) { + if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) { + rc = -EINVAL; + CERROR("%s: archive id %d requested but only " + "[0 - %zu] supported: rc = %d\n", + exp->exp_obd->obd_name, lk->lk_data[i], + LL_HSM_ORIGIN_MAX_ARCHIVE, rc); + GOTO(out_lk, rc); + } + if (lk->lk_data[i] == 0) { + archives = 0; + break; + } + + archives |= (1 << (lk->lk_data[i] - 1)); + } + lk->lk_flags &= ~LK_FLG_DATANR; + lk->lk_data_count = archives; + } +do_ioctl: + rc = obd_iocontrol(cmd, exp, size, lk, NULL); +out_lk: + OBD_FREE(lk, size); return rc; } @@ -999,32 +1152,38 @@ static int check_owner(int type, int id) return 0; } -static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) +static int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) { - int cmd = qctl->qc_cmd; - int type = qctl->qc_type; - int id = qctl->qc_id; - int valid = qctl->qc_valid; - int rc = 0; - ENTRY; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + ENTRY; - switch (cmd) { - case Q_SETQUOTA: - case Q_SETINFO: + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: + case LUSTRE_Q_SETDEFAULT: if (!cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); + + if (sb->s_flags & SB_RDONLY) + RETURN(-EROFS); break; case Q_GETQUOTA: + case LUSTRE_Q_GETDEFAULT: if (check_owner(type, id) && (!cfs_capable(CFS_CAP_SYS_ADMIN))) RETURN(-EPERM); - break; - case Q_GETINFO: - break; - default: - CERROR("unsupported quotactl op: %#x\n", cmd); - RETURN(-ENOTTY); - } + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + RETURN(-ENOTSUPP); + } if (valid != QC_GENERAL) { if (cmd == Q_GETINFO) @@ -1121,6 +1280,54 @@ static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) RETURN(rc); } +int ll_rmfid(struct file *file, void __user *arg) +{ + const struct fid_array __user *ufa = arg; + struct fid_array *lfa = NULL; + size_t size; + unsigned nr; + int i, rc, *rcs = NULL; + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(file_inode(file))->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + /* Only need to get the buflen */ + if (get_user(nr, &ufa->fa_nr)) + RETURN(-EFAULT); + /* DoS protection */ + if (nr > OBD_MAX_FIDS_IN_ARRAY) + RETURN(-E2BIG); + + size = offsetof(struct fid_array, fa_fids[nr]); + OBD_ALLOC(lfa, size); + if (!lfa) + RETURN(-ENOMEM); + OBD_ALLOC(rcs, sizeof(int) * nr); + if (!rcs) + GOTO(free_lfa, rc = -ENOMEM); + + if (copy_from_user(lfa, arg, size)) + GOTO(free_rcs, rc = -EFAULT); + + /* Call mdc_iocontrol */ + rc = md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL); + if (!rc) { + for (i = 0; i < nr; i++) + if (rcs[i]) + lfa->fa_fids[i].f_ver = rcs[i]; + if (copy_to_user(arg, lfa, size)) + rc = -EFAULT; + } + +free_rcs: + OBD_FREE(rcs, sizeof(int) * nr); +free_lfa: + OBD_FREE(lfa, size); + + RETURN(rc); +} + /* This function tries to get a single name component, * to send to the server. 
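copy_and_ct_start bridges two HSM registration formats: an old copytool hands over a 32-bit archive bitmask, a new MDS wants an explicit array of archive ids (LK_FLG_DATANR), and the reverse direction folds an id array back into a mask that can represent at most 32 archives. A standalone sketch of both conversions using plain integers; the cap of 32 mirrors the old format's width and the names are illustrative.

#include <stdint.h>

/* Expand a 32-bit archive mask into 1-based archive ids.  Returns the
 * number of ids written; out[] must have room for 32 entries. */
static int mask_to_ids(uint32_t mask, uint32_t *out)
{
        int count = 0;

        for (int i = 0; i < 32; i++)
                if (mask & (1u << i))
                        out[count++] = i + 1;
        return count;
}

/* Fold an id array back into a mask.  Id 0 means "all archives", and ids
 * above 32 cannot be expressed in the old format. */
static int ids_to_mask(const uint32_t *ids, int nr, uint32_t *mask)
{
        *mask = 0;
        for (int i = 0; i < nr; i++) {
                if (ids[i] > 32)
                        return -1;
                if (ids[i] == 0) {
                        *mask = 0;
                        return 0;
                }
                *mask |= 1u << (ids[i] - 1);
        }
        return 0;
}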
No actual path traversal involved, * so we limit to NAME_MAX */ @@ -1153,46 +1360,46 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_ioctl_data *data; - int rc = 0; - ENTRY; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", PFID(ll_inode2fid(inode)), inode, cmd); - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - switch(cmd) { - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - RETURN(ll_iocontrol(inode, file, cmd, arg)); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: RETURN(put_user(inode->i_generation, (int __user *)arg)); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. - case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - RETURN(mdtidx); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
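ll_rmfid above shows the usual pattern for pulling a variable-length array in from userspace: read only the element count first, reject anything above a hard cap so a hostile count cannot drive a huge allocation, size the buffer from the count (the kernel does this with offsetof over the flexible array member), and only then copy the full payload. A userspace sketch of the same shape with memcpy standing in for copy_from_user; the structure layout and the cap are illustrative.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define MAX_FIDS 256                    /* DoS cap, placeholder value */

struct fid       { uint64_t seq, oid, ver; };
struct fid_array { uint32_t nr; struct fid fids[]; };

/* Duplicate a caller-provided fid array: peek at the count, bound it,
 * then allocate exactly enough for that many entries and copy. */
static struct fid_array *dup_fid_array(const void *user_buf)
{
        uint32_t nr;
        size_t len;
        struct fid_array *fa;

        memcpy(&nr, user_buf, sizeof(nr));      /* count first        */
        if (nr > MAX_FIDS)
                return NULL;                    /* refuse huge input  */

        len = sizeof(struct fid_array) + (size_t)nr * sizeof(struct fid);
        fa = malloc(len);
        if (fa)
                memcpy(fa, user_buf, len);      /* then the payload   */
        return fa;
}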
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); if (put_user((int)mdtidx, (int __user *)arg)) - RETURN(-EFAULT); + RETURN(-EFAULT); - return 0; - } - case IOC_MDC_LOOKUP: { - int namelen, len = 0; + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; char *buf = NULL; char *filename; @@ -1248,8 +1455,9 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) lum = (struct lmv_user_md *)data->ioc_inlbuf2; lumlen = data->ioc_inllen2; - if (lum->lum_magic != LMV_USER_MAGIC || - lumlen != sizeof(*lum)) { + if ((lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) || + lumlen < sizeof(*lum)) { CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", filename, lum->lum_magic, lumlen, -EFAULT); GOTO(lmv_out_free, rc = -EINVAL); @@ -1260,7 +1468,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #else mode = data->ioc_type; #endif - rc = ll_dir_setdirstripe(dentry, lum, filename, mode); + rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode); lmv_out_free: OBD_FREE_LARGE(buf, len); RETURN(rc); @@ -1284,34 +1492,51 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case LL_IOC_LOV_SETSTRIPE_NEW: case LL_IOC_LOV_SETSTRIPE: { - struct lov_user_md_v3 lumv3; - struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v3 *lumv3 = NULL; + struct lov_user_md_v1 lumv1; + struct lov_user_md_v1 *lumv1_ptr = &lumv1; struct lov_user_md_v1 __user *lumv1p = (struct lov_user_md_v1 __user *)arg; struct lov_user_md_v3 __user *lumv3p = (struct lov_user_md_v3 __user *)arg; + int lum_size = 0; int set_default = 0; CLASSERT(sizeof(struct lov_user_md_v3) > sizeof(struct lov_comp_md_v1)); - LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); - LASSERT(sizeof(lumv3.lmm_objects[0]) == - sizeof(lumv3p->lmm_objects[0])); + CLASSERT(sizeof(*lumv3) == sizeof(*lumv3p)); /* first try with v1 which is smaller than v3 */ - if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) - RETURN(-EFAULT); - - if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) - if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) - RETURN(-EFAULT); + if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) + RETURN(-EFAULT); if (inode->i_sb->s_root == file_dentry(file)) set_default = 1; - /* in v1 and v3 cases lumv1 points to data */ - rc = ll_dir_setstripe(inode, lumv1, set_default); + switch (lumv1.lmm_magic) { + case LOV_USER_MAGIC_V3: + case LOV_USER_MAGIC_SPECIFIC: + lum_size = ll_lov_user_md_size(&lumv1); + if (lum_size < 0) + RETURN(lum_size); + OBD_ALLOC(lumv3, lum_size); + if (!lumv3) + RETURN(-ENOMEM); + if (copy_from_user(lumv3, lumv3p, lum_size)) + GOTO(out, rc = -EFAULT); + lumv1_ptr = (struct lov_user_md_v1 *)lumv3; + break; + case LOV_USER_MAGIC_V1: + break; + default: + GOTO(out, rc = -ENOTSUPP); + } + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1_ptr, set_default); +out: + if (lumv3) + OBD_FREE(lumv3, lum_size); RETURN(rc); } case LL_IOC_LMV_GETSTRIPE: { @@ -1319,6 +1544,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) (struct lmv_user_md __user *)arg; struct lmv_user_md lum; struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; union lmv_mds_md *lmm = NULL; int lmmsize; u64 valid = 0; @@ -1344,8 +1570,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long 
arg) else RETURN(-EINVAL); - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, - valid); + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize, + &request, &root_request, valid); if (rc != 0) GOTO(finish_req, rc); @@ -1368,7 +1594,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) GOTO(finish_req, rc = -E2BIG); } - lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); + lum_size = lmv_user_md_size(stripe_count, + LMV_USER_MAGIC_SPECIFIC); OBD_ALLOC(tmp, lum_size); if (tmp == NULL) GOTO(finish_req, rc = -ENOMEM); @@ -1385,12 +1612,15 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct lu_fid fid; fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); - mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); - if (mdt_index < 0) - GOTO(out_tmp, rc = mdt_index); + if (fid_is_sane(&fid)) { + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + } - tmp->lum_objects[i].lum_mds = mdt_index; - tmp->lum_objects[i].lum_fid = fid; tmp->lum_stripe_count++; } @@ -1400,6 +1630,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE(tmp, lum_size); finish_req: ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); return rc; } @@ -1430,6 +1661,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ll_putname(filename); RETURN(rc); } + case LL_IOC_RMFID: + RETURN(ll_rmfid(file, (void __user *)arg)); case LL_IOC_LOV_SWAP_LAYOUTS: RETURN(-EPERM); case IOC_OBD_STATFS: @@ -1437,62 +1670,93 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_LOV_GETSTRIPE: case LL_IOC_LOV_GETSTRIPE_NEW: case LL_IOC_MDC_GETINFO: + case LL_IOC_MDC_GETINFO_OLD: case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILEINFO_OLD: case IOC_MDC_GETFILESTRIPE: { struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; struct lov_user_md __user *lump; - struct lov_mds_md *lmm = NULL; - struct mdt_body *body; - char *filename = NULL; - int lmmsize; - - if (cmd == IOC_MDC_GETFILEINFO || - cmd == IOC_MDC_GETFILESTRIPE) { + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + lstat_t __user *statp = NULL; + lstatx_t __user *stxp = NULL; + __u64 __user *flagsp = NULL; + __u32 __user *lmmsizep = NULL; + struct lu_fid __user *fidp = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { filename = ll_getname((const char __user *)arg); - if (IS_ERR(filename)) - RETURN(PTR_ERR(filename)); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); - rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, - &lmmsize, &request); + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); } else { - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, - &request, 0); + rc = ll_dir_getstripe_default(inode, (void **)&lmm, + &lmmsize, &request, + &root_request, 0); } - if (request) { - body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - LASSERT(body != NULL); - } else { - GOTO(out_req, rc); - } + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } - if (rc < 0) { - if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || - cmd == LL_IOC_MDC_GETINFO)) - GOTO(skip_lmm, rc = 0); 
- else - GOTO(out_req, rc); - } + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO || + cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD)) { + lmmsize = 0; + rc = 0; + } + + if (rc < 0) + GOTO(out_req, rc); if (cmd == IOC_MDC_GETFILESTRIPE || cmd == LL_IOC_LOV_GETSTRIPE || cmd == LL_IOC_LOV_GETSTRIPE_NEW) { lump = (struct lov_user_md __user *)arg; - } else { + } else if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD){ + struct lov_user_mds_data_v1 __user *lmdp; + + lmdp = (struct lov_user_mds_data_v1 __user *)arg; + statp = &lmdp->lmd_st; + lump = &lmdp->lmd_lmm; + } else { struct lov_user_mds_data __user *lmdp; + lmdp = (struct lov_user_mds_data __user *)arg; - lump = &lmdp->lmd_lmm; - } - if (copy_to_user(lump, lmm, lmmsize)) { + fidp = &lmdp->lmd_fid; + stxp = &lmdp->lmd_stx; + flagsp = &lmdp->lmd_flags; + lmmsizep = &lmdp->lmd_lmmsize; + lump = &lmdp->lmd_lmm; + } + + if (lmmsize == 0) { + /* If the file has no striping then zero out *lump so + * that the caller isn't confused by garbage. */ + if (clear_user(lump, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + } else if (copy_to_user(lump, lmm, lmmsize)) { if (copy_to_user(lump, lmm, sizeof(*lump))) - GOTO(out_req, rc = -EFAULT); - rc = -EOVERFLOW; - } - skip_lmm: - if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { - struct lov_user_mds_data __user *lmdp; - lstat_t st = { 0 }; + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + + if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD) { + lstat_t st = { 0 }; st.st_dev = inode->i_sb->s_dev; st.st_mode = body->mbo_mode; @@ -1510,29 +1774,86 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) sbi->ll_flags & LL_SBI_32BIT_API); - lmdp = (struct lov_user_mds_data __user *)arg; - if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) - GOTO(out_req, rc = -EFAULT); - } + if (copy_to_user(statp, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } else if (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO) { + lstatx_t stx = { 0 }; + __u64 valid = body->mbo_valid; + + stx.stx_blksize = PAGE_SIZE; + stx.stx_nlink = body->mbo_nlink; + stx.stx_uid = body->mbo_uid; + stx.stx_gid = body->mbo_gid; + stx.stx_mode = body->mbo_mode; + stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & + LL_SBI_32BIT_API); + stx.stx_size = body->mbo_size; + stx.stx_blocks = body->mbo_blocks; + stx.stx_atime.tv_sec = body->mbo_atime; + stx.stx_ctime.tv_sec = body->mbo_ctime; + stx.stx_mtime.tv_sec = body->mbo_mtime; + stx.stx_rdev_major = MAJOR(body->mbo_rdev); + stx.stx_rdev_minor = MINOR(body->mbo_rdev); + stx.stx_dev_major = MAJOR(inode->i_sb->s_dev); + stx.stx_dev_minor = MINOR(inode->i_sb->s_dev); + stx.stx_mask |= STATX_BASIC_STATS; - EXIT; - out_req: - ptlrpc_req_finished(request); - if (filename) - ll_putname(filename); - return rc; - } + /* + * For a striped directory, the size and blocks returned + * from MDT is not correct. + * The size and blocks are aggregated by client across + * all stripes. + * Thus for a striped directory, do not return the valid + * FLSIZE and FLBLOCKS flags to the caller. + * However, this whould be better decided by the MDS + * instead of the client. 
+ */ + if (cmd == LL_IOC_MDC_GETINFO && + ll_i2info(inode)->lli_lsm_md != NULL) + valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + if (flagsp && copy_to_user(flagsp, &valid, + sizeof(*flagsp))) + GOTO(out_req, rc = -EFAULT); + + if (fidp && copy_to_user(fidp, &body->mbo_fid1, + sizeof(*fidp))) + GOTO(out_req, rc = -EFAULT); + + if (!(valid & OBD_MD_FLSIZE)) + stx.stx_mask &= ~STATX_SIZE; + if (!(valid & OBD_MD_FLBLOCKS)) + stx.stx_mask &= ~STATX_BLOCKS; + + if (stxp && copy_to_user(stxp, &stx, sizeof(stx))) + GOTO(out_req, rc = -EFAULT); + + if (lmmsizep && copy_to_user(lmmsizep, &lmmsize, + sizeof(*lmmsizep))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; +out_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + if (filename) + ll_putname(filename); + return rc; + } case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl; + struct if_quotactl *qctl; - OBD_ALLOC_PTR(qctl); - if (!qctl) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(qctl); + if (!qctl) + RETURN(-ENOMEM); if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) - GOTO(out_quotactl, rc = -EFAULT); + GOTO(out_quotactl, rc = -EFAULT); - rc = quotactl_ioctl(sbi, qctl); + rc = quotactl_ioctl(inode->i_sb, qctl); if (rc == 0 && copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) @@ -1683,8 +2004,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (!cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, - sizeof(struct lustre_kernelcomm)); + rc = copy_and_ct_start(cmd, sbi->ll_md_exp, + (struct lustre_kernelcomm __user *)arg); RETURN(rc); case LL_IOC_HSM_COPY_START: { @@ -1726,15 +2047,15 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(rc); } case LL_IOC_MIGRATE: { - char *buf = NULL; - const char *filename; - int namelen = 0; - int len; - int rc; - int mdtidx; + struct lmv_user_md *lum; + char *buf = NULL; + int len; + char *filename; + int namelen = 0; + int rc; rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc < 0) + if (rc) RETURN(rc); data = (struct obd_ioctl_data *)buf; @@ -1744,15 +2065,22 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) filename = data->ioc_inlbuf1; namelen = data->ioc_inllen1; - /* \0 is packed at the end of filename */ - if (namelen < 1 || namelen != strlen(filename) + 1) - GOTO(migrate_free, rc = -EINVAL); - if (data->ioc_inllen2 != sizeof(mdtidx)) + if (namelen < 1 || namelen != strlen(filename) + 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); GOTO(migrate_free, rc = -EINVAL); - mdtidx = *(int *)data->ioc_inlbuf2; + } + + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + if (lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) { + rc = -EINVAL; + CERROR("%s: wrong lum magic %x: rc = %d\n", + filename, lum->lum_magic, rc); + GOTO(migrate_free, rc); + } - rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); + rc = ll_migrate(inode, file, lum, filename); migrate_free: OBD_FREE_LARGE(buf, len); diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 04cc72f451861..22b09065f90f5 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. 
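The new LL_IOC_MDC_GETINFO path above reports attributes through an lstatx_t and, for striped directories where the MDT cannot return aggregated size/blocks, clears the corresponding bits from stx_mask rather than handing back wrong numbers. A minimal sketch of that masking idea against the uapi struct statx (assuming <linux/stat.h> is available); the SRV_HAS_* validity flags are hypothetical stand-ins for OBD_MD_FLSIZE/OBD_MD_FLBLOCKS:

#include <stdint.h>
#include <linux/stat.h>   /* struct statx, STATX_BASIC_STATS, STATX_SIZE, ... */

#define SRV_HAS_SIZE   0x1u   /* hypothetical "server vouches for size" */
#define SRV_HAS_BLOCKS 0x2u

/* Fill the caller-visible statx and drop any field the server could not
 * vouch for, instead of reporting a possibly stale or partial value. */
void fill_statx_reply(struct statx *stx, uint64_t size, uint64_t blocks,
                      unsigned int srv_valid)
{
    stx->stx_mask = STATX_BASIC_STATS;
    stx->stx_size = size;
    stx->stx_blocks = blocks;

    if (!(srv_valid & SRV_HAS_SIZE))
        stx->stx_mask &= ~STATX_SIZE;
    if (!(srv_valid & SRV_HAS_BLOCKS))
        stx->stx_mask &= ~STATX_BLOCKS;
}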
+ * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -45,15 +45,19 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include -#include +#include #include #include "cl_object.h" #include "llite_internal.h" #include "vvp_internal.h" +struct split_param { + struct inode *sp_inode; + __u16 sp_mirror_id; +}; + static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); @@ -95,12 +99,15 @@ static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, op_data->op_attr.ia_mtime = inode->i_mtime; op_data->op_attr.ia_ctime = inode->i_ctime; op_data->op_attr.ia_size = i_size_read(inode); - op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET; + op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME); + op_data->op_xvalid |= OP_XVALID_CTIME_SET; op_data->op_attr_blocks = inode->i_blocks; op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); - op_data->op_handle = och->och_fh; + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_open_handle = och->och_open_handle; if (och->och_flags & FMODE_WRITE && ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED)) @@ -145,20 +152,53 @@ static int ll_close_inode_openhandle(struct inode *inode, ll_prepare_close(inode, op_data, och); switch (bias) { - case MDS_CLOSE_LAYOUT_SWAP: + case MDS_CLOSE_LAYOUT_MERGE: + /* merge blocks from the victim inode */ + op_data->op_attr_blocks += ((struct inode *)data)->i_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + /* fallthrough */ + case MDS_CLOSE_LAYOUT_SPLIT: + case MDS_CLOSE_LAYOUT_SWAP: { + struct split_param *sp = data; + LASSERT(data != NULL); - op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP; + op_data->op_bias |= bias; op_data->op_data_version = 0; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_fid2 = *ll_inode2fid(data); + if (bias == MDS_CLOSE_LAYOUT_SPLIT) { + op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); + op_data->op_mirror_id = sp->sp_mirror_id; + } else { + op_data->op_fid2 = *ll_inode2fid(data); + } + break; + } + + case MDS_CLOSE_RESYNC_DONE: { + struct ll_ioc_lease *ioc = data; + + LASSERT(data != NULL); + op_data->op_attr_blocks += + ioc->lil_count * op_data->op_attr_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_bias |= MDS_CLOSE_RESYNC_DONE; + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_data = &ioc->lil_ids[0]; + op_data->op_data_size = + ioc->lil_count * sizeof(ioc->lil_ids[0]); break; + } case MDS_HSM_RELEASE: LASSERT(data != NULL); op_data->op_bias |= MDS_HSM_RELEASE; op_data->op_data_version = *(__u64 *)data; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; break; default: @@ -166,13 +206,17 @@ static int ll_close_inode_openhandle(struct inode *inode, break; } + if (!(op_data->op_attr.ia_valid & ATTR_SIZE)) + op_data->op_xvalid |= OP_XVALID_LAZYSIZE; + if (!(op_data->op_xvalid & OP_XVALID_BLOCKS)) + op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS; + rc = md_close(md_exp, op_data, och->och_mod, &req); if (rc != 0 && rc != -EINTR) CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", 
md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); - if (rc == 0 && - op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) { + if (rc == 0 && op_data->op_bias & bias) { struct mdt_body *body; body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); @@ -185,7 +229,7 @@ static int ll_close_inode_openhandle(struct inode *inode, out: md_clear_open_replay_data(md_exp, och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; OBD_FREE_PTR(och); ptlrpc_req_finished(req); /* This is close request */ @@ -287,7 +331,9 @@ static int ll_md_close(struct inode *inode, struct file *file) } mutex_unlock(&lli->lli_och_mutex); - if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), + /* LU-4398: do not cache write open lock if the file has exec bit */ + if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) || + !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), LDLM_IBITS, &policy, lockmode, &lockh)) rc = ll_md_real_close(inode, fd->fd_omode); @@ -344,12 +390,146 @@ int ll_file_release(struct inode *inode, struct file *file) RETURN(rc); } +static inline int ll_dom_readpage(void *data, struct page *page) +{ + struct niobuf_local *lnb = data; + void *kaddr; + + kaddr = ll_kmap_atomic(page, KM_USER0); + memcpy(kaddr, lnb->lnb_data, lnb->lnb_len); + if (lnb->lnb_len < PAGE_SIZE) + memset(kaddr + lnb->lnb_len, 0, + PAGE_SIZE - lnb->lnb_len); + flush_dcache_page(page); + SetPageUptodate(page); + ll_kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + + return 0; +} + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct address_space *mapping = inode->i_mapping; + struct page *vmpage; + struct niobuf_remote *rnb; + struct mdt_body *body; + char *data; + unsigned long index, start; + struct niobuf_local lnb; + __u16 refcheck; + int rc; + + ENTRY; + + if (obj == NULL) + RETURN_EXIT; + + if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER)) + RETURN_EXIT; + + rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE); + if (rnb == NULL || rnb->rnb_len == 0) + RETURN_EXIT; + + /* LU-11595: Server may return whole file and that is OK always or + * it may return just file tail and its offset must be aligned with + * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is + * smaller then offset may be not aligned and that data is just ignored. + */ + if (rnb->rnb_offset % PAGE_SIZE) + RETURN_EXIT; + + /* Server returns whole file or just file tail if it fills in reply + * buffer, in both cases total size should be equal to the file size. 
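ll_dom_readpage() above copies the inline Data-on-MDT reply into a page and zero-fills the tail before marking the page up to date. A self-contained sketch of just that fill step; the 4096-byte SKETCH_PAGE_SIZE is a stand-in for the kernel's PAGE_SIZE:

#include <stddef.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096

/* Copy "len" bytes of inline reply data into a page-sized buffer and
 * zero the remainder, mirroring the memcpy()+memset() in ll_dom_readpage().
 * Returns the number of data bytes actually placed in the page. */
size_t fill_page_from_inline(char page[SKETCH_PAGE_SIZE],
                             const char *data, size_t len)
{
    if (len > SKETCH_PAGE_SIZE)
        len = SKETCH_PAGE_SIZE;            /* never overrun a single page */

    memcpy(page, data, len);
    if (len < SKETCH_PAGE_SIZE)
        memset(page + len, 0, SKETCH_PAGE_SIZE - len);

    return len;
}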
+ */ + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) { + CERROR("%s: server returns off/len %llu/%u but size %llu\n", + ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset, + rnb->rnb_len, body->mbo_dom_size); + RETURN_EXIT; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN_EXIT; + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc) + GOTO(out_io, rc); + + CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n", + rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size); + + data = (char *)rnb + sizeof(*rnb); + + lnb.lnb_file_offset = rnb->rnb_offset; + start = lnb.lnb_file_offset / PAGE_SIZE; + index = 0; + LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0); + lnb.lnb_page_offset = 0; + do { + struct cl_page *page; + + lnb.lnb_data = data + (index << PAGE_SHIFT); + lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT); + if (lnb.lnb_len > PAGE_SIZE) + lnb.lnb_len = PAGE_SIZE; + + vmpage = read_cache_page(mapping, index + start, + ll_dom_readpage, &lnb); + if (IS_ERR(vmpage)) { + CWARN("%s: cannot fill page %lu for "DFID + " with data: rc = %li\n", + ll_get_fsname(inode->i_sb, NULL, 0), + index + start, PFID(lu_object_fid(&obj->co_lu)), + PTR_ERR(vmpage)); + break; + } + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + put_page(vmpage); + /* page was truncated */ + break; + } + /* attach VM page to CL page cache */ + page = cl_page_find(env, obj, vmpage->index, vmpage, + CPT_CACHEABLE); + if (IS_ERR(page)) { + ClearPageUptodate(vmpage); + unlock_page(vmpage); + put_page(vmpage); + break; + } + cl_page_export(env, page, 1); + cl_page_put(env, page); + unlock_page(vmpage); + put_page(vmpage); + index++; + } while (rnb->rnb_len > (index << PAGE_SHIFT)); + +out_io: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + EXIT; +} + static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, struct lookup_intent *itp) { struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); struct dentry *parent = de->d_parent; - const char *name = NULL; + char *name = NULL; int len = 0; struct md_op_data *op_data; struct ptlrpc_request *req = NULL; @@ -361,21 +541,43 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, /* if server supports open-by-fid, or file name is invalid, don't pack * name in open request */ - if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && - lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { - name = de->d_name.name; + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) || + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) { +retry: len = de->d_name.len; + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + RETURN(-ENOMEM); + + /* race here */ + spin_lock(&de->d_lock); + if (len != de->d_name.len) { + spin_unlock(&de->d_lock); + kfree(name); + goto retry; + } + memcpy(name, de->d_name.name, len); + name[len] = '\0'; + spin_unlock(&de->d_lock); + + if (!lu_name_is_valid_2(name, len)) { + kfree(name); + RETURN(-ESTALE); + } } op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, name, len, 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) + if (IS_ERR(op_data)) { + kfree(name); RETURN(PTR_ERR(op_data)); + } op_data->op_data = lmm; op_data->op_data_size = lmmsize; rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, &ll_md_blocking_ast, 0); + kfree(name); ll_finish_md_op_data(op_data); if (rc == -ESTALE) { /* reason 
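The open-by-name path above snapshots the dentry name by sizing a buffer from an unlocked read of d_name.len, then re-checking the length under d_lock and retrying if a rename changed it in between. A userspace sketch of that snapshot-with-retry pattern, using a pthread mutex in place of d_lock and hypothetical struct/field names:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct name {
    pthread_mutex_t lock;   /* stands in for d_lock */
    size_t len;
    char *str;
};

/* Size the buffer outside the lock, then re-check the length under the
 * lock; if it changed we lost a race with rename, so free and retry. */
char *name_snapshot(struct name *n)
{
    char *copy;
    size_t len;

retry:
    len = n->len;                   /* unlocked peek, may be stale */
    copy = malloc(len + 1);
    if (!copy)
        return NULL;

    pthread_mutex_lock(&n->lock);
    if (len != n->len) {            /* raced with a rename */
        pthread_mutex_unlock(&n->lock);
        free(copy);
        goto retry;
    }
    memcpy(copy, n->str, len);
    copy[len] = '\0';
    pthread_mutex_unlock(&n->lock);

    return copy;
}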
for keep own exit path - don`t flood log @@ -398,8 +600,25 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, } rc = ll_prep_inode(&de->d_inode, req, NULL, itp); - if (!rc && itp->it_lock_mode) - ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL); + + if (!rc && itp->it_lock_mode) { + __u64 bits = 0; + + /* If we got a lock back and it has a LOOKUP bit set, + * make sure the dentry is marked as valid so we can find it. + * We don't need to care about actual hashing since other bits + * of kernel will deal with that later. + */ + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(de); + + /* if DoM bit returned along with LAYOUT bit then there + * can be read-on-open data returned. + */ + if (bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(de->d_inode, req); + } out: ptlrpc_req_finished(req); @@ -424,7 +643,7 @@ static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, struct mdt_body *body; body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - och->och_fh = body->mbo_handle; + och->och_open_handle = body->mbo_open_handle; och->och_fid = body->mbo_fid1; och->och_lease_handle.cookie = it->it_lock_handle; och->och_magic = OBD_CLIENT_HANDLE_MAGIC; @@ -494,7 +713,7 @@ int ll_file_open(struct inode *inode, struct file *file) fd = ll_file_data_get(); if (fd == NULL) - GOTO(out_openerr, rc = -ENOMEM); + GOTO(out_nofiledata, rc = -ENOMEM); fd->fd_file = file; if (S_ISDIR(inode->i_mode)) @@ -514,12 +733,13 @@ int ll_file_open(struct inode *inode, struct file *file) if (file->f_flags & O_TRUNC) oit.it_flags |= FMODE_WRITE; - /* kernel only call f_op->open in dentry_open. filp_open calls - * dentry_open after call to open_namei that checks permissions. - * Only nfsd_open call dentry_open directly without checking - * permissions and because of that this code below is safe. */ - if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) - oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. + */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; /* We do not want O_EXCL here, presumably we opened the file * already? XXX - NFS implications? */ @@ -663,6 +883,7 @@ int ll_file_open(struct inode *inode, struct file *file) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); } +out_nofiledata: if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { ptlrpc_req_finished(it->it_request); it_clear_disposition(it, DISP_ENQ_OPEN_REF); @@ -700,7 +921,7 @@ static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, * if it has an open lock in cache already. 
*/ static int ll_lease_och_acquire(struct inode *inode, struct file *file, - struct lustre_handle *old_handle) + struct lustre_handle *old_open_handle) { struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); @@ -733,7 +954,7 @@ static int ll_lease_och_acquire(struct inode *inode, struct file *file, *och_p = NULL; } - *old_handle = fd->fd_och->och_fh; + *old_open_handle = fd->fd_och->och_open_handle; EXIT; out_unlock: @@ -794,7 +1015,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, struct ll_sb_info *sbi = ll_i2sbi(inode); struct md_op_data *op_data; struct ptlrpc_request *req = NULL; - struct lustre_handle old_handle = { 0 }; + struct lustre_handle old_open_handle = { 0 }; struct obd_client_handle *och = NULL; int rc; int rc2; @@ -807,7 +1028,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) RETURN(ERR_PTR(-EPERM)); - rc = ll_lease_och_acquire(inode, file, &old_handle); + rc = ll_lease_och_acquire(inode, file, &old_open_handle); if (rc) RETURN(ERR_PTR(rc)); } @@ -822,7 +1043,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, GOTO(out, rc = PTR_ERR(op_data)); /* To tell the MDT this openhandle is from the same owner */ - op_data->op_handle = old_handle; + op_data->op_open_handle = old_open_handle; it.it_flags = fmode | open_flags; it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; @@ -848,7 +1069,9 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, GOTO(out_release_it, rc); LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); - ll_och_fill(sbi->ll_md_exp, &it, och); + rc = ll_och_fill(sbi->ll_md_exp, &it, och); + if (rc) + GOTO(out_release_it, rc); if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ GOTO(out_close, rc = -EOPNOTSUPP); @@ -936,7 +1159,7 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, if (rc == 0) GOTO(out_free_och, rc = -EINVAL); - /* Close the file and swap layouts between inode & inode2. + /* Close the file and {swap,merge} layouts between inode & inode2. * NB: lease lock handle is released in mdc_close_layout_swap_pack() * because we still need it to pack l_remote_handle to MDT. */ rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, @@ -955,8 +1178,10 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, * Release lease and close the file. * It will check if the lease has ever broken. */ -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken) +static int ll_lease_close_intent(struct obd_client_handle *och, + struct inode *inode, + bool *lease_broken, enum mds_op_bias bias, + void *data) { struct ldlm_lock *lock; bool cancelled = true; @@ -971,19 +1196,71 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, LDLM_LOCK_PUT(lock); } - CDEBUG(D_INODE, "lease for "DFID" broken? %d\n", - PFID(&ll_i2info(inode)->lli_fid), cancelled); - - if (!cancelled) - ldlm_cli_cancel(&och->och_lease_handle, 0); + CDEBUG(D_INODE, "lease for "DFID" broken? 
%d, bias: %x\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled, bias); if (lease_broken != NULL) *lease_broken = cancelled; - rc = ll_close_inode_openhandle(inode, och, 0, NULL); + if (!cancelled && !bias) + ldlm_cli_cancel(&och->och_lease_handle, 0); + + if (cancelled) { /* no need to excute intent */ + bias = 0; + data = NULL; + } + + rc = ll_close_inode_openhandle(inode, och, bias, data); RETURN(rc); } +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + return ll_lease_close_intent(och, inode, lease_broken, 0, NULL); +} + +/** + * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT + */ +static int ll_lease_file_resync(struct obd_client_handle *och, + struct inode *inode, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ll_ioc_lease_id ioc; + __u64 data_version_unused; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg, + sizeof(ioc))) + RETURN(-EFAULT); + + /* before starting file resync, it's necessary to clean up page cache + * in client memory, otherwise once the layout version is increased, + * writing back cached data will be denied the OSTs. */ + rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH); + if (rc) + GOTO(out, rc); + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_mirror_id = ioc.lil_mirror_id; + rc = md_file_resync(sbi->ll_md_exp, op_data); + if (rc) + GOTO(out, rc); + + EXIT; +out: + ll_finish_md_op_data(op_data); + return rc; +} + int ll_merge_attr(const struct lu_env *env, struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1023,11 +1300,14 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) ctime = inode->i_ctime.tv_sec; cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE)) + rc = -EINVAL; + else + rc = cl_object_attr_get(env, obj, attr); cl_object_attr_unlock(obj); if (rc != 0) - GOTO(out_size_unlock, rc); + GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc)); if (atime < attr->cat_atime) atime = attr->cat_atime; @@ -1054,6 +1334,32 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) RETURN(rc); } +/** + * Set designated mirror for I/O. + * + * So far only read, write, and truncated can support to issue I/O to + * designated mirror. 
+ */ +void ll_io_set_mirror(struct cl_io *io, const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + /* clear layout version for generic(non-resync) I/O in case it carries + * stale layout version due to I/O restart */ + io->ci_layout_version = 0; + + /* FLR: disable non-delay for designated mirror I/O because obviously + * only one mirror is available */ + if (fd->fd_designated_mirror > 0) { + io->ci_ndelay = 0; + io->ci_designated_mirror = fd->fd_designated_mirror; + io->ci_layout_version = fd->fd_layout_version; + } + + CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n", + file->f_path.dentry->d_name.name, io->ci_designated_mirror); +} + static bool file_is_noatime(const struct file *file) { const struct vfsmount *mnt = file->f_path.mnt; @@ -1081,20 +1387,17 @@ static bool file_is_noatime(const struct file *file) return false; } -static int ll_file_io_ptask(struct cfs_ptask *ptask); - static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) { struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + io->ci_lock_no_expand = fd->ll_lock_no_expand; - memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter)); - init_sync_kiocb(&io->u.ci_rw.rw_iocb, file); - io->u.ci_rw.rw_file = file; - io->u.ci_rw.rw_ptask = ll_file_io_ptask; - io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK); if (iot == CIT_WRITE) { - io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND); - io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC || + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || file->f_flags & O_DIRECT || IS_SYNC(inode)); } @@ -1107,94 +1410,12 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) io->ci_lockreq = CILR_MANDATORY; } io->ci_noatime = file_is_noatime(file); - if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO) - io->ci_pio = !io->u.ci_rw.rw_append; - else - io->ci_pio = 0; -} -static int ll_file_io_ptask(struct cfs_ptask *ptask) -{ - struct cl_io_pt *pt = ptask->pt_cbdata; - struct file *file = pt->cip_file; - struct lu_env *env; - struct cl_io *io; - loff_t pos = pt->cip_pos; - int rc; - __u16 refcheck; - ENTRY; + /* FLR: only use non-delay I/O for read as there is only one + * avaliable mirror for write. */ + io->ci_ndelay = !(iot == CIT_WRITE); - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); - - CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? 
"read" : "write", - pos, pos + pt->cip_count); - -restart: - io = vvp_env_thread_io(env); - ll_io_init(io, file, pt->cip_iot); - io->u.ci_rw.rw_iter = pt->cip_iter; - io->u.ci_rw.rw_iocb = pt->cip_iocb; - io->ci_pio = 0; /* It's already in parallel task */ - - rc = cl_io_rw_init(env, io, pt->cip_iot, pos, - pt->cip_count - pt->cip_result); - if (!rc) { - struct vvp_io *vio = vvp_env_io(env); - - vio->vui_io_subtype = IO_NORMAL; - vio->vui_fd = LUSTRE_FPRIVATE(file); - - ll_cl_add(file, env, io, LCC_RW); - rc = cl_io_loop(env, io); - ll_cl_remove(file, env); - } else { - /* cl_io_rw_init() handled IO */ - rc = io->ci_result; - } - - if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) { - if (io->ci_nob > 0) - io->ci_nob /= 2; - rc = -EIO; - } - - if (io->ci_nob > 0) { - pt->cip_result += io->ci_nob; - iov_iter_advance(&pt->cip_iter, io->ci_nob); - pos += io->ci_nob; - pt->cip_iocb.ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result; -#elif defined(HAVE_KI_NBYTES) - pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result; -#endif - } - - cl_io_fini(env, io); - - if ((rc == 0 || rc == -ENODATA) && - pt->cip_result < pt->cip_count && - io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? "read" : "write", - pos, pos + pt->cip_count - pt->cip_result, - pt->cip_result, rc); - goto restart; - } - - CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? "read" : "write", - pt->cip_result, rc); - - cl_env_put(env, &refcheck); - RETURN(pt->cip_result > 0 ? 0 : rc); + ll_io_set_mirror(io, file); } static ssize_t @@ -1202,45 +1423,43 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, struct file *file, enum cl_io_type iot, loff_t *ppos, size_t count) { - struct range_lock range; struct vvp_io *vio = vvp_env_io(env); struct inode *inode = file_inode(file); struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct range_lock range; struct cl_io *io; - loff_t pos = *ppos; ssize_t result = 0; int rc = 0; + unsigned retried = 0; + bool restarted = false; ENTRY; - CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", + CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", pos, pos + count); + iot == CIT_READ ? "read" : "write", *ppos, count); restart: io = vvp_env_thread_io(env); ll_io_init(io, file, iot); - if (args->via_io_subtype == IO_NORMAL) { - io->u.ci_rw.rw_iter = *args->u.normal.via_iter; - io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb; - } else { - io->ci_pio = 0; - } + io->ci_ndelay_tried = retried; - if (cl_io_rw_init(env, io, iot, pos, count) == 0) { + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { bool range_locked = false; if (file->f_flags & O_APPEND) range_lock_init(&range, 0, LUSTRE_EOF); else - range_lock_init(&range, pos, pos + count - 1); + range_lock_init(&range, *ppos, *ppos + count - 1); vio->vui_fd = LUSTRE_FPRIVATE(file); vio->vui_io_subtype = args->via_io_subtype; switch (vio->vui_io_subtype) { case IO_NORMAL: + vio->vui_iter = args->u.normal.via_iter; + vio->vui_iocb = args->u.normal.via_iocb; /* Direct IO reads must also take range lock, * or multiple reads will try to work on the same pages * See LU-6227 for details. 
*/ @@ -1266,16 +1485,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, } ll_cl_add(file, env, io, LCC_RW); - if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) && - !lli->lli_inode_locked) { - inode_lock(inode); - lli->lli_inode_locked = 1; - } rc = cl_io_loop(env, io); - if (lli->lli_inode_locked) { - lli->lli_inode_locked = 0; - inode_unlock(inode); - } ll_cl_remove(file, env); if (range_locked) { @@ -1291,38 +1501,29 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, if (io->ci_nob > 0) { result += io->ci_nob; count -= io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */ - if (args->via_io_subtype == IO_NORMAL) { - iov_iter_advance(args->u.normal.via_iter, io->ci_nob); - - /* CLIO is too complicated. See LU-11069. */ - if (cl_io_is_append(io)) - pos = io->u.ci_rw.rw_iocb.ki_pos; - else - pos += io->ci_nob; - - args->u.normal.via_iocb->ki_pos = pos; - if (io->ci_pio) { -#ifdef HAVE_KIOCB_KI_LEFT - args->u.normal.via_iocb->ki_left = count; -#elif defined(HAVE_KI_NBYTES) - args->u.normal.via_iocb->ki_nbytes = count; -#endif - } - } else { - /* for splice */ - pos = io->u.ci_rw.rw_range.cir_pos; - } + /* prepare IO restart */ + if (count > 0 && args->via_io_subtype == IO_NORMAL) + args->u.normal.via_iter = vio->vui_iter; } out: cl_io_fini(env, io); + CDEBUG(D_VFSTRACE, + "%s: %d io complete with rc: %d, result: %zd, restart: %d\n", + file->f_path.dentry->d_name.name, + iot, rc, result, io->ci_need_restart); + if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) { CDEBUG(D_VFSTRACE, - "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", - pos, pos + count, result, rc); + "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + *ppos, count, result, rc); + /* preserve the tried count for FLR */ + retried = io->ci_ndelay_tried; + restarted = true; goto restart; } @@ -1346,11 +1547,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, } } - CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc); - - *ppos = pos; + CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); RETURN(result > 0 ? result : rc); } @@ -1391,8 +1588,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, * \retval - number of bytes have been read, or error code if error occurred. */ static ssize_t -ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb, - struct iov_iter *iter) +ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result; @@ -1404,9 +1600,7 @@ ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb, if (iocb->ki_filp->f_flags & O_DIRECT) return 0; - ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW); result = generic_file_read_iter(iocb, iter); - ll_cl_remove(iocb->ki_filp, env); /* If the first page is not in cache, generic_file_aio_read() will be * returned with -ENODATA. 
@@ -1428,34 +1622,101 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct lu_env *env; struct vvp_io_args *args; + struct file *file = iocb->ki_filp; ssize_t result; ssize_t rc2; __u16 refcheck; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); + if (!iov_iter_count(to)) + return 0; - result = ll_do_fast_read(env, iocb, to); + result = ll_do_fast_read(iocb, to); if (result < 0 || iov_iter_count(to) == 0) GOTO(out, result); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + args = ll_env_args(env, IO_NORMAL); args->u.normal.via_iter = to; args->u.normal.via_iocb = iocb; - rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + rc2 = ll_file_io_generic(env, args, file, CIT_READ, &iocb->ki_pos, iov_iter_count(to)); if (rc2 > 0) result += rc2; else if (result == 0) result = rc2; -out: cl_env_put(env, &refcheck); +out: + if (result > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + LUSTRE_FPRIVATE(file), iocb->ki_pos, result, + READ); + return result; } +/** + * Similar trick to ll_do_fast_read, this improves write speed for tiny writes. + * If a page is already in the page cache and dirty (and some other things - + * See ll_tiny_write_begin for the instantiation of these rules), then we can + * write to it without doing a full I/O, because Lustre already knows about it + * and will write it out. This saves a lot of processing time. + * + * All writes here are within one page, so exclusion is handled by the page + * lock on the vm page. We do not do tiny writes for writes which touch + * multiple pages because it's very unlikely multiple sequential pages are + * are already dirty. + * + * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common + * and are unlikely to be to already dirty pages. + * + * Attribute updates are important here, we do them in ll_tiny_write_end. + */ +static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t count = iov_iter_count(iter); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + bool lock_inode = !IS_NOSEC(inode); + ssize_t result = 0; + + ENTRY; + + /* Restrict writes to single page and < PAGE_SIZE. See comment at top + * of function for why. + */ + if (count >= PAGE_SIZE || + (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) + RETURN(0); + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(iocb, iter); + + if (unlikely(lock_inode)) + inode_unlock(inode); + + /* If the page is not already dirty, ll_tiny_write_begin returns + * -ENODATA. We continue on to normal write. + */ + if (result == -ENODATA) + result = 0; + + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES, + result); + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + } + + CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); + + RETURN(result); +} + /* * Write to a file (through the page cache). 
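ll_do_tiny_write() above only takes the shortcut when the write is strictly smaller than a page and does not cross a page boundary; everything else falls through to the normal write path. A sketch of just that qualification test, with PAGE_SIZE fixed at 4096 for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096u

/* True when a write of "count" bytes at "pos" stays within a single page
 * and is smaller than a full page, matching the check in ll_do_tiny_write(). */
bool write_is_tiny(uint64_t pos, size_t count)
{
    if (count >= SKETCH_PAGE_SIZE)
        return false;
    if ((pos & (SKETCH_PAGE_SIZE - 1)) + count > SKETCH_PAGE_SIZE)
        return false;              /* would spill into the next page */
    return true;
}

/* Examples: write_is_tiny(0, 100) and write_is_tiny(4000, 96) are true;
 * write_is_tiny(4000, 97) and write_is_tiny(0, 4096) are false. */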
*/ @@ -1463,9 +1724,30 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct vvp_io_args *args; struct lu_env *env; - ssize_t result; + ssize_t rc_tiny = 0, rc_normal; + struct file *file = iocb->ki_filp; __u16 refcheck; + ENTRY; + + if (!iov_iter_count(from)) + GOTO(out, rc_normal = 0); + + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, we can't do sync writes because tiny writes can't flush + * pages, and we can't do append writes because we can't guarantee the + * required DLM locks are held to protect file size. + */ + if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) && + !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) + rc_tiny = ll_do_tiny_write(iocb, from); + + /* In case of error, go on and try normal write - Only stop if tiny + * write completed I/O. + */ + if (iov_iter_count(from) == 0) + GOTO(out, rc_normal = rc_tiny); + env = cl_env_get(&refcheck); if (IS_ERR(env)) return PTR_ERR(env); @@ -1474,10 +1756,25 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) args->u.normal.via_iter = from; args->u.normal.via_iocb = iocb; - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, - &iocb->ki_pos, iov_iter_count(from)); + rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + + /* On success, combine bytes written. */ + if (rc_tiny >= 0 && rc_normal > 0) + rc_normal += rc_tiny; + /* On error, only return error from normal write if tiny write did not + * write any bytes. Otherwise return bytes written by tiny write. + */ + else if (rc_tiny > 0) + rc_normal = rc_tiny; + cl_env_put(env, &refcheck); - return result; +out: + if (rc_normal > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + LUSTRE_FPRIVATE(file), iocb->ki_pos, + rc_normal, WRITE); + RETURN(rc_normal); } #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER @@ -1524,6 +1821,9 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (result) RETURN(result); + if (!iov_count) + RETURN(0); + # ifdef HAVE_IOV_ITER_INIT_DIRECTION iov_iter_init(&to, READ, iov, nr_segs, iov_count); # else /* !HAVE_IOV_ITER_INIT_DIRECTION */ @@ -1538,30 +1838,26 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct lu_env *env; struct iovec iov = { .iov_base = buf, .iov_len = count }; - struct kiocb *kiocb; + struct kiocb kiocb; ssize_t result; - __u16 refcheck; + ENTRY; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); + if (!count) + RETURN(0); - kiocb = &ll_env_info(env)->lti_kiocb; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb->ki_left = count; + kiocb.ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb->ki_nbytes = count; + kiocb.i_nbytes = count; #endif - result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; + result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; - cl_env_put(env, &refcheck); RETURN(result); } @@ -1581,6 +1877,9 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (result) RETURN(result); + if (!iov_count) + RETURN(0); + # ifdef HAVE_IOV_ITER_INIT_DIRECTION iov_iter_init(&from, WRITE, iov, nr_segs, iov_count); # else /* !HAVE_IOV_ITER_INIT_DIRECTION */ @@ -1595,31 +1894,27 @@ static ssize_t 
ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct lu_env *env; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - struct kiocb *kiocb; - ssize_t result; - __u16 refcheck; - ENTRY; + struct kiocb kiocb; + ssize_t result; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); + ENTRY; + + if (!count) + RETURN(0); - kiocb = &ll_env_info(env)->lti_kiocb; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb->ki_left = count; + kiocb.ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb->ki_nbytes = count; + kiocb.ki_nbytes = count; #endif - result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; + result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; - cl_env_put(env, &refcheck); RETURN(result); } #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ @@ -1647,6 +1942,11 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); cl_env_put(env, &refcheck); + + if (result > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid, + LUSTRE_FPRIVATE(in_file), *ppos, result, + READ); RETURN(result); } @@ -1660,6 +1960,12 @@ int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, int rc; ENTRY; + if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) { + /* this code will only exist for big-endian systems */ + lustre_swab_lov_user_md(lum, 0); + } + ll_inode_size_lock(inode); rc = ll_intent_file_open(dentry, lum, lum_size, &oit); if (rc < 0) @@ -1722,13 +2028,14 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1)) GOTO(out, rc = -EPROTO); - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. - */ - if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { - int stripe_count; + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
+ */ + if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) == + __swab32(LOV_MAGIC_MAGIC)) { + int stripe_count = 0; if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { @@ -1738,27 +2045,19 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, stripe_count = 0; } - /* if function called for directory - we should - * avoid swab not existent lsm objects */ - if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { - lustre_swab_lov_user_md_v1( - (struct lov_user_md_v1 *)lmm); - if (S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v1 *)lmm)->lmm_objects, - stripe_count); - } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { - lustre_swab_lov_user_md_v3( - (struct lov_user_md_v3 *)lmm); - if (S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v3 *)lmm)->lmm_objects, - stripe_count); - } else if (lmm->lmm_magic == - cpu_to_le32(LOV_MAGIC_COMP_V1)) { - lustre_swab_lov_comp_md_v1( - (struct lov_comp_md_v1 *)lmm); - } + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + else if (lmm->lmm_magic == LOV_MAGIC_V3 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); } out: @@ -1845,7 +2144,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, cl_lov_delay_create_clear(&file->f_flags); out: - OBD_FREE(klum, lum_size); + OBD_FREE_LARGE(klum, lum_size); RETURN(rc); } @@ -1888,6 +2187,10 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) struct cl_layout cl = { .cl_is_composite = false, }; + struct lu_extent ext = { + .e_start = 0, + .e_end = OBD_OBJECT_EOF, + }; env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -1895,7 +2198,8 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) rc = cl_object_layout_get(env, obj, &cl); if (!rc && cl.cl_is_composite) - rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF); + rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, + &ext); cl_env_put(env, &refcheck); if (rc) @@ -1989,7 +2293,9 @@ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) if (!och) GOTO(out, rc = -ENOMEM); - ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc) + GOTO(out, rc); rc = ll_close_inode_openhandle(inode, och, 0, NULL); out: @@ -2105,18 +2411,8 @@ int ll_fid2path(struct inode *inode, void __user *arg) RETURN(rc); } -/* - * Read the data_version for inode. - * - * This value is computed using stripe object version on OST. - * Version is computed using server side locking. 
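The getstripe path above decides whether the layout returned by the MDS needs byte-swapping by testing whether its magic matches only after a swab. The real check masks off the version bits with LOV_MAGIC_MASK, which this simplified sketch omits; WIRE_MAGIC is a hypothetical stand-in for the on-wire magic:

#include <stdbool.h>
#include <stdint.h>

#define WIRE_MAGIC 0x0bd00bd0u   /* hypothetical on-wire magic */

static uint32_t bswap32(uint32_t v)
{
    return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
           ((v << 8) & 0x00ff0000u) | (v << 24);
}

/* The payload is foreign-endian when its magic only matches byte-swapped,
 * in which case every multi-byte field must be swabbed before use. */
bool needs_swab(uint32_t magic_as_read)
{
    if (magic_as_read == WIRE_MAGIC)
        return false;                     /* already in host order */
    return magic_as_read == bswap32(WIRE_MAGIC);
}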
- * - * @param flags if do sync on the OST side; - * 0: no sync - * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs - * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs - */ -int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +static int +ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) { struct cl_object *obj = ll_i2info(inode)->lli_clob; struct lu_env *env; @@ -2126,11 +2422,12 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) ENTRY; + ioc->idv_version = 0; + ioc->idv_layout_version = UINT_MAX; + /* If no file object initialized, we consider its version is 0. */ - if (obj == NULL) { - *data_version = 0; + if (obj == NULL) RETURN(0); - } env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -2139,7 +2436,8 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) io = vvp_env_thread_io(env); io->ci_obj = obj; io->u.ci_data_version.dv_data_version = 0; - io->u.ci_data_version.dv_flags = flags; + io->u.ci_data_version.dv_layout_version = UINT_MAX; + io->u.ci_data_version.dv_flags = ioc->idv_flags; restart: if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) @@ -2147,7 +2445,8 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) else result = io->ci_result; - *data_version = io->u.ci_data_version.dv_data_version; + ioc->idv_version = io->u.ci_data_version.dv_data_version; + ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version; cl_io_fini(env, io); @@ -2159,6 +2458,29 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) RETURN(result); } +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +{ + struct ioc_data_version ioc = { .idv_flags = flags }; + int rc; + + rc = ll_ioc_data_version(inode, &ioc); + if (!rc) + *data_version = ioc.idv_version; + + return rc; +} + /* * Trigger a HSM release request for the provided inode. */ @@ -2188,9 +2510,15 @@ int ll_hsm_release(struct inode *inode) if (IS_ERR(env)) GOTO(out, rc = PTR_ERR(env)); - ll_merge_attr(env, inode); + rc = ll_merge_attr(env, inode); cl_env_put(env, &refcheck); + /* If error happen, we have the wrong size for a file. + * Don't release it. + */ + if (rc != 0) + GOTO(out, rc); + /* Release the file. * NB: lease lock handle is released in mdc_hsm_release_pack() because * we still need it to pack l_remote_handle to MDT. 
*/ @@ -2323,8 +2651,9 @@ static int ll_swap_layouts(struct file *file1, struct file *file2, int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) { - struct md_op_data *op_data; - int rc; + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + int rc; ENTRY; /* Detect out-of range masks */ @@ -2337,18 +2666,20 @@ int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) !cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - /* Detect out-of range archive id */ - if ((hss->hss_valid & HSS_ARCHIVE_ID) && - (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE)) - RETURN(-EINVAL); + if (!exp_connect_archive_id_array(exp)) { + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE)) + RETURN(-EINVAL); + } op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, LUSTRE_OPC_ANY, hss); if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); - rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data), + op_data, NULL); ll_finish_md_op_data(op_data); @@ -2399,7 +2730,7 @@ static int ll_hsm_import(struct inode *inode, struct file *file, inode_lock(inode); - rc = ll_setattr_raw(file_dentry(file), attr, true); + rc = ll_setattr_raw(file_dentry(file), attr, 0, true); if (rc == -ENODATA) rc = 0; @@ -2427,7 +2758,7 @@ static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET, + ATTR_CTIME, .ia_atime = { .tv_sec = lfu->lfu_atime_sec, .tv_nsec = lfu->lfu_atime_nsec, @@ -2451,12 +2782,197 @@ static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) RETURN(-EINVAL); inode_lock(inode); - rc = ll_setattr_raw(file_dentry(file), &ia, false); + rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET, + false); inode_unlock(inode); RETURN(rc); } +static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode) +{ + switch (mode) { + case MODE_READ_USER: + return CLM_READ; + case MODE_WRITE_USER: + return CLM_WRITE; + default: + return -EINVAL; + } +} + +static const char *const user_lockname[] = LOCK_MODE_NAMES; + +/* Used to allow the upper layers of the client to request an LDLM lock + * without doing an actual read or write. + * + * Used for ladvise lockahead to manually request specific locks. + * + * \param[in] file file this ladvise lock request is on + * \param[in] ladvise ladvise struct describing this lock request + * + * \retval 0 success, no detailed result available (sync requests + * and requests sent to the server [not handled locally] + * cannot return detailed results) + * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request, + * see definitions for details. 
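ll_hsm_state_set() above only enforces the legacy archive-id upper bound when the server did not advertise support for archive-id arrays, i.e. the input validation is gated on a negotiated connection capability. A tiny sketch of that idea; LEGACY_MAX_ARCHIVE is a made-up stand-in for LL_HSM_ORIGIN_MAX_ARCHIVE:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

#define LEGACY_MAX_ARCHIVE 32u   /* hypothetical legacy upper bound */

/* Only apply the old range check when the peer lacks the capability that
 * makes larger archive ids meaningful. */
int check_archive_id(bool server_has_id_array, uint32_t archive_id)
{
    if (!server_has_id_array && archive_id > LEGACY_MAX_ARCHIVE)
        return -EINVAL;
    return 0;
}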
+ * \retval negative negative errno on error + */ +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + enum cl_lock_mode cl_mode; + off_t start = ladvise->lla_start; + off_t end = ladvise->lla_end; + int result; + __u16 refcheck; + + ENTRY; + + CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s " + "start=%llu, end=%llu\n", dentry->d_name.len, + dentry->d_name.name, dentry->d_inode, + user_lockname[ladvise->lla_lockahead_mode], (__u64) start, + (__u64) end); + + cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode); + if (cl_mode < 0) + GOTO(out, result = cl_mode); + + /* Get IO environment */ + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + GOTO(out, result); + + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens when + * stripe sub-object's are not yet created. + */ + result = io->ci_result; + } else if (result == 0) { + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + + descr->cld_obj = io->ci_obj; + /* Convert byte offsets to pages */ + descr->cld_start = cl_index(io->ci_obj, start); + descr->cld_end = cl_index(io->ci_obj, end); + descr->cld_mode = cl_mode; + /* CEF_MUST is used because we do not want to convert a + * lockahead request to a lockless lock */ + descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND | + CEF_NONBLOCK; + + if (ladvise->lla_peradvice_flags & LF_ASYNC) + descr->cld_enq_flags |= CEF_SPECULATIVE; + + result = cl_lock_request(env, io, lock); + + /* On success, we need to release the lock */ + if (result >= 0) + cl_lock_release(env, lock); + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * We convert them to positive values for userspace to make + * recognizing true errors easier. + * Note we can only return these detailed results on async requests, + * as sync requests look the same as i/o requests for locking. 
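As the comment above explains, a lockahead request that finds an existing lock reports the detail as -ECANCELED (matching lock, different extent) or -EEXIST (matching lock, same extent), and the tail of ll_file_lock_ahead() folds these into positive LLA_RESULT_* codes so userspace can tell them apart from real errors. A standalone sketch of that conversion, with hypothetical RESULT_* values:

#include <errno.h>

#define RESULT_DIFFERENT 1   /* hypothetical stand-ins for LLA_RESULT_* */
#define RESULT_SAME      2

/* Fold the two "a lock was already there" outcomes into positive codes,
 * leaving 0 and genuine negative errnos untouched. */
int lockahead_result_to_user(int result)
{
    if (result == -ECANCELED)
        return RESULT_DIFFERENT;   /* matching lock, different extent */
    if (result == -EEXIST)
        return RESULT_SAME;        /* matching lock, same extent */
    return result;
}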
*/ + if (result == -ECANCELED) + result = LLA_RESULT_DIFFERENT; + else if (result == -EEXIST) + result = LLA_RESULT_SAME; + +out: + RETURN(result); +} +static const char *const ladvise_names[] = LU_LADVISE_NAMES; + +static int ll_ladvise_sanity(struct inode *inode, + struct llapi_lu_ladvise *ladvise) +{ + enum lu_ladvise_type advice = ladvise->lla_advice; + /* Note the peradvice flags is a 32 bit field, so per advice flags must + * be in the first 32 bits of enum ladvise_flags */ + __u32 flags = ladvise->lla_peradvice_flags; + /* 3 lines at 80 characters per line, should be plenty */ + int rc = 0; + + if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized," + "last supported advice is %s (value '%d'): rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), advice, + ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc); + GOTO(out, rc); + } + + /* Per-advice checks */ + switch (advice) { + case LU_LADVISE_LOCKNOEXPAND: + if (flags & ~LF_LOCKNOEXPAND_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + case LU_LADVISE_LOCKAHEAD: + /* Currently only READ and WRITE modes can be requested */ + if (ladvise->lla_lockahead_mode >= MODE_MAX_USER || + ladvise->lla_lockahead_mode == 0) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + ladvise->lla_lockahead_mode, + ladvise_names[advice], rc); + GOTO(out, rc); + } + /* fallthrough */ + case LU_LADVISE_WILLREAD: + case LU_LADVISE_DONTNEED: + default: + /* Note fall through above - These checks apply to all advices + * except LOCKNOEXPAND */ + if (flags & ~LF_DEFAULT_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + if (ladvise->lla_start >= ladvise->lla_end) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) " + "for %s: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + ladvise->lla_start, ladvise->lla_end, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + } + +out: + return rc; +} +#undef ERRSIZE + /* * Give file access advices * @@ -2506,6 +3022,15 @@ static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, RETURN(rc); } +static int ll_lock_noexpand(struct file *file, int flags) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + fd->ll_lock_no_expand = !(flags & LF_UNSET); + + return 0; +} + int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg) { @@ -2516,64 +3041,287 @@ int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, sizeof(fsxattr))) RETURN(-EFAULT); - fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags); + fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags); + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) + fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT; fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; if (copy_to_user((struct fsxattr __user *)arg, &fsxattr, sizeof(fsxattr))) RETURN(-EFAULT); - RETURN(0); + RETURN(0); +} + +int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. 
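The lockahead path above is normally driven from userspace through the LL_IOC_LADVISE ioctl handled later in ll_file_ioctl(). Below is a minimal caller sketch; it is illustrative only, not part of this patch, and assumes the llapi_ladvise_hdr/llapi_lu_ladvise definitions from Lustre's user API header plus a file descriptor opened on a Lustre file.

/* Illustrative sketch only -- not part of this change. Assumes the Lustre
 * uapi definitions (llapi_ladvise_hdr, llapi_lu_ladvise, LL_IOC_LADVISE)
 * are available to the caller. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

static int request_write_lockahead(int fd, __u64 start, __u64 end)
{
	/* header plus exactly one advice record, as ll_file_ioctl() expects */
	union {
		struct llapi_ladvise_hdr hdr;
		char pad[sizeof(struct llapi_ladvise_hdr) +
			 sizeof(struct llapi_lu_ladvise)];
	} buf;
	struct llapi_lu_ladvise *adv = &buf.hdr.lah_advise[0];
	int rc;

	memset(&buf, 0, sizeof(buf));
	buf.hdr.lah_magic = LADVISE_MAGIC;
	buf.hdr.lah_count = 1;

	adv->lla_advice = LU_LADVISE_LOCKAHEAD;
	adv->lla_lockahead_mode = MODE_WRITE_USER;	/* CLM_WRITE on the client */
	adv->lla_peradvice_flags = LF_ASYNC;		/* speculative enqueue */
	adv->lla_start = start;				/* byte offsets, end exclusive */
	adv->lla_end = end;

	rc = ioctl(fd, LL_IOC_LADVISE, &buf.hdr);
	if (rc < 0)
		return rc;

	/* ll_file_lock_ahead() reports per-advice detail in lla_lockahead_result */
	if (adv->lla_lockahead_result == LLA_RESULT_SAME)
		printf("matching lock on the same extent already present\n");
	else if (adv->lla_lockahead_result == LLA_RESULT_DIFFERENT)
		printf("matching lock with a different extent already present\n");

	return 0;
}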
Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (ll_i2info(inode)->lli_projid != fa->fsx_projid) + return -EINVAL; + + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct fsxattr fsxattr; + struct cl_object *obj; + struct iattr *attr; + int flags; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + rc = ll_ioctl_check_project(inode, &fsxattr); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags); + op_data->op_attr_flags = ll_inode_to_ext_flags(flags); + if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_projid = fsxattr.fsx_projid; + op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, + 0, &req); + ptlrpc_req_finished(req); + if (rc) + GOTO(out_fsxattr, rc); + ll_update_inode_flags(inode, op_data->op_attr_flags); + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + GOTO(out_fsxattr, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out_fsxattr, rc = -ENOMEM); + + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, + fsxattr.fsx_xflags); + OBD_FREE_PTR(attr); +out_fsxattr: + ll_finish_md_op_data(op_data); + RETURN(rc); +} + +static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + struct split_param sp; + bool lease_broken; + fmode_t fmode = 0; + enum mds_op_bias bias = 0; + struct file *layout_file = NULL; + void *data = NULL; + size_t data_size = 0; + long rc; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + GOTO(out, rc = -ENOLCK); + + fmode = och->och_flags; + + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (ioc->lil_count > IOC_IDS_MAX) + GOTO(out, rc = -EINVAL); + + data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]); + OBD_ALLOC(data, data_size); + if (!data) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(data, (void __user *)arg, data_size)) + GOTO(out, rc = -EFAULT); + + bias = MDS_CLOSE_RESYNC_DONE; + break; + case LL_LEASE_LAYOUT_MERGE: { + int fd; + + if (ioc->lil_count != 1) + GOTO(out, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + layout_file = fget(fd); + if (!layout_file) + GOTO(out, rc = -EBADF); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY || + (layout_file->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + data = file_inode(layout_file); + bias = MDS_CLOSE_LAYOUT_MERGE; + break; + } + case LL_LEASE_LAYOUT_SPLIT: { + int fdv; + int mirror_id; + + if (ioc->lil_count != 2) + GOTO(out, rc = -EINVAL); + + arg 
+= sizeof(*ioc); + if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + arg += sizeof(__u32); + if (copy_from_user(&mirror_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + layout_file = fget(fdv); + if (!layout_file) + GOTO(out, rc = -EBADF); + + sp.sp_inode = file_inode(layout_file); + sp.sp_mirror_id = (__u16)mirror_id; + data = &sp; + bias = MDS_CLOSE_LAYOUT_SPLIT; + break; + } + default: + /* without close intent */ + break; + } + + rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data); + if (rc < 0) + GOTO(out, rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + GOTO(out, rc); + + if (lease_broken) + fmode = 0; + EXIT; + +out: + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (data) + OBD_FREE(data, data_size); + break; + case LL_LEASE_LAYOUT_MERGE: + case LL_LEASE_LAYOUT_SPLIT: + if (layout_file) + fput(layout_file); + break; + } + + if (!rc) + rc = ll_lease_type_from_fmode(fmode); + RETURN(rc); } -int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, - unsigned long arg) +static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) { + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle *och = NULL; + __u64 open_flags = 0; + bool lease_broken; + fmode_t fmode; + long rc; + ENTRY; - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; - struct fsxattr fsxattr; - struct cl_object *obj; - - /* only root could change project ID */ - if (!cfs_capable(CFS_CAP_SYS_ADMIN)) - RETURN(-EPERM); - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); - - if (copy_from_user(&fsxattr, - (const struct fsxattr __user *)arg, - sizeof(fsxattr))) - GOTO(out_fsxattr1, rc = -EFAULT); + switch (ioc->lil_mode) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + RETURN(ll_file_unlock_lease(file, ioc, arg)); + default: + RETURN(-EINVAL); + } - op_data->op_attr_flags = fsxattr.fsx_xflags; - op_data->op_projid = fsxattr.fsx_projid; - op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG); - rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, - 0, &req); - ptlrpc_req_finished(req); + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); - obj = ll_i2info(inode)->lli_clob; - if (obj) { - struct iattr *attr; + /* apply for lease */ + if (ioc->lil_flags & LL_LEASE_RESYNC) + open_flags = MDS_OPEN_RESYNC; + och = ll_lease_open(inode, file, fmode, open_flags); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); - inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags); - OBD_ALLOC_PTR(attr); - if (attr == NULL) - GOTO(out_fsxattr1, rc = -ENOMEM); - attr->ia_valid = ATTR_ATTR_FLAG; - rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags); + if (ioc->lil_flags & LL_LEASE_RESYNC) { + rc = ll_lease_file_resync(och, inode, arg); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + rc = ll_layout_refresh(inode, &fd->fd_layout_version); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + } - OBD_FREE_PTR(attr); + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + 
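As a rough userspace sketch (not part of this patch), the close-intent path in ll_file_unlock_lease() above expects the ll_ioc_lease header to be followed directly by the intent-specific ids; for LL_LEASE_LAYOUT_SPLIT these are the victim file descriptor and the mirror id, sent after a write lease was taken on the file:

/* Illustrative sketch only -- not part of this change; assumes the
 * ll_ioc_lease uapi layout used above (fixed header, then lil_ids[]). */
static int split_mirror(int fd, int victim_fd, __u32 mirror_id)
{
	char buf[sizeof(struct ll_ioc_lease) + 2 * sizeof(__u64)];
	struct ll_ioc_lease *ioc = (struct ll_ioc_lease *)buf;

	memset(buf, 0, sizeof(buf));
	ioc->lil_mode = LL_LEASE_UNLCK;		/* release lease with intent */
	ioc->lil_flags = LL_LEASE_LAYOUT_SPLIT;
	ioc->lil_count = 2;
	ioc->lil_ids[0] = victim_fd;	/* read back as fdv in ll_file_unlock_lease() */
	ioc->lil_ids[1] = mirror_id;

	return ioctl(fd, LL_IOC_SET_LEASE, ioc);
}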
mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; } -out_fsxattr1: - ll_finish_md_op_data(op_data); RETURN(rc); - - } static long @@ -2586,15 +3334,15 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", PFID(ll_inode2fid(inode)), inode, cmd); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - RETURN(-ENOTTY); + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); - switch(cmd) { - case LL_IOC_GETFLAGS: - /* Get the current value of the file flags */ + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ return put_user(fd->fd_flags, (int __user *)arg); case LL_IOC_SETFLAGS: case LL_IOC_CLRFLAGS: @@ -2647,9 +3395,6 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ll_inode_info *lli; struct obd_client_handle *och = NULL; - if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) - GOTO(out, rc = -EINVAL); - lli = ll_i2info(inode); mutex_lock(&lli->lli_och_mutex); if (fd->fd_lease_och != NULL) { @@ -2671,12 +3416,18 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_LOV_GETSTRIPE: case LL_IOC_LOV_GETSTRIPE_NEW: RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - RETURN(ll_iocontrol(inode, file, cmd, arg)); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. */ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + case LL_IOC_GROUP_LOCK: RETURN(ll_get_grouplock(inode, file, arg)); case LL_IOC_GROUP_UNLOCK: @@ -2684,12 +3435,6 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case IOC_OBD_STATFS: RETURN(ll_obd_statfs(inode, (void __user *)arg)); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. 
- case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ case LL_IOC_FLUSHCTX: RETURN(ll_flush_ctx(inode)); case LL_IOC_PATH2FID: { @@ -2712,7 +3457,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(-EFAULT); idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; - rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags); + rc = ll_ioc_data_version(inode, &idv); if (rc == 0 && copy_to_user((char __user *)arg, &idv, sizeof(idv))) @@ -2806,71 +3551,18 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE_PTR(hca); RETURN(rc); } - case LL_IOC_SET_LEASE: { - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle *och = NULL; - bool lease_broken; - fmode_t fmode; - - switch (arg) { - case LL_LEASE_WRLCK: - if (!(file->f_mode & FMODE_WRITE)) - RETURN(-EPERM); - fmode = FMODE_WRITE; - break; - case LL_LEASE_RDLCK: - if (!(file->f_mode & FMODE_READ)) - RETURN(-EPERM); - fmode = FMODE_READ; - break; - case LL_LEASE_UNLCK: - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och != NULL) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - - if (och == NULL) - RETURN(-ENOLCK); - - fmode = och->och_flags; - rc = ll_lease_close(och, inode, &lease_broken); - if (rc < 0) - RETURN(rc); - - rc = ll_lease_och_release(inode, file); - if (rc < 0) - RETURN(rc); - - if (lease_broken) - fmode = 0; + case LL_IOC_SET_LEASE_OLD: { + struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg }; - RETURN(ll_lease_type_from_fmode(fmode)); - default: - RETURN(-EINVAL); - } - - CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + RETURN(ll_file_set_lease(file, &ioc, 0)); + } + case LL_IOC_SET_LEASE: { + struct ll_ioc_lease ioc; - /* apply for lease */ - och = ll_lease_open(inode, file, fmode, 0); - if (IS_ERR(och)) - RETURN(PTR_ERR(och)); + if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc))) + RETURN(-EFAULT); - rc = 0; - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och == NULL) { - fd->fd_lease_och = och; - och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (och != NULL) { - /* impossible now that only excl is supported for now */ - ll_lease_close(och, inode, &lease_broken); - rc = -EBUSY; - } - RETURN(rc); + RETURN(ll_file_set_lease(file, &ioc, arg)); } case LL_IOC_GET_LEASE: { struct ll_inode_info *lli = ll_i2info(inode); @@ -2923,55 +3615,92 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(ll_file_futimes_3(file, &lfu)); } case LL_IOC_LADVISE: { - struct llapi_ladvise_hdr *ladvise_hdr; + struct llapi_ladvise_hdr *k_ladvise_hdr; + struct llapi_ladvise_hdr __user *u_ladvise_hdr; int i; int num_advise; - int alloc_size = sizeof(*ladvise_hdr); + int alloc_size = sizeof(*k_ladvise_hdr); rc = 0; - OBD_ALLOC_PTR(ladvise_hdr); - if (ladvise_hdr == NULL) + u_ladvise_hdr = (void __user *)arg; + OBD_ALLOC_PTR(k_ladvise_hdr); + if (k_ladvise_hdr == NULL) RETURN(-ENOMEM); - if (copy_from_user(ladvise_hdr, - (const struct llapi_ladvise_hdr __user *)arg, - alloc_size)) + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) GOTO(out_ladvise, rc = -EFAULT); - if (ladvise_hdr->lah_magic != LADVISE_MAGIC || - ladvise_hdr->lah_count < 1) + if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC || + k_ladvise_hdr->lah_count < 1) GOTO(out_ladvise, rc = -EINVAL); - num_advise = ladvise_hdr->lah_count; + num_advise = k_ladvise_hdr->lah_count; if (num_advise >= LAH_COUNT_MAX) GOTO(out_ladvise, rc = -EFBIG); - OBD_FREE_PTR(ladvise_hdr); - alloc_size = 
offsetof(typeof(*ladvise_hdr), + OBD_FREE_PTR(k_ladvise_hdr); + alloc_size = offsetof(typeof(*k_ladvise_hdr), lah_advise[num_advise]); - OBD_ALLOC(ladvise_hdr, alloc_size); - if (ladvise_hdr == NULL) + OBD_ALLOC(k_ladvise_hdr, alloc_size); + if (k_ladvise_hdr == NULL) RETURN(-ENOMEM); /* * TODO: submit multiple advices to one server in a single RPC */ - if (copy_from_user(ladvise_hdr, - (const struct llapi_ladvise_hdr __user *)arg, - alloc_size)) + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) GOTO(out_ladvise, rc = -EFAULT); for (i = 0; i < num_advise; i++) { - rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags, - &ladvise_hdr->lah_advise[i]); + struct llapi_lu_ladvise *k_ladvise = + &k_ladvise_hdr->lah_advise[i]; + struct llapi_lu_ladvise __user *u_ladvise = + &u_ladvise_hdr->lah_advise[i]; + + rc = ll_ladvise_sanity(inode, k_ladvise); if (rc) + GOTO(out_ladvise, rc); + + switch (k_ladvise->lla_advice) { + case LU_LADVISE_LOCKNOEXPAND: + rc = ll_lock_noexpand(file, + k_ladvise->lla_peradvice_flags); + GOTO(out_ladvise, rc); + case LU_LADVISE_LOCKAHEAD: + + rc = ll_file_lock_ahead(file, k_ladvise); + + if (rc < 0) + GOTO(out_ladvise, rc); + + if (put_user(rc, + &u_ladvise->lla_lockahead_result)) + GOTO(out_ladvise, rc = -EFAULT); + break; + default: + rc = ll_ladvise(inode, file, + k_ladvise_hdr->lah_flags, + k_ladvise); + if (rc) + GOTO(out_ladvise, rc); break; + } + } out_ladvise: - OBD_FREE(ladvise_hdr, alloc_size); + OBD_FREE(k_ladvise_hdr, alloc_size); RETURN(rc); } + case LL_IOC_FLR_SET_MIRROR: { + /* mirror I/O must be direct to avoid polluting page cache + * by stale data. */ + if (!(file->f_flags & O_DIRECT)) + RETURN(-EINVAL); + + fd->fd_designated_mirror = (__u32)arg; + RETURN(0); + } case LL_IOC_FSGETXATTR: RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); case LL_IOC_FSSETXATTR: @@ -3160,7 +3889,6 @@ int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); - bool lock_inode; #elif defined(HAVE_FILE_FSYNC_2ARGS) int ll_fsync(struct file *file, int datasync) { @@ -3185,9 +3913,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) #ifdef HAVE_FILE_FSYNC_4ARGS rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - lock_inode = !lli->lli_inode_locked; - if (lock_inode) - inode_lock(inode); + inode_lock(inode); #else /* fsync's caller has already called _fdata{sync,write}, we want * that IO to finish before calling the osc and mdc sync methods */ @@ -3227,8 +3953,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) } #ifdef HAVE_FILE_FSYNC_4ARGS - if (lock_inode) - inode_unlock(inode); + inode_unlock(inode); #endif RETURN(rc); } @@ -3412,48 +4137,61 @@ int ll_get_fid_by_name(struct inode *parent, const char *name, RETURN(rc); } -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen) +int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, + const char *name) { - struct dentry *dchild = NULL; - struct inode *child_inode = NULL; - struct md_op_data *op_data; + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; struct ptlrpc_request *request = NULL; struct obd_client_handle *och = NULL; - struct qstr qstr; - struct mdt_body *body; - int rc; - __u64 data_version = 0; + struct qstr qstr; + struct mdt_body *body; + __u64 data_version = 0; + size_t namelen = strlen(name); + int 
lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic); + int rc; ENTRY; - CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n", - name, PFID(ll_inode2fid(parent)), mdtidx); + CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n", + PFID(ll_inode2fid(parent)), name, + lum->lum_stripe_offset, lum->lum_stripe_count); - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + lustre_swab_lmv_user_md(lum); /* Get child FID first */ qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); qstr.name = name; qstr.len = namelen; dchild = d_lookup(file_dentry(file), &qstr); - if (dchild != NULL) { - if (dchild->d_inode != NULL) + if (dchild) { + if (dchild->d_inode) child_inode = igrab(dchild->d_inode); dput(dchild); } - if (child_inode == NULL) { - rc = ll_get_fid_by_name(parent, name, namelen, - &op_data->op_fid3, &child_inode); - if (rc != 0) - GOTO(out_free, rc); + if (!child_inode) { + rc = ll_get_fid_by_name(parent, name, namelen, NULL, + &child_inode); + if (rc) + RETURN(rc); } - if (child_inode == NULL) - GOTO(out_free, rc = -EINVAL); + if (!child_inode) + RETURN(-ENOENT); + + if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & + OBD_CONNECT2_DIR_MIGRATE)) { + if (le32_to_cpu(lum->lum_stripe_count) > 1 || + ll_i2info(child_inode)->lli_lsm_md) { + CERROR("%s: MDT doesn't support stripe directory " + "migration!\n", + ll_get_fsname(parent->i_sb, NULL, 0)); + GOTO(out_iput, rc = -EOPNOTSUPP); + } + } /* * lfs migrate command needs to be blocked on the client @@ -3463,6 +4201,11 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, if (child_inode == parent->i_sb->s_root->d_inode) GOTO(out_iput, rc = -EINVAL); + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + child_inode->i_mode, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out_iput, rc = PTR_ERR(op_data)); + inode_lock(child_inode); op_data->op_fid3 = *ll_inode2fid(child_inode); if (!fid_is_sane(&op_data->op_fid3)) { @@ -3472,15 +4215,10 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, GOTO(out_unlock, rc = -EINVAL); } - rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); - if (rc < 0) - GOTO(out_unlock, rc); + op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA; + op_data->op_data = lum; + op_data->op_data_size = lumlen; - if (rc == mdtidx) { - CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name, - PFID(&op_data->op_fid3), mdtidx); - GOTO(out_unlock, rc = 0); - } again: if (S_ISREG(child_inode->i_mode)) { och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); @@ -3495,17 +4233,18 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, if (rc != 0) GOTO(out_close, rc); - op_data->op_handle = och->och_fh; - op_data->op_data = och->och_mod; + op_data->op_open_handle = och->och_open_handle; op_data->op_data_version = data_version; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_bias |= MDS_RENAME_MIGRATE; + op_data->op_bias |= MDS_CLOSE_MIGRATE; + + spin_lock(&och->och_mod->mod_open_req->rq_lock); + och->och_mod->mod_open_req->rq_replay = 0; + spin_unlock(&och->och_mod->mod_open_req->rq_lock); } - op_data->op_mds = mdtidx; - op_data->op_cli_flags = CLI_MIGRATE; - rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, - namelen, name, namelen, &request); + rc = 
md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen, + name, namelen, &request); if (rc == 0) { LASSERT(request != NULL); ll_update_times(request, parent); @@ -3515,12 +4254,11 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, /* If the server does release layout lock, then we cleanup * the client och here, otherwise release it in out_close: */ - if (och != NULL && - body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { obd_mod_put(och->och_mod); md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; OBD_FREE_PTR(och); och = NULL; } @@ -3536,16 +4274,15 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, goto again; out_close: - if (och != NULL) /* close the file */ + if (och) ll_lease_close(och, child_inode, NULL); - if (rc == 0) + if (!rc) clear_nlink(child_inode); out_unlock: inode_unlock(child_inode); + ll_finish_md_op_data(op_data); out_iput: iput(child_inode); -out_free: - ll_finish_md_op_data(op_data); RETURN(rc); } @@ -3586,7 +4323,7 @@ int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode) ldlm_lockname[mode]); flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; - for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { + for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) { policy.l_inodebits.bits = *bits & (1 << i); if (policy.l_inodebits.bits == 0) continue; @@ -3653,105 +4390,81 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) return rc; } -static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op) { - struct inode *inode = dentry->d_inode; - struct ptlrpc_request *req = NULL; - struct obd_export *exp; - int rc = 0; - ENTRY; - - LASSERT(inode != NULL); + struct inode *parent; + struct inode *inode = dentry->d_inode; + struct obd_export *exp = ll_i2mdexp(inode); + struct lookup_intent oit = { + .it_op = op, + }; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + const char *name = NULL; + size_t namelen = 0; + int rc = 0; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); - exp = ll_i2mdexp(inode); - - /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. - * But under CMD case, it caused some lock issues, should be fixed - * with new CMD ibits lock. See bug 12718 */ - if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { - struct lookup_intent oit = { .it_op = IT_GETATTR }; - struct md_op_data *op_data; - - if (ibits == MDS_INODELOCK_LOOKUP) - oit.it_op = IT_LOOKUP; - - /* Call getattr by fid, so do not provide name at all. */ - op_data = ll_prep_md_op_data(NULL, dentry->d_inode, - dentry->d_inode, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); - - rc = md_intent_lock(exp, op_data, &oit, &req, - &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc < 0) { - rc = ll_inode_revalidate_fini(inode, rc); - GOTO (out, rc); - } - - rc = ll_revalidate_it_finish(req, &oit, dentry); - if (rc != 0) { - ll_intent_release(&oit); - GOTO(out, rc); - } - - /* Unlinked? Unhash dentry, so it is not picked up later by - do_lookup() -> ll_revalidate_it(). We cannot use d_drop - here to preserve get_cwd functionality on 2.6. 
- Bug 10503 */ - if (!dentry->d_inode->i_nlink) { - ll_lock_dcache(inode); - d_lustre_invalidate(dentry, 0); - ll_unlock_dcache(inode); - } + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) { + parent = dentry->d_parent->d_inode; + name = dentry->d_name.name; + namelen = dentry->d_name.len; + } else { + parent = inode; + } - ll_lookup_finish_locks(&oit, dentry); - } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) { - struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); - u64 valid = OBD_MD_FLGETATTR; - struct md_op_data *op_data; - int ealen = 0; + op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); - if (S_ISREG(inode->i_mode)) { - rc = ll_get_default_mdsize(sbi, &ealen); - if (rc) - RETURN(rc); - valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; - } + /* Call getattr by fid */ + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) + op_data->op_flags = MF_GETATTR_BY_FID; + rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO(out, rc); + } - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, ealen, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } - op_data->op_valid = valid; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) { - rc = ll_inode_revalidate_fini(inode, rc); - RETURN(rc); - } + /* Unlinked? Unhash dentry, so it is not picked up later by + * do_lookup() -> ll_revalidate_it(). We cannot use d_drop + * here to preserve get_cwd functionality on 2.6. + * Bug 10503 */ + if (!dentry->d_inode->i_nlink) { + ll_lock_dcache(inode); + d_lustre_invalidate(dentry, 0); + ll_unlock_dcache(inode); + } - rc = ll_prep_inode(&inode, req, NULL, NULL); - } + ll_lookup_finish_locks(&oit, dentry); out: - ptlrpc_req_finished(req); - return rc; + ptlrpc_req_finished(req); + + return rc; } static int ll_merge_md_attr(struct inode *inode) { + struct ll_inode_info *lli = ll_i2info(inode); struct cl_attr attr = { 0 }; int rc; - LASSERT(ll_i2info(inode)->lli_lsm_md != NULL); + LASSERT(lli->lli_lsm_md != NULL); + down_read(&lli->lli_lsm_sem); rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, &attr, ll_md_blocking_ast); + up_read(&lli->lli_lsm_sem); if (rc != 0) RETURN(rc); @@ -3766,43 +4479,6 @@ static int ll_merge_md_attr(struct inode *inode) RETURN(0); } -static int -ll_inode_revalidate(struct dentry *dentry, __u64 ibits) -{ - struct inode *inode = dentry->d_inode; - int rc; - ENTRY; - - rc = __ll_inode_revalidate(dentry, ibits); - if (rc != 0) - RETURN(rc); - - /* if object isn't regular file, don't validate size */ - if (!S_ISREG(inode->i_mode)) { - if (S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md != NULL) { - rc = ll_merge_md_attr(inode); - if (rc != 0) - RETURN(rc); - } - - inode->i_atime.tv_sec = ll_i2info(inode)->lli_atime; - inode->i_mtime.tv_sec = ll_i2info(inode)->lli_mtime; - inode->i_ctime.tv_sec = ll_i2info(inode)->lli_ctime; - } else { - /* In case of restore, the MDT has the right size and has - * already send it back without granting the layout lock, - * inode is up-to-date so glimpse is useless. 
- * Also to glimpse we need the layout, in case of a running - * restore the MDT holds the layout lock so the glimpse will - * block up to the end of restore (getattr will block) - */ - if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING)) - rc = ll_glimpse_size(inode); - } - RETURN(rc); -} - static inline dev_t ll_compat_encode_dev(dev_t dev) { /* The compat_sys_*stat*() syscalls will fail unless the @@ -3818,24 +4494,49 @@ static inline dev_t ll_compat_encode_dev(dev_t dev) #ifdef HAVE_INODEOPS_ENHANCED_GETATTR int ll_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) - { struct dentry *de = path->dentry; #else int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) { #endif - struct inode *inode = de->d_inode; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - int res = 0; + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); - res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LOOKUP); - ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + rc = ll_inode_revalidate(de, IT_GETATTR); + if (rc < 0) + RETURN(rc); + + if (S_ISREG(inode->i_mode)) { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) { + rc = ll_glimpse_size(inode); + if (rc < 0) + RETURN(rc); + } + } else { + /* If object isn't regular a file then don't validate size. */ + if (S_ISDIR(inode->i_mode) && + lli->lli_lsm_md != NULL) { + rc = ll_merge_md_attr(inode); + if (rc < 0) + RETURN(rc); + } - if (res) - return res; + inode->i_atime.tv_sec = lli->lli_atime; + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + } OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); @@ -3925,28 +4626,28 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) const char *name = NULL; char *value = NULL; size_t value_size = 0; - int rc; + int rc = 0; ENTRY; switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { + if (acl) rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (rc) - GOTO(out, rc); - } - break; + case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) - GOTO(out, rc = acl ? -EACCES : 0); - + rc = acl ? -EACCES : 0; break; + default: - GOTO(out, rc = -EINVAL); + rc = -EINVAL; + break; } + if (rc) + return rc; if (acl) { value_size = posix_acl_xattr_size(acl->a_count); @@ -3961,16 +4662,16 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, - name, value, value_size, 0, 0, 0, &req); + name, value, value_size, 0, 0, &req); ptlrpc_req_finished(req); out_value: kfree(value); out: - if (!rc) - set_cached_acl(inode, type, acl); - else + if (rc) forget_cached_acl(inode, type); + else + set_cached_acl(inode, type, acl); RETURN(rc); } #endif /* CONFIG_FS_POSIX_ACL */ @@ -4039,8 +4740,7 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) * need to do it before permission check. 
*/ if (inode == inode->i_sb->s_root->d_inode) { - rc = __ll_inode_revalidate(inode->i_sb->s_root, - MDS_INODELOCK_LOOKUP); + rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); if (rc) RETURN(rc); } @@ -4242,7 +4942,6 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req; - struct mdt_body *body; void *lvbdata; void *lmm; int lmmsize; @@ -4262,18 +4961,20 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) * layout here. Please note that we can't use the LVB buffer in * completion AST because it doesn't have a large enough buffer */ rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc == 0) - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, - lmmsize, 0, &req); if (rc < 0) RETURN(rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EPROTO); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR, + XATTR_NAME_LOV, lmmsize, &req); + if (rc < 0) { + if (rc == -ENODATA) + GOTO(out, rc = 0); /* empty layout */ + else + RETURN(rc); + } - lmmsize = body->mbo_eadatasize; + lmmsize = rc; + rc = 0; if (lmmsize == 0) /* empty layout */ GOTO(out, rc = 0); @@ -4504,19 +5205,20 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen) * Issue layout intent RPC indicating where in a file an IO is about to write. * * \param[in] inode file inode. - * \param[in] start start offset of fille in bytes where an IO is about to - * write. - * \param[in] end exclusive end offset in bytes of the write range. + * \param[in] ext write range with start offset of fille in bytes where + * an IO is about to write, and exclusive end offset in + * bytes. * * \retval 0 on success * \retval < 0 error code */ -int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end) +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext) { struct layout_intent intent = { - .li_opc = LAYOUT_INTENT_WRITE, - .li_start = start, - .li_end = end, + .li_opc = opc, + .li_extent.e_start = ext->e_start, + .li_extent.e_end = ext->e_end, }; int rc; ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c index d34be28747bdd..ddbaa142514de 100644 --- a/drivers/staging/lustrefsx/lustre/llite/glimpse.c +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -23,14 +23,13 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * glimpse code shared between vvp and liblustre (and other Lustre clients in - * the future). + * glimpse code used by vvp (and other Lustre clients in the future). * * Author: Nikita Danilov * Author: Oleg Drokin @@ -92,7 +91,7 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); /* NOTE: this looks like DLM lock request, but it may - * not be one. Due to CEF_ASYNC flag (translated + * not be one. Due to CEF_GLIMPSE flag (translated * to LDLM_FL_HAS_INTENT by osc), this is * glimpse request, that won't revoke any * conflicting DLM locks held. 
Instead, @@ -107,14 +106,10 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, *descr = whole_file; descr->cld_obj = clob; descr->cld_mode = CLM_READ; - descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; + descr->cld_enq_flags = CEF_GLIMPSE | CEF_MUST; if (agl) - descr->cld_enq_flags |= CEF_AGL; + descr->cld_enq_flags |= CEF_SPECULATIVE | CEF_NONBLOCK; /* - * CEF_ASYNC is used because glimpse sub-locks cannot - * deadlock (because they never conflict with other - * locks) and, hence, can be enqueued out-of-order. - * * CEF_MUST protects glimpse lock from conversion into * a lockless mode. */ @@ -140,7 +135,20 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, RETURN(result); } -static int cl_io_get(struct inode *inode, struct lu_env **envout, +/** + * Get an IO environment for special operations such as glimpse locks and + * manually requested locks (ladvise lockahead) + * + * \param[in] inode inode the operation is being performed on + * \param[out] envout thread specific execution environment + * \param[out] ioout client io description + * \param[out] refcheck reference check + * + * \retval 1 on success + * \retval 0 not a regular file, cannot get environment + * \retval negative negative errno on error + */ +int cl_io_get(struct inode *inode, struct lu_env **envout, struct cl_io **ioout, __u16 *refcheck) { struct lu_env *env; @@ -178,31 +186,37 @@ int cl_glimpse_size0(struct inode *inode, int agl) */ struct lu_env *env = NULL; struct cl_io *io = NULL; - __u16 refcheck; - int result; - - ENTRY; - - result = cl_io_get(inode, &env, &io, &refcheck); - if (result > 0) { - again: - io->ci_verify_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result > 0) - /* - * nothing to do for this io. This currently happens - * when stripe sub-object's are not yet created. - */ - result = io->ci_result; - else if (result == 0) - result = cl_glimpse_lock(env, io, inode, io->ci_obj, - agl); + __u16 refcheck; + int retried = 0; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + do { + io->ci_ndelay_tried = retried++; + io->ci_ndelay = io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + } else if (result == 0) { + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + if (!agl && result == -EWOULDBLOCK) + io->ci_need_restart = 1; + } OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); - cl_io_fini(env, io); - if (unlikely(io->ci_need_restart)) - goto again; - cl_env_put(env, &refcheck); - } + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); RETURN(result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c index a5fe1978c66a2..21a10ec551e44 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -69,7 +68,7 @@ __u16 cl_inode_fini_refcheck; static DEFINE_MUTEX(cl_inode_fini_guard); int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags) + enum op_xvalid xvalid, unsigned int attr_flags) { struct lu_env *env; struct cl_io *io; @@ -91,10 +90,14 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; io->u.ci_setattr.sa_attr_flags = attr_flags; - io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_avalid = attr->ia_valid; + io->u.ci_setattr.sa_xvalid = xvalid; io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); again: + if (attr->ia_valid & ATTR_FILE) + ll_io_set_mirror(io, attr->ia_file); + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { struct vvp_io *vio = vvp_env_io(env); @@ -213,12 +216,12 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) if (unlikely(atomic_read(&header->loh_ref) != 1)) { struct lu_site *site = obj->co_lu.lo_dev->ld_site; - struct lu_site_bkt_data *bkt; + wait_queue_head_t *wq; - bkt = lu_site_bkt_from_fid(site, &header->loh_fid); + wq = lu_site_wq_from_fid(site, &header->loh_fid); init_waitqueue_entry(&waiter, current); - add_wait_queue(&bkt->lsb_marche_funebre, &waiter); + add_wait_queue(wq, &waiter); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -228,7 +231,7 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) } set_current_state(TASK_RUNNING); - remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); + remove_wait_queue(wq, &waiter); } cl_object_put(env, obj); diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c index ced348a36b42a..5869d949ff97b 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -23,14 +23,13 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * cl code shared between vvp and liblustre (and other Lustre clients in the - * future). + * cl code used by vvp (and other Lustre clients in the future). * */ #define DEBUG_SUBSYSTEM S_LLITE diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h index ce05c17a2231f..4f94f91131a51 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,7 +33,6 @@ #ifndef LLITE_INTERNAL_H #define LLITE_INTERNAL_H #include -#include #include /* for s2sbi */ #include #include @@ -45,8 +44,8 @@ #include #include #include - #include + #include "vvp_internal.h" #include "range_lock.h" @@ -136,8 +135,7 @@ struct ll_inode_info { /* update atime from MDS no matter if it's older than * local inode atime. */ - unsigned int lli_update_atime:1, - lli_inode_locked:1; + unsigned int lli_update_atime:1; /* Try to make the d::member and f::member are aligned. Before using * these members, make clear whether it is directory or not. */ @@ -167,6 +165,8 @@ struct ll_inode_info { unsigned int lli_sa_enabled:1; /* generation for statahead */ unsigned int lli_sa_generation; + /* rw lock protects lli_lsm_md */ + struct rw_semaphore lli_lsm_sem; /* directory stripe information */ struct lmv_stripe_md *lli_lsm_md; /* default directory stripe offset. This is extracted @@ -179,8 +179,8 @@ struct ll_inode_info { /* for non-directory */ struct { - struct mutex lli_size_mutex; - char *lli_symlink_name; + struct mutex lli_size_mutex; + char *lli_symlink_name; /* * struct rw_semaphore { * signed long count; // align d.d_def_acl @@ -188,23 +188,23 @@ struct ll_inode_info { * struct list_head wait_list; * } */ - struct rw_semaphore lli_trunc_sem; - struct range_lock_tree lli_write_tree; + struct rw_semaphore lli_trunc_sem; + struct range_lock_tree lli_write_tree; - struct rw_semaphore lli_glimpse_sem; - cfs_time_t lli_glimpse_time; - struct list_head lli_agl_list; - __u64 lli_agl_index; + struct rw_semaphore lli_glimpse_sem; + ktime_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; /* for writepage() only to communicate to fsync */ - int lli_async_rc; + int lli_async_rc; /* - * whenever a process try to read/write the file, the + * Whenever a process try to read/write the file, the * jobid of the process will be saved here, and it'll * be packed into the write PRC when flush later. * - * so the read/write statistics for jobid will not be + * So the read/write statistics for jobid will not be * accurate if the file is shared by different jobs. 
*/ char lli_jobid[LUSTRE_JOBID_SIZE]; @@ -261,6 +261,8 @@ enum ll_file_flags { LLIF_FILE_RESTORING = 1, /* Xattr cache is attached to the file */ LLIF_XATTR_CACHE = 2, + /* Project inherit */ + LLIF_PROJECT_INHERIT = 3, }; static inline void ll_file_set_flag(struct ll_inode_info *lli, @@ -295,12 +297,32 @@ int ll_xattr_cache_get(struct inode *inode, size_t size, __u64 valid); +static inline bool obd_connect_has_secctx(struct obd_connect_data *data) +{ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +#else + return false; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ +} + +static inline void obd_connect_set_secctx(struct obd_connect_data *data) +{ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ +} + int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, const char **secctx_name, void **secctx, __u32 *secctx_size); int ll_inode_init_security(struct dentry *dentry, struct inode *inode, struct inode *dir); +int ll_listsecurity(struct inode *inode, char *secctx_name, + size_t secctx_name_size); + /* * Locking to guarantee consistency of non-atomic updates to long long i_size, * consistency between file size and KMS. @@ -311,18 +333,19 @@ int ll_inode_init_security(struct dentry *dentry, struct inode *inode, void ll_inode_size_lock(struct inode *inode); void ll_inode_size_unlock(struct inode *inode); -// FIXME: replace the name of this with LL_I to conform to kernel stuff -// static inline struct ll_inode_info *LL_I(struct inode *inode) static inline struct ll_inode_info *ll_i2info(struct inode *inode) { - return container_of(inode, struct ll_inode_info, lli_vfs_inode); + return container_of(inode, struct ll_inode_info, lli_vfs_inode); } +/* default to use at least 16M for fast read if possible */ +#define RA_REMAIN_WINDOW_MIN MiB_TO_PAGES(16UL) + /* default to about 64M of readahead on a given system. */ -#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT)) +#define SBI_DEFAULT_READAHEAD_MAX MiB_TO_PAGES(64UL) /* default to read-ahead full files smaller than 2MB on the second read */ -#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT)) +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX MiB_TO_PAGES(2UL) enum ra_stat { RA_STAT_HIT = 0, @@ -431,7 +454,9 @@ enum stats_track_type { * suppress_pings */ #define LL_SBI_FAST_READ 0x400000 /* fast read support */ #define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ -#define LL_SBI_PIO 0x1000000 /* parallel IO support */ +/* LL_SBI_PIO 0x1000000 parallel IO support, introduced in + 2.10, abandoned */ +#define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ #define LL_SBI_FLAGS { \ "nolck", \ @@ -459,6 +484,7 @@ enum stats_track_type { "fast_read", \ "file_secctx", \ "pio", \ + "tiny_write", \ } /* This is embedded into llite super-blocks to keep track of connect @@ -477,20 +503,23 @@ struct lustre_client_ocd { struct ll_sb_info { /* this protects pglist and ra_info. 
It isn't safe to * grab from interrupt contexts */ - spinlock_t ll_lock; - spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ - spinlock_t ll_process_lock; /* ll_rw_process_info */ - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_md_exp; - struct obd_export *ll_dt_exp; - struct proc_dir_entry* ll_proc_root; - struct lu_fid ll_root_fid; /* root object fid */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct obd_device *ll_md_obd; + struct obd_device *ll_dt_obd; + struct dentry *ll_debugfs_entry; + struct lu_fid ll_root_fid; /* root object fid */ int ll_flags; unsigned int ll_umounting:1, ll_xattr_cache_enabled:1, ll_xattr_cache_set:1, /* already set to 0/1 */ - ll_client_common_fill_super_succeeded:1; + ll_client_common_fill_super_succeeded:1, + ll_checksum_set:1; struct lustre_client_ocd ll_lco; @@ -541,6 +570,12 @@ struct ll_sb_info { /* st_blksize returned by stat(2), when non-zero */ unsigned int ll_stat_blksize; + + /* maximum relative age of cached statfs results */ + unsigned int ll_statfs_max_age; + + struct kset ll_kset; /* sysfs object */ + struct completion ll_kobj_unregister; }; /* @@ -645,11 +680,19 @@ struct ll_file_data { * true: failure is known, not report again. * false: unknown failure, should report. */ bool fd_write_failed; + bool ll_lock_no_expand; rwlock_t fd_lock; /* protect lcc list */ struct list_head fd_lccs; /* list of ll_cl_context */ + /* Used by mirrored file to lead IOs to a specific mirror, usually + * for mirror resync. 0 means default. */ + __u32 fd_designated_mirror; + /* The layout version when resync starts. Resync I/O should carry this + * layout version for verification to OST objects */ + __u32 fd_layout_version; }; -extern struct proc_dir_entry *proc_lustre_fs_root; +void llite_tunables_unregister(void); +int llite_tunables_register(void); static inline struct inode *ll_info2i(struct ll_inode_info *lli) { @@ -687,6 +730,11 @@ static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) return !!(sbi->ll_flags & LL_SBI_FAST_READ); } +static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) +{ + return !!(sbi->ll_flags & LL_SBI_TINY_WRITE); +} + void ll_ras_enter(struct file *f); /* llite/lcommon_misc.c */ @@ -697,21 +745,9 @@ int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, void cl_put_grouplock(struct ll_grouplock *lg); /* llite/lproc_llite.c */ -#ifdef CONFIG_PROC_FS -int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, - struct super_block *sb); -int lprocfs_ll_register_obd(struct super_block *sb, const char *obdname); -void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi); +int ll_debugfs_register_super(struct super_block *sb, const char *name); +void ll_debugfs_unregister_super(struct super_block *sb); void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); -extern struct lprocfs_vars lprocfs_llite_obd_vars[]; -#else -static inline int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, - struct super_block *sb) {return 0; } -static inline int lprocfs_ll_register_obd(struct super_block *sb, - const char *obdname) {return 0; } -static inline void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) {} -static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} -#endif enum { LPROC_LL_DIRTY_HITS, @@ -753,6 +789,10 @@ enum { }; /* llite/dir.c */ +enum 
get_default_layout_type { + GET_DEFAULT_LAYOUT_ROOT = 1, +}; + struct ll_dir_chain { }; @@ -795,6 +835,8 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode); int ll_writepage(struct page *page, struct writeback_control *wbc); int ll_writepages(struct address_space *, struct writeback_control *wbc); int ll_readpage(struct file *file, struct page *page); +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file); void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); @@ -839,8 +881,25 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* CONFIG_FS_POSIX_ACL */ #endif -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen); + +static inline int ll_xflags_to_inode_flags(int xflags) +{ + return ((xflags & FS_XFLAG_SYNC) ? S_SYNC : 0) | + ((xflags & FS_XFLAG_NOATIME) ? S_NOATIME : 0) | + ((xflags & FS_XFLAG_APPEND) ? S_APPEND : 0) | + ((xflags & FS_XFLAG_IMMUTABLE) ? S_IMMUTABLE : 0); +} + +static inline int ll_inode_flags_to_xflags(int flags) +{ + return ((flags & S_SYNC) ? FS_XFLAG_SYNC : 0) | + ((flags & S_NOATIME) ? FS_XFLAG_NOATIME : 0) | + ((flags & S_APPEND) ? FS_XFLAG_APPEND : 0) | + ((flags & S_IMMUTABLE) ? FS_XFLAG_IMMUTABLE : 0); +} + +int ll_migrate(struct inode *parent, struct file *file, + struct lmv_user_md *lum, const char *name); int ll_get_fid_by_name(struct inode *parent, const char *name, int namelen, struct lu_fid *fid, struct inode **inode); #ifdef HAVE_GENERIC_PERMISSION_4ARGS @@ -852,6 +911,7 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd); int ll_inode_permission(struct inode *inode, int mask); # endif #endif +int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa); int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg); int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, @@ -865,9 +925,11 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe(struct inode *inode, void **lmmp, - int *lmm_size, struct ptlrpc_request **request, - u64 valid); +int ll_dir_getstripe_default(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + struct ptlrpc_request **root_request, u64 valid); +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid); #ifdef HAVE_FILE_FSYNC_4ARGS int ll_fsync(struct file *file, loff_t start, loff_t end, int data); #elif defined(HAVE_FILE_FSYNC_2ARGS) @@ -880,6 +942,7 @@ int ll_fid2path(struct inode *inode, void __user *arg); int ll_data_version(struct inode *inode, __u64 *data_version, int flags); int ll_hsm_release(struct inode *inode); int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); +void ll_io_set_mirror(struct cl_io *io, const struct file *file); /* llite/dcache.c */ @@ -902,12 +965,14 @@ void ll_kill_super(struct super_block *sb); struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); void ll_dir_clear_lsm_md(struct inode *inode); void ll_clear_inode(struct inode *inode); -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import); 
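The ll_xflags_to_inode_flags()/ll_inode_flags_to_xflags() helpers introduced earlier in this header are exact inverses over the four bits they translate; a minimal self-check sketch (illustrative only, not part of this patch) makes that property explicit:

/* Illustrative self-check for the xflags helpers above -- not part of
 * this change. */
static inline bool ll_xflags_roundtrip_ok(int xflags)
{
	int masked = xflags & (FS_XFLAG_SYNC | FS_XFLAG_NOATIME |
			       FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE);

	/* e.g. FS_XFLAG_APPEND|FS_XFLAG_IMMUTABLE <-> S_APPEND|S_IMMUTABLE */
	return ll_inode_flags_to_xflags(ll_xflags_to_inode_flags(masked)) ==
	       masked;
}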
int ll_setattr(struct dentry *de, struct iattr *attr); int ll_statfs(struct dentry *de, struct kstatfs *sfs); -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags); +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags); int ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_update_inode_flags(struct inode *inode, int ext_flags); int ll_read_inode2(struct inode *inode, void *opaque); void ll_delete_inode(struct inode *inode); int ll_iocontrol(struct inode *inode, struct file *file, @@ -927,7 +992,6 @@ int ll_obd_statfs(struct inode *inode, void __user *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); -int ll_process_config(struct lustre_cfg *lcfg); enum { LUSTRE_OPC_MKDIR = 0, @@ -937,6 +1001,7 @@ enum { LUSTRE_OPC_ANY = 5, }; +void ll_unlock_md_op_lsm(struct md_op_data *op_data); struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, const char *name, size_t namelen, @@ -949,6 +1014,8 @@ ssize_t ll_copy_user_md(const struct lov_user_md __user *md, struct lov_user_md **kbuf); void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req); + /* Compute expected user md size when passing in a md from user space */ static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) { @@ -1017,7 +1084,6 @@ struct ll_thread_info { struct iov_iter lti_iter; struct vvp_io_args lti_args; struct ra_io_arg lti_ria; - struct kiocb lti_kiocb; struct ll_cl_context lti_io_ctx; }; @@ -1232,11 +1298,18 @@ static inline int cl_glimpse_size(struct inode *inode) return cl_glimpse_size0(inode, 0); } +/* AGL is 'asychronous glimpse lock', which is a speculative lock taken as + * part of statahead */ static inline int cl_agl(struct inode *inode) { return cl_glimpse_size0(inode, 1); } +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise); + +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck); + static inline int ll_glimpse_size(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1244,7 +1317,7 @@ static inline int ll_glimpse_size(struct inode *inode) down_read(&lli->lli_glimpse_sem); rc = cl_glimpse_size(inode); - lli->lli_glimpse_time = cfs_time_current(); + lli->lli_glimpse_time = ktime_get(); up_read(&lli->lli_glimpse_sem); return rc; } @@ -1414,7 +1487,8 @@ static inline void d_lustre_revalidate(struct dentry *dentry) int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); int ll_layout_refresh(struct inode *inode, __u32 *gen); int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); -int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end); +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext); int ll_xattr_init(void); void ll_xattr_fini(void); @@ -1426,7 +1500,7 @@ int ll_getparent(struct file *file, struct getparent __user *arg); /* lcommon_cl.c */ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags); + enum op_xvalid xvalid, unsigned int attr_flags); extern struct lu_env *cl_inode_fini_env; extern __u16 cl_inode_fini_refcheck; diff --git 
a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 644b1c4e26d47..52bea6c96dc1a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,16 +46,18 @@ #ifdef HAVE_UIDGID_HEADER # include #endif +#include -#include +#include #ifdef HAVE_UAPI_LINUX_MOUNT_H #include #endif + #include #include #include #include -#include +#include #include #include #include @@ -86,6 +88,7 @@ static struct ll_sb_info *ll_init_sbi(void) spin_lock_init(&sbi->ll_pp_extent_lock); spin_lock_init(&sbi->ll_process_lock); sbi->ll_rw_stats_on = 0; + sbi->ll_statfs_max_age = OBD_STATFS_CACHE_SECONDS; si_meminfo(&si); pages = si.totalram - si.totalhigh; @@ -111,6 +114,9 @@ static struct ll_sb_info *ll_init_sbi(void) #ifdef ENABLE_CHECKSUM sbi->ll_flags |= LL_SBI_CHECKSUM; #endif +#ifdef ENABLE_FLOCK + sbi->ll_flags |= LL_SBI_FLOCK; +#endif #ifdef HAVE_LRU_RESIZE_SUPPORT sbi->ll_flags |= LL_SBI_LRU_RESIZE; @@ -133,6 +139,7 @@ static struct ll_sb_info *ll_init_sbi(void) atomic_set(&sbi->ll_agl_total, 0); sbi->ll_flags |= LL_SBI_AGL_ENABLED; sbi->ll_flags |= LL_SBI_FAST_READ; + sbi->ll_flags |= LL_SBI_TINY_WRITE; /* root squash */ sbi->ll_squash.rsi_uid = 0; @@ -160,30 +167,23 @@ static void ll_free_sbi(struct super_block *sb) EXIT; } -static inline int obd_connect_has_secctx(struct obd_connect_data *data) -{ - return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && - data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; -} - static int client_common_fill_super(struct super_block *sb, char *md, char *dt, struct vfsmount *mnt) { struct inode *root = NULL; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct obd_statfs *osfs = NULL; - struct ptlrpc_request *request = NULL; - struct obd_connect_data *data = NULL; - struct obd_uuid *uuid; - struct md_op_data *op_data; - struct lustre_md lmd; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; u64 valid; - int size, err, checksum; - ENTRY; + int size, err, checksum; - obd = class_name2obd(md); - if (!obd) { + ENTRY; + sbi->ll_md_obd = class_name2obd(md); + if (!sbi->ll_md_obd) { CERROR("MD %s: not setup or attached\n", md); RETURN(-EINVAL); } @@ -198,13 +198,18 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, RETURN(-ENOMEM); } - /* indicate the features supported by this client */ - data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | - OBD_CONNECT_ATTRFID | - OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate MDT features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | + OBD_CONNECT_VERSION | 
OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | OBD_CONNECT_EINPROGRESS | @@ -215,11 +220,20 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | OBD_CONNECT_OPEN_BY_FID | OBD_CONNECT_DIR_STRIPE | - OBD_CONNECT_BULK_MBITS | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | OBD_CONNECT_SUBTREE | - OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS; - - data->ocd_connect_flags2 = 0; + OBD_CONNECT_MULTIMODRPCS | + OBD_CONNECT_GRANT_PARAM | + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2; + + data->ocd_connect_flags2 = OBD_CONNECT2_FLR | + OBD_CONNECT2_LOCK_CONVERT | + OBD_CONNECT2_DIR_MIGRATE | + OBD_CONNECT2_SUM_STATFS | + OBD_CONNECT2_ARCHIVE_ID_ARRAY | + OBD_CONNECT2_LSOM | + OBD_CONNECT2_ASYNC_DISCARD | + OBD_CONNECT2_GETATTR_PFID; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) @@ -230,6 +244,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_LARGE_ACL; #endif + data->ocd_cksum_types = obd_cksum_types_supported_client(); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) /* flag mdc connection as lightweight, only used for test * purpose, use with care */ @@ -261,13 +277,16 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (sbi->ll_flags & LL_SBI_ALWAYS_PING) data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; -#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) - data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; -#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ + obd_connect_set_secctx(data); + +#if defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_SELINUX_POLICY; +#endif data->ocd_brw_size = MD_MAX_BRW_SIZE; - err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); + err = obd_connect(NULL, &sbi->ll_md_exp, sbi->ll_md_obd, + &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " "recovery, of which this client is not a " @@ -293,7 +312,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, * can make sure the client can be mounted as long as MDT0 is * avaible */ err = obd_statfs(NULL, sbi->ll_md_exp, osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + ktime_get_seconds() - sbi->ll_statfs_max_age, OBD_STATFS_FOR_MDT0); if (err) GOTO(out_md_fid, err); @@ -380,8 +399,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } } - obd = class_name2obd(dt); - if (!obd) { + sbi->ll_dt_obd = class_name2obd(dt); + if (!sbi->ll_dt_obd) { CERROR("DT %s: not setup or attached\n", dt); GOTO(out_md_fid, err = -ENODEV); } @@ -390,6 +409,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, * back its backend blocksize for grant calculation purpose */ data->ocd_grant_blkbits = PAGE_SHIFT; + /* indicate OST features supported by this client */ data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | @@ -401,23 +421,41 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | - 
OBD_CONNECT_BULK_MBITS; + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO | + OBD_CONNECT_FLAGS2 | OBD_CONNECT_GRANT_SHRINK; + +/* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it + * can interoperate with an older version of lockahead which was released prior + * to landing in master. This support will be dropped when 2.13 development + * starts. At the point, we should not just drop the connect flag (below), we + * should also remove the support in the code. + * + * Removing it means a few things: + * 1. Remove this section here + * 2. Remove CEF_NONBLOCK in ll_file_lockahead() + * 3. Remove function exp_connect_lockahead_old + * 4. Remove LDLM_FL_LOCKAHEAD_OLD_RESERVED in lustre_dlm_flags.h + * */ +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_LOCKAHEAD_OLD; +#endif - data->ocd_connect_flags2 = 0; + data->ocd_connect_flags2 = OBD_CONNECT2_LOCKAHEAD; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; /* OBD_CONNECT_CKSUM should always be set, even if checksums are * disabled by default, because it can still be enabled on the - * fly via /proc. As a consequence, we still need to come to an - * agreement on the supported algorithms at connect time */ + * fly via /sys. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time + */ data->ocd_connect_flags |= OBD_CONNECT_CKSUM; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) data->ocd_cksum_types = OBD_CKSUM_ADLER; else - data->ocd_cksum_types = cksum_types_supported_client(); + data->ocd_cksum_types = obd_cksum_types_supported_client(); #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; @@ -430,13 +468,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, "ocd_grant: %d\n", data->ocd_connect_flags, data->ocd_version, data->ocd_grant); - obd->obd_upcall.onu_owner = &sbi->ll_lco; - obd->obd_upcall.onu_upcall = cl_ocd_update; + sbi->ll_dt_obd->obd_upcall.onu_owner = &sbi->ll_lco; + sbi->ll_dt_obd->obd_upcall.onu_upcall = cl_ocd_update; data->ocd_brw_size = DT_MAX_BRW_SIZE; - err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, - NULL); + err = obd_connect(NULL, &sbi->ll_dt_exp, sbi->ll_dt_obd, + &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " "recovery, of which this client is not a " @@ -452,10 +490,15 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sbi->ll_dt_exp->exp_connect_data = *data; /* Don't change value if it was specified in the config log */ - if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) { sbi->ll_ra_info.ra_max_read_ahead_whole_pages = max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX, (data->ocd_brw_size >> PAGE_SHIFT)); + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages > + sbi->ll_ra_info.ra_max_pages_per_file) + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + sbi->ll_ra_info.ra_max_pages_per_file; + } err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, LUSTRE_SEQ_METADATA); @@ -546,13 +589,15 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } checksum = sbi->ll_flags & LL_SBI_CHECKSUM; - err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(checksum), &checksum, - NULL); - if (err) { - CERROR("%s: Set checksum failed: rc = 
%d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - GOTO(out_root, err); + if (sbi->ll_checksum_set) { + err = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CHECKSUM), KEY_CHECKSUM, + sizeof(checksum), &checksum, NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } } cl_sb_init(sb); @@ -591,14 +636,21 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (osfs != NULL) OBD_FREE_PTR(osfs); - if (sbi->ll_proc_root != NULL) { - err = lprocfs_ll_register_obd(sb, dt); + if (sbi->ll_dt_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_dt_obd->obd_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); if (err < 0) { CERROR("%s: could not register %s in llite: rc = %d\n", dt, ll_get_fsname(sb, NULL, 0), err); err = 0; } - err = lprocfs_ll_register_obd(sb, md); + } + + if (sbi->ll_md_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_md_obd->obd_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); if (err < 0) { CERROR("%s: could not register %s in llite: rc = %d\n", md, ll_get_fsname(sb, NULL, 0), err); @@ -615,11 +667,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, out_dt: obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; + sbi->ll_dt_obd = NULL; out_md_fid: obd_fid_fini(sbi->ll_md_exp->exp_obd); out_md: obd_disconnect(sbi->ll_md_exp); sbi->ll_md_exp = NULL; + sbi->ll_md_obd = NULL; out: if (data != NULL) OBD_FREE_PTR(data); @@ -711,7 +765,7 @@ static void client_common_put_super(struct super_block *sb) obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; - lprocfs_ll_unregister_mountpoint(sbi); + ll_debugfs_unregister_super(sb); obd_fid_fini(sbi->ll_md_exp->exp_obd); obd_disconnect(sbi->ll_md_exp); @@ -749,56 +803,57 @@ void ll_kill_super(struct super_block *sb) static inline int ll_set_opt(const char *opt, char *data, int fl) { - if (strncmp(opt, data, strlen(opt)) != 0) - return(0); - else - return(fl); + if (strncmp(opt, data, strlen(opt)) != 0) + return 0; + else + return fl; } /* non-client-specific mount options are parsed in lmd_parse */ -static int ll_options(char *options, int *flags) +static int ll_options(char *options, struct ll_sb_info *sbi) { - int tmp; - char *s1 = options, *s2; - ENTRY; + int tmp; + char *s1 = options, *s2; + int *flags = &sbi->ll_flags; + ENTRY; - if (!options) - RETURN(0); + if (!options) + RETURN(0); - CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); - while (*s1) { - CDEBUG(D_SUPER, "next opt=%s\n", s1); - tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags &= ~tmp; - goto next; - } + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags = (*flags & ~LL_SBI_LOCALFLOCK) | tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, 
LL_SBI_LOCALFLOCK); + if (tmp) { + *flags = (*flags & ~LL_SBI_FLOCK) | tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } tmp = ll_set_opt("context", s1, 1); if (tmp) goto next; @@ -822,16 +877,18 @@ static int ll_options(char *options, int *flags) goto next; } - tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags &= ~tmp; - goto next; - } + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + sbi->ll_checksum_set = 1; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + sbi->ll_checksum_set = 1; + goto next; + } tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); if (tmp) { *flags |= tmp; @@ -918,21 +975,24 @@ void ll_lli_init(struct ll_inode_info *lli) lli->lli_opendir_pid = 0; lli->lli_sa_enabled = 0; lli->lli_def_stripe_offset = -1; + init_rwsem(&lli->lli_lsm_sem); } else { mutex_init(&lli->lli_size_mutex); lli->lli_symlink_name = NULL; init_rwsem(&lli->lli_trunc_sem); range_lock_tree_init(&lli->lli_write_tree); init_rwsem(&lli->lli_glimpse_sem); - lli->lli_glimpse_time = 0; + lli->lli_glimpse_time = ktime_set(0, 0); INIT_LIST_HEAD(&lli->lli_agl_list); lli->lli_agl_index = 0; lli->lli_async_rc = 0; } mutex_init(&lli->lli_layout_mutex); - memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE); + memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); } +#define MAX_STRING_SIZE 128 + #ifndef HAVE_SUPER_SETUP_BDI_NAME #define LSI_BDI_INITIALIZED 0x00400000 @@ -941,8 +1001,6 @@ void ll_lli_init(struct ll_inode_info *lli) # define BDI_CAP_MAP_COPY 0 #endif -#define MAX_STRING_SIZE 128 - static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
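/*
 * Editor's note -- illustrative only, not part of the patch. With the
 * ll_options() change above, "flock" and "localflock" are now mutually
 * exclusive: each option clears the other's LL_SBI_* bit before setting its
 * own, so the last one given on the mount command line wins, while "noflock"
 * clears both. For example (MGS node, fsname and mount point are
 * hypothetical):
 *
 *	mount -t lustre -o localflock,flock <mgsnode>:/<fsname> /mnt/<fsname>
 *
 * ends up with LL_SBI_FLOCK set and LL_SBI_LOCALFLOCK clear.
 */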
{ struct lustre_sb_info *lsi = s2lsi(sb); @@ -973,68 +1031,79 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) { struct lustre_profile *lprof = NULL; struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi; + struct ll_sb_info *sbi = NULL; char *dt = NULL, *md = NULL; char *profilenm = get_profile_name(sb); struct config_llog_instance *cfg; /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ - const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; - int md_len = 0; - int dt_len = 0; - int err; + const int instlen = 16 + 2; + unsigned long cfg_instance = ll_get_cfg_instance(sb); + char name[MAX_STRING_SIZE]; + int md_len = 0; + int dt_len = 0; + char *ptr; + int len; + int err; + ENTRY; + /* for ASLR, to map between cfg_instance and hashed ptr */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); - CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + try_module_get(THIS_MODULE); OBD_ALLOC_PTR(cfg); if (cfg == NULL) - RETURN(-ENOMEM); - - try_module_get(THIS_MODULE); + GOTO(out_free_cfg, err = -ENOMEM); /* client additional sb info */ lsi->lsi_llsbi = sbi = ll_init_sbi(); - if (!sbi) { - module_put(THIS_MODULE); - OBD_FREE_PTR(cfg); - RETURN(-ENOMEM); - } + if (!sbi) + GOTO(out_free_cfg, err = -ENOMEM); - err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); + err = ll_options(lsi->lsi_lmd->lmd_opts, sbi); if (err) - GOTO(out_free, err); + GOTO(out_free_cfg, err); - err = super_setup_bdi_name(sb, "lustre-%p", sb); + err = super_setup_bdi_name(sb, "lustre-%016lx", cfg_instance); if (err) - GOTO(out_free, err); + GOTO(out_free_cfg, err); #ifndef HAVE_DCACHE_LOCK /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ sb->s_d_op = &ll_d_ops; #endif + /* Get fsname */ + len = strlen(profilenm); + ptr = strrchr(profilenm, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; - /* Call lprocfs_ll_register_mountpoint() before lustre_process_log() - * so that "llite.*.*" params can be processed correctly. */ - if (proc_lustre_fs_root != NULL) { - err = lprocfs_ll_register_mountpoint(proc_lustre_fs_root, sb); - if (err < 0) { - CERROR("%s: could not register mountpoint in llite: " - "rc = %d\n", ll_get_fsname(sb, NULL, 0), err); - err = 0; - } + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%016lx", len, + profilenm, cfg_instance); + + /* Call ll_debugfs_register_super() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. + */ + err = ll_debugfs_register_super(sb, name); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: rc = %d\n", + ll_get_fsname(sb, NULL, 0), err); + err = 0; } - /* Generate a string unique to this super, in case some joker tries - to mount the same fs at two mount points. - Use the address of the super itself.*/ - cfg->cfg_instance = sb; + /* The cfg_instance is a value unique to this super, in case some + * joker tries to mount the same fs at two mount points. 
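/*
 * Editor's note -- worked example for the name construction above (all
 * values are hypothetical). For profile name "testfs-client" the trailing
 * "-client" is stripped (len -= 7), so the per-mount name passed to
 * ll_debugfs_register_super() is built as "%.*s-%016lx" from the fsname and
 * cfg_instance, e.g.:
 *
 *	"testfs-ffff91c4a8e35800"
 *
 * replacing the removed "%p"-based names that embedded the raw super_block
 * pointer.
 */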
+ */ + cfg->cfg_instance = cfg_instance; cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; cfg->cfg_callback = class_config_llog_handler; cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; /* set up client obds */ err = lustre_process_log(sb, profilenm, cfg); if (err < 0) - GOTO(out_proc, err); + GOTO(out_debugfs, err); /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ lprof = class_get_profile(profilenm); @@ -1042,7 +1111,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" " read from the MGS. Does that filesystem " "exist?\n", profilenm); - GOTO(out_proc, err = -EINVAL); + GOTO(out_debugfs, err = -EINVAL); } CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, lprof->lp_md, lprof->lp_dt); @@ -1050,58 +1119,68 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) dt_len = strlen(lprof->lp_dt) + instlen + 2; OBD_ALLOC(dt, dt_len); if (!dt) - GOTO(out_proc, err = -ENOMEM); - snprintf(dt, dt_len - 1, "%s-%p", lprof->lp_dt, cfg->cfg_instance); + GOTO(out_profile, err = -ENOMEM); + snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); md_len = strlen(lprof->lp_md) + instlen + 2; OBD_ALLOC(md, md_len); if (!md) - GOTO(out_proc, err = -ENOMEM); - snprintf(md, md_len - 1, "%s-%p", lprof->lp_md, cfg->cfg_instance); + GOTO(out_free_dt, err = -ENOMEM); + snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); /* connections, registrations, sb setup */ err = client_common_fill_super(sb, md, dt, mnt); if (err < 0) - GOTO(out_proc, err); + GOTO(out_free_md, err); sbi->ll_client_common_fill_super_succeeded = 1; -out_proc: - if (err < 0) - lprocfs_ll_unregister_mountpoint(sbi); -out_free: +out_free_md: if (md) OBD_FREE(md, md_len); +out_free_dt: if (dt) OBD_FREE(dt, dt_len); - if (lprof != NULL) +out_profile: + if (lprof) class_put_profile(lprof); +out_debugfs: + if (err < 0) + ll_debugfs_unregister_super(sb); +out_free_cfg: + if (cfg) + OBD_FREE_PTR(cfg); + if (err) ll_put_super(sb); else if (sbi->ll_flags & LL_SBI_VERBOSE) LCONSOLE_WARN("Mounted %s\n", profilenm); - - OBD_FREE_PTR(cfg); RETURN(err); } /* ll_fill_super */ void ll_put_super(struct super_block *sb) { struct config_llog_instance cfg, params_cfg; - struct obd_device *obd; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + unsigned long cfg_instance = ll_get_cfg_instance(sb); long ccc_count; int next, force = 1, rc = 0; - ENTRY; + ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); + if (!sbi) + GOTO(out_no_sbi, 0); - cfg.cfg_instance = sb; - lustre_end_log(sb, profilenm, &cfg); + /* Should replace instance_id with something better for ASLR */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); - params_cfg.cfg_instance = sb; + cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = cfg_instance; lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); if (sbi->ll_md_exp) { @@ -1122,7 +1201,6 @@ void ll_put_super(struct super_block *sb) if (force == 0 && rc != -EINTR) LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); - /* We need to set force before the lov_disconnect in lustre_common_put_super, since l_d cleans up osc's as well. 
*/ if (force) { @@ -1158,7 +1236,7 @@ void ll_put_super(struct super_block *sb) ll_free_sbi(sb); lsi->lsi_llsbi = NULL; - +out_no_sbi: lustre_common_put_super(sb); cl_env_cache_purge(~0); @@ -1262,108 +1340,124 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) { struct lu_fid *fid; struct lmv_stripe_md *lsm = md->lmv; + struct ll_inode_info *lli = ll_i2info(inode); int i; LASSERT(lsm != NULL); + + CDEBUG(D_INODE, "%s: "DFID" set dir layout:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_INODE, lsm); + /* XXX sigh, this lsm_root initialization should be in * LMV layer, but it needs ll_iget right now, so we * put this here right now. */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { fid = &lsm->lsm_md_oinfo[i].lmo_fid; LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL); + + if (!fid_is_sane(fid)) + continue; + /* Unfortunately ll_iget will call ll_update_inode, * where the initialization of slave inode is slightly * different, so it reset lsm_md to NULL to avoid * initializing lsm for slave inode. */ - /* For migrating inode, master stripe and master object will - * be same, so we only need assign this inode */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && i == 0) - lsm->lsm_md_oinfo[i].lmo_root = inode; - else - lsm->lsm_md_oinfo[i].lmo_root = + lsm->lsm_md_oinfo[i].lmo_root = ll_iget_anon_dir(inode->i_sb, fid, md); - if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); lsm->lsm_md_oinfo[i].lmo_root = NULL; + while (i-- > 0) { + iput(lsm->lsm_md_oinfo[i].lmo_root); + lsm->lsm_md_oinfo[i].lmo_root = NULL; + } return rc; } } - return 0; -} + lli->lli_lsm_md = lsm; -static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, - const struct lmv_stripe_md *lsm_md2) -{ - return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && - lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && - lsm_md1->lsm_md_master_mdt_index == - lsm_md2->lsm_md_master_mdt_index && - lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && - lsm_md1->lsm_md_layout_version == - lsm_md2->lsm_md_layout_version && - strcmp(lsm_md1->lsm_md_pool_name, - lsm_md2->lsm_md_pool_name) == 0; + return 0; } static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); struct lmv_stripe_md *lsm = md->lmv; - int rc; + struct cl_attr *attr; + int rc = 0; + ENTRY; LASSERT(S_ISDIR(inode->i_mode)); CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, PFID(ll_inode2fid(inode))); - /* no striped information from request. */ - if (lsm == NULL) { - if (lli->lli_lsm_md == NULL) { - RETURN(0); - } else if (lli->lli_lsm_md->lsm_md_hash_type & - LMV_HASH_FLAG_MIGRATION) { - /* migration is done, the temporay MIGRATE layout has - * been removed */ - CDEBUG(D_INODE, DFID" finish migration.\n", - PFID(ll_inode2fid(inode))); - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - RETURN(0); - } else { - /* The lustre_md from req does not include stripeEA, - * see ll_md_setattr */ - RETURN(0); - } - } - - /* set the directory layout */ - if (lli->lli_lsm_md == NULL) { - struct cl_attr *attr; + /* + * no striped information from request, lustre_md from req does not + * include stripeEA, see ll_md_setattr() + */ + if (!lsm) + RETURN(0); - rc = ll_init_lsm_md(inode, md); - if (rc != 0) - RETURN(rc); + /* + * normally dir layout doesn't change, only take read lock to check + * that to avoid blocking other MD operations. 
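/*
 * Editor's note -- simplified sketch of the ll_update_lsm_md() locking that
 * follows (illustrative, not part of the patch): the common "layout
 * unchanged" case is handled entirely under the read lock, and lli_lsm_sem
 * is only taken for write when the cached striping actually has to be
 * replaced:
 *
 *	down_read(&lli->lli_lsm_sem);
 *	if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) {
 *		up_read(&lli->lli_lsm_sem);	// fast path: nothing to do
 *		return 0;
 *	}
 *	up_read(&lli->lli_lsm_sem);
 *	down_write(&lli->lli_lsm_sem);	// slow path: replace the layout
 *	lmv_free_memmd(lli->lli_lsm_md);
 *	rc = ll_init_lsm_md(inode, md);
 *	up_write(&lli->lli_lsm_sem);
 */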
+ */ + down_read(&lli->lli_lsm_sem); - /* set md->lmv to NULL, so the following free lustre_md - * will not free this lsm */ - md->lmv = NULL; - lli->lli_lsm_md = lsm; + /* some concurrent lookup initialized lsm, and unchanged */ + if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) + GOTO(unlock, rc = 0); - OBD_ALLOC_PTR(attr); - if (attr == NULL) - RETURN(-ENOMEM); + /* if dir layout doesn't match, check whether version is increased, + * which means layout is changed, this happens in dir split/merge and + * lfsck. + */ + if (lli->lli_lsm_md && + lsm->lsm_md_layout_version <= + lli->lli_lsm_md->lsm_md_layout_version) { + CERROR("%s: "DFID" dir layout mismatch:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_ERROR, lli->lli_lsm_md); + lsm_md_dump(D_ERROR, lsm); + GOTO(unlock, rc = -EINVAL); + } + + up_read(&lli->lli_lsm_sem); + down_write(&lli->lli_lsm_sem); + /* clear existing lsm */ + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } - /* validate the lsm */ - rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, - ll_md_blocking_ast); - if (rc != 0) { - OBD_FREE_PTR(attr); - RETURN(rc); - } + rc = ll_init_lsm_md(inode, md); + up_write(&lli->lli_lsm_sem); + if (rc) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md will not free + * this lsm. + */ + md->lmv = NULL; + /* md_merge_attr() may take long, since lsm is already set, switch to + * read lock. + */ + down_read(&lli->lli_lsm_sem); + OBD_ALLOC_PTR(attr); + if (!attr) + GOTO(unlock, rc = -ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lli->lli_lsm_md, attr, + ll_md_blocking_ast); + if (!rc) { if (md->body->mbo_valid & OBD_MD_FLNLINK) md->body->mbo_nlink = attr->cat_nlink; if (md->body->mbo_valid & OBD_MD_FLSIZE) @@ -1374,51 +1468,14 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) md->body->mbo_ctime = attr->cat_ctime; if (md->body->mbo_valid & OBD_MD_FLMTIME) md->body->mbo_mtime = attr->cat_mtime; - - OBD_FREE_PTR(attr); - - CDEBUG(D_INODE, "Set lsm %p magic %x to "DFID"\n", lsm, - lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); - RETURN(0); } - /* Compare the old and new stripe information */ - if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { - struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; - int idx; - - CERROR("%s: inode "DFID"(%p)'s lmv layout mismatch (%p)/(%p)" - "magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d" - "hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), - inode, lsm, old_lsm, - lsm->lsm_md_magic, old_lsm->lsm_md_magic, - lsm->lsm_md_stripe_count, - old_lsm->lsm_md_stripe_count, - lsm->lsm_md_master_mdt_index, - old_lsm->lsm_md_master_mdt_index, - lsm->lsm_md_hash_type, old_lsm->lsm_md_hash_type, - lsm->lsm_md_layout_version, - old_lsm->lsm_md_layout_version, - lsm->lsm_md_pool_name, - old_lsm->lsm_md_pool_name); - - for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in old lsm idx %d, old: "DFID"\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in new lsm idx %d, new: "DFID"\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - RETURN(-EIO); - } + OBD_FREE_PTR(attr); + GOTO(unlock, rc); +unlock: + up_read(&lli->lli_lsm_sem); - RETURN(0); + return rc; } void ll_clear_inode(struct inode *inode) @@ -1554,7 
+1611,8 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) * * In case of HSMimport, we only set attr on MDS. */ -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import) { struct inode *inode = dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); @@ -1594,12 +1652,12 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) RETURN(-EPERM); } - /* We mark all of the fields "set" so MDS/OST does not re-set them */ - if (!(attr->ia_valid & ATTR_CTIME_SET) && - (attr->ia_valid & ATTR_CTIME)) { + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(xvalid & OP_XVALID_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { attr->ia_ctime = current_time(inode); - attr->ia_valid |= ATTR_CTIME_SET; - } + xvalid |= OP_XVALID_CTIME_SET; + } if (!(attr->ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) { attr->ia_atime = current_time(inode); @@ -1631,13 +1689,22 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) if (!hsm_import && attr->ia_valid & ATTR_SIZE) { /* If we are changing file size, file content is - * modified, flag it. */ - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + * modified, flag it. + */ + xvalid |= OP_XVALID_OWNEROVERRIDE; op_data->op_bias |= MDS_DATA_MODIFIED; ll_file_clear_flag(lli, LLIF_DATA_MODIFIED); } + if (attr->ia_valid & ATTR_FILE) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(attr->ia_file); + + if (fd->fd_lease_och) + op_data->op_bias |= MDS_TRUNC_KEEP_LEASE; + } + op_data->op_attr = *attr; + op_data->op_xvalid = xvalid; rc = ll_md_setattr(dentry, op_data); if (rc) @@ -1646,17 +1713,17 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) if (!S_ISREG(inode->i_mode) || hsm_import) GOTO(out, rc = 0); - if (attr->ia_valid & (ATTR_SIZE | - ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET)) { + if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) || + xvalid & OP_XVALID_CTIME_SET) { /* For truncate and utimes sending attributes to OSTs, setting * mtime/atime to the past will be performed under PW [0:EOF] * extent lock (new_size:EOF for truncate). It may seem * excessive to send mtime/atime updates to OSTs when not * setting times to past, but it is necessary due to possible - * time de-synchronization between MDT inode and OST objects */ - rc = cl_setattr_ost(lli->lli_clob, attr, 0); + * time de-synchronization between MDT inode and OST objects + */ + rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0); } /* If the file was restored, it needs to set dirty flag. 
@@ -1716,10 +1783,11 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) int ll_setattr(struct dentry *de, struct iattr *attr) { int mode = de->d_inode->i_mode; + enum op_xvalid xvalid = 0; if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + xvalid |= OP_XVALID_OWNEROVERRIDE; if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == (ATTR_SIZE|ATTR_MODE)) && @@ -1740,61 +1808,60 @@ int ll_setattr(struct dentry *de, struct iattr *attr) !(attr->ia_valid & ATTR_KILL_SGID)) attr->ia_valid |= ATTR_KILL_SGID; - /* avoid polluted from ATTR_TIMES_SET, - * projid is not expected to be set here */ - attr->ia_valid &= ~MDS_ATTR_PROJID; - - return ll_setattr_raw(de, attr, false); + return ll_setattr_raw(de, attr, xvalid, false); } -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags) { - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_statfs obd_osfs; - int rc; - ENTRY; + struct obd_statfs obd_osfs = { 0 }; + time64_t max_age; + int rc; - rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); - if (rc) { - CERROR("md_statfs fails: rc = %d\n", rc); - RETURN(rc); - } + ENTRY; + max_age = ktime_get_seconds() - sbi->ll_statfs_max_age; - osfs->os_type = sb->s_magic; + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) + RETURN(rc); + + osfs->os_type = LL_SUPER_MAGIC; CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", - osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files); - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - flags |= OBD_STATFS_NODELAY; + if (osfs->os_state & OS_STATE_SUM) + GOTO(out, rc); - rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); - if (rc) { - CERROR("obd_statfs fails: rc = %d\n", rc); - RETURN(rc); - } + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs(NULL, sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) /* Possibly a filesystem with no OSTs. Report MDT totals. */ + GOTO(out, rc = 0); CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", - obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, - obd_osfs.os_files); - - osfs->os_bsize = obd_osfs.os_bsize; - osfs->os_blocks = obd_osfs.os_blocks; - osfs->os_bfree = obd_osfs.os_bfree; - osfs->os_bavail = obd_osfs.os_bavail; - - /* If we don't have as many objects free on the OST as inodes - * on the MDS, we reduce the total number of inodes to - * compensate, so that the "inodes in use" number is correct. - */ - if (obd_osfs.os_ffree < osfs->os_ffree) { - osfs->os_files = (osfs->os_files - osfs->os_ffree) + - obd_osfs.os_ffree; - osfs->os_ffree = obd_osfs.os_ffree; - } + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we have _some_ OSTs, but don't have as many free objects on the + * OSTs as inodes on the MDTs, reduce the reported number of inodes + * to compensate, so that the "inodes in use" number is correct. + * This should be kept in sync with lod_statfs() behaviour. 
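/*
 * Editor's note -- worked example for the compensation below (numbers are
 * hypothetical): if the MDT reports os_files = 1000 and os_ffree = 800
 * (200 inodes in use) while the OSTs report only os_ffree = 500 free
 * objects, the client reports
 *
 *	os_files = (1000 - 800) + 500 = 700
 *	os_ffree = 500
 *
 * so "inodes in use" still shows 200, but the free/total counts reflect the
 * OST object limit.
 */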
+ */ + if (obd_osfs.os_files && obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } - RETURN(rc); +out: + RETURN(rc); } int ll_statfs(struct dentry *de, struct kstatfs *sfs) { @@ -1806,12 +1873,10 @@ int ll_statfs(struct dentry *de, struct kstatfs *sfs) CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); - /* Some amount of caching on the client is allowed */ - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - 0); - if (rc) - return rc; + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(ll_s2sbi(sb), &osfs, OBD_STATFS_SUM); + if (rc) + return rc; statfs_unpack(sfs, &osfs); @@ -1855,6 +1920,15 @@ void ll_inode_size_unlock(struct inode *inode) mutex_unlock(&lli->lli_size_mutex); } +void ll_update_inode_flags(struct inode *inode, int ext_flags) +{ + inode->i_flags = ll_ext_to_inode_flags(ext_flags); + if (ext_flags & LUSTRE_PROJINHERIT_FL) + ll_file_set_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); + else + ll_file_clear_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); +} + int ll_update_inode(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1912,7 +1986,7 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) /* Clear i_flags to remove S_NOSEC before permissions are updated */ if (body->mbo_valid & OBD_MD_FLFLAGS) - inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); + ll_update_inode_flags(inode, body->mbo_flags); if (body->mbo_valid & OBD_MD_FLMODE) inode->i_mode = (inode->i_mode & S_IFMT) | (body->mbo_mode & ~S_IFMT); @@ -2040,11 +2114,17 @@ void ll_delete_inode(struct inode *inode) unsigned long nrpages; ENTRY; - if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { /* It is last chance to write out dirty pages, - * otherwise we may lose data while umount */ - cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); - + * otherwise we may lose data while umount. + * + * If i_nlink is 0 then just discard data. This is safe because + * local inode gets i_nlink 0 from server only for the last + * unlink, so that file is not opened somewhere else + */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ? 
+ CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1); + } truncate_inode_pages_final(mapping); /* Workaround for LU-118: Note nrpages may not be totally updated when @@ -2077,13 +2157,13 @@ void ll_delete_inode(struct inode *inode) int ll_iocontrol(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - int rc, flags = 0; - ENTRY; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; - switch(cmd) { - case FSFILT_IOC_GETFLAGS: { + switch (cmd) { + case FS_IOC_GETFLAGS: { struct mdt_body *body; struct md_op_data *op_data; @@ -2107,32 +2187,41 @@ int ll_iocontrol(struct inode *inode, struct file *file, flags = body->mbo_flags; - ptlrpc_req_finished(req); + ptlrpc_req_finished(req); RETURN(put_user(flags, (int __user *)arg)); - } - case FSFILT_IOC_SETFLAGS: { + } + case FS_IOC_SETFLAGS: { struct iattr *attr; struct md_op_data *op_data; struct cl_object *obj; + struct fsxattr fa = { 0 }; if (get_user(flags, (int __user *)arg)) RETURN(-EFAULT); - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + fa.fsx_projid = ll_i2info(inode)->lli_projid; + if (flags & LUSTRE_PROJINHERIT_FL) + fa.fsx_xflags = FS_XFLAG_PROJINHERIT; + + rc = ll_ioctl_check_project(inode, &fa); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); op_data->op_attr_flags = flags; - op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + op_data->op_xvalid |= OP_XVALID_FLAGS; rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); ll_finish_md_op_data(op_data); ptlrpc_req_finished(req); if (rc) RETURN(rc); - inode->i_flags = ll_ext_to_inode_flags(flags); + ll_update_inode_flags(inode, flags); obj = ll_i2info(inode)->lli_clob; if (obj == NULL) @@ -2142,8 +2231,7 @@ int ll_iocontrol(struct inode *inode, struct file *file, if (attr == NULL) RETURN(-ENOMEM); - attr->ia_valid = ATTR_ATTR_FLAG; - rc = cl_setattr_ost(obj, attr, flags); + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, flags); OBD_FREE_PTR(attr); RETURN(rc); @@ -2291,7 +2379,7 @@ void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) } op_data->op_fid1 = body->mbo_fid1; - op_data->op_handle = body->mbo_handle; + op_data->op_open_handle = body->mbo_open_handle; op_data->op_mod_time = ktime_get_real_seconds(); md_close(exp, op_data, NULL, &close_req); ptlrpc_req_finished(close_req); @@ -2384,8 +2472,10 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, md_free_lustre_md(sbi->ll_md_exp, &md); cleanup: - if (rc != 0 && it != NULL && it->it_op & IT_OPEN) + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); ll_open_cleanup(sb != NULL ? sb : (*inode)->i_sb, req); + } return rc; } @@ -2433,30 +2523,21 @@ int ll_obd_statfs(struct inode *inode, void __user *arg) return rc; } -int ll_process_config(struct lustre_cfg *lcfg) +/* + * this is normally called in ll_fini_md_op_data(), but sometimes it needs to + * be called early to avoid deadlock. 
+ */ +void ll_unlock_md_op_lsm(struct md_op_data *op_data) { - struct super_block *sb; - unsigned long x; - int rc = 0; - char *ptr; + if (op_data->op_mea2_sem) { + up_read(op_data->op_mea2_sem); + op_data->op_mea2_sem = NULL; + } - /* The instance name contains the sb: lustre-client-aacfe000 */ - ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); - if (!ptr || !*(++ptr)) - return -EINVAL; - if (sscanf(ptr, "%lx", &x) != 1) - return -EINVAL; - sb = (struct super_block *)x; - /* This better be a real Lustre superblock! */ - LASSERT(s2lsi(sb)->lsi_lmd->lmd_magic == LMD_MAGIC); - - /* Note we have not called client_common_fill_super yet, so - proc fns must be able to handle that! */ - rc = class_process_proc_param(PARAM_LLITE, lprocfs_llite_obd_vars, - lcfg, sb); - if (rc > 0) - rc = 0; - return rc; + if (op_data->op_mea1_sem) { + up_read(op_data->op_mea1_sem); + op_data->op_mea1_sem = NULL; + } } /* this function prepares md_op_data hint for passing it down to MD stack. */ @@ -2475,7 +2556,9 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, if (namelen > ll_i2sbi(i1)->ll_namelen) return ERR_PTR(-ENAMETOOLONG); - if (!lu_name_is_valid_2(name, namelen)) + /* "/" is not valid name, but it's allowed */ + if (!lu_name_is_valid_2(name, namelen) && + strncmp("/", name, namelen) != 0) return ERR_PTR(-EINVAL); } @@ -2488,7 +2571,10 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, ll_i2gids(op_data->op_suppgids, i1, i2); op_data->op_fid1 = *ll_inode2fid(i1); op_data->op_default_stripe_offset = -1; + if (S_ISDIR(i1->i_mode)) { + down_read(&ll_i2info(i1)->lli_lsm_sem); + op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; if (opc == LUSTRE_OPC_MKDIR) op_data->op_default_stripe_offset = @@ -2497,8 +2583,14 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, if (i2) { op_data->op_fid2 = *ll_inode2fid(i2); - if (S_ISDIR(i2->i_mode)) + if (S_ISDIR(i2->i_mode)) { + if (i2 != i1) { + down_read(&ll_i2info(i2)->lli_lsm_sem); + op_data->op_mea2_sem = + &ll_i2info(i2)->lli_lsm_sem; + } op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } } else { fid_zero(&op_data->op_fid2); } @@ -2512,15 +2604,14 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, op_data->op_name = name; op_data->op_namelen = namelen; op_data->op_mode = mode; - op_data->op_mod_time = cfs_time_current_sec(); + op_data->op_mod_time = ktime_get_real_seconds(); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_mds = 0; if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && filename_is_volatile(name, namelen, &op_data->op_mds)) { op_data->op_bias |= MDS_CREATE_VOLATILE; - } else { - op_data->op_mds = 0; } op_data->op_data = data; @@ -2529,9 +2620,10 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, void ll_finish_md_op_data(struct md_op_data *op_data) { + ll_unlock_md_op_lsm(op_data); ll_security_release_secctx(op_data->op_file_secctx, op_data->op_file_secctx_size); - OBD_FREE_PTR(op_data); + OBD_FREE_PTR(op_data); } #ifdef HAVE_SUPEROPS_USE_DENTRY @@ -2540,7 +2632,7 @@ int ll_show_options(struct seq_file *seq, struct dentry *dentry) int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif { - struct ll_sb_info *sbi; + struct ll_sb_info *sbi; #ifdef HAVE_SUPEROPS_USE_DENTRY LASSERT((seq != NULL) && (dentry != NULL)); @@ -2550,20 +2642,25 @@ int ll_show_options(struct seq_file 
*seq, struct vfsmount *vfs) sbi = ll_s2sbi(vfs->mnt_sb); #endif - if (sbi->ll_flags & LL_SBI_NOLCK) - seq_puts(seq, ",nolock"); + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); - if (sbi->ll_flags & LL_SBI_FLOCK) - seq_puts(seq, ",flock"); - - if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - seq_puts(seq, ",localflock"); + /* "flock" is the default since 2.13, but it wasn't for many years, + * so it is still useful to print this to show it is enabled. + * Start to print "noflock" so it is now clear when flock is disabled. + */ + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + else + seq_puts(seq, ",noflock"); - if (sbi->ll_flags & LL_SBI_USER_XATTR) - seq_puts(seq, ",user_xattr"); + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - seq_puts(seq, ",lazystatfs"); + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); if (sbi->ll_flags & LL_SBI_USER_FID2PATH) seq_puts(seq, ",user_fid2path"); @@ -2571,7 +2668,7 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbi->ll_flags & LL_SBI_ALWAYS_PING) seq_puts(seq, ",always_ping"); - RETURN(0); + RETURN(0); } /** @@ -2689,12 +2786,12 @@ ssize_t ll_copy_user_md(const struct lov_user_md __user *md, if (lum_size < 0) RETURN(lum_size); - OBD_ALLOC(*kbuf, lum_size); + OBD_ALLOC_LARGE(*kbuf, lum_size); if (*kbuf == NULL) RETURN(-ENOMEM); if (copy_from_user(*kbuf, md, lum_size) != 0) { - OBD_FREE(*kbuf, lum_size); + OBD_FREE_LARGE(*kbuf, lum_size); RETURN(-EFAULT); } @@ -2722,7 +2819,7 @@ void ll_compute_rootsquash_state(struct ll_sb_info *sbi) matched = false; i = 0; while (LNetGetId(i++, &id) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + if (id.nid == LNET_NID_LO_0) continue; if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { matched = true; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index e286c559c1f67..9be9bd690ee6d 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -150,7 +150,7 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, int result; __u16 refcheck; sigset_t set; - struct inode *inode; + struct inode *inode = NULL; struct ll_inode_info *lli; ENTRY; @@ -222,6 +222,16 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); LASSERT(ergo(result == 0, PageLocked(vmpage))); + /* if page has been unmapped, presumably due to lock reclaim for + * concurrent usage, add some delay before retrying to prevent + * entering live-lock situation with competitors + */ + if (result == -ENODATA && inode != NULL) { + CDEBUG(D_MMAP, "delaying new page-fault for inode %p to " + "prevent live-lock\n", inode); + msleep(10); + } + return result; } @@ -383,6 +393,12 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) result |= VM_FAULT_LOCKED; } cfs_restore_sigs(set); + + if (vmf->page && result == VM_FAULT_LOCKED) + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, LUSTRE_FPRIVATE(vma->vm_file), + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + READ); return result; } @@ -439,6 +455,11 @@ static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, break; } + if (result == VM_FAULT_LOCKED) + 
ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, LUSTRE_FPRIVATE(vma->vm_file), + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + WRITE); return result; } diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c index c24f7f6498ba0..2e207361dd908 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c old mode 100755 new mode 100644 index ee696ef0a4c79..7f95090796d42 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,58 +36,124 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include +#include #include #include #include "llite_internal.h" #include "vvp_internal.h" -struct proc_dir_entry *proc_lustre_fs_root; +static struct kobject *llite_kobj; +static struct dentry *llite_root; -#ifdef CONFIG_PROC_FS -/* /proc/lustre/llite mount point registration */ -static const struct proc_ops ll_rw_extents_stats_fops; -static const struct proc_ops ll_rw_extents_stats_pp_fops; -static const struct proc_ops ll_rw_offset_stats_fops; -static __s64 ll_stats_pid_write(struct file *file, - const char __user *buf, size_t len); +int llite_tunables_register(void) +{ + int rc = 0; + + llite_kobj = class_setup_tunables("llite"); + if (IS_ERR(llite_kobj)) + return PTR_ERR(llite_kobj); + + llite_root = debugfs_create_dir("llite", debugfs_lustre_root); + if (IS_ERR_OR_NULL(llite_root)) { + rc = llite_root ? PTR_ERR(llite_root) : -ENOMEM; + llite_root = NULL; + kobject_put(llite_kobj); + llite_kobj = NULL; + } -static int ll_blksize_seq_show(struct seq_file *m, void *v) + return rc; +} + +void llite_tunables_unregister(void) { - struct super_block *sb = m->private; - struct obd_statfs osfs; + if (llite_kobj) { + kobject_put(llite_kobj); + llite_kobj = NULL; + } + + if (!IS_ERR_OR_NULL(llite_root)) { + debugfs_remove(llite_root); + llite_root = NULL; + } +} + +/* /lustre/llite mount point registration */ +static const struct file_operations ll_rw_extents_stats_fops; +static const struct file_operations ll_rw_extents_stats_pp_fops; +static const struct file_operations ll_rw_offset_stats_fops; + +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. 
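/*
 * Editor's note -- usage sketch based on the description above; the file
 * name and path are illustrative, not taken from the patch:
 *
 *	echo 1 > .../llite/<fsname>-<instance>/offset_stats        # start collecting
 *	echo disable > .../llite/<fsname>-<instance>/offset_stats  # stop ("0" also works)
 *
 * Any other non-numeric string enables collection as well, since the parser
 * falls back to returning 1.
 */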
+ */ +static s64 ll_stats_pid_write(const char __user *buf, size_t len) +{ + unsigned long long value = 1; + char kernbuf[16]; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - seq_printf(m, "%u\n", osfs.os_bsize); - return rc; + rc = kstrtoull_from_user(buf, len, 0, &value); + if (rc < 0 && len < sizeof(kernbuf)) { + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; } -LPROC_SEQ_FOPS_RO(ll_blksize); -static int ll_stat_blksize_seq_show(struct seq_file *m, void *v) +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; - seq_printf(m, "%u\n", sbi->ll_stat_blksize); + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - return 0; + return sprintf(buf, "%u\n", osfs.os_bsize); } +LUSTRE_RO_ATTR(blocksize); -static ssize_t ll_stat_blksize_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stat_blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - __s64 val; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_stat_blksize); +} + +static ssize_t stat_blocksize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; @@ -98,173 +164,135 @@ static ssize_t ll_stat_blksize_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_stat_blksize); +LUSTRE_RW_ATTR(stat_blocksize); -static int ll_kbytestotal_seq_show(struct seq_file *m, void *v) +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; + u32 blk_size; + u64 result; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - while (blk_size >>= 1) - result <<= 1; + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; - seq_printf(m, "%llu\n", result); - } - return rc; + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); } -LPROC_SEQ_FOPS_RO(ll_kbytestotal); +LUSTRE_RO_ATTR(kbytestotal); -static int ll_kbytesfree_seq_show(struct seq_file *m, void *v) +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; + u32 blk_size; + u64 result; int rc; - LASSERT(sb != NULL); - rc = 
ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - while (blk_size >>= 1) - result <<= 1; + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; - seq_printf(m, "%llu\n", result); - } - return rc; + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); } -LPROC_SEQ_FOPS_RO(ll_kbytesfree); +LUSTRE_RO_ATTR(kbytesfree); -static int ll_kbytesavail_seq_show(struct seq_file *m, void *v) +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; + u32 blk_size; + u64 result; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - while (blk_size >>= 1) - result <<= 1; + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; - seq_printf(m, "%llu\n", result); - } - return rc; + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); } -LPROC_SEQ_FOPS_RO(ll_kbytesavail); +LUSTRE_RO_ATTR(kbytesavail); -static int ll_filestotal_seq_show(struct seq_file *m, void *v) +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - seq_printf(m, "%llu\n", osfs.os_files); - return rc; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); } -LPROC_SEQ_FOPS_RO(ll_filestotal); +LUSTRE_RO_ATTR(filestotal); -static int ll_filesfree_seq_show(struct seq_file *m, void *v) +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - seq_printf(m, "%llu\n", osfs.os_ffree); - return rc; -} -LPROC_SEQ_FOPS_RO(ll_filesfree); - -static int ll_client_type_seq_show(struct seq_file *m, void *v) -{ - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - - LASSERT(sbi != NULL); - - seq_puts(m, "local client\n"); - return 0; -} -LPROC_SEQ_FOPS_RO(ll_client_type); - -static int ll_fstype_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - LASSERT(sb != NULL); - seq_printf(m, "%s\n", sb->s_type->name); - return 0; + return sprintf(buf, "%llu\n", osfs.os_ffree); } -LPROC_SEQ_FOPS_RO(ll_fstype); +LUSTRE_RO_ATTR(filesfree); -static int ll_sb_uuid_seq_show(struct seq_file *m, void *v) +static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct 
super_block *sb = m->private; - - LASSERT(sb != NULL); - seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); - return 0; + return sprintf(buf, "local client\n"); } -LPROC_SEQ_FOPS_RO(ll_sb_uuid); +LUSTRE_RO_ATTR(client_type); -static int ll_xattr_cache_seq_show(struct seq_file *m, void *v) +static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - - seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled); - return 0; + return sprintf(buf, "lustre\n"); } +LUSTRE_RO_ATTR(fstype); -static ssize_t ll_xattr_cache_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - __s64 val; - int rc; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc) - return rc; - - if (val != 0 && val != 1) - return -ERANGE; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) - return -ENOTSUPP; - - sbi->ll_xattr_cache_enabled = val; - sbi->ll_xattr_cache_set = 1; - - return count; + return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); } -LPROC_SEQ_FOPS(ll_xattr_cache); +LUSTRE_RO_ATTR(uuid); static int ll_site_stats_seq_show(struct seq_file *m, void *v) { @@ -276,21 +304,21 @@ static int ll_site_stats_seq_show(struct seq_file *m, void *v) */ return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); } -LPROC_SEQ_FOPS_RO(ll_site_stats); + +LDEBUGFS_SEQ_FOPS_RO(ll_site_stats); static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - long pages_number; - int mult; + unsigned long ra_max_mb; spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages; + ra_max_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages); spin_unlock(&sbi->ll_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, pages_number, mult); + seq_printf(m, "%lu\n", ra_max_mb); + return 0; } static ssize_t @@ -300,45 +328,43 @@ ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - __s64 pages_number; + s64 ra_max_mb, pages_number; int rc; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_mb, 'M'); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - + pages_number = round_up(ra_max_mb, 1024 * 1024) >> PAGE_SHIFT; if (pages_number < 0 || pages_number > cfs_totalram_pages() / 2) { /* 1/2 of RAM */ - CERROR("%s: can't set max_readahead_mb=%lu > %luMB\n", - ll_get_fsname(sb, NULL, 0), - (unsigned long)pages_number >> (20 - PAGE_SHIFT), - cfs_totalram_pages() >> (20 - PAGE_SHIFT + 1)); + CERROR("%s: can't set max_readahead_mb=%llu > %luMB\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(cfs_totalram_pages())); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_pages = pages_number; spin_unlock(&sbi->ll_lock); + return count; } -LPROC_SEQ_FOPS(ll_max_readahead_mb); + +LDEBUGFS_SEQ_FOPS(ll_max_readahead_mb); static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct 
ll_sb_info *sbi = ll_s2sbi(sb); - long pages_number; - int mult; + unsigned long ra_max_file_mb; spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + ra_max_file_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file); spin_unlock(&sbi->ll_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, pages_number, mult); + seq_printf(m, "%lu\n", ra_max_file_mb); + return 0; } static ssize_t @@ -349,44 +375,43 @@ ll_max_readahead_per_file_mb_seq_write(struct file *file, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_file_mb, pages_number; int rc; - __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_file_mb, + 'M'); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - + pages_number = round_up(ra_max_file_mb, 1024 * 1024) >> PAGE_SHIFT; if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { - CERROR("%s: can't set max_readahead_per_file_mb=%lu > " - "max_read_ahead_mb=%lu\n", ll_get_fsname(sb, NULL, 0), - (unsigned long)pages_number >> (20 - PAGE_SHIFT), - sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_SHIFT)); + CERROR("%s: can't set max_readahead_per_file_mb=%llu > max_read_ahead_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_pages_per_file = pages_number; spin_unlock(&sbi->ll_lock); + return count; } -LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb); + +LDEBUGFS_SEQ_FOPS(ll_max_readahead_per_file_mb); static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - long pages_number; - int mult; + unsigned long ra_max_whole_mb; spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; + ra_max_whole_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages); spin_unlock(&sbi->ll_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, pages_number, mult); + seq_printf(m, "%lu\n", ra_max_whole_mb); + return 0; } static ssize_t @@ -397,52 +422,50 @@ ll_max_read_ahead_whole_mb_seq_write(struct file *file, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_whole_mb, pages_number; int rc; - __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_whole_mb, + 'M'); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - + pages_number = round_up(ra_max_whole_mb, 1024 * 1024) >> PAGE_SHIFT; /* Cap this at the current max readahead window size, the readahead - * algorithm does this anyway so it's pointless to set it larger. */ + * algorithm does this anyway so it's pointless to set it larger. 
+ */ if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { - int pages_shift = 20 - PAGE_SHIFT; - CERROR("%s: can't set max_read_ahead_whole_mb=%lu > " - "max_read_ahead_per_file_mb=%lu\n", - ll_get_fsname(sb, NULL, 0), - (unsigned long)pages_number >> pages_shift, - sbi->ll_ra_info.ra_max_pages_per_file >> pages_shift); + CERROR("%s: can't set max_read_ahead_whole_mb=%llu > max_read_ahead_per_file_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; spin_unlock(&sbi->ll_lock); + return count; } -LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb); + +LDEBUGFS_SEQ_FOPS(ll_max_read_ahead_whole_mb); static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); struct cl_client_cache *cache = sbi->ll_cache; - int shift = 20 - PAGE_SHIFT; long max_cached_mb; long unused_mb; - max_cached_mb = cache->ccc_lru_max >> shift; - unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; + max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max); + unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left)); seq_printf(m, "users: %d\n" - "max_cached_mb: %ld\n" - "used_mb: %ld\n" - "unused_mb: %ld\n" - "reclaim_count: %u\n", + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n", atomic_read(&cache->ccc_users), max_cached_mb, max_cached_mb - unused_mb, @@ -451,9 +474,9 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) return 0; } -static ssize_t -ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct super_block *sb = m->private; @@ -464,21 +487,20 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, long nrpages = 0; __u16 refcheck; __s64 pages_number; - long rc; + int rc; char kernbuf[128]; - ENTRY; + ENTRY; if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); if (rc) RETURN(rc); @@ -487,7 +509,7 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, if (pages_number < 0 || pages_number > cfs_totalram_pages()) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - cfs_totalram_pages() >> (20 - PAGE_SHIFT)); + PAGES_TO_MiB(cfs_totalram_pages())); RETURN(-ERANGE); } /* Allow enough cache so clients can make well-formed RPCs */ @@ -505,7 +527,7 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, env = cl_env_get(&refcheck); if (IS_ERR(env)) - RETURN(rc); + RETURN(PTR_ERR(env)); diff = -diff; while (diff > 0) { @@ -558,218 +580,225 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, } return rc; } -LPROC_SEQ_FOPS(ll_max_cached_mb); -static int ll_checksum_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS(ll_max_cached_mb); + +static ssize_t checksums_show(struct kobject *kobj, 
struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); - return 0; + return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); } -static ssize_t ll_checksum_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t checksums_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int tmp; int rc; - __s64 val; if (!sbi->ll_dt_exp) /* Not set up yet */ return -EAGAIN; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; if (val) sbi->ll_flags |= LL_SBI_CHECKSUM; else sbi->ll_flags &= ~LL_SBI_CHECKSUM; + tmp = val; rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(val), &val, NULL); + KEY_CHECKSUM, sizeof(tmp), &tmp, NULL); if (rc) CWARN("Failed to set OSC checksum flags: %d\n", rc); return count; } -LPROC_SEQ_FOPS(ll_checksum); +LUSTRE_RW_ATTR(checksums); -static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) +LUSTRE_ATTR(checksum_pages, 0644, checksums_show, checksums_store); + +static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, + enum stats_track_type type) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - if (ll_s2sbi(sb)->ll_stats_track_type == type) { - seq_printf(m, "%d\n", - ll_s2sbi(sb)->ll_stats_track_id); - } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) { - seq_puts(m, "0 (all)\n"); - } else { - seq_puts(m, "untracked\n"); - } - return 0; + if (sbi->ll_stats_track_type == type) + return sprintf(buf, "%d\n", sbi->ll_stats_track_id); + else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + return sprintf(buf, "0 (all)\n"); + + return sprintf(buf, "untracked\n"); } -static int ll_wr_track_id(struct file *file, - const char __user *buffer, unsigned long count, - void *data, enum stats_track_type type) +static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, + size_t count, enum stats_track_type type) { - struct super_block *sb = data; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long pid; int rc; - __s64 pid; - rc = lprocfs_str_to_s64(file, buffer, count, &pid); + rc = kstrtoul(buffer, 10, &pid); if (rc) return rc; - if (pid > INT_MAX || pid < 0) - return -ERANGE; - ll_s2sbi(sb)->ll_stats_track_id = pid; + sbi->ll_stats_track_id = pid; if (pid == 0) - ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; + sbi->ll_stats_track_type = STATS_TRACK_ALL; else - ll_s2sbi(sb)->ll_stats_track_type = type; - lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); + sbi->ll_stats_track_type = type; + lprocfs_clear_stats(sbi->ll_stats); return count; } -static int ll_track_pid_seq_show(struct seq_file *m, void *v) +static ssize_t stats_track_pid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - return ll_rd_track_id(m, STATS_TRACK_PID); + return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); } -static ssize_t ll_track_pid_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t 
*off) +static ssize_t stats_track_pid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - return ll_wr_track_id(file, buffer, count, seq->private, - STATS_TRACK_PID); + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); } -LPROC_SEQ_FOPS(ll_track_pid); +LUSTRE_RW_ATTR(stats_track_pid); -static int ll_track_ppid_seq_show(struct seq_file *m, void *v) +static ssize_t stats_track_ppid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - return ll_rd_track_id(m, STATS_TRACK_PPID); + return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); } -static ssize_t ll_track_ppid_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stats_track_ppid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - return ll_wr_track_id(file, buffer, count, seq->private, - STATS_TRACK_PPID); + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); } -LPROC_SEQ_FOPS(ll_track_ppid); +LUSTRE_RW_ATTR(stats_track_ppid); -static int ll_track_gid_seq_show(struct seq_file *m, void *v) +static ssize_t stats_track_gid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - return ll_rd_track_id(m, STATS_TRACK_GID); + return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); } -static ssize_t ll_track_gid_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stats_track_gid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - return ll_wr_track_id(file, buffer, count, seq->private, - STATS_TRACK_GID); + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); } -LPROC_SEQ_FOPS(ll_track_gid); +LUSTRE_RW_ATTR(stats_track_gid); -static int ll_statahead_running_max_seq_show(struct seq_file *m, void *v) +static ssize_t statahead_running_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", sbi->ll_sa_running_max); - return 0; + return snprintf(buf, 16, "%u\n", sbi->ll_sa_running_max); } -static ssize_t ll_statahead_running_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t statahead_running_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoul(buffer, 0, &val); if (rc) return rc; - if (val >= 0 || val <= LL_SA_RUNNING_MAX) + if (val <= LL_SA_RUNNING_MAX) { sbi->ll_sa_running_max = val; - else - CERROR("%s: bad statahead_running_max value %lld. Valid values " - "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), - val, LL_SA_RUNNING_MAX); + return count; + } - return count; + CERROR("Bad statahead_running_max value %lu. 
Valid values " + "are in the range [0, %d]\n", val, LL_SA_RUNNING_MAX); + + return -ERANGE; } -LPROC_SEQ_FOPS(ll_statahead_running_max); +LUSTRE_RW_ATTR(statahead_running_max); -static int ll_statahead_max_seq_show(struct seq_file *m, void *v) +static ssize_t statahead_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", sbi->ll_sa_max); - return 0; + return sprintf(buf, "%u\n", sbi->ll_sa_max); } -static ssize_t ll_statahead_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t statahead_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoul(buffer, 0, &val); if (rc) return rc; - if (val >= 0 && val <= LL_SA_RPC_MAX) + if (val <= LL_SA_RPC_MAX) sbi->ll_sa_max = val; else - CERROR("%s: bad statahead_max value %lld. Valid values are in " - "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), + CERROR("Bad statahead_max value %lu. Valid values are in the range [0, %d]\n", val, LL_SA_RPC_MAX); return count; } -LPROC_SEQ_FOPS(ll_statahead_max); +LUSTRE_RW_ATTR(statahead_max); -static int ll_statahead_agl_seq_show(struct seq_file *m, void *v) +static ssize_t statahead_agl_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", - sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); - return 0; + return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 
1 : 0); } -static ssize_t ll_statahead_agl_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t statahead_agl_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; @@ -780,7 +809,7 @@ static ssize_t ll_statahead_agl_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_statahead_agl); +LUSTRE_RW_ATTR(statahead_agl); static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) { @@ -788,35 +817,37 @@ static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) struct ll_sb_info *sbi = ll_s2sbi(sb); seq_printf(m, "statahead total: %u\n" - "statahead wrong: %u\n" - "agl total: %u\n", - atomic_read(&sbi->ll_sa_total), - atomic_read(&sbi->ll_sa_wrong), - atomic_read(&sbi->ll_agl_total)); + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); return 0; } -LPROC_SEQ_FOPS_RO(ll_statahead_stats); -static int ll_lazystatfs_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS_RO(ll_statahead_stats); + +static ssize_t lazystatfs_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", - (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0); - return 0; + return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 
1 : 0); } -static ssize_t ll_lazystatfs_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t lazystatfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; @@ -827,12 +858,44 @@ static ssize_t ll_lazystatfs_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_lazystatfs); +LUSTRE_RW_ATTR(lazystatfs); -static int ll_max_easize_seq_show(struct seq_file *m, void *v) +static ssize_t statfs_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_statfs_max_age); +} + +static ssize_t statfs_max_age_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + if (val > OBD_STATFS_CACHE_MAX_AGE) + return -EINVAL; + + sbi->ll_statfs_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(statfs_max_age); + +static ssize_t max_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); unsigned int ealen; int rc; @@ -840,10 +903,9 @@ static int ll_max_easize_seq_show(struct seq_file *m, void *v) if (rc) return rc; - seq_printf(m, "%u\n", ealen); - return 0; + return sprintf(buf, "%u\n", ealen); } -LPROC_SEQ_FOPS_RO(ll_max_easize); +LUSTRE_RO_ATTR(max_easize); /** * Get default_easize. 
@@ -856,10 +918,12 @@ LPROC_SEQ_FOPS_RO(ll_max_easize); * \retval 0 on success * \retval negative negated errno on failure */ -static int ll_default_easize_seq_show(struct seq_file *m, void *v) +static ssize_t default_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); unsigned int ealen; int rc; @@ -867,8 +931,7 @@ static int ll_default_easize_seq_show(struct seq_file *m, void *v) if (rc) return rc; - seq_printf(m, "%u\n", ealen); - return 0; + return sprintf(buf, "%u\n", ealen); } /** @@ -887,24 +950,22 @@ static int ll_default_easize_seq_show(struct seq_file *m, void *v) * \retval positive \a count on success * \retval negative negated errno on failure */ -static ssize_t ll_default_easize_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *unused) -{ - struct seq_file *seq = file->private_data; - struct super_block *sb = (struct super_block *)seq->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - __s64 val; +static ssize_t default_easize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; int rc; if (count == 0) return 0; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; rc = ll_set_default_mdsize(sbi, val); if (rc) @@ -912,7 +973,7 @@ static ssize_t ll_default_easize_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_default_easize); +LUSTRE_RW_ATTR(default_easize); static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) { @@ -936,74 +997,112 @@ static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) seq_printf(m, "\b\n"); return 0; } -LPROC_SEQ_FOPS_RO(ll_sbi_flags); -static int ll_fast_read_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS_RO(ll_sbi_flags); + +static ssize_t xattr_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); - return 0; + return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); } -static ssize_t -ll_fast_read_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t xattr_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; +} +LUSTRE_RW_ATTR(xattr_cache); + +static ssize_t tiny_write_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_TINY_WRITE)); +} + +static ssize_t 
tiny_write_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); if (rc) return rc; spin_lock(&sbi->ll_lock); - if (val == 1) - sbi->ll_flags |= LL_SBI_FAST_READ; + if (val) + sbi->ll_flags |= LL_SBI_TINY_WRITE; else - sbi->ll_flags &= ~LL_SBI_FAST_READ; + sbi->ll_flags &= ~LL_SBI_TINY_WRITE; spin_unlock(&sbi->ll_lock); return count; } -LPROC_SEQ_FOPS(ll_fast_read); +LUSTRE_RW_ATTR(tiny_write); -static int ll_pio_seq_show(struct seq_file *m, void *v) +static ssize_t fast_read_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_PIO)); - return 0; + return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); } -static ssize_t ll_pio_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t fast_read_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; spin_lock(&sbi->ll_lock); - if (val == 1) - sbi->ll_flags |= LL_SBI_PIO; + if (val) + sbi->ll_flags |= LL_SBI_FAST_READ; else - sbi->ll_flags &= ~LL_SBI_PIO; + sbi->ll_flags &= ~LL_SBI_FAST_READ; spin_unlock(&sbi->ll_lock); return count; } -LPROC_SEQ_FOPS(ll_pio); +LUSTRE_RW_ATTR(fast_read); static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) { @@ -1017,8 +1116,8 @@ static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) mb = (pages * PAGE_SIZE) >> 20; seq_printf(m, "unstable_check: %8d\n" - "unstable_pages: %12ld\n" - "unstable_mb: %8d\n", + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", cache->ccc_unstable_check, pages, mb); return 0; } @@ -1030,32 +1129,33 @@ static ssize_t ll_unstable_stats_seq_write(struct file *file, struct seq_file *seq = file->private_data; struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); char kernbuf[128]; + bool val; int rc; - __s64 val; if (count == 0) return 0; if (count >= sizeof(kernbuf)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - kernbuf; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool_from_user(buffer, count, &val); if (rc < 0) return rc; /* borrow lru lock to set the value */ spin_lock(&sbi->ll_cache->ccc_lru_lock); - sbi->ll_cache->ccc_unstable_check = !!val; + sbi->ll_cache->ccc_unstable_check = val; spin_unlock(&sbi->ll_cache->ccc_lru_lock); return count; } -LPROC_SEQ_FOPS(ll_unstable_stats); + +LDEBUGFS_SEQ_FOPS(ll_unstable_stats); static int ll_root_squash_seq_show(struct seq_file *m, void *v) { @@ -1076,10 +1176,11 @@ static ssize_t ll_root_squash_seq_write(struct file *file, struct ll_sb_info *sbi = ll_s2sbi(sb); struct root_squash_info *squash = &sbi->ll_squash; - return lprocfs_wr_root_squash(file, buffer, count, 
squash, + return lprocfs_wr_root_squash(buffer, count, squash, ll_get_fsname(sb, NULL, 0)); } -LPROC_SEQ_FOPS(ll_root_squash); + +LDEBUGFS_SEQ_FOPS(ll_root_squash); static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) { @@ -1112,7 +1213,7 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, struct root_squash_info *squash = &sbi->ll_squash; int rc; - rc = lprocfs_wr_nosquash_nids(file, buffer, count, squash, + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, ll_get_fsname(sb, NULL, 0)); if (rc < 0) return rc; @@ -1121,80 +1222,77 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, return rc; } -LPROC_SEQ_FOPS(ll_nosquash_nids); -struct lprocfs_vars lprocfs_llite_obd_vars[] = { - { .name = "uuid", - .fops = &ll_sb_uuid_fops }, - { .name = "fstype", - .fops = &ll_fstype_fops }, +LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); + +struct ldebugfs_vars lprocfs_llite_obd_vars[] = { { .name = "site", .fops = &ll_site_stats_fops }, - { .name = "blocksize", - .fops = &ll_blksize_fops }, - { .name = "stat_blocksize", - .fops = &ll_stat_blksize_fops }, - { .name = "kbytestotal", - .fops = &ll_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &ll_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &ll_kbytesavail_fops }, - { .name = "filestotal", - .fops = &ll_filestotal_fops }, - { .name = "filesfree", - .fops = &ll_filesfree_fops }, - { .name = "client_type", - .fops = &ll_client_type_fops }, - { .name = "max_read_ahead_mb", - .fops = &ll_max_readahead_mb_fops }, - { .name = "max_read_ahead_per_file_mb", - .fops = &ll_max_readahead_per_file_mb_fops }, - { .name = "max_read_ahead_whole_mb", - .fops = &ll_max_read_ahead_whole_mb_fops }, + { .name = "max_read_ahead_mb", + .fops = &ll_max_readahead_mb_fops }, + { .name = "max_read_ahead_per_file_mb", + .fops = &ll_max_readahead_per_file_mb_fops }, + { .name = "max_read_ahead_whole_mb", + .fops = &ll_max_read_ahead_whole_mb_fops }, { .name = "max_cached_mb", .fops = &ll_max_cached_mb_fops }, - { .name = "checksum_pages", - .fops = &ll_checksum_fops }, - { .name = "stats_track_pid", - .fops = &ll_track_pid_fops }, - { .name = "stats_track_ppid", - .fops = &ll_track_ppid_fops }, - { .name = "stats_track_gid", - .fops = &ll_track_gid_fops }, - { .name = "statahead_max", - .fops = &ll_statahead_max_fops }, - { .name = "statahead_running_max", - .fops = &ll_statahead_running_max_fops }, - { .name = "statahead_agl", - .fops = &ll_statahead_agl_fops }, { .name = "statahead_stats", .fops = &ll_statahead_stats_fops }, - { .name = "lazystatfs", - .fops = &ll_lazystatfs_fops }, - { .name = "max_easize", - .fops = &ll_max_easize_fops }, - { .name = "default_easize", - .fops = &ll_default_easize_fops }, - { .name = "sbi_flags", - .fops = &ll_sbi_flags_fops }, - { .name = "xattr_cache", - .fops = &ll_xattr_cache_fops }, { .name = "unstable_stats", .fops = &ll_unstable_stats_fops }, + { .name = "sbi_flags", + .fops = &ll_sbi_flags_fops }, { .name = "root_squash", .fops = &ll_root_squash_fops }, { .name = "nosquash_nids", .fops = &ll_nosquash_nids_fops }, - { .name = "fast_read", - .fops = &ll_fast_read_fops, }, - { .name = "pio", - .fops = &ll_pio_fops, }, { NULL } }; #define MAX_STRING_SIZE 128 +static struct attribute *llite_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_stat_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_client_type.attr, + &lustre_attr_fstype.attr, + 
&lustre_attr_uuid.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_pages.attr, + &lustre_attr_stats_track_pid.attr, + &lustre_attr_stats_track_ppid.attr, + &lustre_attr_stats_track_gid.attr, + &lustre_attr_statahead_running_max.attr, + &lustre_attr_statahead_max.attr, + &lustre_attr_statahead_agl.attr, + &lustre_attr_lazystatfs.attr, + &lustre_attr_statfs_max_age.attr, + &lustre_attr_max_easize.attr, + &lustre_attr_default_easize.attr, + &lustre_attr_xattr_cache.attr, + &lustre_attr_fast_read.attr, + &lustre_attr_tiny_write.attr, + NULL, +}; + +static void llite_kobj_release(struct kobject *kobj) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + complete(&sbi->ll_kobj_unregister); +} + +static struct kobj_type llite_ktype = { + .default_attrs = llite_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = llite_kobj_release, +}; + static const struct llite_file_opcode { __u32 opcode; __u32 type; @@ -1280,60 +1378,45 @@ static const char *ra_stat_string[] = { [RA_STAT_FAILED_REACH_END] = "failed to reach end" }; -LPROC_SEQ_FOPS_RO_TYPE(llite, name); -LPROC_SEQ_FOPS_RO_TYPE(llite, uuid); - -int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, - struct super_block *sb) +int ll_debugfs_register_super(struct super_block *sb, const char *name) { - struct lprocfs_vars lvars[2]; struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); - char name[MAX_STRING_SIZE + 1], *ptr; - int err, id, len, rc; - ENTRY; - - memset(lvars, 0, sizeof(lvars)); - - name[MAX_STRING_SIZE] = '\0'; - lvars[0].name = name; + int err, id, rc; - LASSERT(sbi != NULL); - - /* Get fsname */ - len = strlen(lsi->lsi_lmd->lmd_profile); - ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; + ENTRY; + LASSERT(sbi); - /* Mount info */ - snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, - lsi->lsi_lmd->lmd_profile, sb); + if (IS_ERR_OR_NULL(llite_root)) + goto out_ll_kset; - sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); - if (IS_ERR(sbi->ll_proc_root)) { - err = PTR_ERR(sbi->ll_proc_root); - sbi->ll_proc_root = NULL; + sbi->ll_debugfs_entry = ldebugfs_register(name, llite_root, + lprocfs_llite_obd_vars, sb); + if (IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) { + err = sbi->ll_debugfs_entry ? 
PTR_ERR(sbi->ll_debugfs_entry) : + -ENOMEM; + sbi->ll_debugfs_entry = NULL; RETURN(err); } - rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, - &vvp_dump_pgcache_file_ops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "dump_page_cache",0444, + &vvp_dump_pgcache_file_ops, sbi); if (rc) CWARN("Error adding the dump_page_cache file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, - &ll_rw_extents_stats_fops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); if (rc) CWARN("Error adding the extent_stats file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", - 0644, &ll_rw_extents_stats_pp_fops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, + "extents_stats_per_process", 0644, + &ll_rw_extents_stats_pp_fops, sbi); if (rc) CWARN("Error adding the extents_stats_per_process file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, - &ll_rw_offset_stats_fops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); if (rc) CWARN("Error adding the offset_stats file\n"); @@ -1341,11 +1424,13 @@ int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, LPROCFS_STATS_FLAG_NONE); if (sbi->ll_stats == NULL) - GOTO(out, err = -ENOMEM); + GOTO(out_debugfs, err = -ENOMEM); + /* do counter init */ for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { - __u32 type = llite_opcode_table[id].type; + u32 type = llite_opcode_table[id].type; void *ptr = NULL; + if (type & LPROCFS_TYPE_REGS) ptr = "regs"; else if (type & LPROCFS_TYPE_BYTES) @@ -1357,98 +1442,78 @@ int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, (type & LPROCFS_CNTR_AVGMINMAX), llite_opcode_table[id].opname, ptr); } - err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); + + err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "stats", + sbi->ll_stats); if (err) - GOTO(out, err); + GOTO(out_stats, err); sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), LPROCFS_STATS_FLAG_NONE); if (sbi->ll_ra_stats == NULL) - GOTO(out, err = -ENOMEM); + GOTO(out_stats, err = -ENOMEM); for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) lprocfs_counter_init(sbi->ll_ra_stats, id, 0, ra_stat_string[id], "pages"); - err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", - sbi->ll_ra_stats); - if (err) - GOTO(out, err); + err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + GOTO(out_ra_stats, err); + +out_ll_kset: + /* Yes we also register sysfs mount kset here as well */ + sbi->ll_kset.kobj.parent = llite_kobj; + sbi->ll_kset.kobj.ktype = &llite_ktype; + init_completion(&sbi->ll_kobj_unregister); + err = kobject_set_name(&sbi->ll_kset.kobj, "%s", name); + if (err) + GOTO(out_ra_stats, err); - err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + err = kset_register(&sbi->ll_kset); if (err) - GOTO(out, err); + GOTO(out_ra_stats, err); + + lsi->lsi_kobj = kobject_get(&sbi->ll_kset.kobj); + + RETURN(0); +out_ra_stats: + lprocfs_free_stats(&sbi->ll_ra_stats); +out_stats: + lprocfs_free_stats(&sbi->ll_stats); +out_debugfs: + ldebugfs_remove(&sbi->ll_debugfs_entry); -out: - if (err) { - lprocfs_remove(&sbi->ll_proc_root); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } RETURN(err); } -int lprocfs_ll_register_obd(struct 
super_block *sb, const char *obdname) +void ll_debugfs_unregister_super(struct super_block *sb) { - struct lprocfs_vars lvars[2]; + struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct proc_dir_entry *dir; - char name[MAX_STRING_SIZE + 1]; - int err; - ENTRY; - - memset(lvars, 0, sizeof(lvars)); - - name[MAX_STRING_SIZE] = '\0'; - lvars[0].name = name; - LASSERT(sbi != NULL); - LASSERT(obdname != NULL); + if (!IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) + ldebugfs_remove(&sbi->ll_debugfs_entry); - obd = class_name2obd(obdname); + if (sbi->ll_dt_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); - LASSERT(obd != NULL); - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - LASSERT(obd->obd_type->typ_name != NULL); + if (sbi->ll_md_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); - dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); - if (dir == NULL) - GOTO(out, err = -ENOMEM); - - snprintf(name, MAX_STRING_SIZE, "common_name"); - lvars[0].fops = &llite_name_fops; - err = lprocfs_add_vars(dir, lvars, obd); - if (err) - GOTO(out, err); + kobject_put(lsi->lsi_kobj); - snprintf(name, MAX_STRING_SIZE, "uuid"); - lvars[0].fops = &llite_uuid_fops; - err = lprocfs_add_vars(dir, lvars, obd); - if (err) - GOTO(out, err); + kset_unregister(&sbi->ll_kset); + wait_for_completion(&sbi->ll_kobj_unregister); -out: - if (err) { - lprocfs_remove(&sbi->ll_proc_root); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } - RETURN(err); -} - -void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) -{ - if (sbi->ll_proc_root) { - lprocfs_remove(&sbi->ll_proc_root); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); } #undef MAX_STRING_SIZE -#define pct(a,b) (b ? a * 100 / b : 0) - static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, struct seq_file *seq, int which) { @@ -1472,14 +1537,14 @@ static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, w = pp_info->pp_w_hist.oh_buckets[i]; read_cum += r; write_cum += w; - end = 1 << (i + LL_HIST_START - units); - seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | " - "%14lu %4lu %4lu\n", start, *unitp, end, *unitp, + end = BIT(i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u | " + "%14lu %4u %4u\n", start, *unitp, end, *unitp, (i == LL_HIST_MAX - 1) ? 
'+' : ' ', r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), pct(write_cum, write_tot)); start = end; - if (start == 1<<10) { + if (start == BIT(10)) { start = 1; units += 10; unitp++; @@ -1534,7 +1599,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(file, buf, len); + value = ll_stats_pid_write(buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1551,7 +1616,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, return len; } -LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats_pp); static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) { @@ -1592,7 +1657,7 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(file, buf, len); + value = ll_stats_pid_write(buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1609,7 +1674,8 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, return len; } -LPROC_SEQ_FOPS(ll_rw_extents_stats); + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats); void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct ll_file_data *file, loff_t pos, @@ -1646,15 +1712,15 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); } - for(i = 0; (count >= (1 << LL_HIST_START << i)) && - (i < (LL_HIST_MAX - 1)); i++); - if (rw == 0) { - io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; - } else { - io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; - } + for (i = 0; (count >= BIT(LL_HIST_START + i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } spin_unlock(&sbi->ll_pp_extent_lock); spin_lock(&sbi->ll_process_lock); @@ -1740,7 +1806,7 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { if (offset[i].rw_pid != 0) seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", offset[i].rw_op == READ ? 'R' : 'W', offset[i].rw_pid, offset[i].rw_range_start, @@ -1754,7 +1820,7 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { if (process[i].rw_pid != 0) seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", process[i].rw_op == READ ? 'R' : 'W', process[i].rw_pid, process[i].rw_range_start, @@ -1781,7 +1847,7 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(file, buf, len); + value = ll_stats_pid_write(buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1800,43 +1866,4 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, return len; } -/** - * ll_stats_pid_write() - Determine if stats collection should be enabled - * @buf: Buffer containing the data written - * @len: Number of bytes in the buffer - * - * Several proc files begin collecting stats when a value is written, and stop - * collecting when either '0' or 'disable' is written. 
This function checks the - * written value to see if collection should be enabled or disabled. - * - * Return: If '0' or 'disable' is provided, 0 is returned. If the text - * equivalent of a number is written, that number is returned. Otherwise, - * 1 is returned. Non-zero return values indicate collection should be enabled. - */ -static __s64 ll_stats_pid_write(struct file *file, const char __user *buf, - size_t len) -{ - __s64 value = 1; - int rc; - char kernbuf[16]; - - rc = lprocfs_str_to_s64(file, buf, len, &value); - - if (rc < 0 && len < sizeof(kernbuf)) { - - if (lprocfs_copy_from_user(file, kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strncasecmp(kernbuf, "disable", 7) == 0) - value = 0; - } - - return value; -} - -LPROC_SEQ_FOPS(ll_rw_offset_stats); -#endif /* CONFIG_PROC_FS */ +LDEBUGFS_SEQ_FOPS(ll_rw_offset_stats); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index ae7101b1885f2..dea41a48b589a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,7 +46,6 @@ #include #include #include -#include #include "llite_internal.h" static int ll_create_it(struct inode *dir, struct dentry *dentry, @@ -138,6 +137,9 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, inode_has_no_xattr(inode); unlock_new_inode(inode); } + } else if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { rc = ll_update_inode(inode, md); CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", @@ -181,168 +183,314 @@ int ll_test_inode_by_fid(struct inode *inode, void *opaque) return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); } -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) +static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) { - struct lustre_handle lockh; + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; int rc; ENTRY; - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); - RETURN(rc); - } - break; - case LDLM_CB_CANCELING: { - struct inode *inode = ll_inode_from_resource_lock(lock); - __u64 bits = lock->l_policy_data.l_inodebits.bits; + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* reach MDC layer to flush data under the DoM ldlm lock */ + rc = cl_object_flush(env, lli->lli_clob, lock); + if (rc == -ENODATA) { + CDEBUG(D_INODE, "inode "DFID" layout has no DoM stripe\n", + PFID(ll_inode2fid(inode))); + /* most likely result of layout change, do nothing */ + rc = 0; + } - /* Inode is set to lock->l_resource->lr_lvb_inode - * for mdc - bug 24555 */ - LASSERT(lock->l_ast_data == NULL); + cl_env_put(env, &refcheck); + RETURN(rc); +} - if (inode == NULL) - break; +static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) +{ + struct inode *inode = ll_inode_from_resource_lock(lock); + __u64 bits = to_cancel; + int rc; - /* Invalidate all dentries 
associated with this inode */ - LASSERT(ldlm_is_canceling(lock)); + ENTRY; + + if (!inode) { + /* That means the inode is evicted most likely and may cause + * the skipping of lock cleanups below, so print the message + * about that in log. + */ + if (lock->l_resource->lr_lvb_inode) + LDLM_DEBUG(lock, + "can't take inode for the lock (%sevicted)\n", + lock->l_resource->lr_lvb_inode->i_state & + I_FREEING ? "" : "not "); + RETURN_EXIT; + } + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + if (S_ISDIR(inode->i_mode)) + ll_i2info(inode)->lli_def_stripe_offset = -1; + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } - if (!fid_res_name_eq(ll_inode2fid(inode), - &lock->l_resource->lr_name)) { - LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", - PFID(ll_inode2fid(inode)), inode); + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); LBUG(); } - if (bits & MDS_INODELOCK_XATTR) { - if (S_ISDIR(inode->i_mode)) - ll_i2info(inode)->lli_def_stripe_offset = -1; - ll_xattr_cache_destroy(inode); - bits &= ~MDS_INODELOCK_XATTR; - } + ll_md_real_close(inode, fmode); - /* For OPEN locks we differentiate between lock modes - * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ - if (bits & MDS_INODELOCK_OPEN) - ll_have_md_lock(inode, &bits, lock->l_req_mode); - - if (bits & MDS_INODELOCK_OPEN) { - fmode_t fmode; - - switch (lock->l_req_mode) { - case LCK_CW: - fmode = FMODE_WRITE; - break; - case LCK_PR: - fmode = FMODE_EXEC; - break; - case LCK_CR: - fmode = FMODE_READ; - break; - default: - LDLM_ERROR(lock, "bad lock mode for OPEN lock"); - LBUG(); - } + bits &= ~MDS_INODELOCK_OPEN; + } - ll_md_real_close(inode, fmode); + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM | + MDS_INODELOCK_DOM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); - bits &= ~MDS_INODELOCK_OPEN; - } + if (bits & MDS_INODELOCK_DOM) { + rc = ll_dom_lock_cancel(inode, lock); + if (rc < 0) + CDEBUG(D_INODE, "cannot flush DoM data " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } - if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) - ll_have_md_lock(inode, &bits, LCK_MINMODE); - - if (bits & MDS_INODELOCK_LAYOUT) { - struct cl_object_conf conf = { - .coc_opc = OBJECT_CONF_INVALIDATE, - .coc_inode = inode, - }; - - rc = ll_layout_conf(inode, &conf); - if (rc < 0) - CDEBUG(D_INODE, "cannot invalidate layout of " - DFID": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); - lli->lli_update_atime = 1; - } + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } - if ((bits & MDS_INODELOCK_UPDATE) && 
S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); + if (bits & MDS_INODELOCK_UPDATE) { + struct ll_inode_info *lli = ll_i2info(inode); - CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " - "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), - lli, PFID(&lli->lli_pfid)); - truncate_inode_pages(inode->i_mapping, 0); + lli->lli_update_atime = 1; + } - if (unlikely(!fid_is_zero(&lli->lli_pfid))) { - struct inode *master_inode = NULL; - unsigned long hash; + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); - /* This is slave inode, since all of the child - * dentry is connected on the master inode, so - * we have to invalidate the negative children - * on master inode */ - CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", - PFID(ll_inode2fid(inode)), - PFID(&lli->lli_pfid)); + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); - hash = cl_fid_build_ino(&lli->lli_pfid, + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; + + /* This is slave inode, since all of the child dentry + * is connected on the master inode, so we have to + * invalidate the negative children on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid)); + + hash = cl_fid_build_ino(&lli->lli_pfid, ll_need_32bit_api(ll_i2sbi(inode))); - /* Do not lookup the inode with ilookup5, - * otherwise it will cause dead lock, - * - * 1. Client1 send chmod req to the MDT0, then - * on MDT0, it enqueues master and all of its - * slaves lock, (mdt_attr_set() -> - * mdt_lock_slaves()), after gets master and - * stripe0 lock, it will send the enqueue req - * (for stripe1) to MDT1, then MDT1 finds the - * lock has been granted to client2. Then MDT1 - * sends blocking ast to client2. - * - * 2. At the same time, client2 tries to unlink - * the striped dir (rm -rf striped_dir), and - * during lookup, it will hold the master inode - * of the striped directory, whose inode state - * is NEW, then tries to revalidate all of its - * slaves, (ll_prep_inode()->ll_iget()-> - * ll_read_inode2()-> ll_update_inode().). And - * it will be blocked on the server side because - * of 1. - * - * 3. Then the client get the blocking_ast req, - * cancel the lock, but being blocked if using - * ->ilookup5()), because master inode state is - * NEW. */ - master_inode = ilookup5_nowait(inode->i_sb, - hash, ll_test_inode_by_fid, + /* Do not lookup the inode with ilookup5, otherwise + * it will cause dead lock, + * 1. Client1 send chmod req to the MDT0, then on MDT0, + * it enqueues master and all of its slaves lock, + * (mdt_attr_set() -> mdt_lock_slaves()), after gets + * master and stripe0 lock, it will send the enqueue + * req (for stripe1) to MDT1, then MDT1 finds the lock + * has been granted to client2. Then MDT1 sends blocking + * ast to client2. + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and during + * lookup, it will hold the master inode of the striped + * directory, whose inode state is NEW, then tries to + * revalidate all of its slaves, (ll_prep_inode()-> + * ll_iget()->ll_read_inode2()-> ll_update_inode().). + * And it will be blocked on the server side because + * of 1. + * 3. 
Then the client get the blocking_ast req, cancel + * the lock, but being blocked if using ->ilookup5()), + * because master inode state is NEW. */ + master_inode = ilookup5_nowait(inode->i_sb, hash, + ll_test_inode_by_fid, (void *)&lli->lli_pfid); - if (master_inode) { - ll_invalidate_negative_children( - master_inode); - iput(master_inode); - } - } else { - ll_invalidate_negative_children(inode); + if (master_inode) { + ll_invalidate_negative_children(master_inode); + iput(master_inode); } + } else { + ll_invalidate_negative_children(inode); } + } - if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && - inode->i_sb->s_root != NULL && - inode != inode->i_sb->s_root->d_inode) - ll_invalidate_aliases(inode); + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + inode != inode->i_sb->s_root->d_inode) + ll_invalidate_aliases(inode); - iput(inode); + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) + forget_all_cached_acls(inode); + + iput(inode); + RETURN_EXIT; +} + +/* Check if the given lock may be downgraded instead of canceling and + * that convert is really needed. */ +int ll_md_need_convert(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct inode *inode; + __u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits; + __u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted; + enum ldlm_mode mode = LCK_MINMODE; + + if (!lock->l_conn_export || + !exp_connect_lock_convert(lock->l_conn_export)) + return 0; + + if (!wanted || !bits || ldlm_is_cancel(lock)) + return 0; + + /* do not convert locks other than DOM for now */ + if (!((bits | wanted) & MDS_INODELOCK_DOM)) + return 0; + + /* We may have already remaining bits in some other lock so + * lock convert will leave us just extra lock for the same bit. + * Check if client has other lock with the same bits and the same + * or lower mode and don't convert if any. + */ + switch (lock->l_req_mode) { + case LCK_PR: + mode = LCK_PR; + /* fallthrough */ + case LCK_PW: + mode |= LCK_CR; + break; + case LCK_CW: + mode = LCK_CW; + /* fallthrough */ + case LCK_CR: + mode |= LCK_CR; + break; + default: + /* do not convert other modes */ + return 0; + } + + /* is lock is too old to be converted? */ + lock_res_and_lock(lock); + if (ktime_after(ktime_get(), + ktime_add(lock->l_last_used, + ktime_set(ns->ns_dirty_age_limit, 0)))) { + unlock_res_and_lock(lock); + return 0; + } + unlock_res_and_lock(lock); + + inode = ll_inode_from_resource_lock(lock); + ll_have_md_lock(inode, &bits, mode); + iput(inode); + return !!(bits); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *ld, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + { + __u64 cancel_flags = LCF_ASYNC; + + /* if lock convert is not needed then still have to + * pass lock via ldlm_cli_convert() to keep all states + * correct, set cancel_bits to full lock bits to cause + * full cancel to happen. 
+ */ + if (!ll_md_need_convert(lock)) { + lock_res_and_lock(lock); + lock->l_policy_data.l_inodebits.cancel_bits = + lock->l_policy_data.l_inodebits.bits; + unlock_res_and_lock(lock); + } + rc = ldlm_cli_convert(lock, cancel_flags); + if (!rc) + RETURN(0); + /* continue with cancel otherwise */ + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, cancel_flags); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + RETURN(rc); + } + break; + } + case LDLM_CB_CANCELING: + { + __u64 to_cancel = lock->l_policy_data.l_inodebits.bits; + + /* Nothing to do for non-granted locks */ + if (!ldlm_is_granted(lock)) + break; + + /* If 'ld' is supplied then bits to be cancelled are passed + * implicitly by lock converting and cancel_bits from 'ld' + * should be used. Otherwise full cancel is being performed + * and lock inodebits are used. + * + * Note: we cannot rely on cancel_bits in lock itself at this + * moment because they can be changed by concurrent thread, + * so ldlm_cli_inodebits_convert() pass cancel bits implicitly + * in 'ld' parameter. + */ + if (ld) { + /* partial bits cancel allowed only during convert */ + LASSERT(ldlm_is_converting(lock)); + /* mask cancel bits by lock bits so only no any unused + * bits are passed to ll_lock_cancel_bits() + */ + to_cancel &= ld->l_policy_data.l_inodebits.cancel_bits; + } + ll_lock_cancel_bits(lock, to_cancel); break; } default: @@ -462,7 +610,8 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) static int ll_lookup_it_finish(struct ptlrpc_request *request, struct lookup_intent *it, - struct inode *parent, struct dentry **de) + struct inode *parent, struct dentry **de, + void *secctx, __u32 secctxlen, ktime_t kstart) { struct inode *inode = NULL; __u64 bits = 0; @@ -475,20 +624,56 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, it->it_disposition); if (!it_disposition(it, DISP_LOOKUP_NEG)) { - rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); - if (rc) - RETURN(rc); - - ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); - - /* We used to query real size from OSTs here, but actually - this is not needed. For stat() calls size would be updated - from subsequent do_revalidate()->ll_inode_revalidate_it() in - 2.4 and - vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 - Everybody else who needs correct file size would call - ll_glimpse_size or some equivalent themselves anyway. - Also see bug 7198. */ + struct req_capsule *pill = &request->rq_pill; + struct mdt_body *body = req_capsule_server_get(pill, + &RMF_MDT_BODY); + + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + /* OPEN can return data if lock has DoM+LAYOUT bits set */ + if (it->it_op & IT_OPEN && + bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(inode, request); + + /* We used to query real size from OSTs here, but actually + * this is not needed. For stat() calls size would be updated + * from subsequent do_revalidate()->ll_inode_revalidate_it() in + * 2.4 and + * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + * Everybody else who needs correct file size would call + * ll_glimpse_size or some equivalent themselves anyway. + * Also see bug 7198. + */ + + /* If security context was returned by MDT, put it in + * inode now to save an extra getxattr from security hooks, + * and avoid deadlock. 
+ */ + if (body->mbo_valid & OBD_MD_SECCTX) { + secctx = req_capsule_server_get(pill, &RMF_FILE_SECCTX); + secctxlen = req_capsule_get_size(pill, + &RMF_FILE_SECCTX, + RCL_SERVER); + + if (secctxlen) + CDEBUG(D_SEC, "server returned security context" + " for "DFID"\n", + PFID(ll_inode2fid(inode))); + } + + if (secctx && secctxlen) { + inode_lock(inode); + rc = security_inode_notifysecctx(inode, secctx, + secctxlen); + inode_unlock(inode); + if (rc) + CWARN("cannot set security context for " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } } /* Only hash *de if it is unhashed (new dentry). @@ -505,9 +690,9 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, if (bits & MDS_INODELOCK_LOOKUP) d_lustre_revalidate(*de); } else if (!it_disposition(it, DISP_OPEN_CREATE)) { - /* If file created on server, don't depend on parent UPDATE - * lock to unhide it. It is left hidden and next lookup can - * find it in ll_splice_alias. + /* + * If file was created on the server, the dentry is revalidated + * in ll_create_it if the lock allows for it. */ /* Check that parent has UPDATE lock. */ struct lookup_intent parent_it = { @@ -532,11 +717,18 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, } } + if (it_disposition(it, DISP_OPEN_CREATE)) { + ll_stats_ops_tally(ll_i2sbi(parent), LPROC_LL_MKNOD, + ktime_us_delta(ktime_get(), kstart)); + } + GOTO(out, rc = 0); out: - if (rc != 0 && it->it_op & IT_OPEN) + if (rc != 0 && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); ll_open_cleanup((*de)->d_sb, request); + } return rc; } @@ -545,13 +737,16 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, struct lookup_intent *it, void **secctx, __u32 *secctxlen) { + ktime_t kstart = ktime_get(); struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct dentry *save = dentry, *retval; struct ptlrpc_request *req = NULL; struct md_op_data *op_data = NULL; - __u32 opc; - int rc; - ENTRY; + __u32 opc; + int rc; + char secctx_name[XATTR_NAME_MAX + 1]; + + ENTRY; if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) RETURN(ERR_PTR(-ENAMETOOLONG)); @@ -599,10 +794,32 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, &op_data->op_file_secctx_size); if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); - if (secctx != NULL) + if (secctx) *secctx = op_data->op_file_secctx; - if (secctxlen != NULL) + if (secctxlen) *secctxlen = op_data->op_file_secctx_size; + } else { + if (secctx) + *secctx = NULL; + if (secctxlen) + *secctxlen = 0; + } + + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN)) { + /* get name of security xattr to request to server */ + rc = ll_listsecurity(parent, secctx_name, + sizeof(secctx_name)); + if (rc < 0) { + CDEBUG(D_SEC, "cannot get security xattr name for " + DFID": rc = %d\n", + PFID(ll_inode2fid(parent)), rc); + } else if (rc > 0) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = rc; + CDEBUG(D_SEC, "'%.*s' is security xattr for "DFID"\n", + rc, secctx_name, PFID(ll_inode2fid(parent))); + } } rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, @@ -636,11 +853,15 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); - rc = ll_lookup_it_finish(req, it, parent, &dentry); - if (rc != 0) { - ll_intent_release(it); - GOTO(out, retval = ERR_PTR(rc)); - } + /* dir layout may change */ + ll_unlock_md_op_lsm(op_data); + rc = ll_lookup_it_finish(req, 
it, parent, &dentry, + secctx ? *secctx : NULL, + secctxlen ? *secctxlen : 0, kstart); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } if ((it->it_op & IT_OPEN) && dentry->d_inode && !S_ISREG(dentry->d_inode->i_mode) && @@ -653,7 +874,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, out: if (op_data != NULL && !IS_ERR(op_data)) { - if (secctx != NULL && secctxlen != NULL) { + if (secctx && secctxlen) { /* caller needs sec ctx info, so reset it in op_data to * prevent it from being freed */ op_data->op_file_secctx = NULL; @@ -978,6 +1199,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, void *secctx, __u32 secctxlen) { struct inode *inode; + __u64 bits = 0; int rc = 0; ENTRY; @@ -993,8 +1215,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) RETURN(PTR_ERR(inode)); - if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && - secctx != NULL) { + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && secctx) { inode_lock(inode); /* must be done before d_instantiate, because it calls * security_d_instantiate, which means a getxattr if security @@ -1013,6 +1234,10 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, RETURN(rc); } + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(dentry); + RETURN(0); } @@ -1152,38 +1377,38 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, { struct qstr *name = &dchild->d_name; int err; - ENTRY; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p) mode %o dev %x\n", name->len, name->name, PFID(ll_inode2fid(dir)), dir, - mode, rdev); + mode, rdev); if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) mode &= ~current_umask(); - switch (mode & S_IFMT) { - case 0: - mode |= S_IFREG; /* for mode = 0 case, fallthrough */ - /* Fall through */ - case S_IFREG: - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; + /* fallthrough */ + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), LUSTRE_OPC_MKNOD); - break; - case S_IFDIR: - err = -EPERM; - break; - default: - err = -EINVAL; - } + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); - RETURN(err); + RETURN(err); } #ifdef HAVE_IOP_ATOMIC_OPEN diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.c b/drivers/staging/lustrefsx/lustre/llite/range_lock.c index 56e129165c4be..7a4c9c4cb766a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/range_lock.c +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.c @@ -33,8 +33,11 @@ * Author: Prakash Surya * Author: Bobi Jam */ +#ifdef HAVE_SCHED_HEADERS +#include +#endif #include "range_lock.h" -#include +#include /** * Initialize a range lock tree diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c b/drivers/staging/lustrefsx/lustre/llite/rw.c index a00ccef398702..a5f3f9c187d57 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. 
+ * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -369,7 +369,7 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, io->ci_obj, ra.cra_end, page_idx); /* update read ahead RPC size. * NB: it's racy but doesn't matter */ - if (ras->ras_rpc_size > ra.cra_rpc_size && + if (ras->ras_rpc_size != ra.cra_rpc_size && ra.cra_rpc_size > 0) ras->ras_rpc_size = ra.cra_rpc_size; /* trim it to align with optimal RPC size */ @@ -714,7 +714,10 @@ static void ras_increase_window(struct inode *inode, wlen = min(ras->ras_window_len + ras->ras_rpc_size, ra->ra_max_pages_per_file); - ras->ras_window_len = ras_align(ras, wlen, NULL); + if (wlen < ras->ras_rpc_size) + ras->ras_window_len = wlen; + else + ras->ras_window_len = ras_align(ras, wlen, NULL); } } @@ -1074,7 +1077,7 @@ void ll_cl_remove(struct file *file, const struct lu_env *env) write_unlock(&fd->fd_lock); } -static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, struct cl_page *page, struct file *file) { struct inode *inode = vvp_object_inode(page->cp_obj); @@ -1082,6 +1085,7 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_readahead_state *ras = &fd->fd_ras; struct cl_2queue *queue = &io->ci_queue; + struct cl_sync_io *anchor = NULL; struct vvp_page *vpg; int rc = 0; bool uptodate; @@ -1109,6 +1113,10 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, cl_page_export(env, page, 1); cl_page_disown(env, io, page); } else { + anchor = &vvp_env_info(env)->vti_anchor; + cl_sync_io_init(anchor, 1, &cl_sync_io_end); + page->cp_sync_io = anchor; + cl_2queue_add(queue, page); } @@ -1129,10 +1137,30 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, task_io_account_read(PAGE_SIZE * count); } - /* - * Unlock unsent pages in case of error. - */ + + if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */ + rc = cl_sync_io_wait(env, anchor, 0); + + cl_page_assume(env, io, page); + cl_page_list_del(env, &queue->c2_qout, page); + + if (!PageUptodate(cl_page_vmpage(page))) { + /* Failed to read a mirror, discard this page so that + * new page can be created with new mirror. + * + * TODO: this is not needed after page reinit + * route is implemented */ + cl_page_discard(env, io, page); + } + cl_page_disown(env, io, page); + } + + /* TODO: discard all pages until page reinit route is implemented */ + cl_page_list_discard(env, io, &queue->c2_qin); + + /* Unlock unsent read pages in case of error. 
*/ cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); RETURN(rc); @@ -1143,24 +1171,25 @@ int ll_readpage(struct file *file, struct page *vmpage) struct inode *inode = file_inode(file); struct cl_object *clob = ll_i2info(inode)->lli_clob; struct ll_cl_context *lcc; - const struct lu_env *env; - struct cl_io *io; + const struct lu_env *env = NULL; + struct cl_io *io = NULL; struct cl_page *page; int result; ENTRY; lcc = ll_cl_find(file); - if (lcc == NULL) { - unlock_page(vmpage); - RETURN(-EIO); + if (lcc != NULL) { + env = lcc->lcc_env; + io = lcc->lcc_io; } - env = lcc->lcc_env; - io = lcc->lcc_io; if (io == NULL) { /* fast read */ struct inode *inode = file_inode(file); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_readahead_state *ras = &fd->fd_ras; + struct lu_env *local_env = NULL; + unsigned long fast_read_pages = + max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_size); struct vvp_page *vpg; result = -ENODATA; @@ -1173,11 +1202,16 @@ int ll_readpage(struct file *file, struct page *vmpage) RETURN(result); } + if (!env) { + local_env = cl_env_percpu_get(); + env = local_env; + } + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); if (vpg->vpg_defer_uptodate) { enum ras_update_flags flags = LL_RAS_HIT; - if (lcc->lcc_type == LCC_MMAP) + if (lcc && lcc->lcc_type == LCC_MMAP) flags |= LL_RAS_MMAP; /* For fast read, it updates read ahead state only @@ -1192,7 +1226,7 @@ int ll_readpage(struct file *file, struct page *vmpage) * the case, we can't do fast IO because we will need * a cl_io to issue the RPC. */ if (ras->ras_window_start + ras->ras_window_len < - ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES) { + ras->ras_next_readahead + fast_read_pages) { /* export the page and skip io stack */ vpg->vpg_ra_used = 1; cl_page_export(env, page, 1); @@ -1200,8 +1234,14 @@ int ll_readpage(struct file *file, struct page *vmpage) } } - unlock_page(vmpage); + /* release page refcount before unlocking the page to ensure + * the object won't be destroyed in the calling path of + * cl_page_put(). Please see comment in ll_releasepage(). */ cl_page_put(env, page); + unlock_page(vmpage); + if (local_env) + cl_env_percpu_put(local_env); + RETURN(result); } @@ -1211,6 +1251,7 @@ int ll_readpage(struct file *file, struct page *vmpage) LASSERT(page->cp_type == CPT_CACHEABLE); if (likely(!PageUptodate(vmpage))) { cl_page_assume(env, io, page); + result = ll_io_read_page(env, io, page, file); } else { /* Page from a non-object file. */ @@ -1224,28 +1265,3 @@ int ll_readpage(struct file *file, struct page *vmpage) } RETURN(result); } - -int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, enum cl_req_type crt) -{ - struct cl_2queue *queue; - int result; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - queue = &io->ci_queue; - cl_2queue_init_page(queue, page); - - result = cl_io_submit_sync(env, io, crt, queue, 0); - LASSERT(cl_page_is_owned(page, io)); - - if (crt == CRT_READ) - /* - * in CRT_WRITE case page is left locked even in case of - * error. - */ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); - - return result; -} diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c index 9cba2d0b5e8e3..9a1f0b6021baf 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw26.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
* Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -577,45 +577,83 @@ ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, /** * Prepare partially written-to page for a write. + * @pg is owned when passed in and disowned when it returns non-zero result to + * the caller. */ static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg) + struct cl_page *pg, struct file *file) { struct cl_attr *attr = vvp_env_thread_attr(env); struct cl_object *obj = io->ci_obj; struct vvp_page *vpg = cl_object_page_slice(obj, pg); loff_t offset = cl_offset(obj, vvp_index(vpg)); int result; + ENTRY; cl_object_attr_lock(obj); result = cl_object_attr_get(env, obj, attr); cl_object_attr_unlock(obj); - if (result == 0) { - /* - * If are writing to a new page, no need to read old data. - * The extent locking will have updated the KMS, and for our - * purposes here we can treat it like i_size. - */ - if (attr->cat_kms <= offset) { - char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); + if (result) { + cl_page_disown(env, io, pg); + GOTO(out, result); + } - memset(kaddr, 0, cl_page_size(obj)); - ll_kunmap_atomic(kaddr, KM_USER0); - } else if (vpg->vpg_defer_uptodate) - vpg->vpg_ra_used = 1; - else - result = ll_page_sync_io(env, io, pg, CRT_READ); + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); + + memset(kaddr, 0, cl_page_size(obj)); + ll_kunmap_atomic(kaddr, KM_USER0); + GOTO(out, result = 0); + } + + if (vpg->vpg_defer_uptodate) { + vpg->vpg_ra_used = 1; + GOTO(out, result = 0); } + + result = ll_io_read_page(env, io, pg, file); + if (result) + GOTO(out, result); + + /* ll_io_read_page() disowns the page */ + result = cl_page_own(env, io, pg); + if (!result) { + if (!PageUptodate(cl_page_vmpage(pg))) { + cl_page_disown(env, io, pg); + result = -EIO; + } + } else if (result == -ENOENT) { + /* page was truncated */ + result = -EAGAIN; + } + EXIT; + +out: return result; } +static int ll_tiny_write_begin(struct page *vmpage) +{ + /* Page must be present, up to date, dirty, and not in writeback. 
*/ + if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || + PageWriteback(vmpage)) + return -ENODATA; + + return 0; +} + static int ll_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct ll_cl_context *lcc; + struct ll_cl_context *lcc = NULL; const struct lu_env *env = NULL; - struct cl_io *io; + struct cl_io *io = NULL; struct cl_page *page = NULL; struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; @@ -626,17 +664,27 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, int result = 0; ENTRY; - CDEBUG(D_PAGE, "Writing %lu of %d to %d bytes\n", index, from, len); + CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); lcc = ll_cl_find(file); if (lcc == NULL) { - io = NULL; - GOTO(out, result = -EIO); + vmpage = grab_cache_page_nowait(mapping, index); + result = ll_tiny_write_begin(vmpage); + GOTO(out, result); } env = lcc->lcc_env; io = lcc->lcc_io; + if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) { + /* direct IO failed because it couldn't clean up cached pages, + * this causes a problem for mirror write because the cached + * page may belong to another mirror, which will result in + * problem submitting the I/O. */ + GOTO(out, result = -EBUSY); + } + +again: /* To avoid deadlock, try to lock page first. */ vmpage = grab_cache_page_nowait(mapping, index); @@ -689,13 +737,18 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, /* TODO: can be optimized at OSC layer to check if it * is a lockless IO. In that case, it's not necessary * to read the data. */ - result = ll_prepare_partial_page(env, io, page); - if (result == 0) - SetPageUptodate(vmpage); + result = ll_prepare_partial_page(env, io, page, file); + if (result) { + /* vmpage should have been unlocked */ + put_page(vmpage); + vmpage = NULL; + + if (result == -EAGAIN) + goto again; + GOTO(out, result); + } } } - if (result < 0) - cl_page_unassume(env, io, page); EXIT; out: if (result < 0) { @@ -703,6 +756,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, unlock_page(vmpage); put_page(vmpage); } + /* On tiny_write failure, page and io are always null. */ if (!IS_ERR_OR_NULL(page)) { lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); @@ -716,6 +770,47 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, RETURN(result); } +static int ll_tiny_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *vmpage) +{ + struct cl_page *clpage = (struct cl_page *) vmpage->private; + loff_t kms = pos+copied; + loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; + __u16 refcheck; + struct lu_env *env = cl_env_get(&refcheck); + int rc = 0; + + ENTRY; + + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + /* This page is dirty in cache, so it should have a cl_page pointer + * set in vmpage->private. + */ + LASSERT(clpage != NULL); + + if (copied == 0) + goto out_env; + + /* Update the underlying size information in the OSC/LOV objects this + * page is part of. + */ + cl_page_touch(env, clpage, to); + +out_env: + cl_env_put(env, &refcheck); + +out: + /* Must return page unlocked. 
*/ + unlock_page(vmpage); + + RETURN(rc); +} + static int ll_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *vmpage, void *fsdata) @@ -732,6 +827,14 @@ static int ll_write_end(struct file *file, struct address_space *mapping, put_page(vmpage); + CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); + + if (lcc == NULL) { + result = ll_tiny_write_end(file, mapping, pos, len, copied, + vmpage); + GOTO(out, result); + } + LASSERT(lcc != NULL); env = lcc->lcc_env; page = lcc->lcc_page; @@ -761,7 +864,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping, if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) unplug = true; - CL_PAGE_DEBUG(D_PAGE, env, page, + CL_PAGE_DEBUG(D_VFSTRACE, env, page, "queued page: %d.\n", plist->pl_nr); } else { cl_page_disown(env, io, page); @@ -773,11 +876,14 @@ static int ll_write_end(struct file *file, struct address_space *mapping, /* page list is not contiguous now, commit it now */ unplug = true; } - if (unplug || io->u.ci_rw.rw_sync) + if (unplug || io->u.ci_wr.wr_sync) result = vvp_io_write_commit(env, io); if (result < 0) io->ci_result = result; + + +out: RETURN(result >= 0 ? copied : result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c index 5b2af025d28f9..397712909b3f4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/statahead.c +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -330,6 +330,58 @@ __sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) return (index == sai->sai_index_wait); } +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo) +{ + ll_unlock_md_op_lsm(&minfo->mi_data); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc); + +/* + * prepare arguments for async stat RPC. + */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (minfo == NULL) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, + entry->se_qstr.name, entry->se_qstr.len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (child == NULL) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + return minfo; +} + /* * release resources used in async stat RPC, update entry state and wakeup if * scanner process it waiting on this entry. 
@@ -346,8 +398,7 @@ sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) if (minfo) { entry->se_minfo = NULL; ll_intent_release(&minfo->mi_it); - iput(minfo->mi_dir); - OBD_FREE_PTR(minfo); + sa_fini_data(minfo); } if (req) { @@ -493,10 +544,11 @@ static void ll_sai_put(struct ll_statahead_info *sai) static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) { struct ll_inode_info *lli = ll_i2info(inode); - __u64 index = lli->lli_agl_index; + u64 index = lli->lli_agl_index; + ktime_t expire; int rc; - ENTRY; + ENTRY; LASSERT(list_empty(&lli->lli_agl_list)); /* AGL maybe fall behind statahead with one entry */ @@ -539,8 +591,9 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) * relative rare. AGL can ignore such case, and it will not muchly * affect the performance. */ - if (lli->lli_glimpse_time != 0 && - cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { + expire = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); + if (ktime_to_ns(lli->lli_glimpse_time) && + ktime_before(expire, lli->lli_glimpse_time)) { up_write(&lli->lli_glimpse_sem); lli->lli_agl_index = 0; iput(inode); @@ -552,7 +605,7 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) cl_agl(inode); lli->lli_agl_index = 0; - lli->lli_glimpse_time = cfs_time_current(); + lli->lli_glimpse_time = ktime_get(); up_write(&lli->lli_glimpse_sem); CDEBUG(D_READA, "Handled (init) async glimpse: inode= " @@ -580,14 +633,14 @@ static void sa_instantiate(struct ll_statahead_info *sai, int rc = 0; ENTRY; - LASSERT(entry->se_handle != 0); + LASSERT(entry->se_handle != 0); - minfo = entry->se_minfo; - it = &minfo->mi_it; - req = entry->se_req; - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EFAULT); + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); child = entry->se_inode; if (child != NULL) { @@ -602,25 +655,25 @@ static void sa_instantiate(struct ll_statahead_info *sai, it->it_lock_handle = entry->se_handle; rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); - if (rc != 1) - GOTO(out, rc = -EAGAIN); + if (rc != 1) + GOTO(out, rc = -EAGAIN); - rc = ll_prep_inode(&child, req, dir->i_sb, it); - if (rc) - GOTO(out, rc); + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + GOTO(out, rc); CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", ll_get_fsname(child->i_sb, NULL, 0), entry->se_qstr.len, entry->se_qstr.name, PFID(ll_inode2fid(child)), child); - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); - entry->se_inode = child; + entry->se_inode = child; - if (agl_should_run(sai, child)) - ll_agl_add(sai, child, entry->se_index); + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); - EXIT; + EXIT; out: /* sa_make_ready() will drop ldlm ibits lock refcount by calling @@ -684,8 +737,7 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, if (rc != 0) { ll_intent_release(it); - iput(dir); - OBD_FREE_PTR(minfo); + sa_fini_data(minfo); } else { /* release ibits lock ASAP to avoid deadlock when statahead * thread enqueues lock on parent in readdir and another @@ -693,6 +745,7 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, * unlink. 
*/ handle = it->it_lock_handle; ll_intent_drop_lock(it); + ll_unlock_md_op_lsm(&minfo->mi_data); } spin_lock(&lli->lli_sa_lock); @@ -722,53 +775,6 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, RETURN(rc); } -/* finish async stat RPC arguments */ -static void sa_fini_data(struct md_enqueue_info *minfo) -{ - iput(minfo->mi_dir); - OBD_FREE_PTR(minfo); -} - -/* - * prepare arguments for async stat RPC. - */ -static struct md_enqueue_info * -sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - struct ldlm_enqueue_info *einfo; - struct md_op_data *op_data; - - OBD_ALLOC_PTR(minfo); - if (minfo == NULL) - return ERR_PTR(-ENOMEM); - - op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - OBD_FREE_PTR(minfo); - return (struct md_enqueue_info *)op_data; - } - - if (child == NULL) - op_data->op_fid2 = entry->se_fid; - - minfo->mi_it.it_op = IT_GETATTR; - minfo->mi_dir = igrab(dir); - minfo->mi_cb = ll_statahead_interpret; - minfo->mi_cbdata = entry; - - einfo = &minfo->mi_einfo; - einfo->ei_type = LDLM_IBITS; - einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); - einfo->ei_cb_bl = ll_md_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = NULL; - einfo->ei_cbdata = NULL; - - return minfo; -} - /* async stat for file not found in dcache */ static int sa_lookup(struct inode *dir, struct sa_entry *entry) { @@ -810,22 +816,20 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry, if (d_mountpoint(dentry)) RETURN(1); + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + entry->se_inode = igrab(inode); rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), NULL); if (rc == 1) { entry->se_handle = it.it_lock_handle; ll_intent_release(&it); + sa_fini_data(minfo); RETURN(1); } - minfo = sa_prep_data(dir, inode, entry); - if (IS_ERR(minfo)) { - entry->se_inode = NULL; - iput(inode); - RETURN(PTR_ERR(minfo)); - } - rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); if (rc < 0) { entry->se_inode = NULL; @@ -922,6 +926,7 @@ static int ll_agl_thread(void *arg) list_del_init(&clli->lli_agl_list); spin_unlock(&plli->lli_agl_lock); ll_agl_trigger(&clli->lli_vfs_inode, sai); + cond_resched(); } else { spin_unlock(&plli->lli_agl_lock); } @@ -999,8 +1004,7 @@ static int ll_statahead_thread(void *arg) CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", sai, parent->d_name.len, parent->d_name.name); - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); + OBD_ALLOC_PTR(op_data); if (IS_ERR(op_data)) GOTO(out, rc = PTR_ERR(op_data)); @@ -1022,8 +1026,16 @@ static int ll_statahead_thread(void *arg) struct lu_dirpage *dp; struct lu_dirent *ent; + op_data = ll_prep_md_op_data(op_data, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + break; + } + sai->sai_in_readpage = 1; page = ll_get_dir_page(dir, op_data, pos, &chain); + ll_unlock_md_op_lsm(op_data); sai->sai_in_readpage = 0; if (IS_ERR(page)) { rc = PTR_ERR(page); @@ -1109,7 +1121,7 @@ static int ll_statahead_thread(void *arg) ll_agl_trigger(&clli->lli_vfs_inode, sai); - + cond_resched(); spin_lock(&lli->lli_agl_lock); } spin_unlock(&lli->lli_agl_lock); @@ -1598,7 +1610,6 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry) spin_lock(&lli->lli_sa_lock); lli->lli_sai = NULL; spin_unlock(&lli->lli_sa_lock); - 
atomic_dec(&ll_i2sbi(parent->d_inode)->ll_sa_running); rc = PTR_ERR(task); CERROR("can't start ll_sa thread, rc: %d\n", rc); GOTO(out, rc); diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c index 7118cce98561b..84e5de9ea8782 100644 --- a/drivers/staging/lustrefsx/lustre/llite/super25.c +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -95,12 +95,8 @@ struct super_operations lustre_super_operations = .show_options = ll_show_options, }; - -void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); - static int __init lustre_init(void) { - struct proc_dir_entry *entry; struct lnet_process_id lnet_id; struct timespec64 ts; int i, rc, seed[2]; @@ -132,15 +128,9 @@ static int __init lustre_init(void) if (ll_file_data_slab == NULL) GOTO(out_cache, rc = -ENOMEM); - entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n", - rc); + rc = llite_tunables_register(); + if (rc) GOTO(out_cache, rc); - } - - proc_lustre_fs_root = entry; cfs_get_random_bytes(seed, sizeof(seed)); @@ -150,7 +140,7 @@ static int __init lustre_init(void) if (LNetGetId(i, &lnet_id) == -ENOENT) break; - if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) + if (lnet_id.nid != LNET_NID_LO_0) seed[0] ^= LNET_NIDADDR(lnet_id.nid); } @@ -159,7 +149,7 @@ static int __init lustre_init(void) rc = vvp_global_init(); if (rc != 0) - GOTO(out_proc, rc); + GOTO(out_tunables, rc); cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, LCT_REMEMBER | LCT_NOREF); @@ -174,7 +164,6 @@ static int __init lustre_init(void) lustre_register_client_fill_super(ll_fill_super); lustre_register_kill_super_cb(ll_kill_super); - lustre_register_client_process_config(ll_process_config); RETURN(0); @@ -182,15 +171,11 @@ static int __init lustre_init(void) cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); out_vvp: vvp_global_fini(); -out_proc: - lprocfs_remove(&proc_lustre_fs_root); +out_tunables: + llite_tunables_unregister(); out_cache: - if (ll_inode_cachep != NULL) - kmem_cache_destroy(ll_inode_cachep); - - if (ll_file_data_slab != NULL) - kmem_cache_destroy(ll_file_data_slab); - + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); return rc; } @@ -198,14 +183,20 @@ static void __exit lustre_exit(void) { lustre_register_client_fill_super(NULL); lustre_register_kill_super_cb(NULL); - lustre_register_client_process_config(NULL); - lprocfs_remove(&proc_lustre_fs_root); + llite_tunables_unregister(); ll_xattr_fini(); cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); vvp_global_fini(); +#ifdef HAVE_INODE_I_RCU + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. 
+ */ + rcu_barrier(); +#endif kmem_cache_destroy(ll_inode_cachep); kmem_cache_destroy(ll_file_data_slab); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c index 2f640635afea2..d36aed3919268 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -53,7 +53,6 @@ */ static struct kmem_cache *ll_thread_kmem; -struct kmem_cache *vvp_lock_kmem; struct kmem_cache *vvp_object_kmem; static struct kmem_cache *vvp_session_kmem; static struct kmem_cache *vvp_thread_kmem; @@ -64,11 +63,6 @@ static struct lu_kmem_descr vvp_caches[] = { .ckd_name = "ll_thread_kmem", .ckd_size = sizeof(struct ll_thread_info), }, - { - .ckd_cache = &vvp_lock_kmem, - .ckd_name = "vvp_lock_kmem", - .ckd_size = sizeof(struct vvp_lock), - }, { .ckd_cache = &vvp_object_kmem, .ckd_name = "vvp_object_kmem", @@ -361,26 +355,10 @@ int cl_sb_fini(struct super_block *sb) /**************************************************************************** * - * /proc/fs/lustre/llite/$MNT/dump_page_cache + * debugfs/lustre/llite/$MNT/dump_page_cache * ****************************************************************************/ -/* - * To represent contents of a page cache as a byte stream, following - * information if encoded in 64bit offset: - * - * - file hash bucket in lu_site::ls_hash[] 28bits - * - * - how far file is from bucket head 4bits - * - * - page index 32bits - * - * First two data identify a file in the cache uniquely. - */ - -#define PGC_OBJ_SHIFT (32 + 4) -#define PGC_DEPTH_SHIFT (32) - struct vvp_pgcache_id { unsigned vpi_bucket; unsigned vpi_depth; @@ -390,22 +368,18 @@ struct vvp_pgcache_id { struct lu_object_header *vpi_obj; }; -static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) -{ - CLASSERT(sizeof(pos) == sizeof(__u64)); - - id->vpi_index = pos & 0xffffffff; - id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; - id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT); -} - -static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) -{ - return - ((__u64)id->vpi_index) | - ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | - ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); -} +struct vvp_seq_private { + struct ll_sb_info *vsp_sbi; + struct lu_env *vsp_env; + u16 vsp_refcheck; + struct cl_object *vsp_clob; + struct vvp_pgcache_id vvp_id; + /* + * prev_pos is the 'pos' of the last object returned + * by ->start of ->next. 
+ */ + loff_t vvp_prev_pos; +}; static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *data) @@ -413,12 +387,12 @@ static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct vvp_pgcache_id *id = data; struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + if (lu_object_is_dying(hdr)) + return 0; + if (id->vpi_curdep-- > 0) return 0; /* continue */ - if (lu_object_is_dying(hdr)) - return 1; - cfs_hash_get(hs, hnode); id->vpi_obj = hdr; return 1; @@ -430,8 +404,7 @@ static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, { LASSERT(lu_device_is_cl(dev)); - id->vpi_depth &= 0xf; - id->vpi_obj = NULL; + id->vpi_obj = NULL; id->vpi_curdep = id->vpi_depth; cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, @@ -445,52 +418,42 @@ static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, return lu2cl(lu_obj); } lu_object_put(env, lu_object_top(id->vpi_obj)); - - } else if (id->vpi_curdep > 0) { - id->vpi_depth = 0xf; } return NULL; } -static loff_t vvp_pgcache_find(const struct lu_env *env, - struct lu_device *dev, loff_t pos) +static struct page *vvp_pgcache_current(struct vvp_seq_private *priv) { - struct cl_object *clob; - struct lu_site *site; - struct vvp_pgcache_id id; - - site = dev->ld_site; - vvp_pgcache_id_unpack(pos, &id); + struct lu_device *dev = &priv->vsp_sbi->ll_cl->cd_lu_dev; while (1) { - if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) - return ~0ULL; - clob = vvp_pgcache_obj(env, dev, &id); - if (clob != NULL) { - struct inode *inode = vvp_object_inode(clob); - struct page *vmpage; - int nr; - - nr = find_get_pages_contig(inode->i_mapping, - id.vpi_index, 1, &vmpage); - if (nr > 0) { - id.vpi_index = vmpage->index; - /* Cant support over 16T file */ - nr = !(vmpage->index > 0xffffffff); - put_page(vmpage); - } - - lu_object_ref_del(&clob->co_lu, "dump", current); - cl_object_put(env, clob); - if (nr > 0) - return vvp_pgcache_id_pack(&id); + struct inode *inode; + struct page *vmpage; + int nr; + + if (!priv->vsp_clob) { + struct cl_object *clob; + + while ((clob = vvp_pgcache_obj(priv->vsp_env, dev, &priv->vvp_id)) == NULL && + ++(priv->vvp_id.vpi_bucket) < CFS_HASH_NHLIST(dev->ld_site->ls_obj_hash)) + priv->vvp_id.vpi_depth = 0; + if (!clob) + return NULL; + priv->vsp_clob = clob; + priv->vvp_id.vpi_index = 0; + } + + inode = vvp_object_inode(priv->vsp_clob); + nr = find_get_pages_contig(inode->i_mapping, priv->vvp_id.vpi_index, 1, &vmpage); + if (nr > 0) { + priv->vvp_id.vpi_index = vmpage->index; + return vmpage; } - /* to the next object. 
*/ - ++id.vpi_depth; - id.vpi_depth &= 0xf; - if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) - return ~0ULL; - id.vpi_index = 0; + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + priv->vsp_clob = NULL; + priv->vvp_id.vpi_index = 0; + priv->vvp_id.vpi_depth++; } } @@ -532,92 +495,72 @@ static void vvp_pgcache_page_show(const struct lu_env *env, static int vvp_pgcache_show(struct seq_file *f, void *v) { - loff_t pos; - struct ll_sb_info *sbi; - struct cl_object *clob; - struct lu_env *env; - struct vvp_pgcache_id id; - __u16 refcheck; - int result; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - pos = *(loff_t *) v; - vvp_pgcache_id_unpack(pos, &id); - sbi = f->private; - clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); - if (clob != NULL) { - struct inode *inode = vvp_object_inode(clob); - struct cl_page *page = NULL; - struct page *vmpage; - - result = find_get_pages_contig(inode->i_mapping, - id.vpi_index, 1, &vmpage); - if (result > 0) { - lock_page(vmpage); - page = cl_vmpage_page(vmpage, clob); - unlock_page(vmpage); - - put_page(vmpage); - } - - seq_printf(f, "%8x@"DFID": ", id.vpi_index, - PFID(lu_object_fid(&clob->co_lu))); - if (page != NULL) { - vvp_pgcache_page_show(env, f, page); - cl_page_put(env, page); - } else - seq_puts(f, "missing\n"); - lu_object_ref_del(&clob->co_lu, "dump", current); - cl_object_put(env, clob); - } else - seq_printf(f, "%llx missing\n", pos); - cl_env_put(env, &refcheck); - result = 0; - } else - result = PTR_ERR(env); - return result; + struct vvp_seq_private *priv = f->private; + struct page *vmpage = v; + struct cl_page *page; + + seq_printf(f, "%8lx@" DFID ": ", vmpage->index, + PFID(lu_object_fid(&priv->vsp_clob->co_lu))); + lock_page(vmpage); + page = cl_vmpage_page(vmpage, priv->vsp_clob); + unlock_page(vmpage); + put_page(vmpage); + + if (page) { + vvp_pgcache_page_show(priv->vsp_env, f, page); + cl_page_put(priv->vsp_env, page); + } else { + seq_puts(f, "missing\n"); + } + + return 0; } -static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +static void vvp_pgcache_rewind(struct vvp_seq_private *priv) { - struct ll_sb_info *sbi; - struct lu_env *env; - __u16 refcheck; + if (priv->vvp_prev_pos) { + memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); + priv->vvp_prev_pos = 0; + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", + current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + priv->vsp_clob = NULL; + } +} - sbi = f->private; +static struct page *vvp_pgcache_next_page(struct vvp_seq_private *priv) +{ + priv->vvp_id.vpi_index += 1; + return vvp_pgcache_current(priv); +} - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - sbi = f->private; - if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) - pos = ERR_PTR(-EFBIG); - else { - *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, - *pos); - if (*pos == ~0ULL) - pos = NULL; - } - cl_env_put(env, &refcheck); - } - return pos; +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + if (*pos == 0) { + vvp_pgcache_rewind(priv); + } else if (*pos == priv->vvp_prev_pos) { + /* Return the current item */; + } else { + WARN_ON(*pos != priv->vvp_prev_pos + 1); + priv->vvp_id.vpi_index += 1; + } + + priv->vvp_prev_pos = *pos; + return vvp_pgcache_current(priv); } static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) { - struct ll_sb_info *sbi; - struct lu_env *env; - __u16 refcheck; + struct 
vvp_seq_private *priv = f->private; - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - sbi = f->private; - *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); - if (*pos == ~0ULL) - pos = NULL; - cl_env_put(env, &refcheck); - } - return pos; + WARN_ON(*pos != priv->vvp_prev_pos); + *pos += 1; + priv->vvp_prev_pos = *pos; + return vvp_pgcache_next_page(priv); } static void vvp_pgcache_stop(struct seq_file *f, void *v) @@ -634,22 +577,44 @@ static struct seq_operations vvp_pgcache_ops = { static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) { - struct ll_sb_info *sbi = PDE_DATA(inode); - struct seq_file *seq; - int result; - - result = seq_open(filp, &vvp_pgcache_ops); - if (result == 0) { - seq = filp->private_data; - seq->private = sbi; + struct vvp_seq_private *priv; + + priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->vsp_sbi = inode->i_private; + priv->vsp_env = cl_env_get(&priv->vsp_refcheck); + priv->vsp_clob = NULL; + memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); + if (IS_ERR(priv->vsp_env)) { + int err = PTR_ERR(priv->vsp_env); + + seq_release_private(inode, filp); + return err; } - return result; + + return 0; +} + +static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct vvp_seq_private *priv = seq->private; + + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + + cl_env_put(priv->vsp_env, &priv->vsp_refcheck); + return seq_release_private(inode, file); } -const struct proc_ops vvp_dump_pgcache_file_ops = { - PROC_OWNER(THIS_MODULE) - .proc_open = vvp_dump_pgcache_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vvp_dump_pgcache_seq_release, }; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h index 9973d646ae703..0fb9b51a8f618 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,6 @@ #ifndef VVP_INTERNAL_H #define VVP_INTERNAL_H -#include #include enum obd_notify_event; @@ -61,7 +60,13 @@ struct vvp_io { /** super class */ struct cl_io_slice vui_cl; struct cl_io_lock_link vui_link; - /** Total size for the left IO. */ + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *vui_iter; + /** + * Total size for the left IO. + */ size_t vui_tot_count; union { @@ -88,6 +93,7 @@ struct vvp_io { * check that flags are from filemap_fault */ bool ft_flags_valid; + struct cl_page_list ft_queue; } fault; struct { struct pipe_inode_info *vui_pipe; @@ -111,6 +117,7 @@ struct vvp_io { * File descriptor against which IO is done. */ struct ll_file_data *vui_fd; + struct kiocb *vui_iocb; /* Readahead state. 
*/ pgoff_t vui_ra_start; @@ -124,7 +131,6 @@ extern struct lu_device_type vvp_device_type; extern struct lu_context_key vvp_session_key; extern struct lu_context_key vvp_thread_key; -extern struct kmem_cache *vvp_lock_kmem; extern struct kmem_cache *vvp_object_kmem; struct vvp_thread_info { @@ -132,6 +138,7 @@ struct vvp_thread_info { struct cl_lock_descr vti_descr; struct cl_io vti_io; struct cl_attr vti_attr; + struct cl_sync_io vti_anchor; }; static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) @@ -251,10 +258,6 @@ struct vvp_device { struct cl_device *vdv_next; }; -struct vvp_lock { - struct cl_lock_slice vlk_cl; -}; - static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) { return &vdv->vdv_cl.cd_lu_dev; @@ -293,11 +296,6 @@ static inline struct page *cl2vm_page(const struct cl_page_slice *slice) return cl2vvp_page(slice)->vpg_page; } -static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice) -{ - return container_of(slice, struct vvp_lock, vlk_cl); -} - #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK # define CLOBINVRNT(env, clob, expr) \ do { \ @@ -317,8 +315,6 @@ int lov_read_and_clear_async_rc(struct cl_object *clob); int vvp_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io); int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); int vvp_page_init(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index); struct lu_object *vvp_object_alloc(const struct lu_env *env, @@ -328,6 +324,6 @@ struct lu_object *vvp_object_alloc(const struct lu_env *env, int vvp_global_init(void); void vvp_global_fini(void); -extern const struct proc_ops vvp_dump_pgcache_file_ops; +extern const struct file_operations vvp_dump_pgcache_file_ops; #endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index 1bcadeb7cf0da..6d8070c5b8bfd 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -299,12 +299,14 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) struct cl_object *obj = io->ci_obj; struct vvp_io *vio = cl2vvp_io(env, ios); struct inode *inode = vvp_object_inode(obj); + __u32 gen = 0; int rc; + ENTRY; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " - "need write layout %d, restore needed %d\n", + "need write layout %d, restore needed %d\n", PFID(lu_object_fid(&obj->co_lu)), io->ci_ignore_layout, io->ci_verify_layout, vio->vui_layout_gen, io->ci_need_write_intent, @@ -321,18 +323,40 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) * block on layout lock held by the MDT * as MDT will not send new layout in lvb (see LU-3124) * we have to explicitly fetch it, all this will be done - * by ll_layout_refresh() + * by ll_layout_refresh(). + * Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. 
Therefore it sets + * ci_verify_layout so that it will check layout at the end + * of this function. */ - if (rc == 0) { - io->ci_restore_needed = 0; - io->ci_need_restart = 1; - io->ci_verify_layout = 1; - } else { + if (rc) { io->ci_restore_needed = 1; io->ci_need_restart = 0; io->ci_verify_layout = 0; io->ci_result = rc; + GOTO(out, rc); + } + + io->ci_restore_needed = 0; + + /* Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it should verify + * if there was layout change and restart I/O correspondingly. + */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), + LLIF_FILE_RESTORING); } + GOTO(out, 0); } /** @@ -340,47 +364,29 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) * RPC. */ if (io->ci_need_write_intent) { - loff_t start = 0; - loff_t end = OBD_OBJECT_EOF; + enum layout_intent_opc opc = LAYOUT_INTENT_WRITE; io->ci_need_write_intent = 0; LASSERT(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); - if (io->ci_type == CIT_WRITE) { - if (!cl_io_is_append(io)) { - start = io->u.ci_rw.rw_range.cir_pos; - end = start + io->u.ci_rw.rw_range.cir_count; - } - } else if (cl_io_is_trunc(io)) { - /* for writes, e_end is endpos, the location of the file - * pointer after the write is completed, so it is not accessed. - * For truncate, 'end' is the size, and *is* acccessed. - * In other words, writes are [start, end), but truncate is - * [start, size], where both are included. So add 1 to the - * size when creating the write intent to account for this. 
- */ - end = io->u.ci_setattr.sa_attr.lvb_size + 1; - } else { /* mkwrite */ - pgoff_t index = io->u.ci_fault.ft_index; + CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + PEXT(&io->ci_write_intent)); - start = cl_offset(io->ci_obj, index); - end = cl_offset(io->ci_obj, index + 1); - } + if (cl_io_is_trunc(io)) + opc = LAYOUT_INTENT_TRUNC; - CDEBUG(D_VFSTRACE, DFID" write layout, type %u [%llu, %llu)\n", - PFID(lu_object_fid(&obj->co_lu)), io->ci_type, - start, end); - rc = ll_layout_write_intent(inode, start, end); + rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent); io->ci_result = rc; if (!rc) io->ci_need_restart = 1; + GOTO(out, rc); } - if (!io->ci_ignore_layout && io->ci_verify_layout) { - __u32 gen = 0; - + if (!io->ci_need_restart && + !io->ci_ignore_layout && io->ci_verify_layout) { /* check layout version */ ll_layout_refresh(inode, &gen); io->ci_need_restart = vio->vui_layout_gen != gen; @@ -389,13 +395,11 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) DFID" layout changed from %d to %d.\n", PFID(lu_object_fid(&obj->co_lu)), vio->vui_layout_gen, gen); - /* today successful restore is the only possible - * case */ - /* restore was done, clear restoring state */ - ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), - LLIF_FILE_RESTORING); } + GOTO(out, 0); } +out: + EXIT; } static void vvp_io_fault_fini(const struct lu_env *env, @@ -426,7 +430,8 @@ static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) return CLM_READ; } -static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) +static int vvp_mmap_locks(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) { struct vvp_thread_info *vti = vvp_env_info(env); struct mm_struct *mm = current->mm; @@ -445,14 +450,18 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) if (!cl_is_normalio(env, io)) RETURN(0); + /* nfs or loop back device write */ + if (vio->vui_iter == NULL) + RETURN(0); + /* No MM (e.g. NFS)? No vmas too. 
*/ if (mm == NULL) RETURN(0); - if (!iter_is_iovec(&io->u.ci_rw.rw_iter) && !iov_iter_is_kvec(&io->u.ci_rw.rw_iter)) + if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter)) RETURN(0); - for (i = io->u.ci_rw.rw_iter; + for (i = *vio->vui_iter; iov_iter_count(&i); iov_iter_advance(&i, iov.iov_len)) { iov = iov_iter_iovec(&i); @@ -528,37 +537,38 @@ static void vvp_io_advance(const struct lu_env *env, return; vio->vui_tot_count -= nob; - if (io->ci_pio) { - iov_iter_advance(&io->u.ci_rw.rw_iter, nob); - io->u.ci_rw.rw_iocb.ki_pos = io->u.ci_rw.rw_range.cir_pos; -#ifdef HAVE_KIOCB_KI_LEFT - io->u.ci_rw.rw_iocb.ki_left = vio->vui_tot_count; -#elif defined(HAVE_KI_NBYTES) - io->u.ci_rw.rw_iocb.ki_nbytes = vio->vui_tot_count; -#endif - } else { - /* It was truncated to stripe size in vvp_io_rw_lock() */ - iov_iter_reexpand(&io->u.ci_rw.rw_iter, vio->vui_tot_count); - } + iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!cl_is_normalio(env, io) || vio->vui_iter == NULL) + return; + + iov_iter_truncate(vio->vui_iter, size); } static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, enum cl_lock_mode mode, loff_t start, loff_t end) { + struct vvp_io *vio = vvp_env_io(env); int result; int ast_flags = 0; LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); ENTRY; - if (cl_is_normalio(env, io)) - iov_iter_truncate(&io->u.ci_rw.rw_iter, - io->u.ci_rw.rw_range.cir_count); + vvp_io_update_iov(env, vio, io); - if (io->u.ci_rw.rw_nonblock) + if (io->u.ci_rw.crw_nonblock) ast_flags |= CEF_NONBLOCK; + if (io->ci_lock_no_expand) + ast_flags |= CEF_LOCK_NO_EXPAND; - result = vvp_mmap_locks(env, io); + result = vvp_mmap_locks(env, vio, io); if (result == 0) result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); @@ -569,13 +579,13 @@ static int vvp_io_read_lock(const struct lu_env *env, const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - int rc; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; ENTRY; - rc = vvp_io_rw_lock(env, io, CLM_READ, range->cir_pos, - range->cir_pos + range->cir_count - 1); - RETURN(rc); + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + RETURN(result); } static int vvp_io_fault_lock(const struct lu_env *env, @@ -594,27 +604,26 @@ static int vvp_io_fault_lock(const struct lu_env *env, } static int vvp_io_write_lock(const struct lu_env *env, - const struct cl_io_slice *ios) + const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; loff_t start; loff_t end; - int rc; - ENTRY; - if (io->u.ci_rw.rw_append) { + if (io->u.ci_wr.wr_append) { start = 0; end = OBD_OBJECT_EOF; } else { - start = io->u.ci_rw.rw_range.cir_pos; - end = start + io->u.ci_rw.rw_range.cir_count - 1; + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; } - rc = vvp_io_rw_lock(env, io, CLM_WRITE, start, end); - RETURN(rc); + + RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); } static int vvp_io_setattr_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) + { return 0; } @@ -631,12 +640,12 @@ static int vvp_io_setattr_lock(const struct lu_env *env, __u64 new_size; __u32 enqflags = 0; - if (cl_io_is_trunc(io)) { - new_size = io->u.ci_setattr.sa_attr.lvb_size; - if (new_size == 0) - enqflags = CEF_DISCARD_DATA; - } else { - unsigned int valid = 
io->u.ci_setattr.sa_valid; + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + unsigned int valid = io->u.ci_setattr.sa_avalid; if (!(valid & TIMES_SET_FLAGS)) return 0; @@ -685,16 +694,16 @@ static int vvp_io_setattr_time(const struct lu_env *env, int result; unsigned valid = CAT_CTIME; - cl_object_attr_lock(obj); - attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; - if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { - attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; - valid |= CAT_ATIME; - } - if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { - attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; - valid |= CAT_MTIME; - } + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } result = cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); @@ -716,7 +725,7 @@ static int vvp_io_setattr_start(const struct lu_env *env, inode_lock(inode); } - if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS) + if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS) return vvp_io_setattr_time(env, ios); return 0; @@ -764,34 +773,36 @@ static int vvp_io_read_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t pos = range->cir_pos; /* for generic_file_splice_read() only */ - size_t tot = vio->vui_tot_count; - int exceed = 0; - int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = vio->vui_tot_count; + int exceed = 0; + int result; + ENTRY; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", file_dentry(file)->d_name.name, - range->cir_pos, range->cir_pos + range->cir_count); + pos, pos + cnt); if (vio->vui_io_subtype == IO_NORMAL) down_read(&lli->lli_trunc_sem); if (!can_populate_pages(env, io, inode)) - return 0; + RETURN(0); - result = vvp_prep_size(env, obj, io, range->cir_pos, tot, &exceed); + /* Unless this is reading a sparse file, otherwise the lock has already + * been acquired so vvp_prep_size() is an empty op. 
*/ + result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); if (result != 0) - return result; + RETURN(result); else if (exceed != 0) - goto out; + GOTO(out, result); LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "Read ino %lu, %lu bytes, offset %lld, size %llu\n", - inode->i_ino, range->cir_count, range->cir_pos, - i_size_read(inode)); + inode->i_ino, cnt, pos, i_size_read(inode)); /* turn off the kernel's read-ahead */ vio->vui_fd->fd_file->f_ra.ra_pages = 0; @@ -799,7 +810,7 @@ static int vvp_io_read_start(const struct lu_env *env, /* initialize read-ahead window once per syscall */ if (!vio->vui_ra_valid) { vio->vui_ra_valid = true; - vio->vui_ra_start = cl_index(obj, range->cir_pos); + vio->vui_ra_start = cl_index(obj, pos); vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); ll_ras_enter(file); } @@ -808,17 +819,12 @@ static int vvp_io_read_start(const struct lu_env *env, file_accessed(file); switch (vio->vui_io_subtype) { case IO_NORMAL: - LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, - "ki_pos %lld [%lld, %lld)\n", - io->u.ci_rw.rw_iocb.ki_pos, - range->cir_pos, range->cir_pos + range->cir_count); - result = generic_file_read_iter(&io->u.ci_rw.rw_iocb, - &io->u.ci_rw.rw_iter); + LASSERT(vio->vui_iocb->ki_pos == pos); + result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); break; case IO_SPLICE: result = generic_file_splice_read(file, &pos, - vio->u.splice.vui_pipe, - range->cir_count, + vio->u.splice.vui_pipe, cnt, vio->u.splice.vui_flags); /* LU-1109: do splice read stripe by stripe otherwise if it * may make nfsd stuck if this read occupied all internal pipe @@ -829,14 +835,13 @@ static int vvp_io_read_start(const struct lu_env *env, CERROR("Wrong IO type %u\n", vio->vui_io_subtype); LBUG(); } + GOTO(out, result); out: if (result >= 0) { - if (result < range->cir_count) + if (result < cnt) io->ci_continue = 0; io->ci_nob += result; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, vio->vui_fd, - range->cir_pos, result, READ); result = 0; } @@ -892,6 +897,7 @@ static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, SetPageUptodate(cl_page_vmpage(page)); cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -910,6 +916,7 @@ static void write_commit_callback(const struct lu_env *env, struct cl_io *io, cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); cl_page_put(env, page); } @@ -1010,6 +1017,7 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -1027,10 +1035,14 @@ static int vvp_io_write_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - bool lock_inode = !lli->lli_inode_locked && - !IS_NOSEC(inode); ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + bool lock_inode = !IS_NOSEC(inode); + size_t nob = io->ci_nob; + struct iov_iter iter; + size_t written = 0; + ENTRY; if (vio->vui_io_subtype == IO_NORMAL) @@ -1045,29 +1057,28 @@ static int vvp_io_write_start(const struct lu_env *env, * out-of-order writes. 
*/ ll_merge_attr(env, inode); - range->cir_pos = i_size_read(inode); - io->u.ci_rw.rw_iocb.ki_pos = range->cir_pos; + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + vio->vui_iocb->ki_pos = pos; } else { - LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, + LASSERTF(vio->vui_iocb->ki_pos == pos, "ki_pos %lld [%lld, %lld)\n", - io->u.ci_rw.rw_iocb.ki_pos, - range->cir_pos, range->cir_pos + range->cir_count); + vio->vui_iocb->ki_pos, + pos, pos + cnt); } CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", file_dentry(file)->d_name.name, - range->cir_pos, range->cir_pos + range->cir_count); + pos, pos + cnt); /* The maximum Lustre file size is variable, based on the OST maximum * object size and number of stripes. This needs another check in * addition to the VFS checks earlier. */ - if (range->cir_pos + range->cir_count > ll_file_maxbytes(inode)) { + if (pos + cnt > ll_file_maxbytes(inode)) { CDEBUG(D_INODE, "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", ll_get_fsname(inode->i_sb, NULL, 0), file_dentry(file)->d_name.name, - PFID(ll_inode2fid(inode)), - range->cir_pos + range->cir_count, + PFID(ll_inode2fid(inode)), pos + cnt, ll_file_maxbytes(inode)); RETURN(-EFBIG); } @@ -1079,52 +1090,85 @@ static int vvp_io_write_start(const struct lu_env *env, if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) RETURN(-EINVAL); - /* - * When using the locked AIO function (generic_file_aio_write()) - * testing has shown the inode mutex to be a limiting factor - * with multi-threaded single shared file performance. To get - * around this, we now use the lockless version. To maintain - * consistency, proper locking to protect against writes, - * trucates, etc. is handled in the higher layers of lustre. - */ - if (lock_inode) - inode_lock(inode); - result = __generic_file_write_iter(&io->u.ci_rw.rw_iocb, - &io->u.ci_rw.rw_iter); - if (lock_inode) - inode_unlock(inode); - - if (result > 0 || result == -EIOCBQUEUED) + if (vio->vui_iter == NULL) { + /* from a temp io in ll_cl_init(). */ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. 
+ */ + lock_inode = !IS_NOSEC(inode); + iter = *vio->vui_iter; + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, + vio->vui_iter); + if (unlikely(lock_inode)) + inode_unlock(inode); + + written = result; + if (result > 0 || result == -EIOCBQUEUED) #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS - result = generic_write_sync(&io->u.ci_rw.rw_iocb, result); + result = generic_write_sync(vio->vui_iocb, result); #else - { - ssize_t err; + { + ssize_t err; - err = generic_write_sync(io->u.ci_rw.rw_iocb.ki_filp, - range->cir_pos, result); - if (err < 0 && result > 0) - result = err; - } + err = generic_write_sync(vio->vui_iocb->ki_filp, pos, + result); + if (err < 0 && result > 0) + result = err; + } #endif + } if (result > 0) { result = vvp_io_write_commit(env, io); + /* Simulate short commit */ + if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) { + vio->u.write.vui_written >>= 1; + if (vio->u.write.vui_written > 0) + io->ci_need_restart = 1; + } if (vio->u.write.vui_written > 0) { result = vio->u.write.vui_written; CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", file_dentry(file)->d_name.name, io->ci_nob, result); io->ci_nob += result; + } else { + io->ci_continue = 0; } } + if (vio->vui_iocb->ki_pos != (pos + io->ci_nob - nob)) { + CDEBUG(D_VFSTRACE, "%s: write position mismatch: " + "ki_pos %lld vs. pos %lld, written %ld, commit %ld " + "rc %ld\n", + file_dentry(file)->d_name.name, + vio->vui_iocb->ki_pos, pos + io->ci_nob - nob, + written, io->ci_nob - nob, result); + /* + * Rewind ki_pos and vui_iter to where it has + * successfully committed. + */ + vio->vui_iocb->ki_pos = pos + io->ci_nob - nob; + iov_iter_advance(&iter, io->ci_nob - nob); + vio->vui_iter->iov = iter.iov; + vio->vui_iter->nr_segs = iter.nr_segs; + vio->vui_iter->iov_offset = iter.iov_offset; + vio->vui_iter->count = iter.count; + } if (result > 0) { ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); - if (result < range->cir_count) + if (result < cnt) io->ci_continue = 0; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, range->cir_pos, result, WRITE); result = 0; } @@ -1279,7 +1323,7 @@ static int vvp_io_fault_start(const struct lu_env *env, if (fio->ft_mkwrite) { wait_on_page_writeback(vmpage); if (!PageDirty(vmpage)) { - struct cl_page_list *plist = &io->ci_queue.c2_qin; + struct cl_page_list *plist = &vio->u.fault.ft_queue; struct vvp_page *vpg = cl_object_page_slice(obj, page); int to = PAGE_SIZE; @@ -1291,13 +1335,34 @@ static int vvp_io_fault_start(const struct lu_env *env, /* size fixup */ if (last_index == vvp_index(vpg)) - to = size & ~PAGE_MASK; + to = ((size - 1) & ~PAGE_MASK) + 1; /* Do not set Dirty bit here so that in case IO is * started before the page is really made dirty, we * still have chance to detect it. 
*/ result = cl_io_commit_async(env, io, plist, 0, to, mkwrite_commit_callback); + /* Have overquota flag, trying sync write to check + * whether indeed out of quota */ + if (result == -EDQUOT) { + cl_page_get(page); + result = vvp_io_commit_sync(env, io, + plist, 0, to); + if (result >= 0) { + io->ci_noquota = 1; + cl_page_own(env, io, page); + cl_page_list_add(plist, page); + lu_ref_add(&page->cp_reference, + "cl_io", io); + result = cl_io_commit_async(env, io, + plist, 0, to, + mkwrite_commit_callback); + io->ci_noquota = 0; + } else { + cl_page_put(env, page); + } + } + LASSERT(cl_page_is_owned(page, io)); cl_page_list_fini(env, plist); @@ -1312,8 +1377,9 @@ static int vvp_io_fault_start(const struct lu_env *env, if (result == -EDQUOT) result = -ENOSPC; GOTO(out, result); - } else + } else { cl_page_disown(env, io, page); + } } } @@ -1422,6 +1488,9 @@ static const struct cl_io_operations vvp_io_ops = { .cio_start = vvp_io_fsync_start, .cio_fini = vvp_io_fini }, + [CIT_GLIMPSE] = { + .cio_fini = vvp_io_fini + }, [CIT_MISC] = { .cio_fini = vvp_io_fini }, @@ -1453,13 +1522,16 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, vio->vui_ra_valid = false; result = 0; if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; struct ll_inode_info *lli = ll_i2info(inode); - vio->vui_tot_count = io->u.ci_rw.rw_range.cir_count; + count = io->u.ci_rw.crw_count; /* "If nbyte is 0, read() will return 0 and have no other * results." -- Single Unix Spec */ - if (vio->vui_tot_count == 0) + if (count == 0) result = 1; + else + vio->vui_tot_count = count; /* for read/write, we store the jobid in the inode, and * it'll be fetched by osc when building RPC. @@ -1467,7 +1539,7 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, * it's not accurate if the file is shared by different * jobs. */ - lustre_get_jobid(lli->lli_jobid); + lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid)); } else if (io->ci_type == CIT_SETATTR) { if (!cl_io_is_trunc(io)) io->ci_lockreq = CILR_MANDATORY; @@ -1490,5 +1562,6 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, PFID(lu_object_fid(&obj->co_lu)), result); } + io->ci_result = result < 0 ? result : 0; RETURN(result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c b/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c deleted file mode 100644 index 651b8e128239d..0000000000000 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for VVP layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include "vvp_internal.h" - -/***************************************************************************** - * - * Vvp lock functions. - * - */ - -static void vvp_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) -{ - struct vvp_lock *vlk = cl2vvp_lock(slice); - - OBD_SLAB_FREE_PTR(vlk, vvp_lock_kmem); -} - -static int vvp_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - CLOBINVRNT(env, slice->cls_obj, vvp_object_invariant(slice->cls_obj)); - - return 0; -} - -static const struct cl_lock_operations vvp_lock_ops = { - .clo_fini = vvp_lock_fini, - .clo_enqueue = vvp_lock_enqueue, -}; - -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *unused) -{ - struct vvp_lock *vlk; - int result; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - OBD_SLAB_ALLOC_PTR_GFP(vlk, vvp_lock_kmem, GFP_NOFS); - if (vlk != NULL) { - cl_lock_slice_add(lock, &vlk->vlk_cl, obj, &vvp_lock_ops); - result = 0; - } else { - result = -ENOMEM; - } - - return result; -} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c index fd7211f60c61f..c3bf715667577 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -169,6 +169,13 @@ static int vvp_prune(const struct lu_env *env, struct cl_object *obj) } truncate_inode_pages(inode->i_mapping, 0); + if (inode->i_mapping->nrpages) { + CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n", + PFID(lu_object_fid(&obj->co_lu)), + inode->i_mapping->nrpages); + RETURN(-EIO); + } + RETURN(0); } @@ -198,26 +205,25 @@ static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, { struct inode *inode; struct obdo *oa; - u64 valid_flags = OBD_MD_FLTYPE; + u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; oa = attr->cra_oa; inode = vvp_object_inode(obj); if (attr->cra_type == CRT_WRITE) { - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLUID | OBD_MD_FLGID; + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); } obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) oa->o_parent_oid++; - memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, LUSTRE_JOBID_SIZE); + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, + sizeof(attr->cra_jobid)); } static const struct cl_object_operations vvp_ops = { .coo_page_init = vvp_page_init, - .coo_lock_init = vvp_lock_init, .coo_io_init = vvp_io_init, .coo_attr_get = vvp_attr_get, .coo_attr_update = vvp_attr_update, diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c index 47d48639ad43c..0f4e2a9e83dac 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -54,16 +54,22 @@ * */ -static void vvp_page_fini_common(struct vvp_page *vpg) +static void vvp_page_fini_common(struct vvp_page *vpg, struct pagevec *pvec) { struct page *vmpage = vpg->vpg_page; LASSERT(vmpage != NULL); - put_page(vmpage); + if (pvec) { + if (!pagevec_add(pvec, vmpage)) + pagevec_release(pvec); + } else { + put_page(vmpage); + } } static void vvp_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) + struct cl_page_slice *slice, + struct pagevec *pvec) { struct vvp_page *vpg = cl2vvp_page(slice); struct page *vmpage = vpg->vpg_page; @@ -73,7 +79,7 @@ static void vvp_page_fini(const struct lu_env *env, * VPG_FREEING state. 
*/ LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); - vvp_page_fini_common(vpg); + vvp_page_fini_common(vpg, pvec); } static int vvp_page_own(const struct lu_env *env, @@ -144,7 +150,7 @@ static void vvp_page_discard(const struct lu_env *env, LASSERT(vmpage != NULL); LASSERT(PageLocked(vmpage)); - if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used) + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used && vmpage->mapping) ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); ll_invalidate_page(vmpage); @@ -154,14 +160,12 @@ static void vvp_page_delete(const struct lu_env *env, const struct cl_page_slice *slice) { struct page *vmpage = cl2vm_page(slice); - struct inode *inode = vmpage->mapping->host; - struct cl_object *obj = slice->cpl_obj; struct cl_page *page = slice->cpl_page; int refc; LASSERT(PageLocked(vmpage)); LASSERT((struct cl_page *)vmpage->private == page); - LASSERT(inode == vvp_object_inode(obj)); + /* Drop the reference count held in vvp_page_init */ refc = atomic_dec_return(&page->cp_ref); @@ -242,8 +246,8 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret else set_bit(AS_EIO, &inode->i_mapping->flags); - if ((ioret == -ESHUTDOWN || ioret == -EINTR) && - obj->vob_discard_page_warned == 0) { + if ((ioret == -ESHUTDOWN || ioret == -EINTR || + ioret == -EIO) && obj->vob_discard_page_warned == 0) { obj->vob_discard_page_warned = 1; ll_dirty_page_discard_warn(vmpage, ioret); } @@ -269,8 +273,14 @@ static void vvp_page_completion_read(const struct lu_env *env, if (ioret == 0) { if (!vpg->vpg_defer_uptodate) cl_page_export(env, page, 1); - } else { + } else if (vpg->vpg_defer_uptodate) { vpg->vpg_defer_uptodate = 0; + if (ioret == -EWOULDBLOCK) { + /* mirror read failed, it needs to destroy the page + * because subpage would be from wrong osc when trying + * to read from a new mirror */ + ll_invalidate_page(vmpage); + } } if (page->cp_sync_io == NULL) @@ -484,13 +494,14 @@ vvp_transient_page_completion(const struct lu_env *env, } static void vvp_transient_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) + struct cl_page_slice *slice, + struct pagevec *pvec) { struct vvp_page *vpg = cl2vvp_page(slice); struct cl_page *clp = slice->cpl_page; struct vvp_object *clobj = cl2vvp(clp->cp_obj); - vvp_page_fini_common(vpg); + vvp_page_fini_common(vpg, pvec); atomic_dec(&clobj->vob_transient_pages); } diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index 78c774ef738c4..7e6bfc0a51839 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,8 @@ #include #include -#include #include +#include #include "llite_internal.h" @@ -105,7 +105,10 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, int rc; ENTRY; - if (flags == XATTR_REPLACE) { + /* When setxattr() is called with a size of 0 the value is + * unconditionally replaced by "". When removexattr() is + * called we get a NULL value and XATTR_REPLACE for flags. 
*/ + if (!value && flags == XATTR_REPLACE) { ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); valid = OBD_MD_FLXATTRRM; } else { @@ -156,7 +159,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(-ENOMEM); rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, - pv, size, 0, flags, ll_i2suppgid(inode), &req); + pv, size, flags, ll_i2suppgid(inode), &req); kfree(fullname); if (rc) { if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { @@ -201,7 +204,7 @@ static int get_hsm_state(struct inode *inode, u32 *hus_states) return rc; } -static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t size) { struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; struct lov_user_md *v1 = lump; @@ -216,7 +219,12 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) return 0; if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + if (size < sizeof(*comp_v1)) + return -ERANGE; + entry_count = comp_v1->lcm_entry_count; + if (size < offsetof(typeof(*comp_v1), lcm_entries[entry_count])) + return -ERANGE; is_composite = true; } @@ -224,6 +232,10 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { void *ptr = comp_v1; + if (comp_v1->lcm_entries[i].lcme_offset + sizeof(*v1) > + size) + return -ERANGE; + ptr += comp_v1->lcm_entries[i].lcme_offset; v1 = (struct lov_user_md *)ptr; } @@ -271,7 +283,13 @@ static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, if (!size && lump) lump = NULL; - rc = ll_adjust_lum(inode, lump); + if (size && size < sizeof(*lump)) { + /* ll_adjust_lum() or ll_lov_user_md_size() might access + * before size - just give up now. 
+ */ + return -ERANGE; + } + rc = ll_adjust_lum(inode, lump, size); if (rc) return rc; @@ -333,6 +351,11 @@ static int ll_xattr_set(const struct xattr_handler *handler, return 0; } + if (strncmp(name, "lov.", 4) == 0 && + (__swab32(((struct lov_user_md *)value)->lmm_magic) & + le32_to_cpu(LOV_MAGIC_MASK)) == le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)value, 0); + return ll_xattr_set_common(handler, dentry, inode, name, value, size, flags); } @@ -343,7 +366,6 @@ int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; - struct mdt_body *body; void *xdata; int rc; ENTRY; @@ -370,35 +392,25 @@ int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, } } else { getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, name, NULL, 0, size, 0, &req); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); if (rc < 0) GOTO(out_xattr, rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - /* only detect the xattr size */ if (size == 0) - GOTO(out, rc = body->mbo_eadatasize); + GOTO(out, rc); - if (size < body->mbo_eadatasize) { - CERROR("server bug: replied size %u > %u\n", - body->mbo_eadatasize, (int)size); + if (size < rc) GOTO(out, rc = -ERANGE); - } - - if (body->mbo_eadatasize == 0) - GOTO(out, rc = -ENODATA); /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); + rc); if (!xdata) - GOTO(out, rc = -EFAULT); + GOTO(out, rc = -EPROTO); - memcpy(buffer, xdata, body->mbo_eadatasize); - rc = body->mbo_eadatasize; + memcpy(buffer, xdata, rc); } EXIT; @@ -511,21 +523,37 @@ static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) * recognizing layout gen as stripe offset when the * file is restored. See LU-2809. 
*/ - if (((struct lov_mds_md *)buf)->lmm_magic == LOV_MAGIC_COMP_V1) + if ((((struct lov_mds_md *)buf)->lmm_magic & + __swab32(LOV_MAGIC_MAGIC)) == __swab32(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)buf, + cl.cl_size); + + switch (((struct lov_mds_md *)buf)->lmm_magic) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_SPECIFIC: + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; + break; + case LOV_MAGIC_COMP_V1: goto out_env; + default: + CERROR("Invalid LOV magic %08x\n", + ((struct lov_mds_md *)buf)->lmm_magic); + GOTO(out_env, rc = -EINVAL); + } - ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; out_env: cl_env_put(env, &refcheck); RETURN(rc); } else if (S_ISDIR(inode->i_mode)) { struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; struct lov_mds_md *lmm = NULL; int lmm_size = 0; - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, - &req, 0); + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmm_size, + &req, &root_req, 0); if (rc < 0) GOTO(out_req, rc); @@ -540,6 +568,8 @@ static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) out_req: if (req) ptlrpc_req_finished(req); + if (root_req) + ptlrpc_req_finished(root_req); RETURN(rc); } else { diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr26.c b/drivers/staging/lustrefsx/lustre/llite/xattr26.c index 84e9b8bcbe915..28772dd5a74a1 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr26.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr26.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include "llite_internal.h" @@ -152,7 +152,7 @@ int ll_setxattr_common(struct inode *inode, const char *name, } rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, name, pv, - size, 0, flags, ll_i2suppgid(inode), &req); + size, flags, ll_i2suppgid(inode), &req); if (rc) { if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { LCONSOLE_INFO("Disabling user_xattr feature because " @@ -329,7 +329,6 @@ int ll_getxattr_common(struct inode *inode, const char *name, { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; - struct mdt_body *body; int xattr_type, rc; void *xdata; struct ll_inode_info *lli = ll_i2info(inode); @@ -405,36 +404,25 @@ int ll_getxattr_common(struct inode *inode, const char *name, } } else { getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, name, NULL, 0, size, 0, &req); - + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); if (rc < 0) GOTO(out_xattr, rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - /* only detect the xattr size */ if (size == 0) - GOTO(out, rc = body->mbo_eadatasize); + GOTO(out, rc); - if (size < body->mbo_eadatasize) { - CERROR("server bug: replied size %u > %u\n", - body->mbo_eadatasize, (int)size); + if (size < rc) GOTO(out, rc = -ERANGE); - } - - if (body->mbo_eadatasize == 0) - GOTO(out, rc = -ENODATA); /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); + rc); if (!xdata) - GOTO(out, rc = -EFAULT); + GOTO(out, rc = -EPROTO); - memcpy(buffer, xdata, body->mbo_eadatasize); - rc = body->mbo_eadatasize; + 
memcpy(buffer, xdata, rc); } EXIT; diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c index a001e5c2d8a7b..f1022b0296f47 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Author: Andrew Perepechko * @@ -37,7 +37,6 @@ #include #include #include -#include #include "llite_internal.h" /* If we ever have hundreds of extended attributes, we might want to consider diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c index 8f2e2e5cc1fa0..094266223b3bd 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -205,3 +205,36 @@ ll_inode_init_security(struct dentry *dentry, struct inode *inode, return err; } #endif /* HAVE_SECURITY_IINITSEC_CALLBACK */ + +/** + * Get security context xattr name used by policy. + * + * \retval >= 0 length of xattr name + * \retval < 0 failure to get security context xattr name + */ +int +ll_listsecurity(struct inode *inode, char *secctx_name, size_t secctx_name_size) +{ + int rc; + + if (!selinux_is_enabled()) + return 0; + +#ifdef HAVE_SECURITY_INODE_LISTSECURITY + rc = security_inode_listsecurity(inode, secctx_name, secctx_name_size); + if (rc >= secctx_name_size) + rc = -ERANGE; + else if (rc >= 0) + secctx_name[rc] = '\0'; + return rc; +#else /* !HAVE_SECURITY_INODE_LISTSECURITY */ + rc = sizeof(XATTR_NAME_SELINUX); + if (secctx_name && rc < secctx_name_size) { + memcpy(secctx_name, XATTR_NAME_SELINUX, rc); + secctx_name[rc] = '\0'; + } else { + rc = -ERANGE; + } + return rc; +#endif /* HAVE_SECURITY_INODE_LISTSECURITY */ +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c index b5ec306dcc224..b439d87ae9348 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c index bb792e751e94f..24c616b4b6cd9 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -42,7 +42,6 @@ #include #include -#include #include #include #include @@ -55,7 +54,8 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, const struct lu_fid *parent_fid, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags) + __u64 extra_lock_flags, + const char *secctx_name, __u32 secctx_name_size) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -74,13 +74,6 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, LASSERT((body->mbo_valid & OBD_MD_MDS)); - /* - * Unfortunately, we have to lie to MDC/MDS to retrieve - * attributes llite needs and provideproper locking. - */ - if (it->it_op & IT_LOOKUP) - it->it_op = IT_GETATTR; - /* * We got LOOKUP lock, but we really need attrs. 
*/ @@ -115,6 +108,16 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", PFID(&body->mbo_fid1), tgt->ltd_idx); + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && + secctx_name_size != 0 && secctx_name != NULL) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = secctx_name_size; + CDEBUG(D_SEC, "'%.*s' is security xattr to fetch for " + DFID"\n", + secctx_name_size, secctx_name, PFID(&body->mbo_fid1)); + } + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, extra_lock_flags); if (rc) @@ -153,13 +156,14 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_blocking_callback cb_blocking, int extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct ptlrpc_request *req = NULL; - struct mdt_body *body; - struct md_op_data *op_data; - int i; - int rc = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int i; + int valid_stripe_count = 0; + int rc = 0; ENTRY; @@ -185,6 +189,9 @@ int lmv_revalidate_slaves(struct obd_export *exp, fid = lsm->lsm_md_oinfo[i].lmo_fid; inode = lsm->lsm_md_oinfo[i].lmo_root; + if (!inode) + continue; + /* * Prepare op_data for revalidating. Note that @fid2 shluld be * defined otherwise it will go to server and take new lock @@ -193,8 +200,14 @@ int lmv_revalidate_slaves(struct obd_export *exp, memset(op_data, 0, sizeof(*op_data)); op_data->op_fid1 = fid; op_data->op_fid2 = fid; + /* shard revalidate only needs to fetch attributes and UPDATE + * lock, which is similar to the bottom half of remote object + * getattr, set this flag so that MDT skips checking whether + * it's remote object. + */ + op_data->op_bias = MDS_CROSS_REF; - tgt = lmv_locate_mds(lmv, op_data, &fid); + tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); if (IS_ERR(tgt)) GOTO(cleanup, rc = PTR_ERR(tgt)); @@ -208,6 +221,12 @@ int lmv_revalidate_slaves(struct obd_export *exp, rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, cb_blocking, extra_lock_flags); + if (rc == -ENOENT) { + /* skip stripe is not exists */ + rc = 0; + continue; + } + if (rc < 0) GOTO(cleanup, rc); @@ -243,17 +262,22 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_lock_decref(lockh, it.it_lock_mode); it.it_lock_mode = 0; } + + valid_stripe_count++; } cleanup: if (req != NULL) ptlrpc_req_finished(req); + /* if all stripes are invalid, return -ENOENT to notify user */ + if (!rc && !valid_stripe_count) + rc = -ENOENT; + OBD_FREE_PTR(op_data); RETURN(rc); } - /* * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) * may be split dir. 
@@ -264,13 +288,58 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + __u64 flags = it->it_flags; + int rc; + ENTRY; + if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { + /* don't allow create under dir with bad hash */ + if (lmv_is_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_is_dir_migrating(op_data->op_mea1)) { + if (flags & O_EXCL) { + /* + * open(O_CREAT | O_EXCL) needs to check + * existing name, which should be done on both + * old and new layout, to avoid creating new + * file under old layout, check old layout on + * client side. + */ + tgt = lmv_locate_tgt(lmv, op_data, + &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getattr_name(tgt->ltd_exp, op_data, + reqp); + if (!rc) { + ptlrpc_req_finished(*reqp); + *reqp = NULL; + RETURN(-EEXIST); + } + + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_post_migrate = true; + } else { + /* + * open(O_CREAT) will be sent to MDT in old + * layout first, to avoid creating new file + * under old layout, clear O_CREAT. + */ + it->it_flags &= ~O_CREAT; + } + } + } + +retry: if (it->it_flags & MDS_OPEN_BY_FID) { LASSERT(fid_is_sane(&op_data->op_fid2)); @@ -290,7 +359,7 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, LASSERT(fid_is_zero(&op_data->op_fid2)); LASSERT(op_data->op_name != NULL); - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } @@ -321,8 +390,21 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, */ if ((it->it_disposition & DISP_LOOKUP_NEG) && !(it->it_disposition & DISP_OPEN_CREATE) && - !(it->it_disposition & DISP_OPEN_OPEN)) + !(it->it_disposition & DISP_OPEN_OPEN)) { + if (!(it->it_flags & MDS_OPEN_BY_FID) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + it->it_flags = flags; + fid_zero(&op_data->op_fid2); + goto retry; + } + RETURN(rc); + } body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); if (body == NULL) @@ -331,7 +413,9 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, /* Not cross-ref case, just get out of here. 
*/ if (unlikely((body->mbo_valid & OBD_MD_MDS))) { rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, - cb_blocking, extra_lock_flags); + cb_blocking, extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); if (rc != 0) RETURN(rc); @@ -352,42 +436,56 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - struct mdt_body *body; - struct lmv_stripe_md *lsm = op_data->op_mea1; - int rc = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; ENTRY; - /* If it returns ERR_PTR(-EBADFD) then it is an unknown hash type - * it will try all stripes to locate the object */ - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt) && (PTR_ERR(tgt) != -EBADFD)) - RETURN(PTR_ERR(tgt)); - - /* Both migrating dir and unknown hash dir need to try - * all of sub-stripes */ - if (lsm != NULL && !lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { - struct lmv_oinfo *oinfo; +retry: + if (op_data->op_flags & MF_GETATTR_BY_FID) { + /* getattr by FID, replace fid1 with stripe FID, + * NB, don't replace if name is "/", because it may be a subtree + * mount, and if it's a striped directory, fid1 will be replaced + * to stripe FID by hash, while fid2 is master object FID, which + * will be treated as a remote object if the two FIDs are + * located on different MDTs, and LOOKUP lock can't be fetched. + */ + LASSERT(op_data->op_name); + if (op_data->op_namelen != 1 || + strncmp(op_data->op_name, "/", 1) != 0) { + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } - oinfo = &lsm->lsm_md_oinfo[0]; + /* name is used to locate stripe target, clear it here + * to avoid packing name in request, so that MDS knows + * it's getattr by FID. + */ + op_data->op_name = NULL; + op_data->op_namelen = 0; - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + /* getattr request is sent to MDT where fid2 inode is */ + tgt = lmv_find_target(lmv, &op_data->op_fid2); + } else if (op_data->op_name) { + /* getattr by name */ + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + } else { + /* old way to getattr by FID, parent FID not packed */ + tgt = lmv_find_target(lmv, &op_data->op_fid1); } - - if (!fid_is_sane(&op_data->op_fid2)) - fid_zero(&op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID - ", name='%s' -> mds #%u lsm=%p lsm_magic=%x\n", + ", name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), op_data->op_name ? op_data->op_name : "", - tgt->ltd_idx, lsm, lsm == NULL ? 
-1 : lsm->lsm_md_magic); + tgt->ltd_idx); op_data->op_bias &= ~MDS_CROSS_REF; @@ -407,37 +505,14 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } RETURN(rc); - } else if (it_disposition(it, DISP_LOOKUP_NEG) && lsm != NULL && - lmv_need_try_all_stripes(lsm)) { - /* For migrating and unknown hash type directory, it will - * try to target the entry on other stripes */ - int stripe_index; - - for (stripe_index = 1; - stripe_index < lsm->lsm_md_stripe_count && - it_disposition(it, DISP_LOOKUP_NEG); stripe_index++) { - struct lmv_oinfo *oinfo; - - /* release the previous request */ - ptlrpc_req_finished(*reqp); - it->it_request = NULL; - *reqp = NULL; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - tgt = lmv_find_target(lmv, &oinfo->lmo_fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - CDEBUG(D_INODE, "Try other stripes " DFID"\n", - PFID(&oinfo->lmo_fid)); + } else if (it_disposition(it, DISP_LOOKUP_NEG) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; - op_data->op_fid1 = oinfo->lmo_fid; - it->it_disposition &= ~DISP_ENQ_COMPLETE; - rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, - cb_blocking, extra_lock_flags); - if (rc != 0) - RETURN(rc); - } + goto retry; } if (!it_has_reply_body(it)) @@ -454,7 +529,9 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, /* Not cross-ref case, just get out of here. */ if (unlikely((body->mbo_valid & OBD_MD_MDS))) { rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, - extra_lock_flags); + extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); if (rc != 0) RETURN(rc); body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h index 8ef0631f3301a..0ad743244e93e 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -33,7 +33,6 @@ #ifndef _LMV_INTERNAL_H_ #define _LMV_INTERNAL_H_ -#include #include #include @@ -59,6 +58,9 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_blocking_callback cb_blocking, int extra_lock_flags); +int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **preq); + static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) { return container_of0(lmv, struct obd_device, u.lmv); @@ -123,39 +125,90 @@ static inline int lmv_stripe_md_size(int stripe_count) return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); } +/* for file under migrating directory, return the target stripe info */ static inline const struct lmv_oinfo * lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, - int namelen) + int namelen, bool post_migrate) { + __u32 hash_type = lsm->lsm_md_hash_type; + __u32 stripe_count = lsm->lsm_md_stripe_count; int stripe_index; - stripe_index = lmv_name_to_stripe_index(lsm->lsm_md_hash_type, - lsm->lsm_md_stripe_count, + if (hash_type & LMV_HASH_FLAG_MIGRATION) { + if (post_migrate) { + hash_type &= ~LMV_HASH_FLAG_MIGRATION; + stripe_count = lsm->lsm_md_migrate_offset; + } else { + hash_type = lsm->lsm_md_migrate_hash; + stripe_count -= lsm->lsm_md_migrate_offset; + } + } + + stripe_index = lmv_name_to_stripe_index(hash_type, stripe_count, name, namelen); if (stripe_index < 0) return ERR_PTR(stripe_index); - LASSERTF(stripe_index < lsm->lsm_md_stripe_count, - 
"stripe_index = %d, stripe_count = %d hash_type = %x" - "name = %.*s\n", stripe_index, lsm->lsm_md_stripe_count, - lsm->lsm_md_hash_type, namelen, name); + if ((lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) && !post_migrate) + stripe_index += lsm->lsm_md_migrate_offset; + + if (stripe_index >= lsm->lsm_md_stripe_count) { + CERROR("stripe_index %d stripe_count %d hash_type %#x " + "migrate_offset %d migrate_hash %#x name %.*s\n", + stripe_index, lsm->lsm_md_stripe_count, + lsm->lsm_md_hash_type, lsm->lsm_md_migrate_offset, + lsm->lsm_md_migrate_hash, namelen, name); + return ERR_PTR(-EBADF); + } return &lsm->lsm_md_oinfo[stripe_index]; } -static inline bool lmv_need_try_all_stripes(const struct lmv_stripe_md *lsm) +static inline bool lmv_is_dir_migrating(const struct lmv_stripe_md *lsm) { - return !lmv_is_known_hash_type(lsm->lsm_md_hash_type) || - lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; + return lsm ? lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION : false; +} + +static inline bool lmv_is_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lsm) + return false; + + if (lmv_is_dir_migrating(lsm)) { + if (lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset > 1) + return !lmv_is_known_hash_type( + lsm->lsm_md_migrate_hash); + return false; + } + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); } -struct lmv_tgt_desc -*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid); +static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_mea1; + + if (!lsm) + return false; + + if (lmv_is_dir_migrating(lsm) && !op_data->op_post_migrate) { + op_data->op_post_migrate = true; + return true; + } + + if (lmv_is_dir_bad_hash(lsm) && + op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { + op_data->op_stripe_index++; + return true; + } + + return false; +} + +struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, + struct md_op_data *op_data, + struct lu_fid *fid); /* lproc_lmv.c */ -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_lmv_obd_vars[]; -#endif -extern const struct proc_ops lmv_proc_target_fops; +int lmv_tunables_init(struct obd_device *obd); #endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c index 8b073a6d9846f..078f6e2a59aad 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,7 +31,8 @@ */ #define DEBUG_SUBSYSTEM S_LMV -#include + +#include #include #include #include @@ -45,7 +46,6 @@ #include #include -#include #include #include #include @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include "lmv_internal.h" @@ -213,30 +213,24 @@ static int lmv_connect(const struct lu_env *env, lmv->connected = 0; lmv->conn_data = *data; - if (lmv->targets_proc_entry == NULL) { - lmv->targets_proc_entry = lprocfs_register("target_obds", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lmv->targets_proc_entry)) { - CERROR("%s: cannot register " - "/proc/fs/lustre/%s/%s/target_obds\n", - obd->obd_name, obd->obd_type->typ_name, - obd->obd_name); - lmv->targets_proc_entry = NULL; - } + lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + if (!lmv->lmv_tgts_kobj) { + CERROR("%s: cannot create /sys/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); } rc = lmv_check_connect(obd); if (rc != 0) - GOTO(out_proc, rc); + GOTO(out_sysfs, rc); *pexp = exp; RETURN(rc); -out_proc: - if (lmv->targets_proc_entry != NULL) - lprocfs_remove(&lmv->targets_proc_entry); +out_sysfs: + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); class_disconnect(exp); @@ -271,10 +265,12 @@ static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, for (i = 0; i < lmv->desc.ld_tgt_count; i++) { struct lmv_tgt_desc *tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { + if (tgt == NULL || tgt->ltd_exp == NULL) { CWARN("%s: NULL export for %d\n", obd->obd_name, i); continue; } + if (!tgt->ltd_active) + continue; rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); if (rc) { @@ -363,23 +359,11 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, atomic_read(&obd->obd_refcount)); - if (lmv->targets_proc_entry != NULL) { - struct proc_dir_entry *mdc_symlink; - - LASSERT(mdc_obd->obd_type != NULL); - LASSERT(mdc_obd->obd_type->typ_name != NULL); - mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, - lmv->targets_proc_entry, - "../../../%s/%s", - mdc_obd->obd_type->typ_name, - mdc_obd->obd_name); - if (mdc_symlink == NULL) { - CERROR("cannot register LMV target " - "/proc/fs/lustre/%s/%s/target_obds/%s\n", - obd->obd_type->typ_name, obd->obd_name, - mdc_obd->obd_name); - } - } + if (lmv->lmv_tgts_kobj) + /* Even if we failed to create the link, that's fine */ + rc = sysfs_create_link(lmv->lmv_tgts_kobj, + &mdc_obd->obd_kset.kobj, + mdc_obd->obd_name); RETURN(0); } @@ -415,7 +399,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, mutex_lock(&lmv->lmv_init_mutex); if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { tgt = lmv->tgts[index]; - CERROR("%s: UUID %s already assigned at LOV target index %d:" + CERROR("%s: UUID %s already assigned at LMV target index %d:" " rc = %d\n", obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); mutex_unlock(&lmv->lmv_init_mutex); @@ -584,9 +568,9 @@ static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_obd->obd_fail = obd->obd_fail; mdc_obd->obd_no_recov = obd->obd_no_recov; - if (lmv->targets_proc_entry != NULL) - lprocfs_remove_proc_entry(mdc_obd->obd_name, - lmv->targets_proc_entry); + if (lmv->lmv_tgts_kobj) + sysfs_remove_link(lmv->lmv_tgts_kobj, + mdc_obd->obd_name); } rc = obd_fid_fini(tgt->ltd_exp->exp_obd); @@ -629,11 +613,8 
@@ static int lmv_disconnect(struct obd_export *exp) lmv_disconnect_mdc(obd, lmv->tgts[i]); } - if (lmv->targets_proc_entry != NULL) - lprocfs_remove(&lmv->targets_proc_entry); - else - CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", - obd->obd_type->typ_name, obd->obd_name); + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); out_local: /* @@ -681,8 +662,8 @@ static int lmv_fid2path(struct obd_export *exp, int len, void *karg, int len; ori_gf = (struct getinfo_fid2path *)karg; - if (strlen(ori_gf->gf_u.gf_path) + - strlen(gf->gf_u.gf_path) > ori_gf->gf_pathlen) + if (strlen(ori_gf->gf_u.gf_path) + 1 + + strlen(gf->gf_u.gf_path) + 1 > ori_gf->gf_pathlen) GOTO(out_fid2path, rc = -EOVERFLOW); ptr = ori_gf->gf_u.gf_path; @@ -819,23 +800,42 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, void __user *uarg) { struct lmv_obd *lmv = &obd->u.lmv; - struct file *filp; - __u32 i, j; - int err; - bool any_set = false; - struct kkuc_ct_data kcd = { - .kcd_magic = KKUC_CT_DATA_MAGIC, - .kcd_archive = lk->lk_data, - }; - int rc = 0; + struct file *filp; + __u32 i, j; + int err; + bool any_set = false; + struct kkuc_ct_data *kcd; + size_t kcd_size; + int rc = 0; ENTRY; filp = fget(lk->lk_wfd); if (!filp) RETURN(-EBADF); + if (lk->lk_flags & LK_FLG_DATANR) + kcd_size = offsetof(struct kkuc_ct_data, + kcd_archives[lk->lk_data_count]); + else + kcd_size = sizeof(*kcd); + + OBD_ALLOC(kcd, kcd_size); + if (kcd == NULL) + GOTO(err_fput, rc = -ENOMEM); + + kcd->kcd_nr_archives = lk->lk_data_count; + if (lk->lk_flags & LK_FLG_DATANR) { + kcd->kcd_magic = KKUC_CT_DATA_ARRAY_MAGIC; + if (lk->lk_data_count > 0) + memcpy(kcd->kcd_archives, lk->lk_data, + sizeof(*kcd->kcd_archives) * lk->lk_data_count); + } else { + kcd->kcd_magic = KKUC_CT_DATA_BITMAP_MAGIC; + } + rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, - lk->lk_group, &kcd, sizeof(kcd)); + lk->lk_group, kcd, kcd_size); + OBD_FREE(kcd, kcd_size); if (rc) GOTO(err_fput, rc); @@ -934,7 +934,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(-EFAULT); rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, 0); if (rc) RETURN(rc); @@ -1175,7 +1175,7 @@ static int lmv_placement_policy(struct obd_device *obd, * 1. See if the stripe offset is specified by lum. * 2. Then check if there is default stripe offset. * 3. Finally choose MDS by name hash if the parent - * is striped directory. (see lmv_locate_mds()). */ + * is striped directory. (see lmv_locate_tgt()). 
*/ if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { *mds = le32_to_cpu(lum->lum_stripe_offset); @@ -1287,16 +1287,11 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) spin_lock_init(&lmv->lmv_lock); mutex_init(&lmv->lmv_init_mutex); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_lmv_obd_vars; - lprocfs_obd_setup(obd); - lprocfs_alloc_md_stats(obd, 0); - rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", - 0444, &lmv_proc_target_fops, obd); + rc = lmv_tunables_init(obd); if (rc) - CWARN("%s: error adding LMV target_obd file: rc = %d\n", + CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", obd->obd_name, rc); -#endif + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, LUSTRE_CLI_FLD_HASH_DHT); if (rc) { @@ -1361,49 +1356,88 @@ static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) RETURN(rc); } +static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) +{ + int i; + + if (flags & OBD_STATFS_FOR_MDT0) + return 0; + + if (lmv->lmv_statfs_start || lmv->desc.ld_tgt_count == 1) + return lmv->lmv_statfs_start; + + /* choose initial MDT for this client */ + for (i = 0;; i++) { + struct lnet_process_id lnet_id; + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (lnet_id.nid != LNET_NID_LO_0) { + /* We dont need a full 64-bit modulus, just enough + * to distribute the requests across MDTs evenly. + */ + lmv->lmv_statfs_start = + (u32)lnet_id.nid % lmv->desc.ld_tgt_count; + break; + } + } + + return lmv->lmv_statfs_start; +} + static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) + struct obd_statfs *osfs, time64_t max_age, __u32 flags) { struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; struct obd_statfs *temp; int rc = 0; - __u32 i; + __u32 i, idx; ENTRY; OBD_ALLOC(temp, sizeof(*temp)); if (temp == NULL) RETURN(-ENOMEM); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + /* distribute statfs among MDTs */ + idx = lmv_select_statfs_mdt(lmv, flags); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++, idx++) { + idx = idx % lmv->desc.ld_tgt_count; + if (lmv->tgts[idx] == NULL || lmv->tgts[idx]->ltd_exp == NULL) continue; - rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, + rc = obd_statfs(env, lmv->tgts[idx]->ltd_exp, temp, max_age, flags); if (rc) { - CERROR("can't stat MDS #%d (%s), error %d\n", i, - lmv->tgts[i]->ltd_exp->exp_obd->obd_name, + CERROR("%s: can't stat MDS #%d: rc = %d\n", + lmv->tgts[idx]->ltd_exp->exp_obd->obd_name, i, rc); GOTO(out_free_temp, rc); } + if (temp->os_state & OS_STATE_SUM || + flags == OBD_STATFS_FOR_MDT0) { + /* reset to the last aggregated values + * and don't sum with non-aggrated data */ + /* If the statfs is from mount, it needs to retrieve + * necessary information from MDT0. i.e. mount does + * not need the merged osfs from all of MDT. Also + * clients can be mounted as long as MDT0 is in + * service */ + *osfs = *temp; + break; + } + if (i == 0) { *osfs = *temp; - /* If the statfs is from mount, it will needs - * retrieve necessary information from MDT0. - * i.e. mount does not need the merged osfs - * from all of MDT. 
- * And also clients can be mounted as long as - * MDT0 is in service*/ - if (flags & OBD_STATFS_FOR_MDT0) - GOTO(out_free_temp, rc); - } else { - osfs->os_bavail += temp->os_bavail; - osfs->os_blocks += temp->os_blocks; - osfs->os_ffree += temp->os_ffree; - osfs->os_files += temp->os_files; - } + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + osfs->os_granted += temp->os_granted; + } } EXIT; @@ -1425,9 +1459,8 @@ static int lmv_get_root(struct obd_export *exp, const char *fileset, } static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - const char *input, int input_size, int output_size, - int flags, struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1439,17 +1472,16 @@ static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_getxattr(tgt->ltd_exp, fid, valid, name, input, - input_size, output_size, flags, request); + rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req); RETURN(rc); } static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - const char *input, int input_size, int output_size, - int flags, __u32 suppgid, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1461,9 +1493,8 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_setxattr(tgt->ltd_exp, fid, valid, name, input, - input_size, output_size, flags, suppgid, - request); + rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, suppgid, req); RETURN(rc); } @@ -1532,81 +1563,93 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -/** - * Choosing the MDT by name or FID in @op_data. - * For non-striped directory, it will locate MDT by fid. - * For striped-directory, it will locate MDT by name. And also - * it will reset op_fid1 with the FID of the choosen stripe. 
- **/ -struct lmv_tgt_desc * -lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, - u32 *mds) +struct lmv_tgt_desc* +__lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, u32 *mds, + bool post_migrate) { - struct lmv_tgt_desc *tgt; - const struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; + const struct lmv_oinfo *oinfo; + + if (lsm == NULL || namelen == 0) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + LASSERT(mds); + *mds = tgt->ltd_idx; + return tgt; + } if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { if (cfs_fail_val >= lsm->lsm_md_stripe_count) - RETURN(ERR_PTR(-EBADF)); + return ERR_PTR(-EBADF); oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; } else { - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, + post_migrate); if (IS_ERR(oinfo)) - RETURN(ERR_CAST(oinfo)); + return ERR_CAST(oinfo); } if (fid != NULL) *fid = oinfo->lmo_fid; if (mds != NULL) *mds = oinfo->lmo_mds; + /* check stripe FID is sane */ + if (!fid_is_sane(&oinfo->lmo_fid)) + return ERR_PTR(-ENODEV); tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, PFID(&oinfo->lmo_fid)); + return tgt; } + /** - * Locate mds by fid or name + * Locate mdt by fid or name * - * For striped directory (lsm != NULL), it will locate the stripe - * by name hash (see lsm_name_to_stripe_info()). Note: if the hash_type - * is unknown, it will return -EBADFD, and lmv_intent_lookup might need - * walk through all of stripes to locate the entry. + * For striped directory, it will locate the stripe by name hash, if hash_type + * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' + * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' + * indicates whether old or new layout is used to locate. * * For normal direcotry, it will locate MDS by FID directly. - * \param[in] lmv LMV device - * \param[in] op_data client MD stack parameters, name, namelen - * mds_num etc. - * \param[in] fid object FID used to locate MDS. + * + * \param[in] lmv LMV device + * \param[in/out] op_data client MD stack parameters, name, namelen etc, + * op_mds and op_fid1 will be updated if op_mea1 + * indicates fid1 represents a striped directory. + * \param[out] fid object FID used to locate MDS. * * retval pointer to the lmv_tgt_desc if succeed. * ERR_PTR(errno) if failed. */ struct lmv_tgt_desc* -lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, struct lu_fid *fid) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; /* During creating VOLATILE file, it should honor the mdt * index if the file under striped dir is being restored, see * ct_restore(). 
*/ if (op_data->op_bias & MDS_CREATE_VOLATILE && (int)op_data->op_mds != -1) { - int i; tgt = lmv_get_target(lmv, op_data->op_mds, NULL); if (IS_ERR(tgt)) return tgt; - if (lsm != NULL) { + if (lsm) { + int i; + /* refill the right parent fid */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - struct lmv_oinfo *oinfo; - oinfo = &lsm->lsm_md_oinfo[i]; if (oinfo->lmo_mds == op_data->op_mds) { *fid = oinfo->lmo_fid; @@ -1617,22 +1660,21 @@ lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, if (i == lsm->lsm_md_stripe_count) *fid = lsm->lsm_md_oinfo[0].lmo_fid; } + } else if (lmv_is_dir_bad_hash(lsm)) { + LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); + oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; - return tgt; - } - - if (lsm == NULL || op_data->op_namelen == 0) { - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return tgt; - - op_data->op_mds = tgt->ltd_idx; - return tgt; + *fid = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + } else { + tgt = __lmv_locate_tgt(lmv, lsm, op_data->op_name, + op_data->op_namelen, fid, + &op_data->op_mds, + op_data->op_post_migrate); } - return lmv_locate_target_for_name(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds); + return tgt; } int lmv_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1649,7 +1691,33 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (!lmv->desc.ld_active_tgt_count) RETURN(-EIO); - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (lmv_is_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_is_dir_migrating(op_data->op_mea1)) { + /* + * if parent is migrating, create() needs to lookup existing + * name, to avoid creating new file under old layout of + * migrating directory, check old layout here. 
+ */ + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + if (!rc) { + ptlrpc_req_finished(*request); + *request = NULL; + RETURN(-EEXIST); + } + + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_post_migrate = true; + } + + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1660,6 +1728,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc) RETURN(rc); + if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { /* Send the create request to the MDT where the object * will be located */ @@ -1699,7 +1768,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + tgt = lmv_find_target(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1712,19 +1781,20 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, RETURN(rc); } -static int +int lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, struct ptlrpc_request **preq) { - struct ptlrpc_request *req = NULL; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + ENTRY; - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); +retry: + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1733,31 +1803,28 @@ lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, PFID(&op_data->op_fid1), tgt->ltd_idx); rc = md_getattr_name(tgt->ltd_exp, op_data, preq); - if (rc != 0) + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*preq); + *preq = NULL; + goto retry; + } + + if (rc) RETURN(rc); body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); LASSERT(body != NULL); if (body->mbo_valid & OBD_MD_MDS) { - struct lu_fid rid = body->mbo_fid1; - CDEBUG(D_INODE, "Request attrs for "DFID"\n", - PFID(&rid)); - - tgt = lmv_find_target(lmv, &rid); - if (IS_ERR(tgt)) { - ptlrpc_req_finished(*preq); - preq = NULL; - RETURN(PTR_ERR(tgt)); - } - - op_data->op_fid1 = rid; + op_data->op_fid1 = body->mbo_fid1; op_data->op_valid |= OBD_MD_FLCROSSREF; op_data->op_namelen = 0; op_data->op_name = NULL; - rc = md_getattr_name(tgt->ltd_exp, op_data, &req); + ptlrpc_req_finished(*preq); - *preq = req; + *preq = NULL; + + goto retry; } RETURN(rc); @@ -1827,19 +1894,40 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (op_data->op_mea2 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea2; - const struct lmv_oinfo *oinfo; - oinfo = lsm_name_to_stripe_info(lsm, op_data->op_name, - op_data->op_namelen); - if (IS_ERR(oinfo)) - RETURN(PTR_ERR(oinfo)); + if (lmv_is_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; - op_data->op_fid2 = oinfo->lmo_fid; + /* + * avoid creating new file under old layout of migrating + * directory, check it here. 
+ */ + tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, false); + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(*request); + *request = NULL; + RETURN(-EEXIST); + } + + if (rc != -ENOENT) + RETURN(rc); } - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1857,158 +1945,323 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen, - struct ptlrpc_request **request) +static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, + const char *name, size_t namelen, + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *src_tgt; - struct lmv_tgt_desc *tgt_tgt; - struct obd_export *target_exp; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *parent_tgt; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *child_tgt; + struct lmv_tgt_desc *tgt; + struct lu_fid target_fid; + int rc; + ENTRY; - LASSERT(oldlen != 0); + LASSERT(op_data->op_cli_flags & CLI_MIGRATE); - CDEBUG(D_INODE, "RENAME %.*s in "DFID":%d to %.*s in "DFID":%d\n", - (int)oldlen, old, PFID(&op_data->op_fid1), - op_data->op_mea1 ? op_data->op_mea1->lsm_md_stripe_count : 0, - (int)newlen, new, PFID(&op_data->op_fid2), - op_data->op_mea2 ? 
op_data->op_mea2->lsm_md_stripe_count : 0); + CDEBUG(D_INODE, "MIGRATE "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)namelen, name); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (op_data->op_cli_flags & CLI_MIGRATE) { - LASSERTF(fid_is_sane(&op_data->op_fid3), "invalid FID "DFID"\n", - PFID(&op_data->op_fid3)); - - if (op_data->op_mea1 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tmp; - - /* Fix the parent fid for striped dir */ - tmp = lmv_locate_target_for_name(lmv, lsm, old, - oldlen, - &op_data->op_fid1, - NULL); - if (IS_ERR(tmp)) - RETURN(PTR_ERR(tmp)); - } - - rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc != 0) - RETURN(rc); - src_tgt = lmv_find_target(lmv, &op_data->op_fid3); - if (IS_ERR(src_tgt)) - RETURN(PTR_ERR(src_tgt)); + parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); - target_exp = src_tgt->ltd_exp; - } else { - if (op_data->op_mea1 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea1; + if (lsm) { + __u32 hash_type = lsm->lsm_md_hash_type; + __u32 stripe_count = lsm->lsm_md_stripe_count; - src_tgt = lmv_locate_target_for_name(lmv, lsm, old, - oldlen, - &op_data->op_fid1, - &op_data->op_mds); - } else { - src_tgt = lmv_find_target(lmv, &op_data->op_fid1); + /* + * old stripes are appended after new stripes for migrating + * directory. + */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + hash_type = lsm->lsm_md_migrate_hash; + stripe_count -= lsm->lsm_md_migrate_offset; } - if (IS_ERR(src_tgt)) - RETURN(PTR_ERR(src_tgt)); + rc = lmv_name_to_stripe_index(hash_type, stripe_count, name, + namelen); + if (rc < 0) + RETURN(rc); - if (op_data->op_mea2 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea2; + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) + rc += lsm->lsm_md_migrate_offset; - tgt_tgt = lmv_locate_target_for_name(lmv, lsm, new, - newlen, - &op_data->op_fid2, - &op_data->op_mds); - } else { - tgt_tgt = lmv_find_target(lmv, &op_data->op_fid2); + /* save it in fid4 temporarily for early cancel */ + op_data->op_fid4 = lsm->lsm_md_oinfo[rc].lmo_fid; + sp_tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[rc].lmo_mds, + NULL); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + /* + * if parent is being migrated too, fill op_fid2 with target + * stripe fid, otherwise the target stripe is not created yet. + */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + hash_type = lsm->lsm_md_hash_type & + ~LMV_HASH_FLAG_MIGRATION; + stripe_count = lsm->lsm_md_migrate_offset; + + rc = lmv_name_to_stripe_index(hash_type, stripe_count, + name, namelen); + if (rc < 0) + RETURN(rc); + + op_data->op_fid2 = lsm->lsm_md_oinfo[rc].lmo_fid; + tp_tgt = lmv_get_target(lmv, + lsm->lsm_md_oinfo[rc].lmo_mds, + NULL); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); } - if (IS_ERR(tgt_tgt)) - RETURN(PTR_ERR(tgt_tgt)); - - target_exp = tgt_tgt->ltd_exp; + } else { + sp_tgt = parent_tgt; } - /* - * LOOKUP lock on src child (fid3) should also be cancelled for - * src_tgt in mdc_rename. - */ - op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - - /* - * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its - * own target. 
- */ - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_UPDATE, - MF_MDC_CANCEL_FID2); + child_tgt = lmv_find_target(lmv, &op_data->op_fid3); + if (IS_ERR(child_tgt)) + RETURN(PTR_ERR(child_tgt)); - if (rc != 0) + if (!S_ISDIR(op_data->op_mode) && tp_tgt) + rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx); + else + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); + if (rc) RETURN(rc); + /* - * Cancel LOOKUP locks on source child (fid3) for parent tgt_tgt. + * for directory, send migrate request to the MDT where the object will + * be migrated to, because we can't create a striped directory remotely. + * + * otherwise, send to the MDT where source is located because regular + * file may open lease. + * + * NB. if MDT doesn't support DIR_MIGRATE, send to source MDT too for + * backward compatibility. */ - if (fid_is_sane(&op_data->op_fid3)) { - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (S_ISDIR(op_data->op_mode) && + (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) { + tgt = lmv_find_target(lmv, &target_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); + } else { + tgt = child_tgt; + } - /* Cancel LOOKUP lock on its parent */ - rc = lmv_early_cancel(exp, tgt, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID3); - if (rc != 0) + /* cancel UPDATE lock of parent master object */ + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc) + RETURN(rc); + + /* cancel UPDATE lock of source parent */ + if (sp_tgt != parent_tgt) { + /* + * migrate RPC packs master object FID, because we can only pack + * two FIDs in reint RPC, but MDS needs to know both source + * parent and target parent, and it will obtain them from master + * FID and LMV, the other FID in RPC is kept for target. + * + * since this FID is not passed to MDC, cancel it anyway. + */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, -1, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID4); + if (rc) RETURN(rc); - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_FULL, + op_data->op_flags &= ~MF_MDC_CANCEL_FID4; + } + op_data->op_fid4 = target_fid; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc) + RETURN(rc); + + /* cancel LOOKUP lock of source if source is remote object */ + if (child_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); - if (rc != 0) + if (rc) RETURN(rc); } -retry_rename: - /* - * Cancel all the locks on tgt child (fid4). 
- */ - if (fid_is_sane(&op_data->op_fid4)) { - struct lmv_tgt_desc *tgt; + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_FULL, - MF_MDC_CANCEL_FID4); - if (rc != 0) + rc = md_rename(tgt->ltd_exp, op_data, name, namelen, NULL, 0, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *src_tgt = NULL; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + LASSERT(oldlen != 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) { + rc = lmv_migrate(exp, op_data, old, oldlen, request); + RETURN(rc); + } + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + if (lmv_is_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + + /* + * we avoid creating new file under old layout of migrating + * directory, if there is an existing file with new name under + * old layout, we can't unlink file in old layout and rename to + * new layout in one transaction, so return -EBUSY here.` + */ + tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, + &op_data->op_fid2, &op_data->op_mds, + false); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + op_data->op_name = new; + op_data->op_namelen = newlen; + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + op_data->op_name = NULL; + op_data->op_namelen = 0; + if (!rc) { + ptlrpc_req_finished(*request); + *request = NULL; + RETURN(-EBUSY); + } + + if (rc != -ENOENT) RETURN(rc); + } + + /* rename to new layout for migrating directory */ + tp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, + &op_data->op_fid2, &op_data->op_mds, true); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); + /* Since the target child might be destroyed, and it might become + * orphan, and we can only check orphan on the local MDT right now, so + * we send rename request to the MDT where target child is located. If + * target child does not exist, then it will send the request to the + * target parent */ + if (fid_is_sane(&op_data->op_fid4)) { tgt = lmv_find_target(lmv, &op_data->op_fid4); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); + } else { + tgt = tp_tgt; + } + + op_data->op_flags |= MF_MDC_CANCEL_FID4; - /* Since the target child might be destroyed, and it might - * become orphan, and we can only check orphan on the local - * MDT right now, so we send rename request to the MDT where - * target child is located. 
If target child does not exist, - * then it will send the request to the target parent */ - target_exp = tgt->ltd_exp; + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } } - rc = md_rename(target_exp, op_data, old, oldlen, new, newlen, - request); + if (fid_is_sane(&op_data->op_fid3)) { + src_tgt = lmv_find_target(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); - if (rc != 0 && rc != -EXDEV) + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_ELC, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + +retry: + sp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea1, old, oldlen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_post_migrate); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + + /* cancel UPDATE locks of source parent */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid3)) { + /* cancel LOOKUP lock of source on source parent */ + if (src_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, + tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + } + +rename: + CDEBUG(D_INODE, "RENAME "DFID"/%.*s to "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)oldlen, old, + PFID(&op_data->op_fid2), (int)newlen, new); + + rc = md_rename(tgt->ltd_exp, op_data, old, oldlen, new, newlen, + request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc && rc != -EXDEV) RETURN(rc); body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); @@ -2019,13 +2272,28 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, if (likely(!(body->mbo_valid & OBD_MD_MDS))) RETURN(rc); - CDEBUG(D_INODE, "%s: try rename to another MDT for "DFID"\n", - exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); - op_data->op_fid4 = body->mbo_fid1; + ptlrpc_req_finished(*request); *request = NULL; - goto retry_rename; + + tgt = lmv_find_target(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + goto rename; } static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, @@ -2037,8 +2305,9 @@ static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, int rc = 0; ENTRY; - CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n", - PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid, + op_data->op_xvalid); op_data->op_flags |= MF_MDC_CANCEL_FID1; tgt = lmv_find_target(lmv, &op_data->op_fid1); @@ -2067,146 +2336,228 @@ static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } -/** - * 
Get current minimum entry from striped directory - * - * This function will search the dir entry, whose hash value is the - * closest(>=) to @hash_offset, from all of sub-stripes, and it is - * only being called for striped directory. - * - * \param[in] exp export of LMV - * \param[in] op_data parameters transferred beween client MD stack - * stripe_information will be included in this - * parameter - * \param[in] cb_op ldlm callback being used in enqueue in - * mdc_read_page - * \param[in] hash_offset the hash value, which is used to locate - * minum(closet) dir entry - * \param[in|out] stripe_offset the caller use this to indicate the stripe - * index of last entry, so to avoid hash conflict - * between stripes. It will also be used to - * return the stripe index of current dir entry. - * \param[in|out] entp the minum entry and it also is being used - * to input the last dir entry to resolve the - * hash conflict - * - * \param[out] ppage the page which holds the minum entry - * - * \retval = 0 get the entry successfully - * negative errno (< 0) does not get the entry - */ -static int lmv_get_min_striped_entry(struct obd_export *exp, - struct md_op_data *op_data, - struct md_callback *cb_op, - __u64 hash_offset, int *stripe_offset, - struct lu_dirent **entp, - struct page **ppage) +struct stripe_dirent { + struct page *sd_page; + struct lu_dirpage *sd_dp; + struct lu_dirent *sd_ent; + bool sd_eof; +}; + +struct lmv_dir_ctxt { + struct lmv_obd *ldc_lmv; + struct md_op_data *ldc_op_data; + struct md_callback *ldc_cb_op; + __u64 ldc_hash; + int ldc_count; + struct stripe_dirent ldc_stripes[0]; +}; + +static inline void stripe_dirent_unload(struct stripe_dirent *stripe) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; - int stripe_count; - struct lu_dirent *min_ent = NULL; - struct page *min_page = NULL; - int min_idx = 0; - int i; - int rc = 0; - ENTRY; + if (stripe->sd_page) { + kunmap(stripe->sd_page); + put_page(stripe->sd_page); + stripe->sd_page = NULL; + stripe->sd_ent = NULL; + } +} - stripe_count = lsm->lsm_md_stripe_count; - for (i = 0; i < stripe_count; i++) { - struct lu_dirent *ent = NULL; - struct page *page = NULL; - struct lu_dirpage *dp; - __u64 stripe_hash = hash_offset; +static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt) +{ + int i; - tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); - if (IS_ERR(tgt)) - GOTO(out, rc = PTR_ERR(tgt)); - - /* op_data will be shared by each stripe, so we need - * reset these value for each stripe */ - op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid; - op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid; - op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root; -next: - rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash, - &page); - if (rc != 0) - GOTO(out, rc); + for (i = 0; i < ctxt->ldc_count; i++) + stripe_dirent_unload(&ctxt->ldc_stripes[i]); +} - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent != NULL; - ent = lu_dirent_next(ent)) { - /* Skip dummy entry */ - if (le16_to_cpu(ent->lde_namelen) == 0) - continue; +/* if @ent is dummy, or . .., get next */ +static struct lu_dirent *stripe_dirent_get(struct lmv_dir_ctxt *ctxt, + struct lu_dirent *ent, + int stripe_index) +{ + for (; ent; ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; - if (le64_to_cpu(ent->lde_hash) < hash_offset) - continue; + /* skip . and .. 
for other stripes */ + if (stripe_index && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; - if (le64_to_cpu(ent->lde_hash) == hash_offset && - (*entp == ent || i < *stripe_offset)) - continue; + if (le64_to_cpu(ent->lde_hash) >= ctxt->ldc_hash) + break; + } - /* skip . and .. for other stripes */ - if (i != 0 && - (strncmp(ent->lde_name, ".", - le16_to_cpu(ent->lde_namelen)) == 0 || - strncmp(ent->lde_name, "..", - le16_to_cpu(ent->lde_namelen)) == 0)) - continue; + return ent; +} + +static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, + struct stripe_dirent *stripe, + int stripe_index) +{ + struct md_op_data *op_data = ctxt->ldc_op_data; + struct lmv_oinfo *oinfo; + struct lu_fid fid = op_data->op_fid1; + struct inode *inode = op_data->op_data; + struct lmv_tgt_desc *tgt; + struct lu_dirent *ent = stripe->sd_ent; + __u64 hash = ctxt->ldc_hash; + int rc = 0; + + ENTRY; + + LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]); + LASSERT(!ent); + + do { + if (stripe->sd_page) { + __u64 end = le64_to_cpu(stripe->sd_dp->ldp_hash_end); + + /* @hash should be the last dirent hash */ + LASSERTF(hash <= end, + "ctxt@%p stripe@%p hash %llx end %llx\n", + ctxt, stripe, hash, end); + /* unload last page */ + stripe_dirent_unload(stripe); + /* eof */ + if (end == MDS_DIR_END_OFF) { + stripe->sd_eof = true; + break; + } + hash = end; + } + + oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; + if (!oinfo->lmo_root) { + rc = -ENOENT; break; } - if (ent == NULL) { - stripe_hash = le64_to_cpu(dp->ldp_hash_end); + tgt = lmv_get_target(ctxt->ldc_lmv, oinfo->lmo_mds, NULL); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + break; + } + + /* op_data is shared by stripes, reset after use */ + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_fid2 = oinfo->lmo_fid; + op_data->op_data = oinfo->lmo_root; - kunmap(page); - put_page(page); - page = NULL; + rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_cb_op, hash, + &stripe->sd_page); + + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + op_data->op_data = inode; + + if (rc) + break; - /* reach the end of current stripe, go to next stripe */ - if (stripe_hash == MDS_DIR_END_OFF) + stripe->sd_dp = page_address(stripe->sd_page); + ent = stripe_dirent_get(ctxt, lu_dirent_start(stripe->sd_dp), + stripe_index); + /* in case a page filled with ., .. and dummy, read next */ + } while (!ent); + + stripe->sd_ent = ent; + if (rc) { + LASSERT(!ent); + /* treat error as eof, so dir can be partially accessed */ + stripe->sd_eof = true; + LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, " + "directory is partially accessed!\n", + PFID(&ctxt->ldc_op_data->op_fid1), stripe_index, + rc); + } + + RETURN(ent); +} + +static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc != 0) + RETURN(rc); + + tgt = lmv_find_target(lmv, &data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_file_resync(tgt->ltd_exp, data); + RETURN(rc); +} + +/** + * Get dirent with the closest hash for striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to hash from all of sub-stripes, and it is only being called + * for striped directory. 
+ * + * \param[in] ctxt dir read context + * + * \retval dirent get the entry successfully + * NULL does not get the entry, normally it means + * it reaches the end of the directory, while read + * stripe dirent error is ignored to allow partial + * access. + */ +static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) +{ + struct stripe_dirent *stripe; + struct lu_dirent *ent = NULL; + int i; + int min = -1; + + /* TODO: optimize with k-way merge sort */ + for (i = 0; i < ctxt->ldc_count; i++) { + stripe = &ctxt->ldc_stripes[i]; + if (stripe->sd_eof) + continue; + + if (!stripe->sd_ent) { + stripe_dirent_load(ctxt, stripe, i); + if (!stripe->sd_ent) { + LASSERT(stripe->sd_eof); continue; - else - goto next; + } } - if (min_ent != NULL) { - if (le64_to_cpu(min_ent->lde_hash) > - le64_to_cpu(ent->lde_hash)) { - min_ent = ent; - kunmap(min_page); - put_page(min_page); - min_idx = i; - min_page = page; - } else { - kunmap(page); - put_page(page); - page = NULL; - } - } else { - min_ent = ent; - min_page = page; - min_idx = i; + if (min == -1 || + le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) > + le64_to_cpu(stripe->sd_ent->lde_hash)) { + min = i; + if (le64_to_cpu(stripe->sd_ent->lde_hash) == + ctxt->ldc_hash) + break; } } -out: - if (*ppage != NULL) { - kunmap(*ppage); - put_page(*ppage); + if (min != -1) { + stripe = &ctxt->ldc_stripes[min]; + ent = stripe->sd_ent; + /* pop found dirent */ + stripe->sd_ent = stripe_dirent_get(ctxt, lu_dirent_next(ent), + min); } - *stripe_offset = min_idx; - *entp = min_ent; - *ppage = min_page; - RETURN(rc); + + return ent; } /** - * Build dir entry page from a striped directory + * Build dir entry page for striped directory * * This function gets one entry by @offset from a striped directory. It will * read entries from all of stripes, and choose one closest to the required @@ -2215,12 +2566,11 @@ static int lmv_get_min_striped_entry(struct obd_export *exp, * and .. in a directory. * 2. op_data will be shared by all of stripes, instead of allocating new * one, so need to restore before reusing. - * 3. release the entry page if that is not being chosen. * * \param[in] exp obd export refer to LMV * \param[in] op_data hold those MD parameters of read_entry * \param[in] cb_op ldlm callback being used in enqueue in mdc_read_entry - * \param[out] ldp the entry being read + * \param[in] offset starting hash offset * \param[out] ppage the page holding the entry. 
Note: because the entry * will be accessed in upper layer, so we need hold the * page until the usages of entry is finished, see @@ -2229,124 +2579,117 @@ static int lmv_get_min_striped_entry(struct obd_export *exp, * retval =0 if get entry successfully * <0 cannot get entry */ -static int lmv_read_striped_page(struct obd_export *exp, +static int lmv_striped_read_page(struct obd_export *exp, struct md_op_data *op_data, struct md_callback *cb_op, __u64 offset, struct page **ppage) { - struct lu_fid master_fid = op_data->op_fid1; - struct inode *master_inode = op_data->op_data; - __u64 hash_offset = offset; - struct lu_dirpage *dp; - struct page *min_ent_page = NULL; - struct page *ent_page = NULL; - struct lu_dirent *ent; - void *area; - int ent_idx = 0; - struct lu_dirent *min_ent = NULL; - struct lu_dirent *last_ent; - size_t left_bytes; - int rc; + struct page *page = NULL; + struct lu_dirpage *dp; + void *start; + struct lu_dirent *ent; + struct lu_dirent *last_ent; + int stripe_count; + struct lmv_dir_ctxt *ctxt; + struct lu_dirent *next = NULL; + __u16 ent_size; + size_t left_bytes; + int rc = 0; ENTRY; /* Allocate a page and read entries from all of stripes and fill * the page by hash order */ - ent_page = alloc_page(GFP_KERNEL); - if (ent_page == NULL) + page = alloc_page(GFP_KERNEL); + if (!page) RETURN(-ENOMEM); /* Initialize the entry page */ - dp = kmap(ent_page); + dp = kmap(page); memset(dp, 0, sizeof(*dp)); dp->ldp_hash_start = cpu_to_le64(offset); - dp->ldp_flags |= LDF_COLLIDE; - area = dp + 1; + start = dp + 1; left_bytes = PAGE_SIZE - sizeof(*dp); - ent = area; + ent = start; last_ent = ent; - do { - __u16 ent_size; - /* Find the minum entry from all sub-stripes */ - rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset, - &ent_idx, &min_ent, - &min_ent_page); - if (rc != 0) - GOTO(out, rc); - - /* If it can not get minum entry, it means it already reaches - * the end of this directory */ - if (min_ent == NULL) { - last_ent->lde_reclen = 0; - hash_offset = MDS_DIR_END_OFF; - GOTO(out, rc); + /* initalize dir read context */ + stripe_count = op_data->op_mea1->lsm_md_stripe_count; + OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + if (!ctxt) + GOTO(free_page, rc = -ENOMEM); + ctxt->ldc_lmv = &exp->exp_obd->u.lmv; + ctxt->ldc_op_data = op_data; + ctxt->ldc_cb_op = cb_op; + ctxt->ldc_hash = offset; + ctxt->ldc_count = stripe_count; + + while (1) { + next = lmv_dirent_next(ctxt); + + /* end of directory */ + if (!next) { + ctxt->ldc_hash = MDS_DIR_END_OFF; + break; } + ctxt->ldc_hash = le64_to_cpu(next->lde_hash); - ent_size = le16_to_cpu(min_ent->lde_reclen); + ent_size = le16_to_cpu(next->lde_reclen); - /* the last entry lde_reclen is 0, but it might not - * the end of this entry of this temporay entry */ - if (ent_size == 0) + /* the last entry lde_reclen is 0, but it might not be the last + * one of this temporay dir page */ + if (!ent_size) ent_size = lu_dirent_calc_size( - le16_to_cpu(min_ent->lde_namelen), - le32_to_cpu(min_ent->lde_attrs)); - if (ent_size > left_bytes) { - last_ent->lde_reclen = cpu_to_le16(0); - hash_offset = le64_to_cpu(min_ent->lde_hash); - GOTO(out, rc); - } + le16_to_cpu(next->lde_namelen), + le32_to_cpu(next->lde_attrs)); + /* page full */ + if (ent_size > left_bytes) + break; - memcpy(ent, min_ent, ent_size); + memcpy(ent, next, ent_size); /* Replace . with master FID and Replace .. 
with the parent FID * of master object */ if (strncmp(ent->lde_name, ".", le16_to_cpu(ent->lde_namelen)) == 0 && le16_to_cpu(ent->lde_namelen) == 1) - fid_cpu_to_le(&ent->lde_fid, &master_fid); + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1); else if (strncmp(ent->lde_name, "..", le16_to_cpu(ent->lde_namelen)) == 0 && le16_to_cpu(ent->lde_namelen) == 2) fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); + CDEBUG(D_INODE, "entry %.*s hash %#llx\n", + le16_to_cpu(ent->lde_namelen), ent->lde_name, + le64_to_cpu(ent->lde_hash)); + left_bytes -= ent_size; ent->lde_reclen = cpu_to_le16(ent_size); last_ent = ent; ent = (void *)ent + ent_size; - hash_offset = le64_to_cpu(min_ent->lde_hash); - if (hash_offset == MDS_DIR_END_OFF) { - last_ent->lde_reclen = 0; - break; - } - } while (1); -out: - if (min_ent_page != NULL) { - kunmap(min_ent_page); - put_page(min_ent_page); - } + }; - if (unlikely(rc != 0)) { - __free_page(ent_page); - ent_page = NULL; - } else { - if (ent == area) - dp->ldp_flags |= LDF_EMPTY; - dp->ldp_flags = cpu_to_le32(dp->ldp_flags); - dp->ldp_hash_end = cpu_to_le64(hash_offset); - } + last_ent->lde_reclen = 0; - /* We do not want to allocate md_op_data during each - * dir entry reading, so op_data will be shared by every stripe, - * then we need to restore it back to original value before - * return to the upper layer */ - op_data->op_fid1 = master_fid; - op_data->op_fid2 = master_fid; - op_data->op_data = master_inode; + if (ent == start) + dp->ldp_flags |= LDF_EMPTY; + else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash)) + dp->ldp_flags |= LDF_COLLIDE; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash); - *ppage = ent_page; + put_lmv_dir_ctxt(ctxt); + OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); - RETURN(rc); + *ppage = page; + + RETURN(0); + +free_page: + kunmap(page); + __free_page(page); + + return rc; } int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, @@ -2361,7 +2704,7 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, ENTRY; if (unlikely(lsm != NULL)) { - rc = lmv_read_striped_page(exp, op_data, cb_op, offset, ppage); + rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); RETURN(rc); } @@ -2399,68 +2742,34 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, * negative errno if failed. 
*/ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - struct lmv_tgt_desc *parent_tgt = NULL; - struct mdt_body *body; - int rc; - int stripe_index = 0; - struct lmv_stripe_md *lsm = op_data->op_mea1; - ENTRY; - -retry_unlink: - /* For striped dir, we need to locate the parent as well */ - if (lsm != NULL) { - struct lmv_tgt_desc *tmp; - - LASSERT(op_data->op_name != NULL && - op_data->op_namelen != 0); - - tmp = lmv_locate_target_for_name(lmv, lsm, - op_data->op_name, - op_data->op_namelen, - &op_data->op_fid1, - &op_data->op_mds); - - /* return -EBADFD means unknown hash type, might - * need try all sub-stripe here */ - if (IS_ERR(tmp) && PTR_ERR(tmp) != -EBADFD) - RETURN(PTR_ERR(tmp)); - - /* Note: both migrating dir and unknown hash dir need to - * try all of sub-stripes, so we need start search the - * name from stripe 0, but migrating dir is already handled - * inside lmv_locate_target_for_name(), so we only check - * unknown hash type directory here */ - if (!lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { - struct lmv_oinfo *oinfo; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - } - } - -try_next_stripe: - /* Send unlink requests to the MDT where the child is located */ - if (likely(!fid_is_zero(&op_data->op_fid2))) - tgt = lmv_find_target(lmv, &op_data->op_fid2); - else if (lsm != NULL) - tgt = lmv_get_target(lmv, op_data->op_mds, NULL); - else - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *parent_tgt; + struct mdt_body *body; + int rc; - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + ENTRY; op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); +retry: + parent_tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (likely(!fid_is_zero(&op_data->op_fid2))) { + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = parent_tgt; + } + /* * If child's fid is given, cancel unused locks for it if it is from * another export than parent. @@ -2470,50 +2779,29 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, */ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - /* - * Cancel FULL locks on child (fid3). - */ - parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(parent_tgt)) - RETURN(PTR_ERR(parent_tgt)); - - if (parent_tgt != tgt) { + if (parent_tgt != tgt) rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); - } rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); - if (rc != 0) + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) RETURN(rc); CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); rc = md_unlink(tgt->ltd_exp, op_data, request); - if (rc != 0 && rc != -EREMOTE && rc != -ENOENT) - RETURN(rc); - - /* Try next stripe if it is needed. 
*/ - if (rc == -ENOENT && lsm != NULL && lmv_need_try_all_stripes(lsm)) { - struct lmv_oinfo *oinfo; - - stripe_index++; - if (stripe_index >= lsm->lsm_md_stripe_count) - RETURN(rc); - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { ptlrpc_req_finished(*request); *request = NULL; - - goto try_next_stripe; + goto retry; } + if (rc != -EREMOTE) + RETURN(rc); + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); if (body == NULL) RETURN(-EPROTO); @@ -2522,40 +2810,23 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, if (likely(!(body->mbo_valid & OBD_MD_MDS))) RETURN(rc); - CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", - exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); - - /* This is a remote object, try remote MDT, Note: it may - * try more than 1 time here, Considering following case - * /mnt/lustre is root on MDT0, remote1 is on MDT1 - * 1. Initially A does not know where remote1 is, it send - * unlink RPC to MDT0, MDT0 return -EREMOTE, it will - * resend unlink RPC to MDT1 (retry 1st time). - * - * 2. During the unlink RPC in flight, - * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 - * and create new remote1, but on MDT0 - * - * 3. MDT1 get unlink RPC(from A), then do remote lock on - * /mnt/lustre, then lookup get fid of remote1, and find - * it is remote dir again, and replay -EREMOTE again. - * - * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). - * - * In theory, it might try unlimited time here, but it should - * be very rare case. */ + /* This is a remote object, try remote MDT. */ op_data->op_fid2 = body->mbo_fid1; ptlrpc_req_finished(*request); *request = NULL; - goto retry_unlink; + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + goto retry; } static int lmv_precleanup(struct obd_device *obd) { ENTRY; libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); - fld_client_proc_fini(&obd->u.lmv.lmv_fld); + fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); lprocfs_obd_cleanup(obd); lprocfs_free_md_stats(obd); RETURN(0); @@ -2631,6 +2902,96 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, RETURN(-EINVAL); } +static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, + int *__rcs, struct ptlrpc_request_set *_set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request_set *set = _set; + struct lmv_obd *lmv = &obddev->u.lmv; + int tgt_count = lmv->desc.ld_tgt_count; + struct fid_array *fat, **fas = NULL; + int i, rc, **rcs = NULL; + + if (!set) { + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + /* split FIDs by targets */ + OBD_ALLOC(fas, sizeof(fas) * tgt_count); + if (fas == NULL) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC(rcs, sizeof(int *) * tgt_count); + if (rcs == NULL) + GOTO(out_fas, rc = -ENOMEM); + + for (i = 0; i < fa->fa_nr; i++) { + unsigned int idx; + + rc = lmv_fld_lookup(lmv, &fa->fa_fids[i], &idx); + if (rc) { + CDEBUG(D_OTHER, "can't lookup "DFID": rc = %d\n", + PFID(&fa->fa_fids[i]), rc); + continue; + } + LASSERT(idx < tgt_count); + if (!fas[idx]) + OBD_ALLOC(fas[idx], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (!fas[idx]) + GOTO(out, rc = -ENOMEM); + if (!rcs[idx]) + OBD_ALLOC(rcs[idx], sizeof(int) * fa->fa_nr); + if (!rcs[idx]) + GOTO(out, rc = -ENOMEM); + + fat = fas[idx]; + fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; + } + + for (i 
= 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + rc = md_rmfid(lmv->tgts[i]->ltd_exp, fat, rcs[i], set); + } + + rc = ptlrpc_set_wait(NULL, set); + if (rc == 0) { + int j = 0; + for (i = 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + /* copy FIDs back */ + memcpy(fa->fa_fids + j, fat->fa_fids, + fat->fa_nr * sizeof(struct lu_fid)); + /* copy rcs back */ + memcpy(__rcs + j, rcs[i], fat->fa_nr * sizeof(**rcs)); + j += fat->fa_nr; + } + } + if (set != _set) + ptlrpc_set_destroy(set); + +out: + for (i = 0; i < tgt_count; i++) { + if (fas && fas[i]) + OBD_FREE(fas[i], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (rcs && rcs[i]) + OBD_FREE(rcs[i], sizeof(int) * fa->fa_nr); + } + if (rcs) + OBD_FREE(rcs, sizeof(int *) * tgt_count); +out_fas: + if (fas) + OBD_FREE(fas, sizeof(fas) * tgt_count); + + RETURN(rc); +} + /** * Asynchronously set by key a value associated with a LMV device. * @@ -2705,13 +3066,15 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, else lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + lsm->lsm_md_migrate_offset = le32_to_cpu(lmm1->lmv_migrate_offset); + lsm->lsm_md_migrate_hash = le32_to_cpu(lmm1->lmv_migrate_hash); cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, sizeof(lsm->lsm_md_pool_name)); if (cplen >= sizeof(lsm->lsm_md_pool_name)) RETURN(-E2BIG); - CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %d" + CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %#x " "layout_version %d\n", lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, lsm->lsm_md_layout_version); @@ -2720,10 +3083,22 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, for (i = 0; i < stripe_count; i++) { fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, &lmm1->lmv_stripe_fids[i]); + /* + * set default value -1, so lmv_locate_tgt() knows this stripe + * target is not initialized. 
+ */ + lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1; + if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) + continue; + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, &lsm->lsm_md_oinfo[i].lmo_mds); - if (rc != 0) + if (rc == -ENOENT) + continue; + + if (rc) RETURN(rc); + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); } @@ -2746,12 +3121,9 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, /* Free memmd */ if (lsm != NULL && lmm == NULL) { int i; + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - /* For migrating inode, the master stripe and master - * object will be the same, so do not need iput, see - * ll_update_lsm_md */ - if (!(lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && - i == 0) && lsm->lsm_md_oinfo[i].lmo_root != NULL) + if (lsm->lsm_md_oinfo[i].lmo_root) iput(lsm->lsm_md_oinfo[i].lmo_root); } lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); @@ -2963,35 +3335,34 @@ int lmv_clear_open_replay_data(struct obd_export *exp, int lmv_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo) { - struct md_op_data *op_data = &minfo->mi_data; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *ptgt = NULL; - struct lmv_tgt_desc *ctgt = NULL; - int rc; + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt; + struct lmv_tgt_desc *ctgt; + int rc; + ENTRY; if (!fid_is_sane(&op_data->op_fid2)) RETURN(-EINVAL); - ptgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + ptgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(ptgt)) RETURN(PTR_ERR(ptgt)); - ctgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + ctgt = lmv_find_target(lmv, &op_data->op_fid2); if (IS_ERR(ctgt)) RETURN(PTR_ERR(ctgt)); - /* - * if child is on remote MDT, we need 2 async RPCs to fetch both LOOKUP - * lock on parent, and UPDATE lock on child MDT, which makes all - * complicated. Considering remote dir is rare case, and not supporting - * it in statahead won't cause any issue, drop its support for now. + /* remote object needs two RPCs to lookup and getattr, considering the + * complexity, don't support statahead for now. 
*/ if (ptgt != ctgt) - RETURN(-ENOTSUPP); + RETURN(-EREMOTE); rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); + RETURN(rc); } @@ -3019,7 +3390,7 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, const struct lmv_oinfo *oinfo; LASSERT(lsm != NULL); - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); if (IS_ERR(oinfo)) return PTR_ERR(oinfo); @@ -3096,6 +3467,9 @@ static int lmv_merge_attr(struct obd_export *exp, for (i = 0; i < lsm->lsm_md_stripe_count; i++) { struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + if (!inode) + continue; + CDEBUG(D_INFO, "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", PFID(&lsm->lsm_md_oinfo[i].lmo_fid), @@ -3156,6 +3530,7 @@ struct md_ops lmv_md_ops = { .m_setattr = lmv_setattr, .m_setxattr = lmv_setxattr, .m_fsync = lmv_fsync, + .m_file_resync = lmv_file_resync, .m_read_page = lmv_read_page, .m_unlink = lmv_unlink, .m_init_ea_size = lmv_init_ea_size, @@ -3171,6 +3546,7 @@ struct md_ops lmv_md_ops = { .m_revalidate_lock = lmv_revalidate_lock, .m_get_fid_from_lsm = lmv_get_fid_from_lsm, .m_unpackmd = lmv_unpackmd, + .m_rmfid = lmv_rmfid, }; static int __init lmv_init(void) diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c index 37c22a92de716..dc35e7d9d9e66 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,63 +32,58 @@ #define DEBUG_SUBSYSTEM S_CLASS -#include #include -#include +#include #include #include #include "lmv_internal.h" -#ifndef CONFIG_PROC_FS -static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; -static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; -#else -static int lmv_numobd_seq_show(struct seq_file *m, void *v) +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lmv_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_desc *desc; - LASSERT(dev != NULL); desc = &dev->u.lmv.desc; - seq_printf(m, "%u\n", desc->ld_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_tgt_count); } -LPROC_SEQ_FOPS_RO(lmv_numobd); +LUSTRE_RO_ATTR(numobd); -static int lmv_activeobd_seq_show(struct seq_file *m, void *v) +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lmv_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_desc *desc; - LASSERT(dev != NULL); desc = &dev->u.lmv.desc; - seq_printf(m, "%u\n", desc->ld_active_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_active_tgt_count); } -LPROC_SEQ_FOPS_RO(lmv_activeobd); +LUSTRE_RO_ATTR(activeobd); -static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device*)m->private; - struct lmv_obd *lmv; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_desc 
*desc; - LASSERT(dev != NULL); - lmv = &dev->u.lmv; - seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); - return 0; + desc = &dev->u.lmv.desc; + return sprintf(buf, "%s\n", desc->ld_uuid.uuid); } -LPROC_SEQ_FOPS_RO(lmv_desc_uuid); +LUSTRE_RO_ATTR(desc_uuid); +#ifdef CONFIG_PROC_FS static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; struct lmv_obd *lmv = &dev->u.lmv; while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos] != NULL) + if (lmv->tgts[*pos]) return lmv->tgts[*pos]; - ++*pos; } @@ -97,7 +92,6 @@ static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) static void lmv_tgt_seq_stop(struct seq_file *p, void *v) { - return; } static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) @@ -107,9 +101,8 @@ static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) ++*pos; while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos] != NULL) + if (lmv->tgts[*pos]) return lmv->tgts[*pos]; - ++*pos; } @@ -120,10 +113,12 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v) { struct lmv_tgt_desc *tgt = v; - if (tgt == NULL) + if (!tgt) return 0; - seq_printf(p, "%u: %s %sACTIVE\n", tgt->ltd_idx, - tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN"); + + seq_printf(p, "%u: %s %sACTIVE\n", + tgt->ltd_idx, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); return 0; } @@ -148,21 +143,7 @@ static int lmv_target_seq_open(struct inode *inode, struct file *file) return 0; } -LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid); - -struct lprocfs_vars lprocfs_lmv_obd_vars[] = { - { .name = "numobd", - .fops = &lmv_numobd_fops }, - { .name = "activeobd", - .fops = &lmv_activeobd_fops }, - { .name = "uuid", - .fops = &lmv_uuid_fops }, - { .name = "desc_uuid", - .fops = &lmv_desc_uuid_fops }, - { NULL } -}; - -const struct proc_ops lmv_proc_target_fops = { +static const struct proc_ops lmv_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lmv_target_seq_open, .proc_read = seq_read, @@ -170,3 +151,39 @@ const struct proc_ops lmv_proc_target_fops = { .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ + +static struct attribute *lmv_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_numobd.attr, + NULL, +}; + +int lmv_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = lmv_attrs; + rc = lprocfs_obd_setup(obd, true); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } + + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) { + lprocfs_free_md_stats(obd); + lprocfs_obd_cleanup(obd); + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + rc = 0; + } +#endif /* CONFIG_PROC_FS */ +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile index e74389ed4c3e3..dae11b1647cbe 100644 --- a/drivers/staging/lustrefsx/lustre/lov/Makefile +++ b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_LUSTREFSX_FS) += lov.o lov-y := lov_dev.o lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o -lov-y += lov_request.o lovsub_dev.o lovsub_lock.o lovsub_object.o -lov-y += lovsub_page.o lproc_lov.o +lov-y += lov_request.o lovsub_dev.o lovsub_object.o +lov-y += lproc_lov.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff 
--git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h index 0e84ab38e189a..62ee46daed68f 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -81,7 +81,6 @@ struct lovsub_device; struct lovsub_object; -struct lovsub_lock; enum lov_device_flags { LOV_DEV_INITIALIZED = 1 << 0 @@ -91,6 +90,12 @@ enum lov_device_flags { * Upper half. */ +/* Data-on-MDT array item in lov_device::ld_md_tgts[] */ +struct lovdom_device { + struct cl_device *ldm_mdc; + int ldm_idx; +}; + struct lov_device { /* * XXX Locking of lov-private data is missing. @@ -101,6 +106,13 @@ struct lov_device { __u32 ld_target_nr; struct lovsub_device **ld_target; __u32 ld_flags; + + /* Data-on-MDT devices */ + __u32 ld_md_tgts_nr; + struct lovdom_device *ld_md_tgts; + struct obd_device *ld_lmv; + /* LU site for subdevices */ + struct lu_site ld_site; }; /** @@ -129,15 +141,48 @@ static inline char *llt2str(enum lov_layout_type llt) return ""; } +/** + * Return lov_layout_entry_type associated with a given composite layout + * entry. + */ +static inline __u32 lov_entry_type(struct lov_stripe_md_entry *lsme) +{ + if ((lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_RAID0) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT)) + return lov_pattern(lsme->lsme_pattern); + return 0; +} + +struct lov_layout_entry; +struct lov_object; +struct lov_lock_sub; + +struct lov_comp_layout_entry_ops { + int (*lco_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle); + void (*lco_fini)(const struct lu_env *env, + struct lov_layout_entry *lle); + int (*lco_getattr)(const struct lu_env *env, struct lov_object *obj, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **attr); +}; + struct lov_layout_raid0 { unsigned lo_nr; + /** + * record the stripe no before the truncate size, used for setting OST + * object size for truncate. LU-14128. + */ + int lo_trunc_stripeno; /** * When this is true, lov_object::lo_attr contains * valid up to date attributes for a top-level * object. This field is reset to 0 when attributes of * any sub-object change. */ - int lo_attr_valid; + bool lo_attr_valid; /** * Array of sub-objects. Allocated when top-object is * created (lov_init_raid0()). @@ -165,6 +210,38 @@ struct lov_layout_raid0 { struct cl_attr lo_attr; }; +struct lov_layout_dom { + /* keep this always at first place so DOM layout entry + * can be addressed also as RAID0 after initialization. 
+ */ + struct lov_layout_raid0 lo_dom_r0; + struct lovsub_object *lo_dom; + struct lov_oinfo *lo_loi; +}; + +struct lov_layout_entry { + __u32 lle_type; + unsigned int lle_valid:1; + struct lu_extent *lle_extent; + struct lov_stripe_md_entry *lle_lsme; + struct lov_comp_layout_entry_ops *lle_comp_ops; + union { + struct lov_layout_raid0 lle_raid0; + struct lov_layout_dom lle_dom; + }; +}; + +struct lov_mirror_entry { + unsigned short lre_mirror_id; + unsigned short lre_preferred:1, + lre_stale:1, /* set if any components is stale */ + lre_valid:1; /* set if at least one of components + * in this mirror is valid */ + unsigned short lre_start; /* index to lo_entries, start index of + * this mirror */ + unsigned short lre_end; /* end index of this mirror */ +}; + /** * lov-specific file state. * @@ -180,7 +257,7 @@ struct lov_layout_raid0 { * function corresponding to the current layout type. */ struct lov_object { - struct cl_object lo_cl; + struct cl_object lo_cl; /** * Serializes object operations with transitions between layout types. * @@ -220,13 +297,37 @@ struct lov_object { } released; struct lov_layout_composite { /** - * Current valid entry count of lo_entries. + * flags of lov_comp_md_v1::lcm_flags. Mainly used + * by FLR. + */ + uint32_t lo_flags; + /** + * For FLR: index of preferred mirror to read. + * Preferred mirror is initialized by the preferred + * bit of lsme. It can be changed when the preferred + * is inaccessible. + * In order to make lov_lsm_entry() return the same + * mirror in the same IO context, it's only possible + * to change the preferred mirror when the + * lo_active_ios reaches zero. + */ + int lo_preferred_mirror; + /** + * For FLR: the lock to protect access to + * lo_preferred_mirror. */ - unsigned int lo_entry_count; - struct lov_layout_entry { - struct lu_extent lle_extent; - struct lov_layout_raid0 lle_raid0; - } *lo_entries; + spinlock_t lo_write_lock; + /** + * For FLR: Number of (valid) mirrors. + */ + unsigned lo_mirror_count; + struct lov_mirror_entry *lo_mirrors; + /** + * Current entry count of lo_entries, include + * invalid entries. 
+ */ + unsigned int lo_entry_count; + struct lov_layout_entry *lo_entries; } composite; } u; /** @@ -236,11 +337,80 @@ struct lov_object { struct task_struct *lo_owner; }; -#define lov_foreach_layout_entry(lov, entry) \ - for (entry = &lov->u.composite.lo_entries[0]; \ - entry < &lov->u.composite.lo_entries \ - [lov->u.composite.lo_entry_count]; \ - entry++) +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + +static inline unsigned lov_flr_state(const struct lov_object *lov) +{ + if (lov->lo_type != LLT_COMP) + return LCM_FL_NONE; + + return lov->u.composite.lo_flags & LCM_FL_FLR_MASK; +} + +static inline bool lov_is_flr(const struct lov_object *lov) +{ + return lov_flr_state(lov) != LCM_FL_NONE; +} + +static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i]; +} + +#define lov_for_layout_entry(lov, entry, start, end) \ + for (entry = lov_entry(lov, start); \ + entry <= lov_entry(lov, end); entry++) + +#define lov_foreach_layout_entry(lov, entry) \ + lov_for_layout_entry(lov, entry, 0, \ + (lov)->u.composite.lo_entry_count - 1) + +#define lov_foreach_mirror_layout_entry(lov, entry, lre) \ + lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end) + +static inline struct lov_mirror_entry * +lov_mirror_entry(struct lov_object *lov, int i) +{ + LASSERT(i < lov->u.composite.lo_mirror_count); + return &lov->u.composite.lo_mirrors[i]; +} + +#define lov_foreach_mirror_entry(lov, lre) \ + for (lre = lov_mirror_entry(lov, 0); \ + lre <= lov_mirror_entry(lov, \ + lov->u.composite.lo_mirror_count - 1); \ + lre++) + +static inline unsigned +lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry) +{ + struct lov_layout_entry *first = &lov->u.composite.lo_entries[0]; + unsigned index = (unsigned)(entry - first); + + LASSERT(entry >= first); + LASSERT(index < lov->u.composite.lo_entry_count); + + return index; +} /** * State lov_lock keeps for each sub-lock. @@ -270,6 +440,8 @@ struct lov_page { struct cl_page_slice lps_cl; /** layout_entry + stripe index, composed using lov_comp_index() */ unsigned int lps_index; + /* the layout gen when this page was created */ + __u32 lps_layout_gen; }; /* @@ -288,13 +460,6 @@ struct lovsub_object { int lso_index; }; -/** - * Lock state at lovsub layer. - */ -struct lovsub_lock { - struct cl_lock_slice lss_cl; -}; - /** * Describe the environment settings for sublocks. */ @@ -303,11 +468,6 @@ struct lov_sublock_env { struct cl_io *lse_io; }; -struct lovsub_page { - struct cl_page_slice lsb_cl; -}; - - struct lov_thread_info { struct cl_object_conf lti_stripe_conf; struct lu_fid lti_fid; @@ -356,6 +516,26 @@ struct lov_io_sub { struct lov_io { /** super-class */ struct cl_io_slice lis_cl; + + /** + * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true. + * + * The mirror index of this io. Preserved over cl_io_init() + * if io->ci_ndelay_tried is greater than zero. 
+ */ + int lis_mirror_index; + /** + * FLR: the layout gen when lis_mirror_index was cached. The + * mirror index makes sense only when the layout gen doesn't + * change. + */ + int lis_mirror_layout_gen; + + /** + * fields below this will be initialized in lov_io_init(). + */ + unsigned lis_preserved; + /** * Pointer to the object slice. This is a duplicate of * lov_io::lis_cl::cis_object. @@ -398,6 +578,7 @@ struct lov_io { * All sub-io's created in this lov_io. */ struct list_head lis_subios; + }; struct lov_session { @@ -416,7 +597,6 @@ extern struct kmem_cache *lov_object_kmem; extern struct kmem_cache *lov_thread_kmem; extern struct kmem_cache *lov_session_kmem; -extern struct kmem_cache *lovsub_lock_kmem; extern struct kmem_cache *lovsub_object_kmem; int lov_object_init (const struct lu_env *env, struct lu_object *obj, @@ -427,8 +607,6 @@ int lov_lock_init (const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, const struct cl_io *io); int lov_io_init (const struct lu_env *env, struct cl_object *obj, struct cl_io *io); -int lovsub_lock_init (const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, const struct cl_io *io); @@ -446,8 +624,6 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, int lov_page_init (const struct lu_env *env, struct cl_object *ob, struct cl_page *page, pgoff_t index); -int lovsub_page_init (const struct lu_env *env, struct cl_object *ob, - struct cl_page *page, pgoff_t index); int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index); int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, @@ -461,11 +637,27 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env, struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); int lov_page_stripe(const struct cl_page *page); +bool lov_page_is_empty(const struct cl_page *page); int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); +int lov_io_layout_at(struct lov_io *lio, __u64 offset); #define lov_foreach_target(lov, var) \ for (var = 0; var < lov_targets_nr(lov); ++var) +static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i) +{ + return &lov_lse(io->lis_object, i)->lsme_extent; +} + +/** + * For layout entries within @ext. + */ +#define lov_foreach_io_layout(ind, lio, ext) \ + for (ind = lov_io_layout_at(lio, (ext)->e_start); \ + ind >= 0 && \ + lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \ + ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end)) + /***************************************************************************** * * Type conversions. 
@@ -575,22 +767,6 @@ static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) return container_of0(obj, struct lovsub_object, lso_cl.co_lu); } -static inline struct lovsub_lock * -cl2lovsub_lock(const struct cl_lock_slice *slice) -{ - LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); - return container_of(slice, struct lovsub_lock, lss_cl); -} - -static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) -{ - const struct cl_lock_slice *slice; - - slice = cl_lock_at(lock, &lovsub_device_type); - LASSERT(slice != NULL); - return cl2lovsub_lock(slice); -} - static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) { LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); @@ -603,13 +779,6 @@ static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) return container_of0(slice, struct lov_page, lps_cl); } -static inline struct lovsub_page * -cl2lovsub_page(const struct cl_page_slice *slice) -{ - LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct lovsub_page, lsb_cl); -} - static inline struct lov_io *cl2lov_io(const struct lu_env *env, const struct cl_io_slice *ios) { @@ -634,23 +803,6 @@ static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) return info; } -static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_type == LLT_COMP); - LASSERTF(i < lov->u.composite.lo_entry_count, - "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); - - return &lov->u.composite.lo_entries[i].lle_raid0; -} - -static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_lsm != NULL); - LASSERT(i < lov->lo_lsm->lsm_entry_count); - - return lov->lo_lsm->lsm_entries[i]; -} - /* lov_pack.c */ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, struct lov_stripe_md *lsm, struct lov_user_md __user *lump, diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c index 2506c39ec7296..1faef7ad76afa 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,43 +46,37 @@ struct kmem_cache *lov_object_kmem; struct kmem_cache *lov_thread_kmem; struct kmem_cache *lov_session_kmem; -struct kmem_cache *lovsub_lock_kmem; struct kmem_cache *lovsub_object_kmem; struct lu_kmem_descr lov_caches[] = { - { - .ckd_cache = &lov_lock_kmem, - .ckd_name = "lov_lock_kmem", - .ckd_size = sizeof (struct lov_lock) - }, - { - .ckd_cache = &lov_object_kmem, - .ckd_name = "lov_object_kmem", - .ckd_size = sizeof (struct lov_object) - }, - { - .ckd_cache = &lov_thread_kmem, - .ckd_name = "lov_thread_kmem", - .ckd_size = sizeof (struct lov_thread_info) - }, - { - .ckd_cache = &lov_session_kmem, - .ckd_name = "lov_session_kmem", - .ckd_size = sizeof (struct lov_session) - }, - { - .ckd_cache = &lovsub_lock_kmem, - .ckd_name = "lovsub_lock_kmem", - .ckd_size = sizeof (struct lovsub_lock) - }, - { - .ckd_cache = &lovsub_object_kmem, - .ckd_name = "lovsub_object_kmem", - .ckd_size = sizeof (struct lovsub_object) - }, - { - .ckd_cache = NULL - } + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = NULL + } }; /***************************************************************************** @@ -97,7 +91,7 @@ static void *lov_key_init(const struct lu_context *ctx, struct lov_thread_info *info; OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); - if (info == NULL) + if (!info) info = ERR_PTR(-ENOMEM); return info; } @@ -110,9 +104,9 @@ static void lov_key_fini(const struct lu_context *ctx, } struct lu_context_key lov_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = lov_key_init, - .lct_fini = lov_key_fini + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini }; static void *lov_session_key_init(const struct lu_context *ctx, @@ -121,113 +115,180 @@ static void *lov_session_key_init(const struct lu_context *ctx, struct lov_session *info; OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); - if (info == NULL) + if (!info) info = ERR_PTR(-ENOMEM); return info; } static void lov_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) + struct lu_context_key *key, void *data) { - struct lov_session *info = data; - OBD_SLAB_FREE_PTR(info, lov_session_kmem); + struct lov_session *info = data; + + OBD_SLAB_FREE_PTR(info, lov_session_kmem); } struct lu_context_key lov_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = lov_session_key_init, - .lct_fini = lov_session_key_fini + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini }; /* type constructor/destructor: lov_type_{init,fini,start,stop}() */ LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static int lov_mdc_dev_init(const struct lu_env *env, struct lov_device *ld, + struct lu_device *mdc_dev, __u32 idx, __u32 nr) +{ + struct cl_device *cl; + + ENTRY; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + mdc_dev); + if (IS_ERR(cl)) + RETURN(PTR_ERR(cl)); + + ld->ld_md_tgts[nr].ldm_mdc = 
cl; + ld->ld_md_tgts[nr].ldm_idx = idx; + RETURN(0); +} + static struct lu_device *lov_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - int i; - struct lov_device *ld = lu2lov_dev(d); + struct lov_device *ld = lu2lov_dev(d); + int i; - LASSERT(ld->ld_lov != NULL); - if (ld->ld_target == NULL) - RETURN(NULL); + LASSERT(ld->ld_lov != NULL); - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; + if (ld->ld_lmv) { + class_decref(ld->ld_lmv, "lov", d); + ld->ld_lmv = NULL; + } - lsd = ld->ld_target[i]; - if (lsd != NULL) { - cl_stack_fini(env, lovsub2cl_dev(lsd)); - ld->ld_target[i] = NULL; - } - } - RETURN(NULL); + if (ld->ld_md_tgts) { + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (!ld->ld_md_tgts[i].ldm_mdc) + continue; + + cl_stack_fini(env, ld->ld_md_tgts[i].ldm_mdc); + ld->ld_md_tgts[i].ldm_mdc = NULL; + ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc = NULL; + } + } + + if (ld->ld_target) { + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + } + RETURN(NULL); } static int lov_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) + const char *name, struct lu_device *next) { - struct lov_device *ld = lu2lov_dev(d); - int i; - int rc = 0; - - LASSERT(d->ld_site != NULL); - if (ld->ld_target == NULL) - RETURN(rc); - - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; - struct cl_device *cl; - struct lov_tgt_desc *desc; - - desc = ld->ld_lov->lov_tgts[i]; - if (desc == NULL) - continue; - - cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, - desc->ltd_obd->obd_lu_dev); - if (IS_ERR(cl)) { - rc = PTR_ERR(cl); - break; - } - lsd = cl2lovsub_dev(cl); - ld->ld_target[i] = lsd; - } + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + /* check all added already MDC subdevices and initialize them */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + struct obd_device *mdc; + __u32 idx; + + mdc = ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc; + idx = ld->ld_lov->lov_mdc_tgts[i].lmtd_index; + + if (!mdc) + continue; + + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, i); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + d->ld_obd->obd_name, + obd_uuid2str(&mdc->obd_uuid), rc); + GOTO(out_err, rc); + } + } + + if (!ld->ld_target) + RETURN(0); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (!desc) + continue; - if (rc) - lov_device_fini(env, d); - else - ld->ld_flags |= LOV_DEV_INITIALIZED; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) + GOTO(out_err, rc = PTR_ERR(cl)); - RETURN(rc); + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } + ld->ld_flags |= LOV_DEV_INITIALIZED; + RETURN(0); + +out_err: + lu_device_fini(d); + RETURN(rc); } /* Free the lov specific data created for the back end lu_device. 
*/ static struct lu_device *lov_device_free(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { struct lov_device *ld = lu2lov_dev(d); const int nr = ld->ld_target_nr; + lu_site_fini(&ld->ld_site); + cl_device_fini(lu2cl_dev(d)); - if (ld->ld_target != NULL) + if (ld->ld_target) { OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); + ld->ld_target = NULL; + } + if (ld->ld_md_tgts) { + OBD_FREE(ld->ld_md_tgts, + sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; + } + /* free array of MDCs */ + if (ld->ld_lov->lov_mdc_tgts) { + OBD_FREE(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; + } OBD_FREE_PTR(ld); return NULL; } static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) + __u32 index) { - struct lov_device *ld = lu2lov_dev(dev); - ENTRY; + struct lov_device *ld = lu2lov_dev(dev); - if (ld->ld_target[index] != NULL) { - cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); - ld->ld_target[index] = NULL; - } - EXIT; + ENTRY; + + if (ld->ld_target[index]) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; } static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) @@ -245,7 +306,7 @@ static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) const size_t sz = sizeof(newd[0]); OBD_ALLOC(newd, tgt_size * sz); - if (newd != NULL) { + if (newd) { if (sub_size > 0) { memcpy(newd, dev->ld_target, sub_size * sz); OBD_FREE(dev->ld_target, sub_size * sz); @@ -262,32 +323,31 @@ static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) } static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) + __u32 index) { - struct obd_device *obd = dev->ld_obd; - struct lov_device *ld = lu2lov_dev(dev); - struct lov_tgt_desc *tgt; - struct lovsub_device *lsd; - struct cl_device *cl; - int rc; - ENTRY; - - obd_getref(obd); - - tgt = obd->u.lov.lov_tgts[index]; - LASSERT(tgt != NULL); - LASSERT(tgt->ltd_obd != NULL); - - if (!tgt->ltd_obd->obd_set_up) { - CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); - RETURN(-EINVAL); - } + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + ENTRY; + + lov_tgts_getref(obd); - rc = lov_expand_targets(env, ld); - if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { - LASSERT(dev->ld_site != NULL); + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); - cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, tgt->ltd_obd->obd_lu_dev); if (!IS_ERR(cl)) { lsd = cl2lovsub_dev(cl); @@ -299,94 +359,239 @@ static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, rc = PTR_ERR(cl); } } - obd_putref(obd); - RETURN(rc); + + lov_tgts_putref(obd); + + RETURN(rc); +} + +/** + * Add new MDC target device in LOV. + * + * This function is part of the configuration log processing. It adds new MDC + * device to the MDC device array indexed by their indexes. 
+ * + * \param[in] env execution environment + * \param[in] d LU device of LOV device + * \param[in] mdc MDC device to add + * \param[in] idx MDC device index + * + * \retval 0 if successful + * \retval negative value on error + */ +static int lov_add_mdc_target(const struct lu_env *env, struct lu_device *d, + struct obd_device *mdc, __u32 idx) +{ + struct lov_device *ld = lu2lov_dev(d); + struct obd_device *lov_obd = d->ld_obd; + struct obd_device *lmv_obd; + int next; + int rc = 0; + + ENTRY; + + LASSERT(mdc != NULL); + if (ld->ld_md_tgts_nr == LOV_MDC_TGT_MAX) { + /* + * If the maximum value of LOV_MDC_TGT_MAX will become too + * small then all MD target handling must be rewritten in LOD + * manner, check lod_add_device() and related functionality. + */ + CERROR("%s: cannot serve more than %d MDC devices\n", + lov_obd->obd_name, LOV_MDC_TGT_MAX); + RETURN(-ERANGE); + } + + /* + * grab FLD from lmv, do that here, when first MDC is added + * to be sure LMV is set up and can be found + */ + if (!ld->ld_lmv) { + next = 0; + while ((lmv_obd = class_devices_in_group(&lov_obd->obd_uuid, + &next)) != NULL) { + if ((strncmp(lmv_obd->obd_type->typ_name, + LUSTRE_LMV_NAME, + strlen(LUSTRE_LMV_NAME)) == 0)) + break; + } + if (!lmv_obd) { + CERROR("%s: cannot find LMV OBD by UUID (%s)\n", + lov_obd->obd_name, + obd_uuid2str(&lmv_obd->obd_uuid)); + RETURN(-ENODEV); + } + spin_lock(&lmv_obd->obd_dev_lock); + class_incref(lmv_obd, "lov", ld); + spin_unlock(&lmv_obd->obd_dev_lock); + ld->ld_lmv = lmv_obd; + } + + LASSERT(lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc == + NULL); + + if (ld->ld_flags & LOV_DEV_INITIALIZED) { + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, + ld->ld_md_tgts_nr); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + lov_obd->obd_name, obd_uuid2str(&mdc->obd_uuid), + rc); + RETURN(rc); + } + } + + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc = mdc; + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_index = idx; + ld->ld_md_tgts_nr++; + + RETURN(rc); } static int lov_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) + struct lu_device *d, struct lustre_cfg *cfg) { - struct obd_device *obd = d->ld_obd; - int cmd; - int rc; - int gen; - __u32 index; - - obd_getref(obd); - - cmd = cfg->lcfg_command; - rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); - if (rc == 0) { - switch(cmd) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - rc = lov_cl_add_target(env, d, index); - if (rc != 0) - lov_del_target(d->ld_obd, index, NULL, 0); - break; - case LCFG_LOV_DEL_OBD: - lov_cl_del_target(env, d, index); - break; - } - } - obd_putref(obd); - RETURN(rc); + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + u32 index; + + lov_tgts_getref(obd); + + cmd = cfg->lcfg_command; + + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc < 0) + GOTO(out, rc); + + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + case LCFG_ADD_MDC: + { + struct obd_device *mdc; + struct obd_uuid tgt_uuid; + + /* + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID + */ + if (LUSTRE_CFG_BUFLEN(cfg, 1) > sizeof(tgt_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&tgt_uuid, lustre_cfg_buf(cfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(cfg, 2), 
10, &index); + if (rc) + GOTO(out, rc); + + mdc = class_find_client_obd(&tgt_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc) + GOTO(out, rc = -ENODEV); + rc = lov_add_mdc_target(env, d, mdc, index); + break; + } + } +out: + lov_tgts_putref(obd); + RETURN(rc); } static const struct lu_device_operations lov_lu_ops = { - .ldo_object_alloc = lov_object_alloc, - .ldo_process_config = lov_process_config, + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, }; static struct lu_device *lov_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) + struct lu_device_type *t, + struct lustre_cfg *cfg) { - struct lu_device *d; - struct lov_device *ld; - struct obd_device *obd; - int rc; + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; - OBD_ALLOC_PTR(ld); - if (ld == NULL) - RETURN(ERR_PTR(-ENOMEM)); + OBD_ALLOC_PTR(ld); + if (!ld) + RETURN(ERR_PTR(-ENOMEM)); cl_device_init(&ld->ld_cl, t); d = lov2lu_dev(ld); d->ld_ops = &lov_lu_ops; - /* setup the LOV OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd != NULL); - rc = lov_setup(obd, cfg); - if (rc) { - lov_device_free(env, d); - RETURN(ERR_PTR(rc)); - } + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) + GOTO(out, rc); + + /* Alloc MDC devices array */ + /* XXX: need dynamic allocation at some moment */ + OBD_ALLOC(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + if (!ld->ld_md_tgts) + GOTO(out, rc = -ENOMEM); + + ld->ld_md_tgts_nr = 0; + + ld->ld_lov = &obd->u.lov; + OBD_ALLOC(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + if (!ld->ld_lov->lov_mdc_tgts) + GOTO(out_md_tgts, rc = -ENOMEM); + + rc = lu_site_init(&ld->ld_site, d); + if (rc != 0) + GOTO(out_mdc_tgts, rc); + + rc = lu_site_init_finish(&ld->ld_site); + if (rc != 0) + GOTO(out_site, rc); + + RETURN(d); +out_site: + lu_site_fini(&ld->ld_site); +out_mdc_tgts: + OBD_FREE(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; +out_md_tgts: + OBD_FREE(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; +out: + OBD_FREE_PTR(ld); - ld->ld_lov = &obd->u.lov; - RETURN(d); + return ERR_PTR(rc); } static const struct lu_device_type_operations lov_device_type_ops = { - .ldto_init = lov_type_init, - .ldto_fini = lov_type_fini, + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, - .ldto_start = lov_type_start, - .ldto_stop = lov_type_stop, + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, - .ldto_device_alloc = lov_device_alloc, - .ldto_device_free = lov_device_free, + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, - .ldto_device_init = lov_device_init, - .ldto_device_fini = lov_device_fini + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini }; struct lu_device_type lov_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOV_NAME, - .ldt_ops = &lov_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD }; /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c index 5b50b0a9294dc..1d388637d0235 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c +++ 
b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,9 +41,6 @@ #include #include -#include -#include - #include "lov_internal.h" static inline void @@ -53,8 +50,10 @@ lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) dst->e_end = le64_to_cpu(src->e_end); } -/* Find minimum stripe maxbytes value. For inactive or - * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. */ +/* + * Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. + */ static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) { struct obd_import *imp; @@ -64,11 +63,12 @@ static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) return maxbytes; imp = tgt->ltd_obd->u.cli.cl_import; - if (imp == NULL) + if (!imp) return maxbytes; spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL && + if ((imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE) && (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && imp->imp_connect_data.ocd_maxbytes > 0) maxbytes = imp->imp_connect_data.ocd_maxbytes; @@ -93,7 +93,8 @@ static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size, return -EINVAL; } - if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { CERROR("bad striping pattern\n"); lov_dump_lmm_common(D_WARNING, lmm); return -EINVAL; @@ -184,7 +185,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); OBD_ALLOC_LARGE(lsme, lsme_size); - if (lsme == NULL) + if (!lsme) RETURN(ERR_PTR(-ENOMEM)); lsme->lsme_magic = magic; @@ -195,7 +196,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); - if (pool_name != NULL) { + if (pool_name) { size_t pool_name_len; pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, @@ -204,12 +205,22 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, GOTO(out_lsme, rc = -E2BIG); } + /* with Data-on-MDT set maxbytes to stripe size */ + if (lsme_is_dom(lsme)) { + if (maxbytes) { + lov_bytes = lsme->lsme_stripe_size; + goto out_dom1; + } else { + goto out_dom2; + } + } + for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi; struct lov_tgt_desc *ltd; OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); - if (loi == NULL) + if (!loi) GOTO(out_lsme, rc = -ENOMEM); lsme->lsme_oinfo[i] = loi; @@ -230,7 +241,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, } ltd = lov->lov_tgts[loi->loi_ost_idx]; - if (ltd == NULL) { + if (!ltd) { CERROR("%s: OST index %d missing\n", (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); lov_dump_lmm_v1(D_WARNING, lmm); @@ -242,17 +253,21 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, min_stripe_maxbytes = lov_bytes; } - if (min_stripe_maxbytes == 0) - min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + if (maxbytes) { + if (min_stripe_maxbytes == 0) + min_stripe_maxbytes = 
LUSTRE_EXT3_STRIPE_MAXBYTES; - lov_bytes = min_stripe_maxbytes * stripe_count; + if (stripe_count == 0) + stripe_count = lov->desc.ld_tgt_count; - if (maxbytes != NULL) { - if (lov_bytes < min_stripe_maxbytes) /* handle overflow */ - *maxbytes = MAX_LFS_FILESIZE; + if (min_stripe_maxbytes <= LLONG_MAX / stripe_count) + lov_bytes = min_stripe_maxbytes * stripe_count; else - *maxbytes = lov_bytes; + lov_bytes = MAX_LFS_FILESIZE; +out_dom1: + *maxbytes = min_t(loff_t, lov_bytes, MAX_LFS_FILESIZE); } +out_dom2: return lsme; @@ -260,7 +275,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi = lsme->lsme_oinfo[i]; - if (loi != NULL) + if (loi) OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); } OBD_FREE_LARGE(lsme, lsme_size); @@ -293,7 +308,7 @@ lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); OBD_ALLOC(lsm, lsm_size); - if (lsm == NULL) + if (!lsm) GOTO(out_lsme, rc = -ENOMEM); atomic_set(&lsm->lsm_refc, 1); @@ -384,7 +399,8 @@ lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, unsigned int stripe_count; stripe_count = le16_to_cpu(lmm->lmm_stripe_count); - if (stripe_count == 0) + if (stripe_count == 0 && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT) RETURN(ERR_PTR(-EINVAL)); /* un-instantiated lmm contains no ost id info, i.e. lov_ost_data_v1 */ if (!inited) @@ -427,7 +443,7 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); OBD_ALLOC(lsm, lsm_size); - if (lsm == NULL) + if (!lsm) return ERR_PTR(-ENOMEM); atomic_set(&lsm->lsm_refc, 1); @@ -435,6 +451,8 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); lsm->lsm_entry_count = entry_count; + lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count); + lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags); lsm->lsm_is_released = true; lsm->lsm_maxbytes = LLONG_MIN; @@ -463,16 +481,22 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm->lsm_entries[i] = lsme; lsme->lsme_id = le32_to_cpu(lcme->lcme_id); lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lsme->lsme_timestamp = + le64_to_cpu(lcme->lcme_timestamp); lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); if (i == entry_count - 1) { lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + maxbytes; - /* the last component hasn't been defined, or - * lsm_maxbytes overflowed. */ - if (lsme->lsme_extent.e_end != LUSTRE_EOF || - lsm->lsm_maxbytes < - (loff_t)lsme->lsme_extent.e_start) + /* + * the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
+ */ + if (!lsme_is_dom(lsme) && + (lsme->lsme_extent.e_end != LUSTRE_EOF || + lsm->lsm_maxbytes < + (loff_t)lsme->lsme_extent.e_start)) lsm->lsm_maxbytes = MAX_LFS_FILESIZE; } } @@ -481,7 +505,7 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) out_lsm: for (i = 0; i < entry_count; i++) - if (lsm->lsm_entries[i] != NULL) + if (lsm->lsm_entries[i]) lsme_free(lsm->lsm_entries[i]); OBD_FREE(lsm, lsm_size); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h index 524b0a4eac681..a1cbea9a5c4d4 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,7 +34,7 @@ #define LOV_INTERNAL_H #include -#include +#include /* If we are unable to get the maximum object size from the OST in * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using @@ -47,6 +47,7 @@ struct lov_stripe_md_entry { u32 lsme_magic; u32 lsme_flags; u32 lsme_pattern; + u64 lsme_timestamp; u32 lsme_stripe_size; u16 lsme_stripe_count; u16 lsme_layout_gen; @@ -54,6 +55,11 @@ struct lov_stripe_md_entry { struct lov_oinfo *lsme_oinfo[]; }; +static inline bool lsme_is_dom(struct lov_stripe_md_entry *lsme) +{ + return (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT); +} + static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst, struct lov_stripe_md_entry *src) { @@ -75,8 +81,10 @@ struct lov_stripe_md { struct ost_id lsm_oi; u32 lsm_magic; u32 lsm_layout_gen; - u32 lsm_entry_count; + u16 lsm_flags; bool lsm_is_released; + u16 lsm_mirror_count; + u16 lsm_entry_count; struct lov_stripe_md_entry *lsm_entries[]; }; @@ -119,7 +127,7 @@ static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) stripe_count = 0; size += sizeof(*lsme); - size += lov_mds_md_size(lsme->lsme_stripe_count, + size += lov_mds_md_size(stripe_count, lsme->lsme_magic); } @@ -187,19 +195,22 @@ void lsm_free(struct lov_stripe_md *lsm); }) #elif BITS_PER_LONG == 32 # define lov_do_div64(n, base) ({ \ + uint64_t __num = (n); \ uint64_t __rem; \ if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ - int __remainder; \ - LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ - "division %llu / %llu\n", (n), (uint64_t)(base)); \ - __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ - (n) >>= LOV_MIN_STRIPE_BITS; \ - __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), \ + "64 bit lov division %llu / %llu\n", \ + __num, (uint64_t)(base)); \ + __remainder = __num & (LOV_MIN_STRIPE_SIZE - 1); \ + __num >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(__num, (base) >> LOV_MIN_STRIPE_BITS); \ __rem <<= LOV_MIN_STRIPE_BITS; \ __rem += __remainder; \ } else { \ - __rem = do_div(n, base); \ + __rem = do_div(__num, base); \ } \ + (n) = __num; \ __rem; \ }) #endif @@ -246,6 +257,7 @@ int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, struct ost_lvb *lvb, __u64 *kms_place); /* lov_offset.c */ +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index); u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, int stripeno); int lov_stripe_offset(struct lov_stripe_md *lsm, int index, 
loff_t lov_off, @@ -264,6 +276,8 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, int lov_fini_statfs_set(struct lov_request_set *set); /* lov_obd.c */ +void lov_tgts_getref(struct obd_device *obd); +void lov_tgts_putref(struct obd_device *obd); void lov_stripe_lock(struct lov_stripe_md *md); void lov_stripe_unlock(struct lov_stripe_md *md); void lov_fix_desc(struct lov_desc *desc); @@ -273,13 +287,13 @@ void lov_fix_desc_pattern(__u32 *val); void lov_fix_desc_qos_maxage(__u32 *val); __u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count); -int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data); +int lov_connect_obd(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data); int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - __u32 *indexp, int *genp); -int lov_del_target(struct obd_device *obd, __u32 index, - struct obd_uuid *uuidp, int gen); + u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen); /* lov_pack.c */ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, @@ -298,14 +312,13 @@ void lsm_free_plain(struct lov_stripe_md *lsm); void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); /* lproc_lov.c */ -extern const struct proc_ops lov_proc_target_fops; -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_lov_obd_vars[]; -#endif +int lov_tunables_init(struct obd_device *obd); /* lov_cl.c */ extern struct lu_device_type lov_device_type; +#define LOV_MDC_TGT_MAX 256 + /* pools */ extern struct cfs_hash_ops pool_hash_operations; /* ost_pool methods */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c index 5544a9744b73e..c6eb7121b5db9 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_io.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,7 +56,7 @@ static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) OBD_ALLOC_PTR(sub); } - if (sub != NULL) { + if (sub) { INIT_LIST_HEAD(&sub->sub_list); INIT_LIST_HEAD(&sub->sub_linkage); sub->sub_subio_index = index; @@ -82,13 +82,22 @@ static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, cl_io_fini(sub->sub_env, &sub->sub_io); - if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (sub->sub_env && !IS_ERR(sub->sub_env)) { cl_env_put(sub->sub_env, &sub->sub_refcheck); sub->sub_env = NULL; } EXIT; } +static inline bool +is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) +{ + struct lov_layout_composite *comp = &lov->u.composite; + struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; + + return (index >= lre->lre_start && index <= lre->lre_end); +} + static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, struct lov_io_sub *sub) { @@ -106,10 +115,17 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, !lov_r0(lov, index)->lo_sub[stripe])) RETURN(-EIO); + LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index), + DFID "iot = %d, index = %d, mirror = %d\n", + PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, + lio->lis_mirror_index); + /* obtain new environment */ sub->sub_env = cl_env_get(&sub->sub_refcheck); - if (IS_ERR(sub->sub_env)) + if (IS_ERR(sub->sub_env)) { result = PTR_ERR(sub->sub_env); + RETURN(result); + } sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); sub_io = &sub->sub_io; @@ -122,7 +138,10 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_type = io->ci_type; sub_io->ci_no_srvlock = io->ci_no_srvlock; sub_io->ci_noatime = io->ci_noatime; - sub_io->ci_pio = io->ci_pio; + sub_io->ci_lock_no_expand = io->ci_lock_no_expand; + sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; + sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); @@ -149,7 +168,7 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, if (rc == 0) { sub = lov_sub_alloc(lio, index); - if (sub == NULL) + if (!sub) GOTO(out, rc = -ENOMEM); rc = lov_io_sub_init(env, lio, sub); @@ -164,6 +183,8 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, out: if (rc < 0) sub = ERR_PTR(rc); + else + sub->sub_io.ci_noquota = lio->lis_cl.cis_io->ci_noquota; RETURN(sub); } @@ -199,9 +220,270 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, RETURN(0); } +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || + cl_io_is_mkwrite(io))) + RETURN(0); + + /* + * FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. 
Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. + */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* + * need to iterate all components to see if there are + * multiple components covering the writing component + */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start); + ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + +static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + int index; + int i; + int result; + ENTRY; + + if (!lov_is_flr(obj)) { + /* only locks/pages are manipulated for CIT_MISC op, no + * cl_io_loop() will be called, don't check/set mirror info. + */ + if (io->ci_type != CIT_MISC) { + LASSERT(comp->lo_preferred_mirror == 0); + lio->lis_mirror_index = comp->lo_preferred_mirror; + } + io->ci_ndelay = 0; + RETURN(0); + } + + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* + * For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed + */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? 
-EINVAL : 0); + } + + result = lov_io_mirror_write_intent(lio, obj, io); + if (result) + RETURN(result); + + if (io->ci_need_write_intent) { + CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", + PFID(lu_object_fid(lov2lu(obj))), + lio->lis_pos, lio->lis_endpos); + + if (cl_io_is_trunc(io)) { + /** + * for truncate, we uses [size, EOF) to judge whether + * a write intent needs to be send, but we need to + * restore the write extent to [0, size], in truncate, + * the byte in the size position is accessed. + */ + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } + /* stop cl_io_init() loop */ + RETURN(1); + } + + if (io->ci_ndelay_tried == 0 || /* first time to try */ + /* reset the mirror index if layout has changed */ + lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { + lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen; + index = lio->lis_mirror_index = comp->lo_preferred_mirror; + } else { + index = lio->lis_mirror_index; + LASSERT(index >= 0); + + /* move mirror index to the next one */ + index = (index + 1) % comp->lo_mirror_count; + } + + for (i = 0; i < comp->lo_mirror_count; i++) { + struct lu_extent ext = { .e_start = lio->lis_pos, + .e_end = lio->lis_pos + 1 }; + struct lov_mirror_entry *lre; + struct lov_layout_entry *lle; + bool found = false; + + lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count]; + if (!lre->lre_valid) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(&ext, lle->lle_extent)) { + found = true; + break; + } + } /* each component of the mirror */ + if (found) { + index = (index + i) % comp->lo_mirror_count; + break; + } + } /* each mirror */ + + if (i == comp->lo_mirror_count) { + CERROR(DFID": failed to find a component covering " + "I/O region at %llu\n", + PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos); + + dump_lsm(D_ERROR, obj->lo_lsm); + + RETURN(-EIO); + } + + CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, " + "have retried: %d, mirror count: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj), + lio->lis_mirror_index, index, io->ci_ndelay_tried, + comp->lo_mirror_count); + + lio->lis_mirror_index = index; + + /* + * FLR: if all mirrors have been tried once, most likely the network + * of this client has been partitioned. We should relinquish CPU for + * a while before trying again. + */ + if (io->ci_ndelay && io->ci_ndelay_tried > 0 && + (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */ + if (signal_pending(current)) + RETURN(-EINTR); + + /** + * we'd set ci_tried_all_mirrors to turn off fast mirror + * switching for read after we've tried all mirrors several + * rounds. + */ + io->ci_tried_all_mirrors = io->ci_ndelay_tried % + (comp->lo_mirror_count * 4) == 0; + } + ++io->ci_ndelay_tried; + + CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", + io->ci_ndelay ? 
"non-" : ""); + + RETURN(0); +} + static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, struct cl_io *io) { + int index; + int result = 0; ENTRY; io->ci_result = 0; @@ -212,42 +494,45 @@ static int lov_io_slice_init(struct lov_io *lio, switch (io->ci_type) { case CIT_READ: case CIT_WRITE: - lio->lis_pos = io->u.ci_rw.rw_range.cir_pos; - lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; lio->lis_io_endpos = lio->lis_endpos; if (cl_io_is_append(io)) { LASSERT(io->ci_type == CIT_WRITE); - /* If there is LOV EA hole, then we may cannot locate - * the current file-tail exactly. */ + /* + * If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. + */ if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & LOV_PATTERN_F_HOLE)) - RETURN(-EIO); + GOTO(out, result = -EIO); lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; } break; - case CIT_SETATTR: - if (cl_io_is_trunc(io)) - lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; - else - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; case CIT_DATA_VERSION: lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; break; - case CIT_FAULT: { - pgoff_t index = io->u.ci_fault.ft_index; - lio->lis_pos = cl_offset(io->ci_obj, index); - lio->lis_endpos = cl_offset(io->ci_obj, index + 1); - break; - } + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } case CIT_FSYNC: { lio->lis_pos = io->u.ci_fsync.fi_start; @@ -261,16 +546,84 @@ static int lov_io_slice_init(struct lov_io *lio, break; } - case CIT_MISC: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_GLIMPSE: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; - default: - LBUG(); - } + if (lov_flr_state(obj) == LCM_FL_RDONLY && + !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) + /* SoM is accurate, no need glimpse */ + GOTO(out, result = 1); + break; - RETURN(0); + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + result = lov_io_mirror_init(lio, obj, io); + if (result) + GOTO(out, result); + + /* check if it needs to instantiate layout */ + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) + GOTO(out, result = 0); + + /* + * for truncate, it only needs to instantiate the components + * before the truncated size. + */ + if (cl_io_is_trunc(io)) { + io->ci_write_intent.e_start = 0; + /* for writes, e_end is endpos, the location of the file + * pointer after the write is completed, so it is not accessed. + * For truncate, 'end' is the size, and *is* acccessed. + * In other words, writes are [start, end), but truncate is + * [start, size], where both are included. So add 1 to the + * size when creating the write intent to account for this. 
+ */ + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } else { + io->ci_write_intent.e_start = lio->lis_pos; + io->ci_write_intent.e_end = lio->lis_endpos; + } + + index = 0; + lov_foreach_io_layout(index, lio, &io->ci_write_intent) { + if (!lsm_entry_inited(obj->lo_lsm, index)) { + io->ci_need_write_intent = 1; + break; + } + } + + if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { + /* + * REINT_SYNC RPC has already tried to instantiate all of the + * components involved, obviously it didn't succeed. Skip this + * mirror for now. The server won't be able to figure out + * which mirror it should instantiate components + */ + CERROR(DFID": trying to instantiate components for designated " + "I/O, file state: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + + io->ci_need_write_intent = 0; + GOTO(out, result = -EIO); + } + + if (io->ci_need_write_intent) + GOTO(out, result = 1); + + EXIT; + +out: + return result; } static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) @@ -310,13 +663,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, int index = lov_comp_entry(sub->sub_subio_index); int stripe = lov_comp_stripe(sub->sub_subio_index); - io->ci_pio = parent->ci_pio; switch (io->ci_type) { case CIT_SETATTR: { io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; io->u.ci_setattr.sa_attr_flags = parent->u.ci_setattr.sa_attr_flags; - io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid; + io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid; io->u.ci_setattr.sa_stripe_index = stripe; io->u.ci_setattr.sa_parent_fid = parent->u.ci_setattr.sa_parent_fid; @@ -355,16 +708,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, } case CIT_READ: case CIT_WRITE: { - io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask; - io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter; - io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb; - io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file; - io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync; + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; if (cl_io_is_append(parent)) { - io->u.ci_rw.rw_append = 1; + io->u.ci_wr.wr_append = 1; } else { - io->u.ci_rw.rw_range.cir_pos = start; - io->u.ci_rw.rw_range.cir_count = end - start; + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; } break; } @@ -376,6 +726,8 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; break; } + case CIT_GLIMPSE: + case CIT_MISC: default: break; } @@ -383,63 +735,75 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, static loff_t lov_offset_mod(loff_t val, int delta) { - if (val != OBD_OBJECT_EOF) - val += delta; - return val; + if (val != OBD_OBJECT_EOF) + val += delta; + return val; } +static int lov_io_add_sub(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub, u64 start, u64 end) +{ + int rc; + + end = lov_offset_mod(end, 1); + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) { + cl_io_iter_fini(sub->sub_env, &sub->sub_io); + return rc; + } + + list_add_tail(&sub->sub_linkage, &lio->lis_active); + + return rc; +} static int lov_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; - struct 
lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - struct lov_io_sub *sub; - struct lov_layout_entry *le; + struct lov_io_sub *sub; struct lu_extent ext; int index; int rc = 0; - ENTRY; + ENTRY; ext.e_start = lio->lis_pos; ext.e_end = lio->lis_endpos; - index = 0; - lov_foreach_layout_entry(lio->lis_object, le) { + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); struct lov_layout_raid0 *r0 = &le->lle_raid0; u64 start; u64 end; int stripe; + bool tested_trunc_stripe = false; - index++; - if (!lu_extent_is_overlapped(&ext, &le->lle_extent)) - continue; + r0->lo_trunc_stripeno = -1; CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", - index - 1, lsm->lsm_entries[index - 1]->lsme_flags); - if (!lsm_entry_inited(lsm, index - 1)) { - /* truncate IO will trigger write intent as well, and - * it's handled in lov_io_setattr_iter_init() */ - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) { - io->ci_need_write_intent = 1; - /* execute it in main thread */ - io->ci_pio = 0; - rc = -ENODATA; - break; - } - - /* Read from uninitialized components should return - * zero filled pages. */ + index, lsm->lsm_entries[index]->lsme_flags); + if (!lsm_entry_inited(lsm, index)) { + /* + * Read from uninitialized components should return + * zero filled pages. + */ continue; } + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + for (stripe = 0; stripe < r0->lo_nr; stripe++) { - if (!lov_stripe_intersects(lsm, index - 1, stripe, + if (!lov_stripe_intersects(lsm, index, stripe, &ext, &start, &end)) continue; - if (unlikely(r0->lo_sub[stripe] == NULL)) { + if (unlikely(!r0->lo_sub[stripe])) { if (ios->cis_io->ci_type == CIT_READ || ios->cis_io->ci_type == CIT_WRITE || ios->cis_io->ci_type == CIT_FAULT) @@ -448,29 +812,79 @@ static int lov_io_iter_init(const struct lu_env *env, continue; } - end = lov_offset_mod(end, 1); + if (cl_io_is_trunc(ios->cis_io) && + !tested_trunc_stripe) { + int prev; + u64 tr_start; + + prev = (stripe == 0) ? r0->lo_nr - 1 : + stripe - 1; + /** + * Only involving previous stripe if the + * truncate in this component is at the + * beginning of this stripe. + */ + tested_trunc_stripe = true; + if (ext.e_start < lsm->lsm_entries[index]-> + lsme_extent.e_start) { + /* need previous stripe involvement */ + r0->lo_trunc_stripeno = prev; + } else { + tr_start = ext.e_start; + tr_start = lov_do_div64(tr_start, + stripe_width(lsm, index)); + /* tr_start %= stripe_swidth */ + if (tr_start == stripe * lsm-> + lsm_entries[index]-> + lsme_stripe_size) + r0->lo_trunc_stripeno = prev; + } + } + + /* if the last stripe is the trunc stripeno */ + if (r0->lo_trunc_stripeno == stripe) + r0->lo_trunc_stripeno = -1; + sub = lov_sub_get(env, lio, - lov_comp_index(index - 1, stripe)); - if (IS_ERR(sub)) { - rc = PTR_ERR(sub); + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + rc = lov_io_add_sub(env, lio, sub, start, end); + if (rc != 0) break; + } + if (rc != 0) + break; + + if (r0->lo_trunc_stripeno != -1) { + stripe = r0->lo_trunc_stripeno; + if (unlikely(!r0->lo_sub[stripe])) { + r0->lo_trunc_stripeno = -1; + continue; + } + sub = lov_sub_get(env, lio, + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + /** + * the prev sub could be used by another truncate, we'd + * skip it. 
LU-14128 happends when expand truncate + + * read get wrong kms. + */ + if (!list_empty(&sub->sub_linkage)) { + r0->lo_trunc_stripeno = -1; + continue; } - lov_io_sub_inherit(sub, lio, start, end); - rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); - if (rc != 0) - cl_io_iter_fini(sub->sub_env, &sub->sub_io); + (void)lov_stripe_intersects(lsm, index, stripe, &ext, + &start, &end); + rc = lov_io_add_sub(env, lio, sub, start, end); if (rc != 0) break; - CDEBUG(D_VFSTRACE, - "shrink stripe: {%d, %d} range: [%llu, %llu)\n", - index, stripe, start, end); - - list_add_tail(&sub->sub_linkage, &lio->lis_active); } - if (rc != 0) - break; } RETURN(rc); } @@ -478,12 +892,10 @@ static int lov_io_iter_init(const struct lu_env *env, static int lov_io_rw_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; struct lov_io *lio = cl2lov_io(env, ios); - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *io = ios->cis_io; struct lov_stripe_md_entry *lse; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t start = range->cir_pos; + loff_t start = io->u.ci_rw.crw_pos; loff_t next; int index; @@ -493,14 +905,14 @@ static int lov_io_rw_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(lov_io_iter_init(env, ios)); - index = lov_lsm_entry(lsm, range->cir_pos); + index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos); if (index < 0) { /* non-existing layout component */ if (io->ci_type == CIT_READ) { - /* TODO: it needs to detect the next component and - * then set the next pos */ + /* + * TODO: it needs to detect the next component and + * then set the next pos + */ io->ci_continue = 0; - /* execute it in main thread */ - io->ci_pio = 0; RETURN(lov_io_iter_init(env, ios)); } @@ -508,6 +920,10 @@ static int lov_io_rw_iter_init(const struct lu_env *env, RETURN(-ENODATA); } + if (!lov_entry(lio->lis_object, index)->lle_valid && + !io->ci_designated_mirror) + RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO); + lse = lov_lse(lio->lis_object, index); next = MAX_LFS_FILESIZE; @@ -520,37 +936,20 @@ static int lov_io_rw_iter_init(const struct lu_env *env, next = MAX_LFS_FILESIZE; } - LASSERTF(range->cir_pos >= lse->lsme_extent.e_start, - "pos %lld, [%lld, %lld)\n", range->cir_pos, + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, lse->lsme_extent.e_start, lse->lsme_extent.e_end); next = min_t(__u64, next, lse->lsme_extent.e_end); next = min_t(loff_t, next, lio->lis_io_endpos); - io->ci_continue = next < lio->lis_io_endpos; - range->cir_count = next - range->cir_pos; - lio->lis_pos = range->cir_pos; - lio->lis_endpos = range->cir_pos + range->cir_count; + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; CDEBUG(D_VFSTRACE, - "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n", - index, start, lio->lis_pos, lio->lis_endpos, - lio->lis_io_endpos, range->cir_count); - - if (!io->ci_continue) { - /* the last piece of IO, execute it in main thread */ - io->ci_pio = 0; - } - - if (io->ci_pio) { - /* it only splits IO here for parallel IO, - * there will be no actual IO going to occur, - * so it doesn't need to invoke lov_io_iter_init() - * to initialize sub IOs. 
*/ - if (!lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; - RETURN(-ENODATA); - } - RETURN(0); - } + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); /* * XXX The following call should be optimized: we know, that @@ -564,18 +963,14 @@ static int lov_io_setattr_iter_init(const struct lu_env *env, { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *io = ios->cis_io; - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; int index; ENTRY; if (cl_io_is_trunc(io) && lio->lis_pos > 0) { - index = lov_lsm_entry(lsm, lio->lis_pos); - CDEBUG(D_VFSTRACE, "component[%d] flags %#x pos %llu\n", - index, lsm->lsm_entries[index]->lsme_flags, lio->lis_pos); - if (index > 0 && !lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; + index = lov_io_layout_at(lio, lio->lis_pos - 1); + /* no entry found for such offset */ + if (index < 0) RETURN(io->ci_result = -ENODATA); - } } RETURN(lov_io_iter_init(env, ios)); @@ -602,49 +997,49 @@ static int lov_io_call(const struct lu_env *env, struct lov_io *lio, static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); } static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); } static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) { - ENTRY; - /* - * It's possible that lov_io_start() wasn't called against this - * sub-io, either because previous sub-io failed, or upper layer - * completed IO. - */ - if (io->ci_state == CIS_IO_GOING) - cl_io_end(env, io); - else - io->ci_state = CIS_IO_FINISHED; - RETURN(0); + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. 
+ */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); } static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_iter_fini(env, io); - RETURN(0); + cl_io_iter_fini(env, io); + RETURN(0); } static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_unlock(env, io); - RETURN(0); + cl_io_unlock(env, io); + RETURN(0); } static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); - LASSERT(rc == 0); + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); } static void @@ -652,14 +1047,18 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; struct lov_io_sub *sub; ENTRY; list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - lov_io_end_wrapper(env, &sub->sub_io); + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; - parent->u.ci_data_version.dv_data_version += - sub->sub_io.u.ci_data_version.dv_data_version; + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); + + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; if (parent->ci_result == 0) parent->ci_result = sub->sub_io.ci_result; @@ -671,26 +1070,26 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) static void lov_io_iter_fini(const struct lu_env *env, const struct cl_io_slice *ios) { - struct lov_io *lio = cl2lov_io(env, ios); - int rc; + struct lov_io *lio = cl2lov_io(env, ios); + int rc; - ENTRY; - rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); - LASSERT(rc == 0); + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); while (!list_empty(&lio->lis_active)) list_del_init(lio->lis_active.next); - EXIT; + EXIT; } static void lov_io_unlock(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - ENTRY; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); - LASSERT(rc == 0); - EXIT; + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; } static int lov_io_read_ahead(const struct lu_env *env, @@ -712,14 +1111,18 @@ static int lov_io_read_ahead(const struct lu_env *env, ENTRY; offset = cl_offset(obj, start); - index = lov_lsm_entry(loo->lo_lsm, offset); + index = lov_io_layout_at(lio, offset); if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) RETURN(-ENODATA); + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + stripe = lov_stripe_number(loo->lo_lsm, index, offset); r0 = lov_r0(loo, index); - if (unlikely(r0->lo_sub[stripe] == NULL)) + if (unlikely(!r0->lo_sub[stripe])) RETURN(-EIO); sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); @@ -750,7 +1153,7 @@ static int lov_io_read_ahead(const struct lu_env *env, ra_end, stripe); /* boundary of current component */ - ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end); + ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end) ra->cra_end = ra_end - 1; @@ -794,35 +1197,37 @@ static int lov_io_submit(const struct lu_env *env, struct 
lov_io_sub *sub; struct cl_page_list *plist = &lov_env_info(env)->lti_plist; struct cl_page *page; + struct cl_page *tmp; int index; int rc = 0; ENTRY; - if (lio->lis_nr_subios == 1) { - int idx = lio->lis_single_subio_index; - - sub = lov_sub_get(env, lio, idx); - LASSERT(!IS_ERR(sub)); - LASSERT(sub == &lio->lis_single_subio); - rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, - crt, queue); - RETURN(rc); - } - cl_page_list_init(plist); while (qin->pl_nr > 0) { struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; - cl_2queue_init(cl2q); - page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* + * it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. + */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); cl_page_list_move(&cl2q->c2_qin, qin, page); index = lov_page_index(page); - while (qin->pl_nr > 0) { - page = cl_page_list_first(qin); + cl_page_list_for_each_safe(page, tmp, qin) { + /* this page is not on this stripe */ if (index != lov_page_index(page)) - break; + continue; cl_page_list_move(&cl2q->c2_qin, qin, page); } @@ -855,7 +1260,7 @@ static int lov_io_commit_async(const struct lu_env *env, cl_commit_cbt cb) { struct cl_page_list *plist = &lov_env_info(env)->lti_plist; - struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_io_sub *sub; struct cl_page *page; int rc = 0; @@ -864,6 +1269,8 @@ static int lov_io_commit_async(const struct lu_env *env, if (lio->lis_nr_subios == 1) { int idx = lio->lis_single_subio_index; + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + sub = lov_sub_get(env, lio, idx); LASSERT(!IS_ERR(sub)); LASSERT(sub == &lio->lis_single_subio); @@ -879,6 +1286,8 @@ static int lov_io_commit_async(const struct lu_env *env, LASSERT(plist->pl_nr == 0); page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + cl_page_list_move(plist, queue, page); index = lov_page_index(page); @@ -957,25 +1366,25 @@ static void lov_io_fsync_end(const struct lu_env *env, } static const struct cl_io_operations lov_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_WRITE] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, [CIT_SETATTR] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_setattr_iter_init, @@ -986,23 +1395,23 @@ static const struct cl_io_operations lov_io_ops = { .cio_end = lov_io_end }, [CIT_DATA_VERSION] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = 
lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_data_version_end, + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end }, - [CIT_FAULT] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_fault_start, - .cio_end = lov_io_end - }, [CIT_FSYNC] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_iter_init, @@ -1021,11 +1430,14 @@ static const struct cl_io_operations lov_io_ops = { .cio_start = lov_io_start, .cio_end = lov_io_end }, + [CIT_GLIMPSE] = { + .cio_fini = lov_io_fini, + }, [CIT_MISC] = { .cio_fini = lov_io_fini } }, - .cio_read_ahead = lov_io_read_ahead, + .cio_read_ahead = lov_io_read_ahead, .cio_submit = lov_io_submit, .cio_commit_async = lov_io_commit_async, }; @@ -1057,7 +1469,7 @@ static int lov_empty_io_submit(const struct lu_env *env, static void lov_empty_impossible(const struct lu_env *env, struct cl_io_slice *ios) { - LBUG(); + LBUG(); } #define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) @@ -1066,43 +1478,46 @@ static void lov_empty_impossible(const struct lu_env *env, * An io operation vector for files without stripes. */ static const struct cl_io_operations lov_empty_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_empty_io_fini, + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, #if 0 - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE #endif - }, - [CIT_WRITE] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_SETATTR] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_FAULT] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, [CIT_FSYNC] = { .cio_fini = lov_empty_io_fini }, [CIT_LADVISE] = { .cio_fini = lov_empty_io_fini }, + [CIT_GLIMPSE] = { + .cio_fini = 
lov_empty_io_fini + }, [CIT_MISC] = { .cio_fini = lov_empty_io_fini } @@ -1114,23 +1529,26 @@ static const struct cl_io_operations lov_empty_io_ops = { int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - struct lov_io *lio = lov_env_io(env); - struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + int result; ENTRY; + INIT_LIST_HEAD(&lio->lis_active); - io->ci_result = lov_io_slice_init(lio, lov, io); - if (io->ci_result != 0) - RETURN(io->ci_result); - - if (io->ci_result == 0) { - io->ci_result = lov_io_subio_init(env, lio, io); - if (io->ci_result == 0) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); - atomic_inc(&lov->lo_active_ios); - } + result = lov_io_slice_init(lio, lov, io); + if (result) + GOTO(out, result); + + result = lov_io_subio_init(env, lio, io); + if (!result) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); } - RETURN(io->ci_result); + EXIT; +out: + io->ci_result = result < 0 ? result : 0; + return result; } int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, @@ -1146,6 +1564,7 @@ int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, default: LBUG(); case CIT_MISC: + case CIT_GLIMPSE: case CIT_READ: result = 0; break; @@ -1189,6 +1608,7 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, LASSERTF(0, "invalid type %d\n", io->ci_type); result = -EOPNOTSUPP; break; + case CIT_GLIMPSE: case CIT_MISC: case CIT_FSYNC: case CIT_LADVISE: @@ -1196,7 +1616,8 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, result = 1; break; case CIT_SETATTR: - /* the truncate to 0 is managed by MDT: + /* + * the truncate to 0 is managed by MDT: * - in open, for open O_TRUNC * - in setattr, for truncate */ @@ -1223,4 +1644,45 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, io->ci_result = result < 0 ? result : 0; RETURN(result); } + +/** + * Return the index in composite:lo_entries by the file offset + */ +int lov_io_layout_at(struct lov_io *lio, __u64 offset) +{ + struct lov_object *lov = lio->lis_object; + struct lov_layout_composite *comp = &lov->u.composite; + int start_index = 0; + int end_index = comp->lo_entry_count - 1; + int i; + + LASSERT(lov->lo_type == LLT_COMP); + + /* This is actual file offset so nothing can cover eof. */ + if (offset == LUSTRE_EOF) + return -1; + + if (lov_is_flr(lov)) { + struct lov_mirror_entry *lre; + + LASSERT(lio->lis_mirror_index >= 0); + + lre = &comp->lo_mirrors[lio->lis_mirror_index]; + start_index = lre->lre_start; + end_index = lre->lre_end; + } + + for (i = start_index; i <= end_index; i++) { + struct lov_layout_entry *lle = lov_entry(lov, i); + + if ((offset >= lle->lle_extent->e_start && + offset < lle->lle_extent->e_end) || + (offset == OBD_OBJECT_EOF && + lle->lle_extent->e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c index efa4cc11ea94e..1b4a95876cc75 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,22 +52,22 @@ static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, const struct cl_lock *parent, struct lov_lock_sub *lls) { - struct lov_sublock_env *subenv; - struct lov_io *lio = lov_env_io(env); - struct cl_io *io = lio->lis_cl.cis_io; - struct lov_io_sub *sub; - - subenv = &lov_env_session(env)->ls_subenv; - - /* - * FIXME: We tend to use the subio's env & io to call the sublock - * lock operations because osc lock sometimes stores some control - * variables in thread's IO infomation(Now only lockless information). - * However, if the lock's host(object) is different from the object - * for current IO, we have no way to get the subenv and subio because - * they are not initialized at all. As a temp fix, in this case, - * we still borrow the parent's env to call sublock operations. - */ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO infomation(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { subenv->lse_env = env; subenv->lse_io = io; @@ -89,6 +89,7 @@ static int lov_sublock_init(const struct lu_env *env, { struct lov_sublock_env *subenv; int result; + ENTRY; subenv = lov_sublock_env_get(env, parent, lls); @@ -111,6 +112,7 @@ static int lov_sublock_init(const struct lu_env *env, * through already created sub-locks (possibly shared with other top-locks). */ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, + const struct cl_io *io, const struct cl_object *obj, struct cl_lock *lock) { @@ -133,20 +135,18 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1); nr = 0; - for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); - index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { + lov_foreach_io_layout(index, lov_env_io(env), &ext) { struct lov_layout_raid0 *r0 = lov_r0(lov, index); - /* assume lsm entries are sorted. 
*/ - if (!lu_extent_is_overlapped(&ext, - &lov_lse(lov, index)->lsme_extent)) - break; - for (i = 0; i < r0->lo_nr; i++) { - if (likely(r0->lo_sub[i] != NULL) && /* spare layout */ - lov_stripe_intersects(lov->lo_lsm, index, i, - &ext, &start, &end)) - nr++; + if (likely(r0->lo_sub[i])) {/* spare layout */ + if (lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) + nr++; + else if (cl_io_is_trunc(io) && + r0->lo_trunc_stripeno == i) + nr++; + } } } /** @@ -156,28 +156,33 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, */ OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); - if (lovlck == NULL) + if (!lovlck) RETURN(ERR_PTR(-ENOMEM)); lovlck->lls_nr = nr; nr = 0; - for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); - index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { + lov_foreach_io_layout(index, lov_env_io(env), &ext) { struct lov_layout_raid0 *r0 = lov_r0(lov, index); - /* assume lsm entries are sorted. */ - if (!lu_extent_is_overlapped(&ext, - &lov_lse(lov, index)->lsme_extent)) - break; for (i = 0; i < r0->lo_nr; ++i) { struct lov_lock_sub *lls = &lovlck->lls_sub[nr]; struct cl_lock_descr *descr = &lls->sub_lock.cll_descr; + bool intersect = false; - if (unlikely(r0->lo_sub[i] == NULL) || - !lov_stripe_intersects(lov->lo_lsm, index, i, - &ext, &start, &end)) + if (unlikely(!r0->lo_sub[i])) continue; + intersect = lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end); + if (intersect) + goto init_sublock; + + if (cl_io_is_trunc(io) && i == r0->lo_trunc_stripeno) + goto init_sublock; + + continue; + +init_sublock: LASSERT(descr->cld_obj == NULL); descr->cld_obj = lovsub2cl(r0->lo_sub[i]); descr->cld_start = cl_index(descr->cld_obj, start); @@ -244,10 +249,10 @@ static int lov_lock_enqueue(const struct lu_env *env, const struct cl_lock_slice *slice, struct cl_io *io, struct cl_sync_io *anchor) { - struct cl_lock *lock = slice->cls_lock; - struct lov_lock *lovlck = cl2lov_lock(slice); - int i; - int rc = 0; + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; ENTRY; @@ -274,16 +279,16 @@ static int lov_lock_enqueue(const struct lu_env *env, static void lov_lock_cancel(const struct lu_env *env, const struct cl_lock_slice *slice) { - struct cl_lock *lock = slice->cls_lock; + struct cl_lock *lock = slice->cls_lock; struct lov_lock *lovlck = cl2lov_lock(slice); int i; ENTRY; for (i = 0; i < lovlck->lls_nr; ++i) { - struct lov_lock_sub *lls = &lovlck->lls_sub[i]; - struct cl_lock *sublock = &lls->sub_lock; - struct lov_sublock_env *subenv; + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; if (!lls->sub_is_enqueued) continue; @@ -301,27 +306,27 @@ static void lov_lock_cancel(const struct lu_env *env, } static int lov_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) + lu_printer_t p, const struct cl_lock_slice *slice) { - struct lov_lock *lck = cl2lov_lock(slice); - int i; + struct lov_lock *lck = cl2lov_lock(slice); + int i; - (*p)(env, cookie, "%d\n", lck->lls_nr); - for (i = 0; i < lck->lls_nr; ++i) { - struct lov_lock_sub *sub; + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; - sub = &lck->lls_sub[i]; + sub = &lck->lls_sub[i]; (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); cl_lock_print(env, cookie, p, &sub->sub_lock); - } - return 0; + } + return 0; } 
static const struct cl_lock_operations lov_lock_ops = { - .clo_fini = lov_lock_fini, - .clo_enqueue = lov_lock_enqueue, - .clo_cancel = lov_lock_cancel, - .clo_print = lov_lock_print + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print }; int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, @@ -331,7 +336,7 @@ int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, int result = 0; ENTRY; - lck = lov_lock_sub_init(env, obj, lock); + lck = lov_lock_sub_init(env, io, obj, lock); if (!IS_ERR(lck)) cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); else @@ -343,6 +348,7 @@ static void lov_empty_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) { struct lov_lock *lck = cl2lov_lock(slice); + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); } @@ -367,7 +373,7 @@ int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, ENTRY; OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); - if (lck != NULL) { + if (lck) { cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); result = 0; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c index de9e4298dd884..8a6ced24ff522 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c index 8cdd60fc90171..b9c42313fe3ae 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,16 +40,14 @@ #define DEBUG_SUBSYSTEM S_LOV #include -#include - #include #include #include -#include +#include #include #include #include -#include +#include #include #include #include @@ -59,7 +57,7 @@ /* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. Any function that expects lov_tgts to remain stationary must take a ref. 
*/ -static void lov_getref(struct obd_device *obd) +void lov_tgts_getref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; @@ -72,7 +70,7 @@ static void lov_getref(struct obd_device *obd) static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); -static void lov_putref(struct obd_device *obd) +void lov_tgts_putref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; @@ -102,21 +100,21 @@ static void lov_putref(struct obd_device *obd) list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { list_del(&tgt->ltd_kill); - /* Disconnect */ - __lov_del_obd(obd, tgt); - } - } else { + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { mutex_unlock(&lov->lov_lock); - } + } } static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, - enum obd_notify_event ev); + enum obd_notify_event ev); static int lov_notify(struct obd_device *obd, struct obd_device *watched, enum obd_notify_event ev); -int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data) +int lov_connect_obd(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data) { struct lov_obd *lov = &obd->u.lov; struct obd_uuid *tgt_uuid; @@ -148,12 +146,12 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, */ imp = tgt_obd->u.cli.cl_import; - if (activate) { - tgt_obd->obd_no_recov = 0; - /* FIXME this is probably supposed to be - ptlrpc_set_import_active. Horrible naming. */ - ptlrpc_activate_import(imp); - } + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. */ + ptlrpc_activate_import(imp, false); + } rc = obd_register_observer(tgt_obd, obd); if (rc) { @@ -182,26 +180,17 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? 
"":"in"); - if (lov->targets_proc_entry != NULL) { - struct proc_dir_entry *osc_symlink; - struct obd_device *osc_obd; - - osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; - - LASSERT(osc_obd != NULL); - LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); - LASSERT(osc_obd->obd_type->typ_name != NULL); - - osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, - lov->targets_proc_entry, - "../../../%s/%s", - osc_obd->obd_type->typ_name, - osc_obd->obd_name); - if (osc_symlink == NULL) { - CERROR("cannot register LOV target " - "/proc/fs/lustre/%s/%s/target_obds/%s\n", - obd->obd_type->typ_name, obd->obd_name, - osc_obd->obd_name); + if (lov->lov_tgts_kobj) { + /* Even if we failed, that's ok */ + rc = sysfs_create_link(lov->lov_tgts_kobj, + &tgt_obd->obd_kset.kobj, + tgt_obd->obd_name); + if (rc) { + CERROR("%s: can't register LOV target /sys/fs/lustre/%s/%s/target_obds/%s : rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name, + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_name, + rc); } } RETURN(0); @@ -234,17 +223,8 @@ static int lov_connect(const struct lu_env *env, if (data) lov->lov_ocd = *data; - lov->targets_proc_entry = lprocfs_register("target_obds", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lov->targets_proc_entry)) { - CERROR("%s: cannot register " - "/proc/fs/lustre/%s/%s/target_obds\n", - obd->obd_name, obd->obd_type->typ_name, obd->obd_name); - lov->targets_proc_entry = NULL; - } + lov_tgts_getref(obd); - obd_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { tgt = lov->lov_tgts[i]; if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) @@ -267,9 +247,10 @@ static int lov_connect(const struct lu_env *env, obd->obd_name, rc); } } - obd_putref(obd); - RETURN(0); + lov_tgts_putref(obd); + + RETURN(0); } static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) @@ -290,6 +271,10 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) } if (osc_obd) { + if (lov->lov_tgts_kobj) + sysfs_remove_link(lov->lov_tgts_kobj, + osc_obd->obd_name); + /* Pass it on to our clients. * XXX This should be an argument to disconnect, * XXX not a back-door flag on the OBD. Ah well. @@ -318,40 +303,39 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) static int lov_disconnect(struct obd_export *exp) { - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; - int i, rc; - ENTRY; - - if (!lov->lov_tgts) - goto out; - - /* Only disconnect the underlying layers on the final disconnect. */ - lov->lov_connects--; - if (lov->lov_connects != 0) { - /* why should there be more than 1 connect? */ - CERROR("disconnect #%d\n", lov->lov_connects); - goto out; - } + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + u32 index; + int rc; - /* Let's hold another reference so lov_del_obd doesn't spin through - putref every time */ - obd_getref(obd); + ENTRY; + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? 
*/ + CWARN("%s: unexpected disconnect #%d\n", + obd->obd_name, lov->lov_connects); + goto out; + } - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { - /* Disconnection is the last we know about an obd */ - lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); - } - } - obd_putref(obd); + /* hold another ref so lov_del_obd() doesn't spin in putref each time */ + lov_tgts_getref(obd); - if (lov->targets_proc_entry != NULL) - lprocfs_remove(&lov->targets_proc_entry); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + if (lov->lov_tgts[index] && lov->lov_tgts[index]->ltd_exp) { + /* Disconnection is the last we know about an OBD */ + lov_del_target(obd, index, NULL, + lov->lov_tgts[index]->ltd_gen); + } + } + lov_tgts_putref(obd); out: - rc = class_disconnect(exp); /* bz 9811 */ - RETURN(rc); + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); } /* Error codes: @@ -372,7 +356,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", lov, uuid->uuid, ev); - obd_getref(obd); + lov_tgts_getref(obd); for (index = 0; index < lov->desc.ld_tgt_count; index++) { tgt = lov->lov_tgts[index]; if (!tgt) @@ -447,7 +431,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, index, tgt->ltd_exp->exp_handle.h_cookie); out: - obd_putref(obd); + lov_tgts_putref(obd); RETURN(index); } @@ -497,37 +481,37 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, } static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, - __u32 index, int gen, int active) + u32 index, int gen, int active) { - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - struct obd_device *tgt_obd; - int rc; - ENTRY; + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; - CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", - uuidp->uuid, index, gen, active); + ENTRY; + CDEBUG(D_CONFIG, "uuid:%s idx:%u gen:%d active:%d\n", + uuidp->uuid, index, gen, active); - if (gen <= 0) { - CERROR("request to add OBD %s with invalid generation: %d\n", - uuidp->uuid, gen); - RETURN(-EINVAL); - } + if (gen <= 0) { + CERROR("%s: request to add '%s' with invalid generation: %d\n", + obd->obd_name, uuidp->uuid, gen); + RETURN(-EINVAL); + } - tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, - &obd->obd_uuid); - if (tgt_obd == NULL) - RETURN(-EINVAL); + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); mutex_lock(&lov->lov_lock); - if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { - tgt = lov->lov_tgts[index]; - CERROR("UUID %s already assigned at LOV target index %d\n", - obd_uuid2str(&tgt->ltd_uuid), index); + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + rc = -EEXIST; + CERROR("%s: UUID %s already assigned at index %d: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, rc); mutex_unlock(&lov->lov_lock); - RETURN(-EEXIST); - } + RETURN(rc); + } if (index >= lov->lov_tgt_size) { /* We need to reallocate the lov target array. 
*/ @@ -595,7 +579,7 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, RETURN(0); } - obd_getref(obd); + lov_tgts_getref(obd); rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); if (rc) @@ -618,17 +602,17 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); out: - if (rc) { - CERROR("add failed (%d), deleting %s\n", rc, - obd_uuid2str(&tgt->ltd_uuid)); + if (rc) { + CERROR("%s: add failed, deleting %s: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), rc); lov_del_target(obd, index, NULL, 0); - } - obd_putref(obd); - RETURN(rc); + } + lov_tgts_putref(obd); + RETURN(rc); } /* Schedule a target for deletion */ -int lov_del_target(struct obd_device *obd, __u32 index, +int lov_del_target(struct obd_device *obd, u32 index, struct obd_uuid *uuidp, int gen) { struct lov_obd *lov = &obd->u.lov; @@ -644,7 +628,7 @@ int lov_del_target(struct obd_device *obd, __u32 index, /* to make sure there's no ongoing lov_notify() now */ down_write(&lov->lov_notify_lock); - obd_getref(obd); + lov_tgts_getref(obd); if (!lov->lov_tgts[index]) { CERROR("LOV target at index %d is not setup.\n", index); @@ -665,12 +649,12 @@ int lov_del_target(struct obd_device *obd, __u32 index, lov->lov_tgts[index]->ltd_reap = 1; lov->lov_death_row++; - /* we really delete it from obd_putref */ + /* we really delete it from lov_tgts_putref() */ out: - obd_putref(obd); + lov_tgts_putref(obd); up_write(&lov->lov_notify_lock); - RETURN(rc); + RETURN(rc); } static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) @@ -747,9 +731,6 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { struct lov_desc *desc; struct lov_obd *lov = &obd->u.lov; -#ifdef CONFIG_PROC_FS - struct obd_type *type; -#endif int rc; ENTRY; @@ -803,45 +784,12 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(out, rc); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_lov_obd_vars; - /* If this is true then both client (lov) and server - * (lod) are on the same node. The lod layer if loaded - * first will register the lov proc directory. In that - * case obd->obd_type->typ_procroot will be not set. - * Instead we use type->typ_procsym as the parent. 
*/ - type = class_search_type(LUSTRE_LOD_NAME); - if (type != NULL && type->typ_procsym != NULL) { - obd->obd_proc_entry = lprocfs_register(obd->obd_name, - type->typ_procsym, - obd->obd_vars, obd); - if (IS_ERR(obd->obd_proc_entry)) { - rc = PTR_ERR(obd->obd_proc_entry); - CERROR("error %d setting up lprocfs for %s\n", rc, - obd->obd_name); - obd->obd_proc_entry = NULL; - } - } else { - rc = lprocfs_obd_setup(obd); - } + rc = lov_tunables_init(obd); + if (rc) + GOTO(out, rc); - if (rc == 0) { - rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", - 0444, &lov_proc_target_fops, obd); - if (rc) - CWARN("Error adding the target_obd file\n"); - - lov->lov_pool_proc_entry = lprocfs_register("pools", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lov->lov_pool_proc_entry)) { - rc = PTR_ERR(lov->lov_pool_proc_entry); - CERROR("error %d setting up lprocfs for pools\n", rc); - lov->lov_pool_proc_entry = NULL; - } - } -#endif - RETURN(0); + lov->lov_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); out: return rc; @@ -854,6 +802,11 @@ static int lov_cleanup(struct obd_device *obd) struct pool_desc *pool; ENTRY; + if (lov->lov_tgts_kobj) { + kobject_put(lov->lov_tgts_kobj); + lov->lov_tgts_kobj = NULL; + } + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { pool = list_entry(pos, struct pool_desc, pool_list); /* free pool structs */ @@ -869,14 +822,13 @@ static int lov_cleanup(struct obd_device *obd) lprocfs_obd_cleanup(obd); if (lov->lov_tgts) { int i; - obd_getref(obd); + lov_tgts_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { if (!lov->lov_tgts[i]) continue; /* Inactive targets may never have connected */ - if (lov->lov_tgts[i]->ltd_active || - atomic_read(&lov->lov_refcount)) + if (lov->lov_tgts[i]->ltd_active) /* We should never get here - these * should have been removed in the * disconnect. 
*/ @@ -886,7 +838,7 @@ static int lov_cleanup(struct obd_device *obd) atomic_read(&lov->lov_refcount)); lov_del_target(obd, i, NULL, 0); } - obd_putref(obd); + lov_tgts_putref(obd); OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * lov->lov_tgt_size); lov->lov_tgt_size = 0; @@ -901,50 +853,56 @@ static int lov_cleanup(struct obd_device *obd) } int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - __u32 *indexp, int *genp) + u32 *indexp, int *genp) { - struct obd_uuid obd_uuid; - int cmd; - int rc = 0; - ENTRY; + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; - switch(cmd = lcfg->lcfg_command) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - case LCFG_LOV_DEL_OBD: { - __u32 index; - int gen; - /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ - if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) - GOTO(out, rc = -EINVAL); - - obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); - - if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", indexp) != 1) - GOTO(out, rc = -EINVAL); - if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) - GOTO(out, rc = -EINVAL); - index = *indexp; - gen = *genp; - if (cmd == LCFG_LOV_ADD_OBD) - rc = lov_add_target(obd, &obd_uuid, index, gen, 1); - else if (cmd == LCFG_LOV_ADD_INA) - rc = lov_add_target(obd, &obd_uuid, index, gen, 0); - else - rc = lov_del_target(obd, index, &obd_uuid, gen); - GOTO(out, rc); - } - case LCFG_PARAM: { + ENTRY; + switch (cmd = lcfg->lcfg_command) { + case LCFG_ADD_MDC: + case LCFG_DEL_MDC: + break; + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + u32 index; + int gen; + + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(lcfg, 2), 10, indexp); + if (rc) + GOTO(out, rc); + rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); + if (rc) + GOTO(out, rc); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + + GOTO(out, rc); + } + case LCFG_PARAM: { struct lov_desc *desc = &(obd->u.lov.desc); + ssize_t count; if (!desc) GOTO(out, rc = -EINVAL); - rc = class_process_proc_param(PARAM_LOV, obd->obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; - GOTO(out, rc); + count = class_modify_config(lcfg, PARAM_LOV, + &obd->obd_kset.kobj); + GOTO(out, rc = count < 0 ? count : 0); } case LCFG_POOL_NEW: case LCFG_POOL_ADD: @@ -962,84 +920,50 @@ int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, RETURN(rc); } -static int -lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) -{ - struct lov_request_set *lovset = (struct lov_request_set *)data; - int err; - ENTRY; - - if (rc) - atomic_set(&lovset->set_completes, 0); - - err = lov_fini_statfs_set(lovset); - RETURN(rc ? 
rc : err); -} - -static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *rqset) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lov_request_set *set; - struct lov_request *req; - struct list_head *pos; - struct lov_obd *lov; - int rc = 0; - ENTRY; - - LASSERT(oinfo != NULL); - LASSERT(oinfo->oi_osfs != NULL); - - lov = &obd->u.lov; - rc = lov_prep_statfs_set(obd, oinfo, &set); - if (rc) - RETURN(rc); - - list_for_each(pos, &set->set_list) { - req = list_entry(pos, struct lov_request, rq_link); - rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, - &req->rq_oi, max_age, rqset); - if (rc) - break; - } - - if (rc || list_empty(&rqset->set_requests)) { - int err; - if (rc) - atomic_set(&set->set_completes, 0); - err = lov_fini_statfs_set(set); - RETURN(rc ? rc : err); - } - - LASSERT(rqset->set_interpret == NULL); - rqset->set_interpret = lov_statfs_interpret; - rqset->set_arg = (void *)set; - RETURN(0); -} - static int lov_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) + struct obd_statfs *osfs, time64_t max_age, __u32 flags) { - struct ptlrpc_request_set *set = NULL; + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; struct obd_info oinfo = { .oi_osfs = osfs, .oi_flags = flags, }; + struct ptlrpc_request_set *rqset; + struct lov_request_set *set = NULL; + struct lov_request *req; int rc = 0; + int rc2; ENTRY; - /* for obdclass we forbid using obd_statfs_rqset, but prefer using async - * statfs requests */ - set = ptlrpc_prep_set(); - if (set == NULL) + rqset = ptlrpc_prep_set(); + if (rqset == NULL) RETURN(-ENOMEM); - rc = lov_statfs_async(exp, &oinfo, max_age, set); + rc = lov_prep_statfs_set(obd, &oinfo, &set); + if (rc < 0) + GOTO(out_rqset, rc); + + list_for_each_entry(req, &set->set_list, rq_link) { + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc < 0) + GOTO(out_set, rc); + } + + rc = ptlrpc_set_wait(env, rqset); + +out_set: + if (rc < 0) + atomic_set(&set->set_completes, 0); + + rc2 = lov_fini_statfs_set(set); if (rc == 0) - rc = ptlrpc_set_wait(set); + rc = rc2; - ptlrpc_set_destroy(set); +out_rqset: + ptlrpc_set_destroy(rqset); RETURN(rc); } @@ -1047,35 +971,39 @@ static int lov_statfs(const struct lu_env *env, struct obd_export *exp, static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void __user *uarg) { - struct obd_device *obddev = class_exp2obd(exp); - struct lov_obd *lov = &obddev->u.lov; - int i = 0, rc = 0, count = lov->desc.ld_tgt_count; - struct obd_uuid *uuidp; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; - switch (cmd) { - case IOC_OBD_STATFS: { - struct obd_ioctl_data *data = karg; - struct obd_device *osc_obd; - struct obd_statfs stat_buf = {0}; - __u32 index; + ENTRY; + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + struct obd_import *imp; + __u32 index; __u32 flags; - memcpy(&index, data->ioc_inlbuf2, sizeof(index)); - if ((index >= count)) - RETURN(-ENODEV); + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if (index >= count) + RETURN(-ENODEV); - if (!lov->lov_tgts[index]) - /* Try again with the next index */ - RETURN(-EAGAIN); - if (!lov->lov_tgts[index]->ltd_active) - 
RETURN(-ENODATA); + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); - osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); - if (!osc_obd) - RETURN(-EINVAL); + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); - /* copy UUID */ + imp = osc_obd->u.cli.cl_import; + if (!lov->lov_tgts[index]->ltd_active && + imp->imp_state != LUSTRE_IMP_IDLE) + RETURN(-ENODATA); + + /* copy UUID */ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), min_t(unsigned long, data->ioc_plen2, sizeof(struct obd_uuid)))) @@ -1084,12 +1012,12 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); flags = flags & LL_STATFS_NODELAY ? OBD_STATFS_NODELAY : 0; - /* got statfs data */ - rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - flags); - if (rc) - RETURN(rc); + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + flags); + if (rc) + RETURN(rc); if (copy_to_user(data->ioc_pbuf1, &stat_buf, min_t(unsigned long, data->ioc_plen1, sizeof(struct obd_statfs)))) @@ -1202,12 +1130,11 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) continue; - /* ll_umount_begin() sets force flag but for lov, not - * osc. Let's pass it through */ - osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); - osc_obd->obd_force = obddev->obd_force; - err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, - len, karg, uarg); + /* ll_umount_begin() sets force on lov, pass to osc */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); if (err) { if (lov->lov_tgts[i]->ltd_active) { CDEBUG(err == -ENOTTY ? 
@@ -1243,7 +1170,7 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, if (vallen == NULL || val == NULL) RETURN(-EFAULT); - obd_getref(obddev); + lov_tgts_getref(obddev); if (KEY_IS(KEY_MAX_EASIZE)) { u32 max_stripe_count = min_t(u32, ld->ld_active_tgt_count, @@ -1261,7 +1188,7 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, rc = -EINVAL; } - obd_putref(obddev); + lov_tgts_putref(obddev); RETURN(rc); } @@ -1274,58 +1201,71 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; struct lov_tgt_desc *tgt; - int do_inactive = 0; - int no_set = 0; - u32 count; + bool do_inactive = false, no_set = false; u32 i; int rc = 0; int err; - ENTRY; - if (set == NULL) { - no_set = 1; - set = ptlrpc_prep_set(); - if (!set) - RETURN(-ENOMEM); - } + ENTRY; - obd_getref(obddev); - count = lov->desc.ld_tgt_count; + if (set == NULL) { + no_set = true; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + lov_tgts_getref(obddev); if (KEY_IS(KEY_CHECKSUM)) { - do_inactive = 1; + do_inactive = true; } else if (KEY_IS(KEY_CACHE_SET)) { LASSERT(lov->lov_cache == NULL); lov->lov_cache = val; - do_inactive = 1; + do_inactive = true; cl_cache_incref(lov->lov_cache); } - for (i = 0; i < count; i++) { + for (i = 0; i < lov->desc.ld_tgt_count; i++) { tgt = lov->lov_tgts[i]; - /* OST was disconnected */ - if (!tgt || !tgt->ltd_exp) - continue; + /* OST was disconnected */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; - /* OST is inactive and we don't want inactive OSCs */ - if (!tgt->ltd_active && !do_inactive) - continue; + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, vallen, val, set); - if (!rc) - rc = err; - } - obd_putref(obddev); - if (no_set) { - err = ptlrpc_set_wait(set); - if (!rc) - rc = err; - ptlrpc_set_destroy(set); - } - RETURN(rc); + if (rc == 0) + rc = err; + } + + /* cycle through MDC target for Data-on-MDT */ + for (i = 0; i < LOV_MDC_TGT_MAX; i++) { + struct obd_device *mdc; + + mdc = lov->lov_mdc_tgts[i].lmtd_mdc; + if (mdc == NULL) + continue; + + err = obd_set_info_async(env, mdc->obd_self_export, + keylen, key, vallen, val, set); + if (rc == 0) + rc = err; + } + + lov_tgts_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(env, set); + if (rc == 0) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); } void lov_stripe_lock(struct lov_stripe_md *md) @@ -1363,7 +1303,7 @@ static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, } /* for lov tgt */ - obd_getref(obd); + lov_tgts_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; @@ -1395,7 +1335,7 @@ static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; } } - obd_putref(obd); + lov_tgts_putref(obd); if (oqctl->qc_cmd == Q_GETOQUOTA) { oqctl->qc_dqblk.dqb_curspace = curspace; @@ -1411,7 +1351,6 @@ static struct obd_ops lov_obd_ops = { .o_connect = lov_connect, .o_disconnect = lov_disconnect, .o_statfs = lov_statfs, - .o_statfs_async = lov_statfs_async, .o_iocontrol = lov_iocontrol, .o_get_info = lov_get_info, .o_set_info_async = lov_set_info_async, @@ -1420,8 +1359,6 @@ static struct obd_ops lov_obd_ops = { .o_pool_rem = lov_pool_remove, .o_pool_add = lov_pool_add, .o_pool_del = lov_pool_del, - .o_getref = lov_getref, - .o_putref = lov_putref, 
.o_quotactl = lov_quotactl, }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c index c1cf76367697e..f9f3522806a47 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,6 +37,8 @@ #define DEBUG_SUBSYSTEM S_LOV +#include + #include "lov_cl_internal.h" static inline struct lov_device *lov_object_dev(struct lov_object *obj) @@ -74,6 +76,8 @@ struct lov_layout_operations { struct cl_object *obj, struct cl_io *io); int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr); + int (*llo_flush)(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); }; static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); @@ -89,30 +93,40 @@ static void lov_lsm_put(struct lov_stripe_md *lsm) * Lov object layout operations. * */ -static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, struct lov_stripe_md *lsm, - const struct cl_object_conf *conf, - union lov_layout_state *state) + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) { - return 0; + struct lu_object *o; + + ENTRY; + + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); } -static struct cl_object *lov_sub_find(const struct lu_env *env, - struct cl_device *dev, - const struct lu_fid *fid, - const struct cl_object_conf *conf) +static int lov_page_slice_fixup(struct lov_object *lov, + struct cl_object *stripe) { - struct lu_object *o; + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + struct cl_object *o; - ENTRY; - o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); - LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); - RETURN(lu2cl(o)); + if (stripe == NULL) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + + cl_object_for_each(o, stripe) + o->co_slice_off += hdr->coh_page_bufsize; + + return cl_object_header(stripe)->coh_page_bufsize; } static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, - struct cl_object *subobj, struct lov_layout_raid0 *r0, - struct lov_oinfo *oinfo, int idx) + struct cl_object *subobj, struct lov_oinfo *oinfo, + int idx) { struct cl_object_header *hdr; struct cl_object_header *subhdr; @@ -132,7 +146,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, return -EIO; } - hdr = cl_object_header(lov2cl(lov)); + hdr = cl_object_header(lov2cl(lov)); subhdr = cl_object_header(subobj); CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID @@ -145,13 +159,14 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, spin_lock(&subhdr->coh_attr_guard); parent = subhdr->coh_parent; if (parent == NULL) { + struct lovsub_object *lso = cl2lovsub(subobj); + subhdr->coh_parent = hdr; spin_unlock(&subhdr->coh_attr_guard); subhdr->coh_nesting = hdr->coh_nesting + 1; lu_object_ref_add(&subobj->co_lu, "lov-parent", lov); - 
r0->lo_sub[stripe] = cl2lovsub(subobj); - r0->lo_sub[stripe]->lso_super = lov; - r0->lo_sub[stripe]->lso_index = idx; + lso->lso_super = lov; + lso->lso_index = idx; result = 0; } else { struct lu_object *old_obj; @@ -181,42 +196,28 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, return result; } -static int lov_page_slice_fixup(struct lov_object *lov, - struct cl_object *stripe) -{ - struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); - struct cl_object *o; - - if (stripe == NULL) - return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - - cfs_size_round(sizeof(struct lov_page)); - - cl_object_for_each(o, stripe) - o->co_slice_off += hdr->coh_page_bufsize; - - return cl_object_header(stripe)->coh_page_bufsize; -} - static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, int index, - struct lov_layout_raid0 *r0) + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) { - struct lov_thread_info *lti = lov_env_info(env); - struct cl_object_conf *subconf = <i->lti_stripe_conf; - struct lu_fid *ofid = <i->lti_fid; - struct cl_object *stripe; + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lu_fid *ofid = <i->lti_fid; + struct cl_object *stripe; struct lov_stripe_md_entry *lse = lov_lse(lov, index); int result; - int psz; + int psz, sz; int i; ENTRY; spin_lock_init(&r0->lo_sub_lock); r0->lo_nr = lse->lsme_stripe_count; - LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + r0->lo_trunc_stripeno = -1; - OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); if (r0->lo_sub == NULL) GOTO(out, result = -ENOMEM); @@ -255,7 +256,7 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, if (IS_ERR(stripe)) GOTO(out, result = PTR_ERR(stripe)); - result = lov_init_sub(env, lov, stripe, r0, oinfo, + result = lov_init_sub(env, lov, stripe, oinfo, lov_comp_index(index, i)); if (result == -EAGAIN) { /* try again */ --i; @@ -264,7 +265,9 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, } if (result == 0) { - int sz = lov_page_slice_fixup(lov, stripe); + r0->lo_sub[i] = cl2lovsub(stripe); + + sz = lov_page_slice_fixup(lov, stripe); LASSERT(ergo(psz > 0, psz == sz)); psz = sz; } @@ -275,16 +278,369 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, RETURN(result); } +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + wait_queue_head_t *wq; + wait_queue_entry_t *waiter; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... 
wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_free() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(wq, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). */ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(wq, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. + */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + r0->lo_sub = NULL; + } +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lov_layout_entry *lle) +{ + const struct lov_layout_raid0 *r0 = &lle->lle_raid0; + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. 
+ */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + *lov_attr = attr; + } + + return result; +} + +static struct lov_comp_layout_entry_ops raid0_ops = { + .lco_init = lov_init_raid0, + .lco_fini = lov_fini_raid0, + .lco_getattr = lov_attr_get_raid0, +}; + +static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_dom *dom = &lle->lle_dom; + struct lov_oinfo *loi = dom->lo_loi; + struct cl_attr *attr = &dom->lo_dom_r0.lo_attr; + + if (dom->lo_dom_r0.lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + + cl_lvb2attr(attr, &loi->loi_lvb); + + /* DoM component size can be bigger than stripe size after + * client's setattr RPC, so do not count anything beyond + * component end. Alternatively, check that limit on server + * and do not allow size overflow there. */ + if (attr->cat_size > lle->lle_extent->e_end) + attr->cat_size = lle->lle_extent->e_end; + + attr->cat_kms = attr->cat_size; + + dom->lo_dom_r0.lo_attr_valid = 1; + *lov_attr = attr; + + return 0; +} + +/** + * Lookup FLD to get MDS index of the given DOM object FID. + * + * \param[in] ld LOV device + * \param[in] fid FID to lookup + * \param[out] nr index in MDC array to return back + * + * \retval 0 and \a mds filled with MDS index if successful + * \retval negative value on error + */ +static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid, + __u32 *nr) +{ + __u32 mds_idx; + int i, rc; + + ENTRY; + + rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid), + &mds_idx, LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("%s: error while looking for mds number. Seq %#llx" + ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + mds_idx, PFID(fid)); + + /* find proper MDC device in the array */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (ld->ld_md_tgts[i].ldm_mdc != NULL && + ld->ld_md_tgts[i].ldm_idx == mds_idx) + break; + } + + if (i == ld->ld_md_tgts_nr) { + CERROR("%s: cannot find corresponding MDC device for mds #%x " + "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + mds_idx, PFID(fid)); + rc = -EINVAL; + } else { + *nr = i; + } + RETURN(rc); +} + +/** + * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object. + * + * Init the DOM object for the first time. It prepares also RAID0 entry + * for it to use in common methods with ordinary RAID0 layout entries. 
+ * + * \param[in] env execution environment + * \param[in] dev LOV device + * \param[in] lov LOV object + * \param[in] index Composite layout entry index in LSM + * \param[in] lle Composite LOV layout entry + */ +static int lov_init_dom(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) +{ + struct lov_thread_info *lti = lov_env_info(env); + struct lov_stripe_md_entry *lsme = lov_lse(lov, index); + struct cl_object *clo; + struct lu_object *o = lov2lu(lov); + const struct lu_fid *fid = lu_object_fid(o); + struct cl_device *mdcdev; + struct lov_oinfo *loi = NULL; + struct cl_object_conf *sconf = <i->lti_stripe_conf; + + int rc; + __u32 idx = 0; + + ENTRY; + + LASSERT(index == 0); + + /* find proper MDS device */ + rc = lov_fld_lookup(dev, fid, &idx); + if (rc) + RETURN(rc); + + LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL, + "LOV md target[%u] is NULL\n", idx); + + /* check lsm is DOM, more checks are needed */ + LASSERT(lsme->lsme_stripe_count == 0); + + /* + * Create lower cl_objects. + */ + mdcdev = dev->ld_md_tgts[idx].ldm_mdc; + + LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n"); + + /* DoM object has no oinfo in LSM entry, create it exclusively */ + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + RETURN(-ENOMEM); + + fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi); + + sconf->u.coc_oinfo = loi; +again: + clo = lov_sub_find(env, mdcdev, fid, sconf); + if (IS_ERR(clo)) + GOTO(out, rc = PTR_ERR(clo)); + + rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0)); + if (rc == -EAGAIN) /* try again */ + goto again; + else if (rc != 0) + GOTO(out, rc); + + lle->lle_dom.lo_dom = cl2lovsub(clo); + spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock); + lle->lle_dom.lo_dom_r0.lo_nr = 1; + lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom; + lle->lle_dom.lo_loi = loi; + + rc = lov_page_slice_fixup(lov, clo); + RETURN(rc); + +out: + if (loi != NULL) + OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab); + return rc; +} + +/** + * Implementation of lov_layout_operations::llo_fini for DOM object. + * + * Finish the DOM object and free related memory. 
+ * + * \param[in] env execution environment + * \param[in] lov LOV object + * \param[in] state LOV layout state + */ +static void lov_fini_dom(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + if (lle->lle_dom.lo_dom != NULL) + lle->lle_dom.lo_dom = NULL; + if (lle->lle_dom.lo_loi != NULL) + OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab); +} + +static struct lov_comp_layout_entry_ops dom_ops = { + .lco_init = lov_init_dom, + .lco_fini = lov_fini_dom, + .lco_getattr = lov_attr_get_dom, +}; + static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, struct lov_object *lov, struct lov_stripe_md *lsm, const struct cl_object_conf *conf, union lov_layout_state *state) { struct lov_layout_composite *comp = &state->composite; + struct lov_layout_entry *lle; + struct lov_mirror_entry *lre; unsigned int entry_count; unsigned int psz = 0; + unsigned int mirror_count; + int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK; int result = 0; - int i; + unsigned int seq; + int i, j; ENTRY; @@ -293,38 +649,157 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, lov->lo_lsm = lsm_addref(lsm); lov->lo_layout_invalid = true; + dump_lsm(D_INODE, lsm); + entry_count = lsm->lsm_entry_count; - comp->lo_entry_count = entry_count; + + spin_lock_init(&comp->lo_write_lock); + comp->lo_flags = lsm->lsm_flags; + comp->lo_mirror_count = lsm->lsm_mirror_count + 1; + comp->lo_entry_count = lsm->lsm_entry_count; + comp->lo_preferred_mirror = -1; + + if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1)) + RETURN(-EINVAL); + + OBD_ALLOC(comp->lo_mirrors, + comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); + if (comp->lo_mirrors == NULL) + RETURN(-ENOMEM); OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries)); if (comp->lo_entries == NULL) RETURN(-ENOMEM); - for (i = 0; i < entry_count; i++) { - struct lov_layout_entry *le = &comp->lo_entries[i]; + /* Initiate all entry types and extents data at first */ + for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) { + int mirror_id = 0; + + lle = &comp->lo_entries[i]; + + lle->lle_lsme = lsm->lsm_entries[i]; + lle->lle_type = lov_entry_type(lle->lle_lsme); + switch (lle->lle_type) { + case LOV_PATTERN_RAID0: + lle->lle_comp_ops = &raid0_ops; + break; + case LOV_PATTERN_MDT: + lle->lle_comp_ops = &dom_ops; + break; + default: + CERROR("%s: unknown composite layout entry type %i\n", + lov2obd(dev->ld_lov)->obd_name, + lsm->lsm_entries[i]->lsme_pattern); + dump_lsm(D_ERROR, lsm); + RETURN(-EIO); + } + + lle->lle_extent = &lle->lle_lsme->lsme_extent; + lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE); + + if (flr_state != LCM_FL_NONE) + mirror_id = mirror_id_of(lle->lle_lsme->lsme_id); + + lre = &comp->lo_mirrors[j]; + if (i > 0) { + if (mirror_id == lre->lre_mirror_id) { + lre->lre_valid |= lle->lle_valid; + lre->lre_stale |= !lle->lle_valid; + lre->lre_end = i; + continue; + } + + /* new mirror detected, assume that the mirrors + * are shorted in layout */ + ++mirror_count; + ++j; + if (j >= comp->lo_mirror_count) + break; + + lre = &comp->lo_mirrors[j]; + } + + /* entries must be sorted by mirrors */ + lre->lre_mirror_id = mirror_id; + lre->lre_start = lre->lre_end = i; + lre->lre_preferred = !!(lle->lle_lsme->lsme_flags & + LCME_FL_PREF_RD); + lre->lre_valid = lle->lle_valid; + lre->lre_stale = !lle->lle_valid; + } + + /* sanity check for FLR */ + if (mirror_count != comp->lo_mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have the # of mirrors it claims, %u/%u\n", 
+ PFID(lu_object_fid(lov2lu(lov))), mirror_count, + comp->lo_mirror_count + 1); + + GOTO(out, result = -EINVAL); + } + + lov_foreach_layout_entry(lov, lle) { + int index = lov_layout_entry_index(lov, lle); - le->lle_extent = lsm->lsm_entries[i]->lsme_extent; /** * If the component has not been init-ed on MDS side, for * PFL layout, we'd know that the components beyond this one * will be dynamically init-ed later on file write/trunc ops. */ - if (!lsm_entry_inited(lsm, i)) + if (!lsme_inited(lle->lle_lsme)) continue; - result = lov_init_raid0(env, dev, lov, i, &le->lle_raid0); + result = lle->lle_comp_ops->lco_init(env, dev, lov, index, + conf, lle); if (result < 0) break; LASSERT(ergo(psz > 0, psz == result)); psz = result; } + if (psz > 0) cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; + /* decide the preferred mirror. It uses the hash value of lov_object + * so that different clients would use different mirrors for read. */ + mirror_count = 0; + seq = hash_long((unsigned long)lov, 8); + for (i = 0; i < comp->lo_mirror_count; i++) { + unsigned int idx = (i + seq) % comp->lo_mirror_count; + + lre = lov_mirror_entry(lov, idx); + if (lre->lre_stale) + continue; + + mirror_count++; /* valid mirror */ + + if (lre->lre_preferred || comp->lo_preferred_mirror < 0) + comp->lo_preferred_mirror = idx; + } + if (!mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have any valid mirrors\n", + PFID(lu_object_fid(lov2lu(lov)))); + + comp->lo_preferred_mirror = 0; + } + + LASSERT(comp->lo_preferred_mirror >= 0); + + EXIT; +out: return result > 0 ? 0 : result; } +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + static int lov_init_released(const struct lu_env *env, struct lov_device *dev, struct lov_object *lov, struct lov_stripe_md *lsm, @@ -339,43 +814,6 @@ static int lov_init_released(const struct lu_env *env, return 0; } -static struct cl_object *lov_find_subobj(const struct lu_env *env, - struct lov_object *lov, - struct lov_stripe_md *lsm, - int index) -{ - struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); - struct lov_thread_info *lti = lov_env_info(env); - struct lu_fid *ofid = <i->lti_fid; - struct lov_oinfo *oinfo; - struct cl_device *subdev; - int entry = lov_comp_entry(index); - int stripe = lov_comp_stripe(index); - int ost_idx; - int rc; - struct cl_object *result; - - if (lov->lo_type != LLT_COMP) - GOTO(out, result = NULL); - - if (entry >= lsm->lsm_entry_count || - stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) - GOTO(out, result = NULL); - - oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; - ost_idx = oinfo->loi_ost_idx; - rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); - if (rc != 0) - GOTO(out, result = NULL); - - subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); - result = lov_sub_find(env, subdev, ofid, NULL); -out: - if (result == NULL) - result = ERR_PTR(-EINVAL); - return result; -} - static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) { @@ -385,77 +823,6 @@ static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, return 0; } -static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, - struct lov_layout_raid0 *r0, - struct lovsub_object *los, int idx) -{ - struct cl_object *sub; - struct lu_site *site; - struct lu_site_bkt_data *bkt; - wait_queue_entry_t *waiter; - - 
LASSERT(r0->lo_sub[idx] == los); - - sub = lovsub2cl(los); - site = sub->co_lu.lo_dev->ld_site; - bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); - - cl_object_kill(env, sub); - /* release a reference to the sub-object and ... */ - lu_object_ref_del(&sub->co_lu, "lov-parent", lov); - cl_object_put(env, sub); - - /* ... wait until it is actually destroyed---sub-object clears its - * ->lo_sub[] slot in lovsub_object_fini() */ - if (r0->lo_sub[idx] == los) { - waiter = &lov_env_info(env)->lti_waiter; - init_waitqueue_entry(waiter, current); - add_wait_queue(&bkt->lsb_marche_funebre, waiter); - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) { - /* this wait-queue is signaled at the end of - * lu_object_free(). */ - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock(&r0->lo_sub_lock); - if (r0->lo_sub[idx] == los) { - spin_unlock(&r0->lo_sub_lock); - schedule(); - } else { - spin_unlock(&r0->lo_sub_lock); - set_current_state(TASK_RUNNING); - break; - } - } - remove_wait_queue(&bkt->lsb_marche_funebre, waiter); - } - LASSERT(r0->lo_sub[idx] == NULL); -} - -static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, - struct lov_layout_raid0 *r0) -{ - ENTRY; - - if (r0->lo_sub != NULL) { - int i; - - for (i = 0; i < r0->lo_nr; ++i) { - struct lovsub_object *los = r0->lo_sub[i]; - - if (los != NULL) { - cl_object_prune(env, &los->lso_cl); - /* - * If top-level object is to be evicted from - * the cache, so are its sub-objects. - */ - lov_subobject_kill(env, lov, r0, los, i); - } - } - } - - EXIT; -} - static int lov_delete_composite(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) @@ -470,7 +837,7 @@ static int lov_delete_composite(const struct lu_env *env, lov_layout_wait(env, lov); if (comp->lo_entries) lov_foreach_layout_entry(lov, entry) - lov_delete_raid0(env, lov, &entry->lle_raid0); + lov_delete_raid0(env, lov, entry); RETURN(0); } @@ -481,15 +848,6 @@ static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); } -static void lov_fini_raid0(const struct lu_env *env, - struct lov_layout_raid0 *r0) -{ - if (r0->lo_sub != NULL) { - OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); - r0->lo_sub = NULL; - } -} - static void lov_fini_composite(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) @@ -501,13 +859,21 @@ static void lov_fini_composite(const struct lu_env *env, struct lov_layout_entry *entry; lov_foreach_layout_entry(lov, entry) - lov_fini_raid0(env, &entry->lle_raid0); + entry->lle_comp_ops->lco_fini(env, entry); OBD_FREE(comp->lo_entries, comp->lo_entry_count * sizeof(*comp->lo_entries)); comp->lo_entries = NULL; } + if (comp->lo_mirrors != NULL) { + OBD_FREE(comp->lo_mirrors, + comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); + comp->lo_mirrors = NULL; + } + + memset(comp, 0, sizeof(*comp)); + dump_lsm(D_INODE, lov->lo_lsm); lov_free_memmd(&lov->lo_lsm); @@ -530,24 +896,6 @@ static int lov_print_empty(const struct lu_env *env, void *cookie, return 0; } -static int lov_print_raid0(const struct lu_env *env, void *cookie, - lu_printer_t p, struct lov_layout_raid0 *r0) -{ - int i; - - for (i = 0; i < r0->lo_nr; ++i) { - struct lu_object *sub; - - if (r0->lo_sub[i] != NULL) { - sub = lovsub2lu(r0->lo_sub[i]); - lu_object_print(env, cookie, p, sub); - } else { - (*p)(env, cookie, "sub %d absent\n", i); - } - } - return 0; -} - static int lov_print_composite(const struct lu_env *env, 
void *cookie, lu_printer_t p, const struct lu_object *o) { @@ -563,12 +911,15 @@ static int lov_print_composite(const struct lu_env *env, void *cookie, for (i = 0; i < lsm->lsm_entry_count; i++) { struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + struct lov_layout_entry *lle = lov_entry(lov, i); - (*p)(env, cookie, DEXT ": { 0x%08X, %u, %u, %#x, %u, %u }\n", + (*p)(env, cookie, + DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n", PEXT(&lse->lsme_extent), lse->lsme_magic, - lse->lsme_id, lse->lsme_layout_gen, lse->lsme_flags, - lse->lsme_stripe_count, lse->lsme_stripe_size); - lov_print_raid0(env, cookie, p, lov_r0(lov, i)); + lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen, + lse->lsme_flags, lse->lsme_stripe_count, + lse->lsme_stripe_size); + lov_print_raid0(env, cookie, p, lle); } return 0; @@ -602,51 +953,6 @@ static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, return 0; } -static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, - unsigned int index, struct lov_layout_raid0 *r0) - -{ - struct lov_stripe_md *lsm = lov->lo_lsm; - struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; - struct cl_attr *attr = &r0->lo_attr; - __u64 kms = 0; - int result = 0; - - if (r0->lo_attr_valid) - return 0; - - memset(lvb, 0, sizeof(*lvb)); - - /* XXX: timestamps can be negative by sanity:test_39m, - * how can it be? */ - lvb->lvb_atime = LLONG_MIN; - lvb->lvb_ctime = LLONG_MIN; - lvb->lvb_mtime = LLONG_MIN; - - /* - * XXX that should be replaced with a loop over sub-objects, - * doing cl_object_attr_get() on them. But for now, let's - * reuse old lov code. - */ - - /* - * XXX take lsm spin-lock to keep lov_merge_lvb_kms() - * happy. It's not needed, because new code uses - * ->coh_attr_guard spin-lock to protect consistency of - * sub-object attributes. - */ - lov_stripe_lock(lsm); - result = lov_merge_lvb_kms(lsm, index, lvb, &kms); - lov_stripe_unlock(lsm); - if (result == 0) { - cl_lvb2attr(attr, lvb); - attr->cat_kms = kms; - r0->lo_attr_valid = 1; - } - - return result; -} - static int lov_attr_get_composite(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr) @@ -654,25 +960,34 @@ static int lov_attr_get_composite(const struct lu_env *env, struct lov_object *lov = cl2lov(obj); struct lov_layout_entry *entry; int result = 0; - int index = 0; ENTRY; attr->cat_size = 0; attr->cat_blocks = 0; lov_foreach_layout_entry(lov, entry) { - struct lov_layout_raid0 *r0 = &entry->lle_raid0; - struct cl_attr *lov_attr = &r0->lo_attr; + struct cl_attr *lov_attr = NULL; + int index = lov_layout_entry_index(lov, entry); + + if (!entry->lle_valid) + continue; /* PFL: This component has not been init-ed. 
*/ if (!lsm_entry_inited(lov->lo_lsm, index)) - break; + continue; - result = lov_attr_get_raid0(env, lov, index, r0); - if (result != 0) - break; + result = entry->lle_comp_ops->lco_getattr(env, lov, index, + entry, &lov_attr); + if (result < 0) + RETURN(result); - index++; + if (lov_attr == NULL) + continue; + + CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu " + "b=%llu\n", index - 1, lov_attr->cat_size, + lov_attr->cat_mtime, lov_attr->cat_atime, + lov_attr->cat_ctime, lov_attr->cat_blocks); /* merge results */ attr->cat_blocks += lov_attr->cat_blocks; @@ -687,29 +1002,58 @@ static int lov_attr_get_composite(const struct lu_env *env, if (attr->cat_mtime < lov_attr->cat_mtime) attr->cat_mtime = lov_attr->cat_mtime; } - RETURN(result); + + RETURN(0); +} + +static int lov_flush_composite(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *lle; + int rc = -ENODATA; + + ENTRY; + + lov_foreach_layout_entry(lov, lle) { + if (!lsme_is_dom(lle->lle_lsme)) + continue; + rc = cl_object_flush(env, lovsub2cl(lle->lle_dom.lo_dom), lock); + break; + } + + RETURN(rc); +} + +static int lov_flush_empty(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return 0; } const static struct lov_layout_operations lov_dispatch[] = { - [LLT_EMPTY] = { - .llo_init = lov_init_empty, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_empty, - .llo_print = lov_print_empty, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_empty, + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, .llo_getattr = lov_attr_get_empty, - }, - [LLT_RELEASED] = { - .llo_init = lov_init_released, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_released, - .llo_print = lov_print_released, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_released, + .llo_flush = lov_flush_empty, + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, }, [LLT_COMP] = { .llo_init = lov_init_composite, @@ -720,6 +1064,7 @@ const static struct lov_layout_operations lov_dispatch[] = { .llo_lock_init = lov_lock_init_composite, .llo_io_init = lov_io_init_composite, .llo_getattr = lov_attr_get_composite, + .llo_flush = lov_flush_composite, }, }; @@ -881,12 +1226,11 @@ static int lov_layout_change(const struct lu_env *unused, CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", PFID(lu_object_fid(lov2lu(lov))), lov, llt); - lov->lo_type = LLT_EMPTY; - /* page bufsize fixup */ cl_object_header(&lov->lo_cl)->coh_page_bufsize -= lov_page_slice_fixup(lov, NULL); + lov->lo_type = llt; rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); if (rc != 0) { struct obd_device *obd = lov2obd(lov_dev->ld_lov); @@ -896,11 +1240,10 @@ static int lov_layout_change(const struct lu_env *unused, new_ops->llo_delete(env, lov, state); new_ops->llo_fini(env, lov, state); /* this file becomes an EMPTY file. 
*/ + lov->lo_type = LLT_EMPTY; GOTO(out, rc); } - lov->lo_type = llt; - out: cl_env_put(env, &refcheck); RETURN(rc); @@ -1056,7 +1399,7 @@ int lov_page_init(const struct lu_env *env, struct cl_object *obj, int lov_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved); CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n", PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, @@ -1258,6 +1601,43 @@ struct fiemap_state { bool fs_enough; }; +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, struct lov_stripe_md *lsm, struct fiemap *fiemap, size_t *buflen, struct ll_fiemap_info_key *fmkey, @@ -1298,7 +1678,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, if (lun_start == lun_end) return 0; - req_fm_len = obd_object_end - lun_start; + req_fm_len = obd_object_end - lun_start + 1; fs->fs_fm->fm_length = 0; len_mapped_single_call = 0; @@ -1341,7 +1721,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, fs->fs_fm->fm_mapped_extents = 1; fm_ext[0].fe_logical = lun_start; - fm_ext[0].fe_length = obd_object_end - lun_start; + fm_ext[0].fe_length = obd_object_end - lun_start + 1; fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; goto inactive_tgt; @@ -1456,8 +1836,11 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, ENTRY; lsm = lov_lsm_addref(cl2lov(obj)); - if (lsm == NULL) - RETURN(-ENODATA); + if (lsm == NULL) { + /* no extent: there is no object for mapping */ + fiemap->fm_mapped_extents = 0; + return 0; + } if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { /** @@ -1471,6 +1854,10 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, GOTO(out_lsm, rc = -ENOTSUPP); } + /* No support for DOM layout yet. 
*/ + if (lsme_is_dom(lsm->lsm_entries[0])) + GOTO(out_lsm, rc = -ENOTSUPP); + if (lsm->lsm_is_released) { if (fiemap->fm_start < fmkey->lfik_oa.o_size) { /** @@ -1537,6 +1924,7 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, if (start_entry == -1 || end_entry == -1) GOTO(out_fm_local, rc = -EINVAL); + /* TODO: rewrite it with lov_foreach_io_layout() */ for (entry = start_entry; entry <= end_entry; entry++) { lsme = lsm->lsm_entries[entry]; @@ -1666,6 +2054,13 @@ static loff_t lov_object_maxbytes(struct cl_object *obj) return maxbytes; } +static int lov_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_flush, true, env, obj, + lock); +} + static const struct cl_object_operations lov_ops = { .coo_page_init = lov_page_init, .coo_lock_init = lov_lock_init, @@ -1677,6 +2072,7 @@ static const struct cl_object_operations lov_ops = { .coo_layout_get = lov_object_layout_get, .coo_maxbytes = lov_object_maxbytes, .coo_fiemap = lov_object_fiemap, + .coo_object_flush = lov_object_flush }; static const struct lu_object_operations lov_lu_obj_ops = { diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c index 3ff0a38a7e263..de2e6c47da8ee 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,12 +38,15 @@ #include "lov_internal.h" -static loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) { struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; LASSERT(index < lsm->lsm_entry_count); + if (lsme_is_dom(entry)) + return (loff_t)entry->lsme_stripe_size; + return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; } @@ -55,10 +58,11 @@ u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, unsigned long stripe_size; loff_t swidth; loff_t lov_size; - ENTRY; - if (ost_size == 0) - RETURN(0); + ENTRY; + + if (ost_size == 0) + RETURN(0); swidth = stripe_width(lsm, index); @@ -69,7 +73,7 @@ u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, else lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; - RETURN(lov_size); + RETURN(lov_size); } /** @@ -86,7 +90,8 @@ pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, return offset >> PAGE_SHIFT; } -/* we have an offset in file backed by an lov and want to find out where +/* + * we have an offset in file backed by an lov and want to find out where * that offset lands in our given stripe of the file. for the easy * case where the offset is within the stripe, we just have to scale the * offset down to make it relative to the stripe instead of the lov. @@ -133,7 +138,8 @@ pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, * this function returns < 0 when the offset was "before" the stripe and * was moved forward to the start of the stripe in question; 0 when it * falls in the stripe and no shifting was done; > 0 when the offset - * was outside the stripe and was pulled back to its final byte. */ + * was outside the stripe and was pulled back to its final byte. 
+ */ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, int stripeno, loff_t *obdoff) { @@ -141,12 +147,12 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, loff_t stripe_off; loff_t this_stripe; loff_t swidth; - int ret = 0; + int ret = 0; - if (lov_off == OBD_OBJECT_EOF) { - *obdoff = OBD_OBJECT_EOF; - return 0; - } + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } swidth = stripe_width(lsm, index); @@ -154,23 +160,24 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, stripe_off = lov_do_div64(lov_off, swidth); this_stripe = (loff_t)stripeno * ssize; - if (stripe_off < this_stripe) { - stripe_off = 0; - ret = -1; - } else { - stripe_off -= this_stripe; - - if (stripe_off >= ssize) { - stripe_off = ssize; - ret = 1; - } - } - - *obdoff = lov_off * ssize + stripe_off; - return ret; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; } -/* Given a whole-file size and a stripe number, give the file size which +/* + * Given a whole-file size and a stripe number, give the file size which * corresponds to the individual object of that stripe. * * This behaves basically in the same was as lov_stripe_offset, except that @@ -197,8 +204,8 @@ loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, loff_t this_stripe; loff_t swidth; - if (file_size == OBD_OBJECT_EOF) - return OBD_OBJECT_EOF; + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; swidth = stripe_width(lsm, index); @@ -206,35 +213,39 @@ loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, stripe_off = lov_do_div64(file_size, swidth); this_stripe = (loff_t)stripeno * ssize; - if (stripe_off < this_stripe) { - /* Move to end of previous stripe, or zero */ - if (file_size > 0) { - file_size--; - stripe_off = ssize; - } else { - stripe_off = 0; - } - } else { - stripe_off -= this_stripe; - - if (stripe_off >= ssize) { - /* Clamp to end of this stripe */ - stripe_off = ssize; - } - } - - return (file_size * ssize + stripe_off); + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); } -/* given an extent in an lov and a stripe, calculate the extent of the stripe +/* + * given an extent in an lov and a stripe, calculate the extent of the stripe * that is contained within the lov extent. this returns true if the given - * stripe does intersect with the lov extent. */ + * stripe does intersect with the lov extent. + * + * Closed interval [@obd_start, @obd_end] will be returned. 
+ */ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, struct lu_extent *ext, u64 *obd_start, u64 *obd_end) { struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; u64 start, end; - int start_side, end_side; + int start_side, end_side; if (!lu_extent_is_overlapped(ext, &entry->lsme_extent)) return 0; @@ -250,24 +261,28 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n", start, end, start_side, *obd_start, *obd_end, end_side); - /* this stripe doesn't intersect the file extent when neither - * start or the end intersected the stripe and obd_start and - * obd_end got rounded up to the save value. */ - if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) - return 0; - - /* as mentioned in the lov_stripe_offset commentary, end - * might have been shifted in the wrong direction. This - * happens when an end offset is before the stripe when viewed - * through the "mod stripe size" math. we detect it being shifted - * in the wrong direction and touch it up. - * interestingly, this can't underflow since end must be > start - * if we passed through the previous check. - * (should we assert for that somewhere?) */ - if (end_side != 0) - (*obd_end)--; - - return 1; + /* + * this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. + */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* + * as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) + */ + if (end_side != 0) + (*obd_end)--; + + return 1; } /* compute which stripe number "lov_off" will be written into */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c index dd29ff51dcc1c..6fe3c2ff5bd5b 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,9 +38,6 @@ #define DEBUG_SUBSYSTEM S_LOV -#include -#include - #include #include #include @@ -53,16 +50,16 @@ void lov_dump_lmm_common(int level, void *lmmp) { struct lov_mds_md *lmm = lmmp; - struct ost_id oi; + struct ost_id oi; lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); - CDEBUG_LIMIT(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", - POSTID(&oi), le32_to_cpu(lmm->lmm_magic), - le32_to_cpu(lmm->lmm_pattern)); - CDEBUG_LIMIT(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", - le32_to_cpu(lmm->lmm_stripe_size), - le16_to_cpu(lmm->lmm_stripe_count), - le16_to_cpu(lmm->lmm_layout_gen)); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); } static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, @@ -71,9 +68,8 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, int i; if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { - CDEBUG_LIMIT(level, - "bad stripe_count %u > max_stripe_count %u\n", - stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); return; } @@ -81,22 +77,22 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, struct ost_id oi; ostid_le_to_cpu(&lod->l_ost_oi, &oi); - CDEBUG_LIMIT(level, "stripe %u idx %u subobj "DOSTID"\n", i, - le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); } } void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) { - lov_dump_lmm_common(level, lmm); - lov_dump_lmm_objects(level, lmm->lmm_objects, - le16_to_cpu(lmm->lmm_stripe_count)); + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); } void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) { lov_dump_lmm_common(level, lmm); - CDEBUG_LIMIT(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); lov_dump_lmm_objects(level, lmm->lmm_objects, le16_to_cpu(lmm->lmm_stripe_count)); } @@ -114,8 +110,8 @@ void lov_dump_lmm(int level, void *lmm) lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); break; default: - CDEBUG_LIMIT(level, "unrecognized lmm_magic %x, assuming %x\n", - magic, LOV_MAGIC_V1); + CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); lov_dump_lmm_common(level, lmm); break; } @@ -137,6 +133,7 @@ ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, struct lov_ost_data_v1 *lmm_objects; size_t lmm_size; unsigned int i; + ENTRY; lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, @@ -147,7 +144,8 @@ ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, if (buf_size < lmm_size) RETURN(-ERANGE); - /* lmmv1 and lmmv3 point to the same struct and have the + /* + * lmmv1 and lmmv3 point to the same struct and have the * same first fields */ lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); @@ -195,6 +193,7 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, unsigned int offset; unsigned int size; unsigned int i; + ENTRY; if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) 
@@ -210,6 +209,8 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); lcmv1->lcm_size = cpu_to_le32(lmm_size); lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); + lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags); + lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count); lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; @@ -224,6 +225,9 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, lcme->lcme_id = cpu_to_le32(lsme->lsme_id); lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lcme->lcme_timestamp = + cpu_to_le64(lsme->lsme_timestamp); lcme->lcme_extent.e_start = cpu_to_le64(lsme->lsme_extent.e_start); lcme->lcme_extent.e_end = @@ -286,8 +290,10 @@ __u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) if (!stripe_count) stripe_count = 1; - /* stripe count is based on whether ldiskfs can handle - * larger EA sizes */ + /* + * stripe count is based on whether ldiskfs can handle + * larger EA sizes + */ if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && lov->lov_ocd.ocd_max_easize) max_stripes = lov_mds_md_max_stripe_count( @@ -313,7 +319,8 @@ int lov_free_memmd(struct lov_stripe_md **lsmp) return refc; } -/* Unpack LOV object metadata from disk storage. It is packed in LE byte +/* + * Unpack LOV object metadata from disk storage. It is packed in LE byte * order and is opaque to the networking layer. */ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, @@ -322,6 +329,7 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, const struct lsm_operations *op; struct lov_stripe_md *lsm; u32 magic; + ENTRY; if (buf_size < sizeof(magic)) @@ -329,7 +337,7 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, magic = le32_to_cpu(*(u32 *)buf); op = lsm_op_find(magic); - if (op == NULL) + if (!op) RETURN(ERR_PTR(-EINVAL)); lsm = op->lsm_unpackmd(lov, buf, buf_size); @@ -337,7 +345,8 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, RETURN(lsm); } -/* Retrieve object striping information. +/* + * Retrieve object striping information. * * @lump is a pointer to an in-core struct with lmm_ost_count indicating * the maximum number of OST indices which will fit in the user buffer. 
@@ -353,10 +362,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ struct lov_mds_md *lmmk, *lmm; struct lov_user_md_v1 lum; - size_t lmmk_size; - ssize_t lmm_size, lum_size = 0; - static bool printed; - int rc = 0; + size_t lmmk_size, lum_size = 0; + ssize_t lmm_size; + int rc = 0; + ENTRY; if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && @@ -366,18 +375,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, GOTO(out, rc = -EIO); } - if (!printed) { - LCONSOLE_WARN("%s: using old ioctl(LL_IOC_LOV_GETSTRIPE) on " - DFID", use llapi_layout_get_by_path()\n", - current->comm, - PFID(&obj->lo_cl.co_lu.lo_header->loh_fid)); - printed = true; - } - lmmk_size = lov_comp_md_size(lsm); OBD_ALLOC_LARGE(lmmk, lmmk_size); - if (lmmk == NULL) + if (!lmmk) GOTO(out, rc = -ENOMEM); lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); @@ -397,8 +398,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, } } - /* Legacy appication passes limited buffer, we need to figure out - * the user buffer size by the passed in lmm_stripe_count. */ + /* + * Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. + */ if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) GOTO(out_free, rc = -EFAULT); @@ -410,8 +413,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, if (lum_size != 0) { struct lov_mds_md *comp_md = lmmk; - /* Legacy app (ADIO for instance) treats the layout as V1/V3 - * blindly, we'd return a reasonable V1/V3 for them. */ + /* + * Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. + */ if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { struct lov_comp_md_v1 *comp_v1; struct cl_object *cl_obj; @@ -424,8 +429,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, cl_object_attr_get(env, cl_obj, &attr); cl_object_attr_unlock(cl_obj); - /* return the last instantiated component if file size - * is non-zero, otherwise, return the last component.*/ + /* + * return the last instantiated component if file size + * is non-zero, otherwise, return the last component. + */ comp_v1 = (struct lov_comp_md_v1 *)lmmk; i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; for (; i < comp_v1->lcm_entry_count; i++) { @@ -437,10 +444,11 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, i--; comp_md = (struct lov_mds_md *)((char *)comp_v1 + comp_v1->lcm_entries[i].lcme_offset); + lum_size = comp_v1->lcm_entries[i].lcme_size; } lmm = comp_md; - lmm_size = lum_size; + lmm_size = min(lum_size, lmmk_size); } else { lmm = lmmk; lmm_size = lmmk_size; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c index 869c0b8478760..34fbc66e47172 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_page.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,8 +56,8 @@ static int lov_comp_page_print(const struct lu_env *env, struct lov_page *lp = cl2lov_page(slice); return (*printer)(env, cookie, - LUSTRE_LOV_NAME"-page@%p, comp index: %x\n", - lp, lp->lps_index); + LUSTRE_LOV_NAME"-page@%p, comp index: %x, gen: %u\n", + lp, lp->lps_index, lp->lps_layout_gen); } static const struct cl_page_operations lov_comp_page_ops = { @@ -68,21 +68,22 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index) { struct lov_object *loo = cl2lov(obj); - struct lov_io *lio = lov_env_io(env); - struct cl_object *subobj; - struct cl_object *o; + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; struct lov_io_sub *sub; - struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_page *lpg = cl_object_page_slice(obj, page); struct lov_layout_raid0 *r0; - loff_t offset; - loff_t suboff; - int entry; - int stripe; - int rc; + loff_t offset; + loff_t suboff; + int entry; + int stripe; + int rc; + ENTRY; offset = cl_offset(obj, index); - entry = lov_lsm_entry(loo->lo_lsm, offset); + entry = lov_io_layout_at(lio, offset); if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { /* non-existing layout component */ lov_page_init_empty(env, obj, page, index); @@ -96,6 +97,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, LASSERT(rc == 0); lpg->lps_index = lov_comp_index(entry, stripe); + lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops); sub = lov_sub_get(env, lio, lpg->lps_index); @@ -105,7 +107,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, subobj = lovsub2cl(r0->lo_sub[stripe]); list_for_each_entry(o, &subobj->co_lu.lo_header->loh_layers, co_lu.lo_linkage) { - if (o->co_ops->coo_page_init != NULL) { + if (o->co_ops->coo_page_init) { rc = o->co_ops->coo_page_init(sub->sub_env, o, page, cl_index(subobj, suboff)); if (rc != 0) @@ -120,9 +122,9 @@ static int lov_empty_page_print(const struct lu_env *env, const struct cl_page_slice *slice, void *cookie, lu_printer_t printer) { - struct lov_page *lp = cl2lov_page(slice); + struct lov_page *lp = cl2lov_page(slice); - return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); } static const struct cl_page_operations lov_empty_page_ops = { @@ -134,8 +136,10 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, { struct lov_page *lpg = cl_object_page_slice(obj, page); void *addr; + ENTRY; + lpg->lps_index = ~0; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops); addr = kmap(page->cp_vmpage); memset(addr, 0, cl_page_size(obj)); @@ -144,6 +148,14 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, RETURN(0); } +bool lov_page_is_empty(const struct cl_page *page) +{ + const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type); + + LASSERT(slice != NULL); + return slice->cpl_ops == &lov_empty_page_ops; +} + /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c index 02b8899cb1b68..6173dbe1429ae 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -152,7 +152,6 @@ struct cfs_hash_ops pool_hash_operations = { }; #ifdef CONFIG_PROC_FS -/* ifdef needed for liblustre support */ /* * pool /proc seq_file methods */ @@ -182,14 +181,11 @@ static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) /* iterate to find a non empty entry */ prev_idx = iter->idx; - down_read(&pool_tgt_rw_sem(iter->pool)); iter->idx++; - if (iter->idx == pool_tgt_count(iter->pool)) { + if (iter->idx >= pool_tgt_count(iter->pool)) { iter->idx = prev_idx; /* we stay on the last entry */ - up_read(&pool_tgt_rw_sem(iter->pool)); return NULL; } - up_read(&pool_tgt_rw_sem(iter->pool)); (*pos)++; /* return != NULL to continue */ return iter; @@ -220,6 +216,7 @@ static void *pool_proc_start(struct seq_file *s, loff_t *pos) * we can free it at stop() */ /* /!\ do not forget to restore it to pool before freeing it */ s->private = iter; + down_read(&pool_tgt_rw_sem(pool)); if (*pos > 0) { loff_t i; void *ptr; @@ -241,6 +238,7 @@ static void pool_proc_stop(struct seq_file *s, void *v) * calling start() method (see seq_read() from fs/seq_file.c) * we have to free only if s->private is an iterator */ if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + up_read(&pool_tgt_rw_sem(iter->pool)); /* we restore s->private so next call to pool_proc_start() * will work */ s->private = iter->pool; @@ -259,9 +257,7 @@ static int pool_proc_show(struct seq_file *s, void *v) LASSERT(iter->pool != NULL); LASSERT(iter->idx <= pool_tgt_count(iter->pool)); - down_read(&pool_tgt_rw_sem(iter->pool)); tgt = pool_tgt(iter->pool, iter->idx); - up_read(&pool_tgt_rw_sem(iter->pool)); if (tgt) seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); @@ -287,7 +283,7 @@ static int pool_proc_open(struct inode *inode, struct file *file) return rc; } -static struct proc_ops pool_proc_operations = { +const static struct proc_ops pool_proc_operations = { .proc_open = pool_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -549,7 +545,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) /* search ost in lov array */ - obd_getref(obd); + lov_tgts_getref(obd); for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { if (!lov->lov_tgts[lov_idx]) continue; @@ -570,9 +566,10 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - obd_putref(obd); - lov_pool_putref(pool); - return rc; + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; } int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) @@ -592,7 +589,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) obd_str2uuid(&ost_uuid, ostname); - obd_getref(obd); + lov_tgts_getref(obd); /* search ost in lov array, to get index */ for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { if (!lov->lov_tgts[lov_idx]) @@ -614,7 +611,8 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - obd_putref(obd); - lov_pool_putref(pool); - return rc; + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c index fe74af4b7f82d..75e5c901fd91e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_request.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ 
-35,8 +35,6 @@ #include #include -#include - #include "lov_internal.h" static void lov_init_set(struct lov_request_set *set) @@ -51,6 +49,7 @@ static void lov_finish_set(struct lov_request_set *set) { struct list_head *pos, *n; struct lov_request *req; + ENTRY; LASSERT(set != NULL); @@ -58,7 +57,7 @@ static void lov_finish_set(struct lov_request_set *set) req = list_entry(pos, struct lov_request, rq_link); list_del_init(&req->rq_link); - if (req->rq_oi.oi_osfs != NULL) + if (req->rq_oi.oi_osfs) OBD_FREE_PTR(req->rq_oi.oi_osfs); OBD_FREE_PTR(req); @@ -80,18 +79,18 @@ static void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) { list_add_tail(&req->rq_link, &set->set_list); - set->set_count++; - req->rq_rqset = set; + set->set_count++; + req->rq_rqset = set; } static int lov_check_set(struct lov_obd *lov, int idx) { int rc = 0; + mutex_lock(&lov->lov_lock); - if (lov->lov_tgts[idx] == NULL || - lov->lov_tgts[idx]->ltd_active || - (lov->lov_tgts[idx]->ltd_exp != NULL && + if (!lov->lov_tgts[idx] || lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp && class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) rc = 1; @@ -99,7 +98,8 @@ static int lov_check_set(struct lov_obd *lov, int idx) return rc; } -/* Check if the OSC connection exists and is active. +/* + * Check if the OSC connection exists and is active. * If the OSC has not yet had a chance to connect to the OST the first time, * wait once for it to connect instead of returning an error. */ @@ -108,19 +108,24 @@ static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) wait_queue_head_t waitq; struct l_wait_info lwi; struct lov_tgt_desc *tgt; + struct obd_import *imp = NULL; int rc = 0; mutex_lock(&lov->lov_lock); tgt = lov->lov_tgts[ost_idx]; - if (unlikely(tgt == NULL)) + if (unlikely(!tgt)) GOTO(out, rc = 0); if (likely(tgt->ltd_active)) GOTO(out, rc = 1); - if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) + if (tgt->ltd_exp) + imp = class_exp2cliimp(tgt->ltd_exp); + if (imp && imp->imp_connect_tried) + GOTO(out, rc = 0); + if (imp && imp->imp_state == LUSTRE_IMP_IDLE) GOTO(out, rc = 0); mutex_unlock(&lov->lov_lock); @@ -142,20 +147,20 @@ static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) #define LOV_U64_MAX ((__u64)~0ULL) #define LOV_SUM_MAX(tot, add) \ - do { \ - if ((tot) + (add) < (tot)) \ - (tot) = LOV_U64_MAX; \ - else \ - (tot) += (add); \ - } while(0) + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while (0) static int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) { - ENTRY; + ENTRY; - if (success) { - __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, + if (success) { + __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, LOV_MAGIC, 0); if (osfs->os_files != LOV_U64_MAX) lov_do_div64(osfs->os_files, expected_stripes); @@ -164,7 +169,7 @@ lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) spin_lock(&obd->obd_osfs_lock); memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); - obd->obd_osfs_age = cfs_time_current_64(); + obd->obd_osfs_age = ktime_get_seconds(); spin_unlock(&obd->obd_osfs_lock); RETURN(0); } @@ -177,7 +182,7 @@ int lov_fini_statfs_set(struct lov_request_set *set) int rc = 0; ENTRY; - if (set == NULL) + if (!set) RETURN(0); if (atomic_read(&set->set_completes)) { @@ -194,84 +199,91 @@ static void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, int success) { - int shift = 0, quit 
= 0; - __u64 tmp; - - if (success == 0) { - memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); - } else { - if (osfs->os_bsize != lov_sfs->os_bsize) { - /* assume all block sizes are always powers of 2 */ - /* get the bits difference */ - tmp = osfs->os_bsize | lov_sfs->os_bsize; - for (shift = 0; shift <= 64; ++shift) { - if (tmp & 1) { - if (quit) - break; - else - quit = 1; - shift = 0; - } - tmp >>= 1; - } - } - - if (osfs->os_bsize < lov_sfs->os_bsize) { - osfs->os_bsize = lov_sfs->os_bsize; - - osfs->os_bfree >>= shift; - osfs->os_bavail >>= shift; - osfs->os_blocks >>= shift; - } else if (shift != 0) { - lov_sfs->os_bfree >>= shift; - lov_sfs->os_bavail >>= shift; - lov_sfs->os_blocks >>= shift; - } + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } #ifdef MIN_DF - /* Sandia requested that df (and so, statfs) only - returned minimal available space on - a single OST, so people would be able to - write this much data guaranteed. */ - if (osfs->os_bavail > lov_sfs->os_bavail) { - /* Presumably if new bavail is smaller, - new bfree is bigger as well */ - osfs->os_bfree = lov_sfs->os_bfree; - osfs->os_bavail = lov_sfs->os_bavail; - } + /* + * Sandia requested that df (and so, statfs) only + * returned minimal available space on + * a single OST, so people would be able to + * write this much data guaranteed. + */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* + * Presumably if new bavail is smaller, + * new bfree is bigger as well + */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } #else - osfs->os_bfree += lov_sfs->os_bfree; - osfs->os_bavail += lov_sfs->os_bavail; + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; #endif - osfs->os_blocks += lov_sfs->os_blocks; - /* XXX not sure about this one - depends on policy. - * - could be minimum if we always stripe on all OBDs - * (but that would be wrong for any other policy, - * if one of the OBDs has no more objects left) - * - could be sum if we stripe whole objects - * - could be average, just to give a nice number - * - * To give a "reasonable" (if not wholly accurate) - * number, we divide the total number of free objects - * by expected stripe count (watch out for overflow). - */ - LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); - LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); - } + osfs->os_blocks += lov_sfs->os_blocks; + /* + * XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). 
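The reindented lov_update_statfs() above keeps the original logic, which is easy to misread: because every os_bsize is a power of two, the bit-scanning loop computes |log2(a) - log2(b)| for the two block sizes, the side counted in the smaller block size is shifted right so both are expressed in the larger unit, and the per-OST file counts are then accumulated with the saturating LOV_SUM_MAX(). A minimal user-space sketch of those two steps (values and helper names here are illustrative only):

#include <stdio.h>
#include <stdint.h>

/* |log2(a) - log2(b)| for two power-of-two block sizes */
static unsigned int bsize_shift(uint64_t a, uint64_t b)
{
        unsigned int sa = __builtin_ctzll(a);
        unsigned int sb = __builtin_ctzll(b);

        return sa > sb ? sa - sb : sb - sa;
}

/* saturating add in the spirit of LOV_SUM_MAX() */
static uint64_t sum_max(uint64_t tot, uint64_t add)
{
        return tot + add < tot ? UINT64_MAX : tot + add;
}

int main(void)
{
        uint64_t client_bsize = 4096, ost_bsize = 65536;
        uint64_t client_blocks = 1 << 20;       /* counted in 4 KiB units */
        unsigned int shift = bsize_shift(client_bsize, ost_bsize);

        /* re-express the 4 KiB counters in the larger 64 KiB unit */
        printf("shift=%u, blocks in 64 KiB units=%llu\n", shift,
               (unsigned long long)(client_blocks >> shift));
        printf("saturated sum=%llu\n",
               (unsigned long long)sum_max(UINT64_MAX - 1, 10));
        return 0;
}
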
+ */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } } -/* The callback for osc_statfs_async that finilizes a request info when a - * response is received. */ +/* + * The callback for osc_statfs_async that finilizes a request info when a + * response is received. + */ static int cb_statfs_update(void *cookie, int rc) { - struct obd_info *oinfo = cookie; - struct lov_request *lovreq; - struct lov_request_set *set; - struct obd_statfs *osfs, *lov_sfs; - struct lov_obd *lov; - struct lov_tgt_desc *tgt; - struct obd_device *lovobd, *tgtobd; - int success; - ENTRY; + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + + ENTRY; lovreq = container_of(oinfo, struct lov_request, rq_oi); set = lovreq->rq_rqset; @@ -280,91 +292,101 @@ static int cb_statfs_update(void *cookie, int rc) osfs = set->set_oi->oi_osfs; lov_sfs = oinfo->oi_osfs; success = atomic_read(&set->set_success); - /* XXX: the same is done in lov_update_common_set, however - lovset->set_exp is not initialized. */ + /* + * XXX: the same is done in lov_update_common_set, however + * lovset->set_exp is not initialized. + */ lov_update_set(set, lovreq, rc); if (rc) GOTO(out, rc); - obd_getref(lovobd); - tgt = lov->lov_tgts[lovreq->rq_idx]; - if (!tgt || !tgt->ltd_active) - GOTO(out_update, rc); + lov_tgts_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); - tgtobd = class_exp2obd(tgt->ltd_exp); + tgtobd = class_exp2obd(tgt->ltd_exp); spin_lock(&tgtobd->obd_osfs_lock); memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) - tgtobd->obd_osfs_age = cfs_time_current_64(); + tgtobd->obd_osfs_age = ktime_get_seconds(); spin_unlock(&tgtobd->obd_osfs_lock); out_update: - lov_update_statfs(osfs, lov_sfs, success); - obd_putref(lovobd); - + lov_update_statfs(osfs, lov_sfs, success); + lov_tgts_putref(lovobd); out: RETURN(0); } int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, - struct lov_request_set **reqset) + struct lov_request_set **reqset) { - struct lov_request_set *set; - struct lov_obd *lov = &obd->u.lov; - int rc = 0, i; - ENTRY; + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + + ENTRY; - OBD_ALLOC(set, sizeof(*set)); - if (set == NULL) - RETURN(-ENOMEM); - lov_init_set(set); + OBD_ALLOC(set, sizeof(*set)); + if (!set) + RETURN(-ENOMEM); + lov_init_set(set); - set->set_obd = obd; - set->set_oi = oinfo; + set->set_obd = obd; + set->set_oi = oinfo; - /* We only get block data from the OBD */ - for (i = 0; i < lov->desc.ld_tgt_count; i++) { + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_tgt_desc *ltd = lov->lov_tgts[i]; struct lov_request *req; - if (lov->lov_tgts[i] == NULL || - (oinfo->oi_flags & OBD_STATFS_NODELAY && - !lov->lov_tgts[i]->ltd_active)) { + if (!ltd) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } - /* skip targets that have been explicitely disabled by the - * administrator */ - if (!lov->lov_tgts[i]->ltd_exp) { + /* + * skip targets that have been explicitely disabled by the + * administrator + */ + if (!ltd->ltd_exp) { CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); continue; } - if (!lov->lov_tgts[i]->ltd_active) + if (oinfo->oi_flags & 
OBD_STATFS_NODELAY && + class_exp2cliimp(ltd->ltd_exp)->imp_state != + LUSTRE_IMP_IDLE && !ltd->ltd_active) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + if (!ltd->ltd_active) lov_check_and_wait_active(lov, i); OBD_ALLOC(req, sizeof(*req)); - if (req == NULL) + if (!req) GOTO(out_set, rc = -ENOMEM); - OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); - if (req->rq_oi.oi_osfs == NULL) { - OBD_FREE(req, sizeof(*req)); - GOTO(out_set, rc = -ENOMEM); - } - - req->rq_idx = i; - req->rq_oi.oi_cb_up = cb_statfs_update; - req->rq_oi.oi_flags = oinfo->oi_flags; - - lov_set_add_req(req, set); - } - if (!set->set_count) - GOTO(out_set, rc = -EIO); - *reqset = set; - RETURN(rc); + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (!req->rq_oi.oi_osfs) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); out_set: - lov_fini_statfs_set(set); - RETURN(rc); + lov_fini_statfs_set(set); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c index 0ada9b5b9ce53..90a11e75393b9 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,33 +49,33 @@ */ static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) + const char *name, struct lu_device *next) { - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device_type *ldt; - int rc; - - ENTRY; - next->ld_site = d->ld_site; - ldt = next->ld_type; - LASSERT(ldt != NULL); - rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); - if (rc) { - next->ld_site = NULL; - RETURN(rc); - } - - lu_device_get(next); - lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); - lsd->acid_next = lu2cl_dev(next); - RETURN(rc); + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); } static struct lu_device *lovsub_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - struct lu_device *next; - struct lovsub_device *lsd; + struct lu_device *next; + struct lovsub_device *lsd; ENTRY; lsd = lu2lovsub_dev(d); @@ -87,8 +87,8 @@ static struct lu_device *lovsub_device_fini(const struct lu_env *env, static struct lu_device *lovsub_device_free(const struct lu_env *env, struct lu_device *d) { - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device *next = cl2lu_dev(lsd->acid_next); + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); if (atomic_read(&d->ld_ref) && d->ld_site) { LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); @@ 
-100,48 +100,48 @@ static struct lu_device *lovsub_device_free(const struct lu_env *env, } static const struct lu_device_operations lovsub_lu_ops = { - .ldo_object_alloc = lovsub_object_alloc, - .ldo_process_config = NULL, - .ldo_recovery_complete = NULL + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL }; static struct lu_device *lovsub_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) + struct lu_device_type *t, + struct lustre_cfg *cfg) { - struct lu_device *d; - struct lovsub_device *lsd; - - OBD_ALLOC_PTR(lsd); - if (lsd != NULL) { - int result; - - result = cl_device_init(&lsd->acid_cl, t); - if (result == 0) { - d = lovsub2lu_dev(lsd); - d->ld_ops = &lovsub_lu_ops; - } else - d = ERR_PTR(result); - } else - d = ERR_PTR(-ENOMEM); - return d; + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; } static const struct lu_device_type_operations lovsub_device_type_ops = { - .ldto_device_alloc = lovsub_device_alloc, - .ldto_device_free = lovsub_device_free, + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, - .ldto_device_init = lovsub_device_init, - .ldto_device_fini = lovsub_device_fini + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini }; #define LUSTRE_LOVSUB_NAME "lovsub" struct lu_device_type lovsub_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOVSUB_NAME, - .ldt_ops = &lovsub_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c deleted file mode 100644 index de8b5c72260d7..0000000000000 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2016, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for LOVSUB layer. 
- * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub lock operations. - * - */ - -static void lovsub_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct lovsub_lock *lsl; - - ENTRY; - lsl = cl2lovsub_lock(slice); - OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); - EXIT; -} - -static const struct cl_lock_operations lovsub_lock_ops = { - .clo_fini = lovsub_lock_fini, -}; - -int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io) -{ - struct lovsub_lock *lsk; - int result; - - ENTRY; - OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS); - if (lsk != NULL) { - cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); - result = 0; - } else - result = -ENOMEM; - RETURN(result); -} - -/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c index 1471de7915162..d219356cb3ad3 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,37 +49,39 @@ */ int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) + const struct lu_object_conf *conf) { - struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); - struct lu_object *below; - struct lu_device *under; - - int result; - - ENTRY; - under = &dev->acid_next->cd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); - if (below != NULL) { - lu_object_add(obj, below); - cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); - result = 0; - } else - result = -ENOMEM; - RETURN(result); + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), 0); + result = 0; + } else + result = -ENOMEM; + RETURN(result); } static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) { - struct lovsub_object *los = lu2lovsub(obj); - struct lov_object *lov = los->lso_super; - ENTRY; - - /* We can't assume lov was assigned here, because of the shadow - * object handling in lu_object_find. - */ - if (lov != NULL) { + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + ENTRY; + + /* + * We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. 
+ */ + if (lov) { int index = lov_comp_entry(los->lso_index); int stripe = lov_comp_stripe(los->lso_index); struct lov_layout_raid0 *r0 = lov_r0(lov, index); @@ -91,18 +93,18 @@ static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) spin_unlock(&r0->lo_sub_lock); } - lu_object_fini(obj); - lu_object_header_fini(&los->lso_header.coh_lu); - OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); - EXIT; + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; } static int lovsub_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) + lu_printer_t p, const struct lu_object *obj) { - struct lovsub_object *los = lu2lovsub(obj); + struct lovsub_object *los = lu2lovsub(obj); - return (*p)(env, cookie, "[%d]", los->lso_index); + return (*p)(env, cookie, "[%d]", los->lso_index); } static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, @@ -117,13 +119,13 @@ static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, } static int lovsub_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, - struct ost_lvb *lvb) + const struct cl_object *obj, + struct ost_lvb *lvb) { - struct lovsub_object *los = cl2lovsub(obj); + struct lovsub_object *los = cl2lovsub(obj); - ENTRY; - RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); } /** @@ -136,6 +138,7 @@ static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, { struct lovsub_object *subobj = cl2lovsub(obj); struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; + ENTRY; cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); @@ -151,20 +154,18 @@ static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, } static const struct cl_object_operations lovsub_ops = { - .coo_page_init = lovsub_page_init, - .coo_lock_init = lovsub_lock_init, .coo_attr_update = lovsub_attr_update, .coo_glimpse = lovsub_object_glimpse, .coo_req_attr_set = lovsub_req_attr_set }; static const struct lu_object_operations lovsub_lu_obj_ops = { - .loo_object_init = lovsub_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = lovsub_object_free, - .loo_object_print = lovsub_object_print, - .loo_object_invariant = NULL + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL }; struct lu_object *lovsub_object_alloc(const struct lu_env *env, @@ -176,7 +177,7 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env, ENTRY; OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); - if (los != NULL) { + if (los) { struct cl_object_header *hdr; obj = lovsub2lu(los); diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c deleted file mode 100644 index c10a3dfa38c1e..0000000000000 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2013, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for LOVSUB layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub page operations. - * - */ - -static void lovsub_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ -} - -static const struct cl_page_operations lovsub_page_ops = { - .cpo_fini = lovsub_page_fini -}; - -int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct lovsub_page *lsb = cl_object_page_slice(obj, page); - ENTRY; - - cl_page_slice_add(page, &lsb->lsb_cl, obj, index, &lovsub_page_ops); - RETURN(0); -} - -/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c index 41215c11998ef..f6eeebed9e2b0 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,10 +35,9 @@ #include #include #include -#include +#include #include "lov_internal.h" -#ifdef CONFIG_PROC_FS static int lov_stripesize_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = (struct obd_device *)m->private; @@ -57,12 +56,12 @@ static ssize_t lov_stripesize_seq_write(struct file *file, { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct lov_desc *desc; - __s64 val; + s64 val; int rc; LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -75,150 +74,135 @@ static ssize_t lov_stripesize_seq_write(struct file *file, } LPROC_SEQ_FOPS(lov_stripesize); -static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) +static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%lld\n", desc->ld_default_stripe_offset); - return 0; + return sprintf(buf, "%lld\n", desc->ld_default_stripe_offset); } -static ssize_t lov_stripeoffset_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - __s64 val; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + long val; int rc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtol(buf, 0, &val); if (rc) return rc; - if (val < -1) + if (val < -1 || val > LOV_MAX_STRIPE_COUNT) return -ERANGE; desc->ld_default_stripe_offset = val; return count; } -LPROC_SEQ_FOPS(lov_stripeoffset); +LUSTRE_RW_ATTR(stripeoffset); -static int lov_stripetype_seq_show(struct seq_file *m, void *v) +static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device* dev = (struct obd_device*)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_pattern); - return 0; + return sprintf(buf, "%u\n", desc->ld_pattern); } -static ssize_t lov_stripetype_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - int pattern, rc; - __s64 val; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + u32 pattern; + int rc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &pattern); if (rc) return rc; - if (val < INT_MIN || val > INT_MAX) - return -ERANGE; - pattern = val; lov_fix_desc_pattern(&pattern); 
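The converted handlers above all follow the same pattern: the kobject handed to a sysfs show or store callback is embedded in struct obd_device (obd_kset.kobj), so container_of() recovers the owning device, the new value is parsed with a kstrto*() helper, and the LOV descriptor is updated. A compilable stand-alone illustration of that recovery step; the fake_* types below are invented for the example and are not the Lustre definitions:

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_kobj { const char *name; };

struct fake_obd_device {
        unsigned int default_stripe_count;
        struct fake_kobj kobj;          /* embedded, like obd_kset.kobj */
};

/* what a *_store() callback does: recover the device, parse, update */
static int stripecount_store(struct fake_kobj *kobj, const char *buf)
{
        struct fake_obd_device *dev =
                container_of(kobj, struct fake_obd_device, kobj);

        dev->default_stripe_count = (unsigned int)strtoul(buf, NULL, 0);
        return 0;
}

int main(void)
{
        struct fake_obd_device dev = {
                .default_stripe_count = 1,
                .kobj = { .name = "lov" },
        };

        stripecount_store(&dev.kobj, "4");
        printf("default_stripe_count=%u\n", dev.default_stripe_count);
        return 0;
}
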
desc->ld_pattern = pattern; return count; } -LPROC_SEQ_FOPS(lov_stripetype); +LUSTRE_RW_ATTR(stripetype); -static int lov_stripecount_seq_show(struct seq_file *m, void *v) +static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%d\n", - (__s16)(desc->ld_default_stripe_count + 1) - 1); - return 0; + return sprintf(buf, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); } -static ssize_t lov_stripecount_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + int stripe_count; int rc; - __u32 stripe_count; - __s64 val; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoint(buffer, 0, &stripe_count); if (rc) return rc; - if (val < -1) + + if (stripe_count < -1) return -ERANGE; - stripe_count = val; lov_fix_desc_stripe_count(&stripe_count); desc->ld_default_stripe_count = stripe_count; return count; } -LPROC_SEQ_FOPS(lov_stripecount); +LUSTRE_RW_ATTR(stripecount); -static int lov_numobd_seq_show(struct seq_file *m, void *v) +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device*)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_tgt_count); } -LPROC_SEQ_FOPS_RO(lov_numobd); +LUSTRE_RO_ATTR(numobd); -static int lov_activeobd_seq_show(struct seq_file *m, void *v) +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device* dev = (struct obd_device*)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_active_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_active_tgt_count); } -LPROC_SEQ_FOPS_RO(lov_activeobd); +LUSTRE_RO_ATTR(activeobd); -static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct lov_obd *lov; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - lov = &dev->u.lov; - seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); - return 0; + return sprintf(buf, "%s\n", desc->ld_uuid.uuid); } -LPROC_SEQ_FOPS_RO(lov_desc_uuid); +LUSTRE_RO_ATTR(desc_uuid); +#ifdef CONFIG_PROC_FS static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; @@ -251,6 +235,7 @@ static void 
*lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) static int lov_tgt_seq_show(struct seq_file *p, void *v) { struct lov_tgt_desc *tgt = v; + seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid), tgt->ltd_active ? "" : "IN"); @@ -269,10 +254,6 @@ static int lov_target_seq_open(struct inode *inode, struct file *file) struct seq_file *seq; int rc; - rc = LPROCFS_ENTRY_CHECK(inode); - if (rc < 0) - return rc; - rc = seq_open(file, &lov_tgt_sops); if (rc) return rc; @@ -282,47 +263,13 @@ static int lov_target_seq_open(struct inode *inode, struct file *file) return 0; } -LPROC_SEQ_FOPS_RO_TYPE(lov, uuid); -LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal); -LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree); -LPROC_SEQ_FOPS_RO_TYPE(lov, blksize); -LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal); -LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree); -LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail); - struct lprocfs_vars lprocfs_lov_obd_vars[] = { - { .name = "uuid", - .fops = &lov_uuid_fops }, - { .name = "stripesize", - .fops = &lov_stripesize_fops }, - { .name = "stripeoffset", - .fops = &lov_stripeoffset_fops }, - { .name = "stripecount", - .fops = &lov_stripecount_fops }, - { .name = "stripetype", - .fops = &lov_stripetype_fops }, - { .name = "numobd", - .fops = &lov_numobd_fops }, - { .name = "activeobd", - .fops = &lov_activeobd_fops }, - { .name = "filestotal", - .fops = &lov_filestotal_fops }, - { .name = "filesfree", - .fops = &lov_filesfree_fops }, - { .name = "blocksize", - .fops = &lov_blksize_fops }, - { .name = "kbytestotal", - .fops = &lov_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &lov_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &lov_kbytesavail_fops }, - { .name = "desc_uuid", - .fops = &lov_desc_uuid_fops }, + { .name = "stripesize", + .fops = &lov_stripesize_fops }, { NULL } }; -const struct proc_ops lov_proc_target_fops = { +static const struct proc_ops lov_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lov_target_seq_open, .proc_read = seq_read, @@ -330,3 +277,68 @@ const struct proc_ops lov_proc_target_fops = { .proc_release = lprocfs_seq_release, }; #endif /* CONFIG_PROC_FS */ + +static struct attribute *lov_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_numobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_stripeoffset.attr, + &lustre_attr_stripetype.attr, + &lustre_attr_stripecount.attr, + NULL, +}; + +int lov_tunables_init(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + struct obd_type *type; +#endif + int rc; + + obd->obd_vars = lprocfs_lov_obd_vars; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + /* If this is true then both client (lov) and server + * (lod) are on the same node. The lod layer if loaded + * first will register the lov proc directory. In that + * case obd->obd_type->typ_procroot will be not set. + * Instead we use type->typ_procsym as the parent. 
+ */ + type = class_search_type(LUSTRE_LOD_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } +#endif + obd->obd_ktype.default_attrs = lov_attrs; + rc = lprocfs_obd_setup(obd, false); + if (rc) + GOTO(out, rc); + +#ifdef CONFIG_PROC_FS + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444, + &lov_proc_target_fops, obd); + if (rc) + CWARN("%s: Error adding the target_obd file : rc %d\n", + obd->obd_name, rc); + + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->lov_pool_proc_entry)) { + rc = PTR_ERR(lov->lov_pool_proc_entry); + CERROR("%s: error setting up debugfs for pools : rc %d\n", + obd->obd_name, rc); + lov->lov_pool_proc_entry = NULL; + } +#endif /* CONFIG_FS_PROC */ +out: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile index e13d6af6f9949..7c9329681bdf2 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/Makefile +++ b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_LUSTREFSX_FS) += mdc.o mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o -mdc-y += mdc_changelog.o +mdc-y += mdc_changelog.o mdc_dev.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c index 57cd679138950..0c2e79a2a336d 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
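lov_tunables_init() above deliberately treats missing proc entries as non-fatal: lprocfs_register() hands back an error pointer, which is tested with IS_ERR(), converted back to an errno with PTR_ERR(), logged, and then replaced by NULL so the rest of setup carries on without the optional entry. A user-space sketch of that error-pointer convention, re-implementing the three helpers rather than pulling in the kernel ones:

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

/* minimal stand-ins for ERR_PTR()/IS_ERR()/PTR_ERR() */
static inline void *ERR_PTR(long err) { return (void *)err; }
static inline int IS_ERR(const void *p)
{
        return (uintptr_t)p >= (uintptr_t)-4095;
}
static inline long PTR_ERR(const void *p) { return (long)p; }

/* pretend registration helper: fails with -ENOMEM when asked to */
static void *register_entry(const char *name, int fail)
{
        static int dummy;

        (void)name;     /* unused in the sketch */
        return fail ? ERR_PTR(-ENOMEM) : &dummy;
}

int main(void)
{
        void *entry = register_entry("pools", 1);

        if (IS_ERR(entry)) {
                long rc = PTR_ERR(entry);

                fprintf(stderr, "registration failed: rc = %ld\n", rc);
                entry = NULL;           /* optional entry: keep going */
        }
        return 0;
}
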
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,127 +31,280 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include #include #include #include - +#include +#include #include "mdc_internal.h" -#ifdef CONFIG_PROC_FS -static int mdc_active_seq_show(struct seq_file *m, void *v) +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; LPROCFS_CLIMP_CHECK(dev); - seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + len = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); LPROCFS_CLIMP_EXIT(dev); - return 0; + return len; } -static ssize_t mdc_active_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - dev = ((struct seq_file *)file->private_data)->private; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - if (val < 0 || val > 1) - return -ERANGE; /* opposite senses */ if (dev->u.cli.cl_import->imp_deactive == val) rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); else - CDEBUG(D_CONFIG, "activate %llu: ignoring repeat request\n", + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", val); return count; } -LPROC_SEQ_FOPS(mdc_active); +LUSTRE_RW_ATTR(active); -static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - __u32 max; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; + u32 max; max = obd_get_max_rpcs_in_flight(&dev->u.cli); - seq_printf(m, "%u\n", max); + len = sprintf(buf, "%u\n", max); - return 0; + return len; } -static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *dev; - __s64 val; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - dev = ((struct seq_file *)file->private_data)->private; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; - if (val < 0 || val > UINT_MAX) - return -ERANGE; - rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 max; + + max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); + return sprintf(buf, "%hu\n", max); +} + +static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 val; + int rc; + + rc = kstrtou16(buffer, 10, &val); if (rc) return rc; + rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + return 
count; } -LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight); +LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); -static int mdc_max_mod_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +static int mdc_max_dirty_mb_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = m->private; - __u16 max; + struct client_obd *cli = &dev->u.cli; + unsigned long val; - max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); - seq_printf(m, "%hu\n", max); + spin_lock(&cli->cl_loi_list_lock); + val = PAGES_TO_MiB(cli->cl_dirty_max_pages); + spin_unlock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", val); return 0; } -static ssize_t mdc_max_mod_rpcs_in_flight_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t mdc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = - ((struct seq_file *)file->private_data)->private; - __s64 val; + struct seq_file *sfl = file->private_data; + struct obd_device *dev = sfl->private; + struct client_obd *cli = &dev->u.cli; + s64 pages_number; int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); if (rc) return rc; - if (val < 0 || val > USHRT_MAX) + /* MB -> pages */ + pages_number = round_up(pages_number, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number <= 0 || + pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; - rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_dirty_mb); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + + return sprintf(buf, "%lld\n", od->od_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); if (rc) - count = rc; + return rc; + + od->od_contention_time = val; return count; } -LPROC_SEQ_FOPS(mdc_max_mod_rpcs_in_flight); +LUSTRE_RW_ATTR(contention_seconds); -static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +LUSTRE_ATTR(mds_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int mdc_cached_mb_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = seq->private; + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); - return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); + return 0; } +/* shrink the number of caching pages to a specific number */ +static ssize_t +mdc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *dev = sfl->private; + struct client_obd 
*cli = &dev->u.cli; + __s64 pages_number; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} +LPROC_SEQ_FOPS(mdc_cached_mb); + +static int mdc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(mdc_unstable_stats); + static ssize_t mdc_rpc_stats_seq_write(struct file *file, const char __user *buf, size_t len, loff_t *off) @@ -162,22 +315,174 @@ static ssize_t mdc_rpc_stats_seq_write(struct file *file, lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + return len; } + +static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "\nread RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + 
seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), pct(read_cum, read_tot), w, + pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} LPROC_SEQ_FOPS(mdc_rpc_stats); -LPROC_SEQ_FOPS_WO_TYPE(mdc, ping); +static int mdc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + ktime_get_real_ts64(&now); + + seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t mdc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} +LPROC_SEQ_FOPS(mdc_stats); + +static int mdc_dom_min_repsize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + seq_printf(m, "%u\n", dev->u.cli.cl_dom_min_inline_repsize); + + return 0; +} + +static ssize_t mdc_dom_min_repsize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + unsigned int val; + int rc; + + dev = ((struct seq_file *)file->private_data)->private; + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + if (val > MDC_DOM_MAX_INLINE_REPSIZE) + return -ERANGE; + + dev->u.cli.cl_dom_min_inline_repsize = val; + return count; +} +LPROC_SEQ_FOPS(mdc_dom_min_repsize); -LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid); LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize); -LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal); -LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree); -LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail); -LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal); -LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree); LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); LPROC_SEQ_FOPS_RO_TYPE(mdc, state); LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); @@ -185,35 +490,16 @@ LPROC_SEQ_FOPS_RW_TYPE(mdc, import); LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); struct lprocfs_vars lprocfs_mdc_obd_vars[] = { - { .name = "uuid", - .fops = &mdc_uuid_fops }, - { .name = "ping", - .fops = &mdc_ping_fops, - .proc_mode = 0222 }, { .name = "connect_flags", .fops = &mdc_connect_flags_fops 
}, - { .name = "blocksize", - .fops = &mdc_blksize_fops }, - { .name = "kbytestotal", - .fops = &mdc_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &mdc_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &mdc_kbytesavail_fops }, - { .name = "filestotal", - .fops = &mdc_filestotal_fops }, - { .name = "filesfree", - .fops = &mdc_filesfree_fops }, { .name = "mds_server_uuid", .fops = &mdc_server_uuid_fops }, - { .name = "mds_conn_uuid", - .fops = &mdc_conn_uuid_fops }, - { .name = "max_pages_per_rpc", - .fops = &mdc_obd_max_pages_per_rpc_fops }, - { .name = "max_rpcs_in_flight", - .fops = &mdc_max_rpcs_in_flight_fops }, - { .name = "max_mod_rpcs_in_flight", - .fops = &mdc_max_mod_rpcs_in_flight_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_dirty_mb", + .fops = &mdc_max_dirty_mb_fops }, + { .name = "mdc_cached_mb", + .fops = &mdc_cached_mb_fops }, { .name = "timeouts", .fops = &mdc_timeouts_fops }, { .name = "import", @@ -224,8 +510,53 @@ struct lprocfs_vars lprocfs_mdc_obd_vars[] = { .fops = &mdc_pinger_recov_fops }, { .name = "rpc_stats", .fops = &mdc_rpc_stats_fops }, - { .name = "active", - .fops = &mdc_active_fops }, + { .name = "unstable_stats", + .fops = &mdc_unstable_stats_fops }, + { .name = "mdc_stats", + .fops = &mdc_stats_fops }, + { .name = "mdc_dom_min_repsize", + .fops = &mdc_dom_min_repsize_fops }, { NULL } }; -#endif /* CONFIG_PROC_FS */ + +static struct attribute *mdc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_max_mod_rpcs_in_flight.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_mds_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + NULL, +}; + +int mdc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = mdc_attrs; + obd->obd_vars = lprocfs_mdc_obd_vars; + + rc = lprocfs_obd_setup(obd, false); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } +#endif + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) { +#ifdef CONFIG_PROC_FS + lprocfs_free_md_stats(obd); +#endif + lprocfs_obd_cleanup(obd); + goto out_failed; + } + ptlrpc_lprocfs_register_obd(obd); + +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c index c99a3bacf24d6..1c8eb65110500 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -23,6 +23,8 @@ * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies * Alternatives. * + * Copyright (c) 2017, Intel Corporation. 
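mdc_tunables_init() above has to set up several facilities in sequence (sysfs attributes, optional procfs MD stats, sptlrpc proc entries) and undo the earlier steps whenever a later one fails; the hunk open-codes that unwind at each failure site. The same idea in the kernel's more common cascading-label form, as a stand-alone sketch with placeholder stage names:

#include <stdio.h>

/* placeholder setup stages; each returns 0 on success */
static int setup_sysfs(void)    { return 0; }
static int alloc_stats(void)    { return 0; }
static int attach_sptlrpc(void) { return -1; }  /* force a failure */

static void free_stats(void)    { puts("free_stats"); }
static void cleanup_sysfs(void) { puts("cleanup_sysfs"); }

static int tunables_init(void)
{
        int rc;

        rc = setup_sysfs();
        if (rc)
                goto out;
        rc = alloc_stats();
        if (rc)
                goto out_sysfs;
        rc = attach_sptlrpc();
        if (rc)
                goto out_stats;
        return 0;

out_stats:
        free_stats();
out_sysfs:
        cleanup_sysfs();
out:
        return rc;
}

int main(void)
{
        printf("rc = %d\n", tunables_init());
        return 0;
}
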
+ * * Author: Henri Doreau */ @@ -31,9 +33,11 @@ #include #include #include -#include +#include +#include #include +#include #include "mdc_internal.h" @@ -55,38 +59,44 @@ static LIST_HEAD(chlg_registered_devices); struct chlg_registered_dev { /* Device name of the form "changelog-{MDTNAME}" */ - char ced_name[32]; - /* Misc device descriptor */ - struct miscdevice ced_misc; + char ced_name[32]; + /* changelog char device */ + struct cdev ced_cdev; + struct device *ced_device; /* OBDs referencing this device (multiple mount point) */ - struct list_head ced_obds; + struct list_head ced_obds; /* Reference counter for proper deregistration */ - struct kref ced_refs; + struct kref ced_refs; /* Link within the global chlg_registered_devices */ - struct list_head ced_link; + struct list_head ced_link; }; struct chlg_reader_state { /* Shortcut to the corresponding OBD device */ - struct obd_device *crs_obd; + struct obd_device *crs_obd; + /* the corresponding chlg_registered_dev */ + struct chlg_registered_dev *crs_ced; /* Producer thread (if any) */ - struct task_struct *crs_prod_task; + struct task_struct *crs_prod_task; /* An error occurred that prevents from reading further */ - bool crs_err; + int crs_err; /* EOF, no more records available */ - bool crs_eof; + bool crs_eof; /* Desired start position */ - __u64 crs_start_offset; + __u64 crs_start_offset; /* Wait queue for the catalog processing thread */ - wait_queue_head_t crs_waitq_prod; + wait_queue_head_t crs_waitq_prod; /* Wait queue for the record copy threads */ - wait_queue_head_t crs_waitq_cons; + wait_queue_head_t crs_waitq_cons; /* Mutex protecting crs_rec_count and crs_rec_queue */ - struct mutex crs_lock; + struct mutex crs_lock; /* Number of item in the list */ - __u64 crs_rec_count; + __u64 crs_rec_count; /* List of prefetched enqueued_record::enq_linkage_items */ - struct list_head crs_rec_queue; + struct list_head crs_rec_queue; + unsigned int crs_last_catidx; + unsigned int crs_last_idx; + bool crs_poll; }; struct chlg_rec_entry { @@ -103,6 +113,81 @@ enum { CDEV_CHLG_MAX_PREFETCH = 1024, }; +static DEFINE_IDR(chlg_minor_idr); +static DEFINE_SPINLOCK(chlg_minor_lock); + +static int chlg_minor_alloc(int *pminor) +{ + void *minor_allocated = (void *)-1; + int minor; + + idr_preload(GFP_KERNEL); + spin_lock(&chlg_minor_lock); + minor = idr_alloc(&chlg_minor_idr, minor_allocated, 0, + MDC_CHANGELOG_DEV_COUNT, GFP_NOWAIT); + spin_unlock(&chlg_minor_lock); + idr_preload_end(); + + if (minor < 0) + return minor; + + *pminor = minor; + return 0; +} + +static void chlg_minor_free(int minor) +{ + spin_lock(&chlg_minor_lock); + idr_remove(&chlg_minor_idr, minor); + spin_unlock(&chlg_minor_lock); +} + +static void chlg_device_release(struct device *dev) +{ + struct chlg_registered_dev *entry = dev_get_drvdata(dev); + + chlg_minor_free(MINOR(entry->ced_cdev.dev)); + OBD_FREE_PTR(entry); +} + +/** + * Deregister a changelog character device whose refcount has reached zero. 
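The comment above is the key to the new lifetime rules: a chlg_registered_dev is reference counted with a kref, every open file takes a reference that is dropped on release, and chlg_dev_clear() tears the character device down only when the last reference goes away. A stand-alone sketch of that idiom using C11 atomics (the toy_* names are invented for the illustration):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct toy_kref { atomic_int refcount; };

static void toy_kref_init(struct toy_kref *k)
{
        atomic_init(&k->refcount, 1);
}

static void toy_kref_get(struct toy_kref *k)
{
        atomic_fetch_add(&k->refcount, 1);
}

static void toy_kref_put(struct toy_kref *k, void (*release)(struct toy_kref *))
{
        /* release runs exactly once, on the 1 -> 0 transition */
        if (atomic_fetch_sub(&k->refcount, 1) == 1)
                release(k);
}

struct toy_chlg_dev {
        struct toy_kref refs;           /* first member: cast below is valid */
        char name[32];
};

static void toy_dev_clear(struct toy_kref *kref)
{
        struct toy_chlg_dev *dev = (struct toy_chlg_dev *)kref;

        printf("last reference to %s dropped, freeing\n", dev->name);
        free(dev);
}

int main(void)
{
        struct toy_chlg_dev *dev = malloc(sizeof(*dev));

        if (!dev)
                return 1;
        toy_kref_init(&dev->refs);                /* registration holds one ref */
        snprintf(dev->name, sizeof(dev->name), "changelog-testfs-MDT0000");

        toy_kref_get(&dev->refs);                 /* a reader opens the device */
        toy_kref_put(&dev->refs, toy_dev_clear);  /* ...and releases it */
        toy_kref_put(&dev->refs, toy_dev_clear);  /* device is deregistered */
        return 0;
}
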
+ */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry; + + ENTRY; + entry = container_of(kref, struct chlg_registered_dev, + ced_refs); + + list_del(&entry->ced_link); + cdev_del(&entry->ced_cdev); + device_destroy(mdc_changelog_class, entry->ced_cdev.dev); + EXIT; +} + +static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev) +{ + struct obd_device *obd; + + mutex_lock(&chlg_registered_dev_lock); + if (list_empty(&dev->ced_obds)) + return NULL; + + obd = list_first_entry(&dev->ced_obds, struct obd_device, + u.cli.cl_chg_dev_linkage); + class_incref(obd, "changelog", dev); + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + +static inline void chlg_obd_put(struct chlg_registered_dev *dev, + struct obd_device *obd) +{ + class_decref(obd, "changelog", dev); +} + /** * ChangeLog catalog processing callback invoked on each record. * If the current record is eligible to userland delivery, push @@ -122,7 +207,6 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, struct llog_changelog_rec *rec; struct chlg_reader_state *crs = data; struct chlg_rec_entry *enq; - struct l_wait_info lwi = { 0 }; size_t len; int rc; ENTRY; @@ -132,6 +216,9 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); + crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx; + crs->crs_last_idx = hdr->lrh_index; + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { rc = -EINVAL; CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n", @@ -152,9 +239,9 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); - l_wait_event(crs->crs_waitq_prod, - (crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || - kthread_should_stop()), &lwi); + wait_event_interruptible(crs->crs_waitq_prod, + crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()); if (kthread_should_stop()) RETURN(LLOG_PROC_BREAK); @@ -197,13 +284,23 @@ static void enq_record_delete(struct chlg_rec_entry *rec) static int chlg_load(void *args) { struct chlg_reader_state *crs = args; - struct obd_device *obd = crs->crs_obd; + struct chlg_registered_dev *ced = crs->crs_ced; + struct obd_device *obd = NULL; struct llog_ctxt *ctx = NULL; struct llog_handle *llh = NULL; - struct l_wait_info lwi = { 0 }; int rc; ENTRY; + crs->crs_last_catidx = -1; + crs->crs_last_idx = 0; + +again: + obd = chlg_obd_get(ced); + if (obd == NULL) + RETURN(-ENODEV); + + crs->crs_obd = obd; + ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); if (ctx == NULL) GOTO(err_out, rc = -ENOENT); @@ -216,24 +313,41 @@ static int chlg_load(void *args) GOTO(err_out, rc); } - rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT|LLOG_F_EXT_JOBID, NULL); + + rc = llog_init_handle(NULL, llh, + LLOG_F_IS_CAT | + LLOG_F_EXT_JOBID | + LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | + LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | + LLOG_F_EXT_X_XATTR, + NULL); if (rc) { CERROR("%s: fail to init llog handle: rc = %d\n", obd->obd_name, rc); GOTO(err_out, rc); } - rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, 0, 0); + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, + crs->crs_last_catidx, crs->crs_last_idx); if (rc < 0) { CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); GOTO(err_out, rc); } + if (!kthread_should_stop() && crs->crs_poll) { + llog_cat_close(NULL, llh); + llog_ctxt_put(ctx); + class_decref(obd, 
"changelog", crs); + schedule_timeout_interruptible(HZ); + goto again; + } crs->crs_eof = true; err_out: if (rc < 0) - crs->crs_err = true; + crs->crs_err = rc; wake_up_all(&crs->crs_waitq_cons); @@ -243,7 +357,9 @@ static int chlg_load(void *args) if (ctx != NULL) llog_ctxt_put(ctx); - l_wait_event(crs->crs_waitq_prod, kthread_should_stop(), &lwi); + crs->crs_obd = NULL; + chlg_obd_put(ced, obd); + wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop()); RETURN(rc); } @@ -266,17 +382,22 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, struct chlg_reader_state *crs = file->private_data; struct chlg_rec_entry *rec; struct chlg_rec_entry *tmp; - struct l_wait_info lwi = { 0 }; - ssize_t written_total = 0; + size_t written_total = 0; + ssize_t rc; LIST_HEAD(consumed); ENTRY; - if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) - RETURN(-EAGAIN); + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) { + if (crs->crs_err < 0) + RETURN(crs->crs_err); + else if (crs->crs_eof) + RETURN(0); + else + RETURN(-EAGAIN); + } - l_wait_event(crs->crs_waitq_cons, - crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err, - &lwi); + rc = wait_event_interruptible(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err); mutex_lock(&crs->crs_lock); list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { @@ -284,8 +405,7 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, break; if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { - if (written_total == 0) - written_total = -EFAULT; + rc = -EFAULT; break; } @@ -299,15 +419,19 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, } mutex_unlock(&crs->crs_lock); - if (written_total > 0) + if (written_total > 0) { + rc = written_total; wake_up_all(&crs->crs_waitq_prod); + } else if (rc == 0) { + rc = crs->crs_err; + } list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) enq_record_delete(rec); *ppos = crs->crs_start_offset; - RETURN(written_total); + RETURN(rc); } /** @@ -392,15 +516,23 @@ static loff_t chlg_llseek(struct file *file, loff_t off, int whence) */ static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) { - struct obd_device *obd = crs->crs_obd; + struct obd_device *obd = NULL; struct changelog_setinfo cs = { .cs_recno = record, .cs_id = reader }; + int rc; + + obd = chlg_obd_get(crs->crs_ced); + if (obd == NULL) + return -ENODEV; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); - return obd_set_info_async(NULL, obd->obd_self_export, - strlen(KEY_CHANGELOG_CLEAR), - KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); + chlg_obd_put(crs->crs_ced, obd); + return rc; } /** Maximum changelog control command size */ @@ -449,31 +581,6 @@ static ssize_t chlg_write(struct file *file, const char __user *buff, return rc < 0 ? rc : count; } -/** - * Find the OBD device associated to a changelog character device. - * @param[in] cdev character device instance descriptor - * @return corresponding OBD device or NULL if none was found. 
- */ -static struct obd_device *chlg_obd_get(dev_t cdev) -{ - int minor = MINOR(cdev); - struct obd_device *obd = NULL; - struct chlg_registered_dev *curr; - - mutex_lock(&chlg_registered_dev_lock); - list_for_each_entry(curr, &chlg_registered_devices, ced_link) { - if (curr->ced_misc.minor == minor) { - /* take the first available OBD device attached */ - obd = list_first_entry(&curr->ced_obds, - struct obd_device, - u.cli.cl_chg_dev_linkage); - break; - } - } - mutex_unlock(&chlg_registered_dev_lock); - return obd; -} - /** * Open handler, initialize internal CRS state and spawn prefetch thread if * needed. @@ -484,19 +591,19 @@ static struct obd_device *chlg_obd_get(dev_t cdev) static int chlg_open(struct inode *inode, struct file *file) { struct chlg_reader_state *crs; - struct obd_device *obd = chlg_obd_get(inode->i_rdev); + struct chlg_registered_dev *dev; struct task_struct *task; int rc; ENTRY; - if (!obd) - RETURN(-ENODEV); + dev = container_of(inode->i_cdev, struct chlg_registered_dev, ced_cdev); OBD_ALLOC_PTR(crs); if (!crs) RETURN(-ENOMEM); - crs->crs_obd = obd; + kref_get(&dev->ced_refs); + crs->crs_ced = dev; crs->crs_err = false; crs->crs_eof = false; @@ -510,7 +617,7 @@ static int chlg_open(struct inode *inode, struct file *file) if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: cannot start changelog thread: rc = %d\n", - obd->obd_name, rc); + dev->ced_name, rc); GOTO(err_crs, rc); } crs->crs_prod_task = task; @@ -520,6 +627,7 @@ static int chlg_open(struct inode *inode, struct file *file) RETURN(0); err_crs: + kref_put(&dev->ced_refs, chlg_dev_clear); OBD_FREE_PTR(crs); return rc; } @@ -536,15 +644,18 @@ static int chlg_release(struct inode *inode, struct file *file) struct chlg_reader_state *crs = file->private_data; struct chlg_rec_entry *rec; struct chlg_rec_entry *tmp; + int rc = 0; if (crs->crs_prod_task) - kthread_stop(crs->crs_prod_task); + rc = kthread_stop(crs->crs_prod_task); list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) enq_record_delete(rec); + kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear); OBD_FREE_PTR(crs); - return 0; + + return rc; } /** @@ -572,6 +683,23 @@ static unsigned int chlg_poll(struct file *file, poll_table *wait) return mask; } +static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc; + + struct chlg_reader_state *crs = file->private_data; + switch (cmd) { + case OBD_IOC_CHLG_POLL: + crs->crs_poll = !!arg; + rc = 0; + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + static const struct file_operations chlg_fops = { .owner = THIS_MODULE, .llseek = chlg_llseek, @@ -580,17 +708,18 @@ static const struct file_operations chlg_fops = { .open = chlg_open, .release = chlg_release, .poll = chlg_poll, + .unlocked_ioctl = chlg_ioctl, }; /** * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" * and returns a name of the form: "changelog-testfs-MDT0000". 
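With the chlg_fops above, a changelog consumer now talks to a per-target character device rather than a misc device: the registration path further down names it "%s-%s" from MDC_CHANGELOG_DEV_NAME ("changelog") and the target name, which matches the "changelog-testfs-MDT0000" form in the comment. A hedged userspace sketch of a reader that switches the device into polling mode and tails it follows; the /dev path assumes udev creates the node from that name, the include location of OBD_IOC_CHLG_POLL is assumed, records are treated as opaque bytes, and error handling is trimmed.

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/lustre/lustre_ioctl.h>	/* assumed location of OBD_IOC_CHLG_POLL */

	int main(void)
	{
		char buf[1 << 16];
		struct pollfd pfd;
		ssize_t nr;
		int fd;

		fd = open("/dev/changelog-testfs-MDT0000", O_RDONLY);
		if (fd < 0)
			return 1;

		/* Ask the producer thread to keep rescanning the catalog instead of
		 * stopping at EOF (crs_poll above); arg 0 restores one-shot reads. */
		ioctl(fd, OBD_IOC_CHLG_POLL, 1);

		pfd.fd = fd;
		pfd.events = POLLIN;
		for (;;) {
			if (poll(&pfd, 1, -1) < 0)
				break;
			nr = read(fd, buf, sizeof(buf));
			if (nr < 0)	/* chlg_read() propagated an error */
				break;
			if (nr == 0)	/* EOF and not in polling mode */
				break;
			printf("read %zd bytes of changelog records\n", nr);
		}
		close(fd);
		return 0;
	}

With O_NONBLOCK, the rewritten chlg_read() above returns the pending error, 0 at EOF, or -EAGAIN while the prefetch queue is empty, so a non-blocking variant would simply poll() before each read() as shown.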
*/ -static void get_chlg_name(char *name, size_t name_len, struct obd_device *obd) +static void get_target_name(char *name, size_t name_len, struct obd_device *obd) { int i; - snprintf(name, name_len, "changelog-%s", obd->obd_name); + snprintf(name, name_len, "%s", obd->obd_name); /* Find the 2nd '-' from the end and truncate on it */ for (i = 0; i < 2; i++) { @@ -652,18 +781,16 @@ int mdc_changelog_cdev_init(struct obd_device *obd) { struct chlg_registered_dev *exist; struct chlg_registered_dev *entry; - int rc; + struct device *device; + dev_t dev; + int minor, rc; ENTRY; OBD_ALLOC_PTR(entry); if (entry == NULL) RETURN(-ENOMEM); - get_chlg_name(entry->ced_name, sizeof(entry->ced_name), obd); - - entry->ced_misc.minor = MISC_DYNAMIC_MINOR; - entry->ced_misc.name = entry->ced_name; - entry->ced_misc.fops = &chlg_fops; + get_target_name(entry->ced_name, sizeof(entry->ced_name), obd); kref_init(&entry->ced_refs); INIT_LIST_HEAD(&entry->ced_obds); @@ -677,15 +804,41 @@ int mdc_changelog_cdev_init(struct obd_device *obd) GOTO(out_unlock, rc = 0); } + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); + /* Register new character device */ - rc = misc_register(&entry->ced_misc); - if (rc != 0) + cdev_init(&entry->ced_cdev, &chlg_fops); + entry->ced_cdev.owner = THIS_MODULE; + + rc = chlg_minor_alloc(&minor); + if (rc) GOTO(out_unlock, rc); - list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); - list_add_tail(&entry->ced_link, &chlg_registered_devices); + dev = MKDEV(MAJOR(mdc_changelog_dev), minor); + rc = cdev_add(&entry->ced_cdev, dev, 1); + if (rc) + GOTO(out_minor, rc); + + device = device_create(mdc_changelog_class, NULL, dev, entry, "%s-%s", + MDC_CHANGELOG_DEV_NAME, entry->ced_name); + if (IS_ERR(device)) + GOTO(out_cdev, rc = PTR_ERR(device)); + + device->release = chlg_device_release; + entry->ced_device = device; entry = NULL; /* prevent it from being freed below */ + GOTO(out_unlock, rc = 0); + +out_cdev: + cdev_del(&entry->ced_cdev); + +out_minor: + chlg_minor_free(minor); + + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + list_del(&entry->ced_link); out_unlock: mutex_unlock(&chlg_registered_dev_lock); @@ -694,23 +847,6 @@ int mdc_changelog_cdev_init(struct obd_device *obd) RETURN(rc); } -/** - * Deregister a changelog character device whose refcount has reached zero. - */ -static void chlg_dev_clear(struct kref *kref) -{ - struct chlg_registered_dev *entry = container_of(kref, - struct chlg_registered_dev, - ced_refs); - ENTRY; - - LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); - list_del(&entry->ced_link); - misc_deregister(&entry->ced_misc); - OBD_FREE_PTR(entry); - EXIT; -} - /** * Release OBD, decrease reference count of the corresponding changelog device. */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c new file mode 100644 index 0000000000000..3606778434879 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c @@ -0,0 +1,1564 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, cl_req for MDC layer. + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include "mdc_internal.h" + +static void mdc_lock_build_policy(const struct lu_env *env, + union ldlm_policy_data *policy) +{ + memset(policy, 0, sizeof *policy); + policy->l_inodebits.bits = MDS_INODELOCK_DOM; +} + +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + return osc_ldlm_glimpse_ast(dlmlock, data); +} + +static void mdc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = mdc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ +} + +static void mdc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +static int mdc_set_dom_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int mdc_dom_lock_match(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + + ENTRY; + + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, mode, lockh, unref); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (mdc_set_dom_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + mdc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +/** + * Finds an existing lock covering a page with given index. + * Copy of osc_obj_dlmlock_at_pgoff() but for DoM IBITS lock. 
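mdc_lock_build_policy() and mdc_dom_lock_match() above capture the key difference from the OSC code they mirror: a Data-on-MDT lock is an inodebits (LDLM_IBITS) lock carrying MDS_INODELOCK_DOM and implicitly covering all MDT-resident data, while OSC locks are extent locks over an explicit byte range. Schematically (a comparison sketch, not code from the patch):

	/* DoM: the policy is a single inode bit; the lock spans the whole object. */
	union ldlm_policy_data dom_policy = {
		.l_inodebits = { .bits = MDS_INODELOCK_DOM },
	};

	/* OSC/OST: the policy is a byte-range extent. */
	union ldlm_policy_data ost_policy = {
		.l_extent = { .start = 0, .end = OBD_OBJECT_EOF },
	};

	/* Matching then differs only in lock type and policy:
	 *   ldlm_lock_match(ns, flags, res_id, LDLM_IBITS,  &dom_policy, mode, &lockh, 0);
	 *   ldlm_lock_match(ns, flags, res_id, LDLM_EXTENT, &ost_policy, mode, &lockh, 0);
	 */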
+ */ +struct ldlm_lock *mdc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + + ENTRY; + + fid_build_reg_res_name(lu_object_fid(osc2lu(obj)), resname); + mdc_lock_build_policy(env, policy); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + +again: + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + mode = mdc_dom_lock_match(env, osc_export(obj), resname, LDLM_IBITS, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, + dap_flags & OSC_DAP_FL_CANCELING); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int mdc_check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK); + if (tmp != NULL) { + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +static int mdc_lock_discard_pages(const struct lu_env *env, + struct osc_object *osc, + pgoff_t start, pgoff_t end, + bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int res; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? 
osc_discard_cb : mdc_check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + do { + res = osc_page_gang_lookup(env, io, osc, info->oti_next_index, + end, cb, (void *)osc); + if (info->oti_next_index > end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} + +static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, enum cl_lock_mode mode, + bool discard) +{ + int result = 0; + int rc; + + ENTRY; + + if (mode == CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, result, + discard ? "discarded" : "written back"); + if (result > 0) + result = 0; + } + + rc = mdc_lock_discard_pages(env, obj, start, end, discard); + if (result == 0 && rc < 0) + result = rc; + + RETURN(result); +} + +void mdc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int rc; + + LASSERT(ols->ols_dlmlock == NULL); + rc = mdc_lock_flush(env, osc, descr->cld_start, descr->cld_end, + descr->cld_mode, 0); + if (rc != 0) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, rc); + + osc_lock_wake_waiters(env, osc, ols); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int mdc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + dlmlock->l_ast_data = NULL; + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + /* Destroy pages covered by the extent of the DLM lock */ + result = mdc_lock_flush(env, cl2osc(obj), cl_index(obj, 0), + CL_PAGE_EOF, mode, discard); + /* Losing a lock, set KMS to 0. + * NB: assumed that DOM lock covers whole data on MDT. + */ + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + cl_object_attr_lock(obj); + attr->cat_kms = 0; + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + cl_object_put(env, obj); + } + RETURN(result); +} + +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag) +{ + int rc = 0; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc == -ENODATA) + rc = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... 
+ * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + break; + } + + rc = mdc_dlm_blocking_ast0(env, dlmlock, flag); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(rc); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +void mdc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, + struct ldlm_lock *dlmlock, struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | + CAT_SIZE; + unsigned int setkms = 0; + + ENTRY; + + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = &dlmlock->l_ost_lvb; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + size = lvb->lvb_size; + + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + EXIT; +} + +static void mdc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. 
*/ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, 0); + descr->cld_end = CL_PAGE_EOF; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + mdc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; + EXIT; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. + */ +static int mdc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + CDEBUG(D_INODE, "rc %d, err %d\n", rc, errcode); + if (rc == 0) + mdc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops != oscl->ols_lockless_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, oscl, 1); + oscl->ols_state = OLS_GRANTED; + rc = 0; + } else if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + mdc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!body) + RETURN(-EPROTO); + + lvb->lvb_mtime = body->mbo_mtime; + lvb->lvb_atime = body->mbo_atime; + lvb->lvb_ctime = body->mbo_ctime; + lvb->lvb_blocks = body->mbo_dom_blocks; + lvb->lvb_size = body->mbo_dom_size; + + RETURN(0); +} + +int mdc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, int errcode) +{ + struct osc_lock *ols = cookie; + struct ldlm_lock *lock; + int rc = 0; + + ENTRY; + + /* The request was created before ldlm_cli_enqueue call. */ + if (errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(rep != NULL); + + rep->lock_policy_res2 = + ptlrpc_status_ntoh(rep->lock_policy_res2); + if (rep->lock_policy_res2) + errcode = rep->lock_policy_res2; + + rc = mdc_fill_lvb(req, &ols->ols_lvb); + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + /* Callers have references, should be valid always */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock); + + rc = mdc_fill_lvb(req, &lock->l_ost_lvb); + LDLM_LOCK_PUT(lock); + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. 
*/ + rc = (*upcall)(cookie, lockh, rc < 0 ? rc : errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +int mdc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + + ENTRY; + + LASSERT(!aa->oa_speculative); + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, + aa->oa_mode, aa->oa_flags, NULL, 0, + lockh, rc); + /* Complete mdc stuff. */ + rc = mdc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, + aa->oa_flags, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int mdc_enqueue_send(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 *flags, + union ldlm_policy_data *policy, + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, int async) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + struct ldlm_intent *lit; + enum ldlm_mode mode; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + int rc; + + ENTRY; + + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + + if (glimpse) + match_flags |= LDLM_FL_BLOCK_GRANTED; + /* DOM locking uses LDLM_FL_KMS_IGNORE to mark locks wich have no valid + * LVB information, e.g. canceled locks or locks of just pruned object, + * such locks should be skipped. + */ + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh, 0); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GLIMPSE_DDOS)) + ldlm_set_kms_ignore(matched); + + if (mdc_set_dom_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. 
*/ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_INTENT); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = glimpse ? IT_GLIMPSE : IT_BRW; + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* users of mdc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + /* All MDC IO locks are intents */ + *flags |= LDLM_FL_HAS_INTENT; + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, NULL, + 0, LVB_T_NONE, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = false; + aa->oa_flags = flags; + aa->oa_lvb = lvb; + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)mdc_enqueue_interpret; + ptlrpcd_add_req(req); + } else { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = mdc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, rc); + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. 
+ */ +static int mdc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = mdc_lock_upcall; + void *cookie = (void *)oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + /* Lockahead is not supported on MDT yet */ + if (oscl->ols_flags & LDLM_FL_NO_EXPANSION) { + result = -EOPNOTSUPP; + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. + */ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + mdc_lock_build_policy(env, policy); + LASSERT(!oscl->ols_speculative); + result = mdc_enqueue_send(env, osc_export(osc), resname, + &oscl->ols_flags, policy, + &oscl->ols_lvb, osc->oo_oinfo->loi_kms_valid, + upcall, cookie, &oscl->ols_einfo, async); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +static const struct cl_lock_operations mdc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = mdc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +static const struct cl_lock_operations mdc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +int mdc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct osc_lock *ols; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + __u64 flags = osc_enq2ldlm_flags(enqflags); + + ENTRY; + + /* Ignore AGL for Data-on-MDT, stat returns size data */ + if ((enqflags & CEF_SPECULATIVE) != 0) + RETURN(0); + + OBD_SLAB_ALLOC_PTR_GFP(ols, osc_lock_kmem, GFP_NOFS); + if (unlikely(ols == NULL)) + RETURN(-ENOMEM); + + ols->ols_state = OLS_NEW; + spin_lock_init(&ols->ols_lock); + INIT_LIST_HEAD(&ols->ols_waiting_list); + INIT_LIST_HEAD(&ols->ols_wait_entry); + INIT_LIST_HEAD(&ols->ols_nextlock_oscobj); + ols->ols_lockless_ops = &mdc_lock_lockless_ops; + + ols->ols_flags 
= flags; + ols->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + + if (ols->ols_flags & LDLM_FL_HAS_INTENT) { + ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; + ols->ols_glimpse = 1; + } + mdc_lock_build_einfo(env, lock, cl2osc(obj), &ols->ols_einfo); + + cl_lock_slice_add(lock, &ols->ols_cl, obj, &mdc_lock_ops); + + if (!(enqflags & CEF_MUST)) + osc_lock_to_lockless(env, ols, (enqflags & CEF_NEVER)); + if (ols->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + ols->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, ols); + + LDLM_DEBUG_NOLOCK("lock %p, mdc lock %p, flags %llx\n", + lock, ols, ols->ols_flags); + RETURN(0); +} + +/** + * IO operations. + * + * An implementation of cl_io_operations specific methods for MDC layer. + * + */ +static int mdc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, + pgoff_t index, struct lustre_handle *lh) +{ + struct ldlm_lock *lock; + + /* find DOM lock protecting object */ + lock = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_CANCELING); + if (lock == NULL) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + resname = &osc_env_info(env)->oti_resname; + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, + NULL, resname, LDLM_IBITS, 0); + ldlm_resource_dump(D_ERROR, res); + libcfs_debug_dumpstack(NULL); + return -ENOENT; + } else { + *lh = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + return 0; +} + +static int mdc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int rc; + + /* silently ignore non-truncate setattr for Data-on-MDT object */ + if (cl_io_is_trunc(io)) { + /* truncate cache dirty pages first */ + rc = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + if (rc < 0) + return rc; + } + + if (oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + if (rc == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + rc = cl_object_attr_update(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + if (rc < 0) + return rc; + } + + if (!(ia_avalid & ATTR_SIZE)) + return 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + + oa->o_size = size; + 
oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS; + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } else { + rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, + &oa->o_handle); + if (!rc) + oa->o_valid |= OBD_MD_FLHANDLE; + } + + init_completion(&cbargs->opc_sync); + + rc = osc_punch_send(osc_export(cl2osc(obj)), oa, + mdc_async_upcall, cbargs); + cbargs->opc_rpc_sent = rc == 0; + return rc; +} + +static int mdc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = mdc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock == NULL) + RETURN(-ENODATA); + + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end = CL_PAGE_EOF; + ra->cra_release = osc_read_ahead_release; + ra->cra_cbdata = dlmlock; + + RETURN(0); +} + +int mdc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + int result = 0; + + ENTRY; + + /* a MDC lock always covers whole object, do sync for whole + * possible range despite of supplied start/end values. + */ + result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF); + if (result == 0) + result = rc; + /* Use OSC sync code because it is asynchronous. + * It is to be added into MDC and avoid the using of + * OST_SYNC at both MDC and MDT. 
+ */ + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +struct mdc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +mdc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *arg, int rc) +{ + struct mdc_data_version_args *dva = arg; + struct osc_io *oio = dva->dva_oio; + const struct mdt_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* Prepare OBDO from mdt_body for CLIO */ + oio->oi_oa.o_valid = body->mbo_valid; + oio->oi_oa.o_flags = body->mbo_flags; + oio->oi_oa.o_data_version = body->mbo_version; + oio->oi_oa.o_layout_version = body->mbo_layout_gen; + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + return 0; +} + +static int mdc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct mdt_body *body; + struct mdc_data_version_args *dva; + int rc; + + ENTRY; + + memset(&oio->oi_oa, 0, sizeof(oio->oi_oa)); + oio->oi_oa.o_oi.oi_fid = *lu_object_fid(osc2lu(obj)); + oio->oi_oa.o_valid = OBD_MD_FLID; + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + body->mbo_fid1 = *lu_object_fid(osc2lu(obj)); + body->mbo_valid = OBD_MD_FLID; + /* Indicate that data version is needed */ + body->mbo_valid |= OBD_MD_FLDATAVERSION; + body->mbo_flags = 0; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + body->mbo_valid |= OBD_MD_FLFLAGS; + body->mbo_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + body->mbo_flags |= OBD_FL_FLUSH; + } + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = mdc_data_version_interpret; + CLASSERT(sizeof(*dva) <= sizeof(req->rq_async_args)); + dva = ptlrpc_req_async_args(req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void mdc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; + } + + EXIT; +} + +static struct cl_io_operations mdc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = 
osc_io_iter_fini, + .cio_start = osc_io_read_start, + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_write_iter_init, + .cio_iter_fini = osc_io_write_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = mdc_io_setattr_start, + .cio_end = osc_io_setattr_end, + }, + [CIT_DATA_VERSION] = { + .cio_start = mdc_io_data_version_start, + .cio_end = mdc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + }, + [CIT_FSYNC] = { + .cio_start = mdc_io_fsync_start, + .cio_end = osc_io_fsync_end, + }, + }, + .cio_read_ahead = mdc_io_read_ahead, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, +}; + +int mdc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &mdc_io_ops); + return 0; +} + +static void mdc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for MDC + * layer. MDC is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + u64 flags = attr->cra_flags; + + /* Copy object FID to cl_attr */ + attr->cra_oa->o_oi.oi_fid = *lu_object_fid(&obj->co_lu); + + if (flags & OBD_MD_FLGROUP) + attr->cra_oa->o_valid |= OBD_MD_FLGROUP; + + if (flags & OBD_MD_FLID) + attr->cra_oa->o_valid |= OBD_MD_FLID; + + if (flags & OBD_MD_FLHANDLE) { + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + if (!opg->ops_srvlock) { + int rc; + + rc = mdc_get_lock_handle(env, cl2osc(obj), + osc_index(opg), + &attr->cra_oa->o_handle); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + LBUG(); + } else { + attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; + } + } + } +} + +static int mdc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + if (OST_LVB_IS_ERR(oinfo->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(oinfo->loi_lvb.lvb_blocks); + + return osc_attr_get(env, obj, attr); +} + +static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = &lock->l_ost_lvb; + struct lov_oinfo *oinfo; + ENTRY; + + if (lock->l_ast_data == data) { + lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } + 
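	/* Returning LDLM_ITER_CONTINUE keeps ldlm_resource_iterate() walking, so
	 * every lock still pointing at this osc_object is detached (and left with
	 * the last cached attributes) before mdc_object_prune() lets the object go. */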
RETURN(LDLM_ITER_CONTINUE); +} + +int mdc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. */ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + mdc_object_ast_clear, osc); + return 0; +} + +static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + /* if lock cancel is initiated from llite then it is combined + * lock with DOM bit and it may have no l_ast_data initialized yet, + * so init it here with given osc_object. + */ + mdc_set_dom_lock_data(lock, cl2osc(obj)); + RETURN(mdc_dlm_blocking_ast0(env, lock, LDLM_CB_CANCELING)); +} + +static const struct cl_object_operations mdc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = mdc_lock_init, + .coo_io_init = mdc_io_init, + .coo_attr_get = mdc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_req_attr_set = mdc_req_attr_set, + .coo_prune = mdc_object_prune, + .coo_object_flush = mdc_object_flush +}; + +static const struct osc_object_operations mdc_object_ops = { + .oto_build_res_name = mdc_build_res_name, + .oto_dlmlock_at_pgoff = mdc_dlmlock_at_pgoff, +}; + +static int mdc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + + if (osc->oo_initialized) + return 0; + + osc->oo_initialized = true; + + return osc_object_init(env, obj, conf); +} + +static void mdc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + osc_object_free(env, obj); +} + +static const struct lu_object_operations mdc_lu_obj_ops = { + .loo_object_init = mdc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = mdc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *mdc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &mdc_ops; + obj->lo_ops = &mdc_lu_obj_ops; + osc->oo_obj_ops = &mdc_object_ops; + osc->oo_initialized = false; + } else { + obj = NULL; + } + return obj; +} + +static int mdc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + return mdc_process_config(d->ld_obd, 0, cfg); +} + +const struct lu_device_operations mdc_lu_ops = { + .ldo_object_alloc = mdc_object_alloc, + .ldo_process_config = mdc_cl_process_config, + .ldo_recovery_complete = NULL, +}; + +static struct lu_device *mdc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &mdc_lu_ops; + + /* Setup MDC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + if (obd == NULL) + RETURN(ERR_PTR(-ENODEV)); + + rc = mdc_setup(obd, cfg); + if (rc < 0) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; 
+ RETURN(d); +} + +static const struct lu_device_type_operations mdc_device_type_ops = { + .ldto_device_alloc = mdc_device_alloc, + .ldto_device_free = osc_device_free, + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type mdc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_MDC_NAME, + .ldt_ops = &mdc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h index 98773524caee9..c0df4152bf80f 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,9 +35,7 @@ #include -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_mdc_obd_vars[]; -#endif +int mdc_tunables_init(struct obd_device *obd); void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, u64 valid, size_t ea_size, u32 suppgid, u32 flags); @@ -58,6 +56,7 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, const void *secctx, size_t secctx_size); +void mdc_file_sepol_pack(struct ptlrpc_request *req); void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); @@ -65,6 +64,8 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, size_t oldlen, const char *new, size_t newlen); +void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *name, size_t namelen); void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); /* mdc/mdc_locks.c */ @@ -95,6 +96,8 @@ int mdc_save_lovea(struct ptlrpc_request *req, /* mdc/mdc_request.c */ int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg); +int mdc_process_config(struct obd_device *obd, size_t len, void *buf); struct obd_client_handle; @@ -127,6 +130,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, void *ea, size_t ealen, struct ptlrpc_request **request); int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *data); int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, union ldlm_policy_data *policy, enum ldlm_mode mode, enum ldlm_cancel_flags flags, void *opaque); @@ -143,6 +147,11 @@ enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, enum ldlm_mode mode, struct lustre_handle *lockh); +#define MDC_CHANGELOG_DEV_COUNT LMV_MAX_STRIPE_COUNT +#define MDC_CHANGELOG_DEV_NAME "changelog" +extern struct class *mdc_changelog_class; +extern dev_t mdc_changelog_dev; + int mdc_changelog_cdev_init(struct obd_device *obd); void mdc_changelog_cdev_finish(struct obd_device *obd); @@ -163,4 +172,15 @@ static inline unsigned long hash_x_index(__u64 hash, int hash64) return ~0UL - 
(hash + !hash); } +/* mdc_dev.c */ +extern struct lu_device_type mdc_device_type; +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag); +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb); + +/* the minimum inline repsize should be PAGE_SIZE at least */ +#define MDC_DOM_DEF_INLINE_REPSIZE max(8192UL, PAGE_SIZE) +#define MDC_DOM_MAX_INLINE_REPSIZE XATTR_SIZE_MAX + #endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c index c93ec985f6581..dcc42508aca98 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,6 @@ # include #endif #include -#include #include #include #include @@ -148,6 +147,22 @@ void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, memcpy(buf, secctx, buf_size); } +void mdc_file_sepol_pack(struct ptlrpc_request *req) +{ + void *buf; + size_t buf_size; + + if (strlen(req->rq_sepol) == 0) + return; + + buf = req_capsule_client_get(&req->rq_pill, &RMF_SELINUX_POL); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_SELINUX_POL, + RCL_CLIENT); + + LASSERT(buf_size == strlen(req->rq_sepol) + 1); + snprintf(buf, strlen(req->rq_sepol) + 1, "%s", req->rq_sepol); +} + void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, const struct lu_fid *fid) { @@ -166,9 +181,9 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const void *data, size_t datalen, umode_t mode, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev) { - struct mdt_rec_create *rec; - char *tmp; - __u64 flags; + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); @@ -201,13 +216,19 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, mdc_file_secctx_pack(req, op_data->op_file_secctx_name, op_data->op_file_secctx, op_data->op_file_secctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } static inline __u64 mds_pack_open_flags(__u64 flags) { - __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | - MDS_OPEN_FL_INTERNAL)); + __u64 cr_flags = (flags & MDS_OPEN_FL_INTERNAL); + if (flags & FMODE_READ) + cr_flags |= MDS_FMODE_READ; + if (flags & FMODE_WRITE) + cr_flags |= MDS_FMODE_WRITE; if (flags & O_CREAT) cr_flags |= MDS_OPEN_CREAT; if (flags & O_EXCL) @@ -261,7 +282,7 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->cr_suppgid1 = op_data->op_suppgids[0]; rec->cr_suppgid2 = op_data->op_suppgids[1]; rec->cr_bias = op_data->op_bias; - rec->cr_old_handle = op_data->op_handle; + rec->cr_open_handle_old = op_data->op_open_handle; if (op_data->op_name) { mdc_pack_name(req, &RMF_NAME, op_data->op_name, @@ -274,6 +295,9 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, mdc_file_secctx_pack(req, op_data->op_file_secctx_name, op_data->op_file_secctx, op_data->op_file_secctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } 
if (lmm) { @@ -284,8 +308,9 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, set_mrc_cr_flags(rec, cr_flags); } -static inline __u64 attr_pack(unsigned int ia_valid) { - __u64 sa_valid = 0; +static inline u64 attr_pack(unsigned int ia_valid, enum op_xvalid ia_xvalid) +{ + u64 sa_valid = 0; if (ia_valid & ATTR_MODE) sa_valid |= MDS_ATTR_MODE; @@ -307,23 +332,27 @@ static inline __u64 attr_pack(unsigned int ia_valid) { sa_valid |= MDS_ATTR_MTIME_SET; if (ia_valid & ATTR_FORCE) sa_valid |= MDS_ATTR_FORCE; - if (ia_valid & ATTR_ATTR_FLAG) - sa_valid |= MDS_ATTR_ATTR_FLAG; - if (ia_valid & ATTR_KILL_SUID) - sa_valid |= MDS_ATTR_KILL_SUID; - if (ia_valid & ATTR_KILL_SGID) - sa_valid |= MDS_ATTR_KILL_SGID; - if (ia_valid & ATTR_CTIME_SET) - sa_valid |= MDS_ATTR_CTIME_SET; - if (ia_valid & ATTR_FROM_OPEN) - sa_valid |= MDS_ATTR_FROM_OPEN; - if (ia_valid & ATTR_BLOCKS) - sa_valid |= MDS_ATTR_BLOCKS; - if (ia_valid & MDS_OPEN_OWNEROVERRIDE) - /* NFSD hack (see bug 5781) */ - sa_valid |= MDS_OPEN_OWNEROVERRIDE; - if (ia_valid & MDS_ATTR_PROJID) + if (ia_xvalid & OP_XVALID_FLAGS) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_xvalid & OP_XVALID_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_xvalid & OP_XVALID_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_xvalid & OP_XVALID_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_xvalid & OP_XVALID_PROJID) sa_valid |= MDS_ATTR_PROJID; + if (ia_xvalid & OP_XVALID_LAZYSIZE) + sa_valid |= MDS_ATTR_LSIZE; + if (ia_xvalid & OP_XVALID_LAZYBLOCKS) + sa_valid |= MDS_ATTR_LBLOCKS; return sa_valid; } @@ -337,7 +366,8 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, rec->sa_suppgid = -1; rec->sa_fid = op_data->op_fid1; - rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid, + op_data->op_xvalid); rec->sa_mode = op_data->op_attr.ia_mode; rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); @@ -361,7 +391,7 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, struct md_op_data *op_data) { - epoch->mio_handle = op_data->op_handle; + epoch->mio_open_handle = op_data->op_open_handle; epoch->mio_unused1 = 0; epoch->mio_unused2 = 0; epoch->mio_padding = 0; @@ -414,6 +444,9 @@ void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->ul_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) @@ -436,17 +469,19 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->lk_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } -static void mdc_intent_close_pack(struct ptlrpc_request *req, +static void mdc_close_intent_pack(struct ptlrpc_request *req, struct md_op_data *op_data) { struct close_data *data; struct ldlm_lock *lock; enum mds_op_bias bias = op_data->op_bias; - if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | - MDS_RENAME_MIGRATE))) + if 
(!(bias & (MDS_CLOSE_INTENT | MDS_CLOSE_MIGRATE))) return; data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); @@ -461,44 +496,90 @@ static void mdc_intent_close_pack(struct ptlrpc_request *req, data->cd_data_version = op_data->op_data_version; data->cd_fid = op_data->op_fid2; + + if (bias & MDS_CLOSE_LAYOUT_SPLIT) { + data->cd_mirror_id = op_data->op_mirror_id; + } else if (bias & MDS_CLOSE_RESYNC_DONE) { + struct close_data_resync_done *sync = &data->cd_resync; + + CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved)); + sync->resync_count = op_data->op_data_size / sizeof(__u32); + if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + memcpy(sync->resync_ids_inline, op_data->op_data, + op_data->op_data_size); + } else { + size_t count = sync->resync_count; + + memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32), + op_data->op_data, count * sizeof(__u32)); + } + } } void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, size_t oldlen, const char *new, size_t newlen) { - struct mdt_rec_rename *rec; + struct mdt_rec_rename *rec; - CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - /* XXX do something about time, uid, gid */ - rec->rn_opcode = op_data->op_cli_flags & CLI_MIGRATE ? - REINT_MIGRATE : REINT_RENAME; - rec->rn_fsuid = op_data->op_fsuid; - rec->rn_fsgid = op_data->op_fsgid; - rec->rn_cap = op_data->op_cap; - rec->rn_suppgid1 = op_data->op_suppgids[0]; - rec->rn_suppgid2 = op_data->op_suppgids[1]; - rec->rn_fid1 = op_data->op_fid1; - rec->rn_fid2 = op_data->op_fid2; - rec->rn_time = op_data->op_mod_time; - rec->rn_mode = op_data->op_mode; - rec->rn_bias = op_data->op_bias; + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, old, oldlen); if (new != NULL) mdc_pack_name(req, &RMF_SYMTGT, new, newlen); - if (op_data->op_cli_flags & CLI_MIGRATE && - op_data->op_bias & MDS_RENAME_MIGRATE) { + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); +} + +void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *name, size_t namelen) +{ + struct mdt_rec_rename *rec; + char *ea; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + rec->rn_opcode = REINT_MIGRATE; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid4; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, name, namelen); + + if (op_data->op_bias & MDS_CLOSE_MIGRATE) { struct mdt_ioepoch *epoch; - mdc_intent_close_pack(req, op_data); + mdc_close_intent_pack(req, op_data); epoch = 
req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); mdc_ioepoch_pack(epoch, op_data); } + + ea = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(ea, op_data->op_data, op_data->op_data_size); } void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, @@ -508,8 +589,6 @@ void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, &RMF_MDT_BODY); b->mbo_valid = valid; - if (op_data->op_bias & MDS_CHECK_SPLIT) - b->mbo_valid |= OBD_MD_FLCKSPLIT; if (op_data->op_bias & MDS_CROSS_REF) b->mbo_valid |= OBD_MD_FLCROSSREF; b->mbo_eadatasize = ea_size; @@ -547,5 +626,5 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->sa_valid &= ~MDS_ATTR_ATIME; mdc_ioepoch_pack(epoch, op_data); - mdc_intent_close_pack(req, op_data); + mdc_close_intent_pack(req, op_data); } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c index cb809c2ce4b89..1c1e54b87590f 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +43,7 @@ #include #include #include +#include #include "mdc_internal.h" @@ -244,7 +245,7 @@ int mdc_save_lovea(struct ptlrpc_request *req, static struct ptlrpc_request * mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, - struct md_op_data *op_data) + struct md_op_data *op_data, __u32 acl_bufsize) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); @@ -255,6 +256,8 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, int count = 0; enum ldlm_mode mode; int rc; + int repsize, repsize_estimate; + ENTRY; it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; @@ -263,12 +266,12 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, /* If inode is known, cancel conflicting OPEN locks. */ if (fid_is_sane(&op_data->op_fid2)) { if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ - if (it->it_flags & FMODE_WRITE) + if (it->it_flags & MDS_FMODE_WRITE) mode = LCK_EX; else mode = LCK_PR; } else { - if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + if (it->it_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC)) mode = LCK_CW; #ifdef FMODE_EXEC else if (it->it_flags & FMODE_EXEC) @@ -300,16 +303,32 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + if (cl_is_lov_delay_create(it->it_flags)) { + /* open(O_LOV_DELAY_CREATE) won't pack lmm */ + LASSERT(lmmsize == 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + } req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, RCL_CLIENT, op_data->op_file_secctx_name != NULL ? 
- strlen(op_data->op_file_secctx_name) + 1 : 0); + op_data->op_file_secctx_name_size : 0); req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, op_data->op_file_secctx_size); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); if (rc < 0) { ptlrpc_request_free(req); @@ -330,10 +349,71 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, obddev->u.cli.cl_max_mds_easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); - return req; + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (!(it->it_op & IT_CREAT) && it->it_op & IT_OPEN && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + /** + * Inline buffer for possible data from Data-on-MDT files. + */ + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, RCL_SERVER, + sizeof(struct niobuf_remote)); + ptlrpc_request_set_replen(req); + + /* Get real repbuf allocated size as rounded up power of 2 */ + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size()); + /* Estimate free space for DoM files in repbuf */ + repsize_estimate = repsize - (req->rq_replen - + obddev->u.cli.cl_max_mds_easize + + sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(0, LOV_MAGIC_V3)); + + if (repsize_estimate < obddev->u.cli.cl_dom_min_inline_repsize) { + repsize = obddev->u.cli.cl_dom_min_inline_repsize - + repsize_estimate + sizeof(struct niobuf_remote); + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER, + sizeof(struct niobuf_remote) + repsize); + ptlrpc_request_set_replen(req); + CDEBUG(D_INFO, "Increase repbuf by %d bytes, total: %d\n", + repsize, req->rq_replen); + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size()); + } + /* The only way to report real allocated repbuf size to the server + * is the lm_repsize but it must be set prior buffer allocation itself + * due to security reasons - it is part of buffer used in signature + * calculation (see LU-11414). Therefore the saved size is predicted + * value as rq_replen rounded to the next higher power of 2. + * Such estimation is safe. Though the final allocated buffer might + * be even larger, it is not possible to know that at this point. 
+ */ + req->rq_reqmsg->lm_repsize = repsize; + return req; } #define GA_DEFAULT_EA_NAME_LEN 20 @@ -349,7 +429,7 @@ mdc_intent_getxattr_pack(struct obd_export *exp, struct ldlm_intent *lit; int rc, count = 0; struct list_head cancels = LIST_HEAD_INIT(cancels); - u32 min_buf_size = 0; + u32 ea_vals_buf_size = GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM; ENTRY; @@ -358,6 +438,16 @@ mdc_intent_getxattr_pack(struct obd_export *exp, if (req == NULL) RETURN(ERR_PTR(-ENOMEM)); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -367,6 +457,8 @@ mdc_intent_getxattr_pack(struct obd_export *exp, /* pack the intent */ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); lit->opc = IT_GETXATTR; + CDEBUG(D_INFO, "%s: get xattrs for "DFID"\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1)); #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) /* If the supplied buffer is too small then the server will @@ -378,26 +470,25 @@ mdc_intent_getxattr_pack(struct obd_export *exp, * of LU-9417 when it would be *more* likely to crash the * server. See LU-9856. */ if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) - min_buf_size = exp->exp_connect_data.ocd_max_easize; + ea_vals_buf_size = max_t(u32, ea_vals_buf_size, + exp->exp_connect_data.ocd_max_easize); #endif /* pack the intended request */ mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - max_t(u32, min_buf_size, - GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM), - -1, 0); + ea_vals_buf_size, -1, 0); + + /* get SELinux policy info if any */ + mdc_file_sepol_pack(req); req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, - max_t(u32, min_buf_size, - GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM)); + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, - max_t(u32, min_buf_size, - GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM)); + ea_vals_buf_size); req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, - max_t(u32, min_buf_size, - sizeof(__u32) * GA_DEFAULT_EA_NUM)); + sizeof(u32) * GA_DEFAULT_EA_NUM); req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); @@ -406,46 +497,9 @@ mdc_intent_getxattr_pack(struct obd_export *exp, RETURN(req); } -static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *op_data) -{ - struct ptlrpc_request *req; - struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_intent *lit; - int rc; - ENTRY; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_UNLINK); - if (req == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; - - /* pack the intended request */ - mdc_unlink_pack(req, op_data); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obddev->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - RETURN(req); -} - -static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export 
*exp, - struct lookup_intent *it, - struct md_op_data *op_data) +static struct ptlrpc_request * +mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); @@ -455,25 +509,38 @@ static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, struct ldlm_intent *lit; int rc; __u32 easize; + bool have_secctx = false; ENTRY; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_GETATTR); - if (req == NULL) - RETURN(ERR_PTR(-ENOMEM)); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); + /* send name of security xattr to get upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR) && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + have_secctx = true; + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, + op_data->op_file_secctx_name_size); + } - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; if (obddev->u.cli.cl_default_mds_easize > 0) easize = obddev->u.cli.cl_default_mds_easize; @@ -484,8 +551,27 @@ static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (have_secctx) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + ptlrpc_request_set_replen(req); RETURN(req); } @@ -562,8 +648,10 @@ static int mdc_finish_enqueue(struct obd_export *exp, struct ldlm_request *lockreq; struct ldlm_reply *lockrep; struct ldlm_lock *lock; + struct mdt_body *body = NULL; void *lvb_data = NULL; __u32 lvb_len = 0; + ENTRY; LASSERT(rc >= 0); @@ -622,8 +710,6 @@ static int mdc_finish_enqueue(struct obd_export *exp, /* We know what to expect, so we do any byte flipping required here */ if (it_has_reply_body(it)) { - struct mdt_body *body; - body = req_capsule_server_get(pill, &RMF_MDT_BODY); if (body == NULL) { CERROR ("Can't swab mdt_body\n"); @@ -641,6 +727,12 @@ static int mdc_finish_enqueue(struct obd_export *exp, mdc_set_open_replay_data(NULL, NULL, it); } + if (it_disposition(it, 
DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + } + if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { void *eadata; @@ -708,7 +800,10 @@ static int mdc_finish_enqueue(struct obd_export *exp, * client still does this checking in case it's talking with an old * server. - Jinshan */ lock = ldlm_handle2lock(lockh); - if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL && + if (lock == NULL) + RETURN(rc); + + if (ldlm_has_layout(lock) && lvb_data != NULL && !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { void *lmm; @@ -716,10 +811,9 @@ static int mdc_finish_enqueue(struct obd_export *exp, ldlm_it2str(it->it_op), lvb_len); OBD_ALLOC_LARGE(lmm, lvb_len); - if (lmm == NULL) { - LDLM_LOCK_PUT(lock); - RETURN(-ENOMEM); - } + if (lmm == NULL) + GOTO(out_lock, rc = -ENOMEM); + memcpy(lmm, lvb_data, lvb_len); /* install lvb_data */ @@ -734,8 +828,24 @@ static int mdc_finish_enqueue(struct obd_export *exp, if (lmm != NULL) OBD_FREE_LARGE(lmm, lvb_len); } - if (lock != NULL) - LDLM_LOCK_PUT(lock); + + if (ldlm_has_dom(lock)) { + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) { + LDLM_ERROR(lock, "%s: DoM lock without size.", + exp->exp_obd->obd_name); + GOTO(out_lock, rc = -EPROTO); + } + + LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu", + ldlm_it2str(it->it_op), body->mbo_dom_size); + + rc = mdc_fill_lvb(req, &lock->l_ost_lvb); + } +out_lock: + LDLM_LOCK_PUT(lock); RETURN(rc); } @@ -764,6 +874,8 @@ static int mdc_enqueue_base(struct obd_export *exp, .l_inodebits = { MDS_INODELOCK_XATTR } }; int generation, resends = 0; struct ldlm_reply *lockrep; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize; enum lvb_type lvb_type = 0; int rc; ENTRY; @@ -776,34 +888,37 @@ static int mdc_enqueue_base(struct obd_export *exp, LASSERT(policy == NULL); saved_flags |= LDLM_FL_HAS_INTENT; - if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + if (it->it_op & (IT_GETATTR | IT_READDIR)) policy = &update_policy; else if (it->it_op & IT_LAYOUT) policy = &layout_policy; - else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) + else if (it->it_op & IT_GETXATTR) policy = &getxattr_policy; else policy = &lookup_policy; } - generation = obddev->u.cli.cl_import->imp_generation; + generation = obddev->u.cli.cl_import->imp_generation; + if (!it || (it->it_op & (IT_OPEN | IT_CREAT))) + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + else + acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + resend: - flags = saved_flags; + flags = saved_flags; if (it == NULL) { /* The only way right now is FLOCK. 
*/ LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", einfo->ei_type); res_id.name[3] = LDLM_FLOCK; } else if (it->it_op & IT_OPEN) { - req = mdc_intent_open_pack(exp, it, op_data); - } else if (it->it_op & IT_UNLINK) { - req = mdc_intent_unlink_pack(exp, it, op_data); + req = mdc_intent_open_pack(exp, it, op_data, acl_bufsize); } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { - req = mdc_intent_getattr_pack(exp, it, op_data); + req = mdc_intent_getattr_pack(exp, it, op_data, acl_bufsize); } else if (it->it_op & IT_READDIR) { req = mdc_enqueue_pack(exp, 0); } else if (it->it_op & IT_LAYOUT) { - if (!imp_connect_lvb_type(class_exp2cliimp(exp))) + if (!imp_connect_lvb_type(imp)) RETURN(-EOPNOTSUPP); req = mdc_intent_layout_pack(exp, it, op_data); lvb_type = LVB_T_LAYOUT; @@ -832,18 +947,25 @@ static int mdc_enqueue_base(struct obd_export *exp, rc = obd_get_request_slot(&obddev->u.cli); if (rc != 0) { mdc_put_mod_rpc_slot(req, it); - mdc_clear_replay_flag(req, 0); - ptlrpc_req_finished(req); - RETURN(rc); - } - } + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing.*/ + if (einfo->ei_cb_gl == NULL) + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; - rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, 0, lvb_type, lockh, 0); - if (!it) { - /* For flock requests we immediatelly return without further - delay and let caller deal with the rest, since rest of - this function metadata processing makes no sense for flock + if (!it) { + /* For flock requests we immediatelly return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock requests anyway. 
But in case of problem during comms with Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we can not rely on caller and this mainly for F_UNLCKs @@ -898,6 +1020,15 @@ static int mdc_enqueue_base(struct obd_export *exp, } } + if ((int)lockrep->lock_policy_res2 == -ERANGE && + it->it_op & (IT_OPEN | IT_GETATTR | IT_LOOKUP) && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + mdc_clear_replay_flag(req, -ERANGE); + ptlrpc_req_finished(req); + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + goto resend; + } + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); if (rc < 0) { if (lustre_handle_is_used(lockh)) { @@ -1071,7 +1202,6 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, * but for old MDTs (< 2.4), permission is covered * by LOOKUP lock, so it needs to match all bits here.*/ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM; break; case IT_READDIR: @@ -1138,6 +1268,7 @@ int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, .ei_mode = it_to_lock_mode(it), .ei_cb_bl = cb_blocking, .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = mdc_ldlm_glimpse_ast, }; struct lustre_handle lockh; int rc = 0; @@ -1254,7 +1385,10 @@ int mdc_intent_getattr_async(struct obd_export *exp, PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); fid_build_reg_res_name(&op_data->op_fid1, &res_id); - req = mdc_intent_getattr_pack(exp, it, op_data); + /* If the MDT return -ERANGE because of large ACL, then the sponsor + * of the async getattr RPC will handle that by itself. */ + req = mdc_intent_getattr_pack(exp, it, op_data, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); if (IS_ERR(req)) RETURN(PTR_ERR(req)); @@ -1264,6 +1398,13 @@ int mdc_intent_getattr_async(struct obd_export *exp, RETURN(rc); } + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing.*/ + if (minfo->mi_einfo.ei_cb_gl == NULL) + minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast; + rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); if (rc < 0) { diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c index db2e665658746..096b20fd4847a 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -201,6 +201,16 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, op_data->op_file_secctx_size); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -275,9 +285,10 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, MDS_INODELOCK_UPDATE); if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && (fid_is_sane(&op_data->op_fid3))) + /* don't cancel DoM lock which may cause data flush */ count += mdc_resource_get_unused(exp, &op_data->op_fid3, &cancels, LCK_EX, - MDS_INODELOCK_FULL); + MDS_INODELOCK_ELC); req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_UNLINK); if (req == NULL) { @@ -288,6 +299,16 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -336,6 +357,16 @@ int mdc_link(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -358,31 +389,32 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { struct list_head cancels = LIST_HEAD_INIT(cancels); - struct obd_device *obd = exp->exp_obd; - struct ptlrpc_request *req; - int count = 0, rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && - (fid_is_sane(&op_data->op_fid2))) - count += mdc_resource_get_unused(exp, &op_data->op_fid2, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && - (fid_is_sane(&op_data->op_fid3))) - count += mdc_resource_get_unused(exp, &op_data->op_fid3, - &cancels, LCK_EX, - MDS_INODELOCK_LOOKUP); - if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && - (fid_is_sane(&op_data->op_fid4))) - count += mdc_resource_get_unused(exp, &op_data->op_fid4, - &cancels, LCK_EX, - MDS_INODELOCK_FULL); + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_ELC); req 
= ptlrpc_request_alloc(class_exp2cliimp(exp), op_data->op_cli_flags & CLI_MIGRATE ? @@ -392,8 +424,21 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(-ENOMEM); } - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + if (op_data->op_cli_flags & CLI_MIGRATE) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + op_data->op_data_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { @@ -401,34 +446,76 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } - if (op_data->op_cli_flags & CLI_MIGRATE && op_data->op_data != NULL) { - struct md_open_data *mod = op_data->op_data; + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) + mdc_migrate_pack(req, op_data, old, oldlen); + else + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} - LASSERTF(mod->mod_open_req != NULL && - mod->mod_open_req->rq_type != LI_POISON, - "POISONED open %p!\n", mod->mod_open_req); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + struct ldlm_lock *lock; + struct mdt_rec_resync *rec; + int count = 0, rc; + ENTRY; - DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); - /* We no longer want to preserve this open for replay even - * though the open was committed. 
b=3632, b=3633 */ - spin_lock(&mod->mod_open_req->rq_lock); - mod->mod_open_req->rq_replay = 0; - spin_unlock(&mod->mod_open_req->rq_lock); + if (op_data->op_flags & MF_MDC_CANCEL_FID1 && + fid_is_sane(&op_data->op_fid1)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RESYNC); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); } - if (exp_connect_cancelset(exp) && req) - ldlm_cli_cancel_list(&cancels, count, req, 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } - mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + CLASSERT(sizeof(*rec) == sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->rs_opcode = REINT_RESYNC; + rec->rs_fsuid = op_data->op_fsuid; + rec->rs_fsgid = op_data->op_fsgid; + rec->rs_cap = op_data->op_cap; + rec->rs_fid = op_data->op_fid1; + rec->rs_bias = op_data->op_bias; + rec->rs_mirror_id = op_data->op_mirror_id; + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + rec->rs_lease_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); ptlrpc_request_set_replen(req); rc = mdc_reint(req, LUSTRE_IMP_FULL); - *request = req; - if (rc == -ERESTARTSYS) - rc = 0; + if (rc == -ERESTARTSYS) + rc = 0; - RETURN(rc); + ptlrpc_req_finished(req); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c index 6c8da5866a8b9..5a29a285e5943 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,21 +41,23 @@ #ifdef HAVE_UIDGID_HEADER # include #endif +#include -#include +#include #include #include #include #include #include -#include +#include #include #include #include -#include +#include #include #include +#include #include "mdc_internal.h" @@ -191,20 +193,34 @@ static int mdc_getattr_common(struct obd_export *exp, RETURN(0); } +static void mdc_reset_acl_req(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_early_free_lock); + sptlrpc_cli_free_repbuf(req); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_reqdata_len = 0; + spin_unlock(&req->rq_early_free_lock); +} + static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct ptlrpc_request *req; - int rc; - ENTRY; + struct ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; /* Single MDS without an LMV case */ if (op_data->op_flags & MF_GET_MDT_IDX) { op_data->op_mds = 0; RETURN(0); } - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR); if (req == NULL) RETURN(-ENOMEM); @@ -214,33 +230,42 @@ static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } +again: mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, op_data->op_mode, -1, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - ptlrpc_request_set_replen(req); + rc = mdc_getattr_common(exp, req); + if (rc) { + if (rc == -ERANGE && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + mdc_reset_acl_req(req); + goto again; + } - rc = mdc_getattr_common(exp, req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - RETURN(rc); + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); } static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct ptlrpc_request *req; - int rc; - ENTRY; + struct ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_GETATTR_NAME); + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR_NAME); if (req == NULL) RETURN(-ENOMEM); @@ -253,9 +278,6 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } - mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - op_data->op_mode, op_data->op_suppgids[0], 0); - if (op_data->op_name) { char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == @@ -263,18 +285,29 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, memcpy(name, op_data->op_name, op_data->op_namelen); } - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - 
req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); +again: + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + ptlrpc_request_set_replen(req); - rc = mdc_getattr_common(exp, req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - RETURN(rc); + rc = mdc_getattr_common(exp, req); + if (rc) { + if (rc == -ERANGE && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); } static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, @@ -294,16 +327,25 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, if (req == NULL) RETURN(-ENOMEM); - if (xattr_name) { - xattr_namelen = strlen(xattr_name) + 1; - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - xattr_namelen); - } - if (input_size) { - LASSERT(input); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - input_size); - } + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); /* Flush local XATTR locks to get rid of a possible cancel RPC */ if (opcode == MDS_REINT && fid_is_sane(fid) && @@ -333,11 +375,11 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, } } - if (opcode == MDS_REINT) { - struct mdt_rec_setxattr *rec; + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; - CLASSERT(sizeof(struct mdt_rec_setxattr) == - sizeof(struct mdt_rec_reint)); + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); rec->sx_opcode = REINT_SETXATTR; rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); @@ -363,6 +405,8 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, memcpy(tmp, input, input_size); } + mdc_file_sepol_pack(req); + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, output_size); @@ -385,26 +429,77 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, } static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *xattr_name, - const char *input, int input_size, int output_size, - int flags, __u32 suppgid, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) { + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRRM); + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, - fid, MDS_REINT, valid, xattr_name, - input, input_size, output_size, flags, - suppgid, request); + fid, MDS_REINT, obd_md_valid, name, + value, value_size, 0, xattr_flags, suppgid, + req); } static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *xattr_name, - const char *input, int input_size, int output_size, - int flags, struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) { - return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, - fid, MDS_GETXATTR, valid, xattr_name, - input, input_size, output_size, flags, - -1, request); + struct mdt_body *body; + int rc; + + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRLS); + + CDEBUG(D_INFO, "%s: get xattr '%s' for "DFID"\n", + exp->exp_obd->obd_name, name, PFID(fid)); + rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, + obd_md_valid, name, NULL, 0, buf_size, 0, -1, + req); + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&(*req)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* only detect the xattr size */ + if (buf_size == 0) { + /* LU-11109: Older MDTs do not distinguish + * between nonexistent xattrs and zero length + * values in this case. Newer MDTs will return + * -ENODATA or set OBD_MD_FLXATTR. */ + GOTO(out, rc = body->mbo_eadatasize); + } + + if (body->mbo_eadatasize == 0) { + /* LU-11109: Newer MDTs set OBD_MD_FLXATTR on + * success so that we can distinguish between + * zero length value and nonexistent xattr. + * + * If OBD_MD_FLXATTR is not set then we keep + * the old behavior and return -ENODATA for + * getxattr() when mbo_eadatasize is 0. But + * -ENODATA only makes sense for getxattr() + * and not for listxattr(). 
*/ + if (body->mbo_valid & OBD_MD_FLXATTR) + GOTO(out, rc = 0); + else if (obd_md_valid == OBD_MD_FLXATTR) + GOTO(out, rc = -ENODATA); + else + GOTO(out, rc = 0); + } + + GOTO(out, rc = body->mbo_eadatasize); +out: + if (rc < 0) { + ptlrpc_req_finished(*req); + *req = NULL; + } + + return rc; } #ifdef CONFIG_FS_POSIX_ACL @@ -552,41 +647,41 @@ int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) void mdc_replay_open(struct ptlrpc_request *req) { - struct md_open_data *mod = req->rq_cb_data; - struct ptlrpc_request *close_req; - struct obd_client_handle *och; - struct lustre_handle old; - struct mdt_body *body; - ENTRY; + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old_open_handle = { }; + struct mdt_body *body; + ENTRY; - if (mod == NULL) { - DEBUG_REQ(D_ERROR, req, - "Can't properly replay without open data."); - EXIT; - return; - } + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + EXIT; + return; + } - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body != NULL); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); spin_lock(&req->rq_lock); och = mod->mod_och; - if (och && och->och_fh.cookie) + if (och && och->och_open_handle.cookie) req->rq_early_free_repbuf = 1; else req->rq_early_free_repbuf = 0; spin_unlock(&req->rq_lock); if (req->rq_early_free_repbuf) { - struct lustre_handle *file_fh; + struct lustre_handle *file_open_handle; LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); - file_fh = &och->och_fh; + file_open_handle = &och->och_open_handle; CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", - file_fh->cookie, body->mbo_handle.cookie); - old = *file_fh; - *file_fh = body->mbo_handle; + file_open_handle->cookie, body->mbo_open_handle.cookie); + old_open_handle = *file_open_handle; + *file_open_handle = body->mbo_open_handle; } close_req = mod->mod_close_req; @@ -600,10 +695,11 @@ void mdc_replay_open(struct ptlrpc_request *req) LASSERT(epoch); if (req->rq_early_free_repbuf) - LASSERT(!memcmp(&old, &epoch->mio_handle, sizeof(old))); + LASSERT(old_open_handle.cookie == + epoch->mio_open_handle.cookie); DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); - epoch->mio_handle = body->mbo_handle; + epoch->mio_open_handle = body->mbo_open_handle; } EXIT; } @@ -685,20 +781,20 @@ int mdc_set_open_replay_data(struct obd_export *exp, open_req->rq_commit_cb = mdc_commit_open; open_req->rq_early_free_repbuf = 1; spin_unlock(&open_req->rq_lock); - } + } rec->cr_fid2 = body->mbo_fid1; - rec->cr_ioepoch = body->mbo_ioepoch; - rec->cr_old_handle.cookie = body->mbo_handle.cookie; + rec->cr_open_handle_old = body->mbo_open_handle; open_req->rq_replay_cb = mdc_replay_open; if (!fid_is_sane(&body->mbo_fid1)) { - DEBUG_REQ(D_ERROR, open_req, "Saving replay request with " - "insane fid"); - LBUG(); - } + DEBUG_REQ(D_ERROR, open_req, + "saving replay request with insane FID " DFID, + PFID(&body->mbo_fid1)); + LBUG(); + } - DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); - RETURN(0); + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); } static void mdc_free_open(struct md_open_data *mod) @@ -742,7 +838,7 @@ int mdc_clear_open_replay_data(struct obd_export *exp, spin_lock(&mod->mod_open_req->rq_lock); if (mod->mod_och) - mod->mod_och->och_fh.cookie = 0; + mod->mod_och->och_open_handle.cookie = 0; mod->mod_open_req->rq_early_free_repbuf = 0; 
spin_unlock(&mod->mod_open_req->rq_lock); mdc_free_open(mod); @@ -760,23 +856,35 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct req_format *req_fmt; + size_t u32_count = 0; int rc; int saved_rc = 0; ENTRY; - if (op_data->op_bias & MDS_HSM_RELEASE) { - req_fmt = &RQF_MDS_INTENT_CLOSE; + CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + op_data->op_bias); + + if (op_data->op_bias & MDS_CLOSE_INTENT) { + req_fmt = &RQF_MDS_CLOSE_INTENT; + if (op_data->op_bias & MDS_HSM_RELEASE) { + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, + op_data); + if (rc < 0) { + CERROR("%s: "DFID" allocating FID: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) { + size_t count = op_data->op_data_size / sizeof(__u32); - /* allocate a FID for volatile file */ - rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc < 0) { - CERROR("%s: "DFID" failed to allocate FID: %d\n", - obd->obd_name, PFID(&op_data->op_fid1), rc); - /* save the errcode and proceed to close */ - saved_rc = rc; + if (count > INLINE_RESYNC_ARRAY_SIZE) + u32_count = count; } - } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) { - req_fmt = &RQF_MDS_INTENT_CLOSE; } else { req_fmt = &RQF_MDS_CLOSE; } @@ -814,6 +922,10 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, GOTO(out, rc = -ENOMEM); } + if (u32_count > 0) + req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, + u32_count * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); if (rc) { ptlrpc_request_free(req); @@ -827,6 +939,9 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, req->rq_request_portal = MDS_READPAGE_PORTAL; ptlrpc_at_set_req_timeout(req); + if (!(exp_connect_flags2(exp) & OBD_CONNECT2_LSOM)) + op_data->op_xvalid &= ~(OP_XVALID_LAZYSIZE | + OP_XVALID_LAZYBLOCKS); mdc_close_pack(req, op_data); @@ -1110,12 +1225,12 @@ static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) int i; for (i = 0; i < cfs_pgs; i++) { - struct lu_dirpage *dp = kmap(pages[i]); - struct lu_dirpage *first = dp; - struct lu_dirent *end_dirent = NULL; - struct lu_dirent *ent; - __u64 hash_end = le64_to_cpu(dp->ldp_hash_end); - __u32 flags = le32_to_cpu(dp->ldp_flags); + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; while (--lu_pgs > 0) { ent = lu_dirent_start(dp); @@ -1130,8 +1245,8 @@ static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) break; /* Save the hash and flags of this lu_dirpage. */ - hash_end = le64_to_cpu(dp->ldp_hash_end); - flags = le32_to_cpu(dp->ldp_flags); + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; /* Check if lu_dirpage contains no entries. 
*/ if (end_dirent == NULL) @@ -1429,33 +1544,48 @@ static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, goto out_unlock; } - static int mdc_statfs(const struct lu_env *env, struct obd_export *exp, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) + time64_t max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req; - struct obd_statfs *msfs; - struct obd_import *imp = NULL; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct req_format *fmt; + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + ENTRY; /* * Since the request might also come from lprocfs, so we need * sync this with client_disconnect_export Bug15684 */ down_read(&obd->u.cli.cl_sem); - if (obd->u.cli.cl_import) - imp = class_import_get(obd->u.cli.cl_import); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); up_read(&obd->u.cli.cl_sem); - if (!imp) - RETURN(-ENODEV); + if (!imp) + RETURN(-ENODEV); + + fmt = &RQF_MDS_STATFS; + if ((exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS) && + (flags & OBD_STATFS_SUM)) + fmt = &RQF_MDS_STATFS_NEW; + req = ptlrpc_request_alloc_pack(imp, fmt, LUSTRE_MDS_VERSION, + MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, - LUSTRE_MDS_VERSION, MDS_STATFS); - if (req == NULL) - GOTO(output, rc = -ENOMEM); + if ((flags & OBD_STATFS_SUM) && + (exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS)) { + /* request aggregated states */ + struct mdt_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + body->mbo_valid = OBD_MD_FLAGSTATFS; + } ptlrpc_request_set_replen(req); @@ -1571,29 +1701,53 @@ static int mdc_ioc_hsm_progress(struct obd_export *exp, ptlrpc_req_finished(req); return rc; } - -static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) +/** + * Send hsm_ct_register to MDS + * + * \param[in] imp import + * \param[in] archive_count if in bitmap format, it is the bitmap, + * else it is the count of archive_ids + * \param[in] archives if in bitmap format, it is NULL, + * else it is archive_id lists + */ +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archive_count, + __u32 *archives) { - __u32 *archive_mask; - struct ptlrpc_request *req; - int rc; + struct ptlrpc_request *req; + __u32 *archive_array; + size_t archives_size; + int rc; ENTRY; - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, - LUSTRE_MDS_VERSION, - MDS_HSM_CT_REGISTER); + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_CT_REGISTER); if (req == NULL) - GOTO(out, rc = -ENOMEM); + RETURN(-ENOMEM); + + if (archives != NULL) + archives_size = sizeof(*archive_array) * archive_count; + else + archives_size = sizeof(archive_count); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE, + RCL_CLIENT, archives_size); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_CT_REGISTER); + if (rc) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } mdc_pack_body(req, NULL, 0, 0, -1, 0); - /* Copy hsm_progress struct */ - archive_mask = req_capsule_client_get(&req->rq_pill, - &RMF_MDS_HSM_ARCHIVE); - if (archive_mask == NULL) + archive_array = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_array == NULL) GOTO(out, rc = -EPROTO); - *archive_mask = archives; + if (archives != NULL) + memcpy(archive_array, archives, archives_size); + else + 
*archive_array = archive_count; ptlrpc_request_set_replen(req); @@ -1977,7 +2131,7 @@ static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, GOTO(out, rc = -EFAULT); rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, 0); if (rc != 0) GOTO(out, rc); @@ -2061,6 +2215,13 @@ static int mdc_get_info_rpc(struct obd_export *exp, RCL_SERVER, vallen); ptlrpc_request_set_replen(req); + /* if server failed to resolve FID, and OI scrub not able to fix it, it + * will return -EINPROGRESS, ptlrpc_queue_wait() will keep retrying, + * set request interruptible to avoid deadlock. + */ + if (KEY_IS(KEY_FID2PATH)) + req->rq_allow_intr = 1; + rc = ptlrpc_queue_wait(req); /* -EREMOTE means the get_info result is partial, and it needs to * continue on another MDT, see fid2path part in lmv_iocontrol */ @@ -2114,9 +2275,8 @@ static void lustre_swab_kuch(struct kuc_hdr *l) static int mdc_ioc_hsm_ct_start(struct obd_export *exp, struct lustre_kernelcomm *lk) { - struct obd_import *imp = class_exp2cliimp(exp); - __u32 archive = lk->lk_data; - int rc = 0; + struct obd_import *imp = class_exp2cliimp(exp); + int rc = 0; if (lk->lk_group != KUC_GRP_HSM) { CERROR("Bad copytool group %d\n", lk->lk_group); @@ -2130,7 +2290,12 @@ static int mdc_ioc_hsm_ct_start(struct obd_export *exp, /* Unregister with the coordinator */ rc = mdc_ioc_hsm_ct_unregister(imp); } else { - rc = mdc_ioc_hsm_ct_register(imp, archive); + __u32 *archives = NULL; + + if ((lk->lk_flags & LK_FLG_DATANR) && lk->lk_data_count > 0) + archives = lk->lk_data; + + rc = mdc_ioc_hsm_ct_register(imp, lk->lk_data_count, archives); } return rc; @@ -2181,17 +2346,29 @@ static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, */ static int mdc_hsm_ct_reregister(void *data, void *cb_arg) { - struct kkuc_ct_data *kcd = data; - struct obd_import *imp = (struct obd_import *)cb_arg; - int rc; + struct obd_import *imp = (struct obd_import *)cb_arg; + struct kkuc_ct_data *kcd = data; + __u32 *archives = NULL; + int rc; - if (kcd == NULL || kcd->kcd_magic != KKUC_CT_DATA_MAGIC) + if (kcd == NULL || + (kcd->kcd_magic != KKUC_CT_DATA_ARRAY_MAGIC && + kcd->kcd_magic != KKUC_CT_DATA_BITMAP_MAGIC)) return -EPROTO; - CDEBUG(D_HA, "%s: recover copytool registration to MDT (archive=%#x)\n", - imp->imp_obd->obd_name, kcd->kcd_archive); - rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_archive); + if (kcd->kcd_magic == KKUC_CT_DATA_BITMAP_MAGIC) { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive=%#x)\n", imp->imp_obd->obd_name, + kcd->kcd_nr_archives); + } else { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive nr = %u)\n", + imp->imp_obd->obd_name, kcd->kcd_nr_archives); + if (kcd->kcd_nr_archives != 0) + archives = kcd->kcd_archives; + } + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_nr_archives, archives); /* ignore error if the copytool is already registered */ return (rc == -EEXIST) ? 
0 : rc; } @@ -2237,14 +2414,6 @@ static int mdc_set_info_async(const struct lu_env *env, keylen, key, vallen, val, set); RETURN(rc); } - if (KEY_IS(KEY_SPTLRPC_CONF)) { - sptlrpc_conf_client_adapt(exp->exp_obd); - RETURN(0); - } - if (KEY_IS(KEY_FLUSH_CTX)) { - sptlrpc_import_flush_my_ctx(imp); - RETURN(0); - } if (KEY_IS(KEY_CHANGELOG_CLEAR)) { rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, keylen, key, vallen, val, set); @@ -2263,8 +2432,8 @@ static int mdc_set_info_async(const struct lu_env *env, RETURN(0); } - CERROR("Unknown key %s\n", (char *)key); - RETURN(-EINVAL); + rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set); + RETURN(rc); } static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, @@ -2340,17 +2509,97 @@ static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } +struct mdc_rmfid_args { + int *mra_rcs; + int mra_nr; +}; + +int mdc_rmfid_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_rmfid_args *aa; + int *rcs, size; + ENTRY; + + if (!rc) { + aa = ptlrpc_req_async_args(req); + + size = req_capsule_get_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER); + LASSERT(size == sizeof(int) * aa->mra_nr); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + LASSERT(rcs); + LASSERT(aa->mra_rcs); + LASSERT(aa->mra_nr); + memcpy(aa->mra_rcs, rcs, size); + } + + RETURN(rc); +} + +static int mdc_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct mdc_rmfid_args *aa; + struct mdt_body *b; + struct lu_fid *tmp; + int rc, flen; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_RMFID); + if (req == NULL) + RETURN(-ENOMEM); + + flen = fa->fa_nr * sizeof(struct lu_fid); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_CLIENT, flen); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_SERVER, flen); + req_capsule_set_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER, fa->fa_nr * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_RMFID); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FID_ARRAY); + memcpy(tmp, fa->fa_fids, flen); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + b->mbo_ctime = ktime_get_real_seconds(); + + ptlrpc_request_set_replen(req); + + LASSERT(rcs); + aa = ptlrpc_req_async_args(req); + aa->mra_rcs = rcs; + aa->mra_nr = fa->fa_nr; + req->rq_interpret_reply = mdc_rmfid_interpret; + + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + + RETURN(rc); +} + static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, enum obd_import_event event) { + struct client_obd *cli = &obd->u.cli; int rc = 0; LASSERT(imp->imp_obd == obd); switch (event) { - - case IMP_EVENT_INACTIVE: { - struct client_obd *cli = &obd->u.cli; + case IMP_EVENT_DISCON: + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + case IMP_EVENT_INACTIVE: /* * Flush current sequence to make client obtain new one * from server in case of disconnect/reconnect. 
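/*
 * Standalone sketch for illustration (not part of the patch hunks around it):
 * the reworked mdc_ioc_hsm_ct_register() sizes the RMF_MDS_HSM_ARCHIVE buffer
 * two ways, depending on whether the copytool passed an explicit archive-ID
 * array (LK_FLG_DATANR) or the legacy single-word bitmap.  The helper below
 * reproduces only that sizing logic with simplified stand-in types.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static size_t hsm_archive_payload_size(uint32_t archive_count,
				       const uint32_t *archives)
{
	/* Array form: one 32-bit ID per archive; bitmap form: one word. */
	if (archives != NULL)
		return sizeof(*archives) * archive_count;
	return sizeof(archive_count);
}

int main(void)
{
	uint32_t ids[] = { 1, 2, 5 };

	printf("array form:  %zu bytes\n",
	       hsm_archive_payload_size(3, ids));		/* 12 */
	printf("bitmap form: %zu bytes\n",
	       hsm_archive_payload_size(0x16, NULL));	/* 4  */
	return 0;
}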
@@ -2362,12 +2611,28 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); break; - } case IMP_EVENT_INVALIDATE: { struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants. All pages go to failing rpcs due to + * the invalid import. + */ + osc_io_unplug(env, cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else { + rc = PTR_ERR(env); + } break; } case IMP_EVENT_ACTIVE: @@ -2376,10 +2641,15 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, if (rc == 0) rc = mdc_kuc_reregister(imp); break; - case IMP_EVENT_OCD: + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (OCD_HAS_FLAG(ocd, GRANT)) + osc_init_grant(cli, ocd); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); break; - case IMP_EVENT_DISCON: + } case IMP_EVENT_DEACTIVATE: case IMP_EVENT_ACTIVATE: break; @@ -2428,6 +2698,12 @@ static int mdc_cancel_weight(struct ldlm_lock *lock) if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) RETURN(0); + /* Special case for DoM locks, cancel only unused and granted locks */ + if (ldlm_has_dom(lock) && + (lock->l_granted_mode != lock->l_req_mode || + osc_ldlm_weigh_ast(lock) != 0)) + RETURN(0); + RETURN(1); } @@ -2476,25 +2752,21 @@ static void mdc_llog_finish(struct obd_device *obd) EXIT; } -static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) { - int rc; + int rc; + ENTRY; - rc = ptlrpcd_addref(); + rc = osc_setup_common(obd, cfg); if (rc < 0) RETURN(rc); - rc = client_obd_setup(obd, cfg); - if (rc) - GOTO(err_ptlrpcd_decref, rc); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_mdc_obd_vars; - lprocfs_obd_setup(obd); - lprocfs_alloc_md_stats(obd, 0); -#endif - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); + rc = mdc_tunables_init(obd); + if (rc) + GOTO(err_osc_cleanup, rc); + + obd->u.cli.cl_dom_min_inline_repsize = MDC_DOM_DEF_INLINE_REPSIZE; ns_register_cancel(obd->obd_namespace, mdc_cancel_weight); @@ -2504,25 +2776,26 @@ static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) if (rc) { CERROR("%s: failed to setup llogging subsystems: rc = %d\n", obd->obd_name, rc); - GOTO(err_mdc_cleanup, rc); + GOTO(err_llog_cleanup, rc); } rc = mdc_changelog_cdev_init(obd); if (rc) { CERROR("%s: failed to setup changelog char device: rc = %d\n", obd->obd_name, rc); - GOTO(err_mdc_cleanup, rc); + GOTO(err_changelog_cleanup, rc); } - EXIT; -err_mdc_cleanup: - if (rc) - client_obd_cleanup(obd); + RETURN(rc); -err_ptlrpcd_decref: - if (rc) - ptlrpcd_decref(); - return rc; +err_changelog_cleanup: + mdc_llog_finish(obd); +err_llog_cleanup: + lprocfs_free_md_stats(obd); + ptlrpc_lprocfs_unregister_obd(obd); +err_osc_cleanup: + osc_cleanup_common(obd); + return rc; } /* Initialize the default and maximum LOV EA sizes. 
This allows @@ -2553,6 +2826,7 @@ static int mdc_precleanup(struct obd_device *obd) { ENTRY; + osc_precleanup_common(obd); mdc_changelog_cdev_finish(obd); obd_cleanup_client_import(obd); @@ -2564,16 +2838,16 @@ static int mdc_precleanup(struct obd_device *obd) static int mdc_cleanup(struct obd_device *obd) { - ptlrpcd_decref(); - - return client_obd_cleanup(obd); + return osc_cleanup_common(obd); } -static int mdc_process_config(struct obd_device *obd, size_t len, void *buf) +int mdc_process_config(struct obd_device *obd, size_t len, void *buf) { - struct lustre_cfg *lcfg = buf; - int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd); - return (rc > 0 ? 0: rc); + struct lustre_cfg *lcfg = buf; + size_t count = class_modify_config(lcfg, PARAM_MDC, + &obd->obd_kset.kobj); + + return count > 0 ? 0 : count; } static struct obd_ops mdc_obd_ops = { @@ -2584,7 +2858,8 @@ static struct obd_ops mdc_obd_ops = { .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, - .o_disconnect = client_disconnect_export, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, .o_iocontrol = mdc_iocontrol, .o_set_info_async = mdc_set_info_async, .o_statfs = mdc_statfs, @@ -2600,42 +2875,69 @@ static struct obd_ops mdc_obd_ops = { static struct md_ops mdc_md_ops = { .m_get_root = mdc_get_root, - .m_null_inode = mdc_null_inode, - .m_close = mdc_close, - .m_create = mdc_create, - .m_enqueue = mdc_enqueue, - .m_getattr = mdc_getattr, - .m_getattr_name = mdc_getattr_name, - .m_intent_lock = mdc_intent_lock, - .m_link = mdc_link, - .m_rename = mdc_rename, - .m_setattr = mdc_setattr, - .m_setxattr = mdc_setxattr, - .m_getxattr = mdc_getxattr, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, .m_fsync = mdc_fsync, + .m_file_resync = mdc_file_resync, .m_read_page = mdc_read_page, - .m_unlink = mdc_unlink, - .m_cancel_unused = mdc_cancel_unused, - .m_init_ea_size = mdc_init_ea_size, - .m_set_lock_data = mdc_set_lock_data, - .m_lock_match = mdc_lock_match, - .m_get_lustre_md = mdc_get_lustre_md, - .m_free_lustre_md = mdc_free_lustre_md, - .m_set_open_replay_data = mdc_set_open_replay_data, - .m_clear_open_replay_data = mdc_clear_open_replay_data, - .m_intent_getattr_async = mdc_intent_getattr_async, - .m_revalidate_lock = mdc_revalidate_lock + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock, + .m_rmfid = mdc_rmfid, }; +dev_t mdc_changelog_dev; +struct class *mdc_changelog_class; static int __init mdc_init(void) { - return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, - LUSTRE_MDC_NAME, NULL); + int rc = 0; + rc = alloc_chrdev_region(&mdc_changelog_dev, 0, + MDC_CHANGELOG_DEV_COUNT, + MDC_CHANGELOG_DEV_NAME); + if (rc) + return rc; + + mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); + if 
(IS_ERR(mdc_changelog_class)) { + rc = PTR_ERR(mdc_changelog_class); + goto out_dev; + } + + rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, + LUSTRE_MDC_NAME, &mdc_device_type); + if (rc) + goto out_dev; + + return 0; + +out_dev: + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + return rc; } static void __exit mdc_exit(void) { - class_unregister_type(LUSTRE_MDC_NAME); + class_destroy(mdc_changelog_class); + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + class_unregister_type(LUSTRE_MDC_NAME); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c index ab1985d9d9d24..f277d3e489e70 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,33 +39,26 @@ #ifdef CONFIG_PROC_FS -LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, import); -LPROC_SEQ_FOPS_RO_TYPE(mgc, state); +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, connect_flags); -LPROC_SEQ_FOPS_WO_TYPE(mgc, ping); +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, server_uuid); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, import); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, state); static int mgc_ir_state_seq_show(struct seq_file *m, void *v) { return lprocfs_mgc_rd_ir_state(m, m->private); } -LPROC_SEQ_FOPS_RO(mgc_ir_state); -struct lprocfs_vars lprocfs_mgc_obd_vars[] = { - { .name = "uuid", - .fops = &mgc_uuid_fops }, - { .name = "ping", - .fops = &mgc_ping_fops, - .proc_mode = 0222 }, +LDEBUGFS_SEQ_FOPS_RO(mgc_ir_state); + +struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { { .name = "connect_flags", .fops = &mgc_connect_flags_fops }, { .name = "mgs_server_uuid", .fops = &mgc_server_uuid_fops }, - { .name = "mgs_conn_uuid", - .fops = &mgc_conn_uuid_fops }, { .name = "import", .fops = &mgc_import_fops }, { .name = "state", @@ -75,3 +68,28 @@ struct lprocfs_vars lprocfs_mgc_obd_vars[] = { { NULL } }; #endif /* CONFIG_PROC_FS */ + +LUSTRE_ATTR(mgs_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static struct attribute *mgc_attrs[] = { + &lustre_attr_mgs_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + NULL, +}; + +int mgc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = mgc_attrs; + obd->obd_debugfs_vars = ldebugfs_mgc_obd_vars; + rc = lprocfs_obd_setup(obd, true); + if (rc) + return rc; + + return sptlrpc_lprocfs_cliobd_attach(obd); +} diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h index 1a37720e901eb..50a13ebf4d3ca 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
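/*
 * Standalone sketch for illustration (not part of the patch hunks around it):
 * the new mdc_init() acquires resources in order (chrdev region, device
 * class, obd type) and rolls back what it already took when a later step
 * fails.  The demo_* functions are placeholders standing in for
 * alloc_chrdev_region()/class_create()/class_register_type(); only the
 * acquire-and-unwind structure is the point here.
 */
#include <stdio.h>

static int demo_alloc_region(void)	{ return 0; }
static void demo_free_region(void)	{ }
static int demo_create_class(void)	{ return 0; }
static void demo_destroy_class(void)	{ }
static int demo_register_type(void)	{ return -1; }	/* force a failure */

static int demo_init(void)
{
	int rc;

	rc = demo_alloc_region();
	if (rc)
		return rc;

	rc = demo_create_class();
	if (rc)
		goto out_region;

	rc = demo_register_type();
	if (rc)
		goto out_class;

	return 0;

out_class:
	demo_destroy_class();
out_region:
	demo_free_region();
	return rc;
}

int main(void)
{
	printf("demo_init() = %d\n", demo_init());
	return 0;
}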
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,16 +34,13 @@ #define _MGC_INTERNAL_H #include -#include #include #include #include #include -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_mgc_obd_vars[]; +int mgc_tunables_init(struct obd_device *obd); int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); -#endif /* CONFIG_PROC_FS */ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c index a2a2bdd1f0732..a495e75ad5b4f 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -118,7 +118,7 @@ EXPORT_SYMBOL(mgc_logname2resid); /********************** config llog list **********************/ static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list); -static DEFINE_SPINLOCK(config_list_lock); +static DEFINE_SPINLOCK(config_list_lock); /* protects config_llog_list */ /* Take a reference to a config log */ static int config_log_get(struct config_llog_data *cld) @@ -170,18 +170,18 @@ static struct config_llog_data *config_log_find(char *logname, struct config_llog_instance *cfg) { - struct config_llog_data *cld; - struct config_llog_data *found = NULL; - void * instance; - ENTRY; + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + unsigned long cfg_instance; - LASSERT(logname != NULL); + ENTRY; + LASSERT(logname != NULL); - instance = cfg ? cfg->cfg_instance : NULL; + cfg_instance = cfg ? cfg->cfg_instance : 0; spin_lock(&config_list_lock); list_for_each_entry(cld, &config_llog_list, cld_list_chain) { - /* check if instance equals */ - if (instance != cld->cld_cfg.cfg_instance) + /* check if cfg_instance is the one we want */ + if (cfg_instance != cld->cld_cfg.cfg_instance) continue; /* instance may be NULL, should check name */ @@ -207,8 +207,8 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, ENTRY; - CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, - cfg ? cfg->cfg_instance : NULL); + CDEBUG(D_MGC, "do adding config log %s-%016lx\n", logname, + cfg ? 
cfg->cfg_instance : 0); OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); if (!cld) @@ -253,47 +253,49 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, } static struct config_llog_data *config_recover_log_add(struct obd_device *obd, - char *fsname, - struct config_llog_instance *cfg, - struct super_block *sb) + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) { - struct config_llog_instance lcfg = *cfg; - struct lustre_sb_info *lsi = s2lsi(sb); - struct config_llog_data *cld; - char logname[32]; + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; if (IS_OST(lsi)) - return NULL; + return NULL; /* for osp-on-ost, see lustre_start_osp() */ if (IS_MDT(lsi) && lcfg.cfg_instance) return NULL; - /* we have to use different llog for clients and mdts for cmd - * where only clients are notified if one of cmd server restarts */ - LASSERT(strlen(fsname) < sizeof(logname) / 2); - strcpy(logname, fsname); + /* We have to use different llog for clients and MDTs for DNE, + * where only clients are notified if one of DNE server restarts. + */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strncpy(logname, fsname, sizeof(logname)); if (IS_SERVER(lsi)) { /* mdt */ - LASSERT(lcfg.cfg_instance == NULL); - lcfg.cfg_instance = sb; - strcat(logname, "-mdtir"); - } else { - LASSERT(lcfg.cfg_instance != NULL); - strcat(logname, "-cliir"); - } + LASSERT(lcfg.cfg_instance == 0); + lcfg.cfg_instance = ll_get_cfg_instance(sb); + strncat(logname, "-mdtir", sizeof(logname)); + } else { + LASSERT(lcfg.cfg_instance != 0); + strncat(logname, "-cliir", sizeof(logname)); + } - cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); - return cld; + cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); + return cld; } static struct config_llog_data *config_log_find_or_add(struct obd_device *obd, char *logname, struct super_block *sb, int type, struct config_llog_instance *cfg) { - struct config_llog_instance lcfg = *cfg; - struct config_llog_data *cld; + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; - lcfg.cfg_instance = sb != NULL ? (void *)sb : (void *)obd; + /* Note class_config_llog_handler() depends on getting "obd" back */ + lcfg.cfg_instance = sb ? 
ll_get_cfg_instance(sb) : (unsigned long)obd; cld = config_log_find(logname, &lcfg); if (unlikely(cld != NULL)) @@ -323,7 +325,8 @@ config_log_add(struct obd_device *obd, char *logname, bool locked = false; ENTRY; - CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); + CDEBUG(D_MGC, "add config log %s-%016lx\n", logname, + cfg->cfg_instance); /* * for each regular log, the depended sptlrpc log name is @@ -533,16 +536,15 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg) RETURN(rc); } -#ifdef CONFIG_PROC_FS int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) { struct obd_device *obd = data; struct obd_import *imp; struct obd_connect_data *ocd; struct config_llog_data *cld; - ENTRY; - LASSERT(obd != NULL); + ENTRY; + LASSERT(obd); LPROCFS_CLIMP_CHECK(obd); imp = obd->u.cli.cl_import; ocd = &imp->imp_connect_data; @@ -564,7 +566,6 @@ int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) LPROCFS_CLIMP_EXIT(obd); RETURN(0); } -#endif /* reenqueue any lost locks */ #define RQ_RUNNING 0x1 @@ -962,11 +963,9 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(err_cleanup, rc); } -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_mgc_obd_vars; - lprocfs_obd_setup(obd); -#endif - sptlrpc_lprocfs_cliobd_attach(obd); + rc = mgc_tunables_init(obd); + if (rc) + GOTO(err_sysfs, rc); if (atomic_inc_return(&mgc_count) == 1) { rq_state = 0; @@ -979,7 +978,7 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("%s: cannot start requeue thread: rc = %d; " "no more log updates\n", obd->obd_name, rc); - GOTO(err_cleanup, rc); + GOTO(err_sysfs, rc); } /* rc is the task_struct pointer of mgc_requeue_thread. */ rc = 0; @@ -988,6 +987,8 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) RETURN(rc); +err_sysfs: + lprocfs_obd_cleanup(obd); err_cleanup: client_obd_cleanup(obd); err_decref: @@ -1404,34 +1405,34 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, __u64 max_version, void *data, int datalen, bool mne_swab) { - struct config_llog_instance *cfg = &cld->cld_cfg; - struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); - struct mgs_nidtbl_entry *entry; - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs bufs; - u64 prev_version = 0; - char *inst; - char *buf; - int bufsz; - int pos; - int rc = 0; - int off = 0; - ENTRY; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; - LASSERT(cfg->cfg_instance != NULL); - LASSERT(cfg->cfg_sb == cfg->cfg_instance); + ENTRY; + LASSERT(cfg->cfg_instance != 0); + LASSERT(ll_get_cfg_instance(cfg->cfg_sb) == cfg->cfg_instance); OBD_ALLOC(inst, PAGE_SIZE); if (inst == NULL) RETURN(-ENOMEM); if (!IS_SERVER(lsi)) { - pos = snprintf(inst, PAGE_SIZE, "%p", cfg->cfg_instance); + pos = snprintf(inst, PAGE_SIZE, "%016lx", cfg->cfg_instance); if (pos >= PAGE_SIZE) { OBD_FREE(inst, PAGE_SIZE); return -E2BIG; } - } else { + } else { LASSERT(IS_MDT(lsi)); rc = server_name2svname(lsi->lsi_svname, inst, NULL, PAGE_SIZE); @@ -1636,8 +1637,7 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; /* don't need to get local config */ - if (cld_is_nodemap(cld) && - (LNET_NETTYP(LNET_NIDNET(mgc_conn->c_peer.nid)) == LOLND)) + if 
(cld_is_nodemap(cld) && LNetIsPeerLocal(mgc_conn->c_peer.nid)) GOTO(out, rc = 0); /* allocate buffer for bulk transfer. @@ -1748,15 +1748,8 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, #ifdef HAVE_SERVER_SUPPORT /* config changed since first read RPC */ if (cld_is_nodemap(cld) && config_read_offset == 0) { - recent_nodemap = NULL; - nodemap_config_dealloc(new_config); - new_config = NULL; - CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); - - /* setting eof to false, we request config again */ - eof = false; - GOTO(out, rc = 0); + GOTO(out, rc = -EAGAIN); } #endif if (!eof) @@ -1764,13 +1757,7 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, GOTO(out, rc); } - mne_swab = !!ptlrpc_rep_need_swab(req); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* This import flag means the server did an extra swab of IR MNE - * records (fixed in LU-1252), reverse it here if needed. LU-1644 */ - if (unlikely(req->rq_import->imp_need_mne_swab)) - mne_swab = !mne_swab; -#endif + mne_swab = ptlrpc_rep_need_swab(req); /* When a nodemap config is received, we build a new nodemap config, * with new nodemap structs. We keep track of the most recently added @@ -2062,12 +2049,12 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) mutex_lock(&cld->cld_lock); if (cld->cld_stopping) { mutex_unlock(&cld->cld_lock); - RETURN(0); - } + RETURN(0); + } - OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); - CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, + CDEBUG(D_MGC, "Process log %s-%016lx from %d\n", cld->cld_logname, cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); /* Get the cfg lock on the llog */ @@ -2109,6 +2096,11 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) goto restart; } else { mutex_lock(&cld->cld_lock); + /* unlock/lock mutex, so check stopping again */ + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } spin_lock(&config_list_lock); cld->cld_lostlock = 1; spin_unlock(&config_list_lock); @@ -2154,6 +2146,12 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) CERROR("Can't drop cfg lock: %d\n", rcl); } + /* requeue nodemap lock immediately if transfer was interrupted */ + if (cld_is_nodemap(cld) && rc == -EAGAIN) { + mgc_requeue_add(cld); + rc = 0; + } + RETURN(rc); } @@ -2212,11 +2210,6 @@ static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) break; } - /* COMPAT_146 */ - /* FIXME only set this for old logs! 
Right now this forces - us to always skip the "inside markers" check */ - cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; - rc = mgc_process_log(obd, cld); if (rc == 0 && cld->cld_recover != NULL) { if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> @@ -2287,7 +2280,7 @@ static struct obd_ops mgc_obd_ops = { static int __init mgc_init(void) { - return class_register_type(&mgc_obd_ops, NULL, true, NULL, + return class_register_type(&mgc_obd_ops, NULL, false, NULL, LUSTRE_MGC_NAME, NULL); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile index 57450ea2824c1..6f470dd9a2fc0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/Makefile +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -1,16 +1,14 @@ obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o -obdclass-linux-objs := linux-module.o linux-obdo.o linux-sysctl.o -obdclass-linux-objs := $(addprefix linux/,$(obdclass-linux-objs)) - -obdclass-y := $(obdclass-linux-objs) -obdclass-y += llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o -obdclass-y += class_obd.o debug.o genops.o uuid.o llog_ioctl.o +obdclass-y := llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o debug.o genops.o llog_ioctl.o obdclass-y += lprocfs_status.o lprocfs_counters.o obdclass-y += lustre_handles.o lustre_peer.o local_storage.o -obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obd_sysfs.o obdclass-y += lu_object.o dt_object.o obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o -obdclass-y += linkea.o kernelcomm.o +obdclass-y += linkea.o +obdclass-y += kernelcomm.o jobid.o +obdclass-y += integrity.o obd_cksum.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/acl.c b/drivers/staging/lustrefsx/lustre/obdclass/acl.c index 77ea22644e27b..599946f846ec3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/acl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/acl.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,20 +49,22 @@ #ifdef CONFIG_FS_POSIX_ACL static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, - posix_acl_xattr_entry *s) + posix_acl_xattr_entry *s) { - d->e_tag = le16_to_cpu(s->e_tag); - d->e_perm = le16_to_cpu(s->e_perm); - d->e_id = le32_to_cpu(s->e_id); + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); } -/*static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, - posix_acl_xattr_entry *s) +#if 0 +static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) { - d->e_tag = cpu_to_le16(s->e_tag); - d->e_perm = cpu_to_le16(s->e_perm); - d->e_id = cpu_to_le32(s->e_id); -}*/ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} +#endif /* * Check permission based on POSIX ACL. 
@@ -71,80 +73,79 @@ int lustre_posix_acl_permission(struct lu_ucred *mu, const struct lu_attr *la, int want, posix_acl_xattr_entry *entry, int count) { - posix_acl_xattr_entry *pa, *pe, *mask_obj; - posix_acl_xattr_entry ae, me; - int found = 0; + posix_acl_xattr_entry *pa, *pe, *mask_obj; + posix_acl_xattr_entry ae, me; + int found = 0; - if (count <= 0) - return -EACCES; + if (count <= 0) + return -EACCES; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { - lustre_posix_acl_le_to_cpu(&ae, pa); - switch (ae.e_tag) { - case ACL_USER_OBJ: - /* (May have been checked already) */ + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ if (la->la_uid == mu->uc_fsuid) goto check_perm; - break; - case ACL_USER: + break; + case ACL_USER: if (ae.e_id == mu->uc_fsuid) goto mask; - break; - case ACL_GROUP_OBJ: - if (lustre_in_group_p(mu, la->la_gid)) { - found = 1; - if ((ae.e_perm & want) == want) - goto mask; - } - break; - case ACL_GROUP: - if (lustre_in_group_p(mu, ae.e_id)) { - found = 1; - if ((ae.e_perm & want) == want) - goto mask; - } - break; - case ACL_MASK: - break; - case ACL_OTHER: - if (found) - return -EACCES; - else - goto check_perm; - default: - return -EIO; - } - } - return -EIO; + break; + case ACL_GROUP_OBJ: + if (lustre_in_group_p(mu, la->la_gid)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (lustre_in_group_p(mu, ae.e_id)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + goto check_perm; + default: + return -EIO; +} + } + return -EIO; mask: - for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { - lustre_posix_acl_le_to_cpu(&me, mask_obj); - if (me.e_tag == ACL_MASK) { - if ((ae.e_perm & me.e_perm & want) == want) - return 0; + for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { + lustre_posix_acl_le_to_cpu(&me, mask_obj); + if (me.e_tag == ACL_MASK) { + if ((ae.e_perm & me.e_perm & want) == want) + return 0; - return -EACCES; - } - } + return -EACCES; + } + } check_perm: - if ((ae.e_perm & want) == want) - return 0; + if ((ae.e_perm & want) == want) + return 0; - return -EACCES; + return -EACCES; } EXPORT_SYMBOL(lustre_posix_acl_permission); /* * Modify the ACL for the chmod. */ -int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, __u32 mode, - int count) +int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, u32 mode, + int count) { posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { switch (le16_to_cpu(pa->e_tag)) { case ACL_USER_OBJ: pa->e_perm = cpu_to_le16((mode & S_IRWXU) >> 6); @@ -187,8 +188,8 @@ lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, int count) { posix_acl_xattr_entry *pa, *pe; - mode_t mode = 0; - int not_equiv = 0; + mode_t mode = 0; + int not_equiv = 0; for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { __u16 perm = le16_to_cpu(pa->e_perm); @@ -224,19 +225,19 @@ EXPORT_SYMBOL(lustre_posix_acl_equiv_mode); /* * Modify acl when creating a new object. 
*/ -int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, __u32 *pmode, - int count) +int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, u32 *pmode, + int count) { - posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; - posix_acl_xattr_entry ae; - __u32 mode = *pmode; + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + posix_acl_xattr_entry ae; + u32 mode = *pmode; int not_equiv = 0; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { - lustre_posix_acl_le_to_cpu(&ae, pa); - switch (ae.e_tag) { - case ACL_USER_OBJ: - ae.e_perm &= (mode >> 6) | ~S_IRWXO; + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + ae.e_perm &= (mode >> 6) | ~(0007); pa->e_perm = cpu_to_le16(ae.e_perm); mode &= (ae.e_perm << 6) | ~S_IRWXU; break; @@ -244,39 +245,39 @@ int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, __u32 *pmode, case ACL_GROUP: not_equiv = 1; break; - case ACL_GROUP_OBJ: + case ACL_GROUP_OBJ: group_obj = pa; - break; - case ACL_OTHER: - ae.e_perm &= mode | ~S_IRWXO; + break; + case ACL_OTHER: + ae.e_perm &= mode | ~(0007); pa->e_perm = cpu_to_le16(ae.e_perm); - mode &= ae.e_perm | ~S_IRWXO; - break; - case ACL_MASK: + mode &= ae.e_perm | ~(0007); + break; + case ACL_MASK: mask_obj = pa; not_equiv = 1; - break; + break; default: return -EIO; - } - } + } + } if (mask_obj) { ae.e_perm = le16_to_cpu(mask_obj->e_perm) & - ((mode >> 3) | ~S_IRWXO); + ((mode >> 3) | ~(0007)); mode &= (ae.e_perm << 3) | ~S_IRWXG; - mask_obj->e_perm = cpu_to_le16(ae.e_perm); + mask_obj->e_perm = cpu_to_le16(ae.e_perm); } else { if (!group_obj) return -EIO; ae.e_perm = le16_to_cpu(group_obj->e_perm) & - ((mode >> 3) | ~S_IRWXO); + ((mode >> 3) | ~(0007)); mode &= (ae.e_perm << 3) | ~S_IRWXG; - group_obj->e_perm = cpu_to_le16(ae.e_perm); + group_obj->e_perm = cpu_to_le16(ae.e_perm); } *pmode = (*pmode & ~S_IRWXUGO) | mode; - return not_equiv; + return not_equiv; } EXPORT_SYMBOL(lustre_posix_acl_create_masq); #endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h index 0f95caf310755..0c1276deb37bc 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c index fc22b2c89f17d..181ef89299b2d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
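/*
 * Standalone sketch for illustration (not part of the patch hunks around it):
 * the acl.c hunks clamp each ACL entry's permissions against the create
 * mode, operating on the owner/group/other triplets of a POSIX mode
 * (0007 == S_IRWXO).  The two helpers below show the same masking
 * arithmetic in a self-contained form.
 */
#include <stdio.h>

static unsigned short clamp_user_obj(unsigned short e_perm, unsigned int mode)
{
	/* ACL_USER_OBJ: keep only bits also present in the owner triplet. */
	return e_perm & ((mode >> 6) | ~07u);
}

static unsigned short clamp_other(unsigned short e_perm, unsigned int mode)
{
	/* ACL_OTHER: keep only bits also present in the "other" triplet. */
	return e_perm & (mode | ~07u);
}

int main(void)
{
	unsigned int mode = 0640;	/* rw-r----- */

	printf("user obj rwx -> %o\n", clamp_user_obj(07, mode) & 07);	/* 6 */
	printf("other    rwx -> %o\n", clamp_other(07, mode) & 07);	/* 0 */
	return 0;
}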
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,7 +44,6 @@ #include #include #include "cl_internal.h" -#include /***************************************************************************** * @@ -122,6 +121,7 @@ void cl_io_fini(const struct lu_env *env, struct cl_io *io) /* Check ignore layout change conf */ LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, !io->ci_need_restart)); + case CIT_GLIMPSE: break; case CIT_LADVISE: break; @@ -188,9 +188,12 @@ EXPORT_SYMBOL(cl_io_sub_init); int cl_io_init(const struct lu_env *env, struct cl_io *io, enum cl_io_type iot, struct cl_object *obj) { - LASSERT(obj == cl_object_top(obj)); + LASSERT(obj == cl_object_top(obj)); - return cl_io_init0(env, io, iot, obj); + /* clear I/O restart from previous instance */ + io->ci_need_restart = 0; + + return cl_io_init0(env, io, iot, obj); } EXPORT_SYMBOL(cl_io_init); @@ -200,33 +203,24 @@ EXPORT_SYMBOL(cl_io_init); * \pre iot == CIT_READ || iot == CIT_WRITE */ int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, loff_t pos, size_t count) + enum cl_io_type iot, loff_t pos, size_t count) { LINVRNT(iot == CIT_READ || iot == CIT_WRITE); LINVRNT(io->ci_obj != NULL); ENTRY; - if (cfs_ptengine_weight(cl_io_engine) < 2) - io->ci_pio = 0; - LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, - "io %s range: [%llu, %llu) %s %s %s %s\n", - iot == CIT_READ ? "read" : "write", - pos, pos + count, - io->u.ci_rw.rw_nonblock ? "nonblock" : "block", - io->u.ci_rw.rw_append ? "append" : "-", - io->u.ci_rw.rw_sync ? "sync" : "-", - io->ci_pio ? "pio" : "-"); - - io->u.ci_rw.rw_range.cir_pos = pos; - io->u.ci_rw.rw_range.cir_count = count; - + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; RETURN(cl_io_init(env, io, iot, io->ci_obj)); } EXPORT_SYMBOL(cl_io_rw_init); static int cl_lock_descr_sort(const struct cl_lock_descr *d0, - const struct cl_lock_descr *d1) + const struct cl_lock_descr *d1) { return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), lu_object_fid(&d1->cld_obj->co_lu)); @@ -470,25 +464,25 @@ EXPORT_SYMBOL(cl_io_iter_fini); */ void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) { - const struct cl_io_slice *scan; + const struct cl_io_slice *scan; - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || - nob == 0); - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(cl_io_invariant(io)); + ENTRY; - ENTRY; + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); - io->u.ci_rw.rw_range.cir_pos += nob; - io->u.ci_rw.rw_range.cir_count -= nob; + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; - /* layers have to be notified. */ + /* layers have to be notified. 
*/ list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) scan->cis_iop->op[io->ci_type].cio_advance(env, scan, nob); } - EXIT; + EXIT; } /** @@ -687,6 +681,7 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; struct cl_page *pg; int rc; + ENTRY; cl_page_list_for_each(pg, &queue->c2_qin) { LASSERT(pg->cp_sync_io == NULL); @@ -715,7 +710,7 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, cl_page_list_for_each(pg, &queue->c2_qin) pg->cp_sync_io = NULL; } - return rc; + RETURN(rc); } EXPORT_SYMBOL(cl_io_submit_sync); @@ -738,53 +733,6 @@ int cl_io_cancel(const struct lu_env *env, struct cl_io *io, return result; } -static -struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) -{ - struct cl_io_pt *pt; - int rc; - - OBD_ALLOC(pt, sizeof(*pt)); - if (pt == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - pt->cip_next = NULL; - init_sync_kiocb(&pt->cip_iocb, io->u.ci_rw.rw_file); - pt->cip_iocb.ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - pt->cip_iocb.ki_left = count; -#elif defined(HAVE_KI_NBYTES) - pt->cip_iocb.ki_nbytes = count; -#endif - pt->cip_iter = io->u.ci_rw.rw_iter; - iov_iter_truncate(&pt->cip_iter, count); - pt->cip_file = io->u.ci_rw.rw_file; - pt->cip_iot = io->ci_type; - pt->cip_pos = pos; - pt->cip_count = count; - pt->cip_result = 0; - - rc = cfs_ptask_init(&pt->cip_task, io->u.ci_rw.rw_ptask, pt, - PTF_ORDERED | PTF_COMPLETE | - PTF_USER_MM | PTF_RETRY, smp_processor_id()); - if (rc) - GOTO(out_error, rc); - - CDEBUG(D_VFSTRACE, "submit %s range: [%llu, %llu)\n", - io->ci_type == CIT_READ ? "read" : "write", - pos, pos + count); - - rc = cfs_ptask_submit(&pt->cip_task, cl_io_engine); - if (rc) - GOTO(out_error, rc); - - RETURN(pt); - -out_error: - OBD_FREE(pt, sizeof(*pt)); - RETURN(ERR_PTR(rc)); -} - /** * Main io loop. * @@ -806,124 +754,50 @@ struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) */ int cl_io_loop(const struct lu_env *env, struct cl_io *io) { - struct cl_io_pt *pt = NULL, *head = NULL; - struct cl_io_pt **tail = &head; - loff_t pos; - size_t count; - size_t last_chunk_count = 0; - bool short_io = false; - int rc = 0; - ENTRY; + int result = 0; LINVRNT(cl_io_is_loopable(io)); + ENTRY; do { - io->ci_continue = 0; - - rc = cl_io_iter_init(env, io); - if (rc) { - cl_io_iter_fini(env, io); - break; - } - - pos = io->u.ci_rw.rw_range.cir_pos; - count = io->u.ci_rw.rw_range.cir_count; - - if (io->ci_pio) { - /* submit this range for parallel execution */ - pt = cl_io_submit_pt(io, pos, count); - if (IS_ERR(pt)) { - cl_io_iter_fini(env, io); - rc = PTR_ERR(pt); - break; - } - - *tail = pt; - tail = &pt->cip_next; - } else { - size_t nob = io->ci_nob; - - CDEBUG(D_VFSTRACE, - "execute type %u range: [%llu, %llu) nob: %zu %s\n", - io->ci_type, pos, pos + count, nob, - io->ci_continue ? "continue" : "stop"); + size_t nob; - rc = cl_io_lock(env, io); - if (rc) { - cl_io_iter_fini(env, io); - break; + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + ** - llite: ll_rw_stats_tally. 
+ */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); } - - /* - * Notify layers that locks has been taken, - * and do actual i/o. - * - * - llite: kms, short read; - * - llite: generic_file_read(); - */ - rc = cl_io_start(env, io); - - /* - * Send any remaining pending - * io, etc. - * - * - llite: ll_rw_stats_tally. - */ - cl_io_end(env, io); - cl_io_unlock(env, io); - - count = io->ci_nob - nob; - last_chunk_count = count; } - - cl_io_rw_advance(env, io, count); cl_io_iter_fini(env, io); - } while (!rc && io->ci_continue); - - CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n", - io->ci_type, io->ci_nob, rc, - io->ci_continue ? "continue" : "stop"); - - while (head != NULL) { - int rc2; - - pt = head; - head = head->cip_next; - - rc2 = cfs_ptask_wait_for(&pt->cip_task); - LASSERTF(!rc2, "wait for task error: %d\n", rc2); - - rc2 = cfs_ptask_result(&pt->cip_task); - CDEBUG(D_VFSTRACE, - "done %s range: [%llu, %llu) ret: %zd, rc: %d\n", - pt->cip_iot == CIT_READ ? "read" : "write", - pt->cip_pos, pt->cip_pos + pt->cip_count, - pt->cip_result, rc2); - if (rc2) - rc = rc ? rc : rc2; - if (!short_io) { - if (!rc2) /* IO is done by this task successfully */ - io->ci_nob += pt->cip_result; - if (pt->cip_result < pt->cip_count) { - /* short IO happened. - * Not necessary to be an error */ - CDEBUG(D_VFSTRACE, - "incomplete range: [%llu, %llu) " - "last_chunk_count: %zu\n", - pt->cip_pos, - pt->cip_pos + pt->cip_count, - last_chunk_count); - io->ci_nob -= last_chunk_count; - short_io = true; - } - } - OBD_FREE(pt, sizeof(*pt)); - } + } while (result == 0 && io->ci_continue); - CDEBUG(D_VFSTRACE, "return nob: %zu (%s io), rc: %d\n", - io->ci_nob, short_io ? "short" : "full", rc); + if (result == -EWOULDBLOCK && io->ci_ndelay) { + io->ci_need_restart = 1; + result = 0; + } - RETURN(rc < 0 ? rc : io->ci_result); + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); } EXPORT_SYMBOL(cl_io_loop); @@ -937,20 +811,20 @@ EXPORT_SYMBOL(cl_io_loop); * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() */ void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, - const struct cl_io_operations *ops) + struct cl_object *obj, + const struct cl_io_operations *ops) { struct list_head *linkage = &slice->cis_linkage; - LASSERT((linkage->prev == NULL && linkage->next == NULL) || + LASSERT((linkage->prev == NULL && linkage->next == NULL) || list_empty(linkage)); - ENTRY; + ENTRY; list_add_tail(linkage, &io->ci_layers); - slice->cis_io = io; - slice->cis_obj = obj; - slice->cis_iop = ops; - EXIT; + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; } EXPORT_SYMBOL(cl_io_slice_add); @@ -1145,6 +1019,7 @@ void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, cl_page_discard(env, io, page); EXIT; } +EXPORT_SYMBOL(cl_page_list_discard); /** * Initialize dual page queue. 
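/*
 * Standalone sketch for illustration (not part of the patch hunks around it):
 * with the ptask-based parallel path removed, cl_io_loop() above reduces to
 * a plain init-iteration / lock / start / end / unlock / advance cycle that
 * repeats while the lower layers set ci_continue.  The stubbed-out stages
 * below are placeholders; only the control flow is being shown.
 */
#include <stdio.h>

struct demo_io {
	size_t nob;		/* bytes transferred so far (like io->ci_nob) */
	size_t remaining;	/* bytes left in the current request          */
	int    cont;		/* analogue of io->ci_continue                 */
};

static int demo_iter_init(struct demo_io *io)	{ (void)io; return 0; }
static int demo_lock(struct demo_io *io)	{ (void)io; return 0; }
static void demo_end(struct demo_io *io)	{ (void)io; }
static void demo_unlock(struct demo_io *io)	{ (void)io; }
static void demo_iter_fini(struct demo_io *io)	{ (void)io; }

static int demo_start(struct demo_io *io)
{
	/* pretend each iteration moves at most 4096 bytes */
	size_t chunk = io->remaining > 4096 ? 4096 : io->remaining;

	io->nob += chunk;
	io->remaining -= chunk;
	io->cont = io->remaining > 0;
	return 0;
}

static int demo_io_loop(struct demo_io *io)
{
	int rc;

	do {
		io->cont = 0;
		rc = demo_iter_init(io);
		if (rc == 0) {
			rc = demo_lock(io);
			if (rc == 0) {
				rc = demo_start(io);
				demo_end(io);
				demo_unlock(io);
			}
		}
		demo_iter_fini(io);
	} while (rc == 0 && io->cont);

	return rc;
}

int main(void)
{
	struct demo_io io = { .nob = 0, .remaining = 10000, .cont = 0 };

	demo_io_loop(&io);
	printf("transferred %zu bytes\n", io.nob);	/* 10000 */
	return 0;
}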
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c index e92dbaf4fda68..30c7186651dba 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c @@ -111,7 +111,10 @@ int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, INIT_LIST_HEAD(&lock->cll_layers); list_for_each_entry(scan, &obj->co_lu.lo_header->loh_layers, co_lu.lo_linkage) { - result = scan->co_ops->coo_lock_init(env, scan, lock, io); + if (scan->co_ops->coo_lock_init != NULL) + result = scan->co_ops->coo_lock_init(env, scan, lock, + io); + if (result != 0) { cl_lock_fini(env, lock); break; @@ -167,8 +170,8 @@ EXPORT_SYMBOL(cl_lock_cancel); int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, struct cl_lock *lock, struct cl_sync_io *anchor) { - const struct cl_lock_slice *slice; - int rc = -ENOSYS; + const struct cl_lock_slice *slice; + int rc = 0; ENTRY; @@ -200,7 +203,7 @@ int cl_lock_request(const struct lu_env *env, struct cl_io *io, if (rc < 0) RETURN(rc); - if ((enq_flags & CEF_ASYNC) && !(enq_flags & CEF_AGL)) { + if ((enq_flags & CEF_GLIMPSE) && !(enq_flags & CEF_SPECULATIVE)) { anchor = &cl_env_info(env)->clt_anchor; cl_sync_io_init(anchor, 1, cl_sync_io_end); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c index ddf97fc2cf057..5aa59de91b53e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -422,6 +422,24 @@ loff_t cl_object_maxbytes(struct cl_object *obj) } EXPORT_SYMBOL(cl_object_maxbytes); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + int rc = 0; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_object_flush) { + rc = obj->co_ops->coo_object_flush(env, obj, lock); + if (rc) + break; + } + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_object_flush); + /** * Helper function removing all object locks, and marking object for * deletion. All object pages must have been deleted at this point. @@ -550,19 +568,16 @@ EXPORT_SYMBOL(cl_site_stats_print); /** * The most efficient way is to store cl_env pointer in task specific - * structures. On Linux, it wont' be easy to use task_struct->journal_info - * because Lustre code may call into other fs which has certain assumptions - * about journal_info. Currently following fields in task_struct are identified - * can be used for this purpose: - * - cl_env: for liblustre. - * - tux_info: ony on RedHat kernel. - * - ... + * structures. On Linux, it isn't easy to use task_struct->journal_info + * because Lustre code may call into other fs during memory reclaim, which + * has certain assumptions about journal_info. There are not currently any + * fields in task_struct that can be used for this purpose. * \note As long as we use task_struct to store cl_env, we assume that once * called into Lustre, we'll never call into the other part of the kernel * which will use those fields in task_struct without explicitly exiting * Lustre. 
* - * If there's no space in task_struct is available, hash will be used. + * Since there's no space in task_struct is available, hash will be used. * bz20044, bz22683. */ @@ -595,17 +610,20 @@ struct cl_env { void *ce_debug; }; +static void cl_env_inc(enum cache_stats_item item) +{ #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING -#define CL_ENV_INC(counter) atomic_inc(&cl_env_stats.cs_stats[CS_##counter]) - -#define CL_ENV_DEC(counter) do { \ - LASSERT(atomic_read(&cl_env_stats.cs_stats[CS_##counter]) > 0); \ - atomic_dec(&cl_env_stats.cs_stats[CS_##counter]); \ -} while (0) -#else -#define CL_ENV_INC(counter) -#define CL_ENV_DEC(counter) + atomic_inc(&cl_env_stats.cs_stats[item]); #endif +} + +static void cl_env_dec(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + LASSERT(atomic_read(&cl_env_stats.cs_stats[item]) > 0); + atomic_dec(&cl_env_stats.cs_stats[item]); +#endif +} static void cl_env_init0(struct cl_env *cle, void *debug) { @@ -615,7 +633,7 @@ static void cl_env_init0(struct cl_env *cle, void *debug) cle->ce_ref = 1; cle->ce_debug = debug; - CL_ENV_INC(busy); + cl_env_inc(CS_busy); } static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) @@ -645,8 +663,8 @@ static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) OBD_SLAB_FREE_PTR(cle, cl_env_kmem); env = ERR_PTR(rc); } else { - CL_ENV_INC(create); - CL_ENV_INC(total); + cl_env_inc(CS_create); + cl_env_inc(CS_total); } } else env = ERR_PTR(-ENOMEM); @@ -655,10 +673,10 @@ static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) static void cl_env_fini(struct cl_env *cle) { - CL_ENV_DEC(total); - lu_context_fini(&cle->ce_lu.le_ctx); - lu_context_fini(&cle->ce_ses); - OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + cl_env_dec(CS_total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); } static struct lu_env *cl_env_obtain(void *debug) @@ -814,15 +832,15 @@ void cl_env_put(struct lu_env *env, __u16 *refcheck) if (--cle->ce_ref == 0) { int cpu = get_cpu(); - CL_ENV_DEC(busy); - cle->ce_debug = NULL; - cl_env_exit(cle); - /* - * Don't bother to take a lock here. - * - * Return environment to the cache only when it was allocated - * with the standard tags. - */ + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. 
+ */ if (cl_envs[cpu].cec_count < cl_envs_cached_max && (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD && (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) { @@ -844,13 +862,11 @@ EXPORT_SYMBOL(cl_env_put); */ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) { - ENTRY; lvb->lvb_size = attr->cat_size; lvb->lvb_mtime = attr->cat_mtime; lvb->lvb_atime = attr->cat_atime; lvb->lvb_ctime = attr->cat_ctime; lvb->lvb_blocks = attr->cat_blocks; - EXIT; } /** @@ -860,13 +876,11 @@ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) */ void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) { - ENTRY; attr->cat_size = lvb->lvb_size; attr->cat_mtime = lvb->lvb_mtime; attr->cat_atime = lvb->lvb_atime; attr->cat_ctime = lvb->lvb_ctime; attr->cat_blocks = lvb->lvb_blocks; - EXIT; } EXPORT_SYMBOL(cl_lvb2attr); @@ -951,7 +965,7 @@ void cl_env_percpu_put(struct lu_env *env) cle->ce_ref--; LASSERT(cle->ce_ref == 0); - CL_ENV_DEC(busy); + cl_env_dec(CS_busy); cle->ce_debug = NULL; put_cpu(); @@ -1043,8 +1057,6 @@ static struct lu_kmem_descr cl_object_caches[] = { } }; -struct cfs_ptask_engine *cl_io_engine; - /** * Global initialization of cl-data. Create kmem caches, register * lu_context_key's, etc. @@ -1072,17 +1084,8 @@ int cl_global_init(void) if (result) /* no cl_env_percpu_fini on error */ GOTO(out_keys, result); - cl_io_engine = cfs_ptengine_init("clio", cpu_online_mask); - if (IS_ERR(cl_io_engine)) { - result = PTR_ERR(cl_io_engine); - cl_io_engine = NULL; - GOTO(out_percpu, result); - } - return 0; -out_percpu: - cl_env_percpu_fini(); out_keys: lu_context_key_degister(&cl_key); out_kmem: @@ -1098,8 +1101,6 @@ int cl_global_init(void) */ void cl_global_fini(void) { - cfs_ptengine_fini(cl_io_engine); - cl_io_engine = NULL; cl_env_percpu_fini(); lu_context_key_degister(&cl_key); lu_kmem_fini(cl_object_caches); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c index 74f9225ec1d59..a1b1e130f31c6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,21 +74,37 @@ static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); #endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ /* Disable page statistic by default due to huge performance penalty. 
*/ +static void cs_page_inc(const struct cl_object *obj, + enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#endif +} + +static void cs_page_dec(const struct cl_object *obj, + enum cache_stats_item item) +{ #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING -#define CS_PAGE_INC(o, item) \ - atomic_inc(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) -#define CS_PAGE_DEC(o, item) \ - atomic_dec(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) -#define CS_PAGESTATE_INC(o, state) \ - atomic_inc(&cl_object_site(o)->cs_pages_state[state]) -#define CS_PAGESTATE_DEC(o, state) \ - atomic_dec(&cl_object_site(o)->cs_pages_state[state]) -#else -#define CS_PAGE_INC(o, item) -#define CS_PAGE_DEC(o, item) -#define CS_PAGESTATE_INC(o, state) -#define CS_PAGESTATE_DEC(o, state) + atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]); #endif +} + +static void cs_pagestate_inc(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} + +static void cs_pagestate_dec(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_dec(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} /** * Internal version of cl_page_get(). @@ -126,7 +142,8 @@ cl_page_at_trusted(const struct cl_page *page, RETURN(NULL); } -static void cl_page_free(const struct lu_env *env, struct cl_page *page) +static void cl_page_free(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) { struct cl_object *obj = page->cp_obj; int pagesize = cl_object_header(obj)->coh_page_bufsize; @@ -143,10 +160,10 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page) struct cl_page_slice, cpl_linkage); list_del_init(page->cp_layers.next); if (unlikely(slice->cpl_ops->cpo_fini != NULL)) - slice->cpl_ops->cpo_fini(env, slice); + slice->cpl_ops->cpo_fini(env, slice, pvec); } - CS_PAGE_DEC(obj, total); - CS_PAGESTATE_DEC(obj, page->cp_state); + cs_page_dec(obj, CS_total); + cs_pagestate_dec(obj, page->cp_state); lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); cl_object_put(env, obj); lu_ref_fini(&page->cp_reference); @@ -196,16 +213,16 @@ struct cl_page *cl_page_alloc(const struct lu_env *env, ind); if (result != 0) { cl_page_delete0(env, page); - cl_page_free(env, page); + cl_page_free(env, page, NULL); page = ERR_PTR(result); break; } } } if (result == 0) { - CS_PAGE_INC(o, total); - CS_PAGE_INC(o, create); - CS_PAGESTATE_DEC(o, CPS_CACHED); + cs_page_inc(o, CS_total); + cs_page_inc(o, CS_create); + cs_pagestate_dec(o, CPS_CACHED); } } else { page = ERR_PTR(-ENOMEM); @@ -238,7 +255,7 @@ struct cl_page *cl_page_find(const struct lu_env *env, ENTRY; hdr = cl_object_header(o); - CS_PAGE_INC(o, lookup); + cs_page_inc(o, CS_lookup); CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); @@ -258,7 +275,7 @@ struct cl_page *cl_page_find(const struct lu_env *env, */ page = cl_vmpage_page(vmpage, o); if (page != NULL) { - CS_PAGE_INC(o, hit); + cs_page_inc(o, CS_hit); RETURN(page); } } @@ -328,8 +345,8 @@ static void cl_page_state_set0(const struct lu_env *env, PASSERT(env, page, page->cp_state == old); PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner != NULL)); - CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); - CS_PAGESTATE_INC(page->cp_obj, state); + cs_pagestate_dec(page->cp_obj, page->cp_state); + 
cs_pagestate_inc(page->cp_obj, state); cl_page_state_set_trust(page, state); EXIT; } @@ -357,15 +374,13 @@ void cl_page_get(struct cl_page *page) EXPORT_SYMBOL(cl_page_get); /** - * Releases a reference to a page. + * Releases a reference to a page, use the pagevec to release the pages + * in batch if provided. * - * When last reference is released, page is returned to the cache, unless it - * is in cl_page_state::CPS_FREEING state, in which case it is immediately - * destroyed. - * - * \see cl_object_put(), cl_lock_put(). + * Users need to do a final pagevec_release() to release any trailing pages. */ -void cl_page_put(const struct lu_env *env, struct cl_page *page) +void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) { ENTRY; CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", @@ -381,11 +396,26 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page) * Page is no longer reachable by other threads. Tear * it down. */ - cl_page_free(env, page); + cl_page_free(env, page, pvec); } EXIT; } +EXPORT_SYMBOL(cl_pagevec_put); + +/** + * Releases a reference to a page, wrapper to cl_pagevec_put + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + cl_pagevec_put(env, page, NULL); +} EXPORT_SYMBOL(cl_page_put); /** @@ -788,6 +818,22 @@ int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) } EXPORT_SYMBOL(cl_page_is_vmlocked); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to) +{ + const struct cl_page_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_page_touch != NULL) + (*slice->cpl_ops->cpo_page_touch)(env, slice, to); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_touch); + static enum cl_page_state cl_req_type_state(enum cl_req_type crt) { ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c index b6576eb9b52e0..3cf9b86b2835a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,17 +42,16 @@ #include #include -#include +#include #include #include #include -#include #include #ifdef HAVE_SERVER_SUPPORT # include # include #endif /* HAVE_SERVER_SUPPORT */ -#include +#include #include "llog_internal.h" #ifdef CONFIG_PROC_FS @@ -70,6 +69,8 @@ unsigned int obd_dump_on_timeout; EXPORT_SYMBOL(obd_dump_on_timeout); unsigned int obd_dump_on_eviction; EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_lbug_on_eviction; +EXPORT_SYMBOL(obd_lbug_on_eviction); unsigned long obd_max_dirty_pages; EXPORT_SYMBOL(obd_max_dirty_pages); atomic_long_t obd_dirty_pages; @@ -97,92 +98,11 @@ EXPORT_SYMBOL(at_early_margin); int at_extra = 30; EXPORT_SYMBOL(at_extra); -atomic_long_t obd_dirty_transit_pages; -EXPORT_SYMBOL(obd_dirty_transit_pages); - -char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; - #ifdef CONFIG_PROC_FS struct lprocfs_stats *obd_memory = NULL; EXPORT_SYMBOL(obd_memory); #endif -char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; - -/* Get jobid of current process by reading the environment variable - * stored in between the "env_start" & "env_end" of task struct. - * - * TODO: - * It's better to cache the jobid for later use if there is any - * efficient way, the cl_env code probably could be reused for this - * purpose. - * - * If some job scheduler doesn't store jobid in the "env_start/end", - * then an upcall could be issued here to get the jobid by utilizing - * the userspace tools/api. Then, the jobid must be cached. - */ -int lustre_get_jobid(char *jobid) -{ - int jobid_len = LUSTRE_JOBID_SIZE; - char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 }; - int rc = 0; - ENTRY; - - /* Jobstats isn't enabled */ - if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) - GOTO(out, rc = 0); - - /* Whole node dedicated to single job */ - if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { - memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); - GOTO(out, rc = 0); - } - - /* Use process name + fsuid as jobid */ - if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", - current_comm(), - from_kuid(&init_user_ns, current_fsuid())); - GOTO(out, rc = 0); - } - - rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len); - if (rc) { - if (rc == -EOVERFLOW) { - /* For the PBS_JOBID and LOADL_STEP_ID keys (which are - * variable length strings instead of just numbers), it - * might make sense to keep the unique parts for JobID, - * instead of just returning an error. That means a - * larger temp buffer for cfs_get_environ(), then - * truncating the string at some separator to fit into - * the specified jobid_len. Fix later if needed. */ - static bool printed; - if (unlikely(!printed)) { - LCONSOLE_ERROR_MSG(0x16b, "%s value too large " - "for JobID buffer (%d)\n", - obd_jobid_var, jobid_len); - printed = true; - } - } else { - CDEBUG((rc == -ENOENT || rc == -EINVAL || - rc == -EDEADLK) ? D_INFO : D_ERROR, - "Get jobid for (%s) failed: rc = %d\n", - obd_jobid_var, rc); - } - } - -out: - if (rc != 0) - RETURN(rc); - - /* Only replace the job ID if it changed. 
*/ - if (strcmp(jobid, tmp_jobid) != 0) - memcpy(jobid, tmp_jobid, jobid_len); - - RETURN(0); -} -EXPORT_SYMBOL(lustre_get_jobid); - static int class_resolve_dev_name(__u32 len, const char *name) { int rc; @@ -212,6 +132,159 @@ static int class_resolve_dev_name(__u32 len, const char *name) RETURN(rc); } +#define OBD_MAX_IOCTL_BUFFER 8192 + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > BIT(30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + + ENTRY; + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. 
LU-66 + */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (!*buf) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + int class_handle_ioctl(unsigned int cmd, unsigned long arg) { char *buf = NULL; @@ -427,8 +500,57 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) RETURN(err); } /* class_handle_ioctl */ -#define OBD_INIT_CHECK -#ifdef OBD_INIT_CHECK +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + ENTRY; + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + static int obd_init_checks(void) { __u64 u64val, div64val; @@ -494,9 +616,6 @@ static int obd_init_checks(void) return ret; } -#else -#define obd_init_checks() do {} while(0) -#endif static int __init obdclass_init(void) { @@ -613,7 +732,6 @@ static int __init obdclass_init(void) lu_global_fini(); cleanup_class_procfs: - obd_sysctl_clean(); class_procfs_clean(); cleanup_caches: @@ -683,7 +801,6 @@ static void __exit obdclass_exit(void) lu_global_fini(); obd_cleanup_caches(); - obd_sysctl_clean(); class_procfs_clean(); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c index a48e7cbe7ec18..68952df7e1242 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #define DEBUG_SUBSYSTEM S_CLASS #include -#include +#include #include /* fid_be_to_cpu() */ #include @@ -53,12 +53,13 @@ LU_KEY_INIT(dt_global, struct dt_thread_info); LU_KEY_FINI(dt_global, struct dt_thread_info); struct lu_context_key dt_key = { - .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, - .lct_init = dt_global_key_init, - .lct_fini = dt_global_key_fini + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini }; -/* no lock is necessary to protect the list, because call-backs +/* + * no lock is necessary to protect the list, because call-backs * are added during system startup. Please refer to "struct dt_device". */ void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) @@ -74,7 +75,7 @@ void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) EXPORT_SYMBOL(dt_txn_callback_del); int dt_txn_hook_start(const struct lu_env *env, - struct dt_device *dev, struct thandle *th) + struct dt_device *dev, struct thandle *th) { int rc = 0; struct dt_txn_callback *cb; @@ -89,9 +90,11 @@ int dt_txn_hook_start(const struct lu_env *env, !(cb->dtc_tag & env->le_ctx.lc_tags)) continue; - /* Usually dt_txn_hook_start is called from bottom device, + /* + * Usually dt_txn_hook_start is called from bottom device, * and if the thandle has th_top, then we need use top - * thandle for the callback in the top thandle layer */ + * thandle for the callback in the top thandle layer + */ if (th->th_top != NULL) dtc_th = th->th_top; @@ -105,9 +108,9 @@ EXPORT_SYMBOL(dt_txn_hook_start); int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) { - struct dt_device *dev = th->th_dev; + struct dt_device *dev = th->th_dev; struct dt_txn_callback *cb; - int rc = 0; + int rc = 0; if (th->th_local) return 0; @@ -122,9 +125,11 @@ int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) !(cb->dtc_tag & env->le_ctx.lc_tags)) continue; - /* Usually dt_txn_hook_stop is called from bottom device, + /* + * Usually dt_txn_hook_stop is called from bottom device, * and if the thandle has th_top, then we need use top - * thandle for the callback in the top thandle layer */ + * thandle for the callback in the top thandle layer + */ if (th->th_top != NULL) dtc_th = th->th_top; @@ -145,53 +150,53 @@ EXPORT_SYMBOL(dt_device_init); void dt_device_fini(struct dt_device *dev) { - lu_device_fini(&dev->dd_lu_dev); + lu_device_fini(&dev->dd_lu_dev); } EXPORT_SYMBOL(dt_device_fini); int dt_object_init(struct dt_object *obj, - struct lu_object_header *h, struct lu_device *d) + struct lu_object_header *h, struct lu_device *d) { - return lu_object_init(&obj->do_lu, h, d); + return lu_object_init(&obj->do_lu, h, d); } EXPORT_SYMBOL(dt_object_init); void dt_object_fini(struct dt_object *obj) { - lu_object_fini(&obj->do_lu); + lu_object_fini(&obj->do_lu); } EXPORT_SYMBOL(dt_object_fini); int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) { - if (obj->do_index_ops == NULL) - obj->do_ops->do_index_try(env, obj, &dt_directory_features); - return obj->do_index_ops != NULL; + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; } EXPORT_SYMBOL(dt_try_as_dir); enum dt_format_type dt_mode_to_dft(__u32 mode) { - enum dt_format_type result; - - switch (mode & S_IFMT) { - case S_IFDIR: - result = DFT_DIR; - break; - case 
S_IFREG: - result = DFT_REGULAR; - break; - case S_IFLNK: - result = DFT_SYM; - break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - result = DFT_NODE; - break; - default: + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: LASSERTF(0, "invalid mode %o\n", mode); result = 0; /* Just for satisfying compiler. */ break; @@ -214,8 +219,10 @@ int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, } EXPORT_SYMBOL(dt_lookup_dir); -/* this differs from dt_locate by top_dev as parameter - * but not one from lu_site */ +/* + * this differs from dt_locate by top_dev as parameter + * but not one from lu_site + */ struct dt_object *dt_locate_at(const struct lu_env *env, struct dt_device *dev, const struct lu_fid *fid, @@ -236,6 +243,7 @@ struct dt_object *dt_locate_at(const struct lu_env *env, return container_of0(n, struct dt_object, do_lu); } + lu_object_put(env, lo); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(dt_locate_at); @@ -269,28 +277,28 @@ static int dt_find_entry(const struct lu_env *env, const char *entry, * path component to \a entry_func. */ int dt_path_parser(const struct lu_env *env, - char *path, dt_entry_func_t entry_func, - void *data) + char *path, dt_entry_func_t entry_func, + void *data) { - char *e; - int rc = 0; - - while (1) { - e = strsep(&path, "/"); - if (e == NULL) - break; - - if (e[0] == 0) { - if (!path || path[0] == '\0') - break; - continue; - } - rc = entry_func(env, e, data); - if (rc) - break; - } - - return rc; + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; } struct dt_object * @@ -298,51 +306,50 @@ dt_store_resolve(const struct lu_env *env, struct dt_device *dt, const char *path, struct lu_fid *fid) { struct dt_thread_info *info = dt_info(env); - struct dt_find_hint *dfh = &info->dti_dfh; - struct dt_object *obj; - int result; + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; - dfh->dfh_dt = dt; - dfh->dfh_fid = fid; + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); - result = dt->dd_ops->dt_root_get(env, dt, fid); - if (result == 0) { - obj = dt_locate(env, dt, fid); - if (!IS_ERR(obj)) { - dfh->dfh_o = obj; + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; result = dt_path_parser(env, info->dti_buf, dt_find_entry, dfh); - if (result != 0) - obj = ERR_PTR(result); - else - obj = dfh->dfh_o; - } - } else { - obj = ERR_PTR(result); - } - return obj; + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; } static struct dt_object *dt_reg_open(const struct lu_env *env, - struct dt_device *dt, - struct dt_object *p, - const char *name, - struct lu_fid *fid) + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) { - struct dt_object *o; - int result; + struct dt_object *o; + int result; - result = dt_lookup_dir(env, p, name, fid); - if (result == 0){ - o = dt_locate(env, dt, fid); - } - else - o = ERR_PTR(result); + result 
= dt_lookup_dir(env, p, name, fid); + if (result == 0) + o = dt_locate(env, dt, fid); + else + o = ERR_PTR(result); - return o; + return o; } /** @@ -369,47 +376,47 @@ struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, } struct dt_object *dt_find_or_create(const struct lu_env *env, - struct dt_device *dt, - const struct lu_fid *fid, - struct dt_object_format *dof, - struct lu_attr *at) + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) { - struct dt_object *dto; - struct thandle *th; - int rc; + struct dt_object *dto; + struct thandle *th; + int rc; - ENTRY; + ENTRY; - dto = dt_locate(env, dt, fid); - if (IS_ERR(dto)) - RETURN(dto); + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); - LASSERT(dto != NULL); - if (dt_object_exists(dto)) - RETURN(dto); + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); - th = dt_trans_create(env, dt); - if (IS_ERR(th)) - GOTO(out, rc = PTR_ERR(th)); + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); - rc = dt_declare_create(env, dto, at, NULL, dof, th); - if (rc) - GOTO(trans_stop, rc); + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); - rc = dt_trans_start_local(env, dt, th); - if (rc) - GOTO(trans_stop, rc); + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); - dt_write_lock(env, dto, 0); - if (dt_object_exists(dto)) - GOTO(unlock, rc = 0); + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); - CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); - rc = dt_create(env, dto, at, NULL, dof, th); - if (rc) + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) GOTO(unlock, rc); - LASSERT(dt_object_exists(dto)); + LASSERT(dt_object_exists(dto)); unlock: dt_write_unlock(env, dto); trans_stop: @@ -427,16 +434,16 @@ EXPORT_SYMBOL(dt_find_or_create); /* dt class init function. */ int dt_global_init(void) { - int result; + int result; - LU_CONTEXT_KEY_INIT(&dt_key); - result = lu_context_key_register(&dt_key); - return result; + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; } void dt_global_fini(void) { - lu_context_key_degister(&dt_key); + lu_context_key_degister(&dt_key); } /** @@ -451,7 +458,7 @@ void dt_global_fini(void) * \retval -ve errno on failure */ int dt_read(const struct lu_env *env, struct dt_object *dt, - struct lu_buf *buf, loff_t *pos) + struct lu_buf *buf, loff_t *pos) { LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); return dt->do_body_ops->dbo_read(env, dt, buf, pos); @@ -486,7 +493,7 @@ int dt_record_read(const struct lu_env *env, struct dt_object *dt, EXPORT_SYMBOL(dt_record_read); int dt_record_write(const struct lu_env *env, struct dt_object *dt, - const struct lu_buf *buf, loff_t *pos, struct thandle *th) + const struct lu_buf *buf, loff_t *pos, struct thandle *th) { ssize_t size; @@ -495,7 +502,7 @@ int dt_record_write(const struct lu_env *env, struct dt_object *dt, LASSERT(dt->do_body_ops); LASSERT(dt->do_body_ops->dbo_write); - size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, 1); + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th); if (size < 0) return size; return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; @@ -503,53 +510,53 @@ int dt_record_write(const struct lu_env *env, struct dt_object *dt, EXPORT_SYMBOL(dt_record_write); int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, - struct thandle *th) + struct thandle *th) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; - LASSERT(o); - vbuf.lb_buf = NULL; - vbuf.lb_len = sizeof(dt_obj_version_t); - return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); } EXPORT_SYMBOL(dt_declare_version_set); void dt_version_set(const struct lu_env *env, struct dt_object *o, - dt_obj_version_t version, struct thandle *th) + dt_obj_version_t version, struct thandle *th) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; - int rc; + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; - LASSERT(o); - vbuf.lb_buf = &version; - vbuf.lb_len = sizeof(version); + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); - if (rc < 0) - CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); - return; + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; } EXPORT_SYMBOL(dt_version_set); dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; - dt_obj_version_t version; - int rc; - - LASSERT(o); - vbuf.lb_buf = &version; - vbuf.lb_len = sizeof(version); + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); rc = dt_xattr_get(env, o, &vbuf, xname); - if (rc != sizeof(version)) { - CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); - version = 0; - } - return version; + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; } EXPORT_SYMBOL(dt_version_get); @@ -568,8 +575,8 @@ const struct dt_index_features dt_lfsck_layout_orphan_features = { .dif_flags = 0, .dif_keysize_min = sizeof(struct lu_fid), .dif_keysize_max = sizeof(struct lu_fid), - .dif_recsize_min = sizeof(struct lu_orphan_rec_v2), - .dif_recsize_max = sizeof(struct lu_orphan_rec_v2), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v3), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v3), .dif_ptrsize = 4 }; EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); @@ -642,8 +649,10 @@ const struct dt_index_features dt_nodemap_features = { }; EXPORT_SYMBOL(dt_nodemap_features); -/* helper function returning what dt_index_features structure should be used - * based on the FID sequence. This is used by OBD_IDX_READ RPC */ +/* + * helper function returning what dt_index_features structure should be used + * based on the FID sequence. 
This is used by OBD_IDX_READ RPC + */ static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, __u32 mode) { @@ -689,11 +698,15 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, size_t nob, const struct dt_it_ops *iops, struct dt_it *it, __u32 attr, void *arg) { - struct idx_info *ii = (struct idx_info *)arg; - struct lu_idxpage *lip = &lp->lp_idx; - char *entry; - size_t size; - int rc; + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + __u64 hash; + __u16 hashsize = 0; + __u16 keysize = 0; + __u16 recsize; + int rc; + ENTRY; if (nob < LIP_HDR_SIZE) @@ -704,20 +717,12 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, lip->lip_magic = LIP_MAGIC; nob -= LIP_HDR_SIZE; - /* compute size needed to store a key/record pair */ - size = ii->ii_recsize + ii->ii_keysize; - if ((ii->ii_flags & II_FL_NOHASH) == 0) - /* add hash if the client wants it */ - size += sizeof(__u64); + /* client wants to the 64-bit hash value associated with each record */ + if (!(ii->ii_flags & II_FL_NOHASH)) + hashsize = sizeof(hash); entry = lip->lip_entries; do { - char *tmp_entry = entry; - struct dt_key *key; - __u64 hash; - __u16 keysize; - __u16 recsize; - /* fetch 64-bit hash value */ hash = iops->store(env, it); ii->ii_hash_end = hash; @@ -727,56 +732,54 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, GOTO(out, rc = 0); } - if (nob < size) { - if (lip->lip_nr == 0) + if (!(ii->ii_flags & II_FL_NOKEY)) { + keysize = iops->key_size(env, it); + if (!(ii->ii_flags & II_FL_VARKEY) && + keysize != ii->ii_keysize) { + CERROR("keysize mismatch %hu != %hu.\n", + keysize, ii->ii_keysize); GOTO(out, rc = -EINVAL); - GOTO(out, rc = 0); - } - - if (!(ii->ii_flags & II_FL_NOHASH)) { - /* client wants to the 64-bit hash value associated with - * each record */ - memcpy(tmp_entry, &hash, sizeof(hash)); - tmp_entry += sizeof(hash); + } } - if (ii->ii_flags & II_FL_VARKEY) - keysize = iops->key_size(env, it); + /* and finally the record */ + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); else - keysize = ii->ii_keysize; + recsize = ii->ii_recsize; - if (!(ii->ii_flags & II_FL_NOKEY)) { - /* then the key value */ - key = iops->key(env, it); - memcpy(tmp_entry, key, keysize); - tmp_entry += keysize; + if (nob < hashsize + keysize + recsize) { + if (lip->lip_nr == 0) + GOTO(out, rc = -E2BIG); + GOTO(out, rc = 0); } - /* and finally the record */ - rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); - if (rc != -ESTALE) { - if (rc != 0) - GOTO(out, rc); - + rc = iops->rec(env, it, + (struct dt_rec *)(entry + hashsize + keysize), + attr); + if (!rc) { + if (hashsize) + memcpy(entry, &hash, hashsize); + if (keysize) { + struct dt_key *key; + + key = iops->key(env, it); + memcpy(entry + hashsize, key, keysize); + } /* hash/key/record successfully copied! 
*/ lip->lip_nr++; if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) ii->ii_hash_start = hash; - - if (ii->ii_flags & II_FL_VARREC) - recsize = iops->rec_size(env, it, attr); - else - recsize = ii->ii_recsize; - - entry = tmp_entry + recsize; - nob -= size; + entry += hashsize + keysize + recsize; + nob -= hashsize + keysize + recsize; + } else if (rc != -ESTALE) { + GOTO(out, rc); } /* move on to the next record */ do { rc = iops->next(env, it); } while (rc == -ESTALE); - } while (rc == 0); GOTO(out, rc); @@ -809,10 +812,10 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, const struct lu_rdpg *rdpg, dt_index_page_build_t filler, void *arg) { - struct dt_it *it; - const struct dt_it_ops *iops; - size_t pageidx, nob, nlupgs = 0; - int rc; + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; ENTRY; LASSERT(rdpg->rp_pages != NULL); @@ -853,13 +856,15 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, GOTO(out, rc); } - /* Fill containers one after the other. There might be multiple + /* + * Fill containers one after the other. There might be multiple * containers per physical page. * * At this point and across for-loop: * rc == 0 -> ok, proceed. * rc > 0 -> end of index. - * rc < 0 -> error. */ + * rc < 0 -> error. + */ for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { union lu_page *lp; int i; @@ -915,8 +920,10 @@ int dt_index_read(const struct lu_env *env, struct dt_device *dev, int rc; ENTRY; - /* rp_count shouldn't be null and should be a multiple of the container - * size */ + /* + * rp_count shouldn't be null and should be a multiple of the container + * size + */ if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) RETURN(-EFAULT); @@ -1077,3 +1084,221 @@ int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); #endif /* CONFIG_PROC_FS */ + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lu_device *lu = dt2lu_dev(dt); + + if (!lu->ld_obd) + return -ENODEV; + + return sprintf(buf, "%s\n", lu->ld_obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%u\n", (unsigned) osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} 
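The kbytestotal/kbytesfree/kbytesavail show handlers above scale the block counts returned by dt_statfs() into KiB using shifts: the block size expressed in KiB is halved repeatedly while the count is doubled. A minimal standalone sketch of that scaling, using an illustrative helper name rather than anything from the Lustre tree, and assuming a power-of-two block size of at least 1 KiB:

/*
 * Sketch only (not part of the patch): the same blocks-to-KiB scaling
 * the *_show handlers above perform, as a standalone helper.  The name
 * blocks_to_kib is illustrative, not a Lustre symbol.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t blocks_to_kib(uint32_t block_size, uint64_t nblocks)
{
	uint32_t blk_kib = block_size >> 10;	/* block size in KiB */
	uint64_t kib = nblocks;

	while (blk_kib >>= 1)			/* double once per halving */
		kib <<= 1;

	return kib;
}

int main(void)
{
	/* 4096-byte blocks: 1000 blocks == 4000 KiB */
	printf("%llu\n", (unsigned long long)blocks_to_kib(4096, 1000));
	return 0;
}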
+LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static const struct attribute *dt_def_attrs[] = { + &lustre_attr_uuid.attr, + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + NULL, +}; + +static void dt_sysfs_release(struct kobject *kobj) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + + complete(&dt->dd_kobj_unregister); +} + +int dt_tunables_fini(struct dt_device *dt) +{ + if (!dt) + return -EINVAL; + + if (!IS_ERR_OR_NULL(dt->dd_debugfs_entry)) + ldebugfs_remove(&dt->dd_debugfs_entry); + + if (dt->dd_def_attrs) + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + + kobject_put(&dt->dd_kobj); + wait_for_completion(&dt->dd_kobj_unregister); + + return 0; +} +EXPORT_SYMBOL(dt_tunables_fini); + +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list) +{ + int rc; + + dt->dd_ktype.sysfs_ops = &lustre_sysfs_ops; + dt->dd_ktype.release = dt_sysfs_release; + + init_completion(&dt->dd_kobj_unregister); + rc = kobject_init_and_add(&dt->dd_kobj, &dt->dd_ktype, type->typ_kobj, + "%s", name); + if (rc) + return rc; + + dt->dd_def_attrs = dt_def_attrs; + + rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); + if (rc) { + kobject_put(&dt->dd_kobj); + return rc; + } + + /* + * No need to register debugfs if no enteries. This allows us to + * choose between using dt_device or obd_device for debugfs. + */ + if (!list) + return rc; + + dt->dd_debugfs_entry = ldebugfs_register(name, + type->typ_debugfs_entry, + list, dt); + if (IS_ERR_OR_NULL(dt->dd_debugfs_entry)) { + rc = dt->dd_debugfs_entry ? PTR_ERR(dt->dd_debugfs_entry) + : -ENOMEM; + CERROR("%s: error %d setting up debugfs\n", + name, rc); + dt->dd_debugfs_entry = NULL; + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + kobject_put(&dt->dd_kobj); + return rc; + } + + return rc; +} +EXPORT_SYMBOL(dt_tunables_init); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c index 2c8e4db905d01..bd9330daafd8a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/genops.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,8 +38,10 @@ #define DEBUG_SUBSYSTEM S_CLASS #include -#include +#include +#include #include +#include #include #include #include @@ -50,15 +52,9 @@ DEFINE_RWLOCK(obd_dev_lock); static struct obd_device *obd_devs[MAX_OBD_DEVICES]; static struct kmem_cache *obd_device_cachep; -struct kmem_cache *obdo_cachep; -EXPORT_SYMBOL(obdo_cachep); -static struct kmem_cache *import_cachep; -static LIST_HEAD(obd_zombie_imports); -static LIST_HEAD(obd_zombie_exports); -static DEFINE_SPINLOCK(obd_zombie_impexp_lock); +static struct workqueue_struct *zombie_wq; -static void obd_zombie_impexp_notify(void); static void obd_zombie_export_add(struct obd_export *exp); static void obd_zombie_import_add(struct obd_import *imp); static void print_export_data(struct obd_export *exp, @@ -162,18 +158,57 @@ void class_put_type(struct obd_type *type) spin_unlock(&type->obd_type_lock); } +static void class_sysfs_release(struct kobject *kobj) +{ + OBD_FREE(kobj, sizeof(*kobj)); +} + +static struct kobj_type class_ktype = { + .sysfs_ops = &lustre_sysfs_ops, + .release = class_sysfs_release, +}; + +struct kobject *class_setup_tunables(const char *name) +{ + struct kobject *kobj; + int rc; + +#ifdef HAVE_SERVER_SUPPORT + kobj = kset_find_obj(lustre_kset, name); + if (kobj) + return kobj; +#endif + OBD_ALLOC(kobj, sizeof(*kobj)); + if (!kobj) + return ERR_PTR(-ENOMEM); + + kobj->kset = lustre_kset; + kobject_init(kobj, &class_ktype); + rc = kobject_add(kobj, &lustre_kset->kobj, "%s", name); + if (rc) { + kobject_put(kobj); + return ERR_PTR(rc); + } + return kobj; +} +EXPORT_SYMBOL(class_setup_tunables); + #define CLASS_MAX_NAME 1024 -int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, - bool enable_proc, struct lprocfs_vars *vars, +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, + bool enable_proc, struct ldebugfs_vars *vars, const char *name, struct lu_device_type *ldt) { - struct obd_type *type; - int rc = 0; - ENTRY; + struct obd_type *type; +#ifdef HAVE_SERVER_SUPPORT + struct qstr dname; +#endif /* HAVE_SERVER_SUPPORT */ + int rc = 0; - /* sanity check */ - LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + ENTRY; + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); if (class_search_type(name)) { CDEBUG(D_IOCTL, "Type %s already registered\n", name); @@ -205,7 +240,7 @@ int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, if (enable_proc) { type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, - vars, type); + NULL, type); if (IS_ERR(type->typ_procroot)) { rc = PTR_ERR(type->typ_procroot); type->typ_procroot = NULL; @@ -213,20 +248,57 @@ int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, } } #endif - if (ldt != NULL) { - type->typ_lu = ldt; - rc = lu_device_type_init(ldt); - if (rc != 0) - GOTO (failed, rc); - } +#ifdef HAVE_SERVER_SUPPORT + dname.name = name; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, dname.name, + dname.len); + type->typ_debugfs_entry = d_lookup(debugfs_lustre_root, &dname); + if (type->typ_debugfs_entry) { + dput(type->typ_debugfs_entry); + type->typ_sym_filter = true; + goto dir_exist; + } +#endif /* HAVE_SERVER_SUPPORT */ + + type->typ_debugfs_entry = ldebugfs_register(type->typ_name, + 
debugfs_lustre_root, + vars, type); + if (IS_ERR_OR_NULL(type->typ_debugfs_entry)) { + rc = type->typ_debugfs_entry ? PTR_ERR(type->typ_debugfs_entry) + : -ENOMEM; + type->typ_debugfs_entry = NULL; + GOTO(failed, rc); + } +#ifdef HAVE_SERVER_SUPPORT +dir_exist: +#endif + type->typ_kobj = class_setup_tunables(type->typ_name); + if (IS_ERR(type->typ_kobj)) + GOTO(failed, rc = PTR_ERR(type->typ_kobj)); + + if (ldt) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc) { + kobject_put(type->typ_kobj); + GOTO(failed, rc); + } + } spin_lock(&obd_types_lock); list_add(&type->typ_chain, &obd_types); spin_unlock(&obd_types_lock); - RETURN (0); + RETURN(0); failed: +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + type->typ_debugfs_entry = NULL; +#endif + if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) + ldebugfs_remove(&type->typ_debugfs_entry); if (type->typ_name != NULL) { #ifdef CONFIG_PROC_FS if (type->typ_procroot != NULL) @@ -262,6 +334,8 @@ int class_unregister_type(const char *name) RETURN(-EBUSY); } + kobject_put(type->typ_kobj); + /* we do not use type->typ_procroot as for compatibility purposes * other modules can share names (i.e. lod can use lov entry). so * we can't reference pointer as it can get invalided when another @@ -272,6 +346,13 @@ int class_unregister_type(const char *name) if (type->typ_procsym != NULL) lprocfs_remove(&type->typ_procsym); #endif +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + type->typ_debugfs_entry = NULL; +#endif + if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) + ldebugfs_remove(&type->typ_debugfs_entry); + if (type->typ_lu) lu_device_type_fini(type->typ_lu); @@ -291,22 +372,20 @@ EXPORT_SYMBOL(class_unregister_type); /** * Create a new obd device. * - * Find an empty slot in ::obd_devs[], create a new obd device in it. + * Allocate the new obd_device and initialize it. * * \param[in] type_name obd device type string. * \param[in] name obd device name. + * \param[in] uuid obd device UUID * - * \retval NULL if create fails, otherwise return the obd device - * pointer created. 
+ * \retval newdev pointer to created obd_device + * \retval ERR_PTR(errno) on error */ -struct obd_device *class_newdev(const char *type_name, const char *name) +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid) { - struct obd_device *result = NULL; struct obd_device *newdev; struct obd_type *type = NULL; - int i; - int new_obd_minor = 0; - bool retried = false; ENTRY; if (strlen(name) >= MAX_OBD_NAME) { @@ -321,106 +400,197 @@ struct obd_device *class_newdev(const char *type_name, const char *name) } newdev = obd_device_alloc(); - if (newdev == NULL) - GOTO(out_type, result = ERR_PTR(-ENOMEM)); - + if (newdev == NULL) { + class_put_type(type); + RETURN(ERR_PTR(-ENOMEM)); + } LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); + newdev->obd_type = type; + newdev->obd_minor = -1; + + rwlock_init(&newdev->obd_pool_lock); + newdev->obd_pool_limit = 0; + newdev->obd_pool_slv = 0; + + INIT_LIST_HEAD(&newdev->obd_exports); + INIT_LIST_HEAD(&newdev->obd_unlinked_exports); + INIT_LIST_HEAD(&newdev->obd_delayed_exports); + INIT_LIST_HEAD(&newdev->obd_exports_timed); + INIT_LIST_HEAD(&newdev->obd_nid_stats); + spin_lock_init(&newdev->obd_nid_lock); + spin_lock_init(&newdev->obd_dev_lock); + mutex_init(&newdev->obd_dev_mutex); + spin_lock_init(&newdev->obd_osfs_lock); + /* newdev->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + newdev->obd_osfs_age = ktime_get_seconds() - 1000; + + /* XXX belongs in setup not attach */ + init_rwsem(&newdev->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&newdev->obd_recovery_task_lock); + init_waitqueue_head(&newdev->obd_next_transno_waitq); + init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&newdev->obd_req_replay_queue); + INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); + INIT_LIST_HEAD(&newdev->obd_final_req_queue); + INIT_LIST_HEAD(&newdev->obd_evict_list); + INIT_LIST_HEAD(&newdev->obd_lwp_list); + + llog_group_init(&newdev->obd_olg); + /* Detach drops this */ + atomic_set(&newdev->obd_refcount, 1); + lu_ref_init(&newdev->obd_reference); + lu_ref_add(&newdev->obd_reference, "newdev", newdev); + + newdev->obd_conn_inprogress = 0; + + strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); + + CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", + newdev->obd_name, newdev); + + return newdev; +} - again: - write_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd && (strcmp(name, obd->obd_name) == 0)) { +/** + * Free obd device. + * + * \param[in] obd obd_device to be freed + * + * \retval none + */ +void class_free_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; - if (!retried) { - write_unlock(&obd_dev_lock); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " + "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERTF(atomic_read(&obd->obd_refcount) == 0, + "obd_refcount should be 0, not %d\n", + atomic_read(&obd->obd_refcount)); + LASSERT(obd_type != NULL); - /* the obd_device could be waited to be - * destroyed by the "obd_zombie_impexp_thread". 
- */ - obd_zombie_barrier(); - retried = true; - goto again; - } + CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", + obd->obd_name, obd->obd_type->typ_name); - CERROR("Device %s already exists at %d, won't add\n", - name, i); - if (result) { - LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, - "%p obd_magic %08x != %08x\n", result, - result->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(result->obd_minor == new_obd_minor, - "%p obd_minor %d != %d\n", result, - result->obd_minor, new_obd_minor); - - obd_devs[result->obd_minor] = NULL; - result->obd_name[0]='\0'; - } - result = ERR_PTR(-EEXIST); - break; - } - if (!result && !obd) { - result = newdev; - result->obd_minor = i; - new_obd_minor = i; - result->obd_type = type; - strncpy(result->obd_name, name, - sizeof(result->obd_name) - 1); - obd_devs[i] = result; - } - } - write_unlock(&obd_dev_lock); - - if (result == NULL && i >= class_devno_max()) { - CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", - class_devno_max()); - GOTO(out, result = ERR_PTR(-EOVERFLOW)); - } + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + if (obd->obd_stopping) { + int err; - if (IS_ERR(result)) - GOTO(out, result); + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } - CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", - result->obd_name, result); + obd_device_free(obd); - RETURN(result); -out: - obd_device_free(newdev); -out_type: - class_put_type(type); - return result; + class_put_type(obd_type); } -void class_release_dev(struct obd_device *obd) +/** + * Unregister obd device. + * + * Free slot in obd_dev[] used by \a obd. + * + * \param[in] new_obd obd_device to be unregistered + * + * \retval none + */ +void class_unregister_device(struct obd_device *obd) { - struct obd_type *obd_type = obd->obd_type; - - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", - obd, obd->obd_minor, obd_devs[obd->obd_minor]); - LASSERT(obd_type != NULL); + write_lock(&obd_dev_lock); + if (obd->obd_minor >= 0) { + LASSERT(obd_devs[obd->obd_minor] == obd); + obd_devs[obd->obd_minor] = NULL; + obd->obd_minor = -1; + } + write_unlock(&obd_dev_lock); +} - CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", - obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); +/** + * Register obd device. + * + * Find free slot in obd_devs[], fills it with \a new_obd. + * + * \param[in] new_obd obd_device to be registered + * + * \retval 0 success + * \retval -EEXIST device with this name is registered + * \retval -EOVERFLOW obd_devs[] is full + */ +int class_register_device(struct obd_device *new_obd) +{ + int ret = 0; + int i; + int new_obd_minor = 0; + bool minor_assign = false; + bool retried = false; +again: write_lock(&obd_dev_lock); - obd_devs[obd->obd_minor] = NULL; + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd != NULL && + (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { + + if (!retried) { + write_unlock(&obd_dev_lock); + + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". 
+ */ + obd_zombie_barrier(); + retried = true; + goto again; + } + + CERROR("%s: already exists, won't add\n", + obd->obd_name); + /* in case we found a free slot before duplicate */ + minor_assign = false; + ret = -EEXIST; + break; + } + if (!minor_assign && obd == NULL) { + new_obd_minor = i; + minor_assign = true; + } + } + + if (minor_assign) { + new_obd->obd_minor = new_obd_minor; + LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " + "%p\n", new_obd_minor, obd_devs[new_obd_minor]); + obd_devs[new_obd_minor] = new_obd; + } else { + if (ret == 0) { + ret = -EOVERFLOW; + CERROR("%s: all %u/%u devices used, increase " + "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, + i, class_devno_max(), ret); + } + } write_unlock(&obd_dev_lock); - obd_device_free(obd); - class_put_type(obd_type); + RETURN(ret); } -int class_name2dev(const char *name) +static int class_name2dev_nolock(const char *name) { int i; if (!name) return -1; - read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); @@ -429,17 +599,30 @@ int class_name2dev(const char *name) out any references */ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); if (obd->obd_attached) { - read_unlock(&obd_dev_lock); return i; } break; } } - read_unlock(&obd_dev_lock); return -1; } +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + i = class_name2dev_nolock(name); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_name2dev); + struct obd_device *class_name2obd(const char *name) { int dev = class_name2dev(name); @@ -450,25 +633,34 @@ struct obd_device *class_name2obd(const char *name) } EXPORT_SYMBOL(class_name2obd); -int class_uuid2dev(struct obd_uuid *uuid) +int class_uuid2dev_nolock(struct obd_uuid *uuid) { int i; - read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - read_unlock(&obd_dev_lock); return i; } } - read_unlock(&obd_dev_lock); return -1; } +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + i = class_uuid2dev_nolock(uuid); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_uuid2dev); + struct obd_device *class_uuid2obd(struct obd_uuid *uuid) { int dev = class_uuid2dev(uuid); @@ -506,6 +698,40 @@ struct obd_device *class_num2obd(int num) return obd; } +/** + * Find obd in obd_dev[] by name or uuid. + * + * Increment obd's refcount if found. + * + * \param[in] str obd name or uuid + * + * \retval NULL if not found + * \retval target pointer to found obd_device + */ +struct obd_device *class_dev_by_str(const char *str) +{ + struct obd_device *target = NULL; + struct obd_uuid tgtuuid; + int rc; + + obd_str2uuid(&tgtuuid, str); + + read_lock(&obd_dev_lock); + rc = class_uuid2dev_nolock(&tgtuuid); + if (rc < 0) + rc = class_name2dev_nolock(str); + + if (rc >= 0) + target = class_num2obd(rc); + + if (target != NULL) + class_incref(target, "find", current); + read_unlock(&obd_dev_lock); + + RETURN(target); +} +EXPORT_SYMBOL(class_dev_by_str); + /** * Get obd devices count. 
Device in any * state are counted @@ -675,14 +901,6 @@ void obd_cleanup_caches(void) kmem_cache_destroy(obd_device_cachep); obd_device_cachep = NULL; } - if (obdo_cachep) { - kmem_cache_destroy(obdo_cachep); - obdo_cachep = NULL; - } - if (import_cachep) { - kmem_cache_destroy(import_cachep); - import_cachep = NULL; - } EXIT; } @@ -699,19 +917,6 @@ int obd_init_caches(void) if (!obd_device_cachep) GOTO(out, rc = -ENOMEM); - LASSERT(obdo_cachep == NULL); - obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), - 0, 0, NULL); - if (!obdo_cachep) - GOTO(out, rc = -ENOMEM); - - LASSERT(import_cachep == NULL); - import_cachep = kmem_cache_create("ll_import_cache", - sizeof(struct obd_import), - 0, 0, NULL); - if (!import_cachep) - GOTO(out, rc = -ENOMEM); - RETURN(0); out: obd_cleanup_caches(); @@ -748,18 +953,6 @@ struct obd_device *class_exp2obd(struct obd_export *exp) } EXPORT_SYMBOL(class_exp2obd); -struct obd_device *class_conn2obd(struct lustre_handle *conn) -{ - struct obd_export *export; - export = class_conn2export(conn); - if (export) { - struct obd_device *obd = export->exp_obd; - class_export_put(export); - return obd; - } - return NULL; -} - struct obd_import *class_exp2cliimp(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; @@ -769,14 +962,6 @@ struct obd_import *class_exp2cliimp(struct obd_export *exp) } EXPORT_SYMBOL(class_exp2cliimp); -struct obd_import *class_conn2cliimp(struct lustre_handle *conn) -{ - struct obd_device *obd = class_conn2obd(conn); - if (obd == NULL) - return NULL; - return obd->u.cli.cl_import; -} - /* Export management functions */ static void class_export_destroy(struct obd_export *exp) { @@ -798,7 +983,10 @@ static void class_export_destroy(struct obd_export *exp) LASSERT(list_empty(&exp->exp_req_replay_queue)); LASSERT(list_empty(&exp->exp_hp_rpcs)); obd_destroy_export(exp); - class_decref(obd, "export", exp); + /* self export doesn't hold a reference to an obd, although it + * exists until freeing of the obd */ + if (exp != obd->obd_self_export) + class_decref(obd, "export", exp); OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); EXIT; @@ -831,24 +1019,46 @@ void class_export_put(struct obd_export *exp) atomic_read(&exp->exp_refcount) - 1); if (atomic_dec_and_test(&exp->exp_refcount)) { - LASSERT(!list_empty(&exp->exp_obd_chain)); - LASSERT(list_empty(&exp->exp_stale_list)); + struct obd_device *obd = exp->exp_obd; + CDEBUG(D_IOCTL, "final put %p/%s\n", exp, exp->exp_client_uuid.uuid); /* release nid stat refererence */ lprocfs_exp_cleanup(exp); - obd_zombie_export_add(exp); + if (exp == obd->obd_self_export) { + /* self export should be destroyed without + * zombie thread as it doesn't hold a + * reference to obd and doesn't hold any + * resources */ + class_export_destroy(exp); + /* self export is destroyed, no class + * references exist and it is safe to free + * obd */ + class_free_dev(obd); + } else { + LASSERT(!list_empty(&exp->exp_obd_chain)); + obd_zombie_export_add(exp); + } + } } EXPORT_SYMBOL(class_export_put); +static void obd_zombie_exp_cull(struct work_struct *ws) +{ + struct obd_export *export; + + export = container_of(ws, struct obd_export, exp_zombie_work); + class_export_destroy(export); +} + /* Creates a new export, adds it to the hash table, and returns a * pointer to it. The refcount is 2: one for the hash reference, and * one for the pointer returned by this function. 
*/ -struct obd_export *class_new_export(struct obd_device *obd, - struct obd_uuid *cluuid) +struct obd_export *__class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid, bool is_self) { struct obd_export *export; struct cfs_hash *hash = NULL; @@ -862,6 +1072,7 @@ struct obd_export *class_new_export(struct obd_device *obd, export->exp_conn_cnt = 0; export->exp_lock_hash = NULL; export->exp_flock_hash = NULL; + /* 2 = class_handle_hash + last */ atomic_set(&export->exp_refcount, 2); atomic_set(&export->exp_rpc_count, 0); atomic_set(&export->exp_cb_count, 0); @@ -876,11 +1087,11 @@ struct obd_export *class_new_export(struct obd_device *obd, spin_lock_init(&export->exp_uncommitted_replies_lock); INIT_LIST_HEAD(&export->exp_uncommitted_replies); INIT_LIST_HEAD(&export->exp_req_replay_queue); - INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD_RCU(&export->exp_handle.h_link); INIT_LIST_HEAD(&export->exp_hp_rpcs); INIT_LIST_HEAD(&export->exp_reg_rpcs); class_handle_hash(&export->exp_handle, &export_handle_ops); - export->exp_last_request_time = cfs_time_current_sec(); + export->exp_last_request_time = ktime_get_real_seconds(); spin_lock_init(&export->exp_lock); spin_lock_init(&export->exp_rpc_lock); INIT_HLIST_NODE(&export->exp_uuid_hash); @@ -889,23 +1100,24 @@ struct obd_export *class_new_export(struct obd_device *obd, spin_lock_init(&export->exp_bl_list_lock); INIT_LIST_HEAD(&export->exp_bl_list); INIT_LIST_HEAD(&export->exp_stale_list); + INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); export->exp_sp_peer = LUSTRE_SP_ANY; export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; export->exp_client_uuid = *cluuid; obd_init_export(export); - spin_lock(&obd->obd_dev_lock); - /* shouldn't happen, but might race */ - if (obd->obd_stopping) - GOTO(exit_unlock, rc = -ENODEV); + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); - hash = cfs_hash_getref(obd->obd_uuid_hash); - if (hash == NULL) - GOTO(exit_unlock, rc = -ENODEV); - spin_unlock(&obd->obd_dev_lock); + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); - if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); if (rc != 0) { LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", @@ -917,17 +1129,24 @@ struct obd_export *class_new_export(struct obd_device *obd, at_init(&export->exp_bl_lock_at, obd_timeout, 0); spin_lock(&obd->obd_dev_lock); if (obd->obd_stopping) { - cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); - GOTO(exit_unlock, rc = -ENODEV); + if (hash) + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ESHUTDOWN); } - class_incref(obd, "export", export); - list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); - list_add_tail(&export->exp_obd_chain_timed, - &export->exp_obd->obd_exports_timed); - export->exp_obd->obd_num_exports++; + if (!is_self) { + class_incref(obd, "export", export); + list_add_tail(&export->exp_obd_chain_timed, + &obd->obd_exports_timed); + list_add(&export->exp_obd_chain, &obd->obd_exports); + obd->obd_num_exports++; + } else { + INIT_LIST_HEAD(&export->exp_obd_chain_timed); + INIT_LIST_HEAD(&export->exp_obd_chain); + } spin_unlock(&obd->obd_dev_lock); - cfs_hash_putref(hash); + if (hash) + cfs_hash_putref(hash); RETURN(export); exit_unlock: @@ -941,12 +1160,29 @@ struct 
obd_export *class_new_export(struct obd_device *obd, OBD_FREE_PTR(export); return ERR_PTR(rc); } + +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, false); +} EXPORT_SYMBOL(class_new_export); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, true); +} + void class_unlink_export(struct obd_export *exp) { class_handle_unhash(&exp->exp_handle); + if (exp->exp_obd->obd_self_export == exp) { + class_export_put(exp); + return; + } + spin_lock(&exp->exp_obd->obd_dev_lock); /* delete an uuid-export hashitem from hashtables */ if (!hlist_unhashed(&exp->exp_uuid_hash)) @@ -981,7 +1217,7 @@ void class_unlink_export(struct obd_export *exp) EXPORT_SYMBOL(class_unlink_export); /* Import management functions */ -static void class_import_destroy(struct obd_import *imp) +static void obd_zombie_import_free(struct obd_import *imp) { ENTRY; @@ -1003,21 +1239,13 @@ static void class_import_destroy(struct obd_import *imp) } LASSERT(imp->imp_sec == NULL); + LASSERTF(atomic_read(&imp->imp_reqs) == 0, "%s: imp_reqs = %d\n", + imp->imp_obd->obd_name, atomic_read(&imp->imp_reqs)); class_decref(imp->imp_obd, "import", imp); - OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); - EXIT; -} - -static void import_handle_addref(void *import) -{ - class_import_get(import); + OBD_FREE_PTR(imp); + EXIT; } -static struct portals_handle_ops import_handle_ops = { - .hop_addref = import_handle_addref, - .hop_free = NULL, -}; - struct obd_import *class_import_get(struct obd_import *import) { atomic_inc(&import->imp_refcount); @@ -1032,7 +1260,6 @@ void class_import_put(struct obd_import *imp) { ENTRY; - LASSERT(list_empty(&imp->imp_zombie_chain)); LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, @@ -1044,8 +1271,6 @@ void class_import_put(struct obd_import *imp) obd_zombie_import_add(imp); } - /* catch possible import put race */ - LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); EXIT; } EXPORT_SYMBOL(class_import_put); @@ -1062,6 +1287,14 @@ static void init_imp_at(struct imp_at *at) { } } +static void obd_zombie_imp_cull(struct work_struct *ws) +{ + struct obd_import *import; + + import = container_of(ws, struct obd_import, imp_zombie_work); + obd_zombie_import_free(import); +} + struct obd_import *class_new_import(struct obd_device *obd) { struct obd_import *imp; @@ -1072,7 +1305,6 @@ struct obd_import *class_new_import(struct obd_device *obd) return NULL; INIT_LIST_HEAD(&imp->imp_pinger_chain); - INIT_LIST_HEAD(&imp->imp_zombie_chain); INIT_LIST_HEAD(&imp->imp_replay_list); INIT_LIST_HEAD(&imp->imp_sending_list); INIT_LIST_HEAD(&imp->imp_delayed_list); @@ -1086,20 +1318,21 @@ struct obd_import *class_new_import(struct obd_device *obd) imp->imp_obd = class_incref(obd, "import", imp); mutex_init(&imp->imp_sec_mutex); init_waitqueue_head(&imp->imp_recovery_waitq); + INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); - if (curr_pid_ns->child_reaper) + if (curr_pid_ns && curr_pid_ns->child_reaper) imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; else imp->imp_sec_refpid = 1; atomic_set(&imp->imp_refcount, 2); atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_reqs, 0); atomic_set(&imp->imp_inflight, 0); atomic_set(&imp->imp_replay_inflight, 0); + init_waitqueue_head(&imp->imp_replay_waitq); atomic_set(&imp->imp_inval_count, 0); INIT_LIST_HEAD(&imp->imp_conn_list); - 
INIT_LIST_HEAD(&imp->imp_handle.h_link); - class_handle_hash(&imp->imp_handle, &import_handle_ops); init_imp_at(&imp->imp_at); /* the default magic is V2, will be used in connect RPC, and @@ -1115,8 +1348,6 @@ void class_destroy_import(struct obd_import *import) LASSERT(import != NULL); LASSERT(import != LP_POISON); - class_handle_unhash(&import->imp_handle); - spin_lock(&import->imp_lock); import->imp_generation++; spin_unlock(&import->imp_lock); @@ -1329,7 +1560,7 @@ static void class_disconnect_export_list(struct list_head *list, class_export_get(exp); CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " - "last request at %ld\n", + "last request at %lld\n", exp->exp_obd->obd_name, obd_export_nid2str(exp), exp, exp->exp_last_request_time); /* release one export reference anyway */ @@ -1399,13 +1630,12 @@ void class_disconnect_stale_exports(struct obd_device *obd, spin_unlock(&exp->exp_lock); list_move(&exp->exp_obd_chain, &work_list); - evicted++; - CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", - obd->obd_name, exp->exp_client_uuid.uuid, - exp->exp_connection == NULL ? "" : - libcfs_nid2str(exp->exp_connection->c_peer.nid)); - print_export_data(exp, "EVICTING", 0, D_HA); - } + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp)); + print_export_data(exp, "EVICTING", 0, D_HA); + } spin_unlock(&obd->obd_dev_lock); if (evicted) @@ -1456,15 +1686,6 @@ void class_fail_export(struct obd_export *exp) } EXPORT_SYMBOL(class_fail_export); -char *obd_export_nid2str(struct obd_export *exp) -{ - if (exp->exp_connection != NULL) - return libcfs_nid2str(exp->exp_connection->c_peer.nid); - - return "(no nid)"; -} -EXPORT_SYMBOL(obd_export_nid2str); - int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) { struct cfs_hash *nid_hash; @@ -1602,10 +1823,6 @@ void dump_exports(struct obd_device *obd, int locks, int debug_level) list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) print_export_data(exp, "DELAYED", locks, debug_level); spin_unlock(&obd->obd_dev_lock); - spin_lock(&obd_zombie_impexp_lock); - list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) - print_export_data(exp, "ZOMBIE", locks, debug_level); - spin_unlock(&obd_zombie_impexp_lock); } void obd_exports_barrier(struct obd_device *obd) @@ -1632,83 +1849,6 @@ void obd_exports_barrier(struct obd_device *obd) } EXPORT_SYMBOL(obd_exports_barrier); -/* Total amount of zombies to be destroyed */ -static int zombies_count = 0; - -/** - * kill zombie imports and exports - */ -void obd_zombie_impexp_cull(void) -{ - struct obd_import *import; - struct obd_export *export; - ENTRY; - - do { - spin_lock(&obd_zombie_impexp_lock); - - import = NULL; - if (!list_empty(&obd_zombie_imports)) { - import = list_entry(obd_zombie_imports.next, - struct obd_import, - imp_zombie_chain); - list_del_init(&import->imp_zombie_chain); - } - - export = NULL; - if (!list_empty(&obd_zombie_exports)) { - export = list_entry(obd_zombie_exports.next, - struct obd_export, - exp_obd_chain); - list_del_init(&export->exp_obd_chain); - } - - spin_unlock(&obd_zombie_impexp_lock); - - if (import != NULL) { - class_import_destroy(import); - spin_lock(&obd_zombie_impexp_lock); - zombies_count--; - spin_unlock(&obd_zombie_impexp_lock); - } - - if (export != NULL) { - class_export_destroy(export); - spin_lock(&obd_zombie_impexp_lock); - zombies_count--; - spin_unlock(&obd_zombie_impexp_lock); - } - - cond_resched(); - } while (import != NULL || export != NULL); - 
EXIT; -} - -static DECLARE_COMPLETION(obd_zombie_start); -static DECLARE_COMPLETION(obd_zombie_stop); -static unsigned long obd_zombie_flags; -static DECLARE_WAIT_QUEUE_HEAD(obd_zombie_waitq); -static pid_t obd_zombie_pid; - -enum { - OBD_ZOMBIE_STOP = 0x0001, -}; - -/** - * check for work for kill zombie import/export thread. - */ -static int obd_zombie_impexp_check(void *arg) -{ - int rc; - - spin_lock(&obd_zombie_impexp_lock); - rc = (zombies_count == 0) && - !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); - spin_unlock(&obd_zombie_impexp_lock); - - RETURN(rc); -} - /** * Add export to the obd_zombe thread and notify it. */ @@ -1718,12 +1858,8 @@ static void obd_zombie_export_add(struct obd_export *exp) { LASSERT(!list_empty(&exp->exp_obd_chain)); list_del_init(&exp->exp_obd_chain); spin_unlock(&exp->exp_obd->obd_dev_lock); - spin_lock(&obd_zombie_impexp_lock); - zombies_count++; - list_add(&exp->exp_obd_chain, &obd_zombie_exports); - spin_unlock(&obd_zombie_impexp_lock); - obd_zombie_impexp_notify(); + queue_work(zombie_wq, &exp->exp_zombie_work); } /** @@ -1731,40 +1867,8 @@ static void obd_zombie_export_add(struct obd_export *exp) { */ static void obd_zombie_import_add(struct obd_import *imp) { LASSERT(imp->imp_sec == NULL); - spin_lock(&obd_zombie_impexp_lock); - LASSERT(list_empty(&imp->imp_zombie_chain)); - zombies_count++; - list_add(&imp->imp_zombie_chain, &obd_zombie_imports); - spin_unlock(&obd_zombie_impexp_lock); - obd_zombie_impexp_notify(); -} - -/** - * notify import/export destroy thread about new zombie. - */ -static void obd_zombie_impexp_notify(void) -{ - /* - * Make sure obd_zomebie_impexp_thread get this notification. - * It is possible this signal only get by obd_zombie_barrier, and - * barrier gulps this notification and sleeps away and hangs ensues - */ - wake_up_all(&obd_zombie_waitq); -} - -/** - * check whether obd_zombie is idle - */ -static int obd_zombie_is_idle(void) -{ - int rc; - - LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); - spin_lock(&obd_zombie_impexp_lock); - rc = (zombies_count == 0); - spin_unlock(&obd_zombie_impexp_lock); - return rc; + queue_work(zombie_wq, &imp->imp_zombie_work); } /** @@ -1772,12 +1876,7 @@ static int obd_zombie_is_idle(void) */ void obd_zombie_barrier(void) { - struct l_wait_info lwi = { 0 }; - - if (obd_zombie_pid == current_pid()) - /* don't wait for myself */ - return; - l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); + flush_workqueue(zombie_wq); } EXPORT_SYMBOL(obd_zombie_barrier); @@ -1852,58 +1951,24 @@ void obd_stale_export_adjust(struct obd_export *exp) } EXPORT_SYMBOL(obd_stale_export_adjust); -/** - * destroy zombie export/import thread. - */ -static int obd_zombie_impexp_thread(void *unused) -{ - unshare_fs_struct(); - complete(&obd_zombie_start); - - obd_zombie_pid = current_pid(); - - while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { - struct l_wait_info lwi = { 0 }; - - l_wait_event(obd_zombie_waitq, - !obd_zombie_impexp_check(NULL), &lwi); - obd_zombie_impexp_cull(); - - /* - * Notify obd_zombie_barrier callers that queues - * may be empty. 
- */ - wake_up(&obd_zombie_waitq); - } - - complete(&obd_zombie_stop); - - RETURN(0); -} - - /** * start destroy zombie import/export thread */ int obd_zombie_impexp_init(void) { - struct task_struct *task; - - task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); - if (IS_ERR(task)) - RETURN(PTR_ERR(task)); + zombie_wq = alloc_workqueue("obd_zombid", 0, 0); + if (!zombie_wq) + return -ENOMEM; - wait_for_completion(&obd_zombie_start); - RETURN(0); + return 0; } + /** * stop destroy zombie import/export thread */ void obd_zombie_impexp_stop(void) { - set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); - obd_zombie_impexp_notify(); - wait_for_completion(&obd_zombie_stop); + destroy_workqueue(zombie_wq); LASSERT(list_empty(&obd_stale_exports)); } @@ -1989,14 +2054,14 @@ int obd_get_request_slot(struct client_obd *cli) int rc; spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) { - cli->cl_r_in_flight++; + if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_rpcs_in_flight++; spin_unlock(&cli->cl_loi_list_lock); return 0; } init_waitqueue_head(&orsw.orsw_waitq); - list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list); + list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters); orsw.orsw_signaled = false; spin_unlock(&cli->cl_loi_list_lock); @@ -2012,7 +2077,7 @@ int obd_get_request_slot(struct client_obd *cli) if (rc != 0) { if (!orsw.orsw_signaled) { if (list_empty(&orsw.orsw_entry)) - cli->cl_r_in_flight--; + cli->cl_rpcs_in_flight--; else list_del(&orsw.orsw_entry); } @@ -2034,15 +2099,15 @@ void obd_put_request_slot(struct client_obd *cli) struct obd_request_slot_waiter *orsw; spin_lock(&cli->cl_loi_list_lock); - cli->cl_r_in_flight--; + cli->cl_rpcs_in_flight--; /* If there is free slot, wakeup the first waiter. */ - if (!list_empty(&cli->cl_loi_read_list) && - likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) { - orsw = list_entry(cli->cl_loi_read_list.next, + if (!list_empty(&cli->cl_flight_waiters) && + likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_entry(cli->cl_flight_waiters.next, struct obd_request_slot_waiter, orsw_entry); list_del_init(&orsw->orsw_entry); - cli->cl_r_in_flight++; + cli->cl_rpcs_in_flight++; wake_up(&orsw->orsw_waitq); } spin_unlock(&cli->cl_loi_list_lock); @@ -2061,20 +2126,21 @@ int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) __u32 old; int diff; int i; - char *typ_name; int rc; if (max > OBD_MAX_RIF_MAX || max < 1) return -ERANGE; - typ_name = cli->cl_import->imp_obd->obd_type->typ_name; - if (strcmp(typ_name, LUSTRE_MDC_NAME) == 0) { + CDEBUG(D_INFO, "%s: max = %hu max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, + cli->cl_max_mod_rpcs_in_flight, cli->cl_max_rpcs_in_flight); + + if (strcmp(cli->cl_import->imp_obd->obd_type->typ_name, + LUSTRE_MDC_NAME) == 0) { /* adjust max_mod_rpcs_in_flight to ensure it is always * strictly lower that max_rpcs_in_flight */ if (max < 2) { - CERROR("%s: cannot set max_rpcs_in_flight to 1 " - "because it must be higher than " - "max_mod_rpcs_in_flight value", + CERROR("%s: cannot set mdc.*.max_rpcs_in_flight=1\n", cli->cl_import->imp_obd->obd_name); return -ERANGE; } @@ -2088,17 +2154,19 @@ int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) spin_lock(&cli->cl_loi_list_lock); old = cli->cl_max_rpcs_in_flight; cli->cl_max_rpcs_in_flight = max; + client_adjust_max_dirty(cli); + diff = max - old; /* We increase the max_rpcs_in_flight, then wakeup some waiters. 
*/ for (i = 0; i < diff; i++) { - if (list_empty(&cli->cl_loi_read_list)) + if (list_empty(&cli->cl_flight_waiters)) break; - orsw = list_entry(cli->cl_loi_read_list.next, + orsw = list_entry(cli->cl_flight_waiters.next, struct obd_request_slot_waiter, orsw_entry); list_del_init(&orsw->orsw_entry); - cli->cl_r_in_flight++; + cli->cl_rpcs_in_flight++; wake_up(&orsw->orsw_waitq); } spin_unlock(&cli->cl_loi_list_lock); @@ -2115,32 +2183,50 @@ EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) { - struct obd_connect_data *ocd; + struct obd_connect_data *ocd; __u16 maxmodrpcs; __u16 prev; if (max > OBD_MAX_RIF_MAX || max < 1) return -ERANGE; - /* cannot exceed or equal max_rpcs_in_flight */ + ocd = &cli->cl_import->imp_connect_data; + CDEBUG(D_INFO, "%s: max = %hu flags = %llx, max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, ocd->ocd_connect_flags, + ocd->ocd_maxmodrpcs, cli->cl_max_rpcs_in_flight); + + if (max == OBD_MAX_RIF_MAX) + max = OBD_MAX_RIF_MAX - 1; + + /* Cannot exceed or equal max_rpcs_in_flight. If we are asked to + * increase this value, also bump up max_rpcs_in_flight to match. + */ if (max >= cli->cl_max_rpcs_in_flight) { - CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " - "higher or equal to max_rpcs_in_flight value (%u)\n", - cli->cl_import->imp_obd->obd_name, - max, cli->cl_max_rpcs_in_flight); - return -ERANGE; + CDEBUG(D_INFO, + "%s: increasing max_rpcs_in_flight=%hu to allow larger max_mod_rpcs_in_flight=%u\n", + cli->cl_import->imp_obd->obd_name, max + 1, max); + obd_set_max_rpcs_in_flight(cli, max + 1); } - /* cannot exceed max modify RPCs in flight supported by the server */ - ocd = &cli->cl_import->imp_connect_data; - if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + /* cannot exceed max modify RPCs in flight supported by the server, + * but verify ocd_connect_flags is at least initialized first. If + * not, allow it and fix value later in ptlrpc_connect_set_flags(). + */ + if (!ocd->ocd_connect_flags) { + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + } else if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) { maxmodrpcs = ocd->ocd_maxmodrpcs; - else + if (maxmodrpcs == 0) { /* connection not finished yet */ + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + CDEBUG(D_INFO, + "%s: partial connect, assume maxmodrpcs=%hu\n", + cli->cl_import->imp_obd->obd_name, maxmodrpcs); + } + } else { maxmodrpcs = 1; + } if (max > maxmodrpcs) { - CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " - "higher than max_mod_rpcs_per_client value (%hu) " - "returned by the server at connection\n", + CERROR("%s: can't set max_mod_rpcs_in_flight=%hu higher than ocd_maxmodrpcs=%hu returned by the server at connection\n", cli->cl_import->imp_obd->obd_name, max, maxmodrpcs); return -ERANGE; @@ -2161,8 +2247,6 @@ int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) } EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); - -#define pct(a, b) (b ? 
a * 100 / b : 0) int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq) { @@ -2188,7 +2272,7 @@ int obd_mod_rpc_stats_seq_show(struct client_obd *cli, for (i = 0; i < OBD_HIST_MAX; i++) { unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; mod_cum += mod; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu\n", + seq_printf(seq, "%d:\t\t%10lu %3u %3u\n", i, mod, pct(mod, mod_tot), pct(mod_cum, mod_tot)); if (mod_cum == mod_tot) @@ -2200,8 +2284,6 @@ int obd_mod_rpc_stats_seq_show(struct client_obd *cli, return 0; } EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); -#undef pct - /* The number of modify RPCs sent in parallel is limited * because the server has a finite number of slots per client to @@ -2243,7 +2325,7 @@ static inline bool obd_skip_mod_rpc_slot(const struct lookup_intent *it) if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || it->it_op == IT_READDIR || - (it->it_op == IT_LAYOUT && !(it->it_flags & FMODE_WRITE)))) + (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) return true; return false; } @@ -2297,8 +2379,9 @@ __u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, "opc %u, max %hu\n", cli->cl_import->imp_obd->obd_name, opc, max); - l_wait_event(cli->cl_mod_rpcs_waitq, - obd_mod_rpc_slot_avail(cli, close_req), &lwi); + l_wait_event_exclusive(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, close_req), + &lwi); } while (true); } EXPORT_SYMBOL(obd_get_mod_rpc_slot); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c index b45c6d6a55357..1fcbb2a839f9d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,15 +47,6 @@ #include #include -#define lustre_get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) - -#define lustre_put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - /* * groups_search() is copied from linux kernel! * A simple bsearch. 
@@ -110,12 +101,12 @@ EXPORT_SYMBOL(lustre_groups_from_list); /* a simple shell-metzner sort */ void lustre_groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; + int base, max, stride; + int gidsetsize = group_info->ngroups; - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; while (stride) { max = gidsetsize - stride; @@ -162,9 +153,10 @@ int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) if (!group_info) return 0; - lustre_get_group_info(group_info); + atomic_inc(&group_info->usage); rc = lustre_groups_search(group_info, grp); - lustre_put_group_info(group_info); + if (atomic_dec_and_test(&group_info->usage)) + groups_free(group_info); } return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c new file mode 100644 index 0000000000000..4a6d27aa6ae36 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. 
+ * + * General data integrity functions + */ +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +__u16 obd_dif_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} +EXPORT_SYMBOL(obd_dif_crc_fn); + +__u16 obd_dif_ip_fn(void *data, unsigned int len) +{ + return ip_compute_csum(data, len); +} +EXPORT_SYMBOL(obd_dif_ip_fn); + +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __u16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn) +{ + unsigned int i = offset; + unsigned int end = offset + length; + char *data_buf; + __u16 *guard_buf = guard_start; + unsigned int data_size; + int used = 0; + + data_buf = kmap(page) + offset; + while (i < end) { + if (used >= guard_number) { + CERROR("%s: unexpected used guard number of DIF %u/%u, " + "data length %u, sector size %u: rc = %d\n", + obd_name, used, guard_number, length, + sector_size, -E2BIG); + return -E2BIG; + } + data_size = min(round_up(i + 1, sector_size), end) - i; + *guard_buf = fn(data_buf, data_size); + guard_buf++; + data_buf += data_size; + i += data_size; + used++; + } + kunmap(page); + *used_number = used; + + return 0; +} +EXPORT_SYMBOL(obd_page_dif_generate_buffer); + +static int __obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type, + struct page *data_page, + int repeat_number) +{ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct ahash_request *req; + obd_dif_csum_fn *fn = NULL; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __u16 *guard_start; + int guard_number; + int used_number = 0; + int sector_size = 0; + __u32 cksum; + int rc = 0; + int rc2; + int used; + int i; + + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + if (!fn) + return -EINVAL; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < repeat_number; i++) { + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, data_page, 0, + PAGE_SIZE, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + } + kunmap(__page); + if (rc) + GOTO(out_final, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); +out_final: + rc2 = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + rc = rc ? 
rc : rc2; +out: + __free_page(__page); + + return rc; +} + +/** + * Array of T10PI checksum algorithm speed in MByte per second + */ +static int obd_t10_cksum_speeds[OBD_T10_CKSUM_MAX]; + +static enum obd_t10_cksum_type +obd_t10_cksum2type(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + return OBD_T10_CKSUM_IP512; + case OBD_CKSUM_T10IP4K: + return OBD_T10_CKSUM_IP4K; + case OBD_CKSUM_T10CRC512: + return OBD_T10_CKSUM_CRC512; + case OBD_CKSUM_T10CRC4K: + return OBD_T10_CKSUM_CRC4K; + default: + return OBD_T10_CKSUM_UNKNOWN; + } +} + +static const char *obd_t10_cksum_name(enum obd_t10_cksum_type index) +{ + DECLARE_CKSUM_NAME; + + /* Need to skip "crc32", "adler", "crc32c", "reserved" */ + return cksum_name[3 + index]; +} + +/** + * Compute the speed of specified T10PI checksum type + * + * Run a speed test on the given T10PI checksum on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the obd_t10_cksum_speeds[] array, and + * is available through the obd_t10_cksum_speed() function. + * + * This function needs to stay the same as cfs_crypto_performance_test() so + * that the speeds are comparable. And this function should reflect the real + * cost of the checksum calculation. + * + * \param[in] obd_name name of the OBD device + * \param[in] cksum_type checksum type (OBD_CKSUM_T10*) + */ +static void obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type) +{ + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + const int buf_len = max(PAGE_SIZE, 1048576UL); + unsigned long bcount; + unsigned long start; + unsigned long end; + struct page *page; + int rc = 0; + void *buf; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + rc = -ENOMEM; + goto out; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), + bcount = 0; time_before(jiffies, end) && rc == 0; bcount++) { + rc = __obd_t10_performance_test(obd_name, cksum_type, page, + buf_len / PAGE_SIZE); + if (rc) + break; + } + end = jiffies; + __free_page(page); +out: + if (rc) { + obd_t10_cksum_speeds[index] = rc; + CDEBUG(D_INFO, "%s: T10 checksum algorithm %s test error: " + "rc = %d\n", obd_name, obd_t10_cksum_name(index), rc); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + obd_t10_cksum_speeds[index] = (int)tmp; + CDEBUG(D_CONFIG, "%s: T10 checksum algorithm %s speed = %d " + "MB/s\n", obd_name, obd_t10_cksum_name(index), + obd_t10_cksum_speeds[index]); + } +} +#endif /* CONFIG_CRC_T10DIF */ + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type) +{ +#if IS_ENABLED(CONFIG_CRC_T10DIF) + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + + if (unlikely(obd_t10_cksum_speeds[index] == 0)) { + static DEFINE_MUTEX(obd_t10_cksum_speed_mutex); + + mutex_lock(&obd_t10_cksum_speed_mutex); + if (obd_t10_cksum_speeds[index] == 0) + obd_t10_performance_test(obd_name, cksum_type); + mutex_unlock(&obd_t10_cksum_speed_mutex); + } + + return obd_t10_cksum_speeds[index]; +#else /* !CONFIG_CRC_T10DIF */ + return 0; +#endif /* !CONFIG_CRC_T10DIF */ +} +EXPORT_SYMBOL(obd_t10_cksum_speed); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c new file mode 100644 index 
0000000000000..b7a08d495b2ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c @@ -0,0 +1,575 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2017 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Store PID->JobID mappings + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#ifdef HAVE_UIDGID_HEADER +#include +#endif +#include + +#include +#include +#include +#include + +static struct cfs_hash *jobid_hash; +static struct cfs_hash_ops jobid_hash_ops; +spinlock_t jobid_hash_lock; + +#define RESCAN_INTERVAL 30 +#define DELETE_INTERVAL 300 + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u"; + +/** + * Structure to store a single PID->JobID mapping + */ +struct jobid_pid_map { + struct hlist_node jp_hash; + time64_t jp_time; + spinlock_t jp_lock; /* protects jp_jobid */ + char jp_jobid[LUSTRE_JOBID_SIZE]; + unsigned int jp_joblen; + atomic_t jp_refcount; + pid_t jp_pid; +}; + +/* + * Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/API. Then, the jobid must be cached. + */ +int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) +{ + int rc; + + rc = cfs_get_environ(jobid_var, jobid, jobid_len); + if (!rc) + goto out; + + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static ktime_t printed; + + if (unlikely(ktime_to_ns(printed) == 0 || + ktime_after(ktime_get(), + ktime_add_ns(printed, + 3600*24*NSEC_PER_SEC)))) { + LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", + obd_jobid_var, *jobid_len); + printed = ktime_get(); + } + + rc = 0; + } else { + CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? 
D_INFO : D_ERROR, + "jobid: get '%s' failed: rc = %d\n", + obd_jobid_var, rc); + } + +out: + return rc; +} + +/* + * jobid_should_free_item + * + * Each item is checked to see if it should be released + * Removed from hash table by caller + * Actually freed in jobid_put_locked + * + * Returns 1 if item is to be freed, 0 if it is to be kept + */ + +static int jobid_should_free_item(void *obj, void *data) +{ + char *jobid = data; + struct jobid_pid_map *pidmap = obj; + int rc = 0; + + if (obj == NULL) + return 0; + + if (jobid == NULL) { + WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); + return 1; + } + + spin_lock(&pidmap->jp_lock); + /* prevent newly inserted items from deleting */ + if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) + rc = 1; + else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) + rc = 1; + else if (strcmp(pidmap->jp_jobid, jobid) == 0) + rc = 1; + spin_unlock(&pidmap->jp_lock); + + return rc; +} + +/* + * jobid_name_is_valid + * + * Checks if the jobid is a Lustre process + * + * Returns true if jobid is valid + * Returns false if jobid looks like it's a Lustre process + */ +static bool jobid_name_is_valid(char *jobid) +{ + const char *const lustre_reserved[] = { "ll_ping", "ptlrpc", + "ldlm", "ll_sa", NULL }; + int i; + + if (jobid[0] == '\0') + return false; + + for (i = 0; lustre_reserved[i] != NULL; i++) { + if (strncmp(jobid, lustre_reserved[i], + strlen(lustre_reserved[i])) == 0) + return false; + } + return true; +} + +/* + * jobid_get_from_cache() + * + * Returns contents of jobid_var from process environment for current PID. + * This will be cached for some time to avoid overhead scanning environment. + * + * Return: -ENOMEM if allocating a new pidmap fails + * -ENOENT if no entry could be found + * +ve string length for success (something was returned in jobid) + */ +static int jobid_get_from_cache(char *jobid, size_t joblen) +{ + static time64_t last_expire; + bool expire_cache = false; + pid_t pid = current_pid(); + struct jobid_pid_map *pidmap = NULL; + time64_t now = ktime_get_real_seconds(); + int rc = 0; + ENTRY; + + LASSERT(jobid_hash != NULL); + + /* scan hash periodically to remove old PID entries from cache */ + spin_lock(&jobid_hash_lock); + if (unlikely(last_expire + DELETE_INTERVAL <= now)) { + expire_cache = true; + last_expire = now; + } + spin_unlock(&jobid_hash_lock); + + if (expire_cache) + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, + "intentionally_bad_jobid"); + + /* first try to find PID in the hash and use that value */ + pidmap = cfs_hash_lookup(jobid_hash, &pid); + if (pidmap == NULL) { + struct jobid_pid_map *pidmap2; + + OBD_ALLOC_PTR(pidmap); + if (pidmap == NULL) + GOTO(out, rc = -ENOMEM); + + pidmap->jp_pid = pid; + pidmap->jp_time = 0; + pidmap->jp_jobid[0] = '\0'; + spin_lock_init(&pidmap->jp_lock); + INIT_HLIST_NODE(&pidmap->jp_hash); + /* + * @pidmap might be reclaimed just after it is added into + * hash list, init @jp_refcount as 1 to make sure memory + * could be not freed during access. + */ + atomic_set(&pidmap->jp_refcount, 1); + + /* + * Add the newly created map to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * map. The object which exists in the hash will be returned. 
+ */ + pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, + &pidmap->jp_hash); + if (unlikely(pidmap != pidmap2)) { + CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n", + pid); + OBD_FREE_PTR(pidmap); + pidmap = pidmap2; + } + } + + /* + * If pidmap is old (this is always true for new entries) refresh it. + * If obd_jobid_var is not found, cache empty entry and try again + * later, to avoid repeat lookups for PID if obd_jobid_var missing. + */ + spin_lock(&pidmap->jp_lock); + if (pidmap->jp_time + RESCAN_INTERVAL <= now) { + char env_jobid[LUSTRE_JOBID_SIZE] = ""; + int env_len = sizeof(env_jobid); + + pidmap->jp_time = now; + + spin_unlock(&pidmap->jp_lock); + rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len); + + CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n", + pidmap->jp_pid, env_jobid); + spin_lock(&pidmap->jp_lock); + if (!rc) { + pidmap->jp_joblen = env_len; + strlcpy(pidmap->jp_jobid, env_jobid, + sizeof(pidmap->jp_jobid)); + rc = 0; + } else if (rc == -ENOENT) { + /* It might have been deleted, clear out old entry */ + pidmap->jp_joblen = 0; + pidmap->jp_jobid[0] = '\0'; + } + } + + /* + * Regardless of how pidmap was found, if it contains a valid entry + * use that for now. If there was a technical error (e.g. -ENOMEM) + * use the old cached value until it can be looked up again properly. + * If a cached missing entry was found, return -ENOENT. + */ + if (pidmap->jp_joblen) { + strlcpy(jobid, pidmap->jp_jobid, joblen); + joblen = pidmap->jp_joblen; + rc = 0; + } else if (!rc) { + rc = -ENOENT; + } + spin_unlock(&pidmap->jp_lock); + + cfs_hash_put(jobid_hash, &pidmap->jp_hash); + + EXIT; +out: + return rc < 0 ? rc : joblen; +} + +/* + * jobid_interpret_string() + * + * Interpret the jobfmt string to expand specified fields, like coredumps do: + * %e = executable + * %g = gid + * %h = hostname + * %j = jobid from environment + * %p = pid + * %u = uid + * + * Unknown escape strings are dropped. Other characters are copied through, + * excluding whitespace (to avoid making jobid parsing difficult). + * + * Return: -EOVERFLOW if the expanded string does not fit within @joblen + * 0 for success + */ +static int jobid_interpret_string(const char *jobfmt, char *jobid, + ssize_t joblen) +{ + char c; + + while ((c = *jobfmt++) && joblen > 1) { + char f; + int l; + + if (isspace(c)) /* Don't allow embedded spaces */ + continue; + + if (c != '%') { + *jobid = c; + joblen--; + jobid++; + continue; + } + + switch ((f = *jobfmt++)) { + case 'e': /* executable name */ + l = snprintf(jobid, joblen, "%s", current_comm()); + break; + case 'g': /* group ID */ + l = snprintf(jobid, joblen, "%u", + from_kgid(&init_user_ns, current_fsgid())); + break; + case 'h': /* hostname */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + break; + case 'j': /* jobid stored in process environment */ + l = jobid_get_from_cache(jobid, joblen); + if (l < 0) + l = 0; + break; + case 'p': /* process ID */ + l = snprintf(jobid, joblen, "%u", current_pid()); + break; + case 'u': /* user ID */ + l = snprintf(jobid, joblen, "%u", + from_kuid(&init_user_ns, current_fsuid())); + break; + case '\0': /* '%' at end of format string */ + l = 0; + goto out; + default: /* drop unknown %x format strings */ + l = 0; + break; + } + jobid += l; + joblen -= l; + } + /* + * This points at the end of the buffer, so long as jobid is always + * incremented the same amount as joblen is decremented. + */ +out: + jobid[joblen - 1] = '\0'; + + return joblen < 0 ? 
-EOVERFLOW : 0; +} + +/* + * Hash initialization, copied from server-side job stats bucket sizes + */ +#define HASH_JOBID_BKT_BITS 5 +#define HASH_JOBID_CUR_BITS 7 +#define HASH_JOBID_MAX_BITS 12 + +int jobid_cache_init(void) +{ + int rc = 0; + ENTRY; + + if (jobid_hash) + return 0; + + spin_lock_init(&jobid_hash_lock); + jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS, + HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, + 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &jobid_hash_ops, CFS_HASH_DEFAULT); + if (!jobid_hash) + rc = -ENOMEM; + + RETURN(rc); +} +EXPORT_SYMBOL(jobid_cache_init); + +void jobid_cache_fini(void) +{ + struct cfs_hash *tmp_hash; + ENTRY; + + spin_lock(&jobid_hash_lock); + tmp_hash = jobid_hash; + jobid_hash = NULL; + spin_unlock(&jobid_hash_lock); + + if (tmp_hash != NULL) { + cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); + cfs_hash_putref(tmp_hash); + } + + EXIT; +} +EXPORT_SYMBOL(jobid_cache_fini); + +/* + * Hash operations for pid<->jobid + */ +static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); +} + +static void *jobid_key(struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + return &pidmap->jp_pid; +} + +static int jobid_keycmp(const void *key, struct hlist_node *hnode) +{ + const pid_t *pid_key1; + const pid_t *pid_key2; + + LASSERT(key != NULL); + pid_key1 = (pid_t *)key; + pid_key2 = (pid_t *)jobid_key(hnode); + + return *pid_key1 == *pid_key2; +} + +static void *jobid_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct jobid_pid_map, jp_hash); +} + +static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + + atomic_inc(&pidmap->jp_refcount); +} + +static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + if (hnode == NULL) + return; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + LASSERT(atomic_read(&pidmap->jp_refcount) > 0); + if (atomic_dec_and_test(&pidmap->jp_refcount)) { + CDEBUG(D_INFO, "Freeing: %d->%s\n", + pidmap->jp_pid, pidmap->jp_jobid); + + OBD_FREE_PTR(pidmap); + } +} + +static struct cfs_hash_ops jobid_hash_ops = { + .hs_hash = jobid_hashfn, + .hs_keycmp = jobid_keycmp, + .hs_key = jobid_key, + .hs_object = jobid_object, + .hs_get = jobid_get, + .hs_put = jobid_put_locked, + .hs_put_locked = jobid_put_locked, +}; + +/** + * Generate the job identifier string for this process for tracking purposes. + * + * Fill in @jobid string based on the value of obd_jobid_var: + * JOBSTATS_DISABLE: none + * JOBSTATS_NODELOCAL: content of obd_jobid_node (jobid_interpret_string()) + * JOBSTATS_PROCNAME_UID: process name/UID + * anything else: look up obd_jobid_var in the processes environment + * + * Return -ve error number, 0 on success. 
+ */ +int lustre_get_jobid(char *jobid, size_t joblen) +{ + int rc = 0; + ENTRY; + + if (unlikely(joblen < 2)) { + if (joblen == 1) + jobid[0] = '\0'; + RETURN(-EINVAL); + } + + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) { + /* Jobstats isn't enabled */ + memset(jobid, 0, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + /* Whole node dedicated to single job */ + rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + rc = jobid_interpret_string("%e.%u", jobid, joblen); + } else if (jobid_name_is_valid(current_comm())) { + /* + * obd_jobid_var holds the jobid environment variable name. + * Skip initial check if obd_jobid_name already uses "%j", + * otherwise try just "%j" first, then fall back to whatever + * is in obd_jobid_name if obd_jobid_var is not found. + */ + rc = -EAGAIN; + if (!strnstr(obd_jobid_name, "%j", joblen)) + rc = jobid_get_from_cache(jobid, joblen); + + /* fall back to jobid_node if jobid_var not in environment */ + if (rc < 0) { + int rc2 = jobid_interpret_string(obd_jobid_name, + jobid, joblen); + if (!rc2) + rc = 0; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +/* + * lustre_jobid_clear + * + * Search cache for JobID given by @find_jobid. + * If any entries in the hash table match the value, they are removed + */ +void lustre_jobid_clear(const char *find_jobid) +{ + char jobid[LUSTRE_JOBID_SIZE]; + char *end; + + if (jobid_hash == NULL) + return; + + strlcpy(jobid, find_jobid, sizeof(jobid)); + /* trim \n off the end of the incoming jobid */ + end = strchr(jobid, '\n'); + if (end && *end == '\n') + *end = '\0'; + + CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); + + CDEBUG(D_INFO, "%d items remain in jobID table\n", + atomic_read(&jobid_hash->hs_count)); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c index 79d176dcd3d53..7afb9484a8a69 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -23,7 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,8 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#define D_KUC D_OTHER + +#include #include #include @@ -73,7 +74,7 @@ int libcfs_kkuc_msg_put(struct file *filp, void *payload) if (rc < 0) CWARN("message send failed (%d)\n", rc); else - CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); + CDEBUG(D_HSM, "Sent message rc=%d, fp=%p\n", rc, filp); return rc; } @@ -142,7 +143,7 @@ int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, list_add(®->kr_chain, &kkuc_groups[group]); up_write(&kg_sem); - CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + CDEBUG(D_HSM, "Added uid=%d fp=%p to group %d\n", uid, filp, group); return 0; } @@ -174,7 +175,7 @@ int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) if (obd_uuid_equals(uuid, ®->kr_uuid) && (uid == 0 || uid == reg->kr_uid)) { list_del(®->kr_chain); - CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", + CDEBUG(D_HSM, "Removed uid=%d fp=%p from group %d\n", reg->kr_uid, reg->kr_fp, group); if (reg->kr_fp != NULL) fput(reg->kr_fp); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c index a1bcc3d7de608..cf17a50999f8d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -21,13 +21,12 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * Use is subject to license terms. * * Author: Di Wang */ -#include #include #include @@ -144,10 +143,11 @@ int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, reclen = lname->ln_namelen + sizeof(struct link_ea_entry); if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { /* Use 32-bits to save the overflow time, although it will - * shrink the cfs_time_current_sec() returned 64-bits value + * shrink the ktime_get_real_seconds() returned 64-bits value * to 32-bits value, it is still quite large and can be used - * for about 140 years. That is enough. */ - leh->leh_overflow_time = cfs_time_current_sec(); + * for about 140 years. That is enough. + */ + leh->leh_overflow_time = ktime_get_real_seconds(); if (unlikely(leh->leh_overflow_time == 0)) leh->leh_overflow_time++; @@ -236,7 +236,7 @@ int linkea_overflow_shrink(struct linkea_data *ldata) if (unlikely(leh->leh_reccount == 0)) return 0; - leh->leh_overflow_time = cfs_time_current_sec(); + leh->leh_overflow_time = ktime_get_real_seconds(); if (unlikely(leh->leh_overflow_time == 0)) leh->leh_overflow_time++; ldata->ld_reclen = 0; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c deleted file mode 100644 index dabbf58057caf..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c +++ /dev/null @@ -1,582 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2016, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/linux/linux-module.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) -{ - if (data->ioc_len > BIT(30)) { - CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen1 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen2 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen3 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen4 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { - CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { - CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { - CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { - CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { - CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { - CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); - return 1; - } - - if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { - CERROR("OBD ioctl: plen1 set but NULL pointer\n"); - return 1; - } - - if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { - CERROR("OBD ioctl: plen2 set but NULL pointer\n"); - return 1; - } - - if (obd_ioctl_packlen(data) > data->ioc_len) { - CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", - obd_ioctl_packlen(data), data->ioc_len); - return 1; - } - - return 0; -} - -/* buffer MUST be at least the size of obd_ioctl_hdr */ -int obd_ioctl_getdata(char **buf, int *len, void __user *arg) -{ - struct obd_ioctl_hdr hdr; - struct obd_ioctl_data *data; - int offset = 0; - ENTRY; - - if (copy_from_user(&hdr, arg, sizeof(hdr))) - RETURN(-EFAULT); - - if (hdr.ioc_version != OBD_IOCTL_VERSION) { - CERROR("Version mismatch kernel (%x) vs application (%x)\n", - OBD_IOCTL_VERSION, hdr.ioc_version); - RETURN(-EINVAL); - } - - if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { - CERROR("User buffer len %d exceeds %d max buffer\n", - hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); - 
RETURN(-EINVAL); - } - - if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { - CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); - RETURN(-EINVAL); - } - - /* When there are lots of processes calling vmalloc on multi-core - * system, the high lock contention will hurt performance badly, - * obdfilter-survey is an example, which relies on ioctl. So we'd - * better avoid vmalloc on ioctl path. LU-66 */ - OBD_ALLOC_LARGE(*buf, hdr.ioc_len); - if (*buf == NULL) { - CERROR("Cannot allocate control buffer of len %d\n", - hdr.ioc_len); - RETURN(-EINVAL); - } - *len = hdr.ioc_len; - data = (struct obd_ioctl_data *)*buf; - - if (copy_from_user(*buf, arg, hdr.ioc_len)) { - OBD_FREE_LARGE(*buf, hdr.ioc_len); - RETURN(-EFAULT); - } - - if (obd_ioctl_is_invalid(data)) { - CERROR("ioctl not correctly formatted\n"); - OBD_FREE_LARGE(*buf, hdr.ioc_len); - RETURN(-EINVAL); - } - - if (data->ioc_inllen1) { - data->ioc_inlbuf1 = &data->ioc_bulk[0]; - offset += cfs_size_round(data->ioc_inllen1); - } - - if (data->ioc_inllen2) { - data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen2); - } - - if (data->ioc_inllen3) { - data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen3); - } - - if (data->ioc_inllen4) - data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; - - RETURN(0); -} -EXPORT_SYMBOL(obd_ioctl_getdata); - -/* opening /dev/obd */ -static int obd_class_open(struct inode * inode, struct file * file) -{ - ENTRY; - - try_module_get(THIS_MODULE); - RETURN(0); -} - -/* closing /dev/obd */ -static int obd_class_release(struct inode * inode, struct file * file) -{ - ENTRY; - - module_put(THIS_MODULE); - RETURN(0); -} - -/* to control /dev/obd */ -static long obd_class_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - int err = 0; - ENTRY; - - /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ - if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) - RETURN(err = -EACCES); - if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ - RETURN(err = -ENOTTY); - - err = class_handle_ioctl(cmd, (unsigned long)arg); - - RETURN(err); -} - -/* declare character device */ -static struct file_operations obd_psdev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ - .open = obd_class_open, /* open */ - .release = obd_class_release, /* release */ -}; - -/* modules setup */ -struct miscdevice obd_psdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = OBD_DEV_NAME, - .fops = &obd_psdev_fops, -}; - -static ssize_t version_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); -} - -static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ -#ifdef ENABLE_PINGER - const char *state = "on"; -#else - const char *state = "off"; -#endif - return sprintf(buf, "%s\n", state); -} - -/** - * Check all obd devices health - * - * \param kobj - * \param buf [in] - * - * \retval number of characters printed if healthy - */ -static ssize_t -health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - bool healthy = true; - size_t len = 0; - int i; - - if (libcfs_catastrophe) { - len = sprintf(buf, "LBUG\n"); - healthy = false; - } - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd; - - obd = class_num2obd(i); - if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) - continue; - - 
LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - continue; - - class_incref(obd, __FUNCTION__, current); - read_unlock(&obd_dev_lock); - - if (obd_health_check(NULL, obd)) { - len = sprintf(buf, "device %s reported unhealthy\n", - obd->obd_name); - healthy = false; - } - class_decref(obd, __FUNCTION__, current); - read_lock(&obd_dev_lock); - } - read_unlock(&obd_dev_lock); - - if (healthy) - len = sprintf(buf, "healthy\n"); - else - len = sprintf(buf, "NOT HEALTHY\n"); - - return len; -} - -static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - int rc = 0; - - if (strlen(obd_jobid_var)) - rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); - return rc; -} - -static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) - return -EINVAL; - - memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); - - memcpy(obd_jobid_var, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_var[count - 1] == '\n') - obd_jobid_var[count - 1] = 0; - - return count; -} - -static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - int rc = 0; - - if (strlen(obd_jobid_node)) - rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_node); - return rc; -} - -static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - if (!count || count > LUSTRE_JOBID_SIZE) - return -EINVAL; - - /* clear previous value */ - memset(obd_jobid_node, 0, LUSTRE_JOBID_SIZE); - - memcpy(obd_jobid_node, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_node[count - 1] == '\n') { - /* Don't echo just a newline */ - if (count == 1) - return -EINVAL; - obd_jobid_node[count - 1] = 0; - } - - return count; -} - -/* Root for /sys/kernel/debug/lustre */ -struct dentry *debugfs_lustre_root; -EXPORT_SYMBOL_GPL(debugfs_lustre_root); - -#ifdef CONFIG_PROC_FS -/* Root for /proc/fs/lustre */ -struct proc_dir_entry *proc_lustre_root = NULL; -EXPORT_SYMBOL(proc_lustre_root); -#else -#define lprocfs_base NULL -#endif /* CONFIG_PROC_FS */ - -LUSTRE_RO_ATTR(version); -LUSTRE_RO_ATTR(pinger); -LUSTRE_RO_ATTR(health_check); -LUSTRE_RW_ATTR(jobid_var); -LUSTRE_RW_ATTR(jobid_name); - -static struct attribute *lustre_attrs[] = { - &lustre_attr_version.attr, - &lustre_attr_pinger.attr, - &lustre_attr_health_check.attr, - &lustre_attr_jobid_name.attr, - &lustre_attr_jobid_var.attr, - NULL, -}; - -static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) -{ - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static void obd_device_list_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - ++*pos; - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static int obd_device_list_seq_show(struct seq_file *p, void *v) -{ - loff_t index = *(loff_t *)v; - struct obd_device *obd = class_num2obd((int)index); - char *status; - - if (obd == NULL) - return 0; - - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - status = "ST"; - else if (obd->obd_inactive) - status = "IN"; - else if (obd->obd_set_up) - status = "UP"; - else if (obd->obd_attached) - status = "AT"; - else - status = "--"; - - seq_printf(p, "%3d %s %s %s %s %d\n", - (int)index, status, obd->obd_type->typ_name, - obd->obd_name, obd->obd_uuid.uuid, - 
atomic_read(&obd->obd_refcount)); - return 0; -} - -static const struct seq_operations obd_device_list_sops = { - .start = obd_device_list_seq_start, - .stop = obd_device_list_seq_stop, - .next = obd_device_list_seq_next, - .show = obd_device_list_seq_show, -}; - -static int obd_device_list_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int rc = seq_open(file, &obd_device_list_sops); - - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - return 0; -} - -static const struct file_operations obd_device_list_fops = { - .owner = THIS_MODULE, - .open = obd_device_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -struct kobject *lustre_kobj; -EXPORT_SYMBOL_GPL(lustre_kobj); - -static struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -int class_procfs_init(void) -{ - struct proc_dir_entry *entry; - struct dentry *file; - int rc = -ENOMEM; - ENTRY; - - lustre_kobj = kobject_create_and_add("lustre", fs_kobj); - if (lustre_kobj == NULL) - goto out; - - /* Create the files associated with this kobject */ - rc = sysfs_create_group(lustre_kobj, &lustre_attr_group); - if (rc) { - kobject_put(lustre_kobj); - goto out; - } - - rc = obd_sysctl_init(); - if (rc) { - kobject_put(lustre_kobj); - goto out; - } - - debugfs_lustre_root = debugfs_create_dir("lustre", NULL); - if (IS_ERR_OR_NULL(debugfs_lustre_root)) { - rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) - : -ENOMEM; - debugfs_lustre_root = NULL; - kobject_put(lustre_kobj); - goto out; - } - - file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, - &obd_device_list_fops); - if (IS_ERR_OR_NULL(file)) { - rc = file ? PTR_ERR(file) : -ENOMEM; - kobject_put(lustre_kobj); - goto out; - } - - entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); - kobject_put(lustre_kobj); - goto out; - } - - proc_lustre_root = entry; -out: - RETURN(rc); -} - -int class_procfs_clean(void) -{ - ENTRY; - - debugfs_remove_recursive(debugfs_lustre_root); - - debugfs_lustre_root = NULL; - - if (proc_lustre_root) - lprocfs_remove(&proc_lustre_root); - - kobject_put(lustre_kobj); - - RETURN(0); -} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c deleted file mode 100644 index 5f8e2b55d7258..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. 
- * - * Copyright (c) 2012, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/linux/linux-obdo.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include /* for PAGE_SIZE */ -#include -#include - -/*FIXME: Just copy from obdo_from_inode*/ -void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) -{ - u64 newvalid = 0; - - if (valid & LA_ATIME) { - dst->o_atime = la->la_atime; - newvalid |= OBD_MD_FLATIME; - } - if (valid & LA_MTIME) { - dst->o_mtime = la->la_mtime; - newvalid |= OBD_MD_FLMTIME; - } - if (valid & LA_CTIME) { - dst->o_ctime = la->la_ctime; - newvalid |= OBD_MD_FLCTIME; - } - if (valid & LA_SIZE) { - dst->o_size = la->la_size; - newvalid |= OBD_MD_FLSIZE; - } - if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = la->la_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & LA_TYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (la->la_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & LA_MODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (la->la_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } - if (valid & LA_UID) { - dst->o_uid = la->la_uid; - newvalid |= OBD_MD_FLUID; - } - if (valid & LA_GID) { - dst->o_gid = la->la_gid; - newvalid |= OBD_MD_FLGID; - } - if (valid & LA_PROJID) { - dst->o_projid = la->la_projid; - newvalid |= OBD_MD_FLPROJID; - } - if (valid & LA_FLAGS) { - dst->o_flags = la->la_flags; - newvalid |= OBD_MD_FLFLAGS; - } - dst->o_valid |= newvalid; -} -EXPORT_SYMBOL(obdo_from_la); - -/*FIXME: Just copy from obdo_from_inode*/ -void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) -{ - u64 newvalid = 0; - - valid &= obdo->o_valid; - - if (valid & OBD_MD_FLATIME) { - dst->la_atime = obdo->o_atime; - newvalid |= LA_ATIME; - } - if (valid & OBD_MD_FLMTIME) { - dst->la_mtime = obdo->o_mtime; - newvalid |= LA_MTIME; - } - if (valid & OBD_MD_FLCTIME) { - dst->la_ctime = obdo->o_ctime; - newvalid |= LA_CTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->la_size = obdo->o_size; - newvalid |= LA_SIZE; - } - if (valid & OBD_MD_FLBLOCKS) { - dst->la_blocks = obdo->o_blocks; - newvalid |= LA_BLOCKS; - } - if (valid & OBD_MD_FLTYPE) { - dst->la_mode = (dst->la_mode & S_IALLUGO) | - (obdo->o_mode & S_IFMT); - newvalid |= LA_TYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->la_mode = (dst->la_mode & S_IFMT) | - (obdo->o_mode & S_IALLUGO); - newvalid |= LA_MODE; - } - if (valid & OBD_MD_FLUID) { - dst->la_uid = obdo->o_uid; - newvalid |= LA_UID; - } - if (valid & OBD_MD_FLGID) { - dst->la_gid = obdo->o_gid; - newvalid |= LA_GID; - } - if (valid & OBD_MD_FLPROJID) { - dst->la_projid = obdo->o_projid; - newvalid |= LA_PROJID; - } - if (valid & OBD_MD_FLFLAGS) { - dst->la_flags = obdo->o_flags; - newvalid |= LA_FLAGS; - } - dst->la_valid = newvalid; -} -EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c deleted file mode 100644 index e8016c77c7506..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -struct static_lustre_uintvalue_attr { - struct { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct attribute *attr, - char *buf); - ssize_t (*store)(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len); - } u; - int *value; -}; - -static ssize_t static_uintvalue_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - - return sprintf(buf, "%d\n", *lattr->value); -} - -static ssize_t static_uintvalue_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - unsigned int val; - int rc; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - *lattr->value = val; - - return count; -} - -#define LUSTRE_STATIC_UINT_ATTR(name, value) \ -static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ - {__ATTR(name, 0644, \ - static_uintvalue_show, \ - static_uintvalue_store),\ - value } - -LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); - -static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", - obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); -} - -static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - - if (val > ((cfs_totalram_pages() / 10) * 9)) { - /* Somebody wants to assign too much memory to dirty pages */ - return -EINVAL; - } - - if (val < 4 << (20 - PAGE_SHIFT)) { - /* Less than 4 Mb for dirty cache is also bad */ - return -EINVAL; - } - - obd_max_dirty_pages = val; - - return count; -} -LUSTRE_RW_ATTR(max_dirty_mb); - -LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); -LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); -LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); -LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); -LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); -LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); - -#ifdef HAVE_SERVER_SUPPORT 
-LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); -LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); -#endif - -static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", obd_memory_sum()); -} -LUSTRE_RO_ATTR(memused); - -static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", obd_memory_max()); -} -LUSTRE_RO_ATTR(memused_max); - -static struct attribute *lustre_attrs[] = { - &lustre_sattr_timeout.u.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_sattr_debug_peer_on_timeout.u.attr, - &lustre_sattr_dump_on_timeout.u.attr, - &lustre_sattr_dump_on_eviction.u.attr, - &lustre_sattr_at_min.u.attr, - &lustre_sattr_at_max.u.attr, - &lustre_sattr_at_extra.u.attr, - &lustre_sattr_at_early_margin.u.attr, - &lustre_sattr_at_history.u.attr, - &lustre_attr_memused_max.attr, - &lustre_attr_memused.attr, -#ifdef HAVE_SERVER_SUPPORT - &lustre_sattr_ldlm_timeout.u.attr, - &lustre_sattr_bulk_timeout.u.attr, -#endif - NULL, -}; - -static struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -int obd_sysctl_init(void) -{ - return sysfs_create_group(lustre_kobj, &lustre_attr_group); -} - -void obd_sysctl_clean(void) -{ - sysfs_remove_group(lustre_kobj, &lustre_attr_group); -} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c index 61c9a1d1f4e8a..e9228b33339f3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,8 +47,10 @@ #include #include #include +#include #include #include "llog_internal.h" + /* * Allocate a new log or catalog handle * Used inside llog_open(). 
@@ -63,6 +65,7 @@ static struct llog_handle *llog_alloc_handle(void) init_rwsem(&loghandle->lgh_lock); mutex_init(&loghandle->lgh_hdr_mutex); + init_rwsem(&loghandle->lgh_last_sem); INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); atomic_set(&loghandle->lgh_refcount, 1); @@ -89,16 +92,30 @@ static void llog_free_handle(struct llog_handle *loghandle) OBD_FREE_PTR(loghandle); } -void llog_handle_get(struct llog_handle *loghandle) +struct llog_handle *llog_handle_get(struct llog_handle *loghandle) { - atomic_inc(&loghandle->lgh_refcount); + if (atomic_inc_not_zero(&loghandle->lgh_refcount)) + return loghandle; + return NULL; } -void llog_handle_put(struct llog_handle *loghandle) +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle) { - LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); - if (atomic_dec_and_test(&loghandle->lgh_refcount)) + int rc = 0; + + if (atomic_dec_and_test(&loghandle->lgh_refcount)) { + struct llog_operations *lop; + + rc = llog_handle2ops(loghandle, &lop); + if (!rc) { + if (lop->lop_close) + rc = lop->lop_close(env, loghandle); + else + rc = -EOPNOTSUPP; + } llog_free_handle(loghandle); + } + return rc; } static int llog_declare_destroy(const struct lu_env *env, @@ -135,7 +152,7 @@ int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, RETURN(-EOPNOTSUPP); LASSERT(handle->lgh_obj != NULL); - if (!dt_object_exists(handle->lgh_obj)) + if (!llog_exist(handle)) RETURN(0); rc = lop->lop_destroy(env, handle, th); @@ -164,11 +181,14 @@ int llog_destroy(const struct lu_env *env, struct llog_handle *handle) RETURN(rc); } - if (!dt_object_exists(handle->lgh_obj)) + if (!llog_exist(handle)) RETURN(0); dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -196,14 +216,21 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, { struct llog_thread_info *lgi = llog_info(env); struct dt_device *dt; - struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_log_hdr *llh; struct thandle *th; + __u32 tmp_lgc_index; int rc; int rc1; bool subtract_count = false; ENTRY; + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); + + llh = loghandle->lgh_hdr; + CDEBUG(D_RPCTRACE, "Canceling %d in log "DFID"\n", index, PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); @@ -212,12 +239,11 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, RETURN(-EINVAL); } - LASSERT(loghandle != NULL); - LASSERT(loghandle->lgh_ctxt != NULL); - LASSERT(loghandle->lgh_obj != NULL); - dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(0); + th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -247,12 +273,19 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, loghandle->lgh_hdr->llh_count--; subtract_count = true; + + /* Since llog_process_thread use lgi_cookie, it`s better to save them + * and restore after using + */ + tmp_lgc_index = lgi->lgi_cookie.lgc_index; /* Pass this index to llog_osd_write_rec(), which will use the index * to only update the necesary bitmap. 
*/ lgi->lgi_cookie.lgc_index = index; /* update header */ rc = llog_write_rec(env, loghandle, &llh->llh_hdr, &lgi->lgi_cookie, LLOG_HEADER_IDX, th); + lgi->lgi_cookie.lgc_index = tmp_lgc_index; + if (rc != 0) GOTO(out_unlock, rc); @@ -271,7 +304,7 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, * be accessed anymore, let's return 0 for now, and * the orphan will be handled by LFSCK. */ CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); GOTO(out_unlock, rc = 0); } @@ -366,7 +399,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, (llh->llh_flags & LLOG_F_IS_CAT && flags & LLOG_F_IS_PLAIN))) { CERROR("%s: llog type is %s but initializing %s\n", - handle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(handle), llh->llh_flags & LLOG_F_IS_CAT ? "catalog" : "plain", flags & LLOG_F_IS_CAT ? "catalog" : "plain"); @@ -386,7 +419,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, if (unlikely(uuid && !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { CERROR("%s: llog uuid mismatch: %s/%s\n", - handle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(handle), (char *)uuid->uuid, (char *)llh->llh_tgtuuid.uuid); GOTO(out, rc = -EEXIST); @@ -399,8 +432,8 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, llh->llh_flags |= LLOG_F_IS_FIXSIZE; } else if (!(flags & LLOG_F_IS_PLAIN)) { CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", - handle->lgh_ctxt->loc_obd->obd_name, - flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + loghandle2name(handle), flags, LLOG_F_IS_CAT, + LLOG_F_IS_PLAIN); rc = -EINVAL; } llh->llh_flags |= fmt; @@ -413,12 +446,37 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, } EXPORT_SYMBOL(llog_init_handle); +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec) +{ + int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len; + + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CERROR("%s: record is too large: %d > %d\n", + loghandle2name(llh), rec->lrh_len, chunk_size); + return -EINVAL; + } + if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + CERROR("%s: index is too high: %d\n", + loghandle2name(llh), rec->lrh_index); + return -EINVAL; + } + if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) { + CERROR("%s: magic %x is bad\n", + loghandle2name(llh), rec->lrh_type); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(llog_verify_record); + static int llog_process_thread(void *arg) { struct llog_process_info *lpi = arg; struct llog_handle *loghandle = lpi->lpi_loghandle; struct llog_log_hdr *llh = loghandle->lgh_hdr; struct llog_process_cat_data *cd = lpi->lpi_catdata; + struct llog_thread_info *lti; char *buf; size_t chunk_size; __u64 cur_offset; @@ -426,12 +484,15 @@ static int llog_process_thread(void *arg) int saved_index = 0; int last_called_index = 0; bool repeated = false; + bool refresh_idx = false; ENTRY; if (llh == NULL) RETURN(-EINVAL); + lti = lpi->lpi_env == NULL ? 
NULL : llog_info(lpi->lpi_env); + cur_offset = chunk_size = llh->llh_hdr.lrh_len; /* expect chunk_size to be power of two */ LASSERT(is_power_of_2(chunk_size)); @@ -457,6 +518,7 @@ static int llog_process_thread(void *arg) unsigned int buf_offset = 0; bool partial_chunk; int lh_last_idx; + int synced_idx = 0; /* skip records not set in bitmap */ while (index <= last_index && @@ -474,7 +536,8 @@ static int llog_process_thread(void *arg) /* get the buf with our target record; avoid old garbage */ memset(buf, 0, chunk_size); /* the record index for outdated chunk data */ - lh_last_idx = loghandle->lgh_last_idx + 1; + /* it is safe to process buffer until saved lgh_last_idx */ + lh_last_idx = LLOG_HDR_TAIL(llh)->lrt_index; rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, index, &cur_offset, buf, chunk_size); if (repeated && rc) @@ -518,60 +581,72 @@ static int llog_process_thread(void *arg) CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", rec->lrh_type, rec->lrh_index); + if (index == (synced_idx + 1) && + synced_idx == LLOG_HDR_TAIL(llh)->lrt_index) + GOTO(out, rc = 0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int) + (loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + } + /* the bitmap could be changed during processing * records from the chunk. For wrapped catalog * it means we can read deleted record and try to - * process it. Check this case and reread the chunk. */ - - /* for partial chunk the end of it is zeroed, check - * for index 0 to distinguish it. */ - if ((partial_chunk && rec->lrh_index == 0) || - (index == lh_last_idx && - lh_last_idx != (loghandle->lgh_last_idx + 1))) { - /* concurrent llog_add() might add new records - * while llog_processing, check this is not - * the case and re-read the current chunk - * otherwise. */ - int records; - /* lgh_last_idx could be less then index - * for catalog, if catalog is wrapped */ - if ((index > loghandle->lgh_last_idx && - !(loghandle->lgh_hdr->llh_flags & - LLOG_F_IS_CAT)) || repeated || - (loghandle->lgh_obj != NULL && - dt_object_remote(loghandle->lgh_obj))) - GOTO(out, rc = 0); - /* <2 records means no more records - * if the last record we processed was - * the final one, then the underlying - * object might have been destroyed yet. - * we better don't access that.. */ - mutex_lock(&loghandle->lgh_hdr_mutex); - records = loghandle->lgh_hdr->llh_count; - mutex_unlock(&loghandle->lgh_hdr_mutex); - if (records <= 1) - GOTO(out, rc = 0); - CDEBUG(D_OTHER, "Re-read last llog buffer for " - "new records, index %u, last %u\n", - index, loghandle->lgh_last_idx); + * process it. Check this case and reread the chunk. + * It is safe to process to lh_last_idx, including + * lh_last_idx if it was synced. We can not do <= + * comparison, cause for wrapped catalog lgh_last_idx + * could be less than index. So we detect last index + * for processing as index == lh_last_idx+1. But when + * catalog is wrapped and full lgh_last_idx=llh_cat_idx, + * the first processing index is llh_cat_idx+1.The + * exception is !(lgh_last_idx == llh_cat_idx && + * index == llh_cat_idx + 1), and after simplification + * it turns to + * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index + * This exception is working for catalog only. 
+ */ + + if ((index == lh_last_idx && synced_idx != index) || + (index == (lh_last_idx + 1) && + lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || + (rec->lrh_index == 0 && !repeated)) { + /* save offset inside buffer for the re-read */ buf_offset = (char *)rec - (char *)buf; cur_offset = chunk_offset; repeated = true; + /* We need to be sure lgh_last_idx + * record was saved to disk + */ + down_read(&loghandle->lgh_last_sem); + synced_idx = LLOG_HDR_TAIL(llh)->lrt_index; + up_read(&loghandle->lgh_last_sem); + CDEBUG(D_OTHER, "synced_idx: %d\n", synced_idx); goto repeat; + } repeated = false; - if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { - CWARN("%s: invalid length %d in llog "DFID - "record for index %d/%d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - rec->lrh_len, + rc = llog_verify_record(loghandle, rec); + if (rc) { + CERROR("%s: invalid record in llog "DFID + " record for index %d/%d: rc = %d\n", + loghandle2name(loghandle), PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - - GOTO(out, rc = -EINVAL); + rec->lrh_index, index, rc); + /* + * the block seem to be corrupted, let's try + * with the next one. reset rc to go to the + * next chunk. + */ + refresh_idx = true; + index = 0; + GOTO(repeat, rc = 0); } if (rec->lrh_index < index) { @@ -581,12 +656,22 @@ static int llog_process_thread(void *arg) } if (rec->lrh_index != index) { - CERROR("%s: "DFID" Invalid record: index %u" - " but expected %u\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - GOTO(out, rc = -ERANGE); + /* + * the last time we couldn't parse the block due + * to corruption, thus has no idea about the + * next index, take it from the block, once. + */ + if (refresh_idx) { + refresh_idx = false; + index = rec->lrh_index; + } else { + CERROR("%s: "DFID" Invalid record: index" + " %u but expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + GOTO(out, rc = -ERANGE); + } } CDEBUG(D_OTHER, @@ -594,15 +679,44 @@ static int llog_process_thread(void *arg) rec->lrh_index, rec->lrh_len, (int)(buf + chunk_size - (char *)rec)); - loghandle->lgh_cur_idx = rec->lrh_index; + /* lgh_cur_offset is used only at llog_test_3 */ loghandle->lgh_cur_offset = (char *)rec - (char *)buf + chunk_offset; /* if set, process the callback on this record */ if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { + struct llog_cookie *lgc; + __u64 tmp_off; + int tmp_idx; + + CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? + D_HA : D_OTHER), + "index: %d, lh_last_idx: %d " + "synced_idx: %d lgh_last_idx: %d\n", + index, lh_last_idx, synced_idx, + loghandle->lgh_last_idx); + + if (lti != NULL) { + lgc = <i->lgi_cookie; + /* store lu_env for recursive calls */ + tmp_off = lgc->lgc_offset; + tmp_idx = lgc->lgc_index; + + lgc->lgc_offset = (char *)rec - + (char *)buf + chunk_offset; + lgc->lgc_index = rec->lrh_index; + } + /* using lu_env for passing record offset to + * llog_write through various callbacks */ rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, lpi->lpi_cbdata); last_called_index = index; + + if (lti != NULL) { + lgc->lgc_offset = tmp_off; + lgc->lgc_index = tmp_idx; + } + if (rc == LLOG_PROC_BREAK) { GOTO(out, rc); } else if (rc == LLOG_DEL_RECORD) { @@ -627,6 +741,11 @@ static int llog_process_thread(void *arg) } out: + CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", + ((llh->llh_flags & LLOG_F_IS_CAT) ? 
"catalog" : "plain"), + POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, + index, llh->llh_count); + if (cd != NULL) cd->lpcd_last_idx = last_called_index; @@ -638,7 +757,7 @@ static int llog_process_thread(void *arg) * retry until the umount or abort recovery, see * lod_sub_recovery_thread() */ CERROR("%s retry remote llog process\n", - loghandle->lgh_ctxt->loc_obd->obd_name); + loghandle2name(loghandle)); rc = -EAGAIN; } else { /* something bad happened to the processing of a local @@ -647,7 +766,7 @@ static int llog_process_thread(void *arg) * discard any remaining bits in the header */ CERROR("%s: Local llog found corrupted #"DOSTID":%x" " %s index %d count %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : @@ -687,7 +806,8 @@ static int llog_process_thread_daemonize(void *arg) * used outside of the kernel itself, because it calls * free_nsproxy() which is not exported by the kernel * (defined in kernel/nsproxy.c) */ - atomic_dec(&curr_ns->count); + if (curr_ns) + atomic_dec(&curr_ns->count); } task_unlock(lpi->lpi_reftask); @@ -742,7 +862,7 @@ int llog_process_or_fork(const struct lu_env *env, if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: cannot start thread: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, rc); + loghandle2name(loghandle), rc); GOTO(out_lpi, rc); } wait_for_completion(&lpi->lpi_completion); @@ -979,12 +1099,11 @@ int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, RETURN(-EPROTO); } else if (th == NULL) { CERROR("%s: missed transaction handle\n", - handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name); + loghandle2name(handle)); RETURN(-EPROTO); } else if (handle->lgh_hdr == NULL) { CERROR("%s: loghandle %p with no header\n", - handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name, - handle); + loghandle2name(handle), handle); RETURN(-EPROTO); } @@ -1073,6 +1192,9 @@ int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(d->dd_rdonly))) + RETURN(-EROFS); + th = dt_trans_create(env, d); if (IS_ERR(th)) GOTO(out, rc = PTR_ERR(th)); @@ -1140,7 +1262,8 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, { struct dt_device *dt; struct thandle *th; - int rc; + bool need_cookie; + int rc; ENTRY; @@ -1150,6 +1273,9 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -1163,8 +1289,21 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, if (rc) GOTO(out_trans, rc); + need_cookie = !(idx == LLOG_HEADER_IDX || idx == LLOG_NEXT_IDX); + down_write(&loghandle->lgh_lock); - rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + if (need_cookie) { + struct llog_thread_info *lti = llog_info(env); + + /* cookie comes from llog_process_thread */ + rc = llog_write_rec(env, loghandle, rec, <i->lgi_cookie, + rec->lrh_index, th); + /* upper layer didn`t pass cookie so change rc */ + rc = (rc == 1 ? 
0 : rc); + } else { + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + } + up_write(&loghandle->lgh_lock); out_trans: dt_trans_stop(env, dt, th); @@ -1211,20 +1350,7 @@ EXPORT_SYMBOL(llog_open); int llog_close(const struct lu_env *env, struct llog_handle *loghandle) { - struct llog_operations *lop; - int rc; - - ENTRY; - - rc = llog_handle2ops(loghandle, &lop); - if (rc) - GOTO(out, rc); - if (lop->lop_close == NULL) - GOTO(out, rc = -EOPNOTSUPP); - rc = lop->lop_close(env, loghandle); -out: - llog_handle_put(loghandle); - RETURN(rc); + return llog_handle_put(env, loghandle); } EXPORT_SYMBOL(llog_close); @@ -1348,8 +1474,9 @@ __u64 llog_size(const struct lu_env *env, struct llog_handle *llh) rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); if (rc) { - CERROR("%s: attr_get failed, rc = %d\n", - llh->lgh_ctxt->loc_obd->obd_name, rc); + CERROR("%s: attr_get failed for "DFID": rc = %d\n", + loghandle2name(llh), PFID(&llh->lgh_id.lgl_oi.oi_fid), + rc); return 0; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c index e85e08bbd10c6..91f029052585e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -88,13 +88,12 @@ static int llog_cat_new_log(const struct lu_env *env, if (cathandle->lgh_name == NULL) { CWARN("%s: there are no more free slots in catalog " DFID":%x\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), PFID(&cathandle->lgh_id.lgl_oi.oi_fid), cathandle->lgh_id.lgl_ogen); } else { CWARN("%s: there are no more free slots in " - "catalog %s\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + "catalog %s\n", loghandle2name(loghandle), cathandle->lgh_name); } RETURN(-ENOSPC); @@ -153,7 +152,7 @@ static int llog_cat_new_log(const struct lu_env *env, GOTO(out, rc = 0); } else if (rc != 0) { CERROR("%s: can't create new plain llog in catalog: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, rc); + loghandle2name(loghandle), rc); GOTO(out, rc); } @@ -213,11 +212,137 @@ static int llog_cat_new_log(const struct lu_env *env, loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; /* this is to mimic full log, so another llog_cat_current_log() * can skip it and ask for another onet */ - loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) + 1; + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(loghandle->lgh_hdr) + 1; llog_trans_destroy(env, loghandle, th); + if (handle != NULL) + dt_trans_stop(env, dt, handle); RETURN(rc); } +static int llog_cat_refresh(const struct lu_env *env, + struct llog_handle *cathandle) +{ + struct llog_handle *loghandle; + int rc; + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + if (!llog_exist(loghandle)) + continue; + + rc = llog_read_header(env, loghandle, NULL); + if (rc) + goto unlock; + } + + rc = llog_read_header(env, cathandle, NULL); +unlock: + up_write(&loghandle->lgh_lock); + + return rc; +} + +/* + * prepare current/next log for catalog. + * + * if \a *ploghandle is NULL, open it, and declare create, NB, if \a + * *ploghandle is remote, create it synchronously here, see comments + * below. 
+ * + * \a cathandle->lgh_lock is down_read-ed, it gets down_write-ed if \a + * *ploghandle has to be opened. + */ +static int llog_cat_prep_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle **ploghandle, + struct thandle *th) +{ + int rc; + int sem_upgraded; + +start: + rc = 0; + sem_upgraded = 0; + if (IS_ERR_OR_NULL(*ploghandle)) { + up_read(&cathandle->lgh_lock); + down_write(&cathandle->lgh_lock); + sem_upgraded = 1; + if (IS_ERR_OR_NULL(*ploghandle)) { + struct llog_handle *loghandle; + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (!rc) { + *ploghandle = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + if (rc) + GOTO(out, rc); + } + + rc = llog_exist(*ploghandle); + if (rc < 0) + GOTO(out, rc); + if (rc) + GOTO(out, rc = 0); + + if (dt_object_remote(cathandle->lgh_obj)) { + down_write_nested(&(*ploghandle)->lgh_lock, LLOGH_LOG); + if (!llog_exist(*ploghandle)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this transaction. So let's + * create the llog object synchronously here to + * remove the dependency. */ + rc = llog_cat_new_log(env, cathandle, *ploghandle, + NULL); + if (rc == -ESTALE) { + up_write(&(*ploghandle)->lgh_lock); + if (sem_upgraded) + up_write(&cathandle->lgh_lock); + else + up_read(&cathandle->lgh_lock); + + rc = llog_cat_refresh(env, cathandle); + down_read_nested(&cathandle->lgh_lock, + LLOGH_CAT); + if (rc) + return rc; + /* *ploghandle might become NULL, restart */ + goto start; + } + } + up_write(&(*ploghandle)->lgh_lock); + } else { + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + + rc = llog_declare_create(env, *ploghandle, th); + if (rc) + GOTO(out, rc); + + lirec->lid_hdr.lrh_len = sizeof(*lirec); + rc = llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, + th); + } + +out: + if (sem_upgraded) { + up_write(&cathandle->lgh_lock); + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + if (rc == 0) + goto start; + } + return rc; +} + /* Open an existent log handle and add it to the open list. * This log handle will be closed when all of the records in it are removed. 
* @@ -249,14 +374,21 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { if (cgl->lgl_ogen != logid->lgl_ogen) { CWARN("%s: log "DFID" generation %x != %x\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), PFID(&logid->lgl_oi.oi_fid), cgl->lgl_ogen, logid->lgl_ogen); continue; } + *res = llog_handle_get(loghandle); + if (!*res) { + CERROR("%s: log "DFID" refcount is zero!\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid)); + continue; + } loghandle->u.phd.phd_cat_handle = cathandle; up_write(&cathandle->lgh_lock); - GOTO(out, rc = 0); + RETURN(rc); } } up_write(&cathandle->lgh_lock); @@ -265,18 +397,20 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, LLOG_OPEN_EXISTS); if (rc < 0) { CERROR("%s: error opening log id "DFID":%x: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, - PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen, rc); + loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid), + logid->lgl_ogen, rc); RETURN(rc); } rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | fmt, NULL); if (rc < 0) { llog_close(env, loghandle); - loghandle = NULL; + *res = NULL; RETURN(rc); } + *res = llog_handle_get(loghandle); + LASSERT(*res); down_write(&cathandle->lgh_lock); list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); up_write(&cathandle->lgh_lock); @@ -285,11 +419,7 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; loghandle->u.phd.phd_cookie.lgc_index = loghandle->lgh_hdr->llh_cat_idx; - EXIT; -out: - llog_handle_get(loghandle); - *res = loghandle; - return 0; + RETURN(0); } int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) @@ -314,8 +444,7 @@ int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) if (rc) CERROR("%s: failure destroying log during " "cleanup: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - rc); + loghandle2name(loghandle), rc); index = loghandle->u.phd.phd_cookie.lgc_index; llog_cat_cleanup(env, cathandle, NULL, index); @@ -401,7 +530,7 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, * meet this situation. 
*/ if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { CERROR("%s: next log does not exist!\n", - cathandle->lgh_ctxt->loc_obd->obd_name); + loghandle2name(cathandle)); loghandle = ERR_PTR(-EIO); if (cathandle->u.chd.chd_next_log == NULL) { /* Store the error in chd_next_log, so @@ -425,40 +554,6 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, RETURN(loghandle); } -static int llog_cat_update_header(const struct lu_env *env, - struct llog_handle *cathandle) -{ - struct llog_handle *loghandle; - int rc; - ENTRY; - - /* refresh llog */ - down_write(&cathandle->lgh_lock); - if (!cathandle->lgh_stale) { - up_write(&cathandle->lgh_lock); - RETURN(0); - } - list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, - u.phd.phd_entry) { - if (!llog_exist(loghandle)) - continue; - - rc = llog_read_header(env, loghandle, NULL); - if (rc != 0) { - up_write(&cathandle->lgh_lock); - GOTO(out, rc); - } - } - rc = llog_read_header(env, cathandle, NULL); - if (rc == 0) - cathandle->lgh_stale = 0; - up_write(&cathandle->lgh_lock); - if (rc != 0) - GOTO(out, rc); -out: - RETURN(rc); -} - /* Add a single record to the recovery log(s) using a catalog * Returns as llog_write_record * @@ -512,7 +607,7 @@ int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, if (retried++ == 0) GOTO(retry, rc); CERROR("%s: error on 2nd llog: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, rc); + loghandle2name(cathandle), rc); } RETURN(rc); @@ -523,167 +618,43 @@ int llog_cat_declare_add_rec(const struct lu_env *env, struct llog_handle *cathandle, struct llog_rec_hdr *rec, struct thandle *th) { - struct llog_thread_info *lgi = llog_info(env); - struct llog_logid_rec *lirec = &lgi->lgi_logid; - struct llog_handle *loghandle, *next; - int rc = 0; + int rc; ENTRY; - if (cathandle->u.chd.chd_current_log == NULL) { - /* declare new plain llog */ - down_write(&cathandle->lgh_lock); - if (cathandle->u.chd.chd_current_log == NULL) { - rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, - NULL, NULL, LLOG_OPEN_NEW); - if (rc == 0) { - cathandle->u.chd.chd_current_log = loghandle; - list_add_tail(&loghandle->u.phd.phd_entry, - &cathandle->u.chd.chd_head); - } - } - up_write(&cathandle->lgh_lock); - } else if (cathandle->u.chd.chd_next_log == NULL || - IS_ERR(cathandle->u.chd.chd_next_log)) { - /* declare next plain llog */ - down_write(&cathandle->lgh_lock); - if (cathandle->u.chd.chd_next_log == NULL || - IS_ERR(cathandle->u.chd.chd_next_log)) { - rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, - NULL, NULL, LLOG_OPEN_NEW); - if (rc == 0) { - cathandle->u.chd.chd_next_log = loghandle; - list_add_tail(&loghandle->u.phd.phd_entry, - &cathandle->u.chd.chd_head); - } - } - up_write(&cathandle->lgh_lock); - } +start: + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + rc = llog_cat_prep_log(env, cathandle, + &cathandle->u.chd.chd_current_log, th); if (rc) - GOTO(out, rc); + GOTO(unlock, rc); - lirec->lid_hdr.lrh_len = sizeof(*lirec); - - if (!llog_exist(cathandle->u.chd.chd_current_log)) { - if (dt_object_remote(cathandle->lgh_obj)) { - /* For remote operation, if we put the llog object - * creation in the current transaction, then the - * llog object will not be created on the remote - * target until the transaction stop, if other - * operations start before the transaction stop, - * and use the same llog object, will be dependent - * on the success of this transaction. So let's - * create the llog object synchronously here to - * remove the dependency. 
*/ -create_again: - down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); - loghandle = cathandle->u.chd.chd_current_log; - down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); - if (cathandle->lgh_stale) { - up_write(&loghandle->lgh_lock); - up_read(&cathandle->lgh_lock); - GOTO(out, rc = -EIO); - } - if (!llog_exist(loghandle)) { - rc = llog_cat_new_log(env, cathandle, loghandle, - NULL); - if (rc == -ESTALE) - cathandle->lgh_stale = 1; - } - up_write(&loghandle->lgh_lock); - up_read(&cathandle->lgh_lock); - if (rc == -ESTALE) { - rc = llog_cat_update_header(env, cathandle); - if (rc != 0) - GOTO(out, rc); - goto create_again; - } else if (rc < 0) { - GOTO(out, rc); - } - } else { - rc = llog_declare_create(env, - cathandle->u.chd.chd_current_log, th); - if (rc) - GOTO(out, rc); - llog_declare_write_rec(env, cathandle, - &lirec->lid_hdr, -1, th); - } - } + rc = llog_cat_prep_log(env, cathandle, &cathandle->u.chd.chd_next_log, + th); + if (rc) + GOTO(unlock, rc); -write_again: - /* declare records in the llogs */ rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, rec, -1, th); - if (rc == -ESTALE) { - down_write(&cathandle->lgh_lock); - if (cathandle->lgh_stale) { - up_write(&cathandle->lgh_lock); - GOTO(out, rc = -EIO); - } - - cathandle->lgh_stale = 1; - up_write(&cathandle->lgh_lock); - rc = llog_cat_update_header(env, cathandle); - if (rc != 0) - GOTO(out, rc); - goto write_again; - } else if (rc < 0) { - GOTO(out, rc); + if (rc == -ESTALE && dt_object_remote(cathandle->lgh_obj)) { + up_read(&cathandle->lgh_lock); + rc = llog_cat_refresh(env, cathandle); + if (rc) + RETURN(rc); + goto start; } - next = cathandle->u.chd.chd_next_log; - if (!IS_ERR_OR_NULL(next)) { - if (!llog_exist(next)) { - if (dt_object_remote(cathandle->lgh_obj)) { - /* For remote operation, if we put the llog - * object creation in the current transaction, - * then the llog object will not be created on - * the remote target until the transaction stop, - * if other operations start before the - * transaction stop, and use the same llog - * object, will be dependent on the success of - * this transaction. So let's create the llog - * object synchronously here to remove the - * dependency. */ - down_write_nested(&cathandle->lgh_lock, - LLOGH_CAT); - next = cathandle->u.chd.chd_next_log; - if (IS_ERR_OR_NULL(next)) { - /* Sigh, another thread just tried, - * let's fail as well */ - up_write(&cathandle->lgh_lock); - if (next == NULL) - rc = -EIO; - else - rc = PTR_ERR(next); - GOTO(out, rc); - } - - down_write_nested(&next->lgh_lock, LLOGH_LOG); - if (!llog_exist(next)) { - rc = llog_cat_new_log(env, cathandle, - next, NULL); - if (rc < 0) - cathandle->u.chd.chd_next_log = - ERR_PTR(rc); - } - up_write(&next->lgh_lock); - up_write(&cathandle->lgh_lock); - if (rc < 0) - GOTO(out, rc); - } else { - rc = llog_declare_create(env, next, th); - llog_declare_write_rec(env, cathandle, - &lirec->lid_hdr, -1, th); - } - } - /* XXX: we hope for declarations made for existing llog - * this might be not correct with some backends - * where declarations are expected against specific - * object like ZFS with full debugging enabled */ - /*llog_declare_write_rec(env, next, rec, -1, th);*/ - } -out: +#if 0 + /* + * XXX: we hope for declarations made for existing llog this might be + * not correct with some backends where declarations are expected + * against specific object like ZFS with full debugging enabled. 
+ */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_next_log, rec, -1, + th); +#endif +unlock: + up_read(&cathandle->lgh_lock); RETURN(rc); } EXPORT_SYMBOL(llog_cat_declare_add_rec); @@ -746,8 +717,7 @@ int llog_cat_cancel_records(const struct lu_env *env, rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); if (rc) { CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x" - ": rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, + ": rc = %d\n", loghandle2name(cathandle), PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); failed++; continue; @@ -762,8 +732,7 @@ int llog_cat_cancel_records(const struct lu_env *env, */ lrc = -ENOENT; CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" - ": rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, + ": rc = %d\n", loghandle2name(cathandle), PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, lrc); failed++; if (rc == 0) @@ -786,68 +755,86 @@ int llog_cat_cancel_records(const struct lu_env *env, if (rc == 0) rc = lrc; } - llog_handle_put(loghandle); + llog_handle_put(env, loghandle); } if (rc) CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, - rc); + loghandle2name(cathandle), failed, count, rc); RETURN(rc); } EXPORT_SYMBOL(llog_cat_cancel_records); -static int llog_cat_process_cb(const struct lu_env *env, - struct llog_handle *cat_llh, - struct llog_rec_hdr *rec, void *data) +static int llog_cat_process_common(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, + struct llog_handle **llhp) { - struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *llh; + struct llog_logid_rec *lir = container_of(rec, typeof(*lir), lid_hdr); struct llog_log_hdr *hdr; int rc; ENTRY; - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); + if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) { + rc = -EINVAL; + CWARN("%s: invalid record in catalog "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid), + cat_llh->lgh_id.lgl_ogen, rc); + RETURN(rc); } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " - DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog "DFID"\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id); if (rc) { - CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - if (rc == -ENOENT || rc == -ESTALE) { - /* After a server crash, a stub of index - * record in catlog could be kept, because - * plain log destroy + catlog index record - * deletion are not atomic. So we end up with - * an index but no actual record. Destroy the - * index and move on. */ - rc = llog_cat_cleanup(env, cat_llh, NULL, - rec->lrh_index); - } + /* After a server crash, a stub of index record in catlog could + * be kept, because plain log destroy + catlog index record + * deletion are not atomic. So we end up with an index but no + * actual record. Destroy the index and move on. 
*/ + if (rc == -ENOENT || rc == -ESTALE) + rc = LLOG_DEL_RECORD; + else if (rc) + CWARN("%s: can't find llog handle "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen, rc); RETURN(rc); } /* clean old empty llogs, do not consider current llog in use */ - /* ignore remote (lgh_obj=NULL) llogs */ - hdr = llh->lgh_hdr; + /* ignore remote (lgh_obj == NULL) llogs */ + hdr = (*llhp)->lgh_hdr; if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && hdr->llh_count == 1 && cat_llh->lgh_obj != NULL && - llh != cat_llh->u.chd.chd_current_log) { - rc = llog_destroy(env, llh); + *llhp != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, *llhp); if (rc) - CERROR("%s: fail to destroy empty log: rc = %d\n", - llh->lgh_ctxt->loc_obd->obd_name, rc); - GOTO(out, rc = LLOG_DEL_PLAIN); + CWARN("%s: can't destroy empty log "DFID": rc = %d\n", + loghandle2name((*llhp)), + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + rc = LLOG_DEL_PLAIN; } + RETURN(rc); +} + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh = NULL; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + if (rc) + GOTO(out, rc); + if (rec->lrh_index < d->lpd_startcat) { /* Skip processing of the logs until startcat */ rc = 0; @@ -864,13 +851,29 @@ static int llog_cat_process_cb(const struct lu_env *env, rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, NULL, false); } + if (rc == -ENOENT && (cat_llh->lgh_hdr->llh_flags & LLOG_F_RM_ON_ERR)) { + /* + * plain llog is reported corrupted, so better to just remove + * it if the caller is fine with that. + */ + CERROR("%s: remove corrupted/missing llog "DFID"\n", + loghandle2name(cat_llh), + PFID(&llh->lgh_id.lgl_oi.oi_fid)); + rc = LLOG_DEL_PLAIN; + } out: /* The empty plain log was destroyed while processing */ - if (rc == LLOG_DEL_PLAIN) + if (rc == LLOG_DEL_PLAIN) { rc = llog_cat_cleanup(env, cat_llh, llh, llh->u.phd.phd_cookie.lgc_index); - llog_handle_put(llh); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } + + if (llh) + llog_handle_put(env, llh); RETURN(rc); } @@ -880,43 +883,62 @@ int llog_cat_process_or_fork(const struct lu_env *env, llog_cb_t cb, void *data, int startcat, int startidx, bool fork) { - struct llog_process_data d; - struct llog_log_hdr *llh = cat_llh->lgh_hdr; - int rc; - ENTRY; + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; - LASSERT(llh->llh_flags & LLOG_F_IS_CAT); - d.lpd_data = data; - d.lpd_cb = cb; - d.lpd_startcat = startcat; - d.lpd_startidx = startidx; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = (startcat == LLOG_CAT_FIRST ? 
0 : startcat); + d.lpd_startidx = startidx; if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && llh->llh_count > 1) { struct llog_process_cat_data cd; CWARN("%s: catlog "DFID" crosses index zero\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, + loghandle2name(cat_llh), PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - - cd.lpcd_first_idx = llh->llh_cat_idx; - cd.lpcd_last_idx = 0; - rc = llog_process_or_fork(env, cat_llh, cat_cb, - &d, &cd, fork); - if (rc != 0) - RETURN(rc); - - cd.lpcd_first_idx = 0; + /*startcat = 0 is default value for general processing */ + if ((startcat != LLOG_CAT_FIRST && + startcat >= llh->llh_cat_idx) || !startcat) { + /* processing the catalog part at the end */ + cd.lpcd_first_idx = (startcat ? startcat : + llh->llh_cat_idx); + if (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS)) + cd.lpcd_last_idx = cfs_fail_val; + else + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + /* Reset the startcat becasue it has already reached + * catalog bottom. + */ + startcat = 0; + if (rc != 0) + RETURN(rc); + } + /* processing the catalog part at the begining */ + cd.lpcd_first_idx = (startcat == LLOG_CAT_FIRST) ? 0 : startcat; + /* Note, the processing will stop at the lgh_last_idx value, + * and it could be increased during processing. So records + * between current lgh_last_idx and lgh_last_idx in future + * would left unprocessed. + */ cd.lpcd_last_idx = cat_llh->lgh_last_idx; rc = llog_process_or_fork(env, cat_llh, cat_cb, &d, &cd, fork); - } else { + } else { rc = llog_process_or_fork(env, cat_llh, cat_cb, &d, NULL, fork); - } + } - RETURN(rc); + RETURN(rc); } +EXPORT_SYMBOL(llog_cat_process_or_fork); int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data, int startcat, int startidx) @@ -931,39 +953,33 @@ static int llog_cat_size_cb(const struct lu_env *env, struct llog_rec_hdr *rec, void *data) { struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *llh; - int rc; + struct llog_handle *llh = NULL; __u64 *cum_size = d->lpd_data; __u64 size; + int rc; ENTRY; - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("%s: invalid record in catalog, rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, -EINVAL); - RETURN(-EINVAL); - } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " - DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + rc = llog_cat_process_common(env, cat_llh, rec, &llh); - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); - if (rc) { - CWARN("%s: cannot find handle for llog "DFID": rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - RETURN(0); - } - size = llog_size(env, llh); - *cum_size += size; + if (rc == LLOG_DEL_PLAIN) { + /* empty log was deleted, don't count it */ + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } else { + size = llog_size(env, llh); + *cum_size += size; - CDEBUG(D_INFO, "Add llog entry "DFID" size %llu\n", - PFID(&llh->lgh_id.lgl_oi.oi_fid), size); + CDEBUG(D_INFO, "Add llog entry "DFID" size=%llu, tot=%llu\n", + PFID(&llh->lgh_id.lgl_oi.oi_fid), size, *cum_size); + } - llog_handle_put(llh); + if (llh != NULL) + llog_handle_put(env, llh); RETURN(0); - } __u64 llog_cat_size(const struct lu_env *env, struct 
llog_handle *cat_llh) @@ -977,65 +993,58 @@ __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) } EXPORT_SYMBOL(llog_cat_size); +/* currently returns the number of "free" entries in catalog, + * ie the available entries for a new plain LLOG file creation, + * even if catalog has wrapped + */ +__u32 llog_cat_free_space(struct llog_handle *cat_llh) +{ + /* simulate almost full Catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_CAT_FREE_RECORDS)) + return cfs_fail_val; + + if (cat_llh->lgh_hdr->llh_count == 1) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1; + + if (cat_llh->lgh_last_idx > cat_llh->lgh_hdr->llh_cat_idx) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1 + + cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; + + /* catalog is presently wrapped */ + return cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; +} +EXPORT_SYMBOL(llog_cat_free_space); + static int llog_cat_reverse_process_cb(const struct lu_env *env, struct llog_handle *cat_llh, struct llog_rec_hdr *rec, void *data) { struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; struct llog_handle *llh; - struct llog_log_hdr *hdr; int rc; - if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); - } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " - DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - le32_to_cpu(rec->lrh_index), - PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); - if (rc) { - CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - if (rc == -ENOENT || rc == -ESTALE) { - /* After a server crash, a stub of index - * record in catlog could be kept, because - * plain log destroy + catlog index record - * deletion are not atomic. So we end up with - * an index but no actual record. Destroy the - * index and move on. 
*/ - rc = llog_cat_cleanup(env, cat_llh, NULL, - rec->lrh_index); - } - - RETURN(rc); - } + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); - /* clean old empty llogs, do not consider current llog in use */ - hdr = llh->lgh_hdr; - if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && - hdr->llh_count == 1 && - llh != cat_llh->u.chd.chd_current_log) { - rc = llog_destroy(env, llh); - if (rc) - CERROR("%s: fail to destroy empty log: rc = %d\n", - llh->lgh_ctxt->loc_obd->obd_name, rc); - GOTO(out, rc = LLOG_DEL_PLAIN); + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) { + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); } + if (rc) + RETURN(rc); rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); -out: /* The empty plain was destroyed while processing */ if (rc == LLOG_DEL_PLAIN) rc = llog_cat_cleanup(env, cat_llh, llh, llh->u.phd.phd_cookie.lgc_index); - llog_handle_put(llh); + llog_handle_put(env, llh); RETURN(rc); } @@ -1056,7 +1065,7 @@ int llog_cat_reverse_process(const struct lu_env *env, if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && llh->llh_count > 1) { CWARN("%s: catalog "DFID" crosses index zero\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, + loghandle2name(cat_llh), PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); cd.lpcd_first_idx = 0; @@ -1114,7 +1123,7 @@ static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) } } - CDEBUG(D_RPCTRACE, "catlog "DFID" first idx %u, last_idx %u\n", + CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", PFID(&cathandle->lgh_id.lgl_oi.oi_fid), llh->llh_cat_idx, cathandle->lgh_last_idx); } @@ -1127,11 +1136,13 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle *loghandle, int index) { int rc; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; LASSERT(index); if (loghandle != NULL) { /* remove destroyed llog from catalog list and * chd_current_log variable */ + fid = loghandle->lgh_id.lgl_oi.oi_fid; down_write(&cathandle->lgh_lock); if (cathandle->u.chd.chd_current_log == loghandle) cathandle->u.chd.chd_current_log = NULL; @@ -1150,7 +1161,9 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, llog_cat_set_first_idx(cathandle, index); rc = llog_cancel_rec(env, cathandle, index); if (rc == 0) - CDEBUG(D_HA, "cancel plain log at index %u of catalog "DFID"\n", - index, PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, + "cancel plain log "DFID" at index %u of catalog "DFID"\n", + PFID(&fid), index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h index eb9526ad504d0..c42f13ea6824f 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,8 +74,8 @@ static inline struct llog_thread_info *llog_info(const struct lu_env *env) int llog_info_init(void); void llog_info_fini(void); -void llog_handle_get(struct llog_handle *loghandle); -void llog_handle_put(struct llog_handle *loghandle); +struct llog_handle *llog_handle_get(struct llog_handle *loghandle); +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle); int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle **res, struct llog_logid *logid); int class_config_dump_handler(const struct lu_env *env, @@ -92,4 +92,9 @@ static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) { return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); } +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec); +static inline char *loghandle2name(const struct llog_handle *lgh) +{ + return lgh->lgh_ctxt->loc_obd->obd_name; +} #endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c index 906e6e64ef4e6..276ffa8280c84 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,14 +33,16 @@ #define DEBUG_SUBSYSTEM S_LOG #include -#include +#include #include #include "llog_internal.h" static int str2logid(struct llog_logid *logid, char *str, int len) { - char *start, *end, *endp; - __u64 id, seq; + unsigned long long id, seq; + char *start, *end; + u32 ogen; + int rc; ENTRY; start = str; @@ -56,10 +58,12 @@ static int str2logid(struct llog_logid *logid, char *str, int len) } #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) - /* logids used to be input in the form "#id#seq:ogen" before they + /* + * logids used to be input in the form "#id#seq:ogen" before they * were changed over to accept the FID [seq:oid:ver] format. * This is accepted for compatibility reasons, though I doubt - * anyone is actually using this for anything. */ + * anyone is actually using this for anything. 
+ */ if (start[0] != '#') RETURN(-EINVAL); @@ -71,34 +75,37 @@ static int str2logid(struct llog_logid *logid, char *str, int len) RETURN(-EINVAL); *end = '\0'; - id = simple_strtoull(start, &endp, 0); - if (endp != end) - RETURN(-EINVAL); + rc = kstrtoull(start, 0, &id); + if (rc) + RETURN(rc); - start = ++end; - if (start - str >= len - 1) - RETURN(-EINVAL); - end = strchr(start, '#'); - if (end == NULL || end == start) - RETURN(-EINVAL); + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); - *end = '\0'; - seq = simple_strtoull(start, &endp, 0); - if (endp != end) - RETURN(-EINVAL); + end = strchr(start, '#'); + if (!end || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &seq); + if (rc) + RETURN(rc); ostid_set_seq(&logid->lgl_oi, seq); if (ostid_set_id(&logid->lgl_oi, id)) RETURN(-EINVAL); start = ++end; - if (start - str >= len - 1) - RETURN(-EINVAL); - logid->lgl_ogen = simple_strtoul(start, &endp, 16); - if (*endp != '\0') + if (start - str >= len - 1) + RETURN(-EINVAL); + + rc = kstrtouint(start, 16, &ogen); + if (rc) RETURN(-EINVAL); + logid->lgl_ogen = ogen; - RETURN(0); + RETURN(0); #else RETURN(-EINVAL); #endif @@ -107,29 +114,31 @@ static int str2logid(struct llog_logid *logid, char *str, int len) static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + struct obd_ioctl_data *ioc_data = data; static int l, remains; static long from, to; - static char *out; - char *endp; - int cur_index, rc = 0; - - ENTRY; + static char *out; + int cur_index; + int rc = 0; + ENTRY; if (ioc_data && ioc_data->ioc_inllen1 > 0) { - l = 0; - remains = ioc_data->ioc_inllen4 + - cfs_size_round(ioc_data->ioc_inllen1) + - cfs_size_round(ioc_data->ioc_inllen2) + - cfs_size_round(ioc_data->ioc_inllen3); - from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - ioc_data->ioc_inllen1 = 0; - out = ioc_data->ioc_bulk; + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; } cur_index = rec->lrh_index; @@ -139,17 +148,17 @@ static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, RETURN(-LLOG_EEMPTY); if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *loghandle; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - l = snprintf(out, remains, "[index]: %05d [type]: " - "%02x [len]: %04d failed\n", - cur_index, rec->lrh_type, - rec->lrh_len); - } - if (handle->lgh_ctxt == NULL) - RETURN(-EOPNOTSUPP); + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); if (rc) { CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", @@ -158,16 +167,16 @@ static int llog_check_cb(const struct lu_env 
*env, struct llog_handle *handle, RETURN(rc); } rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); - llog_handle_put(loghandle); + llog_handle_put(env, loghandle); } else { bool ok; - switch (rec->lrh_type) { - case OST_SZ_REC: - case MDS_UNLINK_REC: + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: case MDS_UNLINK64_REC: - case MDS_SETATTR64_REC: - case OBD_CFG_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: case LLOG_GEN_REC: case LLOG_HDR_MAGIC: ok = true; @@ -194,43 +203,46 @@ static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + struct obd_ioctl_data *ioc_data = data; static int l, remains; static long from, to; - static char *out; - char *endp; - int cur_index; - - ENTRY; - if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) { - l = 0; - remains = ioc_data->ioc_inllen4 + - cfs_size_round(ioc_data->ioc_inllen1) + - cfs_size_round(ioc_data->ioc_inllen2) + - cfs_size_round(ioc_data->ioc_inllen3); - from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - out = ioc_data->ioc_bulk; - ioc_data->ioc_inllen1 = 0; - } - - cur_index = rec->lrh_index; - if (cur_index < from) - RETURN(0); - if (to > 0 && cur_index > to) - RETURN(-LLOG_EEMPTY); - - if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); - } + static char *out; + int cur_index; + int rc; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } l = snprintf(out, remains, "[index]: %05d [logid]: "DFID":%x\n", @@ -247,21 +259,21 @@ static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, l = snprintf(out, remains, "[index]: %05d [type]: %02x [len]: %04d\n", cur_index, rec->lrh_type, rec->lrh_len); - } - out += l; - remains -= l; - if (remains <= 0) { - CERROR("not enough space for print log records\n"); - RETURN(-LLOG_EEMPTY); - } - - RETURN(0); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); } static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, struct llog_logid *logid) { - struct llog_handle *log; - int rc; + struct llog_handle *log; + int rc; ENTRY; @@ -280,7 +292,7 @@ static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, } llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); out: - llog_handle_put(log); + 
llog_handle_put(env, log); RETURN(rc); } @@ -288,8 +300,8 @@ static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - int rc; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; ENTRY; if (rec->lrh_type != LLOG_LOGID_MAGIC) @@ -303,15 +315,16 @@ static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, struct obd_ioctl_data *data) { - struct llog_logid logid; - int rc = 0; - struct llog_handle *handle = NULL; - char *logname; + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname, start; ENTRY; logname = data->ioc_inlbuf1; - if (logname[0] == '#' || logname[0] == '[') { + start = logname[0]; + if (start == '#' || start == '[') { rc = str2logid(&logid, logname, data->ioc_inllen1); if (rc) RETURN(rc); @@ -319,8 +332,8 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, LLOG_OPEN_EXISTS); if (rc) RETURN(rc); - } else if (logname[0] == '$' || isalpha(logname[0])) { - if (logname[0] == '$') + } else if (start == '$' || isalpha(start) || isdigit(start)) { + if (start == '$') logname++; rc = llog_open(env, ctxt, &handle, NULL, logname, @@ -328,7 +341,10 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, if (rc) RETURN(rc); } else { - RETURN(-EINVAL); + rc = -EINVAL; + CDEBUG(D_INFO, "%s: invalid log name '%s': rc = %d\n", + ctxt->loc_obd->obd_name, logname, rc); + RETURN(rc); } rc = llog_init_handle(env, handle, 0, NULL); @@ -337,10 +353,10 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, switch (cmd) { case OBD_IOC_LLOG_INFO: { - int l; - int remains = data->ioc_inllen2 + + int l; + int remains = data->ioc_inllen2 + cfs_size_round(data->ioc_inllen1); - char *out = data->ioc_bulk; + char *out = data->ioc_bulk; l = snprintf(out, remains, "logid: "DFID":%x\n" @@ -382,11 +398,12 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, case OBD_IOC_LLOG_CANCEL: { struct llog_cookie cookie; struct llog_logid plain; - char *endp; + u32 lgc_index; - cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); - if (*endp != '\0') - GOTO(out_close, rc = -EINVAL); + rc = kstrtouint(data->ioc_inlbuf3, 0, &lgc_index); + if (rc) + GOTO(out_close, rc); + cookie.lgc_index = lgc_index; if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { rc = llog_cancel_rec(env, handle, cookie.lgc_index); @@ -453,11 +470,11 @@ int llog_catalog_list(const struct lu_env *env, struct dt_device *d, int count, struct obd_ioctl_data *data, const struct lu_fid *fid) { - int size, i; - struct llog_catid *idarray; - struct llog_logid *id; - char *out; - int l, remains, rc = 0; + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; ENTRY; @@ -480,15 +497,28 @@ int llog_catalog_list(const struct lu_env *env, struct dt_device *d, out = data->ioc_bulk; remains = data->ioc_inllen1; - for (i = 0; i < count; i++) { + /* OBD_FAIL: fetch the catalog records from the specified one */ + if (OBD_FAIL_CHECK(OBD_FAIL_CATLIST)) + data->ioc_count = cfs_fail_val - 1; + for (i = data->ioc_count; i < count; i++) { id = &idarray[i].lci_logid; l = snprintf(out, remains, "catalog_log: "DFID":%x\n", - PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + 
PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); out += l; remains -= l; - if (remains <= 0) - break; + if (remains <= 0) { + if (remains < 0) { + /* the print is not complete */ + remains += l; + data->ioc_bulk[out - data->ioc_bulk - l] = '\0'; + data->ioc_count = i; + } else { + data->ioc_count = i++; + } + goto out; + } } + data->ioc_count = 0; out: OBD_FREE_LARGE(idarray, size); RETURN(rc); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c index a5cdc6e184185..1d1f953992301 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -40,36 +40,36 @@ /* helper functions for calling the llog obd methods */ static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) { - struct llog_ctxt *ctxt; + struct llog_ctxt *ctxt; - OBD_ALLOC_PTR(ctxt); - if (!ctxt) - return NULL; + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; - ctxt->loc_obd = obd; + ctxt->loc_obd = obd; atomic_set(&ctxt->loc_refcount, 1); - return ctxt; + return ctxt; } static void llog_ctxt_destroy(struct llog_ctxt *ctxt) { - if (ctxt->loc_exp) { - class_export_put(ctxt->loc_exp); - ctxt->loc_exp = NULL; - } - if (ctxt->loc_imp) { - class_import_put(ctxt->loc_imp); - ctxt->loc_imp = NULL; - } - OBD_FREE_PTR(ctxt); + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); } int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct obd_llog_group *olg = ctxt->loc_olg; - struct obd_device *obd; - int rc = 0; + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; spin_lock(&olg->olg_lock); if (!atomic_dec_and_test(&ctxt->loc_refcount)) { @@ -84,16 +84,18 @@ int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) /* sync with llog ctxt user thread */ spin_unlock(&obd->obd_dev_lock); - /* obd->obd_starting is needed for the case of cleanup - * in error case while obd is starting up. */ - LASSERTF(obd->obd_starting == 1 || - obd->obd_stopping == 1 || obd->obd_set_up == 0, - "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, - !!obd->obd_stopping, !!obd->obd_set_up); + /* + * obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
+ */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); - /* cleanup the llog ctxt here */ - if (CTXTP(ctxt, cleanup)) - rc = CTXTP(ctxt, cleanup)(env, ctxt); + /* cleanup the llog ctxt here */ + if (ctxt->loc_logops->lop_cleanup) + rc = ctxt->loc_logops->lop_cleanup(env, ctxt); llog_ctxt_destroy(ctxt); wake_up(&olg->olg_waitq); @@ -103,39 +105,40 @@ EXPORT_SYMBOL(__llog_ctxt_put); int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); - struct obd_llog_group *olg; - int rc, idx; - ENTRY; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + + ENTRY; - LASSERT(ctxt != NULL); - LASSERT(ctxt != LP_POISON); + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); - olg = ctxt->loc_olg; - LASSERT(olg != NULL); - LASSERT(olg != LP_POISON); + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); - idx = ctxt->loc_idx; + idx = ctxt->loc_idx; /* - * Banlance the ctxt get when calling llog_cleanup() - */ + * Banlance the ctxt get when calling llog_cleanup() + */ LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); LASSERT(atomic_read(&ctxt->loc_refcount) > 1); - llog_ctxt_put(ctxt); + llog_ctxt_put(ctxt); /* * Try to free the ctxt. */ rc = __llog_ctxt_put(env, ctxt); - if (rc) - CERROR("Error %d while cleaning up ctxt %p\n", - rc, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); - l_wait_event(olg->olg_waitq, - llog_group_ctxt_null(olg, idx), &lwi); + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_cleanup); @@ -143,23 +146,24 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, struct obd_llog_group *olg, int index, struct obd_device *disk_obd, struct llog_operations *op) { - struct llog_ctxt *ctxt; - int rc = 0; - ENTRY; + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; - if (index < 0 || index >= LLOG_MAX_CTXTS) - RETURN(-EINVAL); + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); - LASSERT(olg != NULL); + LASSERT(olg != NULL); - ctxt = llog_new_ctxt(obd); - if (!ctxt) - RETURN(-ENOMEM); + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); - ctxt->loc_obd = obd; - ctxt->loc_olg = olg; - ctxt->loc_idx = index; - ctxt->loc_logops = op; + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; mutex_init(&ctxt->loc_mutex); if (disk_obd != NULL) ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); @@ -169,11 +173,11 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; - rc = llog_group_set_ctxt(olg, ctxt, index); - if (rc) { - llog_ctxt_destroy(ctxt); - if (rc == -EEXIST) { - ctxt = llog_group_get_ctxt(olg, index); + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); if (ctxt) { CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", obd->obd_name, index); @@ -188,10 +192,10 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, LASSERT(ctxt->loc_logops == op); llog_ctxt_put(ctxt); } - rc = 0; - } - RETURN(rc); - } + rc = 0; + } + RETURN(rc); + } if (op->lop_setup) { if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) @@ -205,48 +209,28 @@ 
int llog_setup(const struct lu_env *env, struct obd_device *obd, obd->obd_name, index, op->lop_setup, rc); llog_group_clear_ctxt(olg, index); llog_ctxt_destroy(ctxt); - } else { - CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", - obd->obd_name, index); - ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; - } + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_setup); int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) { - int rc = 0; - ENTRY; + int rc = 0; - if (!ctxt) - RETURN(0); + ENTRY; + if (ctxt && ctxt->loc_logops->lop_sync) + rc = ctxt->loc_logops->lop_sync(ctxt, exp, flags); - if (CTXTP(ctxt, sync)) - rc = CTXTP(ctxt, sync)(ctxt, exp, flags); - - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_sync); -int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags) -{ - int rc; - ENTRY; - - if (!ctxt) { - CERROR("No ctxt\n"); - RETURN(-ENODEV); - } - - CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); - rc = CTXTP(ctxt, cancel)(env, ctxt, cookies, flags); - RETURN(rc); -} -EXPORT_SYMBOL(llog_cancel); - /* context key constructor/destructor: llog_key_init, llog_key_fini */ LU_KEY_INIT_FINI(llog, struct llog_thread_info); /* context key: llog_thread_key */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c index ffa1ad0149b25..55088d417146d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,8 @@ #define DEBUG_SUBSYSTEM S_LOG +#include + #include #include #include @@ -124,8 +126,7 @@ static int llog_osd_create_new_object(const struct lu_env *env, static int llog_osd_exist(struct llog_handle *handle) { LASSERT(handle->lgh_obj); - return dt_object_exists(handle->lgh_obj) && - !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header); + return dt_object_exists(handle->lgh_obj) && !handle->lgh_destroyed; } static void *rec_tail(struct llog_rec_hdr *rec) @@ -362,7 +363,7 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, * the full llog record to write. This is * the beginning of buffer to write, the length * of buffer is stored in \a rec::lrh_len - * \param[out] reccookie pointer to the cookie to return back if needed. + * \param[in,out] reccookie pointer to the cookie to return back if needed. * It is used for further cancel of this llog * record. * \param[in] idx index of the llog record. If \a idx == -1 then @@ -490,26 +491,26 @@ static int llog_osd_write_rec(const struct lu_env *env, &lgi->lgi_off, th); RETURN(rc); - } else if (loghandle->lgh_cur_idx > 0) { + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; + } else if (reccookie != NULL && reccookie->lgc_index > 0) { /** - * The lgh_cur_offset can be used only if index is + * The lgc_offset can be used only if index is * the same. 
*/ - if (idx != loghandle->lgh_cur_idx) { + if (idx != reccookie->lgc_index) { CERROR("%s: modify index mismatch %d %d\n", o->do_lu.lo_dev->ld_obd->obd_name, idx, - loghandle->lgh_cur_idx); + reccookie->lgc_index); RETURN(-EFAULT); } - lgi->lgi_off = loghandle->lgh_cur_offset; + lgi->lgi_off = reccookie->lgc_offset; CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " "len:%u offset %llu\n", PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, rec->lrh_len, (long long)lgi->lgi_off); - } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { - lgi->lgi_off = llh->llh_hdr.lrh_len + - (idx - 1) * reclen; } else { /* This can be result of lgh_cur_idx is not set during * llog processing or llh_size is not set to proper @@ -590,6 +591,7 @@ static int llog_osd_write_rec(const struct lu_env *env, RETURN(-ENOSPC); } + down_write(&loghandle->lgh_last_sem); /* increment the last_idx along with llh_tail index, they should * be equal for a llog lifetime */ loghandle->lgh_last_idx++; @@ -673,6 +675,12 @@ static int llog_osd_write_rec(const struct lu_env *env, if (rc) GOTO(out, rc); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int)(loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + msleep(1 * MSEC_PER_SEC); + } /* computed index can be used to determine offset for fixed-size * records. This also allows to handle Catalog wrap around case */ if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { @@ -693,6 +701,8 @@ static int llog_osd_write_rec(const struct lu_env *env, if (rc < 0) GOTO(out, rc); + up_write(&loghandle->lgh_last_sem); + CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n", PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, lgi->lgi_off); @@ -726,6 +736,7 @@ static int llog_osd_write_rec(const struct lu_env *env, } LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; + up_write(&loghandle->lgh_last_sem); RETURN(rc); } @@ -781,19 +792,46 @@ static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, * big enough to handle the remapped records. It is also assumed that records * of a block have the same format (i.e.: the same features enabled). * - * \param[in,out] hdr Header of the block of records to remap. - * \param[in,out] last_hdr Last header, don't read past this point. - * \param[in] flags Flags describing the fields to keep. + * \param[in,out] hdr Header of the block of records to remap. + * \param[in,out] last_hdr Last header, don't read past this point. + * \param[in] flags Flags describing the fields to keep. + * \param[in] extra_flags Flags describing the extra fields to keep. 
*/ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, struct llog_rec_hdr *last_hdr, - enum changelog_rec_flags flags) + struct llog_handle *loghandle) { + enum changelog_rec_flags flags = CLF_SUPPORTED; + enum changelog_rec_extra_flags extra_flags = CLFE_SUPPORTED; + + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_XATTR)) + extra_flags &= ~CLFE_XATTR; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_OMODE)) + extra_flags &= ~CLFE_OPEN; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_NID)) + extra_flags &= ~CLFE_NID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_UIDGID)) + extra_flags &= ~CLFE_UIDGID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_EXTRA_FLAGS)) + flags &= ~CLF_EXTRA_FLAGS; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + flags &= ~CLF_JOBID; + + if (flags == CLF_SUPPORTED && extra_flags == CLFE_SUPPORTED) + return; + if (hdr->lrh_type != CHANGELOG_REC) return; do { struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1); + enum changelog_rec_extra_flags xflag = CLFE_INVALID; + + if (flags & CLF_EXTRA_FLAGS && + rec->cr_flags & CLF_EXTRA_FLAGS) { + xflag = changelog_rec_extra_flags(rec)->cr_extra_flags & + extra_flags; + } if (unlikely(hdr->lrh_len == 0)) { /* It is corruption case, we cannot know the next rec, @@ -810,7 +848,7 @@ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, break; } - changelog_remap_rec(rec, rec->cr_flags & flags); + changelog_remap_rec(rec, rec->cr_flags & flags, xflag); hdr = llog_rec_hdr_next(hdr); /* Yield CPU to avoid soft-lockup if there are too many records * to be handled. */ @@ -864,7 +902,7 @@ static int llog_osd_next_block(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - LASSERT(dt_object_exists(o)); + LASSERT(llog_osd_exist(loghandle)); dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); @@ -928,9 +966,25 @@ static int llog_osd_next_block(const struct lu_env *env, rec = buf; if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) lustre_swab_llog_rec(rec); - tail = (struct llog_rec_tail *)((char *)buf + rc - sizeof(struct llog_rec_tail)); + + if (llog_verify_record(loghandle, rec)) { + /* + * the block seems corrupted. make a pad record so the + * caller can skip the block and try with the next one + */ + rec->lrh_len = rc; + rec->lrh_index = next_idx; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = rc; + tail->lrt_index = next_idx; + + GOTO(out, rc = 0); + } + /* get the last record in block */ last_rec = (struct llog_rec_hdr *)((char *)buf + rc - tail->lrt_len); @@ -969,7 +1023,7 @@ static int llog_osd_next_block(const struct lu_env *env, /* sanity check that the start of the new buffer is no farther * than the record that we wanted. This shouldn't happen. 
*/ - if (rec->lrh_index > next_idx) { + if (next_idx && rec->lrh_index > next_idx) { if (!force_mini_rec && next_idx > last_idx) goto retry; @@ -980,9 +1034,7 @@ static int llog_osd_next_block(const struct lu_env *env, } /* Trim unsupported extensions for compat w/ older clients */ - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) - changelog_block_trim_ext(rec, last_rec, - CLF_VERSION | CLF_RENAME); + changelog_block_trim_ext(rec, last_rec, loghandle); GOTO(out, rc = 0); @@ -1040,7 +1092,7 @@ static int llog_osd_prev_block(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - LASSERT(dt_object_exists(o)); + LASSERT(llog_osd_exist(loghandle)); dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); @@ -1117,9 +1169,7 @@ static int llog_osd_prev_block(const struct lu_env *env, } /* Trim unsupported extensions for compat w/ older clients */ - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) - changelog_block_trim_ext(rec, last_rec, - CLF_VERSION | CLF_RENAME); + changelog_block_trim_ext(rec, last_rec, loghandle); GOTO(out, rc = 0); } @@ -1408,7 +1458,7 @@ llog_osd_regular_fid_add_name_entry(const struct lu_env *env, (struct dt_key *)name, th); } else { rc = dt_insert(env, dir, (struct dt_rec *)rec, - (struct dt_key *)name, th, 1); + (struct dt_key *)name, th); } dt_write_unlock(env, dir); @@ -1575,8 +1625,7 @@ static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, rec->rec_type = S_IFREG; dt_read_lock(env, llog_dir, 0); rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, - (struct dt_key *)res->lgh_name, - th, 1); + (struct dt_key *)res->lgh_name, th); dt_read_unlock(env, llog_dir); dt_object_put(env, llog_dir); if (rc) @@ -1766,7 +1815,7 @@ static int llog_osd_destroy(const struct lu_env *env, LASSERT(o != NULL); dt_write_lock(env, o, 0); - if (!dt_object_exists(o)) + if (!llog_osd_exist(loghandle)) GOTO(out_unlock, rc = 0); if (loghandle->lgh_name) { @@ -1792,6 +1841,7 @@ static int llog_osd_destroy(const struct lu_env *env, if (rc < 0) GOTO(out_unlock, rc); + loghandle->lgh_destroyed = true; if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); if (rc < 0) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c index 3ab0b430fca14..c644efb64ac1f 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -58,9 +58,9 @@ static void print_llogd_body(struct llogd_body *d) void lustre_swab_lu_fid(struct lu_fid *fid) { - __swab64s (&fid->f_seq); - __swab32s (&fid->f_oid); - __swab32s (&fid->f_ver); + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); } EXPORT_SYMBOL(lustre_swab_lu_fid); @@ -80,47 +80,47 @@ void lustre_swab_llog_id(struct llog_logid *log_id) { __swab64s(&log_id->lgl_oi.oi.oi_id); __swab64s(&log_id->lgl_oi.oi.oi_seq); - __swab32s(&log_id->lgl_ogen); + __swab32s(&log_id->lgl_ogen); } void lustre_swab_llogd_body (struct llogd_body *d) { - ENTRY; - print_llogd_body(d); + ENTRY; + print_llogd_body(d); lustre_swab_llog_id(&d->lgd_logid); - __swab32s (&d->lgd_ctxt_idx); - __swab32s (&d->lgd_llh_flags); - __swab32s (&d->lgd_index); - __swab32s (&d->lgd_saved_index); - __swab32s (&d->lgd_len); - __swab64s (&d->lgd_cur_offset); - print_llogd_body(d); - EXIT; + __swab32s(&d->lgd_ctxt_idx); + __swab32s(&d->lgd_llh_flags); + __swab32s(&d->lgd_index); + __swab32s(&d->lgd_saved_index); + __swab32s(&d->lgd_len); + __swab64s(&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; } EXPORT_SYMBOL(lustre_swab_llogd_body); void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) { - __swab64s (&d->lgdc_gen.mnt_cnt); - __swab64s (&d->lgdc_gen.conn_cnt); + __swab64s(&d->lgdc_gen.mnt_cnt); + __swab64s(&d->lgdc_gen.conn_cnt); lustre_swab_llog_id(&d->lgdc_logid); - __swab32s (&d->lgdc_ctxt_idx); + __swab32s(&d->lgdc_ctxt_idx); } EXPORT_SYMBOL(lustre_swab_llogd_conn_body); void lustre_swab_ll_fid(struct ll_fid *fid) { - __swab64s (&fid->id); - __swab32s (&fid->generation); - __swab32s (&fid->f_type); + __swab64s(&fid->id); + __swab32s(&fid->generation); + __swab32s(&fid->f_type); } void lustre_swab_lu_seq_range(struct lu_seq_range *range) { - __swab64s (&range->lsr_start); - __swab64s (&range->lsr_end); - __swab32s (&range->lsr_index); - __swab32s (&range->lsr_flags); + __swab64s(&range->lsr_start); + __swab64s(&range->lsr_end); + __swab32s(&range->lsr_index); + __swab32s(&range->lsr_flags); } EXPORT_SYMBOL(lustre_swab_lu_seq_range); @@ -143,32 +143,32 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) { struct llog_rec_tail *tail = NULL; - __swab32s(&rec->lrh_len); - __swab32s(&rec->lrh_index); - __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); __swab32s(&rec->lrh_id); - switch (rec->lrh_type) { + switch (rec->lrh_type) { case OST_SZ_REC: { - struct llog_size_change_rec *lsc = - (struct llog_size_change_rec *)rec; + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; - lustre_swab_ll_fid(&lsc->lsc_fid); - __swab32s(&lsc->lsc_ioepoch); + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); tail = &lsc->lsc_tail; - break; - } + break; + } case MDS_UNLINK_REC: { - struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; - __swab64s(&lur->lur_oid); - __swab32s(&lur->lur_oseq); - __swab32s(&lur->lur_count); + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); tail = &lur->lur_tail; - break; - } + break; + } case MDS_UNLINK64_REC: { struct llog_unlink64_rec *lur = @@ -199,8 +199,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) lustre_swab_lu_fid(&rnm->cr_sfid); lustre_swab_lu_fid(&rnm->cr_spfid); } - /* Because the tail follows a variable-length structure we need - * to compute its location at runtime */ + /* + * 
Because the tail follows a variable-length structure we need + * to compute its location at runtime + */ tail = (struct llog_rec_tail *)((char *)&cr->cr + changelog_rec_size(&cr->cr) + cr->cr.cr_namelen); @@ -209,14 +211,15 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) case CHANGELOG_USER_REC: { - struct llog_changelog_user_rec *cur = - (struct llog_changelog_user_rec*)rec; + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec *)rec; - __swab32s(&cur->cur_id); - __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_time); tail = &cur->cur_tail; - break; - } + break; + } case HSM_AGENT_REC: { struct llog_agent_req_rec *arr = @@ -230,8 +233,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) __swab64s(&arr->arr_hai.hai_extent.offset); __swab64s(&arr->arr_hai.hai_extent.length); __swab64s(&arr->arr_hai.hai_gid); - /* no swabing for opaque data */ - /* hai_data[0]; */ + /* + * no swabing for opaque data + * hai_data[0]; + */ break; } @@ -252,6 +257,7 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) (struct llog_setattr64_rec_v2 *)rec; __swab32s(&lsr2->lsr_projid); + __swab32s(&lsr2->lsr_layout_version); tail = &lsr2->lsr_tail; } else { tail = &lsr->lsr_tail; @@ -291,8 +297,8 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) tail = &lgr->lgr_tail; break; } - case LLOG_PAD_MAGIC: - break; + case LLOG_PAD_MAGIC: + break; case UPDATE_REC: { struct llog_update_record *lur = @@ -312,10 +318,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) update_records_size(record)); break; } - default: - CERROR("Unknown llog rec type %#x swabbing rec %p\n", - rec->lrh_type, rec); - } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } if (tail) { __swab32s(&tail->lrt_len); @@ -343,31 +349,33 @@ static void print_llog_hdr(struct llog_log_hdr *h) void lustre_swab_llog_hdr (struct llog_log_hdr *h) { - ENTRY; - print_llog_hdr(h); + ENTRY; + print_llog_hdr(h); lustre_swab_llog_rec(&h->llh_hdr); - print_llog_hdr(h); - EXIT; + print_llog_hdr(h); + EXIT; } EXPORT_SYMBOL(lustre_swab_llog_hdr); void print_lustre_cfg(struct lustre_cfg *lcfg) { - int i; - ENTRY; + int i; - if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ - return; + ENTRY; - CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); - CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; - CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); - CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); - CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); - CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", + libcfs_nid2str(lcfg->lcfg_nid)); CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) @@ -377,47 +385,48 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) lustre_cfg_string(lcfg, i)); } - EXIT; + EXIT; } EXPORT_SYMBOL(print_lustre_cfg); void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) { - int i; - ENTRY; - - __swab32s(&lcfg->lcfg_version); - - if 
(lcfg->lcfg_version != LUSTRE_CFG_VERSION) { - CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", - lcfg->lcfg_version, LUSTRE_CFG_VERSION); - EXIT; - return; - } - - __swab32s(&lcfg->lcfg_command); - __swab32s(&lcfg->lcfg_num); - __swab32s(&lcfg->lcfg_flags); - __swab64s(&lcfg->lcfg_nid); - __swab32s(&lcfg->lcfg_bufcount); - for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) - __swab32s(&lcfg->lcfg_buflens[i]); - - print_lustre_cfg(lcfg); - EXIT; - return; + int i; + + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; } /* used only for compatibility with old on-disk cfg_marker data */ struct cfg_marker32 { - __u32 cm_step; - __u32 cm_flags; - __u32 cm_vers; - __u32 padding; - __u32 cm_createtime; - __u32 cm_canceltime; - char cm_tgtname[MTI_NAME_MAXLEN]; - char cm_comment[MTI_NAME_MAXLEN]; + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; }; #define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ @@ -425,48 +434,51 @@ struct cfg_marker32 { void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) { - struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; - ENTRY; - - if (swab) { - __swab32s(&marker->cm_step); - __swab32s(&marker->cm_flags); - __swab32s(&marker->cm_vers); - } - if (size == sizeof(*cm32)) { - __u32 createtime, canceltime; - /* There was a problem with the original declaration of - * cfg_marker on 32-bit systems because it used time_t as - * a wire protocol structure, and didn't verify this in - * wirecheck. We now have to convert the offsets of the - * later fields in order to work on 32- and 64-bit systems. - * - * Fortunately, the cm_comment field has no functional use - * so can be sacrificed when converting the timestamp size. - * - * Overwrite fields from the end first, so they are not - * clobbered, and use memmove() instead of memcpy() because - * the source and target buffers overlap. 
bug 16771 */ - createtime = cm32->cm_createtime; - canceltime = cm32->cm_canceltime; - memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); - marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; - memmove(marker->cm_tgtname, cm32->cm_tgtname, - sizeof(marker->cm_tgtname)); - if (swab) { - __swab32s(&createtime); - __swab32s(&canceltime); - } - marker->cm_createtime = createtime; - marker->cm_canceltime = canceltime; - CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " - "for target %s, converting\n", - marker->cm_tgtname); - } else if (swab) { - __swab64s(&marker->cm_createtime); - __swab64s(&marker->cm_canceltime); - } - - EXIT; - return; + struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; + + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* + * There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 + */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, + "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c index 27f52aa15078b..f1517ceef7198 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,6 +39,8 @@ #include #include +#include +#include #include #include @@ -47,15 +49,14 @@ /* This is slightly more than the number of records that can fit into a * single llog file, because the llog_log_header takes up some of the * space in the first block that cannot be used for the bitmap. 
*/ -#define LLOG_TEST_RECNUM (LLOG_MIN_CHUNK_SIZE * 8) - +static int llog_test_recnum = (LLOG_MIN_CHUNK_SIZE * 8); static int llog_test_rand; static struct obd_uuid uuid = { .uuid = "test_uuid" }; static struct llog_logid cat_logid; struct llog_mini_rec { - struct llog_rec_hdr lmr_hdr; - struct llog_rec_tail lmr_tail; + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; } __attribute__((packed)); static int verify_handle(char *test, struct llog_handle *llh, int num_recs) @@ -101,8 +102,8 @@ static int verify_handle(char *test, struct llog_handle *llh, int num_recs) static int llog_test_1(const struct lu_env *env, struct obd_device *obd, char *name) { - struct llog_handle *llh; - struct llog_ctxt *ctxt; + struct llog_handle *llh; + struct llog_ctxt *ctxt; int rc; int rc2; @@ -148,11 +149,11 @@ static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_2(const struct lu_env *env, struct obd_device *obd, char *name, struct llog_handle **llh) { - struct llog_ctxt *ctxt; - struct llog_handle *lgh; - struct llog_logid logid; - int rc; - struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; ENTRY; @@ -191,7 +192,7 @@ static int llog_test_2(const struct lu_env *env, struct obd_device *obd, logid = lgh->lgh_id; lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf02f02; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; /* Check llog header values are correct after record add/cancel */ CWARN("2b: write 1 llog records, check llh_count\n"); @@ -301,8 +302,10 @@ static int test3_check_n_add_cb(const struct lu_env *env, } else { size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; - /* For variable size records the start offset is unknown, trust - * the first value and check others are consistent with it. */ + /* + * For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. + */ if (test_3_rec_off == 0) test_3_rec_off = lgh->lgh_cur_offset; @@ -337,8 +340,10 @@ static int test3_check_n_add_cb(const struct lu_env *env, if (rc < 0) CERROR("cb_test_3: cannot modify record while processing\n"); - /* Add new record to the llog at *last_rec position one by one to - * check that last block is re-read during processing */ + /* + * Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing + */ if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); if (rc < 0) @@ -404,7 +409,8 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; - /* Fill the llog with 64-bytes records, use 1023 records, + /* + * Fill the llog with 64-bytes records, use 1023 records, * so last chunk will be partially full. Don't change this * value until record size is changed. */ @@ -466,14 +472,17 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, CWARN("3b: write 566 variable size llog records\n"); - /* Drop llh_size to 0 to mark llog as variable-size and write - * header to make this change permanent. */ + /* + * Drop llh_size to 0 to mark llog as variable-size and write + * header to make this change permanent. 
+ */ llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE; llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX); hdr->lrh_type = OBD_CFG_REC; - /* there are 1025 64-bytes records in llog already, + /* + * there are 1025 64-bytes records in llog already, * the last chunk contains single record, i.e. 64 bytes. * Each pair of variable size records is 200 bytes, so * we will have the following distribution per chunks: @@ -566,15 +575,15 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, /* Test catalogue additions */ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *cath; - char name[10]; - int rc, rc2, i, buflen; - struct llog_mini_rec lmr; - struct llog_cookie cookie; - struct llog_ctxt *ctxt; - int num_recs = 0; - char *buf; - struct llog_rec_hdr *rec; + struct llog_handle *cath, *llh; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; ENTRY; @@ -582,7 +591,7 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; sprintf(name, "%x", llog_test_rand + 1); CWARN("4a: create a catalog log with name: %s\n", name); @@ -615,6 +624,18 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); + /* estimate the max number of record for the plain llog + * cause it depends on disk size + */ + llh = cath->u.chd.chd_current_log; + if (llh->lgh_max_size != 0) { + llog_test_recnum = (llh->lgh_max_size - + sizeof(struct llog_log_hdr)) / LLOG_MIN_REC_SIZE; + } + + if (llog_test_recnum >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) + llog_test_recnum = LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; + CWARN("4c: cancel 1 log record\n"); rc = llog_cat_cancel_records(env, cath, 1, &cookie); if (rc) { @@ -627,12 +648,12 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("4d: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("4d: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } num_recs++; @@ -680,8 +701,8 @@ static int cat_counter; static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct lu_fid fid = {0}; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; if (rec->lrh_type != LLOG_LOGID_MAGIC) { CERROR("invalid record in catalog\n"); @@ -739,7 +760,7 @@ static int llog_cancel_rec_cb(const struct lu_env *env, llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); cancel_count++; - if (cancel_count == LLOG_TEST_RECNUM) + if (cancel_count == llog_test_recnum) RETURN(-LLOG_EEMPTY); RETURN(0); } @@ -747,11 +768,11 @@ static int llog_cancel_rec_cb(const struct lu_env *env, /* Test log and catalogue processing */ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - char name[10]; - int rc, rc2; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; + 
struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; ENTRY; @@ -759,7 +780,7 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; CWARN("5a: re-open catalog by id\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -786,7 +807,7 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("5c: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { @@ -857,14 +878,14 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) static int llog_test_6(const struct lu_env *env, struct obd_device *obd, char *name) { - struct obd_device *mgc_obd; - struct llog_ctxt *ctxt; - struct obd_uuid *mgs_uuid; - struct obd_export *exp; - struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; - struct llog_handle *llh = NULL; - struct llog_ctxt *nctxt; - int rc, rc2; + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); LASSERT(ctxt); @@ -973,9 +994,9 @@ static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct llog_handle *llh; - int rc = 0, i, process_count; - int num_recs = 0; + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; ENTRY; @@ -1058,8 +1079,8 @@ static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) /* Test all llog records writing and processing */ static int llog_test_7(const struct lu_env *env, struct obd_device *obd) { - struct llog_ctxt *ctxt; - int rc; + struct llog_ctxt *ctxt; + int rc; ENTRY; @@ -1158,61 +1179,6 @@ static int llog_test_7(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } -static int llog_truncate(const struct lu_env *env, struct dt_object *o) -{ - struct lu_attr la; - struct thandle *th; - struct dt_device *d; - int rc; - ENTRY; - - LASSERT(o); - d = lu2dt_dev(o->do_lu.lo_dev); - LASSERT(d); - - rc = dt_attr_get(env, o, &la); - if (rc) - RETURN(rc); - - CDEBUG(D_OTHER, "original size %llu\n", la.la_size); - rc = sizeof(struct llog_log_hdr) + sizeof(struct llog_mini_rec); - if (la.la_size < rc) { - CERROR("too small llog: %llu\n", la.la_size); - RETURN(0); - } - - /* drop 2 records */ - la.la_size = la.la_size - (sizeof(struct llog_mini_rec) * 2); - la.la_valid = LA_SIZE; - - th = dt_trans_create(env, d); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); - - rc = dt_declare_attr_set(env, o, &la, th); - if (rc) - GOTO(stop, rc); - - rc = dt_declare_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); - - rc = dt_trans_start_local(env, d, th); - if (rc) - GOTO(stop, rc); - - rc = dt_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); - if (rc) - GOTO(stop, rc); - - rc = dt_attr_set(env, o, &la, th); - if (rc) - GOTO(stop, rc); - -stop: - dt_trans_stop(env, d, th); - - RETURN(rc); -} - static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *rec, void 
*data) { @@ -1222,13 +1188,13 @@ static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_8(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - char name[10]; - int rc, rc2, i; - int orig_counter; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; - struct dt_object *obj = NULL; + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2, i; + int orig_counter; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct dt_object *obj = NULL; ENTRY; @@ -1236,7 +1202,7 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; CWARN("8a: fill the first plain llog\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -1302,7 +1268,7 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) } } CWARN("8b: second llog "DFID"\n", - PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); + PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); rc2 = llog_cat_close(env, llh); if (rc2) { @@ -1312,8 +1278,10 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) GOTO(out_put, rc); } - CWARN("8c: drop two records from the first plain llog\n"); - llog_truncate(env, obj); + /* Here was 8c: drop two records from the first plain llog + * llog_truncate was bad idea cause it creates a wrong state, + * lgh_last_idx is wrong and two records belongs to zeroed buffer + */ CWARN("8d: count survived records\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -1335,9 +1303,9 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc); } - if (orig_counter + 200 - 2 != plain_counter) { + if (orig_counter + 200 != plain_counter) { CERROR("found %d records (expected %d)\n", plain_counter, - orig_counter + 200 - 2); + orig_counter + 200); rc = -EIO; } @@ -1360,9 +1328,9 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct llog_handle *llh; - struct lu_fid fid; - int rc = 0; + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; ENTRY; @@ -1397,8 +1365,8 @@ static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) /* Prepare different types of llog records for llog_reader test*/ static int llog_test_9(const struct lu_env *env, struct obd_device *obd) { - struct llog_ctxt *ctxt; - int rc; + struct llog_ctxt *ctxt; + int rc; ENTRY; @@ -1454,17 +1422,80 @@ static int llog_test_9(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + + +static int llog_test_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + int rc; + + rc = llog_cat_process_or_fork(NULL, lpi->lpi_loghandle, lpi->lpi_cb, + NULL, lpi->lpi_cbdata, 1, 0, true); + + complete(&lpi->lpi_completion); + + lpi->lpi_rc = rc; + if (rc) + CWARN("10h: Error during catalog processing %d\n", rc); + return rc; +} + +static int cat_check_old_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct 
llog_logid_rec *)rec; + struct lu_fid fid = {0}; + struct lu_fid *prev_fid = data; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + if (prev_fid->f_oid > fid.f_oid) { + CWARN("processing old record, fail\n"); + prev_fid->f_oid = 0xbad; + RETURN(-LLOG_EEMPTY); + } + + if (prev_fid->f_oid == 0) { + cfs_fail_loc = OBD_FAIL_ONCE | OBD_FAIL_LLOG_PROCESS_TIMEOUT; + cfs_fail_val = (unsigned int) (llh->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF); + msleep(1 * MSEC_PER_SEC); + } + *prev_fid = fid; + + RETURN(0); +} + /* test catalog wrap around */ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *cath; - char name[10]; - int rc, rc2, i, enospc, eok; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; - struct lu_attr la; - __u64 cat_max_size; - struct dt_device *dt; + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; ENTRY; @@ -1472,7 +1503,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; snprintf(name, sizeof(name), "%x", llog_test_rand + 2); CWARN("10a: create a catalog log with name: %s\n", name); @@ -1490,9 +1521,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) cat_logid = cath->lgh_id; dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10c: sync failed: %d\n", rc); @@ -1503,12 +1536,12 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; cfs_fail_val = 4; - CWARN("10b: write %d log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10b: write %d log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("10b: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } } @@ -1518,21 +1551,23 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10b: sync failed: %d\n", rc); GOTO(out, rc); } - CWARN("10c: write %d more log records\n", 2 * LLOG_TEST_RECNUM); - for (i = 0; i < 2 * LLOG_TEST_RECNUM; i++) { + CWARN("10c: write %d more log records\n", 2 * llog_test_recnum); + for (i = 0; i < 2 * llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("10c: write %d 
records failed at #%d: %d\n", - 2*LLOG_TEST_RECNUM, i + 1, rc); + 2*llog_test_recnum, i + 1, rc); GOTO(out, rc); } } @@ -1542,29 +1577,35 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10c: sync failed: %d\n", rc); GOTO(out, rc); } - /* fill last allocated plain LLOG and reach -ENOSPC condition - * because no slot available in Catalog */ + /* + * fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog + */ enospc = 0; eok = 0; - CWARN("10c: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10c: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10c: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } - /* after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC */ + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ if (rc == -ENOSPC) { enospc++; } else { @@ -1573,7 +1614,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { CERROR("10c: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1595,15 +1636,19 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } cat_max_size = la.la_size; - /* cancel all 1st plain llog records to empty it, this will also cause - * its catalog entry to be freed for next forced wrap in 10e */ - CWARN("10d: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + /* + * cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e + */ + CWARN("10d: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1626,9 +1671,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10d: sync failed: %d\n", rc); @@ -1637,16 +1684,18 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) enospc = 0; eok = 0; - CWARN("10e: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10e: write %d 
more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10e: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } - /* after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC */ + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ if (rc == -ENOSPC) { enospc++; } else { @@ -1655,7 +1704,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { CERROR("10e: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1666,13 +1715,14 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10e: print the catalog entries.. we expect 4\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { - CERROR("10d: process with cat_print_cb failed: %d\n", rc); + CERROR("10e: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); } if (cat_counter != 4) { - CERROR("10d: %d entries in catalog\n", cat_counter); + CERROR("10e: %d entries in catalog\n", cat_counter); GOTO(out, rc = -EINVAL); } @@ -1702,24 +1752,30 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10e: sync failed: %d\n", rc); GOTO(out, rc); } - /* cancel more records to free one more slot in Catalog - * see if it is re-allocated when adding more records */ - CWARN("10f: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + /* + * cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records + */ + CWARN("10f: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1727,7 +1783,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10f: print the catalog entries.. 
we expect 3\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10f: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1742,9 +1799,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10f: sync failed: %d\n", rc); @@ -1753,16 +1812,18 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) enospc = 0; eok = 0; - CWARN("10f: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10f: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10f: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } - /* after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC */ + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ if (rc == -ENOSPC) { enospc++; } else { @@ -1771,7 +1832,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { CERROR("10f: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1806,9 +1867,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10f: sync failed: %d\n", rc); @@ -1817,16 +1880,18 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) /* will llh_cat_idx also successfully wrap ? */ - /* cancel all records in the plain LLOGs referenced by 2 last indexes in - * Catalog */ + /* + * cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog + */ /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is + /* need to indicate error if for any reason llog_test_recnum is * not reached */ if (rc == 0) rc = -ERANGE; @@ -1835,7 +1900,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. 
we expect 3\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1850,9 +1916,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10g: sync failed: %d\n", rc); @@ -1860,13 +1928,15 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1874,7 +1944,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. we expect 2\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1897,9 +1968,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10g: sync failed: %d\n", rc); @@ -1907,13 +1980,15 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1921,7 +1996,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. 
we expect 1\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1946,6 +2022,64 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); + /* + * catalog has only one valid entry other slots has outdated + * records. Trying to race the llog_thread_process with llog_add + * llog_thread_process read buffer and loop record on it. + * llog_add adds a record and mark a record in bitmap. + * llog_thread_process process record with old data. + */ + { + struct llog_process_info lpi; + struct lu_fid test_fid = {0}; + + lpi.lpi_loghandle = cath; + lpi.lpi_cb = cat_check_old_cb; + lpi.lpi_catdata = NULL; + lpi.lpi_cbdata = &test_fid; + init_completion(&lpi.lpi_completion); + + kthread_run(llog_test_process_thread, &lpi, "llog_test_process_thread"); + + msleep(1 * MSEC_PER_SEC / 2); + enospc = 0; + eok = 0; + CWARN("10h: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10h: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10h: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10h: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + wait_for_completion(&lpi.lpi_completion); + + if (lpi.lpi_rc != 0) { + CERROR("10h: race happened, old record was processed\n"); + GOTO(out, rc = -EINVAL); + } + } out: cfs_fail_loc = 0; cfs_fail_val = 0; @@ -1962,15 +2096,17 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } -/* ------------------------------------------------------------------------- +/* + * ------------------------------------------------------------------------- * Tests above, boring obd functions below - * ------------------------------------------------------------------------- */ + * ------------------------------------------------------------------------- + */ static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - struct llog_ctxt *ctxt; - int rc, err; - char name[10]; + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; ENTRY; ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); @@ -2032,9 +2168,9 @@ static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) static int llog_test_cleanup(struct obd_device *obd) { - struct obd_device *tgt; - struct lu_env env; - int rc; + struct obd_device *tgt; + struct lu_env env; + int rc; ENTRY; @@ -2052,32 +2188,32 @@ static int llog_test_cleanup(struct obd_device *obd) static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { - struct obd_device *tgt; - struct llog_ctxt *ctxt; - struct dt_object *o; - struct lu_env env; - struct lu_context test_session; - int rc; - - ENTRY; - - if (lcfg->lcfg_bufcount < 2) { - CERROR("requires a TARGET OBD name\n"); - RETURN(-EINVAL); - } + struct obd_device *tgt; + struct 
llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; - if (lcfg->lcfg_buflens[1] < 1) { - CERROR("requires a TARGET OBD name\n"); - RETURN(-EINVAL); - } + ENTRY; - /* disk obd */ - tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); - if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { - CERROR("target device not attached or not set up (%s)\n", - lustre_cfg_string(lcfg, 1)); - RETURN(-EINVAL); - } + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); if (rc) @@ -2126,14 +2262,14 @@ static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) } static struct obd_ops llog_obd_ops = { - .o_owner = THIS_MODULE, - .o_setup = llog_test_setup, - .o_cleanup = llog_test_cleanup, + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, }; static int __init llog_test_init(void) { - return class_register_type(&llog_obd_ops, NULL, true, NULL, + return class_register_type(&llog_obd_ops, NULL, false, NULL, "llog_test", NULL); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c index 89b227b0cfa09..04c25ebd88274 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * lustre/obdclass/local_storage.c @@ -388,14 +388,14 @@ static struct dt_object *__local_file_create(const struct lu_env *env, rec->rec_fid = fid; /* Add "." and ".." 
for newly created dir */ rc = dt_insert(env, dto, (const struct dt_rec *)rec, - (const struct dt_key *)".", th, 1); + (const struct dt_key *)".", th); if (rc != 0) GOTO(destroy, rc); dt_ref_add(env, dto, th); rec->rec_fid = lu_object_fid(&parent->do_lu); rc = dt_insert(env, dto, (const struct dt_rec *)rec, - (const struct dt_key *)"..", th, 1); + (const struct dt_key *)"..", th); if (rc != 0) GOTO(destroy, rc); } @@ -404,7 +404,7 @@ static struct dt_object *__local_file_create(const struct lu_env *env, rec->rec_type = dto->do_lu.lo_header->loh_attr; dt_write_lock(env, parent, LOS_PARENT); rc = dt_insert(env, parent, (const struct dt_rec *)rec, - (const struct dt_key *)name, th, 1); + (const struct dt_key *)name, th); if (dti->dti_dof.dof_type == DFT_DIR) dt_ref_add(env, parent, th); dt_write_unlock(env, parent); @@ -684,7 +684,7 @@ int local_object_unlink(const struct lu_env *env, struct dt_device *dt, rec->rec_fid = &dti->dti_fid; rec->rec_type = dto->do_lu.lo_header->loh_attr; rc = dt_insert(env, parent, (const struct dt_rec *)rec, - (const struct dt_key *)name, th, 1); + (const struct dt_key *)name, th); GOTO(unlock, rc); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c index 00395af273593..37d749d199275 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -30,10 +30,8 @@ #define DEBUG_SUBSYSTEM S_CLASS - #include #include -#include #ifdef CONFIG_PROC_FS @@ -67,8 +65,8 @@ struct job_stat { struct hlist_node js_hash; /* hash struct for this jobid */ struct list_head js_list; /* on ojs_list, with ojs_lock */ atomic_t js_refcount; /* num users of this struct */ - char js_jobid[LUSTRE_JOBID_SIZE]; /* job name */ - time_t js_timestamp; /* seconds of most recent stat*/ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/ + time64_t js_timestamp; /* seconds of most recent stat*/ struct lprocfs_stats *js_stats; /* per-job statistics */ struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ }; @@ -164,7 +162,7 @@ static int job_cleanup_iter_callback(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *data) { - time_t oldest_time = *((time_t *)data); + time64_t oldest_time = *((time64_t *)data); struct job_stat *job; job = hlist_entry(hnode, struct job_stat, js_hash); @@ -193,8 +191,8 @@ static int job_cleanup_iter_callback(struct cfs_hash *hs, */ static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) { - time_t now = cfs_time_current_sec(); - time_t oldest; + time64_t now = ktime_get_real_seconds(); + time64_t oldest; if (likely(before >= 0)) { unsigned int cleanup_interval = stats->ojs_cleanup_interval; @@ -234,7 +232,7 @@ static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) write_lock(&stats->ojs_lock); stats->ojs_cleaning = false; - stats->ojs_last_cleanup = cfs_time_current_sec(); + stats->ojs_last_cleanup = ktime_get_real_seconds(); write_unlock(&stats->ojs_lock); } @@ -254,8 +252,8 @@ static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) jobs->ojs_cntr_init_fn(job->js_stats); - memcpy(job->js_jobid, jobid, LUSTRE_JOBID_SIZE); - job->js_timestamp = cfs_time_current_sec(); + memcpy(job->js_jobid, jobid, sizeof(job->js_jobid)); + job->js_timestamp = ktime_get_real_seconds(); job->js_jobstats = jobs; INIT_HLIST_NODE(&job->js_hash); INIT_LIST_HEAD(&job->js_list); @@ -315,7 +313,7 @@ int lprocfs_job_stats_log(struct 
obd_device *obd, char *jobid, found: LASSERT(stats == job->js_jobstats); - job->js_timestamp = cfs_time_current_sec(); + job->js_timestamp = ktime_get_real_seconds(); lprocfs_counter_add(job->js_stats, event, amount); job_putref(job); @@ -444,7 +442,7 @@ static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) } seq_putc(p, '\n'); - seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); + seq_printf(p, " %-16s %lld\n", "snapshot_time:", job->js_timestamp); s = job->js_stats; for (i = 0; i < s->ls_num; i++) { @@ -515,7 +513,7 @@ static ssize_t lprocfs_jobstats_seq_write(struct file *file, if (stats->ojs_hash == NULL) return -ENODEV; - if (lprocfs_copy_from_user(file, jobid, buf, len)) + if (copy_from_user(jobid, buf, len)) return -EFAULT; jobid[len] = 0; @@ -615,7 +613,7 @@ int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, stats->ojs_cntr_num = cntr_num; stats->ojs_cntr_init_fn = init_fn; stats->ojs_cleanup_interval = 600; /* 10 mins by default */ - stats->ojs_last_cleanup = cfs_time_current_sec(); + stats->ojs_last_cleanup = ktime_get_real_seconds(); entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, &lprocfs_jobstats_seq_fops); @@ -626,45 +624,38 @@ int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, RETURN(0); } EXPORT_SYMBOL(lprocfs_job_stats_init); +#endif /* CONFIG_PROC_FS*/ -int lprocfs_job_interval_seq_show(struct seq_file *m, void *data) +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); struct obd_job_stats *stats; - if (obd == NULL) - return -ENODEV; - stats = &obd->u.obt.obt_jobstats; - seq_printf(m, "%d\n", stats->ojs_cleanup_interval); - return 0; + return scnprintf(buf, PAGE_SIZE, "%d\n", stats->ojs_cleanup_interval); } -EXPORT_SYMBOL(lprocfs_job_interval_seq_show); +EXPORT_SYMBOL(job_cleanup_interval_show); -ssize_t -lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *obd; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); struct obd_job_stats *stats; + unsigned int val; int rc; - __s64 val; - - obd = ((struct seq_file *)file->private_data)->private; - if (obd == NULL) - return -ENODEV; stats = &obd->u.obt.obt_jobstats; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > UINT_MAX) - return -ERANGE; stats->ojs_cleanup_interval = val; lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); return count; } -EXPORT_SYMBOL(lprocfs_job_interval_seq_write); -#endif /* CONFIG_PROC_FS*/ +EXPORT_SYMBOL(job_cleanup_interval_store); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index f3d2efc8403ba..7a365730746d6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,6 @@ #include #include -#include #ifdef CONFIG_PROC_FS @@ -48,52 +47,15 @@ MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs #define MAX_STRING_SIZE 128 -static const struct file_operations lprocfs_kernel_dummy = {}; - -/* - * Awful hacks to mark procfs seq writes as going to kernel space. Used - * to be done with set_fs(KERNEL_DS), but that function is no more. - * This should only be called from class_process_proc_param(), which passes - * in a fake file structure. It should never, ever be used for anything else. - */ -void lprocfs_file_set_kernel(struct file *file) -{ - LASSERT(file->f_op == NULL); - file->f_op = &lprocfs_kernel_dummy; -} -EXPORT_SYMBOL(lprocfs_file_set_kernel); - -bool lprocfs_file_is_kernel(struct file *file) -{ - return (file->f_op == &lprocfs_kernel_dummy); -} -EXPORT_SYMBOL(lprocfs_file_is_kernel); - -unsigned long -lprocfs_copy_from_user(struct file *file, void *to, - const void __user *from, unsigned long n) -{ - unsigned long res; - - if (lprocfs_file_is_kernel(file)) { - memcpy(to, from, n); - res = 0; - } else - res = copy_from_user(to, from, n); - - return res; -} -EXPORT_SYMBOL(lprocfs_copy_from_user); - int lprocfs_single_release(struct inode *inode, struct file *file) { - return single_release(inode, file); + return single_release(inode, file); } EXPORT_SYMBOL(lprocfs_single_release); int lprocfs_seq_release(struct inode *inode, struct file *file) { - return seq_release(inode, file); + return seq_release(inode, file); } EXPORT_SYMBOL(lprocfs_seq_release); @@ -116,8 +78,8 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, struct proc_dir_entry *proc; umode_t mode; - if (root == NULL || name == NULL || fops == NULL) - return ERR_PTR(-EINVAL); + if (!root || !name || !fops) + return ERR_PTR(-EINVAL); mode = default_mode(fops); proc = proc_create_data(name, mode, root, fops, data); @@ -126,42 +88,43 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, name); return ERR_PTR(-ENOMEM); } - return proc; + return proc; } EXPORT_SYMBOL(lprocfs_add_simple); struct proc_dir_entry *lprocfs_add_symlink(const char *name, - struct proc_dir_entry *parent, const char *format, ...) + struct proc_dir_entry *parent, + const char *format, ...) 
{ - struct proc_dir_entry *entry; - char *dest; - va_list ap; + struct proc_dir_entry *entry; + char *dest; + va_list ap; - if (parent == NULL || format == NULL) - return NULL; + if (!parent || !format) + return NULL; - OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); - if (dest == NULL) - return NULL; + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + return NULL; - va_start(ap, format); - vsnprintf(dest, MAX_STRING_SIZE, format, ap); - va_end(ap); + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); - entry = proc_symlink(name, parent, dest); - if (entry == NULL) + entry = proc_symlink(name, parent, dest); + if (!entry) CERROR("LprocFS: Could not create symbolic link from " "%s to %s\n", name, dest); - OBD_FREE(dest, MAX_STRING_SIZE + 1); - return entry; + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; } EXPORT_SYMBOL(lprocfs_add_symlink); static const struct file_operations ldebugfs_empty_ops = { }; int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list, - void *data) + void *data) { if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) return -EINVAL; @@ -206,10 +169,10 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, void *data) { - if (root == NULL || list == NULL) + if (!root || !list) return -EINVAL; - while (list->name != NULL) { + while (list->name) { struct proc_dir_entry *proc; umode_t mode = 0; @@ -220,7 +183,7 @@ lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, proc = proc_create_data(list->name, mode, root, list->fops ?: &lprocfs_empty_ops, list->data ?: data); - if (proc == NULL) + if (!proc) return -ENOMEM; list++; } @@ -230,7 +193,7 @@ EXPORT_SYMBOL(lprocfs_add_vars); void ldebugfs_remove(struct dentry **entryp) { - debugfs_remove(*entryp); + debugfs_remove_recursive(*entryp); *entryp = NULL; } EXPORT_SYMBOL_GPL(ldebugfs_remove); @@ -248,36 +211,38 @@ static void lprocfs_remove_nolock(struct proc_dir_entry **proot) struct proc_dir_entry *parent; *proot = NULL; - if (root == NULL || IS_ERR(root)) + if (!root || IS_ERR(root)) return; - parent = root->parent; - LASSERT(parent != NULL); + parent = root->parent; + LASSERT(parent != NULL); - while (1) { - while (temp->subdir != NULL) - temp = temp->subdir; + while (1) { + while (temp->subdir) + temp = temp->subdir; - rm_entry = temp; - temp = temp->parent; + rm_entry = temp; + temp = temp->parent; - /* Memory corruption once caused this to fail, and - without this LASSERT we would loop here forever. */ - LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, - "0x%p %s/%s len %d\n", rm_entry, temp->name, - rm_entry->name, (int)strlen(rm_entry->name)); + /* + * Memory corruption once caused this to fail, and + * without this LASSERT we would loop here forever. 
+ */ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, temp->name, + rm_entry->name, (int)strlen(rm_entry->name)); - remove_proc_entry(rm_entry->name, temp); - if (temp == parent) - break; - } + remove_proc_entry(rm_entry->name, temp); + if (temp == parent) + break; + } } int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry *t = NULL; - struct proc_dir_entry **p; - int len, busy = 0; + struct proc_dir_entry *t = NULL; + struct proc_dir_entry **p; + int len, busy = 0; LASSERT(parent != NULL); len = strlen(name); @@ -376,10 +341,10 @@ lprocfs_register(const char *name, struct proc_dir_entry *parent, struct proc_dir_entry *newchild; newchild = proc_mkdir(name, parent); - if (newchild == NULL) + if (!newchild) return ERR_PTR(-ENOMEM); - if (list != NULL) { + if (list) { int rc = lprocfs_add_vars(newchild, list, data); if (rc) { lprocfs_remove(&newchild); @@ -391,93 +356,6 @@ lprocfs_register(const char *name, struct proc_dir_entry *parent, EXPORT_SYMBOL(lprocfs_register); /* Generic callbacks */ -int lprocfs_uint_seq_show(struct seq_file *m, void *data) -{ - seq_printf(m, "%u\n", *(unsigned int *)data); - return 0; -} -EXPORT_SYMBOL(lprocfs_uint_seq_show); - -int lprocfs_wr_uint(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - unsigned *p = data; - char dummy[MAX_STRING_SIZE + 1]; - char *end; - unsigned long tmp; - - if (count >= sizeof(dummy)) - return -EINVAL; - - if (count == 0) - return 0; - - if (lprocfs_copy_from_user(file, dummy, buffer, count)) - return -EFAULT; - - dummy[count] = 0; - - tmp = simple_strtoul(dummy, &end, 0); - if (dummy == end) - return -EINVAL; - - *p = (unsigned int)tmp; - return count; -} -EXPORT_SYMBOL(lprocfs_wr_uint); - -ssize_t lprocfs_uint_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - int *data = ((struct seq_file *)file->private_data)->private; - int rc; - __s64 val = 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc < 0) - return rc; - - return lprocfs_wr_uint(file, buffer, count, data); -} -EXPORT_SYMBOL(lprocfs_uint_seq_write); - -int lprocfs_u64_seq_show(struct seq_file *m, void *data) -{ - LASSERT(data != NULL); - seq_printf(m, "%llu\n", *(__u64 *)data); - return 0; -} -EXPORT_SYMBOL(lprocfs_u64_seq_show); - -int lprocfs_atomic_seq_show(struct seq_file *m, void *data) -{ - atomic_t *atom = data; - LASSERT(atom != NULL); - seq_printf(m, "%d\n", atomic_read(atom)); - return 0; -} -EXPORT_SYMBOL(lprocfs_atomic_seq_show); - -ssize_t -lprocfs_atomic_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - atomic_t *atm = ((struct seq_file *)file->private_data)->private; - __s64 val = 0; - int rc; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc < 0) - return rc; - - if (val <= 0 || val > INT_MAX) - return -ERANGE; - - atomic_set(atm, val); - return count; -} -EXPORT_SYMBOL(lprocfs_atomic_seq_write); - int lprocfs_uuid_seq_show(struct seq_file *m, void *data) { struct obd_device *obd = data; @@ -488,114 +366,163 @@ int lprocfs_uuid_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_uuid_seq_show); -int lprocfs_name_seq_show(struct seq_file *m, void *data) +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = data; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(dev != NULL); - seq_printf(m, "%s\n", 
dev->obd_name); - return 0; + return sprintf(buf, "%s\n", obd->obd_uuid.uuid); } -EXPORT_SYMBOL(lprocfs_name_seq_show); +LUSTRE_RO_ATTR(uuid); -int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) - seq_printf(m, "%u\n", osfs.os_bsize); + return sprintf(buf, "%u\n", osfs.os_bsize); + return rc; } -EXPORT_SYMBOL(lprocfs_blksize_seq_show); +LUSTRE_RO_ATTR(blocksize); -int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_blocks; while (blk_size >>= 1) result <<= 1; - seq_printf(m, "%llu\n", result); + return sprintf(buf, "%llu\n", result); } + return rc; } -EXPORT_SYMBOL(lprocfs_kbytestotal_seq_show); +LUSTRE_RO_ATTR(kbytestotal); -int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bfree; while (blk_size >>= 1) result <<= 1; - seq_printf(m, "%llu\n", result); + return sprintf(buf, "%llu\n", result); } + return rc; } -EXPORT_SYMBOL(lprocfs_kbytesfree_seq_show); +LUSTRE_RO_ATTR(kbytesfree); -int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; + u32 blk_size = osfs.os_bsize >> 10; + u64 result = 
osfs.os_bavail; while (blk_size >>= 1) result <<= 1; - seq_printf(m, "%llu\n", result); + return sprintf(buf, "%llu\n", result); } + return rc; } -EXPORT_SYMBOL(lprocfs_kbytesavail_seq_show); +LUSTRE_RO_ATTR(kbytesavail); -int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) - seq_printf(m, "%llu\n", osfs.os_files); + return sprintf(buf, "%llu\n", osfs.os_files); + return rc; } -EXPORT_SYMBOL(lprocfs_filestotal_seq_show); +LUSTRE_RO_ATTR(filestotal); -int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) - seq_printf(m, "%llu\n", osfs.os_ffree); + return sprintf(buf, "%llu\n", osfs.os_ffree); + return rc; } -EXPORT_SYMBOL(lprocfs_filesfree_seq_show); +LUSTRE_RO_ATTR(filesfree); + +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ptlrpc_connection *conn; + ssize_t count; + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + count = sprintf(buf, "%s\n", conn->c_remote_uuid.uuid); + else + count = sprintf(buf, "%s\n", ""); + + LPROCFS_CLIMP_EXIT(obd); + return count; +} +EXPORT_SYMBOL(conn_uuid_show); int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) { @@ -616,26 +543,6 @@ int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_server_uuid_seq_show); -int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct ptlrpc_connection *conn; - int rc = 0; - - LASSERT(obd != NULL); - - LPROCFS_CLIMP_CHECK(obd); - conn = obd->u.cli.cl_import->imp_connection; - if (conn && obd->u.cli.cl_import) - seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); - else - seq_printf(m, "%s\n", ""); - - LPROCFS_CLIMP_EXIT(obd); - return rc; -} -EXPORT_SYMBOL(lprocfs_conn_uuid_seq_show); - /** add up per-cpu counters */ /** @@ -729,14 +636,14 @@ void lprocfs_stats_unlock(struct lprocfs_stats *stats, void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, struct lprocfs_counter *cnt) { - unsigned int num_entry; - struct lprocfs_counter *percpu_cntr; - int i; - unsigned long flags = 0; + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; memset(cnt, 0, sizeof(*cnt)); - if (stats == NULL) { + if (!stats) { /* set count to 1 to avoid divide-by-zero errs in callers */ cnt->lc_count = 1; return; @@ -747,7 +654,7 @@ void 
lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_entry; i++) { - if (stats->ls_percpu[i] == NULL) + if (!stats->ls_percpu[i]) continue; percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); @@ -763,16 +670,6 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); } -/** - * Append a space separated list of current set flags to str. - */ -#define flag2str(flag) \ - do { \ - if (imp->imp_##flag) { \ - seq_printf(m, "%s" #flag, first ? "" : ", "); \ - first = false; \ - } \ - } while (0) static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) { bool first = true; @@ -782,19 +679,16 @@ static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) first = false; } - flag2str(invalid); - flag2str(deactive); - flag2str(replayable); - flag2str(delayed_recovery); - flag2str(no_lock_replay); - flag2str(vbr_failed); - flag2str(pingable); - flag2str(resend_replay); - flag2str(no_pinger_recover); - flag2str(need_mne_swab); - flag2str(connect_tried); + flag2str(imp, invalid); + flag2str(imp, deactive); + flag2str(imp, replayable); + flag2str(imp, delayed_recovery); + flag2str(imp, vbr_failed); + flag2str(imp, pingable); + flag2str(imp, resend_replay); + flag2str(imp, no_pinger_recover); + flag2str(imp, connect_tried); } -#undef flag2str static const char *obd_connect_names[] = { /* flags names */ @@ -858,17 +752,34 @@ static const char *obd_connect_names[] = { "multi_mod_rpcs", "dir_stripe", "subtree", - "lock_ahead", + "lockahead", "bulk_mbits", "compact_obdo", "second_flags", /* flags2 names */ - "file_secctx", + "file_secctx", /* 0x01 */ + "lockaheadv2", /* 0x02 */ + "dir_migrate", /* 0x04 */ + "sum_statfs", /* 0x08 */ + "overstriping", /* 0x10 */ + "flr", /* 0x20 */ + "wbc", /* 0x40 */ + "lock_convert", /* 0x80 */ + "archive_id_array", /* 0x100 */ + "increasing_xid", /* 0x200 */ + "selinux_policy", /* 0x400 */ + "lsom", /* 0x800 */ + "pcc", /* 0x1000 */ + "unknown", /* 0x2000 */ + "async_discard", /* 0x4000 */ + "client_encryption", /* 0x8000 */ + "fidmap", /* 0x10000 */ + "getattr_pfid", /* 0x20000 */ NULL }; -static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, - __u64 flags2, const char *sep) +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep) { bool first = true; __u64 mask; @@ -905,6 +816,7 @@ static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, first = false; } } +EXPORT_SYMBOL(obd_connect_seq_flags2str); int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, const char *sep) @@ -941,8 +853,8 @@ int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, } EXPORT_SYMBOL(obd_connect_flags2str); -static void obd_connect_data_seqprint(struct seq_file *m, - struct obd_connect_data *ocd) +void +obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) { __u64 flags; @@ -998,16 +910,16 @@ static void obd_connect_data_seqprint(struct seq_file *m, int lprocfs_import_seq_show(struct seq_file *m, void *data) { - char nidstr[LNET_NIDSTR_SIZE]; - struct lprocfs_counter ret; - struct lprocfs_counter_header *header; - struct obd_device *obd = (struct obd_device *)data; - struct obd_import *imp; - struct obd_import_conn *conn; - struct obd_connect_data *ocd; - int j; - int k; - int rw = 0; + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter ret; + struct 
lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; LASSERT(obd != NULL); LPROCFS_CLIMP_CHECK(obd); @@ -1041,7 +953,7 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) seq_printf(m, "%s%s", j ? ", " : "", nidstr); j++; } - if (imp->imp_connection != NULL) + if (imp->imp_connection) libcfs_nid2str_r(imp->imp_connection->c_peer.nid, nidstr, sizeof(nidstr)); else @@ -1050,14 +962,16 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) " current_connection: %s\n" " connection_attempts: %u\n" " generation: %u\n" - " in-progress_invalidations: %u\n", + " in-progress_invalidations: %u\n" + " idle: %lld sec\n", nidstr, imp->imp_conn_cnt, imp->imp_generation, - atomic_read(&imp->imp_inval_count)); + atomic_read(&imp->imp_inval_count), + ktime_get_real_seconds() - imp->imp_last_reply_time); spin_unlock(&imp->imp_lock); - if (obd->obd_svc_stats == NULL) + if (!obd->obd_svc_stats) goto out_climp; header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; @@ -1239,14 +1153,83 @@ int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); -int -lprocfs_obd_setup(struct obd_device *obd) +static const struct attribute *obd_def_uuid_attrs[] = { + &lustre_attr_uuid.attr, + NULL, +}; + +static const struct attribute *obd_def_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_uuid.attr, + NULL, +}; + +static void obd_sysfs_release(struct kobject *kobj) { - int rc = 0; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - LASSERT(obd->obd_type->typ_procroot != NULL); + complete(&obd->obd_kobj_unregister); +} + +int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ + struct ldebugfs_vars *debugfs_vars = NULL; + int rc; + + if (!obd || obd->obd_magic != OBD_DEVICE_MAGIC) + return -ENODEV; + + rc = kobject_set_name(&obd->obd_kset.kobj, "%s", obd->obd_name); + if (rc) + return rc; + + obd->obd_ktype.sysfs_ops = &lustre_sysfs_ops; + obd->obd_ktype.release = obd_sysfs_release; + + obd->obd_kset.kobj.parent = obd->obd_type->typ_kobj; + obd->obd_kset.kobj.ktype = &obd->obd_ktype; + init_completion(&obd->obd_kobj_unregister); + rc = kset_register(&obd->obd_kset); + if (rc) + return rc; + + if (uuid_only) + obd->obd_attrs = obd_def_uuid_attrs; + else + obd->obd_attrs = obd_def_attrs; + + rc = sysfs_create_files(&obd->obd_kset.kobj, obd->obd_attrs); + if (rc) { + kset_unregister(&obd->obd_kset); + return rc; + } + + if (!obd->obd_type->typ_procroot) + debugfs_vars = obd->obd_debugfs_vars; + obd->obd_debugfs_entry = ldebugfs_register(obd->obd_name, + obd->obd_type->typ_debugfs_entry, + debugfs_vars, obd); + if (IS_ERR_OR_NULL(obd->obd_debugfs_entry)) { + rc = obd->obd_debugfs_entry ? 
PTR_ERR(obd->obd_debugfs_entry) + : -ENOMEM; + CERROR("error %d setting up debugfs for %s\n", + rc, obd->obd_name); + obd->obd_debugfs_entry = NULL; + + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; + } + + if (obd->obd_proc_entry || !obd->obd_type->typ_procroot) + GOTO(already_registered, rc); obd->obd_proc_entry = lprocfs_register(obd->obd_name, obd->obd_type->typ_procroot, @@ -1255,42 +1238,66 @@ lprocfs_obd_setup(struct obd_device *obd) rc = PTR_ERR(obd->obd_proc_entry); CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); obd->obd_proc_entry = NULL; + + ldebugfs_remove(&obd->obd_debugfs_entry); + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; } +already_registered: return rc; } EXPORT_SYMBOL(lprocfs_obd_setup); int lprocfs_obd_cleanup(struct obd_device *obd) { - if (!obd) - return -EINVAL; - if (obd->obd_proc_exports_entry) { - /* Should be no exports left */ - lprocfs_remove(&obd->obd_proc_exports_entry); - obd->obd_proc_exports_entry = NULL; - } - if (obd->obd_proc_entry) { - lprocfs_remove(&obd->obd_proc_entry); - obd->obd_proc_entry = NULL; - } - return 0; + if (!obd) + return -EINVAL; + + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + + if (!IS_ERR_OR_NULL(obd->obd_debugfs_entry)) + ldebugfs_remove(&obd->obd_debugfs_entry); + + /* obd device never allocated a kset */ + if (!obd->obd_kset.kobj.state_initialized) + return 0; + + if (obd->obd_attrs) { + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + } + + kset_unregister(&obd->obd_kset); + wait_for_completion(&obd->obd_kobj_unregister); + return 0; } EXPORT_SYMBOL(lprocfs_obd_cleanup); int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) { - struct lprocfs_counter *cntr; - unsigned int percpusize; - int rc = -ENOMEM; - unsigned long flags = 0; - int i; + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; LASSERT(stats->ls_percpu[cpuid] == NULL); LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); percpusize = lprocfs_stats_counter_size(stats); LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); - if (stats->ls_percpu[cpuid] != NULL) { + if (stats->ls_percpu[cpuid]) { rc = 0; if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) @@ -1317,16 +1324,16 @@ int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) { - struct lprocfs_stats *stats; - unsigned int num_entry; - unsigned int percpusize = 0; - int i; + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; - if (num == 0) - return NULL; + if (num == 0) + return NULL; - if (lprocfs_no_percpu_stats != 0) - flags |= LPROCFS_STATS_FLAG_NOPERCPU; + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; if (flags & LPROCFS_STATS_FLAG_NOPERCPU) num_entry = 1; @@ -1335,7 +1342,7 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, /* alloc percpu pointers for all possible cpu slots */ LIBCFS_ALLOC(stats, offsetof(typeof(*stats), 
ls_percpu[num_entry])); - if (stats == NULL) + if (!stats) return NULL; stats->ls_num = num; @@ -1345,14 +1352,14 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, /* alloc num of counter headers */ LIBCFS_ALLOC(stats->ls_cnt_header, stats->ls_num * sizeof(struct lprocfs_counter_header)); - if (stats->ls_cnt_header == NULL) + if (!stats->ls_cnt_header) goto fail; if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { /* contains only one set counters */ percpusize = lprocfs_stats_counter_size(stats); LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); - if (stats->ls_percpu[0] == NULL) + if (!stats->ls_percpu[0]) goto fail; stats->ls_biggest_alloc_num = 1; } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { @@ -1377,9 +1384,9 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) unsigned int percpusize; unsigned int i; - if (stats == NULL || stats->ls_num == 0) - return; - *statsh = NULL; + if (!stats || stats->ls_num == 0) + return; + *statsh = NULL; if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) num_entry = 1; @@ -1388,9 +1395,9 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) percpusize = lprocfs_stats_counter_size(stats); for (i = 0; i < num_entry; i++) - if (stats->ls_percpu[i] != NULL) + if (stats->ls_percpu[i]) LIBCFS_FREE(stats->ls_percpu[i], percpusize); - if (stats->ls_cnt_header != NULL) + if (stats->ls_cnt_header) LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * sizeof(struct lprocfs_counter_header)); LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); @@ -1425,16 +1432,16 @@ EXPORT_SYMBOL(lprocfs_stats_collector); void lprocfs_clear_stats(struct lprocfs_stats *stats) { - struct lprocfs_counter *percpu_cntr; - int i; - int j; - unsigned int num_entry; - unsigned long flags = 0; + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_entry; i++) { - if (stats->ls_percpu[i] == NULL) + if (!stats->ls_percpu[i]) continue; for (j = 0; j < stats->ls_num; j++) { percpu_cntr = lprocfs_stats_counter_get(stats, i, j); @@ -1456,12 +1463,12 @@ static ssize_t lprocfs_stats_seq_write(struct file *file, const char __user *buf, size_t len, loff_t *off) { - struct seq_file *seq = file->private_data; - struct lprocfs_stats *stats = seq->private; + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; - lprocfs_clear_stats(stats); + lprocfs_clear_stats(stats); - return len; + return len; } static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) @@ -1485,10 +1492,10 @@ static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) /* seq file export of one lprocfs counter */ static int lprocfs_stats_seq_show(struct seq_file *p, void *v) { - struct lprocfs_stats *stats = p->private; - struct lprocfs_counter_header *hdr; - struct lprocfs_counter ctr; - int idx = *(loff_t *)v; + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; if (idx == 0) { struct timespec64 now; @@ -1537,10 +1544,20 @@ static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) if (rc) return rc; seq = file->private_data; - seq->private = inode->i_private ? : PDE_DATA(inode); + seq->private = inode->i_private ? 
inode->i_private : PDE_DATA(inode); return 0; } +const struct file_operations ldebugfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +EXPORT_SYMBOL(ldebugfs_stats_seq_fops); + static const struct proc_ops lprocfs_stats_seq_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lprocfs_stats_seq_open, @@ -1550,15 +1567,6 @@ static const struct proc_ops lprocfs_stats_seq_fops = { .proc_release = lprocfs_seq_release, }; -static const struct file_operations ldebugfs_stats_seq_fops = { - .owner = THIS_MODULE, - .open = lprocfs_stats_seq_open, - .read = seq_read, - .write = lprocfs_stats_seq_write, - .llseek = seq_lseek, - .release = lprocfs_seq_release, -}; - int ldebugfs_register_stats(struct dentry *parent, const char *name, struct lprocfs_stats *stats) { @@ -1583,7 +1591,7 @@ int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, entry = proc_create_data(name, 0644, root, &lprocfs_stats_seq_fops, stats); - if (entry == NULL) + if (!entry) return -ENOMEM; return 0; } @@ -1592,11 +1600,11 @@ EXPORT_SYMBOL(lprocfs_register_stats); void lprocfs_counter_init(struct lprocfs_stats *stats, int index, unsigned conf, const char *name, const char *units) { - struct lprocfs_counter_header *header; - struct lprocfs_counter *percpu_cntr; - unsigned long flags = 0; - unsigned int i; - unsigned int num_cpu; + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; LASSERT(stats != NULL); @@ -1610,7 +1618,7 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_cpu; ++i) { - if (stats->ls_percpu[i] == NULL) + if (!stats->ls_percpu[i]) continue; percpu_cntr = lprocfs_stats_counter_get(stats, i, index); percpu_cntr->lc_count = 0; @@ -1625,49 +1633,23 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, } EXPORT_SYMBOL(lprocfs_counter_init); -/* Note that we only init md counters for ops whose offset is less - * than NUM_MD_STATS. This is explained in a comment in the definition - * of struct md_ops. 
*/ -#define LPROCFS_MD_OP_INIT(base, stats, op) \ - do { \ - unsigned int _idx = base + MD_COUNTER_OFFSET(op); \ - \ - if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) { \ - LASSERT(_idx < stats->ls_num); \ - lprocfs_counter_init(stats, _idx, 0, #op, "reqs"); \ - } \ - } while (0) - -void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) -{ - LPROCFS_MD_OP_INIT(num_private_stats, stats, get_root); - LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); - LPROCFS_MD_OP_INIT(num_private_stats, stats, close); - LPROCFS_MD_OP_INIT(num_private_stats, stats, create); - LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); - LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); - LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); - LPROCFS_MD_OP_INIT(num_private_stats, stats, link); - LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); - LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, fsync); - LPROCFS_MD_OP_INIT(num_private_stats, stats, read_page); - LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); - LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); - LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); - LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); - LPROCFS_MD_OP_INIT(num_private_stats, stats, merge_attr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); - LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); - LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); - LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); - LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); - LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); - LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); -} +static const char * const mps_stats[] = { + [LPROC_MD_CLOSE] = "close", + [LPROC_MD_CREATE] = "create", + [LPROC_MD_ENQUEUE] = "enqueue", + [LPROC_MD_GETATTR] = "getattr", + [LPROC_MD_INTENT_LOCK] = "intent_lock", + [LPROC_MD_LINK] = "link", + [LPROC_MD_RENAME] = "rename", + [LPROC_MD_SETATTR] = "setattr", + [LPROC_MD_FSYNC] = "fsync", + [LPROC_MD_READ_PAGE] = "read_page", + [LPROC_MD_UNLINK] = "unlink", + [LPROC_MD_SETXATTR] = "setxattr", + [LPROC_MD_GETXATTR] = "getxattr", + [LPROC_MD_INTENT_GETATTR_ASYNC] = "intent_getattr_async", + [LPROC_MD_REVALIDATE_LOCK] = "revalidate_lock", +}; int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_private_stats) @@ -1676,11 +1658,8 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_stats; int rc, i; - CLASSERT(offsetof(struct md_ops, MD_STATS_FIRST_OP) == 0); - CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) == 0); - CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) > 0); - - /* TODO Ensure that this function is only used where + /* + * TODO Ensure that this function is only used where * appropriate by adding an assertion to the effect that * obd->obd_type->typ_md_ops is not NULL. 
We can't do this now * because mdt_procfs_init() uses this function to allocate @@ -1690,20 +1669,17 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, */ LASSERT(obd->obd_proc_entry != NULL); LASSERT(obd->obd_md_stats == NULL); - LASSERT(obd->obd_md_cntr_base == 0); - num_stats = NUM_MD_STATS + num_private_stats; + num_stats = ARRAY_SIZE(mps_stats) + num_private_stats; stats = lprocfs_alloc_stats(num_stats, 0); - if (stats == NULL) + if (!stats) return -ENOMEM; - lprocfs_init_mps_stats(num_private_stats, stats); - - for (i = num_private_stats; i < num_stats; i++) { - if (stats->ls_cnt_header[i].lc_name == NULL) { - CERROR("Missing md_stat initializer md_op " - "operation at offset %d. Aborting.\n", - i - num_private_stats); + for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { + lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); + if (!stats->ls_cnt_header[i].lc_name) { + CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", + i); LBUG(); } } @@ -1713,7 +1689,6 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, lprocfs_free_stats(&stats); } else { obd->obd_md_stats = stats; - obd->obd_md_cntr_base = num_private_stats; } return rc; @@ -1724,9 +1699,8 @@ void lprocfs_free_md_stats(struct obd_device *obd) { struct lprocfs_stats *stats = obd->obd_md_stats; - if (stats != NULL) { + if (stats) { obd->obd_md_stats = NULL; - obd->obd_md_cntr_base = 0; lprocfs_free_stats(&stats); } } @@ -1734,24 +1708,24 @@ EXPORT_SYMBOL(lprocfs_free_md_stats); void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { - lprocfs_counter_init(ldlm_stats, - LDLM_ENQUEUE - LDLM_FIRST_OPC, - 0, "ldlm_enqueue", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC, - 0, "ldlm_convert", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC, - 0, "ldlm_cancel", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_BL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_bl_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CP_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_cp_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_GL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_gl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); } EXPORT_SYMBOL(lprocfs_init_ldlm_stats); @@ -1762,7 +1736,7 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, { __s64 ret = 0; - if (lc == NULL || header == NULL) + if (!lc || !header) RETURN(0); switch (field) { @@ -1796,86 +1770,6 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, } EXPORT_SYMBOL(lprocfs_read_helper); -int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, - int mult) -{ - long decimal_val, frac_val; - int prtn; - - if (count < 10) - return -EINVAL; - - decimal_val = val / mult; - prtn = snprintf(buffer, count, "%ld", decimal_val); - frac_val = val % mult; - - if (prtn < (count - 4) && frac_val > 0) { - long temp_frac; - int i, temp_mult = 1, frac_bits = 0; - - temp_frac = frac_val * 10; - buffer[prtn++] = '.'; - 
while (frac_bits < 2 && (temp_frac / mult) < 1 ) { - /* only reserved 2 bits fraction */ - buffer[prtn++] ='0'; - temp_frac *= 10; - frac_bits++; - } - /* - * Need to think these cases : - * 1. #echo x.00 > /proc/xxx output result : x - * 2. #echo x.0x > /proc/xxx output result : x.0x - * 3. #echo x.x0 > /proc/xxx output result : x.x - * 4. #echo x.xx > /proc/xxx output result : x.xx - * Only reserved 2 bits fraction. - */ - for (i = 0; i < (5 - prtn); i++) - temp_mult *= 10; - - frac_bits = min((int)count - prtn, 3 - frac_bits); - prtn += snprintf(buffer + prtn, frac_bits, "%ld", - frac_val * temp_mult / mult); - - prtn--; - while(buffer[prtn] < '1' || buffer[prtn] > '9') { - prtn--; - if (buffer[prtn] == '.') { - prtn--; - break; - } - } - prtn++; - } - buffer[prtn++] ='\n'; - return prtn; -} -EXPORT_SYMBOL(lprocfs_read_frac_helper); - -int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) -{ - long decimal_val, frac_val; - - decimal_val = val / mult; - seq_printf(m, "%ld", decimal_val); - frac_val = val % mult; - - if (frac_val > 0) { - frac_val *= 100; - frac_val /= mult; - } - if (frac_val > 0) { - /* Three cases: x0, xx, 0x */ - if ((frac_val % 10) != 0) - seq_printf(m, ".%ld", frac_val); - else - seq_printf(m, ".%ld", frac_val / 10); - } - - seq_printf(m, "\n"); - return 0; -} -EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); - /* Obtains the conversion factor for the unit specified */ static int get_mult(char unit, __u64 *mult) { @@ -1886,19 +1780,19 @@ static int get_mult(char unit, __u64 *mult) case 'p': case 'P': units <<= 10; - /* Fall through */ + /* fallthrough */ case 't': case 'T': units <<= 10; - /* Fall through */ + /* fallthrough */ case 'g': case 'G': units <<= 10; - /* Fall through */ + /* fallthrough */ case 'm': case 'M': units <<= 10; - /* Fall through */ + /* fallthrough */ case 'k': case 'K': units <<= 10; @@ -2043,7 +1937,7 @@ static int str_to_u64_parse(char *buffer, unsigned long count, } /* the multiplier limits how large the value can be */ - wrap_indicator /= mult; + wrap_indicator = div64_u64(wrap_indicator, mult); if (strwhole) { rc = kstrtoull(strwhole, base, &whole); @@ -2094,8 +1988,7 @@ static int str_to_u64_parse(char *buffer, unsigned long count, * of the signed integer. */ static int str_to_s64_internal(const char __user *buffer, unsigned long count, - __s64 *val, __u64 def_mult, bool allow_units, - bool kernel_space) + __s64 *val, __u64 def_mult, bool allow_units) { char kernbuf[22]; __u64 tmp; @@ -2107,12 +2000,8 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (kernel_space) { - memcpy(kernbuf, buffer, count); - } else { - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - } + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; kernbuf[count] = '\0'; @@ -2138,29 +2027,6 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, return 0; } -/** - * Convert a user string into a signed 64 bit number. This function produces - * an error when the value parsed from the string underflows or - * overflows. This function accepts strings which contain digits and - * optionally a decimal or hex strings which are prefixed with "0x". 
- * - * \param[in] buffer string consisting of numbers and optionally a decimal - * \param[in] count buffer length - * \param[in] val if successful, the value represented by the string - * - * \retval 0 on success - * \retval negative number on error - */ -int lprocfs_str_to_s64(struct file *file, const char __user *buffer, - unsigned long count, __s64 *val) -{ - bool kernel_space; - - kernel_space = lprocfs_file_is_kernel(file); - return str_to_s64_internal(buffer, count, val, 1, false, kernel_space); -} -EXPORT_SYMBOL(lprocfs_str_to_s64); - /** * Convert a user string into a signed 64 bit number. This function produces * an error when the value parsed from the string times multiplier underflows or @@ -2178,12 +2044,11 @@ EXPORT_SYMBOL(lprocfs_str_to_s64); * \retval 0 on success * \retval negative number on error */ -int lprocfs_str_with_units_to_s64(struct file *file, const char __user *buffer, +int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit) { __u64 mult = 1; int rc; - bool kernel_space; if (defunit != '1') { rc = get_mult(defunit, &mult); @@ -2191,10 +2056,7 @@ int lprocfs_str_with_units_to_s64(struct file *file, const char __user *buffer, return rc; } - kernel_space = lprocfs_file_is_kernel(file); - - return str_to_s64_internal(buffer, count, val, mult, true, - kernel_space); + return str_to_s64_internal(buffer, count, val, mult, true); } EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); @@ -2228,7 +2090,7 @@ char *lprocfs_find_named_value(const char *buffer, const char *name, /* there is no strnstr() in rhel5 and ubuntu kernels */ val = lprocfs_strnstr(buffer, name, buflen); - if (val == NULL) + if (!val) return (char *)buffer; val += strlen(name); /* skip prefix */ @@ -2275,7 +2137,7 @@ int lprocfs_seq_create(struct proc_dir_entry *parent, entry = proc_create_data(name, mode, parent, seq_fops, data); - if (entry == NULL) + if (!entry) RETURN(-ENOMEM); RETURN(0); @@ -2317,12 +2179,12 @@ EXPORT_SYMBOL(lprocfs_oh_tally_log2); unsigned long lprocfs_oh_sum(struct obd_histogram *oh) { - unsigned long ret = 0; - int i; + unsigned long ret = 0; + int i; - for (i = 0; i < OBD_HIST_MAX; i++) - ret += oh->oh_buckets[i]; - return ret; + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; } EXPORT_SYMBOL(lprocfs_oh_sum); @@ -2379,9 +2241,9 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, struct client_obd *cli = &dev->u.cli; struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; int chunk_mask, rc; - __s64 val; + s64 val; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -2411,9 +2273,59 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, } EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); -int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, - unsigned long count, struct root_squash_info *squash, - char *name) +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + rc = sprintf(buf, "%d\n", cli->cl_max_short_io_bytes); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_show); + +/* Used to catch people who think they're specifying pages. 
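For the size suffixes accepted above, get_mult() builds the multiplier by deliberately falling through the switch and shifting by 10 bits per level: 'k' gives 2^10, 'm' 2^20, up to 'p' at 2^50. The same cascade as a self-contained illustration (plain C; the overflow guard the kernel code adds with div64_u64() is omitted here):

	#include <stdint.h>

	/* Return the byte multiplier for a size suffix, or 0 for an unknown one. */
	static uint64_t suffix_to_mult(char unit)
	{
		uint64_t mult = 1;

		switch (unit) {
		case 'p': case 'P':
			mult <<= 10;
			/* fallthrough */
		case 't': case 'T':
			mult <<= 10;
			/* fallthrough */
		case 'g': case 'G':
			mult <<= 10;
			/* fallthrough */
		case 'm': case 'M':
			mult <<= 10;
			/* fallthrough */
		case 'k': case 'K':
			mult <<= 10;
			break;
		default:
			return 0;
		}
		return mult;
	}

With this, an input such as "64m" resolves to 64 * suffix_to_mult('m'), i.e. 64 << 20 bytes.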
*/ +#define MIN_SHORT_IO_BYTES 64U + +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + u32 val; + int rc; + + LPROCFS_CLIMP_CHECK(dev); + + rc = kstrtouint(buffer, 0, &val); + if (rc) + GOTO(out, rc); + + if (val && (val < MIN_SHORT_IO_BYTES || val > OBD_MAX_SHORT_IO_BYTES)) + GOTO(out, rc = -ERANGE); + + rc = count; + + spin_lock(&cli->cl_loi_list_lock); + if (val > (cli->cl_max_pages_per_rpc << PAGE_SHIFT)) + rc = -ERANGE; + else + cli->cl_max_short_io_bytes = val; + spin_unlock(&cli->cl_loi_list_lock); + +out: + LPROCFS_CLIMP_EXIT(dev); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_store); + +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) { int rc; char kernbuf[64], *tmp, *errmsg; @@ -2424,7 +2336,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, errmsg = "string too long"; GOTO(failed_noprint, rc = -EINVAL); } - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { + if (copy_from_user(kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed_noprint, rc = -EFAULT); } @@ -2432,7 +2344,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, /* look for uid gid separator */ tmp = strchr(kernbuf, ':'); - if (tmp == NULL) { + if (!tmp) { errmsg = "needs uid:gid format"; GOTO(failed, rc = -EINVAL); } @@ -2459,7 +2371,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, RETURN(count); failed: - if (tmp != NULL) { + if (tmp) { tmp--; *tmp = ':'; } @@ -2474,8 +2386,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, EXPORT_SYMBOL(lprocfs_wr_root_squash); -int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, - unsigned long count, +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, struct root_squash_info *squash, char *name) { int rc; @@ -2491,11 +2402,11 @@ int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, } OBD_ALLOC(kernbuf, count + 1); - if (kernbuf == NULL) { + if (!kernbuf) { errmsg = "no memory"; GOTO(failed, rc = -ENOMEM); } - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { + if (copy_from_user(kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed, rc = -EFAULT); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c index 6d78831dd37fe..4df66a941e535 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,12 +34,57 @@ #define DEBUG_SUBSYSTEM S_CLASS +#include +#include #include #include -#include #include +#define MAX_STRING_SIZE 128 + +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...) 
+{ + struct dentry *entry = NULL; + struct dentry *parent; + struct qstr dname; + va_list ap; + char *dest; + + if (!target || !format) + return NULL; + + dname.name = target; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, + dname.name, dname.len); + parent = d_lookup(debugfs_lustre_root, &dname); + if (!parent) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + goto no_entry; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = debugfs_create_symlink(name, parent, dest); + if (IS_ERR_OR_NULL(entry)) { + CERROR("LdebugFS: Could not create symbolic link from %s to %s\n", + name, dest); + entry = NULL; + } + + OBD_FREE(dest, MAX_STRING_SIZE + 1); +no_entry: + dput(parent); + return entry; +} +EXPORT_SYMBOL(ldebugfs_add_symlink); + #ifdef CONFIG_PROC_FS int lprocfs_evict_client_open(struct inode *inode, struct file *f) @@ -79,7 +124,7 @@ lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, * bytes into kbuf, to ensure that the string is NUL-terminated. * UUID_MAX should include a trailing NUL already. */ - if (lprocfs_copy_from_user(file, kbuf, buffer, + if (copy_from_user(kbuf, buffer, min_t(unsigned long, BUFLEN - 1, count))) { count = -EFAULT; goto out; @@ -104,15 +149,108 @@ EXPORT_SYMBOL(lprocfs_evict_client_seq_write); #undef BUFLEN -int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(num_exports_show); + +static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) +{ + bool first = true; + + flag2str(exp, failed); + flag2str(exp, in_recovery); + flag2str(exp, disconnected); + flag2str(exp, connecting); - LASSERT(obd != NULL); - seq_printf(m, "%u\n", obd->obd_num_exports); return 0; } -EXPORT_SYMBOL(lprocfs_num_exports_seq_show); + +static int +lprocfs_exp_print_export_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct obd_device *obd; + struct obd_connect_data *ocd; + + LASSERT(exp != NULL); + if (exp->exp_nid_stats == NULL) + goto out; + obd = exp->exp_obd; + ocd = &exp->exp_connect_data; + + seq_printf(m, "%s:\n" + " name: %s\n" + " client: %s\n" + " connect_flags: [ ", + obd_uuid2str(&exp->exp_client_uuid), + obd->obd_name, + obd_export_nid2str(exp)); + obd_connect_seq_flags2str(m, ocd->ocd_connect_flags, + ocd->ocd_connect_flags2, ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " export_flags: [ "); + obd_export_flags2str(exp, m); + seq_printf(m, " ]\n"); + + if (obd->obd_type && + strcmp(obd->obd_type->typ_name, "obdfilter") == 0) { + struct filter_export_data *fed = &exp->exp_filter_data; + + seq_printf(m, " grant:\n"); + seq_printf(m, " granted: %ld\n", + fed->fed_ted.ted_grant); + seq_printf(m, " dirty: %ld\n", + fed->fed_ted.ted_dirty); + seq_printf(m, " pending: %ld\n", + fed->fed_ted.ted_pending); + } + +out: + return 0; +} + +/** + * RPC connections are composed of an import and an export. Using the + * lctl utility we can extract important information about the state. 
+ * The lprocfs_exp_export_seq_show routine displays the state information + * for the export. + * + * \param[in] m seq file + * \param[in] data unused + * + * \retval 0 on success + * + * The format of the export state information is like: + * a793e354-49c0-aa11-8c4f-a4f2b1a1a92b: + * name: MGS + * client: 10.211.55.10@tcp + * connect_flags: [ version, barrier, adaptive_timeouts, ... ] + * connect_data: + * flags: 0x2000011005002020 + * instance: 0 + * target_version: 2.10.51.0 + * export_flags: [ ... ] + * + */ +static int lprocfs_exp_export_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_export_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_export); static void lprocfs_free_client_stats(struct nid_stat *client_stat) { @@ -259,6 +397,30 @@ int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) } LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); +int lprocfs_exp_print_fmd_count_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "%d\n", ted->ted_fmd_count); + + return 0; +} + +int lprocfs_exp_fmd_count_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_fmd_count_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_fmd_count); + int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) { seq_puts(m, "Write into this file to clear all nid stats and stale nid entries\n"); @@ -384,7 +546,8 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_nodemap_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("Error adding the nodemap file: rc = %d\n", rc); + CWARN("%s: error adding the nodemap file: rc = %d\n", + obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -392,7 +555,8 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_uuid_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("Error adding the NID stats file: rc = %d\n", rc); + CWARN("%s: error adding the NID stats file: rc = %d\n", + obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -400,7 +564,17 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_hash_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("Error adding the hash file: rc = %d\n", rc); + CWARN("%s: error adding the hash file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "export", + new_stat, &lprocfs_exp_export_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the export file: rc = %d\n", + obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -408,7 +582,16 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_replydata_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("%s: Error adding the reply_data file: rc = %d\n", + CWARN("%s: error adding the reply_data file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "fmd_count", new_stat, + &lprocfs_exp_fmd_count_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the fmd_count file: rc 
= %d\n", obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -449,92 +632,24 @@ int lprocfs_exp_cleanup(struct obd_export *exp) return 0; } -#define LPROCFS_OBD_OP_INIT(base, stats, op) \ -do { \ - unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ - LASSERT(coffset < stats->ls_num); \ - lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ -} while (0) - -void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) -{ - LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); - - CLASSERT(NUM_OBD_STATS == OBD_COUNTER_OFFSET(putref) + 1); -} -EXPORT_SYMBOL(lprocfs_init_ops_stats); - -int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned int num_stats) { struct lprocfs_stats *stats; - unsigned int num_stats; - int rc, i; + int rc; LASSERT(obd->obd_stats == NULL); LASSERT(obd->obd_proc_entry != NULL); - LASSERT(obd->obd_cntr_base == 0); - num_stats = NUM_OBD_STATS + num_private_stats; stats = lprocfs_alloc_stats(num_stats, 0); if (stats == NULL) return -ENOMEM; - lprocfs_init_ops_stats(num_private_stats, stats); - - for (i = num_private_stats; i < num_stats; i++) { - /* If this LBUGs, it is likely that an obd - * operation was added to struct obd_ops in - * , and that the corresponding line item - * LPROCFS_OBD_OP_INIT(.., 
.., opname) - * is missing from the list above. */ - LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, - "Missing obd_stat initializer obd_op " - "operation at offset %d.\n", i - num_private_stats); - } rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); - if (rc < 0) { + if (rc < 0) lprocfs_free_stats(&stats); - } else { - obd->obd_stats = stats; - obd->obd_cntr_base = num_private_stats; - } + else + obd->obd_stats = stats; + return rc; } EXPORT_SYMBOL(lprocfs_alloc_obd_stats); @@ -569,7 +684,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) LASSERT(obd != NULL); seq_printf(m, "status: "); - if (obd->obd_max_recoverable_clients == 0) { + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { seq_printf(m, "INACTIVE\n"); goto out; } @@ -585,9 +700,9 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds() - obd->obd_recovery_start); /* Number of clients that have completed recovery */ seq_printf(m, "completed_clients: %d/%d\n", - obd->obd_max_recoverable_clients - + atomic_read(&obd->obd_max_recoverable_clients) - obd->obd_stale_clients, - obd->obd_max_recoverable_clients); + atomic_read(&obd->obd_max_recoverable_clients)); seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); seq_printf(m, "last_transno: %lld\n", @@ -643,7 +758,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds())); seq_printf(m, "connected_clients: %d/%d\n", atomic_read(&obd->obd_connected_clients), - obd->obd_max_recoverable_clients); + atomic_read(&obd->obd_max_recoverable_clients)); /* Number of clients that have completed recovery */ seq_printf(m, "req_replay_clients: %d\n", atomic_read(&obd->obd_req_replay_clients)); @@ -663,27 +778,25 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); -int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data) +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - seq_printf(m, "%d\n", obd->obd_recovery_ir_factor); - return 0; + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_ir_factor); } -EXPORT_SYMBOL(lprocfs_ir_factor_seq_show); +EXPORT_SYMBOL(ir_factor_show); -ssize_t -lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; int rc; - __s64 val; - LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoint(buffer, 10, &val); if (rc) return rc; @@ -693,7 +806,7 @@ lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, obd->obd_recovery_ir_factor = val; return count; } -EXPORT_SYMBOL(lprocfs_ir_factor_seq_write); +EXPORT_SYMBOL(ir_factor_store); int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) { @@ -711,93 +824,85 @@ lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, { struct seq_file *m = file->private_data; struct obd_device *obd = m->private; + bool val; int rc; - __s64 val; LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc 
= kstrtobool_from_user(buffer, count, &val); if (rc) return rc; - obd->obd_checksum_dump = !!val; + obd->obd_checksum_dump = val; return count; } EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); -int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data) +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - seq_printf(m, "%llu\n", obd->obd_recovery_timeout); - return 0; + return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_timeout); } -EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_show); +EXPORT_SYMBOL(recovery_time_soft_show); -ssize_t -lprocfs_recovery_time_soft_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; obd->obd_recovery_timeout = val; return count; } -EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_write); +EXPORT_SYMBOL(recovery_time_soft_store); -int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data) +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - seq_printf(m, "%lld\n", obd->obd_recovery_time_hard); - return 0; + return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_time_hard); } -EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_show); +EXPORT_SYMBOL(recovery_time_hard_show); -ssize_t -lprocfs_recovery_time_hard_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; obd->obd_recovery_time_hard = val; return count; } -EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_write); +EXPORT_SYMBOL(recovery_time_hard_store); -int lprocfs_target_instance_seq_show(struct seq_file *m, void *data) +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); struct obd_device_target *target = &obd->u.obt; - LASSERT(obd != NULL); LASSERT(target->obt_magic == OBT_MAGIC); - seq_printf(m, "%u\n", obd->u.obt.obt_instance); - return 0; + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.obt.obt_instance); } -EXPORT_SYMBOL(lprocfs_target_instance_seq_show); +EXPORT_SYMBOL(instance_show); #endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c 
b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c index 21a137bad0bae..42e880e8a3948 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,8 @@ #include #include #include /* hash_long() */ +#include +#include #include #include #include @@ -51,6 +53,28 @@ #include #include +struct lu_site_bkt_data { + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()) or initialized (lu_object_start()). + * It is used by lu_object_find() to wait before re-trying when + * object in the process of destruction is found in the hash table; + * or wait object to be initialized by the allocator. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_waitq; +}; + enum { LU_CACHE_PERCENT_MAX = 50, LU_CACHE_PERCENT_DEFAULT = 20 @@ -85,6 +109,18 @@ MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); static void lu_object_free(const struct lu_env *env, struct lu_object *o); static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + return &bkt->lsb_waitq; +} +EXPORT_SYMBOL(lu_site_wq_from_fid); + /** * Decrease reference counter on object. If last reference is freed, return * object to the cache, unless lu_object_is_dying(o) holds. In the latter @@ -93,22 +129,18 @@ static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); void lu_object_put(const struct lu_env *env, struct lu_object *o) { struct lu_site_bkt_data *bkt; - struct lu_object_header *top; - struct lu_site *site; - struct lu_object *orig; + struct lu_object_header *top = o->lo_header; + struct lu_site *site = o->lo_dev->ld_site; + struct lu_object *orig = o; struct cfs_hash_bd bd; - const struct lu_fid *fid; - - top = o->lo_header; - site = o->lo_dev->ld_site; - orig = o; + const struct lu_fid *fid = lu_object_fid(o); + bool is_dying; /* * till we have full fids-on-OST implemented anonymous objects * are possible in OSP. such an object isn't listed in the site * so we should not remove it from the site. */ - fid = lu_object_fid(o); if (fid_is_zero(fid)) { LASSERT(top->loh_hash.next == NULL && top->loh_hash.pprev == NULL); @@ -126,13 +158,19 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o) cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + is_dying = lu_object_is_dying(top); if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { - if (lu_object_is_dying(top)) { + /* at this point the object reference is dropped and lock is + * not taken, so lu_object should not be touched because it + * can be freed by concurrent thread. 
Use local variable for + * check. + */ + if (is_dying) { /* * somebody may be waiting for this, currently only * used for cl_object, see cl_object_put_last(). */ - wake_up_all(&bkt->lsb_marche_funebre); + wake_up_all(&bkt->lsb_waitq); } return; } @@ -146,15 +184,17 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o) o->lo_ops->loo_object_release(env, o); } + /* don't use local 'is_dying' here because if was taken without lock + * but here we need the latest actual value of it so check lu_object + * directly here. + */ if (!lu_object_is_dying(top) && (lu_object_exists(orig) || lu_object_is_cl(orig))) { LASSERT(list_empty(&top->loh_lru)); list_add_tail(&top->loh_lru, &bkt->lsb_lru); - bkt->lsb_lru_len++; percpu_counter_inc(&site->ls_lru_len_counter); - CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, " - "lru_len: %ld\n", - o, site->ls_obj_hash, bkt, bkt->lsb_lru_len); + CDEBUG(D_INODE, "Add %p/%p to site lru. hash: %p, bkt: %p\n", + orig, top, site->ls_obj_hash, bkt); cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); return; } @@ -213,7 +253,6 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o) list_del_init(&top->loh_lru); bkt = cfs_hash_bd_extra_get(obj_hash, &bd); - bkt->lsb_lru_len--; percpu_counter_dec(&site->ls_lru_len_counter); } cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); @@ -230,17 +269,9 @@ EXPORT_SYMBOL(lu_object_unhash); */ static struct lu_object *lu_object_alloc(const struct lu_env *env, struct lu_device *dev, - const struct lu_fid *f, - const struct lu_object_conf *conf) + const struct lu_fid *f) { - struct lu_object *scan; struct lu_object *top; - struct list_head *layers; - unsigned int init_mask = 0; - unsigned int init_flag; - int clean; - int result; - ENTRY; /* * Create top-level object slice. This will also create @@ -248,15 +279,36 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, */ top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); if (top == NULL) - RETURN(ERR_PTR(-ENOMEM)); + return ERR_PTR(-ENOMEM); if (IS_ERR(top)) - RETURN(top); - /* - * This is the only place where object fid is assigned. It's constant - * after this point. - */ - top->lo_header->loh_fid = *f; - layers = &top->lo_header->loh_layers; + return top; + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + + return top; +} + +/** + * Initialize object. + * + * This is called after object hash insertion to avoid returning an object with + * stale attributes. 
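The comment above covers the publish half of this rework: the header is hashed first and its slices are filled in afterwards, so a looked-up object can briefly be visible but not yet usable. Condensed from the code in this hunk (error handling and the bucket locking trimmed, so this is an outline rather than buildable code):

	o = lu_object_alloc(env, dev, f);         /* slices allocated, fid assigned   */
	cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
	                                          /* visible in the hash, not INITED  */
	rc = lu_object_start(env, dev, o, conf);  /* run loo_object_init()/_start()   */
	                                          /* sets LU_OBJECT_INITED on success */
	wake_up_all(&bkt->lsb_waitq);             /* release lookups parked on bucket */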
+ */ +static int lu_object_start(const struct lu_env *env, struct lu_device *dev, + struct lu_object *top, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + layers = &top->lo_header->loh_layers; do { /* @@ -271,10 +323,9 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, clean = 0; scan->lo_header = top->lo_header; result = scan->lo_ops->loo_object_init(env, scan, conf); - if (result != 0) { - lu_object_free(env, top); - RETURN(ERR_PTR(result)); - } + if (result) + return result; + init_mask |= init_flag; next: init_flag <<= 1; @@ -282,17 +333,18 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, } while (!clean); list_for_each_entry_reverse(scan, layers, lo_linkage) { - if (scan->lo_ops->loo_object_start != NULL) { - result = scan->lo_ops->loo_object_start(env, scan); - if (result != 0) { - lu_object_free(env, top); - RETURN(ERR_PTR(result)); - } - } - } + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result) + return result; + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); - lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); - RETURN(top); + set_bit(LU_OBJECT_INITED, &top->lo_header->loh_flags); + + return 0; } /** @@ -300,15 +352,15 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, */ static void lu_object_free(const struct lu_env *env, struct lu_object *o) { - struct lu_site_bkt_data *bkt; + wait_queue_head_t *wq; struct lu_site *site; struct lu_object *scan; struct list_head *layers; struct list_head splice; - site = o->lo_dev->ld_site; - layers = &o->lo_header->loh_layers; - bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); /* * First call ->loo_object_delete() method to release all resources. */ @@ -337,8 +389,8 @@ static void lu_object_free(const struct lu_env *env, struct lu_object *o) o->lo_ops->loo_object_free(env, o); } - if (waitqueue_active(&bkt->lsb_marche_funebre)) - wake_up_all(&bkt->lsb_marche_funebre); + if (waitqueue_active(wq)) + wake_up_all(wq); } /** @@ -399,7 +451,6 @@ int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, cfs_hash_bd_del_locked(s->ls_obj_hash, &bd2, &h->loh_hash); list_move(&h->loh_lru, &dispose); - bkt->lsb_lru_len--; percpu_counter_dec(&s->ls_lru_len_counter); if (did_sth == 0) did_sth = 1; @@ -591,7 +642,6 @@ static struct lu_object *htable_lookup(struct lu_site *s, const struct lu_fid *f, __u64 *version) { - struct lu_site_bkt_data *bkt; struct lu_object_header *h; struct hlist_node *hnode; __u64 ver = cfs_hash_bd_version_get(bd); @@ -600,7 +650,6 @@ static struct lu_object *htable_lookup(struct lu_site *s, return ERR_PTR(-ENOENT); *version = ver; - bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); /* cfs_hash_bd_peek_locked is a somehow "internal" function * of cfs_hash, it doesn't add refcount on object. 
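A related change running through this file is the removal of the per-bucket lsb_lru_len field in favour of the site-wide ls_lru_len_counter, a percpu_counter: the LRU add/remove paths stay cheap, and only the stats and shrinker code pays for a full sum. A minimal sketch of that API with a hypothetical counter:

	#include <linux/types.h>
	#include <linux/gfp.h>
	#include <linux/percpu_counter.h>

	static struct percpu_counter lru_len;

	static int lru_len_setup(void)
	{
		/* distributed counter: updates are batched per CPU */
		return percpu_counter_init(&lru_len, 0, GFP_KERNEL);
	}

	static void lru_len_track(bool added)
	{
		/* hot path: no global lock in the common case */
		if (added)
			percpu_counter_inc(&lru_len);
		else
			percpu_counter_dec(&lru_len);
	}

	static s64 lru_len_read(void)
	{
		/* slow path: fold every per-CPU delta, clamped at zero */
		return percpu_counter_sum_positive(&lru_len);
	}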
*/ hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); @@ -614,7 +663,6 @@ static struct lu_object *htable_lookup(struct lu_site *s, lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); if (!list_empty(&h->loh_lru)) { list_del_init(&h->loh_lru); - bkt->lsb_lru_len--; percpu_counter_dec(&s->ls_lru_len_counter); } return lu_object_top(h); @@ -657,29 +705,6 @@ static void lu_object_limit(const struct lu_env *env, MIN(size - nr, LU_CACHE_NR_MAX_ADJUST), 0); } -static struct lu_object *lu_object_new(const struct lu_env *env, - struct lu_device *dev, - const struct lu_fid *f, - const struct lu_object_conf *conf) -{ - struct lu_object *o; - struct cfs_hash *hs; - struct cfs_hash_bd bd; - - o = lu_object_alloc(env, dev, f, conf); - if (unlikely(IS_ERR(o))) - return o; - - hs = dev->ld_site->ls_obj_hash; - cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); - cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); - cfs_hash_bd_unlock(hs, &bd, 1); - - lu_object_limit(env, dev); - - return o; -} - /** * Core logic of lu_object_find*() functions. * @@ -697,7 +722,19 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, struct lu_site *s; struct cfs_hash *hs; struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + struct l_wait_info lwi = { 0 }; __u64 version = 0; + int rc; + + ENTRY; + + /* FID is from disk or network, zero FID is meaningless, return error + * early to avoid assertion in lu_object_put. If a zero FID is wanted, + * it should be allocated via lu_object_anon(). + */ + if (fid_is_zero(f)) + RETURN(ERR_PTR(-EINVAL)); /* * This uses standard index maintenance protocol: @@ -716,46 +753,99 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, * It is unnecessary to perform lookup-alloc-lookup-insert, instead, * just alloc and insert directly. * - * If dying object is found during index search, add @waiter to the - * site wait-queue and return ERR_PTR(-EAGAIN). */ - if (conf && conf->loc_flags & LOC_F_NEW) - return lu_object_new(env, dev, f, conf); - s = dev->ld_site; hs = s->ls_obj_hash; - cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); - o = htable_lookup(s, &bd, f, &version); - cfs_hash_bd_unlock(hs, &bd, 1); - if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT) - return o; + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_OBD_ZERO_NLINK_RACE))) + lu_site_purge(env, s, -1); + + cfs_hash_bd_get(hs, f, &bd); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + if (!(conf && conf->loc_flags & LOC_F_NEW)) { + cfs_hash_bd_lock(hs, &bd, 1); + o = htable_lookup(s, &bd, f, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + + if (!IS_ERR(o)) { + if (likely(lu_object_is_inited(o->lo_header))) + RETURN(o); + + l_wait_event(bkt->lsb_waitq, + lu_object_is_inited(o->lo_header) || + lu_object_is_dying(o->lo_header), &lwi); + + if (lu_object_is_dying(o->lo_header)) { + lu_object_put(env, o); + + RETURN(ERR_PTR(-ENOENT)); + } + + RETURN(o); + } + + if (PTR_ERR(o) != -ENOENT) + RETURN(o); + } /* - * Allocate new object. This may result in rather complicated - * operations, including fld queries, inode loading, etc. + * Allocate new object, NB, object is unitialized in case object + * is changed between allocation and hash insertion, thus the object + * with stale attributes is returned. 
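The comment above explains why a freshly allocated object may still carry stale attributes until lu_object_start() runs; the lookup path earlier in lu_object_find_at() handles exactly that window by waiting on the bucket queue rather than returning such an object. In outline (Lustre's RETURN() macro written as a plain return, error paths trimmed):

	o = htable_lookup(s, &bd, f, &version);
	if (!IS_ERR(o)) {
		if (likely(lu_object_is_inited(o->lo_header)))
			return o;
		/* hashed but not started yet: wait for the allocator, or for death */
		l_wait_event(bkt->lsb_waitq,
			     lu_object_is_inited(o->lo_header) ||
			     lu_object_is_dying(o->lo_header), &lwi);
		if (lu_object_is_dying(o->lo_header)) {
			lu_object_put(env, o);
			return ERR_PTR(-ENOENT);
		}
		return o;
	}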
*/ - o = lu_object_alloc(env, dev, f, conf); - if (unlikely(IS_ERR(o))) - return o; + o = lu_object_alloc(env, dev, f); + if (IS_ERR(o)) + RETURN(o); LASSERT(lu_fid_eq(lu_object_fid(o), f)); + CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE); + cfs_hash_bd_lock(hs, &bd, 1); - shadow = htable_lookup(s, &bd, f, &version); - if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) { + if (conf && conf->loc_flags & LOC_F_NEW) + shadow = ERR_PTR(-ENOENT); + else + shadow = htable_lookup(s, &bd, f, &version); + if (likely(PTR_ERR(shadow) == -ENOENT)) { cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); cfs_hash_bd_unlock(hs, &bd, 1); + /* + * This may result in rather complicated operations, including + * fld queries, inode loading, etc. + */ + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_put_nocache(env, o); + RETURN(ERR_PTR(rc)); + } + + wake_up_all(&bkt->lsb_waitq); + lu_object_limit(env, dev); - return o; + RETURN(o); } lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); cfs_hash_bd_unlock(hs, &bd, 1); lu_object_free(env, o); - return shadow; + + if (!(conf && conf->loc_flags & LOC_F_NEW) && + !lu_object_is_inited(shadow->lo_header)) { + l_wait_event(bkt->lsb_waitq, + lu_object_is_inited(shadow->lo_header) || + lu_object_is_dying(shadow->lo_header), &lwi); + + if (lu_object_is_dying(shadow->lo_header)) { + lu_object_put(env, shadow); + + RETURN(ERR_PTR(-ENOENT)); + } + } + + RETURN(shadow); } EXPORT_SYMBOL(lu_object_find_at); @@ -1042,7 +1132,7 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); INIT_LIST_HEAD(&bkt->lsb_lru); - init_waitqueue_head(&bkt->lsb_marche_funebre); + init_waitqueue_head(&bkt->lsb_waitq); } s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); @@ -1386,7 +1476,8 @@ static void key_fini(struct lu_context *ctx, int index) key->lct_fini(ctx, key, ctx->lc_value[index]); lu_ref_del(&key->lct_reference, "ctx", ctx); - atomic_dec(&key->lct_used); + if (atomic_dec_and_test(&key->lct_used)) + wake_up_var(&key->lct_used); LASSERT(key->lct_owner != NULL); if ((ctx->lc_tags & LCT_NOREF) == 0) { @@ -1407,29 +1498,23 @@ void lu_context_key_degister(struct lu_context_key *key) lu_context_key_quiesce(key); - write_lock(&lu_keys_guard); - ++key_set_version; key_fini(&lu_shrink_env.le_ctx, key->lct_index); /** * Wait until all transient contexts referencing this key have * run lu_context_key::lct_fini() method. */ - while (atomic_read(&key->lct_used) > 1) { - write_unlock(&lu_keys_guard); - CDEBUG(D_INFO, "lu_context_key_degister: \"%s\" %p, %d\n", - key->lct_owner ? 
key->lct_owner->name : "", key, - atomic_read(&key->lct_used)); - schedule(); - write_lock(&lu_keys_guard); - } + atomic_dec(&key->lct_used); + wait_var_event(&key->lct_used, atomic_read(&key->lct_used) == 0); + + write_lock(&lu_keys_guard); if (lu_keys[key->lct_index]) { lu_keys[key->lct_index] = NULL; lu_ref_fini(&key->lct_reference); } write_unlock(&lu_keys_guard); - LASSERTF(atomic_read(&key->lct_used) == 1, + LASSERTF(atomic_read(&key->lct_used) == 0, "key has instances: %d\n", atomic_read(&key->lct_used)); } @@ -1893,6 +1978,119 @@ int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, } EXPORT_SYMBOL(lu_env_refill_by_tags); +#ifdef HAVE_SERVER_SUPPORT +struct lu_env_item { + struct task_struct *lei_task; /* rhashtable key */ + struct rhash_head lei_linkage; + struct lu_env *lei_env; + struct rcu_head lei_rcu_head; +}; + +static const struct rhashtable_params lu_env_rhash_params = { + .key_len = sizeof(struct task_struct *), + .key_offset = offsetof(struct lu_env_item, lei_task), + .head_offset = offsetof(struct lu_env_item, lei_linkage), +}; + +struct rhashtable lu_env_rhash; + +struct lu_env_percpu { + struct task_struct *lep_task; + struct lu_env *lep_env ____cacheline_aligned_in_smp; +}; + +static struct lu_env_percpu lu_env_percpu[NR_CPUS]; + +int lu_env_add(struct lu_env *env) +{ + struct lu_env_item *lei, *old; + + LASSERT(env); + + OBD_ALLOC_PTR(lei); + if (!lei) + return -ENOMEM; + + lei->lei_task = current; + lei->lei_env = env; + + old = rhashtable_lookup_get_insert_fast(&lu_env_rhash, + &lei->lei_linkage, + lu_env_rhash_params); + LASSERT(!old); + + return 0; +} +EXPORT_SYMBOL(lu_env_add); + +static void lu_env_item_free(struct rcu_head *head) +{ + struct lu_env_item *lei; + + lei = container_of(head, struct lu_env_item, lei_rcu_head); + OBD_FREE_PTR(lei); +} + +void lu_env_remove(struct lu_env *env) +{ + struct lu_env_item *lei; + const void *task = current; + int i; + + for_each_possible_cpu(i) { + if (lu_env_percpu[i].lep_env == env) { + LASSERT(lu_env_percpu[i].lep_task == task); + lu_env_percpu[i].lep_task = NULL; + lu_env_percpu[i].lep_env = NULL; + } + } + + /* The rcu_lock is not taking in this case since the key + * used is the actual task_struct. This implies that each + * object is only removed by the owning thread, so there + * can never be a race on a particular object. 
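The comment above is also why lu_env_remove() gets away without rcu_read_lock(): entries are keyed by the owning task_struct and only that task ever deletes its own entry. lu_env_find(), added just below, pairs the rhashtable with a one-slot per-CPU cache so repeated lookups by the same thread skip the hash walk; in outline:

	/* fast path: one-entry per-CPU cache keyed by the current task */
	int cpu = get_cpu();

	if (lu_env_percpu[cpu].lep_task == current) {
		env = lu_env_percpu[cpu].lep_env;
		put_cpu();
		return env;
	}

	/* slow path: rhashtable keyed by the task_struct pointer */
	lei = rhashtable_lookup_fast(&lu_env_rhash, &task, lu_env_rhash_params);
	if (lei) {
		env = lei->lei_env;
		lu_env_percpu[cpu].lep_task = current;
		lu_env_percpu[cpu].lep_env = env;
	}
	put_cpu();
	return env;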
+ */ + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei && rhashtable_remove_fast(&lu_env_rhash, &lei->lei_linkage, + lu_env_rhash_params) == 0) + call_rcu(&lei->lei_rcu_head, lu_env_item_free); +} +EXPORT_SYMBOL(lu_env_remove); + +struct lu_env *lu_env_find(void) +{ + struct lu_env *env = NULL; + struct lu_env_item *lei; + const void *task = current; + int i = get_cpu(); + + if (lu_env_percpu[i].lep_task == current) { + env = lu_env_percpu[i].lep_env; + put_cpu(); + LASSERT(env); + return env; + } + + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei) { + env = lei->lei_env; + lu_env_percpu[i].lep_task = current; + lu_env_percpu[i].lep_env = env; + } + put_cpu(); + + return env; +} +EXPORT_SYMBOL(lu_env_find); +#define lu_env_rhash_init(rhash, params) rhashtable_init(rhash, params) +#define lu_env_rhash_destroy(rhash) rhashtable_destroy(rhash) +#else +#define lu_env_rhash_init(rhash, params) 0 +#define lu_env_rhash_destroy(rhash) do {} while (0) +#endif /* HAVE_SERVER_SUPPORT */ + static struct shrinker *lu_site_shrinker; typedef struct lu_site_stats{ @@ -1902,19 +2100,24 @@ typedef struct lu_site_stats{ unsigned lss_busy; } lu_site_stats_t; -static void lu_site_stats_get(struct cfs_hash *hs, +static void lu_site_stats_get(const struct lu_site *s, lu_site_stats_t *stats, int populated) { + struct cfs_hash *hs = s->ls_obj_hash; struct cfs_hash_bd bd; - unsigned int i; + unsigned int i; + /* + * percpu_counter_sum_positive() won't accept a const pointer + * as it does modify the struct by taking a spinlock + */ + struct lu_site *s2 = (struct lu_site *)s; + stats->lss_busy += cfs_hash_size_get(hs) - + percpu_counter_sum_positive(&s2->ls_lru_len_counter); cfs_hash_for_each_bucket(hs, &bd, i) { - struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); - struct hlist_head *hhead; + struct hlist_head *hhead; cfs_hash_bd_lock(hs, &bd, 1); - stats->lss_busy += - cfs_hash_bd_count_get(&bd) - bkt->lsb_lru_len; stats->lss_total += cfs_hash_bd_count_get(&bd); stats->lss_max_search = max((int)stats->lss_max_search, cfs_hash_bd_depmax_get(&bd)); @@ -2103,7 +2306,7 @@ void lu_context_keys_dump(void) */ int lu_global_init(void) { - int result; + int result; DEF_SHRINKER_VAR(shvar, lu_cache_shrink, lu_cache_shrink_count, lu_cache_shrink_scan); @@ -2138,6 +2341,8 @@ int lu_global_init(void) if (lu_site_shrinker == NULL) return -ENOMEM; + result = lu_env_rhash_init(&lu_env_rhash, &lu_env_rhash_params); + return result; } @@ -2161,6 +2366,8 @@ void lu_global_fini(void) lu_env_fini(&lu_shrink_env); up_write(&lu_sites_guard); + lu_env_rhash_destroy(&lu_env_rhash); + lu_ref_global_fini(); } @@ -2185,7 +2392,7 @@ int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) lu_site_stats_t stats; memset(&stats, 0, sizeof(stats)); - lu_site_stats_get(s->ls_obj_hash, &stats, 1); + lu_site_stats_get(s, &stats, 1); seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", stats.lss_busy, @@ -2283,11 +2490,19 @@ struct lu_object *lu_object_anon(const struct lu_env *env, struct lu_device *dev, const struct lu_object_conf *conf) { - struct lu_fid fid; + struct lu_fid fid; struct lu_object *o; + int rc; fid_zero(&fid); - o = lu_object_alloc(env, dev, &fid, conf); + o = lu_object_alloc(env, dev, &fid); + if (!IS_ERR(o)) { + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_free(env, o); + return ERR_PTR(rc); + } + } return o; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c 
b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c index bef29033f30ee..e0a75791f1e6e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -65,14 +65,14 @@ static struct kmem_cache *lu_ref_link_kmem; static struct lu_kmem_descr lu_ref_caches[] = { - { - .ckd_cache = &lu_ref_link_kmem, - .ckd_name = "lu_ref_link_kmem", - .ckd_size = sizeof (struct lu_ref_link) - }, - { - .ckd_cache = NULL - } + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof(struct lu_ref_link) + }, + { + .ckd_cache = NULL + } }; /** @@ -90,18 +90,18 @@ static struct lu_ref lu_ref_marker = { void lu_ref_print(const struct lu_ref *ref) { - struct lu_ref_link *link; + struct lu_ref_link *link; - CERROR("lu_ref: %p %d %d %s:%d\n", - ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); list_for_each_entry(link, &ref->lf_list, ll_linkage) { - CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); - } + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } } static int lu_ref_is_marker(const struct lu_ref *ref) { - return (ref == &lu_ref_marker); + return ref == &lu_ref_marker; } void lu_ref_print_all(void) @@ -146,19 +146,19 @@ void lu_ref_fini(struct lu_ref *ref) EXPORT_SYMBOL(lu_ref_fini); static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, - int flags, - const char *scope, - const void *source) + int flags, + const char *scope, + const void *source) { - struct lu_ref_link *link; - - link = NULL; - if (lu_ref_link_kmem != NULL) { - OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); - if (link != NULL) { - link->ll_ref = ref; - link->ll_scope = scope; - link->ll_source = source; + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem != NULL) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link != NULL) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; spin_lock(&ref->lf_guard); list_add_tail(&link->ll_linkage, &ref->lf_list); ref->lf_refs++; @@ -207,9 +207,10 @@ void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, EXPORT_SYMBOL(lu_ref_add_atomic); static inline int lu_ref_link_eq(const struct lu_ref_link *link, - const char *scope, const void *source) + const char *scope, + const void *source) { - return link->ll_source == source && !strcmp(link->ll_scope, scope); + return link->ll_source == source && !strcmp(link->ll_scope, scope); } /** @@ -223,22 +224,22 @@ static unsigned lu_ref_chain_max_length = 127; static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, const void *source) { - struct lu_ref_link *link; - unsigned iterations; + struct lu_ref_link *link; + unsigned int iterations; - iterations = 0; + iterations = 0; list_for_each_entry(link, &ref->lf_list, ll_linkage) { - ++iterations; - if (lu_ref_link_eq(link, scope, source)) { - if (iterations > lu_ref_chain_max_length) { - CWARN("Long lu_ref chain %d \"%s\":%p\n", - iterations, scope, source); - lu_ref_chain_max_length = iterations * 3 / 2; - } - return link; - } - } - return NULL; + ++iterations; + if (lu_ref_link_eq(link, scope, 
source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; } void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) @@ -302,10 +303,10 @@ static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) { - struct lu_ref *ref = p; - struct lu_ref *next; + struct lu_ref *ref = p; + struct lu_ref *next; - LASSERT(seq->private == p); + LASSERT(seq->private == p); LASSERT(!list_empty(&ref->lf_linkage)); spin_lock(&lu_ref_refs_guard); @@ -322,7 +323,7 @@ static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) static void lu_ref_seq_stop(struct seq_file *seq, void *p) { - /* Nothing to do */ + /* Nothing to do */ } @@ -340,19 +341,19 @@ static int lu_ref_seq_show(struct seq_file *seq, void *p) /* print the entry */ spin_lock(&next->lf_guard); - seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", - next, next->lf_refs, next->lf_failed, - next->lf_func, next->lf_line); - if (next->lf_refs > 64) { - seq_printf(seq, " too many references, skip\n"); - } else { - struct lu_ref_link *link; - int i = 0; + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_puts(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; list_for_each_entry(link, &next->lf_list, ll_linkage) - seq_printf(seq, " #%d link: %s %p\n", - i++, link->ll_scope, link->ll_source); - } + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } spin_unlock(&next->lf_guard); spin_unlock(&lu_ref_refs_guard); @@ -360,10 +361,10 @@ static int lu_ref_seq_show(struct seq_file *seq, void *p) } static struct seq_operations lu_ref_seq_ops = { - .start = lu_ref_seq_start, - .stop = lu_ref_seq_stop, - .next = lu_ref_seq_next, - .show = lu_ref_seq_show + .start = lu_ref_seq_start, + .stop = lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show }; static int lu_ref_seq_open(struct inode *inode, struct file *file) @@ -380,15 +381,16 @@ static int lu_ref_seq_open(struct inode *inode, struct file *file) list_add(&marker->lf_linkage, &lu_ref_refs); spin_unlock(&lu_ref_refs_guard); - if (result == 0) { - struct seq_file *f = file->private_data; - f->private = marker; - } else { - seq_release(inode, file); - } - } + if (result == 0) { + struct seq_file *f = file->private_data; + + f->private = marker; + } else { + seq_release(inode, file); + } + } - return result; + return result; } static int lu_ref_seq_release(struct inode *inode, struct file *file) @@ -403,11 +405,11 @@ static int lu_ref_seq_release(struct inode *inode, struct file *file) } static struct file_operations lu_ref_dump_fops = { - .owner = THIS_MODULE, - .open = lu_ref_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lu_ref_seq_release + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release }; #endif /* CONFIG_PROC_FS */ @@ -419,26 +421,26 @@ int lu_ref_global_init(void) CDEBUG(D_CONSOLE, "lu_ref tracking is enabled. 
Performance isn't.\n"); - result = lu_kmem_init(lu_ref_caches); + result = lu_kmem_init(lu_ref_caches); #ifdef CONFIG_PROC_FS - if (result == 0) { - result = lprocfs_seq_create(proc_lustre_root, "lu_refs", - 0444, &lu_ref_dump_fops, NULL); - if (result) - lu_kmem_fini(lu_ref_caches); - } + if (result == 0) { + result = lprocfs_seq_create(proc_lustre_root, "lu_refs", + 0444, &lu_ref_dump_fops, NULL); + if (result) + lu_kmem_fini(lu_ref_caches); + } #endif /* CONFIG_PROC_FS */ - return result; + return result; } void lu_ref_global_fini(void) { #ifdef CONFIG_PROC_FS - lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); + lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); #endif /* CONFIG_PROC_FS */ - lu_kmem_fini(lu_ref_caches); + lu_kmem_fini(lu_ref_caches); } #endif /* USE_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c index bd149ddf7a967..4161b2dabfd72 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,7 +46,7 @@ static __u64 handle_base; static DEFINE_SPINLOCK(handle_base_lock); static struct handle_bucket { - spinlock_t lock; + spinlock_t lock; struct list_head head; } *handle_hash; @@ -60,16 +60,17 @@ static struct handle_bucket { void class_handle_hash(struct portals_handle *h, struct portals_handle_ops *ops) { - struct handle_bucket *bucket; - ENTRY; + struct handle_bucket *bucket; + + ENTRY; - LASSERT(h != NULL); + LASSERT(h != NULL); LASSERT(list_empty(&h->h_link)); - /* - * This is fast, but simplistic cookie generation algorithm, it will - * need a re-do at some point in the future for security. - */ + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ spin_lock(&handle_base_lock); handle_base += HANDLE_INCR; @@ -104,12 +105,12 @@ static void class_handle_unhash_nolock(struct portals_handle *h) { if (list_empty(&h->h_link)) { CERROR("removing an already-removed handle (%#llx)\n", - h->h_cookie); - return; - } + h->h_cookie); + return; + } CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", - h, h->h_cookie); + h, h->h_cookie); spin_lock(&h->h_lock); if (h->h_in == 0) { @@ -150,21 +151,24 @@ EXPORT_SYMBOL(class_handle_hash_back); void *class_handle2object(__u64 cookie, const void *owner) { - struct handle_bucket *bucket; - struct portals_handle *h; - void *retval = NULL; - ENTRY; + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + ENTRY; - LASSERT(handle_hash != NULL); + LASSERT(handle_hash != NULL); - /* Be careful when you want to change this code. See the - * rcu_read_lock() definition on top this file. - jxiong */ - bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + /* + * Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong + */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); - rcu_read_lock(); - list_for_each_entry_rcu(h, &bucket->head, h_link) { + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { if (h->h_cookie != cookie || h->h_owner != owner) - continue; + continue; spin_lock(&h->h_lock); if (likely(h->h_in != 0)) { @@ -197,15 +201,15 @@ EXPORT_SYMBOL(class_handle_free_cb); int class_handle_init(void) { - struct handle_bucket *bucket; + struct handle_bucket *bucket; struct timespec64 ts; - int seed[2]; + int seed[2]; - LASSERT(handle_hash == NULL); + LASSERT(handle_hash == NULL); - OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); - if (handle_hash == NULL) - return -ENOMEM; + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; bucket--) { @@ -218,10 +222,10 @@ int class_handle_init(void) ktime_get_ts64(&ts); cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); - cfs_get_random_bytes(&handle_base, sizeof(handle_base)); - LASSERT(handle_base != 0ULL); + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); - return 0; + return 0; } static int cleanup_all_handles(void) @@ -248,14 +252,15 @@ static int cleanup_all_handles(void) void class_handle_cleanup(void) { - int count; - LASSERT(handle_hash != NULL); + int count; + + LASSERT(handle_hash != NULL); - count = cleanup_all_handles(); + count = cleanup_all_handles(); - OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); - handle_hash = NULL; + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; - if (count != 0) - CERROR("handle_count at cleanup: %d\n", count); + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c index 95716e1ccac88..535d78eac5578 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -80,51 +80,51 @@ EXPORT_SYMBOL(lustre_uuid_to_peer); LNET will choose the best one. 
*/ int class_add_uuid(const char *uuid, __u64 nid) { - struct uuid_nid_data *data, *entry; - int found = 0; + struct uuid_nid_data *data, *entry; + int found = 0; - LASSERT(nid != 0); /* valid newconfig NID is never zero */ + LASSERT(nid != 0); /* valid newconfig NID is never zero */ - if (strlen(uuid) > UUID_MAX - 1) - return -EOVERFLOW; + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; - OBD_ALLOC_PTR(data); - if (data == NULL) - return -ENOMEM; + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; - obd_str2uuid(&data->un_uuid, uuid); - data->un_nids[0] = nid; - data->un_nid_count = 1; + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; spin_lock(&g_uuid_lock); list_for_each_entry(entry, &g_uuid_list, un_list) { - if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { - int i; - - found = 1; - for (i = 0; i < entry->un_nid_count; i++) - if (nid == entry->un_nids[i]) - break; - - if (i == entry->un_nid_count) { - LASSERT(entry->un_nid_count < NIDS_MAX); - entry->un_nids[entry->un_nid_count++] = nid; - } - break; - } - } - if (!found) + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) list_add(&data->un_list, &g_uuid_list); spin_unlock(&g_uuid_lock); - if (found) { - CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, - libcfs_nid2str(nid), entry->un_nid_count); - OBD_FREE(data, sizeof(*data)); - } else { - CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); - } - return 0; + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; } /* Delete the nids for one uuid if specified, otherwise delete all */ @@ -173,29 +173,30 @@ int class_del_uuid(const char *uuid) /* check if @nid exists in nid list of @uuid */ int class_check_uuid(struct obd_uuid *uuid, __u64 nid) { - struct uuid_nid_data *entry; - int found = 0; - ENTRY; + struct uuid_nid_data *entry; + int found = 0; - CDEBUG(D_INFO, "check if uuid %s has %s.\n", - obd_uuid2str(uuid), libcfs_nid2str(nid)); + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); spin_lock(&g_uuid_lock); list_for_each_entry(entry, &g_uuid_list, un_list) { - int i; + int i; - if (!obd_uuid_equals(&entry->un_uuid, uuid)) + if (!obd_uuid_equals(&entry->un_uuid, uuid)) continue; - /* found the uuid, check if it has @nid */ - for (i = 0; i < entry->un_nid_count; i++) { - if (entry->un_nids[i] == nid) { - found = 1; - break; - } - } - break; - } + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } spin_unlock(&g_uuid_lock); RETURN(found); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c index 85003937e7466..d0ca4f17b1cb3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -21,14 +21,11 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * Use is subject to license terms. 
* * Author: Johann Lombardi */ - -#include - #include #include #include @@ -50,9 +47,9 @@ void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, /* If a field is added in struct lustre_mdt_attrs, zero it explicitly * and change the test below. */ - LASSERT(sizeof(*lma) == - (offsetof(struct lustre_mdt_attrs, lma_self_fid) + - sizeof(lma->lma_self_fid))); + CLASSERT(sizeof(*lma) == + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); } EXPORT_SYMBOL(lustre_lma_init); @@ -114,6 +111,22 @@ void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) } EXPORT_SYMBOL(lustre_loa_swab); +/** + * Swab, if needed, SOM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the SOM structure to be swabbed. + */ +void lustre_som_swab(struct lustre_som_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab16s(&attrs->lsa_valid); + __swab64s(&attrs->lsa_size); + __swab64s(&attrs->lsa_blocks); +#endif +} +EXPORT_SYMBOL(lustre_som_swab); + /** * Swab, if needed, HSM structure which is stored on-disk in little-endian * order. diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c new file mode 100644 index 0000000000000..16e6f12f8a05c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. 
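[Editor's note, not part of the patch] For context on the lustre_som_swab() helper added above: the SOM attributes are kept little-endian on disk, so the swab body is compiled in only on big-endian hosts. The standalone sketch below shows the same convention using the glibc <endian.h> helpers; struct som_disk and its field names are illustrative stand-ins, not the real lustre_som_attrs layout.

#include <endian.h>      /* htole16()/le64toh() etc. -- glibc helpers */
#include <stdint.h>
#include <stdio.h>

/* On-disk records are always little-endian, as with lustre_som_attrs. */
struct som_disk {
	uint16_t valid;
	uint16_t reserved;
	uint64_t size;
	uint64_t blocks;
};

/* Convert an on-disk record to host order; a no-op on little-endian
 * hosts, byte-swapping only on big-endian ones (the same idea as the
 * #ifdef __BIG_ENDIAN body of lustre_som_swab()). */
static void som_to_host(struct som_disk *s)
{
	s->valid  = le16toh(s->valid);
	s->size   = le64toh(s->size);
	s->blocks = le64toh(s->blocks);
}

int main(void)
{
	struct som_disk rec = {
		.valid  = htole16(0x1),
		.size   = htole64(4096),
		.blocks = htole64(8),
	};

	som_to_host(&rec);
	printf("valid %#x size %llu blocks %llu\n", rec.valid,
	       (unsigned long long)rec.size, (unsigned long long)rec.blocks);
	return 0;
}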
+ * + * Checksum functions + */ +#include +#include + +/* Server uses algos that perform at 50% or better of the Adler */ +enum cksum_types obd_cksum_types_supported_server(const char *obd_name) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "%s: checksum speed: crc %d, crc32c %d, adler %d, " + "t10ip512 %d, t10ip4k %d, t10crc512 %d, t10crc4k %d\n", + obd_name, + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K)); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512) >= base_speed) + ret |= OBD_CKSUM_T10IP512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K) >= base_speed) + ret |= OBD_CKSUM_T10IP4K; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512) >= base_speed) + ret |= OBD_CKSUM_T10CRC512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K) >= base_speed) + ret |= OBD_CKSUM_T10CRC4K; + + return ret; +} +EXPORT_SYMBOL(obd_cksum_types_supported_server); + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
*/ +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + + if (cksum_type & OBD_CKSUM_T10IP512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP512; + } + } + + if (cksum_type & OBD_CKSUM_T10IP4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP4K; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC512; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC4K; + } + } + + if (unlikely(cksum_type && !(cksum_type & OBD_CKSUM_ALL))) + CWARN("%s: unknown cksum type %x\n", obd_name, cksum_type); + + return flag; +} +EXPORT_SYMBOL(obd_cksum_type_pack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 924322ef86e8c..a5b5dcfe572fe 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
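[Editor's note, not part of the patch] As a rough illustration of the selection policy implemented by the new obd_cksum.c above (advertise algorithms that reach at least half the Adler throughput, then pack the single fastest advertised one), here is a self-contained userspace sketch. The speed figures and the cut-down enum are invented for the example; the kernel code reads real measurements from cfs_crypto_hash_speed() and obd_t10_cksum_speed().

#include <stdio.h>

enum cksum { CK_ADLER = 1, CK_CRC32 = 2, CK_CRC32C = 4 };

int main(void)
{
	/* made-up throughput numbers, MB/s */
	struct { enum cksum type; const char *name; int speed; } algos[] = {
		{ CK_ADLER,  "adler",  1200 },
		{ CK_CRC32,  "crc32",   500 },   /* < 50% of adler: not advertised */
		{ CK_CRC32C, "crc32c", 2600 },   /* hardware assisted */
	};
	int base = algos[0].speed / 2;           /* 50% of the Adler speed */
	unsigned int supported = CK_ADLER;       /* Adler is always offered */
	int best = algos[0].speed;
	const char *pick = algos[0].name;

	for (unsigned int i = 1; i < sizeof(algos) / sizeof(algos[0]); i++) {
		if (algos[i].speed >= base)
			supported |= algos[i].type;          /* advertise it */
		if ((supported & algos[i].type) && algos[i].speed > best) {
			best = algos[i].speed;               /* pack the fastest */
			pick = algos[i].name;
		}
	}
	printf("supported mask %#x, packed type: %s\n", supported, pick);
	return 0;
}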
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,14 +36,15 @@ #define DEBUG_SUBSYSTEM S_CLASS +#include #include #include #include #include -#include +#include #include -#include +#include #include #include "llog_internal.h" @@ -365,6 +366,7 @@ EXPORT_SYMBOL(lustre_cfg_string); */ int class_attach(struct lustre_cfg *lcfg) { + struct obd_export *exp; struct obd_device *obd = NULL; char *typename, *name, *uuid; int rc, len; @@ -381,90 +383,54 @@ int class_attach(struct lustre_cfg *lcfg) RETURN(-EINVAL); } name = lustre_cfg_string(lcfg, 0); - if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { CERROR("No UUID passed!\n"); RETURN(-EINVAL); } - uuid = lustre_cfg_string(lcfg, 2); - CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", - MKSTR(typename), MKSTR(name), MKSTR(uuid)); + uuid = lustre_cfg_string(lcfg, 2); + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("%s: uuid must be < %d bytes long\n", + name, (int)sizeof(obd->obd_uuid)); + RETURN(-EINVAL); + } - obd = class_newdev(typename, name); - if (IS_ERR(obd)) { - /* Already exists or out of obds */ - rc = PTR_ERR(obd); - obd = NULL; + obd = class_newdev(typename, name, uuid); + if (IS_ERR(obd)) { /* Already exists or out of obds */ + rc = PTR_ERR(obd); CERROR("Cannot create device %s of type %s : %d\n", name, typename, rc); - GOTO(out, rc); + RETURN(rc); } - LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", - name, typename); LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08X != %08X\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, "%p obd_name %s != %s\n", obd, obd->obd_name, name); - rwlock_init(&obd->obd_pool_lock); - obd->obd_pool_limit = 0; - obd->obd_pool_slv = 0; - - INIT_LIST_HEAD(&obd->obd_exports); - INIT_LIST_HEAD(&obd->obd_unlinked_exports); - INIT_LIST_HEAD(&obd->obd_delayed_exports); - INIT_LIST_HEAD(&obd->obd_exports_timed); - INIT_LIST_HEAD(&obd->obd_nid_stats); - spin_lock_init(&obd->obd_nid_lock); - spin_lock_init(&obd->obd_dev_lock); - mutex_init(&obd->obd_dev_mutex); - spin_lock_init(&obd->obd_osfs_lock); - /* obd->obd_osfs_age must be set to a value in the distant - * past to guarantee a fresh statfs is fetched on mount. 
*/ - obd->obd_osfs_age = cfs_time_shift_64(-1000); - - /* XXX belongs in setup not attach */ - init_rwsem(&obd->obd_observer_link_sem); - /* recovery data */ - spin_lock_init(&obd->obd_recovery_task_lock); - init_waitqueue_head(&obd->obd_next_transno_waitq); - init_waitqueue_head(&obd->obd_evict_inprogress_waitq); - INIT_LIST_HEAD(&obd->obd_req_replay_queue); - INIT_LIST_HEAD(&obd->obd_lock_replay_queue); - INIT_LIST_HEAD(&obd->obd_final_req_queue); - INIT_LIST_HEAD(&obd->obd_evict_list); - INIT_LIST_HEAD(&obd->obd_lwp_list); - - llog_group_init(&obd->obd_olg); - - obd->obd_conn_inprogress = 0; - - len = strlen(uuid); - if (len >= sizeof(obd->obd_uuid)) { - CERROR("uuid must be < %d bytes long\n", - (int)sizeof(obd->obd_uuid)); - GOTO(out, rc = -EINVAL); - } - memcpy(obd->obd_uuid.uuid, uuid, len); + exp = class_new_export_self(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + rc = PTR_ERR(exp); + class_free_dev(obd); + RETURN(rc); + } - /* Detach drops this */ - spin_lock(&obd->obd_dev_lock); - atomic_set(&obd->obd_refcount, 1); - spin_unlock(&obd->obd_dev_lock); - lu_ref_init(&obd->obd_reference); - lu_ref_add(&obd->obd_reference, "attach", obd); + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + rc = class_register_device(obd); + if (rc != 0) { + class_decref(obd, "newdev", obd); + RETURN(rc); + } - obd->obd_attached = 1; - CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); - RETURN(0); - out: - if (obd != NULL) { - class_release_dev(obd); - } - return rc; + + RETURN(0); } EXPORT_SYMBOL(class_attach); @@ -474,7 +440,6 @@ EXPORT_SYMBOL(class_attach); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { int err = 0; - struct obd_export *exp; ENTRY; LASSERT(obd != NULL); @@ -523,7 +488,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &uuid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_uuid_hash) - GOTO(err_hash, err = -ENOMEM); + GOTO(err_exit, err = -ENOMEM); /* create a nid-export lustre hash */ obd->obd_nid_hash = cfs_hash_create("NID_HASH", @@ -534,7 +499,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &nid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_nid_hash) - GOTO(err_hash, err = -ENOMEM); + GOTO(err_exit, err = -ENOMEM); /* create a nid-stats lustre hash */ obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", @@ -544,8 +509,8 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, &nid_stat_hash_ops, CFS_HASH_DEFAULT); - if (!obd->obd_nid_stats_hash) - GOTO(err_hash, err = -ENOMEM); + if (!obd->obd_nid_stats_hash) + GOTO(err_exit, err = -ENOMEM); /* create a client_generation-export lustre hash */ obd->obd_gen_hash = cfs_hash_create("UUID_HASH", @@ -556,21 +521,13 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &gen_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_gen_hash) - GOTO(err_hash, err = -ENOMEM); + GOTO(err_exit, err = -ENOMEM); - exp = class_new_export(obd, &obd->obd_uuid); - if (IS_ERR(exp)) - GOTO(err_hash, err = PTR_ERR(exp)); - - obd->obd_self_export = exp; - list_del_init(&exp->exp_obd_chain_timed); - class_export_put(exp); - - err = obd_setup(obd, lcfg); - if (err) - GOTO(err_exp, err); + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exit, err); - obd->obd_set_up = 1; + 
obd->obd_set_up = 1; spin_lock(&obd->obd_dev_lock); /* cleanup drops this */ @@ -581,12 +538,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_name, obd->obd_uuid.uuid); RETURN(0); -err_exp: - if (obd->obd_self_export) { - class_unlink_export(obd->obd_self_export); - obd->obd_self_export = NULL; - } -err_hash: +err_exit: if (obd->obd_uuid_hash) { cfs_hash_putref(obd->obd_uuid_hash); obd->obd_uuid_hash = NULL; @@ -630,10 +582,14 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_attached = 0; spin_unlock(&obd->obd_dev_lock); + /* cleanup in progress. we don't like to find this device after now */ + class_unregister_device(obd); + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", obd->obd_name, obd->obd_uuid.uuid); - class_decref(obd, "attach", obd); + class_decref(obd, "newdev", obd); + RETURN(0); } EXPORT_SYMBOL(class_detach); @@ -663,6 +619,9 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) } /* Leave this on forever */ obd->obd_stopping = 1; + /* function can't return error after that point, so clear setup flag + * as early as possible to avoid finding via obd_devs / hash */ + obd->obd_set_up = 0; spin_unlock(&obd->obd_dev_lock); /* wait for already-arrived-connections to finish. */ @@ -695,17 +654,11 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) LASSERT(obd->obd_self_export); - /* The three references that should be remaining are the - * obd_self_export and the attach and setup references. */ - if (atomic_read(&obd->obd_refcount) > 3) { - /* refcounf - 3 might be the number of real exports - (excluding self export). But class_incref is called - by other things as well, so don't count on it. */ - CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", - obd->obd_name, atomic_read(&obd->obd_refcount) - 3); - dump_exports(obd, 0, D_HA); - class_disconnect_exports(obd); - } + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", + obd->obd_name, obd->obd_num_exports, + atomic_read(&obd->obd_refcount) - 2); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); /* Precleanup, we must make sure all exports get destroyed. 
*/ err = obd_precleanup(obd); @@ -757,43 +710,27 @@ EXPORT_SYMBOL(class_incref); void class_decref(struct obd_device *obd, const char *scope, const void *source) { - int err; - int refs; + int last; - spin_lock(&obd->obd_dev_lock); - atomic_dec(&obd->obd_refcount); - refs = atomic_read(&obd->obd_refcount); - spin_unlock(&obd->obd_dev_lock); + CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + LASSERT(obd->obd_num_exports >= 0); + last = atomic_dec_and_test(&obd->obd_refcount); lu_ref_del(&obd->obd_reference, scope, source); - CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + if (last) { + struct obd_export *exp; - if ((refs == 1) && obd->obd_stopping) { + LASSERT(!obd->obd_attached); /* All exports have been destroyed; there should - be no more in-progress ops by this point.*/ + * be no more in-progress ops by this point.*/ + exp = obd->obd_self_export; - spin_lock(&obd->obd_self_export->exp_lock); - obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); - spin_unlock(&obd->obd_self_export->exp_lock); - - /* note that we'll recurse into class_decref again */ - class_unlink_export(obd->obd_self_export); - return; - } - - if (refs == 0) { - CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", - obd->obd_name, obd->obd_uuid.uuid); - LASSERT(!obd->obd_attached); - if (obd->obd_stopping) { - /* If we're not stopping, we were never set up */ - err = obd_cleanup(obd); - if (err) - CERROR("Cleanup %s returned %d\n", - obd->obd_name, err); + if (exp) { + exp->exp_flags |= exp_flags_from_obd(obd); + class_unlink_export(exp); } - - class_release_dev(obd); } } EXPORT_SYMBOL(class_decref); @@ -869,7 +806,7 @@ static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) static LIST_HEAD(lustre_profile_list); static DEFINE_SPINLOCK(lustre_profile_list_lock); -struct lustre_profile *class_get_profile(const char * prof) +struct lustre_profile *class_get_profile(const char *prof) { struct lustre_profile *lprof; @@ -1011,40 +948,12 @@ void class_del_profiles(void) } EXPORT_SYMBOL(class_del_profiles); -static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) -{ - ENTRY; - if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) - at_min = val; - else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) - at_max = val; - else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) - at_extra = val; - else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) - at_early_margin = val; - else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) - at_history = val; - else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) - strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), - JOBSTATS_JOBID_VAR_MAX_LEN + 1); - else - RETURN(-EINVAL); - - CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); - RETURN(0); -} - - -/* We can't call ll_process_config or lquota_process_config directly because - * it lives in a module that must be loaded after this one. */ -static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +/* We can't call lquota_process_config directly because + * it lives in a module that must be loaded after this one. 
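[Editor's note, not part of the patch] The class_decref() rework above drops the lock-protected read of obd_refcount in favour of atomic_dec_and_test(), so exactly one caller observes the transition to zero and runs the teardown. A minimal userspace analogue of that pattern using C11 atomics (all names here are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	char name[16];
};

static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);
}

/* Drop a reference; the caller that takes the count to zero is the only
 * one that sees the old value 1 and therefore the only one that frees,
 * which is the guarantee atomic_dec_and_test() gives class_decref(). */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1) {
		printf("last reference to %s dropped, freeing\n", o->name);
		free(o);
	}
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refcount, 1);
	snprintf(o->name, sizeof(o->name), "demo");

	obj_get(o);   /* second user */
	obj_put(o);   /* drops to 1, object stays */
	obj_put(o);   /* drops to 0, freed here */
	return 0;
}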
+ */ +#ifdef HAVE_SERVER_SUPPORT static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; - -void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) -{ - client_process_config = cpc; -} -EXPORT_SYMBOL(lustre_register_client_process_config); +#endif /* HAVE_SERVER_SUPPORT */ /** * Rename the proc parameter in \a cfg with a new name \a new_name. @@ -1121,10 +1030,12 @@ struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, } EXPORT_SYMBOL(lustre_cfg_rename); -static int process_param2_config(struct lustre_cfg *lcfg) +static ssize_t process_param2_config(struct lustre_cfg *lcfg) { char *param = lustre_cfg_string(lcfg, 1); char *upcall = lustre_cfg_string(lcfg, 2); + struct kobject *kobj = NULL; + const char *subsys = param; char *argv[] = { [0] = "/usr/sbin/lctl", [1] = "set_param", @@ -1133,8 +1044,44 @@ static int process_param2_config(struct lustre_cfg *lcfg) }; ktime_t start; ktime_t end; - int rc; + size_t len; + int rc; + ENTRY; + print_lustre_cfg(lcfg); + + len = strcspn(param, ".="); + if (!len) + return -EINVAL; + + /* If we find '=' then its the top level sysfs directory */ + if (param[len] == '=') + return class_set_global(param); + + subsys = kstrndup(param, len, GFP_KERNEL); + if (!subsys) + return -ENOMEM; + + kobj = kset_find_obj(lustre_kset, subsys); + kfree(subsys); + if (kobj) { + char *value = param; + char *envp[3]; + int i; + + param = strsep(&value, "="); + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s", param); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = NULL; + + rc = kobject_uevent_env(kobj, KOBJ_CHANGE, envp); + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + + kobject_put(kobj); + + RETURN(rc); + } /* Add upcall processing here. Now only lctl is supported */ if (strcmp(upcall, LCTL_UPCALL) != 0) { @@ -1160,11 +1107,13 @@ static int process_param2_config(struct lustre_cfg *lcfg) RETURN(rc); } +#ifdef HAVE_SERVER_SUPPORT void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) { quota_process_config = qpc; } EXPORT_SYMBOL(lustre_register_quota_process_config); +#endif /* HAVE_SERVER_SUPPORT */ /** Process configuration commands given in lustre_cfg form. * These may come from direct calls (e.g. class_manual_cleanup) @@ -1251,29 +1200,51 @@ int class_process_config(struct lustre_cfg *lcfg) } case LCFG_PARAM: { char *tmp; + /* llite has no obd */ - if ((class_match_param(lustre_cfg_string(lcfg, 1), - PARAM_LLITE, NULL) == 0) && - client_process_config) { - err = (*client_process_config)(lcfg); - GOTO(out, err); + if (class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) { + struct lustre_sb_info *lsi; + unsigned long addr; + ssize_t count; + + /* The instance name contains the sb: + * lustre-client-aacfe000 + */ + tmp = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!tmp || !*(++tmp)) + GOTO(out, err = -EINVAL); + + if (sscanf(tmp, "%lx", &addr) != 1) + GOTO(out, err = -EINVAL); + + lsi = s2lsi((struct super_block *)addr); + /* This better be a real Lustre superblock! */ + LASSERT(lsi->lsi_lmd->lmd_magic == LMD_MAGIC); + + count = class_modify_config(lcfg, PARAM_LLITE, + lsi->lsi_kobj); + err = count < 0 ? count : 0; + GOTO(out, err); } else if ((class_match_param(lustre_cfg_string(lcfg, 1), PARAM_SYS, &tmp) == 0)) { /* Global param settings */ - err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + err = class_set_global(tmp); /* * Client or server should not fail to mount if * it hits an unknown configuration parameter. 
*/ - if (err != 0) + if (err < 0) CWARN("Ignoring unknown param %s\n", tmp); GOTO(out, err = 0); +#ifdef HAVE_SERVER_SUPPORT } else if ((class_match_param(lustre_cfg_string(lcfg, 1), PARAM_QUOTA, &tmp) == 0) && quota_process_config) { err = (*quota_process_config)(lcfg); GOTO(out, err); +#endif /* HAVE_SERVER_SUPPORT */ } break; @@ -1294,7 +1265,6 @@ int class_process_config(struct lustre_cfg *lcfg) GOTO(out, err = -EINVAL); } - switch(lcfg->lcfg_command) { case LCFG_SETUP: { err = class_setup(obd, lcfg); @@ -1334,12 +1304,47 @@ int class_process_config(struct lustre_cfg *lcfg) err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); GOTO(out, err = 0); } - default: { - err = obd_process_config(obd, sizeof(*lcfg), lcfg); - GOTO(out, err); + /* Process config log ADD_MDC record twice to add MDC also to LOV + * for Data-on-MDT: + * + * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1 + * 4:lustre-MDT0000-mdc_UUID + */ + case LCFG_ADD_MDC: { + struct obd_device *lov_obd; + char *clilmv; + + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + if (err) + GOTO(out, err); + + /* make sure this is client LMV log entry */ + clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv"); + if (!clilmv) + GOTO(out, err); + + /* replace 'lmv' with 'lov' name to address LOV device and + * process llog record to add MDC there. */ + clilmv[4] = 'o'; + lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (lov_obd == NULL) { + err = -ENOENT; + CERROR("%s: Cannot find LOV by %s name, rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 0), err); + } else { + err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg); + } + /* restore 'lmv' name */ + clilmv[4] = 'm'; + GOTO(out, err); + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); } } + EXIT; out: if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { CWARN("Ignoring error %d on optional command %#x\n", err, @@ -1350,97 +1355,89 @@ int class_process_config(struct lustre_cfg *lcfg) } EXPORT_SYMBOL(class_process_config); -int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, - struct lustre_cfg *lcfg, void *data) +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj) { - struct lprocfs_vars *var; - struct file fakefile = {}; - struct seq_file fake_seqfile; - char *key, *sval; - int i, keylen, vallen; - int matched = 0, j = 0; - int rc = 0; - int skip = 0; - ENTRY; + struct kobj_type *typ; + ssize_t count = 0; + int i; if (lcfg->lcfg_command != LCFG_PARAM) { CERROR("Unknown command: %d\n", lcfg->lcfg_command); - RETURN(-EINVAL); + return -EINVAL; } - /* fake a seq file so that var->fops->proc_write can work... */ - lprocfs_file_set_kernel(&fakefile); - fakefile.private_data = &fake_seqfile; - fake_seqfile.private = data; - /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt - or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar - or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + typ = get_ktype(kobj); + if (!typ || !typ->default_attrs) + return -ENODEV; + + print_lustre_cfg(lcfg); + + /* + * e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 + */ for (i = 1; i < lcfg->lcfg_bufcount; i++) { + struct attribute *attr; + size_t keylen; + char *value; + char *key; + int j; + key = lustre_cfg_buf(lcfg, i); /* Strip off prefix */ if (class_match_param(key, prefix, &key)) /* If the prefix doesn't match, return error so we - * can pass it down the stack */ - RETURN(-ENOSYS); - sval = strchr(key, '='); - if (!sval || *(sval + 1) == 0) { + * can pass it down the stack + */ + return -EINVAL; + + value = strchr(key, '='); + if (!value || *(value + 1) == 0) { CERROR("%s: can't parse param '%s' (missing '=')\n", lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, i)); - /* rc = -EINVAL; continue parsing other params */ + /* continue parsing other params */ continue; } - keylen = sval - key; - sval++; - vallen = strlen(sval); - matched = 0; - j = 0; - /* Search proc entries */ - while (lvars[j].name) { - var = &lvars[j]; - if (class_match_param(key, var->name, NULL) == 0 && - keylen == strlen(var->name)) { - matched++; - rc = -EROFS; - - if (var->fops && var->fops->proc_write) { - rc = (var->fops->proc_write)(&fakefile, - sval, - vallen, - NULL); - } + keylen = value - key; + value++; + + attr = NULL; + for (j = 0; typ->default_attrs[j]; j++) { + if (!strncmp(typ->default_attrs[j]->name, key, + keylen)) { + attr = typ->default_attrs[j]; break; } - j++; } - if (!matched) { - /* It was upgraded from old MDT/OST device, - * ignore the obsolete "sec_level" parameter. */ - if (strncmp("sec_level", key, keylen) == 0) - continue; - CERROR("%s: unknown config parameter '%s'\n", - lustre_cfg_string(lcfg, 0), - lustre_cfg_string(lcfg, i)); - /* rc = -EINVAL; continue parsing other params */ - skip++; - } else if (rc < 0) { - CERROR("%s: error writing parameter '%s': rc = %d\n", - lustre_cfg_string(lcfg, 0), key, rc); - rc = 0; + if (!attr) { + char *envp[3]; + + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s.%s.%.*s", + kobject_name(kobj->parent), + kobject_name(kobj), + (int) keylen, key); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = NULL; + + if (kobject_uevent_env(kobj, KOBJ_CHANGE, envp)) { + CERROR("%s: failed to send uevent %s\n", + kobject_name(kobj), key); + } + + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); } else { - CDEBUG(D_CONFIG, "%s: set parameter '%s'\n", - lustre_cfg_string(lcfg, 0), key); + count += lustre_attr_store(kobj, attr, value, + strlen(value)); } } - - if (rc > 0) - rc = 0; - if (!rc && skip) - rc = skip; - RETURN(rc); + return count; } -EXPORT_SYMBOL(class_process_proc_param); +EXPORT_SYMBOL(class_modify_config); /* * Supplemental functions for config logs, it allocates lustre_cfg @@ -1542,12 +1539,11 @@ int class_config_llog_handler(const struct lu_env *env, } } /* A config command without a start marker before it is - illegal (post 146) */ - if (!(cfg->cfg_flags & CFG_F_COMPAT146) && - !(cfg->cfg_flags & CFG_F_MARKER) && + * illegal + */ + if (!(cfg->cfg_flags & CFG_F_MARKER) && (lcfg->lcfg_command != LCFG_MARKER)) { - CWARN("Config not inside markers, ignoring! 
" - "(inst: %p, uuid: %s, flags: %#x)\n", + CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", cfg->cfg_instance, cfg->cfg_uuid.uuid, cfg->cfg_flags); cfg->cfg_flags |= CFG_F_SKIP; @@ -1623,12 +1619,11 @@ int class_config_llog_handler(const struct lu_env *env, if (cfg->cfg_instance && lcfg->lcfg_command != LCFG_SPTLRPC_CONF && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { - inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + - sizeof(cfg->cfg_instance) * 2 + 4; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + 16 + 4; OBD_ALLOC(inst_name, inst_len); if (inst_name == NULL) GOTO(out, rc = -ENOMEM); - snprintf(inst_name, inst_len, "%s-%p", + snprintf(inst_name, inst_len, "%s-%016lx", lustre_cfg_string(lcfg, 0), cfg->cfg_instance); lustre_cfg_bufs_set_string(&bufs, 0, inst_name); @@ -1636,23 +1631,22 @@ int class_config_llog_handler(const struct lu_env *env, lcfg->lcfg_command, inst_name); } - /* we override the llog's uuid for clients, to insure they - are unique */ - if (cfg->cfg_instance != NULL && - lcfg->lcfg_command == LCFG_ATTACH) { + /* override llog UUID for clients, to insure they are unique */ + if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_ATTACH) lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid); - } - /* - * sptlrpc config record, we expect 2 data segments: - * [0]: fs_name/target_name, - * [1]: rule string - * moving them to index [1] and [2], and insert MGC's - * obdname at index [0]. - */ + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { - struct obd_device *obd = cfg->cfg_instance; + /* After ASLR changes cfg_instance this needs fixing */ + /* "obd" is set in config_log_find_or_add() */ + struct obd_device *obd = (void *)cfg->cfg_instance; lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], bufs.lcfg_buflen[1]); @@ -1796,55 +1790,6 @@ int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, } EXPORT_SYMBOL(class_config_parse_llog); -static struct lcfg_type_data { - __u32 ltd_type; - char *ltd_name; - char *ltd_bufs[4]; -} lcfg_data_table[] = { - { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, - { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, - { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, - { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, - { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, - { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, - { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, - { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } , }, - { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, - { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, - { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, - { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, - { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, - { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, - { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, - { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, - { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, - { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, - { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, - { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, - { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, - { LCFG_SPTLRPC_CONF, "security", 
{ "parameter", "2", "3", "4" } }, - { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, - { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, - { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, - { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, - { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", - { "parameter", "2", "3", "4" } }, - { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, - { 0, NULL, { NULL, NULL, NULL, NULL } } -}; - -static struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) -{ - int i = 0; - - while (lcfg_data_table[i].ltd_type != 0) { - if (lcfg_data_table[i].ltd_type == cmd) - return &lcfg_data_table[i]; - i++; - } - return NULL; -} - /** * Parse config record and output dump in supplied buffer. * diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c index e3390507d900e..3c7a51ffd38a1 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,11 +43,10 @@ #include #include -#include #include #include #include -#include +#include static int (*client_fill_super)(struct super_block *sb, struct vfsmount *mnt); @@ -220,7 +219,7 @@ int lustre_start_mgc(struct super_block *sb) struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; - struct obd_uuid *uuid; + struct obd_uuid *uuid = NULL; class_uuid_t uuidc; lnet_nid_t nid; char nidstr[LNET_NIDSTR_SIZE]; @@ -243,7 +242,7 @@ int lustre_start_mgc(struct super_block *sb) struct lnet_process_id id; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + if (id.nid == LNET_NID_LO_0) continue; nid = id.nid; i++; @@ -409,7 +408,6 @@ int lustre_start_mgc(struct super_block *sb) rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, niduuid, NULL, NULL); - OBD_FREE_PTR(uuid); if (rc) GOTO(out_free, rc); @@ -470,7 +468,7 @@ int lustre_start_mgc(struct super_block *sb) lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); @@ -485,6 +483,8 @@ int lustre_start_mgc(struct super_block *sb) out_free: mutex_unlock(&mgc_start_lock); + if (uuid) + OBD_FREE_PTR(uuid); if (data) OBD_FREE_PTR(data); if (mgcname) @@ -591,7 +591,7 @@ static struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) /* Default umount style */ lsi->lsi_flags = LSI_UMOUNT_FAILOVER; INIT_LIST_HEAD(&lsi->lsi_lwp_list); - spin_lock_init(&lsi->lsi_lwp_lock); + mutex_init(&lsi->lsi_lwp_mutex); RETURN(lsi); } @@ -1156,37 +1156,52 @@ static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) * make \a *endh point to the string starting with the delimiter. The commas * in expression list [...] will be skipped. 
* - * \param[in] buf a delimiter-separated string - * \param[in] endh a pointer to a pointer that will point to the string - * starting with the delimiter + * @buf a delimiter-separated string + * @endh a pointer to a pointer that will point to the string + * starting with the delimiter * - * \retval 0 if delimiter is found - * \retval 1 if delimiter is not found + * RETURNS true if delimiter is found, false if delimiter is not found */ -static int lmd_find_delimiter(char *buf, char **endh) +static bool lmd_find_delimiter(char *buf, char **endh) { char *c = buf; - int skip = 0; - - if (buf == NULL) - return 1; + size_t pos; + bool found; + + if (!buf) + return false; +try_again: + if (*c == ',' || *c == ':') + return true; + + pos = strcspn(c, "[:,]"); + if (!pos) + return false; + + /* Not a valid mount string */ + if (*c == ']') { + CWARN("invalid mount string format\n"); + return false; + } - while (*c != '\0') { - if (*c == '[') - skip++; - else if (*c == ']') - skip--; + c += pos; + if (*c == '[') { + c = strchr(c, ']'); - if ((*c == ',' || *c == ':') && skip == 0) { - if (endh != NULL) - *endh = c; - return 0; + /* invalid mount string */ + if (!c) { + CWARN("invalid mount string format\n"); + return false; } - c++; + goto try_again; } - return 1; + found = *c != '\0'; + if (found && endh) + *endh = c; + + return found; } /** @@ -1215,7 +1230,7 @@ static int lmd_parse_nidlist(char *buf, char **endh) if (*buf == ' ' || *buf == '/' || *buf == '\0') return 1; - if (lmd_find_delimiter(buf, &endp) != 0) + if (!lmd_find_delimiter(buf, &endp)) endp = buf + strlen(buf); tmp = *endp; @@ -1360,9 +1375,8 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) } else if (strncmp(s1, "param=", 6) == 0) { size_t length, params_length; char *tail = s1; - if (lmd_find_delimiter(s1 + 6, &tail) != 0) - length = strlen(s1); - else { + + if (lmd_find_delimiter(s1 + 6, &tail)) { char *param_str = tail + 1; int supplementary = 1; while (lmd_parse_nidlist(param_str, @@ -1370,6 +1384,8 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) supplementary = 0; } length = param_str - s1 - supplementary; + } else { + length = strlen(s1); } length -= 6; params_length = strlen(lmd->lmd_params); @@ -1398,6 +1414,15 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) rc = lmd_parse_network(lmd, s1 + 8); if (rc) goto invalid; + + /* check if LNet dynamic peer discovery is activated */ + if (LNetGetPeerDiscoveryStatus()) { + CERROR("LNet Dynamic Peer Discovery is enabled " + "on this node. 
'network' mount option " + "cannot be taken into account.\n"); + goto invalid; + } + clear++; } @@ -1476,6 +1501,8 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) s1 = options + strlen(options) - 1; while (s1 >= options && (*s1 == ',' || *s1 == ' ')) *s1-- = 0; + while (*options && (*options == ',' || *options == ' ')) + options++; if (*options != 0) { /* Freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); @@ -1648,7 +1675,12 @@ static struct file_system_type lustre_fs_type = { .get_sb = lustre_get_sb, #endif .kill_sb = lustre_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, + .fs_flags = FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE | +#ifdef HAVE_SERVER_SUPPORT + FS_REQUIRES_DEV, +#else + 0, +#endif }; MODULE_ALIAS_FS("lustre"); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c index b1f59d8f6b303..b23a4ccf0bd9d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,14 +52,11 @@ #include #endif -#include -#include - #include #include -#include +#include #include -#include +#include #include #include @@ -511,7 +508,7 @@ struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) } snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); - spin_lock(&lsi->lsi_lwp_lock); + mutex_lock(&lsi->lsi_lwp_mutex); list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { char *ptr = strstr(lwp->obd_name, lwp_name); @@ -520,7 +517,7 @@ struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) break; } } - spin_unlock(&lsi->lsi_lwp_lock); + mutex_unlock(&lsi->lsi_lwp_mutex); err_lmi: server_put_mount(dev, false); @@ -681,9 +678,9 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); if (rc == 0) { obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; - spin_lock(&lsi->lsi_lwp_lock); + mutex_lock(&lsi->lsi_lwp_mutex); list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list); - spin_unlock(&lsi->lsi_lwp_lock); + mutex_unlock(&lsi->lsi_lwp_mutex); } else { CERROR("%s: connect failed: rc = %d\n", lwpname, rc); } @@ -939,7 +936,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) GOTO(out, rc = -ENOMEM); /* end log first */ - cfg->cfg_instance = sb; + cfg->cfg_instance = ll_get_cfg_instance(sb); rc = lustre_end_log(sb, logname, cfg); if (rc != 0 && rc != -ENOENT) GOTO(out, rc); @@ -951,6 +948,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) if (bufs == NULL) GOTO(out, rc = -ENOMEM); + mutex_lock(&lsi->lsi_lwp_mutex); list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { struct lustre_cfg *lcfg; @@ -963,8 +961,10 @@ static int lustre_disconnect_lwp(struct super_block *sb) lustre_cfg_bufs_set_string(bufs, 1, NULL); OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); - if (!lcfg) - GOTO(out, rc = -ENOMEM); + if (!lcfg) { + rc = -ENOMEM; + break; + } lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs); /* Disconnect import first. 
NULL is passed for the '@env', @@ -979,6 +979,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) rc1 = rc; } } + mutex_unlock(&lsi->lsi_lwp_mutex); GOTO(out, rc); @@ -1004,18 +1005,23 @@ static int lustre_stop_lwp(struct super_block *sb) int rc1 = 0; ENTRY; + mutex_lock(&lsi->lsi_lwp_mutex); while (!list_empty(&lsi->lsi_lwp_list)) { lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device, obd_lwp_list); list_del_init(&lwp->obd_lwp_list); lwp->obd_force = 1; + mutex_unlock(&lsi->lsi_lwp_mutex); + rc = class_manual_cleanup(lwp); if (rc != 0) { CERROR("%s: fail to stop LWP: rc = %d\n", lwp->obd_name, rc); rc1 = rc; } + mutex_lock(&lsi->lsi_lwp_mutex); } + mutex_unlock(&lsi->lsi_lwp_mutex); RETURN(rc1 != 0 ? rc1 : rc); } @@ -1051,7 +1057,7 @@ static int lustre_start_lwp(struct super_block *sb) GOTO(out, rc = -ENOMEM); cfg->cfg_callback = client_lwp_config_process; - cfg->cfg_instance = sb; + cfg->cfg_instance = ll_get_cfg_instance(sb); rc = lustre_process_log(sb, logname, cfg); /* need to remove config llog from mgc */ lsi->lsi_lwp_started = 1; @@ -1133,7 +1139,7 @@ static int server_lsi2mti(struct lustre_sb_info *lsi, mti->mti_nid_count = 0; while (LNetGetId(i++, &id) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + if (id.nid == LNET_NID_LO_0) continue; /* server use --servicenode param, only allow specified @@ -1682,6 +1688,63 @@ static int server_statfs(struct dentry *dentry, struct kstatfs *buf) RETURN(0); } +#ifdef HAVE_SUPEROPS_USE_DENTRY +int server_show_options(struct seq_file *seq, struct dentry *dentry) +#else +int server_show_options(struct seq_file *seq, struct vfsmount *vfs) +#endif +{ + struct lustre_sb_info *lsi; + struct lustre_mount_data *lmd; + +#ifdef HAVE_SUPEROPS_USE_DENTRY + LASSERT(seq != NULL && dentry != NULL); + lsi = s2lsi(dentry->d_sb); +#else + LASSERT(seq != NULL && vfs != NULL); + lsi = s2lsi(vfs->mnt_sb); +#endif + + lmd = lsi->lsi_lmd; + seq_printf(seq, ",svname=%s", lmd->lmd_profile); + + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + seq_puts(seq, ",abort_recov"); + + if (lmd->lmd_flags & LMD_FLG_NOIR) + seq_puts(seq, ",noir"); + + if (lmd->lmd_flags & LMD_FLG_NOSVC) + seq_puts(seq, ",nosvc"); + + if (lmd->lmd_flags & LMD_FLG_NOMGS) + seq_puts(seq, ",nomgs"); + + if (lmd->lmd_flags & LMD_FLG_NOSCRUB) + seq_puts(seq, ",noscrub"); + if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + seq_puts(seq, ",skip_lfsck"); + + if (lmd->lmd_flags & LMD_FLG_DEV_RDONLY) + seq_puts(seq, ",rdonly_dev"); + + if (lmd->lmd_flags & LMD_FLG_MGS) + seq_puts(seq, ",mgs"); + + if (lmd->lmd_mgs != NULL) + seq_printf(seq, ",mgsnode=%s", lmd->lmd_mgs); + + if (lmd->lmd_osd_type != NULL) + seq_printf(seq, ",osd=%s", lmd->lmd_osd_type); + + if (lmd->lmd_opts != NULL) { + seq_putc(seq, ','); + seq_puts(seq, lmd->lmd_opts); + } + + RETURN(0); +} + /** The operations we support directly on the superblock: * mount, umount, and df. 
*/ @@ -1689,6 +1752,7 @@ static struct super_operations server_ops = { .put_super = server_put_super, .umount_begin = server_umount_begin, /* umount -f */ .statfs = server_statfs, + .show_options = server_show_options, }; /* @@ -1716,6 +1780,43 @@ static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, return -EOPNOTSUPP; } +static bool is_cmd_supported(unsigned int command) +{ + switch (command) { + case FITRIM: + return true; + default: + return false; + } + + return false; +} + +static long server_ioctl(struct file *filp, unsigned int command, + unsigned long arg) +{ + struct file active_filp; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct super_block *dd_sb = dt_mnt_sb_get(lsi->lsi_dt_dev); + struct inode *active_inode; + int err = -EOPNOTSUPP; + + if (IS_ERR(dd_sb) || !is_cmd_supported(command)) + return err; + + active_inode = igrab(dd_sb->s_root->d_inode); + if (!active_inode) + return -EACCES; + + active_filp.f_inode = active_inode; + if (active_inode->i_fop && active_inode->i_fop->unlocked_ioctl) + err = active_inode->i_fop->unlocked_ioctl(&active_filp, + command, arg); + iput(active_inode); + return err; +} + static const struct inode_operations server_inode_operations = { #ifdef HAVE_IOP_XATTR .setxattr = lustre_setxattr, @@ -1724,6 +1825,10 @@ static const struct inode_operations server_inode_operations = { .listxattr = lustre_listxattr, }; +static const struct file_operations server_file_operations = { + .unlocked_ioctl = server_ioctl, +}; + #define log2(n) ffz(~(n)) #define LUSTRE_SUPER_MAGIC 0x0BD00BD1 @@ -1752,6 +1857,7 @@ static int server_fill_super_common(struct super_block *sb) /* apparently we need to be a directory for the mount to finish */ root->i_mode = S_IFDIR; root->i_op = &server_inode_operations; + root->i_fop = &server_file_operations; sb->s_root = d_make_root(root); if (!sb->s_root) { CERROR("%s: can't make root dentry\n", sb->s_id); @@ -1764,10 +1870,10 @@ static int server_fill_super_common(struct super_block *sb) static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) { struct lustre_mount_data *lmd = lsi->lsi_lmd; - struct obd_device *obd; - struct dt_device_param p; - char flagstr[16]; - int rc; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[20 + 1 + 10 + 1]; + int rc; ENTRY; CDEBUG(D_MOUNT, @@ -1777,7 +1883,7 @@ static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); strcat(lsi->lsi_osd_uuid, "_UUID"); - sprintf(flagstr, "%lu:%lu", mflags, (unsigned long) lmd->lmd_flags); + snprintf(flagstr, sizeof(flagstr), "%lu:%u", mflags, lmd->lmd_flags); obd = class_name2obd(lsi->lsi_osd_obdname); if (obd == NULL) { @@ -1840,8 +1946,10 @@ int server_fill_super(struct super_block *sb) OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); rc = lsi_prepare(lsi); - if (rc) + if (rc) { + lustre_put_lsi(sb); RETURN(rc); + } /* Start low level OSD */ rc = osd_start(lsi, sb->s_flags); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c new file mode 100644 index 0000000000000..53b0b3130b717 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c @@ -0,0 +1,535 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
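[Editor's note, not part of the patch] The server_ioctl()/is_cmd_supported() hunk above forwards FITRIM issued against a Lustre server mount point down to the root inode of the backing OSD filesystem. For reference, this is roughly how that command is driven from userspace (a minimal fstrim-like sketch; only FITRIM is attempted and error handling is kept short):

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>   /* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd, rc;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mount point>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;         /* trim the whole device */

	rc = ioctl(fd, FITRIM, &range); /* kernel updates range.len */
	if (rc)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);

	close(fd);
	return rc ? 1 : 0;
}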
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_sysfs.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + { __ATTR(name, 0644, static_uintvalue_show, \ + static_uintvalue_store), value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); +LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); + +#ifdef HAVE_SERVER_SUPPORT +LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + 
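The LUSTRE_STATIC_UINT_ATTR() machinery above lets a single show/store pair serve many tunables because each attribute embeds a pointer to the integer it exposes. A minimal userspace sketch of that wrapper pattern with invented types (struct attribute here is not the kernel's, and the cast relies on the attribute being the first member of the wrapper):

#include <stdio.h>
#include <stdlib.h>

struct attribute { const char *name; };

struct uint_attr {
        struct attribute attr;          /* must stay first for the cast below */
        unsigned int *value;
};

static int uint_show(struct attribute *attr, char *buf, size_t len)
{
        struct uint_attr *ua = (struct uint_attr *)attr;

        return snprintf(buf, len, "%u\n", *ua->value);
}

static void uint_store(struct attribute *attr, const char *buf)
{
        struct uint_attr *ua = (struct uint_attr *)attr;

        *ua->value = (unsigned int)strtoul(buf, NULL, 10);
}

static unsigned int obd_timeout = 100;
static unsigned int at_max = 600;

#define UINT_ATTR(n, v) { .attr = { .name = #n }, .value = (v) }

static struct uint_attr attrs[] = {
        UINT_ATTR(timeout, &obd_timeout),
        UINT_ATTR(at_max, &at_max),
};

int main(void)
{
        char buf[32];

        uint_store(&attrs[0].attr, "250");
        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                uint_show(&attrs[i].attr, buf, sizeof(buf));
                printf("%s = %s", attrs[i].attr.name, buf);
        }
        return 0;
}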
char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((cfs_totalram_pages() / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef ENABLE_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) { + len = sprintf(buf, "LBUG\n"); + healthy = false; + } + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + len = sprintf(buf, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_name)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_name); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0 && + !strchr(buffer, '%')) { + lustre_jobid_clear(buffer); + return count; + } + + /* clear previous value */ + 
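max_dirty_mb_store() above converts megabytes to pages with 1 << (20 - PAGE_SHIFT) and rejects values below 4 MiB or above 90% of total RAM. A standalone sketch of that arithmetic, assuming a 4 KiB page and an invented 4 GiB total-RAM figure:

#include <stdio.h>

#define PAGE_SHIFT      12                              /* assumed 4 KiB pages */
#define MIB_TO_PAGES    (1UL << (20 - PAGE_SHIFT))      /* 256 pages per MiB */

static unsigned long totalram_pages = 4UL << (30 - PAGE_SHIFT);   /* 4 GiB */

static int set_max_dirty_mb(unsigned long mb, unsigned long *max_dirty_pages)
{
        unsigned long pages = mb * MIB_TO_PAGES;

        if (pages > totalram_pages / 10 * 9)    /* more than 90% of RAM */
                return -1;
        if (pages < 4 * MIB_TO_PAGES)           /* less than 4 MiB */
                return -1;

        *max_dirty_pages = pages;
        return 0;
}

int main(void)
{
        unsigned long max_dirty_pages = 0;

        printf("64 MiB   -> %s\n", set_max_dirty_mb(64, &max_dirty_pages) ?
               "rejected" : "accepted");
        printf("2 MiB    -> %s\n", set_max_dirty_mb(2, &max_dirty_pages) ?
               "rejected" : "accepted");
        printf("8192 MiB -> %s\n", set_max_dirty_mb(8192, &max_dirty_pages) ?
               "rejected" : "accepted");
        printf("max_dirty_pages = %lu\n", max_dirty_pages);
        return 0;
}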
memset(obd_jobid_name, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_name, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_name[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_name[count - 1] = 0; + } + + return count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + &lustre_sattr_timeout.u.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, +#endif + &lustre_sattr_lbug_on_eviction.u.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct kset *lustre_kset; +EXPORT_SYMBOL_GPL(lustre_kset); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +ssize_t class_set_global(const char *param) +{ + const char *value = strchr(param, '=') + 1; + size_t off = value - param - 1; + ssize_t count = -ENOENT; + int i; + + for (i = 
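obd_device_list_seq_show() above walks a sparse device table, skips empty slots and prints a two-letter status derived from the device state flags. A userspace sketch of just that selection logic, with made-up device names and a plain array standing in for class_num2obd():

#include <stdio.h>
#include <stdbool.h>

struct obd_dev {
        const char *name;
        bool stopping, inactive, set_up, attached;
};

static struct obd_dev *devs[] = {
        &(struct obd_dev){ .name = "lustre-MDT0000", .set_up = true },
        NULL,                                   /* unused slot */
        &(struct obd_dev){ .name = "lustre-OST0001", .attached = true },
        &(struct obd_dev){ .name = "lustre-OST0002", .stopping = true },
};

static const char *status_of(const struct obd_dev *d)
{
        if (d->stopping)
                return "ST";
        if (d->inactive)
                return "IN";
        if (d->set_up)
                return "UP";
        if (d->attached)
                return "AT";
        return "--";
}

int main(void)
{
        for (int i = 0; i < (int)(sizeof(devs) / sizeof(devs[0])); i++) {
                if (!devs[i])
                        continue;       /* like class_num2obd() returning NULL */
                printf("%3d %s %s\n", i, status_of(devs[i]), devs[i]->name);
        }
        return 0;
}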
0; lustre_attrs[i]; i++) { + if (!strncmp(lustre_attrs[i]->name, param, off)) { + count = lustre_attr_store(&lustre_kset->kobj, + lustre_attrs[i], value, + strlen(value)); + break; + } + } + return count; +} + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + + ENTRY; + + lustre_kset = kset_create_and_add("lustre", NULL, fs_kobj); + if (!lustre_kset) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(&lustre_kset->kobj, &lustre_attr_group); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + rc = jobid_cache_init(); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + if (IS_ERR_OR_NULL(debugfs_lustre_root)) { + rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) + : -ENOMEM; + debugfs_lustre_root = NULL; + kset_unregister(lustre_kset); + goto out; + } + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + if (IS_ERR_OR_NULL(file)) { + rc = file ? PTR_ERR(file) : -ENOMEM; + debugfs_remove(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + debugfs_remove_recursive(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + jobid_cache_fini(); + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + sysfs_remove_group(&lustre_kset->kobj, &lustre_attr_group); + + kset_unregister(lustre_kset); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c index 7d14851f799f0..0367cfd1bef67 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,15 +43,14 @@ # include #endif #include -#include #include void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) { - dst->o_parent_oid = fid_oid(parent); - dst->o_parent_seq = fid_seq(parent); - dst->o_parent_ver = fid_ver(parent); - dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLPARENT | OBD_MD_FLFID; } EXPORT_SYMBOL(obdo_set_parent_fid); @@ -62,8 +61,10 @@ void obdo_set_o_projid(struct obdo *dst, u32 projid) } EXPORT_SYMBOL(obdo_set_o_projid); -/* WARNING: the file systems must take care not to tinker with - attributes they don't manage (such as blocks). */ +/* + * WARNING: the file systems must take care not to tinker with + * attributes they don't manage (such as blocks). 
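class_procfs_init() above has to undo the kset, the sysfs group and the debugfs pieces on every later failure path. The same unwinding is commonly written as a goto ladder with one label per acquired resource; a standalone sketch of that idiom with invented acquire/release helpers:

#include <stdio.h>

static int acquire_a(void)  { puts("A acquired");  return 0; }
static int acquire_b(void)  { puts("B acquired");  return 0; }
static int acquire_c(void)  { puts("C failed");    return -1; } /* simulated */
static void release_b(void) { puts("B released"); }
static void release_a(void) { puts("A released"); }

static int subsystem_init(void)
{
        int rc;

        rc = acquire_a();
        if (rc)
                goto out;
        rc = acquire_b();
        if (rc)
                goto out_a;
        rc = acquire_c();
        if (rc)
                goto out_b;
        return 0;                       /* everything is up */

out_b:
        release_b();
out_a:
        release_a();
out:
        return rc;
}

int main(void)
{
        printf("subsystem_init() = %d\n", subsystem_init());
        return 0;
}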
+ */ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) { u64 newvalid = 0; @@ -73,40 +74,40 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) valid, (s64) src->i_mtime.tv_sec, (s64) src->i_ctime.tv_sec); - if (valid & OBD_MD_FLATIME) { + if (valid & OBD_MD_FLATIME) { dst->o_atime = src->i_atime.tv_sec; - newvalid |= OBD_MD_FLATIME; - } - if (valid & OBD_MD_FLMTIME) { + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { dst->o_mtime = src->i_mtime.tv_sec; - newvalid |= OBD_MD_FLMTIME; - } - if (valid & OBD_MD_FLCTIME) { + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { dst->o_ctime = src->i_ctime.tv_sec; - newvalid |= OBD_MD_FLCTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->o_size = i_size_read(src); - newvalid |= OBD_MD_FLSIZE; - } - if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = src->i_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ dst->o_blksize = 1U << src->i_blkbits; - newvalid |= OBD_MD_FLBLKSZ; - } - if (valid & OBD_MD_FLTYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (src->i_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (src->i_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } if (valid & OBD_MD_FLUID) { dst->o_uid = from_kuid(&init_user_ns, src->i_uid); newvalid |= OBD_MD_FLUID; @@ -126,39 +127,39 @@ EXPORT_SYMBOL(obdo_from_inode); void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) { CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", - POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); - if (valid & OBD_MD_FLATIME) - dst->o_atime = src->o_atime; - if (valid & OBD_MD_FLMTIME) - dst->o_mtime = src->o_mtime; - if (valid & OBD_MD_FLCTIME) - dst->o_ctime = src->o_ctime; - if (valid & OBD_MD_FLSIZE) - dst->o_size = src->o_size; - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ - dst->o_blocks = src->o_blocks; - if (valid & OBD_MD_FLBLKSZ) - dst->o_blksize = src->o_blksize; - if (valid & OBD_MD_FLTYPE) - dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); - if (valid & OBD_MD_FLMODE) - dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); - if (valid & OBD_MD_FLUID) - dst->o_uid = src->o_uid; - if (valid & OBD_MD_FLGID) - dst->o_gid = src->o_gid; - if (valid & OBD_MD_FLFLAGS) - dst->o_flags = src->o_flags; - if (valid & OBD_MD_FLFID) { - dst->o_parent_seq = src->o_parent_seq; - dst->o_parent_ver = src->o_parent_ver; - } - if (valid & OBD_MD_FLGENER) - dst->o_parent_oid = src->o_parent_oid; - if (valid & OBD_MD_FLHANDLE) - dst->o_handle = src->o_handle; - - dst->o_valid |= valid; + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; 
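obdo_from_inode()/obdo_cpy_md() above keep the S_IFMT type bits and the permission bits under separate valid flags, so either half of o_mode can be updated without clobbering the other. A small runnable sketch of that split (FL_TYPE/FL_MODE are invented flag names, not the OBD_MD_* constants):

#include <stdio.h>
#include <sys/stat.h>

#define FL_TYPE 0x1
#define FL_MODE 0x2

static void copy_mode(unsigned int *dst, unsigned int src, unsigned int valid)
{
        if (valid & FL_TYPE)            /* replace only the S_IFMT bits */
                *dst = (*dst & ~S_IFMT) | (src & S_IFMT);
        if (valid & FL_MODE)            /* replace only the permission bits */
                *dst = (*dst & S_IFMT) | (src & ~S_IFMT);
}

int main(void)
{
        unsigned int mode = S_IFREG | 0644;

        copy_mode(&mode, S_IFDIR | 0777, FL_MODE);  /* perms change, type kept */
        printf("after FL_MODE: %o (still a regular file: %s)\n",
               mode, S_ISREG(mode) ? "yes" : "no");

        copy_mode(&mode, S_IFDIR, FL_TYPE);         /* type changes, perms kept */
        printf("after FL_TYPE: %o (now a directory: %s)\n",
               mode, S_ISDIR(mode) ? "yes" : "no");
        return 0;
}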
+ if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLPARENT) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; } EXPORT_SYMBOL(obdo_cpy_md); @@ -168,39 +169,48 @@ void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) ostid_set_seq_mdt0(&ioobj->ioo_oid); - /* Since 2.4 this does not contain o_mode in the low 16 bits. - * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + /* + * Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs + */ ioobj->ioo_max_brw = 0; } EXPORT_SYMBOL(obdo_to_ioobj); -/** +/* * Create an obdo to send over the wire */ void lustre_set_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *wobdo, - const struct obdo *lobdo) + struct obdo *wobdo, + const struct obdo *lobdo) { *wobdo = *lobdo; if (ocd == NULL) return; + if (!(wobdo->o_valid & OBD_MD_FLUID)) + wobdo->o_uid = from_kuid(&init_user_ns, current_uid()); + if (!(wobdo->o_valid & OBD_MD_FLGID)) + wobdo->o_gid = from_kgid(&init_user_ns, current_gid()); + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { - /* Currently OBD_FL_OSTID will only be used when 2.4 echo - * client communicate with pre-2.4 server */ + /* + * Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server + */ wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); } } EXPORT_SYMBOL(lustre_set_wire_obdo); -/** +/* * Create a local obdo from a wire based odbo */ void lustre_get_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *lobdo, - const struct obdo *wobdo) + struct obdo *lobdo, + const struct obdo *wobdo) { *lobdo = *wobdo; if (ocd == NULL) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c new file mode 100644 index 0000000000000..0f7f474f7fbb9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c @@ -0,0 +1,156 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c new file mode 100644 
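obdo_from_la() and la_from_obdo() above translate between the LA_* and OBD_MD_FL* namespaces with explicit if-chains, since each bit also selects a field to copy. For the validity bits alone, the same mapping can be table-driven; a sketch with invented flag values (not the real constants):

#include <stdio.h>
#include <stdint.h>

#define LA_ATIME   0x01
#define LA_MTIME   0x02
#define LA_SIZE    0x04
#define LA_UID     0x08

#define MD_FLATIME 0x100
#define MD_FLMTIME 0x200
#define MD_FLSIZE  0x400
#define MD_FLUID   0x800

static const struct { uint64_t la, md; } flag_map[] = {
        { LA_ATIME, MD_FLATIME },
        { LA_MTIME, MD_FLMTIME },
        { LA_SIZE,  MD_FLSIZE  },
        { LA_UID,   MD_FLUID   },
};

static uint64_t la_to_md(uint64_t la_valid)
{
        uint64_t md_valid = 0;

        for (size_t i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++)
                if (la_valid & flag_map[i].la)
                        md_valid |= flag_map[i].md;
        return md_valid;
}

int main(void)
{
        uint64_t md = la_to_md(LA_MTIME | LA_SIZE);

        printf("md_valid = %#llx\n", (unsigned long long)md);   /* 0x600 */
        return 0;
}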
index 0000000000000..b2e93c6dcc408 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c @@ -0,0 +1,1216 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/obdclass/scrub.c + * + * The OI scrub is used for checking and (re)building Object Index files + * that are usually backend special. Here are some general scrub related + * functions that can be shared by different backends for OI scrub. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LFSCK + +#include +#include +#include +#include + +static inline struct dt_device *scrub_obj2dev(struct dt_object *obj) +{ + return container_of0(obj->do_lu.lo_dev, struct dt_device, dd_lu_dev); +} + +static void scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = le64_to_cpu(src->sf_flags); + des->sf_magic = le32_to_cpu(src->sf_magic); + des->sf_status = le16_to_cpu(src->sf_status); + des->sf_param = le16_to_cpu(src->sf_param); + des->sf_time_last_complete = + le64_to_cpu(src->sf_time_last_complete); + des->sf_time_latest_start = + le64_to_cpu(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + le64_to_cpu(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + le64_to_cpu(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + le64_to_cpu(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + le64_to_cpu(src->sf_pos_first_inconsistent); + des->sf_items_checked = + le64_to_cpu(src->sf_items_checked); + des->sf_items_updated = + le64_to_cpu(src->sf_items_updated); + des->sf_items_failed = + le64_to_cpu(src->sf_items_failed); + des->sf_items_updated_prior = + le64_to_cpu(src->sf_items_updated_prior); + des->sf_run_time = le32_to_cpu(src->sf_run_time); + des->sf_success_count = le32_to_cpu(src->sf_success_count); + des->sf_oi_count = le16_to_cpu(src->sf_oi_count); + des->sf_internal_flags = le16_to_cpu(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void scrub_file_to_le(struct scrub_file *des, struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = cpu_to_le64(src->sf_flags); + des->sf_magic = cpu_to_le32(src->sf_magic); + des->sf_status = cpu_to_le16(src->sf_status); + des->sf_param = cpu_to_le16(src->sf_param); + des->sf_time_last_complete = + cpu_to_le64(src->sf_time_last_complete); + des->sf_time_latest_start = + cpu_to_le64(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + cpu_to_le64(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + cpu_to_le64(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + cpu_to_le64(src->sf_pos_last_checkpoint); + 
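scrub_file_to_cpu()/scrub_file_to_le() above convert every multi-byte field of the on-disk scrub file between little-endian and host order, so the file stays portable across hosts of either byte order. A cut-down userspace sketch of the same round trip, assuming <endian.h> (glibc/musl) and an invented three-field struct:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct scrub_disk {             /* what is written to storage (LE) */
        uint32_t magic;
        uint64_t items_checked;
        uint16_t status;
};

struct scrub_mem {              /* what the code works with (host order) */
        uint32_t magic;
        uint64_t items_checked;
        uint16_t status;
};

static void scrub_to_cpu(struct scrub_mem *des, const struct scrub_disk *src)
{
        des->magic         = le32toh(src->magic);
        des->items_checked = le64toh(src->items_checked);
        des->status        = le16toh(src->status);
}

static void scrub_to_le(struct scrub_disk *des, const struct scrub_mem *src)
{
        des->magic         = htole32(src->magic);
        des->items_checked = htole64(src->items_checked);
        des->status        = htole16(src->status);
}

int main(void)
{
        struct scrub_mem m = { .magic = 0x12345678, .items_checked = 42,
                               .status = 1 };   /* illustrative values */
        struct scrub_disk d;
        struct scrub_mem back;

        scrub_to_le(&d, &m);
        scrub_to_cpu(&back, &d);
        printf("round trip ok: %s\n",
               (m.magic == back.magic &&
                m.items_checked == back.items_checked &&
                m.status == back.status) ? "yes" : "no");
        return 0;
}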
des->sf_pos_first_inconsistent = + cpu_to_le64(src->sf_pos_first_inconsistent); + des->sf_items_checked = + cpu_to_le64(src->sf_items_checked); + des->sf_items_updated = + cpu_to_le64(src->sf_items_updated); + des->sf_items_failed = + cpu_to_le64(src->sf_items_failed); + des->sf_items_updated_prior = + cpu_to_le64(src->sf_items_updated_prior); + des->sf_run_time = cpu_to_le32(src->sf_run_time); + des->sf_success_count = cpu_to_le32(src->sf_success_count); + des->sf_oi_count = cpu_to_le16(src->sf_oi_count); + des->sf_internal_flags = cpu_to_le16(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid) +{ + struct scrub_file *sf = &scrub->os_file; + + memset(sf, 0, sizeof(*sf)); + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_magic = SCRUB_MAGIC_V1; + sf->sf_status = SS_INIT; +} +EXPORT_SYMBOL(scrub_file_init); + +void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags) +{ + struct scrub_file *sf = &scrub->os_file; + + CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " + "%#llx, add flags = %#llx\n", + scrub->os_name, sf->sf_flags, flags); + + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_status = SS_INIT; + sf->sf_flags |= flags; + sf->sf_flags &= ~SF_AUTO; + sf->sf_run_time = 0; + sf->sf_time_latest_start = 0; + sf->sf_time_last_checkpoint = 0; + sf->sf_pos_latest_start = 0; + sf->sf_pos_last_checkpoint = 0; + sf->sf_pos_first_inconsistent = 0; + sf->sf_items_checked = 0; + sf->sf_items_updated = 0; + sf->sf_items_failed = 0; + sf->sf_items_noscrub = 0; + sf->sf_items_igif = 0; + if (!scrub->os_in_join) + sf->sf_items_updated_prior = 0; +} +EXPORT_SYMBOL(scrub_file_reset); + +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + struct lu_buf buf = { + .lb_buf = &scrub->os_file_disk, + .lb_len = sizeof(scrub->os_file_disk) + }; + loff_t pos = 0; + int rc; + + rc = dt_read(env, scrub->os_obj, &buf, &pos); + /* failure */ + if (rc < 0) { + CERROR("%s: fail to load scrub file: rc = %d\n", + scrub->os_name, rc); + return rc; + } + + /* empty */ + if (!rc) + return -ENOENT; + + /* corrupted */ + if (rc < buf.lb_len) { + CDEBUG(D_LFSCK, "%s: fail to load scrub file, " + "expected = %d: rc = %d\n", + scrub->os_name, (int)buf.lb_len, rc); + return -EFAULT; + } + + scrub_file_to_cpu(sf, &scrub->os_file_disk); + if (sf->sf_magic != SCRUB_MAGIC_V1) { + CDEBUG(D_LFSCK, "%s: invalid scrub magic 0x%x != 0x%x\n", + scrub->os_name, sf->sf_magic, SCRUB_MAGIC_V1); + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(scrub_file_load); + +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file_disk; + struct dt_object *obj = scrub->os_obj; + struct dt_device *dev = scrub_obj2dev(obj); + struct lu_buf buf = { + .lb_buf = sf, + .lb_len = sizeof(*sf) + }; + struct thandle *th; + loff_t pos = 0; + int rc; + ENTRY; + + /* Skip store under rdonly mode. 
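scrub_file_load() above maps a read error, an empty file, a short read and a bad magic to distinct return codes before trusting the data. A standalone sketch of that load-and-validate flow; the file name, header layout and magic value are made up for the sketch:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct scrub_hdr {
        uint32_t magic;
        uint32_t status;
};

#define SCRUB_MAGIC 0x53435231u         /* invented for the sketch */

static int scrub_load(const char *path, struct scrub_hdr *hdr)
{
        FILE *f = fopen(path, "rb");
        size_t n;

        if (!f)
                return -errno;

        n = fread(hdr, 1, sizeof(*hdr), f);
        fclose(f);

        if (n == 0)
                return -ENOENT;         /* empty: never initialised */
        if (n < sizeof(*hdr))
                return -EFAULT;         /* truncated: treat as corrupted */
        if (hdr->magic != SCRUB_MAGIC)
                return -EFAULT;         /* wrong magic: corrupted */
        return 0;
}

int main(void)
{
        struct scrub_hdr hdr;
        int rc = scrub_load("scrub_file", &hdr);

        printf("scrub_load() = %d\n", rc);
        return 0;
}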
*/ + if (dev->dd_rdonly) + RETURN(0); + + scrub_file_to_le(sf, &scrub->os_file); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &buf, pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &buf, &pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +log: + if (rc) + CERROR("%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + else + CDEBUG(D_LFSCK, "%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + + scrub->os_time_last_checkpoint = ktime_get_seconds(); + scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + + SCRUB_CHECKPOINT_INTERVAL; + return rc; +} +EXPORT_SYMBOL(scrub_file_store); + +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + time64_t now = ktime_get_seconds(); + int rc; + + if (likely(now < scrub->os_time_next_checkpoint || + scrub->os_new_checked == 0)) + return 0; + + CDEBUG(D_LFSCK, "%s: OI scrub checkpoint at pos %llu\n", + scrub->os_name, scrub->os_pos_current); + + down_write(&scrub->os_rwsem); + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + sf->sf_run_time += now - scrub->os_time_last_checkpoint; + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + return rc; +} +EXPORT_SYMBOL(scrub_checkpoint); + +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + int rc; + ENTRY; + +again: + /* os_lock: sync status between stop and scrub thread */ + spin_lock(&scrub->os_lock); + if (thread_is_running(thread)) { + spin_unlock(&scrub->os_lock); + RETURN(-EALREADY); + } + + if (unlikely(thread_is_stopping(thread))) { + spin_unlock(&scrub->os_lock); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + goto again; + } + spin_unlock(&scrub->os_lock); + + if (scrub->os_file.sf_status == SS_COMPLETED) { + if (!(flags & SS_SET_FAILOUT)) + flags |= SS_CLEAR_FAILOUT; + + if (!(flags & SS_SET_DRYRUN)) + flags |= SS_CLEAR_DRYRUN; + + flags |= SS_RESET; + } + + scrub->os_start_flags = flags; + thread_set_flags(thread, 0); + task = kthread_run(threadfn, data, "OI_scrub"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start iteration thread: rc = %d\n", + scrub->os_name, rc); + RETURN(rc); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + RETURN(0); +} +EXPORT_SYMBOL(scrub_start); + +void scrub_stop(struct lustre_scrub *scrub) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + + /* os_lock: sync status between stop and scrub thread */ + spin_lock(&scrub->os_lock); + if (!thread_is_init(thread) && !thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&scrub->os_lock); + wake_up_all(&thread->t_ctl_waitq); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + /* Do not skip the last lock/unlock, which can guarantee that + * the caller cannot return until the OI scrub thread exit. 
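scrub_checkpoint() above only persists state when new items have been processed and the checkpoint interval has elapsed, and a successful store schedules the next checkpoint. A minimal sketch of that throttling; the 60-second interval and the persist() helper are assumptions for illustration:

#include <stdio.h>
#include <time.h>

#define CHECKPOINT_INTERVAL 60          /* assumed interval for the sketch */

struct scrub_state {
        unsigned long long new_checked; /* work done since last checkpoint */
        time_t next_checkpoint;
};

static void persist(struct scrub_state *s)
{
        printf("checkpoint written (%llu new items)\n", s->new_checked);
        s->new_checked = 0;
        s->next_checkpoint = time(NULL) + CHECKPOINT_INTERVAL;
}

static void maybe_checkpoint(struct scrub_state *s)
{
        time_t now = time(NULL);

        if (now < s->next_checkpoint || s->new_checked == 0)
                return;                 /* too soon, or nothing new */
        persist(s);
}

int main(void)
{
        struct scrub_state s = { .new_checked = 1000, .next_checkpoint = 0 };

        maybe_checkpoint(&s);           /* interval elapsed: writes */
        s.new_checked = 5;
        maybe_checkpoint(&s);           /* within interval: skipped */
        return 0;
}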
*/ + spin_lock(&scrub->os_lock); + } + spin_unlock(&scrub->os_lock); +} +EXPORT_SYMBOL(scrub_stop); + +const char *scrub_status_names[] = { + "init", + "scanning", + "completed", + "failed", + "stopped", + "paused", + "crashed", + NULL +}; + +const char *scrub_flags_names[] = { + "recreated", + "inconsistent", + "auto", + "upgrade", + NULL +}; + +const char *scrub_param_names[] = { + "failout", + "dryrun", + NULL +}; + +static void scrub_bits_dump(struct seq_file *m, int bits, const char *names[], + const char *prefix) +{ + int flag; + int i; + + seq_printf(m, "%s:%c", prefix, bits != 0 ? ' ' : '\n'); + + for (i = 0, flag = 1; bits != 0; i++, flag = 1 << i) { + if (flag & bits) { + bits &= ~flag; + seq_printf(m, "%s%c", names[i], + bits != 0 ? ',' : '\n'); + } + } +} + +static void scrub_time_dump(struct seq_file *m, time64_t time, + const char *prefix) +{ + if (time != 0) + seq_printf(m, "%s: %llu seconds\n", prefix, + ktime_get_real_seconds() - time); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +static void scrub_pos_dump(struct seq_file *m, __u64 pos, const char *prefix) +{ + if (pos != 0) + seq_printf(m, "%s: %llu\n", prefix, pos); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + u64 checked; + s64 speed; + + down_read(&scrub->os_rwsem); + seq_printf(m, "name: OI_scrub\n" + "magic: 0x%x\n" + "oi_files: %d\n" + "status: %s\n", + sf->sf_magic, (int)sf->sf_oi_count, + scrub_status_names[sf->sf_status]); + + scrub_bits_dump(m, sf->sf_flags, scrub_flags_names, "flags"); + + scrub_bits_dump(m, sf->sf_param, scrub_param_names, "param"); + + scrub_time_dump(m, sf->sf_time_last_complete, + "time_since_last_completed"); + + scrub_time_dump(m, sf->sf_time_latest_start, + "time_since_latest_start"); + + scrub_time_dump(m, sf->sf_time_last_checkpoint, + "time_since_last_checkpoint"); + + scrub_pos_dump(m, sf->sf_pos_latest_start, + "latest_start_position"); + + scrub_pos_dump(m, sf->sf_pos_last_checkpoint, + "last_checkpoint_position"); + + scrub_pos_dump(m, sf->sf_pos_first_inconsistent, + "first_failure_position"); + + checked = sf->sf_items_checked + scrub->os_new_checked; + seq_printf(m, "checked: %llu\n" + "%s: %llu\n" + "failed: %llu\n" + "prior_%s: %llu\n" + "noscrub: %llu\n" + "igif: %llu\n" + "success_count: %u\n", + checked, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated, sf->sf_items_failed, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated_prior, sf->sf_items_noscrub, + sf->sf_items_igif, sf->sf_success_count); + + speed = checked; + if (thread_is_running(&scrub->os_thread)) { + s64 new_checked = scrub->os_new_checked; + time64_t duration; + time64_t rtime; + + /* Since the time resolution is in seconds for new system + * or small devices it ismore likely that duration will be + * zero which will lead to inaccurate results. + */ + duration = ktime_get_seconds() - + scrub->os_time_last_checkpoint; + if (duration != 0) + new_checked = div_s64(new_checked, duration); + + rtime = sf->sf_run_time + duration; + if (rtime != 0) + speed = div_s64(speed, rtime); + + seq_printf(m, "run_time: %lld seconds\n" + "average_speed: %lld objects/sec\n" + "real-time_speed: %lld objects/sec\n" + "current_position: %llu\n" + "scrub_in_prior: %s\n" + "scrub_full_speed: %s\n" + "partial_scan: %s\n", + rtime, speed, new_checked, + scrub->os_pos_current, + scrub->os_in_prior ? "yes" : "no", + scrub->os_full_speed ? 
"yes" : "no", + scrub->os_partial_scan ? "yes" : "no"); + } else { + if (sf->sf_run_time != 0) + speed = div_s64(speed, sf->sf_run_time); + seq_printf(m, "run_time: %ld seconds\n" + "average_speed: %lld objects/sec\n" + "real-time_speed: N/A\n" + "current_position: N/A\n", + sf->sf_run_time, speed); + } + + up_read(&scrub->os_rwsem); +} +EXPORT_SYMBOL(scrub_dump); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen) +{ + struct lustre_index_restore_unit *liru; + int len = sizeof(*liru) + namelen + 1; + + OBD_ALLOC(liru, len); + if (!liru) + return -ENOMEM; + + INIT_LIST_HEAD(&liru->liru_link); + liru->liru_pfid = *pfid; + liru->liru_cfid = *cfid; + liru->liru_clid = child; + liru->liru_len = len; + memcpy(liru->liru_name, name, namelen); + liru->liru_name[namelen] = 0; + list_add_tail(&liru->liru_link, head); + + return 0; +} +EXPORT_SYMBOL(lustre_liru_new); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize) +{ + struct lustre_index_backup_unit *libu, *pos; + int rc = 0; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN(1); + + OBD_ALLOC_PTR(libu); + if (!libu) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&libu->libu_link); + libu->libu_keysize = keysize; + libu->libu_recsize = recsize; + libu->libu_fid = *fid; + + spin_lock(lock); + if (unlikely(*guard)) { + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(1); + } + + list_for_each_entry_reverse(pos, head, libu_link) { + rc = lu_fid_cmp(&pos->libu_fid, fid); + if (rc < 0) { + list_add(&libu->libu_link, &pos->libu_link); + spin_unlock(lock); + + RETURN(0); + } + + if (!rc) { + /* Registered already. But the former registered one + * has different keysize/recsize. It may because that + * the former values are from disk and corrupted, then + * replace it with new values. */ + if (unlikely(keysize != pos->libu_keysize || + recsize != pos->libu_recsize)) { + CWARN("%s: the index "DFID" has registered " + "with %u/%u, may be invalid, replace " + "with %u/%u\n", + devname, PFID(fid), pos->libu_keysize, + pos->libu_recsize, keysize, recsize); + + pos->libu_keysize = keysize; + pos->libu_recsize = recsize; + } else { + rc = 1; + } + + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(rc); + } + } + + list_add(&libu->libu_link, head); + spin_unlock(lock); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_index_register); + +static void lustre_index_degister(struct list_head *head, spinlock_t *lock, + const struct lu_fid *fid) +{ + struct lustre_index_backup_unit *libu; + int rc = -ENOENT; + + spin_lock(lock); + list_for_each_entry_reverse(libu, head, libu_link) { + rc = lu_fid_cmp(&libu->libu_fid, fid); + /* NOT registered. 
*/ + if (rc < 0) + break; + + if (!rc) { + list_del(&libu->libu_link); + break; + } + } + spin_unlock(lock); + + if (!rc) + OBD_FREE_PTR(libu); +} + +static void +lustre_index_backup_make_header(struct lustre_index_backup_header *header, + __u32 keysize, __u32 recsize, + const struct lu_fid *fid, __u32 count) +{ + memset(header, 0, sizeof(*header)); + header->libh_magic = cpu_to_le32(INDEX_BACKUP_MAGIC_V1); + header->libh_count = cpu_to_le32(count); + header->libh_keysize = cpu_to_le32(keysize); + header->libh_recsize = cpu_to_le32(recsize); + fid_cpu_to_le(&header->libh_owner, fid); +} + +static int lustre_index_backup_body(const struct lu_env *env, + struct dt_object *obj, loff_t *pos, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = buf, + .lb_len = bufsize + }; + int rc; + ENTRY; + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, *pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_header(const struct lu_env *env, + struct dt_object *obj, + const struct lu_fid *tgt_fid, + __u32 keysize, __u32 recsize, + void *buf, int bufsize, int count) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_index_backup_header *header = buf; + struct lu_attr *la = buf; + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = header, + .lb_len = sizeof(*header) + }; + loff_t size = sizeof(*header) + (keysize + recsize) * count; + loff_t pos = 0; + int rc; + bool punch = false; + ENTRY; + + LASSERT(sizeof(*la) <= bufsize); + LASSERT(sizeof(*header) <= bufsize); + + rc = dt_attr_get(env, obj, la); + if (rc) + RETURN(rc); + + if (la->la_size > size) + punch = true; + + lustre_index_backup_make_header(header, keysize, recsize, + tgt_fid, count); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, pos, th); + if (rc) + GOTO(stop, rc); + + if (punch) { + rc = dt_declare_punch(env, obj, size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + } + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, &pos, th); + if (!rc && punch) + rc = dt_punch(env, obj, size, OBD_OBJECT_EOF, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_update_lma(const struct lu_env *env, + struct dt_object *obj, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_mdt_attrs *lma = buf; + struct lu_buf lbuf = { + .lb_buf = lma, + .lb_len = sizeof(struct lustre_ost_attrs) + }; + struct thandle *th; + int fl = LU_XATTR_REPLACE; + int rc; + ENTRY; + + LASSERT(bufsize >= lbuf.lb_len); + + rc = dt_xattr_get(env, obj, &lbuf, XATTR_NAME_LMA); + if (unlikely(rc == -ENODATA)) { + fl = LU_XATTR_CREATE; + lustre_lma_init(lma, lu_object_fid(&obj->do_lu), + LMAC_IDX_BACKUP, 0); + rc = sizeof(*lma); + } else if (rc < sizeof(*lma)) { + RETURN(rc < 0 ? 
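lustre_index_register()/lustre_index_degister() above keep the backup list sorted by FID, splice a new entry in at its sorted position, and merely refresh the key/record sizes of an entry that is already registered. A userspace sketch of that insert-or-update logic, with an int key standing in for the struct lu_fid:

#include <stdio.h>
#include <stdlib.h>

struct index_unit {
        struct index_unit *next;
        int fid;                        /* stand-in for the FID key */
        unsigned keysize, recsize;
};

static struct index_unit *head;

/* Returns 0 if newly inserted, 1 if the key was already registered. */
static int index_register(int fid, unsigned keysize, unsigned recsize)
{
        struct index_unit **pp = &head;

        while (*pp && (*pp)->fid < fid)
                pp = &(*pp)->next;

        if (*pp && (*pp)->fid == fid) {         /* already registered */
                if ((*pp)->keysize != keysize || (*pp)->recsize != recsize) {
                        (*pp)->keysize = keysize;   /* trust the new sizes */
                        (*pp)->recsize = recsize;
                }
                return 1;
        }

        struct index_unit *u = malloc(sizeof(*u));

        u->fid = fid;
        u->keysize = keysize;
        u->recsize = recsize;
        u->next = *pp;
        *pp = u;                                /* splice in, keeping order */
        return 0;
}

int main(void)
{
        printf("%d\n", index_register(20, 8, 16));   /* 0: new */
        printf("%d\n", index_register(10, 8, 16));   /* 0: new, sorts first */
        printf("%d\n", index_register(20, 8, 32));   /* 1: duplicate, updated */

        for (struct index_unit *u = head; u; u = u->next)
                printf("fid=%d key=%u rec=%u\n", u->fid, u->keysize, u->recsize);
        return 0;
}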
rc : -EFAULT); + } else { + lustre_lma_swab(lma); + if (lma->lma_compat & LMAC_IDX_BACKUP) + RETURN(0); + + lma->lma_compat |= LMAC_IDX_BACKUP; + } + + lustre_lma_swab(lma); + lbuf.lb_len = rc; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(rc); + + rc = dt_declare_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_one(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + struct lustre_index_backup_unit *libu, + char *buf, int bufsize) +{ + struct dt_device *dev = scrub_obj2dev(parent); + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + const struct dt_it_ops *iops; + struct dt_it *di; + loff_t pos = sizeof(struct lustre_index_backup_header); + int count = 0; + int size = 0; + int rc; + ENTRY; + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &libu->libu_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + if (!dt_object_exists(tgt_obj)) + GOTO(out, rc = 0); + + if (!tgt_obj->do_index_ops) { + struct dt_index_features feat; + + feat.dif_flags = DT_IND_UPDATE; + feat.dif_keysize_min = libu->libu_keysize; + feat.dif_keysize_max = libu->libu_keysize; + feat.dif_recsize_min = libu->libu_recsize; + feat.dif_recsize_max = libu->libu_recsize; + feat.dif_ptrsize = 4; + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, &feat); + if (rc) + GOTO(out, rc); + } + + lustre_fid2lbx(buf, &libu->libu_fid, bufsize); + bak_obj = local_file_find_or_create(env, los, parent, buf, + S_IFREG | S_IRUGO | S_IWUSR); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? 
PTR_ERR(bak_obj) : -ENOENT); + + iops = &tgt_obj->do_index_ops->dio_it; + di = iops->init(env, tgt_obj, 0); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + rc = iops->load(env, di, 0); + if (!rc) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + while (!rc) { + void *key; + void *rec; + + key = iops->key(env, di); + memcpy(&buf[size], key, libu->libu_keysize); + size += libu->libu_keysize; + rec = &buf[size]; + rc = iops->rec(env, di, rec, 0); + if (rc) + GOTO(fini, rc); + + size += libu->libu_recsize; + count++; + if (size + libu->libu_keysize + libu->libu_recsize > bufsize) { + rc = lustre_index_backup_body(env, bak_obj, &pos, + buf, size); + if (rc) + GOTO(fini, rc); + + size = 0; + } + + rc = iops->next(env, di); + } + + if (rc >= 0 && size > 0) + rc = lustre_index_backup_body(env, bak_obj, &pos, buf, size); + + if (rc < 0) + GOTO(fini, rc); + + rc = lustre_index_backup_header(env, bak_obj, &libu->libu_fid, + libu->libu_keysize, libu->libu_recsize, + buf, bufsize, count); + if (!rc) + rc = lustre_index_update_lma(env, tgt_obj, buf, bufsize); + + if (!rc && OBD_FAIL_CHECK(OBD_FAIL_OSD_INDEX_CRASH)) { + LASSERT(bufsize >= 512); + + pos = 0; + memset(buf, 0, 512); + lustre_index_backup_body(env, tgt_obj, &pos, buf, 512); + } + + GOTO(fini, rc); + +fini: + iops->fini(env, di); +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + return rc; +} + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup) +{ + struct lustre_index_backup_unit *libu; + struct local_oid_storage *los = NULL; + struct dt_object *parent = NULL; + char *buf = NULL; + struct lu_fid fid; + int rc; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN_EXIT; + + spin_lock(lock); + *guard = 1; + spin_unlock(lock); + + if (list_empty(head)) + RETURN_EXIT; + + /* Handle kinds of failures during mount process. */ + if (!dev->dd_lu_dev.ld_site || !dev->dd_lu_dev.ld_site->ls_top_dev) + backup = false; + + if (backup) { + OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE); + if (!buf) { + backup = false; + goto scan; + } + + lu_local_obj_fid(&fid, INDEX_BACKUP_OID); + parent = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &fid, NULL)); + if (IS_ERR_OR_NULL(parent)) { + CERROR("%s: failed to locate backup dir: rc = %ld\n", + devname, parent ? 
PTR_ERR(parent) : -ENOENT); + backup = false; + goto scan; + } + + lu_local_name_obj_fid(&fid, 1); + rc = local_oid_storage_init(env, dev, &fid, &los); + if (rc) { + CERROR("%s: failed to init local storage: rc = %d\n", + devname, rc); + backup = false; + } + } + +scan: + spin_lock(lock); + while (!list_empty(head)) { + libu = list_entry(head->next, + struct lustre_index_backup_unit, libu_link); + list_del_init(&libu->libu_link); + spin_unlock(lock); + + if (backup) { + rc = lustre_index_backup_one(env, los, parent, libu, + buf, INDEX_BACKUP_BUFSIZE); + CDEBUG(D_WARNING, "%s: backup index "DFID": rc = %d\n", + devname, PFID(&libu->libu_fid), rc); + } + + OBD_FREE_PTR(libu); + spin_lock(lock); + } + spin_unlock(lock); + + if (los) + local_oid_storage_fini(env, los); + if (parent) + dt_object_put_nocache(env, parent); + if (buf) + OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE); + + EXIT; +} +EXPORT_SYMBOL(lustre_index_backup); + +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize) +{ + struct dt_object *parent_obj = NULL; + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + struct lustre_index_backup_header *header; + struct dt_index_features *feat; + struct dt_object_format *dof; + struct lu_attr *la; + struct thandle *th; + struct lu_object_conf conf; + struct dt_insert_rec ent; + struct lu_buf lbuf; + struct lu_fid tfid; + loff_t pos = 0; + __u32 keysize; + __u32 recsize; + __u32 pairsize; + int count; + int rc; + bool registered = false; + ENTRY; + + LASSERT(bufsize >= sizeof(*la) + sizeof(*dof) + + sizeof(*feat) + sizeof(*header)); + + memset(buf, 0, bufsize); + la = (struct lu_attr *)buf; + dof = (void *)la + sizeof(*la); + feat = (void *)dof + sizeof(*dof); + header = (void *)feat + sizeof(*feat); + lbuf.lb_buf = header; + lbuf.lb_len = sizeof(*header); + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + bak_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + bak_fid, NULL)); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? PTR_ERR(bak_obj) : -ENOENT); + + if (!dt_object_exists(bak_obj)) + GOTO(out, rc = -ENOENT); + + parent_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + parent_fid, NULL)); + if (IS_ERR_OR_NULL(parent_obj)) + GOTO(out, rc = parent_obj ? PTR_ERR(parent_obj) : -ENOENT); + + LASSERT(dt_object_exists(parent_obj)); + + if (unlikely(!dt_try_as_dir(env, parent_obj))) + GOTO(out, rc = -ENOTDIR); + + rc = dt_attr_get(env, tgt_obj, la); + if (rc) + GOTO(out, rc); + + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + if (rc) + GOTO(out, rc); + + if (le32_to_cpu(header->libh_magic) != INDEX_BACKUP_MAGIC_V1) + GOTO(out, rc = -EINVAL); + + fid_le_to_cpu(&tfid, &header->libh_owner); + if (unlikely(!lu_fid_eq(tgt_fid, &tfid))) + GOTO(out, rc = -EINVAL); + + keysize = le32_to_cpu(header->libh_keysize); + recsize = le32_to_cpu(header->libh_recsize); + pairsize = keysize + recsize; + + memset(feat, 0, sizeof(*feat)); + feat->dif_flags = DT_IND_UPDATE; + feat->dif_keysize_min = feat->dif_keysize_max = keysize; + feat->dif_recsize_min = feat->dif_recsize_max = recsize; + feat->dif_ptrsize = 4; + + /* T1: remove old name entry and destroy old index. 
*/ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_delete(env, parent_obj, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_destroy(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_delete(env, parent_obj, (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_destroy(env, tgt_obj, th); + dt_write_unlock(env, tgt_obj); + dt_trans_stop(env, dev, th); + if (rc) + GOTO(out, rc); + + la->la_valid = LA_MODE | LA_UID | LA_GID; + conf.loc_flags = LOC_F_NEW; + dof->u.dof_idx.di_feat = feat; + dof->dof_type = DFT_INDEX; + ent.rec_type = S_IFREG; + ent.rec_fid = tgt_fid; + + /* Drop cache before re-create it. */ + dt_object_put_nocache(env, tgt_obj); + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, &conf)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + LASSERT(!dt_object_exists(tgt_obj)); + + /* T2: create new index and insert new name entry. */ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, tgt_obj, la, NULL, dof, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_create(env, tgt_obj, la, NULL, dof, th); + dt_write_unlock(env, tgt_obj); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + dt_trans_stop(env, dev, th); + /* Some index name may has been inserted by OSD + * automatically when create the index object. */ + if (unlikely(rc == -EEXIST)) + rc = 0; + if (rc) + GOTO(out, rc); + + /* The new index will register via index_try. */ + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, feat); + if (rc) + GOTO(out, rc); + + registered = true; + count = le32_to_cpu(header->libh_count); + while (!rc && count > 0) { + int size = pairsize * count; + int items = count; + int i; + + if (size > bufsize) { + items = bufsize / pairsize; + size = pairsize * items; + } + + lbuf.lb_buf = buf; + lbuf.lb_len = size; + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + for (i = 0; i < items && !rc; i++) { + void *key = &buf[i * pairsize]; + void *rec = &buf[i * pairsize + keysize]; + + /* Tn: restore the records. */ + th = dt_trans_create(env, dev); + if (!th) + GOTO(out, rc = -ENOMEM); + + rc = dt_declare_insert(env, tgt_obj, rec, key, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, tgt_obj, rec, key, th); + if (unlikely(rc == -EEXIST)) + rc = 0; + + dt_trans_stop(env, dev, th); + } + + count -= items; + } + + GOTO(out, rc); + +stop: + dt_trans_stop(env, dev, th); + if (rc && registered) + /* Degister the index to avoid overwriting the backup. 
*/ + lustre_index_degister(head, lock, tgt_fid); + +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + if (!IS_ERR_OR_NULL(parent_obj)) + dt_object_put_nocache(env, parent_obj); + return rc; +} +EXPORT_SYMBOL(lustre_index_restore); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c index 2a36051e52356..9c52f8094e9fe 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -46,28 +46,28 @@ void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) { - memset(osfs, 0, sizeof(*osfs)); - osfs->os_type = sfs->f_type; - osfs->os_blocks = sfs->f_blocks; - osfs->os_bfree = sfs->f_bfree; - osfs->os_bavail = sfs->f_bavail; - osfs->os_files = sfs->f_files; - osfs->os_ffree = sfs->f_ffree; - osfs->os_bsize = sfs->f_bsize; - osfs->os_namelen = sfs->f_namelen; + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; } EXPORT_SYMBOL(statfs_pack); void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) { - memset(sfs, 0, sizeof(*sfs)); - sfs->f_type = osfs->os_type; - sfs->f_blocks = osfs->os_blocks; - sfs->f_bfree = osfs->os_bfree; - sfs->f_bavail = osfs->os_bavail; - sfs->f_files = osfs->os_files; - sfs->f_ffree = osfs->os_ffree; - sfs->f_bsize = osfs->os_bsize; - sfs->f_namelen = osfs->os_namelen; + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; } EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c index 2112733e50c54..5622410784d7a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -35,9 +35,8 @@ */ #define DEBUG_SUBSYSTEM S_SEC -#include #include -#include +#include #include static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, @@ -115,14 +114,14 @@ static inline void put_entry(struct upcall_cache *cache, static int check_unlink_entry(struct upcall_cache *cache, struct upcall_cache_entry *entry) { - if (UC_CACHE_IS_VALID(entry) && - cfs_time_before(cfs_time_current(), entry->ue_expire)) + time64_t now = ktime_get_seconds(); + + if (UC_CACHE_IS_VALID(entry) && now < entry->ue_expire) return 0; if (UC_CACHE_IS_ACQUIRING(entry)) { if (entry->ue_acquire_expire == 0 || - cfs_time_before(cfs_time_current(), - entry->ue_acquire_expire)) + now < entry->ue_acquire_expire) return 0; UC_CACHE_SET_EXPIRED(entry); @@ -198,8 +197,8 @@ struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, spin_unlock(&cache->uc_lock); rc = refresh_entry(cache, entry); spin_lock(&cache->uc_lock); - entry->ue_acquire_expire = - cfs_time_shift(cache->uc_acquire_expire); + entry->ue_acquire_expire = ktime_get_seconds() + + cache->uc_acquire_expire; if (rc < 0) { UC_CACHE_CLEAR_ACQUIRING(entry); UC_CACHE_SET_INVALID(entry); @@ -340,7 +339,7 @@ int 
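The upcall_cache hunks above move the expiry logic to plain seconds (ktime_get_seconds()) and compare "now" against an absolute expiry stamp. A userspace sketch of that scheme using CLOCK_MONOTONIC as a stand-in for the kernel clock; the entry struct and the 300-second lifetime are invented:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct cache_entry {
        time_t ue_expire;               /* absolute monotonic second of expiry */
};

static time_t now_seconds(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec;
}

static bool entry_is_valid(const struct cache_entry *e)
{
        return now_seconds() < e->ue_expire;
}

static void entry_refresh(struct cache_entry *e, time_t expire_interval)
{
        e->ue_expire = now_seconds() + expire_interval;
}

int main(void)
{
        struct cache_entry e = { .ue_expire = 0 };

        printf("un-refreshed entry valid? %s\n",
               entry_is_valid(&e) ? "yes" : "no");
        entry_refresh(&e, 300);         /* e.g. a 5-minute lifetime */
        printf("after refresh valid?     %s\n",
               entry_is_valid(&e) ? "yes" : "no");
        return 0;
}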
upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, if (rc) GOTO(out, rc); - entry->ue_expire = cfs_time_shift(cache->uc_entry_expire); + entry->ue_expire = ktime_get_seconds() + cache->uc_entry_expire; UC_CACHE_SET_VALID(entry); CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", cache->uc_name, entry, entry->ue_key); @@ -400,10 +399,10 @@ void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) if (found) { CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " - "cur %lu, ex %ld/%ld\n", + "cur %lld, ex %lld/%lld\n", cache->uc_name, entry, entry->ue_key, atomic_read(&entry->ue_refcount), entry->ue_flags, - cfs_time_current_sec(), entry->ue_acquire_expire, + ktime_get_real_seconds(), entry->ue_acquire_expire, entry->ue_expire); UC_CACHE_SET_EXPIRED(entry); if (!atomic_read(&entry->ue_refcount)) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/uuid.c b/drivers/staging/lustrefsx/lustre/obdclass/uuid.c deleted file mode 100644 index cc0092687511b..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/uuid.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/obdclass/uuid.c - * - * Public include file for the UUID library - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -static inline size_t consume(size_t nob, __u8 **ptr) -{ - size_t value; - - LASSERT(nob <= sizeof(value)); - - for (value = 0; nob > 0; --nob) - value = (value << 8) | *((*ptr)++); - return value; -} - -#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) - -static void uuid_unpack(class_uuid_t in, __u16 *uu, size_t nr) -{ - __u8 *ptr = in; - - LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t)); - - while (nr-- > 0) - CONSUME(uu[nr], &ptr); -} - -void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) -{ - /* uu as an array of __u16's */ - __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; - - CLASSERT(ARRAY_SIZE(uuid) == 8); - - uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); - sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", - uuid[0], uuid[1], uuid[2], uuid[3], - uuid[4], uuid[5], uuid[6], uuid[7]); -} -EXPORT_SYMBOL(class_uuid_unparse); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c index de7fd77920392..0f97a830f9b37 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -59,6 +59,21 @@ enum { LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1 }; +struct echo_srv_device { + struct lu_device esd_dev; + struct lu_target esd_lut; +}; + +static inline struct echo_srv_device *echo_srv_dev(struct lu_device *d) +{ + return container_of0(d, struct echo_srv_device, esd_dev); +} + +static inline struct obd_device *echo_srv_obd(struct echo_srv_device *esd) +{ + return esd->esd_dev.ld_obd; +} + static int echo_connect(const struct lu_env *env, struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, @@ -115,115 +130,6 @@ static u64 echo_next_id(struct obd_device *obddev) return id; } -static int echo_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - return -EINVAL; - } - - if (!(oa->o_mode & S_IFMT)) { - CERROR("echo obd: no type!\n"); - return -ENOENT; - } - - if (!(oa->o_valid & OBD_MD_FLTYPE)) { - CERROR("invalid o_valid %#llx\n", oa->o_valid); - return -EINVAL; - } - - ostid_set_seq_echo(&oa->o_oi); - if (ostid_set_id(&oa->o_oi, echo_next_id(obd))) { - CERROR("Bad %llu to set " DOSTID "\n", - echo_next_id(obd), POSTID(&oa->o_oi)); - return -EINVAL; - } - oa->o_valid = OBD_MD_FLID; - - return 0; -} - -static int echo_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - - ENTRY; - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); - RETURN(-EINVAL); - } - - if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || - ostid_id(&oa->o_oi) < ECHO_INIT_OID) { - CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi)); - RETURN(-EINVAL); - } - - RETURN(0); -} - -static int echo_getattr(const 
struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - u64 id = ostid_id(&oa->o_oi); - - ENTRY; - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); - RETURN(-EINVAL); - } - - obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid); - ostid_set_seq_echo(&oa->o_oi); - if (ostid_set_id(&oa->o_oi, id)) { - CERROR("Bad %llu to set " DOSTID "\n", - id, POSTID(&oa->o_oi)); - RETURN(-EINVAL); - } - - RETURN(0); -} - -static int echo_setattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - - ENTRY; - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); - RETURN(-EINVAL); - } - - obd->u.echo.eo_oa = *oa; - - RETURN(0); -} - static void echo_page_debug_setup(struct page *page, int rw, u64 id, __u64 offset, int len) @@ -548,41 +454,317 @@ static int echo_commitrw(const struct lu_env *env, int cmd, LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); static struct lprocfs_vars lprocfs_echo_obd_vars[] = { - { .name = "uuid", - .fops = &echo_uuid_fops }, + { .name = "uuid", + .fops = &echo_uuid_fops }, { NULL } }; -static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, +}; + +/** + * Echo Server request handler for OST_CREATE RPC. + * + * This is part of request processing. It simulates the object + * creation on OST. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_create_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + struct obdo *rep_oa; + + ENTRY; + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + if (!(oa->o_mode & S_IFMT)) { + CERROR("%s: no type is set in obdo!\n", + tsi->tsi_exp->exp_obd->obd_name); + RETURN(-ENOENT); + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("%s: invalid o_valid in obdo: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + rep_oa = &repbody->oa; + + if (!fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("%s: invalid seq %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, ostid_seq(&oa->o_oi)); + return -EINVAL; + } + + ostid_set_seq_echo(&rep_oa->o_oi); + ostid_set_id(&rep_oa->o_oi, echo_next_id(obd)); + + CDEBUG(D_INFO, "%s: Create object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&rep_oa->o_oi)); + + rep_oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_DESTROY RPC. + * + * This is Echo Server part of request handling. It simulates the objects + * destroy on OST.
+ * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_destroy_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + u64 oid; + + ENTRY; + + oid = ostid_id(&oa->o_oi); + LASSERT(oid != 0); + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("%s: bad objid to destroy: "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + CDEBUG(D_INFO, "%s: Destroy object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + + repbody->oa.o_oi = oa->o_oi; + RETURN(0); +} + +/** + * Echo Server request handler for OST_GETATTR RPC. + * + * This is Echo Server part of request handling. It returns an object + * attributes to the client. All objects have the same attributes in + * Echo Server. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_getattr_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + repbody->oa.o_oi = oa->o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obdo_cpy_md(&repbody->oa, &obd->u.echo.eo_oa, oa->o_valid); + + repbody->oa.o_valid |= OBD_MD_FLFLAGS; + repbody->oa.o_flags = OBD_FL_FLUSH; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_SETATTR RPC. + * + * This is Echo Server part of request handling. It sets common + * attributes from request to the Echo Server objects. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_setattr_hdl(struct tgt_session_info *tsi) +{ + struct ost_body *body = tsi->tsi_ost_body; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, + body->oa.o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + repbody->oa.o_oi = body->oa.o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obd->u.echo.eo_oa = body->oa; + + RETURN(0); +} + +#define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET +#define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET +#define OST_BRW_READ OST_READ +#define OST_BRW_WRITE OST_WRITE + +/** + * Table of Echo Server specific request handlers + * + * This table contains all opcodes accepted by Echo Server and + * specifies handlers for them. The tgt_request_handler() + * uses such table from each target to process incoming + * requests. 
+ */ +static struct tgt_handler esd_tgt_handlers[] = { +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_CONNECT, tgt_connect, + &RQF_CONNECT, LUSTRE_OBD_VERSION), +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_DISCONNECT, tgt_disconnect, + &RQF_OST_DISCONNECT, LUSTRE_OBD_VERSION), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_GETATTR, esd_getattr_hdl), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR, OST_SETATTR, + esd_setattr_hdl), +TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_CREATE, esd_create_hdl), +TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_DESTROY, esd_destroy_hdl), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read), +TGT_OST_HDL(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write), +}; + +static struct tgt_opc_slice esd_common_slice[] = { + { + .tos_opc_start = OST_FIRST_OPC, + .tos_opc_end = OST_LAST_OPC, + .tos_hs = esd_tgt_handlers + }, + { + .tos_opc_start = OBD_FIRST_OPC, + .tos_opc_end = OBD_LAST_OPC, + .tos_hs = tgt_obd_handlers + }, + { + .tos_opc_start = LDLM_FIRST_OPC, + .tos_opc_end = LDLM_LAST_OPC, + .tos_hs = tgt_dlm_handlers + }, + { + .tos_opc_start = SEC_FIRST_OPC, + .tos_opc_end = SEC_LAST_OPC, + .tos_hs = tgt_sec_ctx_handlers + }, + { + .tos_hs = NULL + } +}; + +/** + * lu_device_operations matrix for ECHO SRV device is NULL, + * this device is just serving incoming requests immediately + * without building a stack of lu_devices. + */ +static struct lu_device_operations echo_srv_lu_ops = { 0 }; + +/** + * Initialize Echo Server device with parameters in the config log \a cfg. + * + * This is the main starting point of Echo Server initialization. It fills all + * parameters with their initial values and starts Echo Server. + * + * \param[in] env execution environment + * \param[in] m Echo Server device + * \param[in] ldt LU device type of Echo Server + * \param[in] cfg configuration log + * + * \retval 0 if successful + * \retval negative value on error + */ +static int echo_srv_init0(const struct lu_env *env, + struct echo_srv_device *esd, + struct lu_device_type *ldt, struct lustre_cfg *cfg) { - int rc; - __u64 lock_flags = 0; - struct ldlm_res_id res_id = {.name = {1}}; - char ns_name[48]; + const char *dev = lustre_cfg_string(cfg, 0); + struct obd_device *obd; + char ns_name[48]; + int rc; + ENTRY; - obd->u.echo.eo_obt.obt_magic = OBT_MAGIC; + obd = class_name2obd(dev); + if (obd == NULL) { + CERROR("Cannot find obd with name %s\n", dev); + RETURN(-ENODEV); + } + spin_lock_init(&obd->u.echo.eo_lock); - obd->u.echo.eo_lastino = ECHO_INIT_OID; - - sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid); - obd->obd_namespace = ldlm_namespace_new(obd, ns_name, - LDLM_NAMESPACE_SERVER, - LDLM_NAMESPACE_MODEST, - LDLM_NS_TYPE_OST); - if (obd->obd_namespace == NULL) { - LBUG(); - RETURN(-ENOMEM); - } + obd->u.echo.eo_lastino = ECHO_INIT_OID; - rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN, - NULL, LCK_NL, &lock_flags, NULL, - ldlm_completion_ast, NULL, NULL, 0, - LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock); - LASSERT (rc == ELDLM_OK); + esd->esd_dev.ld_ops = &echo_srv_lu_ops; + esd->esd_dev.ld_obd = obd; + /* set this lu_device to obd, because error handling need it */ + obd->obd_lu_dev = &esd->esd_dev; + + /* No connection accepted until configurations will finish */ + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 1; + spin_unlock(&obd->obd_dev_lock); + + /* non-replayable target */ + obd->obd_replayable = 0; + + snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + 
LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (obd->obd_namespace == NULL) + RETURN(-ENOMEM); obd->obd_vars = lprocfs_echo_obd_vars; - if (lprocfs_obd_setup(obd) == 0 && + if (!lprocfs_obd_setup(obd, true) && lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, LPROCFS_CNTR_AVGMINMAX, @@ -594,48 +776,158 @@ static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, "echo_ldlm_cb_client", &obd->obd_ldlm_client); - RETURN(0); + + rc = tgt_init(env, &esd->esd_lut, obd, NULL, esd_common_slice, + OBD_FAIL_OST_ALL_REQUEST_NET, + OBD_FAIL_OST_ALL_REPLY_NET); + if (rc) + GOTO(err_out, rc); + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + spin_unlock(&obd->obd_dev_lock); + + RETURN(0); + +err_out: + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + RETURN(rc); } -static int echo_cleanup(struct obd_device *obd) +/** + * Stop the Echo Server device. + * + * This function stops the Echo Server device and all its subsystems. + * This is the end of Echo Server lifecycle. + * + * \param[in] env execution environment + * \param[in] esd ESD device + */ +static void echo_srv_fini(const struct lu_env *env, + struct echo_srv_device *esd) { + struct obd_device *obd = echo_srv_obd(esd); + struct lu_device *d = &esd->esd_dev; int leaked; + ENTRY; - lprocfs_obd_cleanup(obd); - lprocfs_free_obd_stats(obd); + class_disconnect_exports(obd); + if (obd->obd_namespace != NULL) + ldlm_namespace_free_prior(obd->obd_namespace, NULL, + obd->obd_force); - ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL); + obd_exports_barrier(obd); + obd_zombie_barrier(); - /* XXX Bug 3413; wait for a bit to ensure the BL callback has - * happened before calling ldlm_namespace_free() */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); + tgt_fini(env, &esd->esd_lut); - ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); - obd->obd_namespace = NULL; + if (obd->obd_namespace != NULL) { + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + } + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); leaked = atomic_read(&obd->u.echo.eo_prep); if (leaked != 0) CERROR("%d prep/commitrw pages leaked\n", leaked); - RETURN(0); + LASSERT(atomic_read(&d->ld_ref) == 0); + EXIT; } -struct obd_ops echo_obd_ops = { - .o_owner = THIS_MODULE, - .o_connect = echo_connect, - .o_disconnect = echo_disconnect, - .o_init_export = echo_init_export, - .o_destroy_export = echo_destroy_export, - .o_create = echo_create, - .o_destroy = echo_destroy, - .o_getattr = echo_getattr, - .o_setattr = echo_setattr, - .o_preprw = echo_preprw, - .o_commitrw = echo_commitrw, - .o_setup = echo_setup, - .o_cleanup = echo_cleanup +/** + * Implementation of lu_device_type_operations::ldto_device_fini. + * + * Finalize device. Dual to echo_srv_device_init(). It is called from + * obd_precleanup() and stops the current device. + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + ENTRY; + echo_srv_fini(env, echo_srv_dev(d)); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_free. + * + * Free Echo Server device. Dual to echo_srv_device_alloc(). 
+ * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_srv_device *esd = echo_srv_dev(d); + + lu_device_fini(&esd->esd_dev); + OBD_FREE_PTR(esd); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_alloc. + * + * This function allocates the new Echo Server device. It is called from + * obd_setup() if OBD device had lu_device_type defined. + * + * \param[in] env execution environment + * \param[in] t lu_device_type of ESD device + * \param[in] cfg configuration log + * + * \retval pointer to the lu_device of just allocated OFD + * \retval ERR_PTR of return value on error + */ +static struct lu_device *echo_srv_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct echo_srv_device *esd; + struct lu_device *l; + int rc; + + OBD_ALLOC_PTR(esd); + if (esd == NULL) + return ERR_PTR(-ENOMEM); + + l = &esd->esd_dev; + lu_device_init(l, t); + rc = echo_srv_init0(env, esd, t, cfg); + if (rc != 0) { + echo_srv_device_free(env, l); + l = ERR_PTR(rc); + } + + return l; +} + +static const struct lu_device_type_operations echo_srv_type_ops = { + .ldto_device_alloc = echo_srv_device_alloc, + .ldto_device_free = echo_srv_device_free, + .ldto_device_fini = echo_srv_device_fini +}; + +struct lu_device_type echo_srv_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_ECHO_NAME, + .ldt_ops = &echo_srv_type_ops, + .ldt_ctx_tags = LCT_DT_THREAD, }; void echo_persistent_pages_fini(void) diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c index 26065b110e592..53620c7e19c37 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #ifdef HAVE_SERVER_SUPPORT # include @@ -328,7 +328,8 @@ static void echo_page_completion(const struct lu_env *env, } static void echo_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) + struct cl_page_slice *slice, + struct pagevec *pvec) { struct echo_object *eco = cl2echo_obj(slice->cpl_obj); ENTRY; @@ -506,11 +507,18 @@ static int echo_object_init(const struct lu_env *env, struct lu_object *obj, RETURN(0); } -static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +static void echo_object_delete(const struct lu_env *env, struct lu_object *obj) { - struct echo_object *eco = cl2echo_obj(lu2cl(obj)); - struct echo_client_obd *ec = eco->eo_dev->ed_ec; - ENTRY; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec; + + ENTRY; + + /* object delete called unconditionally - layer init or not */ + if (eco->eo_dev == NULL) + return; + + ec = eco->eo_dev->ed_ec; LASSERT(atomic_read(&eco->eo_npages) == 0); @@ -518,11 +526,18 @@ static void echo_object_free(const struct lu_env *env, struct lu_object *obj) list_del_init(&eco->eo_obj_chain); spin_unlock(&ec->ec_lock); - lu_object_fini(obj); - lu_object_header_fini(obj->lo_header); - if (eco->eo_oinfo != NULL) OBD_FREE_PTR(eco->eo_oinfo); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + + ENTRY; + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); OBD_SLAB_FREE_PTR(eco, echo_object_kmem); EXIT; @@ -537,12 +552,12 @@ static int echo_object_print(const struct lu_env *env, void *cookie, } static const struct lu_object_operations echo_lu_obj_ops = { - .loo_object_init = echo_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = echo_object_free, - .loo_object_print = echo_object_print, - .loo_object_invariant = NULL + .loo_object_init = echo_object_init, + .loo_object_delete = echo_object_delete, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL }; /** @} echo_lu_ops */ @@ -962,19 +977,18 @@ static struct lu_device *echo_device_alloc(const struct lu_env *env, CERROR("Cleanup obd device %s error(%d)\n", obd->obd_name, rc2); } - /* Fall through */ + /* fallthrough */ case 3: echo_site_fini(env, ed); - /* Fall through */ + /* fallthrough */ case 2: cl_device_fini(&ed->ed_cl); - /* Fall through */ + /* fallthrough */ case 1: OBD_FREE_PTR(ed); - /* Fall through */ + /* fallthrough */ case 0: - /* Fall through */ default: break; } @@ -1714,7 +1728,7 @@ static int echo_create_md_object(const struct lu_env *env, memset(spec, 0, sizeof(*spec)); echo_set_lmm_size(env, ld, ma); if (stripe_count != 0) { - spec->sp_cr_flags |= FMODE_WRITE; + spec->sp_cr_flags |= MDS_FMODE_WRITE; if (stripe_count != -1) { if (S_ISDIR(mode)) { struct lmv_user_md *lmu; @@ -1742,7 +1756,7 @@ static int echo_create_md_object(const struct lu_env *env, ma->ma_attr.la_mode = mode; ma->ma_attr.la_valid = LA_CTIME | LA_MODE; - ma->ma_attr.la_ctime = cfs_time_current_64(); + ma->ma_attr.la_ctime = ktime_get_real_seconds(); if (name != NULL) { lname->ln_name = name; @@ -2085,7 +2099,7 @@ static int echo_destroy_object(const struct lu_env *env, memset(ma, 0, sizeof(*ma)); ma->ma_attr.la_mode = mode; ma->ma_attr.la_valid = LA_CTIME; - ma->ma_attr.la_ctime = 
cfs_time_current_64(); + ma->ma_attr.la_ctime = ktime_get_real_seconds(); ma->ma_need = MA_INODE; ma->ma_valid = 0; @@ -2579,11 +2593,11 @@ static int echo_client_prep_commit(const struct lu_env *env, u64 offset, u64 count, u64 batch, int async) { - struct obd_ioobj ioo; - struct niobuf_local *lnb; - struct niobuf_remote rnb; - u64 off; - u64 npages, tot_pages, apc; + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; int i, ret = 0, brw_flags = 0; ENTRY; @@ -2594,7 +2608,7 @@ static int echo_client_prep_commit(const struct lu_env *env, apc = npages = batch >> PAGE_SHIFT; tot_pages = count >> PAGE_SHIFT; - OBD_ALLOC(lnb, apc * sizeof(struct niobuf_local)); + OBD_ALLOC_LARGE(lnb, apc * sizeof(struct niobuf_local)); if (lnb == NULL) RETURN(-ENOMEM); @@ -2660,7 +2674,7 @@ static int echo_client_prep_commit(const struct lu_env *env, } out: - OBD_FREE(lnb, apc * sizeof(struct niobuf_local)); + OBD_FREE_LARGE(lnb, apc * sizeof(struct niobuf_local)); RETURN(ret); } @@ -2762,6 +2776,9 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = lu_env_init(env, LCT_DT_THREAD); if (rc) GOTO(out_alloc, rc = -ENOMEM); + lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc = -ENOMEM); #ifdef HAVE_SERVER_SUPPORT env->le_ses = &echo_session; @@ -2903,6 +2920,8 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, lu_context_fini(env->le_ses); out_env: #endif + lu_env_remove(env); +out_env_fini: lu_env_fini(env); out_alloc: OBD_FREE_PTR(env); @@ -3072,15 +3091,15 @@ static int __init obdecho_init(void) goto failed_0; rc = class_register_type(&echo_obd_ops, NULL, true, NULL, - LUSTRE_ECHO_NAME, NULL); + LUSTRE_ECHO_NAME, &echo_srv_type); if (rc != 0) goto failed_1; # endif rc = lu_kmem_init(echo_caches); if (rc == 0) { - rc = class_register_type(&echo_client_obd_ops, NULL, true, NULL, - LUSTRE_ECHO_CLIENT_NAME, + rc = class_register_type(&echo_client_obd_ops, NULL, false, + NULL, LUSTRE_ECHO_CLIENT_NAME, &echo_device_type); if (rc) lu_kmem_fini(echo_caches); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h index 8c72c40ebb767..469d68e94f02f 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -45,6 +45,7 @@ #ifdef HAVE_SERVER_SUPPORT extern struct obd_ops echo_obd_ops; +extern struct lu_device_type echo_srv_type; int echo_persistent_pages_init(void); void echo_persistent_pages_fini(void); #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c index d6123c61af113..ab8cfca3601eb 100644 --- a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,69 +37,78 @@ #include #include #include +#include + #include "osc_internal.h" -#ifdef CONFIG_PROC_FS -static int osc_active_seq_show(struct seq_file *m, void *v) +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + int rc; LPROCFS_CLIMP_CHECK(dev); - seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + rc = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); LPROCFS_CLIMP_EXIT(dev); - return 0; + return rc; } -static ssize_t osc_active_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - if (val < 0 || val > 1) - return -ERANGE; /* opposite senses */ if (dev->u.cli.cl_import->imp_deactive == val) rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); else - CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", - (int)val); + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + (unsigned int)val); return count; } -LPROC_SEQ_FOPS(osc_active); +LUSTRE_RW_ATTR(active); -static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + ssize_t len; spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + len = sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); spin_unlock(&cli->cl_loi_list_lock); - return 0; + return len; } -static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; - int rc; int adding, added, req_count; - __s64 val; + unsigned int val; + int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 1 || val > OSC_MAX_RIF_MAX) + + if (val == 0 || val > OSC_MAX_RIF_MAX) return -ERANGE; LPROCFS_CLIMP_CHECK(dev); @@ -126,41 +135,42 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, LPROCFS_CLIMP_EXIT(dev); return count; } -LPROC_SEQ_FOPS(osc_max_rpcs_in_flight); +LUSTRE_RW_ATTR(max_rpcs_in_flight); -static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +static ssize_t max_dirty_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - long val; - int mult; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + unsigned long val; 
spin_lock(&cli->cl_loi_list_lock); - val = cli->cl_dirty_max_pages; + val = PAGES_TO_MiB(cli->cl_dirty_max_pages); spin_unlock(&cli->cl_loi_list_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, val, mult); + return sprintf(buf, "%lu\n", val); } -static ssize_t osc_max_dirty_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t max_dirty_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + unsigned long pages_number, max_dirty_mb; int rc; - __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = kstrtoul(buffer, 10, &max_dirty_mb); if (rc) return rc; - pages_number >>= PAGE_SHIFT; + pages_number = MiB_TO_PAGES(max_dirty_mb); - if (pages_number <= 0 || - pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || + if (pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; @@ -171,7 +181,12 @@ static ssize_t osc_max_dirty_mb_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(osc_max_dirty_mb); +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_ATTR(ost_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); static int osc_cached_mb_seq_show(struct seq_file *m, void *v) { @@ -191,9 +206,9 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v) } /* shrink the number of caching pages to a specific number */ -static ssize_t -osc_cached_mb_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; @@ -204,14 +219,13 @@ osc_cached_mb_seq_write(struct file *file, const char __user *buffer, if (count >= sizeof(kernbuf)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); if (rc) return rc; @@ -234,19 +248,25 @@ osc_cached_mb_seq_write(struct file *file, const char __user *buffer, return count; } + LPROC_SEQ_FOPS(osc_cached_mb); -static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v) +static ssize_t cur_dirty_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + ssize_t len; spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); + len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); spin_unlock(&cli->cl_loi_list_lock); - return 0; + + return len; } -LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes); +LUSTRE_RO_ATTR(cur_dirty_bytes); static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) { @@ -265,17 +285,17 @@ static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, { 
struct obd_device *obd = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &obd->u.cli; - int rc; - __s64 val; + s64 val; + int rc; if (obd == NULL) return 0; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); if (rc) return rc; if (val < 0) - return -ERANGE; + return val; /* this is only for shrinking grant */ spin_lock(&cli->cl_loi_list_lock); @@ -290,102 +310,89 @@ static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) rc = osc_shrink_grant_to_target(cli, val); LPROCFS_CLIMP_EXIT(obd); - if (rc) - return rc; - return count; -} -LPROC_SEQ_FOPS(osc_cur_grant_bytes); - -static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%lu\n", cli->cl_lost_grant); - spin_unlock(&cli->cl_loi_list_lock); - return 0; + return rc ? rc : count; } -LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); +LPROC_SEQ_FOPS(osc_cur_grant_bytes); -static int osc_cur_dirty_grant_bytes_seq_show(struct seq_file *m, void *v) +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + ssize_t len; spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%lu\n", cli->cl_dirty_grant); + len = sprintf(buf, "%lu\n", cli->cl_lost_grant); spin_unlock(&cli->cl_loi_list_lock); - return 0; + return len; } -LPROC_SEQ_FOPS_RO(osc_cur_dirty_grant_bytes); +LUSTRE_RO_ATTR(cur_lost_grant_bytes); -static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - if (obd == NULL) - return 0; - seq_printf(m, "%d\n", - obd->u.cli.cl_grant_shrink_interval); - return 0; + return sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); } -static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t grant_shrink_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - if (obd == NULL) - return 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val <= 0 || val > INT_MAX) + if (val == 0) return -ERANGE; obd->u.cli.cl_grant_shrink_interval = val; return count; } -LPROC_SEQ_FOPS(osc_grant_shrink_interval); +LUSTRE_RW_ATTR(grant_shrink_interval); -static int osc_checksum_seq_show(struct seq_file *m, void *v) +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - if (obd == NULL) - return 0; - - seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); - return 0; + return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 
1 : 0); } -static ssize_t osc_checksum_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t checksums_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - if (obd == NULL) - return 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - obd->u.cli.cl_checksum = !!val; + obd->u.cli.cl_checksum = val; return count; } -LPROC_SEQ_FOPS(osc_checksum); +LUSTRE_RW_ATTR(checksums); static int osc_checksum_type_seq_show(struct seq_file *m, void *v) { @@ -422,7 +429,7 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, if (count > sizeof(kernbuf) - 1) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; if (count > 0 && kernbuf[count - 1] == '\n') kernbuf[count - 1] = '\0'; @@ -441,139 +448,147 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, } LPROC_SEQ_FOPS(osc_checksum_type); -static int osc_resend_count_seq_show(struct seq_file *m, void *v) +static ssize_t resend_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends)); - return 0; + return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); } -static ssize_t osc_resend_count_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t resend_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -EINVAL; - atomic_set(&obd->u.cli.cl_resends, val); return count; } -LPROC_SEQ_FOPS(osc_resend_count); +LUSTRE_RW_ATTR(resend_count); -static int osc_checksum_dump_seq_show(struct seq_file *m, void *v) +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - - if (obd == NULL) - return 0; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - seq_printf(m, "%d\n", obd->u.cli.cl_checksum_dump ? 1 : 0); - return 0; + return sprintf(buf, "%d\n", obd->u.cli.cl_checksum_dump ? 1 : 0); } -static ssize_t osc_checksum_dump_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - obd = ((struct seq_file *)file->private_data)->private; - if (obd == NULL) - return 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - obd->u.cli.cl_checksum_dump = (val ? 
1 : 0); + obd->u.cli.cl_checksum_dump = val; return count; } -LPROC_SEQ_FOPS(osc_checksum_dump); +LUSTRE_RW_ATTR(checksum_dump); -static int osc_contention_seconds_seq_show(struct seq_file *m, void *v) +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); - seq_printf(m, "%u\n", od->od_contention_time); - return 0; + return sprintf(buf, "%lld\n", od->od_contention_time); } -static ssize_t osc_contention_seconds_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + unsigned int val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; od->od_contention_time = val; return count; } -LPROC_SEQ_FOPS(osc_contention_seconds); +LUSTRE_RW_ATTR(contention_seconds); -static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v) +static ssize_t lockless_truncate_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); - seq_printf(m, "%u\n", od->od_lockless_truncate); - return 0; + return sprintf(buf, "%u\n", od->od_lockless_truncate); } -static ssize_t osc_lockless_truncate_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t lockless_truncate_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - if (val < 0) - return -ERANGE; - od->od_lockless_truncate = !!val; + od->od_lockless_truncate = val; return count; } -LPROC_SEQ_FOPS(osc_lockless_truncate); +LUSTRE_RW_ATTR(lockless_truncate); -static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v) +static ssize_t destroys_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - seq_printf(m, "%u\n", - atomic_read(&obd->u.cli.cl_destroy_in_flight)); - return 0; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); } -LPROC_SEQ_FOPS_RO(osc_destroys_in_flight); +LUSTRE_RO_ATTR(destroys_in_flight); LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); +LUSTRE_RW_ATTR(short_io_bytes); + +#ifdef CONFIG_PROC_FS static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) { struct obd_device *dev 
= m->private; @@ -591,84 +606,154 @@ static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) } LPROC_SEQ_FOPS_RO(osc_unstable_stats); -LPROC_SEQ_FOPS_RO_TYPE(osc, uuid); +static ssize_t idle_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int ret; + + LPROCFS_CLIMP_CHECK(obd); + ret = sprintf(buf, "%u\n", cli->cl_import->imp_idle_timeout); + LPROCFS_CLIMP_EXIT(obd); + + return ret; +} + +static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request *req; + unsigned int idle_debug = 0; + unsigned int val; + int rc; + + if (strncmp(buffer, "debug", 5) == 0) { + idle_debug = D_CONSOLE; + } else if (strncmp(buffer, "nodebug", 6) == 0) { + idle_debug = D_HA; + } else { + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val > CONNECTION_SWITCH_MAX) + return -ERANGE; + } + + LPROCFS_CLIMP_CHECK(dev); + if (idle_debug) { + cli->cl_import->imp_idle_debug = idle_debug; + } else { + if (!val) { + /* initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, + &RQF_OST_STATFS); + if (req != NULL) + ptlrpc_req_finished(req); + } + cli->cl_import->imp_idle_timeout = val; + } + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_RW_ATTR(idle_timeout); + +static ssize_t idle_connect_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request *req; + + LPROCFS_CLIMP_CHECK(dev); + /* to initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_STATFS); + if (req) + ptlrpc_req_finished(req); + ptlrpc_pinger_force(cli->cl_import); + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_WO_ATTR(idle_connect); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + len = snprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, GRANT_SHRINK)); + LPROCFS_CLIMP_EXIT(obd); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (dev == NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + LPROCFS_CLIMP_CHECK(dev); + + imp = dev->u.cli.cl_import; + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_RW_ATTR(grant_shrink); + LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(osc, blksize); -LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal); -LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree); -LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail); -LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal); -LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree); LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); 
-LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); LPROC_SEQ_FOPS_RO_TYPE(osc, state); -LPROC_SEQ_FOPS_WO_TYPE(osc, ping); - LPROC_SEQ_FOPS_RW_TYPE(osc, import); LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); struct lprocfs_vars lprocfs_osc_obd_vars[] = { - { .name = "uuid", - .fops = &osc_uuid_fops }, - { .name = "ping", - .fops = &osc_ping_fops, - .proc_mode = 0222 }, { .name = "connect_flags", .fops = &osc_connect_flags_fops }, - { .name = "blocksize", - .fops = &osc_blksize_fops }, - { .name = "kbytestotal", - .fops = &osc_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &osc_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &osc_kbytesavail_fops }, - { .name = "filestotal", - .fops = &osc_filestotal_fops }, - { .name = "filesfree", - .fops = &osc_filesfree_fops }, { .name = "ost_server_uuid", .fops = &osc_server_uuid_fops }, - { .name = "ost_conn_uuid", - .fops = &osc_conn_uuid_fops }, - { .name = "active", - .fops = &osc_active_fops }, - { .name = "max_pages_per_rpc", - .fops = &osc_obd_max_pages_per_rpc_fops }, - { .name = "max_rpcs_in_flight", - .fops = &osc_max_rpcs_in_flight_fops }, - { .name = "destroys_in_flight", - .fops = &osc_destroys_in_flight_fops }, - { .name = "max_dirty_mb", - .fops = &osc_max_dirty_mb_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, { .name = "osc_cached_mb", .fops = &osc_cached_mb_fops }, - { .name = "cur_dirty_bytes", - .fops = &osc_cur_dirty_bytes_fops }, - { .name = "cur_grant_bytes", - .fops = &osc_cur_grant_bytes_fops }, - { .name = "cur_lost_grant_bytes", - .fops = &osc_cur_lost_grant_bytes_fops }, - { .name = "cur_dirty_grant_bytes", - .fops = &osc_cur_dirty_grant_bytes_fops }, - { .name = "grant_shrink_interval", - .fops = &osc_grant_shrink_interval_fops }, - { .name = "checksums", - .fops = &osc_checksum_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, { .name = "checksum_type", .fops = &osc_checksum_type_fops }, - { .name = "checksum_dump", - .fops = &osc_checksum_dump_fops }, - { .name = "resend_count", - .fops = &osc_resend_count_fops }, { .name = "timeouts", .fops = &osc_timeouts_fops }, - { .name = "contention_seconds", - .fops = &osc_contention_seconds_fops }, - { .name = "lockless_truncate", - .fops = &osc_lockless_truncate_fops }, { .name = "import", .fops = &osc_import_fops }, { .name = "state", @@ -680,8 +765,6 @@ struct lprocfs_vars lprocfs_osc_obd_vars[] = { { NULL } }; -#define pct(a,b) (b ? a * 100 / b : 0) - static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) { struct timespec64 now; @@ -720,7 +803,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", 1 << i, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), @@ -743,7 +826,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", i, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), @@ -766,10 +849,10 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - (i == 0) ? 
0 : 1 << (i - 1), - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); if (read_cum == read_tot && write_cum == write_tot) break; } @@ -778,7 +861,6 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) return 0; } -#undef pct static ssize_t osc_rpc_stats_seq_write(struct file *file, const char __user *buf, @@ -832,7 +914,7 @@ static ssize_t osc_stats_seq_write(struct file *file, LPROC_SEQ_FOPS(osc_stats); -int lproc_osc_attach_seqstat(struct obd_device *dev) +int lprocfs_osc_attach_seqstat(struct obd_device *dev) { int rc; @@ -845,3 +927,77 @@ int lproc_osc_attach_seqstat(struct obd_device *dev) return rc; } #endif /* CONFIG_PROC_FS */ + +static struct attribute *osc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_dump.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_cur_dirty_bytes.attr, + &lustre_attr_cur_lost_grant_bytes.attr, + &lustre_attr_destroys_in_flight.attr, + &lustre_attr_grant_shrink_interval.attr, + &lustre_attr_lockless_truncate.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_short_io_bytes.attr, + &lustre_attr_resend_count.attr, + &lustre_attr_ost_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_idle_timeout.attr, + &lustre_attr_idle_connect.attr, + &lustre_attr_grant_shrink.attr, + NULL, +}; + +int osc_tunables_init(struct obd_device *obd) +{ +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + struct obd_type *type; +#endif + int rc; + + obd->obd_vars = lprocfs_osc_obd_vars; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + /* If this is true then both client (osc) and server (osp) are on the + * same node. The osp layer if loaded first will register the osc proc + * directory. In that case this obd_device will be attached its proc + * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. + */ + type = class_search_type(LUSTRE_OSP_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } +#endif + obd->obd_ktype.default_attrs = osc_attrs; + rc = lprocfs_obd_setup(obd, false); + if (rc) + return rc; +#ifdef CONFIG_PROC_FS + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. + */ + rc = lprocfs_osc_attach_seqstat(obd); + if (rc) + goto obd_cleanup; + +#endif /* CONFIG_PROC_FS */ + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) + goto obd_cleanup; + + ptlrpc_lprocfs_register_obd(obd); +obd_cleanup: + if (rc) + lprocfs_obd_cleanup(obd); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c index 178340e255ac9..5652e74222bea 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
* */ /* @@ -37,7 +37,9 @@ #define DEBUG_SUBSYSTEM S_OSC -#include "osc_cl_internal.h" +#include +#include + #include "osc_internal.h" static int extent_debug; /* set it to be true for more debug */ @@ -214,7 +216,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, GOTO(out, rc = 60); if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) GOTO(out, rc = 65); - /* Fall through */ + /* fallthrough */ default: if (atomic_read(&ext->oe_users) > 0) GOTO(out, rc = 70); @@ -226,7 +228,9 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, if (ext->oe_sync && ext->oe_grants > 0) GOTO(out, rc = 90); - if (ext->oe_dlmlock != NULL && !ldlm_is_failed(ext->oe_dlmlock)) { + if (ext->oe_dlmlock != NULL && + ext->oe_dlmlock->l_resource->lr_type == LDLM_EXTENT && + !ldlm_is_failed(ext->oe_dlmlock)) { struct ldlm_extent *extent; extent = &ext->oe_dlmlock->l_policy_data.l_extent; @@ -592,7 +596,10 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) if (grant > 0) osc_unreserve_grant(cli, 0, grant); - if (ext->oe_urgent) + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); else if (ext->oe_nr_pages == ext->oe_mppr) { @@ -697,7 +704,7 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); - if (chunk > ext_chk_end + 1) + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) break; /* if covering by different locks, no chance to match */ @@ -974,6 +981,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, struct client_obd *cli = osc_cli(obj); struct osc_async_page *oap; struct osc_async_page *tmp; + struct pagevec *pvec; int pages_in_chunk = 0; int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; @@ -995,9 +1003,11 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, if (IS_ERR(env)) RETURN(PTR_ERR(env)); - io = &osc_env_info(env)->oti_io; + io = osc_env_thread_io(env); io->ci_obj = cl_object_top(osc2cl(obj)); io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pvec, 0); rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); if (rc < 0) GOTO(out, rc); @@ -1035,11 +1045,13 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, } lu_ref_del(&page->cp_reference, "truncate", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pvec); --ext->oe_nr_pages; ++nr_pages; } + pagevec_release(pvec); + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, ext->oe_nr_pages == 0), ext, "trunc_index %lu, partial %d\n", trunc_index, partial); @@ -1284,7 +1296,7 @@ static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, ENTRY; result = cl_page_make_ready(env, page, CRT_WRITE); if (result == 0) - opg->ops_submit_time = cfs_time_current(); + opg->ops_submit_time = ktime_get(); RETURN(result); } @@ -1295,7 +1307,6 @@ static int osc_refresh_count(const struct lu_env *env, pgoff_t index = osc_index(oap2osc(oap)); struct cl_object *obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int result; loff_t kms; @@ -1341,7 +1352,7 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, /* Clear opg->ops_transfer_pinned before VM lock is released. 
*/ opg->ops_transfer_pinned = 0; - opg->ops_submit_time = 0; + opg->ops_submit_time = ktime_set(0, 0); srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; /* statistic */ @@ -1392,7 +1403,6 @@ static void osc_consume_write_grant(struct client_obd *cli, { assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - atomic_long_inc(&obd_dirty_pages); cli->cl_dirty_pages++; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", @@ -1416,11 +1426,6 @@ static void osc_release_write_grant(struct client_obd *cli, pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_long_dec(&obd_dirty_pages); cli->cl_dirty_pages--; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - atomic_long_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit--; - } EXIT; } @@ -1526,7 +1531,7 @@ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) */ static int osc_enter_cache_try(struct client_obd *cli, struct osc_async_page *oap, - int bytes, int transient) + int bytes) { int rc; @@ -1536,31 +1541,38 @@ static int osc_enter_cache_try(struct client_obd *cli, if (rc < 0) return 0; - if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && - 1 + atomic_long_read(&obd_dirty_pages) <= obd_max_dirty_pages) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit++; - atomic_long_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + return 1; } - rc = 1; - } else { - __osc_unreserve_grant(cli, bytes, bytes); - rc = 0; + atomic_long_dec(&obd_dirty_pages); } - return rc; + __osc_unreserve_grant(cli, bytes, bytes); + return 0; } -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. + */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) { - int rc; - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - return rc; + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); } +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1571,15 +1583,23 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; - struct l_wait_info lwi; - int rc = -EDQUOT; - ENTRY; + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. 
due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), - NULL, LWI_ON_SIGNAL_NOOP, NULL); + ENTRY; OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); @@ -1594,76 +1614,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* Hopefully normal case - cache space and write credits available */ - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - GOTO(out, rc = 0); - } - - /* We can get here for two reasons: too many dirty pages in cache, or + /* + * We can wait here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. */ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc < 0) { - /* l_wait_event is interrupted by signal or timed out */ - list_del_init(&ocw.ocw_entry); - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. 
+ */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: + } else { OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " - "due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; + wake_up_all(&cli->cl_cache_waiters); } EXIT; out: @@ -1671,41 +1655,6 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, RETURN(rc); } -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - list_del_init(&ocw->ocw_entry); - - ocw->ocw_rc = -EDQUOT; - /* we can't dirty more */ - if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) || - (1 + atomic_long_read(&obd_dirty_pages) > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld " - "osc max %ld, sys max %ld\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages, - obd_max_dirty_pages); - goto wakeup; - } - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) - ocw->ocw_rc = 0; -wakeup: - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } - - EXIT; -} - static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = !!list_empty(&osc->oo_hp_exts); @@ -1745,8 +1694,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. 
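A compressed sketch of the new waiting scheme in osc_enter_cache(), restating the hunk above with annotations. It assumes the wait_event_idle_exclusive_timeout_cmd() behaviour implied by the code: the first extra command runs before each sleep, the second runs after each wake-up, and the return value is the number of jiffies remaining (0 on timeout).

	/* cl_loi_list_lock is held at this point */
	remain = wait_event_idle_exclusive_timeout_cmd(
			cli->cl_cache_waiters,
			(entered = osc_enter_cache_try(cli, oap, bytes)) ||
			(cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0),
			timeout,
			cli_unlock_and_unplug(env, cli, oap),	/* pre-sleep */
			cli_lock_after_unplug(cli));		/* post-wake */

	/* entered             : grant obtained (remain == timeout means it
	 *                       was available without sleeping)
	 * !entered, remain==0 : timed out, fall back to synchronous I/O
	 * !entered, remain> 0 : nothing dirty and nothing in flight, so no
	 *                       grant is coming, also fall back to sync I/O */

With adaptive timeouts disabled and the default obd_timeout of 100 seconds, the wait is therefore capped at roughly 50 seconds, i.e. half of the nominal eviction window described in the comment above.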
+ */ + if (waitqueue_active(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -1968,6 +1918,7 @@ static int try_to_add_extent_for_io(struct client_obd *cli, if (tmp->oe_srvlock != ext->oe_srvlock || !tmp->oe_grants != !ext->oe_grants || + tmp->oe_ndelay != ext->oe_ndelay || tmp->oe_no_merge || ext->oe_no_merge) RETURN(0); @@ -2043,7 +1994,6 @@ static unsigned int get_write_extents(struct osc_object *obj, while (!list_empty(&obj->oo_hp_exts)) { ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, oe_link); - LASSERT(ext->oe_state == OES_CACHE); if (!try_to_add_extent_for_io(cli, ext, &data)) return data.erd_page_count; EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); @@ -2229,8 +2179,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!list_empty(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); @@ -2261,7 +2212,12 @@ __must_hold(&cli->cl_loi_list_lock) OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - if (osc_max_rpc_in_flight(cli, osc)) { + /* even if we have reached our max in flight RPCs, we still + * allow all high-priority RPCs through to prevent their + * starvation and leading to server evicting us for not + * writing out pages in a timely manner LU-13131 */ + if (osc_max_rpc_in_flight(cli, osc) && + list_empty(&osc->oo_hp_exts)) { __osc_list_maint(cli, osc); break; } @@ -2316,8 +2272,8 @@ __must_hold(&cli->cl_loi_list_lock) } } -static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc, int async) +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) { int rc = 0; @@ -2335,18 +2291,7 @@ static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, } return rc; } - -static int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, struct osc_object *osc) -{ - return osc_io_unplug0(env, cli, osc, 1); -} - -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) -{ - (void)osc_io_unplug0(env, cli, osc, 0); -} +EXPORT_SYMBOL(osc_io_unplug0); int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, struct page *page, loff_t offset) @@ -2366,9 +2311,6 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap->oap_obj_off = offset; LASSERT(!(offset & ~PAGE_MASK)); - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - INIT_LIST_HEAD(&oap->oap_pending_item); INIT_LIST_HEAD(&oap->oap_rpc_item); @@ -2377,6 +2319,7 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap, page, oap->oap_obj_off); RETURN(0); } +EXPORT_SYMBOL(osc_prep_async_page); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, struct osc_page *ops) @@ -2407,7 +2350,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before the page is queued. */ brw_flags |= ops->ops_srvlock ? 
OBD_BRW_SRVLOCK : 0; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + if (oio->oi_cap_sys_resource || io->ci_noquota) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } @@ -2463,7 +2406,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* it doesn't need any grant to dirty this page */ spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants, 0); + rc = osc_enter_cache_try(cli, oap, grants); spin_unlock(&cli->cl_loi_list_lock); if (rc == 0) { /* try failed */ grants = 0; @@ -2540,7 +2483,11 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, ++ext->oe_nr_pages; list_add_tail(&oap->oap_pending_item, &ext->oe_pages); osc_object_unlock(osc); + + if (!ext->oe_layout_version) + ext->oe_layout_version = io->ci_layout_version; } + RETURN(rc); } @@ -2726,8 +2673,9 @@ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) RETURN(rc); } -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags) +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags) { struct client_obd *cli = osc_cli(obj); struct osc_extent *ext; @@ -2765,7 +2713,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, RETURN(-ENOMEM); } - ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_rw = !!(brw_flags & OBD_BRW_READ); ext->oe_sync = 1; ext->oe_no_merge = !can_merge; ext->oe_urgent = 1; @@ -2773,15 +2721,52 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, ext->oe_end = ext->oe_max_end = end; ext->oe_obj = obj; ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + if (brw_flags & OBD_BRW_NOCACHE && !ext->oe_rw) { /* direct io write */ + int grants; + int ppc; + + ppc = 1 << (cli->cl_chunkbits - PAGE_SHIFT); + grants = cli->cl_grant_extent_tax; + grants += (1 << cli->cl_chunkbits) * + ((page_count + ppc - 1) / ppc); + + spin_lock(&cli->cl_loi_list_lock); + if (osc_reserve_grant(cli, grants) == 0) { + list_for_each_entry(oap, list, oap_pending_item) { + osc_consume_write_grant(cli, + &oap->oap_brw_page); + atomic_long_inc(&obd_dirty_pages); + } + __osc_unreserve_grant(cli, grants, 0); + ext->oe_grants = grants; + } + spin_unlock(&cli->cl_loi_list_lock); + } ext->oe_nr_pages = page_count; ext->oe_mppr = mppr; list_splice_init(list, &ext->oe_pages); + ext->oe_layout_version = io->ci_layout_version; osc_object_lock(obj); /* Reuse the initial refcount for RPC, don't drop it */ osc_extent_state_set(ext, OES_LOCK_DONE); - if (cmd & OBD_BRW_WRITE) { - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + if (!ext->oe_rw) { /* write */ + if (!ext->oe_srvlock) { + /* The most likely case here is from lack of grants + * so we are either out of quota or out of space. + * Since this means we are holding locks across + * potentially multi-striped IO, we must send out + * everything out instantly to avoid prolonged + * waits resulting in lock eviction (likely since + * the extended wait in osc_cache_enter() did not + * yield any additional grant due to a timeout. 
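To make the grant reservation arithmetic in the OBD_BRW_NOCACHE (direct I/O write) branch above concrete, here is a small stand-alone example. The values are assumed purely for illustration (4 KiB pages, 64 KiB chunks, a 4 KiB extent tax) and are not taken from any particular configuration:

	#include <stdio.h>

	int main(void)
	{
		int chunkbits   = 16;	/* assumed cli->cl_chunkbits (64 KiB) */
		int page_shift  = 12;	/* assumed PAGE_SHIFT (4 KiB pages)   */
		long extent_tax = 4096;	/* assumed cl_grant_extent_tax        */
		int page_count  = 40;	/* pages queued in this sync extent   */

		int ppc    = 1 << (chunkbits - page_shift);  /* 16 pages/chunk */
		int chunks = (page_count + ppc - 1) / ppc;   /* ceil(40/16)=3  */
		long grants = extent_tax + (1L << chunkbits) * chunks;

		/* prints: ppc=16 chunks=3 grants=200704 (4096 + 3 * 65536) */
		printf("ppc=%d chunks=%d grants=%ld\n", ppc, chunks, grants);
		return 0;
	}

So the reservation covers one whole chunk of grant for every partially or fully used chunk, plus the per-extent tax, which is what the hunk asks osc_reserve_grant() for before the queued pages consume write grant.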
+ * LU-13131 */ + ext->oe_hp = 1; + list_add_tail(&ext->oe_link, &obj->oo_hp_exts); + } else { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + } osc_update_pending(obj, OBD_BRW_WRITE, page_count); } else { list_add_tail(&ext->oe_link, &obj->oo_reading_exts); @@ -2919,6 +2904,7 @@ int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, } RETURN(result); } +EXPORT_SYMBOL(osc_cache_truncate_start); /** * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. @@ -3005,6 +2991,7 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, OSC_IO_DEBUG(obj, "sync file range.\n"); RETURN(result); } +EXPORT_SYMBOL(osc_cache_wait_range); /** * Called to write out a range of osc object. @@ -3044,7 +3031,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, EASSERT(!ext->oe_hp, ext); ext->oe_hp = 1; list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent) { + } else if (!ext->oe_urgent && !ext->oe_hp) { ext->oe_urgent = 1; list = &obj->oo_urgent_exts; } @@ -3052,10 +3039,25 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, list_move_tail(&ext->oe_link, list); unplug = true; } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; /* the only discarder is lock cancelling, so - * [start, end] must contain this extent */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); osc_extent_state_set(ext, OES_LOCKING); ext->oe_owner = current; list_move_tail(&ext->oe_link, @@ -3121,6 +3123,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); RETURN(result); } +EXPORT_SYMBOL(osc_cache_writeback_range); /** * Returns a list of pages by a given [start, end] of \a obj. @@ -3139,6 +3142,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, osc_page_gang_cbt cb, void *cbdata) { struct osc_page *ops; + struct pagevec *pagevec; void **pvec; pgoff_t idx; unsigned int nr; @@ -3150,6 +3154,8 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, idx = start; pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); spin_lock(&osc->oo_tree_lock); while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, idx, OTI_PVEC_SIZE)) > 0) { @@ -3196,8 +3202,10 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, page = ops->ops_cl.cpl_page; lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pagevec); } + pagevec_release(pagevec); + if (nr < OTI_PVEC_SIZE || end_of_region) break; @@ -3213,6 +3221,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, spin_unlock(&osc->oo_tree_lock); RETURN(res); } +EXPORT_SYMBOL(osc_page_gang_lookup); /** * Check if page @page is covered by an extra lock or discard it. 
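The "overflow case" guard in the osc_cache_writeback_range() hunk above can be checked with a stand-alone example. It assumes the usual power-of-two round_up()/round_down() helper definitions and uses an all-ones end index (a CL_PAGE_EOF style sentinel) purely for illustration:

	#include <stdio.h>

	/* same shape as the kernel's power-of-two rounding helpers */
	#define round_down(x, y)  ((x) & ~((y) - 1))
	#define round_up(x, y)    ((((x) - 1) | ((y) - 1)) + 1)

	int main(void)
	{
		unsigned long eof      = ~0UL;	   /* stands in for CL_PAGE_EOF */
		unsigned long align_by = 1UL << 4; /* 16 pages per chunk        */
		unsigned long start    = 123;

		unsigned long a_start = round_down(start, align_by); /* 112      */
		unsigned long a_end   = round_up(eof, align_by);     /* wraps: 0 */

		if (eof && !a_end)	/* the guard from the hunk above */
			a_end = eof;	/* clamp back to end-of-object   */

		printf("a_start=%lu a_end=%lu\n", a_start, a_end);
		return 0;
	}

Without the guard, a_end would wrap to 0 and the LASSERTF range check right after it would trip for any extent, so the clamp to CL_PAGE_EOF is what keeps whole-object discards working.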
@@ -3255,8 +3264,8 @@ static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, return CLP_GANG_OKAY; } -static int discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) +int osc_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) { struct osc_thread_info *info = osc_env_info(env); struct cl_page *page = ops->ops_cl.cpl_page; @@ -3278,6 +3287,7 @@ static int discard_cb(const struct lu_env *env, struct cl_io *io, return CLP_GANG_OKAY; } +EXPORT_SYMBOL(osc_discard_cb); /** * Discard pages protected by the given lock. This function traverses radix @@ -3291,7 +3301,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, pgoff_t start, pgoff_t end, bool discard) { struct osc_thread_info *info = osc_env_info(env); - struct cl_io *io = &info->oti_io; + struct cl_io *io = osc_env_thread_io(env); osc_page_gang_cbt cb; int res; int result; @@ -3304,7 +3314,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, if (result != 0) GOTO(out, result); - cb = discard ? discard_cb : check_and_discard_cb; + cb = discard ? osc_discard_cb : check_and_discard_cb; info->oti_fn_index = info->oti_next_index = start; do { res = osc_page_gang_lookup(env, io, osc, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c index c06a5deb339b7..cbddab5c0f319 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,19 +38,24 @@ /* class_name2obd() */ #include +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" -/** \addtogroup osc - * @{ +/** \addtogroup osc + * @{ */ struct kmem_cache *osc_lock_kmem; +EXPORT_SYMBOL(osc_lock_kmem); struct kmem_cache *osc_object_kmem; +EXPORT_SYMBOL(osc_object_kmem); + struct kmem_cache *osc_thread_kmem; struct kmem_cache *osc_session_kmem; struct kmem_cache *osc_extent_kmem; struct kmem_cache *osc_quota_kmem; +struct kmem_cache *osc_obdo_kmem; struct lu_kmem_descr osc_caches[] = { { @@ -84,21 +89,15 @@ struct lu_kmem_descr osc_caches[] = { .ckd_size = sizeof(struct osc_quota_info) }, { - .ckd_cache = NULL - } + .ckd_cache = &osc_obdo_kmem, + .ckd_name = "osc_obdo_kmem", + .ckd_size = sizeof(struct obdo) + }, + { + .ckd_cache = NULL + } }; -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct lu_device *osc2lu_dev(struct osc_device *osc) -{ - return &osc->od_cl.cd_lu_dev; -} - /***************************************************************************** * * Osc device and device type functions. @@ -130,6 +129,7 @@ struct lu_context_key osc_key = { .lct_init = osc_key_init, .lct_fini = osc_key_fini }; +EXPORT_SYMBOL(osc_key); static void *osc_session_init(const struct lu_context *ctx, struct lu_context_key *key) @@ -154,6 +154,7 @@ struct lu_context_key osc_session_key = { .lct_init = osc_session_init, .lct_fini = osc_session_fini }; +EXPORT_SYMBOL(osc_session_key); /* type constructor/destructor: osc_type_{init,fini,start,stop}(). 
*/ LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); @@ -171,27 +172,30 @@ static const struct lu_device_operations osc_lu_ops = { .ldo_recovery_complete = NULL }; -static int osc_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) { RETURN(0); } +EXPORT_SYMBOL(osc_device_init); -static struct lu_device *osc_device_fini(const struct lu_env *env, - struct lu_device *d) +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) { return NULL; } +EXPORT_SYMBOL(osc_device_fini); -static struct lu_device *osc_device_free(const struct lu_env *env, - struct lu_device *d) +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) { - struct osc_device *od = lu2osc_dev(d); + struct osc_device *od = lu2osc_dev(d); - cl_device_fini(lu2cl_dev(d)); - OBD_FREE_PTR(od); - return NULL; + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; } +EXPORT_SYMBOL(osc_device_free); static struct lu_device *osc_device_alloc(const struct lu_env *env, struct lu_device_type *t, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h index 24766263514a6..519a4d1f4b57e 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,93 +35,45 @@ #define OAP_MAGIC 8675309 +#include +#include + extern atomic_t osc_pool_req_count; extern unsigned int osc_reqpool_maxreqcount; extern struct ptlrpc_request_pool *osc_rq_pool; -struct lu_env; - -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - page is added to an rpc */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - to give the caller a chance to update - or cancel the size of the io */ - ASYNC_HP = 0x10, -}; - -struct osc_async_page { - int oap_magic; - unsigned short oap_cmd; - unsigned short oap_interrupted:1; - - struct list_head oap_pending_item; - struct list_head oap_rpc_item; - - loff_t oap_obj_off; - unsigned oap_page_off; - enum async_flags oap_async_flags; - - struct brw_page oap_brw_page; - - struct ptlrpc_request *oap_request; - struct client_obd *oap_cli; - struct osc_object *oap_obj; - - spinlock_t oap_lock; -}; - -#define oap_page oap_brw_page.pg -#define oap_count oap_brw_page.count -#define oap_brw_flags oap_brw_page.flag - -static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) -{ - return (struct osc_async_page *)container_of(pga, struct osc_async_page, - oap_brw_page); -} - -struct osc_cache_waiter { - struct list_head ocw_entry; - wait_queue_head_t ocw_waitq; - struct osc_async_page *ocw_oap; - int ocw_grant; - int ocw_rc; -}; - -void osc_wake_cache_waiters(struct client_obd *cli); int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); void osc_update_next_shrink(struct client_obd *cli); - -/* - * cl integration. 
- */ -#include +int lru_queue_work(const struct lu_env *env, void *data); +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard); extern struct ptlrpc_request_set *PTLRPCD_SET; -typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, - int rc); +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, void *cookie, struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl); + struct ptlrpc_request_set *rqset, int async, + bool speculative); -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, struct lustre_handle *lockh, int unref); int osc_setattr_async(struct obd_export *exp, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie, struct ptlrpc_request_set *rqset); -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); int osc_sync_base(struct osc_object *obj, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie, struct ptlrpc_request_set *rqset); @@ -132,8 +84,6 @@ int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, struct list_head *ext_list, int cmd); -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force); unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); @@ -144,15 +94,36 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); int osc_cleanup(struct obd_device *obd); int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_osc_obd_vars[]; -int lproc_osc_attach_seqstat(struct obd_device *dev); -#else -static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} -#endif +int osc_tunables_init(struct obd_device *obd); extern struct lu_device_type osc_device_type; +static inline struct cl_io *osc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &osc_env_info(env)->oti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +int osc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int osc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); 
+struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + static inline int osc_recoverable_error(int rc) { return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || @@ -174,41 +145,13 @@ static inline char *cli_name(struct client_obd *cli) ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif -struct osc_device { - struct cl_device od_cl; - struct obd_export *od_exp; - - /* Write stats is actually protected by client_obd's lock. */ - struct osc_stats { - uint64_t os_lockless_writes; /* by bytes */ - uint64_t os_lockless_reads; /* by bytes */ - uint64_t os_lockless_truncates; /* by times */ - } od_stats; - - /* configuration item(s) */ - int od_contention_time; - int od_lockless_truncate; -}; - -static inline struct osc_device *obd2osc_dev(const struct obd_device *d) -{ - return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); -} - -extern struct kmem_cache *osc_quota_kmem; -struct osc_quota_info { - /** linkage for quota hash table */ - struct hlist_node oqi_hash; - u32 oqi_id; -}; - struct osc_async_args { struct obd_info *aa_oi; }; int osc_quota_setup(struct obd_device *obd); int osc_quota_cleanup(struct obd_device *obd); -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], u64 valid, u32 flags); int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); int osc_quotactl(struct obd_device *unused, struct obd_export *exp, @@ -216,24 +159,14 @@ int osc_quotactl(struct obd_device *unused, struct obd_export *exp, void osc_inc_unstable_pages(struct ptlrpc_request *req); void osc_dec_unstable_pages(struct ptlrpc_request *req); bool osc_over_unstable_soft_limit(struct client_obd *cli); -/** - * Bit flags for osc_dlm_lock_at_pageoff(). - */ -enum osc_dap_flags { - /** - * Just check if the desired lock exists, it won't hold reference - * count on lock. - */ - OSC_DAP_FL_TEST_LOCK = 1 << 0, - /** - * Return the lock even if it is being canceled. 
- */ - OSC_DAP_FL_CANCELING = 1 << 1 -}; -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags flags); -void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa); +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to); + +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags); + int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); /** osc shrink list to link all osc client obd */ @@ -245,4 +178,14 @@ extern unsigned long osc_cache_shrink_count(struct shrinker *sk, extern unsigned long osc_cache_shrink_scan(struct shrinker *sk, struct shrink_control *sc); +static inline void osc_set_io_portal(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + + /* Distinguish OSC from MDC here to use OST or MDS portal */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IBITS)) + req->rq_request_portal = MDS_IO_PORTAL; + else + req->rq_request_portal = OST_IO_PORTAL; +} #endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c index 38fe2532829fd..4a51b9912d72f 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_io.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,27 +38,14 @@ #define DEBUG_SUBSYSTEM S_OSC #include +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" -/** \addtogroup osc - * @{ +/** \addtogroup osc + * @{ */ -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct osc_io *cl2osc_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); - LINVRNT(oio == osc_env_io(env)); - return oio; -} - /***************************************************************************** * * io operations. @@ -69,8 +56,7 @@ static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) { } -static void osc_read_ahead_release(const struct lu_env *env, - void *cbdata) +void osc_read_ahead_release(const struct lu_env *env, void *cbdata) { struct ldlm_lock *dlmlock = cbdata; struct lustre_handle lockh; @@ -79,6 +65,7 @@ static void osc_read_ahead_release(const struct lu_env *env, ldlm_lock_decref(&lockh, LCK_PR); LDLM_LOCK_PUT(dlmlock); } +EXPORT_SYMBOL(osc_read_ahead_release); static int osc_io_read_ahead(const struct lu_env *env, const struct cl_io_slice *ios, @@ -117,9 +104,8 @@ static int osc_io_read_ahead(const struct lu_env *env, * or, if page is already submitted, changes osc flags through * osc_set_async_flags(). 
*/ -static int osc_io_submit(const struct lu_env *env, - const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) { struct cl_page *page; struct cl_page *tmp; @@ -133,7 +119,6 @@ static int osc_io_submit(const struct lu_env *env, struct cl_page_list *qout = &queue->c2_qout; unsigned int queued = 0; int result = 0; - int cmd; int brw_flags; unsigned int max_pages; @@ -145,8 +130,14 @@ static int osc_io_submit(const struct lu_env *env, cli = osc_cli(osc); max_pages = cli->cl_max_pages_per_rpc; - cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + if (crt == CRT_READ && ios->cis_io->ci_ndelay) + brw_flags |= OBD_BRW_NDELAY; + + page = cl_page_list_first(qin); + if (page->cp_type == CPT_TRANSIENT) + brw_flags |= OBD_BRW_NOCACHE; /* * NOTE: here @page is a top-level page. This is done to avoid @@ -200,7 +191,7 @@ static int osc_io_submit(const struct lu_env *env, if (++queued == max_pages) { queued = 0; - result = osc_queue_sync_pages(env, osc, &list, cmd, + result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); if (result < 0) break; @@ -208,7 +199,7 @@ static int osc_io_submit(const struct lu_env *env, } if (queued > 0) - result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); + result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); /* Update c/mtime for sync write. LU-7310 */ if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) { @@ -224,36 +215,31 @@ static int osc_io_submit(const struct lu_env *env, CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); return qout->pl_nr > 0 ? 0 : result; } +EXPORT_SYMBOL(osc_io_submit); /** - * This is called when a page is accessed within file in a way that creates - * new page, if one were missing (i.e., if there were a hole at that place in - * the file, or accessed page is beyond the current file size). + * This is called to update the attributes when modifying a specific page, + * both when making new pages and when doing updates to existing cached pages. * * Expand stripe KMS if necessary. */ -static void osc_page_touch_at(const struct lu_env *env, - struct cl_object *obj, pgoff_t idx, size_t to) +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to) { - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int valid; - __u64 kms; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; - /* offset within stripe */ - kms = cl_offset(obj, idx) + to; + ENTRY; - cl_object_attr_lock(obj); - /* - * XXX old code used - * - * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); - * - * here - */ + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", - kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, - loi->loi_lvb.lvb_size); + kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); valid = CAT_MTIME | CAT_CTIME; @@ -267,12 +253,14 @@ static void osc_page_touch_at(const struct lu_env *env, } cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); + + EXIT; } -static int osc_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *qin, int from, int to, - cl_commit_cbt cb) +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb) { struct cl_io *io = ios->cis_io; struct osc_io *oio = cl2osc_io(env, ios); @@ -306,6 +294,9 @@ static int osc_io_commit_async(const struct lu_env *env, opg = osc_cl_page_osc(page, osc); oap = &opg->ops_oap; + LASSERTF(osc == oap->oap_obj, + "obj mismatch: %p / %p\n", osc, oap->oap_obj); + if (!list_empty(&oap->oap_rpc_item)) { CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", oap, opg); @@ -341,29 +332,47 @@ static int osc_io_commit_async(const struct lu_env *env, CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); RETURN(result); } +EXPORT_SYMBOL(osc_io_commit_async); -static int osc_io_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) +static bool osc_import_not_healthy(struct obd_import *imp) +{ + return imp->imp_invalid || imp->imp_deactive || + !(imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE); +} + +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { struct osc_object *osc = cl2osc(ios->cis_obj); struct obd_import *imp = osc_cli(osc)->cl_import; + struct osc_io *oio = osc_env_io(env); int rc = -EIO; + ENTRY; spin_lock(&imp->imp_lock); - if (likely(!imp->imp_invalid)) { - struct osc_io *oio = osc_env_io(env); - + /** + * check whether this OSC device is available for non-delay read, + * fast switching mirror if we haven't tried all mirrors. 
+ */ + if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay && + !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) { + rc = -EWOULDBLOCK; + } else if (likely(!imp->imp_invalid)) { atomic_inc(&osc->oo_nr_ios); oio->oi_is_active = 1; rc = 0; } spin_unlock(&imp->imp_lock); - return rc; + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) + oio->oi_cap_sys_resource = 1; + + RETURN(rc); } +EXPORT_SYMBOL(osc_io_iter_init); -static int osc_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) +int osc_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; struct osc_io *oio = osc_env_io(env); @@ -374,17 +383,18 @@ static int osc_io_write_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(osc_io_iter_init(env, ios)); - npages = io->u.ci_rw.rw_range.cir_count >> PAGE_SHIFT; - if (io->u.ci_rw.rw_range.cir_pos & ~PAGE_MASK) + npages = io->u.ci_rw.crw_count >> PAGE_SHIFT; + if (io->u.ci_rw.crw_pos & ~PAGE_MASK) ++npages; oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); RETURN(osc_io_iter_init(env, ios)); } +EXPORT_SYMBOL(osc_io_write_iter_init); -static void osc_io_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) { struct osc_io *oio = osc_env_io(env); @@ -397,9 +407,10 @@ static void osc_io_iter_fini(const struct lu_env *env, wake_up_all(&osc->oo_io_waitq); } } +EXPORT_SYMBOL(osc_io_iter_fini); -static void osc_io_write_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) +void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) { struct osc_io *oio = osc_env_io(env); struct osc_object *osc = cl2osc(ios->cis_obj); @@ -412,9 +423,9 @@ static void osc_io_write_iter_fini(const struct lu_env *env, osc_io_iter_fini(env, ios); } +EXPORT_SYMBOL(osc_io_write_iter_fini); -static int osc_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios) { struct cl_io *io; struct cl_fault_io *fio; @@ -434,6 +445,8 @@ static int osc_io_fault_start(const struct lu_env *env, fio->ft_index, fio->ft_nob); RETURN(0); } +EXPORT_SYMBOL(osc_io_fault_start); + static int osc_async_upcall(void *a, int rc) { @@ -497,10 +510,11 @@ static int osc_io_setattr_start(const struct lu_env *env, struct obdo *oa = &oio->oi_oa; struct osc_async_cbargs *cbargs = &oio->oi_cbarg; __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - unsigned int ia_valid = io->u.ci_setattr.sa_valid; - int result = 0; - ENTRY; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int result = 0; + ENTRY; /* truncate cache dirty pages first */ if (cl_io_is_trunc(io)) result = osc_cache_truncate_start(env, cl2osc(obj), size, @@ -513,19 +527,20 @@ static int osc_io_setattr_start(const struct lu_env *env, struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; unsigned int cl_valid = 0; - if (ia_valid & ATTR_SIZE) { - attr->cat_size = attr->cat_kms = size; + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; cl_valid = (CAT_SIZE | CAT_KMS); } - if (ia_valid & ATTR_MTIME_SET) { + if (ia_avalid & ATTR_MTIME_SET) { attr->cat_mtime = lvb->lvb_mtime; cl_valid |= CAT_MTIME; } - if (ia_valid & ATTR_ATIME_SET) { + if (ia_avalid & ATTR_ATIME_SET) { attr->cat_atime = lvb->lvb_atime; cl_valid |= CAT_ATIME; } - if (ia_valid & 
ATTR_CTIME_SET) { + if (ia_xvalid & OP_XVALID_CTIME_SET) { attr->cat_ctime = lvb->lvb_ctime; cl_valid |= CAT_CTIME; } @@ -542,42 +557,47 @@ static int osc_io_setattr_start(const struct lu_env *env, oa->o_layout = io->u.ci_setattr.sa_layout; oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLOSTLAYOUT; - if (ia_valid & ATTR_CTIME) { + if (ia_avalid & ATTR_CTIME) { oa->o_valid |= OBD_MD_FLCTIME; oa->o_ctime = attr->cat_ctime; } - if (ia_valid & ATTR_ATIME) { + if (ia_avalid & ATTR_ATIME) { oa->o_valid |= OBD_MD_FLATIME; oa->o_atime = attr->cat_atime; } - if (ia_valid & ATTR_MTIME) { + if (ia_avalid & ATTR_MTIME) { oa->o_valid |= OBD_MD_FLMTIME; oa->o_mtime = attr->cat_mtime; } - if (ia_valid & ATTR_SIZE) { - oa->o_size = size; - oa->o_blocks = OBD_OBJECT_EOF; - oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - - if (oio->oi_lockless) { - oa->o_flags = OBD_FL_SRVLOCK; - oa->o_valid |= OBD_MD_FLFLAGS; - } - } else { - LASSERT(oio->oi_lockless == 0); - } + if (ia_avalid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + if (io->ci_layout_version > 0) { + /* verify layout version */ + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + oa->o_layout_version = io->ci_layout_version; + } + } else { + LASSERT(oio->oi_lockless == 0); + } - if (ia_valid & ATTR_ATTR_FLAG) { + if (ia_xvalid & OP_XVALID_FLAGS) { oa->o_flags = io->u.ci_setattr.sa_attr_flags; oa->o_valid |= OBD_MD_FLFLAGS; } init_completion(&cbargs->opc_sync); - if (ia_valid & ATTR_SIZE) - result = osc_punch_base(osc_export(cl2osc(obj)), - oa, osc_async_upcall, - cbargs, PTLRPCD_SET); + if (ia_avalid & ATTR_SIZE) + result = osc_punch_send(osc_export(cl2osc(obj)), + oa, osc_async_upcall, cbargs); else result = osc_setattr_async(osc_export(cl2osc(obj)), oa, osc_async_upcall, @@ -589,37 +609,50 @@ static int osc_io_setattr_start(const struct lu_env *env, RETURN(result); } -static void osc_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *slice) +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_io *io = slice->cis_io; struct osc_io *oio = cl2osc_io(env, slice); struct cl_object *obj = slice->cis_obj; struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - int result = 0; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + int result = 0; if (cbargs->opc_rpc_sent) { wait_for_completion(&cbargs->opc_sync); result = io->ci_result = cbargs->opc_rc; } - if (result == 0) { - if (oio->oi_lockless) { - /* lockless truncate */ - struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - - LASSERT(cl_io_is_trunc(io)); - /* XXX: Need a lock. */ - osd->od_stats.os_lockless_truncates++; - } - } + + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. 
*/ + osd->od_stats.os_lockless_truncates++; + } + } if (cl_io_is_trunc(io)) { __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); osc_trunc_check(env, io, oio, size); osc_cache_truncate_end(env, oio->oi_trunc); oio->oi_trunc = NULL; } } +EXPORT_SYMBOL(osc_io_setattr_end); struct osc_data_version_args { struct osc_io *dva_oio; @@ -716,18 +749,23 @@ static void osc_io_data_version_end(const struct lu_env *env, if (cbargs->opc_rc != 0) { slice->cis_io->ci_result = cbargs->opc_rc; - } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) { - slice->cis_io->ci_result = -EOPNOTSUPP; } else { - dv->dv_data_version = oio->oi_oa.o_data_version; slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; } EXIT; } -static int osc_io_read_start(const struct lu_env *env, - const struct cl_io_slice *slice) +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_object *obj = slice->cis_obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -743,9 +781,10 @@ static int osc_io_read_start(const struct lu_env *env, RETURN(rc); } +EXPORT_SYMBOL(osc_io_read_start); -static int osc_io_write_start(const struct lu_env *env, - const struct cl_io_slice *slice) +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_object *obj = slice->cis_obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -760,9 +799,10 @@ static int osc_io_write_start(const struct lu_env *env, RETURN(rc); } +EXPORT_SYMBOL(osc_io_write_start); -static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, - struct cl_fsync_io *fio) +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) { struct osc_io *oio = osc_env_io(env); struct obdo *oa = &oio->oi_oa; @@ -787,9 +827,10 @@ static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); RETURN(rc); } +EXPORT_SYMBOL(osc_fsync_ost); -static int osc_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *slice) +int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_io *io = slice->cis_io; struct cl_fsync_io *fio = &io->u.ci_fsync; @@ -828,8 +869,8 @@ static int osc_io_fsync_start(const struct lu_env *env, RETURN(result); } -static void osc_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *slice) +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; struct cl_object *obj = slice->cis_obj; @@ -849,6 +890,7 @@ static void osc_io_fsync_end(const struct lu_env *env, } slice->cis_io->ci_result = result; } +EXPORT_SYMBOL(osc_io_fsync_end); static int osc_io_ladvise_start(const struct lu_env *env, const struct cl_io_slice *slice) @@ -920,8 +962,7 @@ static void osc_io_ladvise_end(const struct lu_env *env, slice->cis_io->ci_result = result; } -static void osc_io_end(const struct lu_env *env, - const struct cl_io_slice *slice) 
+void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) { struct osc_io *oio = cl2osc_io(env, slice); @@ -930,6 +971,7 @@ static void osc_io_end(const struct lu_env *env, oio->oi_active = NULL; } } +EXPORT_SYMBOL(osc_io_end); static const struct cl_io_operations osc_io_ops = { .op = { diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c index 6d53b5b80c580..dd956fd8532b2 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,32 +37,16 @@ #define DEBUG_SUBSYSTEM S_OSC -#include /* fid_build_reg_res_name() */ #include +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" /** \addtogroup osc * @{ */ -/***************************************************************************** - * - * Type conversions. - * - */ - -static const struct cl_lock_operations osc_lock_ops; -static const struct cl_lock_operations osc_lock_lockless_ops; -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force); - -int osc_lock_is_lockless(const struct osc_lock *olck) -{ - return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); -} - /** * Returns a weak pointer to the ldlm lock identified by a handle. Returned * pointer cannot be dereferenced, as lock is not protected from concurrent @@ -122,7 +106,7 @@ static int osc_lock_invariant(struct osc_lock *ols) if (! ergo(ols->ols_state == OLS_GRANTED, olock != NULL && - olock->l_req_mode == olock->l_granted_mode && + ldlm_is_granted(olock) && ols->ols_hold)) return 0; return 1; @@ -134,8 +118,7 @@ static int osc_lock_invariant(struct osc_lock *ols) * */ -static void osc_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) { struct osc_lock *ols = cl2osc_lock(slice); @@ -144,6 +127,7 @@ static void osc_lock_fini(const struct lu_env *env, OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); } +EXPORT_SYMBOL(osc_lock_fini); static void osc_lock_build_policy(const struct lu_env *env, const struct cl_lock *lock, @@ -155,44 +139,22 @@ static void osc_lock_build_policy(const struct lu_env *env, policy->l_extent.gid = d->cld_gid; } -static __u64 osc_enq2ldlm_flags(__u32 enqflags) -{ - __u64 result = 0; - - LASSERT((enqflags & ~CEF_MASK) == 0); - - if (enqflags & CEF_NONBLOCK) - result |= LDLM_FL_BLOCK_NOWAIT; - if (enqflags & CEF_ASYNC) - result |= LDLM_FL_HAS_INTENT; - if (enqflags & CEF_DISCARD_DATA) - result |= LDLM_FL_AST_DISCARD_DATA; - if (enqflags & CEF_PEEK) - result |= LDLM_FL_TEST_LOCK; - if (enqflags & CEF_LOCK_MATCH) - result |= LDLM_FL_MATCH_LOCK; - return result; -} - /** * Updates object attributes from a lock value block (lvb) received together * with the DLM lock reply from the server. Copy of osc_update_enqueue() * logic. * - * This can be optimized to not update attributes when lock is a result of a - * local match. - * * Called under lock and resource spin-locks. 
*/ -static void osc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb) +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) { - struct cl_object *obj = osc2cl(osc); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned valid; + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid, setkms = 0; ENTRY; @@ -217,19 +179,23 @@ static void osc_lock_lvb_update(const struct lu_env *env, if (size > dlmlock->l_policy_data.l_extent.end) size = dlmlock->l_policy_data.l_extent.end + 1; if (size >= oinfo->loi_kms) { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu" - ", kms=%llu", lvb->lvb_size, size); valid |= CAT_KMS; attr->cat_kms = size; - } else { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=" - "%llu; leaving kms=%llu, end=%llu", - lvb->lvb_size, oinfo->loi_kms, - dlmlock->l_policy_data.l_extent.end); + setkms = 1; } ldlm_lock_allow_match_locked(dlmlock); } + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); @@ -237,8 +203,9 @@ static void osc_lock_lvb_update(const struct lu_env *env, } static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, - struct lustre_handle *lockh, bool lvb_update) + struct lustre_handle *lockh) { + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); struct ldlm_lock *dlmlock; dlmlock = ldlm_handle2lock_long(lockh, 0); @@ -265,7 +232,7 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, /* Lock must have been granted. */ lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + if (ldlm_is_granted(dlmlock)) { struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; @@ -277,10 +244,11 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, descr->cld_gid = ext->gid; /* no lvb update for matched lock */ - if (lvb_update) { + if (!ldlm_is_lvb_cached(dlmlock)) { LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj), - dlmlock, NULL); + LASSERT(osc == dlmlock->l_ast_data); + osc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); } LINVRNT(osc_lock_invariant(oscl)); } @@ -320,7 +288,7 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, } if (rc == 0) - osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK); + osc_lock_granted(env, oscl, lockh); /* Error handling, some errors are tolerable. */ if (oscl->ols_locklessable && rc == -EUSERS) { @@ -328,7 +296,7 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, * lockless lock. */ osc_object_set_contended(cl2osc(slice->cls_obj)); - LASSERT(slice->cls_ops == &osc_lock_ops); + LASSERT(slice->cls_ops != oscl->ols_lockless_ops); /* Change this lock to ldlmlock-less lock. 
*/ osc_lock_to_lockless(env, oscl, 1); @@ -340,6 +308,8 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, NULL, &oscl->ols_lvb); /* Hide the error. */ rc = 0; + } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) { + rc = -EWOULDBLOCK; } if (oscl->ols_owner != NULL) @@ -349,8 +319,9 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, RETURN(rc); } -static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, - int errcode) +static int osc_lock_upcall_speculative(void *cookie, + struct lustre_handle *lockh, + int errcode) { struct osc_object *osc = cookie; struct ldlm_lock *dlmlock; @@ -371,9 +342,10 @@ static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, LASSERT(dlmlock != NULL); lock_res_and_lock(dlmlock); - LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + LASSERT(ldlm_is_granted(dlmlock)); - /* there is no osc_lock associated with AGL lock */ + /* there is no osc_lock associated with speculative locks + * thus no need to set LDLM_FL_LVB_CACHED */ osc_lock_lvb_update(env, osc, dlmlock, NULL); unlock_res_and_lock(dlmlock); @@ -409,7 +381,12 @@ static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, rc = 0; } - rc2 = osc_lock_discard_pages(env, obj, start, end, discard); + /* + * Do not try to match other locks with CLM_WRITE since we already + * know there're none + */ + rc2 = osc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); if (rc == 0 && rc2 < 0) rc = rc2; @@ -434,7 +411,7 @@ static int osc_dlm_blocking_ast0(const struct lu_env *env, LASSERT(flag == LDLM_CB_CANCELING); lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { + if (!ldlm_is_granted(dlmlock)) { dlmlock->l_ast_data = NULL; unlock_res_and_lock(dlmlock); RETURN(0); @@ -574,13 +551,17 @@ static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, RETURN(result); } -static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) { struct ptlrpc_request *req = data; struct lu_env *env; struct ost_lvb *lvb; struct req_capsule *cap; struct cl_object *obj = NULL; + struct ldlm_resource *res = dlmlock->l_resource; + struct ldlm_match_data matchdata = { 0 }; + union ldlm_policy_data policy; + enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR; int result; __u16 refcheck; @@ -592,13 +573,40 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) if (IS_ERR(env)) GOTO(out, result = PTR_ERR(env)); + policy.l_extent.start = 0; + policy.l_extent.end = LUSTRE_EOF; - lock_res_and_lock(dlmlock); - if (dlmlock->l_ast_data != NULL) { - obj = osc2cl(dlmlock->l_ast_data); - cl_object_get(obj); + matchdata.lmd_mode = &mode; + matchdata.lmd_policy = &policy; + matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING; + matchdata.lmd_unref = 1; + matchdata.lmd_has_ast_data = true; + + LDLM_LOCK_GET(dlmlock); + + /* If any dlmlock has l_ast_data set, we must find it or we risk + * missing a size update done under a different lock. 
+ */ + while (dlmlock) { + lock_res_and_lock(dlmlock); + if (dlmlock->l_ast_data) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + LDLM_LOCK_RELEASE(dlmlock); + + dlmlock = NULL; + + if (obj == NULL && res->lr_type == LDLM_EXTENT) { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA)) + break; + + lock_res(res); + dlmlock = search_itree(res, &matchdata); + unlock_res(res); + } } - unlock_res_and_lock(dlmlock); if (obj != NULL) { /* Do not grab the mutex of cl_lock for glimpse. @@ -636,15 +644,15 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) req->rq_status = result; RETURN(result); } +EXPORT_SYMBOL(osc_ldlm_glimpse_ast); static int weigh_cb(const struct lu_env *env, struct cl_io *io, struct osc_page *ops, void *cbdata) { struct cl_page *page = ops->ops_cl.cpl_page; - if (cl_page_is_vmlocked(env, page) - || PageDirty(page->cp_vmpage) || PageWriteback(page->cp_vmpage) - ) + if (cl_page_is_vmlocked(env, page) || PageDirty(page->cp_vmpage) || + PageWriteback(page->cp_vmpage)) return CLP_GANG_ABORT; *(pgoff_t *)cbdata = osc_index(ops) + 1; @@ -653,12 +661,13 @@ static int weigh_cb(const struct lu_env *env, struct cl_io *io, static unsigned long osc_lock_weight(const struct lu_env *env, struct osc_object *oscobj, - struct ldlm_extent *extent) + loff_t start, loff_t end) { - struct cl_io *io = &osc_env_info(env)->oti_io; + struct cl_io *io = osc_env_thread_io(env); struct cl_object *obj = cl_object_top(&oscobj->oo_cl); - pgoff_t page_index; - int result; + pgoff_t page_index; + int result; + ENTRY; io->ci_obj = obj; @@ -667,11 +676,10 @@ static unsigned long osc_lock_weight(const struct lu_env *env, if (result != 0) RETURN(result); - page_index = cl_index(obj, extent->start); + page_index = cl_index(obj, start); do { result = osc_page_gang_lookup(env, io, oscobj, - page_index, - cl_index(obj, extent->end), + page_index, cl_index(obj, end), weigh_cb, (void *)&page_index); if (result == CLP_GANG_ABORT) break; @@ -688,12 +696,13 @@ static unsigned long osc_lock_weight(const struct lu_env *env, */ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) { - struct lu_env *env; - struct osc_object *obj; - struct osc_lock *oscl; - unsigned long weight; - bool found = false; - __u16 refcheck; + struct lu_env *env; + struct osc_object *obj; + struct osc_lock *oscl; + unsigned long weight; + bool found = false; + __u16 refcheck; + ENTRY; might_sleep(); @@ -709,7 +718,9 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) /* Mostly because lack of memory, do not eliminate this lock */ RETURN(1); - LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT || + dlmlock->l_resource->lr_type == LDLM_IBITS); + lock_res_and_lock(dlmlock); obj = dlmlock->l_ast_data; if (obj) @@ -721,9 +732,10 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) spin_lock(&obj->oo_ol_spin); list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { - if (oscl->ols_dlmlock != NULL && oscl->ols_dlmlock != dlmlock) - continue; - found = true; + if (oscl->ols_dlmlock == dlmlock) { + found = true; + break; + } } spin_unlock(&obj->oo_ol_spin); if (found) { @@ -733,7 +745,18 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) GOTO(out, weight = 1); } - weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent); + if (dlmlock->l_resource->lr_type == LDLM_EXTENT) + weight = osc_lock_weight(env, obj, + dlmlock->l_policy_data.l_extent.start, + 
dlmlock->l_policy_data.l_extent.end); + else if (ldlm_has_dom(dlmlock)) + weight = osc_lock_weight(env, obj, 0, OBD_OBJECT_EOF); + /* The DOM bit can be cancelled at any time; in that case, we know + * there are no pages, so just return weight of 0 + */ + else + weight = 0; + EXIT; out: @@ -743,6 +766,7 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) cl_env_put(env, &refcheck); return weight; } +EXPORT_SYMBOL(osc_ldlm_weigh_ast); static void osc_lock_build_einfo(const struct lu_env *env, const struct cl_lock *lock, @@ -769,46 +793,46 @@ static void osc_lock_build_einfo(const struct lu_env *env, * Additional policy can be implemented here, e.g., never do lockless-io * for large extents. */ -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force) +void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) { - struct cl_lock_slice *slice = &ols->ols_cl; - - LASSERT(ols->ols_state == OLS_NEW || - ols->ols_state == OLS_UPCALL_RECEIVED); - - if (force) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } else { - struct osc_io *oio = osc_env_io(env); - struct cl_io *io = oio->oi_cl.cis_io; - struct cl_object *obj = slice->cls_obj; - struct osc_object *oob = cl2osc(obj); - const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - struct obd_connect_data *ocd; - - LASSERT(io->ci_lockreq == CILR_MANDATORY || - io->ci_lockreq == CILR_MAYBE || - io->ci_lockreq == CILR_NEVER); - - ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; - ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && - (io->ci_lockreq == CILR_MAYBE) && - (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); - if (io->ci_lockreq == CILR_NEVER || - /* lockless IO */ - (ols->ols_locklessable && osc_object_is_contended(oob)) || - /* lockless truncate */ - (cl_io_is_trunc(io) && - (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && - osd->od_lockless_truncate)) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } - } - LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); + struct cl_lock_slice *slice = &ols->ols_cl; + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } else { + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & + OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && osd->od_lockless_truncate && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK))) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); } +EXPORT_SYMBOL(osc_lock_to_lockless); static bool osc_lock_compatible(const struct osc_lock *qing, const struct osc_lock *qed) @@ -816,7 +840,7 @@ static bool osc_lock_compatible(const struct osc_lock *qing, struct cl_lock_descr *qed_descr 
= &qed->ols_cl.cls_lock->cll_descr; struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; - if (qed->ols_glimpse) + if (qed->ols_glimpse || qed->ols_speculative) return true; if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) @@ -833,9 +857,8 @@ static bool osc_lock_compatible(const struct osc_lock *qing, return false; } -static void osc_lock_wake_waiters(const struct lu_env *env, - struct osc_object *osc, - struct osc_lock *oscl) +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl) { spin_lock(&osc->oo_ol_spin); list_del_init(&oscl->ols_nextlock_oscobj); @@ -853,14 +876,16 @@ static void osc_lock_wake_waiters(const struct lu_env *env, } spin_unlock(&oscl->ols_lock); } +EXPORT_SYMBOL(osc_lock_wake_waiters); -static int osc_lock_enqueue_wait(const struct lu_env *env, - struct osc_object *obj, struct osc_lock *oscl) +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl) { struct osc_lock *tmp_oscl; struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; int rc = 0; + ENTRY; spin_lock(&obj->oo_ol_spin); @@ -911,6 +936,7 @@ static int osc_lock_enqueue_wait(const struct lu_env *env, RETURN(rc); } +EXPORT_SYMBOL(osc_lock_enqueue_wait); /** * Implementation of cl_lock_operations::clo_enqueue() method for osc @@ -934,6 +960,7 @@ static int osc_lock_enqueue(const struct lu_env *env, struct osc_io *oio = osc_env_io(env); struct osc_object *osc = cl2osc(slice->cls_obj); struct osc_lock *oscl = cl2osc_lock(slice); + struct obd_export *exp = osc_export(osc); struct cl_lock *lock = slice->cls_lock; struct ldlm_res_id *resname = &info->oti_resname; union ldlm_policy_data *policy = &info->oti_policy; @@ -950,11 +977,22 @@ static int osc_lock_enqueue(const struct lu_env *env, if (oscl->ols_state == OLS_GRANTED) RETURN(0); + if ((oscl->ols_flags & LDLM_FL_NO_EXPANSION) && + !(exp_connect_lockahead_old(exp) || exp_connect_lockahead(exp))) { + result = -EOPNOTSUPP; + CERROR("%s: server does not support lockahead/locknoexpand:" + "rc = %d\n", exp->exp_obd->obd_name, result); + RETURN(result); + } + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) GOTO(enqueue_base, 0); - if (oscl->ols_glimpse) { - LASSERT(equi(oscl->ols_agl, anchor == NULL)); + /* For glimpse and/or speculative locks, do not wait for reply from + * server on LDLM request */ + if (oscl->ols_glimpse || oscl->ols_speculative) { + /* Speculative and glimpse locks do not have an anchor */ + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); async = true; GOTO(enqueue_base, 0); } @@ -980,25 +1018,30 @@ static int osc_lock_enqueue(const struct lu_env *env, /** * DLM lock's ast data must be osc_object; - * if glimpse or AGL lock, async of osc_enqueue_base() must be true, + * if glimpse or speculative lock, async of osc_enqueue_base() + * must be true + * + * For non-speculative locks: * DLM's enqueue callback set to osc_lock_upcall() with cookie as * osc_lock. 
+ * For speculative locks: + * osc_lock_upcall_speculative & cookie is the osc object, since + * there is no osc_lock */ ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); osc_lock_build_policy(env, lock, policy); - if (oscl->ols_agl) { + if (oscl->ols_speculative) { oscl->ols_einfo.ei_cbdata = NULL; /* hold a reference for callback */ cl_object_get(osc2cl(osc)); - upcall = osc_lock_upcall_agl; + upcall = osc_lock_upcall_speculative; cookie = osc; } - result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags, + result = osc_enqueue_base(exp, resname, &oscl->ols_flags, policy, &oscl->ols_lvb, - osc->oo_oinfo->loi_kms_valid, upcall, cookie, &oscl->ols_einfo, PTLRPCD_SET, async, - oscl->ols_agl); + oscl->ols_speculative); if (result == 0) { if (osc_lock_is_lockless(oscl)) { oio->oi_lockless = 1; @@ -1007,9 +1050,12 @@ static int osc_lock_enqueue(const struct lu_env *env, LASSERT(oscl->ols_hold); LASSERT(oscl->ols_dlmlock != NULL); } - } else if (oscl->ols_agl) { + } else if (oscl->ols_speculative) { cl_object_put(env, osc2cl(osc)); - result = 0; + if (oscl->ols_glimpse) { + /* hide error for AGL request */ + result = 0; + } } out: @@ -1067,8 +1113,8 @@ static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) * * - cancels ldlm lock (ldlm_cli_cancel()). */ -static void osc_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) { struct osc_object *obj = cl2osc(slice->cls_obj); struct osc_lock *oscl = cl2osc_lock(slice); @@ -1084,9 +1130,10 @@ static void osc_lock_cancel(const struct lu_env *env, osc_lock_wake_waiters(env, obj, oscl); EXIT; } +EXPORT_SYMBOL(osc_lock_cancel); -static int osc_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) { struct osc_lock *lock = cl2osc_lock(slice); @@ -1096,6 +1143,7 @@ static int osc_lock_print(const struct lu_env *env, void *cookie, osc_lvb_print(env, cookie, p, &lock->ols_lvb); return 0; } +EXPORT_SYMBOL(osc_lock_print); static const struct cl_lock_operations osc_lock_ops = { .clo_fini = osc_lock_fini, @@ -1129,9 +1177,8 @@ static const struct cl_lock_operations osc_lock_lockless_ops = { .clo_print = osc_lock_print }; -static void osc_lock_set_writer(const struct lu_env *env, - const struct cl_io *io, - struct cl_object *obj, struct osc_lock *oscl) +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl) { struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; pgoff_t io_start; @@ -1141,9 +1188,9 @@ static void osc_lock_set_writer(const struct lu_env *env, return; if (likely(io->ci_type == CIT_WRITE)) { - io_start = cl_index(obj, io->u.ci_rw.rw_range.cir_pos); - io_end = cl_index(obj, io->u.ci_rw.rw_range.cir_pos + - io->u.ci_rw.rw_range.cir_count - 1); + io_start = cl_index(obj, io->u.ci_rw.crw_pos); + io_end = cl_index(obj, io->u.ci_rw.crw_pos + + io->u.ci_rw.crw_count - 1); } else { LASSERT(cl_io_is_mkwrite(io)); io_start = io_end = io->u.ci_fault.ft_index; @@ -1159,6 +1206,7 @@ static void osc_lock_set_writer(const struct lu_env *env, oio->oi_write_osclock = oscl; } } +EXPORT_SYMBOL(osc_lock_set_writer); int osc_lock_init(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, @@ -1176,15 +1224,23 @@ int osc_lock_init(const struct lu_env *env, 
INIT_LIST_HEAD(&oscl->ols_waiting_list); INIT_LIST_HEAD(&oscl->ols_wait_entry); INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); + oscl->ols_lockless_ops = &osc_lock_lockless_ops; + + /* Speculative lock requests must be either no_expand or glimpse + * request (CEF_GLIMPSE). non-glimpse no_expand speculative extent + * locks will break ofd_intent_cb. (see comment there)*/ + LASSERT(ergo((enqflags & CEF_SPECULATIVE) != 0, + (enqflags & (CEF_LOCK_NO_EXPAND | CEF_GLIMPSE)) != 0)); oscl->ols_flags = osc_enq2ldlm_flags(enqflags); - oscl->ols_agl = !!(enqflags & CEF_AGL); - if (oscl->ols_agl) - oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT; + oscl->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; oscl->ols_glimpse = 1; } + if (io->ci_ndelay && cl_object_same(io->ci_obj, obj)) + oscl->ols_flags |= LDLM_FL_NDELAY; osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); @@ -1208,9 +1264,10 @@ int osc_lock_init(const struct lu_env *env, * Finds an existing lock covering given index and optionally different from a * given \a except lock. */ -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags dap_flags) +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags) { struct osc_thread_info *info = osc_env_info(env); struct ldlm_res_id *resname = &info->oti_resname; @@ -1234,9 +1291,9 @@ struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, * with a uniq gid and it conflicts with all other lock modes too */ again: - mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, - LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh, - dap_flags & OSC_DAP_FL_CANCELING); + mode = osc_match_base(env, osc_export(obj), resname, LDLM_EXTENT, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, dap_flags & OSC_DAP_FL_CANCELING); if (mode != 0) { lock = ldlm_handle2lock(&lockh); /* RACE: the lock is cancelled so let's try again */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c index 052f8bc90525c..a99747cecf011 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_object.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,8 +36,9 @@ */ #define DEBUG_SUBSYSTEM S_OSC +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" /** \addtogroup osc * @{ @@ -45,34 +46,27 @@ /***************************************************************************** * - * Type conversions. + * Object operations. * */ - -static struct lu_object *osc2lu(struct osc_object *osc) -{ - return &osc->oo_cl.co_lu; -} - -static struct osc_object *lu2osc(const struct lu_object *obj) +static void osc_obj_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) { - LINVRNT(osc_is_object(obj)); - return container_of0(obj, struct osc_object, oo_cl.co_lu); + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); } -/***************************************************************************** - * - * Object operations. 
- * - */ +static const struct osc_object_operations osc_object_ops = { + .oto_build_res_name = osc_obj_build_res_name, + .oto_dlmlock_at_pgoff = osc_obj_dlmlock_at_pgoff, +}; -static int osc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) { struct osc_object *osc = lu2osc(obj); const struct cl_object_conf *cconf = lu2cl_conf(conf); - osc->oo_oinfo = cconf->u.coc_oinfo; + osc->oo_oinfo = cconf->u.coc_oinfo; #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK mutex_init(&osc->oo_debug_mutex); #endif @@ -96,12 +90,15 @@ static int osc_object_init(const struct lu_env *env, struct lu_object *obj, atomic_set(&osc->oo_nr_ios, 0); init_waitqueue_head(&osc->oo_io_waitq); + LASSERT(osc->oo_obj_ops != NULL); + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); return 0; } +EXPORT_SYMBOL(osc_object_init); -static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +void osc_object_free(const struct lu_env *env, struct lu_object *obj) { struct osc_object *osc = lu2osc(obj); @@ -123,22 +120,24 @@ static void osc_object_free(const struct lu_env *env, struct lu_object *obj) lu_object_fini(obj); OBD_SLAB_FREE_PTR(osc, osc_object_kmem); } +EXPORT_SYMBOL(osc_object_free); int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb) + lu_printer_t p, const struct ost_lvb *lvb) { return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " "ctime: %llu blocks: %llu", lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); } +EXPORT_SYMBOL(osc_lvb_print); -static int osc_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) { - struct osc_object *osc = lu2osc(obj); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct osc_async_rc *ar = &oinfo->loi_ar; + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; (*p)(env, cookie, "id: "DOSTID" " "idx: %d gen: %d kms_valid: %u kms %llu " @@ -149,20 +148,22 @@ static int osc_object_print(const struct lu_env *env, void *cookie, osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); return 0; } +EXPORT_SYMBOL(osc_object_print); -static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) { - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - cl_lvb2attr(attr, &oinfo->loi_lvb); - attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; - return 0; + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; + return 0; } +EXPORT_SYMBOL(osc_attr_get); -static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) { struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; struct ost_lvb *lvb = &oinfo->loi_lvb; @@ -184,39 +185,66 @@ static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, } return 0; } +EXPORT_SYMBOL(osc_attr_update); -static int osc_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb) +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb) { - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - ENTRY; - lvb->lvb_size = oinfo->loi_kms; - lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; - RETURN(0); + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + return 0; } +EXPORT_SYMBOL(osc_object_glimpse); static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) { + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = lock->l_lvb_data; + struct lov_oinfo *oinfo; ENTRY; - if (lock->l_ast_data == data) + if (lock->l_ast_data == data) { lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } RETURN(LDLM_ITER_CONTINUE); } -static int osc_object_prune(const struct lu_env *env, struct cl_object *obj) +int osc_object_prune(const struct lu_env *env, struct cl_object *obj) { - struct osc_object *osc = cl2osc(obj); - struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; /* DLM locks don't hold a reference of osc_object so we have to * clear it before the object is being destroyed. 
*/ - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + osc_build_res_name(osc, resname); ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, osc_object_ast_clear, osc); return 0; } +EXPORT_SYMBOL(osc_object_prune); static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, struct ll_fiemap_info_key *fmkey, @@ -303,24 +331,11 @@ static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, RETURN(rc); } -void osc_object_set_contended(struct osc_object *obj) -{ - obj->oo_contention_time = cfs_time_current(); - /* mb(); */ - obj->oo_contended = 1; -} - -void osc_object_clear_contended(struct osc_object *obj) -{ - obj->oo_contended = 0; -} - int osc_object_is_contended(struct osc_object *obj) { - struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); - int osc_contention_time = dev->od_contention_time; - cfs_time_t cur_time = cfs_time_current(); - cfs_time_t retry_time; + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + time64_t osc_contention_time = dev->od_contention_time; + ktime_t retry_time; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) return 1; @@ -328,18 +343,19 @@ int osc_object_is_contended(struct osc_object *obj) if (!obj->oo_contended) return 0; - /* - * I like copy-paste. the code is copied from - * ll_file_is_contended. - */ - retry_time = cfs_time_add(obj->oo_contention_time, - cfs_time_seconds(osc_contention_time)); - if (cfs_time_after(cur_time, retry_time)) { - osc_object_clear_contended(obj); - return 0; - } - return 1; + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = ktime_add_ns(obj->oo_contention_time, + osc_contention_time * NSEC_PER_SEC); + if (ktime_after(ktime_get(), retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; } +EXPORT_SYMBOL(osc_object_is_contended); /** * Implementation of struct cl_object_operations::coo_req_attr_set() for osc @@ -452,6 +468,7 @@ struct lu_object *osc_object_alloc(const struct lu_env *env, lu_object_init(obj, NULL, dev); osc->oo_cl.co_ops = &osc_ops; obj->lo_ops = &osc_lu_obj_ops; + osc->oo_obj_ops = &osc_object_ops; } else obj = NULL; return obj; @@ -478,5 +495,5 @@ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) RETURN(0); } - +EXPORT_SYMBOL(osc_object_invalidate); /** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c index c89d11333357d..a37c185772a00 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_page.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,8 +36,9 @@ */ #define DEBUG_SUBSYSTEM S_OSC +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); @@ -118,12 +119,12 @@ static const char *osc_list(struct list_head *head) return list_empty(head) ? 
"-" : "+"; } -static inline cfs_time_t osc_submit_duration(struct osc_page *opg) +static inline s64 osc_submit_duration(struct osc_page *opg) { - if (opg->ops_submit_time == 0) - return 0; + if (ktime_to_ns(opg->ops_submit_time) == 0) + return 0; - return (cfs_time_current() - opg->ops_submit_time); + return ktime_ms_delta(ktime_get(), opg->ops_submit_time); } static int osc_page_print(const struct lu_env *env, @@ -138,8 +139,8 @@ static int osc_page_print(const struct lu_env *env, return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " "1< %#x %d %u %s %s > " "2< %lld %u %u %#x %#x | %p %p %p > " - "3< %d %lu %d > " - "4< %d %d %d %lu %s | %s %s %s %s > " + "3< %d %lld %d > " + "4< %d %d %d %lu %c | %s %s %s %s > " "5< %s %s %s %s | %d %s | %d %s %s>\n", opg, osc_index(opg), /* 1 */ @@ -158,7 +159,7 @@ static int osc_page_print(const struct lu_env *env, cli->cl_r_in_flight, cli->cl_w_in_flight, cli->cl_max_rpcs_in_flight, cli->cl_avail_grant, - osc_list(&cli->cl_cache_waiters), + waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-', osc_list(&cli->cl_loi_ready_list), osc_list(&cli->cl_loi_hp_ready_list), osc_list(&cli->cl_loi_write_list), @@ -254,12 +255,22 @@ static int osc_page_flush(const struct lu_env *env, RETURN(rc); } +static void osc_page_touch(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct cl_object *obj = opg->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, osc_index(opg), to); +} + static const struct cl_page_operations osc_page_ops = { .cpo_print = osc_page_print, .cpo_delete = osc_page_delete, .cpo_clip = osc_page_clip, .cpo_cancel = osc_page_cancel, - .cpo_flush = osc_page_flush + .cpo_flush = osc_page_flush, + .cpo_page_touch = osc_page_touch, }; int osc_page_init(const struct lu_env *env, struct cl_object *obj, @@ -307,6 +318,7 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj, return result; } +EXPORT_SYMBOL(osc_page_init); /** * Helper function called by osc_io_submit() for every page in an immediate @@ -315,6 +327,7 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj, void osc_page_submit(const struct lu_env *env, struct osc_page *opg, enum cl_req_type crt, int brw_flags) { + struct osc_io *oio = osc_env_io(env); struct osc_async_page *oap = &opg->ops_oap; LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " @@ -327,12 +340,12 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg, oap->oap_count = opg->ops_to - opg->ops_from; oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + if (oio->oi_cap_sys_resource) { oap->oap_brw_flags |= OBD_BRW_NOQUOTA; oap->oap_cmd |= OBD_BRW_NOQUOTA; } - opg->ops_submit_time = cfs_time_current(); + opg->ops_submit_time = ktime_get(); osc_page_transfer_get(opg, "transfer\0imm"); osc_page_transfer_add(env, opg, crt); } @@ -516,19 +529,22 @@ static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) static void discard_pagevec(const struct lu_env *env, struct cl_io *io, struct cl_page **pvec, int max_index) { - int i; + struct pagevec *pagevec = &osc_env_info(env)->oti_pagevec; + int i; - for (i = 0; i < max_index; i++) { - struct cl_page *page = pvec[i]; + ll_pagevec_init(pagevec, 0); + for (i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; LASSERT(cl_page_is_owned(page, io)); cl_page_delete(env, page); cl_page_discard(env, io, page); cl_page_disown(env, io, page); - cl_page_put(env, page); + cl_pagevec_put(env, page, pagevec); 
- pvec[i] = NULL; - } + pvec[i] = NULL; + } + pagevec_release(pagevec); } /** @@ -588,7 +604,7 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; - io = &osc_env_info(env)->oti_io; + io = osc_env_thread_io(env); spin_lock(&cli->cl_lru_list_lock); if (force) @@ -690,6 +706,7 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } RETURN(count > 0 ? count : rc); } +EXPORT_SYMBOL(osc_lru_shrink); /** * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least @@ -782,6 +799,7 @@ static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); struct osc_io *oio = osc_env_io(env); int rc = 0; + ENTRY; if (cli->cl_cache == NULL) /* shall not be in LRU */ @@ -887,17 +905,27 @@ void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) #endif static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa, int factor) { - int page_count = desc->bd_iov_count; + int page_count; void *zone = NULL; int count = 0; int i; - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + if (desc != NULL) { + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + page_count = desc->bd_iov_count; + } else { + page_count = aa->aa_page_count; + } for (i = 0; i < page_count; i++) { - void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); + void *pz; + if (desc) + pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); + else + pz = page_zone(aa->aa_ppga[i]->pg); if (likely(pz == zone)) { ++count; @@ -916,14 +944,16 @@ static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, mod_zone_page_state(zone, NR_WRITEBACK, factor * count); } -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) { - unstable_page_accounting(desc, 1); + unstable_page_accounting(desc, aa, 1); } -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) { - unstable_page_accounting(desc, -1); + unstable_page_accounting(desc, aa, -1); } /** @@ -940,12 +970,19 @@ static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) void osc_dec_unstable_pages(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - int page_count = desc->bd_iov_count; + int page_count; long unstable_count; + if (desc) + page_count = desc->bd_iov_count; + else + page_count = aa->aa_page_count; + LASSERT(page_count >= 0); - dec_unstable_page_accounting(desc); + + dec_unstable_page_accounting(desc, aa); unstable_count = atomic_long_sub_return(page_count, &cli->cl_unstable_count); @@ -967,14 +1004,20 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req) void osc_inc_unstable_pages(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - long page_count = desc->bd_iov_count; + long page_count; /* No unstable page tracking */ if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) return; - add_unstable_page_accounting(desc); + if (desc) + page_count = 
desc->bd_iov_count; + else + page_count = aa->aa_page_count; + + add_unstable_page_accounting(desc, aa); atomic_long_add(page_count, &cli->cl_unstable_count); atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c index 7dcbbd79a5de0..a0aaae784515a 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c @@ -23,12 +23,14 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * * Code originally extracted from quota directory */ #include +#include + #include "osc_internal.h" static inline struct osc_quota_info *osc_oqi_alloc(u32 id) @@ -94,7 +96,7 @@ static inline u32 fl_quota_flag(int qtype) } } -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], u64 valid, u32 flags) { int type; @@ -105,6 +107,17 @@ int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], if ((valid & (OBD_MD_FLALLQUOTA)) == 0) RETURN(0); + mutex_lock(&cli->cl_quota_mutex); + /* still mark the quots is running out for the old request, because it + * could be processed after the new request at OST, the side effect is + * the following request will be processed synchronously, but it will + * not break the quota enforcement. */ + if (cli->cl_quota_last_xid > xid && !(flags & OBD_FL_NO_QUOTA_ALL)) + GOTO(out_unlock, rc); + + if (cli->cl_quota_last_xid < xid) + cli->cl_quota_last_xid = xid; + for (type = 0; type < LL_MAXQUOTAS; type++) { struct osc_quota_info *oqi; @@ -151,6 +164,8 @@ int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], } } +out_unlock: + mutex_unlock(&cli->cl_quota_mutex); RETURN(rc); } @@ -230,6 +245,8 @@ int osc_quota_setup(struct obd_device *obd) int i, type; ENTRY; + mutex_init(&cli->cl_quota_mutex); + for (type = 0; type < LL_MAXQUOTAS; type++) { cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", HASH_QUOTA_CUR_BITS, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c index b50f4d6ee5019..80695d5805915 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_request.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,24 +32,21 @@ #define DEBUG_SUBSYSTEM S_OSC -#include - -#include - +#include #include #include #include #include #include -#include +#include #include #include -#include +#include #include #include #include +#include -#include "osc_cl_internal.h" #include "osc_internal.h" atomic_t osc_pool_req_count; @@ -60,17 +57,8 @@ struct ptlrpc_request_pool *osc_rq_pool; static unsigned int osc_reqpool_mem_max = 5; module_param(osc_reqpool_mem_max, uint, 0444); -struct osc_brw_async_args { - struct obdo *aa_oa; - int aa_requested_nob; - int aa_nio_count; - u32 aa_page_count; - int aa_resends; - struct brw_page **aa_ppga; - struct client_obd *aa_cli; - struct list_head aa_oaps; - struct list_head aa_exts; -}; +static int osc_idle_timeout = 20; +module_param(osc_idle_timeout, uint, 0644); #define osc_grant_args osc_brw_async_args @@ -93,18 +81,6 @@ struct osc_ladvise_args { void *la_cookie; }; -struct osc_enqueue_args { - struct obd_export *oa_exp; - enum ldlm_type oa_type; - enum ldlm_mode oa_mode; - __u64 *oa_flags; - osc_enqueue_upcall_f oa_upcall; - void *oa_cookie; - struct ost_lvb *oa_lvb; - struct lustre_handle oa_lockh; - unsigned int oa_agl:1; -}; - static void osc_release_ppga(struct brw_page **ppga, size_t count); static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *data, int rc); @@ -410,31 +386,34 @@ static int osc_create(const struct lu_env *env, struct obd_export *exp, RETURN(rc); } -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie) { - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - struct ost_body *body; - int rc; - ENTRY; + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct obd_import *imp = class_exp2cliimp(exp); + struct ost_body *body; + int rc; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); - if (req == NULL) - RETURN(-ENOMEM); + ENTRY; - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); + req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa); ptlrpc_request_set_replen(req); @@ -444,13 +423,12 @@ int osc_punch_base(struct obd_export *exp, struct obdo *oa, sa->sa_oa = oa; sa->sa_upcall = upcall; sa->sa_cookie = cookie; - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); + + ptlrpcd_add_req(req); RETURN(0); } +EXPORT_SYMBOL(osc_punch_send); static int osc_sync_interpret(const struct lu_env *env, struct ptlrpc_request *req, @@ -673,21 +651,18 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_dirty = cli->cl_dirty_grant; else oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; - if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > - cli->cl_dirty_max_pages)) { - CERROR("dirty %lu 
- %lu > dirty_max %lu\n", - cli->cl_dirty_pages, cli->cl_dirty_transit, + if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) { + CERROR("dirty %lu > dirty_max %lu\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); oa->o_undirty = 0; - } else if (unlikely(atomic_long_read(&obd_dirty_pages) - - atomic_long_read(&obd_dirty_transit_pages) > + } else if (unlikely(atomic_long_read(&obd_dirty_pages) > (long)(obd_max_dirty_pages + 1))) { /* The atomic_read() allowing the atomic_inc() are * not covered by a lock thus they may safely race and trip * this CERROR() unless we add in a small fudge factor (+1). */ - CERROR("%s: dirty %ld - %ld > system dirty_max %ld\n", + CERROR("%s: dirty %ld > system dirty_max %ld\n", cli_name(cli), atomic_long_read(&obd_dirty_pages), - atomic_long_read(&obd_dirty_transit_pages), obd_max_dirty_pages); oa->o_undirty = 0; } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > @@ -716,23 +691,33 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, /* Do not ask for more than OBD_MAX_GRANT - a margin for server * to add extent tax, etc. */ - oa->o_undirty = min(undirty, OBD_MAX_GRANT - - (PTLRPC_MAX_BRW_PAGES << PAGE_SHIFT)*4UL); + oa->o_undirty = min(undirty, OBD_MAX_GRANT & + ~(PTLRPC_MAX_BRW_SIZE * 4UL)); } oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; - oa->o_dropped = cli->cl_lost_grant; - cli->cl_lost_grant = 0; + /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */ + if (cli->cl_lost_grant > INT_MAX) { + CDEBUG(D_CACHE, + "%s: avoided o_dropped overflow: cl_lost_grant %lu\n", + cli_name(cli), cli->cl_lost_grant); + oa->o_dropped = INT_MAX; + } else { + oa->o_dropped = cli->cl_lost_grant; + } + cli->cl_lost_grant -= oa->o_dropped; spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", - oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); + CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu" + " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty, + oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant); } void osc_update_next_shrink(struct client_obd *cli) { - cli->cl_next_shrink_grant = - cfs_time_shift(cli->cl_grant_shrink_interval); - CDEBUG(D_CACHE, "next time %ld to shrink grant \n", - cli->cl_next_shrink_grant); + cli->cl_next_shrink_grant = ktime_get_seconds() + + cli->cl_grant_shrink_interval; + + CDEBUG(D_CACHE, "next time %lld to shrink grant\n", + cli->cl_next_shrink_grant); } static void __osc_update_grant(struct client_obd *cli, u64 grant) @@ -750,30 +735,36 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body) } } -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set); +/** + * grant thread data for shrinking space. 
+ */ +struct grant_thread_data { + struct list_head gtd_clients; + struct mutex gtd_mutex; + unsigned long gtd_stopped:1; +}; +static struct grant_thread_data client_gtd; static int osc_shrink_grant_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *aa, int rc) + struct ptlrpc_request *req, + void *aa, int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; - struct ost_body *body; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; + struct ost_body *body; - if (rc != 0) { - __osc_update_grant(cli, oa->o_grant); - GOTO(out, rc); - } + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + GOTO(out, rc); + } - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - osc_update_grant(cli, body); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); out: - OBDO_FREE(oa); - return rc; + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); + oa = NULL; + return rc; } static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) @@ -833,6 +824,11 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) osc_announce_cached(cli, &body->oa, 0); spin_lock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + /* available grant has changed since target calculation */ + spin_unlock(&cli->cl_loi_list_lock); + GOTO(out_free, rc = 0); + } body->oa.o_grant = cli->cl_avail_grant - target_bytes; cli->cl_avail_grant = target_bytes; spin_unlock(&cli->cl_loi_list_lock); @@ -848,20 +844,25 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) sizeof(*body), body, NULL); if (rc != 0) __osc_update_grant(cli, body->oa.o_grant); +out_free: OBD_FREE_PTR(body); RETURN(rc); } static int osc_should_shrink_grant(struct client_obd *client) { - cfs_time_t time = cfs_time_current(); - cfs_time_t next_shrink = client->cl_next_shrink_grant; + time64_t next_shrink = client->cl_next_shrink_grant; - if ((client->cl_import->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_GRANT_SHRINK) == 0) - return 0; + if (client->cl_import == NULL) + return 0; - if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) || + client->cl_import->imp_grant_shrink_disabled) { + osc_update_next_shrink(client); + return 0; + } + + if (ktime_get_seconds() >= next_shrink - 5) { /* Get the current RPC size directly, instead of going via: * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) * Keep comment here so that it can be found by searching. 
*/ @@ -876,41 +877,88 @@ static int osc_should_shrink_grant(struct client_obd *client) return 0; } -static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) +#define GRANT_SHRINK_RPC_BATCH 100 + +static struct delayed_work work; + +static void osc_grant_work_handler(struct work_struct *data) { - struct client_obd *client; + struct client_obd *cli; + int rpc_sent; + bool init_next_shrink = true; + time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL; + + rpc_sent = 0; + mutex_lock(&client_gtd.gtd_mutex); + list_for_each_entry(cli, &client_gtd.gtd_clients, + cl_grant_chain) { + if (rpc_sent < GRANT_SHRINK_RPC_BATCH && + osc_should_shrink_grant(cli)) { + osc_shrink_grant(cli); + rpc_sent++; + } - list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { - if (osc_should_shrink_grant(client)) - osc_shrink_grant(client); + if (!init_next_shrink) { + if (cli->cl_next_shrink_grant < next_shrink && + cli->cl_next_shrink_grant > ktime_get_seconds()) + next_shrink = cli->cl_next_shrink_grant; + } else { + init_next_shrink = false; + next_shrink = cli->cl_next_shrink_grant; + } } - return 0; + mutex_unlock(&client_gtd.gtd_mutex); + + if (client_gtd.gtd_stopped == 1) + return; + + if (next_shrink > ktime_get_seconds()) + schedule_delayed_work(&work, msecs_to_jiffies( + (next_shrink - ktime_get_seconds()) * + MSEC_PER_SEC)); + else + schedule_work(&work.work); } -static int osc_add_shrink_grant(struct client_obd *client) +/** + * Start grant thread for returing grant to server for idle clients. + */ +static int osc_start_grant_work(void) { - int rc; + client_gtd.gtd_stopped = 0; + mutex_init(&client_gtd.gtd_mutex); + INIT_LIST_HEAD(&client_gtd.gtd_clients); + + INIT_DELAYED_WORK(&work, osc_grant_work_handler); + schedule_work(&work.work); - rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, - TIMEOUT_GRANT, - osc_grant_shrink_grant_cb, NULL, - &client->cl_grant_shrink_list); - if (rc) { - CERROR("add grant client %s error %d\n", cli_name(client), rc); - return rc; - } - CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client)); - osc_update_next_shrink(client); return 0; } -static int osc_del_shrink_grant(struct client_obd *client) +static void osc_stop_grant_work(void) +{ + client_gtd.gtd_stopped = 1; + cancel_delayed_work_sync(&work); +} + +static void osc_add_grant_list(struct client_obd *client) { - return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, - TIMEOUT_GRANT); + mutex_lock(&client_gtd.gtd_mutex); + list_add(&client->cl_grant_chain, &client_gtd.gtd_clients); + mutex_unlock(&client_gtd.gtd_mutex); } -static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +static void osc_del_grant_list(struct client_obd *client) +{ + if (list_empty(&client->cl_grant_chain)) + return; + + mutex_lock(&client_gtd.gtd_mutex); + list_del_init(&client->cl_grant_chain); + mutex_unlock(&client_gtd.gtd_mutex); +} + +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) { /* * ocd_grant is the total grant amount we're expect to hold: if we've @@ -924,12 +972,19 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) spin_lock(&cli->cl_loi_list_lock); cli->cl_avail_grant = ocd->ocd_grant; if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { - cli->cl_avail_grant -= cli->cl_reserved_grant; + unsigned long consumed = cli->cl_reserved_grant; + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) - cli->cl_avail_grant -= cli->cl_dirty_grant; + consumed += cli->cl_dirty_grant; else - 
cli->cl_avail_grant -= - cli->cl_dirty_pages << PAGE_SHIFT; + consumed += cli->cl_dirty_pages << PAGE_SHIFT; + if (cli->cl_avail_grant < consumed) { + CERROR("%s: granted %ld but already consumed %ld\n", + cli_name(cli), cli->cl_avail_grant, consumed); + cli->cl_avail_grant = 0; + } else { + cli->cl_avail_grant -= consumed; + } } if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { @@ -963,10 +1018,10 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, cli->cl_max_extent_pages); - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && - list_empty(&cli->cl_grant_shrink_list)) - osc_add_shrink_grant(cli); + if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain)) + osc_add_grant_list(cli); } +EXPORT_SYMBOL(osc_init_grant); /* We assume that the reason this OSC got a short read is because it read * beyond the end of a stripe file; i.e. lustre is reading a sparse file @@ -1033,8 +1088,8 @@ static int check_write_rcs(struct ptlrpc_request *req, return(-EPROTO); } } - - if (req->rq_bulk->bd_nob_transferred != requested_nob) { + if (req->rq_bulk != NULL && + req->rq_bulk->bd_nob_transferred != requested_nob) { CERROR("Unexpected # bytes transferred: %d (requested %d)\n", req->rq_bulk->bd_nob_transferred, requested_nob); return(-EPROTO); @@ -1046,9 +1101,9 @@ static int check_write_rcs(struct ptlrpc_request *req, static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) { if (p1->flag != p2->flag) { - unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | - OBD_BRW_SYNC | OBD_BRW_ASYNC | - OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_SYNC | + OBD_BRW_ASYNC | OBD_BRW_NOQUOTA | + OBD_BRW_SOFT_SYNC); /* warn if we try to combine flags that we don't know to be * safe to combine */ @@ -1063,23 +1118,128 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) return (p1->off + p1->count == p2->off); } -static u32 osc_checksum_bulk(int nob, size_t pg_count, +#if IS_ENABLED(CONFIG_CRC_T10DIF) +static int osc_checksum_bulk_t10pi(const char *obd_name, int nob, + size_t pg_count, struct brw_page **pga, + int opc, obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum) +{ + struct ahash_request *req; + /* Used Adler as the default checksum type on top of DIF tags */ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct page *__page; + unsigned char *buffer; + __u16 *guard_start; + unsigned int bufsize; + int guard_number; + int used_number = 0; + int used; + u32 cksum; + int rc = 0; + int i = 0; + + LASSERT(pg_count > 0); + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, + fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + nob -= pga[i]->count; + pg_count--; + i++; + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} +#else /* !CONFIG_CRC_T10DIF */ +#define obd_dif_ip_fn NULL +#define obd_dif_crc_fn NULL +#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum) \ + -EOPNOTSUPP +#endif /* CONFIG_CRC_T10DIF */ + +static int osc_checksum_bulk(int nob, size_t pg_count, struct brw_page **pga, int opc, - cksum_type_t cksum_type) + enum cksum_types cksum_type, + u32 *cksum) { - u32 cksum; int i = 0; - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; unsigned int bufsize; unsigned char cfs_alg = cksum_obd2cfs(cksum_type); LASSERT(pg_count > 0); - hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(hdesc)) { + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { CERROR("Unable to initialize checksum hash %s\n", cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(hdesc); + return PTR_ERR(req); } while (nob > 0 && pg_count > 0) { @@ -1095,7 +1255,7 @@ static u32 osc_checksum_bulk(int nob, size_t pg_count, memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); kunmap(pga[i]->pg); } - cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + cfs_crypto_hash_update_page(req, pga[i]->pg, pga[i]->off & ~PAGE_MASK, count); LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", @@ -1106,15 +1266,38 @@ static u32 osc_checksum_bulk(int nob, size_t pg_count, i++; } - bufsize = sizeof(cksum); - cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + bufsize = sizeof(*cksum); + cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); /* For sending we only compute the wrong checksum instead * of corrupting the data so it is still correct on a redo */ if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - cksum++; + (*cksum)++; - return cksum; + return 0; +} + +static int osc_checksum_bulk_rw(const char *obd_name, + enum cksum_types cksum_type, + int nob, size_t pg_count, + struct brw_page **pga, int opc, + u32 *check_sum) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga, + opc, fn, sector_size, check_sum); + else + rc = 
osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type, + check_sum); + + RETURN(rc); } static int @@ -1127,10 +1310,12 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, struct ost_body *body; struct obd_ioobj *ioobj; struct niobuf_remote *niobuf; - int niocount, i, requested_nob, opc, rc; + int niocount, i, requested_nob, opc, rc, short_io_size = 0; struct osc_brw_async_args *aa; struct req_capsule *pill; struct brw_page *pg_prev; + void *short_io_buf; + const char *obd_name = cli->cl_import->imp_obd->obd_name; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) @@ -1161,17 +1346,38 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, niocount * sizeof(*niobuf)); + for (i = 0; i < page_count; i++) + short_io_size += pga[i]->count; + + /* Check if read/write is small enough to be a short io. */ + if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 || + !imp_connect_shortio(cli->cl_import)) + short_io_size = 0; + + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT, + opc == OST_READ ? 0 : short_io_size); + if (opc == OST_READ) + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER, + short_io_size); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); if (rc) { ptlrpc_request_free(req); RETURN(rc); } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own * retry logic */ req->rq_no_retry_einprogress = 1; + if (short_io_size != 0) { + desc = NULL; + short_io_buf = NULL; + goto no_bulk; + } + desc = ptlrpc_prep_bulk_imp(req, page_count, cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : @@ -1183,7 +1389,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, if (desc == NULL) GOTO(out, rc = -ENOMEM); /* NB request now owns desc and will free it when it gets freed */ - +no_bulk: body = req_capsule_client_get(pill, &RMF_OST_BODY); ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); @@ -1191,6 +1397,15 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid() + * and from_kgid(), because they are asynchronous. Fortunately, variable + * oa contains valid o_uid and o_gid in these two operations. + * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658. + * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking + * other process logic */ + body->oa.o_uid = oa->o_uid; + body->oa.o_gid = oa->o_gid; + obdo_to_ioobj(oa, ioobj); ioobj->ioo_bufcnt = niocount; /* The high bits of ioo_max_brw tells server _maximum_ number of bulks @@ -1198,7 +1413,26 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, * when the RPC is finally sent in ptlrpc_register_bulk(). It sends * "max - 1" for old client compatibility sending "0", and also so the * the actual maximum is a power-of-two number, not one less. 
LU-1431 */ - ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + if (desc != NULL) + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + else /* short io */ + ioobj_max_brw_set(ioobj, 0); + + if (short_io_size != 0) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHORT_IO; + CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n", + short_io_size); + if (opc == OST_WRITE) { + short_io_buf = req_capsule_client_get(pill, + &RMF_SHORT_IO); + LASSERT(short_io_buf != NULL); + } + } + LASSERT(page_count > 0); pg_prev = pga[0]; for (requested_nob = i = 0; i < page_count; i++, niobuf++) { @@ -1223,9 +1457,19 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, pg_prev->pg->index, pg_prev->off); LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == (pg->flag & OBD_BRW_SRVLOCK)); - - desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count); - requested_nob += pg->count; + if (short_io_size != 0 && opc == OST_WRITE) { + unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0); + + LASSERT(short_io_size >= requested_nob + pg->count); + memcpy(short_io_buf + requested_nob, + ptr + poff, + pg->count); + ll_kunmap_atomic(ptr, KM_USER0); + } else if (short_io_size == 0) { + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, + pg->count); + } + requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { niobuf--; @@ -1261,22 +1505,31 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { /* store cl_cksum_type in a local variable since * it can be changed via lprocfs */ - cksum_type_t cksum_type = cli->cl_cksum_type; + enum cksum_types cksum_type = cli->cl_cksum_type; if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; - body->oa.o_flags |= cksum_type_pack(cksum_type); + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - body->oa.o_cksum = osc_checksum_bulk(requested_nob, - page_count, pga, - OST_WRITE, - cksum_type); + + rc = osc_checksum_bulk_rw(obd_name, cksum_type, + requested_nob, page_count, + pga, OST_WRITE, + &body->oa.o_cksum); + if (rc < 0) { + CDEBUG(D_PAGE, "failed to checksum, rc = %d\n", + rc); + GOTO(out, rc); + } CDEBUG(D_PAGE, "checksum at write origin: %x\n", body->oa.o_cksum); + /* save this in 'oa', too, for later checking */ oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - oa->o_flags |= cksum_type_pack(cksum_type); + oa->o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); } else { /* clear out the checksum flag, in case this is a * resend but cl_checksum is no longer set. 
b=11238 */ @@ -1291,26 +1544,27 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; - body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cli->cl_cksum_type); body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - } + } /* Client cksum has been already copied to wire obdo in previous * lustre_set_wire_obdo(), and in the case a bulk-read is being * resent due to cksum error, this will allow Server to * check+dump pages on its side */ } - ptlrpc_request_set_replen(req); + ptlrpc_request_set_replen(req); - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oa = oa; - aa->aa_requested_nob = requested_nob; - aa->aa_nio_count = niocount; - aa->aa_page_count = page_count; - aa->aa_resends = 0; - aa->aa_ppga = pga; - aa->aa_cli = cli; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; INIT_LIST_HEAD(&aa->aa_oaps); *reqp = req; @@ -1389,13 +1643,17 @@ static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, } static int -check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, - __u32 client_cksum, __u32 server_cksum, - struct osc_brw_async_args *aa) -{ - __u32 new_cksum; - char *msg; - cksum_type_t cksum_type; +check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name; + enum cksum_types cksum_type; + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + __u32 new_cksum; + char *msg; + int rc; if (server_cksum == client_cksum) { CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); @@ -1406,12 +1664,43 @@ check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, server_cksum, client_cksum); - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); - new_cksum = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, - aa->aa_ppga, OST_WRITE, cksum_type); + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
+ oa->o_flags : 0); + + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + fn = obd_dif_ip_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + fn = obd_dif_ip_fn; + sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + fn = obd_dif_crc_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + fn = obd_dif_crc_fn; + sector_size = 4096; + break; + default: + break; + } + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + OST_WRITE, fn, sector_size, + &new_cksum); + else + rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type, + &new_cksum); - if (cksum_type != cksum_type_unpack(aa->aa_oa->o_flags)) + if (rc < 0) + msg = "failed to calculate the client write checksum"; + else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags)) msg = "the server did not use the checksum type specified in " "the original request - likely a protocol problem"; else if (new_cksum == server_cksum) @@ -1427,15 +1716,15 @@ check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, DFID " object "DOSTID" extent [%llu-%llu], original " "client csum %x (type %x), server csum %x (type %x)," " client csum now %x\n", - aa->aa_cli->cl_import->imp_obd->obd_name, - msg, libcfs_nid2str(peer->nid), + obd_name, msg, libcfs_nid2str(peer->nid), oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, POSTID(&oa->o_oi), aa->aa_ppga[0]->off, aa->aa_ppga[aa->aa_page_count - 1]->off + aa->aa_ppga[aa->aa_page_count-1]->count - 1, - client_cksum, cksum_type_unpack(aa->aa_oa->o_flags), + client_cksum, + obd_cksum_type_unpack(aa->aa_oa->o_flags), server_cksum, cksum_type, new_cksum); return 1; } @@ -1443,11 +1732,12 @@ check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, /* Note rc enters this function as number of bytes transferred */ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) { - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = aa->aa_cli; + const char *obd_name = cli->cl_import->imp_obd->obd_name; const struct lnet_process_id *peer = - &req->rq_import->imp_connection->c_peer; - struct client_obd *cli = aa->aa_cli; - struct ost_body *body; + &req->rq_import->imp_connection->c_peer; + struct ost_body *body; u32 client_cksum = 0; ENTRY; @@ -1472,7 +1762,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) CDEBUG(D_QUOTA, "setdq for [%u %u %u] with valid %#llx, flags %x\n", body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, body->oa.o_valid, body->oa.o_flags); - osc_quota_setdq(cli, qid, body->oa.o_valid, + osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, body->oa.o_flags); } @@ -1489,9 +1779,9 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) CERROR("Unexpected +ve rc %d\n", rc); RETURN(-EPROTO); } - LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); - if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + if (req->rq_bulk != NULL && + sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) RETURN(-EAGAIN); if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && @@ -1506,8 +1796,14 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) /* The rest of this function executes only for OST_READs */ - /* if unwrap_bulk failed, return -EAGAIN to retry */ - rc = sptlrpc_cli_unwrap_bulk_read(req, 
req->rq_bulk, rc); + if (req->rq_bulk == NULL) { + rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO, + RCL_SERVER); + LASSERT(rc == req->rq_status); + } else { + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + } if (rc < 0) GOTO(out, rc = -EAGAIN); @@ -1517,12 +1813,41 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) RETURN(-EPROTO); } - if (rc != req->rq_bulk->bd_nob_transferred) { + if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) { CERROR ("Unexpected rc %d (%d transferred)\n", rc, req->rq_bulk->bd_nob_transferred); return (-EPROTO); } + if (req->rq_bulk == NULL) { + /* short io */ + int nob, pg_count, i = 0; + unsigned char *buf; + + CDEBUG(D_CACHE, "Using short io read, size %d\n", rc); + pg_count = aa->aa_page_count; + buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO, + rc); + nob = rc; + while (nob > 0 && pg_count > 0) { + unsigned char *ptr; + int count = aa->aa_ppga[i]->count > nob ? + nob : aa->aa_ppga[i]->count; + + CDEBUG(D_CACHE, "page %p count %d\n", + aa->aa_ppga[i]->pg, count); + ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0); + memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf, + count); + ll_kunmap_atomic((void *) ptr, KM_USER0); + + buf += count; + nob -= count; + i++; + pg_count--; + } + } + if (rc < aa->aa_requested_nob) handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); @@ -1531,15 +1856,19 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) u32 server_cksum = body->oa.o_cksum; char *via = ""; char *router = ""; - cksum_type_t cksum_type; - - cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? - body->oa.o_flags : 0); - client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, - aa->aa_ppga, OST_READ, - cksum_type); - - if (peer->nid != req->rq_bulk->bd_sender) { + enum cksum_types cksum_type; + u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0; + + cksum_type = obd_cksum_type_unpack(o_flags); + rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc, + aa->aa_page_count, aa->aa_ppga, + OST_READ, &client_cksum); + if (rc < 0) + GOTO(out, rc); + + if (req->rq_bulk != NULL && + peer->nid != req->rq_bulk->bd_sender) { via = " via "; router = libcfs_nid2str(req->rq_bulk->bd_sender); } @@ -1559,7 +1888,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) "%s%s%s inode "DFID" object "DOSTID " extent [%llu-%llu], client %x, " "server %x, cksum_type %x\n", - req->rq_import->imp_obd->obd_name, + obd_name, libcfs_nid2str(peer->nid), via, router, clbody->oa.o_valid & OBD_MD_FLFID ? @@ -1713,13 +2042,14 @@ static int brw_interpret(const struct lu_env *env, struct osc_extent *ext; struct osc_extent *tmp; struct client_obd *cli = aa->aa_cli; + unsigned long transferred = 0; ENTRY; rc = osc_brw_fini_request(req, rc); CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); /* When server return -EINPROGRESS, client should always retry * regardless of the number of times the bulk was resent already. 
*/ - if (osc_recoverable_error(rc)) { + if (osc_recoverable_error(rc) && !req->rq_no_delay) { if (req->rq_import_generation != req->rq_import->imp_generation) { CDEBUG(D_HA, "%s: resend cross eviction for object: " @@ -1793,20 +2123,26 @@ static int brw_interpret(const struct lu_env *env, cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); } - OBDO_FREE(aa->aa_oa); + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) osc_inc_unstable_pages(req); list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 1, rc); + osc_extent_finish(env, ext, 1, + rc && req->rq_no_delay ? -EWOULDBLOCK : rc); } LASSERT(list_empty(&aa->aa_exts)); LASSERT(list_empty(&aa->aa_oaps)); + transferred = (req->rq_bulk == NULL ? /* short io */ + aa->aa_requested_nob : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + ptlrpc_lprocfs_brw(req, transferred); spin_lock(&cli->cl_loi_list_lock); /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters @@ -1864,9 +2200,11 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, int page_count = 0; bool soft_sync = false; bool interrupted = false; + bool ndelay = false; int i; int grant = 0; int rc; + __u32 layout_version = 0; struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); struct ost_body *body; ENTRY; @@ -1878,6 +2216,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, mem_tight |= ext->oe_memalloc; grant += ext->oe_grants; page_count += ext->oe_nr_pages; + layout_version = MAX(layout_version, ext->oe_layout_version); if (obj == NULL) obj = ext->oe_obj; } @@ -1890,7 +2229,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, if (pga == NULL) GOTO(out, rc = -ENOMEM); - OBDO_ALLOC(oa); + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); if (oa == NULL) GOTO(out, rc = -ENOMEM); @@ -1920,6 +2259,8 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, if (oap->oap_interrupted) interrupted = true; } + if (ext->oe_ndelay) + ndelay = true; } /* first page in the list */ @@ -1933,8 +2274,16 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, crattr->cra_oa = oa; cl_req_attr_set(env, osc2cl(obj), crattr); - if (cmd == OBD_BRW_WRITE) + if (cmd == OBD_BRW_WRITE) { oa->o_grant_used = grant; + if (layout_version > 0) { + CDEBUG(D_LAYOUT, DFID": write with layout version %u\n", + PFID(&oa->o_oi.oi_fid), layout_version); + + oa->o_layout_version = layout_version; + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + } + } sort_brw_pages(pga, page_count); rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); @@ -1949,6 +2298,12 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, oap->oap_request = ptlrpc_request_addref(req); if (interrupted && !req->rq_intr) ptlrpc_mark_interrupted(req); + if (ndelay) { + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value. + * to handle ETIMEDOUT in brw_interpret() correctly. */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } /* Need to update the timestamps after the request is built in case * we race with setattr (locally or in queue at OST). If OST gets @@ -1957,7 +2312,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, * way to do this in a single call. 
bug 10150 */ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); crattr->cra_oa = &body->oa; - crattr->cra_flags = OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME; + crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; cl_req_attr_set(env, osc2cl(obj), crattr); lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); @@ -2002,7 +2357,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, LASSERT(req == NULL); if (oa) - OBDO_FREE(oa); + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); if (pga) OBD_FREE(pga, sizeof(*pga) * page_count); /* this should happen rarely and is pretty bad, it makes the @@ -2035,10 +2390,10 @@ static int osc_set_lock_data(struct ldlm_lock *lock, void *data) return set; } -static int osc_enqueue_fini(struct ptlrpc_request *req, - osc_enqueue_upcall_f upcall, void *cookie, - struct lustre_handle *lockh, enum ldlm_mode mode, - __u64 *flags, int agl, int errcode) +int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, bool speculative, + int errcode) { bool intent = *flags & LDLM_FL_HAS_INTENT; int rc; @@ -2055,7 +2410,7 @@ static int osc_enqueue_fini(struct ptlrpc_request *req, ptlrpc_status_ntoh(rep->lock_policy_res1); if (rep->lock_policy_res1) errcode = rep->lock_policy_res1; - if (!agl) + if (!speculative) *flags |= LDLM_FL_LVB_READY; } else if (errcode == ELDLM_OK) { *flags |= LDLM_FL_LVB_READY; @@ -2070,12 +2425,11 @@ static int osc_enqueue_fini(struct ptlrpc_request *req, if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) ldlm_lock_decref(lockh, mode); - RETURN(rc); + RETURN(rc); } -static int osc_enqueue_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_enqueue_args *aa, int rc) +int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) { struct ldlm_lock *lock; struct lustre_handle *lockh = &aa->oa_lockh; @@ -2105,7 +2459,7 @@ static int osc_enqueue_interpret(const struct lu_env *env, /* Let CP AST to grant the lock first. */ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); - if (aa->oa_agl) { + if (aa->oa_speculative) { LASSERT(aa->oa_lvb == NULL); LASSERT(aa->oa_flags == NULL); aa->oa_flags = &flags; @@ -2117,9 +2471,9 @@ static int osc_enqueue_interpret(const struct lu_env *env, lockh, rc); /* Complete osc stuff. */ rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, - aa->oa_flags, aa->oa_agl, rc); + aa->oa_flags, aa->oa_speculative, rc); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); ldlm_lock_decref(lockh, mode); LDLM_LOCK_PUT(lock); @@ -2137,10 +2491,10 @@ struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; * release locks just after they are obtained. 
*/ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, void *cookie, - struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl) + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative) { struct obd_device *obd = exp->exp_obd; struct lustre_handle lockh = { 0 }; @@ -2156,15 +2510,6 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; policy->l_extent.end |= ~PAGE_MASK; - /* - * kms is not valid when either object is completely fresh (so that no - * locks are cached), or object was evicted. In the latter case cached - * lock cannot be used, because it would prime inode state with - * potentially stale LVB. - */ - if (!kms_valid) - goto no_match; - /* Next, search for already existing extent locks that will cover us */ /* If we're trying to read, we also search for an existing PW lock. The * VFS and page cache already protect us locally, so lots of readers/ @@ -2180,7 +2525,10 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, mode = einfo->ei_mode; if (einfo->ei_mode == LCK_PR) mode |= LCK_PW; - if (agl == 0) + /* Normal lock requests must wait for the LVB to be ready before + * matching a lock; speculative lock requests do not need to, + * because they will not actually use the lock. */ + if (!speculative) match_flags |= LDLM_FL_LVB_READY; if (intent != 0) match_flags |= LDLM_FL_BLOCK_GRANTED; @@ -2193,13 +2541,22 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, RETURN(ELDLM_OK); matched = ldlm_handle2lock(&lockh); - if (agl) { - /* AGL enqueues DLM locks speculatively. Therefore if - * it already exists a DLM lock, it wll just inform the - * caller to cancel the AGL process for this stripe. */ + if (speculative) { + /* This DLM lock request is speculative, and does not + * have an associated IO request. 
Therefore if there + * is already a DLM lock, it wll just inform the + * caller to cancel the request for this stripe.*/ + lock_res_and_lock(matched); + if (ldlm_extent_equal(&policy->l_extent, + &matched->l_policy_data.l_extent)) + rc = -EEXIST; + else + rc = -ECANCELED; + unlock_res_and_lock(matched); + ldlm_lock_decref(&lockh, mode); LDLM_LOCK_PUT(matched); - RETURN(-ECANCELED); + RETURN(rc); } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { *flags |= LDLM_FL_LVB_READY; @@ -2215,7 +2572,6 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, } } -no_match: if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) RETURN(-ENOLCK); @@ -2246,20 +2602,20 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, struct osc_enqueue_args *aa; CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); aa = ptlrpc_req_async_args(req); - aa->oa_exp = exp; - aa->oa_mode = einfo->ei_mode; - aa->oa_type = einfo->ei_type; + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; lustre_handle_copy(&aa->oa_lockh, &lockh); - aa->oa_upcall = upcall; - aa->oa_cookie = cookie; - aa->oa_agl = !!agl; - if (!agl) { + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = speculative; + if (!speculative) { aa->oa_flags = flags; aa->oa_lvb = lvb; } else { - /* AGL is essentially to enqueue an DLM lock - * in advance, so we don't care about the - * result of AGL enqueue. */ + /* speculative locks are essentially to enqueue + * a DLM lock in advance, so we don't care + * about the result of the enqueue. */ aa->oa_lvb = NULL; aa->oa_flags = NULL; } @@ -2277,16 +2633,17 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, } rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, - flags, agl, rc); + flags, speculative, rc); if (intent) ptlrpc_req_finished(req); RETURN(rc); } -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, struct lustre_handle *lockh, int unref) { struct obd_device *obd = exp->exp_obd; @@ -2314,11 +2671,19 @@ int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) RETURN(rc); - if (data != NULL) { + if (obj != NULL) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); LASSERT(lock != NULL); - if (!osc_set_lock_data(lock, data)) { + if (osc_set_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + osc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { ldlm_lock_decref(lockh, rc); rc = 0; } @@ -2361,13 +2726,13 @@ static int osc_statfs_interpret(const struct lu_env *env, } static int osc_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, __u64 max_age, + struct obd_info *oinfo, time64_t max_age, struct ptlrpc_request_set *rqset) { struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct osc_async_args *aa; - int rc; + int rc; ENTRY; /* We could possibly pass max_age in the request (as an absolute @@ -2385,34 +2750,35 @@ static int osc_statfs_async(struct obd_export *exp, ptlrpc_request_free(req); RETURN(rc); } - 
ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); - if (oinfo->oi_flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; - CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; - ptlrpc_set_add_req(rqset, req); - RETURN(0); + ptlrpc_set_add_req(rqset, req); + RETURN(0); } static int osc_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) + struct obd_statfs *osfs, time64_t max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct obd_statfs *msfs; - struct ptlrpc_request *req; - struct obd_import *imp = NULL; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + ENTRY; + /*Since the request might also come from lprocfs, so we need *sync this with client_disconnect_export Bug15684*/ @@ -2423,49 +2789,48 @@ static int osc_statfs(const struct lu_env *env, struct obd_export *exp, if (!imp) RETURN(-ENODEV); - /* We could possibly pass max_age in the request (as an absolute - * timestamp or a "seconds.usec ago") so the target can avoid doing - * extra calls into the filesystem if that isn't necessary (e.g. - * during mount that would help a bit). Having relative timestamps - * is not so great if request processing is slow, while absolute - * timestamps are not ideal because they need time synchronization. */ - req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
*/ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); - class_import_put(imp); + class_import_put(imp); - if (req == NULL) - RETURN(-ENOMEM); + if (req == NULL) + RETURN(-ENOMEM); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); - if (flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } - rc = ptlrpc_queue_wait(req); - if (rc) - GOTO(out, rc); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); - msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); - if (msfs == NULL) { - GOTO(out, rc = -EPROTO); - } + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); - *osfs = *msfs; + *osfs = *msfs; - EXIT; - out: - ptlrpc_req_finished(req); - return rc; + EXIT; +out: + ptlrpc_req_finished(req); + return rc; } static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, @@ -2505,10 +2870,9 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, return err; } -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set) +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set) { struct ptlrpc_request *req; struct obd_device *obd = exp->exp_obd; @@ -2595,23 +2959,23 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? 
&RMF_OST_BODY : &RMF_SETINFO_VAL); - memcpy(tmp, val, vallen); + memcpy(tmp, val, vallen); if (KEY_IS(KEY_GRANT_SHRINK)) { - struct osc_grant_args *aa; - struct obdo *oa; - - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - OBDO_ALLOC(oa); - if (!oa) { - ptlrpc_req_finished(req); - RETURN(-ENOMEM); - } - *oa = ((struct ost_body *)val)->oa; - aa->aa_oa = oa; - req->rq_interpret_reply = osc_shrink_grant_interpret; - } + struct osc_grant_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } ptlrpc_request_set_replen(req); if (!KEY_IS(KEY_GRANT_SHRINK)) { @@ -2624,25 +2988,27 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, RETURN(0); } +EXPORT_SYMBOL(osc_set_info_async); -static int osc_reconnect(const struct lu_env *env, - struct obd_export *exp, struct obd_device *obd, - struct obd_uuid *cluuid, - struct obd_connect_data *data, - void *localdata) +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) { - struct client_obd *cli = &obd->u.cli; + struct client_obd *cli = &obd->u.cli; - if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { - long lost_grant; + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; long grant; spin_lock(&cli->cl_loi_list_lock); grant = cli->cl_avail_grant + cli->cl_reserved_grant; - if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) { + /* restore ocd_grant_blkbits as client page bits */ + data->ocd_grant_blkbits = PAGE_SHIFT; grant += cli->cl_dirty_grant; - else + } else { grant += cli->cl_dirty_pages << PAGE_SHIFT; + } data->ocd_grant = grant ? : 2 * cli_brw_size(obd); lost_grant = cli->cl_lost_grant; cli->cl_lost_grant = 0; @@ -2655,37 +3021,36 @@ static int osc_reconnect(const struct lu_env *env, RETURN(0); } +EXPORT_SYMBOL(osc_reconnect); -static int osc_disconnect(struct obd_export *exp) +int osc_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); int rc; - rc = client_disconnect_export(exp); - /** - * Initially we put del_shrink_grant before disconnect_export, but it - * causes the following problem if setup (connect) and cleanup - * (disconnect) are tangled together. - * connect p1 disconnect p2 - * ptlrpc_connect_import - * ............... class_manual_cleanup - * osc_disconnect - * del_shrink_grant - * ptlrpc_connect_interrupt - * init_grant_shrink - * add this client to shrink list - * cleanup_osc - * Bang! pinger trigger the shrink. - * So the osc should be disconnected from the shrink list, after we - * are sure the import has been destroyed. BUG18662 - */ - if (obd->u.cli.cl_import == NULL) - osc_del_shrink_grant(&obd->u.cli); - return rc; -} - -static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, - struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. 
+ * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * osc_init_grant + * add this client to shrink list + * cleanup_osc + * Bang! grant shrink thread trigger the shrink. BUG18662 + */ + osc_del_grant_list(&obd->u.cli); + return rc; +} +EXPORT_SYMBOL(osc_disconnect); + +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) { struct lu_env *env = arg; struct ldlm_resource *res = cfs_hash_object(hs, hnode); @@ -2714,6 +3079,7 @@ static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, RETURN(0); } +EXPORT_SYMBOL(osc_ldlm_resource_invalidate); static int osc_import_event(struct obd_device *obd, struct obd_import *imp, @@ -2804,7 +3170,7 @@ static int osc_cancel_weight(struct ldlm_lock *lock) * Cancel all unused and granted extent lock. */ if (lock->l_resource->lr_type == LDLM_EXTENT && - lock->l_granted_mode == lock->l_req_mode && + ldlm_is_granted(lock) && osc_ldlm_weigh_ast(lock) == 0) RETURN(1); @@ -2821,15 +3187,12 @@ static int brw_queue_work(const struct lu_env *env, void *data) RETURN(0); } -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) { struct client_obd *cli = &obd->u.cli; - struct obd_type *type; - void *handler; - int rc; - int adding; - int added; - int req_count; + void *handler; + int rc; + ENTRY; rc = ptlrpcd_addref(); @@ -2840,9 +3203,10 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(out_ptlrpcd, rc); + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); if (IS_ERR(handler)) - GOTO(out_client_setup, rc = PTR_ERR(handler)); + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); cli->cl_writeback_work = handler; handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); @@ -2855,36 +3219,43 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(out_ptlrpcd_work, rc); cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + osc_update_next_shrink(cli); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_osc_obd_vars; -#endif - /* If this is true then both client (osc) and server (osp) are on the - * same node. The osp layer if loaded first will register the osc proc - * directory. In that case this obd_device will be attached its proc - * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. */ - type = class_search_type(LUSTRE_OSP_NAME); - if (type && type->typ_procsym) { - obd->obd_proc_entry = lprocfs_register(obd->obd_name, - type->typ_procsym, - obd->obd_vars, obd); - if (IS_ERR(obd->obd_proc_entry)) { - rc = PTR_ERR(obd->obd_proc_entry); - CERROR("error %d setting up lprocfs for %s\n", rc, - obd->obd_name); - obd->obd_proc_entry = NULL; - } - } else { - rc = lprocfs_obd_setup(obd); - } + RETURN(rc); - /* If the basic OSC proc tree construction succeeded then - * lets do the rest. 
*/ - if (rc == 0) { - lproc_osc_attach_seqstat(obd); - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); +out_ptlrpcd_work: + if (cli->cl_writeback_work != NULL) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; } + if (cli->cl_lru_work != NULL) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_setup_common); + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + int adding; + int added; + int req_count; + int rc; + + ENTRY; + + rc = osc_setup_common(obd, lcfg); + if (rc < 0) + RETURN(rc); + + rc = osc_tunables_init(obd); + if (rc) + RETURN(rc); /* * We try to control the total number of requests with a upper limit @@ -2901,32 +3272,18 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) atomic_add(added, &osc_pool_req_count); } - INIT_LIST_HEAD(&cli->cl_grant_shrink_list); ns_register_cancel(obd->obd_namespace, osc_cancel_weight); spin_lock(&osc_shrink_lock); list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); spin_unlock(&osc_shrink_lock); + cli->cl_import->imp_idle_timeout = osc_idle_timeout; + cli->cl_import->imp_idle_debug = D_HA; RETURN(0); - -out_ptlrpcd_work: - if (cli->cl_writeback_work != NULL) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - if (cli->cl_lru_work != NULL) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; - } -out_client_setup: - client_obd_cleanup(obd); -out_ptlrpcd: - ptlrpcd_decref(); - RETURN(rc); } -static int osc_precleanup(struct obd_device *obd) +int osc_precleanup_common(struct obd_device *obd) { struct client_obd *cli = &obd->u.cli; ENTRY; @@ -2952,11 +3309,21 @@ static int osc_precleanup(struct obd_device *obd) } obd_cleanup_client_import(obd); + RETURN(0); +} +EXPORT_SYMBOL(osc_precleanup_common); + +static int osc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + ptlrpc_lprocfs_unregister_obd(obd); RETURN(0); } -int osc_cleanup(struct obd_device *obd) +int osc_cleanup_common(struct obd_device *obd) { struct client_obd *cli = &obd->u.cli; int rc; @@ -2986,11 +3353,13 @@ int osc_cleanup(struct obd_device *obd) ptlrpcd_decref(); RETURN(rc); } +EXPORT_SYMBOL(osc_cleanup_common); int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) { - int rc = class_process_proc_param(PARAM_OSC, obd->obd_vars, lcfg, obd); - return rc > 0 ? 0: rc; + ssize_t count = class_modify_config(lcfg, PARAM_OSC, + &obd->obd_kset.kobj); + return count > 0 ? 
0 : count; } static int osc_process_config(struct obd_device *obd, size_t len, void *buf) @@ -3002,7 +3371,7 @@ static struct obd_ops osc_obd_ops = { .o_owner = THIS_MODULE, .o_setup = osc_setup, .o_precleanup = osc_precleanup, - .o_cleanup = osc_cleanup, + .o_cleanup = osc_cleanup_common, .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, @@ -3095,19 +3464,28 @@ static int __init osc_init(void) osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, ptlrpc_add_rqs_to_pool); - if (osc_rq_pool != NULL) - GOTO(out, rc); - rc = -ENOMEM; + if (osc_rq_pool == NULL) + GOTO(out_type, rc = -ENOMEM); + + rc = osc_start_grant_work(); + if (rc != 0) + GOTO(out_req_pool, rc); + + RETURN(rc); + +out_req_pool: + ptlrpc_free_rq_pool(osc_rq_pool); out_type: class_unregister_type(LUSTRE_OSC_NAME); out_kmem: lu_kmem_fini(osc_caches); -out: + RETURN(rc); } static void __exit osc_exit(void) { + osc_stop_grant_work(); remove_shrinker(osc_cache_shrinker); class_unregister_type(LUSTRE_OSC_NAME); lu_kmem_fini(osc_caches); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c index 9642a5644009f..b9888d92b1fd8 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,6 +34,7 @@ #define DEBUG_SUBSYSTEM S_RPC +#include #include #include #include @@ -126,6 +127,12 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, (ptlrpc_is_bulk_desc_kvec(type) && ops->add_iov_frag != NULL)); + if (max_brw > PTLRPC_BULK_OPS_COUNT) + RETURN(NULL); + + if (nfrags > LNET_MAX_IOV * max_brw) + RETURN(NULL); + OBD_ALLOC_PTR(desc); if (desc == NULL) return NULL; @@ -148,6 +155,7 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, desc->bd_portal = portal; desc->bd_type = type; desc->bd_md_count = 0; + desc->bd_nob_last = LNET_MTU; desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *) ops; LASSERT(max_brw > 0); desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); @@ -214,7 +222,15 @@ void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); kiov = &BD_GET_KIOV(desc, desc->bd_iov_count); + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + desc->bd_nob_last += len; desc->bd_nob += len; if (pin) @@ -240,7 +256,15 @@ int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type)); iovec = &BD_GET_KVEC(desc, desc->bd_iov_count); + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + desc->bd_nob_last += len; desc->bd_nob += len; iovec->iov_base = frag; @@ -258,7 +282,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ 
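For reference, the new memory-descriptor accounting added to __ptlrpc_prep_bulk_page() and ptlrpc_prep_bulk_frag() above can be pictured as a small stand-alone sketch: a fresh MD is started for the first fragment, whenever the current MD is full (LNET_MAX_IOV fragments), or whenever the next fragment would push it past LNET_MTU bytes, and the fragment index at which each MD starts is recorded the way bd_mds_off[] is. The constants and names below are illustrative stand-ins compiled outside the kernel, not the LNet definitions, and the loop is a simplification of the in-tree logic.

/*
 * Illustrative sketch only -- not part of the patch.  Stand-in constants
 * approximate LNET_MTU (1 MiB) and LNET_MAX_IOV (256 fragments per MD).
 */
#include <stdio.h>

#define SKETCH_MD_MAX_BYTES	(1U << 20)
#define SKETCH_MD_MAX_FRAGS	256U

/*
 * Count how many MDs are needed for the given fragment lengths and record
 * the fragment index at which each MD starts (analogous to bd_mds_off[]).
 */
static unsigned int sketch_count_mds(const unsigned int *frag_len,
				     unsigned int nfrags,
				     unsigned int *md_start)
{
	unsigned int md_count = 0;
	unsigned int frags_in_md = 0;
	unsigned int bytes_in_md = 0;
	unsigned int i;

	for (i = 0; i < nfrags; i++) {
		if (md_count == 0 ||
		    frags_in_md == SKETCH_MD_MAX_FRAGS ||
		    bytes_in_md + frag_len[i] > SKETCH_MD_MAX_BYTES) {
			md_start[md_count++] = i;	/* new MD starts here */
			frags_in_md = 0;
			bytes_in_md = 0;
		}
		frags_in_md++;
		bytes_in_md += frag_len[i];
	}
	return md_count;
}

int main(void)
{
	unsigned int lens[300], starts[300], i, n;

	for (i = 0; i < 300; i++)
		lens[i] = 4096;		/* 300 page-sized fragments */

	n = sketch_count_mds(lens, 300, starts);
	printf("%u MDs, second MD starts at fragment %u\n", n, starts[1]);
	return 0;
}

With page-sized fragments the byte limit and the fragment limit coincide (256 * 4096 bytes = 1 MiB), so the example reports 2 MDs with the second one starting at fragment 256.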
- LASSERT(desc->bd_md_count == 0); /* network hands off */ + LASSERT(desc->bd_refs == 0); /* network hands off */ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); LASSERT(desc->bd_frag_ops != NULL); @@ -353,7 +377,7 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) /* Adjust expected network latency */ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time) + timeout_t service_timeout) { unsigned int nl, oldnl; struct imp_at *at; @@ -361,8 +385,9 @@ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, LASSERT(req->rq_import); - if (service_time > now - req->rq_sent + 3) { - /* bz16408, however, this can also happen if early reply + if (service_timeout > now - req->rq_sent + 3) { + /* + * b=16408, however, this can also happen if early reply * is lost and client RPC is expired and resent, early reply * or reply of original RPC can still be fit in reply buffer * of resent RPC, now client is measuring time from the @@ -372,13 +397,13 @@ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? D_ADAPTTO : D_WARNING, "Reported service time %u > total measured time %lld\n", - service_time, now - req->rq_sent); + service_timeout, now - req->rq_sent); return; } /* Network latency is total time less server processing time */ nl = max_t(int, now - req->rq_sent - - service_time, 0) + 1; /* st rounding */ + service_timeout, 0) + 1; /* st rounding */ at = &req->rq_import->imp_at; oldnl = at_measured(&at->iat_net_latency, nl); @@ -419,6 +444,7 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) __must_hold(&req->rq_lock) { struct ptlrpc_request *early_req; + timeout_t service_timeout; time64_t olddl; int rc; @@ -448,8 +474,8 @@ __must_hold(&req->rq_lock) lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); /* Network latency can be adjusted, it is pure network delays */ - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(early_req->rq_repmsg)); + service_timeout = lustre_msg_get_service_timeout(early_req->rq_repmsg); + ptlrpc_at_adj_net_latency(req, service_timeout); sptlrpc_cli_finish_early_reply(early_req); @@ -777,6 +803,7 @@ int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, LASSERT(!request->rq_pool); sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); out_free: + atomic_dec(&imp->imp_reqs); class_import_put(imp); return rc; @@ -845,6 +872,7 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, LASSERT(imp->imp_client != LP_POISON); request->rq_import = class_import_get(imp); + atomic_inc(&imp->imp_reqs); } else { CERROR("request allocation out of memory\n"); } @@ -852,6 +880,33 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, return request; } +static int ptlrpc_reconnect_if_idle(struct obd_import *imp) +{ + int rc; + + /* + * initiate connection if needed when the import has been + * referenced by the new request to avoid races with disconnect. + * serialize this check against conditional state=IDLE + * in ptlrpc_disconnect_idle_interpret() + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + imp->imp_state = LUSTRE_IMP_NEW; + + /* connect_import_locked releases imp_lock */ + rc = ptlrpc_connect_import_locked(imp); + if (rc) + return rc; + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + return 0; +} + /** * Helper function for creating a request. 
* Calls __ptlrpc_request_alloc to allocate new request sturcture and inits @@ -863,11 +918,21 @@ ptlrpc_request_alloc_internal(struct obd_import *imp, struct ptlrpc_request_pool * pool, const struct req_format *format) { - struct ptlrpc_request *request; + struct ptlrpc_request *request; - request = __ptlrpc_request_alloc(imp, pool); - if (request == NULL) - return NULL; + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + /* don't make expensive check for idling connection + * if it's already connected */ + if (unlikely(imp->imp_state != LUSTRE_IMP_FULL)) { + if (ptlrpc_reconnect_if_idle(imp) < 0) { + atomic_dec(&imp->imp_reqs); + ptlrpc_request_free(request); + return NULL; + } + } req_capsule_init(&request->rq_pill, request, RCL_CLIENT); req_capsule_set(&request->rq_pill, format); @@ -956,7 +1021,6 @@ struct ptlrpc_request_set *ptlrpc_prep_set(void) atomic_set(&set->set_remaining, 0); spin_lock_init(&set->set_new_req_lock); INIT_LIST_HEAD(&set->set_new_requests); - INIT_LIST_HEAD(&set->set_cblist); set->set_max_inflight = UINT_MAX; set->set_producer = NULL; set->set_producer_arg = NULL; @@ -1051,27 +1115,6 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *set) } EXPORT_SYMBOL(ptlrpc_set_destroy); -/** - * Add a callback function \a fn to the set. - * This function would be called when all requests on this set are completed. - * The function will be passed \a data argument. - */ -int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, - set_interpreter_func fn, void *data) -{ - struct ptlrpc_set_cbdata *cbdata; - - OBD_ALLOC_PTR(cbdata); - if (cbdata == NULL) - RETURN(-ENOMEM); - - cbdata->psc_interpret = fn; - cbdata->psc_data = data; - list_add_tail(&cbdata->psc_item, &set->set_cblist); - - RETURN(0); -} - /** * Add a new request to the general purpose request set. * Assumes request reference from the caller. @@ -1079,6 +1122,7 @@ int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, void ptlrpc_set_add_req(struct ptlrpc_request_set *set, struct ptlrpc_request *req) { + LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE); LASSERT(list_empty(&req->rq_set_chain)); if (req->rq_allow_intr) @@ -1088,7 +1132,7 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set, list_add_tail(&req->rq_set_chain, &set->set_requests); req->rq_set = set; atomic_inc(&set->set_remaining); - req->rq_queued_time = cfs_time_current(); + req->rq_queued_time = ktime_get_seconds(); if (req->rq_reqmsg != NULL) lustre_msg_set_jobid(req->rq_reqmsg, NULL); @@ -1119,7 +1163,7 @@ void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, * The set takes over the caller's request reference. 
*/ req->rq_set = set; - req->rq_queued_time = cfs_time_current(); + req->rq_queued_time = ktime_get_seconds(); list_add_tail(&req->rq_set_chain, &set->set_new_requests); count = atomic_inc_return(&set->set_new_count); spin_unlock(&set->set_new_req_lock); @@ -1155,17 +1199,19 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, LASSERT (status != NULL); *status = 0; - if (req->rq_ctx_init || req->rq_ctx_fini) { - /* always allow ctx init/fini rpc go through */ - } else if (imp->imp_state == LUSTRE_IMP_NEW) { - DEBUG_REQ(D_ERROR, req, "Uninitialized import."); - *status = -EIO; + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { - /* pings may safely race with umount */ - DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? + unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* pings or MDS-equivalent STATFS may safely race with umount */ + DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ? D_HA : D_ERROR, req, "IMP_CLOSED "); *status = -EIO; - } else if (ptlrpc_send_limit_expired(req)) { + } else if (ptlrpc_send_limit_expired(req)) { /* probably doesn't need to be a D_ERROR after initial testing*/ DEBUG_REQ(D_HA, req, "send limit expired "); *status = -ETIMEDOUT; @@ -1188,7 +1234,9 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, if (atomic_read(&imp->imp_inval_count) != 0) { DEBUG_REQ(D_ERROR, req, "invalidate in flight"); *status = -EIO; - } else if (req->rq_no_delay) { + } else if (req->rq_no_delay && + imp->imp_generation != imp->imp_initiated_at) { + /* ignore nodelay for requests initiating connections */ *status = -EWOULDBLOCK; } else if (req->rq_allow_replay && (imp->imp_state == LUSTRE_IMP_REPLAY || @@ -1213,16 +1261,12 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, * \retval false if no message should be printed * \retval true if console message should be printed */ -static bool ptlrpc_console_allow(struct ptlrpc_request *req) +static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) { - __u32 opc; - LASSERT(req->rq_reqmsg != NULL); - opc = lustre_msg_get_opc(req->rq_reqmsg); /* Suppress particular reconnect errors which are to be expected. */ if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { - int err; /* Suppress timed out reconnect requests */ if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || @@ -1232,12 +1276,20 @@ static bool ptlrpc_console_allow(struct ptlrpc_request *req) /* Suppress most unavailable/again reconnect requests, but * print occasionally so it is clear client is trying to * connect to a server where no target is running. 
*/ - err = lustre_msg_get_status(req->rq_repmsg); if ((err == -ENODEV || err == -EAGAIN) && req->rq_import->imp_conn_cnt % 30 != 20) return false; } + if (opc == LDLM_ENQUEUE && err == -EAGAIN) + /* -EAGAIN is normal when using POSIX flocks */ + return false; + + if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) && + (req->rq_xid & 0xf) != 10) + /* Suppress most ping requests, they may fail occasionally */ + return false; + return true; } @@ -1256,9 +1308,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) lnet_nid_t nid = imp->imp_connection->c_peer.nid; __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - /* -EAGAIN is normal when using POSIX flocks */ - if (ptlrpc_console_allow(req) && - !(opc == LDLM_ENQUEUE && err == -EAGAIN)) + if (ptlrpc_console_allow(req, opc, err)) LCONSOLE_ERROR_MSG(0x11, "%s: operation %s to node %s " "failed: rc = %d\n", imp->imp_obd->obd_name, @@ -1429,8 +1479,8 @@ static int after_reply(struct ptlrpc_request *req) if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_timeout(req->rq_repmsg)); rc = ptlrpc_check_status(req); imp->imp_connect_error = rc; @@ -1557,8 +1607,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) req->rq_waiting = 1; spin_unlock(&req->rq_lock); - DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " - "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), + DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)", ptlrpc_import_state_name(req->rq_send_state), ptlrpc_import_state_name(imp->imp_state)); LASSERT(list_empty(&req->rq_list)); @@ -1616,8 +1665,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) " %s:%s:%d:%llu:%s:%d\n", current_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - libcfs_nid2str(imp->imp_connection->c_peer.nid), - lustre_msg_get_opc(req->rq_reqmsg)); + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg)); rc = ptl_send_rpc(req, 0); if (rc == -ENOMEM) { @@ -1871,8 +1919,11 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_unlock(&imp->imp_lock); GOTO(interpret, req->rq_status); } + /* ignore on just initiated connections */ if (ptlrpc_no_resend(req) && - !req->rq_wait_ctx) { + !req->rq_wait_ctx && + imp->imp_generation != + imp->imp_initiated_at) { req->rq_status = -ENOTCONN; ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); @@ -2043,7 +2094,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - libcfs_nid2str(imp->imp_connection->c_peer.nid), + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg)); spin_lock(&imp->imp_lock); @@ -2100,6 +2151,7 @@ EXPORT_SYMBOL(ptlrpc_check_set); int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) { struct obd_import *imp = req->rq_import; + unsigned int debug_mask = D_RPCTRACE; int rc = 0; ENTRY; @@ -2107,12 +2159,15 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) req->rq_timedout = 1; spin_unlock(&req->rq_lock); - DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent %lld/real %lld]", - req->rq_net_err ? 
"failed due to network error" : - ((req->rq_real_sent == 0 || + if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_status(req->rq_reqmsg))) + debug_mask = D_WARNING; + DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || req->rq_real_sent < req->rq_sent || req->rq_real_sent >= req->rq_deadline) ? - "timed out for sent delay" : "timed out for slow reply"), + "timed out for sent delay" : "timed out for slow reply"), (s64)req->rq_sent, (s64)req->rq_real_sent); if (imp != NULL && obd_debug_peer_on_timeout) @@ -2253,7 +2308,7 @@ static void ptlrpc_interrupted_set(void *data) /** * Get the smallest timeout in the set; this does NOT set a timeout. */ -int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) { struct list_head *tmp; time64_t now = ktime_get_real_seconds(); @@ -2306,13 +2361,14 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) * error or otherwise be interrupted). * Returns 0 on success or error code otherwise. */ -int ptlrpc_set_wait(struct ptlrpc_request_set *set) +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) { - struct list_head *tmp; - struct ptlrpc_request *req; - struct l_wait_info lwi; - int rc, timeout; - ENTRY; + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + time64_t timeout; + int rc; + ENTRY; if (set->set_producer) (void)ptlrpc_set_producer(set); @@ -2327,13 +2383,13 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) if (list_empty(&set->set_requests)) RETURN(0); - do { - timeout = ptlrpc_set_next_timeout(set); + do { + timeout = ptlrpc_set_next_timeout(set); /* wait until all complete, interrupted, or an in-flight * req times out */ - CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", - set, timeout); + CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n", + set, timeout); if ((timeout == 0 && !signal_pending(current)) || set->set_allow_intr) @@ -2354,7 +2410,8 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? 
timeout : 1), ptlrpc_expired_set, set); - rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); + rc = l_wait_event(set->set_waitq, + ptlrpc_check_set(NULL, set), &lwi); /* LU-769 - if we ignored the signal because it was already * pending when we started, we need to handle it now or we risk @@ -2405,25 +2462,7 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) rc = req->rq_status; } - if (set->set_interpret != NULL) { - int (*interpreter)(struct ptlrpc_request_set *set,void *,int) = - set->set_interpret; - rc = interpreter (set, set->set_arg, rc); - } else { - struct ptlrpc_set_cbdata *cbdata, *n; - int err; - - list_for_each_entry_safe(cbdata, n, - &set->set_cblist, psc_item) { - list_del_init(&cbdata->psc_item); - err = cbdata->psc_interpret(set, cbdata->psc_data, rc); - if (err && !rc) - rc = err; - OBD_FREE_PTR(cbdata); - } - } - - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(ptlrpc_set_wait); @@ -2473,9 +2512,13 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) sptlrpc_cli_free_repbuf(request); if (request->rq_import != NULL) { - class_import_put(request->rq_import); - request->rq_import = NULL; - } + if (!ptlrpcd_check_work(request)) { + LASSERT(atomic_read(&request->rq_import->imp_reqs) > 0); + atomic_dec(&request->rq_import->imp_reqs); + } + class_import_put(request->rq_import); + request->rq_import = NULL; + } if (request->rq_bulk != NULL) ptlrpc_free_bulk(request->rq_bulk); @@ -2679,8 +2722,11 @@ void ptlrpc_request_committed(struct ptlrpc_request *req, int force) return; } - if (force || req->rq_transno <= imp->imp_peer_committed_transno) + if (force || req->rq_transno <= imp->imp_peer_committed_transno) { + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = req->rq_replay_list.next; ptlrpc_free_request(req); + } spin_unlock(&imp->imp_lock); } @@ -2792,7 +2838,7 @@ void ptlrpc_cleanup_client(struct obd_import *imp) */ void ptlrpc_resend_req(struct ptlrpc_request *req) { - DEBUG_REQ(D_HA, req, "going to resend"); + DEBUG_REQ(D_HA, req, "going to resend"); spin_lock(&req->rq_lock); /* Request got reply but linked to the import list still. 
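The hunks above change ptlrpc_set_next_timeout() to return time64_t; as its comment says, it only reports the smallest timeout in the set so that ptlrpc_set_wait() knows how long it may sleep before an in-flight request expires. The stand-alone sketch below illustrates that "earliest deadline" computation with invented types; request_stub and its fields are simplifications for illustration, not ptlrpc structures, and the real function distinguishes more request states than this.

/*
 * Minimal sketch, not part of the patch: report seconds until the
 * earliest deadline among in-flight requests (0 if one already expired,
 * -1 if nothing constrains the wait).
 */
#include <stdio.h>
#include <time.h>

struct request_stub {
	time_t	deadline;	/* absolute expiry, seconds */
	int	in_flight;	/* only in-flight requests matter */
};

static long next_timeout(const struct request_stub *reqs, int n, time_t now)
{
	long timeout = -1;
	int i;

	for (i = 0; i < n; i++) {
		long left;

		if (!reqs[i].in_flight)
			continue;
		left = (long)(reqs[i].deadline - now);
		if (left <= 0)
			return 0;	/* already expired: check the set now */
		if (timeout == -1 || left < timeout)
			timeout = left;
	}
	return timeout;
}

int main(void)
{
	time_t now = time(NULL);
	struct request_stub reqs[] = {
		{ now + 30, 1 },
		{ now + 5,  1 },
		{ now + 99, 0 },	/* not in flight: ignored */
	};

	printf("sleep for at most %ld seconds\n", next_timeout(reqs, 3, now));
	return 0;
}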
@@ -2803,14 +2849,13 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) return; } - lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); - req->rq_status = -EAGAIN; + req->rq_status = -EAGAIN; - req->rq_resend = 1; - req->rq_net_err = 0; - req->rq_timedout = 0; + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; - ptlrpc_client_wake_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); } @@ -2920,13 +2965,13 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) /* for distributed debugging */ lustre_msg_set_status(req->rq_reqmsg, current_pid()); - /* add a ref for the set (see comment in ptlrpc_set_add_req) */ - ptlrpc_request_addref(req); - ptlrpc_set_add_req(set, req); - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(NULL, set); + ptlrpc_set_destroy(set); - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(ptlrpc_queue_wait); @@ -2966,7 +3011,6 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); spin_lock(&imp->imp_lock); imp->imp_vbr_failed = 1; - imp->imp_no_lock_replay = 1; spin_unlock(&imp->imp_lock); lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); } else { @@ -2980,9 +3024,6 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, } spin_lock(&imp->imp_lock); - /** if replays by version then gap occur on server, no trust to locks */ - if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) - imp->imp_no_lock_replay = 1; imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); spin_unlock(&imp->imp_lock); LASSERT(imp->imp_last_replay_transno); @@ -3081,14 +3122,15 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, */ int ptlrpc_replay_req(struct ptlrpc_request *req) { - struct ptlrpc_replay_async_args *aa; - ENTRY; + struct ptlrpc_replay_async_args *aa; - LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + ENTRY; - LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - memset(aa, 0, sizeof *aa); + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof(*aa)); /* Prepare request to be resent with ptlrpcd */ aa->praa_old_state = req->rq_send_state; @@ -3104,8 +3146,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) /* Tell server the net_latency, so the server can calculate how long * it should wait for next replay */ - lustre_msg_set_service_time(req->rq_reqmsg, - ptlrpc_at_get_net_latency(req)); + lustre_msg_set_service_timeout(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); DEBUG_REQ(D_HA, req, "REPLAY"); atomic_inc(&req->rq_import->imp_replay_inflight); @@ -3126,11 +3168,12 @@ void ptlrpc_abort_inflight(struct obd_import *imp) struct list_head *tmp, *n; ENTRY; - /* Make sure that no new requests get processed for this import. + /* + * Make sure that no new requests get processed for this import. * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing * this flag and then putting requests on sending_list or delayed_list. */ - spin_lock(&imp->imp_lock); + assert_spin_locked(&imp->imp_lock); /* XXX locking? Maybe we should remove each request with the list * locked? 
Also, how do we know if the requests on the list are @@ -3172,8 +3215,6 @@ void ptlrpc_abort_inflight(struct obd_import *imp) if (imp->imp_replayable) ptlrpc_free_committed(imp); - spin_unlock(&imp->imp_lock); - EXIT; } @@ -3313,8 +3354,7 @@ void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req) /* For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so * that server can infer the number of bulks that were prepared, * see LU-1431 */ - req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) / - LNET_MAX_IOV) - 1; + req->rq_mbits += bd->bd_md_count - 1; /* Set rq_xid as rq_mbits to indicate the final bulk for the old * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. @@ -3442,7 +3482,7 @@ void *ptlrpcd_alloc_work(struct obd_import *imp, req->rq_no_delay = req->rq_no_resend = 1; req->rq_pill.rc_fmt = (void *)&worker_format; - CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args)); + CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); args = ptlrpc_req_async_args(req); args->cb = cb; args->cbdata = cbdata; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c index fb302c70d08be..a3d31a853244c 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -26,9 +26,10 @@ */ #include -#include +#include #ifdef LUSTRE_TRANSLATE_ERRNOS +#include /* * The two translation tables below must define a one-to-one mapping between @@ -185,7 +186,20 @@ static int lustre_errno_hton_mapping[] = { [ESERVERFAULT] = LUSTRE_ESERVERFAULT, [EBADTYPE] = LUSTRE_EBADTYPE, [EJUKEBOX] = LUSTRE_EJUKEBOX, - [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS]= ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE }; static int lustre_errno_ntoh_mapping[] = { @@ -331,7 +345,20 @@ static int lustre_errno_ntoh_mapping[] = { [LUSTRE_ESERVERFAULT] = ESERVERFAULT, [LUSTRE_EBADTYPE] = EBADTYPE, [LUSTRE_EJUKEBOX] = EJUKEBOX, - [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS] = ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE }; unsigned int lustre_errno_hton(unsigned int h) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c index 28533cca19a32..6c713b22b94ae 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. 
+ * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,6 +56,11 @@ void request_out_callback(struct lnet_event *ev) DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + /* Do not update imp_next_ping for connection request */ + if (lustre_msg_get_opc(req->rq_reqmsg) != + req->rq_import->imp_connect_op) + ptlrpc_pinger_sending_on_import(req->rq_import); + sptlrpc_request_out_callback(req); spin_lock(&req->rq_lock); @@ -161,12 +166,13 @@ void reply_in_callback(struct lnet_event *ev) ev->mlength, ev->offset, req->rq_replen); } - req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); out_wake: - /* NB don't unlock till after wakeup; req can disappear under us - * since we don't have our own ref */ - ptlrpc_client_wake_req(req); + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); EXIT; } @@ -200,8 +206,8 @@ void client_bulk_callback(struct lnet_event *ev) spin_lock(&desc->bd_lock); req = desc->bd_req; - LASSERT(desc->bd_md_count > 0); - desc->bd_md_count--; + LASSERT(desc->bd_refs > 0); + desc->bd_refs--; if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { desc->bd_nob_transferred += ev->mlength; @@ -218,7 +224,7 @@ void client_bulk_callback(struct lnet_event *ev) /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ - if (desc->bd_md_count == 0) + if (desc->bd_refs == 0) ptlrpc_client_wake_req(desc->bd_req); spin_unlock(&desc->bd_lock); @@ -450,7 +456,7 @@ void server_bulk_callback(struct lnet_event *ev) spin_lock(&desc->bd_lock); - LASSERT(desc->bd_md_count > 0); + LASSERT(desc->bd_refs > 0); if ((ev->type == LNET_EVENT_ACK || ev->type == LNET_EVENT_REPLY) && @@ -466,9 +472,9 @@ void server_bulk_callback(struct lnet_event *ev) desc->bd_failure = 1; if (ev->unlinked) { - desc->bd_md_count--; + desc->bd_refs--; /* This is the last callback no matter what... */ - if (desc->bd_md_count == 0) + if (desc->bd_refs == 0) wake_up(&desc->bd_waitq); } @@ -500,14 +506,14 @@ static void ptlrpc_master_callback(struct lnet_event *ev) int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct lnet_process_id *peer, lnet_nid_t *self) { - int best_dist = 0; - __u32 best_order = 0; - int count = 0; - int rc = -ENOENT; - int dist; - __u32 order; - lnet_nid_t dst_nid; - lnet_nid_t src_nid; + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; peer->pid = LNET_PID_LUSTRE; @@ -522,7 +528,7 @@ int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, continue; if (dist == 0) { /* local! 
use loopback LND */ - peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); + peer->nid = *self = LNET_NID_LO_0; rc = 0; break; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h index a5bbaea6065d3..a5f203e215389 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h @@ -21,10 +21,16 @@ struct gss_api_mech; +typedef int (*digest_hash)( + struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs); + /* The mechanism-independent gss-api context: */ struct gss_ctx { - struct gss_api_mech *mech_type; - void *internal_ctx_id; + struct gss_api_mech *mech_type; + void *internal_ctx_id; + digest_hash hash_func; }; #define GSS_C_NO_BUFFER ((rawobj_t) 0) @@ -44,7 +50,7 @@ __u32 lgss_copy_reverse_context( struct gss_ctx **ctx_new); __u32 lgss_inquire_context( struct gss_ctx *ctx, - unsigned long *endtime); + time64_t *endtime); __u32 lgss_get_mic( struct gss_ctx *ctx, int msgcnt, @@ -119,7 +125,7 @@ struct gss_api_ops { struct gss_ctx *ctx_new); __u32 (*gss_inquire_context)( struct gss_ctx *ctx, - unsigned long *endtime); + time64_t *endtime); __u32 (*gss_get_mic)( struct gss_ctx *ctx, int msgcnt, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c index 3f703372d272f..041dd12dac593 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c index d1fa9200452ba..70d4711c67a96 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -60,82 +59,85 @@ static int ctx_init_pack_request(struct obd_import *imp, - struct ptlrpc_request *req, - int lustre_srv, - uid_t uid, gid_t gid, - long token_size, - char __user *token) + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) { - struct lustre_msg *msg = req->rq_reqbuf; - struct gss_sec *gsec; - struct gss_header *ghdr; - struct ptlrpc_user_desc *pud; - __u32 *p, size, offset = 2; - rawobj_t obj; - - LASSERT(msg->lm_bufcount <= 4); - LASSERT(req->rq_cli_ctx); - LASSERT(req->rq_cli_ctx->cc_sec); - - /* gss hdr */ - ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); - ghdr->gh_version = PTLRPC_GSS_VERSION; - ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; - ghdr->gh_flags = 0; - ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; - ghdr->gh_seq = 0; - ghdr->gh_svc = SPTLRPC_SVC_NULL; - ghdr->gh_handle.len = 0; - - /* fix the user desc */ - if (req->rq_pack_udesc) { - ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; - - pud = lustre_msg_buf(msg, offset, sizeof(*pud)); - LASSERT(pud); - pud->pud_uid = pud->pud_fsuid = uid; - pud->pud_gid = pud->pud_fsgid = gid; - pud->pud_cap = 0; - pud->pud_ngroups = 0; - offset++; - } + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = 
lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } - /* security payload */ - p = lustre_msg_buf(msg, offset, 0); - size = msg->lm_buflens[offset]; - LASSERT(p); - - /* 1. lustre svc type */ - LASSERT(size > 4); - *p++ = cpu_to_le32(lustre_srv); - size -= 4; - - /* 2. target uuid */ - obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; - obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; - if (rawobj_serialize(&obj, &p, &size)) - LBUG(); - - /* 3. reverse context handle. actually only needed by root user, - * but we send it anyway. */ - gsec = sec2gsec(req->rq_cli_ctx->cc_sec); - obj.len = sizeof(gsec->gs_rvs_hdl); - obj.data = (__u8 *) &gsec->gs_rvs_hdl; - if (rawobj_serialize(&obj, &p, &size)) - LBUG(); - - /* 4. now the token */ - LASSERT(size >= (sizeof(__u32) + token_size)); - *p++ = cpu_to_le32(((__u32) token_size)); + /* new clients are expected to set KCSUM flag */ + ghdr->gh_flags |= LUSTRE_GSS_PACK_KCSUM; + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); if (copy_from_user(p, token, token_size)) { - CERROR("can't copy token\n"); - return -EFAULT; - } - size -= sizeof(__u32) + cfs_size_round4(token_size); + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + cfs_size_round4(token_size); - req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, - msg->lm_buflens[offset] - size, 0); - return 0; + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; } static diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c index 17fd9cf3c00c1..7be412d2d4a72 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -55,12 +55,12 @@ #include "gss_internal.h" #include "gss_crypto.h" -int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, const int alg_mode) { int rc; - kb->kb_tfm = crypto_alloc_blkcipher(alg_name, alg_mode, 0); + kb->kb_tfm = crypto_alloc_sync_skcipher(alg_name, alg_mode, 0); if (IS_ERR(kb->kb_tfm)) { rc = PTR_ERR(kb->kb_tfm); kb->kb_tfm = NULL; @@ -69,8 +69,8 @@ int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, return rc; } - rc = crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, - kb->kb_key.len); + rc = crypto_sync_skcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); if (rc) { CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, kb->kb_key.len, rc); @@ -84,7 +84,7 @@ void gss_keyblock_free(struct gss_keyblock *kb) { rawobj_free(&kb->kb_key); if (kb->kb_tfm) - crypto_free_blkcipher(kb->kb_tfm); + crypto_free_sync_skcipher(kb->kb_tfm); } int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) @@ -226,86 +226,76 @@ void gss_teardown_sgtable(struct sg_table *sgt) sg_free_table(sgt); } -int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, - const void *in, void *out, size_t length) +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length) { - struct blkcipher_desc desc; struct scatterlist sg; struct sg_table sg_out; __u8 local_iv[16] = {0}; __u32 ret = -EINVAL; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(tfm); - desc.tfm = tfm; - desc.info = local_iv; - desc.flags = 0; - if (length % crypto_blkcipher_blocksize(tfm) != 0) { + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) { CERROR("output length %zu mismatch blocksize %d\n", - length, crypto_blkcipher_blocksize(tfm)); + length, crypto_sync_skcipher_blocksize(tfm)); goto out; } - if (crypto_blkcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { - CERROR("iv size too large %d\n", crypto_blkcipher_ivsize(tfm)); + if (crypto_sync_skcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", + crypto_sync_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); - memcpy(out, in, length); + if (in != out) + memmove(out, in, length); ret = gss_setup_sgtable(&sg_out, &sg, out, length); if (ret != 0) goto out; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, length, local_iv); + if (decrypt) - ret = 
crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length); + ret = crypto_skcipher_decrypt_iv(req, &sg, &sg, length); else - ret = crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length); + ret = crypto_skcipher_encrypt_iv(req, &sg, &sg, length); + skcipher_request_zero(req); gss_teardown_sgtable(&sg_out); out: return ret; } -int gss_digest_hmac(struct crypto_hash *tfm, - rawobj_t *key, - rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum) +int gss_digest_hash(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs) { - struct hash_desc desc = { - .tfm = tfm, - .flags = 0, - }; struct scatterlist sg[1]; struct sg_table sgt; + int rc = 0; int i; - int rc; - - rc = crypto_hash_setkey(tfm, key->data, key->len); - if (rc) - return rc; - - rc = crypto_hash_init(&desc); - if (rc) - return rc; for (i = 0; i < msgcnt; i++) { if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); - if (rc != 0) - return rc; - rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) return rc; + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } for (i = 0; i < iovcnt; i++) { @@ -315,59 +305,50 @@ int gss_digest_hmac(struct crypto_hash *tfm, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, iovs[i].kiov_offset); - rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); + rc = crypto_ahash_update(req); if (rc) return rc; } if (hdr) { - rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); - if (rc != 0) - return rc; - rc = crypto_hash_update(&desc, sg, sizeof(hdr->len)); + rc = gss_setup_sgtable(&sgt, sg, hdr->data, hdr->len); if (rc) return rc; + ahash_request_set_crypt(req, sg, NULL, hdr->len); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } - return crypto_hash_final(&desc, cksum->data); + return rc; } -int gss_digest_norm(struct crypto_hash *tfm, - struct gss_keyblock *kb, - rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum) +int gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs) { - struct hash_desc desc; struct scatterlist sg[1]; struct sg_table sgt; - int i; - int rc; - - LASSERT(kb->kb_tfm); - desc.tfm = tfm; - desc.flags = 0; - - rc = crypto_hash_init(&desc); - if (rc) - return rc; + int rc = 0; + int i; for (i = 0; i < msgcnt; i++) { if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); - if (rc != 0) - return rc; - - rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) return rc; + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } for (i = 0; i < iovcnt; i++) { @@ -377,29 +358,26 @@ int gss_digest_norm(struct crypto_hash *tfm, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, iovs[i].kiov_offset); - rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); + rc = crypto_ahash_update(req); if (rc) return rc; } if (hdr) { - rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); - if (rc != 0) - return rc; - - rc = crypto_hash_update(&desc, sg, sizeof(*hdr)); + rc = gss_setup_sgtable(&sgt, sg, &(hdr->len), sizeof(hdr->len)); if (rc) return rc; + 
ahash_request_set_crypt(req, sg, NULL, sizeof(hdr->len)); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } - rc = crypto_hash_final(&desc, cksum->data); - if (rc) - return rc; - - return gss_crypt_generic(kb->kb_tfm, 0, NULL, cksum->data, - cksum->data, cksum->len); + return rc; } int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) @@ -422,11 +400,10 @@ int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) return 0; } -int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, int enc) { - struct blkcipher_desc desc; struct scatterlist src; struct scatterlist dst; struct sg_table sg_dst; @@ -434,12 +411,13 @@ int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, __u8 *buf; __u32 datalen = 0; int i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + ENTRY; buf = outobj->data; - desc.tfm = tfm; - desc.info = iv; - desc.flags = 0; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); for (i = 0; i < inobj_cnt; i++) { LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); @@ -456,35 +434,30 @@ int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, RETURN(rc); } - if (iv) { - if (enc) - rc = crypto_blkcipher_encrypt_iv(&desc, &dst, - &src, - src.length); - else - rc = crypto_blkcipher_decrypt_iv(&desc, &dst, - &src, - src.length); - } else { - if (enc) - rc = crypto_blkcipher_encrypt(&desc, &dst, &src, - src.length); - else - rc = crypto_blkcipher_decrypt(&desc, &dst, &src, - src.length); - } + skcipher_request_set_crypt(req, &src, &dst, src.length, iv); + if (!iv) + skcipher_request_set_crypt_iv(req); + + if (enc) + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, + src.length); + else + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, + src.length); gss_teardown_sgtable(&sg_src); gss_teardown_sgtable(&sg_dst); if (rc) { CERROR("encrypt error %d\n", rc); + skcipher_request_zero(req); RETURN(rc); } datalen += inobjs[i].len; buf += inobjs[i].len; } + skcipher_request_zero(req); outobj->len = datalen; RETURN(0); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h index ad15cdedd66d5..7ed680a4c8430 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -1,14 +1,79 @@ #ifndef PTLRPC_GSS_CRYPTO_H #define PTLRPC_GSS_CRYPTO_H +#include + #include "gss_internal.h" +#include + +/* + * linux v4.19-rc2-66-gb350bee5ea0f + * crypto: skcipher - Introduce crypto_sync_skcipher + * + * crypto_sync_skcipher will replace crypto_blkcipher so start using + * crypto_sync_skcipher and provide wrappers for older kernels + */ +#ifdef SYNC_SKCIPHER_REQUEST_ON_STACK + +#define crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_set_crypt_iv(d) + +#else /* ! 
SYNC_SKCIPHER_REQUEST_ON_STACK */ + +#define crypto_sync_skcipher crypto_blkcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ + struct blkcipher_desc __##name##_obj, *name = (void *)&__##name##_obj + +#define skcipher_request_set_sync_tfm(d, _tfm) \ + do { (d)->tfm = _tfm; } while (0) + +#define skcipher_request_set_callback(d, f, c, data) \ + do { (d)->flags = f; } while (0) + +#define skcipher_request_set_crypt(d, src, dst, cryptlen, iv) \ + do { (d)->info = iv; } while (0) + +#define skcipher_request_set_crypt_iv(d) \ + do { (d)->info = crypto_blkcipher_crt((d)->tfm)->iv; } while (0) + +#define crypto_sync_skcipher_blocksize(tfm) \ + crypto_blkcipher_blocksize((tfm)) + +#define crypto_sync_skcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey((tfm), (key), (keylen)) + +#define crypto_alloc_sync_skcipher(name, type, mask) \ + crypto_alloc_blkcipher((name), (type), (mask)) + +#define crypto_free_sync_skcipher(tfm) \ + crypto_free_blkcipher((tfm)) + +#define crypto_sync_skcipher_ivsize(tfm) \ + crypto_blkcipher_ivsize((tfm)) + +#define crypto_skcipher_encrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_encrypt_iv((desc), (dst), (src), (len)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_decrypt_iv((desc), (dst), (src), (len)) + +#define skcipher_request_zero(req) /* nop */ + +#endif /* SYNC_SKCIPHER_REQUEST_ON_STACK */ + struct gss_keyblock { - rawobj_t kb_key; - struct crypto_blkcipher *kb_tfm; + rawobj_t kb_key; + struct crypto_sync_skcipher *kb_tfm; }; -int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, const int alg_mode); void gss_keyblock_free(struct gss_keyblock *kb); int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); @@ -19,16 +84,15 @@ int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, const void *buf, unsigned int buf_len); void gss_teardown_sgtable(struct sg_table *sgt); -int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, - const void *in, void *out, size_t length); -int gss_digest_hmac(struct crypto_hash *tfm, rawobj_t *key, rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum); -int gss_digest_norm(struct crypto_hash *tfm, struct gss_keyblock *kb, - rawobj_t *hdr, int msgcnt, rawobj_t *msgs, int iovcnt, - lnet_kiov_t *iovs, rawobj_t *cksum); +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length); +int gss_digest_hash(struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs); +int gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs); int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); -int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, int enc); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c index 3c4e63b992bee..23506f89d67c2 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -50,7 +50,6 @@ #include #include #include -#include 
#include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h index eb86ba1627103..c49a54021688f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -11,7 +11,8 @@ #ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ #define __PTLRPC_GSS_GSS_INTERNAL_H_ -#include +#include +#include #include /* @@ -72,17 +73,16 @@ int buffer_extract_bytes(const void **buf, __u32 *buflen, */ #define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ -static inline -unsigned long gss_round_ctx_expiry(unsigned long expiry, - unsigned long sec_flags) +static inline time64_t gss_round_ctx_expiry(time64_t expiry, + unsigned long sec_flags) { - if (sec_flags & PTLRPC_SEC_FL_REVERSE) - return expiry; + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; - if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) - return expiry - __TIMEOUT_DELTA; + if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; - return expiry; + return expiry; } /* @@ -117,8 +117,9 @@ enum ptlrpc_gss_tgt { }; enum ptlrpc_gss_header_flags { - LUSTRE_GSS_PACK_BULK = 1, - LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_KCSUM = 4, }; static inline @@ -286,9 +287,9 @@ struct gss_cli_ctx { }; struct gss_cli_ctx_keyring { - struct gss_cli_ctx gck_base; - struct key *gck_key; - struct timer_list *gck_timer; + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list gck_timer; }; struct gss_sec { @@ -357,6 +358,14 @@ static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); } +#ifdef HAVE_CACHE_HASH_SPINLOCK +# define sunrpc_cache_lookup(c, i, h) sunrpc_cache_lookup_rcu((c), (i), (h)) +# define cache_read_lock(cdetail) spin_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) spin_unlock(&((cdetail)->hash_lock)) +#else /* ! 
HAVE_CACHE_HASH_SPINLOCK */ +# define cache_read_lock(cdetail) read_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) read_unlock(&((cdetail)->hash_lock)) +#endif #define GSS_CTX_INIT_MAX_LEN (1024) @@ -509,6 +518,7 @@ void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); int __init gss_init_svc_upcall(void); void gss_exit_svc_upcall(void); +extern unsigned int krb5_allow_old_client_csum; /* lproc_gss.c */ void gss_stat_oos_record_cli(int behind); @@ -554,4 +564,13 @@ void __dbg_memdump(char *name, void *ptr, int size) OBD_FREE(buf, bufsize); } +static inline unsigned int ll_read_key_usage(struct key *key) +{ +#ifdef HAVE_KEY_USAGE_REFCOUNT + return refcount_read(&key->usage); +#else + return atomic_read(&key->usage); +#endif +} + #endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c index 81aad1ffea6e2..15bf99427489b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include @@ -60,6 +60,10 @@ #include "gss_internal.h" #include "gss_api.h" +#ifdef HAVE_GET_REQUEST_KEY_AUTH +#include +#endif + static struct ptlrpc_sec_policy gss_policy_keyring; static struct ptlrpc_ctx_ops gss_keyring_ctxops; static struct key_type gss_key_type; @@ -82,45 +86,6 @@ static int sec_install_rctx_kr(struct ptlrpc_sec *sec, * internal helpers * ****************************************/ -#define DUMP_PROCESS_KEYRINGS(tsk) \ -{ \ - CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): " \ - "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n", \ - tsk->comm, tsk->pid, tsk->uid, tsk->fsuid, \ - tsk->parent->comm, tsk->parent->pid, \ - tsk->parent->uid, tsk->parent->fsuid, \ - tsk->request_key_auth ? \ - tsk->request_key_auth->serial : 0, \ - key_cred(tsk)->thread_keyring ? \ - key_cred(tsk)->thread_keyring->serial : 0, \ - key_tgcred(tsk)->process_keyring ? \ - key_tgcred(tsk)->process_keyring->serial : 0, \ - key_tgcred(tsk)->session_keyring ? \ - key_tgcred(tsk)->session_keyring->serial : 0, \ - key_cred(tsk)->user->uid_keyring ? \ - key_cred(tsk)->user->uid_keyring->serial : 0, \ - key_cred(tsk)->user->session_keyring ? \ - key_cred(tsk)->user->session_keyring->serial : 0, \ - key_cred(tsk)->jit_keyring \ - ); \ -} - -#define DUMP_KEY(key) \ -{ \ - CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n", \ - key, key->serial, atomic_read(&key->usage), \ - key->uid, key->gid, \ - key->description ? 
key->description : "n/a" \ - ); \ -} - -#define key_cred(tsk) ((tsk)->cred) -#ifdef HAVE_CRED_TGCRED -#define key_tgcred(tsk) ((tsk)->cred->tgcred) -#else -#define key_tgcred(tsk) key_cred(tsk) -#endif - static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) { #ifdef HAVE_KEYRING_UPCALL_SERIALIZED @@ -140,10 +105,12 @@ static inline void key_revoke_locked(struct key *key) set_bit(KEY_FLAG_REVOKED, &key->flags); } -static void ctx_upcall_timeout_kr(unsigned long data) +static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) { - struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data; - struct key *key = ctx2gctx_keyring(ctx)->gck_key; + struct gss_cli_ctx_keyring *gctx_kr = cfs_from_timer(gctx_kr, + data, gck_timer); + struct ptlrpc_cli_ctx *ctx = &(gctx_kr->gck_base.gc_base); + struct key *key = gctx_kr->gck_key; CWARN("ctx %p, key %p\n", ctx, key); @@ -153,22 +120,18 @@ static void ctx_upcall_timeout_kr(unsigned long data) key_revoke_locked(key); } -static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout) +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, time64_t timeout) { struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); - struct timer_list *timer = gctx_kr->gck_timer; + struct timer_list *timer = &gctx_kr->gck_timer; LASSERT(timer); - CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout); - timeout = msecs_to_jiffies(timeout * MSEC_PER_SEC) + - cfs_time_current(); - - init_timer(timer); - timer->expires = timeout; - timer->data = (unsigned long ) ctx; - timer->function = ctx_upcall_timeout_kr; + CDEBUG(D_SEC, "ctx %p: start timer %llds\n", ctx, timeout); + cfs_timer_setup(timer, ctx_upcall_timeout_kr, + (unsigned long)gctx_kr, 0); + timer->expires = cfs_time_seconds(timeout) + jiffies; add_timer(timer); } @@ -179,47 +142,34 @@ static void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) { struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); - struct timer_list *timer = gctx_kr->gck_timer; - - if (timer == NULL) - return; + struct timer_list *timer = &gctx_kr->gck_timer; CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); - gctx_kr->gck_timer = NULL; - del_singleshot_timer_sync(timer); - - OBD_FREE_PTR(timer); } static struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, struct vfs_cred *vcred) { - struct ptlrpc_cli_ctx *ctx; - struct gss_cli_ctx_keyring *gctx_kr; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; - OBD_ALLOC_PTR(gctx_kr); - if (gctx_kr == NULL) - return NULL; + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; - OBD_ALLOC_PTR(gctx_kr->gck_timer); - if (gctx_kr->gck_timer == NULL) { - OBD_FREE_PTR(gctx_kr); - return NULL; - } - init_timer(gctx_kr->gck_timer); + cfs_timer_setup(&gctx_kr->gck_timer, NULL, 0, 0); - ctx = &gctx_kr->gck_base.gc_base; + ctx = &gctx_kr->gck_base.gc_base; - if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { - OBD_FREE_PTR(gctx_kr->gck_timer); - OBD_FREE_PTR(gctx_kr); - return NULL; - } + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } - ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT; + ctx->cc_expire = ktime_get_real_seconds() + KEYRING_UPCALL_TIMEOUT; clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); atomic_inc(&ctx->cc_refcount); /* for the caller */ @@ -241,7 +191,6 @@ static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) LASSERT(gctx_kr->gck_key == NULL); ctx_clear_timer_kr(ctx); - LASSERT(gctx_kr->gck_timer == NULL); if 
(gss_cli_ctx_fini_common(sec, ctx)) return; @@ -388,7 +337,7 @@ static int key_set_payload(struct key *key, unsigned int index, static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) { LASSERT(atomic_read(&ctx->cc_refcount) > 0); - LASSERT(atomic_read(&key->usage) > 0); + LASSERT(ll_read_key_usage(key) > 0); LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); LASSERT(!key_get_payload(key, 0)); @@ -561,17 +510,17 @@ void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *new_ctx, struct key *key) { - struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); - struct hlist_node __maybe_unused *hnode; - struct ptlrpc_cli_ctx *ctx; - cfs_time_t now; - ENTRY; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *hnode; + struct ptlrpc_cli_ctx *ctx; + time64_t now; - LASSERT(sec_is_reverse(sec)); + ENTRY; + LASSERT(sec_is_reverse(sec)); spin_lock(&sec->ps_lock); - now = cfs_time_current_sec(); + now = ktime_get_real_seconds(); /* set all existing ctxs short expiry */ cfs_hlist_for_each_entry(ctx, hnode, &gsec_kr->gsk_clist, cc_cache) { @@ -666,39 +615,104 @@ static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) return 0; } +/* + * kernel 5.3: commit 0f44e4d976f96c6439da0d6717238efa4b91196e + * keys: Move the user and user-session keyrings to the user_namespace + * + * When lookup_user_key is available use the kernel API rather than directly + * accessing the uid_keyring and session_keyring via the current process + * credentials. + */ +#ifdef HAVE_LOOKUP_USER_KEY + +/* from Linux security/keys/internal.h: */ +#ifndef KEY_LOOKUP_FOR_UNLINK +#define KEY_LOOKUP_FOR_UNLINK 0x04 +#endif + +static struct key *_user_key(key_serial_t id) +{ + key_ref_t ref; + + might_sleep(); + ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + if (IS_ERR(ref)) + return NULL; + return key_ref_to_ptr(ref); +} + +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_SESSION_KEYRING); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_KEYRING); +} +#else +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return key_get(cred->user->session_keyring); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return key_get(cred->user->uid_keyring); +} +#endif + /* * unlink request key from it's ring, which is linked during request_key(). * sadly, we have to 'guess' which keyring it's linked to. * - * FIXME this code is fragile, depend on how request_key_link() is implemented. + * FIXME this code is fragile, it depends on how request_key() is implemented. 
*/ static void request_key_unlink(struct key *key) { - struct task_struct *tsk = current; - struct key *ring; + const struct cred *cred = current_cred(); + struct key *ring = NULL; - switch (key_cred(tsk)->jit_keyring) { + switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: +#ifdef HAVE_GET_REQUEST_KEY_AUTH + if (cred->request_key_auth) { + struct request_key_auth *rka; + struct key *authkey = cred->request_key_auth; + + down_read(&authkey->sem); + rka = get_request_key_auth(authkey); + if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) + ring = key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (ring) + break; + } +#endif + /* fall through */ case KEY_REQKEY_DEFL_THREAD_KEYRING: - ring = key_get(key_cred(tsk)->thread_keyring); + ring = key_get(cred->thread_keyring); if (ring) break; + /* fallthrough */ case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ring = key_get(key_tgcred(tsk)->process_keyring); + ring = key_get(cred->process_keyring); if (ring) break; + /* fallthrough */ case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); - ring = key_get(rcu_dereference(key_tgcred(tsk) - ->session_keyring)); + ring = key_get(rcu_dereference(cred->session_keyring)); rcu_read_unlock(); if (ring) break; + /* fallthrough */ case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - ring = key_get(key_cred(tsk)->user->session_keyring); + ring = get_user_session_keyring(cred); break; case KEY_REQKEY_DEFL_USER_KEYRING: - ring = key_get(key_cred(tsk)->user->uid_keyring); + ring = get_user_keyring(cred); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: @@ -863,7 +877,7 @@ struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, if (likely(ctx)) { LASSERT(atomic_read(&ctx->cc_refcount) >= 1); LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); - LASSERT(atomic_read(&key->usage) >= 2); + LASSERT(ll_read_key_usage(key) >= 2); /* simply take a ref and return. it's upper layer's * responsibility to detect & replace dead ctx. */ @@ -1067,13 +1081,13 @@ void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) static int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) { - struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); - struct hlist_node __maybe_unused *pos, *next; - struct ptlrpc_cli_ctx *ctx; - struct gss_cli_ctx *gctx; - time_t now = cfs_time_current_sec(); - ENTRY; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time64_t now = ktime_get_real_seconds(); + ENTRY; spin_lock(&sec->ps_lock); cfs_hlist_for_each_entry_safe(ctx, pos, next, &gsec_kr->gsk_clist, cc_cache) { @@ -1093,9 +1107,8 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) snprintf(mech, sizeof(mech), "N/A"); mech[sizeof(mech) - 1] = '\0'; - seq_printf(seq, "%p: uid %u, ref %d, expire %lu(%+ld), fl %s, " - "seq %d, win %u, key %08x(ref %d), " - "hdl %#llx:%#llx, mech: %s\n", + seq_printf(seq, + "%p: uid %u, ref %d, expire %lld(%+lld), fl %s, seq %d, win %u, key %08x(ref %d), hdl %#llx:%#llx, mech: %s\n", ctx, ctx->cc_vcred.vc_uid, atomic_read(&ctx->cc_refcount), ctx->cc_expire, @@ -1104,7 +1117,7 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) atomic_read(&gctx->gc_seq), gctx->gc_win, key ? key->serial : 0, - key ? atomic_read(&key->usage) : 0, + key ? 
ll_read_key_usage(key) : 0, gss_handle_to_u64(&gctx->gc_handle), gss_handle_to_u64(&gctx->gc_svc_handle), mech); @@ -1121,8 +1134,16 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) static int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) { - /* upcall is already on the way */ - return 0; + /* upcall is already on the way */ + struct gss_cli_ctx *gctx = ctx ? ctx2gctx(ctx) : NULL; + + /* record latest sequence number in buddy svcctx */ + if (gctx && !rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + return gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32)atomic_read(&gctx->gc_seq)); + } + return 0; } static @@ -1325,15 +1346,15 @@ int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) * the session keyring is created upon upcall, and don't change all * the way until upcall finished, so rcu lock is not needed here. */ - LASSERT(key_tgcred(current)->session_keyring); + LASSERT(current_cred()->session_keyring); lockdep_off(); - rc = key_link(key_tgcred(current)->session_keyring, key); + rc = key_link(current_cred()->session_keyring, key); lockdep_on(); if (unlikely(rc)) { CERROR("failed to link key %08x to keyring %08x: %d\n", key->serial, - key_tgcred(current)->session_keyring->serial, rc); + current_cred()->session_keyring->serial, rc); RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h index 97ad55e3025c0..611160458d9b1 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -80,7 +80,7 @@ struct krb5_ctx { kc_cfx:1, kc_seed_init:1, kc_have_acceptor_subkey:1; - __s32 kc_endtime; + time64_t kc_endtime; __u8 kc_seed[16]; __u64 kc_seq_send; __u64 kc_seq_recv; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c index 000d7a8e87b47..bd3a94ba162b3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include @@ -95,18 +94,20 @@ static struct krb5_enctype enctypes[] = { .ke_hash_size = 16, .ke_conf_size = 8, }, +#ifdef HAVE_DES3_SUPPORT [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ .ke_dispname = "des3-hmac-sha1", .ke_enc_name = "cbc(des3_ede)", - .ke_hash_name = "hmac(sha1)", + .ke_hash_name = "sha1", .ke_hash_size = 20, .ke_conf_size = 8, .ke_hash_hmac = 1, }, +#endif [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ .ke_dispname = "aes128-cts-hmac-sha1-96", .ke_enc_name = "cbc(aes)", - .ke_hash_name = "hmac(sha1)", + .ke_hash_name = "sha1", .ke_hash_size = 12, .ke_conf_size = 16, .ke_hash_hmac = 1, @@ -114,7 +115,7 @@ static struct krb5_enctype enctypes[] = { [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ .ke_dispname = "aes256-cts-hmac-sha1-96", .ke_enc_name = "cbc(aes)", - .ke_hash_name = "hmac(sha1)", + .ke_hash_name = "sha1", .ke_hash_size = 12, .ke_conf_size = 16, .ke_hash_hmac = 1, @@ -122,33 +123,31 @@ static struct krb5_enctype enctypes[] = { [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ .ke_dispname = "arcfour-hmac-md5", .ke_enc_name = "ecb(arc4)", - .ke_hash_name = "hmac(md5)", + .ke_hash_name = "md5", .ke_hash_size = 16, .ke_conf_size = 8, .ke_hash_hmac = 1, } }; -#define MAX_ENCTYPES sizeof(enctypes)/sizeof(struct krb5_enctype) - static const char * enctype2str(__u32 
enctype) { - if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname) - return enctypes[enctype].ke_dispname; + if (enctype < ARRAY_SIZE(enctypes) && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; - return "unknown"; + return "unknown"; } static int krb5_init_keys(struct krb5_ctx *kctx) { - struct krb5_enctype *ke; + struct krb5_enctype *ke; - if (kctx->kc_enctype >= MAX_ENCTYPES || - enctypes[kctx->kc_enctype].ke_hash_size == 0) { - CERROR("unsupported enctype %x\n", kctx->kc_enctype); - return -1; - } + if (kctx->kc_enctype >= ARRAY_SIZE(enctypes) || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } ke = &enctypes[kctx->kc_enctype]; @@ -197,8 +196,13 @@ __u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) goto out_err; - /* end time */ - if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + /* end time. While kc_endtime might be 64 bit the krb5 API + * still uses 32 bits. To delay the 2038 bug see the incoming + * value as a u32 which give us until 2106. See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) goto out_err; /* seq send */ @@ -262,8 +266,13 @@ __u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) { unsigned int tmp_uint, keysize; - /* end time */ - if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + /* end time. While kc_endtime might be 64 bit the krb5 API + * still uses 32 bits. To delay the 2038 bug see the incoming + * value as a u32 which give us until 2106. See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) goto out_err; /* flags */ @@ -411,11 +420,11 @@ __u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, static __u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, - unsigned long *endtime) + time64_t *endtime) { struct krb5_ctx *kctx = gctx->internal_ctx_id; - *endtime = (unsigned long)((__u32) kctx->kc_endtime); + *endtime = kctx->kc_endtime; return GSS_S_COMPLETE; } @@ -438,41 +447,66 @@ __s32 krb5_make_checksum(__u32 enctype, struct krb5_header *khdr, int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum) + rawobj_t *cksum, + digest_hash hash_func) { - struct krb5_enctype *ke = &enctypes[enctype]; - struct crypto_hash *tfm; - rawobj_t hdr; - __u32 code = GSS_S_FAILURE; - int rc; - - if (!(tfm = crypto_alloc_hash(ke->ke_hash_name, 0, 0))) { - CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name); - return GSS_S_FAILURE; - } + struct krb5_enctype *ke = &enctypes[enctype]; + struct ahash_request *req = NULL; + enum cfs_crypto_hash_alg hash_algo; + rawobj_t hdr; + int rc; + + hash_algo = cfs_crypto_hash_alg(ke->ke_hash_name); + + /* For the cbc(des) case we want md5 instead of hmac(md5) */ + if (strcmp(ke->ke_enc_name, "cbc(des)")) + req = cfs_crypto_hash_init(hash_algo, kb->kb_key.data, + kb->kb_key.len); + else + req = cfs_crypto_hash_init(hash_algo, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("failed to alloc hash %s : rc = %d\n", + ke->ke_hash_name, rc); + goto out_no_hash; + } - cksum->len = crypto_hash_digestsize(tfm); - OBD_ALLOC_LARGE(cksum->data, cksum->len); - if (!cksum->data) { - cksum->len = 0; - goto out_tfm; - } + cksum->len = 
cfs_crypto_hash_digestsize(hash_algo); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + rc = -ENOMEM; + goto out_free_hash; + } hdr.data = (__u8 *)khdr; hdr.len = sizeof(*khdr); - if (ke->ke_hash_hmac) - rc = gss_digest_hmac(tfm, &kb->kb_key, - &hdr, msgcnt, msgs, iovcnt, iovs, cksum); - else - rc = gss_digest_norm(tfm, kb, - &hdr, msgcnt, msgs, iovcnt, iovs, cksum); + if (!hash_func) { + rc = -EPROTO; + CERROR("hash function for %s undefined\n", + ke->ke_hash_name); + goto out_free_hash; + } + rc = hash_func(req, &hdr, msgcnt, msgs, iovcnt, iovs); + if (rc) + goto out_free_hash; + + if (!ke->ke_hash_hmac) { + LASSERT(kb->kb_tfm); + + cfs_crypto_hash_final(req, cksum->data, &cksum->len); + rc = gss_crypt_generic(kb->kb_tfm, 0, NULL, + cksum->data, cksum->data, + cksum->len); + goto out_no_hash; + } - if (rc == 0) - code = GSS_S_COMPLETE; -out_tfm: - crypto_free_hash(tfm); - return code; +out_free_hash: + if (req) + cfs_crypto_hash_final(req, cksum->data, &cksum->len); +out_no_hash: + return rc ? GSS_S_FAILURE : GSS_S_COMPLETE; } static void fill_krb5_header(struct krb5_ctx *kctx, @@ -545,118 +579,118 @@ static __u32 verify_krb5_header(struct krb5_ctx *kctx, static __u32 gss_get_mic_kerberos(struct gss_ctx *gctx, - int msgcnt, - rawobj_t *msgs, - int iovcnt, - lnet_kiov_t *iovs, - rawobj_t *token) + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - rawobj_t cksum = RAWOBJ_EMPTY; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 0); - - /* checksum */ - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) - return GSS_S_FAILURE; + fill_krb5_header(kctx, khdr, 0); - LASSERT(cksum.len >= ke->ke_hash_size); - LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); - memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, khdr, + msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); - token->len = sizeof(*khdr) + ke->ke_hash_size; - rawobj_free(&cksum); - return GSS_S_COMPLETE; + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; } static __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, - int msgcnt, - rawobj_t *msgs, - int iovcnt, - lnet_kiov_t *iovs, - rawobj_t *token) + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - rawobj_t cksum = RAWOBJ_EMPTY; - __u32 major; - - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + struct krb5_ctx *kctx = gctx->internal_ctx_id; + 
struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 0); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } - - if (token->len < sizeof(*khdr) + ke->ke_hash_size) { - CERROR("short signature: %u, require %d\n", - token->len, (int) sizeof(*khdr) + ke->ke_hash_size); - return GSS_S_FAILURE; - } + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + goto out; + } - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) { - CERROR("failed to make checksum\n"); - return GSS_S_FAILURE; - } + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + GOTO(out, major = GSS_S_FAILURE); + } - LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - rawobj_free(&cksum); - return GSS_S_BAD_SIG; - } + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); - rawobj_free(&cksum); - return GSS_S_COMPLETE; + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + GOTO(out_free_cksum, major = GSS_S_BAD_SIG); + } + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); +out: + return major; } /* * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. 
*/ static -int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, - struct krb5_header *khdr, - char *confounder, - struct ptlrpc_bulk_desc *desc, - rawobj_t *cipher, - int adj_nob) +int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) { - struct blkcipher_desc ciph_desc; - __u8 local_iv[16] = {0}; - struct scatterlist src, dst; - struct sg_table sg_src, sg_dst; - int blocksize, i, rc, nob = 0; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_iov_count); LASSERT(GET_ENC_KIOV(desc)); - blocksize = crypto_blkcipher_blocksize(tfm); - LASSERT(blocksize > 1); - LASSERT(cipher->len == blocksize + sizeof(*khdr)); - - ciph_desc.tfm = tfm; - ciph_desc.info = local_iv; - ciph_desc.flags = 0; + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); - /* encrypt confounder */ + /* encrypt confounder */ rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); if (rc != 0) return rc; @@ -666,20 +700,24 @@ int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, gss_teardown_sgtable(&sg_src); return rc; } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); - rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, - sg_src.sgl, blocksize); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); - if (rc) { - CERROR("error to encrypt confounder: %d\n", rc); - return rc; - } + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } - /* encrypt clear pages */ - for (i = 0; i < desc->bd_iov_count; i++) { + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { sg_init_table(&src, 1); sg_set_page(&src, BD_GET_KIOV(desc, i).kiov_page, (BD_GET_KIOV(desc, i).kiov_len + @@ -695,28 +733,36 @@ int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, BD_GET_ENC_KIOV(desc, i).kiov_offset = dst.offset; BD_GET_ENC_KIOV(desc, i).kiov_len = dst.length; - rc = crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, - src.length); - if (rc) { - CERROR("error to encrypt page: %d\n", rc); - return rc; - } - } + skcipher_request_set_crypt(req, &src, &dst, + src.length, local_iv); + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } - /* encrypt krb5 header */ + /* encrypt krb5 header */ rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); - if (rc != 0) + if (rc != 0) { + skcipher_request_zero(req); return rc; + } rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, sizeof(*khdr)); if (rc != 0) { gss_teardown_sgtable(&sg_src); + skcipher_request_zero(req); return rc; } - rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, - sizeof(*khdr)); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + sizeof(*khdr), local_iv); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + skcipher_request_zero(req); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); @@ -751,39 +797,35 
@@ int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, * should have been done by prep_bulk(). */ static -int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, - struct krb5_header *khdr, - struct ptlrpc_bulk_desc *desc, - rawobj_t *cipher, - rawobj_t *plain, - int adj_nob) +int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) { - struct blkcipher_desc ciph_desc; - __u8 local_iv[16] = {0}; - struct scatterlist src, dst; - struct sg_table sg_src, sg_dst; - int ct_nob = 0, pt_nob = 0; - int blocksize, i, rc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_iov_count); LASSERT(GET_ENC_KIOV(desc)); - LASSERT(desc->bd_nob_transferred); - - blocksize = crypto_blkcipher_blocksize(tfm); - LASSERT(blocksize > 1); - LASSERT(cipher->len == blocksize + sizeof(*khdr)); + LASSERT(desc->bd_nob_transferred); - ciph_desc.tfm = tfm; - ciph_desc.info = local_iv; - ciph_desc.flags = 0; + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); - if (desc->bd_nob_transferred % blocksize) { - CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); - return -EPROTO; - } + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } - /* decrypt head (confounder) */ + /* decrypt head (confounder) */ rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); if (rc != 0) return rc; @@ -794,27 +836,31 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, return rc; } - rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, - sg_src.sgl, blocksize); + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); - if (rc) { - CERROR("error to decrypt confounder: %d\n", rc); - return rc; - } + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; i++) { - if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize - != 0 || - BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize - != 0) { + if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize != 0 || + BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize != 0) { CERROR("page %d: odd offset %u len %u, blocksize %d\n", i, BD_GET_ENC_KIOV(desc, i).kiov_offset, BD_GET_ENC_KIOV(desc, i).kiov_len, blocksize); + skcipher_request_zero(req); return -EFAULT; } @@ -851,12 +897,14 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, sg_assign_page(&dst, BD_GET_KIOV(desc, i).kiov_page); - rc = crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, - src.length); - if (rc) { - CERROR("error to decrypt page: %d\n", rc); - return rc; - } + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } if (BD_GET_KIOV(desc, i).kiov_len % blocksize 
!= 0) { memcpy(page_address(BD_GET_KIOV(desc, i).kiov_page) + @@ -871,24 +919,26 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, pt_nob += BD_GET_KIOV(desc, i).kiov_len; } - if (unlikely(ct_nob != desc->bd_nob_transferred)) { - CERROR("%d cipher text transferred but only %d decrypted\n", - desc->bd_nob_transferred, ct_nob); - return -EFAULT; - } + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + skcipher_request_zero(req); + return -EFAULT; + } - if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { - CERROR("%d plain text expected but only %d received\n", - desc->bd_nob, pt_nob); - return -EFAULT; - } + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + skcipher_request_zero(req); + return -EFAULT; + } /* if needed, clear up the rest unused iovs */ if (adj_nob) while (i < desc->bd_iov_count) BD_GET_KIOV(desc, i++).kiov_len = 0; - /* decrypt tail (krb5 header) */ + /* decrypt tail (krb5 header) */ rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, sizeof(*khdr)); if (rc != 0) @@ -901,166 +951,170 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, return rc; } - rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, - sizeof(*khdr)); - + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); gss_teardown_sgtable(&sg_src); gss_teardown_sgtable(&sg_dst); - if (rc) { - CERROR("error to decrypt tail: %d\n", rc); - return rc; - } + skcipher_request_zero(req); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } - if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { - CERROR("krb5 header doesn't match\n"); - return -EACCES; - } + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } - return 0; + return 0; } static __u32 gss_wrap_kerberos(struct gss_ctx *gctx, - rawobj_t *gsshdr, - rawobj_t *msg, - int msg_buflen, - rawobj_t *token) + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[3], cipher; - __u8 conf[GSS_MAX_CIPHER_BLOCK]; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; __u8 local_iv[16] = {0}; - int rc = 0; - - LASSERT(ke); - LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); - LASSERT(kctx->kc_keye.kb_tfm == NULL || - ke->ke_conf_size >= - crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm)); - - /* - * final token format: - * --------------------------------------------------- - * | krb5 header | cipher text | checksum (16 bytes) | - * --------------------------------------------------- - */ - - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + u32 major; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * 
--------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 1); + fill_krb5_header(kctx, khdr, 1); - /* generate confounder */ - cfs_get_random_bytes(conf, ke->ke_conf_size); + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); - /* get encryption blocksize. note kc_keye might not associated with - * a tfm, currently only for arcfour-hmac */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } - LASSERT(blocksize <= ke->ke_conf_size); + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize( + kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); /* padding the message */ if (gss_add_padding(msg, msg_buflen, blocksize)) return GSS_S_FAILURE; - /* - * clear text layout for checksum: - * ------------------------------------------------------ - * | confounder | gss header | clear msgs | krb5 header | - * ------------------------------------------------------ - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; - data_desc[1].data = gsshdr->data; - data_desc[1].len = gsshdr->len; - data_desc[2].data = msg->data; - data_desc[2].len = msg->len; - - /* compute checksum */ - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, data_desc, 0, NULL, &cksum)) - return GSS_S_FAILURE; - LASSERT(cksum.len >= ke->ke_hash_size); - - /* - * clear text layout for encryption: - * ----------------------------------------- - * | confounder | clear msgs | krb5 header | - * ----------------------------------------- - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; - data_desc[1].data = msg->data; - data_desc[1].len = msg->len; - data_desc[2].data = (__u8 *) khdr; - data_desc[2].len = sizeof(*khdr); - - /* cipher text will be directly inplace */ + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ cipher.data = (__u8 *)(khdr + 1); - 
cipher.len = token->len - sizeof(*khdr); - LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - rawobj_t arc4_keye; - struct crypto_blkcipher *arc4_tfm; + rawobj_t arc4_keye = RAWOBJ_EMPTY; + struct crypto_sync_skcipher *arc4_tfm; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { CERROR("failed to obtain arc4 enc key\n"); - GOTO(arc4_out, rc = -EACCES); + GOTO(arc4_out_key, rc = -EACCES); } - arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); if (IS_ERR(arc4_tfm)) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); GOTO(arc4_out_key, rc = -EACCES); } - if (crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, - arc4_keye.len)) { - CERROR("failed to set arc4 key, len %d\n", - arc4_keye.len); - GOTO(arc4_out_tfm, rc = -EACCES); - } + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, &cipher, 1); arc4_out_tfm: - crypto_free_blkcipher(arc4_tfm); + crypto_free_sync_skcipher(arc4_tfm); arc4_out_key: - rawobj_free(&arc4_keye); -arc4_out: - do {} while(0); /* just to avoid compile warning */ - } else { + rawobj_free(&arc4_keye); + } else { rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, data_desc, &cipher, 1); - } - - if (rc != 0) { - rawobj_free(&cksum); - return GSS_S_FAILURE; - } - - /* fill in checksum */ - LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); - memcpy((char *)(khdr + 1) + cipher.len, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - rawobj_free(&cksum); + } - /* final token length */ - token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; - return GSS_S_COMPLETE; + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; } static @@ -1075,7 +1129,7 @@ __u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, LASSERT(GET_ENC_KIOV(desc)); LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); for (i = 0; i < desc->bd_iov_count; i++) { LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page); @@ -1101,375 +1155,377 @@ __u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, static __u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - rawobj_t *token, int adj_nob) + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[1], cipher; - __u8 conf[GSS_MAX_CIPHER_BLOCK]; - int rc = 0; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; 
+ int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + u32 major; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(ke); - LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); - - /* - * final token format: - * -------------------------------------------------- - * | krb5 header | head/tail cipher text | checksum | - * -------------------------------------------------- - */ - - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 1); + fill_krb5_header(kctx, khdr, 1); - /* generate confounder */ - cfs_get_random_bytes(conf, ke->ke_conf_size); + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); - /* get encryption blocksize. note kc_keye might not associated with - * a tfm, currently only for arcfour-hmac */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } - /* - * we assume the size of krb5_header (16 bytes) must be n * blocksize. - * the bulk token size would be exactly (sizeof(krb5_header) + - * blocksize + sizeof(krb5_header) + hashsize) - */ - LASSERT(blocksize <= ke->ke_conf_size); - LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); - LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16); - - /* - * clear text layout for checksum: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. 
+ * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksz <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksz + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; /* compute checksum */ if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, khdr, 1, data_desc, desc->bd_iov_count, GET_KIOV(desc), - &cksum)) - return GSS_S_FAILURE; + &cksum, gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); LASSERT(cksum.len >= ke->ke_hash_size); - /* - * clear text layout for encryption: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - * | | | - * ---------- (cipher pages) | - * result token: | | - * ------------------------------------------- - * | krb5 header | cipher text | cipher text | - * ------------------------------------------- - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; cipher.data = (__u8 *)(khdr + 1); - cipher.len = blocksize + sizeof(*khdr); - - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LBUG(); - rc = 0; - } else { - rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, - conf, desc, &cipher, adj_nob); - } + cipher.len = blocksz + sizeof(*khdr); - if (rc != 0) { - rawobj_free(&cksum); - return GSS_S_FAILURE; - } - - /* fill in checksum */ - LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); - memcpy((char *)(khdr + 1) + cipher.len, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - rawobj_free(&cksum); - - /* final token length */ - token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; - return GSS_S_COMPLETE; + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; } static __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, - rawobj_t *gsshdr, - rawobj_t *token, - rawobj_t *msg) + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - unsigned char *tmpbuf; - int blocksize, bodysize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t cipher_in, plain_out; - rawobj_t hash_objs[3]; - int rc = 0; - __u32 major; 
+ struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksz, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; __u8 local_iv[16] = {0}; - LASSERT(ke); + LASSERT(ke); - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 1); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - /* block size */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } - /* expected token layout: - * ---------------------------------------- - * | krb5 header | cipher text | checksum | - * ---------------------------------------- - */ - bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; - if (bodysize % blocksize) { - CERROR("odd bodysize %d\n", bodysize); - return GSS_S_DEFECTIVE_TOKEN; - } + if (bodysize % blocksz) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } - if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { - CERROR("incomplete token: bodysize %d\n", bodysize); - return GSS_S_DEFECTIVE_TOKEN; - } + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } - if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { - CERROR("buffer too small: %u, require %d\n", - msg->len, bodysize - ke->ke_conf_size); - return GSS_S_FAILURE; - } + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } - /* decrypting */ - OBD_ALLOC_LARGE(tmpbuf, bodysize); - if (!tmpbuf) - return GSS_S_FAILURE; + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; - major = GSS_S_FAILURE; + major = GSS_S_FAILURE; cipher_in.data = (__u8 *)(khdr + 1); - cipher_in.len = bodysize; - plain_out.data = tmpbuf; - plain_out.len = bodysize; + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { rawobj_t arc4_keye; - struct crypto_blkcipher *arc4_tfm; + struct crypto_sync_skcipher *arc4_tfm; cksum.data = token->data + token->len - ke->ke_hash_size; cksum.len = ke->ke_hash_size; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { CERROR("failed 
to obtain arc4 enc key\n"); GOTO(arc4_out, rc = -EACCES); } - arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); if (IS_ERR(arc4_tfm)) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); GOTO(arc4_out_key, rc = -EACCES); } - if (crypto_blkcipher_setkey(arc4_tfm, - arc4_keye.data, arc4_keye.len)) { - CERROR("failed to set arc4 key, len %d\n", - arc4_keye.len); - GOTO(arc4_out_tfm, rc = -EACCES); - } + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, &plain_out, 0); arc4_out_tfm: - crypto_free_blkcipher(arc4_tfm); + crypto_free_sync_skcipher(arc4_tfm); arc4_out_key: - rawobj_free(&arc4_keye); + rawobj_free(&arc4_keye); arc4_out: - cksum = RAWOBJ_EMPTY; - } else { + cksum = RAWOBJ_EMPTY; + } else { rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, &cipher_in, &plain_out, 0); - } + } - if (rc != 0) { - CERROR("error decrypt\n"); - goto out_free; - } - LASSERT(plain_out.len == bodysize); - - /* expected clear text layout: - * ----------------------------------------- - * | confounder | clear msgs | krb5 header | - * ----------------------------------------- - */ - - /* verify krb5 header in token is not modified */ - if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), - sizeof(*khdr))) { - CERROR("decrypted krb5 header mismatch\n"); - goto out_free; - } + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); - /* verify checksum, compose clear text as layout: - * ------------------------------------------------------ - * | confounder | gss header | clear msgs | krb5 header | - * ------------------------------------------------------ - */ - hash_objs[0].len = ke->ke_conf_size; - hash_objs[0].data = plain_out.data; - hash_objs[1].len = gsshdr->len; - hash_objs[1].data = gsshdr->data; - hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); - hash_objs[2].data = plain_out.data + ke->ke_conf_size; - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, hash_objs, 0, NULL, &cksum)) - goto out_free; - - LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp((char *)(khdr + 1) + bodysize, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - goto out_free; - } + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum, + gctx->hash_func)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + 
if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } - msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); - memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); - major = GSS_S_COMPLETE; + major = GSS_S_COMPLETE; out_free: - OBD_FREE_LARGE(tmpbuf, bodysize); - rawobj_free(&cksum); - return major; + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; } static __u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - rawobj_t *token, int adj_nob) + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t cipher, plain; - rawobj_t data_desc[1]; - int rc; - __u32 major; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(ke); + LASSERT(ke); - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 1); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } - - /* block size */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - LBUG(); - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } - LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); - - /* - * token format is expected as: - * ----------------------------------------------- - * | krb5 header | head/tail cipher text | cksum | - * ----------------------------------------------- - */ - if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) + - ke->ke_hash_size) { - CERROR("short token size: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - cipher.data = (__u8 *) (khdr + 1); - cipher.len = blocksize + sizeof(*khdr); - plain.data = cipher.data; - plain.len = cipher.len; + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); - rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, - desc, &cipher, &plain, adj_nob); - if (rc) - return GSS_S_DEFECTIVE_TOKEN; + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksz + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return 
GSS_S_DEFECTIVE_TOKEN; + } - /* - * verify checksum, compose clear text as layout: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - */ - data_desc[0].data = plain.data; - data_desc[0].len = blocksize; + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksz; if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, khdr, 1, data_desc, desc->bd_iov_count, GET_KIOV(desc), - &cksum)) + &cksum, gctx->hash_func)) return GSS_S_FAILURE; LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp(plain.data + blocksize + sizeof(*khdr), - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - rawobj_free(&cksum); - return GSS_S_BAD_SIG; - } + if (memcmp(plain.data + blocksz + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } - rawobj_free(&cksum); - return GSS_S_COMPLETE; + rawobj_free(&cksum); + return GSS_S_COMPLETE; } int gss_display_kerberos(struct gss_ctx *ctx, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c index be66ffde266d4..3ee125f1070bf 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -60,6 +59,7 @@ #include "gss_err.h" #include "gss_internal.h" #include "gss_api.h" +#include "gss_crypto.h" static struct list_head registered_mechs = LIST_HEAD_INIT(registered_mechs); static DEFINE_SPINLOCK(registered_mechs_lock); @@ -69,7 +69,7 @@ int lgss_mech_register(struct gss_api_mech *gm) spin_lock(®istered_mechs_lock); list_add(&gm->gm_list, ®istered_mechs); spin_unlock(®istered_mechs_lock); - CWARN("Register %s mechanism\n", gm->gm_name); + CDEBUG(D_SEC, "register %s mechanism\n", gm->gm_name); return 0; } @@ -78,7 +78,7 @@ void lgss_mech_unregister(struct gss_api_mech *gm) spin_lock(®istered_mechs_lock); list_del(&gm->gm_list); spin_unlock(®istered_mechs_lock); - CWARN("Unregister %s mechanism\n", gm->gm_name); + CDEBUG(D_SEC, "Unregister %s mechanism\n", gm->gm_name); } @@ -148,50 +148,52 @@ __u32 lgss_import_sec_context(rawobj_t *input_token, struct gss_api_mech *mech, struct gss_ctx **ctx_id) { - OBD_ALLOC_PTR(*ctx_id); - if (*ctx_id == NULL) - return GSS_S_FAILURE; + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; - (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->hash_func = gss_digest_hash; - LASSERT(mech); - LASSERT(mech->gm_ops); - LASSERT(mech->gm_ops->gss_import_sec_context); - return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); } __u32 
lgss_copy_reverse_context(struct gss_ctx *ctx_id, - struct gss_ctx **ctx_id_new) + struct gss_ctx **ctx_id_new) { - struct gss_api_mech *mech = ctx_id->mech_type; - __u32 major; + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; - LASSERT(mech); + LASSERT(mech); - OBD_ALLOC_PTR(*ctx_id_new); - if (*ctx_id_new == NULL) - return GSS_S_FAILURE; + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; - (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->hash_func = ctx_id->hash_func; - LASSERT(mech); - LASSERT(mech->gm_ops); - LASSERT(mech->gm_ops->gss_copy_reverse_context); + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); - major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); - if (major != GSS_S_COMPLETE) { - lgss_mech_put(mech); - OBD_FREE_PTR(*ctx_id_new); - *ctx_id_new = NULL; - } - return major; + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; } /* * this interface is much simplified, currently we only need endtime. */ __u32 lgss_inquire_context(struct gss_ctx *context_handle, - unsigned long *endtime) + time64_t *endtime) { LASSERT(context_handle); LASSERT(context_handle->mech_type); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c index fddd3ed3443c1..1e946f8ba2aff 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -92,10 +92,10 @@ __u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, static __u32 gss_inquire_context_null(struct gss_ctx *gss_context, - unsigned long *endtime) + time64_t *endtime) { /* quick timeout for testing purposes */ - *endtime = cfs_time_current_sec() + 60; + *endtime = ktime_get_real_seconds() + 60; return GSS_S_COMPLETE; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c index 016d455040972..5e1e7caa1aae6 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -62,7 +62,7 @@ struct rpc_clnt; /* for rpc_pipefs */ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c index fd1b071d6f549..69e92bcb28311 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -39,7 +39,6 @@ #include #include #include -#include #include "gss_err.h" #include "gss_crypto.h" @@ -62,14 +61,14 @@ #define SK_IV_REV_START (1ULL << 63) struct sk_ctx { - __u16 sc_hmac; - __u16 sc_crypt; - __u32 sc_expire; - __u32 sc_host_random; - __u32 sc_peer_random; - atomic64_t sc_iv; - rawobj_t sc_hmac_key; - struct gss_keyblock sc_session_kb; + enum cfs_crypto_crypt_alg sc_crypt; + enum cfs_crypto_hash_alg sc_hmac; + __u32 sc_expire; + __u32 sc_host_random; + __u32 sc_peer_random; + atomic64_t sc_iv; + rawobj_t sc_hmac_key; + struct gss_keyblock sc_session_kb; }; struct sk_hdr { @@ -88,24 +87,6 @@ struct sk_wire { rawobj_t skw_hmac; }; -static struct sk_crypt_type sk_crypt_types[] = { - [SK_CRYPT_AES256_CTR] = { - .sct_name = 
"ctr(aes)", - .sct_bytes = 32, - }, -}; - -static struct sk_hmac_type sk_hmac_types[] = { - [SK_HMAC_SHA256] = { - .sht_name = "hmac(sha256)", - .sht_bytes = 32, - }, - [SK_HMAC_SHA512] = { - .sht_name = "hmac(sha512)", - .sht_bytes = 64, - }, -}; - static inline unsigned long sk_block_mask(unsigned long len, int blocksize) { return (len + blocksize - 1) & (~(blocksize - 1)); @@ -148,22 +129,18 @@ void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv) memcpy(iv, &ctr, sizeof(ctr)); } -static int sk_init_keys(struct sk_ctx *skc) -{ - return gss_keyblock_init(&skc->sc_session_kb, - sk_crypt_types[skc->sc_crypt].sct_name, 0); -} - static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) { char *ptr = inbuf->data; char *end = inbuf->data + inbuf->len; - __u32 tmp; + char sk_hmac[CRYPTO_MAX_ALG_NAME]; + char sk_crypt[CRYPTO_MAX_ALG_NAME]; + u32 tmp; /* see sk_serialize_kctx() for format from userspace side */ /* 1. Version */ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { - CERROR("Failed to read shared key interface version"); + CERROR("Failed to read shared key interface version\n"); return -1; } if (tmp != SK_INTERFACE_VERSION) { @@ -172,49 +149,55 @@ static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) } /* 2. HMAC type */ - if (gss_get_bytes(&ptr, end, &skc->sc_hmac, sizeof(skc->sc_hmac))) { - CERROR("Failed to read HMAC algorithm type"); + if (gss_get_bytes(&ptr, end, &sk_hmac, sizeof(sk_hmac))) { + CERROR("Failed to read HMAC algorithm type\n"); return -1; } - if (skc->sc_hmac <= SK_HMAC_EMPTY || skc->sc_hmac >= SK_HMAC_MAX) { - CERROR("Invalid hmac type: %d\n", skc->sc_hmac); + + skc->sc_hmac = cfs_crypto_hash_alg(sk_hmac); + if (skc->sc_hmac != CFS_HASH_ALG_NULL && + skc->sc_hmac != CFS_HASH_ALG_SHA256 && + skc->sc_hmac != CFS_HASH_ALG_SHA512) { + CERROR("Invalid hmac type: %s\n", sk_hmac); return -1; } /* 3. crypt type */ - if (gss_get_bytes(&ptr, end, &skc->sc_crypt, sizeof(skc->sc_crypt))) { - CERROR("Failed to read crypt algorithm type"); + if (gss_get_bytes(&ptr, end, &sk_crypt, sizeof(sk_crypt))) { + CERROR("Failed to read crypt algorithm type\n"); return -1; } - if (skc->sc_crypt <= SK_CRYPT_EMPTY || skc->sc_crypt >= SK_CRYPT_MAX) { - CERROR("Invalid crypt type: %d\n", skc->sc_crypt); + + skc->sc_crypt = cfs_crypto_crypt_alg(sk_crypt); + if (skc->sc_crypt == CFS_CRYPT_ALG_UNKNOWN) { + CERROR("Invalid crypt type: %s\n", sk_crypt); return -1; } /* 4. expiration time */ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { - CERROR("Failed to read context expiration time"); + CERROR("Failed to read context expiration time\n"); return -1; } - skc->sc_expire = tmp + cfs_time_current_sec(); + skc->sc_expire = tmp + ktime_get_real_seconds(); /* 5. host random is used as nonce for encryption */ if (gss_get_bytes(&ptr, end, &skc->sc_host_random, sizeof(skc->sc_host_random))) { - CERROR("Failed to read host random "); + CERROR("Failed to read host random\n"); return -1; } /* 6. peer random is used as nonce for decryption */ if (gss_get_bytes(&ptr, end, &skc->sc_peer_random, sizeof(skc->sc_peer_random))) { - CERROR("Failed to read peer random "); + CERROR("Failed to read peer random\n"); return -1; } /* 7. HMAC key */ if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) { - CERROR("Failed to read HMAC key"); + CERROR("Failed to read HMAC key\n"); return -1; } if (skc->sc_hmac_key.len <= SK_MIN_SIZE) { @@ -225,7 +208,7 @@ static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) /* 8. 
Session key, can be empty if not using privacy mode */ if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) { - CERROR("Failed to read session key"); + CERROR("Failed to read session key\n"); return -1; } @@ -263,13 +246,14 @@ __u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context) /* Only privacy mode needs to initialize keys */ if (skc->sc_session_kb.kb_key.len > 0) { privacy = true; - if (sk_init_keys(skc)) + if (gss_keyblock_init(&skc->sc_session_kb, + cfs_crypto_crypt_name(skc->sc_crypt), 0)) goto out_err; } gss_context->internal_ctx_id = skc; CDEBUG(D_SEC, "successfully imported sk%s context\n", - privacy ? "pi" : "i"); + privacy ? " (with privacy)" : ""); return GSS_S_COMPLETE; @@ -304,7 +288,9 @@ __u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, /* Only privacy mode needs to initialize keys */ if (skc_new->sc_session_kb.kb_key.len > 0) - if (sk_init_keys(skc_new)) + if (gss_keyblock_init(&skc_new->sc_session_kb, + cfs_crypto_crypt_name(skc_new->sc_crypt), + 0)) goto out_err; gss_context_new->internal_ctx_id = skc_new; @@ -319,7 +305,7 @@ __u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, static __u32 gss_inquire_context_sk(struct gss_ctx *gss_context, - unsigned long *endtime) + time64_t *endtime) { struct sk_ctx *skc = gss_context->internal_ctx_id; @@ -328,24 +314,32 @@ __u32 gss_inquire_context_sk(struct gss_ctx *gss_context, } static -__u32 sk_make_hmac(char *alg_name, rawobj_t *key, int msg_count, rawobj_t *msgs, - int iov_count, lnet_kiov_t *iovs, rawobj_t *token) +u32 sk_make_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, int msg_count, + rawobj_t *msgs, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token, digest_hash hash_func) { - struct crypto_hash *tfm; - int rc; + struct ahash_request *req; + int rc2, rc; - tfm = crypto_alloc_hash(alg_name, 0, 0); - if (IS_ERR(tfm)) - return GSS_S_FAILURE; + req = cfs_crypto_hash_init(algo, key->data, key->len); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto out_init_failed; + } - rc = GSS_S_FAILURE; - LASSERT(token->len >= crypto_hash_digestsize(tfm)); - if (!gss_digest_hmac(tfm, key, NULL, msg_count, msgs, iov_count, iovs, - token)) - rc = GSS_S_COMPLETE; - crypto_free_hash(tfm); - return rc; + if (hash_func) + rc2 = hash_func(req, NULL, msg_count, msgs, iov_count, + iovs); + else + rc2 = gss_digest_hash(req, NULL, msg_count, msgs, iov_count, + iovs); + + rc = cfs_crypto_hash_final(req, token->data, &token->len); + if (!rc && rc2) + rc = rc2; +out_init_failed: + return rc ? 
GSS_S_FAILURE : GSS_S_COMPLETE; } static @@ -357,20 +351,22 @@ __u32 gss_get_mic_sk(struct gss_ctx *gss_context, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - return sk_make_hmac(sk_hmac_types[skc->sc_hmac].sht_name, + + return sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, message_count, messages, - iov_count, iovs, token); + iov_count, iovs, token, gss_context->hash_func); } static -__u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, - rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, - rawobj_t *token) +u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, + int message_count, rawobj_t *messages, + int iov_count, lnet_kiov_t *iovs, + rawobj_t *token, digest_hash hash_func) { rawobj_t checksum = RAWOBJ_EMPTY; __u32 rc = GSS_S_FAILURE; - checksum.len = sht->sht_bytes; + checksum.len = cfs_crypto_hash_digestsize(algo); if (token->len < checksum.len) { CDEBUG(D_SEC, "Token received too short, expected %d " "received %d\n", token->len, checksum.len); @@ -381,8 +377,9 @@ __u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, if (!checksum.data) return rc; - if (sk_make_hmac(sht->sht_name, key, message_count, messages, - iov_count, iovs, &checksum)) { + if (sk_make_hmac(algo, key, message_count, + messages, iov_count, iovs, &checksum, + hash_func)) { CDEBUG(D_SEC, "Failed to create checksum to validate\n"); goto cleanup; } @@ -405,23 +402,19 @@ __u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, * to decrypt up to the number of bytes actually specified from the sender * (bd_nob) otherwise the calulated HMAC will be incorrect. */ static -__u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, rawobj_t *key, - int msgcnt, rawobj_t *msgs, int iovcnt, - lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) +u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) { rawobj_t checksum = RAWOBJ_EMPTY; - struct crypto_hash *tfm; - struct hash_desc desc = { - .tfm = NULL, - .flags = 0, - }; + struct ahash_request *req; struct scatterlist sg[1]; + int rc = 0; struct sg_table sgt; int bytes; int i; - int rc = GSS_S_FAILURE; - checksum.len = sht->sht_bytes; + checksum.len = cfs_crypto_hash_digestsize(sc_hmac); if (token->len < checksum.len) { CDEBUG(D_SEC, "Token received too short, expected %d " "received %d\n", token->len, checksum.len); @@ -430,33 +423,24 @@ __u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, rawobj_t *key, OBD_ALLOC_LARGE(checksum.data, checksum.len); if (!checksum.data) - return rc; + return GSS_S_FAILURE; - tfm = crypto_alloc_hash(sht->sht_name, 0, 0); - if (IS_ERR(tfm)) + req = cfs_crypto_hash_init(sc_hmac, key->data, key->len); + if (IS_ERR(req)) { + rc = GSS_S_FAILURE; goto cleanup; - - desc.tfm = tfm; - - LASSERT(token->len >= crypto_hash_digestsize(tfm)); - - rc = crypto_hash_setkey(tfm, key->data, key->len); - if (rc) - goto hash_cleanup; - - rc = crypto_hash_init(&desc); - if (rc) - goto hash_cleanup; + } for (i = 0; i < msgcnt; i++) { - if (msgs[i].len == 0) + if (!msgs[i].len) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); if (rc != 0) goto hash_cleanup; - rc = crypto_hash_update(&desc, sg, msgs[i].len); + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); if (rc) { gss_teardown_sgtable(&sgt); goto hash_cleanup; @@ -475,22 +459,21 @@ __u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, 
rawobj_t *key, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, bytes, iovs[i].kiov_offset); - rc = crypto_hash_update(&desc, sg, bytes); + ahash_request_set_crypt(req, sg, NULL, bytes); + rc = crypto_ahash_update(req); if (rc) goto hash_cleanup; } - crypto_hash_final(&desc, checksum.data); +hash_cleanup: + cfs_crypto_hash_final(req, checksum.data, &checksum.len); + if (rc) + goto cleanup; - if (memcmp(token->data, checksum.data, checksum.len)) { + if (memcmp(token->data, checksum.data, checksum.len)) rc = GSS_S_BAD_SIG; - goto hash_cleanup; - } - - rc = GSS_S_COMPLETE; - -hash_cleanup: - crypto_free_hash(tfm); + else + rc = GSS_S_COMPLETE; cleanup: OBD_FREE_LARGE(checksum.data, checksum.len); @@ -507,8 +490,10 @@ __u32 gss_verify_mic_sk(struct gss_ctx *gss_context, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - return sk_verify_hmac(&sk_hmac_types[skc->sc_hmac], &skc->sc_hmac_key, - message_count, messages, iov_count, iovs, token); + + return sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token, + gss_context->hash_func); } static @@ -517,7 +502,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr skh; rawobj_t msgbufs[3]; @@ -526,7 +511,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, LASSERT(skc->sc_session_kb.kb_tfm); - blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); if (gss_add_padding(message, message_buffer_length, blocksize)) return GSS_S_FAILURE; @@ -541,7 +526,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, &skw.skw_cipher, 1)) return GSS_S_FAILURE; @@ -552,9 +537,10 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, msgbufs[2] = skw.skw_cipher; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; - if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 3, msgbufs, 0, - NULL, &skw.skw_hmac)) + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, + 3, msgbufs, 0, NULL, &skw.skw_hmac, + gss_context->hash_func)) return GSS_S_FAILURE; token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; @@ -567,7 +553,7 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, rawobj_t *token, rawobj_t *message) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr *skh; rawobj_t msgbufs[3]; @@ -577,17 +563,17 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, LASSERT(skc->sc_session_kb.kb_tfm); - if (token->len < sizeof(skh) + sht->sht_bytes) + if (token->len < sizeof(skh) + sht_bytes) return GSS_S_DEFECTIVE_TOKEN; skw.skw_header.data = token->data; skw.skw_header.len = sizeof(struct sk_hdr); 
skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; + skw.skw_hmac.len = sht_bytes; - blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); if (skw.skw_cipher.len % blocksize != 0) return GSS_S_DEFECTIVE_TOKEN; @@ -600,8 +586,8 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, msgbufs[0] = skw.skw_header; msgbufs[1] = *gss_header; msgbufs[2] = skw.skw_cipher; - rc = sk_verify_hmac(sht, &skc->sc_hmac_key, 3, msgbufs, 0, NULL, - &skw.skw_hmac); + rc = sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, 3, msgbufs, + 0, NULL, &skw.skw_hmac, gss_context->hash_func); if (rc) return rc; @@ -623,7 +609,7 @@ __u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, int i; LASSERT(skc->sc_session_kb.kb_tfm); - blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); for (i = 0; i < desc->bd_iov_count; i++) { if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { @@ -641,27 +627,26 @@ __u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, return GSS_S_COMPLETE; } -static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, +static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, int adj_nob) { - struct blkcipher_desc cdesc = { - .tfm = tfm, - .info = iv, - .flags = 0, - }; struct scatterlist ptxt; struct scatterlist ctxt; int blocksize; int i; int rc; int nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - blocksize = crypto_blkcipher_blocksize(tfm); + blocksize = crypto_sync_skcipher_blocksize(tfm); sg_init_table(&ptxt, 1); sg_init_table(&ctxt, 1); + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + for (i = 0; i < desc->bd_iov_count; i++) { sg_set_page(&ptxt, BD_GET_KIOV(desc, i).kiov_page, sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, @@ -675,13 +660,15 @@ static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, BD_GET_ENC_KIOV(desc, i).kiov_offset = ctxt.offset; BD_GET_ENC_KIOV(desc, i).kiov_len = ctxt.length; - rc = crypto_blkcipher_encrypt_iv(&cdesc, &ctxt, &ptxt, - ptxt.length); + skcipher_request_set_crypt(req, &ptxt, &ctxt, ptxt.length, iv); + rc = crypto_skcipher_encrypt_iv(req, &ctxt, &ptxt, ptxt.length); if (rc) { CERROR("failed to encrypt page: %d\n", rc); + skcipher_request_zero(req); return rc; } } + skcipher_request_zero(req); if (adj_nob) desc->bd_nob = nob; @@ -689,15 +676,10 @@ static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, return 0; } -static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, +static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, int adj_nob) { - struct blkcipher_desc cdesc = { - .tfm = tfm, - .info = iv, - .flags = 0, - }; struct scatterlist ptxt; struct scatterlist ctxt; int blocksize; @@ -705,17 +687,21 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, int rc; int pnob = 0; int cnob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); sg_init_table(&ptxt, 1); sg_init_table(&ctxt, 1); - blocksize = crypto_blkcipher_blocksize(tfm); + blocksize = crypto_sync_skcipher_blocksize(tfm); if 
(desc->bd_nob_transferred % blocksize != 0) { CERROR("Transfer not a multiple of block size: %d\n", desc->bd_nob_transferred); return GSS_S_DEFECTIVE_TOKEN; } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; i++) { lnet_kiov_t *piov = &BD_GET_KIOV(desc, i); @@ -724,6 +710,7 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, if (ciov->kiov_offset % blocksize != 0 || ciov->kiov_len % blocksize != 0) { CERROR("Invalid bulk descriptor vector\n"); + skcipher_request_zero(req); return GSS_S_DEFECTIVE_TOKEN; } @@ -747,6 +734,7 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, if (ciov->kiov_len + cnob > desc->bd_nob_transferred || piov->kiov_len > ciov->kiov_len) { CERROR("Invalid decrypted length\n"); + skcipher_request_zero(req); return GSS_S_FAILURE; } } @@ -765,10 +753,11 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, if (piov->kiov_len % blocksize == 0) sg_assign_page(&ptxt, piov->kiov_page); - rc = crypto_blkcipher_decrypt_iv(&cdesc, &ptxt, &ctxt, - ctxt.length); + skcipher_request_set_crypt(req, &ctxt, &ptxt, ptxt.length, iv); + rc = crypto_skcipher_decrypt_iv(req, &ptxt, &ctxt, ptxt.length); if (rc) { CERROR("Decryption failed for page: %d\n", rc); + skcipher_request_zero(req); return GSS_S_FAILURE; } @@ -783,6 +772,7 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, cnob += ciov->kiov_len; pnob += piov->kiov_len; } + skcipher_request_zero(req); /* if needed, clear up the rest unused iovs */ if (adj_nob) @@ -810,7 +800,7 @@ __u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, int adj_nob) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr skh; __u8 local_iv[SK_IV_SIZE]; @@ -827,15 +817,16 @@ __u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, desc, &skw.skw_cipher, adj_nob)) return GSS_S_FAILURE; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; - if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 1, &skw.skw_cipher, - desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac)) + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac, + gss_context->hash_func)) return GSS_S_FAILURE; return GSS_S_COMPLETE; @@ -847,7 +838,7 @@ __u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, rawobj_t *token, int adj_nob) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr *skh; __u8 local_iv[SK_IV_SIZE]; @@ -855,25 +846,25 @@ __u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, LASSERT(skc->sc_session_kb.kb_tfm); - if (token->len < sizeof(skh) + sht->sht_bytes) + if (token->len < sizeof(skh) + sht_bytes) return GSS_S_DEFECTIVE_TOKEN; skw.skw_header.data = token->data; skw.skw_header.len = sizeof(struct sk_hdr); 
skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; + skw.skw_hmac.len = sht_bytes; skh = (struct sk_hdr *)skw.skw_header.data; rc = sk_verify_header(skh); if (rc != GSS_S_COMPLETE) return rc; - rc = sk_verify_bulk_hmac(&sk_hmac_types[skc->sc_hmac], - &skc->sc_hmac_key, 1, &skw.skw_cipher, - desc->bd_iov_count, GET_ENC_KIOV(desc), - desc->bd_nob, &skw.skw_hmac); + rc = sk_verify_bulk_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, + &skw.skw_cipher, desc->bd_iov_count, + GET_ENC_KIOV(desc), desc->bd_nob, + &skw.skw_hmac); if (rc) return rc; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c index 4798711dbe983..2202e3f56f8c5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -60,7 +60,6 @@ #include #include #include -#include #include #include #include @@ -69,12 +68,15 @@ #include "gss_err.h" #include "gss_internal.h" #include "gss_api.h" +#include "gss_crypto.h" #define GSS_SVC_UPCALL_TIMEOUT (20) static spinlock_t __ctx_index_lock; static __u64 __ctx_index; +unsigned int krb5_allow_old_client_csum; + __u64 gss_get_next_ctx_index(void) { __u64 idx; @@ -160,6 +162,18 @@ static struct cache_detail rsi_cache; static struct rsi *rsi_update(struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct rsi *item); +#ifdef HAVE_CACHE_DETAIL_WRITERS +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->writers); +} +#else +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->readers); +} +#endif + static inline int rsi_hash(struct rsi *item) { return hash_mem((char *)item->in_handle.data, item->in_handle.len, @@ -299,10 +313,9 @@ static struct cache_head *rsi_alloc(void) static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { char *buf = mesg; - char *ep; int len; struct rsi rsii, *rsip = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; ENTRY; @@ -341,18 +354,21 @@ static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) if (len <= 0) goto out; - /* major */ - rsii.major_status = simple_strtol(buf, &ep, 10); - if (*ep) - goto out; + /* major */ + status = kstrtoint(buf, 10, &rsii.major_status); + if (status) + goto out; - /* minor */ - len = qword_get(&mesg, buf, mlen); - if (len <= 0) - goto out; - rsii.minor_status = simple_strtol(buf, &ep, 10); - if (*ep) - goto out; + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) { + status = -EINVAL; + goto out; + } + + status = kstrtoint(buf, 10, &rsii.minor_status); + if (status) + goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); @@ -544,7 +560,7 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) char *buf = mesg; int len, rv, tmp_int; struct rsc rsci, *rscp = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; @@ -649,8 +665,7 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) /* currently the expiry time passed down from user-space * is invalid, here we retrive it from mech. 
*/ - if (lgss_inquire_context(rsci.ctx.gsc_mechctx, - (unsigned long *)&ctx_expiry)) { + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { CERROR("unable to get expire time, drop it\n"); goto out; } @@ -720,85 +735,6 @@ static struct rsc *rsc_update(struct rsc *new, struct rsc *old) * rsc cache flush * ****************************************/ -typedef int rsc_entry_match(struct rsc *rscp, long data); - -static void rsc_flush(rsc_entry_match *match, long data) -{ -#ifdef HAVE_CACHE_HEAD_HLIST - struct cache_head *ch = NULL; - struct hlist_head *head; -#else - struct cache_head **ch; -#endif - struct rsc *rscp; - int n; - ENTRY; - - write_lock(&rsc_cache.hash_lock); - for (n = 0; n < RSC_HASHMAX; n++) { -#ifdef HAVE_CACHE_HEAD_HLIST - head = &rsc_cache.hash_table[n]; - hlist_for_each_entry(ch, head, cache_list) { - rscp = container_of(ch, struct rsc, h); -#else - for (ch = &rsc_cache.hash_table[n]; *ch;) { - rscp = container_of(*ch, struct rsc, h); -#endif - - if (!match(rscp, data)) { -#ifndef HAVE_CACHE_HEAD_HLIST - ch = &((*ch)->next); -#endif - continue; - } - - /* it seems simply set NEGATIVE doesn't work */ -#ifdef HAVE_CACHE_HEAD_HLIST - hlist_del_init(&ch->cache_list); -#else - *ch = (*ch)->next; - rscp->h.next = NULL; -#endif - cache_get(&rscp->h); - set_bit(CACHE_NEGATIVE, &rscp->h.flags); - COMPAT_RSC_PUT(&rscp->h, &rsc_cache); - rsc_cache.entries--; - } - } - write_unlock(&rsc_cache.hash_lock); - EXIT; -} - -static int match_uid(struct rsc *rscp, long uid) -{ - if ((int) uid == -1) - return 1; - return ((int) rscp->ctx.gsc_uid == (int) uid); -} - -static int match_target(struct rsc *rscp, long target) -{ - return (rscp->target == (struct obd_device *) target); -} - -static inline void rsc_flush_uid(int uid) -{ - if (uid == -1) - CWARN("flush all gss contexts...\n"); - - rsc_flush(match_uid, (long) uid); -} - -static inline void rsc_flush_target(struct obd_device *target) -{ - rsc_flush(match_target, (long) target); -} - -void gss_secsvc_flush(struct obd_device *target) -{ - rsc_flush_target(target); -} - static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) { struct rsc rsci; @@ -822,7 +758,7 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, struct gss_cli_ctx *gctx) { struct rsc rsci, *rscp = NULL; - unsigned long ctx_expiry; + time64_t ctx_expiry; __u32 major; int rc; ENTRY; @@ -846,7 +782,7 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, CERROR("unable to get expire time, drop it\n"); GOTO(out, rc = -EINVAL); } - rsci.h.expiry_time = (time_t) ctx_expiry; + rsci.h.expiry_time = ctx_expiry; switch (imp->imp_obd->u.cli.cl_sp_to) { case LUSTRE_SP_MDT: @@ -857,6 +793,13 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, break; case LUSTRE_SP_CLI: rsci.ctx.gsc_usr_root = 1; + break; + case LUSTRE_SP_MGS: + /* by convention, all 3 set to 1 means MGS */ + rsci.ctx.gsc_usr_mds = 1; + rsci.ctx.gsc_usr_oss = 1; + rsci.ctx.gsc_usr_root = 1; + break; default: break; } @@ -884,15 +827,15 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) { - const cfs_time_t expire = 20; - struct rsc *rscp; + const time64_t expire = 20; + struct rsc *rscp; rscp = gss_svc_searchbyctx(handle); if (rscp) { CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", &rscp->ctx, rscp); - rscp->h.expiry_time = cfs_time_current_sec() + expire; + rscp->h.expiry_time = ktime_get_real_seconds() + expire; COMPAT_RSC_PUT(&rscp->h, &rsc_cache); } return 0; @@ -946,7 +889,11 @@ int 
gss_svc_upcall_handle_init(struct ptlrpc_request *req, memset(&rsikey, 0, sizeof(rsikey)); rsikey.lustre_svc = lustre_svc; - rsikey.nid = (__u64) req->rq_peer.nid; + /* In case of MR, rq_peer is not the NID from which request is received, + * but primary NID of peer. + * So we need rq_source, which contains the NID actually in use. + */ + rsikey.nid = (__u64) req->rq_source.nid; nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, sizeof(rsikey.nm_name)); @@ -991,11 +938,11 @@ int gss_svc_upcall_handle_init(struct ptlrpc_request *req, if (first_check) { first_check = 0; - read_lock(&rsi_cache.hash_lock); + cache_read_lock(&rsi_cache); valid = test_bit(CACHE_VALID, &rsip->h.flags); if (valid == 0) set_current_state(TASK_INTERRUPTIBLE); - read_unlock(&rsi_cache.hash_lock); + cache_read_unlock(&rsi_cache); if (valid == 0) { unsigned long jiffies; @@ -1044,6 +991,20 @@ int gss_svc_upcall_handle_init(struct ptlrpc_request *req, grctx->src_ctx = &rsci->ctx; } + if (gw->gw_flags & LUSTRE_GSS_PACK_KCSUM) { + grctx->src_ctx->gsc_mechctx->hash_func = gss_digest_hash; + } else if (!strcmp(grctx->src_ctx->gsc_mechctx->mech_type->gm_name, + "krb5") && + !krb5_allow_old_client_csum) { + CWARN("%s: deny connection from '%s' due to missing 'krb_csum' feature, set 'sptlrpc.gss.krb5_allow_old_client_csum=1' to allow, but recommend client upgrade: rc = %d\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid), + -EPROTO); + GOTO(out, rc = SECSVC_DROP); + } else { + grctx->src_ctx->gsc_mechctx->hash_func = + gss_digest_hash_compat; + } + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { CERROR("failed duplicate reverse handle\n"); GOTO(out, rc); @@ -1172,17 +1133,18 @@ int __init gss_init_svc_upcall(void) /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open * the init upcall channel, otherwise there's big chance that the first * upcall issued before the channel be opened thus nfsv4 cache code will - * drop the request direclty, thus lead to unnecessary recovery time. - * here we wait at miximum 1.5 seconds. */ + * drop the request directly, thus lead to unnecessary recovery time. + * Here we wait at minimum 1.5 seconds. 
+ */ for (i = 0; i < 6; i++) { - if (atomic_read(&rsi_cache.readers) > 0) + if (channel_users(&rsi_cache) > 0) break; set_current_state(TASK_UNINTERRUPTIBLE); - LASSERT(msecs_to_jiffies(MSEC_PER_SEC) >= 4); + LASSERT(msecs_to_jiffies(MSEC_PER_SEC / 4) > 0); schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC / 4)); } - if (atomic_read(&rsi_cache.readers) == 0) + if (channel_users(&rsi_cache) == 0) CWARN("Init channel is not opened by lsvcgssd, following " "request might be dropped until lsvcgssd is active\n"); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c index 610f0b38c8d4f..1335ffd466ff3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -133,7 +132,29 @@ static const struct file_operations gss_proc_secinit = { .write = gss_proc_write_secinit, }; -static struct lprocfs_vars gss_lprocfs_vars[] = { +int sptlrpc_krb5_allow_old_client_csum_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", krb5_allow_old_client_csum); + return 0; +} + +ssize_t sptlrpc_krb5_allow_old_client_csum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + krb5_allow_old_client_csum = val; + return count; +} +LPROC_SEQ_FOPS(sptlrpc_krb5_allow_old_client_csum); + +static struct ldebugfs_vars gss_debugfs_vars[] = { { .name = "replays", .fops = &gss_proc_oos_fops }, { .name = "init_channel", @@ -142,6 +163,12 @@ static struct lprocfs_vars gss_lprocfs_vars[] = { { NULL } }; +static struct lprocfs_vars gss_lprocfs_vars[] = { + { .name = "krb5_allow_old_client_csum", + .fops = &sptlrpc_krb5_allow_old_client_csum_fops }, + { NULL } +}; + /* * for userspace helper lgss_keyring. * @@ -159,14 +186,14 @@ static ssize_t gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { + unsigned int val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = kstrtouint_from_user(buffer, count, 0, &val); if (rc < 0) return rc; - if (val < 0 || val > 4) + if (val > 4) return -ERANGE; gss_lk_debug_level = val; @@ -175,7 +202,7 @@ gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, } LPROC_SEQ_FOPS(gss_lk_proc_dl); -static struct lprocfs_vars gss_lk_lprocfs_vars[] = { +static struct ldebugfs_vars gss_lk_debugfs_vars[] = { { .name = "debug_level", .fops = &gss_lk_proc_dl_fops }, { NULL } @@ -209,7 +236,7 @@ int gss_init_lproc(void) } gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root, - gss_lk_lprocfs_vars, NULL); + gss_lk_ldebugfs_vars, NULL); if (IS_ERR(gss_proc_lk)) { rc = PTR_ERR(gss_proc_lk); gss_proc_lk = NULL; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c index bee52f3751356..845bfbca44d51 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -3,7 +3,7 @@ * * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. 
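The lproc_gss.c changes above replace lprocfs_str_to_s64() parsing with the kernel's kstrtouint_from_user()/kstrtobool_from_user() helpers and add the krb5_allow_old_client_csum tunable. A minimal sketch of that write-handler shape follows; demo_level and the demo_* function names are stand-ins, not Lustre symbols.

#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>

static unsigned int demo_level;

static int demo_level_seq_show(struct seq_file *m, void *data)
{
	seq_printf(m, "%u\n", demo_level);
	return 0;
}

static ssize_t demo_level_seq_write(struct file *file,
				    const char __user *buffer,
				    size_t count, loff_t *off)
{
	unsigned int val;
	int rc;

	/* copy from userspace and parse in one call; rejects trailing
	 * garbage and overflow, unlike the old simple_strtol() pattern */
	rc = kstrtouint_from_user(buffer, count, 0, &val);
	if (rc < 0)
		return rc;

	if (val > 4)
		return -ERANGE;

	demo_level = val;
	return count;
}

For a boolean switch such as krb5_allow_old_client_csum, kstrtobool_from_user() is the analogous helper: it accepts "y/n" and "1/0" style input, which is why the new seq_write handler above uses it rather than the unsigned parser.
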
* * Author: Eric Mei */ @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -309,11 +308,11 @@ int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) if (!ctx->cc_early_expire) clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); - CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n", + CWARN("ctx %p(%u->%s) get expired: %lld(%+llds)\n", ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), ctx->cc_expire, ctx->cc_expire == 0 ? 0 : - cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + ctx->cc_expire - ktime_get_real_seconds()); sptlrpc_cli_ctx_wakeup(ctx); return 1; @@ -336,7 +335,7 @@ int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) return 0; /* check real expiration */ - if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec())) + if (ctx->cc_expire > ktime_get_real_seconds()) return 0; cli_ctx_expire(ctx); @@ -345,8 +344,8 @@ int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) { - struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; - unsigned long ctx_expiry; + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + time64_t ctx_expiry; if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { CERROR("ctx %p(%u): unable to inquire, expire it now\n", @@ -365,17 +364,17 @@ void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) if (sec_is_reverse(ctx->cc_sec)) { CWARN("server installed reverse ctx %p idx %#llx, " - "expiry %lu(%+lds)\n", ctx, + "expiry %lld(%+llds)\n", ctx, gss_handle_to_u64(&gctx->gc_handle), ctx->cc_expire, - cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + ctx->cc_expire - ktime_get_real_seconds()); } else { CWARN("client refreshed ctx %p idx %#llx (%u->%s), " - "expiry %lu(%+lds)\n", ctx, + "expiry %lld(%+llds)\n", ctx, gss_handle_to_u64(&gctx->gc_handle), ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), ctx->cc_expire, - cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + ctx->cc_expire - ktime_get_real_seconds()); /* install reverse svc ctx for root context */ if (ctx->cc_vcred.vc_uid == 0) @@ -1103,6 +1102,9 @@ int gss_sec_create_common(struct gss_sec *gsec, sec->ps_import = class_import_get(imp); spin_lock_init(&sec->ps_lock); INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_sepol_mtime = ktime_set(0, 0); + sec->ps_sepol_checknext = ktime_set(0, 0); + sec->ps_sepol[0] = '\0'; if (!svcctx) { sec->ps_gc_interval = GSS_GC_INTERVAL; @@ -2055,16 +2057,17 @@ int gss_svc_handle_init(struct ptlrpc_request *req, if (rc != SECSVC_OK) RETURN(rc); - if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || - grctx->src_ctx->gsc_usr_root) - CWARN("create svc ctx %p: user from %s authenticated as %s\n", - grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), - grctx->src_ctx->gsc_usr_mds ? "mds" : - (grctx->src_ctx->gsc_usr_oss ? "oss" : "root")); - else - CWARN("create svc ctx %p: accept user %u from %s\n", - grctx->src_ctx, grctx->src_ctx->gsc_uid, - libcfs_nid2str(req->rq_peer.nid)); + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_root ? "root" : + (grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? 
"oss" : "null"))); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { if (reqbuf->lm_bufcount < 4) { diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c index 827a989f1e139..46d92bf4ed2d0 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,10 +56,10 @@ struct ptlrpc_connect_async_args { /** * Updates import \a imp current state to provided \a state value - * Helper function. Must be called under imp_lock. + * Helper function. */ -static void __import_set_state(struct obd_import *imp, - enum lustre_imp_state state) +static void import_set_state_nolock(struct obd_import *imp, + enum lustre_imp_state state) { switch (state) { case LUSTRE_IMP_CLOSED: @@ -72,7 +72,20 @@ static void __import_set_state(struct obd_import *imp, break; default: imp->imp_replay_state = LUSTRE_IMP_REPLAY; + break; } + + /* A CLOSED import should remain so. */ + if (imp->imp_state == LUSTRE_IMP_CLOSED) + return; + + if (imp->imp_state != LUSTRE_IMP_NEW) { + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", + imp, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state), + ptlrpc_import_state_name(state)); + } + imp->imp_state = state; imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = @@ -81,28 +94,17 @@ static void __import_set_state(struct obd_import *imp, IMP_STATE_HIST_LEN; } -/* A CLOSED import should remain so. */ -#define IMPORT_SET_STATE_NOLOCK(imp, state) \ -do { \ - if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ - CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ - imp, obd2cli_tgt(imp->imp_obd), \ - ptlrpc_import_state_name(imp->imp_state), \ - ptlrpc_import_state_name(state)); \ - __import_set_state(imp, state); \ - } \ -} while(0) - -#define IMPORT_SET_STATE(imp, state) \ -do { \ - spin_lock(&imp->imp_lock); \ - IMPORT_SET_STATE_NOLOCK(imp, state); \ - spin_unlock(&imp->imp_lock); \ -} while(0) +static void import_set_state(struct obd_import *imp, + enum lustre_imp_state new_state) +{ + spin_lock(&imp->imp_lock); + import_set_state_nolock(imp, new_state); + spin_unlock(&imp->imp_lock); +} void ptlrpc_import_enter_resend(struct obd_import *imp) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); } EXPORT_SYMBOL(ptlrpc_import_enter_resend); @@ -146,6 +148,21 @@ void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) *uuid_len -= strlen(UUID_STR); } +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) +{ + ENTRY; + + assert_spin_locked(&imp->imp_lock); + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + + ptlrpc_abort_inflight(imp); + + EXIT; +} + /** * Returns true if import was FULL, false if import was already not * connected. 
@@ -156,8 +173,10 @@ void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) * bulk requests) and if one has already caused a reconnection * (increasing the import->conn_cnt) the older failure should * not also cause a reconnection. If zero it forces a reconnect. + * @invalid - set import invalid flag */ -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) +int ptlrpc_set_import_discon(struct obd_import *imp, + __u32 conn_cnt, bool invalid) { int rc = 0; @@ -167,31 +186,43 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { char *target_start; int target_len; + bool inact = false; deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); if (imp->imp_replayable) { LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " "lost; in progress operations using this " "service will wait for recovery to complete\n", imp->imp_obd->obd_name, target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } else { - LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " - "%.*s (at %s) was lost; in progress " - "operations using this service will fail\n", - imp->imp_obd->obd_name, - target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + obd_import_nid2str(imp)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + if (invalid) { + CDEBUG(D_HA, "import %s@%s for %s not " + "replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } + } spin_unlock(&imp->imp_lock); if (obd_dump_on_timeout) libcfs_debug_dumplog(); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); rc = 1; } else { spin_unlock(&imp->imp_lock); @@ -206,23 +237,6 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) return rc; } -/* Must be called with imp_lock held! */ -static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) -{ - ENTRY; - assert_spin_locked(&imp->imp_lock); - - CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); - imp->imp_invalid = 1; - imp->imp_generation++; - spin_unlock(&imp->imp_lock); - - ptlrpc_abort_inflight(imp); - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); - - EXIT; -} - /* * This acts as a barrier; all existing requests are rejected, and * no new requests will be accepted until the import is valid again. 
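ptlrpc_set_import_discon() above now deactivates a non-replayable import while still holding imp_lock but records that fact in a local flag, and fires the IMP_EVENT_INACTIVE callback only after the lock is dropped, since obd_import_event() may sleep. A small sketch of that defer-until-unlock pattern, using hypothetical demo_* names:

#include <linux/spinlock.h>
#include <linux/kernel.h>

struct demo_import {
	spinlock_t	lock;
	bool		invalid;
	unsigned int	generation;
};

/* Stand-in for a notification hook that may sleep and therefore must
 * never be called with a spinlock held. */
static void demo_notify_inactive(struct demo_import *imp)
{
	might_sleep();
	/* ... deliver an IMP_EVENT_INACTIVE-style event ... */
}

static void demo_deactivate_nolock(struct demo_import *imp)
{
	assert_spin_locked(&imp->lock);
	imp->invalid = true;
	imp->generation++;
}

static int demo_disconnect(struct demo_import *imp, bool invalidate)
{
	bool inact = false;

	spin_lock(&imp->lock);
	if (invalidate) {
		/* flip state under the lock ... */
		demo_deactivate_nolock(imp);
		inact = true;
	}
	spin_unlock(&imp->lock);

	/* ... but run the sleeping callback only after unlocking */
	if (inact)
		demo_notify_inactive(imp);

	return inact ? 1 : 0;
}
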
@@ -230,14 +244,17 @@ static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) void ptlrpc_deactivate_import(struct obd_import *imp) { spin_lock(&imp->imp_lock); - ptlrpc_deactivate_and_unlock_import(imp); + ptlrpc_deactivate_import_nolock(imp); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); } EXPORT_SYMBOL(ptlrpc_deactivate_import); -static unsigned int -ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) +static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, + time64_t now) { - long dl; + time64_t dl; if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || (req->rq_phase == RQ_PHASE_BULK) || @@ -258,12 +275,12 @@ ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) return dl - now; } -static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +static time64_t ptlrpc_inflight_timeout(struct obd_import *imp) { time64_t now = ktime_get_real_seconds(); struct list_head *tmp, *n; struct ptlrpc_request *req; - unsigned int timeout = 0; + time64_t timeout = 0; spin_lock(&imp->imp_lock); list_for_each_safe(tmp, n, &imp->imp_sending_list) { @@ -285,7 +302,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) struct list_head *tmp, *n; struct ptlrpc_request *req; struct l_wait_info lwi; - unsigned int timeout; + time64_t timeout; int rc; atomic_inc(&imp->imp_inval_count); @@ -305,30 +322,35 @@ void ptlrpc_invalidate_import(struct obd_import *imp) * unlink. We can't do anything before that because there is really * no guarantee that some rdma transfer is not in progress right now. */ do { + long timeout_jiffies; + /* Calculate max timeout for waiting on rpcs to error * out. Use obd_timeout if calculated value is smaller - * than it. */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { - timeout = ptlrpc_inflight_timeout(imp); - timeout += timeout / 3; - - if (timeout == 0) - timeout = obd_timeout; - } else { - /* decrease the interval to increase race condition */ - timeout = 1; - } + * than it. + */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += div_u64(timeout, 3); + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } - CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n", - timeout); + CDEBUG(D_RPCTRACE, "Sleeping %llds for inflight to error out\n", + timeout); /* Wait for all requests to error out and call completion * callbacks. Cap it at obd_timeout -- these should all - * have been locally cancelled by ptlrpc_abort_inflight. */ - lwi = LWI_TIMEOUT_INTERVAL( - cfs_timeout_cap(cfs_time_seconds(timeout)), - (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, - NULL, NULL); + * have been locally cancelled by ptlrpc_abort_inflight. + */ + timeout_jiffies = max_t(long, cfs_time_seconds(timeout), 1); + lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + (timeout > 1) ? 
cfs_time_seconds(1) : + cfs_time_seconds(1) / 2, + NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), &lwi); @@ -396,17 +418,23 @@ void ptlrpc_invalidate_import(struct obd_import *imp) EXPORT_SYMBOL(ptlrpc_invalidate_import); /* unset imp_invalid */ -void ptlrpc_activate_import(struct obd_import *imp) +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full) { struct obd_device *obd = imp->imp_obd; spin_lock(&imp->imp_lock); if (imp->imp_deactive != 0) { + LASSERT(imp->imp_state != LUSTRE_IMP_FULL); + if (imp->imp_state != LUSTRE_IMP_DISCON) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); spin_unlock(&imp->imp_lock); return; } + if (set_state_full) + import_set_state_nolock(imp, LUSTRE_IMP_FULL); imp->imp_invalid = 0; + spin_unlock(&imp->imp_lock); obd_import_event(obd, imp, IMP_EVENT_ACTIVE); } @@ -428,45 +456,36 @@ EXPORT_SYMBOL(ptlrpc_pinger_force); void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) { - ENTRY; - - LASSERT(!imp->imp_dlm_fake); + ENTRY; - if (ptlrpc_set_import_discon(imp, conn_cnt)) { - if (!imp->imp_replayable) { - CDEBUG(D_HA, "import %s@%s for %s not replayable, " - "auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import(imp); - } + LASSERT(!imp->imp_dlm_fake); + if (ptlrpc_set_import_discon(imp, conn_cnt, true)) ptlrpc_pinger_force(imp); - } + EXIT; } int ptlrpc_reconnect_import(struct obd_import *imp) { #ifdef ENABLE_PINGER + long timeout_jiffies = cfs_time_seconds(obd_timeout); struct l_wait_info lwi; - int secs = cfs_time_seconds(obd_timeout); int rc; ptlrpc_pinger_force(imp); CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", - obd2cli_tgt(imp->imp_obd), secs); + obd2cli_tgt(imp->imp_obd), obd_timeout); - lwi = LWI_TIMEOUT(secs, NULL, NULL); + lwi = LWI_TIMEOUT(timeout_jiffies, NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(imp->imp_state)); return rc; #else - ptlrpc_set_import_discon(imp, 0); + ptlrpc_set_import_discon(imp, 0, false); /* Force a new connect attempt */ ptlrpc_invalidate_import(imp); /* Do a fresh connect next time by zeroing the handle */ @@ -487,7 +506,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp) /* Allow reconnect attempts */ imp->imp_obd->obd_no_recov = 0; /* Remove 'invalid' flag */ - ptlrpc_activate_import(imp); + ptlrpc_activate_import(imp, false); /* Attempt a new connect */ ptlrpc_recover_import(imp, NULL, 0); return 0; @@ -518,7 +537,7 @@ static int import_select_connection(struct obd_import *imp) } list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { - CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n", + CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n", imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); @@ -526,8 +545,7 @@ static int import_select_connection(struct obd_import *imp) /* If we have not tried this connection since the last successful attempt, go with this one */ if ((conn->oic_last_attempt == 0) || - cfs_time_beforeq_64(conn->oic_last_attempt, - imp->imp_last_success_conn)) { + conn->oic_last_attempt <= imp->imp_last_success_conn) { imp_conn = conn; tried_all = 0; break; @@ -538,8 +556,7 @@ static int import_select_connection(struct obd_import *imp) least recently used */ if (!imp_conn) imp_conn = conn; - else if 
(cfs_time_before_64(conn->oic_last_attempt, - imp_conn->oic_last_attempt)) + else if (imp_conn->oic_last_attempt > conn->oic_last_attempt) imp_conn = conn; } @@ -568,7 +585,7 @@ static int import_select_connection(struct obd_import *imp) "to %ds\n", imp->imp_obd->obd_name, at_get(at)); } - imp_conn->oic_last_attempt = cfs_time_current_64(); + imp_conn->oic_last_attempt = ktime_get_seconds(); /* switch connection, don't mind if it's same as the current one */ if (imp->imp_connection) @@ -639,29 +656,41 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) return 0; } +int ptlrpc_connect_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + return ptlrpc_connect_import_locked(imp); +} + /** * Attempt to (re)connect import \a imp. This includes all preparations, * initializing CONNECT RPC request and passing it to ptlrpcd for * actual sending. + * + * Assumes imp->imp_lock is held, and releases it. + * * Returns 0 on success or error code. */ -int ptlrpc_connect_import(struct obd_import *imp) +int ptlrpc_connect_import_locked(struct obd_import *imp) { struct obd_device *obd = imp->imp_obd; int initial_connect = 0; int set_transno = 0; __u64 committed_before_reconnect = 0; struct ptlrpc_request *request; + struct obd_connect_data ocd; char *bufs[] = { NULL, obd2cli_tgt(imp->imp_obd), obd->obd_uuid.uuid, (char *)&imp->imp_dlm_handle, - (char *)&imp->imp_connect_data }; + (char *)&ocd, + NULL }; struct ptlrpc_connect_async_args *aa; int rc; ENTRY; - spin_lock(&imp->imp_lock); + assert_spin_locked(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { spin_unlock(&imp->imp_lock); CERROR("can't connect to a closed import\n"); @@ -678,7 +707,7 @@ int ptlrpc_connect_import(struct obd_import *imp) RETURN(-EALREADY); } - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); imp->imp_conn_cnt++; imp->imp_resend_replay = 0; @@ -702,15 +731,16 @@ int ptlrpc_connect_import(struct obd_import *imp) /* Reset connect flags to the originally requested flags, in case * the server is updated on-the-fly we will get the new features. */ - imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; - imp->imp_connect_data.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + ocd = imp->imp_connect_data; + ocd.ocd_connect_flags = imp->imp_connect_flags_orig; + ocd.ocd_connect_flags2 = imp->imp_connect_flags2_orig; /* Reset ocd_version each time so the server knows the exact versions */ - imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; + ocd.ocd_version = LUSTRE_VERSION_CODE; imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, - &obd->obd_uuid, &imp->imp_connect_data, NULL); + &obd->obd_uuid, &ocd, NULL); if (rc) GOTO(out, rc); @@ -718,6 +748,19 @@ int ptlrpc_connect_import(struct obd_import *imp) if (request == NULL) GOTO(out, rc = -ENOMEM); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(request); + if (rc < 0) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + bufs[5] = request->rq_sepol; + + req_capsule_set_size(&request->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(request->rq_sepol) ? 
+ strlen(request->rq_sepol) + 1 : 0); + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, imp->imp_connect_op, bufs, NULL); if (rc) { @@ -727,8 +770,8 @@ int ptlrpc_connect_import(struct obd_import *imp) /* Report the rpc service time to the server so that it knows how long * to wait for clients to join recovery */ - lustre_msg_set_service_time(request->rq_reqmsg, - at_timeout2est(request->rq_timeout)); + lustre_msg_set_service_timeout(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); /* The amount of time we give the server to process the connect req. * import_select_connection will increase the net latency on @@ -771,7 +814,7 @@ int ptlrpc_connect_import(struct obd_import *imp) rc = 0; out: if (rc != 0) - IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + import_set_state(imp, LUSTRE_IMP_DISCON); RETURN(rc); } @@ -795,9 +838,9 @@ static int ptlrpc_busy_reconnect(int rc) } static int ptlrpc_connect_set_flags(struct obd_import *imp, - struct obd_connect_data *ocd, - __u64 old_connect_flags, - struct obd_export *exp, int init_connect) + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) { static bool warned; struct client_obd *cli = &imp->imp_obd->u.cli; @@ -811,7 +854,6 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, spin_unlock(&imp->imp_lock); - if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && (ocd->ocd_version > LUSTRE_VERSION_CODE + LUSTRE_VERSION_OFFSET_WARN || @@ -822,7 +864,7 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, const char *older = "older than client. " "Consider upgrading server"; const char *newer = "newer than client. " - "Consider recompiling application"; + "Consider upgrading client"; LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) " "is much %s (%s)\n", @@ -836,37 +878,18 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, warned = true; } -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* Check if server has LU-1252 fix applied to not always swab - * the IR MNE entries. Do this only once per connection. This - * fixup is version-limited, because we don't want to carry the - * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we - * need interop with unpatched 2.2 servers. For newer servers, - * the client will do MNE swabbing only as needed. LU-1644 */ - if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && - OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && - strcmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MGC_NAME) == 0)) - imp->imp_need_mne_swab = 1; - else /* clear if server was upgraded since last connect */ - imp->imp_need_mne_swab = 0; -#endif - if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { /* We sent to the server ocd_cksum_types with bits set * for algorithms we understand. 
The server masked off * the checksum types it doesn't support */ if ((ocd->ocd_cksum_types & - cksum_types_supported_client()) == 0) { + obd_cksum_types_supported_client()) == 0) { LCONSOLE_ERROR("The negotiation of the checksum " "alogrithm to use with server %s " "failed (%x/%x)\n", obd2cli_tgt(imp->imp_obd), ocd->ocd_cksum_types, - cksum_types_supported_client()); + obd_cksum_types_supported_client()); return -EPROTO; } else { cli->cl_supp_cksum_types = ocd->ocd_cksum_types; @@ -876,7 +899,8 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, * Enforce ADLER for backward compatibility*/ cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; } - cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); + cli->cl_cksum_type = obd_cksum_type_select(imp->imp_obd->obd_name, + cli->cl_supp_cksum_types); if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) cli->cl_max_pages_per_rpc = @@ -905,13 +929,17 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, * this leads to losing user settings done before such as * disable lru_resize, etc. */ if (old_connect_flags != exp_connect_flags(exp) || init_connect) { + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + __u64 changed_flags; + + changed_flags = + ns->ns_connect_flags ^ ns->ns_orig_connect_flags; CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " "flags: %#llx\n", imp->imp_obd->obd_name, ocd->ocd_connect_flags); - imp->imp_obd->obd_namespace->ns_connect_flags = - ocd->ocd_connect_flags; - imp->imp_obd->obd_namespace->ns_orig_connect_flags = - ocd->ocd_connect_flags; + ns->ns_connect_flags = (ns->ns_connect_flags & changed_flags) | + (ocd->ocd_connect_flags & ~changed_flags); + ns->ns_orig_connect_flags = ocd->ocd_connect_flags; } if (ocd->ocd_connect_flags & OBD_CONNECT_AT) @@ -977,6 +1005,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, struct obd_import *imp = request->rq_import; struct lustre_handle old_hdl; __u64 old_connect_flags; + timeout_t service_timeout; int msg_flags; struct obd_connect_data *ocd; struct obd_export *exp = NULL; @@ -991,11 +1020,25 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, } if (rc) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + /* abort all delayed requests initiated connection */ + list_for_each_entry_safe(free_req, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + if (free_req->rq_no_resend) { + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + } + spin_unlock(&free_req->rq_lock); + } + /* if this reconnect to busy export - not need select new target * for connecting*/ imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); spin_unlock(&imp->imp_lock); - ptlrpc_maybe_ping_import_soon(imp); GOTO(out, rc); } @@ -1095,10 +1138,11 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_obd->obd_self_export->exp_connect_data = *ocd; /* The net statistics after (re-)connect is not valid anymore, - * because may reflect other routing, etc. */ + * because may reflect other routing, etc. 
+ */ + service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg); at_reinit(&imp->imp_at.iat_net_latency, 0, 0); - ptlrpc_at_adj_net_latency(request, - lustre_msg_get_service_time(request->rq_repmsg)); + ptlrpc_at_adj_net_latency(request, service_timeout); /* Import flags should be updated before waking import at FULL state */ rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, @@ -1115,12 +1159,10 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, spin_lock(&imp->imp_lock); if (msg_flags & MSG_CONNECT_REPLAYABLE) { imp->imp_replayable = 1; - spin_unlock(&imp->imp_lock); CDEBUG(D_HA, "connected to replayable target: %s\n", obd2cli_tgt(imp->imp_obd)); } else { imp->imp_replayable = 0; - spin_unlock(&imp->imp_lock); } /* if applies, adjust the imp->imp_msg_magic here @@ -1135,10 +1177,11 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, if (msg_flags & MSG_CONNECT_RECOVERING) { CDEBUG(D_HA, "connect to %s during recovery\n", obd2cli_tgt(imp->imp_obd)); - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + import_set_state_nolock(imp, LUSTRE_IMP_REPLAY_LOCKS); + spin_unlock(&imp->imp_lock); } else { - IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); + spin_unlock(&imp->imp_lock); + ptlrpc_activate_import(imp, true); } GOTO(finish, rc = 0); @@ -1196,7 +1239,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); if (!(MSG_CONNECT_RECOVERING & msg_flags)) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + import_set_state(imp, LUSTRE_IMP_EVICTED); GOTO(finish, rc = 0); } @@ -1209,7 +1252,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, if (imp->imp_invalid) { CDEBUG(D_HA, "%s: reconnected but import is invalid; " "marking evicted\n", imp->imp_obd->obd_name); - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + import_set_state(imp, LUSTRE_IMP_EVICTED); } else if (MSG_CONNECT_RECOVERING & msg_flags) { CDEBUG(D_HA, "%s: reconnected to %s during replay\n", imp->imp_obd->obd_name, @@ -1219,9 +1262,9 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_resend_replay = 1; spin_unlock(&imp->imp_lock); - IMPORT_SET_STATE(imp, imp->imp_replay_state); + import_set_state(imp, imp->imp_replay_state); } else { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); } } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { LASSERT(imp->imp_replayable); @@ -1229,13 +1272,13 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); imp->imp_last_replay_transno = 0; imp->imp_replay_cursor = &imp->imp_committed_list; - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); - } else { + import_set_state(imp, LUSTRE_IMP_REPLAY); + } else { DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" " not set: %x)", imp->imp_obd->obd_name, msg_flags); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + import_set_state(imp, LUSTRE_IMP_EVICTED); } /* Sanity checks for a reconnected import. 
*/ @@ -1272,40 +1315,45 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, } out: + if (exp != NULL) + class_export_put(exp); + spin_lock(&imp->imp_lock); imp->imp_connected = 0; imp->imp_connect_tried = 1; - spin_unlock(&imp->imp_lock); - if (exp != NULL) - class_export_put(exp); - - if (rc != 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); - if (rc == -EACCES) { - /* - * Give up trying to reconnect - * EACCES means client has no permission for connection - */ - imp->imp_obd->obd_no_recov = 1; - ptlrpc_deactivate_import(imp); - } + if (rc != 0) { + bool inact = false; + time64_t now = ktime_get_seconds(); + time64_t next_connect; + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } else if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) { + spin_unlock(&imp->imp_lock); + RETURN(-EPROTO); + } - if (rc == -EPROTO) { - struct obd_connect_data *ocd; - - /* reply message might not be ready */ - if (request->rq_repmsg == NULL) - RETURN(-EPROTO); - - ocd = req_capsule_server_get(&request->rq_pill, - &RMF_CONNECT_DATA); - if (ocd && - (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - (ocd->ocd_version != LUSTRE_VERSION_CODE)) { - /* Actually servers are only supposed to refuse - connection from liblustre clients, so we should - never see this from VFS context */ + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + /* Servers are not supposed to refuse connections from + * clients based on version, only connection feature + * flags. We should never see this from llite, but it + * may be useful for debugging in the future. */ + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { LCONSOLE_ERROR_MSG(0x16a, "Server %s version " "(%d.%d.%d.%d)" " refused connection from this client " @@ -1317,17 +1365,59 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, OBD_OCD_VERSION_PATCH(ocd->ocd_version), OBD_OCD_VERSION_FIX(ocd->ocd_version), LUSTRE_VERSION_STRING); - ptlrpc_deactivate_import(imp); - IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); - } - RETURN(-EPROTO); - } + ptlrpc_deactivate_import_nolock(imp); + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + inact = true; + } + } else if (rc == -ENODEV || rc == -ETIMEDOUT) { + /* ENODEV means there is no service, force reconnection + * to a pair if attempt happen ptlrpc_next_reconnect + * before now. ETIMEDOUT could be set during network + * error and do not guarantee request deadline happened. + */ + struct obd_import_conn *conn; + time64_t reconnect_time; + + /* Same as ptlrpc_next_reconnect, but in past */ + reconnect_time = now - INITIAL_CONNECT_TIMEOUT; + list_for_each_entry(conn, &imp->imp_conn_list, + oic_item) { + if (conn->oic_last_attempt <= reconnect_time) { + imp->imp_force_verify = 1; + break; + } + } + } + + next_connect = imp->imp_conn_current->oic_last_attempt + + (request->rq_deadline - request->rq_sent); + spin_unlock(&imp->imp_lock); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -EPROTO) + RETURN(rc); + + /* adjust imp_next_ping to request deadline + 1 and reschedule + * a pinger if import lost processing during CONNECTING or far + * away from request deadline. 
It could happen when connection + * was initiated outside of pinger, like + * ptlrpc_set_import_discon(). + */ + if (!imp->imp_force_verify && (imp->imp_next_ping <= now || + imp->imp_next_ping > next_connect)) { + imp->imp_next_ping = max(now, next_connect) + 1; + ptlrpc_pinger_wake_up(); + } ptlrpc_maybe_ping_import_soon(imp); CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", obd2cli_tgt(imp->imp_obd), (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } else { + spin_unlock(&imp->imp_lock); } wake_up_all(&imp->imp_recovery_waitq); @@ -1376,8 +1466,8 @@ static int signal_completed_replay(struct obd_import *imp) if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) RETURN(0); - LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); - atomic_inc(&imp->imp_replay_inflight); + if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1)) + RETURN(0); req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, OBD_PING); @@ -1422,7 +1512,7 @@ static int ptlrpc_invalidate_import_thread(void *data) libcfs_debug_dumplog(); } - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); ptlrpc_import_recovery_state_machine(imp); class_import_put(imp); @@ -1458,6 +1548,8 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) ENTRY; if (imp->imp_state == LUSTRE_IMP_EVICTED) { + struct task_struct *task; + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); /* Don't care about MGC eviction */ @@ -1468,6 +1560,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) "using this service will fail.\n", imp->imp_obd->obd_name, target_len, target_start); + LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction"); } CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", obd2cli_tgt(imp->imp_obd), @@ -1477,24 +1570,22 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) imp->imp_vbr_failed = 0; spin_unlock(&imp->imp_lock); - { - struct task_struct *task; /* bug 17802: XXX client_disconnect_export vs connect request * race. if client is evicted at this time then we start * invalidate thread without reference to import and import can * be freed at same time. 
*/ class_import_get(imp); task = kthread_run(ptlrpc_invalidate_import_thread, imp, - "ll_imp_inval"); + "ll_imp_inval"); if (IS_ERR(task)) { class_import_put(imp); - CERROR("error starting invalidate thread: %d\n", rc); rc = PTR_ERR(task); + CERROR("%s: can't start invalidate thread: rc = %d\n", + imp->imp_obd->obd_name, rc); } else { rc = 0; } RETURN(rc); - } } if (imp->imp_state == LUSTRE_IMP_REPLAY) { @@ -1503,7 +1594,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) rc = ptlrpc_replay_next(imp, &inflight); if (inflight == 0 && atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + import_set_state(imp, LUSTRE_IMP_REPLAY_LOCKS); rc = ldlm_replay_locks(imp); if (rc) GOTO(out, rc); @@ -1513,7 +1604,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { if (atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); + import_set_state(imp, LUSTRE_IMP_REPLAY_WAIT); rc = signal_completed_replay(imp); if (rc) GOTO(out, rc); @@ -1522,24 +1613,28 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { if (atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); } } - if (imp->imp_state == LUSTRE_IMP_RECOVER) { + if (imp->imp_state == LUSTRE_IMP_RECOVER) { struct ptlrpc_connection *conn = imp->imp_connection; - rc = ptlrpc_resend(imp); - if (rc) - GOTO(out, rc); - IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); - - LCONSOLE_INFO("%s: Connection restored to %s (at %s)\n", - imp->imp_obd->obd_name, - obd_uuid2str(&conn->c_remote_uuid), - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + ptlrpc_activate_import(imp, true); + + CDEBUG_LIMIT(imp->imp_was_idle ? 
+ imp->imp_idle_debug : D_CONSOLE, + "%s: Connection restored to %s (at %s)\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + obd_import_nid2str(imp)); + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 0; + spin_unlock(&imp->imp_lock); + } if (imp->imp_state == LUSTRE_IMP_FULL) { wake_up_all(&imp->imp_recovery_waitq); @@ -1550,15 +1645,12 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) RETURN(rc); } -int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) { struct ptlrpc_request *req; int rq_opc, rc = 0; ENTRY; - if (imp->imp_obd->obd_force) - GOTO(set_state, rc); - switch (imp->imp_connect_op) { case OST_CONNECT: rq_opc = OST_DISCONNECT; @@ -1575,26 +1667,67 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) "(connect_op %d): rc = %d\n", imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), imp->imp_connect_op, rc); - RETURN(rc); + RETURN(ERR_PTR(rc)); } - if (ptlrpc_import_in_recovery(imp)) { - struct l_wait_info lwi; - cfs_duration_t timeout; + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); - if (AT_OFF) { - if (imp->imp_server_timeout) - timeout = cfs_time_seconds(obd_timeout / 2); - else - timeout = cfs_time_seconds(obd_timeout); - } else { - int idx = import_at_get_index(imp, - imp->imp_client->cli_request_portal); - timeout = cfs_time_seconds( - at_get(&imp->imp_at.iat_service_estimate[idx])); + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(timeout_t, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + import_set_state(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rc = 0; + ENTRY; + + if (imp->imp_obd->obd_force) + GOTO(set_state, rc); + + /* probably the import has been disconnected already being idle */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc); + spin_unlock(&imp->imp_lock); + + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + long timeout_jiffies; + time64_t timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = obd_timeout >> 1; + else + timeout = obd_timeout; + } else { + u32 req_portal; + int idx; + + req_portal = imp->imp_client->cli_request_portal; + idx = import_at_get_index(imp, req_portal); + timeout = at_get(&imp->imp_at.iat_service_estimate[idx]); } - lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), + timeout_jiffies = cfs_time_seconds(timeout); + lwi = LWI_TIMEOUT_INTR(max_t(long, timeout_jiffies, 1), back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); @@ -1606,33 +1739,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) GOTO(out, rc); spin_unlock(&imp->imp_lock); - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, - LUSTRE_OBD_VERSION, rq_opc); - if (req) { - /* We are disconnecting, do not retry a failed DISCONNECT rpc if - * it fails. 
We can get through the above with a down server - * if the client doesn't know the server is gone yet. */ - req->rq_no_resend = 1; - - /* We want client umounts to happen quickly, no matter the - server state... */ - req->rq_timeout = min_t(int, req->rq_timeout, - INITIAL_CONNECT_TIMEOUT); - - IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); - req->rq_send_state = LUSTRE_IMP_CONNECTING; - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - } + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + GOTO(set_state, rc = PTR_ERR(req)); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); set_state: spin_lock(&imp->imp_lock); out: if (noclose) - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); else - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); spin_unlock(&imp->imp_lock); @@ -1642,16 +1761,116 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) } EXPORT_SYMBOL(ptlrpc_disconnect_import); +static void ptlrpc_reset_reqs_generation(struct obd_import *imp) +{ + struct ptlrpc_request *old, *tmp; + + /* tag all resendable requests generated before disconnection + * notice this code is part of disconnect-at-idle path only */ + list_for_each_entry_safe(old, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&old->rq_lock); + if (old->rq_import_generation == imp->imp_generation - 1 && + !old->rq_no_resend) + old->rq_import_generation = imp->imp_generation; + spin_unlock(&old->rq_lock); + } +} + +static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + struct obd_import *imp = req->rq_import; + int connect = 0; + + DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d ", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_refcount), rc); + + spin_lock(&imp->imp_lock); + /* DISCONNECT reply can be late and another connection can just + * be initiated. so we have to abort disconnection. 
*/ + if (req->rq_import_generation == imp->imp_generation && + imp->imp_state != LUSTRE_IMP_CLOSED) { + LASSERTF(imp->imp_state == LUSTRE_IMP_CONNECTING, + "%s\n", ptlrpc_import_state_name(imp->imp_state)); + memset(&imp->imp_remote_handle, 0, + sizeof(imp->imp_remote_handle)); + /* take our DISCONNECT into account */ + if (atomic_read(&imp->imp_reqs) > 1) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + import_set_state_nolock(imp, LUSTRE_IMP_NEW); + ptlrpc_reset_reqs_generation(imp); + connect = 1; + } else { + /* do not expose transient IDLE state */ + import_set_state_nolock(imp, LUSTRE_IMP_IDLE); + } + } + + if (connect) { + rc = ptlrpc_connect_import_locked(imp); + if (rc >= 0) + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + + return 0; +} + +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (imp->imp_obd->obd_force) + RETURN(0); + + if (ptlrpc_import_in_recovery(imp)) + RETURN(0); + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + RETURN(0); + } + spin_unlock(&imp->imp_lock); + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", + imp->imp_obd->obd_name, + ktime_get_real_seconds() - imp->imp_last_reply_time); + + /* don't make noise at reconnection */ + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 1; + spin_unlock(&imp->imp_lock); + + req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); + void ptlrpc_cleanup_imp(struct obd_import *imp) { ENTRY; spin_lock(&imp->imp_lock); - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); imp->imp_generation++; - spin_unlock(&imp->imp_lock); ptlrpc_abort_inflight(imp); + spin_unlock(&imp->imp_lock); + EXIT; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c index d720645bafc16..7db9465a3569f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,18 +46,16 @@ #include -#include - #include #include #include -#include #include #include /* struct ptlrpc_request, lustre_msg* */ #include #include +#include /* * RQFs (see below) refer to two struct req_msg_field arrays describing the @@ -90,11 +88,6 @@ static const struct req_msg_field *mgs_config_read_server[] = { &RMF_MGS_CONFIG_RES }; -static const struct req_msg_field *log_cancel_client[] = { - &RMF_PTLRPC_BODY, - &RMF_LOGCOOKIES -}; - static const struct req_msg_field *mdt_body_only[] = { &RMF_PTLRPC_BODY, &RMF_MDT_BODY @@ -137,12 +130,13 @@ static const struct req_msg_field *mdt_close_client[] = { &RMF_CAPA1 }; -static const struct req_msg_field *mdt_intent_close_client[] = { +static const struct req_msg_field *mdt_close_intent_client[] = { &RMF_PTLRPC_BODY, &RMF_MDT_EPOCH, &RMF_REC_REINT, &RMF_CAPA1, - &RMF_CLOSE_DATA + &RMF_CLOSE_DATA, + &RMF_U32 }; static const struct req_msg_field *obd_statfs_server[] = { @@ -218,7 +212,8 @@ static const struct req_msg_field *mds_reint_create_acl_client[] = { &RMF_EADATA, &RMF_DLM_REQ, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_create_sym_client[] = { @@ -229,7 +224,8 @@ static const struct req_msg_field *mds_reint_create_sym_client[] = { &RMF_SYMTGT, &RMF_DLM_REQ, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_open_client[] = { @@ -240,7 +236,8 @@ static const struct req_msg_field *mds_reint_open_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_open_server[] = { @@ -253,30 +250,33 @@ static const struct req_msg_field *mds_reint_open_server[] = { }; static const struct req_msg_field *mds_reint_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_DLM_REQ + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_link_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_DLM_REQ + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_rename_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_SYMTGT, - &RMF_DLM_REQ + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_migrate_client[] = { @@ -287,8 +287,10 @@ static const struct req_msg_field *mds_reint_migrate_client[] = { &RMF_NAME, &RMF_SYMTGT, &RMF_DLM_REQ, + &RMF_SELINUX_POL, &RMF_MDT_EPOCH, - &RMF_CLOSE_DATA + &RMF_CLOSE_DATA, + &RMF_EADATA }; static const struct req_msg_field *mds_last_unlink_server[] = { @@ -316,6 +318,13 @@ static const struct req_msg_field *mds_reint_setxattr_client[] = { &RMF_CAPA1, &RMF_NAME, &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_resync[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, &RMF_DLM_REQ }; @@ -328,12 +337,28 @@ static const struct req_msg_field *mdt_swap_layouts[] = { &RMF_DLM_REQ }; +static const struct req_msg_field *mds_rmfid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_CAPA1, + 
&RMF_CAPA2, +}; + +static const struct req_msg_field *mds_rmfid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_RCS, +}; + static const struct req_msg_field *obd_connect_client[] = { - &RMF_PTLRPC_BODY, - &RMF_TGTUUID, - &RMF_CLUUID, - &RMF_CONN, - &RMF_CONNECT_DATA + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA, + &RMF_SELINUX_POL }; static const struct req_msg_field *obd_connect_server[] = { @@ -425,32 +450,37 @@ static const struct req_msg_field *ldlm_intent_layout_client[] = { &RMF_LAYOUT_INTENT, &RMF_EADATA /* for new layout to be set up */ }; + static const struct req_msg_field *ldlm_intent_open_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2 + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NIOBUF_INLINE, + &RMF_FILE_SECCTX }; static const struct req_msg_field *ldlm_intent_getattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ - &RMF_CAPA1, - &RMF_NAME + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_FILE_SECCTX_NAME }; static const struct req_msg_field *ldlm_intent_getattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1 + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_FILE_SECCTX }; static const struct req_msg_field *ldlm_intent_create_client[] = { @@ -462,7 +492,8 @@ static const struct req_msg_field *ldlm_intent_create_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_open_client[] = { @@ -475,16 +506,8 @@ static const struct req_msg_field *ldlm_intent_open_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX -}; - -static const struct req_msg_field *ldlm_intent_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ - &RMF_CAPA1, - &RMF_NAME + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_getxattr_client[] = { @@ -493,6 +516,7 @@ static const struct req_msg_field *ldlm_intent_getxattr_client[] = { &RMF_LDLM_INTENT, &RMF_MDT_BODY, &RMF_CAPA1, + &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_getxattr_server[] = { @@ -513,11 +537,12 @@ static const struct req_msg_field *mds_get_root_client[] = { }; static const struct req_msg_field *mds_getxattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_getxattr_server[] = { @@ -571,11 +596,6 @@ static const struct req_msg_field *llog_log_hdr_only[] = { &RMF_LLOG_LOG_HDR }; -static const struct req_msg_field *llogd_conn_body_only[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOGD_CONN_BODY -}; - static const struct req_msg_field *llog_origin_handle_next_block_server[] = { &RMF_PTLRPC_BODY, &RMF_LLOGD_BODY, @@ -612,16 +632,18 @@ static const struct req_msg_field *ost_destroy_client[] = { static const struct req_msg_field *ost_brw_client[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_OBD_IOOBJ, - 
&RMF_NIOBUF_REMOTE, - &RMF_CAPA1 + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1, + &RMF_SHORT_IO }; static const struct req_msg_field *ost_brw_read_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_SHORT_IO }; static const struct req_msg_field *ost_brw_write_server[] = { @@ -729,43 +751,45 @@ static const struct req_msg_field *obd_lfsck_reply[] = { }; static struct req_format *req_formats[] = { - &RQF_OBD_PING, - &RQF_OBD_SET_INFO, + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, &RQF_OBD_IDX_READ, - &RQF_SEC_CTX, - &RQF_MGS_TARGET_REG, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) - &RQF_MGS_SET_INFO, + &RQF_MGS_SET_INFO, #endif - &RQF_MGS_CONFIG_READ, - &RQF_SEQ_QUERY, - &RQF_FLD_QUERY, + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, &RQF_FLD_READ, - &RQF_MDS_CONNECT, - &RQF_MDS_DISCONNECT, - &RQF_MDS_GET_INFO, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, &RQF_MDS_GET_ROOT, - &RQF_MDS_STATFS, - &RQF_MDS_GETATTR, - &RQF_MDS_GETATTR_NAME, - &RQF_MDS_GETXATTR, - &RQF_MDS_SYNC, - &RQF_MDS_CLOSE, - &RQF_MDS_INTENT_CLOSE, + &RQF_MDS_STATFS, + &RQF_MDS_STATFS_NEW, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_CLOSE_INTENT, &RQF_MDS_READPAGE, &RQF_MDS_REINT, &RQF_MDS_REINT_CREATE, &RQF_MDS_REINT_CREATE_ACL, - &RQF_MDS_REINT_CREATE_SLAVE, - &RQF_MDS_REINT_CREATE_SYM, - &RQF_MDS_REINT_OPEN, - &RQF_MDS_REINT_UNLINK, - &RQF_MDS_REINT_LINK, - &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, &RQF_MDS_REINT_MIGRATE, - &RQF_MDS_REINT_SETATTR, - &RQF_MDS_REINT_SETXATTR, - &RQF_MDS_QUOTACTL, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_REINT_RESYNC, + &RQF_MDS_QUOTACTL, &RQF_MDS_HSM_PROGRESS, &RQF_MDS_HSM_CT_REGISTER, &RQF_MDS_HSM_CT_UNREGISTER, @@ -774,22 +798,23 @@ static struct req_format *req_formats[] = { &RQF_MDS_HSM_ACTION, &RQF_MDS_HSM_REQUEST, &RQF_MDS_SWAP_LAYOUTS, + &RQF_MDS_RMFID, &RQF_OUT_UPDATE, - &RQF_OST_CONNECT, - &RQF_OST_DISCONNECT, - &RQF_OST_QUOTACTL, - &RQF_OST_GETATTR, - &RQF_OST_SETATTR, - &RQF_OST_CREATE, - &RQF_OST_PUNCH, - &RQF_OST_SYNC, - &RQF_OST_DESTROY, - &RQF_OST_BRW_READ, - &RQF_OST_BRW_WRITE, - &RQF_OST_STATFS, - &RQF_OST_SET_GRANT_INFO, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, &RQF_OST_GET_INFO, - &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_ID, &RQF_OST_GET_INFO_LAST_FID, &RQF_OST_SET_INFO_LAST_FID, &RQF_OST_GET_INFO_FIEMAP, @@ -799,27 +824,23 @@ static struct req_format *req_formats[] = { &RQF_LDLM_CONVERT, &RQF_LDLM_CANCEL, &RQF_LDLM_CALLBACK, - &RQF_LDLM_CP_CALLBACK, - &RQF_LDLM_BL_CALLBACK, - &RQF_LDLM_GL_CALLBACK, - &RQF_LDLM_GL_DESC_CALLBACK, - &RQF_LDLM_INTENT, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_CALLBACK_DESC, + &RQF_LDLM_INTENT, &RQF_LDLM_INTENT_BASIC, - &RQF_LDLM_INTENT_LAYOUT, - &RQF_LDLM_INTENT_GETATTR, - &RQF_LDLM_INTENT_OPEN, - &RQF_LDLM_INTENT_CREATE, - &RQF_LDLM_INTENT_UNLINK, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, &RQF_LDLM_INTENT_GETXATTR, 
&RQF_LDLM_INTENT_QUOTA, &RQF_QUOTA_DQACQ, - &RQF_LOG_CANCEL, - &RQF_LLOG_ORIGIN_HANDLE_CREATE, - &RQF_LLOG_ORIGIN_HANDLE_DESTROY, - &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, - &RQF_LLOG_ORIGIN_CONNECT, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, &RQF_CONNECT, &RQF_LFSCK_NOTIFY, &RQF_LFSCK_QUERY, @@ -901,8 +922,8 @@ struct req_msg_field RMF_MGS_CONFIG_RES = EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); struct req_msg_field RMF_U32 = - DEFINE_MSGF("generic u32", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); + DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); EXPORT_SYMBOL(RMF_U32); struct req_msg_field RMF_SETINFO_VAL = @@ -988,6 +1009,10 @@ struct req_msg_field RMF_NAME = DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); EXPORT_SYMBOL(RMF_NAME); +struct req_msg_field RMF_FID_ARRAY = + DEFINE_MSGF("fid_array", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FID_ARRAY); + struct req_msg_field RMF_SYMTGT = DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); EXPORT_SYMBOL(RMF_SYMTGT); @@ -1011,7 +1036,7 @@ struct req_msg_field RMF_FILE_SECCTX_NAME = EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); struct req_msg_field RMF_FILE_SECCTX = - DEFINE_MSGF("file_secctx", 0, -1, NULL, NULL); + DEFINE_MSGF("file_secctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); EXPORT_SYMBOL(RMF_FILE_SECCTX); struct req_msg_field RMF_LLOGD_BODY = @@ -1098,13 +1123,11 @@ struct req_msg_field RMF_LOGCOOKIES = EXPORT_SYMBOL(RMF_LOGCOOKIES); struct req_msg_field RMF_CAPA1 = - DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), - lustre_swab_lustre_capa, NULL); + DEFINE_MSGF("capa", 0, 0, NULL, NULL); EXPORT_SYMBOL(RMF_CAPA1); struct req_msg_field RMF_CAPA2 = - DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), - lustre_swab_lustre_capa, NULL); + DEFINE_MSGF("capa", 0, 0, NULL, NULL); EXPORT_SYMBOL(RMF_CAPA2); struct req_msg_field RMF_LAYOUT_INTENT = @@ -1113,6 +1136,10 @@ struct req_msg_field RMF_LAYOUT_INTENT = NULL); EXPORT_SYMBOL(RMF_LAYOUT_INTENT); +struct req_msg_field RMF_SELINUX_POL = + DEFINE_MSGF("selinux_pol", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SELINUX_POL); + /* * OST request field. 
*/ @@ -1133,9 +1160,15 @@ struct req_msg_field RMF_NIOBUF_REMOTE = dump_rniobuf); EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); +struct req_msg_field RMF_NIOBUF_INLINE = + DEFINE_MSGF("niobuf_inline", RMF_F_NO_SIZE_CHECK, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_INLINE); + struct req_msg_field RMF_RCS = - DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), - lustre_swab_generic_32s, dump_rcs); + DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); EXPORT_SYMBOL(RMF_RCS); struct req_msg_field RMF_EAVALS_LENS = @@ -1159,8 +1192,8 @@ struct req_msg_field RMF_OST_ID = EXPORT_SYMBOL(RMF_OST_ID); struct req_msg_field RMF_FIEMAP_KEY = - DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), - lustre_swab_fiemap, NULL); + DEFINE_MSGF("fiemap_key", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap_info_key, NULL); EXPORT_SYMBOL(RMF_FIEMAP_KEY); struct req_msg_field RMF_FIEMAP_VAL = @@ -1171,6 +1204,9 @@ struct req_msg_field RMF_IDX_INFO = DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), lustre_swab_idx_info, NULL); EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_SHORT_IO = + DEFINE_MSGF("short_io", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SHORT_IO); struct req_msg_field RMF_HSM_USER_STATE = DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), lustre_swab_hsm_user_state, NULL); @@ -1198,7 +1234,7 @@ struct req_msg_field RMF_MDS_HSM_USER_ITEM = EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); struct req_msg_field RMF_MDS_HSM_ARCHIVE = - DEFINE_MSGF("hsm_archive", 0, + DEFINE_MSGF("hsm_archive", RMF_F_STRUCT_ARRAY, sizeof(__u32), lustre_swab_generic_32s, NULL); EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); @@ -1344,10 +1380,6 @@ struct req_format RQF_FLD_READ = DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server); EXPORT_SYMBOL(RQF_FLD_READ); -struct req_format RQF_LOG_CANCEL = - DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); -EXPORT_SYMBOL(RQF_LOG_CANCEL); - struct req_format RQF_MDS_QUOTACTL = DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); EXPORT_SYMBOL(RQF_MDS_QUOTACTL); @@ -1371,9 +1403,13 @@ struct req_format RQF_MDS_GET_ROOT = EXPORT_SYMBOL(RQF_MDS_GET_ROOT); struct req_format RQF_MDS_STATFS = - DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); EXPORT_SYMBOL(RQF_MDS_STATFS); +struct req_format RQF_MDS_STATFS_NEW = + DEFINE_REQ_FMT0("MDS_STATFS_NEW", mdt_body_only, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS_NEW); + struct req_format RQF_MDS_SYNC = DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_SYNC); @@ -1451,6 +1487,10 @@ struct req_format RQF_MDS_REINT_SETXATTR = mds_reint_setxattr_client, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); +struct req_format RQF_MDS_REINT_RESYNC = + DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC); + struct req_format RQF_MDS_CONNECT = DEFINE_REQ_FMT0("MDS_CONNECT", obd_connect_client, obd_connect_server); @@ -1506,10 +1546,10 @@ struct req_format RQF_LDLM_GL_CALLBACK = ldlm_gl_callback_server); EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); -struct req_format RQF_LDLM_GL_DESC_CALLBACK = +struct req_format RQF_LDLM_GL_CALLBACK_DESC = DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, ldlm_gl_callback_server); -EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK_DESC); struct req_format RQF_LDLM_INTENT_BASIC = 
DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", @@ -1522,7 +1562,7 @@ struct req_format RQF_LDLM_INTENT = EXPORT_SYMBOL(RQF_LDLM_INTENT); struct req_format RQF_LDLM_INTENT_LAYOUT = - DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT", ldlm_intent_layout_client, ldlm_enqueue_lvb_server); EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); @@ -1541,11 +1581,6 @@ struct req_format RQF_LDLM_INTENT_CREATE = ldlm_intent_create_client, ldlm_intent_getattr_server); EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); -struct req_format RQF_LDLM_INTENT_UNLINK = - DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", - ldlm_intent_unlink_client, ldlm_intent_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); - struct req_format RQF_LDLM_INTENT_GETXATTR = DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", ldlm_intent_getxattr_client, @@ -1557,10 +1592,10 @@ struct req_format RQF_MDS_CLOSE = mdt_close_client, mds_last_unlink_server); EXPORT_SYMBOL(RQF_MDS_CLOSE); -struct req_format RQF_MDS_INTENT_CLOSE = - DEFINE_REQ_FMT0("MDS_CLOSE", - mdt_intent_close_client, mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE); +struct req_format RQF_MDS_CLOSE_INTENT = + DEFINE_REQ_FMT0("MDS_CLOSE_INTENT", + mdt_close_intent_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE_INTENT); struct req_format RQF_MDS_READPAGE = DEFINE_REQ_FMT0("MDS_READPAGE", @@ -1601,16 +1636,16 @@ struct req_format RQF_MDS_SWAP_LAYOUTS = mdt_swap_layouts, empty); EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); +struct req_format RQF_MDS_RMFID = + DEFINE_REQ_FMT0("MDS_RMFID", mds_rmfid_client, + mds_rmfid_server); +EXPORT_SYMBOL(RQF_MDS_RMFID); + struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", llog_origin_handle_create_client, llogd_body_only); EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); -struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", - llogd_body_only, llogd_body_only); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); - struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", llogd_body_only, llog_origin_handle_next_block_server); @@ -1626,10 +1661,6 @@ struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = llogd_body_only, llog_log_hdr_only); EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); -struct req_format RQF_LLOG_ORIGIN_CONNECT = - DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); - struct req_format RQF_CONNECT = DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); EXPORT_SYMBOL(RQF_CONNECT); @@ -2340,12 +2371,13 @@ __u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, if (size == 0) return size; - for (; i < fmt->rf_fields[loc].nr; ++i) - if (fmt->rf_fields[loc].d[i]->rmf_size != -1) - size += cfs_size_round(fmt->rf_fields[loc].d[i]-> - rmf_size); - return size; + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; } +EXPORT_SYMBOL(req_capsule_fmt_size); /** * Changes the format of an RPC. 
@@ -2539,3 +2571,46 @@ int req_capsule_server_grow(struct req_capsule *pill, return 0; } EXPORT_SYMBOL(req_capsule_server_grow); + +int req_check_sepol(struct req_capsule *pill) +{ + int rc = 0; +#ifdef HAVE_SERVER_SUPPORT + struct obd_export *export; + struct lu_nodemap *nm = NULL; + const char *sepol = NULL; + const char *nm_sepol = NULL; + + if (!pill->rc_req) + return -EPROTO; + + export = pill->rc_req->rq_export; + if (!export || !exp_connect_sepol(export) || + !req_capsule_has_field(pill, &RMF_SELINUX_POL, RCL_CLIENT)) + goto nm; + + if (req_capsule_get_size(pill, &RMF_SELINUX_POL, RCL_CLIENT) == 0) + goto nm; + + sepol = req_capsule_client_get(pill, &RMF_SELINUX_POL); + CDEBUG(D_SEC, "retrieved sepol %s\n", sepol); + +nm: + if (export) { + nm = nodemap_get_from_exp(export); + if (!IS_ERR_OR_NULL(nm)) { + nm_sepol = nodemap_get_sepol(nm); + if (nm_sepol && nm_sepol[0]) + if (sepol == NULL || + strcmp(sepol, nm_sepol) != 0) + rc = -EACCES; + } + } + + if (!IS_ERR_OR_NULL(nm)) + nodemap_putref(nm); +#endif + + return rc; +} +EXPORT_SYMBOL(req_check_sepol); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c index a39db55028dc5..0f149b692362c 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c @@ -136,41 +136,6 @@ static int llog_client_open(const struct lu_env *env, return rc; } -static int llog_client_destroy(const struct lu_env *env, - struct llog_handle *loghandle, - struct thandle *th) -{ - struct obd_import *imp; - struct ptlrpc_request *req = NULL; - struct llogd_body *body; - int rc; - ENTRY; - - LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); - req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, - LUSTRE_LOG_VERSION, - LLOG_ORIGIN_HANDLE_DESTROY); - if (req == NULL) - GOTO(err_exit, rc =-ENOMEM); - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - body->lgd_logid = loghandle->lgh_id; - body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; - - if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) - CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, - body->lgd_llh_flags); - - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - - ptlrpc_req_finished(req); -err_exit: - LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); - RETURN(rc); -} - - static int llog_client_next_block(const struct lu_env *env, struct llog_handle *loghandle, int *cur_idx, int next_idx, @@ -368,7 +333,6 @@ struct llog_operations llog_client_ops = { .lop_prev_block = llog_client_prev_block, .lop_read_header = llog_client_read_header, .lop_open = llog_client_open, - .lop_destroy = llog_client_destroy, .lop_close = llog_client_close, }; EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c index 4864b499120df..ca91a1c9491ac 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -111,45 +111,6 @@ int llog_origin_handle_open(struct ptlrpc_request *req) return rc; } -int llog_origin_handle_destroy(struct ptlrpc_request *req) -{ - struct llogd_body *body; - struct llog_logid *logid = NULL; - struct llog_ctxt *ctxt; - int rc; - - ENTRY; - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - if (body == NULL) - RETURN(err_serious(-EFAULT)); - - rc = req_capsule_server_pack(&req->rq_pill); - if (rc < 0) - RETURN(err_serious(-ENOMEM)); - - if 
(ostid_id(&body->lgd_logid.lgl_oi) > 0) - logid = &body->lgd_logid; - - if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) - CERROR("%s: wrong llog flags %x\n", - req->rq_export->exp_obd->obd_name, body->lgd_llh_flags); - - if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { - CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", - req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); - RETURN(-EPROTO); - } - - ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); - if (ctxt == NULL) - RETURN(-ENODEV); - - rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL); - llog_ctxt_put(ctxt); - RETURN(rc); -} - int llog_origin_handle_next_block(struct ptlrpc_request *req) { struct llog_handle *loghandle; @@ -324,15 +285,3 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req) llog_ctxt_put(ctxt); return rc; } - -int llog_origin_handle_close(struct ptlrpc_request *req) -{ - int rc; - - ENTRY; - - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(err_serious(-ENOMEM)); - RETURN(0); -} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index 933183a83dbb3..df178e0a02c82 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include "ptlrpc_internal.h" @@ -96,6 +95,7 @@ static struct ll_rpc_opcode { { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { MDS_RMFID, "mds_rmfid" }, { LDLM_ENQUEUE, "ldlm_enqueue" }, { LDLM_CONVERT, "ldlm_convert" }, { LDLM_CANCEL, "ldlm_cancel" }, @@ -110,17 +110,17 @@ static struct ll_rpc_opcode { { MGS_TARGET_DEL, "mgs_target_del" }, { MGS_SET_INFO, "mgs_set_info" }, { MGS_CONFIG_READ, "mgs_config_read" }, - { OBD_PING, "obd_ping" }, - { OBD_LOG_CANCEL, "llog_cancel" }, - { OBD_QC_CALLBACK, "obd_quota_callback" }, - { OBD_IDX_READ, "dt_index_read" }, + { OBD_PING, "obd_ping" }, + { 401, /* was OBD_LOG_CANCEL */ "llog_cancel" }, + { 402, /* was OBD_QC_CALLBACK */ "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, - { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, - { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, - { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, - { LLOG_CATINFO, "llog_catinfo" }, + { 504, /*LLOG_ORIGIN_HANDLE_WRITE_REC*/"llog_origin_handle_write_rec" }, + { 505, /* was LLOG_ORIGIN_HANDLE_CLOSE */ "llog_origin_handle_close" }, + { 506, /* was LLOG_ORIGIN_CONNECT */ "llog_origin_connect" }, + { 507, /* was LLOG_CATINFO */ "llog_catinfo" }, { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, { QUOTA_DQACQ, "quota_acquire" }, @@ -140,20 +140,21 @@ static struct ll_eopcode { __u32 opcode; const char *opname; } ll_eopcode_table[EXTRA_LAST_OPC] = { - { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, - { LDLM_PLAIN_ENQUEUE, 
"ldlm_plain_enqueue" }, - { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, - { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, - { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, - { MDS_REINT_SETATTR, "mds_reint_setattr" }, - { MDS_REINT_CREATE, "mds_reint_create" }, - { MDS_REINT_LINK, "mds_reint_link" }, - { MDS_REINT_UNLINK, "mds_reint_unlink" }, - { MDS_REINT_RENAME, "mds_reint_rename" }, - { MDS_REINT_OPEN, "mds_reint_open" }, - { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, - { BRW_READ_BYTES, "read_bytes" }, - { BRW_WRITE_BYTES, "write_bytes" }, + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { MDS_REINT_RESYNC, "mds_reint_resync" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, }; const char *ll_opcode2str(__u32 opcode) @@ -194,32 +195,33 @@ static const char *ll_eopcode2str(__u32 opcode) return ll_eopcode_table[opcode].opname; } -#ifdef CONFIG_PROC_FS -static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, - char *name, struct proc_dir_entry **procroot_ret, - struct lprocfs_stats **stats_ret) +static void +ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, + struct dentry **debugfs_root_ret, + struct lprocfs_stats **stats_ret) { - struct proc_dir_entry *svc_procroot; + struct dentry *svc_debugfs_entry; struct lprocfs_stats *svc_stats; int i, rc; unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV; - LASSERT(*procroot_ret == NULL); - LASSERT(*stats_ret == NULL); + LASSERT(!*debugfs_root_ret); + LASSERT(!*stats_ret); - svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,0); - if (svc_stats == NULL) + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, + 0); + if (!svc_stats) return; if (dir) { - svc_procroot = lprocfs_register(dir, root, NULL, NULL); - if (IS_ERR(svc_procroot)) { + svc_debugfs_entry = ldebugfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_debugfs_entry)) { lprocfs_free_stats(&svc_stats); return; } } else { - svc_procroot = root; + svc_debugfs_entry = root; } lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, @@ -235,7 +237,7 @@ static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, for (i = 0; i < EXTRA_LAST_OPC; i++) { char *units; - switch(i) { + switch (i) { case BRW_WRITE_BYTES: case BRW_READ_BYTES: units = "bytes"; @@ -255,14 +257,14 @@ static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, ll_opcode2str(opcode), "usec"); } - rc = lprocfs_register_stats(svc_procroot, name, svc_stats); + rc = ldebugfs_register_stats(svc_debugfs_entry, name, svc_stats); if (rc < 0) { if (dir) - lprocfs_remove(&svc_procroot); + ldebugfs_remove(&svc_debugfs_entry); lprocfs_free_stats(&svc_stats); } else { if (dir) - *procroot_ret = svc_procroot; + *debugfs_root_ret = svc_debugfs_entry; *stats_ret = svc_stats; } } @@ -281,7 +283,9 @@ ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) seq_printf(m, "%d\n", total); return 0; } -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); + + 
+LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); static int ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) @@ -305,11 +309,12 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, { struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; + unsigned long long val; + unsigned long long limit; int bufpages; - __s64 val; int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoull_from_user(buffer, count, 0, &val); if (rc < 0) return rc; @@ -318,10 +323,15 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, /* This sanity check is more of an insanity check; we can still * hose a kernel by allowing the request history to grow too - * far. */ - bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> + * far. The roundup to the next power of two is an empirical way + * to take care that request buffer is allocated in Slab and thus + * will be upgraded */ + bufpages = (roundup_pow_of_two(svc->srv_buf_size) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > cfs_totalram_pages() / (2 * bufpages)) + limit = cfs_totalram_pages() / (2 * bufpages); + /* do not allow history to consume more than half max number of rqbds */ + if ((svc->srv_nrqbds_max == 0 && val > limit) || + (svc->srv_nrqbds_max != 0 && val > svc->srv_nrqbds_max / 2)) return -ERANGE; spin_lock(&svc->srv_lock); @@ -336,28 +346,64 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); static int -ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n) +ptlrpc_lprocfs_req_buffers_max_seq_show(struct seq_file *m, void *n) { struct ptlrpc_service *svc = m->private; - seq_printf(m, "%d\n", - svc->srv_nthrs_cpt_init * svc->srv_ncpts); + seq_printf(m, "%d\n", svc->srv_nrqbds_max); return 0; } static ssize_t -ptlrpc_lprocfs_threads_min_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ptlrpc_lprocfs_req_buffers_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - __s64 val; - int rc = lprocfs_str_to_s64(file, buffer, count, &val); + int val; + int rc; + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < svc->srv_nbuf_per_group && val != 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + svc->srv_nrqbds_max = (uint)val; + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffers_max); + +static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); +} + +static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; @@ -376,44 +422,43 @@ ptlrpc_lprocfs_threads_min_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min); +LUSTRE_RW_ATTR(threads_min); -static int -ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n) +static ssize_t threads_started_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct 
ptlrpc_service *svc = m->private; - struct ptlrpc_service_part *svcpt; - int total = 0; - int i; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; ptlrpc_service_for_each_part(svcpt, i, svc) total += svcpt->scp_nthrs_running; - seq_printf(m, "%d\n", total); - return 0; + return sprintf(buf, "%d\n", total); } -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started); +LUSTRE_RO_ATTR(threads_started); -static int -ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n) +static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct ptlrpc_service *svc = m->private; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); - seq_printf(m, "%d\n", - svc->srv_nthrs_cpt_limit * svc->srv_ncpts); - return 0; + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); } -static ssize_t -ptlrpc_lprocfs_threads_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct ptlrpc_service *svc = m->private; - __s64 val; - int rc = lprocfs_str_to_s64(file, buffer, count, &val); + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; @@ -432,7 +477,7 @@ ptlrpc_lprocfs_threads_max_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max); +LUSTRE_RW_ATTR(threads_max); /** * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. @@ -472,7 +517,7 @@ void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, LASSERT(info != NULL); assert_spin_locked(&policy->pol_nrs->nrs_lock); - LASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); + CLASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); @@ -555,20 +600,39 @@ static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) * sanity-check the values we get. */ } else { - LASSERT(strncmp(infos[pol_idx].pi_name, - tmp.pi_name, - NRS_POL_NAME_MAX) == 0); - LASSERT(strncmp(infos[pol_idx].pi_arg, - tmp.pi_arg, - sizeof(tmp.pi_arg)) == 0); + if (strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_name: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + if (strncmp(infos[pol_idx].pi_arg, + tmp.pi_arg, + sizeof(tmp.pi_arg)) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_arg: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } /** - * Not asserting ptlrpc_nrs_pol_info::pi_state, + * Not checking ptlrpc_nrs_pol_info::pi_state, * because it may be different between * instances of the same policy in different * service partitions. 
*/ - LASSERT(infos[pol_idx].pi_fallback == - tmp.pi_fallback); + + if (infos[pol_idx].pi_fallback != + tmp.pi_fallback) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_fallback: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } } infos[pol_idx].pi_req_queued += tmp.pi_req_queued; @@ -692,7 +756,7 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, */ cmd_copy = cmd; - if (lprocfs_copy_from_user(file, cmd, buffer, count)) + if (copy_from_user(cmd, buffer, count)) GOTO(out, rc = -EFAULT); cmd[count] = '\0'; @@ -747,7 +811,8 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, RETURN(rc < 0 ? rc : count); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs); /** @} nrs */ @@ -867,10 +932,12 @@ ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) if (i > cpt) /* make up the lowest position for this CPT */ *pos = PTLRPC_REQ_CPT2POS(svc, i); + mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, PTLRPC_REQ_POS2SEQ(svc, *pos)); spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); if (rc == 0) { *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); srhi->srhi_idx = i; @@ -912,9 +979,11 @@ ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); } + mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); if (rc == 0) { *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); srhi->srhi_idx = i; @@ -968,6 +1037,7 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) svcpt = svc->srv_parts[srhi->srhi_idx]; + mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); @@ -1008,6 +1078,8 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) } spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + return rc; } @@ -1032,7 +1104,7 @@ ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) return rc; seqf = file->private_data; - seqf->private = PDE_DATA(inode); + seqf->private = inode->i_private; return 0; } @@ -1066,98 +1138,130 @@ static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) return 0; } -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); -static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static ssize_t high_priority_ratio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct ptlrpc_service *svc = m->private; - seq_printf(m, "%d\n", svc->srv_hpreq_ratio); - return 0; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); } -static ssize_t -ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t high_priority_ratio_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct ptlrpc_service *svc = m->private; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); int rc; - __s64 val; + unsigned long val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoul(buffer, 10, &val); if (rc 
< 0) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; - spin_lock(&svc->srv_lock); svc->srv_hpreq_ratio = val; spin_unlock(&svc->srv_lock); return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio); +LUSTRE_RW_ATTR(high_priority_ratio); + +static struct attribute *ptlrpc_svc_attrs[] = { + &lustre_attr_threads_min.attr, + &lustre_attr_threads_started.attr, + &lustre_attr_threads_max.attr, + &lustre_attr_high_priority_ratio.attr, + NULL, +}; -void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, - struct ptlrpc_service *svc) +static void ptlrpc_sysfs_svc_release(struct kobject *kobj) { - struct lprocfs_vars lproc_vars[] = { - { .name = "high_priority_ratio", - .fops = &ptlrpc_lprocfs_hp_ratio_fops, - .data = svc }, + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + complete(&svc->srv_kobj_unregister); +} + +static struct kobj_type ptlrpc_svc_ktype = { + .default_attrs = ptlrpc_svc_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ptlrpc_sysfs_svc_release, +}; + +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) +{ + /* Let's see if we had a chance at initialization first */ + if (svc->srv_kobj.kset) { + kobject_put(&svc->srv_kobj); + wait_for_completion(&svc->srv_kobj_unregister); + } +} + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc) +{ + svc->srv_kobj.kset = parent; + init_completion(&svc->srv_kobj_unregister); + return kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, + &parent->kobj, "%s", svc->srv_name); +} + +void ptlrpc_ldebugfs_register_service(struct dentry *entry, + struct ptlrpc_service *svc) +{ + struct ldebugfs_vars ldebugfs_vars[] = { { .name = "req_buffer_history_len", .fops = &ptlrpc_lprocfs_req_history_len_fops, .data = svc }, { .name = "req_buffer_history_max", .fops = &ptlrpc_lprocfs_req_history_max_fops, .data = svc }, - { .name = "threads_min", - .fops = &ptlrpc_lprocfs_threads_min_fops, - .data = svc }, - { .name = "threads_max", - .fops = &ptlrpc_lprocfs_threads_max_fops, - .data = svc }, - { .name = "threads_started", - .fops = &ptlrpc_lprocfs_threads_started_fops, - .data = svc }, { .name = "timeouts", .fops = &ptlrpc_lprocfs_timeouts_fops, .data = svc }, { .name = "nrs_policies", .fops = &ptlrpc_lprocfs_nrs_fops, .data = svc }, + { .name = "req_buffers_max", + .fops = &ptlrpc_lprocfs_req_buffers_max_fops, + .data = svc }, { NULL } }; - static struct proc_ops req_history_fops = { - PROC_OWNER(THIS_MODULE) - .proc_open = ptlrpc_lprocfs_svc_req_history_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_seq_release, + static struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, }; int rc; - ptlrpc_lprocfs_register(entry, svc->srv_name, - "stats", &svc->srv_procroot, - &svc->srv_stats); - if (svc->srv_procroot == NULL) + ptlrpc_ldebugfs_register(entry, svc->srv_name, "stats", + &svc->srv_debugfs_entry, &svc->srv_stats); + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return; - lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + ldebugfs_add_vars(svc->srv_debugfs_entry, ldebugfs_vars, NULL); - rc = lprocfs_seq_create(svc->srv_procroot, "req_history", - 0400, &req_history_fops, svc); + rc = ldebugfs_seq_create(svc->srv_debugfs_entry, "req_history", + 0400, &req_history_fops, svc); if (rc) CWARN("Error adding the req_history file\n"); } void ptlrpc_lprocfs_register_obd(struct 
obd_device *obddev) { - ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", - &obddev->obd_svc_procroot, - &obddev->obd_svc_stats); + ptlrpc_ldebugfs_register(obddev->obd_debugfs_entry, NULL, "stats", + &obddev->obd_svc_debugfs_entry, + &obddev->obd_svc_stats); } EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); @@ -1205,8 +1309,8 @@ EXPORT_SYMBOL(ptlrpc_lprocfs_brw); void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) { - if (svc->srv_procroot != NULL) - lprocfs_remove(&svc->srv_procroot); + if (!IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + ldebugfs_remove(&svc->srv_debugfs_entry); if (svc->srv_stats) lprocfs_free_stats(&svc->srv_stats); @@ -1219,48 +1323,53 @@ void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) */ lprocfs_obd_cleanup(obd); - if (obd->obd_svc_procroot) - lprocfs_remove(&obd->obd_svc_procroot); + if (!IS_ERR_OR_NULL(obd->obd_svc_debugfs_entry)) + ldebugfs_remove(&obd->obd_svc_debugfs_entry); if (obd->obd_svc_stats) lprocfs_free_stats(&obd->obd_svc_stats); } EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); -ssize_t -lprocfs_ping_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; - struct ptlrpc_request *req; - int rc; - ENTRY; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ptlrpc_request *req; + int rc; + ENTRY; LPROCFS_CLIMP_CHECK(obd); req = ptlrpc_prep_ping(obd->u.cli.cl_import); LPROCFS_CLIMP_EXIT(obd); - if (req == NULL) + if (!req) RETURN(-ENOMEM); req->rq_send_state = LUSTRE_IMP_FULL; rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - if (rc >= 0) - RETURN(count); + RETURN(rc); } -EXPORT_SYMBOL(lprocfs_ping_seq_write); +EXPORT_SYMBOL(ping_show); + +/* kept for older verison of tools. */ +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + return ping_show(kobj, attr, (char *)buffer); +} +EXPORT_SYMBOL(ping_store); /* Write the connection UUID to this file to attempt to connect to that node. * The connection UUID is a node's primary NID. For example, * "echo connection=192.168.0.1@tcp0::instance > .../import". 
*/ ssize_t -lprocfs_import_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct obd_device *obd = m->private; @@ -1279,7 +1388,7 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, if (kbuf == NULL) return -ENOMEM; - if (lprocfs_copy_from_user(file, kbuf, buffer, count)) + if (copy_from_user(kbuf, buffer, count)) GOTO(out, count = -EFAULT); kbuf[count] = 0; @@ -1291,14 +1400,14 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, uuid = kbuf + prefix_len; ptr = strstr(uuid, "::"); if (ptr) { - __u32 inst; - char *endptr; + u32 inst; + int rc; *ptr = 0; do_reconn = 0; ptr += 2; /* Skip :: */ - inst = simple_strtol(ptr, &endptr, 10); - if (*endptr) { + rc = kstrtouint(ptr, 10, &inst); + if (rc) { CERROR("config: wrong instance # %s\n", ptr); } else if (inst != imp->imp_connect_data.ocd_instance) { CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted " @@ -1320,7 +1429,7 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, OBD_FREE(kbuf, count + 1); return count; } -EXPORT_SYMBOL(lprocfs_import_seq_write); +EXPORT_SYMBOL(ldebugfs_import_seq_write); int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) { @@ -1342,16 +1451,13 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, struct obd_device *obd = m->private; struct client_obd *cli = &obd->u.cli; struct obd_import *imp = cli->cl_import; + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool_from_user(buffer, count, &val); if (rc < 0) return rc; - if (val != 0 && val != 1) - return -ERANGE; - LPROCFS_CLIMP_CHECK(obd); spin_lock(&imp->imp_lock); imp->imp_no_pinger_recover = !val; @@ -1360,5 +1466,3 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, return count; } EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); - -#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c index 999869000c35b..f6e0f57e2c785 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -167,7 +167,6 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) RETURN(0); /* NB no locking required until desc is on the network */ - LASSERT(desc->bd_md_count == 0); LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); @@ -190,7 +189,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); total_md = desc->bd_req->rq_mbits - mbits + 1; - desc->bd_md_count = total_md; + desc->bd_refs = total_md; desc->bd_failure = 0; md.user_ptr = &desc->bd_cbid; @@ -231,7 +230,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) desc->bd_portal, mbits, 0, 0); else rc = LNetGet(self_nid, desc->bd_mds[posted_md], - peer_id, desc->bd_portal, mbits, 0); + peer_id, desc->bd_portal, mbits, 0, false); posted_md++; if (rc != 0) { @@ -248,9 +247,9 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) * event this creates will signal completion with failure, * so we return SUCCESS here! */ spin_lock(&desc->bd_lock); - desc->bd_md_count -= total_md - posted_md; + desc->bd_refs -= total_md - posted_md; spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_md_count >= 0); + LASSERT(desc->bd_refs >= 0); mdunlink_iterate_helper(desc->bd_mds, posted_md); RETURN(0); @@ -327,7 +326,6 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) /* NB no locking required until desc is on the network */ LASSERT(desc->bd_nob > 0); - LASSERT(desc->bd_md_count == 0); LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); LASSERT(desc->bd_req != NULL); @@ -349,9 +347,9 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); LASSERT(desc->bd_cbid.cbid_arg == desc); - total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; + total_md = desc->bd_md_count; /* rq_mbits is matchbits of the final bulk */ - mbits = req->rq_mbits - total_md + 1; + mbits = req->rq_mbits - desc->bd_md_count + 1; LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), "first mbits = x%llu, last mbits = x%llu\n", @@ -364,19 +362,25 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) desc->bd_registered = 1; desc->bd_last_mbits = mbits; - desc->bd_md_count = total_md; + desc->bd_refs = total_md; md.user_ptr = &desc->bd_cbid; md.eq_handle = ptlrpc_eq_h; md.threshold = 1; /* PUT or GET */ - for (posted_md = 0; posted_md < total_md; posted_md++, mbits++) { + for (posted_md = 0; posted_md < desc->bd_md_count; + posted_md++, mbits++) { md.options = PTLRPC_MD_OPTIONS | (ptlrpc_is_bulk_op_get(desc->bd_type) ? 
LNET_MD_OP_GET : LNET_MD_OP_PUT); ptlrpc_fill_bulk_md(&md, desc, posted_md); - rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, + if (posted_md > 0 && posted_md + 1 == desc->bd_md_count && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) { + rc = -ENOMEM; + } else { + rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, LNET_UNLINK, LNET_INS_AFTER, &me_h); + } if (rc != 0) { CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", desc->bd_import->imp_obd->obd_name, mbits, @@ -400,24 +404,26 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) if (rc != 0) { LASSERT(rc == -ENOMEM); spin_lock(&desc->bd_lock); - desc->bd_md_count -= total_md - posted_md; + desc->bd_refs -= total_md - posted_md; spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_md_count >= 0); + LASSERT(desc->bd_refs >= 0); mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); req->rq_status = -ENOMEM; + desc->bd_registered = 0; RETURN(-ENOMEM); } spin_lock(&desc->bd_lock); /* Holler if peer manages to touch buffers before he knows the mbits */ - if (desc->bd_md_count != total_md) + if (desc->bd_refs != total_md) CWARN("%s: Peer %s touched %d buffers while I registered\n", desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), - total_md - desc->bd_md_count); + total_md - desc->bd_refs); spin_unlock(&desc->bd_lock); - CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, " - "mbits x%#llx-%#llx, portal %u\n", desc->bd_md_count, + CDEBUG(D_NET, + "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", + desc->bd_refs, ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", desc->bd_iov_count, desc->bd_nob, desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); @@ -492,9 +498,11 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) { struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; struct ptlrpc_service *svc = svcpt->scp_service; - int service_time = max_t(int, ktime_get_real_seconds() - - req->rq_arrival_time.tv_sec, 1); + timeout_t service_timeout; + service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1, + (AT_OFF ? obd_timeout * 3 / 2 : at_max)); if (!(flags & PTLRPC_REPLY_EARLY) && (req->rq_type != PTL_RPC_MSG_ERR) && (req->rq_reqmsg != NULL) && @@ -503,7 +511,8 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { /* early replies, errors and recovery requests don't count * toward our service time estimate */ - int oldse = at_measured(&svcpt->scp_at_estimate, service_time); + int oldse = at_measured(&svcpt->scp_at_estimate, + service_timeout); if (oldse != 0) { DEBUG_REQ(D_ADAPTTO, req, @@ -513,7 +522,7 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) } } /* Report actual service time for client latency calc */ - lustre_msg_set_service_time(req->rq_repmsg, service_time); + lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout); /* Report service time estimate for future client reqs, but report 0 * (to be ignored by client) if it's an error reply during recovery. 
* b=15815 @@ -780,8 +789,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_resend_cb != NULL) request->rq_resend_cb(request, &request->rq_async_args); } - if (request->rq_memalloc) - mpflag = cfs_memory_pressure_get_and_set(); + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); rc = sptlrpc_cli_wrap_request(request); if (rc) @@ -791,7 +800,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_bulk != NULL) { rc = ptlrpc_register_bulk (request); if (rc != 0) - GOTO(out, rc); + GOTO(cleanup_bulk, rc); /* * All the mds in the request will have the same cpt * encoded in the cookie. So we can just get the first @@ -813,13 +822,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) spin_lock(&request->rq_lock); request->rq_err = 1; spin_unlock(&request->rq_lock); - request->rq_status = rc; - GOTO(cleanup_bulk, rc); - } - } else { - request->rq_repdata = NULL; - request->rq_repmsg = NULL; - } + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ connection->c_peer, request->rq_xid, 0, @@ -893,8 +902,6 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_deadline = request->rq_sent + request->rq_timeout + ptlrpc_at_get_net_latency(request); - ptlrpc_pinger_sending_on_import(imp); - DEBUG_REQ(D_INFO, request, "send flg=%x", lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, @@ -912,18 +919,20 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) GOTO(out, rc); cleanup_me: - /* MEUnlink is safe; the PUT didn't even get off the ground, and - * nobody apart from the PUT's target has the right nid+XID to - * access the reply buffer. */ - rc2 = LNetMEUnlink(reply_me_h); - LASSERT (rc2 == 0); - /* UNLINKED callback called synchronously */ - LASSERT(!request->rq_receiving_reply); + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); cleanup_bulk: - /* We do sync unlink here as there was no real transfer here so - * the chance to have long unlink to sluggish net is smaller here. */ + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. 
*/ ptlrpc_unregister_bulk(request, 0); + if (request->rq_bulk != NULL) + request->rq_bulk->bd_registered = 0; out: if (rc == -ENOMEM) { /* set rq_sent so that this request is treated @@ -944,7 +953,10 @@ EXPORT_SYMBOL(ptl_send_rpc); int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) { struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; - static struct lnet_process_id match_id = {LNET_NID_ANY, LNET_PID_ANY}; + static struct lnet_process_id match_id = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY + }; int rc; struct lnet_md md; struct lnet_handle_me me_h; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h index 851bdc0dc354a..6d6b9d7a04541 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -22,7 +22,7 @@ /* * Copyright (C) 2013, Trustees of Indiana University * - * Copyright (c) 2013, 2014, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Author: Joshua Walgenbach */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c index 7423e981d9e37..94d21d42f87df 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ @@ -610,10 +610,8 @@ static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); } -#ifdef CONFIG_PROC_FS - /** - * lprocfs interface + * debugfs interface */ /** @@ -718,7 +716,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -731,7 +729,9 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, &count_copy); if (val != kernbuf) { - quantum_reg = simple_strtol(val, NULL, 10); + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_REG; } @@ -747,7 +747,9 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (!nrs_svc_has_hp(svc)) return -ENODEV; - quantum_hp = simple_strtol(val, NULL, 10); + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_HP; } @@ -757,10 +759,9 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, * value */ if (queue == 0) { - if (!isdigit(kernbuf[0])) - return -EINVAL; - - quantum_reg = simple_strtol(kernbuf, NULL, 10); + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; queue = PTLRPC_NRS_QUEUE_REG; @@ -808,7 +809,8 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); /** * Initializes a CRR-N policy's lprocfs interface for service \a svc @@ -820,34 +822,19 @@ LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); */ static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) { - struct lprocfs_vars nrs_crrn_lprocfs_vars[] = { + struct ldebugfs_vars nrs_crrn_lprocfs_vars[] = { { .name = "nrs_crrn_quantum", .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, .data = svc }, { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; - return lprocfs_add_vars(svc->srv_procroot, nrs_crrn_lprocfs_vars, NULL); -} - -/** - * Cleans up a CRR-N policy's lprocfs interface for service \a svc - * - * \param[in] svc the service - */ -static void nrs_crrn_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_crrn_quantum", svc->srv_procroot); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_crrn_lprocfs_vars, NULL); } -#endif /* CONFIG_PROC_FS */ - /** * CRR-N policy operations */ @@ -861,10 +848,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { .op_req_enqueue = nrs_crrn_req_add, .op_req_dequeue = nrs_crrn_req_del, .op_req_stop = nrs_crrn_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_crrn_lprocfs_init, - .op_lprocfs_fini = nrs_crrn_lprocfs_fini, -#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c index 403b74efe6415..c8a1e6637d261 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -362,11 +362,9 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, } /** - * lprocfs interface + * debugfs interface */ -#ifdef CONFIG_PROC_FS - /* nrs_delay_min and nrs_delay_max are bounded by these values */ #define LPROCFS_NRS_DELAY_LOWER_BOUND 0 #define LPROCFS_NRS_DELAY_UPPER_BOUND 65535 @@ -419,7 +417,7 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, * Helper for delay's seq_write functions. 
*/ static ssize_t -lprocfs_nrs_delay_seq_write_common(struct file *file, const char __user *buffer, +lprocfs_nrs_delay_seq_write_common(const char __user *buffer, unsigned int bufsize, size_t count, const char *var_name, unsigned int min_val, unsigned int max_val, @@ -443,7 +441,7 @@ lprocfs_nrs_delay_seq_write_common(struct file *file, const char __user *buffer, if (kernbuf == NULL) return -ENOMEM; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) GOTO(free_kernbuf, rc = -EFAULT); tmpsize = strlen("reg_") + strlen(var_name) + 1; @@ -598,7 +596,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(file, buffer, + return lprocfs_nrs_delay_seq_write_common(buffer, LPROCFS_NRS_DELAY_MIN_SIZE, count, LPROCFS_NRS_DELAY_MIN_NAME, @@ -607,7 +605,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_MIN, false); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); /** * Retrieves the value of the maximum delay for delay policy instances on both @@ -681,7 +679,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(file, buffer, + return lprocfs_nrs_delay_seq_write_common(buffer, LPROCFS_NRS_DELAY_MAX_SIZE, count, LPROCFS_NRS_DELAY_MAX_NAME, @@ -690,7 +688,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_MAX, false); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); /** * Retrieves the value of the percentage of requests which should be delayed @@ -765,7 +763,7 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(file, buffer, + return lprocfs_nrs_delay_seq_write_common(buffer, LPROCFS_NRS_DELAY_PCT_SIZE, count, LPROCFS_NRS_DELAY_PCT_NAME, @@ -774,11 +772,12 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_PCT, false); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) { - struct lprocfs_vars nrs_delay_lprocfs_vars[] = { + struct ldebugfs_vars nrs_delay_lprocfs_vars[] = { { .name = "nrs_delay_min", .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, .data = svc }, @@ -791,25 +790,13 @@ static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; - return lprocfs_add_vars(svc->srv_procroot, nrs_delay_lprocfs_vars, - NULL); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_delay_lprocfs_vars, + NULL); } -static void nrs_delay_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_delay_min", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_delay_max", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_delay_pct", svc->srv_procroot); -} - -#endif /* CONFIG_PROC_FS */ - /** * Delay policy operations */ @@ -822,10 +809,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { .op_req_enqueue = nrs_delay_req_add, 
.op_req_dequeue = nrs_delay_req_del, .op_req_stop = nrs_delay_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_delay_lprocfs_init, - .op_lprocfs_fini = nrs_delay_lprocfs_fini, -#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c index 96c3a6593d2dd..8b8e092dd8209 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ @@ -45,7 +45,6 @@ #include #include #include -#include #include #include "ptlrpc_internal.h" @@ -1161,11 +1160,9 @@ static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy, } /** - * lprocfs interface + * debugfs interface */ -#ifdef CONFIG_PROC_FS - /** * This allows to bundle the policy name into the lprocfs_vars::data pointer * so that lprocfs read/write functions can be used by both the ORR and TRR @@ -1297,7 +1294,7 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1310,8 +1307,9 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, &count_copy); if (val != kernbuf) { - quantum_reg = simple_strtol(val, NULL, 10); - + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_REG; } @@ -1326,7 +1324,9 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (!nrs_svc_has_hp(svc)) return -ENODEV; - quantum_hp = simple_strtol(val, NULL, 10); + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_HP; } @@ -1336,10 +1336,9 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, * value */ if (queue == 0) { - if (!isdigit(kernbuf[0])) - return -EINVAL; - - quantum_reg = simple_strtol(kernbuf, NULL, 10); + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; queue = PTLRPC_NRS_QUEUE_REG; @@ -1387,7 +1386,8 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); #define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:" #define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:" @@ -1512,7 +1512,7 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1607,7 +1607,8 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); #define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:" #define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:" @@ -1772,7 +1773,7 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1858,13 +1859,14 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) { int i; - struct lprocfs_vars nrs_orr_lprocfs_vars[] = { + struct ldebugfs_vars nrs_orr_lprocfs_vars[] = { { .name = "nrs_orr_quantum", .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, { .name = "nrs_orr_offset_type", @@ -1874,7 +1876,7 @@ static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; lprocfs_orr_data.svc = svc; @@ -1882,21 +1884,10 @@ static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++) nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data; - return lprocfs_add_vars(svc->srv_procroot, nrs_orr_lprocfs_vars, NULL); -} - -static void nrs_orr_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_orr_quantum", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_orr_offset_type", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_orr_supported", svc->srv_procroot); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_orr_lprocfs_vars, + NULL); } -#endif /* CONFIG_PROC_FS */ - static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { .op_policy_init = nrs_orr_init, .op_policy_start = nrs_orr_start, @@ -1908,10 +1899,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { .op_req_enqueue = nrs_orr_req_add, .op_req_dequeue = nrs_orr_req_del, .op_req_stop = nrs_orr_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_orr_lprocfs_init, - .op_lprocfs_fini = nrs_orr_lprocfs_fini, -#endif }; struct ptlrpc_nrs_pol_conf nrs_conf_orr = { @@ -1926,14 +1914,11 @@ struct ptlrpc_nrs_pol_conf nrs_conf_orr = { * * TRR reuses much of the functions and data structures of ORR */ - -#ifdef CONFIG_PROC_FS - static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) { int i; - struct lprocfs_vars nrs_trr_lprocfs_vars[] = { + struct ldebugfs_vars nrs_trr_lprocfs_vars[] = { { .name = "nrs_trr_quantum", .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, { .name = "nrs_trr_offset_type", @@ -1943,7 +1928,7 @@ static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; lprocfs_trr_data.svc = svc; @@ -1951,21 +1936,10 @@ static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) for (i = 0; i < ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; - return lprocfs_add_vars(svc->srv_procroot, nrs_trr_lprocfs_vars, NULL); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_trr_lprocfs_vars, + NULL); } -static void nrs_trr_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - 
lprocfs_remove_proc_entry("nrs_trr_quantum", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_trr_offset_type", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_trr_supported", svc->srv_procroot); -} - -#endif /* CONFIG_PROC_FS */ - /** * Reuse much of the ORR functionality for TRR. */ @@ -1980,10 +1954,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { .op_req_enqueue = nrs_orr_req_add, .op_req_dequeue = nrs_orr_req_del, .op_req_stop = nrs_orr_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_trr_lprocfs_init, - .op_lprocfs_fini = nrs_trr_lprocfs_fini, -#endif }; struct ptlrpc_nrs_pol_conf nrs_conf_trr = { diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c index a81485554013b..07710bdb7bfd9 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "ptlrpc_internal.h" /** @@ -300,6 +301,7 @@ nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name)); rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_flags = start->u.tc_start.ts_rule_flags; rule->tr_nsecs = NSEC_PER_SEC; do_div(rule->tr_nsecs, rule->tr_rpc_rate); rule->tr_depth = tbf_depth; @@ -521,11 +523,9 @@ tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) cli1 = container_of(e1, struct nrs_tbf_client, tc_node); cli2 = container_of(e2, struct nrs_tbf_client, tc_node); - if (cli1->tc_check_time + cli1->tc_nsecs < - cli2->tc_check_time + cli2->tc_nsecs) + if (cli1->tc_deadline < cli2->tc_deadline) return 1; - else if (cli1->tc_check_time + cli1->tc_nsecs > - cli2->tc_check_time + cli2->tc_nsecs) + else if (cli1->tc_deadline > cli2->tc_deadline) return 0; if (cli1->tc_check_time < cli2->tc_check_time) @@ -570,7 +570,7 @@ static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode) return cli->tc_jobid; } -static void *nrs_tbf_jobid_hop_object(struct hlist_node *hnode) +static void *nrs_tbf_hop_object(struct hlist_node *hnode) { return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); } @@ -609,7 +609,7 @@ static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = { .hs_hash = nrs_tbf_jobid_hop_hash, .hs_keycmp = nrs_tbf_jobid_hop_keycmp, .hs_key = nrs_tbf_jobid_hop_key, - .hs_object = nrs_tbf_jobid_hop_object, + .hs_object = nrs_tbf_hop_object, .hs_get = nrs_tbf_jobid_hop_get, .hs_put = nrs_tbf_jobid_hop_put, .hs_put_locked = nrs_tbf_jobid_hop_put, @@ -1071,11 +1071,6 @@ static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) return &cli->tc_nid; } -static void *nrs_tbf_nid_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); -} - static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) { struct nrs_tbf_client *cli = hlist_entry(hnode, @@ -1111,7 +1106,7 @@ static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { .hs_hash = nrs_tbf_nid_hop_hash, .hs_keycmp = nrs_tbf_nid_hop_keycmp, .hs_key = nrs_tbf_nid_hop_key, - .hs_object = nrs_tbf_nid_hop_object, + .hs_object = nrs_tbf_hop_object, .hs_get = nrs_tbf_nid_hop_get, .hs_put = nrs_tbf_nid_hop_put, .hs_put_locked = nrs_tbf_nid_hop_put, @@ -1307,11 +1302,6 @@ static void *nrs_tbf_hop_key(struct hlist_node *hnode) return cli->tc_key; } -static void *nrs_tbf_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); -} - static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node 
*hnode) { struct nrs_tbf_client *cli = hlist_entry(hnode, @@ -1415,23 +1405,263 @@ nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd, return cli; } +/** + * ONLY opcode presented in this function will be checked in + * nrs_tbf_id_cli_set(). That means, we can add or remove an + * opcode to enable or disable requests handled in nrs_tbf + */ +static struct req_format *req_fmt(__u32 opcode) +{ + switch (opcode) { + case OST_GETATTR: + return &RQF_OST_GETATTR; + case OST_SETATTR: + return &RQF_OST_SETATTR; + case OST_READ: + return &RQF_OST_BRW_READ; + case OST_WRITE: + return &RQF_OST_BRW_WRITE; + /* FIXME: OST_CREATE and OST_DESTROY comes from MDS + * in most case. Should they be removed? */ + case OST_CREATE: + return &RQF_OST_CREATE; + case OST_DESTROY: + return &RQF_OST_DESTROY; + case OST_PUNCH: + return &RQF_OST_PUNCH; + case OST_SYNC: + return &RQF_OST_SYNC; + case OST_LADVISE: + return &RQF_OST_LADVISE; + case MDS_GETATTR: + return &RQF_MDS_GETATTR; + case MDS_GETATTR_NAME: + return &RQF_MDS_GETATTR_NAME; + /* close is skipped to avoid LDLM cancel slowness */ +#if 0 + case MDS_CLOSE: + return &RQF_MDS_CLOSE; +#endif + case MDS_REINT: + return &RQF_MDS_REINT; + case MDS_READPAGE: + return &RQF_MDS_READPAGE; + case MDS_GET_ROOT: + return &RQF_MDS_GET_ROOT; + case MDS_STATFS: + return &RQF_MDS_STATFS; + case MDS_SYNC: + return &RQF_MDS_SYNC; + case MDS_QUOTACTL: + return &RQF_MDS_QUOTACTL; + case MDS_GETXATTR: + return &RQF_MDS_GETXATTR; + case MDS_GET_INFO: + return &RQF_MDS_GET_INFO; + /* HSM op is skipped */ +#if 0 + case MDS_HSM_STATE_GET: + return &RQF_MDS_HSM_STATE_GET; + case MDS_HSM_STATE_SET: + return &RQF_MDS_HSM_STATE_SET; + case MDS_HSM_ACTION: + return &RQF_MDS_HSM_ACTION; + case MDS_HSM_CT_REGISTER: + return &RQF_MDS_HSM_CT_REGISTER; + case MDS_HSM_CT_UNREGISTER: + return &RQF_MDS_HSM_CT_UNREGISTER; +#endif + case MDS_SWAP_LAYOUTS: + return &RQF_MDS_SWAP_LAYOUTS; + case LDLM_ENQUEUE: + return &RQF_LDLM_ENQUEUE; + default: + return NULL; + } +} + +static struct req_format *intent_req_fmt(__u32 it_opc) +{ + if (it_opc & (IT_OPEN | IT_CREAT)) + return &RQF_LDLM_INTENT_OPEN; + else if (it_opc & (IT_GETATTR | IT_LOOKUP)) + return &RQF_LDLM_INTENT_GETATTR; + else if (it_opc & IT_GETXATTR) + return &RQF_LDLM_INTENT_GETXATTR; + else if (it_opc & (IT_GLIMPSE | IT_BRW)) + return &RQF_LDLM_INTENT; + else + return NULL; +} + +static int ost_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body != NULL) { + id->ti_uid = body->oa.o_uid; + id->ti_gid = body->oa.o_gid; + return 0; + } + + return -EINVAL; +} + +static void unpack_ugid_from_mdt_body(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + + /* TODO: nodemaping feature converts {ug}id from individual + * clients to the actual ones of the file system. Some work + * may be needed to fix this. 
*/ + id->ti_uid = b->mbo_uid; + id->ti_gid = b->mbo_gid; +} + +static void unpack_ugid_from_mdt_rec_reint(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_rec_reint *rec; + + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + /* use the fs{ug}id as {ug}id of the process */ + id->ti_uid = rec->rr_fsuid; + id->ti_gid = rec->rr_fsgid; +} + +static int mdt_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + int rc = 0; + + switch (opc) { + case MDS_GETATTR: + case MDS_GETATTR_NAME: + case MDS_GET_ROOT: + case MDS_READPAGE: + case MDS_SYNC: + case MDS_GETXATTR: + case MDS_HSM_STATE_GET ... MDS_SWAP_LAYOUTS: + unpack_ugid_from_mdt_body(req, id); + break; + case MDS_CLOSE: + case MDS_REINT: + unpack_ugid_from_mdt_rec_reint(req, id); + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static int ldlm_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ldlm_intent *lit; + struct req_format *fmt; + + if (req->rq_reqmsg->lm_bufcount <= DLM_INTENT_IT_OFF) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, &RQF_LDLM_INTENT_BASIC); + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + if (lit == NULL) + return -EINVAL; + + fmt = intent_req_fmt(lit->opc); + if (fmt == NULL) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, fmt); + + if (lit->opc & (IT_GETXATTR | IT_GETATTR | IT_LOOKUP)) + unpack_ugid_from_mdt_body(req, id); + else if (lit->opc & (IT_OPEN | IT_OPEN | IT_GLIMPSE | IT_BRW)) + unpack_ugid_from_mdt_rec_reint(req, id); + else + return -EINVAL; + return 0; +} + +static int nrs_tbf_id_cli_set(struct ptlrpc_request *req, struct tbf_id *id, + enum nrs_tbf_flag ti_type) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct req_format *fmt = req_fmt(opc); + bool fmt_unset = false; + int rc; + + memset(id, 0, sizeof(struct tbf_id)); + id->ti_type = ti_type; + + if (fmt == NULL) + return -EINVAL; + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + if (req->rq_pill.rc_fmt == NULL) { + req_capsule_set(&req->rq_pill, fmt); + fmt_unset = true; + } + + if (opc < OST_LAST_OPC) + rc = ost_tbf_id_cli_set(req, id); + else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC) + rc = mdt_tbf_id_cli_set(req, id); + else if (opc == LDLM_ENQUEUE) + rc = ldlm_tbf_id_cli_set(req, id); + else + rc = -EINVAL; + + /* restore it to the initialized state */ + if (fmt_unset) + req->rq_pill.rc_fmt = NULL; + return rc; +} + +static inline void nrs_tbf_cli_gen_key(struct nrs_tbf_client *cli, + struct ptlrpc_request *req, + char *keystr, size_t keystr_sz) +{ + const char *jobid; + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct tbf_id id; + + nrs_tbf_id_cli_set(req, &id, NRS_TBF_FLAG_UID | NRS_TBF_FLAG_GID); + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + + snprintf(keystr, keystr_sz, "%s_%s_%d_%u_%u", jobid, + libcfs_nid2str(req->rq_peer.nid), opc, id.ti_uid, + id.ti_gid); + + if (cli) { + INIT_LIST_HEAD(&cli->tc_lru); + strlcpy(cli->tc_key, keystr, sizeof(cli->tc_key)); + strlcpy(cli->tc_jobid, jobid, sizeof(cli->tc_jobid)); + cli->tc_nid = req->rq_peer.nid; + cli->tc_opcode = opc; + cli->tc_id = id; + } +} + static struct nrs_tbf_client * nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req) { struct nrs_tbf_client *cli; struct cfs_hash *hs = head->th_cli_hash; struct cfs_hash_bd bd; - char keystr[NRS_TBF_KEY_LEN] = { '\0' }; - const char *jobid; - __u32 opc; + char 
keystr[NRS_TBF_KEY_LEN]; - jobid = lustre_msg_get_jobid(req->rq_reqmsg); - if (jobid == NULL) - jobid = NRS_TBF_JOBID_NULL; - opc = lustre_msg_get_opc(req->rq_reqmsg); - snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, - libcfs_nid2str(req->rq_peer.nid), opc); - LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); + nrs_tbf_cli_gen_key(NULL, req, keystr, sizeof(keystr)); cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1); cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr); cfs_hash_bd_unlock(hs, &bd, 1); @@ -1506,22 +1736,19 @@ nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, struct ptlrpc_request *req) { char keystr[NRS_TBF_KEY_LEN]; - const char *jobid; - __u32 opc; - jobid = lustre_msg_get_jobid(req->rq_reqmsg); - if (jobid == NULL) - jobid = NRS_TBF_JOBID_NULL; - opc = lustre_msg_get_opc(req->rq_reqmsg); - snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, - libcfs_nid2str(req->rq_peer.nid), opc); + nrs_tbf_cli_gen_key(cli, req, keystr, sizeof(keystr)); +} - LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); - INIT_LIST_HEAD(&cli->tc_lru); - memcpy(cli->tc_key, keystr, strlen(keystr)); - memcpy(cli->tc_jobid, jobid, strlen(jobid)); - cli->tc_nid = req->rq_peer.nid; - cli->tc_opcode = opc; +static void +nrs_tbf_id_list_free(struct list_head *uid_list) +{ + struct nrs_tbf_id *nti_id, *n; + + list_for_each_entry_safe(nti_id, n, uid_list, nti_linkage) { + list_del_init(&nti_id->nti_linkage); + OBD_FREE_PTR(nti_id); + } } static void @@ -1539,6 +1766,10 @@ nrs_tbf_expression_free(struct nrs_tbf_expression *expr) case NRS_TBF_FIELD_OPCODE: CFS_FREE_BITMAP(expr->te_opcodes); break; + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + nrs_tbf_id_list_free(&expr->te_cond); + break; default: LBUG(); } @@ -1598,6 +1829,9 @@ nrs_tbf_check_field(struct cfs_lstr *field, char *str) static int nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr); +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif); static int nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) @@ -1637,8 +1871,23 @@ nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) &expr->te_opcodes) < 0) GOTO(out, rc = -EINVAL); expr->te_field = NRS_TBF_FIELD_OPCODE; - } else + } else if (nrs_tbf_check_field(&field, "uid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_UID; + } else if (nrs_tbf_check_field(&field, "gid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_GID; + } else { GOTO(out, rc = -EINVAL); + } list_add_tail(&expr->te_linkage, cond_list); return 0; @@ -1719,6 +1968,9 @@ nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) return rc; } +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id); + static int nrs_tbf_expression_match(struct nrs_tbf_expression *expr, struct nrs_tbf_rule *rule, @@ -1731,6 +1983,9 @@ nrs_tbf_expression_match(struct nrs_tbf_expression *expr, return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); case NRS_TBF_FIELD_OPCODE: return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + return nrs_tbf_id_list_match(&expr->te_cond, cli->tc_id); default: return 0; } @@ -1868,11 +2123,6 @@ static void *nrs_tbf_opcode_hop_key(struct hlist_node *hnode) return &cli->tc_opcode; } -static 
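Note on the generic TBF classification above: the per-client hash key now folds the request's UID and GID in alongside jobid, NID and opcode, and rule expressions may carry uid/gid lists that are matched against that id. The standalone sketch below (userspace C with invented names, not the lustrefsx structures) shows the shape of that keying and list matching only.

/*
 * Illustrative userspace sketch, not kernel code. It mirrors the shape of
 * the composite key and the uid-list match used by the generic TBF policy
 * above; all names here are invented for the example.
 */
#include <stdio.h>
#include <stddef.h>

struct tbf_match_id {
	unsigned int uid;
	unsigned int gid;
};

/* Compose "jobid_nid_opcode_uid_gid", the key layout built above. */
static void tbf_gen_key(char *buf, size_t len, const char *jobid,
			const char *nid, unsigned int opc,
			const struct tbf_match_id *id)
{
	snprintf(buf, len, "%s_%s_%u_%u_%u",
		 jobid, nid, opc, id->uid, id->gid);
}

/* Return 1 when the request uid matches any uid listed in a rule. */
static int tbf_uid_list_match(const unsigned int *uids, size_t n,
			      unsigned int uid)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (uids[i] == uid)
			return 1;
	return 0;
}

int main(void)
{
	const unsigned int rule_uids[] = { 500, 1000 };
	struct tbf_match_id id = { .uid = 1000, .gid = 1000 };
	char key[128];

	tbf_gen_key(key, sizeof(key), "dd.0", "10.0.0.1@tcp", 4, &id);
	printf("key=%s uid_match=%d\n", key,
	       tbf_uid_list_match(rule_uids, 2, id.uid));
	return 0;
}

Keeping every classifier in one key means a single client object can be looked up regardless of which rule type (jobid, nid, opcode, uid or gid) ends up matching it.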
void *nrs_tbf_opcode_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); -} - static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) { @@ -1911,7 +2161,7 @@ static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { .hs_hash = nrs_tbf_opcode_hop_hash, .hs_keycmp = nrs_tbf_opcode_hop_keycmp, .hs_key = nrs_tbf_opcode_hop_key, - .hs_object = nrs_tbf_opcode_hop_object, + .hs_object = nrs_tbf_hop_object, .hs_get = nrs_tbf_opcode_hop_get, .hs_put = nrs_tbf_opcode_hop_put, .hs_put_locked = nrs_tbf_opcode_hop_put, @@ -2127,6 +2377,340 @@ struct nrs_tbf_ops nrs_tbf_opcode_ops = { .o_rule_fini = nrs_tbf_opcode_rule_fini, }; +static unsigned nrs_tbf_id_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct tbf_id), mask); +} + +static int nrs_tbf_id_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const struct tbf_id *opc = key; + enum nrs_tbf_flag ntf; + struct nrs_tbf_client *cli = hlist_entry(hnode, struct nrs_tbf_client, + tc_hnode); + ntf = opc->ti_type & cli->tc_id.ti_type; + if ((ntf & NRS_TBF_FLAG_UID) && opc->ti_uid != cli->tc_id.ti_uid) + return 0; + + if ((ntf & NRS_TBF_FLAG_GID) && opc->ti_gid != cli->tc_id.ti_gid) + return 0; + + return 1; +} + +static void *nrs_tbf_id_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return &cli->tc_id; +} + +static void nrs_tbf_id_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_id_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_id_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_id_hash_ops = { + .hs_hash = nrs_tbf_id_hop_hash, + .hs_keycmp = nrs_tbf_id_hop_keycmp, + .hs_key = nrs_tbf_id_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_id_hop_get, + .hs_put = nrs_tbf_id_hop_put, + .hs_put_locked = nrs_tbf_id_hop_put, + .hs_exit = nrs_tbf_id_hop_exit, +}; + +static int +nrs_tbf_id_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_id_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_id_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_ids_str = "*"; + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_ids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + struct tbf_id id; + + LASSERT(head->th_type_flag == NRS_TBF_FLAG_UID || + head->th_type_flag == NRS_TBF_FLAG_GID); + + nrs_tbf_id_cli_set(req, &id, 
head->th_type_flag); + return cfs_hash_lookup(head->th_cli_hash, &id); +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_id, + &cli->tc_hnode); +} + +static void +nrs_tbf_uid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_UID); +} + +static void +nrs_tbf_gid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_GID); +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id) +{ + struct nrs_tbf_id *nti_id; + enum nrs_tbf_flag flag; + + list_for_each_entry(nti_id, id_list, nti_linkage) { + flag = id.ti_type & nti_id->nti_id.ti_type; + if (!flag) + continue; + + if ((flag & NRS_TBF_FLAG_UID) && + (id.ti_uid != nti_id->nti_id.ti_uid)) + continue; + + if ((flag & NRS_TBF_FLAG_GID) && + (id.ti_gid != nti_id->nti_id.ti_gid)) + continue; + + return 1; + } + return 0; +} + +static int +nrs_tbf_id_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_id_list_match(&rule->tr_ids, cli->tc_id); +} + +static void nrs_tbf_id_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + nrs_tbf_id_list_free(&cmd->u.tc_start.ts_ids); + + if (cmd->u.tc_start.ts_ids_str) + OBD_FREE(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str) + 1); +} + +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + struct tbf_id id = { 0 }; + ENTRY; + + if (tif != NRS_TBF_FLAG_UID && tif != NRS_TBF_FLAG_GID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct nrs_tbf_id *nti_id; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + id.ti_type = tif; + if (tif == NRS_TBF_FLAG_UID) { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_uid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } else { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_gid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } + + OBD_ALLOC_PTR(nti_id); + if (nti_id == NULL) + GOTO(out, rc = -ENOMEM); + + nti_id->nti_id = id; + list_add_tail(&nti_id->nti_linkage, id_list); + } +out: + if (rc) + nrs_tbf_id_list_free(id_list); + RETURN(rc); +} + +static int nrs_tbf_ug_id_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + enum nrs_tbf_flag tif; + + tif = cmd->u.tc_start.ts_valid_type; + + src.ls_str = id; + src.ls_len = strlen(id); + + rc = nrs_tbf_check_id_value(&src, + tif == NRS_TBF_FLAG_UID ? 
"uid" : "gid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_ids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_ids_str == NULL) + return -ENOMEM; + + strlcpy(cmd->u.tc_start.ts_ids_str, src.ls_str, src.ls_len + 1); + + rc = nrs_tbf_id_list_parse(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str), + &cmd->u.tc_start.ts_ids, tif); + if (rc) + nrs_tbf_id_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_head *head = rule->tr_head; + int rc = 0; + enum nrs_tbf_flag tif = head->th_type_flag; + int ids_len = strlen(start->u.tc_start.ts_ids_str) + 1; + + LASSERT(start->u.tc_start.ts_ids_str); + INIT_LIST_HEAD(&rule->tr_ids); + + OBD_ALLOC(rule->tr_ids_str, ids_len); + if (rule->tr_ids_str == NULL) + return -ENOMEM; + + strlcpy(rule->tr_ids_str, start->u.tc_start.ts_ids_str, + ids_len); + + if (!list_empty(&start->u.tc_start.ts_ids)) { + rc = nrs_tbf_id_list_parse(rule->tr_ids_str, + strlen(rule->tr_ids_str), + &rule->tr_ids, tif); + if (rc) + CERROR("%ss {%s} illegal\n", + tif == NRS_TBF_FLAG_UID ? "uid" : "gid", + rule->tr_ids_str); + } + if (rc) { + OBD_FREE(rule->tr_ids_str, ids_len); + rule->tr_ids_str = NULL; + } + return rc; +} + +static int +nrs_tbf_id_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_ids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static void nrs_tbf_id_rule_fini(struct nrs_tbf_rule *rule) +{ + nrs_tbf_id_list_free(&rule->tr_ids); + if (rule->tr_ids_str != NULL) + OBD_FREE(rule->tr_ids_str, strlen(rule->tr_ids_str) + 1); +} + +struct nrs_tbf_ops nrs_tbf_uid_ops = { + .o_name = NRS_TBF_TYPE_UID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_uid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +struct nrs_tbf_ops nrs_tbf_gid_ops = { + .o_name = NRS_TBF_TYPE_GID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_gid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + static struct nrs_tbf_type nrs_tbf_types[] = { { .ntt_name = NRS_TBF_TYPE_JOBID, @@ -2148,6 +2732,16 @@ static struct nrs_tbf_type nrs_tbf_types[] = { .ntt_flag = NRS_TBF_FLAG_GENERIC, .ntt_ops = &nrs_tbf_generic_ops, }, + { + .ntt_name = NRS_TBF_TYPE_UID, + .ntt_flag = NRS_TBF_FLAG_UID, + .ntt_ops = &nrs_tbf_uid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GID, + .ntt_flag = NRS_TBF_FLAG_GID, + .ntt_ops = &nrs_tbf_gid_ops, + }, }; /** @@ -2476,10 +3070,12 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request, nr_u.tbf.tr_list); } else { + struct nrs_tbf_rule *rule = cli->tc_rule; __u64 now = ktime_to_ns(ktime_get()); __u64 passed; __u64 ntoken; __u64 deadline; + __u64 old_resid = 0; deadline = cli->tc_check_time + cli->tc_nsecs; @@ -2487,9 +3083,19 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, passed = now - cli->tc_check_time; ntoken = passed * cli->tc_rpc_rate; 
do_div(ntoken, NSEC_PER_SEC); + ntoken += cli->tc_ntoken; - if (ntoken > cli->tc_depth) + if (rule->tr_flags & NTRS_REALTIME) { + LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); + old_resid = cli->tc_nsecs_resid; + cli->tc_nsecs_resid += passed % cli->tc_nsecs; + if (cli->tc_nsecs_resid > cli->tc_nsecs) { + ntoken++; + cli->tc_nsecs_resid -= cli->tc_nsecs; + } + } else if (ntoken > cli->tc_depth) ntoken = cli->tc_depth; + if (ntoken > 0) { struct ptlrpc_request *req; nrq = list_entry(cli->tc_list.next, @@ -2507,6 +3113,8 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, &cli->tc_node); cli->tc_in_heap = false; } else { + if (!(rule->tr_flags & NTRS_REALTIME)) + cli->tc_deadline = now + cli->tc_nsecs; cfs_binheap_relocate(head->th_binheap, &cli->tc_node); } @@ -2520,6 +3128,15 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, } else { ktime_t time; + if (rule->tr_flags & NTRS_REALTIME) { + cli->tc_deadline = deadline; + cli->tc_nsecs_resid = old_resid; + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + if (node != cfs_binheap_root(head->th_binheap)) + return nrs_tbf_req_get(policy, + peek, force); + } policy->pol_nrs->nrs_throttling = 1; head->th_deadline = deadline; time = ktime_set(0, 0); @@ -2555,6 +3172,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head, th_res); if (list_empty(&cli->tc_list)) { LASSERT(!cli->tc_in_heap); + cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node); if (rc == 0) { cli->tc_in_heap = true; @@ -2562,8 +3180,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, list_add_tail(&nrq->nr_u.tbf.tr_list, &cli->tc_list); if (policy->pol_nrs->nrs_throttling) { - __u64 deadline = cli->tc_check_time + - cli->tc_nsecs; + __u64 deadline = cli->tc_deadline; if ((head->th_deadline > deadline) && (hrtimer_try_to_cancel(&head->th_timer) >= 0)) { @@ -2649,10 +3266,8 @@ static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy, nrq->nr_u.tbf.tr_sequence); } -#ifdef CONFIG_PROC_FS - /** - * lprocfs interface + * debugfs interface */ /** @@ -2719,6 +3334,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data) static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) { int rc; + ENTRY; switch (cmd->u.tc_start.ts_valid_type) { case NRS_TBF_FLAG_JOBID: @@ -2733,24 +3349,41 @@ static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) case NRS_TBF_FLAG_GENERIC: rc = nrs_tbf_generic_parse(cmd, token); break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + rc = nrs_tbf_ug_id_parse(cmd, token); + break; default: RETURN(-EINVAL); } - return rc; + RETURN(rc); } static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) { if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { - if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_JOBID) + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: nrs_tbf_jobid_cmd_fini(cmd); - else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_NID) + break; + case NRS_TBF_FLAG_NID: nrs_tbf_nid_cmd_fini(cmd); - else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_OPCODE) + break; + case NRS_TBF_FLAG_OPCODE: nrs_tbf_opcode_cmd_fini(cmd); - else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_GENERIC) + break; + case NRS_TBF_FLAG_GENERIC: nrs_tbf_generic_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + nrs_tbf_id_cmd_fini(cmd); + break; + default: + CWARN("unknown NRS_TBF_FLAGS:0x%x\n", + cmd->u.tc_start.ts_valid_type); + } } 
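The dequeue path above is a token bucket: tokens accrue at the rule's RPC rate from the time elapsed since tc_check_time and are normally clamped to the bucket depth, while the new NTRS_REALTIME flag carries the sub-token remainder of the elapsed time in tc_nsecs_resid so no credit is silently dropped between refills. A minimal userspace sketch of that arithmetic follows; it assumes one token per NSEC_PER_SEC/rate nanoseconds and is illustrative only, not the kernel code.

/*
 * Minimal token-bucket refill sketch (userspace, illustrative).  The
 * "realtime" branch keeps the sub-token remainder instead of discarding
 * it, which is the behaviour the NTRS_REALTIME flag adds above.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct bucket {
	uint64_t rate;           /* tokens (RPCs) per second */
	uint64_t depth;          /* burst depth: max stored tokens */
	uint64_t tokens;         /* currently available tokens */
	uint64_t nsec_per_token; /* NSEC_PER_SEC / rate */
	uint64_t resid_ns;       /* realtime mode: leftover nanoseconds */
};

static void refill(struct bucket *b, uint64_t elapsed_ns, int realtime)
{
	uint64_t earned = elapsed_ns * b->rate / NSEC_PER_SEC;

	b->tokens += earned;
	if (realtime) {
		/* carry the remainder so no credit is lost over time */
		b->resid_ns += elapsed_ns % b->nsec_per_token;
		if (b->resid_ns >= b->nsec_per_token) {
			b->tokens++;
			b->resid_ns -= b->nsec_per_token;
		}
	} else if (b->tokens > b->depth) {
		/* classic token bucket: clamp the burst */
		b->tokens = b->depth;
	}
}

int main(void)
{
	struct bucket b = { .rate = 100, .depth = 3,
			    .nsec_per_token = NSEC_PER_SEC / 100 };

	refill(&b, 25000000ULL, 0);  /* 25 ms at 100 RPC/s earns 2 tokens */
	printf("tokens after 25ms: %llu\n", (unsigned long long)b.tokens);
	return 0;
}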
} @@ -2804,6 +3437,15 @@ nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) cmd->u.tc_change.tc_next_name = val; else return -EINVAL; + } else if (strcmp(key, "realtime") == 0) { + unsigned long realtime; + + rc = kstrtoul(val, 10, &realtime); + if (rc) + return rc; + + if (realtime > 0) + cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; } else { return -EINVAL; } @@ -2965,7 +3607,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) GOTO(out_free_kernbuff, rc = -EINVAL); - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) GOTO(out_free_kernbuff, rc = -EFAULT); val = kernbuf; @@ -3013,7 +3655,8 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, out: return rc ? rc : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); /** * Initializes a TBF policy's lprocfs interface for service \a svc @@ -3025,34 +3668,20 @@ LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); */ static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) { - struct lprocfs_vars nrs_tbf_lprocfs_vars[] = { + struct ldebugfs_vars nrs_tbf_lprocfs_vars[] = { { .name = "nrs_tbf_rule", .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, .data = svc }, { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; - return lprocfs_add_vars(svc->srv_procroot, nrs_tbf_lprocfs_vars, NULL); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_tbf_lprocfs_vars, + NULL); } -/** - * Cleans up a TBF policy's lprocfs interface for service \a svc - * - * \param[in] svc the service - */ -static void nrs_tbf_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_tbf_rule", svc->srv_procroot); -} - -#endif /* CONFIG_PROC_FS */ - /** * TBF policy operations */ @@ -3066,10 +3695,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { .op_req_enqueue = nrs_tbf_req_add, .op_req_dequeue = nrs_tbf_req_del, .op_req_stop = nrs_tbf_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_tbf_lprocfs_init, - .op_lprocfs_fini = nrs_tbf_lprocfs_fini, -#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c index 3e97aa6332ed3..5e2d384435fbb 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,6 @@ #include -#include - #include #include #include @@ -62,13 +60,15 @@ static inline __u32 lustre_msg_hdr_size_v2(__u32 count) __u32 lustre_msg_hdr_size(__u32 magic, __u32 count) { - switch (magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_hdr_size_v2(count); - default: - LASSERTF(0, "incorrect message magic: %08x\n", magic); + LASSERT(count > 0); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); return 0; - } + } } void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, @@ -80,25 +80,26 @@ void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, lustre_set_rep_swabbed(req, index); } -int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - __u32 index) +bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index) { - if (inout) - return (ptlrpc_req_need_swab(req) && - !lustre_req_swabbed(req, index)); - else - return (ptlrpc_rep_need_swab(req) && - !lustre_rep_swabbed(req, index)); + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + + return (ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index)); } static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, - __u32 version) + enum lustre_msg_version version) { - __u32 ver = lustre_msg_get_version(msg); - return (ver & LUSTRE_VERSION_MASK) != version; + enum lustre_msg_version ver = lustre_msg_get_version(msg); + + return (ver & LUSTRE_VERSION_MASK) != version; } -int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) +int lustre_msg_check_version(struct lustre_msg *msg, + enum lustre_msg_version version) { #define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 switch (msg->lm_magic) { @@ -136,13 +137,14 @@ EXPORT_SYMBOL(lustre_msg_early_size); __u32 lustre_msg_size_v2(int count, __u32 *lengths) { __u32 size; - int i; + int i; - size = lustre_msg_hdr_size_v2(count); - for (i = 0; i < count; i++) - size += cfs_size_round(lengths[i]); + LASSERT(count > 0); + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); - return size; + return size; } EXPORT_SYMBOL(lustre_msg_size_v2); @@ -185,22 +187,25 @@ __u32 lustre_packed_msg_size(struct lustre_msg *msg) return 0; } } +EXPORT_SYMBOL(lustre_packed_msg_size); void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, - char **bufs) + char **bufs) { - char *ptr; - int i; + char *ptr; + int i; - msg->lm_bufcount = count; - /* XXX: lm_secflvr uninitialized here */ - msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + LASSERT(count > 0); - for (i = 0; i < count; i++) - msg->lm_buflens[i] = lens[i]; + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; - if (bufs == NULL) - return; + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; ptr = (char *)msg + lustre_msg_hdr_size_v2(count); for (i = 0; i < count; i++) { @@ -327,24 +332,25 @@ void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) } int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - __u32 *lens, char **bufs, int flags) + __u32 *lens, char **bufs, int flags) { - struct ptlrpc_reply_state *rs; - int msg_len, rc; - ENTRY; + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; - LASSERT(req->rq_reply_state == NULL); + LASSERT(req->rq_reply_state == NULL); + 
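For reference, lustre_msg_size_v2() above derives the packed message size as a header that grows with the buffer count plus every payload length rounded up by cfs_size_round(). The sketch below recomputes such a size in userspace; the 8-byte rounding and the 32-byte fixed header used here are assumptions for illustration, not values taken from the Lustre headers.

/*
 * Rough userspace sketch of the packed-message size calculation.  The
 * fixed header size and the 8-byte rounding are assumed; the real values
 * come from struct lustre_msg_v2 and cfs_size_round() in the tree.
 */
#include <stdio.h>
#include <stdint.h>

#define ROUND8(x)	(((x) + 7U) & ~7U)
#define FIXED_HDR	32U	/* assumed fixed part of the header */

static uint32_t msg_size(int count, const uint32_t *lens)
{
	uint32_t size = ROUND8(FIXED_HDR + count * sizeof(uint32_t));
	int i;

	for (i = 0; i < count; i++)
		size += ROUND8(lens[i]);	/* each buffer 8-byte aligned */
	return size;
}

int main(void)
{
	uint32_t lens[] = { 152, 13 };	/* two example payload lengths */

	printf("packed size: %u bytes\n", (unsigned int)msg_size(2, lens));
	return 0;
}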
LASSERT(count > 0); - if ((flags & LPRFL_EARLY_REPLY) == 0) { + if ((flags & LPRFL_EARLY_REPLY) == 0) { spin_lock(&req->rq_lock); req->rq_packed_final = 1; spin_unlock(&req->rq_lock); - } + } - msg_len = lustre_msg_size_v2(count, lens); - rc = sptlrpc_svc_alloc_rs(req, msg_len); - if (rc) - RETURN(rc); + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); rs = req->rq_reply_state; atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ @@ -356,16 +362,16 @@ int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, INIT_LIST_HEAD(&rs->rs_list); spin_lock_init(&rs->rs_lock); - req->rq_replen = msg_len; - req->rq_reply_state = rs; - req->rq_repmsg = rs->rs_msg; + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; - lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); - lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); - PTLRPC_RS_DEBUG_LRU_ADD(rs); + PTLRPC_RS_DEBUG_LRU_ADD(rs); - RETURN(0); + RETURN(0); } EXPORT_SYMBOL(lustre_pack_reply_v2); @@ -409,28 +415,29 @@ void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) { __u32 i, offset, buflen, bufcount; - LASSERT(m != NULL); + LASSERT(m != NULL); + LASSERT(m->lm_bufcount > 0); - bufcount = m->lm_bufcount; - if (unlikely(n >= bufcount)) { - CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", - m, n, bufcount); - return NULL; - } + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } - buflen = m->lm_buflens[n]; - if (unlikely(buflen < min_size)) { - CERROR("msg %p buffer[%d] size %d too small " - "(required %d, opc=%d)\n", m, n, buflen, min_size, - n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); - return NULL; - } + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); + return NULL; + } - offset = lustre_msg_hdr_size_v2(bufcount); - for (i = 0; i < n; i++) - offset += cfs_size_round(m->lm_buflens[i]); + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); - return (char *)m + offset; + return (char *)m + offset; } void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size) @@ -523,52 +530,60 @@ void lustre_free_reply_state(struct ptlrpc_reply_state *rs) static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) { - int swabbed, required_len, i; + int swabbed, required_len, i, buflen; - /* Now we know the sender speaks my language. */ - required_len = lustre_msg_hdr_size_v2(0); - if (len < required_len) { - /* can't even look inside the message */ - CERROR("message length %d too small for lustre_msg\n", len); - return -EINVAL; - } + /* Now we know the sender speaks my language. 
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } - swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); - - if (swabbed) { - __swab32s(&m->lm_magic); - __swab32s(&m->lm_bufcount); - __swab32s(&m->lm_secflvr); - __swab32s(&m->lm_repsize); - __swab32s(&m->lm_cksum); - __swab32s(&m->lm_flags); - CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); - CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); - } + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); - required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); - if (len < required_len) { - /* didn't receive all the buffer lengths */ - CERROR ("message length %d too small for %d buflens\n", - len, m->lm_bufcount); - return -EINVAL; - } + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } - for (i = 0; i < m->lm_bufcount; i++) { - if (swabbed) - __swab32s(&m->lm_buflens[i]); - required_len += cfs_size_round(m->lm_buflens[i]); - } + if (m->lm_bufcount == 0 || m->lm_bufcount > PTLRPC_MAX_BUFCOUNT) { + CERROR("message bufcount %d is not valid\n", m->lm_bufcount); + return -EINVAL; + } + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } - if (len < required_len) { - CERROR("len: %d, required_len %d\n", len, required_len); - CERROR("bufcount: %d\n", m->lm_bufcount); - for (i = 0; i < m->lm_bufcount; i++) - CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); - return -EINVAL; - } + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + buflen = cfs_size_round(m->lm_buflens[i]); + if (buflen < 0 || buflen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer %d length %d is not valid\n", i, buflen); + return -EINVAL; + } + required_len += buflen; + } + if (len < required_len || required_len > PTLRPC_MAX_BUFLEN) { + CERROR("len: %d, required_len %d, bufcount: %d\n", + len, required_len, m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } - return swabbed; + return swabbed; } int __lustre_unpack_msg(struct lustre_msg *m, int len) @@ -757,6 +772,11 @@ char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) "msg %p buffer[%d] len %d\n", m, index, blen); return NULL; } + if (blen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer length of msg %p buffer[%d] is invalid(%d)\n", + m, index, blen); + return NULL; + } if (max_len == 0) { if (slen != blen - 1) { @@ -802,7 +822,7 @@ static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) sizeof(struct ptlrpc_body_v2)); } -__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +enum lustre_msghdr lustre_msghdr_get_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: @@ -836,7 +856,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* Fall through */ + /* fallthrough */ default: /* flags might be printed in debug code while message * uninitialized */ @@ -880,7 +900,8 @@ void lustre_msg_clear_flags(struct lustre_msg *msg, 
__u32 flags) case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); + pb->pb_flags &= ~flags; + return; } default: @@ -899,7 +920,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* Fall through */ + /* fallthrough */ default: return 0; } @@ -955,7 +976,7 @@ __u32 lustre_msg_get_type(struct lustre_msg *msg) } EXPORT_SYMBOL(lustre_msg_get_type); -__u32 lustre_msg_get_version(struct lustre_msg *msg) +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -1104,7 +1125,7 @@ int lustre_msg_get_status(struct lustre_msg *msg) return pb->pb_status; CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* Fall through */ + /* fallthrough */ default: /* status might be printed in debug code while message * uninitialized */ @@ -1214,11 +1235,12 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg) } } -__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1231,11 +1253,12 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg) } } -__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1465,11 +1488,13 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) } } -void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(timeout >= 0); LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); pb->pb_timeout = timeout; return; @@ -1479,13 +1504,16 @@ void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) } } -void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(service_timeout >= 0); LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_service_time = service_time; + pb->pb_service_time = service_timeout; return; } default: @@ -1511,9 +1539,9 @@ void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); if (jobid != NULL) - memcpy(pb->pb_jobid, jobid, LUSTRE_JOBID_SIZE); + memcpy(pb->pb_jobid, jobid, sizeof(pb->pb_jobid)); else if (pb->pb_jobid[0] == '\0') - lustre_get_jobid(pb->pb_jobid); + lustre_get_jobid(pb->pb_jobid, sizeof(pb->pb_jobid)); return; } default: @@ -1618,39 +1646,40 @@ EXPORT_SYMBOL(do_set_info_async); /* byte flipping routines for all wire types declared in * lustre_idl.h implemented here. 
*/ -void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) -{ - __swab32s (&b->pb_type); - __swab32s (&b->pb_version); - __swab32s (&b->pb_opc); - __swab32s (&b->pb_status); - __swab64s (&b->pb_last_xid); - __swab16s (&b->pb_tag); - __swab64s (&b->pb_last_committed); - __swab64s (&b->pb_transno); - __swab32s (&b->pb_flags); - __swab32s (&b->pb_op_flags); - __swab32s (&b->pb_conn_cnt); - __swab32s (&b->pb_timeout); - __swab32s (&b->pb_service_time); - __swab32s (&b->pb_limit); - __swab64s (&b->pb_slv); - __swab64s (&b->pb_pre_versions[0]); - __swab64s (&b->pb_pre_versions[1]); - __swab64s (&b->pb_pre_versions[2]); - __swab64s (&b->pb_pre_versions[3]); - __swab64s(&b->pb_mbits); - CLASSERT(offsetof(typeof(*b), pb_padding0) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding1) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding64_0) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding64_1) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding64_2) != 0); +void lustre_swab_ptlrpc_body(struct ptlrpc_body *body) +{ + __swab32s(&body->pb_type); + __swab32s(&body->pb_version); + __swab32s(&body->pb_opc); + __swab32s(&body->pb_status); + __swab64s(&body->pb_last_xid); + __swab16s(&body->pb_tag); + CLASSERT(offsetof(typeof(*body), pb_padding0) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding1) != 0); + __swab64s(&body->pb_last_committed); + __swab64s(&body->pb_transno); + __swab32s(&body->pb_flags); + __swab32s(&body->pb_op_flags); + __swab32s(&body->pb_conn_cnt); + __swab32s(&body->pb_timeout); + __swab32s(&body->pb_service_time); + __swab32s(&body->pb_limit); + __swab64s(&body->pb_slv); + __swab64s(&body->pb_pre_versions[0]); + __swab64s(&body->pb_pre_versions[1]); + __swab64s(&body->pb_pre_versions[2]); + __swab64s(&body->pb_pre_versions[3]); + __swab64s(&body->pb_mbits); + CLASSERT(offsetof(typeof(*body), pb_padding64_0) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding64_1) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding64_2) != 0); /* While we need to maintain compatibility between * clients and servers without ptlrpc_body_v2 (< 2.3) * do not swab any fields beyond pb_jobid, as we are * using this swab function for both ptlrpc_body * and ptlrpc_body_v2. 
*/ - CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); + /* pb_jobid is an ASCII string and should not be swabbed */ + CLASSERT(offsetof(typeof(*body), pb_jobid) != 0); } void lustre_swab_connect(struct obd_connect_data *ocd) @@ -1730,7 +1759,7 @@ void lustre_swab_obdo (struct obdo *o) __swab32s(&o->o_stripe_idx); __swab32s(&o->o_parent_ver); lustre_swab_ost_layout(&o->o_layout); - CLASSERT(offsetof(typeof(*o), o_padding_3) != 0); + __swab32s(&o->o_layout_version); __swab32s(&o->o_uid_h); __swab32s(&o->o_gid_h); __swab64s(&o->o_data_version); @@ -1744,26 +1773,26 @@ EXPORT_SYMBOL(lustre_swab_obdo); void lustre_swab_obd_statfs (struct obd_statfs *os) { - __swab64s (&os->os_type); - __swab64s (&os->os_blocks); - __swab64s (&os->os_bfree); - __swab64s (&os->os_bavail); - __swab64s (&os->os_files); - __swab64s (&os->os_ffree); - /* no need to swab os_fsid */ - __swab32s (&os->os_bsize); - __swab32s (&os->os_namelen); - __swab64s (&os->os_maxbytes); - __swab32s (&os->os_state); - CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); - CLASSERT(offsetof(typeof(*os), os_spare2) != 0); - CLASSERT(offsetof(typeof(*os), os_spare3) != 0); - CLASSERT(offsetof(typeof(*os), os_spare4) != 0); - CLASSERT(offsetof(typeof(*os), os_spare5) != 0); - CLASSERT(offsetof(typeof(*os), os_spare6) != 0); - CLASSERT(offsetof(typeof(*os), os_spare7) != 0); - CLASSERT(offsetof(typeof(*os), os_spare8) != 0); - CLASSERT(offsetof(typeof(*os), os_spare9) != 0); + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + __swab32s(&os->os_fprecreated); + __swab32s(&os->os_granted); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); } void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) @@ -1868,7 +1897,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab64s(&b->mbo_atime); __swab64s(&b->mbo_ctime); __swab64s(&b->mbo_blocks); - __swab64s(&b->mbo_ioepoch); + __swab64s(&b->mbo_version); __swab64s(&b->mbo_t_state); __swab32s(&b->mbo_fsuid); __swab32s(&b->mbo_fsgid); @@ -1879,7 +1908,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab32s(&b->mbo_flags); __swab32s(&b->mbo_rdev); __swab32s(&b->mbo_nlink); - CLASSERT(offsetof(typeof(*b), mbo_unused2) != 0); + __swab32s(&b->mbo_layout_gen); __swab32s(&b->mbo_suppgid); __swab32s(&b->mbo_eadatasize); __swab32s(&b->mbo_aclsize); @@ -1888,8 +1917,8 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab32s(&b->mbo_uid_h); __swab32s(&b->mbo_gid_h); __swab32s(&b->mbo_projid); - CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0); - CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0); + __swab64s(&b->mbo_dom_size); + __swab64s(&b->mbo_dom_blocks); CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0); @@ -1897,7 +1926,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) { - /* mio_handle is opaque */ + /* mio_open_handle is opaque */ CLASSERT(offsetof(typeof(*b), mio_unused1) != 
0); CLASSERT(offsetof(typeof(*b), mio_unused2) != 0); CLASSERT(offsetof(typeof(*b), mio_padding) != 0); @@ -1905,38 +1934,39 @@ void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) void lustre_swab_mgs_target_info(struct mgs_target_info *mti) { - int i; - __swab32s(&mti->mti_lustre_ver); - __swab32s(&mti->mti_stripe_index); - __swab32s(&mti->mti_config_ver); - __swab32s(&mti->mti_flags); - __swab32s(&mti->mti_instance); - __swab32s(&mti->mti_nid_count); - CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); - for (i = 0; i < MTI_NIDS_MAX; i++) - __swab64s(&mti->mti_nids[i]); + int i; + + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); } void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) { __u8 i; - __swab64s(&entry->mne_version); - __swab32s(&entry->mne_instance); - __swab32s(&entry->mne_index); - __swab32s(&entry->mne_length); - - /* mne_nid_(count|type) must be one byte size because we're gonna - * access it w/o swapping. */ - CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); - CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); - - /* remove this assertion if ipv6 is supported. */ - LASSERT(entry->mne_nid_type == 0); - for (i = 0; i < entry->mne_nid_count; i++) { - CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); - __swab64s(&entry->u.nids[i]); - } + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. 
*/ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } } EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); @@ -2003,21 +2033,32 @@ static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) __swab32s(&fm_extent->fe_device); } +static void lustre_swab_fiemap_hdr(struct fiemap *fiemap) +{ + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); +} + void lustre_swab_fiemap(struct fiemap *fiemap) { __u32 i; - __swab64s(&fiemap->fm_start); - __swab64s(&fiemap->fm_length); - __swab32s(&fiemap->fm_flags); - __swab32s(&fiemap->fm_mapped_extents); - __swab32s(&fiemap->fm_extent_count); - __swab32s(&fiemap->fm_reserved); + lustre_swab_fiemap_hdr(fiemap); for (i = 0; i < fiemap->fm_mapped_extents; i++) lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); } +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info) +{ + lustre_swab_obdo(&fiemap_info->lfik_oa); + lustre_swab_fiemap_hdr(&fiemap_info->lfik_fiemap); +} + void lustre_swab_idx_info(struct idx_info *ii) { __swab32s(&ii->ii_magic); @@ -2065,6 +2106,7 @@ void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) __swab32s(&rr->rr_flags); __swab32s(&rr->rr_flags_h); __swab32s(&rr->rr_umask); + __swab16s(&rr->rr_mirror_id); CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); }; @@ -2119,14 +2161,37 @@ void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) } EXPORT_SYMBOL(lustre_swab_lmv_mds_md); +void lustre_swab_lmv_user_md_objects(struct lmv_user_mds_data *lmd, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) + __swab32s(&(lmd[i].lum_mds)); +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md_objects); + + void lustre_swab_lmv_user_md(struct lmv_user_md *lum) { + __u32 count = lum->lum_stripe_count; + __swab32s(&lum->lum_magic); __swab32s(&lum->lum_stripe_count); __swab32s(&lum->lum_stripe_offset); __swab32s(&lum->lum_hash_type); __swab32s(&lum->lum_type); CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); + switch (lum->lum_magic) { + case LMV_USER_MAGIC_SPECIFIC: + count = lum->lum_stripe_count; + /* fallthrough */ + case __swab32(LMV_USER_MAGIC_SPECIFIC): + lustre_swab_lmv_user_md_objects(lum->lum_objects, count); + break; + default: + break; + } } EXPORT_SYMBOL(lustre_swab_lmv_user_md); @@ -2186,6 +2251,7 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); + CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count); for (i = 0; i < comp_v1->lcm_entry_count; i++) { struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; @@ -2194,6 +2260,9 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, CDEBUG(lvl, "\tentry %d:\n", i); CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); + if (ent->lcme_flags & LCME_FL_NOSYNC) + CDEBUG(lvl, "\tlcme_timestamp: %llu\n", + ent->lcme_timestamp); CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", ent->lcme_extent.e_start); CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", @@ -2267,6 +2336,7 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) __swab32s(&lum->lcm_layout_gen); __swab16s(&lum->lcm_flags); 
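For context on the hunks above: every lustre_swab_*() helper follows the same in-place byte-swap pattern. Each fixed-width wire field is swapped with the __swab16s/__swab32s/__swab64s helpers, opaque or ASCII fields (such as pb_jobid) and padding are deliberately left alone, and a compile-time assertion records the layout assumption (Lustre's CLASSERT; BUILD_BUG_ON is the generic kernel equivalent). The following is a minimal sketch of that pattern on a hypothetical struct; "demo_wire" and demo_swab_wire() are invented names for illustration only, not part of this patch.

/*
 * Minimal sketch of the in-place swab pattern used by the
 * lustre_swab_*() helpers above. struct demo_wire and
 * demo_swab_wire() are hypothetical, illustration-only names.
 */
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/swab.h>
#include <linux/bug.h>

struct demo_wire {
        __u32 dw_magic;
        __u32 dw_flags;
        __u64 dw_id;
        __u32 dw_count;
        __u32 dw_padding;       /* reserved, always sent as zero */
        char  dw_name[16];      /* ASCII, never swabbed */
};

static void demo_swab_wire(struct demo_wire *dw)
{
        __swab32s(&dw->dw_magic);
        __swab32s(&dw->dw_flags);
        __swab64s(&dw->dw_id);
        __swab32s(&dw->dw_count);
        /* padding and the ASCII name carry no endian-sensitive data */
        BUILD_BUG_ON(offsetof(struct demo_wire, dw_padding) == 0);
}

The patch's own CLASSERT(offsetof(...) != 0) lines express the same idea: the reserved fields must exist at a non-zero offset in the wire layout, and they are intentionally skipped when swabbing.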
__swab16s(&lum->lcm_entry_count); + __swab16s(&lum->lcm_mirror_count); CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0); CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0); @@ -2281,11 +2351,13 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) } __swab32s(&ent->lcme_id); __swab32s(&ent->lcme_flags); + __swab64s(&ent->lcme_timestamp); __swab64s(&ent->lcme_extent.e_start); __swab64s(&ent->lcme_extent.e_end); __swab32s(&ent->lcme_offset); __swab32s(&ent->lcme_size); - CLASSERT(offsetof(typeof(*ent), lcme_padding) != 0); + __swab32s(&ent->lcme_layout_gen); + CLASSERT(offsetof(typeof(*ent), lcme_padding_1) != 0); v1 = (struct lov_user_md_v1 *)((char *)lum + off); stripe_count = v1->lmm_stripe_count; @@ -2314,20 +2386,6 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) } EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1); -void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) -{ - ENTRY; - CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); - __swab32s(&lmm->lmm_magic); - __swab32s(&lmm->lmm_pattern); - lustre_swab_lmm_oi(&lmm->lmm_oi); - __swab32s(&lmm->lmm_stripe_size); - __swab16s(&lmm->lmm_stripe_count); - __swab16s(&lmm->lmm_layout_gen); - EXIT; -} -EXPORT_SYMBOL(lustre_swab_lov_mds_md); - void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, int stripe_count) { @@ -2342,6 +2400,83 @@ void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, } EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size) +{ + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + __u16 stripe_count; + ENTRY; + + CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); + switch (lum->lmm_magic) { + case __swab32(LOV_MAGIC_V1): + case LOV_USER_MAGIC_V1: + { + v1 = (struct lov_user_md_v1 *)lum; + stripe_count = v1->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V1) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + + break; + } + case __swab32(LOV_MAGIC_V3): + case LOV_USER_MAGIC_V3: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V3) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + break; + } + case __swab32(LOV_USER_MAGIC_SPECIFIC): + case LOV_USER_MAGIC_SPECIFIC: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + lustre_swab_lov_user_md_objects(v3->lmm_objects, stripe_count); + break; + } + case __swab32(LOV_MAGIC_COMP_V1): + case LOV_USER_MAGIC_COMP_V1: + lustre_swab_lov_comp_md_v1((struct lov_comp_md_v1 *)lum); + break; + default: + CDEBUG(D_IOCTL, "Invalid LOV magic %08x\n", lum->lmm_magic); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) { int i; @@ -2435,54 +2570,51 @@ void dump_obdo(struct obdo *oa) if (valid & OBD_MD_FLFID) CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = 
%#llx\n", oa->o_parent_seq); - if (valid & OBD_MD_FLSIZE) + if (valid & OBD_MD_FLSIZE) CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); - if (valid & OBD_MD_FLMTIME) + if (valid & OBD_MD_FLMTIME) CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); - if (valid & OBD_MD_FLATIME) + if (valid & OBD_MD_FLATIME) CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); - if (valid & OBD_MD_FLCTIME) + if (valid & OBD_MD_FLCTIME) CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); - if (valid & OBD_MD_FLGRANT) + if (valid & OBD_MD_FLGRANT) CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); - if (valid & OBD_MD_FLBLKSZ) - CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); - if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) - CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", - oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | - (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); - if (valid & OBD_MD_FLFLAGS) - CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); - if (valid & OBD_MD_FLNLINK) - CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); - else if (valid & OBD_MD_FLCKSUM) - CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", - oa->o_nlink); - if (valid & OBD_MD_FLGENER) - CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", - oa->o_parent_oid); - if (valid & OBD_MD_FLEPOCH) - CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", - oa->o_ioepoch); - if (valid & OBD_MD_FLFID) { - CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", - oa->o_stripe_idx); - CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", - oa->o_parent_ver); - } - if (valid & OBD_MD_FLHANDLE) + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLPARENT) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", - oa->o_handle.cookie); + oa->o_handle.cookie); } void dump_ost_body(struct ost_body *ob) @@ -2629,12 +2761,17 @@ void lustre_swab_hsm_user_item(struct hsm_user_item *hui) lustre_swab_hsm_extent(&hui->hui_extent); } +void lustre_swab_lu_extent(struct lu_extent *le) +{ + __swab64s(&le->e_start); + __swab64s(&le->e_end); +} + void lustre_swab_layout_intent(struct layout_intent *li) { __swab32s(&li->li_opc); __swab32s(&li->li_flags); - __swab64s(&li->li_start); - __swab64s(&li->li_end); + lustre_swab_lu_extent(&li->li_extent); } void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) @@ -2746,6 +2883,19 @@ void lustre_swab_close_data(struct close_data *cd) __swab64s(&cd->cd_data_version); } +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync) +{ + int i; + + __swab32s(&resync->resync_count); + /* after swab, resync_count must in CPU endian */ + if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + for (i = 0; i < resync->resync_count; i++) + __swab32s(&resync->resync_ids_inline[i]); + } +} +EXPORT_SYMBOL(lustre_swab_close_data_resync_done); + void lustre_swab_lfsck_request(struct lfsck_request *lr) { __swab32s(&lr->lr_event); @@ -2797,6 +2947,18 @@ void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) } EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + __swab32s(&ent->loe_rec.lor_layout_version); + __swab32s(&ent->loe_rec.lor_range); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_1) != 0); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_2) != 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v3); + void lustre_swab_ladvise(struct lu_ladvise *ladvise) { __swab16s(&ladvise->lla_advice); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c index 51e17e2c2b459..d0c8fa7a1e6ac 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c @@ -44,6 +44,8 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, int mdidx) { + unsigned int start = desc->bd_mds_off[mdidx]; + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); LASSERT(mdidx < desc->bd_md_max_brw); @@ -51,23 +53,34 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | LNET_MD_PHYS))); - md->length = max(0, 
desc->bd_iov_count - mdidx * LNET_MAX_IOV); - md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); + /* just send a lnet header */ + if (mdidx >= desc->bd_md_count) { + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + md->options |= LNET_MD_KIOV; + else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) + md->options |= LNET_MD_IOVEC; + md->length = 0; + md->start = NULL; + return; + } + + if (mdidx == (desc->bd_md_count - 1)) + md->length = desc->bd_iov_count - start; + else + md->length = desc->bd_mds_off[mdidx + 1] - start; if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) { md->options |= LNET_MD_KIOV; if (GET_ENC_KIOV(desc)) - md->start = &BD_GET_ENC_KIOV(desc, mdidx * - LNET_MAX_IOV); + md->start = &BD_GET_ENC_KIOV(desc, start); else - md->start = &BD_GET_KIOV(desc, mdidx * LNET_MAX_IOV); + md->start = &BD_GET_KIOV(desc, start); } else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) { md->options |= LNET_MD_IOVEC; if (GET_ENC_KVEC(desc)) - md->start = &BD_GET_ENC_KVEC(desc, mdidx * - LNET_MAX_IOV); + md->start = &BD_GET_ENC_KVEC(desc, start); else - md->start = &BD_GET_KVEC(desc, mdidx * LNET_MAX_IOV); + md->start = &BD_GET_KVEC(desc, start); } } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c index 15fb0965241eb..d965c0838d8d5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c @@ -37,6 +37,7 @@ #define DEBUG_SUBSYSTEM S_RPC #include +#include #include #include #include "ptlrpc_internal.h" @@ -48,8 +49,6 @@ MODULE_PARM_DESC(suppress_pings, "Suppress pings"); struct mutex pinger_mutex; static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports); -static struct list_head timeout_list = - LIST_HEAD_INIT(timeout_list); int ptlrpc_pinger_suppress_pings() { @@ -91,11 +90,51 @@ int ptlrpc_obd_ping(struct obd_device *obd) } EXPORT_SYMBOL(ptlrpc_obd_ping); +static bool ptlrpc_check_import_is_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + time64_t now; + + if (!imp->imp_idle_timeout) + return false; + + if (atomic_read(&imp->imp_reqs) > 0) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + now = ktime_get_real_seconds(); + if (now - imp->imp_last_reply_time < imp->imp_idle_timeout) + return false; + + return true; +} + +static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + + if (imp->imp_state == LUSTRE_IMP_DISCON) { + time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = ktime_get_seconds() + time; +#endif /* CONFIG_LUSTRE_FS_PINGER */ +} + static int ptlrpc_ping(struct obd_import *imp) { struct ptlrpc_request *req; ENTRY; + if (ptlrpc_check_import_is_idle(imp)) + RETURN(ptlrpc_disconnect_and_idle_import(imp)); + req = ptlrpc_prep_ping(imp); if (req == NULL) { CERROR("OOM trying to ping %s->%s\n", @@ -106,28 +145,20 @@ static int ptlrpc_ping(struct obd_import *imp) DEBUG_REQ(D_INFO, req, "pinging %s->%s", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* Updating imp_next_ping early, it allows pinger_check_timeout to + * see an actual time for next awake. request_out_callback update + * happens at another thread, and ptlrpc_pinger_main may sleep + * already. 
+ */ + ptlrpc_update_next_ping(imp, 0); ptlrpcd_add_req(req); RETURN(0); } -static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) -{ -#ifdef ENABLE_PINGER - int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; - if (imp->imp_state == LUSTRE_IMP_DISCON) { - int dtime = max_t(int, CONNECTION_SWITCH_MIN, - AT_OFF ? 0 : - at_get(&imp->imp_at.iat_net_latency)); - time = min(time, dtime); - } - imp->imp_next_ping = cfs_time_shift(time); -#endif /* ENABLE_PINGER */ -} - void ptlrpc_ping_import_soon(struct obd_import *imp) { - imp->imp_next_ping = cfs_time_current(); + imp->imp_next_ping = ktime_get_seconds(); } static inline int imp_is_deactive(struct obd_import *imp) @@ -136,34 +167,36 @@ static inline int imp_is_deactive(struct obd_import *imp) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); } -static inline int ptlrpc_next_reconnect(struct obd_import *imp) +static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp) { - if (imp->imp_server_timeout) - return cfs_time_shift(obd_timeout / 2); - else - return cfs_time_shift(obd_timeout); + return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT; } -static cfs_duration_t pinger_check_timeout(cfs_time_t time) +static s32 pinger_check_timeout(time64_t time) { - struct timeout_item *item; - cfs_time_t timeout = PING_INTERVAL; + s32 timeout = PING_INTERVAL; + s32 next_timeout; + time64_t now; + struct list_head *iter; + struct obd_import *imp; - /* This list is sorted in increasing timeout order */ mutex_lock(&pinger_mutex); - list_for_each_entry(item, &timeout_list, ti_chain) { - int ti_timeout = item->ti_timeout; - if (timeout > ti_timeout) - timeout = ti_timeout; - break; + now = ktime_get_seconds(); + /* Process imports to find a nearest next ping */ + list_for_each(iter, &pinger_imports) { + imp = list_entry(iter, struct obd_import, imp_pinger_chain); + if (!imp->imp_pingable || imp->imp_next_ping < now) + continue; + next_timeout = imp->imp_next_ping - now; + /* make sure imp_next_ping in the future from time */ + if (next_timeout > (now - time) && timeout > next_timeout) + timeout = next_timeout; } mutex_unlock(&pinger_mutex); - return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), - cfs_time_current()); + return timeout - (now - time); } - static bool ir_up; void ptlrpc_pinger_ir_up(void) @@ -181,7 +214,7 @@ void ptlrpc_pinger_ir_down(void) EXPORT_SYMBOL(ptlrpc_pinger_ir_down); static void ptlrpc_pinger_process_import(struct obd_import *imp, - unsigned long this_ping) + time64_t this_ping) { int level; int force; @@ -200,16 +233,13 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, imp->imp_force_verify = 0; - if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && - !force) { + if (imp->imp_next_ping - 5 >= this_ping && !force) { spin_unlock(&imp->imp_lock); return; } imp->imp_force_next_verify = 0; - spin_unlock(&imp->imp_lock); - CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, "%s->%s: level %s/%u " "force %u force_next %u deactive %u pingable %u suppress %u\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), @@ -219,130 +249,91 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { /* wait for a while before trying recovery again */ imp->imp_next_ping = ptlrpc_next_reconnect(imp); + spin_unlock(&imp->imp_lock); if (!imp->imp_no_pinger_recover) ptlrpc_initiate_recovery(imp); - } else if (level != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov || - imp_is_deactive(imp)) { + } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { CDEBUG(D_HA, "%s->%s: not pinging (in recovery " "or recovery disabled: %s)\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(level)); - if (force) { - spin_lock(&imp->imp_lock); + if (force) imp->imp_force_verify = 1; - spin_unlock(&imp->imp_lock); - } + spin_unlock(&imp->imp_lock); } else if ((imp->imp_pingable && !suppress) || force_next || force) { + spin_unlock(&imp->imp_lock); ptlrpc_ping(imp); + } else { + spin_unlock(&imp->imp_lock); } } -static int ptlrpc_pinger_main(void *arg) -{ - struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; - ENTRY; +static struct workqueue_struct *pinger_wq; +static void ptlrpc_pinger_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); - /* Record that the thread is running */ - thread_set_flags(thread, SVC_RUNNING); - wake_up(&thread->t_ctl_waitq); +static void ptlrpc_pinger_main(struct work_struct *ws) +{ + time64_t this_ping, time_after_ping; + s32 time_to_next_wake; + struct obd_import *imp; + struct list_head *iter; - /* And now, loop forever, pinging as needed. */ - while (1) { - cfs_time_t this_ping = cfs_time_current(); - struct l_wait_info lwi; - cfs_duration_t time_to_next_wake; - struct timeout_item *item; - struct list_head *iter; + do { + this_ping = ktime_get_seconds(); mutex_lock(&pinger_mutex); - list_for_each_entry(item, &timeout_list, ti_chain) - item->ti_cb(item, item->ti_cb_data); list_for_each(iter, &pinger_imports) { - struct obd_import *imp = list_entry(iter, - struct obd_import, - imp_pinger_chain); - - ptlrpc_pinger_process_import(imp, this_ping); - /* obd_timeout might have changed */ - if (imp->imp_pingable && imp->imp_next_ping && - cfs_time_after(imp->imp_next_ping, - cfs_time_add(this_ping, - cfs_time_seconds(PING_INTERVAL)))) - ptlrpc_update_next_ping(imp, 0); - } + imp = list_entry(iter, struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + imp->imp_next_ping > this_ping + PING_INTERVAL) + ptlrpc_update_next_ping(imp, 0); + } mutex_unlock(&pinger_mutex); - /* update memory usage info */ - obd_update_maxusage(); - - /* Wait until the next ping time, or until we're stopped. */ - time_to_next_wake = pinger_check_timeout(this_ping); - /* The ping sent by ptlrpc_send_rpc may get sent out - say .01 second after this. - ptlrpc_pinger_sending_on_import will then set the - next ping time to next_ping + .01 sec, which means - we will SKIP the next ping at next_ping, and the - ping will get sent 2 timeouts from now! Beware. 
*/ - CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (%ld)\n", - time_to_next_wake, - cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL))); - if (time_to_next_wake > 0) { - lwi = LWI_TIMEOUT(max_t(cfs_duration_t, - time_to_next_wake, - cfs_time_seconds(1)), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopping(thread) || - thread_is_event(thread), - &lwi); - if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { - EXIT; - break; - } else { - /* woken after adding import to reset timer */ - thread_test_and_clear_flags(thread, SVC_EVENT); - } - } - } - - thread_set_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); - return 0; + time_after_ping = ktime_get_seconds(); + /* update memory usage info */ + obd_update_maxusage(); + + if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL) + CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n", + this_ping, time_after_ping, ktime_get_seconds()); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + * say .01 second after this. + * ptlrpc_pinger_sending_on_import will then set the + * next ping time to next_ping + .01 sec, which means + * we will SKIP the next ping at next_ping, and the + * ping will get sent 2 timeouts from now! Beware. */ + CDEBUG(D_INFO, "next wakeup in %d (%lld)\n", + time_to_next_wake, this_ping + PING_INTERVAL); + } while (time_to_next_wake <= 0); + + queue_delayed_work(pinger_wq, &ping_work, + cfs_time_seconds(max(time_to_next_wake, 1))); } -static struct ptlrpc_thread pinger_thread; - int ptlrpc_start_pinger(void) { - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - int rc; -#ifndef ENABLE_PINGER - return 0; -#endif - ENTRY; - - if (!thread_is_init(&pinger_thread) && - !thread_is_stopped(&pinger_thread)) - RETURN(-EALREADY); - - init_waitqueue_head(&pinger_thread.t_ctl_waitq); - - strcpy(pinger_thread.t_name, "ll_ping"); +#ifdef ENABLE_PINGER + if (pinger_wq) + return -EALREADY; - task = kthread_run(ptlrpc_pinger_main, &pinger_thread, - pinger_thread.t_name); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("cannot start pinger thread: rc = %d\n", rc); - RETURN(rc); + pinger_wq = alloc_workqueue("ptlrpc_pinger", 0, 1); + if (!pinger_wq) { + CERROR("cannot start pinger workqueue\n"); + return -ENOMEM; } - l_wait_event(pinger_thread.t_ctl_waitq, - thread_is_running(&pinger_thread), &lwi); + queue_delayed_work(pinger_wq, &ping_work, 0); if (suppress_pings) CWARN("Pings will be suppressed at the request of the " @@ -350,32 +341,21 @@ int ptlrpc_start_pinger(void) "additional requirements described in the manual. 
" "(Search for the \"suppress_pings\" kernel module " "parameter.)\n"); - - RETURN(0); +#endif + return 0; } -int ptlrpc_pinger_remove_timeouts(void); - int ptlrpc_stop_pinger(void) { - struct l_wait_info lwi = { 0 }; -#ifndef ENABLE_PINGER - return 0; -#endif - ENTRY; - - if (thread_is_init(&pinger_thread) || - thread_is_stopped(&pinger_thread)) - RETURN(-EALREADY); - - ptlrpc_pinger_remove_timeouts(); - - thread_set_flags(&pinger_thread, SVC_STOPPING); - wake_up(&pinger_thread.t_ctl_waitq); +#ifdef ENABLE_PINGER + if (!pinger_wq) + return -EALREADY; - l_wait_event(pinger_thread.t_ctl_waitq, - thread_is_stopped(&pinger_thread), &lwi); - RETURN(0); + cancel_delayed_work_sync(&ping_work); + destroy_workqueue(pinger_wq); + pinger_wq = NULL; +#endif + return 0; } void ptlrpc_pinger_sending_on_import(struct obd_import *imp) @@ -440,129 +420,10 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) } EXPORT_SYMBOL(ptlrpc_pinger_del_import); -/** - * Register a timeout callback to the pinger list, and the callback will - * be called when timeout happens. - */ -static struct timeout_item *ptlrpc_new_timeout(int time, - enum timeout_event event, - timeout_cb_t cb, void *data) -{ - struct timeout_item *ti; - - OBD_ALLOC_PTR(ti); - if (!ti) - return(NULL); - - INIT_LIST_HEAD(&ti->ti_obd_list); - INIT_LIST_HEAD(&ti->ti_chain); - ti->ti_timeout = time; - ti->ti_event = event; - ti->ti_cb = cb; - ti->ti_cb_data = data; - - return ti; -} - -/** - * Register timeout event on the the pinger thread. - * Note: the timeout list is an sorted list with increased timeout value. - */ -static struct timeout_item* -ptlrpc_pinger_register_timeout(int time, enum timeout_event event, - timeout_cb_t cb, void *data) -{ - struct timeout_item *item, *tmp; - - LASSERT(mutex_is_locked(&pinger_mutex)); - - list_for_each_entry(item, &timeout_list, ti_chain) - if (item->ti_event == event) - goto out; - - item = ptlrpc_new_timeout(time, event, cb, data); - if (item) { - list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { - if (tmp->ti_timeout < time) { - list_add(&item->ti_chain, &tmp->ti_chain); - goto out; - } - } - list_add(&item->ti_chain, &timeout_list); - } -out: - return item; -} - -/* Add a client_obd to the timeout event list, when timeout(@time) - * happens, the callback(@cb) will be called. 
- */ -int ptlrpc_add_timeout_client(int time, enum timeout_event event, - timeout_cb_t cb, void *data, - struct list_head *obd_list) -{ - struct timeout_item *ti; - - mutex_lock(&pinger_mutex); - ti = ptlrpc_pinger_register_timeout(time, event, cb, data); - if (!ti) { - mutex_unlock(&pinger_mutex); - return (-EINVAL); - } - list_add(obd_list, &ti->ti_obd_list); - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_add_timeout_client); - -int ptlrpc_del_timeout_client(struct list_head *obd_list, - enum timeout_event event) -{ - struct timeout_item *ti = NULL, *item; - - if (list_empty(obd_list)) - return 0; - mutex_lock(&pinger_mutex); - list_del_init(obd_list); - /** - * If there are no obd attached to the timeout event - * list, remove this timeout event from the pinger - */ - list_for_each_entry(item, &timeout_list, ti_chain) { - if (item->ti_event == event) { - ti = item; - break; - } - } - LASSERTF(ti != NULL, "ti is NULL !\n"); - if (list_empty(&ti->ti_obd_list)) { - list_del(&ti->ti_chain); - OBD_FREE_PTR(ti); - } - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_del_timeout_client); - -int ptlrpc_pinger_remove_timeouts(void) -{ - struct timeout_item *item, *tmp; - - mutex_lock(&pinger_mutex); - list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { - LASSERT(list_empty(&item->ti_obd_list)); - list_del(&item->ti_chain); - OBD_FREE_PTR(item); - } - mutex_unlock(&pinger_mutex); - return 0; -} - void ptlrpc_pinger_wake_up() { #ifdef ENABLE_PINGER - thread_add_flags(&pinger_thread, SVC_EVENT); - wake_up(&pinger_thread.t_ctl_waitq); + mod_delayed_work(pinger_wq, &ping_work, 0); #endif } @@ -600,12 +461,12 @@ int ping_evictor_wake(struct obd_export *exp) static int ping_evictor_main(void *arg) { - struct obd_device *obd; - struct obd_export *exp; - struct l_wait_info lwi = { 0 }; - time_t expire_time; - ENTRY; + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time64_t expire_time; + ENTRY; unshare_fs_struct(); CDEBUG(D_HA, "Starting Ping Evictor\n"); @@ -626,9 +487,9 @@ static int ping_evictor_main(void *arg) obd_evict_list); spin_unlock(&pet_lock); - expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; + expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT; - CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n", obd->obd_name, expire_time); /* Exports can't be deleted out of the list while we hold @@ -644,19 +505,19 @@ static int ping_evictor_main(void *arg) class_export_get(exp); spin_unlock(&obd->obd_dev_lock); LCONSOLE_WARN("%s: haven't heard from client %s" - " (at %s) in %ld seconds. I think" + " (at %s) in %lld seconds. I think" " it's dead, and I am evicting" - " it. exp %p, cur %ld expire %ld" - " last %ld\n", + " it. 
exp %p, cur %lld expire %lld" + " last %lld\n", obd->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), - (long)(cfs_time_current_sec() - - exp->exp_last_request_time), - exp, (long)cfs_time_current_sec(), - (long)expire_time, - (long)exp->exp_last_request_time); - CDEBUG(D_HA, "Last request was at %ld\n", + ktime_get_real_seconds() - + exp->exp_last_request_time, + exp, ktime_get_real_seconds(), + expire_time, + exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %lld\n", exp->exp_last_request_time); class_fail_export(exp); class_export_put(exp); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h index cfd1de5bb3d45..41b9a268d52a6 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -69,7 +69,7 @@ int ptlrpcd_start(struct ptlrpcd_ctl *pc); /* client.c */ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time); + timeout_t service_timeout); struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, enum ptlrpc_bulk_op_type type, unsigned portal, @@ -83,7 +83,7 @@ void ptlrpc_init_xid(void); void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, struct ptlrpc_request *req); int ptlrpc_expired_set(void *data); -int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *); void ptlrpc_resend_req(struct ptlrpc_request *request); void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req); void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); @@ -97,7 +97,8 @@ void ptlrpc_exit_portals(void); void ptlrpc_request_handle_notconn(struct ptlrpc_request *); void lustre_assert_wire_constants(void); int ptlrpc_import_in_recovery(struct obd_import *imp); -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt, + bool invalid); void ptlrpc_handle_failed_import(struct obd_import *imp); int ptlrpc_replay_next(struct obd_import *imp, int *inflight); void ptlrpc_initiate_recovery(struct obd_import *imp); @@ -105,15 +106,18 @@ void ptlrpc_initiate_recovery(struct obd_import *imp); int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc); +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc); + +void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry, + struct ptlrpc_service *svc); #ifdef CONFIG_PROC_FS -void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, - struct ptlrpc_service *svc); void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, long q_usec, long work_usec); #else -#define ptlrpc_lprocfs_register_service(params...) do{}while(0) #define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) #define ptlrpc_lprocfs_rpc_sent(params...) 
do{}while(0) #define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c index 0532c4d22d8bd..b98d082660628 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -212,7 +212,7 @@ void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) LASSERT(req->rq_phase == RQ_PHASE_NEW); req->rq_set = new; - req->rq_queued_time = cfs_time_current(); + req->rq_queued_time = ktime_get_seconds(); } spin_lock(&new->set_new_req_lock); @@ -476,7 +476,7 @@ static int ptlrpcd(void *arg) */ do { struct l_wait_info lwi; - int timeout; + time64_t timeout; timeout = ptlrpc_set_next_timeout(set); lwi = LWI_TIMEOUT(cfs_time_seconds(timeout), @@ -503,11 +503,11 @@ static int ptlrpcd(void *arg) */ } while (exit < 2); - /* - * Wait for inflight requests to drain. - */ + /* + * Wait for inflight requests to drain. + */ if (!list_empty(&set->set_requests)) - ptlrpc_set_wait(set); + ptlrpc_set_wait(&env, set); lu_context_fini(&env.le_ctx); lu_context_fini(env.le_ses); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c index aacb929beae23..c923ab9386901 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -228,30 +228,22 @@ void ptlrpc_wake_delayed(struct obd_import *imp) void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) { - struct obd_import *imp = failed_req->rq_import; - ENTRY; + struct obd_import *imp = failed_req->rq_import; + int conn = lustre_msg_get_conn_cnt(failed_req->rq_reqmsg); + ENTRY; - CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - - if (ptlrpc_set_import_discon(imp, - lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { - if (!imp->imp_replayable) { - CDEBUG(D_HA, "import %s@%s for %s not replayable, " - "auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import(imp); - } - /* to control recovery via lctl {disable|enable}_recovery */ - if (imp->imp_deactive == 0) - ptlrpc_connect_import(imp); - } + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, conn, true)) { + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } - /* Wait for recovery to complete and resend. If evicted, then - this request will be errored out later.*/ + /* Wait for recovery to complete and resend. 
If evicted, then + this request will be errored out later.*/ spin_lock(&failed_req->rq_lock); if (!failed_req->rq_no_resend) failed_req->rq_resend = 1; @@ -261,7 +253,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) } /** - * Administratively active/deactive a client. + * Administratively active/deactive a client. * This should only be called by the ioctl interface, currently * - the lctl deactivate and activate commands * - echo 0/1 >> /proc/osc/XXX/active @@ -320,21 +312,21 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) atomic_read(&imp->imp_inval_count)) rc = -EINVAL; spin_unlock(&imp->imp_lock); - if (rc) - GOTO(out, rc); + if (rc) + GOTO(out, rc); - /* force import to be disconnected. */ - ptlrpc_set_import_discon(imp, 0); + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0, false); - if (new_uuid) { - struct obd_uuid uuid; + if (new_uuid) { + struct obd_uuid uuid; - /* intruct import to use new uuid */ - obd_str2uuid(&uuid, new_uuid); - rc = import_set_conn_priority(imp, &uuid); - if (rc) - GOTO(out, rc); - } + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } /* Check if reconnect is already in progress */ spin_lock(&imp->imp_lock); @@ -354,9 +346,9 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) if (!async) { struct l_wait_info lwi; - int secs = cfs_time_seconds(obd_timeout); + long secs = cfs_time_seconds(obd_timeout); - CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + CDEBUG(D_HA, "%s: recovery started, waiting %lu seconds\n", obd2cli_tgt(imp->imp_obd), secs); lwi = LWI_TIMEOUT(secs, NULL, NULL); @@ -377,9 +369,8 @@ int ptlrpc_import_in_recovery(struct obd_import *imp) int in_recovery = 1; spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL || - imp->imp_state == LUSTRE_IMP_CLOSED || - imp->imp_state == LUSTRE_IMP_DISCON || + if (imp->imp_state <= LUSTRE_IMP_DISCON || + imp->imp_state >= LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov) in_recovery = 0; spin_unlock(&imp->imp_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c index 92d39ece51d16..78c07fcefec3a 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +43,10 @@ #include #include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif + #include #include #include @@ -54,6 +58,10 @@ #include "ptlrpc_internal.h" +static int send_sepol; +module_param(send_sepol, int, 0644); +MODULE_PARM_DESC(send_sepol, "Client sends SELinux policy status"); + /*********************************************** * policy registers * ***********************************************/ @@ -402,11 +410,12 @@ static int import_sec_validate_get(struct obd_import *imp, } *sec = sptlrpc_import_sec_ref(imp); - /* Only output an error when the import is still active */ if (*sec == NULL) { - if (list_empty(&imp->imp_zombie_chain)) + /* Only output an error when the import is still active */ + if (!test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&imp->imp_zombie_work))) CERROR("import %p (%s) with no sec\n", - imp, ptlrpc_import_state_name(imp->imp_state)); + imp, ptlrpc_import_state_name(imp->imp_state)); return -EACCES; } @@ -709,12 +718,12 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) sptlrpc_sec_put(sec); if (cli_ctx_is_eternal(ctx)) - RETURN(0); + RETURN(0); if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { - LASSERT(ctx->cc_ops->refresh); - ctx->cc_ops->refresh(ctx); - } + if (ctx->cc_ops->refresh) + ctx->cc_ops->refresh(ctx); + } LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); LASSERT(ctx->cc_ops->validate); @@ -836,7 +845,30 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) RETURN(rc); } - goto again; + goto again; +} + +/* Bring ptlrpc_sec context up-to-date */ +int sptlrpc_export_update_ctx(struct obd_export *exp) +{ + struct obd_import *imp = exp ? exp->exp_imp_reverse : NULL; + struct ptlrpc_sec *sec = NULL; + struct ptlrpc_cli_ctx *ctx = NULL; + int rc = 0; + + if (imp) + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + } + + if (ctx) { + if (ctx->cc_ops->refresh) + rc = ctx->cc_ops->refresh(ctx); + sptlrpc_cli_ctx_put(ctx, 1); + } + return rc; } /** @@ -1726,6 +1758,7 @@ void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) req->rq_repmsg = NULL; EXIT; } +EXPORT_SYMBOL(sptlrpc_cli_free_repbuf); int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, struct ptlrpc_cli_ctx *ctx) @@ -1747,6 +1780,128 @@ int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, return policy->sp_sops->install_rctx(imp, ctx); } +/* Get SELinux policy info from userspace */ +static int sepol_helper(struct obd_import *imp) +{ + char mtime_str[21] = { 0 }, mode_str[2] = { 0 }; + char *argv[] = { + [0] = "/usr/sbin/l_getsepol", + [1] = "-o", + [2] = NULL, /* obd type */ + [3] = "-n", + [4] = NULL, /* obd name */ + [5] = "-t", + [6] = mtime_str, /* policy mtime */ + [7] = "-m", + [8] = mode_str, /* enforcing mode */ + [9] = NULL + }; + char *envp[] = { + [0] = "HOME=/", + [1] = "PATH=/sbin:/usr/sbin", + [2] = NULL + }; + signed short ret; + int rc = 0; + + if (imp == NULL || imp->imp_obd == NULL || + imp->imp_obd->obd_type == NULL) { + rc = -EINVAL; + } else { + argv[2] = imp->imp_obd->obd_type->typ_name; + argv[4] = imp->imp_obd->obd_name; + spin_lock(&imp->imp_sec->ps_lock); + if (ktime_to_ns(imp->imp_sec->ps_sepol_mtime) == 0 && + imp->imp_sec->ps_sepol[0] == '\0') { + /* ps_sepol has not been initialized */ + argv[5] = NULL; + argv[7] = NULL; + } else { + time64_t mtime_ms; + + mtime_ms = ktime_to_ms(imp->imp_sec->ps_sepol_mtime); + snprintf(mtime_str, sizeof(mtime_str), "%lld", + mtime_ms 
/ MSEC_PER_SEC); + mode_str[0] = imp->imp_sec->ps_sepol[0]; + } + spin_unlock(&imp->imp_sec->ps_lock); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + rc = ret>>8; + } + + return rc; +} + +static inline int sptlrpc_sepol_needs_check(struct ptlrpc_sec *imp_sec) +{ + ktime_t checknext; + + if (send_sepol == 0 || !selinux_is_enabled()) + return 0; + + if (send_sepol == -1) + /* send_sepol == -1 means fetch sepol status every time */ + return 1; + + spin_lock(&imp_sec->ps_lock); + checknext = imp_sec->ps_sepol_checknext; + spin_unlock(&imp_sec->ps_lock); + + /* next check is too far in time, please update */ + if (ktime_after(checknext, + ktime_add(ktime_get(), ktime_set(send_sepol, 0)))) + goto setnext; + + if (ktime_before(ktime_get(), checknext)) + /* too early to fetch sepol status */ + return 0; + +setnext: + /* define new sepol_checknext time */ + spin_lock(&imp_sec->ps_lock); + imp_sec->ps_sepol_checknext = ktime_add(ktime_get(), + ktime_set(send_sepol, 0)); + spin_unlock(&imp_sec->ps_lock); + + return 1; +} + +int sptlrpc_get_sepol(struct ptlrpc_request *req) +{ + struct ptlrpc_sec *imp_sec = req->rq_import->imp_sec; + int rc = 0; + + ENTRY; + + (req->rq_sepol)[0] = '\0'; + +#ifndef HAVE_SELINUX + if (unlikely(send_sepol != 0)) + CDEBUG(D_SEC, "Client cannot report SELinux status, " + "it was not built against libselinux.\n"); + RETURN(0); +#endif + + if (send_sepol == 0 || !selinux_is_enabled()) + RETURN(0); + + if (imp_sec == NULL) + RETURN(-EINVAL); + + /* Retrieve SELinux status info */ + if (sptlrpc_sepol_needs_check(imp_sec)) + rc = sepol_helper(req->rq_import); + if (likely(rc == 0)) { + spin_lock(&imp_sec->ps_lock); + memcpy(req->rq_sepol, imp_sec->ps_sepol, + sizeof(req->rq_sepol)); + spin_unlock(&imp_sec->ps_lock); + } + + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_get_sepol); + /**************************************** * server side security * ****************************************/ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c index 42841f0c0aaf1..216c2f2a0820b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,7 @@ #define DEBUG_SUBSYSTEM S_SEC -#include +#include #include #include @@ -114,7 +114,7 @@ static struct ptlrpc_enc_page_pool { unsigned long epp_st_missings; /* # of cache missing */ unsigned long epp_st_lowfree; /* lowest free pages reached */ unsigned int epp_st_max_wqlen; /* highest waitqueue length */ - cfs_time_t epp_st_max_wait; /* in jeffies */ + ktime_t epp_st_max_wait; /* in nanoseconds */ unsigned long epp_st_outofmem; /* # of out of mem requests */ /* * pointers to pools, may be vmalloc'd @@ -143,8 +143,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "total pages: %lu\n" "total free: %lu\n" "idle index: %lu/100\n" - "last shrink: %lds\n" - "last access: %lds\n" + "last shrink: %llds\n" + "last access: %llds\n" "max pages reached: %lu\n" "grows: %u\n" "grows failure: %u\n" @@ -153,7 +153,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "cache missing: %lu\n" "low free mark: %lu\n" "max waitqueue depth: %u\n" - "max wait time: %ld/%lu\n" + "max wait time ms: %lld\n" "out of mem: %lu\n", cfs_totalram_pages(), PAGES_PER_POOL, page_pools.epp_max_pages, @@ -161,8 +161,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_total_pages, page_pools.epp_free_pages, page_pools.epp_idle_idx, - (long)(ktime_get_seconds() - page_pools.epp_last_shrink), - (long)(ktime_get_seconds() - page_pools.epp_last_access), + ktime_get_seconds() - page_pools.epp_last_shrink, + ktime_get_seconds() - page_pools.epp_last_access, page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, @@ -171,8 +171,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_st_missings, page_pools.epp_st_lowfree, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, - msecs_to_jiffies(MSEC_PER_SEC), + ktime_to_ms(page_pools.epp_st_max_wait), page_pools.epp_st_outofmem); spin_unlock(&page_pools.epp_lock); @@ -234,7 +233,7 @@ static unsigned long enc_pools_shrink_count(struct shrinker *s, * if no pool access for a long time, we consider it's fully idle. * a little race here is fine. */ - if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > CACHE_QUIESCENT_PERIOD)) { spin_lock(&page_pools.epp_lock); page_pools.epp_idle_idx = IDLE_IDX_MAX; @@ -265,7 +264,7 @@ static unsigned long enc_pools_shrink_scan(struct shrinker *s, (long)sc->nr_to_scan, page_pools.epp_free_pages); page_pools.epp_st_shrinks++; - page_pools.epp_last_shrink = ktime_get_real_seconds(); + page_pools.epp_last_shrink = ktime_get_seconds(); } spin_unlock(&page_pools.epp_lock); @@ -273,7 +272,7 @@ static unsigned long enc_pools_shrink_scan(struct shrinker *s, * if no pool access for a long time, we consider it's fully idle. * a little race here is fine. 
*/ - if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > CACHE_QUIESCENT_PERIOD)) { spin_lock(&page_pools.epp_lock); page_pools.epp_idle_idx = IDLE_IDX_MAX; @@ -542,11 +541,11 @@ EXPORT_SYMBOL(pool_is_at_full_capacity); int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) { wait_queue_entry_t waitlink; - unsigned long this_idle = -1; - cfs_time_t tick = 0; - long now; - int p_idx, g_idx; - int i; + unsigned long this_idle = -1; + u64 tick_ns = 0; + time64_t now; + int p_idx, g_idx; + int i; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); LASSERT(desc->bd_iov_count > 0); @@ -566,8 +565,8 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) page_pools.epp_st_access++; again: if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { - if (tick == 0) - tick = cfs_time_current(); + if (tick_ns == 0) + tick_ns = ktime_get_ns(); now = ktime_get_real_seconds(); @@ -625,12 +624,13 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) goto again; } - /* record max wait time */ - if (unlikely(tick != 0)) { - tick = cfs_time_current() - tick; - if (tick > page_pools.epp_st_max_wait) - page_pools.epp_st_max_wait = tick; - } + /* record max wait time */ + if (unlikely(tick_ns)) { + ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); + + if (ktime_after(tick, page_pools.epp_st_max_wait)) + page_pools.epp_st_max_wait = tick; + } /* proceed with rest of allocation */ page_pools.epp_free_pages -= desc->bd_iov_count; @@ -664,7 +664,7 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) this_idle) / (IDLE_IDX_WEIGHT + 1); - page_pools.epp_last_access = ktime_get_real_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); spin_unlock(&page_pools.epp_lock); return 0; @@ -789,8 +789,8 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_growing = 0; page_pools.epp_idle_idx = 0; - page_pools.epp_last_shrink = ktime_get_real_seconds(); - page_pools.epp_last_access = ktime_get_real_seconds(); + page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); spin_lock_init(&page_pools.epp_lock); page_pools.epp_total_pages = 0; @@ -804,7 +804,7 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_st_missings = 0; page_pools.epp_st_lowfree = 0; page_pools.epp_st_max_wqlen = 0; - page_pools.epp_st_max_wait = 0; + page_pools.epp_st_max_wait = ktime_set(0, 0); page_pools.epp_st_outofmem = 0; enc_pools_alloc(); @@ -838,13 +838,12 @@ void sptlrpc_enc_pool_fini(void) if (page_pools.epp_st_access > 0) { CDEBUG(D_SEC, - "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%lu, out of mem %lu\n", + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, page_pools.epp_st_shrinks, page_pools.epp_st_access, page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, - msecs_to_jiffies(MSEC_PER_SEC), + ktime_to_ms(page_pools.epp_st_max_wait), page_pools.epp_st_outofmem); } } @@ -917,7 +916,7 @@ EXPORT_SYMBOL(bulk_sec_desc_unpack); int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, void *buf, int buflen) { - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; int hashsize; unsigned int bufsize; int i, err; @@ -926,17 +925,17 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, 
LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); LASSERT(buflen >= 4); - hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); - if (IS_ERR(hdesc)) { + req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(req)) { CERROR("Unable to initialize checksum hash %s\n", cfs_crypto_hash_name(cfs_hash_alg_id[alg])); - return PTR_ERR(hdesc); + return PTR_ERR(req); } hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); for (i = 0; i < desc->bd_iov_count; i++) { - cfs_crypto_hash_update_page(hdesc, + cfs_crypto_hash_update_page(req, BD_GET_KIOV(desc, i).kiov_page, BD_GET_KIOV(desc, i).kiov_offset & ~PAGE_MASK, @@ -949,11 +948,11 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, bufsize = sizeof(hashbuf); LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", bufsize, hashsize); - err = cfs_crypto_hash_final(hdesc, hashbuf, &bufsize); + err = cfs_crypto_hash_final(req, hashbuf, &bufsize); memcpy(buf, hashbuf, buflen); } else { bufsize = buflen; - err = cfs_crypto_hash_final(hdesc, buf, &bufsize); + err = cfs_crypto_hash_final(req, buf, &bufsize); } return err; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c index 550abeafceea0..b661ff8696530 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include "ptlrpc_internal.h" diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c index 766b21d10c20c..dc9f38c7036ba 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
*/ #define DEBUG_SUBSYSTEM S_FILTER diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c index f8ec60b1adb01..042a632390cfe 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c @@ -36,7 +36,7 @@ #define DEBUG_SUBSYSTEM S_SEC -#include +#include #include #include @@ -48,7 +48,6 @@ #define SEC_GC_INTERVAL (30 * 60) - static struct mutex sec_gc_mutex; static spinlock_t sec_gc_list_lock; static struct list_head sec_gc_list; @@ -56,10 +55,8 @@ static struct list_head sec_gc_list; static spinlock_t sec_gc_ctx_list_lock; static struct list_head sec_gc_ctx_list; -static struct ptlrpc_thread sec_gc_thread; static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); - void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) { LASSERT(sec->ps_policy->sp_cops->gc_ctx); @@ -98,6 +95,9 @@ void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); } +static void sec_gc_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); + void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) { LASSERT(list_empty(&ctx->cc_gc_chain)); @@ -108,8 +108,7 @@ void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); spin_unlock(&sec_gc_ctx_list_lock); - thread_add_flags(&sec_gc_thread, SVC_SIGNAL); - wake_up(&sec_gc_thread.t_ctl_waitq); + mod_delayed_work(system_wq, &sec_gc_work, 0); } EXPORT_SYMBOL(sptlrpc_gc_add_ctx); @@ -156,68 +155,41 @@ static void sec_do_gc(struct ptlrpc_sec *sec) sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; } -static int sec_gc_main(void *arg) +static void sec_gc_main(struct work_struct *ws) { - struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; - struct l_wait_info lwi; - - unshare_fs_struct(); - - /* Record that the thread is running */ - thread_set_flags(thread, SVC_RUNNING); - wake_up(&thread->t_ctl_waitq); + struct ptlrpc_sec *sec; - while (1) { - struct ptlrpc_sec *sec; - - thread_clear_flags(thread, SVC_SIGNAL); - sec_process_ctx_list(); + sec_process_ctx_list(); again: - /* go through sec list do gc. - * FIXME here we iterate through the whole list each time which - * is not optimal. we perhaps want to use balanced binary tree - * to trace each sec as order of expiry time. - * another issue here is we wakeup as fixed interval instead of - * according to each sec's expiry time */ - mutex_lock(&sec_gc_mutex); - list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { - /* if someone is waiting to be deleted, let it - * proceed as soon as possible. */ - if (atomic_read(&sec_gc_wait_del)) { - CDEBUG(D_SEC, "deletion pending, start over\n"); - mutex_unlock(&sec_gc_mutex); - goto again; - } - - sec_do_gc(sec); + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time + */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. 
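The sec_gc.c hunks above and below replace the dedicated "sptlrpc_gc" kthread with a self-rescheduling delayed work item. As a rough sketch of that pattern only (not part of the patch; gc_tick, gc_work and GC_PERIOD_SECS are made-up names, and only the standard workqueue API is used):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define GC_PERIOD_SECS 30			/* stand-in for SEC_GC_INTERVAL */

static void gc_tick(struct work_struct *ws);
static DECLARE_DELAYED_WORK(gc_work, gc_tick);

static void gc_tick(struct work_struct *ws)
{
	/* ... scan the list and expire stale entries here ... */

	/* re-arm; this replaces the old "sleep in a loop" kthread body */
	schedule_delayed_work(&gc_work, GC_PERIOD_SECS * HZ);
}

static int __init gc_init(void)
{
	/* first run after one full period, as in sptlrpc_gc_init() */
	schedule_delayed_work(&gc_work, GC_PERIOD_SECS * HZ);
	return 0;
}

static void __exit gc_exit(void)
{
	/* safe even though gc_tick re-queues itself */
	cancel_delayed_work_sync(&gc_work);
}

module_init(gc_init);
module_exit(gc_exit);
MODULE_LICENSE("GPL");

A caller that wants an immediate pass, as sptlrpc_gc_add_ctx() does above, can use mod_delayed_work(system_wq, &gc_work, 0) to pull the next run forward to now.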
+ */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; } - mutex_unlock(&sec_gc_mutex); - - /* check ctx list again before sleep */ - sec_process_ctx_list(); - - lwi = LWI_TIMEOUT(msecs_to_jiffies(SEC_GC_INTERVAL * - MSEC_PER_SEC), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopping(thread) || - thread_is_signal(thread), - &lwi); - if (thread_test_and_clear_flags(thread, SVC_STOPPING)) - break; + sec_do_gc(sec); } + mutex_unlock(&sec_gc_mutex); - thread_set_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - return 0; + /* check ctx list again before sleep */ + sec_process_ctx_list(); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); } int sptlrpc_gc_init(void) { - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - mutex_init(&sec_gc_mutex); spin_lock_init(&sec_gc_list_lock); spin_lock_init(&sec_gc_ctx_list_lock); @@ -225,28 +197,11 @@ int sptlrpc_gc_init(void) INIT_LIST_HEAD(&sec_gc_list); INIT_LIST_HEAD(&sec_gc_ctx_list); - /* initialize thread control */ - memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); - init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); - - task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); - if (IS_ERR(task)) { - CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); - return PTR_ERR(task); - } - - l_wait_event(sec_gc_thread.t_ctl_waitq, - thread_is_running(&sec_gc_thread), &lwi); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); return 0; } void sptlrpc_gc_fini(void) { - struct l_wait_info lwi = { 0 }; - - thread_set_flags(&sec_gc_thread, SVC_STOPPING); - wake_up(&sec_gc_thread.t_ctl_waitq); - - l_wait_event(sec_gc_thread.t_ctl_waitq, - thread_is_stopped(&sec_gc_thread), &lwi); + cancel_delayed_work_sync(&sec_gc_work); } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c index 96acb183270e4..4f8efe44aa678 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -110,7 +110,8 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) out: return 0; } -LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_info_lprocfs); static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) { @@ -136,11 +137,81 @@ static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) out: return 0; } -LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +static ssize_t +ldebugfs_sptlrpc_sepol_seq_write(struct file *file, const char __user *buffer, + size_t count, void *data) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data *param; + int size = sizeof(*param); + int rc = 0; + + if (count < size) { + CERROR("%s: invalid data count = %lu, size = %d\n", + dev->obd_name, (unsigned long) count, size); + return -EINVAL; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + CERROR("%s: bad sepol data\n", dev->obd_name); + GOTO(out, rc = -EFAULT); + } + + if (param->sdd_magic != SEPOL_DOWNCALL_MAGIC) { + CERROR("%s: sepol downcall bad params\n", + dev->obd_name); + GOTO(out, rc = -EINVAL); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= 
sizeof(imp->imp_sec->ps_sepol)) { + CERROR("%s: invalid sepol data returned\n", + dev->obd_name); + GOTO(out, rc = -EINVAL); + } + rc = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data, + sdd_sepol[rc]); + + /* alloc again with real size */ + rc = 0; + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + CERROR("%s: bad sepol data\n", dev->obd_name); + GOTO(out, rc = -EFAULT); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? rc : count; +} +LDEBUGFS_FOPS_WR_ONLY(srpc, sptlrpc_sepol); int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) { - int rc; + int rc; if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && @@ -152,23 +223,31 @@ int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) return -EINVAL; } - rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, - &sptlrpc_info_lprocfs_fops, dev); - if (rc) { - CERROR("create proc entry srpc_info for %s: %d\n", - dev->obd_name, rc); - return rc; - } + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } - rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, - &sptlrpc_ctxs_lprocfs_fops, dev); - if (rc) { - CERROR("create proc entry srpc_contexts for %s: %d\n", - dev->obd_name, rc); - return rc; - } + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_contexts", + 0444, &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } - return 0; + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_sepol", + 0200, &srpc_sptlrpc_sepol_fops, dev); + if (rc) { + CERROR("create proc entry srpc_sepol for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; } EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c index 52af519a291d7..a17a4e182233e 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. 
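The srpc_sepol write handler added above copies a fixed-size header first to learn sdd_sepol_len, frees the buffer, and then copies again with the real, payload-sized length. A generic sketch of that two-pass copy_from_user() idiom, assuming a made-up struct downcall_hdr and helper downcall_write() rather than the Lustre sepol_downcall_data:

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/overflow.h>

struct downcall_hdr {
	__u32 magic;
	__u32 payload_len;
	char payload[];			/* payload_len bytes follow the header */
};

static ssize_t downcall_write(const char __user *buf, size_t count)
{
	struct downcall_hdr *hdr;
	size_t full;
	ssize_t rc = 0;

	if (count < sizeof(*hdr))
		return -EINVAL;

	/* pass 1: header only, to learn the payload length */
	hdr = kmalloc(sizeof(*hdr), GFP_KERNEL);
	if (!hdr)
		return -ENOMEM;
	if (copy_from_user(hdr, buf, sizeof(*hdr))) {
		rc = -EFAULT;
		goto out;
	}

	/* pass 2: reallocate with the real size and copy again */
	full = struct_size(hdr, payload, hdr->payload_len);
	kfree(hdr);
	hdr = kmalloc(full, GFP_KERNEL);
	if (!hdr)
		return -ENOMEM;
	if (count < full) {
		rc = -EINVAL;
		goto out;
	}
	if (copy_from_user(hdr, buf, full)) {
		rc = -EFAULT;
		goto out;
	}

	/* ... validate hdr->magic and consume hdr->payload here ... */
	rc = count;
out:
	kfree(hdr);
	return rc;
}

As in the handler above, whatever is read in pass 1 is only used to size pass 2; the payload itself is not trusted until it has been copied in full.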
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -63,14 +63,7 @@ void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) static inline enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) { - return (msg->lm_secflvr >> 24) & 0xFF; -} - -static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) -{ - /* should never reach here */ - LBUG(); - return 0; + return (msg->lm_secflvr >> 24) & 0xFF; } static @@ -370,11 +363,9 @@ int null_authorize(struct ptlrpc_request *req) } static struct ptlrpc_ctx_ops null_ctx_ops = { - .refresh = null_ctx_refresh, - .sign = null_ctx_sign, - .verify = null_ctx_verify, + .sign = null_ctx_sign, + .verify = null_ctx_verify, }; - static struct ptlrpc_sec_cops null_sec_cops = { .create_sec = null_create_sec, .destroy_sec = null_destroy_sec, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c index a0f192cecf633..dea70d160b54e 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -215,12 +215,12 @@ int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) static int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_repdata; - struct plain_header *phdr; - __u32 cksum; - int swabbed; - ENTRY; + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + bool swabbed; + ENTRY; if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); RETURN(-EPROTO); @@ -723,16 +723,15 @@ static struct ptlrpc_svc_ctx plain_svc_ctx = { .sc_policy = &plain_policy, }; -static -int plain_accept(struct ptlrpc_request *req) +static int plain_accept(struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_reqbuf; - struct plain_header *phdr; - int swabbed; - ENTRY; + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + bool swabbed; - LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == - SPTLRPC_POLICY_PLAIN); + ENTRY; + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c index 6e3172cdeb5a7..6373c36865f3d 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,13 +31,15 @@ */ #define DEBUG_SUBSYSTEM S_RPC + #include #include #include #include #include -#include +#include #include "ptlrpc_internal.h" +#include /* The following are visible and mutable through /sys/module/ptlrpc */ int test_req_buffer_pressure = 0; @@ -139,7 +141,9 @@ ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) for (i = 0; i < svc->srv_nbuf_per_group; i++) { /* NB: another thread might have recycled enough rqbds, we * need to make sure it wouldn't over-allocate, see LU-1212. */ - if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max)) break; rqbd = ptlrpc_alloc_rqbd(svcpt); @@ -479,7 +483,7 @@ static void ptlrpc_at_timer(cfs_timer_cb_arg_t data) svcpt = cfs_from_timer(svcpt, data, scp_at_timer); svcpt->scp_at_check = 1; - svcpt->scp_at_checktime = cfs_time_current(); + svcpt->scp_at_checktime = ktime_get(); wake_up(&svcpt->scp_waitq); } @@ -602,6 +606,7 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, /* rqbd and incoming request queue */ spin_lock_init(&svcpt->scp_lock); + mutex_init(&svcpt->scp_mutex); INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); INIT_LIST_HEAD(&svcpt->scp_req_incoming); @@ -683,7 +688,8 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, */ struct ptlrpc_service * ptlrpc_register_service(struct ptlrpc_service_conf *conf, - struct proc_dir_entry *proc_entry) + struct kset *parent, + struct dentry *debugfs_entry) { struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; struct ptlrpc_service *service; @@ -705,7 +711,13 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, if (cptable == NULL) cptable = cfs_cpt_table; - if (!conf->psc_thr.tc_cpu_affinity) { + if (conf->psc_thr.tc_cpu_bind > 1) { + CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n", + conf->psc_name, conf->psc_thr.tc_cpu_bind); + RETURN(ERR_PTR(-EINVAL)); + } + + if (!cconf->cc_affinity) { ncpts = 1; } else { ncpts = cfs_cpt_number(cptable); @@ -744,6 +756,7 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, service->srv_cptable = cptable; service->srv_cpts = cpts; service->srv_ncpts = ncpts; + service->srv_cpt_bind = conf->psc_thr.tc_cpu_bind; service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) @@ -758,6 +771,9 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, /* buffer configuration */ service->srv_nbuf_per_group = test_req_buffer_pressure ? 1 : conf->psc_buf.bc_nbufs; + /* do not limit max number of rqbds by default */ + service->srv_nrqbds_max = 0; + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD; service->srv_buf_size = conf->psc_buf.bc_buf_size; @@ -776,7 +792,7 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, service->srv_ops = conf->psc_ops; for (i = 0; i < ncpts; i++) { - if (!conf->psc_thr.tc_cpu_affinity) + if (!cconf->cc_affinity) cpt = CFS_CPT_ANY; else cpt = cpts != NULL ? 
cpts[i] : i; @@ -800,8 +816,14 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, list_add(&service->srv_list, &ptlrpc_all_services); mutex_unlock(&ptlrpc_all_services_mutex); - if (proc_entry != NULL) - ptlrpc_lprocfs_register_service(proc_entry, service); + if (parent) { + rc = ptlrpc_sysfs_register_service(parent, service); + if (rc) + GOTO(failed, rc); + } + + if (debugfs_entry != NULL) + ptlrpc_ldebugfs_register_service(debugfs_entry, service); rc = ptlrpc_service_nrs_setup(service); if (rc != 0) @@ -939,8 +961,10 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req) */ LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); if (svcpt->scp_nrqbds_posted >= - svc->srv_nbuf_per_group && - !test_req_buffer_pressure) { + svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) || + test_req_buffer_pressure) { /* like in ptlrpc_free_rqbd() */ svcpt->scp_nrqbds_total--; OBD_FREE_LARGE(rqbd->rqbd_buffer, @@ -977,18 +1001,18 @@ void ptlrpc_request_change_export(struct ptlrpc_request *req, if (req->rq_export != NULL) { LASSERT(!list_empty(&req->rq_exp_list)); /* remove rq_exp_list from last export */ - spin_lock_bh(&req->rq_export->exp_rpc_lock); + spin_lock(&req->rq_export->exp_rpc_lock); list_del_init(&req->rq_exp_list); - spin_unlock_bh(&req->rq_export->exp_rpc_lock); + spin_unlock(&req->rq_export->exp_rpc_lock); /* export has one reference already, so it`s safe to * add req to export queue here and get another * reference for request later */ - spin_lock_bh(&export->exp_rpc_lock); + spin_lock(&export->exp_rpc_lock); if (req->rq_ops != NULL) /* hp request */ list_add(&req->rq_exp_list, &export->exp_hp_rpcs); else list_add(&req->rq_exp_list, &export->exp_reg_rpcs); - spin_unlock_bh(&export->exp_rpc_lock); + spin_unlock(&export->exp_rpc_lock); class_export_rpc_dec(req->rq_export); class_export_put(req->rq_export); @@ -1041,10 +1065,10 @@ static void ptlrpc_server_finish_active_request( * This function is only called when some export receives a message (i.e., * the network is up.) */ -void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) { - struct obd_export *oldest_exp; - time_t oldest_time, new_time; + struct obd_export *oldest_exp; + time64_t oldest_time, new_time; ENTRY; @@ -1057,7 +1081,7 @@ void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) will make it to the top of the list. */ /* Do not pay attention on 1sec or smaller renewals. */ - new_time = cfs_time_current_sec() + extra_delay; + new_time = ktime_get_real_seconds() + extra_delay; if (exp->exp_last_request_time + 1 /*second */ >= new_time) RETURN_EXIT; @@ -1088,33 +1112,35 @@ void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) return; } - /* Note - racing to start/reset the obd_eviction timer is safe */ - if (exp->exp_obd->obd_eviction_timer == 0) { - /* Check if the oldest entry is expired. */ - if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + - extra_delay)) { - /* We need a second timer, in case the net was down and - * it just came back. Since the pinger may skip every - * other PING_INTERVAL (see note in ptlrpc_pinger_main), - * we better wait for 3. 
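The ptlrpc_update_export_timer() hunk that continues below swaps cfs_time_current_sec() for ktime_get_real_seconds() and widens the timestamps to time64_t. A minimal sketch of the same two-stage check, arming a grace timer first and only then waking the evictor, using made-up names (export_state, eviction_check, EVICT_TIMEOUT, PING_PERIOD) in place of the Lustre ones:

#include <linux/ktime.h>
#include <linux/types.h>

#define EVICT_TIMEOUT	90		/* stand-in for PING_EVICT_TIMEOUT */
#define PING_PERIOD	30		/* stand-in for PING_INTERVAL */

struct export_state {
	time64_t last_request;		/* seconds, CLOCK_REALTIME */
	time64_t eviction_timer;	/* 0 means "not armed" */
};

/* Returns true when the caller should wake the evictor thread. */
static bool eviction_check(struct export_state *oldest, time64_t extra_delay)
{
	time64_t now = ktime_get_real_seconds();

	if (oldest->eviction_timer == 0) {
		/* The oldest export went quiet: arm a grace timer instead of
		 * evicting right away, in case the network just came back. */
		if (now > oldest->last_request + EVICT_TIMEOUT + extra_delay)
			oldest->eviction_timer = now + 3 * PING_PERIOD;
		return false;
	}

	/* Grace period has expired too: time to let the evictor run. */
	return now > oldest->eviction_timer + extra_delay;
}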
*/ - exp->exp_obd->obd_eviction_timer = - cfs_time_current_sec() + 3 * PING_INTERVAL; - CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", - exp->exp_obd->obd_name, - obd_export_nid2str(oldest_exp), oldest_time); - } - } else { - if (cfs_time_current_sec() > - (exp->exp_obd->obd_eviction_timer + extra_delay)) { - /* The evictor won't evict anyone who we've heard from - * recently, so we don't have to check before we start - * it. */ - if (!ping_evictor_wake(exp)) - exp->exp_obd->obd_eviction_timer = 0; - } - } + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (ktime_get_real_seconds() > + oldest_time + PING_EVICT_TIMEOUT + extra_delay) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. + */ + exp->exp_obd->obd_eviction_timer = + ktime_get_real_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (ktime_get_real_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. + */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } - EXIT; + EXIT; } /** @@ -1166,7 +1192,7 @@ static int ptlrpc_check_req(struct ptlrpc_request *req) static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) { struct ptlrpc_at_array *array = &svcpt->scp_at_array; - __s32 next; + time64_t next; if (array->paa_count == 0) { del_timer(&svcpt->scp_at_timer); @@ -1174,13 +1200,14 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) } /* Set timer for closest deadline */ - next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - - at_early_margin); + next = array->paa_deadline - ktime_get_real_seconds() - + at_early_margin; if (next <= 0) { ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); } else { - mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); - CDEBUG(D_INFO, "armed %s at %+ds\n", + mod_timer(&svcpt->scp_at_timer, + jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC)); + CDEBUG(D_INFO, "armed %s at %+llds\n", svcpt->scp_service->srv_name, next); } } @@ -1432,16 +1459,16 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) __u32 index, count; time64_t deadline; time64_t now = ktime_get_real_seconds(); - cfs_duration_t delay; - int first, counter = 0; - ENTRY; + s64 delay; + int first, counter = 0; + ENTRY; spin_lock(&svcpt->scp_at_lock); if (svcpt->scp_at_check == 0) { spin_unlock(&svcpt->scp_at_lock); RETURN(0); } - delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); + delay = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime); svcpt->scp_at_check = 0; if (array->paa_count == 0) { @@ -1477,14 +1504,18 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) break; } - ptlrpc_at_remove_timed(rq); /** * ptlrpc_server_drop_request() may drop * refcount to 0 already. 
Let's check this and * don't add entry to work_list */ - if (likely(atomic_inc_not_zero(&rq->rq_refcount))) + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) { + ptlrpc_at_remove_timed(rq); list_add(&rq->rq_timed_list, &work_list); + } else { + ptlrpc_at_remove_timed(rq); + } + counter++; } @@ -1505,7 +1536,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) LCONSOLE_WARN("%s: This server is not able to keep up with " "request traffic (cpu-bound).\n", svcpt->scp_service->srv_name); - CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n", + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%lld\n", counter, svcpt->scp_nreqs_incoming, svcpt->scp_nreqs_active, at_get(&svcpt->scp_at_estimate), delay); @@ -1529,18 +1560,14 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) /* Check if we are already handling earlier incarnation of this request. * Called under &req->rq_export->exp_rpc_lock locked */ -static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +static struct ptlrpc_request* +ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) { struct ptlrpc_request *tmp = NULL; if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) || (atomic_read(&req->rq_export->exp_rpc_count) == 0)) - return 0; - - /* bulk request are aborted upon reconnect, don't try to - * find a match */ - if (req->rq_bulk_write || req->rq_bulk_read) - return 0; + return NULL; /* This list should not be longer than max_requests in * flights on the client, so it is not all that long. @@ -1558,12 +1585,12 @@ static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) if (tmp->rq_xid == req->rq_xid) goto found; } - return 0; + return NULL; found: DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); DEBUG_REQ(D_HA, tmp, "Request being processed"); - return -EBUSY; + return tmp; } /** @@ -1617,9 +1644,9 @@ static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) if (req->rq_ops && req->rq_ops->hpreq_fini) req->rq_ops->hpreq_fini(req); - spin_lock_bh(&req->rq_export->exp_rpc_lock); + spin_lock(&req->rq_export->exp_rpc_lock); list_del_init(&req->rq_exp_list); - spin_unlock_bh(&req->rq_export->exp_rpc_lock); + spin_unlock(&req->rq_export->exp_rpc_lock); } EXIT; } @@ -1653,6 +1680,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, { int rc; bool hp; + struct ptlrpc_request *orig; ENTRY; rc = ptlrpc_server_hpreq_init(svcpt, req); @@ -1662,18 +1690,43 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, hp = rc > 0; ptlrpc_nrs_req_initialize(svcpt, req, hp); - if (req->rq_export != NULL) { + while (req->rq_export != NULL) { struct obd_export *exp = req->rq_export; /* do search for duplicated xid and the adding to the list * atomically */ spin_lock_bh(&exp->exp_rpc_lock); - rc = ptlrpc_server_check_resend_in_progress(req); - if (rc < 0) { + orig = ptlrpc_server_check_resend_in_progress(req); + if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) { + spin_unlock_bh(&exp->exp_rpc_lock); + + OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + msleep(4 * MSEC_PER_SEC); + continue; + } + if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) { + bool linked; + spin_unlock_bh(&exp->exp_rpc_lock); + /* + * When the client resend request and the server has + * the previous copy of it, we need to update deadlines, + * to be sure that the client and the server have equal + * request deadlines. 
+ */ + + spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + linked = orig->rq_at_linked; + if (likely(linked)) + ptlrpc_at_remove_timed(orig); + spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + orig->rq_deadline = req->rq_deadline; + if (likely(linked)) + ptlrpc_at_add_timed(orig); + ptlrpc_server_drop_request(orig); ptlrpc_nrs_req_finalize(req); - RETURN(rc); + RETURN(-EBUSY); } if (hp || req->rq_ops != NULL) @@ -1681,6 +1734,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, else list_add(&req->rq_exp_list, &exp->exp_reg_rpcs); spin_unlock_bh(&exp->exp_rpc_lock); + break; } /* the current thread is not the processing thread for this request @@ -2064,7 +2118,8 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, if (unlikely(ptlrpc_check_req(request))) goto put_conn; ptlrpc_update_export_timer(request->rq_export, - timediff_usecs >> 19); + div_u64(timediff_usecs, + USEC_PER_SEC / 2)); } /* Discard requests queued for longer than the deadline. @@ -2151,7 +2206,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, DEBUG_REQ(D_ADAPTTO, request, "sent %d early replies before finishing in %llds", request->rq_early_count, - arrived_usecs / USEC_PER_SEC); + div_u64(arrived_usecs, USEC_PER_SEC)); } ptlrpc_server_finish_active_request(svcpt, request); @@ -2239,7 +2294,7 @@ ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) while (nlocks-- > 0) { lock = ack_locks[nlocks]; - ldlm_lock_downgrade(lock, LCK_COS); + ldlm_lock_mode_downgrade(lock, LCK_COS); LDLM_LOCK_PUT(lock); } RETURN(0); @@ -2453,40 +2508,39 @@ static int ptlrpc_main(void *arg) thread->t_pid = current_pid(); unshare_fs_struct(); - /* NB: we will call cfs_cpt_bind() for all threads, because we - * might want to run lustre server only on a subset of system CPUs, - * in that case ->scp_cpt is CFS_CPT_ANY */ - rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); - if (rc != 0) { - CWARN("%s: failed to bind %s on CPT %d\n", - svc->srv_name, thread->t_name, svcpt->scp_cpt); + if (svc->srv_cpt_bind) { + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } } ginfo = groups_alloc(0); - if (!ginfo) { - rc = -ENOMEM; - goto out; - } + if (!ginfo) + GOTO(out, rc = -ENOMEM); set_current_groups(ginfo); put_group_info(ginfo); if (svc->srv_ops.so_thr_init != NULL) { rc = svc->srv_ops.so_thr_init(thread); - if (rc) - goto out; - } - OBD_ALLOC_PTR(env); - if (env == NULL) { - rc = -ENOMEM; - goto out_srv_fini; - } + if (rc) + GOTO(out, rc); + } - rc = lu_context_init(&env->le_ctx, - svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); - if (rc) - goto out_srv_fini; + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_srv_fini, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + GOTO(out_env_remove, rc); thread->t_env = env; env->le_ctx.lc_thread = thread; @@ -2499,15 +2553,13 @@ static int ptlrpc_main(void *arg) CERROR("Failed to post rqbd for %s on CPT %d: %d\n", svc->srv_name, svcpt->scp_cpt, rc); - goto out_srv_fini; + GOTO(out_ctx_fini, rc); } /* Alloc reply state structure for this one */ OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); - if (!rs) { - rc = -ENOMEM; - goto out_srv_fini; - } + if (!rs) + GOTO(out_ctx_fini, rc = -ENOMEM); spin_lock(&svcpt->scp_lock); @@ -2553,6 +2605,9 @@ static int ptlrpc_main(void *arg) /* reset le_ses to initial state */ env->le_ses = NULL; 
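Two conversions in the hunks above replace hand-rolled 64-bit math: "timediff_usecs >> 19" becomes div_u64(timediff_usecs, USEC_PER_SEC / 2), and a plain 64-bit '/' becomes div_u64(arrived_usecs, USEC_PER_SEC), because an open-coded 64-bit division will not link on 32-bit kernels. A tiny sketch of the idiom with a made-up helper name (report_elapsed):

#include <linux/math64.h>
#include <linux/printk.h>
#include <linux/time64.h>
#include <linux/types.h>

static void report_elapsed(u64 elapsed_usecs)
{
	/* ">> 19" only approximated a divide by 524288 (~half a second in
	 * microseconds); div_u64() is exact and safe on 32-bit builds. */
	u64 half_secs = div_u64(elapsed_usecs, USEC_PER_SEC / 2);
	u64 secs = div_u64(elapsed_usecs, USEC_PER_SEC);

	pr_debug("elapsed: %llu half-seconds (%llu s)\n", half_secs, secs);
}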
+ /* Refill the context before execution to make sure + * all thread keys are allocated */ + lu_env_refill(env); /* Process all incoming reqs before handling any */ if (ptlrpc_server_request_incoming(svcpt)) { lu_context_enter(&env->le_ctx); @@ -2588,17 +2643,18 @@ static int ptlrpc_main(void *arg) lc_watchdog_delete(thread->t_watchdog); thread->t_watchdog = NULL; +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); out_srv_fini: /* * deconstruct service specific state created by ptlrpc_start_thread() */ if (svc->srv_ops.so_thr_done != NULL) svc->srv_ops.so_thr_done(thread); - - if (env != NULL) { - lu_context_fini(&env->le_ctx); - OBD_FREE_PTR(env); - } out: CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", thread, thread->t_pid, thread->t_id, rc); @@ -2644,8 +2700,13 @@ static int ptlrpc_hr_main(void *arg) struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; struct list_head replies; + struct lu_env *env; int rc; + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&replies); unshare_fs_struct(); @@ -2659,6 +2720,15 @@ static int ptlrpc_hr_main(void *arg) threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); } + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD | + LCT_REMEMBER | LCT_NOREF); + if (rc) + GOTO(out_env, rc); + + rc = lu_env_add(env); + if (rc) + GOTO(out_ctx_fini, rc); + atomic_inc(&hrp->hrp_nstarted); wake_up(&ptlrpc_hr.hr_waitq); @@ -2672,13 +2742,22 @@ static int ptlrpc_hr_main(void *arg) struct ptlrpc_reply_state, rs_list); list_del_init(&rs->rs_list); + /* refill keys if needed */ + lu_env_refill(env); + lu_context_enter(&env->le_ctx); ptlrpc_handle_rs(rs); + lu_context_exit(&env->le_ctx); } } atomic_inc(&hrp->hrp_nstopped); wake_up(&ptlrpc_hr.hr_waitq); + lu_env_remove(env); +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env: + OBD_FREE_PTR(env); return 0; } @@ -3243,6 +3322,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) ptlrpc_service_nrs_cleanup(service); ptlrpc_lprocfs_unregister_service(service); + ptlrpc_sysfs_unregister_service(service); ptlrpc_service_free(service); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c index 3a9daf899c26e..7f9fb09ee4ffd 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,5 +40,7 @@ #include #include #include -#include #include +#include +#include + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c index 94828872d70ac..c0c7b0c5f1f05 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,15 +40,16 @@ #include #include #include -#include #include +#include +#include + + void lustre_assert_wire_constants(void) { - /* Wire protocol assertions generated by 'wirecheck' - * (make -C lustre/utils newwiretest) - * running on Linux centss05 2.6.32.431.29.2.el6_lustre #1 SMP Tue Sep 23 16:06:38 CDT 2014 x - * with gcc version 4.4.7 20120313 (Red Hat 4.4.7-4) (GCC) */ - + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + */ /* Constants... */ LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", @@ -174,7 +175,9 @@ void lustre_assert_wire_constants(void) (long long)MDS_HSM_CT_UNREGISTER); LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", (long long)MDS_SWAP_LAYOUTS); - LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", + LASSERTF(MDS_RMFID == 62, "found %lld\n", + (long long)MDS_RMFID); + LASSERTF(MDS_LAST_OPC == 63, "found %lld\n", (long long)MDS_LAST_OPC); LASSERTF(REINT_SETATTR == 1, "found %lld\n", (long long)REINT_SETATTR); @@ -194,7 +197,7 @@ void lustre_assert_wire_constants(void) (long long)REINT_RMENTRY); LASSERTF(REINT_MIGRATE == 9, "found %lld\n", (long long)REINT_MIGRATE); - LASSERTF(REINT_MAX == 10, "found %lld\n", + LASSERTF(REINT_MAX == 11, "found %lld\n", (long long)REINT_MAX); LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)DISP_IT_EXECD); @@ -252,9 +255,14 @@ void lustre_assert_wire_constants(void) (long long)MDS_ATTR_FROM_OPEN); LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", (long long)MDS_ATTR_BLOCKS); - LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", (long long)MDS_ATTR_PROJID); + LASSERTF(MDS_ATTR_LSIZE == 0x0000000000020000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LSIZE); + LASSERTF(MDS_ATTR_LBLOCKS == 0x0000000000040000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LBLOCKS); + LASSERTF(MDS_ATTR_OVERRIDE == 0x0000000002000000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_OVERRIDE); LASSERTF(FLD_QUERY == 900, "found %lld\n", (long long)FLD_QUERY); LASSERTF(FLD_READ == 901, "found %lld\n", @@ -339,10 +347,6 @@ void lustre_assert_wire_constants(void) CLASSERT(LQUOTA_RES_DT == 2); LASSERTF(OBD_PING == 400, "found %lld\n", (long long)OBD_PING); - LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", - (long long)OBD_LOG_CANCEL); - LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", - (long long)OBD_QC_CALLBACK); LASSERTF(OBD_IDX_READ == 403, "found %lld\n", (long long)OBD_IDX_READ); LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", @@ -365,6 +369,8 @@ void lustre_assert_wire_constants(void) (long long)MGS_TARGET_DEL); LASSERTF(MGS_SET_INFO == 255, "found %lld\n", (long long)MGS_SET_INFO); + LASSERTF(MGS_CONFIG_READ == 256, "found %lld\n", + (long long)MGS_CONFIG_READ); LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", (long long)MGS_LAST_OPC); LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", @@ -500,6 +506,30 @@ void lustre_assert_wire_constants(void) (long long)OUT_PUNCH); LASSERTF(OUT_READ == 15, "found %lld\n", (long long)OUT_READ); + LASSERTF(OUT_NOOP == 16, "found %lld\n", + (long long)OUT_NOOP); + LASSERTF(OUT_XATTR_LIST == 17, "found %lld\n", + (long long)OUT_XATTR_LIST); + + /* Checks for struct lustre_som_attrs */ + LASSERTF((int)sizeof(struct lustre_som_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_som_attrs)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_valid)); + 
LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_reserved) == 2, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_reserved)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_size)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_size)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_blocks) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_blocks)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks)); /* Checks for struct hsm_attrs */ LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", @@ -656,6 +686,78 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", (long long)(int)sizeof(union lu_page)); + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long 
long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + + /* Checks for struct ladvise_hdr */ + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + LASSERTF(LF_ASYNC == 1, "found %lld\n", + (long long)LF_ASYNC); + LASSERTF(LADVISE_MAGIC == 450829536, "found %lld\n", + (long long)LADVISE_MAGIC); + /* Checks for struct lustre_handle */ LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", (long long)(int)sizeof(struct lustre_handle)); @@ -703,10 +805,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); - LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2); - LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2_SWABBED); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0bd00bd3UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xd30bd00bUL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2_SWABBED); /* Checks for struct ptlrpc_body */ LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", @@ -921,42 +1023,30 @@ void lustre_assert_wire_constants(void) (long long)DLM_REPLY_REC_OFF); LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", (long 
long)MSG_PTLRPC_HEADER_OFF); - LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", - PTLRPC_MSG_VERSION); - LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", - LUSTRE_VERSION_MASK); - LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", - LUSTRE_OBD_VERSION); - LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", - LUSTRE_MDS_VERSION); - LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", - LUSTRE_OST_VERSION); - LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", - LUSTRE_DLM_VERSION); - LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", - LUSTRE_LOG_VERSION); - LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", - LUSTRE_MGS_VERSION); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MGS_VERSION); LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", (long long)MSGHDR_AT_SUPPORT); LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", (long long)MSGHDR_CKSUM_INCOMPAT18); - LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", - (unsigned)MSG_OP_FLAG_MASK); - LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", - (long long)MSG_OP_FLAG_SHIFT); - LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", - (unsigned)MSG_GEN_FLAG_MASK); - LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)MSG_LAST_REPLAY); LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)MSG_RESENT); LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)MSG_REPLAY); - LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)MSG_DELAY_REPLAY); - LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)MSG_VERSION_REPLAY); LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", (unsigned)MSG_REQ_REPLAY_DONE); LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", @@ -971,8 +1061,6 @@ void lustre_assert_wire_constants(void) (unsigned)MSG_CONNECT_LIBCLIENT); LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", (unsigned)MSG_CONNECT_INITIAL); - LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_ASYNC); LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", (unsigned)MSG_CONNECT_NEXT_VER); LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", @@ -1229,8 +1317,8 @@ void lustre_assert_wire_constants(void) OBD_CONNECT_DIR_STRIPE); LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", OBD_CONNECT_SUBTREE); - LASSERTF(OBD_CONNECT_LOCK_AHEAD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LOCK_AHEAD); + LASSERTF(OBD_CONNECT_LOCKAHEAD_OLD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOCKAHEAD_OLD); LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", 
OBD_CONNECT_BULK_MBITS); LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", @@ -1239,12 +1327,48 @@ void lustre_assert_wire_constants(void) OBD_CONNECT_FLAGS2); LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", OBD_CONNECT2_FILE_SECCTX); + LASSERTF(OBD_CONNECT2_LOCKAHEAD == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCKAHEAD); + LASSERTF(OBD_CONNECT2_DIR_MIGRATE == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DIR_MIGRATE); + LASSERTF(OBD_CONNECT2_FLR == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FLR); + LASSERTF(OBD_CONNECT2_WBC_INTENTS == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_WBC_INTENTS); + LASSERTF(OBD_CONNECT2_LOCK_CONVERT == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCK_CONVERT); + LASSERTF(OBD_CONNECT2_ARCHIVE_ID_ARRAY == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ARCHIVE_ID_ARRAY); + LASSERTF(OBD_CONNECT2_SELINUX_POLICY == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SELINUX_POLICY); + LASSERTF(OBD_CONNECT2_LSOM == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSOM); + LASSERTF(OBD_CONNECT2_ASYNC_DISCARD == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ASYNC_DISCARD); + LASSERTF(OBD_CONNECT2_ENCRYPT == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT); + LASSERTF(OBD_CONNECT2_FIDMAP== 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FIDMAP); + LASSERTF(OBD_CONNECT2_GETATTR_PFID== 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_GETATTR_PFID); LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_CRC32); LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_ADLER); LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_CRC32C); + LASSERTF(OBD_CKSUM_RESERVED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_RESERVED); + LASSERTF(OBD_CKSUM_T10IP512 == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP512); + LASSERTF(OBD_CKSUM_T10IP4K == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP4K); + LASSERTF(OBD_CKSUM_T10CRC512 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC512); + LASSERTF(OBD_CKSUM_T10CRC4K == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC4K); + LASSERTF(OBD_CKSUM_T10_TOP == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10_TOP); /* Checks for struct ost_layout */ LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", @@ -1361,10 +1485,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obdo, o_layout)); LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_layout)); - LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_padding_3)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_padding_3)); + LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout_version)); LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", (long long)(int)offsetof(struct obdo, o_uid_h)); LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", @@ -1419,8 +1543,8 @@ void lustre_assert_wire_constants(void) OBD_MD_FLFLAGS); LASSERTF(OBD_MD_FLNLINK == 
(0x00002000ULL), "found 0x%.16llxULL\n", OBD_MD_FLNLINK); - LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGENER); + LASSERTF(OBD_MD_FLPARENT == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPARENT); LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", OBD_MD_FLRDEV); LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", @@ -1431,14 +1555,10 @@ void lustre_assert_wire_constants(void) OBD_MD_FLHANDLE); LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", OBD_MD_FLCKSUM); - LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLQOS); LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLGROUP); LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLFID); - LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLEPOCH); LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLGRANT); LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", @@ -1451,8 +1571,6 @@ void lustre_assert_wire_constants(void) OBD_MD_FLMODEASIZE); LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", OBD_MD_MDS); - LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", - OBD_MD_REINT); LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", OBD_MD_MEA); LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n", @@ -1465,12 +1583,6 @@ void lustre_assert_wire_constants(void) OBD_MD_FLXATTRRM); LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLACL); - LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLMDSCAPA); - LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLOSSCAPA); - LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLCKSPLIT); LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLCROSSREF); LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", @@ -1483,7 +1595,6 @@ void lustre_assert_wire_constants(void) OBD_MD_DEFAULT_MEA); LASSERTF(OBD_MD_FLOSTLAYOUT == (0x0080000000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLOSTLAYOUT); - LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLPROJID); CLASSERT(OBD_FL_INLINEDATA == 0x00000001); @@ -1500,7 +1611,10 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); - CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); + CLASSERT(OBD_FL_CKSUM_T10IP512 == 0x00005000); + CLASSERT(OBD_FL_CKSUM_T10IP4K == 0x00006000); + CLASSERT(OBD_FL_CKSUM_T10CRC512 == 0x00007000); + CLASSERT(OBD_FL_CKSUM_T10CRC4K == 0x00008000); CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); CLASSERT(OBD_FL_MMAP == 0x00040000); @@ -1599,8 +1713,8 @@ void lustre_assert_wire_constants(void) (unsigned)LOV_PATTERN_RAID0); LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)LOV_PATTERN_RAID1); - LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_FIRST); + LASSERTF(LOV_PATTERN_MDT == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_MDT); LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", (unsigned)LOV_PATTERN_CMOBD); @@ -1627,12 +1741,22 @@ void lustre_assert_wire_constants(void) 
(long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size)); LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size)); - LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding) == 32, "found %lld\n", - (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding)); - LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1)); LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n", (unsigned)LCME_FL_INIT); + LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n", + (unsigned)LCME_FL_NEG); /* Checks for struct lov_comp_md_v1 */ LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n", @@ -1657,9 +1781,13 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count)); LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count)); - LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n", + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n", (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1)); - LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n", + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1)); LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n", (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2)); @@ -1670,6 +1798,14 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0])); CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0)); + LASSERTF(LCM_FL_NONE == 0, "found %lld\n", + (long long)LCM_FL_NONE); + LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n", + (long long)LCM_FL_RDONLY); + 
LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n", + (long long)LCM_FL_WRITE_PENDING); + LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n", + (long long)LCM_FL_SYNC_PENDING); /* Checks for struct lmv_mds_md_v1 */ LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", @@ -1694,13 +1830,17 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding1) == 20, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding1)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 24, "found %lld\n", + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 28, "found %lld\n", (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 8, "found %lld\n", + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 4, "found %lld\n", (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); @@ -1741,6 +1881,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_bavail)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_files)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_files)); LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_ffree)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", @@ -1757,6 +1901,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_namelen)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_maxbytes)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes)); LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", (long 
long)(int)offsetof(struct obd_statfs, os_state)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", @@ -1765,10 +1913,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare2)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); + LASSERTF((int)offsetof(struct obd_statfs, os_granted) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_granted)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_granted) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_granted)); LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_spare3)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", @@ -1797,6 +1945,20 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_spare9)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + LASSERTF(OS_STATE_DEGRADED == 0x1, "found %lld\n", + (long long)OS_STATE_DEGRADED); + LASSERTF(OS_STATE_READONLY == 0x2, "found %lld\n", + (long long)OS_STATE_READONLY); + LASSERTF(OS_STATE_NOPRECREATE == 0x4, "found %lld\n", + (long long)OS_STATE_NOPRECREATE); + LASSERTF(OS_STATE_ENOSPC == 0x20, "found %lld\n", + (long long)OS_STATE_ENOSPC); + LASSERTF(OS_STATE_ENOINO == 0x40, "found %lld\n", + (long long)OS_STATE_ENOINO); + LASSERTF(OS_STATE_SUM == 0x100, "found %lld\n", + (long long)OS_STATE_SUM); + LASSERTF(OS_STATE_NONROT == 0x200, "found %lld\n", + (long long)OS_STATE_NONROT); /* Checks for struct obd_ioobj */ LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", @@ -2127,6 +2289,33 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + LASSERTF(MDS_CROSS_REF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MDS_CROSS_REF); + LASSERTF(MDS_PERM_BYPASS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MDS_PERM_BYPASS); + LASSERTF(MDS_QUOTA_IGNORE == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MDS_QUOTA_IGNORE); + LASSERTF(MDS_KEEP_ORPHAN == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MDS_KEEP_ORPHAN); + LASSERTF(MDS_RECOV_OPEN == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MDS_RECOV_OPEN); + LASSERTF(MDS_DATA_MODIFIED == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)MDS_DATA_MODIFIED); + LASSERTF(MDS_CREATE_VOLATILE == 0x00000400UL, "found 0x%.8xUL\n", + (unsigned)MDS_CREATE_VOLATILE); + LASSERTF(MDS_OWNEROVERRIDE == 0x00000800UL, "found 0x%.8xUL\n", + (unsigned)MDS_OWNEROVERRIDE); + LASSERTF(MDS_HSM_RELEASE == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)MDS_HSM_RELEASE); + LASSERTF(MDS_CLOSE_LAYOUT_SWAP == 0x00004000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SWAP); + LASSERTF(MDS_CLOSE_LAYOUT_MERGE == 0x00008000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_MERGE); + LASSERTF(MDS_CLOSE_RESYNC_DONE == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_RESYNC_DONE); + LASSERTF(MDS_CLOSE_LAYOUT_SPLIT == 
0x00020000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SPLIT); + /* Checks for struct mdt_body */ LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", (long long)(int)sizeof(struct mdt_body)); @@ -2138,10 +2327,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_fid2)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); - LASSERTF((int)offsetof(struct mdt_body, mbo_handle) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_handle)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_open_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_open_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_open_handle)); LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_valid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", @@ -2166,6 +2355,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_blocks)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_version) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_version)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_version)); LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_t_state)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", @@ -2206,10 +2399,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_nlink)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); - LASSERTF((int)offsetof(struct mdt_body, mbo_unused2) == 140, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_unused2)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_layout_gen) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_layout_gen)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_layout_gen)); LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", @@ -2242,14 +2435,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_projid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_projid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_6) == 176, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_6)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_6) == 8, "found %lld\n", - (long 
long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_6)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_7) == 184, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_7)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_7) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_7)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_size) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_blocks) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)); LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", @@ -2268,8 +2461,6 @@ void lustre_assert_wire_constants(void) MDS_FMODE_EXEC); LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", MDS_OPEN_CREATED); - LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", - MDS_OPEN_CROSS); LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", MDS_OPEN_CREAT); LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", @@ -2332,14 +2523,20 @@ void lustre_assert_wire_constants(void) MDS_INODELOCK_OPEN); LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", MDS_INODELOCK_LAYOUT); + LASSERTF(MDS_INODELOCK_PERM == 0x000010, "found 0x%.8x\n", + MDS_INODELOCK_PERM); + LASSERTF(MDS_INODELOCK_XATTR == 0x000020, "found 0x%.8x\n", + MDS_INODELOCK_XATTR); + LASSERTF(MDS_INODELOCK_DOM == 0x000040, "found 0x%.8x\n", + MDS_INODELOCK_DOM); /* Checks for struct mdt_ioepoch */ LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", (long long)(int)sizeof(struct mdt_ioepoch)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_handle) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_handle)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_open_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_open_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle)); LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", @@ -2508,10 +2705,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); + LASSERTF((int)offsetof(struct 
mdt_rec_create, cr_open_handle_old) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_open_handle_old)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old)); LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", (long long)(int)offsetof(struct mdt_rec_create, cr_time)); LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", @@ -2945,6 +3142,102 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + /* Checks for struct mdt_rec_resync */ + LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_resync)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n", + (long 
long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, 
rs_padding8)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 134, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9)); + /* Checks for struct mdt_rec_reint */ LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", (long long)(int)sizeof(struct mdt_rec_reint)); @@ -3036,9 +3329,13 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 134, "found %lld\n", (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 2, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); /* Checks for struct lmv_desc */ @@ -3164,12 +3461,16 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); /* Checks for struct ldlm_inodebits */ - LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", + LASSERTF((int)sizeof(struct ldlm_inodebits) == 16, "found %lld\n", (long long)(int)sizeof(struct ldlm_inodebits)); LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", (long long)(int)offsetof(struct ldlm_inodebits, bits)); LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, try_bits) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, try_bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->try_bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->try_bits)); /* Checks for struct ldlm_flock_wire */ LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", @@ -3212,24 +3513,14 @@ void lustre_assert_wire_constants(void) (long long)IT_GETATTR); LASSERTF(IT_LOOKUP == 16, "found %lld\n", (long long)IT_LOOKUP); - LASSERTF(IT_UNLINK == 32, "found %lld\n", - (long long)IT_UNLINK); - LASSERTF(IT_TRUNC == 64, "found %lld\n", - (long long)IT_TRUNC); LASSERTF(IT_GETXATTR == 128, "found %lld\n", (long long)IT_GETXATTR); - LASSERTF(IT_EXEC == 256, "found %lld\n", - (long long)IT_EXEC); - LASSERTF(IT_PIN == 512, "found %lld\n", - (long long)IT_PIN); 
LASSERTF(IT_LAYOUT == 1024, "found %lld\n", (long long)IT_LAYOUT); LASSERTF(IT_QUOTA_DQACQ == 2048, "found %lld\n", (long long)IT_QUOTA_DQACQ); LASSERTF(IT_QUOTA_CONN == 4096, "found %lld\n", (long long)IT_QUOTA_CONN); - LASSERTF(IT_SETXATTR == 8192, "found %lld\n", - (long long)IT_SETXATTR); /* Checks for struct ldlm_resource_desc */ LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", @@ -3702,14 +3993,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n", (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid)); LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid)); - LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_tail) == 80, "found %lld\n", - (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_tail)); - LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail) == 8, "found %lld\n", - (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail)); /* Checks for struct llog_size_change_rec */ LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", @@ -3838,10 +4129,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); - LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", - (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding)); - LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_time) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_time)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time)); LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", @@ -3949,12 +4240,7 @@ void lustre_assert_wire_constants(void) CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); - CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); - CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); - CLASSERT(LLOG_ORIGIN_CONNECT == 506); - CLASSERT(LLOG_CATINFO == 507); CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); - CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); CLASSERT(LLOG_FIRST_OPC == 501); CLASSERT(LLOG_LAST_OPC == 510); CLASSERT(LLOG_CONFIG_ORIG_CTXT 
== 0); @@ -4426,14 +4712,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct layout_intent, li_flags)); LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); - LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_start)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_start)); - LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_end)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_end)); + LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_extent)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_extent)); LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", (long long)LAYOUT_INTENT_ACCESS); LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", @@ -5089,12 +5371,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lfsck_request, lr_padding_3)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n", (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3)); +#ifdef HAVE_SERVER_SUPPORT LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_SCRUB); LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_LAYOUT); LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_NAMESPACE); +#endif LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n", (long long)LE_LASTID_REBUILDING); LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n", @@ -5157,7 +5441,7 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct update_params *)0)->up_params)); /* Checks for struct update_op */ - LASSERTF((int)sizeof(struct update_op) == 24, "found %lld\n", + LASSERTF((int)sizeof(struct update_op) == 20, "found %lld\n", (long long)(int)sizeof(struct update_op)); LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n", (long long)(int)offsetof(struct update_op, uop_fid)); @@ -5226,75 +5510,145 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n", (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec)); - /* Checks for struct lu_ladvise */ - LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", - (long long)(int)sizeof(struct lu_ladvise)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_advice)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value1)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, 
lla_value2)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_start)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_end)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value3)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value4)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); - LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", - (long long)LU_LADVISE_WILLREAD); - LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", - (long long)LU_LADVISE_DONTNEED); - - /* Checks for struct ladvise_hdr */ - LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n", - LADVISE_MAGIC); - LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", - (long long)(int)sizeof(struct ladvise_hdr)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_count)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", - (long long)(int)offsetof(struct 
ladvise_hdr, lah_advise)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); - LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)LF_ASYNC); + /* Checks for struct lustre_cfg */ + LASSERTF((int)sizeof(struct lustre_cfg) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_cfg)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_version)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_version)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_command) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_command)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_command) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_command)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_num) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_num)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_num) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_num)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_flags)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_flags)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nid) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nid)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nid)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nal) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nal)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nal) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nal)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_bufcount) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_bufcount)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0])); + LASSERTF(LCFG_ATTACH == 0x000cf001UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ATTACH); + LASSERTF(LCFG_DETACH == 0x000cf002UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DETACH); + LASSERTF(LCFG_SETUP == 0x000cf003UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SETUP); + LASSERTF(LCFG_CLEANUP == 0x000cf004UL, "found 0x%.8xUL\n", + (unsigned)LCFG_CLEANUP); + LASSERTF(LCFG_ADD_UUID == 0x000cf005UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_UUID); + LASSERTF(LCFG_DEL_UUID == 0x000cf006UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_UUID); + LASSERTF(LCFG_MOUNTOPT == 0x000cf007UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MOUNTOPT); + LASSERTF(LCFG_DEL_MOUNTOPT == 0x000cf008UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MOUNTOPT); + LASSERTF(LCFG_SET_TIMEOUT == 0x000cf009UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_TIMEOUT); + 
LASSERTF(LCFG_SET_UPCALL == 0x000cf00aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_UPCALL); + LASSERTF(LCFG_ADD_CONN == 0x000cf00bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_CONN); + LASSERTF(LCFG_DEL_CONN == 0x000cf00cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_CONN); + LASSERTF(LCFG_LOV_ADD_OBD == 0x000cf00dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_OBD); + LASSERTF(LCFG_LOV_DEL_OBD == 0x000cf00eUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_DEL_OBD); + LASSERTF(LCFG_PARAM == 0x000cf00fUL, "found 0x%.8xUL\n", + (unsigned)LCFG_PARAM); + LASSERTF(LCFG_MARKER == 0x000cf010UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MARKER); + LASSERTF(LCFG_LOG_START == 0x000ce011UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_START); + LASSERTF(LCFG_LOG_END == 0x000ce012UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_END); + LASSERTF(LCFG_LOV_ADD_INA == 0x000ce013UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_INA); + LASSERTF(LCFG_ADD_MDC == 0x000cf014UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_MDC); + LASSERTF(LCFG_DEL_MDC == 0x000cf015UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MDC); + LASSERTF(LCFG_SPTLRPC_CONF == 0x000ce016UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SPTLRPC_CONF); + LASSERTF(LCFG_POOL_NEW == 0x000ce020UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_NEW); + LASSERTF(LCFG_POOL_ADD == 0x000ce021UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_ADD); + LASSERTF(LCFG_POOL_REM == 0x000ce022UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_REM); + LASSERTF(LCFG_POOL_DEL == 0x000ce023UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_DEL); + LASSERTF(LCFG_SET_LDLM_TIMEOUT == 0x000ce030UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_LDLM_TIMEOUT); + LASSERTF(LCFG_PRE_CLEANUP == 0x000cf031UL, "found 0x%.8xUL\n", + (unsigned)LCFG_PRE_CLEANUP); + LASSERTF(LCFG_SET_PARAM == 0x000ce032UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_PARAM); + LASSERTF(LCFG_NODEMAP_ADD == 0x000ce040UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD); + LASSERTF(LCFG_NODEMAP_DEL == 0x000ce041UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL); + LASSERTF(LCFG_NODEMAP_ADD_RANGE == 0x000ce042UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_RANGE); + LASSERTF(LCFG_NODEMAP_DEL_RANGE == 0x000ce043UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_RANGE); + LASSERTF(LCFG_NODEMAP_ADD_UIDMAP == 0x000ce044UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_UIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_UIDMAP == 0x000ce045UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_UIDMAP); + LASSERTF(LCFG_NODEMAP_ADD_GIDMAP == 0x000ce046UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_GIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_GIDMAP == 0x000ce047UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_GIDMAP); + LASSERTF(LCFG_NODEMAP_ACTIVATE == 0x000ce048UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ACTIVATE); + LASSERTF(LCFG_NODEMAP_ADMIN == 0x000ce049UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADMIN); + LASSERTF(LCFG_NODEMAP_TRUSTED == 0x000ce050UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TRUSTED); + LASSERTF(LCFG_NODEMAP_SQUASH_UID == 0x000ce051UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_UID); + LASSERTF(LCFG_NODEMAP_SQUASH_GID == 0x000ce052UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_GID); + LASSERTF(LCFG_NODEMAP_ADD_SHKEY == 0x000ce053UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_SHKEY); + LASSERTF(LCFG_NODEMAP_DEL_SHKEY == 0x000ce054UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_SHKEY); + LASSERTF(LCFG_NODEMAP_TEST_NID == 0x000ce055UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_NID); + 
LASSERTF(LCFG_NODEMAP_TEST_ID == 0x000ce056UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_ID); + LASSERTF(LCFG_NODEMAP_SET_FILESET == 0x000ce057UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_FILESET); + LASSERTF(LCFG_NODEMAP_DENY_UNKNOWN == 0x000ce058UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DENY_UNKNOWN); + LASSERTF(LCFG_NODEMAP_MAP_MODE == 0x000ce059UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_MAP_MODE); + LASSERTF(LCFG_NODEMAP_AUDIT_MODE == 0x000ce05aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_AUDIT_MODE); + LASSERTF(LCFG_NODEMAP_SET_SEPOL == 0x000ce05bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_SEPOL); + LASSERTF(PORTALS_CFG_TYPE == 1, "found %lld\n", + (long long)PORTALS_CFG_TYPE); + LASSERTF(LUSTRE_CFG_TYPE == 123, "found %lld\n", + (long long)LUSTRE_CFG_TYPE); } diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c index 6145e0e37a711..54b3e567b3605 100644 --- a/drivers/staging/lustrefsx/lustre/target/barrier.c +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * lustre/target/barrier.c * @@ -35,12 +35,11 @@ #include -#include #include #include #include #include -#include +#include static LIST_HEAD(barrier_instance_list); static DEFINE_SPINLOCK(barrier_instance_lock); @@ -53,7 +52,7 @@ struct barrier_instance { rwlock_t bi_rwlock; struct percpu_counter bi_writers; atomic_t bi_ref; - time_t bi_deadline; + time64_t bi_deadline; __u32 bi_status; }; @@ -173,7 +172,7 @@ static void barrier_set(struct barrier_instance *barrier, __u32 status) static int barrier_freeze(const struct lu_env *env, struct barrier_instance *barrier, bool phase1) { - int left; + time64_t left; int rc = 0; __s64 inflight = 0; ENTRY; @@ -195,7 +194,7 @@ static int barrier_freeze(const struct lu_env *env, LASSERT(barrier->bi_deadline != 0); - left = barrier->bi_deadline - cfs_time_current_sec(); + left = barrier->bi_deadline - ktime_get_real_seconds(); if (left <= 0) RETURN(1); @@ -214,8 +213,7 @@ static int barrier_freeze(const struct lu_env *env, if (rc) RETURN(rc); - if (cfs_time_beforeq(barrier->bi_deadline, - cfs_time_current_sec())) + if (ktime_get_real_seconds() > barrier->bi_deadline) RETURN(1); } @@ -252,7 +250,7 @@ bool barrier_entry(struct dt_device *key) if (likely(barrier->bi_status != BS_FREEZING_P1 && barrier->bi_status != BS_FREEZING_P2 && barrier->bi_status != BS_FROZEN) || - cfs_time_beforeq(barrier->bi_deadline, cfs_time_current_sec())) { + ktime_get_real_seconds() > barrier->bi_deadline) { percpu_counter_inc(&barrier->bi_writers); entered = true; } @@ -292,7 +290,7 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) ENTRY; /* glimpse on barrier locks always packs a glimpse descriptor */ - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_DESC_CALLBACK); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC); desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); if (!desc) GOTO(out, rc = -EPROTO); @@ -326,8 +324,8 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) GOTO(fini, rc = -EINVAL); - barrier->bi_deadline = cfs_time_current_sec() + - desc->lgbd_timeout; + barrier->bi_deadline = ktime_get_real_seconds() + + desc->lgbd_timeout; rc = barrier_freeze(&env, barrier, desc->lgbd_status == BS_FREEZING_P1); break; @@ -358,7 +356,7 @@ int barrier_handler(struct 
dt_device *key, struct ptlrpc_request *req) lvb->lvb_index = barrier_dev_idx(barrier); CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " - "deadline %lu: rc = %d\n", barrier_barrier2name(barrier), + "deadline %lld: rc = %d\n", barrier_barrier2name(barrier), lvb->lvb_status, barrier->bi_deadline, rc); barrier_instance_put(barrier); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c b/drivers/staging/lustrefsx/lustre/target/out_handler.c index c342ae41f95c0..a238f588e0cd1 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_handler.c +++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * lustre/target/out_handler.c * @@ -52,7 +52,7 @@ static void out_reconstruct(const struct lu_env *env, struct dt_device *dt, struct object_update_reply *reply, int index) { - CDEBUG(D_INFO, "%s: fork reply reply %p index %d: rc = %d\n", + CDEBUG(D_HA, "%s: fork reply reply %p index %d: rc = %d\n", dt_obd_name(dt), reply, index, 0); object_update_result_insert(reply, NULL, 0, index, 0); @@ -65,16 +65,10 @@ typedef void (*out_reconstruct_t)(const struct lu_env *env, struct object_update_reply *reply, int index); -static inline int out_check_resent(const struct lu_env *env, - struct dt_device *dt, - struct dt_object *obj, - struct ptlrpc_request *req, - out_reconstruct_t reconstruct, - struct object_update_reply *reply, - int index) +static inline bool out_check_resent(struct ptlrpc_request *req) { if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) - return 0; + return false; if (req_xid_is_last(req)) { struct lsd_client_data *lcd; @@ -90,14 +84,12 @@ static inline int out_check_resent(const struct lu_env *env, lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); lustre_msg_set_status(req->rq_repmsg, req->rq_status); - DEBUG_REQ(D_RPCTRACE, req, "restoring resent RPC"); - - reconstruct(env, dt, obj, reply, index); - return 1; + DEBUG_REQ(D_HA, req, "reconstruct resent RPC"); + return true; } - DEBUG_REQ(D_HA, req, "no reply for RESENT req (have %lld)", - req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); - return 0; + DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld", + req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); + return false; } static int out_create(struct tgt_session_info *tsi) @@ -289,10 +281,62 @@ static int out_xattr_get(struct tgt_session_info *tsi) } else if (lbuf->lb_buf) { lbuf->lb_len = rc; } - - CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d: rc = %d\n", + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d\n", tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), - name, (int)lbuf->lb_len, rc); + name, rc); + + GOTO(out, rc); + +out: + object_update_result_insert(reply, lbuf->lb_buf, lbuf->lb_len, idx, rc); + RETURN(0); +} + +static int out_xattr_list(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = tti->tti_u.update.tti_dt_object; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + update_result = object_update_result_get(reply, 0, NULL); + if 
(!update_result) { + rc = -EPROTO; + CERROR("%s: empty buf for xattr list: rc = %d\n", + tgt_name(tsi->tsi_tgt), rc); + RETURN(rc); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + lbuf->lb_buf = update_result->our_data; + if (lbuf->lb_len == 0) + lbuf->lb_buf = 0; + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_xattr_list(env, obj, lbuf); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + + CDEBUG(D_INFO, "%s: "DFID" list xattr len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), rc); /* Since we directly use update_result->our_data as the lbuf->lb_buf, * then use NULL for result_insert to avoid unnecessary memory copy. */ @@ -759,6 +803,8 @@ static struct tgt_handler out_update_ops[] = { DEF_OUT_HNDL(OUT_WRITE, "out_write", MUTABOR | HABEO_REFERO, out_write), DEF_OUT_HNDL(OUT_READ, "out_read", HABEO_REFERO, out_read), DEF_OUT_HNDL(OUT_NOOP, "out_noop", HABEO_REFERO, out_noop), + DEF_OUT_HNDL(OUT_XATTR_LIST, "out_xattr_list", HABEO_REFERO, + out_xattr_list), }; static struct tgt_handler *out_handler_find(__u32 opc) @@ -917,6 +963,8 @@ int out_handle(struct tgt_session_info *tsi) int rc1 = 0; int ouh_size, reply_size; int updates; + bool need_reconstruct; + ENTRY; req_capsule_set(pill, &RQF_OUT_UPDATE); @@ -1054,6 +1102,8 @@ int out_handle(struct tgt_session_info *tsi) tti->tti_u.update.tti_update_reply = reply; tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); + need_reconstruct = out_check_resent(pill->rc_req); + /* Walk through updates in the request to execute them */ for (i = 0; i < update_buf_count; i++) { struct tgt_handler *h; @@ -1101,12 +1151,19 @@ int out_handle(struct tgt_session_info *tsi) /* Check resend case only for modifying RPC */ if (h->th_flags & MUTABOR) { - struct ptlrpc_request *req = tgt_ses_req(tsi); + /* sanity check for last XID changing */ + if (unlikely(!need_reconstruct && + req_xid_is_last(pill->rc_req))) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "unexpected last XID change"); + GOTO(next, rc = -EINVAL); + } - if (out_check_resent(env, dt, dt_obj, req, - out_reconstruct, reply, - reply_index)) + if (need_reconstruct) { + out_reconstruct(env, dt, dt_obj, reply, + reply_index); GOTO(next, rc = 0); + } if (dt->dd_rdonly) GOTO(next, rc = -EROFS); @@ -1115,6 +1172,10 @@ int out_handle(struct tgt_session_info *tsi) /* start transaction for modification RPC only */ if (h->th_flags & MUTABOR && current_batchid == -1) { current_batchid = update->ou_batchid; + + if (reply_index == 0) + CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + rc = out_tx_start(env, dt, ta, tsi->tsi_exp); if (rc != 0) GOTO(next, rc); diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c index c267ed20bf485..e8ebf95f4786c 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_lib.c +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2015, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
*/ /* * lustre/target/out_lib.c @@ -53,6 +53,7 @@ const char *update_op_str(__u16 opc) [OUT_ATTR_GET] = "attr_get", [OUT_XATTR_SET] = "xattr_set", [OUT_XATTR_GET] = "xattr_get", + [OUT_XATTR_LIST] = "xattr_list", [OUT_INDEX_LOOKUP] = "lookup", [OUT_INDEX_INSERT] = "insert", [OUT_INDEX_DELETE] = "delete", @@ -102,7 +103,7 @@ int out_update_header_pack(const struct lu_env *env, unsigned int i; size_t update_size; - if (((reply_size + 7) >> 3) >= 1ULL << 16) + if (reply_size >= LNET_MTU) return -EINVAL; /* Check whether the packing exceeding the maxima update length */ @@ -404,6 +405,15 @@ int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, } EXPORT_SYMBOL(out_xattr_get_pack); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize) +{ + return out_update_pack(env, update, max_update_size, OUT_XATTR_LIST, + fid, 0, NULL, NULL, bufsize); +} +EXPORT_SYMBOL(out_xattr_list_pack); + int out_read_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_size, const struct lu_fid *fid, size_t size, loff_t pos) @@ -588,6 +598,10 @@ int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, struct tx_arg *arg; int rc; + /* LU-13653: ignore quota for DNE directory creation */ + if (dof->dof_type == DFT_DIR) + th->th_ignore_quota = 1; + rc = dt_declare_create(env, obj, attr, NULL, dof, th); if (rc != 0) return rc; @@ -657,6 +671,10 @@ int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, if (rc != 0) return rc; + if (attr->la_valid & LA_FLAGS && + attr->la_flags & LUSTRE_SET_SYNC_FL) + th->th_sync |= 1; + arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, file, line); if (IS_ERR(arg)) @@ -797,8 +815,7 @@ static int out_tx_xattr_set_exec(const struct lu_env *env, lu_buf_free(&tbuf); if (update) { - leh->leh_overflow_time = - cfs_time_current_sec(); + leh->leh_overflow_time = ktime_get_real_seconds(); if (unlikely(!leh->leh_overflow_time)) leh->leh_overflow_time++; } @@ -1060,7 +1077,7 @@ static int out_obj_index_insert(const struct lu_env *env, return -ENOTDIR; dt_write_lock(env, dt_obj, MOR_TGT_CHILD); - rc = dt_insert(env, dt_obj, rec, key, th, 0); + rc = dt_insert(env, dt_obj, rec, key, th); dt_write_unlock(env, dt_obj); return rc; diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c new file mode 100644 index 0000000000000..afbf668e38a70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Copyright (c) 2019, DDN Storage Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/target/tgt_fmd.c + * + * This file provides functions to handle Filter Modification Data (FMD). + * The FMD is responsible for file attributes to be applied in + * Transaction ID (XID) order, so older requests can't re-write newer + * attributes. + * + * FMD is organized as per-client list and identified by FID of object. Each + * FMD stores FID of object and the highest received XID of modification + * request for this object. + * + * FMD can expire if there are no updates for a long time to keep the list + * reasonably small. + * + * Author: Andreas Dilger + * Author: Mike Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +/** + * Drop FMD reference and free it if reference drops to zero. + * + * Must be called with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +static inline void tgt_fmd_put_nolock(struct obd_export *exp, + struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + assert_spin_locked(&ted->ted_fmd_lock); + if (--fmd->fmd_refcount == 0) { + ted->ted_fmd_count--; + list_del(&fmd->fmd_list); + OBD_SLAB_FREE_PTR(fmd, tgt_fmd_kmem); + } +} + +/** + * Wrapper to drop FMD reference with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +void tgt_fmd_put(struct obd_export *exp, struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_put_nolock(exp, fmd); /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Expire FMD entries. + * + * Expire entries from the FMD list if there are too many + * of them or they are too old. + * + * This function must be called with ted_fmd_lock held. + * + * The \a keep FMD is not to be expired in any case. This parameter is used + * by ofd_fmd_find_nolock() to prohibit a FMD that was just found from + * expiring. + * + * \param[in] exp OBD export + * \param[in] keep FMD to keep always + */ +static void tgt_fmd_expire_nolock(struct obd_export *exp, + struct tgt_fmd_data *keep) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + struct tgt_fmd_data *fmd, *tmp; + + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + if (fmd == keep) + break; + + if (now < fmd->fmd_expire && + ted->ted_fmd_count < lut->lut_fmd_max_num) + break; + + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); /* list reference */ + } +} + +/** + * Expire FMD entries. + * + * This is a wrapper to call ofd_fmd_expire_nolock() with the required lock. + * + * \param[in] exp OBD export + */ +void tgt_fmd_expire(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_expire_nolock(exp, NULL); + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Find FMD by specified FID. + * + * Function finds FMD entry by FID in the tg_export_data::ted_fmd_list. + * + * Caller must hold tg_export_data::ted_fmd_lock and take FMD reference. 
+ * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL is FMD is not found + */ +static struct tgt_fmd_data *tgt_fmd_find_nolock(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + + assert_spin_locked(&ted->ted_fmd_lock); + + list_for_each_entry_reverse(fmd, &ted->ted_fmd_list, fmd_list) { + if (lu_fid_eq(&fmd->fmd_fid, fid)) { + found = fmd; + list_move_tail(&fmd->fmd_list, &ted->ted_fmd_list); + fmd->fmd_expire = now + lut->lut_fmd_max_age; + break; + } + } + + tgt_fmd_expire_nolock(exp, found); + + return found; +} + +/** + * Find FMD by specified FID with locking. + * + * Wrapper to the ofd_fmd_find_nolock() with correct locks. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_find(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) + fmd->fmd_refcount++; /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); + + return fmd; +} + +/** + * Find FMD by FID or create a new one if none is found. + * + * It is possible for this function to return NULL under memory pressure, + * or if the passed FID is zero (which will only cause old entries to expire). + * Currently this is not fatal because any FMD state is transient and + * may also be freed when it gets sufficiently old. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_get(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd_new = NULL; + + OBD_SLAB_ALLOC_PTR(fmd_new, tgt_fmd_kmem); + + spin_lock(&ted->ted_fmd_lock); + found = tgt_fmd_find_nolock(exp, fid); + if (fmd_new) { + if (!found) { + list_add_tail(&fmd_new->fmd_list, &ted->ted_fmd_list); + fmd_new->fmd_fid = *fid; + fmd_new->fmd_refcount++; /* list reference */ + found = fmd_new; + ted->ted_fmd_count++; + } else { + OBD_SLAB_FREE_PTR(fmd_new, tgt_fmd_kmem); + } + } + if (found) { + found->fmd_refcount++; /* caller reference */ + found->fmd_expire = ktime_get_seconds() + + class_exp2tgt(exp)->lut_fmd_max_age; + } else { + LCONSOLE_WARN("%s: cannot allocate FMD for "DFID + ", timestamps may be out of sync\n", + exp->exp_obd->obd_name, PFID(fid)); + } + spin_unlock(&ted->ted_fmd_lock); + + return found; +} + +#ifdef DO_FMD_DROP +/** + * Drop FMD list reference so it will disappear when last reference is dropped + * to zero. + * + * This function is called from ofd_destroy() and may only affect + * the one client that is doing the unlink and at worst we have an stale entry + * referencing an object that should never be used again. + * + * NB: this function is used only if DO_FMD_DROP is defined. It is not + * currently defined, so FMD drop doesn't happen and FMD are dropped only + * when expired. 
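/*
 * A minimal userspace sketch of the find-or-create pattern used by
 * tgt_fmd_get() above: allocate a candidate entry before taking the
 * lock (allocation may sleep or fail), search the list under the lock,
 * link the candidate in only if nothing was found, and free it if the
 * key already existed. The entry layout and names are invented for
 * illustration; only the locking/refcount discipline mirrors the code.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	int key;
	int refcount;		/* the list itself holds one reference */
};

static struct entry *list_head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *entry_find_locked(int key)
{
	struct entry *e;

	for (e = list_head; e != NULL; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

/* Find an entry by key, creating it if missing (cf. tgt_fmd_get()). */
static struct entry *entry_get(int key)
{
	struct entry *found, *fresh;

	fresh = calloc(1, sizeof(*fresh));	/* may fail; handled below */

	pthread_mutex_lock(&list_lock);
	found = entry_find_locked(key);
	if (found == NULL && fresh != NULL) {
		fresh->key = key;
		fresh->refcount = 1;		/* list reference */
		fresh->next = list_head;
		list_head = fresh;
		found = fresh;
		fresh = NULL;			/* ownership moved to the list */
	}
	if (found != NULL)
		found->refcount++;		/* caller reference */
	pthread_mutex_unlock(&list_lock);

	free(fresh);		/* key existed: drop the unused spare */
	return found;		/* NULL only if allocation failed and key was absent */
}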
+ * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to drop + */ +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) { + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); +} +EXPORT_SYMBOL(tgt_fmd_drop); +#endif + +/** + * Remove all entries from FMD list. + * + * Cleanup function to free all FMD enries on the given export. + * + * \param[in] exp OBD export + */ +void tgt_fmd_cleanup(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL, *tmp; + + spin_lock(&ted->ted_fmd_lock); + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + list_del_init(&fmd->fmd_list); + if (fmd->fmd_refcount > 1) { + CDEBUG(D_INFO, + "fmd %p still referenced (refcount = %d)\n", + fmd, fmd->fmd_refcount); + } + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); + LASSERT(list_empty(&exp->exp_target_data.ted_fmd_list)); +} + +/** + * Update FMD with the latest request XID. + * + * Save a new setattr/punch XID in FMD if exists. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + + fmd = tgt_fmd_get(exp, fid); + if (fmd) { + if (fmd->fmd_mactime_xid < xid) + fmd->fmd_mactime_xid = xid; + tgt_fmd_put(exp, fmd); + } +} +EXPORT_SYMBOL(tgt_fmd_update); + +/** + * Chech that time can be updated by the request with given XID. + * + * Check FMD XID if exists to be less than supplied XID + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + * + * \retval true if FMD has no greater XID, so time attr can be updated + */ +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + bool can_update = true; + + fmd = tgt_fmd_find(exp, fid); + if (fmd) { + can_update = fmd->fmd_mactime_xid < xid; + tgt_fmd_put(exp, fmd); + } + + return can_update; +} +EXPORT_SYMBOL(tgt_fmd_check); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c index 083e40020f1fc..3c5eec062cb4e 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * lustre/target/tgt_grant.c @@ -71,7 +71,7 @@ * Author: Johann Lombardi */ -#define DEBUG_SUBSYSTEM S_FILTER +#define DEBUG_SUBSYSTEM S_CLASS #include #include @@ -138,11 +138,6 @@ static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, struct tg_export_data *ted = &exp->exp_target_data; int level = D_CACHE; - if (exp->exp_obd->obd_self_export == exp) - CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " - "%ld\n", exp->exp_obd->obd_name, ted->ted_grant, - ted->ted_pending, ted->ted_dirty); - if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) level = D_ERROR; CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", @@ -188,6 +183,7 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) struct lu_target *lut = obd->u.obt.obt_lut; struct tg_grants_data *tgd = &lut->lut_tgd; struct obd_export *exp; + struct tg_export_data *ted; u64 maxsize; u64 tot_dirty = 0; u64 tot_pending = 0; @@ -209,6 +205,15 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) spin_lock(&obd->obd_dev_lock); spin_lock(&tgd->tgd_grant_lock); + exp = obd->obd_self_export; + ted = &exp->exp_target_data; + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + tot_granted += ted->ted_grant + ted->ted_pending; + tot_pending += ted->ted_pending; + tot_dirty += ted->ted_dirty; + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, &tot_granted, maxsize); @@ -275,14 +280,14 @@ EXPORT_SYMBOL(tgt_grant_sanity_check); * \retval negative value on error */ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, - struct obd_statfs *osfs, __u64 max_age, int *from_cache) + struct obd_statfs *osfs, time64_t max_age, int *from_cache) { struct tg_grants_data *tgd = &lut->lut_tgd; int rc = 0; ENTRY; spin_lock(&tgd->tgd_osfs_lock); - if (cfs_time_before_64(tgd->tgd_osfs_age, max_age) || max_age == 0) { + if (tgd->tgd_osfs_age < max_age || max_age == 0) { u64 unstable; /* statfs data are too old, get up-to-date one. 
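/*
 * tgt_statfs_internal() now works on plain time64_t seconds instead of
 * jiffies-based cfs_time values: the caller passes the oldest timestamp
 * it will accept (max_age), and the cached statfs is reused only while
 * it is newer than that; max_age == 0 forces a refresh. A stripped-down
 * sketch of just that caching rule follows -- the struct, the refresh
 * helper and the caller note are invented for illustration, and all
 * locking and grant bookkeeping are left out.
 */
#include <stdbool.h>
#include <time.h>

struct statfs_cache {
	long long cached_value;	/* stand-in for struct obd_statfs */
	time_t age;		/* when cached_value was last refreshed */
};

/* Placeholder for the expensive backend statfs query. */
static long long slow_refresh(void)
{
	return 42;
}

static long long cached_statfs(struct statfs_cache *c, time_t max_age,
			       bool *from_cache)
{
	if (c->age < max_age || max_age == 0) {
		c->cached_value = slow_refresh();
		c->age = time(NULL);
		*from_cache = false;
	} else {
		*from_cache = true;
	}
	return c->cached_value;
}

/*
 * A caller wanting data no older than OBD_STATFS_CACHE_SECONDS would do
 * roughly: cached_statfs(&cache, time(NULL) - OBD_STATFS_CACHE_SECONDS,
 * &from_cache), mirroring what tgt_grant_statfs() does below.
 */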
@@ -308,6 +313,8 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, if (unlikely(rc)) GOTO(out, rc); + osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); + spin_lock(&tgd->tgd_grant_lock); spin_lock(&tgd->tgd_osfs_lock); /* calculate how much space was written while we released the @@ -337,7 +344,7 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, /* finally udpate cached statfs data */ tgd->tgd_osfs = *osfs; - tgd->tgd_osfs_age = cfs_time_current_64(); + tgd->tgd_osfs_age = ktime_get_seconds(); tgd->tgd_statfs_inflight--; /* stop tracking */ if (tgd->tgd_statfs_inflight == 0) @@ -383,13 +390,13 @@ static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, struct tg_grants_data *tgd = &lut->lut_tgd; struct tgt_thread_info *tti; struct obd_statfs *osfs; - __u64 max_age; - int rc; + time64_t max_age; + int rc; if (force) max_age = 0; /* get fresh statfs data */ else - max_age = cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS); + max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS; tti = tgt_th_info(env); osfs = &tti->tti_u.osfs; @@ -428,6 +435,7 @@ static u64 tgt_grant_space_left(struct obd_export *exp) u64 left; u64 avail; u64 unstable; + u64 reserved; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -438,7 +446,8 @@ static u64 tgt_grant_space_left(struct obd_export *exp) unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ spin_unlock(&tgd->tgd_osfs_lock); - tot_granted = tgd->tgd_tot_granted; + reserved = left * tgd->tgd_reserved_pcnt / 100; + tot_granted = tgd->tgd_tot_granted + reserved; if (left < tot_granted) { int mask = (left + unstable < @@ -490,8 +499,7 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, struct tg_export_data *ted = &exp->exp_target_data; struct obd_device *obd = exp->exp_obd; struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - long dirty; - long dropped; + long long dirty, dropped; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -515,10 +523,19 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, /* inflate grant counters if required */ if (!exp_grant_param_supp(exp)) { + u64 tmp; oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant); oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty); - oa->o_dropped = tgt_grant_inflate(tgd, (u64)oa->o_dropped); - oa->o_undirty = tgt_grant_inflate(tgd, oa->o_undirty); + /* inflation can bump client's wish to >4GB which doesn't fit + * 32bit o_undirty, limit that .. 
*/ + tmp = tgt_grant_inflate(tgd, oa->o_undirty); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_undirty = tmp; + tmp = tgt_grant_inflate(tgd, oa->o_dropped); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_dropped = tmp; } dirty = oa->o_dirty; @@ -533,13 +550,13 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, tgd->tgd_tot_dirty += dirty - ted->ted_dirty; if (ted->ted_grant < dropped) { CDEBUG(D_CACHE, - "%s: cli %s/%p reports %lu dropped > grant %lu\n", + "%s: cli %s/%p reports %llu dropped > grant %lu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, ted->ted_grant); dropped = 0; } if (tgd->tgd_tot_granted < dropped) { - CERROR("%s: cli %s/%p reports %lu dropped > tot_grant %llu\n", + CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, tgd->tgd_tot_granted); dropped = 0; @@ -588,6 +605,14 @@ static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa, grant_shrink = oa->o_grant; + if (ted->ted_grant < grant_shrink) { + CDEBUG(D_CACHE, + "%s: cli %s/%p wants %lu shrinked > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + grant_shrink, ted->ted_grant); + grant_shrink = ted->ted_grant; + } + ted->ted_grant -= grant_shrink; tgd->tgd_tot_granted -= grant_shrink; @@ -859,6 +884,7 @@ static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp, * have * \param[in] left remaining free space with granted space taken * out + * \param[in] chunk grant allocation unit * \param[in] conservative if set to true, the server should be cautious * and limit how much space is granted back to the * client. Otherwise, the server should try hard to @@ -877,6 +903,9 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT)) + RETURN(0); + /* When tgd_grant_compat_disable is set, we don't grant any space to * clients not supporting OBD_CONNECT_GRANT_PARAM. 
* Otherwise, space granted to such a client is inflated since it @@ -928,18 +957,19 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, * client would like to have by more than grants for 2 full * RPCs */ + if (want + chunk <= ted->ted_grant) + RETURN(0); if (ted->ted_grant + grant > want + chunk) grant = want + chunk - ted->ted_grant; tgd->tgd_tot_granted += grant; ted->ted_grant += grant; - if (ted->ted_grant < 0) { + if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) { CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, ted->ted_grant, want, curgrant); spin_unlock(&tgd->tgd_grant_lock); - LBUG(); } CDEBUG(D_CACHE, @@ -1053,28 +1083,51 @@ EXPORT_SYMBOL(tgt_grant_connect); void tgt_grant_discard(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; - struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + struct lu_target *lut = class_exp2tgt(exp); struct tg_export_data *ted = &exp->exp_target_data; + struct tg_grants_data *tgd; + + if (!lut) + return; + tgd = &lut->lut_tgd; spin_lock(&tgd->tgd_grant_lock); - LASSERTF(tgd->tgd_tot_granted >= ted->ted_grant, - "%s: tot_granted %llu cli %s/%p ted_grant %ld\n", - obd->obd_name, tgd->tgd_tot_granted, - exp->exp_client_uuid.uuid, exp, ted->ted_grant); - tgd->tgd_tot_granted -= ted->ted_grant; + if (unlikely(tgd->tgd_tot_granted < ted->ted_grant || + tgd->tgd_tot_dirty < ted->ted_dirty)) { + struct obd_export *e; + u64 ttg = 0; + u64 ttd = 0; + + list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) { + LASSERT(exp != e); + ttg += e->exp_target_data.ted_grant; + ttg += e->exp_target_data.ted_pending; + ttd += e->exp_target_data.ted_dirty; + } + if (tgd->tgd_tot_granted < ted->ted_grant) + CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, ted->ted_grant, ttg); + if (tgd->tgd_tot_dirty < ted->ted_dirty) + CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_dirty, ted->ted_dirty, ttd); + tgd->tgd_tot_granted = ttg; + tgd->tgd_tot_dirty = ttd; + } else { + tgd->tgd_tot_granted -= ted->ted_grant; + tgd->tgd_tot_dirty -= ted->ted_dirty; + } ted->ted_grant = 0; - LASSERTF(tgd->tgd_tot_pending >= ted->ted_pending, - "%s: tot_pending %llu cli %s/%p ted_pending %ld\n", - obd->obd_name, tgd->tgd_tot_pending, - exp->exp_client_uuid.uuid, exp, ted->ted_pending); + ted->ted_dirty = 0; + + if (tgd->tgd_tot_pending < ted->ted_pending) { + CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n", + obd->obd_name, tgd->tgd_tot_pending, + exp->exp_client_uuid.uuid, exp, ted->ted_pending); + } /* tgd_tot_pending is handled in tgt_grant_commit as bulk * commmits */ - LASSERTF(tgd->tgd_tot_dirty >= ted->ted_dirty, - "%s: tot_dirty %llu cli %s/%p ted_dirty %ld\n", - obd->obd_name, tgd->tgd_tot_dirty, - exp->exp_client_uuid.uuid, exp, ted->ted_dirty); - tgd->tgd_tot_dirty -= ted->ted_dirty; - ted->ted_dirty = 0; spin_unlock(&tgd->tgd_grant_lock); } EXPORT_SYMBOL(tgt_grant_discard); @@ -1509,3 +1562,131 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, RETURN(rc); } EXPORT_SYMBOL(tgt_grant_commit_cb_add); + +/** + * Show estimate of total amount of dirty data on clients. 
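/*
 * The reworked tgt_grant_discard() above drops the old LASSERTF()s:
 * when the aggregate grant/dirty totals turn out lower than the share
 * being removed, it rebuilds them from the remaining exports and emits
 * a CERROR instead of crashing. The toy function below shows that
 * "repair instead of assert" idea on a plain array; the names and types
 * are invented for illustration.
 */
#include <stdio.h>

/* Subtract one client's share from a running total, resyncing on underflow. */
static unsigned long long discard_share(unsigned long long total,
					const unsigned long long *shares,
					int nr, int victim)
{
	if (total < shares[victim]) {
		unsigned long long resync = 0;
		int i;

		/* Counters drifted: rebuild the total from the survivors. */
		for (i = 0; i < nr; i++)
			if (i != victim)
				resync += shares[i];
		fprintf(stderr, "total %llu < share %llu, corrected to %llu\n",
			total, shares[victim], resync);
		return resync;
	}
	return total - shares[victim];
}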
+ * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty); +} +EXPORT_SYMBOL(tot_dirty_show); + +/** + * Show total amount of space granted to clients. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted); +} +EXPORT_SYMBOL(tot_granted_show); + +/** + * Show total amount of space used by IO in progress. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending); +} +EXPORT_SYMBOL(tot_pending_show); + +/** + * Show if grants compatibility mode is disabled. + * + * When tgd_grant_compat_disable is set, we don't grant any space to clients + * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such + * a client is inflated since it consumes PAGE_SIZE of grant space per + * block, (i.e. typically 4kB units), but underlaying file system might have + * block size bigger than page size, e.g. ZFS. See LU-2049 for details. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: string length of @buf output on success + */ +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + + return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable); +} +EXPORT_SYMBOL(grant_compat_disable_show); + +/** + * Change grant compatibility mode. + * + * Setting tgd_grant_compat_disable prohibit any space granting to clients + * not supporting OBD_CONNECT_GRANT_PARAM. See details above. 
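/*
 * Grant inflation, described in the grant_compat_disable_show() comment
 * just above, exists because a client without OBD_CONNECT_GRANT_PARAM
 * accounts one PAGE_SIZE of grant per dirty block while the backend may
 * allocate a much larger block (for example a ZFS record) for that
 * page. The toy calculation below shows the worst-case gap the server
 * has to cover; it is a simplified model with made-up sizes, not the
 * actual tgt_grant_inflate() arithmetic, whose body is not shown here.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long long page_size = 4096;	      /* client grant unit */
	const unsigned long long backend_blocksize = 131072;  /* e.g. a 128KiB ZFS record */
	const unsigned long long dirty_pages = 256;	      /* 1MiB of client dirty data */

	/* What the client thinks it consumed. */
	unsigned long long client_view = dirty_pages * page_size;
	/* Worst case on the server: every page dirties its own backend block. */
	unsigned long long server_worst = dirty_pages * backend_blocksize;

	printf("client accounting: %llu bytes, server worst case: %llu bytes\n",
	       client_view, server_worst);
	printf("inflation factor needed: %llux\n", server_worst / client_view);
	return 0;
}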
+ * + * @kobj kobject embedded in obd_device + * @attr unused + * @buffer string which represents mode + * 1: disable compatibility mode + * 0: enable compatibility mode + * @count @buffer length + * + * Return: @count on success + * negative number on error + */ +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + tgd->tgd_grant_compat_disable = val; + + return count; +} +EXPORT_SYMBOL(grant_compat_disable_store); diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c index d2113af69436b..2ec6d01e60d91 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * lustre/target/tgt_handler.c @@ -343,10 +343,13 @@ static int tgt_request_preprocess(struct tgt_session_info *tsi, dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ); if (dlm_req != NULL) { + union ldlm_wire_policy_data *policy = + &dlm_req->lock_desc.l_policy_data; + if (unlikely(dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS && - dlm_req->lock_desc.l_policy_data.\ - l_inodebits.bits == 0)) { + (policy->l_inodebits.bits | + policy->l_inodebits.try_bits) == 0)) { /* * Lock without inodebits makes no sense and * will oops later in ldlm. If client miss to @@ -431,6 +434,20 @@ static int tgt_handle_request0(struct tgt_session_info *tsi, &RMF_ACL, RCL_SERVER, LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER)) { + struct niobuf_remote *remote_nb = + req_capsule_client_get(tsi->tsi_pill, + &RMF_NIOBUF_REMOTE); + struct ost_body *body = tsi->tsi_ost_body; + + req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER, + (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) ? + remote_nb[0].rnb_len : 0); + } + rc = req_capsule_server_pack(tsi->tsi_pill); } @@ -596,8 +613,14 @@ static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) /* opcode was not found in slice */ if (unlikely(s->tos_hs == NULL)) { - CERROR("%s: no handlers for opcode 0x%x\n", tgt_name(tgt), - opc); + static bool printed; + + /* don't print error messages for known unhandled RPCs */ + if (opc != OST_FALLOCATE && opc != OST_SEEK && !printed) { + CERROR("%s: no handler for opcode 0x%x from %s\n", + tgt_name(tgt), opc, libcfs_id2str(req->rq_peer)); + printed = true; + } RETURN(ERR_PTR(-ENOTSUPP)); } @@ -645,6 +668,19 @@ static int process_req_last_xid(struct ptlrpc_request *req) RETURN(-EPROTO); } + /* The "last_xid" is the minimum xid among unreplied requests, + * if the request is from the previous connection, its xid can + * still be larger than "exp_last_xid", then the above check of + * xid is not enough to determine whether the request is delayed. 
+ * + * For example, if some replay request was delayed and caused + * timeout at client and the replay is restarted, the delayed + * replay request will have the larger xid than "exp_last_xid" + */ + if (req->rq_export->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) + RETURN(-ESTALE); + /* try to release in-memory reply data */ if (tgt_is_multimodrpcs_client(req->rq_export)) { tgt_handle_received_xid(req->rq_export, @@ -671,8 +707,18 @@ int tgt_request_handle(struct ptlrpc_request *req) bool is_connect = false; ENTRY; - /* Refill the context, to make sure all thread keys are allocated */ - lu_env_refill(req->rq_svc_thread->t_env); + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 0 && + lustre_msg_get_opc(msg) != OBD_PING && + lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) { + struct l_wait_info lwi = { 0 }; + + cfs_fail_val = 1; + cfs_race_state = 0; + l_wait_event(cfs_race_waitq, (cfs_race_state == 1), + &lwi); + } + } req_capsule_init(&req->rq_pill, req, RCL_SERVER); tsi->tsi_pill = &req->rq_pill; @@ -836,9 +882,9 @@ EXPORT_SYMBOL(tgt_counter_incr); int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp) { - struct lu_target *tgt = class_exp2tgt(exp); - struct sptlrpc_flavor flvr; - int rc = 0; + struct lu_target *tgt = class_exp2tgt(exp); + struct sptlrpc_flavor flvr; + int rc = 0; LASSERT(tgt); LASSERT(tgt->lut_obd); @@ -863,13 +909,13 @@ int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp exp->exp_sp_peer = req->rq_sp_from; exp->exp_flvr = flvr; - /* when on mgs, if no restriction is set, or if client - * is loopback, allow any flavor */ + /* when on mgs, if no restriction is set, or if the client + * NID is on the local node, allow any flavor + */ if ((strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MGS_NAME) == 0) && (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL || - LNET_NETTYP(LNET_NIDNET(exp->exp_connection->c_peer.nid)) - == LOLND)) + LNetIsPeerLocal(exp->exp_connection->c_peer.nid))) exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && @@ -949,9 +995,19 @@ int tgt_connect(struct tgt_session_info *tsi) reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); spin_lock(&tsi->tsi_exp->exp_lock); *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2) + *exp_connect_flags2_ptr(tsi->tsi_exp) = + reply->ocd_connect_flags2; tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; spin_unlock(&tsi->tsi_exp->exp_lock); + if (strcmp(tsi->tsi_exp->exp_obd->obd_type->typ_name, + LUSTRE_MDT_NAME) == 0) { + rc = req_check_sepol(tsi->tsi_pill); + if (rc) + GOTO(out, rc); + } + RETURN(0); out: obd_disconnect(class_export_get(tsi->tsi_exp)); @@ -965,6 +1021,8 @@ int tgt_disconnect(struct tgt_session_info *tsi) ENTRY; + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DISCONNECT_DELAY, cfs_fail_val); + rc = target_handle_disconnect(tgt_ses_req(tsi)); if (rc) RETURN(err_serious(rc)); @@ -982,7 +1040,16 @@ int tgt_obd_ping(struct tgt_session_info *tsi) ENTRY; - rc = target_handle_ping(tgt_ses_req(tsi)); + /* The target-specific part of OBD_PING request handling. + * It controls Filter Modification Data (FMD) expiration each time + * PING is received. + * + * Valid only for replayable targets, e.g. 
MDT and OFD + */ + if (tsi->tsi_exp->exp_obd->obd_replayable) + tgt_fmd_expire(tsi->tsi_exp); + + rc = req_capsule_server_pack(tsi->tsi_pill); if (rc) RETURN(err_serious(rc)); @@ -1152,7 +1219,6 @@ static int tgt_obd_idx_read(struct tgt_session_info *tsi) struct tgt_handler tgt_obd_handlers[] = { TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), -TGT_OBD_HDL_VAR(0, OBD_LOG_CANCEL, tgt_obd_log_cancel), TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) }; EXPORT_SYMBOL(tgt_obd_handlers); @@ -1216,8 +1282,8 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, if (flag == LDLM_CB_CANCELING && (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && - (tgt->lut_sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL || - (tgt->lut_sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL && + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_ALWAYS || + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING && ldlm_is_cbpending(lock))) && ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || lock->l_resource->lr_type == LDLM_EXTENT)) { @@ -1226,7 +1292,7 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, rc = lu_env_init(&env, LCT_DT_THREAD); if (unlikely(rc != 0)) - RETURN(rc); + GOTO(err, rc); ost_fid_from_resid(&fid, &lock->l_resource->lr_name, tgt->lut_lsd.lsd_osd_index); @@ -1257,7 +1323,7 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, err_env: lu_env_fini(&env); } - +err: rc = ldlm_server_blocking_ast(lock, desc, data, flag); RETURN(rc); } @@ -1329,7 +1395,7 @@ int tgt_cp_callback(struct tgt_session_info *tsi) /* generic LDLM target handler */ struct tgt_handler tgt_dlm_handlers[] = { TGT_DLM_HDL (HABEO_CLAVIS, LDLM_ENQUEUE, tgt_enqueue), -TGT_DLM_HDL_VAR(HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL (HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) }; @@ -1350,30 +1416,6 @@ int tgt_llog_open(struct tgt_session_info *tsi) } EXPORT_SYMBOL(tgt_llog_open); -int tgt_llog_close(struct tgt_session_info *tsi) -{ - int rc; - - ENTRY; - - rc = llog_origin_handle_close(tgt_ses_req(tsi)); - - RETURN(rc); -} -EXPORT_SYMBOL(tgt_llog_close); - - -int tgt_llog_destroy(struct tgt_session_info *tsi) -{ - int rc; - - ENTRY; - - rc = llog_origin_handle_destroy(tgt_ses_req(tsi)); - - RETURN(rc); -} - int tgt_llog_read_header(struct tgt_session_info *tsi) { int rc; @@ -1416,8 +1458,6 @@ TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), -TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_DESTROY, tgt_llog_destroy), -TGT_LLOG_HDL_VAR(0, LLOG_ORIGIN_HANDLE_CLOSE, tgt_llog_close), }; EXPORT_SYMBOL(tgt_llog_handlers); @@ -1567,13 +1607,48 @@ void tgt_io_thread_done(struct ptlrpc_thread *thread) EXIT; } EXPORT_SYMBOL(tgt_io_thread_done); + +/** + * Helper function for getting Data-on-MDT file server DLM lock + * if asked by client. 
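/*
 * The tgt_blocking_ast() change above makes the "sync before cancel"
 * decision depend on lut_sync_lock_cancel: always sync, never sync, or
 * sync only when the cancel is blocking another request, and only for
 * lock modes that can cover dirty data (PW/EX/GROUP). A compact
 * restatement of that decision is sketched below; the enum mirrors the
 * sync_lock_cancel_states[] table that appears later in tgt_main.c, and
 * the function name is invented for illustration.
 */
#include <stdbool.h>

enum slc_policy { SLC_NEVER, SLC_BLOCKING, SLC_ALWAYS };

static bool should_sync_on_cancel(enum slc_policy policy, bool modifying_lock,
				  bool cancel_is_blocking)
{
	if (!modifying_lock)	/* only PW/EX/GROUP grants can cover dirty data */
		return false;
	if (policy == SLC_ALWAYS)
		return true;
	if (policy == SLC_BLOCKING && cancel_is_blocking)
		return true;
	return false;
}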
+ */ +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy = { + .l_inodebits.bits = MDS_INODELOCK_DOM, + }; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + rc = ldlm_cli_enqueue_local(NULL, ns, res_id, LDLM_IBITS, &policy, mode, + flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + + RETURN(rc == ELDLM_OK ? 0 : -EIO); +} +EXPORT_SYMBOL(tgt_mdt_data_lock); + +void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_mdt_data_unlock); + /** * Helper function for getting server side [start, start+count] DLM lock * if asked by client. */ -int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - __u64 start, __u64 end, struct lustre_handle *lh, - int mode, __u64 *flags) +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags) { union ldlm_policy_data policy; int rc; @@ -1596,8 +1671,8 @@ int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, else policy.l_extent.end = end | ~PAGE_MASK; - rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_EXTENT, &policy, mode, - flags, ldlm_blocking_ast, + rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_EXTENT, &policy, + mode, flags, ldlm_blocking_ast, ldlm_completion_ast, ldlm_glimpse_ast, NULL, 0, LVB_T_NONE, NULL, lh); RETURN(rc == ELDLM_OK ? 0 : -EIO); @@ -1611,13 +1686,16 @@ void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode) } EXPORT_SYMBOL(tgt_extent_unlock); -int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - struct obd_ioobj *obj, struct niobuf_remote *nb, - struct lustre_handle *lh, enum ldlm_mode mode) +static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, struct obd_ioobj *obj, + struct niobuf_remote *nb, struct lustre_handle *lh, + enum ldlm_mode mode) { + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; __u64 flags = 0; int nrbufs = obj->ioo_bufcnt; int i; + int rc; ENTRY; @@ -1634,14 +1712,19 @@ int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) RETURN(-EFAULT); - RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset, - nb[nrbufs - 1].rnb_offset + - nb[nrbufs - 1].rnb_len - 1, - lh, mode, &flags)); + /* MDT IO for data-on-mdt */ + if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS) + rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags); + else + rc = tgt_extent_lock(env, ns, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, + lh, mode, &flags); + RETURN(rc); } -void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, - struct lustre_handle *lh, enum ldlm_mode mode) +static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) { ENTRY; @@ -1654,86 +1737,82 @@ void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, tgt_extent_unlock(lh, mode); EXIT; } - -static __u32 tgt_checksum_bulk(struct lu_target *tgt, - struct ptlrpc_bulk_desc *desc, int opc, - cksum_type_t cksum_type) +static int tgt_checksum_niobuf(struct lu_target *tgt, + struct niobuf_local *local_nb, int 
npages, + int opc, enum cksum_types cksum_type, + __u32 *cksum) { - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; unsigned int bufsize; int i, err; unsigned char cfs_alg = cksum_obd2cfs(cksum_type); - __u32 cksum; - - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(hdesc)) { + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { CERROR("%s: unable to initialize checksum hash %s\n", tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(hdesc); + return PTR_ERR(req); } CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); - for (i = 0; i < desc->bd_iov_count; i++) { + for (i = 0; i < npages; i++) { /* corrupt the data before we compute the checksum, to * simulate a client->OST data error */ if (i == 0 && opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { - int off = BD_GET_KIOV(desc, i).kiov_offset & - ~PAGE_MASK; - int len = BD_GET_KIOV(desc, i).kiov_len; + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; struct page *np = tgt_page_to_corrupt; - char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off; if (np) { - char *ptr2 = kmap(np) + off; + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); - memcpy(ptr2, ptr, len); - memcpy(ptr2, "bad3", min(4, len)); - kunmap(np); + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); /* LU-8376 to preserve original index for * display in dump_all_bulk_pages() */ - np->index = BD_GET_KIOV(desc, - i).kiov_page->index; + np->index = i; - BD_GET_KIOV(desc, i).kiov_page = np; + cfs_crypto_hash_update_page(req, np, off, + len); + continue; } else { CERROR("%s: can't alloc page for corruption\n", tgt_name(tgt)); } } - cfs_crypto_hash_update_page(hdesc, - BD_GET_KIOV(desc, i).kiov_page, - BD_GET_KIOV(desc, i).kiov_offset & - ~PAGE_MASK, - BD_GET_KIOV(desc, i).kiov_len); + cfs_crypto_hash_update_page(req, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); /* corrupt the data after we compute the checksum, to * simulate an OST->client data error */ if (i == 0 && opc == OST_READ && OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { - int off = BD_GET_KIOV(desc, i).kiov_offset - & ~PAGE_MASK; - int len = BD_GET_KIOV(desc, i).kiov_len; + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; struct page *np = tgt_page_to_corrupt; - char *ptr = - kmap(BD_GET_KIOV(desc, i).kiov_page) + off; if (np) { - char *ptr2 = kmap(np) + off; + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); - memcpy(ptr2, ptr, len); - memcpy(ptr2, "bad4", min(4, len)); - kunmap(np); + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); /* LU-8376 to preserve original index for * display in dump_all_bulk_pages() */ - np->index = BD_GET_KIOV(desc, - i).kiov_page->index; + np->index = i; - BD_GET_KIOV(desc, i).kiov_page = np; + cfs_crypto_hash_update_page(req, np, off, + len); + continue; } else { CERROR("%s: can't alloc page for corruption\n", tgt_name(tgt)); @@ -1741,17 +1820,17 @@ static __u32 tgt_checksum_bulk(struct lu_target *tgt, } } - bufsize = sizeof(cksum); - err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + bufsize = sizeof(*cksum); + err = cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); - return 
cksum; + return 0; } char dbgcksum_file_name[PATH_MAX]; static void dump_all_bulk_pages(struct obdo *oa, int count, - lnet_kiov_t *iov, __u32 server_cksum, - __u32 client_cksum) + struct niobuf_local *local_nb, + __u32 server_cksum, __u32 client_cksum) { struct file *filp; int rc, i; @@ -1768,9 +1847,9 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, - (__u64)iov[0].kiov_page->index << PAGE_SHIFT, - ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) + - iov[count - 1].kiov_len - 1, client_cksum, server_cksum); + local_nb[0].lnb_file_offset, + local_nb[count-1].lnb_file_offset + + local_nb[count-1].lnb_len - 1, client_cksum, server_cksum); filp = filp_open(dbgcksum_file_name, O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); if (IS_ERR(filp)) { @@ -1786,8 +1865,8 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, } for (i = 0; i < count; i++) { - len = iov[i].kiov_len; - buf = kmap(iov[i].kiov_page); + len = local_nb[i].lnb_len; + buf = kmap(local_nb[i].lnb_page); while (len != 0) { rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); if (rc < 0) { @@ -1800,7 +1879,7 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, CDEBUG(D_INFO, "%s: wrote %d bytes\n", dbgcksum_file_name, rc); } - kunmap(iov[i].kiov_page); + kunmap(local_nb[i].lnb_page); } rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); @@ -1810,13 +1889,15 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, return; } -static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, - const lnet_process_id_t *peer, +static int check_read_checksum(struct niobuf_local *local_nb, int npages, + struct obd_export *exp, struct obdo *oa, + const struct lnet_process_id *peer, __u32 client_cksum, __u32 server_cksum, - cksum_type_t server_cksum_type) + enum cksum_types server_cksum_type) { char *msg; - cksum_type_t cksum_type; + enum cksum_types cksum_type; + loff_t start, end; /* unlikely to happen and only if resend does not occur due to cksum * control failure on Client */ @@ -1826,13 +1907,12 @@ static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, return 0; } - if (desc->bd_export->exp_obd->obd_checksum_dump) - dump_all_bulk_pages(oa, desc->bd_iov_count, - &BD_GET_KIOV(desc, 0), server_cksum, + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, npages, local_nb, server_cksum, client_cksum); - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); if (cksum_type != server_cksum_type) msg = "the server may have not used the checksum type specified" @@ -1840,24 +1920,237 @@ static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, else msg = "should have changed on the client or in transit"; + start = local_nb[0].lnb_file_offset; + end = local_nb[npages-1].lnb_file_offset + + local_nb[npages-1].lnb_len - 1; + LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " DFID " object "DOSTID" extent [%llu-%llu], client returned csum" " %x (type %x), server csum %x (type %x)\n", - desc->bd_export->exp_obd->obd_name, + exp->exp_obd->obd_name, msg, libcfs_nid2str(peer->nid), oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, POSTID(&oa->o_oi), - (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT, - ((__u64)BD_GET_KIOV(desc, - desc->bd_iov_count - 1).kiov_page->index - << PAGE_SHIFT) + - BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1, - client_cksum, cksum_type, server_cksum, server_cksum_type); + start, end, client_cksum, cksum_type, server_cksum, + server_cksum_type); + return 1; } +static int tgt_pages2shortio(struct niobuf_local *local, int npages, + unsigned char *buf, int size) +{ + int i, off, len, copied = size; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + if (len > size) + return -EINVAL; + + ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); + memcpy(buf + off, ptr, len); + ll_kunmap_atomic(ptr, KM_USER0); + buf += len; + size -= len; + } + return copied - size; +} + +static int tgt_checksum_niobuf_t10pi(struct lu_target *tgt, + struct niobuf_local *local_nb, + int npages, int opc, + obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum) +{ + enum cksum_types t10_cksum_type = tgt->lut_dt_conf.ddp_t10_cksum_type; + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + const char *obd_name = tgt->lut_obd->obd_name; + struct ahash_request *req; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __u16 *guard_start; + int guard_number; + int used_number = 0; + __u32 cksum; + int rc = 0; + int used; + int i; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < npages; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + if (t10_cksum_type && opc == OST_READ && + local_nb[i].lnb_guard_disk) { + used = DIV_ROUND_UP(local_nb[i].lnb_len, sector_size); + if (used > (guard_number - used_number)) { + rc = -E2BIG; + break; + } + memcpy(guard_start + used_number, + local_nb[i].lnb_guards, + used * sizeof(*local_nb[i].lnb_guards)); + } else { + rc = obd_page_dif_generate_buffer(obd_name, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len, guard_start + used_number, + guard_number - used_number, &used, sector_size, + fn); + if (rc) + break; + } + + LASSERT(used <= MAX_GUARD_NUMBER); + /* + * If disk support T10PI checksum, copy guards to local_nb. 
+ * If the write is partial page, do not use the guards for bio + * submission since the data might not be full-sector. The bio + * guards will be generated later based on the full sectors. If + * the sector size is 512B rather than 4 KB, or the page size + * is larger than 4KB, this might drop some useful guards for + * partial page write, but it will only add minimal extra time + * of checksum calculation. + */ + if (t10_cksum_type && opc == OST_WRITE && + local_nb[i].lnb_len == PAGE_SIZE) { + local_nb[i].lnb_guard_rpc = 1; + memcpy(local_nb[i].lnb_guards, + guard_start + used_number, + used * sizeof(*local_nb[i].lnb_guards)); + } + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND))) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + rc = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + if (rc == 0) + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} + +static int tgt_checksum_niobuf_rw(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, + int npages, int opc, u32 *check_sum) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = tgt_checksum_niobuf_t10pi(tgt, local_nb, npages, + opc, fn, sector_size, + check_sum); + else + rc = tgt_checksum_niobuf(tgt, local_nb, npages, opc, + cksum_type, check_sum); + RETURN(rc); +} + int tgt_brw_read(struct tgt_session_info *tsi) { struct ptlrpc_request *req = tgt_ses_req(tsi); @@ -1869,12 +2162,15 @@ int tgt_brw_read(struct tgt_session_info *tsi) struct ost_body *body, *repbody; struct l_wait_info lwi; struct lustre_handle lockh = { 0 }; - int npages, nob = 0, rc, i, no_reply = 0; + int npages, nob = 0, rc, i, no_reply = 0, + npages_read; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + const char *obd_name = exp->exp_obd->obd_name; ENTRY; - if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { CERROR("%s: deny read request from %s to portal %u\n", tgt_name(tsi->tsi_tgt), obd_export_nid2str(req->rq_export), @@ -1917,8 +2213,8 @@ int tgt_brw_read(struct tgt_session_info *tsi) local_nb = tbc->local; - rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, - remote_nb, &lockh, LCK_PR); + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PR); if (rc != 0) RETURN(rc); @@ 
-1936,6 +2232,17 @@ int tgt_brw_read(struct tgt_session_info *tsi) GOTO(out_lock, rc = -ETIMEDOUT); } + /* + * Because we already sync grant info with client when + * reconnect, grant info will be cleared for resent req, + * otherwise, outdated grant count in the rpc would de-sync + * grant counters in case of shrink + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); repbody->oa = body->oa; @@ -1945,33 +2252,42 @@ int tgt_brw_read(struct tgt_session_info *tsi) if (rc != 0) GOTO(out_lock, rc); - desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), - PTLRPC_BULK_PUT_SOURCE | - PTLRPC_BULK_BUF_KIOV, - OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_nopin_ops); - if (desc == NULL) - GOTO(out_commitrw, rc = -ENOMEM); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); + } nob = 0; + npages_read = npages; for (i = 0; i < npages; i++) { int page_rc = local_nb[i].lnb_rc; if (page_rc < 0) { rc = page_rc; + npages_read = i; break; } nob += page_rc; - if (page_rc != 0) { /* some data! */ + if (page_rc != 0 && desc != NULL) { /* some data! */ LASSERT(local_nb[i].lnb_page != NULL); desc->bd_frag_ops->add_kiov_frag (desc, local_nb[i].lnb_page, - local_nb[i].lnb_page_offset, + local_nb[i].lnb_page_offset & ~PAGE_MASK, page_rc); } if (page_rc != local_nb[i].lnb_len) { /* short read */ + local_nb[i].lnb_len = page_rc; + npages_read = i + (page_rc != 0 ? 1 : 0); /* All subsequent pages should be 0 */ while (++i < npages) LASSERT(local_nb[i].lnb_rc == 0); @@ -1983,14 +2299,19 @@ int tgt_brw_read(struct tgt_session_info *tsi) rc = -E2BIG; if (body->oa.o_valid & OBD_MD_FLCKSUM) { - cksum_type_t cksum_type = - cksum_type_unpack(body->oa.o_valid & OBD_MD_FLFLAGS ? - body->oa.o_flags : 0); + u32 flag = body->oa.o_valid & OBD_MD_FLFLAGS ? 
+ body->oa.o_flags : 0; + enum cksum_types cksum_type = obd_cksum_type_unpack(flag); - repbody->oa.o_flags = cksum_type_pack(cksum_type); + repbody->oa.o_flags = obd_cksum_type_pack(obd_name, + cksum_type); repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, - OST_READ, cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages_read, OST_READ, + &repbody->oa.o_cksum); + if (rc < 0) + GOTO(out_commitrw, rc); CDEBUG(D_PAGE, "checksum at read origin: %x\n", repbody->oa.o_cksum); @@ -1999,21 +2320,46 @@ int tgt_brw_read(struct tgt_session_info *tsi) * zero-cksum case) */ if ((body->oa.o_valid & OBD_MD_FLFLAGS) && (body->oa.o_flags & OBD_FL_RECOV_RESEND)) - check_read_checksum(desc, &body->oa, &req->rq_peer, + check_read_checksum(local_nb, npages_read, exp, + &body->oa, &req->rq_peer, body->oa.o_cksum, repbody->oa.o_cksum, cksum_type); } else { repbody->oa.o_valid = 0; } + if (body->oa.o_valid & OBD_MD_FLGRANT) + repbody->oa.o_valid |= OBD_MD_FLGRANT; /* We're finishing using body->oa as an input variable */ /* Check if client was evicted while we were doing i/o before touching * network */ - if (likely(rc == 0 && - !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) && - !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) { - rc = target_bulk_io(exp, desc, &lwi); + if (rc == 0) { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned char *short_io_buf; + int short_io_size; + + short_io_buf = req_capsule_server_get(&req->rq_pill, + &RMF_SHORT_IO); + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_SERVER); + rc = tgt_pages2shortio(local_nb, npages_read, + short_io_buf, short_io_size); + if (rc >= 0) + req_capsule_shrink(&req->rq_pill, + &RMF_SHORT_IO, rc, + RCL_SERVER); + rc = rc > 0 ? 0 : rc; + } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) { + rc = target_bulk_io(exp, desc, &lwi); + } no_reply = rc != 0; + } else { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) + req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0, + RCL_SERVER); } out_commitrw: @@ -2036,13 +2382,15 @@ int tgt_brw_read(struct tgt_session_info *tsi) ptlrpc_req_drop_rs(req); LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " "client will retry: rc %d\n", - exp->exp_obd->obd_name, + obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), rc); } /* send a bulk after reply to simulate a network delay or reordering - * by a router */ - if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) { + * by a router - Note that !desc implies short io, so there is no bulk + * to reorder. */ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && + desc) { wait_queue_head_t waitq; struct l_wait_info lwi1; @@ -2059,6 +2407,32 @@ int tgt_brw_read(struct tgt_session_info *tsi) } EXPORT_SYMBOL(tgt_brw_read); +static int tgt_shortio2pages(struct niobuf_local *local, int npages, + unsigned char *buf, unsigned int size) +{ + int i, off, len; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + if (len == 0) + continue; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); + if (ptr == NULL) + return -EINVAL; + memcpy(ptr + off, buf, len < size ? 
len : size); + ll_kunmap_atomic(ptr, KM_USER0); + buf += len; + size -= len; + } + return 0; +} + static void tgt_warn_on_cksum(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc, struct niobuf_local *local_nb, int npages, @@ -2073,14 +2447,13 @@ static void tgt_warn_on_cksum(struct ptlrpc_request *req, body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); LASSERT(body != NULL); - if (req->rq_peer.nid != desc->bd_sender) { + if (desc && req->rq_peer.nid != desc->bd_sender) { via = " via "; router = libcfs_nid2str(desc->bd_sender); } if (exp->exp_obd->obd_checksum_dump) - dump_all_bulk_pages(&body->oa, desc->bd_iov_count, - &BD_GET_KIOV(desc, 0), server_cksum, + dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum, client_cksum); if (mmap) { @@ -2121,14 +2494,16 @@ int tgt_brw_write(struct tgt_session_info *tsi) __u32 *rcs; int objcount, niocount, npages; int rc, i, j; - cksum_type_t cksum_type = OBD_CKSUM_CRC32; + enum cksum_types cksum_type = OBD_CKSUM_CRC32; bool no_reply = false, mmap; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; bool wait_sync = false; + const char *obd_name = exp->exp_obd->obd_name; ENTRY; - if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { CERROR("%s: deny write request from %s to portal %u\n", tgt_name(tsi->tsi_tgt), obd_export_nid2str(req->rq_export), @@ -2152,6 +2527,9 @@ int tgt_brw_write(struct tgt_session_info *tsi) CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? cfs_fail_val : (obd_timeout + 1) / 4); + /* Delay write commit to show stale size information */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val); + /* There must be big cache in current thread to process this request * if it is NULL then something went wrong and it wasn't allocated, * report -ENOMEM in that case */ @@ -2192,8 +2570,8 @@ int tgt_brw_write(struct tgt_session_info *tsi) local_nb = tbc->local; - rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, - remote_nb, &lockh, LCK_PW); + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PW); if (rc != 0) GOTO(out, rc); @@ -2230,26 +2608,46 @@ int tgt_brw_write(struct tgt_session_info *tsi) objcount, ioo, remote_nb, &npages, local_nb); if (rc < 0) GOTO(out_lock, rc); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned int short_io_size; + unsigned char *short_io_buf; + + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_CLIENT); + short_io_buf = req_capsule_client_get(&req->rq_pill, + &RMF_SHORT_IO); + CDEBUG(D_INFO, "Client use short io for data transfer," + " size = %d\n", short_io_size); + + /* Copy short io buf to pages */ + rc = tgt_shortio2pages(local_nb, npages, short_io_buf, + short_io_size); + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... 
*/ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); - desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), - PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV, - OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_nopin_ops); - if (desc == NULL) - GOTO(skip_transfer, rc = -ENOMEM); - - /* NB Having prepped, we must commit... */ - for (i = 0; i < npages; i++) - desc->bd_frag_ops->add_kiov_frag(desc, - local_nb[i].lnb_page, - local_nb[i].lnb_page_offset, - local_nb[i].lnb_len); - - rc = sptlrpc_svc_prep_bulk(req, desc); - if (rc != 0) - GOTO(skip_transfer, rc); + rc = target_bulk_io(exp, desc, &lwi); + } - rc = target_bulk_io(exp, desc, &lwi); no_reply = rc != 0; skip_transfer: @@ -2257,13 +2655,19 @@ int tgt_brw_write(struct tgt_session_info *tsi) static int cksum_counter; if (body->oa.o_valid & OBD_MD_FLFLAGS) - cksum_type = cksum_type_unpack(body->oa.o_flags); + cksum_type = obd_cksum_type_unpack(body->oa.o_flags); repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; - repbody->oa.o_flags |= cksum_type_pack(cksum_type); - repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, - OST_WRITE, cksum_type); + repbody->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages, OST_WRITE, + &repbody->oa.o_cksum); + if (rc < 0) + GOTO(out_commitrw, rc); + cksum_counter++; if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { @@ -2282,6 +2686,7 @@ int tgt_brw_write(struct tgt_session_info *tsi) } } +out_commitrw: /* Must commit after prep above in all cases */ rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo, remote_nb, npages, local_nb, rc); @@ -2337,7 +2742,7 @@ int tgt_brw_write(struct tgt_session_info *tsi) if (!exp->exp_obd->obd_no_transno) LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," " client will retry: rc = %d\n", - exp->exp_obd->obd_name, + obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), rc); } diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h index 981e2ab9f9ade..ac7c3c17feb9d 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * lustre/target/tgt_internal.h @@ -35,7 +35,6 @@ #define _TG_INTERNAL_H #include -#include #include #include #include @@ -288,4 +287,19 @@ int top_trans_create_tmt(const struct lu_env *env, void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); void barrier_init(void); void barrier_fini(void); + +/* FMD tracking data */ +struct tgt_fmd_data { + struct list_head fmd_list; /* linked to tgt_fmd_list */ + struct lu_fid fmd_fid; /* FID being written to */ + __u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */ + time64_t fmd_expire; /* time when the fmd should expire */ + int fmd_refcount; /* reference counter - list holds 1 */ +}; + +/* tgt_fmd.c */ +extern struct kmem_cache *tgt_fmd_kmem; +void tgt_fmd_expire(struct obd_export *exp); +void tgt_fmd_cleanup(struct obd_export *exp); + #endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c index c7aecdf2171ea..0d2fde1be1bc3 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -148,6 +148,13 @@ static int tgt_clear_reply_slot(struct lu_target *lut, int idx) int chunk; int b; + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; b = idx % LUT_REPLY_SLOTS_PER_CHUNK; @@ -388,6 +395,8 @@ int tgt_client_alloc(struct obd_export *exp) spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + spin_lock_init(&exp->exp_target_data.ted_fmd_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); if (exp->exp_target_data.ted_lcd == NULL) @@ -411,6 +420,8 @@ void tgt_client_free(struct obd_export *exp) LASSERT(exp != exp->exp_obd->obd_self_export); + tgt_fmd_cleanup(exp); + /* free reply data */ mutex_lock(&ted->ted_lcd_lock); list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { @@ -833,7 +844,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt) * - there is no client to recover or the recovery was aborted */ if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && - (tgt->lut_obd->obd_max_recoverable_clients == 0 || + (atomic_read(&tgt->lut_obd->obd_max_recoverable_clients) == 0 || tgt->lut_obd->obd_abort_recovery)) tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; @@ -1517,7 +1528,7 @@ static int tgt_clients_data_init(const struct lu_env *env, exp->exp_connecting = 0; exp->exp_in_recovery = 0; spin_unlock(&exp->exp_lock); - obd->obd_max_recoverable_clients++; + atomic_inc(&obd->obd_max_recoverable_clients); if (tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && @@ -1889,7 +1900,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) unsigned long reply_data_size; int rc; struct lsd_reply_header *lrh = NULL; - struct lsd_client_data *lcd = NULL; struct tg_reply_data *trd = NULL; int idx; loff_t off; @@ -1938,10 +1948,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) if (hash == NULL) 
GOTO(out, rc = -ENODEV); - OBD_ALLOC_PTR(lcd); - if (lcd == NULL) - GOTO(out, rc = -ENOMEM); - OBD_ALLOC_PTR(trd); if (trd == NULL) GOTO(out, rc = -ENOMEM); @@ -1993,6 +1999,13 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) /* update export last committed transation */ exp->exp_last_committed = max(exp->exp_last_committed, lrd->lrd_transno); + /* Update lcd_last_transno as well for check in + * tgt_release_reply_data() or the latest client + * transno can be lost. + */ + ted->ted_lcd->lcd_last_transno = + max(ted->ted_lcd->lcd_last_transno, + exp->exp_last_committed); mutex_unlock(&ted->ted_lcd_lock); class_export_put(exp); @@ -2024,8 +2037,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) out: if (hash != NULL) cfs_hash_putref(hash); - if (lcd != NULL) - OBD_FREE_PTR(lcd); if (trd != NULL) OBD_FREE_PTR(trd); if (lrh != NULL) diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c index 12f9fdc1c2138..ce158941f9c06 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_main.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * lustre/target/tgt_main.c @@ -37,6 +37,243 @@ #include "tgt_internal.h" #include "../ptlrpc/ptlrpc_internal.h" +/* This must be longer than the longest string below */ +#define SYNC_STATES_MAXLEN 16 +static char *sync_lock_cancel_states[] = { + [SYNC_LOCK_CANCEL_NEVER] = "never", + [SYNC_LOCK_CANCEL_BLOCKING] = "blocking", + [SYNC_LOCK_CANCEL_ALWAYS] = "always", +}; + +/** + * Show policy for handling dirty data under a lock being cancelled. + * + * \param[in] kobj sysfs kobject + * \param[in] attr sysfs attribute + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + + return sprintf(buf, "%s\n", + sync_lock_cancel_states[tgt->lut_sync_lock_cancel]); +} +EXPORT_SYMBOL(sync_lock_cancel_show); + +/** + * Change policy for handling dirty data under a lock being cancelled. + * + * This variable defines what action target takes upon lock cancel + * There are three possible modes: + * 1) never - never do sync upon lock cancel. This can lead to data + * inconsistencies if both the OST and client crash while writing a file + * that is also concurrently being read by another client. In these cases, + * this may allow the file data to "rewind" to an earlier state. + * 2) blocking - do sync only if there is blocking lock, e.g. 
if another + * client is trying to access this same object + * 3) always - do sync always + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + int val = -1; + enum tgt_sync_lock_cancel slc; + + if (count == 0 || count >= SYNC_STATES_MAXLEN) + return -EINVAL; + + for (slc = 0; slc < ARRAY_SIZE(sync_lock_cancel_states); slc++) { + if (strcmp(buffer, sync_lock_cancel_states[slc]) == 0) { + val = slc; + break; + } + } + + /* Legacy numeric codes */ + if (val == -1) { + int rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + } + + if (val < 0 || val > 2) + return -EINVAL; + + spin_lock(&tgt->lut_flags_lock); + tgt->lut_sync_lock_cancel = val; + spin_unlock(&tgt->lut_flags_lock); + return count; +} +EXPORT_SYMBOL(sync_lock_cancel_store); +LUSTRE_RW_ATTR(sync_lock_cancel); + +/** + * Show maximum number of Filter Modification Data (FMD) maintained. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%u\n", lut->lut_fmd_max_num); +} + +/** + * Change number of FMDs maintained by target. + * + * This defines how large the list of FMDs can be. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + int val, rc; + + rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) + return -EINVAL; + + lut->lut_fmd_max_num = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_count); + +/** + * Show the maximum age of FMD data in seconds. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%lld\n", lut->lut_fmd_max_age); +} + +/** + * Set the maximum age of FMD data in seconds. + * + * This defines how long FMD data stays in the FMD list. 
+ * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative number on error + */ +ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) /* ~ 18 hour max */ + return -EINVAL; + + lut->lut_fmd_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_seconds); + +/* These two aliases are old names and kept for compatibility, they were + * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'. + * This change was made in Lustre 2.13, so these aliases can be removed + * when back compatibility is not needed with any Lustre version prior 2.13 + */ +static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count, + 0644, tgt_fmd_count_show, tgt_fmd_count_store); +static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds, + 0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store); + +static const struct attribute *tgt_attrs[] = { + &lustre_attr_sync_lock_cancel.attr, + &lustre_attr_tgt_fmd_count.attr, + &lustre_attr_tgt_fmd_seconds.attr, + &tgt_fmd_count_compat.attr, + &tgt_fmd_seconds_compat.attr, + NULL, +}; + +int tgt_tunables_init(struct lu_target *lut) +{ + int rc; + + rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs); + if (!rc) + lut->lut_attrs = tgt_attrs; + return rc; +} +EXPORT_SYMBOL(tgt_tunables_init); + +void tgt_tunables_fini(struct lu_target *lut) +{ + if (lut->lut_attrs) { + sysfs_remove_files(&lut->lut_obd->obd_kset.kobj, + lut->lut_attrs); + lut->lut_attrs = NULL; + } +} +EXPORT_SYMBOL(tgt_tunables_fini); + /* * Save cross-MDT lock in lut_slc_locks. 
* @@ -152,6 +389,8 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, struct lu_attr attr; struct lu_fid fid; struct dt_object *o; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_statfs *osfs; int i, rc = 0; ENTRY; @@ -179,7 +418,7 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); spin_lock_init(&lut->lut_flags_lock); - lut->lut_sync_lock_cancel = NEVER_SYNC_ON_CANCEL; + lut->lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER; spin_lock_init(&lut->lut_slc_locks_guard); INIT_LIST_HEAD(&lut->lut_slc_locks); @@ -188,6 +427,38 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (!obd->obd_replayable) RETURN(0); + /* initialize grant and statfs data in target */ + dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); + + /* statfs data */ + spin_lock_init(&tgd->tgd_osfs_lock); + tgd->tgd_osfs_age = ktime_get_seconds() - 1000; + tgd->tgd_osfs_unstable = 0; + tgd->tgd_statfs_inflight = 0; + tgd->tgd_osfs_inflight = 0; + + /* grant data */ + spin_lock_init(&tgd->tgd_grant_lock); + tgd->tgd_tot_dirty = 0; + tgd->tgd_tot_granted = 0; + tgd->tgd_tot_pending = 0; + tgd->tgd_grant_compat_disable = 0; + + /* populate cached statfs data */ + osfs = &tgt_th_info(env)->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + if (!is_power_of_2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + tgt_name(lut), osfs->os_bsize); + GOTO(out, rc = -EPROTO); + } + tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + spin_lock_init(&lut->lut_translock); spin_lock_init(&lut->lut_client_bitmap_lock); @@ -225,6 +496,11 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; + lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT; + lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT; + + atomic_set(&lut->lut_sync_count, 0); + /* reply_data is supported by MDT targets only for now */ if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) RETURN(0); @@ -254,8 +530,6 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (rc < 0) GOTO(out, rc); - atomic_set(&lut->lut_sync_count, 0); - RETURN(0); out: @@ -337,8 +611,44 @@ void tgt_fini(const struct lu_env *env, struct lu_target *lut) } EXPORT_SYMBOL(tgt_fini); +static struct kmem_cache *tgt_thread_kmem; +static struct kmem_cache *tgt_session_kmem; +struct kmem_cache *tgt_fmd_kmem; + +static struct lu_kmem_descr tgt_caches[] = { + { + .ckd_cache = &tgt_thread_kmem, + .ckd_name = "tgt_thread_kmem", + .ckd_size = sizeof(struct tgt_thread_info), + }, + { + .ckd_cache = &tgt_session_kmem, + .ckd_name = "tgt_session_kmem", + .ckd_size = sizeof(struct tgt_session_info) + }, + { + .ckd_cache = &tgt_fmd_kmem, + .ckd_name = "tgt_fmd_cache", + .ckd_size = sizeof(struct tgt_fmd_data) + }, + { + .ckd_cache = NULL + } +}; + + /* context key constructor/destructor: tg_key_init, tg_key_fini */ -LU_KEY_INIT(tgt, struct tgt_thread_info); +static void *tgt_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_thread_info *thread; + + OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS); + if (thread == NULL) + return ERR_PTR(-ENOMEM); + + return thread; +} static void tgt_key_fini(const struct lu_context *ctx, struct lu_context_key *key, void *data) @@ -355,7 +665,7 @@ static void tgt_key_fini(const 
struct lu_context *ctx, if (args->ta_args != NULL) OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * args->ta_alloc_args); - OBD_FREE_PTR(info); + OBD_SLAB_FREE_PTR(info, tgt_thread_kmem); } static void tgt_key_exit(const struct lu_context *ctx, @@ -377,8 +687,25 @@ struct lu_context_key tgt_thread_key = { LU_KEY_INIT_GENERIC(tgt); -/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */ -LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info); +static void *tgt_ses_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS); + if (session == NULL) + return ERR_PTR(-ENOMEM); + + return session; +} + +static void tgt_ses_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, tgt_session_kmem); +} /* context key: tgt_session_key */ struct lu_context_key tgt_session_key = { @@ -401,8 +728,13 @@ struct page *tgt_page_to_corrupt; int tgt_mod_init(void) { + int result; ENTRY; + result = lu_kmem_init(tgt_caches); + if (result != 0) + RETURN(result); + tgt_page_to_corrupt = alloc_page(GFP_KERNEL); tgt_key_init_generic(&tgt_thread_key, NULL); @@ -426,5 +758,7 @@ void tgt_mod_exit(void) lu_context_key_degister(&tgt_thread_key); lu_context_key_degister(&tgt_session_key); update_info_fini(); + + lu_kmem_fini(tgt_caches); } diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c index a36d554525507..5fb706c5090a5 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_records.c +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c index 3769d09d19282..ac47105a633b9 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_recovery.c +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2016, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c index 6c3e41438347c..b8150fa5c694c 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_trans.c +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2016, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. 
*/ /* * lustre/target/update_trans.c @@ -82,9 +82,11 @@ static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { struct sub_thandle_cookie *stc; - CDEBUG(mask, "st %p obd %s committed %d stopped %d sub_th %p\n", + CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d " + "result %d sub_th %p\n", st, st->st_dt->dd_lu_dev.ld_obd->obd_name, - st->st_committed, st->st_stopped, st->st_sub_th); + st->st_committed, st->st_started, st->st_stopped, + st->st_result, st->st_sub_th); list_for_each_entry(stc, &st->st_cookie_list, stc_list) { CDEBUG(mask, " cookie "DFID".%u\n", @@ -526,6 +528,7 @@ static void sub_trans_stop_cb(struct lu_env *env, struct top_multiple_thandle *tmt = cb->dcb_data; ENTRY; + spin_lock(&tmt->tmt_sub_lock); list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { if (st->st_stopped) continue; @@ -536,6 +539,7 @@ static void sub_trans_stop_cb(struct lu_env *env, break; } } + spin_unlock(&tmt->tmt_sub_lock); wake_up(&tmt->tmt_stop_waitq); RETURN_EXIT; @@ -1016,6 +1020,8 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, sub_trans_commit_cb_internal(tmt, master_st->st_sub_th, rc); if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); th->th_result = rc; GOTO(stop_other_trans, rc); } else if (tur != NULL && tur->tur_update_records != NULL) { @@ -1053,6 +1059,9 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, rc = sub_updates_write(env, lur, st); if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, + rc); th->th_result = rc; break; } @@ -1072,8 +1081,12 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, st->st_sub_th->th_result = th->th_result; rc = dt_trans_stop(env, st->st_sub_th->th_dev, st->st_sub_th); - if (unlikely(rc < 0 && th->th_result == 0)) - th->th_result = rc; + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, rc); + if (th->th_result == 0) + th->th_result = rc; + } } rc = top_trans_wait_result(top_th); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index aa1343bf5a36d..bc1b3326d7a40 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -23,15 +23,9 @@ /* extened attributes for ldiskfs */ #undef CONFIG_LDISKFS_FS_XATTR -/* Max LNET payload */ -#undef CONFIG_LNET_MAX_PAYLOAD - /* enable invariant checking */ #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK -/* IOCTL Buffer Size */ -#undef CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER - /* kernel has cpu affinity support */ #undef CPU_AFFINITY @@ -56,9 +50,15 @@ /* do data checksums */ #undef ENABLE_CHECKSUM +/* enable flock by default */ +#undef ENABLE_FLOCK + /* Use the Pinger */ #undef ENABLE_PINGER +/* aes-sha2 is supported by krb5 */ +#undef HAVE_AES_SHA2_SUPPORT + /* Define to 1 if you have the header file. 
*/ #undef HAVE_ASM_TYPES_H @@ -77,6 +77,12 @@ /* 'bio_integrity_enabled' is available */ #undef HAVE_BIO_INTEGRITY_ENABLED +/* kernel has bio_integrity_prep_fn */ +#undef HAVE_BIO_INTEGRITY_PREP_FN + +/* bio_integrity_payload.bip_iter exist */ +#undef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + /* 'bi_bdev' is available */ #undef HAVE_BI_BDEV @@ -101,9 +107,18 @@ /* blk_queue_max_segments is defined */ #undef HAVE_BLK_QUEUE_MAX_SEGMENTS +/* kernel hash_64() is broken */ +#undef HAVE_BROKEN_HASH_64 + /* kernel has struct bvec_iter */ #undef HAVE_BVEC_ITER +/* struct cache_detail has writers */ +#undef HAVE_CACHE_DETAIL_WRITERS + +/* if cache_detail->hash_lock is a spinlock */ +#undef HAVE_CACHE_HASH_SPINLOCK + /* cache_head has hlist cache_list */ #undef HAVE_CACHE_HEAD_HLIST @@ -116,24 +131,24 @@ /* kernel has clean_bdev_aliases */ #undef HAVE_CLEAN_BDEV_ALIASES +/* 'clear_and_wake_up_bit' is available */ +#undef HAVE_CLEAR_AND_WAKE_UP_BIT + /* have clear_inode */ #undef HAVE_CLEAR_INODE /* compat rdma found */ #undef HAVE_COMPAT_RDMA -/* cpumap_print_to_pagebuf is available */ -#undef HAVE_CPUMASK_PRINT_TO_PAGEBUF - /* kernel compiled with CRC32 functions */ #undef HAVE_CRC32 -/* struct cred has member tgcred */ -#undef HAVE_CRED_TGCRED - /* crypto hash helper functions are available */ #undef HAVE_CRYPTO_HASH_HELPERS +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#undef HAVE_CRYPTO_MAX_ALG_NAME_128 + /* current_time() has replaced CURRENT_TIME */ #undef HAVE_CURRENT_TIME @@ -152,6 +167,9 @@ /* dentry_open uses struct path as first argument */ #undef HAVE_DENTRY_OPEN_USE_PATH +/* DES3 enctype is supported by krb5 */ +#undef HAVE_DES3_SUPPORT + /* direct_IO need 2 arguments */ #undef HAVE_DIRECTIO_2ARGS @@ -233,6 +251,9 @@ /* d_delete first parameter declared is not const */ #undef HAVE_D_DELETE_CONST +/* d_hash_and_lookup is exported by the kernel */ +#undef HAVE_D_HASH_AND_LOOKUP + /* have d_make_root */ #undef HAVE_D_MAKE_ROOT @@ -320,15 +341,15 @@ /* Define to 1 if you have the `gethostbyname' function. */ #undef HAVE_GETHOSTBYNAME +/* get_request_key_auth() is available */ +#undef HAVE_GET_REQUEST_KEY_AUTH + /* get_user_pages takes 6 arguments */ #undef HAVE_GET_USER_PAGES_6ARG /* get_user_pages takes gup_flags in arguments */ #undef HAVE_GET_USER_PAGES_GUP_FLAGS -/* get_user_pages takes gup_flags in arguments with 7 args */ -#undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS - /* struct group_info has member gid */ #undef HAVE_GROUP_INFO_GID @@ -341,6 +362,9 @@ /* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ #undef HAVE_GSS_KRB5_CCACHE_NAME +/* '__rhashtable_insert_fast()' returns int */ +#undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + /* Define this if you have Heimdal Kerberos libraries */ #undef HAVE_HEIMDAL @@ -389,6 +413,9 @@ /* if ib_sg_dma_address wrapper exists */ #undef HAVE_IB_SG_DMA_ADDRESS +/* INIT_LIST_HEAD_RCU exists */ +#undef HAVE_INIT_LIST_HEAD_RCU + /* inode_operations .getattr member function can gather advance stats */ #undef HAVE_INODEOPS_ENHANCED_GETATTR @@ -413,6 +440,15 @@ /* inode_operations->permission has two args */ #undef HAVE_INODE_PERMISION_2ARGS +/* inode times are using timespec64 */ +#undef HAVE_INODE_TIMESPEC64 + +/* blk_integrity.interval exist */ +#undef HAVE_INTERVAL_BLK_INTEGRITY + +/* blk_integrity.interval_exp exist */ +#undef HAVE_INTERVAL_EXP_BLK_INTEGRITY + /* Define to 1 if you have the header file. 
*/ #undef HAVE_INTTYPES_H @@ -422,6 +458,9 @@ /* have in_compat_syscall */ #undef HAVE_IN_COMPAT_SYSCALL +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#undef HAVE_IN_DEV_FOR_EACH_IFA_RTNL + /* inode_operations->rename need flags as argument */ #undef HAVE_IOPS_RENAME_WITH_FLAGS @@ -461,18 +500,27 @@ /* is_sxid is defined */ #undef HAVE_IS_SXID +/* 'iterate_shared' is available */ +#undef HAVE_ITERATE_SHARED + /* struct address_space has i_pages */ #undef HAVE_I_PAGES /* i_uid_read is present */ #undef HAVE_I_UID_READ -/* jiffies_to_timespec64() is available */ -#undef HAVE_JIFFIES_TO_TIMESPEC64 +/* kallsyms_lookup_name is exported by kernel */ +#undef HAVE_KALLSYMS_LOOKUP_NAME /* kernel_locked is defined */ #undef HAVE_KERNEL_LOCKED +/* 'kernel_param_[un]lock' is available */ +#undef HAVE_KERNEL_PARAM_LOCK + +/* 'struct kernel_param_ops' is available */ +#undef HAVE_KERNEL_PARAM_OPS + /* kernel_setsockopt still in use */ #undef HAVE_KERNEL_SETSOCKOPT @@ -491,6 +539,9 @@ /* key_type->instantiate has two args */ #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +/* key.usage is of type refcount_t */ +#undef HAVE_KEY_USAGE_REFCOUNT + /* ki_left exist */ #undef HAVE_KIOCB_KI_LEFT @@ -519,12 +570,15 @@ available */ #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS +/* kset_find_obj is exported by the kernel */ +#undef HAVE_KSET_FIND_OBJ + +/* kernel has kstrtobool_from_user */ +#undef HAVE_KSTRTOBOOL_FROM_USER + /* kernel has kstrtoul */ #undef HAVE_KSTRTOUL -/* kernel has ksys_close */ -#undef HAVE_KSYS_CLOSE - /* kthread_worker found */ #undef HAVE_KTHREAD_WORK @@ -552,6 +606,9 @@ /* 'ktime_get_ts64' is available */ #undef HAVE_KTIME_GET_TS64 +/* 'ktime_ms_delta' is available */ +#undef HAVE_KTIME_MS_DELTA + /* 'ktime_to_timespec64' is available */ #undef HAVE_KTIME_TO_TIMESPEC64 @@ -579,21 +636,12 @@ /* readline library is available */ #undef HAVE_LIBREADLINE -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_RANDOM_H +/* linux/rhashtable.h is present */ +#undef HAVE_LINUX_RHASHTABLE_H /* if linux/selinux.h exists */ #undef HAVE_LINUX_SELINUX_IS_ENABLED -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_TYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_UNISTD_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_VERSION_H - /* lock_manager_operations has lm_compare_owner */ #undef HAVE_LM_COMPARE_OWNER @@ -603,6 +651,9 @@ /* kernel has locks_lock_file_wait */ #undef HAVE_LOCKS_LOCK_FILE_WAIT +/* lookup_user_key() is available */ +#undef HAVE_LOOKUP_USER_KEY + /* kernel has LOOP_CTL_GET_FREE */ #undef HAVE_LOOP_CTL_GET_FREE @@ -631,6 +682,9 @@ /* kernel module loading is possible */ #undef HAVE_MODULE_LOADING_SUPPORT +/* locking module param is supported */ +#undef HAVE_MODULE_PARAM_LOCKING + /* Define to 1 if you have the `name_to_handle_at' function. */ #undef HAVE_NAME_TO_HANDLE_AT @@ -640,15 +694,24 @@ /* cancel_dirty_page with one arguement is available */ #undef HAVE_NEW_CANCEL_DIRTY_PAGE +/* DEFINE_TIMER uses only 2 arguements */ +#undef HAVE_NEW_DEFINE_TIMER + /* 'kernel_write' aligns with read/write helpers */ #undef HAVE_NEW_KERNEL_WRITE /* NR_UNSTABLE_NFS is still in use. 
*/ #undef HAVE_NR_UNSTABLE_NFS +/* ns_to_timespec64() is available */ +#undef HAVE_NS_TO_TIMESPEC64 + /* with oldsize */ #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE +/* openssl-devel is present */ +#undef HAVE_OPENSSL_GETSEPOL + /* OpenSSL HMAC functions needed for SSK */ #undef HAVE_OPENSSL_SSK @@ -673,6 +736,9 @@ /* posix_acl_valid takes struct user_namespace */ #undef HAVE_POSIX_ACL_VALID_USER_NS +/* 'prepare_to_wait_event' is available */ +#undef HAVE_PREPARE_TO_WAIT_EVENT + /* struct proc_ops exists */ #undef HAVE_PROC_OPS @@ -685,12 +751,18 @@ /* inode->i_nlink is protected from direct modification */ #undef HAVE_PROTECT_I_NLINK +/* 'PTR_ERR_OR_ZERO' exist */ +#undef HAVE_PTR_ERR_OR_ZERO + /* have quota64 */ #undef HAVE_QUOTA64 /* radix_tree_exceptional_entry exist */ #undef HAVE_RADIX_EXCEPTION_ENTRY +/* rdma_connect_locked is defined */ +#undef HAVE_RDMA_CONNECT_LOCKED + /* rdma_create_id wants 4 args */ #undef HAVE_RDMA_CREATE_ID_4ARG @@ -700,15 +772,24 @@ /* rdma_reject has 4 arguments */ #undef HAVE_RDMA_REJECT_4ARGS -/* reinit_completion is exist */ -#undef HAVE_REINIT_COMPLETION - /* kernel export remove_from_page_cache */ #undef HAVE_REMOVE_FROM_PAGE_CACHE /* remove_proc_subtree is defined */ #undef HAVE_REMOVE_PROC_SUBTREE +/* rhashtable_lookup() is available */ +#undef HAVE_RHASHTABLE_LOOKUP + +/* rhashtable_lookup_get_insert_fast() is available */ +#undef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST + +/* struct rhltable exist */ +#undef HAVE_RHLTABLE + +/* save_stack_trace_tsk is exported */ +#undef HAVE_SAVE_STACK_TRACE_TSK + /* Have sa_spill_alloc in ZFS */ #undef HAVE_SA_SPILL_ALLOC @@ -733,6 +814,9 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ #undef HAVE_SECURITY_IINITSEC_QSTR +/* security_inode_listsecurity() is available/exported */ +#undef HAVE_SECURITY_INODE_LISTSECURITY + /* security_release_secctx has 1 arg. */ #undef HAVE_SEC_RELEASE_SECCTX_1ARG @@ -776,36 +860,27 @@ /* Have spa_maxblocksize in ZFS */ #undef HAVE_SPA_MAXBLOCKSIZE -/* spinlock_t is defined */ -#undef HAVE_SPINLOCK_T - /* struct stacktrace_ops exists */ #undef HAVE_STACKTRACE_OPS /* stacktrace_ops.warning is exist */ #undef HAVE_STACKTRACE_WARNING -/* stack_trace_print() exists */ -#undef HAVE_STACK_TRACE_PRINT - /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the header file. */ #undef HAVE_STDLIB_H +/* stringhash.h is present */ +#undef HAVE_STRINGHASH + /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H -/* Define to 1 if you have the `strlcat' function. */ -#undef HAVE_STRLCAT - -/* Define to 1 if you have the `strlcpy' function. */ -#undef HAVE_STRLCPY - /* Define to 1 if you have the `strnlen' function. */ #undef HAVE_STRNLEN @@ -833,9 +908,6 @@ /* ctl_table has ctl_name field */ #undef HAVE_SYSCTL_CTLNAME -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_IOCTL_H - /* Define to 1 if you have . 
*/ #undef HAVE_SYS_QUOTA_H @@ -866,9 +938,6 @@ /* 'timespec64_to_ktime' is available */ #undef HAVE_TIMESPEC64_TO_KTIME -/* have_time_t */ -#undef HAVE_TIME_T - /* topology_sibling_cpumask is available */ #undef HAVE_TOPOLOGY_SIBLING_CPUMASK @@ -917,9 +986,18 @@ /* 'struct vm_operations' remove struct vm_area_struct argument */ #undef HAVE_VM_OPS_USE_VM_FAULT_ONLY +/* wait_bit.h is present */ +#undef HAVE_WAIT_BIT_HEADER_H + /* 'wait_queue_entry_t' is available */ #undef HAVE_WAIT_QUEUE_ENTRY +/* linux wait_queue_head_t list_head is name head */ +#undef HAVE_WAIT_QUEUE_ENTRY_LIST + +/* 'wait_var_event' is available */ +#undef HAVE_WAIT_VAR_EVENT + /* flags field exist */ #undef HAVE_XATTR_HANDLER_FLAGS @@ -944,9 +1022,18 @@ /* Have zap_remove_by_dnode() in ZFS */ #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE +/* Have inode_timespec_t */ +#undef HAVE_ZFS_INODE_TIMESPEC + +/* Have multihost protection in ZFS */ +#undef HAVE_ZFS_MULTIHOST + /* Enable zfs osd */ #undef HAVE_ZFS_OSD +/* Have zfs_refcount_add */ +#undef HAVE_ZFS_REFCOUNT_ADD + /* __add_wait_queue_exclusive exists */ #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE @@ -996,6 +1083,9 @@ /* need pclmulqdq based crc32 */ #undef NEED_CRC32_ACCEL +/* 'ktime_get_ns' is not available */ +#undef NEED_KTIME_GET_NS + /* 'ktime_get_real_ns' is not available */ #undef NEED_KTIME_GET_REAL_NS @@ -1026,9 +1116,6 @@ /* name of parallel fsck program */ #undef PFSCK -/* proc handler methods use __user */ -#undef PROC_HANDLER_USE_USER_ATTR - /* enable randomly alloc failure */ #undef RANDOM_FAIL_ALLOC From 54568b354e29c621ad16250a49b2b7d844cdcf83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Oct 2021 16:41:05 +0200 Subject: [PATCH 346/737] sched: Improve wake_up_all_idle_cpus() take #2 As reported by syzbot and experienced by Pavel, using cpus_read_lock() in wake_up_all_idle_cpus() generates lock inversion (against mmap_sem and possibly others). Instead, shrink the preempt disable region by iterating all CPUs and checking the online status for each individual CPU while having preemption disabled. Fixes: 8850cb663b5c ("sched: Simplify wake_up_*idle*()") Reported-by: syzbot+d5b23b18d2f4feae8a67@syzkaller.appspotmail.com Reported-by: Pavel Machek Reported-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qian Cai --- kernel/smp.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 0e13d65e348db..8ba0fd953f001 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -952,14 +952,12 @@ void wake_up_all_idle_cpus(void) { int cpu; - cpus_read_lock(); - for_each_online_cpu(cpu) { - if (cpu == raw_smp_processor_id()) - continue; - - wake_up_if_idle(cpu); + for_each_possible_cpu(cpu) { + preempt_disable(); + if (cpu != smp_processor_id() && cpu_online(cpu)) + wake_up_if_idle(cpu); + preempt_enable(); } - cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); From b1dea0d7468f4814fb8adfac57768692b88f8595 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: [PATCH 347/737] timers: implement usleep_idle_range() Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it using the newly introduced function. This patch (of 2): Some kernel threads such as DAMON could need to repeatedly sleep in micro seconds level. 
Because usleep_range() sleeps in uninterruptible state, however, such threads would make /proc/loadavg reports fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is same to usleep_range() but sets the state of the current task as TASK_IDLE while sleeping. Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 14 +++++++++++++- kernel/time/timer.c | 16 +++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/delay.h b/include/linux/delay.h index 1d0e2ce6b6d9f..e8607992c68a5 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e87e638c31bdf..f7d3a108e27c9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2050,26 +2050,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. 
*/ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); From d00e38a3a8c92d76bb5f56e24511b263c4a8128b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:25 -0800 Subject: [PATCH 348/737] mm/damon/core: fix fake load reports due to uninterruptible sleeps Because DAMON sleeps in uninterruptible mode, /proc/loadavg reports fake load while DAMON is turned on, though it is doing nothing. This can confuse users[1]. To avoid the case, this commit makes DAMON sleeps in idle mode. [1] https://lore.kernel.org/all/11868371.O9o76ZdvQC@natalenko.name/ Link: https://lkml.kernel.org/r/20211126145015.15862-3-sj@kernel.org Fixes: 2224d8485492 ("mm: introduce Data Access MONitor (DAMON)") Reported-by: Oleksandr Natalenko Signed-off-by: SeongJae Park Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c381b3c525d0b..2daffd5820fe0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -981,9 +981,9 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) static void kdamond_usleep(unsigned long usecs) { if (usecs > 100 * 1000) - schedule_timeout_interruptible(usecs_to_jiffies(usecs)); + schedule_timeout_idle(usecs_to_jiffies(usecs)); else - usleep_range(usecs, usecs + 1); + usleep_idle_range(usecs, usecs + 1); } /* Returns negative error code if it's not activated but should return */ @@ -1038,7 +1038,7 @@ static int kdamond_fn(void *data) ctx->callback.after_sampling(ctx)) done = true; - usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + kdamond_usleep(ctx->sample_interval); if (ctx->primitive.check_accesses) max_nr_accesses = ctx->primitive.check_accesses(ctx); From b23b8ceb6d41d79f526192b59133df9f6a639d61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:28 -0800 Subject: [PATCH 349/737] mm/damon/core: use better timer mechanisms selection threshold Patch series "mm/damon: Trivial fixups and improvements". This patchset contains trivial fixups and improvements for DAMON and its kunit/kselftest tests. This patch (of 11): DAMON is using hrtimer if requested sleep time is <=100ms, while the suggested threshold[1] is <=20ms. This commit applies the threshold. 
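For illustration only (not part of this patch): the combined effect of this change and the two preceding ones on kdamond's sleep path boils down to the sketch below. The helper name is made up; usleep_idle_range(), schedule_timeout_idle(), usecs_to_jiffies() and USEC_PER_MSEC are the interfaces the actual hunks use.

    #include <linux/delay.h>
    #include <linux/jiffies.h>
    #include <linux/sched.h>
    #include <linux/time64.h>

    /* Sleep in TASK_IDLE so long waits do not inflate /proc/loadavg. */
    static void example_idle_sleep(unsigned long usecs)
    {
            /* timers-howto.rst: hrtimer-backed sleeps only below ~20ms */
            if (usecs > 20 * USEC_PER_MSEC)
                    schedule_timeout_idle(usecs_to_jiffies(usecs));
            else
                    usleep_idle_range(usecs, usecs + 1);
    }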
[1] Documentation/timers/timers-howto.rst Link: https://lkml.kernel.org/r/20211201150440.1088-2-sj@kernel.org Fixes: ee801b7dd7822 ("mm/damon/schemes: activate schemes based on a watermarks mechanism") Signed-off-by: SeongJae Park Cc: Shuah Khan Cc: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2daffd5820fe0..eefb2ada67cae 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -980,7 +980,8 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) static void kdamond_usleep(unsigned long usecs) { - if (usecs > 100 * 1000) + /* See Documentation/timers/timers-howto.rst for the thresholds */ + if (usecs > 20 * USEC_PER_MSEC) schedule_timeout_idle(usecs_to_jiffies(usecs)); else usleep_idle_range(usecs, usecs + 1); From 3ab9d8b477cebd7f34356d18dc6b7422d0ae5dc6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:31 -0800 Subject: [PATCH 350/737] mm/damon/dbgfs: remove an unnecessary error message When wrong scheme action is requested via the debugfs interface, DAMON prints an error message. Because the function returns error code, this is not really needed. Because the code path is triggered by the user specified input, this can result in kernel log mistakenly being messy. To avoid the case, this commit removes the message. Link: https://lkml.kernel.org/r/20211201150440.1088-3-sj@kernel.org Fixes: af122dd8f3c0 ("mm/damon/dbgfs: support DAMON-based Operation Schemes") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 9b520bb4a3e70..1efac0022e9a4 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -210,10 +210,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, &wmarks.low, &parsed); if (ret != 18) break; - if (!damos_action_valid(action)) { - pr_err("wrong action %d\n", action); + if (!damos_action_valid(action)) goto fail; - } pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, From e0d84428742c2a94ac7a781b8b2365d7aaaf98b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:34 -0800 Subject: [PATCH 351/737] mm/damon/core: remove unnecessary error messages DAMON core prints error messages when damon_target object creation is failed or wrong monitoring attributes are given. Because appropriate error code is returned for each case, the messages are not essential. Also, because the code path can be triggered with user-specified input, this could result in kernel log mistakenly being messy. To avoid the case, this commit removes the messages. 
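As an illustrative aside (not part of this patch): with the messages removed, a caller reached from user input relies solely on the returned error code. The sketch below is a hypothetical caller and the attribute values are arbitrary; only the damon_set_attrs() signature is taken from the hunk below.

    #include <linux/damon.h>
    #include <linux/errno.h>

    /* Hypothetical caller; the attribute values are arbitrary examples. */
    static int example_configure(struct damon_ctx *ctx)
    {
            /* 5ms sampling, 100ms aggregation, 1s primitive update,
             * 10 to 1000 monitoring regions.
             */
            int err = damon_set_attrs(ctx, 5000, 100000, 1000000, 10, 1000);

            /* -EINVAL means min_nr_regions < 3 or min > max; no log needed */
            return err;
    }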
Link: https://lkml.kernel.org/r/20211201150440.1088-4-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index eefb2ada67cae..e924978952025 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -282,7 +282,6 @@ int damon_set_targets(struct damon_ctx *ctx, for (i = 0; i < nr_ids; i++) { t = damon_new_target(ids[i]); if (!t) { - pr_err("Failed to alloc damon_target\n"); /* The caller should do cleanup of the ids itself */ damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); @@ -312,16 +311,10 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg) { - if (min_nr_reg < 3) { - pr_err("min_nr_regions (%lu) must be at least 3\n", - min_nr_reg); + if (min_nr_reg < 3) return -EINVAL; - } - if (min_nr_reg > max_nr_reg) { - pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", - min_nr_reg, max_nr_reg); + if (min_nr_reg > max_nr_reg) return -EINVAL; - } ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; From 8d93d75ed7e03f87cd20b4a72aa0e55f74bc28c1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:37 -0800 Subject: [PATCH 352/737] mm/damon/vaddr: remove an unnecessary warning message The DAMON virtual address space monitoring primitive prints a warning message for wrong DAMOS action. However, it is not essential as the code returns appropriate failure in the case. This commit removes the message to make the log clean. Link: https://lkml.kernel.org/r/20211201150440.1088-5-sj@kernel.org Fixes: 6dea8add4d28 ("mm/damon/vaddr: support DAMON-based Operation Schemes") Signed-off-by: SeongJae Park Reviewed-by: Muchun Song Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 47f47f60440eb..20a9a9d69eb19 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -627,7 +627,6 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - pr_warn("Wrong action %d\n", scheme->action); return -EINVAL; } From edea5a9a2cc5db26f2304d5fa55c552c8308ff1a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:40 -0800 Subject: [PATCH 353/737] mm/damon/vaddr-test: split a test function having >1024 bytes frame size On some configuration[1], 'damon_test_split_evenly()' kunit test function has >1024 bytes frame size, so below build warning is triggered: CC mm/damon/vaddr.o In file included from mm/damon/vaddr.c:672: mm/damon/vaddr-test.h: In function 'damon_test_split_evenly': mm/damon/vaddr-test.h:309:1: warning: the frame size of 1064 bytes is larger than 1024 bytes [-Wframe-larger-than=] 309 | } | ^ This commit fixes the warning by separating the common logic in the function. 
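A side note for illustration (not part of this patch): the fix follows the usual cure for kunit frame-size warnings, hoisting per-case setup into a parameterized helper so the calling function no longer keeps every case's fixtures on its own frame. A condensed, hypothetical sketch of that shape is below; the real helpers added in the hunk are damon_test_split_evenly_fail() and damon_test_split_evenly_succ().

    #include <kunit/test.h>
    #include <linux/damon.h>

    /* Hypothetical condensed helper; the real code splits this further into
     * separate success and failure helpers with additional region checks.
     * Assumes the vaddr-test.h context, where the static
     * damon_va_evenly_split_region() is visible.
     */
    static void example_split_case(struct kunit *test, unsigned long start,
                                   unsigned long end, unsigned int nr_pieces,
                                   int expected_rc)
    {
            struct damon_target *t = damon_new_target(42);
            struct damon_region *r = damon_new_region(start, end);

            damon_add_region(r, t);
            KUNIT_EXPECT_EQ(test,
                            damon_va_evenly_split_region(t, r, nr_pieces),
                            expected_rc);
            damon_free_target(t);
    }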
[1] https://lore.kernel.org/linux-mm/202111182146.OV3C4uGr-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211201150440.1088-6-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Reported-by: kernel test robot Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr-test.h | 77 ++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index ecfd0b2ed222d..3097ef9c662af 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -252,59 +252,62 @@ static void damon_test_apply_three_regions4(struct kunit *test) new_three_regions, expected, ARRAY_SIZE(expected)); } -static void damon_test_split_evenly(struct kunit *test) +static void damon_test_split_evenly_fail(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_ctx *c = damon_new_ctx(); - struct damon_target *t; - struct damon_region *r; - unsigned long i; - - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), - -EINVAL); - - t = damon_new_target(42); - r = damon_new_region(0, 100); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL); + struct damon_target *t = damon_new_target(42); + struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); - i = 0; damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10); - KUNIT_EXPECT_EQ(test, r->ar.end, i * 10); + KUNIT_EXPECT_EQ(test, r->ar.start, start); + KUNIT_EXPECT_EQ(test, r->ar.end, end); } + damon_free_target(t); +} + +static void damon_test_split_evenly_succ(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) +{ + struct damon_target *t = damon_new_target(42); + struct damon_region *r = damon_new_region(start, end); + unsigned long expected_width = (end - start) / nr_pieces; + unsigned long i = 0; - t = damon_new_target(42); - r = damon_new_region(5, 59); damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); - i = 0; damon_for_each_region(r, t) { - if (i == 4) + if (i == nr_pieces - 1) break; - KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++); - KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i); + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i++ * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); } - KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i); - KUNIT_EXPECT_EQ(test, r->ar.end, 59ul); + KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); damon_free_target(t); +} - t = damon_new_target(42); - r = damon_new_region(5, 6); - damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); +static void damon_test_split_evenly(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + damon_test_split_evenly_fail(test, 0, 100, 
0); + damon_test_split_evenly_succ(test, 0, 100, 10); + damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_fail(test, 5, 6, 2); - damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, 5ul); - KUNIT_EXPECT_EQ(test, r->ar.end, 6ul); - } - damon_free_target(t); damon_destroy_ctx(c); } From f7742661db626582a38a3fbf7f6fe555c9d6e19f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:43 -0800 Subject: [PATCH 354/737] mm/damon/vaddr-test: remove unnecessary variables A couple of test functions in DAMON virtual address space monitoring primitives implementation has unnecessary damon_ctx variables. This commit removes those. Link: https://lkml.kernel.org/r/20211201150440.1088-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr-test.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 3097ef9c662af..6a1b9272ea123 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -135,7 +135,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_addr_range *three_regions, unsigned long *expected, int nr_expected) { - struct damon_ctx *ctx = damon_new_ctx(); struct damon_target *t; struct damon_region *r; int i; @@ -145,7 +144,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test, r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); damon_add_region(r, t); } - damon_add_target(ctx, t); damon_va_apply_three_regions(t, three_regions); @@ -154,8 +152,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test, KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); } - - damon_destroy_ctx(ctx); } /* @@ -298,8 +294,6 @@ static void damon_test_split_evenly_succ(struct kunit *test, static void damon_test_split_evenly(struct kunit *test) { - struct damon_ctx *c = damon_new_ctx(); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), -EINVAL); @@ -307,8 +301,6 @@ static void damon_test_split_evenly(struct kunit *test) damon_test_split_evenly_succ(test, 0, 100, 10); damon_test_split_evenly_succ(test, 5, 59, 5); damon_test_split_evenly_fail(test, 5, 6, 2); - - damon_destroy_ctx(c); } static struct kunit_case damon_test_cases[] = { From 4416639e4ec7db8f2dbdd155384c838fe2ade13c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:46 -0800 Subject: [PATCH 355/737] selftests/damon: skip test if DAMON is running Testing the DAMON debugfs files while DAMON is running makes no sense, as any write to the debugfs files will fail. This commit makes the test be skipped in this case. 
Link: https://lkml.kernel.org/r/20211201150440.1088-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 196b6640bf378..fc80380c59f02 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -44,6 +44,15 @@ test_content() { source ./_chk_dependency.sh +ksft_skip=4 + +damon_onoff="$DBGFS/monitor_on" +if [ $(cat "$damon_onoff") = "on" ] +then + echo "monitoring is on" + exit $ksft_skip +fi + # Test attrs file # =============== From 7500be50a71aad0da4c566e192ec7ea71947c5c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:49 -0800 Subject: [PATCH 356/737] selftests/damon: test DAMON enabling with empty target_ids case DAMON debugfs didn't check empty targets when starting monitoring, and the issue is fixed with commit b5ca3e83ddb0 ("mm/damon/dbgfs: add adaptive_targets list check before enable monitor_on"). To avoid future regression, this commit adds a test case for that in DAMON selftests. Link: https://lkml.kernel.org/r/20211201150440.1088-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index fc80380c59f02..d0916373f310a 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -94,4 +94,13 @@ test_write_succ "$file" "" "$orig_content" "empty input" test_content "$file" "$orig_content" "" "empty input written" echo "$orig_content" > "$file" +# Test empty targets case +# ======================= + +orig_target_ids=$(cat "$DBGFS/target_ids") +echo "" > "$DBGFS/target_ids" +orig_monitor_on=$(cat "$DBGFS/monitor_on") +test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" +echo "$orig_target_ids" > "$DBGFS/target_ids" + echo "PASS" From af329d823e4daa3a619c1c7f42efaa52c9bbaf3c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:52 -0800 Subject: [PATCH 357/737] selftests/damon: test wrong DAMOS condition ranges input A patch titled "mm/damon/schemes: add the validity judgment of thresholds"[1] makes DAMON debugfs interface to validate DAMON scheme inputs. This commit adds a test case for the validation logic in DAMON selftests. 
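Illustrative sketch only (not from that patch): the selftest feeds min/max pairs where a minimum exceeds its maximum and expects the write to fail. The code below shows the kind of range check the referenced kernel patch performs; the function name and parameter list are assumptions, not taken from that patch.

    #include <linux/types.h>

    /* Hypothetical range check: each DAMOS condition is a min/max pair and
     * a scheme is rejected when any minimum exceeds its maximum.
     */
    static bool example_damos_ranges_valid(unsigned long min_sz,
                                           unsigned long max_sz,
                                           unsigned int min_nr_accesses,
                                           unsigned int max_nr_accesses,
                                           unsigned int min_age,
                                           unsigned int max_age)
    {
            return min_sz <= max_sz &&
                   min_nr_accesses <= max_nr_accesses &&
                   min_age <= max_age;
    }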
[1] https://lore.kernel.org/linux-mm/d78360e52158d786fcbf20bc62c96785742e76d3.1637239568.git.xhao@linux.alibaba.com/ Link: https://lkml.kernel.org/r/20211201150440.1088-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index d0916373f310a..1ef1186171679 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -77,6 +77,8 @@ test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ test_write_fail "$file" "1 2 3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" +test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \ + "$orig_content" "wrong condition ranges" echo "$orig_content" > "$file" # Test target_ids file From 13a42b15272883cb2fa12dd7fd22b92309099c44 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:55 -0800 Subject: [PATCH 358/737] selftests/damon: test debugfs file reads/writes with huge count DAMON debugfs interface users were able to trigger warning by writing some files with arbitrarily large 'count' parameter. The issue is fixed with commit db7a347b26fe ("mm/damon/dbgfs: use '__GFP_NOWARN' for user-specified size buffer allocation"). This commit adds a test case for the issue in DAMON selftests to avoid future regressions. Link: https://lkml.kernel.org/r/20211201150440.1088-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/.gitignore | 2 + tools/testing/selftests/damon/Makefile | 2 + .../testing/selftests/damon/debugfs_attrs.sh | 18 +++++++++ .../selftests/damon/huge_count_read_write.c | 39 +++++++++++++++++++ 4 files changed, 61 insertions(+) create mode 100644 tools/testing/selftests/damon/.gitignore create mode 100644 tools/testing/selftests/damon/huge_count_read_write.c diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore new file mode 100644 index 0000000000000..c6c2965a66075 --- /dev/null +++ b/tools/testing/selftests/damon/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +huge_count_read_write diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 8a3f2cd9fec0c..f0aa954b5d135 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for damon selftests +TEST_GEN_FILES += huge_count_read_write + TEST_FILES = _chk_dependency.sh TEST_PROGS = debugfs_attrs.sh diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 1ef1186171679..23a7b48ca7d36 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -105,4 +105,22 @@ orig_monitor_on=$(cat "$DBGFS/monitor_on") test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" echo "$orig_target_ids" > "$DBGFS/target_ids" +# Test huge count read write +# ========================== + +dmesg -C + +for file in "$DBGFS/"* +do + ./huge_count_read_write "$file" +done + +if dmesg | grep -q WARNING +then + dmesg + exit 1 +else + exit 
0 +fi + echo "PASS" diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c new file mode 100644 index 0000000000000..ad7a6b4cf3387 --- /dev/null +++ b/tools/testing/selftests/damon/huge_count_read_write.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: SeongJae Park + */ + +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> + +void write_read_with_huge_count(char *file) +{ + int filedesc = open(file, O_RDWR); + char buf[25]; + int ret; + + printf("%s %s\n", __func__, file); + if (filedesc < 0) { + fprintf(stderr, "failed opening %s\n", file); + exit(1); + } + + write(filedesc, "", 0xfffffffful); + perror("after write: "); + ret = read(filedesc, buf, 0xfffffffful); + perror("after read: "); + close(filedesc); +} + +int main(int argc, char *argv[]) +{ + if (argc != 2) { + fprintf(stderr, "Usage: %s <file>\n", argv[0]); + exit(1); + } + write_read_with_huge_count(argv[1]); + + return 0; +} From 31c568430b080d43d804fd5fd287db838572ac9d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:59 -0800 Subject: [PATCH 359/737] selftests/damon: split test cases Currently, the single test program, debugfs.sh, contains all test cases for DAMON. When one of the cases fails, finding which case is failed from the test log is not so easy, and all remaining tests will be skipped. To improve the situation, this commit splits the single program into small test programs having their own names. Link: https://lkml.kernel.org/r/20211201150440.1088-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/Makefile | 5 +- .../selftests/damon/_debugfs_common.sh | 52 ++++++++ .../testing/selftests/damon/debugfs_attrs.sh | 111 +----------------- .../selftests/damon/debugfs_empty_targets.sh | 13 ++ .../damon/debugfs_huge_count_read_write.sh | 22 ++++ .../selftests/damon/debugfs_schemes.sh | 19 +++ .../selftests/damon/debugfs_target_ids.sh | 19 +++ 7 files changed, 129 insertions(+), 112 deletions(-) create mode 100644 tools/testing/selftests/damon/_debugfs_common.sh create mode 100644 tools/testing/selftests/damon/debugfs_empty_targets.sh create mode 100644 tools/testing/selftests/damon/debugfs_huge_count_read_write.sh create mode 100644 tools/testing/selftests/damon/debugfs_schemes.sh create mode 100644 tools/testing/selftests/damon/debugfs_target_ids.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index f0aa954b5d135..937d36ae9a69c 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -3,7 +3,8 @@ TEST_GEN_FILES += huge_count_read_write -TEST_FILES = _chk_dependency.sh -TEST_PROGS = debugfs_attrs.sh +TEST_FILES = _chk_dependency.sh _debugfs_common.sh +TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh +TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh new file mode 100644 index 0000000000000..48989d4813ae8 --- /dev/null +++ b/tools/testing/selftests/damon/_debugfs_common.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +test_write_result() { + file=$1 + content=$2 + orig_content=$3 + expect_reason=$4 + expected=$5 + + echo "$content" > "$file" + if [ $?
-ne "$expected" ] + then + echo "writing $content to $file doesn't return $expected" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +test_write_succ() { + test_write_result "$1" "$2" "$3" "$4" 0 +} + +test_write_fail() { + test_write_result "$1" "$2" "$3" "$4" 1 +} + +test_content() { + file=$1 + orig_content=$2 + expected=$3 + expect_reason=$4 + + content=$(cat "$file") + if [ "$content" != "$expected" ] + then + echo "reading $file expected $expected but $content" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +source ./_chk_dependency.sh + +damon_onoff="$DBGFS/monitor_on" +if [ $(cat "$damon_onoff") = "on" ] +then + echo "monitoring is on" + exit $ksft_skip +fi diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 23a7b48ca7d36..902e312bca898 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -1,57 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -test_write_result() { - file=$1 - content=$2 - orig_content=$3 - expect_reason=$4 - expected=$5 - - echo "$content" > "$file" - if [ $? -ne "$expected" ] - then - echo "writing $content to $file doesn't return $expected" - echo "expected because: $expect_reason" - echo "$orig_content" > "$file" - exit 1 - fi -} - -test_write_succ() { - test_write_result "$1" "$2" "$3" "$4" 0 -} - -test_write_fail() { - test_write_result "$1" "$2" "$3" "$4" 1 -} - -test_content() { - file=$1 - orig_content=$2 - expected=$3 - expect_reason=$4 - - content=$(cat "$file") - if [ "$content" != "$expected" ] - then - echo "reading $file expected $expected but $content" - echo "expected because: $expect_reason" - echo "$orig_content" > "$file" - exit 1 - fi -} - -source ./_chk_dependency.sh - -ksft_skip=4 - -damon_onoff="$DBGFS/monitor_on" -if [ $(cat "$damon_onoff") = "on" ] -then - echo "monitoring is on" - exit $ksft_skip -fi +source _debugfs_common.sh # Test attrs file # =============== @@ -65,62 +15,3 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ "min_nr_regions > max_nr_regions" test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" echo "$orig_content" > "$file" - -# Test schemes file -# ================= - -file="$DBGFS/schemes" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ - "$orig_content" "valid input" -test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" -test_write_succ "$file" "" "$orig_content" "disabling" -test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \ - "$orig_content" "wrong condition ranges" -echo "$orig_content" > "$file" - -# Test target_ids file -# ==================== - -file="$DBGFS/target_ids" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" -test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" -test_content "$file" "$orig_content" "1 2" "non-integer was there" -test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" -test_content "$file" "$orig_content" "" "wrong input written" -test_write_succ "$file" "" "$orig_content" "empty input" -test_content "$file" "$orig_content" "" "empty input written" -echo "$orig_content" > "$file" - -# Test empty targets case -# ======================= - -orig_target_ids=$(cat "$DBGFS/target_ids") -echo "" > "$DBGFS/target_ids" -orig_monitor_on=$(cat 
"$DBGFS/monitor_on") -test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" -echo "$orig_target_ids" > "$DBGFS/target_ids" - -# Test huge count read write -# ========================== - -dmesg -C - -for file in "$DBGFS/"* -do - ./huge_count_read_write "$file" -done - -if dmesg | grep -q WARNING -then - dmesg - exit 1 -else - exit 0 -fi - -echo "PASS" diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh new file mode 100644 index 0000000000000..87aff8083822f --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test empty targets case +# ======================= + +orig_target_ids=$(cat "$DBGFS/target_ids") +echo "" > "$DBGFS/target_ids" +orig_monitor_on=$(cat "$DBGFS/monitor_on") +test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" +echo "$orig_target_ids" > "$DBGFS/target_ids" diff --git a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh new file mode 100644 index 0000000000000..922cadac29506 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test huge count read write +# ========================== + +dmesg -C + +for file in "$DBGFS/"* +do + ./huge_count_read_write "$file" +done + +if dmesg | grep -q WARNING +then + dmesg + exit 1 +else + exit 0 +fi diff --git a/tools/testing/selftests/damon/debugfs_schemes.sh b/tools/testing/selftests/damon/debugfs_schemes.sh new file mode 100644 index 0000000000000..5b39ab44731cf --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_schemes.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test schemes file +# ================= + +file="$DBGFS/schemes" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ + "$orig_content" "valid input" +test_write_fail "$file" "1 2 +3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" +test_write_succ "$file" "" "$orig_content" "disabling" +test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \ + "$orig_content" "wrong condition ranges" +echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_target_ids.sh b/tools/testing/selftests/damon/debugfs_target_ids.sh new file mode 100644 index 0000000000000..49aeabdb0aae3 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test target_ids file +# ==================== + +file="$DBGFS/target_ids" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" +test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" +test_content "$file" "$orig_content" "1 2" "non-integer was there" +test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" +test_content "$file" "$orig_content" "" "wrong input written" +test_write_succ "$file" "" "$orig_content" "empty input" +test_content "$file" "$orig_content" "" "empty input written" +echo "$orig_content" > "$file" From f94ee9a780991cfce598cc174d6e4aa422aeb7f0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 24 Dec 2021 21:12:54 -0800 Subject: [PATCH 
360/737] mm/damon/dbgfs: protect targets destructions with kdamond_lock DAMON debugfs interface iterates current monitoring targets in 'dbgfs_target_ids_read()' while holding the corresponding 'kdamond_lock'. However, it also destructs the monitoring targets in 'dbgfs_before_terminate()' without holding the lock. This can result in a use_after_free bug. This commit avoids the race by protecting the destruction with the corresponding 'kdamond_lock'. Link: https://lkml.kernel.org/r/20211221094447.2241-1-sj@kernel.org Reported-by: Sangwoo Bae Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: [5.15.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1efac0022e9a4..4fbd729edc9e7 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -650,10 +650,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) if (!targetid_is_pid(ctx)) return; + mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } + mutex_unlock(&ctx->kdamond_lock); } static struct damon_ctx *dbgfs_new_ctx(void) From b9f1b0ef917d11d1438a21f61b59ecb936bbf897 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Dec 2021 20:12:34 -0800 Subject: [PATCH 361/737] mm/damon/dbgfs: fix 'struct pid' leaks in 'dbgfs_target_ids_write()' DAMON debugfs interface increases the reference counts of 'struct pid's for targets from the 'target_ids' file write callback ('dbgfs_target_ids_write()'), but decreases the counts only in DAMON monitoring termination callback ('dbgfs_before_terminate()'). Therefore, when 'target_ids' file is repeatedly written without DAMON monitoring start/termination, the reference count is not decreased and therefore memory for the 'struct pid' cannot be freed. This commit fixes this issue by decreasing the reference counts when 'target_ids' is written. Link: https://lkml.kernel.org/r/20211229124029.23348-1-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: [5.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 4fbd729edc9e7..ad65436756aff 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -353,6 +353,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; + struct damon_target *t, *next_t; bool id_is_pid = true; char *kbuf, *nrs; unsigned long *targets; @@ -397,8 +398,12 @@ static ssize_t dbgfs_target_ids_write(struct file *file, goto unlock_out; } - /* remove targets with previously-set primitive */ - damon_set_targets(ctx, NULL, 0); + /* remove previously set targets */ + damon_for_each_target_safe(t, next_t, ctx) { + if (targetid_is_pid(ctx)) + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } /* Configure the context for the address space type */ if (id_is_pid) From a1eee17d03c75d8db68afaab77f0c5bcfa99c1ca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:27 +0000 Subject: [PATCH 362/737] sched/numa: Rename nr_running and break out the magic number This is simply a preparation patch to make the following patches easier to read. No functional change. 
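For orientation, the helper ends up reading roughly as sketched below after this patch. This sketch is a reading aid reconstructed from the hunk that follows, not additional code; the only differences from the previous state are the dst_running parameter name and the named NUMA_IMBALANCE_MIN constant.

#define NUMA_IMBALANCE_MIN 2

static inline long adjust_numa_imbalance(int imbalance, int dst_running)
{
	unsigned int imbalance_min;

	/*
	 * Allow a small imbalance based on a simple pair of communicating
	 * tasks that remain local when the source domain is almost idle.
	 */
	imbalance_min = NUMA_IMBALANCE_MIN;
	if (dst_running <= imbalance_min)
		return 0;

	return imbalance;
}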
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d53f57ac76094..eaaba86732946 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1560,7 +1560,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int nr_running); +static inline long adjust_numa_imbalance(int imbalance, int dst_running); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -9317,7 +9317,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } } -static inline long adjust_numa_imbalance(int imbalance, int nr_running) +#define NUMA_IMBALANCE_MIN 2 + +static inline long adjust_numa_imbalance(int imbalance, int dst_running) { unsigned int imbalance_min; @@ -9325,8 +9327,8 @@ static inline long adjust_numa_imbalance(int imbalance, int nr_running) * Allow a small imbalance based on a simple pair of communicating * tasks that remain local when the source domain is almost idle. */ - imbalance_min = 2; - if (nr_running <= imbalance_min) + imbalance_min = NUMA_IMBALANCE_MIN; + if (dst_running <= imbalance_min) return 0; return imbalance; From 911eb5079911b552fc970b493eccda5e3dfd401d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:28 +0000 Subject: [PATCH 363/737] sched: Avoid unnecessary calculation of load imbalance at clone time In find_idlest_group(), the load imbalance is only relevant when the group is either overloaded or fully busy but it is calculated unconditionally. This patch moves the imbalance calculation to the context it is required. Technically, it is a micro-optimisation but really the benefit is avoiding confusing one type of imbalance with another depending on the group_type in the next patch. No functional change. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-3-mgorman@techsingularity.net --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eaaba86732946..4d2dcb1c10b73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9103,9 +9103,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -9159,6 +9156,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the From aa10c3ec8b43be2016a665d491d5cc3f3c81dcdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:29 +0000 Subject: [PATCH 364/737] sched/numa: Allow a floating imbalance between NUMA nodes Currently, an imbalance is only allowed when a destination node is almost completely idle. 
This solved one basic class of problems and was the cautious approach. This patch revisits the possibility that NUMA nodes can be imbalanced until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat superficial -- it's half the cores when HT is enabled. At higher utilisations, balancing should continue as normal and keep things even until scheduler domains are fully busy or over utilised. Note that this is not expected to be a universal win. Any benchmark that prefers spreading as wide as possible with limited communication will favour the old behaviour as there is more memory bandwidth. Workloads that communicate heavily in pairs such as netperf or tbench benefit. For the tests I ran, the vast majority of workloads saw a benefit so it seems to be a worthwhile trade-off. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-4-mgorman@techsingularity.net --- kernel/sched/fair.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4d2dcb1c10b73..a09f54378f296 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1560,7 +1560,8 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int dst_running); +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1940,7 +1941,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, dst_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->dst_stats.weight); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -9321,16 +9323,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd #define NUMA_IMBALANCE_MIN 2 -static inline long adjust_numa_imbalance(int imbalance, int dst_running) +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight) { - unsigned int imbalance_min; - /* * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the source domain is almost idle. + * tasks that remain local when the destination is lightly loaded. */ - imbalance_min = NUMA_IMBALANCE_MIN; - if (dst_running <= imbalance_min) + if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; @@ -9433,9 +9433,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) + if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running); + busiest->sum_nr_running, busiest->group_weight); + } return; } From fb8083cab93f93b65ca7390210f4f975bec1ad0f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:30 +0000 Subject: [PATCH 365/737] sched: Limit the amount of NUMA imbalance that can exist at fork time At fork time currently, a local node can be allowed to fill completely and allow the periodic load balancer to fix the problem. 
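(As a worked example of the 25% cut-off described above, which this patch also applies at fork time: on a node with 40 logical CPUs, an imbalance is tolerated while fewer than 40 >> 2 = 10 of them are busy, i.e. up to half of the node's 20 physical cores with SMT-2 enabled.)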
This can be problematic in cases where a task creates lots of threads that idle until woken as part of a worker poll causing a memory bandwidth problem. However, a "real" workload suffers badly from this behaviour. The workload in question is mostly NUMA aware but spawns large numbers of threads that act as a worker pool that can be called from anywhere. These need to spread early to get reasonable behaviour. This patch limits how much a local node can fill before spilling over to another node and it will not be a universal win. Specifically, very short-lived workloads that fit within a NUMA node would prefer the memory bandwidth. As I cannot describe the "real" workload, the best proxy measure I found for illustration was a page fault microbenchmark. It's not representative of the workload but demonstrates the hazard of the current behaviour. pft timings 5.10.0-rc2 5.10.0-rc2 imbalancefloat-v2 forkspread-v2 Amean elapsed-1 46.37 ( 0.00%) 46.05 * 0.69%* Amean elapsed-4 12.43 ( 0.00%) 12.49 * -0.47%* Amean elapsed-7 7.61 ( 0.00%) 7.55 * 0.81%* Amean elapsed-12 4.79 ( 0.00%) 4.80 ( -0.17%) Amean elapsed-21 3.13 ( 0.00%) 2.89 * 7.74%* Amean elapsed-30 3.65 ( 0.00%) 2.27 * 37.62%* Amean elapsed-48 3.08 ( 0.00%) 2.13 * 30.69%* Amean elapsed-79 2.00 ( 0.00%) 1.90 * 4.95%* Amean elapsed-80 2.00 ( 0.00%) 1.90 * 4.70%* This is showing the time to fault regions belonging to threads. The target machine has 80 logical CPUs and two nodes. Note the ~30% gain when the machine is approximately the point where one node becomes fully utilised. The slower results are borderline noise. Kernel building shows similar benefits around the same balance point. Generally performance was either neutral or better in the tests conducted. The main consideration with this patch is the point where fork stops spreading a task so some workloads may benefit from different balance points but it would be a risky tuning parameter. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-5-mgorman@techsingularity.net --- kernel/sched/fair.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a09f54378f296..96b6fe8cc35bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9087,6 +9087,16 @@ static bool update_pick_idlest(struct sched_group *idlest, return true; } +/* + * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. + * This is an approximation as the number of running tasks may not be + * related to the number of busy CPUs due to sched_setaffinity. + */ +static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +{ + return (dst_running < (dst_weight >> 2)); +} + /* * find_idlest_group() finds and returns the least busy CPU group within the * domain. @@ -9219,7 +9229,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * a real need of migration, periodic load balance will * take care of it. 
*/ - if (local_sgs.idle_cpus) + if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) return NULL; } @@ -9326,11 +9336,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd static inline long adjust_numa_imbalance(int imbalance, int dst_running, int dst_weight) { + if (!allow_numa_imbalance(dst_running, dst_weight)) + return imbalance; + /* * Allow a small imbalance based on a simple pair of communicating * tasks that remain local when the destination is lightly loaded. */ - if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN) + if (imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; From 1592645fe17109efea46aec18cbf46d2df2e9b21 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Sat, 5 Feb 2022 02:24:15 +0000 Subject: [PATCH 366/737] Add out-of-tree smartpqi driver Version 2.1.14-030 as external module under drivers/amazon --- drivers/amazon/Kconfig | 18 + drivers/amazon/Makefile | 1 + drivers/amazon/scsi/Makefile | 4 + drivers/amazon/scsi/smartpqi/Makefile | 4 + drivers/amazon/scsi/smartpqi/smartpqi.h | 1702 +++ drivers/amazon/scsi/smartpqi/smartpqi_init.c | 10246 ++++++++++++++++ .../scsi/smartpqi/smartpqi_kernel_compat.c | 391 + .../scsi/smartpqi/smartpqi_kernel_compat.h | 674 + .../scsi/smartpqi/smartpqi_sas_transport.c | 585 + drivers/amazon/scsi/smartpqi/smartpqi_sis.c | 511 + drivers/amazon/scsi/smartpqi/smartpqi_sis.h | 42 + 11 files changed, 14178 insertions(+) create mode 100644 drivers/amazon/scsi/Makefile create mode 100644 drivers/amazon/scsi/smartpqi/Makefile create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi.h create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi_init.c create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi_sis.c create mode 100644 drivers/amazon/scsi/smartpqi/smartpqi_sis.h diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig index 2012cb50eb2a1..098784081e0bf 100644 --- a/drivers/amazon/Kconfig +++ b/drivers/amazon/Kconfig @@ -39,4 +39,22 @@ config AMAZON_IGB_UIO To compile this driver as a module, choose M here. The module will be called igb_uio. +config AMAZON_SCSI_SMARTPQI + tristate "Microsemi PQI Driver" + depends on PCI && SCSI && !S390 + select SCSI_SAS_ATTRS + select RAID_ATTRS + help + This driver supports Microsemi PQI controllers. + + + + To compile this driver as a module, choose M here: the + module will be called smartpqi. + + Note: the aacraid driver will not manage a smartpqi + controller. You need to enable smartpqi for smartpqi + controllers. 
For more information, please see + Documentation/scsi/smartpqi.rst + endif # AMAZON_DRIVER_UPDATES diff --git a/drivers/amazon/Makefile b/drivers/amazon/Makefile index fc5f70dd7487d..b10122feac02a 100644 --- a/drivers/amazon/Makefile +++ b/drivers/amazon/Makefile @@ -2,3 +2,4 @@ # Amazon Driver Updates # obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += net/ +obj-$(CONFIG_AMAZON_DRIVER_UPDATES) += scsi/ diff --git a/drivers/amazon/scsi/Makefile b/drivers/amazon/scsi/Makefile new file mode 100644 index 0000000000000..760bfe47e4cb5 --- /dev/null +++ b/drivers/amazon/scsi/Makefile @@ -0,0 +1,4 @@ +# +# Amazon Driver Updates +# +obj-$(CONFIG_AMAZON_SCSI_SMARTPQI) += smartpqi/ diff --git a/drivers/amazon/scsi/smartpqi/Makefile b/drivers/amazon/scsi/smartpqi/Makefile new file mode 100644 index 0000000000000..4b7ba538fb1fa --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_AMAZON_SCSI_SMARTPQI) += smartpqi.o +smartpqi-objs := smartpqi_init.o smartpqi_sis.o smartpqi_sas_transport.o smartpqi_kernel_compat.o +EXTRA_CFLAGS += -DKCLASS5D diff --git a/drivers/amazon/scsi/smartpqi/smartpqi.h b/drivers/amazon/scsi/smartpqi/smartpqi.h new file mode 100644 index 0000000000000..7582041c5dda2 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi.h @@ -0,0 +1,1702 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#if !defined(_SMARTPQI_H) +#define _SMARTPQI_H + +#define TORTUGA 0 + +#include +#include + +#pragma pack(1) + +#define PQI_DEVICE_SIGNATURE "PQI DREG" + +/* This structure is defined by the PQI specification. */ +struct pqi_device_registers { + __le64 signature; + u8 function_and_status_code; + u8 reserved[7]; + u8 max_admin_iq_elements; + u8 max_admin_oq_elements; + u8 admin_iq_element_length; /* in 16-byte units */ + u8 admin_oq_element_length; /* in 16-byte units */ + __le16 max_reset_timeout; /* in 100-millisecond units */ + u8 reserved1[2]; + __le32 legacy_intx_status; + __le32 legacy_intx_mask_set; + __le32 legacy_intx_mask_clear; + u8 reserved2[28]; + __le32 device_status; + u8 reserved3[4]; + __le64 admin_iq_pi_offset; + __le64 admin_oq_ci_offset; + __le64 admin_iq_element_array_addr; + __le64 admin_oq_element_array_addr; + __le64 admin_iq_ci_addr; + __le64 admin_oq_pi_addr; + u8 admin_iq_num_elements; + u8 admin_oq_num_elements; + __le16 admin_queue_int_msg_num; + u8 reserved4[4]; + __le32 device_error; + u8 reserved5[4]; + __le64 error_details; + __le32 device_reset; + __le32 power_action; + u8 reserved6[104]; +}; + +/* + * controller registers + * + * These are defined by the Microchip implementation. + * + * Some registers (those named sis_*) are only used when in + * legacy SIS mode before we transition the controller into + * PQI mode. 
There are a number of other SIS mode registers, + * but we don't use them, so only the SIS registers that we + * care about are defined here. The offsets mentioned in the + * comments are the offsets from the PCIe BAR 0. + */ +struct pqi_ctrl_registers { + u8 reserved[0x20]; + __le32 sis_host_to_ctrl_doorbell; /* 20h */ + u8 reserved1[0x34 - (0x20 + sizeof(__le32))]; + __le32 sis_interrupt_mask; /* 34h */ + u8 reserved2[0x9c - (0x34 + sizeof(__le32))]; + __le32 sis_ctrl_to_host_doorbell; /* 9Ch */ + u8 reserved3[0xa0 - (0x9c + sizeof(__le32))]; + __le32 sis_ctrl_to_host_doorbell_clear; /* A0h */ + u8 reserved4[0xb0 - (0xa0 + sizeof(__le32))]; + __le32 sis_driver_scratch; /* B0h */ + __le32 sis_product_identifier; /* B4h */ + u8 reserved5[0xbc - (0xb4 + sizeof(__le32))]; + __le32 sis_firmware_status; /* BCh */ + u8 reserved6[0xcc - (0xbc + sizeof(__le32))]; + __le32 sis_ctrl_shutdown_reason_code; /* CCh */ + u8 reserved7[0x1000 - (0xcc + sizeof(__le32))]; + __le32 sis_mailbox[8]; /* 1000h */ + u8 reserved8[0x4000 - (0x1000 + (sizeof(__le32) * 8))]; + /* + * The PQI spec states that the PQI registers should be at + * offset 0 from the PCIe BAR 0. However, we can't map + * them at offset 0 because that would break compatibility + * with the SIS registers. So we map them at offset 4000h. + */ + struct pqi_device_registers pqi_registers; /* 4000h */ +}; + +#define PQI_DEVICE_REGISTERS_OFFSET 0x4000 + +/* shutdown reasons for taking the controller offline */ +enum pqi_ctrl_shutdown_reason { + PQI_IQ_NOT_DRAINED_TIMEOUT = 1, + PQI_LUN_RESET_TIMEOUT = 2, + PQI_IO_PENDING_POST_LUN_RESET_TIMEOUT = 3, + PQI_NO_HEARTBEAT = 4, + PQI_FIRMWARE_KERNEL_NOT_UP = 5, + PQI_OFA_RESPONSE_TIMEOUT = 6, + PQI_INVALID_REQ_ID = 7, + PQI_UNMATCHED_REQ_ID = 8, + PQI_IO_PI_OUT_OF_RANGE = 9, + PQI_EVENT_PI_OUT_OF_RANGE = 10, + PQI_UNEXPECTED_IU_TYPE = 11 +}; + +enum pqi_io_path { + RAID_PATH = 0, + AIO_PATH = 1 +}; + +enum pqi_irq_mode { + IRQ_MODE_NONE, + IRQ_MODE_INTX, + IRQ_MODE_MSIX +}; + +struct pqi_sg_descriptor { + __le64 address; + __le32 length; + __le32 flags; +}; + +/* manifest constants for the flags field of pqi_sg_descriptor */ +#define CISS_SG_LAST 0x40000000 +#define CISS_SG_CHAIN 0x80000000 + +struct pqi_iu_header { + u8 iu_type; + u8 reserved; + __le16 iu_length; /* in bytes - does not include the length */ + /* of this header */ + __le16 response_queue_id; /* specifies the OQ where the */ + /* response IU is to be delivered */ + u16 driver_flags; /* reserved for driver use */ +}; + +/* manifest constants for pqi_iu_header.driver_flags */ +#define PQI_DRIVER_NONBLOCKABLE_REQUEST 0x1 + +/* + * According to the PQI spec, the IU header is only the first 4 bytes of our + * pqi_iu_header structure. 
+ */ +#define PQI_REQUEST_HEADER_LENGTH 4 + +struct pqi_general_admin_request { + struct pqi_iu_header header; + __le16 request_id; + u8 function_code; + union { + struct { + u8 reserved[33]; + __le32 buffer_length; + struct pqi_sg_descriptor sg_descriptor; + } report_device_capability; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[2]; + __le64 element_array_addr; + __le64 ci_addr; + __le16 num_elements; + __le16 element_length; + u8 queue_protocol; + u8 reserved2[23]; + __le32 vendor_specific; + } create_operational_iq; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[2]; + __le64 element_array_addr; + __le64 pi_addr; + __le16 num_elements; + __le16 element_length; + u8 queue_protocol; + u8 reserved2[3]; + __le16 int_msg_num; + __le16 coalescing_count; + __le32 min_coalescing_time; + __le32 max_coalescing_time; + u8 reserved3[8]; + __le32 vendor_specific; + } create_operational_oq; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[50]; + } delete_operational_queue; + + struct { + u8 reserved; + __le16 queue_id; + u8 reserved1[46]; + __le32 vendor_specific; + } change_operational_iq_properties; + + } data; +}; + +struct pqi_general_admin_response { + struct pqi_iu_header header; + __le16 request_id; + u8 function_code; + u8 status; + union { + struct { + u8 status_descriptor[4]; + __le64 iq_pi_offset; + u8 reserved[40]; + } create_operational_iq; + + struct { + u8 status_descriptor[4]; + __le64 oq_ci_offset; + u8 reserved[40]; + } create_operational_oq; + } data; +}; + +struct pqi_iu_layer_descriptor { + u8 inbound_spanning_supported : 1; + u8 reserved : 7; + u8 reserved1[5]; + __le16 max_inbound_iu_length; + u8 outbound_spanning_supported : 1; + u8 reserved2 : 7; + u8 reserved3[5]; + __le16 max_outbound_iu_length; +}; + +struct pqi_device_capability { + __le16 data_length; + u8 reserved[6]; + u8 iq_arbitration_priority_support_bitmask; + u8 maximum_aw_a; + u8 maximum_aw_b; + u8 maximum_aw_c; + u8 max_arbitration_burst : 3; + u8 reserved1 : 4; + u8 iqa : 1; + u8 reserved2[2]; + u8 iq_freeze : 1; + u8 reserved3 : 7; + __le16 max_inbound_queues; + __le16 max_elements_per_iq; + u8 reserved4[4]; + __le16 max_iq_element_length; + __le16 min_iq_element_length; + u8 reserved5[2]; + __le16 max_outbound_queues; + __le16 max_elements_per_oq; + __le16 intr_coalescing_time_granularity; + __le16 max_oq_element_length; + __le16 min_oq_element_length; + u8 reserved6[24]; + struct pqi_iu_layer_descriptor iu_layer_descriptors[32]; +}; + +#define PQI_MAX_EMBEDDED_SG_DESCRIPTORS 4 +#define PQI_MAX_EMBEDDED_R56_SG_DESCRIPTORS 3 + +struct pqi_raid_path_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 nexus_id; + __le32 buffer_length; + u8 lun_number[8]; + __le16 protocol_specific; + u8 data_direction : 2; + u8 partial : 1; + u8 reserved1 : 4; + u8 fence : 1; + __le16 error_index; + u8 reserved2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved3 : 1; + u8 reserved4 : 2; + u8 additional_cdb_bytes_usage : 3; + u8 reserved5 : 3; + u8 cdb[16]; + u8 reserved6[12]; + __le32 timeout; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; +}; + +struct pqi_aio_path_request { + struct pqi_iu_header header; + __le16 request_id; + u8 reserved1[2]; + __le32 nexus_id; + __le32 buffer_length; + u8 data_direction : 2; + u8 partial : 1; + u8 memory_type : 1; + u8 fence : 1; + u8 encryption_enable : 1; + u8 reserved2 : 2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved3 : 1; + __le16 data_encryption_key_index; 
+ __le32 encrypt_tweak_lower; + __le32 encrypt_tweak_upper; + u8 cdb[16]; + __le16 error_index; + u8 num_sg_descriptors; + u8 cdb_length; + u8 lun_number[8]; + u8 reserved4[4]; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; +}; + +#define PQI_RAID1_NVME_XFER_LIMIT (32 * 1024) /* 32 KiB */ + +struct pqi_aio_r1_path_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 volume_id; /* ID of the RAID volume */ + __le32 it_nexus_1; /* IT nexus of the 1st drive in the RAID volume */ + __le32 it_nexus_2; /* IT nexus of the 2nd drive in the RAID volume */ + __le32 it_nexus_3; /* IT nexus of the 3rd drive in the RAID volume */ + __le32 data_length; /* total bytes to read/write */ + u8 data_direction : 2; + u8 partial : 1; + u8 memory_type : 1; + u8 fence : 1; + u8 encryption_enable : 1; + u8 reserved : 2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved2 : 1; + __le16 data_encryption_key_index; + u8 cdb[16]; + __le16 error_index; + u8 num_sg_descriptors; + u8 cdb_length; + u8 num_drives; /* number of drives in the RAID volume (2 or 3) */ + u8 reserved3[3]; + __le32 encrypt_tweak_lower; + __le32 encrypt_tweak_upper; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; +}; + +#define PQI_DEFAULT_MAX_WRITE_RAID_5_6 (8 * 1024U) +#define PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_SAS_SATA (~0U) +#define PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_NVME (32 * 1024U) + +struct pqi_aio_r56_path_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 volume_id; /* ID of the RAID volume */ + __le32 data_it_nexus; /* IT nexus for the data drive */ + __le32 p_parity_it_nexus; /* IT nexus for the P parity drive */ + __le32 q_parity_it_nexus; /* IT nexus for the Q parity drive */ + __le32 data_length; /* total bytes to read/write */ + u8 data_direction : 2; + u8 partial : 1; + u8 mem_type : 1; /* 0 = PCIe, 1 = DDR */ + u8 fence : 1; + u8 encryption_enable : 1; + u8 reserved : 2; + u8 task_attribute : 3; + u8 command_priority : 4; + u8 reserved1 : 1; + __le16 data_encryption_key_index; + u8 cdb[16]; + __le16 error_index; + u8 num_sg_descriptors; + u8 cdb_length; + u8 xor_multiplier; + u8 reserved2[3]; + __le32 encrypt_tweak_lower; + __le32 encrypt_tweak_upper; + __le64 row; /* row = logical LBA/blocks per row */ + u8 reserved3[8]; + struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_R56_SG_DESCRIPTORS]; +}; + +struct pqi_io_response { + struct pqi_iu_header header; + __le16 request_id; + __le16 error_index; + u8 reserved2[4]; +}; + +struct pqi_general_management_request { + struct pqi_iu_header header; + __le16 request_id; + union { + struct { + u8 reserved[2]; + __le32 buffer_length; + struct pqi_sg_descriptor sg_descriptors[3]; + } report_event_configuration; + + struct { + __le16 global_event_oq_id; + __le32 buffer_length; + struct pqi_sg_descriptor sg_descriptors[3]; + } set_event_configuration; + } data; +}; + +struct pqi_event_descriptor { + u8 event_type; + u8 reserved; + __le16 oq_id; +}; + +struct pqi_event_config { + u8 reserved[2]; + u8 num_event_descriptors; + u8 reserved1; + struct pqi_event_descriptor descriptors[1]; +}; + +#define PQI_MAX_EVENT_DESCRIPTORS 255 + +#define PQI_EVENT_OFA_MEMORY_ALLOCATION 0x0 +#define PQI_EVENT_OFA_QUIESCE 0x1 +#define PQI_EVENT_OFA_CANCELED 0x2 + +struct pqi_event_response { + struct pqi_iu_header header; + u8 event_type; + u8 reserved2 : 7; + u8 request_acknowledge : 1; + __le16 event_id; + __le32 additional_event_id; + union { + struct { + __le32 bytes_requested; + u8 
reserved[12]; + } ofa_memory_allocation; + + struct { + __le16 reason; /* reason for cancellation */ + u8 reserved[14]; + } ofa_cancelled; + } data; +}; + +struct pqi_event_acknowledge_request { + struct pqi_iu_header header; + u8 event_type; + u8 reserved2; + __le16 event_id; + __le32 additional_event_id; +}; + +struct pqi_task_management_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 nexus_id; + u8 reserved[2]; + __le16 timeout; + u8 lun_number[8]; + __le16 protocol_specific; + __le16 outbound_queue_id_to_manage; + __le16 request_id_to_manage; + u8 task_management_function; + u8 reserved2 : 7; + u8 fence : 1; +}; + +#define SOP_TASK_MANAGEMENT_LUN_RESET 0x8 + +struct pqi_task_management_response { + struct pqi_iu_header header; + __le16 request_id; + __le16 nexus_id; + u8 additional_response_info[3]; + u8 response_code; +}; + +struct pqi_vendor_general_request { + struct pqi_iu_header header; + __le16 request_id; + __le16 function_code; + union { + struct { + __le16 first_section; + __le16 last_section; + u8 reserved[48]; + } config_table_update; + + struct { + __le64 buffer_address; + __le32 buffer_length; + u8 reserved[40]; + } ofa_memory_allocation; + } data; +}; + +struct pqi_vendor_general_response { + struct pqi_iu_header header; + __le16 request_id; + __le16 function_code; + __le16 status; + u8 reserved[2]; +}; + +#define PQI_VENDOR_GENERAL_CONFIG_TABLE_UPDATE 0 +#define PQI_VENDOR_GENERAL_HOST_MEMORY_UPDATE 1 + +#define PQI_OFA_VERSION 1 +#define PQI_OFA_SIGNATURE "OFA_QRM" +#define PQI_OFA_MAX_SG_DESCRIPTORS 64 + +struct pqi_ofa_memory { + __le64 signature; /* "OFA_QRM" */ + __le16 version; /* version of this struct (1 = 1st version) */ + u8 reserved[62]; + __le32 bytes_allocated; /* total allocated memory in bytes */ + __le16 num_memory_descriptors; + u8 reserved1[2]; + struct pqi_sg_descriptor sg_descriptor[PQI_OFA_MAX_SG_DESCRIPTORS]; +}; + +struct pqi_aio_error_info { + u8 status; + u8 service_response; + u8 data_present; + u8 reserved; + __le32 residual_count; + __le16 data_length; + __le16 reserved1; + u8 data[256]; +}; + +struct pqi_raid_error_info { + u8 data_in_result; + u8 data_out_result; + u8 reserved[3]; + u8 status; + __le16 status_qualifier; + __le16 sense_data_length; + __le16 response_data_length; + __le32 data_in_transferred; + __le32 data_out_transferred; + u8 data[256]; +}; + +#define PQI_REQUEST_IU_TASK_MANAGEMENT 0x13 +#define PQI_REQUEST_IU_RAID_PATH_IO 0x14 +#define PQI_REQUEST_IU_AIO_PATH_IO 0x15 +#define PQI_REQUEST_IU_AIO_PATH_RAID5_IO 0x18 +#define PQI_REQUEST_IU_AIO_PATH_RAID6_IO 0x19 +#define PQI_REQUEST_IU_AIO_PATH_RAID1_IO 0x1A +#define PQI_REQUEST_IU_GENERAL_ADMIN 0x60 +#define PQI_REQUEST_IU_REPORT_VENDOR_EVENT_CONFIG 0x72 +#define PQI_REQUEST_IU_SET_VENDOR_EVENT_CONFIG 0x73 +#define PQI_REQUEST_IU_VENDOR_GENERAL 0x75 +#define PQI_REQUEST_IU_ACKNOWLEDGE_VENDOR_EVENT 0xf6 + +#define PQI_RESPONSE_IU_GENERAL_MANAGEMENT 0x81 +#define PQI_RESPONSE_IU_TASK_MANAGEMENT 0x93 +#define PQI_RESPONSE_IU_GENERAL_ADMIN 0xe0 +#define PQI_RESPONSE_IU_RAID_PATH_IO_SUCCESS 0xf0 +#define PQI_RESPONSE_IU_AIO_PATH_IO_SUCCESS 0xf1 +#define PQI_RESPONSE_IU_RAID_PATH_IO_ERROR 0xf2 +#define PQI_RESPONSE_IU_AIO_PATH_IO_ERROR 0xf3 +#define PQI_RESPONSE_IU_AIO_PATH_DISABLED 0xf4 +#define PQI_RESPONSE_IU_VENDOR_EVENT 0xf5 +#define PQI_RESPONSE_IU_VENDOR_GENERAL 0xf7 + +#define PQI_GENERAL_ADMIN_FUNCTION_REPORT_DEVICE_CAPABILITY 0x0 +#define PQI_GENERAL_ADMIN_FUNCTION_CREATE_IQ 0x10 +#define PQI_GENERAL_ADMIN_FUNCTION_CREATE_OQ 0x11 +#define 
PQI_GENERAL_ADMIN_FUNCTION_DELETE_IQ 0x12 +#define PQI_GENERAL_ADMIN_FUNCTION_DELETE_OQ 0x13 +#define PQI_GENERAL_ADMIN_FUNCTION_CHANGE_IQ_PROPERTY 0x14 + +#define PQI_GENERAL_ADMIN_STATUS_SUCCESS 0x0 + +#define PQI_IQ_PROPERTY_IS_AIO_QUEUE 0x1 + +#define PQI_GENERAL_ADMIN_IU_LENGTH 0x3c +#define PQI_PROTOCOL_SOP 0x0 + +#define PQI_DATA_IN_OUT_GOOD 0x0 +#define PQI_DATA_IN_OUT_UNDERFLOW 0x1 +#define PQI_DATA_IN_OUT_BUFFER_ERROR 0x40 +#define PQI_DATA_IN_OUT_BUFFER_OVERFLOW 0x41 +#define PQI_DATA_IN_OUT_BUFFER_OVERFLOW_DESCRIPTOR_AREA 0x42 +#define PQI_DATA_IN_OUT_BUFFER_OVERFLOW_BRIDGE 0x43 +#define PQI_DATA_IN_OUT_PCIE_FABRIC_ERROR 0x60 +#define PQI_DATA_IN_OUT_PCIE_COMPLETION_TIMEOUT 0x61 +#define PQI_DATA_IN_OUT_PCIE_COMPLETER_ABORT_RECEIVED 0x62 +#define PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST_RECEIVED 0x63 +#define PQI_DATA_IN_OUT_PCIE_ECRC_CHECK_FAILED 0x64 +#define PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST 0x65 +#define PQI_DATA_IN_OUT_PCIE_ACS_VIOLATION 0x66 +#define PQI_DATA_IN_OUT_PCIE_TLP_PREFIX_BLOCKED 0x67 +#define PQI_DATA_IN_OUT_PCIE_POISONED_MEMORY_READ 0x6F +#define PQI_DATA_IN_OUT_ERROR 0xf0 +#define PQI_DATA_IN_OUT_PROTOCOL_ERROR 0xf1 +#define PQI_DATA_IN_OUT_HARDWARE_ERROR 0xf2 +#define PQI_DATA_IN_OUT_UNSOLICITED_ABORT 0xf3 +#define PQI_DATA_IN_OUT_ABORTED 0xf4 +#define PQI_DATA_IN_OUT_TIMEOUT 0xf5 + +#define CISS_CMD_STATUS_SUCCESS 0x0 +#define CISS_CMD_STATUS_TARGET_STATUS 0x1 +#define CISS_CMD_STATUS_DATA_UNDERRUN 0x2 +#define CISS_CMD_STATUS_DATA_OVERRUN 0x3 +#define CISS_CMD_STATUS_INVALID 0x4 +#define CISS_CMD_STATUS_PROTOCOL_ERROR 0x5 +#define CISS_CMD_STATUS_HARDWARE_ERROR 0x6 +#define CISS_CMD_STATUS_CONNECTION_LOST 0x7 +#define CISS_CMD_STATUS_ABORTED 0x8 +#define CISS_CMD_STATUS_ABORT_FAILED 0x9 +#define CISS_CMD_STATUS_UNSOLICITED_ABORT 0xa +#define CISS_CMD_STATUS_TIMEOUT 0xb +#define CISS_CMD_STATUS_UNABORTABLE 0xc +#define CISS_CMD_STATUS_TMF 0xd +#define CISS_CMD_STATUS_AIO_DISABLED 0xe + +#define PQI_CMD_STATUS_ABORTED CISS_CMD_STATUS_ABORTED + +#define PQI_NUM_EVENT_QUEUE_ELEMENTS 32 +#define PQI_EVENT_OQ_ELEMENT_LENGTH sizeof(struct pqi_event_response) + +#define PQI_EVENT_TYPE_HOTPLUG 0x1 +#define PQI_EVENT_TYPE_HARDWARE 0x2 +#define PQI_EVENT_TYPE_PHYSICAL_DEVICE 0x4 +#define PQI_EVENT_TYPE_LOGICAL_DEVICE 0x5 +#define PQI_EVENT_TYPE_OFA 0xfb +#define PQI_EVENT_TYPE_AIO_STATE_CHANGE 0xfd +#define PQI_EVENT_TYPE_AIO_CONFIG_CHANGE 0xfe + +#pragma pack() + +#define PQI_ERROR_BUFFER_ELEMENT_LENGTH \ + sizeof(struct pqi_raid_error_info) + +/* these values are based on our implementation */ +#define PQI_ADMIN_IQ_NUM_ELEMENTS 8 +#define PQI_ADMIN_OQ_NUM_ELEMENTS 20 +#define PQI_ADMIN_IQ_ELEMENT_LENGTH 64 +#define PQI_ADMIN_OQ_ELEMENT_LENGTH 64 + +#define PQI_OPERATIONAL_IQ_ELEMENT_LENGTH 128 +#define PQI_OPERATIONAL_OQ_ELEMENT_LENGTH 16 + +#define PQI_MIN_MSIX_VECTORS 1 +#define PQI_MAX_MSIX_VECTORS 64 + +/* these values are defined by the PQI spec */ +#define PQI_MAX_NUM_ELEMENTS_ADMIN_QUEUE 255 +#define PQI_MAX_NUM_ELEMENTS_OPERATIONAL_QUEUE 65535 + +#define PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT 64 +#define PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT 16 +#define PQI_ADMIN_INDEX_ALIGNMENT 64 +#define PQI_OPERATIONAL_INDEX_ALIGNMENT 4 + +#define PQI_MIN_OPERATIONAL_QUEUE_ID 1 +#define PQI_MAX_OPERATIONAL_QUEUE_ID 65535 + +#define PQI_AIO_SERV_RESPONSE_COMPLETE 0 +#define PQI_AIO_SERV_RESPONSE_FAILURE 1 +#define PQI_AIO_SERV_RESPONSE_TMF_COMPLETE 2 +#define PQI_AIO_SERV_RESPONSE_TMF_SUCCEEDED 3 +#define PQI_AIO_SERV_RESPONSE_TMF_REJECTED 4 +#define 
PQI_AIO_SERV_RESPONSE_TMF_INCORRECT_LUN 5 + +#define PQI_AIO_STATUS_IO_ERROR 0x1 +#define PQI_AIO_STATUS_IO_ABORTED 0x2 +#define PQI_AIO_STATUS_NO_PATH_TO_DEVICE 0x3 +#define PQI_AIO_STATUS_INVALID_DEVICE 0x4 +#define PQI_AIO_STATUS_AIO_PATH_DISABLED 0xe +#define PQI_AIO_STATUS_UNDERRUN 0x51 +#define PQI_AIO_STATUS_OVERRUN 0x75 + +typedef u32 pqi_index_t; + +/* SOP data direction flags */ +#define SOP_NO_DIRECTION_FLAG 0 +#define SOP_WRITE_FLAG 1 /* host writes data to Data-Out */ + /* buffer */ +#define SOP_READ_FLAG 2 /* host receives data from Data-In */ + /* buffer */ +#define SOP_BIDIRECTIONAL 3 /* data is transferred from the */ + /* Data-Out buffer and data is */ + /* transferred to the Data-In buffer */ + +#define SOP_TASK_ATTRIBUTE_SIMPLE 0 +#define SOP_TASK_ATTRIBUTE_HEAD_OF_QUEUE 1 +#define SOP_TASK_ATTRIBUTE_ORDERED 2 +#define SOP_TASK_ATTRIBUTE_ACA 4 + +#define SOP_TMF_COMPLETE 0x0 +#define SOP_TMF_REJECTED 0x4 +#define SOP_TMF_FUNCTION_SUCCEEDED 0x8 + +/* additional CDB bytes usage field codes */ +#define SOP_ADDITIONAL_CDB_BYTES_0 0 /* 16-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_4 1 /* 20-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_8 2 /* 24-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_12 3 /* 28-byte CDB */ +#define SOP_ADDITIONAL_CDB_BYTES_16 4 /* 32-byte CDB */ + +/* + * The purpose of this structure is to obtain proper alignment of objects in + * an admin queue pair. + */ +struct pqi_admin_queues_aligned { + __aligned(PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT) + u8 iq_element_array[PQI_ADMIN_IQ_ELEMENT_LENGTH] + [PQI_ADMIN_IQ_NUM_ELEMENTS]; + __aligned(PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT) + u8 oq_element_array[PQI_ADMIN_OQ_ELEMENT_LENGTH] + [PQI_ADMIN_OQ_NUM_ELEMENTS]; + __aligned(PQI_ADMIN_INDEX_ALIGNMENT) pqi_index_t iq_ci; + __aligned(PQI_ADMIN_INDEX_ALIGNMENT) pqi_index_t oq_pi; +}; + +struct pqi_admin_queues { + void *iq_element_array; + void *oq_element_array; + pqi_index_t __iomem *iq_ci; + pqi_index_t __iomem *oq_pi; + dma_addr_t iq_element_array_bus_addr; + dma_addr_t oq_element_array_bus_addr; + dma_addr_t iq_ci_bus_addr; + dma_addr_t oq_pi_bus_addr; + __le32 __iomem *iq_pi; + pqi_index_t iq_pi_copy; + __le32 __iomem *oq_ci; + pqi_index_t oq_ci_copy; + struct task_struct *task; + u16 int_msg_num; +}; + +struct pqi_queue_group { + struct pqi_ctrl_info *ctrl_info; /* backpointer */ + u16 iq_id[2]; + u16 oq_id; + u16 int_msg_num; + void *iq_element_array[2]; + void *oq_element_array; + dma_addr_t iq_element_array_bus_addr[2]; + dma_addr_t oq_element_array_bus_addr; + __le32 __iomem *iq_pi[2]; + pqi_index_t iq_pi_copy[2]; + pqi_index_t __iomem *iq_ci[2]; + pqi_index_t __iomem *oq_pi; + dma_addr_t iq_ci_bus_addr[2]; + dma_addr_t oq_pi_bus_addr; + __le32 __iomem *oq_ci; + pqi_index_t oq_ci_copy; + spinlock_t submit_lock[2]; /* protect submission queue */ + struct list_head request_list[2]; +}; + +struct pqi_event_queue { + u16 oq_id; + u16 int_msg_num; + void *oq_element_array; + pqi_index_t __iomem *oq_pi; + dma_addr_t oq_element_array_bus_addr; + dma_addr_t oq_pi_bus_addr; + __le32 __iomem *oq_ci; + pqi_index_t oq_ci_copy; +}; + +#define PQI_DEFAULT_QUEUE_GROUP 0 +#if TORTUGA +#define PQI_MAX_QUEUE_GROUPS 1 +#else +#define PQI_MAX_QUEUE_GROUPS PQI_MAX_MSIX_VECTORS +#endif + +struct pqi_encryption_info { + u16 data_encryption_key_index; + u32 encrypt_tweak_lower; + u32 encrypt_tweak_upper; +}; + +#pragma pack(1) + +#define PQI_CONFIG_TABLE_SIGNATURE "CFGTABLE" +#define PQI_CONFIG_TABLE_MAX_LENGTH ((u16)~0) + +/* configuration table section IDs */ +#define 
PQI_CONFIG_TABLE_ALL_SECTIONS (-1) +#define PQI_CONFIG_TABLE_SECTION_GENERAL_INFO 0 +#define PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES 1 +#define PQI_CONFIG_TABLE_SECTION_FIRMWARE_ERRATA 2 +#define PQI_CONFIG_TABLE_SECTION_DEBUG 3 +#define PQI_CONFIG_TABLE_SECTION_HEARTBEAT 4 +#define PQI_CONFIG_TABLE_SECTION_SOFT_RESET 5 + +struct pqi_config_table { + u8 signature[8]; /* "CFGTABLE" */ + __le32 first_section_offset; /* offset in bytes from the base */ + /* address of this table to the */ + /* first section */ +}; + +struct pqi_config_table_section_header { + __le16 section_id; /* as defined by the */ + /* PQI_CONFIG_TABLE_SECTION_* */ + /* manifest constants above */ + __le16 next_section_offset; /* offset in bytes from base */ + /* address of the table of the */ + /* next section or 0 if last entry */ +}; + +struct pqi_config_table_general_info { + struct pqi_config_table_section_header header; + __le32 section_length; /* size of this section in bytes */ + /* including the section header */ + __le32 max_outstanding_requests; /* max. outstanding */ + /* commands supported by */ + /* the controller */ + __le32 max_sg_size; /* max. transfer size of a single */ + /* command */ + __le32 max_sg_per_request; /* max. number of scatter-gather */ + /* entries supported in a single */ + /* command */ +}; + +struct pqi_config_table_firmware_features { + struct pqi_config_table_section_header header; + __le16 num_elements; + u8 features_supported[]; +/* u8 features_requested_by_host[]; */ +/* u8 features_enabled[]; */ +/* The 2 fields below are only valid if the MAX_KNOWN_FEATURE bit is set. */ +/* __le16 firmware_max_known_feature; */ +/* __le16 host_max_known_feature; */ +}; + +#define PQI_FIRMWARE_FEATURE_OFA 0 +#define PQI_FIRMWARE_FEATURE_SMP 1 +#define PQI_FIRMWARE_FEATURE_MAX_KNOWN_FEATURE 2 +#define PQI_FIRMWARE_FEATURE_RAID_0_READ_BYPASS 3 +#define PQI_FIRMWARE_FEATURE_RAID_1_READ_BYPASS 4 +#define PQI_FIRMWARE_FEATURE_RAID_5_READ_BYPASS 5 +#define PQI_FIRMWARE_FEATURE_RAID_6_READ_BYPASS 6 +#define PQI_FIRMWARE_FEATURE_RAID_0_WRITE_BYPASS 7 +#define PQI_FIRMWARE_FEATURE_RAID_1_WRITE_BYPASS 8 +#define PQI_FIRMWARE_FEATURE_RAID_5_WRITE_BYPASS 9 +#define PQI_FIRMWARE_FEATURE_RAID_6_WRITE_BYPASS 10 +#define PQI_FIRMWARE_FEATURE_SOFT_RESET_HANDSHAKE 11 +#define PQI_FIRMWARE_FEATURE_UNIQUE_SATA_WWN 12 +#define PQI_FIRMWARE_FEATURE_RAID_IU_TIMEOUT 13 +#define PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT 14 +#define PQI_FIRMWARE_FEATURE_RAID_BYPASS_ON_ENCRYPTED_NVME 15 +#define PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN 16 +#define PQI_FIRMWARE_FEATURE_FW_TRIAGE 17 +#define PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5 18 +#define PQI_FIRMWARE_FEATURE_MAXIMUM 18 + +struct pqi_config_table_debug { + struct pqi_config_table_section_header header; + __le32 scratchpad; +}; + +struct pqi_config_table_heartbeat { + struct pqi_config_table_section_header header; + __le32 heartbeat_counter; +}; + +struct pqi_config_table_soft_reset { + struct pqi_config_table_section_header header; + u8 soft_reset_status; +}; + +#define PQI_SOFT_RESET_INITIATE 0x1 +#define PQI_SOFT_RESET_ABORT 0x2 + +enum pqi_soft_reset_status { + RESET_INITIATE_FIRMWARE, + RESET_INITIATE_DRIVER, + RESET_ABORT, + RESET_NORESPONSE, + RESET_TIMEDOUT +}; + +union pqi_reset_register { + struct { + u32 reset_type : 3; + u32 reserved : 2; + u32 reset_action : 3; + u32 hold_in_pd1 : 1; + u32 reserved2 : 23; + } bits; + u32 all_bits; +}; + +#define PQI_RESET_ACTION_RESET 0x1 + +#define PQI_RESET_TYPE_NO_RESET 0x0 +#define PQI_RESET_TYPE_SOFT_RESET 
0x1 +#define PQI_RESET_TYPE_FIRM_RESET 0x2 +#define PQI_RESET_TYPE_HARD_RESET 0x3 + +#define PQI_RESET_ACTION_COMPLETED 0x2 + +#define PQI_RESET_POLL_INTERVAL_MSECS 100 + +#if TORTUGA +#define PQI_MAX_OUTSTANDING_REQUESTS 32 +#define PQI_MAX_OUTSTANDING_REQUESTS_KDUMP PQI_MAX_OUTSTANDING_REQUESTS +#define PQI_MAX_TRANSFER_SIZE (512 * 1024U) +#define PQI_MAX_TRANSFER_SIZE_KDUMP PQI_MAX_TRANSFER_SIZE +#else +#define PQI_MAX_OUTSTANDING_REQUESTS ((u32)~0) +#define PQI_MAX_OUTSTANDING_REQUESTS_KDUMP 32 +#define PQI_MAX_TRANSFER_SIZE (4 * 1024U * 1024U) +#define PQI_MAX_TRANSFER_SIZE_KDUMP (512 * 1024U) +#endif + +#define RAID_MAP_MAX_ENTRIES 1024 + +#define PQI_PHYSICAL_DEVICE_BUS 0 +#define PQI_RAID_VOLUME_BUS 1 +#define PQI_HBA_BUS 2 +#define PQI_EXTERNAL_RAID_VOLUME_BUS 3 +#define PQI_MAX_BUS PQI_EXTERNAL_RAID_VOLUME_BUS +#define PQI_VSEP_CISS_BTL 379 + +struct report_lun_header { + __be32 list_length; + u8 flags; + u8 reserved[3]; +}; + +/* for flags field of struct report_lun_header */ +#define CISS_REPORT_LOG_FLAG_UNIQUE_LUN_ID (1 << 0) +#define CISS_REPORT_LOG_FLAG_QUEUE_DEPTH (1 << 5) +#define CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX (1 << 6) + +#define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_2 0x2 +#define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4 0x4 +#define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_MASK 0xf + + +struct report_log_lun { + u8 lunid[8]; + u8 volume_id[16]; +}; + +struct report_log_lun_list { + struct report_lun_header header; + struct report_log_lun lun_entries[1]; +}; + +struct report_phys_lun_8byte_wwid { + u8 lunid[8]; + __be64 wwid; + u8 device_type; + u8 device_flags; + u8 lun_count; /* number of LUNs in a multi-LUN device */ + u8 redundant_paths; + u32 aio_handle; +}; + +struct report_phys_lun_16byte_wwid { + u8 lunid[8]; + u8 wwid[16]; + u8 device_type; + u8 device_flags; + u8 lun_count; /* number of LUNs in a multi-LUN device */ + u8 redundant_paths; + u32 aio_handle; +}; + +/* for device_flags field of struct report_phys_lun_extended_entry */ +#define CISS_REPORT_PHYS_DEV_FLAG_AIO_ENABLED 0x8 + +struct report_phys_lun_8byte_wwid_list { + struct report_lun_header header; + struct report_phys_lun_8byte_wwid lun_entries[1]; +}; + +struct report_phys_lun_16byte_wwid_list { + struct report_lun_header header; + struct report_phys_lun_16byte_wwid lun_entries[1]; +}; + +struct raid_map_disk_data { + u32 aio_handle; + u8 xor_mult[2]; + u8 reserved[2]; +}; + +/* for flags field of RAID map */ +#define RAID_MAP_ENCRYPTION_ENABLED 0x1 + +struct raid_map { + __le32 structure_size; /* size of entire structure in bytes */ + __le32 volume_blk_size; /* bytes / block in the volume */ + __le64 volume_blk_cnt; /* logical blocks on the volume */ + u8 phys_blk_shift; /* shift factor to convert between */ + /* units of logical blocks and */ + /* physical disk blocks */ + u8 parity_rotation_shift; /* shift factor to convert between */ + /* units of logical stripes and */ + /* physical stripes */ + __le16 strip_size; /* blocks used on each disk / stripe */ + __le64 disk_starting_blk; /* first disk block used in volume */ + __le64 disk_blk_cnt; /* disk blocks used by volume / disk */ + __le16 data_disks_per_row; /* data disk entries / row in the map */ + __le16 metadata_disks_per_row; /* mirror/parity disk entries / row */ + /* in the map */ + __le16 row_cnt; /* rows in each layout map */ + __le16 layout_map_count; /* layout maps (1 map per */ + /* mirror parity group) */ + __le16 flags; + __le16 data_encryption_key_index; + u8 reserved[16]; + struct raid_map_disk_data 
disk_data[RAID_MAP_MAX_ENTRIES]; +}; + +#pragma pack() + +struct pqi_scsi_dev_raid_map_data { + bool is_write; + u8 raid_level; + u32 map_index; + u64 first_block; + u64 last_block; + u32 data_length; + u32 block_cnt; + u32 blocks_per_row; + u64 first_row; + u64 last_row; + u32 first_row_offset; + u32 last_row_offset; + u32 first_column; + u32 last_column; + u64 r5or6_first_row; + u64 r5or6_last_row; + u32 r5or6_first_row_offset; + u32 r5or6_last_row_offset; + u32 r5or6_first_column; + u32 r5or6_last_column; + u16 data_disks_per_row; + u32 total_disks_per_row; + u16 layout_map_count; + u32 stripesize; + u16 strip_size; + u32 first_group; + u32 last_group; + u32 map_row; + u32 aio_handle; + u64 disk_block; + u32 disk_block_cnt; + u8 cdb[16]; + u8 cdb_length; + + /* RAID 1 specific */ +#define NUM_RAID1_MAP_ENTRIES 3 + u32 num_it_nexus_entries; + u32 it_nexus[NUM_RAID1_MAP_ENTRIES]; + + /* RAID 5 / RAID 6 specific */ + u32 p_parity_it_nexus; /* aio_handle */ + u32 q_parity_it_nexus; /* aio_handle */ + u8 xor_mult; + u64 row; + u64 stripe_lba; + u32 p_index; + u32 q_index; +}; + +#define RAID_CTLR_LUNID "\0\0\0\0\0\0\0\0" + +#define NUM_STREAMS_PER_LUN 8 + +struct pqi_stream_data { + u64 next_lba; + u32 last_accessed; +}; + +struct pqi_scsi_dev { + int devtype; /* as reported by INQUIRY commmand */ + u8 device_type; /* as reported by */ + /* BMIC_IDENTIFY_PHYSICAL_DEVICE */ + /* only valid for devtype = TYPE_DISK */ + int bus; + int target; + int lun; + u8 scsi3addr[8]; + u8 wwid[16]; + u8 volume_id[16]; + u8 is_physical_device : 1; + u8 is_external_raid_device : 1; + u8 is_expander_smp_device : 1; + u8 target_lun_valid : 1; + u8 device_gone : 1; + u8 new_device : 1; + u8 keep_device : 1; + u8 volume_offline : 1; + u8 rescan : 1; + u8 ignore_device : 1; + bool aio_enabled; /* only valid for physical disks */ + bool in_remove; + bool device_offline; + u8 vendor[8]; /* bytes 8-15 of inquiry data */ + u8 model[16]; /* bytes 16-31 of inquiry data */ + u64 sas_address; + u8 raid_level; + u16 queue_depth; /* max. 
queue_depth for this device */ + u16 advertised_queue_depth; + u32 aio_handle; + u8 volume_status; + u8 active_path_index; + u8 path_map; + u8 bay; + u8 box_index; + u8 phys_box_on_bus; + u8 phy_connected_dev_type; + u8 box[8]; + u16 phys_connector[8]; + u8 phy_id; + u8 ncq_prio_enable; + u8 ncq_prio_support; + bool raid_bypass_configured; /* RAID bypass configured */ + bool raid_bypass_enabled; /* RAID bypass enabled */ + u32 next_bypass_group; + struct raid_map *raid_map; /* RAID bypass map */ + u32 max_transfer_encrypted; + + struct pqi_sas_port *sas_port; + struct scsi_device *sdev; + + struct list_head scsi_device_list_entry; + struct list_head new_device_list_entry; + struct list_head add_list_entry; + struct list_head delete_list_entry; + + struct pqi_stream_data stream_data[NUM_STREAMS_PER_LUN]; + atomic_t scsi_cmds_outstanding; + atomic_t raid_bypass_cnt; + u8 page_83_identifier[16]; +}; + +/* VPD inquiry pages */ +#define CISS_VPD_LV_DEVICE_GEOMETRY 0xc1 /* vendor-specific page */ +#define CISS_VPD_LV_BYPASS_STATUS 0xc2 /* vendor-specific page */ +#define CISS_VPD_LV_STATUS 0xc3 /* vendor-specific page */ + +#define VPD_PAGE (1 << 8) + +#pragma pack(1) + +/* structure for CISS_VPD_LV_STATUS */ +struct ciss_vpd_logical_volume_status { + u8 peripheral_info; + u8 page_code; + u8 reserved; + u8 page_length; + u8 volume_status; + u8 reserved2[3]; + __be32 flags; +}; + +#pragma pack() + +/* constants for volume_status field of ciss_vpd_logical_volume_status */ +#define CISS_LV_OK 0 +#define CISS_LV_FAILED 1 +#define CISS_LV_NOT_CONFIGURED 2 +#define CISS_LV_DEGRADED 3 +#define CISS_LV_READY_FOR_RECOVERY 4 +#define CISS_LV_UNDERGOING_RECOVERY 5 +#define CISS_LV_WRONG_PHYSICAL_DRIVE_REPLACED 6 +#define CISS_LV_PHYSICAL_DRIVE_CONNECTION_PROBLEM 7 +#define CISS_LV_HARDWARE_OVERHEATING 8 +#define CISS_LV_HARDWARE_HAS_OVERHEATED 9 +#define CISS_LV_UNDERGOING_EXPANSION 10 +#define CISS_LV_NOT_AVAILABLE 11 +#define CISS_LV_QUEUED_FOR_EXPANSION 12 +#define CISS_LV_DISABLED_SCSI_ID_CONFLICT 13 +#define CISS_LV_EJECTED 14 +#define CISS_LV_UNDERGOING_ERASE 15 +/* state 16 not used */ +#define CISS_LV_READY_FOR_PREDICTIVE_SPARE_REBUILD 17 +#define CISS_LV_UNDERGOING_RPI 18 +#define CISS_LV_PENDING_RPI 19 +#define CISS_LV_ENCRYPTED_NO_KEY 20 +/* state 21 not used */ +#define CISS_LV_UNDERGOING_ENCRYPTION 22 +#define CISS_LV_UNDERGOING_ENCRYPTION_REKEYING 23 +#define CISS_LV_ENCRYPTED_IN_NON_ENCRYPTED_CONTROLLER 24 +#define CISS_LV_PENDING_ENCRYPTION 25 +#define CISS_LV_PENDING_ENCRYPTION_REKEYING 26 +#define CISS_LV_NOT_SUPPORTED 27 +#define CISS_LV_STATUS_UNAVAILABLE 255 + +/* constants for flags field of ciss_vpd_logical_volume_status */ +#define CISS_LV_FLAGS_NO_HOST_IO 0x1 /* volume not available for */ + /* host I/O */ + +/* for SAS hosts and SAS expanders */ +struct pqi_sas_node { + struct device *parent_dev; + struct list_head port_list_head; +}; + +struct pqi_sas_port { + struct list_head port_list_entry; + u64 sas_address; + struct pqi_scsi_dev *device; + struct sas_port *port; + int next_phy_index; + struct list_head phy_list_head; + struct pqi_sas_node *parent_node; + struct sas_rphy *rphy; +}; + +struct pqi_sas_phy { + struct list_head phy_list_entry; + struct sas_phy *phy; + struct pqi_sas_port *parent_port; + bool added_to_port; +}; + +struct pqi_io_request { + atomic_t refcount; + u16 index; + void (*io_complete_callback)(struct pqi_io_request *io_request, + void *context); + void *context; + u8 raid_bypass : 1; + int status; + struct pqi_queue_group *queue_group; + struct scsi_cmnd 
*scmd; + void *error_info; + struct pqi_sg_descriptor *sg_chain_buffer; + dma_addr_t sg_chain_buffer_dma_handle; + void *iu; + struct list_head request_list_entry; +}; + +#define PQI_NUM_SUPPORTED_EVENTS 7 + +struct pqi_event { + bool pending; + u8 event_type; + u16 event_id; + u32 additional_event_id; +}; + +#define PQI_RESERVED_IO_SLOTS_LUN_RESET 1 +#define PQI_RESERVED_IO_SLOTS_EVENT_ACK PQI_NUM_SUPPORTED_EVENTS +#define PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS 3 +#define PQI_RESERVED_IO_SLOTS \ + (PQI_RESERVED_IO_SLOTS_LUN_RESET + PQI_RESERVED_IO_SLOTS_EVENT_ACK + \ + PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS) + +#define PQI_CTRL_PRODUCT_ID_GEN1 0 +#define PQI_CTRL_PRODUCT_ID_GEN2 7 +#define PQI_CTRL_PRODUCT_REVISION_A 0 +#define PQI_CTRL_PRODUCT_REVISION_B 1 + +struct pqi_ctrl_info { + unsigned int ctrl_id; + struct pci_dev *pci_dev; + char firmware_version[32]; + char serial_number[17]; + char model[17]; + char vendor[9]; + u8 product_id; + u8 product_revision; + void __iomem *iomem_base; + struct pqi_ctrl_registers __iomem *registers; + struct pqi_device_registers __iomem *pqi_registers; + u32 max_sg_entries; + u32 config_table_offset; + u32 config_table_length; + u16 max_inbound_queues; + u16 max_elements_per_iq; + u16 max_iq_element_length; + u16 max_outbound_queues; + u16 max_elements_per_oq; + u16 max_oq_element_length; + u32 max_transfer_size; + u32 max_outstanding_requests; + u32 max_io_slots; + unsigned int scsi_ml_can_queue; + unsigned short sg_tablesize; + unsigned int max_sectors; + u32 error_buffer_length; + void *error_buffer; + dma_addr_t error_buffer_dma_handle; + size_t sg_chain_buffer_length; + unsigned int num_queue_groups; + u16 max_hw_queue_index; + u16 num_elements_per_iq; + u16 num_elements_per_oq; + u16 max_inbound_iu_length_per_firmware; + u16 max_inbound_iu_length; + unsigned int max_sg_per_iu; + unsigned int max_sg_per_r56_iu; + void *admin_queue_memory_base; + u32 admin_queue_memory_length; + dma_addr_t admin_queue_memory_base_dma_handle; + void *queue_memory_base; + u32 queue_memory_length; + dma_addr_t queue_memory_base_dma_handle; + struct pqi_admin_queues admin_queues; + struct pqi_queue_group queue_groups[PQI_MAX_QUEUE_GROUPS]; + struct pqi_event_queue event_queue; + enum pqi_irq_mode irq_mode; + int max_msix_vectors; + int num_msix_vectors_enabled; + int num_msix_vectors_initialized; + u32 msix_vectors[PQI_MAX_MSIX_VECTORS]; + void *intr_data[PQI_MAX_MSIX_VECTORS]; + int event_irq; + struct Scsi_Host *scsi_host; + + struct mutex scan_mutex; + struct mutex lun_reset_mutex; + bool controller_online; + bool block_requests; + bool scan_blocked; + u8 inbound_spanning_supported : 1; + u8 outbound_spanning_supported : 1; + u8 pqi_mode_enabled : 1; + u8 pqi_reset_quiesce_supported : 1; + u8 soft_reset_handshake_supported : 1; + u8 raid_iu_timeout_supported : 1; + u8 tmf_iu_timeout_supported : 1; + u8 unique_wwid_in_report_phys_lun_supported : 1; + u8 firmware_triage_supported : 1; + u8 rpl_extended_format_4_5_supported : 1; + u8 enable_r1_writes : 1; + u8 enable_r5_writes : 1; + u8 enable_r6_writes : 1; + u8 lv_drive_type_mix_valid : 1; + u8 enable_stream_detection : 1; + + u8 ciss_report_log_flags; + u32 max_transfer_encrypted_sas_sata; + u32 max_transfer_encrypted_nvme; + u32 max_write_raid_5_6; + u32 max_write_raid_1_10_2drive; + u32 max_write_raid_1_10_3drive; + + struct list_head scsi_device_list; + spinlock_t scsi_device_list_lock; + + struct delayed_work rescan_work; + struct delayed_work update_time_work; + + struct pqi_sas_node *sas_host; + u64 
sas_address; + + struct pqi_io_request *io_request_pool; + u16 next_io_request_slot; + + struct pqi_event events[PQI_NUM_SUPPORTED_EVENTS]; + struct work_struct event_work; + + atomic_t num_interrupts; + int previous_num_interrupts; + u32 previous_heartbeat_count; + __le32 __iomem *heartbeat_counter; + u8 __iomem *soft_reset_status; + struct timer_list heartbeat_timer; + struct work_struct ctrl_offline_work; + + struct semaphore sync_request_sem; + atomic_t num_busy_threads; + atomic_t num_blocked_threads; + wait_queue_head_t block_requests_wait; + + struct mutex ofa_mutex; + struct pqi_ofa_memory *pqi_ofa_mem_virt_addr; + dma_addr_t pqi_ofa_mem_dma_handle; + void **pqi_ofa_chunk_virt_addr; + struct work_struct ofa_memory_alloc_work; + struct work_struct ofa_quiesce_work; + u32 ofa_bytes_requested; + u16 ofa_cancel_reason; + + atomic_t total_scmds_outstanding; +}; + +enum pqi_ctrl_mode { + SIS_MODE = 0, + PQI_MODE +}; + +/* + * assume worst case: SATA queue depth of 31 minus 4 internal firmware commands + */ +#define PQI_PHYSICAL_DISK_DEFAULT_MAX_QUEUE_DEPTH 27 + +/* CISS commands */ +#define CISS_READ 0xc0 +#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */ +#define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */ +#define CISS_GET_RAID_MAP 0xc8 + +/* BMIC commands */ +#define BMIC_IDENTIFY_CONTROLLER 0x11 +#define BMIC_IDENTIFY_PHYSICAL_DEVICE 0x15 +#define BMIC_READ 0x26 +#define BMIC_WRITE 0x27 +#define BMIC_SENSE_FEATURE 0x61 +#define BMIC_SENSE_CONTROLLER_PARAMETERS 0x64 +#define BMIC_SENSE_SUBSYSTEM_INFORMATION 0x66 +#define BMIC_CSMI_PASSTHRU 0x68 +#define BMIC_WRITE_HOST_WELLNESS 0xa5 +#define BMIC_FLUSH_CACHE 0xc2 +#define BMIC_SET_DIAG_OPTIONS 0xf4 +#define BMIC_SENSE_DIAG_OPTIONS 0xf5 + +#define CSMI_CC_SAS_SMP_PASSTHRU 0x17 + +#define SA_FLUSH_CACHE 0x1 + +#define MASKED_DEVICE(lunid) ((lunid)[3] & 0xc0) +#define CISS_GET_LEVEL_2_BUS(lunid) ((lunid)[7] & 0x3f) +#define CISS_GET_LEVEL_2_TARGET(lunid) ((lunid)[6]) +#define CISS_GET_DRIVE_NUMBER(lunid) \ + (((CISS_GET_LEVEL_2_BUS((lunid)) - 1) << 8) + \ + CISS_GET_LEVEL_2_TARGET((lunid))) + +#define LV_GET_DRIVE_TYPE_MIX(lunid) ((lunid)[6]) + +#define LV_DRIVE_TYPE_MIX_UNKNOWN 0 +#define LV_DRIVE_TYPE_MIX_NO_RESTRICTION 1 +#define LV_DRIVE_TYPE_MIX_SAS_HDD_ONLY 2 +#define LV_DRIVE_TYPE_MIX_SATA_HDD_ONLY 3 +#define LV_DRIVE_TYPE_MIX_SAS_OR_SATA_SSD_ONLY 4 +#define LV_DRIVE_TYPE_MIX_SAS_SSD_ONLY 5 +#define LV_DRIVE_TYPE_MIX_SATA_SSD_ONLY 6 +#define LV_DRIVE_TYPE_MIX_SAS_ONLY 7 +#define LV_DRIVE_TYPE_MIX_SATA_ONLY 8 +#define LV_DRIVE_TYPE_MIX_NVME_ONLY 9 + +#define NO_TIMEOUT ((unsigned long) -1) + +#pragma pack(1) + +struct bmic_identify_controller { + u8 configured_logical_drive_count; + __le32 configuration_signature; + u8 firmware_version_short[4]; + u8 reserved[145]; + __le16 extended_logical_unit_count; + u8 reserved1[34]; + __le16 firmware_build_number; + u8 reserved2[8]; + u8 vendor_id[8]; + u8 product_id[16]; + u8 reserved3[62]; + __le32 extra_controller_flags; + u8 reserved4[2]; + u8 controller_mode; + u8 spare_part_number[32]; + u8 firmware_version_long[32]; +}; + +/* constants for extra_controller_flags field of bmic_identify_controller */ +#define BMIC_IDENTIFY_EXTRA_FLAGS_LONG_FW_VERSION_SUPPORTED 0x20000000 + +struct bmic_sense_subsystem_info { + u8 reserved[44]; + u8 ctrl_serial_number[16]; +}; + +/* constants for device_type field */ +#define SA_DEVICE_TYPE_SATA 0x1 +#define SA_DEVICE_TYPE_SAS 0x2 +#define SA_DEVICE_TYPE_EXPANDER_SMP 0x5 +#define SA_DEVICE_TYPE_SES 0x6 +#define SA_DEVICE_TYPE_CONTROLLER 0x7 
+#define SA_DEVICE_TYPE_NVME 0x9 + +struct bmic_identify_physical_device { + u8 scsi_bus; /* SCSI Bus number on controller */ + u8 scsi_id; /* SCSI ID on this bus */ + __le16 block_size; /* sector size in bytes */ + __le32 total_blocks; /* number for sectors on drive */ + __le32 reserved_blocks; /* controller reserved (RIS) */ + u8 model[40]; /* Physical Drive Model */ + u8 serial_number[40]; /* Drive Serial Number */ + u8 firmware_revision[8]; /* drive firmware revision */ + u8 scsi_inquiry_bits; /* inquiry byte 7 bits */ + u8 compaq_drive_stamp; /* 0 means drive not stamped */ + u8 last_failure_reason; + u8 flags; + u8 more_flags; + u8 scsi_lun; /* SCSI LUN for phys drive */ + u8 yet_more_flags; + u8 even_more_flags; + __le32 spi_speed_rules; + u8 phys_connector[2]; /* connector number on controller */ + u8 phys_box_on_bus; /* phys enclosure this drive resides */ + u8 phys_bay_in_box; /* phys drv bay this drive resides */ + __le32 rpm; /* drive rotational speed in RPM */ + u8 device_type; /* type of drive */ + u8 sata_version; /* only valid when device_type = */ + /* SA_DEVICE_TYPE_SATA */ + __le64 big_total_block_count; + __le64 ris_starting_lba; + __le32 ris_size; + u8 wwid[20]; + u8 controller_phy_map[32]; + __le16 phy_count; + u8 phy_connected_dev_type[256]; + u8 phy_to_drive_bay_num[256]; + __le16 phy_to_attached_dev_index[256]; + u8 box_index; + u8 reserved; + __le16 extra_physical_drive_flags; + u8 negotiated_link_rate[256]; + u8 phy_to_phy_map[256]; + u8 redundant_path_present_map; + u8 redundant_path_failure_map; + u8 active_path_number; + __le16 alternate_paths_phys_connector[8]; + u8 alternate_paths_phys_box_on_port[8]; + u8 multi_lun_device_lun_count; + u8 minimum_good_fw_revision[8]; + u8 unique_inquiry_bytes[20]; + u8 current_temperature_degrees; + u8 temperature_threshold_degrees; + u8 max_temperature_degrees; + u8 logical_blocks_per_phys_block_exp; + __le16 current_queue_depth_limit; + u8 switch_name[10]; + __le16 switch_port; + u8 alternate_paths_switch_name[40]; + u8 alternate_paths_switch_port[8]; + __le16 power_on_hours; + __le16 percent_endurance_used; + u8 drive_authentication; + u8 smart_carrier_authentication; + u8 smart_carrier_app_fw_version; + u8 smart_carrier_bootloader_fw_version; + u8 sanitize_flags; + u8 encryption_key_flags; + u8 encryption_key_name[64]; + __le32 misc_drive_flags; + __le16 dek_index; + __le16 hba_drive_encryption_flags; + __le16 max_overwrite_time; + __le16 max_block_erase_time; + __le16 max_crypto_erase_time; + u8 connector_info[5]; + u8 connector_name[8][8]; + u8 page_83_identifier[16]; + u8 maximum_link_rate[256]; + u8 negotiated_physical_link_rate[256]; + u8 box_connector_name[8]; + u8 padding_to_multiple_of_512[9]; +}; + +#define BMIC_SENSE_FEATURE_IO_PAGE 0x8 +#define BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE 0x2 + +struct bmic_sense_feature_buffer_header { + u8 page_code; + u8 subpage_code; + __le16 buffer_length; +}; + +struct bmic_sense_feature_page_header { + u8 page_code; + u8 subpage_code; + __le16 page_length; +}; + +struct bmic_sense_feature_io_page_aio_subpage { + struct bmic_sense_feature_page_header header; + u8 firmware_read_support; + u8 driver_read_support; + u8 firmware_write_support; + u8 driver_write_support; + __le16 max_transfer_encrypted_sas_sata; + __le16 max_transfer_encrypted_nvme; + __le16 max_write_raid_5_6; + __le16 max_write_raid_1_10_2drive; + __le16 max_write_raid_1_10_3drive; +}; + +struct bmic_smp_request { + u8 frame_type; + u8 function; + u8 allocated_response_length; + u8 request_length; + u8 
additional_request_bytes[1016]; +}; + +struct bmic_smp_response { + u8 frame_type; + u8 function; + u8 function_result; + u8 response_length; + u8 additional_response_bytes[1016]; +}; + +struct bmic_csmi_ioctl_header { + __le32 header_length; + u8 signature[8]; + __le32 timeout; + __le32 control_code; + __le32 return_code; + __le32 length; +}; + +struct bmic_csmi_smp_passthru { + u8 phy_identifier; + u8 port_identifier; + u8 connection_rate; + u8 reserved; + __be64 destination_sas_address; + __le32 request_length; + struct bmic_smp_request request; + u8 connection_status; + u8 reserved1[3]; + __le32 response_length; + struct bmic_smp_response response; +}; + +struct bmic_csmi_smp_passthru_buffer { + struct bmic_csmi_ioctl_header ioctl_header; + struct bmic_csmi_smp_passthru parameters; +}; + +struct bmic_flush_cache { + u8 disable_flag; + u8 system_power_action; + u8 ndu_flush; + u8 shutdown_event; + u8 reserved[28]; +}; + +/* for shutdown_event member of struct bmic_flush_cache */ +enum bmic_flush_cache_shutdown_event { + NONE_CACHE_FLUSH_ONLY = 0, + SHUTDOWN = 1, + HIBERNATE = 2, + SUSPEND = 3, + RESTART = 4 +}; + +struct bmic_diag_options { + __le32 options; +}; + +#pragma pack() + +static inline struct pqi_ctrl_info *shost_to_hba(struct Scsi_Host *shost) +{ + void *hostdata = shost_priv(shost); + + return *((struct pqi_ctrl_info **)hostdata); +} + +void pqi_sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy); +int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd); +int pqi_add_sas_host(struct Scsi_Host *shost, struct pqi_ctrl_info *ctrl_info); +void pqi_delete_sas_host(struct pqi_ctrl_info *ctrl_info); +int pqi_add_sas_device(struct pqi_sas_node *pqi_sas_node, + struct pqi_scsi_dev *device); +void pqi_remove_sas_device(struct pqi_scsi_dev *device); +struct pqi_scsi_dev *pqi_find_device_by_sas_rphy( + struct pqi_ctrl_info *ctrl_info, struct sas_rphy *rphy); +void pqi_prep_for_scsi_done(struct scsi_cmnd *scmd); +int pqi_csmi_smp_passthru(struct pqi_ctrl_info *ctrl_info, + struct bmic_csmi_smp_passthru_buffer *buffer, size_t buffer_length, + struct pqi_raid_error_info *error_info); + +extern struct sas_function_template pqi_sas_transport_functions; + +#endif /* _SMARTPQI_H */ diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_init.c b/drivers/amazon/scsi/smartpqi/smartpqi_init.c new file mode 100644 index 0000000000000..db6e6f50fd745 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_init.c @@ -0,0 +1,10246 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_sis.h" +#include "smartpqi_kernel_compat.h" + +#if !defined(BUILD_TIMESTAMP) +#define BUILD_TIMESTAMP +#endif + +#define DRIVER_VERSION "2.1.14-030" +#define DRIVER_MAJOR 2 +#define DRIVER_MINOR 1 +#define DRIVER_RELEASE 14 +#define DRIVER_REVISION 24 + +#define DRIVER_NAME "Microchip SmartPQI Driver (v" \ + DRIVER_VERSION BUILD_TIMESTAMP ")" +#define DRIVER_NAME_SHORT "smartpqi" + +#define PQI_EXTRA_SGL_MEMORY (12 * sizeof(struct pqi_sg_descriptor)) +#define PQI_1MB_SECTORS 2048 /* sectors */ + +#define PQI_POST_RESET_DELAY_SECS 5 +#define PQI_POST_OFA_RESET_DELAY_UPON_TIMEOUT_SECS 10 + +MODULE_AUTHOR("Microchip"); +#if TORTUGA +MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version " + DRIVER_VERSION " (d-108cd0d/s-8601640)" " (d147/s325)"); +#else +MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version " + DRIVER_VERSION " (d-108cd0d/s-8601640)"); +#endif +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); + +static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason); +static void pqi_ctrl_offline_worker(struct work_struct *work); +static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info); +static void pqi_scan_start(struct Scsi_Host *shost); +static void pqi_start_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_queue_group *queue_group, enum pqi_io_path path, + struct pqi_io_request *io_request); +static int pqi_submit_raid_request_synchronous(struct pqi_ctrl_info *ctrl_info, + struct pqi_iu_header *request, unsigned int flags, + struct pqi_raid_error_info *error_info); +static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, u32 aio_handle, u8 *cdb, + unsigned int cdb_length, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, bool raid_bypass, bool io_high_prio); +static int pqi_aio_submit_r1_write_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device, + struct pqi_scsi_dev_raid_map_data *rmd); +static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device, + struct pqi_scsi_dev_raid_map_data *rmd); +static void pqi_ofa_ctrl_quiesce(struct pqi_ctrl_info *ctrl_info); +static void pqi_ofa_ctrl_unquiesce(struct pqi_ctrl_info *ctrl_info); +static int pqi_ofa_ctrl_restart(struct pqi_ctrl_info *ctrl_info, unsigned int delay_secs); +static void pqi_ofa_setup_host_buffer(struct pqi_ctrl_info *ctrl_info); +static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info *ctrl_info); +static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info); +static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, unsigned long timeout_msecs); + +/* for flags argument to pqi_submit_raid_request_synchronous() */ +#define PQI_SYNC_FLAGS_INTERRUPTABLE 0x1 + +static struct scsi_transport_template *pqi_sas_transport_template; + +static atomic_t pqi_controller_count = ATOMIC_INIT(0); + +enum pqi_lockup_action { + NONE, + REBOOT, + PANIC +}; + 
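/*
 * Illustrative sketch (not part of the patch itself): the lockup_action
 * module parameter declared further below accepts the strings "none",
 * "reboot" and "panic", and the driver keeps a small name/action table
 * (pqi_lockup_actions[]) to resolve the string to the enum above. A
 * minimal, self-contained version of that string-to-enum mapping could
 * look like the following -- parse_lockup_action() is a hypothetical
 * helper name used only for this example, not a function defined by
 * the driver.
 */
static enum pqi_lockup_action parse_lockup_action(const char *name)
{
	if (strcmp(name, "reboot") == 0)
		return REBOOT;
	if (strcmp(name, "panic") == 0)
		return PANIC;
	return NONE;	/* default: take no action when the controller locks up */
}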
+static enum pqi_lockup_action pqi_lockup_action = NONE; + +static struct { + enum pqi_lockup_action action; + char *name; +} pqi_lockup_actions[] = { + { + .action = NONE, + .name = "none", + }, + { + .action = REBOOT, + .name = "reboot", + }, + { + .action = PANIC, + .name = "panic", + }, +}; + +static unsigned int pqi_supported_event_types[] = { + PQI_EVENT_TYPE_HOTPLUG, + PQI_EVENT_TYPE_HARDWARE, + PQI_EVENT_TYPE_PHYSICAL_DEVICE, + PQI_EVENT_TYPE_LOGICAL_DEVICE, + PQI_EVENT_TYPE_OFA, + PQI_EVENT_TYPE_AIO_STATE_CHANGE, + PQI_EVENT_TYPE_AIO_CONFIG_CHANGE, +}; + +static int pqi_disable_device_id_wildcards; +module_param_named(disable_device_id_wildcards, + pqi_disable_device_id_wildcards, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_device_id_wildcards, + "Disable device ID wildcards."); + +static int pqi_disable_heartbeat; +module_param_named(disable_heartbeat, + pqi_disable_heartbeat, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_heartbeat, + "Disable heartbeat."); + +static int pqi_disable_ctrl_shutdown; +module_param_named(disable_ctrl_shutdown, + pqi_disable_ctrl_shutdown, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_ctrl_shutdown, + "Disable controller shutdown when controller locked up."); + +static char *pqi_lockup_action_param; +module_param_named(lockup_action, + pqi_lockup_action_param, charp, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(lockup_action, "Action to take when controller locked up.\n" + "\t\tSupported: none, reboot, panic\n" + "\t\tDefault: none"); + +static int pqi_expose_ld_first; +module_param_named(expose_ld_first, + pqi_expose_ld_first, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(expose_ld_first, "Expose logical drives before physical drives."); + +static int pqi_hide_vsep; +module_param_named(hide_vsep, + pqi_hide_vsep, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(hide_vsep, "Hide the virtual SEP for direct attached drives."); + +static int pqi_limit_xfer_to_1MB; +module_param_named(limit_xfer_size_to_1MB, + pqi_limit_xfer_to_1MB, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(limit_xfer_size_to_1MB, "Limit max transfer size to 1MB."); + +static char *raid_levels[] = { + "RAID-0", + "RAID-4", + "RAID-1(1+0)", + "RAID-5", + "RAID-5+1", + "RAID-6", + "RAID-1(Triple)", +}; + +static char *pqi_raid_level_to_string(u8 raid_level) +{ + if (raid_level < ARRAY_SIZE(raid_levels)) + return raid_levels[raid_level]; + + return "RAID UNKNOWN"; +} + +#define SA_RAID_0 0 +#define SA_RAID_4 1 +#define SA_RAID_1 2 /* also used for RAID 10 */ +#define SA_RAID_5 3 /* also used for RAID 50 */ +#define SA_RAID_51 4 +#define SA_RAID_6 5 /* also used for RAID 60 */ +#define SA_RAID_TRIPLE 6 /* also used for RAID 1+0 Triple */ +#define SA_RAID_MAX SA_RAID_TRIPLE +#define SA_RAID_UNKNOWN 0xff + +static inline bool pqi_scsi3addr_equal(u8 *scsi3addr1, u8 *scsi3addr2) +{ + return memcmp(scsi3addr1, scsi3addr2, 8) == 0; +} + +static inline bool pqi_is_logical_device(struct pqi_scsi_dev *device) +{ + return !device->is_physical_device; +} + +static inline bool pqi_is_external_raid_addr(u8 *scsi3addr) +{ + return scsi3addr[2] != 0; +} + +static inline bool pqi_ctrl_offline(struct pqi_ctrl_info *ctrl_info) +{ + return !ctrl_info->controller_online; +} + +static inline void pqi_check_ctrl_health(struct pqi_ctrl_info *ctrl_info) +{ + if (ctrl_info->controller_online) + if (!sis_is_firmware_running(ctrl_info)) + pqi_take_ctrl_offline(ctrl_info, PQI_FIRMWARE_KERNEL_NOT_UP); +} + +static inline bool pqi_is_hba_lunid(u8 *scsi3addr) +{ + return pqi_scsi3addr_equal(scsi3addr, RAID_CTLR_LUNID); 
+} + +#define PQI_DRIVER_SCRATCH_PQI_MODE 0x1 +#define PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED 0x2 + +static inline enum pqi_ctrl_mode pqi_get_ctrl_mode(struct pqi_ctrl_info *ctrl_info) +{ + return sis_read_driver_scratch(ctrl_info) & PQI_DRIVER_SCRATCH_PQI_MODE ? PQI_MODE : SIS_MODE; +} + +static inline void pqi_save_ctrl_mode(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_mode mode) +{ + u32 driver_scratch; + + driver_scratch = sis_read_driver_scratch(ctrl_info); + + if (mode == PQI_MODE) + driver_scratch |= PQI_DRIVER_SCRATCH_PQI_MODE; + else + driver_scratch &= ~PQI_DRIVER_SCRATCH_PQI_MODE; + + sis_write_driver_scratch(ctrl_info, driver_scratch); +} + +static inline bool pqi_is_fw_triage_supported(struct pqi_ctrl_info *ctrl_info) +{ + return (sis_read_driver_scratch(ctrl_info) & PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED) != 0; +} + +static inline void pqi_save_fw_triage_setting(struct pqi_ctrl_info *ctrl_info, bool is_supported) +{ + u32 driver_scratch; + + driver_scratch = sis_read_driver_scratch(ctrl_info); + + if (is_supported) + driver_scratch |= PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED; + else + driver_scratch &= ~PQI_DRIVER_SCRATCH_FW_TRIAGE_SUPPORTED; + + sis_write_driver_scratch(ctrl_info, driver_scratch); +} + +static inline void pqi_ctrl_block_scan(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->scan_blocked = true; + mutex_lock(&ctrl_info->scan_mutex); +} + +static inline void pqi_ctrl_unblock_scan(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->scan_blocked = false; + mutex_unlock(&ctrl_info->scan_mutex); +} + +static inline bool pqi_ctrl_scan_blocked(struct pqi_ctrl_info *ctrl_info) +{ + return ctrl_info->scan_blocked; +} + +static inline void pqi_ctrl_block_device_reset(struct pqi_ctrl_info *ctrl_info) +{ + mutex_lock(&ctrl_info->lun_reset_mutex); +} + +static inline void pqi_ctrl_unblock_device_reset(struct pqi_ctrl_info *ctrl_info) +{ + mutex_unlock(&ctrl_info->lun_reset_mutex); +} + +static inline void pqi_scsi_block_requests(struct pqi_ctrl_info *ctrl_info) +{ + struct Scsi_Host *shost; + unsigned int num_loops; + int msecs_sleep; + + shost = ctrl_info->scsi_host; + + scsi_block_requests(shost); + + num_loops = 0; + msecs_sleep = 20; + while (pqi_scsi_host_busy(shost)) { + num_loops++; + if (num_loops == 10) { + dev_warn(&ctrl_info->pci_dev->dev, + "shost %d Waited for %d milli seconds to be unbusy\n", + shost->host_no, num_loops * msecs_sleep); + msecs_sleep = 500; + } + msleep(msecs_sleep); + if(num_loops % 20 == 0) + dev_warn(&ctrl_info->pci_dev->dev, + "shost %d waited for %d more seconds to be unbusy\n", + shost->host_no, msecs_sleep * 20 / 1000); + } +} + +static inline void pqi_scsi_unblock_requests(struct pqi_ctrl_info *ctrl_info) +{ + scsi_unblock_requests(ctrl_info->scsi_host); +} + +static inline void pqi_ctrl_busy(struct pqi_ctrl_info *ctrl_info) +{ + atomic_inc(&ctrl_info->num_busy_threads); +} + +static inline void pqi_ctrl_unbusy(struct pqi_ctrl_info *ctrl_info) +{ + atomic_dec(&ctrl_info->num_busy_threads); +} + +static inline bool pqi_ctrl_blocked(struct pqi_ctrl_info *ctrl_info) +{ + return ctrl_info->block_requests; +} + +static inline void pqi_ctrl_block_requests(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->block_requests = true; +} + +static inline void pqi_ctrl_unblock_requests(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->block_requests = false; + wake_up_all(&ctrl_info->block_requests_wait); +} + +static void pqi_wait_if_ctrl_blocked(struct pqi_ctrl_info *ctrl_info) +{ + if (!pqi_ctrl_blocked(ctrl_info)) + return; + + 
atomic_inc(&ctrl_info->num_blocked_threads); + wait_event(ctrl_info->block_requests_wait, + !pqi_ctrl_blocked(ctrl_info)); + atomic_dec(&ctrl_info->num_blocked_threads); +} + +#define PQI_QUIESE_WARNING_TIMEOUT_SECS (10 * HZ) + +static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long start_jiffies; + unsigned long warning_timeout; + bool displayed_warning; + + displayed_warning = false; + start_jiffies = jiffies; + warning_timeout = PQI_QUIESE_WARNING_TIMEOUT_SECS + start_jiffies; + + while (atomic_read(&ctrl_info->num_busy_threads) > + atomic_read(&ctrl_info->num_blocked_threads)) { + if (time_after(jiffies, warning_timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "waiting %u seconds for driver activity to quiesce\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000); + displayed_warning = true; + warning_timeout = PQI_QUIESE_WARNING_TIMEOUT_SECS + jiffies; + } + msleep(1); + } + + if (displayed_warning) + dev_warn(&ctrl_info->pci_dev->dev, + "driver activity quiesced after waiting for %u seconds\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000); +} + +static inline bool pqi_device_offline(struct pqi_scsi_dev *device) +{ + return device->device_offline; +} + +static inline void pqi_ctrl_ofa_start(struct pqi_ctrl_info *ctrl_info) +{ + mutex_lock(&ctrl_info->ofa_mutex); +} + +static inline void pqi_ctrl_ofa_done(struct pqi_ctrl_info *ctrl_info) +{ + mutex_unlock(&ctrl_info->ofa_mutex); +} + +static inline void pqi_wait_until_ofa_finished(struct pqi_ctrl_info *ctrl_info) +{ + mutex_lock(&ctrl_info->ofa_mutex); + mutex_unlock(&ctrl_info->ofa_mutex); +} + +static inline bool pqi_ofa_in_progress(struct pqi_ctrl_info *ctrl_info) +{ + return mutex_is_locked(&ctrl_info->ofa_mutex); +} + +static inline void pqi_device_remove_start(struct pqi_scsi_dev *device) +{ + device->in_remove = true; +} + +static inline bool pqi_device_in_remove(struct pqi_scsi_dev *device) +{ + return device->in_remove; +} + +static inline int pqi_event_type_to_event_index(unsigned int event_type) +{ + int index; + + for (index = 0; index < ARRAY_SIZE(pqi_supported_event_types); index++) + if (event_type == pqi_supported_event_types[index]) + return index; + + return -1; +} + +static inline bool pqi_is_supported_event(unsigned int event_type) +{ + return pqi_event_type_to_event_index(event_type) != -1; +} + +static inline void pqi_schedule_rescan_worker_with_delay(struct pqi_ctrl_info *ctrl_info, + unsigned long delay) +{ + if (pqi_ctrl_offline(ctrl_info)) + return; + + schedule_delayed_work(&ctrl_info->rescan_work, delay); +} + +static inline void pqi_schedule_rescan_worker(struct pqi_ctrl_info *ctrl_info) +{ + pqi_schedule_rescan_worker_with_delay(ctrl_info, 0); +} + +#define PQI_RESCAN_WORK_DELAY (10 * HZ) + +static inline void pqi_schedule_rescan_worker_delayed(struct pqi_ctrl_info *ctrl_info) +{ + pqi_schedule_rescan_worker_with_delay(ctrl_info, PQI_RESCAN_WORK_DELAY); +} + +static inline void pqi_cancel_rescan_worker(struct pqi_ctrl_info *ctrl_info) +{ + cancel_delayed_work_sync(&ctrl_info->rescan_work); +} + +static inline void pqi_cancel_event_worker(struct pqi_ctrl_info *ctrl_info) +{ + cancel_work_sync(&ctrl_info->event_work); +} + +static inline u32 pqi_read_heartbeat_counter(struct pqi_ctrl_info *ctrl_info) +{ + if (!ctrl_info->heartbeat_counter) + return 0; + + return readl(ctrl_info->heartbeat_counter); +} + +static inline u8 pqi_read_soft_reset_status(struct pqi_ctrl_info *ctrl_info) +{ + return readb(ctrl_info->soft_reset_status); +} + +static inline void 
pqi_clear_soft_reset_status(struct pqi_ctrl_info *ctrl_info) +{ + u8 status; + + status = pqi_read_soft_reset_status(ctrl_info); + status &= ~PQI_SOFT_RESET_ABORT; + writeb(status, ctrl_info->soft_reset_status); +} + +static int pqi_map_single(struct pci_dev *pci_dev, + struct pqi_sg_descriptor *sg_descriptor, void *buffer, + size_t buffer_length, enum dma_data_direction data_direction) +{ + dma_addr_t bus_address; + + if (!buffer || buffer_length == 0 || data_direction == DMA_NONE) + return 0; + + bus_address = dma_map_single(&pci_dev->dev, buffer, buffer_length, + data_direction); + if (dma_mapping_error(&pci_dev->dev, bus_address)) + return -ENOMEM; + + put_unaligned_le64((u64)bus_address, &sg_descriptor->address); + put_unaligned_le32(buffer_length, &sg_descriptor->length); + put_unaligned_le32(CISS_SG_LAST, &sg_descriptor->flags); + + return 0; +} + +static void pqi_pci_unmap(struct pci_dev *pci_dev, + struct pqi_sg_descriptor *descriptors, int num_descriptors, + enum dma_data_direction data_direction) +{ + int i; + + if (data_direction == DMA_NONE) + return; + + for (i = 0; i < num_descriptors; i++) + dma_unmap_single(&pci_dev->dev, + (dma_addr_t)get_unaligned_le64(&descriptors[i].address), + get_unaligned_le32(&descriptors[i].length), + data_direction); +} + +static int pqi_build_raid_path_request(struct pqi_ctrl_info *ctrl_info, + struct pqi_raid_path_request *request, u8 cmd, + u8 *scsi3addr, void *buffer, size_t buffer_length, + u16 vpd_page, enum dma_data_direction *dir) +{ + u8 *cdb; + size_t cdb_length = buffer_length; + + memset(request, 0, sizeof(*request)); + + request->header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; + put_unaligned_le16(offsetof(struct pqi_raid_path_request, + sg_descriptors[1]) - PQI_REQUEST_HEADER_LENGTH, + &request->header.iu_length); + put_unaligned_le32(buffer_length, &request->buffer_length); + memcpy(request->lun_number, scsi3addr, sizeof(request->lun_number)); + request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_0; + + cdb = request->cdb; + + switch (cmd) { + case TEST_UNIT_READY: + request->data_direction = SOP_READ_FLAG; + cdb[0] = TEST_UNIT_READY; + break; + case INQUIRY: + request->data_direction = SOP_READ_FLAG; + cdb[0] = INQUIRY; + if (vpd_page & VPD_PAGE) { + cdb[1] = 0x1; + cdb[2] = (u8)vpd_page; + } + cdb[4] = (u8)cdb_length; + break; + case CISS_REPORT_LOG: + case CISS_REPORT_PHYS: + request->data_direction = SOP_READ_FLAG; + cdb[0] = cmd; + if (cmd == CISS_REPORT_PHYS) { + if (ctrl_info->rpl_extended_format_4_5_supported) + cdb[1] = CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4; + else + cdb[1] = CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_2; + } else { + cdb[1] = ctrl_info->ciss_report_log_flags; + } + put_unaligned_be32(cdb_length, &cdb[6]); + break; + case CISS_GET_RAID_MAP: + request->data_direction = SOP_READ_FLAG; + cdb[0] = CISS_READ; + cdb[1] = CISS_GET_RAID_MAP; + put_unaligned_be32(cdb_length, &cdb[6]); + break; + case SA_FLUSH_CACHE: + request->header.driver_flags = PQI_DRIVER_NONBLOCKABLE_REQUEST; + request->data_direction = SOP_WRITE_FLAG; + cdb[0] = BMIC_WRITE; + cdb[6] = BMIC_FLUSH_CACHE; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + case BMIC_SENSE_DIAG_OPTIONS: + cdb_length = 0; + /* fall through */ + case BMIC_IDENTIFY_CONTROLLER: + case BMIC_IDENTIFY_PHYSICAL_DEVICE: + case BMIC_SENSE_SUBSYSTEM_INFORMATION: + case BMIC_SENSE_FEATURE: + request->data_direction = SOP_READ_FLAG; + cdb[0] = BMIC_READ; + cdb[6] = cmd; + put_unaligned_be16(cdb_length, &cdb[7]); + 
break; + case BMIC_SET_DIAG_OPTIONS: + cdb_length = 0; + /* fall through */ + case BMIC_WRITE_HOST_WELLNESS: + request->data_direction = SOP_WRITE_FLAG; + cdb[0] = BMIC_WRITE; + cdb[6] = cmd; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + case BMIC_CSMI_PASSTHRU: + request->data_direction = SOP_BIDIRECTIONAL; + cdb[0] = BMIC_WRITE; + cdb[5] = CSMI_CC_SAS_SMP_PASSTHRU; + cdb[6] = cmd; + put_unaligned_be16(cdb_length, &cdb[7]); + break; + default: + dev_err(&ctrl_info->pci_dev->dev, "unknown command 0x%c\n", cmd); + BUG(); + break; + } + + switch (request->data_direction) { + case SOP_READ_FLAG: + *dir = DMA_FROM_DEVICE; + break; + case SOP_WRITE_FLAG: + *dir = DMA_TO_DEVICE; + break; + case SOP_NO_DIRECTION_FLAG: + *dir = DMA_NONE; + break; + default: + *dir = DMA_BIDIRECTIONAL; + break; + } + + return pqi_map_single(ctrl_info->pci_dev, &request->sg_descriptors[0], + buffer, buffer_length, *dir); +} + +static inline void pqi_reinit_io_request(struct pqi_io_request *io_request) +{ + io_request->scmd = NULL; + io_request->status = 0; + io_request->error_info = NULL; + io_request->raid_bypass = false; +} + +static struct pqi_io_request *pqi_alloc_io_request( + struct pqi_ctrl_info *ctrl_info) +{ + struct pqi_io_request *io_request; + u16 i = ctrl_info->next_io_request_slot; /* benignly racy */ + + while (1) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_inc_return(&io_request->refcount) == 1) + break; + atomic_dec(&io_request->refcount); + i = (i + 1) % ctrl_info->max_io_slots; + } + + /* benignly racy */ + ctrl_info->next_io_request_slot = (i + 1) % ctrl_info->max_io_slots; + + pqi_reinit_io_request(io_request); + + return io_request; +} + +static void pqi_free_io_request(struct pqi_io_request *io_request) +{ + atomic_dec(&io_request->refcount); +} + +static int pqi_send_scsi_raid_request(struct pqi_ctrl_info *ctrl_info, u8 cmd, + u8 *scsi3addr, void *buffer, size_t buffer_length, u16 vpd_page, + struct pqi_raid_error_info *error_info) +{ + int rc; + struct pqi_raid_path_request request; + enum dma_data_direction dir; + + rc = pqi_build_raid_path_request(ctrl_info, &request, cmd, scsi3addr, + buffer, buffer_length, vpd_page, &dir); + if (rc) + return rc; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, + error_info); + + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); + + return rc; +} + +/* helper functions for pqi_send_scsi_raid_request */ + +static inline int pqi_send_ctrl_raid_request(struct pqi_ctrl_info *ctrl_info, + u8 cmd, void *buffer, size_t buffer_length) +{ + return pqi_send_scsi_raid_request(ctrl_info, cmd, RAID_CTLR_LUNID, + buffer, buffer_length, 0, NULL); +} + +static inline int pqi_send_ctrl_raid_with_error(struct pqi_ctrl_info *ctrl_info, + u8 cmd, void *buffer, size_t buffer_length, + struct pqi_raid_error_info *error_info) +{ + return pqi_send_scsi_raid_request(ctrl_info, cmd, RAID_CTLR_LUNID, + buffer, buffer_length, 0, error_info); +} + +static inline int pqi_identify_controller(struct pqi_ctrl_info *ctrl_info, + struct bmic_identify_controller *buffer) +{ + return pqi_send_ctrl_raid_request(ctrl_info, BMIC_IDENTIFY_CONTROLLER, + buffer, sizeof(*buffer)); +} + +static inline int pqi_sense_subsystem_info(struct pqi_ctrl_info *ctrl_info, + struct bmic_sense_subsystem_info *sense_info) +{ + return pqi_send_ctrl_raid_request(ctrl_info, + BMIC_SENSE_SUBSYSTEM_INFORMATION, sense_info, + sizeof(*sense_info)); +} + +static inline int pqi_scsi_inquiry(struct pqi_ctrl_info *ctrl_info, + u8 *scsi3addr, u16 
vpd_page, void *buffer, size_t buffer_length) +{ + return pqi_send_scsi_raid_request(ctrl_info, INQUIRY, scsi3addr, + buffer, buffer_length, vpd_page, NULL); +} + +static int pqi_identify_physical_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *buffer, size_t buffer_length) +{ + int rc; + enum dma_data_direction dir; + u16 bmic_device_index; + struct pqi_raid_path_request request; + + rc = pqi_build_raid_path_request(ctrl_info, &request, + BMIC_IDENTIFY_PHYSICAL_DEVICE, RAID_CTLR_LUNID, buffer, + buffer_length, 0, &dir); + if (rc) + return rc; + + bmic_device_index = CISS_GET_DRIVE_NUMBER(device->scsi3addr); + request.cdb[2] = (u8)bmic_device_index; + request.cdb[9] = (u8)(bmic_device_index >> 8); + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); + + return rc; +} + +static inline u32 pqi_aio_limit_to_bytes(__le16 *limit) +{ + u32 bytes; + + bytes = get_unaligned_le16(limit); + if (bytes == 0) + bytes = ~0; + else + bytes *= 1024; + + return bytes; +} + +#pragma pack(1) + +struct bmic_sense_feature_buffer { + struct bmic_sense_feature_buffer_header header; + struct bmic_sense_feature_io_page_aio_subpage aio_subpage; +}; + +#pragma pack() + +#define MINIMUM_AIO_SUBPAGE_BUFFER_LENGTH \ + offsetofend(struct bmic_sense_feature_buffer, \ + aio_subpage.max_write_raid_1_10_3drive) + +#define MINIMUM_AIO_SUBPAGE_LENGTH \ + (offsetofend(struct bmic_sense_feature_io_page_aio_subpage, \ + max_write_raid_1_10_3drive) - \ + FIELD_SIZEOF(struct bmic_sense_feature_io_page_aio_subpage, header)) + +static int pqi_get_advanced_raid_bypass_config(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + enum dma_data_direction dir; + struct pqi_raid_path_request request; + struct bmic_sense_feature_buffer *buffer; + + buffer = kmalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + rc = pqi_build_raid_path_request(ctrl_info, &request, BMIC_SENSE_FEATURE, RAID_CTLR_LUNID, + buffer, sizeof(*buffer), 0, &dir); + if (rc) + goto error; + + request.cdb[2] = BMIC_SENSE_FEATURE_IO_PAGE; + request.cdb[3] = BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); + + if (rc) + goto error; + + if (buffer->header.page_code != BMIC_SENSE_FEATURE_IO_PAGE || + buffer->header.subpage_code != + BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE || + get_unaligned_le16(&buffer->header.buffer_length) < + MINIMUM_AIO_SUBPAGE_BUFFER_LENGTH || + buffer->aio_subpage.header.page_code != + BMIC_SENSE_FEATURE_IO_PAGE || + buffer->aio_subpage.header.subpage_code != + BMIC_SENSE_FEATURE_IO_PAGE_AIO_SUBPAGE || + get_unaligned_le16(&buffer->aio_subpage.header.page_length) < + MINIMUM_AIO_SUBPAGE_LENGTH) { + goto error; + } + + ctrl_info->max_transfer_encrypted_sas_sata = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_transfer_encrypted_sas_sata); + + ctrl_info->max_transfer_encrypted_nvme = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_transfer_encrypted_nvme); + + ctrl_info->max_write_raid_5_6 = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_write_raid_5_6); + + ctrl_info->max_write_raid_1_10_2drive = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_write_raid_1_10_2drive); + + ctrl_info->max_write_raid_1_10_3drive = + pqi_aio_limit_to_bytes( + &buffer->aio_subpage.max_write_raid_1_10_3drive); + +error: + 
kfree(buffer); + + return rc; +} + +static int pqi_flush_cache(struct pqi_ctrl_info *ctrl_info, + enum bmic_flush_cache_shutdown_event shutdown_event) +{ + int rc; + struct bmic_flush_cache *flush_cache; + + flush_cache = kzalloc(sizeof(*flush_cache), GFP_KERNEL); + if (!flush_cache) + return -ENOMEM; + + flush_cache->shutdown_event = shutdown_event; + + rc = pqi_send_ctrl_raid_request(ctrl_info, SA_FLUSH_CACHE, flush_cache, + sizeof(*flush_cache)); + + kfree(flush_cache); + + return rc; +} + +int pqi_csmi_smp_passthru(struct pqi_ctrl_info *ctrl_info, + struct bmic_csmi_smp_passthru_buffer *buffer, size_t buffer_length, + struct pqi_raid_error_info *error_info) +{ + return pqi_send_ctrl_raid_with_error(ctrl_info, BMIC_CSMI_PASSTHRU, + buffer, buffer_length, error_info); +} + +#define PQI_FETCH_PTRAID_DATA (1 << 31) + +static int pqi_set_diag_rescan(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_diag_options *diag; + + diag = kzalloc(sizeof(*diag), GFP_KERNEL); + if (!diag) + return -ENOMEM; + + rc = pqi_send_ctrl_raid_request(ctrl_info, BMIC_SENSE_DIAG_OPTIONS, + diag, sizeof(*diag)); + if (rc) + goto out; + + diag->options |= cpu_to_le32(PQI_FETCH_PTRAID_DATA); + + rc = pqi_send_ctrl_raid_request(ctrl_info, BMIC_SET_DIAG_OPTIONS, diag, + sizeof(*diag)); + +out: + kfree(diag); + + return rc; +} + +static inline int pqi_write_host_wellness(struct pqi_ctrl_info *ctrl_info, + void *buffer, size_t buffer_length) +{ + return pqi_send_ctrl_raid_request(ctrl_info, BMIC_WRITE_HOST_WELLNESS, + buffer, buffer_length); +} + +#pragma pack(1) + +struct bmic_host_wellness_driver_version { + u8 start_tag[4]; + u8 driver_version_tag[2]; + __le16 driver_version_length; + char driver_version[32]; + u8 dont_write_tag[2]; + u8 end_tag[2]; +}; + +#pragma pack() + +static int pqi_write_driver_version_to_host_wellness( + struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_host_wellness_driver_version *buffer; + size_t buffer_length; + + buffer_length = sizeof(*buffer); + + buffer = kmalloc(buffer_length, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer->start_tag[0] = '<'; + buffer->start_tag[1] = 'H'; + buffer->start_tag[2] = 'W'; + buffer->start_tag[3] = '>'; + buffer->driver_version_tag[0] = 'D'; + buffer->driver_version_tag[1] = 'V'; + put_unaligned_le16(sizeof(buffer->driver_version), + &buffer->driver_version_length); + strncpy(buffer->driver_version, "Linux " DRIVER_VERSION, + sizeof(buffer->driver_version) - 1); + buffer->driver_version[sizeof(buffer->driver_version) - 1] = '\0'; + buffer->dont_write_tag[0] = 'D'; + buffer->dont_write_tag[1] = 'W'; + buffer->end_tag[0] = 'Z'; + buffer->end_tag[1] = 'Z'; + + rc = pqi_write_host_wellness(ctrl_info, buffer, buffer_length); + + kfree(buffer); + + return rc; +} + +#pragma pack(1) + +struct bmic_host_wellness_time { + u8 start_tag[4]; + u8 time_tag[2]; + __le16 time_length; + u8 time[8]; + u8 dont_write_tag[2]; + u8 end_tag[2]; +}; + +#pragma pack() + +static int pqi_write_current_time_to_host_wellness( + struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_host_wellness_time *buffer; + size_t buffer_length; + unsigned long local_time; + unsigned int year; + struct tm tm; + + buffer_length = sizeof(*buffer); + + buffer = kmalloc(buffer_length, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer->start_tag[0] = '<'; + buffer->start_tag[1] = 'H'; + buffer->start_tag[2] = 'W'; + buffer->start_tag[3] = '>'; + buffer->time_tag[0] = 'T'; + buffer->time_tag[1] = 'D'; + put_unaligned_le16(sizeof(buffer->time), + 
&buffer->time_length); + + local_time = ktime_get_real_seconds(); + time64_to_tm(local_time, -sys_tz.tz_minuteswest * 60, &tm); + year = tm.tm_year + 1900; + + buffer->time[0] = bin2bcd(tm.tm_hour); + buffer->time[1] = bin2bcd(tm.tm_min); + buffer->time[2] = bin2bcd(tm.tm_sec); + buffer->time[3] = 0; + buffer->time[4] = bin2bcd(tm.tm_mon + 1); + buffer->time[5] = bin2bcd(tm.tm_mday); + buffer->time[6] = bin2bcd(year / 100); + buffer->time[7] = bin2bcd(year % 100); + + buffer->dont_write_tag[0] = 'D'; + buffer->dont_write_tag[1] = 'W'; + buffer->end_tag[0] = 'Z'; + buffer->end_tag[1] = 'Z'; + + rc = pqi_write_host_wellness(ctrl_info, buffer, buffer_length); + + kfree(buffer); + + return rc; +} + +#define PQI_UPDATE_TIME_WORK_INTERVAL (24UL * 60 * 60 * HZ) + +static void pqi_update_time_worker(struct work_struct *work) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(to_delayed_work(work), struct pqi_ctrl_info, + update_time_work); + + rc = pqi_write_current_time_to_host_wellness(ctrl_info); + if (rc) + dev_warn(&ctrl_info->pci_dev->dev, + "error updating time on controller\n"); + + schedule_delayed_work(&ctrl_info->update_time_work, + PQI_UPDATE_TIME_WORK_INTERVAL); +} + +static inline void pqi_schedule_update_time_worker(struct pqi_ctrl_info *ctrl_info) +{ + schedule_delayed_work(&ctrl_info->update_time_work, 0); +} + +static inline void pqi_cancel_update_time_worker(struct pqi_ctrl_info *ctrl_info) +{ + cancel_delayed_work_sync(&ctrl_info->update_time_work); +} + +static inline int pqi_report_luns(struct pqi_ctrl_info *ctrl_info, u8 cmd, void *buffer, + size_t buffer_length) +{ + return pqi_send_ctrl_raid_request(ctrl_info, cmd, buffer, buffer_length); +} + +static int pqi_report_phys_logical_luns(struct pqi_ctrl_info *ctrl_info, u8 cmd, void **buffer) +{ + int rc; + size_t lun_list_length; + size_t lun_data_length; + size_t new_lun_list_length; + void *lun_data = NULL; + struct report_lun_header *report_lun_header; + + report_lun_header = kmalloc(sizeof(*report_lun_header), GFP_KERNEL); + if (!report_lun_header) { + rc = -ENOMEM; + goto out; + } + + rc = pqi_report_luns(ctrl_info, cmd, report_lun_header, sizeof(*report_lun_header)); + if (rc) + goto out; + + lun_list_length = get_unaligned_be32(&report_lun_header->list_length); + +again: + lun_data_length = sizeof(struct report_lun_header) + lun_list_length; + + lun_data = kmalloc(lun_data_length, GFP_KERNEL); + if (!lun_data) { + rc = -ENOMEM; + goto out; + } + + if (lun_list_length == 0) { + memcpy(lun_data, report_lun_header, sizeof(*report_lun_header)); + goto out; + } + + rc = pqi_report_luns(ctrl_info, cmd, lun_data, lun_data_length); + if (rc) + goto out; + + new_lun_list_length = + get_unaligned_be32(&((struct report_lun_header *)lun_data)->list_length); + + if (new_lun_list_length > lun_list_length) { + lun_list_length = new_lun_list_length; + kfree(lun_data); + goto again; + } + +out: + kfree(report_lun_header); + + if (rc) { + kfree(lun_data); + lun_data = NULL; + } + + *buffer = lun_data; + + return rc; +} + +static inline int pqi_report_phys_luns(struct pqi_ctrl_info *ctrl_info, void **buffer) +{ + int rc; + unsigned int i; + u8 rpl_response_format; + u32 num_physicals; + size_t rpl_16byte_wwid_list_length; + void *rpl_list; + struct report_lun_header *rpl_header; + struct report_phys_lun_8byte_wwid_list *rpl_8byte_wwid_list; + struct report_phys_lun_16byte_wwid_list *rpl_16byte_wwid_list; + + rc = pqi_report_phys_logical_luns(ctrl_info, CISS_REPORT_PHYS, &rpl_list); + if (rc) + return rc; + + if 
(ctrl_info->rpl_extended_format_4_5_supported) { + rpl_header = rpl_list; + rpl_response_format = rpl_header->flags & CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_MASK; + if (rpl_response_format == CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4) { + *buffer = rpl_list; + return 0; + } else if (rpl_response_format != CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_2) { + dev_err(&ctrl_info->pci_dev->dev, + "RPL returned unsupported data format %u\n", + rpl_response_format); + return -EINVAL; + } else { + dev_warn(&ctrl_info->pci_dev->dev, + "RPL returned extended format 2 instead of 4\n"); + } + } + + rpl_8byte_wwid_list = rpl_list; + num_physicals = get_unaligned_be32(&rpl_8byte_wwid_list->header.list_length) / sizeof(rpl_8byte_wwid_list->lun_entries[0]); + rpl_16byte_wwid_list_length = sizeof(struct report_lun_header) + (num_physicals * sizeof(struct report_phys_lun_16byte_wwid)); + + rpl_16byte_wwid_list = kmalloc(rpl_16byte_wwid_list_length, GFP_KERNEL); + if (!rpl_16byte_wwid_list) + return -ENOMEM; + + put_unaligned_be32(num_physicals * sizeof(struct report_phys_lun_16byte_wwid), + &rpl_16byte_wwid_list->header.list_length); + rpl_16byte_wwid_list->header.flags = rpl_8byte_wwid_list->header.flags; + + for (i = 0; i < num_physicals; i++) { + memcpy(&rpl_16byte_wwid_list->lun_entries[i].lunid, &rpl_8byte_wwid_list->lun_entries[i].lunid, sizeof(rpl_8byte_wwid_list->lun_entries[i].lunid)); + memset(&rpl_16byte_wwid_list->lun_entries[i].wwid, 0, 8); + memcpy(&rpl_16byte_wwid_list->lun_entries[i].wwid[8], &rpl_8byte_wwid_list->lun_entries[i].wwid, sizeof(rpl_8byte_wwid_list->lun_entries[i].wwid)); + rpl_16byte_wwid_list->lun_entries[i].device_type = rpl_8byte_wwid_list->lun_entries[i].device_type; + rpl_16byte_wwid_list->lun_entries[i].device_flags = rpl_8byte_wwid_list->lun_entries[i].device_flags; + rpl_16byte_wwid_list->lun_entries[i].lun_count = rpl_8byte_wwid_list->lun_entries[i].lun_count; + rpl_16byte_wwid_list->lun_entries[i].redundant_paths = rpl_8byte_wwid_list->lun_entries[i].redundant_paths; + rpl_16byte_wwid_list->lun_entries[i].aio_handle = rpl_8byte_wwid_list->lun_entries[i].aio_handle; + } + + kfree(rpl_8byte_wwid_list); + *buffer = rpl_16byte_wwid_list; + + return 0; +} + +static inline int pqi_report_logical_luns(struct pqi_ctrl_info *ctrl_info, void **buffer) +{ + return pqi_report_phys_logical_luns(ctrl_info, CISS_REPORT_LOG, buffer); +} + +static int pqi_get_device_lists(struct pqi_ctrl_info *ctrl_info, + struct report_phys_lun_16byte_wwid_list **physdev_list, + struct report_log_lun_list **logdev_list) +{ + int rc; + size_t logdev_list_length; + size_t logdev_data_length; + struct report_log_lun_list *internal_logdev_list; + struct report_log_lun_list *logdev_data; + struct report_lun_header report_lun_header; + + rc = pqi_report_phys_luns(ctrl_info, (void **)physdev_list); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "report physical LUNs failed\n"); + + rc = pqi_report_logical_luns(ctrl_info, (void **)logdev_list); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "report logical LUNs failed\n"); + + /* + * Tack the controller itself onto the end of the logical device list. 
+ */ + + logdev_data = *logdev_list; + + if (logdev_data) { + logdev_list_length = + get_unaligned_be32(&logdev_data->header.list_length); + } else { + memset(&report_lun_header, 0, sizeof(report_lun_header)); + logdev_data = + (struct report_log_lun_list *)&report_lun_header; + logdev_list_length = 0; + } + + logdev_data_length = sizeof(struct report_lun_header) + + logdev_list_length; + + internal_logdev_list = kmalloc(logdev_data_length + + sizeof(struct report_log_lun), GFP_KERNEL); + if (!internal_logdev_list) { + kfree(*logdev_list); + *logdev_list = NULL; + return -ENOMEM; + } + + memcpy(internal_logdev_list, logdev_data, logdev_data_length); + memset((u8 *)internal_logdev_list + logdev_data_length, 0, + sizeof(struct report_log_lun)); + put_unaligned_be32(logdev_list_length + + sizeof(struct report_log_lun), + &internal_logdev_list->header.list_length); + + kfree(*logdev_list); + *logdev_list = internal_logdev_list; + + return 0; +} + +static inline void pqi_set_bus_target_lun(struct pqi_scsi_dev *device, + int bus, int target, int lun) +{ + device->bus = bus; + device->target = target; + device->lun = lun; +} + +static void pqi_assign_bus_target_lun(struct pqi_scsi_dev *device) +{ + u8 *scsi3addr; + u32 lunid; + int bus; + int target; + int lun; + + scsi3addr = device->scsi3addr; + lunid = get_unaligned_le32(scsi3addr); + + if (pqi_is_hba_lunid(scsi3addr)) { + /* The specified device is the controller. */ + pqi_set_bus_target_lun(device, PQI_HBA_BUS, 0, lunid & 0x3fff); + device->target_lun_valid = true; + return; + } + + if (pqi_is_logical_device(device)) { + if (device->is_external_raid_device) { + bus = PQI_EXTERNAL_RAID_VOLUME_BUS; + target = (lunid >> 16) & 0x3fff; + lun = lunid & 0xff; + } else { + bus = PQI_RAID_VOLUME_BUS; + target = 0; + lun = lunid & 0x3fff; + } + pqi_set_bus_target_lun(device, bus, target, lun); + device->target_lun_valid = true; + return; + } + + /* + * Defer target and LUN assignment for non-controller physical devices + * because the SAS transport layer will make these assignments later. 
+ */ + pqi_set_bus_target_lun(device, PQI_PHYSICAL_DEVICE_BUS, 0, 0); +} + +static void pqi_get_raid_level(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + u8 raid_level; + u8 *buffer; + + raid_level = SA_RAID_UNKNOWN; + + buffer = kmalloc(64, GFP_KERNEL); + if (buffer) { + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, + VPD_PAGE | CISS_VPD_LV_DEVICE_GEOMETRY, buffer, 64); + if (rc == 0) { + raid_level = buffer[8]; + if (raid_level > SA_RAID_MAX) + raid_level = SA_RAID_UNKNOWN; + } + kfree(buffer); + } + + device->raid_level = raid_level; +} + +static int pqi_validate_raid_map(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct raid_map *raid_map) +{ + char *err_msg; + u32 raid_map_size; + u32 r5or6_blocks_per_row; + + raid_map_size = get_unaligned_le32(&raid_map->structure_size); + + if (raid_map_size < offsetof(struct raid_map, disk_data)) { + err_msg = "RAID map too small"; + goto bad_raid_map; + } + + if (device->raid_level == SA_RAID_1) { + if (get_unaligned_le16(&raid_map->layout_map_count) != 2) { + err_msg = "invalid RAID-1 map"; + goto bad_raid_map; + } + } else if (device->raid_level == SA_RAID_TRIPLE) { + if (get_unaligned_le16(&raid_map->layout_map_count) != 3) { + err_msg = "invalid RAID-1(Triple) map"; + goto bad_raid_map; + } + } else if ((device->raid_level == SA_RAID_5 || + device->raid_level == SA_RAID_6) && + get_unaligned_le16(&raid_map->layout_map_count) > 1) { + /* RAID 50/60 */ + r5or6_blocks_per_row = + get_unaligned_le16(&raid_map->strip_size) * + get_unaligned_le16(&raid_map->data_disks_per_row); + if (r5or6_blocks_per_row == 0) { + err_msg = "invalid RAID-5 or RAID-6 map"; + goto bad_raid_map; + } + } + + return 0; + +bad_raid_map: + dev_warn(&ctrl_info->pci_dev->dev, + "logical device %08x%08x %s\n", + *((u32 *)&device->scsi3addr), + *((u32 *)&device->scsi3addr[4]), err_msg); + + return -EINVAL; +} + +static int pqi_get_raid_map(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +{ + int rc; + u32 raid_map_size; + u32 structure_size; + struct raid_map *raid_map; + + raid_map_size = sizeof(*raid_map); + + while (1) { + raid_map = kmalloc(raid_map_size, GFP_KERNEL); + if (!raid_map) + return -ENOMEM; + + rc = pqi_send_scsi_raid_request(ctrl_info, CISS_GET_RAID_MAP, + device->scsi3addr, raid_map, raid_map_size, 0, NULL); + if (rc) + goto error; + + structure_size = get_unaligned_le32(&raid_map->structure_size); + if (structure_size <= raid_map_size) + break; + + kfree(raid_map); + raid_map_size = structure_size; + } + + rc = pqi_validate_raid_map(ctrl_info, device, raid_map); + if (rc) + goto error; + + device->raid_map = raid_map; + + return 0; + +error: + kfree(raid_map); + + return rc; +} + +static void pqi_set_max_transfer_encrypted(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + if (!ctrl_info->lv_drive_type_mix_valid) { + device->max_transfer_encrypted = ~0; + return; + } + + switch (LV_GET_DRIVE_TYPE_MIX(device->scsi3addr)) { + case LV_DRIVE_TYPE_MIX_SAS_HDD_ONLY: + case LV_DRIVE_TYPE_MIX_SATA_HDD_ONLY: + case LV_DRIVE_TYPE_MIX_SAS_OR_SATA_SSD_ONLY: + case LV_DRIVE_TYPE_MIX_SAS_SSD_ONLY: + case LV_DRIVE_TYPE_MIX_SATA_SSD_ONLY: + case LV_DRIVE_TYPE_MIX_SAS_ONLY: + case LV_DRIVE_TYPE_MIX_SATA_ONLY: + device->max_transfer_encrypted = + ctrl_info->max_transfer_encrypted_sas_sata; + break; + case LV_DRIVE_TYPE_MIX_NVME_ONLY: + device->max_transfer_encrypted = + ctrl_info->max_transfer_encrypted_nvme; + break; + case LV_DRIVE_TYPE_MIX_UNKNOWN: + case 
LV_DRIVE_TYPE_MIX_NO_RESTRICTION: + default: + device->max_transfer_encrypted = + min(ctrl_info->max_transfer_encrypted_sas_sata, + ctrl_info->max_transfer_encrypted_nvme); + break; + } +} + +static void pqi_get_raid_bypass_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + u8 *buffer; + u8 bypass_status; + + buffer = kmalloc(64, GFP_KERNEL); + if (!buffer) + return; + + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, + VPD_PAGE | CISS_VPD_LV_BYPASS_STATUS, buffer, 64); + if (rc) + goto out; + +#define RAID_BYPASS_STATUS 4 +#define RAID_BYPASS_CONFIGURED 0x1 +#define RAID_BYPASS_ENABLED 0x2 + + bypass_status = buffer[RAID_BYPASS_STATUS]; + device->raid_bypass_configured = + (bypass_status & RAID_BYPASS_CONFIGURED) != 0; + if (device->raid_bypass_configured && + (bypass_status & RAID_BYPASS_ENABLED) && + pqi_get_raid_map(ctrl_info, device) == 0) { + device->raid_bypass_enabled = true; + if (get_unaligned_le16(&device->raid_map->flags) & + RAID_MAP_ENCRYPTION_ENABLED) + pqi_set_max_transfer_encrypted(ctrl_info, device); + } + +out: + kfree(buffer); +} + +/* + * Use vendor-specific VPD to determine online/offline status of a volume. + */ + +static void pqi_get_volume_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + size_t page_length; + u8 volume_status = CISS_LV_STATUS_UNAVAILABLE; + bool volume_offline = true; + u32 volume_flags; + struct ciss_vpd_logical_volume_status *vpd; + + vpd = kmalloc(sizeof(*vpd), GFP_KERNEL); + if (!vpd) + goto no_buffer; + + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, + VPD_PAGE | CISS_VPD_LV_STATUS, vpd, sizeof(*vpd)); + if (rc) + goto out; + + if (vpd->page_code != CISS_VPD_LV_STATUS) + goto out; + + page_length = offsetof(struct ciss_vpd_logical_volume_status, + volume_status) + vpd->page_length; + if (page_length < sizeof(*vpd)) + goto out; + + volume_status = vpd->volume_status; + volume_flags = get_unaligned_be32(&vpd->flags); + volume_offline = (volume_flags & CISS_LV_FLAGS_NO_HOST_IO) != 0; + +out: + kfree(vpd); +no_buffer: + device->volume_status = volume_status; + device->volume_offline = volume_offline; +} + +#define PQI_DEVICE_NCQ_PRIO_SUPPORTED 0x01 +#define PQI_DEVICE_PHY_MAP_SUPPORTED 0x10 + +static int pqi_get_physical_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *id_phys) +{ + int rc; + + memset(id_phys, 0, sizeof(*id_phys)); + + rc = pqi_identify_physical_device(ctrl_info, device, + id_phys, sizeof(*id_phys)); + if (rc) { + device->queue_depth = PQI_PHYSICAL_DISK_DEFAULT_MAX_QUEUE_DEPTH; + return rc; + } + + scsi_sanitize_inquiry_string(&id_phys->model[0], 8); + scsi_sanitize_inquiry_string(&id_phys->model[8], 16); + + memcpy(device->vendor, &id_phys->model[0], sizeof(device->vendor)); + memcpy(device->model, &id_phys->model[8], sizeof(device->model)); + + device->box_index = id_phys->box_index; + device->phys_box_on_bus = id_phys->phys_box_on_bus; + device->phy_connected_dev_type = id_phys->phy_connected_dev_type[0]; + device->queue_depth = + get_unaligned_le16(&id_phys->current_queue_depth_limit); + device->active_path_index = id_phys->active_path_number; + device->path_map = id_phys->redundant_path_present_map; + memcpy(&device->box, + &id_phys->alternate_paths_phys_box_on_port, + sizeof(device->box)); + memcpy(&device->phys_connector, + &id_phys->alternate_paths_phys_connector, + sizeof(device->phys_connector)); + device->bay = id_phys->phys_bay_in_box; + + 
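+ /*
+ * Save the page 0x83 identifier and, when the firmware reports a
+ * PHY map, the PHY used by the active path (otherwise phy_id is left
+ * at 0xFF). NCQ priority support comes from the upper half of
+ * misc_drive_flags.
+ */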
memcpy(&device->page_83_identifier, &id_phys->page_83_identifier, + sizeof(device->page_83_identifier)); + + if ((id_phys->even_more_flags & PQI_DEVICE_PHY_MAP_SUPPORTED) && + id_phys->phy_count) + device->phy_id = + id_phys->phy_to_phy_map[device->active_path_index]; + else + device->phy_id = 0xFF; + + device->ncq_prio_support = + ((get_unaligned_le32(&id_phys->misc_drive_flags) >> 16) & + PQI_DEVICE_NCQ_PRIO_SUPPORTED); + + return 0; +} + +static int pqi_get_logical_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + u8 *buffer; + + buffer = kmalloc(64, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + /* Send an inquiry to the device to see what it is. */ + rc = pqi_scsi_inquiry(ctrl_info, device->scsi3addr, 0, buffer, 64); + if (rc) + goto out; + + scsi_sanitize_inquiry_string(&buffer[8], 8); + scsi_sanitize_inquiry_string(&buffer[16], 16); + + device->devtype = buffer[0] & 0x1f; + memcpy(device->vendor, &buffer[8], sizeof(device->vendor)); + memcpy(device->model, &buffer[16], sizeof(device->model)); + + if (device->devtype == TYPE_DISK) { + if (device->is_external_raid_device) { + device->raid_level = SA_RAID_UNKNOWN; + device->volume_status = CISS_LV_OK; + device->volume_offline = false; + } else { + pqi_get_raid_level(ctrl_info, device); + pqi_get_raid_bypass_status(ctrl_info, device); + pqi_get_volume_status(ctrl_info, device); + } + } + +out: + kfree(buffer); + + return rc; +} + +/* + * Prevent adding drive to OS for some corner cases such as a drive + * undergoing a sanitize operation. Some OSes will continue to poll + * the drive until the sanitize completes, which can take hours, + * resulting in long bootup delays. Commands such as TUR, READ_CAP + * are allowed, but READ/WRITE cause check condition. So the OS + * cannot check/read the partition table. + * Note: devices that have completed sanitize must be re-enabled + * using the management utility. + */ +static bool pqi_keep_device_offline(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + u8 scsi_status; + int rc; + enum dma_data_direction dir; + char *buffer; + int buffer_length = 64; + size_t sense_data_length; + struct scsi_sense_hdr sshdr; + struct pqi_raid_path_request request; + struct pqi_raid_error_info error_info; + bool offline = false; /* Assume keep online */ + + /* Do not check controllers. */ + if (pqi_is_hba_lunid(device->scsi3addr)) + return false; + + /* Do not check LVs. 
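+ * The SANITIZE check below only applies to physical drives, so
+ * logical devices are never kept offline here.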
*/ + if (pqi_is_logical_device(device)) + return false; + + buffer = kmalloc(buffer_length, GFP_KERNEL); + if (!buffer) + return false; /* Assume not offline */ + + /* Check for SANITIZE in progress using TUR */ + rc = pqi_build_raid_path_request(ctrl_info, &request, + TEST_UNIT_READY, RAID_CTLR_LUNID, buffer, + buffer_length, 0, &dir); + if (rc) + goto out; /* Assume not offline */ + + memcpy(request.lun_number, device->scsi3addr, sizeof(request.lun_number)); + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, &error_info); + + if (rc) + goto out; /* Assume not offline */ + + scsi_status = error_info.status; + sense_data_length = get_unaligned_le16(&error_info.sense_data_length); + if (sense_data_length == 0) + sense_data_length = + get_unaligned_le16(&error_info.response_data_length); + if (sense_data_length) { + if (sense_data_length > sizeof(error_info.data)) + sense_data_length = sizeof(error_info.data); + + /* + * Check for sanitize in progress: asc:0x04, ascq: 0x1b + */ + if (scsi_status == SAM_STAT_CHECK_CONDITION && + scsi_normalize_sense(error_info.data, + sense_data_length, &sshdr) && + sshdr.sense_key == NOT_READY && + sshdr.asc == 0x04 && + sshdr.ascq == 0x1b) { + + device->device_offline = true; + offline = true; + goto out; /* Keep device offline */ + } + } + +out: + kfree(buffer); + return offline; +} + +static int pqi_get_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *id_phys) +{ + int rc; + + if (device->is_expander_smp_device) + return 0; + + if (pqi_is_logical_device(device)) + rc = pqi_get_logical_device_info(ctrl_info, device); + else + rc = pqi_get_physical_device_info(ctrl_info, device, id_phys); + + return rc; +} + +static void pqi_show_volume_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + char *status; + static const char unknown_state_str[] = + "Volume is in an unknown state (%u)"; + char unknown_state_buffer[sizeof(unknown_state_str) + 10]; + + switch (device->volume_status) { + case CISS_LV_OK: + status = "Volume online"; + break; + case CISS_LV_FAILED: + status = "Volume failed"; + break; + case CISS_LV_NOT_CONFIGURED: + status = "Volume not configured"; + break; + case CISS_LV_DEGRADED: + status = "Volume degraded"; + break; + case CISS_LV_READY_FOR_RECOVERY: + status = "Volume ready for recovery operation"; + break; + case CISS_LV_UNDERGOING_RECOVERY: + status = "Volume undergoing recovery"; + break; + case CISS_LV_WRONG_PHYSICAL_DRIVE_REPLACED: + status = "Wrong physical drive was replaced"; + break; + case CISS_LV_PHYSICAL_DRIVE_CONNECTION_PROBLEM: + status = "A physical drive not properly connected"; + break; + case CISS_LV_HARDWARE_OVERHEATING: + status = "Hardware is overheating"; + break; + case CISS_LV_HARDWARE_HAS_OVERHEATED: + status = "Hardware has overheated"; + break; + case CISS_LV_UNDERGOING_EXPANSION: + status = "Volume undergoing expansion"; + break; + case CISS_LV_NOT_AVAILABLE: + status = "Volume waiting for transforming volume"; + break; + case CISS_LV_QUEUED_FOR_EXPANSION: + status = "Volume queued for expansion"; + break; + case CISS_LV_DISABLED_SCSI_ID_CONFLICT: + status = "Volume disabled due to SCSI ID conflict"; + break; + case CISS_LV_EJECTED: + status = "Volume has been ejected"; + break; + case CISS_LV_UNDERGOING_ERASE: + status = "Volume undergoing background erase"; + break; + case CISS_LV_READY_FOR_PREDICTIVE_SPARE_REBUILD: + status = "Volume ready for predictive spare rebuild"; + break; + case 
CISS_LV_UNDERGOING_RPI: + status = "Volume undergoing rapid parity initialization"; + break; + case CISS_LV_PENDING_RPI: + status = "Volume queued for rapid parity initialization"; + break; + case CISS_LV_ENCRYPTED_NO_KEY: + status = "Encrypted volume inaccessible - key not present"; + break; + case CISS_LV_UNDERGOING_ENCRYPTION: + status = "Volume undergoing encryption process"; + break; + case CISS_LV_UNDERGOING_ENCRYPTION_REKEYING: + status = "Volume undergoing encryption re-keying process"; + break; + case CISS_LV_ENCRYPTED_IN_NON_ENCRYPTED_CONTROLLER: + status = "Volume encrypted but encryption is disabled"; + break; + case CISS_LV_PENDING_ENCRYPTION: + status = "Volume pending migration to encrypted state"; + break; + case CISS_LV_PENDING_ENCRYPTION_REKEYING: + status = "Volume pending encryption rekeying"; + break; + case CISS_LV_NOT_SUPPORTED: + status = "Volume not supported on this controller"; + break; + case CISS_LV_STATUS_UNAVAILABLE: + status = "Volume status not available"; + break; + default: + snprintf(unknown_state_buffer, sizeof(unknown_state_buffer), + unknown_state_str, device->volume_status); + status = unknown_state_buffer; + break; + } + + dev_info(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d %s\n", + ctrl_info->scsi_host->host_no, + device->bus, device->target, device->lun, status); +} + +static void pqi_rescan_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(to_delayed_work(work), struct pqi_ctrl_info, + rescan_work); + + pqi_scan_scsi_devices(ctrl_info); +} + +static int pqi_add_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + + if (pqi_is_logical_device(device)) + rc = scsi_add_device(ctrl_info->scsi_host, device->bus, + device->target, device->lun); + else + rc = pqi_add_sas_device(ctrl_info->sas_host, device); + + return rc; +} + +#define PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS (20 * 1000) + +static inline void pqi_remove_device(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +{ + int rc; + + rc = pqi_device_wait_for_pending_io(ctrl_info, device, + PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d removing device with %d outstanding command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, + device->target, device->lun, + atomic_read(&device->scsi_cmds_outstanding)); + + if (pqi_is_logical_device(device)) + scsi_remove_device(device->sdev); + else + pqi_remove_sas_device(device); + + pqi_device_remove_start(device); +} + +/* Assumes the SCSI device list lock is held. 
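+ * (i.e. ctrl_info->scsi_device_list_lock)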
*/ + +static struct pqi_scsi_dev *pqi_find_scsi_dev(struct pqi_ctrl_info *ctrl_info, + int bus, int target, int lun) +{ + struct pqi_scsi_dev *device; + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) + if (device->bus == bus && device->target == target && device->lun == lun) + return device; + + return NULL; +} + +static inline bool pqi_device_equal(struct pqi_scsi_dev *dev1, struct pqi_scsi_dev *dev2) +{ + if (dev1->is_physical_device != dev2->is_physical_device) + return false; + + if (dev1->is_physical_device) + return memcmp(dev1->wwid, dev2->wwid, sizeof(dev1->wwid)) == 0; + + return memcmp(dev1->volume_id, dev2->volume_id, sizeof(dev1->volume_id)) == 0; +} + +enum pqi_find_result { + DEVICE_NOT_FOUND, + DEVICE_CHANGED, + DEVICE_SAME, +}; + +static enum pqi_find_result pqi_scsi_find_entry(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device_to_find, struct pqi_scsi_dev **matching_device) +{ + struct pqi_scsi_dev *device; + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) { + if (pqi_scsi3addr_equal(device_to_find->scsi3addr, device->scsi3addr)) { + *matching_device = device; + if (pqi_device_equal(device_to_find, device)) { + if (device_to_find->volume_offline) + return DEVICE_CHANGED; + return DEVICE_SAME; + } + return DEVICE_CHANGED; + } + } + + return DEVICE_NOT_FOUND; +} + +static inline const char *pqi_device_type(struct pqi_scsi_dev *device) +{ + if (device->is_expander_smp_device) + return "Enclosure SMP "; + + return scsi_device_type(device->devtype); +} + +#define PQI_DEV_INFO_BUFFER_LENGTH 128 + +static void pqi_dev_info(struct pqi_ctrl_info *ctrl_info, + char *action, struct pqi_scsi_dev *device) +{ + ssize_t count; + char buffer[PQI_DEV_INFO_BUFFER_LENGTH]; + + count = scnprintf(buffer, PQI_DEV_INFO_BUFFER_LENGTH, + "%d:%d:", ctrl_info->scsi_host->host_no, device->bus); + + if (device->target_lun_valid) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "%d:%d", + device->target, + device->lun); + else + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "-:-"); + + if (pqi_is_logical_device(device)) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + " %08x%08x", + *((u32 *)&device->scsi3addr), + *((u32 *)&device->scsi3addr[4])); + else + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + " %016llx%016llx", + get_unaligned_be64(&device->wwid[0]), + get_unaligned_be64(&device->wwid[8])); + + count += scnprintf(buffer + count, PQI_DEV_INFO_BUFFER_LENGTH - count, + " %s %.8s %.16s ", + pqi_device_type(device), + device->vendor, + device->model); + + if (pqi_is_logical_device(device)) { + if (device->devtype == TYPE_DISK) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "SSDSmartPathCap%c En%c %-12s", + device->raid_bypass_configured ? '+' : '-', + device->raid_bypass_enabled ? '+' : '-', + pqi_raid_level_to_string(device->raid_level)); + } else { + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + "AIO%c", device->aio_enabled ? '+' : '-'); + if (device->devtype == TYPE_DISK || + device->devtype == TYPE_ZBC) + count += scnprintf(buffer + count, + PQI_DEV_INFO_BUFFER_LENGTH - count, + " qd=%-6d", device->queue_depth); + } + + dev_info(&ctrl_info->pci_dev->dev, "%s %s\n", action, buffer); +} + +/* Assumes the SCSI device list lock is held. 
*/ + +static void pqi_scsi_update_device(struct pqi_scsi_dev *existing_device, + struct pqi_scsi_dev *new_device) +{ + existing_device->device_type = new_device->device_type; + existing_device->bus = new_device->bus; + if (new_device->target_lun_valid) { + existing_device->target = new_device->target; + existing_device->lun = new_device->lun; + existing_device->target_lun_valid = true; + } + + if ((existing_device->volume_status == CISS_LV_QUEUED_FOR_EXPANSION || + existing_device->volume_status == CISS_LV_UNDERGOING_EXPANSION) && + new_device->volume_status == CISS_LV_OK) + existing_device->rescan = true; + + /* By definition, the scsi3addr and wwid fields are already the same. */ + + existing_device->is_physical_device = new_device->is_physical_device; + existing_device->is_external_raid_device = + new_device->is_external_raid_device; + existing_device->is_expander_smp_device = + new_device->is_expander_smp_device; + existing_device->aio_enabled = new_device->aio_enabled; + memcpy(existing_device->vendor, new_device->vendor, + sizeof(existing_device->vendor)); + memcpy(existing_device->model, new_device->model, + sizeof(existing_device->model)); + existing_device->sas_address = new_device->sas_address; + existing_device->raid_level = new_device->raid_level; + existing_device->queue_depth = new_device->queue_depth; + existing_device->aio_handle = new_device->aio_handle; + existing_device->volume_status = new_device->volume_status; + existing_device->active_path_index = new_device->active_path_index; + existing_device->phy_id = new_device->phy_id; + existing_device->path_map = new_device->path_map; + existing_device->bay = new_device->bay; + existing_device->box_index = new_device->box_index; + existing_device->phys_box_on_bus = new_device->phys_box_on_bus; + existing_device->phy_connected_dev_type = new_device->phy_connected_dev_type; + memcpy(existing_device->box, new_device->box, + sizeof(existing_device->box)); + memcpy(existing_device->phys_connector, new_device->phys_connector, + sizeof(existing_device->phys_connector)); + existing_device->next_bypass_group = 0; + kfree(existing_device->raid_map); + existing_device->raid_map = new_device->raid_map; + existing_device->raid_bypass_configured = + new_device->raid_bypass_configured; + existing_device->raid_bypass_enabled = + new_device->raid_bypass_enabled; + existing_device->device_offline = false; + + /* To prevent this from being freed later. */ + new_device->raid_map = NULL; +} + +static inline void pqi_free_device(struct pqi_scsi_dev *device) +{ + if (device) { + kfree(device->raid_map); + kfree(device); + } +} + +/* + * Called when exposing a new device to the OS fails in order to re-adjust + * our internal SCSI device list to match the SCSI ML's view. + */ + +static inline void pqi_fixup_botched_add(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + unsigned long flags; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_del(&device->scsi_device_list_entry); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + /* Allow the device structure to be freed later. 
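+ * The cleanup at the end of pqi_update_scsi_devices() frees any
+ * entry whose keep_device flag is still false.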
*/ + device->keep_device = false; +} + +static inline bool pqi_is_device_added(struct pqi_scsi_dev *device) +{ + if (device->is_expander_smp_device) + return device->sas_port != NULL; + + return device->sdev != NULL; +} + +static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *new_device_list[], unsigned int num_new_devices) +{ + int rc; + unsigned int i; + unsigned long flags; + enum pqi_find_result find_result; + struct pqi_scsi_dev *device; + struct pqi_scsi_dev *next; + struct pqi_scsi_dev *matching_device; + LIST_HEAD(add_list); + LIST_HEAD(delete_list); + + /* + * The idea here is to do as little work as possible while holding the + * spinlock. That's why we go to great pains to defer anything other + * than updating the internal device list until after we release the + * spinlock. + */ + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + /* Assume that all devices in the existing list have gone away. */ + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) + device->device_gone = true; + + for (i = 0; i < num_new_devices; i++) { + device = new_device_list[i]; + + find_result = pqi_scsi_find_entry(ctrl_info, device, + &matching_device); + + switch (find_result) { + case DEVICE_SAME: + /* + * The newly found device is already in the existing + * device list. + */ + device->new_device = false; + matching_device->device_gone = false; + pqi_scsi_update_device(matching_device, device); + break; + case DEVICE_NOT_FOUND: + /* + * The newly found device is NOT in the existing device + * list. + */ + device->new_device = true; + break; + case DEVICE_CHANGED: + /* + * The original device has gone away and we need to add + * the new device. + */ + device->new_device = true; + break; + default: + BUG(); + break; + } + } + + /* Process all devices that have gone away. */ + list_for_each_entry_safe(device, next, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (device->device_gone) { + list_del(&device->scsi_device_list_entry); + list_add_tail(&device->delete_list_entry, &delete_list); + } + } + + /* Process all new devices. */ + for (i = 0; i < num_new_devices; i++) { + device = new_device_list[i]; + if (!device->new_device) + continue; + if (device->volume_offline) + continue; + list_add_tail(&device->scsi_device_list_entry, + &ctrl_info->scsi_device_list); + list_add_tail(&device->add_list_entry, &add_list); + /* To prevent this device structure from being freed later. */ + device->keep_device = true; + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + if (pqi_ofa_in_progress(ctrl_info)) { + list_for_each_entry_safe(device, next, &delete_list, delete_list_entry) + if (pqi_is_device_added(device)) + pqi_device_remove_start(device); + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + } + + /* Remove all devices that have gone away. */ + list_for_each_entry_safe(device, next, &delete_list, delete_list_entry) { + if (device->volume_offline) { + pqi_dev_info(ctrl_info, "offline", device); + pqi_show_volume_status(ctrl_info, device); + } else { + pqi_dev_info(ctrl_info, "removed", device); + } + if (pqi_is_device_added(device)) + pqi_remove_device(ctrl_info, device); + list_del(&device->delete_list_entry); + pqi_free_device(device); + } + + /* + * Notify the SCSI ML if the queue depth of any existing device has + * changed. 
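+ * Also rescan any device flagged by pqi_scsi_update_device(), e.g.
+ * after a volume expansion completes.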
+ */ + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) { + if (device->sdev && device->queue_depth != device->advertised_queue_depth) { + device->advertised_queue_depth = device->queue_depth; + scsi_change_queue_depth(device->sdev, device->advertised_queue_depth); + } + if (device->rescan) { + scsi_rescan_device(&device->sdev->sdev_gendev); + device->rescan = false; + } + } + + /* Expose any new devices. */ + list_for_each_entry_safe(device, next, &add_list, add_list_entry) { + if (!pqi_is_device_added(device)) { + rc = pqi_add_device(ctrl_info, device); + if (rc == 0) { + pqi_dev_info(ctrl_info, "added", device); + } else { + dev_warn(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d addition failed, device not added\n", + ctrl_info->scsi_host->host_no, + device->bus, device->target, + device->lun); + pqi_fixup_botched_add(ctrl_info, device); + } + } + } +} + +static inline bool pqi_is_supported_device(struct pqi_scsi_dev *device) +{ + /* + * Only support the HBA controller itself as a RAID + * controller. If it's a RAID controller other than + * the HBA itself (an external RAID controller, for + * example), we don't support it. + */ + if (device->device_type == SA_DEVICE_TYPE_CONTROLLER && + !pqi_is_hba_lunid(device->scsi3addr)) + return false; + + return true; +} + +static inline bool pqi_skip_device(u8 *scsi3addr) +{ + /* Ignore all masked devices. */ + if (MASKED_DEVICE(scsi3addr)) + return true; + + return false; +} + +static inline void pqi_mask_device(u8 *scsi3addr) +{ + scsi3addr[3] |= 0xc0; +} + +static inline bool pqi_is_device_with_sas_address(struct pqi_scsi_dev *device) +{ + switch (device->device_type) { + case SA_DEVICE_TYPE_SAS: + case SA_DEVICE_TYPE_EXPANDER_SMP: + case SA_DEVICE_TYPE_SES: + return true; + } + + return false; +} + +static inline bool pqi_is_multipath_device(struct pqi_scsi_dev *device) +{ + if (pqi_is_logical_device(device)) + return false; + + return (device->path_map & (device->path_map - 1)) != 0; +} + +static inline bool pqi_expose_device(struct pqi_scsi_dev *device) +{ + return !device->is_physical_device || !pqi_skip_device(device->scsi3addr); +} + +static inline void pqi_set_physical_device_wwid(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct report_phys_lun_16byte_wwid *phys_lun) +{ + if (ctrl_info->unique_wwid_in_report_phys_lun_supported || + ctrl_info->rpl_extended_format_4_5_supported || + pqi_is_device_with_sas_address(device)) + memcpy(device->wwid, phys_lun->wwid, sizeof(device->wwid)); + else + memcpy(&device->wwid[8], device->page_83_identifier, 8); +} + +static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + int i; + int rc; + LIST_HEAD(new_device_list_head); + struct report_phys_lun_16byte_wwid_list *physdev_list = NULL; + struct report_log_lun_list *logdev_list = NULL; + struct report_phys_lun_16byte_wwid *phys_lun; + struct report_log_lun *log_lun; + struct bmic_identify_physical_device *id_phys = NULL; + u32 num_physicals; + u32 num_logicals; + struct pqi_scsi_dev **new_device_list = NULL; + struct pqi_scsi_dev *device; + struct pqi_scsi_dev *next; + unsigned int num_new_devices; + unsigned int num_valid_devices; + bool is_physical_device; + u8 *scsi3addr; + unsigned int physical_index; + unsigned int logical_index; + static char *out_of_memory_msg = + "failed to allocate memory, device discovery stopped"; + + rc = pqi_get_device_lists(ctrl_info, &physdev_list, &logdev_list); + if (rc) + goto out; + + if (physdev_list) + num_physicals = + 
get_unaligned_be32(&physdev_list->header.list_length) + / sizeof(physdev_list->lun_entries[0]); + else + num_physicals = 0; + + if (logdev_list) + num_logicals = + get_unaligned_be32(&logdev_list->header.list_length) + / sizeof(logdev_list->lun_entries[0]); + else + num_logicals = 0; + + if (num_physicals) { + /* + * We need this buffer for calls to pqi_get_physical_disk_info() + * below. We allocate it here instead of inside + * pqi_get_physical_disk_info() because it's a fairly large + * buffer. + */ + id_phys = kmalloc(sizeof(*id_phys), GFP_KERNEL); + if (!id_phys) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", + out_of_memory_msg); + rc = -ENOMEM; + goto out; + } + + if (pqi_hide_vsep) { + for (i = num_physicals - 1; i >= 0; i--) { + phys_lun = &physdev_list->lun_entries[i]; + if (CISS_GET_DRIVE_NUMBER(phys_lun->lunid) == PQI_VSEP_CISS_BTL) { + pqi_mask_device(phys_lun->lunid); + break; + } + } + } + } + + if (num_logicals && + (logdev_list->header.flags & CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX)) + ctrl_info->lv_drive_type_mix_valid = true; + + num_new_devices = num_physicals + num_logicals; + + new_device_list = kmalloc(sizeof(*new_device_list) * + num_new_devices, GFP_KERNEL); + if (!new_device_list) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", out_of_memory_msg); + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < num_new_devices; i++) { + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", + out_of_memory_msg); + rc = -ENOMEM; + goto out; + } + list_add_tail(&device->new_device_list_entry, + &new_device_list_head); + } + + device = NULL; + num_valid_devices = 0; + physical_index = 0; + logical_index = 0; + + for (i = 0; i < num_new_devices; i++) { + + if ((!pqi_expose_ld_first && i < num_physicals) || + (pqi_expose_ld_first && i >= num_logicals)) { + is_physical_device = true; + phys_lun = &physdev_list->lun_entries[physical_index++]; + log_lun = NULL; + scsi3addr = phys_lun->lunid; + } else { + is_physical_device = false; + phys_lun = NULL; + log_lun = &logdev_list->lun_entries[logical_index++]; + scsi3addr = log_lun->lunid; + } + + if (is_physical_device && pqi_skip_device(scsi3addr)) + continue; + + if (device) + device = list_next_entry(device, new_device_list_entry); + else + device = list_first_entry(&new_device_list_head, + struct pqi_scsi_dev, new_device_list_entry); + + memcpy(device->scsi3addr, scsi3addr, sizeof(device->scsi3addr)); + device->is_physical_device = is_physical_device; + if (is_physical_device) { + device->device_type = phys_lun->device_type; + if (device->device_type == SA_DEVICE_TYPE_EXPANDER_SMP) + device->is_expander_smp_device = true; + } else { + device->is_external_raid_device = + pqi_is_external_raid_addr(scsi3addr); + } + + if (!pqi_is_supported_device(device)) + continue; + + /* Do not present disks that the OS cannot fully probe */ + if (pqi_keep_device_offline(ctrl_info, device)) + continue; + + /* Gather information about the device. 
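+ * (standard INQUIRY and VPD pages for logical devices, BMIC
+ * IDENTIFY PHYSICAL DEVICE for physical drives)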
*/ + rc = pqi_get_device_info(ctrl_info, device, id_phys); + if (rc == -ENOMEM) { + dev_warn(&ctrl_info->pci_dev->dev, "%s\n", + out_of_memory_msg); + goto out; + } + if (rc) { + if (device->is_physical_device) + dev_warn(&ctrl_info->pci_dev->dev, + "obtaining device info failed, skipping physical device %016llx%016llx\n", + get_unaligned_be64(&phys_lun->wwid[0]), + get_unaligned_be64(&phys_lun->wwid[8])); + else + dev_warn(&ctrl_info->pci_dev->dev, + "obtaining device info failed, skipping logical device %08x%08x\n", + *((u32 *)&device->scsi3addr), + *((u32 *)&device->scsi3addr[4])); + rc = 0; + continue; + } + + pqi_assign_bus_target_lun(device); + + if (device->is_physical_device) { + pqi_set_physical_device_wwid(ctrl_info, device, phys_lun); + if ((phys_lun->device_flags & + CISS_REPORT_PHYS_DEV_FLAG_AIO_ENABLED) && + phys_lun->aio_handle) { + device->aio_enabled = true; + device->aio_handle = + phys_lun->aio_handle; + } + } else { + memcpy(device->volume_id, log_lun->volume_id, + sizeof(device->volume_id)); + } + + if (pqi_is_device_with_sas_address(device)) + device->sas_address = get_unaligned_be64(&device->wwid[8]); + + new_device_list[num_valid_devices++] = device; + } + + pqi_update_device_list(ctrl_info, new_device_list, num_valid_devices); + +out: + list_for_each_entry_safe(device, next, &new_device_list_head, + new_device_list_entry) { + if (device->keep_device) + continue; + list_del(&device->new_device_list_entry); + pqi_free_device(device); + } + + kfree(new_device_list); + kfree(physdev_list); + kfree(logdev_list); + kfree(id_phys); + + return rc; +} + +static void pqi_remove_all_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long flags; + struct pqi_scsi_dev *device; + struct pqi_scsi_dev *next; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + list_for_each_entry_safe(device, next, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (pqi_is_device_added(device)) + pqi_remove_device(ctrl_info, device); + list_del(&device->scsi_device_list_entry); + pqi_free_device(device); + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); +} + +#if TORTUGA + +static int pqi_add_controller(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + unsigned long flags; + struct pqi_scsi_dev *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate memory for controller device\n"); + return -ENOMEM; + } + + device->devtype = TYPE_RAID; + pqi_assign_bus_target_lun(device); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_add_tail(&device->scsi_device_list_entry, &ctrl_info->scsi_device_list); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + rc = pqi_add_device(ctrl_info, device); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d addition failed, device not added\n", + ctrl_info->scsi_host->host_no, + device->bus, device->target, + device->lun); + goto error; + } + + return 0; + +error: + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_del(&device->scsi_device_list_entry); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + kfree(device); + + return rc; +} + +static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + + if (list_empty(&ctrl_info->scsi_device_list)) + rc = pqi_add_controller(ctrl_info); + else + rc = 0; + + return rc; +} + +#else + +static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + int 
mutex_acquired; + + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + + mutex_acquired = mutex_trylock(&ctrl_info->scan_mutex); + + if (!mutex_acquired) { + if (pqi_ctrl_scan_blocked(ctrl_info)) + return -EBUSY; + pqi_schedule_rescan_worker_delayed(ctrl_info); + return -EINPROGRESS; + } + + rc = pqi_update_scsi_devices(ctrl_info); + if (rc && !pqi_ctrl_scan_blocked(ctrl_info)) + pqi_schedule_rescan_worker_delayed(ctrl_info); + + mutex_unlock(&ctrl_info->scan_mutex); + + return rc; +} + +#endif + +static void pqi_scan_start(struct Scsi_Host *shost) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = shost_to_hba(shost); + + pqi_scan_scsi_devices(ctrl_info); +} + +/* Returns TRUE if scan is finished. */ + +static int pqi_scan_finished(struct Scsi_Host *shost, + unsigned long elapsed_time) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = shost_priv(shost); + + return !mutex_is_locked(&ctrl_info->scan_mutex); +} + +static inline void pqi_set_encryption_info(struct pqi_encryption_info *encryption_info, + struct raid_map *raid_map, u64 first_block) +{ + u32 volume_blk_size; + + /* + * Set the encryption tweak values based on logical block address. + * If the block size is 512, the tweak value is equal to the LBA. + * For other block sizes, tweak value is (LBA * block size) / 512. + */ + volume_blk_size = get_unaligned_le32(&raid_map->volume_blk_size); + if (volume_blk_size != 512) + first_block = (first_block * volume_blk_size) / 512; + + encryption_info->data_encryption_key_index = + get_unaligned_le16(&raid_map->data_encryption_key_index); + encryption_info->encrypt_tweak_lower = lower_32_bits(first_block); + encryption_info->encrypt_tweak_upper = upper_32_bits(first_block); +} + +/* + * Attempt to perform RAID bypass mapping for a logical volume I/O. + */ + +static bool pqi_aio_raid_level_supported(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct pqi_scsi_dev_raid_map_data *rmd) +{ + bool is_supported = true; + + switch (rmd->raid_level) { + case SA_RAID_0: + break; + case SA_RAID_1: + if (rmd->is_write && (!ctrl_info->enable_r1_writes || + rmd->data_length > ctrl_info->max_write_raid_1_10_2drive)) + is_supported = false; + break; + case SA_RAID_TRIPLE: + if (rmd->is_write && (!ctrl_info->enable_r1_writes || + rmd->data_length > ctrl_info->max_write_raid_1_10_3drive)) + is_supported = false; + break; + case SA_RAID_5: + if (rmd->is_write && (!ctrl_info->enable_r5_writes || + rmd->data_length > ctrl_info->max_write_raid_5_6)) + is_supported = false; + break; + case SA_RAID_6: + if (rmd->is_write && (!ctrl_info->enable_r6_writes || + rmd->data_length > ctrl_info->max_write_raid_5_6)) + is_supported = false; + break; + default: + is_supported = false; + break; + } + + return is_supported; +} + +#define PQI_RAID_BYPASS_INELIGIBLE 1 + +static int pqi_get_aio_lba_and_block_count(struct scsi_cmnd *scmd, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + /* Check for valid opcode, get LBA and block count. 
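+ * Only 6/10/12/16-byte READ and WRITE CDBs are eligible for RAID
+ * bypass; anything else is sent down the normal RAID path.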
*/ + switch (scmd->cmnd[0]) { + case WRITE_6: + rmd->is_write = true; + /* fall through */ + case READ_6: + rmd->first_block = (u64)(((scmd->cmnd[1] & 0x1f) << 16) | + (scmd->cmnd[2] << 8) | scmd->cmnd[3]); + rmd->block_cnt = (u32)scmd->cmnd[4]; + if (rmd->block_cnt == 0) + rmd->block_cnt = 256; + break; + case WRITE_10: + rmd->is_write = true; + /* fall through */ + case READ_10: + rmd->first_block = (u64)get_unaligned_be32(&scmd->cmnd[2]); + rmd->block_cnt = (u32)get_unaligned_be16(&scmd->cmnd[7]); + break; + case WRITE_12: + rmd->is_write = true; + /* fall through */ + case READ_12: + rmd->first_block = (u64)get_unaligned_be32(&scmd->cmnd[2]); + rmd->block_cnt = get_unaligned_be32(&scmd->cmnd[6]); + break; + case WRITE_16: + rmd->is_write = true; + /* fall through */ + case READ_16: + rmd->first_block = get_unaligned_be64(&scmd->cmnd[2]); + rmd->block_cnt = get_unaligned_be32(&scmd->cmnd[10]); + break; + default: + /* Process via normal I/O path. */ + return PQI_RAID_BYPASS_INELIGIBLE; + } + + put_unaligned_le32(scsi_bufflen(scmd), &rmd->data_length); + + return 0; +} + +static int pci_get_aio_common_raid_map_values(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev_raid_map_data *rmd, struct raid_map *raid_map) +{ + rmd->last_block = rmd->first_block + rmd->block_cnt - 1; + + /* Check for invalid block or wraparound. */ + if (rmd->last_block >= + get_unaligned_le64(&raid_map->volume_blk_cnt) || + rmd->last_block < rmd->first_block) + return PQI_RAID_BYPASS_INELIGIBLE; + + rmd->data_disks_per_row = + get_unaligned_le16(&raid_map->data_disks_per_row); + rmd->strip_size = get_unaligned_le16(&raid_map->strip_size); + rmd->layout_map_count = get_unaligned_le16(&raid_map->layout_map_count); + + /* Calculate stripe information for the request. */ + rmd->blocks_per_row = rmd->data_disks_per_row * rmd->strip_size; + rmd->first_row = rmd->first_block / rmd->blocks_per_row; + rmd->last_row = rmd->last_block / rmd->blocks_per_row; + rmd->first_row_offset = (u32)(rmd->first_block - + (rmd->first_row * rmd->blocks_per_row)); + rmd->last_row_offset = (u32)(rmd->last_block - (rmd->last_row * + rmd->blocks_per_row)); + rmd->first_column = rmd->first_row_offset / rmd->strip_size; + rmd->last_column = rmd->last_row_offset / rmd->strip_size; + + /* If this isn't a single row/column then give to the controller. */ + if (rmd->first_row != rmd->last_row || + rmd->first_column != rmd->last_column) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Proceeding with driver mapping. */ + rmd->total_disks_per_row = rmd->data_disks_per_row + + get_unaligned_le16(&raid_map->metadata_disks_per_row); + rmd->map_row = ((u32)(rmd->first_row >> + raid_map->parity_rotation_shift)) % + get_unaligned_le16(&raid_map->row_cnt); + rmd->map_index = (rmd->map_row * rmd->total_disks_per_row) + + rmd->first_column; + + return 0; +} + +static int pqi_calc_aio_r5_or_r6(struct pqi_scsi_dev_raid_map_data *rmd, + struct raid_map *raid_map) +{ + /* RAID 50/60 */ + /* Verify first and last block are in same RAID group. */ + rmd->stripesize = rmd->blocks_per_row * rmd->layout_map_count; + rmd->first_group = (rmd->first_block % + rmd->stripesize) / rmd->blocks_per_row; + rmd->last_group = (rmd->last_block % + rmd->stripesize) / rmd->blocks_per_row; + if (rmd->first_group != rmd->last_group) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Verify request is in a single row of RAID 5/6. 
*/ + rmd->first_row = rmd->r5or6_first_row = + rmd->first_block / rmd->stripesize; + rmd->r5or6_last_row = rmd->last_block / rmd->stripesize; + if (rmd->r5or6_first_row != rmd->r5or6_last_row) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Verify request is in a single column. */ + rmd->first_row_offset = rmd->r5or6_first_row_offset = + (u32)((rmd->first_block % rmd->stripesize) % + rmd->blocks_per_row); + + rmd->r5or6_last_row_offset = + (u32)((rmd->last_block % rmd->stripesize) % + rmd->blocks_per_row); + + rmd->first_column = + rmd->r5or6_first_row_offset / rmd->strip_size; + rmd->r5or6_first_column = rmd->first_column; + rmd->r5or6_last_column = rmd->r5or6_last_row_offset / rmd->strip_size; + if (rmd->r5or6_first_column != rmd->r5or6_last_column) + return PQI_RAID_BYPASS_INELIGIBLE; + + /* Request is eligible. */ + rmd->map_row = + ((u32)(rmd->first_row >> raid_map->parity_rotation_shift)) % + get_unaligned_le16(&raid_map->row_cnt); + + rmd->map_index = (rmd->first_group * + (get_unaligned_le16(&raid_map->row_cnt) * + rmd->total_disks_per_row)) + + (rmd->map_row * rmd->total_disks_per_row) + rmd->first_column; + + if (rmd->is_write) { + u32 index; + + index = DIV_ROUND_UP(rmd->map_index + 1, rmd->total_disks_per_row); + index *= rmd->total_disks_per_row; + index -= get_unaligned_le16(&raid_map->metadata_disks_per_row); + + rmd->p_parity_it_nexus = raid_map->disk_data[index].aio_handle; + if (rmd->raid_level == SA_RAID_6) { + rmd->q_parity_it_nexus = raid_map->disk_data[index + 1].aio_handle; + rmd->xor_mult = raid_map->disk_data[rmd->map_index].xor_mult[1]; + } + if (rmd->blocks_per_row == 0) + return PQI_RAID_BYPASS_INELIGIBLE; + rmd->row = rmd->first_block / rmd->blocks_per_row; + } + + return 0; +} + +static void pqi_set_aio_cdb(struct pqi_scsi_dev_raid_map_data *rmd) +{ + /* Build the new CDB for the physical disk I/O. */ + if (rmd->disk_block > 0xffffffff) { + rmd->cdb[0] = rmd->is_write ? WRITE_16 : READ_16; + rmd->cdb[1] = 0; + put_unaligned_be64(rmd->disk_block, &rmd->cdb[2]); + put_unaligned_be32(rmd->disk_block_cnt, &rmd->cdb[10]); + rmd->cdb[14] = 0; + rmd->cdb[15] = 0; + rmd->cdb_length = 16; + } else { + rmd->cdb[0] = rmd->is_write ? 
WRITE_10 : READ_10; + rmd->cdb[1] = 0; + put_unaligned_be32((u32)rmd->disk_block, &rmd->cdb[2]); + rmd->cdb[6] = 0; + put_unaligned_be16((u16)rmd->disk_block_cnt, &rmd->cdb[7]); + rmd->cdb[9] = 0; + rmd->cdb_length = 10; + } +} + +static void pqi_calc_aio_r1_nexus(struct raid_map *raid_map, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + u32 index; + u32 group; + + group = rmd->map_index / rmd->data_disks_per_row; + + index = rmd->map_index - (group * rmd->data_disks_per_row); + rmd->it_nexus[0] = raid_map->disk_data[index].aio_handle; + index += rmd->data_disks_per_row; + rmd->it_nexus[1] = raid_map->disk_data[index].aio_handle; + if (rmd->layout_map_count > 2) { + index += rmd->data_disks_per_row; + rmd->it_nexus[2] = raid_map->disk_data[index].aio_handle; + } + + rmd->num_it_nexus_entries = rmd->layout_map_count; +} + +static int pqi_raid_bypass_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group) +{ + int rc; + struct raid_map *raid_map; + u32 group; + u32 next_bypass_group; + struct pqi_encryption_info *encryption_info_ptr; + struct pqi_encryption_info encryption_info; + struct pqi_scsi_dev_raid_map_data rmd = { 0 }; + + rc = pqi_get_aio_lba_and_block_count(scmd, &rmd); + if (rc) + return PQI_RAID_BYPASS_INELIGIBLE; + + rmd.raid_level = device->raid_level; + + if (!pqi_aio_raid_level_supported(ctrl_info, device, &rmd)) + return PQI_RAID_BYPASS_INELIGIBLE; + + if (unlikely(rmd.block_cnt == 0)) + return PQI_RAID_BYPASS_INELIGIBLE; + + raid_map = device->raid_map; + + rc = pci_get_aio_common_raid_map_values(ctrl_info, &rmd, raid_map); + if (rc) + return PQI_RAID_BYPASS_INELIGIBLE; + + if (device->raid_level == SA_RAID_1 || + device->raid_level == SA_RAID_TRIPLE) { + if (rmd.is_write) { + pqi_calc_aio_r1_nexus(raid_map, &rmd); + } else { + group = device->next_bypass_group; + next_bypass_group = group + 1; + if (next_bypass_group >= rmd.layout_map_count) + next_bypass_group = 0; + device->next_bypass_group = next_bypass_group; + rmd.map_index += group * rmd.data_disks_per_row; + } + } else if ((device->raid_level == SA_RAID_5 || + device->raid_level == SA_RAID_6) && + (rmd.layout_map_count > 1 || rmd.is_write)) { + rc = pqi_calc_aio_r5_or_r6(&rmd, raid_map); + if (rc) + return PQI_RAID_BYPASS_INELIGIBLE; + } + + if (unlikely(rmd.map_index >= RAID_MAP_MAX_ENTRIES)) + return PQI_RAID_BYPASS_INELIGIBLE; + + rmd.aio_handle = raid_map->disk_data[rmd.map_index].aio_handle; + rmd.disk_block = get_unaligned_le64(&raid_map->disk_starting_blk) + + rmd.first_row * rmd.strip_size + + (rmd.first_row_offset - rmd.first_column * rmd.strip_size); + rmd.disk_block_cnt = rmd.block_cnt; + + /* Handle differing logical/physical block sizes. 
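+ * phys_blk_shift scales the volume-relative block number and count
+ * into physical disk blocks.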
*/ + if (raid_map->phys_blk_shift) { + rmd.disk_block <<= raid_map->phys_blk_shift; + rmd.disk_block_cnt <<= raid_map->phys_blk_shift; + } + + if (unlikely(rmd.disk_block_cnt > 0xffff)) + return PQI_RAID_BYPASS_INELIGIBLE; + + pqi_set_aio_cdb(&rmd); + + if (get_unaligned_le16(&raid_map->flags) & RAID_MAP_ENCRYPTION_ENABLED) { + if (rmd.data_length > device->max_transfer_encrypted) + return PQI_RAID_BYPASS_INELIGIBLE; + pqi_set_encryption_info(&encryption_info, raid_map, rmd.first_block); + encryption_info_ptr = &encryption_info; + } else { + encryption_info_ptr = NULL; + } + + if (rmd.is_write) { + switch (device->raid_level) { + case SA_RAID_1: + case SA_RAID_TRIPLE: + return pqi_aio_submit_r1_write_io(ctrl_info, scmd, queue_group, + encryption_info_ptr, device, &rmd); + case SA_RAID_5: + case SA_RAID_6: + return pqi_aio_submit_r56_write_io(ctrl_info, scmd, queue_group, + encryption_info_ptr, device, &rmd); + } + } + + return pqi_aio_submit_io(ctrl_info, scmd, rmd.aio_handle, + rmd.cdb, rmd.cdb_length, queue_group, + encryption_info_ptr, true, false); +} + +#define PQI_STATUS_IDLE 0x0 + +#define PQI_CREATE_ADMIN_QUEUE_PAIR 1 +#define PQI_DELETE_ADMIN_QUEUE_PAIR 2 + +#define PQI_DEVICE_STATE_POWER_ON_AND_RESET 0x0 +#define PQI_DEVICE_STATE_STATUS_AVAILABLE 0x1 +#define PQI_DEVICE_STATE_ALL_REGISTERS_READY 0x2 +#define PQI_DEVICE_STATE_ADMIN_QUEUE_PAIR_READY 0x3 +#define PQI_DEVICE_STATE_ERROR 0x4 + +#define PQI_MODE_READY_TIMEOUT_SECS (30 * HZ) +#define PQI_MODE_READY_POLL_INTERVAL_MSECS 1 + +static int pqi_wait_for_pqi_mode_ready(struct pqi_ctrl_info *ctrl_info) +{ + struct pqi_device_registers __iomem *pqi_registers; + unsigned long timeout; + u64 signature; + u8 status; + + pqi_registers = ctrl_info->pqi_registers; + timeout = PQI_MODE_READY_TIMEOUT_SECS + jiffies; + + while (1) { + signature = readq(&pqi_registers->signature); + if (memcmp(&signature, PQI_DEVICE_SIGNATURE, + sizeof(signature)) == 0) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for PQI signature\n"); + return -ETIMEDOUT; + } + msleep(PQI_MODE_READY_POLL_INTERVAL_MSECS); + } + + while (1) { + status = readb(&pqi_registers->function_and_status_code); + if (status == PQI_STATUS_IDLE) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for PQI IDLE\n"); + return -ETIMEDOUT; + } + msleep(PQI_MODE_READY_POLL_INTERVAL_MSECS); + } + + while (1) { + if (readl(&pqi_registers->device_status) == + PQI_DEVICE_STATE_ALL_REGISTERS_READY) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for PQI all registers ready\n"); + return -ETIMEDOUT; + } + msleep(PQI_MODE_READY_POLL_INTERVAL_MSECS); + } + + return 0; +} + +static inline void pqi_aio_path_disabled(struct pqi_io_request *io_request) +{ + struct pqi_scsi_dev *device; + + device = io_request->scmd->device->hostdata; + device->raid_bypass_enabled = false; + device->aio_enabled = false; +} + +static inline void pqi_take_device_offline(struct scsi_device *sdev, char *path) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + + device = sdev->hostdata; + if (device->device_offline) + return; + + device->device_offline = true; + ctrl_info = shost_to_hba(sdev->host); + pqi_schedule_rescan_worker(ctrl_info); + dev_err(&ctrl_info->pci_dev->dev, "re-scanning %s scsi %d:%d:%d:%d\n", + path, ctrl_info->scsi_host->host_no, device->bus, + device->target, device->lun); +} + +static void 
pqi_process_raid_io_error(struct pqi_io_request *io_request) +{ + u8 scsi_status; + u8 host_byte; + struct scsi_cmnd *scmd; + struct pqi_raid_error_info *error_info; + size_t sense_data_length; + int residual_count; + int xfer_count; + struct scsi_sense_hdr sshdr; + + scmd = io_request->scmd; + if (!scmd) + return; + + error_info = io_request->error_info; + scsi_status = error_info->status; + host_byte = DID_OK; + + switch (error_info->data_out_result) { + case PQI_DATA_IN_OUT_GOOD: + break; + case PQI_DATA_IN_OUT_UNDERFLOW: + xfer_count = + get_unaligned_le32(&error_info->data_out_transferred); + residual_count = scsi_bufflen(scmd) - xfer_count; + scsi_set_resid(scmd, residual_count); + if (xfer_count < scmd->underflow) + host_byte = DID_SOFT_ERROR; + break; + case PQI_DATA_IN_OUT_UNSOLICITED_ABORT: + case PQI_DATA_IN_OUT_ABORTED: + host_byte = DID_ABORT; + break; + case PQI_DATA_IN_OUT_TIMEOUT: + host_byte = DID_TIME_OUT; + break; + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW: + case PQI_DATA_IN_OUT_PROTOCOL_ERROR: + case PQI_DATA_IN_OUT_BUFFER_ERROR: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_DESCRIPTOR_AREA: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_BRIDGE: + case PQI_DATA_IN_OUT_ERROR: + case PQI_DATA_IN_OUT_HARDWARE_ERROR: + case PQI_DATA_IN_OUT_PCIE_FABRIC_ERROR: + case PQI_DATA_IN_OUT_PCIE_COMPLETION_TIMEOUT: + case PQI_DATA_IN_OUT_PCIE_COMPLETER_ABORT_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_ECRC_CHECK_FAILED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST: + case PQI_DATA_IN_OUT_PCIE_ACS_VIOLATION: + case PQI_DATA_IN_OUT_PCIE_TLP_PREFIX_BLOCKED: + case PQI_DATA_IN_OUT_PCIE_POISONED_MEMORY_READ: + default: + host_byte = DID_ERROR; + break; + } + + sense_data_length = get_unaligned_le16(&error_info->sense_data_length); + if (sense_data_length == 0) + sense_data_length = + get_unaligned_le16(&error_info->response_data_length); + if (sense_data_length) { + if (sense_data_length > sizeof(error_info->data)) + sense_data_length = sizeof(error_info->data); + + if (scsi_status == SAM_STAT_CHECK_CONDITION && + scsi_normalize_sense(error_info->data, + sense_data_length, &sshdr) && + sshdr.sense_key == HARDWARE_ERROR && + sshdr.asc == 0x3e && + sshdr.ascq == 0x1) { + pqi_take_device_offline(scmd->device, "RAID"); + host_byte = DID_NO_CONNECT; + } + + if (sense_data_length > SCSI_SENSE_BUFFERSIZE) + sense_data_length = SCSI_SENSE_BUFFERSIZE; + memcpy(scmd->sense_buffer, error_info->data, + sense_data_length); + } + + scmd->result = scsi_status; + set_host_byte(scmd, host_byte); +} + +static void pqi_process_aio_io_error(struct pqi_io_request *io_request) +{ + u8 scsi_status; + u8 host_byte; + struct scsi_cmnd *scmd; + struct pqi_aio_error_info *error_info; + size_t sense_data_length; + int residual_count; + int xfer_count; + bool device_offline; + struct pqi_scsi_dev *device; + + scmd = io_request->scmd; + error_info = io_request->error_info; + host_byte = DID_OK; + sense_data_length = 0; + device_offline = false; + device = scmd->device->hostdata; + + switch (error_info->service_response) { + case PQI_AIO_SERV_RESPONSE_COMPLETE: + scsi_status = error_info->status; + break; + case PQI_AIO_SERV_RESPONSE_FAILURE: + switch (error_info->status) { + case PQI_AIO_STATUS_IO_ABORTED: + scsi_status = SAM_STAT_TASK_ABORTED; + break; + case PQI_AIO_STATUS_UNDERRUN: + scsi_status = SAM_STAT_GOOD; + residual_count = get_unaligned_le32( + &error_info->residual_count); + scsi_set_resid(scmd, residual_count); + xfer_count = scsi_bufflen(scmd) - residual_count; + if 
(xfer_count < scmd->underflow) + host_byte = DID_SOFT_ERROR; + break; + case PQI_AIO_STATUS_OVERRUN: + scsi_status = SAM_STAT_GOOD; + break; + case PQI_AIO_STATUS_AIO_PATH_DISABLED: + pqi_aio_path_disabled(io_request); + if (pqi_is_multipath_device(device)) { + pqi_device_remove_start(device); + host_byte = DID_NO_CONNECT; + scsi_status = SAM_STAT_CHECK_CONDITION; + } else { + scsi_status = SAM_STAT_GOOD; + io_request->status = -EAGAIN; + } + break; + case PQI_AIO_STATUS_NO_PATH_TO_DEVICE: + case PQI_AIO_STATUS_INVALID_DEVICE: + if (!io_request->raid_bypass) { + device_offline = true; + pqi_take_device_offline(scmd->device, "AIO"); + host_byte = DID_NO_CONNECT; + } + scsi_status = SAM_STAT_CHECK_CONDITION; + break; + case PQI_AIO_STATUS_IO_ERROR: + default: + scsi_status = SAM_STAT_CHECK_CONDITION; + break; + } + break; + case PQI_AIO_SERV_RESPONSE_TMF_COMPLETE: + case PQI_AIO_SERV_RESPONSE_TMF_SUCCEEDED: + scsi_status = SAM_STAT_GOOD; + break; + case PQI_AIO_SERV_RESPONSE_TMF_REJECTED: + case PQI_AIO_SERV_RESPONSE_TMF_INCORRECT_LUN: + default: + scsi_status = SAM_STAT_CHECK_CONDITION; + break; + } + + if (error_info->data_present) { + sense_data_length = + get_unaligned_le16(&error_info->data_length); + if (sense_data_length) { + if (sense_data_length > sizeof(error_info->data)) + sense_data_length = sizeof(error_info->data); + if (sense_data_length > SCSI_SENSE_BUFFERSIZE) + sense_data_length = SCSI_SENSE_BUFFERSIZE; + memcpy(scmd->sense_buffer, error_info->data, + sense_data_length); + } + } + + if (device_offline && sense_data_length == 0) + scsi_build_sense_buffer(0, scmd->sense_buffer, HARDWARE_ERROR, + 0x3e, 0x1); + + scmd->result = scsi_status; + set_host_byte(scmd, host_byte); +} + +static void pqi_process_io_error(unsigned int iu_type, + struct pqi_io_request *io_request) +{ + switch (iu_type) { + case PQI_RESPONSE_IU_RAID_PATH_IO_ERROR: + pqi_process_raid_io_error(io_request); + break; + case PQI_RESPONSE_IU_AIO_PATH_IO_ERROR: + pqi_process_aio_io_error(io_request); + break; + } +} + +static int pqi_interpret_task_management_response(struct pqi_ctrl_info *ctrl_info, + struct pqi_task_management_response *response) +{ + int rc; + + switch (response->response_code) { + case SOP_TMF_COMPLETE: + case SOP_TMF_FUNCTION_SUCCEEDED: + rc = 0; + break; + case SOP_TMF_REJECTED: + rc = -EAGAIN; + break; + default: + rc = -EIO; + break; + } + + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "Task Management Function error: %d (response code: %u)\n", rc, response->response_code); + + return rc; +} + +static inline void pqi_invalid_response(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason) +{ + pqi_take_ctrl_offline(ctrl_info, ctrl_shutdown_reason); +} + +static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue_group *queue_group) +{ + int num_responses; + pqi_index_t oq_pi; + pqi_index_t oq_ci; + struct pqi_io_request *io_request; + struct pqi_io_response *response; + u16 request_id; + + num_responses = 0; + oq_ci = queue_group->oq_ci_copy; + + while (1) { + oq_pi = readl(queue_group->oq_pi); + if (oq_pi >= ctrl_info->num_elements_per_oq) { + pqi_invalid_response(ctrl_info, PQI_IO_PI_OUT_OF_RANGE); + dev_err(&ctrl_info->pci_dev->dev, + "I/O interrupt: producer index (%u) out of range (0-%u): consumer index: %u\n", + oq_pi, ctrl_info->num_elements_per_oq - 1, oq_ci); + return -1; + } + if (oq_pi == oq_ci) + break; + + num_responses++; + response = queue_group->oq_element_array + + (oq_ci * PQI_OPERATIONAL_OQ_ELEMENT_LENGTH); + + 
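+ /*
+ * Validate the request ID from the response element before using it
+ * to index io_request_pool; an out-of-range or unmatched ID takes
+ * the controller offline.
+ */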
request_id = get_unaligned_le16(&response->request_id); + if (request_id >= ctrl_info->max_io_slots) { + pqi_invalid_response(ctrl_info, PQI_INVALID_REQ_ID); + dev_err(&ctrl_info->pci_dev->dev, + "request ID in response (%u) out of range (0-%u): producer index: %u consumer index: %u\n", + request_id, ctrl_info->max_io_slots - 1, oq_pi, oq_ci); + return -1; + } + + io_request = &ctrl_info->io_request_pool[request_id]; + if (atomic_read(&io_request->refcount) == 0) { + pqi_invalid_response(ctrl_info, PQI_UNMATCHED_REQ_ID); + dev_err(&ctrl_info->pci_dev->dev, + "request ID in response (%u) does not match an outstanding I/O request: producer index: %u consumer index: %u\n", + request_id, oq_pi, oq_ci); + return -1; + } + + switch (response->header.iu_type) { + case PQI_RESPONSE_IU_RAID_PATH_IO_SUCCESS: + case PQI_RESPONSE_IU_AIO_PATH_IO_SUCCESS: + if (io_request->scmd) + io_request->scmd->result = 0; + /* fall through */ + case PQI_RESPONSE_IU_GENERAL_MANAGEMENT: + break; + case PQI_RESPONSE_IU_VENDOR_GENERAL: + io_request->status = + get_unaligned_le16( + &((struct pqi_vendor_general_response *)response)->status); + break; + case PQI_RESPONSE_IU_TASK_MANAGEMENT: + io_request->status = pqi_interpret_task_management_response(ctrl_info, + (void *)response); + break; + case PQI_RESPONSE_IU_AIO_PATH_DISABLED: + pqi_aio_path_disabled(io_request); + io_request->status = -EAGAIN; + break; + case PQI_RESPONSE_IU_RAID_PATH_IO_ERROR: + case PQI_RESPONSE_IU_AIO_PATH_IO_ERROR: + io_request->error_info = ctrl_info->error_buffer + + (get_unaligned_le16(&response->error_index) * + PQI_ERROR_BUFFER_ELEMENT_LENGTH); + pqi_process_io_error(response->header.iu_type, io_request); + break; + default: + pqi_invalid_response(ctrl_info, PQI_UNEXPECTED_IU_TYPE); + dev_err(&ctrl_info->pci_dev->dev, + "unexpected IU type: 0x%x: producer index: %u consumer index: %u\n", + response->header.iu_type, oq_pi, oq_ci); + return -1; + } + + io_request->io_complete_callback(io_request, io_request->context); + + /* + * Note that the I/O request structure CANNOT BE TOUCHED after + * returning from the I/O completion callback! 
+ */ + oq_ci = (oq_ci + 1) % ctrl_info->num_elements_per_oq; + } + + if (num_responses) { + queue_group->oq_ci_copy = oq_ci; + writel(oq_ci, queue_group->oq_ci); + } + + return num_responses; +} + +static inline unsigned int pqi_num_elements_free(unsigned int pi, + unsigned int ci, unsigned int elements_in_queue) +{ + unsigned int num_elements_used; + + if (pi >= ci) + num_elements_used = pi - ci; + else + num_elements_used = elements_in_queue - ci + pi; + + return elements_in_queue - num_elements_used - 1; +} + +static void pqi_send_event_ack(struct pqi_ctrl_info *ctrl_info, + struct pqi_event_acknowledge_request *iu, size_t iu_length) +{ + pqi_index_t iq_pi; + pqi_index_t iq_ci; + unsigned long flags; + void *next_element; + struct pqi_queue_group *queue_group; + + queue_group = &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP]; + put_unaligned_le16(queue_group->oq_id, &iu->header.response_queue_id); + + while (1) { + spin_lock_irqsave(&queue_group->submit_lock[RAID_PATH], flags); + + iq_pi = queue_group->iq_pi_copy[RAID_PATH]; + iq_ci = readl(queue_group->iq_ci[RAID_PATH]); + + if (pqi_num_elements_free(iq_pi, iq_ci, + ctrl_info->num_elements_per_iq)) + break; + + spin_unlock_irqrestore( + &queue_group->submit_lock[RAID_PATH], flags); + + if (pqi_ctrl_offline(ctrl_info)) + return; + } + + next_element = queue_group->iq_element_array[RAID_PATH] + + (iq_pi * PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + memcpy(next_element, iu, iu_length); + + iq_pi = (iq_pi + 1) % ctrl_info->num_elements_per_iq; + queue_group->iq_pi_copy[RAID_PATH] = iq_pi; + + /* + * This write notifies the controller that an IU is available to be + * processed. + */ + writel(iq_pi, queue_group->iq_pi[RAID_PATH]); + + spin_unlock_irqrestore(&queue_group->submit_lock[RAID_PATH], flags); +} + +static void pqi_acknowledge_event(struct pqi_ctrl_info *ctrl_info, + struct pqi_event *event) +{ + struct pqi_event_acknowledge_request request; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_ACKNOWLEDGE_VENDOR_EVENT; + put_unaligned_le16(sizeof(request) - PQI_REQUEST_HEADER_LENGTH, + &request.header.iu_length); + request.event_type = event->event_type; + put_unaligned_le16(event->event_id, &request.event_id); + put_unaligned_le16(event->additional_event_id, &request.additional_event_id); + + pqi_send_event_ack(ctrl_info, &request, sizeof(request)); +} + +#define PQI_SOFT_RESET_STATUS_TIMEOUT_SECS (30 * HZ) +#define PQI_SOFT_RESET_STATUS_POLL_INTERVAL_SECS 1 + +static enum pqi_soft_reset_status pqi_poll_for_soft_reset_status( + struct pqi_ctrl_info *ctrl_info) +{ + u8 status; + unsigned long timeout; + + timeout = PQI_SOFT_RESET_STATUS_TIMEOUT_SECS + jiffies; + + while (1) { + status = pqi_read_soft_reset_status(ctrl_info); + if (status & PQI_SOFT_RESET_INITIATE) + return RESET_INITIATE_DRIVER; + + if (status & PQI_SOFT_RESET_ABORT) + return RESET_ABORT; + + if (!sis_is_firmware_running(ctrl_info)) + return RESET_NORESPONSE; + + if (time_after(jiffies, timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "timed out waiting for soft reset status\n"); + return RESET_TIMEDOUT; + } + + ssleep(PQI_SOFT_RESET_STATUS_POLL_INTERVAL_SECS); + } +} + +static void pqi_process_soft_reset(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + unsigned int delay_secs; + enum pqi_soft_reset_status reset_status; + + if (ctrl_info->soft_reset_handshake_supported) + reset_status = pqi_poll_for_soft_reset_status(ctrl_info); + else + reset_status = RESET_INITIATE_FIRMWARE; + + delay_secs = PQI_POST_RESET_DELAY_SECS; + + switch 
(reset_status) { + case RESET_TIMEDOUT: + delay_secs = PQI_POST_OFA_RESET_DELAY_UPON_TIMEOUT_SECS; + /* fall through */ + case RESET_INITIATE_DRIVER: + dev_info(&ctrl_info->pci_dev->dev, + "Online Firmware Activation: resetting controller\n"); + sis_soft_reset(ctrl_info); + /* fall through */ + case RESET_INITIATE_FIRMWARE: + ctrl_info->pqi_mode_enabled = false; + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + rc = pqi_ofa_ctrl_restart(ctrl_info, delay_secs); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + dev_info(&ctrl_info->pci_dev->dev, + "Online Firmware Activation: %s\n", + rc == 0 ? "SUCCESS" : "FAILED"); + break; + case RESET_ABORT: + dev_info(&ctrl_info->pci_dev->dev, + "Online Firmware Activation ABORTED\n"); + if (ctrl_info->soft_reset_handshake_supported) + pqi_clear_soft_reset_status(ctrl_info); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + pqi_ofa_ctrl_unquiesce(ctrl_info); + break; + case RESET_NORESPONSE: + default: + dev_err(&ctrl_info->pci_dev->dev, + "unexpected Online Firmware Activation reset status: 0x%x\n", + reset_status); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + pqi_ofa_ctrl_unquiesce(ctrl_info); + pqi_take_ctrl_offline(ctrl_info, PQI_OFA_RESPONSE_TIMEOUT); + break; + } +} + +static void pqi_ofa_memory_alloc_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(work, struct pqi_ctrl_info, ofa_memory_alloc_work); + + pqi_ctrl_ofa_start(ctrl_info); + pqi_ofa_setup_host_buffer(ctrl_info); + pqi_ofa_host_memory_update(ctrl_info); +} + +static void pqi_ofa_quiesce_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_event *event; + + ctrl_info = container_of(work, struct pqi_ctrl_info, ofa_quiesce_work); + + event = &ctrl_info->events[pqi_event_type_to_event_index(PQI_EVENT_TYPE_OFA)]; + + pqi_ofa_ctrl_quiesce(ctrl_info); + pqi_acknowledge_event(ctrl_info, event); + pqi_process_soft_reset(ctrl_info); +} + +static bool pqi_ofa_process_event(struct pqi_ctrl_info *ctrl_info, + struct pqi_event *event) +{ + bool ack_event; + + ack_event = true; + + switch (event->event_id) { + case PQI_EVENT_OFA_MEMORY_ALLOCATION: + dev_info(&ctrl_info->pci_dev->dev, + "received Online Firmware Activation memory allocation request\n"); + schedule_work(&ctrl_info->ofa_memory_alloc_work); + break; + case PQI_EVENT_OFA_QUIESCE: + dev_info(&ctrl_info->pci_dev->dev, + "received Online Firmware Activation quiesce request\n"); + schedule_work(&ctrl_info->ofa_quiesce_work); + ack_event = false; + break; + case PQI_EVENT_OFA_CANCELED: + dev_info(&ctrl_info->pci_dev->dev, + "received Online Firmware Activation cancel request: reason: %u\n", + ctrl_info->ofa_cancel_reason); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "received unknown Online Firmware Activation request: event ID: %u\n", + event->event_id); + break; + } + + return ack_event; +} + +static void pqi_event_worker(struct work_struct *work) +{ + unsigned int i; + bool rescan_needed; + struct pqi_ctrl_info *ctrl_info; + struct pqi_event *event; + bool ack_event; + + ctrl_info = container_of(work, struct pqi_ctrl_info, event_work); + + pqi_ctrl_busy(ctrl_info); + pqi_wait_if_ctrl_blocked(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + goto out; + + rescan_needed = false; + event = ctrl_info->events; + for (i = 0; i < PQI_NUM_SUPPORTED_EVENTS; i++) { + if (event->pending) { + event->pending = false; + if 
(event->event_type == PQI_EVENT_TYPE_OFA) { + ack_event = pqi_ofa_process_event(ctrl_info, event); + } else { + ack_event = true; + rescan_needed = true; + } + if (ack_event) + pqi_acknowledge_event(ctrl_info, event); + } + event++; + } + + if (rescan_needed) + pqi_schedule_rescan_worker_delayed(ctrl_info); + +out: + pqi_ctrl_unbusy(ctrl_info); +} + +#define PQI_HEARTBEAT_TIMER_INTERVAL (10 * HZ) + +static void pqi_heartbeat_timer_handler(struct timer_list *t) +{ + int num_interrupts; + u32 heartbeat_count; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = from_timer(ctrl_info, t, heartbeat_timer); + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return; + + num_interrupts = atomic_read(&ctrl_info->num_interrupts); + heartbeat_count = pqi_read_heartbeat_counter(ctrl_info); + + if (num_interrupts == ctrl_info->previous_num_interrupts) { + if (heartbeat_count == ctrl_info->previous_heartbeat_count) { + dev_err(&ctrl_info->pci_dev->dev, + "no heartbeat detected - last heartbeat count: %u\n", + heartbeat_count); + pqi_take_ctrl_offline(ctrl_info, PQI_NO_HEARTBEAT); + return; + } + } else + ctrl_info->previous_num_interrupts = num_interrupts; + + ctrl_info->previous_heartbeat_count = heartbeat_count; + mod_timer(&ctrl_info->heartbeat_timer, + jiffies + PQI_HEARTBEAT_TIMER_INTERVAL); +} + +static void pqi_start_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) +{ + if (!ctrl_info->heartbeat_counter) + return; + + ctrl_info->previous_num_interrupts = + atomic_read(&ctrl_info->num_interrupts); + ctrl_info->previous_heartbeat_count = + pqi_read_heartbeat_counter(ctrl_info); + + ctrl_info->heartbeat_timer.expires = + jiffies + PQI_HEARTBEAT_TIMER_INTERVAL; + add_timer(&ctrl_info->heartbeat_timer); +} + +static inline void pqi_stop_heartbeat_timer(struct pqi_ctrl_info *ctrl_info) +{ + del_timer_sync(&ctrl_info->heartbeat_timer); +} + +static void pqi_ofa_capture_event_payload(struct pqi_ctrl_info *ctrl_info, + struct pqi_event *event, struct pqi_event_response *response) +{ + switch (event->event_id) { + case PQI_EVENT_OFA_MEMORY_ALLOCATION: + ctrl_info->ofa_bytes_requested = + get_unaligned_le32(&response->data.ofa_memory_allocation.bytes_requested); + break; + case PQI_EVENT_OFA_CANCELED: + ctrl_info->ofa_cancel_reason = + get_unaligned_le16(&response->data.ofa_cancelled.reason); + break; + } +} + +static int pqi_process_event_intr(struct pqi_ctrl_info *ctrl_info) +{ + int num_events; + pqi_index_t oq_pi; + pqi_index_t oq_ci; + struct pqi_event_queue *event_queue; + struct pqi_event_response *response; + struct pqi_event *event; + int event_index; + + event_queue = &ctrl_info->event_queue; + num_events = 0; + oq_ci = event_queue->oq_ci_copy; + + while (1) { + oq_pi = readl(event_queue->oq_pi); + if (oq_pi >= PQI_NUM_EVENT_QUEUE_ELEMENTS) { + pqi_invalid_response(ctrl_info, PQI_EVENT_PI_OUT_OF_RANGE); + dev_err(&ctrl_info->pci_dev->dev, + "event interrupt: producer index (%u) out of range (0-%u): consumer index: %u\n", + oq_pi, PQI_NUM_EVENT_QUEUE_ELEMENTS - 1, oq_ci); + return -1; + } + + if (oq_pi == oq_ci) + break; + + num_events++; + response = event_queue->oq_element_array + (oq_ci * PQI_EVENT_OQ_ELEMENT_LENGTH); + + event_index = pqi_event_type_to_event_index(response->event_type); + + if (event_index >= 0 && response->request_acknowledge) { + event = &ctrl_info->events[event_index]; + event->pending = true; + event->event_type = response->event_type; + event->event_id = get_unaligned_le16(&response->event_id); + event->additional_event_id = + 
get_unaligned_le32(&response->additional_event_id); + if (event->event_type == PQI_EVENT_TYPE_OFA) + pqi_ofa_capture_event_payload(ctrl_info, event, response); + } + + oq_ci = (oq_ci + 1) % PQI_NUM_EVENT_QUEUE_ELEMENTS; + } + + if (num_events) { + event_queue->oq_ci_copy = oq_ci; + writel(oq_ci, event_queue->oq_ci); + schedule_work(&ctrl_info->event_work); + } + + return num_events; +} + +#define PQI_LEGACY_INTX_MASK 0x1 + +static inline void pqi_configure_legacy_intx(struct pqi_ctrl_info *ctrl_info, bool enable_intx) +{ + u32 intx_mask; + struct pqi_device_registers __iomem *pqi_registers; + volatile void __iomem *register_addr; + + pqi_registers = ctrl_info->pqi_registers; + + if (enable_intx) + register_addr = &pqi_registers->legacy_intx_mask_clear; + else + register_addr = &pqi_registers->legacy_intx_mask_set; + + intx_mask = readl(register_addr); + intx_mask |= PQI_LEGACY_INTX_MASK; + writel(intx_mask, register_addr); +} + +static void pqi_change_irq_mode(struct pqi_ctrl_info *ctrl_info, + enum pqi_irq_mode new_mode) +{ + switch (ctrl_info->irq_mode) { + case IRQ_MODE_MSIX: + switch (new_mode) { + case IRQ_MODE_MSIX: + break; + case IRQ_MODE_INTX: + pqi_configure_legacy_intx(ctrl_info, true); + sis_enable_intx(ctrl_info); + break; + case IRQ_MODE_NONE: + break; + } + break; + case IRQ_MODE_INTX: + switch (new_mode) { + case IRQ_MODE_MSIX: + pqi_configure_legacy_intx(ctrl_info, false); + sis_enable_msix(ctrl_info); + break; + case IRQ_MODE_INTX: + break; + case IRQ_MODE_NONE: + pqi_configure_legacy_intx(ctrl_info, false); + break; + } + break; + case IRQ_MODE_NONE: + switch (new_mode) { + case IRQ_MODE_MSIX: + sis_enable_msix(ctrl_info); + break; + case IRQ_MODE_INTX: + pqi_configure_legacy_intx(ctrl_info, true); + sis_enable_intx(ctrl_info); + break; + case IRQ_MODE_NONE: + break; + } + break; + } + + ctrl_info->irq_mode = new_mode; +} + +#define PQI_LEGACY_INTX_PENDING 0x1 + +static inline bool pqi_is_valid_irq(struct pqi_ctrl_info *ctrl_info) +{ + bool valid_irq; + u32 intx_status; + + switch (ctrl_info->irq_mode) { + case IRQ_MODE_MSIX: + valid_irq = true; + break; + case IRQ_MODE_INTX: + intx_status = readl(&ctrl_info->pqi_registers->legacy_intx_status); + if (intx_status & PQI_LEGACY_INTX_PENDING) + valid_irq = true; + else + valid_irq = false; + break; + case IRQ_MODE_NONE: + default: + valid_irq = false; + break; + } + + return valid_irq; +} + +static irqreturn_t pqi_irq_handler(int irq, void *data) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_queue_group *queue_group; + int num_io_responses_handled; + int num_events_handled; + + queue_group = data; + ctrl_info = queue_group->ctrl_info; + + if (!pqi_is_valid_irq(ctrl_info)) + return IRQ_NONE; + + num_io_responses_handled = pqi_process_io_intr(ctrl_info, queue_group); + if (num_io_responses_handled < 0) + goto out; + + if (irq == ctrl_info->event_irq) { + num_events_handled = pqi_process_event_intr(ctrl_info); + if (num_events_handled < 0) + goto out; + } else { + num_events_handled = 0; + } + + if (num_io_responses_handled + num_events_handled > 0) + atomic_inc(&ctrl_info->num_interrupts); + + pqi_start_io(ctrl_info, queue_group, RAID_PATH, NULL); + pqi_start_io(ctrl_info, queue_group, AIO_PATH, NULL); + +out: + return IRQ_HANDLED; +} + + +static int pqi_request_irqs(struct pqi_ctrl_info *ctrl_info) +{ + struct pci_dev *pci_dev = ctrl_info->pci_dev; + int i; + int rc; + + ctrl_info->event_irq = pqi_pci_irq_vector(pci_dev, 0); + + for (i = 0; i < ctrl_info->num_msix_vectors_enabled; i++) { + rc = 
request_irq(pqi_pci_irq_vector(pci_dev, i), pqi_irq_handler, 0, + DRIVER_NAME_SHORT, pqi_get_irq_cookie(ctrl_info, i)); + if (rc) { + dev_err(&pci_dev->dev, + "irq %u init failed with error %d\n", + pqi_pci_irq_vector(pci_dev, i), rc); + return rc; + } + ctrl_info->num_msix_vectors_initialized++; + } + + return 0; +} + +static void pqi_free_irqs(struct pqi_ctrl_info *ctrl_info) +{ + int i; + + for (i = 0; i < ctrl_info->num_msix_vectors_initialized; i++) + free_irq(pqi_pci_irq_vector(ctrl_info->pci_dev, i), + pqi_get_irq_cookie(ctrl_info, i)); + + ctrl_info->num_msix_vectors_initialized = 0; +} + +static int pqi_enable_msix_interrupts(struct pqi_ctrl_info *ctrl_info) +{ + int num_vectors_enabled; + + num_vectors_enabled = pqi_pci_alloc_irq_vectors(ctrl_info->pci_dev, + PQI_MIN_MSIX_VECTORS, ctrl_info->num_queue_groups, + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY); + + if (num_vectors_enabled < 0) { + dev_err(&ctrl_info->pci_dev->dev, + "MSI-X init failed with error %d\n", + num_vectors_enabled); + return num_vectors_enabled; + } + + ctrl_info->num_msix_vectors_enabled = num_vectors_enabled; + ctrl_info->irq_mode = IRQ_MODE_MSIX; + + return 0; +} + +static void pqi_disable_msix_interrupts(struct pqi_ctrl_info *ctrl_info) +{ + if (ctrl_info->num_msix_vectors_enabled) { + pqi_pci_free_irq_vectors(ctrl_info->pci_dev); + ctrl_info->num_msix_vectors_enabled = 0; + } +} + +static int pqi_alloc_operational_queues(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + size_t alloc_length; + size_t element_array_length_per_iq; + size_t element_array_length_per_oq; + void *element_array; + void __iomem *next_queue_index; + void *aligned_pointer; + unsigned int num_inbound_queues; + unsigned int num_outbound_queues; + unsigned int num_queue_indexes; + struct pqi_queue_group *queue_group; + + element_array_length_per_iq = + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH * + ctrl_info->num_elements_per_iq; + element_array_length_per_oq = + PQI_OPERATIONAL_OQ_ELEMENT_LENGTH * + ctrl_info->num_elements_per_oq; + num_inbound_queues = ctrl_info->num_queue_groups * 2; + num_outbound_queues = ctrl_info->num_queue_groups; + num_queue_indexes = (ctrl_info->num_queue_groups * 3) + 1; + + aligned_pointer = NULL; + + for (i = 0; i < num_inbound_queues; i++) { + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + aligned_pointer += element_array_length_per_iq; + } + + for (i = 0; i < num_outbound_queues; i++) { + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + aligned_pointer += element_array_length_per_oq; + } + + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + aligned_pointer += PQI_NUM_EVENT_QUEUE_ELEMENTS * + PQI_EVENT_OQ_ELEMENT_LENGTH; + + for (i = 0; i < num_queue_indexes; i++) { + aligned_pointer = PTR_ALIGN(aligned_pointer, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + aligned_pointer += sizeof(pqi_index_t); + } + + alloc_length = (size_t)aligned_pointer + + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT; + + alloc_length += PQI_EXTRA_SGL_MEMORY; + + ctrl_info->queue_memory_base = + dma_zalloc_coherent(&ctrl_info->pci_dev->dev, + alloc_length, + &ctrl_info->queue_memory_base_dma_handle, GFP_KERNEL); + + if (!ctrl_info->queue_memory_base) + return -ENOMEM; + + ctrl_info->queue_memory_length = alloc_length; + + element_array = PTR_ALIGN(ctrl_info->queue_memory_base, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + queue_group->iq_element_array[RAID_PATH] 
= element_array; + queue_group->iq_element_array_bus_addr[RAID_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += element_array_length_per_iq; + element_array = PTR_ALIGN(element_array, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + queue_group->iq_element_array[AIO_PATH] = element_array; + queue_group->iq_element_array_bus_addr[AIO_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += element_array_length_per_iq; + element_array = PTR_ALIGN(element_array, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + } + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + queue_group->oq_element_array = element_array; + queue_group->oq_element_array_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += element_array_length_per_oq; + element_array = PTR_ALIGN(element_array, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + } + + ctrl_info->event_queue.oq_element_array = element_array; + ctrl_info->event_queue.oq_element_array_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (element_array - ctrl_info->queue_memory_base); + element_array += PQI_NUM_EVENT_QUEUE_ELEMENTS * + PQI_EVENT_OQ_ELEMENT_LENGTH; + + next_queue_index = (void __iomem *)PTR_ALIGN(element_array, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + queue_group->iq_ci[RAID_PATH] = next_queue_index; + queue_group->iq_ci_bus_addr[RAID_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + next_queue_index += sizeof(pqi_index_t); + next_queue_index = PTR_ALIGN(next_queue_index, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + queue_group->iq_ci[AIO_PATH] = next_queue_index; + queue_group->iq_ci_bus_addr[AIO_PATH] = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + next_queue_index += sizeof(pqi_index_t); + next_queue_index = PTR_ALIGN(next_queue_index, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + queue_group->oq_pi = next_queue_index; + queue_group->oq_pi_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + next_queue_index += sizeof(pqi_index_t); + next_queue_index = PTR_ALIGN(next_queue_index, + PQI_OPERATIONAL_INDEX_ALIGNMENT); + } + + ctrl_info->event_queue.oq_pi = next_queue_index; + ctrl_info->event_queue.oq_pi_bus_addr = + ctrl_info->queue_memory_base_dma_handle + + (next_queue_index - + (void __iomem *)ctrl_info->queue_memory_base); + + return 0; +} + +static void pqi_init_operational_queues(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + u16 next_iq_id = PQI_MIN_OPERATIONAL_QUEUE_ID; + u16 next_oq_id = PQI_MIN_OPERATIONAL_QUEUE_ID; + + /* + * Initialize the backpointers to the controller structure in + * each operational queue group structure. + */ + for (i = 0; i < ctrl_info->num_queue_groups; i++) + ctrl_info->queue_groups[i].ctrl_info = ctrl_info; + + /* + * Assign IDs to all operational queues. Note that the IDs + * assigned to operational IQs are independent of the IDs + * assigned to operational OQs. 
+ */ + ctrl_info->event_queue.oq_id = next_oq_id++; + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + ctrl_info->queue_groups[i].iq_id[RAID_PATH] = next_iq_id++; + ctrl_info->queue_groups[i].iq_id[AIO_PATH] = next_iq_id++; + ctrl_info->queue_groups[i].oq_id = next_oq_id++; + } + + /* + * Assign MSI-X table entry indexes to all queues. Note that the + * interrupt for the event queue is shared with the first queue group. + */ + ctrl_info->event_queue.int_msg_num = 0; + for (i = 0; i < ctrl_info->num_queue_groups; i++) + ctrl_info->queue_groups[i].int_msg_num = i; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + spin_lock_init(&ctrl_info->queue_groups[i].submit_lock[0]); + spin_lock_init(&ctrl_info->queue_groups[i].submit_lock[1]); + INIT_LIST_HEAD(&ctrl_info->queue_groups[i].request_list[0]); + INIT_LIST_HEAD(&ctrl_info->queue_groups[i].request_list[1]); + } +} + +static int pqi_alloc_admin_queues(struct pqi_ctrl_info *ctrl_info) +{ + size_t alloc_length; + struct pqi_admin_queues_aligned *admin_queues_aligned; + struct pqi_admin_queues *admin_queues; + + alloc_length = sizeof(struct pqi_admin_queues_aligned) + + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT; + + ctrl_info->admin_queue_memory_base = + dma_zalloc_coherent(&ctrl_info->pci_dev->dev, + alloc_length, + &ctrl_info->admin_queue_memory_base_dma_handle, + GFP_KERNEL); + + if (!ctrl_info->admin_queue_memory_base) + return -ENOMEM; + + ctrl_info->admin_queue_memory_length = alloc_length; + + admin_queues = &ctrl_info->admin_queues; + admin_queues_aligned = PTR_ALIGN(ctrl_info->admin_queue_memory_base, + PQI_QUEUE_ELEMENT_ARRAY_ALIGNMENT); + admin_queues->iq_element_array = + &admin_queues_aligned->iq_element_array; + admin_queues->oq_element_array = + &admin_queues_aligned->oq_element_array; + admin_queues->iq_ci = &admin_queues_aligned->iq_ci; + admin_queues->oq_pi = + (pqi_index_t __iomem *)&admin_queues_aligned->oq_pi; + + admin_queues->iq_element_array_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + (admin_queues->iq_element_array - + ctrl_info->admin_queue_memory_base); + admin_queues->oq_element_array_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + (admin_queues->oq_element_array - + ctrl_info->admin_queue_memory_base); + admin_queues->iq_ci_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + ((void *)admin_queues->iq_ci - + ctrl_info->admin_queue_memory_base); + admin_queues->oq_pi_bus_addr = + ctrl_info->admin_queue_memory_base_dma_handle + + ((void __iomem *)admin_queues->oq_pi - + (void __iomem *)ctrl_info->admin_queue_memory_base); + + return 0; +} + +#define PQI_ADMIN_QUEUE_CREATE_TIMEOUT_JIFFIES HZ +#define PQI_ADMIN_QUEUE_CREATE_POLL_INTERVAL_MSECS 1 + +static int pqi_create_admin_queues(struct pqi_ctrl_info *ctrl_info) +{ + struct pqi_device_registers __iomem *pqi_registers; + struct pqi_admin_queues *admin_queues; + unsigned long timeout; + u8 status; + u32 reg; + + pqi_registers = ctrl_info->pqi_registers; + admin_queues = &ctrl_info->admin_queues; + + writeq((u64)admin_queues->iq_element_array_bus_addr, + &pqi_registers->admin_iq_element_array_addr); + writeq((u64)admin_queues->oq_element_array_bus_addr, + &pqi_registers->admin_oq_element_array_addr); + writeq((u64)admin_queues->iq_ci_bus_addr, + &pqi_registers->admin_iq_ci_addr); + writeq((u64)admin_queues->oq_pi_bus_addr, + &pqi_registers->admin_oq_pi_addr); + + reg = PQI_ADMIN_IQ_NUM_ELEMENTS | + (PQI_ADMIN_OQ_NUM_ELEMENTS << 8) | + (admin_queues->int_msg_num << 16); + writel(reg, &pqi_registers->admin_iq_num_elements); + 
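+	/*
+	 * Kick off creation of the admin queue pair; completion is detected
+	 * below by polling the function and status code register until the
+	 * controller reports idle.
+	 */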
+ writel(PQI_CREATE_ADMIN_QUEUE_PAIR, + &pqi_registers->function_and_status_code); + + timeout = PQI_ADMIN_QUEUE_CREATE_TIMEOUT_JIFFIES + jiffies; + while (1) { + msleep(PQI_ADMIN_QUEUE_CREATE_POLL_INTERVAL_MSECS); + status = readb(&pqi_registers->function_and_status_code); + if (status == PQI_STATUS_IDLE) + break; + if (time_after(jiffies, timeout)) + return -ETIMEDOUT; + } + + /* + * The offset registers are not initialized to the correct + * offsets until *after* the create admin queue pair command + * completes successfully. + */ + admin_queues->iq_pi = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + readq(&pqi_registers->admin_iq_pi_offset); + admin_queues->oq_ci = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + readq(&pqi_registers->admin_oq_ci_offset); + + return 0; +} + +static void pqi_submit_admin_request(struct pqi_ctrl_info *ctrl_info, + struct pqi_general_admin_request *request) +{ + struct pqi_admin_queues *admin_queues; + void *next_element; + pqi_index_t iq_pi; + + admin_queues = &ctrl_info->admin_queues; + iq_pi = admin_queues->iq_pi_copy; + + next_element = admin_queues->iq_element_array + + (iq_pi * PQI_ADMIN_IQ_ELEMENT_LENGTH); + + memcpy(next_element, request, sizeof(*request)); + + iq_pi = (iq_pi + 1) % PQI_ADMIN_IQ_NUM_ELEMENTS; + admin_queues->iq_pi_copy = iq_pi; + + /* + * This write notifies the controller that an IU is available to be + * processed. + */ + writel(iq_pi, admin_queues->iq_pi); +} + +#define PQI_ADMIN_REQUEST_TIMEOUT_SECS (60 * HZ) + +static int pqi_poll_for_admin_response(struct pqi_ctrl_info *ctrl_info, + struct pqi_general_admin_response *response) +{ + struct pqi_admin_queues *admin_queues; + pqi_index_t oq_pi; + pqi_index_t oq_ci; + unsigned long timeout; + + admin_queues = &ctrl_info->admin_queues; + oq_ci = admin_queues->oq_ci_copy; + + timeout = PQI_ADMIN_REQUEST_TIMEOUT_SECS + jiffies; + + while (1) { + oq_pi = readl(admin_queues->oq_pi); + if (oq_pi != oq_ci) + break; + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for admin response\n"); + return -ETIMEDOUT; + } + if (!sis_is_firmware_running(ctrl_info)) + return -ENXIO; + msleep(1); + } + + memcpy(response, admin_queues->oq_element_array + + (oq_ci * PQI_ADMIN_OQ_ELEMENT_LENGTH), sizeof(*response)); + + oq_ci = (oq_ci + 1) % PQI_ADMIN_OQ_NUM_ELEMENTS; + admin_queues->oq_ci_copy = oq_ci; + writel(oq_ci, admin_queues->oq_ci); + + return 0; +} + +static void pqi_start_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_queue_group *queue_group, enum pqi_io_path path, + struct pqi_io_request *io_request) +{ + struct pqi_io_request *next; + void *next_element; + pqi_index_t iq_pi; + pqi_index_t iq_ci; + size_t iu_length; + unsigned long flags; + unsigned int num_elements_needed; + unsigned int num_elements_to_end_of_queue; + size_t copy_count; + struct pqi_iu_header *request; + + spin_lock_irqsave(&queue_group->submit_lock[path], flags); + + if (io_request) { + io_request->queue_group = queue_group; + list_add_tail(&io_request->request_list_entry, + &queue_group->request_list[path]); + } + + iq_pi = queue_group->iq_pi_copy[path]; + + list_for_each_entry_safe(io_request, next, + &queue_group->request_list[path], request_list_entry) { + + request = io_request->iu; + + iu_length = get_unaligned_le16(&request->iu_length) + + PQI_REQUEST_HEADER_LENGTH; + num_elements_needed = + DIV_ROUND_UP(iu_length, + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + iq_ci = readl(queue_group->iq_ci[path]); + + if (num_elements_needed > 
pqi_num_elements_free(iq_pi, iq_ci, + ctrl_info->num_elements_per_iq)) + break; + + put_unaligned_le16(queue_group->oq_id, + &request->response_queue_id); + + next_element = queue_group->iq_element_array[path] + + (iq_pi * PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + num_elements_to_end_of_queue = + ctrl_info->num_elements_per_iq - iq_pi; + + if (num_elements_needed <= num_elements_to_end_of_queue) { + memcpy(next_element, request, iu_length); + } else { + copy_count = num_elements_to_end_of_queue * + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH; + memcpy(next_element, request, copy_count); + memcpy(queue_group->iq_element_array[path], + (u8 *)request + copy_count, + iu_length - copy_count); + } + + iq_pi = (iq_pi + num_elements_needed) % + ctrl_info->num_elements_per_iq; + + list_del(&io_request->request_list_entry); + } + + if (iq_pi != queue_group->iq_pi_copy[path]) { + queue_group->iq_pi_copy[path] = iq_pi; + /* + * This write notifies the controller that one or more IUs are + * available to be processed. + */ + writel(iq_pi, queue_group->iq_pi[path]); + } + + spin_unlock_irqrestore(&queue_group->submit_lock[path], flags); +} + +#define PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS (10 * HZ) + +static int pqi_wait_for_completion_io(struct pqi_ctrl_info *ctrl_info, + struct completion *wait) +{ + int rc; + + while (1) { + if (wait_for_completion_io_timeout(wait, + PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS)) { + rc = 0; + break; + } + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + break; + } + } + + return rc; +} + +static void pqi_raid_synchronous_complete(struct pqi_io_request *io_request, + void *context) +{ + struct completion *waiting = context; + + complete(waiting); +} + +static int pqi_process_raid_io_error_synchronous( + struct pqi_raid_error_info *error_info) +{ + int rc = -EIO; + + switch (error_info->data_out_result) { + case PQI_DATA_IN_OUT_GOOD: + if (error_info->status == SAM_STAT_GOOD) + rc = 0; + break; + case PQI_DATA_IN_OUT_UNDERFLOW: + if (error_info->status == SAM_STAT_GOOD || + error_info->status == SAM_STAT_CHECK_CONDITION) + rc = 0; + break; + case PQI_DATA_IN_OUT_ABORTED: + rc = PQI_CMD_STATUS_ABORTED; + break; + } + + return rc; +} + +static inline bool pqi_is_blockable_request(struct pqi_iu_header *request) +{ + return (request->driver_flags & PQI_DRIVER_NONBLOCKABLE_REQUEST) == 0; +} + +static int pqi_submit_raid_request_synchronous(struct pqi_ctrl_info *ctrl_info, + struct pqi_iu_header *request, unsigned int flags, + struct pqi_raid_error_info *error_info) +{ + int rc = 0; + struct pqi_io_request *io_request; + size_t iu_length; + DECLARE_COMPLETION_ONSTACK(wait); + + if (flags & PQI_SYNC_FLAGS_INTERRUPTABLE) { + if (down_interruptible(&ctrl_info->sync_request_sem)) + return -ERESTARTSYS; + } else { + down(&ctrl_info->sync_request_sem); + } + + pqi_ctrl_busy(ctrl_info); + if (pqi_is_blockable_request(request)) + pqi_wait_if_ctrl_blocked(ctrl_info); + + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + goto out; + } + + io_request = pqi_alloc_io_request(ctrl_info); + + put_unaligned_le16(io_request->index, + &(((struct pqi_raid_path_request *)request)->request_id)); + + if (request->iu_type == PQI_REQUEST_IU_RAID_PATH_IO) + ((struct pqi_raid_path_request *)request)->error_index = + ((struct pqi_raid_path_request *)request)->request_id; + + iu_length = get_unaligned_le16(&request->iu_length) + + PQI_REQUEST_HEADER_LENGTH; + memcpy(io_request->iu, request, iu_length); + + io_request->io_complete_callback = pqi_raid_synchronous_complete; + 
io_request->context = &wait; + + pqi_start_io(ctrl_info, &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH, + io_request); + + pqi_wait_for_completion_io(ctrl_info, &wait); + + if (error_info) { + if (io_request->error_info) + memcpy(error_info, io_request->error_info, sizeof(*error_info)); + else + memset(error_info, 0, sizeof(*error_info)); + } else if (rc == 0 && io_request->error_info) { + rc = pqi_process_raid_io_error_synchronous(io_request->error_info); + } + + pqi_free_io_request(io_request); + +out: + pqi_ctrl_unbusy(ctrl_info); + up(&ctrl_info->sync_request_sem); + + return rc; +} + +static int pqi_validate_admin_response( + struct pqi_general_admin_response *response, u8 expected_function_code) +{ + if (response->header.iu_type != PQI_RESPONSE_IU_GENERAL_ADMIN) + return -EINVAL; + + if (get_unaligned_le16(&response->header.iu_length) != + PQI_GENERAL_ADMIN_IU_LENGTH) + return -EINVAL; + + if (response->function_code != expected_function_code) + return -EINVAL; + + if (response->status != PQI_GENERAL_ADMIN_STATUS_SUCCESS) + return -EINVAL; + + return 0; +} + +static int pqi_submit_admin_request_synchronous( + struct pqi_ctrl_info *ctrl_info, + struct pqi_general_admin_request *request, + struct pqi_general_admin_response *response) +{ + int rc; + + pqi_submit_admin_request(ctrl_info, request); + + rc = pqi_poll_for_admin_response(ctrl_info, response); + + if (rc == 0) + rc = pqi_validate_admin_response(response, request->function_code); + + return rc; +} + +static int pqi_report_device_capability(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct pqi_general_admin_request request; + struct pqi_general_admin_response response; + struct pqi_device_capability *capability; + struct pqi_iu_layer_descriptor *sop_iu_layer_descriptor; + + capability = kmalloc(sizeof(*capability), GFP_KERNEL); + if (!capability) + return -ENOMEM; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = + PQI_GENERAL_ADMIN_FUNCTION_REPORT_DEVICE_CAPABILITY; + put_unaligned_le32(sizeof(*capability), + &request.data.report_device_capability.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + &request.data.report_device_capability.sg_descriptor, + capability, sizeof(*capability), + DMA_FROM_DEVICE); + if (rc) + goto out; + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, &response); + + pqi_pci_unmap(ctrl_info->pci_dev, + &request.data.report_device_capability.sg_descriptor, 1, + DMA_FROM_DEVICE); + + if (rc) + goto out; + + if (response.status != PQI_GENERAL_ADMIN_STATUS_SUCCESS) { + rc = -EIO; + goto out; + } + + ctrl_info->max_inbound_queues = + get_unaligned_le16(&capability->max_inbound_queues); + ctrl_info->max_elements_per_iq = + get_unaligned_le16(&capability->max_elements_per_iq); + ctrl_info->max_iq_element_length = + get_unaligned_le16(&capability->max_iq_element_length) + * 16; + ctrl_info->max_outbound_queues = + get_unaligned_le16(&capability->max_outbound_queues); + ctrl_info->max_elements_per_oq = + get_unaligned_le16(&capability->max_elements_per_oq); + ctrl_info->max_oq_element_length = + get_unaligned_le16(&capability->max_oq_element_length) + * 16; + + sop_iu_layer_descriptor = + &capability->iu_layer_descriptors[PQI_PROTOCOL_SOP]; + + ctrl_info->max_inbound_iu_length_per_firmware = + get_unaligned_le16( + &sop_iu_layer_descriptor->max_inbound_iu_length); + ctrl_info->inbound_spanning_supported = + 
sop_iu_layer_descriptor->inbound_spanning_supported; + ctrl_info->outbound_spanning_supported = + sop_iu_layer_descriptor->outbound_spanning_supported; + +out: + kfree(capability); + + return rc; +} + +static int pqi_validate_device_capability(struct pqi_ctrl_info *ctrl_info) +{ + if (ctrl_info->max_iq_element_length < + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) { + dev_err(&ctrl_info->pci_dev->dev, + "max. inbound queue element length of %d is less than the required length of %d\n", + ctrl_info->max_iq_element_length, + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + return -EINVAL; + } + + if (ctrl_info->max_oq_element_length < + PQI_OPERATIONAL_OQ_ELEMENT_LENGTH) { + dev_err(&ctrl_info->pci_dev->dev, + "max. outbound queue element length of %d is less than the required length of %d\n", + ctrl_info->max_oq_element_length, + PQI_OPERATIONAL_OQ_ELEMENT_LENGTH); + return -EINVAL; + } + + if (ctrl_info->max_inbound_iu_length_per_firmware < + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) { + dev_err(&ctrl_info->pci_dev->dev, + "max. inbound IU length of %u is less than the min. required length of %d\n", + ctrl_info->max_inbound_iu_length_per_firmware, + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + return -EINVAL; + } + + if (!ctrl_info->inbound_spanning_supported) { + dev_err(&ctrl_info->pci_dev->dev, + "the controller does not support inbound spanning\n"); + return -EINVAL; + } + + if (ctrl_info->outbound_spanning_supported) { + dev_err(&ctrl_info->pci_dev->dev, + "the controller supports outbound spanning but this driver does not\n"); + return -EINVAL; + } + + return 0; +} + +static int pqi_create_event_queue(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct pqi_event_queue *event_queue; + struct pqi_general_admin_request request; + struct pqi_general_admin_response response; + + event_queue = &ctrl_info->event_queue; + + /* + * Create OQ (Outbound Queue - device to host queue) to dedicate + * to events. + */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_OQ; + put_unaligned_le16(event_queue->oq_id, + &request.data.create_operational_oq.queue_id); + put_unaligned_le64((u64)event_queue->oq_element_array_bus_addr, + &request.data.create_operational_oq.element_array_addr); + put_unaligned_le64((u64)event_queue->oq_pi_bus_addr, + &request.data.create_operational_oq.pi_addr); + put_unaligned_le16(PQI_NUM_EVENT_QUEUE_ELEMENTS, + &request.data.create_operational_oq.num_elements); + put_unaligned_le16(PQI_EVENT_OQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_oq.element_length); + request.data.create_operational_oq.queue_protocol = PQI_PROTOCOL_SOP; + put_unaligned_le16(event_queue->int_msg_num, + &request.data.create_operational_oq.int_msg_num); + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) + return rc; + + event_queue->oq_ci = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_oq.oq_ci_offset); + + return 0; +} + +static int pqi_create_queue_group(struct pqi_ctrl_info *ctrl_info, + unsigned int group_number) +{ + int rc; + struct pqi_queue_group *queue_group; + struct pqi_general_admin_request request; + struct pqi_general_admin_response response; + + queue_group = &ctrl_info->queue_groups[group_number]; + + /* + * Create IQ (Inbound Queue - host to device queue) for + * RAID path. 
+ */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_IQ; + put_unaligned_le16(queue_group->iq_id[RAID_PATH], + &request.data.create_operational_iq.queue_id); + put_unaligned_le64( + (u64)queue_group->iq_element_array_bus_addr[RAID_PATH], + &request.data.create_operational_iq.element_array_addr); + put_unaligned_le64((u64)queue_group->iq_ci_bus_addr[RAID_PATH], + &request.data.create_operational_iq.ci_addr); + put_unaligned_le16(ctrl_info->num_elements_per_iq, + &request.data.create_operational_iq.num_elements); + put_unaligned_le16(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_iq.element_length); + request.data.create_operational_iq.queue_protocol = PQI_PROTOCOL_SOP; + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating inbound RAID queue\n"); + return rc; + } + + queue_group->iq_pi[RAID_PATH] = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_iq.iq_pi_offset); + + /* + * Create IQ (Inbound Queue - host to device queue) for + * Advanced I/O (AIO) path. + */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_IQ; + put_unaligned_le16(queue_group->iq_id[AIO_PATH], + &request.data.create_operational_iq.queue_id); + put_unaligned_le64((u64)queue_group-> + iq_element_array_bus_addr[AIO_PATH], + &request.data.create_operational_iq.element_array_addr); + put_unaligned_le64((u64)queue_group->iq_ci_bus_addr[AIO_PATH], + &request.data.create_operational_iq.ci_addr); + put_unaligned_le16(ctrl_info->num_elements_per_iq, + &request.data.create_operational_iq.num_elements); + put_unaligned_le16(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_iq.element_length); + request.data.create_operational_iq.queue_protocol = PQI_PROTOCOL_SOP; + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating inbound AIO queue\n"); + return rc; + } + + queue_group->iq_pi[AIO_PATH] = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_iq.iq_pi_offset); + + /* + * Designate the 2nd IQ as the AIO path. By default, all IQs are + * assumed to be for RAID path I/O unless we change the queue's + * property. + */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CHANGE_IQ_PROPERTY; + put_unaligned_le16(queue_group->iq_id[AIO_PATH], + &request.data.change_operational_iq_properties.queue_id); + put_unaligned_le32(PQI_IQ_PROPERTY_IS_AIO_QUEUE, + &request.data.change_operational_iq_properties.vendor_specific); + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error changing queue property\n"); + return rc; + } + + /* + * Create OQ (Outbound Queue - device to host queue). 
+ */ + memset(&request, 0, sizeof(request)); + request.header.iu_type = PQI_REQUEST_IU_GENERAL_ADMIN; + put_unaligned_le16(PQI_GENERAL_ADMIN_IU_LENGTH, + &request.header.iu_length); + request.function_code = PQI_GENERAL_ADMIN_FUNCTION_CREATE_OQ; + put_unaligned_le16(queue_group->oq_id, + &request.data.create_operational_oq.queue_id); + put_unaligned_le64((u64)queue_group->oq_element_array_bus_addr, + &request.data.create_operational_oq.element_array_addr); + put_unaligned_le64((u64)queue_group->oq_pi_bus_addr, + &request.data.create_operational_oq.pi_addr); + put_unaligned_le16(ctrl_info->num_elements_per_oq, + &request.data.create_operational_oq.num_elements); + put_unaligned_le16(PQI_OPERATIONAL_OQ_ELEMENT_LENGTH / 16, + &request.data.create_operational_oq.element_length); + request.data.create_operational_oq.queue_protocol = PQI_PROTOCOL_SOP; + put_unaligned_le16(queue_group->int_msg_num, + &request.data.create_operational_oq.int_msg_num); + + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, + &response); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating outbound queue\n"); + return rc; + } + + queue_group->oq_ci = ctrl_info->iomem_base + + PQI_DEVICE_REGISTERS_OFFSET + + get_unaligned_le64( + &response.data.create_operational_oq.oq_ci_offset); + + return 0; +} + +static int pqi_create_queues(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + unsigned int i; + + rc = pqi_create_event_queue(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating event queue\n"); + return rc; + } + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + rc = pqi_create_queue_group(ctrl_info, i); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating queue group number %u/%u\n", + i, ctrl_info->num_queue_groups); + return rc; + } + } + + return 0; +} + +#define PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH \ + (offsetof(struct pqi_event_config, descriptors) + \ + (PQI_MAX_EVENT_DESCRIPTORS * sizeof(struct pqi_event_descriptor))) + +static int pqi_configure_events(struct pqi_ctrl_info *ctrl_info, + bool enable_events) +{ + int rc; + unsigned int i; + struct pqi_event_config *event_config; + struct pqi_event_descriptor *event_descriptor; + struct pqi_general_management_request request; + + event_config = kmalloc(PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + GFP_KERNEL); + if (!event_config) + return -ENOMEM; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_REPORT_VENDOR_EVENT_CONFIG; + put_unaligned_le16(offsetof(struct pqi_general_management_request, + data.report_event_configuration.sg_descriptors[1]) - + PQI_REQUEST_HEADER_LENGTH, &request.header.iu_length); + put_unaligned_le32(PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + &request.data.report_event_configuration.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, + event_config, PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + DMA_FROM_DEVICE); + if (rc) + goto out; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, + 0, NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, 1, + DMA_FROM_DEVICE); + + if (rc) + goto out; + + for (i = 0; i < event_config->num_event_descriptors; i++) { + event_descriptor = &event_config->descriptors[i]; + if (enable_events && + pqi_is_supported_event(event_descriptor->event_type)) + put_unaligned_le16(ctrl_info->event_queue.oq_id, + &event_descriptor->oq_id); + else + put_unaligned_le16(0, &event_descriptor->oq_id); + 
} + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_SET_VENDOR_EVENT_CONFIG; + put_unaligned_le16(offsetof(struct pqi_general_management_request, + data.report_event_configuration.sg_descriptors[1]) - + PQI_REQUEST_HEADER_LENGTH, &request.header.iu_length); + put_unaligned_le32(PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + &request.data.report_event_configuration.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, + event_config, PQI_REPORT_EVENT_CONFIG_BUFFER_LENGTH, + DMA_TO_DEVICE); + if (rc) + goto out; + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, + NULL); + + pqi_pci_unmap(ctrl_info->pci_dev, + request.data.report_event_configuration.sg_descriptors, 1, + DMA_TO_DEVICE); + +out: + kfree(event_config); + + return rc; +} + +static inline int pqi_enable_events(struct pqi_ctrl_info *ctrl_info) +{ + return pqi_configure_events(ctrl_info, true); +} + +static inline int pqi_disable_events(struct pqi_ctrl_info *ctrl_info) +{ + return pqi_configure_events(ctrl_info, false); +} + +static void pqi_free_all_io_requests(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct device *dev; + size_t sg_chain_buffer_length; + struct pqi_io_request *io_request; + + if (!ctrl_info->io_request_pool) + return; + + dev = &ctrl_info->pci_dev->dev; + sg_chain_buffer_length = ctrl_info->sg_chain_buffer_length; + io_request = ctrl_info->io_request_pool; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + kfree(io_request->iu); + if (!io_request->sg_chain_buffer) + break; + dma_free_coherent(dev, sg_chain_buffer_length, + io_request->sg_chain_buffer, + io_request->sg_chain_buffer_dma_handle); + io_request++; + } + + kfree(ctrl_info->io_request_pool); + ctrl_info->io_request_pool = NULL; +} + +static inline int pqi_alloc_error_buffer(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->error_buffer = dma_zalloc_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->error_buffer_length, + &ctrl_info->error_buffer_dma_handle, GFP_KERNEL); + + if (!ctrl_info->error_buffer) + return -ENOMEM; + + return 0; +} + +static int pqi_alloc_io_resources(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + void *sg_chain_buffer; + size_t sg_chain_buffer_length; + dma_addr_t sg_chain_buffer_dma_handle; + struct device *dev; + struct pqi_io_request *io_request; + + ctrl_info->io_request_pool = kzalloc(ctrl_info->max_io_slots * + sizeof(ctrl_info->io_request_pool[0]), GFP_KERNEL); + + if (!ctrl_info->io_request_pool) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate I/O request pool\n"); + goto error; + } + + dev = &ctrl_info->pci_dev->dev; + sg_chain_buffer_length = ctrl_info->sg_chain_buffer_length; + io_request = ctrl_info->io_request_pool; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + io_request->iu = kmalloc(ctrl_info->max_inbound_iu_length, GFP_KERNEL); + + if (!io_request->iu) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate IU buffers\n"); + goto error; + } + + sg_chain_buffer = dma_alloc_coherent(dev, + sg_chain_buffer_length, &sg_chain_buffer_dma_handle, + GFP_KERNEL); + + if (!sg_chain_buffer) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate PQI scatter-gather chain buffers\n"); + goto error; + } + + io_request->index = i; + io_request->sg_chain_buffer = sg_chain_buffer; + io_request->sg_chain_buffer_dma_handle = sg_chain_buffer_dma_handle; + io_request++; + } + + return 0; + +error: + pqi_free_all_io_requests(ctrl_info); + + return -ENOMEM; +} + +/* + 
* Calculate required resources that are sized based on max. outstanding + * requests and max. transfer size. + */ + +static void pqi_calculate_io_resources(struct pqi_ctrl_info *ctrl_info) +{ + u32 max_transfer_size; + u32 max_sg_entries; + + ctrl_info->scsi_ml_can_queue = + ctrl_info->max_outstanding_requests - PQI_RESERVED_IO_SLOTS; + ctrl_info->max_io_slots = ctrl_info->max_outstanding_requests; + + ctrl_info->error_buffer_length = + ctrl_info->max_io_slots * PQI_ERROR_BUFFER_ELEMENT_LENGTH; + + if (reset_devices) + max_transfer_size = min(ctrl_info->max_transfer_size, + PQI_MAX_TRANSFER_SIZE_KDUMP); + else + max_transfer_size = min(ctrl_info->max_transfer_size, + PQI_MAX_TRANSFER_SIZE); + + max_sg_entries = max_transfer_size / PAGE_SIZE; + + /* +1 to cover when the buffer is not page-aligned. */ + max_sg_entries++; + + max_sg_entries = min(ctrl_info->max_sg_entries, max_sg_entries); + + max_transfer_size = (max_sg_entries - 1) * PAGE_SIZE; + + ctrl_info->sg_chain_buffer_length = + (max_sg_entries * sizeof(struct pqi_sg_descriptor)) + + PQI_EXTRA_SGL_MEMORY; + ctrl_info->sg_tablesize = max_sg_entries; + ctrl_info->max_sectors = max_transfer_size / 512; +} + +static void pqi_calculate_queue_resources(struct pqi_ctrl_info *ctrl_info) +{ + int num_queue_groups; + u16 num_elements_per_iq; + u16 num_elements_per_oq; + + if (reset_devices) { + num_queue_groups = 1; + } else { + int num_cpus; + int max_queue_groups; + + max_queue_groups = min(ctrl_info->max_inbound_queues / 2, + ctrl_info->max_outbound_queues - 1); + max_queue_groups = min(max_queue_groups, PQI_MAX_QUEUE_GROUPS); + + num_cpus = num_online_cpus(); + num_queue_groups = min(num_cpus, ctrl_info->max_msix_vectors); + num_queue_groups = min(num_queue_groups, max_queue_groups); + } + + ctrl_info->num_queue_groups = num_queue_groups; + ctrl_info->max_hw_queue_index = num_queue_groups - 1; + + /* + * Make sure that the max. inbound IU length is an even multiple + * of our inbound element length. + */ + ctrl_info->max_inbound_iu_length = + (ctrl_info->max_inbound_iu_length_per_firmware / + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) * + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH; + + num_elements_per_iq = + (ctrl_info->max_inbound_iu_length / + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + /* Add one because one element in each queue is unusable. 
*/ + num_elements_per_iq++; + + num_elements_per_iq = min(num_elements_per_iq, + ctrl_info->max_elements_per_iq); + + num_elements_per_oq = ((num_elements_per_iq - 1) * 2) + 1; + num_elements_per_oq = min(num_elements_per_oq, + ctrl_info->max_elements_per_oq); + + ctrl_info->num_elements_per_iq = num_elements_per_iq; + ctrl_info->num_elements_per_oq = num_elements_per_oq; + + ctrl_info->max_sg_per_iu = + ((ctrl_info->max_inbound_iu_length - + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) / + sizeof(struct pqi_sg_descriptor)) + + PQI_MAX_EMBEDDED_SG_DESCRIPTORS; + + ctrl_info->max_sg_per_r56_iu = + ((ctrl_info->max_inbound_iu_length - + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH) / + sizeof(struct pqi_sg_descriptor)) + + PQI_MAX_EMBEDDED_R56_SG_DESCRIPTORS; +} + +static inline void pqi_set_sg_descriptor(struct pqi_sg_descriptor *sg_descriptor, + struct scatterlist *sg) +{ + u64 address = (u64)sg_dma_address(sg); + unsigned int length = sg_dma_len(sg); + + put_unaligned_le64(address, &sg_descriptor->address); + put_unaligned_le32(length, &sg_descriptor->length); + put_unaligned_le32(0, &sg_descriptor->flags); +} + +static unsigned int pqi_build_sg_list(struct pqi_sg_descriptor *sg_descriptor, + struct scatterlist *sg, int sg_count, struct pqi_io_request *io_request, + int max_sg_per_iu, bool *chained) +{ + int i; + unsigned int num_sg_in_iu; + + *chained = false; + i = 0; + num_sg_in_iu = 0; + max_sg_per_iu--; /* Subtract 1 to leave room for chain marker. */ + + while (1) { + pqi_set_sg_descriptor(sg_descriptor, sg); + if (!*chained) + num_sg_in_iu++; + i++; + if (i == sg_count) + break; + sg_descriptor++; + if (i == max_sg_per_iu) { + put_unaligned_le64((u64)io_request->sg_chain_buffer_dma_handle, + &sg_descriptor->address); + put_unaligned_le32((sg_count - num_sg_in_iu) * sizeof(*sg_descriptor), + &sg_descriptor->length); + put_unaligned_le32(CISS_SG_CHAIN, &sg_descriptor->flags); + *chained = true; + num_sg_in_iu++; + sg_descriptor = io_request->sg_chain_buffer; + } + sg = sg_next(sg); + } + + put_unaligned_le32(CISS_SG_LAST, &sg_descriptor->flags); + + return num_sg_in_iu; +} + +static int pqi_build_raid_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_raid_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_raid_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + + if (sg_count == 0) + goto out; + + sg = scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + + return 0; +} + +static int pqi_build_aio_r1_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_aio_r1_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_aio_r1_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + num_sg_in_iu = 0; + + if (sg_count == 0) + goto out; + + sg 
= scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + request->num_sg_descriptors = num_sg_in_iu; + + return 0; +} + +static int pqi_build_aio_r56_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_aio_r56_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_aio_r56_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + num_sg_in_iu = 0; + + if (sg_count == 0) + goto out; + + sg = scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_r56_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + request->num_sg_descriptors = num_sg_in_iu; + + return 0; +} + +static int pqi_build_aio_sg_list(struct pqi_ctrl_info *ctrl_info, + struct pqi_aio_path_request *request, struct scsi_cmnd *scmd, + struct pqi_io_request *io_request) +{ + u16 iu_length; + int sg_count; + bool chained; + unsigned int num_sg_in_iu; + struct scatterlist *sg; + struct pqi_sg_descriptor *sg_descriptor; + + sg_count = scsi_dma_map(scmd); + if (sg_count < 0) + return sg_count; + + iu_length = offsetof(struct pqi_aio_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + num_sg_in_iu = 0; + + if (sg_count == 0) + goto out; + + sg = scsi_sglist(scmd); + sg_descriptor = request->sg_descriptors; + + num_sg_in_iu = pqi_build_sg_list(sg_descriptor, sg, sg_count, io_request, + ctrl_info->max_sg_per_iu, &chained); + + request->partial = chained; + iu_length += num_sg_in_iu * sizeof(*sg_descriptor); + +out: + put_unaligned_le16(iu_length, &request->header.iu_length); + request->num_sg_descriptors = num_sg_in_iu; + + return 0; +} + +static void pqi_raid_io_complete(struct pqi_io_request *io_request, + void *context) +{ + struct scsi_cmnd *scmd; + + scmd = io_request->scmd; + pqi_free_io_request(io_request); + scsi_dma_unmap(scmd); + pqi_scsi_done(scmd); +} + +static int pqi_raid_submit_scsi_cmd_with_io_request( + struct pqi_ctrl_info *ctrl_info, struct pqi_io_request *io_request, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group) +{ + int rc; + size_t cdb_length; + struct pqi_raid_path_request *request; + + io_request->io_complete_callback = pqi_raid_io_complete; + io_request->scmd = scmd; + + request = io_request->iu; + memset(request, 0, offsetof(struct pqi_raid_path_request, sg_descriptors)); + + request->header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; + put_unaligned_le32(scsi_bufflen(scmd), &request->buffer_length); + request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + put_unaligned_le16(io_request->index, &request->request_id); + request->error_index = request->request_id; + memcpy(request->lun_number, device->scsi3addr, sizeof(request->lun_number)); + + cdb_length = min_t(size_t, scmd->cmd_len, sizeof(request->cdb)); + memcpy(request->cdb, scmd->cmnd, cdb_length); + + switch 
(cdb_length) {
+	case 6:
+	case 10:
+	case 12:
+	case 16:
+		request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_0;
+		break;
+	case 20:
+		request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_4;
+		break;
+	case 24:
+		request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_8;
+		break;
+	case 28:
+		request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_12;
+		break;
+	case 32:
+	default:
+		request->additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_16;
+		break;
+	}
+
+	switch (scmd->sc_data_direction) {
+	case DMA_TO_DEVICE:
+		request->data_direction = SOP_READ_FLAG;
+		break;
+	case DMA_FROM_DEVICE:
+		request->data_direction = SOP_WRITE_FLAG;
+		break;
+	case DMA_NONE:
+		request->data_direction = SOP_NO_DIRECTION_FLAG;
+		break;
+	case DMA_BIDIRECTIONAL:
+		request->data_direction = SOP_BIDIRECTIONAL;
+		break;
+	default:
+		dev_err(&ctrl_info->pci_dev->dev,
+			"unknown data direction: %d\n",
+			scmd->sc_data_direction);
+		BUG();
+		break;
+	}
+
+	rc = pqi_build_raid_sg_list(ctrl_info, request, scmd, io_request);
+	if (rc) {
+		pqi_free_io_request(io_request);
+		return SCSI_MLQUEUE_HOST_BUSY;
+	}
+
+	pqi_start_io(ctrl_info, queue_group, RAID_PATH, io_request);
+
+	return 0;
+}
+
+static inline int pqi_raid_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info,
+	struct pqi_scsi_dev *device, struct scsi_cmnd *scmd,
+	struct pqi_queue_group *queue_group)
+{
+	struct pqi_io_request *io_request;
+
+	io_request = pqi_alloc_io_request(ctrl_info);
+
+	return pqi_raid_submit_scsi_cmd_with_io_request(ctrl_info, io_request,
+		device, scmd, queue_group);
+}
+
+static bool pqi_raid_bypass_retry_needed(struct pqi_io_request *io_request)
+{
+	struct scsi_cmnd *scmd;
+	struct pqi_scsi_dev *device;
+	struct pqi_ctrl_info *ctrl_info;
+
+	if (!io_request->raid_bypass)
+		return false;
+
+	scmd = io_request->scmd;
+	if ((scmd->result & 0xff) == SAM_STAT_GOOD)
+		return false;
+	if (host_byte(scmd->result) == DID_NO_CONNECT)
+		return false;
+
+	device = scmd->device->hostdata;
+	if (pqi_device_offline(device) || pqi_device_in_remove(device))
+		return false;
+
+	ctrl_info = shost_to_hba(scmd->device->host);
+	if (pqi_ctrl_offline(ctrl_info))
+		return false;
+
+	return true;
+}
+
+static void pqi_aio_io_complete(struct pqi_io_request *io_request,
+	void *context)
+{
+	struct scsi_cmnd *scmd;
+
+	scmd = io_request->scmd;
+	scsi_dma_unmap(scmd);
+	if (io_request->status == -EAGAIN || pqi_raid_bypass_retry_needed(io_request)) {
+		set_host_byte(scmd, DID_IMM_RETRY);
+		scmd->SCp.this_residual++;
+	}
+
+	pqi_free_io_request(io_request);
+	pqi_scsi_done(scmd);
+}
+
+static inline bool pqi_is_io_high_priority(struct pqi_ctrl_info *ctrl_info,
+	struct pqi_scsi_dev *device, struct scsi_cmnd *scmd)
+{
+	bool io_high_prio;
+	int priority_class;
+
+	io_high_prio = false;
+
+	if (device->ncq_prio_enable) {
+		priority_class =
+			IOPRIO_PRIO_CLASS(req_get_ioprio(scmd->request));
+		if (priority_class == IOPRIO_CLASS_RT) {
+			/* Set NCQ priority for read/write commands.
*/ + switch (scmd->cmnd[0]) { + case WRITE_16: + case READ_16: + case WRITE_12: + case READ_12: + case WRITE_10: + case READ_10: + case WRITE_6: + case READ_6: + io_high_prio = true; + break; + } + } + } + + return io_high_prio; +} + +static inline int pqi_aio_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, + struct pqi_queue_group *queue_group) +{ + bool io_high_prio; + + io_high_prio = pqi_is_io_high_priority(ctrl_info, device, scmd); + + return pqi_aio_submit_io(ctrl_info, scmd, device->aio_handle, + scmd->cmnd, scmd->cmd_len, queue_group, NULL, + false, io_high_prio); +} + +static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, u32 aio_handle, u8 *cdb, + unsigned int cdb_length, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, bool raid_bypass, + bool io_high_prio) +{ + int rc; + struct pqi_io_request *io_request; + struct pqi_aio_path_request *request; + + io_request = pqi_alloc_io_request(ctrl_info); + io_request->io_complete_callback = pqi_aio_io_complete; + io_request->scmd = scmd; + io_request->raid_bypass = raid_bypass; + + request = io_request->iu; + memset(request, 0, offsetof(struct pqi_aio_path_request, sg_descriptors)); + + request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_IO; + put_unaligned_le32(aio_handle, &request->nexus_id); + put_unaligned_le32(scsi_bufflen(scmd), &request->buffer_length); + request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + request->command_priority = io_high_prio; + put_unaligned_le16(io_request->index, &request->request_id); + request->error_index = request->request_id; + if (cdb_length > sizeof(request->cdb)) + cdb_length = sizeof(request->cdb); + request->cdb_length = cdb_length; + memcpy(request->cdb, cdb, cdb_length); + + switch (scmd->sc_data_direction) { + case DMA_TO_DEVICE: + request->data_direction = SOP_READ_FLAG; + break; + case DMA_FROM_DEVICE: + request->data_direction = SOP_WRITE_FLAG; + break; + case DMA_NONE: + request->data_direction = SOP_NO_DIRECTION_FLAG; + break; + case DMA_BIDIRECTIONAL: + request->data_direction = SOP_BIDIRECTIONAL; + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "unknown data direction: %d\n", + scmd->sc_data_direction); + BUG(); + break; + } + + if (encryption_info) { + request->encryption_enable = true; + put_unaligned_le16(encryption_info->data_encryption_key_index, + &request->data_encryption_key_index); + put_unaligned_le32(encryption_info->encrypt_tweak_lower, + &request->encrypt_tweak_lower); + put_unaligned_le32(encryption_info->encrypt_tweak_upper, + &request->encrypt_tweak_upper); + } + + rc = pqi_build_aio_sg_list(ctrl_info, request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, AIO_PATH, io_request); + + return 0; +} + +static int pqi_aio_submit_r1_write_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + int rc; + struct pqi_io_request *io_request; + struct pqi_aio_r1_path_request *r1_request; + + io_request = pqi_alloc_io_request(ctrl_info); + io_request->io_complete_callback = pqi_aio_io_complete; + io_request->scmd = scmd; + io_request->raid_bypass = true; + + r1_request = io_request->iu; + memset(r1_request, 0, offsetof(struct pqi_aio_r1_path_request, sg_descriptors)); + + 
r1_request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_RAID1_IO; + put_unaligned_le16(*(u16 *)device->scsi3addr & 0x3fff, &r1_request->volume_id); + r1_request->num_drives = rmd->num_it_nexus_entries; + put_unaligned_le32(rmd->it_nexus[0], &r1_request->it_nexus_1); + put_unaligned_le32(rmd->it_nexus[1], &r1_request->it_nexus_2); + if (rmd->num_it_nexus_entries == 3) + put_unaligned_le32(rmd->it_nexus[2], &r1_request->it_nexus_3); + + put_unaligned_le32(scsi_bufflen(scmd), &r1_request->data_length); + r1_request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + put_unaligned_le16(io_request->index, &r1_request->request_id); + r1_request->error_index = r1_request->request_id; + if (rmd->cdb_length > sizeof(r1_request->cdb)) + rmd->cdb_length = sizeof(r1_request->cdb); + r1_request->cdb_length = rmd->cdb_length; + memcpy(r1_request->cdb, rmd->cdb, rmd->cdb_length); + + switch (scmd->sc_data_direction) { + case DMA_TO_DEVICE: + r1_request->data_direction = SOP_READ_FLAG; + break; + case DMA_FROM_DEVICE: + r1_request->data_direction = SOP_WRITE_FLAG; + break; + case DMA_NONE: + r1_request->data_direction = SOP_NO_DIRECTION_FLAG; + break; + case DMA_BIDIRECTIONAL: + r1_request->data_direction = SOP_BIDIRECTIONAL; + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "unknown data direction: %d\n", + scmd->sc_data_direction); + BUG(); + break; + } + + if (encryption_info) { + r1_request->encryption_enable = true; + put_unaligned_le16(encryption_info->data_encryption_key_index, + &r1_request->data_encryption_key_index); + put_unaligned_le32(encryption_info->encrypt_tweak_lower, + &r1_request->encrypt_tweak_lower); + put_unaligned_le32(encryption_info->encrypt_tweak_upper, + &r1_request->encrypt_tweak_upper); + } + + rc = pqi_build_aio_r1_sg_list(ctrl_info, r1_request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, AIO_PATH, io_request); + + return 0; +} + +static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group, + struct pqi_encryption_info *encryption_info, struct pqi_scsi_dev *device, + struct pqi_scsi_dev_raid_map_data *rmd) +{ + int rc; + struct pqi_io_request *io_request; + struct pqi_aio_r56_path_request *r56_request; + + io_request = pqi_alloc_io_request(ctrl_info); + io_request->io_complete_callback = pqi_aio_io_complete; + io_request->scmd = scmd; + io_request->raid_bypass = true; + + r56_request = io_request->iu; + memset(r56_request, 0, offsetof(struct pqi_aio_r56_path_request, sg_descriptors)); + + if (device->raid_level == SA_RAID_5 || device->raid_level == SA_RAID_51) + r56_request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_RAID5_IO; + else + r56_request->header.iu_type = PQI_REQUEST_IU_AIO_PATH_RAID6_IO; + + put_unaligned_le16(*(u16 *)device->scsi3addr & 0x3fff, &r56_request->volume_id); + put_unaligned_le32(rmd->aio_handle, &r56_request->data_it_nexus); + put_unaligned_le32(rmd->p_parity_it_nexus, &r56_request->p_parity_it_nexus); + if (rmd->raid_level == SA_RAID_6) { + put_unaligned_le32(rmd->q_parity_it_nexus, &r56_request->q_parity_it_nexus); + r56_request->xor_multiplier = rmd->xor_mult; + } + put_unaligned_le32(scsi_bufflen(scmd), &r56_request->data_length); + r56_request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + put_unaligned_le64(rmd->row, &r56_request->row); + + put_unaligned_le16(io_request->index, &r56_request->request_id); + r56_request->error_index = r56_request->request_id; + + if 
(rmd->cdb_length > sizeof(r56_request->cdb)) + rmd->cdb_length = sizeof(r56_request->cdb); + r56_request->cdb_length = rmd->cdb_length; + memcpy(r56_request->cdb, rmd->cdb, rmd->cdb_length); + + switch (scmd->sc_data_direction) { + case DMA_TO_DEVICE: + r56_request->data_direction = SOP_READ_FLAG; + break; + case DMA_FROM_DEVICE: + r56_request->data_direction = SOP_WRITE_FLAG; + break; + case DMA_NONE: + r56_request->data_direction = SOP_NO_DIRECTION_FLAG; + break; + case DMA_BIDIRECTIONAL: + r56_request->data_direction = SOP_BIDIRECTIONAL; + break; + default: + dev_err(&ctrl_info->pci_dev->dev, + "unknown data direction: %d\n", + scmd->sc_data_direction); + BUG(); + break; + } + + if (encryption_info) { + r56_request->encryption_enable = true; + put_unaligned_le16(encryption_info->data_encryption_key_index, + &r56_request->data_encryption_key_index); + put_unaligned_le32(encryption_info->encrypt_tweak_lower, + &r56_request->encrypt_tweak_lower); + put_unaligned_le32(encryption_info->encrypt_tweak_upper, + &r56_request->encrypt_tweak_upper); + } + + rc = pqi_build_aio_r56_sg_list(ctrl_info, r56_request, scmd, io_request); + if (rc) { + pqi_free_io_request(io_request); + return SCSI_MLQUEUE_HOST_BUSY; + } + + pqi_start_io(ctrl_info, queue_group, AIO_PATH, io_request); + + return 0; +} + +static inline bool pqi_is_bypass_eligible_request(struct scsi_cmnd *scmd) +{ + if (blk_rq_is_passthrough(scmd->request)) + return false; + + return scmd->SCp.this_residual == 0; +} + +/* + * This function gets called just before we hand the completed SCSI request + * back to the SML. + */ + +void pqi_prep_for_scsi_done(struct scsi_cmnd *scmd) +{ + struct pqi_scsi_dev *device; + struct pqi_ctrl_info *ctrl_info; + struct Scsi_Host *shost; + + if (!scmd->device) { + set_host_byte(scmd, DID_NO_CONNECT); + return; + } + + device = scmd->device->hostdata; + if (!device) { + set_host_byte(scmd, DID_NO_CONNECT); + return; + } + + shost = scmd->device->host; + ctrl_info = shost_to_hba(shost); + + atomic_dec(&device->scsi_cmds_outstanding); + atomic_dec(&ctrl_info->total_scmds_outstanding); +} + +static bool pqi_is_parity_write_stream(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd) +{ + u32 oldest_jiffies; + u8 lru_index; + int i; + int rc; + struct pqi_scsi_dev *device; + struct pqi_stream_data *pqi_stream_data; + struct pqi_scsi_dev_raid_map_data rmd; + + if (!ctrl_info->enable_stream_detection) + return false; + + rc = pqi_get_aio_lba_and_block_count(scmd, &rmd); + if (rc) + return false; + + /* Check writes only. */ + if (!rmd.is_write) + return false; + + device = scmd->device->hostdata; + + /* Check for RAID 5/6 streams. */ + if (device->raid_level != SA_RAID_5 && device->raid_level != SA_RAID_6) + return false; + + /* + * If controller does not support AIO RAID{5,6} writes, need to send + * requests down non-AIO path. + */ + if ((device->raid_level == SA_RAID_5 && !ctrl_info->enable_r5_writes) || + (device->raid_level == SA_RAID_6 && !ctrl_info->enable_r6_writes)) + return true; + + lru_index = 0; + oldest_jiffies = INT_MAX; + for (i = 0; i < NUM_STREAMS_PER_LUN; i++) { + pqi_stream_data = &device->stream_data[i]; + /* + * Check for adjacent request or request is within + * the previous request. 
+ */ + if ((pqi_stream_data->next_lba && + rmd.first_block >= pqi_stream_data->next_lba) && + rmd.first_block <= pqi_stream_data->next_lba + + rmd.block_cnt) { + pqi_stream_data->next_lba = rmd.first_block + + rmd.block_cnt; + pqi_stream_data->last_accessed = jiffies; + return true; + } + + /* unused entry */ + if (pqi_stream_data->last_accessed == 0) { + lru_index = i; + break; + } + + /* Find entry with oldest last accessed time. */ + if (pqi_stream_data->last_accessed <= oldest_jiffies) { + oldest_jiffies = pqi_stream_data->last_accessed; + lru_index = i; + } + } + + /* Set LRU entry. */ + pqi_stream_data = &device->stream_data[lru_index]; + pqi_stream_data->last_accessed = jiffies; + pqi_stream_data->next_lba = rmd.first_block + rmd.block_cnt; + + return false; +} + +int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + u16 hw_queue; + struct pqi_queue_group *queue_group; + bool raid_bypassed; + + device = scmd->device->hostdata; + ctrl_info = shost_to_hba(shost); + + if (!device) { + set_host_byte(scmd, DID_NO_CONNECT); + pqi_scsi_done(scmd); + return 0; + } + + atomic_inc(&device->scsi_cmds_outstanding); + if (atomic_inc_return(&ctrl_info->total_scmds_outstanding) > + ctrl_info->scsi_ml_can_queue) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + + if (pqi_ctrl_offline(ctrl_info) || pqi_device_in_remove(device)) { + set_host_byte(scmd, DID_NO_CONNECT); + pqi_scsi_done(scmd); + return 0; + } + + if (pqi_ctrl_blocked(ctrl_info)) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + + /* + * This is necessary because the SML doesn't zero out this field during + * error recovery. + */ + scmd->result = 0; + + hw_queue = pqi_get_hw_queue(ctrl_info, scmd); + queue_group = &ctrl_info->queue_groups[hw_queue]; + + if (pqi_is_logical_device(device)) { + raid_bypassed = false; + if (device->raid_bypass_enabled && + pqi_is_bypass_eligible_request(scmd) && + !pqi_is_parity_write_stream(ctrl_info, scmd)) { + rc = pqi_raid_bypass_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + if (rc == 0 || rc == SCSI_MLQUEUE_HOST_BUSY) { + raid_bypassed = true; + atomic_inc(&device->raid_bypass_cnt); + } + } + if (!raid_bypassed) + rc = pqi_raid_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + } else { + if (device->aio_enabled) + rc = pqi_aio_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + else + rc = pqi_raid_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); + } + +out: + if (rc) { + atomic_dec(&device->scsi_cmds_outstanding); + atomic_dec(&ctrl_info->total_scmds_outstanding); + } + + return rc; +} + +static unsigned int pqi_queued_io_count(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + unsigned int path; + unsigned long flags; + unsigned int queued_io_count; + struct pqi_queue_group *queue_group; + struct pqi_io_request *io_request; + + queued_io_count = 0; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + for (path = 0; path < 2; path++) { + spin_lock_irqsave(&queue_group->submit_lock[path], flags); + list_for_each_entry(io_request, &queue_group->request_list[path], request_list_entry) + queued_io_count++; + spin_unlock_irqrestore(&queue_group->submit_lock[path], flags); + } + } + + return queued_io_count; +} + +static unsigned int pqi_nonempty_inbound_queue_count(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + unsigned int path; + unsigned int nonempty_inbound_queue_count; + struct pqi_queue_group *queue_group; 
+ pqi_index_t iq_pi; + pqi_index_t iq_ci; + + nonempty_inbound_queue_count = 0; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + for (path = 0; path < 2; path++) { + iq_pi = queue_group->iq_pi_copy[path]; + iq_ci = readl(queue_group->iq_ci[path]); + if (iq_ci != iq_pi) + nonempty_inbound_queue_count++; + } + } + + return nonempty_inbound_queue_count; +} + +#define PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS (10 * HZ) + +static int pqi_wait_until_inbound_queues_empty(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long start_jiffies; + unsigned long warning_timeout; + unsigned int queued_io_count; + unsigned int nonempty_inbound_queue_count; + bool displayed_warning; + + displayed_warning = false; + start_jiffies = jiffies; + warning_timeout = PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS + start_jiffies; + + while (1) { + queued_io_count = pqi_queued_io_count(ctrl_info); + nonempty_inbound_queue_count = pqi_nonempty_inbound_queue_count(ctrl_info); + if (queued_io_count == 0 && nonempty_inbound_queue_count == 0) + break; + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + if (time_after(jiffies, warning_timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "waiting %u seconds for queued I/O to drain (queued I/O count: %u; non-empty inbound queue count: %u)\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000, queued_io_count, nonempty_inbound_queue_count); + displayed_warning = true; + warning_timeout = PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS + jiffies; + } + msleep(1); + } + + if (displayed_warning) + dev_warn(&ctrl_info->pci_dev->dev, + "queued I/O drained after waiting for %u seconds\n", + jiffies_to_msecs(jiffies - start_jiffies) / 1000); + + return 0; +} + +static void pqi_fail_io_queued_for_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + unsigned int i; + unsigned int path; + struct pqi_queue_group *queue_group; + unsigned long flags; + struct pqi_io_request *io_request; + struct pqi_io_request *next; + struct scsi_cmnd *scmd; + struct pqi_scsi_dev *scsi_device; + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + queue_group = &ctrl_info->queue_groups[i]; + + for (path = 0; path < 2; path++) { + spin_lock_irqsave( + &queue_group->submit_lock[path], flags); + + list_for_each_entry_safe(io_request, next, + &queue_group->request_list[path], + request_list_entry) { + + scmd = io_request->scmd; + if (!scmd) + continue; + + scsi_device = scmd->device->hostdata; + if (scsi_device != device) + continue; + + list_del(&io_request->request_list_entry); + set_host_byte(scmd, DID_RESET); + pqi_free_io_request(io_request); + scsi_dma_unmap(scmd); + pqi_scsi_done(scmd); + } + + spin_unlock_irqrestore( + &queue_group->submit_lock[path], flags); + } + } +} + +#define PQI_PENDING_IO_WARNING_TIMEOUT_SECS (10 * HZ) + +static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, unsigned long timeout_msecs) +{ + int cmds_outstanding; + unsigned long start_jiffies; + unsigned long warning_timeout; + unsigned long msecs_waiting; + + start_jiffies = jiffies; + warning_timeout = PQI_PENDING_IO_WARNING_TIMEOUT_SECS + start_jiffies; + + while ((cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding)) > 0) { + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + msecs_waiting = jiffies_to_msecs(jiffies - start_jiffies); + if (msecs_waiting >= timeout_msecs) { + 
dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d: timed out after %lu seconds waiting for %d outstanding command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, + device->lun, msecs_waiting / 1000, cmds_outstanding); + return -ETIMEDOUT; + } + if (time_after(jiffies, warning_timeout)) { + dev_warn(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d: waiting %lu seconds for %d outstanding command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, + device->lun, msecs_waiting / 1000, cmds_outstanding); + warning_timeout = PQI_PENDING_IO_WARNING_TIMEOUT_SECS + jiffies; + } + msleep(1); + } + + return 0; +} + +static void pqi_lun_reset_complete(struct pqi_io_request *io_request, + void *context) +{ + struct completion *waiting = context; + + complete(waiting); +} + +#define PQI_LUN_RESET_POLL_COMPLETION_SECS 10 + +static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, struct completion *wait) +{ + int rc; + unsigned int wait_secs; + int cmds_outstanding; + + wait_secs = 0; + + while (1) { + if (wait_for_completion_io_timeout(wait, + PQI_LUN_RESET_POLL_COMPLETION_SECS * HZ)) { + rc = 0; + break; + } + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + break; + } + + wait_secs += PQI_LUN_RESET_POLL_COMPLETION_SECS; + cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding); + dev_warn(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d: waiting %u seconds for LUN reset to complete (%d command(s) outstanding)\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, device->lun, wait_secs, cmds_outstanding); + } + + return rc; +} + +#define PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS 30 + +static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +{ + int rc; + struct pqi_io_request *io_request; + DECLARE_COMPLETION_ONSTACK(wait); + struct pqi_task_management_request *request; + + io_request = pqi_alloc_io_request(ctrl_info); + io_request->io_complete_callback = pqi_lun_reset_complete; + io_request->context = &wait; + + request = io_request->iu; + memset(request, 0, sizeof(*request)); + + request->header.iu_type = PQI_REQUEST_IU_TASK_MANAGEMENT; + put_unaligned_le16(sizeof(*request) - PQI_REQUEST_HEADER_LENGTH, + &request->header.iu_length); + put_unaligned_le16(io_request->index, &request->request_id); + memcpy(request->lun_number, device->scsi3addr, + sizeof(request->lun_number)); + request->task_management_function = SOP_TASK_MANAGEMENT_LUN_RESET; + if (ctrl_info->tmf_iu_timeout_supported) + put_unaligned_le16(PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS, &request->timeout); + + pqi_start_io(ctrl_info, &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH, + io_request); + + rc = pqi_wait_for_lun_reset_completion(ctrl_info, device, &wait); + if (rc == 0) + rc = io_request->status; + + pqi_free_io_request(io_request); + + return rc; +} + +#define PQI_LUN_RESET_RETRIES 3 +#define PQI_LUN_RESET_RETRY_INTERVAL_MSECS (10 * 1000) +#define PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS (10 * 60 * 1000) +#define PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS (2 * 60 * 1000) + +static int pqi_lun_reset_with_retries(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +{ + int reset_rc; + int wait_rc; + unsigned int retries; + unsigned long timeout_msecs; + + for (retries = 0;;) { + reset_rc = pqi_lun_reset(ctrl_info, device); + if (reset_rc == 0 || ++retries > PQI_LUN_RESET_RETRIES) + break; + msleep(PQI_LUN_RESET_RETRY_INTERVAL_MSECS); + } + + 
timeout_msecs = reset_rc ? PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS : + PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS; + + wait_rc = pqi_device_wait_for_pending_io(ctrl_info, device, timeout_msecs); + if (wait_rc && reset_rc == 0) + reset_rc = wait_rc; + + return reset_rc == 0 ? SUCCESS : FAILED; +} + +static int pqi_device_reset(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device) +{ + int rc; + + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_fail_io_queued_for_device(ctrl_info, device); + rc = pqi_wait_until_inbound_queues_empty(ctrl_info); + if (rc) + rc = FAILED; + else + rc = pqi_lun_reset_with_retries(ctrl_info, device); + pqi_ctrl_unblock_requests(ctrl_info); + + return rc; +} + +static int pqi_eh_device_reset_handler(struct scsi_cmnd *scmd) +{ + int rc; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + + shost = scmd->device->host; + ctrl_info = shost_to_hba(shost); + device = scmd->device->hostdata; + + mutex_lock(&ctrl_info->lun_reset_mutex); + + dev_err(&ctrl_info->pci_dev->dev, + "resetting scsi %d:%d:%d:%d due to cmd 0x%02x\n", + shost->host_no, + device->bus, device->target, device->lun, + scmd->cmd_len > 0 ? scmd->cmnd[0] : 0xff); + + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + rc = FAILED; + else + rc = pqi_device_reset(ctrl_info, device); + + dev_err(&ctrl_info->pci_dev->dev, + "reset of scsi %d:%d:%d:%d: %s\n", + shost->host_no, device->bus, device->target, device->lun, + rc == SUCCESS ? "SUCCESS" : "FAILED"); + + mutex_unlock(&ctrl_info->lun_reset_mutex); + + return rc; +} + +static int pqi_slave_alloc(struct scsi_device *sdev) +{ + struct pqi_scsi_dev *device; + unsigned long flags; + struct pqi_ctrl_info *ctrl_info; + struct scsi_target *starget; + struct sas_rphy *rphy; + + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + if (sdev_channel(sdev) == PQI_PHYSICAL_DEVICE_BUS) { + starget = scsi_target(sdev); + rphy = target_to_rphy(starget); + device = pqi_find_device_by_sas_rphy(ctrl_info, rphy); + if (device) { + if (device->target_lun_valid) { + device->ignore_device = true; + } else { + device->target = sdev_id(sdev); + device->lun = sdev->lun; + device->target_lun_valid = true; + } + } + } else { + device = pqi_find_scsi_dev(ctrl_info, sdev_channel(sdev), + sdev_id(sdev), sdev->lun); + } + + if (device) { + sdev->hostdata = device; + device->sdev = sdev; + if (device->queue_depth) { + device->advertised_queue_depth = device->queue_depth; + scsi_change_queue_depth(sdev, + device->advertised_queue_depth); + } + if (pqi_is_logical_device(device)) { + pqi_disable_write_same(sdev); + if (pqi_limit_xfer_to_1MB) + blk_queue_max_hw_sectors(sdev->request_queue, + PQI_1MB_SECTORS); + } else { + sdev->allow_restart = 1; + if (device->device_type == SA_DEVICE_TYPE_NVME) + pqi_disable_write_same(sdev); + } + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return 0; +} + +static inline bool pqi_is_tape_changer_device(struct pqi_scsi_dev *device) +{ + return device->devtype == TYPE_TAPE || device->devtype == TYPE_MEDIUM_CHANGER; +} + +static int pqi_slave_configure(struct scsi_device *sdev) +{ + int rc = 0; + struct pqi_scsi_dev *device; + + device = sdev->hostdata; + device->devtype = sdev->type; + + if (pqi_is_tape_changer_device(device) && device->ignore_device) { + rc = -ENXIO; + device->ignore_device = false; + } + + return rc; +} + +static int pqi_getpciinfo_ioctl(struct 
pqi_ctrl_info *ctrl_info, void __user *arg) +{ + struct pci_dev *pci_dev; + u32 subsystem_vendor; + u32 subsystem_device; + cciss_pci_info_struct pciinfo; + + if (!arg) + return -EINVAL; + + pci_dev = ctrl_info->pci_dev; + + pciinfo.domain = pci_domain_nr(pci_dev->bus); + pciinfo.bus = pci_dev->bus->number; + pciinfo.dev_fn = pci_dev->devfn; + subsystem_vendor = pci_dev->subsystem_vendor; + subsystem_device = pci_dev->subsystem_device; + pciinfo.board_id = ((subsystem_device << 16) & 0xffff0000) | subsystem_vendor; + + if (copy_to_user(arg, &pciinfo, sizeof(pciinfo))) + return -EFAULT; + + return 0; +} + +static int pqi_getdrivver_ioctl(void __user *arg) +{ + u32 version; + + if (!arg) + return -EINVAL; + + version = (DRIVER_MAJOR << 28) | (DRIVER_MINOR << 24) | + (DRIVER_RELEASE << 16) | DRIVER_REVISION; + + if (copy_to_user(arg, &version, sizeof(version))) + return -EFAULT; + + return 0; +} + +struct ciss_error_info { + u8 scsi_status; + int command_status; + size_t sense_data_length; +}; + +static void pqi_error_info_to_ciss(struct pqi_raid_error_info *pqi_error_info, + struct ciss_error_info *ciss_error_info) +{ + int ciss_cmd_status; + size_t sense_data_length; + + switch (pqi_error_info->data_out_result) { + case PQI_DATA_IN_OUT_GOOD: + ciss_cmd_status = CISS_CMD_STATUS_SUCCESS; + break; + case PQI_DATA_IN_OUT_UNDERFLOW: + ciss_cmd_status = CISS_CMD_STATUS_DATA_UNDERRUN; + break; + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW: + ciss_cmd_status = CISS_CMD_STATUS_DATA_OVERRUN; + break; + case PQI_DATA_IN_OUT_PROTOCOL_ERROR: + case PQI_DATA_IN_OUT_BUFFER_ERROR: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_DESCRIPTOR_AREA: + case PQI_DATA_IN_OUT_BUFFER_OVERFLOW_BRIDGE: + case PQI_DATA_IN_OUT_ERROR: + ciss_cmd_status = CISS_CMD_STATUS_PROTOCOL_ERROR; + break; + case PQI_DATA_IN_OUT_HARDWARE_ERROR: + case PQI_DATA_IN_OUT_PCIE_FABRIC_ERROR: + case PQI_DATA_IN_OUT_PCIE_COMPLETION_TIMEOUT: + case PQI_DATA_IN_OUT_PCIE_COMPLETER_ABORT_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST_RECEIVED: + case PQI_DATA_IN_OUT_PCIE_ECRC_CHECK_FAILED: + case PQI_DATA_IN_OUT_PCIE_UNSUPPORTED_REQUEST: + case PQI_DATA_IN_OUT_PCIE_ACS_VIOLATION: + case PQI_DATA_IN_OUT_PCIE_TLP_PREFIX_BLOCKED: + case PQI_DATA_IN_OUT_PCIE_POISONED_MEMORY_READ: + ciss_cmd_status = CISS_CMD_STATUS_HARDWARE_ERROR; + break; + case PQI_DATA_IN_OUT_UNSOLICITED_ABORT: + ciss_cmd_status = CISS_CMD_STATUS_UNSOLICITED_ABORT; + break; + case PQI_DATA_IN_OUT_ABORTED: + ciss_cmd_status = CISS_CMD_STATUS_ABORTED; + break; + case PQI_DATA_IN_OUT_TIMEOUT: + ciss_cmd_status = CISS_CMD_STATUS_TIMEOUT; + break; + default: + ciss_cmd_status = CISS_CMD_STATUS_TARGET_STATUS; + break; + } + + sense_data_length = + get_unaligned_le16(&pqi_error_info->sense_data_length); + if (sense_data_length == 0) + sense_data_length = + get_unaligned_le16(&pqi_error_info->response_data_length); + if (sense_data_length) + if (sense_data_length > sizeof(pqi_error_info->data)) + sense_data_length = sizeof(pqi_error_info->data); + + ciss_error_info->scsi_status = pqi_error_info->status; + ciss_error_info->command_status = ciss_cmd_status; + ciss_error_info->sense_data_length = sense_data_length; +} + +static int pqi_passthru_ioctl(struct pqi_ctrl_info *ctrl_info, void __user *arg) +{ + int rc; + char *kernel_buffer = NULL; + u16 iu_length; + size_t sense_data_length; + IOCTL_Command_struct iocommand; + struct pqi_raid_path_request request; + struct pqi_raid_error_info pqi_error_info; + struct ciss_error_info ciss_error_info; + + if (pqi_ctrl_offline(ctrl_info)) + return 
-ENXIO; + if (pqi_ofa_in_progress(ctrl_info) && pqi_ctrl_blocked(ctrl_info)) + return -EBUSY; + if (!arg) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (copy_from_user(&iocommand, arg, sizeof(iocommand))) + return -EFAULT; + if (iocommand.buf_size < 1 && + iocommand.Request.Type.Direction != XFER_NONE) + return -EINVAL; + if (iocommand.Request.CDBLen > sizeof(request.cdb)) + return -EINVAL; + if (iocommand.Request.Type.Type != TYPE_CMD) + return -EINVAL; + + switch (iocommand.Request.Type.Direction) { + case XFER_NONE: + case XFER_WRITE: + case XFER_READ: + case XFER_READ | XFER_WRITE: + break; + default: + return -EINVAL; + } + + if (iocommand.buf_size > 0) { + kernel_buffer = kmalloc(iocommand.buf_size, GFP_KERNEL); + if (!kernel_buffer) + return -ENOMEM; + if (iocommand.Request.Type.Direction & XFER_WRITE) { + if (copy_from_user(kernel_buffer, iocommand.buf, + iocommand.buf_size)) { + rc = -EFAULT; + goto out; + } + } else { + memset(kernel_buffer, 0, iocommand.buf_size); + } + } + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; + iu_length = offsetof(struct pqi_raid_path_request, sg_descriptors) - + PQI_REQUEST_HEADER_LENGTH; + memcpy(request.lun_number, iocommand.LUN_info.LunAddrBytes, + sizeof(request.lun_number)); + memcpy(request.cdb, iocommand.Request.CDB, iocommand.Request.CDBLen); + request.additional_cdb_bytes_usage = SOP_ADDITIONAL_CDB_BYTES_0; + + switch (iocommand.Request.Type.Direction) { + case XFER_NONE: + request.data_direction = SOP_NO_DIRECTION_FLAG; + break; + case XFER_WRITE: + request.data_direction = SOP_WRITE_FLAG; + break; + case XFER_READ: + request.data_direction = SOP_READ_FLAG; + break; + case XFER_READ | XFER_WRITE: + request.data_direction = SOP_BIDIRECTIONAL; + break; + } + + request.task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + + if (iocommand.buf_size > 0) { + put_unaligned_le32(iocommand.buf_size, &request.buffer_length); + + rc = pqi_map_single(ctrl_info->pci_dev, + &request.sg_descriptors[0], kernel_buffer, + iocommand.buf_size, DMA_BIDIRECTIONAL); + if (rc) + goto out; + + iu_length += sizeof(request.sg_descriptors[0]); + } + + put_unaligned_le16(iu_length, &request.header.iu_length); + + if (ctrl_info->raid_iu_timeout_supported) + put_unaligned_le32(iocommand.Request.Timeout, &request.timeout); + + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, + PQI_SYNC_FLAGS_INTERRUPTABLE, &pqi_error_info); + + if (iocommand.buf_size > 0) + pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, + DMA_BIDIRECTIONAL); + + memset(&iocommand.error_info, 0, sizeof(iocommand.error_info)); + + if (rc == 0) { + pqi_error_info_to_ciss(&pqi_error_info, &ciss_error_info); + iocommand.error_info.ScsiStatus = ciss_error_info.scsi_status; + iocommand.error_info.CommandStatus = + ciss_error_info.command_status; + sense_data_length = ciss_error_info.sense_data_length; + if (sense_data_length) { + if (sense_data_length > + sizeof(iocommand.error_info.SenseInfo)) + sense_data_length = + sizeof(iocommand.error_info.SenseInfo); + memcpy(iocommand.error_info.SenseInfo, + pqi_error_info.data, sense_data_length); + iocommand.error_info.SenseLen = sense_data_length; + } + } + + if (copy_to_user(arg, &iocommand, sizeof(iocommand))) { + rc = -EFAULT; + goto out; + } + + if (rc == 0 && iocommand.buf_size > 0 && + (iocommand.Request.Type.Direction & XFER_READ)) { + if (copy_to_user(iocommand.buf, kernel_buffer, + iocommand.buf_size)) { + rc = -EFAULT; + } + } + +out: + 
kfree(kernel_buffer); + + return rc; +} + +static int pqi_ioctl(struct scsi_device *sdev, IOCTL_INT cmd, void __user *arg) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = shost_to_hba(sdev->host); + + switch (cmd) { + case CCISS_DEREGDISK: + case CCISS_REGNEWDISK: + case CCISS_REGNEWD: + rc = pqi_scan_scsi_devices(ctrl_info); + break; + case CCISS_GETPCIINFO: + rc = pqi_getpciinfo_ioctl(ctrl_info, arg); + break; + case CCISS_GETDRIVVER: + rc = pqi_getdrivver_ioctl(arg); + break; + case CCISS_PASSTHRU: + rc = pqi_passthru_ioctl(ctrl_info, arg); + break; + default: + rc = -EINVAL; + break; + } + + return rc; +} + +static ssize_t pqi_firmware_version_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->firmware_version); +} + +static ssize_t pqi_driver_version_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", DRIVER_VERSION BUILD_TIMESTAMP); +} + +static ssize_t pqi_serial_number_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->serial_number); +} + +static ssize_t pqi_model_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->model); +} + +static ssize_t pqi_vendor_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", ctrl_info->vendor); +} + +static ssize_t pqi_host_rescan_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + + pqi_scan_start(shost); + + return count; +} + +static ssize_t pqi_lockup_action_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + int count = 0; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pqi_lockup_actions); i++) { + if (pqi_lockup_actions[i].action == pqi_lockup_action) + count += scnprintf(buffer + count, PAGE_SIZE - count, + "[%s] ", pqi_lockup_actions[i].name); + else + count += scnprintf(buffer + count, PAGE_SIZE - count, + "%s ", pqi_lockup_actions[i].name); + } + + count += scnprintf(buffer + count, PAGE_SIZE - count, "\n"); + + return count; +} + +static ssize_t pqi_lockup_action_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + unsigned int i; + char *action_name; + char action_name_buffer[32]; + + strlcpy(action_name_buffer, buffer, sizeof(action_name_buffer)); + action_name = strstrip(action_name_buffer); + + for (i = 0; i < ARRAY_SIZE(pqi_lockup_actions); i++) { + if (strcmp(action_name, pqi_lockup_actions[i].name) == 0) { + pqi_lockup_action = pqi_lockup_actions[i].action; + return count; + } + } + + return -EINVAL; +} + +static ssize_t pqi_host_enable_stream_detection_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost = 
class_to_shost(dev); + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + + return snprintf(buffer, 10, "%hhx\n", + ctrl_info->enable_stream_detection); +} + +static ssize_t pqi_host_enable_stream_detection_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + u8 set_stream_detection = 0; + + if (sscanf(buffer, "%hhx", &set_stream_detection) != 1) + return -EINVAL; + + ctrl_info->enable_stream_detection = set_stream_detection; + + return count; +} + +static ssize_t pqi_host_enable_r5_writes_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + + return snprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r5_writes); +} + +static ssize_t pqi_host_enable_r5_writes_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + u8 set_r5_writes = 0; + + if (sscanf(buffer, "%hhx", &set_r5_writes) != 1) + return -EINVAL; + + ctrl_info->enable_r5_writes = set_r5_writes; + + return count; +} + +static ssize_t pqi_host_enable_r6_writes_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + + return snprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r6_writes); +} + +static ssize_t pqi_host_enable_r6_writes_store(struct device *dev, + struct device_attribute *attr, const char *buffer, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + u8 set_r6_writes = 0; + + if (sscanf(buffer, "%hhx", &set_r6_writes) != 1) + return -EINVAL; + + ctrl_info->enable_r6_writes = set_r6_writes; + + return count; +} + +static DEVICE_ATTR(driver_version, S_IRUGO, pqi_driver_version_show, NULL); +static DEVICE_ATTR(firmware_version, S_IRUGO, pqi_firmware_version_show, NULL); +static DEVICE_ATTR(model, S_IRUGO, pqi_model_show, NULL); +static DEVICE_ATTR(serial_number, S_IRUGO, pqi_serial_number_show, NULL); +static DEVICE_ATTR(vendor, S_IRUGO, pqi_vendor_show, NULL); +static DEVICE_ATTR(rescan, S_IWUSR, NULL, pqi_host_rescan_store); +static DEVICE_ATTR(lockup_action, S_IWUSR | S_IRUGO, pqi_lockup_action_show, + pqi_lockup_action_store); +static DEVICE_ATTR(enable_stream_detection, S_IWUSR | S_IRUGO, + pqi_host_enable_stream_detection_show, + pqi_host_enable_stream_detection_store); +static DEVICE_ATTR(enable_r5_writes, S_IWUSR | S_IRUGO, + pqi_host_enable_r5_writes_show, pqi_host_enable_r5_writes_store); +static DEVICE_ATTR(enable_r6_writes, S_IWUSR | S_IRUGO, + pqi_host_enable_r6_writes_show, pqi_host_enable_r6_writes_store); + +static struct device_attribute *pqi_shost_attrs[] = { + &dev_attr_driver_version, + &dev_attr_firmware_version, + &dev_attr_model, + &dev_attr_serial_number, + &dev_attr_vendor, + &dev_attr_rescan, + &dev_attr_lockup_action, + &dev_attr_enable_stream_detection, + &dev_attr_enable_r5_writes, + &dev_attr_enable_r6_writes, + NULL +}; + +static ssize_t pqi_unique_id_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u8 unique_id[16]; + + sdev = 
to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + if (device->is_physical_device) + memcpy(unique_id, device->wwid, sizeof(device->wwid)); + else + memcpy(unique_id, device->volume_id, sizeof(device->volume_id)); + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, + "%02X%02X%02X%02X%02X%02X%02X%02X" + "%02X%02X%02X%02X%02X%02X%02X%02X\n", + unique_id[0], unique_id[1], unique_id[2], unique_id[3], + unique_id[4], unique_id[5], unique_id[6], unique_id[7], + unique_id[8], unique_id[9], unique_id[10], unique_id[11], + unique_id[12], unique_id[13], unique_id[14], unique_id[15]); +} + +static ssize_t pqi_lunid_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u8 lunid[8]; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + memcpy(lunid, device->scsi3addr, sizeof(lunid)); + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "0x%8phN\n", lunid); +} + +#define MAX_PATHS 8 + +static ssize_t pqi_path_info_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + int i; + int output_len = 0; + u8 box; + u8 bay; + u8 path_map_index; + char *active; + u8 phys_connector[2]; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + bay = device->bay; + for (i = 0; i < MAX_PATHS; i++) { + path_map_index = 1 << i; + if (i == device->active_path_index) + active = "Active"; + else if (device->path_map & path_map_index) + active = "Inactive"; + else + continue; + + output_len += scnprintf(buf + output_len, + PAGE_SIZE - output_len, + "[%d:%d:%d:%d] %20.20s ", + ctrl_info->scsi_host->host_no, + device->bus, device->target, + device->lun, + scsi_device_type(device->devtype)); + + if (device->devtype == TYPE_RAID || + pqi_is_logical_device(device)) + goto end_buffer; + + memcpy(&phys_connector, &device->phys_connector[i], + sizeof(phys_connector)); + if (phys_connector[0] < '0') + phys_connector[0] = '0'; + if (phys_connector[1] < '0') + phys_connector[1] = '0'; + + output_len += scnprintf(buf + output_len, + PAGE_SIZE - output_len, + "PORT: %.2s ", phys_connector); + + box = device->box[i]; + if (box != 0 && box != 0xFF) + output_len += scnprintf(buf + output_len, + PAGE_SIZE - output_len, + "BOX: %hhu ", box); + + if ((device->devtype == TYPE_DISK || + device->devtype == TYPE_ZBC) && + pqi_expose_device(device)) + output_len += scnprintf(buf + output_len, + PAGE_SIZE - output_len, + "BAY: %hhu ", bay); + +end_buffer: + output_len += scnprintf(buf + output_len, + PAGE_SIZE - output_len, + "%s\n", active); + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + 
+ return output_len; +} + +static ssize_t pqi_sas_address_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u64 sas_address; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device || !pqi_is_device_with_sas_address(device)) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + sas_address = device->sas_address; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "0x%016llx\n", sas_address); +} + +static ssize_t pqi_ssd_smart_path_enabled_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + buffer[0] = device->raid_bypass_enabled ? '1' : '0'; + buffer[1] = '\n'; + buffer[2] = '\0'; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return 2; +} + +static ssize_t pqi_raid_level_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + char *raid_level; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + if (pqi_is_logical_device(device)) + raid_level = pqi_raid_level_to_string(device->raid_level); + else + raid_level = "N/A"; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "%s\n", raid_level); +} + +static ssize_t pqi_raid_bypass_cnt_show(struct device *dev, + struct device_attribute *attr, char *buffer) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + int raid_bypass_cnt; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + raid_bypass_cnt = atomic_read(&device->raid_bypass_cnt); + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return scnprintf(buffer, PAGE_SIZE, "0x%x\n", raid_bypass_cnt); +} + +static ssize_t pqi_sas_ncq_prio_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + int output_len = 0; + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + output_len = snprintf(buf, PAGE_SIZE, "%d\n", + 
device->ncq_prio_enable); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return output_len; +} + +static ssize_t pqi_sas_ncq_prio_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pqi_ctrl_info *ctrl_info; + struct scsi_device *sdev; + struct pqi_scsi_dev *device; + unsigned long flags; + u8 ncq_prio_enable = 0; + + if (kstrtou8(buf, 0, &ncq_prio_enable)) + return -EINVAL; + + + sdev = to_scsi_device(dev); + ctrl_info = shost_to_hba(sdev->host); + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + device = sdev->hostdata; + + if (!device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -ENODEV; + } + + if (!device->ncq_prio_support || + !device->is_physical_device) { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + return -EINVAL; + } + + device->ncq_prio_enable = ncq_prio_enable; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return strlen(buf); +} + +static DEVICE_ATTR(lunid, S_IRUGO, pqi_lunid_show, NULL); +static DEVICE_ATTR(unique_id, S_IRUGO, pqi_unique_id_show, NULL); +static DEVICE_ATTR(path_info, S_IRUGO, pqi_path_info_show, NULL); +static DEVICE_ATTR(sas_address, S_IRUGO, pqi_sas_address_show, NULL); +static DEVICE_ATTR(ssd_smart_path_enabled, S_IRUGO, pqi_ssd_smart_path_enabled_show, NULL); +static DEVICE_ATTR(raid_level, S_IRUGO, pqi_raid_level_show, NULL); +static DEVICE_ATTR(raid_bypass_cnt, S_IRUGO, pqi_raid_bypass_cnt_show, NULL); +static DEVICE_ATTR(sas_ncq_prio_enable, S_IWUSR | S_IRUGO, + pqi_sas_ncq_prio_enable_show, pqi_sas_ncq_prio_enable_store); + +static struct device_attribute *pqi_sdev_attrs[] = { + &dev_attr_lunid, + &dev_attr_unique_id, + &dev_attr_path_info, + &dev_attr_sas_address, + &dev_attr_ssd_smart_path_enabled, + &dev_attr_raid_level, + &dev_attr_raid_bypass_cnt, + NULL +}; + +struct device_attribute *pqi_ncq_prio_sdev_attrs[] = { + &dev_attr_lunid, + &dev_attr_unique_id, + &dev_attr_path_info, + &dev_attr_sas_address, + &dev_attr_ssd_smart_path_enabled, + &dev_attr_raid_level, + &dev_attr_raid_bypass_cnt, + &dev_attr_sas_ncq_prio_enable, + NULL +}; + +static struct scsi_host_template pqi_driver_template = { + .module = THIS_MODULE, + .name = DRIVER_NAME_SHORT, + .proc_name = DRIVER_NAME_SHORT, + .queuecommand = PQI_SCSI_QUEUE_COMMAND, + .scan_start = pqi_scan_start, + .scan_finished = pqi_scan_finished, + .this_id = -1, + .eh_device_reset_handler = pqi_eh_device_reset_handler, + .ioctl = pqi_ioctl, + .slave_alloc = pqi_slave_alloc, + .slave_configure = pqi_slave_configure, + .sdev_attrs = pqi_sdev_attrs, + .shost_attrs = pqi_shost_attrs, +}; + +static int pqi_register_scsi(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct Scsi_Host *shost; + + pqi_compat_init_scsi_host_template(&pqi_driver_template); + + shost = scsi_host_alloc(&pqi_driver_template, sizeof(ctrl_info)); + if (!shost) { + dev_err(&ctrl_info->pci_dev->dev, "scsi_host_alloc failed\n"); + return -ENOMEM; + } + + shost->io_port = 0; + shost->n_io_port = 0; + shost->this_id = -1; + shost->max_channel = PQI_MAX_BUS; + shost->max_cmd_len = MAX_COMMAND_SIZE; + shost->max_lun = ~0; + shost->max_id = ~0; + shost->max_sectors = ctrl_info->max_sectors; + shost->can_queue = ctrl_info->scsi_ml_can_queue; + shost->cmd_per_lun = shost->can_queue; + shost->sg_tablesize = ctrl_info->sg_tablesize; + shost->transportt = pqi_sas_transport_template; + shost->irq = pqi_pci_irq_vector(ctrl_info->pci_dev, 0); + shost->unique_id = 
shost->irq; + shost->hostdata[0] = (unsigned long)ctrl_info; + pqi_compat_init_scsi_host(shost, ctrl_info); + + rc = scsi_add_host(shost, &ctrl_info->pci_dev->dev); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, "scsi_add_host failed\n"); + goto free_host; + } + + rc = pqi_add_sas_host(shost, ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, "add SAS host failed\n"); + goto remove_host; + } + + ctrl_info->scsi_host = shost; + + return 0; + +remove_host: + scsi_remove_host(shost); +free_host: + scsi_host_put(shost); + + return rc; +} + +static void pqi_unregister_scsi(struct pqi_ctrl_info *ctrl_info) +{ + struct Scsi_Host *shost; + + pqi_delete_sas_host(ctrl_info); + + shost = ctrl_info->scsi_host; + if (!shost) + return; + + scsi_remove_host(shost); + scsi_host_put(shost); +} + +static int pqi_wait_for_pqi_reset_completion(struct pqi_ctrl_info *ctrl_info) +{ + int rc = 0; + struct pqi_device_registers __iomem *pqi_registers; + unsigned long timeout; + unsigned int timeout_msecs; + union pqi_reset_register reset_reg; + + pqi_registers = ctrl_info->pqi_registers; + timeout_msecs = readw(&pqi_registers->max_reset_timeout) * 100; + timeout = msecs_to_jiffies(timeout_msecs) + jiffies; + + while (1) { + msleep(PQI_RESET_POLL_INTERVAL_MSECS); + reset_reg.all_bits = readl(&pqi_registers->device_reset); + if (reset_reg.bits.reset_action == PQI_RESET_ACTION_COMPLETED) + break; + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) { + rc = -ENXIO; + break; + } + if (time_after(jiffies, timeout)) { + rc = -ETIMEDOUT; + break; + } + } + + return rc; +} + +static int pqi_reset(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + union pqi_reset_register reset_reg; + + if (ctrl_info->pqi_reset_quiesce_supported) { + rc = sis_pqi_reset_quiesce(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "PQI reset failed during quiesce with error %d\n", rc); + return rc; + } + } + + reset_reg.all_bits = 0; + reset_reg.bits.reset_type = PQI_RESET_TYPE_HARD_RESET; + reset_reg.bits.reset_action = PQI_RESET_ACTION_RESET; + + writel(reset_reg.all_bits, &ctrl_info->pqi_registers->device_reset); + + rc = pqi_wait_for_pqi_reset_completion(ctrl_info); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "PQI reset failed with error %d\n", rc); + + return rc; +} + +static int pqi_get_ctrl_serial_number(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_sense_subsystem_info *sense_info; + + sense_info = kzalloc(sizeof(*sense_info), GFP_KERNEL); + if (!sense_info) + return -ENOMEM; + + rc = pqi_sense_subsystem_info(ctrl_info, sense_info); + if (rc) + goto out; + + memcpy(ctrl_info->serial_number, sense_info->ctrl_serial_number, + sizeof(sense_info->ctrl_serial_number)); + ctrl_info->serial_number[sizeof(sense_info->ctrl_serial_number)] = '\0'; + +out: + kfree(sense_info); + + return rc; +} + +static int pqi_get_ctrl_product_details(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct bmic_identify_controller *identify; + + identify = kmalloc(sizeof(*identify), GFP_KERNEL); + if (!identify) + return -ENOMEM; + + rc = pqi_identify_controller(ctrl_info, identify); + if (rc) + goto out; + + if (get_unaligned_le32(&identify->extra_controller_flags) & + BMIC_IDENTIFY_EXTRA_FLAGS_LONG_FW_VERSION_SUPPORTED) { + memcpy(ctrl_info->firmware_version, + identify->firmware_version_long, + sizeof(identify->firmware_version_long)); + } else { + memcpy(ctrl_info->firmware_version, + identify->firmware_version_short, + sizeof(identify->firmware_version_short)); + ctrl_info->firmware_version + 
[sizeof(identify->firmware_version_short)] = '\0'; + snprintf(ctrl_info->firmware_version + + strlen(ctrl_info->firmware_version), + sizeof(ctrl_info->firmware_version), + "-%u", + get_unaligned_le16(&identify->firmware_build_number)); + } + + memcpy(ctrl_info->model, identify->product_id, + sizeof(identify->product_id)); + ctrl_info->model[sizeof(identify->product_id)] = '\0'; + + memcpy(ctrl_info->vendor, identify->vendor_id, + sizeof(identify->vendor_id)); + ctrl_info->vendor[sizeof(identify->vendor_id)] = '\0'; + +out: + kfree(identify); + + return rc; +} + +struct pqi_config_table_section_info { + struct pqi_ctrl_info *ctrl_info; + void *section; + u32 section_offset; + void __iomem *section_iomem_addr; +}; + +static inline bool pqi_is_firmware_feature_supported( + struct pqi_config_table_firmware_features *firmware_features, + unsigned int bit_position) +{ + unsigned int byte_index; + + byte_index = bit_position / BITS_PER_BYTE; + + if (byte_index >= le16_to_cpu(firmware_features->num_elements)) + return false; + + return firmware_features->features_supported[byte_index] & + (1 << (bit_position % BITS_PER_BYTE)) ? true : false; +} + +static inline bool pqi_is_firmware_feature_enabled( + struct pqi_config_table_firmware_features *firmware_features, + void __iomem *firmware_features_iomem_addr, + unsigned int bit_position) +{ + unsigned int byte_index; + u8 __iomem *features_enabled_iomem_addr; + + byte_index = (bit_position / BITS_PER_BYTE) + + (le16_to_cpu(firmware_features->num_elements) * 2); + + features_enabled_iomem_addr = firmware_features_iomem_addr + + offsetof(struct pqi_config_table_firmware_features, + features_supported) + byte_index; + + return *((__force u8 *)features_enabled_iomem_addr) & + (1 << (bit_position % BITS_PER_BYTE)) ? 
true : false; +} + +static inline void pqi_request_firmware_feature( + struct pqi_config_table_firmware_features *firmware_features, + unsigned int bit_position) +{ + unsigned int byte_index; + + byte_index = (bit_position / BITS_PER_BYTE) + + le16_to_cpu(firmware_features->num_elements); + + firmware_features->features_supported[byte_index] |= + (1 << (bit_position % BITS_PER_BYTE)); +} + +static int pqi_config_table_update(struct pqi_ctrl_info *ctrl_info, + u16 first_section, u16 last_section) +{ + struct pqi_vendor_general_request request; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_VENDOR_GENERAL; + put_unaligned_le16(sizeof(request) - PQI_REQUEST_HEADER_LENGTH, + &request.header.iu_length); + put_unaligned_le16(PQI_VENDOR_GENERAL_CONFIG_TABLE_UPDATE, + &request.function_code); + put_unaligned_le16(first_section, + &request.data.config_table_update.first_section); + put_unaligned_le16(last_section, + &request.data.config_table_update.last_section); + + return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, + 0, NULL); +} + +static int pqi_enable_firmware_features(struct pqi_ctrl_info *ctrl_info, + struct pqi_config_table_firmware_features *firmware_features, + void __iomem *firmware_features_iomem_addr) +{ + void *features_requested; + void __iomem *features_requested_iomem_addr; + void __iomem *host_max_known_feature_iomem_addr; + + features_requested = firmware_features->features_supported + + le16_to_cpu(firmware_features->num_elements); + + features_requested_iomem_addr = firmware_features_iomem_addr + + (features_requested - (void *)firmware_features); + + memcpy_toio(features_requested_iomem_addr, features_requested, + le16_to_cpu(firmware_features->num_elements)); + + if (pqi_is_firmware_feature_supported(firmware_features, + PQI_FIRMWARE_FEATURE_MAX_KNOWN_FEATURE)) { + host_max_known_feature_iomem_addr = + features_requested_iomem_addr + + (le16_to_cpu(firmware_features->num_elements) * 2) + + sizeof(__le16); + writew(PQI_FIRMWARE_FEATURE_MAXIMUM, + host_max_known_feature_iomem_addr); + } + + return pqi_config_table_update(ctrl_info, + PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES, + PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES); +} + +struct pqi_firmware_feature { + char *feature_name; + unsigned int feature_bit; + bool supported; + bool enabled; + void (*feature_status)(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature); +}; + +static void pqi_firmware_feature_status(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature) +{ + if (!firmware_feature->supported) + return; + + if (firmware_feature->enabled) { + dev_info(&ctrl_info->pci_dev->dev, + "%s enabled\n", firmware_feature->feature_name); + return; + } + + dev_err(&ctrl_info->pci_dev->dev, "failed to enable %s\n", + firmware_feature->feature_name); +} + +static void pqi_ctrl_update_feature_flags(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature) +{ + switch (firmware_feature->feature_bit) { + case PQI_FIRMWARE_FEATURE_RAID_1_WRITE_BYPASS: + ctrl_info->enable_r1_writes = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_RAID_5_WRITE_BYPASS: + ctrl_info->enable_r5_writes = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_RAID_6_WRITE_BYPASS: + ctrl_info->enable_r6_writes = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_SOFT_RESET_HANDSHAKE: + ctrl_info->soft_reset_handshake_supported = + firmware_feature->enabled && + 
ctrl_info->soft_reset_status; + break; + case PQI_FIRMWARE_FEATURE_RAID_IU_TIMEOUT: + ctrl_info->raid_iu_timeout_supported = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT: + ctrl_info->tmf_iu_timeout_supported = firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN: + ctrl_info->unique_wwid_in_report_phys_lun_supported = + firmware_feature->enabled; + break; + case PQI_FIRMWARE_FEATURE_FW_TRIAGE: + ctrl_info->firmware_triage_supported = firmware_feature->enabled; + pqi_save_fw_triage_setting(ctrl_info, firmware_feature->enabled); + break; + case PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5: + ctrl_info->rpl_extended_format_4_5_supported = firmware_feature->enabled; + break; + } + + pqi_firmware_feature_status(ctrl_info, firmware_feature); +} + +static inline void pqi_firmware_feature_update(struct pqi_ctrl_info *ctrl_info, + struct pqi_firmware_feature *firmware_feature) +{ + if (firmware_feature->feature_status) + firmware_feature->feature_status(ctrl_info, firmware_feature); +} + +static DEFINE_MUTEX(pqi_firmware_features_mutex); + +static struct pqi_firmware_feature pqi_firmware_features[] = { + { + .feature_name = "Online Firmware Activation", + .feature_bit = PQI_FIRMWARE_FEATURE_OFA, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "Serial Management Protocol", + .feature_bit = PQI_FIRMWARE_FEATURE_SMP, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "Maximum Known Feature", + .feature_bit = PQI_FIRMWARE_FEATURE_MAX_KNOWN_FEATURE, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 0 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_0_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 1 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_1_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 5 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_5_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 6 Read Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_6_READ_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 0 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_0_WRITE_BYPASS, + .feature_status = pqi_firmware_feature_status, + }, + { + .feature_name = "RAID 1 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_1_WRITE_BYPASS, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID 5 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_5_WRITE_BYPASS, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID 6 Write Bypass", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_6_WRITE_BYPASS, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "New Soft Reset Handshake", + .feature_bit = PQI_FIRMWARE_FEATURE_SOFT_RESET_HANDSHAKE, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID IU Timeout", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_IU_TIMEOUT, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "TMF IU Timeout", + .feature_bit = PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RAID Bypass on encrypted logical volumes on NVMe", + .feature_bit = PQI_FIRMWARE_FEATURE_RAID_BYPASS_ON_ENCRYPTED_NVME, + .feature_status = 
pqi_firmware_feature_status, + }, + { + .feature_name = "Unique WWID in Report Physical LUN", + .feature_bit = PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "Firmware Triage", + .feature_bit = PQI_FIRMWARE_FEATURE_FW_TRIAGE, + .feature_status = pqi_ctrl_update_feature_flags, + }, + { + .feature_name = "RPL Extended Formats 4 and 5", + .feature_bit = PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5, + .feature_status = pqi_ctrl_update_feature_flags, + }, +}; + +static void pqi_process_firmware_features( + struct pqi_config_table_section_info *section_info) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + struct pqi_config_table_firmware_features *firmware_features; + void __iomem *firmware_features_iomem_addr; + unsigned int i; + unsigned int num_features_supported; + + ctrl_info = section_info->ctrl_info; + firmware_features = section_info->section; + firmware_features_iomem_addr = section_info->section_iomem_addr; + + for (i = 0, num_features_supported = 0; + i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (pqi_is_firmware_feature_supported(firmware_features, + pqi_firmware_features[i].feature_bit)) { + pqi_firmware_features[i].supported = true; + num_features_supported++; + } else { + pqi_firmware_feature_update(ctrl_info, + &pqi_firmware_features[i]); + } + } + + if (num_features_supported == 0) + return; + + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (!pqi_firmware_features[i].supported) + continue; + pqi_request_firmware_feature(firmware_features, + pqi_firmware_features[i].feature_bit); + } + + rc = pqi_enable_firmware_features(ctrl_info, firmware_features, + firmware_features_iomem_addr); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to enable firmware features in PQI configuration table\n"); + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (!pqi_firmware_features[i].supported) + continue; + pqi_firmware_feature_update(ctrl_info, + &pqi_firmware_features[i]); + } + return; + } + + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + if (!pqi_firmware_features[i].supported) + continue; + if (pqi_is_firmware_feature_enabled(firmware_features, + firmware_features_iomem_addr, + pqi_firmware_features[i].feature_bit)) { + pqi_firmware_features[i].enabled = true; + } + pqi_firmware_feature_update(ctrl_info, + &pqi_firmware_features[i]); + } +} + +static void pqi_init_firmware_features(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pqi_firmware_features); i++) { + pqi_firmware_features[i].supported = false; + pqi_firmware_features[i].enabled = false; + } +} + +static void pqi_process_firmware_features_section( + struct pqi_config_table_section_info *section_info) +{ + mutex_lock(&pqi_firmware_features_mutex); + pqi_init_firmware_features(); + pqi_process_firmware_features(section_info); + mutex_unlock(&pqi_firmware_features_mutex); +} + +/* + * Reset all controller settings that can be initialized during the processing + * of the PQI Configuration Table. 
+ */ + +static void pqi_ctrl_reset_config(struct pqi_ctrl_info *ctrl_info) +{ + ctrl_info->heartbeat_counter = NULL; + ctrl_info->soft_reset_status = NULL; + ctrl_info->soft_reset_handshake_supported = false; + ctrl_info->enable_r1_writes = false; + ctrl_info->enable_r5_writes = false; + ctrl_info->enable_r6_writes = false; + ctrl_info->raid_iu_timeout_supported = false; + ctrl_info->tmf_iu_timeout_supported = false; + ctrl_info->unique_wwid_in_report_phys_lun_supported = false; + ctrl_info->firmware_triage_supported = false; + ctrl_info->rpl_extended_format_4_5_supported = false; +} + +static int pqi_process_config_table(struct pqi_ctrl_info *ctrl_info) +{ + u32 table_length; + u32 section_offset; + bool firmware_feature_section_present; + void __iomem *table_iomem_addr; + struct pqi_config_table *config_table; + struct pqi_config_table_section_header *section; + struct pqi_config_table_section_info section_info; + struct pqi_config_table_section_info feature_section_info; + + table_length = ctrl_info->config_table_length; + if (table_length == 0) + return 0; + + config_table = kmalloc(table_length, GFP_KERNEL); + if (!config_table) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate memory for PQI configuration table\n"); + return -ENOMEM; + } + + /* + * Copy the config table contents from I/O memory space into the + * temporary buffer. + */ + table_iomem_addr = ctrl_info->iomem_base + ctrl_info->config_table_offset; + memcpy_fromio(config_table, table_iomem_addr, table_length); + + firmware_feature_section_present = false; + section_info.ctrl_info = ctrl_info; + section_offset = get_unaligned_le32(&config_table->first_section_offset); + + while (section_offset) { + section = (void *)config_table + section_offset; + + section_info.section = section; + section_info.section_offset = section_offset; + section_info.section_iomem_addr = table_iomem_addr + section_offset; + + switch (get_unaligned_le16(§ion->section_id)) { + case PQI_CONFIG_TABLE_SECTION_FIRMWARE_FEATURES: + firmware_feature_section_present = true; + feature_section_info = section_info; + break; + case PQI_CONFIG_TABLE_SECTION_HEARTBEAT: + if (pqi_disable_heartbeat) + dev_warn(&ctrl_info->pci_dev->dev, + "heartbeat disabled by module parameter\n"); + else + ctrl_info->heartbeat_counter = + table_iomem_addr + + section_offset + + offsetof(struct pqi_config_table_heartbeat, + heartbeat_counter); + break; + case PQI_CONFIG_TABLE_SECTION_SOFT_RESET: + ctrl_info->soft_reset_status = + table_iomem_addr + + section_offset + + offsetof(struct pqi_config_table_soft_reset, + soft_reset_status); + break; + } + + section_offset = get_unaligned_le16(§ion->next_section_offset); + } + + /* + * We process the firmware feature section after all other sections + * have been processed so that the feature bit callbacks can take + * into account the settings configured by other sections. + */ + if (firmware_feature_section_present) + pqi_process_firmware_features_section(&feature_section_info); + + kfree(config_table); + + return 0; +} + +/* Switches the controller from PQI mode back into SIS mode. 
*/ + +static int pqi_revert_to_sis_mode(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + + pqi_change_irq_mode(ctrl_info, IRQ_MODE_NONE); + rc = pqi_reset(ctrl_info); + if (rc) + return rc; + rc = sis_reenable_sis_mode(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "re-enabling SIS mode failed with error %d\n", rc); + return rc; + } + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + + return 0; +} + +/* + * If the controller isn't already in SIS mode, this function forces it into + * SIS mode. + */ + +static int pqi_force_sis_mode(struct pqi_ctrl_info *ctrl_info) +{ + if (!sis_is_firmware_running(ctrl_info)) + return -ENXIO; + + if (pqi_get_ctrl_mode(ctrl_info) == SIS_MODE) + return 0; + + if (sis_is_kernel_up(ctrl_info)) { + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + return 0; + } + + return pqi_revert_to_sis_mode(ctrl_info); +} + +static int pqi_ctrl_init(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + u32 product_id; + + if (reset_devices) { + if (pqi_is_fw_triage_supported(ctrl_info)) { + rc = sis_wait_for_fw_triage_completion(ctrl_info); + if (rc) + return rc; + } + sis_soft_reset(ctrl_info); + ssleep(PQI_POST_RESET_DELAY_SECS); + } else { + rc = pqi_force_sis_mode(ctrl_info); + if (rc) + return rc; + } + + /* + * Wait until the controller is ready to start accepting SIS + * commands. + */ + rc = sis_wait_for_ctrl_ready(ctrl_info); + if (rc) + return rc; + + /* + * Get the controller properties. This allows us to determine + * whether or not it supports PQI mode. + */ + rc = sis_get_ctrl_properties(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller properties\n"); + return rc; + } + + rc = sis_get_pqi_capabilities(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller capabilities\n"); + return rc; + } + + product_id = sis_get_product_id(ctrl_info); + ctrl_info->product_id = (u8)product_id; + ctrl_info->product_revision = (u8)(product_id >> 8); + + if (ctrl_info->product_id != PQI_CTRL_PRODUCT_ID_GEN1) + ctrl_info->enable_stream_detection = true; + + if (reset_devices) { + if (ctrl_info->max_outstanding_requests > + PQI_MAX_OUTSTANDING_REQUESTS_KDUMP) + ctrl_info->max_outstanding_requests = + PQI_MAX_OUTSTANDING_REQUESTS_KDUMP; + } else { + if (ctrl_info->max_outstanding_requests > + PQI_MAX_OUTSTANDING_REQUESTS) + ctrl_info->max_outstanding_requests = + PQI_MAX_OUTSTANDING_REQUESTS; + } + + pqi_calculate_io_resources(ctrl_info); + + rc = pqi_alloc_error_buffer(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate PQI error buffer\n"); + return rc; + } + + /* + * If the function we are about to call succeeds, the + * controller will transition from legacy SIS mode + * into PQI mode. + */ + rc = sis_init_base_struct_addr(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error initializing PQI mode\n"); + return rc; + } + + /* Wait for the controller to complete the SIS -> PQI transition. */ + rc = pqi_wait_for_pqi_mode_ready(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "transition to PQI mode failed\n"); + return rc; + } + + /* From here on, we are running in PQI mode. 
*/ + ctrl_info->pqi_mode_enabled = true; + pqi_save_ctrl_mode(ctrl_info, PQI_MODE); + + rc = pqi_alloc_admin_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate admin queues\n"); + return rc; + } + + rc = pqi_create_admin_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating admin queues\n"); + return rc; + } + + rc = pqi_report_device_capability(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "obtaining device capability failed\n"); + return rc; + } + + rc = pqi_validate_device_capability(ctrl_info); + if (rc) + return rc; + + pqi_calculate_queue_resources(ctrl_info); + + rc = pqi_enable_msix_interrupts(ctrl_info); + if (rc) + return rc; + + if (ctrl_info->num_msix_vectors_enabled < ctrl_info->num_queue_groups) { + ctrl_info->max_msix_vectors = + ctrl_info->num_msix_vectors_enabled; + pqi_calculate_queue_resources(ctrl_info); + } + + rc = pqi_alloc_io_resources(ctrl_info); + if (rc) + return rc; + + rc = pqi_alloc_operational_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to allocate operational queues\n"); + return rc; + } + + pqi_init_operational_queues(ctrl_info); + + rc = pqi_create_queues(ctrl_info); + if (rc) + return rc; + + rc = pqi_request_irqs(ctrl_info); + if (rc) + return rc; + + pqi_change_irq_mode(ctrl_info, IRQ_MODE_MSIX); + + ctrl_info->controller_online = true; + + rc = pqi_process_config_table(ctrl_info); + if (rc) + return rc; + + pqi_start_heartbeat_timer(ctrl_info); + + if (ctrl_info->enable_r5_writes || ctrl_info->enable_r6_writes) { + rc = pqi_get_advanced_raid_bypass_config(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining advanced RAID bypass configuration\n"); + return rc; + } + ctrl_info->ciss_report_log_flags |= + CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX; + } + + rc = pqi_enable_events(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling events\n"); + return rc; + } + + /* Register with the SCSI subsystem. 
*/ + rc = pqi_register_scsi(ctrl_info); + if (rc) + return rc; + + rc = pqi_get_ctrl_product_details(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining product details\n"); + return rc; + } + + rc = pqi_get_ctrl_serial_number(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining ctrl serial number\n"); + return rc; + } + + rc = pqi_set_diag_rescan(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling multi-lun rescan\n"); + return rc; + } + + rc = pqi_write_driver_version_to_host_wellness(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error updating host wellness\n"); + return rc; + } + + pqi_schedule_update_time_worker(ctrl_info); + + pqi_scan_scsi_devices(ctrl_info); + + return 0; +} + +static void pqi_reinit_queues(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct pqi_admin_queues *admin_queues; + struct pqi_event_queue *event_queue; + + admin_queues = &ctrl_info->admin_queues; + admin_queues->iq_pi_copy = 0; + admin_queues->oq_ci_copy = 0; + writel(0, admin_queues->oq_pi); + + for (i = 0; i < ctrl_info->num_queue_groups; i++) { + ctrl_info->queue_groups[i].iq_pi_copy[RAID_PATH] = 0; + ctrl_info->queue_groups[i].iq_pi_copy[AIO_PATH] = 0; + ctrl_info->queue_groups[i].oq_ci_copy = 0; + + writel(0, ctrl_info->queue_groups[i].iq_ci[RAID_PATH]); + writel(0, ctrl_info->queue_groups[i].iq_ci[AIO_PATH]); + writel(0, ctrl_info->queue_groups[i].oq_pi); + } + + event_queue = &ctrl_info->event_queue; + writel(0, event_queue->oq_pi); + event_queue->oq_ci_copy = 0; +} + +static int pqi_ctrl_init_resume(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + + rc = pqi_force_sis_mode(ctrl_info); + if (rc) + return rc; + + /* + * Wait until the controller is ready to start accepting SIS + * commands. + */ + rc = sis_wait_for_ctrl_ready_resume(ctrl_info); + if (rc) + return rc; + + /* + * Get the controller properties. This allows us to determine + * whether or not it supports PQI mode. + */ + rc = sis_get_ctrl_properties(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller properties\n"); + return rc; + } + + rc = sis_get_pqi_capabilities(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining controller capabilities\n"); + return rc; + } + + /* + * If the function we are about to call succeeds, the + * controller will transition from legacy SIS mode + * into PQI mode. + */ + rc = sis_init_base_struct_addr(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error initializing PQI mode\n"); + return rc; + } + + /* Wait for the controller to complete the SIS -> PQI transition. */ + rc = pqi_wait_for_pqi_mode_ready(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "transition to PQI mode failed\n"); + return rc; + } + + /* From here on, we are running in PQI mode. 
*/ + ctrl_info->pqi_mode_enabled = true; + pqi_save_ctrl_mode(ctrl_info, PQI_MODE); + + pqi_reinit_queues(ctrl_info); + + rc = pqi_create_admin_queues(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error creating admin queues\n"); + return rc; + } + + rc = pqi_create_queues(ctrl_info); + if (rc) + return rc; + + pqi_change_irq_mode(ctrl_info, IRQ_MODE_MSIX); + + ctrl_info->controller_online = true; + pqi_ctrl_unblock_requests(ctrl_info); + + pqi_ctrl_reset_config(ctrl_info); + + rc = pqi_process_config_table(ctrl_info); + if (rc) + return rc; + + pqi_start_heartbeat_timer(ctrl_info); + + if (ctrl_info->enable_r5_writes || ctrl_info->enable_r6_writes) { + rc = pqi_get_advanced_raid_bypass_config(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining advanced RAID bypass configuration\n"); + return rc; + } + ctrl_info->ciss_report_log_flags |= + CISS_REPORT_LOG_FLAG_DRIVE_TYPE_MIX; + } + + rc = pqi_enable_events(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling events\n"); + return rc; + } + + rc = pqi_get_ctrl_product_details(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error obtaining product details\n"); + return rc; + } + + rc = pqi_set_diag_rescan(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error enabling multi-lun rescan\n"); + return rc; + } + + rc = pqi_write_driver_version_to_host_wellness(ctrl_info); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "error updating host wellness\n"); + return rc; + } + + if (pqi_ofa_in_progress(ctrl_info)) + pqi_ctrl_unblock_scan(ctrl_info); + + pqi_scan_scsi_devices(ctrl_info); + + return 0; +} + +static inline int pqi_set_pcie_completion_timeout(struct pci_dev *pci_dev, u16 timeout) +{ + return pcie_capability_clear_and_set_word(pci_dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_COMP_TIMEOUT, timeout); +} + +static int pqi_pci_init(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + u64 mask; + + rc = pci_enable_device(ctrl_info->pci_dev); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to enable PCI device\n"); + return rc; + } + + if (sizeof(dma_addr_t) > 4) + mask = DMA_BIT_MASK(64); + else + mask = DMA_BIT_MASK(32); + + rc = pqi_dma_set_mask_and_coherent(&ctrl_info->pci_dev->dev, mask); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, "failed to set DMA mask\n"); + goto disable_device; + } + + rc = pci_request_regions(ctrl_info->pci_dev, DRIVER_NAME_SHORT); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to obtain PCI resources\n"); + goto disable_device; + } + + ctrl_info->iomem_base = ioremap_nocache(pci_resource_start( + ctrl_info->pci_dev, 0), + sizeof(struct pqi_ctrl_registers)); + if (!ctrl_info->iomem_base) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to map memory for controller registers\n"); + rc = -ENOMEM; + goto release_regions; + } + +#define PCI_EXP_COMP_TIMEOUT_65_TO_210_MS 0x6 + + /* Increase the PCIe completion timeout. */ + rc = pqi_set_pcie_completion_timeout(ctrl_info->pci_dev, + PCI_EXP_COMP_TIMEOUT_65_TO_210_MS); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "failed to set PCIe completion timeout\n"); + goto release_regions; + } + + /* Enable bus mastering. 
*/ + pci_set_master(ctrl_info->pci_dev); + + ctrl_info->registers = ctrl_info->iomem_base; + ctrl_info->pqi_registers = &ctrl_info->registers->pqi_registers; + + pci_set_drvdata(ctrl_info->pci_dev, ctrl_info); + + return 0; + +release_regions: + pci_release_regions(ctrl_info->pci_dev); +disable_device: + pci_disable_device(ctrl_info->pci_dev); + + return rc; +} + +static void pqi_cleanup_pci_init(struct pqi_ctrl_info *ctrl_info) +{ + iounmap(ctrl_info->iomem_base); + pci_release_regions(ctrl_info->pci_dev); + if (pci_is_enabled(ctrl_info->pci_dev)) + pci_disable_device(ctrl_info->pci_dev); + pci_set_drvdata(ctrl_info->pci_dev, NULL); +} + +static struct pqi_ctrl_info *pqi_alloc_ctrl_info(int numa_node) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = kzalloc_node(sizeof(struct pqi_ctrl_info), + GFP_KERNEL, numa_node); + if (!ctrl_info) + return NULL; + + mutex_init(&ctrl_info->scan_mutex); + mutex_init(&ctrl_info->lun_reset_mutex); + mutex_init(&ctrl_info->ofa_mutex); + + INIT_LIST_HEAD(&ctrl_info->scsi_device_list); + spin_lock_init(&ctrl_info->scsi_device_list_lock); + + INIT_WORK(&ctrl_info->event_work, pqi_event_worker); + atomic_set(&ctrl_info->num_interrupts, 0); + atomic_set(&ctrl_info->total_scmds_outstanding, 0); + + INIT_DELAYED_WORK(&ctrl_info->rescan_work, pqi_rescan_worker); + INIT_DELAYED_WORK(&ctrl_info->update_time_work, pqi_update_time_worker); + + timer_setup(&ctrl_info->heartbeat_timer, pqi_heartbeat_timer_handler, 0); + INIT_WORK(&ctrl_info->ctrl_offline_work, pqi_ctrl_offline_worker); + + INIT_WORK(&ctrl_info->ofa_memory_alloc_work, pqi_ofa_memory_alloc_worker); + INIT_WORK(&ctrl_info->ofa_quiesce_work, pqi_ofa_quiesce_worker); + + sema_init(&ctrl_info->sync_request_sem, + PQI_RESERVED_IO_SLOTS_SYNCHRONOUS_REQUESTS); + init_waitqueue_head(&ctrl_info->block_requests_wait); + + ctrl_info->ctrl_id = atomic_inc_return(&pqi_controller_count) - 1; + ctrl_info->irq_mode = IRQ_MODE_NONE; + ctrl_info->max_msix_vectors = PQI_MAX_MSIX_VECTORS; + + ctrl_info->ciss_report_log_flags = CISS_REPORT_LOG_FLAG_UNIQUE_LUN_ID; + ctrl_info->max_transfer_encrypted_sas_sata = + PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_SAS_SATA; + ctrl_info->max_transfer_encrypted_nvme = + PQI_DEFAULT_MAX_TRANSFER_ENCRYPTED_NVME; + ctrl_info->max_write_raid_5_6 = PQI_DEFAULT_MAX_WRITE_RAID_5_6; + ctrl_info->max_write_raid_1_10_2drive = ~0; + ctrl_info->max_write_raid_1_10_3drive = ~0; + + return ctrl_info; +} + +static inline void pqi_free_ctrl_info(struct pqi_ctrl_info *ctrl_info) +{ + kfree(ctrl_info); +} + +static void pqi_free_interrupts(struct pqi_ctrl_info *ctrl_info) +{ + pqi_free_irqs(ctrl_info); + pqi_disable_msix_interrupts(ctrl_info); +} + +static void pqi_free_ctrl_resources(struct pqi_ctrl_info *ctrl_info) +{ + pqi_stop_heartbeat_timer(ctrl_info); + pqi_free_interrupts(ctrl_info); + if (ctrl_info->queue_memory_base) + dma_free_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->queue_memory_length, + ctrl_info->queue_memory_base, + ctrl_info->queue_memory_base_dma_handle); + if (ctrl_info->admin_queue_memory_base) + dma_free_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->admin_queue_memory_length, + ctrl_info->admin_queue_memory_base, + ctrl_info->admin_queue_memory_base_dma_handle); + pqi_free_all_io_requests(ctrl_info); + if (ctrl_info->error_buffer) + dma_free_coherent(&ctrl_info->pci_dev->dev, + ctrl_info->error_buffer_length, + ctrl_info->error_buffer, + ctrl_info->error_buffer_dma_handle); + if (ctrl_info->iomem_base) + pqi_cleanup_pci_init(ctrl_info); + pqi_free_ctrl_info(ctrl_info); +} + 
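For readers following the firmware-feature negotiation earlier in this hunk: pqi_is_firmware_feature_supported(), pqi_request_firmware_feature() and pqi_is_firmware_feature_enabled() all index into one byte array that holds three consecutive bitmaps of num_elements bytes each -- features supported by the firmware (offset 0), features requested by the host (offset num_elements) and features enabled by the firmware (offset 2 * num_elements). The stand-alone sketch below (illustrative only, not part of the patch; all names are made up) mirrors that index arithmetic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8

/* Hypothetical regions, in the order the driver helpers assume. */
enum ff_region { FF_SUPPORTED, FF_REQUESTED, FF_ENABLED };

static bool ff_test_bit(const uint8_t *bitmaps, uint16_t num_elements,
			enum ff_region region, unsigned int bit)
{
	unsigned int byte_index = bit / BITS_PER_BYTE;

	if (byte_index >= num_elements)
		return false;	/* feature bit unknown to this firmware */

	byte_index += region * num_elements;

	return bitmaps[byte_index] & (1 << (bit % BITS_PER_BYTE));
}

int main(void)
{
	/* num_elements = 2: supported | requested | enabled, back to back */
	uint8_t bitmaps[] = {
		0x05, 0x00,	/* supported: feature bits 0 and 2 */
		0x05, 0x00,	/* requested: feature bits 0 and 2 */
		0x01, 0x00,	/* enabled:   feature bit 0 only    */
	};

	printf("bit 2 supported: %d\n", ff_test_bit(bitmaps, 2, FF_SUPPORTED, 2));
	printf("bit 2 enabled:   %d\n", ff_test_bit(bitmaps, 2, FF_ENABLED, 2));

	return 0;
}

In the driver itself, pqi_enable_firmware_features() copies the host-requested bitmap to the controller with memcpy_toio(), optionally writes PQI_FIRMWARE_FEATURE_MAXIMUM into a host max-known-feature word located after the bitmaps, and then issues a vendor-general config-table-update request covering the firmware-features section.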
+static void pqi_remove_ctrl(struct pqi_ctrl_info *ctrl_info) +{ + pqi_cancel_rescan_worker(ctrl_info); + pqi_cancel_update_time_worker(ctrl_info); + pqi_remove_all_scsi_devices(ctrl_info); + pqi_unregister_scsi(ctrl_info); + if (ctrl_info->pqi_mode_enabled) + pqi_revert_to_sis_mode(ctrl_info); + pqi_free_ctrl_resources(ctrl_info); +} + +static void pqi_ofa_ctrl_quiesce(struct pqi_ctrl_info *ctrl_info) +{ + pqi_ctrl_block_scan(ctrl_info); + pqi_scsi_block_requests(ctrl_info); + pqi_ctrl_block_device_reset(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_stop_heartbeat_timer(ctrl_info); +} + +static void pqi_ofa_ctrl_unquiesce(struct pqi_ctrl_info *ctrl_info) +{ + pqi_start_heartbeat_timer(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + pqi_ctrl_unblock_scan(ctrl_info); +} + +static int pqi_ofa_alloc_mem(struct pqi_ctrl_info *ctrl_info, u32 total_size, u32 chunk_size) +{ + int i; + u32 sg_count; + struct device *dev; + struct pqi_ofa_memory *ofap; + struct pqi_sg_descriptor *mem_descriptor; + dma_addr_t dma_handle; + + ofap = ctrl_info->pqi_ofa_mem_virt_addr; + + sg_count = DIV_ROUND_UP(total_size, chunk_size); + if (sg_count == 0 || sg_count > PQI_OFA_MAX_SG_DESCRIPTORS) + goto out; + + ctrl_info->pqi_ofa_chunk_virt_addr = kmalloc(sg_count * sizeof(void *), GFP_KERNEL); + if (!ctrl_info->pqi_ofa_chunk_virt_addr) + goto out; + + dev = &ctrl_info->pci_dev->dev; + + for (i = 0; i < sg_count; i++) { + ctrl_info->pqi_ofa_chunk_virt_addr[i] = + dma_zalloc_coherent(dev, chunk_size, &dma_handle, GFP_KERNEL); + if (!ctrl_info->pqi_ofa_chunk_virt_addr[i]) + goto out_free_chunks; + mem_descriptor = &ofap->sg_descriptor[i]; + put_unaligned_le64((u64)dma_handle, &mem_descriptor->address); + put_unaligned_le32(chunk_size, &mem_descriptor->length); + } + + put_unaligned_le32(CISS_SG_LAST, &mem_descriptor->flags); + put_unaligned_le16(sg_count, &ofap->num_memory_descriptors); + put_unaligned_le32(sg_count * chunk_size, &ofap->bytes_allocated); + + return 0; + +out_free_chunks: + while (--i >= 0) { + mem_descriptor = &ofap->sg_descriptor[i]; + dma_free_coherent(dev, chunk_size, + ctrl_info->pqi_ofa_chunk_virt_addr[i], + get_unaligned_le64(&mem_descriptor->address)); + } + kfree(ctrl_info->pqi_ofa_chunk_virt_addr); + +out: + return -ENOMEM; +} + +static int pqi_ofa_alloc_host_buffer(struct pqi_ctrl_info *ctrl_info) +{ + u32 total_size; + u32 chunk_size; + u32 min_chunk_size; + + if (ctrl_info->ofa_bytes_requested == 0) + return 0; + + total_size = PAGE_ALIGN(ctrl_info->ofa_bytes_requested); + min_chunk_size = DIV_ROUND_UP(total_size, PQI_OFA_MAX_SG_DESCRIPTORS); + min_chunk_size = PAGE_ALIGN(min_chunk_size); + + for (chunk_size = total_size; chunk_size >= min_chunk_size;) { + if (pqi_ofa_alloc_mem(ctrl_info, total_size, chunk_size) == 0) + return 0; + chunk_size /= 2; + chunk_size = PAGE_ALIGN(chunk_size); + } + + return -ENOMEM; +} + +static void pqi_ofa_setup_host_buffer(struct pqi_ctrl_info *ctrl_info) +{ + struct device *dev; + struct pqi_ofa_memory *ofap; + + dev = &ctrl_info->pci_dev->dev; + + ofap = dma_zalloc_coherent(dev, sizeof(*ofap), + &ctrl_info->pqi_ofa_mem_dma_handle, GFP_KERNEL); + if (!ofap) + return; + + ctrl_info->pqi_ofa_mem_virt_addr = ofap; + + if (pqi_ofa_alloc_host_buffer(ctrl_info) < 0) { + dev_err(dev, + "failed to allocate host buffer for Online Firmware Activation\n"); + dma_free_coherent(dev, sizeof(*ofap), ofap, 
ctrl_info->pqi_ofa_mem_dma_handle); + ctrl_info->pqi_ofa_mem_virt_addr = NULL; + return; + } + + put_unaligned_le16(PQI_OFA_VERSION, &ofap->version); + memcpy(&ofap->signature, PQI_OFA_SIGNATURE, sizeof(ofap->signature)); +} + +static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct device *dev; + struct pqi_ofa_memory *ofap; + struct pqi_sg_descriptor *mem_descriptor; + unsigned int num_memory_descriptors; + + ofap = ctrl_info->pqi_ofa_mem_virt_addr; + if (!ofap) + return; + + dev = &ctrl_info->pci_dev->dev; + + if (get_unaligned_le32(&ofap->bytes_allocated) == 0) + goto out; + + mem_descriptor = ofap->sg_descriptor; + num_memory_descriptors = + get_unaligned_le16(&ofap->num_memory_descriptors); + + for (i = 0; i < num_memory_descriptors; i++) { + dma_free_coherent(dev, + get_unaligned_le32(&mem_descriptor[i].length), + ctrl_info->pqi_ofa_chunk_virt_addr[i], + get_unaligned_le64(&mem_descriptor[i].address)); + } + kfree(ctrl_info->pqi_ofa_chunk_virt_addr); + +out: + dma_free_coherent(dev, sizeof(*ofap), ofap, + ctrl_info->pqi_ofa_mem_dma_handle); + ctrl_info->pqi_ofa_mem_virt_addr = NULL; +} + +static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info) +{ + u32 buffer_length; + struct pqi_vendor_general_request request; + struct pqi_ofa_memory *ofap; + + memset(&request, 0, sizeof(request)); + + request.header.iu_type = PQI_REQUEST_IU_VENDOR_GENERAL; + put_unaligned_le16(sizeof(request) - PQI_REQUEST_HEADER_LENGTH, + &request.header.iu_length); + put_unaligned_le16(PQI_VENDOR_GENERAL_HOST_MEMORY_UPDATE, + &request.function_code); + + ofap = ctrl_info->pqi_ofa_mem_virt_addr; + + if (ofap) { + buffer_length = offsetof(struct pqi_ofa_memory, sg_descriptor) + + get_unaligned_le16(&ofap->num_memory_descriptors) * + sizeof(struct pqi_sg_descriptor); + + put_unaligned_le64((u64)ctrl_info->pqi_ofa_mem_dma_handle, + &request.data.ofa_memory_allocation.buffer_address); + put_unaligned_le32(buffer_length, + &request.data.ofa_memory_allocation.buffer_length); + } + + return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, + 0, NULL); +} + +static int pqi_ofa_ctrl_restart(struct pqi_ctrl_info *ctrl_info, unsigned int delay_secs) +{ + ssleep(delay_secs); + + return pqi_ctrl_init_resume(ctrl_info); +} + +static void pqi_perform_lockup_action(void) +{ + switch (pqi_lockup_action) { + case PANIC: + panic("FATAL: Smart Family Controller lockup detected"); + break; + case REBOOT: + emergency_restart(); + break; + case NONE: + default: + break; + } +} + +static struct pqi_raid_error_info pqi_ctrl_offline_raid_error_info = { + .data_out_result = PQI_DATA_IN_OUT_HARDWARE_ERROR, + .status = SAM_STAT_CHECK_CONDITION, +}; + +static void pqi_fail_all_outstanding_requests(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct pqi_io_request *io_request; + struct scsi_cmnd *scmd; + struct scsi_device *sdev; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_read(&io_request->refcount) == 0) + continue; + + scmd = io_request->scmd; + if (scmd) { + sdev = scmd->device; + if (!sdev || !scsi_device_online(sdev)) { + pqi_free_io_request(io_request); + continue; + } else { + set_host_byte(scmd, DID_NO_CONNECT); + } + } else { + io_request->status = -ENXIO; + io_request->error_info = + &pqi_ctrl_offline_raid_error_info; + } + + io_request->io_complete_callback(io_request, + io_request->context); + } +} + +static void pqi_take_ctrl_offline_deferred(struct pqi_ctrl_info *ctrl_info) +{ + 
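+	/*
+	 * Deferred offline handling, run from ctrl_offline_work: perform the
+	 * configured lockup action, stop the heartbeat timer and free the
+	 * interrupts, cancel the rescan and update-time workers, wait for
+	 * in-flight processing to quiesce, then fail every outstanding
+	 * request before unblocking request submission so that waiters can
+	 * complete with an error.
+	 */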
pqi_perform_lockup_action(); + pqi_stop_heartbeat_timer(ctrl_info); + pqi_free_interrupts(ctrl_info); + pqi_cancel_rescan_worker(ctrl_info); + pqi_cancel_update_time_worker(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_fail_all_outstanding_requests(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); +} + +static void pqi_ctrl_offline_worker(struct work_struct *work) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = container_of(work, struct pqi_ctrl_info, ctrl_offline_work); + pqi_take_ctrl_offline_deferred(ctrl_info); +} + +static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason) +{ + if (!ctrl_info->controller_online) + return; + + ctrl_info->controller_online = false; + ctrl_info->pqi_mode_enabled = false; + pqi_ctrl_block_requests(ctrl_info); + if (!pqi_disable_ctrl_shutdown) + sis_shutdown_ctrl(ctrl_info, ctrl_shutdown_reason); + pci_disable_device(ctrl_info->pci_dev); + dev_err(&ctrl_info->pci_dev->dev, "controller offline\n"); + schedule_work(&ctrl_info->ctrl_offline_work); +} + +static void pqi_print_ctrl_info(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + char *ctrl_description; + + if (id->driver_data) + ctrl_description = (char *)id->driver_data; + else + ctrl_description = "Microchip Smart Family Controller"; + + dev_info(&pci_dev->dev, "%s found\n", ctrl_description); +} + +static int pqi_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + int rc; + int node, cp_node; + struct pqi_ctrl_info *ctrl_info; + + pqi_print_ctrl_info(pci_dev, id); + + if (pqi_disable_device_id_wildcards && + id->subvendor == PCI_ANY_ID && + id->subdevice == PCI_ANY_ID) { + dev_warn(&pci_dev->dev, + "controller not probed because device ID wildcards are disabled\n"); + return -ENODEV; + } + + if (id->subvendor == PCI_ANY_ID || id->subdevice == PCI_ANY_ID) + dev_warn(&pci_dev->dev, + "controller device ID matched using wildcards\n"); + + node = dev_to_node(&pci_dev->dev); + if (node == NUMA_NO_NODE) { + cp_node = cpu_to_node(0); + if (cp_node == NUMA_NO_NODE) + cp_node = 0; + set_dev_node(&pci_dev->dev, cp_node); + } + + ctrl_info = pqi_alloc_ctrl_info(node); + if (!ctrl_info) { + dev_err(&pci_dev->dev, + "failed to allocate controller info block\n"); + return -ENOMEM; + } + + ctrl_info->pci_dev = pci_dev; + + rc = pqi_pci_init(ctrl_info); + if (rc) + goto error; + + rc = pqi_ctrl_init(ctrl_info); + if (rc) + goto error; + + return 0; + +error: + pqi_remove_ctrl(ctrl_info); + + return rc; +} + +static void pqi_pci_remove(struct pci_dev *pci_dev) +{ + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = pci_get_drvdata(pci_dev); + if (!ctrl_info) + return; + + pqi_remove_ctrl(ctrl_info); +} + +static void pqi_dump_request(struct pqi_ctrl_info *ctrl_info, + struct pqi_io_request *io_request) +{ + struct scsi_cmnd *scmd; + + scmd = io_request->scmd; + if (scmd) { + struct Scsi_Host *shost; + struct pqi_scsi_dev *device; + + if (scmd->device == NULL || scmd->device->host == NULL || + scmd->device->hostdata == NULL) + return; + + shost = scmd->device->host; + device = scmd->device->hostdata; + + dev_warn(&ctrl_info->pci_dev->dev, + "%d:%d:%d:%d scsicmnd=[0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x] scmd=%p outstanding cmds = %d\n", + shost->host_no, device->bus, device->target, device->lun, + scmd->cmnd[0], scmd->cmnd[1], scmd->cmnd[2], scmd->cmnd[3], + scmd->cmnd[4], scmd->cmnd[5], scmd->cmnd[6], scmd->cmnd[7], + 
scmd->cmnd[8], scmd->cmnd[9], scmd->cmnd[10], scmd->cmnd[11], + scmd->cmnd[12], scmd->cmnd[13], scmd->cmnd[14], scmd->cmnd[15], + scmd, atomic_read(&device->scsi_cmds_outstanding)); + } else { + struct pqi_iu_header request_h; + size_t iu_length; + + memcpy(&request_h, io_request->iu, PQI_REQUEST_HEADER_LENGTH); + iu_length = get_unaligned_le16(&request_h.iu_length) + + PQI_REQUEST_HEADER_LENGTH; + + dev_warn(&ctrl_info->pci_dev->dev, + "sync cmd IU type = 0x%02x len = %u\n", + request_h.iu_type, request_h.iu_length); + } +} + +static void pqi_crash_if_pending_command(struct pqi_ctrl_info *ctrl_info) +{ + unsigned int i; + struct pqi_io_request *io_request; + bool pending = false; + + for (i = 0; i < ctrl_info->max_io_slots; i++) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_read(&io_request->refcount) == 0) + continue; + pqi_dump_request(ctrl_info, io_request); + pending = true; + } + BUG_ON(pending); +} + +static void pqi_shutdown(struct pci_dev *pci_dev) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + enum bmic_flush_cache_shutdown_event shutdown_event; + + ctrl_info = pci_get_drvdata(pci_dev); + if (!ctrl_info) { + dev_err(&pci_dev->dev, + "cache could not be flushed\n"); + return; + } + + pqi_wait_until_ofa_finished(ctrl_info); + + pqi_scsi_block_requests(ctrl_info); + pqi_ctrl_block_device_reset(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + + if (system_state == SYSTEM_RESTART) + shutdown_event = RESTART; + else + shutdown_event = SHUTDOWN; + + /* + * Write all data in the controller's battery-backed cache to + * storage. + */ + rc = pqi_flush_cache(ctrl_info, shutdown_event); + if (rc) + dev_err(&pci_dev->dev, + "unable to flush controller cache\n"); + + pqi_crash_if_pending_command(ctrl_info); + pqi_reset(ctrl_info); +} + +static void pqi_process_lockup_action_param(void) +{ + unsigned int i; + + if (!pqi_lockup_action_param) + return; + + for (i = 0; i < ARRAY_SIZE(pqi_lockup_actions); i++) { + if (strcmp(pqi_lockup_action_param, + pqi_lockup_actions[i].name) == 0) { + pqi_lockup_action = pqi_lockup_actions[i].action; + return; + } + } + + pr_warn("%s: invalid lockup action setting \"%s\" - supported settings: none, reboot, panic\n", + DRIVER_NAME_SHORT, pqi_lockup_action_param); +} + +static void pqi_process_module_params(void) +{ + pqi_process_lockup_action_param(); +} + +static inline enum bmic_flush_cache_shutdown_event pqi_get_flush_cache_shutdown_event(struct pci_dev *pci_dev) +{ + if (pci_dev->subsystem_vendor == PCI_VENDOR_ID_ADAPTEC2 && pci_dev->subsystem_device == 0x1304) + return RESTART; + return SUSPEND; +} + +#if defined(CONFIG_PM) + +static int pqi_suspend(struct pci_dev *pci_dev, pm_message_t state) +{ + struct pqi_ctrl_info *ctrl_info; + enum bmic_flush_cache_shutdown_event shutdown_event; + + shutdown_event = pqi_get_flush_cache_shutdown_event(pci_dev); + ctrl_info = pci_get_drvdata(pci_dev); + + pqi_wait_until_ofa_finished(ctrl_info); + + pqi_ctrl_block_scan(ctrl_info); + pqi_scsi_block_requests(ctrl_info); + pqi_ctrl_block_device_reset(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); + pqi_ctrl_wait_until_quiesced(ctrl_info); + pqi_flush_cache(ctrl_info, shutdown_event); + pqi_stop_heartbeat_timer(ctrl_info); + + pqi_crash_if_pending_command(ctrl_info); + + if (state.event == PM_EVENT_FREEZE) + return 0; + + pci_save_state(pci_dev); + pci_set_power_state(pci_dev, pci_choose_state(pci_dev, state)); + + ctrl_info->controller_online = false; + ctrl_info->pqi_mode_enabled = false; + + return 0; +} 
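The lockup_action handling above is a plain name-to-enum table lookup; an unrecognized value only logs a warning and leaves the previously configured action in place. A minimal user-space sketch of the same lookup (illustrative only, not part of the patch; parse_lockup_action() is a made-up name):

#include <stdio.h>
#include <string.h>

enum pqi_lockup_action { NONE, REBOOT, PANIC };

static const struct {
	enum pqi_lockup_action action;
	const char *name;
} lockup_actions[] = {
	{ NONE, "none" },
	{ REBOOT, "reboot" },
	{ PANIC, "panic" },
};

/* Return the matching action, or the current one if the name is unknown. */
static enum pqi_lockup_action parse_lockup_action(const char *param,
	enum pqi_lockup_action current)
{
	size_t i;

	for (i = 0; i < sizeof(lockup_actions) / sizeof(lockup_actions[0]); i++)
		if (strcmp(param, lockup_actions[i].name) == 0)
			return lockup_actions[i].action;

	fprintf(stderr, "invalid lockup action \"%s\"\n", param);
	return current;
}

int main(void)
{
	printf("%d\n", parse_lockup_action("reboot", NONE));	/* prints 1 */
	return 0;
}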
+ +static int pqi_resume(struct pci_dev *pci_dev) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = pci_get_drvdata(pci_dev); + + if (pci_dev->current_state != PCI_D0) { + ctrl_info->max_hw_queue_index = 0; + pqi_free_interrupts(ctrl_info); + pqi_change_irq_mode(ctrl_info, IRQ_MODE_INTX); + rc = request_irq(pqi_pci_irq_vector(pci_dev, 0), pqi_irq_handler, + IRQF_SHARED, DRIVER_NAME_SHORT, + pqi_get_irq_cookie(ctrl_info, 0)); + if (rc) { + dev_err(&ctrl_info->pci_dev->dev, + "irq %u init failed with error %d\n", + pci_dev->irq, rc); + return rc; + } + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + pqi_ctrl_unblock_scan(ctrl_info); + return 0; + } + + pci_set_power_state(pci_dev, PCI_D0); + pci_restore_state(pci_dev); + + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + pqi_ctrl_unblock_scan(ctrl_info); + + return pqi_ctrl_init_resume(ctrl_info); +} + +#endif /* CONFIG_PM */ + +/* Define the PCI IDs for the controllers that we support. */ +static const struct pci_device_id pqi_pci_id_table[] = { + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FOXCONN, 0x1211) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FOXCONN, 0x1321) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a22) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a23) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a24) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a36) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_QUANTA, 0x8a37) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1104) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1105) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1106) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1107) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1108) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x1109) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x8460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x8461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xc460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xc461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xf460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0xf461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0045) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0046) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0047) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0048) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x004c) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 
PCI_VENDOR_ID_INSPUR, 0x004f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0051) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0052) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0053) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0054) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd227) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd228) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd229) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd22a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd22b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HUAWEI, 0xd22c) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0110) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0608) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0800) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0801) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0802) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0803) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0804) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0805) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0806) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0807) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0808) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0809) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x080a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0900) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0901) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0902) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0903) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0904) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0905) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0906) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0907) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0908) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x090a) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1200) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1201) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1202) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1280) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1281) + }, + { + 
PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1282) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1300) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1301) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1302) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1303) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1304) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1380) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1400) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1402) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1410) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1411) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1412) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1420) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1430) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1440) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1441) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1450) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1452) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1460) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1461) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1462) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1470) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1471) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1472) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1473) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1480) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1490) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1491) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a1) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a2) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14b0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14b1) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c1) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14d0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14e0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14f0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 
PCI_VENDOR_ID_ADVANTECH, 0x8312) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_DELL, 0x1fe0) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0600) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0601) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0602) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0603) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0609) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0650) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0651) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0652) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0653) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0654) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0655) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0700) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x0701) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1001) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1002) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1100) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HP, 0x1101) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0294) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x02db) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x02dc) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x032e) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x036f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0800) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0908) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0806) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_FIBERHOME, 0x0916) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_GIGABYTE, 0x1000) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_NTCOM, 0x3161) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5445) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5446) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5447) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x5449) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544A) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544B) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544D) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544E) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x544F) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0b27) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0b29) + }, + { + 
PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0b45) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_ANY_ID, PCI_ANY_ID) + }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, pqi_pci_id_table); + +static struct pci_driver pqi_pci_driver = { + .name = DRIVER_NAME_SHORT, + .id_table = pqi_pci_id_table, + .probe = pqi_pci_probe, + .remove = pqi_pci_remove, + .shutdown = pqi_shutdown, +#if defined(CONFIG_PM) + .suspend = pqi_suspend, + .resume = pqi_resume, +#endif +}; + +static int __init pqi_init(void) +{ + int rc; + + pr_info(DRIVER_NAME "\n"); + + pqi_sas_transport_template = sas_attach_transport(&pqi_sas_transport_functions); + if (!pqi_sas_transport_template) + return -ENODEV; + + pqi_process_module_params(); + + rc = pci_register_driver(&pqi_pci_driver); + if (rc) + sas_release_transport(pqi_sas_transport_template); + + return rc; +} + +static void __exit pqi_cleanup(void) +{ + pci_unregister_driver(&pqi_pci_driver); + sas_release_transport(pqi_sas_transport_template); +} + +module_init(pqi_init); +module_exit(pqi_cleanup); + +static void __attribute__((unused)) verify_structures(void) +{ + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_host_to_ctrl_doorbell) != 0x20); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_interrupt_mask) != 0x34); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_ctrl_to_host_doorbell) != 0x9c); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_ctrl_to_host_doorbell_clear) != 0xa0); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_driver_scratch) != 0xb0); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_product_identifier) != 0xb4); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_firmware_status) != 0xbc); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_ctrl_shutdown_reason_code) != 0xcc); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + sis_mailbox) != 0x1000); + BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, + pqi_registers) != 0x4000); + + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + iu_type) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + iu_length) != 0x2); + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + response_queue_id) != 0x4); + BUILD_BUG_ON(offsetof(struct pqi_iu_header, + driver_flags) != 0x6); + BUILD_BUG_ON(sizeof(struct pqi_iu_header) != 0x8); + + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + status) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + service_response) != 0x1); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + data_present) != 0x2); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + reserved) != 0x3); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + residual_count) != 0x4); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + data_length) != 0x8); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + reserved1) != 0xa); + BUILD_BUG_ON(offsetof(struct pqi_aio_error_info, + data) != 0xc); + BUILD_BUG_ON(sizeof(struct pqi_aio_error_info) != 0x10c); + + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_in_result) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_out_result) != 0x1); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + reserved) != 0x2); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + status) != 0x5); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + status_qualifier) != 0x6); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + sense_data_length) != 0x8); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + response_data_length) != 0xa); + 
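+	/*
+	 * As with the checks above, the offset and size assertions that
+	 * follow pin the layout of the structures shared with the controller
+	 * at compile time; a mismatch fails the build rather than silently
+	 * corrupting host/controller communication.
+	 */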
BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_in_transferred) != 0xc); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data_out_transferred) != 0x10); + BUILD_BUG_ON(offsetof(struct pqi_raid_error_info, + data) != 0x14); + BUILD_BUG_ON(sizeof(struct pqi_raid_error_info) != 0x114); + + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + signature) != 0x0); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + function_and_status_code) != 0x8); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + max_admin_iq_elements) != 0x10); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + max_admin_oq_elements) != 0x11); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_element_length) != 0x12); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_element_length) != 0x13); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + max_reset_timeout) != 0x14); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + legacy_intx_status) != 0x18); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + legacy_intx_mask_set) != 0x1c); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + legacy_intx_mask_clear) != 0x20); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + device_status) != 0x40); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_pi_offset) != 0x48); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_ci_offset) != 0x50); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_element_array_addr) != 0x58); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_element_array_addr) != 0x60); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_ci_addr) != 0x68); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_pi_addr) != 0x70); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_iq_num_elements) != 0x78); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_oq_num_elements) != 0x79); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + admin_queue_int_msg_num) != 0x7a); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + device_error) != 0x80); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + error_details) != 0x88); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + device_reset) != 0x90); + BUILD_BUG_ON(offsetof(struct pqi_device_registers, + power_action) != 0x94); + BUILD_BUG_ON(sizeof(struct pqi_device_registers) != 0x100); + + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + function_code) != 10); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.report_device_capability.buffer_length) != 44); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.report_device_capability.sg_descriptor) != 48); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.queue_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.element_array_addr) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.ci_addr) != 24); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.num_elements) != 32); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + 
data.create_operational_iq.element_length) != 34); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_iq.queue_protocol) != 36); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.queue_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.element_array_addr) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.pi_addr) != 24); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.num_elements) != 32); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.element_length) != 34); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.queue_protocol) != 36); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.int_msg_num) != 40); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.coalescing_count) != 42); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.min_coalescing_time) != 44); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.create_operational_oq.max_coalescing_time) != 48); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_request, + data.delete_operational_queue.queue_id) != 12); + BUILD_BUG_ON(sizeof(struct pqi_general_admin_request) != 64); + BUILD_BUG_ON(FIELD_SIZEOF(struct pqi_general_admin_request, + data.create_operational_iq) != 64 - 11); + BUILD_BUG_ON(FIELD_SIZEOF(struct pqi_general_admin_request, + data.create_operational_oq) != 64 - 11); + BUILD_BUG_ON(FIELD_SIZEOF(struct pqi_general_admin_request, + data.delete_operational_queue) != 64 - 11); + + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + function_code) != 10); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + status) != 11); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_iq.status_descriptor) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_iq.iq_pi_offset) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_oq.status_descriptor) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_admin_response, + data.create_operational_oq.oq_ci_offset) != 16); + BUILD_BUG_ON(sizeof(struct pqi_general_admin_response) != 64); + + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.response_queue_id) != 4); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + nexus_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + buffer_length) != 12); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + lun_number) != 16); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + protocol_specific) != 24); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + 
error_index) != 27); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + cdb) != 32); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + timeout) != 60); + BUILD_BUG_ON(offsetof(struct pqi_raid_path_request, + sg_descriptors) != 64); + BUILD_BUG_ON(sizeof(struct pqi_raid_path_request) != + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.response_queue_id) != 4); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + header.driver_flags) != 6); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + nexus_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + buffer_length) != 16); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + data_encryption_key_index) != 22); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + encrypt_tweak_lower) != 24); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + encrypt_tweak_upper) != 28); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + cdb) != 32); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + error_index) != 48); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + num_sg_descriptors) != 50); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + cdb_length) != 51); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + lun_number) != 52); + BUILD_BUG_ON(offsetof(struct pqi_aio_path_request, + sg_descriptors) != 64); + BUILD_BUG_ON(sizeof(struct pqi_aio_path_request) != + PQI_OPERATIONAL_IQ_ELEMENT_LENGTH); + + BUILD_BUG_ON(offsetof(struct pqi_io_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_io_response, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_io_response, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_io_response, + error_index) != 10); + + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + header.response_queue_id) != 4); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.report_event_configuration.buffer_length) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.report_event_configuration.sg_descriptors) != 16); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.set_event_configuration.global_event_oq_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.set_event_configuration.buffer_length) != 12); + BUILD_BUG_ON(offsetof(struct pqi_general_management_request, + data.set_event_configuration.sg_descriptors) != 16); + + BUILD_BUG_ON(offsetof(struct pqi_iu_layer_descriptor, + max_inbound_iu_length) != 6); + BUILD_BUG_ON(offsetof(struct pqi_iu_layer_descriptor, + max_outbound_iu_length) != 14); + BUILD_BUG_ON(sizeof(struct pqi_iu_layer_descriptor) != 16); + + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + data_length) != 0); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + iq_arbitration_priority_support_bitmask) != 8); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + maximum_aw_a) != 9); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + maximum_aw_b) != 10); + 
BUILD_BUG_ON(offsetof(struct pqi_device_capability, + maximum_aw_c) != 11); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_inbound_queues) != 16); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_elements_per_iq) != 18); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_iq_element_length) != 24); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + min_iq_element_length) != 26); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_outbound_queues) != 30); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_elements_per_oq) != 32); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + intr_coalescing_time_granularity) != 34); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + max_oq_element_length) != 36); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + min_oq_element_length) != 38); + BUILD_BUG_ON(offsetof(struct pqi_device_capability, + iu_layer_descriptors) != 64); + BUILD_BUG_ON(sizeof(struct pqi_device_capability) != 576); + + BUILD_BUG_ON(offsetof(struct pqi_event_descriptor, + event_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_event_descriptor, + oq_id) != 2); + BUILD_BUG_ON(sizeof(struct pqi_event_descriptor) != 4); + + BUILD_BUG_ON(offsetof(struct pqi_event_config, + num_event_descriptors) != 2); + BUILD_BUG_ON(offsetof(struct pqi_event_config, + descriptors) != 4); + + BUILD_BUG_ON(PQI_NUM_SUPPORTED_EVENTS != + ARRAY_SIZE(pqi_supported_event_types)); + + BUILD_BUG_ON(offsetof(struct pqi_event_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + event_type) != 8); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + event_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + additional_event_id) != 12); + BUILD_BUG_ON(offsetof(struct pqi_event_response, + data) != 16); + BUILD_BUG_ON(sizeof(struct pqi_event_response) != 32); + + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + event_type) != 8); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + event_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_event_acknowledge_request, + additional_event_id) != 12); + BUILD_BUG_ON(sizeof(struct pqi_event_acknowledge_request) != 16); + + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + nexus_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + timeout) != 14); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + lun_number) != 16); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + protocol_specific) != 24); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + outbound_queue_id_to_manage) != 26); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + request_id_to_manage) != 28); + BUILD_BUG_ON(offsetof(struct pqi_task_management_request, + task_management_function) != 30); + BUILD_BUG_ON(sizeof(struct pqi_task_management_request) != 32); + + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + header.iu_type) != 0); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + 
header.iu_length) != 2); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + request_id) != 8); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + nexus_id) != 10); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + additional_response_info) != 12); + BUILD_BUG_ON(offsetof(struct pqi_task_management_response, + response_code) != 15); + BUILD_BUG_ON(sizeof(struct pqi_task_management_response) != 16); + + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + configured_logical_drive_count) != 0); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + configuration_signature) != 1); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + firmware_version_short) != 5); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + extended_logical_unit_count) != 154); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + firmware_build_number) != 190); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + vendor_id) != 200); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + product_id) != 208); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + extra_controller_flags) != 286); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + controller_mode) != 292); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + spare_part_number) != 293); + BUILD_BUG_ON(offsetof(struct bmic_identify_controller, + firmware_version_long) != 325); + + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + phys_bay_in_box) != 115); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + device_type) != 120); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + redundant_path_present_map) != 1736); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + active_path_number) != 1738); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + alternate_paths_phys_connector) != 1739); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + alternate_paths_phys_box_on_port) != 1755); + BUILD_BUG_ON(offsetof(struct bmic_identify_physical_device, + current_queue_depth_limit) != 1796); + BUILD_BUG_ON(sizeof(struct bmic_identify_physical_device) != 2560); + + BUILD_BUG_ON(sizeof(struct bmic_sense_feature_buffer_header) != 4); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_buffer_header, + page_code) != 0); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_buffer_header, + subpage_code) != 1); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_buffer_header, + buffer_length) != 2); + + BUILD_BUG_ON(sizeof(struct bmic_sense_feature_page_header) != 4); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_page_header, + page_code) != 0); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_page_header, + subpage_code) != 1); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_page_header, + page_length) != 2); + + BUILD_BUG_ON(sizeof(struct bmic_sense_feature_io_page_aio_subpage) + != 18); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + header) != 0); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + firmware_read_support) != 4); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + driver_read_support) != 5); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + firmware_write_support) != 6); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + driver_write_support) != 7); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_transfer_encrypted_sas_sata) != 8); + 
BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_transfer_encrypted_nvme) != 10); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_write_raid_5_6) != 12); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_write_raid_1_10_2drive) != 14); + BUILD_BUG_ON(offsetof(struct bmic_sense_feature_io_page_aio_subpage, + max_write_raid_1_10_3drive) != 16); + + BUILD_BUG_ON(PQI_ADMIN_IQ_NUM_ELEMENTS > 255); + BUILD_BUG_ON(PQI_ADMIN_OQ_NUM_ELEMENTS > 255); + BUILD_BUG_ON(PQI_ADMIN_IQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + BUILD_BUG_ON(PQI_ADMIN_OQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + BUILD_BUG_ON(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH > 1048560); + BUILD_BUG_ON(PQI_OPERATIONAL_IQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + BUILD_BUG_ON(PQI_OPERATIONAL_OQ_ELEMENT_LENGTH > 1048560); + BUILD_BUG_ON(PQI_OPERATIONAL_OQ_ELEMENT_LENGTH % + PQI_QUEUE_ELEMENT_LENGTH_ALIGNMENT != 0); + + BUILD_BUG_ON(PQI_RESERVED_IO_SLOTS >= PQI_MAX_OUTSTANDING_REQUESTS); + BUILD_BUG_ON(PQI_RESERVED_IO_SLOTS >= + PQI_MAX_OUTSTANDING_REQUESTS_KDUMP); +} diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c new file mode 100644 index 0000000000000..c0c598f99c4c6 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c @@ -0,0 +1,391 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_kernel_compat.h" +#if KFEATURE_ENABLE_SCSI_MAP_QUEUES +#include +#endif +extern struct device_attribute *pqi_ncq_prio_sdev_attrs; + +#if !KFEATURE_HAS_2011_03_QUEUECOMMAND + +int pqi_scsi_queue_command_compat(struct scsi_cmnd *scmd, + void (*done)(struct scsi_cmnd *)) +{ + scmd->SCp.ptr = (char *)done; + + return pqi_scsi_queue_command(scmd->device->host, scmd); +} + +#endif /* !KFEATURE_HAS_2011_03_QUEUECOMMAND */ + +#if !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE + +int pci_enable_msix_range(struct pci_dev *pci_dev, struct msix_entry *entries, + int minvec, int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(pci_dev, entries, nvec); + if (rc < 0) + return rc; + if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} + +#endif /* !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE */ + +#if !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH + +int scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth) +{ + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), queue_depth); + + return queue_depth; +} + +static int pqi_change_queue_depth(struct scsi_device *sdev, int qdepth, + int reason) +{ + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + struct pqi_scsi_dev *device = sdev->hostdata; + + if (!device) + return -ENODEV; + + if (qdepth < 1) + qdepth = 1; + else if (qdepth > device->queue_depth) + qdepth = device->queue_depth; + + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -ENOTSUPP; + + return sdev->queue_depth; +} + +static int pqi_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else { + tag_type = 0; + } + + return tag_type; +} + +#endif /* !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH */ + +#if KFEATURE_ENABLE_SCSI_MAP_QUEUES +static int pqi_map_queues(struct Scsi_Host *shost) +{ + struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + +#if KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev); +#elif KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); +#elif KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + ctrl_info->pci_dev, 0); +#else + #error "A version for KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES has not been defined." 
+#endif +} +#endif /* KFEATURE_ENABLE_SCSI_MAP_QUEUES */ + +void pqi_compat_init_scsi_host_template(struct scsi_host_template *hostt) +{ +#if !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH + hostt->change_queue_depth = pqi_change_queue_depth; + hostt->change_queue_type = pqi_change_queue_type; +#endif /* !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH */ +#if KFEATURE_HAS_LOCKLESS_DISPATCH_IO + hostt->lockless = 1; +#endif +#if KFEATURE_HAS_USE_CLUSTERING + hostt->use_clustering = ENABLE_CLUSTERING; +#endif +#if KFEATURE_ENABLE_SCSI_MAP_QUEUES + hostt->map_queues = pqi_map_queues; +#endif +#if KFEATURE_HAS_NCQ_PRIO_SUPPORT + hostt->sdev_attrs = &pqi_ncq_prio_sdev_attrs; +#endif +} + +void pqi_compat_init_scsi_host(struct Scsi_Host *shost, + struct pqi_ctrl_info *ctrl_info) +{ +#if KFEATURE_HAS_MQ_SUPPORT + shost->nr_hw_queues = ctrl_info->num_queue_groups; +#endif /* KFEATURE_HAS_MQ_SUPPORT */ +} + +#if !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING + +void scsi_sanitize_inquiry_string(unsigned char *s, int len) +{ + bool terminated = false; + + for (; len > 0; (--len, ++s)) { + if (*s == 0) + terminated = true; + if (terminated || *s < 0x20 || *s > 0x7e) + *s = ' '; + } +} + +#endif /* !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING */ + +#if !KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT + +#if defined(RHEL6U3) +/* + * Note that these accessor functions are only for the "PCI Express + * Capability" (see PCIe spec r3.0, sec 7.8). They do not apply to the + * other "PCI Express Extended Capabilities" (AER, VC, ACS, MFVC, etc.) + */ +int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val) +{ + int ret; + + *val = 0; + if (pos & 1) + return -EINVAL; + + ret = pci_read_config_word(dev, pci_pcie_cap(dev) + pos, val); + /* + * Reset *val to 0 if pci_read_config_word() fails, it may + * have been written as 0xFFFF if hardware error happens + * during pci_read_config_word(). 
+ */ + if (ret) + *val = 0; + return ret; +} + +int pcie_capability_write_word(struct pci_dev *dev, int pos, u16 val) +{ + if (pos & 1) + return -EINVAL; + + return pci_write_config_word(dev, pci_pcie_cap(dev) + pos, val); +} + +#endif /* RHEL6U3 */ + +int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos, + u16 clear, u16 set) +{ + int ret; + u16 val; + + ret = pcie_capability_read_word(dev, pos, &val); + if (!ret) { + val &= ~clear; + val |= set; + ret = pcie_capability_write_word(dev, pos, val); + } + + return ret; +} + +#endif + +#if !KFEATURE_HAS_BSG_JOB_SMP_HANDLER + +static int pqi_bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + if (!req->nr_phys_segments) { + WARN_ON(!req->nr_phys_segments); + return -EINVAL; + } + + buf->sg_list = kzalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +static int pqi_bsg_prepare_job(struct bsg_job *job, struct request *rq) +{ + struct request *rsp = rq->next_rq; + int ret; +#if KFEATURE_HAS_SCSI_REQUEST + struct scsi_request *req = scsi_req(rq); +#else + struct request *req = rq; +#endif + + job->request = req->cmd; + job->request_len = req->cmd_len; + job->reply = req->sense; + + if (rq->bio) { + ret = pqi_bsg_map_buffer(&job->request_payload, rq); + if (ret) + goto failjob_rls_job; + } + + if (rsp && rsp->bio) { + ret = pqi_bsg_map_buffer(&job->reply_payload, rsp); + if (ret) + goto failjob_rls_rqst_payload; + } + + return 0; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + return -ENOMEM; +} + +struct bsg_return_data { + int result; + unsigned int reply_payload_rcv_len; +}; +static struct bsg_return_data bsg_ret; + +void pqi_bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + bsg_ret.result = result; + bsg_ret.reply_payload_rcv_len = reply_payload_rcv_len; + complete(job->dd_data); +} + +int pqi_sas_smp_handler_compat(struct Scsi_Host *shost, struct sas_rphy *rphy, + struct request *rq) +{ + struct bsg_job *job; + struct completion bsg_job; +#if KFEATURE_HAS_SCSI_REQUEST + struct scsi_request *req = scsi_req(rq); + struct scsi_request *resp = scsi_req(rq->next_rq); +#else + struct request *req = rq; + struct request *resp = req->next_rq; +#endif + + init_completion(&bsg_job); + job = kzalloc(sizeof(struct bsg_job), GFP_KERNEL); + if (!job) + return -ENOMEM; + job->dd_data = &bsg_job; + + pqi_bsg_prepare_job(job, rq); + pqi_sas_smp_handler(job, shost, rphy); + + wait_for_completion(&bsg_job); + + req->sense_len = job->reply_len; + memcpy(req->sense, job->reply, job->reply_len); + + resp->resid_len -= min(bsg_ret.reply_payload_rcv_len, resp->resid_len); + req->resid_len = 0; + + kfree(job); + return bsg_ret.result; +} + +#endif /* !KFEATURE_HAS_BSG_JOB_SMP_HANDLER */ + +int pqi_pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + return pci_irq_vector(dev, nr); +#else + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = pci_get_drvdata(dev); + if (ctrl_info->irq_mode == IRQ_MODE_INTX) + return dev->irq; + else + return ctrl_info->msix_vectors[nr]; +#endif +} + +void pqi_pci_free_irq_vectors(struct pci_dev *dev) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + pci_free_irq_vectors(dev); +#else + pci_disable_msix(dev); +#endif +} + +int 
pqi_pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + return pci_alloc_irq_vectors(dev, min_vecs, max_vecs, flags); +#else + unsigned int i; + int num_vectors_enabled; + struct pqi_ctrl_info *ctrl_info; + struct msix_entry msix_entries[PQI_MAX_MSIX_VECTORS]; + + ctrl_info = pci_get_drvdata(dev); + + for (i = 0; i < max_vecs; i++) + msix_entries[i].entry = i; + + num_vectors_enabled = pci_enable_msix_range(dev, msix_entries, min_vecs, + max_vecs); + + for (i = 0; i < num_vectors_enabled; i++) { + ctrl_info->msix_vectors[i] = msix_entries[i].vector; + ctrl_info->intr_data[i] = &ctrl_info->queue_groups[i]; + } + + return num_vectors_enabled; +#endif +} diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h new file mode 100644 index 0000000000000..4ae705d86ba74 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h @@ -0,0 +1,674 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#if !defined(_SMARTPQI_KERNEL_COMPAT_H) +#define _SMARTPQI_KERNEL_COMPAT_H + +/* #define RHEL6 */ +/* #define RHEL7 */ +/* default is kernel.org */ + +/* ----- RHEL6 variants --------- */ +#if \ + defined(RHEL6U0) || \ + defined(RHEL6U1) || \ + defined(RHEL6U2) || \ + defined(RHEL6U3) || \ + defined(RHEL6U4) || \ + defined(RHEL6U5) || \ + defined(RHEL6U6) || \ + defined(RHEL6U7) || \ + defined(RHEL6U8) || \ + defined(RHEL6U9) || \ + defined(RHEL6U10) +#define RHEL6 +#endif + +/* ----- RHEL7 variants --------- */ +#if \ + defined(RHEL7U0) || \ + defined(RHEL7U1) || \ + defined(RHEL7U2) || \ + defined(RHEL7U3) || \ + defined(RHEL7U4) || \ + defined(RHEL7U4ARM) || \ + defined(RHEL7U5) || \ + defined(RHEL7U5ARM) || \ + defined(RHEL7U6) || \ + defined(RHEL7U7) || \ + defined(RHEL7U8) || \ + defined(RHEL7U9) +#define RHEL7 +#endif + +/* ----- RHEL8 variants --------- */ +#if \ + defined(RHEL8U0) || \ + defined(RHEL8U1) || \ + defined(RHEL8U2) || \ + defined(RHEL8U3) || \ + defined(RHEL8U4) || \ + defined(RHEL8U5) || \ + defined(RHEL8U6) || \ + defined(RHEL8U7) +#define RHEL8 +#endif + +/* ----- SLES11 variants --------- */ +#if \ + defined(SLES11SP0) || \ + defined(SLES11SP1) || \ + defined(SLES11SP2) || \ + defined(SLES11SP3) || \ + defined(SLES11SP4) +#define SLES11 +#endif + +/* ----- SLES12 variants --------- */ +#if \ + defined(SLES12SP0) || \ + defined(SLES12SP1) || \ + defined(SLES12SP2) || \ + defined(SLES12SP3) || \ + defined(SLES12SP4) || \ + defined(SLES12SP5) +#define SLES12 +#endif + +/* ----- SLES15 variants --------- */ +#if \ + defined(SLES15SP0) || \ + defined(SLES15SP1) || \ + defined(SLES15SP2) || \ + defined(SLES15SP3) || \ + defined(SLES15SP4) +#define SLES15 +#endif + 
+#include +#include +#include +#include + +#if defined(MSG_SIMPLE_TAG) +#define KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH 0 +#if !defined(RHEL7U3) +#define KFEATURE_HAS_MQ_SUPPORT 0 +#endif +#endif + +#if defined(CENTOS7ALTARM) +#define KFEATURE_HAS_MQ_SUPPORT 0 +#endif + +#if defined(XEN7) +#define KCLASS4A +#endif + +#if !defined(PCI_EXP_DEVCTL2_COMP_TIMEOUT) +#define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f +#if TORTUGA +#define KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT 1 +#else +#define KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT 0 +#endif +#endif + +#if defined(RHEL6) +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 +#define KFEATURE_HAS_2011_03_QUEUECOMMAND 0 +#define KFEATURE_HAS_NO_WRITE_SAME 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#if defined(RHEL6U3) || defined(RHEL6U4) || defined(RHEL6U5) +#if defined(RHEL6U3) +#define KFEATURE_HAS_DMA_ZALLOC_COHERENT 0 +#endif +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#endif +#if !defined(RHEL6U0) && !defined(RHEL6U1) +#define KFEATURE_HAS_LOCKLESS_DISPATCH_IO 1 +#endif +#if defined(RHEL6U5) +#define KFEATURE_HAS_DMA_MASK_AND_COHERENT 0 +#endif +#elif defined(RHEL7) +#if defined(RHEL7U0) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(RHEL7U1) +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(RHEL7U4ARM) || defined(RHEL7U5ARM) +#endif +#elif defined(RHEL8) +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define shost_use_blk_mq(x) 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 +#elif defined(SLES11) +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 +#define KFEATURE_HAS_NO_WRITE_SAME 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#if defined(SLES11SP0) || defined(SLES11SP1) +#define KFEATURE_HAS_2011_03_QUEUECOMMAND 0 +#endif +#if defined(SLES11SP3) +#define KFEATURE_HAS_DMA_ZALLOC_COHERENT 0 +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#endif +#elif defined(SLES12) +#if defined(SLES12SP2) || defined(SLES12SP3) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#endif +#if defined(SLES12SP0) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(SLES12SP1) +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#elif defined(SLES15) +#define KFEATURE_HAS_SCSI_REQUEST 1 +#define KFEATURE_HAS_KTIME_SECONDS 1 +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#if defined(SLES15SP0) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 1 +#elif defined(SLES15SP1) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 +#else +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 +#endif +#elif defined(UBUNTU1404) || TORTUGA || defined(KCLASS3C) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#elif defined(OL7U2) || defined(KCLASS3B) +#define KFEATURE_HAS_DMA_MASK_AND_COHERENT 0 +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 +#endif +#if defined(KCLASS4A) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#endif +#if defined(KCLASS4B) || defined(KCLASS4C) || defined(SLES12SP4) || \ + defined(SLES12SP5) || defined(RHEL8) || defined(KCLASS5A) || \ + defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(SLES15SP2) || defined(SLES15SP3) || defined (CENTOS7ALTARM) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#define KFEATURE_HAS_SCSI_REQUEST 1 +#define KFEATURE_HAS_KTIME64 1 +#endif +#if defined(KCLASS4C) || defined(RHEL8) || defined(SLES15SP1) || 
\ + defined(SLES15SP2) || defined(SLES15SP3) || defined(KCLASS5A) || \ + defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(SLES12SP5) || defined (CENTOS7ALTARM) +#define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 +#endif +#if defined(RHEL8U3) || defined(RHEL8U4) || defined(RHEL8U5) +#define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 +#endif + +#if defined(KCLASS3D) +#define KFEATURE_HAS_KTIME_SECONDS 1 +#endif +#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) +#define dma_zalloc_coherent dma_alloc_coherent +#define shost_use_blk_mq(x) 1 +#define KFEATURE_HAS_USE_CLUSTERING 0 +#endif + +#if defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) +#define IOCTL_INT unsigned int +#else +#define IOCTL_INT int +#endif + +#if defined(KCLASS5C) || defined(KCLASS5D) +#define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 +#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) +#define ioremap_nocache ioremap +#endif + +#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ + defined(KCLASS4C) || defined(KCLASS4D) || defined(RHEL8) || defined(SLES15) +#define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 +#endif + +#define KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING 0 + +#if !defined(from_timer) +#define KFEATURE_HAS_OLD_TIMER 1 +#endif + +/* default values */ +#if !defined(KFEATURE_HAS_WAIT_FOR_COMPLETION_IO) +#define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 1 +#endif +#if !defined(KFEATURE_HAS_2011_03_QUEUECOMMAND) +#define KFEATURE_HAS_2011_03_QUEUECOMMAND 1 +#endif +#if !defined(KFEATURE_HAS_DMA_ZALLOC_COHERENT) +#define KFEATURE_HAS_DMA_ZALLOC_COHERENT 1 +#endif +#if !defined(KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE) +#define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 1 +#endif +#if !defined(KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH) +#define KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH 1 +#endif +#if !defined(KFEATURE_HAS_MQ_SUPPORT) +#define KFEATURE_HAS_MQ_SUPPORT 1 +#endif +#if !defined(KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING) +#define KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING 1 +#endif +#if !defined(KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT) +#define KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT 1 +#endif +#if !defined(KFEATURE_HAS_NO_WRITE_SAME) +#define KFEATURE_HAS_NO_WRITE_SAME 1 +#endif +#if !defined(KFEATURE_HAS_BSG_JOB_SMP_HANDLER) +#define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 0 +#endif +#if !defined(KFEATURE_HAS_HOST_BUSY_FUNCTION) +#define KFEATURE_HAS_HOST_BUSY_FUNCTION 0 +#endif +#if !defined(KFEATURE_HAS_SCSI_REQUEST) +#define KFEATURE_HAS_SCSI_REQUEST 0 +#endif +#if !defined(KFEATURE_HAS_LOCKLESS_DISPATCH_IO) +#define KFEATURE_HAS_LOCKLESS_DISPATCH_IO 0 +#endif +#if !defined(KFEATURE_HAS_USE_CLUSTERING) +#define KFEATURE_HAS_USE_CLUSTERING 1 +#define IOCTL_INT int +#endif +#if !defined(KFEATURE_HAS_OLD_TIMER) +#define KFEATURE_HAS_OLD_TIMER 0 +#endif +#if !defined(KFEATURE_HAS_KTIME_SECONDS) +#define KFEATURE_HAS_KTIME_SECONDS 0 +#endif +#if !defined(KFEATURE_HAS_KTIME64) +#define KFEATURE_HAS_KTIME64 0 +#endif +#if !defined(KFEATURE_HAS_DMA_MASK_AND_COHERENT) +#define KFEATURE_HAS_DMA_MASK_AND_COHERENT 1 +#endif +#if !defined(KFEATURE_HAS_ATOMIC_HOST_BUSY) +#define KFEATURE_HAS_ATOMIC_HOST_BUSY 1 +#endif +#if !defined(KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS) +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 0 +#endif +#if !defined(KFEATURE_ENABLE_SCSI_MAP_QUEUES) +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 0 +#endif +#if 
!defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3) +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 0 +#endif +#if !defined(KFEATURE_HAS_NCQ_PRIO_SUPPORT) +#define KFEATURE_HAS_NCQ_PRIO_SUPPORT 0 +#endif +#if !defined(list_next_entry) +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#endif + +#if !defined(list_first_entry_or_null) +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#endif + +#if !defined(TYPE_ZBC) +#define TYPE_ZBC 0x14 +#endif + +#if !defined(readq) +#define readq readq +static inline u64 readq(const volatile void __iomem *addr) +{ + u32 lower32; + u32 upper32; + + lower32 = readl(addr); + upper32 = readl(addr + 4); + + return ((u64)upper32 << 32) | lower32; +} +#endif + +#if !defined(writeq) +#define writeq writeq +static inline void writeq(u64 value, volatile void __iomem *addr) +{ + u32 lower32; + u32 upper32; + + lower32 = lower_32_bits(value); + upper32 = upper_32_bits(value); + + writel(lower32, addr); + writel(upper32, addr + 4); +} +#endif + +static inline void pqi_disable_write_same(struct scsi_device *sdev) +{ +#if KFEATURE_HAS_NO_WRITE_SAME + sdev->no_write_same = 1; +#endif +} + +#if !defined(PCI_DEVICE_SUB) +#define PCI_DEVICE_SUB(vend, dev, subvend, subdev) \ + .vendor = (vend), .device = (dev), \ + .subvendor = (subvend), .subdevice = (subdev) +#endif + +#if !defined(PCI_VENDOR_ID_HPE) +#define PCI_VENDOR_ID_HPE 0x1590 +#endif + +#if !defined(PCI_VENDOR_ID_ADVANTECH) +#define PCI_VENDOR_ID_ADVANTECH 0x13fe +#endif + +#if !defined(PCI_VENDOR_ID_FIBERHOME) +#define PCI_VENDOR_ID_FIBERHOME 0x1d8d +#endif + +#if !defined(PCI_VENDOR_ID_GIGABYTE) +#define PCI_VENDOR_ID_GIGABYTE 0x1458 +#endif + +#if !defined(PCI_VENDOR_ID_FOXCONN) +#define PCI_VENDOR_ID_FOXCONN 0x105b +#endif + +#if !defined(PCI_VENDOR_ID_HUAWEI) +#define PCI_VENDOR_ID_HUAWEI 0x19e5 +#endif + +#if !defined(PCI_VENDOR_ID_H3C) +#define PCI_VENDOR_ID_H3C 0x193d +#endif + +#if !defined(PCI_VENDOR_ID_QUANTA) +#define PCI_VENDOR_ID_QUANTA 0x152d +#endif + +#if !defined(PCI_VENDOR_ID_INSPUR) +#define PCI_VENDOR_ID_INSPUR 0x1bd4 +#endif + +#if !defined(PCI_VENDOR_ID_NTCOM) +#define PCI_VENDOR_ID_NTCOM 0x1dfc +#endif + +#if !defined(PCI_VENDOR_ID_ZTE) +#define PCI_VENDOR_ID_ZTE 0x1cf2 +#endif + +#if !defined(offsetofend) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif + +void pqi_compat_init_scsi_host_template(struct scsi_host_template *template); +void pqi_compat_init_scsi_host(struct Scsi_Host *shost, + struct pqi_ctrl_info *ctrl_info); + +#if !KFEATURE_HAS_WAIT_FOR_COMPLETION_IO + +static inline unsigned long wait_for_completion_io_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_completion_timeout(x, timeout); +} + +static inline unsigned long wait_for_completion_io(struct completion *x) +{ + wait_for_completion(x); + return 0; +} + +#endif /* !KFEATURE_HAS_WAIT_FOR_COMPLETION_IO */ + +#if KFEATURE_HAS_2011_03_QUEUECOMMAND + +#define PQI_SCSI_QUEUE_COMMAND pqi_scsi_queue_command + +static inline void pqi_scsi_done(struct scsi_cmnd *scmd) +{ + pqi_prep_for_scsi_done(scmd); + if (scmd && scmd->scsi_done) + scmd->scsi_done(scmd); +} + +#else + +int pqi_scsi_queue_command_compat(struct scsi_cmnd 
*scmd, + void (*done)(struct scsi_cmnd *)); + +#define PQI_SCSI_QUEUE_COMMAND pqi_scsi_queue_command_compat + +static inline void pqi_scsi_done(struct scsi_cmnd *scmd) +{ + void (*scsi_done)(struct scsi_cmnd *); + + pqi_prep_for_scsi_done(scmd); + if (scmd) { + scsi_done = (void(*)(struct scsi_cmnd *))scmd->SCp.ptr; + scsi_done(scmd); + } +} + +#endif /* KFEATURE_HAS_2011_03_QUEUECOMMAND */ + +#if !KFEATURE_HAS_DMA_ZALLOC_COHERENT + +static inline void *dma_zalloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = dma_alloc_coherent(dev, size, dma_handle, + flag | __GFP_ZERO); + return ret; +} + +#endif /* !KFEATURE_HAS_DMA_ZALLOC_COHERENT */ + +#if !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE + +int pci_enable_msix_range(struct pci_dev *pci_dev, struct msix_entry *entries, + int minvec, int maxvec); + +#endif /* !KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE */ + +#if !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH + +int scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); + +#endif /* !KFEATURE_HAS_SCSI_CHANGE_QUEUE_DEPTH */ + +#if !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING + +void scsi_sanitize_inquiry_string(unsigned char *s, int len); + +#endif /* !KFEATURE_HAS_SCSI_SANITIZE_INQUIRY_STRING */ + +#if !KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT + +#define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ + +int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos, + u16 clear, u16 set); + +#endif /* !KFEATURE_HAS_PCIE_CAPABILITY_SUPPORT */ + +static inline u16 pqi_get_hw_queue(struct pqi_ctrl_info *ctrl_info, + struct scsi_cmnd *scmd) +{ + u16 hw_queue; + +#if KFEATURE_HAS_MQ_SUPPORT + if (shost_use_blk_mq(scmd->device->host)) + hw_queue = blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(scmd->request)); + else + hw_queue = smp_processor_id(); +#else + hw_queue = smp_processor_id(); +#endif + if (hw_queue > ctrl_info->max_hw_queue_index) + hw_queue = 0; + + return hw_queue; +} + +#ifdef KFEATURE_NEEDS_BLK_RQ_IS_PASSTHROUGH + +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return rq->cmd_type != REQ_TYPE_FS; +} + +#endif /* KFEATURE_NEEDS_BLK_RQ_IS_PASSTHROUGH */ + +#if !KFEATURE_HAS_BSG_JOB_SMP_HANDLER + +int pqi_sas_smp_handler_compat(struct Scsi_Host *shost, struct sas_rphy *rphy, + struct request *req); + +void pqi_bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len); + +#define PQI_SAS_SMP_HANDLER pqi_sas_smp_handler_compat + +#else + +#define PQI_SAS_SMP_HANDLER pqi_sas_smp_handler + +static inline void pqi_bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + bsg_job_done(job, result, reply_payload_rcv_len); +} + +#endif /* !KFEATURE_HAS_BSG_JOB_SMP_HANDLER */ + +#if KFEATURE_HAS_OLD_TIMER +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + +#if !defined(TIMER_DATA_TYPE) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) +#endif + +static inline void timer_setup (struct timer_list *timer, + void (*func) (struct timer_list *), unsigned long data) +{ + init_timer(timer); + timer->function = (TIMER_FUNC_TYPE) func; + timer->data = (unsigned long) timer; +} +#endif /* KFEATURE_HAS_OLD_TIMER */ + +#if !KFEATURE_HAS_KTIME64 +#define time64_to_tm time_to_tm +#endif + +#if !KFEATURE_HAS_KTIME_SECONDS +static inline unsigned long ktime_get_real_seconds(void) +{ + ktime_t tv; + struct timeval time; + + tv = ktime_get_real(); + time = ktime_to_timeval(tv); + + return 
time.tv_sec; +} +#endif + +#if !KFEATURE_HAS_DMA_MASK_AND_COHERENT + +static inline int pqi_dma_set_mask_and_coherent(struct device *device, u64 mask) +{ + return dma_set_mask(device, mask); +} + +#else + +static inline int pqi_dma_set_mask_and_coherent(struct device *device, u64 mask) +{ + return dma_set_mask_and_coherent(device, mask); +} + +#endif /* !KFEATURE_HAS_DMA_MASK_AND_COHERENT */ + +static inline bool pqi_scsi_host_busy(struct Scsi_Host *shost) +{ +#if KFEATURE_HAS_HOST_BUSY_FUNCTION + return scsi_host_busy(shost); +#else +#if KFEATURE_HAS_ATOMIC_HOST_BUSY + return atomic_read(&shost->host_busy) > 0; +#else + return shost->host_busy > 0; +#endif +#endif +} + +#if !KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS +#if !defined(PCI_IRQ_MSIX) +#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ +#endif +#if !defined(PCI_IRQ_AFFINITY) +#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ +#endif +#endif + +int pqi_pci_irq_vector(struct pci_dev *dev, unsigned int nr); +int pqi_pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); +void pqi_pci_free_irq_vectors(struct pci_dev *dev); + +static inline void *pqi_get_irq_cookie(struct pqi_ctrl_info *ctrl_info, unsigned int nr) +{ +#if KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS + return &ctrl_info->queue_groups[nr]; +#else + return ctrl_info->intr_data[nr]; +#endif +} + +#endif /* _SMARTPQI_KERNEL_COMPAT_H */ diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c b/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c new file mode 100644 index 0000000000000..17bee4f5ccdd7 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c @@ -0,0 +1,585 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_kernel_compat.h" + +static struct pqi_sas_phy *pqi_alloc_sas_phy(struct pqi_sas_port *pqi_sas_port) +{ + struct pqi_sas_phy *pqi_sas_phy; + struct sas_phy *phy; + + pqi_sas_phy = kzalloc(sizeof(*pqi_sas_phy), GFP_KERNEL); + if (!pqi_sas_phy) + return NULL; + + phy = sas_phy_alloc(pqi_sas_port->parent_node->parent_dev, + pqi_sas_port->next_phy_index); + if (!phy) { + kfree(pqi_sas_phy); + return NULL; + } + + pqi_sas_port->next_phy_index++; + pqi_sas_phy->phy = phy; + pqi_sas_phy->parent_port = pqi_sas_port; + + return pqi_sas_phy; +} + +static void pqi_free_sas_phy(struct pqi_sas_phy *pqi_sas_phy) +{ + struct sas_phy *phy = pqi_sas_phy->phy; + + sas_port_delete_phy(pqi_sas_phy->parent_port->port, phy); + if (pqi_sas_phy->added_to_port) + list_del(&pqi_sas_phy->phy_list_entry); + sas_phy_delete(phy); + kfree(pqi_sas_phy); +} + +static int pqi_sas_port_add_phy(struct pqi_sas_phy *pqi_sas_phy) +{ + int rc; + struct pqi_sas_port *pqi_sas_port; + struct sas_phy *phy; + struct sas_identify *identify; + + pqi_sas_port = pqi_sas_phy->parent_port; + phy = pqi_sas_phy->phy; + + identify = &phy->identify; + memset(identify, 0, sizeof(*identify)); + identify->sas_address = pqi_sas_port->sas_address; + identify->device_type = SAS_END_DEVICE; + identify->initiator_port_protocols = SAS_PROTOCOL_ALL; + identify->target_port_protocols = SAS_PROTOCOL_ALL; + phy->minimum_linkrate_hw = SAS_LINK_RATE_UNKNOWN; + phy->maximum_linkrate_hw = SAS_LINK_RATE_UNKNOWN; + phy->minimum_linkrate = SAS_LINK_RATE_UNKNOWN; + phy->maximum_linkrate = SAS_LINK_RATE_UNKNOWN; + phy->negotiated_linkrate = SAS_LINK_RATE_UNKNOWN; + + rc = sas_phy_add(pqi_sas_phy->phy); + if (rc) + return rc; + + sas_port_add_phy(pqi_sas_port->port, pqi_sas_phy->phy); + list_add_tail(&pqi_sas_phy->phy_list_entry, + &pqi_sas_port->phy_list_head); + pqi_sas_phy->added_to_port = true; + + return 0; +} + +static int pqi_sas_port_add_rphy(struct pqi_sas_port *pqi_sas_port, + struct sas_rphy *rphy) +{ + struct sas_identify *identify; + + identify = &rphy->identify; + identify->sas_address = pqi_sas_port->sas_address; + identify->phy_identifier = pqi_sas_port->device->phy_id; + + identify->initiator_port_protocols = SAS_PROTOCOL_ALL; + identify->target_port_protocols = SAS_PROTOCOL_STP; + + if (pqi_sas_port->device) { + switch (pqi_sas_port->device->device_type) { + case SA_DEVICE_TYPE_SAS: + case SA_DEVICE_TYPE_SES: + case SA_DEVICE_TYPE_NVME: + identify->target_port_protocols = SAS_PROTOCOL_SSP; + break; + case SA_DEVICE_TYPE_EXPANDER_SMP: + identify->target_port_protocols = SAS_PROTOCOL_SMP; + break; + case SA_DEVICE_TYPE_SATA: + default: + break; + } + } + + return sas_rphy_add(rphy); +} + +static struct sas_rphy *pqi_sas_rphy_alloc(struct pqi_sas_port *pqi_sas_port) +{ + if (pqi_sas_port->device && pqi_sas_port->device->is_expander_smp_device) + return sas_expander_alloc(pqi_sas_port->port, + SAS_FANOUT_EXPANDER_DEVICE); + + return sas_end_device_alloc(pqi_sas_port->port); +} + +static struct pqi_sas_port *pqi_alloc_sas_port( + struct pqi_sas_node *pqi_sas_node, u64 sas_address, + struct pqi_scsi_dev *device) +{ + int rc; + struct pqi_sas_port *pqi_sas_port; + struct sas_port *port; + + pqi_sas_port = kzalloc(sizeof(*pqi_sas_port), GFP_KERNEL); + if (!pqi_sas_port) + return NULL; + + INIT_LIST_HEAD(&pqi_sas_port->phy_list_head); + pqi_sas_port->parent_node = pqi_sas_node; + + 
port = sas_port_alloc_num(pqi_sas_node->parent_dev); + if (!port) + goto free_pqi_port; + + rc = sas_port_add(port); + if (rc) + goto free_sas_port; + + pqi_sas_port->port = port; + pqi_sas_port->sas_address = sas_address; + pqi_sas_port->device = device; + list_add_tail(&pqi_sas_port->port_list_entry, + &pqi_sas_node->port_list_head); + + return pqi_sas_port; + +free_sas_port: + sas_port_free(port); +free_pqi_port: + kfree(pqi_sas_port); + + return NULL; +} + +static void pqi_free_sas_port(struct pqi_sas_port *pqi_sas_port) +{ + struct pqi_sas_phy *pqi_sas_phy; + struct pqi_sas_phy *next; + + list_for_each_entry_safe(pqi_sas_phy, next, + &pqi_sas_port->phy_list_head, phy_list_entry) + pqi_free_sas_phy(pqi_sas_phy); + + sas_port_delete(pqi_sas_port->port); + list_del(&pqi_sas_port->port_list_entry); + kfree(pqi_sas_port); +} + +static struct pqi_sas_node *pqi_alloc_sas_node(struct device *parent_dev) +{ + struct pqi_sas_node *pqi_sas_node; + + pqi_sas_node = kzalloc(sizeof(*pqi_sas_node), GFP_KERNEL); + if (pqi_sas_node) { + pqi_sas_node->parent_dev = parent_dev; + INIT_LIST_HEAD(&pqi_sas_node->port_list_head); + } + + return pqi_sas_node; +} + +static void pqi_free_sas_node(struct pqi_sas_node *pqi_sas_node) +{ + struct pqi_sas_port *pqi_sas_port; + struct pqi_sas_port *next; + + if (!pqi_sas_node) + return; + + list_for_each_entry_safe(pqi_sas_port, next, + &pqi_sas_node->port_list_head, port_list_entry) + pqi_free_sas_port(pqi_sas_port); + + kfree(pqi_sas_node); +} + +struct pqi_scsi_dev *pqi_find_device_by_sas_rphy( + struct pqi_ctrl_info *ctrl_info, struct sas_rphy *rphy) +{ + struct pqi_scsi_dev *device; + + list_for_each_entry(device, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (!device->sas_port) + continue; + if (device->sas_port->rphy == rphy) + return device; + } + + return NULL; +} + +int pqi_add_sas_host(struct Scsi_Host *shost, struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct device *parent_dev; + struct pqi_sas_node *pqi_sas_node; + struct pqi_sas_port *pqi_sas_port; + struct pqi_sas_phy *pqi_sas_phy; + + parent_dev = &shost->shost_dev; + + pqi_sas_node = pqi_alloc_sas_node(parent_dev); + if (!pqi_sas_node) + return -ENOMEM; + + pqi_sas_port = pqi_alloc_sas_port(pqi_sas_node, + ctrl_info->sas_address, NULL); + if (!pqi_sas_port) { + rc = -ENODEV; + goto free_sas_node; + } + + pqi_sas_phy = pqi_alloc_sas_phy(pqi_sas_port); + if (!pqi_sas_phy) { + rc = -ENODEV; + goto free_sas_port; + } + + rc = pqi_sas_port_add_phy(pqi_sas_phy); + if (rc) + goto free_sas_phy; + + ctrl_info->sas_host = pqi_sas_node; + + return 0; + +free_sas_phy: + pqi_free_sas_phy(pqi_sas_phy); +free_sas_port: + pqi_free_sas_port(pqi_sas_port); +free_sas_node: + pqi_free_sas_node(pqi_sas_node); + + return rc; +} + +void pqi_delete_sas_host(struct pqi_ctrl_info *ctrl_info) +{ + pqi_free_sas_node(ctrl_info->sas_host); +} + +int pqi_add_sas_device(struct pqi_sas_node *pqi_sas_node, + struct pqi_scsi_dev *device) +{ + int rc; + struct pqi_sas_port *pqi_sas_port; + struct sas_rphy *rphy; + + pqi_sas_port = pqi_alloc_sas_port(pqi_sas_node, + device->sas_address, device); + if (!pqi_sas_port) + return -ENOMEM; + + rphy = pqi_sas_rphy_alloc(pqi_sas_port); + if (!rphy) { + rc = -ENODEV; + goto free_sas_port; + } + + pqi_sas_port->rphy = rphy; + device->sas_port = pqi_sas_port; + + rc = pqi_sas_port_add_rphy(pqi_sas_port, rphy); + if (rc) + goto free_sas_port; + + return 0; + +free_sas_port: + pqi_free_sas_port(pqi_sas_port); + device->sas_port = NULL; + + return rc; +} + +void 
pqi_remove_sas_device(struct pqi_scsi_dev *device) +{ + if (device->sas_port) { + pqi_free_sas_port(device->sas_port); + device->sas_port = NULL; + } +} + +static int pqi_sas_get_linkerrors(struct sas_phy *phy) +{ + return 0; +} + +static int pqi_sas_get_enclosure_identifier(struct sas_rphy *rphy, + u64 *identifier) +{ + int rc; + unsigned long flags; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *found_device; + struct pqi_scsi_dev *device; + + if (!rphy) + return -ENODEV; + + shost = rphy_to_shost(rphy); + ctrl_info = shost_to_hba(shost); + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + found_device = pqi_find_device_by_sas_rphy(ctrl_info, rphy); + + if (!found_device) { + rc = -ENODEV; + goto out; + } + + if (found_device->devtype == TYPE_ENCLOSURE) { + *identifier = get_unaligned_be64(&found_device->wwid[8]); + rc = 0; + goto out; + } + + if (found_device->box_index == 0xff || + found_device->phys_box_on_bus == 0 || + found_device->bay == 0xff) { + rc = -EINVAL; + goto out; + } + + list_for_each_entry(device, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (device->devtype == TYPE_ENCLOSURE && + device->box_index == found_device->box_index && + device->phys_box_on_bus == + found_device->phys_box_on_bus && + memcmp(device->phys_connector, + found_device->phys_connector, 2) == 0) { + *identifier = + get_unaligned_be64(&device->wwid[8]); + rc = 0; + goto out; + } + } + + if (found_device->phy_connected_dev_type != SA_DEVICE_TYPE_CONTROLLER) { + rc = -EINVAL; + goto out; + } + + list_for_each_entry(device, &ctrl_info->scsi_device_list, + scsi_device_list_entry) { + if (device->devtype == TYPE_ENCLOSURE && + CISS_GET_DRIVE_NUMBER(device->scsi3addr) == + PQI_VSEP_CISS_BTL) { + *identifier = get_unaligned_be64(&device->wwid[8]); + rc = 0; + goto out; + } + } + + rc = -EINVAL; +out: + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return rc; +} + +static int pqi_sas_get_bay_identifier(struct sas_rphy *rphy) +{ + int rc; + unsigned long flags; + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + struct Scsi_Host *shost; + + if (!rphy) + return -ENODEV; + + shost = rphy_to_shost(rphy); + ctrl_info = shost_to_hba(shost); + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + device = pqi_find_device_by_sas_rphy(ctrl_info, rphy); + + if (!device) { + rc = -ENODEV; + goto out; + } + + if (device->bay == 0xff) + rc = -EINVAL; + else + rc = device->bay; + +out: + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + return rc; +} + +static int pqi_sas_phy_reset(struct sas_phy *phy, int hard_reset) +{ + return 0; +} + +static int pqi_sas_phy_enable(struct sas_phy *phy, int enable) +{ + return 0; +} + +static int pqi_sas_phy_setup(struct sas_phy *phy) +{ + return 0; +} + +static void pqi_sas_phy_release(struct sas_phy *phy) +{ +} + +static int pqi_sas_phy_speed(struct sas_phy *phy, + struct sas_phy_linkrates *rates) +{ + return -EINVAL; +} + +#define CSMI_IOCTL_TIMEOUT 60 +#define SMP_CRC_FIELD_LENGTH 4 + +static struct bmic_csmi_smp_passthru_buffer * +pqi_build_csmi_smp_passthru_buffer(struct sas_rphy *rphy, + struct bsg_job *job) +{ + struct bmic_csmi_smp_passthru_buffer *smp_buf; + struct bmic_csmi_ioctl_header *ioctl_header; + struct bmic_csmi_smp_passthru *parameters; + u32 req_size; + u32 resp_size; + + smp_buf = kzalloc(sizeof(*smp_buf), GFP_KERNEL); + if (!smp_buf) + return NULL; + + req_size = job->request_payload.payload_len; + resp_size = 
job->reply_payload.payload_len; + + ioctl_header = &smp_buf->ioctl_header; + put_unaligned_le32(sizeof(smp_buf->ioctl_header), + &ioctl_header->header_length); + put_unaligned_le32(CSMI_IOCTL_TIMEOUT, &ioctl_header->timeout); + put_unaligned_le32(CSMI_CC_SAS_SMP_PASSTHRU, + &ioctl_header->control_code); + put_unaligned_le32(sizeof(smp_buf->parameters), &ioctl_header->length); + + parameters = &smp_buf->parameters; + parameters->phy_identifier = rphy->identify.phy_identifier; + parameters->port_identifier = 0; + parameters->connection_rate = 0; + put_unaligned_be64(rphy->identify.sas_address, + &parameters->destination_sas_address); + + if (req_size > SMP_CRC_FIELD_LENGTH) + req_size -= SMP_CRC_FIELD_LENGTH; + + put_unaligned_le32(req_size, &parameters->request_length); + put_unaligned_le32(resp_size, &parameters->response_length); + + sg_copy_to_buffer(job->request_payload.sg_list, + job->reply_payload.sg_cnt, &parameters->request, + req_size); + + return smp_buf; +} + +static unsigned int pqi_build_sas_smp_handler_reply( + struct bmic_csmi_smp_passthru_buffer *smp_buf, struct bsg_job *job, + struct pqi_raid_error_info *error_info) +{ + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, &smp_buf->parameters.response, + le32_to_cpu(smp_buf->parameters.response_length)); + + job->reply_len = le16_to_cpu(error_info->sense_data_length); + memcpy(job->reply, error_info->data, + le16_to_cpu(error_info->sense_data_length)); + + return job->reply_payload.payload_len - + get_unaligned_le32(&error_info->data_in_transferred); +} + +void pqi_sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) +{ + int rc; + struct pqi_ctrl_info *ctrl_info; + struct bmic_csmi_smp_passthru_buffer *smp_buf; + struct pqi_raid_error_info error_info; + unsigned int reslen = 0; + + ctrl_info = shost_to_hba(shost); + + if (job->reply_payload.payload_len == 0) { + rc = -ENOMEM; + goto out; + } + + if (!rphy) { + rc = -EINVAL; + goto out; + } + + if (rphy->identify.device_type != SAS_FANOUT_EXPANDER_DEVICE) { + rc = -EINVAL; + goto out; + } + + if (job->request_payload.sg_cnt > 1 || job->reply_payload.sg_cnt > 1) { + rc = -EINVAL; + goto out; + } + + smp_buf = pqi_build_csmi_smp_passthru_buffer(rphy, job); + if (!smp_buf) { + rc = -ENOMEM; + goto out; + } + + rc = pqi_csmi_smp_passthru(ctrl_info, smp_buf, sizeof(*smp_buf), + &error_info); + if (rc) + goto out; + + reslen = pqi_build_sas_smp_handler_reply(smp_buf, job, &error_info); + +out: + pqi_bsg_job_done(job, rc, reslen); +} + +struct sas_function_template pqi_sas_transport_functions = { + .get_linkerrors = pqi_sas_get_linkerrors, + .get_enclosure_identifier = pqi_sas_get_enclosure_identifier, + .get_bay_identifier = pqi_sas_get_bay_identifier, + .phy_reset = pqi_sas_phy_reset, + .phy_enable = pqi_sas_phy_enable, + .phy_setup = pqi_sas_phy_setup, + .phy_release = pqi_sas_phy_release, + .set_phy_speed = pqi_sas_phy_speed, + .smp_handler = PQI_SAS_SMP_HANDLER, +}; + diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sis.c b/drivers/amazon/scsi/smartpqi/smartpqi_sis.c new file mode 100644 index 0000000000000..5a6369668c382 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sis.c @@ -0,0 +1,511 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#include +#include +#include +#include +#include +#include +#include "smartpqi.h" +#include "smartpqi_sis.h" + +/* legacy SIS interface commands */ +#define SIS_CMD_GET_ADAPTER_PROPERTIES 0x19 +#define SIS_CMD_INIT_BASE_STRUCT_ADDRESS 0x1b +#define SIS_CMD_GET_PQI_CAPABILITIES 0x3000 + +/* for submission of legacy SIS commands */ +#define SIS_REENABLE_SIS_MODE 0x1 +#define SIS_ENABLE_MSIX 0x40 +#define SIS_ENABLE_INTX 0x80 +#define SIS_SOFT_RESET 0x100 +#define SIS_CMD_READY 0x200 +#define SIS_TRIGGER_SHUTDOWN 0x800000 +#define SIS_PQI_RESET_QUIESCE 0x1000000 + +#define SIS_CMD_COMPLETE 0x1000 +#define SIS_CLEAR_CTRL_TO_HOST_DOORBELL 0x1000 + +#define SIS_CMD_STATUS_SUCCESS 0x1 +#define SIS_CMD_COMPLETE_TIMEOUT_SECS (30 * HZ) +#define SIS_CMD_COMPLETE_POLL_INTERVAL_MSECS 10 + +/* used with SIS_CMD_GET_ADAPTER_PROPERTIES command */ +#define SIS_EXTENDED_PROPERTIES_SUPPORTED 0x800000 +#define SIS_SMARTARRAY_FEATURES_SUPPORTED 0x2 +#define SIS_PQI_MODE_SUPPORTED 0x4 +#define SIS_PQI_RESET_QUIESCE_SUPPORTED 0x8 +#define SIS_REQUIRED_EXTENDED_PROPERTIES \ + (SIS_SMARTARRAY_FEATURES_SUPPORTED | SIS_PQI_MODE_SUPPORTED) + +/* used with SIS_CMD_INIT_BASE_STRUCT_ADDRESS command */ +#define SIS_BASE_STRUCT_REVISION 9 +#define SIS_BASE_STRUCT_ALIGNMENT 16 + +#define SIS_CTRL_KERNEL_FW_TRIAGE 0x3 +#define SIS_CTRL_KERNEL_UP 0x80 +#define SIS_CTRL_KERNEL_PANIC 0x100 +#if TORTUGA +#define SIS_CTRL_READY_TIMEOUT_SECS (150 * HZ) +#else +#define SIS_CTRL_READY_TIMEOUT_SECS (180 * HZ) +#endif +#define SIS_CTRL_READY_RESUME_TIMEOUT_SECS (90 * HZ) +#define SIS_CTRL_READY_POLL_INTERVAL_MSECS 10 + +enum sis_fw_triage_status { + FW_TRIAGE_NOT_STARTED = 0, + FW_TRIAGE_STARTED, + FW_TRIAGE_COND_INVALID, + FW_TRIAGE_COMPLETED +}; + +#pragma pack(1) + +/* for use with SIS_CMD_INIT_BASE_STRUCT_ADDRESS command */ +struct sis_base_struct { + __le32 revision; /* revision of this structure */ + __le32 flags; /* reserved */ + __le32 error_buffer_paddr_low; /* lower 32 bits of physical memory */ + /* buffer for PQI error response */ + /* data */ + __le32 error_buffer_paddr_high; /* upper 32 bits of physical */ + /* memory buffer for PQI */ + /* error response data */ + __le32 error_buffer_element_length; /* length of each PQI error */ + /* response buffer element */ + /* in bytes */ + __le32 error_buffer_num_elements; /* total number of PQI error */ + /* response buffers available */ +}; + +#pragma pack() + +static int sis_wait_for_ctrl_ready_with_timeout(struct pqi_ctrl_info *ctrl_info, + unsigned int timeout_secs) +{ + unsigned long timeout; + u32 status; + + timeout = timeout_secs + jiffies; + + while (1) { + status = readl(&ctrl_info->registers->sis_firmware_status); + if (status != ~0) { + if (status & SIS_CTRL_KERNEL_PANIC) { + dev_err(&ctrl_info->pci_dev->dev, + "controller is offline: status code 0x%x\n", + readl( + &ctrl_info->registers->sis_mailbox[7])); + return -ENODEV; + } + if (status & SIS_CTRL_KERNEL_UP) + break; + } + if (time_after(jiffies, 
timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "controller not ready after %u seconds\n", + timeout_secs); + return -ETIMEDOUT; + } + msleep(SIS_CTRL_READY_POLL_INTERVAL_MSECS); + } + + return 0; +} + +int sis_wait_for_ctrl_ready(struct pqi_ctrl_info *ctrl_info) +{ + return sis_wait_for_ctrl_ready_with_timeout(ctrl_info, + SIS_CTRL_READY_TIMEOUT_SECS); +} + +int sis_wait_for_ctrl_ready_resume(struct pqi_ctrl_info *ctrl_info) +{ + return sis_wait_for_ctrl_ready_with_timeout(ctrl_info, + SIS_CTRL_READY_RESUME_TIMEOUT_SECS); +} + +bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info) +{ + bool running; + u32 status; + + status = readl(&ctrl_info->registers->sis_firmware_status); + + if (status & SIS_CTRL_KERNEL_PANIC) + running = false; + else + running = true; + + if (!running) + dev_err(&ctrl_info->pci_dev->dev, + "controller is offline: status code 0x%x\n", + readl(&ctrl_info->registers->sis_mailbox[7])); + + return running; +} + +bool sis_is_kernel_up(struct pqi_ctrl_info *ctrl_info) +{ + return readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_UP; +} + +u32 sis_get_product_id(struct pqi_ctrl_info *ctrl_info) +{ + return readl(&ctrl_info->registers->sis_product_identifier); +} + +/* used for passing command parameters/results when issuing SIS commands */ +struct sis_sync_cmd_params { + u32 mailbox[6]; /* mailboxes 0-5 */ +}; + +static int sis_send_sync_cmd(struct pqi_ctrl_info *ctrl_info, + u32 cmd, struct sis_sync_cmd_params *params) +{ + struct pqi_ctrl_registers __iomem *registers; + unsigned int i; + unsigned long timeout; + u32 doorbell; + u32 cmd_status; + + registers = ctrl_info->registers; + + /* Write the command to mailbox 0. */ + writel(cmd, ®isters->sis_mailbox[0]); + + /* + * Write the command parameters to mailboxes 1-4 (mailbox 5 is not used + * when sending a command to the controller). + */ + for (i = 1; i <= 4; i++) + writel(params->mailbox[i], ®isters->sis_mailbox[i]); + + /* Clear the command doorbell. */ + writel(SIS_CLEAR_CTRL_TO_HOST_DOORBELL, + ®isters->sis_ctrl_to_host_doorbell_clear); + + /* Disable doorbell interrupts by masking all interrupts. */ + writel(~0, ®isters->sis_interrupt_mask); + + /* + * Force the completion of the interrupt mask register write before + * submitting the command. + */ + readl(®isters->sis_interrupt_mask); + + /* Submit the command to the controller. */ + writel(SIS_CMD_READY, ®isters->sis_host_to_ctrl_doorbell); + + /* + * Poll for command completion. Note that the call to msleep() is at + * the top of the loop in order to give the controller time to start + * processing the command before we start polling. + */ + timeout = SIS_CMD_COMPLETE_TIMEOUT_SECS + jiffies; + while (1) { + msleep(SIS_CMD_COMPLETE_POLL_INTERVAL_MSECS); + doorbell = readl(®isters->sis_ctrl_to_host_doorbell); + if (doorbell & SIS_CMD_COMPLETE) + break; + if (time_after(jiffies, timeout)) + return -ETIMEDOUT; + } + + /* Read the command status from mailbox 0. */ + cmd_status = readl(®isters->sis_mailbox[0]); + if (cmd_status != SIS_CMD_STATUS_SUCCESS) { + dev_err(&ctrl_info->pci_dev->dev, + "SIS command failed for command 0x%x: status = 0x%x\n", + cmd, cmd_status); + return -EINVAL; + } + + /* + * The command completed successfully, so save the command status and + * read the values returned in mailboxes 1-5. 
+ */ + params->mailbox[0] = cmd_status; + for (i = 1; i < ARRAY_SIZE(params->mailbox); i++) + params->mailbox[i] = readl(®isters->sis_mailbox[i]); + + return 0; +} + +/* + * This function verifies that we are talking to a controller that speaks PQI. + */ + +int sis_get_ctrl_properties(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + u32 properties; + u32 extended_properties; + struct sis_sync_cmd_params params; + + memset(¶ms, 0, sizeof(params)); + + rc = sis_send_sync_cmd(ctrl_info, SIS_CMD_GET_ADAPTER_PROPERTIES, + ¶ms); + if (rc) + return rc; + + properties = params.mailbox[1]; + + if (!(properties & SIS_EXTENDED_PROPERTIES_SUPPORTED)) + return -ENODEV; + + extended_properties = params.mailbox[4]; + + if ((extended_properties & SIS_REQUIRED_EXTENDED_PROPERTIES) != + SIS_REQUIRED_EXTENDED_PROPERTIES) + return -ENODEV; + + if (extended_properties & SIS_PQI_RESET_QUIESCE_SUPPORTED) + ctrl_info->pqi_reset_quiesce_supported = true; + + return 0; +} + +int sis_get_pqi_capabilities(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + struct sis_sync_cmd_params params; + + memset(¶ms, 0, sizeof(params)); + + rc = sis_send_sync_cmd(ctrl_info, SIS_CMD_GET_PQI_CAPABILITIES, + ¶ms); + if (rc) + return rc; + + ctrl_info->max_sg_entries = params.mailbox[1]; + ctrl_info->max_transfer_size = params.mailbox[2]; + ctrl_info->max_outstanding_requests = params.mailbox[3]; + ctrl_info->config_table_offset = params.mailbox[4]; + ctrl_info->config_table_length = params.mailbox[5]; + + return 0; +} + +int sis_init_base_struct_addr(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + void *base_struct_unaligned; + struct sis_base_struct *base_struct; + struct sis_sync_cmd_params params; + unsigned long error_buffer_paddr; + dma_addr_t bus_address; + + base_struct_unaligned = kzalloc(sizeof(*base_struct) + + SIS_BASE_STRUCT_ALIGNMENT - 1, GFP_KERNEL); + if (!base_struct_unaligned) + return -ENOMEM; + + base_struct = PTR_ALIGN(base_struct_unaligned, + SIS_BASE_STRUCT_ALIGNMENT); + error_buffer_paddr = (unsigned long)ctrl_info->error_buffer_dma_handle; + + put_unaligned_le32(SIS_BASE_STRUCT_REVISION, &base_struct->revision); + put_unaligned_le32(lower_32_bits(error_buffer_paddr), + &base_struct->error_buffer_paddr_low); + put_unaligned_le32(upper_32_bits(error_buffer_paddr), + &base_struct->error_buffer_paddr_high); + put_unaligned_le32(PQI_ERROR_BUFFER_ELEMENT_LENGTH, + &base_struct->error_buffer_element_length); + put_unaligned_le32(ctrl_info->max_io_slots, + &base_struct->error_buffer_num_elements); + + bus_address = dma_map_single(&ctrl_info->pci_dev->dev, base_struct, + sizeof(*base_struct), DMA_TO_DEVICE); + if (dma_mapping_error(&ctrl_info->pci_dev->dev, bus_address)) { + rc = -ENOMEM; + goto out; + } + + memset(¶ms, 0, sizeof(params)); + params.mailbox[1] = lower_32_bits((u64)bus_address); + params.mailbox[2] = upper_32_bits((u64)bus_address); + params.mailbox[3] = sizeof(*base_struct); + + rc = sis_send_sync_cmd(ctrl_info, SIS_CMD_INIT_BASE_STRUCT_ADDRESS, + ¶ms); + + dma_unmap_single(&ctrl_info->pci_dev->dev, bus_address, + sizeof(*base_struct), DMA_TO_DEVICE); +out: + kfree(base_struct_unaligned); + + return rc; +} + +#define SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS (30 * HZ) + +static int sis_wait_for_doorbell_bit_to_clear( + struct pqi_ctrl_info *ctrl_info, u32 bit) +{ + int rc = 0; + u32 doorbell_register; + unsigned long timeout; + + timeout = SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS + jiffies; + + while (1) { + doorbell_register = + readl(&ctrl_info->registers->sis_host_to_ctrl_doorbell); + if ((doorbell_register & 
bit) == 0) + break; + if (readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_PANIC) { + rc = -ENODEV; + break; + } + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "doorbell register bit 0x%x not cleared\n", + bit); + rc = -ETIMEDOUT; + break; + } + msleep(1); + } + + return rc; +} + +static inline int sis_set_doorbell_bit(struct pqi_ctrl_info *ctrl_info, u32 bit) +{ + writel(bit, &ctrl_info->registers->sis_host_to_ctrl_doorbell); + + return sis_wait_for_doorbell_bit_to_clear(ctrl_info, bit); +} + +void sis_enable_msix(struct pqi_ctrl_info *ctrl_info) +{ + sis_set_doorbell_bit(ctrl_info, SIS_ENABLE_MSIX); +} + +void sis_enable_intx(struct pqi_ctrl_info *ctrl_info) +{ + sis_set_doorbell_bit(ctrl_info, SIS_ENABLE_INTX); +} + +void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason) +{ + if (readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_PANIC) + return; + + if (ctrl_info->firmware_triage_supported) + writel(ctrl_shutdown_reason, &ctrl_info->registers->sis_ctrl_shutdown_reason_code); + + writel(SIS_TRIGGER_SHUTDOWN, &ctrl_info->registers->sis_host_to_ctrl_doorbell); +} + +int sis_pqi_reset_quiesce(struct pqi_ctrl_info *ctrl_info) +{ + return sis_set_doorbell_bit(ctrl_info, SIS_PQI_RESET_QUIESCE); +} + +int sis_reenable_sis_mode(struct pqi_ctrl_info *ctrl_info) +{ + return sis_set_doorbell_bit(ctrl_info, SIS_REENABLE_SIS_MODE); +} + +void sis_write_driver_scratch(struct pqi_ctrl_info *ctrl_info, u32 value) +{ + writel(value, &ctrl_info->registers->sis_driver_scratch); +} + +u32 sis_read_driver_scratch(struct pqi_ctrl_info *ctrl_info) +{ + return readl(&ctrl_info->registers->sis_driver_scratch); +} + +static inline enum sis_fw_triage_status + sis_read_firmware_triage_status(struct pqi_ctrl_info *ctrl_info) +{ + return ((enum sis_fw_triage_status)(readl(&ctrl_info->registers->sis_firmware_status) & + SIS_CTRL_KERNEL_FW_TRIAGE)); +} + +void sis_soft_reset(struct pqi_ctrl_info *ctrl_info) +{ + writel(SIS_SOFT_RESET, + &ctrl_info->registers->sis_host_to_ctrl_doorbell); +} + +#define SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS (300 * HZ) +#define SIS_FW_TRIAGE_STATUS_POLL_INTERVAL_SECS 1 + +int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info) +{ + int rc; + enum sis_fw_triage_status status; + unsigned long timeout; + + timeout = SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS + jiffies; + while (1) { + status = sis_read_firmware_triage_status(ctrl_info); + if (status == FW_TRIAGE_COND_INVALID) { + dev_err(&ctrl_info->pci_dev->dev, + "firmware triage condition invalid\n"); + rc = -EINVAL; + break; + } else if (status == FW_TRIAGE_NOT_STARTED || + status == FW_TRIAGE_COMPLETED) { + rc = 0; + break; + } + + if (time_after(jiffies, timeout)) { + dev_err(&ctrl_info->pci_dev->dev, + "timed out waiting for firmware triage status\n"); + rc = -ETIMEDOUT; + break; + } + + ssleep(SIS_FW_TRIAGE_STATUS_POLL_INTERVAL_SECS); + } + + return rc; + +} + +static void __attribute__((unused)) verify_structures(void) +{ + BUILD_BUG_ON(offsetof(struct sis_base_struct, + revision) != 0x0); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + flags) != 0x4); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_paddr_low) != 0x8); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_paddr_high) != 0xc); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_element_length) != 0x10); + BUILD_BUG_ON(offsetof(struct sis_base_struct, + error_buffer_num_elements) != 0x14); + 
BUILD_BUG_ON(sizeof(struct sis_base_struct) != 0x18); +} + diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sis.h b/drivers/amazon/scsi/smartpqi/smartpqi_sis.h new file mode 100644 index 0000000000000..5a265d52e3585 --- /dev/null +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sis.h @@ -0,0 +1,42 @@ +/* + * driver for Microchip PQI-based storage controllers + * Copyright (c) 2019-2021 Microchip Technology Inc. and its subsidiaries + * Copyright (c) 2016-2018 Microsemi Corporation + * Copyright (c) 2016 PMC-Sierra, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * Questions/Comments/Bugfixes to storagedev@microchip.com + * + */ + +#if !defined(_SMARTPQI_SIS_H) +#define _SMARTPQI_SIS_H + +int sis_wait_for_ctrl_ready(struct pqi_ctrl_info *ctrl_info); +int sis_wait_for_ctrl_ready_resume(struct pqi_ctrl_info *ctrl_info); +bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info); +bool sis_is_kernel_up(struct pqi_ctrl_info *ctrl_info); +int sis_get_ctrl_properties(struct pqi_ctrl_info *ctrl_info); +int sis_get_pqi_capabilities(struct pqi_ctrl_info *ctrl_info); +int sis_init_base_struct_addr(struct pqi_ctrl_info *ctrl_info); +void sis_enable_msix(struct pqi_ctrl_info *ctrl_info); +void sis_enable_intx(struct pqi_ctrl_info *ctrl_info); +void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info, + enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason); +int sis_pqi_reset_quiesce(struct pqi_ctrl_info *ctrl_info); +int sis_reenable_sis_mode(struct pqi_ctrl_info *ctrl_info); +void sis_write_driver_scratch(struct pqi_ctrl_info *ctrl_info, u32 value); +u32 sis_read_driver_scratch(struct pqi_ctrl_info *ctrl_info); +void sis_soft_reset(struct pqi_ctrl_info *ctrl_info); +u32 sis_get_product_id(struct pqi_ctrl_info *ctrl_info); +int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info); + +#endif /* _SMARTPQI_SIS_H */ From debaf993e14a4bc4f7186572dde4d924e532e3d1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:43 +0000 Subject: [PATCH 367/737] bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID The currently available bpf_get_current_task returns an unsigned integer which can be used along with BPF_CORE_READ to read data from the task_struct but still cannot be used as an input argument to a helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct. In order to implement this helper a new return type, RET_PTR_TO_BTF_ID, is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not require checking the nullness of returned pointer. 
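A minimal BPF-side sketch of how the new helper is intended to be used, assuming a vmlinux.h generated with bpftool plus libbpf's bpf_helpers.h/bpf_tracing.h; the attach point and program name are illustrative only:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tp_btf/sched_switch")              /* illustrative attach point */
int BPF_PROG(show_current)
{
        /* The return type is RET_PTR_TO_BTF_ID, so the verifier treats the
         * pointer as non-NULL and allows direct BTF-checked loads without
         * bpf_probe_read() and without a NULL check. */
        struct task_struct *task = bpf_get_current_task_btf();

        bpf_printk("current pid=%d", task->pid);
        return 0;
}
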
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-6-kpsingh@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/verifier.c | 7 +++++-- kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5fdc7ec87961e..8e2145767ea95 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,6 +311,7 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 20a6a9282a571..12c03ebcf81e0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3752,6 +3752,14 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3910,6 +3918,7 @@ union bpf_attr { FN(per_cpu_ptr), \ FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index edb19ada0405d..fc642715bd46c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5589,11 +5589,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? 
+ PTR_TO_BTF_ID : + PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %d of func %s#%d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 46e7d3b67eb5b..d223b45f57165 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1021,6 +1021,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1278,6 +1292,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c38cb426ee15c..8d8653a334530 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3752,6 +3752,14 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3910,6 +3918,7 @@ union bpf_attr { FN(per_cpu_ptr), \ FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From afcbe5792a2a32c542d1065351124585d2b447f5 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:44 -0800 Subject: [PATCH 368/737] bpf: Introduce composable reg, ret and arg types. There are some common properties shared between bpf reg, ret and arg values. For instance, a value may be a NULL pointer, or a pointer to a read-only memory. Previously, to express these properties, enumeration was used. For example, in order to test whether a reg value can be NULL, reg_type_may_be_null() simply enumerates all types that are possibly NULL. The problem of this approach is that it's not scalable and causes a lot of duplication. These properties can be combined, for example, a type could be either MAYBE_NULL or RDONLY, or both. This patch series rewrites the layout of reg_type, arg_type and ret_type, so that common properties can be extracted and represented as composable flag. For example, one can write ARG_PTR_TO_MEM | PTR_MAYBE_NULL which is equivalent to the previous ARG_PTR_TO_MEM_OR_NULL The type ARG_PTR_TO_MEM are called "base type" in this patch. Base types can be extended with flags. A flag occupies the higher bits while base types sits in the lower bits. This patch in particular sets up a set of macro for this purpose. The following patches will rewrite arg_types, ret_types and reg_types respectively. 
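A standalone sketch (plain userspace C, not kernel code) of the bit layout described above: the low BPF_BASE_TYPE_BITS carry the base type and the higher bits carry flags such as PTR_MAYBE_NULL. The numeric value chosen for ARG_PTR_TO_MEM is illustrative; only the base/flag split mirrors the kernel definitions.

#include <stdio.h>

#define BPF_BASE_TYPE_BITS      8
#define BPF_BASE_TYPE_MASK      ((1u << BPF_BASE_TYPE_BITS) - 1)
#define PTR_MAYBE_NULL          (1u << BPF_BASE_TYPE_BITS)     /* first flag bit */

enum { ARG_PTR_TO_MEM = 4 };                    /* illustrative base type value */

static unsigned int base_type(unsigned int t) { return t & BPF_BASE_TYPE_MASK; }
static unsigned int type_flag(unsigned int t) { return t & ~BPF_BASE_TYPE_MASK; }

int main(void)
{
        /* equivalent to the old ARG_PTR_TO_MEM_OR_NULL */
        unsigned int t = ARG_PTR_TO_MEM | PTR_MAYBE_NULL;

        printf("base=%u maybe_null=%u\n",
               base_type(t), !!(type_flag(t) & PTR_MAYBE_NULL));
        return 0;
}
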
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-2-haoluo@google.com --- include/linux/bpf.h | 43 ++++++++++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 14 ++++++++++++ 2 files changed, 57 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8e2145767ea95..0fae7dfd0c3bb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,6 +254,29 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, extern const struct bpf_map_ops bpf_map_offload_ops; +/* bpf_type_flag contains a set of flags that are applicable to the values of + * arg_type, ret_type and reg_type. For example, a pointer value may be null, + * or a memory is read-only. We classify types into two categories: base types + * and extended types. Extended types are base types combined with a type flag. + * + * Currently there are no more than 32 base types in arg_type, ret_type and + * reg_types. + */ +#define BPF_BASE_TYPE_BITS 8 + +enum bpf_type_flag { + /* PTR may be NULL. */ + PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, +}; + +/* Max number of base types. */ +#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) + +/* Max number of all types. */ +#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -296,7 +319,13 @@ enum bpf_arg_type { ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { @@ -312,7 +341,14 @@ enum bpf_return_type { RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ + __BPF_RET_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL @@ -411,7 +447,14 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + __BPF_REG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. 
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4d37c69e76b17..71192aa285df7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -509,4 +509,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, u32 btf_id, struct bpf_attach_target_info *tgt_info); +#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) + +/* extract base type from bpf_{arg, return, reg}_type. */ +static inline u32 base_type(u32 type) +{ + return type & BPF_BASE_TYPE_MASK; +} + +/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ +static inline u32 type_flag(u32 type) +{ + return type & ~BPF_BASE_TYPE_MASK; +} + #endif /* _LINUX_BPF_VERIFIER_H */ From 6418cea3da5ecfc54ca1c6232f101648d9209501 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:45 -0800 Subject: [PATCH 369/737] bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_arg composable, by reserving high bits of bpf_arg to represent flags of a type. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. When applying this flag to an arg_type, it means the arg can take NULL pointer. This patch switches the qualified arg_types to use this flag. The arg_types changed in this patch include: 1. ARG_PTR_TO_MAP_VALUE_OR_NULL 2. ARG_PTR_TO_MEM_OR_NULL 3. ARG_PTR_TO_CTX_OR_NULL 4. ARG_PTR_TO_SOCKET_OR_NULL 5. ARG_PTR_TO_ALLOC_MEM_OR_NULL 6. ARG_PTR_TO_STACK_OR_NULL This patch does not eliminate the use of these arg_types, instead it makes them an alias to the 'ARG_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com --- include/linux/bpf.h | 12 +++++++----- kernel/bpf/verifier.c | 36 +++++++++++++----------------------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0fae7dfd0c3bb..7710739b255f8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -288,13 +288,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. 
@@ -304,22 +302,26 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, + /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc642715bd46c..b71ddc0e04324 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -437,13 +437,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } -static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + return type & PTR_MAYBE_NULL; } /* Determine whether the function releases some resources allocated by another @@ -4335,9 +4331,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -4464,26 +4459,21 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = 
&mem_types, [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -4498,7 +4488,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j; - compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; @@ -4579,15 +4569,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } - if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -4634,10 +4623,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ From 6016261216b2603726d8e34bf08ba39b3f3f9843 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:46 -0800 Subject: [PATCH 370/737] bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_ret composable, by reserving high bits to represent flags. One of the flag is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When applying this flag to ret_types, it means the returned value could be a NULL pointer. This patch switches the qualified arg_types to use this flag. The ret_types changed in this patch include: 1. RET_PTR_TO_MAP_VALUE_OR_NULL 2. RET_PTR_TO_SOCKET_OR_NULL 3. RET_PTR_TO_TCP_SOCK_OR_NULL 4. RET_PTR_TO_SOCK_COMMON_OR_NULL 5. RET_PTR_TO_ALLOC_MEM_OR_NULL 6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL 7. RET_PTR_TO_BTF_ID_OR_NULL This patch doesn't eliminate the use of these names, instead it makes them aliases to 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'. 
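The hunks below convert bpf_per_cpu_ptr_proto to this composed spelling. As a sketch of the pattern for any helper prototype (the helper name and its BTF ID list here are hypothetical, not from this patch), the legacy alias and the composed form now mean the same thing to the verifier, which looks only at base_type() and the PTR_MAYBE_NULL flag:

BTF_ID_LIST_SINGLE(bpf_foo_btf_ids, struct, task_struct)

static const struct bpf_func_proto bpf_foo_lookup_proto = {
        .func           = bpf_foo_lookup,       /* hypothetical helper */
        .gpl_only       = true,
        /* Equivalent to the legacy alias RET_PTR_TO_BTF_ID_OR_NULL. */
        .ret_type       = RET_PTR_TO_BTF_ID | PTR_MAYBE_NULL,
        .ret_btf_id     = &bpf_foo_btf_ids[0],
};
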
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com --- include/linux/bpf.h | 19 ++++++++++------ kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 52 +++++++++++++++++++++---------------------- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7710739b255f8..2255f9fd2d755 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,17 +334,22 @@ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, + /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. 
*/ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0efe7c7bfe5e9..606d02964a1a9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -642,7 +642,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b71ddc0e04324..ab215f954d060 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5400,6 +5400,7 @@ static int check_reference_leak(struct bpf_verifier_env *env) static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5511,13 +5512,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -5530,28 +5531,27 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (type_may_be_null(ret_type)) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5570,33 +5570,33 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].type = - 
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_MEM_OR_NULL : PTR_TO_MEM; regs[BPF_REG_0].mem_size = tsize; } else { regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? - PTR_TO_BTF_ID : - PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_BTF_ID_OR_NULL : + PTR_TO_BTF_ID; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } From 65fbd9aa5b8bc6bbd1995713ece09800d9c3bf82 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Feb 2021 00:56:40 +0400 Subject: [PATCH 371/737] bpf: Extract nullable reg type conversion into a helper function Extract conversion from a register's nullable type to a type with a value. The helper will be used in mark_ptr_not_null_reg(). Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210212205642.620788-3-me@ubique.spb.ru --- kernel/bpf/verifier.c | 83 +++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab215f954d060..ad1088a6f2fe6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1082,6 +1082,51 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -7823,43 
+7868,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reference(). * From b0111f55320845d4ff8c21f8ed81374317a58912 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:47 -0800 Subject: [PATCH 372/737] bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_reg composable, by allocating bits in the type to represent flags. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. This patch switches the qualified reg_types to use this flag. The reg_types changed in this patch include: 1. PTR_TO_MAP_VALUE_OR_NULL 2. PTR_TO_SOCKET_OR_NULL 3. PTR_TO_SOCK_COMMON_OR_NULL 4. PTR_TO_TCP_SOCK_OR_NULL 5. PTR_TO_BTF_ID_OR_NULL 6. PTR_TO_MEM_OR_NULL 7. PTR_TO_RDONLY_BUF_OR_NULL 8. 
PTR_TO_RDWR_BUF_OR_NULL Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com --- include/linux/bpf.h | 17 ++- include/linux/bpf_verifier.h | 4 + kernel/bpf/btf.c | 7 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 288 +++++++++++++++-------------------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 7 files changed, 144 insertions(+), 180 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2255f9fd2d755..c139f6b80654c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -417,18 +417,15 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -446,16 +443,20 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ __BPF_REG_TYPE_MAX, + /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 71192aa285df7..d47f127fcf6e9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -17,6 +17,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64 /* Liveness marks, used for registers and spilled-regs (in stack slots). 
* Read marks propagate upwards until they find a write mark; they record that @@ -462,6 +464,8 @@ struct bpf_verifier_env { u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 06c028bdb8d4d..bee03bbd75442 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4535,10 +4535,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag; + type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212a..631f0e44b7a9e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ad1088a6f2fe6..04473f23ccfb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -404,18 +404,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -424,12 +412,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -492,37 +477,50 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - 
[PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID || + base_type(type) == PTR_TO_PERCPU_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", + str[base_type(type)], postfix); + return env->type_str_buf; +} static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -588,7 +586,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && @@ -596,9 +594,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID || + base_type(t) == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -607,9 +604,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); @@ -679,7 +676,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = 
reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1084,8 +1081,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { - switch (reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: { + if (base_type(reg->type) == PTR_TO_MAP_VALUE) { const struct bpf_map *map = reg->map_ptr; if (map->inner_map_meta) { @@ -1099,32 +1095,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) } else { reg->type = PTR_TO_MAP_VALUE; } - break; - } - case PTR_TO_SOCKET_OR_NULL: - reg->type = PTR_TO_SOCKET; - break; - case PTR_TO_SOCK_COMMON_OR_NULL: - reg->type = PTR_TO_SOCK_COMMON; - break; - case PTR_TO_TCP_SOCK_OR_NULL: - reg->type = PTR_TO_TCP_SOCK; - break; - case PTR_TO_BTF_ID_OR_NULL: - reg->type = PTR_TO_BTF_ID; - break; - case PTR_TO_MEM_OR_NULL: - reg->type = PTR_TO_MEM; - break; - case PTR_TO_RDONLY_BUF_OR_NULL: - reg->type = PTR_TO_RDONLY_BUF; - break; - case PTR_TO_RDWR_BUF_OR_NULL: - reg->type = PTR_TO_RDWR_BUF; - break; - default: - WARN_ON("unknown nullable register type"); + return; } + + reg->type &= ~PTR_MAYBE_NULL; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -1622,7 +1596,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2264,9 +2238,8 @@ static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int fr static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2275,21 +2248,14 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: return true; default: return false; @@ -3146,7 +3112,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(*reg_type) == PTR_TO_BTF_ID) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3212,7 +3178,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, } verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -3952,7 +3918,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -3960,8 +3926,7 @@ static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(reg_type) == PTR_TO_BTF_ID) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -4012,7 +3977,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -4031,7 +3996,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_RDONLY_BUF) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_buffer_access(env, reg, regno, off, size, false, @@ -4047,7 +4012,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EACCES; } @@ -4090,7 +4055,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -4286,9 +4251,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } @@ -4548,10 +4513,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: @@ -5446,6 +5411,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn { const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5558,6 +5524,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* update return register (already marked as written above) */ ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -5576,25 +5543,23 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (type_may_be_null(ret_type)) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - 
if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; @@ -5614,23 +5579,17 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_MEM_OR_NULL : PTR_TO_MEM; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = (ret_type & PTR_MAYBE_NULL) ? 
- PTR_TO_BTF_ID_OR_NULL : - PTR_TO_BTF_ID; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", @@ -5645,7 +5604,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { @@ -5746,25 +5705,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; } @@ -6141,11 +6100,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch (base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -6158,10 +6119,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: reject: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: - if (reg_type_may_be_null(ptr_reg->type)) + if (type_may_be_null(ptr_reg->type)) goto reject; break; } @@ -7854,7 +7815,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || @@ -8204,7 +8165,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. 
*/ @@ -8433,7 +8394,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -8494,7 +8455,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -9242,7 +9203,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -9263,6 +9224,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; } case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -9274,20 +9251,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. - */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -9316,11 +9279,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -9834,17 +9794,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* Return true if it's OK to have the same insn return a different type. 
*/ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -10062,7 +10018,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a811fe0f0f6fd..ba4e1df72ce54 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -867,7 +867,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f375ef1501490..f8b03fdfbcfb6 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1626,7 +1626,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, From dd04d374d94f8f14f46ce6f126dd8b64308cab66 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:48 -0800 Subject: [PATCH 373/737] bpf: Introduce MEM_RDONLY flag This patch introduce a flag MEM_RDONLY to tag a reg value pointing to read-only memory. It makes the following changes: 1. PTR_TO_RDWR_BUF -> PTR_TO_BUF 2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com --- include/linux/bpf.h | 8 ++-- kernel/bpf/btf.c | 3 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 84 +++++++++++++++++++++++---------------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 6 files changed, 60 insertions(+), 43 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c139f6b80654c..6872758566ea7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,10 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, + /* MEM is read-only. */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_RDONLY, }; /* Max number of base types. */ @@ -444,8 +447,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. 
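 *
 * (Illustrative note, based on how base_type()/type_flag() are used in this
 * series: a composed value such as PTR_TO_BUF | MEM_RDONLY decomposes into
 * base_type() == PTR_TO_BUF and type_flag() == MEM_RDONLY, which is why the
 * two buffer types below can collapse into the single PTR_TO_BUF entry.)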
*/ PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ __BPF_REG_TYPE_MAX, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bee03bbd75442..5e13f6a8b9b73 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4539,8 +4539,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); - if (ctx_arg_info->offset == off && - (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 631f0e44b7a9e..b0fa190b09790 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04473f23ccfb6..1edd859d88859 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -417,6 +417,11 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) base_type(type) == PTR_TO_MEM; } +static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; +} + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { return type == ARG_PTR_TO_SOCK_COMMON; @@ -485,7 +490,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}; + char postfix[16] = {0}, prefix[16] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "inv", @@ -505,8 +510,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_BUF] = "buf", }; if (type & PTR_MAYBE_NULL) { @@ -517,8 +521,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", - str[base_type(type)], postfix); + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 16); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); return env->type_str_buf; } @@ -2252,8 +2259,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: return true; @@ -3993,22 +3999,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); - return -EACCES; + } else if 
(base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + const char *buf_info; + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + buf_info, max_access); + + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -4211,8 +4223,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const char *buf_info; + u32 *max_access; - switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4228,18 +4242,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + buf_info, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4442,8 +4458,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_BUF, + PTR_TO_BUF | MEM_RDONLY, }, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index ba4e1df72ce54..3fad2f5b920e0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -867,7 +867,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f8b03fdfbcfb6..6da240ab01701 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1626,7 +1626,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, From 
01a59f10d4f358a7d5f8850b7765bde25c2110a4 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:49 -0800 Subject: [PATCH 374/737] bpf: Convert PTR_TO_MEM_OR_NULL to composable types. Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with flag PTR_MAYBE_NULL. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6872758566ea7..e6c71590d0b9c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -457,7 +457,6 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, - PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. From a0a6093c4ecd9c387e95fbfe8175921a0ad37310 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:50 -0800 Subject: [PATCH 375/737] bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM. Tag the return type of {per, this}_cpu_ptr with RDONLY_MEM. The returned value of this pair of helpers is kernel object, which can not be updated by bpf programs. Previously these two helpers return PTR_OT_MEM for kernel objects of scalar type, which allows one to directly modify the memory. Now with RDONLY_MEM tagging, the verifier will reject programs that write into RDONLY_MEM. Fixes: 63d9b80dcf2c ("bpf: Introducte bpf_this_cpu_ptr()") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id") Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-8-haoluo@google.com --- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 606d02964a1a9..41059e9a03d31 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -642,7 +642,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -655,7 +655,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1edd859d88859..f40fdc0e68aa3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3887,15 +3887,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d 
cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -5598,6 +5613,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -8236,7 +8258,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; @@ -10242,7 +10264,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, tname, PTR_ERR(ret)); return -EINVAL; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; From 7506e53c5ce2dfbd1a6bae810441131a5e5e8345 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:51 -0800 Subject: [PATCH 376/737] bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem. Some helper functions may modify its arguments, for example, bpf_d_path, bpf_get_stack etc. Previously, their argument types were marked as ARG_PTR_TO_MEM, which is compatible with read-only mem types, such as PTR_TO_RDONLY_BUF. Therefore it's legitimate, but technically incorrect, to modify a read-only memory by passing it into one of such helper functions. This patch tags the bpf_args compatible with immutable memory with MEM_RDONLY flag. The arguments that don't have this flag will be only compatible with mutable memory types, preventing the helper from modifying a read-only memory. The bpf_args that have MEM_RDONLY are compatible with both mutable memory and immutable memory. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com --- include/linux/bpf.h | 4 ++- kernel/bpf/cgroup.c | 2 +- kernel/bpf/helpers.c | 6 ++-- kernel/bpf/ringbuf.c | 2 +- kernel/bpf/verifier.c | 35 +++++++++++++++++++++-- kernel/trace/bpf_trace.c | 20 ++++++------- net/core/filter.c | 62 ++++++++++++++++++++-------------------- 7 files changed, 81 insertions(+), 50 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e6c71590d0b9c..629f219b601d4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,9 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - /* MEM is read-only. */ + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. 
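+ * (Example of the intended compatibility, restated from the check_reg_type()
+ * comment added later in this patch: ARG_PTR_TO_MEM | MEM_RDONLY accepts both
+ * PTR_TO_MEM and PTR_TO_MEM | MEM_RDONLY registers, while a plain
+ * ARG_PTR_TO_MEM accepts only the writable PTR_TO_MEM.)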
+ */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), __BPF_TYPE_LAST_FLAG = MEM_RDONLY, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 85927c2aa3433..54321df6cfac6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1738,7 +1738,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41059e9a03d31..1a83f0572deff 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -533,7 +533,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -605,7 +605,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 1e4bf23528a3d..d6fbe17432ae5 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -463,7 +463,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f40fdc0e68aa3..0e0ff32adabc4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4474,7 +4474,6 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_BUF, - PTR_TO_BUF | MEM_RDONLY, }, }; @@ -4534,6 +4533,36 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; } + + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. + */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; + + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. 
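+ *
+ * Worked example (illustrative): with arg_type = ARG_PTR_TO_MEM | MEM_RDONLY
+ * and a register of type PTR_TO_BUF | MEM_RDONLY, the MEM_RDONLY bit is
+ * cleared from 'type' just below, so the loop over compatible->types[] only
+ * needs to list the bare PTR_TO_BUF entry.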
+ */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; @@ -4544,14 +4573,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d223b45f57165..279bf9042cd26 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -342,7 +342,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -545,7 +545,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; @@ -754,9 +754,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -771,7 +771,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -795,7 +795,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -956,7 +956,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1438,7 +1438,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1656,7 +1656,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1710,7 +1710,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, 
.arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/net/core/filter.c b/net/core/filter.c index a887814660681..c7520126c32dc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -2021,9 +2021,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2570,7 +2570,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -4193,7 +4193,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4207,7 +4207,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4390,7 +4390,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4416,7 +4416,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4586,7 +4586,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4600,7 +4600,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -5030,7 +5030,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5064,7 +5064,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, 
.arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5239,7 +5239,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -5700,7 +5700,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5710,7 +5710,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5753,7 +5753,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5841,7 +5841,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6088,7 +6088,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6107,7 +6107,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6126,7 +6126,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6163,7 +6163,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6186,7 +6186,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6209,7 +6209,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6228,7 +6228,7 @@ static const struct 
bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6247,7 +6247,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6266,7 +6266,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6588,9 +6588,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6657,9 +6657,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6888,7 +6888,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; From 4b8316534352aec2562d4ac41fad8916bc3559f8 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:52 -0800 Subject: [PATCH 377/737] bpf/selftests: Test PTR_TO_RDONLY_MEM This test verifies that a ksym of non-struct can not be directly updated. 
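For reference, the verifier-side check this selftest exercises is the one
added by the earlier "bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM." patch;
sketched here for context only, it is not new code in this change:

    /* in check_mem_access(): stores through MEM_RDONLY pointers are refused */
    if (t == BPF_WRITE && rdonly_mem) {
            verbose(env, "R%d cannot write into %s\n",
                    regno, reg_type_str(env, reg->type));
            return -EACCES;
    }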
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211217003152.48334-10-haoluo@google.com --- .../selftests/bpf/prog_tests/ksyms_btf.c | 14 +++++++++ .../bpf/progs/test_ksyms_btf_write_check.c | 29 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index b58b775d19f3f..97f38d4f6a263 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -6,6 +6,7 @@ #include #include "test_ksyms_btf.skel.h" #include "test_ksyms_btf_null_check.skel.h" +#include "test_ksyms_btf_write_check.skel.h" static int duration; @@ -81,6 +82,16 @@ static void test_null_check(void) test_ksyms_btf_null_check__destroy(skel); } +static void test_write_check(void) +{ + struct test_ksyms_btf_write_check *skel; + + skel = test_ksyms_btf_write_check__open_and_load(); + CHECK(skel, "skel_open", "unexpected load of a prog writing to ksym memory\n"); + + test_ksyms_btf_write_check__destroy(skel); +} + void test_ksyms_btf(void) { int percpu_datasec; @@ -106,4 +117,7 @@ void test_ksyms_btf(void) if (test__start_subtest("null_check")) test_null_check(); + + if (test__start_subtest("write_check")) + test_write_check(); } diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c new file mode 100644 index 0000000000000..2180c41cd890f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ + +#include "vmlinux.h" + +#include + +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + int *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* Kernel memory obtained from bpf_{per,this}_cpu_ptr + * is read-only, should _not_ pass verification. + */ + /* WRITE_ONCE */ + *(volatile int *)active = -1; + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; From eb63ec1b6d93c66d79dc7dc2cda486e83d8da053 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Thu, 26 Aug 2021 10:49:47 +0800 Subject: [PATCH 378/737] sock: remove one redundant SKB_FRAG_PAGE_ORDER macro [ Upstream commit 723783d077e39c256a1fafebbd97cbb14207c28f ] Both SKB_FRAG_PAGE_ORDER are defined to the same value in net/core/sock.c and drivers/vhost/net.c. Move the SKB_FRAG_PAGE_ORDER definition to net/core/sock.h, as both net/core/sock.c and drivers/vhost/net.c include it, and it seems a reasonable file to put the macro. Signed-off-by: Yunsheng Lin Acked-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/vhost/net.c | 2 -- net/core/sock.c | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index b9c8e40252142..fb24b4f1957f8 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -652,8 +652,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) !vhost_vq_avail_empty(vq->dev, vq); } -#define SKB_FRAG_PAGE_ORDER get_order(32768) - static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { diff --git a/net/core/sock.c b/net/core/sock.c index 98f4b4a80de42..5157e17081c71 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2461,7 +2461,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -#define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** From de080bf621260bc86132dfe66b0d296cf8065577 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Wed, 23 Mar 2022 16:28:28 +0000 Subject: [PATCH 379/737] Revert "lustre: update to AmazonFSxLustreClient v2.12.8-1" This reverts commit 097695e586af57cbdfe0e9e5453be6fb869b58dc. Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/Makefile.rules | 1 - drivers/staging/lustrefsx/config.h | 217 +- .../lustrefsx/libcfs/include/libcfs/bitmap.h | 3 - .../lustrefsx/libcfs/include/libcfs/curproc.h | 4 - .../lustrefsx/libcfs/include/libcfs/libcfs.h | 78 +- .../libcfs/include/libcfs/libcfs_cpu.h | 102 +- .../libcfs/include/libcfs/libcfs_crypto.h | 113 +- .../libcfs/include/libcfs/libcfs_debug.h | 171 +- .../libcfs/include/libcfs/libcfs_fail.h | 40 +- .../include/libcfs}/libcfs_ioctl.h | 24 +- .../libcfs/include/libcfs/libcfs_prim.h | 1 - .../libcfs/include/libcfs/libcfs_private.h | 11 +- .../libcfs/include/libcfs/libcfs_ptask.h | 121 + .../libcfs/include/libcfs/libcfs_string.h | 2 +- .../libcfs/include/libcfs/libcfs_time.h | 81 + .../libcfs/include/libcfs/linux/libcfs.h | 150 + .../libcfs/include/libcfs/linux/linux-cpu.h | 57 +- .../include/libcfs/linux/linux-crypto.h | 5 + .../libcfs/include/libcfs/linux/linux-fs.h | 21 +- .../libcfs/include/libcfs/linux/linux-hash.h | 247 -- .../libcfs/include/libcfs/linux/linux-mem.h | 8 + .../libcfs/include/libcfs/linux/linux-misc.h | 64 +- .../libcfs/include/libcfs/linux/linux-time.h | 164 +- .../libcfs/include/libcfs/linux/linux-wait.h | 568 --- .../libcfs/include/libcfs/util/hash.h | 103 - .../libcfs/include/libcfs/util/ioctl.h | 4 +- .../libcfs/include/libcfs/util/parser.h | 4 +- .../libcfs/include/libcfs/util/string.h | 11 +- .../staging/lustrefsx/libcfs/libcfs/Makefile | 5 +- .../staging/lustrefsx/libcfs/libcfs/debug.c | 385 +- .../lustrefsx/libcfs/libcfs/libcfs_cpu.c | 1180 +----- .../lustrefsx/libcfs/libcfs/libcfs_mem.c | 13 +- .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 478 +++ .../lustrefsx/libcfs/libcfs/libcfs_string.c | 50 +- .../libcfs/libcfs/linux/linux-crypto-adler.c | 2 - .../libcfs/libcfs/linux/linux-crypto-crc32.c | 3 - .../libcfs/linux/linux-crypto-crc32c-pclmul.c | 3 - .../libcfs/linux/linux-crypto-crc32pclmul.c | 4 - .../libcfs/libcfs/linux/linux-crypto.c | 70 +- .../libcfs/libcfs/linux/linux-curproc.c | 23 +- .../libcfs/libcfs/linux/linux-debug.c | 54 +- .../libcfs/libcfs/linux/linux-hash.c | 57 - .../libcfs/libcfs/linux/linux-module.c | 4 +- .../libcfs/libcfs/linux/linux-prim.c | 60 +- .../libcfs/libcfs/linux/linux-tracefile.c | 1 - .../libcfs/libcfs/linux/linux-wait.c | 115 - .../staging/lustrefsx/libcfs/libcfs/module.c | 552 +-- .../lustrefsx/libcfs/libcfs/tracefile.c | 89 +- 
.../lustrefsx/libcfs/libcfs/tracefile.h | 3 +- .../lustrefsx/libcfs/libcfs/util/l_ioctl.c | 4 +- .../lustrefsx/libcfs/libcfs/util/nidstrings.c | 6 +- .../lustrefsx/libcfs/libcfs/util/param.c | 10 +- .../lustrefsx/libcfs/libcfs/util/parser.c | 67 +- .../lustrefsx/libcfs/libcfs/util/string.c | 122 +- .../lustrefsx/libcfs/libcfs/watchdog.c | 7 +- .../lustrefsx/libcfs/libcfs/workitem.c | 9 +- .../staging/lustrefsx/lnet/include/cyaml.h | 2 +- .../staging/lustrefsx/lnet/include/lnet/api.h | 8 +- .../linux/lnet/lnet-dlc.h => lnet/lib-dlc.h} | 77 +- .../lustrefsx/lnet/include/lnet/lib-lnet.h | 275 +- .../lustrefsx/lnet/include/lnet/lib-types.h | 582 +-- .../linux/lnet/socklnd.h => lnet/lnet.h} | 24 +- .../include/{uapi/linux => }/lnet/lnetctl.h | 36 +- .../include/{uapi/linux => }/lnet/lnetst.h | 10 +- .../include/{uapi/linux => }/lnet/nidstr.h | 14 +- .../lustrefsx/lnet/include/lnet/socklnd.h | 14 +- .../linux/lnet/lnet-types.h => lnet/types.h} | 179 +- .../include/uapi/linux/lnet/libcfs_debug.h | 151 - .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 926 +++-- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 426 +-- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 1037 +++--- .../lnet/klnds/o2iblnd/o2iblnd_modparams.c | 84 +- .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 782 ++-- .../lustrefsx/lnet/klnds/socklnd/socklnd.h | 333 +- .../lustrefsx/lnet/klnds/socklnd/socklnd_cb.c | 1107 +++--- .../lnet/klnds/socklnd/socklnd_lib.c | 116 +- .../lnet/klnds/socklnd/socklnd_modparams.c | 4 +- .../lnet/klnds/socklnd/socklnd_proto.c | 87 +- .../staging/lustrefsx/lnet/lnet/acceptor.c | 10 +- drivers/staging/lustrefsx/lnet/lnet/api-ni.c | 1678 ++------- drivers/staging/lustrefsx/lnet/lnet/config.c | 66 +- drivers/staging/lustrefsx/lnet/lnet/lib-eq.c | 2 + drivers/staging/lustrefsx/lnet/lnet/lib-md.c | 2 +- .../staging/lustrefsx/lnet/lnet/lib-move.c | 3068 +++------------- drivers/staging/lustrefsx/lnet/lnet/lib-msg.c | 699 +--- drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c | 2 +- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 28 +- drivers/staging/lustrefsx/lnet/lnet/lo.c | 2 - drivers/staging/lustrefsx/lnet/lnet/module.c | 51 +- .../staging/lustrefsx/lnet/lnet/net_fault.c | 198 +- .../staging/lustrefsx/lnet/lnet/nidstrings.c | 4 +- drivers/staging/lustrefsx/lnet/lnet/peer.c | 3166 ++--------------- drivers/staging/lustrefsx/lnet/lnet/router.c | 436 +-- .../staging/lustrefsx/lnet/lnet/router_proc.c | 107 +- .../lustrefsx/lnet/selftest/brw_test.c | 107 +- .../staging/lustrefsx/lnet/selftest/conctl.c | 784 ++-- .../staging/lustrefsx/lnet/selftest/conrpc.c | 335 +- .../staging/lustrefsx/lnet/selftest/conrpc.h | 46 +- .../staging/lustrefsx/lnet/selftest/console.c | 444 ++- .../staging/lustrefsx/lnet/selftest/console.h | 69 +- .../lustrefsx/lnet/selftest/framework.c | 412 ++- .../staging/lustrefsx/lnet/selftest/module.c | 129 +- .../lustrefsx/lnet/selftest/ping_test.c | 52 +- drivers/staging/lustrefsx/lnet/selftest/rpc.c | 1218 ++++--- drivers/staging/lustrefsx/lnet/selftest/rpc.h | 182 +- .../lustrefsx/lnet/selftest/selftest.h | 246 +- .../staging/lustrefsx/lnet/selftest/timer.c | 33 +- .../staging/lustrefsx/lnet/selftest/timer.h | 10 +- .../lustrefsx/lustre/fid/fid_handler.c | 64 +- .../lustrefsx/lustre/fid/fid_internal.h | 11 +- .../staging/lustrefsx/lustre/fid/fid_lib.c | 1 + .../lustrefsx/lustre/fid/fid_request.c | 114 +- .../staging/lustrefsx/lustre/fid/fid_store.c | 2 +- .../staging/lustrefsx/lustre/fid/lproc_fid.c | 212 +- .../staging/lustrefsx/lustre/fld/fld_cache.c | 23 +- .../lustrefsx/lustre/fld/fld_handler.c | 
149 +- .../staging/lustrefsx/lustre/fld/fld_index.c | 133 +- .../lustrefsx/lustre/fld/fld_internal.h | 18 +- .../lustrefsx/lustre/fld/fld_request.c | 418 +-- .../staging/lustrefsx/lustre/fld/lproc_fld.c | 65 +- .../lustrefsx/lustre/include/cl_object.h | 214 +- .../lustrefsx/lustre/include/dt_object.h | 88 +- .../lustrefsx/lustre/include/llog_swab.h | 2 +- .../lustrefsx/lustre/include/lprocfs_status.h | 315 +- .../lustrefsx/lustre/include/lu_object.h | 194 +- .../lustrefsx/lustre/include/lu_target.h | 69 +- .../lustre/include/lustre/ll_fiemap.h | 38 +- .../include/lustre/lustre_barrier_user.h | 53 +- .../include/{ => lustre}/lustre_errno.h | 0 .../{uapi/linux => }/lustre/lustre_idl.h | 1017 +++--- .../lustre/include/lustre/lustre_lfsck_user.h | 214 +- .../lustre/include/lustre/lustre_user.h | 1639 ++++++++- .../lustre/include/lustre/lustreapi.h | 616 ++-- .../lustrefsx/lustre/include/lustre_acl.h | 2 +- .../lustrefsx/lustre/include/lustre_barrier.h | 2 +- .../lustrefsx/lustre/include/lustre_compat.h | 205 +- .../lustrefsx/lustre/include/lustre_disk.h | 11 +- .../lustrefsx/lustre/include/lustre_dlm.h | 311 +- .../lustre/include/lustre_dlm_flags.h | 73 +- .../lustrefsx/lustre/include/lustre_eacl.h | 2 +- .../lustrefsx/lustre/include/lustre_export.h | 148 +- .../lustrefsx/lustre/include/lustre_fid.h | 42 +- .../lustrefsx/lustre/include/lustre_fld.h | 19 +- .../lustrefsx/lustre/include/lustre_ha.h | 2 +- .../lustrefsx/lustre/include/lustre_idmap.h | 2 +- .../lustrefsx/lustre/include/lustre_import.h | 102 +- .../lustre/include/lustre_kernelcomm.h | 4 +- .../lustrefsx/lustre/include/lustre_lfsck.h | 10 +- .../lustrefsx/lustre/include/lustre_lib.h | 38 +- .../lustrefsx/lustre/include/lustre_linkea.h | 2 +- .../lustrefsx/lustre/include/lustre_lmv.h | 43 +- .../lustrefsx/lustre/include/lustre_log.h | 17 +- .../{uapi/linux/lustre => }/lustre_log_user.h | 3 +- .../lustrefsx/lustre/include/lustre_mdc.h | 100 +- .../lustrefsx/lustre/include/lustre_mds.h | 29 +- .../lustrefsx/lustre/include/lustre_net.h | 209 +- .../lustrefsx/lustre/include/lustre_nodemap.h | 13 +- .../lustrefsx/lustre/include/lustre_nrs_tbf.h | 64 +- .../lustrefsx/lustre/include/lustre_obdo.h | 2 +- .../lustre/include/lustre_patchless_compat.h | 20 + .../lustrefsx/lustre/include/lustre_quota.h | 29 +- .../lustre/include/lustre_req_layout.h | 20 +- .../lustrefsx/lustre/include/lustre_scrub.h | 375 -- .../lustrefsx/lustre/include/lustre_sec.h | 21 +- .../lustrefsx/lustre/include/lustre_swab.h | 8 +- .../lustrefsx/lustre/include/lustre_update.h | 5 +- .../{uapi/linux/lustre => }/lustre_ver.h | 6 + .../lustrefsx/lustre/include/md_object.h | 287 +- .../staging/lustrefsx/lustre/include/obd.h | 211 +- .../lustrefsx/lustre/include/obd_cksum.h | 153 +- .../lustrefsx/lustre/include/obd_class.h | 1269 +++---- .../lustrefsx/lustre/include/obd_support.h | 130 +- .../lustrefsx/lustre/include/obj_update.h | 2 +- .../lustrefsx/lustre/include/seq_range.h | 2 +- .../uapi/linux/lustre/lustre_barrier_user.h | 74 - .../include/uapi/linux/lustre/lustre_fiemap.h | 72 - .../uapi/linux/lustre/lustre_lfsck_user.h | 238 -- .../include/uapi/linux/lustre/lustre_user.h | 2366 ------------ .../uapi/linux/{lustre => }/lustre_cfg.h | 67 +- .../uapi/linux/{lustre => }/lustre_disk.h | 36 +- .../uapi/linux/{lustre => }/lustre_fid.h | 7 +- .../uapi/linux/{lustre => }/lustre_ioctl.h | 27 +- .../uapi/linux/{lustre => }/lustre_ostid.h | 14 +- .../uapi/linux/{lustre => }/lustre_param.h | 0 .../lustre_kernelcomm.h => uapi_kernelcomm.h} | 15 +- 
.../lustrefsx/lustre/include/upcall_cache.h | 10 +- .../lustrefsx/lustre/ldlm/interval_tree.c | 7 +- .../lustrefsx/lustre/ldlm/ldlm_extent.c | 182 +- .../lustrefsx/lustre/ldlm/ldlm_flock.c | 39 +- .../lustrefsx/lustre/ldlm/ldlm_inodebits.c | 441 +-- .../lustrefsx/lustre/ldlm/ldlm_internal.h | 91 +- .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 715 ++-- .../staging/lustrefsx/lustre/ldlm/ldlm_lock.c | 893 ++--- .../lustrefsx/lustre/ldlm/ldlm_lockd.c | 1227 +++---- .../lustrefsx/lustre/ldlm/ldlm_plain.c | 36 +- .../staging/lustrefsx/lustre/ldlm/ldlm_pool.c | 374 +- .../lustrefsx/lustre/ldlm/ldlm_request.c | 791 ++-- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 438 +-- .../staging/lustrefsx/lustre/llite/Makefile | 2 +- .../staging/lustrefsx/lustre/llite/dcache.c | 3 +- drivers/staging/lustrefsx/lustre/llite/dir.c | 826 ++--- drivers/staging/lustrefsx/lustre/llite/file.c | 2016 ++++------- .../staging/lustrefsx/lustre/llite/glimpse.c | 84 +- .../lustrefsx/lustre/llite/lcommon_cl.c | 19 +- .../lustrefsx/lustre/llite/lcommon_misc.c | 5 +- .../lustrefsx/lustre/llite/llite_internal.h | 192 +- .../lustrefsx/lustre/llite/llite_lib.c | 915 +++-- .../lustrefsx/lustre/llite/llite_mmap.c | 23 +- .../lustrefsx/lustre/llite/llite_nfs.c | 2 +- .../lustrefsx/lustre/llite/lproc_llite.c | 1307 ++++--- .../staging/lustrefsx/lustre/llite/namei.c | 603 +--- .../lustrefsx/lustre/llite/range_lock.c | 5 +- drivers/staging/lustrefsx/lustre/llite/rw.c | 100 +- drivers/staging/lustrefsx/lustre/llite/rw26.c | 162 +- .../lustrefsx/lustre/llite/statahead.c | 181 +- .../staging/lustrefsx/lustre/llite/super25.c | 41 +- .../staging/lustrefsx/lustre/llite/vvp_dev.c | 311 +- .../lustrefsx/lustre/llite/vvp_internal.h | 28 +- .../staging/lustrefsx/lustre/llite/vvp_io.c | 393 +- .../staging/lustrefsx/lustre/llite/vvp_lock.c | 86 + .../lustrefsx/lustre/llite/vvp_object.c | 18 +- .../staging/lustrefsx/lustre/llite/vvp_page.c | 39 +- .../staging/lustrefsx/lustre/llite/xattr.c | 86 +- .../staging/lustrefsx/lustre/llite/xattr26.c | 32 +- .../lustrefsx/lustre/llite/xattr_cache.c | 3 +- .../lustrefsx/lustre/llite/xattr_security.c | 33 - .../staging/lustrefsx/lustre/lmv/lmv_fld.c | 1 + .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 245 +- .../lustrefsx/lustre/lmv/lmv_internal.h | 89 +- .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 1622 ++++----- .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 121 +- drivers/staging/lustrefsx/lustre/lov/Makefile | 4 +- .../lustrefsx/lustre/lov/lov_cl_internal.h | 296 +- .../staging/lustrefsx/lustre/lov/lov_dev.c | 581 +-- drivers/staging/lustrefsx/lustre/lov/lov_ea.c | 84 +- .../lustrefsx/lustre/lov/lov_internal.h | 53 +- drivers/staging/lustrefsx/lustre/lov/lov_io.c | 990 ++---- .../staging/lustrefsx/lustre/lov/lov_lock.c | 128 +- .../staging/lustrefsx/lustre/lov/lov_merge.c | 2 +- .../staging/lustrefsx/lustre/lov/lov_obd.c | 615 ++-- .../staging/lustrefsx/lustre/lov/lov_object.c | 952 ++--- .../staging/lustrefsx/lustre/lov/lov_offset.c | 153 +- .../staging/lustrefsx/lustre/lov/lov_pack.c | 106 +- .../staging/lustrefsx/lustre/lov/lov_page.c | 44 +- .../staging/lustrefsx/lustre/lov/lov_pool.c | 32 +- .../lustrefsx/lustre/lov/lov_request.c | 308 +- .../staging/lustrefsx/lustre/lov/lovsub_dev.c | 108 +- .../lustrefsx/lustre/lov/lovsub_lock.c | 82 + .../lustrefsx/lustre/lov/lovsub_object.c | 95 +- .../lustrefsx/lustre/lov/lovsub_page.c | 70 + .../staging/lustrefsx/lustre/lov/lproc_lov.c | 272 +- drivers/staging/lustrefsx/lustre/mdc/Makefile | 2 +- .../staging/lustrefsx/lustre/mdc/lproc_mdc.c | 511 +-- 
.../lustrefsx/lustre/mdc/mdc_changelog.c | 342 +- .../staging/lustrefsx/lustre/mdc/mdc_dev.c | 1564 -------- .../lustrefsx/lustre/mdc/mdc_internal.h | 28 +- .../staging/lustrefsx/lustre/mdc/mdc_lib.c | 185 +- .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 353 +- .../staging/lustrefsx/lustre/mdc/mdc_reint.c | 183 +- .../lustrefsx/lustre/mdc/mdc_request.c | 770 ++-- .../staging/lustrefsx/lustre/mgc/lproc_mgc.c | 52 +- .../lustrefsx/lustre/mgc/mgc_internal.h | 7 +- .../lustrefsx/lustre/mgc/mgc_request.c | 175 +- .../lustrefsx/lustre/obdclass/Makefile | 14 +- .../staging/lustrefsx/lustre/obdclass/acl.c | 183 +- .../lustrefsx/lustre/obdclass/cl_internal.h | 2 +- .../staging/lustrefsx/lustre/obdclass/cl_io.c | 263 +- .../lustrefsx/lustre/obdclass/cl_lock.c | 11 +- .../lustrefsx/lustre/obdclass/cl_object.c | 105 +- .../lustrefsx/lustre/obdclass/cl_page.c | 114 +- .../lustrefsx/lustre/obdclass/class_obd.c | 301 +- .../lustrefsx/lustre/obdclass/dt_object.c | 671 ++-- .../lustrefsx/lustre/obdclass/genops.c | 931 +++-- .../staging/lustrefsx/lustre/obdclass/idmap.c | 26 +- .../lustrefsx/lustre/obdclass/integrity.c | 277 -- .../staging/lustrefsx/lustre/obdclass/jobid.c | 575 --- .../lustrefsx/lustre/obdclass/kernelcomm.c | 11 +- .../lustrefsx/lustre/obdclass/linkea.c | 12 +- .../lustre/obdclass/linux/linux-module.c | 582 +++ .../lustre/obdclass/linux/linux-obdo.c | 157 + .../lustre/obdclass/linux/linux-sysctl.c | 190 + .../staging/lustrefsx/lustre/obdclass/llog.c | 311 +- .../lustrefsx/lustre/obdclass/llog_cat.c | 689 ++-- .../lustrefsx/lustre/obdclass/llog_internal.h | 11 +- .../lustrefsx/lustre/obdclass/llog_ioctl.c | 296 +- .../lustrefsx/lustre/obdclass/llog_obd.c | 182 +- .../lustrefsx/lustre/obdclass/llog_osd.c | 112 +- .../lustrefsx/lustre/obdclass/llog_swab.c | 306 +- .../lustrefsx/lustre/obdclass/llog_test.c | 630 ++-- .../lustrefsx/lustre/obdclass/local_storage.c | 10 +- .../lustre/obdclass/lprocfs_jobstats.c | 63 +- .../lustre/obdclass/lprocfs_status.c | 1099 +++--- .../lustre/obdclass/lprocfs_status_server.c | 405 +-- .../lustrefsx/lustre/obdclass/lu_object.c | 461 +-- .../lustrefsx/lustre/obdclass/lu_ref.c | 172 +- .../lustre/obdclass/lustre_handles.c | 85 +- .../lustrefsx/lustre/obdclass/lustre_peer.c | 105 +- .../lustrefsx/lustre/obdclass/md_attrs.c | 27 +- .../lustrefsx/lustre/obdclass/obd_cksum.c | 149 - .../lustrefsx/lustre/obdclass/obd_config.c | 583 +-- .../lustrefsx/lustre/obdclass/obd_mount.c | 100 +- .../lustre/obdclass/obd_mount_server.c | 150 +- .../lustrefsx/lustre/obdclass/obd_sysfs.c | 535 --- .../staging/lustrefsx/lustre/obdclass/obdo.c | 172 +- .../lustrefsx/lustre/obdclass/obdo_server.c | 156 - .../staging/lustrefsx/lustre/obdclass/scrub.c | 1216 ------- .../lustrefsx/lustre/obdclass/statfs_pack.c | 36 +- .../lustrefsx/lustre/obdclass/upcall_cache.c | 21 +- .../staging/lustrefsx/lustre/obdclass/uuid.c | 78 + .../staging/lustrefsx/lustre/obdecho/echo.c | 614 +--- .../lustrefsx/lustre/obdecho/echo_client.c | 87 +- .../lustrefsx/lustre/obdecho/echo_internal.h | 1 - .../staging/lustrefsx/lustre/osc/lproc_osc.c | 676 ++-- .../staging/lustrefsx/lustre/osc/osc_cache.c | 366 +- .../lustre_osc.h => osc/osc_cl_internal.h} | 598 +--- .../staging/lustrefsx/lustre/osc/osc_dev.c | 58 +- .../lustrefsx/lustre/osc/osc_internal.h | 191 +- drivers/staging/lustrefsx/lustre/osc/osc_io.c | 290 +- .../staging/lustrefsx/lustre/osc/osc_lock.c | 401 +-- .../staging/lustrefsx/lustre/osc/osc_object.c | 167 +- .../staging/lustrefsx/lustre/osc/osc_page.c | 103 +- 
.../staging/lustrefsx/lustre/osc/osc_quota.c | 21 +- .../lustrefsx/lustre/osc/osc_request.c | 1248 +++---- .../staging/lustrefsx/lustre/ptlrpc/client.c | 308 +- .../staging/lustrefsx/lustre/ptlrpc/errno.c | 33 +- .../staging/lustrefsx/lustre/ptlrpc/events.c | 46 +- .../lustrefsx/lustre/ptlrpc/gss/gss_api.h | 14 +- .../lustrefsx/lustre/ptlrpc/gss/gss_bulk.c | 1 + .../lustre/ptlrpc/gss/gss_cli_upcall.c | 148 +- .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.c | 177 +- .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.h | 88 +- .../lustre/ptlrpc/gss/gss_generic_token.c | 1 + .../lustre/ptlrpc/gss/gss_internal.h | 47 +- .../lustrefsx/lustre/ptlrpc/gss/gss_keyring.c | 249 +- .../lustrefsx/lustre/ptlrpc/gss/gss_krb5.h | 2 +- .../lustre/ptlrpc/gss/gss_krb5_mech.c | 1268 ++++--- .../lustre/ptlrpc/gss/gss_mech_switch.c | 62 +- .../lustre/ptlrpc/gss/gss_null_mech.c | 4 +- .../lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c | 2 +- .../lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c | 309 +- .../lustre/ptlrpc/gss/gss_svc_upcall.c | 180 +- .../lustrefsx/lustre/ptlrpc/gss/lproc_gss.c | 41 +- .../lustrefsx/lustre/ptlrpc/gss/sec_gss.c | 45 +- .../staging/lustrefsx/lustre/ptlrpc/import.c | 735 ++-- .../staging/lustrefsx/lustre/ptlrpc/layout.c | 447 +-- .../lustrefsx/lustre/ptlrpc/llog_client.c | 36 + .../lustrefsx/lustre/ptlrpc/llog_server.c | 51 + .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 448 +-- .../staging/lustrefsx/lustre/ptlrpc/niobuf.c | 100 +- .../lustre/ptlrpc/nodemap_internal.h | 2 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_crr.c | 50 +- .../lustrefsx/lustre/ptlrpc/nrs_delay.c | 44 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_orr.c | 85 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c | 790 +--- .../lustrefsx/lustre/ptlrpc/pack_generic.c | 700 ++-- .../staging/lustrefsx/lustre/ptlrpc/pers.c | 29 +- .../staging/lustrefsx/lustre/ptlrpc/pinger.c | 447 ++- .../lustrefsx/lustre/ptlrpc/ptlrpc_internal.h | 18 +- .../staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c | 14 +- .../staging/lustrefsx/lustre/ptlrpc/recover.c | 73 +- drivers/staging/lustrefsx/lustre/ptlrpc/sec.c | 173 +- .../lustrefsx/lustre/ptlrpc/sec_bulk.c | 79 +- .../lustrefsx/lustre/ptlrpc/sec_config.c | 4 +- .../staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 2 +- .../staging/lustrefsx/lustre/ptlrpc/sec_gc.c | 107 +- .../lustrefsx/lustre/ptlrpc/sec_lproc.c | 115 +- .../lustrefsx/lustre/ptlrpc/sec_null.c | 17 +- .../lustrefsx/lustre/ptlrpc/sec_plain.c | 27 +- .../staging/lustrefsx/lustre/ptlrpc/service.c | 296 +- .../staging/lustrefsx/lustre/ptlrpc/wirehdr.c | 6 +- .../lustrefsx/lustre/ptlrpc/wiretest.c | 794 ++--- .../staging/lustrefsx/lustre/target/barrier.c | 24 +- .../lustrefsx/lustre/target/out_handler.c | 109 +- .../staging/lustrefsx/lustre/target/out_lib.c | 27 +- .../staging/lustrefsx/lustre/target/tgt_fmd.c | 363 -- .../lustrefsx/lustre/target/tgt_grant.c | 257 +- .../lustrefsx/lustre/target/tgt_handler.c | 775 +--- .../lustrefsx/lustre/target/tgt_internal.h | 18 +- .../lustrefsx/lustre/target/tgt_lastrcvd.c | 31 +- .../lustrefsx/lustre/target/tgt_main.c | 350 +- .../lustrefsx/lustre/target/update_records.c | 2 +- .../lustrefsx/lustre/target/update_recovery.c | 2 +- .../lustrefsx/lustre/target/update_trans.c | 23 +- drivers/staging/lustrefsx/undef.h | 197 +- 384 files changed, 32538 insertions(+), 57933 deletions(-) rename drivers/staging/lustrefsx/{lnet/include/uapi/linux/lnet => libcfs/include/libcfs}/libcfs_ioctl.h (88%) create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h create mode 100644 
drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c delete mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c delete mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c rename drivers/staging/lustrefsx/lnet/include/{uapi/linux/lnet/lnet-dlc.h => lnet/lib-dlc.h} (76%) rename drivers/staging/lustrefsx/lnet/include/{uapi/linux/lnet/socklnd.h => lnet/lnet.h} (74%) rename drivers/staging/lustrefsx/lnet/include/{uapi/linux => }/lnet/lnetctl.h (76%) rename drivers/staging/lustrefsx/lnet/include/{uapi/linux => }/lnet/lnetst.h (99%) rename drivers/staging/lustrefsx/lnet/include/{uapi/linux => }/lnet/nidstr.h (93%) rename drivers/staging/lustrefsx/lnet/include/{uapi/linux/lnet/lnet-types.h => lnet/types.h} (85%) delete mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h rename drivers/staging/lustrefsx/lustre/include/{ => lustre}/lustre_errno.h (100%) rename drivers/staging/lustrefsx/lustre/include/{uapi/linux => }/lustre/lustre_idl.h (81%) rename drivers/staging/lustrefsx/lustre/include/{uapi/linux/lustre => }/lustre_log_user.h (97%) delete mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_scrub.h rename drivers/staging/lustrefsx/lustre/include/{uapi/linux/lustre => }/lustre_ver.h (83%) delete mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h delete mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h delete mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h delete mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{lustre => }/lustre_cfg.h (77%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{lustre => }/lustre_disk.h (85%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{lustre => }/lustre_fid.h (97%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{lustre => }/lustre_ioctl.h (93%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{lustre => }/lustre_ostid.h (95%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{lustre => }/lustre_param.h (100%) rename drivers/staging/lustrefsx/lustre/include/{uapi/linux/lustre/lustre_kernelcomm.h => uapi_kernelcomm.h} (88%) mode change 100644 => 100755 drivers/staging/lustrefsx/lustre/llite/lproc_llite.c create mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c create mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_page.c delete mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/integrity.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/jobid.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c delete mode 100644 
drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/scrub.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/uuid.c rename drivers/staging/lustrefsx/lustre/{include/lustre_osc.h => osc/osc_cl_internal.h} (52%) delete mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_fmd.c diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules index ce56ffa5576a0..a0d56e80f2ce7 100644 --- a/drivers/staging/lustrefsx/Makefile.rules +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -3,5 +3,4 @@ ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include -ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include/uapi ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 69580ddb7b9f3..fce8b057480b6 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -25,11 +25,17 @@ /* extened attributes for ldiskfs */ /* #undef CONFIG_LDISKFS_FS_XATTR */ +/* Max LNET payload */ +#define CONFIG_LNET_MAX_PAYLOAD LNET_MTU + /* enable invariant checking */ /* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ +/* IOCTL Buffer Size */ +#define CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER 8192 + /* kernel has cpu affinity support */ -#define CPU_AFFINITY 1 +/* #undef CPU_AFFINITY */ /* both i_dentry/d_alias uses list */ /* #undef DATA_FOR_LLITE_IS_LIST */ @@ -52,15 +58,9 @@ /* do data checksums */ #define ENABLE_CHECKSUM 1 -/* enable flock by default */ -#define ENABLE_FLOCK 1 - /* Use the Pinger */ #define ENABLE_PINGER 1 -/* aes-sha2 is supported by krb5 */ -/* #undef HAVE_AES_SHA2_SUPPORT */ - /* Define to 1 if you have the header file. 
*/ #define HAVE_ASM_TYPES_H 1 @@ -79,12 +79,6 @@ /* 'bio_integrity_enabled' is available */ /* #undef HAVE_BIO_INTEGRITY_ENABLED */ -/* kernel has bio_integrity_prep_fn */ -/* #undef HAVE_BIO_INTEGRITY_PREP_FN */ - -/* bio_integrity_payload.bip_iter exist */ -#define HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD 1 - /* 'bi_bdev' is available */ /* #undef HAVE_BI_BDEV */ @@ -109,18 +103,9 @@ /* blk_queue_max_segments is defined */ #define HAVE_BLK_QUEUE_MAX_SEGMENTS 1 -/* kernel hash_64() is broken */ -/* #undef HAVE_BROKEN_HASH_64 */ - /* kernel has struct bvec_iter */ #define HAVE_BVEC_ITER 1 -/* struct cache_detail has writers */ -#define HAVE_CACHE_DETAIL_WRITERS 1 - -/* if cache_detail->hash_lock is a spinlock */ -#define HAVE_CACHE_HASH_SPINLOCK 1 - /* cache_head has hlist cache_list */ #define HAVE_CACHE_HEAD_HLIST 1 @@ -133,24 +118,24 @@ /* kernel has clean_bdev_aliases */ /* #undef HAVE_CLEAN_BDEV_ALIASES */ -/* 'clear_and_wake_up_bit' is available */ -#define HAVE_CLEAR_AND_WAKE_UP_BIT 1 - /* have clear_inode */ #define HAVE_CLEAR_INODE 1 /* compat rdma found */ /* #undef HAVE_COMPAT_RDMA */ +/* cpumap_print_to_pagebuf is available */ +#define HAVE_CPUMASK_PRINT_TO_PAGEBUF 1 + /* kernel compiled with CRC32 functions */ #define HAVE_CRC32 1 +/* struct cred has member tgcred */ +/* #undef HAVE_CRED_TGCRED */ + /* crypto hash helper functions are available */ #define HAVE_CRYPTO_HASH_HELPERS 1 -/* 'CRYPTO_MAX_ALG_NAME' is 128 */ -#define HAVE_CRYPTO_MAX_ALG_NAME_128 1 - /* current_time() has replaced CURRENT_TIME */ #define HAVE_CURRENT_TIME 1 @@ -169,9 +154,6 @@ /* dentry_open uses struct path as first argument */ #define HAVE_DENTRY_OPEN_USE_PATH 1 -/* DES3 enctype is supported by krb5 */ -/* #undef HAVE_DES3_SUPPORT */ - /* direct_IO need 2 arguments */ #define HAVE_DIRECTIO_2ARGS 1 @@ -253,9 +235,6 @@ /* d_delete first parameter declared is not const */ #define HAVE_D_DELETE_CONST const -/* d_hash_and_lookup is exported by the kernel */ -#define HAVE_D_HASH_AND_LOOKUP 1 - /* have d_make_root */ #define HAVE_D_MAKE_ROOT 1 @@ -343,15 +322,15 @@ /* Define to 1 if you have the `gethostbyname' function. 
*/ #define HAVE_GETHOSTBYNAME 1 -/* get_request_key_auth() is available */ -#define HAVE_GET_REQUEST_KEY_AUTH 1 - /* get_user_pages takes 6 arguments */ /* #undef HAVE_GET_USER_PAGES_6ARG */ /* get_user_pages takes gup_flags in arguments */ #define HAVE_GET_USER_PAGES_GUP_FLAGS 1 +/* get_user_pages takes gup_flags in arguments with 7 args */ +/* #undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS */ + /* struct group_info has member gid */ #define HAVE_GROUP_INFO_GID 1 @@ -364,9 +343,6 @@ /* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ /* #undef HAVE_GSS_KRB5_CCACHE_NAME */ -/* '__rhashtable_insert_fast()' returns int */ -/* #undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT */ - /* Define this if you have Heimdal Kerberos libraries */ /* #undef HAVE_HEIMDAL */ @@ -415,9 +391,6 @@ /* if ib_sg_dma_address wrapper exists */ /* #undef HAVE_IB_SG_DMA_ADDRESS */ -/* INIT_LIST_HEAD_RCU exists */ -#define HAVE_INIT_LIST_HEAD_RCU 1 - /* inode_operations .getattr member function can gather advance stats */ #define HAVE_INODEOPS_ENHANCED_GETATTR 1 @@ -442,15 +415,6 @@ /* inode_operations->permission has two args */ #define HAVE_INODE_PERMISION_2ARGS 1 -/* inode times are using timespec64 */ -#define HAVE_INODE_TIMESPEC64 1 - -/* blk_integrity.interval exist */ -/* #undef HAVE_INTERVAL_BLK_INTEGRITY */ - -/* blk_integrity.interval_exp exist */ -#define HAVE_INTERVAL_EXP_BLK_INTEGRITY 1 - /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 @@ -460,9 +424,6 @@ /* have in_compat_syscall */ #define HAVE_IN_COMPAT_SYSCALL 1 -/* 'in_dev_for_each_ifa_rtnl' is defined */ -#define HAVE_IN_DEV_FOR_EACH_IFA_RTNL 1 - /* inode_operations->rename need flags as argument */ #define HAVE_IOPS_RENAME_WITH_FLAGS 1 @@ -502,27 +463,18 @@ /* is_sxid is defined */ #define HAVE_IS_SXID 1 -/* 'iterate_shared' is available */ -#define HAVE_ITERATE_SHARED 1 - /* struct address_space has i_pages */ #define HAVE_I_PAGES 1 /* i_uid_read is present */ #define HAVE_I_UID_READ 1 -/* kallsyms_lookup_name is exported by kernel */ -/* #undef HAVE_KALLSYMS_LOOKUP_NAME */ +/* jiffies_to_timespec64() is available */ +#define HAVE_JIFFIES_TO_TIMESPEC64 1 /* kernel_locked is defined */ /* #undef HAVE_KERNEL_LOCKED */ -/* 'kernel_param_[un]lock' is available */ -#define HAVE_KERNEL_PARAM_LOCK 1 - -/* 'struct kernel_param_ops' is available */ -#define HAVE_KERNEL_PARAM_OPS 1 - /* kernel_setsockopt still in use */ /* #undef HAVE_KERNEL_SETSOCKOPT */ @@ -541,9 +493,6 @@ /* key_type->instantiate has two args */ #define HAVE_KEY_TYPE_INSTANTIATE_2ARGS 1 -/* key.usage is of type refcount_t */ -#define HAVE_KEY_USAGE_REFCOUNT 1 - /* ki_left exist */ /* #undef HAVE_KIOCB_KI_LEFT */ @@ -572,15 +521,12 @@ available */ /* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ -/* kset_find_obj is exported by the kernel */ -#define HAVE_KSET_FIND_OBJ 1 - -/* kernel has kstrtobool_from_user */ -#define HAVE_KSTRTOBOOL_FROM_USER 1 - /* kernel has kstrtoul */ #define HAVE_KSTRTOUL 1 +/* kernel has ksys_close */ +#define HAVE_KSYS_CLOSE 1 + /* kthread_worker found */ /* #undef HAVE_KTHREAD_WORK */ @@ -608,9 +554,6 @@ /* 'ktime_get_ts64' is available */ #define HAVE_KTIME_GET_TS64 1 -/* 'ktime_ms_delta' is available */ -#define HAVE_KTIME_MS_DELTA 1 - /* 'ktime_to_timespec64' is available */ #define HAVE_KTIME_TO_TIMESPEC64 1 @@ -638,12 +581,21 @@ /* readline library is available */ /* #undef HAVE_LIBREADLINE */ -/* linux/rhashtable.h is present */ -#define HAVE_LINUX_RHASHTABLE_H 1 +/* Define to 1 if you have the header 
file. */ +#define HAVE_LINUX_RANDOM_H 1 /* if linux/selinux.h exists */ /* #undef HAVE_LINUX_SELINUX_IS_ENABLED */ +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_UNISTD_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_VERSION_H 1 + /* lock_manager_operations has lm_compare_owner */ /* #undef HAVE_LM_COMPARE_OWNER */ @@ -653,9 +605,6 @@ /* kernel has locks_lock_file_wait */ #define HAVE_LOCKS_LOCK_FILE_WAIT 1 -/* lookup_user_key() is available */ -#define HAVE_LOOKUP_USER_KEY 1 - /* kernel has LOOP_CTL_GET_FREE */ #define HAVE_LOOP_CTL_GET_FREE 1 @@ -684,9 +633,6 @@ /* kernel module loading is possible */ #define HAVE_MODULE_LOADING_SUPPORT 1 -/* locking module param is supported */ -/* #undef HAVE_MODULE_PARAM_LOCKING */ - /* Define to 1 if you have the `name_to_handle_at' function. */ #define HAVE_NAME_TO_HANDLE_AT 1 @@ -696,24 +642,15 @@ /* cancel_dirty_page with one arguement is available */ #define HAVE_NEW_CANCEL_DIRTY_PAGE 1 -/* DEFINE_TIMER uses only 2 arguements */ -#define HAVE_NEW_DEFINE_TIMER 1 - /* 'kernel_write' aligns with read/write helpers */ #define HAVE_NEW_KERNEL_WRITE 1 /* NR_UNSTABLE_NFS is still in use. */ /* #undef HAVE_NR_UNSTABLE_NFS */ -/* ns_to_timespec64() is available */ -#define HAVE_NS_TO_TIMESPEC64 1 - /* with oldsize */ /* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ -/* openssl-devel is present */ -/* #undef HAVE_OPENSSL_GETSEPOL */ - /* OpenSSL HMAC functions needed for SSK */ /* #undef HAVE_OPENSSL_SSK */ @@ -738,9 +675,6 @@ /* posix_acl_valid takes struct user_namespace */ #define HAVE_POSIX_ACL_VALID_USER_NS 1 -/* 'prepare_to_wait_event' is available */ -#define HAVE_PREPARE_TO_WAIT_EVENT 1 - /* struct proc_ops exists */ #define HAVE_PROC_OPS 1 @@ -753,18 +687,12 @@ /* inode->i_nlink is protected from direct modification */ #define HAVE_PROTECT_I_NLINK 1 -/* 'PTR_ERR_OR_ZERO' exist */ -#define HAVE_PTR_ERR_OR_ZERO 1 - /* have quota64 */ #define HAVE_QUOTA64 1 /* radix_tree_exceptional_entry exist */ /* #undef HAVE_RADIX_EXCEPTION_ENTRY */ -/* rdma_connect_locked is defined */ -#define HAVE_RDMA_CONNECT_LOCKED 1 - /* rdma_create_id wants 4 args */ /* #undef HAVE_RDMA_CREATE_ID_4ARG */ @@ -774,24 +702,15 @@ /* rdma_reject has 4 arguments */ #define HAVE_RDMA_REJECT_4ARGS 1 +/* reinit_completion is exist */ +#define HAVE_REINIT_COMPLETION 1 + /* kernel export remove_from_page_cache */ /* #undef HAVE_REMOVE_FROM_PAGE_CACHE */ /* remove_proc_subtree is defined */ #define HAVE_REMOVE_PROC_SUBTREE 1 -/* rhashtable_lookup() is available */ -#define HAVE_RHASHTABLE_LOOKUP 1 - -/* rhashtable_lookup_get_insert_fast() is available */ -#define HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST 1 - -/* struct rhltable exist */ -#define HAVE_RHLTABLE 1 - -/* save_stack_trace_tsk is exported */ -/* #undef HAVE_SAVE_STACK_TRACE_TSK */ - /* Have sa_spill_alloc in ZFS */ /* #undef HAVE_SA_SPILL_ALLOC */ @@ -816,9 +735,6 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ /* #undef HAVE_SECURITY_IINITSEC_QSTR */ -/* security_inode_listsecurity() is available/exported */ -#define HAVE_SECURITY_INODE_LISTSECURITY 1 - /* security_release_secctx has 1 arg. 
*/ /* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ @@ -862,27 +778,36 @@ /* Have spa_maxblocksize in ZFS */ /* #undef HAVE_SPA_MAXBLOCKSIZE */ +/* spinlock_t is defined */ +/* #undef HAVE_SPINLOCK_T */ + /* struct stacktrace_ops exists */ /* #undef HAVE_STACKTRACE_OPS */ /* stacktrace_ops.warning is exist */ /* #undef HAVE_STACKTRACE_WARNING */ +/* stack_trace_print() exists */ +#define HAVE_STACK_TRACE_PRINT 1 + /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 -/* stringhash.h is present */ -#define HAVE_STRINGHASH 1 - /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 +/* Define to 1 if you have the `strlcat' function. */ +/* #undef HAVE_STRLCAT */ + +/* Define to 1 if you have the `strlcpy' function. */ +/* #undef HAVE_STRLCPY */ + /* Define to 1 if you have the `strnlen' function. */ #define HAVE_STRNLEN 1 @@ -910,6 +835,9 @@ /* ctl_table has ctl_name field */ /* #undef HAVE_SYSCTL_CTLNAME */ +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + /* Define to 1 if you have . */ #define HAVE_SYS_QUOTA_H 1 @@ -940,6 +868,9 @@ /* 'timespec64_to_ktime' is available */ #define HAVE_TIMESPEC64_TO_KTIME 1 +/* have_time_t */ +/* #undef HAVE_TIME_T */ + /* topology_sibling_cpumask is available */ #define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 @@ -988,18 +919,9 @@ /* 'struct vm_operations' remove struct vm_area_struct argument */ #define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 -/* wait_bit.h is present */ -#define HAVE_WAIT_BIT_HEADER_H 1 - /* 'wait_queue_entry_t' is available */ #define HAVE_WAIT_QUEUE_ENTRY 1 -/* linux wait_queue_head_t list_head is name head */ -#define HAVE_WAIT_QUEUE_ENTRY_LIST 1 - -/* 'wait_var_event' is available */ -#define HAVE_WAIT_VAR_EVENT 1 - /* flags field exist */ #define HAVE_XATTR_HANDLER_FLAGS 1 @@ -1024,18 +946,9 @@ /* Have zap_remove_by_dnode() in ZFS */ /* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ -/* Have inode_timespec_t */ -/* #undef HAVE_ZFS_INODE_TIMESPEC */ - -/* Have multihost protection in ZFS */ -/* #undef HAVE_ZFS_MULTIHOST */ - /* Enable zfs osd */ /* #undef HAVE_ZFS_OSD */ -/* Have zfs_refcount_add */ -/* #undef HAVE_ZFS_REFCOUNT_ADD */ - /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ @@ -1062,13 +975,13 @@ #define LUSTRE_MAJOR 2 /* Second number in the Lustre version */ -#define LUSTRE_MINOR 12 +#define LUSTRE_MINOR 10 /* Third number in the Lustre version */ #define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.12.8-1" +#define LUSTRE_VERSION_STRING "2.10.8-10" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -1085,9 +998,6 @@ /* need pclmulqdq based crc32 */ /* #undef NEED_CRC32_ACCEL */ -/* 'ktime_get_ns' is not available */ -/* #undef NEED_KTIME_GET_NS */ - /* 'ktime_get_real_ns' is not available */ /* #undef NEED_KTIME_GET_REAL_NS */ @@ -1104,7 +1014,7 @@ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Lustre 2.12.8-1" +#define PACKAGE_STRING "Lustre 2.10.8-10" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -1113,11 +1023,14 @@ #define PACKAGE_URL "" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.12.8-1" +#define PACKAGE_VERSION "2.10.8-10" /* name of parallel fsck program */ #define PFSCK "fsck" +/* proc handler methods use __user */ +/* #undef PROC_HANDLER_USE_USER_ATTR */ + /* enable randomly alloc failure */ #define RANDOM_FAIL_ALLOC 1 @@ -1154,16 +1067,16 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.12.8-1" +#define VERSION "2.10.8-10" /* zfs fix version */ -#define ZFS_FIX 0 +/* #undef ZFS_FIX */ /* zfs major version */ -#define ZFS_MAJOR +/* #undef ZFS_MAJOR */ /* zfs minor version */ -#define ZFS_MINOR +/* #undef ZFS_MINOR */ /* zfs patch version */ -#define ZFS_PATCH +/* #undef ZFS_PATCH */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h index 1763da296244d..28472601ed4df 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -32,9 +32,6 @@ #ifndef _LIBCFS_BITMAP_H_ #define _LIBCFS_BITMAP_H_ -#include -#include - struct cfs_bitmap { unsigned int size; unsigned long data[0]; diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h index 0f00c7219e75d..e9e0cc2109034 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h @@ -39,10 +39,6 @@ #ifndef __LIBCFS_CURPROC_H__ #define __LIBCFS_CURPROC_H__ -/* check if task is running in compat mode.*/ -#define current_pid() (current->pid) -#define current_comm() (current->comm) - typedef __u32 cfs_cap_t; #define CFS_CAP_CHOWN 0 diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h index 9ae7b8405a94b..f01170c6e1d97 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,28 +33,11 @@ #ifndef __LIBCFS_LIBCFS_H__ #define __LIBCFS_LIBCFS_H__ -#include -#include +#ifdef __KERNEL__ +# include +# include "curproc.h" -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "curproc.h" - -#define LIBCFS_VERSION "0.7.1" +#define LIBCFS_VERSION "0.5.0" #define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2))) #define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) @@ -99,19 +82,15 @@ void lc_watchdog_delete(struct lc_watchdog *lcw); #endif #endif -typedef s32 timeout_t; - /* need both kernel and user-land acceptor */ #define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 #define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 -extern struct blocking_notifier_head libcfs_ioctl_list; -static inline int notifier_from_ioctl_errno(int err) -{ - if (err == -EINVAL) - return NOTIFY_OK; - return notifier_from_errno(err) | NOTIFY_STOP_MASK; -} +/* + * Drop into debugger, if possible. Implementation is provided by platform. 
+ */ + +void cfs_enter_debugger(void); /* * Defined by platform @@ -132,6 +111,21 @@ unsigned int cfs_rand(void); /* seed the generator */ void cfs_srand(unsigned int, unsigned int); void cfs_get_random_bytes(void *buf, int size); +#endif /* __KERNEL__ */ + +#include +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); int libcfs_ioctl(unsigned long cmd, void __user *uparam); @@ -145,30 +139,12 @@ static inline void *__container_of(const void *ptr, unsigned long shift) return (char *)ptr - shift; } -#define container_of0(ptr, type, member) \ +#define container_of0(ptr, type, member) \ ((type *)__container_of((ptr), offsetof(type, member))) -struct lnet_debugfs_symlink_def { - const char *name; - const char *target; -}; - -void lnet_insert_debugfs(struct ctl_table *table); -void lnet_remove_debugfs(struct ctl_table *table); - -/* helper for sysctl handlers */ -int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, loff_t pos, - void __user *buffer, int len)); -int debugfs_doint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +#endif /* __KERNEL__ */ /* atomic-context safe vfree */ -#ifdef HAVE_LIBCFS_VFREE_ATOMIC void libcfs_vfree_atomic(const void *addr); -#else -#define libcfs_vfree_atomic(ptr) vfree(ptr) -#endif #endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h index 4620dcc08cf80..9fd28ce749cfe 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h @@ -13,12 +13,17 @@ * General Public License version 2 for more details (a copy is included * in the LICENSE file that accompanied this code). * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * * GPL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,16 +47,16 @@ * * Example: if there are 8 cores on the system, while creating a CPT * with cpu_npartitions=4: - * core[0, 1] = partition[0], core[2, 3] = partition[1] - * core[4, 5] = partition[2], core[6, 7] = partition[3] + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] * * cpu_npartitions=1: - * core[0, 1, ... 7] = partition[0] + * core[0, 1, ... 7] = partition[0] * * . 
User can also specify CPU partitions by string pattern * * Examples: cpu_partitions="0[0,1], 1[2,3]" - * cpu_partitions="N 0[0-3], 1[4-8]" + * cpu_partitions="N 0[0-3], 1[4-8]" * * The first character "N" means following numbers are numa ID * @@ -71,56 +76,21 @@ #ifndef __LIBCFS_CPU_H__ #define __LIBCFS_CPU_H__ -#include -#include -#include -#include -#include -#include - -#include - -#ifdef CONFIG_SMP - -/** virtual processing unit */ -struct cfs_cpu_partition { - /* CPUs mask for this partition */ - cpumask_t *cpt_cpumask; - /* nodes mask for this partition */ - nodemask_t *cpt_nodemask; - /* NUMA distance between CPTs */ - unsigned int *cpt_distance; - /* spread rotor for NUMA allocator */ - unsigned int cpt_spread_rotor; - /* NUMA node if cpt_nodemask is empty */ - int cpt_node; -}; -#endif /* CONFIG_SMP */ +#ifndef HAVE_LIBCFS_CPT -/** descriptor for CPU partitions */ struct cfs_cpt_table { -#ifdef CONFIG_SMP - /* spread rotor for NUMA allocator */ - unsigned int ctb_spread_rotor; - /* maximum NUMA distance between all nodes in table */ - unsigned int ctb_distance; - /* partitions tables */ - struct cfs_cpu_partition *ctb_parts; - /* shadow HW CPU to CPU partition ID */ - int *ctb_cpu2cpt; - /* shadow HW node to CPU partition ID */ - int *ctb_node2cpt; /* # of CPU partitions */ - int ctb_nparts; - /* all nodes in this partition table */ - nodemask_t *ctb_nodemask; -#else - nodemask_t ctb_nodemask; -#endif /* CONFIG_SMP */ - /* all cpus in this partition table */ - cpumask_t *ctb_cpumask; + int ctb_nparts; + /* cpu mask */ + cpumask_t ctb_mask; + /* node mask */ + nodemask_t ctb_nodemask; + /* version */ + __u64 ctb_version; }; +#endif /* !HAVE_LIBCFS_CPT */ + /* any CPU partition */ #define CFS_CPT_ANY (-1) @@ -147,7 +117,7 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len); */ int cfs_cpt_number(struct cfs_cpt_table *cptab); /** - * return number of HW cores or hyper-threadings in a CPU partition \a cpt + * return number of HW cores or hypter-threadings in a CPU partition \a cpt */ int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); /** @@ -177,13 +147,13 @@ int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node); /** * NUMA distance between \a cpt1 and \a cpt2 in \a cptab */ -unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); +unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); /** * bind current thread on a CPU-partition \a cpt of \a cptab */ int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); /** - * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, + * add \a cpu to CPU partion @cpt of \a cptab, return 1 for success, * otherwise 0 is returned */ int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); @@ -195,6 +165,7 @@ void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); * add all cpus in \a mask to CPU partition \a cpt * return 1 if successfully set all CPUs, otherwise return 0 */ + int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask); /** @@ -232,15 +203,15 @@ int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); /* * allocate per-cpu-partition data, returned value is an array of pointers, * variable can be indexed by CPU ID. 
- * cptab != NULL: size of array is number of CPU partitions - * cptab == NULL: size of array is number of HW cores + * cptab != NULL: size of array is number of CPU partitions + * cptab == NULL: size of array is number of HW cores */ void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); /* - * destroy per-cpu-partition variable + * destory per-cpu-partition variable */ -void cfs_percpt_free(void *vars); -int cfs_percpt_number(void *vars); +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); #define cfs_percpt_for_each(var, i, vars) \ for (i = 0; i < cfs_percpt_number(vars) && \ @@ -289,17 +260,16 @@ void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); /* lock private lock \a index of \a pcl */ void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); - /* unlock private lock \a index of \a pcl */ void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); -#define CFS_PERCPT_LOCK_KEYS 256 +#define CFS_PERCPT_LOCK_KEYS 256 /* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ #define cfs_percpt_lock_alloc(cptab) \ ({ \ - static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ - struct cfs_percpt_lock *___lk; \ + static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ + struct cfs_percpt_lock *___lk; \ \ if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ ___lk = cfs_percpt_lock_create(cptab, NULL); \ @@ -368,6 +338,14 @@ cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, #define cfs_cpt_for_each(i, cptab) \ for (i = 0; i < cfs_cpt_number(cptab); i++) +#ifndef __read_mostly +# define __read_mostly +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned +#endif + int cfs_cpu_init(void); void cfs_cpu_fini(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h index 8271306ce6019..ea9234abc7f76 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h @@ -38,12 +38,6 @@ struct cfs_crypto_hash_type { unsigned int cht_size; /**< hash digest size */ }; -struct cfs_crypto_crypt_type { - char *cct_name; /**< crypto algorithm name, equal to - * format name for crypto api */ - unsigned int cct_size; /**< crypto key size */ -}; - enum cfs_crypto_hash_alg { CFS_HASH_ALG_NULL = 0, CFS_HASH_ALG_ADLER32, @@ -60,13 +54,6 @@ enum cfs_crypto_hash_alg { CFS_HASH_ALG_UNKNOWN = 0xff }; -enum cfs_crypto_crypt_alg { - CFS_CRYPT_ALG_NULL = 0, - CFS_CRYPT_ALG_AES256_CTR, - CFS_CRYPT_ALG_MAX, - CFS_CRYPT_ALG_UNKNOWN = 0xff -}; - static struct cfs_crypto_hash_type hash_types[] = { [CFS_HASH_ALG_NULL] = { .cht_name = "null", @@ -120,17 +107,6 @@ static struct cfs_crypto_hash_type hash_types[] = { } }; -static struct cfs_crypto_crypt_type crypt_types[] = { - [CFS_CRYPT_ALG_NULL] = { - .cct_name = "null", - .cct_size = 0 - }, - [CFS_CRYPT_ALG_AES256_CTR] = { - .cct_name = "ctr(aes)", - .cct_size = 32 - } -}; - /* Maximum size of hash_types[].cht_size */ #define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 @@ -212,103 +188,24 @@ static inline unsigned char cfs_crypto_hash_alg(const char *algname) return CFS_HASH_ALG_UNKNOWN; } -/** - * Return crypt algorithm information for the specified algorithm identifier - * - * Crypt information includes algorithm name, key size. 
- * - * \retval cfs_crypto_crupt_type for valid ID (CFS_CRYPT_ALG_*) - * \retval NULL for unknown algorithm identifier - */ -static inline const struct -cfs_crypto_crypt_type *cfs_crypto_crypt_type( - enum cfs_crypto_crypt_alg crypt_alg) -{ - struct cfs_crypto_crypt_type *ct; - - if (crypt_alg < CFS_CRYPT_ALG_MAX) { - ct = &crypt_types[crypt_alg]; - if (ct->cct_name != NULL) - return ct; - } - return NULL; -} - -/** - * Return crypt name for crypt algorithm identifier - * - * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) - * - * \retval string name of known crypt algorithm - * \retval "unknown" if hash algorithm is unknown - */ -static inline const -char *cfs_crypto_crypt_name(enum cfs_crypto_crypt_alg crypt_alg) -{ - const struct cfs_crypto_crypt_type *ct; - - ct = cfs_crypto_crypt_type(crypt_alg); - if (ct) - return ct->cct_name; - - return "unknown"; -} - - -/** - * Return key size for crypto algorithm type - * - * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) - * - * \retval crypt algorithm key size in bytes - * \retval 0 if crypt algorithm type is unknown - */ -static inline -unsigned int cfs_crypto_crypt_keysize(enum cfs_crypto_crypt_alg crypt_alg) -{ - const struct cfs_crypto_crypt_type *ct; - - ct = cfs_crypto_crypt_type(crypt_alg); - if (ct != NULL) - return ct->cct_size; - - return 0; -} - -/** - * Find crypto algorithm ID for the specified algorithm name - * - * \retval crypto algorithm ID for valid ID (CFS_CRYPT_ALG_*) - * \retval CFS_CRYPT_ALG_UNKNOWN for unknown algorithm name - */ -static inline unsigned char cfs_crypto_crypt_alg(const char *algname) -{ - enum cfs_crypto_crypt_alg crypt_alg; - - for (crypt_alg = 0; crypt_alg < CFS_CRYPT_ALG_MAX; crypt_alg++) - if (strcmp(crypt_types[crypt_alg].cct_name, algname) == 0) - return crypt_alg; - - return CFS_CRYPT_ALG_UNKNOWN; -} - int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, const void *buf, unsigned int buf_len, unsigned char *key, unsigned int key_len, unsigned char *hash, unsigned int *hash_len); /* cfs crypto hash descriptor */ +struct cfs_crypto_hash_desc; struct page; -struct ahash_request * +struct cfs_crypto_hash_desc * cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, unsigned char *key, unsigned int key_len); -int cfs_crypto_hash_update_page(struct ahash_request *req, +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, struct page *page, unsigned int offset, unsigned int len); -int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, unsigned int buf_len); -int cfs_crypto_hash_final(struct ahash_request *req, +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, unsigned char *hash, unsigned int *hash_len); int cfs_crypto_register(void); void cfs_crypto_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h index ac89d2cb60b55..2eb6b7aa57d9c 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,10 +38,6 @@ #ifndef __LIBCFS_DEBUG_H__ #define __LIBCFS_DEBUG_H__ -#include -#include -#include - /* * Debugging */ @@ -64,6 +60,112 @@ int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); extern unsigned int libcfs_catastrophe; extern unsigned int libcfs_panic_on_lbug; +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +/* unused */ +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +#define S_LFSCK 0x00100000 +#define S_SNAPSHOT 0x00200000 +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "", \ + "echo", "ldlm", "lov", "lquota", "osd", "lfsck", "snapshot", "",\ + "lmv", "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +#define D_HSM 0x20000000 +#define D_SNAPSHOT 0x40000000 /* snapshot */ +#define D_LAYOUT 0x80000000 + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + #ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED #endif @@ -105,38 +207,9 @@ do { \ .msg_cdls = (cdls) }; \ dataname.msg_mask = (mask); -#ifdef CDEBUG_ENABLED +#ifdef __KERNEL__ -#if !defined(__x86_64__) -# ifdef __ia64__ -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_dwarf_cfa() & \ - (THREAD_SIZE - 1))) -# else -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_frame_address(0) & \ - (THREAD_SIZE - 1))) -# endif /* __ia64__ */ - -#define __CHECK_STACK(msgdata, mask, cdls) \ -do { \ - if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ - LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ - libcfs_stack = CDEBUG_STACK(); \ - libcfs_debug_msg(msgdata, \ - "maximum lustre stack %lu\n", \ - CDEBUG_STACK()); \ - (msgdata)->msg_mask = mask; \ - (msgdata)->msg_cdls = cdls; \ - dump_stack(); \ - /*panic("LBUG");*/ \ - } \ -} while (0) -#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) -#else /* __x86_64__ */ -#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) -#define CDEBUG_STACK() (0L) -#endif /* __x86_64__ */ +# ifdef CDEBUG_ENABLED /** * Filters out logging messages based on mask and subsystem. @@ -178,6 +251,22 @@ static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) # warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" # endif /* CDEBUG_ENABLED */ +#else /* !__KERNEL__ */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return 0; +} +# define CDEBUG(mask, format, ...) \ +do { \ + if (((mask) & D_CANTMASK) != 0) \ + fprintf(stderr, "(%s:%d:%s()) " format, \ + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__);\ +} while (0) + +# define CDEBUG_LIMIT CDEBUG + +#endif /* __KERNEL__ */ + /* * Lustre Error Checksum: calculates checksum * of Hex number by XORing each bit. @@ -199,7 +288,7 @@ static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) #define LCONSOLE_EMERG(format, ...) 
CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) -#if defined(CDEBUG_ENTRY_EXIT) +#if defined(CDEBUG_ENTRY_EXIT) && defined(__KERNEL__) void libcfs_log_goto(struct libcfs_debug_msg_data *goto_data, const char *label, long rc); @@ -252,7 +341,7 @@ do { \ # define ENTRY CDEBUG(D_TRACE, "Process entered\n") # define EXIT CDEBUG(D_TRACE, "Process leaving\n") -#else /* !CDEBUG_ENTRY_EXIT */ +#else /* !CDEBUG_ENTRY_EXIT || !__KERNEL__ */ # define GOTO(label, rc) \ do { \ @@ -264,7 +353,7 @@ do { \ # define ENTRY do { } while (0) # define EXIT do { } while (0) -#endif /* CDEBUG_ENTRY_EXIT */ +#endif /* CDEBUG_ENTRY_EXIT && __KERNEL__ */ #define RETURN_EXIT \ do { \ @@ -281,15 +370,15 @@ extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, va_list args, const char *format2, ...) __attribute__ ((format (printf, 4, 5))); +#ifdef __KERNEL__ /* other external symbols that tracefile provides: */ extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, const char __user *usr_buffer, int usr_buffer_nob); extern int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, const char *knl_buffer, char *append); +#endif /* __KERNEL__ */ #define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" -void cfs_debug_init(void); - #endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h index 203e470df88d0..2af5149be8f69 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h @@ -165,7 +165,7 @@ static inline void cfs_race(__u32 id) CERROR("cfs_race id %x sleeping\n", id); rc = wait_event_interruptible(cfs_race_waitq, cfs_race_state != 0); - CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); } else { CERROR("cfs_fail_race id %x waking\n", id); cfs_race_state = 1; @@ -175,42 +175,4 @@ static inline void cfs_race(__u32 id) } #define CFS_RACE(id) cfs_race(id) -/** - * Wait on race. - * - * The first thread that calls this with a matching fail_loc is put to sleep, - * but subseqent callers of this won't sleep. Until another thread that calls - * cfs_race_wakeup(), the first thread will be woken up and continue. - */ -static inline void cfs_race_wait(__u32 id) -{ - if (CFS_FAIL_PRECHECK(id)) { - if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { - int rc; - - cfs_race_state = 0; - CERROR("cfs_race id %x sleeping\n", id); - rc = wait_event_interruptible(cfs_race_waitq, - cfs_race_state != 0); - CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); - } - } -} -#define CFS_RACE_WAIT(id) cfs_race_wait(id) - -/** - * Wake up the thread that is waiting on the matching fail_loc. 
- */ -static inline void cfs_race_wakeup(__u32 id) -{ - if (CFS_FAIL_PRECHECK(id)) { - if (likely(!__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { - CERROR("cfs_fail_race id %x waking\n", id); - cfs_race_state = 1; - wake_up(&cfs_race_waitq); - } - } -} -#define CFS_RACE_WAKEUP(id) cfs_race_wakeup(id) - #endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h similarity index 88% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h rename to drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h index cdac10f572408..6b79096f761a0 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h @@ -23,19 +23,21 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * + * libcfs/include/libcfs/libcfs_ioctl.h + * * Low-level ioctl data structures. Kernel ioctl functions declared here, * and user space functions are in libcfs/util/ioctl.h. * */ -#ifndef __UAPI_LIBCFS_IOCTL_H__ -#define __UAPI_LIBCFS_IOCTL_H__ +#ifndef __LIBCFS_IOCTL_H__ +#define __LIBCFS_IOCTL_H__ #include #include @@ -75,7 +77,8 @@ struct libcfs_ioctl_data { char ioc_bulk[0]; }; -struct libcfs_debug_ioctl_data { +struct libcfs_debug_ioctl_data +{ struct libcfs_ioctl_hdr hdr; unsigned int subs; unsigned int debug; @@ -102,7 +105,7 @@ struct libcfs_debug_ioctl_data { #define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_PING_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) +/* IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) */ #define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) /* lnd ioctls */ @@ -113,7 +116,7 @@ struct libcfs_debug_ioctl_data { #define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_DISCOVER _IOWR('e', 77, IOCTL_LIBCFS_TYPE) +/* ioctl 77 is free for use */ #define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) @@ -145,13 +148,8 @@ struct libcfs_debug_ioctl_data { #define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) #define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) #define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_MAX_NR 104 +#define IOC_LIBCFS_MAX_NR 99 extern int 
libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); -#endif /* __UAPI_LIBCFS_IOCTL_H__ */ +#endif /* __LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h index 1001362e75cd0..16bda0c460ebf 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -36,7 +36,6 @@ #ifndef __LIBCFS_PRIM_H__ #define __LIBCFS_PRIM_H__ -#include #include /* diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h index 9a242839fd843..ebcdc990203b2 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -42,9 +42,6 @@ # define DEBUG_SUBSYSTEM S_UNDEFINED #endif -#include -#include - #ifdef LIBCFS_DEBUG /* @@ -216,14 +213,8 @@ do { \ #define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) -#ifdef LLIST_HEAD void init_libcfs_vfree_atomic(void); void exit_libcfs_vfree_atomic(void); -#define HAVE_LIBCFS_VFREE_ATOMIC -#else -#define init_libcfs_vfree_atomic() do {} while(0) -#define exit_libcfs_vfree_atomic() do {} while(0) -#endif #define LIBCFS_FREE(ptr, size) \ do { \ @@ -237,7 +228,7 @@ do { \ CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ s, (ptr), libcfs_kmem_read()); \ if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - libcfs_vfree_atomic(ptr); \ + libcfs_vfree_atomic(ptr); \ else \ kfree(ptr); \ } while (0) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h new file mode 100644 index 0000000000000..ca40551dfc678 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h @@ -0,0 +1,121 @@ +#ifndef __LIBCFS_PTASK_H__ +#define __LIBCFS_PTASK_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Unconditionaly disable PADATA. + * + * Padata is needed for PIO client feature. This feature is disabled by default + * and was removed from Lustre code during 2.13 development (2b0a34fe43bf). + * Instead of adapting the code to Linux 5.4+ change, just disable it. 
+ */ +#undef CONFIG_PADATA + +#ifdef CONFIG_PADATA +#include +#else +struct padata_priv {}; +struct padata_instance {}; +#endif + +#define PTF_COMPLETE BIT(0) +#define PTF_AUTOFREE BIT(1) +#define PTF_ORDERED BIT(2) +#define PTF_USER_MM BIT(3) +#define PTF_ATOMIC BIT(4) +#define PTF_RETRY BIT(5) + +struct cfs_ptask_engine { + struct padata_instance *pte_pinst; + struct workqueue_struct *pte_wq; + struct notifier_block pte_notifier; + int pte_weight; +}; + +struct cfs_ptask; +typedef int (*cfs_ptask_cb_t)(struct cfs_ptask *); + +struct cfs_ptask { + struct padata_priv pt_padata; + struct completion pt_completion; + struct mm_struct *pt_mm; + unsigned int pt_flags; + int pt_cbcpu; + cfs_ptask_cb_t pt_cbfunc; + void *pt_cbdata; + int pt_result; +}; + +static inline +struct padata_priv *cfs_ptask2padata(struct cfs_ptask *ptask) +{ + return &ptask->pt_padata; +} + +static inline +struct cfs_ptask *cfs_padata2ptask(struct padata_priv *padata) +{ + return container_of(padata, struct cfs_ptask, pt_padata); +} + +static inline +bool cfs_ptask_need_complete(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_COMPLETE; +} + +static inline +bool cfs_ptask_is_autofree(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_AUTOFREE; +} + +static inline +bool cfs_ptask_is_ordered(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_ORDERED; +} + +static inline +bool cfs_ptask_use_user_mm(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_USER_MM; +} + +static inline +bool cfs_ptask_is_atomic(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_ATOMIC; +} + +static inline +bool cfs_ptask_is_retry(struct cfs_ptask *ptask) +{ + return ptask->pt_flags & PTF_RETRY; +} + +static inline +int cfs_ptask_result(struct cfs_ptask *ptask) +{ + return ptask->pt_result; +} + +struct cfs_ptask_engine *cfs_ptengine_init(const char *, const struct cpumask *); +void cfs_ptengine_fini(struct cfs_ptask_engine *); +int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *, const struct cpumask *); +int cfs_ptengine_weight(struct cfs_ptask_engine *); + +int cfs_ptask_submit(struct cfs_ptask *, struct cfs_ptask_engine *); +int cfs_ptask_wait_for(struct cfs_ptask *); +int cfs_ptask_init(struct cfs_ptask *, cfs_ptask_cb_t, void *, + unsigned int, int); + +#endif /* __LIBCFS_PTASK_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h index 4d9dbde91e8a0..3c34071d35774 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h new file mode 100644 index 0000000000000..68947c9792296 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
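The cfs_ptask interface declared above packages a callback, its argument, and a completion into a task that can be pushed onto a parallel-task engine and, with PTF_COMPLETE, waited on. A rough usage sketch, assuming the cfs_ptask_init() parameters follow the order suggested by the prototype and field names (callback, callback data, flags, CPU hint) and that the engine was created elsewhere with cfs_ptengine_init():

static int my_work_cb(struct cfs_ptask *ptask)
{
        int *counter = ptask->pt_cbdata;

        (*counter)++;                   /* the real work goes here */
        return 0;
}

static int run_one(struct cfs_ptask_engine *engine)
{
        struct cfs_ptask task;
        int counter = 0;
        int rc;

        rc = cfs_ptask_init(&task, my_work_cb, &counter,
                            PTF_COMPLETE, 0 /* CPU hint */);
        if (rc)
                return rc;

        rc = cfs_ptask_submit(&task, engine);
        if (rc)
                return rc;

        rc = cfs_ptask_wait_for(&task); /* PTF_COMPLETE requested above */
        return rc ? rc : cfs_ptask_result(&task);
}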
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_time.h + * + * Time functions. + * + */ + +#ifndef __LIBCFS_TIME_H__ +#define __LIBCFS_TIME_H__ + +/* + * generic time manipulation functions. + */ + +static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) +{ + return (cfs_time_t)(t + d); +} + +static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) +{ + return (cfs_time_t)(t1 - t2); +} + +static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2) +{ + return cfs_time_before(t2, t1); +} + +static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2) +{ + return cfs_time_beforeq(t2, t1); +} + +static inline cfs_time_t cfs_time_shift(int seconds) +{ + return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); +} + +#define CFS_TICK 1 + +/* + * return valid time-out based on user supplied one. Currently we only check + * that time-out is not shorted than allowed. + */ +static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout) +{ + if (timeout < CFS_TICK) + timeout = CFS_TICK; + return timeout; +} + +#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h new file mode 100644 index 0000000000000..0f67a87096c0a --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h @@ -0,0 +1,150 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_LIBCFS_H__ +#define __LIBCFS_LINUX_LIBCFS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
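The helpers above keep callers out of raw jiffies arithmetic: cfs_time_shift() builds an absolute deadline, cfs_time_after()/cfs_time_aftereq() compare wrap-safely, and cfs_timeout_cap() clamps a user-supplied timeout to at least one tick. A short sketch of the intended pattern, assuming the jiffies-backed cfs_time_current()/cfs_time_seconds() implementations provided elsewhere in this tree and a hypothetical reply_arrived() predicate:

        cfs_duration_t timeout = cfs_timeout_cap(cfs_time_seconds(30));
        cfs_time_t deadline = cfs_time_add(cfs_time_current(), timeout);

        while (!reply_arrived()) {      /* reply_arrived() is illustrative */
                if (cfs_time_aftereq(cfs_time_current(), deadline))
                        return -ETIMEDOUT;      /* give the peer 30s at most */
                schedule_timeout_uninterruptible(CFS_TICK);
        }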
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ + +/** + * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) + * + * Implementation is in linux-curproc.c + */ +#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm) + +/* helper for sysctl handlers */ +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void __user *buffer, int len)); + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + +/* + * Macros to access common characteristics of "current" UNIX process. + */ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + +/* check if task is running in compat mode.*/ +int current_is_32bit(void); + +#endif /* _LINUX_LIBCFS_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h index ab6b55e0586a6..a46e252466026 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -23,7 +23,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,6 +39,61 @@ #ifndef __LIBCFS_LINUX_CPU_H__ #define __LIBCFS_LINUX_CPU_H__ +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
+#endif + +#include +#include +#include +#include + +#ifdef CONFIG_SMP + +#define HAVE_LIBCFS_CPT + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned *cpt_distance; + /* spread rotor for NUMA allocator */ + int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* spread rotor for NUMA allocator */ + int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned ctb_distance; + /* # of CPU partitions */ + int ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +void cfs_cpu_core_siblings(int cpu, cpumask_t *mask); + +#endif /* CONFIG_SMP */ + #ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK # define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) #endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h index 6346c59e516e7..a9c15a66ab207 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h @@ -26,6 +26,11 @@ * Copyright 2012 Xyratex Technology Limited */ +/* Added in v4.15-rc4 (commit a208fa8f3303) */ +#ifndef CRYPTO_ALG_OPTIONAL_KEY +#define CRYPTO_ALG_OPTIONAL_KEY 0x00004000 +#endif + /** * Linux crypto hash specific functions. */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h index dd86d1947466b..dbc84de172146 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -37,6 +37,14 @@ #ifndef __LIBCFS_LINUX_CFS_FS_H__ #define __LIBCFS_LINUX_CFS_FS_H__ +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
+#endif + #include #include #include @@ -50,10 +58,6 @@ static inline struct dentry *file_dentry(const struct file *file) } #endif -#ifndef QSTR_INIT -#define QSTR_INIT(n, l) { .len = l, .name = n } -#endif - #if defined(HAVE_FILE_FSYNC_4ARGS) || defined(HAVE_FILE_FSYNC_2ARGS) #define ll_vfs_fsync_range(fp, start, end, datasync) \ vfs_fsync_range(fp, start, end, datasync) @@ -62,6 +66,15 @@ static inline struct dentry *file_dentry(const struct file *file) vfs_fsync_range(fp, file_dentry(fp), start, end, datasync) #endif +#define flock_type(fl) ((fl)->fl_type) +#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0) +#define flock_pid(fl) ((fl)->fl_pid) +#define flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while (0) +#define flock_start(fl) ((fl)->fl_start) +#define flock_set_start(fl, st) do { (fl)->fl_start = (st); } while (0) +#define flock_end(fl) ((fl)->fl_end) +#define flock_set_end(fl, end) do { (fl)->fl_end = (end); } while (0) + #ifndef IFSHIFT #define IFSHIFT 12 #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h deleted file mode 100644 index 2721655306bbe..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h +++ /dev/null @@ -1,247 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ - -#ifndef __LIBCFS_LINUX_HASH_H__ -#define __LIBCFS_LINUX_HASH_H__ - -#include - -u64 cfs_hashlen_string(const void *salt, const char *name); - -#ifndef hashlen_hash -#define hashlen_hash(hashlen) ((u32)(hashlen)) -#endif - -#ifndef HAVE_STRINGHASH -#ifndef hashlen_create -#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) -#endif -#endif /* !HAVE_STRINGHASH */ - -#ifdef HAVE_LINUX_RHASHTABLE_H -#include - -#ifndef HAVE_RHLTABLE -struct rhlist_head { - struct rhash_head rhead; - struct rhlist_head __rcu *next; -}; - -struct rhltable { - struct rhashtable ht; -}; - -#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ - for (pos = list; pos && rht_entry(tpos, pos, member); \ - pos = rcu_dereference_raw(pos->next)) - -static inline int rhltable_init(struct rhltable *hlt, - const struct rhashtable_params *params) -{ - return rhashtable_init(&hlt->ht, params); -} - -static inline struct rhlist_head *rhltable_lookup( - struct rhltable *hlt, const void *key, - const struct rhashtable_params params) -{ - struct rhashtable *ht = &hlt->ht; - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - struct bucket_table *tbl; - struct rhash_head *he; - unsigned int hash; - - tbl = rht_dereference_rcu(ht->tbl, ht); -restart: - hash = rht_key_hashfn(ht, tbl, key, params); - rht_for_each_rcu(he, tbl, hash) { - if (params.obj_cmpfn ? 
- params.obj_cmpfn(&arg, rht_obj(ht, he)) : - rhashtable_compare(&arg, rht_obj(ht, he))) - continue; - return he ? container_of(he, struct rhlist_head, rhead) : NULL; - } - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(tbl)) - goto restart; - - return NULL; -} - -static inline int rhltable_insert_key( - struct rhltable *hlt, const void *key, struct rhlist_head *list, - const struct rhashtable_params params) -{ -#ifdef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT - return __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, - params); -#else - return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, - params)); -#endif -} - -static inline int rhltable_remove( - struct rhltable *hlt, struct rhlist_head *list, - const struct rhashtable_params params) -{ - return rhashtable_remove_fast(&hlt->ht, &list->rhead, params); -} - -static inline void rhltable_free_and_destroy(struct rhltable *hlt, - void (*free_fn)(void *ptr, - void *arg), - void *arg) -{ - rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); -} - -static inline void rhltable_destroy(struct rhltable *hlt) -{ - rhltable_free_and_destroy(hlt, NULL, NULL); -} - -static inline void rhltable_walk_enter(struct rhltable *hlt, - struct rhashtable_iter *iter) -{ - rhashtable_walk_init(&hlt->ht, iter); -} -#endif /* !HAVE_RHLTABLE */ - -#ifdef HAVE_BROKEN_HASH_64 - -#define GOLDEN_RATIO_32 0x61C88647 -#define GOLDEN_RATIO_64 0x61C8864680B583EBull - -static inline u32 cfs_hash_32(u32 val, unsigned int bits) -{ - /* High bits are more random, so use them. */ - return (val * GOLDEN_RATIO_32) >> (32 - bits); -} - -static __always_inline u32 cfs_hash_64(u64 val, unsigned int bits) -{ -#if BITS_PER_LONG == 64 - /* 64x64-bit multiply is efficient on all 64-bit processors */ - return val * GOLDEN_RATIO_64 >> (64 - bits); -#else - /* Hash 64 bits using only 32x32-bit multiply. */ - return cfs_hash_32(((u32)val ^ ((val >> 32) * GOLDEN_RATIO_32)), bits); -#endif -} -#else - -#define cfs_hash_32 hash_32 -#define cfs_hash_64 hash_64 - -#endif /* HAVE_BROKEN_HASH_64 */ - -#ifndef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST -/** - * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Just like rhashtable_lookup_insert_fast(), but this function returns the - * object if it exists, NULL if it did not and the insertion was successful, - * and an ERR_PTR otherwise. - */ -static inline void *rhashtable_lookup_get_insert_fast( - struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) -{ - const char *key; - void *ret; - int rc; - - rc = rhashtable_lookup_insert_fast(ht, obj, params); - switch (rc) { - case -EEXIST: - key = rht_obj(ht, obj); - ret = rhashtable_lookup_fast(ht, key, params); - break; - case 0: - ret = NULL; - break; - default: - ret = ERR_PTR(rc); - break; - } - return ret; -} -#endif /* !HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST */ - -#ifndef HAVE_RHASHTABLE_LOOKUP -/* - * The function rhashtable_lookup() and rhashtable_lookup_fast() - * are almost the same except rhashtable_lookup() doesn't - * take the RCU read lock. Since this is the case and only - * SLES12 SP3 lacks rhashtable_lookup() just duplicate the - * SLES12 SP3 rhashtable_lookup_fast() minus the RCU read lock. 
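The compat rhashtable_lookup_get_insert_fast() above folds lookup and insert into a single call with a three-way return: NULL when the new object was inserted, a pointer to the already-present object when the key exists, and an ERR_PTR() on failure. A sketch of how a caller consumes that, assuming a table ht, a hypothetical struct my_obj with an embedded struct rhash_head mo_hash, and rhashtable_params my_params:

        struct my_obj *old;

        old = rhashtable_lookup_get_insert_fast(&ht, &obj->mo_hash,
                                                my_params);
        if (IS_ERR(old))
                return PTR_ERR(old);            /* insertion failed */
        if (old)
                return -EEXIST;                 /* key already in the table */
        return 0;                               /* NULL: obj is now inserted */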
- */ -static inline void *rhashtable_lookup( - struct rhashtable *ht, const void *key, - const struct rhashtable_params params) -{ - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - const struct bucket_table *tbl; - struct rhash_head *he; - unsigned int hash; - - tbl = rht_dereference_rcu(ht->tbl, ht); -restart: - hash = rht_key_hashfn(ht, tbl, key, params); - rht_for_each_rcu(he, tbl, hash) { - if (params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, he)) : - rhashtable_compare(&arg, rht_obj(ht, he))) - continue; - return rht_obj(ht, he); - } - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(tbl)) - goto restart; - - return NULL; -} -#endif /* !HAVE_RHASHTABLE_LOOKUP */ -#else -#define rhashtable_init(ht, param) 0 -#define rhashtable_destroy(ht) do {} while (0) -#endif /* HAVE_LINUX_RHASHTABLE_H */ - -#endif /* __LIBCFS_LINUX_HASH_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h index 81e79dbf24852..f08d623bd8a84 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -37,6 +37,14 @@ #ifndef __LIBCFS_LINUX_CFS_MEM_H__ #define __LIBCFS_LINUX_CFS_MEM_H__ +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. +#endif + #include #include #include diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h index 2b07699f77284..754f183050485 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,10 +34,7 @@ #define __LIBCFS_LINUX_MISC_H__ #include -#include -#include #include -#include #ifdef HAVE_SYSCTL_CTLNAME #define INIT_CTL_NAME .ctl_name = CTL_UNNUMBERED, @@ -63,8 +60,8 @@ #endif #endif /* HAVE_IOV_ITER_TYPE */ -#ifndef HAVE_MODULE_PARAM_LOCKING -static DEFINE_MUTEX(param_lock); +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +bool selinux_is_enabled(void); #endif #ifndef HAVE_UIDGID_HEADER @@ -131,40 +128,12 @@ static inline bool gid_valid(kgid_t gid) int cfs_get_environ(const char *key, char *value, int *val_len); -int cfs_kernel_write(struct file *filp, const void *buf, size_t count, - loff_t *pos); - -/* - * For RHEL6 struct kernel_parm_ops doesn't exist. Also - * the arguments for .set and .get take different - * parameters which is handled below - */ -#ifdef HAVE_KERNEL_PARAM_OPS -#define cfs_kernel_param_arg_t const struct kernel_param -#else -#define cfs_kernel_param_arg_t struct kernel_param_ops -#define kernel_param_ops kernel_param -#endif /* ! 
HAVE_KERNEL_PARAM_OPS */ - -#ifndef HAVE_KERNEL_PARAM_LOCK -static inline void kernel_param_unlock(struct module *mod) -{ -#ifndef HAVE_MODULE_PARAM_LOCKING - mutex_unlock(¶m_lock); -#else - __kernel_param_unlock(); +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t #endif -} -static inline void kernel_param_lock(struct module *mod) -{ -#ifndef HAVE_MODULE_PARAM_LOCKING - mutex_lock(¶m_lock); -#else - __kernel_param_lock(); -#endif -} -#endif /* ! HAVE_KERNEL_PARAM_LOCK */ +int cfs_kernel_write(struct file *filp, const void *buf, size_t count, + loff_t *pos); #ifndef HAVE_KSTRTOUL static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) @@ -178,23 +147,4 @@ static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) } #endif /* !HAVE_KSTRTOUL */ -#ifndef HAVE_KSTRTOBOOL_FROM_USER - -#define kstrtobool strtobool - -int kstrtobool_from_user(const char __user *s, size_t count, bool *res); -#endif - -#ifdef HAVE_KALLSYMS_LOOKUP_NAME -static inline void *cfs_kallsyms_lookup_name(const char *name) -{ - return (void *)kallsyms_lookup_name(name); -} -#else -static inline void *cfs_kallsyms_lookup_name(const char *name) -{ - return NULL; -} -#endif - #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index 3934635dcd322..a805ff9aedf84 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -39,13 +39,54 @@ #ifndef __LIBCFS_LINUX_LINUX_TIME_H__ #define __LIBCFS_LINUX_LINUX_TIME_H__ +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. +#endif + /* Portable time API */ -#include + +/* + * Platform provides three opaque data-types: + * + * cfs_time_t represents point in time. This is internal kernel + * time rather than "wall clock". This time bears no + * relation to gettimeofday(). + * + * cfs_duration_t represents time interval with resolution of internal + * platform clock + * + * cfs_time_t cfs_time_current(void); + * cfs_time_t cfs_time_add (cfs_time_t, cfs_duration_t); + * cfs_duration_t cfs_time_sub (cfs_time_t, cfs_time_t); + * int cfs_impl_time_before (cfs_time_t, cfs_time_t); + * int cfs_impl_time_before_eq(cfs_time_t, cfs_time_t); + * + * cfs_duration_t cfs_duration_build(int64_t); + * + * time_t cfs_duration_sec (cfs_duration_t); + * void cfs_duration_usec(cfs_duration_t, struct timeval *); + * void cfs_duration_nsec(cfs_duration_t, struct timespec *); + * + * CFS_TIME_FORMAT + * CFS_DURATION_FORMAT + * + */ + +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION 1000000 + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
+#endif + #include #include #include #include -#include #include #include #include @@ -53,6 +94,10 @@ /* * Generic kernel stuff */ + +typedef unsigned long cfs_time_t; /* jiffies */ +typedef long cfs_duration_t; + #ifndef HAVE_TIMESPEC64 typedef __s64 time64_t; @@ -98,23 +143,22 @@ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts6 #endif /* HAVE_TIMESPEC64 */ -#ifndef HAVE_NS_TO_TIMESPEC64 -static inline struct timespec64 ns_to_timespec64(const s64 nsec) -{ - struct timespec64 ts; - s32 rem; - - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; +#ifndef HAVE_TIME_T +typedef __kernel_old_time_t time_t; +#endif - return ts; +#ifndef HAVE_JIFFIES_TO_TIMESPEC64 +static inline void +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; } #endif @@ -163,13 +207,6 @@ time64_t ktime_get_real_seconds(void); time64_t ktime_get_seconds(void); #endif /* HAVE_KTIME_GET_SECONDS */ -#ifdef NEED_KTIME_GET_NS -static inline u64 ktime_get_ns(void) -{ - return ktime_to_ns(ktime_get()); -} -#endif /* NEED_KTIME_GET_NS */ - #ifdef NEED_KTIME_GET_REAL_NS static inline u64 ktime_get_real_ns(void) { @@ -177,13 +214,6 @@ static inline u64 ktime_get_real_ns(void) } #endif /* NEED_KTIME_GET_REAL_NS */ -#ifndef HAVE_KTIME_MS_DELTA -static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) -{ - return ktime_to_ms(ktime_sub(later, earlier)); -} -#endif /* HAVE_KTIME_MS_DELTA */ - #ifndef HAVE_KTIME_TO_TIMESPEC64 static inline struct timespec64 ktime_to_timespec64(ktime_t kt) { @@ -212,39 +242,79 @@ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) } #endif -static inline unsigned long cfs_time_seconds(time64_t seconds) +static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) { - return nsecs_to_jiffies(seconds * NSEC_PER_SEC); + return time_before(t1, t2); } -#ifdef HAVE_NEW_DEFINE_TIMER -# ifndef TIMER_DATA_TYPE -# define TIMER_DATA_TYPE struct timer_list * -# endif +static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) +{ + return time_before_eq(t1, t2); +} -#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ - DEFINE_TIMER((_name), (_function)) -#else -# ifndef TIMER_DATA_TYPE -# define TIMER_DATA_TYPE unsigned long -# endif +static inline cfs_time_t cfs_time_current(void) +{ + return jiffies; +} -#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ - DEFINE_TIMER((_name), (_function), (_expires), (_data)) -#endif +static inline time_t cfs_time_current_sec(void) +{ + return ktime_get_real_seconds(); +} + +static inline cfs_duration_t cfs_time_seconds(int seconds) +{ + return ((cfs_duration_t)seconds) * msecs_to_jiffies(MSEC_PER_SEC); +} + +static inline time_t cfs_duration_sec(cfs_duration_t d) +{ + return d / msecs_to_jiffies(MSEC_PER_SEC); +} + +#define cfs_time_current_64 get_jiffies_64 + +static inline __u64 cfs_time_add_64(__u64 t, __u64 d) +{ + return t + d; +} + +static inline __u64 cfs_time_shift_64(int seconds) +{ + return cfs_time_add_64(cfs_time_current_64(), + cfs_time_seconds(seconds)); +} +static inline int cfs_time_before_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 > 0; +} + +static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) 
+{ + return (__s64)t2 - (__s64)t1 >= 0; +} + +/* + * One jiffy + */ +#define CFS_DURATION_T "%ld" #ifdef HAVE_TIMER_SETUP #define cfs_timer_cb_arg_t struct timer_list * #define cfs_from_timer(var, callback_timer, timer_fieldname) \ from_timer(var, callback_timer, timer_fieldname) #define cfs_timer_setup(timer, callback, data, flags) \ timer_setup((timer), (callback), (flags)) +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) #define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) #else #define cfs_timer_cb_arg_t unsigned long #define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) #define cfs_timer_setup(timer, callback, data, flags) \ setup_timer((timer), (callback), (data)) +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function), (_expires), (_data)) #define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h deleted file mode 100644 index fd154ba0f049f..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h +++ /dev/null @@ -1,568 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LIBCFS_LINUX_WAIT_BIT_H -#define __LIBCFS_LINUX_WAIT_BIT_H - -/* Make sure we can see if we have TASK_NOLOAD */ -#include -/* - * Linux wait-bit related types and methods: - */ -#ifdef HAVE_WAIT_BIT_HEADER_H -#include -#endif -#include - -#ifndef HAVE_WAIT_QUEUE_ENTRY -#define wait_queue_entry_t wait_queue_t -#endif - -#ifndef HAVE_WAIT_BIT_HEADER_H -struct wait_bit_queue_entry { - struct wait_bit_key key; - wait_queue_entry_t wq_entry; -}; - -#define ___wait_is_interruptible(state) \ - (!__builtin_constant_p(state) || \ - state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ - -#endif /* ! HAVE_WAIT_BIT_HEADER_H */ - -#ifndef HAVE_PREPARE_TO_WAIT_EVENT -extern long prepare_to_wait_event(wait_queue_head_t *wq_head, - wait_queue_entry_t *wq_entry, int state); -#endif - -/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce - * so let's define our own ___wait_cond_timeout1 - */ - -#define ___wait_cond_timeout1(condition) \ -({ \ - bool __cond = (condition); \ - if (__cond && !__ret) \ - __ret = 1; \ - __cond || !__ret; \ -}) - -#ifndef HAVE_CLEAR_AND_WAKE_UP_BIT -/** - * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit - * - * @bit: the bit of the word being waited on - * @word: the word being waited on, a kernel virtual address - * - * You can use this helper if bitflags are manipulated atomically rather than - * non-atomically under a lock. - */ -static inline void clear_and_wake_up_bit(int bit, void *word) -{ - clear_bit_unlock(bit, word); - /* See wake_up_bit() for which memory barrier you need to use. */ - smp_mb__after_atomic(); - wake_up_bit(word, bit); -} -#endif /* ! 
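The cfs_timer_* wrappers above hide the 4.15 timer_setup() API change: with HAVE_TIMER_SETUP the callback receives the struct timer_list pointer and cfs_from_timer() is container_of(); without it the callback receives an unsigned long cookie that is simply cast back to the object. A usage sketch, assuming a hypothetical struct my_obj with an embedded timer:

struct my_obj {
        struct timer_list mo_timer;
        /* ... */
};

static void my_timer_cb(cfs_timer_cb_arg_t data)
{
        struct my_obj *obj = cfs_from_timer(obj, data, mo_timer);

        /* handle expiry of obj->mo_timer */
}

static void my_arm_timer(struct my_obj *obj)
{
        /* identical source on pre- and post-timer_setup() kernels */
        cfs_timer_setup(&obj->mo_timer, my_timer_cb,
                        cfs_timer_cb_arg(obj, mo_timer), 0);
        mod_timer(&obj->mo_timer, jiffies + cfs_time_seconds(5));
}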
HAVE_CLEAR_AND_WAKE_UP_BIT */ - -#ifndef HAVE_WAIT_VAR_EVENT -extern void __init wait_bit_init(void); -extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, - void *var, int flags); -extern void wake_up_var(void *var); -extern wait_queue_head_t *__var_waitqueue(void *p); - -#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ -({ \ - __label__ __out; \ - wait_queue_head_t *__wq_head = __var_waitqueue(var); \ - struct wait_bit_queue_entry __wbq_entry; \ - long __ret = ret; /* explicit shadow */ \ - \ - init_wait_var_entry(&__wbq_entry, var, \ - exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ - for (;;) { \ - long __int = prepare_to_wait_event(__wq_head, \ - &__wbq_entry.wq_entry, \ - state); \ - if (condition) \ - break; \ - \ - if (___wait_is_interruptible(state) && __int) { \ - __ret = __int; \ - goto __out; \ - } \ - \ - cmd; \ - } \ - finish_wait(__wq_head, &__wbq_entry.wq_entry); \ -__out: __ret; \ -}) - -#define __wait_var_event(var, condition) \ - ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ - schedule()) - -#define wait_var_event(var, condition) \ -do { \ - might_sleep(); \ - if (condition) \ - break; \ - __wait_var_event(var, condition); \ -} while (0) - -#define __wait_var_event_killable(var, condition) \ - ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ - schedule()) - -#define wait_var_event_killable(var, condition) \ -({ \ - int __ret = 0; \ - might_sleep(); \ - if (!(condition)) \ - __ret = __wait_var_event_killable(var, condition); \ - __ret; \ -}) - -#define __wait_var_event_timeout(var, condition, timeout) \ - ___wait_var_event(var, ___wait_cond_timeout1(condition), \ - TASK_UNINTERRUPTIBLE, 0, timeout, \ - __ret = schedule_timeout(__ret)) - -#define wait_var_event_timeout(var, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_var_event_timeout(var, condition, timeout); \ - __ret; \ -}) -#endif /* ! HAVE_WAIT_VAR_EVENT */ - -/* - * prepare_to_wait_event() does not support an exclusive - * lifo wait. - * However it will not relink the wait_queue_entry if - * it is already linked. So we link to the head of the - * queue here, and it will stay there. - */ -static inline void prepare_to_wait_exclusive_head( - wait_queue_head_t *waitq, wait_queue_entry_t *link) -{ - unsigned long flags; - - spin_lock_irqsave(&(waitq->lock), flags); -#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST - if (list_empty(&link->entry)) -#else - if (list_empty(&link->task_list)) -#endif - __add_wait_queue_exclusive(waitq, link); - spin_unlock_irqrestore(&((waitq)->lock), flags); -} - -#ifndef ___wait_event -/* - * The below macro ___wait_event() has an explicit shadow of the __ret - * variable when used from the wait_event_*() macros. - * - * This is so that both can use the ___wait_cond_timeout1() construct - * to wrap the condition. - * - * The type inconsistency of the wait_event_*() __ret variable is also - * on purpose; we use long where we can return timeout values and int - * otherwise. 
- */ - -#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ -({ \ - __label__ __out; \ - wait_queue_entry_ __wq_entry; \ - long __ret = ret; /* explicit shadow */ \ - \ - init_wait(&__wq_entry); \ - if (exclusive) \ - __wq_entry.flags = WQ_FLAG_EXCLUSIVE \ - for (;;) { \ - long __int = prepare_to_wait_event(&wq_head, \ - &__wq_entry, state); \ - \ - if (condition) \ - break; \ - \ - if (___wait_is_interruptible(state) && __int) { \ - __ret = __int; \ - goto __out; \ - } \ - \ - cmd; \ - } \ - finish_wait(&wq_head, &__wq_entry); \ -__out: __ret; \ -}) -#endif - -#ifndef TASK_NOLOAD - -#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd) \ -({ \ - wait_queue_entry_t __wq_entry; \ - unsigned long flags; \ - long __ret = ret; /* explicit shadow */ \ - sigset_t __blocked; \ - \ - __blocked = cfs_block_sigsinv(0); \ - init_wait(&__wq_entry); \ - if (exclusive) \ - __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ - for (;;) { \ - prepare_to_wait_event(&wq_head, \ - &__wq_entry, \ - TASK_INTERRUPTIBLE); \ - \ - if (condition) \ - break; \ - /* We have to do this here because some signals */ \ - /* are not blockable - ie from strace(1). */ \ - /* In these cases we want to schedule_timeout() */ \ - /* again, because we don't want that to return */ \ - /* -EINTR when the RPC actually succeeded. */ \ - /* the recalc_sigpending() below will deliver the */ \ - /* signal properly. */ \ - if (signal_pending(current)) { \ - spin_lock_irqsave(¤t->sighand->siglock, \ - flags); \ - clear_tsk_thread_flag(current, TIF_SIGPENDING); \ - spin_unlock_irqrestore(¤t->sighand->siglock,\ - flags); \ - } \ - cmd; \ - } \ - finish_wait(&wq_head, &__wq_entry); \ - cfs_restore_sigs(__blocked); \ - __ret; \ -}) - -#define wait_event_idle(wq_head, condition) \ -do { \ - might_sleep(); \ - if (!(condition)) \ - ___wait_event_idle(wq_head, condition, 0, 0, schedule());\ -} while (0) - -#define wait_event_idle_exclusive(wq_head, condition) \ -do { \ - might_sleep(); \ - if (!(condition)) \ - ___wait_event_idle(wq_head, condition, 1, 0, schedule());\ -} while (0) - -#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\ - ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ - 1, timeout, \ - __ret = schedule_timeout(__ret)) - -#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_exclusive_timeout( \ - wq_head, condition, timeout); \ - __ret; \ -}) - -#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ - timeout, cmd1, cmd2) \ - ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ - 1, timeout, \ - cmd1; __ret = schedule_timeout(__ret); cmd2) - -#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ - cmd1, cmd2) \ -({ \ - long __ret = timeout; \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_exclusive_timeout_cmd( \ - wq_head, condition, timeout, cmd1, cmd2); \ - __ret; \ -}) - -#define __wait_event_idle_timeout(wq_head, condition, timeout) \ - ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \ - 0, timeout, \ - __ret = schedule_timeout(__ret)) - -#define wait_event_idle_timeout(wq_head, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_timeout(wq_head, condition, \ - timeout); \ - __ret; \ -}) - -#else /* TASK_IDLE */ -#ifndef wait_event_idle -/** - * 
wait_event_idle - wait for a condition without contributing to system load - * @wq_head: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * - * The process is put to sleep (TASK_IDLE) until the - * @condition evaluates to true. - * The @condition is checked each time the waitqueue @wq_head is woken up. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - */ -#define wait_event_idle(wq_head, condition) \ -do { \ - might_sleep(); \ - if (!(condition)) \ - ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \ - schedule()); \ -} while (0) -#endif -#ifndef wait_event_idle_exclusive -/** - * wait_event_idle_exclusive - wait for a condition without contributing to - * system load - * @wq_head: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * - * The process is put to sleep (TASK_IDLE) until the - * @condition evaluates to true. - * The @condition is checked each time the waitqueue @wq_head is woken up. - * - * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag - * set thus if other processes wait on the same list, when this - * process is woken further processes are not considered. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - */ -#define wait_event_idle_exclusive(wq_head, condition) \ -do { \ - might_sleep(); \ - if (!(condition)) \ - ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \ - schedule()); \ -} while (0) -#endif -#ifndef wait_event_idle_exclusive_timeout -/** - * wait_event_idle_exclusive_timeout - sleep without load until a condition - * becomes true or a timeout elapses - * @wq_head: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * @timeout: timeout, in jiffies - * - * The process is put to sleep (TASK_IDLE) until the - * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq_head is woken up. - * - * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag - * set thus if other processes wait on the same list, when this - * process is woken further processes are not considered. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - * Returns: - * 0 if the @condition evaluated to %false after the @timeout elapsed, - * 1 if the @condition evaluated to %true after the @timeout elapsed, - * or the remaining jiffies (at least 1) if the @condition evaluated - * to %true before the @timeout elapsed. 
- */ -#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_exclusive_timeout(wq_head, \ - condition, \ - timeout); \ - __ret; \ -}) -#endif -#ifndef wait_event_idle_exclusive_timeout_cmd -#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ - timeout, cmd1, cmd2) \ - ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ - TASK_IDLE, 1, timeout, \ - cmd1; __ret = schedule_timeout(__ret); cmd2) - -#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ - cmd1, cmd2) \ -({ \ - long __ret = timeout; \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_exclusive_timeout_cmd( \ - wq_head, condition, timeout, cmd1, cmd2); \ - __ret; \ -}) -#endif - -#ifndef wait_event_idle_timeout - -#define __wait_event_idle_timeout(wq_head, condition, timeout) \ - ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ - TASK_IDLE, 0, timeout, \ - __ret = schedule_timeout(__ret)) - -/** - * wait_event_idle_timeout - sleep without load until a condition becomes - * true or a timeout elapses - * @wq_head: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * @timeout: timeout, in jiffies - * - * The process is put to sleep (TASK_IDLE) until the - * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq_head is woken up. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - * Returns: - * 0 if the @condition evaluated to %false after the @timeout elapsed, - * 1 if the @condition evaluated to %true after the @timeout elapsed, - * or the remaining jiffies (at least 1) if the @condition evaluated - * to %true before the @timeout elapsed. 
- */ -#define wait_event_idle_timeout(wq_head, condition, timeout) \ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_timeout(wq_head, condition, \ - timeout); \ - __ret; \ -}) -#endif -#endif /* TASK_IDLE */ - -/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */ -#ifdef TASK_NOLOAD - -#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ -({ \ - wait_queue_entry_t __wq_entry; \ - long __ret = ret; /* explicit shadow */ \ - \ - init_wait(&__wq_entry); \ - __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ - for (;;) { \ - prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ - prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\ - \ - if (condition) \ - break; \ - \ - cmd; \ - } \ - finish_wait(&wq_head, &__wq_entry); \ - __ret; \ -}) -#else -#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ -({ \ - wait_queue_entry_t __wq_entry; \ - unsigned long flags; \ - long __ret = ret; /* explicit shadow */ \ - sigset_t __blocked; \ - \ - __blocked = cfs_block_sigsinv(0); \ - init_wait(&__wq_entry); \ - __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ - for (;;) { \ - prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ - prepare_to_wait_event(&wq_head, &__wq_entry, \ - TASK_INTERRUPTIBLE); \ - \ - if (condition) \ - break; \ - /* See justification in ___wait_event_idle */ \ - if (signal_pending(current)) { \ - spin_lock_irqsave(¤t->sighand->siglock, \ - flags); \ - clear_tsk_thread_flag(current, TIF_SIGPENDING); \ - spin_unlock_irqrestore(¤t->sighand->siglock,\ - flags); \ - } \ - cmd; \ - } \ - cfs_restore_sigs(__blocked); \ - finish_wait(&wq_head, &__wq_entry); \ - __ret; \ -}) -#endif - -#define wait_event_idle_exclusive_lifo(wq_head, condition) \ -do { \ - might_sleep(); \ - if (!(condition)) \ - ___wait_event_lifo(wq_head, condition, 0, schedule()); \ -} while (0) - -#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout) \ - ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition), \ - timeout, \ - __ret = schedule_timeout(__ret)) - -#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout1(condition)) \ - __ret = __wait_event_idle_lifo_timeout(wq_head, \ - condition, \ - timeout); \ - __ret; \ -}) - -/* l_wait_event_abortable() is a bit like wait_event_killable() - * except there is a fixed set of signals which will abort: - * LUSTRE_FATAL_SIGS - */ -#define LUSTRE_FATAL_SIGS \ - (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ - sigmask(SIGQUIT) | sigmask(SIGALRM)) - -#define l_wait_event_abortable(wq, condition) \ -({ \ - sigset_t __new_blocked, __old_blocked; \ - int __ret = 0; \ - siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ - sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ - __ret = wait_event_interruptible(wq, condition); \ - sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ - __ret; \ -}) - -#define l_wait_event_abortable_timeout(wq, condition, timeout) \ -({ \ - sigset_t __new_blocked, __old_blocked; \ - int __ret = 0; \ - siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ - sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ - __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ - sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ - __ret; \ -}) - -#define l_wait_event_abortable_exclusive(wq, condition) \ -({ \ - sigset_t __new_blocked, __old_blocked; \ - int __ret = 0; \ - siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ - 
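For the idle-wait helpers being removed here, the calling convention mirrors the stock wait_event_*() family: wait_event_idle_timeout() sleeps without contributing to the load average and returns 0 on timeout or a positive remaining-jiffies count once the condition holds, while l_wait_event_abortable() blocks every signal except LUSTRE_FATAL_SIGS so only a deliberate kill or interrupt aborts the wait. A sketch of typical callers, assuming a wait queue wq and a condition flag done updated by the waker:

        long remaining;
        int rc;

        remaining = wait_event_idle_timeout(wq, done != 0,
                                            cfs_time_seconds(30));
        if (remaining == 0)
                return -ETIMEDOUT;      /* condition never became true */

        /* abortable only by SIGKILL, SIGINT, SIGTERM, SIGQUIT, SIGALRM */
        rc = l_wait_event_abortable(wq, done != 0);
        if (rc < 0)
                return rc;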
sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ - __ret = wait_event_interruptible_exclusive(wq, condition); \ - sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ - __ret; \ -}) - -#endif /* __LICBFS_LINUX_WAIT_BIT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h deleted file mode 100644 index 45818dddedd94..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ - -#ifndef _LINUX_HASH_H -#define _LINUX_HASH_H -/* Fast hashing routine for ints, longs and pointers. - (C) 2002 Nadia Yvette Chambers, IBM */ - -/* - * Knuth recommends primes in approximately golden ratio to the maximum - * integer representable by a machine word for multiplicative hashing. - * Chuck Lever verified the effectiveness of this technique: - * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf - * - * These primes are chosen to be bit-sparse, that is operations on - * them can use shifts and additions instead of multiplications for - * machines where multiplications are slow. - */ - -#include - -/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ -#define GOLDEN_RATIO_PRIME_32 0x9e370001UL -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL - -#if __BITS_PER_LONG == 32 -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 -#define hash_long(val, bits) hash_32(val, bits) -#elif __BITS_PER_LONG == 64 -#define hash_long(val, bits) hash_64(val, bits) -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 -#else -#error Wordsize not 32 or 64 -#endif - -static __always_inline __u64 hash_64(__u64 val, unsigned int bits) -{ - __u64 hash = val; - - /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ - __u64 n = hash; - n <<= 18; - hash -= n; - n <<= 33; - hash -= n; - n <<= 3; - hash += n; - n <<= 3; - hash -= n; - n <<= 4; - hash += n; - n <<= 2; - hash += n; - - /* High bits are more random, so use them. */ - return hash >> (64 - bits); -} - -static inline __u32 hash_32(__u32 val, unsigned int bits) -{ - /* On some cpus multiply is faster, on others gcc will do shifts */ - __u32 hash = val * GOLDEN_RATIO_PRIME_32; - - /* High bits are more random, so use them. 
*/ - return hash >> (32 - bits); -} - -static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) -{ - return hash_long((unsigned long)ptr, bits); -} - -static inline __u32 hash32_ptr(const void *ptr) -{ - unsigned long val = (unsigned long)ptr; - -#if __BITS_PER_LONG == 64 - val ^= (val >> 32); -#endif - return (__u32)val; -} - -#endif /* _LINUX_HASH_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h index a42e0c5fe4568..600bf27b607b4 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,7 +43,7 @@ /* Sparse annotation. */ #define __user -#include +#include #define LIBCFS_IOC_INIT(data) \ do { \ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h index 7bae8393a1916..2fb2db7c651dd 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -107,7 +107,7 @@ char *Parser_strarg(char *inp, const char *prompt, const char *deft, int Parser_arg2int(const char *inp, long *result, int base); /* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size(unsigned long *sizep, char *str); +int Parser_size(int *sizep, char *str); /* Convert a string boolean to an int; "enable" -> 1 */ int Parser_bool(int *b, char *str); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h index 065829b7161d6..72414f0c8003a 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,14 @@ #include #include +#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcpy(char *tgt, const char *src, size_t tgt_len); +#endif + +#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcat(char *tgt, const char *src, size_t tgt_len); +#endif + /** * Structure to represent NULL-less strings. 
*/ @@ -85,6 +93,5 @@ int cfs_ip_addr_parse(char *str, int len, struct list_head *list); int cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr); int cfs_ip_addr_match(__u32 addr, struct list_head *list); -int cfs_abs_path(const char *request_path, char **resolved_path); #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile index a324f01fa2d77..a487ba0329342 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile +++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile @@ -1,15 +1,16 @@ obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o libcfs-linux-objs := linux-tracefile.o linux-debug.o linux-prim.o -libcfs-linux-objs += linux-curproc.o linux-module.o +libcfs-linux-objs += linux-cpu.o linux-curproc.o linux-module.o libcfs-linux-objs += linux-crypto.o linux-crypto-adler.o -libcfs-linux-objs += linux-crypto-crc32.o linux-hash.o linux-wait.o +libcfs-linux-objs += linux-crypto-crc32.o libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) libcfs-all-objs := debug.o fail.o module.o tracefile.o watchdog.o libcfs-all-objs += libcfs_string.o hash.o prng.o workitem.o libcfs-all-objs += libcfs_cpu.o libcfs_mem.o libcfs_lock.o heap.o +libcfs-all-objs += libcfs_ptask.o libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c index 0f7d6194a68f8..a4aede1e3be08 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,6 @@ # define DEBUG_SUBSYSTEM S_LNET -#include #include #include "tracefile.h" @@ -55,63 +54,8 @@ module_param(libcfs_debug, int, 0644); MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); EXPORT_SYMBOL(libcfs_debug); -static int libcfs_param_debug_mb_set(const char *val, - cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned int num; - - rc = kstrtouint(val, 0, &num); - if (rc < 0) - return rc; - -/* - * RHEL6 does not support any kind of locking so we have to provide - * our own - */ -#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) - kernel_param_lock(THIS_MODULE); -#endif - if (!*((unsigned int *)kp->arg)) { - *((unsigned int *)kp->arg) = num; - -#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) - kernel_param_unlock(THIS_MODULE); -#endif - return 0; - } - - rc = cfs_trace_set_debug_mb(num); - - if (!rc) - *((unsigned int *)kp->arg) = cfs_trace_get_debug_mb(); - -#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) - kernel_param_unlock(THIS_MODULE); -#endif - return rc; -} - -/* - * While debug_mb setting look like unsigned int, in fact - * it needs quite a bunch of extra processing, so we define special - * debug_mb parameter type with corresponding methods to handle this case - */ -static struct kernel_param_ops param_ops_debug_mb = { - .set = libcfs_param_debug_mb_set, - .get = param_get_uint, -}; - -#define param_check_debug_mb(name, p) \ - __param_check(name, p, unsigned int) - static unsigned int libcfs_debug_mb; -#ifdef HAVE_KERNEL_PARAM_OPS -module_param(libcfs_debug_mb, debug_mb, 0644); -#else -module_param_call(libcfs_debug_mb, libcfs_param_debug_mb_set, param_get_uint, - &param_ops_debug_mb, 0644); -#endif +module_param(libcfs_debug_mb, uint, 0644); MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); unsigned int libcfs_printk = D_CANTMASK; @@ -122,123 +66,16 @@ unsigned int libcfs_console_ratelimit = 1; module_param(libcfs_console_ratelimit, uint, 0644); MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); -static int param_set_delay_minmax(const char *val, - cfs_kernel_param_arg_t *kp, - long min, long max) -{ - long d; - int sec; - int rc; - - rc = kstrtoint(val, 0, &sec); - if (rc) - return -EINVAL; - - /* The sysfs setting is in centiseconds */ - d = cfs_time_seconds(sec) / 100; - if (d < min || d > max) - return -EINVAL; - - *((unsigned int *)kp->arg) = d; - - return 0; -} - -static int param_get_delay(char *buffer, cfs_kernel_param_arg_t *kp) -{ - unsigned int d = *(unsigned int *)kp->arg; - - return sprintf(buffer, "%lu", jiffies_to_msecs(d * 10) / MSEC_PER_SEC); -} - unsigned int libcfs_console_max_delay; -unsigned int libcfs_console_min_delay; - -static int param_set_console_max_delay(const char *val, - cfs_kernel_param_arg_t *kp) -{ - return param_set_delay_minmax(val, kp, - libcfs_console_min_delay, INT_MAX); -} - -static struct kernel_param_ops param_ops_console_max_delay = { - .set = param_set_console_max_delay, - .get = param_get_delay, -}; - -#define param_check_console_max_delay(name, p) \ - __param_check(name, p, unsigned int) - -#ifdef HAVE_KERNEL_PARAM_OPS -module_param(libcfs_console_max_delay, console_max_delay, 0644); -#else -module_param_call(libcfs_console_max_delay, param_set_console_max_delay, - param_get_delay, &param_ops_console_max_delay, 0644); -#endif +module_param(libcfs_console_max_delay, uint, 0644); 
MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); -static int param_set_console_min_delay(const char *val, - cfs_kernel_param_arg_t *kp) -{ - return param_set_delay_minmax(val, kp, - 1, libcfs_console_max_delay); -} - -static struct kernel_param_ops param_ops_console_min_delay = { - ..set = param_set_console_min_delay, - .get = param_get_delay, -}; - -#define param_check_console_min_delay(name, p) \ - __param_check(name, p, unsigned int) - -#ifdef HAVE_KERNEL_PARAM_OPS -module_param(libcfs_console_min_delay, console_min_delay, 0644); -#else -module_param_call(libcfs_console_min_delay, param_set_console_min_delay, - param_get_delay, &param_ops_console_min_delay, 0644); -#endif +unsigned int libcfs_console_min_delay; +module_param(libcfs_console_min_delay, uint, 0644); MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); -static int param_set_uint_minmax(const char *val, - cfs_kernel_param_arg_t *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - - ret = kstrtouint(val, 0, &num); - if (ret < 0 || num < min || num > max) - return -EINVAL; - - *((unsigned int *)kp->arg) = num; - return 0; -} - -static int param_set_uintpos(const char *val, - cfs_kernel_param_arg_t *kp) -{ - return param_set_uint_minmax(val, kp, 1, -1); -} - -static struct kernel_param_ops param_ops_uintpos = { - .set = param_set_uintpos, - .get = param_get_uint, -}; - -#define param_check_uintpos(name, p) \ - __param_check(name, p, unsigned int) - unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; -#ifdef HAVE_KERNEL_PARAM_OPS -module_param(libcfs_console_backoff, uintpos, 0644); -#else -module_param_call(libcfs_console_backoff, param_set_uintpos, param_get_uint, - &param_ops_uintpos, 0644); -#endif +module_param(libcfs_console_backoff, uint, 0644); MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); unsigned int libcfs_debug_binary = 1; @@ -264,17 +101,15 @@ char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; EXPORT_SYMBOL(libcfs_debug_file_path_arr); /* We need to pass a pointer here, but elsewhere this must be a const */ -static char *libcfs_debug_file_path = LIBCFS_DEBUG_FILE_PATH_DEFAULT; +static char *libcfs_debug_file_path; module_param(libcfs_debug_file_path, charp, 0644); MODULE_PARM_DESC(libcfs_debug_file_path, "Path for dumping debug logs, set 'NONE' to prevent log dumping"); int libcfs_panic_in_progress; -/* - * libcfs_debug_token2mask() expects the returned - * string in lower-case - */ +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ static const char *libcfs_debug_subsys2str(int subsys) { static const char *libcfs_debug_subsystems[] = LIBCFS_DEBUG_SUBSYS_NAMES; @@ -285,10 +120,8 @@ static const char *libcfs_debug_subsys2str(int subsys) return libcfs_debug_subsystems[subsys]; } -/* - * libcfs_debug_token2mask() expects the returned - * string in lower-case - */ +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ static const char *libcfs_debug_dbg2str(int debug) { static const char *libcfs_debug_masks[] = LIBCFS_DEBUG_MASKS_NAMES; @@ -302,78 +135,79 @@ static const char *libcfs_debug_dbg2str(int debug) int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) { - const char *(*fn)(int bit) = is_subsys ? 
libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int len = 0; - const char *token; - int i; - - if (mask == 0) { /* "0" */ - if (size > 0) - str[0] = '0'; - len = 1; - } else { /* space-separated tokens */ - for (i = 0; i < 32; i++) { - if ((mask & (1 << i)) == 0) - continue; - - token = fn(i); - if (token == NULL) /* unused bit */ - continue; - - if (len > 0) { /* separator? */ - if (len < size) - str[len] = ' '; - len++; - } - - while (*token != 0) { - if (len < size) - str[len] = *token; - token++; - len++; - } - } - } - - /* terminate 'str' */ - if (len < size) - str[len] = 0; - else - str[size - 1] = 0; - - return len; + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; } int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) { - const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int m = 0; - int matched; - int n; - int t; - - /* Allow a number for backwards compatibility */ - for (n = strlen(str); n > 0; n--) - if (!isspace(str[n-1])) - break; - matched = n; - - t = sscanf(str, "%i%n", &m, &matched); - if (t >= 1 && matched == n) { - /* don't print warning for lctl set_param debug=0 or -1 */ - if (m != 0 && m != -1) - CWARN("You are trying to use a numerical value for the " - "mask - this will be deprecated in a future " - "release.\n"); - *mask = m; - return 0; - } - - return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, - 0xffffffff); + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + + if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 && + matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the " + "mask - this will be deprecated in a future " + "release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 
0 : D_CANTMASK, + 0xffffffff); } /** @@ -414,14 +248,11 @@ void libcfs_debug_dumplog(void) { wait_queue_entry_t wait; struct task_struct *dumper; - ENTRY; - /* - * we're being careful to ensure that the kernel thread is + /* we're being careful to ensure that the kernel thread is * able to set our state to running as it exits before we - * get to schedule() - */ + * get to schedule() */ init_waitqueue_entry(&wait, current); set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&debug_ctlwq, &wait); @@ -443,7 +274,7 @@ EXPORT_SYMBOL(libcfs_debug_dumplog); int libcfs_debug_init(unsigned long bufsize) { - int rc = 0; + int rc = 0; unsigned int max = libcfs_debug_mb; init_waitqueue_head(&debug_ctlwq); @@ -461,65 +292,55 @@ int libcfs_debug_init(unsigned long bufsize) sizeof(libcfs_debug_file_path_arr)); } - /* - * If libcfs_debug_mb is set to an invalid value or uninitialized - * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES - */ + /* If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { max = TCD_MAX_PAGES; } else { max = (max / num_possible_cpus()); max = (max << (20 - PAGE_SHIFT)); } - rc = cfs_tracefile_init(max); - if (rc) - return rc; - - libcfs_register_panic_notifier(); - kernel_param_lock(THIS_MODULE); - libcfs_debug_mb = cfs_trace_get_debug_mb(); - kernel_param_unlock(THIS_MODULE); - return rc; + + if (rc == 0) + libcfs_register_panic_notifier(); + + return rc; } int libcfs_debug_cleanup(void) { - libcfs_unregister_panic_notifier(); - kernel_param_lock(THIS_MODULE); - cfs_tracefile_exit(); - kernel_param_unlock(THIS_MODULE); - return 0; + libcfs_unregister_panic_notifier(); + cfs_tracefile_exit(); + return 0; } int libcfs_debug_clear_buffer(void) { - cfs_trace_flush_pages(); - return 0; + cfs_trace_flush_pages(); + return 0; } -/* - * Debug markers, although printed by S_LNET - * should not be be marked as such. - */ +/* Debug markers, although printed by S_LNET + * should not be be marked as such. */ #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_UNDEFINED int libcfs_debug_mark_buffer(const char *text) { - CDEBUG(D_TRACE, "**************************************************\n"); - LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); - CDEBUG(D_TRACE, "**************************************************\n"); + CDEBUG(D_TRACE,"***************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE,"***************************************************\n"); - return 0; + return 0; } #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_LNET long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) { - libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", - rc, rc, rc); - return rc; + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; } EXPORT_SYMBOL(libcfs_log_return); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c index 8757ad1f5c1e8..209333edf6b5b 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -13,11 +13,16 @@ * General Public License version 2 for more details (a copy is included * in the LICENSE file that accompanied this code). 
* + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * * GPL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -30,193 +35,42 @@ #define DEBUG_SUBSYSTEM S_LNET -#include -#include -#include #include /** Global CPU partition table */ -struct cfs_cpt_table *cfs_cpt_table __read_mostly; +struct cfs_cpt_table *cfs_cpt_table __read_mostly = NULL; EXPORT_SYMBOL(cfs_cpt_table); -/** - * modparam for setting number of partitions - * - * 0 : estimate best value based on cores or NUMA nodes - * 1 : disable multiple partitions - * >1 : specify number of partitions - */ -static int cpu_npartitions; -module_param(cpu_npartitions, int, 0444); -MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); +#ifndef HAVE_LIBCFS_CPT -/** - * modparam for setting CPU partitions patterns: - * - * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, - * number in bracket is processor ID (core or HT) - * - * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket - * are NUMA node ID, number before bracket is CPU partition ID. - * - * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology - * - * NB: If user specified cpu_pattern, cpu_npartitions will be ignored - */ -static char *cpu_pattern = "N"; -module_param(cpu_pattern, charp, 0444); -MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); +#define CFS_CPU_VERSION_MAGIC 0xbabecafe + +#define CFS_CPT_DISTANCE 1 /* Arbitrary positive value */ -#ifdef CONFIG_SMP struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) { struct cfs_cpt_table *cptab; - int i; - LIBCFS_ALLOC(cptab, sizeof(*cptab)); - if (!cptab) + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); return NULL; - - cptab->ctb_nparts = ncpt; - - LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); - if (!cptab->ctb_cpumask) - goto failed_alloc_cpumask; - - LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); - if (!cptab->ctb_nodemask) - goto failed_alloc_nodemask; - - LIBCFS_ALLOC(cptab->ctb_cpu2cpt, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - if (!cptab->ctb_cpu2cpt) - goto failed_alloc_cpu2cpt; - - memset(cptab->ctb_cpu2cpt, -1, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - - LIBCFS_ALLOC(cptab->ctb_node2cpt, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - if (!cptab->ctb_node2cpt) - goto failed_alloc_node2cpt; - - memset(cptab->ctb_node2cpt, -1, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - - LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); - if (!cptab->ctb_parts) - goto failed_alloc_ctb_parts; - - memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0])); - - for (i = 0; i < ncpt; i++) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); - if (!part->cpt_cpumask) - goto failed_setting_ctb_parts; - - LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); - if (!part->cpt_nodemask) - goto failed_setting_ctb_parts; - - LIBCFS_ALLOC(part->cpt_distance, - cptab->ctb_nparts * sizeof(part->cpt_distance[0])); - if (!part->cpt_distance) - goto failed_setting_ctb_parts; - - memset(part->cpt_distance, -1, - cptab->ctb_nparts * 
sizeof(part->cpt_distance[0])); } - return cptab; - -failed_setting_ctb_parts: - while (i-- >= 0) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - if (part->cpt_nodemask) { - LIBCFS_FREE(part->cpt_nodemask, - sizeof(*part->cpt_nodemask)); - } - - if (part->cpt_cpumask) - LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); - - if (part->cpt_distance) { - LIBCFS_FREE(part->cpt_distance, - cptab->ctb_nparts * - sizeof(part->cpt_distance[0])); - } + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab != NULL) { + cptab->ctb_version = CFS_CPU_VERSION_MAGIC; + cpu_set(0, cptab->ctb_cpumask); + node_set(0, cptab->ctb_nodemask); + cptab->ctb_nparts = ncpt; } - if (cptab->ctb_parts) { - LIBCFS_FREE(cptab->ctb_parts, - cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); - } -failed_alloc_ctb_parts: - if (cptab->ctb_node2cpt) { - LIBCFS_FREE(cptab->ctb_node2cpt, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - } -failed_alloc_node2cpt: - if (cptab->ctb_cpu2cpt) { - LIBCFS_FREE(cptab->ctb_cpu2cpt, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - } -failed_alloc_cpu2cpt: - if (cptab->ctb_nodemask) - LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); -failed_alloc_nodemask: - if (cptab->ctb_cpumask) - LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); -failed_alloc_cpumask: - LIBCFS_FREE(cptab, sizeof(*cptab)); - return NULL; + return cptab; } EXPORT_SYMBOL(cfs_cpt_table_alloc); void cfs_cpt_table_free(struct cfs_cpt_table *cptab) { - int i; - - if (cptab->ctb_cpu2cpt) { - LIBCFS_FREE(cptab->ctb_cpu2cpt, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - } - - if (cptab->ctb_node2cpt) { - LIBCFS_FREE(cptab->ctb_node2cpt, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - } - - for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - if (part->cpt_nodemask) { - LIBCFS_FREE(part->cpt_nodemask, - sizeof(*part->cpt_nodemask)); - } - - if (part->cpt_cpumask) - LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); - - if (part->cpt_distance) { - LIBCFS_FREE(part->cpt_distance, - cptab->ctb_nparts * - sizeof(part->cpt_distance[0])); - } - } - - if (cptab->ctb_parts) { - LIBCFS_FREE(cptab->ctb_parts, - cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); - } - - if (cptab->ctb_nodemask) - LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); - if (cptab->ctb_cpumask) - LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); LIBCFS_FREE(cptab, sizeof(*cptab)); } @@ -224,346 +78,80 @@ EXPORT_SYMBOL(cfs_cpt_table_free); int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) { - char *tmp = buf; - int rc; - int i; - int j; - - for (i = 0; i < cptab->ctb_nparts; i++) { - if (len <= 0) - goto err; - - rc = snprintf(tmp, len, "%d\t:", i); - len -= rc; - - if (len <= 0) - goto err; - - tmp += rc; - for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { - rc = snprintf(tmp, len, " %d", j); - len -= rc; - if (len <= 0) - goto err; - tmp += rc; - } + int rc = 0; - *tmp = '\n'; - tmp++; - len--; - } + rc = snprintf(buf, len, "%d\t: %d\n", 0, 0); + len -= rc; + if (len <= 0) + return -EFBIG; - return tmp - buf; -err: - return -E2BIG; + return rc; } EXPORT_SYMBOL(cfs_cpt_table_print); int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) { - char *tmp = buf; - int rc; - int i; - int j; - - for (i = 0; i < cptab->ctb_nparts; i++) { - if (len <= 0) - goto err; - - rc = snprintf(tmp, len, "%d\t:", i); - len -= rc; + int rc = 0; - if (len <= 0) - goto err; - - tmp 
+= rc; - for (j = 0; j < cptab->ctb_nparts; j++) { - rc = snprintf(tmp, len, " %d:%d", j, - cptab->ctb_parts[i].cpt_distance[j]); - len -= rc; - if (len <= 0) - goto err; - tmp += rc; - } - - *tmp = '\n'; - tmp++; - len--; - } + rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, CFS_CPT_DISTANCE); + len -= rc; + if (len <= 0) + return -EFBIG; - return tmp - buf; -err: - return -E2BIG; + return rc; } EXPORT_SYMBOL(cfs_cpt_distance_print); int cfs_cpt_number(struct cfs_cpt_table *cptab) { - return cptab->ctb_nparts; + return 1; } EXPORT_SYMBOL(cfs_cpt_number); int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) { - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cpumask_weight(cptab->ctb_cpumask) : - cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); + return 1; } EXPORT_SYMBOL(cfs_cpt_weight); int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) { - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cpumask_any_and(cptab->ctb_cpumask, - cpu_online_mask) < nr_cpu_ids : - cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, - cpu_online_mask) < nr_cpu_ids; + return 1; } EXPORT_SYMBOL(cfs_cpt_online); cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) { - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; + return &cptab->ctb_mask; } EXPORT_SYMBOL(cfs_cpt_cpumask); nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) { - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; + return &cptab->ctb_nodemask; } EXPORT_SYMBOL(cfs_cpt_nodemask); -unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) { - LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); - LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); - - if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) - return cptab->ctb_distance; - - return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; + return CFS_CPT_DISTANCE; } EXPORT_SYMBOL(cfs_cpt_distance); -/* - * Calculate the maximum NUMA distance between all nodes in the - * from_mask and all nodes in the to_mask. 
- */ -static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, - nodemask_t *to_mask) -{ - unsigned int maximum; - unsigned int distance; - int from; - int to; - - maximum = 0; - for_each_node_mask(from, *from_mask) { - for_each_node_mask(to, *to_mask) { - distance = node_distance(from, to); - if (maximum < distance) - maximum = distance; - } - } - return maximum; -} - -static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - cptab->ctb_cpu2cpt[cpu] = cpt; - - cpumask_set_cpu(cpu, cptab->ctb_cpumask); - cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); -} - -static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); - cpumask_clear_cpu(cpu, cptab->ctb_cpumask); - - cptab->ctb_cpu2cpt[cpu] = -1; -} - -static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - struct cfs_cpu_partition *part; - - if (!node_isset(node, *cptab->ctb_nodemask)) { - unsigned int dist; - - /* first time node is added to the CPT table */ - node_set(node, *cptab->ctb_nodemask); - cptab->ctb_node2cpt[node] = cpt; - - dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask, - cptab->ctb_nodemask); - cptab->ctb_distance = dist; - } - - part = &cptab->ctb_parts[cpt]; - if (!node_isset(node, *part->cpt_nodemask)) { - int cpt2; - - /* first time node is added to this CPT */ - node_set(node, *part->cpt_nodemask); - for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { - struct cfs_cpu_partition *part2; - unsigned int dist; - - part2 = &cptab->ctb_parts[cpt2]; - dist = cfs_cpt_distance_calculate(part->cpt_nodemask, - part2->cpt_nodemask); - part->cpt_distance[cpt2] = dist; - dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, - part->cpt_nodemask); - part2->cpt_distance[cpt] = dist; - } - } -} - -static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; - int cpu; - - for_each_cpu(cpu, part->cpt_cpumask) { - /* this CPT has other CPU belonging to this node? */ - if (cpu_to_node(cpu) == node) - break; - } - - if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { - int cpt2; - - /* No more CPUs in the node for this CPT. */ - node_clear(node, *part->cpt_nodemask); - for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { - struct cfs_cpu_partition *part2; - unsigned int dist; - - part2 = &cptab->ctb_parts[cpt2]; - if (node_isset(node, *part2->cpt_nodemask)) - cptab->ctb_node2cpt[node] = cpt2; - - dist = cfs_cpt_distance_calculate(part->cpt_nodemask, - part2->cpt_nodemask); - part->cpt_distance[cpt2] = dist; - dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, - part->cpt_nodemask); - part2->cpt_distance[cpt] = dist; - } - } - - for_each_cpu(cpu, cptab->ctb_cpumask) { - /* this CPT-table has other CPUs belonging to this node? */ - if (cpu_to_node(cpu) == node) - break; - } - - if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { - /* No more CPUs in the table for this node. 
*/ - node_clear(node, *cptab->ctb_nodemask); - cptab->ctb_node2cpt[node] = -1; - cptab->ctb_distance = - cfs_cpt_distance_calculate(cptab->ctb_nodemask, - cptab->ctb_nodemask); - } -} - int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { - LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); - - if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { - CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); - return 0; - } - - if (cptab->ctb_cpu2cpt[cpu] != -1) { - CDEBUG(D_INFO, "CPU %d is already in partition %d\n", - cpu, cptab->ctb_cpu2cpt[cpu]); - return 0; - } - - if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { - CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); - return 0; - } - - if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { - CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", - cpu, cptab->ctb_cpu2cpt[cpu]); - return 0; - } - - cfs_cpt_add_cpu(cptab, cpt, cpu); - cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); - return 1; } EXPORT_SYMBOL(cfs_cpt_set_cpu); void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - if (cpu < 0 || cpu >= nr_cpu_ids) { - CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); - return; - } - - if (cpt == CFS_CPT_ANY) { - /* caller doesn't know the partition ID */ - cpt = cptab->ctb_cpu2cpt[cpu]; - if (cpt < 0) { /* not set in this CPT-table */ - CDEBUG(D_INFO, - "Try to unset cpu %d which is not in CPT-table %p\n", - cpt, cptab); - return; - } - - } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { - CDEBUG(D_INFO, - "CPU %d is not in CPU partition %d\n", cpu, cpt); - return; - } - - LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); - LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); - - cfs_cpt_del_cpu(cptab, cpt, cpu); - cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); } EXPORT_SYMBOL(cfs_cpt_unset_cpu); int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask) { - int cpu; - - if (!cpumask_weight(mask) || - cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { - CDEBUG(D_INFO, - "No online CPU is found in the CPU mask for CPU partition %d\n", - cpt); - return 0; - } - - for_each_cpu(cpu, mask) { - cfs_cpt_add_cpu(cptab, cpt, cpu); - cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); - } - return 1; } EXPORT_SYMBOL(cfs_cpt_set_cpumask); @@ -571,65 +159,23 @@ EXPORT_SYMBOL(cfs_cpt_set_cpumask); void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask) { - int cpu; - - for_each_cpu(cpu, mask) { - cfs_cpt_del_cpu(cptab, cpt, cpu); - cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); - } } EXPORT_SYMBOL(cfs_cpt_unset_cpumask); int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) { - const cpumask_t *mask; - int cpu; - - if (node < 0 || node >= nr_node_ids) { - CDEBUG(D_INFO, - "Invalid NUMA id %d for CPU partition %d\n", node, cpt); - return 0; - } - - mask = cpumask_of_node(node); - - for_each_cpu(cpu, mask) - cfs_cpt_add_cpu(cptab, cpt, cpu); - - cfs_cpt_add_node(cptab, cpt, node); - return 1; } EXPORT_SYMBOL(cfs_cpt_set_node); void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) { - const cpumask_t *mask; - int cpu; - - if (node < 0 || node >= nr_node_ids) { - CDEBUG(D_INFO, - "Invalid NUMA id %d for CPU partition %d\n", node, cpt); - return; - } - - mask = cpumask_of_node(node); - - for_each_cpu(cpu, mask) - cfs_cpt_del_cpu(cptab, cpt, cpu); - - cfs_cpt_del_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_node); int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { - int node; - - for_each_node_mask(node, *mask) - cfs_cpt_set_node(cptab, cpt, node); - return 1; } EXPORT_SYMBOL(cfs_cpt_set_nodemask); @@ -637,674 +183,42 @@ EXPORT_SYMBOL(cfs_cpt_set_nodemask); void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { - int node; - - for_each_node_mask(node, *mask) - cfs_cpt_unset_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_nodemask); int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) { - nodemask_t *mask; - int weight; - unsigned int rotor; - int node = 0; - - /* convert CPU partition ID to HW node id */ - - if (cpt < 0 || cpt >= cptab->ctb_nparts) { - mask = cptab->ctb_nodemask; - rotor = cptab->ctb_spread_rotor++; - } else { - mask = cptab->ctb_parts[cpt].cpt_nodemask; - rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; - node = cptab->ctb_parts[cpt].cpt_node; - } - - weight = nodes_weight(*mask); - if (weight > 0) { - rotor %= weight; - - for_each_node_mask(node, *mask) { - if (!rotor--) - return node; - } - } - - return node; + return 0; } EXPORT_SYMBOL(cfs_cpt_spread_node); int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) { - int cpu; - int cpt; - - preempt_disable(); - cpu = smp_processor_id(); - cpt = cptab->ctb_cpu2cpt[cpu]; - - if (cpt < 0 && remap) { - /* don't return negative value for safety of upper layer, - * instead we shadow the unknown cpu to a valid partition ID - */ - cpt = cpu % cptab->ctb_nparts; - } - preempt_enable(); - return cpt; + return 0; } EXPORT_SYMBOL(cfs_cpt_current); int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) { - LASSERT(cpu >= 0 && cpu < nr_cpu_ids); - - return cptab->ctb_cpu2cpt[cpu]; + return 0; } EXPORT_SYMBOL(cfs_cpt_of_cpu); int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) { - if (node < 0 || node > nr_node_ids) - return CFS_CPT_ANY; - - return cptab->ctb_node2cpt[node]; + return 0; } EXPORT_SYMBOL(cfs_cpt_of_node); int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) { - nodemask_t *nodemask; - cpumask_t *cpumask; - int cpu; - int rc; - - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - if (cpt == CFS_CPT_ANY) { - cpumask = cptab->ctb_cpumask; - nodemask = cptab->ctb_nodemask; - } else { - cpumask = cptab->ctb_parts[cpt].cpt_cpumask; - nodemask = cptab->ctb_parts[cpt].cpt_nodemask; - } - - if (!cpumask_intersects(cpumask, cpu_online_mask)) { - CDEBUG(D_INFO, - "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", - cpt); - return -ENODEV; - } - - for_each_online_cpu(cpu) { - if (cpumask_test_cpu(cpu, cpumask)) - continue; - - rc = set_cpus_allowed_ptr(current, cpumask); - set_mems_allowed(*nodemask); - if (!rc) - schedule(); /* switch to allowed CPU */ - - return rc; - } - - /* don't need to set affinity because all online CPUs are covered */ return 0; } EXPORT_SYMBOL(cfs_cpt_bind); -/** - * Choose max to \a number CPUs from \a node and set them in \a cpt. - * We always prefer to choose CPU in the same core/socket. 
- */ -static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, - cpumask_t *node_mask, int number) -{ - cpumask_t *socket_mask = NULL; - cpumask_t *core_mask = NULL; - int rc = 0; - int cpu; - int i; - - LASSERT(number > 0); - - if (number >= cpumask_weight(node_mask)) { - while (!cpumask_empty(node_mask)) { - cpu = cpumask_first(node_mask); - cpumask_clear_cpu(cpu, node_mask); - - if (!cpu_online(cpu)) - continue; - - rc = cfs_cpt_set_cpu(cptab, cpt, cpu); - if (!rc) - return -EINVAL; - } - return 0; - } - - /* allocate scratch buffer */ - LIBCFS_ALLOC(socket_mask, cpumask_size()); - LIBCFS_ALLOC(core_mask, cpumask_size()); - if (!socket_mask || !core_mask) { - rc = -ENOMEM; - goto out; - } - - while (!cpumask_empty(node_mask)) { - cpu = cpumask_first(node_mask); - - /* get cpumask for cores in the same socket */ - cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); - while (!cpumask_empty(socket_mask)) { - /* get cpumask for hts in the same core */ - cpumask_and(core_mask, topology_sibling_cpumask(cpu), - node_mask); - - for_each_cpu(i, core_mask) { - cpumask_clear_cpu(i, socket_mask); - cpumask_clear_cpu(i, node_mask); - - if (!cpu_online(i)) - continue; - - rc = cfs_cpt_set_cpu(cptab, cpt, i); - if (!rc) { - rc = -EINVAL; - goto out; - } - - if (!--number) - goto out; - } - cpu = cpumask_first(socket_mask); - } - } - -out: - if (core_mask) - LIBCFS_FREE(core_mask, cpumask_size()); - if (socket_mask) - LIBCFS_FREE(socket_mask, cpumask_size()); - return rc; -} - -#define CPT_WEIGHT_MIN 4 - -static int cfs_cpt_num_estimate(void) -{ - int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); - int ncpu = num_online_cpus(); - int ncpt = 1; - - if (ncpu > CPT_WEIGHT_MIN) - for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++) - ; /* nothing */ - -#if (BITS_PER_LONG == 32) - /* config many CPU partitions on 32-bit system could consume - * too much memory - */ - ncpt = min(2, ncpt); -#endif - while (ncpu % ncpt) - ncpt--; /* worst case is 1 */ - - return ncpt; -} - -static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) -{ - struct cfs_cpt_table *cptab = NULL; - cpumask_t *node_mask = NULL; - int cpt = 0; - int node; - int num; - int rem; - int rc = 0; - - num = cfs_cpt_num_estimate(); - if (ncpt <= 0) - ncpt = num; - - if (ncpt > num_online_cpus()) { - rc = -EINVAL; - CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n", - ncpt, num_online_cpus(), rc); - goto failed; - } - - if (ncpt > 4 * num) { - CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", - ncpt, num); - } - - cptab = cfs_cpt_table_alloc(ncpt); - if (!cptab) { - CERROR("Failed to allocate CPU map(%d)\n", ncpt); - rc = -ENOMEM; - goto failed; - } - - LIBCFS_ALLOC(node_mask, cpumask_size()); - if (!node_mask) { - CERROR("Failed to allocate scratch cpumask\n"); - rc = -ENOMEM; - goto failed; - } - - num = num_online_cpus() / ncpt; - rem = num_online_cpus() % ncpt; - for_each_online_node(node) { - cpumask_copy(node_mask, cpumask_of_node(node)); - - while (cpt < ncpt && !cpumask_empty(node_mask)) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; - int ncpu = cpumask_weight(part->cpt_cpumask); - - rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, - (rem > 0) + num - ncpu); - if (rc < 0) { - rc = -EINVAL; - goto failed_mask; - } - - ncpu = cpumask_weight(part->cpt_cpumask); - if (ncpu == num + !!(rem > 0)) { - cpt++; - rem--; - } - } - } - - LIBCFS_FREE(node_mask, cpumask_size()); - 
- return cptab; - -failed_mask: - if (node_mask) - LIBCFS_FREE(node_mask, cpumask_size()); -failed: - CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", - rc, ncpt, num_online_nodes(), num_online_cpus()); - - if (cptab) - cfs_cpt_table_free(cptab); - - return ERR_PTR(rc); -} - -static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) -{ - struct cfs_cpt_table *cptab; - char *pattern_dup; - char *bracket; - char *str; - int node = 0; - int ncpt = 0; - int cpt = 0; - int high; - int rc; - int c; - int i; - - pattern_dup = kstrdup(pattern, GFP_KERNEL); - if (!pattern_dup) { - CERROR("Failed to duplicate pattern '%s'\n", pattern); - return ERR_PTR(-ENOMEM); - } - - str = cfs_trimwhite(pattern_dup); - if (*str == 'n' || *str == 'N') { - str++; /* skip 'N' char */ - node = 1; /* NUMA pattern */ - if (*str == '\0') { - node = -1; - for_each_online_node(i) { - if (!cpumask_empty(cpumask_of_node(i))) - ncpt++; - } - if (ncpt == 1) { /* single NUMA node */ - kfree(pattern_dup); - return cfs_cpt_table_create(cpu_npartitions); - } - } - } - - if (!ncpt) { /* scanning bracket which is mark of partition */ - bracket = str; - while ((bracket = strchr(bracket, '['))) { - bracket++; - ncpt++; - } - } - - if (!ncpt || - (node && ncpt > num_online_nodes()) || - (!node && ncpt > num_online_cpus())) { - CERROR("Invalid pattern '%s', or too many partitions %d\n", - pattern_dup, ncpt); - rc = -EINVAL; - goto err_free_str; - } - - cptab = cfs_cpt_table_alloc(ncpt); - if (!cptab) { - CERROR("Failed to allocate CPU partition table\n"); - rc = -ENOMEM; - goto err_free_str; - } - - if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ - for_each_online_node(i) { - if (cpumask_empty(cpumask_of_node(i))) - continue; - - rc = cfs_cpt_set_node(cptab, cpt++, i); - if (!rc) { - rc = -EINVAL; - goto err_free_table; - } - } - kfree(pattern_dup); - return cptab; - } - - high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; - - for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) { - struct cfs_range_expr *range; - struct cfs_expr_list *el; - int n; - - bracket = strchr(str, '['); - if (!bracket) { - if (*str) { - CERROR("Invalid pattern '%s'\n", str); - rc = -EINVAL; - goto err_free_table; - } else if (c != ncpt) { - CERROR("Expect %d partitions but found %d\n", - ncpt, c); - rc = -EINVAL; - goto err_free_table; - } - break; - } - - if (sscanf(str, "%d%n", &cpt, &n) < 1) { - CERROR("Invalid CPU pattern '%s'\n", str); - rc = -EINVAL; - goto err_free_table; - } - - if (cpt < 0 || cpt >= ncpt) { - CERROR("Invalid partition id %d, total partitions %d\n", - cpt, ncpt); - rc = -EINVAL; - goto err_free_table; - } - - if (cfs_cpt_weight(cptab, cpt)) { - CERROR("Partition %d has already been set.\n", cpt); - rc = -EPERM; - goto err_free_table; - } - - str = cfs_trimwhite(str + n); - if (str != bracket) { - CERROR("Invalid pattern '%s'\n", str); - rc = -EINVAL; - goto err_free_table; - } - - bracket = strchr(str, ']'); - if (!bracket) { - CERROR("Missing right bracket for partition %d in '%s'\n", - cpt, str); - rc = -EINVAL; - goto err_free_table; - } - - rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, - &el); - if (rc) { - CERROR("Can't parse number range in '%s'\n", str); - rc = -ERANGE; - goto err_free_table; - } - - list_for_each_entry(range, &el->el_exprs, re_link) { - for (i = range->re_lo; i <= range->re_hi; i++) { - if ((i - range->re_lo) % range->re_stride) - continue; - - rc = node ? 
cfs_cpt_set_node(cptab, cpt, i) - : cfs_cpt_set_cpu(cptab, cpt, i); - if (!rc) { - cfs_expr_list_free(el); - rc = -EINVAL; - goto err_free_table; - } - } - } - - cfs_expr_list_free(el); - - if (!cfs_cpt_online(cptab, cpt)) { - CERROR("No online CPU is found on partition %d\n", cpt); - rc = -ENODEV; - goto err_free_table; - } - - str = cfs_trimwhite(bracket + 1); - } - - kfree(pattern_dup); - return cptab; - -err_free_table: - cfs_cpt_table_free(cptab); -err_free_str: - kfree(pattern_dup); - return ERR_PTR(rc); -} - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE -static enum cpuhp_state lustre_cpu_online; - -static int cfs_cpu_online(unsigned int cpu) -{ - return 0; -} -#endif - -static int cfs_cpu_dead(unsigned int cpu) -{ - bool warn; - - /* if all HTs in a core are offline, it may break affinity */ - warn = cpumask_any_and(topology_sibling_cpumask(cpu), - cpu_online_mask) >= nr_cpu_ids; - CDEBUG(warn ? D_WARNING : D_INFO, - "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", - cpu); - return 0; -} - -#ifndef HAVE_HOTPLUG_STATE_MACHINE -static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - default: - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { - CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", - cpu, action); - break; - } - - cfs_cpu_dead(cpu); - } - - return NOTIFY_OK; -} - -static struct notifier_block cfs_cpu_notifier = { - .notifier_call = cfs_cpu_notify, - .priority = 0 -}; -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ - -void cfs_cpu_fini(void) -{ - if (!IS_ERR_OR_NULL(cfs_cpt_table)) - cfs_cpt_table_free(cfs_cpt_table); - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE - if (lustre_cpu_online > 0) - cpuhp_remove_state_nocalls(lustre_cpu_online); - cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); -#else - unregister_hotcpu_notifier(&cfs_cpu_notifier); -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ -} - -int cfs_cpu_init(void) -{ - int ret; - - LASSERT(!cfs_cpt_table); - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE - ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, - "fs/lustre/cfe:dead", NULL, - cfs_cpu_dead); - if (ret < 0) - goto failed_cpu_dead; - - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "fs/lustre/cfe:online", - cfs_cpu_online, NULL); - if (ret < 0) - goto failed_cpu_online; - - lustre_cpu_online = ret; -#else - register_hotcpu_notifier(&cfs_cpu_notifier); -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ - - get_online_cpus(); - if (*cpu_pattern) { - cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); - if (IS_ERR(cfs_cpt_table)) { - CERROR("Failed to create cptab from pattern '%s'\n", - cpu_pattern); - ret = PTR_ERR(cfs_cpt_table); - goto failed_alloc_table; - } - - } else { - cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); - if (IS_ERR(cfs_cpt_table)) { - CERROR("Failed to create cptab with npartitions %d\n", - cpu_npartitions); - ret = PTR_ERR(cfs_cpt_table); - goto failed_alloc_table; - } - } - - put_online_cpus(); - - LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", - num_online_nodes(), num_online_cpus(), - cfs_cpt_number(cfs_cpt_table)); - return 0; - -failed_alloc_table: - put_online_cpus(); - - if (!IS_ERR_OR_NULL(cfs_cpt_table)) - 
cfs_cpt_table_free(cfs_cpt_table); - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE - if (lustre_cpu_online > 0) - cpuhp_remove_state_nocalls(lustre_cpu_online); -failed_cpu_online: - cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); -failed_cpu_dead: -#else - unregister_hotcpu_notifier(&cfs_cpu_notifier); -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ - return ret; -} - -#else /* ! CONFIG_SMP */ - -struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) -{ - struct cfs_cpt_table *cptab; - - if (ncpt != 1) { - CERROR("Can't support cpu partition number %d\n", ncpt); - return NULL; - } - - LIBCFS_ALLOC(cptab, sizeof(*cptab)); - if (!cptab) - return NULL; - - cpumask_set_cpu(0, cptab->ctb_cpumask); - node_set(0, cptab->ctb_nodemask); - - return cptab; -} -EXPORT_SYMBOL(cfs_cpt_table_alloc); - -int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) -{ - int rc; - - rc = snprintf(buf, len, "0\t: 0\n"); - len -= rc; - if (len <= 0) - return -EFBIG; - - return rc; -} -EXPORT_SYMBOL(cfs_cpt_table_print); - -int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) -{ - int rc; - - rc = snprintf(buf, len, "0\t: 0:1\n"); - len -= rc; - if (len <= 0) - return -EFBIG; - - return rc; -} -EXPORT_SYMBOL(cfs_cpt_distance_print); - void cfs_cpu_fini(void) { - if (cfs_cpt_table) { + if (cfs_cpt_table != NULL) { cfs_cpt_table_free(cfs_cpt_table); cfs_cpt_table = NULL; } @@ -1314,7 +228,7 @@ int cfs_cpu_init(void) { cfs_cpt_table = cfs_cpt_table_alloc(1); - return cfs_cpt_table ? 0 : -1; + return cfs_cpt_table != NULL ? 0 : -1; } -#endif /* !CONFIG_SMP */ +#endif /* HAVE_LIBCFS_CPT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c index 5f85219101eb0..2f401e74a7dd7 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -33,6 +33,7 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include struct cfs_var_array { @@ -171,12 +172,9 @@ cfs_array_alloc(int count, unsigned int size) } EXPORT_SYMBOL(cfs_array_alloc); -#ifdef HAVE_LIBCFS_VFREE_ATOMIC -#include /* * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with - * minimum changes needed to work on some older kernels too. - * For RHEL6, just use vfree() directly since it is missing too much code. + * minimum changes needed to work on older kernels too. 
*/ #ifndef raw_cpu_ptr @@ -185,12 +183,12 @@ EXPORT_SYMBOL(cfs_array_alloc); #ifndef llist_for_each_safe #define llist_for_each_safe(pos, n, node) \ - for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) #endif struct vfree_deferred { - struct llist_head list; - struct work_struct wq; + struct llist_head list; + struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); @@ -232,4 +230,3 @@ void __exit exit_libcfs_vfree_atomic(void) { flush_scheduled_work(); } -#endif /* HAVE_LIBCFS_VFREE_ATOMIC */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c new file mode 100644 index 0000000000000..9786288cbad50 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c @@ -0,0 +1,478 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_UNDEFINED + +#include +#include + +/** + * This API based on Linux kernel padada API which is used to perform + * encryption and decryption on large numbers of packets without + * reordering those packets. + * + * It was adopted for general use in Lustre for parallelization of + * various functionality. + * + * The first step in using it is to set up a cfs_ptask structure to + * control of how this task are to be run: + * + * #include + * + * int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, + * void *cbdata, unsigned int flags, int cpu); + * + * The cbfunc function with cbdata argument will be called in the process + * of getting the task done. The cpu specifies which CPU will be used for + * the final callback when the task is done. + * + * The submission of task is done with: + * + * int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine); + * + * The task is submitted to the engine for execution. + * + * In order to wait for result of task execution you should call: + * + * int cfs_ptask_wait_for(struct cfs_ptask *ptask); + * + * The tasks with flag PTF_ORDERED are executed in parallel but complete + * into submission order. So, waiting for last ordered task you can be sure + * that all previous tasks were done before this task complete. + */ + +#ifndef HAVE_REINIT_COMPLETION +/** + * reinit_completion - reinitialize a completion structure + * @x: pointer to completion structure that is to be reinitialized + * + * This inline function should be used to reinitialize a completion + * structure so it can be reused. This is especially important after + * complete_all() is used. 
+ */ +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} +#endif + +#ifndef HAVE_CPUMASK_PRINT_TO_PAGEBUF +static inline void cpumap_print_to_pagebuf(bool unused, char *buf, + const struct cpumask *mask) +{ + cpulist_scnprintf(buf, PAGE_SIZE, mask); +} +#endif + +#ifdef CONFIG_PADATA +static void cfs_ptask_complete(struct padata_priv *padata) +{ + struct cfs_ptask *ptask = cfs_padata2ptask(padata); + + if (cfs_ptask_need_complete(ptask)) { + if (cfs_ptask_is_ordered(ptask)) + complete(&ptask->pt_completion); + } else if (cfs_ptask_is_autofree(ptask)) { + kfree(ptask); + } +} + +static void cfs_ptask_execute(struct padata_priv *padata) +{ + struct cfs_ptask *ptask = cfs_padata2ptask(padata); + bool bh_enabled = false; + + if (!cfs_ptask_is_atomic(ptask)) { + local_bh_enable(); + bh_enabled = true; + } + + if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { + kthread_use_mm(ptask->pt_mm); + } + + if (ptask->pt_cbfunc != NULL) + ptask->pt_result = ptask->pt_cbfunc(ptask); + else + ptask->pt_result = -ENOSYS; + + if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { + kthread_unuse_mm(ptask->pt_mm); + mmput(ptask->pt_mm); + ptask->pt_mm = NULL; + } + + if (cfs_ptask_need_complete(ptask) && !cfs_ptask_is_ordered(ptask)) + complete(&ptask->pt_completion); + + if (bh_enabled) + local_bh_disable(); + + padata_do_serial(padata); +} + +static int cfs_do_parallel(struct cfs_ptask_engine *engine, + struct padata_priv *padata) +{ + struct cfs_ptask *ptask = cfs_padata2ptask(padata); + int rc; + + if (cfs_ptask_need_complete(ptask)) + reinit_completion(&ptask->pt_completion); + + if (cfs_ptask_use_user_mm(ptask)) { + ptask->pt_mm = get_task_mm(current); + } + ptask->pt_result = -EINPROGRESS; + +retry: + rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); + if (rc == -EBUSY && cfs_ptask_is_retry(ptask)) { + /* too many tasks already in queue */ + schedule_timeout_uninterruptible(1); + goto retry; + } + + if (rc) { + if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { + mmput(ptask->pt_mm); + ptask->pt_mm = NULL; + } + ptask->pt_result = rc; + } + + return rc; +} + +/** + * This function submit initialized task for async execution + * in engine with specified id. + */ +int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) +{ + struct padata_priv *padata = cfs_ptask2padata(ptask); + + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + memset(padata, 0, sizeof(*padata)); + + padata->parallel = cfs_ptask_execute; + padata->serial = cfs_ptask_complete; + + return cfs_do_parallel(engine, padata); +} + +#else /* !CONFIG_PADATA */ + +/** + * If CONFIG_PADATA is not defined this function just execute + * the initialized task in current thread. (emulate async execution) + */ +int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) +{ + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + if (ptask->pt_cbfunc != NULL) + ptask->pt_result = ptask->pt_cbfunc(ptask); + else + ptask->pt_result = -ENOSYS; + + if (cfs_ptask_need_complete(ptask)) + complete(&ptask->pt_completion); + else if (cfs_ptask_is_autofree(ptask)) + kfree(ptask); + + return 0; +} +#endif /* CONFIG_PADATA */ + +EXPORT_SYMBOL(cfs_ptask_submit); + +/** + * This function waits when task complete async execution. + * The tasks with flag PTF_ORDERED are executed in parallel but completes + * into submission order. So, waiting for last ordered task you can be sure + * that all previous tasks were done before this task complete. 
+ */ +int cfs_ptask_wait_for(struct cfs_ptask *ptask) +{ + if (!cfs_ptask_need_complete(ptask)) + return -EINVAL; + + wait_for_completion(&ptask->pt_completion); + + return 0; +} +EXPORT_SYMBOL(cfs_ptask_wait_for); + +/** + * This function initialize internal members of task and prepare it for + * async execution. + */ +int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, void *cbdata, + unsigned int flags, int cpu) +{ + memset(ptask, 0, sizeof(*ptask)); + + ptask->pt_flags = flags; + ptask->pt_cbcpu = cpu; + ptask->pt_mm = NULL; /* will be set in cfs_do_parallel() */ + ptask->pt_cbfunc = cbfunc; + ptask->pt_cbdata = cbdata; + ptask->pt_result = -EAGAIN; + + if (cfs_ptask_need_complete(ptask)) { + if (cfs_ptask_is_autofree(ptask)) + return -EINVAL; + + init_completion(&ptask->pt_completion); + } + + if (cfs_ptask_is_atomic(ptask) && cfs_ptask_use_user_mm(ptask)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(cfs_ptask_init); + +/** + * This function set the mask of allowed CPUs for parallel execution + * for engine with specified id. + */ +int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *engine, + const struct cpumask *cpumask) +{ + int rc = 0; + +#ifdef CONFIG_PADATA + cpumask_var_t serial_mask; + cpumask_var_t parallel_mask; + + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + if (!alloc_cpumask_var(&serial_mask, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&parallel_mask, GFP_KERNEL)) { + free_cpumask_var(serial_mask); + return -ENOMEM; + } + + cpumask_copy(parallel_mask, cpumask); + cpumask_copy(serial_mask, cpu_online_mask); + + rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_PARALLEL, + parallel_mask); + free_cpumask_var(parallel_mask); + if (rc) + goto out_failed_mask; + + rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_SERIAL, + serial_mask); +out_failed_mask: + free_cpumask_var(serial_mask); +#endif /* CONFIG_PADATA */ + + return rc; +} +EXPORT_SYMBOL(cfs_ptengine_set_cpumask); + +/** + * This function returns the count of allowed CPUs for parallel execution + * for engine with specified id. 
+ */ +int cfs_ptengine_weight(struct cfs_ptask_engine *engine) +{ + if (IS_ERR_OR_NULL(engine)) + return -EINVAL; + + return engine->pte_weight; +} +EXPORT_SYMBOL(cfs_ptengine_weight); + +#ifdef CONFIG_PADATA +static int cfs_ptask_cpumask_change_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct padata_cpumask *padata_cpumask = data; + struct cfs_ptask_engine *engine; + + engine = container_of(self, struct cfs_ptask_engine, pte_notifier); + + if (val & PADATA_CPU_PARALLEL) + engine->pte_weight = cpumask_weight(padata_cpumask->pcpu); + + return 0; +} + +static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, + const char *name, + const struct cpumask *cpumask) +{ + cpumask_var_t all_mask; + cpumask_var_t par_mask; + unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE; + int rc; + + get_online_cpus(); + + engine->pte_wq = alloc_workqueue(name, wq_flags, 1); + if (engine->pte_wq == NULL) + GOTO(err, rc = -ENOMEM); + + if (!alloc_cpumask_var(&all_mask, GFP_KERNEL)) + GOTO(err_destroy_workqueue, rc = -ENOMEM); + + if (!alloc_cpumask_var(&par_mask, GFP_KERNEL)) + GOTO(err_free_all_mask, rc = -ENOMEM); + + cpumask_copy(par_mask, cpumask); + if (cpumask_empty(par_mask) || + cpumask_equal(par_mask, cpu_online_mask)) { + cpumask_copy(all_mask, cpu_online_mask); + cpumask_clear(par_mask); + while (!cpumask_empty(all_mask)) { + int cpu = cpumask_first(all_mask); + + cpumask_set_cpu(cpu, par_mask); + cpumask_andnot(all_mask, all_mask, + topology_sibling_cpumask(cpu)); + } + } + + cpumask_copy(all_mask, cpu_online_mask); + + { + char *pa_mask_buff, *cb_mask_buff; + + pa_mask_buff = (char *)__get_free_page(GFP_KERNEL); + if (pa_mask_buff == NULL) + GOTO(err_free_par_mask, rc = -ENOMEM); + + cb_mask_buff = (char *)__get_free_page(GFP_KERNEL); + if (cb_mask_buff == NULL) { + free_page((unsigned long)pa_mask_buff); + GOTO(err_free_par_mask, rc = -ENOMEM); + } + + cpumap_print_to_pagebuf(true, pa_mask_buff, par_mask); + pa_mask_buff[PAGE_SIZE - 1] = '\0'; + cpumap_print_to_pagebuf(true, cb_mask_buff, all_mask); + cb_mask_buff[PAGE_SIZE - 1] = '\0'; + + CDEBUG(D_INFO, "%s weight=%u plist='%s' cblist='%s'\n", + name, cpumask_weight(par_mask), + pa_mask_buff, cb_mask_buff); + + free_page((unsigned long)cb_mask_buff); + free_page((unsigned long)pa_mask_buff); + } + + engine->pte_weight = cpumask_weight(par_mask); + engine->pte_pinst = padata_alloc_possible(engine->pte_wq); + if (engine->pte_pinst == NULL) + GOTO(err_free_par_mask, rc = -ENOMEM); + + engine->pte_notifier.notifier_call = cfs_ptask_cpumask_change_notify; + rc = padata_register_cpumask_notifier(engine->pte_pinst, + &engine->pte_notifier); + if (rc) + GOTO(err_free_padata, rc); + + rc = cfs_ptengine_set_cpumask(engine, par_mask); + if (rc) + GOTO(err_unregister, rc); + + rc = padata_start(engine->pte_pinst); + if (rc) + GOTO(err_unregister, rc); + + free_cpumask_var(par_mask); + free_cpumask_var(all_mask); + + put_online_cpus(); + return 0; + +err_unregister: + padata_unregister_cpumask_notifier(engine->pte_pinst, + &engine->pte_notifier); +err_free_padata: + padata_free(engine->pte_pinst); +err_free_par_mask: + free_cpumask_var(par_mask); +err_free_all_mask: + free_cpumask_var(all_mask); +err_destroy_workqueue: + destroy_workqueue(engine->pte_wq); +err: + put_online_cpus(); + return rc; +} + +static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) +{ + padata_stop(engine->pte_pinst); + padata_unregister_cpumask_notifier(engine->pte_pinst, + &engine->pte_notifier); + 
+	padata_free(engine->pte_pinst);
+	destroy_workqueue(engine->pte_wq);
+}
+
+#else /* !CONFIG_PADATA */
+
+static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine,
+				    const char *name,
+				    const struct cpumask *cpumask)
+{
+	engine->pte_weight = 1;
+
+	return 0;
+}
+
+static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine)
+{
+}
+#endif /* CONFIG_PADATA */
+
+struct cfs_ptask_engine *cfs_ptengine_init(const char *name,
+					   const struct cpumask *cpumask)
+{
+	struct cfs_ptask_engine *engine;
+	int rc;
+
+	engine = kzalloc(sizeof(*engine), GFP_KERNEL);
+	if (engine == NULL)
+		GOTO(err, rc = -ENOMEM);
+
+	rc = cfs_ptengine_padata_init(engine, name, cpumask);
+	if (rc)
+		GOTO(err_free_engine, rc);
+
+	return engine;
+
+err_free_engine:
+	kfree(engine);
+err:
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(cfs_ptengine_init);
+
+void cfs_ptengine_fini(struct cfs_ptask_engine *engine)
+{
+	if (IS_ERR_OR_NULL(engine))
+		return;
+
+	cfs_ptengine_padata_fini(engine);
+	kfree(engine);
+}
+EXPORT_SYMBOL(cfs_ptengine_fini);
diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c
index b460df3c4d9bc..04e1dd56dd430 100644
--- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c
+++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2017, Intel Corporation.
+ * Copyright (c) 2012, 2015, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -36,9 +36,7 @@
  * Author: Nathan Rutman
  */
 
-#include
 #include
-#include
 
 char *cfs_strrstr(const char *haystack, const char *needle)
 {
@@ -254,47 +252,17 @@ int cfs_str2num_check(char *str, int nob, unsigned *num,
 		      unsigned min, unsigned max)
 {
-	bool all_numbers = true;
-	char *endp, cache;
-	int len;
-	int rc;
-
-	endp = strim(str);
-	/**
-	 * kstrouint can only handle strings composed
-	 * of only numbers. We need to scan the string
-	 * passed in for the first non-digit character
-	 * and end the string at that location. If we
-	 * don't find any non-digit character we still
-	 * need to place a '\0' at position len since
-	 * we are not interested in the rest of the
-	 * string which is longer than len in size.
-	 * After we are done the character at the
-	 * position we placed '\0' must be restored.
- */ - len = min((int)strlen(endp), nob); - for (; endp < str + len; endp++) { - if (!isxdigit(*endp) && *endp != '-' && - *endp != '+') { - all_numbers = false; - break; - } - } - - /* Eat trailing space */ - if (!all_numbers && isspace(*endp)) { - all_numbers = true; - endp--; - } - - cache = *endp; - *endp = '\0'; + char *endp; - rc = kstrtouint(str, 0, num); - *endp = cache; - if (rc || !all_numbers) + *num = simple_strtoul(str, &endp, 0); + if (endp == str) return 0; + for (; endp < str + nob; endp++) { + if (!isspace(*endp)) + return 0; + } + return (*num >= min && *num <= max); } EXPORT_SYMBOL(cfs_str2num_check); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c index 7a19a5803ee8c..0f507d555e603 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -116,9 +116,7 @@ static struct shash_alg alg = { .cra_name = "adler32", .cra_driver_name = "adler32-zlib", .cra_priority = 100, -#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, -#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c index c794e670ecfd9..c20e5e9a8194b 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -65,7 +65,6 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -129,9 +128,7 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-table", .cra_priority = 100, -#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, -#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c index 566ba882ede82..5262f071b8a7a 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -63,7 +63,6 @@ static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -132,9 +131,7 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-pclmul", .cra_priority = 150, -#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, -#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c index 8d4cb640681f8..4ad3b7c310037 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -102,7 +102,6 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -168,9 +167,6 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = 
"crc32-pclmul", .cra_priority = 200, -#ifdef CRYPTO_ALG_OPTIONAL_KEY - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, -#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c index dce1734a4d500..1991a86a49598 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include @@ -78,27 +77,13 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, int err = 0; *type = cfs_crypto_hash_type(hash_alg); - if (!*type) { + + if (*type == NULL) { CWARN("Unsupported hash algorithm id = %d, max id is %d\n", hash_alg, CFS_HASH_ALG_MAX); return -EINVAL; } - - /* Keys are only supported for the hmac version */ - if (key && key_len > 0) { - char *algo_name; - - algo_name = kasprintf(GFP_KERNEL, "hmac(%s)", - (*type)->cht_name); - if (!algo_name) - return -ENOMEM; - - tfm = crypto_alloc_ahash(algo_name, 0, CRYPTO_ALG_ASYNC); - kfree(algo_name); - } else { - tfm = crypto_alloc_ahash((*type)->cht_name, 0, - CRYPTO_ALG_ASYNC); - } + tfm = crypto_alloc_ahash((*type)->cht_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", (*type)->cht_name); @@ -109,7 +94,8 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, if (!*req) { CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", (*type)->cht_name); - GOTO(out_free_tfm, err = -ENOMEM); + crypto_free_ahash(tfm); + return -ENOMEM; } ahash_request_set_callback(*req, 0, NULL, NULL); @@ -120,8 +106,12 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_setkey(tfm, (unsigned char *)&((*type)->cht_key), (*type)->cht_size); - if (err) - GOTO(out_free_req, err); + + if (err != 0) { + ahash_request_free(*req); + crypto_free_ahash(tfm); + return err; + } CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), @@ -129,9 +119,7 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_init(*req); if (err) { -out_free_req: ahash_request_free(*req); -out_free_tfm: crypto_free_ahash(tfm); } return err; @@ -207,10 +195,10 @@ EXPORT_SYMBOL(cfs_crypto_hash_digest); * use default initial value * \param[in] key_len length of \a key in bytes * - * \retval pointer to ahash request + * \retval pointer to descriptor of hash instance * \retval ERR_PTR(errno) in case of error */ -struct ahash_request * +struct cfs_crypto_hash_desc * cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, unsigned char *key, unsigned int key_len) { @@ -221,14 +209,14 @@ struct ahash_request * err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); if (err) return ERR_PTR(err); - return req; + return (struct cfs_crypto_hash_desc *)req; } EXPORT_SYMBOL(cfs_crypto_hash_init); /** * Update hash digest computed on data within the given \a page * - * \param[in] req ahash request + * \param[in] hdesc hash state descriptor * \param[in] page data page on which to compute the hash * \param[in] offset offset within \a page at which to start hash * \param[in] len length of data on which to compute hash @@ -236,10 +224,11 @@ EXPORT_SYMBOL(cfs_crypto_hash_init); * \retval 0 for success * \retval negative errno on failure */ -int cfs_crypto_hash_update_page(struct ahash_request *req, +int 
cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, struct page *page, unsigned int offset, unsigned int len) { + struct ahash_request *req = (void *)hdesc; struct scatterlist sl; sg_init_table(&sl, 1); @@ -253,16 +242,17 @@ EXPORT_SYMBOL(cfs_crypto_hash_update_page); /** * Update hash digest computed on the specified data * - * \param[in] req ahash request + * \param[in] hdesc hash state descriptor * \param[in] buf data buffer on which to compute the hash * \param[in] buf_len length of \buf on which to compute hash * * \retval 0 for success * \retval negative errno on failure */ -int cfs_crypto_hash_update(struct ahash_request *req, +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, const void *buf, unsigned int buf_len) { + struct ahash_request *req = (void *)hdesc; struct scatterlist sl; sg_init_one(&sl, (void *)buf, buf_len); @@ -275,7 +265,7 @@ EXPORT_SYMBOL(cfs_crypto_hash_update); /** * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor * - * \param[in] req ahash request + * \param[in] hdesc hash descriptor * \param[out] hash pointer to hash buffer to store hash digest * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL * or hash_len == NULL only free \a hdesc instead @@ -285,9 +275,10 @@ EXPORT_SYMBOL(cfs_crypto_hash_update); * \retval -EOVERFLOW if hash_len is too small for the hash digest * \retval negative errno for other errors from lower layers */ -int cfs_crypto_hash_final(struct ahash_request *req, +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, unsigned char *hash, unsigned int *hash_len) { + struct ahash_request *req = (void *)hdesc; int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); int err; @@ -322,9 +313,6 @@ EXPORT_SYMBOL(cfs_crypto_hash_final); * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and * is available through the cfs_crypto_hash_speed() function. * - * This function needs to stay the same as obd_t10_performance_test() so that - * the speeds are comparable. - * * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) * \param[in] buf data buffer on which to compute the hash * \param[in] buf_len length of \buf on which to compute hash @@ -352,23 +340,23 @@ static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { - struct ahash_request *req; + struct cfs_crypto_hash_desc *hdesc; int i; - req = cfs_crypto_hash_init(hash_alg, NULL, 0); - if (IS_ERR(req)) { - err = PTR_ERR(req); + hdesc = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(hdesc)) { + err = PTR_ERR(hdesc); break; } for (i = 0; i < buf_len / PAGE_SIZE; i++) { - err = cfs_crypto_hash_update_page(req, page, 0, + err = cfs_crypto_hash_update_page(hdesc, page, 0, PAGE_SIZE); if (err != 0) break; } - err = cfs_crypto_hash_final(req, hash, &hash_len); + err = cfs_crypto_hash_final(hdesc, hash, &hash_len); if (err != 0) break; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c index 799c40ea638ec..cd00d0ae5717f 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. 
+ * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,12 +37,8 @@ */ #include -#ifdef HAVE_SCHED_HEADERS -#include -#include -#endif #include -#include + #include #include @@ -153,7 +149,9 @@ static int cfs_access_process_vm(struct task_struct *tsk, int bytes, rc, offset; void *maddr; -#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS) + rc = get_user_pages(tsk, mm, addr, 1, write ? FOLL_WRITE : 0, &page, &vma); +#elif defined(HAVE_GET_USER_PAGES_GUP_FLAGS) rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page, &vma); #elif defined(HAVE_GET_USER_PAGES_6ARG) rc = get_user_pages(addr, 1, write, 1, &page, &vma); @@ -256,22 +254,15 @@ int cfs_get_environ(const char *key, char *value, int *val_len) entry = env_start; entry_len = env_end - env_start; - CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); /* Key length + length of '=' */ if (entry_len > key_len + 1 && - entry[key_len] == '=' && !memcmp(entry, key, key_len)) { entry += key_len + 1; entry_len -= key_len + 1; - - /* The 'value' buffer passed in is too small. - * Copy what fits, but return -EOVERFLOW. */ - if (entry_len >= *val_len) { - memcpy(value, entry, *val_len); - value[*val_len - 1] = 0; + /* The 'value' buffer passed in is too small.*/ + if (entry_len >= *val_len) GOTO(out, rc = -EOVERFLOW); - } memcpy(value, entry, entry_len); *val_len = entry_len; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c index f1701f47d334a..048b2f34df5ba 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2013, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -114,28 +115,6 @@ EXPORT_SYMBOL(lbug_with_loc); #ifdef CONFIG_STACKTRACE -#ifndef HAVE_SAVE_STACK_TRACE_TSK -#define save_stack_trace_tsk(tsk, trace) \ -do { \ - if (tsk == current) \ - save_stack_trace(trace); \ - else \ - pr_info("No stack, save_stack_trace_tsk() not exported\n"); \ -} while (0) -#endif - -static void cfs_print_stack_trace(unsigned long *entries, unsigned int nr) -{ - unsigned int i; - - /* Prefer %pB for backtraced symbolic names since it was added in: - * Linux v2.6.38-6557-g0f77a8d37825 - * vsprintf: Introduce %pB format specifier - */ - for (i = 0; i < nr; i++) - pr_info("[<0>] %pB\n", (void *)entries[i]); -} - #define MAX_ST_ENTRIES 100 static DEFINE_SPINLOCK(st_lock); @@ -151,20 +130,11 @@ typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, static stack_trace_save_tsk_t *task_dump_stack; #endif -void __init cfs_debug_init(void) -{ -#ifdef CONFIG_ARCH_STACKWALK - task_dump_stack = (void *) - cfs_kallsyms_lookup_name("stack_trace_save_tsk"); - -#endif -} - static void libcfs_call_trace(struct task_struct *tsk) { - static unsigned long entries[MAX_ST_ENTRIES]; #ifdef CONFIG_ARCH_STACKWALK - unsigned int nr_entries; + static unsigned long entries[MAX_ST_ENTRIES]; + unsigned int i, nr_entries; if (!task_dump_stack) task_dump_stack = (stack_trace_save_tsk_t *) @@ -176,11 +146,13 @@ static void libcfs_call_trace(struct task_struct *tsk) pr_info("Call Trace TBD:\n"); if (task_dump_stack) { nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); - cfs_print_stack_trace(entries, nr_entries); + for (i = 0; i < nr_entries; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); } spin_unlock(&st_lock); #else struct stack_trace trace; + static unsigned long entries[MAX_ST_ENTRIES]; trace.nr_entries = 0; trace.max_entries = MAX_ST_ENTRIES; @@ -192,7 +164,11 @@ static void libcfs_call_trace(struct task_struct *tsk) init_utsname()->release, init_utsname()->version); pr_info("Call Trace:\n"); save_stack_trace_tsk(tsk, &trace); - cfs_print_stack_trace(trace.entries, trace.nr_entries); +#ifdef HAVE_STACK_TRACE_PRINT + stack_trace_print(trace.entries, trace.nr_entries, 0); +#else + print_stack_trace(&trace, 0); +#endif spin_unlock(&st_lock); #endif } @@ -294,6 +270,12 @@ void libcfs_debug_dumpstack(struct task_struct *tsk) } EXPORT_SYMBOL(libcfs_debug_dumpstack); +struct task_struct *libcfs_current(void) +{ + CWARN("current task struct is %p\n", current); + return current; +} + static int panic_notifier(struct notifier_block *self, unsigned long unused1, void *unused2) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c deleted file mode 100644 index e4e67c20cee5d..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#ifdef HAVE_STRINGHASH -#include -#else -#include -#endif -#include - -#include - -/* Return the "hash_len" (hash and length) of a null-terminated string */ -/* The kernel equivalent is in fs/namei.c but for some strange reason - * RHEL7.5 stuck it in dax/super.c instead. This placement never existed - * upstream so to make life easier we just have the equavilent - */ -u64 cfs_hashlen_string(const void *salt, const char *name) -{ -#ifdef HAVE_FULL_NAME_HASH_3ARGS - unsigned long hash = init_name_hash(salt); -#else - unsigned long hash = init_name_hash(); -#endif - unsigned long len = 0, c; - - c = (unsigned char)*name; - while (c) { - len++; - hash = partial_name_hash(c, hash); - c = (unsigned char)name[len]; - } - return hashlen_create(end_name_hash(hash), len); -} -EXPORT_SYMBOL(cfs_hashlen_string); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c index 7300af8018c69..839f9324ac5ca 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,9 +32,7 @@ #define DEBUG_SUBSYSTEM S_LNET -#include #include -#include #include static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c index 2ee18be5e59a6..4b73ed6e79a93 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -36,18 +36,13 @@ #include #include #include -#ifdef HAVE_SCHED_HEADERS -#include -#include -#endif #include +#include #if defined(CONFIG_KGDB) #include #endif -#include - #ifndef HAVE_KTIME_GET_TS64 void ktime_get_ts64(struct timespec64 *ts) { @@ -102,17 +97,17 @@ time64_t ktime_get_seconds(void) EXPORT_SYMBOL(ktime_get_seconds); #endif /* HAVE_KTIME_GET_SECONDS */ -static int (*cfs_apply_workqueue_attrs_t)(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs); +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +static char **cfs_lsm_names; -int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) +bool selinux_is_enabled(void) { - if (cfs_apply_workqueue_attrs_t) - return cfs_apply_workqueue_attrs_t(wq, attrs); - return 0; + if (cfs_lsm_names) + return !!strstr("selinux", *cfs_lsm_names); + return false; } -EXPORT_SYMBOL_GPL(cfs_apply_workqueue_attrs); +EXPORT_SYMBOL(selinux_is_enabled); +#endif int cfs_kernel_write(struct file *filp, const void *buf, size_t count, loff_t *pos) @@ -132,43 +127,6 @@ int cfs_kernel_write(struct file *filp, const void *buf, size_t count, } EXPORT_SYMBOL(cfs_kernel_write); -#ifndef HAVE_KSET_FIND_OBJ -struct kobject *kset_find_obj(struct kset *kset, const char *name) -{ - struct kobject 
*ret = NULL; - struct kobject *k; - - spin_lock(&kset->list_lock); - - list_for_each_entry(k, &kset->list, entry) { - if (kobject_name(k) && !strcmp(kobject_name(k), name)) { - if (kref_get_unless_zero(&k->kref)) - ret = k; - break; - } - } - - spin_unlock(&kset->list_lock); - return ret; -} -EXPORT_SYMBOL_GPL(kset_find_obj); -#endif - -#ifndef HAVE_KSTRTOBOOL_FROM_USER -int kstrtobool_from_user(const char __user *s, size_t count, bool *res) -{ - /* Longest string needed to differentiate, newline, terminator */ - char buf[4]; - - count = min(count, sizeof(buf) - 1); - if (copy_from_user(buf, s, count)) - return -EFAULT; - buf[count] = '\0'; - return strtobool(buf, res); -} -EXPORT_SYMBOL(kstrtobool_from_user); -#endif /* !HAVE_KSTRTOBOOL_FROM_USER */ - sigset_t cfs_block_allsigs(void) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c index 9685296266f04..e0fd4c0de04f1 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -33,7 +33,6 @@ #define DEBUG_SUBSYSTEM S_LNET #define LUSTRE_TRACEFILE_PRIVATE -#include #include #include "tracefile.h" diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c deleted file mode 100644 index 5843d808bc332..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * The implementation of the wait_bit*() and related waiting APIs: - */ -#include -#include -#ifdef HAVE_SCHED_HEADERS -#include -#endif -#include - -#ifndef HAVE_PREPARE_TO_WAIT_EVENT - -#define __add_wait_queue_entry_tail __add_wait_queue_tail - -long prepare_to_wait_event(wait_queue_head_t *wq_head, - wait_queue_entry_t *wq_entry, int state) -{ - unsigned long flags; - long ret = 0; - - spin_lock_irqsave(&wq_head->lock, flags); - if (unlikely(signal_pending_state(state, current))) { - /* - * Exclusive waiter must not fail if it was selected by wakeup, - * it should "consume" the condition we were waiting for. - * - * The caller will recheck the condition and return success if - * we were already woken up, we can not miss the event because - * wakeup locks/unlocks the same wq_head->lock. - * - * But we need to ensure that set-condition + wakeup after that - * can't see us, it should wake up another exclusive waiter if - * we fail. 
- */ - list_del_init(&wq_entry->task_list); - ret = -ERESTARTSYS; - } else { - if (list_empty(&wq_entry->task_list)) { - if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_entry_tail(wq_head, wq_entry); - else - __add_wait_queue(wq_head, wq_entry); - } - set_current_state(state); - } - spin_unlock_irqrestore(&wq_head->lock, flags); - - return ret; -} -EXPORT_SYMBOL(prepare_to_wait_event); -#endif /* !HAVE_PREPARE_TO_WAIT_EVENT */ - -#ifndef HAVE_WAIT_VAR_EVENT - -#define WAIT_TABLE_BITS 8 -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) - -static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; - -wait_queue_head_t *__var_waitqueue(void *p) -{ - return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); -} -EXPORT_SYMBOL(__var_waitqueue); - -static int -var_wake_function(wait_queue_entry_t *wq_entry, unsigned int mode, - int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wbq_entry = - container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - - if (wbq_entry->key.flags != key->flags || - wbq_entry->key.bit_nr != key->bit_nr) - return 0; - - return autoremove_wake_function(wq_entry, mode, sync, key); -} - -void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, - int flags) -{ - *wbq_entry = (struct wait_bit_queue_entry){ - .key = { - .flags = (var), - .bit_nr = -1, - }, - .wq_entry = { - .private = current, - .func = var_wake_function, -#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST - .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), -#else - .task_list = LIST_HEAD_INIT(wbq_entry->wq_entry.task_list), -#endif - }, - }; -} -EXPORT_SYMBOL(init_wait_var_entry); - -void wake_up_var(void *var) -{ - __wake_up_bit(__var_waitqueue(var), var, -1); -} -EXPORT_SYMBOL(wake_up_var); - -void __init wait_bit_init(void) -{ - int i; - - for (i = 0; i < WAIT_TABLE_SIZE; i++) - init_waitqueue_head(bit_wait_table + i); -} -#endif /* ! HAVE_WAIT_VAR_EVENT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c index 08f5a1c1a5655..f832a6fd02bce 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,21 +46,52 @@ #include #include -#include +#include #include #define DEBUG_SUBSYSTEM S_LNET #include #include -#include #include #include "tracefile.h" -static struct dentry *lnet_debugfs_root; +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *lnet_table_header; +#endif + +static DECLARE_RWSEM(ioctl_list_sem); +static LIST_HEAD(ioctl_list); + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (!list_empty(&hand->item)) + rc = -EBUSY; + else + list_add_tail(&hand->item, &ioctl_list); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_register_ioctl); + +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (list_empty(&hand->item)) + rc = -ENOENT; + else + list_del_init(&hand->item); + up_write(&ioctl_list_sem); -BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); -EXPORT_SYMBOL(libcfs_ioctl_list); + return rc; +} +EXPORT_SYMBOL(libcfs_deregister_ioctl); int libcfs_ioctl(unsigned long cmd, void __user *uparam) { @@ -102,27 +133,35 @@ int libcfs_ioctl(unsigned long cmd, void __user *uparam) libcfs_debug_mark_buffer(data->ioc_inlbuf1); break; - default: - err = blocking_notifier_call_chain(&libcfs_ioctl_list, - cmd, hdr); - if (!(err & NOTIFY_STOP_MASK)) - /* No-one claimed the ioctl */ - err = -EINVAL; - else - err = notifier_to_errno(err); - if (copy_to_user(uparam, hdr, hdr->ioc_len) && !err) - err = -EFAULT; - break; + default: { + struct libcfs_ioctl_handler *hand; + + err = -EINVAL; + down_read(&ioctl_list_sem); + list_for_each_entry(hand, &ioctl_list, item) { + err = hand->handle_ioctl(cmd, hdr); + if (err == -EINVAL) + continue; + + if (err == 0) { + if (copy_to_user(uparam, hdr, hdr->ioc_len)) + err = -EFAULT; + } + break; + } + up_read(&ioctl_list_sem); + break; } } out: LIBCFS_FREE(hdr, hdr->ioc_len); RETURN(err); } -int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, loff_t pos, - void __user *buffer, int len)) +int +lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)) { int rc = handler(data, write, *ppos, buffer, *lenp); @@ -180,8 +219,9 @@ static int __proc_dobitmasks(void *data, int write, return rc; } -static int proc_dobitmasks(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_dobitmasks); @@ -199,8 +239,9 @@ static int __proc_dump_kernel(void *data, int write, return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); } -static int proc_dump_kernel(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_dump_kernel(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_dump_kernel); @@ -222,133 +263,156 @@ static int __proc_daemon_file(void *data, int write, return cfs_trace_daemon_command_usrstr(buffer, nob); } -static int proc_daemon_file(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_daemon_file(struct ctl_table *table, 
int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_daemon_file); } -static int libcfs_force_lbug(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) +static int __proc_debug_mb(void *data, int write, + loff_t pos, void __user *buffer, int nob) { - if (write) - LBUG(); - return 0; + if (!write) { + char tmpstr[32]; + int len = snprintf(tmpstr, sizeof(tmpstr), "%d", + cfs_trace_get_debug_mb()); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, + "\n"); + } + + return cfs_trace_set_debug_mb_usrstr(buffer, nob); } -static int proc_fail_loc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_debug_mb(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { - int rc; - long old_fail_loc = cfs_fail_loc; + return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, + __proc_debug_mb); +} - if (!*lenp || *ppos) { - *lenp = 0; - return 0; - } +static int +proc_console_max_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc, max_delay_cs; + struct ctl_table dummy = *table; + cfs_duration_t d; - if (write) { - char *kbuf = memdup_user_nul(buffer, *lenp); + dummy.data = &max_delay_cs; + dummy.proc_handler = &proc_dointvec; - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - rc = kstrtoul(kbuf, 0, &cfs_fail_loc); - kfree(kbuf); - *ppos += *lenp; - } else { - char kbuf[64/3+3]; - - rc = scnprintf(kbuf, sizeof(kbuf), "%lu\n", cfs_fail_loc); - if (copy_to_user(buffer, kbuf, rc)) - rc = -EFAULT; - else { - *lenp = rc; - *ppos += rc; - } + if (!write) { /* read */ + max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; } - if (old_fail_loc != cfs_fail_loc) { - cfs_race_state = 1; - wake_up(&cfs_race_waitq); - } + /* write */ + max_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (max_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(max_delay_cs) / 100; + if (d == 0 || d < libcfs_console_min_delay) + return -EINVAL; + libcfs_console_max_delay = d; + return rc; } -int debugfs_doint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_console_min_delay_cs(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int rc; + int rc, min_delay_cs; + struct ctl_table dummy = *table; + cfs_duration_t d; - if (!*lenp || *ppos) { - *lenp = 0; - return 0; - } + dummy.data = &min_delay_cs; + dummy.proc_handler = &proc_dointvec; - if (write) { - char *kbuf = memdup_user_nul(buffer, *lenp); - int val; - - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - rc = kstrtoint(kbuf, 0, &val); - kfree(kbuf); - if (!rc) { - if (table->extra1 && val < *(int *)table->extra1) - val = *(int *)table->extra1; - if (table->extra2 && val > *(int *)table->extra2) - val = *(int *)table->extra2; - *(int *)table->data = val; - } - *ppos += *lenp; - } else { - char kbuf[64/3+3]; - - rc = scnprintf(kbuf, sizeof(kbuf), "%u\n", *(int *)table->data); - if (copy_to_user(buffer, kbuf, rc)) - rc = -EFAULT; - else { - *lenp = rc; - *ppos += rc; - } + if (!write) { /* read */ + min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; } + /* write */ 
+ min_delay_cs = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (min_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(min_delay_cs) / 100; + if (d == 0 || d > libcfs_console_max_delay) + return -EINVAL; + libcfs_console_min_delay = d; + return rc; } -EXPORT_SYMBOL(debugfs_doint); -static int debugfs_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_console_backoff(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { - int len = *lenp; - char *kbuf = table->data; + int rc, backoff; + struct ctl_table dummy = *table; - if (!len || *ppos) { - *lenp = 0; - return 0; - } - if (len > table->maxlen) - len = table->maxlen; - if (write) { - if (copy_from_user(kbuf, buffer, len)) - return -EFAULT; - memset(kbuf+len, 0, table->maxlen - len); - *ppos = *lenp; - } else { - len = strnlen(kbuf, len); - if (copy_to_user(buffer, kbuf, len)) - return -EFAULT; - if (len < *lenp) { - if (copy_to_user(buffer+len, "\n", 1)) - return -EFAULT; - len += 1; - } - *ppos += len; - *lenp -= len; + dummy.data = &backoff; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + backoff = libcfs_console_backoff; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + return rc; } - return len; + + /* write */ + backoff = 0; + rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); + if (rc < 0) + return rc; + + if (backoff <= 0) + return -EINVAL; + + libcfs_console_backoff = backoff; + + return rc; +} + +static int +libcfs_force_lbug(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) + LBUG(); + return 0; +} + +static int +proc_fail_loc(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (old_fail_loc != cfs_fail_loc) + wake_up(&cfs_race_waitq); + return rc; } static int __proc_cpt_table(void *data, int write, @@ -392,8 +456,9 @@ static int __proc_cpt_table(void *data, int write, return rc; } -static int proc_cpt_table(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_cpt_table(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_cpt_table); @@ -440,14 +505,19 @@ static int __proc_cpt_distance(void *data, int write, return rc; } -static int proc_cpt_distance(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_cpt_distance); } static struct ctl_table lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. 
+ */ { INIT_CTL_NAME .procname = "debug", @@ -472,6 +542,43 @@ static struct ctl_table lnet_table[] = { .mode = 0644, .proc_handler = &proc_dobitmasks, }, + { + INIT_CTL_NAME + .procname = "console_ratelimit", + .data = &libcfs_console_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME + .procname = "console_max_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_max_delay_cs + }, + { + INIT_CTL_NAME + .procname = "console_min_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_min_delay_cs + }, + { + INIT_CTL_NAME + .procname = "console_backoff", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_backoff + }, + { + INIT_CTL_NAME + .procname = "debug_path", + .data = libcfs_debug_file_path_arr, + .maxlen = sizeof(libcfs_debug_file_path_arr), + .mode = 0644, + .proc_handler = &proc_dostring, + }, { INIT_CTL_NAME .procname = "cpu_partition_table", @@ -492,7 +599,7 @@ static struct ctl_table lnet_table[] = { .data = lnet_debug_log_upcall, .maxlen = sizeof(lnet_debug_log_upcall), .mode = 0644, - .proc_handler = &debugfs_dostring, + .proc_handler = &proc_dostring, }, { INIT_CTL_NAME @@ -500,7 +607,7 @@ static struct ctl_table lnet_table[] = { .data = (int *)&libcfs_kmemory.counter, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &debugfs_doint, + .proc_handler = &proc_dointvec, }, { INIT_CTL_NAME @@ -508,7 +615,15 @@ static struct ctl_table lnet_table[] = { .data = &libcfs_catastrophe, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &debugfs_doint, + .proc_handler = &proc_dointvec, + }, + { + INIT_CTL_NAME + .procname = "panic_on_lbug", + .data = &libcfs_panic_on_lbug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, }, { INIT_CTL_NAME @@ -524,13 +639,19 @@ static struct ctl_table lnet_table[] = { .maxlen = 256, .proc_handler = &proc_daemon_file, }, + { + INIT_CTL_NAME + .procname = "debug_mb", + .mode = 0644, + .proc_handler = &proc_debug_mb, + }, { INIT_CTL_NAME .procname = "watchdog_ratelimit", .data = &libcfs_watchdog_ratelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &debugfs_doint, + .proc_handler = &proc_dointvec_minmax, .extra1 = &min_watchdog_ratelimit, .extra2 = &max_watchdog_ratelimit, }, @@ -556,7 +677,7 @@ static struct ctl_table lnet_table[] = { .data = &cfs_fail_val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &debugfs_doint + .proc_handler = &proc_dointvec }, { INIT_CTL_NAME @@ -564,154 +685,55 @@ static struct ctl_table lnet_table[] = { .data = &cfs_fail_err, .maxlen = sizeof(cfs_fail_err), .mode = 0644, - .proc_handler = &debugfs_doint, + .proc_handler = &proc_dointvec, }, { } }; -static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { - { .name = "console_ratelimit", - .target = "../../../module/libcfs/parameters/libcfs_console_ratelimit" }, - { .name = "debug_path", - .target = "../../../module/libcfs/parameters/libcfs_debug_file_path" }, - { .name = "panic_on_lbug", - .target = "../../../module/libcfs/parameters/libcfs_panic_on_lbug" }, - { .name = "console_backoff", - .target = "../../../module/libcfs/parameters/libcfs_console_backoff" }, - { .name = "debug_mb", - .target = "../../../module/libcfs/parameters/libcfs_debug_mb" }, - { .name = "console_min_delay_centisecs", - .target = "../../../module/libcfs/parameters/libcfs_console_min_delay" }, - { .name = "console_max_delay_centisecs", - .target = 
"../../../module/libcfs/parameters/libcfs_console_max_delay" }, - { .name = NULL }, -}; - -static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) -{ - struct ctl_table *table = filp->private_data; - ssize_t rc = -EINVAL; - - if (table) { - rc = table->proc_handler(table, 0, buf, &count, ppos); - if (!rc) - rc = count; - } - - return rc; -} - -static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct ctl_table *table = filp->private_data; - ssize_t rc = -EINVAL; - - if (table) { - rc = table->proc_handler(table, 1, (void __user *)buf, &count, - ppos); - if (!rc) - rc = count; - } - - return rc; -} - -static const struct file_operations lnet_debugfs_file_operations_rw = { - .open = simple_open, - .read = lnet_debugfs_read, - .write = lnet_debugfs_write, - .llseek = default_llseek, -}; - -static const struct file_operations lnet_debugfs_file_operations_ro = { - .open = simple_open, - .read = lnet_debugfs_read, - .llseek = default_llseek, -}; - -static const struct file_operations lnet_debugfs_file_operations_wo = { - .open = simple_open, - .write = lnet_debugfs_write, - .llseek = default_llseek, +#ifdef CONFIG_SYSCTL +static struct ctl_table top_table[] = { + { + INIT_CTL_NAME + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { .procname = NULL } }; +#endif -static const struct file_operations *lnet_debugfs_fops_select(umode_t mode) -{ - if (!(mode & S_IWUGO)) - return &lnet_debugfs_file_operations_ro; - - if (!(mode & S_IRUGO)) - return &lnet_debugfs_file_operations_wo; - - return &lnet_debugfs_file_operations_rw; -} - -void lnet_insert_debugfs(struct ctl_table *table) +static int insert_proc(void) { - if (!lnet_debugfs_root) - lnet_debugfs_root = debugfs_create_dir("lnet", NULL); - - /* Even if we cannot create, just ignore it altogether) */ - if (IS_ERR_OR_NULL(lnet_debugfs_root)) - return; - - /* We don't save the dentry returned in next two calls, because - * we don't call debugfs_remove() but rather remove_recursive() - */ - for (; table && table->procname; table++) - debugfs_create_file(table->procname, table->mode, - lnet_debugfs_root, table, - lnet_debugfs_fops_select(table->mode)); +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); +#endif + return 0; } -EXPORT_SYMBOL_GPL(lnet_insert_debugfs); -static void lnet_insert_debugfs_links( - const struct lnet_debugfs_symlink_def *symlinks) +static void remove_proc(void) { - for (; symlinks && symlinks->name; symlinks++) - debugfs_create_symlink(symlinks->name, lnet_debugfs_root, - symlinks->target); -} +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); -void lnet_remove_debugfs(struct ctl_table *table) -{ -#ifndef HAVE_D_HASH_AND_LOOKUP - debugfs_remove_recursive(lnet_debugfs_root); - lnet_debugfs_root = NULL; - return; + lnet_table_header = NULL; #endif - - for (; table && table->procname; table++) { - struct qstr dname = QSTR_INIT(table->procname, - strlen(table->procname)); - struct dentry *dentry; - - dentry = d_hash_and_lookup(lnet_debugfs_root, &dname); - debugfs_remove(dentry); - } } -EXPORT_SYMBOL_GPL(lnet_remove_debugfs); static int __init libcfs_init(void) { int rc; - -#ifndef HAVE_WAIT_VAR_EVENT - wait_bit_init(); -#endif init_libcfs_vfree_atomic(); - rc = libcfs_debug_init(5 * 1024 * 1024); if (rc < 0) { printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); 
return (rc); } - cfs_debug_init(); - rc = cfs_cpu_init(); if (rc != 0) goto cleanup_debug; @@ -743,12 +765,17 @@ static int __init libcfs_init(void) goto cleanup_wi; } - lnet_insert_debugfs(lnet_table); - if (!IS_ERR_OR_NULL(lnet_debugfs_root)) - lnet_insert_debugfs_links(lnet_debugfs_symlinks); + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_crypto; + } CDEBUG (D_OTHER, "portals setup OK\n"); return 0; +cleanup_crypto: + cfs_crypto_unregister(); cleanup_wi: cfs_wi_shutdown(); cleanup_deregister: @@ -764,11 +791,7 @@ static void __exit libcfs_exit(void) { int rc; - /* Remove everthing */ - if (lnet_debugfs_root) { - debugfs_remove_recursive(lnet_debugfs_root); - lnet_debugfs_root = NULL; - } + remove_proc(); CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", atomic_read(&libcfs_kmemory)); @@ -793,7 +816,6 @@ static void __exit libcfs_exit(void) if (rc) printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", rc); - exit_libcfs_vfree_atomic(); } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c index f9d96d12f2555..ac762726fa5ce 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,12 +40,8 @@ #define LUSTRE_TRACEFILE_PRIVATE #include "tracefile.h" -#include -#include #include -#include -#include -#include +#include #include /* XXX move things up to the top, comment */ @@ -394,34 +390,34 @@ int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, return 1; } - if (cdls != NULL) { - if (libcfs_console_ratelimit && - cdls->cdls_next != 0 && /* not first time ever */ - time_before(jiffies, cdls->cdls_next)) { - /* skipping a console message */ - cdls->cdls_count++; - if (tcd != NULL) - cfs_trace_put_tcd(tcd); - return 1; - } + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } - if (time_after(jiffies, cdls->cdls_next + - libcfs_console_max_delay + - cfs_time_seconds(10))) { - /* last timeout was a long time ago */ - cdls->cdls_delay /= libcfs_console_backoff * 4; - } else { - cdls->cdls_delay *= libcfs_console_backoff; - } + if (cfs_time_after(cfs_time_current(), cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } if (cdls->cdls_delay < libcfs_console_min_delay) cdls->cdls_delay = libcfs_console_min_delay; else if (cdls->cdls_delay > libcfs_console_max_delay) cdls->cdls_delay = libcfs_console_max_delay; - /* ensure cdls_next is never zero after it's been seen */ - cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; - } + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } if (tcd != NULL) { cfs_print_to_console(&header, mask, string_buf, needed, file, @@ -741,8 +737,12 @@ int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, if 
(usr_buffer_nob > knl_buffer_nob) return -EOVERFLOW; +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_from_user(knl_buffer, usr_buffer, usr_buffer_nob)) return -EFAULT; +#else + memcpy(knl_buffer, usr_buffer, usr_buffer_nob); +#endif nob = strnlen(knl_buffer, usr_buffer_nob); while (nob-- >= 0) /* strip trailing whitespace */ @@ -771,12 +771,20 @@ int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, if (nob > usr_buffer_nob) nob = usr_buffer_nob; +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer, knl_buffer, nob)) return -EFAULT; +#else + memcpy(usr_buffer, knl_buffer, nob); +#endif if (append != NULL && nob < usr_buffer_nob) { +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer + nob, append, 1)) return -EFAULT; +#else + memcpy(usr_buffer + nob, append, 1); +#endif nob++; } @@ -833,16 +841,13 @@ int cfs_trace_daemon_command(char *str) cfs_tracefile_write_lock(); memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); - } else if (strncmp(str, "size=", 5) == 0) { - unsigned long tmp; + } else if (strncmp(str, "size=", 5) == 0) { + cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); + if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size <<= 20; - rc = kstrtoul(str + 5, 10, &tmp); - if (!rc) { - if (tmp < 10 || tmp > 20480) - cfs_tracefile_size = CFS_TRACEFILE_SIZE; - else - cfs_tracefile_size = tmp << 20; - } } else if (strlen(str) >= sizeof(cfs_tracefile)) { rc = -ENAMETOOLONG; } else if (str[0] != '/') { @@ -915,6 +920,18 @@ int cfs_trace_set_debug_mb(int mb) return 0; } +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob) +{ + char str[32]; + int rc; + + rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); + if (rc < 0) + return rc; + + return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); +} + int cfs_trace_get_debug_mb(void) { int i; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h index c6ca34d4fb08e..2f5dc4f272783 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,6 +82,7 @@ int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_daemon_command(char *str); int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_set_debug_mb(int mb); +int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_get_debug_mb(void); extern void libcfs_debug_dumplog_internal(void *arg); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c index f1676aa8f7a4d..c3d5556ab1557 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001, 2002 Cluster File Systems, Inc. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. 
* * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -35,7 +35,7 @@ #include #include -#include +#include struct ioc_dev { const char *dev_name; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c index 246d420354217..04a33bdef4c4c 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,8 +44,8 @@ #include #include -#include -#include +#include +#include #ifdef HAVE_NETDB_H # include #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c index 18fe84dc53f6a..9facce6bfa975 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c @@ -64,10 +64,10 @@ int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) { - char topdir[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}," - "/proc/{fs,sys}/{lnet,lustre}}"; + char path[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}/," + "/proc/{fs,sys}/{lnet,lustre}/}"; static bool test_mounted = false; - char path[PATH_MAX]; + size_t len = strlen(path); char buf[PATH_MAX]; struct statfs statfsbuf; va_list args; @@ -127,9 +127,9 @@ cfs_get_param_paths(glob_t *paths, const char *pattern, ...) errno = EINVAL; return -1; } + len += rc; - if (snprintf(path, sizeof(path), "%s/%s", topdir, buf) >= - sizeof(path)) { + if (strlcat(path, buf, sizeof(path)) != len) { errno = E2BIG; return -1; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c index 861f97a3c51e6..9afdaa07f8883 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001 Cluster File Systems, Inc. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. 
* * This file is part of Lustre, http://www.sf.net/projects/lustre/ * @@ -36,7 +36,7 @@ #include #include -#include +#include static command_t * top_level; /* Top level of commands, initialized by * InitParser */ @@ -768,41 +768,40 @@ int Parser_arg2int(const char *inp, long *result, int base) } /* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size(unsigned long *sizep, char *str) -{ - unsigned long size; - char mod[32]; +int Parser_size (int *sizep, char *str) { + int size; + char mod[32]; - switch (sscanf(str, "%lu%1[gGmMkK]", &size, mod)) { - default: - return -1; + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { + default: + return (-1); - case 1: - *sizep = size; - return 0; + case 1: + *sizep = size; + return (0); - case 2: - switch (*mod) { - case 'g': - case 'G': - *sizep = size << 30; - return 0; - - case 'm': - case 'M': - *sizep = size << 20; - return 0; - - case 'k': - case 'K': - *sizep = size << 10; - return 0; - - default: - *sizep = size; - return 0; - } - } + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } } /* Convert a string boolean to an int; "enable" -> 1 */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c index 2c1a24cacebb2..9078500020bb9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,10 +41,46 @@ #include #include #include -#include -#include #include +/* + * According manual of strlcpy() and strlcat() the functions should return + * the total length of the string they tried to create. For strlcpy() that + * means the length of src. For strlcat() that means the initial length of + * dst plus the length of src. So, the function strnlen() cannot be used + * otherwise the return value will be wrong. + */ +#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcpy(char *dst, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dst, src, len); + dst[len] = '\0'; + } + return ret; +} +#endif + +#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ +size_t strlcat(char *dst, const char *src, size_t size) +{ + size_t dsize = strlen(dst); + size_t len = strlen(src); + size_t ret = dsize + len; + + dst += dsize; + size -= dsize; + if (len >= size) + len = size-1; + memcpy(dst, src, len); + dst[len] = '\0'; + return ret; +} +#endif + /** * Extracts tokens from strings. * @@ -444,83 +480,3 @@ cfs_expr_list_free_list(struct list_head *list) cfs_expr_list_free(el); } } - -/** - * cfs_abs_path() - Get the absolute path of a relative path - * @request_path: The relative path to be resolved - * @resolved_path: Set to the resolved absolute path - * - * Returns the canonicalized absolute pathname. This function is a wrapper to - * realpath, but will work even if the target file does not exist. All - * directories in the path must exist. 
- * - * Return: On success, 0 is returned and resolved_path points to an allocated - * string containing the absolute pathname. On error, errno is set - * appropriately, -errno is returned, and resolved_path points to NULL. - */ -int cfs_abs_path(const char *request_path, char **resolved_path) -{ - char buf[PATH_MAX + 1] = ""; - char *path; - char *ptr; - int len; - int rc = 0; - const char *fmt; - - path = malloc(sizeof(buf)); - if (path == NULL) - return -ENOMEM; - - if (request_path[0] != '/') { - if (getcwd(path, sizeof(buf) - 1) == NULL) { - rc = -errno; - goto out; - } - len = snprintf(buf, sizeof(buf), "%s/%s", path, request_path); - if (len >= sizeof(buf)) { - rc = -ENAMETOOLONG; - goto out; - } - } else { - /* skip duplicate leading '/' */ - len = snprintf(buf, sizeof(buf), "%s", - request_path + strspn(request_path, "/") - 1); - if (len >= sizeof(buf)) { - rc = -ENAMETOOLONG; - goto out; - } - } - - /* if filename not in root directory, call realpath for parent path */ - ptr = strrchr(buf, '/'); - if (ptr != buf) { - *ptr = '\0'; - if (path != realpath(buf, path)) { - rc = -errno; - goto out; - } - /* add the filename back */ - len = strlen(path); - fmt = (path[len - 1] == '/') ? "%s" : "/%s"; - len = snprintf(path + len, sizeof(buf) - len, fmt, ptr + 1); - if (len >= sizeof(buf) - len) { - rc = -ENAMETOOLONG; - goto out; - } - } else { - len = snprintf(path, sizeof(buf), "%s", buf); - if (len >= sizeof(buf)) { - rc = -ENAMETOOLONG; - goto out; - } - } - -out: - if (rc == 0) { - *resolved_path = path; - } else { - *resolved_path = NULL; - free(path); - } - return rc; -} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c index dd451dd807bc1..f9e4de58b8ed2 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,10 +40,6 @@ #include #include "tracefile.h" -#ifndef WITH_WATCHDOG -#define WITH_WATCHDOG -#endif - struct lc_watchdog { spinlock_t lcw_lock; /* check or change lcw_list */ int lcw_refcount; /* must hold lcw_pending_timers_lock */ @@ -335,7 +331,6 @@ static void lcw_dispatch_stop(void) wake_up(&lcw_event_waitq); wait_for_completion(&lcw_stop_completion); - clear_bit(LCW_FLAG_STOP, &lcw_flags); CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c index f370ffab81677..fb4fd643ee0c0 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2013, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -313,9 +313,10 @@ cfs_wi_sched_destroy(struct cfs_wi_sched *sched) int i = 2; while (sched->ws_nthreads > 0) { - CDEBUG(is_power_of_2(++i / 20) ? D_WARNING : D_NET, - "waiting %us for %d %s worker threads to exit\n", - i / 20, sched->ws_nthreads, sched->ws_name); + CDEBUG(is_power_of_2(++i) ? 
D_WARNING : D_NET, + "waiting for %d threads of WI sched[%s] to " + "terminate\n", sched->ws_nthreads, + sched->ws_name); spin_unlock(&cfs_wi_data.wi_glock); set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/drivers/staging/lustrefsx/lnet/include/cyaml.h b/drivers/staging/lustrefsx/lnet/include/cyaml.h index 1537dbd19ed0c..c9c21c750a45d 100644 --- a/drivers/staging/lustrefsx/lnet/include/cyaml.h +++ b/drivers/staging/lustrefsx/lnet/include/cyaml.h @@ -18,7 +18,7 @@ * * LGPL HEADER END * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. * * Author: * Amir Shehata diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h index 1ce4a0056829d..84c6bd0039632 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/api.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2016, 2017, Intel Corporation. + * Copyright (c) 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ # error This include is only for kernel use. #endif -#include +#include /** \defgroup lnet_init_fini Initialization and cleanup * The LNet must be properly initialized before any LNet calls can be made. @@ -198,8 +198,7 @@ int LNetGet(lnet_nid_t self, struct lnet_process_id target_in, unsigned int portal_in, __u64 match_bits_in, - unsigned int offset_in, - bool recovery); + unsigned int offset_in); /** @} lnet_data */ @@ -211,7 +210,6 @@ int LNetSetLazyPortal(int portal); int LNetClearLazyPortal(int portal); int LNetCtl(unsigned int cmd, void *arg); void LNetDebugPeer(struct lnet_process_id id); -int LNetGetPeerDiscoveryStatus(void); /** @} lnet_misc */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h similarity index 76% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h rename to drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h index f10cbc3309176..4141f7c492c22 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h @@ -20,32 +20,21 @@ * */ /* - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, 2016, Intel Corporation. */ /* * Author: Amir Shehata */ -#ifndef __UAPI_LNET_DLC_H_ -#define __UAPI_LNET_DLC_H_ +#ifndef LNET_DLC_H +#define LNET_DLC_H -#include -/* - * This is due to us being out of kernel and the way the OpenSFS branch - * handles CFLAGS. 
- */ -#ifdef __KERNEL__ -# include -# include -#else -# include -# include -#endif +#include +#include #define MAX_NUM_SHOW_ENTRIES 32 #define LNET_MAX_STR_LEN 128 #define LNET_MAX_SHOW_NUM_CPT 128 -#define LNET_MAX_SHOW_NUM_NID 128 #define LNET_UNDEFINED_HOPS ((__u32) -1) /* @@ -92,7 +81,7 @@ struct lnet_ioctl_config_lnd_tunables { }; struct lnet_ioctl_net_config { - char ni_interfaces[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; + char ni_interfaces[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; __u32 ni_status; __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; char cfg_bulk[0]; @@ -122,8 +111,8 @@ struct lnet_ioctl_ping_data { __u32 ping_count; __u32 ping_flags; bool mr_info; - struct lnet_process_id ping_id; - struct lnet_process_id __user *ping_buf; + lnet_process_id_t ping_id; + lnet_process_id_t __user *ping_buf; }; struct lnet_ioctl_config_data { @@ -174,31 +163,6 @@ struct lnet_ioctl_element_stats { __u32 iel_drop_count; }; -enum lnet_health_type { - LNET_HEALTH_TYPE_LOCAL_NI = 0, - LNET_HEALTH_TYPE_PEER_NI, -}; - -struct lnet_ioctl_local_ni_hstats { - struct libcfs_ioctl_hdr hlni_hdr; - lnet_nid_t hlni_nid; - __u32 hlni_local_interrupt; - __u32 hlni_local_dropped; - __u32 hlni_local_aborted; - __u32 hlni_local_no_route; - __u32 hlni_local_timeout; - __u32 hlni_local_error; - __s32 hlni_health_value; -}; - -struct lnet_ioctl_peer_ni_hstats { - __u32 hlpni_remote_dropped; - __u32 hlpni_remote_timeout; - __u32 hlpni_remote_error; - __u32 hlpni_network_timeout; - __s32 hlpni_health_value; -}; - struct lnet_ioctl_element_msg_stats { struct libcfs_ioctl_hdr im_hdr; __u32 im_idx; @@ -220,7 +184,7 @@ struct lnet_ioctl_element_msg_stats { struct lnet_ioctl_config_ni { struct libcfs_ioctl_hdr lic_cfg_hdr; lnet_nid_t lic_nid; - char lic_ni_intf[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; + char lic_ni_intf[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; __u32 lic_ncpts; @@ -266,24 +230,9 @@ struct lnet_ioctl_peer_cfg { void __user *prcfg_bulk; }; -struct lnet_ioctl_reset_health_cfg { - struct libcfs_ioctl_hdr rh_hdr; - enum lnet_health_type rh_type; - bool rh_all; - int rh_value; - lnet_nid_t rh_nid; -}; - -struct lnet_ioctl_recovery_list { - struct libcfs_ioctl_hdr rlst_hdr; - enum lnet_health_type rlst_type; - int rlst_num_nids; - lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID]; -}; - -struct lnet_ioctl_set_value { - struct libcfs_ioctl_hdr sv_hdr; - __u32 sv_value; +struct lnet_ioctl_numa_range { + struct libcfs_ioctl_hdr nr_hdr; + __u32 nr_range; }; struct lnet_ioctl_lnet_stats { @@ -291,4 +240,4 @@ struct lnet_ioctl_lnet_stats { struct lnet_counters st_cntrs; }; -#endif /* _LNET_DLC_H_ */ +#endif /* LNET_DLC_H */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h index 9ed1856453610..c905eda43b5b8 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,13 +39,11 @@ #include +#include #include #include +#include #include -#include -#include -#include -#include extern struct lnet the_lnet; /* THE network */ @@ -71,10 +69,6 @@ extern struct lnet the_lnet; /* THE network */ /** exclusive lock */ #define LNET_LOCK_EX CFS_PERCPT_LOCK_EX -/* default timeout */ -#define DEFAULT_PEER_TIMEOUT 180 -#define LNET_LND_DEFAULT_TIMEOUT 5 - #ifdef HAVE_KERN_SOCK_GETNAME_2ARGS #define lnet_kernel_getpeername(sock, addr, addrlen) \ kernel_getpeername(sock, addr) @@ -395,40 +389,10 @@ lnet_handle2me(struct lnet_handle_me *handle) return lh_entry(lh, struct lnet_me, me_lh); } -static inline void -lnet_peer_net_addref_locked(struct lnet_peer_net *lpn) -{ - atomic_inc(&lpn->lpn_refcount); -} - -extern void lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn); - -static inline void -lnet_peer_net_decref_locked(struct lnet_peer_net *lpn) -{ - if (atomic_dec_and_test(&lpn->lpn_refcount)) - lnet_destroy_peer_net_locked(lpn); -} - -static inline void -lnet_peer_addref_locked(struct lnet_peer *lp) -{ - atomic_inc(&lp->lp_refcount); -} - -extern void lnet_destroy_peer_locked(struct lnet_peer *lp); - -static inline void -lnet_peer_decref_locked(struct lnet_peer *lp) -{ - if (atomic_dec_and_test(&lp->lp_refcount)) - lnet_destroy_peer_locked(lp); -} - static inline void lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) { - LASSERT(atomic_read(&lp->lpni_refcount) > 0); + LASSERT (atomic_read(&lp->lpni_refcount) > 0); atomic_inc(&lp->lpni_refcount); } @@ -437,8 +401,9 @@ extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp); static inline void lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) { - LASSERT(atomic_read(&lp->lpni_refcount) > 0); - if (atomic_dec_and_test(&lp->lpni_refcount)) + LASSERT (atomic_read(&lp->lpni_refcount) > 0); + atomic_dec(&lp->lpni_refcount); + if (atomic_read(&lp->lpni_refcount) == 0) lnet_destroy_peer_ni_locked(lp); } @@ -500,26 +465,6 @@ lnet_msg_free(struct lnet_msg *msg) LIBCFS_FREE(msg, sizeof(*msg)); } -static inline struct lnet_rsp_tracker * -lnet_rspt_alloc(int cpt) -{ - struct lnet_rsp_tracker *rspt; - LIBCFS_ALLOC(rspt, sizeof(*rspt)); - lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc++; - lnet_net_unlock(cpt); - return rspt; -} - -static inline void -lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) -{ - LIBCFS_FREE(rspt, sizeof(*rspt)); - lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc--; - lnet_net_unlock(cpt); -} - void lnet_ni_free(struct lnet_ni *ni); void lnet_net_free(struct lnet_net *net); @@ -557,26 +502,19 @@ extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); extern struct lnet_ni *lnet_net2ni_addref(__u32 net); +bool lnet_is_ni_healthy_locked(struct lnet_ni *ni); struct lnet_net *lnet_get_net_locked(__u32 net_id); int lnet_lib_init(void); void lnet_lib_exit(void); -extern unsigned lnet_transaction_timeout; -extern unsigned lnet_retry_count; extern unsigned int lnet_numa_range; -extern unsigned int lnet_health_sensitivity; -extern unsigned int lnet_recovery_interval; -extern unsigned int lnet_peer_discovery_disabled; -extern unsigned int lnet_drop_asym_route; extern int portal_rotor; -void lnet_mt_event_handler(struct lnet_event *event); - int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive, - time64_t when); + cfs_time_t when); void 
lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - time64_t when); + cfs_time_t when); int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, unsigned int priority); int lnet_check_routes(void); @@ -589,15 +527,24 @@ struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev); struct lnet_ni *lnet_get_ni_idx_locked(int idx); +struct libcfs_ioctl_handler { + struct list_head item; + int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); +}; + +#define DECLARE_IOCTL_HANDLER(ident, func) \ + static struct libcfs_ioctl_handler ident = { \ + .item = LIST_HEAD_INIT(ident.item), \ + .handle_ioctl = func \ + } + +extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); +extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, struct libcfs_ioctl_hdr __user *uparam); -extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep, - struct lnet_process_id __user *ids); -extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all); -extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni); -void lnet_router_debugfs_init(void); -void lnet_router_debugfs_fini(void); +void lnet_proc_init(void); +void lnet_proc_fini(void); int lnet_rtrpools_alloc(int im_a_router); void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); int lnet_rtrpools_adjust(int tiny, int small, int large); @@ -617,6 +564,7 @@ int lnet_islocalnet(__u32 net); void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, unsigned int offset, unsigned int mlen); +void lnet_msg_detach_md(struct lnet_msg *msg, int status); void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); void lnet_msg_commit(struct lnet_msg *msg, int cpt); @@ -627,15 +575,11 @@ void lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, unsigned int offset, unsigned int len); int lnet_send(lnet_nid_t nid, struct lnet_msg *msg, lnet_nid_t rtr_nid); -int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, - void *user_ptr, struct lnet_handle_eq eqh, bool recovery); void lnet_return_tx_credits_locked(struct lnet_msg *msg); void lnet_return_rx_credits_locked(struct lnet_msg *msg); void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); -struct list_head **lnet_create_array_of_queues(void); - /* portals functions */ /* portals attributes */ static inline int @@ -700,22 +644,16 @@ void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen); -void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg); struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *get_msg); void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, unsigned int len); -void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); -void lnet_clean_zombie_rstqs(void); void lnet_finalize(struct lnet_msg *msg, int rc); -bool lnet_send_error_simulation(struct lnet_msg *msg, - enum lnet_msg_hstatus *hstatus); -void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, - unsigned int nob, __u32 msg_type); + unsigned int nob); 
void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); void lnet_recv_delayed_msg_list(struct list_head *head); @@ -724,7 +662,6 @@ void lnet_msg_container_cleanup(struct lnet_msg_container *container); void lnet_msg_containers_destroy(void); int lnet_msg_containers_create(void); -char *lnet_health_error2str(enum lnet_msg_hstatus hstatus); char *lnet_msgtyp2str(int type); void lnet_print_hdr(struct lnet_hdr *hdr); int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); @@ -735,7 +672,7 @@ int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); int lnet_fault_init(void); void lnet_fault_fini(void); -bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus); +bool lnet_drop_rule_match(struct lnet_hdr *hdr); int lnet_delay_rule_add(struct lnet_fault_attr *attr); int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); @@ -747,7 +684,6 @@ bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); /** @} lnet_fault_simulation */ -void lnet_counters_get_common(struct lnet_counters_common *common); void lnet_counters_get(struct lnet_counters *counters); void lnet_counters_reset(void); @@ -827,7 +763,6 @@ void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); struct page *lnet_kvaddr_to_page(unsigned long vaddr); int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); -unsigned int lnet_get_lnd_timeout(void); void lnet_register_lnd(struct lnet_lnd *lnd); void lnet_unregister_lnd(struct lnet_lnd *lnd); @@ -866,45 +801,10 @@ int lnet_sock_connect(struct socket **sockp, int *fatal, int lnet_peers_start_down(void); int lnet_peer_buffer_credits(struct lnet_net *net); -int lnet_monitor_thr_start(void); -void lnet_monitor_thr_stop(void); - -bool lnet_router_checker_active(void); -void lnet_check_routers(void); -int lnet_router_pre_mt_start(void); -void lnet_router_post_mt_start(void); -void lnet_prune_rc_data(int wait_unlink); -void lnet_router_cleanup(void); +int lnet_router_checker_start(void); +void lnet_router_checker_stop(void); void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net); -void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); - -int lnet_ping_info_validate(struct lnet_ping_info *pinfo); -struct lnet_ping_buffer *lnet_ping_buffer_alloc(int nnis, gfp_t gfp); -void lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf); - -static inline void lnet_ping_buffer_addref(struct lnet_ping_buffer *pbuf) -{ - atomic_inc(&pbuf->pb_refcnt); -} - -static inline void lnet_ping_buffer_decref(struct lnet_ping_buffer *pbuf) -{ - if (atomic_dec_and_test(&pbuf->pb_refcnt)) - lnet_ping_buffer_free(pbuf); -} - -static inline int lnet_ping_buffer_numref(struct lnet_ping_buffer *pbuf) -{ - return atomic_read(&pbuf->pb_refcnt); -} - -static inline int lnet_push_target_resize_needed(void) -{ - return the_lnet.ln_push_target->pb_nnis < the_lnet.ln_push_target_nnis; -} - -int lnet_push_target_resize(void); -void lnet_peer_push_event(struct lnet_event *ev); +void lnet_swap_pinginfo(struct lnet_ping_info *info); int lnet_parse_ip2nets(char **networksp, char *ip2nets); int lnet_parse_routes(char *route_str, int *im_a_router); @@ -919,115 +819,94 @@ __u32 lnet_get_dlc_seq_locked(void); struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, struct lnet_peer_ni *prev); -struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, - int cpt); +struct lnet_peer *lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt); +struct 
lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); -struct lnet_peer *lnet_find_peer(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid); -int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); -int lnet_peer_discovery_start(void); -void lnet_peer_discovery_stop(void); -void lnet_push_update_to_peers(int force); void lnet_peer_tables_cleanup(struct lnet_net *net); void lnet_peer_uninit(void); int lnet_peer_tables_create(void); void lnet_debug_peer(lnet_nid_t nid); struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id); -bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid); -int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid); -int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); -int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); -int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); +bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, + struct lnet_ni *ni); +int lnet_add_peer_ni_to_peer(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); +int lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, + bool *mr, + struct lnet_peer_ni_credit_info __user *peer_ni_info, + struct lnet_ioctl_element_stats __user *peer_ni_stats); int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, char alivness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, __u32 *peer_tx_qnob); -int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats); -static inline struct lnet_peer_net * -lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id) + +static inline __u32 +lnet_get_num_peer_nis(struct lnet_peer *peer) { - struct lnet_peer_net *peer_net; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + __u32 count = 0; - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { - if (peer_net->lpn_net_id == net_id) - return peer_net; - } + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_on_peer_list) + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_on_peer_net_list) + count++; - return NULL; + return count; } -static inline void -lnet_peer_set_alive(struct lnet_peer_ni *lp) +static inline bool +lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni) { - lp->lpni_last_alive = ktime_get_seconds(); - lp->lpni_last_query = lp->lpni_last_alive; - if (!lp->lpni_alive) - lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); + return lpni->lpni_healthy; } -static inline bool -lnet_peer_is_multi_rail(struct lnet_peer *lp) +static inline void +lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health) { - if (lp->lp_state & LNET_PEER_MULTI_RAIL) - return true; - return false; + lpni->lpni_healthy = health; } static inline bool -lnet_peer_ni_is_configured(struct lnet_peer_ni *lpni) +lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net) { - if (lpni->lpni_peer_net->lpn_peer->lp_state & LNET_PEER_CONFIGURED) - return true; + struct lnet_peer_ni *lpni; + + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_on_peer_net_list) { + if (lnet_is_peer_ni_healthy_locked(lpni)) + return true; + } + return false; } static inline 
bool -lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) +lnet_is_peer_healthy_locked(struct lnet_peer *peer) { - return lpni->lpni_nid == lpni->lpni_peer_net->lpn_peer->lp_primary_nid; -} + struct lnet_peer_net *peer_net; -bool lnet_peer_is_uptodate(struct lnet_peer *lp); -bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); -bool lnet_is_discovery_disabled(struct lnet_peer *lp); + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (lnet_is_peer_net_healthy_locked(peer_net)) + return true; + } -static inline bool -lnet_peer_needs_push(struct lnet_peer *lp) -{ - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) - return false; - if (lp->lp_state & LNET_PEER_FORCE_PUSH) - return true; - if (lp->lp_state & LNET_PEER_NO_DISCOVERY) - return false; - /* if discovery is not enabled then no need to push */ - if (lnet_peer_discovery_disabled) - return false; - if (lp->lp_node_seqno < atomic_read(&the_lnet.ln_ping_target_seqno)) - return true; return false; } static inline void -lnet_inc_healthv(atomic_t *healthv) +lnet_peer_set_alive(struct lnet_peer_ni *lp) { - atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE); + lp->lpni_last_alive = lp->lpni_last_query = cfs_time_current(); + if (!lp->lpni_alive) + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); } -void lnet_incr_stats(struct lnet_element_stats *stats, - enum lnet_msg_type msg_type, - enum lnet_stats_type stats_type); - -__u32 lnet_sum_stats(struct lnet_element_stats *stats, - enum lnet_stats_type stats_type); - -void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, - struct lnet_element_stats *stats); - #endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h index 496a1b0fe0f93..9b8af0e45a4c8 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,56 +44,26 @@ #include #include -#include #include -#include -#include +#include /* Max payload size */ -#define LNET_MAX_PAYLOAD LNET_MTU - -/** limit on the number of fragments in discontiguous MDs */ -#define LNET_MAX_IOV 256 +#ifndef CONFIG_LNET_MAX_PAYLOAD +# error "CONFIG_LNET_MAX_PAYLOAD must be defined in config.h" +#endif -/* - * This is the maximum health value. - * All local and peer NIs created have their health default to this value. 
- */ -#define LNET_MAX_HEALTH_VALUE 1000 +#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD +#if (LNET_MAX_PAYLOAD < LNET_MTU) +# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" +#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) +# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" +#endif /* forward refs */ struct lnet_libmd; -enum lnet_msg_hstatus { - LNET_MSG_STATUS_OK = 0, - LNET_MSG_STATUS_LOCAL_INTERRUPT, - LNET_MSG_STATUS_LOCAL_DROPPED, - LNET_MSG_STATUS_LOCAL_ABORTED, - LNET_MSG_STATUS_LOCAL_NO_ROUTE, - LNET_MSG_STATUS_LOCAL_ERROR, - LNET_MSG_STATUS_LOCAL_TIMEOUT, - LNET_MSG_STATUS_REMOTE_ERROR, - LNET_MSG_STATUS_REMOTE_DROPPED, - LNET_MSG_STATUS_REMOTE_TIMEOUT, - LNET_MSG_STATUS_NETWORK_TIMEOUT, - LNET_MSG_STATUS_END, -}; - -struct lnet_rsp_tracker { - /* chain on the waiting list */ - struct list_head rspt_on_list; - /* cpt to lock */ - int rspt_cpt; - /* nid of next hop */ - lnet_nid_t rspt_next_hop_nid; - /* deadline of the REPLY/ACK */ - ktime_t rspt_deadline; - /* parent MD */ - struct lnet_handle_md rspt_mdh; -}; - -struct lnet_msg { +typedef struct lnet_msg { struct list_head msg_activelist; struct list_head msg_list; /* Q for credits/MD */ @@ -104,28 +74,6 @@ struct lnet_msg { lnet_nid_t msg_from; __u32 msg_type; - /* - * hold parameters in case message is with held due - * to discovery - */ - lnet_nid_t msg_src_nid_param; - lnet_nid_t msg_rtr_nid_param; - - /* - * Deadline for the message after which it will be finalized if it - * has not completed. - */ - ktime_t msg_deadline; - - /* The message health status. */ - enum lnet_msg_hstatus msg_health_status; - /* This is a recovery message */ - bool msg_recovery; - /* the number of times a transmission has been retried */ - int msg_retry_count; - /* flag to indicate that we do not want to resend this message */ - bool msg_no_resend; - /* committed for sending */ unsigned int msg_tx_committed:1; /* CPT # this message committed for sending */ @@ -172,17 +120,17 @@ struct lnet_msg { struct lnet_event msg_ev; struct lnet_hdr msg_hdr; -}; +} lnet_msg_t; -struct lnet_libhandle { +typedef struct lnet_libhandle { struct list_head lh_hash_chain; __u64 lh_cookie; -}; +} lnet_libhandle_t; #define lh_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) -struct lnet_eq { +typedef struct lnet_eq { struct list_head eq_list; struct lnet_libhandle eq_lh; unsigned long eq_enq_seq; @@ -191,9 +139,9 @@ struct lnet_eq { lnet_eq_handler_t eq_callback; struct lnet_event *eq_events; int **eq_refs; /* percpt refcount for EQ */ -}; +} lnet_eq_t; -struct lnet_me { +typedef struct lnet_me { struct list_head me_list; struct lnet_libhandle me_lh; struct lnet_process_id me_match_id; @@ -203,41 +151,40 @@ struct lnet_me { __u64 me_ignore_bits; enum lnet_unlink me_unlink; struct lnet_libmd *me_md; -}; - -struct lnet_libmd { - struct list_head md_list; - struct lnet_libhandle md_lh; - struct lnet_me *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - unsigned int md_niov; /* # frags at end of struct */ - void *md_user_ptr; - struct lnet_rsp_tracker *md_rspt_ptr; - struct lnet_eq *md_eq; - struct lnet_handle_md md_bulk_handle; +} lnet_me_t; + +typedef struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + 
unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_eq *md_eq; + struct lnet_handle_md md_bulk_handle; union { - struct kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; } md_iov; -}; +} lnet_libmd_t; #define LNET_MD_FLAG_ZOMBIE (1 << 0) #define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) #define LNET_MD_FLAG_ABORTED (1 << 2) -struct lnet_test_peer { +typedef struct lnet_test_peer { /* info about peers we are trying to fail */ struct list_head tp_list; /* ln_test_peers */ lnet_nid_t tp_nid; /* matching nid */ unsigned int tp_threshold; /* # failures to simulate */ -}; +} lnet_test_peer_t; #define LNET_COOKIE_TYPE_MD 1 #define LNET_COOKIE_TYPE_ME 2 @@ -248,7 +195,7 @@ struct lnet_test_peer { struct lnet_ni; /* forward ref */ struct socket; -struct lnet_lnd { +typedef struct lnet_lnd { /* fields managed by portals */ struct list_head lnd_list; /* stash in the LND table */ int lnd_refcount; /* # active instances */ @@ -302,11 +249,17 @@ struct lnet_lnd { void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); /* query of peer aliveness */ - void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time64_t *when); + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); /* accept a new connection */ int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); -}; +} lnd_t; + +typedef struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; struct lnet_tx_queue { int tq_credits; /* # tx credits free */ @@ -327,51 +280,22 @@ enum lnet_net_state { }; enum lnet_ni_state { - /* initial state when NI is created */ + /* set when NI block is allocated */ LNET_NI_STATE_INIT = 0, - /* set when NI is brought up */ + /* set when NI is started successfully */ LNET_NI_STATE_ACTIVE, - /* set when NI is being shutdown */ - LNET_NI_STATE_DELETING, -}; - -#define LNET_NI_RECOVERY_PENDING BIT(0) -#define LNET_NI_RECOVERY_FAILED BIT(1) - -enum lnet_stats_type { - LNET_STATS_TYPE_SEND = 0, - LNET_STATS_TYPE_RECV, - LNET_STATS_TYPE_DROP -}; - -struct lnet_comm_count { - atomic_t co_get_count; - atomic_t co_put_count; - atomic_t co_reply_count; - atomic_t co_ack_count; - atomic_t co_hello_count; + /* set when LND notifies NI failed */ + LNET_NI_STATE_FAILED, + /* set when LND notifies NI degraded */ + LNET_NI_STATE_DEGRADED, + /* set when shuttding down NI */ + LNET_NI_STATE_DELETING }; struct lnet_element_stats { - struct lnet_comm_count el_send_stats; - struct lnet_comm_count el_recv_stats; - struct lnet_comm_count el_drop_stats; -}; - -struct lnet_health_local_stats { - atomic_t hlt_local_interrupt; - atomic_t hlt_local_dropped; - atomic_t hlt_local_aborted; - atomic_t hlt_local_no_route; - atomic_t hlt_local_timeout; - atomic_t hlt_local_error; -}; - -struct lnet_health_remote_stats { - atomic_t hlt_remote_dropped; - atomic_t hlt_remote_timeout; - atomic_t hlt_remote_error; - atomic_t hlt_network_timeout; + atomic_t send_count; + atomic_t recv_count; + atomic_t drop_count; }; struct lnet_net { @@ -418,15 +342,12 @@ struct lnet_net { enum lnet_net_state net_state; }; -struct lnet_ni { +typedef struct lnet_ni { /* chain on the lnet_net structure */ struct list_head ni_netlist; - /* chain on the recovery queue */ - struct list_head ni_recovery; - - /* MD handle for recovery ping */ - struct lnet_handle_md 
ni_ping_mdh; + /* chain on net_ni_cpt */ + struct list_head ni_cptlist; spinlock_t ni_lock; @@ -452,7 +373,7 @@ struct lnet_ni { int **ni_refs; /* when I was last alive */ - time64_t ni_last_alive; + long ni_last_alive; /* pointer to parent network */ struct lnet_net *ni_net; @@ -460,12 +381,9 @@ struct lnet_ni { /* my health status */ struct lnet_ni_status *ni_status; - /* NI FSM. Protected by lnet_ni_lock() */ + /* NI FSM */ enum lnet_ni_state ni_state; - /* Recovery state. Protected by lnet_ni_lock() */ - __u32 ni_recovery_state; - /* per NI LND tunables */ struct lnet_lnd_tunables ni_lnd_tunables; @@ -474,7 +392,6 @@ struct lnet_ni { /* NI statistics */ struct lnet_element_stats ni_stats; - struct lnet_health_local_stats ni_hstats; /* physical device CPT */ int ni_dev_cpt; @@ -482,69 +399,50 @@ struct lnet_ni { /* sequence number used to round robin over nis within a net */ __u32 ni_seq; - /* - * health value - * initialized to LNET_MAX_HEALTH_VALUE - * Value is decremented every time we fail to send a message over - * this NI because of a NI specific failure. - * Value is incremented if we successfully send a message. - */ - atomic_t ni_healthv; - - /* - * Set to 1 by the LND when it receives an event telling it the device - * has gone into a fatal state. Set to 0 when the LND receives an - * even telling it the device is back online. - */ - atomic_t ni_fatal_error_on; - /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured */ - char *ni_interfaces[LNET_INTERFACES_NUM]; + char *ni_interfaces[LNET_NUM_INTERFACES]; struct net *ni_net_ns; /* original net namespace */ -}; +} lnet_ni_t; #define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL -/* - * Descriptor of a ping info buffer: keep a separate indicator of the - * size and a reference count. The type is used both as a source and - * sink of data, so we need to keep some information outside of the - * area that may be overwritten by network data. 
- */ -struct lnet_ping_buffer { - int pb_nnis; - atomic_t pb_refcnt; - struct lnet_ping_info pb_info; -}; +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ -#define LNET_PING_BUFFER_SIZE(NNIDS) \ - offsetof(struct lnet_ping_buffer, pb_info.pi_ni[NNIDS]) -#define LNET_PING_BUFFER_LONI(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_nid) -#define LNET_PING_BUFFER_SEQNO(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_status) +#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS) -#define LNET_PING_INFO_TO_BUFFER(PINFO) \ - container_of((PINFO), struct lnet_ping_buffer, pb_info) +typedef struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; /* router checker data, per router */ -struct lnet_rc_data { +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(struct lnet_ping_info, pi_ni[LNET_MAX_RTR_NIS]) +typedef struct lnet_rc_data { /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ struct list_head rcd_list; struct lnet_handle_md rcd_mdh; /* ping buffer MD */ struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ - struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */ - int rcd_nnis; /* desired size of buffer */ -}; + struct lnet_ping_info *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; struct lnet_peer_ni { - /* chain on lpn_peer_nis */ - struct list_head lpni_peer_nis; + /* chain on peer_net */ + struct list_head lpni_on_peer_net_list; /* chain on remote peer list */ struct list_head lpni_on_remote_peer_ni_list; - /* chain on recovery queue */ - struct list_head lpni_recovery; /* chain on peer hash */ struct list_head lpni_hashlist; /* messages blocking for tx credits */ @@ -557,7 +455,6 @@ struct lnet_peer_ni { struct lnet_peer_net *lpni_peer_net; /* statistics kept on each peer NI */ struct lnet_element_stats lpni_stats; - struct lnet_health_remote_stats lpni_hstats; /* spin lock protecting credits and lpni_txq / lpni_rtrq */ spinlock_t lpni_lock; /* # tx credits available */ @@ -583,29 +480,23 @@ struct lnet_peer_ni { /* # times router went dead<->alive. 
Protected with lpni_lock */ int lpni_alive_count; /* time of last aliveness news */ - time64_t lpni_timestamp; + cfs_time_t lpni_timestamp; /* time of last ping attempt */ - time64_t lpni_ping_timestamp; + cfs_time_t lpni_ping_timestamp; /* != 0 if ping reply expected */ - time64_t lpni_ping_deadline; + cfs_time_t lpni_ping_deadline; /* when I was last alive */ - time64_t lpni_last_alive; + cfs_time_t lpni_last_alive; /* when lpni_ni was queried last time */ - time64_t lpni_last_query; + cfs_time_t lpni_last_query; /* network peer is on */ struct lnet_net *lpni_net; /* peer's NID */ lnet_nid_t lpni_nid; /* # refs */ atomic_t lpni_refcount; - /* health value for the peer */ - atomic_t lpni_healthv; - /* recovery ping mdh */ - struct lnet_handle_md lpni_recovery_ping_mdh; /* CPT this peer attached on */ int lpni_cpt; - /* state flags -- protected by lpni_lock */ - unsigned lpni_state; /* # refs from lnet_route_t::lr_gateway */ int lpni_rtr_refcount; /* sequence number used to round robin over peer nis within a net */ @@ -618,148 +509,31 @@ struct lnet_peer_ni { unsigned int lpni_ping_feats; /* routes on this peer */ struct list_head lpni_routes; - /* preferred local nids: if only one, use lpni_pref.nid */ - union lpni_pref { - lnet_nid_t nid; - lnet_nid_t *nids; - } lpni_pref; + /* array of preferred local nids */ + lnet_nid_t *lpni_pref_nids; /* number of preferred NIDs in lnpi_pref_nids */ __u32 lpni_pref_nnids; /* router checker state */ struct lnet_rc_data *lpni_rcd; }; -/* Preferred path added due to traffic on non-MR peer_ni */ -#define LNET_PEER_NI_NON_MR_PREF (1 << 0) -/* peer is being recovered. */ -#define LNET_PEER_NI_RECOVERY_PENDING (1 << 1) -/* recovery ping failed */ -#define LNET_PEER_NI_RECOVERY_FAILED (1 << 2) -/* peer is being deleted */ -#define LNET_PEER_NI_DELETING (1 << 3) - struct lnet_peer { - /* chain on pt_peer_list */ - struct list_head lp_peer_list; + /* chain on global peer list */ + struct list_head lp_on_lnet_peer_list; /* list of peer nets */ struct list_head lp_peer_nets; - /* list of messages pending discovery*/ - struct list_head lp_dc_pendq; - /* primary NID of the peer */ lnet_nid_t lp_primary_nid; - /* source NID to use during discovery */ - lnet_nid_t lp_disc_src_nid; - - /* CPT of peer_table */ - int lp_cpt; - - /* number of NIDs on this peer */ - int lp_nnis; - - /* reference count */ - atomic_t lp_refcount; - - /* lock protecting peer state flags */ - spinlock_t lp_lock; - - /* peer state flags */ - unsigned lp_state; - - /* buffer for data pushed by peer */ - struct lnet_ping_buffer *lp_data; - - /* MD handle for ping in progress */ - struct lnet_handle_md lp_ping_mdh; - - /* MD handle for push in progress */ - struct lnet_handle_md lp_push_mdh; - - /* number of NIDs for sizing push data */ - int lp_data_nnis; - - /* NI config sequence number of peer */ - __u32 lp_peer_seqno; - - /* Local NI config sequence number acked by peer */ - __u32 lp_node_seqno; - - /* Local NI config sequence number sent to peer */ - __u32 lp_node_seqno_sent; - - /* Ping error encountered during discovery. */ - int lp_ping_error; - - /* Push error encountered during discovery. */ - int lp_push_error; - - /* Error encountered during discovery. 
*/ - int lp_dc_error; - - /* time it was put on the ln_dc_working queue */ - time64_t lp_last_queued; - - /* link on discovery-related lists */ - struct list_head lp_dc_list; - - /* tasks waiting on discovery of this peer */ - wait_queue_head_t lp_dc_waitq; + /* peer is Multi-Rail enabled peer */ + bool lp_multi_rail; }; -/* - * The status flags in lp_state. Their semantics have chosen so that - * lp_state can be zero-initialized. - * - * A peer is marked MULTI_RAIL in two cases: it was configured using DLC - * as multi-rail aware, or the LNET_PING_FEAT_MULTI_RAIL bit was set. - * - * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was - * NOT set when the peer was pinged by discovery. - */ -#define LNET_PEER_MULTI_RAIL (1 << 0) /* Multi-rail aware */ -#define LNET_PEER_NO_DISCOVERY (1 << 1) /* Peer disabled discovery */ -/* - * A peer is marked CONFIGURED if it was configured by DLC. - * - * In addition, a peer is marked DISCOVERED if it has fully passed - * through Peer Discovery. - * - * When Peer Discovery is disabled, the discovery thread will mark - * peers REDISCOVER to indicate that they should be re-examined if - * discovery is (re)enabled on the node. - * - * A peer that was created as the result of inbound traffic will not - * be marked at all. - */ -#define LNET_PEER_CONFIGURED (1 << 2) /* Configured via DLC */ -#define LNET_PEER_DISCOVERED (1 << 3) /* Peer was discovered */ -#define LNET_PEER_REDISCOVER (1 << 4) /* Discovery was disabled */ -/* - * A peer is marked DISCOVERING when discovery is in progress. - * The other flags below correspond to stages of discovery. - */ -#define LNET_PEER_DISCOVERING (1 << 5) /* Discovering */ -#define LNET_PEER_DATA_PRESENT (1 << 6) /* Remote peer data present */ -#define LNET_PEER_NIDS_UPTODATE (1 << 7) /* Remote peer info uptodate */ -#define LNET_PEER_PING_SENT (1 << 8) /* Waiting for REPLY to Ping */ -#define LNET_PEER_PUSH_SENT (1 << 9) /* Waiting for ACK of Push */ -#define LNET_PEER_PING_FAILED (1 << 10) /* Ping send failure */ -#define LNET_PEER_PUSH_FAILED (1 << 11) /* Push send failure */ -/* - * A ping can be forced as a way to fix up state, or as a manual - * intervention by an admin. - * A push can be forced in circumstances that would normally not - * allow for one to happen. - */ -#define LNET_PEER_FORCE_PING (1 << 12) /* Forced Ping */ -#define LNET_PEER_FORCE_PUSH (1 << 13) /* Forced Push */ - struct lnet_peer_net { - /* chain on lp_peer_nets */ - struct list_head lpn_peer_nets; + /* chain on peer block */ + struct list_head lpn_on_peer_list; /* list of peer_nis on this network */ struct list_head lpn_peer_nis; @@ -769,38 +543,19 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; - - /* reference count */ - atomic_t lpn_refcount; }; /* peer hash size */ #define LNET_PEER_HASH_BITS 9 #define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) -/* - * peer hash table - one per CPT - * - * protected by lnet_net_lock/EX for update - * pt_version - * pt_number - * pt_hash[...] 
- * pt_peer_list - * pt_peers - * protected by pt_zombie_lock: - * pt_zombie_list - * pt_zombies - * - * pt_zombie lock nests inside lnet_net_lock - */ +/* peer hash table */ struct lnet_peer_table { int pt_version; /* /proc validity stamp */ - int pt_number; /* # peers_ni extant */ + atomic_t pt_number; /* # peers extant */ struct list_head *pt_hash; /* NID->peer hash */ - struct list_head pt_peer_list; /* peers */ - int pt_peers; /* # peers */ - struct list_head pt_zombie_list; /* zombie peer_ni */ - int pt_zombies; /* # zombie peers_ni */ + struct list_head pt_zombie_list; /* zombie peers */ + int pt_zombies; /* # zombie peers */ spinlock_t pt_zombie_lock; /* protect list and count */ }; @@ -811,7 +566,7 @@ struct lnet_peer_table { ((lp)->lpni_net) && \ (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) -struct lnet_route { +typedef struct lnet_route { struct list_head lr_list; /* chain on net */ struct list_head lr_gwlist; /* chain on gateway */ struct lnet_peer_ni *lr_gateway; /* router node */ @@ -820,29 +575,27 @@ struct lnet_route { unsigned int lr_downis; /* number of down NIs */ __u32 lr_hops; /* how far I am */ unsigned int lr_priority; /* route priority */ -}; +} lnet_route_t; #define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) #define LNET_REMOTE_NETS_HASH_MAX (1U << 16) #define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) -struct lnet_remotenet { +typedef struct lnet_remotenet { /* chain on ln_remote_nets_hash */ struct list_head lrn_list; /* routes to me */ struct list_head lrn_routes; /* my net number */ __u32 lrn_net; -}; +} lnet_remotenet_t; /** lnet message has credit and can be submitted to lnd for send/receive */ #define LNET_CREDIT_OK 0 /** lnet message is waiting for credit */ #define LNET_CREDIT_WAIT 1 -/** lnet message is waiting for discovery */ -#define LNET_DC_WAIT 2 -struct lnet_rtrbufpool { +typedef struct lnet_rtrbufpool { /* my free buffer pool */ struct list_head rbp_bufs; /* messages blocking for a buffer */ @@ -857,13 +610,13 @@ struct lnet_rtrbufpool { int rbp_credits; /* low water mark */ int rbp_mincredits; -}; +} lnet_rtrbufpool_t; -struct lnet_rtrbuf { +typedef struct lnet_rtrbuf { struct list_head rb_list; /* chain on rbp_bufs */ struct lnet_rtrbufpool *rb_pool; /* owning pool */ lnet_kiov_t rb_kiov[0]; /* the buffer space */ -}; +} lnet_rtrbuf_t; #define LNET_PEER_HASHSIZE 503 /* prime! */ @@ -933,7 +686,7 @@ struct lnet_match_table { /* dispatch routed PUT message by hashing source NID for wildcard portals */ #define LNET_PTL_ROTOR_HASH_RT 3 -struct lnet_portal { +typedef struct lnet_portal { spinlock_t ptl_lock; unsigned int ptl_index; /* portal ID, reserved */ /* flags on this portal: lazy, unique... 
*/ @@ -950,7 +703,7 @@ struct lnet_portal { int ptl_mt_nmaps; /* array of active entries' cpu-partition-id */ int ptl_mt_maps[0]; -}; +} lnet_portal_t; #define LNET_LH_HASH_BITS 12 #define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) @@ -971,31 +724,22 @@ struct lnet_msg_container { int msc_nfinalizers; /* msgs waiting to complete finalizing */ struct list_head msc_finalizing; - /* msgs waiting to be resent */ - struct list_head msc_resending; struct list_head msc_active; /* active message list */ /* threads doing finalization */ void **msc_finalizers; - /* threads doing resends */ - void **msc_resenders; }; -/* Peer Discovery states */ -#define LNET_DC_STATE_SHUTDOWN 0 /* not started */ -#define LNET_DC_STATE_RUNNING 1 /* started up OK */ -#define LNET_DC_STATE_STOPPING 2 /* telling thread to stop */ - /* Router Checker states */ -#define LNET_MT_STATE_SHUTDOWN 0 /* not started */ -#define LNET_MT_STATE_RUNNING 1 /* started up OK */ -#define LNET_MT_STATE_STOPPING 2 /* telling thread to stop */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ /* LNet states */ #define LNET_STATE_SHUTDOWN 0 /* not started */ #define LNET_STATE_RUNNING 1 /* started up OK */ #define LNET_STATE_STOPPING 2 /* telling thread to stop */ -struct lnet { +typedef struct lnet { /* CPU partition table of LNet */ struct cfs_cpt_table *ln_cpt_table; /* number of CPTs in ln_cpt_table */ @@ -1026,6 +770,8 @@ struct lnet { struct lnet_msg_container **ln_msg_containers; struct lnet_counters **ln_counters; struct lnet_peer_table **ln_peer_tables; + /* list of configured or discovered peers */ + struct list_head ln_peers; /* list of peer nis not on a local network */ struct list_head ln_remote_peer_ni_list; /* failure simulation */ @@ -1038,10 +784,6 @@ struct lnet { struct lnet_ni *ln_loni; /* network zombie list */ struct list_head ln_net_zombie; - /* resend messages list */ - struct list_head ln_msg_resend; - /* spin lock to protect the msg resend list */ - spinlock_t ln_msg_resend_lock; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; @@ -1054,46 +796,12 @@ struct lnet { /* percpt router buffer pools */ struct lnet_rtrbufpool **ln_rtrpools; - /* - * Ping target / Push source - * - * The ping target and push source share a single buffer. The - * ln_ping_target is protected against concurrent updates by - * ln_api_mutex. - */ struct lnet_handle_md ln_ping_target_md; struct lnet_handle_eq ln_ping_target_eq; - struct lnet_ping_buffer *ln_ping_target; - atomic_t ln_ping_target_seqno; + struct lnet_ping_info *ln_ping_info; - /* - * Push Target - * - * ln_push_nnis contains the desired size of the push target. - * The lnet_net_lock is used to handle update races. The old - * buffer may linger a while after it has been unlinked, in - * which case the event handler cleans up. 
- */ - struct lnet_handle_eq ln_push_target_eq; - struct lnet_handle_md ln_push_target_md; - struct lnet_ping_buffer *ln_push_target; - int ln_push_target_nnis; - - /* discovery event queue handle */ - struct lnet_handle_eq ln_dc_eqh; - /* discovery requests */ - struct list_head ln_dc_request; - /* discovery working list */ - struct list_head ln_dc_working; - /* discovery expired list */ - struct list_head ln_dc_expired; - /* discovery thread wait queue */ - wait_queue_head_t ln_dc_waitq; - /* discovery startup/shutdown state */ - int ln_dc_state; - - /* monitor thread startup/shutdown state */ - int ln_mt_state; + /* router checker startup/shutdown state */ + int ln_rc_state; /* router checker's event queue */ struct lnet_handle_eq ln_rc_eqh; /* rcd still pending on net */ @@ -1101,7 +809,7 @@ struct lnet { /* rcd ready for free */ struct list_head ln_rcd_zombie; /* serialise startup/shutdown */ - struct semaphore ln_mt_signal; + struct semaphore ln_rc_signal; struct mutex ln_api_mutex; struct mutex ln_lnd_mutex; @@ -1129,36 +837,10 @@ struct lnet { */ bool ln_nis_from_mod_params; - /* - * waitq for the monitor thread. The monitor thread takes care of - * checking routes, timedout messages and resending messages. - */ - wait_queue_head_t ln_mt_waitq; - - /* per-cpt resend queues */ - struct list_head **ln_mt_resendqs; - /* local NIs to recover */ - struct list_head ln_mt_localNIRecovq; - /* local NIs to recover */ - struct list_head ln_mt_peerNIRecovq; - /* - * An array of queues for GET/PUT waiting for REPLY/ACK respectively. - * There are CPT number of queues. Since response trackers will be - * added on the fast path we can't afford to grab the exclusive - * net lock to protect these queues. The CPT will be calculated - * based on the mdh cookie. - */ - struct list_head **ln_mt_rstq; - /* - * A response tracker becomes a zombie when the associated MD is queued - * for unlink before the response tracker is detached from the MD. An - * entry on a zombie list can be freed when either the remaining - * operations on the MD complete or when LNet has shut down. - */ - struct list_head **ln_mt_zombie_rstqs; - /* recovery eq handler */ - struct lnet_handle_eq ln_mt_eqh; - -}; + /* waitq for router checker. As long as there are no routes in + * the list, the router checker will sleep on this queue. when + * routes are added the thread will wake up */ + wait_queue_head_t ln_rc_waitq; +} lnet_t; #endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h similarity index 74% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h rename to drivers/staging/lustrefsx/lnet/include/lnet/lnet.h index 6453e053fa99d..54061f593496e 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h @@ -22,23 +22,25 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * #defines shared between socknal implementation and utilities */ -#ifndef __UAPI_LNET_SOCKLND_H__ -#define __UAPI_LNET_SOCKLND_H__ -#define SOCKLND_CONN_NONE (-1) -#define SOCKLND_CONN_ANY 0 -#define SOCKLND_CONN_CONTROL 1 -#define SOCKLND_CONN_BULK_IN 2 -#define SOCKLND_CONN_BULK_OUT 3 -#define SOCKLND_CONN_NTYPES 4 +#ifndef __LNET_H__ +#define __LNET_H__ + +/* + * lnet.h + * + * User application interface file + */ -#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN +#include +#include +#include #endif diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h similarity index 76% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h rename to drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h index cb4f153e377d1..4328135c5ec72 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h @@ -17,23 +17,12 @@ * header for lnet ioctl */ /* - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. */ -#ifndef __UAPI_LNETCTL_H_ -#define __UAPI_LNETCTL_H_ +#ifndef _LNETCTL_H_ +#define _LNETCTL_H_ -#include -/* - * This is due to us being out of kernel and the way the OpenSFS branch - * handles CFLAGS. - */ -#ifdef __KERNEL__ -# include -#else -# include -#endif - -#include +#include /** \addtogroup lnet_fault_simulation * @{ */ @@ -54,19 +43,6 @@ enum { #define LNET_GET_BIT (1 << 2) #define LNET_REPLY_BIT (1 << 3) -#define HSTATUS_END 11 -#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) -#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) -#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) -#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) -#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) -#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) -#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) -#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) -#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) -#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) -#define HSTATUS_RANDOM 0xffffffff - /** ioctl parameter for LNet fault simulation */ struct lnet_fault_attr { /** @@ -104,10 +80,6 @@ struct lnet_fault_attr { * with da_rate */ __u32 da_interval; - /** error type mask */ - __u32 da_health_error_mask; - /** randomize error generation */ - bool da_random; } drop; /** message latency simulation */ struct { diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h similarity index 99% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h rename to drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h index ca871cac02b7b..7071039d9aa38 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h @@ -29,13 +29,13 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. 
* + * lnet/include/lnet/lnetst.h + * * Author: Liang Zhen */ -#ifndef __UAPI_LNET_ST_H__ -#define __UAPI_LNET_ST_H__ - -#include +#ifndef __LNET_ST_H__ +#define __LNET_ST_H__ #define LST_FEAT_NONE (0) #define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ @@ -67,7 +67,7 @@ struct lst_sid { lnet_nid_t ses_nid; /* nid of console node */ - __s64 ses_stamp; /* time stamp in milliseconds */ + __u64 ses_stamp; /* time stamp */ }; /*** session id */ extern struct lst_sid LST_INVALID_SID; diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h similarity index 93% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h rename to drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h index c41b9158ecd7d..be14a1dfcf71d 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h @@ -23,21 +23,11 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2014, 2015, Intel Corporation. */ #ifndef _LNET_NIDSTRINGS_H #define _LNET_NIDSTRINGS_H - -#include -/* - * This is due to us being out of kernel and the way the OpenSFS branch - * handles CFLAGS. - */ -#ifdef __KERNEL__ -# include -#else -# include -#endif +#include /** * Lustre Network Driver types. diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h index e2c19f2a4ed35..843d35c06105a 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -28,12 +28,22 @@ * Lustre is a trademark of Sun Microsystems, Inc. * * lnet/include/lnet/socklnd.h + * + * #defines shared between socknal implementation and utilities */ #ifndef __LNET_LNET_SOCKLND_H__ #define __LNET_LNET_SOCKLND_H__ -#include -#include +#include + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN struct ksock_hello_msg { __u32 kshm_magic; /* magic number of socklnd message */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/types.h similarity index 85% rename from drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h rename to drivers/staging/lustrefsx/lnet/include/lnet/types.h index 1f7828c8c9c15..e4bfe3d4951dd 100644 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/types.h @@ -23,15 +23,15 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. */ -#ifndef __UAPI_LNET_TYPES_H__ -#define __UAPI_LNET_TYPES_H__ +#ifndef __LNET_TYPES_H__ +#define __LNET_TYPES_H__ /** \addtogroup lnet * @{ */ @@ -107,33 +107,30 @@ static inline __u32 LNET_MKNET(__u32 type, __u32 num) return (type << 16) | num; } -/** The lolnd NID (i.e. 
myself) */ -#define LNET_NID_LO_0 LNET_MKNID(LNET_MKNET(LOLND, 0), 0) - #define WIRE_ATTR __attribute__((packed)) /* Packed version of struct lnet_process_id to transfer via network */ -struct lnet_process_id_packed { +typedef struct lnet_process_id_packed { lnet_nid_t nid; lnet_pid_t pid; /* node id / process id */ -} WIRE_ATTR; +} WIRE_ATTR lnet_process_id_packed; /* The wire handle's interface cookie only matches one network interface in * one epoch (i.e. new cookie when the interface restarts or the node * reboots). The object cookie only matches one object on that interface * during that object's lifetime (i.e. no cookie re-use). */ -struct lnet_handle_wire { +typedef struct lnet_handle_wire { __u64 wh_interface_cookie; __u64 wh_object_cookie; -} WIRE_ATTR; +} WIRE_ATTR lnet_handle_wire_t; -enum lnet_msg_type { +typedef enum lnet_msg_type { LNET_MSG_ACK = 0, LNET_MSG_PUT, LNET_MSG_GET, LNET_MSG_REPLY, LNET_MSG_HELLO, -}; +} lnet_msg_type_t; /* The variant fields of the portals message header are aligned on an 8 * byte boundary in the message header. Note that all types used in these @@ -170,7 +167,7 @@ struct lnet_hello { __u32 type; } WIRE_ATTR; -struct lnet_hdr { +typedef struct lnet_hdr { lnet_nid_t dest_nid; lnet_nid_t src_nid; lnet_pid_t dest_pid; @@ -185,7 +182,7 @@ struct lnet_hdr { struct lnet_reply reply; struct lnet_hello hello; } msg; -} WIRE_ATTR; +} WIRE_ATTR lnet_hdr_t; /* A HELLO message contains a magic number and protocol version * code in the header's dest_nid, the peer's NID in the src_nid, and @@ -196,11 +193,11 @@ struct lnet_hdr { * exchange HELLO messages when a connection is first established. Individual * LNDs can put whatever else they fancy in lnet_hdr::msg. */ -struct lnet_magicversion { +typedef struct lnet_magicversion { __u32 magic; /* LNET_PROTO_TCP_MAGIC */ __u16 version_major; /* increment on incompatible change */ __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR; +} WIRE_ATTR lnet_magic_version_t; /* PROTO MAGIC for LNDs */ #define LNET_PROTO_IB_MAGIC 0x0be91b91 @@ -218,109 +215,39 @@ struct lnet_magicversion { #define LNET_PROTO_TCP_VERSION_MINOR 0 /* Acceptor connection request */ -struct lnet_acceptor_connreq { +typedef struct lnet_acceptor_connreq { __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ __u32 acr_version; /* protocol version */ __u64 acr_nid; /* target NID */ -} WIRE_ATTR; +} WIRE_ATTR lnet_acceptor_connreq_t; #define LNET_PROTO_ACCEPTOR_VERSION 1 -struct lnet_counters_common { - __u32 lcc_msgs_alloc; - __u32 lcc_msgs_max; - __u32 lcc_errors; - __u32 lcc_send_count; - __u32 lcc_recv_count; - __u32 lcc_route_count; - __u32 lcc_drop_count; - __u64 lcc_send_length; - __u64 lcc_recv_length; - __u64 lcc_route_length; - __u64 lcc_drop_length; -} WIRE_ATTR; - -struct lnet_counters_health { - __u32 lch_rst_alloc; - __u32 lch_resend_count; - __u32 lch_response_timeout_count; - __u32 lch_local_interrupt_count; - __u32 lch_local_dropped_count; - __u32 lch_local_aborted_count; - __u32 lch_local_no_route_count; - __u32 lch_local_timeout_count; - __u32 lch_local_error_count; - __u32 lch_remote_dropped_count; - __u32 lch_remote_error_count; - __u32 lch_remote_timeout_count; - __u32 lch_network_timeout_count; -}; - -struct lnet_counters { - struct lnet_counters_common lct_common; - struct lnet_counters_health lct_health; -}; +typedef struct lnet_counters { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; 
+ __u64 route_length; + __u64 drop_length; +} WIRE_ATTR lnet_counters_t; #define LNET_NI_STATUS_UP 0x15aac0de #define LNET_NI_STATUS_DOWN 0xdeadface #define LNET_NI_STATUS_INVALID 0x00000000 -struct lnet_ni_status { - lnet_nid_t ns_nid; - __u32 ns_status; - __u32 ns_unused; -} WIRE_ATTR; - -/* - * NB: value of these features equal to LNET_PROTO_PING_VERSION_x - * of old LNet, so there shouldn't be any compatibility issue - */ -#define LNET_PING_FEAT_INVAL (0) /* no feature */ -#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ -#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ -#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ -#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ -#define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ - -/* - * All ping feature bits fit to hit the wire. - * In lnet_assert_wire_constants() this is compared against its open-coded - * value, and in lnet_ping_target_update() it is used to verify that no - * unknown bits have been set. - * New feature bits can be added, just be aware that this does change the - * over-the-wire protocol. - */ -#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ - LNET_PING_FEAT_NI_STATUS | \ - LNET_PING_FEAT_RTE_DISABLED | \ - LNET_PING_FEAT_MULTI_RAIL | \ - LNET_PING_FEAT_DISCOVERY) - -struct lnet_ping_info { - __u32 pi_magic; - __u32 pi_features; - lnet_pid_t pi_pid; - __u32 pi_nnis; - struct lnet_ni_status pi_ni[0]; -} WIRE_ATTR; - -#define LNET_PING_INFO_SIZE(NNIDS) \ - offsetof(struct lnet_ping_info, pi_ni[NNIDS]) -#define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) -#define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) - /* * This is a hard-coded limit on the number of interfaces supported by * the interface bonding implemented by the ksocknal LND. It must be * defined here because it is used in LNet data structures that are * common to all LNDs. */ -#define LNET_INTERFACES_NUM 16 - -/* The minimum number of interfaces per node supported by LNet. */ -#define LNET_INTERFACES_MIN 16 -/* The default - arbitrary - value of the lnet_max_interfaces tunable. */ -#define LNET_INTERFACES_MAX_DEFAULT 200 +#define LNET_NUM_INTERFACES 16 /** * Objects maintained by the LNet are accessed through handles. Handle types @@ -331,9 +258,9 @@ struct lnet_ping_info { */ #define LNET_WIRE_HANDLE_COOKIE_NONE (-1) -struct lnet_handle_eq { +typedef struct lnet_handle_eq { __u64 cookie; -}; +} lnet_handle_eq_t; /** * Invalidate eq handle \a h. @@ -353,9 +280,9 @@ static inline int LNetEQHandleIsInvalid(struct lnet_handle_eq h) return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); } -struct lnet_handle_md { +typedef struct lnet_handle_md { __u64 cookie; -}; +} lnet_handle_md_t; /** * Invalidate md handle \a h. @@ -375,19 +302,19 @@ static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); } -struct lnet_handle_me { +typedef struct lnet_handle_me { __u64 cookie; -}; +} lnet_handle_me_t; /** * Global process ID. */ -struct lnet_process_id { +typedef struct lnet_process_id { /** node id */ lnet_nid_t nid; /** process id */ lnet_pid_t pid; -}; +} lnet_process_id_t; /** @} lnet_addr */ /** \addtogroup lnet_me @@ -397,10 +324,10 @@ struct lnet_process_id { * Specifies whether the match entry or memory descriptor should be unlinked * automatically (LNET_UNLINK) or not (LNET_RETAIN). 
*/ -enum lnet_unlink { +typedef enum lnet_unlink { LNET_RETAIN = 0, LNET_UNLINK -}; +} lnet_unlink_t; /** * Values of the type enum lnet_ins_pos are used to control where a new match @@ -409,14 +336,14 @@ enum lnet_unlink { * LNET_INS_AFTER is used to insert the new entry after the current entry * or after the last item in the list. */ -enum lnet_ins_pos { +typedef enum lnet_ins_pos { /** insert ME before current position or head of the list */ LNET_INS_BEFORE, /** insert ME after current position or tail of the list */ LNET_INS_AFTER, /** attach ME at tail of local CPU partition ME list */ LNET_INS_LOCAL -}; +} lnet_ins_pos; /** @} lnet_me */ @@ -427,7 +354,7 @@ enum lnet_ins_pos { * Defines the visible parts of a memory descriptor. Values of this type * are used to initialize memory descriptors. */ -struct lnet_md { +typedef struct lnet_md { /** * Specify the memory region associated with the memory descriptor. * If the options field has: @@ -531,7 +458,7 @@ struct lnet_md { * if the LNET_MD_BULK_HANDLE option is set. */ struct lnet_handle_md bulk_handle; -}; +} lnet_md_t; /* Max Transfer Unit (minimum supported everywhere). * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) @@ -539,6 +466,9 @@ struct lnet_md { #define LNET_MTU_BITS 20 #define LNET_MTU (1 << LNET_MTU_BITS) +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + /** * Options for the MD structure. See struct lnet_md::options. */ @@ -590,7 +520,7 @@ typedef struct { /** * Six types of events can be logged in an event queue. */ -enum lnet_event_kind { +typedef enum lnet_event_kind { /** An incoming GET operation has completed on the MD. */ LNET_EVENT_GET = 1, /** @@ -626,14 +556,14 @@ enum lnet_event_kind { * \see LNetMDUnlink */ LNET_EVENT_UNLINK, -}; +} lnet_event_kind_t; #define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) /** * Information about an event on a MD. */ -struct lnet_event { +typedef struct lnet_event { /** The identifier (nid, pid) of the target. */ struct lnet_process_id target; /** The identifier (nid, pid) of the initiator. */ @@ -678,11 +608,6 @@ struct lnet_event { * \see LNetPut */ __u64 hdr_data; - /** - * The message type, to ensure a handler for LNET_EVENT_SEND can - * distinguish between LNET_MSG_GET and LNET_MSG_PUT. - */ - __u32 msg_type; /** * Indicates the completion status of the operation. It's 0 for * successful operations, otherwise it's an error code. @@ -707,7 +632,7 @@ struct lnet_event { * to each event. */ volatile unsigned long sequence; -}; +} lnet_event_t; /** * Event queue handler function type. @@ -734,12 +659,12 @@ typedef void (*lnet_eq_handler_t)(struct lnet_event *event); * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE * by which acknowledgments can be disabled for a MD. */ -enum lnet_ack_req { +typedef enum lnet_ack_req { /** Request an acknowledgment */ LNET_ACK_REQ, /** Request that no acknowledgment should be generated. */ LNET_NOACK_REQ -}; +} lnet_ack_req_t; /** @} lnet_data */ /** @} lnet */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h deleted file mode 100644 index 2672fe7ae103d..0000000000000 --- a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_debug.h - * - * Debug messages and assertions - * - */ - -#ifndef __UAPI_LIBCFS_DEBUG_H__ -#define __UAPI_LIBCFS_DEBUG_H__ - -#include - -/** - * Format for debug message headers - */ -struct ptldebug_header { - __u32 ph_len; - __u32 ph_flags; - __u32 ph_subsys; - __u32 ph_mask; - __u16 ph_cpu_id; - __u16 ph_type; - /* time_t overflow in 2106 */ - __u32 ph_sec; - __u64 ph_usec; - __u32 ph_stack; - __u32 ph_pid; - __u32 ph_extern_pid; - __u32 ph_line_num; -} __attribute__((packed)); - -#define PH_FLAG_FIRST_RECORD 1 - -/* Debugging subsystems (32 bits, non-overlapping) */ -#define S_UNDEFINED 0x00000001 -#define S_MDC 0x00000002 -#define S_MDS 0x00000004 -#define S_OSC 0x00000008 -#define S_OST 0x00000010 -#define S_CLASS 0x00000020 -#define S_LOG 0x00000040 -#define S_LLITE 0x00000080 -#define S_RPC 0x00000100 -#define S_MGMT 0x00000200 -#define S_LNET 0x00000400 -#define S_LND 0x00000800 /* ALL LNDs */ -#define S_PINGER 0x00001000 -#define S_FILTER 0x00002000 -#define S_LIBCFS 0x00004000 -#define S_ECHO 0x00008000 -#define S_LDLM 0x00010000 -#define S_LOV 0x00020000 -#define S_LQUOTA 0x00040000 -#define S_OSD 0x00080000 -#define S_LFSCK 0x00100000 -#define S_SNAPSHOT 0x00200000 -/* unused */ -#define S_LMV 0x00800000 /* b_new_cmd */ -/* unused */ -#define S_SEC 0x02000000 /* upcall cache */ -#define S_GSS 0x04000000 /* b_new_cmd */ -/* unused */ -#define S_MGC 0x10000000 -#define S_MGS 0x20000000 -#define S_FID 0x40000000 /* b_new_cmd */ -#define S_FLD 0x80000000 /* b_new_cmd */ - -#define LIBCFS_DEBUG_SUBSYS_NAMES { \ - "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ - "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ - "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ - "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ - "fid", "fld", NULL } - -/* Debugging masks (32 bits, non-overlapping) */ -#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ -#define D_INODE 0x00000002 -#define D_SUPER 0x00000004 -#define D_EXT2 0x00000008 /* anything from ext2_debug */ -#define D_MALLOC 0x00000010 /* print malloc, free information */ -#define D_CACHE 0x00000020 /* cache-related items */ -#define D_INFO 0x00000040 /* general information */ -#define D_IOCTL 0x00000080 /* ioctl related information */ -#define D_NETERROR 0x00000100 /* network errors */ -#define D_NET 0x00000200 /* network communications */ -#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) 
*/ -#define D_BUFFS 0x00000800 -#define D_OTHER 0x00001000 -#define D_DENTRY 0x00002000 -#define D_NETTRACE 0x00004000 -#define D_PAGE 0x00008000 /* bulk page handling */ -#define D_DLMTRACE 0x00010000 -#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ -#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ -#define D_HA 0x00080000 /* recovery and failover */ -#define D_RPCTRACE 0x00100000 /* for distributed debugging */ -#define D_VFSTRACE 0x00200000 -#define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 -#define D_CONFIG 0x01000000 -#define D_CONSOLE 0x02000000 -#define D_QUOTA 0x04000000 -#define D_SEC 0x08000000 -#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ -#define D_HSM 0x20000000 -#define D_SNAPSHOT 0x40000000 /* snapshot */ -#define D_LAYOUT 0x80000000 - -#define LIBCFS_DEBUG_MASKS_NAMES { \ - "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ - "ioctl", "neterror", "net", "warning", "buffs", "other", \ - "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ - "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ - "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ - NULL } - -#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) - -#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" - -#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c index 8a14b86f904c1..90645f6388ea6 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,13 +35,11 @@ */ #include -#include - #include "o2iblnd.h" static struct lnet_lnd the_o2iblnd; -struct kib_data kiblnd_data; +kib_data_t kiblnd_data; static __u32 kiblnd_cksum (void *ptr, int nob) @@ -98,40 +96,41 @@ kiblnd_msgtype2str(int type) static int kiblnd_msgtype2size(int type) { - const int hdr_size = offsetof(struct kib_msg, ibm_u); + const int hdr_size = offsetof(kib_msg_t, ibm_u); switch (type) { case IBLND_MSG_CONNREQ: case IBLND_MSG_CONNACK: - return hdr_size + sizeof(struct kib_connparams); + return hdr_size + sizeof(kib_connparams_t); case IBLND_MSG_NOOP: return hdr_size; case IBLND_MSG_IMMEDIATE: - return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); + return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); case IBLND_MSG_PUT_REQ: - return hdr_size + sizeof(struct kib_putreq_msg); + return hdr_size + sizeof(kib_putreq_msg_t); case IBLND_MSG_PUT_ACK: - return hdr_size + sizeof(struct kib_putack_msg); + return hdr_size + sizeof(kib_putack_msg_t); case IBLND_MSG_GET_REQ: - return hdr_size + sizeof(struct kib_get_msg); + return hdr_size + sizeof(kib_get_msg_t); case IBLND_MSG_PUT_NAK: case IBLND_MSG_PUT_DONE: case IBLND_MSG_GET_DONE: - return hdr_size + sizeof(struct kib_completion_msg); + return hdr_size + sizeof(kib_completion_msg_t); default: return -1; } } -static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) +static int +kiblnd_unpack_rd(kib_msg_t *msg, int flip) { - struct kib_rdma_desc *rd; + kib_rdma_desc_t *rd; int nob; int n; int i; @@ -156,7 +155,7 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) return 1; } - nob = offsetof(struct kib_msg, ibm_u) + + nob = offsetof (kib_msg_t, ibm_u) + kiblnd_rd_msg_size(rd, msg->ibm_type, n); if (msg->ibm_nob < nob) { @@ -176,10 +175,11 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) return 0; } -void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp) +void +kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) { - struct kib_net *net = ni->ni_data; + kib_net_t *net = ni->ni_data; /* CAVEAT EMPTOR! all message fields not set here should have been * initialised previously. 
*/ @@ -200,9 +200,10 @@ void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, } } -int kiblnd_unpack_msg(struct kib_msg *msg, int nob) +int +kiblnd_unpack_msg(kib_msg_t *msg, int nob) { - const int hdr_size = offsetof(struct kib_msg, ibm_u); + const int hdr_size = offsetof(kib_msg_t, ibm_u); __u32 msg_cksum; __u16 version; int msg_nob; @@ -312,13 +313,12 @@ int kiblnd_unpack_msg(struct kib_msg *msg, int nob) } int -kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, - lnet_nid_t nid) +kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) { - struct kib_peer_ni *peer_ni; - struct kib_net *net = ni->ni_data; - int cpt = lnet_cpt_of_nid(nid, ni); - unsigned long flags; + kib_peer_ni_t *peer_ni; + kib_net_t *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; LASSERT(net != NULL); LASSERT(nid != LNET_NID_ANY); @@ -333,7 +333,7 @@ kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, peer_ni->ibp_nid = nid; peer_ni->ibp_error = 0; peer_ni->ibp_last_alive = 0; - peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; + peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni); peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */ @@ -356,9 +356,9 @@ kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, } void -kiblnd_destroy_peer(struct kib_peer_ni *peer_ni) +kiblnd_destroy_peer (kib_peer_ni_t *peer_ni) { - struct kib_net *net = peer_ni->ibp_ni->ni_data; + kib_net_t *net = peer_ni->ibp_ni->ni_data; LASSERT(net != NULL); LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0); @@ -375,18 +375,18 @@ kiblnd_destroy_peer(struct kib_peer_ni *peer_ni) atomic_dec(&net->ibn_npeers); } -struct kib_peer_ni * +kib_peer_ni_t * kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) { /* the caller is responsible for accounting the additional reference * that this creates */ struct list_head *peer_list = kiblnd_nid2peerlist(nid); struct list_head *tmp; - struct kib_peer_ni *peer_ni; + kib_peer_ni_t *peer_ni; list_for_each(tmp, peer_list) { - peer_ni = list_entry(tmp, struct kib_peer_ni, ibp_list); + peer_ni = list_entry(tmp, kib_peer_ni_t, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); /* @@ -409,7 +409,7 @@ kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) } void -kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni) +kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni) { LASSERT(list_empty(&peer_ni->ibp_conns)); @@ -423,7 +423,7 @@ static int kiblnd_get_peer_info(struct lnet_ni *ni, int index, lnet_nid_t *nidp, int *count) { - struct kib_peer_ni *peer_ni; + kib_peer_ni_t *peer_ni; struct list_head *ptmp; int i; unsigned long flags; @@ -434,7 +434,7 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -457,17 +457,17 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, } static void -kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) +kiblnd_del_peer_locked (kib_peer_ni_t *peer_ni) { - struct list_head *ctmp; - struct list_head *cnxt; - struct kib_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; if (list_empty(&peer_ni->ibp_conns)) { kiblnd_unlink_peer_locked(peer_ni); } else { list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = 
list_entry(ctmp, struct kib_conn, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); kiblnd_close_conn_locked(conn, 0); } @@ -483,7 +483,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *ptmp; struct list_head *pnxt; - struct kib_peer_ni *peer_ni; + kib_peer_ni_t *peer_ni; int lo; int hi; int i; @@ -501,7 +501,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -524,17 +524,17 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - kiblnd_txlist_done(&zombies, -EIO, LNET_MSG_STATUS_LOCAL_ERROR); + kiblnd_txlist_done(&zombies, -EIO); return rc; } -static struct kib_conn * +static kib_conn_t * kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) { - struct kib_peer_ni *peer_ni; + kib_peer_ni_t *peer_ni; struct list_head *ptmp; - struct kib_conn *conn; + kib_conn_t *conn; struct list_head *ctmp; int i; unsigned long flags; @@ -544,7 +544,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -554,7 +554,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) if (index-- > 0) continue; - conn = list_entry(ctmp, struct kib_conn, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); kiblnd_conn_addref(conn); read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -568,7 +568,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) } static void -kiblnd_debug_rx(struct kib_rx *rx) +kiblnd_debug_rx (kib_rx_t *rx) { CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", rx, rx->rx_status, rx->rx_msg->ibm_type, @@ -576,19 +576,19 @@ kiblnd_debug_rx(struct kib_rx *rx) } static void -kiblnd_debug_tx(struct kib_tx *tx) +kiblnd_debug_tx (kib_tx_t *tx) { - CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld " + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " "cookie %#llx msg %s%s type %x cred %d\n", tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, - tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie, + tx->tx_status, tx->tx_deadline, tx->tx_cookie, tx->tx_lntmsg[0] == NULL ? "-" : "!", tx->tx_lntmsg[1] == NULL ? 
"-" : "!", tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); } void -kiblnd_debug_conn(struct kib_conn *conn) +kiblnd_debug_conn (kib_conn_t *conn) { struct list_head *tmp; int i; @@ -606,27 +606,27 @@ kiblnd_debug_conn(struct kib_conn *conn) CDEBUG(D_CONSOLE, " early_rxs:\n"); list_for_each(tmp, &conn->ibc_early_rxs) - kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list)); + kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); CDEBUG(D_CONSOLE, " tx_noops:\n"); list_for_each(tmp, &conn->ibc_tx_noops) - kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); list_for_each(tmp, &conn->ibc_tx_queue_nocred) - kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) - kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " tx_queue:\n"); list_for_each(tmp, &conn->ibc_tx_queue) - kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " active_txs:\n"); list_for_each(tmp, &conn->ibc_active_txs) - kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " rxs:\n"); for (i = 0; i < IBLND_RX_MSGS(conn); i++) @@ -672,7 +672,7 @@ kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) } static int -kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) +kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) { cpumask_t *mask; int vectors; @@ -734,32 +734,15 @@ static unsigned int kiblnd_send_wrs(struct kib_conn *conn) * One WR for the LNet message * And ibc_max_frags for the transfer WRs */ - int ret; - int multiplier = 1 + conn->ibc_max_frags; - enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps; - - /* FastReg needs two extra WRs for map and invalidate */ - if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) - multiplier += 2; + unsigned int ret = 1 + conn->ibc_max_frags; /* account for a maximum of ibc_queue_depth in-flight transfers */ - ret = multiplier * conn->ibc_queue_depth; - - if (ret > conn->ibc_hdev->ibh_max_qp_wr) { - CDEBUG(D_NET, "peer_credits %u will result in send work " - "request size %d larger than maximum %d device " - "can handle\n", conn->ibc_queue_depth, ret, - conn->ibc_hdev->ibh_max_qp_wr); - conn->ibc_queue_depth = - conn->ibc_hdev->ibh_max_qp_wr / multiplier; - } - - /* don't go beyond the maximum the device can handle */ - return min(ret, conn->ibc_hdev->ibh_max_qp_wr); + ret *= conn->ibc_queue_depth; + return ret; } -struct kib_conn * -kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, +kib_conn_t * +kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, int state, int version) { /* CAVEAT EMPTOR: @@ -770,14 +753,14 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, * to destroy 'cmid' here since I'm called from the CM which still has * its ref on 'cmid'). 
*/ rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_net *net = peer_ni->ibp_ni->ni_data; - struct kib_dev *dev; + kib_net_t *net = peer_ni->ibp_ni->ni_data; + kib_dev_t *dev; struct ib_qp_init_attr *init_qp_attr; struct kib_sched_info *sched; #ifdef HAVE_IB_CQ_INIT_ATTR struct ib_cq_init_attr cq_attr = {}; #endif - struct kib_conn *conn; + kib_conn_t *conn; struct ib_cq *cq; unsigned long flags; int cpt; @@ -832,7 +815,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD(&conn->ibc_active_txs); - INIT_LIST_HEAD(&conn->ibc_zombie_txs); spin_lock_init(&conn->ibc_lock); LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, @@ -871,7 +853,7 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, write_unlock_irqrestore(glock, flags); LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, - IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); + IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed_2; @@ -897,12 +879,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, kiblnd_get_completion_vector(conn, cpt)); #endif if (IS_ERR(cq)) { - /* - * on MLX-5 (possibly MLX-4 as well) this error could be - * hit if the concurrent_sends and/or peer_tx_credits is set - * too high. Or due to an MLX-5 bug which tries to - * allocate 256kb via kmalloc for WR cookie array - */ CERROR("Failed to create CQ with %d CQEs: %ld\n", IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); goto failed_2; @@ -924,14 +900,20 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, init_qp_attr->qp_type = IB_QPT_RC; init_qp_attr->send_cq = cq; init_qp_attr->recv_cq = cq; - /* - * kiblnd_send_wrs() can change the connection's queue depth if - * the maximum work requests for the device is maxed out - */ - init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + conn->ibc_sched = sched; + + do { + init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); + + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + if (!rc || conn->ibc_queue_depth < 2) + break; + + conn->ibc_queue_depth--; + } while (rc); + if (rc) { CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " "send_sge: %d, recv_sge: %d\n", @@ -942,8 +924,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, goto failed_2; } - conn->ibc_sched = sched; - if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth) CWARN("peer %s - queue depth reduced from %u to %u" " to allow for qp creation\n", @@ -996,8 +976,7 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, return conn; failed_2: - kiblnd_destroy_conn(conn); - LIBCFS_FREE(conn, sizeof(*conn)); + kiblnd_destroy_conn(conn, true); failed_1: LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); failed_0: @@ -1005,10 +984,10 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, } void -kiblnd_destroy_conn(struct kib_conn *conn) +kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) { struct rdma_cm_id *cmid = conn->ibc_cmid; - struct kib_peer_ni *peer_ni = conn->ibc_peer; + kib_peer_ni_t *peer_ni = conn->ibc_peer; LASSERT (!in_interrupt()); LASSERT (atomic_read(&conn->ibc_refcount) == 0); @@ -1042,15 +1021,12 @@ 
kiblnd_destroy_conn(struct kib_conn *conn) if (conn->ibc_cq) ib_destroy_cq(conn->ibc_cq); - kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED, - LNET_MSG_STATUS_OK); - if (conn->ibc_rx_pages != NULL) kiblnd_unmap_rx_descs(conn); if (conn->ibc_rxs != NULL) { LIBCFS_FREE(conn->ibc_rxs, - IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); + IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); } if (conn->ibc_connvars != NULL) @@ -1061,24 +1037,27 @@ kiblnd_destroy_conn(struct kib_conn *conn) /* See CAVEAT EMPTOR above in kiblnd_create_conn */ if (conn->ibc_state != IBLND_CONN_INIT) { - struct kib_net *net = peer_ni->ibp_ni->ni_data; + kib_net_t *net = peer_ni->ibp_ni->ni_data; kiblnd_peer_decref(peer_ni); rdma_destroy_id(cmid); atomic_dec(&net->ibn_nconns); } + + if (free_conn) + LIBCFS_FREE(conn, sizeof(*conn)); } int -kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) +kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) { - struct kib_conn *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); CDEBUG(D_NET, "Closing conn -> %s, " "version: %x, reason: %d\n", @@ -1093,16 +1072,16 @@ kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) } int -kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, +kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, int version, __u64 incarnation) { - struct kib_conn *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); if (conn->ibc_version == version && conn->ibc_incarnation == incarnation) @@ -1124,7 +1103,7 @@ kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) { - struct kib_peer_ni *peer_ni; + kib_peer_ni_t *peer_ni; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -1145,7 +1124,7 @@ kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -1190,7 +1169,7 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) break; } case IOC_LIBCFS_GET_CONN: { - struct kib_conn *conn; + kib_conn_t *conn; rc = 0; conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); @@ -1222,13 +1201,13 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) } static void -kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) +kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) { - time64_t last_alive = 0; - time64_t now = ktime_get_seconds(); - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer_ni *peer_ni; - unsigned long flags; + cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_ni_t *peer_ni; + unsigned long flags; read_lock_irqsave(glock, flags); @@ -1246,14 +1225,14 @@ kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) if (peer_ni == NULL) kiblnd_launch_tx(ni, NULL, nid); - CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n", + CDEBUG(D_NET, "peer_ni 
%s %p, alive %ld secs ago\n", libcfs_nid2str(nid), peer_ni, - last_alive ? now - last_alive : -1); + last_alive ? cfs_duration_sec(now - last_alive) : -1); return; } static void -kiblnd_free_pages(struct kib_pages *p) +kiblnd_free_pages(kib_pages_t *p) { int npages = p->ibp_npages; int i; @@ -1263,23 +1242,23 @@ kiblnd_free_pages(struct kib_pages *p) __free_page(p->ibp_pages[i]); } - LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); + LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); } int -kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) +kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) { - struct kib_pages *p; - int i; + kib_pages_t *p; + int i; LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, - offsetof(struct kib_pages, ibp_pages[npages])); + offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR("Can't allocate descriptor for %d pages\n", npages); return -ENOMEM; } - memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); + memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); p->ibp_npages = npages; for (i = 0; i < npages; i++) { @@ -1297,9 +1276,9 @@ kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) } void -kiblnd_unmap_rx_descs(struct kib_conn *conn) +kiblnd_unmap_rx_descs(kib_conn_t *conn) { - struct kib_rx *rx; + kib_rx_t *rx; int i; LASSERT (conn->ibc_rxs != NULL); @@ -1322,9 +1301,9 @@ kiblnd_unmap_rx_descs(struct kib_conn *conn) } void -kiblnd_map_rx_descs(struct kib_conn *conn) +kiblnd_map_rx_descs(kib_conn_t *conn) { - struct kib_rx *rx; + kib_rx_t *rx; struct page *pg; int pg_off; int ipg; @@ -1335,7 +1314,7 @@ kiblnd_map_rx_descs(struct kib_conn *conn) rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; - rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); + rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, @@ -1361,11 +1340,11 @@ kiblnd_map_rx_descs(struct kib_conn *conn) } static void -kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) +kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) { - struct kib_hca_dev *hdev = tpo->tpo_hdev; - struct kib_tx *tx; - int i; + kib_hca_dev_t *hdev = tpo->tpo_hdev; + kib_tx_t *tx; + int i; LASSERT (tpo->tpo_pool.po_allocated == 0); @@ -1384,10 +1363,10 @@ kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) tpo->tpo_hdev = NULL; } -static struct kib_hca_dev * -kiblnd_current_hdev(struct kib_dev *dev) +static kib_hca_dev_t * +kiblnd_current_hdev(kib_dev_t *dev) { - struct kib_hca_dev *hdev; + kib_hca_dev_t *hdev; unsigned long flags; int i = 0; @@ -1412,14 +1391,14 @@ kiblnd_current_hdev(struct kib_dev *dev) } static void -kiblnd_map_tx_pool(struct kib_tx_pool *tpo) -{ - struct kib_pages *txpgs = tpo->tpo_tx_pages; - struct kib_pool *pool = &tpo->tpo_pool; - struct kib_net *net = pool->po_owner->ps_net; - struct kib_dev *dev; - struct page *page; - struct kib_tx *tx; +kiblnd_map_tx_pool(kib_tx_pool_t *tpo) +{ + kib_pages_t *txpgs = tpo->tpo_tx_pages; + kib_pool_t *pool = &tpo->tpo_pool; + kib_net_t *net = pool->po_owner->ps_net; + kib_dev_t *dev; + struct page *page; + kib_tx_t *tx; int page_offset; int ipage; int i; @@ -1440,8 +1419,8 @@ kiblnd_map_tx_pool(struct kib_tx_pool *tpo) page = txpgs->ibp_pages[ipage]; tx = &tpo->tpo_tx_descs[i]; - tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + - page_offset); + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, @@ -1464,14 +1443,39 @@ 
kiblnd_map_tx_pool(struct kib_tx_pool *tpo) } } +#ifdef HAVE_IB_GET_DMA_MR +struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, + int negotiated_nfrags) +{ + kib_net_t *net = ni->ni_data; + kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int mod; + __u16 nfrags; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + mod = tunables->lnd_map_on_demand; + nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod; + + LASSERT(hdev->ibh_mrs != NULL); + + if (mod > 0 && nfrags <= rd->rd_nfrags) + return NULL; + + return hdev->ibh_mrs; +} +#endif + static void -kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) +kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) { LASSERT(fpo->fpo_map_count == 0); #ifdef HAVE_FMR_POOL_API - if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) { - ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + if (fpo->fpo_is_fmr) { + if (fpo->fmr.fpo_fmr_pool) + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); } else #endif /* HAVE_FMR_POOL_API */ { @@ -1502,7 +1506,7 @@ kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) static void kiblnd_destroy_fmr_pool_list(struct list_head *head) { - struct kib_fmr_pool *fpo, *tmp; + kib_fmr_pool_t *fpo, *tmp; list_for_each_entry_safe(fpo, tmp, head, fpo_list) { list_del(&fpo->fpo_list); @@ -1529,11 +1533,10 @@ kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, } #ifdef HAVE_FMR_POOL_API -static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, - struct kib_fmr_pool *fpo) +static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) { struct ib_fmr_pool_param param = { - .max_pages_per_fmr = LNET_MAX_IOV, + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, .page_shift = PAGE_SHIFT, .access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE), @@ -1553,23 +1556,16 @@ static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, else CERROR("FMRs are not supported\n"); } - fpo->fpo_is_fmr = true; return rc; } #endif /* HAVE_FMR_POOL_API */ -static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, - struct kib_fmr_pool *fpo, - enum kib_dev_caps dev_caps) +static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) { struct kib_fast_reg_descriptor *frd, *tmp; int i, rc; -#ifdef HAVE_FMR_POOL_API - fpo->fpo_is_fmr = false; -#endif - INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); fpo->fast_reg.fpo_pool_size = 0; for (i = 0; i < fps->fps_pool_size; i++) { @@ -1584,7 +1580,7 @@ static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, #ifndef HAVE_IB_MAP_MR_SG frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, - LNET_MAX_IOV); + LNET_MAX_PAYLOAD/PAGE_SIZE); if (IS_ERR(frd->frd_frpl)) { rc = PTR_ERR(frd->frd_frpl); CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", @@ -1596,28 +1592,11 @@ static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, #ifdef HAVE_IB_ALLOC_FAST_REG_MR frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, - LNET_MAX_IOV); + LNET_MAX_PAYLOAD/PAGE_SIZE); #else - /* - * it is expected to get here if this is an MLX-5 card. - * MLX-4 cards will always use FMR and MLX-5 cards will - * always use fast_reg. It turns out that some MLX-5 cards - * (possibly due to older FW versions) do not natively support - * gaps. So we will need to track them here. - */ frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, -#ifdef IB_MR_TYPE_SG_GAPS - ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && - (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) ? 
- IB_MR_TYPE_SG_GAPS : - IB_MR_TYPE_MEM_REG, -#else - IB_MR_TYPE_MEM_REG, -#endif - LNET_MAX_IOV); - if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && - (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) - CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n"); + IB_MR_TYPE_MEM_REG, + LNET_MAX_PAYLOAD/PAGE_SIZE); #endif if (IS_ERR(frd->frd_mr)) { rc = PTR_ERR(frd->frd_mr); @@ -1660,32 +1639,79 @@ static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, return rc; } -static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, - struct kib_fmr_pool **pp_fpo) +static int +kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) { - struct kib_dev *dev = fps->fps_net->ibn_dev; - struct kib_fmr_pool *fpo; + struct ib_device_attr *dev_attr; + kib_dev_t *dev = fps->fps_net->ibn_dev; + kib_fmr_pool_t *fpo; int rc; +#ifndef HAVE_IB_DEVICE_ATTRS + dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL); + if (!dev_attr) + return -ENOMEM; +#endif + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); if (!fpo) { - return -ENOMEM; + rc = -ENOMEM; + goto out_dev_attr; } - memset(fpo, 0, sizeof(*fpo)); fpo->fpo_hdev = kiblnd_current_hdev(dev); +#ifdef HAVE_IB_DEVICE_ATTRS + dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; +#else + rc = ib_query_device(fpo->fpo_hdev->ibh_ibdev, dev_attr); + if (rc) { + CERROR("Query device failed for %s: %d\n", + fpo->fpo_hdev->ibh_ibdev->name, rc); + goto out_dev_attr; + } +#endif + +#ifdef HAVE_FMR_POOL_API + /* Check for FMR or FastReg support */ + fpo->fpo_is_fmr = 0; +#ifdef HAVE_IB_DEVICE_OPS + if (fpo->fpo_hdev->ibh_ibdev->ops.alloc_fmr && + fpo->fpo_hdev->ibh_ibdev->ops.dealloc_fmr && + fpo->fpo_hdev->ibh_ibdev->ops.map_phys_fmr && + fpo->fpo_hdev->ibh_ibdev->ops.unmap_fmr) { +#else + if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && + fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && + fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && + fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { +#endif + LCONSOLE_INFO("Using FMR for registration\n"); + fpo->fpo_is_fmr = 1; + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + } else { + rc = -ENOSYS; + LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); + goto out_dev_attr; + } + #ifdef HAVE_FMR_POOL_API - if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + if (fpo->fpo_is_fmr) rc = kiblnd_alloc_fmr_pool(fps, fpo); else #endif /* HAVE_FMR_POOL_API */ - rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps); + rc = kiblnd_alloc_freg_pool(fps, fpo); if (rc) goto out_fpo; - fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; - fpo->fpo_owner = fps; +#ifndef HAVE_IB_DEVICE_ATTRS + kfree(dev_attr); +#endif + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_owner = fps; *pp_fpo = fpo; return 0; @@ -1693,11 +1719,17 @@ static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, out_fpo: kiblnd_hdev_decref(fpo->fpo_hdev); LIBCFS_FREE(fpo, sizeof(*fpo)); + +out_dev_attr: +#ifndef HAVE_IB_DEVICE_ATTRS + kfree(dev_attr); +#endif + return rc; } static void -kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) +kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) { if (fps->fps_net == NULL) /* intialized? 
*/ return; @@ -1705,10 +1737,8 @@ kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) spin_lock(&fps->fps_lock); while (!list_empty(&fps->fps_pool_list)) { - struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, - struct kib_fmr_pool, - fpo_list); - + kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, + kib_fmr_pool_t, fpo_list); fpo->fpo_failed = 1; list_del(&fpo->fpo_list); if (fpo->fpo_map_count == 0) @@ -1721,7 +1751,7 @@ kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) } static void -kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) +kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) { if (fps->fps_net != NULL) { /* initialized? */ kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); @@ -1730,14 +1760,14 @@ kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) } static int -kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, - struct kib_net *net, +kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, + kib_net_t *net, struct lnet_ioctl_config_o2iblnd_tunables *tunables) { - struct kib_fmr_pool *fpo; - int rc; + kib_fmr_pool_t *fpo; + int rc; - memset(fps, 0, sizeof(struct kib_fmr_poolset)); + memset(fps, 0, sizeof(kib_fmr_poolset_t)); fps->fps_net = net; fps->fps_cpt = cpt; @@ -1758,20 +1788,20 @@ kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, } static int -kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, time64_t now) +kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now) { if (fpo->fpo_map_count != 0) /* still in use */ return 0; if (fpo->fpo_failed) return 1; - return now >= fpo->fpo_deadline; + return cfs_time_aftereq(now, fpo->fpo_deadline); } #if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) static int -kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) +kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) { - struct kib_hca_dev *hdev; + kib_hca_dev_t *hdev; __u64 *pages = tx->tx_pages; int npages; int size; @@ -1792,13 +1822,13 @@ kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) #endif void -kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) +kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) { - struct list_head zombies = LIST_HEAD_INIT(zombies); - struct kib_fmr_pool *fpo = fmr->fmr_pool; - struct kib_fmr_poolset *fps; - time64_t now = ktime_get_seconds(); - struct kib_fmr_pool *tmp; + struct list_head zombies = LIST_HEAD_INIT(zombies); + kib_fmr_pool_t *fpo = fmr->fmr_pool; + kib_fmr_poolset_t *fps; + cfs_time_t now = cfs_time_current(); + kib_fmr_pool_t *tmp; if (!fpo) return; @@ -1823,11 +1853,10 @@ kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) if (frd) { frd->frd_valid = false; - frd->frd_posted = false; - fmr->fmr_frd = NULL; spin_lock(&fps->fps_lock); list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); spin_unlock(&fps->fps_lock); + fmr->fmr_frd = NULL; } } fmr->fmr_pool = NULL; @@ -1851,11 +1880,11 @@ kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) kiblnd_destroy_fmr_pool_list(&zombies); } -int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, - struct kib_rdma_desc *rd, u32 nob, u64 iov, - struct kib_fmr *fmr) +int +kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, + __u32 nob, __u64 iov, kib_fmr_t *fmr, bool *is_fastreg) { - struct kib_fmr_pool *fpo; + kib_fmr_pool_t *fpo; __u64 version; bool is_rx = (rd != tx->tx_rd); #ifdef HAVE_FMR_POOL_API @@ -1869,7 +1898,7 @@ int kiblnd_fmr_pool_map(struct 
kib_fmr_poolset *fps, struct kib_tx *tx, spin_lock(&fps->fps_lock); version = fps->fps_version; list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { - fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); fpo->fpo_map_count++; #ifdef HAVE_FMR_POOL_API @@ -1877,6 +1906,7 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, if (fpo->fpo_is_fmr) { struct ib_pool_fmr *pfmr; + *is_fastreg = 0; spin_unlock(&fps->fps_lock); if (!tx_pages_mapped) { @@ -1898,6 +1928,7 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, } else #endif /* HAVE_FMR_POOL_API */ { + *is_fastreg = 1; if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { struct kib_fast_reg_descriptor *frd; #ifdef HAVE_IB_MAP_MR_SG @@ -1939,14 +1970,14 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, #ifdef HAVE_IB_MAP_MR_SG #ifdef HAVE_IB_MAP_MR_SG_5ARGS n = ib_map_mr_sg(mr, tx->tx_frags, - rd->rd_nfrags, NULL, PAGE_SIZE); + tx->tx_nfrags, NULL, PAGE_SIZE); #else n = ib_map_mr_sg(mr, tx->tx_frags, - rd->rd_nfrags, PAGE_SIZE); + tx->tx_nfrags, PAGE_SIZE); #endif /* HAVE_IB_MAP_MR_SG_5ARGS */ - if (unlikely(n != rd->rd_nfrags)) { + if (unlikely(n != tx->tx_nfrags)) { CERROR("Failed to map mr %d/%d " - "elements\n", n, rd->rd_nfrags); + "elements\n", n, tx->tx_nfrags); return n < 0 ? n : -EINVAL; } @@ -1993,7 +2024,6 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; fmr->fmr_frd = frd; fmr->fmr_pool = fpo; - frd->frd_posted = false; return 0; } spin_unlock(&fps->fps_lock); @@ -2023,7 +2053,7 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, } - if (ktime_get_seconds() < fps->fps_next_retry) { + if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) { /* someone failed recently */ spin_unlock(&fps->fps_lock); return -EAGAIN; @@ -2040,7 +2070,7 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, fps->fps_version++; list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); } else { - fps->fps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); } spin_unlock(&fps->fps_lock); @@ -2048,7 +2078,7 @@ int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, } static void -kiblnd_fini_pool(struct kib_pool *pool) +kiblnd_fini_pool(kib_pool_t *pool) { LASSERT(list_empty(&pool->po_free_list)); LASSERT(pool->po_allocated == 0); @@ -2057,24 +2087,24 @@ kiblnd_fini_pool(struct kib_pool *pool) } static void -kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) +kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) { CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); - memset(pool, 0, sizeof(struct kib_pool)); + memset(pool, 0, sizeof(kib_pool_t)); INIT_LIST_HEAD(&pool->po_free_list); - pool->po_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; - pool->po_owner = ps; - pool->po_size = size; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_owner = ps; + pool->po_size = size; } static void kiblnd_destroy_pool_list(struct list_head *head) { - struct kib_pool *pool; + kib_pool_t *pool; while (!list_empty(head)) { - pool = list_entry(head->next, struct kib_pool, po_list); + pool = list_entry(head->next, kib_pool_t, po_list); list_del(&pool->po_list); LASSERT(pool->po_owner != NULL); @@ -2083,16 +2113,15 @@ kiblnd_destroy_pool_list(struct list_head *head) } static void 
-kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) +kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) { if (ps->ps_net == NULL) /* intialized? */ return; spin_lock(&ps->ps_lock); while (!list_empty(&ps->ps_pool_list)) { - struct kib_pool *po = list_entry(ps->ps_pool_list.next, - struct kib_pool, po_list); - + kib_pool_t *po = list_entry(ps->ps_pool_list.next, + kib_pool_t, po_list); po->po_failed = 1; list_del(&po->po_list); if (po->po_allocated == 0) @@ -2104,7 +2133,7 @@ kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) } static void -kiblnd_fini_poolset(struct kib_poolset *ps) +kiblnd_fini_poolset(kib_poolset_t *ps) { if (ps->ps_net != NULL) { /* initialized? */ kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); @@ -2113,17 +2142,17 @@ kiblnd_fini_poolset(struct kib_poolset *ps) } static int -kiblnd_init_poolset(struct kib_poolset *ps, int cpt, - struct kib_net *net, char *name, int size, +kiblnd_init_poolset(kib_poolset_t *ps, int cpt, + kib_net_t *net, char *name, int size, kib_ps_pool_create_t po_create, kib_ps_pool_destroy_t po_destroy, kib_ps_node_init_t nd_init, kib_ps_node_fini_t nd_fini) { - struct kib_pool *pool; - int rc; + kib_pool_t *pool; + int rc; - memset(ps, 0, sizeof(struct kib_poolset)); + memset(ps, 0, sizeof(kib_poolset_t)); ps->ps_cpt = cpt; ps->ps_net = net; @@ -2149,22 +2178,22 @@ kiblnd_init_poolset(struct kib_poolset *ps, int cpt, } static int -kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now) +kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now) { if (pool->po_allocated != 0) /* still in use */ return 0; if (pool->po_failed) return 1; - return now >= pool->po_deadline; + return cfs_time_aftereq(now, pool->po_deadline); } void -kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) +kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) { struct list_head zombies = LIST_HEAD_INIT(zombies); - struct kib_poolset *ps = pool->po_owner; - struct kib_pool *tmp; - time64_t now = ktime_get_seconds(); + kib_poolset_t *ps = pool->po_owner; + kib_pool_t *tmp; + cfs_time_t now = cfs_time_current(); spin_lock(&ps->ps_lock); @@ -2190,14 +2219,14 @@ kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) } struct list_head * -kiblnd_pool_alloc_node(struct kib_poolset *ps) +kiblnd_pool_alloc_node(kib_poolset_t *ps) { struct list_head *node; - struct kib_pool *pool; + kib_pool_t *pool; int rc; unsigned int interval = 1; - ktime_t time_before; - unsigned int trips = 0; + cfs_time_t time_before; + unsigned int trips = 0; again: spin_lock(&ps->ps_lock); @@ -2206,8 +2235,7 @@ kiblnd_pool_alloc_node(struct kib_poolset *ps) continue; pool->po_allocated++; - pool->po_deadline = ktime_get_seconds() + - IBLND_POOL_DEADLINE; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); node = pool->po_free_list.next; list_del(node); @@ -2237,7 +2265,7 @@ kiblnd_pool_alloc_node(struct kib_poolset *ps) goto again; } - if (ktime_get_seconds() < ps->ps_next_retry) { + if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) { /* someone failed recently */ spin_unlock(&ps->ps_lock); return NULL; @@ -2247,17 +2275,17 @@ kiblnd_pool_alloc_node(struct kib_poolset *ps) spin_unlock(&ps->ps_lock); CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); - time_before = ktime_get(); + time_before = cfs_time_current(); rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); - CDEBUG(D_NET, "ps_pool_create took %lld ms to complete", - ktime_ms_delta(ktime_get(), time_before)); + 
CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete", + cfs_time_current() - time_before); spin_lock(&ps->ps_lock); ps->ps_increasing = 0; if (rc == 0) { list_add_tail(&pool->po_list, &ps->ps_pool_list); } else { - ps->ps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; + ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); CERROR("Can't allocate new %s pool because out of memory\n", ps->ps_name); } @@ -2267,11 +2295,10 @@ kiblnd_pool_alloc_node(struct kib_poolset *ps) } static void -kiblnd_destroy_tx_pool(struct kib_pool *pool) +kiblnd_destroy_tx_pool(kib_pool_t *pool) { - struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, - tpo_pool); - int i; + kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); + int i; LASSERT (pool->po_allocated == 0); @@ -2284,7 +2311,7 @@ kiblnd_destroy_tx_pool(struct kib_pool *pool) goto out; for (i = 0; i < pool->po_size; i++) { - struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; list_del(&tx->tx_list); @@ -2306,15 +2333,15 @@ kiblnd_destroy_tx_pool(struct kib_pool *pool) sizeof(*tx->tx_sge)); if (tx->tx_rd != NULL) LIBCFS_FREE(tx->tx_rd, - offsetof(struct kib_rdma_desc, + offsetof(kib_rdma_desc_t, rd_frags[IBLND_MAX_RDMA_FRAGS])); } LIBCFS_FREE(tpo->tpo_tx_descs, - pool->po_size * sizeof(struct kib_tx)); + pool->po_size * sizeof(kib_tx_t)); out: kiblnd_fini_pool(pool); - LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); } static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) @@ -2329,12 +2356,12 @@ static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) } static int -kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) +kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) { int i; int npg; - struct kib_pool *pool; - struct kib_tx_pool *tpo; + kib_pool_t *pool; + kib_tx_pool_t *tpo; LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); if (tpo == NULL) { @@ -2350,22 +2377,22 @@ kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { CERROR("Can't allocate tx pages: %d\n", npg); - LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); return -ENOMEM; } LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, - size * sizeof(struct kib_tx)); + size * sizeof(kib_tx_t)); if (tpo->tpo_tx_descs == NULL) { CERROR("Can't allocate %d tx descriptors\n", size); ps->ps_pool_destroy(pool); return -ENOMEM; } - memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); + memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); for (i = 0; i < size; i++) { - struct kib_tx *tx = &tpo->tpo_tx_descs[i]; + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; tx->tx_pool = tpo; @@ -2398,7 +2425,7 @@ kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) break; LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, - offsetof(struct kib_rdma_desc, + offsetof(kib_rdma_desc_t, rd_frags[IBLND_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) break; @@ -2415,24 +2442,23 @@ kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) } static void -kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) +kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) { - struct kib_tx_poolset *tps = 
container_of(pool->po_owner, - struct kib_tx_poolset, - tps_poolset); - struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); + kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, + tps_poolset); + kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); tx->tx_cookie = tps->tps_next_tx_cookie++; } static void -kiblnd_net_fini_pools(struct kib_net *net) +kiblnd_net_fini_pools(kib_net_t *net) { int i; cfs_cpt_for_each(i, lnet_cpt_table()) { - struct kib_tx_poolset *tps; - struct kib_fmr_poolset *fps; + kib_tx_poolset_t *tps; + kib_fmr_poolset_t *fps; if (net->ibn_tx_ps != NULL) { tps = net->ibn_tx_ps[i]; @@ -2457,7 +2483,7 @@ kiblnd_net_fini_pools(struct kib_net *net) } static int -kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, +kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, int ncpts) { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2472,12 +2498,7 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, #ifdef HAVE_IB_GET_DMA_MR read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - /* - * if lnd_map_on_demand is zero then we have effectively disabled - * FMR or FastReg and we're using global memory regions - * exclusively. - */ - if (!tunables->lnd_map_on_demand) { + if (tunables->lnd_map_on_demand == 0) { read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); goto create_tx_pool; @@ -2502,7 +2523,7 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, * FMR pool and map-on-demand if premapping failed */ net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct kib_fmr_poolset)); + sizeof(kib_fmr_poolset_t)); if (net->ibn_fmr_ps == NULL) { CERROR("Failed to allocate FMR pool array\n"); rc = -ENOMEM; @@ -2527,7 +2548,7 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, create_tx_pool: #endif net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct kib_tx_poolset)); + sizeof(kib_tx_poolset_t)); if (net->ibn_tx_ps == NULL) { CERROR("Failed to allocate tx pool array\n"); rc = -ENOMEM; @@ -2557,87 +2578,52 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, } static int -kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) +kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) { - struct ib_device_attr *dev_attr; - int rc = 0; - - /* It's safe to assume a HCA can handle a page size - * matching that of the native system */ - hdev->ibh_page_shift = PAGE_SHIFT; - hdev->ibh_page_size = 1 << PAGE_SHIFT; - hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); - #ifndef HAVE_IB_DEVICE_ATTRS - LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr)); - if (dev_attr == NULL) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - rc = ib_query_device(hdev->ibh_ibdev, dev_attr); - if (rc != 0) { - CERROR("Failed to query IB device: %d\n", rc); - goto out_clean_attr; - } -#else - dev_attr = &hdev->ibh_ibdev->attrs; + struct ib_device_attr *attr; + int rc; #endif - hdev->ibh_mr_size = dev_attr->max_mr_size; - hdev->ibh_max_qp_wr = dev_attr->max_qp_wr; + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); - /* Setup device Memory Registration capabilities */ -#ifdef HAVE_FMR_POOL_API -#ifdef HAVE_IB_DEVICE_OPS - if (hdev->ibh_ibdev->ops.alloc_fmr && - hdev->ibh_ibdev->ops.dealloc_fmr && - hdev->ibh_ibdev->ops.map_phys_fmr && - 
hdev->ibh_ibdev->ops.unmap_fmr) { +#ifdef HAVE_IB_DEVICE_ATTRS + hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; #else - if (hdev->ibh_ibdev->alloc_fmr && - hdev->ibh_ibdev->dealloc_fmr && - hdev->ibh_ibdev->map_phys_fmr && - hdev->ibh_ibdev->unmap_fmr) { -#endif - LCONSOLE_INFO("Using FMR for registration\n"); - hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED; - } else -#endif /* HAVE_FMR_POOL_API */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - LCONSOLE_INFO("Using FastReg for registration\n"); - hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED; -#ifndef HAVE_IB_ALLOC_FAST_REG_MR -#ifdef IB_DEVICE_SG_GAPS_REG - if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) - hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT; -#endif -#endif - } else { - rc = -ENOSYS; - } + LIBCFS_ALLOC(attr, sizeof(*attr)); + if (attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } - if (rc == 0 && hdev->ibh_mr_size == ~0ULL) - hdev->ibh_mr_shift = 64; - else if (rc != 0) - rc = -EINVAL; + rc = ib_query_device(hdev->ibh_ibdev, attr); + if (rc == 0) + hdev->ibh_mr_size = attr->max_mr_size; -#ifndef HAVE_IB_DEVICE_ATTRS -out_clean_attr: - LIBCFS_FREE(dev_attr, sizeof(*dev_attr)); + LIBCFS_FREE(attr, sizeof(*attr)); + + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + return rc; + } #endif - if (rc == -ENOSYS) - CERROR("IB device does not support FMRs nor FastRegs, can't " - "register memory: %d\n", rc); - else if (rc == -EINVAL) - CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); - return rc; + if (hdev->ibh_mr_size == ~0ULL) { + hdev->ibh_mr_shift = 64; + return 0; + } + + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return -EINVAL; } #ifdef HAVE_IB_GET_DMA_MR static void -kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) +kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) { if (hdev->ibh_mrs == NULL) return; @@ -2649,7 +2635,7 @@ kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) #endif void -kiblnd_hdev_destroy(struct kib_hca_dev *hdev) +kiblnd_hdev_destroy(kib_hca_dev_t *hdev) { #ifdef HAVE_IB_GET_DMA_MR kiblnd_hdev_cleanup_mrs(hdev); @@ -2666,12 +2652,17 @@ kiblnd_hdev_destroy(struct kib_hca_dev *hdev) #ifdef HAVE_IB_GET_DMA_MR static int -kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) +kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) { struct ib_mr *mr; + int rc; int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) + return rc; + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); if (IS_ERR(mr)) { CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); @@ -2692,7 +2683,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) +kiblnd_dev_need_failover(kib_dev_t *dev, struct net *ns) { struct rdma_cm_id *cmid; struct sockaddr_in srcaddr; @@ -2744,16 +2735,16 @@ kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) } int -kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) +kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) { struct list_head zombie_tpo = LIST_HEAD_INIT(zombie_tpo); struct list_head zombie_ppo = LIST_HEAD_INIT(zombie_ppo); struct list_head zombie_fpo = LIST_HEAD_INIT(zombie_fpo); struct rdma_cm_id *cmid = NULL; - struct kib_hca_dev *hdev = NULL; - struct kib_hca_dev *old; + kib_hca_dev_t *hdev = NULL; + kib_hca_dev_t *old; struct ib_pd *pd; - struct kib_net *net; + kib_net_t *net; struct sockaddr_in addr; unsigned long flags; int 
rc = 0; @@ -2785,7 +2776,7 @@ kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) } cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP, - IB_QPT_RC); + IB_QPT_RC); if (IS_ERR(cmid)) { rc = PTR_ERR(cmid); CERROR("Failed to create cmid for failover: %d\n", rc); @@ -2839,18 +2830,18 @@ kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) goto out; } - rc = kiblnd_hdev_get_attr(hdev); - if (rc != 0) { - CERROR("Can't get device attributes: %d\n", rc); - goto out; - } - #ifdef HAVE_IB_GET_DMA_MR rc = kiblnd_hdev_setup_mrs(hdev); if (rc != 0) { CERROR("Can't setup device: %d\n", rc); goto out; } +#else + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) { + CERROR("Can't get device attributes: %d\n", rc); + goto out; + } #endif write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -2890,9 +2881,9 @@ kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) } void -kiblnd_destroy_dev(struct kib_dev *dev) +kiblnd_destroy_dev (kib_dev_t *dev) { - LASSERT(dev->ibd_nnets == 0); + LASSERT (dev->ibd_nnets == 0); LASSERT(list_empty(&dev->ibd_nets)); list_del(&dev->ibd_fail_list); @@ -2978,7 +2969,7 @@ kiblnd_base_shutdown(void) static void kiblnd_shutdown(struct lnet_ni *ni) { - struct kib_net *net = ni->ni_data; + kib_net_t *net = ni->ni_data; rwlock_t *g_lock = &kiblnd_data.kib_global_lock; int i; unsigned long flags; @@ -3184,8 +3175,7 @@ kiblnd_start_schedulers(struct kib_sched_info *sched) return rc; } -static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, - int ncpts) +static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) { int cpt; int rc; @@ -3197,7 +3187,7 @@ static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, cpt = (cpts == NULL) ? i : cpts[i]; sched = kiblnd_data.kib_scheds[cpt]; - if (!newdev && sched->ibs_nthreads > 0) + if (sched->ibs_nthreads > 0) continue; rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); @@ -3210,80 +3200,38 @@ static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, return 0; } -static struct kib_dev * -kiblnd_dev_search(char *ifname) -{ - struct kib_dev *alias = NULL; - struct kib_dev *dev; - char *colon; - char *colon2; - - colon = strchr(ifname, ':'); - list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { - if (strcmp(&dev->ibd_ifname[0], ifname) == 0) - return dev; - - if (alias != NULL) - continue; - - colon2 = strchr(dev->ibd_ifname, ':'); - if (colon != NULL) - *colon = 0; - if (colon2 != NULL) - *colon2 = 0; - - if (strcmp(&dev->ibd_ifname[0], ifname) == 0) - alias = dev; - - if (colon != NULL) - *colon = ':'; - if (colon2 != NULL) - *colon2 = ':'; - } - return alias; -} - static int kiblnd_startup(struct lnet_ni *ni) { - char *ifname = NULL; + char *ifname; struct lnet_inetdev *ifaces = NULL; - struct kib_dev *ibdev = NULL; - struct kib_net *net = NULL; - unsigned long flags; - int rc; + kib_dev_t *ibdev = NULL; + kib_net_t *net; + unsigned long flags; + int rc; int i; - bool newdev; - LASSERT(ni->ni_net->net_lnd == &the_o2iblnd); + LASSERT (ni->ni_net->net_lnd == &the_o2iblnd); - if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { rc = kiblnd_base_startup(ni->ni_net_ns); - if (rc != 0) - return rc; - } + if (rc != 0) + return rc; + } - LIBCFS_ALLOC(net, sizeof(*net)); - ni->ni_data = net; - if (net == NULL) { - rc = -ENOMEM; - goto failed; - } + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) + goto failed; net->ibn_incarnation = 
ktime_get_real_ns() / NSEC_PER_USEC; kiblnd_tunables_setup(ni); - /* - * ni_interfaces is only to support legacy pre Multi-Rail - * tcp bonding for ksocklnd. Multi-Rail wants each secondary - * IP to be treated as an unique 'struct ni' interfaces instead. - */ if (ni->ni_interfaces[0] != NULL) { /* Use the IPoIB interface specified in 'networks=' */ if (ni->ni_interfaces[1] != NULL) { CERROR("ko2iblnd: Multiple interfaces not supported\n"); - rc = -EINVAL; goto failed; } @@ -3292,11 +3240,10 @@ kiblnd_startup(struct lnet_ni *ni) ifname = *kiblnd_tunables.kib_default_ipif; } - if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { - CERROR("IPoIB interface name too long: %s\n", ifname); - rc = -E2BIG; - goto failed; - } + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + goto failed; + } rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); if (rc < 0) @@ -3313,70 +3260,63 @@ kiblnd_startup(struct lnet_ni *ni) goto failed; } - ibdev = kiblnd_dev_search(ifname); - newdev = ibdev == NULL; - /* hmm...create kib_dev even for alias */ - if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) { - LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); - if (!ibdev) { - rc = -ENOMEM; - goto failed; - } - - ibdev->ibd_ifip = ifaces[i].li_ipaddr; - strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, - sizeof(ibdev->ibd_ifname)); - ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } - INIT_LIST_HEAD(&ibdev->ibd_nets); - INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ - INIT_LIST_HEAD(&ibdev->ibd_fail_list); + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); - /* initialize the device */ - rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); - if (rc) { - CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); - goto failed; - } + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); - list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); + goto failed; } + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + net->ibn_dev = ibdev; ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); ni->ni_dev_cpt = ifaces[i].li_cpt; - rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts); + rc = kiblnd_dev_start_threads(ibdev, ni->ni_cpts, ni->ni_ncpts); if (rc != 0) goto failed; rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); - if (rc != 0) { - CERROR("Failed to initialize NI pools: %d\n", rc); - goto failed; - } + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); ibdev->ibd_nnets++; list_add_tail(&net->ibn_list, &ibdev->ibd_nets); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - net->ibn_init = IBLND_INIT_ALL; + net->ibn_init = IBLND_INIT_ALL; - return 0; + return 0; failed: if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) - kiblnd_destroy_dev(ibdev); + kiblnd_destroy_dev(ibdev); kfree(ifaces); - kiblnd_shutdown(ni); - - CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n", - ifname ? 
ifname : "", rc); + kiblnd_shutdown(ni); - return -ENETDOWN; + CDEBUG(D_NET, "kiblnd_startup failed\n"); + return -ENETDOWN; } static struct lnet_lnd the_o2iblnd = { @@ -3398,11 +3338,11 @@ static int __init ko2iblnd_init(void) { int rc; - CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE); - CLASSERT(offsetof(struct kib_msg, + CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= IBLND_MSG_SIZE); - CLASSERT(offsetof(struct kib_msg, + CLASSERT(offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= IBLND_MSG_SIZE); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h index 3e24405c2c31e..7a9a1c3de16a4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -78,6 +78,7 @@ #define DEBUG_SUBSYSTEM S_LND #include +#include #include #define IBLND_PEER_HASH_SIZE 101 /* # peer_ni lists */ @@ -87,12 +88,13 @@ #define IBLND_N_SCHED 2 #define IBLND_N_SCHED_HIGH 4 -struct kib_tunables { +typedef struct +{ int *kib_dev_failover; /* HCA failover */ unsigned int *kib_service; /* IB service number */ int *kib_min_reconnect_interval; /* first failed connection retry... */ int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ - int *kib_cksum; /* checksum struct kib_msg? */ + int *kib_cksum; /* checksum kib_msg_t? */ int *kib_timeout; /* comms timeout (seconds) */ int *kib_keepalive; /* keepalive timeout (seconds) */ int *kib_ntx; /* # tx descs */ @@ -105,32 +107,32 @@ struct kib_tunables { /* # threads on each CPT */ int *kib_nscheds; int *kib_wrq_sge; /* # sg elements per wrq */ - int *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */ -}; +} kib_tunables_t; -extern struct kib_tunables kiblnd_tunables; +extern kib_tunables_t kiblnd_tunables; #define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ #define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ #define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ -#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ +#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ /* when eagerly to return credits */ -#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn->ibc_version) == IBLND_MSG_VERSION_1 ? \ +#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \ IBLND_CREDIT_HIGHWATER_V1 : \ - min(t->lnd_peercredits_hiw, (__u32)conn->ibc_queue_depth - 1)) + t->lnd_peercredits_hiw) #ifdef HAVE_RDMA_CREATE_ID_5ARG -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ - rdma_create_id((ns) ? 
(ns) : &init_net, cb, dev, ps, qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(ns, cb, \ + dev, ps, \ + qpt) #else # ifdef HAVE_RDMA_CREATE_ID_4ARG -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ - rdma_create_id(cb, dev, ps, qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ + ps, qpt) # else -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ - rdma_create_id(cb, dev, ps) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ + ps) # endif #endif @@ -160,7 +162,7 @@ extern struct kib_tunables kiblnd_tunables; #define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) /* 2 = LNet msg + Transfer chain */ -#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) struct kib_hca_dev; @@ -171,15 +173,8 @@ struct kib_hca_dev; #define KIB_IFNAME_SIZE 256 #endif -enum kib_dev_caps { - IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0), - IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1), -#ifdef HAVE_FMR_POOL_API - IBLND_DEV_CAPS_FMR_ENABLED = BIT(2), -#endif -}; - -struct kib_dev { +typedef struct +{ struct list_head ibd_list; /* chain on kib_devs */ struct list_head ibd_fail_list; /* chain on kib_failed_devs */ __u32 ibd_ifip; /* IPoIB interface IP */ @@ -187,7 +182,7 @@ struct kib_dev { char ibd_ifname[KIB_IFNAME_SIZE]; int ibd_nnets; /* # nets extant */ - time64_t ibd_next_failover; + cfs_time_t ibd_next_failover; /* # failover failures */ int ibd_failed_failover; /* failover in progress */ @@ -196,10 +191,10 @@ struct kib_dev { unsigned int ibd_can_failover; struct list_head ibd_nets; struct kib_hca_dev *ibd_hdev; - enum kib_dev_caps ibd_dev_caps; -}; +} kib_dev_t; -struct kib_hca_dev { +typedef struct kib_hca_dev +{ struct rdma_cm_id *ibh_cmid; /* listener cmid */ struct ib_device *ibh_ibdev; /* IB device */ int ibh_page_shift; /* page shift of current HCA */ @@ -207,24 +202,24 @@ struct kib_hca_dev { __u64 ibh_page_mask; /* page mask of current HCA */ int ibh_mr_shift; /* bits shift of max MR size */ __u64 ibh_mr_size; /* size of MR */ - int ibh_max_qp_wr; /* maximum work requests size */ #ifdef HAVE_IB_GET_DMA_MR struct ib_mr *ibh_mrs; /* global MR */ #endif struct ib_pd *ibh_pd; /* PD */ - struct kib_dev *ibh_dev; /* owner */ + kib_dev_t *ibh_dev; /* owner */ atomic_t ibh_ref; /* refcount */ -}; +} kib_hca_dev_t; /** # of seconds to keep pool alive */ #define IBLND_POOL_DEADLINE 300 /** # of seconds to retry if allocation failed */ #define IBLND_POOL_RETRY 1 -struct kib_pages { +typedef struct +{ int ibp_npages; /* # pages */ struct page *ibp_pages[0]; /* page array */ -}; +} kib_pages_t; struct kib_pool; struct kib_poolset; @@ -239,7 +234,8 @@ struct kib_net; #define IBLND_POOL_NAME_LEN 32 -struct kib_poolset { +typedef struct kib_poolset +{ /* serialize */ spinlock_t ps_lock; /* network it belongs to */ @@ -251,7 +247,7 @@ struct kib_poolset { /* failed pool list */ struct list_head ps_failed_pool_list; /* time stamp for retry if failed to allocate */ - time64_t ps_next_retry; + cfs_time_t ps_next_retry; /* is allocating new pool */ int ps_increasing; /* new pool size */ @@ -267,38 +263,40 @@ struct kib_poolset { kib_ps_node_init_t ps_node_init; /* finalize node */ kib_ps_node_fini_t ps_node_fini; -}; +} kib_poolset_t; -struct kib_pool { +typedef struct kib_pool +{ /* chain on pool list */ struct list_head po_list; /* pre-allocated node */ struct list_head po_free_list; /* pool_set of this pool */ - struct kib_poolset *po_owner; + 
kib_poolset_t *po_owner; /* deadline of this pool */ - time64_t po_deadline; + cfs_time_t po_deadline; /* # of elements in use */ int po_allocated; /* pool is created on failed HCA */ int po_failed; /* # of pre-allocated elements */ int po_size; -}; +} kib_pool_t; -struct kib_tx_poolset { - struct kib_poolset tps_poolset; /* pool-set */ +typedef struct { + kib_poolset_t tps_poolset; /* pool-set */ __u64 tps_next_tx_cookie; /* cookie of TX */ -}; +} kib_tx_poolset_t; -struct kib_tx_pool { - struct kib_pool tpo_pool; /* pool */ +typedef struct { + kib_pool_t tpo_pool; /* pool */ struct kib_hca_dev *tpo_hdev; /* device for this pool */ struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ - struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ -}; + kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ +} kib_tx_pool_t; -struct kib_fmr_poolset { +typedef struct +{ spinlock_t fps_lock; /* serialize */ struct kib_net *fps_net; /* IB network */ struct list_head fps_pool_list; /* FMR pool list */ @@ -311,8 +309,8 @@ struct kib_fmr_poolset { /* is allocating new pool */ int fps_increasing; /* time stamp for retry if failed to allocate */ - time64_t fps_next_retry; -}; + cfs_time_t fps_next_retry; +} kib_fmr_poolset_t; #ifndef HAVE_IB_RDMA_WR struct ib_rdma_wr { @@ -331,13 +329,13 @@ struct kib_fast_reg_descriptor { /* For fast registration */ #endif struct ib_mr *frd_mr; bool frd_valid; - bool frd_posted; }; -struct kib_fmr_pool { +typedef struct +{ struct list_head fpo_list; /* chain on pool list */ struct kib_hca_dev *fpo_hdev; /* device for this pool */ - struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ + kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ #ifdef HAVE_FMR_POOL_API union { struct { @@ -350,24 +348,25 @@ struct kib_fmr_pool { } fast_reg; #ifdef HAVE_FMR_POOL_API }; - bool fpo_is_fmr; /* True if FMR pools allocated */ + int fpo_is_fmr; #endif - time64_t fpo_deadline; /* deadline of this pool */ + cfs_time_t fpo_deadline; /* deadline of this pool */ int fpo_failed; /* fmr pool is failed */ int fpo_map_count; /* # of mapped FMR */ -}; +} kib_fmr_pool_t; -struct kib_fmr { - struct kib_fmr_pool *fmr_pool; /* pool of FMR */ +typedef struct { + kib_fmr_pool_t *fmr_pool; /* pool of FMR */ #ifdef HAVE_FMR_POOL_API struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ #endif /* HAVE_FMR_POOL_API */ struct kib_fast_reg_descriptor *fmr_frd; u32 fmr_key; -}; +} kib_fmr_t; -struct kib_net { - /* chain on struct kib_dev::ibd_nets */ +typedef struct kib_net +{ + /* chain on kib_dev_t::ibd_nets */ struct list_head ibn_list; __u64 ibn_incarnation;/* my epoch */ int ibn_init; /* initialisation state */ @@ -376,11 +375,11 @@ struct kib_net { atomic_t ibn_npeers; /* # peers extant */ atomic_t ibn_nconns; /* # connections extant */ - struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ - struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ - struct kib_dev *ibn_dev; /* underlying IB device */ -}; + kib_dev_t *ibn_dev; /* underlying IB device */ +} kib_net_t; #define KIB_THREAD_SHIFT 16 #define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) @@ -401,7 +400,8 @@ struct kib_sched_info { int ibs_cpt; /* CPT id */ }; -struct kib_data { +typedef struct +{ int kib_init; /* initialisation state */ int kib_shutdown; /* shut down? 
*/ struct list_head kib_devs; /* IB devices extant */ @@ -430,14 +430,14 @@ struct kib_data { * The second that peers are pulled out from \a kib_reconn_wait * for reconnection. */ - time64_t kib_reconn_sec; + unsigned int kib_reconn_sec; /* connection daemon sleeps here */ wait_queue_head_t kib_connd_waitq; spinlock_t kib_connd_lock; /* serialise */ struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ /* percpt data for schedulers */ struct kib_sched_info **kib_scheds; -}; +} kib_data_t; #define IBLND_INIT_NOTHING 0 #define IBLND_INIT_DATA 1 @@ -448,51 +448,60 @@ struct kib_data { * These are sent in sender's byte order (i.e. receiver flips). */ -struct kib_connparams { +typedef struct kib_connparams +{ __u16 ibcp_queue_depth; __u16 ibcp_max_frags; __u32 ibcp_max_msg_size; -} WIRE_ATTR; +} WIRE_ATTR kib_connparams_t; -struct kib_immediate_msg { +typedef struct +{ struct lnet_hdr ibim_hdr; /* portals header */ char ibim_payload[0];/* piggy-backed payload */ -} WIRE_ATTR; +} WIRE_ATTR kib_immediate_msg_t; -struct kib_rdma_frag { +typedef struct +{ __u32 rf_nob; /* # bytes this frag */ __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */ -} WIRE_ATTR; +} WIRE_ATTR kib_rdma_frag_t; -struct kib_rdma_desc { - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrags; /* # fragments */ - struct kib_rdma_frag rd_frags[0]; /* buffer frags */ -} WIRE_ATTR; +typedef struct +{ + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; -struct kib_putreq_msg { +typedef struct +{ struct lnet_hdr ibprm_hdr; /* portals header */ __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR; +} WIRE_ATTR kib_putreq_msg_t; -struct kib_putack_msg { +typedef struct +{ __u64 ibpam_src_cookie; /* reflected completion cookie */ __u64 ibpam_dst_cookie; /* opaque completion cookie */ - struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR; + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; -struct kib_get_msg { +typedef struct +{ struct lnet_hdr ibgm_hdr; /* portals header */ __u64 ibgm_cookie; /* opaque completion cookie */ - struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ -} WIRE_ATTR; + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR kib_get_msg_t; -struct kib_completion_msg { +typedef struct +{ __u64 ibcm_cookie; /* opaque completion cookie */ __s32 ibcm_status; /* < 0 failure: >= 0 length */ -} WIRE_ATTR; +} WIRE_ATTR kib_completion_msg_t; -struct kib_msg { +typedef struct +{ /* First 2 fields fixed FOR ALL TIME */ __u32 ibm_magic; /* I'm an ibnal message */ __u16 ibm_version; /* this is my version number */ @@ -507,14 +516,14 @@ struct kib_msg { __u64 ibm_dststamp; /* destination's incarnation */ union { - struct kib_connparams connparams; - struct kib_immediate_msg immediate; - struct kib_putreq_msg putreq; - struct kib_putack_msg putack; - struct kib_get_msg get; - struct kib_completion_msg completion; + kib_connparams_t connparams; + kib_immediate_msg_t immediate; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; + kib_completion_msg_t completion; } WIRE_ATTR ibm_u; -} WIRE_ATTR; +} WIRE_ATTR kib_msg_t; #define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ @@ -533,14 +542,14 @@ struct kib_msg { #define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ #define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ -struct kib_rej { +typedef struct { __u32 ibr_magic; /* sender's magic */ __u16 ibr_version; /* 
sender's version */ __u8 ibr_why; /* reject reason */ __u8 ibr_padding; /* padding */ __u64 ibr_incarnation; /* incarnation of peer_ni */ - struct kib_connparams ibr_cp; /* connection parameters */ -} WIRE_ATTR; + kib_connparams_t ibr_cp; /* connection parameters */ +} WIRE_ATTR kib_rej_t; /* connection rejection reasons */ #define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ @@ -558,7 +567,8 @@ struct kib_rej { /***********************************************************************/ -struct kib_rx { /* receive message */ +typedef struct kib_rx /* receive message */ +{ /* queue for attention */ struct list_head rx_list; /* owning conn */ @@ -568,7 +578,7 @@ struct kib_rx { /* receive message */ /* completion status */ enum ib_wc_status rx_status; /* message buffer (host vaddr) */ - struct kib_msg *rx_msg; + kib_msg_t *rx_msg; /* message buffer (I/O addr) */ __u64 rx_msgaddr; /* for dma_unmap_single() */ @@ -577,18 +587,19 @@ struct kib_rx { /* receive message */ struct ib_recv_wr rx_wrq; /* ...and its memory */ struct ib_sge rx_sge; -}; +} kib_rx_t; #define IBLND_POSTRX_DONT_POST 0 /* don't post */ #define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ #define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ #define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ -struct kib_tx { /* transmit message */ +typedef struct kib_tx /* transmit message */ +{ /* queue on idle_txs ibc_tx_queue etc. */ struct list_head tx_list; /* pool I'm from */ - struct kib_tx_pool *tx_pool; + kib_tx_pool_t *tx_pool; /* owning conn */ struct kib_conn *tx_conn; /* # tx callbacks outstanding */ @@ -599,16 +610,14 @@ struct kib_tx { /* transmit message */ short tx_waiting; /* LNET completion status */ int tx_status; - /* health status of the transmit */ - enum lnet_msg_hstatus tx_hstatus; /* completion deadline */ - ktime_t tx_deadline; + unsigned long tx_deadline; /* completion cookie */ __u64 tx_cookie; /* lnet msgs to finalize on completion */ struct lnet_msg *tx_lntmsg[2]; /* message buffer (host vaddr) */ - struct kib_msg *tx_msg; + kib_msg_t *tx_msg; /* message buffer (I/O addr) */ __u64 tx_msgaddr; /* for dma_unmap_single() */ @@ -624,33 +633,33 @@ struct kib_tx { /* transmit message */ /* ...and their memory */ struct ib_sge *tx_sge; /* rdma descriptor */ - struct kib_rdma_desc *tx_rd; + kib_rdma_desc_t *tx_rd; /* # entries in... 
*/ int tx_nfrags; /* dma_map_sg descriptor */ struct scatterlist *tx_frags; /* rdma phys page addrs */ __u64 *tx_pages; - /* gaps in fragments */ - bool tx_gaps; /* FMR */ - struct kib_fmr tx_fmr; + kib_fmr_t fmr; /* dma direction */ int tx_dmadir; -}; +} kib_tx_t; -struct kib_connvars { +typedef struct kib_connvars +{ /* connection-in-progress variables */ - struct kib_msg cv_msg; -}; + kib_msg_t cv_msg; +} kib_connvars_t; -struct kib_conn { +typedef struct kib_conn +{ /* scheduler information */ struct kib_sched_info *ibc_sched; /* owning peer_ni */ - struct kib_peer_ni *ibc_peer; + struct kib_peer *ibc_peer; /* HCA bound on */ - struct kib_hca_dev *ibc_hdev; + kib_hca_dev_t *ibc_hdev; /* stash on peer_ni's conn list */ struct list_head ibc_list; /* schedule for attention */ @@ -688,7 +697,7 @@ struct kib_conn { /* CQ callback fired */ unsigned int ibc_ready:1; /* time of last send */ - ktime_t ibc_last_send; + unsigned long ibc_last_send; /** link chain for kiblnd_check_conns only */ struct list_head ibc_connd_list; /** rxs completed before ESTABLISHED */ @@ -703,14 +712,12 @@ struct kib_conn { struct list_head ibc_tx_queue_rsrvd; /* active tx awaiting completion */ struct list_head ibc_active_txs; - /* zombie tx awaiting done */ - struct list_head ibc_zombie_txs; /* serialise */ spinlock_t ibc_lock; /* the rx descs */ - struct kib_rx *ibc_rxs; + kib_rx_t *ibc_rxs; /* premapped rx msg pages */ - struct kib_pages *ibc_rx_pages; + kib_pages_t *ibc_rx_pages; /* CM id */ struct rdma_cm_id *ibc_cmid; @@ -718,8 +725,8 @@ struct kib_conn { struct ib_cq *ibc_cq; /* in-progress connection state */ - struct kib_connvars *ibc_connvars; -}; + kib_connvars_t *ibc_connvars; +} kib_conn_t; #define IBLND_CONN_INIT 0 /* being initialised */ #define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ @@ -728,7 +735,8 @@ struct kib_conn { #define IBLND_CONN_CLOSING 4 /* being closed */ #define IBLND_CONN_DISCONNECTED 5 /* disconnected */ -struct kib_peer_ni { +typedef struct kib_peer +{ /* stash on global peer_ni list */ struct list_head ibp_list; /* who's on the other end(s) */ @@ -743,8 +751,8 @@ struct kib_peer_ni { struct list_head ibp_tx_queue; /* incarnation of peer_ni */ __u64 ibp_incarnation; - /* when (in seconds) I was last alive */ - time64_t ibp_last_alive; + /* when (in jiffies) I was last alive */ + cfs_time_t ibp_last_alive; /* # users */ atomic_t ibp_refcount; /* version of peer_ni */ @@ -759,15 +767,13 @@ struct kib_peer_ni { unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; - /* number of total active retries */ - unsigned int ibp_retries; /* errno on closing this peer_ni */ int ibp_error; /* max map_on_demand */ __u16 ibp_max_frags; /* max_peer_credits */ __u16 ibp_queue_depth; -}; +} kib_peer_ni_t; #ifndef HAVE_IB_INC_RKEY /** @@ -782,12 +788,32 @@ static inline u32 ib_inc_rkey(u32 rkey) } #endif -extern struct kib_data kiblnd_data; +extern kib_data_t kiblnd_data; -extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); +extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); +/* max # of fragments configured by user */ +static inline int +kiblnd_cfg_rdma_frags(struct lnet_ni *ni) +{ + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + int mod; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + mod = tunables->lnd_map_on_demand; + return mod != 0 ? 
mod : IBLND_MAX_RDMA_FRAGS; +} + +static inline int +kiblnd_rdma_frags(int version, struct lnet_ni *ni) +{ + return version == IBLND_MSG_VERSION_1 ? + IBLND_MAX_RDMA_FRAGS : + kiblnd_cfg_rdma_frags(ni); +} + static inline int kiblnd_concurrent_sends(int version, struct lnet_ni *ni) { @@ -809,14 +835,14 @@ kiblnd_concurrent_sends(int version, struct lnet_ni *ni) } static inline void -kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) +kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) { LASSERT(atomic_read(&hdev->ibh_ref) > 0); atomic_inc(&hdev->ibh_ref); } static inline void -kiblnd_hdev_decref(struct kib_hca_dev *hdev) +kiblnd_hdev_decref(kib_hca_dev_t *hdev) { LASSERT(atomic_read(&hdev->ibh_ref) > 0); if (atomic_dec_and_test(&hdev->ibh_ref)) @@ -824,7 +850,7 @@ kiblnd_hdev_decref(struct kib_hca_dev *hdev) } static inline int -kiblnd_dev_can_failover(struct kib_dev *dev) +kiblnd_dev_can_failover(kib_dev_t *dev) { if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ return 0; @@ -880,7 +906,7 @@ do { \ } while (0) static inline bool -kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) +kiblnd_peer_connecting(kib_peer_ni_t *peer_ni) { return peer_ni->ibp_connecting != 0 || peer_ni->ibp_reconnecting != 0 || @@ -888,7 +914,7 @@ kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) } static inline bool -kiblnd_peer_idle(struct kib_peer_ni *peer_ni) +kiblnd_peer_idle(kib_peer_ni_t *peer_ni) { return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); } @@ -903,14 +929,14 @@ kiblnd_nid2peerlist (lnet_nid_t nid) } static inline int -kiblnd_peer_active(struct kib_peer_ni *peer_ni) +kiblnd_peer_active (kib_peer_ni_t *peer_ni) { /* Am I in the peer_ni hash table? */ return !list_empty(&peer_ni->ibp_list); } static inline struct kib_conn * -kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) +kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) { struct list_head *next; @@ -928,17 +954,16 @@ kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) } static inline int -kiblnd_send_keepalive(struct kib_conn *conn) +kiblnd_send_keepalive(kib_conn_t *conn) { - s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC; - return (*kiblnd_tunables.kib_keepalive > 0) && - ktime_after(ktime_get(), - ktime_add_ns(conn->ibc_last_send, keepalive_ns)); + cfs_time_after(jiffies, conn->ibc_last_send + + msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * + MSEC_PER_SEC)); } static inline int -kiblnd_need_noop(struct kib_conn *conn) +kiblnd_need_noop(kib_conn_t *conn) { struct lnet_ni *ni = conn->ibc_peer->ibp_ni; struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -947,7 +972,7 @@ kiblnd_need_noop(struct kib_conn *conn) tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; if (conn->ibc_outstanding_credits < - IBLND_CREDITS_HIGHWATER(tunables, conn) && + IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && !kiblnd_send_keepalive(conn)) return 0; /* No need to send NOOP */ @@ -974,14 +999,14 @@ kiblnd_need_noop(struct kib_conn *conn) } static inline void -kiblnd_abort_receives(struct kib_conn *conn) +kiblnd_abort_receives(kib_conn_t *conn) { ib_modify_qp(conn->ibc_cmid->qp, &kiblnd_data.kib_error_qpa, IB_QP_STATE); } static inline const char * -kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) +kiblnd_queue2str(kib_conn_t *conn, struct list_head *q) { if (q == &conn->ibc_tx_queue) return "tx_queue"; @@ -1032,21 +1057,21 @@ kiblnd_wreqid2type (__u64 wreqid) } static inline void -kiblnd_set_conn_state(struct kib_conn *conn, int state) +kiblnd_set_conn_state (kib_conn_t *conn, int state) 
{ conn->ibc_state = state; smp_mb(); } static inline void -kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) +kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) { msg->ibm_type = type; - msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; } static inline int -kiblnd_rd_size(struct kib_rdma_desc *rd) +kiblnd_rd_size (kib_rdma_desc_t *rd) { int i; int size; @@ -1058,25 +1083,25 @@ kiblnd_rd_size(struct kib_rdma_desc *rd) } static inline __u64 -kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) +kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) { return rd->rd_frags[index].rf_addr; } static inline __u32 -kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) +kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) { return rd->rd_frags[index].rf_nob; } static inline __u32 -kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) +kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) { return rd->rd_key; } static inline int -kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) +kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) { if (nob < rd->rd_frags[index].rf_nob) { rd->rd_frags[index].rf_addr += nob; @@ -1089,14 +1114,14 @@ kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) } static inline int -kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) +kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) { LASSERT (msgtype == IBLND_MSG_GET_REQ || msgtype == IBLND_MSG_PUT_ACK); return msgtype == IBLND_MSG_GET_REQ ? - offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : - offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); + offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : + offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); } static inline __u64 @@ -1154,10 +1179,6 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, return ib_sg_dma_len(dev, sg); } -#ifndef HAVE_RDMA_CONNECT_LOCKED -#define rdma_connect_locked(cmid, cpp) rdma_connect(cmid, cpp) -#endif - /* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly * right because OFED1.2 defines it as const, to use it we have to add * (void *) cast to overcome "const" */ @@ -1165,16 +1186,19 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, #define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) #define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) -void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs); -void kiblnd_map_rx_descs(struct kib_conn *conn); -void kiblnd_unmap_rx_descs(struct kib_conn *conn); -void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); -struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); +#ifdef HAVE_IB_GET_DMA_MR +struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, + int negotiated_nfrags); +#endif +void kiblnd_map_rx_descs(kib_conn_t *conn); +void kiblnd_unmap_rx_descs(kib_conn_t *conn); +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); -int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, - struct kib_rdma_desc *rd, u32 nob, u64 iov, - struct kib_fmr *fmr); -void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, + kib_rdma_desc_t *rd, __u32 nob, __u64 iov, + kib_fmr_t *fmr, bool *is_fastreg); +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); int 
kiblnd_tunables_setup(struct lnet_ni *ni); int kiblnd_tunables_init(void); @@ -1184,45 +1208,43 @@ int kiblnd_scheduler(void *arg); int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); int kiblnd_failover_thread (void *arg); -int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event); int kiblnd_translate_mtu(int value); -int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns); -int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, +int kiblnd_dev_failover(kib_dev_t *dev, struct net *ns); +int kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid); -void kiblnd_destroy_peer(struct kib_peer_ni *peer); -bool kiblnd_reconnect_peer(struct kib_peer_ni *peer); -void kiblnd_destroy_dev(struct kib_dev *dev); -void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni); -struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); -int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, - int version, u64 incarnation); -int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why); - -struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni, - struct rdma_cm_id *cmid, - int state, int version); -void kiblnd_destroy_conn(struct kib_conn *conn); -void kiblnd_close_conn(struct kib_conn *conn, int error); -void kiblnd_close_conn_locked(struct kib_conn *conn, int error); - -void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); -void kiblnd_txlist_done(struct list_head *txlist, int status, - enum lnet_msg_hstatus hstatus); +void kiblnd_destroy_peer (kib_peer_ni_t *peer); +bool kiblnd_reconnect_peer(kib_peer_ni_t *peer); +void kiblnd_destroy_dev (kib_dev_t *dev); +void kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni); +kib_peer_ni_t *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked (kib_peer_ni_t *peer_ni, + int version, __u64 incarnation); +int kiblnd_close_peer_conns_locked (kib_peer_ni_t *peer_ni, int why); + +kib_conn_t *kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn); +void kiblnd_close_conn (kib_conn_t *conn, int error); +void kiblnd_close_conn_locked (kib_conn_t *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status); void kiblnd_qp_event(struct ib_event *event, void *arg); void kiblnd_cq_event(struct ib_event *event, void *arg); void kiblnd_cq_completion(struct ib_cq *cq, void *arg); -void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, +void kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, int credits, lnet_nid_t dstnid, __u64 dststamp); -int kiblnd_unpack_msg(struct kib_msg *msg, int nob); -int kiblnd_post_rx(struct kib_rx *rx, int credit); +int kiblnd_unpack_msg(kib_msg_t *msg, int nob); +int kiblnd_post_rx (kib_rx_t *rx, int credit); -int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, diff --git 
a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index e2eb6c272114f..4b896a52d3bb4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,21 +38,20 @@ #define MAX_CONN_RACES_BEFORE_ABORT 20 -static void kiblnd_peer_alive(struct kib_peer_ni *peer_ni); -static void kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, - int error); -static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, +static void kiblnd_peer_alive(kib_peer_ni_t *peer_ni); +static void kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error); +static void kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob); -static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, u64 dstcookie); -static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); +static int kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie); +static void kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn); +static void kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn); -static void kiblnd_unmap_tx(struct kib_tx *tx); -static void kiblnd_check_sends_locked(struct kib_conn *conn); +static void kiblnd_unmap_tx(kib_tx_t *tx); +static void kiblnd_check_sends_locked(kib_conn_t *conn); void -kiblnd_tx_done(struct kib_tx *tx) +kiblnd_tx_done(kib_tx_t *tx) { struct lnet_msg *lntmsg[2]; int rc; @@ -86,46 +85,39 @@ kiblnd_tx_done(struct kib_tx *tx) if (lntmsg[i] == NULL) continue; - /* propagate health status to LNet for requests */ - if (i == 0 && lntmsg[i]) - lntmsg[i]->msg_health_status = tx->tx_hstatus; - lnet_finalize(lntmsg[i], rc); } } void -kiblnd_txlist_done(struct list_head *txlist, int status, - enum lnet_msg_hstatus hstatus) +kiblnd_txlist_done(struct list_head *txlist, int status) { - struct kib_tx *tx; + kib_tx_t *tx; while (!list_empty(txlist)) { - tx = list_entry(txlist->next, struct kib_tx, tx_list); + tx = list_entry(txlist->next, kib_tx_t, tx_list); list_del(&tx->tx_list); /* complete now */ tx->tx_waiting = 0; tx->tx_status = status; - if (hstatus != LNET_MSG_STATUS_OK) - tx->tx_hstatus = hstatus; kiblnd_tx_done(tx); } } -static struct kib_tx * +static kib_tx_t * kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) { - struct kib_net *net = ni->ni_data; - struct list_head *node; - struct kib_tx *tx; - struct kib_tx_poolset *tps; + kib_net_t *net = (kib_net_t *)ni->ni_data; + struct list_head *node; + kib_tx_t *tx; + kib_tx_poolset_t *tps; tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; node = kiblnd_pool_alloc_node(&tps->tps_poolset); if (node == NULL) return NULL; - tx = container_of(node, struct kib_tx, tx_list); + tx = container_of(node, kib_tx_t, tx_list); LASSERT (tx->tx_nwrq == 0); LASSERT (!tx->tx_queued); @@ -137,18 +129,15 @@ kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) LASSERT (tx->tx_lntmsg[1] == NULL); LASSERT (tx->tx_nfrags == 0); - tx->tx_gaps = false; - tx->tx_hstatus = LNET_MSG_STATUS_OK; - 
return tx; } static void -kiblnd_drop_rx(struct kib_rx *rx) +kiblnd_drop_rx(kib_rx_t *rx) { - struct kib_conn *conn = rx->rx_conn; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; + kib_conn_t *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; spin_lock_irqsave(&sched->ibs_lock, flags); LASSERT(conn->ibc_nrx > 0); @@ -159,15 +148,15 @@ kiblnd_drop_rx(struct kib_rx *rx) } int -kiblnd_post_rx(struct kib_rx *rx, int credit) +kiblnd_post_rx (kib_rx_t *rx, int credit) { - struct kib_conn *conn = rx->rx_conn; - struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; - struct ib_recv_wr *bad_wrq = NULL; + kib_conn_t *conn = rx->rx_conn; + kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; #ifdef HAVE_IB_GET_DMA_MR - struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; #endif - int rc; + int rc; LASSERT (net != NULL); LASSERT (!in_interrupt()); @@ -240,13 +229,13 @@ kiblnd_post_rx(struct kib_rx *rx, int credit) return rc; } -static struct kib_tx * -kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) +static kib_tx_t * +kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) { struct list_head *tmp; list_for_each(tmp, &conn->ibc_active_txs) { - struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list); + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); LASSERT(!tx->tx_queued); LASSERT(tx->tx_sending != 0 || tx->tx_waiting); @@ -266,11 +255,11 @@ kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) } static void -kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cookie) +kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) { - struct kib_tx *tx; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int idle; + kib_tx_t *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; spin_lock(&conn->ibc_lock); @@ -279,24 +268,23 @@ kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cook spin_unlock(&conn->ibc_lock); CWARN("Unmatched completion type %x cookie %#llx from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_close_conn(conn, -EPROTO); - return; - } + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } - if (tx->tx_status == 0) { /* success so far */ - if (status < 0) { /* failed? */ - tx->tx_status = status; - tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; - } else if (txtype == IBLND_MSG_GET_REQ) { - lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); - } - } + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + tx->tx_status = status; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } - tx->tx_waiting = 0; + tx->tx_waiting = 0; - idle = !tx->tx_queued && (tx->tx_sending == 0); - if (idle) + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) list_del(&tx->tx_list); spin_unlock(&conn->ibc_lock); @@ -306,10 +294,10 @@ kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cook } static void -kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) +kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) { - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); if (tx == NULL) { CERROR("Can't get tx for completion %x for %s\n", @@ -319,19 +307,19 @@ kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) tx->tx_msg->ibm_u.completion.ibcm_status = status; tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); + kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); kiblnd_queue_tx(tx, conn); } static void -kiblnd_handle_rx(struct kib_rx *rx) +kiblnd_handle_rx (kib_rx_t *rx) { - struct kib_msg *msg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; int credits = msg->ibm_credits; - struct kib_tx *tx; + kib_tx_t *tx; int rc = 0; int rc2; int post_credit; @@ -486,14 +474,14 @@ kiblnd_handle_rx(struct kib_rx *rx) } static void -kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) +kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) { - struct kib_msg *msg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_net *net = ni->ni_data; - int rc; - int err = -EIO; + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + kib_net_t *net = ni->ni_data; + int rc; + int err = -EIO; LASSERT (net != NULL); LASSERT (rx->rx_nob < 0); /* was posted */ @@ -557,112 +545,47 @@ kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) } static int -kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, - struct kib_rdma_desc *rd, u32 nob) +kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob) { - struct kib_hca_dev *hdev; - struct kib_dev *dev; - struct kib_fmr_poolset *fps; + kib_hca_dev_t *hdev; + kib_fmr_poolset_t *fps; int cpt; int rc; - int i; + bool is_fastreg = 0; LASSERT(tx->tx_pool != NULL); LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); - dev = net->ibn_dev; hdev = tx->tx_pool->tpo_hdev; cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; - /* - * If we're dealing with FastReg, but the device doesn't - * support GAPS and the tx has GAPS, then there is no real point - * in trying to map the memory, because it'll just fail. So - * preemptively fail with an appropriate message - */ - if ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) && - !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) && - tx->tx_gaps) { - CERROR("Using FastReg with no GAPS support, but tx has gaps. 
" - "Try setting use_fastreg_gaps to 1\n"); - return -EPROTONOSUPPORT; - } - -#ifdef HAVE_FMR_POOL_API - /* - * FMR does not support gaps but the tx has gaps then - * we should make sure that the number of fragments we'll be sending - * over fits within the number of fragments negotiated on the - * connection, otherwise, we won't be able to RDMA the data. - * We need to maintain the number of fragments negotiation on the - * connection for backwards compatibility. - */ - if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) { - if (tx->tx_conn && - tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) { - CERROR("TX number of frags (%d) is <= than connection" - " number of frags (%d). Consider setting peer's" - " map_on_demand to 256\n", tx->tx_nfrags, - tx->tx_conn->ibc_max_frags); - return -EFBIG; - } - } -#endif - fps = net->ibn_fmr_ps[cpt]; - rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr); + rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr, &is_fastreg); if (rc != 0) { - CERROR("Can't map %u bytes (%u/%u)s: %d\n", nob, - tx->tx_nfrags, rd->rd_nfrags, rc); + CERROR("Can't map %u pages: %d\n", nob, rc); return rc; } - /* - * If rd is not tx_rd, it's going to get sent to a peer_ni, who will - * need the rkey - */ - rd->rd_key = tx->tx_fmr.fmr_key; - /* - * for FastReg or FMR with no gaps we can accumulate all - * the fragments in one FastReg or FMR fragment. - */ - if ( -#ifdef HAVE_FMR_POOL_API - ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) - && !tx->tx_gaps) || -#endif - (dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)) { - /* FMR requires zero based address */ -#ifdef HAVE_FMR_POOL_API - if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) - rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; -#endif - rd->rd_frags[0].rf_nob = nob; - rd->rd_nfrags = 1; - } else { - /* - * We're transmitting with gaps using FMR. - * We'll need to use multiple fragments and identify the - * zero based address of each fragment. - */ - for (i = 0; i < rd->rd_nfrags; i++) { - rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask; - rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift; - } - } + /* If rd is not tx_rd, it's going to get sent to a peer_ni, who will need + * the rkey */ + rd->rd_key = tx->fmr.fmr_key; + if (!is_fastreg) + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; return 0; } static void -kiblnd_unmap_tx(struct kib_tx *tx) +kiblnd_unmap_tx(kib_tx_t *tx) { if ( #ifdef HAVE_FMR_POOL_API - tx->tx_fmr.fmr_pfmr || + tx->fmr.fmr_pfmr || #endif - tx->tx_fmr.fmr_frd) - kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); + tx->fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); if (tx->tx_nfrags != 0) { kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, @@ -671,46 +594,13 @@ kiblnd_unmap_tx(struct kib_tx *tx) } } -#ifdef HAVE_IB_GET_DMA_MR -static struct ib_mr * -kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) -{ - struct kib_net *net = ni->ni_data; - struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - - tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; - - /* - * if map-on-demand is turned on and the device supports - * either FMR or FastReg then use that. Otherwise use global - * memory regions. If that's not available either, then you're - * dead in the water and fail the operation. 
- */ - if (tunables->lnd_map_on_demand && - (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED -#ifdef HAVE_FMR_POOL_API - || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED -#endif - )) - return NULL; - - /* - * hdev->ibh_mrs can be NULL. This case is dealt with gracefully - * in the call chain. The mapping will fail with appropriate error - * message. - */ - return hdev->ibh_mrs; -} -#endif - -static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, int nfrags) +static int +kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) { - struct kib_net *net = ni->ni_data; - struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; + kib_net_t *net = ni->ni_data; + kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; #ifdef HAVE_IB_GET_DMA_MR - struct ib_mr *mr = NULL; + struct ib_mr *mr = NULL; #endif __u32 nob; int i; @@ -732,7 +622,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, } #ifdef HAVE_IB_GET_DMA_MR - mr = kiblnd_find_rd_dma_mr(ni, rd); + mr = kiblnd_find_rd_dma_mr(ni, rd, + (tx->tx_conn != NULL) ? + tx->tx_conn->ibc_max_frags : -1); if (mr != NULL) { /* found pre-mapping MR */ rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; @@ -746,17 +638,17 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, return -EINVAL; } -static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, unsigned int niov, - struct kvec *iov, int offset, int nob) + +static int +kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct kvec *iov, int offset, int nob) { - struct kib_net *net = ni->ni_data; - struct page *page; + kib_net_t *net = ni->ni_data; + struct page *page; struct scatterlist *sg; unsigned long vaddr; int fragnob; int page_offset; - unsigned int max_niov; LASSERT (nob > 0); LASSERT (niov > 0); @@ -769,8 +661,6 @@ static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, LASSERT (niov > 0); } - max_niov = niov; - sg = tx->tx_frags; do { LASSERT(niov > 0); @@ -786,20 +676,6 @@ static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, fragnob = min((int)(iov->iov_len - offset), nob); fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - /* - * We're allowed to start at a non-aligned page offset in - * the first fragment and end at a non-aligned page offset - * in the last fragment. 
- */ - if ((fragnob < (int)PAGE_SIZE - page_offset) && - (niov < max_niov) && nob > fragnob) { - CDEBUG(D_NET, "fragnob %d < available page %d: with" - " remaining %d iovs with %d nob left\n", - fragnob, (int)PAGE_SIZE - page_offset, niov, - nob); - tx->tx_gaps = true; - } - sg_set_page(sg, page, fragnob, page_offset); sg = sg_next(sg); if (!sg) { @@ -820,49 +696,32 @@ static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); } -static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, int nkiov, - lnet_kiov_t *kiov, int offset, int nob) +static int +kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { - struct kib_net *net = ni->ni_data; - struct scatterlist *sg; - int fragnob; - int max_nkiov; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT(nob > 0); - LASSERT(nkiov > 0); - LASSERT(net != NULL); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT(nkiov > 0); - } + kib_net_t *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; - max_nkiov = nkiov; + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - sg = tx->tx_frags; - do { - LASSERT(nkiov > 0); + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (net != NULL); - fragnob = min((int)(kiov->kiov_len - offset), nob); + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } - /* - * We're allowed to start at a non-aligned page offset in - * the first fragment and end at a non-aligned page offset - * in the last fragment. - */ - if ((fragnob < (int)(kiov->kiov_len - offset)) && - nkiov < max_nkiov && nob > fragnob) { - CDEBUG(D_NET, "fragnob %d < available page %d: with" - " remaining %d kiovs with %d nob left\n", - fragnob, (int)(kiov->kiov_len - offset), - nkiov, nob); - tx->tx_gaps = true; - } + sg = tx->tx_frags; + do { + LASSERT (nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); sg_set_page(sg, kiov->kiov_page, fragnob, kiov->kiov_offset + offset); @@ -872,23 +731,22 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, return -EFAULT; } - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); } static int -kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) +kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit) __must_hold(&conn->ibc_lock) { - struct kib_msg *msg = tx->tx_msg; - struct kib_peer_ni *peer_ni = conn->ibc_peer; + kib_msg_t *msg = tx->tx_msg; + kib_peer_ni_t *peer_ni = conn->ibc_peer; struct lnet_ni *ni = peer_ni->ibp_ni; - struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; int ver = conn->ibc_version; int rc; int done; @@ -906,11 +764,11 @@ __must_hold(&conn->ibc_lock) if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) { - /* tx completions outstanding... */ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(peer_ni->ibp_nid)); - return -EAGAIN; - } + /* tx completions outstanding... 
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ CDEBUG(D_NET, "%s: no credits\n", @@ -938,7 +796,6 @@ __must_hold(&conn->ibc_lock) * kiblnd_check_sends_locked will queue NOOP again when * posted NOOPs complete */ spin_unlock(&conn->ibc_lock); - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); spin_lock(&conn->ibc_lock); CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", @@ -973,10 +830,11 @@ __must_hold(&conn->ibc_lock) /* close_conn will launch failover */ rc = -ENETDOWN; } else { + struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd; struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; struct ib_send_wr *wr = &tx->tx_wrq[0].wr; - if (frd != NULL && !frd->frd_posted) { + if (frd != NULL) { if (!frd->frd_valid) { wr = &frd->frd_inv_wr.wr; wr->next = &frd->frd_fastreg_wr.wr; @@ -992,24 +850,18 @@ __must_hold(&conn->ibc_lock) libcfs_nid2str(conn->ibc_peer->ibp_nid)); bad = NULL; - if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) - rc = -EINVAL; - else #ifdef HAVE_IB_POST_SEND_RECV_CONST - rc = ib_post_send(conn->ibc_cmid->qp, wr, - (const struct ib_send_wr **)&bad); + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); #else - rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); #endif } - conn->ibc_last_send = ktime_get(); + conn->ibc_last_send = jiffies; - if (rc == 0) { - if (frd != NULL) - frd->frd_posted = true; - return 0; - } + if (rc == 0) + return 0; /* NB credits are transferred in the actual * message, which can only be the last work item */ @@ -1047,11 +899,11 @@ __must_hold(&conn->ibc_lock) } static void -kiblnd_check_sends_locked(struct kib_conn *conn) +kiblnd_check_sends_locked(kib_conn_t *conn) { - int ver = conn->ibc_version; + int ver = conn->ibc_version; struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_tx *tx; + kib_tx_t *tx; /* Don't send anything until after the connection is established */ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { @@ -1069,7 +921,7 @@ kiblnd_check_sends_locked(struct kib_conn *conn) while (conn->ibc_reserved_credits > 0 && !list_empty(&conn->ibc_tx_queue_rsrvd)) { tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - struct kib_tx, tx_list); + kib_tx_t, tx_list); list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); conn->ibc_reserved_credits--; @@ -1093,16 +945,16 @@ kiblnd_check_sends_locked(struct kib_conn *conn) if (!list_empty(&conn->ibc_tx_queue_nocred)) { credit = 0; tx = list_entry(conn->ibc_tx_queue_nocred.next, - struct kib_tx, tx_list); + kib_tx_t, tx_list); } else if (!list_empty(&conn->ibc_tx_noops)) { LASSERT (!IBLND_OOB_CAPABLE(ver)); credit = 1; tx = list_entry(conn->ibc_tx_noops.next, - struct kib_tx, tx_list); + kib_tx_t, tx_list); } else if (!list_empty(&conn->ibc_tx_queue)) { credit = 1; tx = list_entry(conn->ibc_tx_queue.next, - struct kib_tx, tx_list); + kib_tx_t, tx_list); } else break; @@ -1112,30 +964,26 @@ kiblnd_check_sends_locked(struct kib_conn *conn) } static void -kiblnd_tx_complete(struct kib_tx *tx, int status) +kiblnd_tx_complete (kib_tx_t *tx, int status) { - int failed = (status != IB_WC_SUCCESS); - struct kib_conn *conn = tx->tx_conn; - int idle; + int failed = (status != IB_WC_SUCCESS); + kib_conn_t *conn = tx->tx_conn; + int idle; - if (tx->tx_sending <= 0) { - CERROR("Received an event on a freed tx: %p status %d\n", - tx, tx->tx_status); - return; - } + 
LASSERT (tx->tx_sending > 0); - if (failed) { - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) CNETERR("Tx -> %s cookie %#llx" - " sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, - status); + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); - kiblnd_close_conn(conn, -EIO); - } else { - kiblnd_peer_alive(conn->ibc_peer); - } + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } spin_lock(&conn->ibc_lock); @@ -1148,7 +996,6 @@ kiblnd_tx_complete(struct kib_tx *tx, int status) conn->ibc_noops_posted--; if (failed) { - tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; tx->tx_waiting = 0; /* don't wait for peer_ni */ tx->tx_status = -EIO; } @@ -1167,13 +1014,12 @@ kiblnd_tx_complete(struct kib_tx *tx, int status) } static void -kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, - int body_nob) +kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob) { - struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; struct ib_sge *sge = &tx->tx_msgsge; struct ib_rdma_wr *wrq; - int nob = offsetof(struct kib_msg, ibm_u) + body_nob; + int nob = offsetof(kib_msg_t, ibm_u) + body_nob; #ifdef HAVE_IB_GET_DMA_MR struct ib_mr *mr = hdev->ibh_mrs; #endif @@ -1209,11 +1055,11 @@ kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, } static int -kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, u64 dstcookie) +kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) { - struct kib_msg *ibmsg = tx->tx_msg; - struct kib_rdma_desc *srcrd = tx->tx_rd; + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; struct ib_rdma_wr *wrq = NULL; struct ib_sge *sge; int rc = resid; @@ -1301,39 +1147,24 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, ibmsg->ibm_u.completion.ibcm_status = rc; ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, - type, sizeof(struct kib_completion_msg)); + type, sizeof (kib_completion_msg_t)); return rc; } static void -kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) +kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) { struct list_head *q; - s64 timeout_ns; LASSERT(tx->tx_nwrq > 0); /* work items set up */ LASSERT(!tx->tx_queued); /* not queued for sending already */ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) { - tx->tx_status = -ECONNABORTED; - tx->tx_waiting = 0; - if (tx->tx_conn != NULL) { - /* PUT_DONE first attached to conn as a PUT_REQ */ - LASSERT(tx->tx_conn == conn); - LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); - tx->tx_conn = NULL; - kiblnd_conn_decref(conn); - } - list_add(&tx->tx_list, &conn->ibc_zombie_txs); - - return; - } - - timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC; tx->tx_queued = 1; - tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); + tx->tx_deadline = jiffies + + msecs_to_jiffies(*kiblnd_tunables.kib_timeout * + MSEC_PER_SEC); if (tx->tx_conn == NULL) { kiblnd_conn_addref(conn); @@ -1377,7 +1208,7 @@ kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) } static void -kiblnd_queue_tx(struct kib_tx *tx, 
struct kib_conn *conn) +kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn) { spin_lock(&conn->ibc_lock); kiblnd_queue_tx_locked(tx, conn); @@ -1423,14 +1254,14 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, } static void -kiblnd_connect_peer(struct kib_peer_ni *peer_ni) +kiblnd_connect_peer (kib_peer_ni_t *peer_ni) { struct rdma_cm_id *cmid; - struct kib_dev *dev; - struct kib_net *net = peer_ni->ibp_ni->ni_data; + kib_dev_t *dev; + kib_net_t *net = peer_ni->ibp_ni->ni_data; struct sockaddr_in srcaddr; struct sockaddr_in dstaddr; - int rc; + int rc; LASSERT (net != NULL); LASSERT (peer_ni->ibp_connecting > 0); @@ -1458,21 +1289,21 @@ kiblnd_connect_peer(struct kib_peer_ni *peer_ni) kiblnd_peer_addref(peer_ni); /* cmid's ref */ - if (*kiblnd_tunables.kib_use_priv_port) { - rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, - lnet_get_lnd_timeout() * 1000); - } else { - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, - lnet_get_lnd_timeout() * 1000); - } - if (rc != 0) { - /* Can't initiate address resolution: */ - CERROR("Can't resolve addr for %s: %d\n", - libcfs_nid2str(peer_ni->ibp_nid), rc); - goto failed2; - } + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } return; @@ -1486,7 +1317,7 @@ kiblnd_connect_peer(struct kib_peer_ni *peer_ni) } bool -kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) +kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) { rwlock_t *glock = &kiblnd_data.kib_global_lock; char *reason = NULL; @@ -1532,18 +1363,17 @@ kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) CWARN("Abort reconnection of %s: %s\n", libcfs_nid2str(peer_ni->ibp_nid), reason); - kiblnd_txlist_done(&txs, -ECONNABORTED, - LNET_MSG_STATUS_LOCAL_ABORTED); + kiblnd_txlist_done(&txs, -ECONNABORTED); return false; } void -kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) +kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) { - struct kib_peer_ni *peer_ni; - struct kib_peer_ni *peer2; - struct kib_conn *conn; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + kib_peer_ni_t *peer_ni; + kib_peer_ni_t *peer2; + kib_conn_t *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; unsigned long flags; int rc; int i; @@ -1608,7 +1438,6 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) if (tx != NULL) { tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); } return; @@ -1646,7 +1475,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0); + LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); if (tx != NULL) list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); @@ -1674,9 +1503,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; - struct kib_msg *ibmsg; - struct 
kib_rdma_desc *rd; - struct kib_tx *tx; + kib_msg_t *ibmsg; + kib_rdma_desc_t *rd; + kib_tx_t *tx; int nob; int rc; @@ -1707,7 +1536,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) break; /* send IMMEDIATE */ /* is the REPLY message too small for RDMA? */ - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ @@ -1733,12 +1562,11 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (rc != 0) { CERROR("Can't setup GET sink for %s: %d\n", libcfs_nid2str(target.nid), rc); - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); return -EIO; } - nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[rd->rd_nfrags]); ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; ibmsg->ibm_u.get.ibgm_hdr = *hdr; @@ -1760,7 +1588,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) case LNET_MSG_REPLY: case LNET_MSG_PUT: /* Is the payload small enough not to need RDMA? */ - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ @@ -1790,8 +1618,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) ibmsg = tx->tx_msg; ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, - sizeof(struct kib_putreq_msg)); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ @@ -1799,9 +1626,10 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) return 0; } - /* send IMMEDIATE */ - LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBLND_MSG_SIZE); + /* send IMMEDIATE */ + + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); tx = kiblnd_get_idle_tx(ni, target.nid); if (tx == NULL) { @@ -1815,16 +1643,16 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (payload_kiov != NULL) lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), payload_niov, payload_kiov, payload_offset, payload_nob); else lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), payload_niov, payload_iov, payload_offset, payload_nob); - nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ @@ -1833,7 +1661,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) } static void -kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) +kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) { struct lnet_process_id target = lntmsg->msg_target; unsigned int niov = lntmsg->msg_niov; @@ -1841,7 +1669,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) lnet_kiov_t *kiov = 
lntmsg->msg_kiov; unsigned int offset = lntmsg->msg_offset; unsigned int nob = lntmsg->msg_len; - struct kib_tx *tx; + kib_tx_t *tx; int rc; tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); @@ -1888,11 +1716,9 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) kiblnd_queue_tx(tx, rx->rx_conn); return; - -failed_1: - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + failed_1: kiblnd_tx_done(tx); -failed_0: + failed_0: lnet_finalize(lntmsg, -EIO); } @@ -1901,10 +1727,10 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { - struct kib_rx *rx = private; - struct kib_msg *rxmsg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct kib_tx *tx; + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; __u64 ibprm_cookie; int nob; int post_credit = IBLND_POSTRX_PEER_CREDIT; @@ -1920,7 +1746,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, LBUG(); case IBLND_MSG_IMMEDIATE: - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); if (nob > rx->rx_nob) { CERROR ("Immediate message from %s too big: %d(%d)\n", libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), @@ -1932,19 +1758,19 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, if (kiov != NULL) lnet_copy_flat2kiov(niov, kiov, offset, IBLND_MSG_SIZE, rxmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); else lnet_copy_flat2iov(niov, iov, offset, IBLND_MSG_SIZE, rxmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); lnet_finalize(lntmsg, 0); break; case IBLND_MSG_PUT_REQ: { - struct kib_msg *txmsg; - struct kib_rdma_desc *rd; + kib_msg_t *txmsg; + kib_rdma_desc_t *rd; ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; if (mlen == 0) { @@ -1974,7 +1800,6 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, if (rc != 0) { CERROR("Can't setup PUT sink for %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); /* tell peer_ni it's over */ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, @@ -1982,7 +1807,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, break; } - nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[rd->rd_nfrags]); txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; @@ -2033,18 +1858,18 @@ kiblnd_thread_fini (void) } static void -kiblnd_peer_alive(struct kib_peer_ni *peer_ni) +kiblnd_peer_alive (kib_peer_ni_t *peer_ni) { - /* This is racy, but everyone's only writing ktime_get_seconds() */ - peer_ni->ibp_last_alive = ktime_get_seconds(); + /* This is racy, but everyone's only writing cfs_time_current() */ + peer_ni->ibp_last_alive = cfs_time_current(); smp_mb(); } static void -kiblnd_peer_notify(struct kib_peer_ni *peer_ni) +kiblnd_peer_notify (kib_peer_ni_t *peer_ni) { int error = 0; - time64_t last_alive = 0; + cfs_time_t last_alive = 0; unsigned long flags; read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -2064,7 +1889,7 @@ kiblnd_peer_notify(struct kib_peer_ni 
*peer_ni) } void -kiblnd_close_conn_locked(struct kib_conn *conn, int error) +kiblnd_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immediate housekeeping. 'error' is zero for a * normal shutdown which can happen only after the connection has been @@ -2072,9 +1897,9 @@ kiblnd_close_conn_locked(struct kib_conn *conn, int error) * connection to be finished off by the connd. Otherwise the connd is * already dealing with it (either to set it up or tear it down). * Caller holds kib_global_lock exclusively in irq context */ - struct kib_peer_ni *peer_ni = conn->ibc_peer; - struct kib_dev *dev; - unsigned long flags; + kib_peer_ni_t *peer_ni = conn->ibc_peer; + kib_dev_t *dev; + unsigned long flags; LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -2104,7 +1929,7 @@ kiblnd_close_conn_locked(struct kib_conn *conn, int error) list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); } - dev = ((struct kib_net *)peer_ni->ibp_ni->ni_data)->ibn_dev; + dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev; if (peer_ni->ibp_next_conn == conn) /* clear next_conn so it won't be used */ peer_ni->ibp_next_conn = NULL; @@ -2137,7 +1962,7 @@ kiblnd_close_conn_locked(struct kib_conn *conn, int error) } void -kiblnd_close_conn(struct kib_conn *conn, int error) +kiblnd_close_conn(kib_conn_t *conn, int error) { unsigned long flags; @@ -2149,10 +1974,10 @@ kiblnd_close_conn(struct kib_conn *conn, int error) } static void -kiblnd_handle_early_rxs(struct kib_conn *conn) +kiblnd_handle_early_rxs(kib_conn_t *conn) { - unsigned long flags; - struct kib_rx *rx; + unsigned long flags; + kib_rx_t *rx; LASSERT(!in_interrupt()); LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -2160,7 +1985,7 @@ kiblnd_handle_early_rxs(struct kib_conn *conn) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); while (!list_empty(&conn->ibc_early_rxs)) { rx = list_entry(conn->ibc_early_rxs.next, - struct kib_rx, rx_list); + kib_rx_t, rx_list); list_del(&rx->rx_list); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -2171,52 +1996,30 @@ kiblnd_handle_early_rxs(struct kib_conn *conn) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); } -void -kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) +static void +kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) { struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *tmp; struct list_head *nxt; - struct kib_tx *tx; + kib_tx_t *tx; spin_lock(&conn->ibc_lock); list_for_each_safe(tmp, nxt, txs) { - tx = list_entry(tmp, struct kib_tx, tx_list); + tx = list_entry(tmp, kib_tx_t, tx_list); if (txs == &conn->ibc_active_txs) { LASSERT(!tx->tx_queued); LASSERT(tx->tx_waiting || tx->tx_sending != 0); - if (conn->ibc_comms_error == -ETIMEDOUT) { - if (tx->tx_waiting && !tx->tx_sending) - tx->tx_hstatus = - LNET_MSG_STATUS_REMOTE_TIMEOUT; - else if (tx->tx_sending) - tx->tx_hstatus = - LNET_MSG_STATUS_NETWORK_TIMEOUT; - } } else { LASSERT(tx->tx_queued); - if (conn->ibc_comms_error == -ETIMEDOUT) - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; - else - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; } tx->tx_status = -ECONNABORTED; tx->tx_waiting = 0; - /* - * TODO: This makes an assumption that - * kiblnd_tx_complete() will be called for each tx. If - * that event is dropped we could end up with stale - * connections floating around. We'd like to deal with - * that in a better way. - * - * Also that means we can exceed the timeout by many - * seconds. 
- */ if (tx->tx_sending == 0) { tx->tx_queued = 0; list_del(&tx->tx_list); @@ -2226,28 +2029,22 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) spin_unlock(&conn->ibc_lock); - /* - * aborting transmits occurs when finalizing the connection. - * The connection is finalized on error. - * Passing LNET_MSG_STATUS_OK to txlist_done() will not - * override the value already set in tx->tx_hstatus above. - */ - kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); + kiblnd_txlist_done(&zombies, -ECONNABORTED); } static void -kiblnd_finalise_conn(struct kib_conn *conn) +kiblnd_finalise_conn (kib_conn_t *conn) { LASSERT (!in_interrupt()); LASSERT (conn->ibc_state > IBLND_CONN_INIT); + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + /* abort_receives moves QP state to IB_QPS_ERR. This is only required * for connections that didn't get as far as being connected, because * rdma_disconnect() does this for free. */ kiblnd_abort_receives(conn); - kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); - /* Complete all tx descs not waiting for sends to complete. * NB we should be safe from RDMA now that the QP has changed state */ @@ -2261,8 +2058,7 @@ kiblnd_finalise_conn(struct kib_conn *conn) } static void -kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, - int error) +kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) { struct list_head zombies = LIST_HEAD_INIT(zombies); unsigned long flags; @@ -2290,7 +2086,8 @@ kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, peer_ni->ibp_reconnected = 0; if (list_empty(&peer_ni->ibp_conns)) { /* Take peer_ni's blocked transmits to complete with error */ - list_splice_init(&peer_ni->ibp_tx_queue, &zombies); + list_add(&zombies, &peer_ni->ibp_tx_queue); + list_del_init(&peer_ni->ibp_tx_queue); if (kiblnd_peer_active(peer_ni)) kiblnd_unlink_peer_locked(peer_ni); @@ -2311,15 +2108,14 @@ kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, CNETERR("Deleting messages for %s: connection failed\n", libcfs_nid2str(peer_ni->ibp_nid)); - kiblnd_txlist_done(&zombies, error, - LNET_MSG_STATUS_LOCAL_DROPPED); + kiblnd_txlist_done(&zombies, -EHOSTUNREACH); } static void -kiblnd_connreq_done(struct kib_conn *conn, int status) +kiblnd_connreq_done(kib_conn_t *conn, int status) { - struct kib_peer_ni *peer_ni = conn->ibc_peer; - struct kib_tx *tx; + kib_peer_ni_t *peer_ni = conn->ibc_peer; + kib_tx_t *tx; struct list_head txs; unsigned long flags; int active; @@ -2336,23 +2132,20 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && peer_ni->ibp_accepting > 0)); - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status != 0) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer_ni, active, status); - kiblnd_finalise_conn(conn); - return; - } + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } - /* connection established */ + /* connection established */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - /* reset retry count */ - peer_ni->ibp_retries = 0; - - conn->ibc_last_send = ktime_get(); + conn->ibc_last_send = jiffies; kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); kiblnd_peer_alive(peer_ni); @@ -2390,8 +2183,7 @@ kiblnd_connreq_done(struct 
kib_conn *conn, int status) kiblnd_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - kiblnd_txlist_done(&txs, -ECONNABORTED, - LNET_MSG_STATUS_LOCAL_ERROR); + kiblnd_txlist_done(&txs, -ECONNABORTED); return; } @@ -2411,7 +2203,7 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) */ spin_lock(&conn->ibc_lock); while (!list_empty(&txs)) { - tx = list_entry(txs.next, struct kib_tx, tx_list); + tx = list_entry(txs.next, kib_tx_t, tx_list); list_del(&tx->tx_list); kiblnd_queue_tx_locked(tx, conn); @@ -2425,7 +2217,7 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) } static void -kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) +kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) { int rc; @@ -2443,17 +2235,17 @@ static int kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) { rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - struct kib_msg *reqmsg = priv; - struct kib_msg *ackmsg; - struct kib_dev *ibdev; - struct kib_peer_ni *peer_ni; - struct kib_peer_ni *peer2; - struct kib_conn *conn; - struct lnet_ni *ni = NULL; - struct kib_net *net = NULL; + kib_msg_t *reqmsg = priv; + kib_msg_t *ackmsg; + kib_dev_t *ibdev; + kib_peer_ni_t *peer_ni; + kib_peer_ni_t *peer2; + kib_conn_t *conn; + struct lnet_ni *ni = NULL; + kib_net_t *net = NULL; lnet_nid_t nid; struct rdma_conn_param cp; - struct kib_rej rej; + kib_rej_t rej; int version = IBLND_MSG_VERSION; unsigned long flags; int rc; @@ -2461,8 +2253,8 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) LASSERT (!in_interrupt()); /* cmid inherits 'context' from the corresponding listener id */ - ibdev = cmid->context; - LASSERT(ibdev); + ibdev = (kib_dev_t *)cmid->context; + LASSERT (ibdev != NULL); memset(&rej, 0, sizeof(rej)); rej.ibr_magic = IBLND_MSG_MAGIC; @@ -2478,7 +2270,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (priv_nob < offsetof(struct kib_msg, ibm_type)) { + if (priv_nob < offsetof(kib_msg_t, ibm_type)) { CERROR("Short connection request\n"); goto failed; } @@ -2511,7 +2303,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); if (ni != NULL) { - net = (struct kib_net *)ni->ni_data; + net = (kib_net_t *)ni->ni_data; rej.ibr_incarnation = net->ibn_incarnation; } @@ -2560,26 +2352,26 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } if (reqmsg->ibm_u.connparams.ibcp_max_frags > - IBLND_MAX_RDMA_FRAGS) { + kiblnd_rdma_frags(version, ni)) { CWARN("Can't accept conn from %s (version %x): " "max_frags %d too large (%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, - IBLND_MAX_RDMA_FRAGS); + kiblnd_rdma_frags(version, ni)); if (version >= IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; goto failed; } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < - IBLND_MAX_RDMA_FRAGS && + kiblnd_rdma_frags(version, ni) && net->ibn_fmr_ps == NULL) { CWARN("Can't accept conn from %s (version %x): " "max_frags %d incompatible without FMR pool " "(%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, - IBLND_MAX_RDMA_FRAGS); + kiblnd_rdma_frags(version, ni)); if (version == IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; @@ -2753,7 +2545,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (ni != NULL) { rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, 
ni); - rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; + rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); lnet_ni_decref(ni); } @@ -2764,11 +2556,11 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } static void -kiblnd_check_reconnect(struct kib_conn *conn, int version, - u64 incarnation, int why, struct kib_connparams *cp) +kiblnd_check_reconnect(kib_conn_t *conn, int version, + __u64 incarnation, int why, kib_connparams_t *cp) { rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer_ni *peer_ni = conn->ibc_peer; + kib_peer_ni_t *peer_ni = conn->ibc_peer; char *reason; int msg_size = IBLND_MSG_SIZE; int frag_num = -1; @@ -2800,15 +2592,10 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, goto out; } - if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { - reason = "retry count exceeded due to no listener"; - goto out; - } - - switch (why) { - default: - reason = "Unknown"; - break; + switch (why) { + default: + reason = "Unknown"; + break; case IBLND_REJECT_RDMA_FRAGS: { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2818,16 +2605,10 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, goto out; } tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; -#ifdef HAVE_IB_GET_DMA_MR - /* - * This check only makes sense if the kernel supports global - * memory registration. Otherwise, map_on_demand will never == 0 - */ if (!tunables->lnd_map_on_demand) { reason = "map_on_demand must be enabled"; goto out; } -#endif if (conn->ibc_max_frags <= frag_num) { reason = "unsupported max frags"; goto out; @@ -2889,9 +2670,9 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, } static void -kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) +kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) { - struct kib_peer_ni *peer_ni = conn->ibc_peer; + kib_peer_ni_t *peer_ni = conn->ibc_peer; LASSERT (!in_interrupt()); LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); @@ -2903,18 +2684,17 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) break; case IB_CM_REJ_INVALID_SERVICE_ID: - peer_ni->ibp_retries++; kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, IBLND_REJECT_INVALID_SRV_ID, NULL); - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - *kiblnd_tunables.kib_service); - break; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; case IB_CM_REJ_CONSUMER_DEFINED: - if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { - struct kib_rej *rej = priv; - struct kib_connparams *cp = NULL; + if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { + kib_rej_t *rej = priv; + kib_connparams_t *cp = NULL; int flip = 0; __u64 incarnation = -1; @@ -2927,7 +2707,7 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) * it rejected me then upgrade to V2, I have no idea * about the upgrading and try to reconnect with V1, * in this case upgraded V2 can find out I'm trying to - * talk to the old guy and reject me(incarnation is -1). + * talk to the old guy and reject me(incarnation is -1). 
*/ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || @@ -2937,7 +2717,7 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) flip = 1; } - if (priv_nob >= sizeof(struct kib_rej) && + if (priv_nob >= sizeof(kib_rej_t) && rej->ibr_version > IBLND_MSG_VERSION_1) { /* priv_nob is always 148 in current version * of OFED, so we still need to check version. @@ -3017,12 +2797,12 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) } static void -kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) +kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) { - struct kib_peer_ni *peer_ni = conn->ibc_peer; - struct lnet_ni *ni = peer_ni->ibp_ni; - struct kib_net *net = ni->ni_data; - struct kib_msg *msg = priv; + kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + kib_net_t *net = ni->ni_data; + kib_msg_t *msg = priv; int ver = conn->ibc_version; int rc = kiblnd_unpack_msg(msg, priv_nob); unsigned long flags; @@ -3118,12 +2898,12 @@ kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) } static int -kiblnd_active_connect(struct rdma_cm_id *cmid) +kiblnd_active_connect (struct rdma_cm_id *cmid) { - struct kib_peer_ni *peer_ni = cmid->context; - struct kib_conn *conn; - struct kib_msg *msg; - struct rdma_conn_param cp; + kib_peer_ni_t *peer_ni = (kib_peer_ni_t *)cmid->context; + kib_conn_t *conn; + kib_msg_t *msg; + struct rdma_conn_param cp; int version; __u64 incarnation; unsigned long flags; @@ -3171,7 +2951,8 @@ kiblnd_active_connect(struct rdma_cm_id *cmid) LASSERT(cmid->context == (void *)conn); LASSERT(conn->ibc_cmid == cmid); - rc = rdma_connect_locked(cmid, &cp); + + rc = rdma_connect(cmid, &cp); if (rc != 0) { CERROR("Can't connect to %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), rc); @@ -3185,9 +2966,9 @@ kiblnd_active_connect(struct rdma_cm_id *cmid) int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) { - struct kib_peer_ni *peer_ni; - struct kib_conn *conn; - int rc; + kib_peer_ni_t *peer_ni; + kib_conn_t *conn; + int rc; switch (event->event) { default: @@ -3197,14 +2978,14 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) case RDMA_CM_EVENT_CONNECT_REQUEST: /* destroy cmid on failure */ - rc = kiblnd_passive_connect(cmid, + rc = kiblnd_passive_connect(cmid, (void *)KIBLND_CONN_PARAM(event), KIBLND_CONN_PARAM_LEN(event)); CDEBUG(D_NET, "connreq: %d\n", rc); return rc; - + case RDMA_CM_EVENT_ADDR_ERROR: - peer_ni = cmid->context; + peer_ni = (kib_peer_ni_t *)cmid->context; CNETERR("%s: ADDR ERROR %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); @@ -3212,7 +2993,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return -EHOSTUNREACH; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ADDR_RESOLVED: - peer_ni = cmid->context; + peer_ni = (kib_peer_ni_t *)cmid->context; CDEBUG(D_NET,"%s Addr resolved: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); @@ -3221,12 +3002,12 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) CNETERR("Can't resolve address for %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); rc = event->status; - } else { - rc = rdma_resolve_route( - cmid, lnet_get_lnd_timeout() * 1000); + } else { + rc = rdma_resolve_route( + cmid, *kiblnd_tunables.kib_timeout * 1000); if (rc == 0) { - struct kib_net *net = peer_ni->ibp_ni->ni_data; - struct kib_dev *dev = net->ibn_dev; + 
kib_net_t *net = peer_ni->ibp_ni->ni_data; + kib_dev_t *dev = net->ibn_dev; CDEBUG(D_NET, "%s: connection bound to "\ "%s:%pI4h:%s\n", @@ -3246,7 +3027,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return rc; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ROUTE_ERROR: - peer_ni = cmid->context; + peer_ni = (kib_peer_ni_t *)cmid->context; CNETERR("%s: ROUTE ERROR %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); @@ -3254,7 +3035,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return -EHOSTUNREACH; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ROUTE_RESOLVED: - peer_ni = cmid->context; + peer_ni = (kib_peer_ni_t *)cmid->context; CDEBUG(D_NET,"%s Route resolved: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); @@ -3266,9 +3047,9 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) kiblnd_peer_connect_failed(peer_ni, 1, event->status); kiblnd_peer_decref(peer_ni); return event->status; /* rc != 0 destroys cmid */ - + case RDMA_CM_EVENT_UNREACHABLE: - conn = cmid->context; + conn = (kib_conn_t *)cmid->context; LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); CNETERR("%s: UNREACHABLE %d\n", @@ -3278,7 +3059,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_CONNECT_ERROR: - conn = cmid->context; + conn = (kib_conn_t *)cmid->context; LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); CNETERR("%s: CONNECT ERROR %d\n", @@ -3288,7 +3069,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_REJECTED: - conn = cmid->context; + conn = (kib_conn_t *)cmid->context; switch (conn->ibc_state) { default: LBUG(); @@ -3310,7 +3091,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_ESTABLISHED: - conn = cmid->context; + conn = (kib_conn_t *)cmid->context; switch (conn->ibc_state) { default: LBUG(); @@ -3337,7 +3118,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_DISCONNECTED: - conn = cmid->context; + conn = (kib_conn_t *)cmid->context; if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { CERROR("%s DISCONNECTED\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); @@ -3364,13 +3145,13 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) +kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) { - struct kib_tx *tx; + kib_tx_t *tx; struct list_head *ttmp; list_for_each(ttmp, txs) { - tx = list_entry(ttmp, struct kib_tx, tx_list); + tx = list_entry(ttmp, kib_tx_t, tx_list); if (txs != &conn->ibc_active_txs) { LASSERT(tx->tx_queued); @@ -3379,11 +3160,10 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) LASSERT(tx->tx_waiting || tx->tx_sending != 0); } - if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { - CERROR("Timed out tx: %s, %lld seconds\n", + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CERROR("Timed out tx: %s, %lu seconds\n", kiblnd_queue2str(conn, txs), - ktime_ms_delta(ktime_get(), - tx->tx_deadline) / MSEC_PER_SEC); + cfs_duration_sec(jiffies - tx->tx_deadline)); return 1; } } @@ -3392,7 +3172,7 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) } static int 
-kiblnd_conn_timed_out_locked(struct kib_conn *conn) +kiblnd_conn_timed_out_locked(kib_conn_t *conn) { return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || @@ -3409,9 +3189,9 @@ kiblnd_check_conns (int idx) struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs); struct list_head *peers = &kiblnd_data.kib_peers[idx]; struct list_head *ptmp; - struct kib_peer_ni *peer_ni; - struct kib_conn *conn; - struct kib_tx *tx, *tx_tmp; + kib_peer_ni_t *peer_ni; + kib_conn_t *conn; + kib_tx_t *tx, *tx_tmp; struct list_head *ctmp; unsigned long flags; @@ -3421,15 +3201,14 @@ kiblnd_check_conns (int idx) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); list_for_each(ptmp, peers) { - peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); /* Check tx_deadline */ list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { - if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { - CWARN("Timed out tx for %s: %lld seconds\n", + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CWARN("Timed out tx for %s: %lu seconds\n", libcfs_nid2str(peer_ni->ibp_nid), - ktime_ms_delta(ktime_get(), - tx->tx_deadline) / MSEC_PER_SEC); + cfs_duration_sec(jiffies - tx->tx_deadline)); list_move(&tx->tx_list, &timedout_txs); } } @@ -3438,7 +3217,7 @@ kiblnd_check_conns (int idx) int timedout; int sendnoop; - conn = list_entry(ctmp, struct kib_conn, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); @@ -3452,10 +3231,11 @@ kiblnd_check_conns (int idx) } if (timedout) { - CERROR("Timed out RDMA with %s (%lld): " + CERROR("Timed out RDMA with %s (%lu): " "c: %u, oc: %u, rc: %u\n", libcfs_nid2str(peer_ni->ibp_nid), - ktime_get_seconds() - peer_ni->ibp_last_alive, + cfs_duration_sec(cfs_time_current() - + peer_ni->ibp_last_alive), conn->ibc_credits, conn->ibc_outstanding_credits, conn->ibc_reserved_credits); @@ -3473,15 +3253,14 @@ kiblnd_check_conns (int idx) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); if (!list_empty(&timedout_txs)) - kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT, - LNET_MSG_STATUS_LOCAL_TIMEOUT); + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT); /* Handle timeout by closing the whole * connection. We can only be sure RDMA activity * has ceased once the QP has been modified. */ while (!list_empty(&closes)) { conn = list_entry(closes.next, - struct kib_conn, ibc_connd_list); + kib_conn_t, ibc_connd_list); list_del(&conn->ibc_connd_list); kiblnd_close_conn(conn, -ETIMEDOUT); kiblnd_conn_decref(conn); @@ -3492,7 +3271,7 @@ kiblnd_check_conns (int idx) * free to do it last time... 
*/ while (!list_empty(&checksends)) { conn = list_entry(checksends.next, - struct kib_conn, ibc_connd_list); + kib_conn_t, ibc_connd_list); list_del(&conn->ibc_connd_list); spin_lock(&conn->ibc_lock); @@ -3504,7 +3283,7 @@ kiblnd_check_conns (int idx) } static void -kiblnd_disconnect_conn(struct kib_conn *conn) +kiblnd_disconnect_conn (kib_conn_t *conn) { LASSERT (!in_interrupt()); LASSERT (current == kiblnd_data.kib_connd); @@ -3533,7 +3312,7 @@ kiblnd_connd (void *arg) spinlock_t *lock= &kiblnd_data.kib_connd_lock; wait_queue_entry_t wait; unsigned long flags; - struct kib_conn *conn; + kib_conn_t *conn; int timeout; int i; int dropped_lock; @@ -3553,10 +3332,10 @@ kiblnd_connd (void *arg) dropped_lock = 0; if (!list_empty(&kiblnd_data.kib_connd_zombies)) { - struct kib_peer_ni *peer_ni = NULL; + kib_peer_ni_t *peer_ni = NULL; conn = list_entry(kiblnd_data.kib_connd_zombies.next, - struct kib_conn, ibc_list); + kib_conn_t, ibc_list); list_del(&conn->ibc_list); if (conn->ibc_reconnect) { peer_ni = conn->ibc_peer; @@ -3566,13 +3345,11 @@ kiblnd_connd (void *arg) spin_unlock_irqrestore(lock, flags); dropped_lock = 1; - kiblnd_destroy_conn(conn); + kiblnd_destroy_conn(conn, !peer_ni); spin_lock_irqsave(lock, flags); - if (!peer_ni) { - LIBCFS_FREE(conn, sizeof(*conn)); + if (!peer_ni) continue; - } conn->ibc_peer = peer_ni; if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) @@ -3585,7 +3362,7 @@ kiblnd_connd (void *arg) if (!list_empty(&kiblnd_data.kib_connd_conns)) { conn = list_entry(kiblnd_data.kib_connd_conns.next, - struct kib_conn, ibc_list); + kib_conn_t, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); @@ -3598,8 +3375,7 @@ kiblnd_connd (void *arg) } while (reconn < KIB_RECONN_BREAK) { - if (kiblnd_data.kib_reconn_sec != - ktime_get_real_seconds()) { + if (kiblnd_data.kib_reconn_sec != ktime_get_real_seconds()) { kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); list_splice_init(&kiblnd_data.kib_reconn_wait, &kiblnd_data.kib_reconn_list); @@ -3609,7 +3385,7 @@ kiblnd_connd (void *arg) break; conn = list_entry(kiblnd_data.kib_reconn_list.next, - struct kib_conn, ibc_list); + kib_conn_t, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); @@ -3628,7 +3404,6 @@ kiblnd_connd (void *arg) const int n = 4; const int p = 1; int chunk = kiblnd_data.kib_peer_hash_size; - unsigned int lnd_timeout; spin_unlock_irqrestore(lock, flags); dropped_lock = 1; @@ -3641,11 +3416,11 @@ kiblnd_connd (void *arg) * connection within (n+1)/n times the timeout * interval. 
*/ - lnd_timeout = lnet_get_lnd_timeout(); - if (lnd_timeout > n * p) - chunk = (chunk * n * p) / lnd_timeout; - if (chunk == 0) - chunk = 1; + if (*kiblnd_tunables.kib_timeout > n * p) + chunk = (chunk * n * p) / + *kiblnd_tunables.kib_timeout; + if (chunk == 0) + chunk = 1; for (i = 0; i < chunk; i++) { kiblnd_check_conns(peer_index); @@ -3681,36 +3456,23 @@ kiblnd_connd (void *arg) void kiblnd_qp_event(struct ib_event *event, void *arg) { - struct kib_conn *conn = arg; + kib_conn_t *conn = arg; - switch (event->event) { - case IB_EVENT_COMM_EST: - CDEBUG(D_NET, "%s established\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* We received a packet but connection isn't established * probably handshake packet was lost, so free to * force make connection established */ rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); - return; - - case IB_EVENT_PORT_ERR: - case IB_EVENT_DEVICE_FATAL: - CERROR("Fatal device error for NI %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); - atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); - return; - - case IB_EVENT_PORT_ACTIVE: - CERROR("Port reactivated for NI %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); - atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); - return; + return; - default: - CERROR("%s: Async QP event type %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); - return; - } + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } } static void @@ -3756,9 +3518,9 @@ kiblnd_cq_completion(struct ib_cq *cq, void *arg) * consuming my CQ I could be called after all completions have * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 * and this CQ is about to be destroyed so I NOOP. */ - struct kib_conn *conn = arg; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; + kib_conn_t *conn = (kib_conn_t *)arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; LASSERT(cq == conn->ibc_cq); @@ -3783,7 +3545,7 @@ kiblnd_cq_completion(struct ib_cq *cq, void *arg) void kiblnd_cq_event(struct ib_event *event, void *arg) { - struct kib_conn *conn = arg; + kib_conn_t *conn = arg; CERROR("%s: async CQ event type %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); @@ -3794,7 +3556,7 @@ kiblnd_scheduler(void *arg) { long id = (long)arg; struct kib_sched_info *sched; - struct kib_conn *conn; + kib_conn_t *conn; wait_queue_entry_t wait; unsigned long flags; struct ib_wc wc; @@ -3832,7 +3594,7 @@ kiblnd_scheduler(void *arg) if (!list_empty(&sched->ibs_conns)) { conn = list_entry(sched->ibs_conns.next, - struct kib_conn, ibc_sched_list); + kib_conn_t, ibc_sched_list); /* take over kib_sched_conns' ref on conn... 
*/ LASSERT(conn->ibc_scheduled); list_del(&conn->ibc_sched_list); @@ -3936,7 +3698,7 @@ int kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_dev *dev; + kib_dev_t *dev; struct net *ns = arg; wait_queue_entry_t wait; unsigned long flags; @@ -3955,7 +3717,8 @@ kiblnd_failover_thread(void *arg) list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, ibd_fail_list) { - if (ktime_get_seconds() < dev->ibd_next_failover) + if (cfs_time_before(cfs_time_current(), + dev->ibd_next_failover)) continue; do_failover = 1; break; @@ -3973,13 +3736,13 @@ kiblnd_failover_thread(void *arg) LASSERT (dev->ibd_failover); dev->ibd_failover = 0; if (rc >= 0) { /* Device is OK or failover succeed */ - dev->ibd_next_failover = ktime_get_seconds() + 3; + dev->ibd_next_failover = cfs_time_shift(3); continue; } /* failed to failover, retry later */ - dev->ibd_next_failover = ktime_get_seconds() + - min(dev->ibd_failed_failover, 10); + dev->ibd_next_failover = + cfs_time_shift(min(dev->ibd_failed_failover, 10)); if (kiblnd_dev_can_failover(dev)) { list_add_tail(&dev->ibd_fail_list, &kiblnd_data.kib_failed_devs); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 39f9a620d04a4..72cb50ecd14f5 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,7 +82,7 @@ static int peer_buffer_credits = 0; module_param(peer_buffer_credits, int, 0444); MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); -static int peer_timeout = DEFAULT_PEER_TIMEOUT; +static int peer_timeout = 180; module_param(peer_timeout, int, 0444); MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); @@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); static int retry_count = 5; module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); +MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); static int rnr_retry_count = 6; module_param(rnr_retry_count, int, 0644); @@ -110,46 +110,16 @@ static int concurrent_sends; module_param(concurrent_sends, int, 0444); MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); -static int use_fastreg_gaps; -module_param(use_fastreg_gaps, int, 0444); -MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop"); - -/* - * map_on_demand is a flag used to determine if we can use FMR or FastReg. - * This is applicable for kernels which support global memory regions. For - * later kernels this flag is always enabled, since we will always either - * use FMR or FastReg - * For kernels which support global memory regions map_on_demand defaults - * to 0 which means we will be using global memory regions exclusively. - * If it is set to a value other than 0, then we will behave as follows: - * 1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS - * 2. Create FMR/FastReg pools - * 3. Negotiate the supported number of fragments per connection - * 4. 
Attempt to transmit using global memory regions only if - * map-on-demand is not turned on, otherwise use FMR or FastReg - * 5. In case of transmitting tx with GAPS over FMR we will need to - * transmit it with multiple fragments. Look at the comments in - * kiblnd_fmr_map_tx() for an explanation of the behavior. - * - * For later kernels we default map_on_demand to 1 and not allow - * it to be set to 0, since there is no longer support for global memory - * regions. Behavior: - * 1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS - * 2. Create FMR/FastReg pools - * 3. Negotiate the supported number of fragments per connection - * 4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of - * the behavior when transmit with GAPS verses contiguous. - */ #ifdef HAVE_IB_GET_DMA_MR #define IBLND_DEFAULT_MAP_ON_DEMAND 0 -#define MOD_STR "map on demand" +#define IBLND_MIN_MAP_ON_DEMAND 0 #else -#define IBLND_DEFAULT_MAP_ON_DEMAND 1 -#define MOD_STR "map on demand (obsolete)" +#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS +#define IBLND_MIN_MAP_ON_DEMAND 1 #endif static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; module_param(map_on_demand, int, 0444); -MODULE_PARM_DESC(map_on_demand, MOD_STR); +MODULE_PARM_DESC(map_on_demand, "map on demand"); /* NB: this value is shared by all CPTs, it can grow at runtime */ static int fmr_pool_size = 512; @@ -186,7 +156,7 @@ static unsigned int wrq_sge = 2; module_param(wrq_sge, uint, 0444); MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); -struct kib_tunables kiblnd_tunables = { +kib_tunables_t kiblnd_tunables = { .kib_dev_failover = &dev_failover, .kib_service = &service, .kib_cksum = &cksum, @@ -200,7 +170,6 @@ struct kib_tunables kiblnd_tunables = { .kib_use_priv_port = &use_privileged_port, .kib_nscheds = &nscheds, .kib_wrq_sge = &wrq_sge, - .kib_use_fastreg_gaps = &use_fastreg_gaps, }; static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; @@ -267,15 +236,6 @@ kiblnd_tunables_setup(struct lnet_ni *ni) net_tunables->lct_peer_tx_credits = net_tunables->lct_max_tx_credits; -#ifndef HAVE_IB_GET_DMA_MR - /* - * For kernels which do not support global memory regions, always - * enable map_on_demand - */ - if (tunables->lnd_map_on_demand == 0) - tunables->lnd_map_on_demand = 1; -#endif - if (!tunables->lnd_peercredits_hiw) tunables->lnd_peercredits_hiw = peer_credits_hiw; @@ -285,8 +245,30 @@ kiblnd_tunables_setup(struct lnet_ni *ni) if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; - if (tunables->lnd_concurrent_sends == 0) - tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits; + if (tunables->lnd_map_on_demand < IBLND_MIN_MAP_ON_DEMAND || + tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { + /* Use the default */ + CWARN("Invalid map_on_demand (%d), expects %d - %d. 
Using default of %d\n", + tunables->lnd_map_on_demand, IBLND_MIN_MAP_ON_DEMAND, + IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND); + tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; + } + + if (tunables->lnd_map_on_demand == 1) { + /* don't make sense to create map if only one fragment */ + tunables->lnd_map_on_demand = 2; + } + + if (tunables->lnd_concurrent_sends == 0) { + if (tunables->lnd_map_on_demand > 0 && + tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { + tunables->lnd_concurrent_sends = + net_tunables->lct_peer_tx_credits * 2; + } else { + tunables->lnd_concurrent_sends = + net_tunables->lct_peer_tx_credits; + } + } if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; @@ -321,7 +303,7 @@ int kiblnd_tunables_init(void) { default_tunables.lnd_version = CURRENT_LND_VERSION; - default_tunables.lnd_peercredits_hiw = peer_credits_hiw; + default_tunables.lnd_peercredits_hiw = peer_credits_hiw, default_tunables.lnd_map_on_demand = map_on_demand; default_tunables.lnd_concurrent_sends = concurrent_sends; default_tunables.lnd_fmr_pool_size = fmr_pool_size; diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c index 9b199e3ab541a..d0b8756143580 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,17 +41,17 @@ #include static struct lnet_lnd the_ksocklnd; -struct ksock_nal_data ksocknal_data; +ksock_nal_data_t ksocknal_data; -static struct ksock_interface * +static ksock_interface_t * ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) { - struct ksock_net *net = ni->ni_data; + ksock_net_t *net = ni->ni_data; int i; - struct ksock_interface *iface; + ksock_interface_t *iface; for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(i < LNET_INTERFACES_NUM); + LASSERT(i < LNET_NUM_INTERFACES); iface = &net->ksnn_interfaces[i]; if (iface->ksni_ipaddr == ip) @@ -61,10 +61,10 @@ ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) return NULL; } -static struct ksock_route * -ksocknal_create_route(__u32 ipaddr, int port) +static ksock_route_t * +ksocknal_create_route (__u32 ipaddr, int port) { - struct ksock_route *route; + ksock_route_t *route; LIBCFS_ALLOC (route, sizeof (*route)); if (route == NULL) @@ -86,7 +86,7 @@ ksocknal_create_route(__u32 ipaddr, int port) } void -ksocknal_destroy_route(struct ksock_route *route) +ksocknal_destroy_route (ksock_route_t *route) { LASSERT (atomic_read(&route->ksnr_refcount) == 0); @@ -97,12 +97,12 @@ ksocknal_destroy_route(struct ksock_route *route) } static int -ksocknal_create_peer(struct ksock_peer_ni **peerp, struct lnet_ni *ni, +ksocknal_create_peer(ksock_peer_ni_t **peerp, struct lnet_ni *ni, struct lnet_process_id id) { - int cpt = lnet_cpt_of_nid(id.nid, ni); - struct ksock_net *net = ni->ni_data; - struct ksock_peer_ni *peer_ni; + int cpt = lnet_cpt_of_nid(id.nid, ni); + ksock_net_t *net = ni->ni_data; + ksock_peer_ni_t *peer_ni; LASSERT(id.nid != LNET_NID_ANY); LASSERT(id.pid != LNET_PID_ANY); @@ -146,9 +146,9 @@ ksocknal_create_peer(struct ksock_peer_ni **peerp, struct lnet_ni *ni, } void 
-ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) +ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni) { - struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + ksock_net_t *net = peer_ni->ksnp_ni->ni_data; CDEBUG (D_NET, "peer_ni %s %p deleted\n", libcfs_id2str(peer_ni->ksnp_id), peer_ni); @@ -171,15 +171,16 @@ ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) spin_unlock_bh(&net->ksnn_lock); } -struct ksock_peer_ni * +ksock_peer_ni_t * ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) { struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); struct list_head *tmp; - struct ksock_peer_ni *peer_ni; + ksock_peer_ni_t *peer_ni; list_for_each(tmp, peer_list) { - peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); + + peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); LASSERT(!peer_ni->ksnp_closing); @@ -198,10 +199,10 @@ ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) return NULL; } -struct ksock_peer_ni * +ksock_peer_ni_t * ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) { - struct ksock_peer_ni *peer_ni; + ksock_peer_ni_t *peer_ni; read_lock(&ksocknal_data.ksnd_global_lock); peer_ni = ksocknal_find_peer_locked(ni, id); @@ -213,14 +214,14 @@ ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) } static void -ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni) +ksocknal_unlink_peer_locked(ksock_peer_ni_t *peer_ni) { int i; __u32 ip; - struct ksock_interface *iface; + ksock_interface_t *iface; for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { - LASSERT(i < LNET_INTERFACES_NUM); + LASSERT(i < LNET_NUM_INTERFACES); ip = peer_ni->ksnp_passive_ips[i]; iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); @@ -249,19 +250,19 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, int *port, int *conn_count, int *share_count) { - struct ksock_peer_ni *peer_ni; - struct list_head *ptmp; - struct ksock_route *route; - struct list_head *rtmp; - int i; - int j; - int rc = -ENOENT; + ksock_peer_ni_t *peer_ni; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -299,7 +300,7 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, if (index-- > 0) continue; - route = list_entry(rtmp, struct ksock_route, + route = list_entry(rtmp, ksock_route_t, ksnr_list); *id = peer_ni->ksnp_id; @@ -319,11 +320,11 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, } static void -ksocknal_associate_route_conn_locked(struct ksock_route *route, struct ksock_conn *conn) +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) { - struct ksock_peer_ni *peer_ni = route->ksnr_peer; - int type = conn->ksnc_type; - struct ksock_interface *iface; + ksock_peer_ni_t *peer_ni = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; conn->ksnc_route = route; ksocknal_route_addref(route); @@ -363,11 +364,11 @@ ksocknal_associate_route_conn_locked(struct ksock_route *route, struct ksock_con } static void -ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *route) +ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t 
*route) { struct list_head *tmp; - struct ksock_conn *conn; - struct ksock_route *route2; + ksock_conn_t *conn; + ksock_route_t *route2; LASSERT(!peer_ni->ksnp_closing); LASSERT(route->ksnr_peer == NULL); @@ -377,7 +378,7 @@ ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *rou /* LASSERT(unique) */ list_for_each(tmp, &peer_ni->ksnp_routes) { - route2 = list_entry(tmp, struct ksock_route, ksnr_list); + route2 = list_entry(tmp, ksock_route_t, ksnr_list); if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { CERROR("Duplicate route %s %pI4h\n", @@ -393,7 +394,7 @@ ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *rou list_add_tail(&route->ksnr_list, &peer_ni->ksnp_routes); list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); + conn = list_entry(tmp, ksock_conn_t, ksnc_list); if (conn->ksnc_ipaddr != route->ksnr_ipaddr) continue; @@ -404,19 +405,19 @@ ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *rou } static void -ksocknal_del_route_locked(struct ksock_route *route) +ksocknal_del_route_locked (ksock_route_t *route) { - struct ksock_peer_ni *peer_ni = route->ksnr_peer; - struct ksock_interface *iface; - struct ksock_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; + ksock_peer_ni_t *peer_ni = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; LASSERT(!route->ksnr_deleted); /* Close associated conns */ list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); if (conn->ksnc_route != route) continue; @@ -448,11 +449,11 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, int port) { struct list_head *tmp; - struct ksock_peer_ni *peer_ni; - struct ksock_peer_ni *peer2; - struct ksock_route *route; - struct ksock_route *route2; - int rc; + ksock_peer_ni_t *peer_ni; + ksock_peer_ni_t *peer2; + ksock_route_t *route; + ksock_route_t *route2; + int rc; if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY) @@ -472,7 +473,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, write_lock_bh(&ksocknal_data.ksnd_global_lock); /* always called with a ref on ni, so shutdown can't have started */ - LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); peer2 = ksocknal_find_peer_locked(ni, id); if (peer2 != NULL) { @@ -486,7 +487,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, route2 = NULL; list_for_each(tmp, &peer_ni->ksnp_routes) { - route2 = list_entry(tmp, struct ksock_route, ksnr_list); + route2 = list_entry(tmp, ksock_route_t, ksnr_list); if (route2->ksnr_ipaddr == ipaddr) break; @@ -507,13 +508,13 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, } static void -ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) +ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) { - struct ksock_conn *conn; - struct ksock_route *route; + ksock_conn_t *conn; + ksock_route_t *route; struct list_head *tmp; struct list_head *nxt; - int nshared; + int nshared; LASSERT(!peer_ni->ksnp_closing); @@ -521,7 +522,7 @@ ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) ksocknal_peer_addref(peer_ni); list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, 
struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); /* no match */ if (!(ip == 0 || route->ksnr_ipaddr == ip)) @@ -534,7 +535,7 @@ ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) nshared = 0; list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); nshared += route->ksnr_share_count; } @@ -543,7 +544,7 @@ ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) * left */ list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); /* we should only be removing auto-entries */ LASSERT(route->ksnr_share_count == 0); @@ -551,27 +552,27 @@ ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); + conn = list_entry(tmp, ksock_conn_t, ksnc_list); ksocknal_close_conn_locked(conn, 0); } } ksocknal_peer_decref(peer_ni); - /* NB peer_ni unlinks itself when last conn/route is removed */ + /* NB peer_ni unlinks itself when last conn/route is removed */ } static int ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) { - struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *ptmp; struct list_head *pnxt; - struct ksock_peer_ni *peer_ni; - int lo; - int hi; - int i; - int rc = -ENOENT; + ksock_peer_ni_t *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -587,7 +588,7 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -624,20 +625,20 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) return rc; } -static struct ksock_conn * +static ksock_conn_t * ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) { - struct ksock_peer_ni *peer_ni; + ksock_peer_ni_t *peer_ni; struct list_head *ptmp; - struct ksock_conn *conn; + ksock_conn_t *conn; struct list_head *ctmp; - int i; + int i; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); LASSERT(!peer_ni->ksnp_closing); @@ -648,7 +649,7 @@ ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) if (index-- > 0) continue; - conn = list_entry(ctmp, struct ksock_conn, + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); ksocknal_conn_addref(conn); read_unlock(&ksocknal_data. 
\ @@ -662,37 +663,50 @@ ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) return NULL; } -static struct ksock_sched * +static ksock_sched_t * ksocknal_choose_scheduler_locked(unsigned int cpt) { - struct ksock_sched *sched = ksocknal_data.ksnd_schedulers[cpt]; - int i; + struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; + ksock_sched_t *sched; + int i; - if (sched->kss_nthreads == 0) { - cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { - if (sched->kss_nthreads > 0) { + if (info->ksi_nthreads == 0) { + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_nthreads > 0) { CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n", - cpt, sched->kss_cpt); - return sched; + cpt, info->ksi_cpt); + goto select_sched; } } return NULL; } +select_sched: + sched = &info->ksi_scheds[0]; + /* + * NB: it's safe so far, but info->ksi_nthreads could be changed + * at runtime when we have dynamic LNet configuration, then we + * need to take care of this. + */ + for (i = 1; i < info->ksi_nthreads; i++) { + if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) + sched = &info->ksi_scheds[i]; + } + return sched; } static int ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) { - struct ksock_net *net = ni->ni_data; + ksock_net_t *net = ni->ni_data; int i; int nip; read_lock(&ksocknal_data.ksnd_global_lock); nip = net->ksnn_ninterfaces; - LASSERT(nip <= LNET_INTERFACES_NUM); + LASSERT(nip <= LNET_NUM_INTERFACES); /* * Only offer interfaces for additional connections if I have @@ -713,14 +727,14 @@ ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) } static int -ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) +ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) { - int best_netmatch = 0; - int best_xor = 0; - int best = -1; - int this_xor; - int this_netmatch; - int i; + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; for (i = 0; i < nips; i++) { if (ips[i] == 0) @@ -745,21 +759,21 @@ ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) } static int -ksocknal_select_ips(struct ksock_peer_ni *peer_ni, __u32 *peerips, int n_peerips) +ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct ksock_net *net = peer_ni->ksnp_ni->ni_data; - struct ksock_interface *iface; - struct ksock_interface *best_iface; - int n_ips; - int i; - int j; - int k; - u32 ip; - u32 xor; - int this_netmatch; - int best_netmatch; - int best_npeers; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer_ni->ksnp_ni->ni_data; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; /* CAVEAT EMPTOR: We do all our interface matching with an * exclusive hold of global lock at IRQ priority. 
We're only @@ -771,8 +785,8 @@ ksocknal_select_ips(struct ksock_peer_ni *peer_ni, __u32 *peerips, int n_peerips write_lock_bh(global_lock); - LASSERT(n_peerips <= LNET_INTERFACES_NUM); - LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); + LASSERT(n_peerips <= LNET_NUM_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); /* Only match interfaces for additional connections * if I have > 1 interface */ @@ -851,17 +865,17 @@ ksocknal_select_ips(struct ksock_peer_ni *peer_ni, __u32 *peerips, int n_peerips } static void -ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, +ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, __u32 *peer_ipaddrs, int npeer_ipaddrs) { - struct ksock_route *newroute = NULL; + ksock_route_t *newroute = NULL; rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; struct lnet_ni *ni = peer_ni->ksnp_ni; - struct ksock_net *net = ni->ni_data; + ksock_net_t *net = ni->ni_data; struct list_head *rtmp; - struct ksock_route *route; - struct ksock_interface *iface; - struct ksock_interface *best_iface; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; int best_netmatch; int this_netmatch; int best_nroutes; @@ -882,7 +896,7 @@ ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, return; } - LASSERT(npeer_ipaddrs <= LNET_INTERFACES_NUM); + LASSERT(npeer_ipaddrs <= LNET_NUM_INTERFACES); for (i = 0; i < npeer_ipaddrs; i++) { if (newroute != NULL) { @@ -905,7 +919,7 @@ ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, /* Already got a route? */ route = NULL; list_for_each(rtmp, &peer_ni->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, ksnr_list); + route = list_entry(rtmp, ksock_route_t, ksnr_list); if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) break; @@ -919,7 +933,7 @@ ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, best_nroutes = 0; best_netmatch = 0; - LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); + LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); /* Select interface to connect from */ for (j = 0; j < net->ksnn_ninterfaces; j++) { @@ -927,7 +941,7 @@ ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, /* Using this interface already? 
*/ list_for_each(rtmp, &peer_ni->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, + route = list_entry(rtmp, ksock_route_t, ksnr_list); if (route->ksnr_myipaddr == iface->ksni_ipaddr) @@ -971,10 +985,10 @@ ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, int ksocknal_accept(struct lnet_ni *ni, struct socket *sock) { - struct ksock_connreq *cr; - int rc; - u32 peer_ip; - int peer_port; + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); LASSERT(rc == 0); /* we succeeded before */ @@ -1000,9 +1014,9 @@ ksocknal_accept(struct lnet_ni *ni, struct socket *sock) } static int -ksocknal_connecting(struct ksock_peer_ni *peer_ni, __u32 ipaddr) +ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr) { - struct ksock_route *route; + ksock_route_t *route; list_for_each_entry(route, &peer_ni->ksnp_routes, ksnr_list) { if (route->ksnr_ipaddr == ipaddr) @@ -1012,27 +1026,27 @@ ksocknal_connecting(struct ksock_peer_ni *peer_ni, __u32 ipaddr) } int -ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, +ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, struct socket *sock, int type) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct list_head zombies = LIST_HEAD_INIT(zombies); + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct list_head zombies = LIST_HEAD_INIT(zombies); struct lnet_process_id peerid; - struct list_head *tmp; - u64 incarnation; - struct ksock_conn *conn; - struct ksock_conn *conn2; - struct ksock_peer_ni *peer_ni = NULL; - struct ksock_peer_ni *peer2; - struct ksock_sched *sched; + struct list_head *tmp; + __u64 incarnation; + ksock_conn_t *conn; + ksock_conn_t *conn2; + ksock_peer_ni_t *peer_ni = NULL; + ksock_peer_ni_t *peer2; + ksock_sched_t *sched; struct ksock_hello_msg *hello; - int cpt; - struct ksock_tx *tx; - struct ksock_tx *txtmp; - int rc; - int rc2; - int active; - char *warn = NULL; + int cpt; + ksock_tx_t *tx; + ksock_tx_t *txtmp; + int rc; + int rc2; + int active; + char *warn = NULL; active = (route != NULL); @@ -1064,7 +1078,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, atomic_set (&conn->ksnc_tx_nob, 0); LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_INTERFACES_NUM])); + kshm_ips[LNET_NUM_INTERFACES])); if (hello == NULL) { rc = -ENOMEM; goto failed_1; @@ -1134,7 +1148,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, write_lock_bh(global_lock); /* called with a ref on ni, so shutdown can't have started */ - LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); peer2 = ksocknal_find_peer_locked(ni, peerid); if (peer2 == NULL) { @@ -1210,7 +1224,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, * loopback connection */ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { list_for_each(tmp, &peer_ni->ksnp_conns) { - conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || conn2->ksnc_myipaddr != conn->ksnc_myipaddr || @@ -1244,7 +1258,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, * by routes in my peer_ni to match my own route entries so I don't * continually create duplicate routes. 
*/ list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); if (route->ksnr_ipaddr != conn->ksnc_ipaddr) continue; @@ -1254,7 +1268,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, } conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ - peer_ni->ksnp_last_alive = ktime_get_seconds(); + peer_ni->ksnp_last_alive = cfs_time_current(); peer_ni->ksnp_send_keepalive = 0; peer_ni->ksnp_error = 0; @@ -1267,15 +1281,14 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, * The cpt might have changed if we ended up selecting a non cpt * native scheduler. So use the scheduler's cpt instead. */ - cpt = sched->kss_cpt; + cpt = sched->kss_info->ksi_cpt; sched->kss_nconns++; conn->ksnc_scheduler = sched; - conn->ksnc_tx_last_post = ktime_get_seconds(); + conn->ksnc_tx_last_post = cfs_time_current(); /* Set the deadline for the outgoing HELLO to drain */ conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); smp_mb(); /* order with adding to peer_ni's conn list */ list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); @@ -1306,10 +1319,11 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, */ CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d" - " incarnation:%lld sched[%d]\n", + " incarnation:%lld sched[%d:%d]\n", libcfs_id2str(peerid), conn->ksnc_proto->pro_version, &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, - conn->ksnc_port, incarnation, cpt); + conn->ksnc_port, incarnation, cpt, + (int)(sched - &sched->kss_info->ksi_scheds[0])); if (active) { /* additional routes after interface exchange? */ @@ -1322,7 +1336,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, } LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_INTERFACES_NUM])); + kshm_ips[LNET_NUM_INTERFACES])); /* setup the socket AFTER I've received hello (it disables * SO_LINGER). I might call back to the acceptor who may want @@ -1406,7 +1420,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, failed_1: if (hello != NULL) LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_INTERFACES_NUM])); + kshm_ips[LNET_NUM_INTERFACES])); LIBCFS_FREE(conn, sizeof(*conn)); @@ -1416,15 +1430,15 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, } void -ksocknal_close_conn_locked(struct ksock_conn *conn, int error) +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. 
* Caller holds ksnd_global_lock exclusively in irq context */ - struct ksock_peer_ni *peer_ni = conn->ksnc_peer; - struct ksock_route *route; - struct ksock_conn *conn2; - struct list_head *tmp; + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_route_t *route; + ksock_conn_t *conn2; + struct list_head *tmp; LASSERT(peer_ni->ksnp_error == 0); LASSERT(!conn->ksnc_closing); @@ -1441,7 +1455,7 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error) conn2 = NULL; list_for_each(tmp, &peer_ni->ksnp_conns) { - conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); if (conn2->ksnc_route == route && conn2->ksnc_type == conn->ksnc_type) @@ -1461,7 +1475,7 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error) /* No more connections to this peer_ni */ if (!list_empty(&peer_ni->ksnp_tx_queue)) { - struct ksock_tx *tx; + ksock_tx_t *tx; LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); @@ -1499,10 +1513,10 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error) } void -ksocknal_peer_failed(struct ksock_peer_ni *peer_ni) +ksocknal_peer_failed (ksock_peer_ni_t *peer_ni) { - int notify = 0; - time64_t last_alive = 0; + int notify = 0; + cfs_time_t last_alive = 0; /* There has been a connection failure or comms error; but I'll only * tell LNET I think the peer_ni is dead if it's to another kernel and @@ -1526,12 +1540,12 @@ ksocknal_peer_failed(struct ksock_peer_ni *peer_ni) } void -ksocknal_finalize_zcreq(struct ksock_conn *conn) +ksocknal_finalize_zcreq(ksock_conn_t *conn) { - struct ksock_peer_ni *peer_ni = conn->ksnc_peer; - struct ksock_tx *tx; - struct ksock_tx *tmp; - struct list_head zlist = LIST_HEAD_INIT(zlist); + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); /* NB safe to finalize TXs because closing of socket will * abort all buffered data */ @@ -1554,7 +1568,7 @@ ksocknal_finalize_zcreq(struct ksock_conn *conn) spin_unlock(&peer_ni->ksnp_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); list_del(&tx->tx_zc_list); ksocknal_tx_decref(tx); @@ -1562,15 +1576,15 @@ ksocknal_finalize_zcreq(struct ksock_conn *conn) } void -ksocknal_terminate_conn(struct ksock_conn *conn) +ksocknal_terminate_conn(ksock_conn_t *conn) { /* This gets called by the reaper (guaranteed thread context) to * disengage the socket from its callbacks and close it. * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. 
*/ - struct ksock_peer_ni *peer_ni = conn->ksnc_peer; - struct ksock_sched *sched = conn->ksnc_scheduler; - int failed = 0; + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; LASSERT(conn->ksnc_closing); @@ -1623,9 +1637,10 @@ ksocknal_terminate_conn(struct ksock_conn *conn) } void -ksocknal_queue_zombie_conn(struct ksock_conn *conn) +ksocknal_queue_zombie_conn (ksock_conn_t *conn) { /* Queue the conn for the reaper to destroy */ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -1636,9 +1651,9 @@ ksocknal_queue_zombie_conn(struct ksock_conn *conn) } void -ksocknal_destroy_conn(struct ksock_conn *conn) +ksocknal_destroy_conn (ksock_conn_t *conn) { - time64_t last_rcv; + cfs_time_t last_rcv; /* Final coup-de-grace of the reaper */ CDEBUG (D_NET, "connection %p\n", conn); @@ -1655,18 +1670,16 @@ ksocknal_destroy_conn(struct ksock_conn *conn) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_LNET_PAYLOAD: last_rcv = conn->ksnc_rx_deadline - - lnet_get_lnd_timeout(); + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); CERROR("Completing partial receive from %s[%d], " "ip %pI4h:%d, with error, wanted: %d, left: %d, " - "last alive is %lld secs ago\n", + "last alive is %ld secs ago\n", libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, &conn->ksnc_ipaddr, conn->ksnc_port, conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, - ktime_get_seconds() - last_rcv); - if (conn->ksnc_lnet_msg) - conn->ksnc_lnet_msg->msg_health_status = - LNET_MSG_STATUS_REMOTE_ERROR; - lnet_finalize(conn->ksnc_lnet_msg, -EIO); + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize(conn->ksnc_cookie, -EIO); break; case SOCKNAL_RX_LNET_HEADER: if (conn->ksnc_rx_started) @@ -1702,15 +1715,15 @@ ksocknal_destroy_conn(struct ksock_conn *conn) } int -ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr, int why) +ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int why) { - struct ksock_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); if (ipaddr == 0 || conn->ksnc_ipaddr == ipaddr) { @@ -1723,11 +1736,11 @@ ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr, in } int -ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) +ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) { - struct ksock_peer_ni *peer_ni = conn->ksnc_peer; - u32 ipaddr = conn->ksnc_ipaddr; - int count; + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + int count; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1741,13 +1754,13 @@ ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) { - struct ksock_peer_ni *peer_ni; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; + ksock_peer_ni_t *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1761,7 +1774,7 @@ ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) for (i = lo; i <= hi; i++) { 
list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); if (!((id.nid == LNET_NID_ANY || id.nid == peer_ni->ksnp_id.nid) && (id.pid == LNET_PID_ANY || id.pid == peer_ni->ksnp_id.pid))) @@ -1805,12 +1818,12 @@ ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive) } void -ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) +ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) { int connect = 1; - time64_t last_alive = 0; - time64_t now = ktime_get_seconds(); - struct ksock_peer_ni *peer_ni = NULL; + cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); + ksock_peer_ni_t *peer_ni = NULL; rwlock_t *glock = &ksocknal_data.ksnd_global_lock; struct lnet_process_id id = { .nid = nid, @@ -1819,20 +1832,20 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) read_lock(glock); - peer_ni = ksocknal_find_peer_locked(ni, id); - if (peer_ni != NULL) { - struct list_head *tmp; - struct ksock_conn *conn; - int bufnob; + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + struct list_head *tmp; + ksock_conn_t *conn; + int bufnob; list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); + conn = list_entry(tmp, ksock_conn_t, ksnc_list); bufnob = conn->ksnc_sock->sk->sk_wmem_queued; if (bufnob < conn->ksnc_tx_bufnob) { /* something got ACKed */ - conn->ksnc_tx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); peer_ni->ksnp_last_alive = now; conn->ksnc_tx_bufnob = bufnob; } @@ -1848,9 +1861,9 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) if (last_alive != 0) *when = last_alive; - CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago, connect %d\n", + CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago, connect %d\n", libcfs_nid2str(nid), peer_ni, - last_alive ? now - last_alive : -1, + last_alive ? 
cfs_duration_sec(now - last_alive) : -1, connect); if (!connect) @@ -1869,12 +1882,12 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) } static void -ksocknal_push_peer(struct ksock_peer_ni *peer_ni) +ksocknal_push_peer (ksock_peer_ni_t *peer_ni) { - int index; - int i; - struct list_head *tmp; - struct ksock_conn *conn; + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; for (index = 0; ; index++) { read_lock(&ksocknal_data.ksnd_global_lock); @@ -1884,8 +1897,8 @@ ksocknal_push_peer(struct ksock_peer_ni *peer_ni) list_for_each(tmp, &peer_ni->ksnp_conns) { if (i++ == index) { - conn = list_entry(tmp, struct ksock_conn, - ksnc_list); + conn = list_entry(tmp, ksock_conn_t, + ksnc_list); ksocknal_conn_addref(conn); break; } @@ -1921,7 +1934,7 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) int peer_off; /* searching offset in peer_ni hash table */ for (peer_off = 0; ; peer_off++) { - struct ksock_peer_ni *peer_ni; + ksock_peer_ni_t *peer_ni; int i = 0; read_lock(&ksocknal_data.ksnd_global_lock); @@ -1953,15 +1966,15 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) static int ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) { - struct ksock_net *net = ni->ni_data; - struct ksock_interface *iface; + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; int rc; int i; int j; struct list_head *ptmp; - struct ksock_peer_ni *peer_ni; + ksock_peer_ni_t *peer_ni; struct list_head *rtmp; - struct ksock_route *route; + ksock_route_t *route; if (ipaddress == 0 || netmask == 0) @@ -1973,7 +1986,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) if (iface != NULL) { /* silently ignore dups */ rc = 0; - } else if (net->ksnn_ninterfaces == LNET_INTERFACES_NUM) { + } else if (net->ksnn_ninterfaces == LNET_NUM_INTERFACES) { rc = -ENOSPC; } else { iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; @@ -1985,7 +1998,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, struct ksock_peer_ni, + peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) @@ -1994,7 +2007,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) list_for_each(rtmp, &peer_ni->ksnp_routes) { route = list_entry(rtmp, - struct ksock_route, + ksock_route_t, ksnr_list); if (route->ksnr_myipaddr == ipaddress) @@ -2013,14 +2026,14 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) } static void -ksocknal_peer_del_interface_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr) +ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) { - struct list_head *tmp; - struct list_head *nxt; - struct ksock_route *route; - struct ksock_conn *conn; - int i; - int j; + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) if (peer_ni->ksnp_passive_ips[i] == ipaddr) { @@ -2032,7 +2045,7 @@ ksocknal_peer_del_interface_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); if (route->ksnr_myipaddr != ipaddr) continue; @@ -2046,7 +2059,7 @@ ksocknal_peer_del_interface_locked(struct 
ksock_peer_ni *peer_ni, __u32 ipaddr) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); + conn = list_entry(tmp, ksock_conn_t, ksnc_list); if (conn->ksnc_myipaddr == ipaddr) ksocknal_close_conn_locked (conn, 0); @@ -2056,14 +2069,14 @@ ksocknal_peer_del_interface_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr) static int ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) { - struct ksock_net *net = ni->ni_data; - int rc = -ENOENT; - struct list_head *tmp; - struct list_head *nxt; - struct ksock_peer_ni *peer_ni; - u32 this_ip; - int i; - int j; + ksock_net_t *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_ni_t *peer_ni; + __u32 this_ip; + int i; + int j; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -2084,9 +2097,9 @@ ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { list_for_each_safe(tmp, nxt, - &ksocknal_data.ksnd_peers[j]) { - peer_ni = list_entry(tmp, struct ksock_peer_ni, - ksnp_list); + &ksocknal_data.ksnd_peers[j]) { + peer_ni = list_entry(tmp, ksock_peer_ni_t, + ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -2110,8 +2123,8 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) switch(cmd) { case IOC_LIBCFS_GET_INTERFACE: { - struct ksock_net *net = ni->ni_data; - struct ksock_interface *iface; + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; read_lock(&ksocknal_data.ksnd_global_lock); @@ -2180,7 +2193,7 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) int txmem; int rxmem; int nagle; - struct ksock_conn *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); + ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); if (conn == NULL) return -ENOENT; @@ -2194,7 +2207,7 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) data->ioc_u32[1] = conn->ksnc_port; data->ioc_u32[2] = conn->ksnc_myipaddr; data->ioc_u32[3] = conn->ksnc_type; - data->ioc_u32[4] = conn->ksnc_scheduler->kss_cpt; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; data->ioc_u32[5] = rxmem; data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; ksocknal_conn_decref(conn); @@ -2233,8 +2246,19 @@ ksocknal_free_buffers (void) { LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); - if (ksocknal_data.ksnd_schedulers != NULL) - cfs_percpt_free(ksocknal_data.ksnd_schedulers); + if (ksocknal_data.ksnd_sched_info != NULL) { + struct ksock_sched_info *info; + int i; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds != NULL) { + LIBCFS_FREE(info->ksi_scheds, + info->ksi_nthreads_max * + sizeof(info->ksi_scheds[0])); + } + } + cfs_percpt_free(ksocknal_data.ksnd_sched_info); + } LIBCFS_FREE (ksocknal_data.ksnd_peers, sizeof(struct list_head) * @@ -2243,15 +2267,15 @@ ksocknal_free_buffers (void) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - struct list_head zlist; - struct ksock_tx *tx; + struct list_head zlist; + ksock_tx_t *tx; list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); list_del_init(&ksocknal_data.ksnd_idle_noop_txs); spin_unlock(&ksocknal_data.ksnd_tx_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, struct ksock_tx, tx_list); + tx = list_entry(zlist.next, ksock_tx_t, tx_list); list_del(&tx->tx_list); LIBCFS_FREE(tx, tx->tx_desc_size); } @@ -2263,23 +2287,26 @@ ksocknal_free_buffers (void) static void ksocknal_base_shutdown(void) 
{ - struct ksock_sched *sched; - int i; + struct ksock_sched_info *info; + ksock_sched_t *sched; + int i; + int j; CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&libcfs_kmemory)); LASSERT (ksocknal_data.ksnd_nnets == 0); - switch (ksocknal_data.ksnd_init) { - default: - LASSERT(0); - /* fallthrough */ + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + /* Fall through */ - case SOCKNAL_INIT_ALL: - case SOCKNAL_INIT_DATA: - LASSERT(ksocknal_data.ksnd_peers != NULL); - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT (ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); + } LASSERT(list_empty(&ksocknal_data.ksnd_nets)); LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); @@ -2287,14 +2314,23 @@ ksocknal_base_shutdown(void) LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); - if (ksocknal_data.ksnd_schedulers != NULL) { - cfs_percpt_for_each(sched, i, - ksocknal_data.ksnd_schedulers) { + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { - LASSERT(list_empty(&sched->kss_tx_conns)); - LASSERT(list_empty(&sched->kss_rx_conns)); - LASSERT(list_empty(&sched->kss_zombie_noop_txs)); - LASSERT(sched->kss_nconns == 0); + sched = &info->ksi_scheds[j]; + LASSERT(list_empty(&sched->\ + kss_tx_conns)); + LASSERT(list_empty(&sched->\ + kss_rx_conns)); + LASSERT(list_empty(&sched-> \ + kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } } } @@ -2303,10 +2339,17 @@ ksocknal_base_shutdown(void) wake_up_all(&ksocknal_data.ksnd_connd_waitq); wake_up_all(&ksocknal_data.ksnd_reaper_waitq); - if (ksocknal_data.ksnd_schedulers != NULL) { - cfs_percpt_for_each(sched, i, - ksocknal_data.ksnd_schedulers) + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + sched = &info->ksi_scheds[j]; wake_up_all(&sched->kss_waitq); + } + } } i = 4; @@ -2339,9 +2382,9 @@ ksocknal_base_shutdown(void) static int ksocknal_base_startup(void) { - struct ksock_sched *sched; - int rc; - int i; + struct ksock_sched_info *info; + int rc; + int i; LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); LASSERT (ksocknal_data.ksnd_nnets == 0); @@ -2381,43 +2424,50 @@ ksocknal_base_startup(void) ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; try_module_get(THIS_MODULE); - /* Create a scheduler block per available CPT */ - ksocknal_data.ksnd_schedulers = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*sched)); - if (ksocknal_data.ksnd_schedulers == NULL) + ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*info)); + if (ksocknal_data.ksnd_sched_info == NULL) goto failed; - cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { - int nthrs; + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + ksock_sched_t *sched; + int nthrs; - /* - * make sure not to allocate more threads than there are - * cores/CPUs in teh CPT - */ nthrs = cfs_cpt_weight(lnet_cpt_table(), i); if (*ksocknal_tunables.ksnd_nscheds > 0) { nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); } else { - /* - * max to half of CPUs, assume another half should be - * 
reserved for upper layer modules - */ + /* max to half of CPUs, assume another half should be + * reserved for upper layer modules */ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); } - sched->kss_nthreads_max = nthrs; - sched->kss_cpt = i; - - spin_lock_init(&sched->kss_lock); - INIT_LIST_HEAD(&sched->kss_rx_conns); - INIT_LIST_HEAD(&sched->kss_tx_conns); - INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); - init_waitqueue_head(&sched->kss_waitq); + info->ksi_nthreads_max = nthrs; + info->ksi_cpt = i; + + if (nthrs != 0) { + LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, + info->ksi_nthreads_max * + sizeof(*sched)); + if (info->ksi_scheds == NULL) + goto failed; + + for (; nthrs > 0; nthrs--) { + sched = &info->ksi_scheds[nthrs - 1]; + + sched->kss_info = info; + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + } } ksocknal_data.ksnd_connd_starting = 0; ksocknal_data.ksnd_connd_failed_stamp = 0; - ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); + ksocknal_data.ksnd_connd_starting_stamp = cfs_time_current_sec(); /* must have at least 2 connds to remain responsive to accepts while * connecting */ if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) @@ -2467,15 +2517,15 @@ ksocknal_base_startup(void) static void ksocknal_debug_peerhash(struct lnet_ni *ni) { - struct ksock_peer_ni *peer_ni = NULL; - struct list_head *tmp; - int i; + ksock_peer_ni_t *peer_ni = NULL; + struct list_head *tmp; + int i; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); + peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); if (peer_ni->ksnp_ni == ni) break; @@ -2484,8 +2534,8 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) } if (peer_ni != NULL) { - struct ksock_route *route; - struct ksock_conn *conn; + ksock_route_t *route; + ksock_conn_t *conn; CWARN ("Active peer_ni on shutdown: %s, ref %d, scnt %d, " "closing %d, accepting %d, err %d, zcookie %llu, " @@ -2498,7 +2548,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) !list_empty(&peer_ni->ksnp_zc_req_list)); list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " "del %d\n", atomic_read(&route->ksnr_refcount), route->ksnr_scheduled, route->ksnr_connecting, @@ -2506,7 +2556,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) } list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); + conn = list_entry(tmp, ksock_conn_t, ksnc_list); CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", atomic_read(&conn->ksnc_conn_refcount), atomic_read(&conn->ksnc_sock_refcount), @@ -2521,7 +2571,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) void ksocknal_shutdown(struct lnet_ni *ni) { - struct ksock_net *net = ni->ni_data; + ksock_net_t *net = ni->ni_data; struct lnet_process_id anyid = { .nid = LNET_NID_ANY, .pid = LNET_PID_ANY, @@ -2571,17 +2621,17 @@ ksocknal_shutdown(struct lnet_ni *ni) } static int -ksocknal_search_new_ipif(struct ksock_net *net) +ksocknal_search_new_ipif(ksock_net_t *net) { - int new_ipif = 0; - int i; + int new_ipif = 0; + int i; for (i = 0; i < net->ksnn_ninterfaces; i++) { - char *ifnam = 
&net->ksnn_interfaces[i].ksni_name[0]; - char *colon = strchr(ifnam, ':'); - int found = 0; - struct ksock_net *tmp; - int j; + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + ksock_net_t *tmp; + int j; if (colon != NULL) /* ignore alias device */ *colon = 0; @@ -2613,35 +2663,36 @@ ksocknal_search_new_ipif(struct ksock_net *net) } static int -ksocknal_start_schedulers(struct ksock_sched *sched) +ksocknal_start_schedulers(struct ksock_sched_info *info) { int nthrs; int rc = 0; int i; - if (sched->kss_nthreads == 0) { + if (info->ksi_nthreads == 0) { if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = sched->kss_nthreads_max; + nthrs = info->ksi_nthreads_max; } else { nthrs = cfs_cpt_weight(lnet_cpt_table(), - sched->kss_cpt); + info->ksi_cpt); nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); } - nthrs = min(nthrs, sched->kss_nthreads_max); + nthrs = min(nthrs, info->ksi_nthreads_max); } else { - LASSERT(sched->kss_nthreads <= sched->kss_nthreads_max); + LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); /* increase two threads if there is new interface */ - nthrs = min(2, sched->kss_nthreads_max - sched->kss_nthreads); + nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); } for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - - id = KSOCK_THREAD_ID(sched->kss_cpt, sched->kss_nthreads + i); + long id; + char name[20]; + ksock_sched_t *sched; + id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; snprintf(name, sizeof(name), "socknal_sd%02d_%02d", - sched->kss_cpt, (int)KSOCK_THREAD_SID(id)); + info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id, name); @@ -2649,35 +2700,35 @@ ksocknal_start_schedulers(struct ksock_sched *sched) continue; CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - sched->kss_cpt, (int) KSOCK_THREAD_SID(id), rc); + info->ksi_cpt, info->ksi_nthreads + i, rc); break; } - sched->kss_nthreads += i; + info->ksi_nthreads += i; return rc; } static int -ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) +ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) { - int newif = ksocknal_search_new_ipif(net); - int rc; - int i; + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) return -EINVAL; for (i = 0; i < ncpts; i++) { - struct ksock_sched *sched; + struct ksock_sched_info *info; int cpt = (cpts == NULL) ? 
i : cpts[i]; LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); - sched = ksocknal_data.ksnd_schedulers[cpt]; + info = ksocknal_data.ksnd_sched_info[cpt]; - if (!newif && sched->kss_nthreads > 0) + if (!newif && info->ksi_nthreads > 0) continue; - rc = ksocknal_start_schedulers(sched); + rc = ksocknal_start_schedulers(info); if (rc != 0) return rc; } @@ -2687,9 +2738,8 @@ ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) int ksocknal_startup(struct lnet_ni *ni) { - struct ksock_net *net; - struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; - struct ksock_interface *ksi = NULL; + ksock_net_t *net; + ksock_interface_t *ksi = NULL; struct lnet_inetdev *ifaces = NULL; int i = 0; int rc; @@ -2709,28 +2759,18 @@ ksocknal_startup(struct lnet_ni *ni) spin_lock_init(&net->ksnn_lock); net->ksnn_incarnation = ktime_get_real_ns(); ni->ni_data = net; - net_tunables = &ni->ni_net->net_tunables; - - if (net_tunables->lct_peer_timeout == -1) - net_tunables->lct_peer_timeout = + if (!ni->ni_net->net_tunables_set) { + ni->ni_net->net_tunables.lct_peer_timeout = *ksocknal_tunables.ksnd_peertimeout; - - if (net_tunables->lct_max_tx_credits == -1) - net_tunables->lct_max_tx_credits = + ni->ni_net->net_tunables.lct_max_tx_credits = *ksocknal_tunables.ksnd_credits; - - if (net_tunables->lct_peer_tx_credits == -1) - net_tunables->lct_peer_tx_credits = + ni->ni_net->net_tunables.lct_peer_tx_credits = *ksocknal_tunables.ksnd_peertxcredits; - - if (net_tunables->lct_peer_tx_credits > - net_tunables->lct_max_tx_credits) - net_tunables->lct_peer_tx_credits = - net_tunables->lct_max_tx_credits; - - if (net_tunables->lct_peer_rtr_credits == -1) - net_tunables->lct_peer_rtr_credits = + ni->ni_net->net_tunables.lct_peer_rtr_credits = *ksocknal_tunables.ksnd_peerrtrcredits; + ni->ni_net->net_tunables_set = true; + } + rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); if (rc < 0) @@ -2757,13 +2797,13 @@ ksocknal_startup(struct lnet_ni *ni) * should exist. Each IP alias should be mapped to * each 'struct net_ni'. */ - for (i = 0; i < LNET_INTERFACES_NUM; i++) { + for (i = 0; i < LNET_NUM_INTERFACES; i++) { int j; if (!ni->ni_interfaces[i]) break; - for (j = 0; j < LNET_INTERFACES_NUM; j++) { + for (j = 0; j < LNET_NUM_INTERFACES; j++) { if (i != j && ni->ni_interfaces[j] && strcmp(ni->ni_interfaces[i], ni->ni_interfaces[j]) == 0) { diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h index cbc40f7347d4d..12d6cb83ef4ac 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. * * Author: Zach Brown * Author: Peter J. 
Braam @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -51,9 +50,13 @@ #include #include +#include +#include #include #include +#include + #ifdef HAVE_TCP_SENDPAGE_USE_SOCKET # define cfs_tcp_sendpage(sk, page, offset, size, flags) \ tcp_sendpage((sk)->sk_socket, page, offset, size, flags) @@ -62,8 +65,6 @@ tcp_sendpage(sk, page, offset, size, flags) #endif /* HAVE_TCP_SENDPAGE_USE_SOCKET */ -#include - #ifndef NETIF_F_CSUM_MASK # define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM #endif @@ -75,7 +76,7 @@ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer_ni lists */ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ #define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ -#define SOCKNAL_ENOMEM_RETRY 1 /* seconds between retries */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ #define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ #define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ @@ -90,25 +91,33 @@ # define SOCKNAL_RISK_KMAP_DEADLOCK 1 #endif -/* per scheduler state */ -struct ksock_sched { - /* serialise */ - spinlock_t kss_lock; +struct ksock_sched_info; + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ /* conn waiting to be written */ - struct list_head kss_rx_conns; - struct list_head kss_tx_conns; + struct list_head kss_tx_conns; /* zombie noop tx list */ - struct list_head kss_zombie_noop_txs; - /* where scheduler sleeps */ - wait_queue_head_t kss_waitq; + struct list_head kss_zombie_noop_txs; + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ /* # connections assigned to this scheduler */ - int kss_nconns; - /* max allowed threads */ - int kss_nthreads_max; - /* number of threads */ - int kss_nthreads; - /* CPT id */ - int kss_cpt; + int kss_nconns; + struct ksock_sched_info *kss_info; /* owner of it */ +#if !SOCKNAL_SINGLE_FRAG_RX + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; +#endif +#if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX + struct kvec kss_scratch_iov[LNET_MAX_IOV]; +#endif +} ksock_sched_t; + +struct ksock_sched_info { + int ksi_nthreads_max; /* max allowed threads */ + int ksi_nthreads; /* number of threads */ + int ksi_cpt; /* CPT id */ + ksock_sched_t *ksi_scheds; /* array of schedulers */ }; #define KSOCK_CPT_SHIFT 16 @@ -116,15 +125,17 @@ struct ksock_sched { #define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) #define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) -struct ksock_interface { /* in-use interface */ +typedef struct /* in-use interface */ +{ __u32 ksni_ipaddr; /* interface's IP address */ __u32 ksni_netmask; /* interface's network mask */ int ksni_nroutes; /* # routes using (active) */ int ksni_npeers; /* # peers using (passive) */ char ksni_name[IFNAMSIZ]; /* interface name */ -}; +} ksock_interface_t; -struct ksock_tunables { +typedef struct +{ /* "stuck" socket timeout (seconds) */ int *ksnd_timeout; /* # scheduler threads in each pool while starting */ @@ -164,24 +175,26 @@ struct ksock_tunables { #if SOCKNAL_VERSION_DEBUG int *ksnd_protocol; /* protocol version */ #endif -}; +} ksock_tunables_t; -struct ksock_net { +typedef struct +{ __u64 ksnn_incarnation; /* my epoch */ spinlock_t ksnn_lock; /* serialise */ struct list_head ksnn_list; /* chain on global list */ int ksnn_npeers; /* # peers */ int ksnn_shutdown; /* shutting down? 
*/ int ksnn_ninterfaces; /* IP interfaces */ - struct ksock_interface ksnn_interfaces[LNET_INTERFACES_NUM]; -}; + ksock_interface_t ksnn_interfaces[LNET_NUM_INTERFACES]; +} ksock_net_t; /** connd timeout */ #define SOCKNAL_CONND_TIMEOUT 120 /** reserved thread for accepting & creating new connd */ #define SOCKNAL_CONND_RESV 1 -struct ksock_nal_data { +typedef struct +{ int ksnd_init; /* initialisation state */ int ksnd_nnets; /* # networks set up */ struct list_head ksnd_nets; /* list of nets */ @@ -194,7 +207,7 @@ struct ksock_nal_data { int ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ /* schedulers information */ - struct ksock_sched **ksnd_schedulers; + struct ksock_sched_info **ksnd_sched_info; atomic_t ksnd_nactive_txs; /* #active txs */ @@ -207,7 +220,7 @@ struct ksock_nal_data { /* reaper sleeps here */ wait_queue_head_t ksnd_reaper_waitq; /* when reaper will wake */ - time64_t ksnd_reaper_waketime; + cfs_time_t ksnd_reaper_waketime; /* serialise */ spinlock_t ksnd_reaper_lock; @@ -224,11 +237,11 @@ struct ksock_nal_data { /* # connds connecting */ int ksnd_connd_connecting; /** time stamp of the last failed connecting attempt */ - time64_t ksnd_connd_failed_stamp; + long ksnd_connd_failed_stamp; /** # starting connd */ unsigned ksnd_connd_starting; /** time stamp of the last starting connd */ - time64_t ksnd_connd_starting_stamp; + long ksnd_connd_starting_stamp; /** # running connd */ unsigned ksnd_connd_running; /* serialise */ @@ -238,7 +251,8 @@ struct ksock_nal_data { struct list_head ksnd_idle_noop_txs; /* serialise, g_lock unsafe */ spinlock_t ksnd_tx_lock; -}; + +} ksock_nal_data_t; #define SOCKNAL_INIT_NOTHING 0 #define SOCKNAL_INIT_DATA 1 @@ -258,7 +272,8 @@ struct ksock_peer; /* forward ref */ struct ksock_route; /* forward ref */ struct ksock_proto; /* forward ref */ -struct ksock_tx { /* transmit packet */ +typedef struct /* transmit packet */ +{ struct list_head tx_list; /* queue on conn for transmission etc */ struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ atomic_t tx_refcount; /* tx reference count */ @@ -274,10 +289,9 @@ struct ksock_tx { /* transmit packet */ lnet_kiov_t *tx_kiov; /* packet page frags */ struct ksock_conn *tx_conn; /* owning conn */ struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ - time64_t tx_deadline; /* when (in secs) tx times out */ - struct ksock_msg tx_msg; /* socklnd message buffer */ + cfs_time_t tx_deadline; /* when (in jiffies) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ int tx_desc_size; /* size of this descriptor */ - enum lnet_msg_hstatus tx_hstatus; /* health status of tx */ union { struct { struct kvec iov; /* virt hdr */ @@ -287,18 +301,18 @@ struct ksock_tx { /* transmit packet */ struct kvec iov[1]; /* virt hdr + payload */ } virt; } tx_frags; -}; +} ksock_tx_t; -#define KSOCK_NOOP_TX_SIZE ((int)offsetof(struct ksock_tx, tx_frags.paged.kiov[0])) +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) -/* network zero copy callback descriptor embedded in struct ksock_tx */ +/* network zero copy callback descriptor embedded in ksock_tx_t */ /* space for the rx frag descriptors; we either read a single contiguous * header, or up to LNET_MAX_IOV frags of payload of either type. 
*/ -union ksock_rxiovspace { - struct kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; -}; +typedef union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +} ksock_rxiovspace_t; #define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ #define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ @@ -307,16 +321,17 @@ union ksock_rxiovspace { #define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ #define SOCKNAL_RX_SLOP 6 /* skipping body */ -struct ksock_conn { - struct ksock_peer_ni *ksnc_peer; /* owning peer_ni */ - struct ksock_route *ksnc_route; /* owning route */ +typedef struct ksock_conn +{ + struct ksock_peer *ksnc_peer; /* owning peer_ni */ + struct ksock_route *ksnc_route; /* owning route */ struct list_head ksnc_list; /* stash on peer_ni's conn list */ struct socket *ksnc_sock; /* actual socket */ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ void *ksnc_saved_write_space; /* socket's original write_space() callback */ atomic_t ksnc_conn_refcount; /* conn refcount */ atomic_t ksnc_sock_refcount; /* sock refcount */ - struct ksock_sched *ksnc_scheduler; /* who schedules this connection */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ __u32 ksnc_myipaddr; /* my IP */ __u32 ksnc_ipaddr; /* peer_ni's IP */ int ksnc_port; /* peer_ni's port */ @@ -331,7 +346,7 @@ struct ksock_conn { /* where I enq waiting input or a forwarding descriptor */ struct list_head ksnc_rx_list; - time64_t ksnc_rx_deadline; /* when (in seconds) receive times out */ + cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */ __u8 ksnc_rx_started; /* started receiving a message */ __u8 ksnc_rx_ready; /* data ready to read */ __u8 ksnc_rx_scheduled;/* being progressed */ @@ -342,9 +357,9 @@ struct ksock_conn { struct kvec *ksnc_rx_iov; /* the kvec frags */ int ksnc_rx_nkiov; /* # page frags */ lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ - union ksock_rxiovspace ksnc_rx_iov_space;/* space for frag descriptors */ + ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ __u32 ksnc_rx_csum; /* partial checksum for incoming data */ - struct lnet_msg *ksnc_lnet_msg; /* rx lnet_finalize arg*/ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ struct ksock_msg ksnc_msg; /* incoming message buffer: * V2.x message takes the * whole struct @@ -358,9 +373,9 @@ struct ksock_conn { /* packets waiting to be sent */ struct list_head ksnc_tx_queue; /* next TX that can carry a LNet message or ZC-ACK */ - struct ksock_tx *ksnc_tx_carrier; - /* when (in seconds) tx times out */ - time64_t ksnc_tx_deadline; + ksock_tx_t *ksnc_tx_carrier; + /* when (in jiffies) tx times out */ + cfs_time_t ksnc_tx_deadline; /* send buffer marker */ int ksnc_tx_bufnob; /* # bytes queued */ @@ -370,16 +385,17 @@ struct ksock_conn { /* being progressed */ int ksnc_tx_scheduled; /* time stamp of the last posted TX */ - time64_t ksnc_tx_last_post; -}; + cfs_time_t ksnc_tx_last_post; +} ksock_conn_t; -struct ksock_route { +typedef struct ksock_route +{ struct list_head ksnr_list; /* chain on peer_ni route list */ struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ - struct ksock_peer_ni *ksnr_peer; /* owning peer_ni */ + struct ksock_peer *ksnr_peer; /* owning peer_ni */ atomic_t ksnr_refcount; /* # users */ - time64_t ksnr_timeout; /* when (in secs) reconnection can happen next */ - time64_t ksnr_retry_interval; /* how long between retries */ + cfs_time_t ksnr_timeout; /* when (in 
jiffies) reconnection can happen next */ + cfs_duration_t ksnr_retry_interval; /* how long between retries */ __u32 ksnr_myipaddr; /* my IP */ __u32 ksnr_ipaddr; /* IP address to connect to */ int ksnr_port; /* port to connect to */ @@ -389,13 +405,14 @@ struct ksock_route { unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ unsigned int ksnr_share_count; /* created explicitly? */ int ksnr_conn_count; /* # conns established by this route */ -}; +} ksock_route_t; #define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ -struct ksock_peer_ni { +typedef struct ksock_peer +{ struct list_head ksnp_list; /* stash on global peer_ni list */ - time64_t ksnp_last_alive;/* when (in seconds) I was last alive */ + cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ struct lnet_process_id ksnp_id; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_sharecount; /* lconf usage counter */ @@ -411,48 +428,50 @@ struct ksock_peer_ni { spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ /* zero copy requests wait for ACK */ struct list_head ksnp_zc_req_list; - time64_t ksnp_send_keepalive; /* time to send keepalive */ + cfs_time_t ksnp_send_keepalive; /* time to send keepalive */ struct lnet_ni *ksnp_ni; /* which network */ int ksnp_n_passive_ips; /* # of... */ - __u32 ksnp_passive_ips[LNET_INTERFACES_NUM]; /* preferred local interfaces */ -}; + __u32 ksnp_passive_ips[LNET_NUM_INTERFACES]; /* preferred local interfaces */ +} ksock_peer_ni_t; -struct ksock_connreq { +typedef struct ksock_connreq +{ /* stash on ksnd_connd_connreqs */ struct list_head ksncr_list; /* chosen NI */ struct lnet_ni *ksncr_ni; /* accepted socket */ struct socket *ksncr_sock; -}; +} ksock_connreq_t; -extern struct ksock_nal_data ksocknal_data; -extern struct ksock_tunables ksocknal_tunables; +extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; #define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ #define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ #define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ -struct ksock_proto { +typedef struct ksock_proto +{ int pro_version; /* version number of protocol */ - int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); /* handshake function */ - int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);/* handshake function */ - void (*pro_pack)(struct ksock_tx *); /* message pack */ + int (*pro_send_hello)(ksock_conn_t *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ void (*pro_unpack)(struct ksock_msg *); /* message unpack */ - struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); /* queue tx on the connection */ - int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); /* queue ZC ack on the connection */ - int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int); /* handle ZC request */ - int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); /* handle ZC ACK */ - int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); /* msg type matches the connection type: + ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(ksock_conn_t *, __u64, 
int); /* handle ZC request */ + int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: * return value: * return MATCH_NO : no * return MATCH_YES : matching type * return MATCH_MAY : can be backup */ -}; +} ksock_proto_t; -extern struct ksock_proto ksocknal_protocol_v1x; -extern struct ksock_proto ksocknal_protocol_v2x; -extern struct ksock_proto ksocknal_protocol_v3x; +extern ksock_proto_t ksocknal_protocol_v1x; +extern ksock_proto_t ksocknal_protocol_v2x; +extern ksock_proto_t ksocknal_protocol_v3x; #define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR #define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR @@ -494,27 +513,27 @@ ksocknal_nid2peerlist (lnet_nid_t nid) } static inline void -ksocknal_conn_addref(struct ksock_conn *conn) +ksocknal_conn_addref (ksock_conn_t *conn) { - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); atomic_inc(&conn->ksnc_conn_refcount); } -extern void ksocknal_queue_zombie_conn(struct ksock_conn *conn); -extern void ksocknal_finalize_zcreq(struct ksock_conn *conn); +extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn); +extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); static inline void -ksocknal_conn_decref(struct ksock_conn *conn) +ksocknal_conn_decref (ksock_conn_t *conn) { - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) ksocknal_queue_zombie_conn(conn); } static inline int -ksocknal_connsock_addref(struct ksock_conn *conn) +ksocknal_connsock_addref (ksock_conn_t *conn) { - int rc = -ESHUTDOWN; + int rc = -ESHUTDOWN; read_lock(&ksocknal_data.ksnd_global_lock); if (!conn->ksnc_closing) { @@ -528,9 +547,9 @@ ksocknal_connsock_addref(struct ksock_conn *conn) } static inline void -ksocknal_connsock_decref(struct ksock_conn *conn) +ksocknal_connsock_decref (ksock_conn_t *conn) { - LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { LASSERT (conn->ksnc_closing); sock_release(conn->ksnc_sock); @@ -540,55 +559,55 @@ ksocknal_connsock_decref(struct ksock_conn *conn) } static inline void -ksocknal_tx_addref(struct ksock_tx *tx) +ksocknal_tx_addref (ksock_tx_t *tx) { - LASSERT(atomic_read(&tx->tx_refcount) > 0); + LASSERT (atomic_read(&tx->tx_refcount) > 0); atomic_inc(&tx->tx_refcount); } -extern void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); -extern void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error); +extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int error); static inline void -ksocknal_tx_decref(struct ksock_tx *tx) +ksocknal_tx_decref (ksock_tx_t *tx) { - LASSERT(atomic_read(&tx->tx_refcount) > 0); + LASSERT (atomic_read(&tx->tx_refcount) > 0); if (atomic_dec_and_test(&tx->tx_refcount)) ksocknal_tx_done(NULL, tx, 0); } static inline void -ksocknal_route_addref(struct ksock_route *route) +ksocknal_route_addref (ksock_route_t *route) { - LASSERT(atomic_read(&route->ksnr_refcount) > 0); + LASSERT (atomic_read(&route->ksnr_refcount) > 0); atomic_inc(&route->ksnr_refcount); } -extern void ksocknal_destroy_route(struct ksock_route *route); +extern void ksocknal_destroy_route (ksock_route_t *route); static inline void 
-ksocknal_route_decref(struct ksock_route *route) +ksocknal_route_decref (ksock_route_t *route) { - LASSERT(atomic_read(&route->ksnr_refcount) > 0); + LASSERT (atomic_read (&route->ksnr_refcount) > 0); if (atomic_dec_and_test(&route->ksnr_refcount)) ksocknal_destroy_route (route); } static inline void -ksocknal_peer_addref(struct ksock_peer_ni *peer_ni) +ksocknal_peer_addref (ksock_peer_ni_t *peer_ni) { - LASSERT(atomic_read(&peer_ni->ksnp_refcount) > 0); + LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); atomic_inc(&peer_ni->ksnp_refcount); } -extern void ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni); +extern void ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni); static inline void -ksocknal_peer_decref(struct ksock_peer_ni *peer_ni) +ksocknal_peer_decref (ksock_peer_ni_t *peer_ni) { LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); if (atomic_dec_and_test(&peer_ni->ksnp_refcount)) - ksocknal_destroy_peer(peer_ni); + ksocknal_destroy_peer (peer_ni); } int ksocknal_startup(struct lnet_ni *ni); @@ -603,77 +622,73 @@ int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, int port); -struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, +ksock_peer_ni_t *ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id); -struct ksock_peer_ni *ksocknal_find_peer(struct lnet_ni *ni, +ksock_peer_ni_t *ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id); -extern void ksocknal_peer_failed(struct ksock_peer_ni *peer_ni); -extern int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, +extern void ksocknal_peer_failed (ksock_peer_ni_t *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, struct socket *sock, int type); -extern void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); -extern void ksocknal_terminate_conn(struct ksock_conn *conn); -extern void ksocknal_destroy_conn(struct ksock_conn *conn); -extern int ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, - __u32 ipaddr, int why); -extern int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); +extern void ksocknal_terminate_conn (ksock_conn_t *conn); +extern void ksocknal_destroy_conn (ksock_conn_t *conn); +extern int ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); -extern struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, - struct ksock_tx *tx, int nonblk); +extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, + ksock_tx_t *tx, int nonblk); -extern int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, +extern int ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, struct lnet_process_id id); -extern struct ksock_tx *ksocknal_alloc_tx(int type, int size); -extern void ksocknal_free_tx(struct ksock_tx *tx); -extern struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); -extern void ksocknal_next_tx_carrier(struct ksock_conn *conn); -extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn); +extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx (ksock_tx_t *tx); +extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); 
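/*
 * Illustrative sketch only (not a hunk of the surrounding diff): the
 * recurring change in this file is a move from second-based deadlines
 * back to jiffies-based ones using the libcfs helpers that appear in
 * these hunks (cfs_time_shift(), cfs_time_current(), cfs_time_aftereq())
 * together with the ksnd_timeout tunable declared above.  The helper
 * names below are hypothetical and exist only to show the idiom.
 */
static inline void
example_arm_deadline(cfs_time_t *deadline)
{
	/* "timeout seconds from now", expressed in jiffies */
	*deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
}

static inline int
example_deadline_expired(cfs_time_t deadline)
{
	/* true once the current jiffies count has reached the deadline */
	return cfs_time_aftereq(cfs_time_current(), deadline);
}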
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); +extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error); extern void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); -extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when); +extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); -extern void ksocknal_thread_fini(void); -extern void ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni); -extern struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni); -extern struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni); -extern int ksocknal_new_packet(struct ksock_conn *conn, int skip); -extern int ksocknal_scheduler(void *arg); -extern int ksocknal_connd(void *arg); -extern int ksocknal_reaper(void *arg); -int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, +extern void ksocknal_thread_fini (void); +extern void ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni); +extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni); +extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_connd (void *arg); +extern int ksocknal_reaper (void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, lnet_nid_t peer_nid, struct ksock_hello_msg *hello); -int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, +int ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, struct ksock_hello_msg *hello, struct lnet_process_id *id, __u64 *incarnation); -extern void ksocknal_read_callback(struct ksock_conn *conn); -extern void ksocknal_write_callback(struct ksock_conn *conn); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); -extern int ksocknal_lib_zc_capable(struct ksock_conn *conn); -extern void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); -extern void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); +extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); extern void ksocknal_lib_reset_callback(struct socket *sock, - struct ksock_conn *conn); -extern void ksocknal_lib_push_conn(struct ksock_conn *conn); -extern int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); + ksock_conn_t *conn); +extern void ksocknal_lib_push_conn(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); extern int ksocknal_lib_setup_sock(struct socket *so); -extern int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratch_iov); -extern int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratch_iov); -extern void ksocknal_lib_eager_ack(struct ksock_conn *conn); -extern int ksocknal_lib_recv_iov(struct ksock_conn *conn, - struct kvec *scratchiov); -extern int ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, - struct kvec *scratchiov); -extern int 
ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, +extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); +extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); +extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); +extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); +extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle); extern int ksocknal_tunables_init(void); -extern void ksocknal_lib_csum_tx(struct ksock_tx *tx); +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); -extern int ksocknal_lib_memory_pressure(struct ksock_conn *conn); +extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); extern int ksocknal_lib_bind_thread_to_cpu(int id); #endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c index 1da3fe51398ca..83c6a2da2f4ae 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. * * Author: Zach Brown * Author: Peter J. Braam @@ -26,10 +26,10 @@ #include "socklnd.h" -struct ksock_tx * +ksock_tx_t * ksocknal_alloc_tx(int type, int size) { - struct ksock_tx *tx = NULL; + ksock_tx_t *tx = NULL; if (type == KSOCK_MSG_NOOP) { LASSERT(size == KSOCK_NOOP_TX_SIZE); @@ -38,8 +38,8 @@ ksocknal_alloc_tx(int type, int size) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, - struct ksock_tx, tx_list); + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ + next, ksock_tx_t, tx_list); LASSERT(tx->tx_desc_size == size); list_del(&tx->tx_list); } @@ -57,7 +57,6 @@ ksocknal_alloc_tx(int type, int size) tx->tx_zc_aborted = 0; tx->tx_zc_capable = 0; tx->tx_zc_checked = 0; - tx->tx_hstatus = LNET_MSG_STATUS_OK; tx->tx_desc_size = size; atomic_inc(&ksocknal_data.ksnd_nactive_txs); @@ -65,10 +64,10 @@ ksocknal_alloc_tx(int type, int size) return tx; } -struct ksock_tx * +ksock_tx_t * ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) { - struct ksock_tx *tx; + ksock_tx_t *tx; tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); if (tx == NULL) { @@ -94,7 +93,7 @@ ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) void -ksocknal_free_tx(struct ksock_tx *tx) +ksocknal_free_tx (ksock_tx_t *tx) { atomic_dec(&ksocknal_data.ksnd_nactive_txs); @@ -111,85 +110,82 @@ ksocknal_free_tx(struct ksock_tx *tx) } static int -ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratch_iov) +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) { struct kvec *iov = tx->tx_iov; - int nob; - int rc; + int nob; + int rc; - LASSERT(tx->tx_niov > 0); + LASSERT (tx->tx_niov > 0); - /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ - rc = ksocknal_lib_send_iov(conn, tx, scratch_iov); + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx); - if (rc <= 0) /* sent nothing? */ - return rc; + if (rc <= 0) /* sent nothing? 
*/ + return (rc); - nob = rc; - LASSERT(nob <= tx->tx_resid); - tx->tx_resid -= nob; + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; - /* "consume" iov */ - do { - LASSERT(tx->tx_niov > 0); + /* "consume" iov */ + do { + LASSERT (tx->tx_niov > 0); - if (nob < (int) iov->iov_len) { + if (nob < (int) iov->iov_len) { iov->iov_base += nob; - iov->iov_len -= nob; - return rc; - } + iov->iov_len -= nob; + return (rc); + } - nob -= iov->iov_len; - tx->tx_iov = ++iov; - tx->tx_niov--; - } while (nob != 0); + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); - return rc; + return (rc); } static int -ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratch_iov) +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { - lnet_kiov_t *kiov = tx->tx_kiov; - int nob; - int rc; + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; - LASSERT(tx->tx_niov == 0); - LASSERT(tx->tx_nkiov > 0); + LASSERT (tx->tx_niov == 0); + LASSERT (tx->tx_nkiov > 0); - /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ - rc = ksocknal_lib_send_kiov(conn, tx, scratch_iov); + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx); - if (rc <= 0) /* sent nothing? */ - return rc; + if (rc <= 0) /* sent nothing? */ + return (rc); - nob = rc; - LASSERT(nob <= tx->tx_resid); - tx->tx_resid -= nob; + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; - /* "consume" kiov */ - do { - LASSERT(tx->tx_nkiov > 0); + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); - if (nob < (int)kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return rc; - } + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } - nob -= (int)kiov->kiov_len; - tx->tx_kiov = ++kiov; - tx->tx_nkiov--; - } while (nob != 0); + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); - return rc; + return (rc); } static int -ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratch_iov) +ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx) { int rc; int bufnob; @@ -201,223 +197,214 @@ ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, LASSERT(tx->tx_resid != 0); - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT(conn->ksnc_closing); - return -ESHUTDOWN; - } + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return (-ESHUTDOWN); + } - do { - if (ksocknal_data.ksnd_enomem_tx > 0) { - /* testing... */ - ksocknal_data.ksnd_enomem_tx--; - rc = -EAGAIN; - } else if (tx->tx_niov != 0) { - rc = ksocknal_send_iov(conn, tx, scratch_iov); - } else { - rc = ksocknal_send_kiov(conn, tx, scratch_iov); - } + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov (conn, tx); + } else { + rc = ksocknal_send_kiov (conn, tx); + } bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - if (rc > 0) /* sent something? */ - conn->ksnc_tx_bufnob += rc; /* account it */ + if (rc > 0) /* sent something? 
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ if (bufnob < conn->ksnc_tx_bufnob) { /* allocated send buffer bytes < computed; infer * something got ACKed */ - conn->ksnc_tx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); - conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); conn->ksnc_tx_bufnob = bufnob; smp_mb(); } if (rc <= 0) { /* Didn't write anything? */ - /* some stacks return 0 instead of -EAGAIN */ - if (rc == 0) - rc = -EAGAIN; - /* Check if EAGAIN is due to memory pressure */ - if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) - rc = -ENOMEM; + if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ + rc = -EAGAIN; - break; - } + /* Check if EAGAIN is due to memory pressure */ + if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } - /* socket's wmem_queued now includes 'rc' bytes */ + /* socket's wmem_queued now includes 'rc' bytes */ atomic_sub (rc, &conn->ksnc_tx_nob); - rc = 0; + rc = 0; - } while (tx->tx_resid != 0); + } while (tx->tx_resid != 0); - ksocknal_connsock_decref(conn); - return rc; + ksocknal_connsock_decref(conn); + return (rc); } static int -ksocknal_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +ksocknal_recv_iov (ksock_conn_t *conn) { struct kvec *iov = conn->ksnc_rx_iov; - int nob; - int rc; + int nob; + int rc; - LASSERT(conn->ksnc_rx_niov > 0); + LASSERT (conn->ksnc_rx_niov > 0); /* Never touch conn->ksnc_rx_iov or change connection - * status inside ksocknal_lib_recv_iov */ - rc = ksocknal_lib_recv_iov(conn, scratchiov); + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn); - if (rc <= 0) - return rc; + if (rc <= 0) + return (rc); - /* received something... */ - nob = rc; + /* received something... */ + nob = rc; - conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); - conn->ksnc_rx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); smp_mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - do { - LASSERT(conn->ksnc_rx_niov > 0); + do { + LASSERT (conn->ksnc_rx_niov > 0); - if (nob < (int)iov->iov_len) { - iov->iov_len -= nob; + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; iov->iov_base += nob; - return -EAGAIN; - } + return (-EAGAIN); + } - nob -= iov->iov_len; - conn->ksnc_rx_iov = ++iov; - conn->ksnc_rx_niov--; - } while (nob != 0); + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); - return rc; + return (rc); } static int -ksocknal_recv_kiov(struct ksock_conn *conn, struct page **rx_scratch_pgs, - struct kvec *scratch_iov) +ksocknal_recv_kiov (ksock_conn_t *conn) { - lnet_kiov_t *kiov = conn->ksnc_rx_kiov; - int nob; - int rc; - LASSERT(conn->ksnc_rx_nkiov > 0); + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT (conn->ksnc_rx_nkiov > 0); /* Never touch conn->ksnc_rx_kiov or change connection - * status inside ksocknal_lib_recv_iov */ - rc = ksocknal_lib_recv_kiov(conn, rx_scratch_pgs, scratch_iov); + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn); - if (rc <= 0) - return rc; + if (rc <= 0) + return (rc); - /* received something... */ - nob = rc; + /* received something... 
*/ + nob = rc; - conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); - conn->ksnc_rx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); smp_mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - do { - LASSERT(conn->ksnc_rx_nkiov > 0); + do { + LASSERT (conn->ksnc_rx_nkiov > 0); - if (nob < (int) kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return -EAGAIN; - } + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } - nob -= kiov->kiov_len; - conn->ksnc_rx_kiov = ++kiov; - conn->ksnc_rx_nkiov--; - } while (nob != 0); + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); - return 1; + return 1; } static int -ksocknal_receive(struct ksock_conn *conn, struct page **rx_scratch_pgs, - struct kvec *scratch_iov) +ksocknal_receive (ksock_conn_t *conn) { - /* Return 1 on success, 0 on EOF, < 0 on error. - * Caller checks ksnc_rx_nob_wanted to determine - * progress/completion. */ - int rc; - ENTRY; + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; if (ksocknal_data.ksnd_stall_rx != 0) { set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); } - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT(conn->ksnc_closing); - return -ESHUTDOWN; - } - - for (;;) { - if (conn->ksnc_rx_niov != 0) - rc = ksocknal_recv_iov(conn, scratch_iov); - else - rc = ksocknal_recv_kiov(conn, rx_scratch_pgs, - scratch_iov); + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return (-ESHUTDOWN); + } - if (rc <= 0) { - /* error/EOF or partial receive */ - if (rc == -EAGAIN) { - rc = 1; - } else if (rc == 0 && conn->ksnc_rx_started) { - /* EOF in the middle of a message */ - rc = -EPROTO; - } - break; - } + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } - /* Completed a fragment */ + /* Completed a fragment */ - if (conn->ksnc_rx_nob_wanted == 0) { - rc = 1; - break; - } - } + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } - ksocknal_connsock_decref(conn); - RETURN(rc); + ksocknal_connsock_decref(conn); + RETURN (rc); } void -ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) +ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) { struct lnet_msg *lnetmsg = tx->tx_lnetmsg; - enum lnet_msg_hstatus hstatus = tx->tx_hstatus; ENTRY; LASSERT(ni != NULL || tx->tx_conn != NULL); - if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) rc = -EIO; - if (hstatus == LNET_MSG_STATUS_OK) - hstatus = LNET_MSG_STATUS_LOCAL_ERROR; - } if (tx->tx_conn != NULL) ksocknal_conn_decref(tx->tx_conn); ksocknal_free_tx(tx); - if (lnetmsg != NULL) { /* KSOCK_MSG_NOOP go without lnetmsg */ - lnetmsg->msg_health_status = hstatus; + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ lnet_finalize(lnetmsg, rc); - } EXIT; } @@ 
-425,10 +412,10 @@ ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) { - struct ksock_tx *tx; + ksock_tx_t *tx; while (!list_empty(txlist)) { - tx = list_entry(txlist->next, struct ksock_tx, tx_list); + tx = list_entry(txlist->next, ksock_tx_t, tx_list); if (error && tx->tx_lnetmsg != NULL) { CNETERR("Deleting packet type %d len %d %s->%s\n", @@ -442,34 +429,16 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) list_del(&tx->tx_list); - if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { - if (error == -ETIMEDOUT) - tx->tx_hstatus = - LNET_MSG_STATUS_LOCAL_TIMEOUT; - else if (error == -ENETDOWN || - error == -EHOSTUNREACH || - error == -ENETUNREACH || - error == -ECONNREFUSED || - error == -ECONNRESET) - tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; - /* - * for all other errors we don't want to - * retransmit - */ - else if (error) - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; - } - LASSERT(atomic_read(&tx->tx_refcount) == 1); ksocknal_tx_done(ni, tx, error); } } static void -ksocknal_check_zc_req(struct ksock_tx *tx) +ksocknal_check_zc_req(ksock_tx_t *tx) { - struct ksock_conn *conn = tx->tx_conn; - struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + ksock_conn_t *conn = tx->tx_conn; + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx * to ksnp_zc_req_list if some fragment of this message should be sent @@ -494,8 +463,8 @@ ksocknal_check_zc_req(struct ksock_tx *tx) spin_lock(&peer_ni->ksnp_lock); /* ZC_REQ is going to be pinned to the peer_ni */ - tx->tx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); @@ -510,9 +479,9 @@ ksocknal_check_zc_req(struct ksock_tx *tx) } static void -ksocknal_uncheck_zc_req(struct ksock_tx *tx) +ksocknal_uncheck_zc_req(ksock_tx_t *tx) { - struct ksock_peer_ni *peer_ni = tx->tx_conn->ksnc_peer; + ksock_peer_ni_t *peer_ni = tx->tx_conn->ksnc_peer; LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); LASSERT(tx->tx_zc_capable); @@ -536,111 +505,85 @@ ksocknal_uncheck_zc_req(struct ksock_tx *tx) } static int -ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratch_iov) +ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) { - int rc; - bool error_sim = false; - - if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { - error_sim = true; - rc = -EINVAL; - goto simulate_error; - } + int rc; - if (tx->tx_zc_capable && !tx->tx_zc_checked) - ksocknal_check_zc_req(tx); + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); - rc = ksocknal_transmit(conn, tx, scratch_iov); + rc = ksocknal_transmit (conn, tx); - CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); - if (tx->tx_resid == 0) { - /* Sent everything OK */ - LASSERT(rc == 0); + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT (rc == 0); - return 0; - } + return (0); + } - if (rc == -EAGAIN) - return rc; + if (rc == -EAGAIN) + return (rc); - if (rc == -ENOMEM) { - static int counter; + if (rc == -ENOMEM) { + static int counter; - counter++; /* exponential backoff warnings */ - if ((counter & (-counter)) == counter) - CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM 
tx %p (%u allocated)\n", counter, conn, atomic_read(&libcfs_kmemory)); - /* Queue on ksnd_enomem_conns for retry after a timeout */ + /* Queue on ksnd_enomem_conns for retry after a timeout */ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - /* enomem list takes over scheduler's ref... */ - LASSERT(conn->ksnc_tx_scheduled); + /* enomem list takes over scheduler's ref... */ + LASSERT (conn->ksnc_tx_scheduled); list_add_tail(&conn->ksnc_tx_list, - &ksocknal_data.ksnd_enomem_conns); - if (ktime_get_seconds() + SOCKNAL_ENOMEM_RETRY < - ksocknal_data.ksnd_reaper_waketime) + &ksocknal_data.ksnd_enomem_conns); + if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), + SOCKNAL_ENOMEM_RETRY), + ksocknal_data.ksnd_reaper_waketime)) wake_up(&ksocknal_data.ksnd_reaper_waitq); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - /* - * set the health status of the message which determines - * whether we should retry the transmit - */ - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; return (rc); } -simulate_error: - - /* Actual error */ - LASSERT(rc < 0); - - if (!error_sim) { - /* - * set the health status of the message which determines - * whether we should retry the transmit - */ - if (rc == -ETIMEDOUT) - tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; - else - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; - } + /* Actual error */ + LASSERT (rc < 0); - if (!conn->ksnc_closing) { - switch (rc) { - case -ECONNRESET: + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: LCONSOLE_WARN("Host %pI4h reset our connection " - "while we were sending data; it may have " - "rebooted.\n", + "while we were sending data; it may have " + "rebooted.\n", &conn->ksnc_ipaddr); - break; - default: - LCONSOLE_WARN("There was an unexpected network error " + break; + default: + LCONSOLE_WARN("There was an unexpected network error " "while writing to %pI4h: %d.\n", &conn->ksnc_ipaddr, rc); - break; - } + break; + } CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", conn, rc, libcfs_id2str(conn->ksnc_peer->ksnp_id), &conn->ksnc_ipaddr, conn->ksnc_port); - } + } - if (tx->tx_zc_checked) - ksocknal_uncheck_zc_req(tx); + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings(conn, - (conn->ksnc_closing) ? 0 : rc); + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 
0 : rc); - return rc; + return (rc); } static void -ksocknal_launch_connection_locked(struct ksock_route *route) +ksocknal_launch_connection_locked (ksock_route_t *route) { /* called holding write lock on ksnd_global_lock */ @@ -662,9 +605,9 @@ ksocknal_launch_connection_locked(struct ksock_route *route) } void -ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) +ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni) { - struct ksock_route *route; + ksock_route_t *route; /* called holding write lock on ksnd_global_lock */ for (;;) { @@ -677,22 +620,21 @@ ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) } } -struct ksock_conn * -ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, int nonblk) +ksock_conn_t * +ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) { struct list_head *tmp; - struct ksock_conn *conn; - struct ksock_conn *typed = NULL; - struct ksock_conn *fallback = NULL; - int tnob = 0; - int fnob = 0; + ksock_conn_t *conn; + ksock_conn_t *typed = NULL; + ksock_conn_t *fallback = NULL; + int tnob = 0; + int fnob = 0; list_for_each(tmp, &peer_ni->ksnp_conns) { - struct ksock_conn *c = list_entry(tmp, struct ksock_conn, - ksnc_list); - int nob = atomic_read(&c->ksnc_tx_nob) + - c->ksnc_sock->sk->sk_wmem_queued; - int rc; + ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; LASSERT (!c->ksnc_closing); LASSERT (c->ksnc_proto != NULL && @@ -709,7 +651,7 @@ ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, in case SOCKNAL_MATCH_YES: /* typed connection */ if (typed == NULL || tnob > nob || (tnob == nob && *ksocknal_tunables.ksnd_round_robin && - typed->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { typed = c; tnob = nob; } @@ -718,7 +660,7 @@ ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, in case SOCKNAL_MATCH_MAY: /* fallback connection */ if (fallback == NULL || fnob > nob || (fnob == nob && *ksocknal_tunables.ksnd_round_robin && - fallback->ksnc_tx_last_post > c->ksnc_tx_last_post)) { + cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { fallback = c; fnob = nob; } @@ -730,13 +672,13 @@ ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, in conn = (typed != NULL) ? 
typed : fallback; if (conn != NULL) - conn->ksnc_tx_last_post = ktime_get_seconds(); + conn->ksnc_tx_last_post = cfs_time_current(); return conn; } void -ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) +ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) { conn->ksnc_proto->pro_pack(tx); @@ -746,12 +688,12 @@ ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) } void -ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) { - struct ksock_sched *sched = conn->ksnc_scheduler; - struct ksock_msg *msg = &tx->tx_msg; - struct ksock_tx *ztx = NULL; - int bufnob = 0; + ksock_sched_t *sched = conn->ksnc_scheduler; + struct ksock_msg *msg = &tx->tx_msg; + ksock_tx_t *ztx = NULL; + int bufnob = 0; /* called holding global lock (read or irq-write) and caller may * not have dropped this lock between finding conn and calling me, @@ -787,10 +729,10 @@ ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { /* First packet starts the timeout */ - conn->ksnc_tx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ - conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); conn->ksnc_tx_bufnob = 0; smp_mb(); /* order with adding to tx_queue */ } @@ -833,15 +775,15 @@ ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) } -struct ksock_route * -ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni) +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) { - time64_t now = ktime_get_seconds(); - struct list_head *tmp; - struct ksock_route *route; + cfs_time_t now = cfs_time_current(); + struct list_head *tmp; + ksock_route_t *route; list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); @@ -853,14 +795,14 @@ ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni) continue; if (!(route->ksnr_retry_interval == 0 || /* first attempt */ - now >= route->ksnr_timeout)) { + cfs_time_aftereq(now, route->ksnr_timeout))) { CDEBUG(D_NET, "Too soon to retry route %pI4h " - "(cnted %d, interval %lld, %lld secs later)\n", + "(cnted %d, interval %ld, %ld secs later)\n", &route->ksnr_ipaddr, route->ksnr_connected, route->ksnr_retry_interval, - route->ksnr_timeout - now); + cfs_duration_sec(route->ksnr_timeout - now)); continue; } @@ -870,14 +812,14 @@ ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni) return (NULL); } -struct ksock_route * -ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni) +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni) { - struct list_head *tmp; - struct ksock_route *route; + struct list_head *tmp; + ksock_route_t *route; list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); + route = list_entry(tmp, ksock_route_t, ksnr_list); LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); @@ -889,14 +831,14 @@ ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni) } int -ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, +ksocknal_launch_packet(struct lnet_ni *ni, 
ksock_tx_t *tx, struct lnet_process_id id) { - struct ksock_peer_ni *peer_ni; - struct ksock_conn *conn; - rwlock_t *g_lock; - int retry; - int rc; + ksock_peer_ni_t *peer_ni; + ksock_conn_t *conn; + rwlock_t *g_lock; + int retry; + int rc; LASSERT (tx->tx_conn == NULL); @@ -964,8 +906,8 @@ ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, if (peer_ni->ksnp_accepting > 0 || ksocknal_find_connecting_route_locked (peer_ni) != NULL) { /* the message is going to be pinned to the peer_ni */ - tx->tx_deadline = ktime_get_seconds() + - lnet_get_lnd_timeout(); + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); /* Queue the message until a connection is established */ list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); @@ -977,7 +919,6 @@ ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, /* NB Routes may be ignored if connections to them failed recently */ CNETERR("No usable routes to %s\n", libcfs_id2str(id)); - tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; return (-EHOSTUNREACH); } @@ -992,7 +933,7 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; - struct ksock_tx *tx; + ksock_tx_t *tx; int desc_size; int rc; @@ -1009,10 +950,10 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) LASSERT (!in_interrupt ()); if (payload_iov != NULL) - desc_size = offsetof(struct ksock_tx, + desc_size = offsetof(ksock_tx_t, tx_frags.virt.iov[1 + payload_niov]); else - desc_size = offsetof(struct ksock_tx, + desc_size = offsetof(ksock_tx_t, tx_frags.paged.kiov[payload_niov]); if (lntmsg->msg_vmflush) @@ -1062,7 +1003,6 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (rc == 0) return (0); - lntmsg->msg_health_status = tx->tx_hstatus; ksocknal_free_tx(tx); return (-EIO); } @@ -1090,12 +1030,13 @@ ksocknal_thread_fini (void) } int -ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) { static char ksocknal_slop_buffer[4096]; - int nob; - unsigned int niov; - int skipped; + + int nob; + unsigned int niov; + int skipped; LASSERT(conn->ksnc_proto != NULL); @@ -1171,9 +1112,7 @@ ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) } static int -ksocknal_process_receive(struct ksock_conn *conn, - struct page **rx_scratch_pgs, - struct kvec *scratch_iov) +ksocknal_process_receive (ksock_conn_t *conn) { struct lnet_hdr *lhdr; struct lnet_process_id *id; @@ -1183,14 +1122,13 @@ ksocknal_process_receive(struct ksock_conn *conn, /* NB: sched lock NOT held */ /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ - LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); again: - if (conn->ksnc_rx_nob_wanted != 0) { - rc = ksocknal_receive(conn, rx_scratch_pgs, - scratch_iov); + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn); if (rc <= 0) { struct lnet_process_id ksnp_id; @@ -1356,10 +1294,7 @@ ksocknal_process_receive(struct ksock_conn *conn, le64_to_cpu(lhdr->src_nid) != id->nid); } - if (rc && 
conn->ksnc_lnet_msg) - conn->ksnc_lnet_msg->msg_health_status = - LNET_MSG_STATUS_REMOTE_ERROR; - lnet_finalize(conn->ksnc_lnet_msg, rc); + lnet_finalize(conn->ksnc_cookie, rc); if (rc != 0) { ksocknal_new_packet(conn, 0); @@ -1389,15 +1324,15 @@ ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { - struct ksock_conn *conn = private; - struct ksock_sched *sched = conn->ksnc_scheduler; + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; LASSERT (mlen <= rlen); LASSERT (niov <= LNET_MAX_IOV); - conn->ksnc_lnet_msg = msg; - conn->ksnc_rx_nob_wanted = mlen; - conn->ksnc_rx_nob_left = rlen; + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; if (mlen == 0 || iov != NULL) { conn->ksnc_rx_nkiov = 0; @@ -1443,7 +1378,7 @@ ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } static inline int -ksocknal_sched_cansleep(struct ksock_sched *sched) +ksocknal_sched_cansleep(ksock_sched_t *sched) { int rc; @@ -1459,169 +1394,154 @@ ksocknal_sched_cansleep(struct ksock_sched *sched) int ksocknal_scheduler(void *arg) { - struct ksock_sched *sched; - struct ksock_conn *conn; - struct ksock_tx *tx; - int rc; - int nloops = 0; - long id = (long)arg; - struct page **rx_scratch_pgs; - struct kvec *scratch_iov; - - sched = ksocknal_data.ksnd_schedulers[KSOCK_THREAD_CPT(id)]; - - LIBCFS_CPT_ALLOC(rx_scratch_pgs, lnet_cpt_table(), sched->kss_cpt, - sizeof(*rx_scratch_pgs) * LNET_MAX_IOV); - if (!rx_scratch_pgs) { - CERROR("Unable to allocate scratch pages\n"); - return -ENOMEM; - } + struct ksock_sched_info *info; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + int nloops = 0; + long id = (long)arg; - LIBCFS_CPT_ALLOC(scratch_iov, lnet_cpt_table(), sched->kss_cpt, - sizeof(*scratch_iov) * LNET_MAX_IOV); - if (!scratch_iov) { - CERROR("Unable to allocate scratch iov\n"); - return -ENOMEM; - } + info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; cfs_block_allsigs(); - rc = cfs_cpt_bind(lnet_cpt_table(), sched->kss_cpt); + rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); if (rc != 0) { CWARN("Can't set CPU partition affinity to %d: %d\n", - sched->kss_cpt, rc); + info->ksi_cpt, rc); } spin_lock_bh(&sched->kss_lock); - while (!ksocknal_data.ksnd_shuttingdown) { - int did_something = 0; + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; - /* Ensure I progress everything semi-fairly */ + /* Ensure I progress everything semi-fairly */ if (!list_empty(&sched->kss_rx_conns)) { conn = list_entry(sched->kss_rx_conns.next, - struct ksock_conn, ksnc_rx_list); + ksock_conn_t, ksnc_rx_list); list_del(&conn->ksnc_rx_list); - LASSERT(conn->ksnc_rx_scheduled); - LASSERT(conn->ksnc_rx_ready); + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); - /* clear rx_ready in case receive isn't complete. - * Do it BEFORE we call process_recv, since - * data_ready can set it any time after we release - * kss_lock. */ - conn->ksnc_rx_ready = 0; + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. 
*/ + conn->ksnc_rx_ready = 0; spin_unlock_bh(&sched->kss_lock); - rc = ksocknal_process_receive(conn, rx_scratch_pgs, - scratch_iov); + rc = ksocknal_process_receive(conn); spin_lock_bh(&sched->kss_lock); - /* I'm the only one that can clear this flag */ - LASSERT(conn->ksnc_rx_scheduled); + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); - /* Did process_receive get everything it wanted? */ - if (rc == 0) - conn->ksnc_rx_ready = 1; - - if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { - /* Conn blocked waiting for ksocknal_recv() - * I change its state (under lock) to signal - * it can be rescheduled */ - conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; - } else if (conn->ksnc_rx_ready) { - /* reschedule for rx */ + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ list_add_tail(&conn->ksnc_rx_list, - &sched->kss_rx_conns); - } else { - conn->ksnc_rx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } - did_something = 1; - } + did_something = 1; + } if (!list_empty(&sched->kss_tx_conns)) { struct list_head zlist = LIST_HEAD_INIT(zlist); if (!list_empty(&sched->kss_zombie_noop_txs)) { list_add(&zlist, - &sched->kss_zombie_noop_txs); + &sched->kss_zombie_noop_txs); list_del_init(&sched->kss_zombie_noop_txs); - } + } conn = list_entry(sched->kss_tx_conns.next, - struct ksock_conn, ksnc_tx_list); + ksock_conn_t, ksnc_tx_list); list_del(&conn->ksnc_tx_list); - LASSERT(conn->ksnc_tx_scheduled); - LASSERT(conn->ksnc_tx_ready); + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); LASSERT(!list_empty(&conn->ksnc_tx_queue)); tx = list_entry(conn->ksnc_tx_queue.next, - struct ksock_tx, tx_list); + ksock_tx_t, tx_list); - if (conn->ksnc_tx_carrier == tx) - ksocknal_next_tx_carrier(conn); + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); - /* dequeue now so empty list => more to send */ + /* dequeue now so empty list => more to send */ list_del(&tx->tx_list); - /* Clear tx_ready in case send isn't complete. Do - * it BEFORE we call process_transmit, since - * write_space can set it any time after we release - * kss_lock. */ - conn->ksnc_tx_ready = 0; + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. 
*/ + conn->ksnc_tx_ready = 0; spin_unlock_bh(&sched->kss_lock); if (!list_empty(&zlist)) { /* free zombie noop txs, it's fast because - * noop txs are just put in freelist */ - ksocknal_txlist_done(NULL, &zlist, 0); - } + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } - rc = ksocknal_process_transmit(conn, tx, scratch_iov); + rc = ksocknal_process_transmit(conn, tx); - if (rc == -ENOMEM || rc == -EAGAIN) { - /* Incomplete send: replace tx on HEAD of tx_queue */ + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ spin_lock_bh(&sched->kss_lock); list_add(&tx->tx_list, - &conn->ksnc_tx_queue); + &conn->ksnc_tx_queue); } else { /* Complete send; tx -ref */ ksocknal_tx_decref(tx); spin_lock_bh(&sched->kss_lock); - /* assume space for more */ - conn->ksnc_tx_ready = 1; - } + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } - if (rc == -ENOMEM) { - /* Do nothing; after a short timeout, this - * conn will be reposted on kss_tx_conns. */ - } else if (conn->ksnc_tx_ready && + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && !list_empty(&conn->ksnc_tx_queue)) { - /* reschedule for tx */ + /* reschedule for tx */ list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - } else { - conn->ksnc_tx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } - did_something = 1; - } - if (!did_something || /* nothing to do */ - ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ spin_unlock_bh(&sched->kss_lock); - nloops = 0; + nloops = 0; - if (!did_something) { /* wait for something to do */ + if (!did_something) { /* wait for something to do */ rc = wait_event_interruptible_exclusive( sched->kss_waitq, !ksocknal_sched_cansleep(sched)); @@ -1635,10 +1555,6 @@ int ksocknal_scheduler(void *arg) } spin_unlock_bh(&sched->kss_lock); - LIBCFS_FREE(rx_scratch_pgs, sizeof(*rx_scratch_pgs) * - LNET_MAX_IOV); - LIBCFS_FREE(scratch_iov, sizeof(*scratch_iov) * - LNET_MAX_IOV); ksocknal_thread_fini(); return 0; } @@ -1647,9 +1563,9 @@ int ksocknal_scheduler(void *arg) * Add connection to kss_rx_conns of scheduler * and wakeup the scheduler. */ -void ksocknal_read_callback(struct ksock_conn *conn) +void ksocknal_read_callback (ksock_conn_t *conn) { - struct ksock_sched *sched; + ksock_sched_t *sched; ENTRY; sched = conn->ksnc_scheduler; @@ -1676,9 +1592,9 @@ void ksocknal_read_callback(struct ksock_conn *conn) * Add connection to kss_tx_conns of scheduler * and wakeup the scheduler. 
*/ -void ksocknal_write_callback(struct ksock_conn *conn) +void ksocknal_write_callback(ksock_conn_t *conn) { - struct ksock_sched *sched; + ksock_sched_t *sched; ENTRY; sched = conn->ksnc_scheduler; @@ -1702,7 +1618,7 @@ void ksocknal_write_callback(struct ksock_conn *conn) EXIT; } -static struct ksock_proto * +static ksock_proto_t * ksocknal_parse_proto_version (struct ksock_hello_msg *hello) { __u32 version = 0; @@ -1747,13 +1663,13 @@ ksocknal_parse_proto_version (struct ksock_hello_msg *hello) } int -ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, +ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, lnet_nid_t peer_nid, struct ksock_hello_msg *hello) { /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - struct ksock_net *net = (struct ksock_net *)ni->ni_data; + ksock_net_t *net = (ksock_net_t *)ni->ni_data; - LASSERT(hello->kshm_nips <= LNET_INTERFACES_NUM); + LASSERT(hello->kshm_nips <= LNET_NUM_INTERFACES); /* rely on caller to hold a ref on socket so it wouldn't disappear */ LASSERT(conn->ksnc_proto != NULL); @@ -1786,7 +1702,7 @@ ksocknal_invert_type(int type) } int -ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, +ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, struct ksock_hello_msg *hello, struct lnet_process_id *peerid, __u64 *incarnation) @@ -1801,13 +1717,13 @@ ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, int timeout; int proto_match; int rc; - struct ksock_proto *proto; - struct lnet_process_id recv_id; + ksock_proto_t *proto; + struct lnet_process_id recv_id; /* socket type set on active connections - not set on passive */ LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); - timeout = active ? lnet_get_lnd_timeout() : + timeout = active ? *ksocknal_tunables.ksnd_timeout : lnet_acceptor_timeout(); rc = lnet_sock_read(sock, &hello->kshm_magic, @@ -1931,18 +1847,19 @@ ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, } static int -ksocknal_connect(struct ksock_route *route) +ksocknal_connect (ksock_route_t *route) { - struct list_head zombies = LIST_HEAD_INIT(zombies); - struct ksock_peer_ni *peer_ni = route->ksnr_peer; + struct list_head zombies = LIST_HEAD_INIT(zombies); + ksock_peer_ni_t *peer_ni = route->ksnr_peer; int type; int wanted; struct socket *sock; - time64_t deadline; + cfs_time_t deadline; int retry_later = 0; int rc = 0; - deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1986,7 +1903,7 @@ ksocknal_connect(struct ksock_route *route) write_unlock_bh(&ksocknal_data.ksnd_global_lock); - if (ktime_get_seconds() >= deadline) { + if (cfs_time_aftereq(cfs_time_current(), deadline)) { rc = -ETIMEDOUT; lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, route->ksnr_ipaddr, @@ -1994,12 +1911,12 @@ ksocknal_connect(struct ksock_route *route) goto failed; } - rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, - route->ksnr_myipaddr, + rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, + route->ksnr_myipaddr, route->ksnr_ipaddr, route->ksnr_port, peer_ni->ksnp_ni->ni_net_ns); - if (rc != 0) - goto failed; + if (rc != 0) + goto failed; rc = ksocknal_create_conn(peer_ni->ksnp_ni, route, sock, type); if (rc < 0) { @@ -2032,9 +1949,10 @@ ksocknal_connect(struct ksock_route *route) * attempt to connect if we lost conn race, * but the race is resolved quickly usually, * so min_reconnectms should be good heuristic */ - 
route->ksnr_retry_interval = *ksocknal_tunables.ksnd_min_reconnectms / 1000; - route->ksnr_timeout = ktime_get_seconds() + - route->ksnr_retry_interval; + route->ksnr_retry_interval = + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); } ksocknal_launch_connection_locked(route); @@ -2052,25 +1970,26 @@ ksocknal_connect(struct ksock_route *route) /* This is a retry rather than a new connection */ route->ksnr_retry_interval *= 2; route->ksnr_retry_interval = - max_t(time64_t, route->ksnr_retry_interval, - *ksocknal_tunables.ksnd_min_reconnectms / 1000); + MAX(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); route->ksnr_retry_interval = - min_t(time64_t, route->ksnr_retry_interval, - *ksocknal_tunables.ksnd_max_reconnectms / 1000); + MIN(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); - LASSERT(route->ksnr_retry_interval); - route->ksnr_timeout = ktime_get_seconds() + route->ksnr_retry_interval; + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); if (!list_empty(&peer_ni->ksnp_tx_queue) && peer_ni->ksnp_accepting == 0 && ksocknal_find_connecting_route_locked(peer_ni) == NULL) { - struct ksock_conn *conn; + ksock_conn_t *conn; /* ksnp_tx_queue is queued on a conn on successful * connection for V1.x and V2.x */ if (!list_empty(&peer_ni->ksnp_conns)) { conn = list_entry(peer_ni->ksnp_conns.next, - struct ksock_conn, ksnc_list); + ksock_conn_t, ksnc_list); LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); } @@ -2093,7 +2012,7 @@ ksocknal_connect(struct ksock_route *route) * running out of resource. */ static int -ksocknal_connd_check_start(time64_t sec, long *timeout) +ksocknal_connd_check_start(long sec, long *timeout) { char name[16]; int rc; @@ -2143,7 +2062,7 @@ ksocknal_connd_check_start(time64_t sec, long *timeout) /* we tried ... */ LASSERT(ksocknal_data.ksnd_connd_starting > 0); ksocknal_data.ksnd_connd_starting--; - ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); + ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec(); return 1; } @@ -2155,7 +2074,7 @@ ksocknal_connd_check_start(time64_t sec, long *timeout) * again to recheck these conditions. 
*/ static int -ksocknal_connd_check_stop(time64_t sec, long *timeout) +ksocknal_connd_check_stop(long sec, long *timeout) { int val; @@ -2191,36 +2110,38 @@ ksocknal_connd_check_stop(time64_t sec, long *timeout) /* Go through connd_routes queue looking for a route that we can process * right now, @timeout_p can be updated if we need to come back later */ -static struct ksock_route * +static ksock_route_t * ksocknal_connd_get_route_locked(signed long *timeout_p) { - time64_t now = ktime_get_seconds(); - struct ksock_route *route; + ksock_route_t *route; + cfs_time_t now; + + now = cfs_time_current(); /* connd_routes can contain both pending and ordinary routes */ list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, ksnr_connd_list) { if (route->ksnr_retry_interval == 0 || - now >= route->ksnr_timeout) + cfs_time_aftereq(now, route->ksnr_timeout)) return route; if (*timeout_p == MAX_SCHEDULE_TIMEOUT || - *timeout_p > cfs_time_seconds(route->ksnr_timeout - now)) - *timeout_p = cfs_time_seconds(route->ksnr_timeout - now); + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); } return NULL; } int -ksocknal_connd(void *arg) +ksocknal_connd (void *arg) { - spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; - struct ksock_connreq *cr; + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + ksock_connreq_t *cr; wait_queue_entry_t wait; - int nloops = 0; - int cons_retry = 0; + int nloops = 0; + int cons_retry = 0; cfs_block_allsigs(); @@ -2233,8 +2154,8 @@ ksocknal_connd(void *arg) ksocknal_data.ksnd_connd_running++; while (!ksocknal_data.ksnd_shuttingdown) { - struct ksock_route *route = NULL; - time64_t sec = ktime_get_real_seconds(); + ksock_route_t *route = NULL; + long sec = cfs_time_current_sec(); long timeout = MAX_SCHEDULE_TIMEOUT; int dropped_lock = 0; @@ -2251,8 +2172,8 @@ ksocknal_connd(void *arg) if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { /* Connection accepted by the listener */ - cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, - struct ksock_connreq, ksncr_list); + cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\ + next, ksock_connreq_t, ksncr_list); list_del(&cr->ksncr_list); spin_unlock_bh(connd_lock); @@ -2326,18 +2247,16 @@ ksocknal_connd(void *arg) return 0; } -static struct ksock_conn * -ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) +static ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) { /* We're called with a shared lock on ksnd_global_lock */ - struct ksock_conn *conn; - struct list_head *ctmp; - struct ksock_tx *tx; + ksock_conn_t *conn; + struct list_head *ctmp; list_for_each(ctmp, &peer_ni->ksnp_conns) { - int error; - - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); + int error; + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); /* Don't need the {get,put}connsock dance to deref ksnc_sock */ LASSERT (!conn->ksnc_closing); @@ -2377,7 +2296,8 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) } if (conn->ksnc_rx_started && - ktime_get_seconds() >= conn->ksnc_rx_deadline) { + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_rx_deadline)) { /* Timed out incomplete incoming message */ ksocknal_conn_addref(conn); CNETERR("Timeout receiving from %s (%pI4h:%d), " @@ -2393,14 +2313,11 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) if ((!list_empty(&conn->ksnc_tx_queue) || conn->ksnc_sock->sk->sk_wmem_queued != 0) && - ktime_get_seconds() >= conn->ksnc_tx_deadline) { + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_tx_deadline)) { /* Timed out messages queued for sending or * buffered in the socket's send buffer */ ksocknal_conn_addref(conn); - list_for_each_entry(tx, &conn->ksnc_tx_queue, - tx_list) - tx->tx_hstatus = - LNET_MSG_STATUS_LOCAL_TIMEOUT; CNETERR("Timeout sending data to %s (%pI4h:%d) " "the network or that node may be down.\n", libcfs_id2str(peer_ni->ksnp_id), @@ -2413,22 +2330,21 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) } static inline void -ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) +ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) { - struct ksock_tx *tx; - struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); + ksock_tx_t *tx; + struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); write_lock_bh(&ksocknal_data.ksnd_global_lock); while (!list_empty(&peer_ni->ksnp_tx_queue)) { tx = list_entry(peer_ni->ksnp_tx_queue.next, - struct ksock_tx, tx_list); + ksock_tx_t, tx_list); - if (ktime_get_seconds() < tx->tx_deadline) + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) break; - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; - list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &stale_txs); } @@ -2439,12 +2355,12 @@ ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) } static int -ksocknal_send_keepalive_locked(struct ksock_peer_ni *peer_ni) +ksocknal_send_keepalive_locked(ksock_peer_ni_t *peer_ni) __must_hold(&ksocknal_data.ksnd_global_lock) { - struct ksock_sched *sched; - struct ksock_conn *conn; - struct ksock_tx *tx; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; /* last_alive will be updated by create_conn */ if (list_empty(&peer_ni->ksnp_conns)) @@ -2454,16 +2370,18 @@ __must_hold(&ksocknal_data.ksnd_global_lock) return 0; if (*ksocknal_tunables.ksnd_keepalive <= 0 || - ktime_get_seconds() < peer_ni->ksnp_last_alive + - *ksocknal_tunables.ksnd_keepalive) + cfs_time_before(cfs_time_current(), + cfs_time_add(peer_ni->ksnp_last_alive, + cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) return 0; - if (ktime_get_seconds() < peer_ni->ksnp_send_keepalive) + if (cfs_time_before(cfs_time_current(), + 
peer_ni->ksnp_send_keepalive)) return 0; /* retry 10 secs later, so we wouldn't put pressure * on this peer_ni if we failed to send keepalive this time */ - peer_ni->ksnp_send_keepalive = ktime_get_seconds() + 10; + peer_ni->ksnp_send_keepalive = cfs_time_shift(10); conn = ksocknal_find_conn_locked(peer_ni, NULL, 1); if (conn != NULL) { @@ -2501,12 +2419,12 @@ __must_hold(&ksocknal_data.ksnd_global_lock) static void -ksocknal_check_peer_timeouts(int idx) +ksocknal_check_peer_timeouts (int idx) { - struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; - struct ksock_peer_ni *peer_ni; - struct ksock_conn *conn; - struct ksock_tx *tx; + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + ksock_peer_ni_t *peer_ni; + ksock_conn_t *conn; + ksock_tx_t *tx; again: /* NB. We expect to have a look at all the peers and not find any @@ -2515,10 +2433,10 @@ ksocknal_check_peer_timeouts(int idx) read_lock(&ksocknal_data.ksnd_global_lock); list_for_each_entry(peer_ni, peers, ksnp_list) { - struct ksock_tx *tx_stale; - time64_t deadline = 0; - int resid = 0; - int n = 0; + ksock_tx_t *tx_stale; + cfs_time_t deadline = 0; + int resid = 0; + int n = 0; if (ksocknal_send_keepalive_locked(peer_ni) != 0) { read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2542,11 +2460,13 @@ ksocknal_check_peer_timeouts(int idx) /* we can't process stale txs right here because we're * holding only shared lock */ if (!list_empty(&peer_ni->ksnp_tx_queue)) { - struct ksock_tx *tx; + ksock_tx_t *tx = + list_entry(peer_ni->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { - tx = list_entry(peer_ni->ksnp_tx_queue.next, - struct ksock_tx, tx_list); - if (ktime_get_seconds() >= tx->tx_deadline) { ksocknal_peer_addref(peer_ni); read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2563,7 +2483,8 @@ ksocknal_check_peer_timeouts(int idx) tx_stale = NULL; spin_lock(&peer_ni->ksnp_lock); list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { - if (ktime_get_seconds() < tx->tx_deadline) + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) break; /* ignore the TX if connection is being closed */ if (tx->tx_conn->ksnc_closing) @@ -2587,10 +2508,10 @@ ksocknal_check_peer_timeouts(int idx) read_unlock(&ksocknal_data.ksnd_global_lock); CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " - "oldest(%p) timed out %lld secs ago, " + "oldest(%p) timed out %ld secs ago, " "resid: %d, wmem: %d\n", n, libcfs_nid2str(peer_ni->ksnp_id.nid), tx_stale, - ktime_get_seconds() - deadline, + cfs_duration_sec(cfs_time_current() - deadline), resid, conn->ksnc_sock->sk->sk_wmem_queued); ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); @@ -2604,14 +2525,14 @@ ksocknal_check_peer_timeouts(int idx) int ksocknal_reaper(void *arg) { wait_queue_entry_t wait; - struct ksock_conn *conn; - struct ksock_sched *sched; - struct list_head enomem_conns; - int nenomem_conns; - time64_t timeout; - int i; - int peer_index = 0; - time64_t deadline = ktime_get_seconds(); + ksock_conn_t *conn; + ksock_sched_t *sched; + struct list_head enomem_conns; + int nenomem_conns; + cfs_duration_t timeout; + int i; + int peer_index = 0; + cfs_time_t deadline = cfs_time_current(); cfs_block_allsigs (); @@ -2621,9 +2542,11 @@ int ksocknal_reaper(void *arg) spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); while (!ksocknal_data.ksnd_shuttingdown) { + if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { - conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, - struct ksock_conn, ksnc_list); + 
conn = list_entry(ksocknal_data. \ + ksnd_deathrow_conns.next, + ksock_conn_t, ksnc_list); list_del(&conn->ksnc_list); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -2636,8 +2559,8 @@ int ksocknal_reaper(void *arg) } if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { - conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, - struct ksock_conn, ksnc_list); + conn = list_entry(ksocknal_data.ksnd_zombie_conns.\ + next, ksock_conn_t, ksnc_list); list_del(&conn->ksnc_list); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -2660,7 +2583,7 @@ int ksocknal_reaper(void *arg) nenomem_conns = 0; while (!list_empty(&enomem_conns)) { conn = list_entry(enomem_conns.next, - struct ksock_conn, ksnc_tx_list); + ksock_conn_t, ksnc_tx_list); list_del(&conn->ksnc_tx_list); sched = conn->ksnc_scheduler; @@ -2678,11 +2601,11 @@ int ksocknal_reaper(void *arg) } /* careful with the jiffy wrap... */ - while ((timeout = deadline - ktime_get_seconds()) <= 0) { + while ((timeout = cfs_time_sub(deadline, + cfs_time_current())) <= 0) { const int n = 4; const int p = 1; int chunk = ksocknal_data.ksnd_peer_hash_size; - unsigned int lnd_timeout; /* Time to check for timeouts on a few more peers: I do * checks every 'p' seconds on a proportion of the peer_ni @@ -2691,11 +2614,11 @@ int ksocknal_reaper(void *arg) * timeout on any connection within (n+1)/n times the * timeout interval. */ - lnd_timeout = lnet_get_lnd_timeout(); - if (lnd_timeout > n * p) - chunk = (chunk * n * p) / lnd_timeout; - if (chunk == 0) - chunk = 1; + if (*ksocknal_tunables.ksnd_timeout > n * p) + chunk = (chunk * n * p) / + *ksocknal_tunables.ksnd_timeout; + if (chunk == 0) + chunk = 1; for (i = 0; i < chunk; i++) { ksocknal_check_peer_timeouts (peer_index); @@ -2703,7 +2626,7 @@ int ksocknal_reaper(void *arg) ksocknal_data.ksnd_peer_hash_size; } - deadline += p; + deadline = cfs_time_add(deadline, cfs_time_seconds(p)); } if (nenomem_conns != 0) { @@ -2712,16 +2635,16 @@ int ksocknal_reaper(void *arg) * if any go back on my enomem list. */ timeout = SOCKNAL_ENOMEM_RETRY; } - ksocknal_data.ksnd_reaper_waketime = ktime_get_seconds() + - timeout; + ksocknal_data.ksnd_reaper_waketime = + cfs_time_add(cfs_time_current(), timeout); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); if (!ksocknal_data.ksnd_shuttingdown && list_empty(&ksocknal_data.ksnd_deathrow_conns) && list_empty(&ksocknal_data.ksnd_zombie_conns)) - schedule_timeout(cfs_time_seconds(timeout)); + schedule_timeout(timeout); set_current_state(TASK_RUNNING); remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c index 72f2bd526613e..91a9cf05e2ad8 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,11 @@ #include "socklnd.h" int -ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) +ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) { int rc = lnet_sock_getaddr(conn->ksnc_sock, true, - &conn->ksnc_ipaddr, - &conn->ksnc_port); + &conn->ksnc_ipaddr, + &conn->ksnc_port); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ LASSERT (!conn->ksnc_closing); @@ -58,7 +58,7 @@ ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) } int -ksocknal_lib_zc_capable(struct ksock_conn *conn) +ksocknal_lib_zc_capable(ksock_conn_t *conn) { int caps = conn->ksnc_sock->sk->sk_route_caps; @@ -71,8 +71,7 @@ ksocknal_lib_zc_capable(struct ksock_conn *conn) } int -ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratchiov) +ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) { struct socket *sock = conn->ksnc_sock; int nob; @@ -93,6 +92,7 @@ ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, struct kvec *scratchiov = &scratch; unsigned int niov = 1; #else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -113,42 +113,41 @@ ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, } int -ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, - struct kvec *scratchiov) +ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) { - struct socket *sock = conn->ksnc_sock; - lnet_kiov_t *kiov = tx->tx_kiov; - int rc; - int nob; + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; - /* Not NOOP message */ - LASSERT(tx->tx_lnetmsg != NULL); + /* Not NOOP message */ + LASSERT (tx->tx_lnetmsg != NULL); - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. */ - if (tx->tx_msg.ksm_zc_cookies[0] != 0) { - /* Zero copy is enabled */ - struct sock *sk = sock->sk; - struct page *page = kiov->kiov_page; - int offset = kiov->kiov_offset; - int fragsize = kiov->kiov_len; - int msgflg = MSG_DONTWAIT; + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; - CDEBUG(D_NET, "page %p + offset %x for %d\n", - page, offset, kiov->kiov_len); + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); if (!list_empty(&conn->ksnc_tx_queue) || - fragsize < tx->tx_resid) - msgflg |= MSG_MORE; - - if (sk->sk_prot->sendpage != NULL) { - rc = sk->sk_prot->sendpage(sk, page, - offset, fragsize, msgflg); - } else { - rc = cfs_tcp_sendpage(sk, page, offset, fragsize, - msgflg); - } - } else { + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct kvec scratch; struct kvec *scratchiov = &scratch; @@ -157,6 +156,7 @@ ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." 
#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -181,7 +181,7 @@ ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, } void -ksocknal_lib_eager_ack(struct ksock_conn *conn) +ksocknal_lib_eager_ack (ksock_conn_t *conn) { struct socket *sock = conn->ksnc_sock; @@ -194,13 +194,14 @@ ksocknal_lib_eager_ack(struct ksock_conn *conn) } int -ksocknal_lib_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) +ksocknal_lib_recv_iov (ksock_conn_t *conn) { #if SOCKNAL_SINGLE_FRAG_RX struct kvec scratch; struct kvec *scratchiov = &scratch; unsigned int niov = 1; #else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct kvec *iov = conn->ksnc_rx_iov; @@ -298,8 +299,7 @@ ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, } int -ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, - struct kvec *scratchiov) +ksocknal_lib_recv_kiov (ksock_conn_t *conn) { #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK struct kvec scratch; @@ -310,6 +310,8 @@ ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; @@ -376,7 +378,7 @@ ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, } void -ksocknal_lib_csum_tx(struct ksock_tx *tx) +ksocknal_lib_csum_tx(ksock_tx_t *tx) { int i; __u32 csum; @@ -415,7 +417,7 @@ ksocknal_lib_csum_tx(struct ksock_tx *tx) } int -ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle) +ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) { struct socket *sock = conn->ksnc_sock; struct tcp_sock *tp = tcp_sk(sock->sk); @@ -546,12 +548,12 @@ ksocknal_lib_setup_sock (struct socket *sock) } void -ksocknal_lib_push_conn(struct ksock_conn *conn) +ksocknal_lib_push_conn (ksock_conn_t *conn) { - struct sock *sk; - struct tcp_sock *tp; - int nonagle; - int rc; + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int rc; rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ @@ -574,8 +576,8 @@ ksocknal_lib_push_conn(struct ksock_conn *conn) ksocknal_connsock_decref(conn); } -void ksocknal_read_callback(struct ksock_conn *conn); -void ksocknal_write_callback(struct ksock_conn *conn); +extern void ksocknal_read_callback (ksock_conn_t *conn); +extern void ksocknal_write_callback (ksock_conn_t *conn); /* * socket call back in Linux */ @@ -586,7 +588,7 @@ ksocknal_data_ready(struct sock *sk) ksocknal_data_ready(struct sock *sk, int n) #endif { - struct ksock_conn *conn; + ksock_conn_t *conn; ENTRY; /* interleave correctly with closing sockets... 
*/ @@ -612,7 +614,7 @@ ksocknal_data_ready(struct sock *sk, int n) static void ksocknal_write_space (struct sock *sk) { - struct ksock_conn *conn; + ksock_conn_t *conn; int wspace; int min_wpace; @@ -655,14 +657,14 @@ ksocknal_write_space (struct sock *sk) } void -ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) +ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) { conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; conn->ksnc_saved_write_space = sock->sk->sk_write_space; } void -ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) +ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) { sock->sk->sk_user_data = conn; sock->sk->sk_data_ready = ksocknal_data_ready; @@ -671,7 +673,7 @@ ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) } void -ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) { /* Remove conn's network callbacks. * NB I _have_ to restore the callback, rather than storing a noop, @@ -688,10 +690,10 @@ ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) } int -ksocknal_lib_memory_pressure(struct ksock_conn *conn) +ksocknal_lib_memory_pressure(ksock_conn_t *conn) { int rc = 0; - struct ksock_sched *sched; + ksock_sched_t *sched; sched = conn->ksnc_scheduler; spin_lock_bh(&sched->kss_lock); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c index df9d96e6e4cfc..6495703626094 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -37,7 +37,7 @@ static int peer_buffer_credits; module_param(peer_buffer_credits, int, 0444); MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); -static int peer_timeout = DEFAULT_PEER_TIMEOUT; +static int peer_timeout = 180; module_param(peer_timeout, int, 0444); MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); @@ -155,7 +155,7 @@ module_param(protocol, int, 0644); MODULE_PARM_DESC(protocol, "protocol version"); #endif -struct ksock_tunables ksocknal_tunables; +ksock_tunables_t ksocknal_tunables; int ksocknal_tunables_init(void) { diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c index 6dd648a2299cc..42dff10fdb563 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, Intel Corporation. * * Author: Zach Brown * Author: Peter J. 
Braam @@ -41,8 +41,8 @@ * pro_match_tx() : Called holding glock */ -static struct ksock_tx * -ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) +static ksock_tx_t * +ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) { /* V1.x, just enqueue it */ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); @@ -50,9 +50,9 @@ ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) } void -ksocknal_next_tx_carrier(struct ksock_conn *conn) +ksocknal_next_tx_carrier(ksock_conn_t *conn) { - struct ksock_tx *tx = conn->ksnc_tx_carrier; + ksock_tx_t *tx = conn->ksnc_tx_carrier; /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ LASSERT(!list_empty(&conn->ksnc_tx_queue)); @@ -64,17 +64,17 @@ ksocknal_next_tx_carrier(struct ksock_conn *conn) conn->ksnc_tx_carrier = NULL; } else { conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, - struct ksock_tx, tx_list); + ksock_tx_t, tx_list); LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); } } static int -ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, - struct ksock_tx *tx_ack, __u64 cookie) +ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) { - struct ksock_tx *tx = conn->ksnc_tx_carrier; + ksock_tx_t *tx = conn->ksnc_tx_carrier; LASSERT (tx_ack == NULL || tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); @@ -117,10 +117,10 @@ ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, return 1; } -static struct ksock_tx * -ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) +static ksock_tx_t * +ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) { - struct ksock_tx *tx = conn->ksnc_tx_carrier; + ksock_tx_t *tx = conn->ksnc_tx_carrier; /* * Enqueue tx_msg: @@ -154,10 +154,10 @@ ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) } static int -ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, - struct ksock_tx *tx_ack, __u64 cookie) +ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) { - struct ksock_tx *tx; + ksock_tx_t *tx; if (conn->ksnc_type != SOCKLND_CONN_ACK) return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); @@ -271,7 +271,7 @@ ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, } static int -ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) { int nob; @@ -315,7 +315,7 @@ ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) } static int -ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) +ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) { int nob; @@ -359,18 +359,18 @@ ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) /* (Sink) handle incoming ZC request from sender */ static int -ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) +ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) { - struct ksock_peer_ni *peer_ni = c->ksnc_peer; - struct ksock_conn *conn; - struct ksock_tx *tx; - int rc; + ksock_peer_ni_t *peer_ni = c->ksnc_peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; read_lock(&ksocknal_data.ksnd_global_lock); conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); if (conn != NULL) { - struct ksock_sched *sched = conn->ksnc_scheduler; + ksock_sched_t *sched = conn->ksnc_scheduler; LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); @@ -402,13 +402,13 @@ ksocknal_handle_zcreq(struct ksock_conn *c, __u64 
cookie, int remote) /* (Sender) handle ZC_ACK from sink */ static int -ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) +ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) { - struct ksock_peer_ni *peer_ni = conn->ksnc_peer; - struct ksock_tx *tx; - struct ksock_tx *tmp; - struct list_head zlist = LIST_HEAD_INIT(zlist); - int count; + ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + int count; if (cookie1 == 0) cookie1 = cookie2; @@ -440,7 +440,7 @@ ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) spin_unlock(&peer_ni->ksnp_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); list_del(&tx->tx_zc_list); ksocknal_tx_decref(tx); } @@ -449,7 +449,7 @@ ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) } static int -ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) +ksocknal_send_hello_v1 (ksock_conn_t *conn, struct ksock_hello_msg *hello) { struct socket *sock = conn->ksnc_sock; struct lnet_hdr *hdr; @@ -524,10 +524,10 @@ ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) } static int -ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) +ksocknal_send_hello_v2 (ksock_conn_t *conn, struct ksock_hello_msg *hello) { - struct socket *sock = conn->ksnc_sock; - int rc; + struct socket *sock = conn->ksnc_sock; + int rc; hello->kshm_magic = LNET_PROTO_MAGIC; hello->kshm_version = conn->ksnc_proto->pro_version; @@ -567,8 +567,7 @@ ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) } static int -ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, - int timeout) +ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int timeout) { struct socket *sock = conn->ksnc_sock; struct lnet_hdr *hdr; @@ -608,7 +607,7 @@ ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, hello->kshm_nips = le32_to_cpu (hdr->payload_length) / sizeof (__u32); - if (hello->kshm_nips > LNET_INTERFACES_NUM) { + if (hello->kshm_nips > LNET_NUM_INTERFACES) { CERROR("Bad nips %d from ip %pI4h\n", hello->kshm_nips, &conn->ksnc_ipaddr); rc = -EPROTO; @@ -644,7 +643,7 @@ ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, } static int -ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, +ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, int timeout) { struct socket *sock = conn->ksnc_sock; @@ -678,7 +677,7 @@ ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, __swab32s(&hello->kshm_nips); } - if (hello->kshm_nips > LNET_INTERFACES_NUM) { + if (hello->kshm_nips > LNET_NUM_INTERFACES) { CERROR("Bad nips %d from ip %pI4h\n", hello->kshm_nips, &conn->ksnc_ipaddr); return -EPROTO; @@ -711,7 +710,7 @@ ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, } static void -ksocknal_pack_msg_v1(struct ksock_tx *tx) +ksocknal_pack_msg_v1(ksock_tx_t *tx) { /* V1.x has no KSOCK_MSG_NOOP */ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); @@ -725,7 +724,7 @@ ksocknal_pack_msg_v1(struct ksock_tx *tx) } static void -ksocknal_pack_msg_v2(struct ksock_tx *tx) +ksocknal_pack_msg_v2(ksock_tx_t *tx) { tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; @@ -758,7 +757,7 @@ 
ksocknal_unpack_msg_v2(struct ksock_msg *msg) return; /* Do nothing */ } -struct ksock_proto ksocknal_protocol_v1x = +ksock_proto_t ksocknal_protocol_v1x = { .pro_version = KSOCK_PROTO_V1, .pro_send_hello = ksocknal_send_hello_v1, @@ -772,7 +771,7 @@ struct ksock_proto ksocknal_protocol_v1x = .pro_match_tx = ksocknal_match_tx }; -struct ksock_proto ksocknal_protocol_v2x = +ksock_proto_t ksocknal_protocol_v2x = { .pro_version = KSOCK_PROTO_V2, .pro_send_hello = ksocknal_send_hello_v2, @@ -786,7 +785,7 @@ struct ksock_proto ksocknal_protocol_v2x = .pro_match_tx = ksocknal_match_tx }; -struct ksock_proto ksocknal_protocol_v3x = +ksock_proto_t ksocknal_protocol_v3x = { .pro_version = KSOCK_PROTO_V3, .pro_send_hello = ksocknal_send_hello_v2, diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c index 5be1dd88a6b2f..8d3d6030d7d31 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,6 +32,7 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include #include #include @@ -480,15 +481,14 @@ lnet_acceptor_start(void) if (lnet_count_acceptor_nets() == 0) /* not required */ return 0; - if (current->nsproxy && current->nsproxy->net_ns) - lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; - else - lnet_acceptor_state.pta_ns = &init_net; + + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, "acceptor_%03ld", secure); if (IS_ERR(task)) { rc2 = PTR_ERR(task); CERROR("Can't start acceptor thread: %ld\n", rc2); + return -ESRCH; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c index 24e7d7aa59cd0..c70e26680b447 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c +++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,25 +31,14 @@ */ #define DEBUG_SUBSYSTEM S_LNET - -#include #include #include -#include -#include #include #define D_LNI D_CONSOLE -/* - * initialize ln_api_mutex statically, since it needs to be used in - * discovery_set callback. That module parameter callback can be called - * before module init completes. The mutex needs to be ready for use then. - */ -struct lnet the_lnet = { - .ln_api_mutex = __MUTEX_INITIALIZER(the_lnet.ln_api_mutex), -}; /* THE state of the network */ +struct lnet the_lnet; /* THE state of the network */ EXPORT_SYMBOL(the_lnet); static char *ip2nets = ""; @@ -71,157 +60,13 @@ MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); static int use_tcp_bonding = false; module_param(use_tcp_bonding, int, 0444); MODULE_PARM_DESC(use_tcp_bonding, - "use_tcp_bonding parameter has been deprecated"); + "Set to 1 to use socklnd bonding. 
0 to use Multi-Rail"); unsigned int lnet_numa_range = 0; module_param(lnet_numa_range, uint, 0444); MODULE_PARM_DESC(lnet_numa_range, "NUMA range to consider during Multi-Rail selection"); -/* - * lnet_health_sensitivity determines by how much we decrement the health - * value on sending error. The value defaults to 100, which means health - * interface health is decremented by 100 points every failure. - */ -unsigned int lnet_health_sensitivity = 100; -static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_health_sensitivity = { - .set = sensitivity_set, - .get = param_get_int, -}; -#define param_check_health_sensitivity(name, p) \ - __param_check(name, p, int) -module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR); -#else -module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int, - &lnet_health_sensitivity, S_IRUGO|S_IWUSR); -#endif -MODULE_PARM_DESC(lnet_health_sensitivity, - "Value to decrement the health value by on error"); - -/* - * lnet_recovery_interval determines how often we should perform recovery - * on unhealthy interfaces. - */ -unsigned int lnet_recovery_interval = 1; -static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp); -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_recovery_interval = { - .set = recovery_interval_set, - .get = param_get_int, -}; -#define param_check_recovery_interval(name, p) \ - __param_check(name, p, int) -module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR); -#else -module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int, - &lnet_recovery_interval, S_IRUGO|S_IWUSR); -#endif -MODULE_PARM_DESC(lnet_recovery_interval, - "Interval to recover unhealthy interfaces in seconds"); - -static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT; -static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp); - -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_interfaces_max = { - .set = intf_max_set, - .get = param_get_int, -}; - -#define param_check_interfaces_max(name, p) \ - __param_check(name, p, int) - -module_param(lnet_interfaces_max, interfaces_max, 0644); -#else -module_param_call(lnet_interfaces_max, intf_max_set, param_get_int, - &lnet_interfaces_max, 0644); -#endif -MODULE_PARM_DESC(lnet_interfaces_max, - "Maximum number of interfaces in a node."); - -unsigned lnet_peer_discovery_disabled = 0; -static int discovery_set(const char *val, cfs_kernel_param_arg_t *kp); - -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_discovery_disabled = { - .set = discovery_set, - .get = param_get_int, -}; - -#define param_check_discovery_disabled(name, p) \ - __param_check(name, p, int) -module_param(lnet_peer_discovery_disabled, discovery_disabled, 0644); -#else -module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int, - &lnet_peer_discovery_disabled, 0644); -#endif -MODULE_PARM_DESC(lnet_peer_discovery_disabled, - "Set to 1 to disable peer discovery on this node."); - -unsigned int lnet_drop_asym_route; -static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp); - -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_drop_asym_route = { - .set = drop_asym_route_set, - .get = param_get_int, -}; - -#define param_check_drop_asym_route(name, p) \ - __param_check(name, p, int) -module_param(lnet_drop_asym_route, drop_asym_route, 0644); -#else 
-module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, - &lnet_drop_asym_route, 0644); -#endif -MODULE_PARM_DESC(lnet_drop_asym_route, - "Set to 1 to drop asymmetrical route messages."); - -#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 -#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 50 - -unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; -static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_transaction_timeout = { - .set = transaction_to_set, - .get = param_get_int, -}; - -#define param_check_transaction_timeout(name, p) \ - __param_check(name, p, int) -module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR); -#else -module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, - &lnet_transaction_timeout, S_IRUGO|S_IWUSR); -#endif -MODULE_PARM_DESC(lnet_transaction_timeout, - "Maximum number of seconds to wait for a peer response."); - -#define LNET_RETRY_COUNT_HEALTH_DEFAULT 2 -unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; -static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); -#ifdef HAVE_KERNEL_PARAM_OPS -static struct kernel_param_ops param_ops_retry_count = { - .set = retry_count_set, - .get = param_get_int, -}; - -#define param_check_retry_count(name, p) \ - __param_check(name, p, int) -module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR); -#else -module_param_call(lnet_retry_count, retry_count_set, param_get_int, - &lnet_retry_count, S_IRUGO|S_IWUSR); -#endif -MODULE_PARM_DESC(lnet_retry_count, - "Maximum number of times to retry transmitting a message"); - - -unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT; - /* * This sequence number keeps track of how many times DLC was used to * update the local NIs. It is incremented when a NI is added or @@ -234,282 +79,6 @@ static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); static int lnet_ping(struct lnet_process_id id, signed long timeout, struct lnet_process_id __user *ids, int n_ids); -static int lnet_discover(struct lnet_process_id id, __u32 force, - struct lnet_process_id __user *ids, int n_ids); - -static int -sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned *sensitivity = (unsigned *)kp->arg; - unsigned long value; - - rc = kstrtoul(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n"); - return rc; - } - - /* - * The purpose of locking the api_mutex here is to ensure that - * the correct value ends up stored properly. - */ - mutex_lock(&the_lnet.ln_api_mutex); - - if (value > LNET_MAX_HEALTH_VALUE) { - mutex_unlock(&the_lnet.ln_api_mutex); - CERROR("Invalid health value. Maximum: %d value = %lu\n", - LNET_MAX_HEALTH_VALUE, value); - return -EINVAL; - } - - /* - * if we're turning on health then use the health timeout - * defaults. - */ - if (*sensitivity == 0 && value != 0) { - lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; - lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; - /* - * if we're turning off health then use the no health timeout - * default. 
- */ - } else if (*sensitivity != 0 && value == 0) { - lnet_transaction_timeout = - LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; - lnet_retry_count = 0; - } - - *sensitivity = value; - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; -} - -static int -recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned *interval = (unsigned *)kp->arg; - unsigned long value; - - rc = kstrtoul(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n"); - return rc; - } - - if (value < 1) { - CERROR("lnet_recovery_interval must be at least 1 second\n"); - return -EINVAL; - } - - /* - * The purpose of locking the api_mutex here is to ensure that - * the correct value ends up stored properly. - */ - mutex_lock(&the_lnet.ln_api_mutex); - - *interval = value; - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; -} - -static int -discovery_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned *discovery = (unsigned *)kp->arg; - unsigned long value; - struct lnet_ping_buffer *pbuf; - - rc = kstrtoul(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for 'lnet_peer_discovery_disabled'\n"); - return rc; - } - - value = (value) ? 1 : 0; - - /* - * The purpose of locking the api_mutex here is to ensure that - * the correct value ends up stored properly. - */ - mutex_lock(&the_lnet.ln_api_mutex); - - if (value == *discovery) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - - *discovery = value; - - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - - /* tell peers that discovery setting has changed */ - lnet_net_lock(LNET_LOCK_EX); - pbuf = the_lnet.ln_ping_target; - if (value) - pbuf->pb_info.pi_features &= ~LNET_PING_FEAT_DISCOVERY; - else - pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; - lnet_net_unlock(LNET_LOCK_EX); - - lnet_push_update_to_peers(1); - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; -} - -static int -drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned int *drop_asym_route = (unsigned int *)kp->arg; - unsigned long value; - - rc = kstrtoul(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for " - "'lnet_drop_asym_route'\n"); - return rc; - } - - /* - * The purpose of locking the api_mutex here is to ensure that - * the correct value ends up stored properly. - */ - mutex_lock(&the_lnet.ln_api_mutex); - - if (value == *drop_asym_route) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - - *drop_asym_route = value; - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; -} - -static int -transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned *transaction_to = (unsigned *)kp->arg; - unsigned long value; - - rc = kstrtoul(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n"); - return rc; - } - - /* - * The purpose of locking the api_mutex here is to ensure that - * the correct value ends up stored properly. - */ - mutex_lock(&the_lnet.ln_api_mutex); - - if (value < lnet_retry_count || value == 0) { - mutex_unlock(&the_lnet.ln_api_mutex); - CERROR("Invalid value for lnet_transaction_timeout (%lu). 
" - "Has to be greater than lnet_retry_count (%u)\n", - value, lnet_retry_count); - return -EINVAL; - } - - if (value == *transaction_to) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - - *transaction_to = value; - if (lnet_retry_count == 0) - lnet_lnd_timeout = value; - else - lnet_lnd_timeout = value / lnet_retry_count; - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; -} - -static int -retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int rc; - unsigned *retry_count = (unsigned *)kp->arg; - unsigned long value; - - rc = kstrtoul(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for 'lnet_retry_count'\n"); - return rc; - } - - /* - * The purpose of locking the api_mutex here is to ensure that - * the correct value ends up stored properly. - */ - mutex_lock(&the_lnet.ln_api_mutex); - - if (lnet_health_sensitivity == 0) { - mutex_unlock(&the_lnet.ln_api_mutex); - CERROR("Can not set retry_count when health feature is turned off\n"); - return -EINVAL; - } - - if (value > lnet_transaction_timeout) { - mutex_unlock(&the_lnet.ln_api_mutex); - CERROR("Invalid value for lnet_retry_count (%lu). " - "Has to be smaller than lnet_transaction_timeout (%u)\n", - value, lnet_transaction_timeout); - return -EINVAL; - } - - *retry_count = value; - - if (value == 0) - lnet_lnd_timeout = lnet_transaction_timeout; - else - lnet_lnd_timeout = lnet_transaction_timeout / value; - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; -} - -static int -intf_max_set(const char *val, cfs_kernel_param_arg_t *kp) -{ - int value, rc; - - rc = kstrtoint(val, 0, &value); - if (rc) { - CERROR("Invalid module parameter value for 'lnet_interfaces_max'\n"); - return rc; - } - - if (value < LNET_INTERFACES_MIN) { - CWARN("max interfaces provided are too small, setting to %d\n", - LNET_INTERFACES_MAX_DEFAULT); - value = LNET_INTERFACES_MAX_DEFAULT; - } - - *(int *)kp->arg = value; - - return 0; -} - static char * lnet_get_routes(void) { @@ -543,10 +112,10 @@ static void lnet_init_locks(void) { spin_lock_init(&the_lnet.ln_eq_wait_lock); - spin_lock_init(&the_lnet.ln_msg_resend_lock); init_waitqueue_head(&the_lnet.ln_eq_waitq); - init_waitqueue_head(&the_lnet.ln_mt_waitq); + init_waitqueue_head(&the_lnet.ln_rc_waitq); mutex_init(&the_lnet.ln_lnd_mutex); + mutex_init(&the_lnet.ln_api_mutex); } static void @@ -757,43 +326,6 @@ static void lnet_assert_wire_constants(void) CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.incarnation) == 8); CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.type) == 40); CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.type) == 4); - - /* Checks for struct lnet_ni_status and related constants */ - CLASSERT(LNET_NI_STATUS_INVALID == 0x00000000); - CLASSERT(LNET_NI_STATUS_UP == 0x15aac0de); - CLASSERT(LNET_NI_STATUS_DOWN == 0xdeadface); - - /* Checks for struct lnet_ni_status */ - CLASSERT((int)sizeof(struct lnet_ni_status) == 16); - CLASSERT((int)offsetof(struct lnet_ni_status, ns_nid) == 0); - CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) == 8); - CLASSERT((int)offsetof(struct lnet_ni_status, ns_status) == 8); - CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_status) == 4); - CLASSERT((int)offsetof(struct lnet_ni_status, ns_unused) == 12); - CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) == 4); - - /* Checks for struct lnet_ping_info and related constants */ - CLASSERT(LNET_PROTO_PING_MAGIC == 0x70696E67); - CLASSERT(LNET_PING_FEAT_INVAL == 0); - CLASSERT(LNET_PING_FEAT_BASE == 1); - 
CLASSERT(LNET_PING_FEAT_NI_STATUS == 2); - CLASSERT(LNET_PING_FEAT_RTE_DISABLED == 4); - CLASSERT(LNET_PING_FEAT_MULTI_RAIL == 8); - CLASSERT(LNET_PING_FEAT_DISCOVERY == 16); - CLASSERT(LNET_PING_FEAT_BITS == 31); - - /* Checks for struct lnet_ping_info */ - CLASSERT((int)sizeof(struct lnet_ping_info) == 16); - CLASSERT((int)offsetof(struct lnet_ping_info, pi_magic) == 0); - CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_magic) == 4); - CLASSERT((int)offsetof(struct lnet_ping_info, pi_features) == 4); - CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_features) == 4); - CLASSERT((int)offsetof(struct lnet_ping_info, pi_pid) == 8); - CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_pid) == 4); - CLASSERT((int)offsetof(struct lnet_ping_info, pi_nnis) == 12); - CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_nnis) == 4); - CLASSERT((int)offsetof(struct lnet_ping_info, pi_ni) == 16); - CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_ni) == 0); } static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) @@ -811,13 +343,6 @@ static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) return NULL; } -unsigned int -lnet_get_lnd_timeout(void) -{ - return lnet_lnd_timeout; -} -EXPORT_SYMBOL(lnet_get_lnd_timeout); - void lnet_register_lnd(struct lnet_lnd *lnd) { @@ -850,71 +375,29 @@ lnet_unregister_lnd(struct lnet_lnd *lnd) } EXPORT_SYMBOL(lnet_unregister_lnd); -void -lnet_counters_get_common(struct lnet_counters_common *common) -{ - struct lnet_counters *ctr; - int i; - - memset(common, 0, sizeof(*common)); - - lnet_net_lock(LNET_LOCK_EX); - - cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { - common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; - common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; - common->lcc_errors += ctr->lct_common.lcc_errors; - common->lcc_send_count += ctr->lct_common.lcc_send_count; - common->lcc_recv_count += ctr->lct_common.lcc_recv_count; - common->lcc_route_count += ctr->lct_common.lcc_route_count; - common->lcc_drop_count += ctr->lct_common.lcc_drop_count; - common->lcc_send_length += ctr->lct_common.lcc_send_length; - common->lcc_recv_length += ctr->lct_common.lcc_recv_length; - common->lcc_route_length += ctr->lct_common.lcc_route_length; - common->lcc_drop_length += ctr->lct_common.lcc_drop_length; - } - lnet_net_unlock(LNET_LOCK_EX); -} -EXPORT_SYMBOL(lnet_counters_get_common); - void lnet_counters_get(struct lnet_counters *counters) { struct lnet_counters *ctr; - struct lnet_counters_health *health = &counters->lct_health; int i; memset(counters, 0, sizeof(*counters)); - lnet_counters_get_common(&counters->lct_common); - lnet_net_lock(LNET_LOCK_EX); cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { - health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; - health->lch_resend_count += ctr->lct_health.lch_resend_count; - health->lch_response_timeout_count += - ctr->lct_health.lch_response_timeout_count; - health->lch_local_interrupt_count += - ctr->lct_health.lch_local_interrupt_count; - health->lch_local_dropped_count += - ctr->lct_health.lch_local_dropped_count; - health->lch_local_aborted_count += - ctr->lct_health.lch_local_aborted_count; - health->lch_local_no_route_count += - ctr->lct_health.lch_local_no_route_count; - health->lch_local_timeout_count += - ctr->lct_health.lch_local_timeout_count; - health->lch_local_error_count += - ctr->lct_health.lch_local_error_count; - health->lch_remote_dropped_count += - ctr->lct_health.lch_remote_dropped_count; - health->lch_remote_error_count += - 
ctr->lct_health.lch_remote_error_count; - health->lch_remote_timeout_count += - ctr->lct_health.lch_remote_timeout_count; - health->lch_network_timeout_count += - ctr->lct_health.lch_network_timeout_count; + counters->msgs_max += ctr->msgs_max; + counters->msgs_alloc += ctr->msgs_alloc; + counters->errors += ctr->errors; + counters->send_count += ctr->send_count; + counters->recv_count += ctr->recv_count; + counters->route_count += ctr->route_count; + counters->drop_count += ctr->drop_count; + counters->send_length += ctr->send_length; + counters->recv_length += ctr->recv_length; + counters->route_length += ctr->route_length; + counters->drop_length += ctr->drop_length; + } lnet_net_unlock(LNET_LOCK_EX); } @@ -1099,26 +582,6 @@ lnet_res_lh_initialize(struct lnet_res_container *rec, list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); } -struct list_head ** -lnet_create_array_of_queues(void) -{ - struct list_head **qs; - struct list_head *q; - int i; - - qs = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct list_head)); - if (!qs) { - CERROR("Failed to allocate queues\n"); - return NULL; - } - - cfs_percpt_for_each(q, i, qs) - INIT_LIST_HEAD(q); - - return qs; -} - static int lnet_unprepare(void); static int @@ -1141,18 +604,12 @@ lnet_prepare(lnet_pid_t requested_pid) the_lnet.ln_pid = requested_pid; INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_peers); INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); INIT_LIST_HEAD(&the_lnet.ln_nets); INIT_LIST_HEAD(&the_lnet.ln_routers); INIT_LIST_HEAD(&the_lnet.ln_drop_rules); INIT_LIST_HEAD(&the_lnet.ln_delay_rules); - INIT_LIST_HEAD(&the_lnet.ln_dc_request); - INIT_LIST_HEAD(&the_lnet.ln_dc_working); - INIT_LIST_HEAD(&the_lnet.ln_dc_expired); - INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq); - INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); - init_waitqueue_head(&the_lnet.ln_dc_waitq); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); rc = lnet_descriptor_setup(); if (rc != 0) @@ -1211,12 +668,6 @@ lnet_prepare(lnet_pid_t requested_pid) goto failed; } - the_lnet.ln_mt_zombie_rstqs = lnet_create_array_of_queues(); - if (!the_lnet.ln_mt_zombie_rstqs) { - rc = -ENOMEM; - goto failed; - } - return 0; failed: @@ -1227,8 +678,6 @@ lnet_prepare(lnet_pid_t requested_pid) static int lnet_unprepare (void) { - int rc; - /* NB no LNET_LOCK since this is the last reference. 
All LND instances * have shut down already, so it is safe to unlink and free all * descriptors, even those that appear committed to a network op (eg MD @@ -1240,17 +689,6 @@ lnet_unprepare (void) LASSERT(list_empty(&the_lnet.ln_test_peers)); LASSERT(list_empty(&the_lnet.ln_nets)); - if (the_lnet.ln_mt_zombie_rstqs) { - lnet_clean_zombie_rstqs(); - the_lnet.ln_mt_zombie_rstqs = NULL; - } - - if (!LNetEQHandleIsInvalid(the_lnet.ln_mt_eqh)) { - rc = LNetEQFree(the_lnet.ln_mt_eqh); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); - LASSERT(rc == 0); - } - lnet_portals_destroy(); if (the_lnet.ln_md_containers != NULL) { @@ -1416,6 +854,16 @@ lnet_islocalnet(__u32 net_id) return local; } +bool +lnet_is_ni_healthy_locked(struct lnet_ni *ni) +{ + if (ni->ni_state == LNET_NI_STATE_ACTIVE || + ni->ni_state == LNET_NI_STATE_DEGRADED) + return true; + + return false; +} + struct lnet_ni * lnet_nid2ni_locked(lnet_nid_t nid, int cpt) { @@ -1483,45 +931,25 @@ lnet_count_acceptor_nets(void) return count; } -struct lnet_ping_buffer * -lnet_ping_buffer_alloc(int nnis, gfp_t gfp) -{ - struct lnet_ping_buffer *pbuf; - - LIBCFS_ALLOC_GFP(pbuf, LNET_PING_BUFFER_SIZE(nnis), gfp); - if (pbuf) { - pbuf->pb_nnis = nnis; - atomic_set(&pbuf->pb_refcnt, 1); - } - - return pbuf; -} - -void -lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf) +static struct lnet_ping_info * +lnet_ping_info_create(int num_ni) { - LASSERT(lnet_ping_buffer_numref(pbuf) == 0); - LIBCFS_FREE(pbuf, LNET_PING_BUFFER_SIZE(pbuf->pb_nnis)); -} + struct lnet_ping_info *ping_info; + unsigned int infosz; -static struct lnet_ping_buffer * -lnet_ping_target_create(int nnis) -{ - struct lnet_ping_buffer *pbuf; - - pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); - if (pbuf == NULL) { - CERROR("Can't allocate ping source [%d]\n", nnis); + infosz = offsetof(struct lnet_ping_info, pi_ni[num_ni]); + LIBCFS_ALLOC(ping_info, infosz); + if (ping_info == NULL) { + CERROR("Can't allocate ping info[%d]\n", num_ni); return NULL; } - pbuf->pb_info.pi_nnis = nnis; - pbuf->pb_info.pi_pid = the_lnet.ln_pid; - pbuf->pb_info.pi_magic = LNET_PROTO_PING_MAGIC; - pbuf->pb_info.pi_features = - LNET_PING_FEAT_NI_STATUS | LNET_PING_FEAT_MULTI_RAIL; + ping_info->pi_nnis = num_ni; + ping_info->pi_pid = the_lnet.ln_pid; + ping_info->pi_magic = LNET_PROTO_PING_MAGIC; + ping_info->pi_features = LNET_PING_FEAT_NI_STATUS; - return pbuf; + return ping_info; } static inline int @@ -1567,25 +995,16 @@ lnet_get_ni_count(void) return count; } -int -lnet_ping_info_validate(struct lnet_ping_info *pinfo) +static inline void +lnet_ping_info_free(struct lnet_ping_info *pinfo) { - if (!pinfo) - return -EINVAL; - if (pinfo->pi_magic != LNET_PROTO_PING_MAGIC) - return -EPROTO; - if (!(pinfo->pi_features & LNET_PING_FEAT_NI_STATUS)) - return -EPROTO; - /* Loopback is guaranteed to be present */ - if (pinfo->pi_nnis < 1 || pinfo->pi_nnis > lnet_interfaces_max) - return -ERANGE; - if (LNET_PING_INFO_LONI(pinfo) != LNET_NID_LO_0) - return -EPROTO; - return 0; + LIBCFS_FREE(pinfo, + offsetof(struct lnet_ping_info, + pi_ni[pinfo->pi_nnis])); } static void -lnet_ping_target_destroy(void) +lnet_ping_info_destroy(void) { struct lnet_net *net; struct lnet_ni *ni; @@ -1600,25 +1019,25 @@ lnet_ping_target_destroy(void) } } - lnet_ping_buffer_decref(the_lnet.ln_ping_target); - the_lnet.ln_ping_target = NULL; + lnet_ping_info_free(the_lnet.ln_ping_info); + the_lnet.ln_ping_info = NULL; lnet_net_unlock(LNET_LOCK_EX); } static void -lnet_ping_target_event_handler(struct lnet_event *event) 
+lnet_ping_event_handler(struct lnet_event *event) { - struct lnet_ping_buffer *pbuf = event->md.user_ptr; + struct lnet_ping_info *pinfo = event->md.user_ptr; if (event->unlinked) - lnet_ping_buffer_decref(pbuf); + pinfo->pi_features = LNET_PING_FEAT_INVAL; } static int -lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, - struct lnet_handle_md *ping_mdh, - int ni_count, bool set_eq) +lnet_ping_info_setup(struct lnet_ping_info **ppinfo, + struct lnet_handle_md *md_handle, + int ni_count, bool set_eq) { struct lnet_process_id id = { .nid = LNET_NID_ANY, @@ -1629,76 +1048,72 @@ lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, int rc, rc2; if (set_eq) { - rc = LNetEQAlloc(0, lnet_ping_target_event_handler, + rc = LNetEQAlloc(0, lnet_ping_event_handler, &the_lnet.ln_ping_target_eq); if (rc != 0) { - CERROR("Can't allocate ping buffer EQ: %d\n", rc); + CERROR("Can't allocate ping EQ: %d\n", rc); return rc; } } - *ppbuf = lnet_ping_target_create(ni_count); - if (*ppbuf == NULL) { + *ppinfo = lnet_ping_info_create(ni_count); + if (*ppinfo == NULL) { rc = -ENOMEM; - goto fail_free_eq; + goto failed_0; } - /* Ping target ME/MD */ rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, LNET_PROTO_PING_MATCHBITS, 0, LNET_UNLINK, LNET_INS_AFTER, &me_handle); if (rc != 0) { - CERROR("Can't create ping target ME: %d\n", rc); - goto fail_decref_ping_buffer; + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; } /* initialize md content */ - md.start = &(*ppbuf)->pb_info; - md.length = LNET_PING_INFO_SIZE((*ppbuf)->pb_nnis); + md.start = *ppinfo; + md.length = offsetof(struct lnet_ping_info, + pi_ni[(*ppinfo)->pi_nnis]); md.threshold = LNET_MD_THRESH_INF; md.max_size = 0; md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | LNET_MD_MANAGE_REMOTE; + md.user_ptr = NULL; md.eq_handle = the_lnet.ln_ping_target_eq; - md.user_ptr = *ppbuf; + md.user_ptr = *ppinfo; - rc = LNetMDAttach(me_handle, md, LNET_RETAIN, ping_mdh); + rc = LNetMDAttach(me_handle, md, LNET_RETAIN, md_handle); if (rc != 0) { - CERROR("Can't attach ping target MD: %d\n", rc); - goto fail_unlink_ping_me; + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; } - lnet_ping_buffer_addref(*ppbuf); return 0; -fail_unlink_ping_me: +failed_2: rc2 = LNetMEUnlink(me_handle); LASSERT(rc2 == 0); -fail_decref_ping_buffer: - LASSERT(lnet_ping_buffer_numref(*ppbuf) == 1); - lnet_ping_buffer_decref(*ppbuf); - *ppbuf = NULL; -fail_free_eq: - if (set_eq) { - rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); - LASSERT(rc2 == 0); - } +failed_1: + lnet_ping_info_free(*ppinfo); + *ppinfo = NULL; +failed_0: + if (set_eq) + LNetEQFree(the_lnet.ln_ping_target_eq); return rc; } static void -lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, - struct lnet_handle_md *ping_mdh) +lnet_ping_md_unlink(struct lnet_ping_info *pinfo, struct lnet_handle_md *md_handle) { sigset_t blocked = cfs_block_allsigs(); - LNetMDUnlink(*ping_mdh); - LNetInvalidateMDHandle(ping_mdh); + LNetMDUnlink(*md_handle); + LNetInvalidateMDHandle(md_handle); - /* NB the MD could be busy; this just starts the unlink */ - while (lnet_ping_buffer_numref(pbuf) > 1) { - CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); + /* NB md could be busy; this just starts the unlink */ + while (pinfo->pi_features != LNET_PING_FEAT_INVAL) { + CDEBUG(D_NET, "Still waiting for ping MD to unlink\n"); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); } @@ -1707,241 +1122,77 @@ lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, } static void 
-lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) +lnet_ping_info_install_locked(struct lnet_ping_info *ping_info) { + int i; struct lnet_ni *ni; struct lnet_net *net; struct lnet_ni_status *ns; - int i; - int rc; i = 0; list_for_each_entry(net, &the_lnet.ln_nets, net_list) { list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { - LASSERT(i < pbuf->pb_nnis); + LASSERT(i < ping_info->pi_nnis); - ns = &pbuf->pb_info.pi_ni[i]; + ns = &ping_info->pi_ni[i]; ns->ns_nid = ni->ni_nid; lnet_ni_lock(ni); ns->ns_status = (ni->ni_status != NULL) ? - ni->ni_status->ns_status : + ni->ni_status->ns_status : LNET_NI_STATUS_UP; ni->ni_status = ns; lnet_ni_unlock(ni); i++; } + } - /* - * We (ab)use the ns_status of the loopback interface to - * transmit the sequence number. The first interface listed - * must be the loopback interface. - */ - rc = lnet_ping_info_validate(&pbuf->pb_info); - if (rc) { - LCONSOLE_EMERG("Invalid ping target: %d\n", rc); - LBUG(); - } - LNET_PING_BUFFER_SEQNO(pbuf) = - atomic_inc_return(&the_lnet.ln_ping_target_seqno); } static void -lnet_ping_target_update(struct lnet_ping_buffer *pbuf, - struct lnet_handle_md ping_mdh) +lnet_ping_target_update(struct lnet_ping_info *pinfo, + struct lnet_handle_md md_handle) { - struct lnet_ping_buffer *old_pbuf = NULL; - struct lnet_handle_md old_ping_md; + struct lnet_ping_info *old_pinfo = NULL; + struct lnet_handle_md old_md; /* switch the NIs to point to the new ping info created */ lnet_net_lock(LNET_LOCK_EX); if (!the_lnet.ln_routing) - pbuf->pb_info.pi_features |= LNET_PING_FEAT_RTE_DISABLED; - if (!lnet_peer_discovery_disabled) - pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; - - /* Ensure only known feature bits have been set. */ - LASSERT(pbuf->pb_info.pi_features & LNET_PING_FEAT_BITS); - LASSERT(!(pbuf->pb_info.pi_features & ~LNET_PING_FEAT_BITS)); - - lnet_ping_target_install_locked(pbuf); - - if (the_lnet.ln_ping_target) { - old_pbuf = the_lnet.ln_ping_target; - old_ping_md = the_lnet.ln_ping_target_md; - } - the_lnet.ln_ping_target_md = ping_mdh; - the_lnet.ln_ping_target = pbuf; - - lnet_net_unlock(LNET_LOCK_EX); - - if (old_pbuf) { - /* unlink and free the old ping info */ - lnet_ping_md_unlink(old_pbuf, &old_ping_md); - lnet_ping_buffer_decref(old_pbuf); - } - - lnet_push_update_to_peers(0); -} - -static void -lnet_ping_target_fini(void) -{ - int rc; - - lnet_ping_md_unlink(the_lnet.ln_ping_target, - &the_lnet.ln_ping_target_md); - - rc = LNetEQFree(the_lnet.ln_ping_target_eq); - LASSERT(rc == 0); - - lnet_ping_target_destroy(); -} - -/* Resize the push target. 
*/ -int lnet_push_target_resize(void) -{ - struct lnet_process_id id = { LNET_NID_ANY, LNET_PID_ANY }; - struct lnet_md md = { NULL }; - struct lnet_handle_me meh; - struct lnet_handle_md mdh; - struct lnet_handle_md old_mdh; - struct lnet_ping_buffer *pbuf; - struct lnet_ping_buffer *old_pbuf; - int nnis = the_lnet.ln_push_target_nnis; - int rc; - - if (nnis <= 0) { - rc = -EINVAL; - goto fail_return; - } -again: - pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); - if (!pbuf) { - rc = -ENOMEM; - goto fail_return; - } - - rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, - LNET_PROTO_PING_MATCHBITS, 0, - LNET_UNLINK, LNET_INS_AFTER, - &meh); - if (rc) { - CERROR("Can't create push target ME: %d\n", rc); - goto fail_decref_pbuf; - } - - /* initialize md content */ - md.start = &pbuf->pb_info; - md.length = LNET_PING_INFO_SIZE(nnis); - md.threshold = LNET_MD_THRESH_INF; - md.max_size = 0; - md.options = LNET_MD_OP_PUT | LNET_MD_TRUNCATE | - LNET_MD_MANAGE_REMOTE; - md.user_ptr = pbuf; - md.eq_handle = the_lnet.ln_push_target_eq; + pinfo->pi_features |= LNET_PING_FEAT_RTE_DISABLED; + lnet_ping_info_install_locked(pinfo); - rc = LNetMDAttach(meh, md, LNET_RETAIN, &mdh); - if (rc) { - CERROR("Can't attach push MD: %d\n", rc); - goto fail_unlink_meh; + if (the_lnet.ln_ping_info != NULL) { + old_pinfo = the_lnet.ln_ping_info; + old_md = the_lnet.ln_ping_target_md; } - lnet_ping_buffer_addref(pbuf); + the_lnet.ln_ping_target_md = md_handle; + the_lnet.ln_ping_info = pinfo; - lnet_net_lock(LNET_LOCK_EX); - old_pbuf = the_lnet.ln_push_target; - old_mdh = the_lnet.ln_push_target_md; - the_lnet.ln_push_target = pbuf; - the_lnet.ln_push_target_md = mdh; lnet_net_unlock(LNET_LOCK_EX); - if (old_pbuf) { - LNetMDUnlink(old_mdh); - lnet_ping_buffer_decref(old_pbuf); - } - - if (nnis < the_lnet.ln_push_target_nnis) - goto again; - - CDEBUG(D_NET, "nnis %d success\n", nnis); - - return 0; - -fail_unlink_meh: - LNetMEUnlink(meh); -fail_decref_pbuf: - lnet_ping_buffer_decref(pbuf); -fail_return: - CDEBUG(D_NET, "nnis %d error %d\n", nnis, rc); - return rc; -} - -static void lnet_push_target_event_handler(struct lnet_event *ev) -{ - struct lnet_ping_buffer *pbuf = ev->md.user_ptr; - - if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) - lnet_swap_pinginfo(pbuf); - - lnet_peer_push_event(ev); - if (ev->unlinked) - lnet_ping_buffer_decref(pbuf); -} - -/* Initialize the push target. */ -static int lnet_push_target_init(void) -{ - int rc; - - if (the_lnet.ln_push_target) - return -EALREADY; - - rc = LNetEQAlloc(0, lnet_push_target_event_handler, - &the_lnet.ln_push_target_eq); - if (rc) { - CERROR("Can't allocated push target EQ: %d\n", rc); - return rc; - } - - /* Start at the required minimum, we'll enlarge if required. */ - the_lnet.ln_push_target_nnis = LNET_INTERFACES_MIN; - - rc = lnet_push_target_resize(); - - if (rc) { - LNetEQFree(the_lnet.ln_push_target_eq); - LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); + if (old_pinfo != NULL) { + /* unlink the old ping info */ + lnet_ping_md_unlink(old_pinfo, &old_md); + lnet_ping_info_free(old_pinfo); } - - return rc; } -/* Clean up the push target. */ -static void lnet_push_target_fini(void) +static void +lnet_ping_target_fini(void) { - if (!the_lnet.ln_push_target) - return; - - /* Unlink and invalidate to prevent new references. */ - LNetMDUnlink(the_lnet.ln_push_target_md); - LNetInvalidateMDHandle(&the_lnet.ln_push_target_md); + int rc; - /* Wait for the unlink to complete. 
*/ - while (lnet_ping_buffer_numref(the_lnet.ln_push_target) > 1) { - CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); - } + lnet_ping_md_unlink(the_lnet.ln_ping_info, + &the_lnet.ln_ping_target_md); - lnet_ping_buffer_decref(the_lnet.ln_push_target); - the_lnet.ln_push_target = NULL; - the_lnet.ln_push_target_nnis = 0; + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); - LNetEQFree(the_lnet.ln_push_target_eq); - LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); + lnet_ping_info_destroy(); } static int @@ -1964,6 +1215,11 @@ lnet_ni_tq_credits(struct lnet_ni *ni) static void lnet_ni_unlink_locked(struct lnet_ni *ni) { + if (!list_empty(&ni->ni_cptlist)) { + list_del_init(&ni->ni_cptlist); + lnet_ni_decref_locked(ni, 0); + } + /* move it to zombie list and nobody can find it anymore */ LASSERT(!list_empty(&ni->ni_netlist)); list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); @@ -2002,13 +1258,7 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) } if (!list_empty(&ni->ni_netlist)) { - /* Unlock mutex while waiting to allow other - * threads to read the LNet state and fall through - * to avoid deadlock - */ lnet_net_unlock(LNET_LOCK_EX); - mutex_unlock(&the_lnet.ln_api_mutex); - ++i; if ((i & (-i)) == i) { CDEBUG(D_WARNING, @@ -2017,8 +1267,6 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) } set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); - - mutex_lock(&the_lnet.ln_api_mutex); lnet_net_lock(LNET_LOCK_EX); continue; } @@ -2048,9 +1296,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni) struct lnet_net *net = ni->ni_net; lnet_net_lock(LNET_LOCK_EX); - lnet_ni_lock(ni); ni->ni_state = LNET_NI_STATE_DELETING; - lnet_ni_unlock(ni); lnet_ni_unlink_locked(ni); lnet_incr_dlc_seq(); lnet_net_unlock(LNET_LOCK_EX); @@ -2104,10 +1350,6 @@ static void lnet_shutdown_lndnets(void) { struct lnet_net *net; - struct list_head resend; - struct lnet_msg *msg, *tmp; - - INIT_LIST_HEAD(&resend); /* NB called holding the global mutex */ @@ -2143,16 +1385,6 @@ lnet_shutdown_lndnets(void) lnet_shutdown_lndnet(net); } - spin_lock(&the_lnet.ln_msg_resend_lock); - list_splice(&the_lnet.ln_msg_resend, &resend); - spin_unlock(&the_lnet.ln_msg_resend_lock); - - list_for_each_entry_safe(msg, tmp, &resend, msg_list) { - list_del_init(&msg->msg_list); - msg->msg_no_resend = true; - lnet_finalize(msg, -ECANCELED); - } - lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_state = LNET_STATE_SHUTDOWN; lnet_net_unlock(LNET_LOCK_EX); @@ -2186,9 +1418,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) goto failed0; } - lnet_ni_lock(ni); ni->ni_state = LNET_NI_STATE_ACTIVE; - lnet_ni_unlock(ni); /* We keep a reference on the loopback net through the loopback NI */ if (net->net_lnd->lnd_type == LOLND) { @@ -2223,7 +1453,6 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) atomic_set(&ni->ni_tx_credits, lnet_ni_tq_credits(ni) * ni->ni_ncpts); - atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", libcfs_nid2str(ni->ni_nid), @@ -2267,6 +1496,8 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) { lnd_type = LNET_NETTYP(net->net_id); + LASSERT(libcfs_isknown_lnd(lnd_type)); + mutex_lock(&the_lnet.ln_lnd_mutex); lnd = lnet_find_lnd_by_type(lnd_type); @@ -2345,7 +1576,7 @@ lnet_startup_lndnet(struct lnet_net *net, struct 
lnet_lnd_tunables *tun) * up is actually unique. if it's not fail. */ if (!lnet_ni_unique_net(&net_l->net_ni_list, ni->ni_interfaces[0])) { - rc = -EEXIST; + rc = -EINVAL; goto failed1; } @@ -2470,6 +1701,8 @@ int lnet_lib_init(void) lnet_assert_wire_constants(); + memset(&the_lnet, 0, sizeof(the_lnet)); + /* refer to global cfs_cpt_table for now */ the_lnet.ln_cpt_table = cfs_cpt_table; the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); @@ -2497,7 +1730,6 @@ int lnet_lib_init(void) INIT_LIST_HEAD(&the_lnet.ln_lnds); INIT_LIST_HEAD(&the_lnet.ln_net_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); - INIT_LIST_HEAD(&the_lnet.ln_msg_resend); INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); /* The hash table size is the number of bits it takes to express the set @@ -2554,8 +1786,8 @@ LNetNIInit(lnet_pid_t requested_pid) int im_a_router = 0; int rc; int ni_count; - struct lnet_ping_buffer *pbuf; - struct lnet_handle_md ping_mdh; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; struct list_head net_head; struct lnet_net *net; @@ -2590,9 +1822,6 @@ LNetNIInit(lnet_pid_t requested_pid) goto err_empty_list; } - if (use_tcp_bonding) - CWARN("'use_tcp_bonding' option has been deprecated. See LU-13641\n"); - /* If LNet is being initialized via DLC it is possible * that the user requests not to load module parameters (ones which * are supported by DLC) on initialization. Therefore, make sure not @@ -2633,41 +1862,23 @@ LNetNIInit(lnet_pid_t requested_pid) the_lnet.ln_refcount = 1; /* Now I may use my own API functions... */ - rc = lnet_ping_target_setup(&pbuf, &ping_mdh, ni_count, true); + rc = lnet_ping_info_setup(&pinfo, &md_handle, ni_count, true); if (rc != 0) goto err_acceptor_stop; - lnet_ping_target_update(pbuf, ping_mdh); - - rc = LNetEQAlloc(0, lnet_mt_event_handler, &the_lnet.ln_mt_eqh); - if (rc != 0) { - CERROR("Can't allocate monitor thread EQ: %d\n", rc); - goto err_stop_ping; - } + lnet_ping_target_update(pinfo, md_handle); - rc = lnet_monitor_thr_start(); + rc = lnet_router_checker_start(); if (rc != 0) goto err_stop_ping; - rc = lnet_push_target_init(); - if (rc != 0) - goto err_stop_monitor_thr; - - rc = lnet_peer_discovery_start(); - if (rc != 0) - goto err_destroy_push_target; - lnet_fault_init(); - lnet_router_debugfs_init(); + lnet_proc_init(); mutex_unlock(&the_lnet.ln_api_mutex); return 0; -err_destroy_push_target: - lnet_push_target_fini(); -err_stop_monitor_thr: - lnet_monitor_thr_stop(); err_stop_ping: lnet_ping_target_fini(); err_acceptor_stop: @@ -2716,10 +1927,8 @@ LNetNIFini() lnet_fault_fini(); - lnet_router_debugfs_fini(); - lnet_peer_discovery_stop(); - lnet_push_target_fini(); - lnet_monitor_thr_stop(); + lnet_proc_fini(); + lnet_router_checker_stop(); lnet_ping_target_fini(); /* Teardown fns that use my own API functions BEFORE here */ @@ -2767,22 +1976,15 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, } cfg_ni->lic_nid = ni->ni_nid; - if (ni->ni_nid == LNET_NID_LO_0) - cfg_ni->lic_status = LNET_NI_STATUS_UP; - else - cfg_ni->lic_status = ni->ni_status->ns_status; + cfg_ni->lic_status = ni->ni_status->ns_status; cfg_ni->lic_tcp_bonding = use_tcp_bonding; cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); if (stats) { - stats->iel_send_count = lnet_sum_stats(&ni->ni_stats, - LNET_STATS_TYPE_SEND); - stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats, - LNET_STATS_TYPE_RECV); - stats->iel_drop_count = lnet_sum_stats(&ni->ni_stats, - LNET_STATS_TYPE_DROP); + 
stats->iel_send_count = atomic_read(&ni->ni_stats.send_count); + stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count); } /* @@ -2859,10 +2061,7 @@ lnet_fill_ni_info_legacy(struct lnet_ni *ni, config->cfg_config_u.cfg_net.net_peer_rtr_credits = ni->ni_net->net_tunables.lct_peer_rtr_credits; - if (ni->ni_nid == LNET_NID_LO_0) - net_config->ni_status = LNET_NI_STATUS_UP; - else - net_config->ni_status = ni->ni_status->ns_status; + net_config->ni_status = ni->ni_status->ns_status; if (ni->ni_cpts) { int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); @@ -2920,17 +2119,10 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) struct lnet_ni *ni; struct lnet_net *net = mynet; - /* - * It is possible that the net has been cleaned out while there is - * a message being sent. This function accessed the net without - * checking if the list is empty - */ if (prev == NULL) { if (net == NULL) net = list_entry(the_lnet.ln_nets.next, struct lnet_net, net_list); - if (list_empty(&net->net_ni_list)) - return NULL; ni = list_entry(net->net_ni_list.next, struct lnet_ni, ni_netlist); @@ -2952,8 +2144,6 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) /* get the next net */ net = list_entry(prev->ni_net->net_list.next, struct lnet_net, net_list); - if (list_empty(&net->net_ni_list)) - return NULL; /* get the ni on it */ ni = list_entry(net->net_ni_list.next, struct lnet_ni, ni_netlist); @@ -2961,9 +2151,6 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) return ni; } - if (list_empty(&prev->ni_netlist)) - return NULL; - /* there are more nis left */ ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist); @@ -3021,35 +2208,12 @@ lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni, return rc; } -int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats) -{ - struct lnet_ni *ni; - int cpt; - int rc = -ENOENT; - - if (!msg_stats) - return -EINVAL; - - cpt = lnet_net_lock_current(); - - ni = lnet_get_ni_idx_locked(msg_stats->im_idx); - - if (ni) { - lnet_usr_translate_stats(msg_stats, &ni->ni_stats); - rc = 0; - } - - lnet_net_unlock(cpt); - - return rc; -} - static int lnet_add_net_common(struct lnet_net *net, struct lnet_ioctl_config_lnd_tunables *tun) { __u32 net_id; - struct lnet_ping_buffer *pbuf; - struct lnet_handle_md ping_mdh; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; int rc; struct lnet_remotenet *rnet; int net_ni_count; @@ -3071,7 +2235,7 @@ static int lnet_add_net_common(struct lnet_net *net, /* * make sure you calculate the correct number of slots in the ping - * buffer. Since the ping info is a flattened list of all the NIs, + * info. Since the ping info is a flattened list of all the NIs, * we should allocate enough slots to accomodate the number of NIs * which will be added. 
* @@ -3080,9 +2244,9 @@ static int lnet_add_net_common(struct lnet_net *net, */ net_ni_count = lnet_get_net_ni_count_pre(net); - rc = lnet_ping_target_setup(&pbuf, &ping_mdh, - net_ni_count + lnet_get_ni_count(), - false); + rc = lnet_ping_info_setup(&pinfo, &md_handle, + net_ni_count + lnet_get_ni_count(), + false); if (rc < 0) { lnet_net_free(net); return rc; @@ -3133,13 +2297,13 @@ static int lnet_add_net_common(struct lnet_net *net, lnet_peer_net_added(net); lnet_net_unlock(LNET_LOCK_EX); - lnet_ping_target_update(pbuf, ping_mdh); + lnet_ping_target_update(pinfo, md_handle); return 0; failed: - lnet_ping_md_unlink(pbuf, &ping_mdh); - lnet_ping_buffer_decref(pbuf); + lnet_ping_md_unlink(pinfo, &md_handle); + lnet_ping_info_free(pinfo); return rc; } @@ -3187,7 +2351,7 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) struct lnet_ni *ni; struct lnet_ioctl_config_lnd_tunables *tun = NULL; int rc, i; - __u32 net_id, lnd_type; + __u32 net_id; /* get the tunables if they are available */ if (conf->lic_cfg_hdr.ioc_len >= @@ -3201,12 +2365,6 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) tun); net_id = LNET_NIDNET(conf->lic_nid); - lnd_type = LNET_NETTYP(net_id); - - if (!libcfs_isknown_lnd(lnd_type)) { - CERROR("No valid net and lnd information provided\n"); - return -EINVAL; - } net = lnet_net_alloc(net_id, NULL); if (!net) @@ -3236,8 +2394,8 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) struct lnet_net *net; struct lnet_ni *ni; __u32 net_id = LNET_NIDNET(conf->lic_nid); - struct lnet_ping_buffer *pbuf; - struct lnet_handle_md ping_mdh; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; int rc; int net_count; __u32 addr; @@ -3255,7 +2413,7 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) CERROR("net %s not found\n", libcfs_net2str(net_id)); rc = -ENOENT; - goto unlock_net; + goto net_unlock; } addr = LNET_NIDADDR(conf->lic_nid); @@ -3266,28 +2424,28 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + rc = lnet_ping_info_setup(&pinfo, &md_handle, lnet_get_ni_count() - net_count, false); if (rc != 0) - goto unlock_api_mutex; + goto out; lnet_shutdown_lndnet(net); if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pbuf, ping_mdh); + lnet_ping_target_update(pinfo, md_handle); - goto unlock_api_mutex; + goto out; } ni = lnet_nid2ni_locked(conf->lic_nid, 0); if (!ni) { - CERROR("nid %s not found\n", + CERROR("nid %s not found \n", libcfs_nid2str(conf->lic_nid)); rc = -ENOENT; - goto unlock_net; + goto net_unlock; } net_count = lnet_get_net_ni_count_locked(net); @@ -3295,27 +2453,27 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + rc = lnet_ping_info_setup(&pinfo, &md_handle, lnet_get_ni_count() - 1, false); if (rc != 0) - goto unlock_api_mutex; + goto out; lnet_shutdown_lndni(ni); if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pbuf, ping_mdh); + lnet_ping_target_update(pinfo, md_handle); /* check if the net is empty and remove it if it is */ if (net_count == 1) lnet_shutdown_lndnet(net); - goto unlock_api_mutex; + goto out; -unlock_net: +net_unlock: lnet_net_unlock(0); -unlock_api_mutex: +out: mutex_unlock(&the_lnet.ln_api_mutex); return rc; @@ -3383,8 +2541,8 @@ int lnet_dyn_del_net(__u32 
net_id) { struct lnet_net *net; - struct lnet_ping_buffer *pbuf; - struct lnet_handle_md ping_mdh; + struct lnet_ping_info *pinfo; + struct lnet_handle_md md_handle; int rc; int net_ni_count; @@ -3398,7 +2556,6 @@ lnet_dyn_del_net(__u32 net_id) net = lnet_get_net_locked(net_id); if (net == NULL) { - lnet_net_unlock(0); rc = -EINVAL; goto out; } @@ -3408,8 +2565,8 @@ lnet_dyn_del_net(__u32 net_id) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_target_setup(&pbuf, &ping_mdh, - lnet_get_ni_count() - net_ni_count, false); + rc = lnet_ping_info_setup(&pinfo, &md_handle, + lnet_get_ni_count() - net_ni_count, false); if (rc != 0) goto out; @@ -3418,7 +2575,7 @@ lnet_dyn_del_net(__u32 net_id) if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pbuf, ping_mdh); + lnet_ping_target_update(pinfo, md_handle); out: mutex_unlock(&the_lnet.ln_api_mutex); @@ -3436,102 +2593,6 @@ __u32 lnet_get_dlc_seq_locked(void) return atomic_read(&lnet_dlc_seq_no); } -static void -lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all) -{ - struct lnet_net *net; - struct lnet_ni *ni; - - lnet_net_lock(LNET_LOCK_EX); - list_for_each_entry(net, &the_lnet.ln_nets, net_list) { - list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { - if (ni->ni_nid == nid || all) { - atomic_set(&ni->ni_healthv, value); - if (list_empty(&ni->ni_recovery) && - value < LNET_MAX_HEALTH_VALUE) { - CERROR("manually adding local NI %s to recovery\n", - libcfs_nid2str(ni->ni_nid)); - list_add_tail(&ni->ni_recovery, - &the_lnet.ln_mt_localNIRecovq); - lnet_ni_addref_locked(ni, 0); - } - if (!all) { - lnet_net_unlock(LNET_LOCK_EX); - return; - } - } - } - } - lnet_net_unlock(LNET_LOCK_EX); -} - -static int -lnet_get_local_ni_hstats(struct lnet_ioctl_local_ni_hstats *stats) -{ - int cpt, rc = 0; - struct lnet_ni *ni; - lnet_nid_t nid = stats->hlni_nid; - - cpt = lnet_net_lock_current(); - ni = lnet_nid2ni_locked(nid, cpt); - - if (!ni) { - rc = -ENOENT; - goto unlock; - } - - stats->hlni_local_interrupt = atomic_read(&ni->ni_hstats.hlt_local_interrupt); - stats->hlni_local_dropped = atomic_read(&ni->ni_hstats.hlt_local_dropped); - stats->hlni_local_aborted = atomic_read(&ni->ni_hstats.hlt_local_aborted); - stats->hlni_local_no_route = atomic_read(&ni->ni_hstats.hlt_local_no_route); - stats->hlni_local_timeout = atomic_read(&ni->ni_hstats.hlt_local_timeout); - stats->hlni_local_error = atomic_read(&ni->ni_hstats.hlt_local_error); - stats->hlni_health_value = atomic_read(&ni->ni_healthv); - -unlock: - lnet_net_unlock(cpt); - - return rc; -} - -static int -lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list) -{ - struct lnet_ni *ni; - int i = 0; - - lnet_net_lock(LNET_LOCK_EX); - list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) { - list->rlst_nid_array[i] = ni->ni_nid; - i++; - if (i >= LNET_MAX_SHOW_NUM_NID) - break; - } - lnet_net_unlock(LNET_LOCK_EX); - list->rlst_num_nids = i; - - return 0; -} - -static int -lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list) -{ - struct lnet_peer_ni *lpni; - int i = 0; - - lnet_net_lock(LNET_LOCK_EX); - list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) { - list->rlst_nid_array[i] = lpni->lpni_nid; - i++; - if (i >= LNET_MAX_SHOW_NUM_NID) - break; - } - lnet_net_unlock(LNET_LOCK_EX); - list->rlst_num_nids = i; - - return 0; -} - /** * LNet ioctl handler. 
* @@ -3613,10 +2674,9 @@ LNetCtl(unsigned int cmd, void *arg) __u32 tun_size; cfg_ni = arg; - /* get the tunables if they are available */ if (cfg_ni->lic_cfg_hdr.ioc_len < - sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun)) + sizeof(*cfg_ni) + sizeof(*stats)+ sizeof(*tun)) return -EINVAL; stats = (struct lnet_ioctl_element_stats *) @@ -3633,19 +2693,6 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } - case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: { - struct lnet_ioctl_element_msg_stats *msg_stats = arg; - - if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_get_ni_stats(msg_stats); - mutex_unlock(&the_lnet.ln_api_mutex); - - return rc; - } - case IOC_LIBCFS_GET_NET: { size_t total = sizeof(*config) + sizeof(struct lnet_ioctl_net_config); @@ -3706,22 +2753,22 @@ LNetCtl(unsigned int cmd, void *arg) return rc; case IOC_LIBCFS_SET_NUMA_RANGE: { - struct lnet_ioctl_set_value *numa; + struct lnet_ioctl_numa_range *numa; numa = arg; - if (numa->sv_hdr.ioc_len != sizeof(*numa)) + if (numa->nr_hdr.ioc_len != sizeof(*numa)) return -EINVAL; - lnet_net_lock(LNET_LOCK_EX); - lnet_numa_range = numa->sv_value; - lnet_net_unlock(LNET_LOCK_EX); + mutex_lock(&the_lnet.ln_api_mutex); + lnet_numa_range = numa->nr_range; + mutex_unlock(&the_lnet.ln_api_mutex); return 0; } case IOC_LIBCFS_GET_NUMA_RANGE: { - struct lnet_ioctl_set_value *numa; + struct lnet_ioctl_numa_range *numa; numa = arg; - if (numa->sv_hdr.ioc_len != sizeof(*numa)) + if (numa->nr_hdr.ioc_len != sizeof(*numa)) return -EINVAL; - numa->sv_value = lnet_numa_range; + numa->nr_range = lnet_numa_range; return 0; } @@ -3742,33 +2789,6 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } - case IOC_LIBCFS_GET_LOCAL_HSTATS: { - struct lnet_ioctl_local_ni_hstats *stats = arg; - - if (stats->hlni_hdr.ioc_len < sizeof(*stats)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_get_local_ni_hstats(stats); - mutex_unlock(&the_lnet.ln_api_mutex); - - return rc; - } - - case IOC_LIBCFS_GET_RECOVERY_QUEUE: { - struct lnet_ioctl_recovery_list *list = arg; - if (list->rlst_hdr.ioc_len < sizeof(*list)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) - rc = lnet_get_local_ni_recovery_list(list); - else - rc = lnet_get_peer_ni_recovery_list(list); - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - } - case IOC_LIBCFS_ADD_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; @@ -3776,9 +2796,9 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, - cfg->prcfg_cfg_nid, - cfg->prcfg_mr); + rc = lnet_add_peer_ni_to_peer(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } @@ -3790,8 +2810,8 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, - cfg->prcfg_cfg_nid); + rc = lnet_del_peer_ni_from_peer(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } @@ -3820,65 +2840,30 @@ LNetCtl(unsigned int cmd, void *arg) case IOC_LIBCFS_GET_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; + struct lnet_peer_ni_credit_info __user *lpni_cri; + struct lnet_ioctl_element_stats __user *lpni_stats; + size_t usr_size = sizeof(*lpni_cri) + sizeof(*lpni_stats); - if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + if ((cfg->prcfg_hdr.ioc_len != sizeof(*cfg)) || + (cfg->prcfg_size 
!= usr_size)) return -EINVAL; - mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_get_peer_info(cfg, - (void __user *)cfg->prcfg_bulk); - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - } - - case IOC_LIBCFS_GET_PEER_LIST: { - struct lnet_ioctl_peer_cfg *cfg = arg; - - if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) - return -EINVAL; + lpni_cri = cfg->prcfg_bulk; + lpni_stats = cfg->prcfg_bulk + sizeof(*lpni_cri); mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_get_peer_list(&cfg->prcfg_count, &cfg->prcfg_size, - (struct lnet_process_id __user *)cfg->prcfg_bulk); + rc = lnet_get_peer_info(cfg->prcfg_count, &cfg->prcfg_prim_nid, + &cfg->prcfg_cfg_nid, &cfg->prcfg_mr, + lpni_cri, lpni_stats); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } - case IOC_LIBCFS_SET_HEALHV: { - struct lnet_ioctl_reset_health_cfg *cfg = arg; - int value; - if (cfg->rh_hdr.ioc_len < sizeof(*cfg)) - return -EINVAL; - if (cfg->rh_value < 0 || - cfg->rh_value > LNET_MAX_HEALTH_VALUE) - value = LNET_MAX_HEALTH_VALUE; - else - value = cfg->rh_value; - CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n", - value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ? - "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all); - mutex_lock(&the_lnet.ln_api_mutex); - if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) - lnet_ni_set_healthv(cfg->rh_nid, value, - cfg->rh_all); - else - lnet_peer_ni_set_healthv(cfg->rh_nid, value, - cfg->rh_all); - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - - case IOC_LIBCFS_NOTIFY_ROUTER: { - time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0]; - - /* The deadline passed in by the user should be some time in - * seconds in the future since the UNIX epoch. We have to map - * that deadline to the wall clock. - */ - deadline += ktime_get_seconds(); + case IOC_LIBCFS_NOTIFY_ROUTER: return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, - deadline); - } + cfs_time_current() - + cfs_time_seconds(cfs_time_current_sec() - + (time_t)data->ioc_u64[0])); case IOC_LIBCFS_LNET_DIST: rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); @@ -3903,77 +2888,24 @@ LNetCtl(unsigned int cmd, void *arg) id.nid = data->ioc_nid; id.pid = data->ioc_u32[0]; - /* If timeout is negative then set default of 3 minutes */ - if (((s32)data->ioc_u32[1] <= 0) || - data->ioc_u32[1] > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) - timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); + /* Don't block longer than 2 minutes */ + if (data->ioc_u32[1] > 120 * MSEC_PER_SEC) + return -EINVAL; + + /* If timestamp is negative then disable timeout */ + if ((s32)data->ioc_u32[1] < 0) + timeout = MAX_SCHEDULE_TIMEOUT; else timeout = msecs_to_jiffies(data->ioc_u32[1]); rc = lnet_ping(id, timeout, data->ioc_pbuf1, data->ioc_plen1 / sizeof(struct lnet_process_id)); - if (rc < 0) return rc; - data->ioc_count = rc; return 0; } - case IOC_LIBCFS_PING_PEER: { - struct lnet_ioctl_ping_data *ping = arg; - struct lnet_peer *lp; - signed long timeout; - - /* If timeout is negative then set default of 3 minutes */ - if (((s32)ping->op_param) <= 0 || - ping->op_param > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) - timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); - else - timeout = msecs_to_jiffies(ping->op_param); - - rc = lnet_ping(ping->ping_id, timeout, - ping->ping_buf, - ping->ping_count); - if (rc < 0) - return rc; - - mutex_lock(&the_lnet.ln_api_mutex); - lp = lnet_find_peer(ping->ping_id.nid); - if (lp) { - ping->ping_id.nid = lp->lp_primary_nid; - ping->mr_info = 
lnet_peer_is_multi_rail(lp); - lnet_peer_decref_locked(lp); - } - mutex_unlock(&the_lnet.ln_api_mutex); - - ping->ping_count = rc; - return 0; - } - - case IOC_LIBCFS_DISCOVER: { - struct lnet_ioctl_ping_data *discover = arg; - struct lnet_peer *lp; - - rc = lnet_discover(discover->ping_id, discover->op_param, - discover->ping_buf, - discover->ping_count); - if (rc < 0) - return rc; - - mutex_lock(&the_lnet.ln_api_mutex); - lp = lnet_find_peer(discover->ping_id.nid); - if (lp) { - discover->ping_id.nid = lp->lp_primary_nid; - discover->mr_info = lnet_peer_is_multi_rail(lp); - lnet_peer_decref_locked(lp); - } - mutex_unlock(&the_lnet.ln_api_mutex); - - discover->ping_count = rc; - return 0; - } - default: ni = lnet_net2ni_addref(data->ioc_net); if (ni == NULL) @@ -4073,47 +3005,43 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, struct lnet_handle_md mdh; struct lnet_event event; struct lnet_md md = { NULL }; - int which; - int unlinked = 0; - int replied = 0; + int which; + int unlinked = 0; + int replied = 0; const signed long a_long_time = msecs_to_jiffies(60 * MSEC_PER_SEC); - struct lnet_ping_buffer *pbuf; + int infosz; + struct lnet_ping_info *info; struct lnet_process_id tmpid; - int i; - int nob; - int rc; - int rc2; - sigset_t blocked; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + infosz = offsetof(struct lnet_ping_info, pi_ni[n_ids]); /* n_ids limit is arbitrary */ - if (n_ids <= 0 || id.nid == LNET_NID_ANY) + if (n_ids <= 0 || n_ids > 20 || id.nid == LNET_NID_ANY) return -EINVAL; - /* - * if the user buffer has more space than the lnet_interfaces_max - * then only fill it up to lnet_interfaces_max - */ - if (n_ids > lnet_interfaces_max) - n_ids = lnet_interfaces_max; - if (id.pid == LNET_PID_ANY) id.pid = LNET_PID_LUSTRE; - pbuf = lnet_ping_buffer_alloc(n_ids, GFP_NOFS); - if (!pbuf) + LIBCFS_ALLOC(info, infosz); + if (info == NULL) return -ENOMEM; /* NB 2 events max (including any unlink event) */ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); if (rc != 0) { CERROR("Can't allocate EQ: %d\n", rc); - goto fail_ping_buffer_decref; + goto out_0; } /* initialize md content */ - md.start = &pbuf->pb_info; - md.length = LNET_PING_INFO_SIZE(n_ids); - md.threshold = 2; /* GET/REPLY */ + md.start = info; + md.length = infosz; + md.threshold = 2; /*GET/REPLY*/ md.max_size = 0; md.options = LNET_MD_TRUNCATE; md.user_ptr = NULL; @@ -4122,15 +3050,16 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, rc = LNetMDBind(md, LNET_UNLINK, &mdh); if (rc != 0) { CERROR("Can't bind MD: %d\n", rc); - goto fail_free_eq; + goto out_1; } rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0, false); + LNET_PROTO_PING_MATCHBITS, 0); if (rc != 0) { /* Don't CERROR; this could be deliberate! */ + rc2 = LNetMDUnlink(mdh); LASSERT(rc2 == 0); @@ -4178,6 +3107,7 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, replied = 1; rc = event.mlength; } + } while (rc2 <= 0 || !event.unlinked); if (!replied) { @@ -4185,170 +3115,68 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, CWARN("%s: Unexpected rc >= 0 but no reply!\n", libcfs_id2str(id)); rc = -EIO; - goto fail_free_eq; + goto out_1; } nob = rc; - LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids)); + LASSERT(nob >= 0 && nob <= infosz); - rc = -EPROTO; /* if I can't parse... */ + rc = -EPROTO; /* if I can't parse... 
*/ if (nob < 8) { + /* can't check magic/version */ CERROR("%s: ping info too short %d\n", libcfs_id2str(id), nob); - goto fail_free_eq; + goto out_1; } - if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { - lnet_swap_pinginfo(pbuf); - } else if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { CERROR("%s: Unexpected magic %08x\n", - libcfs_id2str(id), pbuf->pb_info.pi_magic); - goto fail_free_eq; + libcfs_id2str(id), info->pi_magic); + goto out_1; } - if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { CERROR("%s: ping w/o NI status: 0x%x\n", - libcfs_id2str(id), pbuf->pb_info.pi_features); - goto fail_free_eq; + libcfs_id2str(id), info->pi_features); + goto out_1; } - if (nob < LNET_PING_INFO_SIZE(0)) { - CERROR("%s: Short reply %d(%d min)\n", - libcfs_id2str(id), - nob, (int)LNET_PING_INFO_SIZE(0)); - goto fail_free_eq; + if (nob < offsetof(struct lnet_ping_info, pi_ni[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(struct lnet_ping_info, pi_ni[0])); + goto out_1; } - if (pbuf->pb_info.pi_nnis < n_ids) - n_ids = pbuf->pb_info.pi_nnis; + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; - if (nob < LNET_PING_INFO_SIZE(n_ids)) { - CERROR("%s: Short reply %d(%d expected)\n", - libcfs_id2str(id), - nob, (int)LNET_PING_INFO_SIZE(n_ids)); - goto fail_free_eq; + if (nob < offsetof(struct lnet_ping_info, pi_ni[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(struct lnet_ping_info, pi_ni[n_ids])); + goto out_1; } - rc = -EFAULT; /* if I segv in copy_to_user()... */ + rc = -EFAULT; /* If I SEGV... */ memset(&tmpid, 0, sizeof(tmpid)); for (i = 0; i < n_ids; i++) { - tmpid.pid = pbuf->pb_info.pi_pid; - tmpid.nid = pbuf->pb_info.pi_ni[i].ns_nid; + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_ni[i].ns_nid; if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) - goto fail_free_eq; + goto out_1; } - rc = pbuf->pb_info.pi_nnis; + rc = info->pi_nnis; - fail_free_eq: + out_1: rc2 = LNetEQFree(eqh); if (rc2 != 0) CERROR("rc2 %d\n", rc2); LASSERT(rc2 == 0); - fail_ping_buffer_decref: - lnet_ping_buffer_decref(pbuf); - return rc; -} - -static int -lnet_discover(struct lnet_process_id id, __u32 force, - struct lnet_process_id __user *ids, int n_ids) -{ - struct lnet_peer_ni *lpni; - struct lnet_peer_ni *p; - struct lnet_peer *lp; - struct lnet_process_id *buf; - int cpt; - int i; - int rc; - int max_intf = lnet_interfaces_max; - size_t buf_size; - - if (n_ids <= 0 || - id.nid == LNET_NID_ANY) - return -EINVAL; - - if (id.pid == LNET_PID_ANY) - id.pid = LNET_PID_LUSTRE; - - /* - * if the user buffer has more space than the max_intf - * then only fill it up to max_intf - */ - if (n_ids > max_intf) - n_ids = max_intf; - - buf_size = n_ids * sizeof(*buf); - - LIBCFS_ALLOC(buf, buf_size); - if (!buf) - return -ENOMEM; - - cpt = lnet_net_lock_current(); - lpni = lnet_nid2peerni_locked(id.nid, LNET_NID_ANY, cpt); - if (IS_ERR(lpni)) { - rc = PTR_ERR(lpni); - goto out; - } - - /* - * Clearing the NIDS_UPTODATE flag ensures the peer will - * be discovered, provided discovery has not been disabled. - */ - lp = lpni->lpni_peer_net->lpn_peer; - spin_lock(&lp->lp_lock); - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - /* If the force flag is set, force a PING and PUSH as well. 
*/ - if (force) - lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; - spin_unlock(&lp->lp_lock); - rc = lnet_discover_peer_locked(lpni, cpt, true); - if (rc) - goto out_decref; - - /* Peer may have changed. */ - lp = lpni->lpni_peer_net->lpn_peer; - if (lp->lp_nnis < n_ids) - n_ids = lp->lp_nnis; - - i = 0; - p = NULL; - while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { - buf[i].pid = id.pid; - buf[i].nid = p->lpni_nid; - if (++i >= n_ids) - break; - } - - lnet_net_unlock(cpt); - - rc = -EFAULT; - if (copy_to_user(ids, buf, n_ids * sizeof(*buf))) - goto out_relock; - rc = n_ids; -out_relock: - lnet_net_lock(cpt); -out_decref: - lnet_peer_ni_decref_locked(lpni); -out: - lnet_net_unlock(cpt); - - LIBCFS_FREE(buf, buf_size); - + out_0: + LIBCFS_FREE(info, infosz); return rc; } - -/** - * Retrieve peer discovery status. - * - * \retval 1 if lnet_peer_discovery_disabled is 0 - * \retval 0 if lnet_peer_discovery_disabled is 1 - */ -int -LNetGetPeerDiscoveryStatus(void) -{ - return !lnet_peer_discovery_disabled; -} -EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c index 741711af0813f..2f90e90849ac3 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/config.c +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,8 +32,6 @@ #define DEBUG_SUBSYSTEM S_LNET -#include -#include #include #include #include @@ -125,10 +123,10 @@ lnet_ni_unique_net(struct list_head *nilist, char *iface) /* check that the NI is unique to the interfaces with in the same NI. * This is only a consideration if use_tcp_bonding is set */ static bool -lnet_ni_unique_ni(char *iface_list[LNET_INTERFACES_NUM], char *iface) +lnet_ni_unique_ni(char *iface_list[LNET_NUM_INTERFACES], char *iface) { int i; - for (i = 0; i < LNET_INTERFACES_NUM; i++) { + for (i = 0; i < LNET_NUM_INTERFACES; i++) { if (iface_list[i] != NULL && strncmp(iface_list[i], iface, strlen(iface)) == 0) return false; @@ -311,7 +309,7 @@ lnet_ni_free(struct lnet_ni *ni) if (ni->ni_cpts != NULL) cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); - for (i = 0; i < LNET_INTERFACES_NUM && + for (i = 0; i < LNET_NUM_INTERFACES && ni->ni_interfaces[i] != NULL; i++) { LIBCFS_FREE(ni->ni_interfaces[i], strlen(ni->ni_interfaces[i]) + 1); @@ -411,11 +409,11 @@ lnet_ni_add_interface(struct lnet_ni *ni, char *iface) * can free the tokens at the end of the function. 
* The newly allocated ni_interfaces[] can be * freed when freeing the NI */ - while (niface < LNET_INTERFACES_NUM && + while (niface < LNET_NUM_INTERFACES && ni->ni_interfaces[niface] != NULL) niface++; - if (niface >= LNET_INTERFACES_NUM) { + if (niface >= LNET_NUM_INTERFACES) { LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " "for net %s\n", libcfs_net2str(LNET_NIDNET(ni->ni_nid))); @@ -458,9 +456,8 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface) } spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_cptlist); INIT_LIST_HEAD(&ni->ni_netlist); - INIT_LIST_HEAD(&ni->ni_recovery); - LNetInvalidateMDHandle(&ni->ni_ping_mdh); ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*ni->ni_refs[0])); if (ni->ni_refs == NULL) @@ -479,12 +476,12 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface) ni->ni_nid = LNET_MKNID(net->net_id, 0); /* Store net namespace in which current ni is being created */ - if (current->nsproxy && current->nsproxy->net_ns) + if (current->nsproxy->net_ns != NULL) ni->ni_net_ns = get_net(current->nsproxy->net_ns); else - ni->ni_net_ns = get_net(&init_net); + ni->ni_net_ns = NULL; - ni->ni_last_alive = ktime_get_real_seconds(); + ni->ni_last_alive = cfs_time_current_sec(); ni->ni_state = LNET_NI_STATE_INIT; list_add_tail(&ni->ni_netlist, &net->net_ni_added); @@ -1124,26 +1121,26 @@ lnet_parse_priority(char *str, unsigned int *priority, char **token) } static int -lnet_parse_route(char *str, int *im_a_router) +lnet_parse_route (char *str, int *im_a_router) { /* static scratch buffer OK (single threaded) */ - static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; - struct list_head nets; - struct list_head gateways; + struct list_head nets; + struct list_head gateways; struct list_head *tmp1; struct list_head *tmp2; - __u32 net; - lnet_nid_t nid; - struct lnet_text_buf *ltb; - int rc; - char *sep; - char *token = str; - int ntokens = 0; - int myrc = -1; - __u32 hops; - int got_hops = 0; - unsigned int priority = 0; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + __u32 hops; + int got_hops = 0; + unsigned int priority = 0; INIT_LIST_HEAD(&gateways); INIT_LIST_HEAD(&nets); @@ -1217,7 +1214,8 @@ lnet_parse_route(char *str, int *im_a_router) goto token_error; nid = libcfs_str2nid(ltb->ltb_text); - if (nid == LNET_NID_ANY || nid == LNET_NID_LO_0) + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) goto token_error; } } @@ -1605,12 +1603,11 @@ lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) } /* * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 - * kernel 4.18.0-193.el8: * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu * and removed for_ifa and endfor_ifa. * Use the _rntl variant as the current locking is rtnl. 
*/ -#ifdef HAVE_IN_DEV_FOR_EACH_IFA_RTNL +#ifdef in_dev_for_each_ifa_rtnl #define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa #define endfor_ifa(in_dev) #else @@ -1656,7 +1653,7 @@ int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) if (nip >= nalloc) { struct lnet_inetdev *tmp; - nalloc += LNET_INTERFACES_NUM; + nalloc += LNET_NUM_INTERFACES; tmp = krealloc(ifaces, nalloc * sizeof(*tmp), GFP_KERNEL); if (!tmp) { @@ -1700,10 +1697,7 @@ lnet_parse_ip2nets (char **networksp, char *ip2nets) int rc; int i; - if (current->nsproxy && current->nsproxy->net_ns) - nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); - else - nip = lnet_inet_enumerate(&ifaces, &init_net); + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); if (nip < 0) { if (nip != -ENOENT) { LCONSOLE_ERROR_MSG(0x117, diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c index 354c9768a3a1d..3bca6b77539a6 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c @@ -159,6 +159,8 @@ LNetEQFree(struct lnet_handle_eq eqh) int size = 0; int i; + LASSERT(the_lnet.ln_refcount > 0); + lnet_res_lock(LNET_LOCK_EX); /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do * both EQ lookup and poll event with only lnet_eq_wait_lock */ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c index 9bf890c9477b6..a3d0487063cbd 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c index b48d4af51b739..b60106f949b69 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,8 +36,6 @@ #define DEBUG_SUBSYSTEM S_LNET -#include - #include #include #include @@ -46,119 +44,6 @@ static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); -struct lnet_send_data { - struct lnet_ni *sd_best_ni; - struct lnet_peer_ni *sd_best_lpni; - struct lnet_peer_ni *sd_final_dst_lpni; - struct lnet_peer *sd_peer; - struct lnet_peer *sd_gw_peer; - struct lnet_peer_ni *sd_gw_lpni; - struct lnet_peer_net *sd_peer_net; - struct lnet_msg *sd_msg; - lnet_nid_t sd_dst_nid; - lnet_nid_t sd_src_nid; - lnet_nid_t sd_rtr_nid; - int sd_cpt; - int sd_md_cpt; - __u32 sd_send_case; -}; - -static inline struct lnet_comm_count * -get_stats_counts(struct lnet_element_stats *stats, - enum lnet_stats_type stats_type) -{ - switch (stats_type) { - case LNET_STATS_TYPE_SEND: - return &stats->el_send_stats; - case LNET_STATS_TYPE_RECV: - return &stats->el_recv_stats; - case LNET_STATS_TYPE_DROP: - return &stats->el_drop_stats; - default: - CERROR("Unknown stats type\n"); - } - - return NULL; -} - -void lnet_incr_stats(struct lnet_element_stats *stats, - enum lnet_msg_type msg_type, - enum lnet_stats_type stats_type) -{ - struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); - if (!counts) - return; - - switch (msg_type) { - case LNET_MSG_ACK: - atomic_inc(&counts->co_ack_count); - break; - case LNET_MSG_PUT: - atomic_inc(&counts->co_put_count); - break; - case LNET_MSG_GET: - atomic_inc(&counts->co_get_count); - break; - case LNET_MSG_REPLY: - atomic_inc(&counts->co_reply_count); - break; - case LNET_MSG_HELLO: - atomic_inc(&counts->co_hello_count); - break; - default: - CERROR("There is a BUG in the code. Unknown message type\n"); - break; - } -} - -__u32 lnet_sum_stats(struct lnet_element_stats *stats, - enum lnet_stats_type stats_type) -{ - struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); - if (!counts) - return 0; - - return (atomic_read(&counts->co_ack_count) + - atomic_read(&counts->co_put_count) + - atomic_read(&counts->co_get_count) + - atomic_read(&counts->co_reply_count) + - atomic_read(&counts->co_hello_count)); -} - -static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, - struct lnet_comm_count *counts) -{ - msg_stats->ico_get_count = atomic_read(&counts->co_get_count); - msg_stats->ico_put_count = atomic_read(&counts->co_put_count); - msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); - msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); - msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); -} - -void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, - struct lnet_element_stats *stats) -{ - struct lnet_comm_count *counts; - - LASSERT(msg_stats); - LASSERT(stats); - - counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); - if (!counts) - return; - assign_stats(&msg_stats->im_send_stats, counts); - - counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); - if (!counts) - return; - assign_stats(&msg_stats->im_recv_stats, counts); - - counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); - if (!counts) - return; - assign_stats(&msg_stats->im_drop_stats, counts); -} - int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) { @@ -745,29 +630,25 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); msg->msg_hdr.type = cpu_to_le32(type); - /* dest_nid will be 
overwritten by lnet_select_pathway() */ - msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); /* src_nid will be set later */ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); msg->msg_hdr.payload_length = cpu_to_le32(len); } -void +static void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; - int rc; + void *priv = msg->msg_private; + int rc; - LASSERT(!in_interrupt()); - LASSERT(ni->ni_nid == LNET_NID_LO_0 || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT (!in_interrupt ()); + LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); - if (rc < 0) { - msg->msg_no_resend = true; + if (rc < 0) lnet_finalize(msg, rc); - } } static int @@ -805,7 +686,7 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) static void lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) { - time64_t last_alive = 0; + cfs_time_t last_alive = 0; int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); LASSERT(lnet_peer_aliveness_enabled(lp)); @@ -815,7 +696,7 @@ lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive); lnet_net_lock(cpt); - lp->lpni_last_query = ktime_get_seconds(); + lp->lpni_last_query = cfs_time_current(); if (last_alive != 0) /* NI has updated timestamp */ lp->lpni_last_alive = last_alive; @@ -823,10 +704,10 @@ lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) /* NB: always called with lnet_net_lock held */ static inline int -lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now) +lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) { - int alive; - time64_t deadline; + int alive; + cfs_time_t deadline; LASSERT (lnet_peer_aliveness_enabled(lp)); @@ -836,14 +717,16 @@ lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now) */ spin_lock(&lp->lpni_lock); if (!lp->lpni_alive && lp->lpni_alive_count > 0 && - lp->lpni_timestamp >= lp->lpni_last_alive) { + cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) { spin_unlock(&lp->lpni_lock); return 0; } - deadline = lp->lpni_last_alive + - lp->lpni_net->net_tunables.lct_peer_timeout; - alive = deadline > now; + deadline = + cfs_time_add(lp->lpni_last_alive, + cfs_time_seconds(lp->lpni_net->net_tunables. + lct_peer_timeout)); + alive = cfs_time_after(deadline, now); /* * Update obsolete lp_alive except for routers assumed to be dead @@ -865,10 +748,9 @@ lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now) /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the lnet_net_lock */ static int -lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp, - struct lnet_msg *msg) +lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) { - time64_t now = ktime_get_seconds(); + cfs_time_t now = cfs_time_current(); if (!lnet_peer_aliveness_enabled(lp)) return -ENODEV; @@ -876,29 +758,23 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp, if (lnet_peer_is_alive(lp, now)) return 1; - /* - * If we're resending a message, let's attempt to send it even if - * the peer is down to fulfill our resend quota on the message - */ - if (msg->msg_retry_count > 0) - return 1; - /* * Peer appears dead, but we should avoid frequent NI queries (at * most once per lnet_queryinterval seconds). 
*/ if (lp->lpni_last_query != 0) { static const int lnet_queryinterval = 1; - time64_t next_query; - next_query = lp->lpni_last_query + lnet_queryinterval; + cfs_time_t next_query = + cfs_time_add(lp->lpni_last_query, + cfs_time_seconds(lnet_queryinterval)); - if (now < next_query) { + if (cfs_time_before(now, next_query)) { if (lp->lpni_alive) CWARN("Unexpected aliveness of peer %s: " - "%lld < %lld (%d/%d)\n", + "%d < %d (%d/%d)\n", libcfs_nid2str(lp->lpni_nid), - now, next_query, + (int)now, (int)next_query, lnet_queryinterval, lp->lpni_net->net_tunables.lct_peer_timeout); return 0; @@ -938,28 +814,20 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); - /* can't get here if we're sending to the loopback interface */ - LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && - lnet_peer_alive_locked(ni, lp, msg) == 0) { - the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; - the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += - msg->msg_len; + lnet_peer_alive_locked(ni, lp) == 0) { + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; lnet_net_unlock(cpt); if (msg->msg_txpeer) - lnet_incr_stats(&msg->msg_txpeer->lpni_stats, - msg->msg_type, - LNET_STATS_TYPE_DROP); + atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count); if (msg->msg_txni) - lnet_incr_stats(&msg->msg_txni->ni_stats, - msg->msg_type, - LNET_STATS_TYPE_DROP); + atomic_inc(&msg->msg_txni->ni_stats.drop_count); CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); - msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; if (do_send) lnet_finalize(msg, -EHOSTUNREACH); @@ -974,12 +842,8 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " "called on the MD/ME.\n", libcfs_id2str(msg->msg_target)); - if (do_send) { - msg->msg_no_resend = true; - CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", - msg, libcfs_id2str(msg->msg_target)); + if (do_send) lnet_finalize(msg, -ECANCELED); - } lnet_net_lock(cpt); return -ECANCELED; @@ -1024,15 +888,6 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) } } - if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && - lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { - msg->msg_tx_delayed = 1; - return LNET_CREDIT_WAIT; - } - - /* unset the tx_delay flag as we're going to send it now */ - msg->msg_tx_delayed = 0; - if (do_send) { lnet_net_unlock(cpt); lnet_ni_send(ni, msg); @@ -1128,9 +983,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) msg->msg_niov = rbp->rbp_npages; msg->msg_kiov = &rb->rb_kiov[0]; - /* unset the msg-rx_delayed flag since we're receiving the message */ - msg->msg_rx_delayed = 0; - if (do_recv) { int cpt = msg->msg_rx_cpt; @@ -1230,6 +1082,15 @@ lnet_return_tx_credits_locked(struct lnet_msg *msg) } if (txpeer != NULL) { + /* + * TODO: + * Once the patch for the health comes in we need to set + * the health of the peer ni to bad when we fail to send + * a message. 
+ * int status = msg->msg_ev.status; + * if (status != 0) + * lnet_set_peer_ni_health_locked(txpeer, false) + */ msg->msg_txpeer = NULL; lnet_peer_ni_decref_locked(txpeer); } @@ -1261,8 +1122,6 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); - msg->msg_no_resend = true; - msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; lnet_finalize(msg, -ECANCELED); } @@ -1409,7 +1268,7 @@ lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) } static struct lnet_peer_ni * -lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, +lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, lnet_nid_t rtr_nid) { struct lnet_remotenet *rnet; @@ -1423,7 +1282,7 @@ lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, /* If @rtr_nid is not LNET_NID_ANY, return the gateway with * rtr_nid nid, otherwise find the best gateway I can use */ - rnet = lnet_find_rnet_locked(remote_net); + rnet = lnet_find_rnet_locked(LNET_NIDNET(target)); if (rnet == NULL) return NULL; @@ -1468,42 +1327,30 @@ lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, } static struct lnet_ni * -lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, - struct lnet_peer *peer, struct lnet_peer_net *peer_net, +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, int md_cpt) { - struct lnet_ni *ni = NULL; + struct lnet_ni *ni = NULL, *best_ni = cur_ni; unsigned int shortest_distance; int best_credits; - int best_healthv; - - /* - * If there is no peer_ni that we can send to on this network, - * then there is no point in looking for a new best_ni here. - */ - if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL)) - return best_ni; if (best_ni == NULL) { shortest_distance = UINT_MAX; best_credits = INT_MIN; - best_healthv = 0; } else { shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); - best_healthv = atomic_read(&best_ni->ni_healthv); } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { unsigned int distance; int ni_credits; - int ni_healthv; - int ni_fatal; + + if (!lnet_is_ni_healthy_locked(ni)) + continue; ni_credits = atomic_read(&ni->ni_tx_credits); - ni_healthv = atomic_read(&ni->ni_healthv); - ni_fatal = atomic_read(&ni->ni_fatal_error_on); /* * calculate the distance from the CPT on which @@ -1514,12 +1361,6 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, md_cpt, ni->ni_dev_cpt); - CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", - libcfs_nid2str(ni->ni_nid), ni_credits, distance, - ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) - : "not seleced", best_credits, shortest_distance, - (best_ni) ? best_ni->ni_seq : 0); - /* * All distances smaller than the NUMA range * are treated equally. @@ -1528,242 +1369,383 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, distance = lnet_numa_range; /* - * Select on health, shorter distance, available + * Select on shorter distance, then available * credits, then round-robin. 
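/*
 * Condensed, the "distance, then credits, then round-robin" ordering that
 * lnet_get_best_ni() returns to here is a three-key comparison.  Sketch
 * only -- plain types and an illustrative helper name, with distances
 * already clamped to lnet_numa_range as in the code below:
 *
 *	static bool ni_beats_best(unsigned int dist, int credits, int seq,
 *				  unsigned int best_dist, int best_credits,
 *				  int best_seq)
 *	{
 *		if (dist != best_dist)
 *			return dist < best_dist;	// closer NUMA-wise
 *		if (credits != best_credits)
 *			return credits > best_credits;	// more tx credits
 *		return seq < best_seq;			// round-robin tie break
 *	}
 */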
*/ - if (ni_fatal) { - continue; - } else if (ni_healthv < best_healthv) { - continue; - } else if (ni_healthv > best_healthv) { - best_healthv = ni_healthv; - /* - * If we're going to prefer this ni because it's - * the healthiest, then we should set the - * shortest_distance in the algorithm in case - * there are multiple NIs with the same health but - * different distances. - */ - if (distance < shortest_distance) - shortest_distance = distance; - } else if (distance > shortest_distance) { + if (distance > shortest_distance) { continue; } else if (distance < shortest_distance) { shortest_distance = distance; } else if (ni_credits < best_credits) { continue; } else if (ni_credits == best_credits) { - if (best_ni && best_ni->ni_seq <= ni->ni_seq) + if (best_ni && (best_ni)->ni_seq <= ni->ni_seq) continue; } best_ni = ni; best_credits = ni_credits; } - CDEBUG(D_NET, "selected best_ni %s\n", - (best_ni) ? libcfs_nid2str(best_ni->ni_nid) : "no selection"); - return best_ni; } -/* - * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, - * because such traffic is required to perform discovery. We therefore - * exclude all GET and PUT on that portal. We also exclude all ACK and - * REPLY traffic, but that is because the portal is not tracked in the - * message structure for these message types. We could restrict this - * further by also checking for LNET_PROTO_PING_MATCHBITS. - */ -static bool -lnet_msg_discovery(struct lnet_msg *msg) +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid) { - if (msg->msg_type == LNET_MSG_PUT) { - if (msg->msg_hdr.msg.put.ptl_index != LNET_RESERVED_PORTAL) - return true; - } else if (msg->msg_type == LNET_MSG_GET) { - if (msg->msg_hdr.msg.get.ptl_index != LNET_RESERVED_PORTAL) - return true; - } - return false; -} + struct lnet_ni *best_ni; + struct lnet_peer_ni *best_lpni; + struct lnet_peer_ni *best_gw; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *final_dst; + struct lnet_peer *peer; + struct lnet_peer_net *peer_net; + struct lnet_net *local_net; + __u32 seq; + int cpt, cpt2, rc; + bool routing; + bool routing2; + bool ni_is_pref; + bool preferred; + bool local_found; + int best_lpni_credits; + int md_cpt; -#define SRC_SPEC 0x0001 -#define SRC_ANY 0x0002 -#define LOCAL_DST 0x0004 -#define REMOTE_DST 0x0008 -#define MR_DST 0x0010 -#define NMR_DST 0x0020 -#define SND_RESP 0x0040 - -/* The following to defines are used for return codes */ -#define REPEAT_SEND 0x1000 -#define PASS_THROUGH 0x2000 - -/* The different cases lnet_select pathway needs to handle */ -#define SRC_SPEC_LOCAL_MR_DST (SRC_SPEC | LOCAL_DST | MR_DST) -#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST) -#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST) -#define SRC_SPEC_ROUTER_NMR_DST (SRC_SPEC | REMOTE_DST | NMR_DST) -#define SRC_ANY_LOCAL_MR_DST (SRC_ANY | LOCAL_DST | MR_DST) -#define SRC_ANY_ROUTER_MR_DST (SRC_ANY | REMOTE_DST | MR_DST) -#define SRC_ANY_LOCAL_NMR_DST (SRC_ANY | LOCAL_DST | NMR_DST) -#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST) + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. 
If none, + * then we proceed, if there is, then we restart the operation. + */ + cpt = lnet_net_lock_current(); -static int -lnet_handle_lo_send(struct lnet_send_data *sd) -{ - struct lnet_msg *msg = sd->sd_msg; - int cpt = sd->sd_cpt; + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; - /* No send credit hassles with LOLND */ - lnet_ni_addref_locked(the_lnet.ln_loni, cpt); - msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = - cpu_to_le64(the_lnet.ln_loni->ni_nid); - msg->msg_target.nid = the_lnet.ln_loni->ni_nid; - lnet_msg_commit(msg, cpt); - msg->msg_txni = the_lnet.ln_loni; +again: + best_ni = NULL; + best_lpni = NULL; + best_gw = NULL; + final_dst = NULL; + local_net = NULL; + routing = false; + routing2 = false; + local_found = false; + + seq = lnet_get_dlc_seq_locked(); + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } - return LNET_CREDIT_OK; -} + peer = lnet_find_or_create_peer_locked(dst_nid, cpt); + if (IS_ERR(peer)) { + lnet_net_unlock(cpt); + return PTR_ERR(peer); + } -static int -lnet_handle_send(struct lnet_send_data *sd) -{ - struct lnet_ni *best_ni = sd->sd_best_ni; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni; - struct lnet_msg *msg = sd->sd_msg; - int cpt2; - __u32 send_case = sd->sd_send_case; - int rc; - __u32 routing = send_case & REMOTE_DST; - struct lnet_rsp_tracker *rspt; + /* If peer is not healthy then can not send anything to it */ + if (!lnet_is_peer_healthy_locked(peer)) { + lnet_net_unlock(cpt); + return -EHOSTUNREACH; + } - /* - * Increment sequence number of the selected peer so that we - * pick the next one in Round Robin. - */ - best_lpni->lpni_seq++; + if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) { + lnet_net_unlock(cpt); + CERROR("peer %s is declared to be non MR capable, " + "yet configured with more than one NID\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } /* - * grab a reference on the peer_ni so it sticks around even if - * we need to drop and relock the lnet_net_lock below. + * STEP 1: first jab at determining best_ni + * if src_nid is explicitly specified, then best_ni is already + * pre-determiend for us. Otherwise we need to select the best + * one to use later on */ - lnet_peer_ni_addref_locked(best_lpni); + if (src_nid != LNET_NID_ANY) { + best_ni = lnet_nid2ni_locked(src_nid, cpt); + if (!best_ni) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + } - /* - * Use lnet_cpt_of_nid() to determine the CPT used to commit the - * message. This ensures that we get a CPT that is correct for - * the NI when the NI has been restricted to a subset of all CPTs. - * If the selected CPT differs from the one currently locked, we - * must unlock and relock the lnet_net_lock(), and then check whether - * the configuration has changed. We don't have a hold on the best_ni - * yet, and it may have vanished. 
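/*
 * The selection code on both sides of this hunk describes the same pattern
 * at this point: record the configuration sequence number, drop the net
 * lock, retake it (possibly on another CPT), and restart the whole
 * selection if the sequence moved while unlocked.  A hedged outline with
 * illustrative names (not the LNet API):
 *
 *	gen = config_seq_locked();		// under net_lock[old_cpt]
 *	unlock(old_cpt);
 *	lock(new_cpt);
 *	if (config_seq_locked() != gen)
 *		goto again;			// topology changed, redo selection
 */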
- */ - cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); - if (sd->sd_cpt != cpt2) { - __u32 seq = lnet_get_dlc_seq_locked(); - lnet_net_unlock(sd->sd_cpt); - sd->sd_cpt = cpt2; - lnet_net_lock(sd->sd_cpt); - if (seq != lnet_get_dlc_seq_locked()) { + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK || + !peer->lp_multi_rail || + best_ni) { + /* + * for replies we want to respond on the same peer_ni we + * received the message on if possible. If not, then pick + * a peer_ni to send to + * + * if the peer is non-multi-rail then you want to send to + * the dst_nid provided as well. + * + * If the best_ni has already been determined, IE the + * src_nid has been specified, then use the + * destination_nid provided as well, since we're + * continuing a series of related messages for the same + * RPC. + * + * It is expected to find the lpni using dst_nid, since we + * created it earlier. + */ + best_lpni = lnet_find_peer_ni_locked(dst_nid); + if (best_lpni) lnet_peer_ni_decref_locked(best_lpni); - return REPEAT_SEND; + + if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) { + /* + * this lpni is not on a local network so we need + * to route this reply. + */ + best_gw = lnet_find_route_locked(NULL, + best_lpni->lpni_nid, + rtr_nid); + if (best_gw) { + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; + + /* + * if the router is not multi-rail then use the best_gw + * found to send the message to + */ + if (!peer->lp_multi_rail) + best_lpni = best_gw; + else + best_lpni = NULL; + + routing = true; + } else { + best_lpni = NULL; + } + } else if (!best_lpni) { + lnet_net_unlock(cpt); + CERROR("unable to send msg_type %d to " + "originating %s. Destination NID not in DB\n", + msg->msg_type, libcfs_nid2str(dst_nid)); + return -EINVAL; } } /* - * store the best_lpni in the message right away to avoid having - * to do the same operation under different conditions + * if the peer is not MR capable, then we should always send to it + * using the first NI in the NET we determined. */ - msg->msg_txpeer = best_lpni; - msg->msg_txni = best_ni; + if (!peer->lp_multi_rail) { + if (!best_lpni) { + lnet_net_unlock(cpt); + CERROR("no route to %s\n", + libcfs_nid2str(dst_nid)); + return -EHOSTUNREACH; + } - /* - * grab a reference for the best_ni since now it's in use in this - * send. The reference will be dropped in lnet_finalize() - */ - lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt); + /* best ni could be set because src_nid was provided */ + if (!best_ni) { + best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt); + if (!best_ni) { + lnet_net_unlock(cpt); + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + } + } /* - * Always set the target.nid to the best peer picked. 
Either the - * NID will be one of the peer NIDs selected, or the same NID as - * what was originally set in the target or it will be the NID of - * a router if this message should be routed + * if we already found a best_ni because src_nid is specified and + * best_lpni because we are replying to a message then just send + * the message */ - msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + if (best_ni && best_lpni) + goto send; /* - * lnet_msg_commit assigns the correct cpt to the message, which - * is used to decrement the correct refcount on the ni when it's - * time to return the credits + * If we already found a best_ni because src_nid is specified then + * pick the peer then send the message */ - lnet_msg_commit(msg, sd->sd_cpt); + if (best_ni) + goto pick_peer; /* - * If we are routing the message then we keep the src_nid that was - * set by the originator. If we are not routing then we are the - * originator and set it here. + * pick the best_ni by going through all the possible networks of + * that peer and see which local NI is best suited to talk to that + * peer. + * + * Locally connected networks will always be preferred over + * a routed network. If there are only routed paths to the peer, + * then the best route is chosen. If all routes are equal then + * they are used in round robin. */ - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (!lnet_is_peer_net_healthy_locked(peer_net)) + continue; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net && !routing && !local_found) { + struct lnet_peer_ni *net_gw; + + lpni = list_entry(peer_net->lpn_peer_nis.next, + struct lnet_peer_ni, + lpni_on_peer_net_list); + + net_gw = lnet_find_route_locked(NULL, + lpni->lpni_nid, + rtr_nid); + if (!net_gw) + continue; + + if (best_gw) { + /* + * lnet_find_route_locked() call + * will return the best_Gw on the + * lpni->lpni_nid network. + * However, best_gw and net_gw can + * be on different networks. + * Therefore need to compare them + * to pick the better of either. + */ + if (lnet_compare_peers(best_gw, net_gw) > 0) + continue; + if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq) + continue; + } + best_gw = net_gw; + final_dst = lpni; + + routing2 = true; + } else { + best_gw = NULL; + final_dst = NULL; + routing2 = false; + local_found = true; + } - if (routing) { - msg->msg_target_is_router = 1; - msg->msg_target.pid = LNET_PID_LUSTRE; /* - * since we're routing we want to ensure that the - * msg_hdr.dest_nid is set to the final destination. When - * the router receives this message it knows how to route - * it. - * - * final_dst_lpni is set at the beginning of the - * lnet_select_pathway() function and is never changed. - * It's safe to use it here. + * a gw on this network is found, but there could be + * other better gateways on other networks. So don't pick + * the best_ni until we determine the best_gw. */ - msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid); - } else { + if (best_gw) + continue; + + /* if no local_net found continue */ + if (!local_net) + continue; + /* - * if we're not routing set the dest_nid to the best peer - * ni NID that we picked earlier in the algorithm. + * Iterate through the NIs in this local Net and select + * the NI to send from. The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. 
Round Robin */ - msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt); + } + + if (!best_ni && !best_gw) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No local ni found to send from to %s\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } + + if (!best_ni) { + best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt); + LASSERT(best_gw && best_ni); + + /* + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + best_gw->lpni_gw_seq++; + peer = best_gw->lpni_peer_net->lpn_peer; } /* - * if we have response tracker block update it with the next hop - * nid + * Now that we selected the NI to use increment its sequence + * number so the Round Robin algorithm will detect that it has + * been used and pick the next NI. */ - if (msg->msg_md) { - rspt = msg->msg_md->md_rspt_ptr; - if (rspt) { - rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid; - CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", - libcfs_nid2str(rspt->rspt_next_hop_nid)); - } - } + best_ni->ni_seq++; - rc = lnet_post_send_locked(msg, 0); +pick_peer: + /* + * At this point the best_ni is on a local network on which + * the peer has a peer_ni as well + */ + peer_net = lnet_peer_get_net_locked(peer, + best_ni->ni_net->net_id); + /* + * peer_net is not available or the src_nid is explicitly defined + * and the peer_net for that src_nid is unhealthy. find a route to + * the destination nid. + */ + if (!peer_net || + (src_nid != LNET_NID_ANY && + !lnet_is_peer_net_healthy_locked(peer_net))) { + best_gw = lnet_find_route_locked(best_ni->ni_net, + dst_nid, + rtr_nid); + /* + * if no route is found for that network then + * move onto the next peer_ni in the peer + */ + if (!best_gw) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to peer from %s\n", + libcfs_nid2str(best_ni->ni_nid)); + return -EHOSTUNREACH; + } - if (!rc) - CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n", - libcfs_nid2str(msg->msg_hdr.src_nid), - libcfs_nid2str(msg->msg_txni->ni_nid), - libcfs_nid2str(sd->sd_src_nid), - libcfs_nid2str(msg->msg_hdr.dest_nid), - libcfs_nid2str(sd->sd_dst_nid), - libcfs_nid2str(msg->msg_txpeer->lpni_nid), - lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(best_gw->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); - return rc; -} + routing2 = true; + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; + } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { + /* + * this peer_net is unhealthy but we still have an opportunity + * to find another peer_net that we can use + */ + __u32 net_id = peer_net->lpn_net_id; + LCONSOLE_WARN("peer net %s unhealthy\n", + libcfs_net2str(net_id)); + goto again; + } -static struct lnet_peer_ni * -lnet_select_peer_ni(struct lnet_send_data *sd, struct lnet_peer *peer, - struct lnet_peer_net *peer_net) -{ /* * Look at the peer NIs for the destination peer that connect * to the chosen net. If a peer_ni is preferred when using the @@ -1772,1995 +1754,224 @@ lnet_select_peer_ni(struct lnet_send_data *sd, struct lnet_peer *peer, * the available transmit credits are used. 
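/*
 * The peer_ni pass described above is the same shape as the local-NI pass,
 * with "preferred for the chosen best_ni" as the leading key.  Sketch only
 * (plain types, illustrative helper name):
 *
 *	static bool lpni_beats_best(bool pref, int credits, int seq,
 *				    bool best_pref, int best_credits,
 *				    int best_seq)
 *	{
 *		if (pref != best_pref)
 *			return pref;			// preferred peer_ni wins
 *		if (credits != best_credits)
 *			return credits > best_credits;	// more tx credits
 *		return seq < best_seq;			// round-robin tie break
 *	}
 */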
If the transmit * credits are equal, we round-robin over the peer_ni. */ - struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_ni *best_lpni = NULL; - struct lnet_ni *best_ni = sd->sd_best_ni; - lnet_nid_t dst_nid = sd->sd_dst_nid; - int best_lpni_credits = INT_MIN; - bool preferred = false; - bool ni_is_pref; - int best_lpni_healthv = 0; - int lpni_healthv; - + lpni = NULL; + best_lpni_credits = INT_MIN; + preferred = false; + best_lpni = NULL; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* - * if the best_ni we've chosen aleady has this lpni - * preferred, then let's use it + * if this peer ni is not healthy just skip it, no point in + * examining it further */ - ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, - best_ni->ni_nid); - - lpni_healthv = atomic_read(&lpni->lpni_healthv); - - CDEBUG(D_NET, "%s ni_is_pref = %d\n", - libcfs_nid2str(best_ni->ni_nid), ni_is_pref); - - if (best_lpni) - CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", - libcfs_nid2str(lpni->lpni_nid), - lpni->lpni_txcredits, best_lpni_credits, - lpni->lpni_seq, best_lpni->lpni_seq); - - /* pick the healthiest peer ni */ - if (lpni_healthv < best_lpni_healthv) { + if (!lnet_is_peer_ni_healthy_locked(lpni)) continue; - } else if (lpni_healthv > best_lpni_healthv) { - best_lpni_healthv = lpni_healthv; + ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + /* if this is a preferred peer use it */ - } else if (!preferred && ni_is_pref) { + if (!preferred && ni_is_pref) { preferred = true; } else if (preferred && !ni_is_pref) { /* * this is not the preferred peer so let's ignore * it. */ - continue; - } else if (lpni->lpni_txcredits < best_lpni_credits) { - /* - * We already have a peer that has more credits - * available than this one. No need to consider - * this peer further. - */ - continue; - } else if (lpni->lpni_txcredits == best_lpni_credits) { - /* - * The best peer found so far and the current peer - * have the same number of available credits let's - * make sure to select between them using Round - * Robin - */ - if (best_lpni) { - if (best_lpni->lpni_seq <= lpni->lpni_seq) - continue; - } - } - - best_lpni = lpni; - best_lpni_credits = lpni->lpni_txcredits; - } - - /* if we still can't find a peer ni then we can't reach it */ - if (!best_lpni) { - __u32 net_id = (peer_net) ? 
peer_net->lpn_net_id : - LNET_NIDNET(dst_nid); - CDEBUG(D_NET, "no peer_ni found on peer net %s\n", - libcfs_net2str(net_id)); - return NULL; - } - - CDEBUG(D_NET, "sd_best_lpni = %s\n", - libcfs_nid2str(best_lpni->lpni_nid)); - - return best_lpni; -} - -/* - * Prerequisite: the best_ni should already be set in the sd - */ -static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, - __u32 net_id) -{ - struct lnet_peer_net *peer_net; - - /* - * The gateway is Multi-Rail capable so now we must select the - * proper peer_ni - */ - peer_net = lnet_peer_get_net_locked(peer, net_id); - - if (!peer_net) { - CERROR("gateway peer %s has no NI on net %s\n", - libcfs_nid2str(peer->lp_primary_nid), - libcfs_net2str(net_id)); - return NULL; - } - - return lnet_select_peer_ni(sd, peer, peer_net); -} - -static inline void -lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) -{ - if (sd->sd_send_case & NMR_DST && - sd->sd_msg->msg_type != LNET_MSG_REPLY && - sd->sd_msg->msg_type != LNET_MSG_ACK && - sd->sd_best_lpni->lpni_pref_nnids == 0) { - CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", - libcfs_nid2str(sd->sd_best_ni->ni_nid), - libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); - lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, - sd->sd_best_ni->ni_nid); - } -} - -/* - * Source Specified - * Local Destination - * non-mr peer - * - * use the source and destination NIDs as the pathway - */ -static int -lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) -{ - /* the destination lpni is set before we get here. */ - - /* find local NI */ - sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); - if (!sd->sd_best_ni) { - CERROR("Can't send to %s: src %s is not a " - "local nid\n", libcfs_nid2str(sd->sd_dst_nid), - libcfs_nid2str(sd->sd_src_nid)); - return -EINVAL; - } - - /* - * the preferred NID will only be set for NMR peers - */ - lnet_set_non_mr_pref_nid(sd); - - return lnet_handle_send(sd); -} - -/* - * Source Specified - * Local Destination - * MR Peer - * - * Don't run the selection algorithm on the peer NIs. By specifying the - * local NID, we're also saying that we should always use the destination NID - * provided. This handles the case where we should be using the same - * destination NID for the all the messages which belong to the same RPC - * request. - */ -static int -lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) -{ - sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); - if (!sd->sd_best_ni) { - CERROR("Can't send to %s: src %s is not a " - "local nid\n", libcfs_nid2str(sd->sd_dst_nid), - libcfs_nid2str(sd->sd_src_nid)); - return -EINVAL; - } - - if (sd->sd_best_lpni && - sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) - return lnet_handle_lo_send(sd); - else if (sd->sd_best_lpni) - return lnet_handle_send(sd); - - CERROR("can't send to %s. no NI on %s\n", - libcfs_nid2str(sd->sd_dst_nid), - libcfs_net2str(sd->sd_best_ni->ni_net->net_id)); - - return -EHOSTUNREACH; -} - -struct lnet_ni * -lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, - struct lnet_peer *peer, - struct lnet_peer_net *peer_net, - int cpt, - bool incr_seq) -{ - struct lnet_net *local_net; - struct lnet_ni *best_ni; - - local_net = lnet_get_net_locked(peer_net->lpn_net_id); - if (!local_net) - return NULL; - - /* - * Iterate through the NIs in this local Net and select - * the NI to send from. The selection is determined by - * these 3 criterion in the following priority: - * 1. NUMA - * 2. 
NI available credits - * 3. Round Robin - */ - best_ni = lnet_get_best_ni(local_net, cur_best_ni, - peer, peer_net, cpt); - - if (incr_seq && best_ni) - best_ni->ni_seq++; - - return best_ni; -} - -static int -lnet_handle_find_routed_path(struct lnet_send_data *sd, - lnet_nid_t dst_nid, - struct lnet_peer_ni **gw_lpni, - struct lnet_peer **gw_peer) -{ - struct lnet_peer_ni *gw; - lnet_nid_t src_nid = sd->sd_src_nid; - - gw = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid), - sd->sd_rtr_nid); - if (!gw) { - CERROR("no route to %s from %s\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); - return -EHOSTUNREACH; - } - - /* get the peer of the gw_ni */ - LASSERT(gw->lpni_peer_net); - LASSERT(gw->lpni_peer_net->lpn_peer); - - *gw_peer = gw->lpni_peer_net->lpn_peer; - - if (!sd->sd_best_ni) - sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, *gw_peer, - gw->lpni_peer_net, - sd->sd_md_cpt, - true); - - if (!sd->sd_best_ni) { - CERROR("Internal Error. Expected local ni on %s " - "but non found :%s\n", - libcfs_net2str(gw->lpni_peer_net->lpn_net_id), - libcfs_nid2str(sd->sd_src_nid)); - return -EFAULT; - } - - /* - * if gw is MR let's find its best peer_ni - */ - if (lnet_peer_is_multi_rail(*gw_peer)) { - gw = lnet_find_best_lpni_on_net(sd, *gw_peer, - sd->sd_best_ni->ni_net->net_id); - /* - * We've already verified that the gw has an NI on that - * desired net, but we're not finding it. Something is - * wrong. - */ - if (!gw) { - CERROR("Internal Error. Route expected to %s from %s\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EFAULT; - } - } - - *gw_lpni = gw; - - return 0; -} - -/* - * Handle two cases: - * - * Case 1: - * Source specified - * Remote destination - * Non-MR destination - * - * Case 2: - * Source specified - * Remote destination - * MR destination - * - * The handling of these two cases is similar. Even though the destination - * can be MR or non-MR, we'll deal directly with the router. - */ -static int -lnet_handle_spec_router_dst(struct lnet_send_data *sd) -{ - int rc; - struct lnet_peer_ni *gw_lpni = NULL; - struct lnet_peer *gw_peer = NULL; - - /* find local NI */ - sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); - if (!sd->sd_best_ni) { - CERROR("Can't send to %s: src %s is not a " - "local nid\n", libcfs_nid2str(sd->sd_dst_nid), - libcfs_nid2str(sd->sd_src_nid)); - return -EINVAL; - } - - rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, - &gw_peer); - if (rc < 0) - return rc; - - if (sd->sd_send_case & NMR_DST) - /* - * since the final destination is non-MR let's set its preferred - * NID before we send - */ - lnet_set_non_mr_pref_nid(sd); - - /* - * We're going to send to the gw found so let's set its - * info - */ - sd->sd_peer = gw_peer; - sd->sd_best_lpni = gw_lpni; - - return lnet_handle_send(sd); -} - -struct lnet_ni * -lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) -{ - struct lnet_peer_net *peer_net = NULL; - struct lnet_ni *best_ni = NULL; - - /* - * The peer can have multiple interfaces, some of them can be on - * the local network and others on a routed network. We should - * prefer the local network. However if the local network is not - * available then we need to try the routed network - */ - - /* go through all the peer nets and find the best_ni */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { - /* - * The peer's list of nets can contain non-local nets. We - * want to only examine the local ones. 
- */ - if (!lnet_get_net_locked(peer_net->lpn_net_id)) - continue; - best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, - peer_net, md_cpt, false); - } - - if (best_ni) - /* increment sequence number so we can round robin */ - best_ni->ni_seq++; - - return best_ni; -} - -static struct lnet_ni * -lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) -{ - struct lnet_ni *best_ni = NULL; - struct lnet_peer_net *peer_net; - struct lnet_peer *peer = sd->sd_peer; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *lpni; - int cpt = sd->sd_cpt; - - /* - * We must use a consistent source address when sending to a - * non-MR peer. However, a non-MR peer can have multiple NIDs - * on multiple networks, and we may even need to talk to this - * peer on multiple networks -- certain types of - * load-balancing configuration do this. - * - * So we need to pick the NI the peer prefers for this - * particular network. - */ - - /* Get the target peer_ni */ - peer_net = lnet_peer_get_net_locked(peer, - LNET_NIDNET(best_lpni->lpni_nid)); - LASSERT(peer_net != NULL); - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_peer_nis) { - if (lpni->lpni_pref_nnids == 0) - continue; - LASSERT(lpni->lpni_pref_nnids == 1); - best_ni = lnet_nid2ni_locked( - lpni->lpni_pref.nid, cpt); - break; - } - - return best_ni; -} - -/* Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set */ -static int -lnet_select_preferred_best_ni(struct lnet_send_data *sd) -{ - struct lnet_ni *best_ni = NULL; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - - /* - * We must use a consistent source address when sending to a - * non-MR peer. However, a non-MR peer can have multiple NIDs - * on multiple networks, and we may even need to talk to this - * peer on multiple networks -- certain types of - * load-balancing configuration do this. - * - * So we need to pick the NI the peer prefers for this - * particular network. - */ - - best_ni = lnet_find_existing_preferred_best_ni(sd); - - /* if best_ni is still not set just pick one */ - if (!best_ni) { - best_ni = - lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, - sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); - /* If there is no best_ni we don't have a route */ - if (!best_ni) { - CERROR("no path to %s from net %s\n", - libcfs_nid2str(best_lpni->lpni_nid), - libcfs_net2str(best_lpni->lpni_net->net_id)); - return -EHOSTUNREACH; - } - } - - sd->sd_best_ni = best_ni; - - /* Set preferred NI if necessary. */ - lnet_set_non_mr_pref_nid(sd); - - return 0; -} - - -/* - * Source not specified - * Local destination - * Non-MR Peer - * - * always use the same source NID for NMR peers - * If we've talked to that peer before then we already have a preferred - * source NI associated with it. Otherwise, we select a preferred local NI - * and store it in the peer - */ -static int -lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) -{ - int rc; - - /* sd->sd_best_lpni is already set to the final destination */ - - /* - * At this point we should've created the peer ni and peer. If we - * can't find it, then something went wrong. Instead of assert - * output a relevant message and fail the send - */ - if (!sd->sd_best_lpni) { - CERROR("Internal fault. Unable to send msg %s to %s. 
" - "NID not known\n", - lnet_msgtyp2str(sd->sd_msg->msg_type), - libcfs_nid2str(sd->sd_dst_nid)); - return -EFAULT; - } - - rc = lnet_select_preferred_best_ni(sd); - if (!rc) - rc = lnet_handle_send(sd); - - return rc; -} - -static int -lnet_handle_any_mr_dsta(struct lnet_send_data *sd) -{ - /* - * NOTE we've already handled the remote peer case. So we only - * need to worry about the local case here. - * - * if we're sending a response, ACK or reply, we need to send it - * to the destination NID given to us. At this point we already - * have the peer_ni we're suppose to send to, so just find the - * best_ni on the peer net and use that. Since we're sending to an - * MR peer then we can just run the selection algorithm on our - * local NIs and pick the best one. - */ - if (sd->sd_send_case & SND_RESP) { - sd->sd_best_ni = - lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, - sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); - - if (!sd->sd_best_ni) { - /* - * We're not going to deal with not able to send - * a response to the provided final destination - */ - CERROR("Can't send response to %s. " - "No local NI available\n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; - } - - return lnet_handle_send(sd); - } - - /* - * If we get here that means we're sending a fresh request, PUT or - * GET, so we need to run our standard selection algorithm. - * First find the best local interface that's on any of the peer's - * networks. - */ - sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, - sd->sd_md_cpt); - if (sd->sd_best_ni) { - sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd, sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); - - /* - * if we're successful in selecting a peer_ni on the local - * network, then send to it. Otherwise fall through and - * try and see if we can reach it over another routed - * network - */ - if (sd->sd_best_lpni && - sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) { - /* - * in case we initially started with a routed - * destination, let's reset to local - */ - sd->sd_send_case &= ~REMOTE_DST; - sd->sd_send_case |= LOCAL_DST; - return lnet_handle_lo_send(sd); - } else if (sd->sd_best_lpni) { - /* - * in case we initially started with a routed - * destination, let's reset to local - */ - sd->sd_send_case &= ~REMOTE_DST; - sd->sd_send_case |= LOCAL_DST; - return lnet_handle_send(sd); - } - - CERROR("Internal Error. Expected to have a best_lpni: " - "%s -> %s\n", - libcfs_nid2str(sd->sd_src_nid), - libcfs_nid2str(sd->sd_dst_nid)); - - return -EFAULT; - } - - /* - * Peer doesn't have a local network. Let's see if there is - * a remote network we can reach it on. - */ - return PASS_THROUGH; -} - -/* - * Case 1: - * Source NID not specified - * Local destination - * MR peer - * - * Case 2: - * Source NID not speified - * Remote destination - * MR peer - * - * In both of these cases if we're sending a response, ACK or REPLY, then - * we need to send to the destination NID provided. - * - * In the remote case let's deal with MR routers. 
- * - */ - -static int -lnet_handle_any_mr_dst(struct lnet_send_data *sd) -{ - int rc = 0; - struct lnet_peer *gw_peer = NULL; - struct lnet_peer_ni *gw_lpni = NULL; - - /* - * handle sending a response to a remote peer here so we don't - * have to worry about it if we hit lnet_handle_any_mr_dsta() - */ - if (sd->sd_send_case & REMOTE_DST && - sd->sd_send_case & SND_RESP) { - struct lnet_peer_ni *gw; - struct lnet_peer *gw_peer; - - rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw, - &gw_peer); - if (rc < 0) { - CERROR("Can't send response to %s. " - "No route available\n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; - } - - sd->sd_best_lpni = gw; - sd->sd_peer = gw_peer; - - return lnet_handle_send(sd); - } - - /* - * Even though the NID for the peer might not be on a local network, - * since the peer is MR there could be other interfaces on the - * local network. In that case we'd still like to prefer the local - * network over the routed network. If we're unable to do that - * then we select the best router among the different routed networks, - * and if the router is MR then we can deal with it as such. - */ - rc = lnet_handle_any_mr_dsta(sd); - if (rc != PASS_THROUGH) - return rc; - - /* - * TODO; One possible enhancement is to run the selection - * algorithm on the peer. However for remote peers the credits are - * not decremented, so we'll be basically going over the peer NIs - * in round robin. An MR router will run the selection algorithm - * on the next-hop interfaces. - */ - rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, - &gw_peer); - if (rc < 0) - return rc; - - sd->sd_send_case &= ~LOCAL_DST; - sd->sd_send_case |= REMOTE_DST; - - sd->sd_peer = gw_peer; - sd->sd_best_lpni = gw_lpni; - - return lnet_handle_send(sd); -} - -/* - * Source not specified - * Remote destination - * Non-MR peer - * - * Must send to the specified peer NID using the same source NID that - * we've used before. If it's the first time to talk to that peer then - * find the source NI and assign it as preferred to that peer - */ -static int -lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) -{ - int rc; - struct lnet_peer_ni *gw_lpni = NULL; - struct lnet_peer *gw_peer = NULL; - - /* - * Let's set if we have a preferred NI to talk to this NMR peer - */ - sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); - - /* - * find the router and that'll find the best NI if we didn't find - * it already. - */ - rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, - &gw_peer); - if (rc < 0) - return rc; - - /* - * set the best_ni we've chosen as the preferred one for - * this peer - */ - lnet_set_non_mr_pref_nid(sd); - - /* we'll be sending to the gw */ - sd->sd_best_lpni = gw_lpni; - sd->sd_peer = gw_peer; - - return lnet_handle_send(sd); -} - -static int -lnet_handle_send_case_locked(struct lnet_send_data *sd) -{ - /* - * turn off the SND_RESP bit. - * It will be checked in the case handling - */ - __u32 send_case = sd->sd_send_case &= ~SND_RESP ; - - CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n", - (send_case & SRC_SPEC) ? "Specified: " : "ANY", - (send_case & SRC_SPEC) ? libcfs_nid2str(sd->sd_src_nid) : "", - (send_case & MR_DST) ? "MR: " : "NMR: ", - libcfs_nid2str(sd->sd_dst_nid), - (send_case & LOCAL_DST) ? 
"local" : "routed"); - - switch (send_case) { - /* - * For all cases where the source is specified, we should always - * use the destination NID, whether it's an MR destination or not, - * since we're continuing a series of related messages for the - * same RPC - */ - case SRC_SPEC_LOCAL_NMR_DST: - return lnet_handle_spec_local_nmr_dst(sd); - case SRC_SPEC_LOCAL_MR_DST: - return lnet_handle_spec_local_mr_dst(sd); - case SRC_SPEC_ROUTER_NMR_DST: - case SRC_SPEC_ROUTER_MR_DST: - return lnet_handle_spec_router_dst(sd); - case SRC_ANY_LOCAL_NMR_DST: - return lnet_handle_any_local_nmr_dst(sd); - case SRC_ANY_LOCAL_MR_DST: - case SRC_ANY_ROUTER_MR_DST: - return lnet_handle_any_mr_dst(sd); - case SRC_ANY_ROUTER_NMR_DST: - return lnet_handle_any_router_nmr_dst(sd); - default: - CERROR("Unknown send case\n"); - return -1; - } -} - -static int -lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, - struct lnet_msg *msg, lnet_nid_t rtr_nid) -{ - struct lnet_peer_ni *lpni; - struct lnet_peer *peer; - struct lnet_send_data send_data; - int cpt, rc; - int md_cpt; - __u32 send_case = 0; - - memset(&send_data, 0, sizeof(send_data)); - - /* - * get an initial CPT to use for locking. The idea here is not to - * serialize the calls to select_pathway, so that as many - * operations can run concurrently as possible. To do that we use - * the CPT where this call is being executed. Later on when we - * determine the CPT to use in lnet_message_commit, we switch the - * lock and check if there was any configuration change. If none, - * then we proceed, if there is, then we restart the operation. - */ - cpt = lnet_net_lock_current(); - - md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); - if (md_cpt == CFS_CPT_ANY) - md_cpt = cpt; - -again: - - /* - * If we're being asked to send to the loopback interface, there - * is no need to go through any selection. We can just shortcut - * the entire process and send over lolnd - */ - send_data.sd_msg = msg; - send_data.sd_cpt = cpt; - if (dst_nid == LNET_NID_LO_0) { - rc = lnet_handle_lo_send(&send_data); - lnet_net_unlock(cpt); - return rc; - } - - /* - * find an existing peer_ni, or create one and mark it as having been - * created due to network traffic. This call will create the - * peer->peer_net->peer_ni tree. - */ - lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt); - if (IS_ERR(lpni)) { - lnet_net_unlock(cpt); - return PTR_ERR(lpni); - } - - /* - * Cache the original src_nid. If we need to resend the message - * then we'll need to know whether the src_nid was originally - * specified for this message. If it was originally specified, - * then we need to keep using the same src_nid since it's - * continuing the same sequence of messages. - */ - msg->msg_src_nid_param = src_nid; - - /* - * Now that we have a peer_ni, check if we want to discover - * the peer. Traffic to the LNET_RESERVED_PORTAL should not - * trigger discovery. - */ - peer = lpni->lpni_peer_net->lpn_peer; - if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) { - lnet_nid_t primary_nid; - rc = lnet_discover_peer_locked(lpni, cpt, false); - if (rc) { - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(cpt); - return rc; - } - /* The peer may have changed. 
*/ - peer = lpni->lpni_peer_net->lpn_peer; - spin_lock(&peer->lp_lock); - if (lnet_peer_is_uptodate_locked(peer)) { - spin_unlock(&peer->lp_lock); - } else { - /* queue message and return */ - msg->msg_rtr_nid_param = rtr_nid; - msg->msg_sending = 0; - list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); - primary_nid = peer->lp_primary_nid; - spin_unlock(&peer->lp_lock); - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(cpt); - - CDEBUG(D_NET, "%s pending discovery\n", - libcfs_nid2str(primary_nid)); - - return LNET_DC_WAIT; - } - } - lnet_peer_ni_decref_locked(lpni); - peer = lpni->lpni_peer_net->lpn_peer; - - /* - * Identify the different send cases - */ - if (src_nid == LNET_NID_ANY) - send_case |= SRC_ANY; - else - send_case |= SRC_SPEC; - - if (lnet_get_net_locked(LNET_NIDNET(dst_nid))) - send_case |= LOCAL_DST; - else - send_case |= REMOTE_DST; - - /* - * if this is a non-MR peer or if we're recovering a peer ni then - * let's consider this an NMR case so we can hit the destination - * NID. - */ - if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) - send_case |= NMR_DST; - else - send_case |= MR_DST; - - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) - send_case |= SND_RESP; - - /* assign parameters to the send_data */ - send_data.sd_rtr_nid = rtr_nid; - send_data.sd_src_nid = src_nid; - send_data.sd_dst_nid = dst_nid; - send_data.sd_best_lpni = lpni; - /* - * keep a pointer to the final destination in case we're going to - * route, so we'll need to access it later - */ - send_data.sd_final_dst_lpni = lpni; - send_data.sd_peer = peer; - send_data.sd_md_cpt = md_cpt; - send_data.sd_send_case = send_case; - - rc = lnet_handle_send_case_locked(&send_data); - - /* - * Update the local cpt since send_data.sd_cpt might've been - * updated as a result of calling lnet_handle_send_case_locked(). - */ - cpt = send_data.sd_cpt; - - if (rc == REPEAT_SEND) - goto again; - - lnet_net_unlock(cpt); - - return rc; -} - -int -lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) -{ - lnet_nid_t dst_nid = msg->msg_target.nid; - int rc; - - /* - * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, - * but we might want to use pre-determined router for ACK/REPLY - * in the future - */ - /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ - LASSERT(msg->msg_txpeer == NULL); - LASSERT(msg->msg_txni == NULL); - LASSERT(!msg->msg_sending); - LASSERT(!msg->msg_target_is_router); - LASSERT(!msg->msg_receiving); - - msg->msg_sending = 1; - - LASSERT(!msg->msg_tx_committed); - - rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); - if (rc < 0) - return rc; - - if (rc == LNET_CREDIT_OK) - lnet_ni_send(msg->msg_txni, msg); - - /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ - return 0; -} - -enum lnet_mt_event_type { - MT_TYPE_LOCAL_NI = 0, - MT_TYPE_PEER_NI -}; - -struct lnet_mt_event_info { - enum lnet_mt_event_type mt_type; - lnet_nid_t mt_nid; -}; - -/* called with res_lock held */ -void -lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) -{ - struct lnet_rsp_tracker *rspt; - - /* - * msg has a refcount on the MD so the MD is not going away. - * The rspt queue for the cpt is protected by - * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie. 
- */ - if (!md->md_rspt_ptr) - return; - - rspt = md->md_rspt_ptr; - - /* debug code */ - LASSERT(rspt->rspt_cpt == cpt); - - md->md_rspt_ptr = NULL; - - if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { - /* - * The monitor thread has invalidated this handle because the - * response timed out, but it failed to lookup the MD. That - * means this response tracker is on the zombie list. We can - * safely remove it under the resource lock (held by caller) and - * free the response tracker block. - */ - list_del(&rspt->rspt_on_list); - lnet_rspt_free(rspt, cpt); - } else { - /* - * invalidate the handle to indicate that a response has been - * received, which will then lead the monitor thread to clean up - * the rspt block. - */ - LNetInvalidateMDHandle(&rspt->rspt_mdh); - } -} - -void -lnet_clean_zombie_rstqs(void) -{ - struct lnet_rsp_tracker *rspt, *tmp; - int i; - - cfs_cpt_for_each(i, lnet_cpt_table()) { - list_for_each_entry_safe(rspt, tmp, - the_lnet.ln_mt_zombie_rstqs[i], - rspt_on_list) { - list_del(&rspt->rspt_on_list); - lnet_rspt_free(rspt, i); - } - } - - cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); -} - -static void -lnet_finalize_expired_responses(void) -{ - struct lnet_libmd *md; - struct list_head local_queue; - struct lnet_rsp_tracker *rspt, *tmp; - ktime_t now; - int i; - - if (the_lnet.ln_mt_rstq == NULL) - return; - - cfs_cpt_for_each(i, lnet_cpt_table()) { - INIT_LIST_HEAD(&local_queue); - - lnet_net_lock(i); - if (!the_lnet.ln_mt_rstq[i]) { - lnet_net_unlock(i); - continue; - } - list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); - lnet_net_unlock(i); - - now = ktime_get(); - - list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { - /* - * The rspt mdh will be invalidated when a response - * is received or whenever we want to discard the - * block the monitor thread will walk the queue - * and clean up any rsts with an invalid mdh. - * The monitor thread will walk the queue until - * the first unexpired rspt block. This means that - * some rspt blocks which received their - * corresponding responses will linger in the - * queue until they are cleaned up eventually. - */ - lnet_res_lock(i); - if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { - lnet_res_unlock(i); - list_del(&rspt->rspt_on_list); - lnet_rspt_free(rspt, i); - continue; - } - - if (ktime_compare(now, rspt->rspt_deadline) >= 0 || - the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { - struct lnet_peer_ni *lpni; - lnet_nid_t nid; - - md = lnet_handle2md(&rspt->rspt_mdh); - if (!md) { - /* MD has been queued for unlink, but - * rspt hasn't been detached (Note we've - * checked above that the rspt_mdh is - * valid). Since we cannot lookup the MD - * we're unable to detach the rspt - * ourselves. Thus, move the rspt to the - * zombie list where we'll wait for - * either: - * 1. The remaining operations on the - * MD to complete. In this case the - * final operation will result in - * lnet_msg_detach_md()-> - * lnet_detach_rsp_tracker() where - * we will clean up this response - * tracker. - * 2. LNet to shutdown. In this case - * we'll wait until after all LND Nets - * have shutdown and then we can - * safely free any remaining response - * tracker blocks on the zombie list. - * Note: We need to hold the resource - * lock when adding to the zombie list - * because we may have concurrent access - * with lnet_detach_rsp_tracker(). 
- */ - LNetInvalidateMDHandle(&rspt->rspt_mdh); - list_move(&rspt->rspt_on_list, - the_lnet.ln_mt_zombie_rstqs[i]); - lnet_res_unlock(i); - continue; - } - LASSERT(md->md_rspt_ptr == rspt); - md->md_rspt_ptr = NULL; - lnet_res_unlock(i); - - LNetMDUnlink(rspt->rspt_mdh); - - nid = rspt->rspt_next_hop_nid; - - list_del(&rspt->rspt_on_list); - lnet_rspt_free(rspt, i); - - /* If we're shutting down we just want to clean - * up the rspt blocks - */ - if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) - continue; - - lnet_net_lock(i); - the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; - lnet_net_unlock(i); - - CDEBUG(D_NET, - "Response timeout: md = %p: nid = %s\n", - md, libcfs_nid2str(nid)); - - /* - * If there is a timeout on the response - * from the next hop decrement its health - * value so that we don't use it - */ - lnet_net_lock(0); - lpni = lnet_find_peer_ni_locked(nid); - if (lpni) { - lnet_handle_remote_failure_locked(lpni); - lnet_peer_ni_decref_locked(lpni); - } - lnet_net_unlock(0); - } else { - lnet_res_unlock(i); - break; - } - } - - if (!list_empty(&local_queue)) { - lnet_net_lock(i); - list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); - lnet_net_unlock(i); - } - } -} - -static void -lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) -{ - struct lnet_msg *msg; - - while (!list_empty(resendq)) { - struct lnet_peer_ni *lpni; - - msg = list_entry(resendq->next, struct lnet_msg, - msg_list); - - list_del_init(&msg->msg_list); - - lpni = lnet_find_peer_ni_locked(msg->msg_hdr.dest_nid); - if (!lpni) { - lnet_net_unlock(cpt); - CERROR("Expected that a peer is already created for %s\n", - libcfs_nid2str(msg->msg_hdr.dest_nid)); - msg->msg_no_resend = true; - lnet_finalize(msg, -EFAULT); - lnet_net_lock(cpt); - } else { - struct lnet_peer *peer; - int rc; - lnet_nid_t src_nid = LNET_NID_ANY; - - /* - * if this message is not being routed and the - * peer is non-MR then we must use the same - * src_nid that was used in the original send. - * Otherwise if we're routing the message (IE - * we're a router) then we can use any of our - * local interfaces. It doesn't matter to the - * final destination. - */ - peer = lpni->lpni_peer_net->lpn_peer; - if (!msg->msg_routing && - !lnet_peer_is_multi_rail(peer)) - src_nid = le64_to_cpu(msg->msg_hdr.src_nid); - - /* - * If we originally specified a src NID, then we - * must attempt to reuse it in the resend as well. 
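/*
 * Putting the resend rules above together, the source NID for a resend
 * reduces to roughly the following (sketch only; plain parameters and an
 * illustrative helper name):
 *
 *	static lnet_nid_t resend_src_nid(bool routing, bool peer_is_mr,
 *					 lnet_nid_t hdr_src,
 *					 lnet_nid_t pinned_src)
 *	{
 *		lnet_nid_t src = LNET_NID_ANY;
 *
 *		if (!routing && !peer_is_mr)
 *			src = hdr_src;		// keep the NID this peer saw
 *		if (pinned_src != LNET_NID_ANY)
 *			src = pinned_src;	// caller-pinned source wins
 *		return src;
 *	}
 */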
- */ - if (msg->msg_src_nid_param != LNET_NID_ANY) - src_nid = msg->msg_src_nid_param; - lnet_peer_ni_decref_locked(lpni); - - lnet_net_unlock(cpt); - CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", - libcfs_nid2str(src_nid), - libcfs_id2str(msg->msg_target), - lnet_msgtyp2str(msg->msg_type), - msg->msg_recovery, - msg->msg_retry_count); - rc = lnet_send(src_nid, msg, LNET_NID_ANY); - if (rc) { - CERROR("Error sending %s to %s: %d\n", - lnet_msgtyp2str(msg->msg_type), - libcfs_id2str(msg->msg_target), rc); - msg->msg_no_resend = true; - lnet_finalize(msg, rc); - } - lnet_net_lock(cpt); - if (!rc) - the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++; - } - } -} - -static void -lnet_resend_pending_msgs(void) -{ - int i; - - cfs_cpt_for_each(i, lnet_cpt_table()) { - lnet_net_lock(i); - lnet_resend_pending_msgs_locked(the_lnet.ln_mt_resendqs[i], i); - lnet_net_unlock(i); - } -} - -/* called with cpt and ni_lock held */ -static void -lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) -{ - struct lnet_handle_md recovery_mdh; - - LNetInvalidateMDHandle(&recovery_mdh); - - if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING || - force) { - recovery_mdh = ni->ni_ping_mdh; - LNetInvalidateMDHandle(&ni->ni_ping_mdh); - } - lnet_ni_unlock(ni); - lnet_net_unlock(cpt); - if (!LNetMDHandleIsInvalid(recovery_mdh)) - LNetMDUnlink(recovery_mdh); - lnet_net_lock(cpt); - lnet_ni_lock(ni); -} - -static void -lnet_recover_local_nis(void) -{ - struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; - struct lnet_handle_md mdh; - struct lnet_ni *tmp; - struct lnet_ni *ni; - lnet_nid_t nid; - int healthv; - int rc; - - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); - - /* - * splice the recovery queue on a local queue. We will iterate - * through the local queue and update it as needed. Once we're - * done with the traversal, we'll splice the local queue back on - * the head of the ln_mt_localNIRecovq. Any newly added local NIs - * will be traversed in the next iteration. - */ - lnet_net_lock(0); - list_splice_init(&the_lnet.ln_mt_localNIRecovq, - &local_queue); - lnet_net_unlock(0); - - list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { - /* - * if an NI is being deleted or it is now healthy, there - * is no need to keep it around in the recovery queue. - * The monitor thread is the only thread responsible for - * removing the NI from the recovery queue. - * Multiple threads can be adding NIs to the recovery - * queue. - */ - healthv = atomic_read(&ni->ni_healthv); - - lnet_net_lock(0); - lnet_ni_lock(ni); - if (ni->ni_state != LNET_NI_STATE_ACTIVE || - healthv == LNET_MAX_HEALTH_VALUE) { - list_del_init(&ni->ni_recovery); - lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); - lnet_ni_unlock(ni); - lnet_ni_decref_locked(ni, 0); - lnet_net_unlock(0); - continue; - } - - /* - * if the local NI failed recovery we must unlink the md. - * But we want to keep the local_ni on the recovery queue - * so we can continue the attempts to recover it. 
- */ - if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) { - lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; - } - - lnet_ni_unlock(ni); - lnet_net_unlock(0); - - - CDEBUG(D_NET, "attempting to recover local ni: %s\n", - libcfs_nid2str(ni->ni_nid)); - - lnet_ni_lock(ni); - if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { - ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING; - lnet_ni_unlock(ni); - - LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); - if (!ev_info) { - CERROR("out of memory. Can't recover %s\n", - libcfs_nid2str(ni->ni_nid)); - lnet_ni_lock(ni); - ni->ni_recovery_state &= - ~LNET_NI_RECOVERY_PENDING; - lnet_ni_unlock(ni); - continue; - } - - mdh = ni->ni_ping_mdh; - /* - * Invalidate the ni mdh in case it's deleted. - * We'll unlink the mdh in this case below. - */ - LNetInvalidateMDHandle(&ni->ni_ping_mdh); - nid = ni->ni_nid; - - /* - * remove the NI from the local queue and drop the - * reference count to it while we're recovering - * it. The reason for that, is that the NI could - * be deleted, and the way the code is structured - * is if we don't drop the NI, then the deletion - * code will enter a loop waiting for the - * reference count to be removed while holding the - * ln_mutex_lock(). When we look up the peer to - * send to in lnet_select_pathway() we will try to - * lock the ln_mutex_lock() as well, leading to - * a deadlock. By dropping the refcount and - * removing it from the list, we allow for the NI - * to be removed, then we use the cached NID to - * look it up again. If it's gone, then we just - * continue examining the rest of the queue. - */ - lnet_net_lock(0); - list_del_init(&ni->ni_recovery); - lnet_ni_decref_locked(ni, 0); - lnet_net_unlock(0); - - ev_info->mt_type = MT_TYPE_LOCAL_NI; - ev_info->mt_nid = nid; - rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); - /* lookup the nid again */ - lnet_net_lock(0); - ni = lnet_nid2ni_locked(nid, 0); - if (!ni) { - /* - * the NI has been deleted when we dropped - * the ref count - */ - lnet_net_unlock(0); - LNetMDUnlink(mdh); - continue; - } - /* - * Same note as in lnet_recover_peer_nis(). When - * we're sending the ping, the NI is free to be - * deleted or manipulated. By this point it - * could've been added back on the recovery queue, - * and a refcount taken on it. - * So we can't just add it blindly again or we'll - * corrupt the queue. We must check under lock if - * it's not on any list and if not then add it - * to the processed list, which will eventually be - * spliced back on to the recovery queue. - */ - ni->ni_ping_mdh = mdh; - if (list_empty(&ni->ni_recovery)) { - list_add_tail(&ni->ni_recovery, &processed_list); - lnet_ni_addref_locked(ni, 0); - } - lnet_net_unlock(0); - - lnet_ni_lock(ni); - if (rc) - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; - } - lnet_ni_unlock(ni); - } - - /* - * put back the remaining NIs on the ln_mt_localNIRecovq to be - * reexamined in the next iteration. 
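/*
 * The recovery walkers in this hunk all follow the same "splice out, work
 * unlocked, splice survivors back" shape.  Stripped-down outline (lock and
 * list names are illustrative, not the LNet ones):
 *
 *	LIST_HEAD(local);
 *
 *	spin_lock(&recov_lock);
 *	list_splice_init(&shared_recovq, &local);
 *	spin_unlock(&recov_lock);
 *
 *	// examine and ping entries on "local" without holding the lock
 *
 *	spin_lock(&recov_lock);
 *	list_splice(&local, &shared_recovq);	// requeue what is left
 *	spin_unlock(&recov_lock);
 */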
- */ - list_splice_init(&processed_list, &local_queue); - lnet_net_lock(0); - list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq); - lnet_net_unlock(0); -} - -static int -lnet_resendqs_create(void) -{ - struct list_head **resendqs; - resendqs = lnet_create_array_of_queues(); - - if (!resendqs) - return -ENOMEM; - - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_mt_resendqs = resendqs; - lnet_net_unlock(LNET_LOCK_EX); - - return 0; -} - -static void -lnet_clean_local_ni_recoveryq(void) -{ - struct lnet_ni *ni; - - /* This is only called when the monitor thread has stopped */ - lnet_net_lock(0); - - while (!list_empty(&the_lnet.ln_mt_localNIRecovq)) { - ni = list_entry(the_lnet.ln_mt_localNIRecovq.next, - struct lnet_ni, ni_recovery); - list_del_init(&ni->ni_recovery); - lnet_ni_lock(ni); - lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); - lnet_ni_unlock(ni); - lnet_ni_decref_locked(ni, 0); - } - - lnet_net_unlock(0); -} - -static void -lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, - bool force) -{ - struct lnet_handle_md recovery_mdh; - - LNetInvalidateMDHandle(&recovery_mdh); - - if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { - recovery_mdh = lpni->lpni_recovery_ping_mdh; - LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); - } - spin_unlock(&lpni->lpni_lock); - lnet_net_unlock(cpt); - if (!LNetMDHandleIsInvalid(recovery_mdh)) - LNetMDUnlink(recovery_mdh); - lnet_net_lock(cpt); - spin_lock(&lpni->lpni_lock); -} - -static void -lnet_clean_peer_ni_recoveryq(void) -{ - struct lnet_peer_ni *lpni, *tmp; - - lnet_net_lock(LNET_LOCK_EX); - - list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_mt_peerNIRecovq, - lpni_recovery) { - list_del_init(&lpni->lpni_recovery); - spin_lock(&lpni->lpni_lock); - lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); - spin_unlock(&lpni->lpni_lock); - lnet_peer_ni_decref_locked(lpni); - } - - lnet_net_unlock(LNET_LOCK_EX); -} - -static void -lnet_clean_resendqs(void) -{ - struct lnet_msg *msg, *tmp; - struct list_head msgs; - int i; - - INIT_LIST_HEAD(&msgs); - - cfs_cpt_for_each(i, lnet_cpt_table()) { - lnet_net_lock(i); - list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); - lnet_net_unlock(i); - list_for_each_entry_safe(msg, tmp, &msgs, msg_list) { - list_del_init(&msg->msg_list); - msg->msg_no_resend = true; - lnet_finalize(msg, -ESHUTDOWN); - } - } - - cfs_percpt_free(the_lnet.ln_mt_resendqs); -} - -static void -lnet_recover_peer_nis(void) -{ - struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; - struct lnet_handle_md mdh; - struct lnet_peer_ni *lpni; - struct lnet_peer_ni *tmp; - lnet_nid_t nid; - int healthv; - int rc; - - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); - - /* - * Always use cpt 0 for locking across all interactions with - * ln_mt_peerNIRecovq - */ - lnet_net_lock(0); - list_splice_init(&the_lnet.ln_mt_peerNIRecovq, - &local_queue); - lnet_net_unlock(0); - - list_for_each_entry_safe(lpni, tmp, &local_queue, - lpni_recovery) { - /* - * The same protection strategy is used here as is in the - * local recovery case. 
- */ - lnet_net_lock(0); - healthv = atomic_read(&lpni->lpni_healthv); - spin_lock(&lpni->lpni_lock); - if (lpni->lpni_state & LNET_PEER_NI_DELETING || - healthv == LNET_MAX_HEALTH_VALUE) { - list_del_init(&lpni->lpni_recovery); - lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); - spin_unlock(&lpni->lpni_lock); - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(0); - continue; - } - - /* - * If the peer NI has failed recovery we must unlink the - * md. But we want to keep the peer ni on the recovery - * queue so we can try to continue recovering it - */ - if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { - lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; - } - - spin_unlock(&lpni->lpni_lock); - lnet_net_unlock(0); - - /* - * NOTE: we're racing with peer deletion from user space. - * It's possible that a peer is deleted after we check its - * state. In this case the recovery can create a new peer - */ - spin_lock(&lpni->lpni_lock); - if (!(lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) && - !(lpni->lpni_state & LNET_PEER_NI_DELETING)) { - lpni->lpni_state |= LNET_PEER_NI_RECOVERY_PENDING; - spin_unlock(&lpni->lpni_lock); - - LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); - if (!ev_info) { - CERROR("out of memory. Can't recover %s\n", - libcfs_nid2str(lpni->lpni_nid)); - spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; - spin_unlock(&lpni->lpni_lock); - continue; - } - - /* look at the comments in lnet_recover_local_nis() */ - mdh = lpni->lpni_recovery_ping_mdh; - LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); - nid = lpni->lpni_nid; - lnet_net_lock(0); - list_del_init(&lpni->lpni_recovery); - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(0); - - ev_info->mt_type = MT_TYPE_PEER_NI; - ev_info->mt_nid = nid; - rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); - lnet_net_lock(0); - /* - * lnet_find_peer_ni_locked() grabs a refcount for - * us. No need to take it explicitly. - */ - lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - lnet_net_unlock(0); - LNetMDUnlink(mdh); - continue; - } - - lpni->lpni_recovery_ping_mdh = mdh; - /* - * While we're unlocked the lpni could've been - * readded on the recovery queue. In this case we - * don't need to add it to the local queue, since - * it's already on there and the thread that added - * it would've incremented the refcount on the - * peer, which means we need to decref the refcount - * that was implicitly grabbed by find_peer_ni_locked. - * Otherwise, if the lpni is still not on - * the recovery queue, then we'll add it to the - * processed list. - */ - if (list_empty(&lpni->lpni_recovery)) - list_add_tail(&lpni->lpni_recovery, &processed_list); - else - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(0); - - spin_lock(&lpni->lpni_lock); - if (rc) - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; - } - spin_unlock(&lpni->lpni_lock); - } - - list_splice_init(&processed_list, &local_queue); - lnet_net_lock(0); - list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq); - lnet_net_unlock(0); -} - -static int -lnet_monitor_thread(void *arg) -{ - time64_t recovery_timeout = 0; - time64_t rsp_timeout = 0; - int interval; - time64_t now; - - /* - * The monitor thread takes care of the following: - * 1. Checks the aliveness of routers - * 2. Checks if there are messages on the resend queue to resend - * them. - * 3. 
Check if there are any NIs on the local recovery queue and - * pings them - * 4. Checks if there are any NIs on the remote recovery queue - * and pings them. - */ - cfs_block_allsigs(); - - while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { - now = ktime_get_real_seconds(); - - if (lnet_router_checker_active()) - lnet_check_routers(); - - lnet_resend_pending_msgs(); - - if (now >= rsp_timeout) { - lnet_finalize_expired_responses(); - rsp_timeout = now + (lnet_transaction_timeout / 2); - } - - if (now >= recovery_timeout) { - lnet_recover_local_nis(); - lnet_recover_peer_nis(); - recovery_timeout = now + lnet_recovery_interval; + continue; + } else if (lpni->lpni_txcredits < best_lpni_credits) { + /* + * We already have a peer that has more credits + * available than this one. No need to consider + * this peer further. + */ + continue; + } else if (lpni->lpni_txcredits == best_lpni_credits) { + /* + * The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round + * Robin + */ + if (best_lpni) { + if (best_lpni->lpni_seq <= lpni->lpni_seq) + continue; + } } - /* - * TODO do we need to check if we should sleep without - * timeout? Technically, an active system will always - * have messages in flight so this check will always - * evaluate to false. And on an idle system do we care - * if we wake up every 1 second? Although, we've seen - * cases where we get a complaint that an idle thread - * is waking up unnecessarily. - */ - interval = min(lnet_recovery_interval, - lnet_transaction_timeout / 2); - wait_event_interruptible_timeout(the_lnet.ln_mt_waitq, - false, - cfs_time_seconds(interval)); + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; } - /* clean up the router checker */ - lnet_prune_rc_data(1); - - /* Shutting down */ - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; - lnet_net_unlock(LNET_LOCK_EX); - - /* signal that the monitor thread is exiting */ - up(&the_lnet.ln_mt_signal); + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = (peer_net) ? peer_net->lpn_net_id : + LNET_NIDNET(dst_nid); + lnet_net_unlock(cpt); + LCONSOLE_WARN("no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + return -EHOSTUNREACH; + } - return 0; -} -/* - * lnet_send_ping - * Sends a ping. - * Returns == 0 if success - * Returns > 0 if LNetMDBind or prior fails - * Returns < 0 if LNetGet fails - */ -int -lnet_send_ping(lnet_nid_t dest_nid, - struct lnet_handle_md *mdh, int nnis, - void *user_data, struct lnet_handle_eq eqh, bool recovery) -{ - struct lnet_md md = { NULL }; - struct lnet_process_id id; - struct lnet_ping_buffer *pbuf; - int rc; +send: + /* Shortcut for loopback. 
*/ + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(best_ni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = best_ni; + lnet_net_unlock(cpt); - if (dest_nid == LNET_NID_ANY) { - rc = -EHOSTUNREACH; - goto fail_error; + return LNET_CREDIT_OK; } - pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); - if (!pbuf) { - rc = ENOMEM; - goto fail_error; - } + routing = routing || routing2; - /* initialize md content */ - md.start = &pbuf->pb_info; - md.length = LNET_PING_INFO_SIZE(nnis); - md.threshold = 2; /* GET/REPLY */ - md.max_size = 0; - md.options = LNET_MD_TRUNCATE; - md.user_ptr = user_data; - md.eq_handle = eqh; - - rc = LNetMDBind(md, LNET_UNLINK, mdh); - if (rc) { - lnet_ping_buffer_decref(pbuf); - CERROR("Can't bind MD: %d\n", rc); - rc = -rc; /* change the rc to positive */ - goto fail_error; - } - id.pid = LNET_PID_LUSTRE; - id.nid = dest_nid; + /* + * Increment sequence number of the peer selected so that we + * pick the next one in Round Robin. + */ + best_lpni->lpni_seq++; - rc = LNetGet(LNET_NID_ANY, *mdh, id, - LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0, recovery); + /* + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. + */ + lnet_peer_ni_addref_locked(best_lpni); - if (rc) - goto fail_unlink_md; + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. + */ + cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + if (seq != lnet_get_dlc_seq_locked()) { + lnet_peer_ni_decref_locked(best_lpni); + goto again; + } + } - return 0; + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; -fail_unlink_md: - LNetMDUnlink(*mdh); - LNetInvalidateMDHandle(mdh); -fail_error: - return rc; -} + /* + * grab a reference for the best_ni since now it's in use in this + * send. the reference will need to be dropped when the message is + * finished in lnet_finalize() + */ + lnet_ni_addref_locked(msg->msg_txni, cpt); -static void -lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, - int status, bool unlink_event) -{ - lnet_nid_t nid = ev_info->mt_nid; + /* + * Always set the target.nid to the best peer picked. 
Either the + * nid will be one of the preconfigured NIDs, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; - if (ev_info->mt_type == MT_TYPE_LOCAL_NI) { - struct lnet_ni *ni; + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, cpt); - lnet_net_lock(0); - ni = lnet_nid2ni_locked(nid, 0); - if (!ni) { - lnet_net_unlock(0); - return; - } - lnet_ni_lock(ni); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; - if (status) - ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; - lnet_ni_unlock(ni); - lnet_net_unlock(0); + /* + * If we are routing the message then we don't need to overwrite + * the src_nid since it would've been set at the origin. Otherwise + * we are the originator so we need to set it. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); - if (status != 0) { - CERROR("local NI (%s) recovery failed with %d\n", - libcfs_nid2str(nid), status); - return; - } + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; /* - * need to increment healthv for the ni here, because in - * the lnet_finalize() path we don't have access to this - * NI. And in order to get access to it, we'll need to - * carry forward too much information. - * In the peer case, it'll naturally be incremented + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. */ - if (!unlink_event) - lnet_inc_healthv(&ni->ni_healthv); + msg->msg_hdr.dest_nid = + cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid); } else { - struct lnet_peer_ni *lpni; - int cpt; - - cpt = lnet_net_lock_current(); - lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - lnet_net_unlock(cpt); - return; - } - spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; - if (status) - lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; - spin_unlock(&lpni->lpni_lock); - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(cpt); - - if (status != 0) - CERROR("peer NI (%s) recovery failed with %d\n", - libcfs_nid2str(nid), status); - } -} - -void -lnet_mt_event_handler(struct lnet_event *event) -{ - struct lnet_mt_event_info *ev_info = event->md.user_ptr; - struct lnet_ping_buffer *pbuf; - - /* TODO: remove assert */ - LASSERT(event->type == LNET_EVENT_REPLY || - event->type == LNET_EVENT_SEND || - event->type == LNET_EVENT_UNLINK); - - CDEBUG(D_NET, "Received event: %d status: %d\n", event->type, - event->status); - - switch (event->type) { - case LNET_EVENT_UNLINK: - CDEBUG(D_NET, "%s recovery ping unlinked\n", - libcfs_nid2str(ev_info->mt_nid)); - /* fallthrough */ - case LNET_EVENT_REPLY: - lnet_handle_recovery_reply(ev_info, event->status, - event->type == LNET_EVENT_UNLINK); - break; - case LNET_EVENT_SEND: - CDEBUG(D_NET, "%s recovery message sent %s:%d\n", - libcfs_nid2str(ev_info->mt_nid), - (event->status) ? 
"unsuccessfully" : - "successfully", event->status); - break; - default: - CERROR("Unexpected event: %d\n", event->type); - break; - } - if (event->unlinked) { - LIBCFS_FREE(ev_info, sizeof(*ev_info)); - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); - lnet_ping_buffer_decref(pbuf); + /* + * if we're not routing set the dest_nid to the best peer + * ni that we picked earlier in the algorithm. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); } -} -static int -lnet_rsp_tracker_create(void) -{ - struct list_head **rstqs; - rstqs = lnet_create_array_of_queues(); + rc = lnet_post_send_locked(msg, 0); - if (!rstqs) - return -ENOMEM; + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + libcfs_nid2str(dst_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); - the_lnet.ln_mt_rstq = rstqs; + lnet_net_unlock(cpt); - return 0; + return rc; } -static void -lnet_rsp_tracker_clean(void) +int +lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) { - lnet_finalize_expired_responses(); + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; - cfs_percpt_free(the_lnet.ln_mt_rstq); - the_lnet.ln_mt_rstq = NULL; -} + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); -int lnet_monitor_thr_start(void) -{ - int rc = 0; - struct task_struct *task; + msg->msg_sending = 1; - if (the_lnet.ln_mt_state != LNET_MT_STATE_SHUTDOWN) - return -EALREADY; + LASSERT(!msg->msg_tx_committed); - rc = lnet_resendqs_create(); - if (rc) + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) return rc; - rc = lnet_rsp_tracker_create(); - if (rc) - goto clean_queues; - - /* Pre monitor thread start processing */ - rc = lnet_router_pre_mt_start(); - if (rc) - goto free_mem; - - sema_init(&the_lnet.ln_mt_signal, 0); - - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; - lnet_net_unlock(LNET_LOCK_EX); - task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("Can't start monitor thread: %d\n", rc); - goto clean_thread; - } - - /* post monitor thread start processing */ - lnet_router_post_mt_start(); + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ return 0; - -clean_thread: - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; - lnet_net_unlock(LNET_LOCK_EX); - /* block until event callback signals exit */ - down(&the_lnet.ln_mt_signal); - /* clean up */ - lnet_router_cleanup(); -free_mem: - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; - lnet_net_unlock(LNET_LOCK_EX); - lnet_rsp_tracker_clean(); - lnet_clean_local_ni_recoveryq(); - lnet_clean_peer_ni_recoveryq(); - lnet_clean_resendqs(); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); - return rc; -clean_queues: - lnet_rsp_tracker_clean(); - lnet_clean_local_ni_recoveryq(); - lnet_clean_peer_ni_recoveryq(); - lnet_clean_resendqs(); - return rc; -} - -void lnet_monitor_thr_stop(void) -{ - if (the_lnet.ln_mt_state == 
LNET_MT_STATE_SHUTDOWN) - return; - - LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; - lnet_net_unlock(LNET_LOCK_EX); - - /* tell the monitor thread that we're shutting down */ - wake_up(&the_lnet.ln_mt_waitq); - - /* block until monitor thread signals that it's done */ - down(&the_lnet.ln_mt_signal); - LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); - - /* perform cleanup tasks */ - lnet_router_cleanup(); - lnet_rsp_tracker_clean(); - lnet_clean_local_ni_recoveryq(); - lnet_clean_peer_ni_recoveryq(); - lnet_clean_resendqs(); - - return; } void -lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, - __u32 msg_type) +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob) { lnet_net_lock(cpt); - lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP); - the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; - the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob; + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += nob; lnet_net_unlock(cpt); lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); @@ -3917,13 +2128,13 @@ lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) static int lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) { - void *private = msg->msg_private; - struct lnet_hdr *hdr = &msg->msg_hdr; + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int rlength; - int mlength; - int cpt; + struct lnet_libmd *md; + int rlength; + int mlength; + int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); lnet_res_lock(cpt); @@ -3984,10 +2195,10 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) static int lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) { - struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int cpt; + struct lnet_libmd *md; + int cpt; src.nid = hdr->src_nid; src.pid = hdr->src_pid; @@ -4194,12 +2405,11 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, for_me = (ni->ni_nid == dest_nid); cpt = lnet_cpt_of_nid(from_nid, ni); - CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s\n", libcfs_nid2str(dest_nid), libcfs_nid2str(ni->ni_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type), - (for_me) ? 
"for me" : "routed"); + lnet_msgtyp2str(type)); switch (type) { case LNET_MSG_ACK: @@ -4236,10 +2446,10 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (the_lnet.ln_routing && - ni->ni_last_alive != ktime_get_real_seconds()) { + ni->ni_last_alive != cfs_time_current_sec()) { /* NB: so far here is the only place to set NI status to "up */ lnet_ni_lock(ni); - ni->ni_last_alive = ktime_get_real_seconds(); + ni->ni_last_alive = cfs_time_current_sec(); if (ni->ni_status != NULL && ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) ni->ni_status->ns_status = LNET_NI_STATUS_UP; @@ -4303,7 +2513,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (!list_empty(&the_lnet.ln_drop_rules) && - lnet_drop_rule_match(hdr, NULL)) { + lnet_drop_rule_match(hdr)) { CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" "silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), @@ -4311,52 +2521,6 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } - if (lnet_drop_asym_route && for_me && - LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { - struct lnet_net *net; - struct lnet_remotenet *rnet; - bool found = true; - - /* we are dealing with a routed message, - * so see if route to reach src_nid goes through from_nid - */ - lnet_net_lock(cpt); - net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); - if (!net) { - lnet_net_unlock(cpt); - CERROR("net %s not found\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); - return -EPROTO; - } - - rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); - if (rnet) { - struct lnet_peer_ni *gw = NULL; - struct lnet_route *route; - - list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - found = false; - gw = route->lr_gateway; - if (gw->lpni_net != net) - continue; - if (gw->lpni_nid == from_nid) { - found = true; - break; - } - } - } - lnet_net_unlock(cpt); - if (!found) { - /* we would not use from_nid to route a message to - * src_nid - * => asymmetric routing detected but forbidden - */ - CERROR("%s, src %s: Dropping asymmetrical route %s\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); - goto drop; - } - } msg = lnet_msg_alloc(); if (msg == NULL) { @@ -4394,7 +2558,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } lnet_net_lock(cpt); - lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); + lpni = lnet_nid2peerni_locked(from_nid, cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); CERROR("%s, src %s: Dropping %s " @@ -4461,7 +2625,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, lnet_finalize(msg, rc); drop: - lnet_drop_message(ni, cpt, private, payload_length, type); + lnet_drop_message(ni, cpt, private, payload_length); return 0; } EXPORT_SYMBOL(lnet_parse); @@ -4497,10 +2661,7 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason) * until that's done */ lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, - msg->msg_private, msg->msg_len, - msg->msg_type); - - msg->msg_no_resend = true; + msg->msg_private, msg->msg_len); /* * NB: message will not generate event because w/o attached MD, * but we still should give error code so lnet_msg_decommit() @@ -4543,54 +2704,6 @@ lnet_recv_delayed_msg_list(struct list_head *head) } } -static void -lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, - struct lnet_libmd *md, struct lnet_handle_md mdh) -{ - s64 timeout_ns; - bool new_entry = true; - struct lnet_rsp_tracker *local_rspt; - - /* - * MD 
has a refcount taken by message so it's not going away. - * The MD however can be looked up. We need to secure the access - * to the md_rspt_ptr by taking the res_lock. - * The rspt can be accessed without protection up to when it gets - * added to the list. - */ - - lnet_res_lock(cpt); - local_rspt = md->md_rspt_ptr; - timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; - if (local_rspt != NULL) { - /* - * we already have an rspt attached to the md, so we'll - * update the deadline on that one. - */ - LIBCFS_FREE(rspt, sizeof(*rspt)); - new_entry = false; - } else { - /* new md */ - rspt->rspt_mdh = mdh; - rspt->rspt_cpt = cpt; - /* store the rspt so we can access it when we get the REPLY */ - md->md_rspt_ptr = rspt; - local_rspt = rspt; - } - local_rspt->rspt_deadline = ktime_add_ns(ktime_get(), timeout_ns); - - /* - * add to the list of tracked responses. It's added to tail of the - * list in order to expire all the older entries first. - */ - lnet_net_lock(cpt); - if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) - list_del_init(&local_rspt->rspt_on_list); - list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); - lnet_net_unlock(cpt); - lnet_res_unlock(cpt); -} - /** * Initiate an asynchronous PUT operation. * @@ -4641,11 +2754,10 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, __u64 match_bits, unsigned int offset, __u64 hdr_data) { - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; - struct lnet_rsp_tracker *rspt = NULL; + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; LASSERT(the_lnet.ln_refcount > 0); @@ -4665,17 +2777,6 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, msg->msg_vmflush = !!memory_pressure_get(); cpt = lnet_cpt_of_cookie(mdh.cookie); - - if (ack == LNET_ACK_REQ) { - rspt = lnet_rspt_alloc(cpt); - if (!rspt) { - CERROR("Dropping PUT to %s: ENOMEM on response tracker\n", - libcfs_id2str(target)); - return -ENOMEM; - } - INIT_LIST_HEAD(&rspt->rspt_on_list); - } - lnet_res_lock(cpt); md = lnet_handle2md(&mdh); @@ -4688,7 +2789,6 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); - LIBCFS_FREE(rspt, sizeof(*rspt)); lnet_msg_free(msg); return -ENOENT; } @@ -4721,14 +2821,10 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, lnet_build_msg_event(msg, LNET_EVENT_SEND); - if (ack == LNET_ACK_REQ) - lnet_attach_rsp_tracker(rspt, cpt, md, mdh); - rc = lnet_send(self, msg, LNET_NID_ANY); if (rc != 0) { CNETERR("Error sending PUT to %s: %d\n", libcfs_id2str(target), rc); - msg->msg_no_resend = true; lnet_finalize(msg, rc); } @@ -4805,10 +2901,8 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) cpt = lnet_cpt_of_nid(peer_id.nid, ni); lnet_net_lock(cpt); - lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); - the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; - the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += - getmd->md_length; + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; lnet_net_unlock(cpt); if (msg != NULL) @@ -4859,13 +2953,12 @@ EXPORT_SYMBOL(lnet_set_reply_msg_len); int LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, struct lnet_process_id target, unsigned int portal, - __u64 match_bits, unsigned int offset, bool recovery) + __u64 match_bits, unsigned int offset) { - struct lnet_msg *msg; - struct lnet_libmd *md; - struct lnet_rsp_tracker *rspt; 
- int cpt; - int rc; + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; LASSERT(the_lnet.ln_refcount > 0); @@ -4878,24 +2971,13 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, } msg = lnet_msg_alloc(); - if (!msg) { + if (msg == NULL) { CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", libcfs_id2str(target)); return -ENOMEM; } cpt = lnet_cpt_of_cookie(mdh.cookie); - - rspt = lnet_rspt_alloc(cpt); - if (!rspt) { - CERROR("Dropping GET to %s: ENOMEM on response tracker\n", - libcfs_id2str(target)); - return -ENOMEM; - } - INIT_LIST_HEAD(&rspt->rspt_on_list); - - msg->msg_recovery = recovery; - lnet_res_lock(cpt); md = lnet_handle2md(&mdh); @@ -4910,7 +2992,6 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_res_unlock(cpt); lnet_msg_free(msg); - LIBCFS_FREE(rspt, sizeof(*rspt)); return -ENOENT; } @@ -4935,13 +3016,10 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); - lnet_attach_rsp_tracker(rspt, cpt, md, mdh); - rc = lnet_send(self, msg, LNET_NID_ANY); if (rc < 0) { CNETERR("Error sending GET to %s: %d\n", libcfs_id2str(target), rc); - msg->msg_no_resend = true; lnet_finalize(msg, rc); } @@ -4967,14 +3045,14 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -4990,7 +3068,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (dstnid == LNET_NID_LO_0) + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) *orderp = 0; else *orderp = 1; @@ -5005,9 +3083,9 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) * current net namespace. * If not, assign order above 0xffff0000, * to make this ni not a priority. */ - if (current->nsproxy && - !net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) - order += 0xffff0000; + if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; + if (srcnidp != NULL) *srcnidp = ni->ni_nid; if (orderp != NULL) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c index 959c370d2d4da..1b90855375a20 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -65,7 +65,6 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) LASSERT(!msg->msg_routing); ev->type = ev_type; - ev->msg_type = msg->msg_type; if (ev_type == LNET_EVENT_SEND) { /* event for active message */ @@ -76,6 +75,7 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) ev->source.nid = LNET_NID_ANY; ev->source.pid = the_lnet.ln_pid; ev->sender = LNET_NID_ANY; + } else { /* event for passive message */ ev->target.pid = hdr->dest_pid; @@ -142,18 +142,14 @@ void lnet_msg_commit(struct lnet_msg *msg, int cpt) { struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; - struct lnet_counters_common *common; - s64 timeout_ns; - - /* set the message deadline */ - timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; - msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); + struct lnet_counters *counters = the_lnet.ln_counters[cpt]; /* routed message can be committed for both receiving and sending */ LASSERT(!msg->msg_tx_committed); if (msg->msg_sending) { LASSERT(!msg->msg_receiving); + msg->msg_tx_cpt = cpt; msg->msg_tx_committed = 1; if (msg->msg_rx_committed) { /* routed message REPLY */ @@ -167,35 +163,33 @@ lnet_msg_commit(struct lnet_msg *msg, int cpt) } LASSERT(!msg->msg_onactivelist); - msg->msg_onactivelist = 1; - list_add_tail(&msg->msg_activelist, &container->msc_active); + list_add(&msg->msg_activelist, &container->msc_active); - common = &the_lnet.ln_counters[cpt]->lct_common; - common->lcc_msgs_alloc++; - if (common->lcc_msgs_alloc > common->lcc_msgs_max) - common->lcc_msgs_max = common->lcc_msgs_alloc; + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; } static void lnet_msg_decommit_tx(struct lnet_msg *msg, int status) { - struct lnet_counters_common *common; + struct lnet_counters *counters; struct lnet_event *ev = &msg->msg_ev; LASSERT(msg->msg_tx_committed); if (status != 0) goto out; - common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common); + counters = the_lnet.ln_counters[msg->msg_tx_cpt]; switch (ev->type) { default: /* routed message */ LASSERT(msg->msg_routing); LASSERT(msg->msg_rx_committed); LASSERT(ev->type == 0); - common->lcc_route_length += msg->msg_len; - common->lcc_route_count++; + counters->route_length += msg->msg_len; + counters->route_count++; goto incr_stats; case LNET_EVENT_PUT: @@ -209,7 +203,7 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) case LNET_EVENT_SEND: LASSERT(!msg->msg_rx_committed); if (msg->msg_type == LNET_MSG_PUT) - common->lcc_send_length += msg->msg_len; + counters->send_length += msg->msg_len; break; case LNET_EVENT_GET: @@ -221,17 +215,13 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) break; } - common->lcc_send_count++; + counters->send_count++; incr_stats: if (msg->msg_txpeer) - lnet_incr_stats(&msg->msg_txpeer->lpni_stats, - msg->msg_type, - LNET_STATS_TYPE_SEND); + atomic_inc(&msg->msg_txpeer->lpni_stats.send_count); if (msg->msg_txni) - lnet_incr_stats(&msg->msg_txni->ni_stats, - msg->msg_type, - LNET_STATS_TYPE_SEND); + atomic_inc(&msg->msg_txni->ni_stats.send_count); out: lnet_return_tx_credits_locked(msg); msg->msg_tx_committed = 0; @@ -240,7 +230,7 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) static void lnet_msg_decommit_rx(struct lnet_msg *msg, int status) { - struct lnet_counters_common *common; + struct lnet_counters *counters; struct lnet_event *ev = &msg->msg_ev; 
LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ @@ -249,7 +239,7 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) if (status != 0) goto out; - common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common); + counters = the_lnet.ln_counters[msg->msg_rx_cpt]; switch (ev->type) { default: LASSERT(ev->type == 0); @@ -267,7 +257,7 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ LASSERT(msg->msg_type == LNET_MSG_REPLY || msg->msg_type == LNET_MSG_GET); - common->lcc_send_length += msg->msg_wanted; + counters->send_length += msg->msg_wanted; break; case LNET_EVENT_PUT: @@ -282,19 +272,15 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) break; } - common->lcc_recv_count++; + counters->recv_count++; incr_stats: if (msg->msg_rxpeer) - lnet_incr_stats(&msg->msg_rxpeer->lpni_stats, - msg->msg_type, - LNET_STATS_TYPE_RECV); + atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count); if (msg->msg_rxni) - lnet_incr_stats(&msg->msg_rxni->ni_stats, - msg->msg_type, - LNET_STATS_TYPE_RECV); + atomic_inc(&msg->msg_rxni->ni_stats.recv_count); if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) - common->lcc_recv_length += msg->msg_wanted; + counters->recv_length += msg->msg_wanted; out: lnet_return_rx_credits_locked(msg); @@ -327,7 +313,7 @@ lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) list_del(&msg->msg_activelist); msg->msg_onactivelist = 0; - the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--; + the_lnet.ln_counters[cpt2]->msgs_alloc--; if (cpt2 != cpt) { lnet_net_unlock(cpt2); @@ -363,6 +349,29 @@ lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, lnet_md_deconstruct(md, &msg->msg_ev.md); } +void +lnet_msg_detach_md(struct lnet_msg *msg, int status) +{ + struct lnet_libmd *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + static int lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) { @@ -439,549 +448,14 @@ lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) return 0; } -static void -lnet_dec_healthv_locked(atomic_t *healthv) -{ - int h = atomic_read(healthv); - - if (h < lnet_health_sensitivity) { - atomic_set(healthv, 0); - } else { - h -= lnet_health_sensitivity; - atomic_set(healthv, h); - } -} - -static void -lnet_handle_local_failure(struct lnet_msg *msg) -{ - struct lnet_ni *local_ni; - - local_ni = msg->msg_txni; - - /* - * the lnet_net_lock(0) is used to protect the addref on the ni - * and the recovery queue. - */ - lnet_net_lock(0); - /* the mt could've shutdown and cleaned up the queues */ - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { - lnet_net_unlock(0); - return; - } - - lnet_dec_healthv_locked(&local_ni->ni_healthv); - /* - * add the NI to the recovery queue if it's not already there - * and it's health value is actually below the maximum. It's - * possible that the sensitivity might be set to 0, and the health - * value will not be reduced. In this case, there is no reason to - * invoke recovery - */ - if (list_empty(&local_ni->ni_recovery) && - atomic_read(&local_ni->ni_healthv) < LNET_MAX_HEALTH_VALUE) { - CDEBUG(D_NET, "ni %s added to recovery queue. 
Health = %d\n", - libcfs_nid2str(local_ni->ni_nid), - atomic_read(&local_ni->ni_healthv)); - list_add_tail(&local_ni->ni_recovery, - &the_lnet.ln_mt_localNIRecovq); - lnet_ni_addref_locked(local_ni, 0); - } - lnet_net_unlock(0); -} - -void -lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) -{ - /* lpni could be NULL if we're in the LOLND case */ - if (!lpni) - return; - - lnet_dec_healthv_locked(&lpni->lpni_healthv); - /* - * add the peer NI to the recovery queue if it's not already there - * and it's health value is actually below the maximum. It's - * possible that the sensitivity might be set to 0, and the health - * value will not be reduced. In this case, there is no reason to - * invoke recovery - */ - lnet_peer_ni_add_to_recoveryq_locked(lpni); -} - -static void -lnet_handle_remote_failure(struct lnet_peer_ni *lpni) -{ - /* lpni could be NULL if we're in the LOLND case */ - if (!lpni) - return; - - lnet_net_lock(0); - /* the mt could've shutdown and cleaned up the queues */ - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { - lnet_net_unlock(0); - return; - } - lnet_handle_remote_failure_locked(lpni); - lnet_net_unlock(0); -} - -static void -lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus) -{ - struct lnet_ni *ni = msg->msg_txni; - struct lnet_peer_ni *lpni = msg->msg_txpeer; - struct lnet_counters_health *health; - - health = &the_lnet.ln_counters[0]->lct_health; - - switch (hstatus) { - case LNET_MSG_STATUS_LOCAL_INTERRUPT: - atomic_inc(&ni->ni_hstats.hlt_local_interrupt); - health->lch_local_interrupt_count++; - break; - case LNET_MSG_STATUS_LOCAL_DROPPED: - atomic_inc(&ni->ni_hstats.hlt_local_dropped); - health->lch_local_dropped_count++; - break; - case LNET_MSG_STATUS_LOCAL_ABORTED: - atomic_inc(&ni->ni_hstats.hlt_local_aborted); - health->lch_local_aborted_count++; - break; - case LNET_MSG_STATUS_LOCAL_NO_ROUTE: - atomic_inc(&ni->ni_hstats.hlt_local_no_route); - health->lch_local_no_route_count++; - break; - case LNET_MSG_STATUS_LOCAL_TIMEOUT: - atomic_inc(&ni->ni_hstats.hlt_local_timeout); - health->lch_local_timeout_count++; - break; - case LNET_MSG_STATUS_LOCAL_ERROR: - atomic_inc(&ni->ni_hstats.hlt_local_error); - health->lch_local_error_count++; - break; - case LNET_MSG_STATUS_REMOTE_DROPPED: - if (lpni) - atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped); - health->lch_remote_dropped_count++; - break; - case LNET_MSG_STATUS_REMOTE_ERROR: - if (lpni) - atomic_inc(&lpni->lpni_hstats.hlt_remote_error); - health->lch_remote_error_count++; - break; - case LNET_MSG_STATUS_REMOTE_TIMEOUT: - if (lpni) - atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout); - health->lch_remote_timeout_count++; - break; - case LNET_MSG_STATUS_NETWORK_TIMEOUT: - if (lpni) - atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); - health->lch_network_timeout_count++; - break; - case LNET_MSG_STATUS_OK: - break; - default: - LBUG(); - } -} - -static void -lnet_resend_msg_locked(struct lnet_msg *msg) -{ - msg->msg_retry_count++; - - /* - * remove message from the active list and reset it to prepare - * for a resend. Two exceptions to this - * - * 1. the router case. When a message is being routed it is - * committed for rx when received and committed for tx when - * forwarded. We don't want to remove it from the active list, since - * code which handles receiving expects it to remain on the active - * list. - * - * 2. The REPLY case. Reply messages use the same message - * structure for the GET that was received. 
- */ - if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) { - list_del_init(&msg->msg_activelist); - msg->msg_onactivelist = 0; - } - /* - * The msg_target.nid which was originally set - * when calling LNetGet() or LNetPut() might've - * been overwritten if we're routing this message. - * Call lnet_msg_decommit_tx() to return the credit - * this message consumed. The message will - * consume another credit when it gets resent. - */ - msg->msg_target.nid = msg->msg_hdr.dest_nid; - lnet_msg_decommit_tx(msg, -EAGAIN); - msg->msg_sending = 0; - msg->msg_receiving = 0; - msg->msg_target_is_router = 0; - - CDEBUG(D_NET, "%s->%s:%s:%s - queuing msg (%p) for resend\n", - libcfs_nid2str(msg->msg_hdr.src_nid), - libcfs_nid2str(msg->msg_hdr.dest_nid), - lnet_msgtyp2str(msg->msg_type), - lnet_health_error2str(msg->msg_health_status), msg); - - list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]); - - wake_up(&the_lnet.ln_mt_waitq); -} - -int -lnet_check_finalize_recursion_locked(struct lnet_msg *msg, - struct list_head *containerq, - int nworkers, void **workers) -{ - int my_slot = -1; - int i; - - list_add_tail(&msg->msg_list, containerq); - - for (i = 0; i < nworkers; i++) { - if (workers[i] == current) - break; - - if (my_slot < 0 && workers[i] == NULL) - my_slot = i; - } - - if (i < nworkers || my_slot < 0) - return -1; - - workers[my_slot] = current; - - return my_slot; -} - -int -lnet_attempt_msg_resend(struct lnet_msg *msg) -{ - struct lnet_msg_container *container; - int my_slot; - int cpt; - - /* we can only resend tx_committed messages */ - LASSERT(msg->msg_tx_committed); - - /* don't resend recovery messages */ - if (msg->msg_recovery) { - CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n", - libcfs_nid2str(msg->msg_from), - libcfs_nid2str(msg->msg_target.nid), - msg->msg_retry_count); - return -ENOTRECOVERABLE; - } - - /* - * if we explicitly indicated we don't want to resend then just - * return - */ - if (msg->msg_no_resend) { - CDEBUG(D_NET, "msg %s->%s requested no resend. retry# %d\n", - libcfs_nid2str(msg->msg_from), - libcfs_nid2str(msg->msg_target.nid), - msg->msg_retry_count); - return -ENOTRECOVERABLE; - } - - /* check if the message has exceeded the number of retries */ - if (msg->msg_retry_count >= lnet_retry_count) { - CNETERR("msg %s->%s exceeded retry count %d\n", - libcfs_nid2str(msg->msg_from), - libcfs_nid2str(msg->msg_target.nid), - msg->msg_retry_count); - return -ENOTRECOVERABLE; - } - - cpt = msg->msg_tx_cpt; - lnet_net_lock(cpt); - - /* check again under lock */ - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { - lnet_net_unlock(cpt); - return -ESHUTDOWN; - } - - container = the_lnet.ln_msg_containers[cpt]; - my_slot = - lnet_check_finalize_recursion_locked(msg, - &container->msc_resending, - container->msc_nfinalizers, - container->msc_resenders); - - /* enough threads are resending */ - if (my_slot == -1) { - lnet_net_unlock(cpt); - return 0; - } - - while (!list_empty(&container->msc_resending)) { - msg = list_entry(container->msc_resending.next, - struct lnet_msg, msg_list); - list_del(&msg->msg_list); - - /* - * resending the message will require us to call - * lnet_msg_decommit_tx() which will return the credit - * which this message holds. This could trigger another - * queued message to be sent. If that message fails and - * requires a resend we will recurse. - * But since at this point the slot is taken, the message - * will be queued in the container and dealt with - * later. This breaks the recursion. 
- */ - lnet_resend_msg_locked(msg); - } - - /* - * msc_resenders is an array of process pointers. Each entry holds - * a pointer to the current process operating on the message. An - * array entry is created per CPT. If the array slot is already - * set, then it means that there is a thread on the CPT currently - * resending a message. - * Once the thread finishes clear the slot to enable the thread to - * take on more resend work. - */ - container->msc_resenders[my_slot] = NULL; - lnet_net_unlock(cpt); - - return 0; -} - -/* - * Do a health check on the message: - * return -1 if we're not going to handle the error or - * if we've reached the maximum number of retries. - * success case will return -1 as well - * return 0 if it the message is requeued for send - */ -static int -lnet_health_check(struct lnet_msg *msg) -{ - enum lnet_msg_hstatus hstatus = msg->msg_health_status; - bool lo = false; - - /* if we're shutting down no point in handling health. */ - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) - return -1; - - LASSERT(msg->msg_txni); - - /* - * if we're sending to the LOLND then the msg_txpeer will not be - * set. So no need to sanity check it. - */ - if (msg->msg_txni->ni_nid != LNET_NID_LO_0) - LASSERT(msg->msg_txpeer); - else - lo = true; - - if (hstatus != LNET_MSG_STATUS_OK && - ktime_compare(ktime_get(), msg->msg_deadline) >= 0) - return -1; - - /* - * stats are only incremented for errors so avoid wasting time - * incrementing statistics if there is no error. - */ - if (hstatus != LNET_MSG_STATUS_OK) { - lnet_net_lock(0); - lnet_incr_hstats(msg, hstatus); - lnet_net_unlock(0); - } - - CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", - libcfs_nid2str(msg->msg_txni->ni_nid), - (lo) ? "self" : libcfs_nid2str(msg->msg_txpeer->lpni_nid), - lnet_msgtyp2str(msg->msg_type), - lnet_health_error2str(hstatus)); - - switch (hstatus) { - case LNET_MSG_STATUS_OK: - lnet_inc_healthv(&msg->msg_txni->ni_healthv); - /* - * It's possible msg_txpeer is NULL in the LOLND - * case. - */ - if (msg->msg_txpeer) - lnet_inc_healthv(&msg->msg_txpeer->lpni_healthv); - - /* we can finalize this message */ - return -1; - case LNET_MSG_STATUS_LOCAL_INTERRUPT: - case LNET_MSG_STATUS_LOCAL_DROPPED: - case LNET_MSG_STATUS_LOCAL_ABORTED: - case LNET_MSG_STATUS_LOCAL_NO_ROUTE: - case LNET_MSG_STATUS_LOCAL_TIMEOUT: - lnet_handle_local_failure(msg); - /* add to the re-send queue */ - return lnet_attempt_msg_resend(msg); - - /* - * These errors will not trigger a resend so simply - * finalize the message - */ - case LNET_MSG_STATUS_LOCAL_ERROR: - lnet_handle_local_failure(msg); - return -1; - - /* - * TODO: since the remote dropped the message we can - * attempt a resend safely. 
- */ - case LNET_MSG_STATUS_REMOTE_DROPPED: - lnet_handle_remote_failure(msg->msg_txpeer); - return lnet_attempt_msg_resend(msg); - - case LNET_MSG_STATUS_REMOTE_ERROR: - case LNET_MSG_STATUS_REMOTE_TIMEOUT: - case LNET_MSG_STATUS_NETWORK_TIMEOUT: - lnet_handle_remote_failure(msg->msg_txpeer); - return -1; - default: - LBUG(); - } - - /* no resend is needed */ - return -1; -} - -static void -lnet_msg_detach_md(struct lnet_msg *msg, int cpt, int status) -{ - struct lnet_libmd *md = msg->msg_md; - int unlink; - - /* Now it's safe to drop my caller's ref */ - md->md_refcount--; - LASSERT(md->md_refcount >= 0); - - unlink = lnet_md_unlinkable(md); - if (md->md_eq != NULL) { - msg->msg_ev.status = status; - msg->msg_ev.unlinked = unlink; - lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); - } - - if (unlink || (md->md_refcount == 0 && - md->md_threshold == LNET_MD_THRESH_INF)) - lnet_detach_rsp_tracker(md, cpt); - - if (unlink) - lnet_md_unlink(md); - - msg->msg_md = NULL; -} - -static bool -lnet_is_health_check(struct lnet_msg *msg) -{ - bool hc; - int status = msg->msg_ev.status; - - if ((!msg->msg_tx_committed && !msg->msg_rx_committed) || - !msg->msg_onactivelist) { - CDEBUG(D_NET, "msg %p not committed for send or receive\n", - msg); - return false; - } - - if ((msg->msg_tx_committed && !msg->msg_txpeer) || - (msg->msg_rx_committed && !msg->msg_rxpeer)) { - CDEBUG(D_NET, "msg %p failed too early to retry and send\n", - msg); - return false; - } - - /* - * perform a health check for any message committed for transmit - */ - hc = msg->msg_tx_committed; - - /* Check for status inconsistencies */ - if (hc && - ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) || - (status && msg->msg_health_status == LNET_MSG_STATUS_OK))) { - CDEBUG(D_NET, "Msg %p is in inconsistent state, don't perform health " - "checking (%d, %d)\n", msg, status, - msg->msg_health_status); - hc = false; - } - - CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n", - hc, status, msg->msg_health_status); - - return hc; -} - -char * -lnet_health_error2str(enum lnet_msg_hstatus hstatus) -{ - switch (hstatus) { - case LNET_MSG_STATUS_LOCAL_INTERRUPT: - return "LOCAL_INTERRUPT"; - case LNET_MSG_STATUS_LOCAL_DROPPED: - return "LOCAL_DROPPED"; - case LNET_MSG_STATUS_LOCAL_ABORTED: - return "LOCAL_ABORTED"; - case LNET_MSG_STATUS_LOCAL_NO_ROUTE: - return "LOCAL_NO_ROUTE"; - case LNET_MSG_STATUS_LOCAL_TIMEOUT: - return "LOCAL_TIMEOUT"; - case LNET_MSG_STATUS_LOCAL_ERROR: - return "LOCAL_ERROR"; - case LNET_MSG_STATUS_REMOTE_DROPPED: - return "REMOTE_DROPPED"; - case LNET_MSG_STATUS_REMOTE_ERROR: - return "REMOTE_ERROR"; - case LNET_MSG_STATUS_REMOTE_TIMEOUT: - return "REMOTE_TIMEOUT"; - case LNET_MSG_STATUS_NETWORK_TIMEOUT: - return "NETWORK_TIMEOUT"; - case LNET_MSG_STATUS_OK: - return "OK"; - default: - return ""; - } -} - -bool -lnet_send_error_simulation(struct lnet_msg *msg, - enum lnet_msg_hstatus *hstatus) -{ - if (!msg) - return false; - - if (list_empty(&the_lnet.ln_drop_rules)) - return false; - - /* match only health rules */ - if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus)) - return false; - - CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n", - libcfs_nid2str(msg->msg_hdr.src_nid), - libcfs_nid2str(msg->msg_hdr.dest_nid), - lnet_msgtyp2str(msg->msg_type), - lnet_health_error2str(*hstatus)); - - return true; -} -EXPORT_SYMBOL(lnet_send_error_simulation); - void lnet_finalize(struct lnet_msg *msg, int status) { - struct lnet_msg_container *container; - int my_slot; - int cpt; - int rc; + 
struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + int i; LASSERT(!in_interrupt()); @@ -990,35 +464,16 @@ lnet_finalize(struct lnet_msg *msg, int status) msg->msg_ev.status = status; - if (lnet_is_health_check(msg)) { - /* - * Check the health status of the message. If it has one - * of the errors that we're supposed to handle, and it has - * not timed out, then - * 1. Decrement the appropriate health_value - * 2. queue the message on the resend queue - - * if the message send is success, timed out or failed in the - * health check for any reason then we'll just finalize the - * message. Otherwise just return since the message has been - * put on the resend queue. - */ - if (!lnet_health_check(msg)) - return; - } - - /* - * We're not going to resend this message so detach its MD and invoke - * the appropriate callbacks - */ if (msg->msg_md != NULL) { cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + lnet_res_lock(cpt); - lnet_msg_detach_md(msg, cpt, status); + lnet_msg_detach_md(msg, status); lnet_res_unlock(cpt); } -again: + again: + rc = 0; if (!msg->msg_tx_committed && !msg->msg_rx_committed) { /* not committed to network yet */ LASSERT(!msg->msg_onactivelist); @@ -1035,26 +490,32 @@ lnet_finalize(struct lnet_msg *msg, int status) lnet_net_lock(cpt); container = the_lnet.ln_msg_containers[cpt]; + list_add_tail(&msg->msg_list, &container->msc_finalizing); /* Recursion breaker. Don't complete the message here if I am (or * enough other threads are) already completing messages */ - my_slot = lnet_check_finalize_recursion_locked(msg, - &container->msc_finalizing, - container->msc_nfinalizers, - container->msc_finalizers); - /* enough threads are resending */ - if (my_slot == -1) { + my_slot = -1; + for (i = 0; i < container->msc_nfinalizers; i++) { + if (container->msc_finalizers[i] == current) + break; + + if (my_slot < 0 && container->msc_finalizers[i] == NULL) + my_slot = i; + } + + if (i < container->msc_nfinalizers || my_slot < 0) { lnet_net_unlock(cpt); return; } - rc = 0; + container->msc_finalizers[my_slot] = current; + while (!list_empty(&container->msc_finalizing)) { msg = list_entry(container->msc_finalizing.next, struct lnet_msg, msg_list); - list_del_init(&msg->msg_list); + list_del(&msg->msg_list); /* NB drops and regains the lnet lock if it actually does * anything, so my finalizing friends can chomp along too */ @@ -1092,7 +553,7 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) struct lnet_msg, msg_activelist); LASSERT(msg->msg_onactivelist); msg->msg_onactivelist = 0; - list_del_init(&msg->msg_activelist); + list_del(&msg->msg_activelist); lnet_msg_free(msg); count++; } @@ -1106,13 +567,6 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) sizeof(*container->msc_finalizers)); container->msc_finalizers = NULL; } - - if (container->msc_resenders != NULL) { - LIBCFS_FREE(container->msc_resenders, - container->msc_nfinalizers * - sizeof(*container->msc_resenders)); - container->msc_resenders = NULL; - } container->msc_init = 0; } @@ -1125,7 +579,6 @@ lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) INIT_LIST_HEAD(&container->msc_active); INIT_LIST_HEAD(&container->msc_finalizing); - INIT_LIST_HEAD(&container->msc_resending); /* number of CPUs */ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); @@ -1142,16 +595,6 @@ lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) return -ENOMEM; } - LIBCFS_CPT_ALLOC(container->msc_resenders, lnet_cpt_table(), cpt, 
- container->msc_nfinalizers * - sizeof(*container->msc_resenders)); - - if (container->msc_resenders == NULL) { - CERROR("Failed to allocate message resenders\n"); - lnet_msg_container_cleanup(container); - return -ENOMEM; - } - return rc; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c index 75a352dec6ff8..3773ed9e2436c 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index ba330c6d2af1c..973587a2a1dc5 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2015, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,9 +40,9 @@ #include #include -#include #include #include +#include #include /* @@ -66,6 +66,20 @@ #define SO_RCVTIMEO SO_RCVTIMEO_OLD #endif +static int +lnet_sock_create_kern(struct socket **sock, struct net *ns) +{ + int rc; + +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, sock); +#endif + + return rc; +} + int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) { @@ -172,17 +186,13 @@ lnet_sock_create(struct socket **sockp, int *fatal, __u32 local_ip, int local_port, struct net *ns) { struct sockaddr_in locaddr; - struct socket *sock; - int rc; + struct socket *sock; + int rc; /* All errors are fatal except bind failure if the port is in use */ *fatal = 1; -#ifdef HAVE_SOCK_CREATE_KERN_USE_NET - rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, &sock); -#else - rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); -#endif + rc = lnet_sock_create_kern(&sock, ns); *sockp = sock; if (rc != 0) { CERROR("Can't create socket: %d\n", rc); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c index a11ecddb08349..eaa06fb41631d 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lo.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -22,8 +22,6 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. - * - * Copyright (c) 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c index 676f7345ca576..a7190dd79d002 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/module.c +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,9 +31,8 @@ */ #define DEBUG_SUBSYSTEM S_LNET - #include -#include +#include static int config_on_load = 0; module_param(config_on_load, int, 0444); @@ -172,45 +171,36 @@ lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) } static int -lnet_ioctl(struct notifier_block *nb, - unsigned long cmd, void *vdata) +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) { - struct libcfs_ioctl_hdr *hdr = vdata; - int rc; + int rc; switch (cmd) { case IOC_LIBCFS_CONFIGURE: { struct libcfs_ioctl_data *data = (struct libcfs_ioctl_data *)hdr; - if (data->ioc_hdr.ioc_len < sizeof(*data)) { - rc = -EINVAL; - } else { - the_lnet.ln_nis_from_mod_params = data->ioc_flags; - rc = lnet_configure(NULL); - } - break; + if (data->ioc_hdr.ioc_len < sizeof(*data)) + return -EINVAL; + + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + return lnet_configure(NULL); } case IOC_LIBCFS_UNCONFIGURE: - rc = lnet_unconfigure(); - break; + return lnet_unconfigure(); case IOC_LIBCFS_ADD_NET: - rc = lnet_dyn_configure_net(hdr); - break; + return lnet_dyn_configure_net(hdr); case IOC_LIBCFS_DEL_NET: - rc = lnet_dyn_unconfigure_net(hdr); - break; + return lnet_dyn_unconfigure_net(hdr); case IOC_LIBCFS_ADD_LOCAL_NI: - rc = lnet_dyn_configure_ni(hdr); - break; + return lnet_dyn_configure_ni(hdr); case IOC_LIBCFS_DEL_LOCAL_NI: - rc = lnet_dyn_unconfigure_ni(hdr); - break; + return lnet_dyn_unconfigure_ni(hdr); default: /* Passing LNET_PID_ANY only gives me a ref if the net is up @@ -221,14 +211,11 @@ lnet_ioctl(struct notifier_block *nb, rc = LNetCtl(cmd, hdr); LNetNIFini(); } - break; + return rc; } - return notifier_from_ioctl_errno(rc); } -static struct notifier_block lnet_ioctl_handler = { - .notifier_call = lnet_ioctl, -}; +DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); static int __init lnet_init(void) { @@ -243,8 +230,7 @@ static int __init lnet_init(void) RETURN(rc); } - rc = blocking_notifier_chain_register(&libcfs_ioctl_list, - &lnet_ioctl_handler); + rc = libcfs_register_ioctl(&lnet_ioctl_handler); LASSERT(rc == 0); if (config_on_load) { @@ -260,8 +246,7 @@ static void __exit lnet_exit(void) { int rc; - rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list, - &lnet_ioctl_handler); + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); LASSERT(rc == 0); lnet_lib_exit(); diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c index e2172da009db5..b3d5b907a827b 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,7 @@ #define DEBUG_SUBSYSTEM S_LNET #include -#include +#include #define LNET_MSG_MASK (LNET_PUT_BIT | LNET_ACK_BIT | \ LNET_GET_BIT | LNET_REPLY_BIT) @@ -57,9 +57,9 @@ struct lnet_drop_rule { /** * seconds to drop the next message, it's exclusive with dr_drop_at */ - time64_t dr_drop_time; + cfs_time_t dr_drop_time; /** baseline to caculate dr_drop_time */ - time64_t dr_time_base; + cfs_time_t dr_time_base; /** statistic of dropped messages */ struct lnet_fault_stat dr_stat; }; @@ -170,9 +170,9 @@ lnet_drop_rule_add(struct lnet_fault_attr *attr) rule->dr_attr = *attr; if (attr->u.drop.da_interval != 0) { - rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; - rule->dr_drop_time = ktime_get_seconds() + - cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base = cfs_time_shift(attr->u.drop.da_interval); + rule->dr_drop_time = cfs_time_shift(cfs_rand() % + attr->u.drop.da_interval); } else { rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; } @@ -283,9 +283,10 @@ lnet_drop_rule_reset(void) if (attr->u.drop.da_rate != 0) { rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; } else { - rule->dr_drop_time = ktime_get_seconds() + - cfs_rand() % attr->u.drop.da_interval; - rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + rule->dr_drop_time = cfs_time_shift(cfs_rand() % + attr->u.drop.da_interval); + rule->dr_time_base = cfs_time_shift(attr->u.drop. + da_interval); } spin_unlock(&rule->dr_lock); } @@ -294,58 +295,13 @@ lnet_drop_rule_reset(void) EXIT; } -static void -lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) -{ - unsigned int random; - int choice; - int delta; - int best_delta; - int i; - - /* assign a random failure */ - random = cfs_rand(); - choice = random % (LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); - if (choice == 0) - choice++; - - if (mask == HSTATUS_RANDOM) { - *hstatus = choice; - return; - } - - if (mask & (1 << choice)) { - *hstatus = choice; - return; - } - - /* round to the closest ON bit */ - i = HSTATUS_END; - best_delta = HSTATUS_END; - while (i > 0) { - if (mask & (1 << i)) { - delta = choice - i; - if (delta < 0) - delta *= -1; - if (delta < best_delta) { - best_delta = delta; - choice = i; - } - } - i--; - } - - *hstatus = choice; -} - /** * check source/destination NID, portal, message type and drop rate, * decide whether should drop this message or not */ static bool drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, - lnet_nid_t dst, unsigned int type, unsigned int portal, - enum lnet_msg_hstatus *hstatus) + lnet_nid_t dst, unsigned int type, unsigned int portal) { struct lnet_fault_attr *attr = &rule->dr_attr; bool drop; @@ -353,36 +309,24 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; - /* - * if we're trying to match a health status error but it hasn't - * been set in the rule, then don't match - */ - if ((hstatus && !attr->u.drop.da_health_error_mask) || - (!hstatus && attr->u.drop.da_health_error_mask)) - return false; - /* match this rule, check drop rate now */ spin_lock(&rule->dr_lock); - if (attr->u.drop.da_random) { - int value = cfs_rand() % attr->u.drop.da_interval; - if (value >= (attr->u.drop.da_interval / 2)) - drop = true; - else - drop = false; - } else if (rule->dr_drop_time != 0) { /* time based drop */ - time64_t now = ktime_get_seconds(); + if (rule->dr_drop_time != 0) { /* time based drop */ + cfs_time_t now = 
cfs_time_current(); rule->dr_stat.fs_count++; - drop = now >= rule->dr_drop_time; + drop = cfs_time_aftereq(now, rule->dr_drop_time); if (drop) { - if (now > rule->dr_time_base) + if (cfs_time_after(now, rule->dr_time_base)) rule->dr_time_base = now; rule->dr_drop_time = rule->dr_time_base + - cfs_rand() % attr->u.drop.da_interval; - rule->dr_time_base += attr->u.drop.da_interval; + cfs_time_seconds(cfs_rand() % + attr->u.drop.da_interval); + rule->dr_time_base += cfs_time_seconds(attr->u.drop. + da_interval); - CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n", + CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %ld\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dr_drop_time); @@ -403,9 +347,6 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, } if (drop) { /* drop this message, update counters */ - if (hstatus) - lnet_fault_match_health(hstatus, - attr->u.drop.da_health_error_mask); lnet_fault_stat_inc(&rule->dr_stat, type); rule->dr_stat.u.drop.ds_dropped++; } @@ -418,15 +359,15 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, * Check if message from \a src to \a dst can match any existed drop rule */ bool -lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus) +lnet_drop_rule_match(struct lnet_hdr *hdr) { - lnet_nid_t src = le64_to_cpu(hdr->src_nid); - lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); - unsigned int typ = le32_to_cpu(hdr->type); - struct lnet_drop_rule *rule; - unsigned int ptl = -1; - bool drop = false; - int cpt; + struct lnet_drop_rule *rule; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + unsigned int ptl = -1; + bool drop = false; + int cpt; /* NB: if Portal is specified, then only PUT and GET will be * filtered by drop rule */ @@ -437,13 +378,12 @@ lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus) cpt = lnet_net_lock_current(); list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { - drop = drop_rule_match(rule, src, dst, typ, ptl, - hstatus); + drop = drop_rule_match(rule, src, dst, typ, ptl); if (drop) break; } - lnet_net_unlock(cpt); + lnet_net_unlock(cpt); return drop; } @@ -472,9 +412,9 @@ struct lnet_delay_rule { /** * seconds to delay the next message, it's exclusive with dl_delay_at */ - time64_t dl_delay_time; + cfs_time_t dl_delay_time; /** baseline to caculate dl_delay_time */ - time64_t dl_time_base; + cfs_time_t dl_time_base; /** jiffies to send the next delayed message */ unsigned long dl_msg_send; /** delayed message list */ @@ -504,6 +444,13 @@ struct delay_daemon_data { static struct delay_daemon_data delay_dd; +static cfs_time_t +round_timeout(cfs_time_t timeout) +{ + return cfs_time_seconds((unsigned int) + cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + static void delay_rule_decref(struct lnet_delay_rule *rule) { @@ -525,9 +472,8 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, lnet_nid_t dst, unsigned int type, unsigned int portal, struct lnet_msg *msg) { - struct lnet_fault_attr *attr = &rule->dl_attr; - bool delay; - time64_t now = ktime_get_seconds(); + struct lnet_fault_attr *attr = &rule->dl_attr; + bool delay; if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; @@ -535,17 +481,21 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, /* match this rule, check delay rate now */ spin_lock(&rule->dl_lock); if (rule->dl_delay_time != 0) { /* time based delay */ + cfs_time_t now = cfs_time_current(); + 
rule->dl_stat.fs_count++; - delay = now >= rule->dl_delay_time; + delay = cfs_time_aftereq(now, rule->dl_delay_time); if (delay) { - if (now > rule->dl_time_base) + if (cfs_time_after(now, rule->dl_time_base)) rule->dl_time_base = now; rule->dl_delay_time = rule->dl_time_base + - cfs_rand() % attr->u.delay.la_interval; - rule->dl_time_base += attr->u.delay.la_interval; + cfs_time_seconds(cfs_rand() % + attr->u.delay.la_interval); + rule->dl_time_base += cfs_time_seconds(attr->u.delay. + la_interval); - CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n", + CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %ld\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dl_delay_time); @@ -576,11 +526,11 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, rule->dl_stat.u.delay.ls_delayed++; list_add_tail(&msg->msg_list, &rule->dl_msg_list); - msg->msg_delay_send = now + attr->u.delay.la_latency; + msg->msg_delay_send = round_timeout( + cfs_time_shift(attr->u.delay.la_latency)); if (rule->dl_msg_send == -1) { rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, - jiffies + cfs_time_seconds(attr->u.delay.la_latency)); + mod_timer(&rule->dl_timer, rule->dl_msg_send); } spin_unlock(&rule->dl_lock); @@ -624,7 +574,7 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, { struct lnet_msg *msg; struct lnet_msg *tmp; - time64_t now = ktime_get_seconds(); + unsigned long now = cfs_time_current(); if (!all && rule->dl_msg_send > now) return; @@ -648,9 +598,7 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, msg = list_entry(rule->dl_msg_list.next, struct lnet_msg, msg_list); rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, - jiffies + - cfs_time_seconds(msg->msg_delay_send - now)); + mod_timer(&rule->dl_timer, rule->dl_msg_send); } spin_unlock(&rule->dl_lock); } @@ -666,20 +614,6 @@ delayed_msg_process(struct list_head *msg_list, bool drop) int rc; msg = list_entry(msg_list->next, struct lnet_msg, msg_list); - - if (msg->msg_sending) { - /* Delayed send */ - list_del_init(&msg->msg_list); - ni = msg->msg_txni; - CDEBUG(D_NET, "TRACE: msg %p %s -> %s : %s\n", msg, - libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(msg->msg_txpeer->lpni_nid), - lnet_msgtyp2str(msg->msg_type)); - lnet_ni_send(ni, msg); - continue; - } - - /* Delayed receive */ LASSERT(msg->msg_rxpeer != NULL); LASSERT(msg->msg_rxni != NULL); @@ -704,7 +638,7 @@ delayed_msg_process(struct list_head *msg_list, bool drop) case LNET_CREDIT_OK: lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, msg->msg_len, msg->msg_len); - /* fallthrough */ + /* Fall through */ case LNET_CREDIT_WAIT: continue; default: /* failures */ @@ -712,8 +646,7 @@ delayed_msg_process(struct list_head *msg_list, bool drop) } } - lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len, - msg->msg_type); + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len); lnet_finalize(msg, rc); } } @@ -849,10 +782,9 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr) rule->dl_attr = *attr; if (attr->u.delay.la_interval != 0) { - rule->dl_time_base = ktime_get_seconds() + - attr->u.delay.la_interval; - rule->dl_delay_time = ktime_get_seconds() + - cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base = cfs_time_shift(attr->u.delay.la_interval); + rule->dl_delay_time = cfs_time_shift(cfs_rand() % + attr->u.delay.la_interval); } else { rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; } @@ -1003,10 +935,10 @@ lnet_delay_rule_reset(void) if (attr->u.delay.la_rate != 0) { rule->dl_delay_at = 
cfs_rand() % attr->u.delay.la_rate; } else { - rule->dl_delay_time = ktime_get_seconds() + - cfs_rand() % attr->u.delay.la_interval; - rule->dl_time_base = ktime_get_seconds() + - attr->u.delay.la_interval; + rule->dl_delay_time = cfs_time_shift(cfs_rand() % + attr->u.delay.la_interval); + rule->dl_time_base = cfs_time_shift(attr->u.delay. + la_interval); } spin_unlock(&rule->dl_lock); } diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c index fe3add7b9701c..5122a2e6b5d81 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,7 @@ #define DEBUG_SUBSYSTEM S_LNET #include -#include +#include /* max value for numeric network address */ #define MAX_NUMERIC_VALUE 0xffffffff diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c index c2d64d140702e..612af87d47692 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/peer.c +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,19 +34,8 @@ #define DEBUG_SUBSYSTEM S_LNET -#include -#ifdef HAVE_SCHED_HEADERS -#include -#endif -#include - #include -#include - -/* Value indicating that recovery needs to re-check a peer immediately. */ -#define LNET_REDISCOVER_PEER (1) - -static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); +#include static void lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) @@ -138,8 +127,6 @@ lnet_peer_tables_create(void) spin_lock_init(&ptable->pt_zombie_lock); INIT_LIST_HEAD(&ptable->pt_zombie_list); - INIT_LIST_HEAD(&ptable->pt_peer_list); - for (j = 0; j < LNET_PEER_HASH_SIZE; j++) INIT_LIST_HEAD(&hash[j]); ptable->pt_hash = hash; /* sign of initialization */ @@ -165,19 +152,17 @@ lnet_peer_ni_alloc(lnet_nid_t nid) INIT_LIST_HEAD(&lpni->lpni_rtrq); INIT_LIST_HEAD(&lpni->lpni_routes); INIT_LIST_HEAD(&lpni->lpni_hashlist); - INIT_LIST_HEAD(&lpni->lpni_peer_nis); - INIT_LIST_HEAD(&lpni->lpni_recovery); + INIT_LIST_HEAD(&lpni->lpni_on_peer_net_list); INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); - LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); spin_lock_init(&lpni->lpni_lock); lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ - lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */ + lpni->lpni_last_alive = cfs_time_current(); /* assumes alive */ lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; lpni->lpni_nid = nid; lpni->lpni_cpt = cpt; - atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); + lnet_set_peer_ni_health_locked(lpni, true); net = lnet_get_net_locked(LNET_NIDNET(nid)); lpni->lpni_net = net; @@ -199,7 +184,7 @@ lnet_peer_ni_alloc(lnet_nid_t nid) &the_lnet.ln_remote_peer_ni_list); } - CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); + /* TODO: update flags */ return lpni; } @@ -213,32 +198,13 @@ lnet_peer_net_alloc(__u32 net_id) if (!lpn) return NULL; - INIT_LIST_HEAD(&lpn->lpn_peer_nets); + INIT_LIST_HEAD(&lpn->lpn_on_peer_list); INIT_LIST_HEAD(&lpn->lpn_peer_nis); lpn->lpn_net_id = net_id; - CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); - return lpn; } -void -lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn) -{ - struct lnet_peer *lp; - - CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); - - LASSERT(atomic_read(&lpn->lpn_refcount) == 0); - LASSERT(list_empty(&lpn->lpn_peer_nis)); - LASSERT(list_empty(&lpn->lpn_peer_nets)); - lp = lpn->lpn_peer; - lpn->lpn_peer = NULL; - LIBCFS_FREE(lpn, sizeof(*lpn)); - - lnet_peer_decref_locked(lp); -} - static struct lnet_peer * lnet_peer_alloc(lnet_nid_t nid) { @@ -248,118 +214,47 @@ lnet_peer_alloc(lnet_nid_t nid) if (!lp) return NULL; - INIT_LIST_HEAD(&lp->lp_peer_list); + INIT_LIST_HEAD(&lp->lp_on_lnet_peer_list); INIT_LIST_HEAD(&lp->lp_peer_nets); - INIT_LIST_HEAD(&lp->lp_dc_list); - INIT_LIST_HEAD(&lp->lp_dc_pendq); - init_waitqueue_head(&lp->lp_dc_waitq); - spin_lock_init(&lp->lp_lock); lp->lp_primary_nid = nid; - lp->lp_disc_src_nid = LNET_NID_ANY; - - /* - * Turn off discovery for loopback peer. If you're creating a peer - * for the loopback interface then that was initiated when we - * attempted to send a message over the loopback. There is no need - * to ever use a different interface when sending messages to - * myself. - */ - if (nid == LNET_NID_LO_0) - lp->lp_state = LNET_PEER_NO_DISCOVERY; - lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); - CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); + /* TODO: update flags */ return lp; } -void -lnet_destroy_peer_locked(struct lnet_peer *lp) -{ - CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); - - LASSERT(atomic_read(&lp->lp_refcount) == 0); - LASSERT(list_empty(&lp->lp_peer_nets)); - LASSERT(list_empty(&lp->lp_peer_list)); - LASSERT(list_empty(&lp->lp_dc_list)); - - if (lp->lp_data) - lnet_ping_buffer_decref(lp->lp_data); - - /* - * if there are messages still on the pending queue, then make - * sure to queue them on the ln_msg_resend list so they can be - * resent at a later point if the discovery thread is still - * running. - * If the discovery thread has stopped, then the wakeup will be a - * no-op, and it is expected the lnet_shutdown_lndnets() will - * eventually be called, which will traverse this list and - * finalize the messages on the list. - * We can not resend them now because we're holding the cpt lock. 
- * Releasing the lock can cause an inconsistent state - */ - spin_lock(&the_lnet.ln_msg_resend_lock); - spin_lock(&lp->lp_lock); - list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); - spin_unlock(&lp->lp_lock); - spin_unlock(&the_lnet.ln_msg_resend_lock); - wake_up(&the_lnet.ln_dc_waitq); - - LIBCFS_FREE(lp, sizeof(*lp)); -} -/* - * Detach a peer_ni from its peer_net. If this was the last peer_ni on - * that peer_net, detach the peer_net from the peer. - * - * Call with lnet_net_lock/EX held - */ static void -lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) +lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) { - struct lnet_peer_table *ptable; - struct lnet_peer_net *lpn; - struct lnet_peer *lp; + struct lnet_peer_net *peer_net; + struct lnet_peer *peer; - /* - * Belts and suspenders: gracefully handle teardown of a - * partially connected peer_ni. - */ - lpn = lpni->lpni_peer_net; + /* TODO: could the below situation happen? accessing an already + * destroyed peer? */ + if (lpni->lpni_peer_net == NULL || + lpni->lpni_peer_net->lpn_peer == NULL) + return; - list_del_init(&lpni->lpni_peer_nis); - /* - * If there are no lpni's left, we detach lpn from - * lp_peer_nets, so it cannot be found anymore. - */ - if (list_empty(&lpn->lpn_peer_nis)) - list_del_init(&lpn->lpn_peer_nets); + peer_net = lpni->lpni_peer_net; + peer = lpni->lpni_peer_net->lpn_peer; - /* Update peer NID count. */ - lp = lpn->lpn_peer; - lp->lp_nnis--; + list_del_init(&lpni->lpni_on_peer_net_list); + lpni->lpni_peer_net = NULL; - /* - * If there are no more peer nets, make the peer unfindable - * via the peer_tables. - * - * Otherwise, if the peer is DISCOVERED, tell discovery to - * take another look at it. This is a no-op if discovery for - * this peer did the detaching. - */ - if (list_empty(&lp->lp_peer_nets)) { - list_del_init(&lp->lp_peer_list); - ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; - ptable->pt_peers--; - } else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { - /* Discovery isn't running, nothing to do here. */ - } else if (lp->lp_state & LNET_PEER_DISCOVERED) { - lnet_peer_queue_for_discovery(lp); - wake_up(&the_lnet.ln_dc_waitq); + /* if peer_net is empty, then remove it from the peer */ + if (list_empty(&peer_net->lpn_peer_nis)) { + list_del_init(&peer_net->lpn_on_peer_list); + peer_net->lpn_peer = NULL; + LIBCFS_FREE(peer_net, sizeof(*peer_net)); + + /* if the peer is empty then remove it from the + * the_lnet.ln_peers */ + if (list_empty(&peer->lp_peer_nets)) { + list_del_init(&peer->lp_on_lnet_peer_list); + LIBCFS_FREE(peer, sizeof(*peer)); + } } - CDEBUG(D_NET, "peer %s NID %s\n", - libcfs_nid2str(lp->lp_primary_nid), - libcfs_nid2str(lpni->lpni_nid)); } /* called with lnet_net_lock LNET_LOCK_EX held */ @@ -380,18 +275,10 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) /* remove peer ni from the hash list. */ list_del_init(&lpni->lpni_hashlist); - /* - * indicate the peer is being deleted so the monitor thread can - * remove it from the recovery queue. - */ - spin_lock(&lpni->lpni_lock); - lpni->lpni_state |= LNET_PEER_NI_DELETING; - spin_unlock(&lpni->lpni_lock); - /* decrement the ref count on the peer table */ ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - LASSERT(ptable->pt_number > 0); - ptable->pt_number--; + LASSERT(atomic_read(&ptable->pt_number) > 0); + atomic_dec(&ptable->pt_number); /* * The peer_ni can no longer be found with a lookup. 
But there @@ -400,7 +287,7 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) * * The last reference may be lost in a place where the * lnet_net_lock locks only a single cpt, and that cpt may not - * be lpni->lpni_cpt. So the zombie list of lnet_peer_table + * be lpni->lpni_cpt. So the zombie list of this peer_table * has its own lock. */ spin_lock(&ptable->pt_zombie_lock); @@ -408,10 +295,10 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies++; spin_unlock(&ptable->pt_zombie_lock); - /* no need to keep this peer_ni on the hierarchy anymore */ - lnet_peer_detach_peer_ni_locked(lpni); + /* no need to keep this peer on the hierarchy anymore */ + lnet_try_destroy_peer_hierarchy_locked(lpni); - /* remove hashlist reference on peer_ni */ + /* decrement reference on peer */ lnet_peer_ni_decref_locked(lpni); return 0; @@ -439,8 +326,6 @@ lnet_peer_del_locked(struct lnet_peer *peer) struct lnet_peer_ni *lpni = NULL, *lpni2; int rc = 0, rc2 = 0; - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid)); - lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); while (lpni != NULL) { lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); @@ -453,71 +338,6 @@ lnet_peer_del_locked(struct lnet_peer *peer) return rc2; } -static int -lnet_peer_del(struct lnet_peer *peer) -{ - lnet_net_lock(LNET_LOCK_EX); - lnet_peer_del_locked(peer); - lnet_net_unlock(LNET_LOCK_EX); - - return 0; -} - -/* - * Delete a NID from a peer. Call with ln_api_mutex held. - * - * Error codes: - * -EPERM: Non-DLC deletion from DLC-configured peer. - * -ENOENT: No lnet_peer_ni corresponding to the nid. - * -ECHILD: The lnet_peer_ni isn't connected to the peer. - * -EBUSY: The lnet_peer_ni is the primary, and not the only peer_ni. - */ -static int -lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) -{ - struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = lp->lp_primary_nid; - int rc = 0; - - if (!(flags & LNET_PEER_CONFIGURED)) { - if (lp->lp_state & LNET_PEER_CONFIGURED) { - rc = -EPERM; - goto out; - } - } - lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - rc = -ENOENT; - goto out; - } - lnet_peer_ni_decref_locked(lpni); - if (lp != lpni->lpni_peer_net->lpn_peer) { - rc = -ECHILD; - goto out; - } - - /* - * This function only allows deletion of the primary NID if it - * is the only NID. 
- */ - if (nid == lp->lp_primary_nid && lp->lp_nnis != 1) { - rc = -EBUSY; - goto out; - } - - lnet_net_lock(LNET_LOCK_EX); - - rc = lnet_peer_ni_del_locked(lpni); - - lnet_net_unlock(LNET_LOCK_EX); - -out: - CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", - libcfs_nid2str(primary_nid), libcfs_nid2str(nid), flags, rc); - - return rc; -} - static void lnet_peer_table_cleanup_locked(struct lnet_net *net, struct lnet_peer_table *ptable) @@ -604,8 +424,8 @@ lnet_peer_table_del_rtrs_locked(struct lnet_net *net, void lnet_peer_tables_cleanup(struct lnet_net *net) { - int i; - struct lnet_peer_table *ptable; + int i; + struct lnet_peer_table *ptable; LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); /* If just deleting the peers for a NI, get rid of any routes these @@ -662,24 +482,42 @@ lnet_find_peer_ni_locked(lnet_nid_t nid) } struct lnet_peer * -lnet_find_peer(lnet_nid_t nid) +lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt) { struct lnet_peer_ni *lpni; - struct lnet_peer *lp = NULL; - int cpt; + struct lnet_peer *lp; - cpt = lnet_net_lock_current(); - lpni = lnet_find_peer_ni_locked(nid); - if (lpni) { - lp = lpni->lpni_peer_net->lpn_peer; - lnet_peer_addref_locked(lp); - lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(dst_nid); + if (!lpni) { + lpni = lnet_nid2peerni_locked(dst_nid, cpt); + if (IS_ERR(lpni)) + return ERR_CAST(lpni); } - lnet_net_unlock(cpt); + + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(lpni); return lp; } +struct lnet_peer_ni * +lnet_get_peer_ni_idx_locked(int idx, struct lnet_peer_net **lpn, + struct lnet_peer **lp) +{ + struct lnet_peer_ni *lpni; + + list_for_each_entry((*lp), &the_lnet.ln_peers, lp_on_lnet_peer_list) { + list_for_each_entry((*lpn), &((*lp)->lp_peer_nets), lpn_on_peer_list) { + list_for_each_entry(lpni, &((*lpn)->lpn_peer_nis), + lpni_on_peer_net_list) + if (idx-- == 0) + return lpni; + } + } + + return NULL; +} + struct lnet_peer_ni * lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, @@ -689,21 +527,18 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *net = peer_net; if (!prev) { - if (!net) { - if (list_empty(&peer->lp_peer_nets)) - return NULL; - + if (!net) net = list_entry(peer->lp_peer_nets.next, struct lnet_peer_net, - lpn_peer_nets); - } + lpn_on_peer_list); lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, - lpni_peer_nis); + lpni_on_peer_net_list); return lpni; } - if (prev->lpni_peer_nis.next == &prev->lpni_peer_net->lpn_peer_nis) { + if (prev->lpni_on_peer_net_list.next == + &prev->lpni_peer_net->lpn_peer_nis) { /* * if you reached the end of the peer ni list and the peer * net is specified then there are no more peer nis in that @@ -716,915 +551,428 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, * we reached the end of this net ni list. move to the * next net */ - if (prev->lpni_peer_net->lpn_peer_nets.next == + if (prev->lpni_peer_net->lpn_on_peer_list.next == &peer->lp_peer_nets) /* no more nets and no more NIs. 
*/ return NULL; /* get the next net */ - net = list_entry(prev->lpni_peer_net->lpn_peer_nets.next, + net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next, struct lnet_peer_net, - lpn_peer_nets); + lpn_on_peer_list); /* get the ni on it */ lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, - lpni_peer_nis); + lpni_on_peer_net_list); return lpni; } /* there are more nis left */ - lpni = list_entry(prev->lpni_peer_nis.next, - struct lnet_peer_ni, lpni_peer_nis); + lpni = list_entry(prev->lpni_on_peer_net_list.next, + struct lnet_peer_ni, lpni_on_peer_net_list); return lpni; } -/* Call with the ln_api_mutex held */ -int lnet_get_peer_list(u32 *countp, u32 *sizep, struct lnet_process_id __user *ids) -{ - struct lnet_process_id id; - struct lnet_peer_table *ptable; - struct lnet_peer *lp; - __u32 count = 0; - __u32 size = 0; - int lncpt; - int cpt; - __u32 i; - int rc; - - rc = -ESHUTDOWN; - if (the_lnet.ln_state != LNET_STATE_RUNNING) - goto done; - - lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); - - /* - * Count the number of peers, and return E2BIG if the buffer - * is too small. We'll also return the desired size. - */ - rc = -E2BIG; - for (cpt = 0; cpt < lncpt; cpt++) { - ptable = the_lnet.ln_peer_tables[cpt]; - count += ptable->pt_peers; - } - size = count * sizeof(*ids); - if (size > *sizep) - goto done; - - /* - * Walk the peer lists and copy out the primary nids. - * This is safe because the peer lists are only modified - * while the ln_api_mutex is held. So we don't need to - * hold the lnet_net_lock as well, and can therefore - * directly call copy_to_user(). - */ - rc = -EFAULT; - memset(&id, 0, sizeof(id)); - id.pid = LNET_PID_LUSTRE; - i = 0; - for (cpt = 0; cpt < lncpt; cpt++) { - ptable = the_lnet.ln_peer_tables[cpt]; - list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { - if (i >= count) - goto done; - id.nid = lp->lp_primary_nid; - if (copy_to_user(&ids[i], &id, sizeof(id))) - goto done; - i++; - } - } - rc = 0; -done: - *countp = count; - *sizep = size; - return rc; -} - -/* - * Start pushes to peers that need to be updated for a configuration - * change on this node. - */ -void -lnet_push_update_to_peers(int force) -{ - struct lnet_peer_table *ptable; - struct lnet_peer *lp; - int lncpt; - int cpt; - - lnet_net_lock(LNET_LOCK_EX); - lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); - for (cpt = 0; cpt < lncpt; cpt++) { - ptable = the_lnet.ln_peer_tables[cpt]; - list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { - if (force) { - spin_lock(&lp->lp_lock); - if (lp->lp_state & LNET_PEER_MULTI_RAIL) - lp->lp_state |= LNET_PEER_FORCE_PUSH; - spin_unlock(&lp->lp_lock); - } - if (lnet_peer_needs_push(lp)) - lnet_peer_queue_for_discovery(lp); - } - } - lnet_net_unlock(LNET_LOCK_EX); - wake_up(&the_lnet.ln_dc_waitq); -} - -/* - * Test whether a ni is a preferred ni for this peer_ni, e.g, whether - * this is a preferred point-to-point path. Call with lnet_net_lock in - * shared mmode. - */ bool -lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid) +lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) { int i; - if (lpni->lpni_pref_nnids == 0) - return false; - if (lpni->lpni_pref_nnids == 1) - return lpni->lpni_pref.nid == nid; for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref.nids[i] == nid) + if (lpni->lpni_pref_nids[i] == ni->ni_nid) return true; } return false; } -/* - * Set a single ni as preferred, provided no preferred ni is already - * defined. 
Only to be used for non-multi-rail peer_ni. - */ -int -lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +lnet_nid_t +lnet_peer_primary_nid_locked(lnet_nid_t nid) { - int rc = 0; + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; - spin_lock(&lpni->lpni_lock); - if (nid == LNET_NID_ANY) { - rc = -EINVAL; - } else if (lpni->lpni_pref_nnids > 0) { - rc = -EPERM; - } else if (lpni->lpni_pref_nnids == 0) { - lpni->lpni_pref.nid = nid; - lpni->lpni_pref_nnids = 1; - lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); } - spin_unlock(&lpni->lpni_lock); - CDEBUG(D_NET, "peer %s nid %s: %d\n", - libcfs_nid2str(lpni->lpni_nid), libcfs_nid2str(nid), rc); - return rc; + return primary_nid; } -/* - * Clear the preferred NID from a non-multi-rail peer_ni, provided - * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). - */ -int -lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) +lnet_nid_t +LNetPrimaryNID(lnet_nid_t nid) { + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; int rc = 0; + int cpt; - spin_lock(&lpni->lpni_lock); - if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { - lpni->lpni_pref_nnids = 0; - lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; - } else if (lpni->lpni_pref_nnids == 0) { - rc = -ENOENT; - } else { - rc = -EPERM; + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(nid, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out_unlock; } - spin_unlock(&lpni->lpni_lock); + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); - CDEBUG(D_NET, "peer %s: %d\n", - libcfs_nid2str(lpni->lpni_nid), rc); - return rc; + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; } +EXPORT_SYMBOL(LNetPrimaryNID); -/* - * Clear the preferred NIDs from a non-multi-rail peer. - */ -void -lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) { - struct lnet_peer_ni *lpni = NULL; - - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) - lnet_peer_ni_clr_non_mr_pref_nid(lpni); + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; } -int -lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +static int +lnet_peer_setup_hierarchy(struct lnet_peer *lp, struct lnet_peer_ni *lpni, + lnet_nid_t nid) { - lnet_nid_t *nids = NULL; - lnet_nid_t *oldnids = NULL; - struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; - int size; - int i; - int rc = 0; - - if (nid == LNET_NID_ANY) { - rc = -EINVAL; - goto out; - } + struct lnet_peer_net *lpn = NULL; + struct lnet_peer_table *ptable; + __u32 net_id = LNET_NIDNET(nid); - if (lpni->lpni_pref_nnids == 1 && lpni->lpni_pref.nid == nid) { - rc = -EEXIST; - goto out; + /* + * Create the peer_ni, peer_net, and peer if they don't exist + * yet. 
+ */ + if (lp) { + lpn = lnet_peer_get_net_locked(lp, net_id); + } else { + lp = lnet_peer_alloc(nid); + if (!lp) + goto out_enomem; } - /* A non-MR node may have only one preferred NI per peer_ni */ - if (lpni->lpni_pref_nnids > 0) { - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - rc = -EPERM; - goto out; - } + if (!lpn) { + lpn = lnet_peer_net_alloc(net_id); + if (!lpn) + goto out_maybe_free_lp; } - if (lpni->lpni_pref_nnids != 0) { - size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); - LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); - if (!nids) { - rc = -ENOMEM; - goto out; - } - for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref.nids[i] == nid) { - LIBCFS_FREE(nids, size); - rc = -EEXIST; - goto out; - } - nids[i] = lpni->lpni_pref.nids[i]; - } - nids[i] = nid; + if (!lpni) { + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_maybe_free_lpn; } + /* Install the new peer_ni */ lnet_net_lock(LNET_LOCK_EX); - spin_lock(&lpni->lpni_lock); - if (lpni->lpni_pref_nnids == 0) { - lpni->lpni_pref.nid = nid; - } else { - oldnids = lpni->lpni_pref.nids; - lpni->lpni_pref.nids = nids; - } - lpni->lpni_pref_nnids++; - lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; - spin_unlock(&lpni->lpni_lock); - lnet_net_unlock(LNET_LOCK_EX); - - if (oldnids) { - size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); - LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); - } -out: - if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) { - spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; - spin_unlock(&lpni->lpni_lock); + /* Add peer_ni to global peer table hash, if necessary. */ + if (list_empty(&lpni->lpni_hashlist)) { + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, + &ptable->pt_hash[lnet_nid2peerhash(nid)]); + ptable->pt_version++; + atomic_inc(&ptable->pt_number); + atomic_inc(&lpni->lpni_refcount); } - CDEBUG(D_NET, "peer %s nid %s: %d\n", - libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); - return rc; -} -int -lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) -{ - lnet_nid_t *nids = NULL; - lnet_nid_t *oldnids = NULL; - struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; - int size; - int i, j; - int rc = 0; + /* Detach the peer_ni from an existing peer, if necessary. */ + if (lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer != lp) + lnet_try_destroy_peer_hierarchy_locked(lpni); - if (lpni->lpni_pref_nnids == 0) { - rc = -ENOENT; - goto out; - } + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + list_add_tail(&lpni->lpni_on_peer_net_list, &lpn->lpn_peer_nis); - if (lpni->lpni_pref_nnids == 1) { - if (lpni->lpni_pref.nid != nid) { - rc = -ENOENT; - goto out; - } - } else if (lpni->lpni_pref_nnids == 2) { - if (lpni->lpni_pref.nids[0] != nid && - lpni->lpni_pref.nids[1] != nid) { - rc = -ENOENT; - goto out; - } - } else { - size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); - LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); - if (!nids) { - rc = -ENOMEM; - goto out; - } - for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref.nids[i] != nid) - continue; - nids[j++] = lpni->lpni_pref.nids[i]; - } - /* Check if we actually removed a nid. 
*/ - if (j == lpni->lpni_pref_nnids) { - LIBCFS_FREE(nids, size); - rc = -ENOENT; - goto out; - } + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + lpn->lpn_peer = lp; + list_add_tail(&lpn->lpn_on_peer_list, &lp->lp_peer_nets); } - lnet_net_lock(LNET_LOCK_EX); - spin_lock(&lpni->lpni_lock); - if (lpni->lpni_pref_nnids == 1) { - lpni->lpni_pref.nid = LNET_NID_ANY; - } else if (lpni->lpni_pref_nnids == 2) { - oldnids = lpni->lpni_pref.nids; - if (oldnids[0] == nid) - lpni->lpni_pref.nid = oldnids[1]; - else - lpni->lpni_pref.nid = oldnids[2]; - } else { - oldnids = lpni->lpni_pref.nids; - lpni->lpni_pref.nids = nids; - } - lpni->lpni_pref_nnids--; - lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; - spin_unlock(&lpni->lpni_lock); + /* Add peer to global peer list */ + if (list_empty(&lp->lp_on_lnet_peer_list)) + list_add_tail(&lp->lp_on_lnet_peer_list, &the_lnet.ln_peers); lnet_net_unlock(LNET_LOCK_EX); - if (oldnids) { - size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); - LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); - } -out: - CDEBUG(D_NET, "peer %s nid %s: %d\n", - libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); - return rc; + return 0; + +out_maybe_free_lpn: + if (list_empty(&lpn->lpn_on_peer_list)) + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_maybe_free_lp: + if (list_empty(&lp->lp_on_lnet_peer_list)) + LIBCFS_FREE(lp, sizeof(*lp)); +out_enomem: + return -ENOMEM; } -lnet_nid_t -lnet_peer_primary_nid_locked(lnet_nid_t nid) +static int +lnet_add_prim_lpni(lnet_nid_t nid) { + int rc = 0; + struct lnet_peer *peer; struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = nid; + LASSERT(nid != LNET_NID_ANY); + + /* + * lookup the NID and its peer + * if the peer doesn't exist, create it. + * if this is a non-MR peer then change its state to MR and exit. + * if this is an MR peer and it's a primary NI: NO-OP. + * if this is an MR peer and it's not a primary NI. Operation not + * allowed. + * + * The adding and deleting of peer nis is being serialized through + * the api_mutex. So we can look up peers with the mutex locked + * safely. Only when we need to change the ptable, do we need to + * exclusively lock the lnet_net_lock() + */ lpni = lnet_find_peer_ni_locked(nid); - if (lpni) { - primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; - lnet_peer_ni_decref_locked(lpni); + if (!lpni) { + rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); + if (rc != 0) + return rc; + lpni = lnet_find_peer_ni_locked(nid); } - return primary_nid; -} - -bool -lnet_is_discovery_disabled_locked(struct lnet_peer *lp) -{ - if (lnet_peer_discovery_disabled) - return true; + LASSERT(lpni); - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) || - (lp->lp_state & LNET_PEER_NO_DISCOVERY)) { - return true; - } + lnet_peer_ni_decref_locked(lpni); - return false; -} + peer = lpni->lpni_peer_net->lpn_peer; -/* - * Peer Discovery - */ -bool -lnet_is_discovery_disabled(struct lnet_peer *lp) -{ - bool rc = false; + /* + * If we found a lpni with the same nid as the NID we're trying to + * create, then we're trying to create an already existing lpni + * that belongs to a different peer + */ + if (peer->lp_primary_nid != nid) + return -EEXIST; - spin_lock(&lp->lp_lock); - rc = lnet_is_discovery_disabled_locked(lp); - spin_unlock(&lp->lp_lock); + /* + * if we found an lpni that is not a multi-rail, which could occur + * if lpni is already created as a non-mr lpni or we just created + * it, then make sure you indicate that this lpni is a primary mr + * capable peer. 
+ * + * TODO: update flags if necessary + */ + if (!peer->lp_multi_rail && peer->lp_primary_nid == nid) + peer->lp_multi_rail = true; return rc; } -lnet_nid_t -LNetPrimaryNID(lnet_nid_t nid) -{ - struct lnet_peer *lp; - struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = nid; - int rc = 0; - int cpt; - - if (nid == LNET_NID_LO_0) - return LNET_NID_LO_0; - - cpt = lnet_net_lock_current(); - lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); - if (IS_ERR(lpni)) { - rc = PTR_ERR(lpni); - goto out_unlock; - } - lp = lpni->lpni_peer_net->lpn_peer; - - while (!lnet_peer_is_uptodate(lp)) { - spin_lock(&lp->lp_lock); - /* force a full discovery cycle */ - lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; - spin_unlock(&lp->lp_lock); - - rc = lnet_discover_peer_locked(lpni, cpt, true); - if (rc) - goto out_decref; - lp = lpni->lpni_peer_net->lpn_peer; - - /* Only try once if discovery is disabled */ - if (lnet_is_discovery_disabled(lp)) - break; - } - primary_nid = lp->lp_primary_nid; -out_decref: - lnet_peer_ni_decref_locked(lpni); -out_unlock: - lnet_net_unlock(cpt); - - CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), - libcfs_nid2str(primary_nid), rc); - return primary_nid; -} -EXPORT_SYMBOL(LNetPrimaryNID); - -struct lnet_peer_net * -lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) -{ - struct lnet_peer_net *peer_net; - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { - if (peer_net->lpn_net_id == net_id) - return peer_net; - } - return NULL; -} - -/* - * Attach a peer_ni to a peer_net and peer. This function assumes - * peer_ni is not already attached to the peer_net/peer. The peer_ni - * may be attached to a different peer, in which case it will be - * properly detached first. The whole operation is done atomically. - * - * Always returns 0. This is the last function called from functions - * that do return an int, so returning 0 here allows the compiler to - * do a tail call. - */ static int -lnet_peer_attach_peer_ni(struct lnet_peer *lp, - struct lnet_peer_net *lpn, - struct lnet_peer_ni *lpni, - unsigned flags) +lnet_add_peer_ni_to_prim_lpni(lnet_nid_t prim_nid, lnet_nid_t nid) { - struct lnet_peer_table *ptable; - - /* Install the new peer_ni */ - lnet_net_lock(LNET_LOCK_EX); - /* Add peer_ni to global peer table hash, if necessary. */ - if (list_empty(&lpni->lpni_hashlist)) { - int hash = lnet_nid2peerhash(lpni->lpni_nid); - - ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); - ptable->pt_version++; - ptable->pt_number++; - /* This is the 1st refcount on lpni. */ - atomic_inc(&lpni->lpni_refcount); - } - - /* Detach the peer_ni from an existing peer, if necessary. 
*/ - if (lpni->lpni_peer_net) { - LASSERT(lpni->lpni_peer_net != lpn); - LASSERT(lpni->lpni_peer_net->lpn_peer != lp); - lnet_peer_detach_peer_ni_locked(lpni); - lnet_peer_net_decref_locked(lpni->lpni_peer_net); - lpni->lpni_peer_net = NULL; - } - - /* Add peer_ni to peer_net */ - lpni->lpni_peer_net = lpn; - list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); - lnet_peer_net_addref_locked(lpn); - - /* Add peer_net to peer */ - if (!lpn->lpn_peer) { - lpn->lpn_peer = lp; - list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); - lnet_peer_addref_locked(lp); - } - - /* Add peer to global peer list, if necessary */ - ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; - if (list_empty(&lp->lp_peer_list)) { - list_add_tail(&lp->lp_peer_list, &ptable->pt_peer_list); - ptable->pt_peers++; - } - - - /* Update peer state */ - spin_lock(&lp->lp_lock); - if (flags & LNET_PEER_CONFIGURED) { - if (!(lp->lp_state & LNET_PEER_CONFIGURED)) - lp->lp_state |= LNET_PEER_CONFIGURED; - } - if (flags & LNET_PEER_MULTI_RAIL) { - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } - } - spin_unlock(&lp->lp_lock); + struct lnet_peer *peer, *primary_peer; + struct lnet_peer_ni *lpni = NULL, *klpni = NULL; - lp->lp_nnis++; - lnet_net_unlock(LNET_LOCK_EX); - - CDEBUG(D_NET, "peer %s NID %s flags %#x\n", - libcfs_nid2str(lp->lp_primary_nid), - libcfs_nid2str(lpni->lpni_nid), flags); + LASSERT(prim_nid != LNET_NID_ANY && nid != LNET_NID_ANY); - return 0; -} + /* + * key nid must be created by this point. If not then this + * operation is not permitted + */ + klpni = lnet_find_peer_ni_locked(prim_nid); + if (!klpni) + return -ENOENT; -/* - * Create a new peer, with nid as its primary nid. - * - * Call with the lnet_api_mutex held. - */ -static int -lnet_peer_add(lnet_nid_t nid, unsigned flags) -{ - struct lnet_peer *lp; - struct lnet_peer_net *lpn; - struct lnet_peer_ni *lpni; - int rc = 0; + lnet_peer_ni_decref_locked(klpni); - LASSERT(nid != LNET_NID_ANY); + primary_peer = klpni->lpni_peer_net->lpn_peer; - /* - * No need for the lnet_net_lock here, because the - * lnet_api_mutex is held. - */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { - /* A peer with this NID already exists. */ - lp = lpni->lpni_peer_net->lpn_peer; lnet_peer_ni_decref_locked(lpni); + + peer = lpni->lpni_peer_net->lpn_peer; /* - * This is an error if the peer was configured and the - * primary NID differs or an attempt is made to change - * the Multi-Rail flag. Otherwise the assumption is - * that an existing peer is being modified. + * lpni already exists in the system but it belongs to + * a different peer. We can't re-added it */ - if (lp->lp_state & LNET_PEER_CONFIGURED) { - if (lp->lp_primary_nid != nid) - rc = -EEXIST; - else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) - rc = -EPERM; - goto out; + if (peer->lp_primary_nid != prim_nid && peer->lp_multi_rail) { + CERROR("Cannot add NID %s owned by peer %s to peer %s\n", + libcfs_nid2str(lpni->lpni_nid), + libcfs_nid2str(peer->lp_primary_nid), + libcfs_nid2str(prim_nid)); + return -EEXIST; + } else if (peer->lp_primary_nid == prim_nid) { + /* + * found a peer_ni that is already part of the + * peer. This is a no-op operation. + */ + return 0; } - /* Delete and recreate as a configured peer. */ - lnet_peer_del(lp); - } - /* Create peer, peer_net, and peer_ni. 
*/ - rc = -ENOMEM; - lp = lnet_peer_alloc(nid); - if (!lp) - goto out; - lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); - if (!lpn) - goto out_free_lp; - lpni = lnet_peer_ni_alloc(nid); - if (!lpni) - goto out_free_lpn; + /* + * TODO: else if (peer->lp_primary_nid != prim_nid && + * !peer->lp_multi_rail) + * peer is not an MR peer and it will be moved in the next + * step to klpni, so update its flags accordingly. + * lnet_move_peer_ni() + */ - return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + /* + * TODO: call lnet_update_peer() from here to update the + * flags. This is the case when the lpni you're trying to + * add is already part of the peer. This could've been + * added by the DD previously, so go ahead and do any + * updates to the state if necessary + */ -out_free_lpn: - LIBCFS_FREE(lpn, sizeof(*lpn)); -out_free_lp: - LIBCFS_FREE(lp, sizeof(*lp)); -out: - CDEBUG(D_NET, "peer %s NID flags %#x: %d\n", - libcfs_nid2str(nid), flags, rc); - return rc; + } + + /* + * When we get here we either have found an existing lpni, which + * we can switch to the new peer. Or we need to create one and + * add it to the new peer + */ + return lnet_peer_setup_hierarchy(primary_peer, lpni, nid); } /* - * Add a NID to a peer. Call with ln_api_mutex held. - * - * Error codes: - * -EPERM: Non-DLC addition to a DLC-configured peer. - * -EEXIST: The NID was configured by DLC for a different peer. - * -ENOMEM: Out of memory. - * -ENOTUNIQ: Adding a second peer NID on a single network on a - * non-multi-rail peer. + * lpni creation initiated due to traffic either sending or receiving. */ static int -lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +lnet_peer_ni_traffic_add(lnet_nid_t nid) { - struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; int rc = 0; - LASSERT(lp); - LASSERT(nid != LNET_NID_ANY); - - /* A configured peer can only be updated through configuration. */ - if (!(flags & LNET_PEER_CONFIGURED)) { - if (lp->lp_state & LNET_PEER_CONFIGURED) { - rc = -EPERM; - goto out; - } - } - - /* - * The MULTI_RAIL flag can be set but not cleared, because - * that would leave the peer struct in an invalid state. - */ - if (flags & LNET_PEER_MULTI_RAIL) { - spin_lock(&lp->lp_lock); - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } - spin_unlock(&lp->lp_lock); - } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - rc = -EPERM; - goto out; - } + if (nid == LNET_NID_ANY) + return -EINVAL; + /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { /* - * A peer_ni already exists. This is only a problem if - * it is not connected to this peer and was configured - * by DLC. + * TODO: lnet_update_primary_nid() but not all of it + * only indicate if we're converting this to MR capable + * Can happen due to DD */ lnet_peer_ni_decref_locked(lpni); - if (lpni->lpni_peer_net->lpn_peer == lp) - goto out; - if (lnet_peer_ni_is_configured(lpni)) { - rc = -EEXIST; - goto out; - } - /* If this is the primary NID, destroy the peer. */ - if (lnet_peer_ni_is_primary(lpni)) { - lnet_peer_del(lpni->lpni_peer_net->lpn_peer); - lpni = lnet_peer_ni_alloc(nid); - if (!lpni) { - rc = -ENOMEM; - goto out; - } - } } else { - lpni = lnet_peer_ni_alloc(nid); - if (!lpni) { - rc = -ENOMEM; - goto out; - } - } - - /* - * Get the peer_net. Check that we're not adding a second - * peer_ni on a peer_net of a non-multi-rail peer. 
- */ - lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); - if (!lpn) { - lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); - if (!lpn) { - rc = -ENOMEM; - goto out_free_lpni; - } - } else if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - rc = -ENOTUNIQ; - goto out_free_lpni; + rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); } - return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); - -out_free_lpni: - /* If the peer_ni was allocated above its peer_net pointer is NULL */ - if (!lpni->lpni_peer_net) - LIBCFS_FREE(lpni, sizeof(*lpni)); -out: - CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", - libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), - flags, rc); return rc; -} - -/* - * Update the primary NID of a peer, if possible. - * - * Call with the lnet_api_mutex held. - */ -static int -lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) -{ - lnet_nid_t old = lp->lp_primary_nid; - int rc = 0; - if (lp->lp_primary_nid == nid) - goto out; - rc = lnet_peer_add_nid(lp, nid, flags); - if (rc) - goto out; - lp->lp_primary_nid = nid; -out: - CDEBUG(D_NET, "peer %s NID %s: %d\n", - libcfs_nid2str(old), libcfs_nid2str(nid), rc); - return rc; } -/* - * lpni creation initiated due to traffic either sending or receiving. - */ static int -lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) +lnet_peer_ni_add_non_mr(lnet_nid_t nid) { - struct lnet_peer *lp; - struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; - unsigned flags = 0; - int rc = 0; - - if (nid == LNET_NID_ANY) { - rc = -EINVAL; - goto out; - } - /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { - /* - * We must have raced with another thread. Since we - * know next to nothing about a peer_ni created by - * traffic, we just assume everything is ok and - * return. - */ + CERROR("Cannot add %s as non-mr when it already exists\n", + libcfs_nid2str(nid)); lnet_peer_ni_decref_locked(lpni); - goto out; + return -EEXIST; } - /* Create peer, peer_net, and peer_ni. */ - rc = -ENOMEM; - lp = lnet_peer_alloc(nid); - if (!lp) - goto out; - lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); - if (!lpn) - goto out_free_lp; - lpni = lnet_peer_ni_alloc(nid); - if (!lpni) - goto out_free_lpn; - if (pref != LNET_NID_ANY) - lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); - - return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); - -out_free_lpn: - LIBCFS_FREE(lpn, sizeof(*lpn)); -out_free_lp: - LIBCFS_FREE(lp, sizeof(*lp)); -out: - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(nid), rc); - return rc; + return lnet_peer_setup_hierarchy(NULL, NULL, nid); } /* - * Implementation of IOC_LIBCFS_ADD_PEER_NI. - * * This API handles the following combinations: - * Create a peer with its primary NI if only the prim_nid is provided - * Add a NID to a peer identified by the prim_nid. The peer identified - * by the prim_nid must already exist. - * The peer being created may be non-MR. - * - * The caller must hold ln_api_mutex. This prevents the peer from - * being created/modified/deleted by a different thread. + * Create a primary NI if only the prim_nid is provided + * Create or add an lpni to a primary NI. Primary NI must've already + * been created + * Create a non-MR peer. 
*/ int -lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) +lnet_add_peer_ni_to_peer(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) { - struct lnet_peer *lp = NULL; - struct lnet_peer_ni *lpni; - unsigned flags; - - /* The prim_nid must always be specified */ - if (prim_nid == LNET_NID_ANY) - return -EINVAL; - - flags = LNET_PEER_CONFIGURED; - if (mr) - flags |= LNET_PEER_MULTI_RAIL; - /* - * If nid isn't specified, we must create a new peer with - * prim_nid as its primary nid. + * Caller trying to setup an MR like peer hierarchy but + * specifying it to be non-MR. This is not allowed. */ - if (nid == LNET_NID_ANY) - return lnet_peer_add(prim_nid, flags); - - /* Look up the prim_nid, which must exist. */ - lpni = lnet_find_peer_ni_locked(prim_nid); - if (!lpni) - return -ENOENT; - lnet_peer_ni_decref_locked(lpni); - lp = lpni->lpni_peer_net->lpn_peer; + if (prim_nid != LNET_NID_ANY && + nid != LNET_NID_ANY && !mr) + return -EPERM; - /* Peer must have been configured. */ - if (!(lp->lp_state & LNET_PEER_CONFIGURED)) { - CDEBUG(D_NET, "peer %s was not configured\n", - libcfs_nid2str(prim_nid)); - return -ENOENT; - } + /* Add the primary NID of a peer */ + if (prim_nid != LNET_NID_ANY && + nid == LNET_NID_ANY && mr) + return lnet_add_prim_lpni(prim_nid); - /* Primary NID must match */ - if (lp->lp_primary_nid != prim_nid) { - CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", - libcfs_nid2str(prim_nid), - libcfs_nid2str(lp->lp_primary_nid)); - return -ENODEV; - } + /* Add a NID to an existing peer */ + if (prim_nid != LNET_NID_ANY && + nid != LNET_NID_ANY && mr) + return lnet_add_peer_ni_to_prim_lpni(prim_nid, nid); - /* Multi-Rail flag must match. */ - if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) { - CDEBUG(D_NET, "multi-rail state mismatch for peer %s\n", - libcfs_nid2str(prim_nid)); - return -EPERM; - } + /* Add a non-MR peer NI */ + if (((prim_nid != LNET_NID_ANY && + nid == LNET_NID_ANY) || + (prim_nid == LNET_NID_ANY && + nid != LNET_NID_ANY)) && !mr) + return lnet_peer_ni_add_non_mr(prim_nid != LNET_NID_ANY ? + prim_nid : nid); - return lnet_peer_add_nid(lp, nid, flags); + return 0; } -/* - * Implementation of IOC_LIBCFS_DEL_PEER_NI. - * - * This API handles the following combinations: - * Delete a NI from a peer if both prim_nid and nid are provided. - * Delete a peer if only prim_nid is provided. - * Delete a peer if its primary nid is provided. - * - * The caller must hold ln_api_mutex. This prevents the peer from - * being modified/deleted by a different thread. - */ int -lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) +lnet_del_peer_ni_from_peer(lnet_nid_t prim_nid, lnet_nid_t nid) { - struct lnet_peer *lp; + lnet_nid_t local_nid; + struct lnet_peer *peer; struct lnet_peer_ni *lpni; - unsigned flags; + int rc; if (prim_nid == LNET_NID_ANY) return -EINVAL; - lpni = lnet_find_peer_ni_locked(prim_nid); + local_nid = (nid != LNET_NID_ANY) ? 
nid : prim_nid; + + lpni = lnet_find_peer_ni_locked(local_nid); if (!lpni) - return -ENOENT; + return -EINVAL; lnet_peer_ni_decref_locked(lpni); - lp = lpni->lpni_peer_net->lpn_peer; - if (prim_nid != lp->lp_primary_nid) { - CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", - libcfs_nid2str(prim_nid), - libcfs_nid2str(lp->lp_primary_nid)); - return -ENODEV; - } + peer = lpni->lpni_peer_net->lpn_peer; + LASSERT(peer != NULL); + + if (peer->lp_primary_nid == lpni->lpni_nid) { + /* + * deleting the primary ni is equivalent to deleting the + * entire peer + */ + lnet_net_lock(LNET_LOCK_EX); + rc = lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); - if (nid == LNET_NID_ANY || nid == lp->lp_primary_nid) - return lnet_peer_del(lp); + return rc; + } - flags = LNET_PEER_CONFIGURED; - if (lp->lp_state & LNET_PEER_MULTI_RAIL) - flags |= LNET_PEER_MULTI_RAIL; + lnet_net_lock(LNET_LOCK_EX); + rc = lnet_peer_ni_del_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); - return lnet_peer_del_nid(lp, nid, flags); + return rc; } void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) { struct lnet_peer_table *ptable; - struct lnet_peer_net *lpn; - - CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); LASSERT(atomic_read(&lpni->lpni_refcount) == 0); LASSERT(lpni->lpni_rtr_refcount == 0); LASSERT(list_empty(&lpni->lpni_txq)); LASSERT(lpni->lpni_txqnob == 0); - LASSERT(list_empty(&lpni->lpni_peer_nis)); - LASSERT(list_empty(&lpni->lpni_on_remote_peer_ni_list)); - lpn = lpni->lpni_peer_net; - lpni->lpni_peer_net = NULL; lpni->lpni_net = NULL; /* remove the peer ni from the zombie list */ @@ -1634,13 +982,7 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies--; spin_unlock(&ptable->pt_zombie_lock); - if (lpni->lpni_pref_nnids > 1) { - LIBCFS_FREE(lpni->lpni_pref.nids, - sizeof(*lpni->lpni_pref.nids) * lpni->lpni_pref_nnids); - } LIBCFS_FREE(lpni, sizeof(*lpni)); - - lnet_peer_net_decref_locked(lpn); } struct lnet_peer_ni * @@ -1662,7 +1004,7 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) lnet_net_unlock(cpt); - rc = lnet_peer_ni_traffic_add(nid, LNET_NID_ANY); + rc = lnet_peer_ni_traffic_add(nid); if (rc) { lpni = ERR_PTR(rc); goto out_net_relock; @@ -1677,12 +1019,8 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) return lpni; } -/* - * Get a peer_ni for the given nid, create it if necessary. Takes a - * hold on the peer_ni. - */ struct lnet_peer_ni * -lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) +lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) { struct lnet_peer_ni *lpni = NULL; int rc; @@ -1721,7 +1059,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) goto out_mutex_unlock; } - rc = lnet_peer_ni_traffic_add(nid, pref); + rc = lnet_peer_ni_traffic_add(nid); if (rc) { lpni = ERR_PTR(rc); goto out_mutex_unlock; @@ -1734,1615 +1072,20 @@ lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) mutex_unlock(&the_lnet.ln_api_mutex); lnet_net_lock(cpt); - /* Lock has been dropped, check again for shutdown. */ - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - if (!IS_ERR(lpni)) - lnet_peer_ni_decref_locked(lpni); - lpni = ERR_PTR(-ESHUTDOWN); - } - return lpni; } -bool -lnet_peer_is_uptodate(struct lnet_peer *lp) +void +lnet_debug_peer(lnet_nid_t nid) { - bool rc; - - spin_lock(&lp->lp_lock); - rc = lnet_peer_is_uptodate_locked(lp); - spin_unlock(&lp->lp_lock); - return rc; -} - -/* - * Is a peer uptodate from the point of view of discovery? - * - * If it is currently being processed, obviously not. 
- * A forced Ping or Push is also handled by the discovery thread. - * - * Otherwise look at whether the peer needs rediscovering. - */ -bool -lnet_peer_is_uptodate_locked(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - bool rc; - - if (lp->lp_state & (LNET_PEER_DISCOVERING | - LNET_PEER_FORCE_PING | - LNET_PEER_FORCE_PUSH)) { - rc = false; - } else if (lp->lp_state & LNET_PEER_REDISCOVER) { - rc = false; - } else if (lnet_peer_needs_push(lp)) { - rc = false; - } else if (lp->lp_state & LNET_PEER_DISCOVERED) { - if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) - rc = true; - else - rc = false; - } else { - rc = false; - } - - return rc; -} - -/* - * Queue a peer for the attention of the discovery thread. Call with - * lnet_net_lock/EX held. Returns 0 if the peer was queued, and - * -EALREADY if the peer was already queued. - */ -static int lnet_peer_queue_for_discovery(struct lnet_peer *lp) -{ - int rc; - - spin_lock(&lp->lp_lock); - if (!(lp->lp_state & LNET_PEER_DISCOVERING)) - lp->lp_state |= LNET_PEER_DISCOVERING; - spin_unlock(&lp->lp_lock); - if (list_empty(&lp->lp_dc_list)) { - lnet_peer_addref_locked(lp); - list_add_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); - wake_up(&the_lnet.ln_dc_waitq); - rc = 0; - } else { - rc = -EALREADY; - } - - CDEBUG(D_NET, "Queue peer %s: %d\n", - libcfs_nid2str(lp->lp_primary_nid), rc); - - return rc; -} - -/* - * Discovery of a peer is complete. Wake all waiters on the peer. - * Call with lnet_net_lock/EX held. - */ -static void lnet_peer_discovery_complete(struct lnet_peer *lp) -{ - struct lnet_msg *msg, *tmp; - int rc = 0; - struct list_head pending_msgs; - - INIT_LIST_HEAD(&pending_msgs); - - CDEBUG(D_NET, "Discovery complete. Dequeue peer %s\n", - libcfs_nid2str(lp->lp_primary_nid)); - - list_del_init(&lp->lp_dc_list); - spin_lock(&lp->lp_lock); - list_splice_init(&lp->lp_dc_pendq, &pending_msgs); - spin_unlock(&lp->lp_lock); - wake_up_all(&lp->lp_dc_waitq); - - lnet_net_unlock(LNET_LOCK_EX); - - /* iterate through all pending messages and send them again */ - list_for_each_entry_safe(msg, tmp, &pending_msgs, msg_list) { - list_del_init(&msg->msg_list); - if (lp->lp_dc_error) { - lnet_finalize(msg, lp->lp_dc_error); - continue; - } - - CDEBUG(D_NET, "sending pending message %s to target %s\n", - lnet_msgtyp2str(msg->msg_type), - libcfs_id2str(msg->msg_target)); - rc = lnet_send(msg->msg_src_nid_param, msg, - msg->msg_rtr_nid_param); - if (rc < 0) { - CNETERR("Error sending %s to %s: %d\n", - lnet_msgtyp2str(msg->msg_type), - libcfs_id2str(msg->msg_target), rc); - lnet_finalize(msg, rc); - } - } - lnet_net_lock(LNET_LOCK_EX); - lnet_peer_decref_locked(lp); -} - -/* - * Handle inbound push. - * Like any event handler, called with lnet_res_lock/CPT held. - */ -void lnet_peer_push_event(struct lnet_event *ev) -{ - struct lnet_ping_buffer *pbuf = ev->md.user_ptr; - struct lnet_peer *lp; - - /* lnet_find_peer() adds a refcount */ - lp = lnet_find_peer(ev->source.nid); - if (!lp) { - CDEBUG(D_NET, "Push Put from unknown %s (source %s). Ignoring...\n", - libcfs_nid2str(ev->initiator.nid), - libcfs_nid2str(ev->source.nid)); - return; - } - - /* Ensure peer state remains consistent while we modify it. */ - spin_lock(&lp->lp_lock); - - /* - * If some kind of error happened the contents of the message - * cannot be used. Clear the NIDS_UPTODATE and set the - * FORCE_PING flag to trigger a ping. 
- */ - if (ev->status) { - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - lp->lp_state |= LNET_PEER_FORCE_PING; - CDEBUG(D_NET, "Push Put error %d from %s (source %s)\n", - ev->status, - libcfs_nid2str(lp->lp_primary_nid), - libcfs_nid2str(ev->source.nid)); - goto out; - } - - /* - * A push with invalid or corrupted info. Clear the UPTODATE - * flag to trigger a ping. - */ - if (lnet_ping_info_validate(&pbuf->pb_info)) { - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - lp->lp_state |= LNET_PEER_FORCE_PING; - CDEBUG(D_NET, "Corrupted Push from %s\n", - libcfs_nid2str(lp->lp_primary_nid)); - goto out; - } - - /* - * Make sure we'll allocate the correct size ping buffer when - * pinging the peer. - */ - if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) - lp->lp_data_nnis = pbuf->pb_info.pi_nnis; - - /* - * A non-Multi-Rail peer is not supposed to be capable of - * sending a push. - */ - if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)) { - CERROR("Push from non-Multi-Rail peer %s dropped\n", - libcfs_nid2str(lp->lp_primary_nid)); - goto out; - } - - /* - * The peer may have discovery disabled at its end. Set - * NO_DISCOVERY as appropriate. - */ - if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { - CDEBUG(D_NET, "Peer %s has discovery disabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state |= LNET_PEER_NO_DISCOVERY; - } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { - CDEBUG(D_NET, "Peer %s has discovery enabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; - } - - /* - * Update the MULTI_RAIL flag based on the push. If the peer - * was configured with DLC then the setting should match what - * DLC put in. - * NB: We verified above that the MR feature bit is set in pi_features - */ - if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - CDEBUG(D_NET, "peer %s(%p) is MR\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - } else if (lp->lp_state & LNET_PEER_CONFIGURED) { - CWARN("Push says %s is Multi-Rail, DLC says not\n", - libcfs_nid2str(lp->lp_primary_nid)); - } else if (lnet_peer_discovery_disabled) { - CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { - CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - } else { - CDEBUG(D_NET, "peer %s(%p) is MR capable\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } - - /* - * Check for truncation of the Put message. Clear the - * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, - * and tell discovery to allocate a bigger buffer. - */ - if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { - if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) - the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - lp->lp_state |= LNET_PEER_FORCE_PING; - CDEBUG(D_NET, "Truncated Push from %s (%d nids)\n", - libcfs_nid2str(lp->lp_primary_nid), - pbuf->pb_info.pi_nnis); - goto out; - } - - /* always assume new data */ - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - - /* - * If there is data present that hasn't been processed yet, - * we'll replace it if the Put contained newer data and it - * fits. We're racing with a Ping or earlier Push in this - * case. 
- */ - if (lp->lp_state & LNET_PEER_DATA_PRESENT) { - if (LNET_PING_BUFFER_SEQNO(pbuf) > - LNET_PING_BUFFER_SEQNO(lp->lp_data) && - pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { - memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, - LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); - CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf), - LNET_PING_BUFFER_SEQNO(lp->lp_data)); - } - goto out; - } - - /* - * Allocate a buffer to copy the data. On a failure we drop - * the Push and set FORCE_PING to force the discovery - * thread to fix the problem by pinging the peer. - */ - lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); - if (!lp->lp_data) { - lp->lp_state |= LNET_PEER_FORCE_PING; - CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf)); - goto out; - } - - /* Success */ - memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, - LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); - lp->lp_state |= LNET_PEER_DATA_PRESENT; - CDEBUG(D_NET, "Received Push %s %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf)); - -out: - /* - * Queue the peer for discovery if not done, force it on the request - * queue and wake the discovery thread if the peer was already queued, - * because its status changed. - */ - spin_unlock(&lp->lp_lock); - lnet_net_lock(LNET_LOCK_EX); - if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) { - list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); - wake_up(&the_lnet.ln_dc_waitq); - } - /* Drop refcount from lookup */ - lnet_peer_decref_locked(lp); - lnet_net_unlock(LNET_LOCK_EX); -} - -/* - * Clear the discovery error state, unless we're already discovering - * this peer, in which case the error is current. - */ -static void lnet_peer_clear_discovery_error(struct lnet_peer *lp) -{ - spin_lock(&lp->lp_lock); - if (!(lp->lp_state & LNET_PEER_DISCOVERING)) - lp->lp_dc_error = 0; - spin_unlock(&lp->lp_lock); -} - -/* - * Peer discovery slow path. The ln_api_mutex is held on entry, and - * dropped/retaken within this function. An lnet_peer_ni is passed in - * because discovery could tear down an lnet_peer. - */ -int -lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) -{ - DEFINE_WAIT(wait); - struct lnet_peer *lp; - int rc = 0; - int count = 0; - -again: - lnet_net_unlock(cpt); - lnet_net_lock(LNET_LOCK_EX); - lp = lpni->lpni_peer_net->lpn_peer; - lnet_peer_clear_discovery_error(lp); - - /* - * We're willing to be interrupted. The lpni can become a - * zombie if we race with DLC, so we must check for that. - */ - for (;;) { - /* Keep lp alive when the lnet_net_lock is unlocked */ - lnet_peer_addref_locked(lp); - prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); - if (signal_pending(current)) - break; - if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) - break; - /* - * Don't repeat discovery if discovery is disabled. This is - * done to ensure we can use discovery as a standard ping as - * well for backwards compatibility with routers which do not - * have discovery or have discovery disabled - */ - if (lnet_is_discovery_disabled(lp) && count > 0) - break; - if (lp->lp_dc_error) - break; - if (lnet_peer_is_uptodate(lp)) - break; - lnet_peer_queue_for_discovery(lp); - count++; - CDEBUG(D_NET, "Discovery attempt # %d\n", count); - - /* - * If caller requested a non-blocking operation then - * return immediately. 
Once discovery is complete any - * pending messages that were stopped due to discovery - * will be transmitted. - */ - if (!block) - break; - - lnet_net_unlock(LNET_LOCK_EX); - schedule(); - finish_wait(&lp->lp_dc_waitq, &wait); - lnet_net_lock(LNET_LOCK_EX); - lnet_peer_decref_locked(lp); - /* Peer may have changed */ - lp = lpni->lpni_peer_net->lpn_peer; - } - finish_wait(&lp->lp_dc_waitq, &wait); - - lnet_net_unlock(LNET_LOCK_EX); - lnet_net_lock(cpt); - lnet_peer_decref_locked(lp); - /* - * The peer may have changed, so re-check and rediscover if that turns - * out to have been the case. The reference count on lp ensured that - * even if it was unlinked from lpni the memory could not be recycled. - * Thus the check below is sufficient to determine whether the peer - * changed. If the peer changed, then lp must not be dereferenced. - */ - if (lp != lpni->lpni_peer_net->lpn_peer) - goto again; - - if (signal_pending(current)) - rc = -EINTR; - else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) - rc = -ESHUTDOWN; - else if (lp->lp_dc_error) - rc = lp->lp_dc_error; - else if (!block) - CDEBUG(D_NET, "non-blocking discovery\n"); - else if (!lnet_peer_is_uptodate(lp) && !lnet_is_discovery_disabled(lp)) - goto again; - - CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", - (lp ? libcfs_nid2str(lp->lp_primary_nid) : "(none)"), - libcfs_nid2str(lpni->lpni_nid), rc, - (!block) ? "pending discovery" : "discovery complete"); - - return rc; -} - -/* Handle an incoming ack for a push. */ -static void -lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) -{ - struct lnet_ping_buffer *pbuf; - - pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); - spin_lock(&lp->lp_lock); - lp->lp_state &= ~LNET_PEER_PUSH_SENT; - lp->lp_push_error = ev->status; - if (ev->status) - lp->lp_state |= LNET_PEER_PUSH_FAILED; - else - lp->lp_node_seqno = LNET_PING_BUFFER_SEQNO(pbuf); - spin_unlock(&lp->lp_lock); - - CDEBUG(D_NET, "peer %s ev->status %d\n", - libcfs_nid2str(lp->lp_primary_nid), ev->status); -} - -/* Handle a Reply message. This is the reply to a Ping message. */ -static void -lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) -{ - struct lnet_ping_buffer *pbuf; - int rc; - - spin_lock(&lp->lp_lock); - - lp->lp_disc_src_nid = ev->target.nid; - - /* - * If some kind of error happened the contents of message - * cannot be used. Set PING_FAILED to trigger a retry. - */ - if (ev->status) { - lp->lp_state |= LNET_PEER_PING_FAILED; - lp->lp_ping_error = ev->status; - CDEBUG(D_NET, "Ping Reply error %d from %s (source %s)\n", - ev->status, - libcfs_nid2str(lp->lp_primary_nid), - libcfs_nid2str(ev->source.nid)); - goto out; - } - - pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); - if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) - lnet_swap_pinginfo(pbuf); - - /* - * A reply with invalid or corrupted info. Set PING_FAILED to - * trigger a retry. - */ - rc = lnet_ping_info_validate(&pbuf->pb_info); - if (rc) { - lp->lp_state |= LNET_PEER_PING_FAILED; - lp->lp_ping_error = 0; - CDEBUG(D_NET, "Corrupted Ping Reply from %s: %d\n", - libcfs_nid2str(lp->lp_primary_nid), rc); - goto out; - } - - /* The peer may have discovery disabled at its end. Set - * NO_DISCOVERY as appropriate. 
- */ - if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) && - !lnet_peer_discovery_disabled) { - CDEBUG(D_NET, "Peer %s has discovery enabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; - } else { - CDEBUG(D_NET, "Peer %s has discovery disabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state |= LNET_PEER_NO_DISCOVERY; - } - - /* - * Update the MULTI_RAIL flag based on the reply. If the peer - * was configured with DLC then the setting should match what - * DLC put in. - */ - if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { - if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - CDEBUG(D_NET, "peer %s(%p) is MR\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - } else if (lp->lp_state & LNET_PEER_CONFIGURED) { - CWARN("Reply says %s is Multi-Rail, DLC says not\n", - libcfs_nid2str(lp->lp_primary_nid)); - } else if (lnet_peer_discovery_disabled) { - CDEBUG(D_NET, - "peer %s(%p) not MR: DD disabled locally\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { - CDEBUG(D_NET, - "peer %s(%p) not MR: DD disabled remotely\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - } else { - CDEBUG(D_NET, "peer %s(%p) is MR capable\n", - libcfs_nid2str(lp->lp_primary_nid), lp); - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } - } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - if (lp->lp_state & LNET_PEER_CONFIGURED) { - CWARN("DLC says %s is Multi-Rail, Reply says not\n", - libcfs_nid2str(lp->lp_primary_nid)); - } else { - CERROR("Multi-Rail state vanished from %s\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_MULTI_RAIL; - } - } - - /* - * Make sure we'll allocate the correct size ping buffer when - * pinging the peer. - */ - if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) - lp->lp_data_nnis = pbuf->pb_info.pi_nnis; - - /* - * Check for truncation of the Reply. Clear PING_SENT and set - * PING_FAILED to trigger a retry. - */ - if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { - if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) - the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; - lp->lp_state |= LNET_PEER_PING_FAILED; - lp->lp_ping_error = 0; - CDEBUG(D_NET, "Truncated Reply from %s (%d nids)\n", - libcfs_nid2str(lp->lp_primary_nid), - pbuf->pb_info.pi_nnis); - goto out; - } - - /* - * Check the sequence numbers in the reply. These are only - * available if the reply came from a Multi-Rail peer. - */ - if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && - pbuf->pb_info.pi_nnis > 1 && - lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid) { - if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) - CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf), - lp->lp_peer_seqno); - - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); - } - - /* We're happy with the state of the data in the buffer. */ - CDEBUG(D_NET, "peer %s data present %u\n", - libcfs_nid2str(lp->lp_primary_nid), lp->lp_peer_seqno); - if (lp->lp_state & LNET_PEER_DATA_PRESENT) - lnet_ping_buffer_decref(lp->lp_data); - else - lp->lp_state |= LNET_PEER_DATA_PRESENT; - lnet_ping_buffer_addref(pbuf); - lp->lp_data = pbuf; -out: - lp->lp_state &= ~LNET_PEER_PING_SENT; - spin_unlock(&lp->lp_lock); -} - -/* - * Send event handling. 
Only matters for error cases, where we clean - * up state on the peer and peer_ni that would otherwise be updated in - * the REPLY event handler for a successful Ping, and the ACK event - * handler for a successful Push. - */ -static int -lnet_discovery_event_send(struct lnet_peer *lp, struct lnet_event *ev) -{ - int rc = 0; - - if (!ev->status) - goto out; - - spin_lock(&lp->lp_lock); - if (ev->msg_type == LNET_MSG_GET) { - lp->lp_state &= ~LNET_PEER_PING_SENT; - lp->lp_state |= LNET_PEER_PING_FAILED; - lp->lp_ping_error = ev->status; - } else { /* ev->msg_type == LNET_MSG_PUT */ - lp->lp_state &= ~LNET_PEER_PUSH_SENT; - lp->lp_state |= LNET_PEER_PUSH_FAILED; - lp->lp_push_error = ev->status; - } - spin_unlock(&lp->lp_lock); - rc = LNET_REDISCOVER_PEER; -out: - CDEBUG(D_NET, "%s Send to %s: %d\n", - (ev->msg_type == LNET_MSG_GET ? "Ping" : "Push"), - libcfs_nid2str(ev->target.nid), rc); - return rc; -} - -/* - * Unlink event handling. This event is only seen if a call to - * LNetMDUnlink() caused the event to be unlinked. If this call was - * made after the event was set up in LNetGet() or LNetPut() then we - * assume the Ping or Push timed out. - */ -static void -lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev) -{ - spin_lock(&lp->lp_lock); - /* We've passed through LNetGet() */ - if (lp->lp_state & LNET_PEER_PING_SENT) { - lp->lp_state &= ~LNET_PEER_PING_SENT; - lp->lp_state |= LNET_PEER_PING_FAILED; - lp->lp_ping_error = -ETIMEDOUT; - CDEBUG(D_NET, "Ping Unlink for message to peer %s\n", - libcfs_nid2str(lp->lp_primary_nid)); - } - /* We've passed through LNetPut() */ - if (lp->lp_state & LNET_PEER_PUSH_SENT) { - lp->lp_state &= ~LNET_PEER_PUSH_SENT; - lp->lp_state |= LNET_PEER_PUSH_FAILED; - lp->lp_push_error = -ETIMEDOUT; - CDEBUG(D_NET, "Push Unlink for message to peer %s\n", - libcfs_nid2str(lp->lp_primary_nid)); - } - spin_unlock(&lp->lp_lock); -} - -/* - * Event handler for the discovery EQ. - * - * Called with lnet_res_lock(cpt) held. The cpt is the - * lnet_cpt_of_cookie() of the md handle cookie. - */ -static void lnet_discovery_event_handler(struct lnet_event *event) -{ - struct lnet_peer *lp = event->md.user_ptr; - struct lnet_ping_buffer *pbuf; - int rc; - - /* discovery needs to take another look */ - rc = LNET_REDISCOVER_PEER; - - CDEBUG(D_NET, "Received event: %d\n", event->type); - - switch (event->type) { - case LNET_EVENT_ACK: - lnet_discovery_event_ack(lp, event); - break; - case LNET_EVENT_REPLY: - lnet_discovery_event_reply(lp, event); - break; - case LNET_EVENT_SEND: - /* Only send failure triggers a retry. */ - rc = lnet_discovery_event_send(lp, event); - break; - case LNET_EVENT_UNLINK: - /* LNetMDUnlink() was called */ - lnet_discovery_event_unlink(lp, event); - break; - default: - /* Invalid events. */ - LBUG(); - } - lnet_net_lock(LNET_LOCK_EX); - if (event->unlinked) { - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); - lnet_ping_buffer_decref(pbuf); - lnet_peer_decref_locked(lp); - } - - /* put peer back at end of request queue, if discovery not already - * done */ - if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) && - lnet_peer_queue_for_discovery(lp)) { - list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); - wake_up(&the_lnet.ln_dc_waitq); - } - lnet_net_unlock(LNET_LOCK_EX); -} - -/* - * Build a peer from incoming data. 
- * - * The NIDs in the incoming data are supposed to be structured as follows: - * - loopback - * - primary NID - * - other NIDs in same net - * - NIDs in second net - * - NIDs in third net - * - ... - * This due to the way the list of NIDs in the data is created. - * - * Note that this function will mark the peer uptodate unless an - * ENOMEM is encontered. All other errors are due to a conflict - * between the DLC configuration and what discovery sees. We treat DLC - * as binding, and therefore set the NIDS_UPTODATE flag to prevent the - * peer from becoming stuck in discovery. - */ -static int lnet_peer_merge_data(struct lnet_peer *lp, - struct lnet_ping_buffer *pbuf) -{ - struct lnet_peer_ni *lpni; - lnet_nid_t *curnis = NULL; - lnet_nid_t *addnis = NULL; - lnet_nid_t *delnis = NULL; - unsigned flags; - int ncurnis; - int naddnis; - int ndelnis; - int nnis = 0; - int i; - int j; - int rc; - - flags = LNET_PEER_DISCOVERED; - if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) - flags |= LNET_PEER_MULTI_RAIL; - - nnis = MAX(lp->lp_nnis, pbuf->pb_info.pi_nnis); - LIBCFS_ALLOC(curnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_ALLOC(addnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_ALLOC(delnis, nnis * sizeof(lnet_nid_t)); - if (!curnis || !addnis || !delnis) { - rc = -ENOMEM; - goto out; - } - ncurnis = 0; - naddnis = 0; - ndelnis = 0; - - /* Construct the list of NIDs present in peer. */ - lpni = NULL; - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) - curnis[ncurnis++] = lpni->lpni_nid; - - /* - * Check for NIDs in pbuf not present in curnis[]. - * The loop starts at 1 to skip the loopback NID. - */ - for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { - for (j = 0; j < ncurnis; j++) - if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) - break; - if (j == ncurnis) - addnis[naddnis++] = pbuf->pb_info.pi_ni[i].ns_nid; - } - /* - * Check for NIDs in curnis[] not present in pbuf. - * The nested loop starts at 1 to skip the loopback NID. - * - * But never add the loopback NID to delnis[]: if it is - * present in curnis[] then this peer is for this node. - */ - for (i = 0; i < ncurnis; i++) { - if (curnis[i] == LNET_NID_LO_0) - continue; - for (j = 1; j < pbuf->pb_info.pi_nnis; j++) - if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) - break; - if (j == pbuf->pb_info.pi_nnis) - delnis[ndelnis++] = curnis[i]; - } - - rc = 0; - if (lnet_is_discovery_disabled(lp)) - goto out; - - for (i = 0; i < naddnis; i++) { - rc = lnet_peer_add_nid(lp, addnis[i], flags); - if (rc) { - CERROR("Error adding NID %s to peer %s: %d\n", - libcfs_nid2str(addnis[i]), - libcfs_nid2str(lp->lp_primary_nid), rc); - if (rc == -ENOMEM) - goto out; - } - } - for (i = 0; i < ndelnis; i++) { - rc = lnet_peer_del_nid(lp, delnis[i], flags); - if (rc) { - CERROR("Error deleting NID %s from peer %s: %d\n", - libcfs_nid2str(delnis[i]), - libcfs_nid2str(lp->lp_primary_nid), rc); - if (rc == -ENOMEM) - goto out; - } - } - /* - * Errors other than -ENOMEM are due to peers having been - * configured with DLC. Ignore these because DLC overrides - * Discovery. 
- */ - rc = 0; -out: - LIBCFS_FREE(curnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_FREE(addnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_FREE(delnis, nnis * sizeof(lnet_nid_t)); - lnet_ping_buffer_decref(pbuf); - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); - - if (rc) { - spin_lock(&lp->lp_lock); - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - lp->lp_state |= LNET_PEER_FORCE_PING; - spin_unlock(&lp->lp_lock); - } - return rc; -} - -/* - * The data in pbuf says lp is its primary peer, but the data was - * received by a different peer. Try to update lp with the data. - */ -static int -lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) -{ - struct lnet_handle_md mdh; - - /* Queue lp for discovery, and force it on the request queue. */ - lnet_net_lock(LNET_LOCK_EX); - if (lnet_peer_queue_for_discovery(lp)) - list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); - lnet_net_unlock(LNET_LOCK_EX); - - LNetInvalidateMDHandle(&mdh); - - /* - * Decide whether we can move the peer to the DATA_PRESENT state. - * - * We replace stale data for a multi-rail peer, repair PING_FAILED - * status, and preempt FORCE_PING. - * - * If after that we have DATA_PRESENT, we merge it into this peer. - */ - spin_lock(&lp->lp_lock); - if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - if (lp->lp_peer_seqno < LNET_PING_BUFFER_SEQNO(pbuf)) { - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); - } else if (lp->lp_state & LNET_PEER_DATA_PRESENT) { - lp->lp_state &= ~LNET_PEER_DATA_PRESENT; - lnet_ping_buffer_decref(pbuf); - pbuf = lp->lp_data; - lp->lp_data = NULL; - } - } - if (lp->lp_state & LNET_PEER_DATA_PRESENT) { - lnet_ping_buffer_decref(lp->lp_data); - lp->lp_data = NULL; - lp->lp_state &= ~LNET_PEER_DATA_PRESENT; - } - if (lp->lp_state & LNET_PEER_PING_FAILED) { - mdh = lp->lp_ping_mdh; - LNetInvalidateMDHandle(&lp->lp_ping_mdh); - lp->lp_state &= ~LNET_PEER_PING_FAILED; - lp->lp_ping_error = 0; - } - if (lp->lp_state & LNET_PEER_FORCE_PING) - lp->lp_state &= ~LNET_PEER_FORCE_PING; - lp->lp_state |= LNET_PEER_NIDS_UPTODATE; - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(mdh)) - LNetMDUnlink(mdh); - - if (pbuf) - return lnet_peer_merge_data(lp, pbuf); - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - return 0; -} - -static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) -{ - int i; - - for (i = 0; i < pinfo->pi_nnis; i++) { - if (pinfo->pi_ni[i].ns_nid == nid) - return true; - } - - return false; -} - -/* - * Update a peer using the data received. - */ -static int lnet_peer_data_present(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - struct lnet_ping_buffer *pbuf; - struct lnet_peer_ni *lpni; - lnet_nid_t nid = LNET_NID_ANY; - unsigned flags; - int rc = 0; - - pbuf = lp->lp_data; - lp->lp_data = NULL; - lp->lp_state &= ~LNET_PEER_DATA_PRESENT; - lp->lp_state |= LNET_PEER_NIDS_UPTODATE; - spin_unlock(&lp->lp_lock); - - /* - * Modifications of peer structures are done while holding the - * ln_api_mutex. A global lock is required because we may be - * modifying multiple peer structures, and a mutex greatly - * simplifies memory management. - * - * The actual changes to the data structures must also protect - * against concurrent lookups, for which the lnet_net_lock in - * LNET_LOCK_EX mode is used. 
- */ - mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - rc = -ESHUTDOWN; - goto out; - } - - /* - * If this peer is not on the peer list then it is being torn - * down, and our reference count may be all that is keeping it - * alive. Don't do any work on it. - */ - if (list_empty(&lp->lp_peer_list)) - goto out; - - flags = LNET_PEER_DISCOVERED; - if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) - flags |= LNET_PEER_MULTI_RAIL; - - /* - * Check whether the primary NID in the message matches the - * primary NID of the peer. If it does, update the peer, if - * it it does not, check whether there is already a peer with - * that primary NID. If no such peer exists, try to update - * the primary NID of the current peer (allowed if it was - * created due to message traffic) and complete the update. - * If the peer did exist, hand off the data to it. - * - * The peer for the loopback interface is a special case: this - * is the peer for the local node, and we want to set its - * primary NID to the correct value here. Moreover, this peer - * can show up with only the loopback NID in the ping buffer. - */ - if (pbuf->pb_info.pi_nnis <= 1) - goto out; - nid = pbuf->pb_info.pi_ni[1].ns_nid; - if (lp->lp_primary_nid == LNET_NID_LO_0) { - rc = lnet_peer_set_primary_nid(lp, nid, flags); - if (!rc) - rc = lnet_peer_merge_data(lp, pbuf); - } else if (lp->lp_primary_nid == nid || - (lnet_is_nid_in_ping_info(lp->lp_primary_nid, &pbuf->pb_info) && - lnet_is_discovery_disabled(lp))) { - rc = lnet_peer_merge_data(lp, pbuf); - } else { - lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - rc = lnet_peer_set_primary_nid(lp, nid, flags); - if (rc) { - CERROR("Primary NID error %s versus %s: %d\n", - libcfs_nid2str(lp->lp_primary_nid), - libcfs_nid2str(nid), rc); - } else { - rc = lnet_peer_merge_data(lp, pbuf); - } - } else { - struct lnet_peer *new_lp; - - new_lp = lpni->lpni_peer_net->lpn_peer; - /* if lp has discovery/MR enabled that means new_lp - * should have discovery/MR enabled as well, since - * it's the same peer, which we're about to merge - */ - if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) - new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; - if (lp->lp_state & LNET_PEER_MULTI_RAIL) - new_lp->lp_state |= LNET_PEER_MULTI_RAIL; - rc = lnet_peer_set_primary_data( - lpni->lpni_peer_net->lpn_peer, pbuf); - lnet_peer_ni_decref_locked(lpni); - } - } -out: - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); - mutex_unlock(&the_lnet.ln_api_mutex); - - spin_lock(&lp->lp_lock); - /* Tell discovery to re-check the peer immediately. */ - if (!rc) - rc = LNET_REDISCOVER_PEER; - return rc; -} - -/* - * A ping failed. Clear the PING_FAILED state and set the - * FORCE_PING state, to ensure a retry even if discovery is - * disabled. This avoids being left with incorrect state. - */ -static int lnet_peer_ping_failed(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - struct lnet_handle_md mdh; - int rc; - - mdh = lp->lp_ping_mdh; - LNetInvalidateMDHandle(&lp->lp_ping_mdh); - lp->lp_state &= ~LNET_PEER_PING_FAILED; - lp->lp_state |= LNET_PEER_FORCE_PING; - rc = lp->lp_ping_error; - lp->lp_ping_error = 0; - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(mdh)) - LNetMDUnlink(mdh); - - CDEBUG(D_NET, "peer %s:%d\n", - libcfs_nid2str(lp->lp_primary_nid), rc); - - spin_lock(&lp->lp_lock); - return rc ? rc : LNET_REDISCOVER_PEER; -} - -/* - * Select NID to send a Ping or Push to. 
- */ -static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) -{ - struct lnet_peer_ni *lpni; - - /* Look for a direct-connected NID for this peer. */ - lpni = NULL; - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) - continue; - break; - } - if (lpni) - return lpni->lpni_nid; - - /* Look for a routed-connected NID for this peer. */ - lpni = NULL; - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - if (!lnet_find_rnet_locked(lpni->lpni_peer_net->lpn_net_id)) - continue; - break; - } - if (lpni) - return lpni->lpni_nid; - - return LNET_NID_ANY; -} - -/* Active side of ping. */ -static int lnet_peer_send_ping(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - lnet_nid_t pnid; - int nnis; - int rc; - int cpt; - - lp->lp_state |= LNET_PEER_PING_SENT; - lp->lp_state &= ~LNET_PEER_FORCE_PING; - spin_unlock(&lp->lp_lock); - - cpt = lnet_net_lock_current(); - /* Refcount for MD. */ - lnet_peer_addref_locked(lp); - pnid = lnet_peer_select_nid(lp); - lnet_net_unlock(cpt); - - nnis = MAX(lp->lp_data_nnis, LNET_INTERFACES_MIN); - - rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, - the_lnet.ln_dc_eqh, false); - - /* - * if LNetMDBind in lnet_send_ping fails we need to decrement the - * refcount on the peer, otherwise LNetMDUnlink will be called - * which will eventually do that. - */ - if (rc > 0) { - lnet_net_lock(cpt); - lnet_peer_decref_locked(lp); - lnet_net_unlock(cpt); - rc = -rc; /* change the rc to negative value */ - goto fail_error; - } else if (rc < 0) { - goto fail_error; - } - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - - spin_lock(&lp->lp_lock); - return 0; - -fail_error: - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); - /* - * The errors that get us here are considered hard errors and - * cause Discovery to terminate. So we clear PING_SENT, but do - * not set either PING_FAILED or FORCE_PING. In fact we need - * to clear PING_FAILED, because the unlink event handler will - * have set it if we called LNetMDUnlink() above. - */ - spin_lock(&lp->lp_lock); - lp->lp_state &= ~(LNET_PEER_PING_SENT | LNET_PEER_PING_FAILED); - return rc; -} - -/* - * This function exists because you cannot call LNetMDUnlink() from an - * event handler. - */ -static int lnet_peer_push_failed(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - struct lnet_handle_md mdh; - int rc; - - mdh = lp->lp_push_mdh; - LNetInvalidateMDHandle(&lp->lp_push_mdh); - lp->lp_state &= ~LNET_PEER_PUSH_FAILED; - rc = lp->lp_push_error; - lp->lp_push_error = 0; - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(mdh)) - LNetMDUnlink(mdh); - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - spin_lock(&lp->lp_lock); - return rc ? rc : LNET_REDISCOVER_PEER; -} - -/* - * Mark the peer as discovered. - */ -static int lnet_peer_discovered(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - lp->lp_state |= LNET_PEER_DISCOVERED; - lp->lp_state &= ~(LNET_PEER_DISCOVERING | - LNET_PEER_REDISCOVER); - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - - return 0; -} - -/* Active side of push. */ -static int lnet_peer_send_push(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - struct lnet_ping_buffer *pbuf; - struct lnet_process_id id; - struct lnet_md md; - int cpt; - int rc; - - /* Don't push to a non-multi-rail peer. 
*/ - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - lp->lp_state &= ~LNET_PEER_FORCE_PUSH; - /* if peer's NIDs are uptodate then peer is discovered */ - if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { - rc = lnet_peer_discovered(lp); - return rc; - } - - return 0; - } - - lp->lp_state |= LNET_PEER_PUSH_SENT; - lp->lp_state &= ~LNET_PEER_FORCE_PUSH; - spin_unlock(&lp->lp_lock); - - cpt = lnet_net_lock_current(); - pbuf = the_lnet.ln_ping_target; - lnet_ping_buffer_addref(pbuf); - lnet_net_unlock(cpt); - - /* Push source MD */ - md.start = &pbuf->pb_info; - md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); - md.threshold = 2; /* Put/Ack */ - md.max_size = 0; - md.options = 0; - md.eq_handle = the_lnet.ln_dc_eqh; - md.user_ptr = lp; - - rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_push_mdh); - if (rc) { - lnet_ping_buffer_decref(pbuf); - CERROR("Can't bind push source MD: %d\n", rc); - goto fail_error; - } - cpt = lnet_net_lock_current(); - /* Refcount for MD. */ - lnet_peer_addref_locked(lp); - id.pid = LNET_PID_LUSTRE; - id.nid = lnet_peer_select_nid(lp); - lnet_net_unlock(cpt); - - if (id.nid == LNET_NID_ANY) { - rc = -EHOSTUNREACH; - goto fail_unlink; - } - - rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, - LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0, 0); - - /* - * reset the discovery nid. There is no need to restrict sending - * from that source, if we call lnet_push_update_to_peers(). It'll - * get set to a specific NID, if we initiate discovery from the - * scratch - */ - lp->lp_disc_src_nid = LNET_NID_ANY; - - if (rc) - goto fail_unlink; - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - - spin_lock(&lp->lp_lock); - return 0; - -fail_unlink: - LNetMDUnlink(lp->lp_push_mdh); - LNetInvalidateMDHandle(&lp->lp_push_mdh); -fail_error: - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); - /* - * The errors that get us here are considered hard errors and - * cause Discovery to terminate. So we clear PUSH_SENT, but do - * not set PUSH_FAILED. In fact we need to clear PUSH_FAILED, - * because the unlink event handler will have set it if we - * called LNetMDUnlink() above. - */ - spin_lock(&lp->lp_lock); - lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); - return rc; -} - -/* - * An unrecoverable error was encountered during discovery. - * Set error status in peer and abort discovery. - */ -static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) -{ - CDEBUG(D_NET, "Discovery error %s: %d\n", - libcfs_nid2str(lp->lp_primary_nid), error); - - spin_lock(&lp->lp_lock); - lp->lp_dc_error = error; - lp->lp_state &= ~LNET_PEER_DISCOVERING; - lp->lp_state |= LNET_PEER_REDISCOVER; - spin_unlock(&lp->lp_lock); -} - -/* - * Discovering this peer is taking too long. Cancel any Ping or Push - * that discovery is waiting on by unlinking the relevant MDs. The - * lnet_discovery_event_handler() will proceed from here and complete - * the cleanup. 
- */ -static void lnet_peer_cancel_discovery(struct lnet_peer *lp) -{ - struct lnet_handle_md ping_mdh; - struct lnet_handle_md push_mdh; - - LNetInvalidateMDHandle(&ping_mdh); - LNetInvalidateMDHandle(&push_mdh); - - spin_lock(&lp->lp_lock); - if (lp->lp_state & LNET_PEER_PING_SENT) { - ping_mdh = lp->lp_ping_mdh; - LNetInvalidateMDHandle(&lp->lp_ping_mdh); - } - if (lp->lp_state & LNET_PEER_PUSH_SENT) { - push_mdh = lp->lp_push_mdh; - LNetInvalidateMDHandle(&lp->lp_push_mdh); - } - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(ping_mdh)) - LNetMDUnlink(ping_mdh); - if (!LNetMDHandleIsInvalid(push_mdh)) - LNetMDUnlink(push_mdh); -} - -/* - * Wait for work to be queued or some other change that must be - * attended to. Returns non-zero if the discovery thread should shut - * down. - */ -static int lnet_peer_discovery_wait_for_work(void) -{ - int cpt; - int rc = 0; - - DEFINE_WAIT(wait); - - cpt = lnet_net_lock_current(); - for (;;) { - prepare_to_wait(&the_lnet.ln_dc_waitq, &wait, - TASK_INTERRUPTIBLE); - if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) - break; - if (lnet_push_target_resize_needed()) - break; - if (!list_empty(&the_lnet.ln_dc_request)) - break; - if (!list_empty(&the_lnet.ln_msg_resend)) - break; - lnet_net_unlock(cpt); - - /* - * wakeup max every second to check if there are peers that - * have been stuck on the working queue for greater than - * the peer timeout. - */ - schedule_timeout(cfs_time_seconds(1)); - finish_wait(&the_lnet.ln_dc_waitq, &wait); - cpt = lnet_net_lock_current(); - } - finish_wait(&the_lnet.ln_dc_waitq, &wait); - - if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) - rc = -ESHUTDOWN; - - lnet_net_unlock(cpt); - - CDEBUG(D_NET, "woken: %d\n", rc); - - return rc; -} - -/* - * Messages that were pending on a destroyed peer will be put on a global - * resend list. The message resend list will be checked by - * the discovery thread when it wakes up, and will resend messages. These - * messages can still be sendable in the case the lpni which was the initial - * cause of the message re-queue was transfered to another peer. - * - * It is possible that LNet could be shutdown while we're iterating - * through the list. lnet_shudown_lndnets() will attempt to access the - * resend list, but will have to wait until the spinlock is released, by - * which time there shouldn't be any more messages on the resend list. - * During shutdown lnet_send() will fail and lnet_finalize() will be called - * for the messages so they can be released. The other case is that - * lnet_shudown_lndnets() can finalize all the messages before this - * function can visit the resend list, in which case this function will be - * a no-op. - */ -static void lnet_resend_msgs(void) -{ - struct lnet_msg *msg, *tmp; - struct list_head resend; - int rc; - - INIT_LIST_HEAD(&resend); - - spin_lock(&the_lnet.ln_msg_resend_lock); - list_splice(&the_lnet.ln_msg_resend, &resend); - spin_unlock(&the_lnet.ln_msg_resend_lock); - - list_for_each_entry_safe(msg, tmp, &resend, msg_list) { - list_del_init(&msg->msg_list); - rc = lnet_send(msg->msg_src_nid_param, msg, - msg->msg_rtr_nid_param); - if (rc < 0) { - CNETERR("Error sending %s to %s: %d\n", - lnet_msgtyp2str(msg->msg_type), - libcfs_id2str(msg->msg_target), rc); - lnet_finalize(msg, rc); - } - } -} - -/* The discovery thread. 
*/ -static int lnet_peer_discovery(void *arg) -{ - struct lnet_peer *lp; - int rc; - - CDEBUG(D_NET, "started\n"); - cfs_block_allsigs(); - - for (;;) { - if (lnet_peer_discovery_wait_for_work()) - break; - - lnet_resend_msgs(); - - if (lnet_push_target_resize_needed()) - lnet_push_target_resize(); - - lnet_net_lock(LNET_LOCK_EX); - if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) { - lnet_net_unlock(LNET_LOCK_EX); - break; - } - - /* - * Process all incoming discovery work requests. When - * discovery must wait on a peer to change state, it - * is added to the tail of the ln_dc_working queue. A - * timestamp keeps track of when the peer was added, - * so we can time out discovery requests that take too - * long. - */ - while (!list_empty(&the_lnet.ln_dc_request)) { - lp = list_first_entry(&the_lnet.ln_dc_request, - struct lnet_peer, lp_dc_list); - list_move(&lp->lp_dc_list, &the_lnet.ln_dc_working); - /* - * set the time the peer was put on the dc_working - * queue. It shouldn't remain on the queue - * forever, in case the GET message (for ping) - * doesn't get a REPLY or the PUT message (for - * push) doesn't get an ACK. - */ - lp->lp_last_queued = ktime_get_real_seconds(); - lnet_net_unlock(LNET_LOCK_EX); - - /* - * Select an action depending on the state of - * the peer and whether discovery is disabled. - * The check whether discovery is disabled is - * done after the code that handles processing - * for arrived data, cleanup for failures, and - * forcing a Ping or Push. - */ - spin_lock(&lp->lp_lock); - CDEBUG(D_NET, "peer %s state %#x\n", - libcfs_nid2str(lp->lp_primary_nid), - lp->lp_state); - if (lp->lp_state & LNET_PEER_DATA_PRESENT) - rc = lnet_peer_data_present(lp); - else if (lp->lp_state & LNET_PEER_PING_FAILED) - rc = lnet_peer_ping_failed(lp); - else if (lp->lp_state & LNET_PEER_PUSH_FAILED) - rc = lnet_peer_push_failed(lp); - else if (lp->lp_state & LNET_PEER_FORCE_PING) - rc = lnet_peer_send_ping(lp); - else if (lp->lp_state & LNET_PEER_FORCE_PUSH) - rc = lnet_peer_send_push(lp); - else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) - rc = lnet_peer_send_ping(lp); - else if (lnet_peer_needs_push(lp)) - rc = lnet_peer_send_push(lp); - else - rc = lnet_peer_discovered(lp); - CDEBUG(D_NET, "peer %s state %#x rc %d\n", - libcfs_nid2str(lp->lp_primary_nid), - lp->lp_state, rc); - spin_unlock(&lp->lp_lock); - - lnet_net_lock(LNET_LOCK_EX); - if (rc == LNET_REDISCOVER_PEER) { - list_move(&lp->lp_dc_list, - &the_lnet.ln_dc_request); - } else if (rc) { - lnet_peer_discovery_error(lp, rc); - } - if (!(lp->lp_state & LNET_PEER_DISCOVERING)) - lnet_peer_discovery_complete(lp); - if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) - break; - } - - lnet_net_unlock(LNET_LOCK_EX); - } - - CDEBUG(D_NET, "stopping\n"); - /* - * Clean up before telling lnet_peer_discovery_stop() that - * we're done. Use wake_up() below to somewhat reduce the - * size of the thundering herd if there are multiple threads - * waiting on discovery of a single peer. - */ - - /* Queue cleanup 1: stop all pending pings and pushes. */ - lnet_net_lock(LNET_LOCK_EX); - while (!list_empty(&the_lnet.ln_dc_working)) { - lp = list_first_entry(&the_lnet.ln_dc_working, - struct lnet_peer, lp_dc_list); - list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); - lnet_net_unlock(LNET_LOCK_EX); - lnet_peer_cancel_discovery(lp); - lnet_net_lock(LNET_LOCK_EX); - } - lnet_net_unlock(LNET_LOCK_EX); - - /* Queue cleanup 2: wait for the expired queue to clear. 
*/ - while (!list_empty(&the_lnet.ln_dc_expired)) - schedule_timeout(cfs_time_seconds(1)); - - /* Queue cleanup 3: clear the request queue. */ - lnet_net_lock(LNET_LOCK_EX); - while (!list_empty(&the_lnet.ln_dc_request)) { - lp = list_first_entry(&the_lnet.ln_dc_request, - struct lnet_peer, lp_dc_list); - lnet_peer_discovery_error(lp, -ESHUTDOWN); - lnet_peer_discovery_complete(lp); - } - lnet_net_unlock(LNET_LOCK_EX); - - LNetEQFree(the_lnet.ln_dc_eqh); - LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); - - the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; - wake_up(&the_lnet.ln_dc_waitq); - - CDEBUG(D_NET, "stopped\n"); - - return 0; -} - -/* ln_api_mutex is held on entry. */ -int lnet_peer_discovery_start(void) -{ - struct task_struct *task; - int rc; - - if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) - return -EALREADY; - - rc = LNetEQAlloc(0, lnet_discovery_event_handler, &the_lnet.ln_dc_eqh); - if (rc != 0) { - CERROR("Can't allocate discovery EQ: %d\n", rc); - return rc; - } - - the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; - task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("Can't start peer discovery thread: %d\n", rc); - - LNetEQFree(the_lnet.ln_dc_eqh); - LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); - - the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; - } - - CDEBUG(D_NET, "discovery start: %d\n", rc); - - return rc; -} - -/* ln_api_mutex is held on entry. */ -void lnet_peer_discovery_stop(void) -{ - if (the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN) - return; - - LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); - the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; - wake_up(&the_lnet.ln_dc_waitq); - - wait_event(the_lnet.ln_dc_waitq, - the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); - - LASSERT(list_empty(&the_lnet.ln_dc_request)); - LASSERT(list_empty(&the_lnet.ln_dc_working)); - LASSERT(list_empty(&the_lnet.ln_dc_expired)); - - CDEBUG(D_NET, "discovery stopped\n"); -} - -/* Debugging */ - -void -lnet_debug_peer(lnet_nid_t nid) -{ - char *aliveness = "NA"; - struct lnet_peer_ni *lp; - int cpt; + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; cpt = lnet_cpt_of_nid(nid, NULL); lnet_net_lock(cpt); - lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + lp = lnet_nid2peerni_locked(nid, cpt); if (IS_ERR(lp)) { lnet_net_unlock(cpt); CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); @@ -3363,8 +1106,6 @@ lnet_debug_peer(lnet_nid_t nid) lnet_net_unlock(cpt); } -/* Gathering information for userspace. */ - int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, char aliveness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, @@ -3428,193 +1169,56 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, return found ? 
0 : -ENOENT; } -/* ln_api_mutex is held, which keeps the peer list stable */ -int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) +int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, + bool *mr, + struct lnet_peer_ni_credit_info __user *peer_ni_info, + struct lnet_ioctl_element_stats __user *peer_ni_stats) { - struct lnet_ioctl_element_stats *lpni_stats; - struct lnet_ioctl_element_msg_stats *lpni_msg_stats; - struct lnet_ioctl_peer_ni_hstats *lpni_hstats; - struct lnet_peer_ni_credit_info *lpni_info; - struct lnet_peer_ni *lpni; - struct lnet_peer *lp; - lnet_nid_t nid; - __u32 size; + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_net *lpn = NULL; + struct lnet_peer *lp = NULL; + struct lnet_peer_ni_credit_info ni_info; + struct lnet_ioctl_element_stats ni_stats; int rc; - lp = lnet_find_peer(cfg->prcfg_prim_nid); - - if (!lp) { - rc = -ENOENT; - goto out; - } + lpni = lnet_get_peer_ni_idx_locked(idx, &lpn, &lp); - size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) - + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); - size *= lp->lp_nnis; - if (size > cfg->prcfg_size) { - cfg->prcfg_size = size; - rc = -E2BIG; - goto out_lp_decref; - } + if (!lpni) + return -ENOENT; - cfg->prcfg_prim_nid = lp->lp_primary_nid; - cfg->prcfg_mr = lnet_peer_is_multi_rail(lp); - cfg->prcfg_cfg_nid = lp->lp_primary_nid; - cfg->prcfg_count = lp->lp_nnis; - cfg->prcfg_size = size; - cfg->prcfg_state = lp->lp_state; - - /* Allocate helper buffers. */ - rc = -ENOMEM; - LIBCFS_ALLOC(lpni_info, sizeof(*lpni_info)); - if (!lpni_info) - goto out_lp_decref; - LIBCFS_ALLOC(lpni_stats, sizeof(*lpni_stats)); - if (!lpni_stats) - goto out_free_info; - LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); - if (!lpni_msg_stats) - goto out_free_stats; - LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); - if (!lpni_hstats) - goto out_free_msg_stats; - - - lpni = NULL; + *primary_nid = lp->lp_primary_nid; + *mr = lp->lp_multi_rail; + *nid = lpni->lpni_nid; + snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, + lpni->lpni_alive ? "up" : "down"); + + ni_info.cr_refcount = atomic_read(&lpni->lpni_refcount); + ni_info.cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + ni_info.cr_peer_tx_credits = lpni->lpni_txcredits; + ni_info.cr_peer_rtr_credits = lpni->lpni_rtrcredits; + ni_info.cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + ni_info.cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + ni_info.cr_peer_tx_qnob = lpni->lpni_txqnob; + ni_info.cr_ncpt = lpni->lpni_cpt; + + ni_stats.iel_send_count = atomic_read(&lpni->lpni_stats.send_count); + ni_stats.iel_recv_count = atomic_read(&lpni->lpni_stats.recv_count); + ni_stats.iel_drop_count = atomic_read(&lpni->lpni_stats.drop_count); + + /* If copy_to_user fails */ rc = -EFAULT; - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - nid = lpni->lpni_nid; - if (copy_to_user(bulk, &nid, sizeof(nid))) - goto out_free_hstats; - bulk += sizeof(nid); - - memset(lpni_info, 0, sizeof(*lpni_info)); - snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA"); - if (lnet_isrouter(lpni) || - lnet_peer_aliveness_enabled(lpni)) - snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, - lpni->lpni_alive ? 
"up" : "down"); - - lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount); - lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? - lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; - lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; - lpni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits; - lpni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; - lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; - lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; - if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) - goto out_free_hstats; - bulk += sizeof(*lpni_info); - - memset(lpni_stats, 0, sizeof(*lpni_stats)); - lpni_stats->iel_send_count = lnet_sum_stats(&lpni->lpni_stats, - LNET_STATS_TYPE_SEND); - lpni_stats->iel_recv_count = lnet_sum_stats(&lpni->lpni_stats, - LNET_STATS_TYPE_RECV); - lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, - LNET_STATS_TYPE_DROP); - if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) - goto out_free_hstats; - bulk += sizeof(*lpni_stats); - lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); - if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) - goto out_free_hstats; - bulk += sizeof(*lpni_msg_stats); - lpni_hstats->hlpni_network_timeout = - atomic_read(&lpni->lpni_hstats.hlt_network_timeout); - lpni_hstats->hlpni_remote_dropped = - atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); - lpni_hstats->hlpni_remote_timeout = - atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); - lpni_hstats->hlpni_remote_error = - atomic_read(&lpni->lpni_hstats.hlt_remote_error); - lpni_hstats->hlpni_health_value = - atomic_read(&lpni->lpni_healthv); - if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) - goto out_free_hstats; - bulk += sizeof(*lpni_hstats); - } - rc = 0; - -out_free_hstats: - LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); -out_free_msg_stats: - LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); -out_free_stats: - LIBCFS_FREE(lpni_stats, sizeof(*lpni_stats)); -out_free_info: - LIBCFS_FREE(lpni_info, sizeof(*lpni_info)); -out_lp_decref: - lnet_peer_decref_locked(lp); -out: - return rc; -} - -void -lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni) -{ - /* the mt could've shutdown and cleaned up the queues */ - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) - return; - - if (list_empty(&lpni->lpni_recovery) && - atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) { - CDEBUG(D_NET, "lpni %s added to recovery queue. 
Health = %d\n", - libcfs_nid2str(lpni->lpni_nid), - atomic_read(&lpni->lpni_healthv)); - list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq); - lnet_peer_ni_addref_locked(lpni); - } -} - -/* Call with the ln_api_mutex held */ -void -lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all) -{ - struct lnet_peer_table *ptable; - struct lnet_peer *lp; - struct lnet_peer_net *lpn; - struct lnet_peer_ni *lpni; - int lncpt; - int cpt; - - if (the_lnet.ln_state != LNET_STATE_RUNNING) - return; + if (copy_to_user(peer_ni_info, &ni_info, sizeof(ni_info))) + goto copy_failed; - if (!all) { - lnet_net_lock(LNET_LOCK_EX); - lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - lnet_net_unlock(LNET_LOCK_EX); - return; - } - atomic_set(&lpni->lpni_healthv, value); - lnet_peer_ni_add_to_recoveryq_locked(lpni); - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(LNET_LOCK_EX); - return; - } + if (copy_to_user(peer_ni_stats, &ni_stats, sizeof(ni_stats))) + goto copy_failed; - lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + rc = 0; - /* - * Walk all the peers and reset the healhv for each one to the - * maximum value. - */ - lnet_net_lock(LNET_LOCK_EX); - for (cpt = 0; cpt < lncpt; cpt++) { - ptable = the_lnet.ln_peer_tables[cpt]; - list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { - list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { - list_for_each_entry(lpni, &lpn->lpn_peer_nis, - lpni_peer_nis) { - atomic_set(&lpni->lpni_healthv, value); - lnet_peer_ni_add_to_recoveryq_locked(lpni); - } - } - } - } - lnet_net_unlock(LNET_LOCK_EX); +copy_failed: + return rc; } - diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c index e2966cf77c561..bd30963a960d1 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
* * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -68,6 +68,9 @@ lnet_peer_buffer_credits(struct lnet_net *net) return net->net_tunables.lct_peer_tx_credits; } +/* forward ref's */ +static int lnet_router_checker(void *); + static int check_routers_before_use; module_param(check_routers_before_use, int, 0444); MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); @@ -96,9 +99,9 @@ lnet_peers_start_down(void) void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - time64_t when) + cfs_time_t when) { - if (lp->lpni_timestamp > when) { /* out of date information */ + if (cfs_time_before(when, lp->lpni_timestamp)) { /* out of date information */ CDEBUG(D_NET, "Out of date\n"); return; } @@ -111,7 +114,7 @@ lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, */ spin_lock(&lp->lpni_lock); - lp->lpni_timestamp = when; /* update timestamp */ + lp->lpni_timestamp = when; /* update timestamp */ lp->lpni_ping_deadline = 0; /* disable ping timeout */ if (lp->lpni_alive_count != 0 && /* got old news */ @@ -331,7 +334,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); if (gateway == LNET_NID_ANY || - gateway == LNET_NID_LO_0 || + LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || net == LNET_NIDNET(LNET_NID_ANY) || LNET_NETTYP(net) == LOLND || LNET_NIDNET(gateway) == net || @@ -341,13 +344,6 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, if (lnet_islocalnet(net)) /* it's a local network */ return -EEXIST; - if (!lnet_islocalnet(LNET_NIDNET(gateway))) { - CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n", - libcfs_nid2str(gateway), - libcfs_net2str(LNET_NIDNET(gateway))); - return -EHOSTUNREACH; - } - /* Assume net, route, all new */ LIBCFS_ALLOC(route, sizeof(*route)); LIBCFS_ALLOC(rnet, sizeof(*rnet)); @@ -437,8 +433,8 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, if (rnet != rnet2) LIBCFS_FREE(rnet, sizeof(*rnet)); - /* kick start the monitor thread to handle the added route */ - wake_up(&the_lnet.ln_mt_waitq); + /* indicate to startup the router checker if configured */ + wake_up(&the_lnet.ln_rc_waitq); return rc; } @@ -581,29 +577,29 @@ lnet_destroy_routes (void) lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); } -int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg) +int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg) { - struct lnet_rtrbufpool *rbp; int i, rc = -ENOENT, j; if (the_lnet.ln_rtrpools == NULL) return rc; + for (i = 0; i < LNET_NRBPOOLS; i++) { + struct lnet_rtrbufpool *rbp; - cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { - if (i != cpt) - continue; + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, j, the_lnet.ln_rtrpools) { + if (i++ != idx) + continue; - lnet_net_lock(i); - for (j = 0; j < LNET_NRBPOOLS; j++) { - pool_cfg->pl_pools[j].pl_npages = rbp[j].rbp_npages; - pool_cfg->pl_pools[j].pl_nbuffers = rbp[j].rbp_nbuffers; - pool_cfg->pl_pools[j].pl_credits = rbp[j].rbp_credits; - pool_cfg->pl_pools[j].pl_mincredits = rbp[j].rbp_mincredits; + pool_cfg->pl_pools[i].pl_npages = rbp[i].rbp_npages; + pool_cfg->pl_pools[i].pl_nbuffers = rbp[i].rbp_nbuffers; + pool_cfg->pl_pools[i].pl_credits = rbp[i].rbp_credits; + pool_cfg->pl_pools[i].pl_mincredits = rbp[i].rbp_mincredits; + rc = 0; + break; } - lnet_net_unlock(i); - rc = 0; - break; + lnet_net_unlock(LNET_LOCK_EX); } 
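
The router.c changes above switch lnet_notify_locked() to cfs_time_t timestamps and use cfs_time_before() to discard out-of-date notifications. That comparison has to remain correct if the tick counter wraps, which is typically done with a signed subtraction as in the kernel's time_before(). Below is a small stand-alone C illustration of that wrap-safe check; cfs_time_t is modelled here as unsigned long and time_before_wrap_safe() is a hypothetical helper, not the libcfs API.

    /*
     * Wrap-safe "a is earlier than b" check, in the spirit of
     * time_before()/cfs_time_before().  Illustration only.
     */
    #include <stdio.h>

    typedef unsigned long cfs_time_t;

    /* true if a is strictly earlier than b, even across counter wrap */
    static int time_before_wrap_safe(cfs_time_t a, cfs_time_t b)
    {
        return (long)(a - b) < 0;
    }

    int main(void)
    {
        cfs_time_t stored = (cfs_time_t)-10;  /* timestamp just before wrap */
        cfs_time_t when   = 5;                /* notification just after wrap */

        /* 'when' is newer despite the smaller numeric value, so not stale */
        printf("stale? %d\n", time_before_wrap_safe(when, stored));
        return 0;
    }

Because the difference is reinterpreted as a signed value, a timestamp taken shortly after the counter wraps still compares as newer than one taken shortly before, which is why the stale-notification test in the hunk above keeps working across wrap.
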
lnet_net_lock(LNET_LOCK_EX); @@ -654,21 +650,17 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops, } void -lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) +lnet_swap_pinginfo(struct lnet_ping_info *info) { + int i; struct lnet_ni_status *stat; - int nnis; - int i; - __swab32s(&pbuf->pb_info.pi_magic); - __swab32s(&pbuf->pb_info.pi_features); - __swab32s(&pbuf->pb_info.pi_pid); - __swab32s(&pbuf->pb_info.pi_nnis); - nnis = pbuf->pb_info.pi_nnis; - if (nnis > pbuf->pb_nnis) - nnis = pbuf->pb_nnis; - for (i = 0; i < nnis; i++) { - stat = &pbuf->pb_info.pi_ni[i]; + __swab32s(&info->pi_magic); + __swab32s(&info->pi_features); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; __swab64s(&stat->ns_nid); __swab32s(&stat->ns_status); } @@ -682,12 +674,11 @@ lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) static void lnet_parse_rc_info(struct lnet_rc_data *rcd) { - struct lnet_ping_buffer *pbuf = rcd->rcd_pingbuffer; + struct lnet_ping_info *info = rcd->rcd_pinginfo; struct lnet_peer_ni *gw = rcd->rcd_gateway; struct lnet_route *rte; - int nnis; - if (!gw->lpni_alive || !pbuf) + if (!gw->lpni_alive) return; /* @@ -696,29 +687,29 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) */ spin_lock(&gw->lpni_lock); - if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) - lnet_swap_pinginfo(pbuf); + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(info); /* NB always racing with network! */ - if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { + if (info->pi_magic != LNET_PROTO_PING_MAGIC) { CDEBUG(D_NET, "%s: Unexpected magic %08x\n", - libcfs_nid2str(gw->lpni_nid), pbuf->pb_info.pi_magic); + libcfs_nid2str(gw->lpni_nid), info->pi_magic); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - goto out; + spin_unlock(&gw->lpni_lock); + return; } - gw->lpni_ping_feats = pbuf->pb_info.pi_features; - - /* Without NI status info there's nothing more to do. */ - if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) - goto out; + gw->lpni_ping_feats = info->pi_features; + if ((gw->lpni_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lpni_nid), gw->lpni_ping_feats); + spin_unlock(&gw->lpni_lock); + return; /* nothing I can understand */ + } - /* Determine the number of NIs for which there is data. */ - nnis = pbuf->pb_info.pi_nnis; - if (pbuf->pb_nnis < nnis) { - if (rcd->rcd_nnis < nnis) - rcd->rcd_nnis = nnis; - nnis = pbuf->pb_nnis; + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) { + spin_unlock(&gw->lpni_lock); + return; /* can't carry NI status info */ } list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { @@ -726,24 +717,24 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) int up = 0; int i; - /* If routing disabled then the route is down. 
*/ if ((gw->lpni_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) { rte->lr_downis = 1; continue; } - for (i = 0; i < nnis; i++) { - struct lnet_ni_status *stat = &pbuf->pb_info.pi_ni[i]; + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + struct lnet_ni_status *stat = &info->pi_ni[i]; lnet_nid_t nid = stat->ns_nid; if (nid == LNET_NID_ANY) { CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", libcfs_nid2str(gw->lpni_nid)); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - goto out; + spin_unlock(&gw->lpni_lock); + return; } - if (nid == LNET_NID_LO_0) + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) continue; if (stat->ns_status == LNET_NI_STATUS_DOWN) { @@ -762,7 +753,8 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", libcfs_nid2str(gw->lpni_nid), stat->ns_status); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - goto out; + spin_unlock(&gw->lpni_lock); + return; } if (up) { /* ignore downed NIs if NI for dest network is up */ @@ -776,7 +768,7 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) rte->lr_downis = down; } -out: + spin_unlock(&gw->lpni_lock); } @@ -820,7 +812,7 @@ lnet_router_checker_event(struct lnet_event *event) * we ping alive routers to try to detect router death before * apps get burned). */ - lnet_notify_locked(lp, 1, !event->status, ktime_get_seconds()); + lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); /* The router checker will wake up very shortly and do the * actual notification. * XXX If 'lp' stops being a router before then, it will still @@ -840,9 +832,8 @@ lnet_wait_known_routerstate(void) struct list_head *entry; int all_known; - LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - /* the_lnet.ln_api_mutex must be locked */ for (;;) { int cpt = lnet_net_lock_current(); @@ -866,10 +857,8 @@ lnet_wait_known_routerstate(void) if (all_known) return; - mutex_unlock(&the_lnet.ln_api_mutex); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); - mutex_lock(&the_lnet.ln_api_mutex); } } @@ -892,15 +881,15 @@ static void lnet_update_ni_status_locked(void) { struct lnet_ni *ni = NULL; - time64_t now; - time64_t timeout; + long now; + int timeout; LASSERT(the_lnet.ln_routing); timeout = router_ping_timeout + MAX(live_router_check_interval, dead_router_check_interval); - now = ktime_get_real_seconds(); + now = cfs_time_current_sec(); while ((ni = lnet_get_next_ni_locked(NULL, ni))) { if (ni->ni_net->net_lnd->lnd_type == LOLND) continue; @@ -918,7 +907,7 @@ lnet_update_ni_status_locked(void) LASSERT(ni->ni_status != NULL); if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { - CDEBUG(D_NET, "NI(%s:%lld) status changed to down\n", + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", libcfs_nid2str(ni->ni_nid), timeout); /* NB: so far, this is the only place to set * NI status to "down" */ @@ -943,62 +932,43 @@ lnet_destroy_rc_data(struct lnet_rc_data *rcd) lnet_net_unlock(cpt); } - if (rcd->rcd_pingbuffer != NULL) - lnet_ping_buffer_decref(rcd->rcd_pingbuffer); + if (rcd->rcd_pinginfo != NULL) + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); LIBCFS_FREE(rcd, sizeof(*rcd)); } static struct lnet_rc_data * -lnet_update_rc_data_locked(struct lnet_peer_ni *gateway) +lnet_create_rc_data_locked(struct lnet_peer_ni *gateway) { - struct lnet_handle_md mdh; - struct lnet_rc_data *rcd; - struct lnet_ping_buffer *pbuf = NULL; - int nnis = LNET_INTERFACES_MIN; - int rc; - int i; - - rcd = gateway->lpni_rcd; - if (rcd) { - nnis = 
rcd->rcd_nnis; - mdh = rcd->rcd_mdh; - LNetInvalidateMDHandle(&rcd->rcd_mdh); - pbuf = rcd->rcd_pingbuffer; - rcd->rcd_pingbuffer = NULL; - } else { - LNetInvalidateMDHandle(&mdh); - } + struct lnet_rc_data *rcd = NULL; + struct lnet_ping_info *pi; + int rc; + int i; lnet_net_unlock(gateway->lpni_cpt); - if (rcd) { - LNetMDUnlink(mdh); - lnet_ping_buffer_decref(pbuf); - } else { - LIBCFS_ALLOC(rcd, sizeof(*rcd)); - if (rcd == NULL) - goto out; + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; - LNetInvalidateMDHandle(&rcd->rcd_mdh); - INIT_LIST_HEAD(&rcd->rcd_list); - rcd->rcd_nnis = nnis; - } + LNetInvalidateMDHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); - pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); - if (pbuf == NULL) + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) goto out; - for (i = 0; i < nnis; i++) { - pbuf->pb_info.pi_ni[i].ns_nid = LNET_NID_ANY; - pbuf->pb_info.pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; } - rcd->rcd_pingbuffer = pbuf; + rcd->rcd_pinginfo = pi; LASSERT(!LNetEQHandleIsInvalid(the_lnet.ln_rc_eqh)); - rc = LNetMDBind((struct lnet_md){.start = &pbuf->pb_info, + rc = LNetMDBind((struct lnet_md){.start = pi, .user_ptr = rcd, - .length = LNET_PING_INFO_SIZE(nnis), + .length = LNET_PINGINFO_SIZE, .threshold = LNET_MD_THRESH_INF, .options = LNET_MD_TRUNCATE, .eq_handle = the_lnet.ln_rc_eqh}, @@ -1006,37 +976,33 @@ lnet_update_rc_data_locked(struct lnet_peer_ni *gateway) &rcd->rcd_mdh); if (rc < 0) { CERROR("Can't bind MD: %d\n", rc); - goto out_ping_buffer_decref; + goto out; } LASSERT(rc == 0); lnet_net_lock(gateway->lpni_cpt); - /* Check if this is still a router. */ - if (!lnet_isrouter(gateway)) - goto out_unlock; - /* Check if someone else installed router data. */ - if (gateway->lpni_rcd && gateway->lpni_rcd != rcd) - goto out_unlock; - - /* Install and/or update the router data. */ - if (!gateway->lpni_rcd) { - lnet_peer_ni_addref_locked(gateway); - rcd->rcd_gateway = gateway; - gateway->lpni_rcd = rcd; + /* router table changed or someone has created rcd for this gateway */ + if (!lnet_isrouter(gateway) || gateway->lpni_rcd != NULL) { + lnet_net_unlock(gateway->lpni_cpt); + goto out; } + + lnet_peer_ni_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lpni_rcd = rcd; gateway->lpni_ping_notsent = 0; return rcd; -out_unlock: - lnet_net_unlock(gateway->lpni_cpt); - rc = LNetMDUnlink(mdh); - LASSERT(rc == 0); -out_ping_buffer_decref: - lnet_ping_buffer_decref(pbuf); out: - if (rcd && rcd != gateway->lpni_rcd) + if (rcd != NULL) { + if (!LNetMDHandleIsInvalid(rcd->rcd_mdh)) { + rc = LNetMDUnlink(rcd->rcd_mdh); + LASSERT(rc == 0); + } lnet_destroy_rc_data(rcd); + } + lnet_net_lock(gateway->lpni_cpt); return gateway->lpni_rcd; } @@ -1058,14 +1024,14 @@ static void lnet_ping_router_locked(struct lnet_peer_ni *rtr) { struct lnet_rc_data *rcd = NULL; - time64_t now = ktime_get_seconds(); - time64_t secs; - struct lnet_ni *ni; + cfs_time_t now = cfs_time_current(); + int secs; + struct lnet_ni *ni; lnet_peer_ni_addref_locked(rtr); if (rtr->lpni_ping_deadline != 0 && /* ping timed out? 
*/ - now > rtr->lpni_ping_deadline) + cfs_time_after(now, rtr->lpni_ping_deadline)) lnet_notify_locked(rtr, 1, 0, now); /* Run any outstanding notifications */ @@ -1073,36 +1039,30 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) lnet_ni_notify_locked(ni, rtr); if (!lnet_isrouter(rtr) || - the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { /* router table changed or router checker is shutting down */ lnet_peer_ni_decref_locked(rtr); return; } - rcd = rtr->lpni_rcd; + rcd = rtr->lpni_rcd != NULL ? + rtr->lpni_rcd : lnet_create_rc_data_locked(rtr); - /* - * The response to the router checker ping could've timed out and - * the mdh might've been invalidated, so we need to update it - * again. - */ - if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis || - LNetMDHandleIsInvalid(rcd->rcd_mdh)) - rcd = lnet_update_rc_data_locked(rtr); if (rcd == NULL) return; secs = lnet_router_check_interval(rtr); CDEBUG(D_NET, - "rtr %s %lld: deadline %lld ping_notsent %d alive %d " - "alive_count %d lpni_ping_timestamp %lld\n", + "rtr %s %d: deadline %lu ping_notsent %d alive %d " + "alive_count %d lpni_ping_timestamp %lu\n", libcfs_nid2str(rtr->lpni_nid), secs, rtr->lpni_ping_deadline, rtr->lpni_ping_notsent, rtr->lpni_alive, rtr->lpni_alive_count, rtr->lpni_ping_timestamp); if (secs != 0 && !rtr->lpni_ping_notsent && - now > rtr->lpni_ping_timestamp + secs) { + cfs_time_after(now, cfs_time_add(rtr->lpni_ping_timestamp, + cfs_time_seconds(secs)))) { int rc; struct lnet_process_id id; struct lnet_handle_md mdh; @@ -1117,14 +1077,14 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) mdh = rcd->rcd_mdh; if (rtr->lpni_ping_deadline == 0) { - rtr->lpni_ping_deadline = ktime_get_seconds() + - router_ping_timeout; + rtr->lpni_ping_deadline = + cfs_time_shift(router_ping_timeout); } lnet_net_unlock(rtr->lpni_cpt); rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0, false); + LNET_PROTO_PING_MATCHBITS, 0); lnet_net_lock(rtr->lpni_cpt); if (rc != 0) @@ -1135,9 +1095,14 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) return; } -int lnet_router_pre_mt_start(void) +int +lnet_router_checker_start(void) { - int rc; + int rc; + int eqsz = 0; + struct task_struct *task; + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); if (check_routers_before_use && dead_router_check_interval <= 0) { @@ -1147,36 +1112,60 @@ int lnet_router_pre_mt_start(void) return -EINVAL; } + sema_init(&the_lnet.ln_rc_signal, 0); + rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); if (rc != 0) { - CERROR("Can't allocate EQ(0): %d\n", rc); + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); return -ENOMEM; } - return 0; -} + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; + task = kthread_run(lnet_router_checker, NULL, "router_checker"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start router checker thread: %d\n", rc); + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return -ENOMEM; + } -void lnet_router_post_mt_start(void) -{ if (check_routers_before_use) { /* Note that a helpful side-effect of pinging all known routers * at startup is that it makes them drop stale connections they * may have to a previous instance of me. 
*/ lnet_wait_known_routerstate(); } + + return 0; } void -lnet_router_cleanup(void) +lnet_router_checker_stop (void) { int rc; + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + /* wakeup the RC thread if it's sleeping */ + wake_up(&the_lnet.ln_rc_waitq); + + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT(rc == 0); return; } -void +static void lnet_prune_rc_data(int wait_unlink) { struct lnet_rc_data *rcd; @@ -1185,7 +1174,7 @@ lnet_prune_rc_data(int wait_unlink) struct list_head head; int i = 2; - if (likely(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING && + if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && list_empty(&the_lnet.ln_rcd_deathrow) && list_empty(&the_lnet.ln_rcd_zombie))) return; @@ -1194,7 +1183,7 @@ lnet_prune_rc_data(int wait_unlink) lnet_net_lock(LNET_LOCK_EX); - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { /* router checker is stopping, prune all */ list_for_each_entry(lp, &the_lnet.ln_routers, lpni_rtr_list) { @@ -1258,13 +1247,18 @@ lnet_prune_rc_data(int wait_unlink) } /* - * This function is called from the monitor thread to check if there are - * any active routers that need to be checked. + * This function is called to check if the RC should block indefinitely. + * It's called from lnet_router_checker() as well as being passed to + * wait_event_interruptible() to avoid the lost wake_up problem. + * + * When it's called from wait_event_interruptible() it is necessary to + * also not sleep if the rc state is not running to avoid a deadlock + * when the system is shutting down */ -inline bool +static inline bool lnet_router_checker_active(void) { - if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) return true; /* Router Checker thread needs to run when routing is enabled in @@ -1272,58 +1266,79 @@ lnet_router_checker_active(void) if (the_lnet.ln_routing) return true; - /* if there are routers that need to be cleaned up then do so */ - if (!list_empty(&the_lnet.ln_rcd_deathrow) || - !list_empty(&the_lnet.ln_rcd_zombie)) - return true; - return !list_empty(&the_lnet.ln_routers) && (live_router_check_interval > 0 || dead_router_check_interval > 0); } -void -lnet_check_routers(void) +static int +lnet_router_checker(void *arg) { struct lnet_peer_ni *rtr; struct list_head *entry; - __u64 version; - int cpt; - int cpt2; - cpt = lnet_net_lock_current(); + cfs_block_allsigs(); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); rescan: - version = the_lnet.ln_routers_version; + version = the_lnet.ln_routers_version; - list_for_each(entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, struct lnet_peer_ni, - lpni_rtr_list); + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); - cpt2 = rtr->lpni_cpt; - if (cpt != cpt2) { - lnet_net_unlock(cpt); - cpt = cpt2; - lnet_net_lock(cpt); - /* the routers list has changed */ - if (version != the_lnet.ln_routers_version) + cpt2 = rtr->lpni_cpt; + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto 
rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ goto rescan; + } } - lnet_ping_router_locked(rtr); + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); - /* NB dropped lock */ - if (version != the_lnet.ln_routers_version) { - /* the routers list has changed */ - goto rescan; - } - } + lnet_net_unlock(cpt); - if (the_lnet.ln_routing) - lnet_update_ni_status_locked(); + lnet_prune_rc_data(0); /* don't wait for UNLINK */ - lnet_net_unlock(cpt); + /* Call schedule_timeout() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + /* if there are any routes then wakeup every second. If + * there are no routes then sleep indefinitely until woken + * up by a user adding a route */ + if (!lnet_router_checker_active()) + wait_event_interruptible(the_lnet.ln_rc_waitq, + lnet_router_checker_active()); + else + wait_event_interruptible_timeout(the_lnet.ln_rc_waitq, + false, + cfs_time_seconds(1)); + } - lnet_prune_rc_data(0); /* don't wait for UNLINK */ + lnet_prune_rc_data(1); /* wait for UNLINK */ + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; } void @@ -1726,8 +1741,7 @@ lnet_rtrpools_enable(void) lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_routing = 1; - the_lnet.ln_ping_target->pb_info.pi_features &= - ~LNET_PING_FEAT_RTE_DISABLED; + the_lnet.ln_ping_info->pi_features &= ~LNET_PING_FEAT_RTE_DISABLED; lnet_net_unlock(LNET_LOCK_EX); return rc; @@ -1741,8 +1755,7 @@ lnet_rtrpools_disable(void) lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_routing = 0; - the_lnet.ln_ping_target->pb_info.pi_features |= - LNET_PING_FEAT_RTE_DISABLED; + the_lnet.ln_ping_info->pi_features |= LNET_PING_FEAT_RTE_DISABLED; tiny_router_buffers = 0; small_router_buffers = 0; @@ -1752,10 +1765,10 @@ lnet_rtrpools_disable(void) } int -lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when) +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) { struct lnet_peer_ni *lp = NULL; - time64_t now = ktime_get_seconds(); + cfs_time_t now = cfs_time_current(); int cpt = lnet_cpt_of_nid(nid, ni); LASSERT (!in_interrupt ()); @@ -1774,11 +1787,12 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when) } /* can't do predictions... */ - if (when > now) { + if (cfs_time_after(when, now)) { CWARN("Ignoring prediction from %s of %s %s " - "%lld seconds in the future\n", + "%ld seconds in the future\n", (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), alive ? "up" : "down", when - now); + libcfs_nid2str(nid), alive ? "up" : "down", + cfs_duration_sec(cfs_time_sub(when, now))); return -EINVAL; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c index 2e60609ee229d..b7d513521b433 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. * * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -21,15 +21,14 @@ */ #define DEBUG_SUBSYSTEM S_LNET - -#include - #include #include /* This is really lnet_proc.c. 
You might need to update sanity test 215 * if any file format is changed. */ +static struct ctl_table_header *lnet_table_header = NULL; + #define LNET_LOFFT_BITS (sizeof(loff_t) * 8) /* * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system @@ -82,7 +81,6 @@ static int __proc_lnet_stats(void *data, int write, { int rc; struct lnet_counters *ctrs; - struct lnet_counters_common common; int len; char *tmpstr; const int tmpsiz = 256; /* 7 %u and 4 __u64 */ @@ -105,17 +103,16 @@ static int __proc_lnet_stats(void *data, int write, } lnet_counters_get(ctrs); - common = ctrs->lct_common; len = snprintf(tmpstr, tmpsiz, "%u %u %u %u %u %u %u %llu %llu " "%llu %llu", - common.lcc_msgs_alloc, common.lcc_msgs_max, - common.lcc_errors, - common.lcc_send_count, common.lcc_recv_count, - common.lcc_route_count, common.lcc_drop_count, - common.lcc_send_length, common.lcc_recv_length, - common.lcc_route_length, common.lcc_drop_length); + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); if (pos >= min_t(int, len, strlen(tmpstr))) rc = 0; @@ -247,9 +244,14 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { +#else + memcpy(buffer, tmpstr, len); + { +#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -333,14 +335,15 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lpni_nid; - time64_t now = ktime_get_seconds(); - time64_t deadline = peer->lpni_ping_deadline; + cfs_time_t now = cfs_time_current(); + cfs_time_t deadline = peer->lpni_ping_deadline; int nrefs = atomic_read(&peer->lpni_refcount); int nrtrrefs = peer->lpni_rtr_refcount; int alive_cnt = peer->lpni_alive_count; int alive = peer->lpni_alive; int pingsent = !peer->lpni_ping_notsent; - time64_t last_ping = now - peer->lpni_ping_timestamp; + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lpni_ping_timestamp)); int down_ni = 0; struct lnet_route *rtr; @@ -359,18 +362,18 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (deadline == 0) s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12llu %9d %8s %7d %s\n", + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", nrefs, nrtrrefs, alive_cnt, alive ? "up" : "down", last_ping, pingsent, "NA", down_ni, libcfs_nid2str(nid)); else s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12llu %9d %8llu %7d %s\n", + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", nrefs, nrtrrefs, alive_cnt, alive ? 
"up" : "down", last_ping, pingsent, - deadline - now, + cfs_duration_sec(cfs_time_sub(deadline, now)), down_ni, libcfs_nid2str(nid)); LASSERT(tmpstr + tmpsiz - s > 0); } @@ -383,9 +386,14 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { +#else + memcpy(buffer, tmpstr, len); + { +#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -523,7 +531,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lpni_nid; int nrefs = atomic_read(&peer->lpni_refcount); - time64_t lastalive = -1; + int lastalive = -1; char *aliveness = "NA"; int maxcr = (peer->lpni_net) ? peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; @@ -538,9 +546,11 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, aliveness = peer->lpni_alive ? "up" : "down"; if (lnet_peer_aliveness_enabled(peer)) { - time64_t now = ktime_get_seconds(); + cfs_time_t now = cfs_time_current(); + cfs_duration_t delta; - lastalive = now - peer->lpni_last_alive; + delta = cfs_time_sub(now, peer->lpni_last_alive); + lastalive = cfs_duration_sec(delta); /* No need to mess up peers contents with * arbitrarily long integers - it suffices to @@ -553,7 +563,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, lnet_net_unlock(cpt); s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4d %5s %5lld %5d %5d %5d %5d %5d %d\n", + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", libcfs_nid2str(nid), nrefs, aliveness, lastalive, maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); @@ -577,9 +587,13 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else +#else + memcpy(buffer, tmpstr, len); +#endif *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); } @@ -727,12 +741,12 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, ni = lnet_get_ni_idx_locked(skip); if (ni != NULL) { - struct lnet_tx_queue *tq; - char *stat; - time64_t now = ktime_get_real_seconds(); - time64_t last_alive = -1; - int i; - int j; + struct lnet_tx_queue *tq; + char *stat; + long now = cfs_time_current_sec(); + int last_alive = -1; + int i; + int j; if (the_lnet.ln_routing) last_alive = now - ni->ni_last_alive; @@ -763,7 +777,7 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, lnet_net_lock(i); s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %6s %5lld %4d %4d %4d %5d %5d %5d\n", + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", libcfs_nid2str(ni->ni_nid), stat, last_alive, *ni->ni_refs[i], ni->ni_net->net_tunables.lct_peer_tx_credits, @@ -784,9 +798,14 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ + +#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else +#else + memcpy(buffer, tmpstr, len); +#endif *ppos += 1; } @@ -955,12 +974,34 @@ static struct ctl_table lnet_table[] = { { .procname = NULL } }; -void lnet_router_debugfs_init(void) +static struct ctl_table top_table[] = { + { + INIT_CTL_NAME + 
.procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { .procname = NULL } +}; + +void +lnet_proc_init(void) { - lnet_insert_debugfs(lnet_table); +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = register_sysctl_table(top_table); +#endif } -void lnet_router_debugfs_fini(void) +void +lnet_proc_fini(void) { - lnet_remove_debugfs(lnet_table); +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +#endif } diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c index a03f6078c0589..512dbb5b8a2f1 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,10 +49,10 @@ MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by d #define BRW_MSIZE sizeof(__u64) static void -brw_client_fini(struct sfw_test_instance *tsi) +brw_client_fini (sfw_test_instance_t *tsi) { - struct srpc_bulk *bulk; - struct sfw_test_unit *tsu; + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; LASSERT(tsi->tsi_is_client); @@ -67,22 +67,22 @@ brw_client_fini(struct sfw_test_instance *tsi) } static int -brw_client_init(struct sfw_test_instance *tsi) +brw_client_init (sfw_test_instance_t *tsi) { - struct sfw_session *sn = tsi->tsi_batch->bat_session; + sfw_session_t *sn = tsi->tsi_batch->bat_session; int flags; int off; int npg; int len; int opc; - struct srpc_bulk *bulk; - struct sfw_test_unit *tsu; + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; LASSERT(sn != NULL); LASSERT(tsi->tsi_is_client); if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { - struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; opc = breq->blk_opc; flags = breq->blk_flags; @@ -93,7 +93,7 @@ brw_client_init(struct sfw_test_instance *tsi) off = 0; } else { - struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; /* I should never get this step if it's unknown feature * because make_session will reject unknown feature */ @@ -137,7 +137,7 @@ brw_client_init(struct sfw_test_instance *tsi) #define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL #define BRW_MSIZE sizeof(__u64) -static int brw_inject_one_error(void) +int brw_inject_one_error(void) { struct timespec64 ts; @@ -228,7 +228,7 @@ brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) } static void -brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) { int i; struct page *pg; @@ -245,7 +245,7 @@ brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) } static int -brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) +brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) { int i; struct page *pg; @@ -268,25 +268,25 @@ brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) } static int -brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, - struct srpc_client_rpc **rpcpp) +brw_client_prep_rpc(sfw_test_unit_t *tsu, + struct lnet_process_id dest, srpc_client_rpc_t **rpcpp) { - struct srpc_bulk *bulk = tsu->tsu_private; - 
struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct srpc_client_rpc *rpc; - struct srpc_brw_reqst *req; - int flags; - int npg; - int len; - int opc; - int rc; + srpc_bulk_t *bulk = tsu->tsu_private; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_client_rpc_t *rpc; + srpc_brw_reqst_t *req; + int flags; + int npg; + int len; + int opc; + int rc; LASSERT(sn != NULL); LASSERT(bulk != NULL); if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { - struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; opc = breq->blk_opc; flags = breq->blk_flags; @@ -294,8 +294,8 @@ brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, len = npg * PAGE_SIZE; } else { - struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; - int off; + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + int off; /* I should never get this step if it's unknown feature * because make_session will reject unknown feature */ @@ -312,7 +312,7 @@ brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, if (rc != 0) return rc; - memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); + memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); if (opc == LST_BRW_WRITE) brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); else @@ -328,14 +328,14 @@ brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, } static void -brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) { - __u64 magic = BRW_MAGIC; - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct srpc_msg *msg = &rpc->crpc_replymsg; - struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; - struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + __u64 magic = BRW_MAGIC; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_msg_t *msg = &rpc->crpc_replymsg; + srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; LASSERT(sn != NULL); @@ -376,9 +376,9 @@ brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) } static void -brw_server_rpc_done(struct srpc_server_rpc *rpc) +brw_server_rpc_done(srpc_server_rpc_t *rpc) { - struct srpc_bulk *blk = rpc->srpc_bulk; + srpc_bulk_t *blk = rpc->srpc_bulk; if (blk == NULL) return; @@ -396,12 +396,12 @@ brw_server_rpc_done(struct srpc_server_rpc *rpc) } static int -brw_bulk_ready(struct srpc_server_rpc *rpc, int status) +brw_bulk_ready(srpc_server_rpc_t *rpc, int status) { - __u64 magic = BRW_MAGIC; - struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; - struct srpc_brw_reqst *reqst; - struct srpc_msg *reqstmsg; + __u64 magic = BRW_MAGIC; + srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + srpc_brw_reqst_t *reqst; + srpc_msg_t *reqstmsg; LASSERT (rpc->srpc_bulk != NULL); LASSERT (rpc->srpc_reqstbuf != NULL); @@ -434,13 +434,13 @@ brw_bulk_ready(struct srpc_server_rpc *rpc, int status) static int brw_server_handle(struct srpc_server_rpc *rpc) { - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - struct srpc_msg *replymsg = &rpc->srpc_replymsg; - struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; - 
struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; - int npg; - int rc; + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; LASSERT (sv->sv_id == SRPC_SERVICE_BRW); @@ -505,8 +505,7 @@ brw_server_handle(struct srpc_server_rpc *rpc) return 0; } -struct sfw_test_client_ops brw_test_client; - +sfw_test_client_ops_t brw_test_client; void brw_init_test_client(void) { brw_test_client.tso_init = brw_client_init; @@ -515,10 +514,10 @@ void brw_init_test_client(void) brw_test_client.tso_done_rpc = brw_client_done_rpc; }; -struct srpc_service brw_test_service; - +srpc_service_t brw_test_service; void brw_init_test_service(void) { + brw_test_service.sv_id = SRPC_SERVICE_BRW; brw_test_service.sv_name = "brw_test"; brw_test_service.sv_handler = brw_server_handle; diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c index 7ce53bbabff32..9e60d0d671df2 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conctl.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -43,27 +43,27 @@ static int lst_session_new_ioctl(struct lstio_session_new_args *args) { - char *name; - int rc; - - if (args->lstio_ses_idp == NULL || /* address for output sid */ - args->lstio_ses_key == 0 || /* no key is specified */ - args->lstio_ses_namep == NULL || /* session name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; + char *name; + int rc; - LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); - if (name == NULL) - return -ENOMEM; + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_ses_namep, args->lstio_ses_nmlen)) { - LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } - name[args->lstio_ses_nmlen] = 0; + name[args->lstio_ses_nmlen] = 0; rc = lstcon_session_new(name, args->lstio_ses_key, @@ -79,272 +79,272 @@ lst_session_new_ioctl(struct lstio_session_new_args *args) static int lst_session_end_ioctl(struct lstio_session_end_args *args) { - if (args->lstio_ses_key != console_session.ses_key) - return -EACCES; + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; - return lstcon_session_end(); + return lstcon_session_end(); } static int lst_session_info_ioctl(struct lstio_session_info_args *args) { - /* no checking of key */ - - if (args->lstio_ses_idp == NULL || /* address for ouput sid */ - args->lstio_ses_keyp == NULL || /* address for ouput key */ - args->lstio_ses_featp == NULL || /* address for ouput features */ - args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ - args->lstio_ses_namep == NULL || /* address for ouput name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_session_info(args->lstio_ses_idp, - args->lstio_ses_keyp, + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* 
address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, args->lstio_ses_featp, - args->lstio_ses_ndinfo, - args->lstio_ses_namep, - args->lstio_ses_nmlen); + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); } static int lst_debug_ioctl(struct lstio_debug_args *args) { - char *name = NULL; - int client = 1; - int rc; + char *name = NULL; + int client = 1; + int rc; - if (args->lstio_dbg_key != console_session.ses_key) - return -EACCES; + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; - if (args->lstio_dbg_resultp == NULL) - return -EINVAL; + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; - if (args->lstio_dbg_namep != NULL && /* name of batch/group */ - (args->lstio_dbg_nmlen <= 0 || - args->lstio_dbg_nmlen > LST_NAME_SIZE)) - return -EINVAL; + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; - if (args->lstio_dbg_namep != NULL) { - LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); - if (name == NULL) - return -ENOMEM; + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_dbg_namep, - args->lstio_dbg_nmlen)) { - LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); - return -EFAULT; - } + return -EFAULT; + } - name[args->lstio_dbg_nmlen] = 0; - } + name[args->lstio_dbg_nmlen] = 0; + } - rc = -EINVAL; + rc = -EINVAL; - switch (args->lstio_dbg_type) { - case LST_OPC_SESSION: - rc = lstcon_session_debug(args->lstio_dbg_timeout, - args->lstio_dbg_resultp); - break; + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; - case LST_OPC_BATCHSRV: - client = 0; - /* fallthrough */ - case LST_OPC_BATCHCLI: - if (name == NULL) - goto out; + case LST_OPC_BATCHSRV: + client = 0; + /* Fall through */ + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; - rc = lstcon_batch_debug(args->lstio_dbg_timeout, - name, client, args->lstio_dbg_resultp); - break; + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; - case LST_OPC_GROUP: - if (name == NULL) - goto out; + case LST_OPC_GROUP: + if (name == NULL) + goto out; - rc = lstcon_group_debug(args->lstio_dbg_timeout, - name, args->lstio_dbg_resultp); - break; + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; - case LST_OPC_NODES: - if (args->lstio_dbg_count <= 0 || - args->lstio_dbg_idsp == NULL) - goto out; + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; - rc = lstcon_nodes_debug(args->lstio_dbg_timeout, - args->lstio_dbg_count, - args->lstio_dbg_idsp, - args->lstio_dbg_resultp); - break; + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; - default: - break; - } + default: + break; + } out: - if (name != NULL) - LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + if (name != 
NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); - return rc; + return rc; } static int lst_group_add_ioctl(struct lstio_group_add_args *args) { - char *name; - int rc; + char *name; + int rc; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL || + if (args->lstio_grp_namep == NULL|| args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_add(name); + rc = lstcon_group_add(name); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return rc; + return rc; } static int lst_group_del_ioctl(struct lstio_group_del_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_del(name); + rc = lstcon_group_del(name); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return rc; + return rc; } static int lst_group_update_ioctl(struct lstio_group_update_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_resultp == NULL || - args->lstio_grp_namep == NULL || + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } - - name[args->lstio_grp_nmlen] = 0; - - switch (args->lstio_grp_opc) { - case LST_GROUP_CLEAN: - rc = lstcon_group_clean(name, args->lstio_grp_args); - break; - - case LST_GROUP_REFRESH: - rc = lstcon_group_refresh(name, args->lstio_grp_resultp); - break; - - case LST_GROUP_RMND: - if 
(args->lstio_grp_count <= 0 || - args->lstio_grp_idsp == NULL) { - rc = -EINVAL; - break; - } - rc = lstcon_nodes_remove(name, args->lstio_grp_count, - args->lstio_grp_idsp, - args->lstio_grp_resultp); - break; - - default: - rc = -EINVAL; - break; - } - - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - - return rc; + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; } static int lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) { - unsigned int feats; - int rc; - char *name; + unsigned feats; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_idsp == NULL || /* array of ids */ - args->lstio_grp_count <= 0 || - args->lstio_grp_resultp == NULL || + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || args->lstio_grp_featp == NULL || args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_nodes_add(name, args->lstio_grp_count, + rc = lstcon_nodes_add(name, args->lstio_grp_count, args->lstio_grp_idsp, &feats, args->lstio_grp_resultp); @@ -354,50 +354,50 @@ lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) return -EINVAL; } - return rc; + return rc; } static int lst_group_list_ioctl(struct lstio_group_list_args *args) { if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + return -EACCES; - if (args->lstio_grp_idx < 0 || - args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - return lstcon_group_list(args->lstio_grp_idx, - args->lstio_grp_nmlen, - args->lstio_grp_namep); + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); } static int lst_group_info_ioctl(struct lstio_group_info_args *args) { - char *name; - int ndent; - int index; - int rc; + char *name; + int ndent; + int index; + int rc; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if 
(args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_grp_entp == NULL && /* output: group entry */ - args->lstio_grp_dentsp == NULL) /* output: node entry */ - return -EINVAL; + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; - if (args->lstio_grp_dentsp != NULL) { /* have node entry */ - if (args->lstio_grp_idxp == NULL || /* node index */ - args->lstio_grp_ndentp == NULL) /* # of node entry */ - return -EINVAL; + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; if (copy_from_user(&ndent, args->lstio_grp_ndentp, sizeof(ndent)) || @@ -415,19 +415,19 @@ lst_group_info_ioctl(struct lstio_group_info_args *args) if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_info(name, args->lstio_grp_entp, - &index, &ndent, args->lstio_grp_dentsp); + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); if (rc != 0) - return rc; + return rc; if (args->lstio_grp_dentsp != NULL && (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || @@ -440,20 +440,20 @@ lst_group_info_ioctl(struct lstio_group_info_args *args) static int lst_batch_add_ioctl(struct lstio_batch_add_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -461,32 +461,32 @@ lst_batch_add_ioctl(struct lstio_batch_add_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_add(name); + rc = lstcon_batch_add(name); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_run_ioctl(struct lstio_batch_run_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if 
(name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -494,34 +494,34 @@ lst_batch_run_ioctl(struct lstio_batch_run_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_run(name, args->lstio_bat_timeout, - args->lstio_bat_resultp); + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_resultp == NULL || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -529,37 +529,37 @@ lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_stop(name, args->lstio_bat_force, - args->lstio_bat_resultp); + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_query_ioctl(struct lstio_batch_query_args *args) { - char *name; - int rc; + char *name; + int rc; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_resultp == NULL || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_bat_testidx < 0) - return -EINVAL; + if (args->lstio_bat_testidx < 0) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -567,92 +567,92 @@ lst_batch_query_ioctl(struct lstio_batch_query_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_test_batch_query(name, - args->lstio_bat_testidx, - args->lstio_bat_client, - args->lstio_bat_timeout, - args->lstio_bat_resultp); + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_list_ioctl(struct lstio_batch_list_args *args) { - 
if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (args->lstio_bat_idx < 0 || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_batch_list(args->lstio_bat_idx, - args->lstio_bat_nmlen, - args->lstio_bat_namep); + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); } static int lst_batch_info_ioctl(struct lstio_batch_info_args *args) { - char *name; - int rc; - int index; - int ndent; + char *name; + int rc; + int index; + int ndent; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || /* batch name */ - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_bat_entp == NULL && /* output: batch entry */ - args->lstio_bat_dentsp == NULL) /* output: node entry */ - return -EINVAL; + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; - if (args->lstio_bat_dentsp != NULL) { /* have node entry */ - if (args->lstio_bat_idxp == NULL || /* node index */ - args->lstio_bat_ndentp == NULL) /* # of node entry */ - return -EINVAL; + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; if (copy_from_user(&index, args->lstio_bat_idxp, - sizeof(index)) || + sizeof(index)) || copy_from_user(&ndent, args->lstio_bat_ndentp, - sizeof(ndent))) - return -EFAULT; + sizeof(ndent))) + return -EFAULT; - if (ndent <= 0 || index < 0) - return -EINVAL; - } + if (ndent <= 0 || index < 0) + return -EINVAL; + } - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_info(name, - args->lstio_bat_entp, args->lstio_bat_server, - args->lstio_bat_testidx, &index, &ndent, - args->lstio_bat_dentsp); + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - if (rc != 0) - return rc; + if (rc != 0) + return rc; if (args->lstio_bat_dentsp != NULL && (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || @@ -665,12 +665,12 @@ lst_batch_info_ioctl(struct lstio_batch_info_args *args) static int lst_stat_query_ioctl(struct lstio_stat_args *args) { - int rc; - char *name = NULL; + int rc; + char *name = NULL; - /* TODO: not finished */ - if (args->lstio_sta_key != console_session.ses_key) - 
return -EACCES; + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; if (args->lstio_sta_resultp == NULL) return -EINVAL; @@ -680,9 +680,9 @@ lst_stat_query_ioctl(struct lstio_stat_args *args) return -EINVAL; rc = lstcon_nodes_stat(args->lstio_sta_count, - args->lstio_sta_idsp, - args->lstio_sta_timeout, - args->lstio_sta_resultp); + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); } else if (args->lstio_sta_namep != NULL) { if (args->lstio_sta_nmlen <= 0 || args->lstio_sta_nmlen > LST_NAME_SIZE) @@ -711,12 +711,12 @@ lst_stat_query_ioctl(struct lstio_stat_args *args) static int lst_test_add_ioctl(struct lstio_test_args *args) { - char *batch_name; - char *src_name = NULL; - char *dst_name = NULL; - void *param = NULL; - int ret = 0; - int rc = -ENOMEM; + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; if (args->lstio_tes_resultp == NULL || args->lstio_tes_retp == NULL || @@ -737,12 +737,12 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) args->lstio_tes_span <= 0) return -EINVAL; - /* have parameter, check if parameter length is valid */ - if (args->lstio_tes_param != NULL && - (args->lstio_tes_param_len <= 0 || + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || args->lstio_tes_param_len > - PAGE_SIZE - sizeof(struct lstcon_test))) - return -EINVAL; + PAGE_SIZE - sizeof(lstcon_test_t))) + return -EINVAL; LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); if (batch_name == NULL) @@ -777,17 +777,17 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) goto out; rc = lstcon_test_add(batch_name, - args->lstio_tes_type, - args->lstio_tes_loop, - args->lstio_tes_concur, - args->lstio_tes_dist, args->lstio_tes_span, - src_name, dst_name, param, - args->lstio_tes_param_len, - &ret, args->lstio_tes_resultp); - - if (ret != 0) + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) rc = (copy_to_user(args->lstio_tes_retp, &ret, - sizeof(ret))) ? -EFAULT : 0; + sizeof(ret))) ? 
-EFAULT : 0; out: if (batch_name != NULL) LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); @@ -805,40 +805,36 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) } int -lstcon_ioctl_entry(struct notifier_block *nb, - unsigned long cmd, void *vdata) +lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) { - struct libcfs_ioctl_hdr *hdr = vdata; + char *buf; struct libcfs_ioctl_data *data; - char *buf = NULL; - int rc = -EINVAL; - int opc; + int opc; + int rc; if (cmd != IOC_LIBCFS_LNETST) - goto err; + return -EINVAL; data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); opc = data->ioc_u32[0]; if (data->ioc_plen1 > PAGE_SIZE) - goto err; + return -EINVAL; LIBCFS_ALLOC(buf, data->ioc_plen1); - if (buf == NULL) { - rc = -ENOMEM; - goto err; - } + if (buf == NULL) + return -ENOMEM; /* copy in parameter */ if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { - rc = -EFAULT; - goto out_free_buf; + LIBCFS_FREE(buf, data->ioc_plen1); + return -EFAULT; } mutex_lock(&console_session.ses_mutex); - console_session.ses_laststamp = ktime_get_real_seconds(); + console_session.ses_laststamp = cfs_time_current_sec(); if (console_session.ses_shutdown) { rc = -ESHUTDOWN; @@ -855,8 +851,7 @@ lstcon_ioctl_entry(struct notifier_block *nb, goto out; } - memset(&console_session.ses_trans_stat, 0, - sizeof(struct lstcon_trans_stat)); + memset(&console_session.ses_trans_stat, 0, sizeof(struct lstcon_trans_stat)); switch (opc) { case LSTIO_SESSION_NEW: @@ -915,7 +910,6 @@ lstcon_ioctl_entry(struct notifier_block *nb, break; default: rc = -EINVAL; - goto out; } if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, @@ -923,8 +917,8 @@ lstcon_ioctl_entry(struct notifier_block *nb, rc = -EFAULT; out: mutex_unlock(&console_session.ses_mutex); -out_free_buf: + LIBCFS_FREE(buf, data->ioc_plen1); -err: - return notifier_from_ioctl_errno(rc); + + return rc; } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c index b39756f724a2a..a1ef9ada96804 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -43,13 +43,13 @@ #include "conrpc.h" #include "console.h" -void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *, - struct lstcon_node *, struct lstcon_trans_stat *); +void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, + lstcon_node_t *, struct lstcon_trans_stat *); static void -lstcon_rpc_done(struct srpc_client_rpc *rpc) +lstcon_rpc_done(srpc_client_rpc_t *rpc) { - struct lstcon_rpc *crpc = rpc->crpc_priv; + lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; LASSERT(crpc != NULL && rpc == crpc->crp_rpc); LASSERT(crpc->crp_posted && !crpc->crp_finished); @@ -69,11 +69,11 @@ lstcon_rpc_done(struct srpc_client_rpc *rpc) /* not an orphan RPC */ crpc->crp_finished = 1; - if (crpc->crp_stamp_ns == 0) { + if (crpc->crp_stamp == 0) { /* not aborted */ - LASSERT(crpc->crp_status == 0); + LASSERT (crpc->crp_status == 0); - crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_stamp = cfs_time_current(); crpc->crp_status = rpc->crpc_status; } @@ -85,19 +85,22 @@ lstcon_rpc_done(struct srpc_client_rpc *rpc) } static int -lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, - int bulk_npg, int bulk_len, int embedded, - struct lstcon_rpc *crpc) +lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) { - memset(crpc, 0, sizeof(*crpc)); - 
crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, feats, bulk_npg, bulk_len, lstcon_rpc_done, (void *)crpc); if (crpc->crp_rpc == NULL) return -ENOMEM; + crpc->crp_trans = NULL; crpc->crp_node = nd; + crpc->crp_posted = 0; + crpc->crp_finished = 0; + crpc->crp_unpacked = 0; + crpc->crp_status = 0; + crpc->crp_stamp = 0; crpc->crp_embedded = embedded; INIT_LIST_HEAD(&crpc->crp_link); @@ -107,17 +110,17 @@ lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, } static int -lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, - int bulk_npg, int bulk_len, struct lstcon_rpc **crpcpp) +lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) { - struct lstcon_rpc *crpc = NULL; - int rc; + lstcon_rpc_t *crpc = NULL; + int rc; spin_lock(&console_session.ses_rpc_lock); if (!list_empty(&console_session.ses_rpc_freelist)) { crpc = list_entry(console_session.ses_rpc_freelist.next, - struct lstcon_rpc, crp_link); + lstcon_rpc_t, crp_link); list_del_init(&crpc->crp_link); } @@ -141,10 +144,10 @@ lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, } void -lstcon_rpc_put(struct lstcon_rpc *crpc) +lstcon_rpc_put(lstcon_rpc_t *crpc) { - struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk; - int i; + srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; + int i; LASSERT(list_empty(&crpc->crp_link)); @@ -176,9 +179,9 @@ lstcon_rpc_put(struct lstcon_rpc *crpc) } static void -lstcon_rpc_post(struct lstcon_rpc *crpc) +lstcon_rpc_post(lstcon_rpc_t *crpc) { - struct lstcon_rpc_trans *trans = crpc->crp_trans; + lstcon_rpc_trans_t *trans = crpc->crp_trans; LASSERT (trans != NULL); @@ -229,9 +232,9 @@ lstcon_rpc_trans_name(int transop) int lstcon_rpc_trans_prep(struct list_head *translist, int transop, - struct lstcon_rpc_trans **transpp) + lstcon_rpc_trans_t **transpp) { - struct lstcon_rpc_trans *trans; + lstcon_rpc_trans_t *trans; if (translist != NULL) { list_for_each_entry(trans, translist, tas_link) { @@ -269,18 +272,18 @@ lstcon_rpc_trans_prep(struct list_head *translist, int transop, } void -lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) +lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) { list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); crpc->crp_trans = trans; } void -lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) +lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) { - struct srpc_client_rpc *rpc; - struct lstcon_rpc *crpc; - struct lstcon_node *nd; + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_node_t *nd; list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { rpc = crpc->crp_rpc; @@ -288,16 +291,16 @@ lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) spin_lock(&rpc->crpc_lock); if (!crpc->crp_posted || /* not posted */ - crpc->crp_stamp_ns != 0) { /* rpc done or aborted already */ - if (crpc->crp_stamp_ns == 0) { - crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_stamp != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp == 0) { + crpc->crp_stamp = cfs_time_current(); crpc->crp_status = -EINTR; } spin_unlock(&rpc->crpc_lock); continue; } - crpc->crp_stamp_ns = ktime_get_ns(); + crpc->crp_stamp = cfs_time_current(); crpc->crp_status = error; spin_unlock(&rpc->crpc_lock); @@ -308,16 +311,16 @@ lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) continue; nd = crpc->crp_node; - if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + if 
(cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) continue; - nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + nd->nd_stamp = crpc->crp_stamp; nd->nd_state = LST_NODE_DOWN; } } static int -lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) +lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) { if (console_session.ses_shutdown && !list_empty(&trans->tas_olink)) /* Not an end session RPC */ @@ -327,10 +330,10 @@ lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) } int -lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) +lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) { - struct lstcon_rpc *crpc; - int rc; + lstcon_rpc_t *crpc; + int rc; if (list_empty(&trans->tas_rpcs_list)) return 0; @@ -378,14 +381,14 @@ lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) } static int -lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) +lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) { - struct lstcon_node *nd = crpc->crp_node; - struct srpc_client_rpc *rpc = crpc->crp_rpc; - struct srpc_generic_reply *rep; + lstcon_node_t *nd = crpc->crp_node; + srpc_client_rpc_t *rpc = crpc->crp_rpc; + srpc_generic_reply_t *rep; - LASSERT(nd != NULL && rpc != NULL); - LASSERT(crpc->crp_stamp_ns != 0); + LASSERT (nd != NULL && rpc != NULL); + LASSERT (crpc->crp_stamp != 0); if (crpc->crp_status != 0) { *msgpp = NULL; @@ -398,11 +401,11 @@ lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) crpc->crp_unpacked = 1; } - if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) - return 0; + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + return 0; - nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); - rep = &(*msgpp)->msg_body.reply; + nd->nd_stamp = crpc->crp_stamp; + rep = &(*msgpp)->msg_body.reply; if (rep->sid.ses_nid == LNET_NID_ANY) nd->nd_state = LST_NODE_UNKNOWN; @@ -415,12 +418,11 @@ lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) } void -lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, - struct lstcon_trans_stat *stat) +lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) { - struct lstcon_rpc *crpc; - struct srpc_msg *rep; - int error; + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + int error; LASSERT(stat != NULL); @@ -429,7 +431,7 @@ lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { lstcon_rpc_stat_total(stat, 1); - LASSERT(crpc->crp_stamp_ns != 0); + LASSERT(crpc->crp_stamp != 0); error = lstcon_rpc_get_reply(crpc, &rep); if (error != 0) { @@ -462,20 +464,20 @@ lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, } int -lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, +lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, struct list_head __user *head_up, lstcon_rpc_readent_func_t readent) { - struct list_head tmp; - struct list_head __user *next; - struct lstcon_rpc_ent *ent; - struct srpc_generic_reply *rep; - struct lstcon_rpc *crpc; - struct srpc_msg *msg; - struct lstcon_node *nd; + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + srpc_generic_reply_t *rep; + lstcon_rpc_t *crpc; + srpc_msg_t *msg; + lstcon_node_t *nd; + cfs_duration_t dur; struct timespec64 ts; - int error; - s64 dur; + int error; LASSERT(head_up != NULL); @@ -493,15 +495,15 @@ lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); - LASSERT(crpc->crp_stamp_ns != 0); + LASSERT(crpc->crp_stamp != 0); - 
error = lstcon_rpc_get_reply(crpc, &msg); + error = lstcon_rpc_get_reply(crpc, &msg); - nd = crpc->crp_node; + nd = crpc->crp_node; - dur = crpc->crp_stamp_ns - - console_session.ses_id.ses_stamp * NSEC_PER_MSEC; - ts = ns_to_timespec64(dur); + dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, + (cfs_time_t)console_session.ses_id.ses_stamp); + jiffies_to_timespec64(dur, &ts); if (copy_to_user(&ent->rpe_peer, &nd->nd_id, sizeof(struct lnet_process_id)) || @@ -516,7 +518,7 @@ lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, continue; /* RPC is done */ - rep = (struct srpc_generic_reply *)&msg->msg_body.reply; + rep = (srpc_generic_reply_t *)&msg->msg_body.reply; if (copy_to_user(&ent->rpe_sid, &rep->sid, sizeof(rep->sid)) || @@ -536,12 +538,12 @@ lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, } void -lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) +lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) { - struct srpc_client_rpc *rpc; - struct lstcon_rpc *crpc; - struct lstcon_rpc *tmp; - int count = 0; + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_rpc_t *tmp; + int count = 0; list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { rpc = crpc->crp_rpc; @@ -590,12 +592,12 @@ lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) } int -lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, - unsigned int feats, struct lstcon_rpc **crpc) +lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, + unsigned feats, lstcon_rpc_t **crpc) { - struct srpc_mksn_reqst *msrq; - struct srpc_rmsn_reqst *rsrq; - int rc; + srpc_mksn_reqst_t *msrq; + srpc_rmsn_reqst_t *rsrq; + int rc; switch (transop) { case LST_TRANS_SESNEW: @@ -629,11 +631,10 @@ lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, } int -lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, - struct lstcon_rpc **crpc) +lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) { - struct srpc_debug_reqst *drq; - int rc; + srpc_debug_reqst_t *drq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); if (rc != 0) @@ -648,12 +649,12 @@ lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, } int -lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, - struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) +lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) { - struct lstcon_batch *batch; - struct srpc_batch_reqst *brq; - int rc; + lstcon_batch_t *batch; + srpc_batch_reqst_t *brq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); if (rc != 0) @@ -674,18 +675,17 @@ lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, LASSERT (tsb->tsb_index == 0); - batch = (struct lstcon_batch *)tsb; + batch = (lstcon_batch_t *)tsb; brq->bar_arg = batch->bat_arg; return 0; } int -lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, - struct lstcon_rpc **crpc) +lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) { - struct srpc_stat_reqst *srq; - int rc; + srpc_stat_reqst_t *srq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); if (rc != 0) @@ -715,15 +715,15 @@ lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) } static int -lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, +lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, int dist, int span, int nkiov, lnet_kiov_t *kiov) { struct lnet_process_id_packed *pid; - struct lstcon_ndlink *ndl; - 
struct lstcon_node *nd; - int start; - int end; - int i = 0; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int start; + int end; + int i = 0; LASSERT (dist >= 1); LASSERT (span >= 1); @@ -769,10 +769,9 @@ lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, } static int -lstcon_pingrpc_prep(struct lst_test_ping_param *param, - struct srpc_test_reqst *req) +lstcon_pingrpc_prep(struct lst_test_ping_param *param, srpc_test_reqst_t *req) { - struct test_ping_req *prq = &req->tsr_u.ping; + test_ping_req_t *prq = &req->tsr_u.ping; prq->png_size = param->png_size; prq->png_flags = param->png_flags; @@ -781,10 +780,9 @@ lstcon_pingrpc_prep(struct lst_test_ping_param *param, } static int -lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, - struct srpc_test_reqst *req) +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, srpc_test_reqst_t *req) { - struct test_bulk_req *brq = &req->tsr_u.bulk_v0; + test_bulk_req_t *brq = &req->tsr_u.bulk_v0; brq->blk_opc = param->blk_opc; brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / @@ -796,9 +794,9 @@ lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, static int lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, - struct srpc_test_reqst *req) + srpc_test_reqst_t *req) { - struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; + test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; brq->blk_opc = param->blk_opc; brq->blk_flags = param->blk_flags; @@ -809,17 +807,17 @@ lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, } int -lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, - struct lstcon_test *test, struct lstcon_rpc **crpc) +lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_test_t *test, lstcon_rpc_t **crpc) { - struct lstcon_group *sgrp = test->tes_src_grp; - struct lstcon_group *dgrp = test->tes_dst_grp; - struct srpc_test_reqst *trq; - struct srpc_bulk *bulk; - int i; - int npg = 0; - int nob = 0; - int rc = 0; + lstcon_group_t *sgrp = test->tes_src_grp; + lstcon_group_t *dgrp = test->tes_dst_grp; + srpc_test_reqst_t *trq; + srpc_bulk_t *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; if (transop == LST_TRANS_TSBCLIADD) { npg = sfw_id_pages(test->tes_span); @@ -917,11 +915,11 @@ lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, } static int -lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, - struct lstcon_node *nd, struct srpc_msg *reply) +lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, + lstcon_node_t *nd, srpc_msg_t *reply) { - struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; - int status = mksn_rep->mksn_status; + srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; if (status == 0 && (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { @@ -964,15 +962,15 @@ lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, } void -lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, - struct lstcon_node *nd, struct lstcon_trans_stat *stat) +lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, + lstcon_node_t *nd, struct lstcon_trans_stat *stat) { - struct srpc_rmsn_reply *rmsn_rep; - struct srpc_debug_reply *dbg_rep; - struct srpc_batch_reply *bat_rep; - struct srpc_test_reply *test_rep; - struct srpc_stat_reply *stat_rep; - int rc = 0; + srpc_rmsn_reply_t *rmsn_rep; + srpc_debug_reply_t *dbg_rep; + srpc_batch_reply_t *bat_rep; + srpc_test_reply_t *test_rep; + srpc_stat_reply_t *stat_rep; + int rc = 0; switch 
(trans->tas_opc) { case LST_TRANS_SESNEW: @@ -1087,14 +1085,14 @@ int lstcon_rpc_trans_ndlist(struct list_head *ndlist, struct list_head *translist, int transop, void *arg, lstcon_rpc_cond_func_t condition, - struct lstcon_rpc_trans **transpp) + lstcon_rpc_trans_t **transpp) { - struct lstcon_rpc_trans *trans; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - struct lstcon_rpc *rpc; - unsigned int feats; - int rc; + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + lstcon_rpc_t *rpc; + unsigned feats; + int rc; /* Creating session RPG for list of nodes */ @@ -1132,16 +1130,14 @@ lstcon_rpc_trans_ndlist(struct list_head *ndlist, case LST_TRANS_TSBCLIADD: case LST_TRANS_TSBSRVADD: rc = lstcon_testrpc_prep(nd, transop, feats, - (struct lstcon_test *)arg, - &rpc); + (lstcon_test_t *)arg, &rpc); break; case LST_TRANS_TSBRUN: case LST_TRANS_TSBSTOP: case LST_TRANS_TSBCLIQRY: case LST_TRANS_TSBSRVQRY: rc = lstcon_batrpc_prep(nd, transop, feats, - (struct lstcon_tsb_hdr *)arg, - &rpc); + (lstcon_tsb_hdr_t *)arg, &rpc); break; case LST_TRANS_STATQRY: rc = lstcon_statrpc_prep(nd, feats, &rpc); @@ -1173,16 +1169,16 @@ lstcon_rpc_trans_ndlist(struct list_head *ndlist, static void lstcon_rpc_pinger(void *arg) { - struct stt_timer *ptimer = arg; - struct lstcon_rpc_trans *trans; - struct lstcon_rpc *crpc; - struct srpc_msg *rep; - struct srpc_debug_reqst *drq; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; + stt_timer_t *ptimer = (stt_timer_t *)arg; + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + srpc_debug_reqst_t *drq; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; int intv; - int count = 0; - int rc; + int count = 0; + int rc; /* RPC pinger is a special case of transaction, * it's called by timer at 8 seconds interval. 
@@ -1195,8 +1191,8 @@ lstcon_rpc_pinger(void *arg) } if (!console_session.ses_expired && - ktime_get_real_seconds() - console_session.ses_laststamp > - (time64_t)console_session.ses_timeout) + cfs_time_current_sec() - console_session.ses_laststamp > + (time_t)console_session.ses_timeout) console_session.ses_expired = 1; trans = console_session.ses_ping; @@ -1249,13 +1245,12 @@ lstcon_rpc_pinger(void *arg) lstcon_rpc_put(crpc); } - if (nd->nd_state != LST_NODE_ACTIVE) - continue; + if (nd->nd_state != LST_NODE_ACTIVE) + continue; - intv = div_u64(ktime_ms_delta(ktime_get(), nd->nd_stamp), - MSEC_PER_SEC); + intv = cfs_duration_sec(jiffies - nd->nd_stamp); if (intv < nd->nd_timeout / 2) - continue; + continue; rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, trans->tas_features, 0, 0, 1, crpc); @@ -1282,7 +1277,7 @@ lstcon_rpc_pinger(void *arg) CDEBUG(D_NET, "Ping %d nodes in session\n", count); - ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); stt_add_timer(ptimer); mutex_unlock(&console_session.ses_mutex); @@ -1291,8 +1286,8 @@ lstcon_rpc_pinger(void *arg) int lstcon_rpc_pinger_start(void) { - struct stt_timer *ptimer; - int rc; + stt_timer_t *ptimer; + int rc; LASSERT(list_empty(&console_session.ses_rpc_freelist)); LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); @@ -1305,7 +1300,7 @@ lstcon_rpc_pinger_start(void) } ptimer = &console_session.ses_ping_timer; - ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; + ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); stt_add_timer(ptimer); @@ -1331,10 +1326,10 @@ lstcon_rpc_pinger_stop(void) void lstcon_rpc_cleanup_wait(void) { - struct lstcon_rpc_trans *trans; - struct lstcon_rpc *crpc; - struct list_head *pacer; - struct list_head zlist; + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + struct list_head *pacer; + struct list_head zlist; /* Called with hold of global mutex */ @@ -1342,7 +1337,7 @@ lstcon_rpc_cleanup_wait(void) while (!list_empty(&console_session.ses_trans_list)) { list_for_each(pacer, &console_session.ses_trans_list) { - trans = list_entry(pacer, struct lstcon_rpc_trans, + trans = list_entry(pacer, lstcon_rpc_trans_t, tas_link); CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", @@ -1375,10 +1370,10 @@ lstcon_rpc_cleanup_wait(void) spin_unlock(&console_session.ses_rpc_lock); while (!list_empty(&zlist)) { - crpc = list_entry(zlist.next, struct lstcon_rpc, crp_link); + crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); list_del(&crpc->crp_link); - LIBCFS_FREE(crpc, sizeof(*crpc)); + LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); } } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h index 51d4ee90e07cc..fd56e648491ce 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -40,6 +40,7 @@ #define __LST_CONRPC_H__ #include +#include #include #include "rpc.h" #include "selftest.h" @@ -57,9 +58,9 @@ struct lstcon_tsb_hdr; struct lstcon_test; struct lstcon_node; -struct lstcon_rpc { +typedef struct lstcon_rpc { struct list_head crp_link; /* chain on rpc transaction */ - struct srpc_client_rpc *crp_rpc; /* client rpc */ + srpc_client_rpc_t *crp_rpc; /* client rpc */ struct lstcon_node *crp_node; /* destination node */ struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ @@ -69,10 +70,10 @@ struct lstcon_rpc { /** RPC is embedded in other 
structure and can't free it */ unsigned int crp_embedded:1; int crp_status; /* console rpc errors */ - s64 crp_stamp_ns; /* replied time stamp */ -}; + cfs_time_t crp_stamp; /* replied time stamp */ +} lstcon_rpc_t; -struct lstcon_rpc_trans { +typedef struct lstcon_rpc_trans { /* link chain on owner list */ struct list_head tas_olink; /* link chain on global list */ @@ -86,7 +87,7 @@ struct lstcon_rpc_trans { wait_queue_head_t tas_waitq; /* wait queue head */ atomic_t tas_remaining; /* # of un-scheduled rpcs */ struct list_head tas_rpcs_list; /* queued requests */ -}; +} lstcon_rpc_trans_t; #define LST_TRANS_PRIVATE 0x1000 @@ -104,37 +105,36 @@ struct lstcon_rpc_trans { #define LST_TRANS_STATQRY 0x21 -typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); -typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, +typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, struct lstcon_rpc_ent __user *); int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, - unsigned int version, struct lstcon_rpc **crpc); + unsigned version, lstcon_rpc_t **crpc); int lstcon_dbgrpc_prep(struct lstcon_node *nd, - unsigned int version, struct lstcon_rpc **crpc); + unsigned version, lstcon_rpc_t **crpc); int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, - struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc); + struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, - struct lstcon_test *test, struct lstcon_rpc **crpc); + struct lstcon_test *test, lstcon_rpc_t **crpc); int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, - struct lstcon_rpc **crpc); -void lstcon_rpc_put(struct lstcon_rpc *crpc); + lstcon_rpc_t **crpc); +void lstcon_rpc_put(lstcon_rpc_t *crpc); int lstcon_rpc_trans_prep(struct list_head *translist, - int transop, struct lstcon_rpc_trans **transpp); + int transop, lstcon_rpc_trans_t **transpp); int lstcon_rpc_trans_ndlist(struct list_head *ndlist, struct list_head *translist, int transop, void *arg, lstcon_rpc_cond_func_t condition, - struct lstcon_rpc_trans **transpp); -void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + lstcon_rpc_trans_t **transpp); +void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat); -int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, +int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, struct list_head __user *head_up, lstcon_rpc_readent_func_t readent); -void lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error); -void lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans); -void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, - struct lstcon_rpc *req); -int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout); +void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); +void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); +void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); +int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); int lstcon_rpc_pinger_start(void); void lstcon_rpc_pinger_stop(void); void lstcon_rpc_cleanup_wait(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c index 1e37454732cd1..a9fe8a85a2dd1 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.c +++ 
b/drivers/staging/lustrefsx/lnet/selftest/console.c @@ -36,6 +36,7 @@ * Author: Liang Zhen */ + #include #include #include "console.h" @@ -54,10 +55,10 @@ do { \ (p)->nle_nnode ++; \ } while (0) -struct lstcon_session console_session; +lstcon_session_t console_session; static void -lstcon_node_get(struct lstcon_node *nd) +lstcon_node_get(lstcon_node_t *nd) { LASSERT (nd->nd_ref >= 1); @@ -65,11 +66,10 @@ lstcon_node_get(struct lstcon_node *nd) } static int -lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, - int create) +lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) { - struct lstcon_ndlink *ndl; - unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + lstcon_ndlink_t *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; LASSERT(id.nid != LNET_NID_ANY); @@ -87,20 +87,20 @@ lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, if (!create) return -ENOENT; - LIBCFS_ALLOC(*ndpp, sizeof(**ndpp) + sizeof(*ndl)); - if (*ndpp == NULL) - return -ENOMEM; + LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + if (*ndpp == NULL) + return -ENOMEM; - ndl = (struct lstcon_ndlink *)(*ndpp + 1); + ndl = (lstcon_ndlink_t *)(*ndpp + 1); - ndl->ndl_node = *ndpp; + ndl->ndl_node = *ndpp; - ndl->ndl_node->nd_ref = 1; - ndl->ndl_node->nd_id = id; - ndl->ndl_node->nd_stamp = ktime_get(); - ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; - ndl->ndl_node->nd_timeout = 0; - memset(&ndl->ndl_node->nd_ping, 0, sizeof(ndl->ndl_node->nd_ping)); + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = cfs_time_current(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); /* queued in global hash & list, no refcount is taken by * global hash & list, if caller release his refcount, @@ -112,16 +112,16 @@ lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, } static void -lstcon_node_put(struct lstcon_node *nd) +lstcon_node_put(lstcon_node_t *nd) { - struct lstcon_ndlink *ndl; + lstcon_ndlink_t *ndl; LASSERT(nd->nd_ref > 0); if (--nd->nd_ref > 0) return; - ndl = (struct lstcon_ndlink *)(nd + 1); + ndl = (lstcon_ndlink_t *)(nd + 1); LASSERT(!list_empty(&ndl->ndl_link)); LASSERT(!list_empty(&ndl->ndl_hlink)); @@ -130,17 +130,17 @@ lstcon_node_put(struct lstcon_node *nd) list_del(&ndl->ndl_link); list_del(&ndl->ndl_hlink); - LIBCFS_FREE(nd, sizeof(*nd) + sizeof(*ndl)); + LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); } static int lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, - struct lstcon_ndlink **ndlpp, int create) + lstcon_ndlink_t **ndlpp, int create) { - unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - int rc; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int rc; if (id.nid == LNET_NID_ANY) return -EINVAL; @@ -163,7 +163,7 @@ lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, if (rc != 0) return rc; - LIBCFS_ALLOC(ndl, sizeof(*ndl)); + LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); if (ndl == NULL) { lstcon_node_put(nd); return -ENOMEM; @@ -179,7 +179,7 @@ lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, } static void -lstcon_ndlink_release(struct lstcon_ndlink *ndl) +lstcon_ndlink_release(lstcon_ndlink_t *ndl) { LASSERT(list_empty(&ndl->ndl_link)); 
LASSERT(!list_empty(&ndl->ndl_hlink)); @@ -191,12 +191,12 @@ lstcon_ndlink_release(struct lstcon_ndlink *ndl) } static int -lstcon_group_alloc(char *name, struct lstcon_group **grpp) +lstcon_group_alloc(char *name, lstcon_group_t **grpp) { - struct lstcon_group *grp; - int i; + lstcon_group_t *grp; + int i; - LIBCFS_ALLOC(grp, offsetof(struct lstcon_group, + LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, grp_ndl_hash[LST_NODE_HASHSIZE])); if (grp == NULL) return -ENOMEM; @@ -204,7 +204,7 @@ lstcon_group_alloc(char *name, struct lstcon_group **grpp) grp->grp_ref = 1; if (name != NULL) { if (strlen(name) > sizeof(grp->grp_name)-1) { - LIBCFS_FREE(grp, offsetof(struct lstcon_group, + LIBCFS_FREE(grp, offsetof(lstcon_group_t, grp_ndl_hash[LST_NODE_HASHSIZE])); return -E2BIG; } @@ -224,19 +224,18 @@ lstcon_group_alloc(char *name, struct lstcon_group **grpp) } static void -lstcon_group_addref(struct lstcon_group *grp) +lstcon_group_addref(lstcon_group_t *grp) { grp->grp_ref++; } -static void lstcon_group_ndlink_release(struct lstcon_group *, - struct lstcon_ndlink *); +static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); static void -lstcon_group_drain(struct lstcon_group *grp, int keep) +lstcon_group_drain(lstcon_group_t *grp, int keep) { - struct lstcon_ndlink *ndl; - struct lstcon_ndlink *tmp; + lstcon_ndlink_t *ndl; + lstcon_ndlink_t *tmp; list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { if ((ndl->ndl_node->nd_state & keep) == 0) @@ -245,7 +244,7 @@ lstcon_group_drain(struct lstcon_group *grp, int keep) } static void -lstcon_group_decref(struct lstcon_group *grp) +lstcon_group_decref(lstcon_group_t *grp) { int i; @@ -260,14 +259,14 @@ lstcon_group_decref(struct lstcon_group *grp) for (i = 0; i < LST_NODE_HASHSIZE; i++) LASSERT(list_empty(&grp->grp_ndl_hash[i])); - LIBCFS_FREE(grp, offsetof(struct lstcon_group, + LIBCFS_FREE(grp, offsetof(lstcon_group_t, grp_ndl_hash[LST_NODE_HASHSIZE])); } static int -lstcon_group_find(const char *name, struct lstcon_group **grpp) +lstcon_group_find(const char *name, lstcon_group_t **grpp) { - struct lstcon_group *grp; + lstcon_group_t *grp; list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) @@ -282,8 +281,8 @@ lstcon_group_find(const char *name, struct lstcon_group **grpp) } static int -lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, - struct lstcon_ndlink **ndlpp, int create) +lstcon_group_ndlink_find(lstcon_group_t *grp, struct lnet_process_id id, + lstcon_ndlink_t **ndlpp, int create) { int rc; @@ -301,7 +300,7 @@ lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, } static void -lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) +lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) { list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -309,8 +308,8 @@ lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) } static void -lstcon_group_ndlink_move(struct lstcon_group *old, - struct lstcon_group *new, struct lstcon_ndlink *ndl) +lstcon_group_ndlink_move(lstcon_group_t *old, + lstcon_group_t *new, lstcon_ndlink_t *ndl) { unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % LST_NODE_HASHSIZE; @@ -327,21 +326,21 @@ lstcon_group_ndlink_move(struct lstcon_group *old, } static void -lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) +lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) 
{ - struct lstcon_ndlink *ndl; + lstcon_ndlink_t *ndl; while (!list_empty(&old->grp_ndl_list)) { ndl = list_entry(old->grp_ndl_list.next, - struct lstcon_ndlink, ndl_link); + lstcon_ndlink_t, ndl_link); lstcon_group_ndlink_move(old, new, ndl); } } static int -lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) +lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) { - struct lstcon_group *grp = arg; + lstcon_group_t *grp = (lstcon_group_t *)arg; switch (transop) { case LST_TRANS_SESNEW: @@ -368,10 +367,10 @@ lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) } static int -lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, +lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, struct lstcon_rpc_ent __user *ent_up) { - struct srpc_debug_reply *rep; + srpc_debug_reply_t *rep; switch (transop) { case LST_TRANS_SESNEW: @@ -397,17 +396,16 @@ lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, } static int -lstcon_group_nodes_add(struct lstcon_group *grp, +lstcon_group_nodes_add(lstcon_group_t *grp, int count, struct lnet_process_id __user *ids_up, - unsigned int *featp, - struct list_head __user *result_up) + unsigned *featp, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; - struct lstcon_ndlink *ndl; - struct lstcon_group *tmp; - struct lnet_process_id id; - int i; - int rc; + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + struct lnet_process_id id; + int i; + int rc; rc = lstcon_group_alloc(NULL, &tmp); if (rc != 0) { @@ -465,16 +463,16 @@ lstcon_group_nodes_add(struct lstcon_group *grp, } static int -lstcon_group_nodes_remove(struct lstcon_group *grp, +lstcon_group_nodes_remove(lstcon_group_t *grp, int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; - struct lstcon_ndlink *ndl; - struct lstcon_group *tmp; - struct lnet_process_id id; - int rc; - int i; + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + struct lnet_process_id id; + int rc; + int i; /* End session and remove node from the group */ @@ -522,8 +520,8 @@ lstcon_group_nodes_remove(struct lstcon_group *grp, int lstcon_group_add(char *name) { - struct lstcon_group *grp; - int rc; + lstcon_group_t *grp; + int rc; rc = (lstcon_group_find(name, &grp) == 0)? 
-EEXIST: 0; if (rc != 0) { @@ -547,7 +545,7 @@ int lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, unsigned *featp, struct list_head __user *result_up) { - struct lstcon_group *grp; + lstcon_group_t *grp; int rc; LASSERT (count > 0); @@ -577,9 +575,9 @@ lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, int lstcon_group_del(char *name) { - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - int rc; + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -618,8 +616,8 @@ lstcon_group_del(char *name) int lstcon_group_clean(char *name, int args) { - struct lstcon_group *grp = NULL; - int rc; + lstcon_group_t *grp = NULL; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -652,8 +650,8 @@ lstcon_nodes_remove(char *name, int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - struct lstcon_group *grp = NULL; - int rc; + lstcon_group_t *grp = NULL; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -681,9 +679,9 @@ lstcon_nodes_remove(char *name, int count, int lstcon_group_refresh(char *name, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - int rc; + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -723,7 +721,7 @@ lstcon_group_refresh(char *name, struct list_head __user *result_up) int lstcon_group_list(int index, int len, char __user *name_up) { - struct lstcon_group *grp; + lstcon_group_t *grp; LASSERT(index >= 0); LASSERT(name_up != NULL); @@ -742,10 +740,10 @@ static int lstcon_nodes_getent(struct list_head *head, int *index_p, int *count_p, struct lstcon_node_ent __user *dents_up) { - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - int count = 0; - int index = 0; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int count = 0; + int index = 0; LASSERT(index_p != NULL && count_p != NULL); LASSERT(dents_up != NULL); @@ -784,9 +782,9 @@ lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, struct lstcon_node_ent __user *dents_up) { struct lstcon_ndlist_ent *gentp; - struct lstcon_group *grp; - struct lstcon_ndlink *ndl; - int rc; + lstcon_group_t *grp; + lstcon_ndlink_t *ndl; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -826,9 +824,9 @@ lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, } static int -lstcon_batch_find(const char *name, struct lstcon_batch **batpp) +lstcon_batch_find(const char *name, lstcon_batch_t **batpp) { - struct lstcon_batch *bat; + lstcon_batch_t *bat; list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { @@ -843,9 +841,9 @@ lstcon_batch_find(const char *name, struct lstcon_batch **batpp) int lstcon_batch_add(char *name) { - struct lstcon_batch *bat; - int i; - int rc; + lstcon_batch_t *bat; + int i; + int rc; rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; if (rc != 0) { @@ -853,17 +851,17 @@ lstcon_batch_add(char *name) return rc; } - LIBCFS_ALLOC(bat, sizeof(*bat)); + LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); if (bat == NULL) { CERROR("Can't allocate descriptor for batch %s\n", name); return -ENOMEM; } - LIBCFS_ALLOC(bat->bat_cli_hash, + LIBCFS_ALLOC(bat->bat_cli_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); - if (bat->bat_cli_hash == NULL) { - CERROR("Can't allocate hash for batch %s\n", name); - LIBCFS_FREE(bat, sizeof(*bat)); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); return -ENOMEM; } @@ -873,7 +871,7 @@ lstcon_batch_add(char *name) if (bat->bat_srv_hash == NULL) { CERROR("Can't allocate hash for batch %s\n", name); LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(*bat)); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); return -ENOMEM; } @@ -881,7 +879,7 @@ lstcon_batch_add(char *name) if (strlen(name) > sizeof(bat->bat_name)-1) { LIBCFS_FREE(bat->bat_srv_hash, LST_NODE_HASHSIZE); LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(*bat)); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); return -E2BIG; } strncpy(bat->bat_name, name, sizeof(bat->bat_name)); @@ -909,7 +907,7 @@ lstcon_batch_add(char *name) int lstcon_batch_list(int index, int len, char __user *name_up) { - struct lstcon_batch *bat; + lstcon_batch_t *bat; LASSERT(name_up != NULL); LASSERT(index >= 0); @@ -930,12 +928,12 @@ lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, struct lstcon_node_ent __user *dents_up) { struct lstcon_test_batch_ent *entp; - struct list_head *clilst; - struct list_head *srvlst; - struct lstcon_test *test = NULL; - struct lstcon_batch *bat; - struct lstcon_ndlink *ndl; - int rc; + struct list_head *clilst; + struct list_head *srvlst; + lstcon_test_t *test = NULL; + lstcon_batch_t *bat; + lstcon_ndlink_t *ndl; + int rc; rc = lstcon_batch_find(name, &bat); if (rc != 0) { @@ -998,7 +996,7 @@ lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, } static int -lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) +lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) { switch (transop) { case LST_TRANS_TSBRUN: @@ -1020,10 +1018,10 @@ lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) } static int -lstcon_batch_op(struct lstcon_batch *bat, int transop, +lstcon_batch_op(lstcon_batch_t *bat, int transop, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; + lstcon_rpc_trans_t *trans; int rc; rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, @@ -1046,8 +1044,8 @@ lstcon_batch_op(struct lstcon_batch *bat, int transop, int lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) { - struct lstcon_batch *bat; - int rc; + lstcon_batch_t *bat; + int rc; if (lstcon_batch_find(name, &bat) != 0) { CDEBUG(D_NET, "Can't find batch %s\n", name); @@ -1068,8 +1066,8 @@ lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) int lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) { - struct lstcon_batch *bat; - int rc; + lstcon_batch_t *bat; + int rc; if (lstcon_batch_find(name, &bat) != 0) { CDEBUG(D_NET, "Can't find batch %s\n", name); @@ -1088,17 +1086,17 @@ lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) } static void -lstcon_batch_destroy(struct lstcon_batch *bat) +lstcon_batch_destroy(lstcon_batch_t 
*bat) { - struct lstcon_ndlink *ndl; - struct lstcon_test *test; - int i; + lstcon_ndlink_t *ndl; + lstcon_test_t *test; + int i; list_del(&bat->bat_link); while (!list_empty(&bat->bat_test_list)) { test = list_entry(bat->bat_test_list.next, - struct lstcon_test, tes_link); + lstcon_test_t, tes_link); LASSERT(list_empty(&test->tes_trans_list)); list_del(&test->tes_link); @@ -1106,7 +1104,7 @@ lstcon_batch_destroy(struct lstcon_batch *bat) lstcon_group_decref(test->tes_src_grp); lstcon_group_decref(test->tes_dst_grp); - LIBCFS_FREE(test, offsetof(struct lstcon_test, + LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[test->tes_paramlen])); } @@ -1114,7 +1112,7 @@ lstcon_batch_destroy(struct lstcon_batch *bat) while (!list_empty(&bat->bat_cli_list)) { ndl = list_entry(bat->bat_cli_list.next, - struct lstcon_ndlink, ndl_link); + lstcon_ndlink_t, ndl_link); list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -1122,7 +1120,7 @@ lstcon_batch_destroy(struct lstcon_batch *bat) while (!list_empty(&bat->bat_srv_list)) { ndl = list_entry(bat->bat_srv_list.next, - struct lstcon_ndlink, ndl_link); + lstcon_ndlink_t, ndl_link); list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -1137,18 +1135,19 @@ lstcon_batch_destroy(struct lstcon_batch *bat) sizeof(struct list_head) * LST_NODE_HASHSIZE); LIBCFS_FREE(bat->bat_srv_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(*bat)); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); } static int -lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) +lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) { - struct lstcon_test *test = arg; - struct lstcon_batch *batch; - struct lstcon_ndlink *ndl; + lstcon_test_t *test; + lstcon_batch_t *batch; + lstcon_ndlink_t *ndl; struct list_head *hash; struct list_head *head; + test = (lstcon_test_t *)arg; LASSERT(test != NULL); batch = test->tes_batch; @@ -1184,13 +1183,12 @@ lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) } static int -lstcon_test_nodes_add(struct lstcon_test *test, - struct list_head __user *result_up) +lstcon_test_nodes_add(lstcon_test_t *test, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - int transop; - int rc; + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int transop; + int rc; LASSERT (test->tes_src_grp != NULL); LASSERT (test->tes_dst_grp != NULL); @@ -1237,7 +1235,7 @@ lstcon_test_nodes_add(struct lstcon_test *test, } static int -lstcon_verify_batch(const char *name, struct lstcon_batch **batch) +lstcon_verify_batch(const char *name, lstcon_batch_t **batch) { int rc; @@ -1256,10 +1254,10 @@ lstcon_verify_batch(const char *name, struct lstcon_batch **batch) } static int -lstcon_verify_group(const char *name, struct lstcon_group **grp) +lstcon_verify_group(const char *name, lstcon_group_t **grp) { - int rc; - struct lstcon_ndlink *ndl; + int rc; + lstcon_ndlink_t *ndl; rc = lstcon_group_find(name, grp); if (rc != 0) { @@ -1285,11 +1283,11 @@ lstcon_test_add(char *batch_name, int type, int loop, void *param, int paramlen, int *retp, struct list_head __user *result_up) { - struct lstcon_test *test = NULL; - int rc; - struct lstcon_group *src_grp = NULL; - struct lstcon_group *dst_grp = NULL; - struct lstcon_batch *batch = NULL; + lstcon_test_t *test = NULL; + int rc; + lstcon_group_t *src_grp = NULL; + lstcon_group_t *dst_grp = NULL; + lstcon_batch_t *batch = NULL; /* * verify that a batch of the given name exists, and the groups @@ -1311,7 
+1309,7 @@ lstcon_test_add(char *batch_name, int type, int loop, if (dst_grp->grp_userland) *retp = 1; - LIBCFS_ALLOC(test, offsetof(struct lstcon_test, tes_param[paramlen])); + LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); if (!test) { CERROR("Can't allocate test descriptor\n"); rc = -ENOMEM; @@ -1358,8 +1356,7 @@ lstcon_test_add(char *batch_name, int type, int loop, return rc; out: if (test != NULL) - LIBCFS_FREE(test, offsetof(struct lstcon_test, - tes_param[paramlen])); + LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); if (dst_grp != NULL) lstcon_group_decref(dst_grp); @@ -1371,10 +1368,9 @@ lstcon_test_add(char *batch_name, int type, int loop, } static int -lstcon_test_find(struct lstcon_batch *batch, int idx, - struct lstcon_test **testpp) +lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) { - struct lstcon_test *test; + lstcon_test_t *test; list_for_each_entry(test, &batch->bat_test_list, tes_link) { if (idx == test->tes_hdr.tsb_index) { @@ -1387,10 +1383,10 @@ lstcon_test_find(struct lstcon_batch *batch, int idx, } static int -lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, +lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, struct lstcon_rpc_ent __user *ent_up) { - struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; LASSERT (transop == LST_TRANS_TSBCLIQRY || transop == LST_TRANS_TSBSRVQRY); @@ -1407,14 +1403,14 @@ int lstcon_test_batch_query(char *name, int testidx, int client, int timeout, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; - struct list_head *translist; - struct list_head *ndlist; - struct lstcon_tsb_hdr *hdr; - struct lstcon_batch *batch; - struct lstcon_test *test = NULL; - int transop; - int rc; + lstcon_rpc_trans_t *trans; + struct list_head *translist; + struct list_head *ndlist; + lstcon_tsb_hdr_t *hdr; + lstcon_batch_t *batch; + lstcon_test_t *test = NULL; + int transop; + int rc; rc = lstcon_batch_find(name, &batch); if (rc != 0) { @@ -1466,13 +1462,13 @@ lstcon_test_batch_query(char *name, int testidx, int client, } static int -lstcon_statrpc_readent(int transop, struct srpc_msg *msg, +lstcon_statrpc_readent(int transop, srpc_msg_t *msg, struct lstcon_rpc_ent __user *ent_up) { - struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; - struct sfw_counters __user *sfwk_stat; + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; struct srpc_counters __user *srpc_stat; - struct lnet_counters_common __user *lnet_stat; + struct lnet_counters __user *lnet_stat; if (rep->str_status != 0) return 0; @@ -1480,7 +1476,7 @@ lstcon_statrpc_readent(int transop, struct srpc_msg *msg, sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; srpc_stat = (struct srpc_counters __user *) ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); - lnet_stat = (struct lnet_counters_common __user *) + lnet_stat = (struct lnet_counters __user *) ((char __user *)srpc_stat + sizeof(*srpc_stat)); if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || @@ -1496,7 +1492,7 @@ lstcon_ndlist_stat(struct list_head *ndlist, int timeout, struct list_head __user *result_up) { struct list_head head; - struct lstcon_rpc_trans *trans; + lstcon_rpc_trans_t *trans; int rc; INIT_LIST_HEAD(&head); @@ -1521,8 +1517,8 @@ int lstcon_group_stat(char *grp_name, int timeout, struct list_head __user *result_up) { - struct lstcon_group *grp; - int rc; + lstcon_group_t *grp; + int rc; rc = 
lstcon_group_find(grp_name, &grp); if (rc != 0) { @@ -1541,11 +1537,11 @@ int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, int timeout, struct list_head __user *result_up) { - struct lstcon_ndlink *ndl; - struct lstcon_group *tmp; - struct lnet_process_id id; - int i; - int rc; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + struct lnet_process_id id; + int i; + int rc; rc = lstcon_group_alloc(NULL, &tmp); if (rc != 0) { @@ -1586,8 +1582,8 @@ lstcon_debug_ndlist(struct list_head *ndlist, struct list_head *translist, int timeout, struct list_head __user *result_up) { - struct lstcon_rpc_trans *trans; - int rc; + lstcon_rpc_trans_t *trans; + int rc; rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, NULL, lstcon_sesrpc_condition, &trans); @@ -1616,8 +1612,8 @@ int lstcon_batch_debug(int timeout, char *name, int client, struct list_head __user *result_up) { - struct lstcon_batch *bat; - int rc; + lstcon_batch_t *bat; + int rc; rc = lstcon_batch_find(name, &bat); if (rc != 0) @@ -1634,8 +1630,8 @@ int lstcon_group_debug(int timeout, char *name, struct list_head __user *result_up) { - struct lstcon_group *grp; - int rc; + lstcon_group_t *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) @@ -1649,15 +1645,15 @@ lstcon_group_debug(int timeout, char *name, } int -lstcon_nodes_debug(int timeout, int count, - struct lnet_process_id __user *ids_up, +lstcon_nodes_debug(int timeout, + int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - struct lnet_process_id id; - struct lstcon_ndlink *ndl; - struct lstcon_group *grp; - int i; - int rc; + struct lnet_process_id id; + lstcon_ndlink_t *ndl; + lstcon_group_t *grp; + int i; + int rc; rc = lstcon_group_alloc(NULL, &grp); if (rc != 0) { @@ -1704,11 +1700,11 @@ lstcon_new_session_id(struct lst_sid *sid) { struct lnet_process_id id; - LASSERT(console_session.ses_state == LST_SESSION_NONE); + LASSERT (console_session.ses_state == LST_SESSION_NONE); - LNetGetId(1, &id); - sid->ses_nid = id.nid; - sid->ses_stamp = div_u64(ktime_get_ns(), NSEC_PER_MSEC); + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = cfs_time_current(); } int @@ -1763,7 +1759,7 @@ lstcon_session_new(char *name, int key, unsigned feats, rc = lstcon_rpc_pinger_start(); if (rc != 0) { - struct lstcon_batch *bat = NULL; + lstcon_batch_t *bat = NULL; lstcon_batch_find(LST_DEFAULT_BATCH, &bat); lstcon_batch_destroy(bat); @@ -1787,8 +1783,8 @@ lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, char __user *name_up, int len) { struct lstcon_ndlist_ent *entp; - struct lstcon_ndlink *ndl; - int rc = 0; + lstcon_ndlink_t *ndl; + int rc = 0; if (console_session.ses_state != LST_SESSION_ACTIVE) return -ESRCH; @@ -1818,10 +1814,10 @@ lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, int lstcon_session_end() { - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - struct lstcon_batch *bat; - int rc = 0; + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + lstcon_batch_t *bat; + int rc = 0; LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); @@ -1854,7 +1850,7 @@ lstcon_session_end() /* destroy all batches */ while (!list_empty(&console_session.ses_bat_list)) { bat = list_entry(console_session.ses_bat_list.next, - struct lstcon_batch, bat_link); + lstcon_batch_t, bat_link); lstcon_batch_destroy(bat); } @@ -1862,7 +1858,7 @@ lstcon_session_end() /* destroy all groups */ while (!list_empty(&console_session.ses_grp_list)) { grp = 
list_entry(console_session.ses_grp_list.next, - struct lstcon_group, grp_link); + lstcon_group_t, grp_link); LASSERT(grp->grp_ref == 1); lstcon_group_decref(grp); @@ -1910,15 +1906,15 @@ lstcon_session_feats_check(unsigned feats) } static int -lstcon_acceptor_handle(struct srpc_server_rpc *rpc) +lstcon_acceptor_handle (srpc_server_rpc_t *rpc) { - struct srpc_msg *rep = &rpc->srpc_replymsg; - struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; - struct srpc_join_reply *jrep = &rep->msg_body.join_reply; - struct lstcon_group *grp = NULL; - struct lstcon_ndlink *ndl; - int rc = 0; + srpc_msg_t *rep = &rpc->srpc_replymsg; + srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; + srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; + srpc_join_reply_t *jrep = &rep->msg_body.join_reply; + lstcon_group_t *grp = NULL; + lstcon_ndlink_t *ndl; + int rc = 0; sfw_unpack_message(req); @@ -1993,8 +1989,7 @@ lstcon_acceptor_handle(struct srpc_server_rpc *rpc) return rc; } -static struct srpc_service lstcon_acceptor_service; - +static srpc_service_t lstcon_acceptor_service; static void lstcon_init_acceptor_service(void) { /* initialize selftest console acceptor service table */ @@ -2004,9 +1999,9 @@ static void lstcon_init_acceptor_service(void) lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; } -static struct notifier_block lstcon_ioctl_handler = { - .notifier_call = lstcon_ioctl_entry, -}; +int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); + +DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); /* initialize console */ int @@ -2015,6 +2010,8 @@ lstcon_console_init(void) int i; int rc; + memset(&console_session, 0, sizeof(lstcon_session_t)); + console_session.ses_id = LST_INVALID_SID; console_session.ses_state = LST_SESSION_NONE; console_session.ses_timeout = 0; @@ -2022,7 +2019,7 @@ lstcon_console_init(void) console_session.ses_expired = 0; console_session.ses_feats_updated = 0; console_session.ses_features = LST_FEATS_MASK; - console_session.ses_laststamp = ktime_get_real_seconds(); + console_session.ses_laststamp = cfs_time_current_sec(); mutex_init(&console_session.ses_mutex); @@ -2058,12 +2055,12 @@ lstcon_console_init(void) goto out; } - rc = blocking_notifier_chain_register(&libcfs_ioctl_list, - &lstcon_ioctl_handler); - if (rc == 0) { - lstcon_rpc_module_init(); - return 0; - } + rc = libcfs_register_ioctl(&lstcon_ioctl_handler); + + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } out: srpc_shutdown_service(&lstcon_acceptor_service); @@ -2080,10 +2077,9 @@ lstcon_console_init(void) int lstcon_console_fini(void) { - int i; + int i; - blocking_notifier_chain_unregister(&libcfs_ioctl_list, - &lstcon_ioctl_handler); + libcfs_deregister_ioctl(&lstcon_ioctl_handler); mutex_lock(&console_session.ses_mutex); diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h index 02c76a89627e6..ae76a50b4d173 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.h +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -39,32 +39,29 @@ #ifndef __LST_CONSOLE_H__ #define __LST_CONSOLE_H__ -#include #include +#include #include #include "selftest.h" #include "conrpc.h" -/* node descriptor */ -struct lstcon_node { - struct lnet_process_id nd_id; /* id of the node */ +typedef struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ int nd_ref; /* reference count */ int nd_state; /* state of the node */ int nd_timeout; /* session 
timeout */ - ktime_t nd_stamp; /* last RPC reply timestamp */ - struct lstcon_rpc nd_ping; /* ping rpc */ -}; + cfs_time_t nd_stamp; /* timestamp of last replied RPC */ + struct lstcon_rpc nd_ping; /* ping rpc */ +} lstcon_node_t; /*** node descriptor */ -/* node link descriptor */ -struct lstcon_ndlink { +typedef struct { struct list_head ndl_link; /* chain on list */ struct list_head ndl_hlink; /* chain on hash */ - struct lstcon_node *ndl_node; /* pointer to node */ -}; + lstcon_node_t *ndl_node; /* pointer to node */ +} lstcon_ndlink_t; /*** node link descriptor */ -/* (alias of nodes) group descriptor */ -struct lstcon_group { +typedef struct { struct list_head grp_link; /* chain on global group list */ int grp_ref; /* reference count */ int grp_userland; /* has userland nodes */ @@ -74,20 +71,19 @@ struct lstcon_group { struct list_head grp_trans_list; /* transaction list */ struct list_head grp_ndl_list; /* nodes list */ struct list_head grp_ndl_hash[0];/* hash table for nodes */ -}; +} lstcon_group_t; /*** (alias of nodes) group descriptor */ #define LST_BATCH_IDLE 0xB0 /* idle batch */ #define LST_BATCH_RUNNING 0xB1 /* running batch */ -struct lstcon_tsb_hdr { +typedef struct lstcon_tsb_hdr { struct lst_bid tsb_id; /* batch ID */ int tsb_index; /* test index */ -}; +} lstcon_tsb_hdr_t; -/* (tests ) batch descriptor */ -struct lstcon_batch { +typedef struct { /* test_batch header */ - struct lstcon_tsb_hdr bat_hdr; + lstcon_tsb_hdr_t bat_hdr; /* chain on session's batches list */ struct list_head bat_link; /* # of test */ @@ -103,7 +99,7 @@ struct lstcon_batch { struct list_head bat_test_list; /* list head of transaction */ struct list_head bat_trans_list; - /* list head of client nodes (struct lstcon_node) */ + /* list head of client nodes (lstcon_node_t) */ struct list_head bat_cli_list; /* hash table of client nodes */ struct list_head *bat_cli_hash; @@ -111,16 +107,15 @@ struct lstcon_batch { struct list_head bat_srv_list; /* hash table of server nodes */ struct list_head *bat_srv_hash; -}; +} lstcon_batch_t; /*** (tests ) batch descritptor */ -/* a single test descriptor */ -struct lstcon_test { +typedef struct lstcon_test { /* test batch header */ - struct lstcon_tsb_hdr tes_hdr; + lstcon_tsb_hdr_t tes_hdr; /* chain on batch's tests list */ struct list_head tes_link; /* pointer to batch */ - struct lstcon_batch *tes_batch; + lstcon_batch_t *tes_batch; int tes_type; /* type of the test, i.e: bulk, ping */ int tes_stop_onerr; /* stop on error */ @@ -132,12 +127,12 @@ struct lstcon_test { int tes_cliidx; /* client index, used for RPC creating */ struct list_head tes_trans_list; /* transaction list */ - struct lstcon_group *tes_src_grp; /* group run the test */ - struct lstcon_group *tes_dst_grp; /* target group */ + lstcon_group_t *tes_src_grp; /* group run the test */ + lstcon_group_t *tes_dst_grp; /* target group */ int tes_paramlen; /* test parameter length */ char tes_param[0]; /* test parameter */ -}; +} lstcon_test_t; /*** a single test descriptor */ #define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ #define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ @@ -147,13 +142,13 @@ struct lstcon_test { #define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ -struct lstcon_session { +typedef struct { struct mutex ses_mutex; /* only 1 thread in session */ - struct lst_sid ses_id; /* global session id */ + struct lst_sid ses_id; /* global session id */ int ses_key; /* local session key */ int ses_state; /* state of session */ int ses_timeout; 
/* timeout in seconds */ - time64_t ses_laststamp; /* last operation stamp (seconds) */ + time_t ses_laststamp; /* last operation stamp (seconds) */ /** tests features of the session */ unsigned ses_features; /** features are synced with remote test nodes */ @@ -166,9 +161,9 @@ struct lstcon_session { unsigned ses_expired:1; __u64 ses_id_cookie; /* batch id cookie */ char ses_name[LST_NAME_SIZE]; /* session name */ - struct lstcon_rpc_trans *ses_ping; /* session pinger */ - struct stt_timer ses_ping_timer; /* timer for pinger */ - struct lstcon_trans_stat ses_trans_stat;/* transaction stats */ + lstcon_rpc_trans_t *ses_ping; /* session pinger */ + stt_timer_t ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat; /* transaction stats */ struct list_head ses_trans_list; /* global list of transaction */ struct list_head ses_grp_list; /* global list of groups */ @@ -179,9 +174,9 @@ struct lstcon_session { spinlock_t ses_rpc_lock; /* serialize */ atomic_t ses_rpc_counter;/* # of initialized RPCs */ struct list_head ses_rpc_freelist;/* idle console rpc */ -}; /* session descriptor */ +} lstcon_session_t; /*** session descriptor */ -extern struct lstcon_session console_session; +extern lstcon_session_t console_session; static inline struct lstcon_trans_stat * lstcon_trans_stat(void) @@ -255,8 +250,6 @@ extern int lstcon_test_add(char *batch_name, int type, int loop, void *param, int paramlen, int *retp, struct list_head __user *result_up); -int lstcon_ioctl_entry(struct notifier_block *nb, - unsigned long cmd, void *vdata); int lstcon_console_init(void); int lstcon_console_fini(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c index 000fca9d34e33..b5d430dde00d1 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/framework.c +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -51,49 +51,49 @@ MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never #define sfw_unpack_id(id) \ do { \ - __swab64s(&(id).nid); \ - __swab32s(&(id).pid); \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ } while (0) #define sfw_unpack_sid(sid) \ do { \ - __swab64s(&(sid).ses_nid); \ - __swab64s(&(sid).ses_stamp); \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ } while (0) #define sfw_unpack_fw_counters(fc) \ do { \ - __swab32s(&(fc).running_ms); \ - __swab32s(&(fc).active_batches); \ - __swab32s(&(fc).zombie_sessions); \ - __swab32s(&(fc).brw_errors); \ - __swab32s(&(fc).ping_errors); \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ } while (0) #define sfw_unpack_rpc_counters(rc) \ do { \ - __swab32s(&(rc).errors); \ - __swab32s(&(rc).rpcs_sent); \ - __swab32s(&(rc).rpcs_rcvd); \ - __swab32s(&(rc).rpcs_dropped); \ - __swab32s(&(rc).rpcs_expired); \ - __swab64s(&(rc).bulk_get); \ - __swab64s(&(rc).bulk_put); \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ } while (0) #define sfw_unpack_lnet_counters(lc) \ do { \ - __swab32s(&(lc).lcc_errors); \ - __swab32s(&(lc).lcc_msgs_max); \ - __swab32s(&(lc).lcc_msgs_alloc); \ - __swab32s(&(lc).lcc_send_count); \ - __swab32s(&(lc).lcc_recv_count); \ - __swab32s(&(lc).lcc_drop_count); \ - __swab32s(&(lc).lcc_route_count); \ - __swab64s(&(lc).lcc_send_length); \ - __swab64s(&(lc).lcc_recv_length); \ - __swab64s(&(lc).lcc_drop_length); \ - __swab64s(&(lc).lcc_route_length); \ + __swab32s(&(lc).errors); \ + __swab32s(&(lc).msgs_max); \ + __swab32s(&(lc).msgs_alloc); \ + __swab32s(&(lc).send_count); \ + __swab32s(&(lc).recv_count); \ + __swab32s(&(lc).drop_count); \ + __swab32s(&(lc).route_count); \ + __swab64s(&(lc).send_length); \ + __swab64s(&(lc).recv_length); \ + __swab64s(&(lc).drop_length); \ + __swab64s(&(lc).route_length); \ } while (0) #define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) @@ -111,21 +111,21 @@ static struct smoketest_framework { /* serialise */ spinlock_t fw_lock; /* _the_ session */ - struct sfw_session *fw_session; + sfw_session_t *fw_session; /* shutdown in progress */ int fw_shuttingdown; /* running RPC */ - struct srpc_server_rpc *fw_active_srpc; + srpc_server_rpc_t *fw_active_srpc; } sfw_data; /* forward ref's */ -static int sfw_stop_batch(struct sfw_batch *tsb, int force); -static void sfw_destroy_session(struct sfw_session *sn); +int sfw_stop_batch (sfw_batch_t *tsb, int force); +void sfw_destroy_session (sfw_session_t *sn); -static inline struct sfw_test_case * +static inline sfw_test_case_t * sfw_find_test_case(int id) { - struct sfw_test_case *tsc; + sfw_test_case_t *tsc; LASSERT(id <= SRPC_SERVICE_MAX_ID); LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); @@ -139,10 +139,9 @@ sfw_find_test_case(int id) } static int -sfw_register_test(struct srpc_service *service, - struct sfw_test_client_ops *cliops) +sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) { - struct sfw_test_case *tsc; + sfw_test_case_t *tsc; if (sfw_find_test_case(service->sv_id) != NULL) { CERROR ("Failed to register test %s (%d)\n", @@ -150,7 +149,7 @@ sfw_register_test(struct srpc_service *service, return 
-EEXIST; } - LIBCFS_ALLOC(tsc, sizeof(*tsc)); + LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); if (tsc == NULL) return -ENOMEM; @@ -164,8 +163,8 @@ sfw_register_test(struct srpc_service *service, static void sfw_add_session_timer (void) { - struct sfw_session *sn = sfw_data.fw_session; - struct stt_timer *timer = &sn->sn_timer; + sfw_session_t *sn = sfw_data.fw_session; + stt_timer_t *timer = &sn->sn_timer; LASSERT (!sfw_data.fw_shuttingdown); @@ -175,7 +174,8 @@ sfw_add_session_timer (void) LASSERT (!sn->sn_timer_active); sn->sn_timer_active = 1; - timer->stt_expires = ktime_get_real_seconds()+ sn->sn_timeout; + timer->stt_expires = cfs_time_add(sn->sn_timeout, + cfs_time_current_sec()); stt_add_timer(timer); return; } @@ -183,7 +183,7 @@ sfw_add_session_timer (void) static int sfw_del_session_timer (void) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; if (sn == NULL || !sn->sn_timer_active) return 0; @@ -203,10 +203,10 @@ static void sfw_deactivate_session (void) __must_hold(&sfw_data.fw_lock) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; int nactive = 0; - struct sfw_batch *tsb; - struct sfw_test_case *tsc; + sfw_batch_t *tsb; + sfw_test_case_t *tsc; if (sn == NULL) return; @@ -246,7 +246,7 @@ __must_hold(&sfw_data.fw_lock) static void sfw_session_expired (void *data) { - struct sfw_session *sn = data; + sfw_session_t *sn = data; spin_lock(&sfw_data.fw_lock); @@ -264,12 +264,12 @@ sfw_session_expired (void *data) } static inline void -sfw_init_session(struct sfw_session *sn, struct lst_sid sid, +sfw_init_session(sfw_session_t *sn, struct lst_sid sid, unsigned features, const char *name) { - struct stt_timer *timer = &sn->sn_timer; + stt_timer_t *timer = &sn->sn_timer; - memset(sn, 0, sizeof(struct sfw_session)); + memset(sn, 0, sizeof(sfw_session_t)); INIT_LIST_HEAD(&sn->sn_list); INIT_LIST_HEAD(&sn->sn_batches); atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ @@ -277,14 +277,14 @@ sfw_init_session(struct sfw_session *sn, struct lst_sid sid, atomic_set(&sn->sn_ping_errors, 0); strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); - sn->sn_timer_active = 0; - sn->sn_id = sid; - sn->sn_features = features; - sn->sn_timeout = session_timeout; - sn->sn_started = ktime_get(); + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = cfs_time_current(); - timer->stt_data = sn; - timer->stt_func = sfw_session_expired; + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; INIT_LIST_HEAD(&timer->stt_list); } @@ -308,7 +308,7 @@ sfw_server_rpc_done(struct srpc_server_rpc *rpc) } static void -sfw_client_rpc_fini(struct srpc_client_rpc *rpc) +sfw_client_rpc_fini (srpc_client_rpc_t *rpc) { LASSERT(rpc->crpc_bulk.bk_niov == 0); LASSERT(list_empty(&rpc->crpc_list)); @@ -329,11 +329,11 @@ sfw_client_rpc_fini(struct srpc_client_rpc *rpc) spin_unlock(&sfw_data.fw_lock); } -static struct sfw_batch * +static sfw_batch_t * sfw_find_batch(struct lst_bid bid) { - struct sfw_session *sn = sfw_data.fw_session; - struct sfw_batch *bat; + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; LASSERT(sn != NULL); @@ -345,11 +345,11 @@ sfw_find_batch(struct lst_bid bid) return NULL; } -static struct sfw_batch * +static sfw_batch_t * sfw_bid2batch(struct lst_bid bid) { - struct sfw_session *sn = sfw_data.fw_session; - struct sfw_batch *bat; + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; LASSERT (sn != NULL); @@ -357,7 
+357,7 @@ sfw_bid2batch(struct lst_bid bid) if (bat != NULL) return bat; - LIBCFS_ALLOC(bat, sizeof(*bat)); + LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); if (bat == NULL) return NULL; @@ -372,11 +372,11 @@ sfw_bid2batch(struct lst_bid bid) } static int -sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) +sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; struct sfw_counters *cnt = &reply->str_fw; - struct sfw_batch *bat; + sfw_batch_t *bat; reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -390,14 +390,14 @@ sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) return 0; } - lnet_counters_get_common(&reply->str_lnet); + lnet_counters_get(&reply->str_lnet); srpc_get_counters(&reply->str_rpc); /* send over the msecs since the session was started - with 32 bits to send, this is ~49 days */ - cnt->running_ms = ktime_ms_delta(ktime_get(), sn->sn_started); - cnt->brw_errors = atomic_read(&sn->sn_brw_errors); - cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->running_ms = jiffies_to_msecs(jiffies - sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); cnt->active_batches = 0; @@ -411,12 +411,12 @@ sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) } int -sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) +sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) { - struct sfw_session *sn = sfw_data.fw_session; - struct srpc_msg *msg = container_of(request, struct srpc_msg, - msg_body.mksn_reqst); - int cplen = 0; + sfw_session_t *sn = sfw_data.fw_session; + srpc_msg_t *msg = container_of(request, srpc_msg_t, + msg_body.mksn_reqst); + int cplen = 0; if (request->mksn_sid.ses_nid == LNET_NID_ANY) { reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -446,7 +446,7 @@ sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) /* reject the request if it requires unknown features * NB: old version will always accept all features because it's not - * aware of struct srpc_msg::msg_ses_feats, it's a defect but it's also + * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also * harmless because it will return zero feature to console, and it's * console's responsibility to make sure all nodes in a session have * same feature mask. */ @@ -456,7 +456,7 @@ sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) } /* brand new or create by force */ - LIBCFS_ALLOC(sn, sizeof(*sn)); + LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); if (sn == NULL) { CERROR("dropping RPC mksn under memory pressure\n"); return -ENOMEM; @@ -480,10 +480,9 @@ sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) } static int -sfw_remove_session(struct srpc_rmsn_reqst *request, - struct srpc_rmsn_reply *reply) +sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; reply->rmsn_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; @@ -513,10 +512,9 @@ sfw_remove_session(struct srpc_rmsn_reqst *request, } static int -sfw_debug_session(struct srpc_debug_reqst *request, - struct srpc_debug_reply *reply) +sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; if (sn == NULL) { reply->dbg_status = ESRCH; @@ -535,10 +533,10 @@ sfw_debug_session(struct srpc_debug_reqst *request, } static void -sfw_test_rpc_fini(struct srpc_client_rpc *rpc) +sfw_test_rpc_fini (srpc_client_rpc_t *rpc) { - struct sfw_test_unit *tsu = rpc->crpc_priv; - struct sfw_test_instance *tsi = tsu->tsu_instance; + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; /* Called with hold of tsi->tsi_lock */ LASSERT(list_empty(&rpc->crpc_list)); @@ -546,7 +544,7 @@ sfw_test_rpc_fini(struct srpc_client_rpc *rpc) } static inline int -sfw_test_buffers(struct sfw_test_instance *tsi) +sfw_test_buffers(sfw_test_instance_t *tsi) { struct sfw_test_case *tsc; struct srpc_service *svc; @@ -620,10 +618,10 @@ sfw_unload_test(struct sfw_test_instance *tsi) } static void -sfw_destroy_test_instance(struct sfw_test_instance *tsi) +sfw_destroy_test_instance (sfw_test_instance_t *tsi) { - struct srpc_client_rpc *rpc; - struct sfw_test_unit *tsu; + srpc_client_rpc_t *rpc; + sfw_test_unit_t *tsu; if (!tsi->tsi_is_client) goto clean; @@ -635,14 +633,14 @@ sfw_destroy_test_instance(struct sfw_test_instance *tsi) while (!list_empty(&tsi->tsi_units)) { tsu = list_entry(tsi->tsi_units.next, - struct sfw_test_unit, tsu_list); + sfw_test_unit_t, tsu_list); list_del(&tsu->tsu_list); LIBCFS_FREE(tsu, sizeof(*tsu)); } while (!list_empty(&tsi->tsi_free_rpcs)) { rpc = list_entry(tsi->tsi_free_rpcs.next, - struct srpc_client_rpc, crpc_list); + srpc_client_rpc_t, crpc_list); list_del(&rpc->crpc_list); LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); } @@ -654,35 +652,35 @@ sfw_destroy_test_instance(struct sfw_test_instance *tsi) } static void -sfw_destroy_batch(struct sfw_batch *tsb) +sfw_destroy_batch (sfw_batch_t *tsb) { - struct sfw_test_instance *tsi; + sfw_test_instance_t *tsi; LASSERT(!sfw_batch_active(tsb)); LASSERT(list_empty(&tsb->bat_list)); while (!list_empty(&tsb->bat_tests)) { tsi = list_entry(tsb->bat_tests.next, - struct sfw_test_instance, tsi_list); + sfw_test_instance_t, tsi_list); list_del_init(&tsi->tsi_list); sfw_destroy_test_instance(tsi); } - LIBCFS_FREE(tsb, sizeof(*tsb)); + LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); return; } -static void -sfw_destroy_session(struct sfw_session *sn) +void +sfw_destroy_session (sfw_session_t *sn) { - struct sfw_batch *batch; + sfw_batch_t *batch; LASSERT(list_empty(&sn->sn_list)); LASSERT(sn != sfw_data.fw_session); while (!list_empty(&sn->sn_batches)) { batch = list_entry(sn->sn_batches.next, - struct sfw_batch, bat_list); + sfw_batch_t, bat_list); list_del_init(&batch->bat_list); sfw_destroy_batch(batch); } @@ -693,9 +691,9 @@ sfw_destroy_session(struct sfw_session *sn) } static void -sfw_unpack_addtest_req(struct srpc_msg *msg) +sfw_unpack_addtest_req(srpc_msg_t *msg) { - struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); LASSERT (req->tsr_is_client); @@ -707,14 +705,14 @@ sfw_unpack_addtest_req(struct srpc_msg *msg) if (req->tsr_service == SRPC_SERVICE_BRW) { if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { - struct test_bulk_req *bulk = 
&req->tsr_u.bulk_v0; + test_bulk_req_t *bulk = &req->tsr_u.bulk_v0; __swab32s(&bulk->blk_opc); __swab32s(&bulk->blk_npg); __swab32s(&bulk->blk_flags); } else { - struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; + test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; __swab16s(&bulk->blk_opc); __swab16s(&bulk->blk_flags); @@ -726,7 +724,7 @@ sfw_unpack_addtest_req(struct srpc_msg *msg) } if (req->tsr_service == SRPC_SERVICE_PING) { - struct test_ping_req *ping = &req->tsr_u.ping; + test_ping_req_t *ping = &req->tsr_u.ping; __swab32s(&ping->png_size); __swab32s(&ping->png_flags); @@ -738,16 +736,16 @@ sfw_unpack_addtest_req(struct srpc_msg *msg) } static int -sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) +sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) { - struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; - struct srpc_bulk *bk = rpc->srpc_bulk; - int ndest = req->tsr_ndest; - struct sfw_test_unit *tsu; - struct sfw_test_instance *tsi; - int i; - int rc; + srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + srpc_bulk_t *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + int i; + int rc; LIBCFS_ALLOC(tsi, sizeof(*tsi)); if (tsi == NULL) { @@ -804,7 +802,7 @@ sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) sfw_unpack_id(id); for (j = 0; j < tsi->tsi_concur; j++) { - LIBCFS_ALLOC(tsu, sizeof(*tsu)); + LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); if (tsu == NULL) { rc = -ENOMEM; CERROR ("Can't allocate tsu for %d\n", @@ -833,11 +831,11 @@ sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) } static void -sfw_test_unit_done(struct sfw_test_unit *tsu) +sfw_test_unit_done (sfw_test_unit_t *tsu) { - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_batch *tsb = tsi->tsi_batch; - struct sfw_session *sn = tsb->bat_session; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_batch_t *tsb = tsi->tsi_batch; + sfw_session_t *sn = tsb->bat_session; LASSERT (sfw_test_active(tsi)); @@ -876,10 +874,10 @@ sfw_test_unit_done(struct sfw_test_unit *tsu) } static void -sfw_test_rpc_done(struct srpc_client_rpc *rpc) +sfw_test_rpc_done (srpc_client_rpc_t *rpc) { - struct sfw_test_unit *tsu = rpc->crpc_priv; - struct sfw_test_instance *tsi = tsu->tsu_instance; + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; int done = 0; tsi->tsi_ops->tso_done_rpc(tsu, rpc); @@ -912,12 +910,12 @@ sfw_test_rpc_done(struct srpc_client_rpc *rpc) } int -sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, +sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, unsigned features, int nblk, int blklen, - struct srpc_client_rpc **rpcpp) + srpc_client_rpc_t **rpcpp) { - struct srpc_client_rpc *rpc = NULL; - struct sfw_test_instance *tsi = tsu->tsu_instance; + srpc_client_rpc_t *rpc = NULL; + sfw_test_instance_t *tsi = tsu->tsu_instance; spin_lock(&tsi->tsi_lock); @@ -926,7 +924,7 @@ sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, if (!list_empty(&tsi->tsi_free_rpcs)) { /* pick request from buffer */ rpc = list_entry(tsi->tsi_free_rpcs.next, - struct srpc_client_rpc, crpc_list); + srpc_client_rpc_t, crpc_list); LASSERT(nblk == rpc->crpc_bulk.bk_niov); list_del_init(&rpc->crpc_list); } @@ -955,11 +953,11 @@ sfw_create_test_rpc(struct sfw_test_unit *tsu, struct 
lnet_process_id peer, } static int -sfw_run_test(struct swi_workitem *wi) +sfw_run_test (swi_workitem_t *wi) { - struct sfw_test_unit *tsu = wi->swi_workitem.wi_data; - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct srpc_client_rpc *rpc = NULL; + sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; + sfw_test_instance_t *tsi = tsu->tsu_instance; + srpc_client_rpc_t *rpc = NULL; LASSERT (wi == &tsu->tsu_worker); @@ -1004,11 +1002,11 @@ sfw_run_test(struct swi_workitem *wi) } static int -sfw_run_batch(struct sfw_batch *tsb) +sfw_run_batch (sfw_batch_t *tsb) { - struct swi_workitem *wi; - struct sfw_test_unit *tsu; - struct sfw_test_instance *tsi; + swi_workitem_t *wi; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; if (sfw_batch_active(tsb)) { CDEBUG(D_NET, "Batch already active: %llu (%d)\n", @@ -1040,11 +1038,11 @@ sfw_run_batch(struct sfw_batch *tsb) return 0; } -static int -sfw_stop_batch(struct sfw_batch *tsb, int force) +int +sfw_stop_batch (sfw_batch_t *tsb, int force) { - struct sfw_test_instance *tsi; - struct srpc_client_rpc *rpc; + sfw_test_instance_t *tsi; + srpc_client_rpc_t *rpc; if (!sfw_batch_active(tsb)) { CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); @@ -1083,10 +1081,9 @@ sfw_stop_batch(struct sfw_batch *tsb, int force) } static int -sfw_query_batch(struct sfw_batch *tsb, int testidx, - struct srpc_batch_reply *reply) +sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) { - struct sfw_test_instance *tsi; + sfw_test_instance_t *tsi; if (testidx < 0) return -EINVAL; @@ -1108,7 +1105,7 @@ sfw_query_batch(struct sfw_batch *tsb, int testidx, } void -sfw_free_pages(struct srpc_server_rpc *rpc) +sfw_free_pages (srpc_server_rpc_t *rpc) { srpc_free_bulk(rpc->srpc_bulk); rpc->srpc_bulk = NULL; @@ -1129,13 +1126,13 @@ sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, } static int -sfw_add_test(struct srpc_server_rpc *rpc) +sfw_add_test (srpc_server_rpc_t *rpc) { - struct sfw_session *sn = sfw_data.fw_session; - struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; - struct srpc_test_reqst *request; - int rc; - struct sfw_batch *bat; + sfw_session_t *sn = sfw_data.fw_session; + srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + srpc_test_reqst_t *request; + int rc; + sfw_batch_t *bat; request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -1199,12 +1196,11 @@ sfw_add_test(struct srpc_server_rpc *rpc) } static int -sfw_control_batch(struct srpc_batch_reqst *request, - struct srpc_batch_reply *reply) +sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; int rc = 0; - struct sfw_batch *bat; + sfw_batch_t *bat; reply->bar_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; @@ -1244,8 +1240,8 @@ static int sfw_handle_server_rpc(struct srpc_server_rpc *rpc) { struct srpc_service *sv = rpc->srpc_scd->scd_svc; - struct srpc_msg *reply = &rpc->srpc_replymsg; - struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; + srpc_msg_t *reply = &rpc->srpc_replymsg; + srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; unsigned features = LST_FEATS_MASK; int rc = 0; @@ -1278,7 +1274,7 @@ sfw_handle_server_rpc(struct srpc_server_rpc *rpc) if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && sv->sv_id != SRPC_SERVICE_DEBUG) { - struct sfw_session *sn = sfw_data.fw_session; + sfw_session_t *sn = sfw_data.fw_session; if (sn != NULL && sn->sn_features != request->msg_ses_feats) { @@ -1394,12 +1390,12 @@ sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) return rc; } -struct srpc_client_rpc * +srpc_client_rpc_t * sfw_create_rpc(struct lnet_process_id peer, int service, unsigned features, int nbulkiov, int bulklen, - void (*done)(struct srpc_client_rpc *), void *priv) + void (*done)(srpc_client_rpc_t *), void *priv) { - struct srpc_client_rpc *rpc = NULL; + srpc_client_rpc_t *rpc = NULL; spin_lock(&sfw_data.fw_lock); @@ -1408,7 +1404,7 @@ sfw_create_rpc(struct lnet_process_id peer, int service, if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - struct srpc_client_rpc, crpc_list); + srpc_client_rpc_t, crpc_list); list_del(&rpc->crpc_list); srpc_init_client_rpc(rpc, peer, service, 0, 0, @@ -1432,7 +1428,7 @@ sfw_create_rpc(struct lnet_process_id peer, int service, } void -sfw_unpack_message(struct srpc_msg *msg) +sfw_unpack_message (srpc_msg_t *msg) { if (msg->msg_magic == SRPC_MSG_MAGIC) return; /* no flipping needed */ @@ -1441,7 +1437,7 @@ sfw_unpack_message(struct srpc_msg *msg) LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); if (msg->msg_type == SRPC_MSG_STAT_REQST) { - struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; + srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; __swab32s(&req->str_type); __swab64s(&req->str_rpyid); @@ -1450,7 +1446,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_STAT_REPLY) { - struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; __swab32s(&rep->str_status); sfw_unpack_sid(rep->str_sid); @@ -1461,7 +1457,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_MKSN_REQST) { - struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; + srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; __swab64s(&req->mksn_rpyid); __swab32s(&req->mksn_force); @@ -1470,7 +1466,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { - struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; + srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; __swab32s(&rep->mksn_status); __swab32s(&rep->mksn_timeout); @@ -1479,7 +1475,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_RMSN_REQST) { - struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; + srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; __swab64s(&req->rmsn_rpyid); sfw_unpack_sid(req->rmsn_sid); @@ -1487,7 +1483,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { - struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; + srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; __swab32s(&rep->rmsn_status); sfw_unpack_sid(rep->rmsn_sid); @@ -1495,7 +1491,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if 
(msg->msg_type == SRPC_MSG_DEBUG_REQST) { - struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; + srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; __swab64s(&req->dbg_rpyid); __swab32s(&req->dbg_flags); @@ -1504,7 +1500,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { - struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; + srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; __swab32s(&rep->dbg_nbatch); __swab32s(&rep->dbg_timeout); @@ -1513,7 +1509,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_BATCH_REQST) { - struct srpc_batch_reqst *req = &msg->msg_body.bat_reqst; + srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; __swab32s(&req->bar_opc); __swab64s(&req->bar_rpyid); @@ -1525,7 +1521,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { - struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; __swab32s(&rep->bar_status); sfw_unpack_sid(rep->bar_sid); @@ -1533,7 +1529,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_TEST_REQST) { - struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; __swab64s(&req->tsr_rpyid); __swab64s(&req->tsr_bulkid); @@ -1547,7 +1543,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_TEST_REPLY) { - struct srpc_test_reply *rep = &msg->msg_body.tes_reply; + srpc_test_reply_t *rep = &msg->msg_body.tes_reply; __swab32s(&rep->tsr_status); sfw_unpack_sid(rep->tsr_sid); @@ -1555,7 +1551,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_JOIN_REQST) { - struct srpc_join_reqst *req = &msg->msg_body.join_reqst; + srpc_join_reqst_t *req = &msg->msg_body.join_reqst; __swab64s(&req->join_rpyid); sfw_unpack_sid(req->join_sid); @@ -1563,7 +1559,7 @@ sfw_unpack_message(struct srpc_msg *msg) } if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { - struct srpc_join_reply *rep = &msg->msg_body.join_reply; + srpc_join_reply_t *rep = &msg->msg_body.join_reply; __swab32s(&rep->join_status); __swab32s(&rep->join_timeout); @@ -1576,7 +1572,7 @@ sfw_unpack_message(struct srpc_msg *msg) } void -sfw_abort_rpc(struct srpc_client_rpc *rpc) +sfw_abort_rpc (srpc_client_rpc_t *rpc) { LASSERT(atomic_read(&rpc->crpc_refcount) > 0); LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); @@ -1588,7 +1584,7 @@ sfw_abort_rpc(struct srpc_client_rpc *rpc) } void -sfw_post_rpc(struct srpc_client_rpc *rpc) +sfw_post_rpc (srpc_client_rpc_t *rpc) { spin_lock(&rpc->crpc_lock); @@ -1604,14 +1600,44 @@ sfw_post_rpc(struct srpc_client_rpc *rpc) return; } -static struct srpc_service sfw_services[] = { - { .sv_id = SRPC_SERVICE_DEBUG, .sv_name = "debug", }, - { .sv_id = SRPC_SERVICE_QUERY_STAT, .sv_name = "query stats", }, - { .sv_id = SRPC_SERVICE_MAKE_SESSION, .sv_name = "make session", }, - { .sv_id = SRPC_SERVICE_REMOVE_SESSION, .sv_name = "remove session", }, - { .sv_id = SRPC_SERVICE_BATCH, .sv_name = "batch service", }, - { .sv_id = SRPC_SERVICE_TEST, .sv_name = "test service", }, - { .sv_id = 0, } }; +static srpc_service_t sfw_services[] = +{ + { + /* sv_id */ SRPC_SERVICE_DEBUG, + /* sv_name */ "debug", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_QUERY_STAT, + /* sv_name */ "query stats", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_MAKE_SESSION, + /* sv_name */ "make session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, + /* sv_name */ "remove session", + 0 + }, + { + /* sv_id */ 
SRPC_SERVICE_BATCH, + /* sv_name */ "batch service", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_TEST, + /* sv_name */ "test service", + 0 + }, + { + /* sv_id */ 0, + /* sv_name */ NULL, + 0 + } +}; int sfw_startup (void) @@ -1619,8 +1645,8 @@ sfw_startup (void) int i; int rc; int error; - struct srpc_service *sv; - struct sfw_test_case *tsc; + srpc_service_t *sv; + sfw_test_case_t *tsc; if (session_timeout < 0) { @@ -1714,8 +1740,8 @@ sfw_startup (void) void sfw_shutdown (void) { - struct srpc_service *sv; - struct sfw_test_case *tsc; + srpc_service_t *sv; + sfw_test_case_t *tsc; int i; spin_lock(&sfw_data.fw_lock); @@ -1752,10 +1778,10 @@ sfw_shutdown (void) } while (!list_empty(&sfw_data.fw_zombie_rpcs)) { - struct srpc_client_rpc *rpc; + srpc_client_rpc_t *rpc; rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - struct srpc_client_rpc, crpc_list); + srpc_client_rpc_t, crpc_list); list_del(&rpc->crpc_list); LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); @@ -1771,7 +1797,7 @@ sfw_shutdown (void) while (!list_empty(&sfw_data.fw_tests)) { tsc = list_entry(sfw_data.fw_tests.next, - struct sfw_test_case, tsc_list); + sfw_test_case_t, tsc_list); srpc_wait_service_shutdown(tsc->tsc_srv_service); diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c index 5324957500940..56212a840dcc4 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/module.c +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -52,58 +52,61 @@ struct cfs_wi_sched **lst_sched_test; static void lnet_selftest_exit(void) { - int i; - - switch (lst_init_step) { - case LST_INIT_CONSOLE: - lstcon_console_fini(); - /* fallthrough */ - case LST_INIT_FW: - sfw_shutdown(); - /* fallthrough */ - case LST_INIT_RPC: - srpc_shutdown(); - /* fallthrough */ - case LST_INIT_WI_TEST: - for (i = 0; - i < cfs_cpt_number(lnet_cpt_table()); i++) { - if (lst_sched_test[i] == NULL) - continue; - cfs_wi_sched_destroy(lst_sched_test[i]); - } - LIBCFS_FREE(lst_sched_test, - sizeof(lst_sched_test[0]) * - cfs_cpt_number(lnet_cpt_table())); - lst_sched_test = NULL; - /* fallthrough */ - case LST_INIT_WI_SERIAL: - cfs_wi_sched_destroy(lst_sched_serial); - lst_sched_serial = NULL; - /* fallthrough */ - case LST_INIT_NONE: - break; - default: - LBUG(); - } + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + /* Fall through */ + case LST_INIT_FW: + sfw_shutdown(); + /* Fall through */ + case LST_INIT_RPC: + srpc_shutdown(); + /* Fall through */ + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + /* Fall through */ + + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + /* Fall through */ + case LST_INIT_NONE: + break; + /* Fall through */ + default: + LBUG(); + } + return; } void lnet_selftest_structure_assertion(void) { - CLASSERT(sizeof(struct srpc_msg) == 160); - CLASSERT(sizeof(struct srpc_test_reqst) == 70); - CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) == 72); - CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) == 78); - CLASSERT(sizeof(struct srpc_stat_reply) == 136); - CLASSERT(sizeof(struct srpc_stat_reqst) == 28); + CLASSERT(sizeof(srpc_msg_t) == 160); + CLASSERT(sizeof(srpc_test_reqst_t) == 70); + CLASSERT(offsetof(srpc_msg_t, 
msg_body.tes_reqst.tsr_concur) == 72); + CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78); + CLASSERT(sizeof(srpc_stat_reply_t) == 136); + CLASSERT(sizeof(srpc_stat_reqst_t) == 28); } static int __init lnet_selftest_init(void) { - int nscheds; - int rc; - int i; + int nscheds; + int rc; + int i; rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, 1, &lst_sched_serial); @@ -127,31 +130,31 @@ lnet_selftest_init(void) rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, nthrs, &lst_sched_test[i]); if (rc != 0) { - CERROR("Failed to create CPU partition affinity WI scheduler %d for LST\n", - i); + CERROR("Failed to create CPU partition affinity WI " + "scheduler %d for LST\n", i); goto error; } } - rc = srpc_startup(); - if (rc != 0) { - CERROR("LST can't startup rpc\n"); - goto error; - } - lst_init_step = LST_INIT_RPC; - - rc = sfw_startup(); - if (rc != 0) { - CERROR("LST can't startup framework\n"); - goto error; - } - lst_init_step = LST_INIT_FW; - - rc = lstcon_console_init(); - if (rc != 0) { - CERROR("LST can't startup console\n"); - goto error; - } + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } lst_init_step = LST_INIT_CONSOLE; return 0; error: diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c index 2d1403b34c7bc..ea2076103c756 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -44,17 +44,17 @@ static int ping_srv_workitems = SFW_TEST_WI_MAX; module_param(ping_srv_workitems, int, 0644); MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); -struct lst_ping_data { +typedef struct { spinlock_t pnd_lock; /* serialize */ int pnd_counter; /* sequence counter */ -}; +} lst_ping_data_t; -static struct lst_ping_data lst_ping_data; +static lst_ping_data_t lst_ping_data; static int -ping_client_init(struct sfw_test_instance *tsi) +ping_client_init(sfw_test_instance_t *tsi) { - struct sfw_session *sn = tsi->tsi_batch->bat_session; + sfw_session_t *sn = tsi->tsi_batch->bat_session; LASSERT(tsi->tsi_is_client); LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); @@ -66,9 +66,9 @@ ping_client_init(struct sfw_test_instance *tsi) } static void -ping_client_fini(struct sfw_test_instance *tsi) +ping_client_fini (sfw_test_instance_t *tsi) { - struct sfw_session *sn = tsi->tsi_batch->bat_session; + sfw_session_t *sn = tsi->tsi_batch->bat_session; int errors; LASSERT (sn != NULL); @@ -82,14 +82,14 @@ ping_client_fini(struct sfw_test_instance *tsi) } static int -ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, - struct srpc_client_rpc **rpc) +ping_client_prep_rpc(sfw_test_unit_t *tsu, + struct lnet_process_id dest, srpc_client_rpc_t **rpc) { - struct srpc_ping_reqst *req; - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *req; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; struct timespec64 ts; - int rc; + int rc; LASSERT(sn != NULL); LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); @@ -114,12 +114,12 @@ ping_client_prep_rpc(struct 
sfw_test_unit *tsu, struct lnet_process_id dest, } static void -ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) +ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) { - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; - struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; struct timespec64 ts; LASSERT(sn != NULL); @@ -167,11 +167,11 @@ ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) static int ping_server_handle(struct srpc_server_rpc *rpc) { - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_msg *replymsg = &rpc->srpc_replymsg; - struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; - struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; + srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; LASSERT (sv->sv_id == SRPC_SERVICE_PING); @@ -207,8 +207,7 @@ ping_server_handle(struct srpc_server_rpc *rpc) return 0; } -struct sfw_test_client_ops ping_test_client; - +sfw_test_client_ops_t ping_test_client; void ping_init_test_client(void) { ping_test_client.tso_init = ping_client_init; @@ -217,8 +216,7 @@ void ping_init_test_client(void) ping_test_client.tso_done_rpc = ping_client_done_rpc; } -struct srpc_service ping_test_service; - +srpc_service_t ping_test_service; void ping_init_test_service(void) { ping_test_service.sv_id = SRPC_SERVICE_PING; diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c index b1cc58926acb3..ed88dfeac7085 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,21 +42,21 @@ #include "selftest.h" -enum srpc_state { - SRPC_STATE_NONE, - SRPC_STATE_NI_INIT, - SRPC_STATE_EQ_INIT, - SRPC_STATE_RUNNING, - SRPC_STATE_STOPPING, -}; +typedef enum { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +} srpc_state_t; static struct smoketest_rpc { spinlock_t rpc_glock; /* global lock */ - struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; - struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ - enum srpc_state rpc_state; + srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ + srpc_state_t rpc_state; struct srpc_counters rpc_counters; - __u64 rpc_matchbits; /* matchbits counter */ + __u64 rpc_matchbits; /* matchbits counter */ } srpc_data; static inline int @@ -67,7 +67,7 @@ srpc_serv_portal(int svc_id) } /* forward ref's */ -static int srpc_handle_rpc(struct swi_workitem *wi); +int srpc_handle_rpc(swi_workitem_t *wi); void srpc_get_counters(struct srpc_counters *cnt) { @@ -84,8 +84,7 @@ void srpc_set_counters(const struct srpc_counters *cnt) } static int -srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, - int nob) +srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int off, int nob) { LASSERT(off < PAGE_SIZE); LASSERT(nob > 0 && nob <= PAGE_SIZE); @@ -97,49 +96,48 @@ srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, } void -srpc_free_bulk(struct srpc_bulk *bk) +srpc_free_bulk (srpc_bulk_t *bk) { - int i; + int i; struct page *pg; - LASSERT(bk != NULL); + LASSERT (bk != NULL); - for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].kiov_page; - if (pg == NULL) - break; + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) break; __free_page(pg); - } + } - LIBCFS_FREE(bk, offsetof(struct srpc_bulk, bk_iovs[bk->bk_niov])); - return; + LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); + return; } -struct srpc_bulk * +srpc_bulk_t * srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, unsigned bulk_len, int sink) { - struct srpc_bulk *bk; - int i; + srpc_bulk_t *bk; + int i; LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, - offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); if (bk == NULL) { CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); return NULL; } - memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); + memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); bk->bk_sink = sink; bk->bk_len = bulk_len; bk->bk_niov = bulk_npg; for (i = 0; i < bulk_npg; i++) { struct page *pg; - int nob; + int nob; pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); if (pg == NULL) { @@ -192,11 +190,11 @@ srpc_init_server_rpc(struct srpc_server_rpc *rpc, static void srpc_service_fini(struct srpc_service *svc) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - struct srpc_buffer *buf; - struct list_head *q; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; if (svc->sv_cpt_data == NULL) return; @@ -212,8 +210,8 @@ srpc_service_fini(struct srpc_service *svc) while (!list_empty(q)) { buf = list_entry(q->next, - struct srpc_buffer, - buf_list); + struct srpc_buffer, + buf_list); list_del(&buf->buf_list); LIBCFS_FREE(buf, sizeof(*buf)); } @@ 
-223,8 +221,8 @@ srpc_service_fini(struct srpc_service *svc) while (!list_empty(&scd->scd_rpc_free)) { rpc = list_entry(scd->scd_rpc_free.next, - struct srpc_server_rpc, - srpc_list); + struct srpc_server_rpc, + srpc_list); list_del(&rpc->srpc_list); LIBCFS_FREE(rpc, sizeof(*rpc)); } @@ -248,11 +246,11 @@ int srpc_add_buffer(struct swi_workitem *wi); static int srpc_service_init(struct srpc_service *svc) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int nrpcs; - int i; - int j; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; svc->sv_shuttingdown = 0; @@ -329,13 +327,13 @@ srpc_add_service(struct srpc_service *sv) CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); return 0; -failed: + failed: srpc_service_fini(sv); return -EBUSY; } int -srpc_remove_service(struct srpc_service *sv) +srpc_remove_service (srpc_service_t *sv) { int id = sv->sv_id; @@ -354,100 +352,98 @@ srpc_remove_service(struct srpc_service *sv) static int srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, int len, int options, struct lnet_process_id peer, - struct lnet_handle_md *mdh, struct srpc_event *ev) + struct lnet_handle_md *mdh, srpc_event_t *ev) { - int rc; - struct lnet_md md; + int rc; + struct lnet_md md; struct lnet_handle_me meh; rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); - if (rc != 0) { - CERROR("LNetMEAttach failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - return -ENOMEM; - } - - md.threshold = 1; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.options = options; - md.eq_handle = srpc_data.rpc_lnet_eq; - - rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); - if (rc != 0) { - CERROR("LNetMDAttach failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - - rc = LNetMEUnlink(meh); - LASSERT(rc == 0); - return -ENOMEM; - } - - CDEBUG(D_NET, - "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - return 0; + if (rc != 0) { + CERROR ("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR ("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT (rc == 0); + return -ENOMEM; + } + + CDEBUG (D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; } static int srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, - int options, struct lnet_process_id peer, - lnet_nid_t self, struct lnet_handle_md *mdh, - struct srpc_event *ev) + int options, struct lnet_process_id peer, lnet_nid_t self, + struct lnet_handle_md *mdh, srpc_event_t *ev) { int rc; struct lnet_md md; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.eq_handle = srpc_data.rpc_lnet_eq; - md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; - md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); - - rc = LNetMDBind(md, LNET_UNLINK, mdh); - if (rc != 0) { - CERROR("LNetMDBind failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - return -ENOMEM; - } - - /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. - * they're only meaningful for MDs attached to an ME (i.e. passive - * buffers... 
- */ - if ((options & LNET_MD_OP_PUT) != 0) { - rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, - portal, matchbits, 0, 0); - } else { - LASSERT((options & LNET_MD_OP_GET) != 0); - - rc = LNetGet(self, *mdh, peer, portal, matchbits, 0, false); - } - - if (rc != 0) { - CERROR("LNet%s(%s, %d, %lld) failed: %d\n", - ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", - libcfs_id2str(peer), portal, matchbits, rc); - - /* The forthcoming unlink event will complete this operation - * with failure, so fall through and return success here. - */ - rc = LNetMDUnlink(*mdh); - LASSERT(rc == 0); - } else { - CDEBUG(D_NET, - "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - } - return 0; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT ((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); + } + + if (rc != 0) { + CERROR ("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. 
+ */ + rc = LNetMDUnlink(*mdh); + LASSERT (rc == 0); + } else { + CDEBUG (D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; } static int srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, - struct lnet_handle_md *mdh, struct srpc_event *ev) + struct lnet_handle_md *mdh, srpc_event_t *ev) { struct lnet_process_id any = {0}; @@ -463,9 +459,9 @@ static int srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) __must_hold(&scd->scd_lock) { - struct srpc_service *sv = scd->scd_svc; - struct srpc_msg *msg = &buf->buf_msg; - int rc; + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; LNetInvalidateMDHandle(&buf->buf_mdh); list_add(&buf->buf_list, &scd->scd_buf_posted); @@ -511,10 +507,9 @@ __must_hold(&scd->scd_lock) int srpc_add_buffer(struct swi_workitem *wi) { - struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, - scd_buf_wi); - struct srpc_buffer *buf; - int rc = 0; + struct srpc_service_cd *scd = wi->swi_workitem.wi_data; + struct srpc_buffer *buf; + int rc = 0; /* it's called by workitem scheduler threads, these threads * should have been set CPT affinity, so buffers will be posted @@ -558,7 +553,7 @@ srpc_add_buffer(struct swi_workitem *wi) } if (rc != 0) { - scd->scd_buf_err_stamp = ktime_get_real_seconds(); + scd->scd_buf_err_stamp = cfs_time_current_sec(); scd->scd_buf_err = rc; LASSERT(scd->scd_buf_posting > 0); @@ -572,9 +567,9 @@ srpc_add_buffer(struct swi_workitem *wi) int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) { - struct srpc_service_cd *scd; - int rc = 0; - int i; + struct srpc_service_cd *scd; + int rc = 0; + int i; LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); @@ -626,9 +621,9 @@ srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) { - struct srpc_service_cd *scd; - int num; - int i; + struct srpc_service_cd *scd; + int num; + int i; LASSERT(!sv->sv_shuttingdown); @@ -646,9 +641,9 @@ srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) int srpc_finish_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ @@ -660,8 +655,8 @@ srpc_finish_service(struct srpc_service *sv) } if (scd->scd_buf_nposted > 0) { - CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", - scd->scd_buf_nposted); + CDEBUG(D_NET, "waiting for %d posted buffers to " + "unlink\n", scd->scd_buf_nposted); spin_unlock(&scd->scd_lock); return 0; } @@ -672,8 +667,10 @@ srpc_finish_service(struct srpc_service *sv) } rpc = list_entry(scd->scd_rpc_active.next, - struct srpc_server_rpc, srpc_list); - CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, " + "wi %s scheduled %d running %d, " + "ev fired %d type %d status %d lnet %d\n", rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), swi_state2str(rpc->srpc_wi.swi_state), rpc->srpc_wi.swi_workitem.wi_scheduled, @@ -691,8 +688,7 @@ srpc_finish_service(struct srpc_service *sv) /* called with sv->sv_lock held */ static void -srpc_service_recycle_buffer(struct srpc_service_cd *scd, - struct srpc_buffer *buf) 
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) __must_hold(&scd->scd_lock) { if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { @@ -725,9 +721,9 @@ __must_hold(&scd->scd_lock) void srpc_abort_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; CDEBUG(D_NET, "Aborting service: id %d, name %s\n", sv->sv_id, sv->sv_name); @@ -737,8 +733,7 @@ srpc_abort_service(struct srpc_service *sv) /* schedule in-flight RPCs to notice the abort, NB: * racing with incoming RPCs; complete fix should make test - * RPCs carry session ID in its headers - */ + * RPCs carry session ID in its headers */ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { rpc->srpc_aborted = 1; swi_schedule_workitem(&rpc->srpc_wi); @@ -749,12 +744,12 @@ srpc_abort_service(struct srpc_service *sv) } void -srpc_shutdown_service(struct srpc_service *sv) +srpc_shutdown_service(srpc_service_t *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - struct srpc_buffer *buf; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + srpc_buffer_t *buf; + int i; CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", sv->sv_id, sv->sv_name); @@ -777,139 +772,135 @@ srpc_shutdown_service(struct srpc_service *sv) spin_unlock(&scd->scd_lock); /* OK to traverse scd_buf_posted without lock, since no one - * touches scd_buf_posted now - */ + * touches scd_buf_posted now */ list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) LNetMDUnlink(buf->buf_mdh); } } static int -srpc_send_request(struct srpc_client_rpc *rpc) +srpc_send_request (srpc_client_rpc_t *rpc) { - struct srpc_event *ev = &rpc->crpc_reqstev; - int rc; + srpc_event_t *ev = &rpc->crpc_reqstev; + int rc; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REQUEST_SENT; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), rpc->crpc_service, &rpc->crpc_reqstmsg, - sizeof(struct srpc_msg), LNET_MD_OP_PUT, + sizeof(srpc_msg_t), LNET_MD_OP_PUT, rpc->crpc_dest, LNET_NID_ANY, &rpc->crpc_reqstmdh, ev); - if (rc != 0) { - LASSERT(rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_prepare_reply(struct srpc_client_rpc *rpc) +srpc_prepare_reply (srpc_client_rpc_t *rpc) { - struct srpc_event *ev = &rpc->crpc_replyev; - u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; - int rc; + srpc_event_t *ev = &rpc->crpc_replyev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_RCVD; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; - *id = srpc_next_id(); + *id = srpc_next_id(); rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &rpc->crpc_replymsg, - sizeof(struct srpc_msg), - LNET_MD_OP_PUT, rpc->crpc_dest, - &rpc->crpc_replymdh, ev); - if (rc != 0) { - LASSERT(rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + &rpc->crpc_replymsg, sizeof(srpc_msg_t), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_prepare_bulk(struct srpc_client_rpc *rpc) +srpc_prepare_bulk 
(srpc_client_rpc_t *rpc) { - struct srpc_bulk *bk = &rpc->crpc_bulk; - struct srpc_event *ev = &rpc->crpc_bulkev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; - int rc; - int opt; + srpc_bulk_t *bk = &rpc->crpc_bulk; + srpc_event_t *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; - LASSERT(bk->bk_niov <= LNET_MAX_IOV); + LASSERT (bk->bk_niov <= LNET_MAX_IOV); - /* nothing to do */ - if (bk->bk_niov == 0) - return 0; + if (bk->bk_niov == 0) return 0; /* nothing to do */ - opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; - opt |= LNET_MD_KIOV; + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_BULK_REQ_RCVD; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; - *id = srpc_next_id(); + *id = srpc_next_id(); rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->crpc_dest, &bk->bk_mdh, ev); - if (rc != 0) { - LASSERT(rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_do_bulk(struct srpc_server_rpc *rpc) +srpc_do_bulk (srpc_server_rpc_t *rpc) { - struct srpc_event *ev = &rpc->srpc_ev; - struct srpc_bulk *bk = rpc->srpc_bulk; - __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; - int rc; - int opt; - - LASSERT(bk != NULL); - - opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; - opt |= LNET_MD_KIOV; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->srpc_peer, rpc->srpc_self, - &bk->bk_mdh, ev); - if (rc != 0) - ev->ev_fired = 1; /* no more event expected */ - return rc; + srpc_event_t *ev = &rpc->srpc_ev; + srpc_bulk_t *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT (bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; } /* only called from srpc_handle_rpc */ static void -srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) +srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) { struct srpc_service_cd *scd = rpc->srpc_scd; struct srpc_service *sv = scd->scd_svc; - struct srpc_buffer *buffer; + srpc_buffer_t *buffer; - LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); - rpc->srpc_status = status; + rpc->srpc_status = status; - CDEBUG_LIMIT(status == 0 ? D_NET : D_NETERROR, - "Server RPC %p done: service %s, peer %s, status %s:%d\n", - rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), - swi_state2str(rpc->srpc_wi.swi_state), status); + CDEBUG_LIMIT (status == 0 ? 
D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); - if (status != 0) { + if (status != 0) { spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_dropped++; spin_unlock(&srpc_data.rpc_glock); @@ -923,8 +914,7 @@ srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) if (rpc->srpc_reqstbuf != NULL) { /* NB might drop sv_lock in srpc_service_recycle_buffer, but - * sv won't go away for scd_rpc_active must not be empty - */ + * sv won't go away for scd_rpc_active must not be empty */ srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); rpc->srpc_reqstbuf = NULL; } @@ -942,7 +932,7 @@ srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { buffer = list_entry(scd->scd_buf_blocked.next, - struct srpc_buffer, buf_list); + srpc_buffer_t, buf_list); list_del(&buffer->buf_list); srpc_init_server_rpc(rpc, scd, buffer); @@ -957,14 +947,14 @@ srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) } /* handles an incoming RPC */ -static int srpc_handle_rpc(struct swi_workitem *wi) +int +srpc_handle_rpc(swi_workitem_t *wi) { - struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, - srpc_wi); - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - struct srpc_event *ev = &rpc->srpc_ev; - int rc = 0; + struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_event_t *ev = &rpc->srpc_ev; + int rc = 0; LASSERT(wi == &rpc->srpc_wi); @@ -973,32 +963,31 @@ static int srpc_handle_rpc(struct swi_workitem *wi) if (sv->sv_shuttingdown || rpc->srpc_aborted) { spin_unlock(&scd->scd_lock); - if (rpc->srpc_bulk != NULL) - LNetMDUnlink(rpc->srpc_bulk->bk_mdh); - LNetMDUnlink(rpc->srpc_replymdh); + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); - if (ev->ev_fired) { /* no more event, OK to finish */ - srpc_server_rpc_done(rpc, -ESHUTDOWN); - return 1; - } - return 0; - } + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } spin_unlock(&scd->scd_lock); - switch (wi->swi_state) { - default: - LBUG(); - /* fallthrough */ - case SWI_STATE_NEWBORN: { - struct srpc_msg *msg; - struct srpc_generic_reply *reply; + switch (wi->swi_state) { + default: + LBUG (); + case SWI_STATE_NEWBORN: { + srpc_msg_t *msg; + srpc_generic_reply_t *reply; - msg = &rpc->srpc_reqstbuf->buf_msg; - reply = &rpc->srpc_replymsg.msg_body.reply; + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; - if (msg->msg_magic == 0) { - /* moaned already in srpc_lnet_ev_handler */ + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ srpc_server_rpc_done(rpc, EBADMSG); return 1; } @@ -1018,67 +1007,67 @@ static int srpc_handle_rpc(struct swi_workitem *wi) srpc_server_rpc_done(rpc, rc); return 1; } - } - - wi->swi_state = SWI_STATE_BULK_STARTED; - - if (rpc->srpc_bulk != NULL) { - rc = srpc_do_bulk(rpc); - if (rc == 0) - return 0; /* wait for bulk */ - - LASSERT(ev->ev_fired); - ev->ev_status = rc; - } - } - /* fallthrough */ - case SWI_STATE_BULK_STARTED: - LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); - - if (rpc->srpc_bulk != NULL) { - rc = ev->ev_status; - - if (sv->sv_bulk_ready != NULL) - rc = 
(*sv->sv_bulk_ready) (rpc, rc); - - if (rc != 0) { - srpc_server_rpc_done(rpc, rc); - return 1; - } - } - - wi->swi_state = SWI_STATE_REPLY_SUBMITTED; - rc = srpc_send_reply(rpc); - if (rc == 0) - return 0; /* wait for reply */ - srpc_server_rpc_done(rpc, rc); - return 1; - - case SWI_STATE_REPLY_SUBMITTED: - if (!ev->ev_fired) { - CERROR("RPC %p: bulk %p, service %d\n", + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT (ev->ev_fired); + ev->ev_status = rc; + } + } + /* Fall through */ + case SWI_STATE_BULK_STARTED: + LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", rpc, rpc->srpc_bulk, sv->sv_id); - CERROR("Event: status %d, type %d, lnet %d\n", - ev->ev_status, ev->ev_type, ev->ev_lnet); - LASSERT(ev->ev_fired); - } + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT (ev->ev_fired); + } - wi->swi_state = SWI_STATE_DONE; - srpc_server_rpc_done(rpc, ev->ev_status); - return 1; - } + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } - return 0; + return 0; } static void srpc_client_rpc_expired (void *data) { - struct srpc_client_rpc *rpc = data; + srpc_client_rpc_t *rpc = data; - CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - rpc->crpc_timeout); + CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); spin_lock(&rpc->crpc_lock); @@ -1093,9 +1082,9 @@ srpc_client_rpc_expired (void *data) } static void -srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) +srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) { - struct stt_timer *timer = &rpc->crpc_timer; + stt_timer_t *timer = &rpc->crpc_timer; if (rpc->crpc_timeout == 0) return; @@ -1103,7 +1092,8 @@ srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) INIT_LIST_HEAD(&timer->stt_list); timer->stt_data = rpc; timer->stt_func = srpc_client_rpc_expired; - timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; + timer->stt_expires = cfs_time_add(rpc->crpc_timeout, + cfs_time_current_sec()); stt_add_timer(timer); return; } @@ -1112,10 +1102,9 @@ srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) * Called with rpc->crpc_lock held. * * Upon exit the RPC expiry timer is not queued and the handler is not - * running on any CPU. - */ + * running on any CPU. 
*/ static void -srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) +srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) { /* timer not planted or already exploded */ if (rpc->crpc_timeout == 0) @@ -1136,34 +1125,34 @@ srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) } static void -srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) +srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) { - struct swi_workitem *wi = &rpc->crpc_wi; + swi_workitem_t *wi = &rpc->crpc_wi; LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); spin_lock(&rpc->crpc_lock); - rpc->crpc_closed = 1; - if (rpc->crpc_status == 0) - rpc->crpc_status = status; - - srpc_del_client_rpc_timer(rpc); - - CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, - "Client RPC done: service %d, peer %s, status %s:%d:%d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(wi->swi_state), rpc->crpc_aborted, status); - - /* - * No one can schedule me now since: - * - RPC timer has been defused. - * - all LNet events have been fired. - * - crpc_closed has been set, preventing srpc_abort_rpc from - * scheduling me. - * Cancel pending schedules and prevent future schedule attempts: - */ - LASSERT(!srpc_event_pending(rpc)); + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT (!srpc_event_pending(rpc)); swi_exit_workitem(wi); spin_unlock(&rpc->crpc_lock); @@ -1174,19 +1163,19 @@ srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) /* sends an outgoing RPC */ int -srpc_send_rpc(struct swi_workitem *wi) +srpc_send_rpc (swi_workitem_t *wi) { - int rc = 0; - struct srpc_client_rpc *rpc; - struct srpc_msg *reply; - int do_bulk; + int rc = 0; + srpc_client_rpc_t *rpc; + srpc_msg_t *reply; + int do_bulk; LASSERT(wi != NULL); rpc = wi->swi_workitem.wi_data; - LASSERT(rpc != NULL); - LASSERT(wi == &rpc->crpc_wi); + LASSERT (rpc != NULL); + LASSERT (wi == &rpc->crpc_wi); reply = &rpc->crpc_replymsg; do_bulk = rpc->crpc_bulk.bk_niov > 0; @@ -1200,93 +1189,86 @@ srpc_send_rpc(struct swi_workitem *wi) spin_unlock(&rpc->crpc_lock); - switch (wi->swi_state) { - default: - LBUG(); - case SWI_STATE_NEWBORN: - LASSERT(!srpc_event_pending(rpc)); - - rc = srpc_prepare_reply(rpc); - if (rc != 0) { - srpc_client_rpc_done(rpc, rc); - return 1; - } + switch (wi->swi_state) { + default: + LBUG (); + case SWI_STATE_NEWBORN: + LASSERT (!srpc_event_pending(rpc)); - rc = srpc_prepare_bulk(rpc); - if (rc != 0) - break; + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } - wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; - rc = srpc_send_request(rpc); - break; + rc = srpc_prepare_bulk(rpc); + if (rc != 0) break; - case SWI_STATE_REQUEST_SUBMITTED: - /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any - * order; however, they're processed in a strict order: - * rqt, rpy, and bulk. 
- */ - if (!rpc->crpc_reqstev.ev_fired) - break; + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; - rc = rpc->crpc_reqstev.ev_status; - if (rc != 0) - break; + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. */ + if (!rpc->crpc_reqstev.ev_fired) break; - wi->swi_state = SWI_STATE_REQUEST_SENT; - /* fallthrough */ - case SWI_STATE_REQUEST_SENT: { - enum srpc_msg_type type; + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) break; - type = srpc_service2reply(rpc->crpc_service); + wi->swi_state = SWI_STATE_REQUEST_SENT; + /* perhaps more events, fall thru */ + /* Fall through */ + case SWI_STATE_REQUEST_SENT: { + srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); - if (!rpc->crpc_replyev.ev_fired) - break; + if (!rpc->crpc_replyev.ev_fired) break; - rc = rpc->crpc_replyev.ev_status; - if (rc != 0) - break; + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) break; srpc_unpack_msg_hdr(reply); if (reply->msg_type != type || (reply->msg_magic != SRPC_MSG_MAGIC && reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", - libcfs_id2str(rpc->crpc_dest), - reply->msg_type, type, - reply->msg_magic, SRPC_MSG_MAGIC); - rc = -EBADMSG; - break; - } - - if (do_bulk && reply->msg_body.reply.status != 0) { - CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", - reply->msg_body.reply.status, - libcfs_id2str(rpc->crpc_dest)); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - } - - wi->swi_state = SWI_STATE_REPLY_RECEIVED; - } - /* fallthrough */ - case SWI_STATE_REPLY_RECEIVED: - if (do_bulk && !rpc->crpc_bulkev.ev_fired) - break; - - rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; - - /* Bulk buffer was unlinked due to remote error. Clear error - * since reply buffer still contains valid data. - * NB rpc->crpc_done shouldn't look into bulk data in case of - * remote error. - */ - if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && - rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) - rc = 0; - - wi->swi_state = SWI_STATE_DONE; - srpc_client_rpc_done(rpc, rc); - return 1; - } + CWARN ("Bad message from %s: type %u (%d expected)," + " magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN ("Remote error %d at %s, unlink bulk buffer in " + "case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + /* Fall through */ + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
*/ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } if (rc != 0) { spin_lock(&rpc->crpc_lock); @@ -1295,85 +1277,85 @@ srpc_send_rpc(struct swi_workitem *wi) } abort: - if (rpc->crpc_aborted) { - LNetMDUnlink(rpc->crpc_reqstmdh); - LNetMDUnlink(rpc->crpc_replymdh); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - - if (!srpc_event_pending(rpc)) { - srpc_client_rpc_done(rpc, -EINTR); - return 1; - } - } - return 0; + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; } -struct srpc_client_rpc * +srpc_client_rpc_t * srpc_create_client_rpc(struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(struct srpc_client_rpc *), - void (*rpc_fini)(struct srpc_client_rpc *), void *priv) + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) { - struct srpc_client_rpc *rpc; + srpc_client_rpc_t *rpc; - LIBCFS_ALLOC(rpc, offsetof(struct srpc_client_rpc, - crpc_bulk.bk_iovs[nbulkiov])); - if (rpc == NULL) - return NULL; + LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; - srpc_init_client_rpc(rpc, peer, service, nbulkiov, - bulklen, rpc_done, rpc_fini, priv); - return rpc; + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; } /* called with rpc->crpc_lock held */ void -srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) +srpc_abort_rpc (srpc_client_rpc_t *rpc, int why) { - LASSERT(why != 0); + LASSERT (why != 0); - if (rpc->crpc_aborted || /* already aborted */ - rpc->crpc_closed) /* callback imminent */ - return; + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; - CDEBUG(D_NET, - "Aborting RPC: service %d, peer %s, state %s, why %d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(rpc->crpc_wi.swi_state), why); + CDEBUG (D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); - rpc->crpc_aborted = 1; - rpc->crpc_status = why; - swi_schedule_workitem(&rpc->crpc_wi); - return; + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; } /* called with rpc->crpc_lock held */ void -srpc_post_rpc(struct srpc_client_rpc *rpc) +srpc_post_rpc (srpc_client_rpc_t *rpc) { - LASSERT(!rpc->crpc_aborted); - LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + LASSERT (!rpc->crpc_aborted); + LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING); - CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", - libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, - rpc->crpc_timeout); + CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); - srpc_add_client_rpc_timer(rpc); - swi_schedule_workitem(&rpc->crpc_wi); - return; + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; } int srpc_send_reply(struct srpc_server_rpc *rpc) { - struct srpc_event *ev = &rpc->srpc_ev; - struct srpc_msg *msg = &rpc->srpc_replymsg; - struct srpc_buffer *buffer = 
rpc->srpc_reqstbuf; - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - __u64 rpyid; - int rc; + srpc_event_t *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; LASSERT(buffer != NULL); rpyid = buffer->buf_msg.msg_body.reqst.rpyid; @@ -1382,8 +1364,7 @@ srpc_send_reply(struct srpc_server_rpc *rpc) if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { /* Repost buffer before replying since test client - * might send me another RPC once it gets the reply - */ + * might send me another RPC once it gets the reply */ if (srpc_service_post_buffer(scd, buffer) != 0) CWARN("Failed to repost %s buffer\n", sv->sv_name); rpc->srpc_reqstbuf = NULL; @@ -1391,37 +1372,37 @@ srpc_send_reply(struct srpc_server_rpc *rpc) spin_unlock(&scd->scd_lock); - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_SENT; - - msg->msg_magic = SRPC_MSG_MAGIC; - msg->msg_version = SRPC_MSG_VERSION; - msg->msg_type = srpc_service2reply(sv->sv_id); - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, - sizeof(*msg), LNET_MD_OP_PUT, - rpc->srpc_peer, rpc->srpc_self, - &rpc->srpc_replymdh, ev); - if (rc != 0) - ev->ev_fired = 1; /* no more event expected */ - return rc; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; } /* when in kernel always called with LNET_LOCK() held, and in thread context */ static void srpc_lnet_ev_handler(struct lnet_event *ev) { - struct srpc_service_cd *scd; - struct srpc_event *rpcev = ev->md.user_ptr; - struct srpc_client_rpc *crpc; - struct srpc_server_rpc *srpc; - struct srpc_buffer *buffer; - struct srpc_service *sv; - struct srpc_msg *msg; - enum srpc_msg_type type; + struct srpc_service_cd *scd; + srpc_event_t *rpcev = ev->md.user_ptr; + srpc_client_rpc_t *crpc; + srpc_server_rpc_t *srpc; + srpc_buffer_t *buffer; + srpc_service_t *sv; + srpc_msg_t *msg; + srpc_msg_type_t type; - LASSERT(!in_interrupt()); + LASSERT (!in_interrupt()); if (ev->status != 0) { __u32 errors; @@ -1436,43 +1417,41 @@ srpc_lnet_ev_handler(struct lnet_event *ev) ev->status, ev->type, errors); } - rpcev->ev_lnet = ev->type; + rpcev->ev_lnet = ev->type; - switch (rpcev->ev_type) { - default: - CERROR("Unknown event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG(); - /* fallthrough */ - case SRPC_REQUEST_SENT: - if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG (); + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_sent++; spin_unlock(&srpc_data.rpc_glock); - } - /* fallthrough */ - case SRPC_REPLY_RCVD: - case SRPC_BULK_REQ_RCVD: - crpc = rpcev->ev_data; - - if (rpcev != &crpc->crpc_reqstev && - rpcev != &crpc->crpc_replyev && - rpcev != &crpc->crpc_bulkev) { - CERROR("rpcev %p, crpc %p, reqstev %p, replyev 
%p, bulkev %p\n", - rpcev, crpc, &crpc->crpc_reqstev, - &crpc->crpc_replyev, &crpc->crpc_bulkev); - CERROR("Bad event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, - rpcev->ev_lnet); - LBUG(); - } + } + /* Fall through */ + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG (); + } spin_lock(&crpc->crpc_lock); LASSERT(rpcev->ev_fired == 0); rpcev->ev_fired = 1; rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? - -EINTR : ev->status; + -EINTR : ev->status; swi_schedule_workitem(&crpc->crpc_wi); spin_unlock(&crpc->crpc_lock); @@ -1486,30 +1465,28 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_lock(&scd->scd_lock); - LASSERT(ev->unlinked); - LASSERT(ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT(ev->type != LNET_EVENT_UNLINK || - sv->sv_shuttingdown); + LASSERT (ev->unlinked); + LASSERT (ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); - buffer = container_of(ev->md.start, struct srpc_buffer, - buf_msg); + buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); buffer->buf_peer = ev->source; - buffer->buf_self = ev->target.nid; + buffer->buf_self = ev->target.nid; LASSERT(scd->scd_buf_nposted > 0); scd->scd_buf_nposted--; if (sv->sv_shuttingdown) { /* Leave buffer on scd->scd_buf_nposted since - * srpc_finish_service needs to traverse it. - */ + * srpc_finish_service needs to traverse it. */ spin_unlock(&scd->scd_lock); break; } if (scd->scd_buf_err_stamp != 0 && - scd->scd_buf_err_stamp < ktime_get_real_seconds()) { + scd->scd_buf_err_stamp < cfs_time_current_sec()) { /* re-enable adding buffer */ scd->scd_buf_err_stamp = 0; scd->scd_buf_err = 0; @@ -1527,22 +1504,22 @@ srpc_lnet_ev_handler(struct lnet_event *ev) msg = &buffer->buf_msg; type = srpc_service2request(sv->sv_id); - if (ev->status != 0 || ev->mlength != sizeof(*msg) || - (msg->msg_type != type && - msg->msg_type != __swab32(type)) || - (msg->msg_magic != SRPC_MSG_MAGIC && - msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", - sv->sv_name, libcfs_id2str(ev->initiator), - ev->status, ev->mlength, - msg->msg_type, msg->msg_magic); - - /* NB can't call srpc_service_recycle_buffer here since - * it may call LNetM[DE]Attach. The invalid magic tells - * srpc_handle_rpc to drop this RPC - */ - msg->msg_magic = 0; - } + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR ("Dropping RPC (%s) from %s: " + "status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC */ + msg->msg_magic = 0; + } if (!list_empty(&scd->scd_rpc_free)) { srpc = list_entry(scd->scd_rpc_free.next, @@ -1564,18 +1541,19 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_rcvd++; spin_unlock(&srpc_data.rpc_glock); - break; + break; - case SRPC_BULK_GET_RPLD: - LASSERT(ev->type == LNET_EVENT_SEND || - ev->type == LNET_EVENT_REPLY || - ev->type == LNET_EVENT_UNLINK); + case SRPC_BULK_GET_RPLD: + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); - if (!ev->unlinked) - break; /* wait for final event */ - /* fallthrough */ - case SRPC_BULK_PUT_SENT: - if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + if (!ev->unlinked) + break; /* wait for final event */ + /* Fall through */ + + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); if (rpcev->ev_type == SRPC_BULK_GET_RPLD) @@ -1585,7 +1563,7 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_unlock(&srpc_data.rpc_glock); } - /* fallthrough */ + /* Fall through */ case SRPC_REPLY_SENT: srpc = rpcev->ev_data; scd = srpc->srpc_scd; @@ -1616,84 +1594,84 @@ srpc_startup (void) /* 1 second pause to avoid timestamp reuse */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); - srpc_data.rpc_matchbits = ((__u64) ktime_get_real_seconds()) << 48; + srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48; srpc_data.rpc_state = SRPC_STATE_NONE; rc = LNetNIInit(LNET_PID_LUSTRE); - if (rc < 0) { - CERROR("LNetNIInit() has failed: %d\n", rc); + if (rc < 0) { + CERROR ("LNetNIInit() has failed: %d\n", rc); return rc; - } + } - srpc_data.rpc_state = SRPC_STATE_NI_INIT; + srpc_data.rpc_state = SRPC_STATE_NI_INIT; LNetInvalidateEQHandle(&srpc_data.rpc_lnet_eq); rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); - if (rc != 0) { - CERROR("LNetEQAlloc() has failed: %d\n", rc); - goto bail; - } + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); LASSERT(rc == 0); rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); LASSERT(rc == 0); - srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; - rc = stt_startup(); + rc = stt_startup(); bail: - if (rc != 0) - srpc_shutdown(); - else - srpc_data.rpc_state = SRPC_STATE_RUNNING; + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; - return rc; + return rc; } void srpc_shutdown (void) { - int i; - int rc; - int state; + int i; + int rc; + int state; - state = srpc_data.rpc_state; - srpc_data.rpc_state = SRPC_STATE_STOPPING; + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; - switch (state) { - default: - LBUG(); - /* fallthrough */ - case SRPC_STATE_RUNNING: + switch (state) { + default: + LBUG (); + case SRPC_STATE_RUNNING: spin_lock(&srpc_data.rpc_glock); - for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { - struct srpc_service *sv = srpc_data.rpc_services[i]; + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + srpc_service_t *sv = srpc_data.rpc_services[i]; - LASSERTF(sv == NULL, - "service not empty: id %d, name %s\n", - i, sv->sv_name); - } + LASSERTF (sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } spin_unlock(&srpc_data.rpc_glock); - stt_shutdown(); - /* fallthrough */ + stt_shutdown(); + /* Fall through */ - case 
SRPC_STATE_EQ_INIT: - rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); - LASSERT(rc == 0); - rc = LNetEQFree(srpc_data.rpc_lnet_eq); - LASSERT(rc == 0); /* the EQ should have no user by now */ - /* fallthrough */ + LASSERT (rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT (rc == 0); /* the EQ should have no user by now */ + /* Fall through */ - case SRPC_STATE_NI_INIT: - LNetNIFini(); - } + case SRPC_STATE_NI_INIT: + LNetNIFini(); + /* Fall through */ + } - return; + return; } diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h b/drivers/staging/lustrefsx/lnet/selftest/rpc.h index 8cc8c434645d5..aab2629e7ba1d 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -33,14 +33,14 @@ #ifndef __SELFTEST_RPC_H__ #define __SELFTEST_RPC_H__ -#include +#include /* * LST wired structures - * + * * XXX: *REPLY == *REQST + 1 */ -enum srpc_msg_type { +typedef enum { SRPC_MSG_MKSN_REQST = 0, SRPC_MSG_MKSN_REPLY = 1, SRPC_MSG_RMSN_REQST = 2, @@ -59,118 +59,118 @@ enum srpc_msg_type { SRPC_MSG_PING_REPLY = 15, SRPC_MSG_JOIN_REQST = 16, SRPC_MSG_JOIN_REPLY = 17, -}; +} srpc_msg_type_t; /* CAVEAT EMPTOR: - * All struct srpc_*_reqst's 1st field must be matchbits of reply buffer, + * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, * and 2nd field matchbits of bulk buffer if any. * - * All struct srpc_*_reply's 1st field must be a __u32 status, and 2nd field + * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field * session id if needed. */ -struct srpc_generic_reqst { +typedef struct { __u64 rpyid; /* reply buffer matchbits */ __u64 bulkid; /* bulk buffer matchbits */ -} WIRE_ATTR; +} WIRE_ATTR srpc_generic_reqst_t; -struct srpc_generic_reply { +typedef struct { __u32 status; struct lst_sid sid; -} WIRE_ATTR; +} WIRE_ATTR srpc_generic_reply_t; /* FRAMEWORK RPCs */ -struct srpc_mksn_reqst { +typedef struct { __u64 mksn_rpyid; /* reply buffer matchbits */ struct lst_sid mksn_sid; /* session id */ __u32 mksn_force; /* use brute force */ char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR; /* make session request */ +} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ -struct srpc_mksn_reply { +typedef struct { __u32 mksn_status; /* session status */ struct lst_sid mksn_sid; /* session id */ __u32 mksn_timeout; /* session timeout */ char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR; /* make session reply */ +} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ -struct srpc_rmsn_reqst { - __u64 rmsn_rpyid; /* reply buffer matchbits */ - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR; /* remove session request */ +typedef struct { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ -struct srpc_rmsn_reply { +typedef struct { __u32 rmsn_status; - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR; /* remove session reply */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ -struct srpc_join_reqst { +typedef struct { __u64 join_rpyid; /* reply buffer matchbits */ struct lst_sid join_sid; /* session id to join */ char join_group[LST_NAME_SIZE]; /* group name */ -} WIRE_ATTR; +} WIRE_ATTR srpc_join_reqst_t; -struct srpc_join_reply { +typedef struct { __u32 join_status; /* returned status */ struct lst_sid 
join_sid; /* session id */ __u32 join_timeout; /* # seconds' inactivity to expire */ char join_session[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR; +} WIRE_ATTR srpc_join_reply_t; -struct srpc_debug_reqst { +typedef struct { __u64 dbg_rpyid; /* reply buffer matchbits */ struct lst_sid dbg_sid; /* session id */ __u32 dbg_flags; /* bitmap of debug */ -} WIRE_ATTR; +} WIRE_ATTR srpc_debug_reqst_t; -struct srpc_debug_reply { +typedef struct { __u32 dbg_status; /* returned code */ struct lst_sid dbg_sid; /* session id */ __u32 dbg_timeout; /* session timeout */ __u32 dbg_nbatch; /* # of batches in the node */ char dbg_name[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR; +} WIRE_ATTR srpc_debug_reply_t; #define SRPC_BATCH_OPC_RUN 1 #define SRPC_BATCH_OPC_STOP 2 #define SRPC_BATCH_OPC_QUERY 3 -struct srpc_batch_reqst { +typedef struct { __u64 bar_rpyid; /* reply buffer matchbits */ struct lst_sid bar_sid; /* session id */ struct lst_bid bar_bid; /* batch id */ __u32 bar_opc; /* create/start/stop batch */ __u32 bar_testidx; /* index of test */ __u32 bar_arg; /* parameters */ -} WIRE_ATTR; +} WIRE_ATTR srpc_batch_reqst_t; -struct srpc_batch_reply { +typedef struct { __u32 bar_status; /* status of request */ struct lst_sid bar_sid; /* session id */ __u32 bar_active; /* # of active tests in batch/test */ __u32 bar_time; /* remained time */ -} WIRE_ATTR; +} WIRE_ATTR srpc_batch_reply_t; -struct srpc_stat_reqst { +typedef struct { __u64 str_rpyid; /* reply buffer matchbits */ struct lst_sid str_sid; /* session id */ __u32 str_type; /* type of stat */ -} WIRE_ATTR; +} WIRE_ATTR srpc_stat_reqst_t; -struct srpc_stat_reply { - __u32 str_status; - struct lst_sid str_sid; - struct sfw_counters str_fw; - struct srpc_counters str_rpc; - struct lnet_counters_common str_lnet; -} WIRE_ATTR; +typedef struct { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters str_lnet; +} WIRE_ATTR srpc_stat_reply_t; -struct test_bulk_req { +typedef struct { __u32 blk_opc; /* bulk operation code */ __u32 blk_npg; /* # of pages */ __u32 blk_flags; /* reserved flags */ -} WIRE_ATTR; +} WIRE_ATTR test_bulk_req_t; -struct test_bulk_req_v1 { +typedef struct { /** bulk operation code */ __u16 blk_opc; /** data check flags */ @@ -179,14 +179,14 @@ struct test_bulk_req_v1 { __u32 blk_len; /** bulk offset */ __u32 blk_offset; -} WIRE_ATTR; +} WIRE_ATTR test_bulk_req_v1_t; -struct test_ping_req { +typedef struct { __u32 png_size; /* size of ping message */ __u32 png_flags; /* reserved flags */ -} WIRE_ATTR; +} WIRE_ATTR test_ping_req_t; -struct srpc_test_reqst { +typedef struct { __u64 tsr_rpyid; /* reply buffer matchbits */ __u64 tsr_bulkid; /* bulk buffer matchbits */ struct lst_sid tsr_sid; /* session id */ @@ -200,86 +200,86 @@ struct srpc_test_reqst { __u32 tsr_ndest; /* # of dest nodes */ union { - struct test_ping_req ping; - struct test_bulk_req bulk_v0; - struct test_bulk_req_v1 bulk_v1; - } tsr_u; -} WIRE_ATTR; + test_ping_req_t ping; + test_bulk_req_t bulk_v0; + test_bulk_req_v1_t bulk_v1; + } tsr_u; +} WIRE_ATTR srpc_test_reqst_t; -struct srpc_test_reply { +typedef struct { __u32 tsr_status; /* returned code */ struct lst_sid tsr_sid; -} WIRE_ATTR; +} WIRE_ATTR srpc_test_reply_t; /* TEST RPCs */ -struct srpc_ping_reqst { +typedef struct { __u64 pnr_rpyid; __u32 pnr_magic; __u32 pnr_seq; __u64 pnr_time_sec; __u64 pnr_time_nsec; -} WIRE_ATTR; +} WIRE_ATTR srpc_ping_reqst_t; -struct srpc_ping_reply { +typedef struct { __u32 pnr_status; __u32 
pnr_magic; __u32 pnr_seq; -} WIRE_ATTR; +} WIRE_ATTR srpc_ping_reply_t; -struct srpc_brw_reqst { +typedef struct { __u64 brw_rpyid; /* reply buffer matchbits */ __u64 brw_bulkid; /* bulk buffer matchbits */ __u32 brw_rw; /* read or write */ __u32 brw_len; /* bulk data len */ __u32 brw_flags; /* bulk data patterns */ -} WIRE_ATTR; /* bulk r/w request */ +} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ -struct srpc_brw_reply { +typedef struct { __u32 brw_status; -} WIRE_ATTR; /* bulk r/w reply */ +} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ #define SRPC_MSG_MAGIC 0xeeb0f00d #define SRPC_MSG_VERSION 1 -struct srpc_msg { +typedef struct srpc_msg { /** magic number */ __u32 msg_magic; /** message version number */ __u32 msg_version; - /** type of message body: enum srpc_msg_type */ + /** type of message body: srpc_msg_type_t */ __u32 msg_type; __u32 msg_reserved0; __u32 msg_reserved1; /** test session features */ __u32 msg_ses_feats; union { - struct srpc_generic_reqst reqst; - struct srpc_generic_reply reply; - - struct srpc_mksn_reqst mksn_reqst; - struct srpc_mksn_reply mksn_reply; - struct srpc_rmsn_reqst rmsn_reqst; - struct srpc_rmsn_reply rmsn_reply; - struct srpc_debug_reqst dbg_reqst; - struct srpc_debug_reply dbg_reply; - struct srpc_batch_reqst bat_reqst; - struct srpc_batch_reply bat_reply; - struct srpc_stat_reqst stat_reqst; - struct srpc_stat_reply stat_reply; - struct srpc_test_reqst tes_reqst; - struct srpc_test_reply tes_reply; - struct srpc_join_reqst join_reqst; - struct srpc_join_reply join_reply; - - struct srpc_ping_reqst ping_reqst; - struct srpc_ping_reply ping_reply; - struct srpc_brw_reqst brw_reqst; - struct srpc_brw_reply brw_reply; - } msg_body; -} WIRE_ATTR; + srpc_generic_reqst_t reqst; + srpc_generic_reply_t reply; + + srpc_mksn_reqst_t mksn_reqst; + srpc_mksn_reply_t mksn_reply; + srpc_rmsn_reqst_t rmsn_reqst; + srpc_rmsn_reply_t rmsn_reply; + srpc_debug_reqst_t dbg_reqst; + srpc_debug_reply_t dbg_reply; + srpc_batch_reqst_t bat_reqst; + srpc_batch_reply_t bat_reply; + srpc_stat_reqst_t stat_reqst; + srpc_stat_reply_t stat_reply; + srpc_test_reqst_t tes_reqst; + srpc_test_reply_t tes_reply; + srpc_join_reqst_t join_reqst; + srpc_join_reply_t join_reply; + + srpc_ping_reqst_t ping_reqst; + srpc_ping_reply_t ping_reply; + srpc_brw_reqst_t brw_reqst; + srpc_brw_reply_t brw_reply; + } msg_body; +} WIRE_ATTR srpc_msg_t; static inline void -srpc_unpack_msg_hdr(struct srpc_msg *msg) +srpc_unpack_msg_hdr(srpc_msg_t *msg) { if (msg->msg_magic == SRPC_MSG_MAGIC) return; /* no flipping needed */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h index 3f7c295e9a90c..2a29161cd4802 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/selftest.h +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include "rpc.h" #include "timer.h" @@ -89,7 +89,7 @@ struct sfw_test_instance; /* all reply/bulk RDMAs go to this portal */ #define SRPC_RDMA_PORTAL 52 -static inline enum srpc_msg_type +static inline srpc_msg_type_t srpc_service2request (int service) { switch (service) { @@ -124,13 +124,13 @@ srpc_service2request (int service) } } -static inline enum srpc_msg_type +static inline srpc_msg_type_t srpc_service2reply (int service) { return srpc_service2request(service) + 1; } -enum srpc_event_type { +typedef enum { SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ @@ -138,74 +138,73 @@ enum srpc_event_type { SRPC_REPLY_SENT = 5, /* outgoing reply sent */ SRPC_REQUEST_RCVD = 6, /* incoming request received */ SRPC_REQUEST_SENT = 7, /* outgoing request sent */ -}; +} srpc_event_type_t; /* RPC event */ -struct srpc_event { - enum srpc_event_type ev_type; /* what's up */ - enum lnet_event_kind ev_lnet; /* LNet event type */ +typedef struct { + srpc_event_type_t ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ int ev_fired; /* LNet event fired? */ int ev_status; /* LNet event status */ void *ev_data; /* owning server/client RPC */ -}; +} srpc_event_t; -/* bulk descriptor */ -struct srpc_bulk { +typedef struct { int bk_len; /* len of bulk data */ struct lnet_handle_md bk_mdh; int bk_sink; /* sink/source */ int bk_niov; /* # iov in bk_iovs */ lnet_kiov_t bk_iovs[0]; -}; +} srpc_bulk_t; /* bulk descriptor */ /* message buffer descriptor */ -struct srpc_buffer { +typedef struct srpc_buffer { struct list_head buf_list; /* chain on srpc_service::*_msgq */ - struct srpc_msg buf_msg; + srpc_msg_t buf_msg; struct lnet_handle_md buf_mdh; lnet_nid_t buf_self; struct lnet_process_id buf_peer; -}; +} srpc_buffer_t; struct swi_workitem; -typedef int (*swi_action_t)(struct swi_workitem *); +typedef int (*swi_action_t) (struct swi_workitem *); -struct swi_workitem { +typedef struct swi_workitem { struct cfs_wi_sched *swi_sched; - struct cfs_workitem swi_workitem; + struct cfs_workitem swi_workitem; swi_action_t swi_action; int swi_state; -}; +} swi_workitem_t; /* server-side state of a RPC */ -struct srpc_server_rpc { +typedef struct srpc_server_rpc { /* chain on srpc_service::*_rpcq */ struct list_head srpc_list; struct srpc_service_cd *srpc_scd; - struct swi_workitem srpc_wi; - struct srpc_event srpc_ev; /* bulk/reply event */ + swi_workitem_t srpc_wi; + srpc_event_t srpc_ev; /* bulk/reply event */ lnet_nid_t srpc_self; struct lnet_process_id srpc_peer; - struct srpc_msg srpc_replymsg; + srpc_msg_t srpc_replymsg; struct lnet_handle_md srpc_replymdh; - struct srpc_buffer *srpc_reqstbuf; - struct srpc_bulk *srpc_bulk; + srpc_buffer_t *srpc_reqstbuf; + srpc_bulk_t *srpc_bulk; unsigned int srpc_aborted; /* being given up */ int srpc_status; void (*srpc_done)(struct srpc_server_rpc *); -}; +} srpc_server_rpc_t; /* client-side state of a RPC */ -struct srpc_client_rpc { +typedef struct srpc_client_rpc { struct list_head crpc_list; /* chain on user's lists */ spinlock_t crpc_lock; /* serialize */ int crpc_service; atomic_t crpc_refcount; /* # seconds to wait for reply */ int crpc_timeout; - struct stt_timer crpc_timer; - struct swi_workitem crpc_wi; + stt_timer_t crpc_timer; + swi_workitem_t crpc_wi; struct 
lnet_process_id crpc_dest; void (*crpc_done)(struct srpc_client_rpc *); @@ -217,21 +216,21 @@ struct srpc_client_rpc { unsigned int crpc_aborted:1; /* being given up */ unsigned int crpc_closed:1; /* completed */ - /* RPC events */ - struct srpc_event crpc_bulkev; /* bulk event */ - struct srpc_event crpc_reqstev; /* request event */ - struct srpc_event crpc_replyev; /* reply event */ + /* RPC events */ + srpc_event_t crpc_bulkev; /* bulk event */ + srpc_event_t crpc_reqstev; /* request event */ + srpc_event_t crpc_replyev; /* reply event */ - /* bulk, request(reqst), and reply exchanged on wire */ - struct srpc_msg crpc_reqstmsg; - struct srpc_msg crpc_replymsg; + /* bulk, request(reqst), and reply exchanged on wire */ + srpc_msg_t crpc_reqstmsg; + srpc_msg_t crpc_replymsg; struct lnet_handle_md crpc_reqstmdh; struct lnet_handle_md crpc_replymdh; - struct srpc_bulk crpc_bulk; -}; + srpc_bulk_t crpc_bulk; +} srpc_client_rpc_t; #define srpc_client_rpc_size(rpc) \ -offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) +offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) #define srpc_client_rpc_addref(rpc) \ do { \ @@ -263,19 +262,19 @@ struct srpc_service_cd { /** backref to service */ struct srpc_service *scd_svc; /** event buffer */ - struct srpc_event scd_ev; + srpc_event_t scd_ev; /** free RPC descriptors */ struct list_head scd_rpc_free; /** in-flight RPCs */ struct list_head scd_rpc_active; /** workitem for posting buffer */ - struct swi_workitem scd_buf_wi; + swi_workitem_t scd_buf_wi; /** CPT id */ int scd_cpt; /** error code for scd_buf_wi */ int scd_buf_err; /** timestamp for scd_buf_err */ - time64_t scd_buf_err_stamp; + unsigned long scd_buf_err_stamp; /** total # request buffers */ int scd_buf_total; /** # posted request buffers */ @@ -303,7 +302,7 @@ struct srpc_service_cd { #define SFW_FRWK_WI_MIN 16 #define SFW_FRWK_WI_MAX 256 -struct srpc_service { +typedef struct srpc_service { int sv_id; /* service id */ const char *sv_name; /* human readable name */ int sv_wi_total; /* total server workitems */ @@ -315,11 +314,11 @@ struct srpc_service { * - sv_handler: process incoming RPC request * - sv_bulk_ready: notify bulk data */ - int (*sv_handler)(struct srpc_server_rpc *); - int (*sv_bulk_ready)(struct srpc_server_rpc *, int); -}; + int (*sv_handler) (srpc_server_rpc_t *); + int (*sv_bulk_ready) (srpc_server_rpc_t *, int); +} srpc_service_t; -struct sfw_session { +typedef struct { /* chain on fw_zombie_sessions */ struct list_head sn_list; struct lst_sid sn_id; /* unique identifier */ @@ -327,42 +326,42 @@ struct sfw_session { unsigned int sn_timeout; int sn_timer_active; unsigned int sn_features; - struct stt_timer sn_timer; + stt_timer_t sn_timer; struct list_head sn_batches; /* list of batches */ char sn_name[LST_NAME_SIZE]; atomic_t sn_refcount; atomic_t sn_brw_errors; atomic_t sn_ping_errors; - ktime_t sn_started; -}; + cfs_time_t sn_started; +} sfw_session_t; #define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ (sid0).ses_stamp == (sid1).ses_stamp) -struct sfw_batch { +typedef struct { struct list_head bat_list; /* chain on sn_batches */ struct lst_bid bat_id; /* batch id */ int bat_error; /* error code of batch */ - struct sfw_session *bat_session; /* batch's session */ + sfw_session_t *bat_session; /* batch's session */ atomic_t bat_nactive; /* # of active tests */ struct list_head bat_tests; /* test instances */ -}; +} sfw_batch_t; -struct sfw_test_client_ops { - int (*tso_init)(struct sfw_test_instance 
*tsi); /* intailize test client */ - void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ - int (*tso_prep_rpc)(struct sfw_test_unit *tsu, +typedef struct { + int (*tso_init)(struct sfw_test_instance *tsi); /* intialize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, struct lnet_process_id dest, - struct srpc_client_rpc **rpc); /* prep a tests rpc */ + srpc_client_rpc_t **rpc); /* prep a tests rpc */ void (*tso_done_rpc)(struct sfw_test_unit *tsu, - struct srpc_client_rpc *rpc); /* done a test rpc */ -}; + srpc_client_rpc_t *rpc); /* done a test rpc */ +} sfw_test_client_ops_t; -struct sfw_test_instance { +typedef struct sfw_test_instance { struct list_head tsi_list; /* chain on batch */ int tsi_service; /* test type */ - struct sfw_batch *tsi_batch; /* batch */ - struct sfw_test_client_ops *tsi_ops; /* test client operations */ + sfw_batch_t *tsi_batch; /* batch */ + sfw_test_client_ops_t *tsi_ops; /* test client operations */ /* public parameter for all test units */ unsigned int tsi_is_client:1; /* is test client */ @@ -379,11 +378,11 @@ struct sfw_test_instance { struct list_head tsi_active_rpcs;/* active rpcs */ union { - struct test_ping_req ping; /* ping parameter */ - struct test_bulk_req bulk_v0; /* bulk parameter */ - struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ + test_ping_req_t ping; /* ping parameter */ + test_bulk_req_t bulk_v0; /* bulk parameter */ + test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ } tsi_u; -}; +} sfw_test_instance_t; /* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at * the end of pages are not used */ @@ -392,59 +391,57 @@ struct sfw_test_instance { #define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) #define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) -struct sfw_test_unit { +typedef struct sfw_test_unit { struct list_head tsu_list; /* chain on lst_test_instance */ struct lnet_process_id tsu_dest; /* id of dest node */ int tsu_loop; /* loop count of the test */ - struct sfw_test_instance *tsu_instance; /* pointer to test instance */ + sfw_test_instance_t *tsu_instance; /* pointer to test instance */ void *tsu_private; /* private data */ - struct swi_workitem tsu_worker; /* workitem of the test unit */ -}; + swi_workitem_t tsu_worker; /* workitem of the test unit */ +} sfw_test_unit_t; -struct sfw_test_case { - struct list_head tsc_list; /* chain on fw_tests */ - struct srpc_service *tsc_srv_service; /* test service */ - struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ -}; +typedef struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + srpc_service_t *tsc_srv_service; /* test service */ + sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ +} sfw_test_case_t; -struct srpc_client_rpc * +srpc_client_rpc_t * sfw_create_rpc(struct lnet_process_id peer, int service, unsigned features, int nbulkiov, int bulklen, - void (*done)(struct srpc_client_rpc *), void *priv); -int sfw_create_test_rpc(struct sfw_test_unit *tsu, + void (*done) (srpc_client_rpc_t *), void *priv); +int sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, unsigned int features, - int nblk, int blklen, struct srpc_client_rpc **rpc); -void sfw_abort_rpc(struct srpc_client_rpc *rpc); -void sfw_post_rpc(struct srpc_client_rpc *rpc); -void sfw_client_rpc_done(struct srpc_client_rpc *rpc); -void sfw_unpack_message(struct srpc_msg *msg); -void 
sfw_free_pages(struct srpc_server_rpc *rpc); -void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); -int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int nblk, int blklen, srpc_client_rpc_t **rpc); +void sfw_abort_rpc(srpc_client_rpc_t *rpc); +void sfw_post_rpc(srpc_client_rpc_t *rpc); +void sfw_client_rpc_done(srpc_client_rpc_t *rpc); +void sfw_unpack_message(srpc_msg_t *msg); +void sfw_free_pages(srpc_server_rpc_t *rpc); +void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); +int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, int sink); -int sfw_make_session(struct srpc_mksn_reqst *request, - struct srpc_mksn_reply *reply); +int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); -struct srpc_client_rpc * +srpc_client_rpc_t * srpc_create_client_rpc(struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(struct srpc_client_rpc *), - void (*rpc_fini)(struct srpc_client_rpc *), void *priv); -void srpc_post_rpc(struct srpc_client_rpc *rpc); -void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); -void srpc_free_bulk(struct srpc_bulk *bk); -struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, - unsigned int bulk_npg, unsigned int bulk_len, - int sink); -int srpc_send_rpc(struct swi_workitem *wi); -int srpc_send_reply(struct srpc_server_rpc *rpc); -int srpc_add_service(struct srpc_service *sv); -int srpc_remove_service(struct srpc_service *sv); -void srpc_shutdown_service(struct srpc_service *sv); -void srpc_abort_service(struct srpc_service *sv); -int srpc_finish_service(struct srpc_service *sv); -int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); -void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv); +void srpc_post_rpc(srpc_client_rpc_t *rpc); +void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); +void srpc_free_bulk(srpc_bulk_t *bk); +srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned off, unsigned bulk_npg, + unsigned bulk_len, int sink); +int srpc_send_rpc(swi_workitem_t *wi); +int srpc_send_reply(srpc_server_rpc_t *rpc); +int srpc_add_service(srpc_service_t *sv); +int srpc_remove_service(srpc_service_t *sv); +void srpc_shutdown_service(srpc_service_t *sv); +void srpc_abort_service(srpc_service_t *sv); +int srpc_finish_service(srpc_service_t *sv); +int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); +void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); void srpc_get_counters(struct srpc_counters *cnt); void srpc_set_counters(const struct srpc_counters *cnt); @@ -460,14 +457,13 @@ srpc_serv_is_framework(struct srpc_service *svc) static inline int swi_wi_action(struct cfs_workitem *wi) { - struct swi_workitem *swi; + swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); - swi = container_of(wi, struct swi_workitem, swi_workitem); - return swi->swi_action(swi); + return swi->swi_action(swi); } static inline void -swi_init_workitem(struct swi_workitem *swi, void *data, +swi_init_workitem(swi_workitem_t *swi, void *data, swi_action_t action, struct cfs_wi_sched *sched) { swi->swi_sched = sched; @@ -477,19 +473,19 @@ swi_init_workitem(struct swi_workitem *swi, void *data, } static inline void -swi_schedule_workitem(struct swi_workitem *wi) +swi_schedule_workitem(swi_workitem_t *wi) { cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); } static inline void -swi_exit_workitem(struct 
swi_workitem *swi) +swi_exit_workitem(swi_workitem_t *swi) { cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); } static inline int -swi_deschedule_workitem(struct swi_workitem *swi) +swi_deschedule_workitem(swi_workitem_t *swi) { return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); } @@ -500,7 +496,7 @@ void sfw_shutdown(void); void srpc_shutdown(void); static inline void -srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) +srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) { LASSERT (rpc != NULL); LASSERT (!srpc_event_pending(rpc)); @@ -516,14 +512,14 @@ srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) } static inline void -srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, +srpc_init_client_rpc(srpc_client_rpc_t *rpc, struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(struct srpc_client_rpc *), - void (*rpc_fini)(struct srpc_client_rpc *), void *priv) + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) { LASSERT(nbulkiov <= LNET_MAX_IOV); - memset(rpc, 0, offsetof(struct srpc_client_rpc, + memset(rpc, 0, offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[nbulkiov])); INIT_LIST_HEAD(&rpc->crpc_list); @@ -589,7 +585,7 @@ do { \ } while (0) static inline void -srpc_wait_service_shutdown(struct srpc_service *sv) +srpc_wait_service_shutdown(srpc_service_t *sv) { int i = 2; @@ -605,13 +601,13 @@ srpc_wait_service_shutdown(struct srpc_service *sv) } } -extern struct sfw_test_client_ops ping_test_client; -extern struct srpc_service ping_test_service; +extern sfw_test_client_ops_t ping_test_client; +extern srpc_service_t ping_test_service; void ping_init_test_client(void); void ping_init_test_service(void); -extern struct sfw_test_client_ops brw_test_client; -extern struct srpc_service brw_test_service; +extern sfw_test_client_ops_t brw_test_client; +extern srpc_service_t brw_test_service; void brw_init_test_client(void); void brw_init_test_service(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c index 3ceec81bf1b08..7e09e6672b3ef 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/timer.c +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -56,7 +56,7 @@ static struct st_timer_data { spinlock_t stt_lock; /* start time of the slot processed previously */ - time64_t stt_prev_slot; + cfs_time_t stt_prev_slot; struct list_head stt_hash[STTIMER_NSLOTS]; int stt_shuttingdown; wait_queue_head_t stt_waitq; @@ -64,7 +64,7 @@ static struct st_timer_data { } stt_data; void -stt_add_timer(struct stt_timer *timer) +stt_add_timer(stt_timer_t *timer) { struct list_head *pos; @@ -74,12 +74,11 @@ stt_add_timer(struct stt_timer *timer) LASSERT(!stt_data.stt_shuttingdown); LASSERT(timer->stt_func != NULL); LASSERT(list_empty(&timer->stt_list)); - LASSERT(timer->stt_expires > ktime_get_real_seconds()); + LASSERT(cfs_time_after(timer->stt_expires, cfs_time_current_sec())); /* a simple insertion sort */ list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { - struct stt_timer *old = list_entry(pos, struct stt_timer, - stt_list); + stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); if (timer->stt_expires >= old->stt_expires) break; @@ -99,7 +98,7 @@ stt_add_timer(struct stt_timer *timer) * another CPU. 
*/ int -stt_del_timer(struct stt_timer *timer) +stt_del_timer(stt_timer_t *timer) { int ret = 0; @@ -119,13 +118,13 @@ stt_del_timer(struct stt_timer *timer) /* called with stt_data.stt_lock held */ static int -stt_expire_list(struct list_head *slot, time64_t now) +stt_expire_list(struct list_head *slot, cfs_time_t now) { int expired = 0; - struct stt_timer *timer; + stt_timer_t *timer; while (!list_empty(slot)) { - timer = list_entry(slot->next, struct stt_timer, stt_list); + timer = list_entry(slot->next, stt_timer_t, stt_list); if (timer->stt_expires > now) break; @@ -143,20 +142,20 @@ stt_expire_list(struct list_head *slot, time64_t now) } static int -stt_check_timers(time64_t *last) +stt_check_timers(cfs_time_t *last) { int expired = 0; - time64_t now; - time64_t this_slot; + cfs_time_t now; + cfs_time_t this_slot; - now = ktime_get_real_seconds(); - this_slot = now & STTIMER_SLOTTIMEMASK; + now = cfs_time_current_sec(); + this_slot = now & STTIMER_SLOTTIMEMASK; spin_lock(&stt_data.stt_lock); - while (this_slot >= *last) { + while (cfs_time_aftereq(this_slot, *last)) { expired += stt_expire_list(STTIMER_SLOT(this_slot), now); - this_slot = this_slot - STTIMER_SLOTTIME; + this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); } *last = now & STTIMER_SLOTTIMEMASK; @@ -211,7 +210,7 @@ stt_startup (void) int i; stt_data.stt_shuttingdown = 0; - stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; + stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK; spin_lock_init(&stt_data.stt_lock); for (i = 0; i < STTIMER_NSLOTS; i++) diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h index e769c4cc9ebd7..71c3de2736b15 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/timer.h +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -34,15 +34,15 @@ #ifndef __SELFTEST_TIMER_H__ #define __SELFTEST_TIMER_H__ -struct stt_timer { +typedef struct { struct list_head stt_list; - time64_t stt_expires; + cfs_time_t stt_expires; void (*stt_func)(void *); void *stt_data; -}; +} stt_timer_t; -void stt_add_timer(struct stt_timer *timer); -int stt_del_timer(struct stt_timer *timer); +void stt_add_timer(stt_timer_t *timer); +int stt_del_timer(stt_timer_t *timer); int stt_startup(void); void stt_shutdown(void); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c index 8676ec223548d..ef61772f0dcb2 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -48,6 +48,8 @@ #include #include "fid_internal.h" +static void seq_server_proc_fini(struct lu_server_seq *seq); + /* Assigns client to sequence controller node. 
*/ int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq, struct lu_client_seq *cli) @@ -456,43 +458,35 @@ LU_KEY_INIT_FINI(seq, struct seq_thread_info); /* context key: seq_thread_key */ LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); -extern const struct file_operations seq_fld_debugfs_seq_fops; - -static void seq_server_debugfs_fini(struct lu_server_seq *seq) -{ - if (!IS_ERR_OR_NULL(seq->lss_debugfs_entry)) - ldebugfs_remove(&seq->lss_debugfs_entry); -} +extern const struct file_operations seq_fld_proc_seq_fops; -static int seq_server_debugfs_init(struct lu_server_seq *seq) +static int seq_server_proc_init(struct lu_server_seq *seq) { +#ifdef CONFIG_PROC_FS int rc; ENTRY; - seq->lss_debugfs_entry = ldebugfs_register(seq->lss_name, - seq_debugfs_dir, - NULL, NULL); - if (IS_ERR_OR_NULL(seq->lss_debugfs_entry)) { - rc = seq->lss_debugfs_entry ? PTR_ERR(seq->lss_debugfs_entry) - : -ENOMEM; - seq->lss_debugfs_entry = NULL; + seq->lss_proc_dir = lprocfs_register(seq->lss_name, + seq_type_proc_dir, + NULL, NULL); + if (IS_ERR(seq->lss_proc_dir)) { + rc = PTR_ERR(seq->lss_proc_dir); RETURN(rc); } - rc = ldebugfs_add_vars(seq->lss_debugfs_entry, - seq_server_debugfs_list, seq); + rc = lprocfs_add_vars(seq->lss_proc_dir, seq_server_proc_list, seq); if (rc) { - CERROR("%s: Can't init sequence manager debugfs, rc %d\n", - seq->lss_name, rc); + CERROR("%s: Can't init sequence manager " + "proc, rc %d\n", seq->lss_name, rc); GOTO(out_cleanup, rc); } if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) { - rc = ldebugfs_seq_create(seq->lss_debugfs_entry, "fldb", 0644, - &seq_fld_debugfs_seq_fops, seq); + rc = lprocfs_seq_create(seq->lss_proc_dir, "fldb", 0644, + &seq_fld_proc_seq_fops, seq); if (rc) { - CERROR("%s: Can't create fldb for sequence manager debugfs: rc = %d\n", - seq->lss_name, rc); + CERROR("%s: Can't create fldb for sequence manager " + "proc: rc = %d\n", seq->lss_name, rc); GOTO(out_cleanup, rc); } } @@ -500,8 +494,24 @@ static int seq_server_debugfs_init(struct lu_server_seq *seq) RETURN(0); out_cleanup: - seq_server_debugfs_fini(seq); + seq_server_proc_fini(seq); return rc; +#else /* !CONFIG_PROC_FS */ + return 0; +#endif /* CONFIG_PROC_FS */ +} + +static void seq_server_proc_fini(struct lu_server_seq *seq) +{ +#ifdef CONFIG_PROC_FS + ENTRY; + if (seq->lss_proc_dir != NULL) { + if (!IS_ERR(seq->lss_proc_dir)) + lprocfs_remove(&seq->lss_proc_dir); + seq->lss_proc_dir = NULL; + } + EXIT; +#endif /* CONFIG_PROC_FS */ } int seq_server_init(const struct lu_env *env, @@ -582,7 +592,7 @@ int seq_server_init(const struct lu_env *env, lu_seq_range_is_sane(&seq->lss_space)); } - rc = seq_server_debugfs_init(seq); + rc = seq_server_proc_init(seq); if (rc) GOTO(out, rc); @@ -599,7 +609,7 @@ void seq_server_fini(struct lu_server_seq *seq, { ENTRY; - seq_server_debugfs_fini(seq); + seq_server_proc_fini(seq); seq_store_fini(seq, env); EXIT; diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h index 1c6587d43b52b..9ad1420e1812e 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -36,6 +36,7 @@ #ifndef __FID_INTERNAL_H #define __FID_INTERNAL_H +#include #include #ifdef HAVE_SERVER_SUPPORT @@ -55,7 +56,9 @@ enum { extern struct lu_context_key seq_thread_key; -extern struct ldebugfs_vars seq_server_debugfs_list[]; +# ifdef CONFIG_PROC_FS +extern struct lprocfs_vars seq_server_proc_list[]; +# endif /* Store API functions. 
*/ struct dt_device; @@ -87,8 +90,10 @@ void fid_server_mod_exit(void); int seq_client_alloc_super(struct lu_client_seq *seq, const struct lu_env *env); -extern struct dentry *seq_debugfs_dir; +# ifdef CONFIG_PROC_FS +extern struct lprocfs_vars seq_client_proc_list[]; +# endif -extern struct ldebugfs_vars seq_client_debugfs_list[]; +extern struct proc_dir_entry *seq_type_proc_dir; #endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c index ab3a59820abc7..7c5477c044351 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c @@ -41,6 +41,7 @@ #include #include +#include #include /** diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c index 93f6402a12232..ab1cca59bc916 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_request.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,8 +38,8 @@ #define DEBUG_SUBSYSTEM S_FID -#include #include +#include #include #include #include @@ -48,8 +48,6 @@ #include #include "fid_internal.h" -struct dentry *seq_debugfs_dir; - static int seq_client_rpc(struct lu_client_seq *seq, struct lu_seq_range *output, __u32 opc, const char *opcname) @@ -440,57 +438,51 @@ void seq_client_flush(struct lu_client_seq *seq) } EXPORT_SYMBOL(seq_client_flush); -static void seq_client_debugfs_fini(struct lu_client_seq *seq) +static void seq_client_proc_fini(struct lu_client_seq *seq) { - if (!IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) - ldebugfs_remove(&seq->lcs_debugfs_entry); +#ifdef CONFIG_PROC_FS + ENTRY; + if (seq->lcs_proc_dir) { + if (!IS_ERR(seq->lcs_proc_dir)) + lprocfs_remove(&seq->lcs_proc_dir); + seq->lcs_proc_dir = NULL; + } + EXIT; +#endif /* CONFIG_PROC_FS */ } -static int seq_client_debugfs_init(struct lu_client_seq *seq) +static int seq_client_proc_init(struct lu_client_seq *seq) { +#ifdef CONFIG_PROC_FS int rc; + ENTRY; - seq->lcs_debugfs_entry = ldebugfs_register(seq->lcs_name, - seq_debugfs_dir, - NULL, NULL); - if (IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) { - CERROR("%s: LdebugFS failed in seq-init\n", seq->lcs_name); - rc = seq->lcs_debugfs_entry ? 
PTR_ERR(seq->lcs_debugfs_entry) - : -ENOMEM; - seq->lcs_debugfs_entry = NULL; - RETURN(rc); + seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, seq_type_proc_dir, + NULL, NULL); + if (IS_ERR(seq->lcs_proc_dir)) { + CERROR("%s: LProcFS failed in seq-init\n", + seq->lcs_name); + rc = PTR_ERR(seq->lcs_proc_dir); + RETURN(rc); } - rc = ldebugfs_add_vars(seq->lcs_debugfs_entry, - seq_client_debugfs_list, seq); - if (rc) { - CERROR("%s: Can't init sequence manager debugfs, rc %d\n", - seq->lcs_name, rc); + rc = lprocfs_add_vars(seq->lcs_proc_dir, seq_client_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager " + "proc, rc %d\n", seq->lcs_name, rc); GOTO(out_cleanup, rc); } RETURN(0); out_cleanup: - seq_client_debugfs_fini(seq); + seq_client_proc_fini(seq); return rc; -} - -void seq_client_fini(struct lu_client_seq *seq) -{ - ENTRY; - - seq_client_debugfs_fini(seq); - - if (seq->lcs_exp != NULL) { - class_export_put(seq->lcs_exp); - seq->lcs_exp = NULL; - } - seq->lcs_srv = NULL; - EXIT; +#else /* !CONFIG_PROC_FS */ + return 0; +#endif /* CONFIG_PROC_FS */ } -EXPORT_SYMBOL(seq_client_fini); int seq_client_init(struct lu_client_seq *seq, struct obd_export *exp, @@ -523,13 +515,29 @@ int seq_client_init(struct lu_client_seq *seq, snprintf(seq->lcs_name, sizeof(seq->lcs_name), "cli-%s", prefix); - rc = seq_client_debugfs_init(seq); + rc = seq_client_proc_init(seq); if (rc) seq_client_fini(seq); RETURN(rc); } EXPORT_SYMBOL(seq_client_init); +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_proc_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; +} +EXPORT_SYMBOL(seq_client_fini); + int client_fid_init(struct obd_device *obd, struct obd_export *exp, enum lu_cli_type type) { @@ -583,18 +591,21 @@ int client_fid_fini(struct obd_device *obd) } EXPORT_SYMBOL(client_fid_fini); +struct proc_dir_entry *seq_type_proc_dir; + static int __init fid_init(void) { -#ifdef HAVE_SERVER_SUPPORT - int rc = fid_server_mod_init(); + seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(seq_type_proc_dir)) + return PTR_ERR(seq_type_proc_dir); - if (rc) - return rc; -#endif - seq_debugfs_dir = ldebugfs_register(LUSTRE_SEQ_NAME, - debugfs_lustre_root, - NULL, NULL); - return PTR_ERR_OR_ZERO(seq_debugfs_dir); +# ifdef HAVE_SERVER_SUPPORT + fid_server_mod_init(); +# endif + + return 0; } static void __exit fid_exit(void) @@ -602,8 +613,11 @@ static void __exit fid_exit(void) # ifdef HAVE_SERVER_SUPPORT fid_server_mod_exit(); # endif - if (!IS_ERR_OR_NULL(seq_debugfs_dir)) - ldebugfs_remove(&seq_debugfs_dir); + + if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { + lprocfs_remove(&seq_type_proc_dir); + seq_type_proc_dir = NULL; + } } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c index 1565d80811d29..225ddfad6f634 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_store.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c index 5ac2b883d0861..d95888f15cfcb 100644 --- a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,6 +47,8 @@ #include #include "fid_internal.h" +#ifdef CONFIG_PROC_FS + /* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ #define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) /** @@ -58,37 +60,34 @@ * safe for production use. */ static int -ldebugfs_fid_write_common(const char __user *buffer, size_t count, - struct lu_seq_range *range) +lprocfs_fid_write_common(struct file *file, const char __user *buffer, + size_t count, struct lu_seq_range *range) { - char kernbuf[MAX_FID_RANGE_STRLEN]; struct lu_seq_range tmp = { .lsr_start = 0, }; - int rc; - + char kernbuf[MAX_FID_RANGE_STRLEN]; ENTRY; - LASSERT(range); + + LASSERT(range != NULL); if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; if (count == 5 && strcmp(kernbuf, "clear") == 0) { memset(range, 0, sizeof(*range)); - RETURN(count); + RETURN(0); } /* of the form "[0x0000000240000400 - 0x000000028000400]" */ - rc = sscanf(kernbuf, "[%llx - %llx]\n", - (unsigned long long *)&tmp.lsr_start, - (unsigned long long *)&tmp.lsr_end); - if (rc != 2) - RETURN(-EINVAL); + sscanf(kernbuf, "[%llx - %llx]\n", + (long long unsigned *)&tmp.lsr_start, + (long long unsigned *)&tmp.lsr_end); if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) RETURN(-EINVAL); @@ -98,24 +97,23 @@ ldebugfs_fid_write_common(const char __user *buffer, size_t count, #ifdef HAVE_SERVER_SUPPORT /* - * Server side debugfs stuff. + * Server side procfs stuff. 
*/ static ssize_t -ldebugfs_server_fid_space_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct lu_server_seq *seq; + struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; int rc; - ENTRY; - seq = ((struct seq_file *)file->private_data)->private; + + LASSERT(seq != NULL); mutex_lock(&seq->lss_mutex); - rc = ldebugfs_fid_write_common(buffer, count, &seq->lss_space); + rc = lprocfs_fid_write_common(file, buffer, count, &seq->lss_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", - seq->lss_name, PRANGE(&seq->lss_space)); + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); } mutex_unlock(&seq->lss_mutex); @@ -123,11 +121,13 @@ ldebugfs_server_fid_space_seq_write(struct file *file, } static int -ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +lprocfs_server_fid_space_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; ENTRY; + LASSERT(seq != NULL); + mutex_lock(&seq->lss_mutex); seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); mutex_unlock(&seq->lss_mutex); @@ -136,12 +136,14 @@ ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) } static int -ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +lprocfs_server_fid_server_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; struct client_obd *cli; ENTRY; + LASSERT(seq != NULL); + if (seq->lss_cli) { if (seq->lss_cli->lcs_exp != NULL) { cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; @@ -156,24 +158,34 @@ ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) RETURN(0); } -static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t +lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct seq_file *m = file->private_data; - struct lu_server_seq *seq = m->private; + struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; int rc; - + __s64 val; ENTRY; + + LASSERT(seq != NULL); + mutex_lock(&seq->lss_mutex); - rc = kstrtoull_from_user(buffer, count, 0, &seq->lss_width); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) { CERROR("%s: invalid FID sequence width: rc = %d\n", seq->lss_name, rc); GOTO(out_unlock, count = rc); } + if (val < 0) { + CERROR("%s: invalid FID sequence width: rc = %d\n", + seq->lss_name, -ERANGE); + GOTO(out_unlock, count = -ERANGE); + } + + seq->lss_width = val; + CDEBUG(D_INFO, "%s: Width: %llu\n", seq->lss_name, seq->lss_width); out_unlock: @@ -183,11 +195,13 @@ static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, } static int -ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +lprocfs_server_fid_width_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; - ENTRY; + + LASSERT(seq != NULL); + mutex_lock(&seq->lss_mutex); seq_printf(m, "%llu\n", seq->lss_width); mutex_unlock(&seq->lss_mutex); @@ -195,17 +209,17 @@ ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) RETURN(0); } -LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_space); -LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_width); 
-LDEBUGFS_SEQ_FOPS_RO(ldebugfs_server_fid_server); +LPROC_SEQ_FOPS(lprocfs_server_fid_space); +LPROC_SEQ_FOPS(lprocfs_server_fid_width); +LPROC_SEQ_FOPS_RO(lprocfs_server_fid_server); -struct ldebugfs_vars seq_server_debugfs_list[] = { +struct lprocfs_vars seq_server_proc_list[] = { { .name = "space", - .fops = &ldebugfs_server_fid_space_fops }, + .fops = &lprocfs_server_fid_space_fops }, { .name = "width", - .fops = &ldebugfs_server_fid_width_fops }, + .fops = &lprocfs_server_fid_width_fops }, { .name = "server", - .fops = &ldebugfs_server_fid_server_fops}, + .fops = &lprocfs_server_fid_server_fops }, { NULL } }; @@ -336,7 +350,7 @@ struct seq_operations fldb_sops = { static int fldb_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; - struct lu_server_seq *ss = inode->i_private; + struct lu_server_seq *ss = (struct lu_server_seq *) PDE_DATA(inode); struct lu_server_fld *fld; struct dt_object *obj; const struct dt_it_ops *iops; @@ -347,6 +361,10 @@ static int fldb_seq_open(struct inode *inode, struct file *file) fld = ss->lss_site->ss_server_fld; LASSERT(fld != NULL); + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + rc = seq_open(file, &fldb_sops); if (rc) return rc; @@ -398,7 +416,7 @@ static int fldb_seq_release(struct inode *inode, struct file *file) param = seq->private; if (param == NULL) { - seq_release(inode, file); + lprocfs_seq_release(inode, file); return 0; } @@ -412,7 +430,7 @@ static int fldb_seq_release(struct inode *inode, struct file *file) iops->fini(¶m->fsp_env, param->fsp_it); lu_env_fini(¶m->fsp_env); OBD_FREE_PTR(param); - seq_release(inode, file); + lprocfs_seq_release(inode, file); return 0; } @@ -478,7 +496,7 @@ static ssize_t fldb_seq_write(struct file *file, const char __user *buf, RETURN(rc < 0 ? 
rc : len); } -const struct file_operations seq_fld_debugfs_seq_fops = { +const struct file_operations seq_fld_proc_seq_fops = { .owner = THIS_MODULE, .open = fldb_seq_open, .read = seq_read, @@ -488,22 +506,21 @@ const struct file_operations seq_fld_debugfs_seq_fops = { #endif /* HAVE_SERVER_SUPPORT */ -/* Client side debugfs stuff */ +/* Client side procfs stuff */ static ssize_t -ldebugfs_client_fid_space_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct lu_client_seq *seq; + struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; int rc; - ENTRY; - seq = ((struct seq_file *)file->private_data)->private; + + LASSERT(seq != NULL); mutex_lock(&seq->lcs_mutex); - rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); + rc = lprocfs_fid_write_common(file, buffer, count, &seq->lcs_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", seq->lcs_name, PRANGE(&seq->lcs_space)); } @@ -512,58 +529,68 @@ ldebugfs_client_fid_space_seq_write(struct file *file, RETURN(count); } -static int ldebugfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +static int +lprocfs_client_fid_space_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; + + LASSERT(seq != NULL); + mutex_lock(&seq->lcs_mutex); - seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", + PRANGE(&seq->lcs_space)); mutex_unlock(&seq->lcs_mutex); RETURN(0); } -static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t +lprocfs_client_fid_width_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct seq_file *m = file->private_data; - struct lu_client_seq *seq = m->private; - u64 val; - u64 max; + struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; + __u64 max; int rc; - + __s64 val; ENTRY; - rc = kstrtoull_from_user(buffer, count, 0, &val); - if (rc) - return rc; + + LASSERT(seq != NULL); mutex_lock(&seq->lcs_mutex); + + rc = lprocfs_str_to_s64(file, buffer, count, &val); + if (rc) { + GOTO(out_unlock, count = rc); + } + if (seq->lcs_type == LUSTRE_SEQ_DATA) max = LUSTRE_DATA_SEQ_MAX_WIDTH; else max = LUSTRE_METADATA_SEQ_MAX_WIDTH; - if (val <= max) { + if (val <= max && val > 0) { seq->lcs_width = val; - CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, - seq->lcs_width); + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", + seq->lcs_name, seq->lcs_width); } else { - count = -ERANGE; + GOTO(out_unlock, count = -ERANGE); } +out_unlock: mutex_unlock(&seq->lcs_mutex); RETURN(count); } static int -ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +lprocfs_client_fid_width_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; + + LASSERT(seq != NULL); + mutex_lock(&seq->lcs_mutex); seq_printf(m, "%llu\n", seq->lcs_width); mutex_unlock(&seq->lcs_mutex); @@ -572,11 +599,13 @@ ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) } static int -ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +lprocfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq 
*)m->private; - ENTRY; + + LASSERT(seq != NULL); + mutex_lock(&seq->lcs_mutex); seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); mutex_unlock(&seq->lcs_mutex); @@ -585,37 +614,38 @@ ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) } static int -ldebugfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +lprocfs_client_fid_server_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; struct client_obd *cli; ENTRY; - if (seq->lcs_exp) { + LASSERT(seq != NULL); + + if (seq->lcs_exp != NULL) { cli = &seq->lcs_exp->exp_obd->u.cli; seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); -#ifdef HAVE_SERVER_SUPPORT } else { seq_printf(m, "%s\n", seq->lcs_srv->lss_name); -#endif /* HAVE_SERVER_SUPPORT */ } - RETURN(0); } -LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_space); -LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_width); -LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_server); -LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_fid); +LPROC_SEQ_FOPS(lprocfs_client_fid_space); +LPROC_SEQ_FOPS(lprocfs_client_fid_width); +LPROC_SEQ_FOPS_RO(lprocfs_client_fid_server); +LPROC_SEQ_FOPS_RO(lprocfs_client_fid_fid); -struct ldebugfs_vars seq_client_debugfs_list[] = { +struct lprocfs_vars seq_client_proc_list[] = { { .name = "space", - .fops = &ldebugfs_client_fid_space_fops }, + .fops = &lprocfs_client_fid_space_fops }, { .name = "width", - .fops = &ldebugfs_client_fid_width_fops }, + .fops = &lprocfs_client_fid_width_fops }, { .name = "server", - .fops = &ldebugfs_client_fid_server_fops}, + .fops = &lprocfs_client_fid_server_fops }, { .name = "fid", - .fops = &ldebugfs_client_fid_fid_fops }, + .fops = &lprocfs_client_fid_fid_fops }, { NULL } }; + +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c index f638e0dcd1ea4..9b46feed04e72 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -88,14 +88,27 @@ struct fld_cache *fld_cache_init(const char *name, */ void fld_cache_fini(struct fld_cache *cache) { - LASSERT(cache != NULL); - fld_cache_flush(cache); + __u64 pct; + ENTRY; - CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); - CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + LASSERT(cache != NULL); + fld_cache_flush(cache); + + if (cache->fci_stat.fst_count > 0) { + pct = cache->fci_stat.fst_cache * 100; + do_div(pct, cache->fci_stat.fst_count); + } else { + pct = 0; + } + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); - OBD_FREE_PTR(cache); + OBD_FREE_PTR(cache); + + EXIT; } /** diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c index 42f00da7f1363..375070464cd85 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -103,16 +103,15 @@ EXPORT_SYMBOL(fld_server_create); /** * Extract index information from fld name like srv-fsname-MDT0000 **/ -int fld_name_to_index(const char *name, u32 *index) +int fld_name_to_index(const char *name, __u32 *index) { char *dash; int rc; - ENTRY; CDEBUG(D_INFO, "get index from %s\n", name); dash = strrchr(name, '-'); - if (!dash) + if (dash == NULL) RETURN(-EINVAL); dash++; rc = target_name2index(dash, index, 
NULL); @@ -125,20 +124,17 @@ int fld_name_to_index(const char *name, u32 *index) int fld_update_from_controller(const struct lu_env *env, struct lu_server_fld *fld) { - struct fld_thread_info *info; - struct lu_seq_range *range; + struct fld_thread_info *info; + struct lu_seq_range *range; struct lu_seq_range_array *lsra; - u32 index; - struct ptlrpc_request *req; - int rc; - int i; - + __u32 index; + struct ptlrpc_request *req; + int rc; + int i; ENTRY; - /* - * Update only happens during initalization, i.e. local FLDB - * does not exist yet - */ + /* Update only happens during initalization, i.e. local FLDB + * does not exist yet */ if (!fld->lsf_new) RETURN(0); @@ -166,7 +162,7 @@ int fld_update_from_controller(const struct lu_env *env, LASSERT(req != NULL); lsra = (struct lu_seq_range_array *)req_capsule_server_get( &req->rq_pill, &RMF_GENERIC_DATA); - if (!lsra) + if (lsra == NULL) GOTO(out, rc = -EPROTO); range_array_le_to_cpu(lsra, lsra); @@ -192,7 +188,7 @@ int fld_update_from_controller(const struct lu_env *env, fld->lsf_new = 1; out: - if (req) + if (req != NULL) ptlrpc_req_finished(req); RETURN(rc); @@ -208,7 +204,6 @@ int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *erange; struct fld_thread_info *info; int rc; - ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -220,9 +215,9 @@ int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, if (rc == 0) { if (unlikely(fld_range_type(erange) != fld_range_type(range) && !fld_range_is_any(range))) { - CERROR("%s: FLD cache range "DRANGE" does not match requested flag %x: rc = %d\n", - fld->lsf_name, PRANGE(erange), range->lsr_flags, - -EIO); + CERROR("%s: FLD cache range "DRANGE" does not match" + "requested flag %x: rc = %d\n", fld->lsf_name, + PRANGE(erange), range->lsr_flags, -EIO); RETURN(-EIO); } *range = *erange; @@ -242,9 +237,8 @@ EXPORT_SYMBOL(fld_local_lookup); int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range) { - u32 index; + __u32 index; int rc; - ENTRY; rc = fld_local_lookup(env, fld, seq, range); @@ -256,21 +250,18 @@ int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, RETURN(rc); if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { - /* - * On server side, all entries should be in cache. - * If we can not find it in cache, just return error - */ + /* On server side, all entries should be in cache. + * If we can not find it in cache, just return error */ CERROR("%s: Cannot find sequence %#llx: rc = %d\n", fld->lsf_name, seq, -ENOENT); RETURN(-ENOENT); } else { - if (!fld->lsf_control_exp) { - CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", - fld->lsf_name, seq, -EIO); + if (fld->lsf_control_exp == NULL) { + CERROR("%s: lookup %#llx, but not connects to MDT0" + "yet: rc = %d.\n", fld->lsf_name, seq, -EIO); RETURN(-EIO); } - /* - * send request to mdt0 i.e. super seq. controller. + /* send request to mdt0 i.e. super seq. controller. * This is temporary solution, long term solution is fld * replication on all mdt servers. 
*/ @@ -290,17 +281,17 @@ EXPORT_SYMBOL(fld_server_lookup); */ static int fld_handle_lookup(struct tgt_session_info *tsi) { - struct obd_export *exp = tsi->tsi_exp; - struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; - struct lu_server_fld *fld; - struct lu_seq_range *in; - struct lu_seq_range *out; - int rc; + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; ENTRY; in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (!in) + if (in == NULL) RETURN(err_serious(-EPROTO)); rc = req_capsule_server_pack(tsi->tsi_pill); @@ -308,7 +299,7 @@ static int fld_handle_lookup(struct tgt_session_info *tsi) RETURN(err_serious(rc)); out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (!out) + if (out == NULL) RETURN(err_serious(-EPROTO)); *out = *in; @@ -324,18 +315,18 @@ static int fld_handle_lookup(struct tgt_session_info *tsi) static int fld_handle_read(struct tgt_session_info *tsi) { - struct obd_export *exp = tsi->tsi_exp; - struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; - struct lu_seq_range *in; - void *data; - int rc; + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; ENTRY; req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (!in) + if (in == NULL) RETURN(err_serious(-EPROTO)); req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, @@ -374,13 +365,12 @@ static int fld_handle_query(struct tgt_session_info *tsi) * fid_is_local() is supposed to be used in assertion checks only. */ int fid_is_local(const struct lu_env *env, - struct lu_site *site, const struct lu_fid *fid) + struct lu_site *site, const struct lu_fid *fid) { int result; struct seq_server_site *ss_site; struct lu_seq_range *range; struct fld_thread_info *info; - ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -388,7 +378,7 @@ int fid_is_local(const struct lu_env *env, result = 1; /* conservatively assume fid is local */ ss_site = lu_site2seq(site); - if (ss_site->ss_client_fld) { + if (ss_site->ss_client_fld != NULL) { int rc; rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, @@ -398,37 +388,54 @@ int fid_is_local(const struct lu_env *env, } return result; } -EXPORT_SYMBOL(fid_is_local); -static void fld_server_debugfs_fini(struct lu_server_fld *fld) -{ - if (!IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) - ldebugfs_remove(&fld->lsf_debugfs_entry); -} +static void fld_server_proc_fini(struct lu_server_fld *fld); -static int fld_server_debugfs_init(struct lu_server_fld *fld) +#ifdef CONFIG_PROC_FS +static int fld_server_proc_init(struct lu_server_fld *fld) { - int rc = 0; + int rc = 0; + ENTRY; - ENTRY; - fld->lsf_debugfs_entry = ldebugfs_register(fld->lsf_name, - fld_debugfs_dir, - NULL, NULL); - if (IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) { - rc = fld->lsf_debugfs_entry ? 
PTR_ERR(fld->lsf_debugfs_entry) - : -ENOMEM; - fld->lsf_debugfs_entry = NULL; + fld->lsf_proc_dir = lprocfs_register(fld->lsf_name, fld_type_proc_dir, + fld_server_proc_list, fld); + if (IS_ERR(fld->lsf_proc_dir)) { + rc = PTR_ERR(fld->lsf_proc_dir); RETURN(rc); } - rc = ldebugfs_seq_create(fld->lsf_debugfs_entry, "fldb", 0444, - &fld_debugfs_seq_fops, fld); - if (rc) - ldebugfs_remove(&fld->lsf_debugfs_entry); + rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444, + &fld_proc_seq_fops, fld); + if (rc) { + lprocfs_remove(&fld->lsf_proc_dir); + fld->lsf_proc_dir = NULL; + } RETURN(rc); } +static void fld_server_proc_fini(struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_proc_dir != NULL) { + if (!IS_ERR(fld->lsf_proc_dir)) + lprocfs_remove(&fld->lsf_proc_dir); + fld->lsf_proc_dir = NULL; + } + EXIT; +} +#else +static int fld_server_proc_init(struct lu_server_fld *fld) +{ + return 0; +} + +static void fld_server_proc_fini(struct lu_server_fld *fld) +{ + return; +} +#endif + int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, struct dt_device *dt, const char *prefix, int type) { @@ -456,7 +463,7 @@ int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, if (rc) GOTO(out_cache, rc); - rc = fld_server_debugfs_init(fld); + rc = fld_server_proc_init(fld); if (rc) GOTO(out_index, rc); @@ -477,10 +484,10 @@ void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) { ENTRY; - fld_server_debugfs_fini(fld); + fld_server_proc_fini(fld); fld_index_fini(env, fld); - if (fld->lsf_cache) { + if (fld->lsf_cache != NULL) { if (!IS_ERR(fld->lsf_cache)) fld_cache_fini(fld->lsf_cache); fld->lsf_cache = NULL; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c index f2079cb5b1f49..fa9ca9427f22f 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_index.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -84,10 +84,10 @@ int fld_declare_index_create(const struct lu_env *env, const struct lu_seq_range *new_range, struct thandle *th) { - struct lu_seq_range *tmp; - struct lu_seq_range *range; - struct fld_thread_info *info; - int rc = 0; + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; ENTRY; @@ -109,10 +109,8 @@ int fld_declare_index_create(const struct lu_env *env, GOTO(out, rc); } - /* - * Check for merge case, since the fld entry can only be increamental, - * so we will only check whether it can be merged from the left. - */ + /* Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. 
*/ if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && lu_seq_range_compare_loc(new_range, range) == 0) { range_cpu_to_be(tmp, range); @@ -158,13 +156,12 @@ int fld_declare_index_create(const struct lu_env *env, int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, const struct lu_seq_range *new_range, struct thandle *th) { - struct lu_seq_range *range; - struct lu_seq_range *tmp; - struct fld_thread_info *info; - int rc = 0; - int deleted = 0; - struct fld_cache_entry *flde; - + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -196,7 +193,7 @@ int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, range_cpu_to_be(tmp, tmp); rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, - (struct dt_key *)&tmp->lsr_start, th); + (struct dt_key *)&tmp->lsr_start, th, 1); if (rc != 0) { CERROR("%s: insert range "DRANGE" failed: rc = %d\n", fld->lsf_name, PRANGE(new_range), rc); @@ -232,11 +229,11 @@ int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range) { - struct lu_seq_range *fld_rec; - struct fld_thread_info *info; - int rc; + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; - ENTRY; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); fld_rec = &info->fti_rec; @@ -248,12 +245,12 @@ int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, rc = 0; else rc = -ENOENT; - } + } CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n", - fld->lsf_name, seq, PRANGE(range), rc); + fld->lsf_name, seq, PRANGE(range), rc); - RETURN(rc); + RETURN(rc); } /** @@ -276,7 +273,6 @@ int fld_insert_entry(const struct lu_env *env, struct thandle *th; struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); int rc; - ENTRY; LASSERT(mutex_is_locked(&fld->lsf_lock)); @@ -329,18 +325,16 @@ static int fld_insert_special_entries(const struct lu_env *env, int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, struct dt_device *dt, int type) { - struct dt_object *dt_obj = NULL; - struct lu_fid fid; - struct lu_attr *attr = NULL; - struct lu_seq_range *range = NULL; - struct fld_thread_info *info; - struct dt_object_format dof; - struct dt_it *it; - const struct dt_it_ops *iops; - int rc; - u32 index; - int range_count = 0; - + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + __u32 index; ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -348,7 +342,7 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, lu_local_obj_fid(&fid, FLD_INDEX_OID); OBD_ALLOC_PTR(attr); - if (!attr) + if (attr == NULL) RETURN(-ENOMEM); memset(attr, 0, sizeof(*attr)); @@ -394,40 +388,25 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, GOTO(out, rc = PTR_ERR(it)); rc = iops->load(env, it, 0); - if (rc > 0) - rc = 0; - else if (rc == 0) - rc = iops->next(env, it); - if (rc < 0) GOTO(out_it_fini, rc); - while (rc == 0) { - rc = iops->rec(env, it, (struct dt_rec *)range, 0); - if (rc != 0) - GOTO(out_it_put, rc); - - range_be_to_cpu(range, range); - - /* - 
* Newly created ldiskfs IAM indexes may include a - * zeroed-out key and record. Ignore it here. - */ - if (range->lsr_start < range->lsr_end) { + if (rc > 0) { + /* Load FLD entry into server cache */ + do { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + LASSERT(range != NULL); + range_be_to_cpu(range, range); rc = fld_cache_insert(fld->lsf_cache, range); if (rc != 0) GOTO(out_it_put, rc); - - range_count++; - } - - rc = iops->next(env, it); - if (rc < 0) - GOTO(out_it_fini, rc); - } - - if (range_count == 0) + rc = iops->next(env, it); + } while (rc == 0); + } else { fld->lsf_new = 1; + } rc = fld_name_to_index(fld->lsf_name, &index); if (rc < 0) @@ -436,10 +415,8 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, rc = 0; if (index == 0 && type == LU_SEQ_RANGE_MDT) { - /* - * Note: fld_insert_entry will detect whether these - * special entries already exist inside FLDB - */ + /* Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB */ mutex_lock(&fld->lsf_lock); rc = fld_insert_special_entries(env, fld); mutex_unlock(&fld->lsf_lock); @@ -454,11 +431,11 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, out_it_fini: iops->fini(env, it); out: - if (attr) + if (attr != NULL) OBD_FREE_PTR(attr); if (rc < 0) { - if (dt_obj) + if (dt_obj != NULL) dt_object_put(env, dt_obj); fld->lsf_obj = NULL; } @@ -468,7 +445,7 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) { ENTRY; - if (fld->lsf_obj) { + if (fld->lsf_obj != NULL) { if (!IS_ERR(fld->lsf_obj)) dt_object_put(env, fld->lsf_obj); fld->lsf_obj = NULL; @@ -480,12 +457,12 @@ int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len) { struct lu_seq_range_array *lsra = data; - struct fld_thread_info *info; - struct dt_object *dt_obj = fld->lsf_obj; - struct lu_seq_range *entry; - struct dt_it *it; - const struct dt_it_ops *iops; - int rc; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h index 48337e0b6839b..dcb24a3c2f22a 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -56,6 +56,7 @@ #define __FLD_INTERNAL_H #include +#include #include #include @@ -138,6 +139,12 @@ enum { extern struct lu_fld_hash fld_hash[]; + +#ifdef CONFIG_PROC_FS +extern struct proc_dir_entry *fld_type_proc_dir; +extern struct lprocfs_vars fld_client_proc_list[]; +#endif + # ifdef HAVE_SERVER_SUPPORT struct fld_thread_info { struct lu_seq_range fti_rec; @@ -165,15 +172,16 @@ int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range); int fld_name_to_index(const char *name, __u32 *index); - int fld_server_mod_init(void); + void fld_server_mod_exit(void); int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len); - -extern const struct file_operations fld_debugfs_seq_fops; -extern struct dentry *fld_debugfs_dir; +#ifdef CONFIG_PROC_FS +extern const struct file_operations fld_proc_seq_fops; +extern struct lprocfs_vars fld_server_proc_list[]; 
+#endif # endif /* HAVE_SERVER_SUPPORT */ @@ -181,8 +189,6 @@ int fld_client_rpc(struct obd_export *exp, struct lu_seq_range *range, __u32 fld_op, struct ptlrpc_request **reqp); -extern struct ldebugfs_vars fld_client_debugfs_list[]; - struct fld_cache *fld_cache_init(const char *name, int cache_size, int cache_threshold); diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c index 3dd616e0a6e94..19b5789c19851 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_request.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -60,18 +60,15 @@ static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) static struct lu_fld_target * fld_rrb_scan(struct lu_client_fld *fld, u64 seq) { - struct lu_fld_target *target; - int hash; - - ENTRY; + struct lu_fld_target *target; + int hash; + ENTRY; - /* - * Because almost all of special sequence located in MDT0, + /* Because almost all of special sequence located in MDT0, * it should go to index 0 directly, instead of calculating * hash again, and also if other MDTs is not being connected, * the fld lookup requests(for seq on MDT0) should not be - * blocked because of other MDTs - */ + * blocked because of other MDTs */ if (fid_seq_is_norm(seq)) hash = fld_rrb_hash(fld, seq); else @@ -79,59 +76,57 @@ fld_rrb_scan(struct lu_client_fld *fld, u64 seq) again: list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - if (target->ft_idx == hash) - RETURN(target); - } + if (target->ft_idx == hash) + RETURN(target); + } if (hash != 0) { - /* - * It is possible the remote target(MDT) are not connected to + /* It is possible the remote target(MDT) are not connected to * with client yet, so we will refer this to MDT0, which should - * be connected during mount - */ + * be connected during mount */ hash = 0; goto again; } - CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", - fld->lcf_name, hash, seq, fld->lcf_count); + CERROR("%s: Can't find target by hash %d (seq %#llx). " + "Targets (%d):\n", fld->lcf_name, hash, seq, + fld->lcf_count); list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - const char *srv_name = target->ft_srv != NULL ? - target->ft_srv->lsf_name : ""; - const char *exp_name = target->ft_exp != NULL ? - (char *)target->ft_exp->exp_obd->obd_uuid.uuid : - ""; + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", - target->ft_exp, exp_name, target->ft_srv, - srv_name, target->ft_idx); - } - - /* - * If target is not found, there is logical error anyway, so here is - * LBUG() to catch this situation. - */ - LBUG(); - RETURN(NULL); + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. 
+ */ + LBUG(); + RETURN(NULL); } struct lu_fld_hash fld_hash[] = { - { - .fh_name = "RRB", - .fh_hash_func = fld_rrb_hash, - .fh_scan_func = fld_rrb_scan - }, - { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { NULL, - } + } }; static struct lu_fld_target * fld_client_get_target(struct lu_client_fld *fld, u64 seq) { struct lu_fld_target *target; - ENTRY; LASSERT(fld->lcf_hash != NULL); @@ -140,12 +135,13 @@ fld_client_get_target(struct lu_client_fld *fld, u64 seq) target = fld->lcf_hash->fh_scan_func(fld, seq); spin_unlock(&fld->lcf_lock); - if (target) { - CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", - fld->lcf_name, target->ft_idx, seq); - } + if (target != NULL) { + CDEBUG(D_INFO, "%s: Found target (idx %llu" + ") by seq %#llx\n", fld->lcf_name, + target->ft_idx, seq); + } - RETURN(target); + RETURN(target); } /* @@ -153,45 +149,44 @@ fld_client_get_target(struct lu_client_fld *fld, u64 seq) * of FLD module. */ int fld_client_add_target(struct lu_client_fld *fld, - struct lu_fld_target *tar) + struct lu_fld_target *tar) { const char *name; - struct lu_fld_target *target, *tmp; + struct lu_fld_target *target, *tmp; + ENTRY; - ENTRY; - - LASSERT(tar != NULL); + LASSERT(tar != NULL); name = fld_target_name(tar); - LASSERT(name != NULL); - LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, name, tar->ft_idx); - OBD_ALLOC_PTR(target); - if (!target) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(target); + if (target == NULL) + RETURN(-ENOMEM); spin_lock(&fld->lcf_lock); list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { if (tmp->ft_idx == tar->ft_idx) { spin_unlock(&fld->lcf_lock); - OBD_FREE_PTR(target); + OBD_FREE_PTR(target); CERROR("Target %s exists in FLD and known as %s:#%llu\n", - name, fld_target_name(tmp), tmp->ft_idx); - RETURN(-EEXIST); - } - } + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } - target->ft_exp = tar->ft_exp; - if (target->ft_exp) - class_export_get(target->ft_exp); - target->ft_srv = tar->ft_srv; - target->ft_idx = tar->ft_idx; + target->ft_exp = tar->ft_exp; + if (target->ft_exp != NULL) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; list_add_tail(&target->ft_chain, &fld->lcf_targets); - fld->lcf_count++; + fld->lcf_count++; spin_unlock(&fld->lcf_lock); RETURN(0); @@ -199,10 +194,9 @@ int fld_client_add_target(struct lu_client_fld *fld, EXPORT_SYMBOL(fld_client_add_target); /* Remove export from FLD */ -int fld_client_del_target(struct lu_client_fld *fld, u64 idx) +int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) { struct lu_fld_target *target, *tmp; - ENTRY; spin_lock(&fld->lcf_lock); @@ -212,161 +206,182 @@ int fld_client_del_target(struct lu_client_fld *fld, u64 idx) list_del(&target->ft_chain); spin_unlock(&fld->lcf_lock); - if (target->ft_exp) - class_export_put(target->ft_exp); + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); - OBD_FREE_PTR(target); - RETURN(0); - } - } + OBD_FREE_PTR(target); + RETURN(0); + } + } spin_unlock(&fld->lcf_lock); RETURN(-ENOENT); } -struct dentry *fld_debugfs_dir; - -static int fld_client_debugfs_init(struct lu_client_fld *fld) +#ifdef CONFIG_PROC_FS +static int fld_client_proc_init(struct lu_client_fld *fld) { int rc; - ENTRY; - fld->lcf_debugfs_entry = ldebugfs_register(fld->lcf_name, - fld_debugfs_dir, - 
fld_client_debugfs_list, - fld); - if (IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) { - CERROR("%s: LdebugFS failed in fld-init\n", fld->lcf_name); - rc = fld->lcf_debugfs_entry ? PTR_ERR(fld->lcf_debugfs_entry) - : -ENOMEM; - fld->lcf_debugfs_entry = NULL; + + fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, fld_type_proc_dir, + NULL, NULL); + if (IS_ERR(fld->lcf_proc_dir)) { + CERROR("%s: LProcFS failed in fld-init\n", + fld->lcf_name); + rc = PTR_ERR(fld->lcf_proc_dir); RETURN(rc); } - return 0; + rc = lprocfs_add_vars(fld->lcf_proc_dir, fld_client_proc_list, fld); + if (rc) { + CERROR("%s: Can't init FLD proc, rc %d\n", + fld->lcf_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + fld_client_proc_fini(fld); + return rc; } -void fld_client_debugfs_fini(struct lu_client_fld *fld) +void fld_client_proc_fini(struct lu_client_fld *fld) { - if (!IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) - ldebugfs_remove(&fld->lcf_debugfs_entry); + ENTRY; + if (fld->lcf_proc_dir) { + if (!IS_ERR(fld->lcf_proc_dir)) + lprocfs_remove(&fld->lcf_proc_dir); + fld->lcf_proc_dir = NULL; + } + EXIT; +} +#else /* !CONFIG_PROC_FS */ +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + return 0; } -EXPORT_SYMBOL(fld_client_debugfs_fini); + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +EXPORT_SYMBOL(fld_client_proc_fini); static inline int hash_is_sane(int hash) { - return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); } int fld_client_init(struct lu_client_fld *fld, - const char *prefix, int hash) + const char *prefix, int hash) { - int cache_size, cache_threshold; - int rc; + int cache_size, cache_threshold; + int rc; + ENTRY; - ENTRY; - snprintf(fld->lcf_name, sizeof(fld->lcf_name), - "cli-%s", prefix); + LASSERT(fld != NULL); - if (!hash_is_sane(hash)) { - CERROR("%s: Wrong hash function %#x\n", - fld->lcf_name, hash); - RETURN(-EINVAL); - } + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } fld->lcf_count = 0; spin_lock_init(&fld->lcf_lock); fld->lcf_hash = &fld_hash[hash]; INIT_LIST_HEAD(&fld->lcf_targets); - cache_size = FLD_CLIENT_CACHE_SIZE / - sizeof(struct fld_cache_entry); + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); - cache_threshold = cache_size * - FLD_CLIENT_CACHE_THRESHOLD / 100; + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; - fld->lcf_cache = fld_cache_init(fld->lcf_name, - cache_size, cache_threshold); - if (IS_ERR(fld->lcf_cache)) { - rc = PTR_ERR(fld->lcf_cache); - fld->lcf_cache = NULL; - GOTO(out, rc); - } + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } - rc = fld_client_debugfs_init(fld); - if (rc) - GOTO(out, rc); - EXIT; + rc = fld_client_proc_init(fld); + if (rc) + GOTO(out, rc); + EXIT; out: - if (rc) - fld_client_fini(fld); - else - CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", - fld->lcf_name, fld->lcf_hash->fh_name); - return rc; + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; } EXPORT_SYMBOL(fld_client_init); void fld_client_fini(struct lu_client_fld *fld) { struct lu_fld_target *target, *tmp; - ENTRY; spin_lock(&fld->lcf_lock); 
list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { - fld->lcf_count--; + fld->lcf_count--; list_del(&target->ft_chain); - if (target->ft_exp) - class_export_put(target->ft_exp); - OBD_FREE_PTR(target); - } + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } spin_unlock(&fld->lcf_lock); - if (fld->lcf_cache) { - if (!IS_ERR(fld->lcf_cache)) - fld_cache_fini(fld->lcf_cache); - fld->lcf_cache = NULL; - } + if (fld->lcf_cache != NULL) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } - EXIT; + EXIT; } EXPORT_SYMBOL(fld_client_fini); int fld_client_rpc(struct obd_export *exp, - struct lu_seq_range *range, u32 fld_op, + struct lu_seq_range *range, __u32 fld_op, struct ptlrpc_request **reqp) { struct ptlrpc_request *req = NULL; - struct lu_seq_range *prange; - u32 *op; - int rc = 0; - struct obd_import *imp; - + struct lu_seq_range *prange; + __u32 *op; + int rc = 0; + struct obd_import *imp; ENTRY; LASSERT(exp != NULL); +again: imp = class_exp2cliimp(exp); switch (fld_op) { case FLD_QUERY: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION, FLD_QUERY); - if (!req) + if (req == NULL) RETURN(-ENOMEM); - /* - * XXX: only needed when talking to old server(< 2.6), it should - * be removed when < 2.6 server is not supported - */ + /* XXX: only needed when talking to old server(< 2.6), it should + * be removed when < 2.6 server is not supported */ op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); *op = FLD_LOOKUP; - /* - * For MDS_MDS seq lookup, it will always use LWP connection, + /* For MDS_MDS seq lookup, it will always use LWP connection, * but LWP will be evicted after restart, so cause the error. * so we will set no_delay for seq lookup request, once the - * request fails because of the eviction. always retry here - */ + * request fails because of the eviction. always retry here */ if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { req->rq_allow_replay = 1; req->rq_no_delay = 1; @@ -375,7 +390,7 @@ int fld_client_rpc(struct obd_export *exp, case FLD_READ: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, LUSTRE_MDS_VERSION, FLD_READ); - if (!req) + if (req == NULL) RETURN(-ENOMEM); req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, @@ -392,19 +407,13 @@ int fld_client_rpc(struct obd_export *exp, prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); *prange = *range; ptlrpc_request_set_replen(req); - req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_request_portal = FLD_REQUEST_PORTAL; req->rq_reply_portal = MDC_REPLY_PORTAL; - ptlrpc_at_set_req_timeout(req); - - if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) { - /* the same error returned by ptlrpc_import_delay_req */ - rc = -EWOULDBLOCK; - req->rq_status = rc; - } else { - obd_get_request_slot(&exp->exp_obd->u.cli); - rc = ptlrpc_queue_wait(req); - obd_put_request_slot(&exp->exp_obd->u.cli); - } + ptlrpc_at_set_req_timeout(req); + + obd_get_request_slot(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(&exp->exp_obd->u.cli); if (rc == -ENOENT) { /* Don't loop forever on non-existing FID sequences. */ @@ -417,11 +426,14 @@ int fld_client_rpc(struct obd_export *exp, imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && rc != -ENOTSUPP) { - /* - * Since LWP is not replayable, so notify the caller - * to retry if needed after a while. 
- */ - rc = -EAGAIN; + /* Since LWP is not replayable, so it will keep + * trying unless umount happens or the remote + * target does not support the operation, otherwise + * it would cause unecessary failure of the + * application. */ + ptlrpc_req_finished(req); + rc = 0; + goto again; } GOTO(out_req, rc); } @@ -429,32 +441,31 @@ int fld_client_rpc(struct obd_export *exp, if (fld_op == FLD_QUERY) { prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); - if (!prange) + if (prange == NULL) GOTO(out_req, rc = -EFAULT); *range = *prange; } EXIT; out_req: - if (rc != 0 || !reqp) { + if (rc != 0 || reqp == NULL) { ptlrpc_req_finished(req); req = NULL; } - if (reqp) + if (reqp != NULL) *reqp = req; return rc; } int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, - u32 flags, const struct lu_env *env) + __u32 flags, const struct lu_env *env) { struct lu_seq_range res = { 0 }; struct lu_fld_target *target; struct lu_fld_target *origin; int rc; - ENTRY; rc = fld_cache_lookup(fld->lcf_cache, seq, &res); @@ -463,19 +474,20 @@ int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, RETURN(0); } - /* Can not find it in the cache */ - target = fld_client_get_target(fld, seq); - LASSERT(target != NULL); + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); origin = target; again: - CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", - fld->lcf_name, seq, fld_target_name(target), target->ft_idx); + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on " + "target %s (idx %llu)\n", fld->lcf_name, seq, + fld_target_name(target), target->ft_idx); res.lsr_start = seq; fld_range_set_type(&res, flags); #ifdef HAVE_SERVER_SUPPORT - if (target->ft_srv) { + if (target->ft_srv != NULL) { LASSERT(env != NULL); rc = fld_server_lookup(env, target->ft_srv, seq, &res); } else @@ -485,17 +497,15 @@ int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, } if (rc == -ESHUTDOWN) { - /* - * If fld lookup failed because the target has been shutdown, + /* If fld lookup failed because the target has been shutdown, * then try next target in the list, until trying all targets - * or fld lookup succeeds - */ + * or fld lookup succeeds */ spin_lock(&fld->lcf_lock); - /* - * If the next entry in the list is the head of the list, + + /* If the next entry in the list is the head of the list, * move to the next entry after the head and retrieve - * the target. Else retreive the next target entry. - */ + * the target. Else retreive the next target entry. 
*/ + if (target->ft_chain.next == &fld->lcf_targets) target = list_entry(target->ft_chain.next->next, struct lu_fld_target, ft_chain); @@ -518,23 +528,25 @@ EXPORT_SYMBOL(fld_client_lookup); void fld_client_flush(struct lu_client_fld *fld) { - fld_cache_flush(fld->lcf_cache); + fld_cache_flush(fld->lcf_cache); } + +struct proc_dir_entry *fld_type_proc_dir; + static int __init fld_init(void) { -#ifdef HAVE_SERVER_SUPPORT - int rc; + fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(fld_type_proc_dir)) + return PTR_ERR(fld_type_proc_dir); - rc = fld_server_mod_init(); - if (rc) - return rc; +#ifdef HAVE_SERVER_SUPPORT + fld_server_mod_init(); #endif /* HAVE_SERVER_SUPPORT */ - fld_debugfs_dir = ldebugfs_register(LUSTRE_FLD_NAME, - debugfs_lustre_root, - NULL, NULL); - return PTR_ERR_OR_ZERO(fld_debugfs_dir); + return 0; } static void __exit fld_exit(void) @@ -543,8 +555,10 @@ static void __exit fld_exit(void) fld_server_mod_exit(); #endif /* HAVE_SERVER_SUPPORT */ - if (!IS_ERR_OR_NULL(fld_debugfs_dir)) - ldebugfs_remove(&fld_debugfs_dir); + if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { + lprocfs_remove(&fld_type_proc_dir); + fld_type_proc_dir = NULL; + } } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c index a555889f57730..926ed5598052b 100644 --- a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -41,36 +41,37 @@ #include #include - -#ifdef HAVE_SERVER_SUPPORT #include -#endif #include #include #include #include "fld_internal.h" +#ifdef CONFIG_PROC_FS static int -fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) +fld_proc_targets_seq_show(struct seq_file *m, void *unused) { struct lu_client_fld *fld = (struct lu_client_fld *)m->private; struct lu_fld_target *target; - ENTRY; + + LASSERT(fld != NULL); + spin_lock(&fld->lcf_lock); list_for_each_entry(target, &fld->lcf_targets, ft_chain) seq_printf(m, "%s\n", fld_target_name(target)); spin_unlock(&fld->lcf_lock); - RETURN(0); } static int -fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) +fld_proc_hash_seq_show(struct seq_file *m, void *unused) { struct lu_client_fld *fld = (struct lu_client_fld *)m->private; - ENTRY; + + LASSERT(fld != NULL); + spin_lock(&fld->lcf_lock); seq_printf(m, "%s\n", fld->lcf_hash->fh_name); spin_unlock(&fld->lcf_lock); @@ -79,7 +80,7 @@ fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) } static ssize_t -fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, +fld_proc_hash_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { struct lu_client_fld *fld; @@ -90,12 +91,13 @@ fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, if (count > sizeof(fh_name)) return -ENAMETOOLONG; - if (copy_from_user(fh_name, buffer, count) != 0) + if (lprocfs_copy_from_user(file, fh_name, buffer, count) != 0) return -EFAULT; fld = ((struct seq_file *)file->private_data)->private; + LASSERT(fld != NULL); - for (i = 0; fld_hash[i].fh_name; i++) { + for (i = 0; fld_hash[i].fh_name != NULL; i++) { if (count != strlen(fld_hash[i].fh_name)) continue; @@ -105,7 +107,7 @@ fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, } } - if (hash) { + if (hash != NULL) { spin_lock(&fld->lcf_lock); fld->lcf_hash = hash; spin_unlock(&fld->lcf_lock); @@ -117,14 +119,15 @@ fld_debugfs_hash_seq_write(struct file 
*file, const char __user *buffer, return count; } -static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *pos) +static ssize_t +lprocfs_cache_flush_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - struct seq_file *m = file->private_data; - struct lu_client_fld *fld = m->private; - + struct lu_client_fld *fld = ((struct seq_file *)file->private_data)->private; ENTRY; + + LASSERT(fld != NULL); + fld_cache_flush(fld->lcf_cache); CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); @@ -132,15 +135,15 @@ static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, RETURN(count); } -LDEBUGFS_SEQ_FOPS_RO(fld_debugfs_targets); -LDEBUGFS_SEQ_FOPS(fld_debugfs_hash); -LDEBUGFS_FOPS_WR_ONLY(fld, cache_flush); +LPROC_SEQ_FOPS_RO(fld_proc_targets); +LPROC_SEQ_FOPS(fld_proc_hash); +LPROC_SEQ_FOPS_WO_TYPE(fld, cache_flush); -struct ldebugfs_vars fld_client_debugfs_list[] = { +struct lprocfs_vars fld_client_proc_list[] = { { .name = "targets", - .fops = &fld_debugfs_targets_fops }, + .fops = &fld_proc_targets_fops }, { .name = "hash", - .fops = &fld_debugfs_hash_fops }, + .fops = &fld_proc_hash_fops }, { .name = "cache_flush", .fops = &fld_cache_flush_fops }, { NULL } @@ -272,13 +275,17 @@ struct seq_operations fldb_sops = { static int fldb_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; - struct lu_server_fld *fld = inode->i_private; + struct lu_server_fld *fld = (struct lu_server_fld *)PDE_DATA(inode); struct dt_object *obj; const struct dt_it_ops *iops; struct fld_seq_param *param = NULL; int env_init = 0; int rc; + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + rc = seq_open(file, &fldb_sops); if (rc) GOTO(out, rc); @@ -348,11 +355,17 @@ static int fldb_seq_release(struct inode *inode, struct file *file) return 0; } -const struct file_operations fld_debugfs_seq_fops = { +const struct file_operations fld_proc_seq_fops = { .owner = THIS_MODULE, .open = fldb_seq_open, .read = seq_read, .release = fldb_seq_release, }; +struct lprocfs_vars fld_server_proc_list[] = { + { NULL } +}; + # endif /* HAVE_SERVER_SUPPORT */ + +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h index f0c8a5b4bfda0..78d09269a33c9 100644 --- a/drivers/staging/lustrefsx/lustre/include/cl_object.h +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -88,17 +88,14 @@ /* * super-class definitions. */ -#include -#include - #include +#include #include #include #include #include #include #include -#include #include struct obd_info; @@ -121,6 +118,8 @@ struct cl_io_slice; struct cl_req_attr; +extern struct cfs_ptask_engine *cl_io_engine; + /** * Device in the client stack. * @@ -416,13 +415,6 @@ struct cl_object_operations { void (*coo_req_attr_set)(const struct lu_env *env, struct cl_object *obj, struct cl_req_attr *attr); - /** - * Flush \a obj data corresponding to \a lock. Used for DoM - * locks in llite's cancelling blocking ast callback. 
- */ - int (*coo_object_flush)(const struct lu_env *env, - struct cl_object *obj, - struct ldlm_lock *lock); }; /** @@ -711,7 +703,7 @@ enum cl_page_type { /** Transient page, the transient cl_page is used to bind a cl_page * to vmpage which is not belonging to the same object of cl_page. - * it is used in DirectIO and lockless IO. */ + * it is used in DirectIO, lockless IO and liblustre. */ CPT_TRANSIENT, }; @@ -872,13 +864,6 @@ struct cl_page_operations { */ int (*cpo_is_vmlocked)(const struct lu_env *env, const struct cl_page_slice *slice); - - /** - * Update file attributes when all we have is this page. Used for tiny - * writes to update attributes when we don't have a full cl_io. - */ - void (*cpo_page_touch)(const struct lu_env *env, - const struct cl_page_slice *slice, size_t to); /** * Page destruction. */ @@ -903,8 +888,7 @@ struct cl_page_operations { const struct cl_page_slice *slice); /** Destructor. Frees resources and slice itself. */ void (*cpo_fini)(const struct lu_env *env, - struct cl_page_slice *slice, - struct pagevec *pvec); + struct cl_page_slice *slice); /** * Optional debugging helper. Prints given page slice. * @@ -1087,13 +1071,15 @@ static inline bool __page_in_use(const struct cl_page *page, int refc) * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. * - * Typical cl_lock consists of one layer: + * Typical cl_lock consists of the two layers: * + * - vvp_lock (vvp specific data), and * - lov_lock (lov specific data). * * lov_lock contains an array of sub-locks. Each of these sub-locks is a * normal cl_lock: it has a header (struct cl_lock) and a list of layers: * + * - lovsub_lock, and * - osc_lock * * Each sub-lock is associated with a cl_object (representing stripe @@ -1213,7 +1199,7 @@ struct cl_lock { /** * Per-layer part of cl_lock * - * \see lov_lock, osc_lock + * \see vvp_lock, lov_lock, lovsub_lock, osc_lock */ struct cl_lock_slice { struct cl_lock *cls_lock; @@ -1227,7 +1213,7 @@ struct cl_lock_slice { /** * - * \see lov_lock_ops, osc_lock_ops + * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops */ struct cl_lock_operations { /** @{ */ @@ -1239,7 +1225,8 @@ struct cl_lock_operations { * @anchor for resources * \retval -ve failure * - * \see lov_lock_enqueue(), osc_lock_enqueue() + * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), + * \see osc_lock_enqueue() */ int (*clo_enqueue)(const struct lu_env *env, const struct cl_lock_slice *slice, @@ -1254,7 +1241,8 @@ struct cl_lock_operations { /** * Destructor. Frees resources and the slice. * - * \see lov_lock_fini(), osc_lock_fini() + * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), + * \see osc_lock_fini() */ void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); /** @@ -1309,7 +1297,7 @@ struct cl_page_list { struct task_struct *pl_owner; }; -/** +/** * A 2-queue of pages. A convenience data-type for common use case, 2-queue * contains an incoming page list and an outgoing page list. */ @@ -1390,10 +1378,6 @@ enum cl_io_type { * To write out a range of file */ CIT_FSYNC, - /** - * glimpse. An io context to acquire glimpse lock. - */ - CIT_GLIMPSE, /** * Miscellaneous io. This is used for occasional io activity that * doesn't fit into other types. Currently this is used for: @@ -1405,6 +1389,8 @@ enum cl_io_type { * - VM induced page write-out. An io context for writing page out * for memory cleansing; * + * - glimpse. An io context to acquire glimpse lock. 
+ * * - grouplock. An io context to acquire group lock. * * CIT_MISC io is used simply as a context in which locks and pages @@ -1621,30 +1607,25 @@ enum cl_enq_flags { * -EWOULDBLOCK is returned immediately. */ CEF_NONBLOCK = 0x00000001, - /** - * Tell lower layers this is a glimpse request, translated to - * LDLM_FL_HAS_INTENT at LDLM layer. - * - * Also, because glimpse locks never block other locks, we count this - * as automatically compatible with other osc locks. - * (see osc_lock_compatible) - */ - CEF_GLIMPSE = 0x00000002, + /** + * take lock asynchronously (out of order), as it cannot + * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. + */ + CEF_ASYNC = 0x00000002, /** * tell the server to instruct (though a flag in the blocking ast) an * owner of the conflicting lock, that it can drop dirty pages * protected by this lock, without sending them to the server. */ CEF_DISCARD_DATA = 0x00000004, - /** - * tell the sub layers that it must be a `real' lock. This is used for - * mmapped-buffer locks, glimpse locks, manually requested locks - * (LU_LADVISE_LOCKAHEAD) that must never be converted into lockless - * mode. - * - * \see vvp_mmap_locks(), cl_glimpse_lock, cl_request_lock(). - */ - CEF_MUST = 0x00000008, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must be never converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). + */ + CEF_MUST = 0x00000008, /** * tell the sub layers that never request a `real' lock. This flag is * not used currently. @@ -1657,16 +1638,9 @@ enum cl_enq_flags { */ CEF_NEVER = 0x00000010, /** - * tell the dlm layer this is a speculative lock request - * speculative lock requests are locks which are not requested as part - * of an I/O operation. Instead, they are requested because we expect - * to use them in the future. They are requested asynchronously at the - * ptlrpc layer. - * - * Currently used for asynchronous glimpse locks and manually requested - * locks (LU_LADVISE_LOCKAHEAD). + * for async glimpse lock. */ - CEF_SPECULATIVE = 0x00000020, + CEF_AGL = 0x00000020, /** * enqueue a lock to test DLM lock existence. */ @@ -1676,14 +1650,10 @@ enum cl_enq_flags { * is known to exist. */ CEF_LOCK_MATCH = 0x00000080, - /** - * tell the DLM layer to lock only the requested range - */ - CEF_LOCK_NO_EXPAND = 0x00000100, /** * mask of enq_flags. */ - CEF_MASK = 0x000001ff, + CEF_MASK = 0x000000ff, }; /** @@ -1761,10 +1731,21 @@ enum cl_fsync_mode { CL_FSYNC_ALL = 3 }; -struct cl_io_rw_common { - loff_t crw_pos; - size_t crw_count; - int crw_nonblock; +struct cl_io_range { + loff_t cir_pos; + size_t cir_count; +}; + +struct cl_io_pt { + struct cl_io_pt *cip_next; + struct cfs_ptask cip_task; + struct kiocb cip_iocb; + struct iov_iter cip_iter; + struct file *cip_file; + enum cl_io_type cip_iot; + loff_t cip_pos; + size_t cip_count; + ssize_t cip_result; }; /** @@ -1794,30 +1775,27 @@ struct cl_io { struct cl_lockset ci_lockset; /** lock requirements, this is just a help info for sublayers. 
*/ enum cl_io_lock_dmd ci_lockreq; - /** layout version when this IO occurs */ - __u32 ci_layout_version; - union { - struct cl_rd_io { - struct cl_io_rw_common rd; - } ci_rd; - struct cl_wr_io { - struct cl_io_rw_common wr; - int wr_append; - int wr_sync; - } ci_wr; - struct cl_io_rw_common ci_rw; + union { + struct cl_rw_io { + struct iov_iter rw_iter; + struct kiocb rw_iocb; + struct cl_io_range rw_range; + struct file *rw_file; + unsigned int rw_nonblock:1, + rw_append:1, + rw_sync:1; + int (*rw_ptask)(struct cfs_ptask *ptask); + } ci_rw; struct cl_setattr_io { struct ost_lvb sa_attr; unsigned int sa_attr_flags; - unsigned int sa_avalid; /* ATTR_* */ - unsigned int sa_xvalid; /* OP_XVALID */ + unsigned int sa_valid; int sa_stripe_index; struct ost_layout sa_layout; const struct lu_fid *sa_parent_fid; } ci_setattr; struct cl_data_version_io { u64 dv_data_version; - u32 dv_layout_version; int dv_flags; } ci_data_version; struct cl_fault_io { @@ -1872,10 +1850,8 @@ struct cl_io { */ ci_ignore_layout:1, /** - * Need MDS intervention to complete a write. - * Write intent is required for the following cases: - * 1. component being written is not initialized, or - * 2. the mirrored files are NOT in WRITE_PENDING state. + * Need MDS intervention to complete a write. This usually means the + * corresponding component is not initialized for the writing extent. */ ci_need_write_intent:1, /** @@ -1894,43 +1870,12 @@ struct cl_io { * O_NOATIME */ ci_noatime:1, - /* Tell sublayers not to expand LDLM locks requested for this IO */ - ci_lock_no_expand:1, - /** - * Set if non-delay RPC should be used for this IO. - * - * If this file has multiple mirrors, and if the OSTs of the current - * mirror is inaccessible, non-delay RPC would error out quickly so - * that the upper layer can try to access the next mirror. - */ - ci_ndelay:1, - /** - * Set if we've tried all mirrors for this read IO, if it's not set, - * the read IO will check to-be-read OSCs' status, and make fast-switch - * another mirror if some of the OSTs are not healthy. - */ - ci_tried_all_mirrors:1; - /** - * Bypass quota check - */ - unsigned ci_noquota:1; - /** - * How many times the read has retried before this one. - * Set by the top level and consumed by the LOV. - */ - unsigned ci_ndelay_tried; - /** - * Designated mirror index for this I/O. - */ - unsigned ci_designated_mirror; + /** Set to 1 if parallel execution is allowed for current I/O? */ + ci_pio:1; /** * Number of pages owned by this IO. For invariant checking. */ unsigned ci_owned_nr; - /** - * Range of write intent. Valid if ci_need_write_intent is set. - */ - struct lu_extent ci_write_intent; }; /** @} cl_io */ @@ -2113,9 +2058,6 @@ int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, struct cl_layout *cl); loff_t cl_object_maxbytes(struct cl_object *obj); -int cl_object_flush(const struct lu_env *env, struct cl_object *obj, - struct ldlm_lock *lock); - /** * Returns true, iff \a o0 and \a o1 are slices of the same object. 
@@ -2170,9 +2112,6 @@ struct cl_page *cl_page_alloc (const struct lu_env *env, void cl_page_get (struct cl_page *page); void cl_page_put (const struct lu_env *env, struct cl_page *page); -void cl_pagevec_put (const struct lu_env *env, - struct cl_page *page, - struct pagevec *pvec); void cl_page_print (const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_page *pg); @@ -2240,8 +2179,6 @@ void cl_page_discard(const struct lu_env *env, struct cl_io *io, void cl_page_delete(const struct lu_env *env, struct cl_page *pg); int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg); -void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, - size_t to); void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate); loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); @@ -2369,12 +2306,12 @@ int cl_io_cancel (const struct lu_env *env, struct cl_io *io, */ static inline int cl_io_is_append(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; + return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_append; } static inline int cl_io_is_sync_write(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; + return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_sync; } static inline int cl_io_is_mkwrite(const struct cl_io *io) @@ -2387,8 +2324,8 @@ static inline int cl_io_is_mkwrite(const struct cl_io *io) */ static inline int cl_io_is_trunc(const struct cl_io *io) { - return io->ci_type == CIT_SETATTR && - (io->u.ci_setattr.sa_avalid & ATTR_SIZE); + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_valid & ATTR_SIZE); } struct cl_io *cl_io_top(struct cl_io *io); @@ -2396,12 +2333,13 @@ struct cl_io *cl_io_top(struct cl_io *io); void cl_io_print(const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_io *io); -#define CL_IO_SLICE_CLEAN(foo_io, base) \ -do { \ - typeof(foo_io) __foo_io = (foo_io); \ - \ - memset(&__foo_io->base, 0, \ - sizeof(*__foo_io) - offsetof(typeof(*__foo_io), base)); \ +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + (sizeof *__foo_io) - sizeof __foo_io->base); \ } while (0) /** @} cl_io */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h index f16895ddafba6..e872981b5284e 100644 --- a/drivers/staging/lustrefsx/lustre/include/dt_object.h +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -86,8 +86,6 @@ struct dt_device_param { * calculation */ unsigned int ddp_extent_tax; unsigned int ddp_brw_size; /* optimal RPC size */ - /* T10PI checksum type, zero if not supported */ - enum cksum_types ddp_t10_cksum_type; }; /** @@ -253,13 +251,6 @@ struct dt_device_operations { const struct dt_device *dev, struct dt_device_param *param); - /** - * Return device's super block. - * - * \param[in] dev dt device - */ - struct super_block *(*dt_mnt_sb_get)(const struct dt_device *dev); - /** * Sync the device. 
* @@ -378,9 +369,6 @@ struct dt_allocation_hint { const void *dah_eadata; int dah_eadata_len; __u32 dah_mode; - int dah_append_stripes; - bool dah_can_block; - char *dah_append_pool; }; /** @@ -428,8 +416,6 @@ typedef __u64 dt_obj_version_t; union ldlm_policy_data; -struct md_layout_change; - /** * A dt_object provides common operations to create and destroy * objects and to manage regular and extended attributes. @@ -1054,7 +1040,8 @@ struct dt_object_operations { */ int (*do_declare_layout_change)(const struct lu_env *env, struct dt_object *dt, - struct md_layout_change *mlc, + struct layout_intent *layout, + const struct lu_buf *buf, struct thandle *th); /** @@ -1070,8 +1057,8 @@ struct dt_object_operations { * \retval -ne error code */ int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt, - struct md_layout_change *mlc, - struct thandle *th); + struct layout_intent *layout, + const struct lu_buf *buf, struct thandle *th); }; enum dt_bufs_type { @@ -1149,6 +1136,7 @@ struct dt_body_operations { * \param[in] pos position in the object to start * \param[out] pos \a pos + bytes written * \param[in] th transaction handle + * \param[in] ignore unused (was used to request quota ignorance) * * \retval positive bytes written on success * \retval negative negated errno on error @@ -1157,7 +1145,8 @@ struct dt_body_operations { struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *th); + struct thandle *th, + int ignore); /** * Return buffers for data. @@ -1186,7 +1175,6 @@ struct dt_body_operations { * \param[in] pos position in the object to start * \param[in] len size of region in bytes * \param[out] lb array of descriptors to fill - * \param[in] maxlnb max slots in @lnb array * \param[in] rw 0 if used to read, 1 if used for write * * \retval positive number of descriptors on success @@ -1197,7 +1185,6 @@ struct dt_body_operations { loff_t pos, ssize_t len, struct niobuf_local *lb, - int maxlnb, enum dt_bufs_type rw); /** @@ -1492,6 +1479,7 @@ struct dt_index_operations { * \param[in] rec buffer storing value * \param[in] key key * \param[in] th transaction handle + * \param[in] ignore unused (was used to request quota ignorance) * * \retval 0 on success * \retval negative negated errno on error @@ -1500,7 +1488,8 @@ struct dt_index_operations { struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *th); + struct thandle *th, + int ignore); /** * Declare intention to delete a key/value from an index. 
@@ -1793,14 +1782,6 @@ struct dt_device { struct list_head dd_txn_callbacks; unsigned int dd_record_fid_accessed:1, dd_rdonly:1; - - /* sysfs and debugfs handling */ - struct dentry *dd_debugfs_entry; - - const struct attribute **dd_def_attrs; - struct kobject dd_kobj; - struct kobj_type dd_ktype; - struct completion dd_kobj_unregister; }; int dt_device_init(struct dt_device *dev, struct lu_device_type *t); @@ -1919,9 +1900,7 @@ struct thandle { th_wait_submit:1, /* complex transaction which will track updates on all targets, * including OSTs */ - th_complex:1, - /* whether ignore quota */ - th_ignore_quota:1; + th_complex:1; }; /** @@ -2401,14 +2380,13 @@ static inline int dt_ref_del(const struct lu_env *env, static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, struct niobuf_remote *rnb, - struct niobuf_local *lnb, int maxlnb, - enum dt_bufs_type rw) + struct niobuf_local *lnb, enum dt_bufs_type rw) { LASSERT(d); LASSERT(d->do_body_ops); LASSERT(d->do_body_ops->dbo_bufs_get); return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, - rnb->rnb_len, lnb, maxlnb, rw); + rnb->rnb_len, lnb, rw); } static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, @@ -2472,12 +2450,12 @@ static inline int dt_declare_write(const struct lu_env *env, static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *th) + struct thandle *th, int rq) { LASSERT(dt); LASSERT(dt->do_body_ops); LASSERT(dt->do_body_ops->dbo_write); - return dt->do_body_ops->dbo_write(env, dt, buf, pos, th); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th, rq); } static inline int dt_declare_punch(const struct lu_env *env, @@ -2547,16 +2525,6 @@ static inline void dt_conf_get(const struct lu_env *env, return dev->dd_ops->dt_conf_get(env, dev, param); } -static inline struct super_block *dt_mnt_sb_get(const struct dt_device *dev) -{ - LASSERT(dev); - LASSERT(dev->dd_ops); - if (dev->dd_ops->dt_mnt_sb_get) - return dev->dd_ops->dt_mnt_sb_get(dev); - - return ERR_PTR(-EOPNOTSUPP); -} - static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) { LASSERT(dev); @@ -2590,10 +2558,11 @@ static inline int dt_declare_insert(const struct lu_env *env, } static inline int dt_insert(const struct lu_env *env, - struct dt_object *dt, - const struct dt_rec *rec, - const struct dt_key *key, - struct thandle *th) + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + int noquota) { LASSERT(dt); LASSERT(dt->do_index_ops); @@ -2602,7 +2571,7 @@ static inline int dt_insert(const struct lu_env *env, if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) return cfs_fail_err; - return dt->do_index_ops->dio_insert(env, dt, rec, key, th); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th, noquota); } static inline int dt_declare_xattr_del(const struct lu_env *env, @@ -2778,24 +2747,26 @@ static inline int dt_lookup(const struct lu_env *env, static inline int dt_declare_layout_change(const struct lu_env *env, struct dt_object *o, - struct md_layout_change *mlc, + struct layout_intent *layout, + const struct lu_buf *buf, struct thandle *th) { LASSERT(o); LASSERT(o->do_ops); LASSERT(o->do_ops->do_declare_layout_change); - return o->do_ops->do_declare_layout_change(env, o, mlc, th); + return o->do_ops->do_declare_layout_change(env, o, layout, buf, th); } static inline int dt_layout_change(const struct lu_env *env, struct dt_object *o, - struct md_layout_change *mlc, + struct 
layout_intent *layout, + const struct lu_buf *buf, struct thandle *th) { LASSERT(o); LASSERT(o->do_ops); LASSERT(o->do_ops->do_layout_change); - return o->do_ops->do_layout_change(env, o, mlc, th); + return o->do_ops->do_layout_change(env, o, layout, buf, th); } struct dt_find_hint { @@ -2844,9 +2815,6 @@ static inline struct dt_thread_info *dt_info(const struct lu_env *env) int dt_global_init(void); void dt_global_fini(void); -int dt_tunables_init(struct dt_device *dt, struct obd_type *type, - const char *name, struct ldebugfs_vars *list); -int dt_tunables_fini(struct dt_device *dt); # ifdef CONFIG_PROC_FS int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h index 6fe62bce3bcb3..a0b8d022c1a5b 100644 --- a/drivers/staging/lustrefsx/lustre/include/llog_swab.h +++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h @@ -48,7 +48,7 @@ #ifndef _LLOG_SWAB_H_ #define _LLOG_SWAB_H_ -#include +#include struct lustre_cfg; void lustre_swab_lu_fid(struct lu_fid *fid); diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index 85b66b3af7126..a9d6342f1b6c3 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ #include #include -#include +#include /* * Liuux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 @@ -58,43 +58,18 @@ struct lprocfs_vars { const char *name; const struct proc_ops *fops; void *data; - /** /proc file mode. */ + /* /proc file mode. */ mode_t proc_mode; }; -/** Provide a debugfs container */ struct ldebugfs_vars { const char *name; const struct file_operations *fops; void *data; - /** debugfs file mode. */ + /* debugfs file mode. */ mode_t proc_mode; }; -static inline unsigned int pct(unsigned long a, unsigned long b) -{ - return b ? a * 100 / b : 0; -} - -#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT)) -#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) - -/** - * Append a space separated list of current set flags to str. - */ -#define flag2str(port, flag) \ - do { \ - if ((port)->port##_##flag) { \ - seq_printf(m, "%s" #flag, first ? 
"" : ", "); \ - first = false; \ - } \ - } while (0) - -void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, - const char *sep); -void obd_connect_data_seqprint(struct seq_file *m, - struct obd_connect_data *ocd); - /* if we find more consumers this could be generalized */ #define OBD_HIST_MAX 32 struct obd_histogram { @@ -374,29 +349,28 @@ enum { #define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR enum lprocfs_extra_opc { - LDLM_GLIMPSE_ENQUEUE = 0, - LDLM_PLAIN_ENQUEUE, - LDLM_EXTENT_ENQUEUE, - LDLM_FLOCK_ENQUEUE, - LDLM_IBITS_ENQUEUE, - MDS_REINT_SETATTR, - MDS_REINT_CREATE, - MDS_REINT_LINK, - MDS_REINT_UNLINK, - MDS_REINT_RENAME, - MDS_REINT_OPEN, - MDS_REINT_SETXATTR, - MDS_REINT_RESYNC, - BRW_READ_BYTES, - BRW_WRITE_BYTES, - EXTRA_LAST_OPC + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC }; #define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE /* class_obd.c */ extern struct proc_dir_entry *proc_lustre_root; extern struct dentry *debugfs_lustre_root; -extern struct kset *lustre_kset; +extern struct kobject *lustre_kobj; struct obd_device; struct obd_histogram; @@ -413,7 +387,7 @@ struct obd_job_stats { struct list_head ojs_list; /* list of job_stat structs */ rwlock_t ojs_lock; /* protect ojs_list/js_list */ unsigned int ojs_cleanup_interval;/* seconds before expiry */ - time64_t ojs_last_cleanup; /* previous cleanup time */ + time_t ojs_last_cleanup; /* previous cleanup time */ cntr_init_callback ojs_cntr_init_fn;/* lprocfs_stats initializer */ unsigned short ojs_cntr_num; /* number of stats in struct */ bool ojs_cleaning; /* currently expiring stats */ @@ -489,9 +463,13 @@ extern struct lprocfs_stats * lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); extern void lprocfs_clear_stats(struct lprocfs_stats *stats); extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats); extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_stats); + unsigned int num_private_stats); extern int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats); extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, @@ -506,14 +484,10 @@ extern int lprocfs_add_clear_entry(struct obd_device * obd, #ifdef HAVE_SERVER_SUPPORT extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); extern int lprocfs_exp_cleanup(struct obd_export *exp); -struct dentry *ldebugfs_add_symlink(const char *name, const char *target, - const char *format, ...); #else static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } #endif -struct dentry *ldebugfs_add_simple(struct dentry *root, char *name, void *data, - const struct file_operations *fops); extern struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, void *data, const struct proc_ops *ops); @@ -530,12 +504,11 @@ extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); extern int ldebugfs_register_stats(struct dentry *parent, const char *name, struct lprocfs_stats *stats); extern int 
lprocfs_register_stats(struct proc_dir_entry *root, const char *name, - struct lprocfs_stats *stats); -extern const struct file_operations ldebugfs_stats_seq_fops; + struct lprocfs_stats *stats); /* lprocfs_status.c */ extern int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, - void *data); + void *data); extern int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *var, void *data); @@ -573,32 +546,44 @@ static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) { return 0; } #endif - -extern int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only); +extern int lprocfs_obd_setup(struct obd_device *dev); extern int lprocfs_obd_cleanup(struct obd_device *obd); #ifdef HAVE_SERVER_SUPPORT extern const struct file_operations lprocfs_evict_client_fops; #endif -int ldebugfs_seq_create(struct dentry *parent, const char *name, umode_t mode, - const struct file_operations *seq_fops, void *data); +extern int ldebugfs_seq_create(struct dentry *parent, const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data); extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, mode_t mode, const struct proc_ops *seq_fops, void *data); -extern int lprocfs_obd_seq_create(struct obd_device *obd, const char *name, +extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, mode_t mode, const struct proc_ops *seq_fops, void *data); /* Generic callbacks */ +extern int lprocfs_u64_seq_show(struct seq_file *m, void *data); +extern int lprocfs_atomic_seq_show(struct seq_file *m, void *data); +extern ssize_t lprocfs_atomic_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_uint_seq_show(struct seq_file *m, void *data); +extern ssize_t lprocfs_uint_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +extern int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data); extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); +extern int lprocfs_name_seq_show(struct seq_file *m, void *data); extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); -ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf); +extern int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data); extern int lprocfs_import_seq_show(struct seq_file *m, void *data); extern int lprocfs_state_seq_show(struct seq_file *m, void *data); extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); #ifdef HAVE_SERVER_SUPPORT -ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, - char *buf); +extern int lprocfs_num_exports_seq_show(struct seq_file *m, void *data); #endif struct adaptive_timeout; extern int lprocfs_at_hist_helper(struct seq_file *m, @@ -612,27 +597,32 @@ extern ssize_t lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); #endif -ssize_t ping_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count); -ssize_t ping_show(struct kobject *kobj, struct attribute *attr, - char *buffer); - extern ssize_t -ldebugfs_import_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -static inline ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +extern ssize_t lprocfs_import_seq_write(struct file *file, 
const char __user *buffer, - size_t count, loff_t *off) -{ - return ldebugfs_import_seq_write(file, buffer, count, off); -} - + size_t count, loff_t *off); extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); extern ssize_t lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); -extern int lprocfs_str_with_units_to_s64(const char __user *buffer, +/* Statfs helpers */ +extern int lprocfs_blksize_seq_show(struct seq_file *m, void *data); +extern int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data); +extern int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data); +extern int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data); +extern int lprocfs_filestotal_seq_show(struct seq_file *m, void *data); +extern int lprocfs_filesfree_seq_show(struct seq_file *m, void *data); + +extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); +extern int lprocfs_str_to_s64(struct file *, const char __user *buffer, + unsigned long count, __s64 *val); +extern int lprocfs_str_with_units_to_s64(struct file *, + const char __user *buffer, unsigned long count, __s64 *val, char defunit); @@ -655,10 +645,10 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); int lprocfs_hash_seq_show(struct seq_file *m, void *data); /* lprocfs_status.c: IR factor */ -ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count); +int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); #endif /* lprocfs_status.c: dump pages on cksum error */ @@ -683,75 +673,10 @@ extern int lprocfs_seq_release(struct inode *, struct file *); #define LPROCFS_CLIMP_EXIT(obd) \ up_read(&(obd)->u.cli.cl_sem); -/* write the name##_seq_show function, call LDEBUGFS_SEQ_FOPS_RO for read-only - * debugfs entries; otherwise, you will define name##_seq_write function also - * for a read-write debugfs entry, and then call LDEBUGFS_SEQ_FOPS instead. 
- * Finally, call ldebugfs_seq_create(obd, filename, 0444, &name#_fops, data); - */ -#define __LDEBUGFS_SEQ_FOPS(name, custom_seq_write) \ -static int name##_single_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, name##_seq_show, inode->i_private); \ -} \ -static const struct file_operations name##_fops = { \ - .owner = THIS_MODULE, \ - .open = name##_single_open, \ - .read = seq_read, \ - .write = custom_seq_write, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - -#define LDEBUGFS_SEQ_FOPS_RO(name) __LDEBUGFS_SEQ_FOPS(name, NULL) -#define LDEBUGFS_SEQ_FOPS(name) __LDEBUGFS_SEQ_FOPS(name, \ - name##_seq_write) - -#define LDEBUGFS_SEQ_FOPS_RO_TYPE(name, type) \ - static int name##_##type##_seq_show(struct seq_file *m, void *v)\ - { \ - return lprocfs_##type##_seq_show(m, m->private); \ - } \ - LDEBUGFS_SEQ_FOPS_RO(name##_##type) - -#define LDEBUGFS_SEQ_FOPS_RW_TYPE(name, type) \ - static int name##_##type##_seq_show(struct seq_file *m, void *v)\ - { \ - return lprocfs_##type##_seq_show(m, m->private); \ - } \ - static ssize_t name##_##type##_seq_write(struct file *file, \ - const char __user *buffer, size_t count, \ - loff_t *off) \ - { \ - struct seq_file *seq = file->private_data; \ - return ldebugfs_##type##_seq_write(file, buffer, count, \ - seq->private); \ - } \ - LDEBUGFS_SEQ_FOPS(name##_##type); - -#define LDEBUGFS_FOPS_WR_ONLY(name, type) \ - static ssize_t name##_##type##_write(struct file *file, \ - const char __user *buffer, size_t count, \ - loff_t *off) \ - { \ - return ldebugfs_##type##_seq_write(file, buffer, count, \ - off); \ - } \ - static int name##_##type##_open(struct inode *inode, \ - struct file *file) \ - { \ - return single_open(file, NULL, inode->i_private); \ - } \ - static const struct file_operations name##_##type##_fops = { \ - .open = name##_##type##_open, \ - .write = name##_##type##_write, \ - .release = single_release, \ - }; - /* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only - * proc entries; otherwise, you will define name##_seq_write function also for - * a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally, - * call ldebugfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); - */ + proc entries; otherwise, you will define name##_seq_write function also for + a read-write proc entry, and then call LPROC_SEQ_SEQ instead. Finally, + call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ #define __LPROC_SEQ_FOPS(name, custom_seq_write) \ static int name##_single_open(struct inode *inode, struct file *file) \ { \ @@ -762,8 +687,7 @@ static int name##_single_open(struct inode *inode, struct file *file) \ return rc; \ \ return single_open(file, name##_seq_show, \ - inode->i_private ? inode->i_private : \ - PDE_DATA(inode)); \ + inode->i_private ? 
: PDE_DATA(inode)); \ } \ static const struct proc_ops name##_fops = { \ PROC_OWNER(THIS_MODULE) \ @@ -795,11 +719,11 @@ static const struct proc_ops name##_fops = { \ { \ struct seq_file *seq = file->private_data; \ return lprocfs_##type##_seq_write(file, buffer, \ - count, seq->private); \ + count, seq->private); \ } \ LPROC_SEQ_FOPS(name##_##type); -#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ +#define LPROC_SEQ_FOPS_WO_TYPE(name, type) \ static ssize_t name##_##type##_write(struct file *file, \ const char __user *buffer, size_t count, \ loff_t *off) \ @@ -809,8 +733,7 @@ static const struct proc_ops name##_fops = { \ static int name##_##type##_open(struct inode *inode, struct file *file)\ { \ return single_open(file, NULL, \ - inode->i_private ? inode->i_private : \ - PDE_DATA(inode)); \ + inode->i_private ? : PDE_DATA(inode));\ } \ static const struct proc_ops name##_##type##_fops = { \ .proc_open = name##_##type##_open, \ @@ -826,10 +749,22 @@ struct lustre_attr { const char *buf, size_t len); }; +/* + * Hacks to get around set_fs removal. + */ +void lprocfs_file_set_kernel(struct file *file); +bool lprocfs_file_is_kernel(struct file *file); + +/* + * Version of copy_from_user() that uses the above hacks to determine + * whether it's dealing with user or kernel space. + */ +unsigned long lprocfs_copy_from_user(struct file *file, void *to, + const void __user *from, unsigned long n); + #define LUSTRE_ATTR(name, mode, show, store) \ static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) -#define LUSTRE_WO_ATTR(name) LUSTRE_ATTR(name, 0200, NULL, name##_store) #define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) #define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) @@ -851,43 +786,33 @@ int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, void lprocfs_job_stats_fini(struct obd_device *obd); int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, cntr_init_callback fn); -ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t job_cleanup_interval_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count); -/* lproc_status_server.c */ -ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t recovery_time_soft_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count); -ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t recovery_time_hard_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count); -ssize_t instance_show(struct kobject *kobj, struct attribute *attr, - char *buf); -#endif +int lprocfs_job_interval_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); /* lproc_status.c */ +int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_recovery_time_soft_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data); +ssize_t +lprocfs_recovery_time_hard_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +int lprocfs_target_instance_seq_show(struct seq_file *m, void *data); +#endif int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); ssize_t 
lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); -int lprocfs_obd_short_io_bytes_seq_show(struct seq_file *m, void *data); -ssize_t lprocfs_obd_short_io_bytes_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count); struct root_squash_info; -int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, +int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, char *name); -int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, +int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, char *name); #else /* !CONFIG_PROC_FS */ @@ -927,10 +852,16 @@ static inline int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, struct lprocfs_stats *stats) { return 0; } +static inline void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { return; } static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_stats) + unsigned int num_private_stats) { return 0; } static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats) @@ -979,14 +910,18 @@ static inline void lprocfs_remove(struct proc_dir_entry **root) static inline void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) { return; } -static inline int lprocfs_obd_setup(struct obd_device *dev, bool uuid_only) +static inline int lprocfs_obd_setup(struct obd_device *dev) { return 0; } static inline int lprocfs_obd_cleanup(struct obd_device *dev) { return 0; } static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) { return 0; } +static inline int lprocfs_name_seq_show(struct seq_file *m, void *data) +{ return 0; } static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) { return 0; } +static inline int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) +{ return 0; } static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) { return 0; } static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) @@ -1018,10 +953,6 @@ lprocfs_ping_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { return 0; } static inline ssize_t -ldebugfs_import_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ return 0; } -static inline ssize_t lprocfs_import_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { return 0; } @@ -1077,7 +1008,7 @@ u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, #define LPROC_SEQ_FOPS(name) #define LPROC_SEQ_FOPS_RO_TYPE(name, type) #define LPROC_SEQ_FOPS_RW_TYPE(name, type) -#define LPROC_SEQ_FOPS_WR_ONLY(name, type) +#define LPROC_SEQ_FOPS_WO_TYPE(name, type) /* lprocfs_jobstats.c */ static inline diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h index c75d3115fdff5..ae5bb3dde4c82 100644 --- 
a/drivers/staging/lustrefsx/lustre/include/lu_object.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include @@ -426,8 +426,26 @@ struct lu_attr { __u32 la_rdev; /** project id */ __u32 la_projid; - /** set layout version to OST objects. */ - __u32 la_layout_version; +}; + +/** Bit-mask of valid attributes */ +enum la_valid { + LA_ATIME = 1 << 0, + LA_MTIME = 1 << 1, + LA_CTIME = 1 << 2, + LA_SIZE = 1 << 3, + LA_MODE = 1 << 4, + LA_UID = 1 << 5, + LA_GID = 1 << 6, + LA_BLOCKS = 1 << 7, + LA_TYPE = 1 << 8, + LA_FLAGS = 1 << 9, + LA_NLINK = 1 << 10, + LA_RDEV = 1 << 11, + LA_BLKSIZE = 1 << 12, + LA_KILL_SUID = 1 << 13, + LA_KILL_SGID = 1 << 14, + LA_PROJID = 1 << 15, }; /** @@ -466,23 +484,17 @@ enum lu_object_header_flags { /** * Mark this object has already been taken out of cache. */ - LU_OBJECT_UNHASHED = 1, - /** - * Object is initialized, when object is found in cache, it may not be - * intialized yet, the object allocator will initialize it. - */ - LU_OBJECT_INITED = 2 + LU_OBJECT_UNHASHED = 1, }; enum lu_object_header_attr { - LOHA_EXISTS = 1 << 0, - LOHA_REMOTE = 1 << 1, - LOHA_HAS_AGENT_ENTRY = 1 << 2, - /** - * UNIX file type is stored in S_IFMT bits. - */ - LOHA_FT_START = 001 << 12, /**< S_IFIFO */ - LOHA_FT_END = 017 << 12, /**< S_IFMT */ + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ }; /** @@ -536,6 +548,31 @@ struct lu_object_header { struct fld; +struct lu_site_bkt_data { + /** + * number of object in this bucket on the lsb_lru list. + */ + long lsb_lru_len; + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()). It is used by lu_object_find() to + * wait before re-trying when object in the process of destruction is + * found in the hash table. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_marche_funebre; +}; + enum { LU_SS_CREATED = 0, LU_SS_CACHE_HIT, @@ -606,8 +643,14 @@ struct lu_site { struct percpu_counter ls_lru_len_counter; }; -wait_queue_head_t * -lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); +static inline struct lu_site_bkt_data * +lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); +} static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) { @@ -672,14 +715,6 @@ static inline int lu_object_is_dying(const struct lu_object_header *h) return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); } -/** - * Return true if object is initialized. 
- */ -static inline int lu_object_is_inited(const struct lu_object_header *h) -{ - return test_bit(LU_OBJECT_INITED, &h->loh_flags); -} - void lu_object_put(const struct lu_env *env, struct lu_object *o); void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); void lu_object_unhash(const struct lu_env *env, struct lu_object *o); @@ -809,22 +844,6 @@ int lu_object_invariant(const struct lu_object *o); */ #define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) -/** - * Check whether the object as agent entry on current target - */ -#define lu_object_has_agent_entry(o) \ - unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY) - -static inline void lu_object_set_agent_entry(struct lu_object *o) -{ - o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY; -} - -static inline void lu_object_clear_agent_entry(struct lu_object *o) -{ - o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY; -} - static inline int lu_object_assert_exists(const struct lu_object *o) { return lu_object_exists(o); @@ -841,8 +860,7 @@ static inline int lu_object_assert_not_exists(const struct lu_object *o) static inline __u32 lu_object_attr(const struct lu_object *o) { LASSERT(lu_object_exists(o) != 0); - - return o->lo_header->loh_attr & S_IFMT; + return o->lo_header->loh_attr; } static inline void lu_object_ref_add(struct lu_object *o, @@ -889,9 +907,7 @@ struct lu_rdpg { enum lu_xattr_flags { LU_XATTR_REPLACE = (1 << 0), - LU_XATTR_CREATE = (1 << 1), - LU_XATTR_MERGE = (1 << 2), - LU_XATTR_SPLIT = (1 << 3), + LU_XATTR_CREATE = (1 << 1) }; /** @} helpers */ @@ -1113,20 +1129,20 @@ struct lu_context_key { }; #define LU_KEY_INIT(mod, type) \ - static void *mod##_key_init(const struct lu_context *ctx, \ - struct lu_context_key *key) \ - { \ - type *value; \ + static void* mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ \ CLASSERT(PAGE_SIZE >= sizeof(*value)); \ \ - OBD_ALLOC_PTR(value); \ - if (value == NULL) \ - value = ERR_PTR(-ENOMEM); \ - \ - return value; \ - } \ - struct __##mod##__dummy_init { ; } /* semicolon catcher */ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init {;} /* semicolon catcher */ #define LU_KEY_FINI(mod, type) \ static void mod##_key_fini(const struct lu_context *ctx, \ @@ -1262,37 +1278,6 @@ void lu_env_fini (struct lu_env *env); int lu_env_refill(struct lu_env *env); int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); -static inline void* lu_env_info(const struct lu_env *env, - const struct lu_context_key *key) -{ - void *info; - info = lu_context_key_get(&env->le_ctx, key); - if (!info) { - if (!lu_env_refill((struct lu_env *)env)) - info = lu_context_key_get(&env->le_ctx, key); - } - LASSERT(info); - return info; -} - -#ifdef HAVE_SERVER_SUPPORT -struct lu_env *lu_env_find(void); -int lu_env_add(struct lu_env *env); -void lu_env_remove(struct lu_env *env); -#else -static inline struct lu_env *lu_env_find(void) -{ - return NULL; -} -static inline int lu_env_add(struct lu_env *env) -{ - return 0; -} -static inline void lu_env_remove(struct lu_env *env) -{ -} -#endif /* HAVE_SERVER_SUPPORT */ - /** @} lu_context */ /** @@ -1309,26 +1294,6 @@ struct lu_name { int ln_namelen; }; -static inline bool name_is_dot_or_dotdot(const char *name, int namelen) -{ - return name[0] == '.' 
&& - (namelen == 1 || (namelen == 2 && name[1] == '.')); -} - -static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) -{ - return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); -} - -static inline bool lu_name_is_valid_len(const char *name, size_t name_len) -{ - return name != NULL && - name_len > 0 && - name_len < INT_MAX && - strlen(name) == name_len && - memchr(name, '/', name_len) == NULL; -} - /** * Validate names (path components) * @@ -1340,7 +1305,12 @@ static inline bool lu_name_is_valid_len(const char *name, size_t name_len) */ static inline bool lu_name_is_valid_2(const char *name, size_t name_len) { - return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + name[name_len] == '\0' && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; } static inline bool lu_name_is_valid(const struct lu_name *ln) diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h index 0810fbea8b55e..0d3ef968923ad 100644 --- a/drivers/staging/lustrefsx/lustre/include/lu_target.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -126,17 +126,14 @@ struct tg_grants_data { u64 tgd_tot_granted; /* grant used by I/Os in progress (between prepare and commit) */ u64 tgd_tot_pending; - /* amount of available space in percentage that is never used for - * grants, used on MDT to always keep space for metadata. */ - u64 tgd_reserved_pcnt; /* number of clients using grants */ int tgd_tot_granted_clients; /* shall we grant space to clients not * supporting OBD_CONNECT_GRANT_PARAM? */ - unsigned int tgd_grant_compat_disable:1; + int tgd_grant_compat_disable; /* protect all statfs-related counters */ spinlock_t tgd_osfs_lock; - time64_t tgd_osfs_age; + __u64 tgd_osfs_age; int tgd_blockbits; /* counters used during statfs update, protected by ofd_osfs_lock. 
* record when some statfs refresh are in progress */ @@ -204,18 +201,8 @@ struct lu_target { /* target grants fields */ struct tg_grants_data lut_tgd; - - /* target tunables */ - const struct attribute **lut_attrs; - - /* FMD (file modification data) values */ - int lut_fmd_max_num; - time64_t lut_fmd_max_age; }; -#define LUT_FMD_MAX_NUM_DEFAULT 128 -#define LUT_FMD_MAX_AGE_DEFAULT (obd_timeout + 10) - /* number of slots in reply bitmap */ #define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) #define LUT_REPLY_SLOTS_MAX_CHUNKS 16 @@ -369,7 +356,7 @@ struct tgt_handler { /* Flags in enum tgt_handler_flags */ __u32 th_flags; /* Request version for this opcode */ - enum lustre_msg_version th_version; + int th_version; /* Handler function */ int (*th_act)(struct tgt_session_info *tsi); /* Handler function for high priority requests */ @@ -422,6 +409,8 @@ int tgt_convert(struct tgt_session_info *tsi); int tgt_bl_callback(struct tgt_session_info *tsi); int tgt_cp_callback(struct tgt_session_info *tsi); int tgt_llog_open(struct tgt_session_info *tsi); +int tgt_llog_close(struct tgt_session_info *tsi); +int tgt_llog_destroy(struct tgt_session_info *tsi); int tgt_llog_read_header(struct tgt_session_info *tsi); int tgt_llog_next_block(struct tgt_session_info *tsi); int tgt_llog_prev_block(struct tgt_session_info *tsi); @@ -437,13 +426,15 @@ int tgt_sync(const struct lu_env *env, struct lu_target *tgt, int tgt_io_thread_init(struct ptlrpc_thread *thread); void tgt_io_thread_done(struct ptlrpc_thread *thread); -int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - struct lustre_handle *lh, int mode, __u64 *flags); -void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode); -int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, - struct ldlm_res_id *res_id, __u64 start, __u64 end, - struct lustre_handle *lh, int mode, __u64 *flags); +int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + __u64 start, __u64 end, struct lustre_handle *lh, + int mode, __u64 *flags); void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct obd_ioobj *obj, struct niobuf_remote *nb, + struct lustre_handle *lh, enum ldlm_mode mode); +void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode); int tgt_brw_read(struct tgt_session_info *tsi); int tgt_brw_write(struct tgt_session_info *tsi); int tgt_hpreq_handler(struct ptlrpc_request *req); @@ -503,8 +494,6 @@ int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, struct thandle *th, bool update_lrd_file); struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, __u64 xid); -int tgt_tunables_init(struct lu_target *lut); -void tgt_tunables_fini(struct lu_target *lut); /* target/tgt_grant.c */ static inline int exp_grant_param_supp(struct obd_export *exp) @@ -532,36 +521,8 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr); int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, - struct obd_statfs *osfs, time64_t max_age, + struct obd_statfs *osfs, __u64 max_age, int *from_cache); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) -ssize_t sync_lock_cancel_show(struct kobject *kobj, - struct attribute *attr, char *buf); -ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, - 
const char *buffer, size_t count); -#endif -ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, - char *buf); -ssize_t grant_compat_disable_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count); - -/* FMD */ -void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, - __u64 xid); -bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, - __u64 xid); -#ifdef DO_FMD_DROP -void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid); -#else -#define tgt_fmd_drop(exp, fid) do {} while (0) -#endif /* target/update_trans.c */ int distribute_txn_init(const struct lu_env *env, diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h index e5466c7886238..6f57a20a6a8ab 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -38,12 +38,36 @@ * Author: Andreas Dilger */ -/* - * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_fiemap.h - * directly instead of this file. This file will be removed from a - * future version of lustre! - */ +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#include +#include + +/* XXX: We use fiemap_extent::fe_reserved[0] */ +#define fe_device fe_reserved[0] + +static inline size_t fiemap_count_to_size(size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned fiemap_size_to_count(size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif -#include +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ -#warning "Including ll_fiemap.h is deprecated. Include linux/lustre/lustre_fiemap.h directly." +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h index f8489d55a3b44..e69bdc2795e56 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - * + * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. 
- * + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2016, Intel Corporation. * * lustre/include/lustre/lustre_barrier_user.h * @@ -28,13 +28,46 @@ * * Author: Fan, Yong */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H -/* - * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_barrier_user.h - * directly instead of this file. This file will be removed from a - * future version of lustre! - */ +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; -#include +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; -#warning "Including lustre_barrier_user.h is deprecated. Include linux/lustre/lustre_barrier_user.h directly." +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h similarity index 100% rename from drivers/staging/lustrefsx/lustre/include/lustre_errno.h rename to drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h similarity index 81% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h rename to drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h index fb26eaeceec28..f2c850c0f1848 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h @@ -23,12 +23,14 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * + * lustre/include/lustre/lustre_idl.h + * * Lustre wire protocol definitions. */ @@ -40,7 +42,7 @@ * that are used in interfaces with userspace should go in lustre_user.h. * * All structs being declared here should be built from simple fixed-size - * types defined in linux/types.h or be built from other types or + * types (__u8, __u16, __u32, __u64) or be built from other types or * structs also declared in this file. Similarly, all flags and magic * values in those structs should also be declared here. This ensures * that the Lustre wire protocol is not influenced by external dependencies. @@ -68,24 +70,11 @@ #define _LUSTRE_IDL_H_ #include -#include -#include #include -/* - * This is due to us being out of kernel and the way the OpenSFS branch - * handles CFLAGS. - */ -#ifdef __KERNEL__ -# include -#else -# include -#endif -#include -#include -#if defined(__cplusplus) -extern "C" { -#endif +#include +#include /* Defn's shared with user-space. 
*/ +#include /* * GENERAL STUFF @@ -97,25 +86,25 @@ extern "C" { #define CONNMGR_REQUEST_PORTAL 1 #define CONNMGR_REPLY_PORTAL 2 -/* #define OSC_REQUEST_PORTAL 3*/ +//#define OSC_REQUEST_PORTAL 3 #define OSC_REPLY_PORTAL 4 -/*#define OSC_BULK_PORTAL 5*/ +//#define OSC_BULK_PORTAL 5 #define OST_IO_PORTAL 6 #define OST_CREATE_PORTAL 7 #define OST_BULK_PORTAL 8 -/*#define MDC_REQUEST_PORTAL 9*/ +//#define MDC_REQUEST_PORTAL 9 #define MDC_REPLY_PORTAL 10 -/*#define MDC_BULK_PORTAL 11*/ +//#define MDC_BULK_PORTAL 11 #define MDS_REQUEST_PORTAL 12 -#define MDS_IO_PORTAL 13 +//#define MDS_REPLY_PORTAL 13 #define MDS_BULK_PORTAL 14 #define LDLM_CB_REQUEST_PORTAL 15 #define LDLM_CB_REPLY_PORTAL 16 #define LDLM_CANCEL_REQUEST_PORTAL 17 #define LDLM_CANCEL_REPLY_PORTAL 18 -/*#define PTLBD_REQUEST_PORTAL 19*/ -/*#define PTLBD_REPLY_PORTAL 20*/ -/*#define PTLBD_BULK_PORTAL 21*/ +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 #define MDS_SETATTR_PORTAL 22 #define MDS_READPAGE_PORTAL 23 #define OUT_PORTAL 24 @@ -128,8 +117,28 @@ extern "C" { #define SEQ_DATA_PORTAL 31 #define SEQ_CONTROLLER_PORTAL 32 #define MGS_BULK_PORTAL 33 -/* #define DVS_PORTAL 63 */ -/* reserved for Cray DVS - spitzcor@cray.com, roe@cray.com, n8851@cray.com */ + +/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ + +/* packet types */ +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + +/* DON'T use swabbed values of MAGIC as magic! */ +#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 +#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B + +#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 + +#define PTLRPC_MSG_VERSION 0x00000003 +#define LUSTRE_VERSION_MASK 0xffff0000 +#define LUSTRE_OBD_VERSION 0x00010000 +#define LUSTRE_MDS_VERSION 0x00020000 +#define LUSTRE_OST_VERSION 0x00030000 +#define LUSTRE_DLM_VERSION 0x00040000 +#define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 /** * Describes a range of sequence, lsr_start is included but lsr_end is @@ -169,14 +178,12 @@ extern void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, __u32 compat, __u32 incompat); -/* copytool can use any nonnegative integer to represent archive-Ids during - * register with MDT thru kuc. +/* copytool uses a 32b bitmask field to encode archive-Ids during register + * with MDT thru kuc. * archive num = 0 => all - * archive num from 1 to MAX_U32 + * archive num from 1 to 32 */ -#define LL_HSM_ORIGIN_MAX_ARCHIVE (sizeof(__u32) * 8) -/* the max count of archive ids that one agent can support */ -#define LL_HSM_MAX_ARCHIVES_PER_AGENT 1024 +#define LL_HSM_MAX_ARCHIVE (sizeof(__u32) * 8) /** * HSM on-disk attributes stored in a separate xattr. 
@@ -382,23 +389,6 @@ struct lu_orphan_ent_v2 { struct lu_orphan_rec_v2 loe_rec; }; -struct lu_orphan_rec_v3 { - struct lu_orphan_rec lor_rec; - struct ost_layout lor_layout; - /* The OST-object declared layout version in PFID EA.*/ - __u32 lor_layout_version; - /* The OST-object declared layout range (of version) in PFID EA.*/ - __u32 lor_range; - __u32 lor_padding_1; - __u64 lor_padding_2; -}; - -struct lu_orphan_ent_v3 { - /* The orphan OST-object's FID */ - struct lu_fid loe_key; - struct lu_orphan_rec_v3 loe_rec; -}; - /** @} lu_fid */ /** \defgroup lu_dir lu_dir @@ -524,21 +514,18 @@ static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) return next; } -static inline __kernel_size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +static inline size_t lu_dirent_calc_size(size_t namelen, __u16 attr) { - __kernel_size_t size; + size_t size; if (attr & LUDA_TYPE) { - const __kernel_size_t align = sizeof(struct luda_type) - 1; - - size = (sizeof(struct lu_dirent) + namelen + 1 + align) & - ~align; - size += sizeof(struct luda_type); - } else { - size = sizeof(struct lu_dirent) + namelen + 1; - } + const size_t align = sizeof(struct luda_type) - 1; + size = (sizeof(struct lu_dirent) + namelen + align) & ~align; + size += sizeof(struct luda_type); + } else + size = sizeof(struct lu_dirent) + namelen; - return (size + 7) & ~7; + return (size + 7) & ~7; } #define MDS_DIR_END_OFF 0xfffffffffffffffeULL @@ -582,109 +569,59 @@ static inline void lustre_handle_copy(struct lustre_handle *tgt, tgt->cookie = src->cookie; } -/* lustre_msg struct magic. DON'T use swabbed values of MAGIC as magic! */ -enum lustre_msg_magic { - LUSTRE_MSG_MAGIC_V2 = 0x0BD00BD3, - LUSTRE_MSG_MAGIC_V2_SWABBED = 0xD30BD00B, - LUSTRE_MSG_MAGIC = LUSTRE_MSG_MAGIC_V2 +struct lustre_handle_array { + unsigned int count; + struct lustre_handle handles[0]; }; /* flags for lm_flags */ -enum lustre_msghdr { - MSGHDR_AT_SUPPORT = 0x1, /* adaptive timeouts, lm_cksum valid - * in early reply messages */ - MSGHDR_CKSUM_INCOMPAT18 = 0x2, /* compat for 1.8, needs to be set well - * beyond 2.8.0 for compatibility */ -}; +#define MSGHDR_AT_SUPPORT 0x1 +#define MSGHDR_CKSUM_INCOMPAT18 0x2 #define lustre_msg lustre_msg_v2 /* we depend on this structure to be 8-byte aligned */ /* this type is only endian-adjusted in lustre_unpack_msg() */ struct lustre_msg_v2 { - __u32 lm_bufcount; /* number of buffers in lm_buflens[] */ - __u32 lm_secflvr; /* 0 = no crypto, or sptlrpc security flavour */ - __u32 lm_magic; /* RPC version magic = LUSTRE_MSG_MAGIC_V2 */ - __u32 lm_repsize; /* size of preallocated reply buffer */ - __u32 lm_cksum; /* CRC32 of ptlrpc_body early reply messages */ - __u32 lm_flags; /* enum lustre_msghdr MSGHDR_* flags */ - __u32 lm_padding_2; /* unused */ - __u32 lm_padding_3; /* unused */ - __u32 lm_buflens[0]; /* length of additional buffers in bytes, - * padded to a multiple of 8 bytes. */ - /* - * message buffers are packed after padded lm_buflens[] array, - * padded to a multiple of 8 bytes each to align contents. 
- */ -}; - -/* ptlrpc_body packet pb_types */ -#define PTL_RPC_MSG_REQUEST 4711 /* normal RPC request message */ -#define PTL_RPC_MSG_ERR 4712 /* error reply if request unprocessed */ -#define PTL_RPC_MSG_REPLY 4713 /* normal RPC reply message */ - -/* ptlrpc_body pb_version ((target_version << 16) | rpc_version) */ -enum lustre_msg_version { - PTLRPC_MSG_VERSION = 0x00000003, - LUSTRE_VERSION_MASK = 0xffff0000, - LUSTRE_OBD_VERSION = 0x00010000, - LUSTRE_MDS_VERSION = 0x00020000, - LUSTRE_OST_VERSION = 0x00030000, - LUSTRE_DLM_VERSION = 0x00040000, - LUSTRE_LOG_VERSION = 0x00050000, - LUSTRE_MGS_VERSION = 0x00060000, -}; - -/* pb_flags that apply to all request messages */ -/* #define MSG_LAST_REPLAY 0x0001 obsolete 2.0 => {REQ,LOCK}_REPLAY_DONE */ -#define MSG_RESENT 0x0002 /* was previously sent, no reply seen */ -#define MSG_REPLAY 0x0004 /* was processed, got reply, recovery */ -/* #define MSG_AT_SUPPORT 0x0008 obsolete since 1.5, AT always enabled */ -/* #define MSG_DELAY_REPLAY 0x0010 obsolete since 2.0 */ -/* #define MSG_VERSION_REPLAY 0x0020 obsolete since 1.8.2, VBR always on */ -#define MSG_REQ_REPLAY_DONE 0x0040 /* request replay over, locks next */ -#define MSG_LOCK_REPLAY_DONE 0x0080 /* lock replay over, client done */ - -/* pb_op_flags for connect opcodes: MDS_CONNECT, OST_CONNECT, MGS_CONNECT */ -#define MSG_CONNECT_RECOVERING 0x00000001 /* target is in recovery */ -#define MSG_CONNECT_RECONNECT 0x00000002 /* tgt already has client import */ -#define MSG_CONNECT_REPLAYABLE 0x00000004 /* target supports RPC replay */ -/* #define MSG_CONNECT_PEER 0x00000008 obsolete since 1.2, removed in 1.5 */ -#define MSG_CONNECT_LIBCLIENT 0x00000010 /* obsolete since 2.3, removed 2.6 */ -#define MSG_CONNECT_INITIAL 0x00000020 /* first client connection attempt */ -/* #define MSG_CONNECT_ASYNC 0x00000040 obsolete since 1.5 */ -#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ -#define MSG_CONNECT_TRANSNO 0x00000100 /* client sent transno in replay */ - -/* number of previous object versions in pb_pre_versions[] */ -#define PTLRPC_NUM_VERSIONS 4 + __u32 lm_bufcount; + __u32 lm_secflvr; + __u32 lm_magic; + __u32 lm_repsize; + __u32 lm_cksum; + __u32 lm_flags; + __u32 lm_padding_2; + __u32 lm_padding_3; + __u32 lm_buflens[0]; +}; + /* without gss, ptlrpc_body is put at the first buffer. */ +#define PTLRPC_NUM_VERSIONS 4 struct ptlrpc_body_v3 { struct lustre_handle pb_handle; - __u32 pb_type; /* request/reply/err type: PTL_RPC_MSG_* */ - __u32 pb_version; /* LUSTRE_*_VERSION | PTLRPC_MSG_VERSION */ - __u32 pb_opc; /* RPC opcodes: MDS_*, OST_*, LDLM_, ... 
*/ - __u32 pb_status; /* negative Linux x86 error number */ - __u64 pb_last_xid; /* highest replied XID w/o lower unreplied XID*/ - __u16 pb_tag; /* multiple modifying RPCs virtual slot index */ + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ + __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ __u16 pb_padding0; __u32 pb_padding1; - __u64 pb_last_committed;/* rep: highest pb_transno committed to disk */ - __u64 pb_transno; /* server-assigned transno for modifying RPCs */ - __u32 pb_flags; /* req: MSG_* flags */ - __u32 pb_op_flags; /* req: MSG_CONNECT_* flags */ - __u32 pb_conn_cnt; /* connect instance of this client on server */ - __u32 pb_timeout; /* req: max wait time; rep: service estimate */ - __u32 pb_service_time; /* rep: server arrival to reply in seconds */ - __u32 pb_limit; /* rep: dynamic DLM LRU lock count limit */ - __u64 pb_slv; /* rep: dynamic DLM LRU server lock volume */ - /* VBR: rep: previous pb_version(s) of objects modified by this RPC */ + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; __u64 pb_mbits; /**< match bits for bulk request */ - /* padding for future needs - fix lustre_swab_ptlrpc_body() also */ + /* padding for future needs */ __u64 pb_padding64_0; __u64 pb_padding64_1; __u64 pb_padding64_2; - char pb_jobid[LUSTRE_JOBID_SIZE]; /* req: ASCII jobid from env + NUL */ + char pb_jobid[LUSTRE_JOBID_SIZE]; }; #define ptlrpc_body ptlrpc_body_v3 @@ -740,6 +677,38 @@ struct ptlrpc_body_v2 { /** only use in req->rq_{req,rep}_swab_mask */ #define MSG_PTLRPC_HEADER_OFF 31 +/* Flags that are operation-specific go in the top 16 bits. */ +#define MSG_OP_FLAG_MASK 0xffff0000 +#define MSG_OP_FLAG_SHIFT 16 + +/* Flags that apply to all requests are in the bottom 16 bits */ +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. 
*/ +#define MSG_DELAY_REPLAY 0x0010 +#define MSG_VERSION_REPLAY 0x0020 +#define MSG_REQ_REPLAY_DONE 0x0040 +#define MSG_LOCK_REPLAY_DONE 0x0080 + +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 +//#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ + /* Connect flags */ #define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ #define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ @@ -814,27 +783,14 @@ struct ptlrpc_body_v2 { RPCs in parallel */ #define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ #define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ -#define OBD_CONNECT_LOCKAHEAD_OLD 0x1000000000000000ULL /* Old Cray lockahead */ - +#define OBD_CONNECT_LOCK_AHEAD 0x1000000000000000ULL /* lock ahead */ /** bulk matchbits is sent within ptlrpc_body */ #define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL #define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ #define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ /* ocd_connect_flags2 flags */ -#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ -#define OBD_CONNECT2_LOCKAHEAD 0x2ULL /* ladvise lockahead v2 */ -#define OBD_CONNECT2_DIR_MIGRATE 0x4ULL /* migrate striped dir */ -#define OBD_CONNECT2_SUM_STATFS 0x8ULL /* MDT return aggregated stats */ -#define OBD_CONNECT2_FLR 0x20ULL /* FLR support */ -#define OBD_CONNECT2_WBC_INTENTS 0x40ULL /* create/unlink/... intents for wbc, also operations under client-held parent locks */ -#define OBD_CONNECT2_LOCK_CONVERT 0x80ULL /* IBITS lock convert support */ -#define OBD_CONNECT2_ARCHIVE_ID_ARRAY 0x100ULL /* store HSM archive_id in array */ -#define OBD_CONNECT2_SELINUX_POLICY 0x400ULL /* has client SELinux policy */ -#define OBD_CONNECT2_LSOM 0x800ULL /* LSOM support */ -#define OBD_CONNECT2_ASYNC_DISCARD 0x4000ULL /* support async DoM data discard */ -#define OBD_CONNECT2_ENCRYPT 0x8000ULL /* client-to-disk encrypt */ -#define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ -#define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ + /* XXX README XXX: * Please DO NOT add flag values here before first ensuring that this same * flag value is not in use on some other branch. 
Please clear any such @@ -876,23 +832,13 @@ struct ptlrpc_body_v2 { OBD_CONNECT_FLOCK_DEAD | \ OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ OBD_CONNECT_OPEN_BY_FID | \ - OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \ - OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \ - OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_DIR_STRIPE | \ + OBD_CONNECT_BULK_MBITS | \ OBD_CONNECT_MULTIMODRPCS | \ OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \ - OBD_CONNECT_GRANT_PARAM | \ - OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) - -#define MDT_CONNECT_SUPPORTED2 (OBD_CONNECT2_FILE_SECCTX | OBD_CONNECT2_FLR | \ - OBD_CONNECT2_SUM_STATFS | \ - OBD_CONNECT2_LOCK_CONVERT | \ - OBD_CONNECT2_DIR_MIGRATE | \ - OBD_CONNECT2_ARCHIVE_ID_ARRAY | \ - OBD_CONNECT2_SELINUX_POLICY | \ - OBD_CONNECT2_LSOM | \ - OBD_CONNECT2_ASYNC_DISCARD | \ - OBD_CONNECT2_GETATTR_PFID) + OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ @@ -910,12 +856,10 @@ struct ptlrpc_body_v2 { OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ OBD_CONNECT_BULK_MBITS | \ - OBD_CONNECT_GRANT_PARAM | \ - OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + OBD_CONNECT_GRANT_PARAM) +#define OST_CONNECT_SUPPORTED2 0 -#define OST_CONNECT_SUPPORTED2 OBD_CONNECT2_LOCKAHEAD - -#define ECHO_CONNECT_SUPPORTED (OBD_CONNECT_FID) +#define ECHO_CONNECT_SUPPORTED 0 #define ECHO_CONNECT_SUPPORTED2 0 #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ @@ -927,7 +871,6 @@ struct ptlrpc_body_v2 { /* Features required for this version of the client to work with server */ #define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ - OBD_CONNECT_ATTRFID | \ OBD_CONNECT_FULL20) /* This structure is used for both request and reply. @@ -984,43 +927,21 @@ struct obd_connect_data { /* * Supported checksum algorithms. Up to 32 checksum types are supported. * (32-bit mask stored in obd_connect_data::ocd_cksum_types) - * Please update DECLARE_CKSUM_NAME in obd_cksum.h when adding a new - * algorithm and also the OBD_FL_CKSUM* flags, OBD_CKSUM_ALL flag, - * OBD_FL_CKSUM_ALL flag and potentially OBD_CKSUM_T10_ALL flag. - */ -enum cksum_types { - OBD_CKSUM_CRC32 = 0x00000001, - OBD_CKSUM_ADLER = 0x00000002, - OBD_CKSUM_CRC32C = 0x00000004, - OBD_CKSUM_RESERVED = 0x00000008, - OBD_CKSUM_T10IP512 = 0x00000010, - OBD_CKSUM_T10IP4K = 0x00000020, - OBD_CKSUM_T10CRC512 = 0x00000040, - OBD_CKSUM_T10CRC4K = 0x00000080, -}; - -#define OBD_CKSUM_T10_ALL (OBD_CKSUM_T10IP512 | OBD_CKSUM_T10IP4K | \ - OBD_CKSUM_T10CRC512 | OBD_CKSUM_T10CRC4K) - -#define OBD_CKSUM_ALL (OBD_CKSUM_CRC32 | OBD_CKSUM_ADLER | OBD_CKSUM_CRC32C | \ - OBD_CKSUM_T10_ALL) - -/* - * The default checksum algorithm used on top of T10PI GRD tags for RPC. - * Considering that the checksum-of-checksums is only computing CRC32 on a - * 4KB chunk of GRD tags for a 1MB RPC for 512B sectors, or 16KB of GRD - * tags for 16MB of 4KB sectors, this is only 1/256 or 1/1024 of the - * total data being checksummed, so the checksum type used here should not - * affect overall system performance noticeably. + * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags. 
*/ -#define OBD_CKSUM_T10_TOP OBD_CKSUM_ADLER +typedef enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C= 0x00000004, +} cksum_type_t; /* * OST requests: OBDO & OBD request records */ /* opcodes */ -enum ost_cmd { +typedef enum { OST_REPLY = 0, /* reply ? */ OST_GETATTR = 1, OST_SETATTR = 2, @@ -1041,10 +962,8 @@ enum ost_cmd { OST_QUOTACTL = 19, OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ OST_LADVISE = 21, - OST_LAST_OPC, /* must be < 33 to avoid MDS_GETATTR */ - OST_FALLOCATE = 22, - OST_SEEK = 23, -}; + OST_LAST_OPC /* must be < 33 to avoid MDS_GETATTR */ +} ost_cmd_t; #define OST_FIRST_OPC OST_REPLY enum obdo_flags { @@ -1061,16 +980,13 @@ enum obdo_flags { OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ - OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ - OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ - OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ - OBD_FL_CKSUM_T10IP512 = 0x00005000, /* T10PI IP cksum, 512B sector */ - OBD_FL_CKSUM_T10IP4K = 0x00006000, /* T10PI IP cksum, 4KB sector */ - OBD_FL_CKSUM_T10CRC512 = 0x00007000, /* T10PI CRC cksum, 512B sector */ - OBD_FL_CKSUM_T10CRC4K = 0x00008000, /* T10PI CRC cksum, 4KB sector */ - OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ - OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ - OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. * XXX: obsoleted - reserved for old * clients prior than 2.2 */ OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ @@ -1079,18 +995,10 @@ enum obdo_flags { OBD_FL_SHORT_IO = 0x00400000, /* short io request */ /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ - /* - * Note that while the original checksum values were separate bits, - * in 2.x we can actually allow all values from 1-31. T10-PI checksum - * types already use values which are not separate bits. - */ + /* Note that while these checksum values are currently separate bits, + * in 2.x we can actually allow all values from 1-31 if we wanted. */ OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | - OBD_FL_CKSUM_CRC32C | OBD_FL_CKSUM_T10IP512 | - OBD_FL_CKSUM_T10IP4K | OBD_FL_CKSUM_T10CRC512 | - OBD_FL_CKSUM_T10CRC4K, - - OBD_FL_NO_QUOTA_ALL = OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA | - OBD_FL_NO_PRJQUOTA, + OBD_FL_CKSUM_CRC32C, }; /* @@ -1127,10 +1035,10 @@ enum obdo_flags { * those *_DEF magics are only used on server side internally, they * won't be put on wire or disk. 
*/ -#define LOV_MAGIC_DEFINED 0x10000000 -#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1) -#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3) -#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) +#define LOV_MAGIC_DEF 0x10000000 +#define LOV_MAGIC_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V1) +#define LOV_MAGIC_V3_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V3) +#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) #define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) #define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) @@ -1173,7 +1081,6 @@ struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ #define XATTR_TRUSTED_PREFIX "trusted." #define XATTR_SECURITY_PREFIX "security." -#define XATTR_NAME_SOM "trusted.som" #define XATTR_NAME_LOV "trusted.lov" #define XATTR_NAME_LMA "trusted.lma" #define XATTR_NAME_LMV "trusted.lmv" @@ -1215,7 +1122,7 @@ static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) } static inline __u32 -lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) +lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) { switch (lmm_magic) { case LOV_MAGIC_V1: { @@ -1251,21 +1158,20 @@ lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLUID (0x00000200ULL) /* user ID */ #define OBD_MD_FLGID (0x00000400ULL) /* group ID */ #define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ -#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */ #define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ -#define OBD_MD_FLPARENT (0x00004000ULL) /* parent FID */ -#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* OST object layout version */ +#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ +/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. 
used until 1.6.5 */ #define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ #define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ #define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ #define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ #define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ -/* OBD_MD_FLQOS (0x00200000ULL) has never been used */ -/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ #define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ #define OBD_MD_FLGROUP (0x01000000ULL) /* group */ #define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ -/* OBD_MD_FLEPOCH (0x04000000ULL) obsolete 2.7.50 */ +#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ /* ->mds if epoch opens or closes */ #define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ #define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ @@ -1274,7 +1180,7 @@ lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ #define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ -/* OBD_MD_REINT (0x0000000200000000ULL) obsolete 1.8 */ +#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ #define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ #define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ @@ -1282,10 +1188,10 @@ lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ #define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ #define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ -#define OBD_MD_FLAGSTATFS (0x0000010000000000ULL) /* aggregated statfs */ -/* OBD_MD_FLMDSCAPA (0x0000020000000000ULL) obsolete 2.7.54 */ -/* OBD_MD_FLOSSCAPA (0x0000040000000000ULL) obsolete 2.7.54 */ -/* OBD_MD_FLCKSPLIT (0x0000080000000000ULL) obsolete 2.3.58*/ +/* OBD_MD_FLRMTPERM (0x0000010000000000ULL) remote perm, obsolete */ +#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ #define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ #define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes * under lock; for xattr @@ -1300,10 +1206,6 @@ lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) #define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ #define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ #define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ -#define OBD_MD_SECCTX (0x0200000000000000ULL) /* embed security xattr */ - -#define OBD_MD_FLLAZYSIZE (0x0400000000000000ULL) /* Lazy size */ -#define OBD_MD_FLLAZYBLOCKS (0x0800000000000000ULL) /* Lazy blocks */ #define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ OBD_MD_FLGRPQUOTA | \ @@ -1313,7 +1215,7 @@ lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ - OBD_MD_FLPARENT | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ 
OBD_MD_FLPROJID) #define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) @@ -1339,9 +1241,6 @@ struct hsm_state_set { #define OBD_BRW_READ 0x01 #define OBD_BRW_WRITE 0x02 #define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) -#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for - * this page. Non-delay RPCs have bit - * rq_no_delay set. */ #define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous * transfer and is not accounted in * the grant. */ @@ -1596,11 +1495,11 @@ struct lquota_lvb { #define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ /* op codes */ -enum quota_cmd { +typedef enum { QUOTA_DQACQ = 601, QUOTA_DQREL = 602, QUOTA_LAST_OPC -}; +} quota_cmd_t; #define QUOTA_FIRST_OPC QUOTA_DQACQ /* @@ -1608,7 +1507,7 @@ enum quota_cmd { */ /* opcodes */ -enum mds_cmd { +typedef enum { MDS_GETATTR = 33, MDS_GETATTR_NAME = 34, MDS_CLOSE = 35, @@ -1638,18 +1537,17 @@ enum mds_cmd { MDS_HSM_CT_REGISTER = 59, MDS_HSM_CT_UNREGISTER = 60, MDS_SWAP_LAYOUTS = 61, - MDS_RMFID = 62, MDS_LAST_OPC -}; +} mds_cmd_t; #define MDS_FIRST_OPC MDS_GETATTR /* opcodes for object update */ -enum update_cmd { +typedef enum { OUT_UPDATE = 1000, OUT_UPDATE_LAST_OPC -}; +} update_cmd_t; #define OUT_UPDATE_FIRST_OPC OUT_UPDATE @@ -1657,7 +1555,7 @@ enum update_cmd { * Do not exceed 63 */ -enum mds_reint_op { +typedef enum { REINT_SETATTR = 1, REINT_CREATE = 2, REINT_LINK = 3, @@ -1667,9 +1565,8 @@ enum mds_reint_op { REINT_SETXATTR = 7, REINT_RMENTRY = 8, REINT_MIGRATE = 9, - REINT_RESYNC = 10, - REINT_MAX -}; + REINT_MAX +} mds_reint_t, mdt_reint_t; /* the disposition of the intent outlines what was executed */ #define DISP_IT_EXECD 0x00000001 @@ -1687,33 +1584,28 @@ enum mds_reint_op { #define DISP_OPEN_DENY 0x10000000 /* INODE LOCK PARTS */ -enum mds_ibits_locks { - MDS_INODELOCK_LOOKUP = 0x000001, /* For namespace, dentry etc. Was - * used to protect permission (mode, - * owner, group, etc) before 2.4. */ - MDS_INODELOCK_UPDATE = 0x000002, /* size, links, timestamps */ - MDS_INODELOCK_OPEN = 0x000004, /* For opened files */ - MDS_INODELOCK_LAYOUT = 0x000008, /* for layout */ - - /* The PERM bit is added in 2.4, and is used to protect permission - * (mode, owner, group, ACL, etc.) separate from LOOKUP lock. - * For remote directories (in DNE) these locks will be granted by - * different MDTs (different LDLM namespace). - * - * For local directory, the MDT always grants UPDATE|PERM together. - * For remote directory, master MDT (where remote directory is) grants - * UPDATE|PERM, and remote MDT (where name entry is) grants LOOKUP_LOCK. - */ - MDS_INODELOCK_PERM = 0x000010, - MDS_INODELOCK_XATTR = 0x000020, /* non-permission extended attrs */ - MDS_INODELOCK_DOM = 0x000040, /* Data for Data-on-MDT files */ - /* Do not forget to increase MDS_INODELOCK_NUMBITS when adding bits */ -}; -#define MDS_INODELOCK_NUMBITS 7 +#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also + * was used to protect permission (mode, + * owner, group etc) before 2.4. */ +#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ +#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ +#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ + +/* The PERM bit is added int 2.4, and it is used to protect permission(mode, + * owner, group, acl etc), so to separate the permission from LOOKUP lock. + * Because for remote directories(in DNE), these locks will be granted by + * different MDTs(different ldlm namespace). 
+ * + * For local directory, MDT will always grant UPDATE_LOCK|PERM_LOCK together. + * For Remote directory, the master MDT, where the remote directory is, will + * grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where the name entry is, + * will grant LOOKUP_LOCK. */ +#define MDS_INODELOCK_PERM 0x000010 +#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */ + +#define MDS_INODELOCK_MAXSHIFT 5 /* This FULL lock is useful to take on unlink sort of operations */ -#define MDS_INODELOCK_FULL ((1 << MDS_INODELOCK_NUMBITS) - 1) -/* DOM lock shouldn't be canceled early, use this macro for ELC */ -#define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM) +#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) /* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], * but was moved into name[1] along with the OID to avoid consuming the @@ -1733,17 +1625,17 @@ enum { enum { /* these should be identical to their EXT4_*_FL counterparts, they are * redefined here only to avoid dragging in fs/ext4/ext4.h */ - LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ - LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ - LUSTRE_APPEND_FL = 0x00000020, /* file writes may only append */ - LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ - LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ - LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ - LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ - LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ - LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ - LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ - LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ + LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ + LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ + LUSTRE_APPEND_FL = 0x00000020, /* writes to file may only append */ + LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ + LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ + LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ + LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ + LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ + LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ + LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ + LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ /* These flags will not be identical to any EXT4_*_FL counterparts, * and only reserved for lustre purpose. Note: these flags might @@ -1752,26 +1644,45 @@ enum { * wired by la_flags see osd_attr_get(). * 2. If these flags needs to be stored into inode, they will be * stored in LMA. see LMAI_XXXX */ - LUSTRE_ORPHAN_FL = 0x00002000, - LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ + LUSTRE_ORPHAN_FL = 0x00002000, - LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, + LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, }; -#ifndef FS_XFLAG_SYNC -#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#endif -#ifndef FS_XFLAG_NOATIME -#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ #endif -#ifndef FS_XFLAG_IMMUTABLE -#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ + +#ifdef __KERNEL__ +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. 
The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | #endif -#ifndef FS_XFLAG_APPEND -#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0) | + ((flags & LUSTRE_PROJINHERIT_FL) ? FS_XFLAG_PROJINHERIT : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | #endif -#ifndef FS_XFLAG_PROJINHERIT -#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0) | + ((iflags & FS_XFLAG_PROJINHERIT) ? LUSTRE_PROJINHERIT_FL : 0)); +} #endif /* 64 possible states */ @@ -1782,14 +1693,14 @@ enum md_transient_state { struct mdt_body { struct lu_fid mbo_fid1; struct lu_fid mbo_fid2; - struct lustre_handle mbo_open_handle; + struct lustre_handle mbo_handle; __u64 mbo_valid; __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ __s64 mbo_mtime; __s64 mbo_atime; __s64 mbo_ctime; __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ - __u64 mbo_version; /* was mbo_ioepoch before 2.11 */ + __u64 mbo_ioepoch; __u64 mbo_t_state; /* transient file state defined in * enum md_transient_state * was "ino" until 2.4.0 */ @@ -1802,7 +1713,7 @@ struct mdt_body { __u32 mbo_flags; /* LUSTRE_*_FL file attributes */ __u32 mbo_rdev; __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 mbo_layout_gen; /* was "generation" until 2.4.0 */ + __u32 mbo_unused2; /* was "generation" until 2.4.0 */ __u32 mbo_suppgid; __u32 mbo_eadatasize; __u32 mbo_aclsize; @@ -1811,15 +1722,15 @@ struct mdt_body { __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ __u32 mbo_projid; - __u64 mbo_dom_size; /* size of DOM component */ - __u64 mbo_dom_blocks; /* blocks consumed by DOM component */ - __u64 mbo_padding_8; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_6; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_7; + __u64 mbo_padding_8; __u64 mbo_padding_9; __u64 mbo_padding_10; }; /* 216 */ struct mdt_ioepoch { - struct lustre_handle mio_open_handle; + struct lustre_handle mio_handle; __u64 mio_unused1; /* was ioepoch */ __u32 mio_unused2; /* was flags */ __u32 mio_padding; @@ -1883,72 +1794,103 @@ struct mdt_rec_setattr { #define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ #define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ #define MDS_ATTR_PROJID 0x10000ULL /* = 65536 */ -#define MDS_ATTR_LSIZE 0x20000ULL /* = 131072 */ -#define MDS_ATTR_LBLOCKS 0x40000ULL /* = 262144 */ -#define MDS_ATTR_OVERRIDE 0x2000000ULL /* = 33554432 */ + +#ifndef FMODE_READ +#define FMODE_READ 00000001 +#define FMODE_WRITE 00000002 +#endif + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ +/* MDS_FMODE_TRUNC 
02000000 obsolete since 2.8.0 */ +/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ + +#define MDS_OPEN_CREATED 00000010 +#define MDS_OPEN_CROSS 00000020 + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. + */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE) enum mds_op_bias { -/* MDS_CHECK_SPLIT = 1 << 0, obsolete before 2.3.58 */ - /* used for remote object getattr/open by name: in the original - * getattr/open request, MDT found the object against name is on another - * MDT, then packed FID and LOOKUP lock in reply and returned -EREMOTE, - * and client knew it's a remote object, then set this flag in - * getattr/open request and sent to the corresponding MDT to finish - * getattr/open, which fetched attributes and UPDATE lock/opened file. 
- */ + MDS_CHECK_SPLIT = 1 << 0, MDS_CROSS_REF = 1 << 1, -/* MDS_VTX_BYPASS = 1 << 2, obsolete since 2.3.54 */ + MDS_VTX_BYPASS = 1 << 2, MDS_PERM_BYPASS = 1 << 3, /* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ MDS_QUOTA_IGNORE = 1 << 5, -/* MDS_CLOSE_CLEANUP = 1 << 6, obsolete since 2.3.51 */ + /* Was MDS_CLOSE_CLEANUP (1 << 6), No more used */ MDS_KEEP_ORPHAN = 1 << 7, MDS_RECOV_OPEN = 1 << 8, MDS_DATA_MODIFIED = 1 << 9, MDS_CREATE_VOLATILE = 1 << 10, MDS_OWNEROVERRIDE = 1 << 11, MDS_HSM_RELEASE = 1 << 12, - MDS_CLOSE_MIGRATE = 1 << 13, + MDS_RENAME_MIGRATE = 1 << 13, MDS_CLOSE_LAYOUT_SWAP = 1 << 14, - MDS_CLOSE_LAYOUT_MERGE = 1 << 15, - MDS_CLOSE_RESYNC_DONE = 1 << 16, - MDS_CLOSE_LAYOUT_SPLIT = 1 << 17, - MDS_TRUNC_KEEP_LEASE = 1 << 18, - MDS_CLOSE_UPDATE_TIMES = 1 << 20, }; -#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ - MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_LAYOUT_SPLIT | \ - MDS_CLOSE_RESYNC_DONE) - /* instance of mdt_reint_rec */ struct mdt_rec_create { - __u32 cr_opcode; - __u32 cr_cap; - __u32 cr_fsuid; - __u32 cr_fsuid_h; - __u32 cr_fsgid; - __u32 cr_fsgid_h; - __u32 cr_suppgid1; - __u32 cr_suppgid1_h; - __u32 cr_suppgid2; - __u32 cr_suppgid2_h; - struct lu_fid cr_fid1; - struct lu_fid cr_fid2; - struct lustre_handle cr_open_handle_old; /* in case of open replay */ + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_old_handle; /* handle in case of open replay */ __s64 cr_time; - __u64 cr_rdev; - __u64 cr_ioepoch; - __u64 cr_padding_1; /* rr_blocks */ - __u32 cr_mode; - __u32 cr_bias; - /* use of helpers set/get_mrc_cr_flags() is needed to access - * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to - * extend cr_flags size without breaking 1.8 compat */ - __u32 cr_flags_l; /* for use with open, low 32 bits */ - __u32 cr_flags_h; /* for use with open, high 32 bits */ - __u32 cr_umask; /* umask for create */ - __u32 cr_padding_4; /* rr_padding_4 */ + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ }; /* instance of mdt_reint_rec */ @@ -2061,35 +2003,6 @@ struct mdt_rec_setxattr { __u32 sx_padding_11; /* rr_padding_4 */ }; -/* instance of mdt_reint_rec - * FLR: for file resync MDS_REINT_RESYNC RPC. 
*/ -struct mdt_rec_resync { - __u32 rs_opcode; - __u32 rs_cap; - __u32 rs_fsuid; - __u32 rs_fsuid_h; - __u32 rs_fsgid; - __u32 rs_fsgid_h; - __u32 rs_suppgid1; - __u32 rs_suppgid1_h; - __u32 rs_suppgid2; - __u32 rs_suppgid2_h; - struct lu_fid rs_fid; - __u8 rs_padding0[sizeof(struct lu_fid)]; - struct lustre_handle rs_lease_handle; /* rr_mtime */ - __s64 rs_padding1; /* rr_atime */ - __s64 rs_padding2; /* rr_ctime */ - __u64 rs_padding3; /* rr_size */ - __u64 rs_padding4; /* rr_blocks */ - __u32 rs_bias; - __u32 rs_padding5; /* rr_mode */ - __u32 rs_padding6; /* rr_flags */ - __u32 rs_padding7; /* rr_flags_h */ - __u32 rs_padding8; /* rr_umask */ - __u16 rs_mirror_id; - __u16 rs_padding9; /* rr_padding_4 */ -}; - /* * mdt_rec_reint is the template for all mdt_reint_xxx structures. * Do NOT change the size of various members, otherwise the value @@ -2121,8 +2034,7 @@ struct mdt_rec_reint { __u32 rr_flags; __u32 rr_flags_h; __u32 rr_umask; - __u16 rr_mirror_id; - __u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ + __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ }; /* lmv structures */ @@ -2153,16 +2065,9 @@ struct lmv_mds_md_v1 { * used for now. Higher 16 bits will * be used to mark the object status, * for example migrating or dead. */ - __u32 lmv_layout_version; /* increased each time layout changed, - * by directory migration, restripe - * and LFSCK. */ - __u32 lmv_migrate_offset; /* once this is set, it means this - * directory is been migrated, stripes - * before this offset belong to target, - * from this to source. */ - __u32 lmv_migrate_hash; /* hash type of source stripes of - * migrating directory */ - __u32 lmv_padding2; + __u32 lmv_layout_version; /* Used for directory restriping */ + __u32 lmv_padding1; + __u64 lmv_padding2; __u64 lmv_padding3; char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ @@ -2182,7 +2087,7 @@ struct lmv_mds_md_v1 { #define LMV_HASH_FLAG_MIGRATION 0x80000000 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 53, 0) +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 53, 0) /* Since lustre 2.8, this flag will not be needed, instead this DEAD * and orphan flags will be stored in LMA (see LMAI_ORPHAN) * Keep this flag just for LFSCK, because it still might meet such @@ -2210,11 +2115,11 @@ struct lmv_mds_md_v1 { **/ #define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL #define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL -static inline __u64 lustre_hash_fnv_1a_64(const void *buf, __kernel_size_t size) +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, size_t size) { __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; const unsigned char *p = buf; - __kernel_size_t i; + size_t i; for (i = 0; i < size; i++) { hash ^= p[i]; @@ -2230,22 +2135,18 @@ union lmv_mds_md { struct lmv_user_md lmv_user_md; }; -static inline __kernel_ssize_t lmv_mds_md_size(int stripe_count, - unsigned int lmm_magic) +static inline int lmv_mds_md_size(int stripe_count, unsigned int lmm_magic) { - __kernel_ssize_t len = -EINVAL; - switch (lmm_magic) { - case LMV_MAGIC_V1: { + case LMV_MAGIC_V1:{ struct lmv_mds_md_v1 *lmm1; - len = sizeof(*lmm1); - len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); - break; } + return sizeof(*lmm1) + stripe_count * + sizeof(lmm1->lmv_stripe_fids[0]); + } default: - break; + return -EINVAL; } - return len; } static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) @@ -2297,12 +2198,12 @@ enum fld_op { }; /* LFSCK opcodes */ -enum lfsck_cmd { 
+typedef enum { LFSCK_NOTIFY = 1101, LFSCK_QUERY = 1102, LFSCK_LAST_OPC, - LFSCK_FIRST_OPC = LFSCK_NOTIFY -}; + LFSCK_FIRST_OPC = LFSCK_NOTIFY +} lfsck_cmd_t; /* * LOV data structures @@ -2338,7 +2239,7 @@ struct lov_desc { * LDLM requests: */ /* opcodes -- MUST be distinct from OST/MDS opcodes */ -enum ldlm_cmd { +typedef enum { LDLM_ENQUEUE = 101, LDLM_CONVERT = 102, LDLM_CANCEL = 103, @@ -2347,7 +2248,7 @@ enum ldlm_cmd { LDLM_GL_CALLBACK = 106, LDLM_SET_INFO = 107, LDLM_LAST_OPC -}; +} ldlm_cmd_t; #define LDLM_FIRST_OPC LDLM_ENQUEUE #define RES_NAME_SIZE 4 @@ -2362,7 +2263,7 @@ struct ldlm_res_id { (unsigned long long)(res)->lr_name.name[3] /* lock types */ -enum ldlm_mode { +typedef enum ldlm_mode { LCK_MINMODE = 0, LCK_EX = 1, LCK_PW = 2, @@ -2373,17 +2274,17 @@ enum ldlm_mode { LCK_GROUP = 64, LCK_COS = 128, LCK_MAXMODE -}; +} ldlm_mode_t; #define LCK_MODE_NUM 8 -enum ldlm_type { +typedef enum ldlm_type { LDLM_PLAIN = 10, LDLM_EXTENT = 11, LDLM_FLOCK = 12, LDLM_IBITS = 13, LDLM_MAX_TYPE -}; +} ldlm_type_t; #define LDLM_MIN_TYPE LDLM_PLAIN @@ -2393,18 +2294,8 @@ struct ldlm_extent { __u64 gid; }; -static inline bool ldlm_extent_equal(const struct ldlm_extent *ex1, - const struct ldlm_extent *ex2) -{ - return ex1->start == ex2->start && ex1->end == ex2->end; -} - struct ldlm_inodebits { - __u64 bits; - union { - __u64 try_bits; /* optional bits to try */ - __u64 cancel_bits; /* for lock convert */ - }; + __u64 bits; }; struct ldlm_flock_wire { @@ -2421,11 +2312,11 @@ struct ldlm_flock_wire { * this ever changes we will need to swab the union differently based * on the resource type. */ -union ldlm_wire_policy_data { +typedef union ldlm_wire_policy_data { struct ldlm_extent l_extent; struct ldlm_flock_wire l_flock; struct ldlm_inodebits l_inodebits; -}; +} ldlm_wire_policy_data_t; struct barrier_lvb { __u32 lvb_status; @@ -2447,21 +2338,19 @@ union ldlm_gl_desc { enum ldlm_intent_flags { IT_OPEN = 0x00000001, IT_CREAT = 0x00000002, - IT_OPEN_CREAT = IT_OPEN | IT_CREAT, /* To allow case label. */ - IT_READDIR = 0x00000004, /* Used by mdc, not put on the wire. */ + IT_OPEN_CREAT = 0x00000003, + IT_READDIR = 0x00000004, IT_GETATTR = 0x00000008, IT_LOOKUP = 0x00000010, -/* IT_UNLINK = 0x00000020, Obsolete. */ -/* IT_TRUNC = 0x00000040, Obsolete. */ + IT_UNLINK = 0x00000020, + IT_TRUNC = 0x00000040, IT_GETXATTR = 0x00000080, -/* IT_EXEC = 0x00000100, Obsolete. */ -/* IT_PIN = 0x00000200, Obsolete. */ + IT_EXEC = 0x00000100, + IT_PIN = 0x00000200, IT_LAYOUT = 0x00000400, IT_QUOTA_DQACQ = 0x00000800, IT_QUOTA_CONN = 0x00001000, -/* IT_SETXATTR = 0x00002000, Obsolete. */ - IT_GLIMPSE = 0x00004000, - IT_BRW = 0x00008000, + IT_SETXATTR = 0x00002000, }; struct ldlm_intent { @@ -2485,10 +2374,10 @@ struct ldlm_lock_desc { #define LDLM_ENQUEUE_CANCEL_OFF 1 struct ldlm_request { - __u32 lock_flags; /* LDLM_FL_*, see lustre_dlm_flags.h */ - __u32 lock_count; /* number of locks in lock_handle[] */ - struct ldlm_lock_desc lock_desc;/* lock descriptor */ - struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; + __u32 lock_flags; + __u32 lock_count; + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; }; struct ldlm_reply { @@ -2506,17 +2395,17 @@ struct ldlm_reply { /* * Opcodes for mountconf (mgs and mgc) */ -enum mgs_cmd { - MGS_CONNECT = 250, - MGS_DISCONNECT = 251, - MGS_EXCEPTION = 252, /* node died, etc. 
*/ - MGS_TARGET_REG = 253, /* whenever target starts up */ - MGS_TARGET_DEL = 254, - MGS_SET_INFO = 255, - MGS_CONFIG_READ = 256, - MGS_LAST_OPC, - MGS_FIRST_OPC = MGS_CONNECT -}; +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_CONFIG_READ, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) #define MGS_PARAM_MAXLEN 1024 @@ -2532,17 +2421,17 @@ struct mgs_send_param { #define MTI_PARAM_MAXLEN 4096 #define MTI_NIDS_MAX 32 struct mgs_target_info { - __u32 mti_lustre_ver; - __u32 mti_stripe_index; - __u32 mti_config_ver; - __u32 mti_flags; /* LDD_F_* */ - __u32 mti_nid_count; - __u32 mti_instance; /* Running instance of target */ - char mti_fsname[MTI_NAME_MAXLEN]; - char mti_svname[MTI_NAME_MAXLEN]; - char mti_uuid[sizeof(struct obd_uuid)]; - __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t) */ - char mti_params[MTI_PARAM_MAXLEN]; + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ + char mti_params[MTI_PARAM_MAXLEN]; }; struct mgs_nidtbl_entry { @@ -2608,14 +2497,15 @@ struct cfg_marker { /* * Opcodes for multiple servers. */ -enum obd_cmd { - OBD_PING = 400, -/* OBD_LOG_CANCEL = 401, obsolete since 1.5 */ -/* OBD_QC_CALLBACK = 402, obsolete since 2.4 */ - OBD_IDX_READ = 403, - OBD_LAST_OPC, - OBD_FIRST_OPC = OBD_PING -}; + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, /* not used since 2.4 */ + OBD_IDX_READ, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING /** * llog contexts indices. @@ -2664,7 +2554,7 @@ struct llog_catid { #define LLOG_OP_MAGIC 0x10600000 #define LLOG_OP_MASK 0xfff00000 -enum llog_op_type { +typedef enum { LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ @@ -2681,12 +2571,11 @@ enum llog_op_type { /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, - CHANGELOG_USER_REC2 = LLOG_OP_MAGIC | 0x70002, HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, -}; +} llog_op_type; #define LLOG_REC_HDR_NEEDS_SWABBING(r) \ (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) @@ -2700,12 +2589,12 @@ struct llog_rec_hdr { __u32 lrh_index; __u32 lrh_type; __u32 lrh_id; -} __attribute__((packed)); +}; struct llog_rec_tail { __u32 lrt_len; __u32 lrt_index; -} __attribute__((packed)); +}; /* Where data follow just after header */ #define REC_DATA(ptr) \ @@ -2763,7 +2652,7 @@ struct llog_setattr64_rec_v2 { __u32 lsr_gid_h; __u64 lsr_valid; __u32 lsr_projid; - __u32 lsr_layout_version; + __u32 lsr_padding1; __u64 lsr_padding2; __u64 lsr_padding3; struct llog_rec_tail lsr_tail; @@ -2787,13 +2676,8 @@ struct llog_size_change_rec { #define CHANGELOG_ALLMASK 0XFFFFFFFF /** default \a changelog_rec_type mask. Allow all of them, except * CL_ATIME since it can really be time consuming, and not necessary - * under normal use. 
- * Remove also CL_OPEN, CL_GETXATTR and CL_DN_OPEN from default list as it can - * be costly and only necessary for audit purpose. - */ -#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & \ - ~(1 << CL_ATIME | 1 << CL_OPEN | 1 << CL_GETXATTR | \ - 1 << CL_DN_OPEN)) + * under normal use. */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME)) /* changelog llog name, needed by client replicators */ #define CHANGELOG_CATALOG "changelog_catalog" @@ -2813,13 +2697,11 @@ struct llog_changelog_rec { #define CHANGELOG_USER_PREFIX "cl" struct llog_changelog_user_rec { - struct llog_rec_hdr cur_hdr; - __u32 cur_id; - /* only intended to be used in relative time comparisons to - * detect idle users */ - __u32 cur_time; - __u64 cur_endrec; - struct llog_rec_tail cur_tail; + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + __u32 cur_padding; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; } __attribute__((packed)); enum agent_req_status { @@ -2855,7 +2737,7 @@ struct llog_agent_req_rec { * agent_req_status */ __u32 arr_archive_id; /**< backend archive number */ __u64 arr_flags; /**< req flags */ - __u64 arr_compound_id; /**< compound cookie, ignored */ + __u64 arr_compound_id; /**< compound cookie */ __u64 arr_req_create; /**< req. creation time */ __u64 arr_req_change; /**< req. status change time */ struct hsm_action_item arr_hai; /**< req. to the agent */ @@ -2884,25 +2766,12 @@ enum llog_flag { LLOG_F_IS_PLAIN = 0x4, LLOG_F_EXT_JOBID = 0x8, LLOG_F_IS_FIXSIZE = 0x10, - LLOG_F_EXT_EXTRA_FLAGS = 0x20, - LLOG_F_EXT_X_UIDGID = 0x40, - LLOG_F_EXT_X_NID = 0x80, - LLOG_F_EXT_X_OMODE = 0x100, - LLOG_F_EXT_X_XATTR = 0x200, - LLOG_F_RM_ON_ERR = 0x400, /* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here, * because the catlog record is usually fixed size, but its plain * log record can be variable */ - LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID | LLOG_F_EXT_EXTRA_FLAGS | - LLOG_F_EXT_X_UIDGID | LLOG_F_EXT_X_NID | - LLOG_F_EXT_X_OMODE | LLOG_F_EXT_X_XATTR, -}; - -/* means first record of catalog */ -enum { - LLOG_CAT_FIRST = -1, + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID, }; /* On-disk header structure of each log object, stored in little endian order */ @@ -2948,13 +2817,9 @@ struct llog_log_hdr { llh->llh_hdr.lrh_len - \ sizeof(llh->llh_tail))) -/** log cookies are used to reference a specific log file and a record therein, - and pass record offset from llog_process_thread to llog_write */ +/** log cookies are used to reference a specific log file and a record therein */ struct llog_cookie { - union { - struct llog_logid lgc_lgl; - __u64 lgc_offset; - }; + struct llog_logid lgc_lgl; __u32 lgc_subsys; __u32 lgc_index; __u32 lgc_padding; @@ -2962,17 +2827,17 @@ struct llog_cookie { /** llog protocol */ enum llogd_rpc_ops { - LLOG_ORIGIN_HANDLE_CREATE = 501, - LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, - LLOG_ORIGIN_HANDLE_READ_HEADER = 503, -/* LLOG_ORIGIN_HANDLE_WRITE_REC = 504, Obsolete by 2.1. */ -/* LLOG_ORIGIN_HANDLE_CLOSE = 505, Obsolete by 1.8. */ -/* LLOG_ORIGIN_CONNECT = 506, Obsolete by 2.4. */ -/* LLOG_CATINFO = 507, Obsolete by 2.3. */ - LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, - LLOG_ORIGIN_HANDLE_DESTROY = 509, /* Obsolete by 2.11. 
*/ - LLOG_LAST_OPC, - LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, + LLOG_ORIGIN_HANDLE_WRITE_REC = 504, + LLOG_ORIGIN_HANDLE_CLOSE = 505, + LLOG_ORIGIN_CONNECT = 506, + LLOG_CATINFO = 507, /* deprecated */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE }; struct llogd_body { @@ -3026,7 +2891,7 @@ struct obdo { * * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */ struct ost_layout o_layout; - __u32 o_layout_version; + __u32 o_padding_3; __u32 o_uid_h; __u32 o_gid_h; @@ -3199,13 +3064,13 @@ union lu_page { }; /* security opcodes */ -enum sec_cmd { +typedef enum { SEC_CTX_INIT = 801, SEC_CTX_INIT_CONT = 802, SEC_CTX_FINI = 803, SEC_LAST_OPC, SEC_FIRST_OPC = SEC_CTX_INIT -}; +} sec_cmd_t; /* * capa related definitions @@ -3286,7 +3151,7 @@ struct link_ea_entry { unsigned char lee_reclen[2]; unsigned char lee_parent_fid[sizeof(struct lu_fid)]; char lee_name[0]; -} __attribute__((packed)); +}__attribute__((packed)); /** fid2path request/reply structure */ struct getinfo_fid2path { @@ -3308,7 +3173,7 @@ struct getparent { char gp_name[0]; /**< zero-terminated link name */ } __attribute__((packed)); -enum layout_intent_opc { +enum { LAYOUT_INTENT_ACCESS = 0, /** generic access */ LAYOUT_INTENT_READ = 1, /** not used */ LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */ @@ -3322,7 +3187,8 @@ enum layout_intent_opc { struct layout_intent { __u32 li_opc; /* intent operation for enqueue, read, write etc */ __u32 li_flags; - struct lu_extent li_extent; + __u64 li_start; + __u64 li_end; } __attribute__((packed)); /** @@ -3332,7 +3198,7 @@ struct layout_intent { */ struct hsm_progress_kernel { /* Field taken from struct hsm_progress */ - struct lu_fid hpk_fid; + lustre_fid hpk_fid; __u64 hpk_cookie; struct hsm_extent hpk_extent; __u16 hpk_flags; @@ -3397,7 +3263,6 @@ enum update_type { OUT_PUNCH = 14, OUT_READ = 15, OUT_NOOP = 16, - OUT_XATTR_LIST = 17, OUT_LAST }; @@ -3488,22 +3353,11 @@ struct mdc_swap_layouts { __u64 msl_flags; } __attribute__((packed)); -#define INLINE_RESYNC_ARRAY_SIZE 15 -struct close_data_resync_done { - __u32 resync_count; - __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE]; -}; - struct close_data { struct lustre_handle cd_handle; struct lu_fid cd_fid; __u64 cd_data_version; - union { - __u64 cd_reserved[8]; - struct close_data_resync_done cd_resync; - /* split close */ - __u16 cd_mirror_id; - }; + __u64 cd_reserved[8]; }; /* Update llog format */ @@ -3512,7 +3366,7 @@ struct update_op { __u16 uop_type; __u16 uop_param_count; __u16 uop_params_off[0]; -} __attribute__((packed)); +}; struct update_ops { struct update_op uops_op[0]; @@ -3563,19 +3417,6 @@ struct llog_update_record { */ }; -/* sepol string format is: - * <1-digit for SELinux status>::: - */ -/* Max length of the sepol string - * Should be large enough to contain a sha512sum of the policy - */ -#define SELINUX_MODE_LEN 1 -#define SELINUX_POLICY_VER_LEN 3 /* 3 chars to leave room for the future */ -#define SELINUX_POLICY_HASH_LEN 64 -#define LUSTRE_NODEMAP_SEPOL_LENGTH (SELINUX_MODE_LEN + NAME_MAX + \ - SELINUX_POLICY_VER_LEN + \ - SELINUX_POLICY_HASH_LEN + 3) - /* nodemap records, uses 32 byte record length */ #define LUSTRE_NODEMAP_NAME_LENGTH 16 struct nodemap_cluster_rec { @@ -3646,9 +3487,5 @@ struct ladvise_hdr { struct lu_ladvise lah_advise[0]; /* advices in this 
header */ }; -#if defined(__cplusplus) -} -#endif - #endif /** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h index 7b84426fa2750..a02f65fa08aef 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - * + * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. - * + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * lustre/include/lustre/lustre_lfsck_user.h @@ -30,11 +30,207 @@ * Author: Fan, Yong */ -/* - * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_lfsck_user.h - * directly instead of this file. This file will be removed from a - * future version of lustre! +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H +# include + +/** + * state machine: + * + * LS_INIT + * | + * (lfsck|start) + * | + * v + * LS_SCANNING_PHASE1 + * | ^ + * | : + * | (lfsck:restart) + * | : + * v : + * ----------------------------------------------------------------- + * | |^ |^ |^ |^ |^ + * | |: |: |: |: |: + * v v: v: v: v: v: + * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL + * (CO_) (CO_) (CO_) + * | ^ ^: ^: ^: ^: ^: + * | : |: |: |: |: |: + * | (lfsck:restart) |: |: |: |: |: + * v : |v |v |v |v |v + * ----------------------------------------------------------------- + * | + * v + * LS_COMPLETED */ +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. The checked items during the phase1 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. The checked items during the phase2 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. */ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, + + /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ + LS_PARTIAL = 8, + + /* The LFSCK is failed because its controller is failed. */ + LS_CO_FAILED = 9, + + /* The LFSCK is stopped because its controller is stopped. */ + LS_CO_STOPPED = 10, + + /* The LFSCK is paused because its controller is paused. 
*/ + LS_CO_PAUSED = 11, + + LS_MAX +}; + +static inline const char *lfsck_status2name(int status) +{ + static const char * const lfsck_status_names[] = { + [LS_INIT] = "init", + [LS_SCANNING_PHASE1] = "scanning-phase1", + [LS_SCANNING_PHASE2] = "scanning-phase2", + [LS_COMPLETED] = "completed", + [LS_FAILED] = "failed", + [LS_STOPPED] = "stopped", + [LS_PAUSED] = "paused", + [LS_CRASHED] = "crashed", + [LS_PARTIAL] = "partial", + [LS_CO_FAILED] = "co-failed", + [LS_CO_STOPPED] = "co-stopped", + [LS_CO_PAUSED] = "co-paused" + }; + + if (status < 0 || status >= LS_MAX) + return "unknown"; + + return lfsck_status_names[status]; +} + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, + + /* LFSCK runs on all targets. */ + LPF_ALL_TGT = 0x0008, + + /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ + LPF_BROADCAST = 0x0010, + + /* Handle orphan OST-objects. */ + LPF_OST_ORPHAN = 0x0020, + + /* Create OST-object for dangling LOV EA. */ + LPF_CREATE_OSTOBJ = 0x0040, + + /* Create MDT-object for dangling name entry. */ + LPF_CREATE_MDTOBJ = 0x0080, + + /* Do not return until the LFSCK not running. */ + LPF_WAIT = 0x0100, + + /* Delay to create OST-object for dangling LOV EA. */ + LPF_DELAY_CREATE_OSTOBJ = 0x0200, +}; + +enum lfsck_type { + /* For MDT and OST internal OSD consistency check/repair. */ + LFSCK_TYPE_SCRUB = 0x0000, + + /* For MDT-OST (layout, object) consistency check/repair. */ + LFSCK_TYPE_LAYOUT = 0x0001, + + /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ + LFSCK_TYPE_NAMESPACE = 0x0004, + LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | + LFSCK_TYPE_NAMESPACE), + LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, + LFSCK_TYPES_ALL = ((__u16)(~0)) +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT +#define LFSCK_ASYNC_WIN_DEFAULT 1024 +#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) +#define LFSCK_TYPE_BITS 16 + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, + LSV_ASYNC_WINDOWS = 0x00000008, + LSV_CREATE_OSTOBJ = 0x00000010, + LSV_CREATE_MDTOBJ = 0x00000020, + LSV_DELAY_CREATE_OSTOBJ = 0x00000040, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* The windows size for async requests pipeline. */ + __u16 ls_async_windows; +}; + +struct lfsck_stop { + __u32 ls_status; + __u16 ls_flags; + __u16 ls_padding_1; /* For 64-bits aligned. */ + __u64 ls_padding_2; +}; + +struct lfsck_query { + __u16 lu_types; + __u16 lu_flags; + __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u64 lu_repaired[LFSCK_TYPE_BITS]; +}; -#include -#warning "Including lustre_lfsck_user.h is deprecated. Include linux/lustre/lustre_lfsck_user.h directly." 
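/*
 * Illustrative sketch, not part of the patch hunks around it: the
 * lfsck_start structure and the LSV_*, LPF_* and LFSCK_TYPE_* constants
 * defined above describe how a caller asks for an LFSCK run. A minimal,
 * hypothetical request for a combined layout and namespace scan with no
 * speed limit might be filled in as follows; the ioctl used to submit the
 * request lives outside this header and is not shown here.
 */
static inline struct lfsck_start lfsck_example_request(void)
{
	struct lfsck_start start = {
		.ls_valid	  = LSV_SPEED_LIMIT,
		.ls_speed_limit	  = LFSCK_SPEED_NO_LIMIT,
		.ls_version	  = LFSCK_VERSION_V2,
		.ls_active	  = LFSCK_TYPE_LAYOUT | LFSCK_TYPE_NAMESPACE,
		.ls_flags	  = LPF_ALL_TGT,
		.ls_async_windows = LFSCK_ASYNC_WIN_DEFAULT,
	};

	return start;
}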
+#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h index 9d8f5ebefa569..67ed9768fcb2f 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,15 +34,1638 @@ * Lustre public user-space interface definitions. */ +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include + +#ifdef __KERNEL__ +# include +# include +# include /* snprintf() */ +# include +#else /* !__KERNEL__ */ +# include +# include +# include /* snprintf() */ +# include +# define NEED_QUOTA_DEFS +/* # include - this causes complaints about caddr_t */ +# include +#endif /* __KERNEL__ */ +#include + /* - * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_user.h - * directly instead of this file. This file will be removed from a - * future version of lustre! + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. + */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA +#define PRJQUOTA 2 +#endif + +#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ + defined(__craynv) || defined(__mips64__) || defined(__powerpc64__) || \ + defined(__aarch64__) +typedef struct stat lstat_t; +# define lstat_f lstat +# define fstat_f fstat +# define fstatat_f fstatat +# define HAVE_LOV_USER_MDS_DATA +#elif defined(__USE_LARGEFILE64) || defined(__KERNEL__) +typedef struct stat64 lstat_t; +# define lstat_f lstat64 +# define fstat_f fstat64 +# define fstatat_f fstatat64 +# define HAVE_LOV_USER_MDS_DATA +#endif + +#define LUSTRE_EOF 0xffffffffffffffffULL + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#ifndef FSFILT_IOC_GETFLAGS +#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) +#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) +#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) +#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) +#endif + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ + OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ +}; + +struct obd_statfs { + __u64 os_type; + __u64 os_blocks; + __u64 os_bfree; + __u64 os_bavail; + __u64 os_files; + __u64 os_ffree; + __u8 os_fsid[40]; + __u32 os_bsize; + __u32 os_namelen; + __u64 os_maxbytes; + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS 
code to find preferred + * OSTs */ + __u32 os_spare2; + __u32 os_spare3; + __u32 os_spare4; + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +}; + +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return fid->f_seq == 0 && fid->f_oid == 0; +} + +/* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ +#define f_stripe_idx f_ver + +struct ost_layout { + __u32 ol_stripe_size; + __u32 ol_stripe_count; + __u64 ol_comp_start; + __u64 ol_comp_end; + __u32 ol_comp_id; +} __attribute__((packed)); + +/* The filter_fid structure has changed several times over its lifetime. + * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and + * stripe_index and the "self FID" (objid/seq) to be able to recover the + * OST objects in case of corruption. With the move to 2.4 and OSD-API for + * the OST, the "trusted.lma" xattr was added to the OST objects to store + * the "self FID" to be consistent with the MDT on-disk format, and the + * filter_fid only stored the MDT inode parent FID and stripe index. + * + * In 2.10, the addition of PFL composite layouts required more information + * to be stored into the filter_fid in order to be able to identify which + * component the OST object belonged. As well, the stripe size may vary + * between components, so it was no longer safe to assume the stripe size + * or stripe_count of a file. This is also more robust for plain layouts. + * + * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not + * enough space to store both the filter_fid and LMA in the inode, so they + * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid + * an extra seek for every OST object access. + * + * In 2.11, FLR mirror layouts also need to store the layout version and + * range so that writes to old versions of the layout are not allowed. + * That ensures that mirrored objects are not modified by evicted clients, + * and ensures that the components are correctly marked stale on the MDT. + */ +struct filter_fid_18_23 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid_24_29 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ +}; + +struct filter_fid_210 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; + __u32 ff_layout_version; + __u32 ff_range; /* range of layout version that + * write are allowed */ +} __attribute__((packed)); + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. 
compare, swab) could be moved + * here from lustre_idl.h if needed. */ +typedef struct lu_fid lustre_fid; + +enum lma_compat { + LMAC_HSM = 0x00000001, +/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ + LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ + LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. + */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ + LMAI_STRIPED = 0x00000008, /* striped directory inode */ + LMAI_ORPHAN = 0x00000010, /* inode is orphan */ + LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ + LMAI_STRIPED | LMAI_ORPHAN) +}; + + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +struct lustre_ost_attrs { + /* Use lustre_mdt_attrs directly for now, need a common header + * structure if want to change lustre_mdt_attrs in future. */ + struct lustre_mdt_attrs loa_lma; + + /* Below five elements are for OST-object's PFID EA, the + * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) + * and the stripe_index (low 16 bits), the size should not exceed + * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag + * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size + * are valid; if the flag LMAC_COMP_INFO is set, then the next three + * loa_comp_* elements are valid. */ + struct lu_fid loa_parent_fid; + __u32 loa_stripe_size; + __u32 loa_comp_id; + __u64 loa_comp_start; + __u64 loa_comp_end; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +/** + * OST object IDentifier. */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +}; + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) -#include +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; -/* Disable warning until 2.16 or 3.0, until new header is widely available. - * This gives apps time to move to the new header without spurious warnings. -#warning "Including lustre/lustre_user.h is deprecated. Include linux/lustre/lustre_user.h instead." 
+/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_SET_LEASE _IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. 
*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR +#define LL_PROJINHERIT_FL 0x20000000 + + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ +enum ll_lease_type { + LL_LEASE_RDLCK = 0x1, + LL_LEASE_WRLCK = 0x2, + LL_LEASE_UNLCK = 0x4, +}; + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. */ +#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ +#ifndef FASYNC +#define FASYNC 00020000 /* fcntl, for BSD compatibility */ +#endif +#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) +#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ + O_LOV_DELAY_CREATE_MASK) + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 +/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ +#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ +#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 + +#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ +#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic */ + +#define LOV_PATTERN_NONE 0x000 +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 +#define LOV_PATTERN_CMOBD 0x200 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ +#define LOV_PATTERN_DEFAULT 0xffffffff + +static inline bool lov_pattern_supported(__u32 pattern) +{ + return pattern == LOV_PATTERN_RAID0 || + pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED); +} + +#define LOV_MAXPOOLNAME 15 +#define LOV_POOLNAMEF "%.15s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old 
maximal stripe count. + * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. + * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define XATTR_LUSTRE_PREFIX "lustre." +#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + +struct lu_extent { + __u64 e_start; + __u64 e_end; +}; + +#define DEXT "[ %#llx , %#llx )" +#define PEXT(ext) (ext)->e_start, (ext)->e_end + +static inline bool lu_extent_is_overlapped(struct lu_extent *e1, + struct lu_extent *e2) +{ + return e1->e_start < e2->e_end && e2->e_start < e1->e_end; +} + +enum lov_comp_md_entry_flags { + LCME_FL_PRIMARY = 0x00000001, /* Not used */ + LCME_FL_STALE = 0x00000002, /* Not used */ + LCME_FL_OFFLINE = 0x00000004, /* Not used */ + LCME_FL_PREFERRED = 0x00000008, /* Not used */ + LCME_FL_INIT = 0x00000010, /* instantiated */ + LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, + won't be stored on disk */ +}; + +#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT) + +/* lcme_id can be specified as certain flags, and the the first + * bit of lcme_id is used to indicate that the ID is representing + * certain LCME_FL_* but not a real ID. Which implies we can have + * at most 31 flags (see LCME_FL_XXX). 
*/ +enum lcme_id { + LCME_ID_INVAL = 0x0, + LCME_ID_MAX = 0x7FFFFFFF, + LCME_ID_ALL = 0xFFFFFFFF, + LCME_ID_NOT_ID = LCME_FL_NEG +}; + +#define LCME_ID_MASK LCME_ID_MAX + +struct lov_comp_md_entry_v1 { + __u32 lcme_id; /* unique id of component */ + __u32 lcme_flags; /* LCME_FL_XXX */ + struct lu_extent lcme_extent; /* file extent for component */ + __u32 lcme_offset; /* offset of component blob, + start from lov_comp_md_v1 */ + __u32 lcme_size; /* size of component blob */ + __u64 lcme_padding[2]; +} __attribute__((packed)); + +enum lov_comp_md_flags; + +struct lov_comp_md_v1 { + __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ + __u32 lcm_size; /* overall size including this struct */ + __u32 lcm_layout_gen; + __u16 lcm_flags; + __u16 lcm_entry_count; + __u64 lcm_padding1; + __u64 lcm_padding2; + struct lov_comp_md_entry_v1 lcm_entries[0]; +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_USER_MAGIC_V1) + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . */ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v1 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ +} __attribute__((packed)); +#endif + +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +}; + +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, + LMV_HASH_TYPE_MAX, +}; + +#define LMV_HASH_NAME_ALL_CHARS "all_char" +#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" + +extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; + +/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_user_mds_data lum_objects[0]; +} __attribute__((packed)); + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + return sizeof(struct lmv_user_md) + + stripes * sizeof(struct lmv_user_mds_data); +} + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. 
*/ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid == NULL) + return NULL; + + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +#define LUSTRE_MAXFSNAME 8 + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p != NULL) + *p = '\0'; +} + +/* printf display format for Lustre FIDs + * usage: printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver + +/* scanf input parse format for fids in DFID_NOBRACE format + * Need to strip '[' from DFID format first or use "["SFID"]" at caller. 
+ * usage: sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) &((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) + +/********* Quotas **********/ + +#define LUSTRE_QUOTABLOCK_BITS 10 +#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) + +static inline __u64 lustre_stoqb(size_t space) +{ + return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; +} + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ + +#define ALLQUOTA 255 /* set all quota */ +static inline char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: .^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define 
SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_XATTR = 15, + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT" + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* per-record flags */ +#define CLF_FLAGSHIFT 12 +#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) +#define CLF_VERMASK (~CLF_FLAGMASK) +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, + CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(int *flags, enum hsm_event he) +{ + *flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(int *flags, int bits) +{ + *flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(int *flags, int error) +{ + *flags |= (error << CLF_HSM_ERR_L); +} + +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = 0x01, + /* Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. */ + CHANGELOG_FLAG_BLOCK = 0x02, + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = 0x04, +}; + +#define CR_MAXSIZE cfs_size_round(2 * NAME_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED)) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 + +/* This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags. + * + * Extensions are packed in the same order as their corresponding flags. + */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< parent fid */ +}; + +/* Changelog extension for RENAME. */ +struct changelog_ext_rename { + lustre_fid cr_sfid; /**< source fid, or zero */ + lustre_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. 
*/ +}; + + +static inline size_t changelog_rec_offset(enum changelog_rec_flags crf) +{ + size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + return size; +} + +static inline size_t changelog_rec_size(const struct changelog_rec *rec) +{ + return changelog_rec_offset(rec->cr_flags); +} + +static inline size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME); + + return (struct changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf)); +} + +/* The name follows the rename and jobid extensions, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + return (char *)rec + changelog_rec_offset(rec->cr_flags & + CLF_SUPPORTED); +} + +static inline size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + char *cr_name = changelog_rec_name(rec); + + return cr_name + strlen(cr_name) + 1; +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. 
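+ *
+ * Usage sketch (illustrative only, not part of the original header): a
+ * consumer that understands the base record plus the rename fields, but not
+ * the jobid extension, could normalize every record it reads with
+ *
+ *	changelog_remap_rec(rec, CLF_VERSION | CLF_RENAME);
+ *
+ * which strips an existing jobid extension and adds a zeroed rename
+ * extension when the record did not carry one.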
+ */ +static inline void changelog_remap_rec(struct changelog_rec *rec, + enum changelog_rec_flags crf_wanted) +{ + char *jid_mov; + char *rnm_mov; + + crf_wanted &= CLF_SUPPORTED; + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) + return; + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of jobid and rename extensions in the remapped record */ + jid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~CLF_JOBID); + rnm_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~(CLF_JOBID | CLF_RENAME)); + + /* Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; +} + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u64 idv_flags; /* See LL_DV_xxx */ +}; +#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */ +#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */ + +#ifndef offsetof +#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_NONE = 0x00000000, + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; +#define HPS_NONE 0 + +static inline const char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. 
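+	 * This is a bitmask of the HS_* values above; for instance, a file
+	 * with a valid copy in the archive typically reports
+	 * HS_EXISTS | HS_ARCHIVED.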
*/ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. + * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + +static inline const char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + lustre_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
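+ *
+ * Usage sketch (illustrative only; item_count, data_len and len are made-up
+ * names, the helpers are the ones declared in lustreapi.h):
+ *
+ *	struct hsm_user_request *hur;
+ *	ssize_t len;
+ *
+ *	hur = llapi_hsm_user_request_alloc(item_count, data_len);
+ *	... fill hur->hur_request and hur->hur_user_item[] ...
+ *	len = hur_len(hur);   total bytes: header + items + trailing data blob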
+ */ +static inline ssize_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if (size != (ssize_t)size) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + +static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + lustre_fid hai_fid; /* Lustre FID to operate on */ + lustre_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/** + * helper function which print in hexa the first bytes of + * hai opaque field + * + * \param hai [IN] record to print + * \param buffer [IN,OUT] buffer to write the hex string to + * \param len [IN] max buffer length + * + * \retval buffer + */ +static inline char *hai_dump_data_field(const struct hsm_action_item *hai, + char *buffer, size_t len) +{ + int i; + int data_len; + char *ptr; + + ptr = buffer; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0; (i < data_len) && (len > 2); i++) { + snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); + ptr += 2; + len -= 2; + } + + *ptr = '\0'; + + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
See hai_zero */ +} __attribute__((packed)); + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + return (struct hsm_action_item *)(hal->hal_fsname + + cfs_size_round(strlen(hal-> \ + hal_fsname) + + 1)); +} +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + return (struct hsm_action_item *)((char *)hai + + cfs_size_round(hai->hai_len)); +} + +/* Return size of an hsm_action_list */ +static inline size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += cfs_size_round(hai->hai_len); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + lustre_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/* JSON objects */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. */ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +#define LF_MASK LF_ASYNC + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. It is defined separately as we may need info which is + * only used locally. 
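+ *
+ * Illustrative use (an assumption-laden sketch, not from the original
+ * header): llapi_ladvise(), declared in lustreapi.h, is expected to wrap an
+ * array of these advices in a struct llapi_ladvise_hdr before issuing the
+ * ioctl.  Asking the servers to prefetch roughly the first megabyte of an
+ * open file could look like:
+ *
+ *	struct llapi_lu_ladvise adv = {
+ *		.lla_advice = LU_LADVISE_WILLREAD,
+ *		.lla_start  = 0,
+ *		.lla_end    = (1 << 20) - 1,
+ *	};
+ *	int rc = llapi_ladvise(fd, 0, 1, &adv);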
*/ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, + SK_CRYPT_MAX = 2, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, + SK_HMAC_MAX = 3, +}; + +struct sk_crypt_type { + char *sct_name; + size_t sct_bytes; +}; + +struct sk_hmac_type { + char *sht_name; + size_t sht_bytes; +}; + +/** @} lustreuser */ +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h index da1e166d9c39c..67df286a5c358 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,17 +38,9 @@ * @{ */ -#include #include #include -#include -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif +#include #ifndef LL_MAXQUOTAS #define LL_MAXQUOTAS 3 @@ -58,14 +50,9 @@ extern "C" { #define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) #endif -#define lustre_fid struct lu_fid - -/* Currently external applications can access this but in the - * future this will no longer be exposed for the user. Instead - * if you want to know if the library is initialized just call - * llapi_liblustreapi_initialized() which is now available. */ extern bool liblustreapi_initialized; + typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args); @@ -84,10 +71,6 @@ enum llapi_message_level { typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err, const char *fmt, va_list ap); -static inline bool llapi_liblustreapi_initialized(void) -{ - return liblustreapi_initialized; -} /* the bottom three bits reserved for llapi_message_level */ #define LLAPI_MSG_MASK 0x00000007 @@ -104,11 +87,10 @@ static inline const char *llapi_msg_level2str(enum llapi_message_level level) return levels[level]; } - -void llapi_msg_set_level(int level); +extern void llapi_msg_set_level(int level); int llapi_msg_get_level(void); -llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); -llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); +extern llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); +extern llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); @@ -128,64 +110,53 @@ struct llapi_stripe_param { __u32 lsp_osts[0]; }; -#define lsp_tgts lsp_osts - -int llapi_file_open_param(const char *name, int flags, mode_t mode, - const struct llapi_stripe_param *param); -int llapi_file_create(const char *name, unsigned long long stripe_size, - int stripe_offset, int stripe_count, int stripe_pattern); -int llapi_file_open(const char *name, int flags, int mode, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern); -int llapi_file_create_pool(const char *name, unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern, char *pool_name); -int llapi_file_open_pool(const char *name, int flags, int mode, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern, char *pool_name); -int llapi_poollist(const char *name); -int llapi_get_poollist(const char *name, char **poollist, int list_size, - char *buffer, int buffer_size); -int llapi_get_poolmembers(const char *poolname, char **members, int list_size, - char *buffer, int buffer_size); -int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); -int llapi_file_lookup(int dirfd, const char *name); -void llapi_set_command_name(const char *cmd); -void llapi_clear_command_name(void); - -enum llapi_layout_verbose { - VERBOSE_STRIPE_COUNT = 0x1, - VERBOSE_STRIPE_SIZE = 0x2, - VERBOSE_STRIPE_OFFSET = 0x4, - VERBOSE_POOL = 0x8, - VERBOSE_DETAIL = 0x10, - VERBOSE_OBJID = 0x20, - VERBOSE_GENERATION = 0x40, - VERBOSE_MDTINDEX = 0x80, - VERBOSE_PATTERN = 0x100, - VERBOSE_COMP_COUNT = 0x200, - VERBOSE_COMP_FLAGS = 0x400, - VERBOSE_COMP_START = 0x800, - VERBOSE_COMP_END = 0x1000, - VERBOSE_COMP_ID = 0x2000, - VERBOSE_DFID = 0x4000, - VERBOSE_HASH_TYPE = 0x8000, - VERBOSE_MIRROR_COUNT = 0x10000, - VERBOSE_MIRROR_ID = 0x20000, - VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | - VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | - VERBOSE_OBJID | VERBOSE_GENERATION | - VERBOSE_PATTERN | VERBOSE_HASH_TYPE | - VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | - VERBOSE_COMP_START | VERBOSE_COMP_END | - VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | - VERBOSE_MIRROR_ID -}; -/* Compatibility with original names */ -#define VERBOSE_SIZE VERBOSE_STRIPE_SIZE -#define VERBOSE_COUNT VERBOSE_STRIPE_COUNT -#define VERBOSE_OFFSET VERBOSE_STRIPE_OFFSET -#define VERBOSE_LAYOUT VERBOSE_PATTERN +extern int llapi_file_open_param(const char *name, int flags, mode_t mode, + const struct llapi_stripe_param *param); +extern int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern); +extern int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +extern int llapi_file_create_pool(const char *name, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_poollist(const char *name); +extern int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +extern int llapi_get_poolmembers(const char *poolname, char **members, + int list_size, char *buffer, int buffer_size); +extern int llapi_file_get_stripe(const char *path, 
struct lov_user_md *lum); +extern int llapi_file_lookup(int dirfd, const char *name); + +#define VERBOSE_COUNT 0x1 +#define VERBOSE_SIZE 0x2 +#define VERBOSE_OFFSET 0x4 +#define VERBOSE_POOL 0x8 +#define VERBOSE_DETAIL 0x10 +#define VERBOSE_OBJID 0x20 +#define VERBOSE_GENERATION 0x40 +#define VERBOSE_MDTINDEX 0x80 +#define VERBOSE_LAYOUT 0x100 +#define VERBOSE_COMP_COUNT 0x200 +#define VERBOSE_COMP_FLAGS 0x400 +#define VERBOSE_COMP_START 0x800 +#define VERBOSE_COMP_END 0x1000 +#define VERBOSE_COMP_ID 0x2000 +#define VERBOSE_DFID 0x4000 +#define VERBOSE_HASH_TYPE 0x8000 +#define VERBOSE_DEFAULT (VERBOSE_COUNT | VERBOSE_SIZE | \ + VERBOSE_OFFSET | VERBOSE_POOL | \ + VERBOSE_OBJID | VERBOSE_GENERATION | \ + VERBOSE_LAYOUT | VERBOSE_HASH_TYPE | \ + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | \ + VERBOSE_COMP_START | VERBOSE_COMP_END | \ + VERBOSE_COMP_ID) struct find_param { unsigned int fp_max_depth; @@ -208,11 +179,7 @@ struct find_param { fp_comp_start_sign:2, fp_comp_end_sign:2, fp_comp_count_sign:2, - fp_mirror_count_sign:2, - fp_mirror_index_sign:2, - fp_mirror_id_sign:2, - fp_mdt_count_sign:2, - fp_blocks_sign:2; + fp_mdt_count_sign:2; unsigned long long fp_size; unsigned long long fp_size_units; @@ -247,30 +214,21 @@ struct find_param { fp_exclude_projid:1, fp_check_comp_count:1, fp_exclude_comp_count:1, - fp_check_mirror_count:1, - fp_exclude_mirror_count:1, fp_check_comp_flags:1, - fp_check_mirror_state:1, + fp_exclude_comp_flags:1, fp_check_comp_start:1, fp_exclude_comp_start:1, fp_check_comp_end:1, fp_exclude_comp_end:1, fp_check_comp_id:1, fp_exclude_comp_id:1, - fp_check_mirror_id:1, - fp_exclude_mirror_id:1, - fp_check_mirror_index:1, - fp_exclude_mirror_index:1, fp_check_mdt_count:1, fp_exclude_mdt_count:1, fp_check_hash_type:1, fp_exclude_hash_type:1, - fp_yaml:1, /* output layout in YAML */ - fp_check_blocks:1, - fp_exclude_blocks:1, - fp_lazy:1; + fp_yaml:1; /* output layout in YAML */ - enum llapi_layout_verbose fp_verbose; + int fp_verbose; int fp_quiet; /* regular expression */ @@ -303,22 +261,14 @@ struct find_param { __u32 fp_layout; __u32 fp_comp_count; - __u32 fp_mirror_count; __u32 fp_comp_flags; - __u32 fp_comp_neg_flags; - __u16 fp_mirror_state; - __u16 fp_mirror_neg_state; __u32 fp_comp_id; - __u16 fp_mirror_id; - __u16 fp_mirror_index; unsigned long long fp_comp_start; unsigned long long fp_comp_start_units; unsigned long long fp_comp_end; unsigned long long fp_comp_end_units; unsigned long long fp_mdt_count; unsigned fp_projid; - unsigned long long fp_blocks; - unsigned long long fp_blocks_units; /* In-process parameters. 
*/ unsigned long fp_got_uuids:1, @@ -327,123 +277,104 @@ struct find_param { unsigned int fp_hash_type; }; -int llapi_ostlist(char *path, struct find_param *param); -int llapi_uuid_match(char *real_uuid, char *search_uuid); -int llapi_getstripe(char *path, struct find_param *param); -int llapi_find(char *path, struct find_param *param); - -int llapi_file_fget_mdtidx(int fd, int *mdtidx); -int llapi_dir_set_default_lmv(const char *name, - const struct llapi_stripe_param *param); -int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, - int stripe_count, int stripe_pattern, - const char *pool_name); -int llapi_dir_create(const char *name, mode_t mode, - const struct llapi_stripe_param *param); -int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, - int stripe_count, int stripe_pattern, - const char *poolname); +extern int llapi_ostlist(char *path, struct find_param *param); +extern int llapi_uuid_match(char *real_uuid, char *search_uuid); +extern int llapi_getstripe(char *path, struct find_param *param); +extern int llapi_find(char *path, struct find_param *param); + +extern int llapi_file_fget_mdtidx(int fd, int *mdtidx); +extern int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); int llapi_direntry_remove(char *dname); int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); -int llapi_obd_statfs(char *path, __u32 type, __u32 index, - struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); -int llapi_ping(char *obd_type, char *obd_name); -int llapi_target_check(int num_types, char **obd_types, char *dir); -int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); -int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); -int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); -int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); -int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); -int llapi_is_lustre_mnttype(const char *type); -int llapi_search_tgt(char *fsname, char *poolname, char *tgtname, bool is_mdt); -int llapi_search_ost(char *fsname, char *poolname, char *ostname); -int llapi_get_obd_count(char *mnt, int *count, int is_mdt); -int llapi_parse_size(const char *optarg, unsigned long long *size, - unsigned long long *size_units, int bytes_spec); -int llapi_search_mounts(const char *pathname, int index, char *mntdir, - char *fsname); -int llapi_search_fsname(const char *pathname, char *fsname); -int llapi_getname(const char *path, char *buf, size_t size); -int llapi_search_fileset(const char *pathname, char *fileset); - -int llapi_search_rootpath(char *pathname, const char *fsname); -int llapi_nodemap_exists(const char *name); -int llapi_migrate_mdt(char *path, struct find_param *param); -int llapi_mv(char *path, struct find_param *param); +extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, + struct obd_uuid *uuid_buf); +extern int llapi_ping(char *obd_type, char *obd_name); +extern int llapi_target_check(int num_types, char **obd_types, char *dir); +extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +extern int 
llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +extern int llapi_is_lustre_mnttype(const char *type); +extern int llapi_search_ost(char *fsname, char *poolname, char *ostname); +extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +extern int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +extern int llapi_search_mounts(const char *pathname, int index, + char *mntdir, char *fsname); +extern int llapi_search_fsname(const char *pathname, char *fsname); +extern int llapi_getname(const char *path, char *buf, size_t size); +extern int llapi_search_fileset(const char *pathname, char *fileset); + +extern int llapi_search_rootpath(char *pathname, const char *fsname); +extern int llapi_nodemap_exists(const char *name); +extern int llapi_migrate_mdt(char *path, struct find_param *param); +extern int llapi_mv(char *path, struct find_param *param); struct mntent; - #define HAVE_LLAPI_IS_LUSTRE_MNT -int llapi_is_lustre_mnt(struct mntent *mnt); -int llapi_quotactl(char *mnt, struct if_quotactl *qctl); -int llapi_target_iterate(int type_num, char **obd_type, void *args, - llapi_cb_t cb); -int llapi_get_connect_flags(const char *mnt, __u64 *flags); -int llapi_cp(int argc, char *argv[]); -int llapi_ls(int argc, char *argv[]); -int llapi_fid2path(const char *device, const char *fidstr, char *path, - int pathlen, long long *recno, int *linkno); -int llapi_path2fid(const char *path, struct lu_fid *fid); -int llapi_get_mdt_index_by_fid(int fd, const struct lu_fid *fid, - int *mdt_index); -int llapi_get_lum_file(const char *path, __u64 *valid, lstatx_t *statx, - struct lov_user_md *lum, size_t lumsize); -int llapi_get_lum_dir(const char *path, __u64 *valid, lstatx_t *statx, - struct lov_user_md *lum, size_t lumsize); -int llapi_get_lum_file_fd(int dir_fd, const char *fname, __u64 *valid, - lstatx_t *statx, struct lov_user_md *lum, - size_t lumsize); -int llapi_get_lum_dir_fd(int dir_fd, __u64 *valid, lstatx_t *statx, - struct lov_user_md *lum, size_t lumsize); - -int llapi_fd2fid(int fd, struct lu_fid *fid); +extern int llapi_is_lustre_mnt(struct mntent *mnt); +extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +extern int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +extern int llapi_get_connect_flags(const char *mnt, __u64 *flags); +extern int llapi_cp(int argc, char *argv[]); +extern int llapi_ls(int argc, char *argv[]); +extern int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +extern int llapi_path2fid(const char *path, lustre_fid *fid); +extern int llapi_get_mdt_index_by_fid(int fd, const lustre_fid *fid, + int *mdt_index); +extern int llapi_fd2fid(const int fd, lustre_fid *fid); /* get FID of parent dir + the related name of entry in this parent dir */ -int llapi_path2parent(const char *path, unsigned int linkno, - struct lu_fid *parent_fid, char *name, size_t name_size); -int llapi_fd2parent(int fd, unsigned int linkno, struct lu_fid *parent_fid, - char *name, size_t name_size); -int llapi_rmfid(const char *path, struct fid_array *fa); -int llapi_chomp_string(char *buf); -int llapi_open_by_fid(const char *dir, const struct lu_fid *fid, - int open_flags); -int llapi_get_version_string(char *version, unsigned int version_size); +extern 
int llapi_path2parent(const char *path, unsigned int linkno, + lustre_fid *parent_fid, char *name, + size_t name_size); +extern int llapi_fd2parent(int fd, unsigned int linkno, + lustre_fid *parent_fid, char *name, + size_t name_size); +extern int llapi_chomp_string(char *buf); +extern int llapi_open_by_fid(const char *dir, const lustre_fid *fid, + int open_flags); + +extern int llapi_get_version_string(char *version, unsigned int version_size); /* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ -int llapi_get_version(char *buffer, int buffer_size, char **version) +extern int llapi_get_version(char *buffer, int buffer_size, char **version) __attribute__((deprecated)); -int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); -int llapi_file_flush(int fd); -extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version); -int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); -int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); -int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, - __u32 archive_id); -int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, - __u32 archive_id); -int llapi_hsm_register_event_fifo(const char *path); -int llapi_hsm_unregister_event_fifo(const char *path); -void llapi_hsm_log_error(enum llapi_message_level level, int _rc, - const char *fmt, va_list args); - -int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); -int llapi_create_volatile_idx(const char *directory, int mdt_idx, - int open_flags); -int llapi_create_volatile_param(const char *directory, int mdt_idx, - int open_flags, mode_t mode, - const struct llapi_stripe_param *stripe_param); - -static inline int llapi_create_volatile(char *directory, int open_flags) +extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +extern int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +extern int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +extern int llapi_hsm_register_event_fifo(const char *path); +extern int llapi_hsm_unregister_event_fifo(const char *path); +extern void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +extern int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +extern int llapi_create_volatile_idx(char *directory, int idx, int mode); +static inline int llapi_create_volatile(char *directory, int mode) { - return llapi_create_volatile_idx(directory, -1, open_flags); + return llapi_create_volatile_idx(directory, -1, mode); } -int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, - int gid, __u64 flags); -int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags); -int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, - __u64 dv2, __u64 flags); +extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, + __u64 flags); +extern int llapi_swap_layouts(const char *path1, const char *path2, + __u64 dv1, __u64 dv2, __u64 flags); /* Changelog interface. 
priv is private state, managed internally by these * functions */ @@ -453,18 +384,15 @@ int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, * converted to extended format in the lustre api to ease changelog analysis. */ #define HAVE_CHANGELOG_EXTEND_REC 1 -int llapi_changelog_start(void **priv, enum changelog_send_flag flags, - const char *mdtname, long long startrec); -int llapi_changelog_fini(void **priv); -int llapi_changelog_recv(void *priv, struct changelog_rec **rech); -int llapi_changelog_in_buf(void *priv); -int llapi_changelog_free(struct changelog_rec **rech); -int llapi_changelog_get_fd(void *priv); +extern int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +extern int llapi_changelog_fini(void **priv); +extern int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +extern int llapi_changelog_free(struct changelog_rec **rech); +extern int llapi_changelog_get_fd(void *priv); /* Allow records up to endrec to be destroyed; requires registered id. */ -int llapi_changelog_clear(const char *mdtname, const char *idstr, - long long endrec); -extern int llapi_changelog_set_xflags(void *priv, - enum changelog_send_extra_flag extra_flags); +extern int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); /* HSM copytool interface. * priv is private state, managed internally by these functions @@ -472,51 +400,52 @@ extern int llapi_changelog_set_xflags(void *priv, struct hsm_copytool_private; struct hsm_copyaction_private; -int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, - const char *mnt, int archive_count, - int *archives, int rfd_flags); -int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); -int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); -int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, - struct hsm_action_list **hal, int *msgsize); -int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, - const struct hsm_copytool_private *ct, - const struct hsm_action_item *hai, - int restore_mdt_index, int restore_open_flags, - bool is_error); -int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, - const struct hsm_extent *he, int hp_flags, int errval); -int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, - const struct hsm_extent *he, __u64 total, - int hp_flags); -int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, - struct lu_fid *fid); -int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); -int llapi_hsm_import(const char *dst, int archive, const struct stat *st, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern, char *pool_name, - struct lu_fid *newfid); +extern int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +extern int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +extern int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +extern int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +extern int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const struct hsm_extent *he, + int hp_flags, int 
errval); +extern int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +extern int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + lustre_fid *fid); +extern int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +extern int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, + char *pool_name, lustre_fid *newfid); /* HSM user interface */ -struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, - int data_len); -int llapi_hsm_request(const char *path, const struct hsm_user_request *request); -int llapi_hsm_current_action(const char *path, struct hsm_current_action *hca); +extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +extern int llapi_hsm_request(const char *path, + const struct hsm_user_request *request); +extern int llapi_hsm_current_action(const char *path, + struct hsm_current_action *hca); /* JSON handling */ -int llapi_json_init_list(struct llapi_json_item_list **item_list); -int llapi_json_destroy_list(struct llapi_json_item_list **item_list); -int llapi_json_add_item(struct llapi_json_item_list **item_list, char *key, - __u32 type, void *val); -int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp); +extern int llapi_json_init_list(struct llapi_json_item_list **item_list); +extern int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +extern int llapi_json_add_item(struct llapi_json_item_list **item_list, + char *key, __u32 type, void *val); +extern int llapi_json_write_list(struct llapi_json_item_list **item_list, + FILE *fp); /* File lease */ -int llapi_lease_acquire(int fd, enum ll_lease_mode mode); -int llapi_lease_release(int fd); -int llapi_lease_set(int fd, const struct ll_ioc_lease *data); -int llapi_lease_check(int fd); -int llapi_lease_get(int fd, int mode); /* obsoleted */ -int llapi_lease_put(int fd); /* obsoleted */ +extern int llapi_lease_get(int fd, int mode); +extern int llapi_lease_check(int fd); +extern int llapi_lease_put(int fd); /* Group lock */ int llapi_group_lock(int fd, int gid); @@ -529,33 +458,9 @@ int llapi_ladvise(int fd, unsigned long long flags, int num_advise, /* llapi_layout user interface */ -/** - * An array element storing component info to be resynced during mirror - * resynchronization. - */ -struct llapi_resync_comp { - uint64_t lrc_start; - uint64_t lrc_end; - uint32_t lrc_mirror_id; - uint32_t lrc_id; /* component id */ - bool lrc_synced; -}; - /** Opaque data type abstracting the layout of a Lustre file. */ struct llapi_layout; -int llapi_mirror_truncate(int fd, unsigned int id, off_t length); -ssize_t llapi_mirror_write(int fd, unsigned int id, const void *buf, - size_t count, off_t pos); -uint32_t llapi_mirror_find(struct llapi_layout *layout, - uint64_t file_start, uint64_t file_end, - uint64_t *endp); -int llapi_mirror_find_stale(struct llapi_layout *layout, - struct llapi_resync_comp *comp, size_t comp_size, - __u16 *mirror_ids, int ids_nr); -int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, - struct llapi_resync_comp *comp_array, - int comp_size, uint64_t start, uint64_t end); /* * Flags to control how layouts are retrieved. 
*/ @@ -582,8 +487,8 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); /** * Return a pointer to a newly-allocated opaque data type containing the - * layout for the file associated with Lustre file identifier - * \a fid. The string \a path must name a path within the + * layout for the file associated with Lustre file identifier string + * \a fidstr. The string \a path must name a path within the * filesystem that contains the file being looked up, such as the * filesystem root. The returned pointer should be freed with * llapi_layout_free() when it is no longer needed. Failure is @@ -591,35 +496,9 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); * stored in errno. */ struct llapi_layout *llapi_layout_get_by_fid(const char *path, - const struct lu_fid *fid, + const lustre_fid *fid, uint32_t flags); -enum llapi_layout_xattr_flags { - LLAPI_LXF_CHECK = 0x0001, - LLAPI_LXF_COPY = 0x0002, -}; - -/** - * Return a pointer to a newly-allocated opaque data type containing the - * layout for the file associated with extended attribute \a lov_xattr. The - * length of the extended attribute is \a lov_xattr_size. The \a lov_xattr - * should be raw xattr without being swapped, since this function will swap it - * properly. Thus, \a lov_xattr will be modified during the process. If the - * \a LLAPI_LXF_CHECK flag of \a flags is set, this function will check whether - * the objects count in lum is consistent with the stripe count in lum. This - * check only apply to regular file, so \a LLAPI_LXF_CHECK flag should be - * cleared if the xattr belongs to a directory. If the \a LLAPI_LXF_COPY flag - * of \a flags is set, this function will use a temporary buffer for byte - * swapping when necessary, leaving \a lov_xattr untouched. Otherwise, the byte - * swapping will be done to the \a lov_xattr buffer directly. The returned - * pointer should be freed with llapi_layout_free() when it is no longer - * needed. Failure is * indicated with a NULL return value and an appropriate - * error code stored in errno. - */ -struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr, - ssize_t lov_xattr_size, - uint32_t flags); - /** * Allocate a new layout. Use this when creating a new file with * llapi_layout_file_create(). @@ -631,19 +510,6 @@ struct llapi_layout *llapi_layout_alloc(void); */ void llapi_layout_free(struct llapi_layout *layout); -/** - * llapi_layout_merge() - Merge a composite layout into another one. - * @dst_layout: Destination composite layout. - * @src_layout: Source composite layout. - * - * This function copies all of the components from @src_layout and - * appends them to @dst_layout. - * - * Return: 0 on success or -1 on failure. - */ -int llapi_layout_merge(struct llapi_layout **dst_layout, - const struct llapi_layout *src_layout); - /** Not a valid stripe size, offset, or RAID pattern. */ #define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL @@ -665,8 +531,7 @@ int llapi_layout_merge(struct llapi_layout **dst_layout, * stored using RAID0. That is, data will be split evenly and without * redundancy across all OSTs in the layout. */ -#define LLAPI_LAYOUT_RAID0 0ULL -#define LLAPI_LAYOUT_MDT 2ULL +#define LLAPI_LAYOUT_RAID0 0 /** * The layout includes a specific set of OSTs on which to allocate. @@ -866,39 +731,6 @@ int llapi_layout_file_open(const char *path, int open_flags, mode_t mode, int llapi_layout_file_create(const char *path, int open_flags, int mode, const struct llapi_layout *layout); -/** - * Set flags to the header of component layout. 
- */ -int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags); -int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags); -const char *llapi_layout_flags_string(uint32_t flags); -const __u16 llapi_layout_string_flags(char *string); - -/** - * llapi_layout_mirror_count_get() - Get mirror count from the header of - * a layout. - * @layout: Layout to get mirror count from. - * @count: Returned mirror count value. - * - * This function gets mirror count from the header of a layout. - * - * Return: 0 on success or -1 on failure. - */ -int llapi_layout_mirror_count_get(struct llapi_layout *layout, - uint16_t *count); - -/** - * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout. - * @layout: Layout to set mirror count in. - * @count: Mirror count value to be set. - * - * This function sets mirror count to the header of a layout. - * - * Return: 0 on success or -1 on failure. - */ -int llapi_layout_mirror_count_set(struct llapi_layout *layout, - uint16_t count); - /** * Fetch the start and end offset of the current layout component. */ @@ -916,10 +748,12 @@ static const struct comp_flag_name { const char *cfn_name; } comp_flags_table[] = { { LCME_FL_INIT, "init" }, + /* For now, only "init" is supported + { LCME_FL_PRIMARY, "primary" }, { LCME_FL_STALE, "stale" }, - { LCME_FL_PREF_RW, "prefer" }, { LCME_FL_OFFLINE, "offline" }, - { LCME_FL_NOSYNC, "nosync" }, + { LCME_FL_PREFERRED, "preferred" } + */ }; /** @@ -939,18 +773,10 @@ int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); * Fetches the file-unique component ID of the current layout component. */ int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); -/** - * Fetches the mirror ID of the current layout component. - */ -int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); /** * Adds one component to the existing composite or plain layout. */ int llapi_layout_comp_add(struct llapi_layout *layout); -/** - * Adds a first component of a mirror to the existing composite layout. - */ -int llapi_layout_add_first_comp(struct llapi_layout *layout); /** * Deletes the current layout component from the composite layout. */ @@ -987,52 +813,10 @@ int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); * attributes are passed in by @comp and @valid is used to specify which * attributes in the component are going to be changed. */ -int llapi_layout_file_comp_set(const char *path, uint32_t *ids, uint32_t *flags, - size_t count); -/** - * Check if the file layout is composite. - */ -bool llapi_layout_is_composite(struct llapi_layout *layout); - -enum { - LLAPI_LAYOUT_ITER_CONT = 0, - LLAPI_LAYOUT_ITER_STOP = 1, -}; - -/** - * Iteration callback function. 
- * - * \retval LLAPI_LAYOUT_ITER_CONT Iteration proceeds - * \retval LLAPI_LAYOUT_ITER_STOP Stop iteration - * \retval < 0 error code - */ -typedef int (*llapi_layout_iter_cb)(struct llapi_layout *layout, void *cbdata); - -/** - * Iterate all components in the corresponding layout - */ -int llapi_layout_comp_iterate(struct llapi_layout *layout, - llapi_layout_iter_cb cb, void *cbdata); - -/** - * FLR: mirror operation APIs - */ -int llapi_mirror_set(int fd, unsigned int id); -int llapi_mirror_clear(int fd); -ssize_t llapi_mirror_read(int fd, unsigned int id, - void *buf, size_t count, off_t pos); -ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); -int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, - off_t pos, size_t count); - -int llapi_param_get_paths(const char *pattern, glob_t *paths); -int llapi_param_get_value(const char *path, char **buf, size_t *buflen); -void llapi_param_paths_free(glob_t *paths); +int llapi_layout_file_comp_set(const char *path, + const struct llapi_layout *comp, + uint32_t valid); /** @} llapi */ -#if defined(__cplusplus) -} -#endif - #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h index 933d09ab4ef1f..beab4a225119f 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h index df6f78bb4b29b..231eae97972ee 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2016, Intel Corporation. * * lustre/include/lustre_barrier.h * diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index e56f9abf7c8ec..441f737170daa 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,19 +33,15 @@ #ifndef _LUSTRE_COMPAT_H #define _LUSTRE_COMPAT_H -#include -#include #include #include #include #include #include -#include -#include #include #include -#include +#include #include #include @@ -84,6 +80,22 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, path_put(&old_pwd); } +/* + * set ATTR_BLOCKS to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_BLOCKS (1 << 27) + +/* + * In more recent kernels, this flag was removed because nobody was using it. + * But Lustre does. So define it if needed. It is safe to do so, since it's + * not been replaced with a different flag with the same value, and Lustre + * only uses it internally. 
+ */ +#ifndef ATTR_ATTR_FLAG +#define ATTR_ATTR_FLAG (1 << 10) +#endif + #define current_ngroups current_cred()->group_info->ngroups #define current_groups current_cred()->group_info->small_block @@ -144,12 +156,8 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define simple_setattr(dentry, ops) inode_setattr((dentry)->d_inode, ops) #endif -#ifndef HAVE_INIT_LIST_HEAD_RCU -static inline void INIT_LIST_HEAD_RCU(struct list_head *list) -{ - WRITE_ONCE(list->next, list); - WRITE_ONCE(list->prev, list); -} +#ifndef SLAB_DESTROY_BY_RCU +#define SLAB_DESTROY_BY_RCU 0 #endif #ifndef HAVE_DQUOT_SUSPEND @@ -182,12 +190,6 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) #define bvl_to_page(bvl) (bvl->bv_page) #endif -#ifdef HAVE_BVEC_ITER -#define bio_start_sector(bio) (bio->bi_iter.bi_sector) -#else -#define bio_start_sector(bio) (bio->bi_sector) -#endif - #ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS #define blk_queue_max_segments(rq, seg) \ do { blk_queue_max_phys_segments(rq, seg); \ @@ -404,16 +406,6 @@ static inline void truncate_inode_pages_final(struct address_space *map) } #endif -#ifndef HAVE_PTR_ERR_OR_ZERO -static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) -{ - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - else - return 0; -} -#endif - #ifndef SIZE_MAX #define SIZE_MAX (~(size_t)0) #endif @@ -444,11 +436,9 @@ static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) #endif #ifdef HAVE_PID_NS_FOR_CHILDREN -# define ll_task_pid_ns(task) \ - ((task)->nsproxy ? ((task)->nsproxy->pid_ns_for_children) : NULL) +# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns_for_children) #else -# define ll_task_pid_ns(task) \ - ((task)->nsproxy ? ((task)->nsproxy->pid_ns) : NULL) +# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns) #endif #ifdef HAVE_FULL_NAME_HASH_3ARGS @@ -482,30 +472,37 @@ int ll_removexattr(struct dentry *dentry, const char *name); #ifndef HAVE_VFS_SETXATTR const struct xattr_handler *get_xattr_type(const char *name); +#ifdef HAVE_XATTR_HANDLER_FLAGS static inline int __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { -# ifdef HAVE_XATTR_HANDLER_FLAGS const struct xattr_handler *handler; int rc; handler = get_xattr_type(name); if (!handler) - return -EOPNOTSUPP; - -# if defined(HAVE_XATTR_HANDLER_INODE_PARAM) - rc = handler->set(handler, dentry, inode, name, value, size, flags); -# elif defined(HAVE_XATTR_HANDLER_SIMPLIFIED) - rc = handler->set(handler, dentry, name, value, size, flags); -# else - rc = handler->set(dentry, name, value, size, flags, handler->flags); -# endif /* !HAVE_XATTR_HANDLER_INODE_PARAM */ + return -ENXIO; + +#if defined(HAVE_XATTR_HANDLER_INODE_PARAM) + rc = handler->set(handler, dentry, inode, name, value, size, + XATTR_CREATE); +#elif defined(HAVE_XATTR_HANDLER_SIMPLIFIED) + rc = handler->set(handler, dentry, name, value, size, XATTR_CREATE); +#else + rc = handler->set(dentry, name, value, size, XATTR_CREATE, + handler->flags); +#endif /* !HAVE_XATTR_HANDLER_INODE_PARAM */ return rc; -# else /* !HAVE_XATTR_HANDLER_FLAGS */ +} +#else /* !HAVE_XATTR_HANDLER_FLAGS */ +static inline int +__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ return ll_setxattr(dentry, name, value, size, flags); -# endif /* HAVE_XATTR_HANDLER_FLAGS */ } +#endif /* HAVE_XATTR_HANDLER_FLAGS */ #endif /* HAVE_VFS_SETXATTR */ #ifdef HAVE_IOP_SET_ACL @@ -692,122 +689,10 @@ 
static inline struct timespec current_time(struct inode *inode) } #endif -#ifndef time_after32 -/** - * time_after32 - compare two 32-bit relative times - * @a: the time which may be after @b - * @b: the time which may be before @a - * - * time_after32(a, b) returns true if the time @a is after time @b. - * time_before32(b, a) returns true if the time @b is before time @a. - * - * Similar to time_after(), compare two 32-bit timestamps for relative - * times. This is useful for comparing 32-bit seconds values that can't - * be converted to 64-bit values (e.g. due to disk format or wire protocol - * issues) when it is known that the times are less than 68 years apart. - */ -#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) -#define time_before32(b, a) time_after32(a, b) - -#endif - #ifndef __GFP_COLD #define __GFP_COLD 0 #endif -#ifndef alloc_workqueue -#define alloc_workqueue(name, flags, max_active) create_workqueue(name) -#endif - -#ifndef READ_ONCE -#define READ_ONCE ACCESS_ONCE -#endif - -#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) -static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) -{ -#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY - return bi->interval_exp ? 1 << bi->interval_exp : 0; -#elif defined(HAVE_INTERVAL_BLK_INTEGRITY) - return bi->interval; -#else - return bi->sector_size; -#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ -} - -static inline const char *blk_integrity_name(struct blk_integrity *bi) -{ -#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY - return bi->profile->name; -#else - return bi->name; -#endif -} - -static inline unsigned int bip_size(struct bio_integrity_payload *bip) -{ -#ifdef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD - return bip->bip_iter.bi_size; -#else - return bip->bip_size; -#endif -} -#else /* !CONFIG_BLK_DEV_INTEGRITY */ -static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) -{ - return 0; -} -static inline const char *blk_integrity_name(struct blk_integrity *bi) -{ - /* gcc8 dislikes when strcmp() is called against NULL */ - return ""; -} -#endif /* !CONFIG_BLK_DEV_INTEGRITY */ - -#ifndef INTEGRITY_FLAG_READ -#define INTEGRITY_FLAG_READ BLK_INTEGRITY_VERIFY -#endif - -#ifndef INTEGRITY_FLAG_WRITE -#define INTEGRITY_FLAG_WRITE BLK_INTEGRITY_GENERATE -#endif - -static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) -{ -#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi == NULL) - return false; - -#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY - if (rw == 0 && bi->profile->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return true; - - if (rw == 1 && bi->profile->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return true; -#else - if (rw == 0 && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return true; - - if (rw == 1 && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return true; -#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ -#endif /* !CONFIG_BLK_DEV_INTEGRITY */ - - return false; -} - -#ifdef HAVE_PAGEVEC_INIT_ONE_PARAM -#define ll_pagevec_init(pvec, n) pagevec_init(pvec) -#else -#define ll_pagevec_init(pvec, n) pagevec_init(pvec, n) -#endif - #ifdef HAVE_I_PAGES #define page_tree i_pages #else @@ -816,16 +701,16 @@ static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) #define xa_unlock_irq(lockp) spin_unlock_irq(lockp) #endif +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +#define selinux_is_enabled() 1 +#endif + #ifndef KMEM_CACHE_USERCOPY #define 
kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ usersize, ctor) \ kmem_cache_create(name, size, align, flags, ctor) #endif -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -#define selinux_is_enabled() 1 -#endif - static inline void ll_security_release_secctx(char *secdata, u32 seclen) { #ifdef HAVE_SEC_RELEASE_SECCTX_1ARG diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h index c121ab18420d2..9b20b7ba8f09e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -48,8 +48,8 @@ #include #include #include -#include -#include +#include +#include #define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) #define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) @@ -111,7 +111,6 @@ struct lustre_mount_data { /****************** superblock additional info *********************/ struct ll_sb_info; -struct kobject; struct lustre_sb_info { int lsi_flags; @@ -120,7 +119,6 @@ struct lustre_sb_info { struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ atomic_t lsi_mounts; /* references to the srv_mnt */ - struct kobject *lsi_kobj; char lsi_svname[MTI_NAME_MAXLEN]; /* lsi_osd_obdname format = 'lsi->ls_svname'-osd */ char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; @@ -131,9 +129,8 @@ struct lustre_sb_info { char lsi_fstype[16]; struct backing_dev_info lsi_bdi; /* each client mountpoint needs own backing_dev_info */ - /* protect lsi_lwp_list */ - struct mutex lsi_lwp_mutex; struct list_head lsi_lwp_list; + spinlock_t lsi_lwp_lock; unsigned long lsi_lwp_started:1; }; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h index c6291b62f4259..3eed4226f85a7 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -64,9 +64,6 @@ extern struct kset *ldlm_svc_kset; #define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) #define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */ #define LDLM_CTIME_AGE_LIMIT (10) -/* if client lock is unused for that time it can be cancelled if any other - * client shows interest in that lock, e.g. glimpse is occured. */ -#define LDLM_DIRTY_AGE_LIMIT (10) #define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 /** @@ -235,8 +232,8 @@ struct ldlm_pool_ops { * This feature is commonly referred to as lru_resize. */ struct ldlm_pool { - /** Pool debugfs directory. */ - struct dentry *pl_debugfs_entry; + /** Pool proc directory. */ + struct proc_dir_entry *pl_proc_dir; /** Pool name, must be long enough to hold compound proc entry name. */ char pl_name[100]; /** Lock for protecting SLV/CLV updates. 
*/ @@ -272,10 +269,9 @@ struct ldlm_pool { struct completion pl_kobj_unregister; }; -typedef int (*ldlm_res_policy)(const struct lu_env *env, - struct ldlm_namespace *, - struct ldlm_lock **, void *req_cookie, - enum ldlm_mode mode, __u64 flags, void *data); +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, + void *req_cookie, enum ldlm_mode mode, + __u64 flags, void *data); typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); @@ -293,10 +289,11 @@ typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); * of ldlm_[res_]lvbo_[init,update,fill]() functions. */ struct ldlm_valblock_ops { - int (*lvbo_init)(struct ldlm_resource *res); - int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock, - struct ptlrpc_request *r, int increase); - int (*lvbo_free)(struct ldlm_resource *res); + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, + struct ptlrpc_request *r, + int increase); + int (*lvbo_free)(struct ldlm_resource *res); /* Return size of lvb data appropriate RPC size can be reserved */ int (*lvbo_size)(struct ldlm_lock *lock); /* Called to fill in lvb data to RPC buffer @buf */ @@ -351,14 +348,6 @@ enum ldlm_ns_type { LDLM_NS_TYPE_MGT, /**< MGT namespace */ }; -enum ldlm_namespace_flags { - /** - * Flag to indicate the LRU cancel is in progress. - * Used to limit the process by 1 thread only. - */ - LDLM_LRU_CANCEL = 0 -}; - /** * LDLM Namespace. * @@ -387,9 +376,6 @@ struct ldlm_namespace { /** Flag indicating if namespace is on client instead of server */ enum ldlm_side ns_client; - /** name of this namespace */ - char *ns_name; - /** Resource hash table for namespace. */ struct cfs_hash *ns_rs_hash; @@ -408,8 +394,8 @@ struct ldlm_namespace { /** Client side original connect flags supported by server. */ __u64 ns_orig_connect_flags; - /* namespace debugfs dir entry */ - struct dentry *ns_debugfs_entry; + /* namespace proc dir entry */ + struct proc_dir_entry *ns_proc_dir_entry; /** * Position in global namespace list linking all namespaces on @@ -453,20 +439,14 @@ struct ldlm_namespace { * This allows the client to start caching negative dentries * for a directory and may save an RPC for a later stat. */ - time64_t ns_ctime_age_limit; - /** - * Number of seconds since the lock was last used. The client may - * cancel the lock limited by this age and flush related data if - * any other client shows interest in it doing glimpse request. - * This allows to cache stat data locally for such files early. - */ - time64_t ns_dirty_age_limit; + unsigned int ns_ctime_age_limit; + /** * Used to rate-limit ldlm_namespace_dump calls. * \see ldlm_namespace_dump. Increased by 10 seconds every time * it is called. */ - time64_t ns_next_dump; + cfs_time_t ns_next_dump; /** "policy" function that does actual lock conflict determination */ ldlm_res_policy ns_policy; @@ -504,7 +484,7 @@ struct ldlm_namespace { * The resources in this namespace remember contended state during * \a ns_contention_time, in seconds. */ - time64_t ns_contention_time; + unsigned ns_contention_time; /** * Limit size of contended extent locks, in bytes. @@ -539,11 +519,6 @@ struct ldlm_namespace { struct kobject ns_kobj; /* sysfs object */ struct completion ns_kobj_unregister; - - /** - * To avoid another ns_lock usage, a separate bitops field. 
- */ - unsigned long ns_flags; }; /** @@ -552,6 +527,8 @@ struct ldlm_namespace { static inline int ns_is_client(struct ldlm_namespace *ns) { LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || ns->ns_client == LDLM_NAMESPACE_SERVER); return ns->ns_client == LDLM_NAMESPACE_CLIENT; @@ -563,6 +540,8 @@ static inline int ns_is_client(struct ldlm_namespace *ns) static inline int ns_is_server(struct ldlm_namespace *ns) { LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || ns->ns_client == LDLM_NAMESPACE_SERVER); return ns->ns_client == LDLM_NAMESPACE_SERVER; @@ -605,9 +584,6 @@ typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, /** Type for glimpse callback function of a lock. */ typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); -/** Type for created callback function of a lock. */ -typedef void (*ldlm_created_callback)(struct ldlm_lock *lock); - /** Work list for sending GL ASTs to multiple locks. */ struct ldlm_glimpse_work { struct ldlm_lock *gl_lock; /* lock to glimpse */ @@ -619,11 +595,6 @@ struct ldlm_glimpse_work { void *gl_interpret_data; }; -struct ldlm_bl_desc { - unsigned int bl_same_client:1, - bl_cos_incompat:1; -}; - struct ldlm_cb_set_arg { struct ptlrpc_request_set *set; int type; /* LDLM_{CP,BL,GL}_CALLBACK */ @@ -632,7 +603,6 @@ struct ldlm_cb_set_arg { union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ ptlrpc_interpterer_t gl_interpret_reply; void *gl_interpret_data; - struct ldlm_bl_desc *bl_desc; }; struct ldlm_cb_async_args { @@ -640,8 +610,8 @@ struct ldlm_cb_async_args { struct ldlm_lock *ca_lock; }; -/** The ldlm_glimpse_work was slab allocated & must be freed accordingly.*/ -#define LDLM_GL_WORK_SLAB_ALLOCATED 0x1 +/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ +#define LDLM_GL_WORK_NOFREE 0x1 /** Interval node data for each LDLM_EXTENT lock. */ struct ldlm_interval { @@ -664,19 +634,6 @@ struct ldlm_interval_tree { struct interval_node *lit_root; /* actual ldlm_interval */ }; -/** - * Lists of waiting locks for each inodebit type. - * A lock can be in several liq_waiting lists and it remains in lr_waiting. - */ -struct ldlm_ibits_queues { - struct list_head liq_waiting[MDS_INODELOCK_NUMBITS]; -}; - -struct ldlm_ibits_node { - struct list_head lin_link[MDS_INODELOCK_NUMBITS]; - struct ldlm_lock *lock; -}; - /** Whether to track references to exports by LDLM locks. */ #define LUSTRE_TRACKS_LOCK_EXP_REFS (0) @@ -767,17 +724,14 @@ struct ldlm_lock { struct list_head l_lru; /** * Linkage to resource's lock queues according to current lock state. - * (could be granted or waiting) + * (could be granted, waiting or converting) * Protected by lr_lock in struct ldlm_resource. */ struct list_head l_res_link; /** - * Internal structures per lock type.. + * Tree node for ldlm_extent. */ - union { - struct ldlm_interval *l_tree_node; - struct ldlm_ibits_node *l_ibits_node; - }; + struct ldlm_interval *l_tree_node; /** * Per export hash of locks. * Protected by per-bucket exp->exp_lock_hash locks. @@ -891,13 +845,10 @@ struct ldlm_lock { * the lock at client, e.g. enqueue the lock. For server it is the * time when blocking ast was sent. */ - time64_t l_activity; - time64_t l_blast_sent; + time64_t l_activity; + time64_t l_blast_sent; }; - /* separate ost_lvb used mostly by Data-on-MDT for now. 
- * It is introduced to don't mix with layout lock data. */ - struct ost_lvb l_ost_lvb; /* * Server-side-only members. */ @@ -925,7 +876,7 @@ struct ldlm_lock { * under this lock. * \see ost_rw_prolong_locks */ - time64_t l_callback_timeout; + cfs_time_t l_callback_timeout; /** Local PID of process which created this lock. */ __u32 l_pid; @@ -977,20 +928,6 @@ struct ldlm_lock { struct list_head l_exp_list; }; -/** - * Describe the overlap between two locks. itree_overlap_cb data. - */ -struct ldlm_match_data { - struct ldlm_lock *lmd_old; - struct ldlm_lock *lmd_lock; - enum ldlm_mode *lmd_mode; - union ldlm_policy_data *lmd_policy; - __u64 lmd_flags; - __u64 lmd_skip_flags; - int lmd_unref; - bool lmd_has_ast_data; -}; - /** For uncommitted cross-MDT lock, store transno this lock belongs to */ #define l_transno l_client_cookie @@ -998,15 +935,6 @@ struct ldlm_match_data { * which is for server. */ #define l_slc_link l_rk_ast -#define HANDLE_MAP_SIZE ((LMV_MAX_STRIPE_COUNT + 7) >> 3) - -struct lustre_handle_array { - unsigned int ha_count; - /* ha_map is used as bit flag to indicate handle is remote or local */ - char ha_map[HANDLE_MAP_SIZE]; - struct lustre_handle ha_handles[0]; -}; - /** * LDLM resource description. * Basically, resource is a representation for a single object. @@ -1038,6 +966,8 @@ struct ldlm_resource { * @{ */ /** List of locks in granted state */ struct list_head lr_granted; + /** List of locks waiting to change their granted mode (converted) */ + struct list_head lr_converting; /** * List of locks that could not be granted due to conflicts and * that are waiting for conflicts to go away */ @@ -1047,21 +977,16 @@ struct ldlm_resource { /** Resource name */ struct ldlm_res_id lr_name; - union { - /** - * Interval trees (only for extent locks) for all modes of - * this resource - */ - struct ldlm_interval_tree *lr_itree; - struct ldlm_ibits_queues *lr_ibits_queues; - }; + /** + * Interval trees (only for extent locks) for all modes of this resource + */ + struct ldlm_interval_tree *lr_itree; union { /** * When the resource was considered as contended, - * used only on server side. - */ - time64_t lr_contention_time; + * used only on server side. */ + cfs_time_t lr_contention_time; /** * Associated inode, used only on client side. */ @@ -1086,27 +1011,16 @@ struct ldlm_resource { struct lu_ref lr_reference; }; -static inline int ldlm_is_granted(struct ldlm_lock *lock) -{ - return lock->l_req_mode == lock->l_granted_mode; -} - static inline bool ldlm_has_layout(struct ldlm_lock *lock) { return lock->l_resource->lr_type == LDLM_IBITS && lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; } -static inline bool ldlm_has_dom(struct ldlm_lock *lock) -{ - return lock->l_resource->lr_type == LDLM_IBITS && - lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM; -} - static inline char * ldlm_ns_name(struct ldlm_namespace *ns) { - return ns->ns_name; + return ns->ns_rs_hash->hs_name; } static inline struct ldlm_namespace * @@ -1213,11 +1127,10 @@ struct ldlm_enqueue_info { void *ei_cb_local_bl; /** blocking local lock callback */ void *ei_cb_cp; /** lock completion callback */ void *ei_cb_gl; /** lock glimpse callback */ - ldlm_created_callback ei_cb_created; /** lock created callback */ void *ei_cbdata; /** Data to be passed into callbacks. 
*/ void *ei_namespace; /** lock namespace **/ - u64 ei_inodebits; /** lock inode bits **/ - unsigned int ei_enq_slave:1; /** whether enqueue slave stripes */ + unsigned int ei_enq_slave:1, /** whether enqueue slave stripes */ + ei_nonblock:1; /** non block enqueue */ }; #define ei_res_id ei_cb_gl @@ -1290,21 +1203,21 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, * LDLM_PROCESS_RESCAN: * * It's used when policy functions are called from ldlm_reprocess_queue() to - * reprocess the wait list and try to grant locks, blocking ASTs + * reprocess the wait & convert list and try to grant locks, blocking ASTs * have already been sent in this situation, completion ASTs need be sent for * the locks being granted. * * LDLM_PROCESS_ENQUEUE: * * It's used when policy functions are called from ldlm_lock_enqueue() to - * process the wait list for handling an enqueue request, blocking + * process the wait & convert list for handling an enqueue request, blocking * ASTs have not been sent yet, so list of conflicting locks would be * collected and ASTs sent. * * LDLM_PROCESS_RECOVERY: * * It's used when policy functions are called from ldlm_reprocess_queue() to - * reprocess the wait list when recovery done. In case of blocking + * reprocess the wait & convert list when recovery done. In case of blocking * ASTs are lost before recovery, it needs not only to grant locks if * available, but also send blocking ASTs to the locks doesn't have AST sent * flag. Completion ASTs need be sent for the locks being granted. @@ -1320,12 +1233,6 @@ typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list); -typedef int (*ldlm_reprocessing_policy)(struct ldlm_resource *res, - struct list_head *queue, - struct list_head *work_list, - enum ldlm_process_intention intention, - struct ldlm_lock *hint); - /** * Return values for lock iterators. * Also used during deciding of lock grants and cancellations. 
@@ -1362,7 +1269,7 @@ struct ldlm_prolong_args { struct ldlm_res_id lpa_resid; struct ldlm_extent lpa_extent; enum ldlm_mode lpa_mode; - time64_t lpa_timeout; + int lpa_timeout; int lpa_locks_cnt; int lpa_blocks_cnt; }; @@ -1396,11 +1303,14 @@ int ldlm_glimpse_locks(struct ldlm_resource *res, * MDT or OST to pass through LDLM requests to LDLM for handling * @{ */ +int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback, + ldlm_blocking_callback, ldlm_glimpse_callback); int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, - const struct ldlm_request *dlm_req, - const struct ldlm_callback_suite *cbs); + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_handle_convert(struct ptlrpc_request *req); int ldlm_handle_convert0(struct ptlrpc_request *req, - const struct ldlm_request *dlm_req); + const struct ldlm_request *dlm_req); int ldlm_handle_cancel(struct ptlrpc_request *req); int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, @@ -1408,10 +1318,10 @@ int ldlm_request_cancel(struct ptlrpc_request *req, /** @} ldlm_handlers */ void ldlm_revoke_export_locks(struct obd_export *exp); -time64_t ldlm_bl_timeout(struct ldlm_lock *lock); +unsigned int ldlm_bl_timeout(struct ldlm_lock *lock); #endif int ldlm_del_waiting_lock(struct ldlm_lock *lock); -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); int ldlm_get_ref(void); void ldlm_put_ref(void); int ldlm_init_export(struct obd_export *exp); @@ -1421,8 +1331,6 @@ struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); /* ldlm_lock.c */ #ifdef HAVE_SERVER_SUPPORT ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); -ldlm_reprocessing_policy -ldlm_get_reprocessing_policy(struct ldlm_resource *res); #endif void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); void ldlm_lock2handle(const struct ldlm_lock *lock, @@ -1458,11 +1366,9 @@ ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) * Update Lock Value Block Operations (LVBO) on a resource taking into account * data from request \a r */ -static inline int ldlm_lvbo_update(struct ldlm_resource *res, - struct ldlm_lock *lock, - struct ptlrpc_request *req, int increase) +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, int increase) { - struct ldlm_namespace *ns = ldlm_res_to_ns(res); int rc; /* delayed lvb init may be required */ @@ -1472,21 +1378,14 @@ static inline int ldlm_lvbo_update(struct ldlm_resource *res, return rc; } - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update) - return ns->ns_lvbo->lvbo_update(res, lock, req, increase); - + if (ldlm_res_to_ns(res)->ns_lvbo && + ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { + return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req, + increase); + } return 0; } -static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, - struct ptlrpc_request *req, - int increase) -{ - return ldlm_lvbo_update(res, NULL, req, increase); -} - -int is_granted_or_cancelled_nolock(struct ldlm_lock *lock); - int ldlm_error2errno(enum ldlm_error error); enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this * confuses user-space. 
*/ @@ -1549,33 +1448,17 @@ void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); void ldlm_lock_fail_match(struct ldlm_lock *lock); void ldlm_lock_allow_match(struct ldlm_lock *lock); void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); -enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, - __u64 flags, __u64 skip_flags, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lh, - int unref); -static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, - __u64 flags, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lh, - int unref) -{ - return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy, - mode, lh, unref); -} -struct ldlm_lock *search_itree(struct ldlm_resource *res, - struct ldlm_match_data *data); +enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *, enum ldlm_type type, + union ldlm_policy_data *, enum ldlm_mode mode, + struct lustre_handle *, int unref); enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, __u64 *bits); -void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, + enum ldlm_mode new_mode, __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); void ldlm_lock_cancel(struct ldlm_lock *lock); -void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint); +void ldlm_reprocess_all(struct ldlm_resource *res); void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); @@ -1597,40 +1480,12 @@ void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client); void ldlm_namespace_get(struct ldlm_namespace *ns); void ldlm_namespace_put(struct ldlm_namespace *ns); - -int ldlm_debugfs_setup(void); -void ldlm_debugfs_cleanup(void); - -static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, - struct lprocfs_stats *srv_stats) -{ - int lock_type = 0, op = 0; - - lock_type = dlm_req->lock_desc.l_resource.lr_type; - - switch (lock_type) { - case LDLM_PLAIN: - op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; - break; - case LDLM_EXTENT: - op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; - break; - case LDLM_FLOCK: - op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; - break; - case LDLM_IBITS: - op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; - break; - default: - op = 0; - break; - } - - if (op != 0) - lprocfs_counter_incr(srv_stats, op); - - return; -} +int ldlm_proc_setup(void); +#ifdef CONFIG_PROC_FS +void ldlm_proc_cleanup(void); +#else +static inline void ldlm_proc_cleanup(void) {} +#endif /* resource.c - internal */ struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, @@ -1700,8 +1555,7 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, enum ldlm_mode mode, __u64 *flags, void *lvb, __u32 lvb_len, const struct lustre_handle *lockh, int rc); -int ldlm_cli_enqueue_local(const struct lu_env *env, - struct ldlm_namespace *ns, +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_type type, union ldlm_policy_data *policy, enum ldlm_mode mode, __u64 *flags, @@ -1711,9 +1565,8 @@ int ldlm_cli_enqueue_local(const struct lu_env 
*env, void *data, __u32 lvb_len, enum lvb_type lvb_type, const __u64 *client_cookie, struct lustre_handle *lockh); -int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits); -int ldlm_cli_convert(struct ldlm_lock *lock, - enum ldlm_cancel_flags cancel_flags); +int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, + __u32 *flags); int ldlm_cli_update_pool(struct ptlrpc_request *req); int ldlm_cli_cancel(const struct lustre_handle *lockh, enum ldlm_cancel_flags cancel_flags); @@ -1737,15 +1590,8 @@ int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, int ldlm_cli_cancel_list(struct list_head *head, int count, struct ptlrpc_request *req, enum ldlm_cancel_flags flags); - -int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); -int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, - enum ldlm_cancel_flags cancel_flags); - /** @} ldlm_cli_api */ -extern unsigned int ldlm_enqueue_min; - /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. */ int intent_disposition(struct ldlm_reply *rep, int flag); @@ -1793,6 +1639,7 @@ void unlock_res_and_lock(struct ldlm_lock *lock); * There are not used outside of ldlm. * @{ */ +int ldlm_pools_recalc(enum ldlm_side client); int ldlm_pools_init(void); void ldlm_pools_fini(void); @@ -1801,7 +1648,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); void ldlm_pool_fini(struct ldlm_pool *pl); int ldlm_pool_setup(struct ldlm_pool *pl, int limit); -time64_t ldlm_pool_recalc(struct ldlm_pool *pl); +int ldlm_pool_recalc(struct ldlm_pool *pl); __u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); __u64 ldlm_pool_get_slv(struct ldlm_pool *pl); __u64 ldlm_pool_get_clv(struct ldlm_pool *pl); @@ -1826,7 +1673,5 @@ static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, return ex1->start <= ex2->start && ex1->end >= ex2->end; } -int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); - #endif /** @} LDLM */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h index 9fdebcefe66a5..cab4e5f2f702a 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -26,7 +26,7 @@ #ifndef LDLM_ALL_FLAGS_MASK /** l_flags bits marked as "all_flags" bits */ -#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC28F932FULL +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL /** extent, mode, or resource changed */ #define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 @@ -44,7 +44,7 @@ /** * Server placed lock on conv list, or a recovering client wants the lock - * added to the conv list, no questions asked. (obsoleted) */ + * added to the conv list, no questions asked. */ #define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 #define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) #define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) @@ -58,15 +58,6 @@ #define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) #define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) -/** - * Lock request is speculative/asynchronous, and cannot wait for any reason. - * Fail the lock request if any blocking locks are encountered. 
- * */ -#define LDLM_FL_SPECULATIVE 0x0000000000000010ULL /* bit 4 */ -#define ldlm_is_speculative(_l) LDLM_TEST_FLAG((_l), 1ULL << 4) -#define ldlm_set_speculative(_l) LDLM_SET_FLAG((_l), 1ULL << 4) -#define ldlm_clear_specualtive_(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 4) - /** blocking or cancel packet was queued for sending. */ #define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 #define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) @@ -147,35 +138,6 @@ #define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24) #define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24) -/* - * Flag indicates that lock is being converted (downgraded) during the blocking - * AST instead of cancelling. Used for IBITS locks now and drops conflicting - * bits only keepeing other. - */ -#define LDLM_FL_CONVERTING 0x0000000002000000ULL /* bit 25 */ -#define ldlm_is_converting(_l) LDLM_TEST_FLAG((_l), 1ULL << 25) -#define ldlm_set_converting(_l) LDLM_SET_FLAG((_l), 1ULL << 25) -#define ldlm_clear_converting(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 25) - -/** - * Part of original lockahead implementation, OBD_CONNECT_LOCKAHEAD_OLD. - * Reserved temporarily to allow those implementations to keep working. - * Will be removed after 2.12 release. - * */ -#define LDLM_FL_LOCKAHEAD_OLD_RESERVED 0x0000000010000000ULL /* bit 28 */ -#define ldlm_is_do_not_expand_io(_l) LDLM_TEST_FLAG((_l), 1ULL << 28) -#define ldlm_set_do_not_expand_io(_l) LDLM_SET_FLAG((_l), 1ULL << 28) -#define ldlm_clear_do_not_expand_io(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 28) - -/** - * Do not expand this lock. Grant it only on the extent requested. - * Used for manually requested locks from the client (LU_LADVISE_LOCKAHEAD). - * */ -#define LDLM_FL_NO_EXPANSION 0x0000000020000000ULL /* bit 29 */ -#define ldlm_is_do_not_expand(_l) LDLM_TEST_FLAG((_l), 1ULL << 29) -#define ldlm_set_do_not_expand(_l) LDLM_SET_FLAG((_l), 1ULL << 29) -#define ldlm_clear_do_not_expand(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 29) - /** * measure lock contention and return -EUSERS if locking contention is high */ #define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 @@ -392,43 +354,26 @@ #define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57) #define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57) -/** - * This flags means to use non-delay RPC to send dlm request RPC. - */ -#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */ -#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58) -#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58) - -/** - * LVB from this lock is cached in osc object - */ -#define LDLM_FL_LVB_CACHED 0x0800000000000000ULL /* bit 59 */ -#define ldlm_is_lvb_cached(_l) LDLM_TEST_FLAG((_l), 1ULL << 59) -#define ldlm_set_lvb_cached(_l) LDLM_SET_FLAG((_l), 1ULL << 59) -#define ldlm_clear_lvb_cached(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 59) - /** l_flags bits marked as "ast" bits */ #define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ - LDLM_FL_DISCARD_DATA) + LDLM_FL_AST_DISCARD_DATA) /** l_flags bits marked as "blocked" bits */ #define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ + LDLM_FL_BLOCK_CONV |\ LDLM_FL_BLOCK_WAIT) /** l_flags bits marked as "gone" bits */ #define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ LDLM_FL_FAILED) -/** l_flags bits marked as "inherit" bits - * Flags inherited from wire on enqueue/reply between client/server. - * CANCEL_ON_BLOCK so server will not grant if a blocking lock is found - * NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. 
- * TEST_LOCK flag to not let TEST lock to be granted. - * NO_EXPANSION to tell server not to expand extent of lock request */ +/** l_flags bits marked as "inherit" bits */ +/* Flags inherited from wire on enqueue/reply between client/server. */ +/* NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. */ +/* TEST_LOCK flag to not let TEST lock to be granted. */ #define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ LDLM_FL_NO_TIMEOUT |\ - LDLM_FL_TEST_LOCK |\ - LDLM_FL_NO_EXPANSION) + LDLM_FL_TEST_LOCK) /** flags returned in @flags parameter on ldlm_lock_enqueue, * to be re-constructed on re-send */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h index 03b9adc84897c..3061be1bc6124 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h @@ -24,7 +24,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h index 5cf29e1a74d00..8552d3d1c00a7 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_export.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,10 +42,8 @@ * @{ */ -#include - #include -#include +#include #include struct mds_client_data; @@ -101,13 +99,6 @@ struct tg_export_data { long ted_grant; /* in bytes */ long ted_pending; /* bytes just being written */ __u8 ted_pagebits; /* log2 of client page size */ - - /** - * File Modification Data (FMD) tracking - */ - spinlock_t ted_fmd_lock; /* protects ted_fmd_list */ - struct list_head ted_fmd_list; /* FIDs being modified */ - int ted_fmd_count;/* items in ted_fmd_list */ }; /** @@ -128,10 +119,13 @@ struct ec_export_data { /* echo client */ /** Filter (oss-side) specific import data */ struct filter_export_data { struct tg_export_data fed_ted; + spinlock_t fed_lock; /**< protects fed_mod_list */ __u64 fed_lastid_gen; + struct list_head fed_mod_list; /* files being modified */ /* count of SOFT_SYNC RPCs, which will be reset after * ofd_soft_sync_limit number of RPCs, and trigger a sync. 
*/ atomic_t fed_soft_sync_count; + int fed_mod_count;/* items in fed_writing list */ __u32 fed_group; }; @@ -208,8 +202,6 @@ struct obd_export { struct obd_uuid exp_client_uuid; /** To link all exports on an obd device */ struct list_head exp_obd_chain; - /** work_struct for destruction of export */ - struct work_struct exp_zombie_work; /* Unlinked export list */ struct list_head exp_stale_list; struct hlist_node exp_uuid_hash; /** uuid-export hash*/ @@ -247,44 +239,45 @@ struct obd_export { /** Last committed transno for this export */ __u64 exp_last_committed; /** When was last request received */ - time64_t exp_last_request_time; + cfs_time_t exp_last_request_time; /** On replay all requests waiting for replay are linked here */ struct list_head exp_req_replay_queue; /** * protects exp_flags, exp_outstanding_replies and the change * of exp_imp_reverse */ - spinlock_t exp_lock; + spinlock_t exp_lock; /** Compatibility flags for this export are embedded into * exp_connect_data */ - struct obd_connect_data exp_connect_data; - enum obd_option exp_flags; - unsigned long exp_failed:1, - exp_in_recovery:1, - exp_disconnected:1, - exp_connecting:1, - /** VBR: export missed recovery */ - exp_delayed:1, - /** VBR: failed version checking */ - exp_vbr_failed:1, - exp_req_replay_needed:1, - exp_lock_replay_needed:1, - exp_need_sync:1, - exp_flvr_changed:1, - exp_flvr_adapt:1, - /* if to swap nidtbl entries for 2.2 clients. - * Only used by the MGS to fix LU-1644. */ - exp_need_mne_swab:1, - /* The export already got final replay ping - * request. */ - exp_replay_done:1; - /* also protected by exp_lock */ - enum lustre_sec_part exp_sp_peer; - struct sptlrpc_flavor exp_flvr; /* current */ - struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ - time64_t exp_flvr_expire[2]; /* seconds */ - - /** protects exp_hp_rpcs */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + exp_libclient:1, /* liblustre client? */ + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. 
*/ + exp_replay_done:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ spinlock_t exp_rpc_lock; struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ struct list_head exp_reg_rpcs; /* RPC being handled */ @@ -325,18 +318,6 @@ static inline __u64 exp_connect_flags(struct obd_export *exp) return *exp_connect_flags_ptr(exp); } -static inline __u64 *exp_connect_flags2_ptr(struct obd_export *exp) -{ - return &exp->exp_connect_data.ocd_connect_flags2; -} - -static inline __u64 exp_connect_flags2(struct obd_export *exp) -{ - if (exp_connect_flags(exp) & OBD_CONNECT_FLAGS2) - return *exp_connect_flags2_ptr(exp); - return 0; -} - static inline int exp_max_brw_size(struct obd_export *exp) { LASSERT(exp != NULL); @@ -351,6 +332,13 @@ static inline int exp_connect_multibulk(struct obd_export *exp) return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; } +static inline int exp_expired(struct obd_export *exp, cfs_duration_t age) +{ + LASSERT(exp->exp_delayed); + return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age), + cfs_time_current_sec()); +} + static inline int exp_connect_cancelset(struct obd_export *exp) { LASSERT(exp != NULL); @@ -419,13 +407,6 @@ static inline bool imp_connect_disp_stripe(struct obd_import *imp) return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; } -static inline bool imp_connect_shortio(struct obd_import *imp) -{ - struct obd_connect_data *ocd = &imp->imp_connect_data; - - return ocd->ocd_connect_flags & OBD_CONNECT_SHORTIO; -} - static inline __u64 exp_connect_ibits(struct obd_export *exp) { struct obd_connect_data *ocd; @@ -439,50 +420,13 @@ static inline int exp_connect_large_acl(struct obd_export *exp) return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); } -static inline int exp_connect_lockahead_old(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_LOCKAHEAD_OLD); -} - -static inline int exp_connect_lockahead(struct obd_export *exp) -{ - return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCKAHEAD); -} - -static inline int exp_connect_flr(struct obd_export *exp) -{ - return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); -} - -static inline int exp_connect_lock_convert(struct obd_export *exp) -{ - return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCK_CONVERT); -} - extern struct obd_export *class_conn2export(struct lustre_handle *conn); +extern struct obd_device *class_conn2obd(struct lustre_handle *conn); -static inline int exp_connect_archive_id_array(struct obd_export *exp) -{ - return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ARCHIVE_ID_ARRAY); -} - -static inline int exp_connect_sepol(struct obd_export *exp) -{ - return !!(exp_connect_flags2(exp) & OBD_CONNECT2_SELINUX_POLICY); -} - -enum { - /* archive_ids in array format */ - KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, - /* archive_ids in bitmap format */ - KKUC_CT_DATA_BITMAP_MAGIC = 0x082018cea, -}; - - +#define KKUC_CT_DATA_MAGIC 0x092013cea struct kkuc_ct_data { __u32 kcd_magic; - __u32 kcd_nr_archives; - __u32 kcd_archives[0]; + __u32 kcd_archive; }; /** @} export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h index ea6d743b1aaae..43d0c3419417d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -23,7 +23,7 
@@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -149,9 +149,9 @@ */ #include -#include -#include -#include +#include +#include +#include struct lu_env; struct lu_site; @@ -196,6 +196,13 @@ enum { LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) }; +enum { + /** 2^6 FIDs for OI containers */ + OSD_OI_FID_OID_BITS = 6, + /** reserve enough FIDs in case we want more in the future */ + OSD_OI_FID_OID_BITS_MAX = 10, +}; + /** special OID for local objects */ enum local_oid { /** \see fld_mod_init */ @@ -218,7 +225,6 @@ enum local_oid { OSD_LPF_OID = 19UL, REPLY_DATA_OID = 21UL, ACCT_PROJECT_OID = 22UL, - INDEX_BACKUP_OID = 4116UL, OFD_LAST_GROUP_OID = 4117UL, LLOG_CATALOGS_OID = 4118UL, MGS_CONFIGS_OID = 4119UL, @@ -344,13 +350,10 @@ static inline void filter_fid_cpu_to_le(struct filter_fid *dst, { fid_cpu_to_le(&dst->ff_parent, &src->ff_parent); - if (size < sizeof(struct filter_fid)) { + if (size < sizeof(struct filter_fid)) memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); - } else { + else ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout); - dst->ff_layout_version = cpu_to_le32(src->ff_layout_version); - dst->ff_range = cpu_to_le32(src->ff_range); - } /* XXX: Add more if filter_fid is enlarged in the future. */ } @@ -360,13 +363,10 @@ static inline void filter_fid_le_to_cpu(struct filter_fid *dst, { fid_le_to_cpu(&dst->ff_parent, &src->ff_parent); - if (size < sizeof(struct filter_fid)) { + if (size < sizeof(struct filter_fid)) memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); - } else { + else ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout); - dst->ff_layout_version = le32_to_cpu(src->ff_layout_version); - dst->ff_range = le32_to_cpu(src->ff_range); - } /* XXX: Add more if filter_fid is enlarged in the future. */ } @@ -416,8 +416,8 @@ struct lu_client_seq { */ struct lu_seq_range lcs_space; - /* Seq related debugfs */ - struct dentry *lcs_debugfs_entry; + /* Seq related proc */ + struct proc_dir_entry *lcs_proc_dir; /* This holds last allocated fid in last obtained seq */ struct lu_fid lcs_fid; @@ -427,7 +427,7 @@ struct lu_client_seq { /* * Service uuid, passed from MDT + seq name to form unique seq name to - * use it with debugfs. + * use it with procfs. */ char lcs_name[80]; @@ -463,8 +463,8 @@ struct lu_server_seq { /* /seq file object device */ struct dt_object *lss_obj; - /* Seq related debugfs */ - struct dentry *lss_debugfs_entry; + /* Seq related proc */ + struct proc_dir_entry *lss_proc_dir; /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ enum lu_mgr_type lss_type; @@ -477,7 +477,7 @@ struct lu_server_seq { /* * Service uuid, passed from MDT + seq name to form unique seq name to - * use it with debugfs. + * use it with procfs. */ char lss_name[80]; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h index 102dcfac77480..2f39962f8fb5e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -38,7 +38,7 @@ * @{ */ -#include +#include #include #include @@ -67,10 +67,9 @@ struct lu_fld_target { }; struct lu_server_fld { - /** - * Fld dir debugfs entry. - */ - struct dentry *lsf_debugfs_entry; + /** + * Fld dir proc entry. 
*/ + struct proc_dir_entry *lsf_proc_dir; /** * /fld file object device */ @@ -109,9 +108,8 @@ struct lu_server_fld { struct lu_client_fld { /** - * Client side debugfs entry. - */ - struct dentry *lcf_debugfs_entry; + * Client side proc entry. */ + struct proc_dir_entry *lcf_proc_dir; /** * List of exports client FLD knows about. */ @@ -134,8 +132,7 @@ struct lu_client_fld { struct fld_cache *lcf_cache; /** - * Client fld debugfs entry name. - */ + * Client fld proc entry name. */ char lcf_name[80]; }; @@ -192,7 +189,7 @@ int fld_client_add_target(struct lu_client_fld *fld, int fld_client_del_target(struct lu_client_fld *fld, __u64 idx); -void fld_client_debugfs_fini(struct lu_client_fld *fld); +void fld_client_proc_fini(struct lu_client_fld *fld); /** @} fld */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h index 2cb4969b615bf..7c22d985af5a4 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -50,7 +50,7 @@ void ptlrpc_free_committed(struct obd_import *imp); void ptlrpc_wake_delayed(struct obd_import *imp); int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); int ptlrpc_set_import_active(struct obd_import *imp, int active); -void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full); +void ptlrpc_activate_import(struct obd_import *imp); void ptlrpc_deactivate_import(struct obd_import *imp); void ptlrpc_invalidate_import(struct obd_import *imp); void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h index a8c5a218b6c7d..57a192359d118 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h index 430fde2e92738..1b44d32393139 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_import.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -42,15 +42,10 @@ * * @{ */ -#include -#include -#include -#include -#include -#include -#include -#include -#include + +#include +#include + /** * Adaptive Timeout stuff @@ -106,21 +101,19 @@ enum lustre_imp_state { LUSTRE_IMP_RECOVER = 8, LUSTRE_IMP_FULL = 9, LUSTRE_IMP_EVICTED = 10, - LUSTRE_IMP_IDLE = 11, - LUSTRE_IMP_LAST }; /** Returns test string representation of numeric import state \a state */ static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) { - static char *import_state_names[] = { - "", "CLOSED", "NEW", "DISCONN", - "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", - "RECOVER", "FULL", "EVICTED", "IDLE", - }; - - LASSERT(state < LUSTRE_IMP_LAST); - return import_state_names[state]; + static char* import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", + }; + + LASSERT (state <= LUSTRE_IMP_EVICTED); + return import_state_names[state]; } /** @@ -147,9 +140,9 @@ struct obd_import_conn { /** uuid of remote side */ struct obd_uuid oic_uuid; /** - * Time (64 bit seconds) of last connection attempt on this connection + * Time (64 bit jiffies) of last connection attempt on this connection */ - time64_t oic_last_attempt; + __u64 oic_last_attempt; }; /* state history */ @@ -164,6 +157,8 @@ struct import_state_hist { * Imports are representing client-side view to remote target. */ struct obd_import { + /** Local handle (== id) for this import. */ + struct portals_handle imp_handle; /** Reference counter */ atomic_t imp_refcount; struct lustre_handle imp_dlm_handle; /* client's ldlm export */ @@ -173,8 +168,8 @@ struct obd_import { struct ptlrpc_client *imp_client; /** List element for linking into pinger chain */ struct list_head imp_pinger_chain; - /** work struct for destruction of import */ - struct work_struct imp_zombie_work; + /** List element for linking into chain for destruction */ + struct list_head imp_zombie_chain; /** * Lists of requests that are retained for replay, waiting for a reply, @@ -218,17 +213,12 @@ struct obd_import { /** Wait queue for those who need to wait for recovery completion */ wait_queue_head_t imp_recovery_waitq; - /** Number of requests allocated */ - atomic_t imp_reqs; /** Number of requests currently in-flight */ atomic_t imp_inflight; /** Number of requests currently unregistering */ atomic_t imp_unregistering; /** Number of replay requests inflight */ atomic_t imp_replay_inflight; - /** In-flight replays rate control */ - wait_queue_head_t imp_replay_waitq; - /** Number of currently happening import invalidations */ atomic_t imp_inval_count; /** Numbner of request timeouts */ @@ -242,8 +232,6 @@ struct obd_import { int imp_state_hist_idx; /** Current import generation. Incremented on every reconnect */ int imp_generation; - /** Idle connection initiated at this generation */ - int imp_initiated_at; /** Incremented every time we send reconnection request */ __u32 imp_conn_cnt; /** @@ -268,9 +256,9 @@ struct obd_import { */ struct lustre_handle imp_remote_handle; /** When to perform next ping. time in jiffies. 
*/ - time64_t imp_next_ping; + cfs_time_t imp_next_ping; /** When we last successfully connected. time in 64bit jiffies */ - time64_t imp_last_success_conn; + __u64 imp_last_success_conn; /** List of all possible connection for import. */ struct list_head imp_conn_list; @@ -295,6 +283,9 @@ struct obd_import { imp_server_timeout:1, /* VBR: imp in delayed recovery */ imp_delayed_recovery:1, + /* VBR: if gap was found then no lock replays + */ + imp_no_lock_replay:1, /* recovery by versions was failed */ imp_vbr_failed:1, /* force an immidiate ping */ @@ -307,32 +298,30 @@ struct obd_import { imp_resend_replay:1, /* disable normal recovery, for test only. */ imp_no_pinger_recover:1, +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* need IR MNE swab */ + imp_need_mne_swab:1, +#endif /* import must be reconnected instead of * chouse new connection */ imp_force_reconnect:1, /* import has tried to connect with server */ imp_connect_tried:1, /* connected but not FULL yet */ - imp_connected:1, - /* grant shrink disabled */ - imp_grant_shrink_disabled:1, - /* to supress LCONSOLE() at conn.restore */ - imp_was_idle:1; - u32 imp_connect_op; - u32 imp_idle_timeout; - u32 imp_idle_debug; - struct obd_connect_data imp_connect_data; - __u64 imp_connect_flags_orig; - __u64 imp_connect_flags2_orig; - int imp_connect_error; - - enum lustre_msg_magic imp_msg_magic; - /* adjusted based on server capability */ - enum lustre_msghdr imp_msghdr_flags; - - /* adaptive timeout data */ - struct imp_at imp_at; - time64_t imp_last_reply_time; /* for health check */ + imp_connected:1; + __u32 imp_connect_op; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; + + __u32 imp_msg_magic; + /* adjusted based on server capability */ + __u32 imp_msghdr_flags; + + /* adaptive timeout data */ + struct imp_at imp_at; + time64_t imp_last_reply_time; /* for health check */ }; /* import.c */ @@ -342,11 +331,11 @@ static inline unsigned int at_est2timeout(unsigned int val) return (val + (val >> 2) + 5); } -static inline timeout_t at_timeout2est(timeout_t timeout) +static inline unsigned int at_timeout2est(unsigned int val) { - /* restore estimate value from timeout: e=4/5(t-5) */ - LASSERT(timeout > 0); - return max((timeout << 2) / 5, 5) - 4; + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(val); + return (max((val << 2) / 5, 5U) - 4); } static inline void at_reset_nolock(struct adaptive_timeout *at, int val) @@ -392,6 +381,7 @@ extern unsigned int at_max; /* genops.c */ struct obd_export; extern struct obd_import *class_exp2cliimp(struct obd_export *); +extern struct obd_import *class_conn2cliimp(struct lustre_handle *); /** @} import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h index 4af88af0edf87..4fc76566501ba 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,7 @@ #define __LUSTRE_KERNELCOMM_H__ /* For declarations shared with userspace */ -#include +#include /* prototype for callback function on kuc groups */ typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h index 11409b97e66c8..37f6ee1de49eb 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * lustre/include/lustre_lfsck.h @@ -33,7 +33,7 @@ #ifndef _LUSTRE_LFSCK_H # define _LUSTRE_LFSCK_H -#include +#include #include #include #include @@ -101,10 +101,10 @@ int lfsck_query(const struct lu_env *env, struct dt_device *key, struct lfsck_request *req, struct lfsck_reply *rep, struct lfsck_query *que); -int lfsck_get_speed(struct seq_file *m, char *buf, struct dt_device *key); +int lfsck_get_speed(struct seq_file *m, struct dt_device *key); int lfsck_set_speed(struct dt_device *key, __u32 val); -int lfsck_get_windows(char *buf, struct dt_device *key); -int lfsck_set_windows(struct dt_device *key, unsigned int val); +int lfsck_get_windows(struct seq_file *m, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, int val); int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h index f67791252056d..df1ca627aa4d0 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,15 +42,11 @@ * @{ */ -#ifdef HAVE_SCHED_HEADERS -#include -#include -#endif - +#include #include -#include -#include -#include +#include +#include +#include /* target.c */ struct ptlrpc_request; @@ -73,6 +69,7 @@ int rev_import_init(struct obd_export *exp); int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); void target_destroy_export(struct obd_export *exp); +int target_handle_ping(struct ptlrpc_request *req); void target_committed_to_req(struct ptlrpc_request *req); void target_cancel_recovery_timer(struct obd_device *obd); void target_stop_recovery_thread(struct obd_device *obd); @@ -164,9 +161,9 @@ static inline int back_to_sleep(void *arg) #define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) struct l_wait_info { - long lwi_timeout; - long lwi_interval; - int lwi_allow_intr; + cfs_duration_t lwi_timeout; + cfs_duration_t lwi_interval; + int lwi_allow_intr; int (*lwi_on_timeout)(void *); void (*lwi_on_signal)(void *); void *lwi_cb_data; @@ -258,8 +255,8 @@ static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, #define __l_wait_event(wq, condition, info, ret, l_add_wait) \ do { \ wait_queue_entry_t __wait; \ - long __timeout = info->lwi_timeout; \ - sigset_t __blocked; \ + cfs_duration_t __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ int __allow_intr = info->lwi_allow_intr; \ \ ret = 0; \ @@ -308,12 +305,13 @@ do { \ if (__timeout == 0) { \ schedule(); \ } else { \ - long interval = info->lwi_interval ? \ - min_t(long, info->lwi_interval,\ - __timeout) : __timeout; \ - long remaining = schedule_timeout(interval); \ - \ - __timeout -= interval - remaining; \ + cfs_duration_t interval = info->lwi_interval? \ + min_t(cfs_duration_t, \ + info->lwi_interval,__timeout):\ + __timeout; \ + cfs_duration_t remaining = schedule_timeout(interval); \ + __timeout = cfs_time_sub(__timeout, \ + cfs_time_sub(interval, remaining));\ if (__timeout == 0) { \ if (info->lwi_on_timeout == NULL || \ info->lwi_on_timeout(info->lwi_cb_data)) { \ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h index 3bf6e2b54fd9b..89a040f735d5d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2014, Intel Corporation. * Use is subject to license terms. 
* * Author: di wang diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h index d5fb751524b0b..f936973801012 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -32,7 +32,7 @@ #ifndef _LUSTRE_LMV_H #define _LUSTRE_LMV_H -#include +#include struct lmv_oinfo { struct lu_fid lmo_fid; @@ -46,8 +46,6 @@ struct lmv_stripe_md { __u32 lsm_md_master_mdt_index; __u32 lsm_md_hash_type; __u32 lsm_md_layout_version; - __u32 lsm_md_migrate_offset; - __u32 lsm_md_migrate_hash; __u32 lsm_md_default_count; __u32 lsm_md_default_index; char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; @@ -66,10 +64,6 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || lsm1->lsm_md_layout_version != lsm2->lsm_md_layout_version || - lsm1->lsm_md_migrate_offset != - lsm2->lsm_md_migrate_offset || - lsm1->lsm_md_migrate_hash != - lsm2->lsm_md_migrate_hash || strcmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name) != 0) return false; @@ -82,27 +76,14 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) return true; } - -static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) -{ - int i; - - CDEBUG(mask, "magic %#x stripe count %d master mdt %d hash type %#x " - "version %d migrate offset %d migrate hash %#x pool %s\n", - lsm->lsm_md_magic, lsm->lsm_md_stripe_count, - lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, - lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset, - lsm->lsm_md_migrate_hash, lsm->lsm_md_pool_name); - - for (i = 0; i < lsm->lsm_md_stripe_count; i++) - CDEBUG(mask, "stripe[%d] "DFID"\n", - i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); -} - union lmv_mds_md; void lmv_free_memmd(struct lmv_stripe_md *lsm); +int lmvea_load_shards(const struct lu_env *env, struct dt_object *obj, + struct lu_dirent *ent, struct lu_buf *buf, + bool resize); + static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, const struct lmv_mds_md_v1 *lmv_src) { @@ -160,14 +141,18 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, unsigned int stripe_count, const char *name, int namelen) { - int idx; + int idx; + __u32 hash_type = lmv_hash_type & LMV_HASH_TYPE_MASK; LASSERT(namelen > 0); - if (stripe_count <= 1) return 0; - switch (lmv_hash_type & LMV_HASH_TYPE_MASK) { + /* for migrating object, always start from 0 stripe */ + if (lmv_hash_type & LMV_HASH_FLAG_MIGRATION) + return 0; + + switch (hash_type) { case LMV_HASH_TYPE_ALL_CHARS: idx = lmv_hash_all_chars(stripe_count, name, namelen); break; @@ -179,8 +164,8 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, break; } - CDEBUG(D_INFO, "name %.*s hash_type %#x idx %d/%u\n", namelen, name, - lmv_hash_type, idx, stripe_count); + CDEBUG(D_INFO, "name %.*s hash_type %d idx %d\n", namelen, name, + hash_type, idx); return idx; } diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h index f2522050f7337..237da21bf4210 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_log.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,9 +52,9 @@ */ #include +#include #include -#include -#include +#include #define LOG_NAME_LIMIT(logname, name) \ snprintf(logname, sizeof(logname), "LOGS/%s", name) @@ -160,7 +160,6 @@ int llog_cat_process_or_fork(const struct lu_env *env, int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data, int startcat, int startidx); __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); -__u32 llog_cat_free_space(struct llog_handle *cat_llh); int llog_cat_reverse_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data); @@ -171,6 +170,8 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); /* llog_ioctl.c */ struct obd_ioctl_data; @@ -201,6 +202,8 @@ struct llog_operations { int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags); int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, struct llog_gen *gen, struct obd_uuid *uuid); /** @@ -268,8 +271,8 @@ struct llog_handle { * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx * will become its upper limit */ int lgh_last_idx; - struct rw_semaphore lgh_last_sem; - __u64 lgh_cur_offset; /* used for test only */ + int lgh_cur_idx; /* used during llog_process */ + __u64 lgh_cur_offset; /* used during llog_process */ struct llog_ctxt *lgh_ctxt; union { struct plain_handle_data phd; @@ -281,7 +284,7 @@ struct llog_handle { atomic_t lgh_refcount; int lgh_max_size; - bool lgh_destroyed; + __u32 lgh_stale:1; }; /* llog_osd.c */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h similarity index 97% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h rename to drivers/staging/lustrefsx/lustre/include/lustre_log_user.h index bcf46eb21e6c2..ee5f0f7385fa0 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h @@ -38,8 +38,7 @@ #ifndef _LUSTRE_LOG_USER_H #define _LUSTRE_LOG_USER_H -#include -#include +#include /* Lustre logs use FIDs constructed from oi_id and oi_seq directly, * without attempting to use the IGIF and IDIF ranges as is done diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h index ac7b0d5f4a2f0..be0eb7742e644 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include #include @@ -63,6 +63,104 @@ struct obd_export; struct ptlrpc_request; struct obd_device; +/** + * Serializes in-flight MDT-modifying RPC requests to preserve idempotency. + * + * This mutex is used to implement execute-once semantics on the MDT. 
+ * The MDT stores the last transaction ID and result for every client in + * its last_rcvd file. If the client doesn't get a reply, it can safely + * resend the request and the MDT will reconstruct the reply being aware + * that the request has already been executed. Without this lock, + * execution status of concurrent in-flight requests would be + * overwritten. + * + * This design limits the extent to which we can keep a full pipeline of + * in-flight requests from a single client. This limitation could be + * overcome by allowing multiple slots per client in the last_rcvd file. + */ +struct mdc_rpc_lock { + /** Lock protecting in-flight RPC concurrency. */ + struct mutex rpcl_mutex; + /** Intent associated with currently executing request. */ + struct lookup_intent *rpcl_it; + /** Used for MDS/RPC load testing purposes. */ + int rpcl_fakes; +}; + +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ + mutex_init(&lck->rpcl_mutex); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + ENTRY; + + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) + return; + + /* This would normally block until the existing request finishes. + * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. 
*/ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) + goto out; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); + out: + EXIT; +} + static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req, struct lookup_intent *it) { diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h index cb43281574890..c254c7f730f10 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include @@ -60,34 +60,13 @@ struct mds_capa_info { struct lustre_capa_key *capa; }; -struct md_rejig_data { - struct md_object *mrd_obj; - __u16 mrd_mirror_id; -}; - #define MDD_OBD_NAME "mdd_obd" #define MDD_OBD_UUID "mdd_obd_uuid" -static inline int md_should_create(u64 open_flags) +static inline int md_should_create(__u64 flags) { - return !(open_flags & MDS_OPEN_DELAY_CREATE) && - (open_flags & MDS_FMODE_WRITE) && - !(open_flags & MDS_OPEN_LEASE); -} - -/* do NOT or the MAY_*'s, you'll get the weakest */ -static inline int mds_accmode(u64 open_flags) -{ - int res = 0; - - if (open_flags & MDS_FMODE_READ) - res |= MAY_READ; - if (open_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC | MDS_OPEN_APPEND)) - res |= MAY_WRITE; - if (open_flags & MDS_FMODE_EXEC) - res = MAY_EXEC; - - return res; + return !(flags & MDS_OPEN_DELAY_CREATE) && (flags & FMODE_WRITE) && + !(flags & MDS_OPEN_LEASE); } /** @} mds */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h index 3a94a921e11de..f6d67c832ed64 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_net.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -50,13 +50,12 @@ * * @{ */ -#include + #include #include +#include #include -#include -#include -#include +#include #include #include #include @@ -64,7 +63,7 @@ #include #include #include -#include +#include /* MD flags we _always_ use */ #define PTLRPC_MD_OPTIONS 0 @@ -76,7 +75,7 @@ * value. The client is free to limit the actual RPC size for any bulk * transfer via cl_max_pages_per_rpc to some non-power-of-two value. * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ -#define PTLRPC_BULK_OPS_BITS 6 +#define PTLRPC_BULK_OPS_BITS 4 #if PTLRPC_BULK_OPS_BITS > 16 #error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." 
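/*
 * Editorial sketch (not part of the patch): how a caller would typically
 * bracket an MDT-modifying request with the mdc_rpc_lock helpers restored
 * in lustre_mdc.h above.  mdc_get_rpc_lock(), mdc_put_rpc_lock() and
 * struct mdc_rpc_lock are taken from that hunk; the wrapper function below
 * and its argument list are hypothetical, and it assumes the request has
 * already been packed and is sent through the standard ptlrpc_queue_wait()
 * path.
 */
static int example_mdc_modify_rpc(struct mdc_rpc_lock *rpc_lock,
				  struct ptlrpc_request *req,
				  struct lookup_intent *it)
{
	int rc;

	/* Serialize with other in-flight modifying RPCs so the MDT can
	 * reconstruct a lost reply from its single last_rcvd slot. */
	mdc_get_rpc_lock(rpc_lock, it);
	rc = ptlrpc_queue_wait(req);
	mdc_put_rpc_lock(rpc_lock, it);

	return rc;
}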
#endif @@ -473,31 +472,19 @@ * - single object with 16 pages is 512 bytes * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover * - Must be a multiple of 1024 + * - actual size is about 18K */ -#define _OST_MAXREQSIZE_BASE ((unsigned long)(sizeof(struct lustre_msg) + \ - sizeof(struct ptlrpc_body) + \ - sizeof(struct obdo) + \ - sizeof(struct obd_ioobj) + \ - sizeof(struct niobuf_remote))) -#define _OST_MAXREQSIZE_SUM ((unsigned long)(_OST_MAXREQSIZE_BASE + \ - sizeof(struct niobuf_remote) * \ - (DT_MAX_BRW_PAGES - 1))) +#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES) /** * FIEMAP request can be 4K+ for now */ -#define OST_MAXREQSIZE (16UL * 1024UL) -#define OST_IO_MAXREQSIZE max(OST_MAXREQSIZE, \ - ((_OST_MAXREQSIZE_SUM - 1) | \ - (1024UL - 1)) + 1) -/* Safe estimate of free space in standard RPC, provides upper limit for # of - * bytes of i/o to pack in RPC (skipping bulk transfer). */ -#define OST_SHORT_IO_SPACE (OST_IO_MAXREQSIZE - _OST_MAXREQSIZE_BASE) - -/* Actual size used for short i/o buffer. Calculation means this: - * At least one page (for large PAGE_SIZE), or 16 KiB, but not more - * than the available space aligned to a page boundary. */ -#define OBD_MAX_SHORT_IO_BYTES min(max(PAGE_SIZE, 16UL * 1024UL), \ - OST_SHORT_IO_SPACE & PAGE_MASK) +#define OST_MAXREQSIZE (16 * 1024) +#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \ + (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1)) #define OST_MAXREPSIZE (9 * 1024) #define OST_IO_MAXREPSIZE OST_MAXREPSIZE @@ -511,7 +498,6 @@ */ #define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) - /* Macro to hide a typecast. */ #define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) @@ -566,6 +552,7 @@ union ptlrpc_async_args { }; struct ptlrpc_request_set; +typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); /** @@ -587,8 +574,19 @@ struct ptlrpc_request_set { atomic_t set_remaining; /** wait queue to wait on for request events */ wait_queue_head_t set_waitq; + wait_queue_head_t *set_wakeup_ptr; /** List of requests in the set */ struct list_head set_requests; + /** + * List of completion callbacks to be called when the set is completed + * This is only used if \a set_interpret is NULL. + * Links struct ptlrpc_set_cbdata. + */ + struct list_head set_cblist; + /** Completion callback, if only one. */ + set_interpreter_func set_interpret; + /** opaq argument passed to completion \a set_interpret callback. 
*/ + void *set_arg; /** * Lock for \a set_new_requests manipulations * locked so that any old caller can communicate requests to @@ -610,6 +608,18 @@ struct ptlrpc_request_set { unsigned int set_allow_intr:1; }; +/** + * Description of a single ptrlrpc_set callback + */ +struct ptlrpc_set_cbdata { + /** List linkage item */ + struct list_head psc_item; + /** Pointer to interpreting function */ + set_interpreter_func psc_interpret; + /** Opaq argument to pass to the callback */ + void *psc_data; +}; + struct ptlrpc_bulk_desc; struct ptlrpc_service_part; struct ptlrpc_service; @@ -774,9 +784,9 @@ struct ptlrpc_cli_req { /** For bulk requests on client only: bulk descriptor */ struct ptlrpc_bulk_desc *cr_bulk; /** optional time limit for send attempts */ - time64_t cr_delay_limit; + cfs_duration_t cr_delay_limit; /** time request was first queued */ - time64_t cr_queued_time; + cfs_time_t cr_queued_time; /** request sent in nanoseconds */ ktime_t cr_sent_ns; /** time for request really sent out */ @@ -1049,13 +1059,6 @@ struct ptlrpc_request { /** description of flavors for client & server */ struct sptlrpc_flavor rq_flvr; - /** - * SELinux policy info at the time of the request - * sepol string format is: - * ::: - */ - char rq_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; - /* client/server security flags */ unsigned int rq_ctx_init:1, /* context initiation */ @@ -1112,17 +1115,8 @@ struct ptlrpc_request { /** * service time estimate (secs) * If the request is not served by this time, it is marked as timed out. - * Do not change to time64_t since this is transmitted over the wire. - * - * The linux kernel handles timestamps with time64_t and timeouts - * are normally done with jiffies. Lustre shares the rq_timeout between - * nodes. Since jiffies can vary from node to node Lustre instead - * will express the timeout value in seconds. To avoid confusion with - * timestamps (time64_t) and jiffy timeouts (long) Lustre timeouts - * are expressed in s32 (timeout_t). Also what is transmitted over - * the wire is 32 bits. 
*/ - timeout_t rq_timeout; + int rq_timeout; /** * when request/reply sent (secs), or time when request should be sent */ @@ -1179,37 +1173,37 @@ static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) /** @} nrs */ /** - * Returns true if request buffer at offset \a index was already swabbed + * Returns 1 if request buffer at offset \a index was already swabbed */ -static inline bool lustre_req_swabbed(struct ptlrpc_request *req, size_t index) +static inline int lustre_req_swabbed(struct ptlrpc_request *req, size_t index) { - LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); - return req->rq_req_swab_mask & (1 << index); + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); } /** - * Returns true if request reply buffer at offset \a index was already swabbed + * Returns 1 if request reply buffer at offset \a index was already swabbed */ -static inline bool lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) { - LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); - return req->rq_rep_swab_mask & (1 << index); + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); } /** - * Returns true if request needs to be swabbed into local cpu byteorder + * Returns 1 if request needs to be swabbed into local cpu byteorder */ -static inline bool ptlrpc_req_need_swab(struct ptlrpc_request *req) +static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) { - return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); } /** - * Returns true if request reply needs to be swabbed into local cpu byteorder + * Returns 1 if request reply needs to be swabbed into local cpu byteorder */ -static inline bool ptlrpc_rep_need_swab(struct ptlrpc_request *req) +static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) { - return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); } /** @@ -1444,8 +1438,6 @@ extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops; * Another user is readpage for MDT. 
*/ struct ptlrpc_bulk_desc { - /** number MD's assigned including zero-sends */ - unsigned int bd_refs; /** completed with failure */ unsigned long bd_failure:1; /** client side */ @@ -1470,7 +1462,6 @@ struct ptlrpc_bulk_desc { int bd_max_iov; /* allocated size of bd_iov */ int bd_nob; /* # bytes covered */ int bd_nob_transferred; /* # bytes GOT/PUT */ - unsigned int bd_nob_last; /* # bytes in last MD */ __u64 bd_last_mbits; @@ -1478,8 +1469,6 @@ struct ptlrpc_bulk_desc { lnet_nid_t bd_sender; /* stash event::sender */ int bd_md_count; /* # valid entries in bd_mds */ int bd_md_max_brw; /* max entries in bd_mds */ - /** array of offsets for each MD */ - unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; /** array of associated MDs */ struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; @@ -1704,8 +1693,8 @@ struct ptlrpc_service { int srv_nthrs_cpt_init; /** limit of threads number for each partition */ int srv_nthrs_cpt_limit; - /** Root of debugfs dir tree for this service */ - struct dentry *srv_debugfs_entry; + /** Root of /proc dir tree for this service */ + struct proc_dir_entry *srv_procroot; /** Pointer to statistic data for this service */ struct lprocfs_stats *srv_stats; /** # hp per lp reqs to handle */ @@ -1731,25 +1720,17 @@ struct ptlrpc_service { int srv_watchdog_factor; /** under unregister_service */ unsigned srv_is_stopping:1; - /** Whether or not to restrict service threads to CPUs in this CPT */ - unsigned srv_cpt_bind:1; - /** max # request buffers */ - int srv_nrqbds_max; /** max # request buffers in history per partition */ int srv_hist_nrqbds_cpt_max; - /** number of CPTs this service associated with */ + /** number of CPTs this service bound on */ int srv_ncpts; - /** CPTs array this service associated with */ + /** CPTs array this service bound on */ __u32 *srv_cpts; /** 2^srv_cptab_bits >= cfs_cpt_numbert(srv_cptable) */ int srv_cpt_bits; /** CPT table this service is running over */ struct cfs_cpt_table *srv_cptable; - - /* sysfs object */ - struct kobject srv_kobj; - struct completion srv_kobj_unregister; /** * partition data for ptlrpc service */ @@ -1796,8 +1777,6 @@ struct ptlrpc_service_part { * threads starting & stopping are also protected by this lock. */ spinlock_t scp_lock __cfs_cacheline_aligned; - /** userland serialization */ - struct mutex scp_mutex; /** total # req buffer descs allocated */ int scp_nrqbds_total; /** # posted request buffers for receiving */ @@ -1812,8 +1791,8 @@ struct ptlrpc_service_part { struct list_head scp_rqbd_posted; /** incoming reqs */ struct list_head scp_req_incoming; - /** timeout before re-posting reqs, in jiffies */ - long scp_rqbd_timeout; + /** timeout before re-posting reqs, in tick */ + cfs_duration_t scp_rqbd_timeout; /** * all threads sleep on this. This wait-queue is signalled when new * incoming request arrives and when difficult reply has to be handled. 
@@ -1864,7 +1843,7 @@ struct ptlrpc_service_part { /** early reply timer */ struct timer_list scp_at_timer; /** debug */ - ktime_t scp_at_checktime; + cfs_time_t scp_at_checktime; /** check early replies */ unsigned scp_at_check; /** @} */ @@ -2082,7 +2061,7 @@ static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); spin_lock(&desc->bd_lock); - rc = desc->bd_refs; + rc = desc->bd_md_count; spin_unlock(&desc->bd_lock); return rc; } @@ -2099,15 +2078,14 @@ static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) LASSERT(req != NULL); desc = req->rq_bulk; - if (!desc) - return 0; - if (req->rq_bulk_deadline > ktime_get_real_seconds()) return 1; + if (!desc) + return 0; spin_lock(&desc->bd_lock); - rc = desc->bd_refs; + rc = desc->bd_md_count; spin_unlock(&desc->bd_lock); return rc; } @@ -2147,8 +2125,10 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set); struct ptlrpc_request_set *ptlrpc_prep_set(void); struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, void *arg); +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data); int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); -int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *); +int ptlrpc_set_wait(struct ptlrpc_request_set *); void ptlrpc_mark_interrupted(struct ptlrpc_request *req); void ptlrpc_set_destroy(struct ptlrpc_request_set *); void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); @@ -2265,8 +2245,8 @@ struct ptlrpc_service_thr_conf { /* user specified threads number, it will be validated due to * other members of this structure. */ unsigned int tc_nthrs_user; - /* bind service threads to only CPUs in their associated CPT */ - unsigned int tc_cpu_bind; + /* set NUMA node affinity for service threads */ + unsigned int tc_cpu_affinity; /* Tags for lu_context associated with service thread */ __u32 tc_ctx_tags; }; @@ -2275,8 +2255,6 @@ struct ptlrpc_service_cpt_conf { struct cfs_cpt_table *cc_cptable; /* string pattern to describe CPTs for a service */ char *cc_pattern; - /* whether or not to have per-CPT service partitions */ - bool cc_affinity; }; struct ptlrpc_service_conf { @@ -2309,18 +2287,18 @@ void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); int ptlrpc_hpreq_handler(struct ptlrpc_request *req); struct ptlrpc_service *ptlrpc_register_service( struct ptlrpc_service_conf *conf, - struct kset *parent, - struct dentry *debugfs_entry); + struct proc_dir_entry *proc_entry); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_threads(struct ptlrpc_service *svc); int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services(void *arg); +void ptlrpc_daemonize(char *name); int ptlrpc_service_health_check(struct ptlrpc_service *); void ptlrpc_server_drop_request(struct ptlrpc_request *req); void ptlrpc_request_change_export(struct ptlrpc_request *req, struct obd_export *export); -void ptlrpc_update_export_timer(struct obd_export *exp, - time64_t extra_delay); +void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay); int ptlrpc_hr_init(void); void ptlrpc_hr_fini(void); @@ -2333,10 +2311,8 @@ void ptlrpc_hr_fini(void); * @{ */ int ptlrpc_connect_import(struct obd_import *imp); -int ptlrpc_connect_import_locked(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp); int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); -int 
ptlrpc_disconnect_and_idle_import(struct obd_import *imp); int ptlrpc_import_recovery_state_machine(struct obd_import *imp); void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len); @@ -2350,14 +2326,8 @@ int ptlrpc_reconnect_import(struct obd_import *imp); * * @{ */ -#define PTLRPC_MAX_BUFCOUNT \ - (sizeof(((struct ptlrpc_request *)0)->rq_req_swab_mask) * 8) -#define MD_MAX_BUFLEN (MDS_REG_MAXREQSIZE > OUT_MAXREQSIZE ? \ - MDS_REG_MAXREQSIZE : OUT_MAXREQSIZE) -#define PTLRPC_MAX_BUFLEN (OST_IO_MAXREQSIZE > MD_MAX_BUFLEN ? \ - OST_IO_MAXREQSIZE : MD_MAX_BUFLEN) -bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - __u32 index); +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index); void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, __u32 index); int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); @@ -2400,7 +2370,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg); void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); __u32 lustre_msg_get_type(struct lustre_msg *msg); -enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg); +__u32 lustre_msg_get_version(struct lustre_msg *msg); void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); __u32 lustre_msg_get_opc(struct lustre_msg *msg); __u64 lustre_msg_get_last_xid(struct lustre_msg *msg); @@ -2415,8 +2385,8 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); int lustre_msg_get_status(struct lustre_msg *msg); __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); __u32 lustre_msg_get_magic(struct lustre_msg *msg); -timeout_t lustre_msg_get_timeout(struct lustre_msg *msg); -timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); char *lustre_msg_get_jobid(struct lustre_msg *msg); __u32 lustre_msg_get_cksum(struct lustre_msg *msg); __u64 lustre_msg_get_mbits(struct lustre_msg *msg); @@ -2433,9 +2403,8 @@ void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); void ptlrpc_request_set_replen(struct ptlrpc_request *req); -void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout); -void lustre_msg_set_service_timeout(struct lustre_msg *msg, - timeout_t service_timeout); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); @@ -2619,8 +2588,11 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) { if (req->rq_delay_limit != 0 && - req->rq_queued_time + req->rq_delay_limit < ktime_get_seconds()) + cfs_time_before(cfs_time_add(req->rq_queued_time, + cfs_time_seconds(req->rq_delay_limit)), + cfs_time_current())) { return 1; + } return 0; } @@ -2687,6 +2659,11 @@ struct timeout_item; typedef int (*timeout_cb_t)(struct timeout_item *, void *); int ptlrpc_pinger_add_import(struct obd_import *imp); int ptlrpc_pinger_del_import(struct obd_import *imp); 
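/*
 * Editorial sketch (not part of the patch): minimal use of the request-set
 * completion-callback interface restored earlier in this header
 * (set_interpreter_func, struct ptlrpc_set_cbdata, ptlrpc_set_add_cb()).
 * ptlrpc_prep_set(), ptlrpc_set_add_req(), ptlrpc_set_wait() and
 * ptlrpc_set_destroy() are existing entry points declared in this file
 * (ptlrpc_set_wait() in its single-argument, post-patch form); the callback
 * body and example_send_with_cb() are hypothetical.
 */
static int example_set_done(struct ptlrpc_request_set *set, void *data, int rc)
{
	/* Runs once the whole set has completed; rc is the set-wide status. */
	CDEBUG(D_RPCTRACE, "request set %p finished: rc = %d\n", set, rc);
	return rc;
}

static int example_send_with_cb(struct ptlrpc_request *req)
{
	struct ptlrpc_request_set *set;
	int rc;

	set = ptlrpc_prep_set();
	if (set == NULL)
		return -ENOMEM;

	ptlrpc_set_add_req(set, req);
	rc = ptlrpc_set_add_cb(set, example_set_done, NULL);
	if (rc == 0)
		rc = ptlrpc_set_wait(set);	/* callback fires on completion */

	ptlrpc_set_destroy(set);
	return rc;
}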
+int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list); +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event); struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); int ptlrpc_obd_ping(struct obd_device *obd); void ping_evictor_start(void); @@ -2725,9 +2702,11 @@ static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} /* ptlrpc/llog_server.c */ int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_destroy(struct ptlrpc_request *req); int llog_origin_handle_prev_block(struct ptlrpc_request *req); int llog_origin_handle_next_block(struct ptlrpc_request *req); int llog_origin_handle_read_header(struct ptlrpc_request *req); +int llog_origin_handle_close(struct ptlrpc_request *req); /* ptlrpc/llog_client.c */ extern struct llog_operations llog_client_ops; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h index 9d200bf651b64..7cabc6f2424d7 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -21,16 +21,13 @@ */ /* * Copyright (C) 2013, Trustees of Indiana University - * - * Copyright (c) 2017, Intel Corporation. - * * Author: Joshua Walgenbach */ #ifndef _LUSTRE_NODEMAP_H #define _LUSTRE_NODEMAP_H -#include +#include #define LUSTRE_NODEMAP_NAME "nodemap" @@ -76,8 +73,7 @@ struct lu_nodemap { nmf_deny_unknown:1, nmf_allow_root_access:1, nmf_map_uid_only:1, - nmf_map_gid_only:1, - nmf_enable_audit:1; + nmf_map_gid_only:1; /* unique ID set by MGS */ unsigned int nm_id; /* nodemap ref counter */ @@ -106,8 +102,6 @@ struct lu_nodemap { struct nodemap_pde *nm_pde_data; /* fileset the nodes of this nodemap are restricted to */ char nm_fileset[PATH_MAX+1]; - /* information about the expected SELinux policy on the nodes */ - char nm_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; /* used when loading/unloading nodemaps */ struct list_head nm_list; @@ -138,7 +132,6 @@ int nodemap_set_deny_unknown(const char *name, bool deny_unknown); int nodemap_set_mapping_mode(const char *name, enum nodemap_mapping_modes mode); int nodemap_set_squash_uid(const char *name, uid_t uid); int nodemap_set_squash_gid(const char *name, gid_t gid); -int nodemap_set_audit_mode(const char *name, bool enable_audit); bool nodemap_can_setquota(const struct lu_nodemap *nodemap); int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, const __u32 map[2]); @@ -146,8 +139,6 @@ int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, const __u32 map[2]); int nodemap_set_fileset(const char *name, const char *fileset); char *nodemap_get_fileset(const struct lu_nodemap *nodemap); -int nodemap_set_sepol(const char *name, const char *sepol); -const char *nodemap_get_sepol(const struct lu_nodemap *nodemap); __u32 nodemap_map_id(struct lu_nodemap *nodemap, enum nodemap_id_type id_type, enum nodemap_tree_type tree_type, __u32 id); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h index 0a407197c36f6..6e0c736ab8d87 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h @@ -51,31 +51,7 @@ struct nrs_tbf_jobid { struct list_head tj_linkage; }; -#define MAX_U32_STR_LEN 10 -#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + \ - 
MAX_U32_STR_LEN + MAX_U32_STR_LEN + 3 + 2) - -enum nrs_tbf_flag { - NRS_TBF_FLAG_INVALID = 0x0000000, - NRS_TBF_FLAG_JOBID = 0x0000001, - NRS_TBF_FLAG_NID = 0x0000002, - NRS_TBF_FLAG_OPCODE = 0x0000004, - NRS_TBF_FLAG_GENERIC = 0x0000008, - NRS_TBF_FLAG_UID = 0x0000010, - NRS_TBF_FLAG_GID = 0x0000020, -}; - -struct tbf_id { - enum nrs_tbf_flag ti_type; - u32 ti_uid; - u32 ti_gid; -}; - -struct nrs_tbf_id { - struct tbf_id nti_id; - struct list_head nti_linkage; -}; - +#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + 3 + 2) struct nrs_tbf_client { /** Resource object for policy instance. */ struct ptlrpc_nrs_resource tc_res; @@ -87,8 +63,6 @@ struct nrs_tbf_client { char tc_jobid[LUSTRE_JOBID_SIZE]; /** opcode of the client. */ __u32 tc_opcode; - /** gid or uid of the client. */ - struct tbf_id tc_id; /** Hash key of the client. */ char tc_key[NRS_TBF_KEY_LEN]; /** Reference number of the client. */ @@ -111,13 +85,6 @@ struct nrs_tbf_client { __u64 tc_depth; /** Time check-point. */ __u64 tc_check_time; - /** Deadline of a class */ - __u64 tc_deadline; - /** - * Time residue: the remainder of elapsed time - * divided by nsecs when dequeue a request. - */ - __u64 tc_nsecs_resid; /** List of queued requests. */ struct list_head tc_list; /** Node in binary heap. */ @@ -135,11 +102,8 @@ struct nrs_tbf_client { #define MAX_TBF_NAME (16) -enum nrs_rule_flags { - NTRS_STOPPING = 0x00000001, - NTRS_DEFAULT = 0x00000002, - NTRS_REALTIME = 0x00000004, -}; +#define NTRS_STOPPING 0x0000001 +#define NTRS_DEFAULT 0x0000002 struct nrs_tbf_rule { /** Name of the rule. */ @@ -156,10 +120,6 @@ struct nrs_tbf_rule { struct list_head tr_jobids; /** Jobid list string of the rule.*/ char *tr_jobids_str; - /** uid/gid list of the rule. */ - struct list_head tr_ids; - /** uid/gid list string of the rule. */ - char *tr_ids_str; /** Opcode bitmap of the rule. */ struct cfs_bitmap *tr_opcodes; /** Opcode list string of the rule.*/ @@ -179,7 +139,7 @@ struct nrs_tbf_rule { /** List of client. */ struct list_head tr_cli_list; /** Flags of the rule. */ - enum nrs_rule_flags tr_flags; + __u32 tr_flags; /** Usage Reference count taken on the rule. */ atomic_t tr_ref; /** Generation of the rule. 
*/ @@ -208,10 +168,16 @@ struct nrs_tbf_ops { #define NRS_TBF_TYPE_NID "nid" #define NRS_TBF_TYPE_OPCODE "opcode" #define NRS_TBF_TYPE_GENERIC "generic" -#define NRS_TBF_TYPE_UID "uid" -#define NRS_TBF_TYPE_GID "gid" #define NRS_TBF_TYPE_MAX_LEN 20 +enum nrs_tbf_flag { + NRS_TBF_FLAG_INVALID = 0x0000000, + NRS_TBF_FLAG_JOBID = 0x0000001, + NRS_TBF_FLAG_NID = 0x0000002, + NRS_TBF_FLAG_OPCODE = 0x0000004, + NRS_TBF_FLAG_GENERIC = 0x0000008, +}; + struct nrs_tbf_type { const char *ntt_name; enum nrs_tbf_flag ntt_flag; @@ -304,14 +270,12 @@ struct nrs_tbf_cmd { char *ts_nids_str; struct list_head ts_jobids; char *ts_jobids_str; - struct list_head ts_ids; - char *ts_ids_str; struct cfs_bitmap *ts_opcodes; char *ts_opcodes_str; struct list_head ts_conds; char *ts_conds_str; __u32 ts_valid_type; - enum nrs_rule_flags ts_rule_flags; + __u32 ts_rule_flags; char *ts_next_name; } tc_start; struct nrs_tbf_cmd_change { @@ -325,8 +289,6 @@ enum nrs_tbf_field { NRS_TBF_FIELD_NID, NRS_TBF_FIELD_JOBID, NRS_TBF_FIELD_OPCODE, - NRS_TBF_FIELD_UID, - NRS_TBF_FIELD_GID, NRS_TBF_FIELD_MAX }; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h index dd99eee5af714..d3afac961b043 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h @@ -35,7 +35,7 @@ #ifndef _LUSTRE_OBDO_H_ #define _LUSTRE_OBDO_H_ -#include +#include /** * Create an obdo to send over the wire diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h index b6070871e555c..2ad8bce19ac53 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h @@ -111,6 +111,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page) # define ll_d_count(d) ((d)->d_count) #endif /* HAVE_DCACHE_LOCK */ +#ifdef ATTR_OPEN +# define ATTR_FROM_OPEN ATTR_OPEN +#else +# ifndef ATTR_FROM_OPEN +# define ATTR_FROM_OPEN 0 +# endif +#endif /* ATTR_OPEN */ + +#ifndef ATTR_RAW +#define ATTR_RAW 0 +#endif + +#ifndef ATTR_CTIME_SET +/* + * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_CTIME_SET (1 << 28) +#endif + #ifndef HAVE_IN_COMPAT_SYSCALL #define in_compat_syscall is_compat_task #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h index 17ff2da6240ca..8cb25d2374322 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. * Use is subject to license terms. */ @@ -175,22 +175,13 @@ struct qsd_instance; * Below are the function prototypes to be used by OSD layer to manage quota * enforcement. Arguments are documented where each function is defined. 
*/ -/* flags for quota local enforcement */ -enum osd_quota_local_flags { - QUOTA_FL_OVER_USRQUOTA = 1 << 0, - QUOTA_FL_OVER_GRPQUOTA = 1 << 1, - QUOTA_FL_SYNC = 1 << 2, - QUOTA_FL_OVER_PRJQUOTA = 1 << 3, -}; - struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, - struct proc_dir_entry *, bool is_md); + struct proc_dir_entry *); int qsd_prepare(const struct lu_env *, struct qsd_instance *); int qsd_start(const struct lu_env *, struct qsd_instance *); void qsd_fini(const struct lu_env *, struct qsd_instance *); int qsd_op_begin(const struct lu_env *, struct qsd_instance *, - struct lquota_trans *, struct lquota_id_info *, - enum osd_quota_local_flags *); + struct lquota_trans *, struct lquota_id_info *, int *); void qsd_op_end(const struct lu_env *, struct qsd_instance *, struct lquota_trans *); void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, @@ -221,13 +212,13 @@ struct lquota_id_info { bool lqi_is_blk; }; -/* With the DoM, both inode quota in meta pool and block quota in data pool - * will be enforced at MDT, there are at most 4 quota ids being enforced in - * a single transaction for inode and block quota, which is chown transaction: +/* Since we enforce only inode quota in meta pool (MDTs), and block quota in + * data pool (OSTs), there are at most 4 quota ids being enforced in a single + * transaction, which is chown transaction: * original uid and gid, new uid and gid. * * This value might need to be revised when directory quota is added. */ -#define QUOTA_MAX_TRANSIDS 8 +#define QUOTA_MAX_TRANSIDS 4 /* all qids involved in a single transaction */ struct lquota_trans { @@ -235,6 +226,12 @@ struct lquota_trans { struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; }; +/* flags for quota local enforcement */ +#define QUOTA_FL_OVER_USRQUOTA 0x01 +#define QUOTA_FL_OVER_GRPQUOTA 0x02 +#define QUOTA_FL_SYNC 0x04 +#define QUOTA_FL_OVER_PRJQUOTA 0x08 + #define IS_LQUOTA_RES(res) \ (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h index 7b6c03b195624..46e6fa862f48e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -59,7 +59,7 @@ enum req_location { }; /* Maximal number of fields (buffers) in a request message. 
*/ -#define REQ_MAX_FIELD_NR 11 +#define REQ_MAX_FIELD_NR 10 struct req_capsule { struct ptlrpc_request *rc_req; @@ -128,7 +128,6 @@ int req_capsule_server_grow(struct req_capsule *pill, __u32 newlen); int req_layout_init(void); void req_layout_fini(void); -int req_check_sepol(struct req_capsule *pill); extern struct req_format RQF_OBD_PING; extern struct req_format RQF_OBD_SET_INFO; @@ -146,7 +145,6 @@ extern struct req_format RQF_FLD_READ; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_STATFS; -extern struct req_format RQF_MDS_STATFS_NEW; extern struct req_format RQF_MDS_GET_ROOT; extern struct req_format RQF_MDS_SYNC; extern struct req_format RQF_MDS_GETXATTR; @@ -158,7 +156,7 @@ extern struct req_format RQF_OUT_UPDATE; */ extern struct req_format RQF_MDS_GETATTR_NAME; extern struct req_format RQF_MDS_CLOSE; -extern struct req_format RQF_MDS_CLOSE_INTENT; +extern struct req_format RQF_MDS_INTENT_CLOSE; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_GET_INFO; @@ -178,8 +176,6 @@ extern struct req_format RQF_MDS_QUOTACTL; extern struct req_format RQF_QUOTA_DQACQ; extern struct req_format RQF_MDS_SWAP_LAYOUTS; extern struct req_format RQF_MDS_REINT_MIGRATE; -extern struct req_format RQF_MDS_REINT_RESYNC; -extern struct req_format RQF_MDS_RMFID; /* MDS hsm formats */ extern struct req_format RQF_MDS_HSM_STATE_GET; extern struct req_format RQF_MDS_HSM_STATE_SET; @@ -219,6 +215,7 @@ extern struct req_format RQF_LDLM_INTENT_LAYOUT; extern struct req_format RQF_LDLM_INTENT_GETATTR; extern struct req_format RQF_LDLM_INTENT_OPEN; extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_UNLINK; extern struct req_format RQF_LDLM_INTENT_GETXATTR; extern struct req_format RQF_LDLM_INTENT_QUOTA; extern struct req_format RQF_LDLM_CANCEL; @@ -226,12 +223,15 @@ extern struct req_format RQF_LDLM_CALLBACK; extern struct req_format RQF_LDLM_CP_CALLBACK; extern struct req_format RQF_LDLM_BL_CALLBACK; extern struct req_format RQF_LDLM_GL_CALLBACK; -extern struct req_format RQF_LDLM_GL_CALLBACK_DESC; +extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; /* LOG req_format */ +extern struct req_format RQF_LOG_CANCEL; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; +extern struct req_format RQF_LLOG_ORIGIN_CONNECT; extern struct req_format RQF_CONNECT; @@ -257,7 +257,6 @@ extern struct req_msg_field RMF_IDX_INFO; extern struct req_msg_field RMF_CLOSE_DATA; extern struct req_msg_field RMF_FILE_SECCTX_NAME; extern struct req_msg_field RMF_FILE_SECCTX; -extern struct req_msg_field RMF_FID_ARRAY; /* * connection handle received in MDS_CONNECT request. 
@@ -292,7 +291,6 @@ extern struct req_msg_field RMF_HSM_USER_STATE; extern struct req_msg_field RMF_HSM_STATE_SET; extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; extern struct req_msg_field RMF_MDS_HSM_REQUEST; -extern struct req_msg_field RMF_SELINUX_POL; /* seq-mgr fields */ extern struct req_msg_field RMF_SEQ_OPC; @@ -315,12 +313,10 @@ extern struct req_msg_field RMF_OBD_IOOBJ; extern struct req_msg_field RMF_OBD_ID; extern struct req_msg_field RMF_FID; extern struct req_msg_field RMF_NIOBUF_REMOTE; -extern struct req_msg_field RMF_NIOBUF_INLINE; extern struct req_msg_field RMF_RCS; extern struct req_msg_field RMF_FIEMAP_KEY; extern struct req_msg_field RMF_FIEMAP_VAL; extern struct req_msg_field RMF_OST_ID; -extern struct req_msg_field RMF_SHORT_IO; /* MGS config read message format */ extern struct req_msg_field RMF_MGS_CONFIG_BODY; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h deleted file mode 100644 index 3eba040fac690..0000000000000 --- a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h +++ /dev/null @@ -1,375 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2017, Intel Corporation. - */ -/* - * lustre/include/lustre_scrub.h - * - * Shared definitions and declarations for Lustre OI scrub. 
- * - * Author: Fan Yong - */ - -#ifndef _LUSTRE_SCRUB_H -# define _LUSTRE_SCRUB_H - -#include -#include - -#define OSD_OI_FID_OID_BITS_MAX 10 -#define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX) -#define SCRUB_OI_BITMAP_SIZE (OSD_OI_FID_NR_MAX >> 3) -#define PFID_STRIPE_IDX_BITS 16 -#define PFID_STRIPE_COUNT_MASK ((1 << PFID_STRIPE_IDX_BITS) - 1) - -#define SCRUB_MAGIC_V1 0x4C5FD252 -#define SCRUB_CHECKPOINT_INTERVAL 60 -#define SCRUB_WINDOW_SIZE 1024 - -enum scrub_next_status { - /* exit current loop and process next group */ - SCRUB_NEXT_BREAK = 1, - - /* skip current object and process next bit */ - SCRUB_NEXT_CONTINUE = 2, - - /* exit all the loops */ - SCRUB_NEXT_EXIT = 3, - - /* wait for free cache slot */ - SCRUB_NEXT_WAIT = 4, - - /* simulate system crash during OI scrub */ - SCRUB_NEXT_CRASH = 5, - - /* simulate failure during OI scrub */ - SCRUB_NEXT_FATAL = 6, - - /* new created object, no scrub on it */ - SCRUB_NEXT_NOSCRUB = 7, - - /* the object has no FID-in-LMA */ - SCRUB_NEXT_NOLMA = 8, - - /* for OST-object */ - SCRUB_NEXT_OSTOBJ = 9, - - /* old OST-object, no LMA or no FID-on-OST flags in LMA */ - SCRUB_NEXT_OSTOBJ_OLD = 10, -}; - -enum scrub_local_file_flags { - SLFF_SCAN_SUBITEMS = 0x0001, - SLFF_HIDE_FID = 0x0002, - SLFF_SHOW_NAME = 0x0004, - SLFF_NO_OI = 0x0008, - SLFF_IDX_IN_FID = 0x0010, -}; - -enum scrub_status { - /* The scrub file is new created, for new MDT, upgrading from old disk, - * or re-creating the scrub file manually. */ - SS_INIT = 0, - - /* The scrub is checking/repairing the OI files. */ - SS_SCANNING = 1, - - /* The scrub checked/repaired the OI files successfully. */ - SS_COMPLETED = 2, - - /* The scrub failed to check/repair the OI files. */ - SS_FAILED = 3, - - /* The scrub is stopped manually, the OI files may be inconsistent. */ - SS_STOPPED = 4, - - /* The scrub is paused automatically when umount. */ - SS_PAUSED = 5, - - /* The scrub crashed during the scanning, should be restarted. */ - SS_CRASHED = 6, -}; - -enum scrub_flags { - /* OI files have been recreated, OI mappings should be re-inserted. */ - SF_RECREATED = 0x0000000000000001ULL, - - /* OI files are invalid, should be rebuild ASAP */ - SF_INCONSISTENT = 0x0000000000000002ULL, - - /* OI scrub is triggered automatically. */ - SF_AUTO = 0x0000000000000004ULL, - - /* The device is upgraded from 1.8 format. */ - SF_UPGRADE = 0x0000000000000008ULL, -}; - -enum scrub_param { - /* Exit when fail. */ - SP_FAILOUT = 0x0001, - - /* Check only without repairing. */ - SP_DRYRUN = 0x0002, -}; - -enum scrub_start { - /* Set failout flag. */ - SS_SET_FAILOUT = 0x00000001, - - /* Clear failout flag. */ - SS_CLEAR_FAILOUT = 0x00000002, - - /* Reset scrub start position. */ - SS_RESET = 0x00000004, - - /* Trigger full scrub automatically. */ - SS_AUTO_FULL = 0x00000008, - - /* Trigger partial scrub automatically. */ - SS_AUTO_PARTIAL = 0x00000010, - - /* Set dryrun flag. */ - SS_SET_DRYRUN = 0x00000020, - - /* Clear dryrun flag. */ - SS_CLEAR_DRYRUN = 0x00000040, -}; - -enum osd_lf_flags { - OLF_SCAN_SUBITEMS = 0x0001, - OLF_HIDE_FID = 0x0002, - OLF_SHOW_NAME = 0x0004, - OLF_NO_OI = 0x0008, - OLF_IDX_IN_FID = 0x0010, - OLF_NOT_BACKUP = 0x0020, -}; - -/* There are some overhead to detect OI inconsistency automatically - * during normal RPC handling. We do not want to always auto detect - * OI inconsistency especailly when OI scrub just done recently. - * - * The 'auto_scrub' defines the time (united as second) interval to - * enable auto detect OI inconsistency since last OI scurb done. 
*/ -enum auto_scrub { - /* Disable auto scrub. */ - AS_NEVER = 0, - - /* 1 second is too short interval, it is almost equal to always auto - * detect inconsistent OI, usually used for test. */ - AS_ALWAYS = 1, - - /* Enable auto detect OI inconsistency one month (60 * 60 * 24 * 30) - * after last OI scrub. */ - AS_DEFAULT = 2592000LL, -}; - -struct scrub_file { - /* 128-bit uuid for volume. */ - __u8 sf_uuid[16]; - - /* See 'enum scrub_flags'. */ - __u64 sf_flags; - - /* The scrub magic. */ - __u32 sf_magic; - - /* See 'enum scrub_status'. */ - __u16 sf_status; - - /* See 'enum scrub_param'. */ - __u16 sf_param; - - /* The time for the last OI scrub completed. */ - time64_t sf_time_last_complete; - - /* The ttime for the latest OI scrub ran. */ - time64_t sf_time_latest_start; - - /* The time for the last OI scrub checkpoint. */ - time64_t sf_time_last_checkpoint; - - /* The position for the latest OI scrub started from. */ - __u64 sf_pos_latest_start; - - /* The position for the last OI scrub checkpoint. */ - __u64 sf_pos_last_checkpoint; - - /* The position for the first should be updated object. */ - __u64 sf_pos_first_inconsistent; - - /* How many objects have been checked. */ - __u64 sf_items_checked; - - /* How many objects have been updated. */ - __u64 sf_items_updated; - - /* How many objects failed to be processed. */ - __u64 sf_items_failed; - - /* How many prior objects have been updated during scanning. */ - __u64 sf_items_updated_prior; - - /* How many objects marked as LDISKFS_STATE_LUSTRE_NOSCRUB. */ - __u64 sf_items_noscrub; - - /* How many IGIF objects. */ - __u64 sf_items_igif; - - /* How long the OI scrub has run in seconds. Do NOT change - * to time64_t since this breaks backwards compatibility. - * It shouldn't take more than 136 years to complete :-) - */ - time_t sf_run_time; - - /* How many completed OI scrub ran on the device. */ - __u32 sf_success_count; - - /* How many OI files. */ - __u16 sf_oi_count; - - /* Keep the flags after scrub reset. See 'enum scrub_internal_flags' */ - __u16 sf_internal_flags; - - __u32 sf_reserved_1; - __u64 sf_reserved_2[16]; - - /* Bitmap for OI files recreated case. */ - __u8 sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE]; -}; - -struct lustre_scrub { - /* Object for the scrub file. */ - struct dt_object *os_obj; - - struct ptlrpc_thread os_thread; - struct list_head os_inconsistent_items; - - /* write lock for scrub prep/update/post/checkpoint, - * read lock for scrub dump. */ - struct rw_semaphore os_rwsem; - spinlock_t os_lock; - - /* Scrub file in memory. */ - struct scrub_file os_file; - - /* Buffer for scrub file load/store. */ - struct scrub_file os_file_disk; - - const char *os_name; - - /* The time for last checkpoint, seconds */ - time64_t os_time_last_checkpoint; - - /* The time for next checkpoint, seconds */ - time64_t os_time_next_checkpoint; - - /* How many objects have been checked since last checkpoint. */ - __u64 os_new_checked; - __u64 os_pos_current; - __u32 os_start_flags; - unsigned int os_in_prior:1, /* process inconsistent item - * found by RPC prior */ - os_waiting:1, /* Waiting for scan window. */ - os_full_speed:1, /* run w/o speed limit */ - os_paused:1, /* The scrub is paused. 
*/ - os_convert_igif:1, - os_partial_scan:1, - os_in_join:1, - os_full_scrub:1; -}; - -#define INDEX_BACKUP_MAGIC_V1 0x1E41F208 -#define INDEX_BACKUP_BUFSIZE (4096 * 4) - -enum lustre_index_backup_policy { - /* By default, do not backup the index */ - LIBP_NONE = 0, - - /* Backup the dirty index objects when umount */ - LIBP_AUTO = 1, -}; - -struct lustre_index_backup_header { - __u32 libh_magic; - __u32 libh_count; - __u32 libh_keysize; - __u32 libh_recsize; - struct lu_fid libh_owner; - __u64 libh_pad[60]; /* keep header 512 bytes aligned */ -}; - -struct lustre_index_backup_unit { - struct list_head libu_link; - struct lu_fid libu_fid; - __u32 libu_keysize; - __u32 libu_recsize; -}; - -struct lustre_index_restore_unit { - struct list_head liru_link; - struct lu_fid liru_pfid; - struct lu_fid liru_cfid; - __u64 liru_clid; - int liru_len; - char liru_name[0]; -}; - -void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid); -void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags); -int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub); -int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub); -int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub); -int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, - void *data, __u32 flags); -void scrub_stop(struct lustre_scrub *scrub); -void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub); - -int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, - const struct lu_fid *cfid, __u64 child, - const char *name, int namelen); - -int lustre_index_register(struct dt_device *dev, const char *devname, - struct list_head *head, spinlock_t *lock, int *guard, - const struct lu_fid *fid, - __u32 keysize, __u32 recsize); - -void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, - const char *devname, struct list_head *head, - spinlock_t *lock, int *guard, bool backup); -int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, - const struct lu_fid *parent_fid, - const struct lu_fid *tgt_fid, - const struct lu_fid *bak_fid, const char *name, - struct list_head *head, spinlock_t *lock, - char *buf, int bufsize); - -static inline void lustre_fid2lbx(char *buf, const struct lu_fid *fid, int len) -{ - snprintf(buf, len, DFID_NOBRACE".lbx", PFID(fid)); -} - -static inline const char *osd_scrub2name(struct lustre_scrub *scrub) -{ - return scrub->os_name; -} -#endif /* _LUSTRE_SCRUB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h index 6a69d01150aa1..7e6f490854911 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -549,7 +549,7 @@ struct ptlrpc_cli_ctx { atomic_t cc_refcount; struct ptlrpc_sec *cc_sec; struct ptlrpc_ctx_ops *cc_ops; - time64_t cc_expire; /* in seconds */ + cfs_time_t cc_expire; /* in seconds */ unsigned int cc_early_expire:1; unsigned long cc_flags; struct vfs_cred cc_vcred; @@ -869,17 +869,6 @@ struct ptlrpc_sec { /** owning import */ struct obd_import *ps_import; spinlock_t ps_lock; - /** mtime of SELinux policy file */ - ktime_t ps_sepol_mtime; - /** next check time of SELinux policy file */ - ktime_t ps_sepol_checknext; - /** - * SELinux policy info - * sepol string format is: - * ::: - */ - char ps_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH - + 1]; /* * garbage collection @@ -1103,7 +1092,6 @@ int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); void sptlrpc_request_out_callback(struct ptlrpc_request *req); -int sptlrpc_get_sepol(struct ptlrpc_request *req); /* * exported higher interface of import & request @@ -1121,7 +1109,6 @@ void sptlrpc_import_flush_all_ctx(struct obd_import *imp); int sptlrpc_req_get_ctx(struct ptlrpc_request *req); void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); -int sptlrpc_export_update_ctx(struct obd_export *exp); int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); @@ -1206,6 +1193,10 @@ int sptlrpc_current_user_desc_size(void); int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + /** @} sptlrpc */ #endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h index 96dcd493f5f33..8f8b375e64c25 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2016, Intel Corporation. * * Copyright 2015 Cray Inc, all rights reserved. * Author: Ben Evans. 
@@ -48,11 +48,10 @@ #ifndef _LUSTRE_SWAB_H_ #define _LUSTRE_SWAB_H_ -#include +#include void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); -void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent); void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); void lustre_swab_connect(struct obd_connect_data *ocd); void lustre_swab_hsm_user_state(struct hsm_user_state *hus); @@ -93,13 +92,11 @@ void lustre_swab_obdo(struct obdo *o); void lustre_swab_ost_body(struct ost_body *b); void lustre_swab_ost_last_id(__u64 *id); void lustre_swab_fiemap(struct fiemap *fiemap); -void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info); void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, int stripe_count); -void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size); void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); void lustre_swab_idx_info(struct idx_info *ii); void lustre_swab_lip_header(struct lu_idxpage *lip); @@ -121,7 +118,6 @@ void lustre_swab_object_update_result(struct object_update_result *our); void lustre_swab_object_update_reply(struct object_update_reply *our); void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); void lustre_swab_close_data(struct close_data *data); -void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); void lustre_swab_lmv_user_md(struct lmv_user_md *lum); void lustre_swab_ladvise(struct lu_ladvise *ladvise); void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h index 78cd3d4bfdd51..968cc51028d86 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_update.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * lustre/include/lustre_update.h @@ -454,9 +454,6 @@ int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_size, const struct lu_fid *fid, const char *name, const int bufsize); -int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, - size_t *max_update_size, const struct lu_fid *fid, - const int bufsize); int out_read_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_length, const struct lu_fid *fid, size_t size, loff_t pos); diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/lustre_ver.h similarity index 83% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h rename to drivers/staging/lustrefsx/lustre/include/lustre_ver.h index 90aa25d8aab8a..0557c2dd554e5 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ver.h @@ -23,9 +23,15 @@ #define LUSTRE_VERSION_CODE \ OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) +/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches + * by this amount (set in lustre/autoconf/lustre-version.ac). 
*/ +#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) + +#ifdef __KERNEL__ /* If lustre version of client and servers it connects to differs by more * than this amount, client would issue a warning. * (set in lustre/autoconf/lustre-version.ac) */ #define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 50, 0) +#endif #endif diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h index a5f994e36d50b..d64d243ff8988 100644 --- a/drivers/staging/lustrefsx/lustre/include/md_object.h +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,7 +74,6 @@ enum ma_valid { MA_HSM = 1 << 6, MA_PFID = 1 << 7, MA_LMV_DEF = 1 << 8, - MA_SOM = 1 << 9, }; typedef enum { @@ -109,47 +108,34 @@ struct md_hsm { __u64 mh_arch_ver; }; - -/* memory structure for SOM attributes - * for fields description see the on disk structure som_attrs - * which is defined in lustre_idl.h - */ -struct md_som { - __u16 ms_valid; - __u64 ms_size; - __u64 ms_blocks; -}; - struct md_attr { - __u64 ma_valid; - __u64 ma_need; - __u64 ma_attr_flags; - struct lu_attr ma_attr; - struct lu_fid ma_pfid; - struct md_hsm ma_hsm; - struct md_som ma_som; - struct lov_mds_md *ma_lmm; - union lmv_mds_md *ma_lmv; - void *ma_acl; - int ma_lmm_size; - int ma_lmv_size; - int ma_acl_size; - int ma_enable_chprojid_gid; + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_acl_size; }; /** Additional parameters for create */ struct md_op_spec { - union { - /** symlink target */ - const char *sp_symname; - /** eadata for regular files */ - struct md_spec_reg { - void *eadata; - int eadatalen; - } sp_ea; - } u; - - /** Open flags from client: such as MDS_OPEN_CREAT, and others. */ + union { + /** symlink target */ + const char *sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + const void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ __u64 sp_cr_flags; /* File security context for creates. */ @@ -164,30 +150,10 @@ struct md_op_spec { sp_permitted:1, /* do not check permission */ sp_migrate_close:1; /* close the file during migrate */ /** Current lock mode for parent dir where create is performing. */ - mdl_mode_t sp_cr_mode; - - /** to create directory */ - const struct dt_index_features *sp_feat; -}; - -enum md_layout_opc { - MD_LAYOUT_NOP = 0, - MD_LAYOUT_WRITE, /* FLR: write the file */ - MD_LAYOUT_RESYNC, /* FLR: resync starts */ - MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */ -}; + mdl_mode_t sp_cr_mode; -/** - * Parameters for layout change API. - */ -struct md_layout_change { - enum md_layout_opc mlc_opc; - __u16 mlc_mirror_id; - struct layout_intent *mlc_intent; - struct lu_buf mlc_buf; - struct lustre_som_attrs mlc_som; - size_t mlc_resync_count; - __u32 *mlc_resync_ids; + /** to create directory */ + const struct dt_index_features *sp_feat; }; union ldlm_policy_data; @@ -195,53 +161,51 @@ union ldlm_policy_data; * Operations implemented for each md object (both directory and leaf). 
*/ struct md_object_operations { - int (*moo_permission)(const struct lu_env *env, - struct md_object *pobj, struct md_object *cobj, - struct md_attr *attr, int mask); + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); - int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, - struct md_attr *attr); + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); - int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, - const struct md_attr *attr); + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); - int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf, const char *name); + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); - int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf); + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); - int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, - const struct lu_buf *buf, const char *name, - int fl); + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); - int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, - const char *name); + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); /** This method is used to swap the layouts between 2 objects */ int (*moo_swap_layouts)(const struct lu_env *env, struct md_object *obj1, struct md_object *obj2, __u64 flags); - /** \retval number of bytes actually read upon success */ - int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, - const struct lu_rdpg *rdpg); - - int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf); + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); int (*moo_changelog)(const struct lu_env *env, - enum changelog_rec_type type, - enum changelog_rec_flags clf_flags, + enum changelog_rec_type type, int flags, struct md_device *m, const struct lu_fid *fid); - int (*moo_open)(const struct lu_env *env, - struct md_object *obj, u64 open_flags); + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, int flag); - int (*moo_close)(const struct lu_env *env, struct md_object *obj, - struct md_attr *ma, u64 open_flags); + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, int mode); - int (*moo_object_sync)(const struct lu_env *, struct md_object *); + int (*moo_object_sync)(const struct lu_env *, struct md_object *); int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, struct lustre_handle *lh, @@ -258,62 +222,55 @@ struct md_object_operations { * * The caller should have held layout lock. * - * This API can be extended to support every other layout changing - * operations, such as component {add,del,change}, layout swap, - * layout merge, etc. One of the benefits by doing this is that the MDT - * no longer needs to understand layout. 
- * - * However, layout creation, removal, and fetch should still use - * xattr_{get,set}() because they don't interpret layout on the - * MDT layer. - * * \param[in] env execution environment * \param[in] obj MD object * \param[in] layout data structure to describe the changes to * the MD object's layout + * \param[in] buf buffer containing the client's lovea * * \retval 0 success * \retval -ne error code */ int (*moo_layout_change)(const struct lu_env *env, struct md_object *obj, - struct md_layout_change *layout); + struct layout_intent *layout, + const struct lu_buf *buf); }; /** * Operations implemented for each directory object. */ struct md_dir_operations { - int (*mdo_is_subdir)(const struct lu_env *env, struct md_object *obj, - const struct lu_fid *fid); + int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid, struct lu_fid *sfid); - int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, - const struct lu_name *lname, struct lu_fid *fid, - struct md_op_spec *spec); + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); - mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, - struct md_object *obj, - mdl_mode_t mode); + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); - int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, - const struct lu_name *lname, struct md_object *child, - struct md_op_spec *spec, - struct md_attr *ma); + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); - /** This method is used for creating data object for this meta object*/ - int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, - struct md_object *o, - const struct md_op_spec *spec, - struct md_attr *ma); + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); - int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, - struct md_object *tpobj, const struct lu_fid *lf, - const struct lu_name *lsname, struct md_object *tobj, - const struct lu_name *ltname, struct md_attr *ma); + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); - int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, - struct md_object *src_obj, const struct lu_name *lname, - struct md_attr *ma); + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, struct md_object *cobj, const struct lu_name *lname, @@ -321,8 +278,7 @@ struct md_dir_operations { int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, struct md_object *sobj, const struct lu_name *lname, - struct md_object *tobj, struct md_op_spec *spec, - struct md_attr *ma); + struct md_object *tobj, struct md_attr *ma); }; struct md_device_operations { @@ -330,8 +286,8 @@ struct md_device_operations { int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, struct lu_fid *f); - const 
struct dt_device_param *(*mdo_dtconf_get)(const struct lu_env *e, - struct md_device *m); + int (*mdo_maxeasize_get)(const struct lu_env *env, struct md_device *m, + int *easize); int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, struct obd_statfs *sfs); @@ -390,19 +346,22 @@ static inline struct md_object *md_object_find_slice(const struct lu_env *env, /** md operations */ -static inline int mo_permission(const struct lu_env *env, struct md_object *p, - struct md_object *c, struct md_attr *at, - int mask) +static inline int mo_permission(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + struct md_attr *at, + int mask) { - LASSERT(c->mo_ops->moo_permission); - return c->mo_ops->moo_permission(env, p, c, at, mask); + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, mask); } -static inline int mo_attr_get(const struct lu_env *env, struct md_object *m, - struct md_attr *at) +static inline int mo_attr_get(const struct lu_env *env, + struct md_object *m, + struct md_attr *at) { - LASSERT(m->mo_ops->moo_attr_get); - return m->mo_ops->moo_attr_get(env, m, at); + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); } static inline int mo_readlink(const struct lu_env *env, @@ -415,8 +374,8 @@ static inline int mo_readlink(const struct lu_env *env, static inline int mo_changelog(const struct lu_env *env, enum changelog_rec_type type, - enum changelog_rec_flags clf_flags, - struct md_device *m, const struct lu_fid *fid) + int flags, struct md_device *m, + const struct lu_fid *fid) { struct lu_fid rootfid; struct md_object *root; @@ -431,7 +390,7 @@ static inline int mo_changelog(const struct lu_env *env, RETURN(PTR_ERR(root)); LASSERT(root->mo_ops->moo_changelog); - rc = root->mo_ops->moo_changelog(env, type, clf_flags, m, fid); + rc = root->mo_ops->moo_changelog(env, type, flags, m, fid); lu_object_put(env, &root->mo_lu); @@ -489,11 +448,12 @@ static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) static inline int mo_layout_change(const struct lu_env *env, struct md_object *m, - struct md_layout_change *layout) + struct layout_intent *layout, + const struct lu_buf *buf) { /* need instantiate objects which in the access range */ LASSERT(m->mo_ops->moo_layout_change); - return m->mo_ops->moo_layout_change(env, m, layout); + return m->mo_ops->moo_layout_change(env, m, layout, buf); } static inline int mo_swap_layouts(const struct lu_env *env, @@ -507,18 +467,21 @@ static inline int mo_swap_layouts(const struct lu_env *env, return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); } -static inline int mo_open(const struct lu_env *env, struct md_object *m, - u64 open_flags) +static inline int mo_open(const struct lu_env *env, + struct md_object *m, + int flags) { - LASSERT(m->mo_ops->moo_open); - return m->mo_ops->moo_open(env, m, open_flags); + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, flags); } -static inline int mo_close(const struct lu_env *env, struct md_object *m, - struct md_attr *ma, u64 open_flags) +static inline int mo_close(const struct lu_env *env, + struct md_object *m, + struct md_attr *ma, + int mode) { - LASSERT(m->mo_ops->moo_close); - return m->mo_ops->moo_close(env, m, ma, open_flags); + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, mode); } static inline int mo_readpage(const struct lu_env *env, @@ -613,20 +576,19 @@ static inline int mdo_migrate(const struct lu_env *env, struct md_object *sobj, const struct 
lu_name *lname, struct md_object *tobj, - struct md_op_spec *spec, struct md_attr *ma) { LASSERT(pobj->mo_dir_ops->mdo_migrate); - return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, spec, - ma); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, ma); } static inline int mdo_is_subdir(const struct lu_env *env, - struct md_object *mo, - const struct lu_fid *fid) + struct md_object *mo, + const struct lu_fid *fid, + struct lu_fid *sfid) { - LASSERT(mo->mo_dir_ops->mdo_is_subdir); - return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid); + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid); } static inline int mdo_link(const struct lu_env *env, @@ -649,14 +611,6 @@ static inline int mdo_unlink(const struct lu_env *env, return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); } -static inline int mdo_statfs(const struct lu_env *env, - struct md_device *m, - struct obd_statfs *sfs) -{ - LASSERT(m->md_ops->mdo_statfs); - return m->md_ops->mdo_statfs(env, m, sfs); -} - /** * Used in MDD/OUT layer for object lock rule **/ @@ -670,7 +624,6 @@ enum mdd_object_role { struct dt_device; -void lustre_som_swab(struct lustre_som_attrs *attrs); int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); void lustre_hsm2buf(void *buf, const struct md_hsm *mh); @@ -697,8 +650,6 @@ struct lu_ucred { struct group_info *uc_ginfo; struct md_identity *uc_identity; char uc_jobid[LUSTRE_JOBID_SIZE]; - lnet_nid_t uc_nid; - bool uc_enable_audit; }; struct lu_ucred *lu_ucred(const struct lu_env *env); diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h index 7c00e69a20322..9d49ce5a2a17a 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd.h +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,9 @@ #ifndef __OBD_H #define __OBD_H -#include #include -#include -#include +#include #include #include #ifdef HAVE_SERVER_SUPPORT @@ -102,15 +100,11 @@ struct obd_type { struct md_ops *typ_md_ops; struct proc_dir_entry *typ_procroot; struct proc_dir_entry *typ_procsym; - struct dentry *typ_debugfs_entry; -#ifdef HAVE_SERVER_SUPPORT - bool typ_sym_filter; -#endif + __u32 typ_sym_filter; char *typ_name; int typ_refcnt; struct lu_device_type *typ_lu; spinlock_t obd_type_lock; - struct kobject *typ_kobj; }; struct brw_page { @@ -122,7 +116,7 @@ struct brw_page { struct timeout_item { enum timeout_event ti_event; - time64_t ti_timeout; + cfs_time_t ti_timeout; timeout_cb_t ti_cb; void *ti_cb_data; struct list_head ti_obd_list; @@ -132,15 +126,16 @@ struct timeout_item { #define OBD_MAX_RIF_DEFAULT 8 #define OBD_MAX_RIF_MAX 512 #define OSC_MAX_RIF_MAX 256 -#define OSC_MAX_DIRTY_DEFAULT 2000 /* Arbitrary large value */ +#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) #define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ #define OSC_DEFAULT_RESENDS 10 -/* possible values for lut_sync_lock_cancel */ -enum tgt_sync_lock_cancel { - SYNC_LOCK_CANCEL_NEVER = 0, - SYNC_LOCK_CANCEL_BLOCKING = 1, - SYNC_LOCK_CANCEL_ALWAYS = 2, +/* possible values for fo_sync_lock_cancel */ +enum { + NEVER_SYNC_ON_CANCEL = 0, + BLOCKING_SYNC_ON_CANCEL = 1, + ALWAYS_SYNC_ON_CANCEL = 2, + NUM_SYNC_ON_CANCEL_STATES }; /* @@ -186,17 +181,6 @@ struct client_obd { * run-time if a larger observed size is advertised by the MDT. */ __u32 cl_max_mds_easize; - /* Data-on-MDT specific value to set larger reply buffer for possible - * data read along with open/stat requests. By default it tries to use - * unused space in reply buffer. - * This value is used to ensure that reply buffer has at least as - * much free space as value indicates. That free space is gained from - * LOV EA buffer which is small for DoM files and on big systems can - * provide up to 32KB of extra space in reply buffer. - * Default value is 8K now. - */ - __u32 cl_dom_min_inline_repsize; - enum lustre_sec_part cl_sp_me; enum lustre_sec_part cl_sp_to; struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ @@ -204,6 +188,7 @@ struct client_obd { /* the grant values are protected by loi_list_lock below */ unsigned long cl_dirty_pages; /* all _dirty_ in pages */ unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ + unsigned long cl_dirty_transit; /* dirty synchronous */ unsigned long cl_avail_grant; /* bytes of credit for ost */ unsigned long cl_lost_grant; /* lost credits (trunc) */ /* grant consumed for dirty pages */ @@ -214,10 +199,10 @@ struct client_obd { * grant before trying to dirty a page and unreserve the rest. * See osc_{reserve|unreserve}_grant for details. */ long cl_reserved_grant; - wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ - time64_t cl_next_shrink_grant; /* seconds */ - struct list_head cl_grant_chain; - time64_t cl_grant_shrink_interval; /* seconds */ + struct list_head cl_cache_waiters; /* waiting for cache/grant */ + cfs_time_t cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + int cl_grant_shrink_interval; /* seconds */ /* A chunk is an optimal size used by osc_extent to determine * the extent size. 
A chunk is max(PAGE_SIZE, OST block size) */ @@ -255,9 +240,8 @@ struct client_obd { /* just a sum of the loi/lop pending numbers to be exported by /proc */ atomic_t cl_pending_w_pages; atomic_t cl_pending_r_pages; - u32 cl_max_pages_per_rpc; - u32 cl_max_rpcs_in_flight; - u32 cl_max_short_io_bytes; + __u32 cl_max_pages_per_rpc; + __u32 cl_max_rpcs_in_flight; struct obd_histogram cl_read_rpc_hist; struct obd_histogram cl_write_rpc_hist; struct obd_histogram cl_read_page_hist; @@ -304,6 +288,8 @@ struct client_obd { atomic_t cl_destroy_in_flight; wait_queue_head_t cl_destroy_waitq; + struct mdc_rpc_lock *cl_rpc_lock; + /* modify rpcs in flight * currently used for metadata only */ spinlock_t cl_mod_rpcs_lock; @@ -318,11 +304,8 @@ struct client_obd { struct mutex cl_mgc_mutex; struct local_oid_storage *cl_mgc_los; struct dt_object *cl_mgc_configs_dir; - struct obd_export *cl_mgc_mgsexp; atomic_t cl_mgc_refcount; - /* in-flight control list and total RPCs counter */ - struct list_head cl_flight_waiters; - __u32 cl_rpcs_in_flight; + struct obd_export *cl_mgc_mgsexp; /* checksumming for data sent over the network */ unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ @@ -330,7 +313,7 @@ struct client_obd { /* supported checksum types that are worked out at connect time */ __u32 cl_supp_cksum_types; /* checksum algorithm to be used */ - enum cksum_types cl_cksum_type; + cksum_type_t cl_cksum_type; /* also protected by the poorly named _loi_list_lock lock above */ struct osc_async_rc cl_ar; @@ -344,11 +327,8 @@ struct client_obd { /* ptlrpc work for writeback in ptlrpcd context */ void *cl_writeback_work; void *cl_lru_work; - struct mutex cl_quota_mutex; /* hash tables for osc_quota_info */ struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; - /* the xid of the request updating the hash tables */ - __u64 cl_quota_last_xid; /* Links to the global list of registered changelog devices */ struct list_head cl_chg_dev_linkage; }; @@ -378,8 +358,6 @@ struct ost_pool { /* allow statfs data caching for 1 second */ #define OBD_STATFS_CACHE_SECONDS 1 -/* arbitrary maximum. 
larger would be useless, allows catching bogus input */ -#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */ struct lov_tgt_desc { struct list_head ltd_kill; @@ -393,11 +371,6 @@ struct lov_tgt_desc { ltd_reap:1; /* should this target be deleted */ }; -struct lov_md_tgt_desc { - struct obd_device *lmtd_mdc; - __u32 lmtd_index; -}; - struct lov_obd { struct lov_desc desc; struct lov_tgt_desc **lov_tgts; /* sparse array */ @@ -420,15 +393,10 @@ struct lov_obd { struct cl_client_cache *lov_cache; struct rw_semaphore lov_notify_lock; - /* Data-on-MDT: MDC array */ - struct lov_md_tgt_desc *lov_mdc_tgts; - - struct kobject *lov_tgts_kobj; }; struct lmv_tgt_desc { struct obd_uuid ltd_uuid; - struct obd_device *ltd_obd; struct obd_export *ltd_exp; __u32 ltd_idx; struct mutex ltd_fid_mutex; @@ -439,23 +407,19 @@ struct lmv_obd { struct lu_client_fld lmv_fld; spinlock_t lmv_lock; struct lmv_desc desc; + struct proc_dir_entry *targets_proc_entry; struct mutex lmv_init_mutex; int connected; int max_easize; int max_def_easize; - u32 lmv_statfs_start; - u32 tgts_size; /* size of tgts array */ + __u32 tgts_size; /* size of tgts array */ struct lmv_tgt_desc **tgts; struct obd_connect_data conn_data; - struct kobject *lmv_tgts_kobj; }; -/* Minimum sector size is 512 */ -#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) - struct niobuf_local { __u64 lnb_file_offset; __u32 lnb_page_offset; @@ -464,11 +428,6 @@ struct niobuf_local { int lnb_rc; struct page *lnb_page; void *lnb_data; - __u16 lnb_guards[MAX_GUARD_NUMBER]; - __u16 lnb_guard_rpc:1; - __u16 lnb_guard_disk:1; - /* separate unlock for read path to allow shared access */ - __u16 lnb_locked:1; }; struct tgt_thread_big_cache { @@ -581,7 +540,7 @@ enum obd_notify_event { /* * Data structure used to pass obd_notify()-event to non-obd listeners (llite - * being main example). + * and liblustre being main examples). */ struct obd_notify_upcall { int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, @@ -636,6 +595,7 @@ struct obd_device { * (for /proc/status only!!) */ obd_no_ir:1, /* no imperative recovery. 
*/ obd_process_conf:1, /* device is processing mgs config */ + obd_uses_nid_stats:1, /* maintain per-client OBD stats */ obd_checksum_dump:1; /* dump pages upon cksum error */ /* use separate field as it is set in interrupt to don't mess with @@ -663,7 +623,7 @@ struct obd_device { spinlock_t obd_dev_lock; /* protect OBD bitfield above */ spinlock_t obd_osfs_lock; struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ - time64_t obd_osfs_age; + __u64 obd_osfs_age; __u64 obd_last_committed; struct mutex obd_dev_mutex; struct lvfs_run_ctxt obd_lvfs_ctxt; @@ -675,9 +635,9 @@ struct obd_device { struct obd_export *obd_lwp_export; /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ struct list_head obd_exports_timed; - time64_t obd_eviction_timer; /* for ping evictor */ + time_t obd_eviction_timer; /* for ping evictor */ - atomic_t obd_max_recoverable_clients; + int obd_max_recoverable_clients; atomic_t obd_connected_clients; int obd_stale_clients; /* this lock protects all recovery list_heads, timer and @@ -688,7 +648,7 @@ struct obd_device { int obd_requests_queued_for_recovery; wait_queue_head_t obd_next_transno_waitq; /* protected by obd_recovery_task_lock */ - struct hrtimer obd_recovery_timer; + struct timer_list obd_recovery_timer; /* seconds */ time64_t obd_recovery_start; /* seconds, for lprocfs_status */ @@ -723,17 +683,16 @@ struct obd_device { /* Fields used by LProcFS */ struct lprocfs_stats *obd_stats; + unsigned int obd_cntr_base; + unsigned int obd_md_cntr_base; struct lprocfs_stats *obd_md_stats; - struct dentry *obd_debugfs_entry; struct proc_dir_entry *obd_proc_entry; struct proc_dir_entry *obd_proc_exports_entry; - struct dentry *obd_svc_debugfs_entry; + struct proc_dir_entry *obd_svc_procroot; struct lprocfs_stats *obd_svc_stats; - const struct attribute **obd_attrs; struct lprocfs_vars *obd_vars; - struct ldebugfs_vars *obd_debugfs_vars; atomic_t obd_evict_inprogress; wait_queue_head_t obd_evict_inprogress_waitq; struct list_head obd_evict_list; /* protected with pet_lock */ @@ -750,10 +709,6 @@ struct obd_device { /** * List of outstanding class_incref()'s fo this OBD. For debugging. */ struct lu_ref obd_reference; - - struct kset obd_kset; /* sysfs object collection */ - struct kobj_type obd_ktype; - struct completion obd_kobj_unregister; }; /* get/set_info keys */ @@ -786,17 +741,6 @@ struct obd_device { #define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" #define KEY_OSP_CONNECTED "osp_connected" -/* Flags for op_xvalid */ -enum op_xvalid { - OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ - OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ - OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ - OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ - OP_XVALID_PROJID = BIT(4), /* 0x0010 */ - OP_XVALID_LAZYSIZE = BIT(5), /* 0x0020 */ - OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */ -}; - struct lu_context; static inline int it_to_lock_mode(struct lookup_intent *it) @@ -804,14 +748,15 @@ static inline int it_to_lock_mode(struct lookup_intent *it) /* CREAT needs to be tested before open (both could be set) */ if (it->it_op & IT_CREAT) return LCK_CW; - else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP)) + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | + IT_LAYOUT)) return LCK_CR; - else if (it->it_op & IT_LAYOUT) - return (it->it_flags & FMODE_WRITE) ? 
LCK_EX : LCK_CR; else if (it->it_op & IT_READDIR) return LCK_PR; else if (it->it_op & IT_GETXATTR) return LCK_PR; + else if (it->it_op & IT_SETXATTR) + return LCK_PW; LASSERTF(0, "Invalid it_op: %d\n", it->it_op); return -EINVAL; @@ -823,7 +768,6 @@ enum md_op_flags { MF_MDC_CANCEL_FID3 = 1 << 2, MF_MDC_CANCEL_FID4 = 1 << 3, MF_GET_MDT_IDX = 1 << 4, - MF_GETATTR_BY_FID = 1 << 5, }; enum md_cli_flags { @@ -841,7 +785,7 @@ enum md_cli_flags { */ static inline bool it_has_reply_body(const struct lookup_intent *it) { - return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR); + return it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR); } struct md_op_data { @@ -851,12 +795,10 @@ struct md_op_data { struct lu_fid op_fid4; /* to the operation locks. */ u32 op_mds; /* what mds server open will go to */ __u32 op_mode; - struct lustre_handle op_open_handle; + struct lustre_handle op_handle; s64 op_mod_time; const char *op_name; size_t op_namelen; - struct rw_semaphore *op_mea1_sem; - struct rw_semaphore *op_mea2_sem; struct lmv_stripe_md *op_mea1; struct lmv_stripe_md *op_mea2; __u32 op_suppgids[2]; @@ -868,10 +810,9 @@ struct md_op_data { /* iattr fields and blocks. */ struct iattr op_attr; - enum op_xvalid op_xvalid; /* eXtra validity flags */ loff_t op_attr_blocks; - u64 op_valid; /* OBD_MD_* */ - unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ + __u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ enum md_op_flags op_flags; @@ -886,9 +827,8 @@ struct md_op_data { __u64 op_data_version; struct lustre_handle op_lease_handle; - /* File security context, for creates/metadata ops */ + /* File security context, for creates. */ const char *op_file_secctx_name; - __u32 op_file_secctx_name_size; void *op_file_secctx; __u32 op_file_secctx_size; @@ -900,19 +840,6 @@ struct md_op_data { /* Used by readdir */ unsigned int op_max_pages; - __u16 op_mirror_id; - - /* - * used to access migrating dir: if it's set, assume migration is - * finished, use the new layout to access dir, otherwise use old layout. - * By default it's not set, because new files are created under new - * layout, if we can't find file with name under both old and new - * layout, we are sure file with name doesn't exist, but in reverse - * order there may be a race with creation by others. - */ - bool op_post_migrate; - /* used to access dir with bash hash */ - __u32 op_stripe_index; }; struct md_callback { @@ -984,9 +911,9 @@ struct obd_ops { * about this. 
*/ int (*o_statfs)(const struct lu_env *, struct obd_export *exp, - struct obd_statfs *osfs, time64_t max_age, __u32 flags); + struct obd_statfs *osfs, __u64 max_age, __u32 flags); int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, - time64_t max_age, struct ptlrpc_request_set *set); + __u64 max_age, struct ptlrpc_request_set *set); int (*o_create)(const struct lu_env *env, struct obd_export *exp, struct obdo *oa); int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, @@ -1020,6 +947,8 @@ struct obd_ops { int (*o_quotactl)(struct obd_device *, struct obd_export *, struct obd_quotactl *); + int (*o_ping)(const struct lu_env *, struct obd_export *exp); + /* pools methods */ int (*o_pool_new)(struct obd_device *obd, char *poolname); int (*o_pool_del)(struct obd_device *obd, char *poolname); @@ -1027,6 +956,12 @@ struct obd_ops { char *ostname); int (*o_pool_rem)(struct obd_device *obd, char *poolname, char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); + /* + * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line + * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. + * Also, add a wrapper function in include/linux/obd_class.h. */ }; /* lmv structures */ @@ -1048,7 +983,7 @@ struct md_open_data { }; struct obd_client_handle { - struct lustre_handle och_open_handle; + struct lustre_handle och_fh; struct lu_fid och_fid; struct md_open_data *och_mod; struct lustre_handle och_lease_handle; /* open lock for lease */ @@ -1062,6 +997,18 @@ struct lookup_intent; struct cl_attr; struct md_ops { + /* Every operation from MD_STATS_FIRST_OP up to and including + * MD_STATS_LAST_OP will be counted by EXP_MD_OP_INCREMENT() + * and will appear in /proc/fs/lustre/{lmv,mdc}/.../md_stats. + * Operations after MD_STATS_LAST_OP are excluded from stats. + * There are a few reasons for doing this: we prune the 17 + * counters which will be of minimal use in understanding + * metadata utilization, we save memory by allocating 15 + * instead of 32 counters, we save cycles by not counting. + * + * MD_STATS_FIRST_OP must be the first member of md_ops. 
+ */ +#define MD_STATS_FIRST_OP m_close int (*m_close)(struct obd_export *, struct md_op_data *, struct md_open_data *, struct ptlrpc_request **); @@ -1102,11 +1049,12 @@ struct md_ops { struct ptlrpc_request **); int (*m_setxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, const void *, size_t, unsigned int, - u32, struct ptlrpc_request **); + u64, const char *, const char *, int, int, int, u32, + struct ptlrpc_request **); int (*m_getxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, size_t, struct ptlrpc_request **); + u64, const char *, const char *, int, int, int, + struct ptlrpc_request **); int (*m_intent_getattr_async)(struct obd_export *, struct md_enqueue_info *); @@ -1114,7 +1062,7 @@ struct md_ops { int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, struct lu_fid *, __u64 *bits); - int (*m_file_resync)(struct obd_export *, struct md_op_data *); +#define MD_STATS_LAST_OP m_revalidate_lock int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); int (*m_null_inode)(struct obd_export *, const struct lu_fid *); @@ -1159,8 +1107,6 @@ struct md_ops { struct lu_fid *fid); int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, const union lmv_mds_md *lmv, size_t lmv_size); - int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs, - struct ptlrpc_request_set *set); }; static inline struct md_open_data *obd_mod_alloc(void) @@ -1255,8 +1201,7 @@ static inline int cli_brw_size(struct obd_device *obd) return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; } -/* - * When RPC size or the max RPCs in flight is increased, the max dirty pages +/* when RPC size or the max RPCs in flight is increased, the max dirty pages * of the client should be increased accordingly to avoid sending fragmented * RPCs over the network when the client runs out of the maximum dirty space * when so many RPCs are being generated. @@ -1264,10 +1209,10 @@ static inline int cli_brw_size(struct obd_device *obd) static inline void client_adjust_max_dirty(struct client_obd *cli) { /* initializing */ - if (cli->cl_dirty_max_pages <= 0) { - cli->cl_dirty_max_pages = - (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; - } else { + if (cli->cl_dirty_max_pages <= 0) + cli->cl_dirty_max_pages = (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) + >> PAGE_SHIFT; + else { unsigned long dirty_max = cli->cl_max_rpcs_in_flight * cli->cl_max_pages_per_rpc; @@ -1277,12 +1222,6 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8) cli->cl_dirty_max_pages = cfs_totalram_pages() / 8; - - /* This value is exported to userspace through the max_dirty_mb - * parameter. So we round up the number of pages to make it a round - * number of MBs. 
*/ - cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages, - 1 << (20 - PAGE_SHIFT)); } #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h index 6e807d762c354..6a0cfe8d72fc0 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h @@ -34,12 +34,9 @@ #define __OBD_CKSUM #include #include -#include +#include -int obd_t10_cksum_speed(const char *obd_name, - enum cksum_types cksum_type); - -static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type) +static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) { switch (cksum_type) { case OBD_CKSUM_CRC32: @@ -55,23 +52,58 @@ static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type) return 0; } -u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type); +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. */ +static inline u32 cksum_type_pack(cksum_type_t cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | + OBD_CKSUM_CRC32 | + OBD_CKSUM_ADLER)))) + CWARN("unknown cksum type %x\n", cksum_type); + + return flag; +} -static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags) +static inline cksum_type_t cksum_type_unpack(u32 o_flags) { switch (o_flags & OBD_FL_CKSUM_ALL) { case OBD_FL_CKSUM_CRC32C: return OBD_CKSUM_CRC32C; case OBD_FL_CKSUM_CRC32: return OBD_CKSUM_CRC32; - case OBD_FL_CKSUM_T10IP512: - return OBD_CKSUM_T10IP512; - case OBD_FL_CKSUM_T10IP4K: - return OBD_CKSUM_T10IP4K; - case OBD_FL_CKSUM_T10CRC512: - return OBD_CKSUM_T10CRC512; - case OBD_FL_CKSUM_T10CRC4K: - return OBD_CKSUM_T10CRC4K; default: break; } @@ -83,9 +115,9 @@ static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags) * 1.8 supported ADLER it is base and not depend on hw * Client uses all available local algos */ -static inline enum cksum_types obd_cksum_types_supported_client(void) +static inline cksum_type_t cksum_types_supported_client(void) { - enum cksum_types ret = OBD_CKSUM_ADLER; + cksum_type_t ret = OBD_CKSUM_ADLER; CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), @@ -97,13 +129,32 @@ static inline enum cksum_types obd_cksum_types_supported_client(void) if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) ret |= OBD_CKSUM_CRC32; - /* Client 
support all kinds of T10 checksum */ - ret |= OBD_CKSUM_T10_ALL; + return ret; +} + +/* Server uses algos that perform at 50% or better of the Adler */ +static inline enum cksum_types cksum_types_supported_server(void) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; return ret; } -enum cksum_types obd_cksum_types_supported_server(const char *obd_name); /* Select the best checksum algorithm among those supplied in the cksum_types * input. @@ -112,69 +163,13 @@ enum cksum_types obd_cksum_types_supported_server(const char *obd_name); * checksum type due to its benchmarking at libcfs module load. * Caution is advised, however, since what is fastest on a single client may * not be the fastest or most efficient algorithm on the server. */ -static inline enum cksum_types -obd_cksum_type_select(const char *obd_name, enum cksum_types cksum_types) +static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) { - u32 flag = obd_cksum_type_pack(obd_name, cksum_types); - - return obd_cksum_type_unpack(flag); + return cksum_type_unpack(cksum_type_pack(cksum_types)); } /* Checksum algorithm names. Must be defined in the same order as the * OBD_CKSUM_* flags. */ -#define DECLARE_CKSUM_NAME const char *cksum_name[] = {"crc32", "adler", \ - "crc32c", "reserved", "t10ip512", "t10ip4K", "t10crc512", "t10crc4K"} - -typedef __u16 (obd_dif_csum_fn) (void *, unsigned int); - -__u16 obd_dif_crc_fn(void *data, unsigned int len); -__u16 obd_dif_ip_fn(void *data, unsigned int len); -int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, - __u32 offset, __u32 length, - __u16 *guard_start, int guard_number, - int *used_number, int sector_size, - obd_dif_csum_fn *fn); -/* - * If checksum type is one T10 checksum types, init the csum_fn and sector - * size. Otherwise, init them to NULL/zero. 
- */ -static inline void obd_t10_cksum2dif(enum cksum_types cksum_type, - obd_dif_csum_fn **fn, int *sector_size) -{ - *fn = NULL; - *sector_size = 0; - -#if IS_ENABLED(CONFIG_CRC_T10DIF) - switch (cksum_type) { - case OBD_CKSUM_T10IP512: - *fn = obd_dif_ip_fn; - *sector_size = 512; - break; - case OBD_CKSUM_T10IP4K: - *fn = obd_dif_ip_fn; - *sector_size = 4096; - break; - case OBD_CKSUM_T10CRC512: - *fn = obd_dif_crc_fn; - *sector_size = 512; - break; - case OBD_CKSUM_T10CRC4K: - *fn = obd_dif_crc_fn; - *sector_size = 4096; - break; - default: - break; - } -#endif /* CONFIG_CRC_T10DIF */ -} - -enum obd_t10_cksum_type { - OBD_T10_CKSUM_UNKNOWN = 0, - OBD_T10_CKSUM_IP512, - OBD_T10_CKSUM_IP4K, - OBD_T10_CKSUM_CRC512, - OBD_T10_CKSUM_CRC4K, - OBD_T10_CKSUM_MAX -}; +#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index ce46183f9d4da..da40a4e38f91b 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,13 +32,13 @@ #ifndef __CLASS_OBD_H #define __CLASS_OBD_H -#include + #include #include #include #include #include -#include +#include #include #define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay @@ -47,36 +47,27 @@ * obd_osfs_age */ #define OBD_STATFS_FOR_MDT0 0x0004 /* The statfs is only for retrieving * information from MDT0. 
*/ -#define OBD_STATFS_SUM 0x0008 /* get aggregated statfs from MDT */ extern rwlock_t obd_dev_lock; /* OBD Operations Declarations */ +extern struct obd_device *class_conn2obd(struct lustre_handle *); extern struct obd_device *class_exp2obd(struct obd_export *); extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); -int lustre_get_jobid(char *jobid, size_t len); -void lustre_jobid_clear(const char *jobid); -void jobid_cache_fini(void); -int jobid_cache_init(void); +extern int lustre_get_jobid(char *jobid); struct lu_device_type; /* genops.c */ struct obd_export *class_conn2export(struct lustre_handle *); -struct kobject *class_setup_tunables(const char *name); -int class_register_type(const struct obd_ops *dt_ops, - const struct md_ops *md_ops, bool enable_proc, - struct ldebugfs_vars *module_vars, +int class_register_type(struct obd_ops *, struct md_ops *, bool enable_proc, + struct lprocfs_vars *module_vars, const char *nm, struct lu_device_type *ldt); int class_unregister_type(const char *nm); -struct obd_device *class_newdev(const char *type_name, const char *name, - const char *uuid); -int class_register_device(struct obd_device *obd); -void class_unregister_device(struct obd_device *obd); -void class_free_dev(struct obd_device *obd); +struct obd_device *class_newdev(const char *type_name, const char *name); +void class_release_dev(struct obd_device *obd); -struct obd_device *class_dev_by_str(const char *str); int class_name2dev(const char *name); struct obd_device *class_name2obd(const char *name); int class_uuid2dev(struct obd_uuid *uuid); @@ -92,17 +83,7 @@ int get_devices_count(void); int class_notify_sptlrpc_conf(const char *fsname, int namelen); -static inline char *obd_export_nid2str(struct obd_export *exp) -{ - return exp->exp_connection == NULL ? - "<unknown>" : libcfs_nid2str(exp->exp_connection->c_peer.nid); -} - -static inline char *obd_import_nid2str(struct obd_import *imp) -{ - return imp->imp_connection == NULL ?
- "<unknown>" : libcfs_nid2str(imp->imp_connection->c_peer.nid); -} +char *obd_export_nid2str(struct obd_export *exp); int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); @@ -152,9 +133,8 @@ struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, const char *new_name); void print_lustre_cfg(struct lustre_cfg *lcfg); int class_process_config(struct lustre_cfg *lcfg); -ssize_t class_set_global(const char *param); -ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, - struct kobject *kobj); +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data); int class_attach(struct lustre_cfg *lcfg); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); @@ -184,11 +164,12 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); #define CFG_F_START 0x01 /* Set when we start updating from a log */ #define CFG_F_MARKER 0x02 /* We are within a marker */ #define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ #define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { - unsigned long cfg_instance; + void *cfg_instance; struct super_block *cfg_sb; struct obd_uuid cfg_uuid; llog_cb_t cfg_callback; @@ -200,19 +181,6 @@ struct config_llog_instance { int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, char *name, struct config_llog_instance *cfg); -/** - * Generate a unique configuration instance for this mount - * - * Temporary hack to bypass ASLR in 4.15+ kernels, a better fix soon. - * For now, use the same value as before - the superblock pointer value. - * - * Using the client UUID would be an option, but it needs more testing. - */ -static inline unsigned long ll_get_cfg_instance(struct super_block *sb) -{ - return (unsigned long)sb; -} - #define CONFIG_SUB_SPTLRPC 0x01 #define CONFIG_SUB_RECOVER 0x02 #define CONFIG_SUB_PARAMS 0x04 @@ -247,7 +215,7 @@ static inline bool logname_is_barrier(const char *logname) struct config_llog_data { struct ldlm_res_id cld_resid; struct config_llog_instance cld_cfg; - struct list_head cld_list_chain;/* on config_llog_list */ + struct list_head cld_list_chain; atomic_t cld_refcount; struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ struct config_llog_data *cld_params; /* common parameters log */ @@ -347,8 +315,6 @@ struct obd_export *class_export_get(struct obd_export *exp); void class_export_put(struct obd_export *exp); struct obd_export *class_new_export(struct obd_device *obddev, struct obd_uuid *cluuid); -struct obd_export *class_new_export_self(struct obd_device *obd, - struct obd_uuid *uuid); void class_unlink_export(struct obd_export *exp); struct obd_import *class_import_get(struct obd_import *); @@ -368,7 +334,6 @@ void class_disconnect_exports(struct obd_device *obddev); int class_manual_cleanup(struct obd_device *obd); void class_disconnect_stale_exports(struct obd_device *, int (*test_export)(struct obd_export *)); - static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) { return ((obd->obd_fail ?
OBD_OPT_FAILOVER : 0) | @@ -403,25 +368,25 @@ void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid); void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid); void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); +#define OBT(dev) (dev)->obd_type #define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op #define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op +#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op -static inline int obd_check_dev(struct obd_device *obd) -{ - if (!obd) { - CERROR("NULL device\n"); - return -ENODEV; - } - return 0; -} +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +#define OBD_CHECK_DEV(obd) \ +do { \ + if (!(obd)) { \ + CERROR("NULL device\n"); \ + RETURN(-ENODEV); \ + } \ +} while (0) /* ensure obd_setup and !obd_stopping */ #define OBD_CHECK_DEV_ACTIVE(obd) \ do { \ - rc = obd_check_dev(obd); \ - if (rc) \ - return rc; \ - \ + OBD_CHECK_DEV(obd); \ if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ CERROR("Device %d not setup\n", \ (obd)->obd_minor); \ @@ -430,6 +395,61 @@ do { \ } while (0) +#ifdef CONFIG_PROC_FS +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)NULL)->o_iocontrol)) + +/* The '- 1' below is for o_owner. */ +#define NUM_OBD_STATS \ + (sizeof(struct obd_ops) / \ + sizeof(((struct obd_ops *)NULL)->o_iocontrol) - 1) + +#define OBD_COUNTER_INCREMENT(obd, op) \ + lprocfs_counter_incr((obd)->obd_stats, \ + (obd)->obd_cntr_base + OBD_COUNTER_OFFSET(op)) + +#define EXP_COUNTER_INCREMENT(exp, op) \ + do { \ + unsigned int _off; \ + _off = (exp)->exp_obd->obd_cntr_base + OBD_COUNTER_OFFSET(op); \ + lprocfs_counter_incr((exp)->exp_obd->obd_stats, _off); \ + if ((exp)->exp_obd->obd_uses_nid_stats && \ + (exp)->exp_nid_stats != NULL) \ + lprocfs_counter_incr((exp)->exp_nid_stats->nid_stats, \ + _off); \ + } while (0) + +#define _MD_COUNTER_OFFSET(m_op) \ + ((offsetof(struct md_ops, m_op) - \ + offsetof(struct md_ops, MD_STATS_FIRST_OP)) / \ + sizeof(((struct md_ops *)NULL)->MD_STATS_FIRST_OP)) + +#define MD_COUNTER_OFFSET(op) _MD_COUNTER_OFFSET(m_ ## op) + +#define NUM_MD_STATS \ + (_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) - \ + _MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) + 1) + +/* Note that we only increment md counters for ops whose offset is less + * than NUM_MD_STATS. This is explained in a comment in the definition + * of struct md_ops. 
*/ +#define EXP_MD_COUNTER_INCREMENT(exp, op) \ + do { \ + if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) \ + lprocfs_counter_incr((exp)->exp_obd->obd_md_stats, \ + (exp)->exp_obd->obd_md_cntr_base + \ + MD_COUNTER_OFFSET(op)); \ + } while (0) + +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#define EXP_COUNTER_INCREMENT(exp, op) +#define EXP_MD_COUNTER_INCREMENT(exp, op) +#endif + static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) { /* Always add in ldlm_stats */ @@ -445,16 +465,57 @@ static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) tmp->nid_ldlm_stats); } -static inline int exp_check_ops(struct obd_export *exp) -{ - if (exp == NULL) { - RETURN(-ENODEV); - } - if (exp->exp_obd == NULL || !exp->exp_obd->obd_type) { - RETURN(-EOPNOTSUPP); - } - RETURN(0); -} +#define EXP_CHECK_MD_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ + CERROR("%s: obd_" #op ": dev %d no operation\n", \ + (exp)->exp_obd->obd_name, \ + (exp)->exp_obd->obd_minor); \ + RETURN(-EOPNOTSUPP); \ + } \ +} while (0) + + +#define OBD_CHECK_DT_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !OBP((obd), op)) { \ + if (err) \ + CERROR("%s: no obd_" #op " operation\n", \ + obd->obd_name); \ + RETURN(err); \ + } \ +} while (0) + +#define EXP_CHECK_DT_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + OBD_CHECK_DT_OP((exp)->exp_obd, op, -EOPNOTSUPP); \ +} while (0) + +#define CTXT_CHECK_OP(ctxt, op, err) \ +do { \ + if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ + if (err) \ + CERROR("%s: no lop_" #op "operation\n", \ + ctxt->loc_obd->obd_name); \ + RETURN(err); \ + } \ +} while (0) static inline int class_devno_max(void) { @@ -468,15 +529,8 @@ static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_get_info) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + EXP_CHECK_DT_OP(exp, get_info); + EXP_COUNTER_INCREMENT(exp, get_info); rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); RETURN(rc); @@ -491,15 +545,8 @@ static inline int obd_set_info_async(const struct lu_env *env, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_set_info_async) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + EXP_CHECK_DT_OP(exp, set_info_async); + EXP_COUNTER_INCREMENT(exp, set_info_async); rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, val, set); @@ -523,14 +570,18 @@ static inline int obd_set_info_async(const struct lu_env *env, * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
*/ + +#define DECLARE_LU_VARS(ldt, d) \ + struct lu_device_type *ldt; \ + struct lu_device *d + static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) { int rc; - struct lu_device_type *ldt = obd->obd_type->typ_lu; - struct lu_device *d; - + DECLARE_LU_VARS(ldt, d); ENTRY; + ldt = obd->obd_type->typ_lu; if (ldt != NULL) { struct lu_context session_ctx; struct lu_env env; @@ -554,11 +605,8 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) lu_context_fini(&session_ctx); } else { - if (!obd->obd_type->typ_dt_ops->o_setup) { - CERROR("%s: no %s operation\n", obd->obd_name, - __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, setup); rc = OBP(obd, setup)(obd, cfg); } RETURN(rc); @@ -567,30 +615,23 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) static inline int obd_precleanup(struct obd_device *obd) { int rc; - struct lu_device_type *ldt = obd->obd_type->typ_lu; - struct lu_device *d = obd->obd_lu_dev; - + DECLARE_LU_VARS(ldt, d); ENTRY; + OBD_CHECK_DEV(obd); + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { - struct lu_env *env = lu_env_find(); - struct lu_env _env; - - if (!env) { - env = &_env; - rc = lu_env_init(env, ldt->ldt_ctx_tags); - LASSERT(rc == 0); - lu_env_add(env); - } - ldt->ldt_ops->ldto_device_fini(env, d); - if (env == &_env) { - lu_env_remove(env); - lu_env_fini(env); + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_fini(&env, d); + lu_env_fini(&env); } } - - if (!obd->obd_type->typ_dt_ops->o_precleanup) - RETURN(0); + OBD_CHECK_DT_OP(obd, precleanup, 0); + OBD_COUNTER_INCREMENT(obd, precleanup); rc = OBP(obd, precleanup)(obd); RETURN(rc); @@ -599,10 +640,13 @@ static inline int obd_precleanup(struct obd_device *obd) static inline int obd_cleanup(struct obd_device *obd) { int rc; - struct lu_device_type *ldt = obd->obd_type->typ_lu; - struct lu_device *d = obd->obd_lu_dev; + DECLARE_LU_VARS(ldt, d); + ENTRY; - ENTRY; + OBD_CHECK_DEV(obd); + + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { struct lu_env env; @@ -613,8 +657,8 @@ static inline int obd_cleanup(struct obd_device *obd) obd->obd_lu_dev = NULL; } } - if (!obd->obd_type->typ_dt_ops->o_cleanup) - RETURN(0); + OBD_CHECK_DT_OP(obd, cleanup, 0); + OBD_COUNTER_INCREMENT(obd, cleanup); rc = OBP(obd, cleanup)(obd); RETURN(rc); @@ -641,16 +685,18 @@ static inline void obd_cleanup_client_import(struct obd_device *obd) EXIT; } -static inline int obd_process_config(struct obd_device *obd, int datalen, - void *data) +static inline int +obd_process_config(struct obd_device *obd, int datalen, void *data) { int rc; - struct lu_device_type *ldt = obd->obd_type->typ_lu; - struct lu_device *d = obd->obd_lu_dev; + DECLARE_LU_VARS(ldt, d); + ENTRY; - ENTRY; + OBD_CHECK_DEV(obd); obd->obd_process_conf = 1; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { struct lu_env env; @@ -660,14 +706,10 @@ static inline int obd_process_config(struct obd_device *obd, int datalen, lu_env_fini(&env); } } else { - if (!obd->obd_type->typ_dt_ops->o_process_config) { - CERROR("%s: no %s operation\n", - obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); rc = OBP(obd, process_config)(obd, datalen, data); } - + OBD_COUNTER_INCREMENT(obd, process_config); obd->obd_process_conf = 0; RETURN(rc); @@ 
-679,15 +721,8 @@ static inline int obd_create(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_create) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + EXP_CHECK_DT_OP(exp, create); + EXP_COUNTER_INCREMENT(exp, create); rc = OBP(exp->exp_obd, create)(env, exp, obdo); RETURN(rc); @@ -699,15 +734,8 @@ static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_destroy) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + EXP_CHECK_DT_OP(exp, destroy); + EXP_COUNTER_INCREMENT(exp, destroy); rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); RETURN(rc); @@ -719,16 +747,8 @@ static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_getattr) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } - + EXP_CHECK_DT_OP(exp, getattr); + EXP_COUNTER_INCREMENT(exp, getattr); rc = OBP(exp->exp_obd, getattr)(env, exp, oa); RETURN(rc); @@ -740,16 +760,8 @@ static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_setattr) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } - + EXP_CHECK_DT_OP(exp, setattr); + EXP_COUNTER_INCREMENT(exp, setattr); rc = OBP(exp->exp_obd, setattr)(env, exp, oa); RETURN(rc); @@ -763,10 +775,8 @@ static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_add_conn) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, add_conn); rc = OBP(obd, add_conn)(imp, uuid, priority); RETURN(rc); @@ -779,10 +789,8 @@ static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_del_conn) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, del_conn); rc = OBP(obd, del_conn)(imp, uuid); RETURN(rc); @@ -793,9 +801,8 @@ static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) struct obd_uuid *uuid; ENTRY; - if (!exp->exp_obd->obd_type || - !exp->exp_obd->obd_type->typ_dt_ops->o_get_uuid) - RETURN(NULL); + OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); + EXP_COUNTER_INCREMENT(exp, get_uuid); uuid = OBP(exp->exp_obd, get_uuid)(exp); RETURN(uuid); @@ -818,10 +825,8 @@ static inline int obd_connect(const struct lu_env *env, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_connect) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, connect); rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); /* check that only subset is granted */ @@ -844,8 +849,8 @@ static inline int obd_reconnect(const struct lu_env 
*env, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_reconnect) - RETURN(0); + OBD_CHECK_DT_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); /* check that only subset is granted */ @@ -858,15 +863,9 @@ static inline int obd_disconnect(struct obd_export *exp) { int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_disconnect) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + + EXP_CHECK_DT_OP(exp, disconnect); + EXP_COUNTER_INCREMENT(exp, disconnect); rc = OBP(exp->exp_obd, disconnect)(exp); RETURN(rc); @@ -878,8 +877,8 @@ static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, int rc; ENTRY; - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_init) - RETURN(0); + OBD_CHECK_DT_OP(obd, fid_init, 0); + OBD_COUNTER_INCREMENT(obd, fid_init); rc = OBP(obd, fid_init)(obd, exp, type); RETURN(rc); @@ -889,8 +888,9 @@ static inline int obd_fid_fini(struct obd_device *obd) { int rc; ENTRY; - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_fini) - RETURN(0); + + OBD_CHECK_DT_OP(obd, fid_fini, 0); + OBD_COUNTER_INCREMENT(obd, fid_fini); rc = OBP(obd, fid_fini)(obd); RETURN(rc); @@ -903,29 +903,33 @@ static inline int obd_fid_alloc(const struct lu_env *env, { int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_fid_alloc) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + + EXP_CHECK_DT_OP(exp, fid_alloc); + EXP_COUNTER_INCREMENT(exp, fid_alloc); rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); RETURN(rc); } +static inline int obd_ping(const struct lu_env *env, struct obd_export *exp) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, ping, 0); + EXP_COUNTER_INCREMENT(exp, ping); + + rc = OBP(exp->exp_obd, ping)(env, exp); + RETURN(rc); +} + static inline int obd_pool_new(struct obd_device *obd, char *poolname) { int rc; ENTRY; - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_new) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); rc = OBP(obd, pool_new)(obd, poolname); RETURN(rc); @@ -935,166 +939,173 @@ static inline int obd_pool_del(struct obd_device *obd, char *poolname) { int rc; ENTRY; - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_del) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); rc = OBP(obd, pool_del)(obd, poolname); RETURN(rc); } -static inline int obd_pool_add(struct obd_device *obd, char *poolname, - char *ostname) +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) { int rc; ENTRY; - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_add) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); rc = OBP(obd, pool_add)(obd, poolname, ostname); RETURN(rc); } -static inline int obd_pool_rem(struct obd_device *obd, char *poolname, - char *ostname) +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) { - int rc; + int rc; + ENTRY; - ENTRY; - if 
(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_rem) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); - rc = OBP(obd, pool_rem)(obd, poolname, ostname); - RETURN(rc); + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline void obd_getref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, getref)) { + OBD_COUNTER_INCREMENT(obd, getref); + OBP(obd, getref)(obd); + } + EXIT; +} + +static inline void obd_putref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, putref)) { + OBD_COUNTER_INCREMENT(obd, putref); + OBP(obd, putref)(obd); + } + EXIT; } static inline int obd_init_export(struct obd_export *exp) { - int rc = 0; + int rc = 0; - ENTRY; - if (exp->exp_obd != NULL && exp->exp_obd->obd_type && - OBP((exp)->exp_obd, init_export)) - rc = OBP(exp->exp_obd, init_export)(exp); - RETURN(rc); + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); } static inline int obd_destroy_export(struct obd_export *exp) { - ENTRY; - if (exp->exp_obd != NULL && exp->exp_obd->obd_type && - OBP(exp->exp_obd, destroy_export)) - OBP(exp->exp_obd, destroy_export)(exp); - RETURN(0); + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); } -/* @max_age is the oldest time in seconds that we accept using a cached data. +/* @max_age is the oldest time in jiffies that we accept using a cached data. * If the cache is older than @max_age we will get a new value from the - * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. - */ + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. 
*/ static inline int obd_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, - time64_t max_age, - struct ptlrpc_request_set *rqset) + struct obd_info *oinfo, + __u64 max_age, + struct ptlrpc_request_set *rqset) { - struct obd_device *obd; - int rc = 0; - - ENTRY; + int rc = 0; + struct obd_device *obd; + ENTRY; - if (exp == NULL || exp->exp_obd == NULL) - RETURN(-EINVAL); + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); - obd = exp->exp_obd; - if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs) { - rc = -EOPNOTSUPP; - CERROR("%s: no statfs operation: rc = %d\n", obd->obd_name, rc); - RETURN(rc); - } + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); - CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", - obd->obd_name, obd->obd_osfs_age, max_age); - if (obd->obd_osfs_age < max_age) { - rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); - } else { - CDEBUG(D_SUPER, - "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, - obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", + obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); spin_lock(&obd->obd_osfs_lock); memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); spin_unlock(&obd->obd_osfs_lock); - oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; - if (oinfo->oi_cb_up) - oinfo->oi_cb_up(oinfo, 0); - } - RETURN(rc); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + RETURN(rc); } -/* @max_age is the oldest time in seconds that we accept using a cached data. - * If the cache is older than @max_age we will get a new value from the - * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. 
- */ -static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, time64_t max_age, - __u32 flags) +static inline int obd_statfs_rqset(struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) { - struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { + .oi_osfs = osfs, + .oi_flags = flags, + }; int rc = 0; ENTRY; - if (unlikely(obd == NULL)) - RETURN(-EINVAL); - OBD_CHECK_DEV_ACTIVE(obd); + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); - if (unlikely(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs)) { - CERROR("%s: no %s operation\n", obd->obd_name, __func__); - RETURN(-EOPNOTSUPP); - } + rc = obd_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); - CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", - obd->obd_name, obd->obd_osfs_age, max_age); - /* ignore cache if aggregated isn't expected */ - if (obd->obd_osfs_age < max_age || - ((obd->obd_osfs.os_state & OS_STATE_SUM) && - !(flags & OBD_STATFS_SUM))) { - /* the RPC will block anyway, so avoid sending many at once */ - rc = mutex_lock_interruptible(&obd->obd_dev_mutex); - if (rc) - RETURN(rc); - if (obd->obd_osfs_age < max_age || - ((obd->obd_osfs.os_state & OS_STATE_SUM) && - !(flags & OBD_STATFS_SUM))) { - rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); - } else { - mutex_unlock(&obd->obd_dev_mutex); - GOTO(cached, rc = 0); - } - if (rc == 0) { - CDEBUG(D_SUPER, - "%s: update %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - osfs->os_bavail, osfs->os_blocks, - osfs->os_ffree, osfs->os_files); + ptlrpc_set_destroy(set); + + RETURN(rc); +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. 
*/ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + int rc = 0; + struct obd_device *obd = exp->exp_obd; + ENTRY; + + if (obd == NULL) + RETURN(-EINVAL); + + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_CHECK_DEV_ACTIVE(obd); + OBD_COUNTER_INCREMENT(obd, statfs); + CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", + obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + if (rc == 0) { spin_lock(&obd->obd_osfs_lock); memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); - obd->obd_osfs_age = ktime_get_seconds(); + obd->obd_osfs_age = cfs_time_current_64(); spin_unlock(&obd->obd_osfs_lock); } - mutex_unlock(&obd->obd_dev_mutex); } else { -cached: - CDEBUG(D_SUPER, - "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu" + " objects %llu/%llu\n", obd->obd_name, &obd->obd_osfs, obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); @@ -1114,17 +1125,8 @@ static inline int obd_preprw(const struct lu_env *env, int cmd, int rc; ENTRY; - - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_preprw) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } - + EXP_CHECK_DT_OP(exp, preprw); + EXP_COUNTER_INCREMENT(exp, preprw); rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, pages, local); @@ -1135,23 +1137,14 @@ static inline int obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, struct niobuf_remote *rnb, int pages, - struct niobuf_local *local, const int orig_rc) + struct niobuf_local *local, int rc) { - int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_commitrw) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } - + EXP_CHECK_DT_OP(exp, commitrw); + EXP_COUNTER_INCREMENT(exp, commitrw); rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, - rnb, pages, local, orig_rc); + rnb, pages, local, rc); RETURN(rc); } @@ -1162,15 +1155,8 @@ static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_iocontrol) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + EXP_CHECK_DT_OP(exp, iocontrol); + EXP_COUNTER_INCREMENT(exp, iocontrol); rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); RETURN(rc); @@ -1186,10 +1172,10 @@ static inline void obd_import_event(struct obd_device *obd, EXIT; return; } - - if (obd->obd_set_up && OBP(obd, import_event)) + if (obd->obd_set_up && OBP(obd, import_event)) { + OBD_COUNTER_INCREMENT(obd, import_event); OBP(obd, import_event)(obd, imp, event); - + } EXIT; } @@ -1199,10 +1185,7 @@ static inline int obd_notify(struct obd_device *obd, { int rc; ENTRY; - - rc = obd_check_dev(obd); - if (rc) - return rc; + OBD_CHECK_DEV(obd); if (!obd->obd_set_up) { CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); @@ -1214,6 +1197,7 @@ static inline int obd_notify(struct obd_device *obd, RETURN(-ENOSYS); } + OBD_COUNTER_INCREMENT(obd, notify); rc = OBP(obd, notify)(obd, watched, ev); RETURN(rc); @@ -1246,58 +1230,45 @@ static 
inline int obd_quotactl(struct obd_export *exp, int rc; ENTRY; - rc = exp_check_ops(exp); - if (rc) - RETURN(rc); - - if (!exp->exp_obd->obd_type->typ_dt_ops->o_quotactl) { - CERROR("%s: no %s operation\n", - (exp)->exp_obd->obd_name, __func__); - RETURN(-ENOTSUPP); - } + EXP_CHECK_DT_OP(exp, quotactl); + EXP_COUNTER_INCREMENT(exp, quotactl); rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); RETURN(rc); } static inline int obd_health_check(const struct lu_env *env, - struct obd_device *obd) -{ - /* returns: 0 on healthy - * >0 on unhealthy + reason code/flag - * however the only suppored reason == 1 right now - * We'll need to define some better reasons - * or flags in the future. - * <0 on error - */ - int rc; - - ENTRY; + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + ENTRY; - /* NULL method is normal here */ - if (obd == NULL || !obd->obd_type) { - CERROR("cleaned up obd\n"); - RETURN(-EOPNOTSUPP); - } - if (!obd->obd_set_up || obd->obd_stopping) - RETURN(0); - if (!OBP(obd, health_check)) - RETURN(0); + /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); - rc = OBP(obd, health_check)(env, obd); - RETURN(rc); + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); } static inline int obd_register_observer(struct obd_device *obd, struct obd_device *observer) { - int rc; ENTRY; - - rc = obd_check_dev(obd); - if (rc) - return rc; - + OBD_CHECK_DEV(obd); down_write(&obd->obd_observer_link_sem); if (obd->obd_observer && observer) { up_write(&obd->obd_observer_link_sem); @@ -1309,79 +1280,51 @@ static inline int obd_register_observer(struct obd_device *obd, } /* metadata helpers */ -enum mps_stat_idx { - LPROC_MD_CLOSE, - LPROC_MD_CREATE, - LPROC_MD_ENQUEUE, - LPROC_MD_GETATTR, - LPROC_MD_INTENT_LOCK, - LPROC_MD_LINK, - LPROC_MD_RENAME, - LPROC_MD_SETATTR, - LPROC_MD_FSYNC, - LPROC_MD_READ_PAGE, - LPROC_MD_UNLINK, - LPROC_MD_SETXATTR, - LPROC_MD_GETXATTR, - LPROC_MD_INTENT_GETATTR_ASYNC, - LPROC_MD_REVALIDATE_LOCK, - LPROC_MD_LAST_OPC, -}; - static inline int md_get_root(struct obd_export *exp, const char *fileset, struct lu_fid *fid) { int rc; - rc = exp_check_ops(exp); - if (rc) - return rc; + ENTRY; + EXP_CHECK_MD_OP(exp, get_root); + EXP_MD_COUNTER_INCREMENT(exp, get_root); + rc = MDP(exp->exp_obd, get_root)(exp, fileset, fid); - return MDP(exp->exp_obd, get_root)(exp, fileset, fid); + RETURN(rc); } -static inline int md_getattr(struct obd_export *exp, - struct md_op_data *op_data, - struct ptlrpc_request **request) +static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_GETATTR); - - return MDP(exp->exp_obd, getattr)(exp, op_data, request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr); + EXP_MD_COUNTER_INCREMENT(exp, getattr); + rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); + RETURN(rc); } static inline int md_null_inode(struct obd_export *exp, const struct lu_fid *fid) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, 
null_inode)(exp, fid); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, null_inode); + EXP_MD_COUNTER_INCREMENT(exp, null_inode); + rc = MDP(exp->exp_obd, null_inode)(exp, fid); + RETURN(rc); } static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, struct md_open_data *mod, struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_CLOSE); - - return MDP(exp->exp_obd, close)(exp, op_data, mod, request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, close); + EXP_MD_COUNTER_INCREMENT(exp, close); + rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); + RETURN(rc); } static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1389,18 +1332,13 @@ static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_CREATE); - - return MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, - uid, gid, cap_effective, rdev, - request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, create); + EXP_MD_COUNTER_INCREMENT(exp, create); + rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, request); + RETURN(rc); } static inline int md_enqueue(struct obd_export *exp, @@ -1411,29 +1349,24 @@ static inline int md_enqueue(struct obd_export *exp, __u64 extra_lock_flags) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_ENQUEUE); - - return MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, - extra_lock_flags); + ENTRY; + EXP_CHECK_MD_OP(exp, enqueue); + EXP_MD_COUNTER_INCREMENT(exp, enqueue); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); + RETURN(rc); } static inline int md_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr_name); + EXP_MD_COUNTER_INCREMENT(exp, getattr_name); + rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + RETURN(rc); } static inline int md_intent_lock(struct obd_export *exp, @@ -1444,49 +1377,36 @@ static inline int md_intent_lock(struct obd_export *exp, __u64 extra_lock_flags) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_INTENT_LOCK); - - return MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, - cb_blocking, extra_lock_flags); + ENTRY; + EXP_CHECK_MD_OP(exp, intent_lock); + EXP_MD_COUNTER_INCREMENT(exp, intent_lock); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + RETURN(rc); } static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_LINK); - - return MDP(exp->exp_obd, link)(exp, op_data, request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, link); + EXP_MD_COUNTER_INCREMENT(exp, link); + rc = MDP(exp->exp_obd, link)(exp, op_data, request); + RETURN(rc); } static inline int 
md_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old_name, size_t oldlen, - const char *new_name, size_t newlen, - struct ptlrpc_request **request) + const char *old, size_t oldlen, const char *new, + size_t newlen, struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_RENAME); - - return MDP(exp->exp_obd, rename)(exp, op_data, old_name, oldlen, - new_name, newlen, request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, rename); + EXP_MD_COUNTER_INCREMENT(exp, rename); + rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, + newlen, request); + RETURN(rc); } static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, @@ -1494,15 +1414,11 @@ static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_SETATTR); - - return MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); + ENTRY; + EXP_CHECK_MD_OP(exp, setattr); + EXP_MD_COUNTER_INCREMENT(exp, setattr); + rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); + RETURN(rc); } static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, @@ -1510,27 +1426,12 @@ static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, { int rc; - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_FSYNC); - - return MDP(exp->exp_obd, fsync)(exp, fid, request); -} - -/* FLR: resync mirrored files. */ -static inline int md_file_resync(struct obd_export *exp, - struct md_op_data *data) -{ - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; + ENTRY; + EXP_CHECK_MD_OP(exp, fsync); + EXP_MD_COUNTER_INCREMENT(exp, fsync); + rc = MDP(exp->exp_obd, fsync)(exp, fid, request); - return MDP(exp->exp_obd, file_resync)(exp, data); + RETURN(rc); } static inline int md_read_page(struct obd_export *exp, @@ -1540,31 +1441,23 @@ static inline int md_read_page(struct obd_export *exp, struct page **ppage) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_READ_PAGE); - - return MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, - ppage); + ENTRY; + EXP_CHECK_MD_OP(exp, read_page); + EXP_MD_COUNTER_INCREMENT(exp, read_page); + rc = MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, + ppage); + RETURN(rc); } static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_UNLINK); - - return MDP(exp->exp_obd, unlink)(exp, op_data, request); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unlink); + EXP_MD_COUNTER_INCREMENT(exp, unlink); + rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); + RETURN(rc); } static inline int md_get_lustre_md(struct obd_export *exp, @@ -1573,25 +1466,19 @@ static inline int md_get_lustre_md(struct obd_export *exp, struct obd_export *md_exp, struct lustre_md *md) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); + ENTRY; + EXP_CHECK_MD_OP(exp, get_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); + RETURN(MDP(exp->exp_obd, 
get_lustre_md)(exp, req, dt_exp, md_exp, md)); } static inline int md_free_lustre_md(struct obd_export *exp, struct lustre_md *md) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, free_lustre_md)(exp, md); + ENTRY; + EXP_CHECK_MD_OP(exp, free_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); + RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md)); } static inline int md_merge_attr(struct obd_export *exp, @@ -1599,88 +1486,67 @@ static inline int md_merge_attr(struct obd_export *exp, struct cl_attr *attr, ldlm_blocking_callback cb) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); + ENTRY; + EXP_CHECK_MD_OP(exp, merge_attr); + EXP_MD_COUNTER_INCREMENT(exp, merge_attr); + RETURN(MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb)); } static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - const void *value, size_t value_size, - unsigned int xattr_flags, u32 suppgid, - struct ptlrpc_request **req) + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, __u32 suppgid, + struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_SETXATTR); - - return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, - value, value_size, xattr_flags, - suppgid, req); + ENTRY; + EXP_CHECK_MD_OP(exp, setxattr); + EXP_MD_COUNTER_INCREMENT(exp, setxattr); + RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, valid, name, input, + input_size, output_size, flags, + suppgid, request)); } static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - size_t buf_size, struct ptlrpc_request **req) + u64 valid, const char *name, + const char *input, int input_size, + int output_size, int flags, + struct ptlrpc_request **request) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_GETXATTR); - - return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, - buf_size, req); + ENTRY; + EXP_CHECK_MD_OP(exp, getxattr); + EXP_MD_COUNTER_INCREMENT(exp, getxattr); + RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, valid, name, input, + input_size, output_size, flags, + request)); } static inline int md_set_open_replay_data(struct obd_export *exp, struct obd_client_handle *och, struct lookup_intent *it) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); + ENTRY; + EXP_CHECK_MD_OP(exp, set_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); + RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, it)); } static inline int md_clear_open_replay_data(struct obd_export *exp, struct obd_client_handle *och) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); + ENTRY; + EXP_CHECK_MD_OP(exp, clear_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); + RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och)); } static inline int md_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, void *data, __u64 *bits) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); + ENTRY; + 
EXP_CHECK_MD_OP(exp, set_lock_data); + EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); + RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits)); } static inline @@ -1689,13 +1555,14 @@ int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, enum ldlm_cancel_flags cancel_flags, void *opaque) { int rc; + ENTRY; - rc = exp_check_ops(exp); - if (rc) - return rc; + EXP_CHECK_MD_OP(exp, cancel_unused); + EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); - return MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, - cancel_flags, opaque); + rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); + RETURN(rc); } static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, @@ -1705,57 +1572,43 @@ static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, enum ldlm_mode mode, struct lustre_handle *lockh) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, - policy, mode, lockh); + ENTRY; + EXP_CHECK_MD_OP(exp, lock_match); + EXP_MD_COUNTER_INCREMENT(exp, lock_match); + RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh)); } static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, __u32 def_ea_size) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size); + ENTRY; + EXP_CHECK_MD_OP(exp, init_ea_size); + EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); + RETURN(MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size)); } static inline int md_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_INTENT_GETATTR_ASYNC); - - return MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); + ENTRY; + EXP_CHECK_MD_OP(exp, intent_getattr_async); + EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); + rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); + RETURN(rc); } static inline int md_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, struct lu_fid *fid, __u64 *bits) { - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_REVALIDATE_LOCK); - - return MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, revalidate_lock); + EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); + rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + RETURN(rc); } static inline int md_get_fid_from_lsm(struct obd_export *exp, @@ -1764,15 +1617,14 @@ static inline int md_get_fid_from_lsm(struct obd_export *exp, struct lu_fid *fid) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, - fid); + ENTRY; + EXP_CHECK_MD_OP(exp, get_fid_from_lsm); + EXP_MD_COUNTER_INCREMENT(exp, get_fid_from_lsm); + rc = MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, fid); + RETURN(rc); } + /* Unpack an MD struct from disk to in-memory format. * Returns +ve size of unpacked MD (0 for free), or -ve error. 
* @@ -1784,24 +1636,11 @@ static inline int md_unpackmd(struct obd_export *exp, const union lmv_mds_md *lmm, size_t lmm_size) { int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); -} - -static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, - int *rcs, struct ptlrpc_request_set *set) -{ - int rc; - - rc = exp_check_ops(exp); - if (rc) - return rc; - - return MDP(exp->exp_obd, rmfid)(exp, fa, rcs, set); + ENTRY; + EXP_CHECK_MD_OP(exp, unpackmd); + EXP_MD_COUNTER_INCREMENT(exp, unpackmd); + rc = MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); + RETURN(rc); } /* OBD Metadata Support */ @@ -1809,6 +1648,20 @@ static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, extern int obd_init_caches(void); extern void obd_cleanup_caches(void); +/* support routines */ +extern struct kmem_cache *obdo_cachep; + +#define OBDO_ALLOC(ptr) \ +do { \ + OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \ +} while(0) + +#define OBDO_FREE(ptr) \ +do { \ + OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ +} while(0) + + typedef int (*register_lwp_cb)(void *data); struct lwp_register_item { @@ -1839,14 +1692,13 @@ int lustre_register_fs(void); int lustre_unregister_fs(void); int lustre_check_exclusion(struct super_block *sb, char *svname); +/* sysctl.c */ +extern int obd_sysctl_init(void); +extern void obd_sysctl_clean(void); + +/* uuid.c */ typedef __u8 class_uuid_t[16]; -static inline void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) -{ - snprintf(out->uuid, sizeof(out->uuid), "%02x%02x%02x%02x-%02x%02x-" - "%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - uu[14], uu[15], uu[12], uu[13], uu[10], uu[11], uu[8], uu[9], - uu[6], uu[7], uu[4], uu[5], uu[2], uu[3], uu[0], uu[1]); -} +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); /* lustre_peer.c */ int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); @@ -1855,7 +1707,7 @@ int class_del_uuid (const char *uuid); int class_check_uuid(struct obd_uuid *uuid, __u64 nid); /* class_obd.c */ -extern char obd_jobid_name[]; +extern char obd_jobid_node[]; /* prng.c */ #define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) @@ -1881,4 +1733,5 @@ extern struct miscdevice obd_psdev; int obd_ioctl_getdata(char **buf, int *len, void __user *arg); int class_procfs_init(void); int class_procfs_clean(void); + #endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h index 356585d91932b..c22e08fe8cdb2 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_support.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,8 +33,11 @@ #ifndef _OBD_SUPPORT #define _OBD_SUPPORT +#ifndef __KERNEL__ +# error Userspace should not include obd_support.h. 
+#endif /* !__KERNEL__ */ + #include -#include #include #include #include @@ -53,7 +56,6 @@ enum { extern unsigned int obd_debug_peer_on_timeout; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_dump_on_eviction; -extern unsigned int obd_lbug_on_eviction; /* obd_timeout should only be used for recovery, not for networking / disk / timings affected by load (use Adaptive Timeouts) */ extern unsigned int obd_timeout; /* seconds */ @@ -68,6 +70,7 @@ extern int at_early_margin; extern int at_extra; extern unsigned long obd_max_dirty_pages; extern atomic_long_t obd_dirty_pages; +extern atomic_long_t obd_dirty_transit_pages; extern char obd_jobid_var[]; /* Some hash init argument constants */ @@ -179,9 +182,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_GET_ROOT_NET 0x11b #define OBD_FAIL_MDS_GET_ROOT_PACK 0x11c #define OBD_FAIL_MDS_STATFS_PACK 0x11d -#define OBD_FAIL_MDS_STATFS_SUM_PACK 0x11d #define OBD_FAIL_MDS_STATFS_NET 0x11e -#define OBD_FAIL_MDS_STATFS_SUM_NET 0x11e #define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f #define OBD_FAIL_MDS_PIN_NET 0x120 #define OBD_FAIL_MDS_UNPIN_NET 0x121 @@ -244,16 +245,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b #define OBD_FAIL_MDS_FLD_LOOKUP 0x15c -#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d #define OBD_FAIL_MDS_INTENT_DELAY 0x160 #define OBD_FAIL_MDS_XATTR_REP 0x161 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 #define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 #define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 -#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 -#define OBD_FAIL_MDS_RMFID_NET 0x166 -#define OBD_FAIL_MDS_REINT_OPEN 0x169 -#define OBD_FAIL_MDS_REINT_OPEN2 0x16a /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 @@ -269,8 +265,6 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 #define OBD_FAIL_MDS_GET_INFO_NET 0x186 #define OBD_FAIL_MDS_DQACQ_NET 0x187 -#define OBD_FAIL_MDS_STRIPE_CREATE 0x188 -#define OBD_FAIL_MDS_STRIPE_FID 0x189 /* OI scrub */ #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 @@ -281,12 +275,6 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 #define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 #define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 -#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 -#define OBD_FAIL_OSD_INDEX_CRASH 0x199 - -#define OBD_FAIL_OSD_TXN_START 0x19a - -#define OBD_FAIL_OSD_DUPLICATE_MAP 0x19b #define OBD_FAIL_OFD_SET_OID 0x1e0 @@ -341,14 +329,6 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_PAUSE_PUNCH 0x236 #define OBD_FAIL_OST_LADVISE_PAUSE 0x237 #define OBD_FAIL_OST_FAKE_RW 0x238 -#define OBD_FAIL_OST_LIST_ASSERT 0x239 -#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 -#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 -#define OBD_FAIL_OST_STATFS_DELAY 0x242 -#define OBD_FAIL_OST_INTEGRITY_FAULT 0x243 -#define OBD_FAIL_OST_INTEGRITY_CMP 0x244 -#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 -#define OBD_FAIL_OST_2BIG_NIOBUF 0x248 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 @@ -391,11 +371,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 -#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a #define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b -#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 @@ -421,7 +399,6 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSC_DELAY_SETTIME 0x412 #define 
OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 #define OBD_FAIL_OSC_DELAY_IO 0x414 -#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415 #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 @@ -450,21 +427,19 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 #define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 -#define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 #define OBD_FAIL_OBD_PING_NET 0x600 -/* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ +#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 #define OBD_FAIL_OBD_LOGD_NET 0x602 /* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ #define OBD_FAIL_OBD_DQACQ 0x604 #define OBD_FAIL_OBD_LLOG_SETUP 0x605 -/* OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 obsolete since 1.5 */ +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 #define OBD_FAIL_OBD_IDX_READ_NET 0x607 #define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 #define OBD_FAIL_OBD_NO_LRU 0x609 #define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a -#define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 @@ -487,19 +462,14 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_CLIENT_DEL 0x718 #define OBD_FAIL_TGT_SLUGGISH_NET 0x719 #define OBD_FAIL_TGT_RCVD_EIO 0x720 -#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 -#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 -#define OBD_FAIL_TGT_NO_GRANT 0x725 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 #define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 -#define OBD_FAIL_MDC_RPCS_SEM 0x804 /* deprecated */ +#define OBD_FAIL_MDC_RPCS_SEM 0x804 #define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 #define OBD_FAIL_MDC_CLOSE 0x806 -#define OBD_FAIL_MDC_MERGE 0x807 -#define OBD_FAIL_MDC_GLIMPSE_DDOS 0x808 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 @@ -531,7 +501,6 @@ extern char obd_jobid_var[]; #define OBD_FAIL_FLD 0x1100 #define OBD_FAIL_FLD_QUERY_NET 0x1101 #define OBD_FAIL_FLD_READ_NET 0x1102 -#define OBD_FAIL_FLD_QUERY_REQ 0x1103 #define OBD_FAIL_SEC_CTX 0x1200 #define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 @@ -540,25 +509,18 @@ extern char obd_jobid_var[]; #define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 #define OBD_FAIL_LLOG 0x1300 -/* was OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 until 2.4 */ +#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 -/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 until 2.11 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 -/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ -/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ -/* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 +#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 #define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 #define OBD_FAIL_SEQ_ALLOC 0x1311 #define OBD_FAIL_CAT_RECORDS 0x1312 -#define OBD_FAIL_CAT_FREE_RECORDS 0x1313 -#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 -#define CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315 -#define OBD_FAIL_FORCE_GC_THREAD 0x1316 -#define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 -#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 -#define OBD_FAIL_CATLIST 0x131b #define OBD_FAIL_LLITE 0x1400 #define 
OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 @@ -574,10 +536,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a #define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b #define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c +#define OBD_FAIL_LLITE_PTASK_IO_FAIL 0x140d #define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e #define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f -#define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 -#define OBD_FAIL_LLITE_SHORT_COMMIT 0x1415 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 @@ -626,11 +587,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c -#define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d +#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f -#define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630 -#define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631 #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 @@ -644,17 +603,14 @@ extern char obd_jobid_var[]; #define OBD_FAIL_INVALIDATE_UPDATE 0x1705 /* MIGRATE */ +#define OBD_FAIL_MIGRATE_NET_REP 0x1800 #define OBD_FAIL_MIGRATE_ENTRIES 0x1801 +#define OBD_FAIL_MIGRATE_LINKEA 0x1802 +#define OBD_FAIL_MIGRATE_DELAY 0x1803 /* LMV */ #define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 -/* FLR */ -#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00 -#define OBD_FAIL_FLR_LV_DELAY 0x1A01 -#define OBD_FAIL_FLR_LV_INC 0x1A02 -#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03 - /* DT */ #define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 #define OBD_FAIL_DT_ATTR_GET 0x2001 @@ -686,19 +642,14 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 #define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 #define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 -#define OBD_FAIL_OSP_RPCS_SEM 0x2104 -#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 -#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 -/* barrier */ + /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 #define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 #define OBD_FAIL_BARRIER_DELAY 0x2202 #define OBD_FAIL_BARRIER_FAILURE 0x2203 -#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 - /* Assign references to moved code to reduce code changes */ #define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) #define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) @@ -780,13 +731,11 @@ static inline void obd_memory_sub(long size) #define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ do { \ - if (cptab) \ - ptr = cfs_cpt_malloc((cptab), (cpt), (size), \ - (flags) | __GFP_ZERO | __GFP_NOWARN); \ - if (!(cptab) || unlikely(!(ptr))) /* retry without CPT if failure */ \ - ptr = kmalloc(size, (flags) | __GFP_ZERO); \ - if (likely((ptr) != NULL)) \ - OBD_ALLOC_POST((ptr), (size), "kmalloced"); \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, (flags) | __GFP_ZERO) : \ + cfs_cpt_malloc(cptab, cpt, size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST(ptr, size, "kmalloced"); \ } while (0) #define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ @@ -813,7 +762,7 @@ do { \ #define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ do { \ (ptr) = cptab == NULL ? 
\ - __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO): \ cfs_cpt_vzalloc(cptab, cpt, size); \ if (unlikely((ptr) == NULL)) { \ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ @@ -874,7 +823,7 @@ do { \ do { \ if (is_vmalloc_addr(ptr)) { \ OBD_FREE_PRE(ptr, size, "vfreed"); \ - libcfs_vfree_atomic(ptr); \ + libcfs_vfree_atomic(ptr); \ POISON_PTR(ptr); \ } else { \ OBD_FREE(ptr, size); \ @@ -962,29 +911,4 @@ static inline int lustre_to_lma_flags(__u32 la_flags) return (la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0; } -/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values - * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire - * protocol equivalents of LDISKFS_*_FL values stored on disk, while - * the S_* flags are kernel-internal values that change between kernel - * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. - * See b=16526 for a full history. - */ -static inline int ll_ext_to_inode_flags(int flags) -{ - return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | - ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | - ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | - ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | - ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); -} - -static inline int ll_inode_to_ext_flags(int iflags) -{ - return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | - ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | - ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | - ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | - ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); -} - #endif diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h index 8c88de86005ea..c381f77f0045e 100644 --- a/drivers/staging/lustrefsx/lustre/include/obj_update.h +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -31,7 +31,7 @@ #ifndef _OBJ_UPDATE_H_ #define _OBJ_UPDATE_H_ -#include +#include static inline size_t object_update_param_size(const struct object_update_param *param) diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h index 374d1932f0bdf..616ee3a78e68b 100644 --- a/drivers/staging/lustrefsx/lustre/include/seq_range.h +++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h @@ -34,7 +34,7 @@ #ifndef _SEQ_RANGE_H_ #define _SEQ_RANGE_H_ -#include +#include /** * computes the sequence range type \a range diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h deleted file mode 100644 index 38084241d8998..0000000000000 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License version 2 for more details. A copy is - * included in the COPYING file that accompanied this code. 
- - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * GPL HEADER END - */ -/* - * Copyright (c) 2017, Intel Corporation. - * - * lustre/include/lustre/lustre_barrier_user.h - * - * Lustre write barrier (on MDT) userspace interfaces. - * - * Author: Fan, Yong - */ -#ifndef _LUSTRE_BARRIER_USER_H -# define _LUSTRE_BARRIER_USER_H - -#include -#include - -#define BARRIER_VERSION_V1 1 -#define BARRIER_TIMEOUT_DEFAULT 30 - -enum barrier_commands { - BC_FREEZE = 1, - BC_THAW = 2, - BC_STAT = 3, - BC_RESCAN = 4, -}; - -enum barrier_status { - BS_INIT = 0, - BS_FREEZING_P1 = 1, - BS_FREEZING_P2 = 2, - BS_FROZEN = 3, - BS_THAWING = 4, - BS_THAWED = 5, - BS_FAILED = 6, - BS_EXPIRED = 7, - BS_RESCAN = 8, -}; - -struct barrier_ctl { - __u32 bc_version; - __u32 bc_cmd; - union { - __s32 bc_timeout; - __u32 bc_total; - }; - union { - __u32 bc_status; - __u32 bc_absence; - }; - char bc_name[12]; - __u32 bc_padding; -}; - -#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h deleted file mode 100644 index 8cdb05dedbd8c..0000000000000 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * FIEMAP data structures and flags. This header file will be used until - * fiemap.h is available in the upstream kernel. 
- * - * Author: Kalpak Shah - * Author: Andreas Dilger - */ - -#ifndef _LUSTRE_FIEMAP_H -#define _LUSTRE_FIEMAP_H - -#include -#include -#include - -/* XXX: We use fiemap_extent::fe_reserved[0] */ -#define fe_device fe_reserved[0] - -static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count) -{ - return sizeof(struct fiemap) + extent_count * - sizeof(struct fiemap_extent); -} - -static inline unsigned int fiemap_size_to_count(__kernel_size_t array_size) -{ - return (array_size - sizeof(struct fiemap)) / - sizeof(struct fiemap_extent); -} - -#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ - -#ifdef FIEMAP_FLAGS_COMPAT -#undef FIEMAP_FLAGS_COMPAT -#endif - -/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ -#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ -#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. - * Sets NO_DIRECT flag */ - -#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h deleted file mode 100644 index 68c8d3a1009c4..0000000000000 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h +++ /dev/null @@ -1,238 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License version 2 for more details. A copy is - * included in the COPYING file that accompanied this code. - - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * GPL HEADER END - */ -/* - * Copyright (c) 2012, 2017, Intel Corporation. - */ -/* - * lustre/include/lustre/lustre_lfsck_user.h - * - * Lustre LFSCK userspace interfaces. - * - * Author: Fan, Yong - */ - -#ifndef _LUSTRE_LFSCK_USER_H -# define _LUSTRE_LFSCK_USER_H - -#include -#include - -/** - * state machine: - * - * LS_INIT - * | - * (lfsck|start) - * | - * v - * LS_SCANNING_PHASE1 - * | ^ - * | : - * | (lfsck:restart) - * | : - * v : - * ----------------------------------------------------------------- - * | |^ |^ |^ |^ |^ - * | |: |: |: |: |: - * v v: v: v: v: v: - * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL - * (CO_) (CO_) (CO_) - * | ^ ^: ^: ^: ^: ^: - * | : |: |: |: |: |: - * | (lfsck:restart) |: |: |: |: |: - * v : |v |v |v |v |v - * ----------------------------------------------------------------- - * | - * v - * LS_COMPLETED - */ -enum lfsck_status { - /* The lfsck file is new created, for new MDT, upgrading from old disk, - * or re-creating the lfsck file manually. */ - LS_INIT = 0, - - /* The first-step system scanning. The checked items during the phase1 - * scanning depends on the LFSCK type. */ - LS_SCANNING_PHASE1 = 1, - - /* The second-step system scanning. The checked items during the phase2 - * scanning depends on the LFSCK type. 
*/ - LS_SCANNING_PHASE2 = 2, - - /* The LFSCK processing has completed for all objects. */ - LS_COMPLETED = 3, - - /* The LFSCK exited automatically for failure, will not auto restart. */ - LS_FAILED = 4, - - /* The LFSCK is stopped manually, will not auto restart. */ - LS_STOPPED = 5, - - /* LFSCK is paused automatically when umount, - * will be restarted automatically when remount. */ - LS_PAUSED = 6, - - /* System crashed during the LFSCK, - * will be restarted automatically after recovery. */ - LS_CRASHED = 7, - - /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ - LS_PARTIAL = 8, - - /* The LFSCK is failed because its controller is failed. */ - LS_CO_FAILED = 9, - - /* The LFSCK is stopped because its controller is stopped. */ - LS_CO_STOPPED = 10, - - /* The LFSCK is paused because its controller is paused. */ - LS_CO_PAUSED = 11, - - LS_MAX -}; - -static inline const char *lfsck_status2name(int status) -{ - static const char * const lfsck_status_names[] = { - [LS_INIT] = "init", - [LS_SCANNING_PHASE1] = "scanning-phase1", - [LS_SCANNING_PHASE2] = "scanning-phase2", - [LS_COMPLETED] = "completed", - [LS_FAILED] = "failed", - [LS_STOPPED] = "stopped", - [LS_PAUSED] = "paused", - [LS_CRASHED] = "crashed", - [LS_PARTIAL] = "partial", - [LS_CO_FAILED] = "co-failed", - [LS_CO_STOPPED] = "co-stopped", - [LS_CO_PAUSED] = "co-paused" - }; - - if (status < 0 || status >= LS_MAX) - return "unknown"; - - return lfsck_status_names[status]; -} - -enum lfsck_param_flags { - /* Reset LFSCK iterator position to the device beginning. */ - LPF_RESET = 0x0001, - - /* Exit when fail. */ - LPF_FAILOUT = 0x0002, - - /* Dryrun mode, only check without modification */ - LPF_DRYRUN = 0x0004, - - /* LFSCK runs on all targets. */ - LPF_ALL_TGT = 0x0008, - - /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ - LPF_BROADCAST = 0x0010, - - /* Handle orphan OST-objects. */ - LPF_OST_ORPHAN = 0x0020, - - /* Create OST-object for dangling LOV EA. */ - LPF_CREATE_OSTOBJ = 0x0040, - - /* Create MDT-object for dangling name entry. */ - LPF_CREATE_MDTOBJ = 0x0080, - - /* Do not return until the LFSCK not running. */ - LPF_WAIT = 0x0100, - - /* Delay to create OST-object for dangling LOV EA. */ - LPF_DELAY_CREATE_OSTOBJ = 0x0200, -}; - -enum lfsck_type { - /* For MDT and OST internal OSD consistency check/repair. */ - LFSCK_TYPE_SCRUB = 0x0000, - - /* For MDT-OST (layout, object) consistency check/repair. */ - LFSCK_TYPE_LAYOUT = 0x0001, - - /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ - LFSCK_TYPE_NAMESPACE = 0x0004, - LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | - LFSCK_TYPE_NAMESPACE), - LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, - LFSCK_TYPES_ALL = ((__u16)(~0)) -}; - -#define LFSCK_VERSION_V1 1 -#define LFSCK_VERSION_V2 2 - -#define LFSCK_SPEED_NO_LIMIT 0 -#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT -#define LFSCK_ASYNC_WIN_DEFAULT 1024 -#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) -#define LFSCK_TYPE_BITS 16 - -enum lfsck_start_valid { - LSV_SPEED_LIMIT = 0x00000001, - LSV_ERROR_HANDLE = 0x00000002, - LSV_DRYRUN = 0x00000004, - LSV_ASYNC_WINDOWS = 0x00000008, - LSV_CREATE_OSTOBJ = 0x00000010, - LSV_CREATE_MDTOBJ = 0x00000020, - LSV_DELAY_CREATE_OSTOBJ = 0x00000040, -}; - -/* Arguments for starting lfsck. */ -struct lfsck_start { - /* Which arguments are valid, see 'enum lfsck_start_valid'. */ - __u32 ls_valid; - - /* How many items can be scanned at most per second. 
*/ - __u32 ls_speed_limit; - - /* For compatibility between user space tools and kernel service. */ - __u16 ls_version; - - /* Which LFSCK components to be (have been) started. */ - __u16 ls_active; - - /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ - __u16 ls_flags; - - /* The windows size for async requests pipeline. */ - __u16 ls_async_windows; -}; - -struct lfsck_stop { - __u32 ls_status; - __u16 ls_flags; - __u16 ls_padding_1; /* For 64-bits aligned. */ - __u64 ls_padding_2; -}; - -struct lfsck_query { - __u16 lu_types; - __u16 lu_flags; - __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; - __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; - __u64 lu_repaired[LFSCK_TYPE_BITS]; -}; - -#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h deleted file mode 100644 index b8d8bd71f19f9..0000000000000 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h +++ /dev/null @@ -1,2366 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2017, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre/lustre_user.h - * - * Lustre public user-space interface definitions. - */ - -#ifndef _LUSTRE_USER_H -#define _LUSTRE_USER_H - -/** \defgroup lustreuser lustreuser - * - * @{ - */ - -#include -#include -#include - -#ifdef __KERNEL__ -# define __USE_ISOC99 1 -# include -# include -# include /* snprintf() */ -# include -#else /* ! __KERNEL__ */ -# include -# include /* snprintf() */ -# include -# define NEED_QUOTA_DEFS -/* # include - this causes complaints about caddr_t */ -# include -# define FILEID_LUSTRE 0x97 /* for name_to_handle_at() (and llapi_fd2fid()) */ -#endif /* !__KERNEL__ */ - -/* Handle older distros */ -#ifndef __ALIGN_KERNEL -#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) -#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -#endif - -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -#ifdef __STRICT_ANSI__ -#define typeof __typeof__ -#endif - -/* - * This is a temporary solution of adding quota type. - * Should be removed as soon as system header is updated. 
- */ -#undef LL_MAXQUOTAS -#define LL_MAXQUOTAS 3 -#undef INITQFNAMES -#define INITQFNAMES { \ - "user", /* USRQUOTA */ \ - "group", /* GRPQUOTA */ \ - "project", /* PRJQUOTA */ \ - "undefined", \ -}; -#ifndef USRQUOTA -#define USRQUOTA 0 -#endif -#ifndef GRPQUOTA -#define GRPQUOTA 1 -#endif -#ifndef PRJQUOTA -#define PRJQUOTA 2 -#endif - -/* - * We need to always use 64bit version because the structure - * is shared across entire cluster where 32bit and 64bit machines - * are co-existing. - */ -#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) -typedef struct stat64 lstat_t; -#define lstat_f lstat64 -#define fstat_f fstat64 -#define fstatat_f fstatat64 -#else -typedef struct stat lstat_t; -#define lstat_f lstat -#define fstat_f fstat -#define fstatat_f fstatat -#endif - -#ifndef STATX_BASIC_STATS -/* - * Timestamp structure for the timestamps in struct statx. - * - * tv_sec holds the number of seconds before (negative) or after (positive) - * 00:00:00 1st January 1970 UTC. - * - * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. - * - * __reserved is held in case we need a yet finer resolution. - */ -struct statx_timestamp { - __s64 tv_sec; - __u32 tv_nsec; - __s32 __reserved; -}; - -/* - * Structures for the extended file attribute retrieval system call - * (statx()). - * - * The caller passes a mask of what they're specifically interested in as a - * parameter to statx(). What statx() actually got will be indicated in - * st_mask upon return. - * - * For each bit in the mask argument: - * - * - if the datum is not supported: - * - * - the bit will be cleared, and - * - * - the datum will be set to an appropriate fabricated value if one is - * available (eg. CIFS can take a default uid and gid), otherwise - * - * - the field will be cleared; - * - * - otherwise, if explicitly requested: - * - * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is - * set or if the datum is considered out of date, and - * - * - the field will be filled in and the bit will be set; - * - * - otherwise, if not requested, but available in approximate form without any - * effort, it will be filled in anyway, and the bit will be set upon return - * (it might not be up to date, however, and no attempt will be made to - * synchronise the internal state first); - * - * - otherwise the field and the bit will be cleared before returning. - * - * Items in STATX_BASIC_STATS may be marked unavailable on return, but they - * will have values installed for compatibility purposes so that stat() and - * co. can be emulated in userspace. 
- */ -struct statx { - /* 0x00 */ - __u32 stx_mask; /* What results were written [uncond] */ - __u32 stx_blksize; /* Preferred general I/O size [uncond] */ - __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ - /* 0x10 */ - __u32 stx_nlink; /* Number of hard links */ - __u32 stx_uid; /* User ID of owner */ - __u32 stx_gid; /* Group ID of owner */ - __u16 stx_mode; /* File mode */ - __u16 __spare0[1]; - /* 0x20 */ - __u64 stx_ino; /* Inode number */ - __u64 stx_size; /* File size */ - __u64 stx_blocks; /* Number of 512-byte blocks allocated */ - __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ - /* 0x40 */ - struct statx_timestamp stx_atime; /* Last access time */ - struct statx_timestamp stx_btime; /* File creation time */ - struct statx_timestamp stx_ctime; /* Last attribute change time */ - struct statx_timestamp stx_mtime; /* Last data modification time */ - /* 0x80 */ - __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ - __u32 stx_rdev_minor; - __u32 stx_dev_major; /* ID of device containing file [uncond] */ - __u32 stx_dev_minor; - /* 0x90 */ - __u64 __spare2[14]; /* Spare space for future expansion */ - /* 0x100 */ -}; - -/* - * Flags to be stx_mask - * - * Query request/result mask for statx() and struct statx::stx_mask. - * - * These bits should be set in the mask argument of statx() to request - * particular items when calling statx(). - */ -#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ -#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ -#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ -#define STATX_UID 0x00000008U /* Want/got stx_uid */ -#define STATX_GID 0x00000010U /* Want/got stx_gid */ -#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ -#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ -#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ -#define STATX_INO 0x00000100U /* Want/got stx_ino */ -#define STATX_SIZE 0x00000200U /* Want/got stx_size */ -#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ -#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ -#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ -#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ - -/* - * Attributes to be found in stx_attributes and masked in stx_attributes_mask. - * - * These give information about the features or the state of a file that might - * be of use to ordinary userspace programs such as GUIs or ls rather than - * specialised tools. - * - * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS - * semantically. Where possible, the numerical value is picked to correspond - * also. 
- */ -#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ -#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ -#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ -#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ -#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ - -#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ - -#endif - -typedef struct statx lstatx_t; - -#define HAVE_LOV_USER_MDS_DATA - -#define LUSTRE_EOF 0xffffffffffffffffULL - -/* for statfs() */ -#define LL_SUPER_MAGIC 0x0BD00BD0 - -#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) - -/* FIEMAP flags supported by Lustre */ -#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) - -enum obd_statfs_state { - OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ - OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ - OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ - OS_STATE_UNUSED1 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ - OS_STATE_UNUSED2 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ - OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ - OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ - OS_STATE_SUM = 0x00000100, /**< aggregated for all tagrets */ - OS_STATE_NONROT = 0x00000200, /**< non-rotational device */ -}; - -/** filesystem statistics/attributes for target device */ -struct obd_statfs { - __u64 os_type; /* EXT4_SUPER_MAGIC, UBERBLOCK_MAGIC */ - __u64 os_blocks; /* total size in #os_bsize blocks */ - __u64 os_bfree; /* number of unused blocks */ - __u64 os_bavail; /* blocks available for allocation */ - __u64 os_files; /* total number of objects */ - __u64 os_ffree; /* # objects that could be created */ - __u8 os_fsid[40]; /* identifier for filesystem */ - __u32 os_bsize; /* block size in bytes for os_blocks */ - __u32 os_namelen; /* maximum length of filename in bytes*/ - __u64 os_maxbytes; /* maximum object size in bytes */ - __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ - __u32 os_fprecreated; /* objs available now to the caller */ - /* used in QoS code to find preferred - * OSTs */ - __u32 os_granted; /* space granted for MDS */ - __u32 os_spare3; /* Unused padding fields. Remember */ - __u32 os_spare4; /* to fix lustre_swab_obd_statfs() */ - __u32 os_spare5; - __u32 os_spare6; - __u32 os_spare7; - __u32 os_spare8; - __u32 os_spare9; -}; - -/** - * File IDentifier. - * - * FID is a cluster-wide unique identifier of a file or an object (stripe). - * FIDs are never reused. - **/ -struct lu_fid { - /** - * FID sequence. Sequence is a unit of migration: all files (objects) - * with FIDs from a given sequence are stored on the same server. - * Lustre should support 2^64 objects, so even if each sequence - * has only a single object we can still enumerate 2^64 objects. - **/ - __u64 f_seq; - /* FID number within sequence. */ - __u32 f_oid; - /** - * FID version, used to distinguish different versions (in the sense - * of snapshots, etc.) of the same file system object. Not currently - * used. - **/ - __u32 f_ver; -} __attribute__((packed)); - -static inline bool fid_is_zero(const struct lu_fid *fid) -{ - return fid->f_seq == 0 && fid->f_oid == 0; -} - -/* Currently, the filter_fid::ff_parent::f_ver is not the real parent - * MDT-object's FID::f_ver, instead it is the OST-object index in its - * parent MDT-object's layout EA. 
*/ -#define f_stripe_idx f_ver - -struct ost_layout { - __u32 ol_stripe_size; - __u32 ol_stripe_count; - __u64 ol_comp_start; - __u64 ol_comp_end; - __u32 ol_comp_id; -} __attribute__((packed)); - -/* The filter_fid structure has changed several times over its lifetime. - * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and - * stripe_index and the "self FID" (objid/seq) to be able to recover the - * OST objects in case of corruption. With the move to 2.4 and OSD-API for - * the OST, the "trusted.lma" xattr was added to the OST objects to store - * the "self FID" to be consistent with the MDT on-disk format, and the - * filter_fid only stored the MDT inode parent FID and stripe index. - * - * In 2.10, the addition of PFL composite layouts required more information - * to be stored into the filter_fid in order to be able to identify which - * component the OST object belonged. As well, the stripe size may vary - * between components, so it was no longer safe to assume the stripe size - * or stripe_count of a file. This is also more robust for plain layouts. - * - * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not - * enough space to store both the filter_fid and LMA in the inode, so they - * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid - * an extra seek for every OST object access. - * - * In 2.11, FLR mirror layouts also need to store the layout version and - * range so that writes to old versions of the layout are not allowed. - * That ensures that mirrored objects are not modified by evicted clients, - * and ensures that the components are correctly marked stale on the MDT. - */ -struct filter_fid_18_23 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - __u64 ff_objid; - __u64 ff_seq; -}; - -struct filter_fid_24_29 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ -}; - -struct filter_fid_210 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - struct ost_layout ff_layout; -}; - -struct filter_fid { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - struct ost_layout ff_layout; - __u32 ff_layout_version; - __u32 ff_range; /* range of layout version that - * write are allowed */ -} __attribute__((packed)); - -/* Userspace should treat lu_fid as opaque, and only use the following methods - * to print or parse them. Other functions (e.g. compare, swab) could be moved - * here from lustre_idl.h if needed. */ -struct lu_fid; - -enum lma_compat { - LMAC_HSM = 0x00000001, -/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ - LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ - LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is - * under /O//d. */ - LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ - LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ - LMAC_IDX_BACKUP = 0x00000040, /* Has index backup. */ -}; - -/** - * Masks for all features that should be supported by a Lustre version to - * access a specific file. - * This information is stored in lustre_mdt_attrs::lma_incompat. 
- */ -enum lma_incompat { - LMAI_RELEASED = 0x00000001, /* file is released */ - LMAI_AGENT = 0x00000002, /* agent inode */ - LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object - is on the remote MDT */ - LMAI_STRIPED = 0x00000008, /* striped directory inode */ - LMAI_ORPHAN = 0x00000010, /* inode is orphan */ - LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ - LMAI_STRIPED | LMAI_ORPHAN) -}; - - -/** - * Following struct for object attributes, that will be kept inode's EA. - * Introduced in 2.0 release (please see b15993, for details) - * Added to all objects since Lustre 2.4 as contains self FID - */ -struct lustre_mdt_attrs { - /** - * Bitfield for supported data in this structure. From enum lma_compat. - * lma_self_fid and lma_flags are always available. - */ - __u32 lma_compat; - /** - * Per-file incompat feature list. Lustre version should support all - * flags set in this field. The supported feature mask is available in - * LMA_INCOMPAT_SUPP. - */ - __u32 lma_incompat; - /** FID of this inode */ - struct lu_fid lma_self_fid; -}; - -struct lustre_ost_attrs { - /* Use lustre_mdt_attrs directly for now, need a common header - * structure if want to change lustre_mdt_attrs in future. */ - struct lustre_mdt_attrs loa_lma; - - /* Below five elements are for OST-object's PFID EA, the - * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) - * and the stripe_index (low 16 bits), the size should not exceed - * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag - * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size - * are valid; if the flag LMAC_COMP_INFO is set, then the next three - * loa_comp_* elements are valid. */ - struct lu_fid loa_parent_fid; - __u32 loa_stripe_size; - __u32 loa_comp_id; - __u64 loa_comp_start; - __u64 loa_comp_end; -}; - -/** - * Prior to 2.4, the LMA structure also included SOM attributes which has since - * been moved to a dedicated xattr - * lma_flags was also removed because of lma_compat/incompat fields. - */ -#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) - -enum lustre_som_flags { - /* Unknow or no SoM data, must get size from OSTs. */ - SOM_FL_UNKNOWN = 0x0000, - /* Known strictly correct, FLR or DoM file (SoM guaranteed). */ - SOM_FL_STRICT = 0x0001, - /* Known stale - was right at some point in the past, but it is - * known (or likely) to be incorrect now (e.g. opened for write). */ - SOM_FL_STALE = 0x0002, - /* Approximate, may never have been strictly correct, - * need to sync SOM data to achieve eventual consistency. */ - SOM_FL_LAZY = 0x0004, -}; - -struct lustre_som_attrs { - __u16 lsa_valid; - __u16 lsa_reserved[3]; - __u64 lsa_size; - __u64 lsa_blocks; -}; - -/** - * OST object IDentifier. - */ -struct ost_id { - union { - struct { - __u64 oi_id; - __u64 oi_seq; - } oi; - struct lu_fid oi_fid; - }; -} __attribute__((packed)); - -#define DOSTID "%#llx:%llu" -#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ - ((unsigned long long)ostid_id(oi)) - -struct ll_futimes_3 { - __u64 lfu_atime_sec; - __u64 lfu_atime_nsec; - __u64 lfu_mtime_sec; - __u64 lfu_mtime_nsec; - __u64 lfu_ctime_sec; - __u64 lfu_ctime_nsec; -}; - -/* - * Maximum number of mirrors currently implemented. - */ -#define LUSTRE_MIRROR_COUNT_MAX 16 - -/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. 
*/ -enum ll_lease_mode { - LL_LEASE_RDLCK = 0x01, - LL_LEASE_WRLCK = 0x02, - LL_LEASE_UNLCK = 0x04, -}; - -enum ll_lease_flags { - LL_LEASE_RESYNC = 0x1, - LL_LEASE_RESYNC_DONE = 0x2, - LL_LEASE_LAYOUT_MERGE = 0x4, - LL_LEASE_LAYOUT_SPLIT = 0x8, -}; - -#define IOC_IDS_MAX 4096 -struct ll_ioc_lease { - __u32 lil_mode; - __u32 lil_flags; - __u32 lil_count; - __u32 lil_ids[0]; -}; - -struct ll_ioc_lease_id { - __u32 lil_mode; - __u32 lil_flags; - __u32 lil_count; - __u16 lil_mirror_id; - __u16 lil_padding1; - __u64 lil_padding2; - __u32 lil_ids[0]; -}; - -/* - * The ioctl naming rules: - * LL_* - works on the currently opened filehandle instead of parent dir - * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) - * *_MDC_* - gets/sets data related to MDC - * *_LOV_* - gets/sets data related to OSC/LOV - * *FILE* - called on parent dir and passes in a filename - * *STRIPE* - set/get lov_user_md - * *INFO - set/get lov_user_mds_data - */ -/* lustre_ioctl.h 101-150 */ -#define LL_IOC_GETFLAGS _IOR ('f', 151, long) -#define LL_IOC_SETFLAGS _IOW ('f', 152, long) -#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) -#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) -#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) -#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) -#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) -#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) -/* LL_IOC_RECREATE_OBJ 157 obsolete */ -/* LL_IOC_RECREATE_FID 157 obsolete */ -#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) -#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) -/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ -/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ -/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ -#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) -/* IOC_LOV_GETINFO 165 obsolete */ -#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) -/* LL_IOC_RMTACL 167 obsolete */ -#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) -#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) -#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) -#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) -#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) -#define LL_IOC_PATH2FID _IOR ('f', 173, long) -#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) -#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) -#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) -#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) -/* lustre_ioctl.h 177-210 */ -#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) -#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) -#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) -#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) -#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) -#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) -#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) -#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) -#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ - struct lustre_swap_layouts) -#define LL_IOC_HSM_ACTION _IOR('f', 220, \ - struct hsm_current_action) -/* lustre_ioctl.h 221-232 */ -#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) -#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) -#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) -#define LL_IOC_RMFID _IOR('f', 242, struct fid_array) -#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease) -#define LL_IOC_SET_LEASE_OLD 
_IOWR('f', 243, long) -#define LL_IOC_GET_LEASE _IO('f', 244) -#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) -#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) -#define LL_IOC_MIGRATE _IOR('f', 247, int) -#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) -#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) -#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) - -#ifndef FS_IOC_FSGETXATTR -/* - * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. -*/ -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) -#endif -#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#ifndef FS_XFLAG_PROJINHERIT -#define FS_XFLAG_PROJINHERIT 0x00000200 -#endif - - -#define LL_STATFS_LMV 1 -#define LL_STATFS_LOV 2 -#define LL_STATFS_NODELAY 4 - -#define IOC_MDC_TYPE 'i' -#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) -#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) -#ifdef HAVE_LOV_USER_MDS_DATA -#define IOC_MDC_GETFILEINFO_OLD _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data_v1 *) -#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data) -#define LL_IOC_MDC_GETINFO_OLD _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data_v1 *) -#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data) -#endif - -#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ - -/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular - * files, but are unlikely to be used in practice and are not harmful if - * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character - * devices and are safe for use on new files. See LU-4209. */ -/* To be compatible with old statically linked binary we keep the check for - * the older 0100000000 flag. This is already removed upstream. LU-812. 
*/ -#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ -#ifndef FASYNC -#define FASYNC 00020000 /* fcntl, for BSD compatibility */ -#endif -#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) -#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ - O_LOV_DELAY_CREATE_MASK) - -#define LL_FILE_IGNORE_LOCK 0x00000001 -#define LL_FILE_GROUP_LOCKED 0x00000002 -#define LL_FILE_READAHEA 0x00000004 -#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ -#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ - -#define LOV_USER_MAGIC_V1 0x0BD10BD0 -#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 -#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 -#define LOV_USER_MAGIC_V3 0x0BD30BD0 -/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ -#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ -#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 - -#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ -#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic*/ -#define LMV_USER_MAGIC_SPECIFIC 0x0CD40CD0 - -#define LOV_PATTERN_NONE 0x000 -#define LOV_PATTERN_RAID0 0x001 -#define LOV_PATTERN_RAID1 0x002 -#define LOV_PATTERN_MDT 0x100 -#define LOV_PATTERN_CMOBD 0x200 - -#define LOV_PATTERN_F_MASK 0xffff0000 -#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ -#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ -#define LOV_PATTERN_DEFAULT 0xffffffff - -static inline bool lov_pattern_supported(__u32 pattern) -{ - return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 || - (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT; -} - -#define LOV_MAXPOOLNAME 15 -#define LOV_POOLNAMEF "%.15s" - -#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ -#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) -#define LOV_MAX_STRIPE_COUNT_OLD 160 -/* This calculation is crafted so that input of 4096 will result in 160 - * which in turn is equal to old maximal stripe count. - * XXX: In fact this is too simpified for now, what it also need is to get - * ea_type argument to clearly know how much space each stripe consumes. - * - * The limit of 12 pages is somewhat arbitrary, but is a reasonably large - * allocation that is sufficient for the current generation of systems. - * - * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ -#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ -#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ - -#define XATTR_LUSTRE_PREFIX "lustre." 
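[Editor's note: the sketch below is illustrative only and is not part of the patch. It shows how the striping ioctl and size helper removed in this hunk fit together from userspace, assuming the header is installed as <linux/lustre/lustre_user.h> (the install path is an assumption) and that the target file lives on a Lustre mount. LL_IOC_LOV_GETSTRIPE_NEW, lov_user_md_size(), LOV_MAX_STRIPE_COUNT and LOV_USER_MAGIC_V3 are all taken from the header being deleted here; the surrounding error handling is hypothetical.]

/*
 * Minimal sketch: fetch a file's striping layout with the definitions
 * from this (removed) header. Not the official llapi interface.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lustre/lustre_user.h>   /* assumed install location */

static struct lov_user_md *get_stripe_layout(const char *path)
{
	/* Size the buffer for the worst case so the ioctl cannot overflow. */
	size_t len = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
	struct lov_user_md *lum = calloc(1, len);
	int fd;

	if (!lum)
		return NULL;

	/* Some client versions look at the requested magic before filling in. */
	lum->lmm_magic = LOV_USER_MAGIC_V3;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		goto out_free;

	/* Kernel fills in lmm_magic, stripe size/count and per-OST objects. */
	if (ioctl(fd, LL_IOC_LOV_GETSTRIPE_NEW, lum) < 0)
		goto out_close;

	close(fd);
	return lum;          /* caller frees */

out_close:
	close(fd);
out_free:
	free(lum);
	return NULL;
}

[End of editor's note; the deleted header content continues below.]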
-#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" - -/* Please update if XATTR_LUSTRE_LOV".set" groks more flags in the future */ -#define allowed_lustre_lov(att) (strcmp((att), XATTR_LUSTRE_LOV".add") == 0 || \ - strcmp((att), XATTR_LUSTRE_LOV".set") == 0 || \ - strcmp((att), XATTR_LUSTRE_LOV".set.flags") == 0 || \ - strcmp((att), XATTR_LUSTRE_LOV".del") == 0) - -#define lov_user_ost_data lov_user_ost_data_v1 -struct lov_user_ost_data_v1 { /* per-stripe data structure */ - struct ost_id l_ost_oi; /* OST object ID */ - __u32 l_ost_gen; /* generation of this OST index */ - __u32 l_ost_idx; /* OST index in LOV */ -} __attribute__((packed)); - -#define lov_user_md lov_user_md_v1 -struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading */ - }; - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed, __may_alias__)); - -struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading */ - }; - char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed, __may_alias__)); - -struct lu_extent { - __u64 e_start; - __u64 e_end; -} __attribute__((packed)); - -#define DEXT "[%#llx, %#llx)" -#define PEXT(ext) (unsigned long long)(ext)->e_start, (unsigned long long)(ext)->e_end - -static inline bool lu_extent_is_overlapped(struct lu_extent *e1, - struct lu_extent *e2) -{ - return e1->e_start < e2->e_end && e2->e_start < e1->e_end; -} - -static inline bool lu_extent_is_whole(struct lu_extent *e) -{ - return e->e_start == 0 && e->e_end == LUSTRE_EOF; -} - -enum lov_comp_md_entry_flags { - LCME_FL_STALE = 0x00000001, /* FLR: stale data */ - LCME_FL_PREF_RD = 0x00000002, /* FLR: preferred for reading */ - LCME_FL_PREF_WR = 0x00000004, /* FLR: preferred for writing */ - LCME_FL_PREF_RW = LCME_FL_PREF_RD | LCME_FL_PREF_WR, - LCME_FL_OFFLINE = 0x00000008, /* Not used */ - LCME_FL_INIT = 0x00000010, /* instantiated */ - LCME_FL_NOSYNC = 0x00000020, /* FLR: no sync for the mirror */ - LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, - won't be stored on disk */ -}; - -#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT | LCME_FL_STALE | \ - LCME_FL_PREF_RW | LCME_FL_NOSYNC) -/* The flags can be set by users at mirror creation time. */ -#define LCME_USER_FLAGS (LCME_FL_PREF_RW) - -/* The flags are for mirrors */ -#define LCME_MIRROR_FLAGS (LCME_FL_NOSYNC) - -/* These flags have meaning when set in a default layout and will be inherited - * from the default/template layout set on a directory. 
- */ -#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC) - -/* the highest bit in obdo::o_layout_version is used to mark if the file is - * being resynced. */ -#define LU_LAYOUT_RESYNC LCME_FL_NEG - -/* lcme_id can be specified as certain flags, and the the first - * bit of lcme_id is used to indicate that the ID is representing - * certain LCME_FL_* but not a real ID. Which implies we can have - * at most 31 flags (see LCME_FL_XXX). */ -enum lcme_id { - LCME_ID_INVAL = 0x0, - LCME_ID_MAX = 0x7FFFFFFF, - LCME_ID_ALL = 0xFFFFFFFF, - LCME_ID_NOT_ID = LCME_FL_NEG -}; - -#define LCME_ID_MASK LCME_ID_MAX - -struct lov_comp_md_entry_v1 { - __u32 lcme_id; /* unique id of component */ - __u32 lcme_flags; /* LCME_FL_XXX */ - struct lu_extent lcme_extent; /* file extent for component */ - __u32 lcme_offset; /* offset of component blob, - start from lov_comp_md_v1 */ - __u32 lcme_size; /* size of component blob */ - __u32 lcme_layout_gen; - __u64 lcme_timestamp; /* snapshot time if applicable*/ - __u32 lcme_padding_1; -} __attribute__((packed)); - -#define SEQ_ID_MAX 0x0000FFFF -#define SEQ_ID_MASK SEQ_ID_MAX -/* bit 30:16 of lcme_id is used to store mirror id */ -#define MIRROR_ID_MASK 0x7FFF0000 -#define MIRROR_ID_NEG 0x8000 -#define MIRROR_ID_SHIFT 16 - -static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid) -{ - return ((mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) | seqid; -} - -static inline __u16 mirror_id_of(__u32 id) -{ - return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT; -} - -/** - * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1. - */ -enum lov_comp_md_flags { - /* the least 2 bits are used by FLR to record file state */ - LCM_FL_NONE = 0, - LCM_FL_RDONLY = 1, - LCM_FL_WRITE_PENDING = 2, - LCM_FL_SYNC_PENDING = 3, - LCM_FL_FLR_MASK = 0x3, -}; - -struct lov_comp_md_v1 { - __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ - __u32 lcm_size; /* overall size including this struct */ - __u32 lcm_layout_gen; - __u16 lcm_flags; - __u16 lcm_entry_count; - /* lcm_mirror_count stores the number of actual mirrors minus 1, - * so that non-flr files will have value 0 meaning 1 mirror. */ - __u16 lcm_mirror_count; - __u16 lcm_padding1[3]; - __u64 lcm_padding2; - struct lov_comp_md_entry_v1 lcm_entries[0]; -} __attribute__((packed)); - -static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) -{ - if (stripes == (__u16)-1) - stripes = 0; - - if (lmm_magic == LOV_USER_MAGIC_V1) - return sizeof(struct lov_user_md_v1) + - stripes * sizeof(struct lov_user_ost_data_v1); - return sizeof(struct lov_user_md_v3) + - stripes * sizeof(struct lov_user_ost_data_v1); -} - -/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to - * use this. It is unsafe to #define those values in this header as it - * is possible the application has already #included . 
*/ -#ifdef HAVE_LOV_USER_MDS_DATA -#define lov_user_mds_data lov_user_mds_data_v2 -struct lov_user_mds_data_v1 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ -} __attribute__((packed)); - -struct lov_user_mds_data_v2 { - struct lu_fid lmd_fid; /* Lustre FID */ - lstatx_t lmd_stx; /* MDS statx struct */ - __u64 lmd_flags; /* MDS stat flags */ - __u32 lmd_lmmsize; /* LOV EA size */ - __u32 lmd_padding; /* unused */ - struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */ -} __attribute__((packed)); -#endif - -struct lmv_user_mds_data { - struct lu_fid lum_fid; - __u32 lum_padding; - __u32 lum_mds; -} __attribute__((packed, __may_alias__)); - -enum lmv_hash_type { - LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ - LMV_HASH_TYPE_ALL_CHARS = 1, - LMV_HASH_TYPE_FNV_1A_64 = 2, - LMV_HASH_TYPE_MAX, -}; - -#define LMV_HASH_NAME_ALL_CHARS "all_char" -#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" - -extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; - -/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, - * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ -#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define lmv_user_md lmv_user_md_v1 -struct lmv_user_md_v1 { - __u32 lum_magic; /* must be the first field */ - __u32 lum_stripe_count; /* dirstripe count */ - __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ - __u32 lum_hash_type; /* Dir stripe policy */ - __u32 lum_type; /* LMV type: default or normal */ - __u32 lum_padding1; - __u32 lum_padding2; - __u32 lum_padding3; - char lum_pool_name[LOV_MAXPOOLNAME + 1]; - struct lmv_user_mds_data lum_objects[0]; -} __attribute__((packed)); - -static inline int lmv_user_md_size(int stripes, int lmm_magic) -{ - int size = sizeof(struct lmv_user_md); - - if (lmm_magic == LMV_USER_MAGIC_SPECIFIC) - size += stripes * sizeof(struct lmv_user_mds_data); - - return size; -} - -struct ll_recreate_obj { - __u64 lrc_id; - __u32 lrc_ost_idx; -}; - -struct ll_fid { - __u64 id; /* holds object id */ - __u32 generation; /* holds object generation */ - __u32 f_type; /* holds object type or stripe idx when passing it to - * OST for saving into EA. */ -}; - -#define UUID_MAX 40 -struct obd_uuid { - char uuid[UUID_MAX]; -}; - -static inline bool obd_uuid_equals(const struct obd_uuid *u1, - const struct obd_uuid *u2) -{ - return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; -} - -static inline int obd_uuid_empty(struct obd_uuid *uuid) -{ - return uuid->uuid[0] == '\0'; -} - -static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) -{ - strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); - uuid->uuid[sizeof(*uuid) - 1] = '\0'; -} - -/* For printf's only, make sure uuid is terminated */ -static inline char *obd_uuid2str(const struct obd_uuid *uuid) -{ - if (uuid == NULL) - return NULL; - - if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { - /* Obviously not safe, but for printfs, no real harm done... - we're always null-terminated, even in a race. */ - static char temp[sizeof(*uuid)]; - memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); - temp[sizeof(*uuid) - 1] = '\0'; - return temp; - } - return (char *)(uuid->uuid); -} - -#define LUSTRE_MAXFSNAME 8 - -/* Extract fsname from uuid (or target name) of a target - e.g. (myfs-OST0007_UUID -> myfs) - see also deuuidify. 
*/ -static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) -{ - char *p; - - strncpy(buf, uuid, buflen - 1); - buf[buflen - 1] = '\0'; - p = strrchr(buf, '-'); - if (p != NULL) - *p = '\0'; -} - -/* printf display format for Lustre FIDs - * usage: printf("file FID is "DFID"\n", PFID(fid)); */ -#define FID_NOBRACE_LEN 40 -#define FID_LEN (FID_NOBRACE_LEN + 2) -#define DFID_NOBRACE "%#llx:0x%x:0x%x" -#define DFID "["DFID_NOBRACE"]" -#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver - -/* scanf input parse format for fids in DFID_NOBRACE format - * Need to strip '[' from DFID format first or use "["SFID"]" at caller. - * usage: sscanf(fidstr, SFID, RFID(&fid)); */ -#define SFID "0x%llx:0x%x:0x%x" -#define RFID(fid) (unsigned long long *)&((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) - -/********* Quotas **********/ - -#define LUSTRE_QUOTABLOCK_BITS 10 -#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) - -static inline __u64 lustre_stoqb(__kernel_size_t space) -{ - return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; -} - -#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ -#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ -#define Q_GETOINFO 0x800102 /* get obd quota info */ -#define Q_GETOQUOTA 0x800103 /* get obd quotas */ -#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ - -/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ -#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ -#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ -#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ -#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ -#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ -#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ -/* lustre-specific control commands */ -#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ -#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ -#define LUSTRE_Q_GETDEFAULT 0x80000d /* get default quota */ -#define LUSTRE_Q_SETDEFAULT 0x80000e /* set default quota */ - -/* In the current Lustre implementation, the grace time is either the time - * or the timestamp to be used after some quota ID exceeds the soft limt, - * 48 bits should be enough, its high 16 bits can be used as quota flags. - * */ -#define LQUOTA_GRACE_BITS 48 -#define LQUOTA_GRACE_MASK ((1ULL << LQUOTA_GRACE_BITS) - 1) -#define LQUOTA_GRACE_MAX LQUOTA_GRACE_MASK -#define LQUOTA_GRACE(t) (t & LQUOTA_GRACE_MASK) -#define LQUOTA_FLAG(t) (t >> LQUOTA_GRACE_BITS) -#define LQUOTA_GRACE_FLAG(t, f) ((__u64)t | (__u64)f << LQUOTA_GRACE_BITS) - -/* different quota flags */ - -/* the default quota flag, the corresponding quota ID will use the default - * quota setting, the hardlimit and softlimit of its quota record in the global - * quota file will be set to 0, the low 48 bits of the grace will be set to 0 - * and high 16 bits will contain this flag (see above comment). 
- * */ -#define LQUOTA_FLAG_DEFAULT 0x0001 - -#define ALLQUOTA 255 /* set all quota */ -static inline char *qtype_name(int qtype) -{ - switch (qtype) { - case USRQUOTA: - return "usr"; - case GRPQUOTA: - return "grp"; - case PRJQUOTA: - return "prj"; - } - return "unknown"; -} - -#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 -#define SEPOL_DOWNCALL_MAGIC 0x8b8bb842 - -/* permission */ -#define N_PERMS_MAX 64 - -struct perm_downcall_data { - __u64 pdd_nid; - __u32 pdd_perm; - __u32 pdd_padding; -}; - -struct identity_downcall_data { - __u32 idd_magic; - __u32 idd_err; - __u32 idd_uid; - __u32 idd_gid; - __u32 idd_nperms; - __u32 idd_ngroups; - struct perm_downcall_data idd_perms[N_PERMS_MAX]; - __u32 idd_groups[0]; -}; - -struct sepol_downcall_data { - __u32 sdd_magic; - __s64 sdd_sepol_mtime; - __u16 sdd_sepol_len; - char sdd_sepol[0]; -}; - -#ifdef NEED_QUOTA_DEFS -#ifndef QIF_BLIMITS -#define QIF_BLIMITS 1 -#define QIF_SPACE 2 -#define QIF_ILIMITS 4 -#define QIF_INODES 8 -#define QIF_BTIME 16 -#define QIF_ITIME 32 -#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) -#define QIF_USAGE (QIF_SPACE | QIF_INODES) -#define QIF_TIMES (QIF_BTIME | QIF_ITIME) -#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) -#endif - -#endif /* !__KERNEL__ */ - -/* lustre volatile file support - * file name header: ".^L^S^T^R:volatile" - */ -#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" -#define LUSTRE_VOLATILE_HDR_LEN 14 - -enum lustre_quota_version { - LUSTRE_QUOTA_V2 = 1 -}; - -/* XXX: same as if_dqinfo struct in kernel */ -struct obd_dqinfo { - __u64 dqi_bgrace; - __u64 dqi_igrace; - __u32 dqi_flags; - __u32 dqi_valid; -}; - -/* XXX: same as if_dqblk struct in kernel, plus one padding */ -struct obd_dqblk { - __u64 dqb_bhardlimit; - __u64 dqb_bsoftlimit; - __u64 dqb_curspace; - __u64 dqb_ihardlimit; - __u64 dqb_isoftlimit; - __u64 dqb_curinodes; - __u64 dqb_btime; - __u64 dqb_itime; - __u32 dqb_valid; - __u32 dqb_padding; -}; - -enum { - QC_GENERAL = 0, - QC_MDTIDX = 1, - QC_OSTIDX = 2, - QC_UUID = 3 -}; - -struct if_quotactl { - __u32 qc_cmd; - __u32 qc_type; - __u32 qc_id; - __u32 qc_stat; - __u32 qc_valid; - __u32 qc_idx; - struct obd_dqinfo qc_dqinfo; - struct obd_dqblk qc_dqblk; - char obd_type[16]; - struct obd_uuid obd_uuid; -}; - -/* swap layout flags */ -#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) -#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) -#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) -#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) -#define SWAP_LAYOUTS_CLOSE (1 << 4) - -/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ -#define SWAP_LAYOUTS_MDS_HSM (1 << 31) -struct lustre_swap_layouts { - __u64 sl_flags; - __u32 sl_fd; - __u32 sl_gid; - __u64 sl_dv1; - __u64 sl_dv2; -}; - -/** Bit-mask of valid attributes */ -/* The LA_* flags are written to disk as part of the ChangeLog records - * so they are part of the on-disk and network protocol, and cannot be changed. - * Only the first 12 bits are currently saved. 
- */ -enum la_valid { - LA_ATIME = 1 << 0, /* 0x00001 */ - LA_MTIME = 1 << 1, /* 0x00002 */ - LA_CTIME = 1 << 2, /* 0x00004 */ - LA_SIZE = 1 << 3, /* 0x00008 */ - LA_MODE = 1 << 4, /* 0x00010 */ - LA_UID = 1 << 5, /* 0x00020 */ - LA_GID = 1 << 6, /* 0x00040 */ - LA_BLOCKS = 1 << 7, /* 0x00080 */ - LA_TYPE = 1 << 8, /* 0x00100 */ - LA_FLAGS = 1 << 9, /* 0x00200 */ - LA_NLINK = 1 << 10, /* 0x00400 */ - LA_RDEV = 1 << 11, /* 0x00800 */ - LA_BLKSIZE = 1 << 12, /* 0x01000 */ - LA_KILL_SUID = 1 << 13, /* 0x02000 */ - LA_KILL_SGID = 1 << 14, /* 0x04000 */ - LA_PROJID = 1 << 15, /* 0x08000 */ - LA_LAYOUT_VERSION = 1 << 16, /* 0x10000 */ - LA_LSIZE = 1 << 17, /* 0x20000 */ - LA_LBLOCKS = 1 << 18, /* 0x40000 */ - /** - * Attributes must be transmitted to OST objects - */ - LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION) -}; - -#define MDS_FMODE_READ 00000001 -#define MDS_FMODE_WRITE 00000002 - -#define MDS_FMODE_CLOSED 00000000 -#define MDS_FMODE_EXEC 00000004 -/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ -/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ -/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ - -#define MDS_OPEN_CREATED 00000010 -/* MDS_OPEN_CROSS 00000020 obsolete in 2.12, internal use only */ - -#define MDS_OPEN_CREAT 00000100 -#define MDS_OPEN_EXCL 00000200 -#define MDS_OPEN_TRUNC 00001000 -#define MDS_OPEN_APPEND 00002000 -#define MDS_OPEN_SYNC 00010000 -#define MDS_OPEN_DIRECTORY 00200000 - -#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ -#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ -#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ -#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. - * We do not support JOIN FILE - * anymore, reserve this flags - * just for preventing such bit - * to be reused. */ - -#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ -#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ -#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ -#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ -#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or - * hsm restore) */ -#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created - unlinked */ -#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease - * delegation, succeed if it's not - * being opened with conflict mode. 
- */ -#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ - -#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */ - -/* lustre internal open flags, which should not be set from user space */ -#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ - MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ - MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ - MDS_OPEN_RELEASE | MDS_OPEN_RESYNC) - - -/********* Changelogs **********/ -/** Changelog record types */ -enum changelog_rec_type { - CL_NONE = -1, - CL_MARK = 0, - CL_CREATE = 1, /* namespace */ - CL_MKDIR = 2, /* namespace */ - CL_HARDLINK = 3, /* namespace */ - CL_SOFTLINK = 4, /* namespace */ - CL_MKNOD = 5, /* namespace */ - CL_UNLINK = 6, /* namespace */ - CL_RMDIR = 7, /* namespace */ - CL_RENAME = 8, /* namespace */ - CL_EXT = 9, /* namespace extended record (2nd half of rename) */ - CL_OPEN = 10, /* not currently used */ - CL_CLOSE = 11, /* may be written to log only with mtime change */ - CL_LAYOUT = 12, /* file layout/striping modified */ - CL_TRUNC = 13, - CL_SETATTR = 14, - CL_SETXATTR = 15, - CL_XATTR = CL_SETXATTR, /* Deprecated name */ - CL_HSM = 16, /* HSM specific events, see flags */ - CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ - CL_CTIME = 18, - CL_ATIME = 19, - CL_MIGRATE = 20, - CL_FLRW = 21, /* FLR: file was firstly written */ - CL_RESYNC = 22, /* FLR: file was resync-ed */ - CL_GETXATTR = 23, - CL_DN_OPEN = 24, /* denied open */ - CL_LAST -}; - -static inline const char *changelog_type2str(int type) { - static const char *changelog_str[] = { - "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", - "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", - "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT", - "FLRW", "RESYNC","GXATR", "NOPEN", - }; - - if (type >= 0 && type < CL_LAST) - return changelog_str[type]; - return NULL; -} - -/* 12 bits of per-record data can be stored in the bottom of the flags */ -#define CLF_FLAGSHIFT 12 -enum changelog_rec_flags { - CLF_VERSION = 0x1000, - CLF_RENAME = 0x2000, - CLF_JOBID = 0x4000, - CLF_EXTRA_FLAGS = 0x8000, - CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID | - CLF_EXTRA_FLAGS, - CLF_FLAGMASK = (1U << CLF_FLAGSHIFT) - 1, - CLF_VERMASK = ~CLF_FLAGMASK, -}; - - -/* Anything under the flagmask may be per-type (if desired) */ -/* Flags for unlink */ -#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ -#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ - /* HSM cleaning needed */ -/* Flags for rename */ -#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink - * of target */ -#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target - * has an archive in backend */ - -/* Flags for HSM */ -/* 12b used (from high weight to low weight): - * 2b for flags - * 3b for event - * 7b for error code - */ -#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ -#define CLF_HSM_ERR_H 6 -#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ -#define CLF_HSM_EVENT_H 9 -#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ -#define CLF_HSM_FLAG_H 11 -#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ -#define CLF_HSM_SPARE_H 15 -#define CLF_HSM_LAST 15 - -/* Remove bits higher than _h, then extract the value - * between _h and _l by shifting lower weigth to bit 0. 
*/ -#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ - >> (CLF_HSM_LAST - _h + _l)) - -#define CLF_HSM_SUCCESS 0x00 -#define CLF_HSM_MAXERROR 0x7E -#define CLF_HSM_ERROVERFLOW 0x7F - -#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ - -/* 3 bits field => 8 values allowed */ -enum hsm_event { - HE_ARCHIVE = 0, - HE_RESTORE = 1, - HE_CANCEL = 2, - HE_RELEASE = 3, - HE_REMOVE = 4, - HE_STATE = 5, - HE_SPARE1 = 6, - HE_SPARE2 = 7, -}; - -static inline enum hsm_event hsm_get_cl_event(__u16 flags) -{ - return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, - CLF_HSM_EVENT_L); -} - -static inline void hsm_set_cl_event(enum changelog_rec_flags *clf_flags, - enum hsm_event he) -{ - *clf_flags |= (he << CLF_HSM_EVENT_L); -} - -static inline __u16 hsm_get_cl_flags(enum changelog_rec_flags clf_flags) -{ - return CLF_GET_BITS(clf_flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); -} - -static inline void hsm_set_cl_flags(enum changelog_rec_flags *clf_flags, - unsigned int bits) -{ - *clf_flags |= (bits << CLF_HSM_FLAG_L); -} - -static inline int hsm_get_cl_error(enum changelog_rec_flags clf_flags) -{ - return CLF_GET_BITS(clf_flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); -} - -static inline void hsm_set_cl_error(enum changelog_rec_flags *clf_flags, - unsigned int error) -{ - *clf_flags |= (error << CLF_HSM_ERR_L); -} - -enum changelog_rec_extra_flags { - CLFE_INVALID = 0, - CLFE_UIDGID = 0x0001, - CLFE_NID = 0x0002, - CLFE_OPEN = 0x0004, - CLFE_XATTR = 0x0008, - CLFE_SUPPORTED = CLFE_UIDGID | CLFE_NID | CLFE_OPEN | CLFE_XATTR -}; - -enum changelog_send_flag { - /* Not yet implemented */ - CHANGELOG_FLAG_FOLLOW = 0x01, - /* Blocking IO makes sense in case of slow user parsing of the records, - * but it also prevents us from cleaning up if the records are not - * consumed. */ - CHANGELOG_FLAG_BLOCK = 0x02, - /* Pack jobid into the changelog records if available. */ - CHANGELOG_FLAG_JOBID = 0x04, - /* Pack additional flag bits into the changelog record */ - CHANGELOG_FLAG_EXTRA_FLAGS = 0x08, -}; - -enum changelog_send_extra_flag { - /* Pack uid/gid into the changelog record */ - CHANGELOG_EXTRA_FLAG_UIDGID = 0x01, - /* Pack nid into the changelog record */ - CHANGELOG_EXTRA_FLAG_NID = 0x02, - /* Pack open mode into the changelog record */ - CHANGELOG_EXTRA_FLAG_OMODE = 0x04, - /* Pack xattr name into the changelog record */ - CHANGELOG_EXTRA_FLAG_XATTR = 0x08, -}; - -#define CR_MAXSIZE __ALIGN_KERNEL(2 * NAME_MAX + 2 + \ - changelog_rec_offset(CLF_SUPPORTED, \ - CLFE_SUPPORTED), 8) - -/* 31 usable bytes string + null terminator. */ -#define LUSTRE_JOBID_SIZE 32 - -/* This is the minimal changelog record. It can contain extensions - * such as rename fields or process jobid. Its exact content is described - * by the cr_flags and cr_extra_flags. - * - * Extensions are packed in the same order as their corresponding flags, - * then in the same order as their corresponding extra flags. - */ -struct changelog_rec { - __u16 cr_namelen; - __u16 cr_flags; /**< \a changelog_rec_flags */ - __u32 cr_type; /**< \a changelog_rec_type */ - __u64 cr_index; /**< changelog record number */ - __u64 cr_prev; /**< last index for this target fid */ - __u64 cr_time; - union { - struct lu_fid cr_tfid; /**< target fid */ - __u32 cr_markerflags; /**< CL_MARK flags */ - }; - struct lu_fid cr_pfid; /**< parent fid */ -} __attribute__ ((packed)); - -/* Changelog extension for RENAME. 
*/ -struct changelog_ext_rename { - struct lu_fid cr_sfid; /**< source fid, or zero */ - struct lu_fid cr_spfid; /**< source parent fid, or zero */ -}; - -/* Changelog extension to include JOBID. */ -struct changelog_ext_jobid { - char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ -}; - -/* Changelog extension to include additional flags. */ -struct changelog_ext_extra_flags { - __u64 cr_extra_flags; /* Additional CLFE_* flags */ -}; - -/* Changelog extra extension to include UID/GID. */ -struct changelog_ext_uidgid { - __u64 cr_uid; - __u64 cr_gid; -}; - -/* Changelog extra extension to include NID. */ -struct changelog_ext_nid { - /* have __u64 instead of lnet_nid_t type for use by client api */ - __u64 cr_nid; - /* for use when IPv6 support is added */ - __u64 extra; - __u32 padding; -}; - -/* Changelog extra extension to include low 32 bits of MDS_OPEN_* flags. */ -struct changelog_ext_openmode { - __u32 cr_openflags; -}; - -/* Changelog extra extension to include xattr */ -struct changelog_ext_xattr { - char cr_xattr[XATTR_NAME_MAX + 1]; /**< zero-terminated string. */ -}; - -static inline struct changelog_ext_extra_flags *changelog_rec_extra_flags( - const struct changelog_rec *rec); - -static inline __kernel_size_t changelog_rec_offset(enum changelog_rec_flags crf, - enum changelog_rec_extra_flags cref) -{ - __kernel_size_t size = sizeof(struct changelog_rec); - - if (crf & CLF_RENAME) - size += sizeof(struct changelog_ext_rename); - - if (crf & CLF_JOBID) - size += sizeof(struct changelog_ext_jobid); - - if (crf & CLF_EXTRA_FLAGS) { - size += sizeof(struct changelog_ext_extra_flags); - if (cref & CLFE_UIDGID) - size += sizeof(struct changelog_ext_uidgid); - if (cref & CLFE_NID) - size += sizeof(struct changelog_ext_nid); - if (cref & CLFE_OPEN) - size += sizeof(struct changelog_ext_openmode); - if (cref & CLFE_XATTR) - size += sizeof(struct changelog_ext_xattr); - } - - return size; -} - -static inline __kernel_size_t changelog_rec_size(const struct changelog_rec *rec) -{ - enum changelog_rec_extra_flags cref = CLFE_INVALID; - - if (rec->cr_flags & CLF_EXTRA_FLAGS) - cref = changelog_rec_extra_flags(rec)->cr_extra_flags; - - return changelog_rec_offset(rec->cr_flags, cref); -} - -static inline __kernel_size_t changelog_rec_varsize(const struct changelog_rec *rec) -{ - return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; -} - -static inline -struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; - - return (struct changelog_ext_rename *)((char *)rec + - changelog_rec_offset(crf, - CLFE_INVALID)); -} - -/* The jobid follows the rename extension, if present */ -static inline -struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME); - - return (struct changelog_ext_jobid *)((char *)rec + - changelog_rec_offset(crf, - CLFE_INVALID)); -} - -/* The additional flags follow the rename and jobid extensions, if present */ -static inline -struct changelog_ext_extra_flags *changelog_rec_extra_flags( - const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME | CLF_JOBID); - - return (struct changelog_ext_extra_flags *)((char *)rec + - changelog_rec_offset(crf, - CLFE_INVALID)); -} - -/* The uid/gid is the first extra extension */ -static inline -struct changelog_ext_uidgid *changelog_rec_uidgid( - const struct 
changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); - - return (struct changelog_ext_uidgid *)((char *)rec + - changelog_rec_offset(crf, - CLFE_INVALID)); -} - -/* The nid is the second extra extension */ -static inline -struct changelog_ext_nid *changelog_rec_nid(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); - enum changelog_rec_extra_flags cref = CLFE_INVALID; - - if (rec->cr_flags & CLF_EXTRA_FLAGS) - cref = changelog_rec_extra_flags(rec)->cr_extra_flags & - CLFE_UIDGID; - - return (struct changelog_ext_nid *)((char *)rec + - changelog_rec_offset(crf, cref)); -} - -/* The OPEN mode is the third extra extension */ -static inline -struct changelog_ext_openmode *changelog_rec_openmode( - const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); - enum changelog_rec_extra_flags cref = CLFE_INVALID; - - if (rec->cr_flags & CLF_EXTRA_FLAGS) - cref = changelog_rec_extra_flags(rec)->cr_extra_flags & - (CLFE_UIDGID | CLFE_NID); - - return (struct changelog_ext_openmode *)((char *)rec + - changelog_rec_offset(crf, cref)); -} - -/* The xattr name is the fourth extra extension */ -static inline -struct changelog_ext_xattr *changelog_rec_xattr( - const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); - enum changelog_rec_extra_flags cref = CLFE_INVALID; - - if (rec->cr_flags & CLF_EXTRA_FLAGS) - cref = changelog_rec_extra_flags(rec)->cr_extra_flags & - (CLFE_UIDGID | CLFE_NID | CLFE_OPEN); - - return (struct changelog_ext_xattr *)((char *)rec + - changelog_rec_offset(crf, cref)); -} - -/* The name follows the rename, jobid and extra flags extns, if present */ -static inline char *changelog_rec_name(const struct changelog_rec *rec) -{ - enum changelog_rec_extra_flags cref = CLFE_INVALID; - - if (rec->cr_flags & CLF_EXTRA_FLAGS) - cref = changelog_rec_extra_flags(rec)->cr_extra_flags; - - return (char *)rec + changelog_rec_offset(rec->cr_flags & CLF_SUPPORTED, - cref & CLFE_SUPPORTED); -} - -static inline __kernel_size_t changelog_rec_snamelen(const struct changelog_rec *rec) -{ - return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; -} - -static inline char *changelog_rec_sname(const struct changelog_rec *rec) -{ - char *cr_name = changelog_rec_name(rec); - - return cr_name + strlen(cr_name) + 1; -} - -/** - * Remap a record to the desired format as specified by the crf flags. - * The record must be big enough to contain the final remapped version. - * Superfluous extension fields are removed and missing ones are added - * and zeroed. The flags of the record are updated accordingly. - * - * The jobid and rename extensions can be added to a record, to match the - * format an application expects, typically. In this case, the newly added - * fields will be zeroed. - * The Jobid field can be removed, to guarantee compatibility with older - * clients that don't expect this field in the records they process. - * - * The following assumptions are being made: - * - CLF_RENAME will not be removed - * - CLF_JOBID will not be added without CLF_RENAME being added too - * - CLF_EXTRA_FLAGS will not be added without CLF_JOBID being added too - * - * @param[in,out] rec The record to remap. - * @param[in] crf_wanted Flags describing the desired extensions. 
- * @param[in] cref_want Flags describing the desired extra extensions. - */ -static inline void changelog_remap_rec(struct changelog_rec *rec, - enum changelog_rec_flags crf_wanted, - enum changelog_rec_extra_flags cref_want) -{ - char *xattr_mov = NULL; - char *omd_mov = NULL; - char *nid_mov = NULL; - char *uidgid_mov = NULL; - char *ef_mov; - char *jid_mov; - char *rnm_mov; - enum changelog_rec_extra_flags cref = CLFE_INVALID; - - crf_wanted &= CLF_SUPPORTED; - cref_want &= CLFE_SUPPORTED; - - if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) { - if (!(rec->cr_flags & CLF_EXTRA_FLAGS) || - (rec->cr_flags & CLF_EXTRA_FLAGS && - (changelog_rec_extra_flags(rec)->cr_extra_flags & - CLFE_SUPPORTED) == - cref_want)) - return; - } - - /* First move the variable-length name field */ - memmove((char *)rec + changelog_rec_offset(crf_wanted, cref_want), - changelog_rec_name(rec), rec->cr_namelen); - - /* Locations of extensions in the remapped record */ - if (rec->cr_flags & CLF_EXTRA_FLAGS) { - xattr_mov = (char *)rec + - changelog_rec_offset(crf_wanted & CLF_SUPPORTED, - cref_want & ~CLFE_XATTR); - omd_mov = (char *)rec + - changelog_rec_offset(crf_wanted & CLF_SUPPORTED, - cref_want & ~(CLFE_OPEN | - CLFE_XATTR)); - nid_mov = (char *)rec + - changelog_rec_offset(crf_wanted & CLF_SUPPORTED, - cref_want & ~(CLFE_NID | - CLFE_OPEN | - CLFE_XATTR)); - uidgid_mov = (char *)rec + - changelog_rec_offset(crf_wanted & CLF_SUPPORTED, - cref_want & ~(CLFE_UIDGID | - CLFE_NID | - CLFE_OPEN | - CLFE_XATTR)); - cref = changelog_rec_extra_flags(rec)->cr_extra_flags; - } - - ef_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~CLF_EXTRA_FLAGS, - CLFE_INVALID); - jid_mov = (char *)rec + - changelog_rec_offset(crf_wanted & - ~(CLF_EXTRA_FLAGS | CLF_JOBID), - CLFE_INVALID); - rnm_mov = (char *)rec + - changelog_rec_offset(crf_wanted & - ~(CLF_EXTRA_FLAGS | - CLF_JOBID | - CLF_RENAME), - CLFE_INVALID); - - /* Move the extension fields to the desired positions */ - if ((crf_wanted & CLF_EXTRA_FLAGS) && - (rec->cr_flags & CLF_EXTRA_FLAGS)) { - if ((cref_want & CLFE_XATTR) && (cref & CLFE_XATTR)) - memmove(xattr_mov, changelog_rec_xattr(rec), - sizeof(struct changelog_ext_xattr)); - - if ((cref_want & CLFE_OPEN) && (cref & CLFE_OPEN)) - memmove(omd_mov, changelog_rec_openmode(rec), - sizeof(struct changelog_ext_openmode)); - - if ((cref_want & CLFE_NID) && (cref & CLFE_NID)) - memmove(nid_mov, changelog_rec_nid(rec), - sizeof(struct changelog_ext_nid)); - - if ((cref_want & CLFE_UIDGID) && (cref & CLFE_UIDGID)) - memmove(uidgid_mov, changelog_rec_uidgid(rec), - sizeof(struct changelog_ext_uidgid)); - - memmove(ef_mov, changelog_rec_extra_flags(rec), - sizeof(struct changelog_ext_extra_flags)); - } - - if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) - memmove(jid_mov, changelog_rec_jobid(rec), - sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) - memmove(rnm_mov, changelog_rec_rename(rec), - sizeof(struct changelog_ext_rename)); - - /* Clear newly added fields */ - if (xattr_mov && (cref_want & CLFE_XATTR) && - !(cref & CLFE_XATTR)) - memset(xattr_mov, 0, sizeof(struct changelog_ext_xattr)); - - if (omd_mov && (cref_want & CLFE_OPEN) && - !(cref & CLFE_OPEN)) - memset(omd_mov, 0, sizeof(struct changelog_ext_openmode)); - - if (nid_mov && (cref_want & CLFE_NID) && - !(cref & CLFE_NID)) - memset(nid_mov, 0, sizeof(struct changelog_ext_nid)); - - if (uidgid_mov && (cref_want & CLFE_UIDGID) && - !(cref & CLFE_UIDGID)) - memset(uidgid_mov, 0, 
sizeof(struct changelog_ext_uidgid)); - - if ((crf_wanted & CLF_EXTRA_FLAGS) && - !(rec->cr_flags & CLF_EXTRA_FLAGS)) - memset(ef_mov, 0, sizeof(struct changelog_ext_extra_flags)); - - if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) - memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) - memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); - - /* Update the record's flags accordingly */ - rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; - if (rec->cr_flags & CLF_EXTRA_FLAGS) - changelog_rec_extra_flags(rec)->cr_extra_flags = - changelog_rec_extra_flags(rec)->cr_extra_flags | - cref_want; -} - -enum changelog_message_type { - CL_RECORD = 10, /* message is a changelog_rec */ - CL_EOF = 11, /* at end of current changelog */ -}; - -/********* Misc **********/ - -struct ioc_data_version { - __u64 idv_version; - __u32 idv_layout_version; /* FLR: layout version for OST objects */ - __u32 idv_flags; /* enum ioc_data_version_flags */ -}; - -enum ioc_data_version_flags { - LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */ - LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */ -}; - -#ifndef offsetof -#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define dot_lustre_name ".lustre" - - -/********* HSM **********/ - -/** HSM per-file state - * See HSM_FLAGS below. - */ -enum hsm_states { - HS_NONE = 0x00000000, - HS_EXISTS = 0x00000001, - HS_DIRTY = 0x00000002, - HS_RELEASED = 0x00000004, - HS_ARCHIVED = 0x00000008, - HS_NORELEASE = 0x00000010, - HS_NOARCHIVE = 0x00000020, - HS_LOST = 0x00000040, -}; - -/* HSM user-setable flags. */ -#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) - -/* Other HSM flags. */ -#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) - -/* - * All HSM-related possible flags that could be applied to a file. - * This should be kept in sync with hsm_states. - */ -#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) - -/** - * HSM request progress state - */ -enum hsm_progress_states { - HPS_NONE = 0, - HPS_WAITING = 1, - HPS_RUNNING = 2, - HPS_DONE = 3, -}; - -static inline const char *hsm_progress_state2name(enum hsm_progress_states s) -{ - switch (s) { - case HPS_WAITING: return "waiting"; - case HPS_RUNNING: return "running"; - case HPS_DONE: return "done"; - default: return "unknown"; - } -} - -struct hsm_extent { - __u64 offset; - __u64 length; -} __attribute__((packed)); - -/** - * Current HSM states of a Lustre file. - * - * This structure purpose is to be sent to user-space mainly. It describes the - * current HSM flags and in-progress action. - */ -struct hsm_user_state { - /** Current HSM states, from enum hsm_states. */ - __u32 hus_states; - __u32 hus_archive_id; - /** The current undergoing action, if there is one */ - __u32 hus_in_progress_state; - __u32 hus_in_progress_action; - struct hsm_extent hus_in_progress_location; - char hus_extended_info[]; -}; - -struct hsm_state_set_ioc { - struct lu_fid hssi_fid; - __u64 hssi_setmask; - __u64 hssi_clearmask; -}; - -/* - * This structure describes the current in-progress action for a file. 
- * it is retuned to user space and send over the wire - */ -struct hsm_current_action { - /** The current undergoing action, if there is one */ - /* state is one of hsm_progress_states */ - __u32 hca_state; - /* action is one of hsm_user_action */ - __u32 hca_action; - struct hsm_extent hca_location; -}; - -/***** HSM user requests ******/ -/* User-generated (lfs/ioctl) request types */ -enum hsm_user_action { - HUA_NONE = 1, /* no action (noop) */ - HUA_ARCHIVE = 10, /* copy to hsm */ - HUA_RESTORE = 11, /* prestage */ - HUA_RELEASE = 12, /* drop ost objects */ - HUA_REMOVE = 13, /* remove from archive */ - HUA_CANCEL = 14 /* cancel a request */ -}; - -static inline const char *hsm_user_action2name(enum hsm_user_action a) -{ - switch (a) { - case HUA_NONE: return "NOOP"; - case HUA_ARCHIVE: return "ARCHIVE"; - case HUA_RESTORE: return "RESTORE"; - case HUA_RELEASE: return "RELEASE"; - case HUA_REMOVE: return "REMOVE"; - case HUA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* - * List of hr_flags (bit field) - */ -#define HSM_FORCE_ACTION 0x0001 -/* used by CT, cannot be set by user */ -#define HSM_GHOST_COPY 0x0002 - -/** - * Contains all the fixed part of struct hsm_user_request. - * - */ -struct hsm_request { - __u32 hr_action; /* enum hsm_user_action */ - __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ - __u64 hr_flags; /* request flags */ - __u32 hr_itemcount; /* item count in hur_user_item vector */ - __u32 hr_data_len; -}; - -struct hsm_user_item { - struct lu_fid hui_fid; - struct hsm_extent hui_extent; -} __attribute__((packed)); - -struct hsm_user_request { - struct hsm_request hur_request; - struct hsm_user_item hur_user_item[0]; - /* extra data blob at end of struct (after all - * hur_user_items), only use helpers to access it - */ -} __attribute__((packed)); - -/** Return pointer to data field in a hsm user request */ -static inline void *hur_data(struct hsm_user_request *hur) -{ - return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); -} - -/** - * Compute the current length of the provided hsm_user_request. This returns -1 - * instead of an errno because __kernel_ssize_t is defined to be only - * [ -1, SSIZE_MAX ] - * - * return -1 on bounds check error. 
- */ -static inline __kernel_size_t hur_len(struct hsm_user_request *hur) -{ - __u64 size; - - /* can't overflow a __u64 since hr_itemcount is only __u32 */ - size = offsetof(struct hsm_user_request, hur_user_item[0]) + - (__u64)hur->hur_request.hr_itemcount * - sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; - - if ((__kernel_ssize_t)size < 0) - return -1; - - return size; -} - -/****** HSM RPCs to copytool *****/ -/* Message types the copytool may receive */ -enum hsm_message_type { - HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ -}; - -/* Actions the copytool may be instructed to take for a given action_item */ -enum hsm_copytool_action { - HSMA_NONE = 10, /* no action */ - HSMA_ARCHIVE = 20, /* arbitrary offset */ - HSMA_RESTORE = 21, - HSMA_REMOVE = 22, - HSMA_CANCEL = 23 -}; - -static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) -{ - switch (a) { - case HSMA_NONE: return "NOOP"; - case HSMA_ARCHIVE: return "ARCHIVE"; - case HSMA_RESTORE: return "RESTORE"; - case HSMA_REMOVE: return "REMOVE"; - case HSMA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* Copytool item action description */ -struct hsm_action_item { - __u32 hai_len; /* valid size of this struct */ - __u32 hai_action; /* hsm_copytool_action, but use known size */ - struct lu_fid hai_fid; /* Lustre FID to operate on */ - struct lu_fid hai_dfid; /* fid used for data access */ - struct hsm_extent hai_extent; /* byte range to operate on */ - __u64 hai_cookie; /* action cookie from coordinator */ - __u64 hai_gid; /* grouplock id */ - char hai_data[0]; /* variable length */ -} __attribute__((packed)); - -/** - * helper function which print in hexa the first bytes of - * hai opaque field - * - * \param hai [IN] record to print - * \param buffer [IN,OUT] buffer to write the hex string to - * \param len [IN] max buffer length - * - * \retval buffer - */ -static inline char *hai_dump_data_field(const struct hsm_action_item *hai, - char *buffer, __kernel_size_t len) -{ - int i; - int data_len; - char *ptr; - - ptr = buffer; - data_len = hai->hai_len - sizeof(*hai); - for (i = 0; (i < data_len) && (len > 2); i++) { - snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); - ptr += 2; - len -= 2; - } - - *ptr = '\0'; - - return buffer; -} - -/* Copytool action list */ -#define HAL_VERSION 1 -#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ -struct hsm_action_list { - __u32 hal_version; - __u32 hal_count; /* number of hai's to follow */ - __u64 hal_compound_id; /* returned by coordinator, ignored */ - __u64 hal_flags; - __u32 hal_archive_id; /* which archive backend */ - __u32 padding1; - char hal_fsname[0]; /* null-terminated */ - /* struct hsm_action_item[hal_count] follows, aligned on 8-byte - boundaries. 
See hai_zero */ -} __attribute__((packed)); - -/* Return pointer to first hai in action list */ -static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) -{ - __kernel_size_t offset = __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); - - return (struct hsm_action_item *)(hal->hal_fsname + offset); -} - -/* Return pointer to next hai */ -static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) -{ - __kernel_size_t offset = __ALIGN_KERNEL(hai->hai_len, 8); - - return (struct hsm_action_item *)((char *)hai + offset); -} - -/* Return size of an hsm_action_list */ -static inline __kernel_size_t hal_size(struct hsm_action_list *hal) -{ - __u32 i; - __kernel_size_t sz; - struct hsm_action_item *hai; - - sz = sizeof(*hal) + __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); - hai = hai_first(hal); - for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) - sz += __ALIGN_KERNEL(hai->hai_len, 8); - - return sz; -} - -/* HSM file import - * describe the attributes to be set on imported file - */ -struct hsm_user_import { - __u64 hui_size; - __u64 hui_atime; - __u64 hui_mtime; - __u32 hui_atime_ns; - __u32 hui_mtime_ns; - __u32 hui_uid; - __u32 hui_gid; - __u32 hui_mode; - __u32 hui_archive_id; -}; - -/* Copytool progress reporting */ -#define HP_FLAG_COMPLETED 0x01 -#define HP_FLAG_RETRY 0x02 - -struct hsm_progress { - struct lu_fid hp_fid; - __u64 hp_cookie; - struct hsm_extent hp_extent; - __u16 hp_flags; - __u16 hp_errval; /* positive val */ - __u32 padding; -}; - -struct hsm_copy { - __u64 hc_data_version; - __u16 hc_flags; - __u16 hc_errval; /* positive val */ - __u32 padding; - struct hsm_action_item hc_hai; -}; - -/* JSON objects */ -enum llapi_json_types { - LLAPI_JSON_INTEGER = 1, - LLAPI_JSON_BIGNUM, - LLAPI_JSON_REAL, - LLAPI_JSON_STRING -}; - -struct llapi_json_item { - char *lji_key; - __u32 lji_type; - union { - int lji_integer; - __u64 lji_u64; - double lji_real; - char *lji_string; - }; - struct llapi_json_item *lji_next; -}; - -struct llapi_json_item_list { - int ljil_item_count; - struct llapi_json_item *ljil_items; -}; - -enum lu_ladvise_type { - LU_LADVISE_INVALID = 0, - LU_LADVISE_WILLREAD = 1, - LU_LADVISE_DONTNEED = 2, - LU_LADVISE_LOCKNOEXPAND = 3, - LU_LADVISE_LOCKAHEAD = 4, - LU_LADVISE_MAX -}; - -#define LU_LADVISE_NAMES { \ - [LU_LADVISE_WILLREAD] = "willread", \ - [LU_LADVISE_DONTNEED] = "dontneed", \ - [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \ - [LU_LADVISE_LOCKAHEAD] = "lockahead", \ -} - -/* This is the userspace argument for ladvise. It is currently the same as - * what goes on the wire (struct lu_ladvise), but is defined separately as we - * may need info which is only used locally. 
*/ -struct llapi_lu_ladvise { - __u16 lla_advice; /* advice type */ - __u16 lla_value1; /* values for different advice types */ - __u32 lla_value2; - __u64 lla_start; /* first byte of extent for advice */ - __u64 lla_end; /* last byte of extent for advice */ - __u32 lla_value3; - __u32 lla_value4; -}; - -enum ladvise_flag { - LF_ASYNC = 0x00000001, - LF_UNSET = 0x00000002, -}; - -#define LADVISE_MAGIC 0x1ADF1CE0 -/* Masks of valid flags for each advice */ -#define LF_LOCKNOEXPAND_MASK LF_UNSET -/* Flags valid for all advices not explicitly specified */ -#define LF_DEFAULT_MASK LF_ASYNC -/* All flags */ -#define LF_MASK (LF_ASYNC | LF_UNSET) - -#define lla_lockahead_mode lla_value1 -#define lla_peradvice_flags lla_value2 -#define lla_lockahead_result lla_value3 - -/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which - * is used on the wire. It is defined separately as we may need info which is - * only used locally. */ -struct llapi_ladvise_hdr { - __u32 lah_magic; /* LADVISE_MAGIC */ - __u32 lah_count; /* number of advices */ - __u64 lah_flags; /* from enum ladvise_flag */ - __u32 lah_value1; /* unused */ - __u32 lah_value2; /* unused */ - __u64 lah_value3; /* unused */ - struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ -}; - -#define LAH_COUNT_MAX (1024) - -/* Shared key */ -enum sk_crypt_alg { - SK_CRYPT_INVALID = -1, - SK_CRYPT_EMPTY = 0, - SK_CRYPT_AES256_CTR = 1, -}; - -enum sk_hmac_alg { - SK_HMAC_INVALID = -1, - SK_HMAC_EMPTY = 0, - SK_HMAC_SHA256 = 1, - SK_HMAC_SHA512 = 2, -}; - -struct sk_crypt_type { - const char *sct_name; - int sct_type; -}; - -struct sk_hmac_type { - const char *sht_name; - int sht_type; -}; - -enum lock_mode_user { - MODE_READ_USER = 1, - MODE_WRITE_USER, - MODE_MAX_USER, -}; - -#define LOCK_MODE_NAMES { \ - [MODE_READ_USER] = "READ",\ - [MODE_WRITE_USER] = "WRITE"\ -} - -enum lockahead_results { - LLA_RESULT_SENT = 0, - LLA_RESULT_DIFFERENT, - LLA_RESULT_SAME, -}; - -struct fid_array { - __u32 fa_nr; - /* make header's size equal lu_fid */ - __u32 fa_padding0; - __u64 fa_padding1; - struct lu_fid fa_fids[0]; -}; -#define OBD_MAX_FIDS_IN_ARRAY 4096 - -#if defined(__cplusplus) -} -#endif - -/** @} lustreuser */ - -#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h similarity index 77% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h index 30d5c7d614892..b1f68d50b0242 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h @@ -33,10 +33,14 @@ #ifndef _UAPI_LUSTRE_CFG_H #define _UAPI_LUSTRE_CFG_H -#include #include -#include -#include +#include + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#endif /** \defgroup cfg cfg * @@ -135,8 +139,6 @@ enum lcfg_command_type { * users */ LCFG_NODEMAP_MAP_MODE = 0x00ce059, /**< set the mapping mode */ - LCFG_NODEMAP_AUDIT_MODE = 0x00ce05a, /**< set the audit mode */ - LCFG_NODEMAP_SET_SEPOL = 0x00ce05b, /**< set SELinux policy */ }; struct lustre_cfg_bufs { @@ -158,57 +160,6 @@ struct lustre_cfg { __u32 lcfg_buflens[0]; }; -struct lcfg_type_data { - __u32 ltd_type; - char *ltd_name; - char 
*ltd_bufs[4]; -}; - -static struct lcfg_type_data lcfg_data_table[] = { - { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, - { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, - { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, - { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, - { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, - { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, - { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, - { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } }, - { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, - { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, - { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, - { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, - { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, - { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, - { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, - { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, - { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, - { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, - { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, - { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, - { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, - { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, - { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, - { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, - { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, - { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, - { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", - { "parameter", "2", "3", "4" } }, - { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, - { 0, NULL, { NULL, NULL, NULL, NULL } } -}; - -static inline struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) -{ - int i = 0; - - while (lcfg_data_table[i].ltd_type != 0) { - if (lcfg_data_table[i].ltd_type == cmd) - return &lcfg_data_table[i]; - i++; - } - return NULL; -} - enum cfg_record_type { PORTALS_CFG_TYPE = 1, LUSTRE_CFG_TYPE = 123, @@ -250,7 +201,7 @@ static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) { __u32 i; - __kernel_size_t offset; + size_t offset; __u32 bufcount; if (!lcfg) @@ -310,7 +261,7 @@ static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, } } -static inline int lustre_cfg_sanity_check(void *buf, __kernel_size_t len) +static inline int lustre_cfg_sanity_check(void *buf, size_t len) { struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h similarity index 85% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h index e9cbf3066738a..8887c82d3b8b9 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h @@ -29,6 +29,8 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * + * uapi/linux/lustre_disk.h + * * Lustre disk format definitions. 
* * Author: Nathan Rutman @@ -60,16 +62,11 @@ #define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" #define QMT_DIR "quota_master" #define QSD_DIR "quota_slave" -#define QSD_DIR_DT "quota_slave_dt" -#define QSD_DIR_MD "quota_slave_md" #define HSM_ACTIONS "hsm_actions" #define LFSCK_DIR "LFSCK" #define LFSCK_BOOKMARK "lfsck_bookmark" #define LFSCK_LAYOUT "lfsck_layout" #define LFSCK_NAMESPACE "lfsck_namespace" -#define REMOTE_PARENT_DIR "REMOTE_PARENT_DIR" -#define INDEX_BACKUP_DIR "index_backup" -#define MDT_ORPHAN_DIR "PENDING" /****************** persistent mount data *********************/ @@ -91,7 +88,7 @@ /** regenerate config logs for this fs or server */ #define LDD_F_WRITECONF 0x0100 /** COMPAT_14 */ -/*#define LDD_F_UPGRADE14 0x0200 deprecated since 1.8 */ +#define LDD_F_UPGRADE14 0x0200 /** process as lctl conf_param */ #define LDD_F_PARAM 0x0400 /** all nodes are specified as service nodes */ @@ -117,9 +114,36 @@ enum ldd_mount_type { LDD_MT_LAST }; +/* On-disk configuration file. In host-endian order. */ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat; /* read-only compatible feature flags */ + __u32 ldd_feature_incompat; /* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + * svname + */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + * MTI_NAME_MAXLEN + */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + + char ldd_userdata[1024 - 200]; /* arbitrary user string '200' */ + __u8 ldd_padding[4096 - 1024]; /* 1024 */ + char ldd_mount_opts[4096]; /* target fs mount opts '4096' */ + char ldd_params[4096]; /* key=value pairs '8192' */ +}; + /****************** last_rcvd file *********************/ #define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ + #define LR_SERVER_SIZE 512 #define LR_CLIENT_START 8192 #define LR_CLIENT_SIZE 128 diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h similarity index 97% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h index f11ad3b3b2115..3e58dd5329c3f 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. * * Copyright 2016 Cray Inc, all rights reserved. * Author: Ben Evans. @@ -37,8 +37,7 @@ #ifndef _UAPI_LUSTRE_FID_H_ #define _UAPI_LUSTRE_FID_H_ -#include -#include +#include /** returns fid object sequence */ static inline __u64 fid_seq(const struct lu_fid *fid) @@ -278,7 +277,7 @@ static inline bool fid_is_last_id(const struct lu_fid *fid) * \param fid an igif to get inode number from. * \return inode number for the igif. 
*/ -static inline __kernel_ino_t lu_igif_ino(const struct lu_fid *fid) +static inline ino_t lu_igif_ino(const struct lu_fid *fid) { return fid_seq(fid); } diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h similarity index 93% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h index d0dc08bda5433..9fddf2b1b9bd3 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, 2016, Intel Corporation. */ #ifndef _UAPI_LUSTRE_IOCTL_H #define _UAPI_LUSTRE_IOCTL_H @@ -31,13 +31,20 @@ #include #include #include -#include +#include -/* - * sparse kernel source annotations - */ -#ifndef __user -#define __user +#ifndef __KERNEL__ +# define __user +#endif + +#if !defined(__KERNEL__) && !defined(LUSTRE_UTILS) +# error This file is for Lustre internal use only. +#endif + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) #endif enum md_echo_cmd { @@ -57,6 +64,7 @@ enum md_echo_cmd { #define OBD_IOCTL_VERSION 0x00010004 #define OBD_DEV_BY_DEVNAME 0xffffd0de +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER struct obd_ioctl_data { __u32 ioc_len; @@ -220,14 +228,13 @@ static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) #define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) #define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) -/* lustre/lustre_user.h 211-220 */ -/* was #define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) until 2.11 */ +/* lustre/lustre_user.h 212-217 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) #define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) #define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) #define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) #define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) #define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) -#define OBD_IOC_CHLG_POLL _IOR('f', 233, long) /* lustre/lustre_user.h 240-249 */ /* LIBCFS_IOC_DEBUG_MASK 250 */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h similarity index 95% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h index 90fa213f83e90..c0e662ae7b84f 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. * * Copyright 2015 Cray Inc, all rights reserved. * Author: Ben Evans. 
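
For reference, the __ALIGN_KERNEL fallback added in the lustre_cfg.h and lustre_ioctl.h hunks above just rounds a length up to the next multiple of a power-of-two alignment; it is the same rounding that hai_first(), hal_size() and CR_MAXSIZE rely on elsewhere in these headers. A minimal stand-alone sketch of that behaviour (illustrative only, not part of the patch; it assumes a GCC-style typeof extension):

#include <stdio.h>

/* Same fallback definition the patch introduces for older distros. */
#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)

int main(void)
{
	/* Round lengths up to an 8-byte boundary, as done when laying out
	 * hsm_action_item entries after the hal_fsname string. */
	printf("%zu\n", (size_t)__ALIGN_KERNEL((size_t)13, 8));	/* prints 16 */
	printf("%zu\n", (size_t)__ALIGN_KERNEL((size_t)16, 8));	/* prints 16 */
	return 0;
}
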
@@ -34,9 +34,15 @@ #ifndef _UAPI_LUSTRE_OSTID_H_ #define _UAPI_LUSTRE_OSTID_H_ -#include -#include -#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. Upstream will just have linux/lustre_fid.h + */ +#ifdef __KERNEL__ +#include +#else +#include +#endif static inline __u64 lmm_oi_id(const struct ost_id *oi) { diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h similarity index 100% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h similarity index 88% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h rename to drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h index 26819ff7995cf..e8119f5278c23 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h @@ -73,26 +73,17 @@ enum kuc_generic_message_type { #define KUC_GRP_HSM 0x02 #define KUC_GRP_MAX KUC_GRP_HSM -enum lk_flags { - LK_FLG_STOP = 0x0001, - LK_FLG_DATANR = 0x0002, -}; +#define LK_FLG_STOP 0x01 #define LK_NOFD -1U -/* kernelcomm control structure, passed from userspace to kernel. - * For compatibility with old copytools, users who pass ARCHIVE_IDs - * to kernel using lk_data_count and lk_data should fill lk_flags with - * LK_FLG_DATANR. Otherwise kernel will take lk_data_count as bitmap of - * ARCHIVE IDs. - */ +/* kernelcomm control structure, passed from userspace to kernel */ struct lustre_kernelcomm { __u32 lk_wfd; __u32 lk_rfd; __u32 lk_uid; __u32 lk_group; - __u32 lk_data_count; + __u32 lk_data; __u32 lk_flags; - __u32 lk_data[0]; } __attribute__((packed)); #endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h index 1f02294b9660d..accc4495d156e 100644 --- a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -34,7 +34,7 @@ #define _UPCALL_CACHE_H #include -#include +#include /** \defgroup ucache ucache * @@ -85,8 +85,8 @@ struct upcall_cache_entry { atomic_t ue_refcount; int ue_flags; wait_queue_head_t ue_waitq; - time64_t ue_acquire_expire; - time64_t ue_expire; + cfs_time_t ue_acquire_expire; + cfs_time_t ue_expire; union { struct md_identity identity; } u; @@ -121,8 +121,8 @@ struct upcall_cache { char uc_name[40]; /* for upcall */ char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; - time64_t uc_acquire_expire; /* seconds */ - time64_t uc_entry_expire; /* seconds */ + int uc_acquire_expire; /* seconds */ + int uc_entry_expire; /* seconds */ struct upcall_cache_ops *uc_ops; }; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c index b39b105a894e6..7dd0c65332649 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c @@ -36,8 +36,11 @@ * Author: Huang Wei * Author: Jay Xiong */ - -#include +#ifdef __KERNEL__ +# include +#else +# include +#endif #include enum { diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c 
b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c index 59d1302a36516..1088d583145e7 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -269,49 +269,38 @@ ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, static void ldlm_extent_policy(struct ldlm_resource *res, struct ldlm_lock *lock, __u64 *flags) { - struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; - - if (lock->l_export == NULL) - /* - * this is a local lock taken by server (e.g., as a part of - * OST-side locking, or unlink handling). Expansion doesn't - * make a lot of sense for local locks, because they are - * dropped immediately on operation completion and would only - * conflict with other threads. - */ - return; + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. + */ + return; - if (lock->l_policy_data.l_extent.start == 0 && - lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) - /* fast-path whole file locks */ - return; + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; - /* Because reprocess_queue zeroes flags and uses it to return - * LDLM_FL_LOCK_CHANGED, we must check for the NO_EXPANSION flag - * in the lock flags rather than the 'flags' argument */ - if (likely(!(lock->l_flags & LDLM_FL_NO_EXPANSION))) { - ldlm_extent_internal_policy_granted(lock, &new_ex); - ldlm_extent_internal_policy_waiting(lock, &new_ex); - } else { - LDLM_DEBUG(lock, "Not expanding manually requested lock.\n"); - new_ex.start = lock->l_policy_data.l_extent.start; - new_ex.end = lock->l_policy_data.l_extent.end; - /* In case the request is not on correct boundaries, we call - * fixup. 
(normally called in ldlm_extent_internal_policy_*) */ - ldlm_extent_internal_policy_fixup(lock, &new_ex, 0); - } + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); - if (!ldlm_extent_equal(&new_ex, &lock->l_policy_data.l_extent)) { - *flags |= LDLM_FL_LOCK_CHANGED; - lock->l_policy_data.l_extent.start = new_ex.start; - lock->l_policy_data.l_extent.end = new_ex.end; - } + if (new_ex.start != lock->l_policy_data.l_extent.start || + new_ex.end != lock->l_policy_data.l_extent.end) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } } static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) { struct ldlm_resource *res = lock->l_resource; - time64_t now = ktime_get_seconds(); + cfs_time_t now = cfs_time_current(); if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) return 1; @@ -319,9 +308,8 @@ static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) res->lr_contention_time = now; - - return now < res->lr_contention_time + - ldlm_res_to_ns(res)->ns_contention_time; + return cfs_time_before(now, cfs_time_add(res->lr_contention_time, + cfs_time_seconds(ldlm_res_to_ns(res)->ns_contention_time))); } struct ldlm_extent_compat_args { @@ -433,8 +421,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } if (tree->lit_mode == LCK_GROUP) { - if (*flags & (LDLM_FL_BLOCK_NOWAIT | - LDLM_FL_SPECULATIVE)) { + if (*flags & LDLM_FL_BLOCK_NOWAIT) { compat = -EWOULDBLOCK; goto destroylock; } @@ -451,24 +438,10 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, continue; } - /* We've found a potentially blocking lock, check - * compatibility. This handles locks other than GROUP - * locks, which are handled separately above. - * - * Locks with FL_SPECULATIVE are asynchronous requests - * which must never wait behind another lock, so they - * fail if any conflicting lock is found. */ - if (!work_list || (*flags & LDLM_FL_SPECULATIVE)) { - rc = interval_is_overlapped(tree->lit_root, - &ex); - if (rc) { - if (!work_list) { - RETURN(0); - } else { - compat = -EWOULDBLOCK; - goto destroylock; - } - } + if (!work_list) { + rc = interval_is_overlapped(tree->lit_root,&ex); + if (rc) + RETURN(0); } else { interval_search(tree->lit_root, &ex, ldlm_extent_compat_cb, &data); @@ -555,8 +528,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, lock->l_policy_data.l_extent.gid) { /* If existing lock with matched gid is granted, we grant new one too. */ - if (ldlm_is_granted(lock)) - RETURN(2); + if (lock->l_req_mode == lock->l_granted_mode) + RETURN(2); /* Otherwise we are scanning queue of waiting * locks and it means current request would @@ -564,8 +537,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, * already blocked. 
* If we are in nonblocking mode - return * immediately */ - if (*flags & (LDLM_FL_BLOCK_NOWAIT - | LDLM_FL_SPECULATIVE)) { + if (*flags & LDLM_FL_BLOCK_NOWAIT) { compat = -EWOULDBLOCK; goto destroylock; } @@ -584,8 +556,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } } - if (unlikely(req_mode == LCK_GROUP && - !ldlm_is_granted(lock))) { + if (unlikely(req_mode == LCK_GROUP && + (lock->l_req_mode != lock->l_granted_mode))) { scan = 1; compat = 0; if (lock->l_req_mode != LCK_GROUP) { @@ -608,11 +580,10 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } if (unlikely(lock->l_req_mode == LCK_GROUP)) { - /* If compared lock is GROUP, then requested is - * PR/PW so this is not compatible; extent - * range does not matter */ - if (*flags & (LDLM_FL_BLOCK_NOWAIT - | LDLM_FL_SPECULATIVE)) { + /* If compared lock is GROUP, then requested is PR/PW/ + * so this is not compatible; extent range does not + * matter */ + if (*flags & LDLM_FL_BLOCK_NOWAIT) { compat = -EWOULDBLOCK; goto destroylock; } else { @@ -631,11 +602,6 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, if (!work_list) RETURN(0); - if (*flags & LDLM_FL_SPECULATIVE) { - compat = -EWOULDBLOCK; - goto destroylock; - } - /* don't count conflicting glimpse locks */ if (lock->l_req_mode == LCK_PR && lock->l_policy_data.l_extent.start == 0 && @@ -676,7 +642,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, void ldlm_lock_prolong_one(struct ldlm_lock *lock, struct ldlm_prolong_args *arg) { - time64_t timeout; + int timeout; OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); @@ -696,7 +662,7 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock, */ timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); - LDLM_DEBUG(lock, "refreshed to %llds.\n", timeout); + LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout); arg->lpa_blocks_cnt++; @@ -786,24 +752,25 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; + struct list_head rpc_list; int rc, rc2; int contended_locks = 0; - struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? - NULL : work_list; ENTRY; - LASSERT(!ldlm_is_granted(lock)); + LASSERT(lock->l_granted_mode != lock->l_req_mode); + LASSERT(list_empty(&res->lr_converting)); LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || !ldlm_is_ast_discard_data(lock)); + INIT_LIST_HEAD(&rpc_list); check_res_locked(res); *err = ELDLM_OK; if (intention == LDLM_PROCESS_RESCAN) { - /* Careful observers will note that we don't handle -EWOULDBLOCK - * here, but it's ok for a non-obvious reason -- compat_queue - * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT | - * SPECULATIVE). flags should always be zero here, and if that - * ever stops being true, we want to find out. */ + /* Careful observers will note that we don't handle -EWOULDBLOCK + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT). + * flags should always be zero here, and if that ever stops + * being true, we want to find out. 
*/ LASSERT(*flags == 0); rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, NULL, &contended_locks); @@ -819,38 +786,49 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) ldlm_extent_policy(res, lock, flags); - ldlm_grant_lock(lock, grant_work); + ldlm_grant_lock(lock, work_list); RETURN(LDLM_ITER_CONTINUE); } + LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || + (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); + restart: contended_locks = 0; rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, - work_list, &contended_locks); + &rpc_list, &contended_locks); if (rc < 0) GOTO(out_rpc_list, rc); rc2 = 0; if (rc != 2) { rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, - flags, err, work_list, + flags, err, &rpc_list, &contended_locks); if (rc2 < 0) GOTO(out_rpc_list, rc = rc2); } - if (rc + rc2 == 2) { + if (rc + rc2 != 2) { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to force + * client to wait for the lock endlessly once the lock is + * enqueued -bzzz */ + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, + LDLM_FL_NO_TIMEOUT); + if (rc == -ERESTART) + GOTO(restart, rc); + *err = rc; + } else { ldlm_extent_policy(res, lock, flags); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, grant_work); - } else { - /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to - * force client to wait for the lock endlessly once - * the lock is enqueued -bzzz */ - *flags |= LDLM_FL_NO_TIMEOUT; + ldlm_grant_lock(lock, work_list); + rc = 0; } - rc = LDLM_ITER_CONTINUE; out_rpc_list: + if (!list_empty(&rpc_list)) { + LASSERT(!ldlm_is_ast_discard_data(lock)); + ldlm_discard_bl_list(&rpc_list); + } RETURN(rc); } #endif /* HAVE_SERVER_SUPPORT */ @@ -965,7 +943,7 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) EXPORT_SYMBOL(ldlm_extent_shift_kms); struct kmem_cache *ldlm_interval_slab; -static struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) { struct ldlm_interval *node; ENTRY; @@ -1026,14 +1004,6 @@ static inline int ldlm_mode_to_index(enum ldlm_mode mode) return index; } -int ldlm_extent_alloc_lock(struct ldlm_lock *lock) -{ - lock->l_tree_node = NULL; - if (ldlm_interval_alloc(lock) == NULL) - return -ENOMEM; - return 0; -} - /** Add newly granted lock into interval tree for the resource. */ void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock) @@ -1043,7 +1013,7 @@ void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_extent *extent; int idx, rc; - LASSERT(ldlm_is_granted(lock)); + LASSERT(lock->l_granted_mode == lock->l_req_mode); node = lock->l_tree_node; LASSERT(node != NULL); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c index be849938cc6c6..b3d669799ceba 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -27,7 +27,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -289,8 +289,6 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int overlaps = 0; int splitted = 0; const struct ldlm_callback_suite null_cbs = { NULL }; - struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? - NULL : work_list; ENTRY; CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " @@ -350,7 +348,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, reprocess_failed = 1; if (ldlm_flock_deadlock(req, lock)) { ldlm_flock_cancel_on_deadlock(req, - grant_work); + work_list); RETURN(LDLM_ITER_CONTINUE); } continue; @@ -581,7 +579,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, restart: ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, - LDLM_PROCESS_RESCAN, NULL); + LDLM_PROCESS_RESCAN); unlock_res_and_lock(req); rc = ldlm_run_ast_work(ns, &rpc_list, @@ -592,7 +590,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, } } else { LASSERT(req->l_completion_ast); - ldlm_add_ast_work_item(req, NULL, grant_work); + ldlm_add_ast_work_item(req, NULL, work_list); } #else /* !HAVE_SERVER_SUPPORT */ /* The only one possible case for client-side calls flock @@ -744,7 +742,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) RETURN(-EIO); } - /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ ldlm_resource_unlink_lock(lock); /* Import invalidation. We need to actually release the lock @@ -759,7 +757,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) LASSERT(ldlm_is_test_lock(lock)); if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) - mode = getlk->fl_type; + mode = flock_type(getlk); else mode = lock->l_granted_mode; @@ -782,26 +780,27 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) LDLM_DEBUG(lock, "client-side enqueue granted"); if (flags & LDLM_FL_TEST_LOCK) { - /* - * fcntl(F_GETLK) request - * The old mode was saved in getlk->fl_type so that if the mode - * in the lock changes we can decref the appropriate refcount. 
- */ + /* fcntl(F_GETLK) request */ + /* The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount.*/ LASSERT(ldlm_is_test_lock(lock)); - ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); + ldlm_flock_destroy(lock, flock_type(getlk), + LDLM_FL_WAIT_NOREPROC); switch (lock->l_granted_mode) { case LCK_PR: - getlk->fl_type = F_RDLCK; + flock_set_type(getlk, F_RDLCK); break; case LCK_PW: - getlk->fl_type = F_WRLCK; + flock_set_type(getlk, F_WRLCK); break; default: - getlk->fl_type = F_UNLCK; + flock_set_type(getlk, F_UNLCK); } - getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; - getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; - getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; + flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid); + flock_set_start(getlk, + (loff_t)lock->l_policy_data.l_flock.start); + flock_set_end(getlk, + (loff_t)lock->l_policy_data.l_flock.end); } else { __u64 noreproc = LDLM_FL_WAIT_NOREPROC; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c index c407cf676fba8..90e34a612d7c8 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,89 +57,6 @@ #include "ldlm_internal.h" #ifdef HAVE_SERVER_SUPPORT - -/** - * It should iterate through all waiting locks on a given resource queue and - * attempt to grant them. An optimization is to check only heads waitintg - * locks for each inodebit type. - * - * Must be called with resource lock held. 
- */ -int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, - struct list_head *queue, - struct list_head *work_list, - enum ldlm_process_intention intention, - struct ldlm_lock *hint) -{ - __u64 flags; - int rc = LDLM_ITER_CONTINUE; - enum ldlm_error err; - struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); - struct ldlm_ibits_queues *queues = res->lr_ibits_queues; - int i; - - ENTRY; - - check_res_locked(res); - - LASSERT(res->lr_type == LDLM_IBITS); - LASSERT(intention == LDLM_PROCESS_RESCAN || - intention == LDLM_PROCESS_RECOVERY); - - if (intention == LDLM_PROCESS_RECOVERY) - return ldlm_reprocess_queue(res, queue, work_list, intention, - NULL); - -restart: - CDEBUG(D_DLMTRACE, "--- Reprocess resource "DLDLMRES" (%p)\n", - PLDLMRES(res), res); - - for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { - struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); - struct list_head *head = &queues->liq_waiting[i]; - struct ldlm_lock *pending; - struct ldlm_ibits_node *node; - - if (list_empty(head)) - continue; - if (hint && !(hint->l_policy_data.l_inodebits.bits & (1 << i))) - continue; - - node = list_entry(head->next, struct ldlm_ibits_node, - lin_link[i]); - - pending = node->lock; - LDLM_DEBUG(pending, "Reprocessing lock from queue %d", i); - - flags = 0; - rc = ldlm_process_inodebits_lock(pending, &flags, intention, - &err, &rpc_list); - if (ldlm_is_granted(pending)) { - list_splice(&rpc_list, work_list); - /* Try to grant more locks from current queue */ - i--; - } else { - list_splice(&rpc_list, &bl_ast_list); - } - } - - if (!list_empty(&bl_ast_list)) { - unlock_res(res); - - rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, - LDLM_WORK_BL_AST); - - lock_res(res); - if (rc == -ERESTART) - GOTO(restart, rc); - } - - if (!list_empty(&bl_ast_list)) - ldlm_discard_bl_list(&bl_ast_list); - - RETURN(rc); -} - /** * Determine if the lock is compatible with all locks on the queue. * @@ -162,18 +79,12 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, struct list_head *tmp; struct ldlm_lock *lock; __u64 req_bits = req->l_policy_data.l_inodebits.bits; - __u64 *try_bits = &req->l_policy_data.l_inodebits.try_bits; int compat = 1; - ENTRY; - /* There is no sense in lock with no bits set. Also such a lock - * would be compatible with any other bit lock. - * Meanwhile that can be true if there were just try_bits and all - * are failed, so just exit gracefully and let the caller to care. - */ - if ((req_bits | *try_bits) == 0) - RETURN(0); + /* There is no sense in lock with no bits set, I think. + * Also, such a lock would be compatible with any other bit lock */ + LASSERT(req_bits != 0); list_for_each(tmp, queue) { struct list_head *mode_tail; @@ -188,10 +99,11 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* last lock in mode group */ LASSERT(lock->l_sl_mode.prev != NULL); - mode_tail = &list_entry(lock->l_sl_mode.prev, struct ldlm_lock, + mode_tail = &list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode)->l_res_link; - /* if request lock is not COS_INCOMPAT and COS is disabled, + /* if reqest lock is not COS_INCOMPAT and COS is disabled, * they are compatible, IOW this request is from a local * transaction on a DNE system. */ if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) && @@ -213,24 +125,8 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* Advance loop cursor to last lock in policy group. 
*/ tmp = &list_entry(lock->l_sl_policy.prev, - struct ldlm_lock, - l_sl_policy)->l_res_link; - - /* New lock's try_bits are filtered out by ibits - * of all locks in both granted and waiting queues. - */ - *try_bits &= ~(lock->l_policy_data.l_inodebits.bits | - lock->l_policy_data.l_inodebits.try_bits); - - if ((req_bits | *try_bits) == 0) - RETURN(0); - - /* The new lock ibits is more preferable than try_bits - * of waiting locks so drop conflicting try_bits in - * the waiting queue. - * Notice that try_bits of granted locks must be zero. - */ - lock->l_policy_data.l_inodebits.try_bits &= ~req_bits; + struct ldlm_lock, + l_sl_policy)->l_res_link; /* Locks with overlapping bits conflict. */ if (lock->l_policy_data.l_inodebits.bits & req_bits) { @@ -242,7 +138,6 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, ldlm_is_cos_enabled(req) && lock->l_client_cookie == req->l_client_cookie) goto not_conflicting; - /* Found a conflicting policy group. */ if (!work_list) RETURN(0); @@ -251,21 +146,22 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* Add locks of the policy group to @work_list * as blocking locks for @req */ - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, - work_list); - head = &lock->l_sl_policy; + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + head = &lock->l_sl_policy; list_for_each_entry(lock, head, l_sl_policy) - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, - req, work_list); - } -not_conflicting: - if (tmp == mode_tail) - break; - - tmp = tmp->next; - lock = list_entry(tmp, struct ldlm_lock, l_res_link); + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + not_conflicting: + if (tmp == mode_tail) + break; + + tmp = tmp->next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); } /* Loop over policy groups within one mode group. */ } /* Loop over mode groups within @queue. */ @@ -286,95 +182,57 @@ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? - NULL : work_list; + struct list_head rpc_list; int rc; - ENTRY; - LASSERT(!ldlm_is_granted(lock)); + LASSERT(lock->l_granted_mode != lock->l_req_mode); + LASSERT(list_empty(&res->lr_converting)); + INIT_LIST_HEAD(&rpc_list); check_res_locked(res); - if (intention == LDLM_PROCESS_RESCAN) { - struct list_head *bl_list; - - if (*flags & LDLM_FL_BLOCK_NOWAIT) { - bl_list = NULL; + /* (*flags & LDLM_FL_BLOCK_NOWAIT) is for layout lock right now. */ + if (intention == LDLM_PROCESS_RESCAN || + (*flags & LDLM_FL_BLOCK_NOWAIT)) { + *err = ELDLM_LOCK_ABORTED; + if (*flags & LDLM_FL_BLOCK_NOWAIT) *err = ELDLM_LOCK_WOULDBLOCK; - } else { - bl_list = work_list; - *err = ELDLM_LOCK_ABORTED; - } - LASSERT(lock->l_policy_data.l_inodebits.bits != 0); + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); - /* It is possible that some of granted locks was not canceled - * but converted and is kept in granted queue. So there is - * a window where lock with 'ast_sent' might become granted - * again. 
Meanwhile a new lock may appear in that window and - * conflicts with the converted lock so the following scenario - * is possible: - * - * 1) lock1 conflicts with lock2 - * 2) bl_ast was sent for lock2 - * 3) lock3 comes and conflicts with lock2 too - * 4) no bl_ast sent because lock2->l_bl_ast_sent is 1 - * 5) lock2 was converted for lock1 but not for lock3 - * 6) lock1 granted, lock3 still is waiting for lock2, but - * there will never be another bl_ast for that - * - * To avoid this scenario the work_list is used below to collect - * any blocked locks from granted queue during every reprocess - * and bl_ast will be sent if needed. - */ - rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, - bl_list); - if (!rc) - RETURN(LDLM_ITER_STOP); - rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); - if (!rc) - RETURN(LDLM_ITER_STOP); - - /* grant also try_bits if any */ - if (lock->l_policy_data.l_inodebits.try_bits != 0) { - lock->l_policy_data.l_inodebits.bits |= - lock->l_policy_data.l_inodebits.try_bits; - lock->l_policy_data.l_inodebits.try_bits = 0; - *flags |= LDLM_FL_LOCK_CHANGED; - } - ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, grant_work); + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, work_list); *err = ELDLM_OK; RETURN(LDLM_ITER_CONTINUE); } - rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, work_list); - rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, work_list); + LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || + (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); + restart: + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, &rpc_list); + rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, &rpc_list); - if (rc != 2) { - /* if there were only bits to try and all are conflicting */ - if ((lock->l_policy_data.l_inodebits.bits | - lock->l_policy_data.l_inodebits.try_bits) == 0) { - *err = ELDLM_LOCK_WOULDBLOCK; - } else { - *err = ELDLM_OK; - } + if (rc != 2) { + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); + if (rc == -ERESTART) + GOTO(restart, rc); + *err = rc; } else { - /* grant also all remaining try_bits */ - if (lock->l_policy_data.l_inodebits.try_bits != 0) { - lock->l_policy_data.l_inodebits.bits |= - lock->l_policy_data.l_inodebits.try_bits; - lock->l_policy_data.l_inodebits.try_bits = 0; - *flags |= LDLM_FL_LOCK_CHANGED; - } - LASSERT(lock->l_policy_data.l_inodebits.bits); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, grant_work); - *err = ELDLM_OK; + ldlm_grant_lock(lock, work_list); + rc = 0; } - RETURN(LDLM_ITER_CONTINUE); + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); } #endif /* HAVE_SERVER_SUPPORT */ @@ -382,10 +240,6 @@ void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, union ldlm_policy_data *lpolicy) { lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; - /** - * try_bits are to be handled outside of generic write_to_local due - * to different behavior on a server and client. - */ } void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, @@ -393,185 +247,4 @@ void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, { memset(wpolicy, 0, sizeof(*wpolicy)); wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; - wpolicy->l_inodebits.try_bits = lpolicy->l_inodebits.try_bits; -} - -/** - * Attempt to convert already granted IBITS lock with several bits set to - * a lock with less bits (downgrade). 
- * - * Such lock conversion is used to keep lock with non-blocking bits instead of - * cancelling it, introduced for better support of DoM files. - */ -int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop) -{ - ENTRY; - - check_res_locked(lock->l_resource); - - /* Just return if there are no conflicting bits */ - if ((lock->l_policy_data.l_inodebits.bits & to_drop) == 0) { - LDLM_WARN(lock, "try to drop unset bits %#llx/%#llx", - lock->l_policy_data.l_inodebits.bits, to_drop); - /* nothing to do */ - RETURN(0); - } - - /* remove lock from a skiplist and put in the new place - * according with new inodebits */ - ldlm_resource_unlink_lock(lock); - lock->l_policy_data.l_inodebits.bits &= ~to_drop; - ldlm_grant_lock_with_skiplist(lock); - RETURN(0); -} -EXPORT_SYMBOL(ldlm_inodebits_drop); - -/* convert single lock */ -int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, - enum ldlm_cancel_flags cancel_flags) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - struct ldlm_lock_desc ld = { { 0 } }; - __u64 drop_bits, new_bits; - __u32 flags = 0; - int rc; - - ENTRY; - - check_res_locked(lock->l_resource); - - /* Lock is being converted already */ - if (ldlm_is_converting(lock)) { - if (!(cancel_flags & LCF_ASYNC)) { - struct l_wait_info lwi = { 0 }; - - unlock_res_and_lock(lock); - l_wait_event(lock->l_waitq, - is_lock_converted(lock), &lwi); - lock_res_and_lock(lock); - } - RETURN(0); - } - - /* lru_cancel may happen in parallel and call ldlm_cli_cancel_list() - * independently. - */ - if (ldlm_is_canceling(lock)) - RETURN(-EINVAL); - - /* no need in only local convert */ - if (lock->l_flags & (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)) - RETURN(-EINVAL); - - drop_bits = lock->l_policy_data.l_inodebits.cancel_bits; - /* no cancel bits - means that caller needs full cancel */ - if (drop_bits == 0) - RETURN(-EINVAL); - - new_bits = lock->l_policy_data.l_inodebits.bits & ~drop_bits; - /* check if all lock bits are dropped, proceed with cancel */ - if (!new_bits) - RETURN(-EINVAL); - - /* check if no dropped bits, consider this as successful convert */ - if (lock->l_policy_data.l_inodebits.bits == new_bits) - RETURN(0); - - ldlm_set_converting(lock); - /* Finally call cancel callback for remaining bits only. - * It is important to have converting flag during that - * so blocking_ast callback can distinguish convert from - * cancels. - */ - ld.l_policy_data.l_inodebits.cancel_bits = drop_bits; - unlock_res_and_lock(lock); - lock->l_blocking_ast(lock, &ld, lock->l_ast_data, LDLM_CB_CANCELING); - /* now notify server about convert */ - rc = ldlm_cli_convert_req(lock, &flags, new_bits); - lock_res_and_lock(lock); - if (rc) - GOTO(full_cancel, rc); - - /* Finally clear these bits in lock ibits */ - ldlm_inodebits_drop(lock, drop_bits); - - /* Being locked again check if lock was canceled, it is important - * to do and don't drop cbpending below - */ - if (ldlm_is_canceling(lock)) - GOTO(full_cancel, rc = -EINVAL); - - /* also check again if more bits to be cancelled appeared */ - if (drop_bits != lock->l_policy_data.l_inodebits.cancel_bits) - GOTO(clear_converting, rc = -EAGAIN); - - /* clear cbpending flag early, it is safe to match lock right after - * client convert because it is downgrade always. - */ - ldlm_clear_cbpending(lock); - ldlm_clear_bl_ast(lock); - spin_lock(&ns->ns_lock); - if (list_empty(&lock->l_lru)) - ldlm_lock_add_to_lru_nolock(lock); - spin_unlock(&ns->ns_lock); - - /* the job is done, zero the cancel_bits. 
If more conflicts appear, - * it will result in another cycle of ldlm_cli_inodebits_convert(). - */ -full_cancel: - lock->l_policy_data.l_inodebits.cancel_bits = 0; -clear_converting: - ldlm_clear_converting(lock); - RETURN(rc); -} - -int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock) -{ - if (ldlm_is_ns_srv(lock)) { - int i; - - OBD_SLAB_ALLOC_PTR(lock->l_ibits_node, ldlm_inodebits_slab); - if (lock->l_ibits_node == NULL) - return -ENOMEM; - for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) - INIT_LIST_HEAD(&lock->l_ibits_node->lin_link[i]); - lock->l_ibits_node->lock = lock; - } else { - lock->l_ibits_node = NULL; - } - return 0; -} - -void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, - struct ldlm_lock *lock) -{ - int i; - - if (!ldlm_is_ns_srv(lock)) - return; - - if (head == &res->lr_waiting) { - for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { - if (lock->l_policy_data.l_inodebits.bits & (1 << i)) - list_add_tail(&lock->l_ibits_node->lin_link[i], - &res->lr_ibits_queues->liq_waiting[i]); - } - } else if (head == &res->lr_granted && lock->l_ibits_node != NULL) { - for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) - LASSERT(list_empty(&lock->l_ibits_node->lin_link[i])); - OBD_SLAB_FREE_PTR(lock->l_ibits_node, ldlm_inodebits_slab); - lock->l_ibits_node = NULL; - } -} - -void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock) -{ - int i; - - ldlm_unlink_lock_skiplist(lock); - if (!ldlm_is_ns_srv(lock)) - return; - - for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) - list_del_init(&lock->l_ibits_node->lin_link[i]); } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h index 733773c50ed0c..779dec55882e5 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,6 @@ extern struct mutex ldlm_cli_namespace_lock; extern struct list_head ldlm_cli_active_namespace_list; extern struct list_head ldlm_cli_inactive_namespace_list; extern unsigned int ldlm_cancel_unused_locks_before_replay; -extern struct kmem_cache *ldlm_glimpse_work_kmem; static inline int ldlm_namespace_nr_read(enum ldlm_side client) { @@ -98,27 +97,30 @@ struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); /* ldlm_request.c */ /* Cancel lru flag, it indicates we cancel aged locks. 
*/ enum ldlm_lru_flags { - LDLM_LRU_FLAG_NO_WAIT = 0x1, /* Cancel locks w/o blocking (neither - * sending nor waiting for any RPCs) */ - LDLM_LRU_FLAG_CLEANUP = 0x2, /* Used when clearing lru, tells - * prepare_lru_list to set discard flag - * on PR extent locks so we don't waste - * time saving pages that will be - * discarded momentarily */ + LDLM_LRU_FLAG_AGED = 0x01, /* Cancel aged locks (non LRU resize) */ + LDLM_LRU_FLAG_PASSED = 0x02, /* Cancel passed number of locks */ + LDLM_LRU_FLAG_SHRINK = 0x04, /* Cancel locks from shrinker */ + LDLM_LRU_FLAG_LRUR = 0x08, /* Cancel locks from lru resize */ + LDLM_LRU_FLAG_NO_WAIT = 0x10, /* Cancel locks w/o blocking (neither + * sending nor waiting for any RPCs) */ + LDLM_LRU_FLAG_CLEANUP = 0x20, /* Used when clearing lru, tells + * prepare_lru_list to set discard flag + * on PR extent locks so we don't waste + * time saving pages that will be + * discarded momentarily */ }; -int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, - struct list_head *cancels, int min, int max, + struct list_head *cancels, int count, int max, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags); extern unsigned int ldlm_enqueue_min; /* ldlm_resource.c */ extern struct kmem_cache *ldlm_resource_slab; extern struct kmem_cache *ldlm_lock_slab; -extern struct kmem_cache *ldlm_inodebits_slab; extern struct kmem_cache *ldlm_interval_tree_slab; void ldlm_resource_insert_lock_after(struct ldlm_lock *original, @@ -133,7 +135,6 @@ typedef enum { LDLM_WORK_GL_AST } ldlm_desc_ast_t; -void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, enum req_location loc, void *data, int size); @@ -142,9 +143,7 @@ ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, enum ldlm_type type, enum ldlm_mode mode, const struct ldlm_callback_suite *cbs, void *data, __u32 lvb_len, enum lvb_type lvb_type); -enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, - struct ldlm_namespace *, - struct ldlm_lock **, +enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, void *cookie, __u64 *flags); void ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); @@ -155,16 +154,13 @@ void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, #ifdef HAVE_SERVER_SUPPORT int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, struct list_head *work_list, - enum ldlm_process_intention intention, - struct ldlm_lock *hint); + enum ldlm_process_intention intention); int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, - struct list_head *rpc_list); + struct list_head *rpc_list, __u64 grant_flags); void ldlm_discard_bl_list(struct list_head *bl_list); -void ldlm_clear_blocking_lock(struct ldlm_lock *lock); -void ldlm_clear_blocking_data(struct ldlm_lock *lock); #endif int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - ldlm_desc_ast_t ast_type); + ldlm_desc_ast_t ast_type); int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); #define 
ldlm_lock_remove_from_lru(lock) \ @@ -177,7 +173,6 @@ void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); int ldlm_export_cancel_blocked_locks(struct obd_export *exp); int ldlm_export_cancel_locks(struct obd_export *exp); -void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); /* ldlm_lockd.c */ int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, @@ -190,7 +185,6 @@ int ldlm_bl_thread_wakeup(void); void ldlm_handle_bl_callback(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, struct ldlm_lock *lock); -void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock); #ifdef HAVE_SERVER_SUPPORT /* ldlm_plain.c */ @@ -203,25 +197,14 @@ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_process_intention intention, enum ldlm_error *err, struct list_head *work_list); -int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, - struct list_head *queue, - struct list_head *work_list, - enum ldlm_process_intention intention, - struct ldlm_lock *hint); /* ldlm_extent.c */ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_process_intention intention, enum ldlm_error *err, struct list_head *work_list); #endif -int ldlm_extent_alloc_lock(struct ldlm_lock *lock); void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); void ldlm_extent_unlink_lock(struct ldlm_lock *lock); -int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock); -void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, - struct ldlm_lock *lock); -void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock); - /* ldlm_flock.c */ int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, enum ldlm_process_intention intention, @@ -233,7 +216,7 @@ void ldlm_destroy_flock_export(struct obd_export *exp); void l_check_ns_lock(struct ldlm_namespace *ns); void l_check_no_ns_lock(struct ldlm_namespace *ns); -extern struct dentry *ldlm_svc_debugfs_dir; +extern struct proc_dir_entry *ldlm_svc_proc_dir; struct ldlm_state { struct ptlrpc_service *ldlm_cb_service; @@ -247,6 +230,7 @@ struct ldlm_state { extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); extern void ldlm_interval_free(struct ldlm_interval *node); /* this function must be called with res lock held */ static inline struct ldlm_extent * @@ -297,7 +281,7 @@ enum ldlm_policy_res { static ssize_t var##_store(struct kobject *kobj, \ struct attribute *attr, \ const char *buffer, \ - size_t count) \ + unsigned long count) \ { \ struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ pl_kobj); \ @@ -333,7 +317,7 @@ enum ldlm_policy_res { static ssize_t var##_store(struct kobject *kobj, \ struct attribute *attr, \ const char *buffer, \ - size_t count) \ + unsigned long count) \ { \ struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ pl_kobj); \ @@ -352,24 +336,28 @@ enum ldlm_policy_res { struct __##var##__dummy_write {; } /* semicolon catcher */ static inline void -ldlm_add_var(struct ldebugfs_vars *vars, struct dentry *debugfs_entry, - const char *name, void *data, const struct file_operations *ops) +ldlm_add_var(struct lprocfs_vars *vars, struct proc_dir_entry *proc_dir, + const char *name, void *data, const struct proc_ops *ops) { snprintf((char *)vars->name, 
MAX_STRING_SIZE, "%s", name); vars->data = data; vars->fops = ops; - ldebugfs_add_vars(debugfs_entry, vars, NULL); + lprocfs_add_vars(proc_dir, vars, NULL); } static inline int is_granted_or_cancelled(struct ldlm_lock *lock) { - int ret = 0; + int ret = 0; - lock_res_and_lock(lock); - ret = is_granted_or_cancelled_nolock(lock); - unlock_res_and_lock(lock); + lock_res_and_lock(lock); + if ((lock->l_req_mode == lock->l_granted_mode) && + !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + unlock_res_and_lock(lock); - return ret; + return ret; } static inline bool is_bl_done(struct ldlm_lock *lock) @@ -385,17 +373,6 @@ static inline bool is_bl_done(struct ldlm_lock *lock) return bl_done; } -static inline bool is_lock_converted(struct ldlm_lock *lock) -{ - bool ret = 0; - - lock_res_and_lock(lock); - ret = (lock->l_policy_data.l_inodebits.cancel_bits == 0); - unlock_res_and_lock(lock); - - return ret; -} - typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, union ldlm_policy_data *); typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c index 41e655b6fc353..33d871da4bdf6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,8 +39,6 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include -#include #include #include #include @@ -360,13 +358,12 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) sizeof(server_uuid))); cli->cl_dirty_pages = 0; - cli->cl_dirty_max_pages = 0; cli->cl_avail_grant = 0; /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ /* cl_dirty_max_pages may be changed at connect time in * ptlrpc_connect_interpret(). */ client_adjust_max_dirty(cli); - init_waitqueue_head(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_cache_waiters); INIT_LIST_HEAD(&cli->cl_loi_ready_list); INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); INIT_LIST_HEAD(&cli->cl_loi_write_list); @@ -393,15 +390,9 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) spin_lock_init(&cli->cl_lru_list_lock); atomic_long_set(&cli->cl_unstable_count, 0); INIT_LIST_HEAD(&cli->cl_shrink_list); - INIT_LIST_HEAD(&cli->cl_grant_chain); - - INIT_LIST_HEAD(&cli->cl_flight_waiters); - cli->cl_rpcs_in_flight = 0; init_waitqueue_head(&cli->cl_destroy_waitq); atomic_set(&cli->cl_destroy_in_flight, 0); - - cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; #ifdef ENABLE_CHECKSUM /* Turn on checksumming by default. */ cli->cl_checksum = 1; @@ -410,7 +401,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * Set cl_chksum* to CRC32 for now to avoid returning screwed info * through procfs. */ - cli->cl_cksum_type = cli->cl_supp_cksum_types; + cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; #endif atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); @@ -418,8 +409,6 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * from OFD after connecting. 
*/ cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; - cli->cl_max_short_io_bytes = OBD_MAX_SHORT_IO_BYTES; - /* set cl_chunkbits default value to PAGE_SHIFT, * it will be updated at OSC connection time. */ cli->cl_chunkbits = PAGE_SHIFT; @@ -437,7 +426,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; else cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } + } spin_lock_init(&cli->cl_mod_rpcs_lock); spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); @@ -610,7 +599,6 @@ int client_connect_import(const struct lu_env *env, ocd->ocd_connect_flags, "old %#llx, new %#llx\n", data->ocd_connect_flags, ocd->ocd_connect_flags); data->ocd_connect_flags = ocd->ocd_connect_flags; - data->ocd_connect_flags2 = ocd->ocd_connect_flags2; } ptlrpc_pinger_add_import(imp); @@ -743,32 +731,6 @@ int server_disconnect_export(struct obd_export *exp) } EXPORT_SYMBOL(server_disconnect_export); -static inline int target_check_recovery_timer(struct obd_device *target) -{ - ktime_t remaining; - s64 timeout; - - if (!target->obd_recovering || target->obd_recovery_start == 0) - return 0; - - remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); - timeout = ktime_divns(remaining, NSEC_PER_SEC); - if (timeout > -30) - return 0; - - /* the recovery timer should expire, but it isn't triggered, - * it's better to abort the recovery of this target to speed up - * the recovery of the whole cluster. */ - spin_lock(&target->obd_dev_lock); - if (target->obd_recovering) { - CERROR("%s: Aborting recovery\n", target->obd_name); - target->obd_abort_recovery = 1; - wake_up(&target->obd_next_transno_waitq); - } - spin_unlock(&target->obd_dev_lock); - return 0; -} - /* -------------------------------------------------------------------------- * from old lib/target.c * -------------------------------------------------------------------------- */ @@ -779,11 +741,12 @@ static int target_handle_reconnect(struct lustre_handle *conn, { struct obd_device *target; struct lustre_handle *hdl; - ktime_t remaining; - s64 timeout; + cfs_time_t now; + cfs_time_t deadline; + int timeout; int rc = 0; - ENTRY; + hdl = &exp->exp_imp_reverse->imp_remote_handle; if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { conn->cookie = exp->exp_handle.h_cookie; @@ -817,45 +780,46 @@ static int target_handle_reconnect(struct lustre_handle *conn, GOTO(out_already, rc); } - remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); - timeout = ktime_divns(remaining, NSEC_PER_SEC); - if (timeout > 0) { - LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n", - target->obd_name, - obd_uuid2str(&exp->exp_client_uuid), - obd_export_nid2str(exp), - atomic_read(&target->obd_max_recoverable_clients), - timeout / 60, timeout % 60); - } else { - struct target_distribute_txn_data *tdtd; + now = cfs_time_current(); + deadline = target->obd_recovery_timer.expires; + if (cfs_time_before(now, deadline)) { + struct target_distribute_txn_data *tdtd = + class_exp2tgt(exp)->lut_tdtd; int size = 0; int count = 0; char *buf = NULL; - target_check_recovery_timer(target); - - tdtd = class_exp2tgt(exp)->lut_tdtd; + timeout = cfs_duration_sec(cfs_time_sub(deadline, now)); if (tdtd && tdtd->tdtd_show_update_logs_retrievers) buf = tdtd->tdtd_show_update_logs_retrievers( tdtd->tdtd_show_retrievers_cbdata, &size, &count); if (count > 0) - LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. 
Please wait until all MDTs recovered or you may force MDT evicition via 'lctl --device %s abort_recovery.\n", - target->obd_name, - obd_uuid2str(&exp->exp_client_uuid), - obd_export_nid2str(exp), count, - buf ? buf : "unknown (not enough RAM)", - (abs(timeout) + target->obd_recovery_timeout) / 60, - (abs(timeout) + target->obd_recovery_timeout) % 60, - target->obd_name); + LCONSOLE_WARN("%s: Recovery already passed deadline " + "%d:%.02d. It is due to DNE recovery " + "failed/stuck on the %d MDT(s):%s. " + "Please wait until all MDTs recovered " + "or abort the recovery by force.\n", + target->obd_name, timeout / 60, + timeout % 60, count, + buf ? buf : "unknown (not enough RAM)"); else - LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force taget eviction via 'lctl --device %s abort_recovery.\n", - target->obd_name, abs(timeout) / 60, - abs(timeout) % 60, target->obd_name); + LCONSOLE_WARN("%s: Recovery already passed deadline " + "%d:%.02d. If you do not want to wait " + "more, please abort the recovery by " + "force.\n", target->obd_name, + timeout / 60, timeout % 60); if (buf != NULL) OBD_FREE(buf, size); + } else { + timeout = cfs_duration_sec(cfs_time_sub(now, deadline)); + LCONSOLE_WARN("%s: Recovery already passed deadline" + " %d:%.02d, It is most likely due to DNE" + " recovery is failed or stuck, please wait a" + " few more minutes or abort the recovery.\n", + target->obd_name, timeout / 60, timeout % 60); } out_already: @@ -986,6 +950,7 @@ int target_handle_connect(struct ptlrpc_request *req) * reconnect case */ struct lustre_handle conn; struct lustre_handle *tmp; + struct obd_uuid tgtuuid; struct obd_uuid cluuid; char *str; int rc = 0; @@ -994,6 +959,7 @@ int target_handle_connect(struct ptlrpc_request *req) bool mds_conn = false, lw_client = false, initial_conn = false; bool mds_mds_conn = false; bool new_mds_mds_conn = false; + bool target_referenced = false; struct obd_connect_data *data, *tmpdata; int size, tmpsize; lnet_nid_t *client_nid = NULL; @@ -1007,7 +973,11 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } - target = class_dev_by_str(str); + obd_str2uuid(&tgtuuid, str); + target = class_uuid2obd(&tgtuuid); + if (!target) + target = class_name2obd(str); + if (!target) { deuuidify(str, NULL, &target_start, &target_len); LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " @@ -1019,9 +989,6 @@ int target_handle_connect(struct ptlrpc_request *req) } spin_lock(&target->obd_dev_lock); - - target->obd_conn_inprogress++; - if (target->obd_stopping || !target->obd_set_up) { spin_unlock(&target->obd_dev_lock); @@ -1043,6 +1010,13 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EAGAIN); } + /* Make sure the target isn't cleaned up while we're here. Yes, + * there's still a race between the above check and our incref here. + * Really, class_uuid2obd should take the ref. 
*/ + class_incref(target, __func__, current); + target_referenced = true; + + target->obd_conn_inprogress++; spin_unlock(&target->obd_dev_lock); str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); @@ -1059,13 +1033,11 @@ int target_handle_connect(struct ptlrpc_request *req) conn = *tmp; - size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, - RCL_CLIENT); - if (size < 0 || size > 8 * sizeof(struct obd_connect_data)) - GOTO(out, rc = -EPROTO); - data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); - if (!data) - GOTO(out, rc = -EPROTO); + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); rc = req_capsule_server_pack(&req->rq_pill); if (rc) @@ -1083,36 +1055,50 @@ int target_handle_connect(struct ptlrpc_request *req) */ if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) GOTO(out, rc = -EPROTO); +#endif - /* Don't allow liblustre clients to connect. - * - testing was disabled in v2_2_50_0-61-g6a75d65 - * - building was disabled in v2_5_58_0-28-g7277179 - * - client code was deleted in v2_6_50_0-101-gcdfbc72, - * - clients were refused connect for version difference > 0.0.1.32 */ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); - GOTO(out, rc = -EPROTO); + if (data->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_ALLOWED_OFFSET || + data->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_ALLOWED_OFFSET) { + DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) " + "libclient connection attempt", + data->ocd_version < LUSTRE_VERSION_CODE ? + "old" : "new", + OBD_OCD_VERSION_MAJOR(data->ocd_version), + OBD_OCD_VERSION_MINOR(data->ocd_version), + OBD_OCD_VERSION_PATCH(data->ocd_version), + OBD_OCD_VERSION_FIX(data->ocd_version)); + data = req_capsule_server_sized_get(&req->rq_pill, + &RMF_CONNECT_DATA, + offsetof(typeof(*data), ocd_version) + + sizeof(data->ocd_version)); + if (data) { + data->ocd_connect_flags = OBD_CONNECT_VERSION; + data->ocd_version = LUSTRE_VERSION_CODE; + } + GOTO(out, rc = -EPROTO); + } } -#endif /* Note: lw_client is needed in MDS-MDS failover during update log * processing, so we needs to allow lw_client to be connected at - * anytime, instead of only the initial connection - */ - lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT); + * anytime, instead of only the initial connection */ + lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0; if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { initial_conn = true; - mds_conn = OCD_HAS_FLAG(data, MDS); - mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS); + mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0; + mds_mds_conn = (data->ocd_connect_flags & + OBD_CONNECT_MDS_MDS) != 0; /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS * for Imperative Recovery connection from MGC to MGS. * * Via check OBD_CONNECT_FID, we can distinguish whether * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from - * MGC or MDT, since MGC does not use OBD_CONNECT_FID. - */ + * MGC or MDT. 
*/ if (!lw_client && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && (data->ocd_connect_flags & OBD_CONNECT_FID) && @@ -1161,29 +1147,27 @@ int target_handle_connect(struct ptlrpc_request *req) export = NULL; rc = -EALREADY; } else if ((mds_conn || (lw_client && initial_conn) || - OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) { + data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + export->exp_connection != NULL) { spin_unlock(&export->exp_lock); if (req->rq_peer.nid != export->exp_connection->c_peer.nid) { /* MDS or LWP reconnected after failover. */ - LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n", - target->obd_name, - lw_client ? "LWP" : "MDS", - libcfs_nid2str(req->rq_peer.nid), - libcfs_nid2str(export->exp_connection->c_peer.nid)); + LCONSOLE_WARN("%s: Received %s connection from " + "%s, removing former export from %s\n", + target->obd_name, mds_conn ? "MDS" : "LWP", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); } else { - /* New connection from the same NID. */ - LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n", - target->obd_name, - lw_client ? "LWP" : "MDS", - libcfs_nid2str(req->rq_peer.nid), - OCD_HAS_FLAG(data, MDS_MDS) ? - "keep" : "remove"); + /* New MDS connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from " + "%s, removing former export from same NID\n", + target->obd_name, mds_conn ? "MDS" : "LWP", + libcfs_nid2str(req->rq_peer.nid)); } if (req->rq_peer.nid == export->exp_connection->c_peer.nid && - OCD_HAS_FLAG(data, MDS_MDS)) { - /* - * Because exports between MDTs will always be + data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) { + /* Because exports between MDTs will always be * kept, let's do not fail such export if they * come from the same NID, otherwise it might * cause eviction between MDTs, which might @@ -1250,11 +1234,11 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc); } - CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", - target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - target->obd_recovering ? "recovering/" : "", data->ocd_transno, - export, ktime_get_seconds(), - export ? export->exp_last_request_time : 0); + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, (long)cfs_time_current_sec(), + export ? (long)export->exp_last_request_time : 0); /* If this is the first time a client connects, reset the recovery * timer. Discard lightweight connections which might be local. 
*/ @@ -1280,37 +1264,27 @@ int target_handle_connect(struct ptlrpc_request *req) /* allow "new" MDT to be connected during recovery, since we * need retrieve recovery update records from it */ if (target->obd_recovering && !lw_client && !mds_mds_conn) { - struct hrtimer *timer = &target->obd_recovery_timer; - ktime_t remaining; - s64 timeout, left; - int in_progress; - int connected; - int known; - int stale; - char *msg; - - connected = atomic_read(&target->obd_connected_clients); - in_progress = atomic_read(&target->obd_lock_replay_clients); - known = - atomic_read(&target->obd_max_recoverable_clients); - stale = target->obd_stale_clients; - remaining = hrtimer_expires_remaining(timer); - left = ktime_divns(remaining, NSEC_PER_SEC); - if (ktime_to_ns(remaining) > 0) { - msg = "to recover in"; - timeout = left; - } else { - msg = "already passed deadline"; - timeout = -left; - - target_check_recovery_timer(target); - } - - LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", + cfs_time_t t; + int c; /* connected */ + int i; /* in progress */ + int k; /* known */ + int s; /* stale/evicted */ + + c = atomic_read(&target->obd_connected_clients); + i = atomic_read(&target->obd_lock_replay_clients); + k = target->obd_max_recoverable_clients; + s = target->obd_stale_clients; + t = target->obd_recovery_timer.expires; + t = cfs_time_sub(t, cfs_time_current()); + t = cfs_duration_sec(t); + LCONSOLE_WARN("%s: Denying connection for new client %s" + "(at %s), waiting for %d known clients " + "(%d recovered, %d in progress, and %d " + "evicted) to recover in %d:%.02d\n", target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), known, - connected - in_progress, in_progress, - stale, msg, timeout / 60, timeout % 60); + libcfs_nid2str(req->rq_peer.nid), k, + c - i, i, s, (int)t / 60, + (int)t % 60); rc = -EBUSY; } else { dont_check_exports: @@ -1365,26 +1339,37 @@ int target_handle_connect(struct ptlrpc_request *req) spin_unlock(&export->exp_lock); CDEBUG(D_RPCTRACE, "%s: %s already connected at greater " "or equal conn_cnt: %d >= %d\n", - cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - export->exp_conn_cnt, - lustre_msg_get_conn_cnt(req->rq_reqmsg)); + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); - GOTO(out, rc = -EALREADY); + GOTO(out, rc = -EALREADY); + } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + + /* Don't evict liblustre clients for not pinging. */ + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { + export->exp_libclient = 1; + spin_unlock(&export->exp_lock); + + spin_lock(&target->obd_dev_lock); + list_del_init(&export->exp_obd_chain_timed); + spin_unlock(&target->obd_dev_lock); + } else { + spin_unlock(&export->exp_lock); } - LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); - export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); - spin_unlock(&export->exp_lock); - if (export->exp_connection != NULL) { + if (export->exp_connection != NULL) { /* Check to see if connection came from another NID. 
*/ - if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && + if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && !hlist_unhashed(&export->exp_nid_hash)) - cfs_hash_del(export->exp_obd->obd_nid_hash, - &export->exp_connection->c_peer.nid, - &export->exp_nid_hash); + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); - ptlrpc_connection_put(export->exp_connection); - } + ptlrpc_connection_put(export->exp_connection); + } export->exp_connection = ptlrpc_connection_get(req->rq_peer, req->rq_self, @@ -1440,10 +1425,9 @@ int target_handle_connect(struct ptlrpc_request *req) * also needs to be increased to match other recovery checking * condition. */ if (new_mds_mds_conn) - atomic_inc(&target->obd_max_recoverable_clients); - + target->obd_max_recoverable_clients++; if (atomic_inc_return(&target->obd_connected_clients) == - atomic_read(&target->obd_max_recoverable_clients)) + target->obd_max_recoverable_clients) wake_up(&target->obd_next_transno_waitq); } @@ -1459,11 +1443,12 @@ int target_handle_connect(struct ptlrpc_request *req) class_export_put(export); } - if (target != NULL) { + if (target_referenced == true && target != NULL) { spin_lock(&target->obd_dev_lock); target->obd_conn_inprogress--; spin_unlock(&target->obd_dev_lock); - class_decref(target, "find", current); + + class_decref(target, __func__, current); } req->rq_status = rc; RETURN(rc); @@ -1475,23 +1460,11 @@ int target_handle_disconnect(struct ptlrpc_request *req) ENTRY; rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); - - /* In case of target disconnect, updating sec ctx immediately is - * required in order to record latest sequence number used. - * Sequence is normally updated on export destroy, but this event - * can occur too late, ie after a new target connect request has - * been processed. - * Maintaining correct sequence when client connection becomes idle - * ensures that GSS does not erroneously consider requests as replays. - */ - rc = sptlrpc_export_update_ctx(req->rq_export); - if (rc) - RETURN(rc); + if (rc) + RETURN(rc); /* Keep the rq_export around so we can send the reply. */ - req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); RETURN(0); } @@ -1604,14 +1577,14 @@ static void target_finish_recovery(struct lu_target *lut) /* Only log a recovery message when recovery has occurred. */ if (obd->obd_recovery_start) { - time64_t now = ktime_get_seconds(); + time64_t now = ktime_get_real_seconds(); time64_t elapsed_time; elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, 1); LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients " "%d recovered and %d %s evicted.\n", obd->obd_name, (s64)elapsed_time / 60, (s64)elapsed_time % 60, - atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_max_recoverable_clients, atomic_read(&obd->obd_connected_clients), obd->obd_stale_clients, obd->obd_stale_clients == 1 ? "was" : "were"); @@ -1634,16 +1607,15 @@ static void target_finish_recovery(struct lu_target *lut) } spin_unlock(&obd->obd_recovery_task_lock); - obd->obd_recovery_end = ktime_get_seconds(); + obd->obd_recovery_end = ktime_get_real_seconds(); /* When recovery finished, cleanup orphans on MDS and OST. 
*/ - if (obd->obd_type && OBP(obd, postrecov)) { - int rc = OBP(obd, postrecov)(obd); - - if (rc < 0) - LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", - obd->obd_name, rc); - } + if (OBT(obd) && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } EXIT; } @@ -1740,14 +1712,12 @@ EXPORT_SYMBOL(target_cleanup_recovery); /* obd_recovery_task_lock should be held */ void target_cancel_recovery_timer(struct obd_device *obd) { - CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); - hrtimer_cancel(&obd->obd_recovery_timer); + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + del_timer(&obd->obd_recovery_timer); } static void target_start_recovery_timer(struct obd_device *obd) { - ktime_t delay; - if (obd->obd_recovery_start != 0) return; @@ -1764,36 +1734,33 @@ static void target_start_recovery_timer(struct obd_device *obd) return; } - obd->obd_recovery_start = ktime_get_seconds(); - delay = ktime_set(obd->obd_recovery_start + - obd->obd_recovery_timeout, 0); - hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS); + mod_timer(&obd->obd_recovery_timer, + cfs_time_shift(obd->obd_recovery_timeout)); + obd->obd_recovery_start = ktime_get_real_seconds(); spin_unlock(&obd->obd_dev_lock); - LCONSOLE_WARN("%s: Will be in recovery for at least %lu:%02lu, or until %d client%s reconnect%s\n", + LCONSOLE_WARN("%s: Will be in recovery for at least %llu:%02llu, or until %d client%s reconnect%s\n", obd->obd_name, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, - atomic_read(&obd->obd_max_recoverable_clients), - (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? - "" : "s", - (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? - "s" : ""); + obd->obd_max_recoverable_clients, + (obd->obd_max_recoverable_clients == 1) ? "" : "s", + (obd->obd_max_recoverable_clients == 1) ? "s": ""); } /** * extend recovery window. * - * if @extend is true, extend recovery window to have @dr_timeout remaining - * at least; otherwise, make sure the recovery timeout value is not less - * than @dr_timeout. + * if @extend is true, extend recovery window to have @drt remaining at least; + * otherwise, make sure the recovery timeout value is not less than @drt. */ -static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, +static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) { - ktime_t left_ns; - time_t timeout; - time_t left; + time64_t now; + time64_t end; + time64_t left; + time64_t to; spin_lock(&obd->obd_dev_lock); if (!obd->obd_recovering || obd->obd_abort_recovery) { @@ -1802,43 +1769,33 @@ static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, } LASSERT(obd->obd_recovery_start != 0); - left_ns = hrtimer_expires_remaining(&obd->obd_recovery_timer); - left = ktime_divns(left_ns, NSEC_PER_SEC); - - if (extend) { - timeout = obd->obd_recovery_timeout; - /* dr_timeout will happen after the hrtimer has expired. - * Add the excess time to the soft recovery timeout without - * exceeding the hard recovery timeout. 
- */ - if (dr_timeout > left) { - timeout += dr_timeout - left; - timeout = min_t(time_t, obd->obd_recovery_time_hard, - timeout); - } - } else { - timeout = clamp_t(time_t, dr_timeout, obd->obd_recovery_timeout, - obd->obd_recovery_time_hard); - } + now = ktime_get_real_seconds(); + to = obd->obd_recovery_timeout; + end = obd->obd_recovery_start + to; + left = end - now; - if (timeout == obd->obd_recovery_time_hard) - CWARN("%s: extended recovery timer reached hard limit: %ld, extend: %d\n", - obd->obd_name, timeout, extend); - - if (obd->obd_recovery_timeout < timeout) { - ktime_t end, now; + if (extend && (drt > left)) { + to += drt - left; + } else if (!extend && (drt > to)) { + to = drt; + } - obd->obd_recovery_timeout = timeout; - end = ktime_set(obd->obd_recovery_start + timeout, 0); - now = ktime_set(ktime_get_seconds(), 0); - left_ns = ktime_sub(end, now); - hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS); - left = ktime_divns(left_ns, NSEC_PER_SEC); + if (to > obd->obd_recovery_time_hard) { + to = obd->obd_recovery_time_hard; + CWARN("%s: extended recovery timer reaching hard limit: %lld, extend: %d\n", + obd->obd_name, to, extend); } + + if (obd->obd_recovery_timeout < to) { + obd->obd_recovery_timeout = to; + end = obd->obd_recovery_start + to; + mod_timer(&obd->obd_recovery_timer, + cfs_time_shift(end - now)); + } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_HA, "%s: recovery timer will expire in %ld seconds\n", - obd->obd_name, left); + CDEBUG(D_HA, "%s: recovery timer will expire in %lld seconds\n", + obd->obd_name, (s64)(end - now)); } /* Reset the timer with each new client connection */ @@ -1851,45 +1808,40 @@ static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, * be extended to make sure the client could be reconnected, in the * process, the timeout from the new client should be ignored. */ + static void check_and_start_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req, - int new_client) + struct ptlrpc_request *req, + int new_client) { - timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg); - struct obd_device_target *obt = &obd->u.obt; + int service_time = lustre_msg_get_service_time(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; - if (!new_client && service_timeout) - /* - * Teach server about old server's estimates, as first guess - * at how long new requests will take. - */ + if (!new_client && service_time) + /* Teach server about old server's estimates, as first guess + * at how long new requests will take. */ at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, - service_timeout); + service_time); - target_start_recovery_timer(obd); + target_start_recovery_timer(obd); - /* - * Convert the service time to RPC timeout, - * and reuse service_timeout to limit stack usage. - */ - service_timeout = at_est2timeout(service_timeout); + /* Convert the service time to RPC timeout, + * and reuse service_time to limit stack usage. */ + service_time = at_est2timeout(service_time); if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && - service_timeout < at_extra) - service_timeout = at_extra; + service_time < at_extra) + service_time = at_extra; - /* - * We expect other clients to timeout within service_timeout, then try + /* We expect other clients to timeout within service_time, then try * to reconnect, then try the failover server. The max delay between - * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. 
- */ - service_timeout += 2 * INITIAL_CONNECT_TIMEOUT; + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. */ + service_time += 2 * INITIAL_CONNECT_TIMEOUT; - LASSERT(obt->obt_magic == OBT_MAGIC); - service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); - if (service_timeout > obd->obd_recovery_timeout && !new_client) - extend_recovery_timer(obd, service_timeout, false); + LASSERT(obt->obt_magic == OBT_MAGIC); + service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_time > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_time, false); } /** Health checking routines */ @@ -1961,10 +1913,9 @@ static int check_for_next_transno(struct lu_target *lut) queue_len = obd->obd_requests_queued_for_recovery; next_transno = obd->obd_next_recovery_transno; - CDEBUG(D_HA, - "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", - atomic_read(&obd->obd_max_recoverable_clients), - connected, completed, + CDEBUG(D_HA, "max: %d, connected: %d, completed: %d, queue_len: %d, " + "req_transno: %llu, next_transno: %llu\n", + obd->obd_max_recoverable_clients, connected, completed, queue_len, req_transno, next_transno); if (obd->obd_abort_recovery) { @@ -2036,24 +1987,6 @@ static int check_for_next_lock(struct lu_target *lut) return wake_up; } -static int check_update_llog(struct lu_target *lut) -{ - struct obd_device *obd = lut->lut_obd; - struct target_distribute_txn_data *tdtd = lut->lut_tdtd; - - if (obd->obd_abort_recovery) { - CDEBUG(D_HA, "waking for aborted recovery\n"); - return 1; - } - - if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { - CDEBUG(D_HA, "waking for completion of reading update log\n"); - return 1; - } - - return 0; -} - /** * wait for recovery events, * check its status with help of check_routine @@ -2077,7 +2010,7 @@ static int target_recovery_overseer(struct lu_target *lut, last = now; } } - if (obd->obd_recovery_start != 0 && ktime_get_seconds() >= + if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { __u64 next_update_transno = 0; @@ -2093,16 +2026,16 @@ static int target_recovery_overseer(struct lu_target *lut, * updatelog retrieve threads did not get any records * yet, let's wait those threads stopped */ if (next_update_transno == 0) { - spin_unlock(&obd->obd_recovery_task_lock); + struct l_wait_info lwi = { 0 }; - while (wait_event_timeout( - tdtd->tdtd_recovery_threads_waitq, - check_update_llog(lut), - cfs_time_seconds(60)) == 0); + l_wait_event(tdtd->tdtd_recovery_threads_waitq, + atomic_read( + &tdtd->tdtd_recovery_threads_count) == 0, + &lwi); - spin_lock(&obd->obd_recovery_task_lock); next_update_transno = - distribute_txn_get_next_transno(tdtd); + distribute_txn_get_next_transno( + lut->lut_tdtd); } } @@ -2155,7 +2088,6 @@ static int target_recovery_overseer(struct lu_target *lut, return 1; } else if (obd->obd_recovery_expired) { obd->obd_recovery_expired = 0; - /** If some clients died being recovered, evict them */ LCONSOLE_WARN("%s: recovery is timed out, " "evict stale exports\n", obd->obd_name); @@ -2246,41 +2178,34 @@ static void handle_recovery_req(struct ptlrpc_thread *thread, (void)handler(req); lu_context_exit(&thread->t_env->le_ctx); - req->rq_svc_thread->t_env->le_ses = NULL; - - /* don't reset timer for final stage */ - if (!exp_finished(req->rq_export)) { - timeout_t timeout = obd_timeout; + /* don't reset timer for final stage */ + if 
(!exp_finished(req->rq_export)) { + int to = obd_timeout; - /** - * Add request @timeout to the recovery time so next request from - * this client may come in recovery time - */ - if (!AT_OFF) { + /** + * Add request timeout to the recovery time so next request from + * this client may come in recovery time + */ + if (!AT_OFF) { struct ptlrpc_service_part *svcpt; - timeout_t est_timeout; svcpt = req->rq_rqbd->rqbd_svcpt; /* If the server sent early reply for this request, * the client will recalculate the timeout according to * current server estimate service time, so we will * use the maxium timeout here for waiting the client - * sending the next req - */ - est_timeout = at_get(&svcpt->scp_at_estimate); - timeout = max_t(timeout_t, at_est2timeout(est_timeout), - lustre_msg_get_timeout(req->rq_reqmsg)); - /* - * Add 2 net_latency, one for balance rq_deadline + * sending the next req */ + to = max((int)at_est2timeout( + at_get(&svcpt->scp_at_estimate)), + (int)lustre_msg_get_timeout(req->rq_reqmsg)); + /* Add 2 net_latency, one for balance rq_deadline * (see ptl_send_rpc), one for resend the req to server, * Note: client will pack net_latency in replay req - * (see ptlrpc_replay_req) - */ - timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg); - } - extend_recovery_timer(class_exp2obd(req->rq_export), timeout, - true); - } + * (see ptlrpc_replay_req) */ + to += 2 * lustre_msg_get_service_time(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), to, true); + } EXIT; } @@ -2290,17 +2215,15 @@ static int check_for_recovery_ready(struct lu_target *lut) struct obd_device *obd = lut->lut_obd; unsigned int clnts = atomic_read(&obd->obd_connected_clients); - CDEBUG(D_HA, - "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n", - clnts, obd->obd_stale_clients, - atomic_read(&obd->obd_max_recoverable_clients), - obd->obd_abort_recovery, obd->obd_recovery_expired); + CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d" + " abort %d expired %d\n", clnts, obd->obd_stale_clients, + obd->obd_max_recoverable_clients, obd->obd_abort_recovery, + obd->obd_recovery_expired); if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { - LASSERT(clnts <= - atomic_read(&obd->obd_max_recoverable_clients)); + LASSERT(clnts <= obd->obd_max_recoverable_clients); if (clnts + obd->obd_stale_clients < - atomic_read(&obd->obd_max_recoverable_clients)) + obd->obd_max_recoverable_clients) return 0; } @@ -2311,8 +2234,7 @@ static int check_for_recovery_ready(struct lu_target *lut) * timer expired, and some clients got evicted */ extend_recovery_timer(obd, obd->obd_recovery_timeout, true); - CDEBUG(D_HA, - "%s update recovery is not ready, extend recovery %lu\n", + CDEBUG(D_HA, "%s update recovery is not ready, extend recovery %llu\n", obd->obd_name, obd->obd_recovery_timeout); return 0; } @@ -2405,8 +2327,6 @@ static void drop_duplicate_replay_req(struct lu_env *env, obd->obd_replayed_requests++; } -#define WATCHDOG_TIMEOUT (obd_timeout * 10) - static void replay_request_or_update(struct lu_env *env, struct lu_target *lut, struct target_recovery_data *trd, @@ -2477,13 +2397,8 @@ static void replay_request_or_update(struct lu_env *env, lustre_msg_get_transno(req->rq_reqmsg), libcfs_nid2str(req->rq_peer.nid)); - thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, - NULL, NULL); handle_recovery_req(thread, req, trd->trd_recovery_handler); - lc_watchdog_delete(thread->t_watchdog); - thread->t_watchdog = NULL; - /** * bz18031: increase next_recovery_transno 
before * target_request_copy_put() will drop exp_rpc reference @@ -2503,11 +2418,7 @@ static void replay_request_or_update(struct lu_env *env, LASSERT(tdtd != NULL); dtrq = distribute_txn_get_next_req(tdtd); lu_context_enter(&thread->t_env->le_ctx); - thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, - NULL, NULL); rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); - lc_watchdog_delete(thread->t_watchdog); - thread->t_watchdog = NULL; lu_context_exit(&thread->t_env->le_ctx); extend_recovery_timer(obd, obd_timeout, true); @@ -2562,16 +2473,18 @@ static int target_recovery_thread(void *arg) if (thread == NULL) RETURN(-ENOMEM); - OBD_ALLOC_PTR(env); - if (env == NULL) - GOTO(out_thread, rc = -ENOMEM); - rc = lu_env_add(env); - if (rc) - GOTO(out_env, rc); + OBD_ALLOC_PTR(env); + if (env == NULL) { + OBD_FREE_PTR(thread); + RETURN(-ENOMEM); + } rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); - if (rc) - GOTO(out_env_remove, rc); + if (rc) { + OBD_FREE_PTR(thread); + OBD_FREE_PTR(env); + RETURN(rc); + } thread->t_env = env; thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ @@ -2613,11 +2526,6 @@ static int target_recovery_thread(void *arg) LASSERT(trd->trd_processing_task == current_pid()); DEBUG_REQ(D_HA, req, "processing lock from %s: ", libcfs_nid2str(req->rq_peer.nid)); - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) { - req->rq_status = -ENODEV; - target_request_copy_put(req); - continue; - } handle_recovery_req(thread, req, trd->trd_recovery_handler); target_request_copy_put(req); @@ -2668,12 +2576,8 @@ static int target_recovery_thread(void *arg) complete(&trd->trd_finishing); tgt_io_thread_done(thread); -out_env_remove: - lu_env_remove(env); -out_env: - OBD_FREE_PTR(env); -out_thread: OBD_FREE_PTR(thread); + OBD_FREE_PTR(env); RETURN(rc); } @@ -2730,20 +2634,17 @@ void target_recovery_fini(struct obd_device *obd) } EXPORT_SYMBOL(target_recovery_fini); -static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) +static void target_recovery_expired(cfs_timer_cb_arg_t data) { - struct obd_device *obd = container_of(timer, struct obd_device, - obd_recovery_timer); - - CDEBUG(D_HA, - "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", + struct obd_device *obd = cfs_from_timer(obd, data, obd_recovery_timer); + CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery" + " after %llus (%d clients connected)\n", obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), - ktime_get_real_seconds() - obd->obd_recovery_start, + (s64)(ktime_get_real_seconds() - obd->obd_recovery_start), atomic_read(&obd->obd_connected_clients)); obd->obd_recovery_expired = 1; wake_up(&obd->obd_next_transno_waitq); - return HRTIMER_NORESTART; } void target_recovery_init(struct lu_target *lut, svc_handler_t handler) @@ -2753,7 +2654,7 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) if (lut->lut_bottom->dd_rdonly) return; - if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + if (obd->obd_max_recoverable_clients == 0) { /** Update server last boot epoch */ tgt_boot_epoch_update(lut); return; @@ -2761,16 +2662,14 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " "last_transno %llu\n", obd->obd_name, - atomic_read(&obd->obd_max_recoverable_clients), - obd->obd_last_committed); - LASSERT(obd->obd_stopping == 0); - obd->obd_next_recovery_transno = obd->obd_last_committed + 1; - 
obd->obd_recovery_start = 0; - obd->obd_recovery_end = 0; - - hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - obd->obd_recovery_timer.function = &target_recovery_expired; + obd->obd_max_recoverable_clients, obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + cfs_timer_setup(&obd->obd_recovery_timer, target_recovery_expired, + (unsigned long)obd, 0); target_start_recovery_thread(lut, handler); } EXPORT_SYMBOL(target_recovery_init); @@ -2826,17 +2725,6 @@ int target_queue_recovery_request(struct ptlrpc_request *req, target_process_req_flags(obd, req); if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { - if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { - if (cfs_fail_val == 1) { - cfs_race_state = 1; - cfs_fail_val = 0; - wake_up(&cfs_race_waitq); - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); - } - } - /* client declares he's ready to complete recovery * so, we put the request on th final queue */ target_request_copy_get(req); @@ -2987,6 +2875,12 @@ int target_queue_recovery_request(struct ptlrpc_request *req, RETURN(0); } +int target_handle_ping(struct ptlrpc_request *req) +{ + obd_ping(req->rq_svc_thread->t_env, req->rq_export); + return req_capsule_server_pack(&req->rq_pill); +} + void target_committed_to_req(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -3278,10 +3172,10 @@ static inline const char *bulk2type(struct ptlrpc_request *req) int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, struct l_wait_info *lwi) { - struct ptlrpc_request *req = desc->bd_req; - time64_t start = ktime_get_seconds(); - time64_t deadline; - int rc = 0; + struct ptlrpc_request *req = desc->bd_req; + time_t start = cfs_time_current_sec(); + time_t deadline; + int rc = 0; ENTRY; @@ -3328,13 +3222,12 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, deadline = req->rq_deadline; do { - time64_t timeoutl = deadline - ktime_get_seconds(); - long timeout_jiffies = timeoutl <= 0 ? - 1 : cfs_time_seconds(timeoutl); - time64_t rq_deadline; + long timeoutl = deadline - cfs_time_current_sec(); + cfs_duration_t timeout = timeoutl <= 0 ? + CFS_TICK : cfs_time_seconds(timeoutl); + time_t rq_deadline; - *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, - cfs_time_seconds(1), + *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), target_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc) || @@ -3344,17 +3237,17 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed rq_deadline. 
*/ - rq_deadline = READ_ONCE(req->rq_deadline); + rq_deadline = ACCESS_ONCE(req->rq_deadline); deadline = start + bulk_timeout; if (deadline > rq_deadline) deadline = rq_deadline; - } while (rc == -ETIMEDOUT && - deadline > ktime_get_seconds()); + } while ((rc == -ETIMEDOUT) && + (deadline > cfs_time_current_sec())); if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds", bulk2type(req), deadline - start, - ktime_get_real_seconds() - deadline); + cfs_time_current_sec() - deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c index 42eccaf9cf861..df28b2d7b5131 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,9 +44,6 @@ #include "ldlm_internal.h" -struct kmem_cache *ldlm_glimpse_work_kmem; -EXPORT_SYMBOL(ldlm_glimpse_work_kmem); - /* lock types */ char *ldlm_lockname[] = { [0] = "--", @@ -125,6 +122,8 @@ const char *ldlm_it2str(enum ldlm_intent_flags it) return "getattr"; case IT_LOOKUP: return "lookup"; + case IT_UNLINK: + return "unlink"; case IT_GETXATTR: return "getxattr"; case IT_LAYOUT: @@ -151,19 +150,6 @@ ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) return ldlm_processing_policy_table[res->lr_type]; } EXPORT_SYMBOL(ldlm_get_processing_policy); - -static ldlm_reprocessing_policy ldlm_reprocessing_policy_table[] = { - [LDLM_PLAIN] = ldlm_reprocess_queue, - [LDLM_EXTENT] = ldlm_reprocess_queue, - [LDLM_FLOCK] = ldlm_reprocess_queue, - [LDLM_IBITS] = ldlm_reprocess_inodebits_queue, -}; - -ldlm_reprocessing_policy ldlm_get_reprocessing_policy(struct ldlm_resource *res) -{ - return ldlm_reprocessing_policy_table[res->lr_type]; -} - #endif /* HAVE_SERVER_SUPPORT */ void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) @@ -218,6 +204,8 @@ void ldlm_lock_put(struct ldlm_lock *lock) lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, LDLM_NSS_LOCKS); lu_ref_del(&res->lr_reference, "lock", lock); + ldlm_resource_putref(res); + lock->l_resource = NULL; if (lock->l_export) { class_export_lock_put(lock->l_export, lock); lock->l_export = NULL; @@ -226,15 +214,7 @@ void ldlm_lock_put(struct ldlm_lock *lock) if (lock->l_lvb_data != NULL) OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); - if (res->lr_type == LDLM_EXTENT) { - ldlm_interval_free(ldlm_interval_detach(lock)); - } else if (res->lr_type == LDLM_IBITS) { - if (lock->l_ibits_node != NULL) - OBD_SLAB_FREE_PTR(lock->l_ibits_node, - ldlm_inodebits_slab); - } - ldlm_resource_putref(res); - lock->l_resource = NULL; + ldlm_interval_free(ldlm_interval_detach(lock)); lu_ref_fini(&lock->l_reference); OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); } @@ -497,7 +477,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, LDLM_NSS_LOCKS); - INIT_LIST_HEAD_RCU(&lock->l_handle.h_link); + INIT_LIST_HEAD(&lock->l_handle.h_link); class_handle_hash(&lock->l_handle, &lock_handle_ops); 
lu_ref_init(&lock->l_reference); @@ -684,19 +664,12 @@ static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, * discard dirty data, rather than writing back. */ if (ldlm_is_ast_discard_data(new)) ldlm_set_discard_data(lock); - - /* Lock can be converted from a blocking state back to granted - * after lock convert or COS downgrade but still be in an - * older bl_list because it is controlled only by - * ldlm_work_bl_ast_lock(), let it be processed there. - */ - if (list_empty(&lock->l_bl_ast)) { - list_add(&lock->l_bl_ast, work_list); - LDLM_LOCK_GET(lock); - } - LASSERT(lock->l_blocking_lock == NULL); - lock->l_blocking_lock = LDLM_LOCK_GET(new); - } + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } } /** @@ -894,8 +867,7 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) } else if (ns_is_client(ns) && !lock->l_readers && !lock->l_writers && !ldlm_is_no_lru(lock) && - !ldlm_is_bl_ast(lock) && - !ldlm_is_converting(lock)) { + !ldlm_is_bl_ast(lock)) { LDLM_DEBUG(lock, "add lock into lru list"); @@ -1099,14 +1071,16 @@ static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, * Add a lock to granted list on a resource maintaining skiplist * correctness. */ -void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) { - struct sl_insert_point prev; + struct sl_insert_point prev; + ENTRY; - LASSERT(ldlm_is_granted(lock)); + LASSERT(lock->l_req_mode == lock->l_granted_mode); - search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); - ldlm_granted_list_add_lock(lock, &prev); + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); + EXIT; } /** @@ -1116,6 +1090,7 @@ void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) * NOTE: called by * - ldlm_lock_enqueue * - ldlm_reprocess_queue + * - ldlm_lock_convert * * must be called with lr_lock held */ @@ -1156,6 +1131,18 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) EXIT; } +/** + * Describe the overlap between two locks. itree_overlap_cb data. + */ +struct lock_match_data { + struct ldlm_lock *lmd_old; + struct ldlm_lock *lmd_lock; + enum ldlm_mode *lmd_mode; + union ldlm_policy_data *lmd_policy; + __u64 lmd_flags; + int lmd_unref; +}; + /** * Check if the given @lock meets the criteria for a match. * A reference on the lock is taken if matched. @@ -1163,10 +1150,10 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) * \param lock test-against this lock * \param data parameters */ -static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) +static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) { union ldlm_policy_data *lpol = &lock->l_policy_data; - enum ldlm_mode match = LCK_MINMODE; + enum ldlm_mode match; if (lock == data->lmd_old) return INTERVAL_ITER_STOP; @@ -1191,17 +1178,6 @@ static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) if (!(lock->l_req_mode & *data->lmd_mode)) return INTERVAL_ITER_CONT; - - /* When we search for ast_data, we are not doing a traditional match, - * so we don't worry about IBITS or extent matching. 
- */ - if (data->lmd_has_ast_data) { - if (!lock->l_ast_data) - return INTERVAL_ITER_CONT; - - goto matched; - } - match = lock->l_req_mode; switch (lock->l_resource->lr_type) { @@ -1235,11 +1211,6 @@ static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) return INTERVAL_ITER_CONT; - /* Filter locks by skipping flags */ - if (data->lmd_skip_flags & lock->l_flags) - return INTERVAL_ITER_CONT; - -matched: if (data->lmd_flags & LDLM_FL_TEST_LOCK) { LDLM_LOCK_GET(lock); ldlm_lock_touch_in_lru(lock); @@ -1256,7 +1227,7 @@ static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) static unsigned int itree_overlap_cb(struct interval_node *in, void *args) { struct ldlm_interval *node = to_ldlm_interval(in); - struct ldlm_match_data *data = args; + struct lock_match_data *data = args; struct ldlm_lock *lock; int rc; @@ -1276,8 +1247,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args) * * \retval a referenced lock or NULL. */ -struct ldlm_lock *search_itree(struct ldlm_resource *res, - struct ldlm_match_data *data) +static struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct lock_match_data *data) { struct interval_node_extent ext = { .start = data->lmd_policy->l_extent.start, @@ -1285,8 +1256,6 @@ struct ldlm_lock *search_itree(struct ldlm_resource *res, }; int idx; - data->lmd_lock = NULL; - for (idx = 0; idx < LCK_MODE_NUM; idx++) { struct ldlm_interval_tree *tree = &res->lr_itree[idx]; @@ -1298,13 +1267,9 @@ struct ldlm_lock *search_itree(struct ldlm_resource *res, interval_search(tree->lit_root, &ext, itree_overlap_cb, data); - if (data->lmd_lock) - return data->lmd_lock; } - - return NULL; + return data->lmd_lock; } -EXPORT_SYMBOL(search_itree); /** @@ -1316,19 +1281,16 @@ EXPORT_SYMBOL(search_itree); * \retval a referenced lock or NULL. */ static struct ldlm_lock *search_queue(struct list_head *queue, - struct ldlm_match_data *data) + struct lock_match_data *data) { struct ldlm_lock *lock; int rc; - data->lmd_lock = NULL; - list_for_each_entry(lock, queue, l_res_link) { rc = lock_matches(lock, data); if (rc == INTERVAL_ITER_STOP) return data->lmd_lock; } - return NULL; } @@ -1404,28 +1366,24 @@ EXPORT_SYMBOL(ldlm_lock_allow_match); * keep caller code unchanged), the context failure will be discovered by * caller sometime later. 
*/ -enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, - __u64 flags, __u64 skip_flags, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh, int unref) -{ - struct ldlm_match_data data = { - .lmd_old = NULL, - .lmd_lock = NULL, - .lmd_mode = &mode, - .lmd_policy = policy, - .lmd_flags = flags, - .lmd_skip_flags = skip_flags, - .lmd_unref = unref, - .lmd_has_ast_data = false, +enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, int unref) +{ + struct lock_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_unref = unref, }; struct ldlm_resource *res; struct ldlm_lock *lock; - int matched; - + int rc = 0; ENTRY; if (ns == NULL) { @@ -1446,78 +1404,101 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, LDLM_RESOURCE_ADDREF(res); lock_res(res); + if (res->lr_type == LDLM_EXTENT) lock = search_itree(res, &data); else lock = search_queue(&res->lr_granted, &data); - if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED)) - lock = search_queue(&res->lr_waiting, &data); - matched = lock ? mode : 0; - unlock_res(res); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); + if (lock != NULL) + GOTO(out, rc = 1); + if (flags & LDLM_FL_BLOCK_GRANTED) + GOTO(out, rc = 0); + lock = search_queue(&res->lr_converting, &data); + if (lock != NULL) + GOTO(out, rc = 1); + lock = search_queue(&res->lr_waiting, &data); + if (lock != NULL) + GOTO(out, rc = 1); + + EXIT; + out: + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); - if (lock) { - ldlm_lock2handle(lock, lockh); - if ((flags & LDLM_FL_LVB_READY) && + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && (!ldlm_is_lvb_ready(lock))) { __u64 wait_flags = LDLM_FL_LVB_READY | LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; - struct l_wait_info lwi; - - if (lock->l_completion_ast) { - int err = lock->l_completion_ast(lock, - LDLM_FL_WAIT_NOREPROC, - NULL); - if (err) - GOTO(out_fail_match, matched = 0); - } + struct l_wait_info lwi; + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, + mode); + rc = 0; + goto out2; + } + } - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), - NULL, LWI_ON_SIGNAL_NOOP, NULL); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ - l_wait_event(lock->l_waitq, lock->l_flags & wait_flags, + l_wait_event(lock->l_waitq, + lock->l_flags & wait_flags, &lwi); - if (!ldlm_is_lvb_ready(lock)) - GOTO(out_fail_match, matched = 0); - } - - /* check user's security context */ - if (lock->l_conn_export && - sptlrpc_import_check_ctx( - class_exp2cliimp(lock->l_conn_export))) - GOTO(out_fail_match, matched = 0); - + if (!ldlm_is_lvb_ready(lock)) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + } + } + out2: + if (rc) { LDLM_DEBUG(lock, "matched (%llu %llu)", - (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
- res_id->name[2] : policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[3] : policy->l_extent.end); - -out_fail_match: - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else if (!matched) - ldlm_lock_decref_internal(lock, mode); - } + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) { + if (!(flags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); - /* less verbose for test-only */ - if (!matched && !(flags & LDLM_FL_TEST_LOCK)) { - LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " "%llu/%llu (%llu %llu)", ns, - type, mode, res_id->name[0], res_id->name[1], - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] : policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[3] : policy->l_extent.end); - } + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] :policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } if (data.lmd_old != NULL) LDLM_LOCK_PUT(data.lmd_old); - return matched; + return rc ? mode : 0; } -EXPORT_SYMBOL(ldlm_lock_match_with_skip); +EXPORT_SYMBOL(ldlm_lock_match); enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, __u64 *bits) @@ -1688,18 +1669,11 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_glimpse_ast = cbs->lcs_glimpse; } - switch (type) { - case LDLM_EXTENT: - rc = ldlm_extent_alloc_lock(lock); - break; - case LDLM_IBITS: - rc = ldlm_inodebits_alloc_lock(lock); - break; - default: - rc = 0; - } - if (rc) - GOTO(out, rc); + lock->l_tree_node = NULL; + /* if this is the extent lock, allocate the interval tree node */ + if (type == LDLM_EXTENT) + if (ldlm_interval_alloc(lock) == NULL) + GOTO(out, rc = -ENOMEM); if (lvb_len) { lock->l_lvb_len = lvb_len; @@ -1720,30 +1694,6 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, RETURN(ERR_PTR(rc)); } -#ifdef HAVE_SERVER_SUPPORT -static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, - __u64 *flags) -{ - struct ldlm_resource *res = lock->l_resource; - enum ldlm_error rc = ELDLM_OK; - struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); - ldlm_processing_policy policy; - - ENTRY; - - policy = ldlm_get_processing_policy(res); - policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, &rpc_list); - if (rc == ELDLM_OK && lock->l_granted_mode != lock->l_req_mode && - res->lr_type != LDLM_FLOCK) - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list); - - if (!list_empty(&rpc_list)) - ldlm_discard_bl_list(&rpc_list); - - RETURN(rc); -} -#endif - /** * Enqueue (request) a lock. * @@ -1754,14 +1704,16 @@ static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, * set, skip all the enqueueing and delegate lock processing to intent policy * function. 
*/ -enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, - struct ldlm_namespace *ns, +enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, struct ldlm_lock **lockp, void *cookie, __u64 *flags) { struct ldlm_lock *lock = *lockp; struct ldlm_resource *res = lock->l_resource; int local = ns_is_client(ldlm_res_to_ns(res)); +#ifdef HAVE_SERVER_SUPPORT + ldlm_processing_policy policy; +#endif enum ldlm_error rc = ELDLM_OK; struct ldlm_interval *node = NULL; ENTRY; @@ -1769,8 +1721,8 @@ enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { - rc = ns->ns_policy(env, ns, lockp, cookie, lock->l_req_mode, - *flags, NULL); + rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, + NULL); if (rc == ELDLM_LOCK_REPLACED) { /* The lock that was returned has already been granted, * and placed into lockp. If it's not the same as the @@ -1783,7 +1735,7 @@ enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, *flags |= LDLM_FL_LOCK_CHANGED; RETURN(0); } else if (rc != ELDLM_OK && - ldlm_is_granted(lock)) { + lock->l_req_mode == lock->l_granted_mode) { LASSERT(*flags & LDLM_FL_RESENT); /* It may happen that ns_policy returns an error in * resend case, object may be unlinked or just some @@ -1806,7 +1758,7 @@ enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, * Take NO_TIMEOUT from the lock as it is inherited through * LDLM_FL_INHERIT_MASK */ *flags |= LDLM_FL_LOCK_CHANGED; - if (!ldlm_is_granted(lock)) + if (lock->l_req_mode != lock->l_granted_mode) *flags |= LDLM_FL_BLOCK_GRANTED; *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; RETURN(ELDLM_OK); @@ -1819,8 +1771,8 @@ enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); - lock_res_and_lock(lock); - if (local && ldlm_is_granted(lock)) { + lock_res_and_lock(lock); + if (local && lock->l_req_mode == lock->l_granted_mode) { /* The server returned a blocked lock, but it was granted * before we got a chance to actually enqueue it. We don't * need to do anything else. */ @@ -1861,27 +1813,33 @@ enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, * more or less trusting the clients not to lie. * * FIXME (bug 268): Detect obvious lies by checking compatibility in - * granted queue. */ + * granted/converting queues. 
*/ if (local) { - if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) - ldlm_resource_add_lock(res, &res->lr_waiting, lock); - else - ldlm_grant_lock(lock, NULL); + if (*flags & LDLM_FL_BLOCK_CONV) + ldlm_resource_add_lock(res, &res->lr_converting, lock); + else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); GOTO(out, rc = ELDLM_OK); #ifdef HAVE_SERVER_SUPPORT - } else if (*flags & LDLM_FL_REPLAY) { - if (*flags & LDLM_FL_BLOCK_WAIT) { - ldlm_resource_add_lock(res, &res->lr_waiting, lock); + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_CONV) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); GOTO(out, rc = ELDLM_OK); - } else if (*flags & LDLM_FL_BLOCK_GRANTED) { - ldlm_grant_lock(lock, NULL); + } else if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, &res->lr_waiting, lock); GOTO(out, rc = ELDLM_OK); - } - /* If no flags, fall through to normal enqueue path. */ - } + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock, NULL); + GOTO(out, rc = ELDLM_OK); + } + /* If no flags, fall through to normal enqueue path. */ + } - rc = ldlm_lock_enqueue_helper(lock, flags); - GOTO(out, rc); + policy = ldlm_processing_policy_table[res->lr_type]; + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, NULL); + GOTO(out, rc); #else } else { CERROR("This is client-side-only module, cannot handle " @@ -1906,42 +1864,31 @@ enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, */ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, struct list_head *work_list, - enum ldlm_process_intention intention, - struct ldlm_lock *hint) + enum ldlm_process_intention intention) { struct list_head *tmp, *pos; ldlm_processing_policy policy; __u64 flags; int rc = LDLM_ITER_CONTINUE; enum ldlm_error err; - struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); - ENTRY; check_res_locked(res); - policy = ldlm_get_processing_policy(res); + policy = ldlm_processing_policy_table[res->lr_type]; LASSERT(policy); LASSERT(intention == LDLM_PROCESS_RESCAN || intention == LDLM_PROCESS_RECOVERY); -restart: list_for_each_safe(tmp, pos, queue) { struct ldlm_lock *pending; - struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); pending = list_entry(tmp, struct ldlm_lock, l_res_link); CDEBUG(D_INFO, "Reprocessing lock %p\n", pending); flags = 0; - rc = policy(pending, &flags, intention, &err, &rpc_list); - if (pending->l_granted_mode == pending->l_req_mode || - res->lr_type == LDLM_FLOCK) { - list_splice(&rpc_list, work_list); - } else { - list_splice(&rpc_list, &bl_ast_list); - } + rc = policy(pending, &flags, intention, &err, work_list); /* * When this is called from recovery done, we always want * to scan the whole list no matter what 'rc' is returned. @@ -1951,20 +1898,6 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, break; } - if (!list_empty(&bl_ast_list)) { - unlock_res(res); - - rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, - LDLM_WORK_BL_AST); - - lock_res(res); - if (rc == -ERESTART) - GOTO(restart, rc); - } - - if (!list_empty(&bl_ast_list)) - ldlm_discard_bl_list(&bl_ast_list); - RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE); } @@ -1975,6 +1908,7 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, * \param[in] lock The lock to be enqueued. * \param[out] flags Lock flags for the lock to be enqueued. 
* \param[in] rpc_list Conflicting locks list. + * \param[in] grant_flags extra flags when granting a lock. * * \retval -ERESTART: Some lock was instantly canceled while sending * blocking ASTs, caller needs to re-check conflicting @@ -1983,7 +1917,7 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, * \reval 0: Lock is successfully added in waiting list. */ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, - struct list_head *rpc_list) + struct list_head *rpc_list, __u64 grant_flags) { struct ldlm_resource *res = lock->l_resource; int rc; @@ -2008,9 +1942,6 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, !ns_is_client(ldlm_res_to_ns(res))) class_fail_export(lock->l_export); - if (rc == -ERESTART) - ldlm_reprocess_all(res, NULL); - lock_res(res); if (rc == -ERESTART) { /* 15715: The lock was granted and destroyed after @@ -2022,7 +1953,7 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, RETURN(-EAGAIN); /* lock was granted while resource was unlocked. */ - if (ldlm_is_granted(lock)) { + if (lock->l_granted_mode == lock->l_req_mode) { /* bug 11300: if the lock has been granted, * break earlier because otherwise, we will go * to restart and ldlm_resource_unlink will be @@ -2030,10 +1961,12 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, * freed. Then we will fail at * ldlm_extent_add_lock() */ *flags &= ~LDLM_FL_BLOCKED_MASK; + RETURN(0); } + RETURN(rc); } - *flags |= LDLM_FL_BLOCK_GRANTED; + *flags |= (LDLM_FL_BLOCK_GRANTED | grant_flags); RETURN(0); } @@ -2046,21 +1979,27 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, */ void ldlm_discard_bl_list(struct list_head *bl_list) { - struct ldlm_lock *lock, *tmp; + struct list_head *tmp, *pos; + ENTRY; - ENTRY; + list_for_each_safe(pos, tmp, bl_list) { + struct ldlm_lock *lock = + list_entry(pos, struct ldlm_lock, l_bl_ast); - list_for_each_entry_safe(lock, tmp, bl_list, l_bl_ast) { - LASSERT(!list_empty(&lock->l_bl_ast)); list_del_init(&lock->l_bl_ast); + LASSERT(ldlm_is_ast_sent(lock)); ldlm_clear_ast_sent(lock); LASSERT(lock->l_bl_ast_run == 0); - ldlm_clear_blocking_lock(lock); + LASSERT(lock->l_blocking_lock); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; LDLM_LOCK_RELEASE(lock); } EXIT; } +#endif + /** * Process a call to blocking AST callback for a lock in ast_work list */ @@ -2068,11 +2007,9 @@ static int ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) { struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_lock *lock; - struct ldlm_lock_desc d; - struct ldlm_bl_desc bld; - int rc; - + struct ldlm_lock_desc d; + int rc; + struct ldlm_lock *lock; ENTRY; if (list_empty(arg->list)) @@ -2080,49 +2017,66 @@ ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); - /* nobody should touch l_bl_ast but some locks in the list may become - * granted after lock convert or COS downgrade, these locks should be - * just skipped here and removed from the list. - */ + /* nobody should touch l_bl_ast */ lock_res_and_lock(lock); list_del_init(&lock->l_bl_ast); - /* lock is not blocking lock anymore, but was kept in the list because - * it can managed only here. 
- */ - if (!ldlm_is_ast_sent(lock)) { - unlock_res_and_lock(lock); - LDLM_LOCK_RELEASE(lock); - RETURN(0); - } - - LASSERT(lock->l_blocking_lock); - ldlm_lock2desc(lock->l_blocking_lock, &d); - /* copy blocking lock ibits in cancel_bits as well, - * new client may use them for lock convert and it is - * important to use new field to convert locks from - * new servers only - */ - d.l_policy_data.l_inodebits.cancel_bits = - lock->l_blocking_lock->l_policy_data.l_inodebits.bits; - - /* Blocking lock is being destroyed here but some information about it - * may be needed inside l_blocking_ast() function below, - * e.g. in mdt_blocking_ast(). So save needed data in bl_desc. - */ - bld.bl_same_client = lock->l_client_cookie == - lock->l_blocking_lock->l_client_cookie; - bld.bl_cos_incompat = ldlm_is_cos_incompat(lock->l_blocking_lock); - arg->bl_desc = &bld; - LASSERT(ldlm_is_ast_sent(lock)); LASSERT(lock->l_bl_ast_run == 0); + LASSERT(lock->l_blocking_lock); lock->l_bl_ast_run++; - ldlm_clear_blocking_lock(lock); unlock_res_and_lock(lock); + ldlm_lock2desc(lock->l_blocking_lock, &d); + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + LDLM_LOCK_RELEASE(lock); + RETURN(rc); +} + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + int rc = 0; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. 
*/ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); LDLM_LOCK_RELEASE(lock); RETURN(rc); @@ -2187,57 +2141,9 @@ int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) rc = 1; LDLM_LOCK_RELEASE(lock); - if (gl_work->gl_flags & LDLM_GL_WORK_SLAB_ALLOCATED) - OBD_SLAB_FREE_PTR(gl_work, ldlm_glimpse_work_kmem); - else - OBD_FREE_PTR(gl_work); - - RETURN(rc); -} -#endif - -/** - * Process a call to completion AST callback for a lock in ast_work list - */ -static int -ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_lock *lock; - ldlm_completion_callback completion_callback; - int rc = 0; - - ENTRY; - - if (list_empty(arg->list)) - RETURN(-ENOENT); - - lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); - - /* It's possible to receive a completion AST before we've set - * the l_completion_ast pointer: either because the AST arrived - * before the reply, or simply because there's a small race - * window between receiving the reply and finishing the local - * enqueue. (bug 842) - * - * This can't happen with the blocking_ast, however, because we - * will never call the local blocking_ast until we drop our - * reader/writer reference, which we won't do until we get the - * reply and finish enqueueing. */ - /* nobody should touch l_cp_ast */ - lock_res_and_lock(lock); - list_del_init(&lock->l_cp_ast); - LASSERT(ldlm_is_cp_reqd(lock)); - /* save l_completion_ast since it can be changed by - * mds_intent_policy(), see bug 14225 */ - completion_callback = lock->l_completion_ast; - ldlm_clear_cp_reqd(lock); - unlock_res_and_lock(lock); - - if (completion_callback != NULL) - rc = completion_callback(lock, 0, (void *)arg); - LDLM_LOCK_RELEASE(lock); + if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + OBD_FREE_PTR(gl_work); RETURN(rc); } @@ -2249,11 +2155,11 @@ ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) * one. 
*/ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - ldlm_desc_ast_t ast_type) + ldlm_desc_ast_t ast_type) { struct ldlm_cb_set_arg *arg; - set_producer_func work_ast_lock; - int rc; + set_producer_func work_ast_lock; + int rc; if (list_empty(rpc_list)) RETURN(0); @@ -2266,26 +2172,24 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, arg->list = rpc_list; switch (ast_type) { - case LDLM_WORK_CP_AST: - arg->type = LDLM_CP_CALLBACK; - work_ast_lock = ldlm_work_cp_ast_lock; - break; -#ifdef HAVE_SERVER_SUPPORT - case LDLM_WORK_BL_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_bl_ast_lock; - break; - case LDLM_WORK_REVOKE_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_revoke_ast_lock; - break; - case LDLM_WORK_GL_AST: - arg->type = LDLM_GL_CALLBACK; - work_ast_lock = ldlm_work_gl_ast_lock; - break; -#endif - default: - LBUG(); + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; + default: + LBUG(); } /* We create a ptlrpc request set with flow control extension. @@ -2297,7 +2201,7 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, if (arg->set == NULL) GOTO(out, rc = -ENOMEM); - ptlrpc_set_wait(NULL, arg->set); + ptlrpc_set_wait(arg->set); ptlrpc_set_destroy(arg->set); rc = atomic_read(&arg->restart) ? -ERESTART : 0; @@ -2310,29 +2214,26 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, /** * Try to grant all waiting locks on a resource. * - * Calls ldlm_reprocess_queue on waiting queue. + * Calls ldlm_reprocess_queue on converting and waiting queues. * * Typically called after some resource locks are cancelled to see * if anything could be granted as a result of the cancellation. */ static void __ldlm_reprocess_all(struct ldlm_resource *res, - enum ldlm_process_intention intention, - struct ldlm_lock *hint) + enum ldlm_process_intention intention) { struct list_head rpc_list; #ifdef HAVE_SERVER_SUPPORT - ldlm_reprocessing_policy reprocess; struct obd_device *obd; - int rc; - - ENTRY; + int rc; + ENTRY; INIT_LIST_HEAD(&rpc_list); - /* Local lock trees don't get reprocessed. */ - if (ns_is_client(ldlm_res_to_ns(res))) { - EXIT; - return; - } + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } /* Disable reprocess during lock replay stage but allow during * request replay stage. 
@@ -2343,32 +2244,35 @@ static void __ldlm_reprocess_all(struct ldlm_resource *res, RETURN_EXIT; restart: lock_res(res); - reprocess = ldlm_get_reprocessing_policy(res); - reprocess(res, &res->lr_waiting, &rpc_list, intention, hint); + rc = ldlm_reprocess_queue(res, &res->lr_converting, &rpc_list, + intention); + if (rc == LDLM_ITER_CONTINUE) + ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, + intention); unlock_res(res); - rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, - LDLM_WORK_CP_AST); - if (rc == -ERESTART) { + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { LASSERT(list_empty(&rpc_list)); - goto restart; - } + goto restart; + } #else - ENTRY; + ENTRY; INIT_LIST_HEAD(&rpc_list); - if (!ns_is_client(ldlm_res_to_ns(res))) { - CERROR("This is client-side-only module, cannot handle " - "LDLM_NAMESPACE_SERVER resource type lock.\n"); - LBUG(); - } + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } #endif - EXIT; + EXIT; } -void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint) +void ldlm_reprocess_all(struct ldlm_resource *res) { - __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN, hint); + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN); } EXPORT_SYMBOL(ldlm_reprocess_all); @@ -2378,7 +2282,7 @@ static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct ldlm_resource *res = cfs_hash_object(hs, hnode); /* This is only called once after recovery done. LU-8306. */ - __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY, NULL); + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY); return 0; } @@ -2460,7 +2364,6 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) * talking to me first. 
-phik */ if (lock->l_readers || lock->l_writers) { LDLM_ERROR(lock, "lock still has references"); - unlock_res_and_lock(lock); LBUG(); } @@ -2478,8 +2381,8 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) ldlm_resource_unlink_lock(lock); ldlm_lock_destroy_nolock(lock); - if (ldlm_is_granted(lock)) - ldlm_pool_del(&ns->ns_pool, lock); + if (lock->l_granted_mode == lock->l_req_mode) + ldlm_pool_del(&ns->ns_pool, lock); /* Make sure we will not be called again for same lock what is possible * if not to zero out lock->l_granted_mode */ @@ -2511,7 +2414,6 @@ int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) EXPORT_SYMBOL(ldlm_lock_set_data); struct export_cl_data { - const struct lu_env *ecl_env; struct obd_export *ecl_exp; int ecl_loop; }; @@ -2524,10 +2426,10 @@ static void ldlm_cancel_lock_for_export(struct obd_export *exp, res = ldlm_resource_getref(lock->l_resource); - ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_res_lvbo_update(res, NULL, 1); ldlm_lock_cancel(lock); if (!exp->exp_obd->obd_stopping) - ldlm_reprocess_all(res, lock); + ldlm_reprocess_all(res); ldlm_resource_putref(res); ecl->ecl_loop++; @@ -2564,17 +2466,10 @@ ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, */ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) { - struct lu_env env; struct export_cl_data ecl = { .ecl_exp = exp, .ecl_loop = 0, }; - int rc; - - rc = lu_env_init(&env, LCT_DT_THREAD); - if (rc) - RETURN(rc); - ecl.ecl_env = &env; while (!list_empty(&exp->exp_bl_list)) { struct ldlm_lock *lock; @@ -2597,8 +2492,6 @@ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) LDLM_LOCK_RELEASE(lock); } - lu_env_fini(&env); - CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " "left on hash table %d.\n", exp, ecl.ecl_loop, atomic_read(&exp->exp_lock_hash->hs_count)); @@ -2613,16 +2506,10 @@ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) */ int ldlm_export_cancel_locks(struct obd_export *exp) { - struct export_cl_data ecl; - struct lu_env env; - int rc; - - rc = lu_env_init(&env, LCT_DT_THREAD); - if (rc) - RETURN(rc); - ecl.ecl_env = &env; - ecl.ecl_exp = exp; - ecl.ecl_loop = 0; + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; cfs_hash_for_each_empty(exp->exp_lock_hash, ldlm_cancel_locks_for_export_cb, &ecl); @@ -2636,35 +2523,26 @@ int ldlm_export_cancel_locks(struct obd_export *exp) exp->exp_obd->obd_stopping) ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); - lu_env_fini(&env); - return ecl.ecl_loop; } /** - * Downgrade an PW/EX lock to COS | CR mode. + * Downgrade an exclusive lock. * - * A lock mode convertion from PW/EX mode to less conflict mode. The + * A fast variant of ldlm_lock_convert for convertion of exclusive locks. The * convertion may fail if lock was canceled before downgrade, but it doesn't * indicate any problem, because such lock has no reader or writer, and will * be released soon. - * - * Used by Commit on Sharing (COS) code to force object changes commit in case - * of conflict. Converted lock is considered as new lock and all blocking AST - * things are cleared, so any pending or new blocked lock on that lock will - * cause new call to blocking_ast and force resource object commit. - * - * Also used by layout_change to replace EX lock to CR lock. + * Used by Commit on Sharing (COS) code. 
* * \param lock A lock to convert * \param new_mode new lock mode */ -void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) +void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) { -#ifdef HAVE_SERVER_SUPPORT ENTRY; - LASSERT(new_mode == LCK_COS || new_mode == LCK_CR); + LASSERT(new_mode == LCK_COS); lock_res_and_lock(lock); @@ -2682,22 +2560,146 @@ void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) * ldlm_grant_lock() called below. */ ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); - - /* Consider downgraded lock as a new lock and clear all states - * related to a previous blocking AST processing. - */ - ldlm_clear_blocking_data(lock); - lock->l_req_mode = new_mode; ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); - ldlm_reprocess_all(lock->l_resource, lock); + ldlm_reprocess_all(lock->l_resource); EXIT; +} +EXPORT_SYMBOL(ldlm_lock_downgrade); + +/** + * Attempt to convert already granted lock to a different mode. + * + * While lock conversion is not currently used, future client-side + * optimizations could take advantage of it to avoid discarding cached + * pages on a file. + */ +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, + enum ldlm_mode new_mode, __u32 *flags) +{ + struct list_head rpc_list; + struct ldlm_resource *res; + struct ldlm_namespace *ns; + int granted = 0; +#ifdef HAVE_SERVER_SUPPORT + int old_mode; + struct sl_insert_point prev; +#endif + struct ldlm_interval *node; + ENTRY; + + INIT_LIST_HEAD(&rpc_list); + /* Just return if mode is unchanged. */ + if (new_mode == lock->l_granted_mode) { + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(lock->l_resource); + } + + /* I can't check the type of lock here because the bitlock of lock + * is not held here, so do the allocation blindly. -jay */ + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); + if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ + RETURN(NULL); + + LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), + "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + +#ifdef HAVE_SERVER_SUPPORT + old_mode = lock->l_req_mode; #endif + lock->l_req_mode = new_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { +#ifdef HAVE_SERVER_SUPPORT + /* remember the lock position where the lock might be + * added back to the granted list later and also + * remember the join mode for skiplist fixing. */ + prev.res_link = lock->l_res_link.prev; + prev.mode_link = lock->l_sl_mode.prev; + prev.policy_link = lock->l_sl_policy.prev; +#endif + ldlm_resource_unlink_lock(lock); + } else { + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT) { + /* FIXME: ugly code, I have to attach the lock to a + * interval node again since perhaps it will be granted + * soon */ + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + } + + /* + * Remove old lock from the pool before adding the lock with new + * mode below in ->policy() + */ + ldlm_pool_del(&ns->ns_pool, lock); + + /* If this is a local resource, put it on the appropriate list. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + } else { + /* This should never happen, because of the way the + * server handles conversions. 
*/ + LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", + *flags); + LBUG(); + + ldlm_grant_lock(lock, &rpc_list); + granted = 1; + /* FIXME: completion handling not with lr_lock held ! */ + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + } +#ifdef HAVE_SERVER_SUPPORT + } else { + int rc; + enum ldlm_error err; + __u64 pflags = 0; + ldlm_processing_policy policy; + + policy = ldlm_processing_policy_table[res->lr_type]; + rc = policy(lock, &pflags, LDLM_PROCESS_RESCAN, &err, + &rpc_list); + if (rc == LDLM_ITER_STOP) { + lock->l_req_mode = old_mode; + if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else + ldlm_granted_list_add_lock(lock, &prev); + + res = NULL; + } else { + *flags |= LDLM_FL_BLOCK_GRANTED; + granted = 1; + } + } +#else + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } +#endif + unlock_res_and_lock(lock); + + if (granted) + ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + RETURN(res); } -EXPORT_SYMBOL(ldlm_lock_mode_downgrade); /** * Print lock with lock handle \a lockh description into debug log. @@ -2747,17 +2749,17 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, va_start(args, fmt); if (exp && exp->exp_connection) { - nid = obd_export_nid2str(exp); + nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); } else if (exp && exp->exp_obd != NULL) { struct obd_import *imp = exp->exp_obd->u.cli.cl_import; - nid = obd_import_nid2str(imp); + nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); } if (resource == NULL) { libcfs_debug_vmsg2(msgdata, fmt, args, " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: \?\? rrc=\?\? type: \?\?\? 
flags: %#llx nid: %s " - "remote: %#llx expref: %d pid: %u timeout: %lld " + "remote: %#llx expref: %d pid: %u timeout: %lu " "lvb_type: %d\n", lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), @@ -2777,7 +2779,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] " "(req %llu->%llu) flags: %#llx nid: %s remote: " - "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -2800,7 +2802,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s pid: %d " "[%llu->%llu] flags: %#llx nid: %s " - "remote: %#llx expref: %d pid: %u timeout: %lld\n", + "remote: %#llx expref: %d pid: %u timeout: %lu\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -2820,9 +2822,9 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, case LDLM_IBITS: libcfs_debug_vmsg2(msgdata, fmt, args, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " - "res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s " + "res: "DLDLMRES" bits %#llx rrc: %d type: %s " "flags: %#llx nid: %s remote: %#llx expref: %d " - "pid: %u timeout: %lld lvb_type: %d\n", + "pid: %u timeout: %lu lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), @@ -2831,7 +2833,6 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, ldlm_lockname[lock->l_req_mode], PLDLMRES(resource), lock->l_policy_data.l_inodebits.bits, - lock->l_policy_data.l_inodebits.try_bits, atomic_read(&resource->lr_refcount), ldlm_typename[resource->lr_type], lock->l_flags, nid, lock->l_remote_handle.cookie, @@ -2845,7 +2846,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s flags: %#llx " "nid: %s remote: %#llx expref: %d pid: %u " - "timeout: %lld lvb_type: %d\n", + "timeout: %lu lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c index ac7a9910e4d45..465ffda035dbe 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include "ldlm_internal.h" @@ -49,11 +49,6 @@ static int ldlm_num_threads; module_param(ldlm_num_threads, int, 0444); MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); -static unsigned int ldlm_cpu_bind = 1; -module_param(ldlm_cpu_bind, uint, 0444); -MODULE_PARM_DESC(ldlm_cpu_bind, - "bind DLM service threads to particular CPU partitions"); - static char *ldlm_cpts; module_param(ldlm_cpts, charp, 0444); MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); @@ -69,16 +64,18 @@ struct kset *ldlm_svc_kset; static struct ldlm_state *ldlm_state; -/* timeout for initial callback (AST) reply (bz10399) - * Due to having to send a 32 bit time value over the - * wire return it as timeout_t instead of time64_t - */ -static inline timeout_t ldlm_get_rq_timeout(void) +static inline cfs_time_t round_timeout(cfs_time_t timeout) +{ + return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) { - /* Non-AT value */ - timeout_t timeout = min(ldlm_timeout, obd_timeout / 3); + /* Non-AT value */ + unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); - return timeout < 1 ? 1 : timeout; + return timeout < 1 ? 1 : timeout; } struct ldlm_bl_pool { @@ -136,7 +133,7 @@ static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ * All access to it should be under waiting_locks_spinlock. */ static LIST_HEAD(waiting_locks_list); -static void waiting_locks_callback(TIMER_DATA_TYPE unused); +static void waiting_locks_callback(cfs_timer_cb_arg_t unused); static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); enum elt_state { @@ -150,10 +147,6 @@ static enum elt_state expired_lock_thread_state = ELT_STOPPED; static int expired_lock_dump; static LIST_HEAD(expired_lock_list); -static int ldlm_lock_busy(struct ldlm_lock *lock); -static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout); -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds); - static inline int have_expired_locks(void) { int need_to_run; @@ -235,30 +228,14 @@ static int expired_lock_main(void *arg) export = class_export_lock_get(lock->l_export, lock); spin_unlock_bh(&waiting_locks_spinlock); - /* Check if we need to prolong timeout */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && - lock->l_callback_timeout != 0 && /* not AST error */ - ldlm_lock_busy(lock)) { - LDLM_DEBUG(lock, "prolong the busy lock"); - lock_res_and_lock(lock); - ldlm_add_waiting_lock(lock, - ldlm_bl_timeout(lock) >> 1); - unlock_res_and_lock(lock); - } else { - spin_lock_bh(&export->exp_bl_list_lock); - list_del_init(&lock->l_exp_list); - spin_unlock_bh(&export->exp_bl_list_lock); - - LDLM_ERROR(lock, - "lock callback timer expired after %llds: evicting client at %s ", - ktime_get_real_seconds() - - lock->l_blast_sent, - obd_export_nid2str(export)); - ldlm_lock_to_ns(lock)->ns_timeouts++; - do_dump++; - class_fail_export(export); - } + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + do_dump++; + class_fail_export(export); class_export_lock_put(export, lock); + /* release extra ref grabbed by ldlm_add_waiting_lock() * or ldlm_failed_ast() */ LDLM_LOCK_RELEASE(lock); @@ -281,6 +258,9 @@ static int 
expired_lock_main(void *arg) RETURN(0); } +static int ldlm_add_waiting_lock(struct ldlm_lock *lock); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds); + /** * Check if there is a request in the export request list * which prevents the lock canceling. @@ -294,7 +274,7 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) if (lock->l_export == NULL) return 0; - spin_lock(&lock->l_export->exp_rpc_lock); + spin_lock_bh(&lock->l_export->exp_rpc_lock); list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, rq_exp_list) { if (req->rq_ops->hpreq_lock_match) { @@ -303,12 +283,12 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) break; } } - spin_unlock(&lock->l_export->exp_rpc_lock); + spin_unlock_bh(&lock->l_export->exp_rpc_lock); RETURN(match); } /* This is called from within a timer interrupt and cannot schedule */ -static void waiting_locks_callback(TIMER_DATA_TYPE unused) +static void waiting_locks_callback(cfs_timer_cb_arg_t unused) { struct ldlm_lock *lock; int need_dump = 0; @@ -316,10 +296,42 @@ static void waiting_locks_callback(TIMER_DATA_TYPE unused) spin_lock_bh(&waiting_locks_spinlock); while (!list_empty(&waiting_locks_list)) { lock = list_entry(waiting_locks_list.next, struct ldlm_lock, - l_pending_chain); - if (lock->l_callback_timeout > ktime_get_seconds() || - lock->l_req_mode == LCK_GROUP) - break; + l_pending_chain); + if (cfs_time_after(lock->l_callback_timeout, + cfs_time_current()) || + (lock->l_req_mode == LCK_GROUP)) + break; + + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + ldlm_lock_busy(lock)) { + int cont = 1; + + if (lock->l_pending_chain.next == &waiting_locks_list) + cont = 0; + + LDLM_LOCK_GET(lock); + + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "prolong the busy lock"); + ldlm_refresh_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + spin_lock_bh(&waiting_locks_spinlock); + + if (!cont) { + LDLM_LOCK_RELEASE(lock); + break; + } + + LDLM_LOCK_RELEASE(lock); + continue; + } + ldlm_lock_to_ns(lock)->ns_timeouts++; + LDLM_ERROR(lock, "lock callback timer expired after %llds: " + "evicting client at %s ", + ktime_get_real_seconds() - lock->l_blast_sent, + libcfs_nid2str( + lock->l_export->exp_connection->c_peer.nid)); /* no needs to take an extra ref on the lock since it was in * the waiting_locks_list and ldlm_add_waiting_lock() @@ -336,18 +348,17 @@ static void waiting_locks_callback(TIMER_DATA_TYPE unused) wake_up(&expired_lock_wait_queue); } - /* - * Make sure the timer will fire again if we have any locks - * left. - */ + /* + * Make sure the timer will fire again if we have any locks + * left. + */ if (!list_empty(&waiting_locks_list)) { - unsigned long timeout_jiffies; - + cfs_time_t timeout_rounded; lock = list_entry(waiting_locks_list.next, struct ldlm_lock, - l_pending_chain); - timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); - mod_timer(&waiting_locks_timer, timeout_jiffies); - } + l_pending_chain); + timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout); + mod_timer(&waiting_locks_timer, timeout_rounded); + } spin_unlock_bh(&waiting_locks_spinlock); } @@ -363,10 +374,10 @@ static void waiting_locks_callback(TIMER_DATA_TYPE unused) * * Called with the namespace lock held. 
*/ -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds) +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) { - unsigned long timeout_jiffies; - time64_t timeout; + cfs_time_t timeout; + cfs_time_t timeout_rounded; if (!list_empty(&lock->l_pending_chain)) return 0; @@ -375,29 +386,28 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) seconds = 1; - timeout = ktime_get_seconds() + seconds; - if (likely(timeout > lock->l_callback_timeout)) + timeout = cfs_time_shift(seconds); + if (likely(cfs_time_after(timeout, lock->l_callback_timeout))) lock->l_callback_timeout = timeout; - timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); - - if (time_before(timeout_jiffies, waiting_locks_timer.expires) || - !timer_pending(&waiting_locks_timer)) - mod_timer(&waiting_locks_timer, timeout_jiffies); + timeout_rounded = round_timeout(lock->l_callback_timeout); - /* if the new lock has a shorter timeout than something earlier on - * the list, we'll wait the longer amount of time; no big deal. - */ - /* FIFO */ + if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) { + mod_timer(&waiting_locks_timer, timeout_rounded); + } + /* if the new lock has a shorter timeout than something earlier on + the list, we'll wait the longer amount of time; no big deal. */ + /* FIFO */ list_add_tail(&lock->l_pending_chain, &waiting_locks_list); - return 1; + return 1; } static void ldlm_add_blocked_lock(struct ldlm_lock *lock) { spin_lock_bh(&lock->l_export->exp_bl_list_lock); if (list_empty(&lock->l_exp_list)) { - if (!ldlm_is_granted(lock)) + if (lock->l_granted_mode != lock->l_req_mode) list_add_tail(&lock->l_exp_list, &lock->l_export->exp_bl_list); else @@ -415,9 +425,10 @@ static void ldlm_add_blocked_lock(struct ldlm_lock *lock) obd_stale_export_adjust(lock->l_export); } -static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout) +static int ldlm_add_waiting_lock(struct ldlm_lock *lock) { int ret; + int timeout = ldlm_bl_timeout(lock); /* NB: must be called with hold of lock_res_and_lock() */ LASSERT(ldlm_is_res_locked(lock)); @@ -436,12 +447,12 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout) } if (ldlm_is_destroyed(lock)) { - static time64_t next; + static cfs_time_t next; spin_unlock_bh(&waiting_locks_spinlock); LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)"); - if (ktime_get_seconds() > next) { - next = ktime_get_seconds() + 14400; + if (cfs_time_after(cfs_time_current(), next)) { + next = cfs_time_shift(14400); libcfs_debug_dumpstack(NULL); } return 0; @@ -460,7 +471,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout) if (ret) ldlm_add_blocked_lock(lock); - LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)", + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)", ret == 0 ? "not re-" : "", timeout, AT_OFF ? 
"off" : "on"); return ret; @@ -490,11 +501,10 @@ static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) del_timer(&waiting_locks_timer); } else { struct ldlm_lock *next; - next = list_entry(list_next, struct ldlm_lock, - l_pending_chain); + l_pending_chain); mod_timer(&waiting_locks_timer, - cfs_time_seconds(next->l_callback_timeout)); + round_timeout(next->l_callback_timeout)); } } list_del_init(&lock->l_pending_chain); @@ -537,7 +547,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) * * Called with namespace lock held. */ -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) { if (lock->l_export == NULL) { /* We don't have a "waiting locks list" on clients. */ @@ -577,7 +587,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) RETURN(0); } -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) { RETURN(0); } @@ -595,9 +605,9 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) * * \retval timeout in seconds to wait for the client reply */ -time64_t ldlm_bl_timeout(struct ldlm_lock *lock) +unsigned int ldlm_bl_timeout(struct ldlm_lock *lock) { - time64_t timeout; + unsigned int timeout; if (AT_OFF) return obd_timeout / 2; @@ -607,7 +617,7 @@ time64_t ldlm_bl_timeout(struct ldlm_lock *lock) * It would be nice to have some kind of "early reply" mechanism for * lock callbacks too... */ timeout = at_get(&lock->l_export->exp_bl_lock_at); - return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min); + return max(timeout + (timeout >> 1), ldlm_enqueue_min); } EXPORT_SYMBOL(ldlm_bl_timeout); @@ -629,7 +639,6 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, /* the lock was not in any list, grab an extra ref before adding * the lock to the expired list */ LDLM_LOCK_GET(lock); - lock->l_callback_timeout = 0; /* differentiate it from expired locks */ list_add(&lock->l_pending_chain, &expired_lock_list); wake_up(&expired_lock_wait_queue); spin_unlock_bh(&waiting_locks_spinlock); @@ -645,7 +654,14 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, struct lnet_process_id peer = req->rq_import->imp_connection->c_peer; if (!req->rq_replied || (rc && rc != -EINVAL)) { - if (ldlm_is_cancel(lock)) { + if (lock->l_export && lock->l_export->exp_libclient) { + LDLM_DEBUG(lock, + "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock", + ast_type, req, req->rq_xid, + libcfs_nid2str(peer.nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else if (ldlm_is_cancel(lock)) { LDLM_DEBUG(lock, "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", ast_type, req, req->rq_xid, @@ -697,7 +713,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, /* update lvbo to return proper attributes. 
* see bug 23174 */ ldlm_resource_getref(res); - ldlm_lvbo_update(res, lock, NULL, 1); + ldlm_res_lvbo_update(res, NULL, 1); ldlm_resource_putref(res); } ldlm_lock_cancel(lock); @@ -708,9 +724,9 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, } static int ldlm_cb_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *args, int rc) + struct ptlrpc_request *req, void *data, int rc) { - struct ldlm_cb_async_args *ca = args; + struct ldlm_cb_async_args *ca = data; struct ldlm_lock *lock = ca->ca_lock; struct ldlm_cb_set_arg *arg = ca->ca_set_arg; ENTRY; @@ -728,16 +744,15 @@ static int ldlm_cb_interpret(const struct lu_env *env, * -ELDLM_NO_LOCK_DATA when inode is cleared. LU-274 */ if (unlikely(arg->gl_interpret_reply)) { - rc = arg->gl_interpret_reply(NULL, req, args, rc); + rc = arg->gl_interpret_reply(env, req, data, rc); } else if (rc == -ELDLM_NO_LOCK_DATA) { - LDLM_DEBUG(lock, - "lost race - client has a lock but no inode"); - ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); + LDLM_DEBUG(lock, "lost race - client has a lock but no " + "inode"); + ldlm_res_lvbo_update(lock->l_resource, NULL, 1); } else if (rc != 0) { rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); } else { - rc = ldlm_lvbo_update(lock->l_resource, - lock, req, 1); + rc = ldlm_res_lvbo_update(lock->l_resource, req, 1); } break; case LDLM_BL_CALLBACK: @@ -765,8 +780,8 @@ static int ldlm_cb_interpret(const struct lu_env *env, static void ldlm_update_resend(struct ptlrpc_request *req, void *data) { - struct ldlm_cb_async_args *ca = data; - struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); } @@ -806,7 +821,7 @@ static void ldlm_lock_reorder_req(struct ldlm_lock *lock) RETURN_EXIT; } - spin_lock(&lock->l_export->exp_rpc_lock); + spin_lock_bh(&lock->l_export->exp_rpc_lock); list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, rq_exp_list) { /* Do not process requests that were not yet added to there @@ -820,7 +835,7 @@ static void ldlm_lock_reorder_req(struct ldlm_lock *lock) req->rq_ops->hpreq_lock_match(req, lock)) ptlrpc_nrs_req_hp_move(req); } - spin_unlock(&lock->l_export->exp_rpc_lock); + spin_unlock_bh(&lock->l_export->exp_rpc_lock); EXIT; } @@ -859,18 +874,18 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, ldlm_lock_reorder_req(lock); - req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, - &RQF_LDLM_BL_CALLBACK, - LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); - if (req == NULL) - RETURN(-ENOMEM); + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); - CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); - ca = ptlrpc_req_async_args(req); - ca->ca_set_arg = arg; - ca->ca_lock = lock; + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; - req->rq_interpret_reply = ldlm_cb_interpret; + req->rq_interpret_reply = ldlm_cb_interpret; lock_res_and_lock(lock); if (ldlm_is_destroyed(lock)) { @@ -880,7 +895,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, RETURN(0); } - if (!ldlm_is_granted(lock)) { + if (lock->l_granted_mode != lock->l_req_mode) { /* this blocking AST will be communicated as part of the * completion AST instead */ ldlm_add_blocked_lock(lock); @@ -910,8 +925,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, req->rq_no_resend = 
1; } else { - LASSERT(ldlm_is_granted(lock)); - ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + LASSERT(lock->l_granted_mode == lock->l_req_mode); + ldlm_add_waiting_lock(lock); unlock_res_and_lock(lock); /* Do not resend after lock callback timeout */ @@ -975,25 +990,26 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) lvb_len = 0; req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); - rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } - CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); - ca = ptlrpc_req_async_args(req); - ca->ca_set_arg = arg; - ca->ca_lock = lock; + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; - req->rq_interpret_reply = ldlm_cb_interpret; - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[0] = lock->l_remote_handle; body->lock_flags = ldlm_flags_to_wire(flags); ldlm_lock2desc(lock, &body->lock_desc); if (lvb_len > 0) { void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); + lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); if (lvb_len < 0) { /* We still need to send the RPC to wake up the blocked @@ -1044,7 +1060,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) lock_res_and_lock(lock); } else { /* start the lock-timeout clock */ - ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); + ldlm_add_waiting_lock(lock); /* Do not resend after lock callback timeout */ req->rq_delay_limit = ldlm_bl_timeout(lock); req->rq_resend_cb = ldlm_update_resend; @@ -1082,7 +1098,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) if (arg->gl_desc != NULL) /* There is a glimpse descriptor to pack */ - req_fmt = &RQF_LDLM_GL_CALLBACK_DESC; + req_fmt = &RQF_LDLM_GL_DESC_CALLBACK; else req_fmt = &RQF_LDLM_GL_CALLBACK; @@ -1100,9 +1116,9 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) *desc = *arg->gl_desc; } - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; - ldlm_lock2desc(lock, &body->lock_desc); + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); ca = ptlrpc_req_async_args(req); @@ -1130,7 +1146,6 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) RETURN(rc); } -EXPORT_SYMBOL(ldlm_server_glimpse_ast); int ldlm_glimpse_locks(struct ldlm_resource *res, struct list_head *gl_work_list) @@ -1141,7 +1156,7 @@ int ldlm_glimpse_locks(struct ldlm_resource *res, rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, LDLM_WORK_GL_AST); if (rc == -ERESTART) - ldlm_reprocess_all(res, NULL); + ldlm_reprocess_all(res); RETURN(rc); } @@ -1163,6 +1178,40 @@ struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) } EXPORT_SYMBOL(ldlm_request_lock); +static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + switch (lock_type) { + 
case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) + op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE; + else + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op) + lprocfs_counter_incr(srv_stats, op); + + return; +} + /** * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc * service threads to carry out client lock enqueueing requests. @@ -1179,7 +1228,6 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, void *cookie = NULL; int rc = 0; struct ldlm_resource *res = NULL; - const struct lu_env *env = req->rq_svc_thread->t_env; ENTRY; LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); @@ -1189,9 +1237,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LASSERT(req->rq_export); - /* for intent enqueue the stat will be updated inside intent policy */ - if (ptlrpc_req2svc(req)->srv_stats != NULL && - !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT)) + if (ptlrpc_req2svc(req)->srv_stats != NULL) ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); if (req->rq_export && req->rq_export->exp_nid_stats && @@ -1295,11 +1341,9 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, &lock->l_policy_data); if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) lock->l_req_extent = lock->l_policy_data.l_extent; - else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) - lock->l_policy_data.l_inodebits.try_bits = - dlm_req->lock_desc.l_policy_data.l_inodebits.try_bits; existing_lock: + if (flags & LDLM_FL_HAS_INTENT) { /* In this case, the reply buffer is allocated deep in * local_lock_enqueue by the policy function. 
*/ @@ -1311,25 +1355,25 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, ldlm_lvbo_size(lock)); - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) - GOTO(out, rc = -ENOMEM); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - GOTO(out, rc); - } + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + } - err = ldlm_lock_enqueue(env, ns, &lock, cookie, &flags); + err = ldlm_lock_enqueue(ns, &lock, cookie, &flags); if (err) { if ((int)err < 0) rc = (int)err; GOTO(out, err); } - dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - ldlm_lock2desc(lock, &dlm_rep->lock_desc); - ldlm_lock2handle(lock, &dlm_rep->lock_handle); + ldlm_lock2desc(lock, &dlm_rep->lock_desc); + ldlm_lock2handle(lock, &dlm_rep->lock_handle); if (lock && lock->l_resource->lr_type == LDLM_EXTENT) OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6); @@ -1351,24 +1395,8 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); rc = -ENOTCONN; } else if (ldlm_is_ast_sent(lock)) { - /* fill lock desc for possible lock convert */ - if (lock->l_blocking_lock && - lock->l_resource->lr_type == LDLM_IBITS) { - struct ldlm_lock *bl_lock = lock->l_blocking_lock; - struct ldlm_lock_desc *rep_desc = &dlm_rep->lock_desc; - - LDLM_DEBUG(lock, - "save blocking bits %llx in granted lock", - bl_lock->l_policy_data.l_inodebits.bits); - /* If lock is blocked then save blocking ibits - * in returned lock policy for the possible lock - * convert on a client. - */ - rep_desc->l_policy_data.l_inodebits.cancel_bits = - bl_lock->l_policy_data.l_inodebits.bits; - } dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); - if (ldlm_is_granted(lock)) { + if (lock->l_granted_mode == lock->l_req_mode) { /* * Only cancel lock if it was granted, because it would * be destroyed immediately and would never be granted @@ -1380,15 +1408,38 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, unlock_res_and_lock(lock); ldlm_lock_cancel(lock); lock_res_and_lock(lock); - } else { - ldlm_add_waiting_lock(lock, - ldlm_bl_timeout(lock)); + } else + ldlm_add_waiting_lock(lock); + } + } + /* Make sure we never ever grant usual metadata locks to liblustre + clients */ + if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN || + dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) && + req->rq_export->exp_libclient) { + if (unlikely(!ldlm_is_cancel_on_block(lock) || + !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){ + CERROR("Granting sync lock to libclient. 
" + "req fl %d, rep fl %d, lock fl %#llx\n", + dlm_req->lock_flags, dlm_rep->lock_flags, + lock->l_flags); + LDLM_ERROR(lock, "sync lock"); + if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) { + struct ldlm_intent *it; + + it = req_capsule_client_get(&req->rq_pill, + &RMF_LDLM_INTENT); + if (it != NULL) { + CERROR("This is intent %s (%llu)\n", + ldlm_it2str(it->opc), it->opc); + } } } } - unlock_res_and_lock(lock); - EXIT; + unlock_res_and_lock(lock); + + EXIT; out: req->rq_status = rc ?: err; /* return either error - bug 11190 */ if (!req->rq_packed_final) { @@ -1471,126 +1522,114 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, } } - if (!err && !ldlm_is_cbpending(lock) && - dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) - ldlm_reprocess_all(lock->l_resource, lock); + if (!err && dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource); - LDLM_LOCK_RELEASE(lock); - } + LDLM_LOCK_RELEASE(lock); + } - LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", - lock, rc); + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); return rc; } -/* Clear the blocking lock, the race is possible between ldlm_handle_convert0() - * and ldlm_work_bl_ast_lock(), so this is done under lock with check for NULL. +/** + * Old-style LDLM main entry point for server code enqueue. */ -void ldlm_clear_blocking_lock(struct ldlm_lock *lock) +int ldlm_handle_enqueue(struct ptlrpc_request *req, + ldlm_completion_callback completion_callback, + ldlm_blocking_callback blocking_callback, + ldlm_glimpse_callback glimpse_callback) { - if (lock->l_blocking_lock) { - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; - } -} + struct ldlm_request *dlm_req; + struct ldlm_callback_suite cbs = { + .lcs_completion = completion_callback, + .lcs_blocking = blocking_callback, + .lcs_glimpse = glimpse_callback + }; + int rc; -/* A lock can be converted to new ibits or mode and should be considered - * as new lock. Clear all states related to a previous blocking AST - * processing so new conflicts will cause new blocking ASTs. - * - * This is used during lock convert below and lock downgrade to COS mode in - * ldlm_lock_mode_downgrade(). - */ -void ldlm_clear_blocking_data(struct ldlm_lock *lock) -{ - ldlm_clear_ast_sent(lock); - lock->l_bl_ast_run = 0; - ldlm_clear_blocking_lock(lock); + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + rc = ldlm_handle_enqueue0(req->rq_export->exp_obd->obd_namespace, + req, dlm_req, &cbs); + } else { + rc = -EFAULT; + } + return rc; } /** * Main LDLM entry point for server code to process lock conversion requests. 
*/ int ldlm_handle_convert0(struct ptlrpc_request *req, - const struct ldlm_request *dlm_req) + const struct ldlm_request *dlm_req) { - struct obd_export *exp = req->rq_export; - struct ldlm_reply *dlm_rep; - struct ldlm_lock *lock; - __u64 bits; - __u64 new_bits; - int rc; - - ENTRY; - - if (exp && exp->exp_nid_stats && exp->exp_nid_stats->nid_ldlm_stats) - lprocfs_counter_incr(exp->exp_nid_stats->nid_ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC); - - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); - - dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - dlm_rep->lock_flags = dlm_req->lock_flags; + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + int rc; + ENTRY; - lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); - if (!lock) { - LDLM_DEBUG_NOLOCK("server lock is canceled already"); - req->rq_status = ELDLM_NO_LOCK_DATA; - RETURN(0); - } + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); - LDLM_DEBUG(lock, "server-side convert handler START"); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); - lock_res_and_lock(lock); - bits = lock->l_policy_data.l_inodebits.bits; - new_bits = dlm_req->lock_desc.l_policy_data.l_inodebits.bits; + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; - if (ldlm_is_cancel(lock)) { - LDLM_DEBUG(lock, "convert on canceled lock!"); - unlock_res_and_lock(lock); - GOTO(out_put, rc = ELDLM_NO_LOCK_DATA); - } + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + req->rq_status = LUSTRE_EINVAL; + } else { + void *res = NULL; - if (dlm_req->lock_desc.l_req_mode != lock->l_granted_mode) { - LDLM_ERROR(lock, "lock mode differs!"); - unlock_res_and_lock(lock); - GOTO(out_put, rc = -EPROTO); - } + LDLM_DEBUG(lock, "server-side convert handler START"); - if (bits == new_bits) { - /* - * This can be valid situation if CONVERT RPCs are - * re-ordered. Just finish silently - */ - LDLM_DEBUG(lock, "lock is converted already!"); - unlock_res_and_lock(lock); - } else { - if (ldlm_is_waited(lock)) - ldlm_del_waiting_lock(lock); + res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, + &dlm_rep->lock_flags); + if (res) { + if (ldlm_del_waiting_lock(lock)) + LDLM_DEBUG(lock, "converted waiting lock"); + req->rq_status = 0; + } else { + req->rq_status = LUSTRE_EDEADLK; + } + } - ldlm_clear_cbpending(lock); - lock->l_policy_data.l_inodebits.cancel_bits = 0; - ldlm_inodebits_drop(lock, bits & ~new_bits); + if (lock) { + if (!req->rq_status) + ldlm_reprocess_all(lock->l_resource); + LDLM_DEBUG(lock, "server-side convert handler END"); + LDLM_LOCK_PUT(lock); + } else + LDLM_DEBUG_NOLOCK("server-side convert handler END"); - ldlm_clear_blocking_data(lock); - unlock_res_and_lock(lock); + RETURN(0); +} - ldlm_reprocess_all(lock->l_resource, NULL); - } +/** + * Old-style main LDLM entry point for server code to process lock conversion + * requests. 
+ */ +int ldlm_handle_convert(struct ptlrpc_request *req) +{ + int rc; + struct ldlm_request *dlm_req; - dlm_rep->lock_handle = lock->l_remote_handle; - ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, - &dlm_rep->lock_desc.l_policy_data); - rc = ELDLM_OK; - EXIT; -out_put: - LDLM_DEBUG(lock, "server-side convert handler END, rc = %d", rc); - LDLM_LOCK_PUT(lock); - req->rq_status = rc; - return 0; + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req != NULL) { + rc = ldlm_handle_convert0(req, dlm_req); + } else { + CERROR ("Can't unpack dlm_req\n"); + rc = -EFAULT; + } + return rc; } /** @@ -1603,22 +1642,14 @@ int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, int first, enum lustre_at_flags flags) { - struct ldlm_resource *res, *pres = NULL; - struct ldlm_lock *lock; - int i, count, done = 0; - unsigned int size; - - ENTRY; - - size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); - if (size <= offsetof(struct ldlm_request, lock_handle) || - (size - offsetof(struct ldlm_request, lock_handle)) / - sizeof(struct lustre_handle) < dlm_req->lock_count) - RETURN(0); + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + ENTRY; - count = dlm_req->lock_count ? dlm_req->lock_count : 1; - if (first >= count) - RETURN(0); + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); if (count == 1 && dlm_req->lock_handle[0].cookie == 0) RETURN(0); @@ -1645,24 +1676,20 @@ int ldlm_request_cancel(struct ptlrpc_request *req, /* This code is an optimization to only attempt lock * granting on the resource (that could be CPU-expensive) - * after we are done cancelling lock in that resource. - */ - if (res != pres) { - if (pres != NULL) { - ldlm_reprocess_all(pres, NULL); - LDLM_RESOURCE_DELREF(pres); - ldlm_resource_putref(pres); - } - if (res != NULL) { - ldlm_resource_getref(res); - LDLM_RESOURCE_ADDREF(res); - - if (!ldlm_is_discard_data(lock)) - ldlm_lvbo_update(res, lock, - NULL, 1); - } - pres = res; - } + * after we are done cancelling lock in that resource. 
*/ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + ldlm_res_lvbo_update(res, NULL, 1); + } + pres = res; + } if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && lock->l_blast_sent != 0) { @@ -1672,16 +1699,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req, (s64)delay); at_measured(&lock->l_export->exp_bl_lock_at, delay); } - ldlm_lock_cancel(lock); - LDLM_LOCK_PUT(lock); - } - if (pres != NULL) { - ldlm_reprocess_all(pres, NULL); - LDLM_RESOURCE_DELREF(pres); - ldlm_resource_putref(pres); - } - LDLM_DEBUG_NOLOCK("server-side cancel handler END"); - RETURN(done); + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); } EXPORT_SYMBOL(ldlm_request_cancel); @@ -1702,18 +1729,14 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) RETURN(-EFAULT); } - if (req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) < - offsetof(struct ldlm_request, lock_handle[1])) - RETURN(-EPROTO); - - if (req->rq_export && req->rq_export->exp_nid_stats && - req->rq_export->exp_nid_stats->nid_ldlm_stats) - lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC); + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) req->rq_status = LUSTRE_ESTALE; @@ -1722,62 +1745,20 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) } #endif /* HAVE_SERVER_SUPPORT */ -/** - * Server may pass additional information about blocking lock. - * For IBITS locks it is conflicting bits which can be used for - * lock convert instead of cancel. - */ -void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - check_res_locked(lock->l_resource); - if (ns_is_client(ns) && ld && - (lock->l_resource->lr_type == LDLM_IBITS)) { - /* - * Lock description contains policy of blocking lock, - * and its cancel_bits is used to pass conflicting bits. - * NOTE: ld can be NULL or can be not NULL but zeroed if - * passed from ldlm_bl_thread_blwi(), check below used bits - * in ld to make sure it is valid description. - */ - if (ld->l_policy_data.l_inodebits.cancel_bits && - ldlm_res_eq(&ld->l_resource.lr_name, - &lock->l_resource->lr_name) && - !(ldlm_is_cbpending(lock) && - lock->l_policy_data.l_inodebits.cancel_bits == 0)) { - /* always combine conflicting ibits */ - lock->l_policy_data.l_inodebits.cancel_bits |= - ld->l_policy_data.l_inodebits.cancel_bits; - } else { - /* If cancel_bits are not obtained or - * if the lock is already CBPENDING and - * has no cancel_bits set - * - the full lock is to be cancelled - */ - lock->l_policy_data.l_inodebits.cancel_bits = 0; - } - } -} - /** * Callback handler for receiving incoming blocking ASTs. * * This can only happen on client side. 
*/ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, struct ldlm_lock *lock) + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) { - int do_ast; - - ENTRY; + int do_ast; + ENTRY; - LDLM_DEBUG(lock, "client blocking AST callback handler"); + LDLM_DEBUG(lock, "client blocking AST callback handler"); - lock_res_and_lock(lock); - - /* get extra information from desc if any */ - ldlm_bl_desc2lock(ld, lock); + lock_res_and_lock(lock); ldlm_set_cbpending(lock); if (ldlm_is_cancel_on_block(lock)) @@ -1802,26 +1783,12 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, EXIT; } -static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) -{ - if (req->rq_no_reply) - return 0; - - req->rq_status = rc; - if (!req->rq_packed_final) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - return ptlrpc_reply(req); -} - /** * Callback handler for receiving incoming completion ASTs. * * This only can happen on client side. */ -static int ldlm_handle_cp_callback(struct ptlrpc_request *req, +static void ldlm_handle_cp_callback(struct ptlrpc_request *req, struct ldlm_namespace *ns, struct ldlm_request *dlm_req, struct ldlm_lock *lock) @@ -1835,14 +1802,11 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req, INIT_LIST_HEAD(&ast_list); if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { - long to = cfs_time_seconds(1); - - ldlm_callback_reply(req, 0); - + int to = cfs_time_seconds(1); while (to > 0) { set_current_state(TASK_INTERRUPTIBLE); - to = schedule_timeout(to); - if (ldlm_is_granted(lock) || + schedule_timeout(to); + if (lock->l_granted_mode == lock->l_req_mode || ldlm_is_destroyed(lock)) break; } @@ -1868,29 +1832,8 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req, } lock_res_and_lock(lock); - - if (!ldlm_res_eq(&dlm_req->lock_desc.l_resource.lr_name, - &lock->l_resource->lr_name)) { - ldlm_resource_unlink_lock(lock); - unlock_res_and_lock(lock); - rc = ldlm_lock_change_resource(ns, lock, - &dlm_req->lock_desc.l_resource.lr_name); - if (rc < 0) { - LDLM_ERROR(lock, "Failed to allocate resource"); - GOTO(out, rc); - } - LDLM_DEBUG(lock, "completion AST, new resource"); - lock_res_and_lock(lock); - } - - if (ldlm_is_failed(lock)) { - unlock_res_and_lock(lock); - LDLM_LOCK_RELEASE(lock); - RETURN(-EINVAL); - } - if (ldlm_is_destroyed(lock) || - ldlm_is_granted(lock)) { + lock->l_granted_mode == lock->l_req_mode) { /* bug 11300: the lock has already been granted */ unlock_res_and_lock(lock); LDLM_DEBUG(lock, "Double grant race happened"); @@ -1912,15 +1855,26 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req, LDLM_DEBUG(lock, "completion AST, new policy data"); } - ldlm_resource_unlink_lock(lock); + ldlm_resource_unlink_lock(lock); + if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) != 0) { + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + CERROR("change resource!\n"); + lock_res_and_lock(lock); + } - if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { - /* - * BL_AST locks are not needed in LRU. - * Let ldlm_cancel_lru() be fast. - */ - ldlm_lock_remove_from_lru(lock); - ldlm_bl_desc2lock(&dlm_req->lock_desc, lock); + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* BL_AST locks are not needed in LRU. 
+ * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; LDLM_DEBUG(lock, "completion AST includes blocking AST"); } @@ -1957,8 +1911,6 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req, wake_up(&lock->l_waitq); } LDLM_LOCK_RELEASE(lock); - - return 0; } /** @@ -1973,12 +1925,10 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, struct ldlm_request *dlm_req, struct ldlm_lock *lock) { - struct ldlm_lock_desc *ld = &dlm_req->lock_desc; - int rc = -ENOSYS; - - ENTRY; + int rc = -ENOSYS; + ENTRY; - LDLM_DEBUG(lock, "client glimpse AST callback handler"); + LDLM_DEBUG(lock, "client glimpse AST callback handler"); if (lock->l_glimpse_ast != NULL) rc = lock->l_glimpse_ast(lock, req); @@ -1995,17 +1945,10 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, !lock->l_readers && !lock->l_writers && ktime_after(ktime_get(), ktime_add(lock->l_last_used, - ktime_set(ns->ns_dirty_age_limit, 0)))) { - unlock_res_and_lock(lock); - - /* For MDS glimpse it is always DOM lock, set corresponding - * cancel_bits to perform lock convert if needed - */ - if (lock->l_resource->lr_type == LDLM_IBITS) - ld->l_policy_data.l_inodebits.cancel_bits = - MDS_INODELOCK_DOM; - if (ldlm_bl_to_thread_lock(ns, ld, lock)) - ldlm_handle_bl_callback(ns, ld, lock); + ktime_set(10, 0)))) { + unlock_res_and_lock(lock); + if (ldlm_bl_to_thread_lock(ns, NULL, lock)) + ldlm_handle_bl_callback(ns, NULL, lock); EXIT; return; @@ -2015,6 +1958,20 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, EXIT; } +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, enum ldlm_cancel_flags cancel_flags) { @@ -2237,6 +2194,35 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) rc = ldlm_handle_setinfo(req); ldlm_callback_reply(req, rc); RETURN(0); + case LLOG_ORIGIN_HANDLE_CREATE: + req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_open(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_NEXT_BLOCK: + req_capsule_set(&req->rq_pill, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_next_block(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_READ_HEADER: + req_capsule_set(&req->rq_pill, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_read_header(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_CLOSE: + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_close(req); + ldlm_callback_reply(req, rc); + RETURN(0); default: CERROR("unknown opcode %u\n", lustre_msg_get_opc(req->rq_reqmsg)); @@ -2321,31 +2307,30 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) CDEBUG(D_INODE, "blocking ast\n"); req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); if (!ldlm_is_cancel_on_block(lock)) { - rc = ldlm_callback_reply(req, 0); - if (req->rq_no_reply || rc) - ldlm_callback_errmsg(req, "Normal process", rc, - &dlm_req->lock_handle[0]); - } - if (ldlm_bl_to_thread_lock(ns, 
&dlm_req->lock_desc, lock)) - ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); - break; - case LDLM_CP_CALLBACK: - CDEBUG(D_INODE, "completion ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); - if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) - ldlm_callback_reply(req, rc); - break; - case LDLM_GL_CALLBACK: - CDEBUG(D_INODE, "glimpse ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); - ldlm_handle_gl_callback(req, ns, dlm_req, lock); - break; - default: - LBUG(); /* checked above */ - } + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + ldlm_callback_reply(req, 0); + ldlm_handle_cp_callback(req, ns, dlm_req, lock); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } - RETURN(0); + RETURN(0); } #ifdef HAVE_SERVER_SUPPORT @@ -2356,169 +2341,145 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) */ static int ldlm_cancel_handler(struct ptlrpc_request *req) { - int rc; + int rc; + ENTRY; - ENTRY; + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ - /* Requests arrive in sender's byte order. The ptlrpc service - * handler has already checked and, if necessary, byte-swapped the - * incoming request message body, but I am responsible for the - * message buffers. 
*/ - - req_capsule_init(&req->rq_pill, req, RCL_SERVER); - - if (req->rq_export == NULL) { - struct ldlm_request *dlm_req; - - CERROR("%s from %s arrived at %llu with bad export cookie %llu\n", - ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), - libcfs_nid2str(req->rq_peer.nid), - (unsigned long long)req->rq_arrival_time.tv_sec, - lustre_msg_get_handle(req->rq_reqmsg)->cookie); - - if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - dlm_req = req_capsule_client_get(&req->rq_pill, - &RMF_DLM_REQ); - if (dlm_req != NULL) - ldlm_lock_dump_handle(D_ERROR, - &dlm_req->lock_handle[0]); - } - ldlm_callback_reply(req, -ENOTCONN); - RETURN(0); - } + req_capsule_init(&req->rq_pill, req, RCL_SERVER); - switch (lustre_msg_get_opc(req->rq_reqmsg)) { - /* XXX FIXME move this back to mds/handler.c, bug 249 */ - case LDLM_CANCEL: - req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); - CDEBUG(D_INODE, "cancel\n"); + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %lu with bad export cookie " + "%llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + + /* XXX FIXME move this back to mds/handler.c, bug 249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) RETURN(0); - rc = ldlm_handle_cancel(req); - break; - case LDLM_CONVERT: - { - struct ldlm_request *dlm_req; - - req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); - CDEBUG(D_INODE, "convert\n"); - - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) { - CDEBUG(D_INFO, "bad request buffer for cancel\n"); - rc = ldlm_callback_reply(req, -EPROTO); - } else { - req->rq_status = ldlm_handle_convert0(req, dlm_req); - rc = ptlrpc_reply(req); - } - break; - } - default: - CERROR("invalid opcode %d\n", - lustre_msg_get_opc(req->rq_reqmsg)); - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - rc = ldlm_callback_reply(req, -EINVAL); - } + rc = ldlm_handle_cancel(req); + if (rc) + break; + RETURN(0); + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + ldlm_callback_reply(req, -EINVAL); + } - RETURN(rc); + RETURN(0); } static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, - struct ldlm_lock *lock) + struct ldlm_lock *lock) { - struct ldlm_request *dlm_req; - struct lustre_handle lockh; - int rc = 0; - int i; - - ENTRY; + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; + ENTRY; - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) - RETURN(0); + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); - ldlm_lock2handle(lock, &lockh); - for (i = 0; i < dlm_req->lock_count; i++) { - if (lustre_handle_equal(&dlm_req->lock_handle[i], - 
&lockh)) { - DEBUG_REQ(D_RPCTRACE, req, + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, "Prio raised by lock %#llx.", lockh.cookie); - rc = 1; - break; - } - } - RETURN(rc); + rc = 1; + break; + } + } + + RETURN(rc); + } static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) { - struct ldlm_request *dlm_req; - int rc = 0; - int i; - unsigned int size; - - ENTRY; - - /* no prolong in recovery */ - if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) - RETURN(0); + struct ldlm_request *dlm_req; + int rc = 0; + int i; + ENTRY; - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) - RETURN(-EFAULT); + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); - size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); - if (size <= offsetof(struct ldlm_request, lock_handle) || - (size - offsetof(struct ldlm_request, lock_handle)) / - sizeof(struct lustre_handle) < dlm_req->lock_count) - RETURN(-EPROTO); + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); - for (i = 0; i < dlm_req->lock_count; i++) { - struct ldlm_lock *lock; + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; - lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); - if (lock == NULL) - continue; + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; rc = ldlm_is_ast_sent(lock) ? 1 : 0; - if (rc) - LDLM_DEBUG(lock, "hpreq cancel/convert lock"); - LDLM_LOCK_PUT(lock); + if (rc) + LDLM_DEBUG(lock, "hpreq cancel lock"); + LDLM_LOCK_PUT(lock); - if (rc) - break; - } + if (rc) + break; + } - RETURN(rc); + RETURN(rc); } static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { - .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, .hpreq_check = ldlm_cancel_hpreq_check, .hpreq_fini = NULL, }; static int ldlm_hpreq_handler(struct ptlrpc_request *req) { - ENTRY; + ENTRY; - req_capsule_init(&req->rq_pill, req, RCL_SERVER); + req_capsule_init(&req->rq_pill, req, RCL_SERVER); - if (req->rq_export == NULL) - RETURN(0); + if (req->rq_export == NULL) + RETURN(0); - if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); - req->rq_ops = &ldlm_cancel_hpreq_ops; - } else if (LDLM_CONVERT == lustre_msg_get_opc(req->rq_reqmsg)) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); - req->rq_ops = &ldlm_cancel_hpreq_ops; - } - RETURN(0); + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); } static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, @@ -2530,10 +2491,10 @@ static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, lock_res_and_lock(lock); - if (!ldlm_is_granted(lock)) { - unlock_res_and_lock(lock); - return 0; - } + if (lock->l_req_mode != lock->l_granted_mode) { + unlock_res_and_lock(lock); + return 0; + } LASSERT(lock->l_resource); if (lock->l_resource->lr_type != LDLM_IBITS && @@ -2765,22 +2726,9 @@ static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, */ static int ldlm_bl_thread_main(void *arg) { - struct lu_env *env; - struct ldlm_bl_pool *blp; + struct ldlm_bl_pool *blp; struct ldlm_bl_thread_data *bltd = arg; - int rc; - - ENTRY; - - OBD_ALLOC_PTR(env); 
- if (!env) - RETURN(-ENOMEM); - rc = lu_env_init(env, LCT_DT_THREAD); - if (rc) - GOTO(out_env, rc); - rc = lu_env_add(env); - if (rc) - GOTO(out_env_fini, rc); + ENTRY; blp = bltd->bltd_blp; @@ -2824,13 +2772,7 @@ static int ldlm_bl_thread_main(void *arg) atomic_dec(&blp->blp_num_threads); complete(&blp->blp_comp); - - lu_env_remove(env); -out_env_fini: - lu_env_fini(env); -out_env: - OBD_FREE_PTR(env); - RETURN(rc); + RETURN(0); } @@ -3031,7 +2973,7 @@ static int ldlm_setup(void) if (ldlm_state == NULL) RETURN(-ENOMEM); - ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj); + ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj); if (!ldlm_kobj) GOTO(out, -ENOMEM); @@ -3047,9 +2989,11 @@ static int ldlm_setup(void) if (!ldlm_svc_kset) GOTO(out, -ENOMEM); - rc = ldlm_debugfs_setup(); +#ifdef CONFIG_PROC_FS + rc = ldlm_proc_setup(); if (rc != 0) GOTO(out, rc); +#endif /* CONFIG_PROC_FS */ memset(&conf, 0, sizeof(conf)); conf = (typeof(conf)) { @@ -3070,20 +3014,18 @@ static int ldlm_setup(void) .tc_nthrs_base = LDLM_NTHRS_BASE, .tc_nthrs_max = LDLM_NTHRS_MAX, .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_bind = ldlm_cpu_bind, + .tc_cpu_affinity = 1, .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, }, .psc_cpt = { .cc_pattern = ldlm_cpts, - .cc_affinity = true, }, .psc_ops = { .so_req_handler = ldlm_callback_handler, }, }; ldlm_state->ldlm_cb_service = \ - ptlrpc_register_service(&conf, ldlm_svc_kset, - ldlm_svc_debugfs_dir); + ptlrpc_register_service(&conf, ldlm_svc_proc_dir); if (IS_ERR(ldlm_state->ldlm_cb_service)) { CERROR("failed to start service\n"); rc = PTR_ERR(ldlm_state->ldlm_cb_service); @@ -3112,14 +3054,13 @@ static int ldlm_setup(void) .tc_nthrs_base = LDLM_NTHRS_BASE, .tc_nthrs_max = LDLM_NTHRS_MAX, .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_bind = ldlm_cpu_bind, + .tc_cpu_affinity = 1, .tc_ctx_tags = LCT_MD_THREAD | \ LCT_DT_THREAD | \ LCT_CL_THREAD, }, .psc_cpt = { .cc_pattern = ldlm_cpts, - .cc_affinity = true, }, .psc_ops = { .so_req_handler = ldlm_cancel_handler, @@ -3127,8 +3068,7 @@ static int ldlm_setup(void) }, }; ldlm_state->ldlm_cancel_service = \ - ptlrpc_register_service(&conf, ldlm_svc_kset, - ldlm_svc_debugfs_dir); + ptlrpc_register_service(&conf, ldlm_svc_proc_dir); if (IS_ERR(ldlm_state->ldlm_cancel_service)) { CERROR("failed to start service\n"); rc = PTR_ERR(ldlm_state->ldlm_cancel_service); @@ -3239,12 +3179,10 @@ static int ldlm_cleanup(void) kset_unregister(ldlm_ns_kset); if (ldlm_svc_kset) kset_unregister(ldlm_svc_kset); - if (ldlm_kobj) { - sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); + if (ldlm_kobj) kobject_put(ldlm_kobj); - } - ldlm_debugfs_cleanup(); + ldlm_proc_cleanup(); #ifdef HAVE_SERVER_SUPPORT if (expired_lock_thread_state != ELT_STOPPED) { @@ -3271,7 +3209,7 @@ int ldlm_init(void) ldlm_lock_slab = kmem_cache_create("ldlm_locks", sizeof(struct ldlm_lock), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); if (ldlm_lock_slab == NULL) goto out_resource; @@ -3287,30 +3225,11 @@ int ldlm_init(void) if (ldlm_interval_tree_slab == NULL) goto out_interval; -#ifdef HAVE_SERVER_SUPPORT - ldlm_inodebits_slab = kmem_cache_create("ldlm_ibits_node", - sizeof(struct ldlm_ibits_node), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (ldlm_inodebits_slab == NULL) - goto out_interval_tree; - - ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem", - sizeof(struct ldlm_glimpse_work), - 0, 0, NULL); - if (ldlm_glimpse_work_kmem == NULL) - goto out_inodebits; -#endif - #if LUSTRE_TRACKS_LOCK_EXP_REFS 
class_export_dump_hook = ldlm_dump_export_locks; #endif return 0; -#ifdef HAVE_SERVER_SUPPORT -out_inodebits: - kmem_cache_destroy(ldlm_inodebits_slab); -out_interval_tree: - kmem_cache_destroy(ldlm_interval_tree_slab); -#endif + out_interval: kmem_cache_destroy(ldlm_interval_slab); out_lock: @@ -3326,17 +3245,11 @@ void ldlm_exit(void) if (ldlm_refcount) CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); kmem_cache_destroy(ldlm_resource_slab); - /* - * ldlm_lock_put() use RCU to call ldlm_lock_free, so need call - * rcu_barrier() to wait all outstanding RCU callbacks to complete, - * so that ldlm_lock_free() get a chance to be called. - */ - rcu_barrier(); + /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * synchronize_rcu() to wait a grace period elapsed, so that + * ldlm_lock_free() get a chance to be called. */ + synchronize_rcu(); kmem_cache_destroy(ldlm_lock_slab); kmem_cache_destroy(ldlm_interval_slab); kmem_cache_destroy(ldlm_interval_tree_slab); -#ifdef HAVE_SERVER_SUPPORT - kmem_cache_destroy(ldlm_inodebits_slab); - kmem_cache_destroy(ldlm_glimpse_work_kmem); -#endif } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c index 6407fd20884f8..6453cabf1921f 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -129,14 +129,14 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? 
- NULL : work_list; + struct list_head rpc_list; int rc; ENTRY; - LASSERT(!ldlm_is_granted(lock)); + LASSERT(lock->l_granted_mode != lock->l_req_mode); check_res_locked(res); - *err = ELDLM_OK; + LASSERT(list_empty(&res->lr_converting)); + INIT_LIST_HEAD(&rpc_list); if (intention == LDLM_PROCESS_RESCAN) { LASSERT(work_list != NULL); @@ -148,19 +148,31 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, RETURN(LDLM_ITER_STOP); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, grant_work); + ldlm_grant_lock(lock, work_list); RETURN(LDLM_ITER_CONTINUE); } - rc = ldlm_plain_compat_queue(&res->lr_granted, lock, work_list); - rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, work_list); - - if (rc == 2) { + LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || + (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); + restart: + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, &rpc_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, &rpc_list); + + if (rc != 2) { + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); + if (rc == -ERESTART) + GOTO(restart, rc); + *err = rc; + } else { ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, grant_work); + ldlm_grant_lock(lock, work_list); + rc = 0; } - RETURN(LDLM_ITER_CONTINUE); + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); } #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c index 0a423d5615b5b..2afed77ea5f70 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -93,8 +93,7 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include -#include +#include #include #include #include @@ -498,14 +497,22 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) ldlm_cli_pool_pop_slv(pl); spin_unlock(&pl->pl_lock); + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) + GOTO(out, ret = 0); + /* * In the time of canceling locks on client we do not need to maintain * sharp timing, we only want to cancel locks asap according to new SLV. * It may be called when SLV has changed much, this is why we do not * take into account pl->pl_recalc_time here. */ - ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, 0); + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, + LDLM_LRU_FLAG_LRUR); +out: spin_lock(&pl->pl_lock); /* * Time of LRU resizing might be longer than period, @@ -549,7 +556,7 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, if (nr == 0) return (unused / 100) * sysctl_vfs_cache_pressure; else - return ldlm_cancel_lru(ns, nr, LCF_ASYNC, 0); + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_LRU_FLAG_SHRINK); } static struct ldlm_pool_ops ldlm_srv_pool_ops = { @@ -567,7 +574,7 @@ static struct ldlm_pool_ops ldlm_cli_pool_ops = { * Pool recalc wrapper. Will call either client or server pool recalc callback * depending what pool \a pl is used. 
*/ -time64_t ldlm_pool_recalc(struct ldlm_pool *pl) +int ldlm_pool_recalc(struct ldlm_pool *pl) { time64_t recalc_interval_sec; int count; @@ -687,8 +694,7 @@ static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) granted, limit); return 0; } - -LDEBUGFS_SEQ_FOPS_RO(lprocfs_pool_state); +LPROC_SEQ_FOPS_RO(lprocfs_pool_state); static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -772,11 +778,11 @@ static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) return err; } -static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) +static int ldlm_pool_proc_init(struct ldlm_pool *pl) { struct ldlm_namespace *ns = ldlm_pl2ns(pl); - struct dentry *debugfs_ns_parent; - struct ldebugfs_vars pool_vars[2]; + struct proc_dir_entry *parent_ns_proc; + struct lprocfs_vars pool_vars[2]; char *var_name = NULL; int rc = 0; ENTRY; @@ -785,18 +791,18 @@ static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) if (!var_name) RETURN(-ENOMEM); - debugfs_ns_parent = ns->ns_debugfs_entry; - if (IS_ERR_OR_NULL(debugfs_ns_parent)) { - CERROR("%s: debugfs entry is not initialized\n", + parent_ns_proc = ns->ns_proc_dir_entry; + if (parent_ns_proc == NULL) { + CERROR("%s: proc entry is not initialized\n", ldlm_ns_name(ns)); GOTO(out_free_name, rc = -EINVAL); } - pl->pl_debugfs_entry = ldebugfs_register("pool", debugfs_ns_parent, - NULL, NULL); - if (IS_ERR(pl->pl_debugfs_entry)) { - rc = PTR_ERR(pl->pl_debugfs_entry); - pl->pl_debugfs_entry = NULL; - CERROR("%s: cannot create 'pool' debugfs entry: rc = %d\n", + pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, + NULL, NULL); + if (IS_ERR(pl->pl_proc_dir)) { + rc = PTR_ERR(pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + CERROR("%s: cannot create 'pool' proc entry: rc = %d\n", ldlm_ns_name(ns), rc); GOTO(out_free_name, rc); } @@ -805,7 +811,7 @@ static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) memset(pool_vars, 0, sizeof(pool_vars)); pool_vars[0].name = var_name; - ldlm_add_var(&pool_vars[0], pl->pl_debugfs_entry, "state", pl, + ldlm_add_var(&pool_vars[0], pl->pl_proc_dir, "state", pl, &lprocfs_pool_state_fops); pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - @@ -846,8 +852,7 @@ static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "recalc_timing", "sec"); - rc = ldebugfs_register_stats(pl->pl_debugfs_entry, "stats", - pl->pl_stats); + rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); EXIT; out_free_name: @@ -861,15 +866,15 @@ static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) wait_for_completion(&pl->pl_kobj_unregister); } -static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) { if (pl->pl_stats != NULL) { lprocfs_free_stats(&pl->pl_stats); pl->pl_stats = NULL; } - if (pl->pl_debugfs_entry != NULL) { - ldebugfs_remove(&pl->pl_debugfs_entry); - pl->pl_debugfs_entry = NULL; + if (pl->pl_proc_dir != NULL) { + lprocfs_remove(&pl->pl_proc_dir); + pl->pl_proc_dir = NULL; } } @@ -903,7 +908,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; } pl->pl_client_lock_volume = 0; - rc = ldlm_pool_debugfs_init(pl); + rc = ldlm_pool_proc_init(pl); if (rc) RETURN(rc); @@ -920,7 +925,7 @@ void ldlm_pool_fini(struct ldlm_pool *pl) { ENTRY; ldlm_pool_sysfs_fini(pl); - ldlm_pool_debugfs_fini(pl); + ldlm_pool_proc_fini(pl); /* * Pool should not be used after 
this point. We can't free it here as @@ -1065,8 +1070,10 @@ __u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) return atomic_read(&pl->pl_lock_volume_factor); } +static struct ptlrpc_thread *ldlm_pools_thread; static struct shrinker *ldlm_pools_srv_shrinker; static struct shrinker *ldlm_pools_cli_shrinker; +static struct completion ldlm_pools_comp; /* * count locks from all namespaces (if possible). Returns number of @@ -1234,35 +1241,108 @@ static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) #endif /* HAVE_SHRINKER_COUNT */ -static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) +int ldlm_pools_recalc(enum ldlm_side client) { + unsigned long nr_l = 0, nr_p = 0, l; struct ldlm_namespace *ns; struct ldlm_namespace *ns_old = NULL; + int nr, equal = 0; /* seconds of sleep if no active namespaces */ - time64_t delay = side == LDLM_NAMESPACE_SERVER ? - LDLM_POOL_SRV_DEF_RECALC_PERIOD : - LDLM_POOL_CLI_DEF_RECALC_PERIOD; - int nr; - - /* Recalc at least ldlm_namespace_nr(side) namespaces. */ - for (nr = ldlm_namespace_nr_read(side); nr > 0; nr--) { - int skip; + int time = client ? LDLM_POOL_CLI_DEF_RECALC_PERIOD : + LDLM_POOL_SRV_DEF_RECALC_PERIOD; + + /* + * No need to setup pool limit for client pools. + */ + if (client == LDLM_NAMESPACE_SERVER) { /* - * Lock the list, get first @ns in the list, getref, move it - * to the tail, unlock and call pool recalc. This way we avoid - * calling recalc under @ns lock, which is really good as we - * get rid of potential deadlock on side nodes when canceling - * locks synchronously. + * Check all modest namespaces first. */ - mutex_lock(ldlm_namespace_lock(side)); - if (list_empty(ldlm_namespace_list(side))) { - mutex_unlock(ldlm_namespace_lock(side)); + mutex_lock(ldlm_namespace_lock(client)); + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure that modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("\"Modest\" pools eat out 2/3 of server locks " + "limit (%lu of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* + * The rest is given to greedy namespaces. + */ + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(client); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(client) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(client)); + } + + /* + * Recalc at least ldlm_namespace_nr(client) namespaces. + */ + for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. 
This way we avoid + * calling recalc under @ns lock what is really good as we get + * rid of potential deadlock on client nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); break; } - ns = ldlm_namespace_first_locked(side); + ns = ldlm_namespace_first_locked(client); if (ns_old == ns) { /* Full pass complete */ - mutex_unlock(ldlm_namespace_lock(side)); + mutex_unlock(ldlm_namespace_lock(client)); break; } @@ -1277,8 +1357,8 @@ static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) * there). */ if (ldlm_ns_empty(ns)) { - ldlm_namespace_move_to_inactive_locked(ns, side); - mutex_unlock(ldlm_namespace_lock(side)); + ldlm_namespace_move_to_inactive_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); continue; } @@ -1298,118 +1378,144 @@ static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) } spin_unlock(&ns->ns_lock); - ldlm_namespace_move_to_active_locked(ns, side); - mutex_unlock(ldlm_namespace_lock(side)); + ldlm_namespace_move_to_active_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); /* * After setup is done - recalc the pool. */ if (!skip) { - delay = min(delay, ldlm_pool_recalc(&ns->ns_pool)); + int ttime = ldlm_pool_recalc(&ns->ns_pool); + + if (ttime < time) + time = ttime; + ldlm_namespace_put(ns); } - } + } - return delay; -} + /* Wake up the blocking threads from time to time. */ + ldlm_bl_thread_wakeup(); -static void ldlm_pools_recalc_task(struct work_struct *ws); -static DECLARE_DELAYED_WORK(ldlm_pools_recalc_work, ldlm_pools_recalc_task); + return time; +} -static void ldlm_pools_recalc_task(struct work_struct *ws) +static int ldlm_pools_thread_main(void *arg) { - /* seconds of sleep if no active namespaces */ - time64_t delay; -#ifdef HAVE_SERVER_SUPPORT - struct ldlm_namespace *ns; - unsigned long nr_l = 0, nr_p = 0, l; - int equal = 0; + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + int s_time, c_time; + ENTRY; - /* Check all modest namespaces first. */ - mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); - list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), - ns_list_chain) { - if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) - continue; + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", + "ldlm_poold", current_pid()); - l = ldlm_pool_granted(&ns->ns_pool); - if (l == 0) - l = 1; + while (1) { + struct l_wait_info lwi; /* - * Set the modest pools limit equal to their avg granted - * locks + ~6%. + * Recal all pools on this tick. */ - l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); - ldlm_pool_setup(&ns->ns_pool, l); - nr_l += l; - nr_p++; - } + s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); + c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); - /* - * Make sure than modest namespaces did not eat more that 2/3 - * of limit. - */ - if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { - CWARN("'Modest' pools eat out 2/3 of server locks " - "limit (%lu of %lu). This means that you have too " - "many clients for this amount of server RAM. " - "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); - equal = 1; - } + /* + * Wait until the next check time, or until we're + * stopped. 
+ */ + lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + else + thread_test_and_clear_flags(thread, SVC_EVENT); + } - /* The rest is given to greedy namespaces. */ - list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), - ns_list_chain) { - if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) - continue; + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); - if (equal) { - /* - * In the case 2/3 locks are eaten out by - * modest pools, we re-setup equal limit - * for _all_ pools. - */ - l = LDLM_POOL_HOST_L / - ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER); - } else { - /* - * All the rest of greedy pools will have - * all locks in equal parts. - */ - l = (LDLM_POOL_HOST_L - nr_l) / - (ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER) - - nr_p); - } - ldlm_pool_setup(&ns->ns_pool, l); + CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", + "ldlm_poold", current_pid()); + + complete_and_exit(&ldlm_pools_comp, 0); +} + +static int ldlm_pools_thread_start(void) +{ + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + ENTRY; + + if (ldlm_pools_thread != NULL) + RETURN(-EALREADY); + + OBD_ALLOC_PTR(ldlm_pools_thread); + if (ldlm_pools_thread == NULL) + RETURN(-ENOMEM); + + init_completion(&ldlm_pools_comp); + init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + + task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, + "ldlm_poold"); + if (IS_ERR(task)) { + CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); + OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); + ldlm_pools_thread = NULL; + RETURN(PTR_ERR(task)); } - mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + l_wait_event(ldlm_pools_thread->t_ctl_waitq, + thread_is_running(ldlm_pools_thread), &lwi); + RETURN(0); +} - delay = min(ldlm_pools_recalc_delay(LDLM_NAMESPACE_SERVER), - ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT)); -#else /* !HAVE_SERVER_SUPPORT */ - delay = ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT); -#endif /* HAVE_SERVER_SUPPORT */ +static void ldlm_pools_thread_stop(void) +{ + ENTRY; - /* Wake up the blocking threads from time to time. */ - ldlm_bl_thread_wakeup(); + if (ldlm_pools_thread == NULL) { + EXIT; + return; + } + + thread_set_flags(ldlm_pools_thread, SVC_STOPPING); + wake_up(&ldlm_pools_thread->t_ctl_waitq); - schedule_delayed_work(&ldlm_pools_recalc_work, cfs_time_seconds(delay)); + /* + * Make sure that pools thread is finished before freeing @thread. + * This fixes possible race and oops due to accessing freed memory + * in pools thread. 
+ */ + wait_for_completion(&ldlm_pools_comp); + OBD_FREE_PTR(ldlm_pools_thread); + ldlm_pools_thread = NULL; + EXIT; } int ldlm_pools_init(void) { + int rc; DEF_SHRINKER_VAR(shsvar, ldlm_pools_srv_shrink, ldlm_pools_srv_count, ldlm_pools_srv_scan); DEF_SHRINKER_VAR(shcvar, ldlm_pools_cli_shrink, ldlm_pools_cli_count, ldlm_pools_cli_scan); + ENTRY; - schedule_delayed_work(&ldlm_pools_recalc_work, - LDLM_POOL_CLI_DEF_RECALC_PERIOD); - ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS, &shsvar); - ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS, &shcvar); - - return 0; + rc = ldlm_pools_thread_start(); + if (rc == 0) { + ldlm_pools_srv_shrinker = + set_shrinker(DEFAULT_SEEKS, &shsvar); + ldlm_pools_cli_shrinker = + set_shrinker(DEFAULT_SEEKS, &shcvar); + } + RETURN(rc); } void ldlm_pools_fini(void) @@ -1422,7 +1528,7 @@ void ldlm_pools_fini(void) remove_shrinker(ldlm_pools_cli_shrinker); ldlm_pools_cli_shrinker = NULL; } - cancel_delayed_work_sync(&ldlm_pools_recalc_work); + ldlm_pools_thread_stop(); } #else /* !HAVE_LRU_RESIZE_SUPPORT */ @@ -1431,7 +1537,7 @@ int ldlm_pool_setup(struct ldlm_pool *pl, int limit) return 0; } -time64_t ldlm_pool_recalc(struct ldlm_pool *pl) +int ldlm_pool_recalc(struct ldlm_pool *pl) { return 0; } @@ -1508,4 +1614,8 @@ void ldlm_pools_fini(void) return; } +int ldlm_pools_recalc(enum ldlm_side client) +{ + return 0; +} #endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c index 0bc4df685525c..d15cff5fb27b6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,7 +57,8 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include +#include + #include #include #include @@ -67,7 +68,6 @@ unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; module_param(ldlm_enqueue_min, uint, 0644); MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); -EXPORT_SYMBOL(ldlm_enqueue_min); /* in client side, whether the cached locks will be canceled before replay */ unsigned int ldlm_cancel_unused_locks_before_replay = 1; @@ -121,16 +121,16 @@ int ldlm_expired_completion_wait(void *data) ENTRY; if (lock->l_conn_export == NULL) { - static time64_t next_dump, last_dump; + static cfs_time_t next_dump = 0, last_dump = 0; LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); " "not entering recovery in server code, just going back to sleep", (s64)lock->l_activity, (s64)(ktime_get_real_seconds() - lock->l_activity)); - if (ktime_get_seconds() > next_dump) { + if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; - next_dump = ktime_get_seconds() + 300; + next_dump = cfs_time_shift(300); ldlm_namespace_dump(D_DLMTRACE, ldlm_lock_to_ns(lock)); if (last_dump == 0) @@ -150,19 +150,6 @@ int ldlm_expired_completion_wait(void *data) RETURN(0); } -int is_granted_or_cancelled_nolock(struct ldlm_lock *lock) -{ - int ret = 0; - - check_res_locked(lock->l_resource); - if (ldlm_is_granted(lock) && !ldlm_is_cp_reqd(lock)) - ret = 1; - else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) - ret = 1; - return ret; -} -EXPORT_SYMBOL(is_granted_or_cancelled_nolock); - /** * Calculate the Completion timeout (covering enqueue, BL AST, data flush, * lock cancel, and their replies). Used for lock completion timeout on the @@ -175,9 +162,9 @@ EXPORT_SYMBOL(is_granted_or_cancelled_nolock); /* We use the same basis for both server side and client side functions from a single node. */ -static time64_t ldlm_cp_timeout(struct ldlm_lock *lock) +static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) { - time64_t timeout; + unsigned int timeout; if (AT_OFF) return obd_timeout; @@ -186,7 +173,7 @@ static time64_t ldlm_cp_timeout(struct ldlm_lock *lock) * lock from another client. Server will evict the other client if it * doesn't respond reasonably, and then give us the lock. */ timeout = at_get(ldlm_lock_to_ns_at(lock)); - return max(3 * timeout, (time64_t) ldlm_enqueue_min); + return max(3 * timeout, ldlm_enqueue_min); } /** @@ -234,9 +221,9 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) RETURN(ldlm_completion_tail(lock, data)); } - LDLM_DEBUG(lock, - "client-side enqueue returned a blocked lock, going forward"); - ldlm_reprocess_all(lock->l_resource, NULL); + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "going forward"); + ldlm_reprocess_all(lock->l_resource); RETURN(0); } EXPORT_SYMBOL(ldlm_completion_ast_async); @@ -256,6 +243,8 @@ EXPORT_SYMBOL(ldlm_completion_ast_async); * * - to force all locks when resource is destroyed (cleanup_resource()); * + * - during lock conversion (not used currently). + * * If lock is not granted in the first case, this function waits until second * or penultimate cases happen in some other thread. 
* @@ -267,7 +256,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) struct obd_device *obd; struct obd_import *imp = NULL; struct l_wait_info lwi; - time64_t timeout; + __u32 timeout; int rc = 0; ENTRY; @@ -296,7 +285,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) timeout = ldlm_cp_timeout(lock); lwd.lwd_lock = lock; - lock->l_activity = ktime_get_real_seconds(); + lock->l_activity = cfs_time_current_sec(); if (ldlm_is_no_timeout(lock)) { LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); @@ -445,8 +434,7 @@ int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) /** * Enqueue a local lock (typically on a server). */ -int ldlm_cli_enqueue_local(const struct lu_env *env, - struct ldlm_namespace *ns, +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_type type, union ldlm_policy_data *policy, enum ldlm_mode mode, __u64 *flags, @@ -479,7 +467,6 @@ int ldlm_cli_enqueue_local(const struct lu_env *env, err = ldlm_lvbo_init(lock->l_resource); if (err < 0) { LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); - ldlm_lock_destroy_nolock(lock); GOTO(out, err); } @@ -504,15 +491,15 @@ int ldlm_cli_enqueue_local(const struct lu_env *env, lock->l_req_extent = policy->l_extent; } - err = ldlm_lock_enqueue(env, ns, &lock, policy, flags); - if (unlikely(err != ELDLM_OK)) - GOTO(out, err); + err = ldlm_lock_enqueue(ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); - if (policy != NULL) - *policy = lock->l_policy_data; + if (policy != NULL) + *policy = lock->l_policy_data; - if (lock->l_completion_ast) - lock->l_completion_ast(lock, *flags, NULL); + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); EXIT; @@ -530,8 +517,9 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns, /* Set a flag to prevent us from sending a CANCEL (bug 407) */ lock_res_and_lock(lock); - /* Check that lock is not granted or failed, we might race. */ - if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) { + /* Check that lock is not granted or failed, we might race. */ + if ((lock->l_req_mode != lock->l_granted_mode) && + !ldlm_is_failed(lock)) { /* Make sure that this lock will not be found by raced * bl_ast and -EINVAL reply is sent to server anyways. * b=17645*/ @@ -578,16 +566,12 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, __u32 lvb_len, const struct lustre_handle *lockh, int rc) { - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - const struct lu_env *env = NULL; - int is_replay = *flags & LDLM_FL_REPLAY; - struct ldlm_lock *lock; - struct ldlm_reply *reply; - int cleanup_phase = 1; - ENTRY; - - if (req && req->rq_svc_thread) - env = req->rq_svc_thread->t_env; + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + ENTRY; lock = ldlm_handle2lock(lockh); /* ldlm_cli_enqueue is holding a reference on this lock. 
*/ @@ -696,27 +680,26 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, &lock->l_policy_data); } - if (type != LDLM_PLAIN) - LDLM_DEBUG(lock,"client-side enqueue, new policy data"); - } + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); + } if ((*flags) & LDLM_FL_AST_SENT) { - lock_res_and_lock(lock); - ldlm_bl_desc2lock(&reply->lock_desc, lock); + lock_res_and_lock(lock); lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; - unlock_res_and_lock(lock); - LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); - } + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } - /* If the lock has already been granted by a completion AST, don't - * clobber the LVB with an older one. */ + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ if (lvb_len > 0) { /* We must lock or a racing completion might update lvb without * letting us know and we'll clobber the correct value. * Cannot unlock after the check either, a that still leaves * a tiny window for completion to get in */ lock_res_and_lock(lock); - if (!ldlm_is_granted(lock)) + if (lock->l_req_mode != lock->l_granted_mode) rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, lock->l_lvb_data, lvb_len); unlock_res_and_lock(lock); @@ -726,16 +709,16 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, } } - if (!is_replay) { - rc = ldlm_lock_enqueue(env, ns, &lock, NULL, flags); - if (lock->l_completion_ast != NULL) { - int err = lock->l_completion_ast(lock, *flags, NULL); - if (!rc) - rc = err; - if (rc) - cleanup_phase = 1; - } - } + if (!is_replay) { + rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } if (lvb_len > 0 && lvb != NULL) { /* Copy the LVB here, and not earlier, because the completion @@ -807,7 +790,8 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, struct req_capsule *pill = &req->rq_pill; struct ldlm_request *dlm = NULL; struct list_head head = LIST_HEAD_INIT(head); - int avail, to_free = 0, pack = 0; + enum ldlm_lru_flags lru_flags; + int avail, to_free, pack = 0; int rc; ENTRY; @@ -818,10 +802,10 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, req_capsule_filled_sizes(pill, RCL_CLIENT); avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); - /* If we have reached the limit, free +1 slot for the new one */ - if (!ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE && - ns->ns_nr_unused >= ns->ns_max_unused) - to_free = 1; + lru_flags = LDLM_LRU_FLAG_NO_WAIT | (ns_connect_lru_resize(ns) ? + LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED); + to_free = !ns_connect_lru_resize(ns) && + opc == LDLM_ENQUEUE ? 1 : 0; /* Cancel LRU locks here _only_ if the server supports * EARLY_CANCEL. 
Otherwise we have to send extra CANCEL @@ -829,7 +813,7 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, if (avail > count) count += ldlm_cancel_lru_local(ns, cancels, to_free, avail - count, 0, - LDLM_LRU_FLAG_NO_WAIT); + lru_flags); if (avail > count) pack = count; else @@ -943,10 +927,6 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lvb_len, lvb_type); if (IS_ERR(lock)) RETURN(PTR_ERR(lock)); - - if (einfo->ei_cb_created) - einfo->ei_cb_created(lock); - /* for the local lock, add the reference */ ldlm_lock_addref_internal(lock, einfo->ei_mode); ldlm_lock2handle(lock, lockh); @@ -968,7 +948,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lock->l_export = NULL; lock->l_blocking_ast = einfo->ei_cb_bl; lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); - lock->l_activity = ktime_get_real_seconds(); + lock->l_activity = cfs_time_current_sec(); /* lock not sent to server yet */ if (reqp == NULL || *reqp == NULL) { @@ -992,42 +972,12 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); } - if (*flags & LDLM_FL_NDELAY) { - DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n"); - req->rq_no_resend = req->rq_no_delay = 1; - /* probably set a shorter timeout value and handle ETIMEDOUT - * in osc_lock_upcall() correctly */ - /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ - } - /* Dump lock data into the request buffer */ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); ldlm_lock2desc(lock, &body->lock_desc); body->lock_flags = ldlm_flags_to_wire(*flags); body->lock_handle[0] = *lockh; - /* extended LDLM opcodes in client stats */ - if (exp->exp_obd->obd_svc_stats != NULL) { - bool glimpse = *flags & LDLM_FL_HAS_INTENT; - - /* OST glimpse has no intent buffer */ - if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT, - RCL_CLIENT)) { - struct ldlm_intent *it; - - it = req_capsule_client_get(&req->rq_pill, - &RMF_LDLM_INTENT); - glimpse = (it && (it->opc == IT_GLIMPSE)); - } - - if (!glimpse) - ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats); - else - lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, - PTLRPC_LAST_CNTR + - LDLM_GLIMPSE_ENQUEUE); - } - if (async) { LASSERT(reqp != NULL); RETURN(0); @@ -1058,78 +1008,103 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, } EXPORT_SYMBOL(ldlm_cli_enqueue); -/** - * Client-side IBITS lock convert. - * - * Inform server that lock has been converted instead of canceling. - * Server finishes convert on own side and does reprocess to grant - * all related waiting locks. - * - * Since convert means only ibits downgrading, client doesn't need to - * wait for server reply to finish local converting process so this request - * is made asynchronous. - * - */ -int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits) +static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, + __u32 *flags) { - struct ldlm_request *body; - struct ptlrpc_request *req; - struct obd_export *exp = lock->l_conn_export; - - ENTRY; - - LASSERT(exp != NULL); + struct ldlm_resource *res; + int rc; + ENTRY; + if (ns_is_client(ldlm_lock_to_ns(lock))) { + CERROR("Trying to cancel local lock\n"); + LBUG(); + } + LDLM_DEBUG(lock, "client-side local convert"); - /* this is better to check earlier and it is done so already, - * but this check is kept too as final one to issue an error - * if any new code will miss such check. 
- */ - if (!exp_connect_lock_convert(exp)) { - LDLM_ERROR(lock, "server doesn't support lock convert\n"); - RETURN(-EPROTO); - } + res = ldlm_lock_convert(lock, new_mode, flags); + if (res) { + ldlm_reprocess_all(res); + rc = 0; + } else { + rc = LUSTRE_EDEADLK; + } + LDLM_DEBUG(lock, "client-side local convert handler END"); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} - if (lock->l_resource->lr_type != LDLM_IBITS) { - LDLM_ERROR(lock, "convert works with IBITS locks only."); - RETURN(-EINVAL); - } +/* FIXME: one of ldlm_cli_convert or the server side should reject attempted + * conversion of locks which are on the waiting or converting queue */ +/* Caller of this code is supposed to take care of lock readers/writers + accounting */ +int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, + __u32 *flags) +{ + struct ldlm_request *body; + struct ldlm_reply *reply; + struct ldlm_lock *lock; + struct ldlm_resource *res; + struct ptlrpc_request *req; + int rc; + ENTRY; - LDLM_DEBUG(lock, "client-side convert"); + lock = ldlm_handle2lock(lockh); + if (!lock) { + LBUG(); + RETURN(-EINVAL); + } + *flags = 0; - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, - LDLM_CONVERT); - if (req == NULL) - RETURN(-ENOMEM); + if (lock->l_conn_export == NULL) + RETURN(ldlm_cli_convert_local(lock, new_mode, flags)); - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; + LDLM_DEBUG(lock, "client-side convert"); - body->lock_desc.l_req_mode = lock->l_req_mode; - body->lock_desc.l_granted_mode = lock->l_granted_mode; + req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) { + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } - body->lock_desc.l_policy_data.l_inodebits.bits = new_bits; - body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + body->lock_desc.l_req_mode = new_mode; body->lock_flags = ldlm_flags_to_wire(*flags); - body->lock_count = 1; - ptlrpc_request_set_replen(req); - - /* - * Use cancel portals for convert as well as high-priority handling. - */ - req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; - req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; - ptlrpc_at_set_req_timeout(req); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc != ELDLM_OK) + GOTO(out, rc); - if (exp->exp_obd->obd_svc_stats != NULL) - lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, - LDLM_CONVERT - LDLM_FIRST_OPC); + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); - ptlrpcd_add_req(req); - RETURN(0); + if (req->rq_status) + GOTO(out, rc = req->rq_status); + + res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); + if (res != NULL) { + ldlm_reprocess_all(res); + /* Go to sleep until the lock is granted. */ + /* FIXME: or cancelled. 
*/ + if (lock->l_completion_ast) { + rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, + NULL); + if (rc) + GOTO(out, rc); + } + } else { + rc = LUSTRE_EDEADLK; + } + EXIT; + out: + LDLM_LOCK_PUT(lock); + ptlrpc_req_finished(req); + return rc; } /** @@ -1147,12 +1122,9 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) if (lock->l_conn_export) { bool local_only; - LDLM_DEBUG(lock, "client-side cancel"); - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, - cfs_fail_val); - - /* Set this flag to prevent others from getting new references*/ - lock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side cancel"); + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); ldlm_set_cbpending(lock); local_only = !!(lock->l_flags & (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); @@ -1161,23 +1133,23 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) LDLM_FL_BL_AST : LDLM_FL_CANCELING; unlock_res_and_lock(lock); - if (local_only) { - CDEBUG(D_DLMTRACE, - "not sending request (at caller's instruction)\n"); - rc = LDLM_FL_LOCAL_ONLY; - } - ldlm_lock_cancel(lock); - } else { - if (ns_is_client(ldlm_lock_to_ns(lock))) { - LDLM_ERROR(lock, "Trying to cancel local lock"); - LBUG(); - } - LDLM_DEBUG(lock, "server-side local cancel"); - ldlm_lock_cancel(lock); - ldlm_reprocess_all(lock->l_resource, lock); - } + if (local_only) { + CDEBUG(D_DLMTRACE, "not sending request (at caller's " + "instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource); + } - RETURN(rc); + RETURN(rc); } /** @@ -1375,27 +1347,6 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) RETURN(0); } -int ldlm_cli_convert(struct ldlm_lock *lock, - enum ldlm_cancel_flags cancel_flags) -{ - int rc = -EINVAL; - - LASSERT(!lock->l_readers && !lock->l_writers); - LDLM_DEBUG(lock, "client lock convert START"); - - if (lock->l_resource->lr_type == LDLM_IBITS) { - lock_res_and_lock(lock); - do { - rc = ldlm_cli_inodebits_convert(lock, cancel_flags); - } while (rc == -EAGAIN); - unlock_res_and_lock(lock); - } - - LDLM_DEBUG(lock, "client lock convert END"); - RETURN(rc); -} -EXPORT_SYMBOL(ldlm_cli_convert); - /** * Client side lock cancel. * @@ -1405,12 +1356,12 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, enum ldlm_cancel_flags cancel_flags) { struct obd_export *exp; + enum ldlm_lru_flags lru_flags; int avail, count = 1; __u64 rc = 0; struct ldlm_namespace *ns; struct ldlm_lock *lock; struct list_head cancels = LIST_HEAD_INIT(cancels); - ENTRY; lock = ldlm_handle2lock_long(lockh, 0); @@ -1420,8 +1371,6 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, } lock_res_and_lock(lock); - LASSERT(!ldlm_is_converting(lock)); - /* Lock is being canceled and the caller doesn't want to wait */ if (ldlm_is_canceling(lock)) { if (cancel_flags & LCF_ASYNC) { @@ -1458,8 +1407,10 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, LASSERT(avail > 0); ns = ldlm_lock_to_ns(lock); + lru_flags = ns_connect_lru_resize(ns) ? 
+ LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED; count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, - LCF_BL_AST, 0); + LCF_BL_AST, lru_flags); } ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); RETURN(0); @@ -1522,11 +1473,11 @@ int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, */ static enum ldlm_policy_res ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int added, int min) + int unused, int added, int count) { enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK; - /* don't check @added & @min since we want to process all locks + /* don't check added & count since we want to process all locks * from unused list. * It's fine to not take lock to access lock->l_resource since * the lock has already been granted so it won't change. */ @@ -1535,7 +1486,7 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, case LDLM_IBITS: if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) break; - /* fallthrough */ + /* Fall through */ default: result = LDLM_POLICY_SKIP_LOCK; break; @@ -1546,8 +1497,8 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, /** * Callback function for LRU-resize policy. Decides whether to keep - * \a lock in LRU for \a added in current scan and \a min number of locks - * to be preferably canceled. + * \a lock in LRU for current \a LRU size \a unused, added in current + * scan \a added and number of locks to be preferably canceled \a count. * * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning * @@ -1555,29 +1506,32 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, */ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int added, int min) + int unused, int added, + int count) { ktime_t cur = ktime_get(); struct ldlm_pool *pl = &ns->ns_pool; u64 slv, lvf, lv; s64 la; - if (added < min) - return LDLM_POLICY_CANCEL_LOCK; + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + if (count && added >= count) + return LDLM_POLICY_KEEP_LOCK; /* Despite of the LV, It doesn't make sense to keep the lock which * is unused for ns_max_age time. */ - if (ktime_after(cur, ktime_add(lock->l_last_used, ns->ns_max_age))) + if (ktime_after(ktime_get(), + ktime_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_CANCEL_LOCK; slv = ldlm_pool_get_slv(pl); lvf = ldlm_pool_get_lvf(pl); - la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)), - NSEC_PER_SEC); - lv = lvf * la * ns->ns_nr_unused; + la = ktime_to_ns(ktime_sub(cur, lock->l_last_used)) / NSEC_PER_SEC; + lv = lvf * la * unused; - /* Inform pool about current CLV to see it via debugfs. */ + /* Inform pool about current CLV to see it via proc. 
*/ ldlm_pool_set_clv(pl, lv); /* Stop when SLV is not yet come from server or lv is smaller than @@ -1591,21 +1545,42 @@ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, static enum ldlm_policy_res ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int added, int min) + int unused, int added, + int count) { enum ldlm_policy_res result; - result = ldlm_cancel_lrur_policy(ns, lock, added, min); + result = ldlm_cancel_lrur_policy(ns, lock, unused, added, count); if (result == LDLM_POLICY_KEEP_LOCK) return result; - return ldlm_cancel_no_wait_policy(ns, lock, added, min); + return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); +} + +/** + * Callback function for proc used policy. Makes decision whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current scan \a + * added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } /** - * Callback function for aged policy. Decides whether to keep - * \a lock in LRU for \a added in current scan and \a min number of locks - * to be preferably canceled. + * Callback function for aged policy. Makes decision whether to keep \a lock in + * LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. * * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning * @@ -1613,9 +1588,10 @@ ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, */ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int added, int min) + int unused, int added, + int count) { - if ((added >= min) && + if ((added >= count) && ktime_before(ktime_get(), ktime_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_KEEP_LOCK; @@ -1626,43 +1602,76 @@ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, static enum ldlm_policy_res ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int added, int min) + int unused, int added, int count) { enum ldlm_policy_res result; - result = ldlm_cancel_aged_policy(ns, lock, added, min); + result = ldlm_cancel_aged_policy(ns, lock, unused, added, count); if (result == LDLM_POLICY_KEEP_LOCK) return result; - return ldlm_cancel_no_wait_policy(ns, lock, added, min); + return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); +} + +/** + * Callback function for default policy. Makes decision whether to keep \a lock + * in LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static +enum ldlm_policy_res ldlm_cancel_default_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past count or have checked all + * locks in LRU. */ + return (added >= count) ? 
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } typedef enum ldlm_policy_res (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int added, int min); + int unused, int added, int count); static ldlm_cancel_lru_policy_t ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) { if (ns_connect_lru_resize(ns)) { - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_lrur_no_wait_policy; - else - return ldlm_cancel_lrur_policy; + if (lru_flags & LDLM_LRU_FLAG_SHRINK) + /* We kill passed number of old locks. */ + return ldlm_cancel_passed_policy; + if (lru_flags & LDLM_LRU_FLAG_LRUR) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; + } + if (lru_flags & LDLM_LRU_FLAG_PASSED) + return ldlm_cancel_passed_policy; } else { - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_aged_no_wait_policy; - else - return ldlm_cancel_aged_policy; + if (lru_flags & LDLM_LRU_FLAG_AGED) { + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; + } } + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_no_wait_policy; + + return ldlm_cancel_default_policy; } /** - * - Free space in LRU for \a min new locks, + * - Free space in LRU for \a count new locks, * redundant unused locks are canceled locally; * - also cancel locally unused aged locks; * - do not cancel more than \a max locks; - * - if some locks are cancelled, try to cancel at least \a batch locks * - GET the found locks and add them into the \a cancels list. * * A client lock can be added to the l_bl_ast list only when it is @@ -1673,22 +1682,30 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed * later without any special locking. * - * Locks are cancelled according to the LRU resize policy (SLV from server) - * if LRU resize is enabled; otherwise, the "aged policy" is used; - * - * LRU flags: + * Calling policies for enabled LRU resize: * ---------------------------------------- + * flags & LDLM_LRU_FLAG_LRUR - use LRU resize policy (SLV from server) to + * cancel not more than \a count locks; + * + * flags & LDLM_LRU_FLAG_PASSED - cancel \a count number of old locks (located + * at the beginning of LRU list); * - * flags & LDLM_LRU_FLAG_NO_WAIT - cancel locks w/o sending any RPCs or waiting - * for any outstanding RPC to complete. + * flags & LDLM_LRU_FLAG_SHRINK - cancel not more than \a count locks according + * to memory pressre policy function; + * + * flags & LDLM_LRU_FLAG_AGED - cancel \a count locks according to "aged policy" + * + * flags & LDLM_LRU_FLAG_NO_WAIT - cancel as many unused locks as possible + * (typically before replaying locks) w/o + * sending any RPCs or waiting for any + * outstanding RPC to complete. * * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for - * other read locks covering the same pages, just - * discard those pages. + * other read locks covering the same pages, just + * discard those pages. */ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, - struct list_head *cancels, - int min, int max, int batch, + struct list_head *cancels, int count, int max, enum ldlm_lru_flags lru_flags) { ldlm_cancel_lru_policy_t pf; @@ -1697,26 +1714,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, ENTRY; - /* - * Let only 1 thread to proceed. 
However, not for those which have the - * @max limit given (ELC), as LRU may be left not cleaned up in full. - */ - if (max == 0) { - if (test_and_set_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) - RETURN(0); - } else if (test_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) - RETURN(0); - - LASSERT(ergo(max, min <= max)); - /* No sense to give @batch for ELC */ - LASSERT(ergo(max, batch == 0)); - if (!ns_connect_lru_resize(ns)) - min = max_t(int, min, ns->ns_nr_unused - ns->ns_max_unused); - - /* If at least 1 lock is to be cancelled, cancel at least @batch locks */ - if (min && min < batch) - min = batch; + count += ns->ns_nr_unused - ns->ns_max_unused; pf = ldlm_cancel_lru_policy(ns, lru_flags); LASSERT(pf != NULL); @@ -1769,7 +1768,7 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, * old locks, but additionally choose them by * their weight. Big extent locks will stay in * the cache. */ - result = pf(ns, lock, added, min); + result = pf(ns, lock, ns->ns_nr_unused, added, count); if (result == LDLM_POLICY_KEEP_LOCK) { lu_ref_del(&lock->l_reference, __func__, current); LDLM_LOCK_RELEASE(lock); @@ -1778,6 +1777,7 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, if (result == LDLM_POLICY_SKIP_LOCK) { lu_ref_del(&lock->l_reference, __func__, current); + LDLM_LOCK_RELEASE(lock); if (no_wait) { spin_lock(&ns->ns_lock); if (!list_empty(&lock->l_lru) && @@ -1785,8 +1785,6 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, ns->ns_last_pos = &lock->l_lru; spin_unlock(&ns->ns_lock); } - - LDLM_LOCK_RELEASE(lock); continue; } @@ -1823,8 +1821,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) && - (lock->l_resource->lr_type == LDLM_EXTENT || - ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR) + lock->l_resource->lr_type == LDLM_EXTENT && + lock->l_granted_mode == LCK_PR) ldlm_set_discard_data(lock); /* We can't re-add to l_lru as it confuses the @@ -1838,25 +1836,18 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, unlock_res_and_lock(lock); lu_ref_del(&lock->l_reference, __FUNCTION__, current); added++; - /* Once a lock added, batch the requested amount */ - if (min == 0) - min = batch; } - - if (max == 0) - clear_bit(LDLM_LRU_CANCEL, &ns->ns_flags); - RETURN(added); } int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, - int min, int max, + int count, int max, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags) { int added; - added = ldlm_prepare_lru_list(ns, cancels, min, max, 0, lru_flags); + added = ldlm_prepare_lru_list(ns, cancels, count, max, lru_flags); if (added <= 0) return added; @@ -1864,14 +1855,14 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, } /** - * Cancel at least \a min locks from given namespace LRU. + * Cancel at least \a nr locks from given namespace LRU. * * When called with LCF_ASYNC the blocking callback will be handled * in a thread and this function will return after the thread has been * asked to call the callback. When called with LCF_ASYNC the blocking * callback will be performed in this function. */ -int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags) { @@ -1881,7 +1872,7 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, /* Just prepare the list of locks, do not actually cancel them yet. 
* Locks are cancelled later in a separate thread. */ - count = ldlm_prepare_lru_list(ns, &cancels, min, 0, 0, lru_flags); + count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, lru_flags); rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); if (rc == 0) RETURN(count); @@ -1903,50 +1894,47 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res, { struct ldlm_lock *lock; int count = 0; - ENTRY; lock_res(res); list_for_each_entry(lock, &res->lr_granted, l_res_link) { - if (opaque != NULL && lock->l_ast_data != opaque) { - LDLM_ERROR(lock, "data %p doesn't match opaque %p", - lock->l_ast_data, opaque); - continue; - } + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + //LBUG(); + continue; + } - if (lock->l_readers || lock->l_writers) - continue; + if (lock->l_readers || lock->l_writers) + continue; - /* - * If somebody is already doing CANCEL, or blocking AST came - * then skip this lock. - */ + /* If somebody is already doing CANCEL, or blocking AST came, + * skip this lock. */ if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) continue; - if (lockmode_compat(lock->l_granted_mode, mode)) - continue; + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; - /* If policy is given and this is IBITS lock, add to list only - * those locks that match by policy. - * Skip locks with DoM bit always to don't flush data. - */ - if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && - (!(lock->l_policy_data.l_inodebits.bits & - policy->l_inodebits.bits) || ldlm_has_dom(lock))) - continue; + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + !(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; /* See CBPENDING comment in ldlm_cancel_lru */ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | lock_flags; + LASSERT(list_empty(&lock->l_bl_ast)); list_add(&lock->l_bl_ast, cancels); - LDLM_LOCK_GET(lock); - count++; - } - unlock_res(res); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); - RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); } EXPORT_SYMBOL(ldlm_cancel_resource_local); @@ -2100,34 +2088,41 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, /* Lock iterators. 
*/ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, - void *closure) + void *closure) { struct list_head *tmp, *next; - struct ldlm_lock *lock; - int rc = LDLM_ITER_CONTINUE; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; - ENTRY; + ENTRY; - if (!res) - RETURN(LDLM_ITER_CONTINUE); + if (!res) + RETURN(LDLM_ITER_CONTINUE); - lock_res(res); + lock_res(res); list_for_each_safe(tmp, next, &res->lr_granted) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + + list_for_each_safe(tmp, next, &res->lr_converting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } list_for_each_safe(tmp, next, &res->lr_waiting) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } -out: - unlock_res(res); - RETURN(rc); + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + out: + unlock_res(res); + RETURN(rc); } struct iter_helper_data { @@ -2221,8 +2216,6 @@ static int replay_lock_interpret(const struct lu_env *env, ENTRY; atomic_dec(&req->rq_import->imp_replay_inflight); - wake_up(&req->rq_import->imp_replay_waitq); - if (rc != ELDLM_OK) GOTO(out, rc); @@ -2288,23 +2281,28 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) RETURN(0); } - /* - * If granted mode matches the requested mode, this lock is granted. - * - * If we haven't been granted anything and are on a resource list, - * then we're blocked/waiting. - * - * If we haven't been granted anything and we're NOT on a resource list, - * then we haven't got a reply yet and don't have a known disposition. - * This happens whenever a lock enqueue is the request that triggers - * recovery. - */ - if (ldlm_is_granted(lock)) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + /* + * If granted mode matches the requested mode, this lock is granted. + * + * If they differ, but we have a granted mode, then we were granted + * one mode and now want another: ergo, converting. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (lock->l_granted_mode == lock->l_req_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (lock->l_granted_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; else if (!list_empty(&lock->l_res_link)) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; - else - flags = LDLM_FL_REPLAY; + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, LDLM_ENQUEUE); @@ -2313,8 +2311,6 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) /* We're part of recovery, so don't wait for it. 
*/ req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; - /* If the state changed while we were prepared, don't wait */ - req->rq_no_delay = 1; body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); ldlm_lock2desc(lock, &body->lock_desc); @@ -2373,20 +2369,7 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) canceled, ldlm_ns_name(ns)); } -static int lock_can_replay(struct obd_import *imp) -{ - struct client_obd *cli = &imp->imp_obd->u.cli; - - CDEBUG(D_HA, "check lock replay limit, inflights = %u(%u)\n", - atomic_read(&imp->imp_replay_inflight) - 1, - cli->cl_max_rpcs_in_flight); - - /* +1 due to ldlm_lock_replay() increment */ - return atomic_read(&imp->imp_replay_inflight) < - 1 + min_t(u32, cli->cl_max_rpcs_in_flight, 8); -} - -int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) +int ldlm_replay_locks(struct obd_import *imp) { struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; struct list_head list = LIST_HEAD_INIT(list); @@ -2395,12 +2378,15 @@ int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) ENTRY; - LASSERT(atomic_read(&imp->imp_replay_inflight) == 1); + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); /* don't replay locks if import failed recovery */ if (imp->imp_vbr_failed) RETURN(0); + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + if (ldlm_cancel_unused_locks_before_replay) ldlm_cancel_unused_locks_for_replay(ns); @@ -2408,64 +2394,15 @@ int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) list_for_each_entry_safe(lock, next, &list, l_pending_chain) { list_del_init(&lock->l_pending_chain); - /* If we disconnected in the middle - cleanup and let - * reconnection to happen again. LU-14027 */ - if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) { + if (rc) { LDLM_LOCK_RELEASE(lock); - continue; + continue; /* or try to do the rest? 
*/ } rc = replay_one_lock(imp, lock); LDLM_LOCK_RELEASE(lock); - - if (rate_limit) - wait_event_idle_exclusive(imp->imp_replay_waitq, - lock_can_replay(imp)); } - RETURN(rc); -} - -/** - * Lock replay uses rate control and can sleep waiting so - * must be in separate thread from ptlrpcd itself - */ -static int ldlm_lock_replay_thread(void *data) -{ - struct obd_import *imp = data; - - unshare_fs_struct(); - - CDEBUG(D_HA, "lock replay thread %s to %s@%s\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - - __ldlm_replay_locks(imp, true); atomic_dec(&imp->imp_replay_inflight); - ptlrpc_import_recovery_state_machine(imp); - class_import_put(imp); - return 0; -} - -int ldlm_replay_locks(struct obd_import *imp) -{ - struct task_struct *task; - int rc = 0; - - class_import_get(imp); - /* ensure this doesn't fall to 0 before all have been queued */ - atomic_inc(&imp->imp_replay_inflight); - - task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CDEBUG(D_HA, "can't start lock replay thread: rc = %d\n", rc); - - /* run lock replay without rate control */ - rc = __ldlm_replay_locks(imp, false); - atomic_dec(&imp->imp_replay_inflight); - class_import_put(imp); - } - - return rc; + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 8b36f70af7f56..042633867837b 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,7 +43,6 @@ struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; struct kmem_cache *ldlm_interval_tree_slab; -struct kmem_cache *ldlm_inodebits_slab; int ldlm_srv_namespace_nr = 0; int ldlm_cli_namespace_nr = 0; @@ -59,45 +58,26 @@ LIST_HEAD(ldlm_cli_active_namespace_list); /* Client namespaces that don't have any locks in them */ LIST_HEAD(ldlm_cli_inactive_namespace_list); -static struct dentry *ldlm_debugfs_dir; -static struct dentry *ldlm_ns_debugfs_dir; -struct dentry *ldlm_svc_debugfs_dir; +static struct proc_dir_entry *ldlm_type_proc_dir; +static struct proc_dir_entry *ldlm_ns_proc_dir; +struct proc_dir_entry *ldlm_svc_proc_dir; /* during debug dump certain amount of granted locks for one resource to avoid * DDOS. 
*/ static unsigned int ldlm_dump_granted_max = 256; -static ssize_t ldebugfs_dump_ns_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +#ifdef CONFIG_PROC_FS +static ssize_t +lprocfs_dump_ns_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); RETURN(count); } +LPROC_SEQ_FOPS_WO_TYPE(ldlm, dump_ns); -LDEBUGFS_FOPS_WR_ONLY(ldlm, dump_ns); - -static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%u\n", *(unsigned int *)m->private); - return 0; -} - -static ssize_t -ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - - if (!count) - return 0; - - return kstrtouint_from_user(buffer, count, 0, - (unsigned int *)seq->private); -} - -LDEBUGFS_SEQ_FOPS(ldlm_rw_uint); +LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); #ifdef HAVE_SERVER_SUPPORT @@ -117,7 +97,7 @@ static ssize_t seq_watermark_write(struct file *file, bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; int rc; - rc = lprocfs_str_with_units_to_s64(buffer, count, &value, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, &value, 'M'); if (rc) { CERROR("Failed to set %s, rc = %d.\n", wm_low ? "lock_reclaim_threshold_mb" : "lock_limit_mb", @@ -164,7 +144,7 @@ static ssize_t seq_watermark_write(struct file *file, static int seq_watermark_open(struct inode *inode, struct file *file) { - return single_open(file, seq_watermark_show, inode->i_private); + return single_open(file, seq_watermark_show, PDE_DATA(inode)); } static const struct file_operations ldlm_watermark_fops = { @@ -185,7 +165,7 @@ static int seq_granted_show(struct seq_file *m, void *data) static int seq_granted_open(struct inode *inode, struct file *file) { - return single_open(file, seq_granted_show, inode->i_private); + return single_open(file, seq_granted_show, PDE_DATA(inode)); } static const struct file_operations ldlm_granted_fops = { @@ -198,62 +178,59 @@ static const struct file_operations ldlm_granted_fops = { #endif /* HAVE_SERVER_SUPPORT */ -static struct ldebugfs_vars ldlm_debugfs_list[] = { - { .name = "dump_namespaces", - .fops = &ldlm_dump_ns_fops, - .proc_mode = 0222 }, - { .name = "dump_granted_max", - .fops = &ldlm_rw_uint_fops, - .data = &ldlm_dump_granted_max }, -#ifdef HAVE_SERVER_SUPPORT - { .name = "lock_reclaim_threshold_mb", - .fops = &ldlm_watermark_fops, - .data = &ldlm_reclaim_threshold_mb }, - { .name = "lock_limit_mb", - .fops = &ldlm_watermark_fops, - .data = &ldlm_lock_limit_mb }, - { .name = "lock_granted_count", - .fops = &ldlm_granted_fops, - .data = &ldlm_granted_total }, -#endif - { NULL } -}; - -int ldlm_debugfs_setup(void) +int ldlm_proc_setup(void) { int rc; - + struct lprocfs_vars list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, +#ifdef HAVE_SERVER_SUPPORT + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, +#endif + { NULL }}; ENTRY; - ldlm_debugfs_dir = ldebugfs_register(OBD_LDLM_DEVICENAME, - debugfs_lustre_root, - 
NULL, NULL); - if (IS_ERR_OR_NULL(ldlm_debugfs_dir)) { - CERROR("LDebugFS failed in ldlm-init\n"); - rc = ldlm_debugfs_dir ? PTR_ERR(ldlm_debugfs_dir) : -ENOMEM; + LASSERT(ldlm_ns_proc_dir == NULL); + + ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(ldlm_type_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_type_proc_dir); GOTO(err, rc); } - ldlm_ns_debugfs_dir = ldebugfs_register("namespaces", - ldlm_debugfs_dir, - NULL, NULL); - if (IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) { + ldlm_ns_proc_dir = lprocfs_register("namespaces", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_ns_proc_dir)) { CERROR("LProcFS failed in ldlm-init\n"); - rc = ldlm_ns_debugfs_dir ? PTR_ERR(ldlm_ns_debugfs_dir) - : -ENOMEM; + rc = PTR_ERR(ldlm_ns_proc_dir); GOTO(err_type, rc); } - ldlm_svc_debugfs_dir = ldebugfs_register("services", - ldlm_debugfs_dir, - NULL, NULL); - if (IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) { + ldlm_svc_proc_dir = lprocfs_register("services", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_svc_proc_dir)) { CERROR("LProcFS failed in ldlm-init\n"); - rc = ldlm_svc_debugfs_dir ? PTR_ERR(ldlm_svc_debugfs_dir) - : -ENOMEM; + rc = PTR_ERR(ldlm_svc_proc_dir); GOTO(err_ns, rc); } - rc = ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); + rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); if (rc != 0) { CERROR("LProcFS failed in ldlm-init\n"); GOTO(err_svc, rc); @@ -262,32 +239,26 @@ int ldlm_debugfs_setup(void) RETURN(0); err_svc: - ldebugfs_remove(&ldlm_svc_debugfs_dir); + lprocfs_remove(&ldlm_svc_proc_dir); err_ns: - ldebugfs_remove(&ldlm_ns_debugfs_dir); + lprocfs_remove(&ldlm_ns_proc_dir); err_type: - ldebugfs_remove(&ldlm_debugfs_dir); + lprocfs_remove(&ldlm_type_proc_dir); err: - ldlm_svc_debugfs_dir = NULL; - ldlm_ns_debugfs_dir = NULL; - ldlm_debugfs_dir = NULL; - RETURN(rc); + ldlm_svc_proc_dir = NULL; + RETURN(rc); } -void ldlm_debugfs_cleanup(void) +void ldlm_proc_cleanup(void) { - if (!IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) - ldebugfs_remove(&ldlm_svc_debugfs_dir); + if (ldlm_svc_proc_dir) + lprocfs_remove(&ldlm_svc_proc_dir); - if (!IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) - ldebugfs_remove(&ldlm_ns_debugfs_dir); + if (ldlm_ns_proc_dir) + lprocfs_remove(&ldlm_ns_proc_dir); - if (!IS_ERR_OR_NULL(ldlm_debugfs_dir)) - ldebugfs_remove(&ldlm_debugfs_dir); - - ldlm_svc_debugfs_dir = NULL; - ldlm_ns_debugfs_dir = NULL; - ldlm_debugfs_dir = NULL; + if (ldlm_type_proc_dir) + lprocfs_remove(&ldlm_type_proc_dir); } static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, @@ -355,8 +326,18 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, CDEBUG(D_DLMTRACE, "dropping all unused locks from namespace %s\n", ldlm_ns_name(ns)); - /* Try to cancel all @ns_nr_unused locks. */ - ldlm_cancel_lru(ns, INT_MAX, 0, LDLM_LRU_FLAG_CLEANUP); + if (ns_connect_lru_resize(ns)) { + /* Try to cancel all @ns_nr_unused locks. 
*/ + ldlm_cancel_lru(ns, ns->ns_nr_unused, 0, + LDLM_LRU_FLAG_PASSED | + LDLM_LRU_FLAG_CLEANUP); + } else { + tmp = ns->ns_max_unused; + ns->ns_max_unused = 0; + ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED | + LDLM_LRU_FLAG_CLEANUP); + ns->ns_max_unused = tmp; + } return count; } @@ -379,6 +360,7 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, "changing namespace %s unused locks from %u to %u\n", ldlm_ns_name(ns), ns->ns_nr_unused, (unsigned int)tmp); + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); if (!lru_resize) { CDEBUG(D_DLMTRACE, @@ -386,12 +368,13 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, ldlm_ns_name(ns)); ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; } - ldlm_cancel_lru(ns, tmp, LCF_ASYNC, 0); } else { CDEBUG(D_DLMTRACE, "changing namespace %s max_unused from %u to %u\n", ldlm_ns_name(ns), ns->ns_max_unused, (unsigned int)tmp); + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); /* Make sure that LRU resize was originally supported before * turning it on here. @@ -403,8 +386,6 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, ldlm_ns_name(ns)); ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; } - ns->ns_max_unused = (unsigned int)tmp; - ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); } return count; @@ -428,6 +409,7 @@ static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, int scale = NSEC_PER_MSEC; unsigned long long tmp; char *buf; + int err; /* Did the user ask in seconds or milliseconds. Default is in ms */ buf = strstr(buffer, "ms"); @@ -440,7 +422,8 @@ static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, if (buf) *buf = '\0'; - if (kstrtoull(buffer, 10, &tmp)) + err = kstrtoull(buffer, 10, &tmp); + if (err != 0) return -EINVAL; ns->ns_max_age = ktime_set(0, tmp * scale); @@ -481,32 +464,6 @@ static ssize_t early_lock_cancel_store(struct kobject *kobj, } LUSTRE_RW_ATTR(early_lock_cancel); -static ssize_t dirty_age_limit_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - - return sprintf(buf, "%llu\n", ns->ns_dirty_age_limit); -} - -static ssize_t dirty_age_limit_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - unsigned long long tmp; - - if (kstrtoull(buffer, 10, &tmp)) - return -EINVAL; - - ns->ns_dirty_age_limit = tmp; - - return count; -} -LUSTRE_RW_ATTR(dirty_age_limit); - #ifdef HAVE_SERVER_SUPPORT static ssize_t ctime_age_limit_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -514,7 +471,7 @@ static ssize_t ctime_age_limit_show(struct kobject *kobj, struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - return sprintf(buf, "%llu\n", ns->ns_ctime_age_limit); + return sprintf(buf, "%u\n", ns->ns_ctime_age_limit); } static ssize_t ctime_age_limit_store(struct kobject *kobj, @@ -523,9 +480,11 @@ static ssize_t ctime_age_limit_store(struct kobject *kobj, { struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - unsigned long long tmp; + unsigned long tmp; + int err; - if (kstrtoull(buffer, 10, &tmp)) + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) return -EINVAL; ns->ns_ctime_age_limit = tmp; @@ -578,7 +537,7 @@ static ssize_t contention_seconds_show(struct kobject *kobj, 
struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - return sprintf(buf, "%llu\n", ns->ns_contention_time); + return sprintf(buf, "%u\n", ns->ns_contention_time); } static ssize_t contention_seconds_store(struct kobject *kobj, @@ -587,9 +546,11 @@ static ssize_t contention_seconds_store(struct kobject *kobj, { struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - unsigned long long tmp; + unsigned long tmp; + int err; - if (kstrtoull(buffer, 10, &tmp)) + err = kstrtoul(buffer, 10, &tmp); + if (err != 0) return -EINVAL; ns->ns_contention_time = tmp; @@ -664,7 +625,6 @@ static struct attribute *ldlm_ns_attrs[] = { &lustre_attr_lru_size.attr, &lustre_attr_lru_max_age.attr, &lustre_attr_early_lock_cancel.attr, - &lustre_attr_dirty_age_limit.attr, #ifdef HAVE_SERVER_SUPPORT &lustre_attr_ctime_age_limit.attr, &lustre_attr_lock_timeouts.attr, @@ -689,13 +649,13 @@ static struct kobj_type ldlm_ns_ktype = { .release = ldlm_ns_release, }; -static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) +static void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) { - if (IS_ERR_OR_NULL(ns->ns_debugfs_entry)) + if (ns->ns_proc_dir_entry == NULL) CERROR("dlm namespace %s has no procfs dir?\n", ldlm_ns_name(ns)); else - ldebugfs_remove(&ns->ns_debugfs_entry); + lprocfs_remove(&ns->ns_proc_dir_entry); if (ns->ns_stats != NULL) lprocfs_free_stats(&ns->ns_stats); @@ -728,23 +688,31 @@ int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) return err; } -static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) +static int ldlm_namespace_proc_register(struct ldlm_namespace *ns) { - struct dentry *ns_entry; + struct proc_dir_entry *ns_pde; - if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { - ns_entry = ns->ns_debugfs_entry; + LASSERT(ns != NULL); + LASSERT(ns->ns_rs_hash != NULL); + + if (ns->ns_proc_dir_entry != NULL) { + ns_pde = ns->ns_proc_dir_entry; } else { - ns_entry = debugfs_create_dir(ldlm_ns_name(ns), - ldlm_ns_debugfs_dir); - if (!ns_entry) + ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir); + if (ns_pde == NULL) return -ENOMEM; - ns->ns_debugfs_entry = ns_entry; + ns->ns_proc_dir_entry = ns_pde; } return 0; } #undef MAX_STRING_SIZE +#else /* CONFIG_PROC_FS */ + +#define ldlm_namespace_proc_unregister(ns) ({;}) +#define ldlm_namespace_proc_register(ns) ({0;}) + +#endif /* CONFIG_PROC_FS */ static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) @@ -959,12 +927,9 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, nsb->nsb_reclaim_start = 0; } - ns->ns_obd = obd; - ns->ns_appetite = apt; - ns->ns_client = client; - ns->ns_name = kstrdup(name, GFP_KERNEL); - if (!ns->ns_name) - goto out_hash; + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; INIT_LIST_HEAD(&ns->ns_list_chain); INIT_LIST_HEAD(&ns->ns_unused_list); @@ -981,14 +946,12 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; - ns->ns_dirty_age_limit = LDLM_DIRTY_AGE_LIMIT; ns->ns_timeouts = 0; ns->ns_orig_connect_flags = 0; ns->ns_connect_flags = 0; ns->ns_stopping = 0; ns->ns_reclaim_start = 0; ns->ns_last_pos = &ns->ns_unused_list; - ns->ns_flags = 0; rc = ldlm_namespace_sysfs_register(ns); if (rc) { @@ -996,7 +959,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char 
*name, GOTO(out_hash, rc); } - rc = ldlm_namespace_debugfs_register(ns); + rc = ldlm_namespace_proc_register(ns); if (rc) { CERROR("Can't initialize ns proc, rc %d\n", rc); GOTO(out_sysfs, rc); @@ -1012,13 +975,12 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ldlm_namespace_register(ns, client); RETURN(ns); out_proc: - ldlm_namespace_debugfs_unregister(ns); + ldlm_namespace_proc_unregister(ns); out_sysfs: ldlm_namespace_sysfs_unregister(ns); ldlm_namespace_cleanup(ns, 0); out_hash: - kfree(ns->ns_name); - cfs_hash_putref(ns->ns_rs_hash); + cfs_hash_putref(ns->ns_rs_hash); out_ns: OBD_FREE_PTR(ns); out_ref: @@ -1117,13 +1079,14 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) { - struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_resource *res = cfs_hash_object(hs, hnode); __u64 flags = *(__u64 *)arg; - cleanup_resource(res, &res->lr_granted, flags); - cleanup_resource(res, &res->lr_waiting, flags); + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_converting, flags); + cleanup_resource(res, &res->lr_waiting, flags); - return 0; + return 0; } static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, @@ -1137,8 +1100,7 @@ static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, atomic_read(&res->lr_refcount) - 1); - /* Use D_NETERROR since it is in the default mask */ - ldlm_resource_dump(D_NETERROR, res); + ldlm_resource_dump(D_ERROR, res); unlock_res(res); return 0; } @@ -1280,14 +1242,12 @@ void ldlm_namespace_free_post(struct ldlm_namespace *ns) * Removing it after @dir may cause oops. */ ldlm_pool_fini(&ns->ns_pool); - ldlm_namespace_debugfs_unregister(ns); + ldlm_namespace_proc_unregister(ns); ldlm_namespace_sysfs_unregister(ns); cfs_hash_putref(ns->ns_rs_hash); - kfree(ns->ns_name); /* Namespace \a ns should be not on list at this time, otherwise * this will cause issues related to using freed \a ns in poold - * thread. - */ + * thread. */ LASSERT(list_empty(&ns->ns_list_chain)); OBD_FREE_PTR(ns); ldlm_put_ref(); @@ -1392,62 +1352,33 @@ struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) struct ldlm_namespace, ns_list_chain); } -static bool ldlm_resource_extent_new(struct ldlm_resource *res) -{ - int idx; - - OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - if (res->lr_itree == NULL) - return false; - /* Initialize interval trees for each lock mode. */ - for (idx = 0; idx < LCK_MODE_NUM; idx++) { - res->lr_itree[idx].lit_size = 0; - res->lr_itree[idx].lit_mode = 1 << idx; - res->lr_itree[idx].lit_root = NULL; - } - return true; -} - -static bool ldlm_resource_inodebits_new(struct ldlm_resource *res) -{ - int i; - - OBD_ALLOC_PTR(res->lr_ibits_queues); - if (res->lr_ibits_queues == NULL) - return false; - for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) - INIT_LIST_HEAD(&res->lr_ibits_queues->liq_waiting[i]); - return true; -} - /** Create and initialize new resource. 
*/ static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) { struct ldlm_resource *res; - bool rc; + int idx; OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); if (res == NULL) return NULL; - switch (ldlm_type) { - case LDLM_EXTENT: - rc = ldlm_resource_extent_new(res); - break; - case LDLM_IBITS: - rc = ldlm_resource_inodebits_new(res); - break; - default: - rc = true; - break; - } - if (!rc) { - OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); - return NULL; + if (ldlm_type == LDLM_EXTENT) { + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; + } + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } } INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_converting); INIT_LIST_HEAD(&res->lr_waiting); atomic_set(&res->lr_refcount, 1); @@ -1462,20 +1393,6 @@ static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) return res; } -static void ldlm_resource_free(struct ldlm_resource *res) -{ - if (res->lr_type == LDLM_EXTENT) { - if (res->lr_itree != NULL) - OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - } else if (res->lr_type == LDLM_IBITS) { - if (res->lr_ibits_queues != NULL) - OBD_FREE_PTR(res->lr_ibits_queues); - } - - OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); -} - /** * Return a reference to resource with given name, creating it if necessary. * Args: namespace with ns_lock unlocked @@ -1530,7 +1447,10 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); /* Clean lu_ref for failed resource. */ lu_ref_fini(&res->lr_reference); - ldlm_resource_free(res); + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); found: res = hlist_entry(hnode, struct ldlm_resource, lr_hash); return res; @@ -1571,23 +1491,28 @@ struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, struct ldlm_resource *res) { - struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; if (!list_empty(&res->lr_granted)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_converting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } if (!list_empty(&res->lr_waiting)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } - cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, - bd, &res->lr_hash); - lu_ref_fini(&res->lr_reference); - if (cfs_hash_bd_count_get(bd) == 0) - ldlm_namespace_put(nsb->nsb_namespace); + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); } /* Returns 1 if the resource was freed, 0 if it remains. 
*/ @@ -1606,7 +1531,10 @@ int ldlm_resource_putref(struct ldlm_resource *res) cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) ns->ns_lvbo->lvbo_free(res); - ldlm_resource_free(res); + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); return 1; } return 0; @@ -1631,9 +1559,6 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, LASSERT(list_empty(&lock->l_res_link)); list_add_tail(&lock->l_res_link, head); - - if (res->lr_type == LDLM_IBITS) - ldlm_inodebits_add_lock(res, head, lock); } /** @@ -1666,18 +1591,11 @@ void ldlm_resource_unlink_lock(struct ldlm_lock *lock) { int type = lock->l_resource->lr_type; - check_res_locked(lock->l_resource); - switch (type) { - case LDLM_PLAIN: - ldlm_unlink_lock_skiplist(lock); - break; - case LDLM_EXTENT: - ldlm_extent_unlink_lock(lock); - break; - case LDLM_IBITS: - ldlm_inodebits_unlink_lock(lock); - break; - } + check_res_locked(lock->l_resource); + if (type == LDLM_IBITS || type == LDLM_PLAIN) + ldlm_unlink_lock_skiplist(lock); + else if (type == LDLM_EXTENT) + ldlm_extent_unlink_lock(lock); list_del_init(&lock->l_res_link); } EXPORT_SYMBOL(ldlm_resource_unlink_lock); @@ -1737,14 +1655,14 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) ldlm_ns_name(ns), atomic_read(&ns->ns_bref), ns_is_client(ns) ? "client" : "server"); - if (ktime_get_seconds() < ns->ns_next_dump) + if (cfs_time_before(cfs_time_current(), ns->ns_next_dump)) return; cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_res_hash_dump, (void *)(unsigned long)level, 0); spin_lock(&ns->ns_lock); - ns->ns_next_dump = ktime_get_seconds() + 10; + ns->ns_next_dump = cfs_time_shift(10); spin_unlock(&ns->ns_lock); } @@ -1777,11 +1695,15 @@ void ldlm_resource_dump(int level, struct ldlm_resource *res) } } } - + if (!list_empty(&res->lr_converting)) { + CDEBUG(level, "Converting locks:\n"); + list_for_each_entry(lock, &res->lr_converting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } if (!list_empty(&res->lr_waiting)) { - CDEBUG(level, "Waiting locks:\n"); + CDEBUG(level, "Waiting locks:\n"); list_for_each_entry(lock, &res->lr_waiting, l_res_link) - LDLM_DEBUG_LIMIT(level, lock, "###"); - } + LDLM_DEBUG_LIMIT(level, lock, "###"); + } } EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile index 19f415face716..96430e764665b 100644 --- a/drivers/staging/lustrefsx/lustre/llite/Makefile +++ b/drivers/staging/lustrefsx/lustre/llite/Makefile @@ -7,7 +7,7 @@ lustre-y += rw26.o super25.o statahead.o xattr_security.o lustre-y += glimpse.o lustre-y += lcommon_cl.o lustre-y += lcommon_misc.o -lustre-y += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o +lustre-y += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o lustre-y += range_lock.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c index 6a026f0f176e5..6da6b5956ab4e 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dcache.c +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,6 +38,7 @@ #define DEBUG_SUBSYSTEM S_LLITE #include +#include #include #include "llite_internal.h" diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c index dd2452a3459a6..6e987fe2f7387 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dir.c +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,20 +38,21 @@ #include #include #include -#include #include #ifdef HAVE_UIDGID_HEADER # include #endif -#include +#include #include // for wait_on_buffer #include #define DEBUG_SUBSYSTEM S_LLITE +#include + #include #include -#include +#include #include #include #include @@ -321,7 +322,6 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; int api32 = ll_need_32bit_api(sbi); struct md_op_data *op_data; - struct lu_fid pfid = { 0 }; __u64 pos; int rc; ENTRY; @@ -341,36 +341,34 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) */ GOTO(out, rc = 0); - if (unlikely(ll_i2info(inode)->lli_lsm_md != NULL)) { - /* - * This is only needed for striped dir to fill .., - * see lmv_read_page() - */ + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + + if (unlikely(op_data->op_mea1 != NULL)) { + /* This is only needed for striped dir to fill .., + * see lmv_read_entry */ if (file_dentry(filp)->d_parent != NULL && file_dentry(filp)->d_parent->d_inode != NULL) { - __u64 ibits = MDS_INODELOCK_LOOKUP; + __u64 ibits = MDS_INODELOCK_UPDATE; struct inode *parent = file_dentry(filp)->d_parent->d_inode; if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) - pfid = *ll_inode2fid(parent); + op_data->op_fid3 = *ll_inode2fid(parent); } /* If it can not find in cache, do lookup .. on the master * object */ - if (fid_is_zero(&pfid)) { - rc = ll_dir_get_parent_fid(inode, &pfid); - if (rc != 0) + if (fid_is_zero(&op_data->op_fid3)) { + rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); + if (rc != 0) { + ll_finish_md_op_data(op_data); RETURN(rc); + } } } - - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, - LUSTRE_OPC_ANY, inode); - if (IS_ERR(op_data)) - GOTO(out, rc = PTR_ERR(op_data)); - op_data->op_fid3 = pfid; - #ifdef HAVE_DIR_CONTEXT ctx->pos = pos; rc = ll_dir_read(inode, &pos, op_data, ctx); @@ -437,7 +435,7 @@ static int ll_send_mgc_param(struct obd_export *mgc, char *string) * <0 if the creation is failed. 
*/ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, - size_t len, const char *dirname, umode_t mode) + const char *dirname, umode_t mode) { struct inode *parent = dparent->d_inode; struct ptlrpc_request *request = NULL; @@ -456,8 +454,7 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, int err; ENTRY; - if (unlikely(lump->lum_magic != LMV_USER_MAGIC && - lump->lum_magic != LMV_USER_MAGIC_SPECIFIC)) + if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) RETURN(-EINVAL); CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s " @@ -473,8 +470,7 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) RETURN(-ENOENT); - if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && - lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) lustre_swab_lmv_user_md(lump); if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) @@ -499,7 +495,7 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, } op_data->op_cli_flags |= CLI_SET_MEA; - err = md_create(sbi->ll_md_exp, op_data, lump, len, mode, + err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid()), cfs_curproc_cap_pack(), 0, &request); @@ -540,67 +536,69 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) - struct lustre_sb_info *lsi = s2lsi(inode->i_sb); - struct obd_device *mgc = lsi->lsi_mgc; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; #endif - int lum_size; + int lum_size; ENTRY; - if (lump != NULL) { - switch (lump->lmm_magic) { - case LOV_USER_MAGIC_V1: - lum_size = sizeof(struct lov_user_md_v1); - break; - case LOV_USER_MAGIC_V3: - lum_size = sizeof(struct lov_user_md_v3); - break; - case LOV_USER_MAGIC_COMP_V1: - lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size; + if (lump != NULL) { + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. 
+ */ + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(lump); + lum_size = sizeof(struct lov_user_md_v1); + break; + } + case LOV_USER_MAGIC_V3: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lump); + lum_size = sizeof(struct lov_user_md_v3); + break; + } + case LOV_USER_MAGIC_COMP_V1: { + if (lump->lmm_magic != + cpu_to_le32(LOV_USER_MAGIC_COMP_V1)) + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lump); + lum_size = le32_to_cpu( + ((struct lov_comp_md_v1 *)lump)->lcm_size); break; - case LMV_USER_MAGIC: + } + case LMV_USER_MAGIC: { if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) lustre_swab_lmv_user_md( (struct lmv_user_md *)lump); lum_size = sizeof(struct lmv_user_md); break; - case LOV_USER_MAGIC_SPECIFIC: { - struct lov_user_md_v3 *v3 = - (struct lov_user_md_v3 *)lump; - if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) - RETURN(-EINVAL); - lum_size = lov_user_md_size(v3->lmm_stripe_count, - LOV_USER_MAGIC_SPECIFIC); - break; - } - default: - CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" - " %#08x != %#08x nor %#08x\n", - lump->lmm_magic, LOV_USER_MAGIC_V1, - LOV_USER_MAGIC_V3); - RETURN(-EINVAL); } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + } + } else { + lum_size = sizeof(struct lov_user_md_v1); + } - /* - * This is coming from userspace, so should be in - * local endian. But the MDS would like it in little - * endian, so we swab it before we send it. - */ - if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == - le32_to_cpu(LOV_MAGIC_MAGIC)) - lustre_swab_lov_user_md(lump, 0); - } else { - lum_size = sizeof(struct lov_user_md_v1); - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); /* swabbing is done in lov_setstripe() on server side */ rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); @@ -663,10 +661,16 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, RETURN(rc); } -static int ll_dir_get_default_layout(struct inode *inode, void **plmm, - int *plmm_size, - struct ptlrpc_request **request, u64 valid, - enum get_default_layout_type type) +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. 
+ * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct mdt_body *body; @@ -674,7 +678,6 @@ static int ll_dir_get_default_layout(struct inode *inode, void **plmm, struct ptlrpc_request *req = NULL; int rc, lmm_size; struct md_op_data *op_data; - struct lu_fid fid; ENTRY; rc = ll_get_default_mdsize(sbi, &lmm_size); @@ -688,19 +691,11 @@ static int ll_dir_get_default_layout(struct inode *inode, void **plmm, RETURN(PTR_ERR(op_data)); op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; - - if (type == GET_DEFAULT_LAYOUT_ROOT) { - lu_root_fid(&op_data->op_fid1); - fid = op_data->op_fid1; - } else { - fid = *ll_inode2fid(inode); - } - rc = md_getattr(sbi->ll_md_exp, op_data, &req); ll_finish_md_op_data(op_data); if (rc < 0) { - CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n", - PFID(&fid), rc); + CDEBUG(D_INFO, "md_getattr failed on inode " + DFID": rc %d\n", PFID(ll_inode2fid(inode)), rc); GOTO(out, rc); } @@ -726,11 +721,17 @@ static int ll_dir_get_default_layout(struct inode *inode, void **plmm, /* We don't swab objects for directories */ switch (le32_to_cpu(lmm->lmm_magic)) { case LOV_MAGIC_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; case LOV_MAGIC_V3: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; case LOV_MAGIC_COMP_V1: - case LOV_USER_MAGIC_SPECIFIC: if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmm); break; case LMV_MAGIC_V1: if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) @@ -751,75 +752,6 @@ static int ll_dir_get_default_layout(struct inode *inode, void **plmm, return rc; } -/** - * This function will be used to get default LOV/LMV/Default LMV - * @valid will be used to indicate which stripe it will retrieve. - * If the directory does not have its own default layout, then the - * function will request the default layout from root FID. - * OBD_MD_MEA LMV stripe EA - * OBD_MD_DEFAULT_MEA Default LMV stripe EA - * otherwise Default LOV EA. - * Each time, it can only retrieve 1 stripe EA - **/ -int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, - struct ptlrpc_request **root_request, - u64 valid) -{ - struct ptlrpc_request *req = NULL; - struct ptlrpc_request *root_req = NULL; - struct lov_mds_md *lmm = NULL; - int lmm_size = 0; - int rc = 0; - ENTRY; - - rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, - &req, valid, 0); - if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && - !(valid & (OBD_MD_MEA|OBD_MD_DEFAULT_MEA)) && root_request != NULL){ - int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, - &lmm_size, &root_req, valid, - GET_DEFAULT_LAYOUT_ROOT); - if (rc2 == 0) - rc = 0; - } - - *plmm = lmm; - *plmm_size = lmm_size; - *request = req; - if (root_request != NULL) - *root_request = root_req; - - RETURN(rc); -} - -/** - * This function will be used to get default LOV/LMV/Default LMV - * @valid will be used to indicate which stripe it will retrieve - * OBD_MD_MEA LMV stripe EA - * OBD_MD_DEFAULT_MEA Default LMV stripe EA - * otherwise Default LOV EA. 
- * Each time, it can only retrieve 1 stripe EA - **/ -int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, u64 valid) -{ - struct ptlrpc_request *req = NULL; - struct lov_mds_md *lmm = NULL; - int lmm_size = 0; - int rc = 0; - ENTRY; - - rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, - &req, valid, 0); - - *plmm = lmm; - *plmm_size = lmm_size; - *request = req; - - RETURN(rc); -} - int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) { struct md_op_data *op_data; @@ -1028,110 +960,25 @@ static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) } -static int copy_and_ct_start(int cmd, struct obd_export *exp, - const struct lustre_kernelcomm __user *data) +static int copy_and_ioctl(int cmd, struct obd_export *exp, + const void __user *data, size_t size) { - struct lustre_kernelcomm *lk; - struct lustre_kernelcomm *tmp; - size_t size = sizeof(*lk); - size_t new_size; - int i; + void *copy; int rc; - /* copy data from userspace to get numbers of archive_id */ - OBD_ALLOC(lk, size); - if (lk == NULL) + OBD_ALLOC(copy, size); + if (copy == NULL) return -ENOMEM; - if (copy_from_user(lk, data, size)) - GOTO(out_lk, rc = -EFAULT); - - if (lk->lk_flags & LK_FLG_STOP) - goto do_ioctl; - - if (!(lk->lk_flags & LK_FLG_DATANR)) { - __u32 archive_mask = lk->lk_data_count; - int count; - - /* old hsm agent to old MDS */ - if (!exp_connect_archive_id_array(exp)) - goto do_ioctl; - - /* old hsm agent to new MDS */ - lk->lk_flags |= LK_FLG_DATANR; - - if (archive_mask == 0) - goto do_ioctl; - - count = hweight32(archive_mask); - new_size = offsetof(struct lustre_kernelcomm, lk_data[count]); - OBD_ALLOC(tmp, new_size); - if (tmp == NULL) - GOTO(out_lk, rc = -ENOMEM); - - memcpy(tmp, lk, size); - tmp->lk_data_count = count; - OBD_FREE(lk, size); - lk = tmp; - size = new_size; - - count = 0; - for (i = 0; i < sizeof(archive_mask) * 8; i++) { - if ((1 << i) & archive_mask) { - lk->lk_data[count] = i + 1; - count++; - } - } - goto do_ioctl; - } - - /* new hsm agent to new mds */ - if (lk->lk_data_count > 0) { - new_size = offsetof(struct lustre_kernelcomm, - lk_data[lk->lk_data_count]); - OBD_ALLOC(tmp, new_size); - if (tmp == NULL) - GOTO(out_lk, rc = -ENOMEM); - - OBD_FREE(lk, size); - lk = tmp; - size = new_size; - - if (copy_from_user(lk, data, size)) - GOTO(out_lk, rc = -EFAULT); + if (copy_from_user(copy, data, size)) { + rc = -EFAULT; + goto out; } - /* new hsm agent to old MDS */ - if (!exp_connect_archive_id_array(exp)) { - __u32 archives = 0; - - if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE) - GOTO(out_lk, rc = -EINVAL); - - for (i = 0; i < lk->lk_data_count; i++) { - if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) { - rc = -EINVAL; - CERROR("%s: archive id %d requested but only " - "[0 - %zu] supported: rc = %d\n", - exp->exp_obd->obd_name, lk->lk_data[i], - LL_HSM_ORIGIN_MAX_ARCHIVE, rc); - GOTO(out_lk, rc); - } - - if (lk->lk_data[i] == 0) { - archives = 0; - break; - } + rc = obd_iocontrol(cmd, exp, size, copy, NULL); +out: + OBD_FREE(copy, size); - archives |= (1 << (lk->lk_data[i] - 1)); - } - lk->lk_flags &= ~LK_FLG_DATANR; - lk->lk_data_count = archives; - } -do_ioctl: - rc = obd_iocontrol(cmd, exp, size, lk, NULL); -out_lk: - OBD_FREE(lk, size); return rc; } @@ -1152,38 +999,32 @@ static int check_owner(int type, int id) return 0; } -static int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) +static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl 
*qctl) { - struct ll_sb_info *sbi = ll_s2sbi(sb); - int cmd = qctl->qc_cmd; - int type = qctl->qc_type; - int id = qctl->qc_id; - int valid = qctl->qc_valid; - int rc = 0; - ENTRY; + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + ENTRY; - switch (cmd) { - case Q_SETQUOTA: - case Q_SETINFO: - case LUSTRE_Q_SETDEFAULT: + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: if (!cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - - if (sb->s_flags & SB_RDONLY) - RETURN(-EROFS); break; case Q_GETQUOTA: - case LUSTRE_Q_GETDEFAULT: if (check_owner(type, id) && (!cfs_capable(CFS_CAP_SYS_ADMIN))) RETURN(-EPERM); - break; - case Q_GETINFO: - break; - default: - CERROR("unsupported quotactl op: %#x\n", cmd); - RETURN(-ENOTSUPP); - } + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + RETURN(-ENOTTY); + } if (valid != QC_GENERAL) { if (cmd == Q_GETINFO) @@ -1280,54 +1121,6 @@ static int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) RETURN(rc); } -int ll_rmfid(struct file *file, void __user *arg) -{ - const struct fid_array __user *ufa = arg; - struct fid_array *lfa = NULL; - size_t size; - unsigned nr; - int i, rc, *rcs = NULL; - ENTRY; - - if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && - !(ll_i2sbi(file_inode(file))->ll_flags & LL_SBI_USER_FID2PATH)) - RETURN(-EPERM); - /* Only need to get the buflen */ - if (get_user(nr, &ufa->fa_nr)) - RETURN(-EFAULT); - /* DoS protection */ - if (nr > OBD_MAX_FIDS_IN_ARRAY) - RETURN(-E2BIG); - - size = offsetof(struct fid_array, fa_fids[nr]); - OBD_ALLOC(lfa, size); - if (!lfa) - RETURN(-ENOMEM); - OBD_ALLOC(rcs, sizeof(int) * nr); - if (!rcs) - GOTO(free_lfa, rc = -ENOMEM); - - if (copy_from_user(lfa, arg, size)) - GOTO(free_rcs, rc = -EFAULT); - - /* Call mdc_iocontrol */ - rc = md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL); - if (!rc) { - for (i = 0; i < nr; i++) - if (rcs[i]) - lfa->fa_fids[i].f_ver = rcs[i]; - if (copy_to_user(arg, lfa, size)) - rc = -EFAULT; - } - -free_rcs: - OBD_FREE(rcs, sizeof(int) * nr); -free_lfa: - OBD_FREE(lfa, size); - - RETURN(rc); -} - /* This function tries to get a single name component, * to send to the server. No actual path traversal involved, * so we limit to NAME_MAX */ @@ -1360,46 +1153,46 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_ioctl_data *data; - int rc = 0; - ENTRY; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", PFID(ll_inode2fid(inode)), inode, cmd); - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - switch (cmd) { - case FS_IOC_GETFLAGS: - case FS_IOC_SETFLAGS: - RETURN(ll_iocontrol(inode, file, cmd, arg)); - case FSFILT_IOC_GETVERSION: - case FS_IOC_GETVERSION: + /* asm-ppc{,64} declares TCGETS, et. al. 
as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch(cmd) { + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: RETURN(put_user(inode->i_generation, (int __user *)arg)); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. */ - case FS_IOC_SETVERSION: - RETURN(-ENOTSUPP); - - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - RETURN(mdtidx); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. + case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); if (put_user((int)mdtidx, (int __user *)arg)) - RETURN(-EFAULT); + RETURN(-EFAULT); - return 0; - } - case IOC_MDC_LOOKUP: { - int namelen, len = 0; + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; char *buf = NULL; char *filename; @@ -1455,9 +1248,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) lum = (struct lmv_user_md *)data->ioc_inlbuf2; lumlen = data->ioc_inllen2; - if ((lum->lum_magic != LMV_USER_MAGIC && - lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) || - lumlen < sizeof(*lum)) { + if (lum->lum_magic != LMV_USER_MAGIC || + lumlen != sizeof(*lum)) { CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", filename, lum->lum_magic, lumlen, -EFAULT); GOTO(lmv_out_free, rc = -EINVAL); @@ -1468,7 +1260,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #else mode = data->ioc_type; #endif - rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode); + rc = ll_dir_setdirstripe(dentry, lum, filename, mode); lmv_out_free: OBD_FREE_LARGE(buf, len); RETURN(rc); @@ -1492,51 +1284,34 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case LL_IOC_LOV_SETSTRIPE_NEW: case LL_IOC_LOV_SETSTRIPE: { - struct lov_user_md_v3 *lumv3 = NULL; - struct lov_user_md_v1 lumv1; - struct lov_user_md_v1 *lumv1_ptr = &lumv1; + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; struct lov_user_md_v1 __user *lumv1p = (struct lov_user_md_v1 __user *)arg; struct lov_user_md_v3 __user *lumv3p = (struct lov_user_md_v3 __user *)arg; - int lum_size = 0; int set_default = 0; CLASSERT(sizeof(struct lov_user_md_v3) > sizeof(struct lov_comp_md_v1)); - CLASSERT(sizeof(*lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3.lmm_objects[0]) == + sizeof(lumv3p->lmm_objects[0])); /* first try with v1 which is smaller than v3 */ - if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) - RETURN(-EFAULT); + if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) + RETURN(-EFAULT); + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) + if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) + RETURN(-EFAULT); if (inode->i_sb->s_root == file_dentry(file)) set_default = 1; - switch (lumv1.lmm_magic) { - case LOV_USER_MAGIC_V3: - case LOV_USER_MAGIC_SPECIFIC: - lum_size = ll_lov_user_md_size(&lumv1); - if (lum_size < 0) - RETURN(lum_size); - OBD_ALLOC(lumv3, lum_size); - if (!lumv3) - 
RETURN(-ENOMEM); - if (copy_from_user(lumv3, lumv3p, lum_size)) - GOTO(out, rc = -EFAULT); - lumv1_ptr = (struct lov_user_md_v1 *)lumv3; - break; - case LOV_USER_MAGIC_V1: - break; - default: - GOTO(out, rc = -ENOTSUPP); - } - /* in v1 and v3 cases lumv1 points to data */ - rc = ll_dir_setstripe(inode, lumv1_ptr, set_default); -out: - if (lumv3) - OBD_FREE(lumv3, lum_size); + rc = ll_dir_setstripe(inode, lumv1, set_default); + RETURN(rc); } case LL_IOC_LMV_GETSTRIPE: { @@ -1544,7 +1319,6 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) (struct lmv_user_md __user *)arg; struct lmv_user_md lum; struct ptlrpc_request *request = NULL; - struct ptlrpc_request *root_request = NULL; union lmv_mds_md *lmm = NULL; int lmmsize; u64 valid = 0; @@ -1570,8 +1344,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) else RETURN(-EINVAL); - rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize, - &request, &root_request, valid); + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, + valid); if (rc != 0) GOTO(finish_req, rc); @@ -1594,8 +1368,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) GOTO(finish_req, rc = -E2BIG); } - lum_size = lmv_user_md_size(stripe_count, - LMV_USER_MAGIC_SPECIFIC); + lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); OBD_ALLOC(tmp, lum_size); if (tmp == NULL) GOTO(finish_req, rc = -ENOMEM); @@ -1612,15 +1385,12 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct lu_fid fid; fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); - if (fid_is_sane(&fid)) { - mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); - if (mdt_index < 0) - GOTO(out_tmp, rc = mdt_index); - - tmp->lum_objects[i].lum_mds = mdt_index; - tmp->lum_objects[i].lum_fid = fid; - } + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; tmp->lum_stripe_count++; } @@ -1630,7 +1400,6 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE(tmp, lum_size); finish_req: ptlrpc_req_finished(request); - ptlrpc_req_finished(root_request); return rc; } @@ -1661,8 +1430,6 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ll_putname(filename); RETURN(rc); } - case LL_IOC_RMFID: - RETURN(ll_rmfid(file, (void __user *)arg)); case LL_IOC_LOV_SWAP_LAYOUTS: RETURN(-EPERM); case IOC_OBD_STATFS: @@ -1670,93 +1437,62 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_LOV_GETSTRIPE: case LL_IOC_LOV_GETSTRIPE_NEW: case LL_IOC_MDC_GETINFO: - case LL_IOC_MDC_GETINFO_OLD: case IOC_MDC_GETFILEINFO: - case IOC_MDC_GETFILEINFO_OLD: case IOC_MDC_GETFILESTRIPE: { struct ptlrpc_request *request = NULL; - struct ptlrpc_request *root_request = NULL; struct lov_user_md __user *lump; - struct lov_mds_md *lmm = NULL; - struct mdt_body *body; - char *filename = NULL; - lstat_t __user *statp = NULL; - lstatx_t __user *stxp = NULL; - __u64 __user *flagsp = NULL; - __u32 __user *lmmsizep = NULL; - struct lu_fid __user *fidp = NULL; - int lmmsize; - - if (cmd == IOC_MDC_GETFILEINFO_OLD || - cmd == IOC_MDC_GETFILEINFO || - cmd == IOC_MDC_GETFILESTRIPE) { - filename = ll_getname((const char __user *)arg); - if (IS_ERR(filename)) - RETURN(PTR_ERR(filename)); + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + int lmmsize; - 
rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, - &lmmsize, &request); - } else { - rc = ll_dir_getstripe_default(inode, (void **)&lmm, - &lmmsize, &request, - &root_request, 0); - } + if (cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char __user *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); - if (request) { - body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - LASSERT(body != NULL); + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); } else { - GOTO(out_req, rc); + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, + &request, 0); } - if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || - cmd == LL_IOC_MDC_GETINFO || - cmd == IOC_MDC_GETFILEINFO_OLD || - cmd == LL_IOC_MDC_GETINFO_OLD)) { - lmmsize = 0; - rc = 0; - } + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } - if (rc < 0) - GOTO(out_req, rc); + if (rc < 0) { + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO)) + GOTO(skip_lmm, rc = 0); + else + GOTO(out_req, rc); + } if (cmd == IOC_MDC_GETFILESTRIPE || cmd == LL_IOC_LOV_GETSTRIPE || cmd == LL_IOC_LOV_GETSTRIPE_NEW) { lump = (struct lov_user_md __user *)arg; - } else if (cmd == IOC_MDC_GETFILEINFO_OLD || - cmd == LL_IOC_MDC_GETINFO_OLD){ - struct lov_user_mds_data_v1 __user *lmdp; - - lmdp = (struct lov_user_mds_data_v1 __user *)arg; - statp = &lmdp->lmd_st; - lump = &lmdp->lmd_lmm; - } else { + } else { struct lov_user_mds_data __user *lmdp; - lmdp = (struct lov_user_mds_data __user *)arg; - fidp = &lmdp->lmd_fid; - stxp = &lmdp->lmd_stx; - flagsp = &lmdp->lmd_flags; - lmmsizep = &lmdp->lmd_lmmsize; - lump = &lmdp->lmd_lmm; - } - - if (lmmsize == 0) { - /* If the file has no striping then zero out *lump so - * that the caller isn't confused by garbage. 
*/ - if (clear_user(lump, sizeof(*lump))) - GOTO(out_req, rc = -EFAULT); - } else if (copy_to_user(lump, lmm, lmmsize)) { + lump = &lmdp->lmd_lmm; + } + if (copy_to_user(lump, lmm, lmmsize)) { if (copy_to_user(lump, lmm, sizeof(*lump))) - GOTO(out_req, rc = -EFAULT); - rc = -EOVERFLOW; - } - - if (cmd == IOC_MDC_GETFILEINFO_OLD || - cmd == LL_IOC_MDC_GETINFO_OLD) { - lstat_t st = { 0 }; + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + skip_lmm: + if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { + struct lov_user_mds_data __user *lmdp; + lstat_t st = { 0 }; st.st_dev = inode->i_sb->s_dev; st.st_mode = body->mbo_mode; @@ -1774,86 +1510,29 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) sbi->ll_flags & LL_SBI_32BIT_API); - if (copy_to_user(statp, &st, sizeof(st))) - GOTO(out_req, rc = -EFAULT); - } else if (cmd == IOC_MDC_GETFILEINFO || - cmd == LL_IOC_MDC_GETINFO) { - lstatx_t stx = { 0 }; - __u64 valid = body->mbo_valid; - - stx.stx_blksize = PAGE_SIZE; - stx.stx_nlink = body->mbo_nlink; - stx.stx_uid = body->mbo_uid; - stx.stx_gid = body->mbo_gid; - stx.stx_mode = body->mbo_mode; - stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1, - sbi->ll_flags & - LL_SBI_32BIT_API); - stx.stx_size = body->mbo_size; - stx.stx_blocks = body->mbo_blocks; - stx.stx_atime.tv_sec = body->mbo_atime; - stx.stx_ctime.tv_sec = body->mbo_ctime; - stx.stx_mtime.tv_sec = body->mbo_mtime; - stx.stx_rdev_major = MAJOR(body->mbo_rdev); - stx.stx_rdev_minor = MINOR(body->mbo_rdev); - stx.stx_dev_major = MAJOR(inode->i_sb->s_dev); - stx.stx_dev_minor = MINOR(inode->i_sb->s_dev); - stx.stx_mask |= STATX_BASIC_STATS; - - /* - * For a striped directory, the size and blocks returned - * from MDT is not correct. - * The size and blocks are aggregated by client across - * all stripes. - * Thus for a striped directory, do not return the valid - * FLSIZE and FLBLOCKS flags to the caller. - * However, this whould be better decided by the MDS - * instead of the client. 
- */ - if (cmd == LL_IOC_MDC_GETINFO && - ll_i2info(inode)->lli_lsm_md != NULL) - valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); - - if (flagsp && copy_to_user(flagsp, &valid, - sizeof(*flagsp))) - GOTO(out_req, rc = -EFAULT); - - if (fidp && copy_to_user(fidp, &body->mbo_fid1, - sizeof(*fidp))) - GOTO(out_req, rc = -EFAULT); - - if (!(valid & OBD_MD_FLSIZE)) - stx.stx_mask &= ~STATX_SIZE; - if (!(valid & OBD_MD_FLBLOCKS)) - stx.stx_mask &= ~STATX_BLOCKS; - - if (stxp && copy_to_user(stxp, &stx, sizeof(stx))) - GOTO(out_req, rc = -EFAULT); - - if (lmmsizep && copy_to_user(lmmsizep, &lmmsize, - sizeof(*lmmsizep))) - GOTO(out_req, rc = -EFAULT); - } + lmdp = (struct lov_user_mds_data __user *)arg; + if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } - EXIT; -out_req: - ptlrpc_req_finished(request); - ptlrpc_req_finished(root_request); - if (filename) - ll_putname(filename); - return rc; - } + EXIT; + out_req: + ptlrpc_req_finished(request); + if (filename) + ll_putname(filename); + return rc; + } case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl; + struct if_quotactl *qctl; - OBD_ALLOC_PTR(qctl); - if (!qctl) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(qctl); + if (!qctl) + RETURN(-ENOMEM); if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) - GOTO(out_quotactl, rc = -EFAULT); + GOTO(out_quotactl, rc = -EFAULT); - rc = quotactl_ioctl(inode->i_sb, qctl); + rc = quotactl_ioctl(sbi, qctl); if (rc == 0 && copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) @@ -2004,8 +1683,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (!cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - rc = copy_and_ct_start(cmd, sbi->ll_md_exp, - (struct lustre_kernelcomm __user *)arg); + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, + sizeof(struct lustre_kernelcomm)); RETURN(rc); case LL_IOC_HSM_COPY_START: { @@ -2047,15 +1726,15 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(rc); } case LL_IOC_MIGRATE: { - struct lmv_user_md *lum; - char *buf = NULL; - int len; - char *filename; - int namelen = 0; - int rc; + char *buf = NULL; + const char *filename; + int namelen = 0; + int len; + int rc; + int mdtidx; rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc) + if (rc < 0) RETURN(rc); data = (struct obd_ioctl_data *)buf; @@ -2065,22 +1744,15 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) filename = data->ioc_inlbuf1; namelen = data->ioc_inllen1; - - if (namelen < 1 || namelen != strlen(filename) + 1) { - CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + /* \0 is packed at the end of filename */ + if (namelen < 1 || namelen != strlen(filename) + 1) GOTO(migrate_free, rc = -EINVAL); - } - lum = (struct lmv_user_md *)data->ioc_inlbuf2; - if (lum->lum_magic != LMV_USER_MAGIC && - lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) { - rc = -EINVAL; - CERROR("%s: wrong lum magic %x: rc = %d\n", - filename, lum->lum_magic, rc); - GOTO(migrate_free, rc); - } + if (data->ioc_inllen2 != sizeof(mdtidx)) + GOTO(migrate_free, rc = -EINVAL); + mdtidx = *(int *)data->ioc_inlbuf2; - rc = ll_migrate(inode, file, lum, filename); + rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); migrate_free: OBD_FREE_LARGE(buf, len); diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 22b09065f90f5..04cc72f451861 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ 
b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -45,19 +45,15 @@ #ifdef HAVE_UIDGID_HEADER # include #endif +#include -#include +#include #include #include "cl_object.h" #include "llite_internal.h" #include "vvp_internal.h" -struct split_param { - struct inode *sp_inode; - __u16 sp_mirror_id; -}; - static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); @@ -99,15 +95,12 @@ static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, op_data->op_attr.ia_mtime = inode->i_mtime; op_data->op_attr.ia_ctime = inode->i_ctime; op_data->op_attr.ia_size = i_size_read(inode); - op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME); - op_data->op_xvalid |= OP_XVALID_CTIME_SET; + op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET; op_data->op_attr_blocks = inode->i_blocks; op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); - if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) - op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; - op_data->op_open_handle = och->och_open_handle; + op_data->op_handle = och->och_fh; if (och->och_flags & FMODE_WRITE && ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED)) @@ -152,53 +145,20 @@ static int ll_close_inode_openhandle(struct inode *inode, ll_prepare_close(inode, op_data, och); switch (bias) { - case MDS_CLOSE_LAYOUT_MERGE: - /* merge blocks from the victim inode */ - op_data->op_attr_blocks += ((struct inode *)data)->i_blocks; - op_data->op_attr.ia_valid |= ATTR_SIZE; - op_data->op_xvalid |= OP_XVALID_BLOCKS; - /* fallthrough */ - case MDS_CLOSE_LAYOUT_SPLIT: - case MDS_CLOSE_LAYOUT_SWAP: { - struct split_param *sp = data; - + case MDS_CLOSE_LAYOUT_SWAP: LASSERT(data != NULL); - op_data->op_bias |= bias; + op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP; op_data->op_data_version = 0; op_data->op_lease_handle = och->och_lease_handle; - if (bias == MDS_CLOSE_LAYOUT_SPLIT) { - op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); - op_data->op_mirror_id = sp->sp_mirror_id; - } else { - op_data->op_fid2 = *ll_inode2fid(data); - } - break; - } - - case MDS_CLOSE_RESYNC_DONE: { - struct ll_ioc_lease *ioc = data; - - LASSERT(data != NULL); - op_data->op_attr_blocks += - ioc->lil_count * op_data->op_attr_blocks; - op_data->op_attr.ia_valid |= ATTR_SIZE; - op_data->op_xvalid |= OP_XVALID_BLOCKS; - op_data->op_bias |= MDS_CLOSE_RESYNC_DONE; - - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_data = &ioc->lil_ids[0]; - op_data->op_data_size = - ioc->lil_count * sizeof(ioc->lil_ids[0]); + op_data->op_fid2 = *ll_inode2fid(data); break; - } case MDS_HSM_RELEASE: LASSERT(data != NULL); op_data->op_bias |= MDS_HSM_RELEASE; op_data->op_data_version = *(__u64 *)data; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_attr.ia_valid |= ATTR_SIZE; - op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; break; default: @@ -206,17 +166,13 @@ static int ll_close_inode_openhandle(struct inode *inode, break; } - if (!(op_data->op_attr.ia_valid & ATTR_SIZE)) - op_data->op_xvalid |= OP_XVALID_LAZYSIZE; - if 
(!(op_data->op_xvalid & OP_XVALID_BLOCKS)) - op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS; - rc = md_close(md_exp, op_data, och->och_mod, &req); if (rc != 0 && rc != -EINTR) CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); - if (rc == 0 && op_data->op_bias & bias) { + if (rc == 0 && + op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) { struct mdt_body *body; body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); @@ -229,7 +185,7 @@ static int ll_close_inode_openhandle(struct inode *inode, out: md_clear_open_replay_data(md_exp, och); - och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + och->och_fh.cookie = DEAD_HANDLE_MAGIC; OBD_FREE_PTR(och); ptlrpc_req_finished(req); /* This is close request */ @@ -331,9 +287,7 @@ static int ll_md_close(struct inode *inode, struct file *file) } mutex_unlock(&lli->lli_och_mutex); - /* LU-4398: do not cache write open lock if the file has exec bit */ - if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) || - !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), + if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), LDLM_IBITS, &policy, lockmode, &lockh)) rc = ll_md_real_close(inode, fd->fd_omode); @@ -390,146 +344,12 @@ int ll_file_release(struct inode *inode, struct file *file) RETURN(rc); } -static inline int ll_dom_readpage(void *data, struct page *page) -{ - struct niobuf_local *lnb = data; - void *kaddr; - - kaddr = ll_kmap_atomic(page, KM_USER0); - memcpy(kaddr, lnb->lnb_data, lnb->lnb_len); - if (lnb->lnb_len < PAGE_SIZE) - memset(kaddr + lnb->lnb_len, 0, - PAGE_SIZE - lnb->lnb_len); - flush_dcache_page(page); - SetPageUptodate(page); - ll_kunmap_atomic(kaddr, KM_USER0); - unlock_page(page); - - return 0; -} - -void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req) -{ - struct lu_env *env; - struct cl_io *io; - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct address_space *mapping = inode->i_mapping; - struct page *vmpage; - struct niobuf_remote *rnb; - struct mdt_body *body; - char *data; - unsigned long index, start; - struct niobuf_local lnb; - __u16 refcheck; - int rc; - - ENTRY; - - if (obj == NULL) - RETURN_EXIT; - - if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE, - RCL_SERVER)) - RETURN_EXIT; - - rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE); - if (rnb == NULL || rnb->rnb_len == 0) - RETURN_EXIT; - - /* LU-11595: Server may return whole file and that is OK always or - * it may return just file tail and its offset must be aligned with - * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is - * smaller then offset may be not aligned and that data is just ignored. - */ - if (rnb->rnb_offset % PAGE_SIZE) - RETURN_EXIT; - - /* Server returns whole file or just file tail if it fills in reply - * buffer, in both cases total size should be equal to the file size. 
- */ - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) { - CERROR("%s: server returns off/len %llu/%u but size %llu\n", - ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset, - rnb->rnb_len, body->mbo_dom_size); - RETURN_EXIT; - } - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN_EXIT; - io = vvp_env_thread_io(env); - io->ci_obj = obj; - io->ci_ignore_layout = 1; - rc = cl_io_init(env, io, CIT_MISC, obj); - if (rc) - GOTO(out_io, rc); - - CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n", - rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size); - - data = (char *)rnb + sizeof(*rnb); - - lnb.lnb_file_offset = rnb->rnb_offset; - start = lnb.lnb_file_offset / PAGE_SIZE; - index = 0; - LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0); - lnb.lnb_page_offset = 0; - do { - struct cl_page *page; - - lnb.lnb_data = data + (index << PAGE_SHIFT); - lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT); - if (lnb.lnb_len > PAGE_SIZE) - lnb.lnb_len = PAGE_SIZE; - - vmpage = read_cache_page(mapping, index + start, - ll_dom_readpage, &lnb); - if (IS_ERR(vmpage)) { - CWARN("%s: cannot fill page %lu for "DFID - " with data: rc = %li\n", - ll_get_fsname(inode->i_sb, NULL, 0), - index + start, PFID(lu_object_fid(&obj->co_lu)), - PTR_ERR(vmpage)); - break; - } - lock_page(vmpage); - if (vmpage->mapping == NULL) { - unlock_page(vmpage); - put_page(vmpage); - /* page was truncated */ - break; - } - /* attach VM page to CL page cache */ - page = cl_page_find(env, obj, vmpage->index, vmpage, - CPT_CACHEABLE); - if (IS_ERR(page)) { - ClearPageUptodate(vmpage); - unlock_page(vmpage); - put_page(vmpage); - break; - } - cl_page_export(env, page, 1); - cl_page_put(env, page); - unlock_page(vmpage); - put_page(vmpage); - index++; - } while (rnb->rnb_len > (index << PAGE_SHIFT)); - -out_io: - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - - EXIT; -} - static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, struct lookup_intent *itp) { struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); struct dentry *parent = de->d_parent; - char *name = NULL; + const char *name = NULL; int len = 0; struct md_op_data *op_data; struct ptlrpc_request *req = NULL; @@ -541,43 +361,21 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, /* if server supports open-by-fid, or file name is invalid, don't pack * name in open request */ - if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) || - !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) { -retry: + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && + lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { + name = de->d_name.name; len = de->d_name.len; - name = kmalloc(len + 1, GFP_NOFS); - if (!name) - RETURN(-ENOMEM); - - /* race here */ - spin_lock(&de->d_lock); - if (len != de->d_name.len) { - spin_unlock(&de->d_lock); - kfree(name); - goto retry; - } - memcpy(name, de->d_name.name, len); - name[len] = '\0'; - spin_unlock(&de->d_lock); - - if (!lu_name_is_valid_2(name, len)) { - kfree(name); - RETURN(-ESTALE); - } } op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, name, len, 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - kfree(name); + if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); - } op_data->op_data = lmm; op_data->op_data_size = lmmsize; rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, &ll_md_blocking_ast, 0); - kfree(name); ll_finish_md_op_data(op_data); if (rc == -ESTALE) { /* reason 
for keep own exit path - don`t flood log @@ -600,25 +398,8 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, } rc = ll_prep_inode(&de->d_inode, req, NULL, itp); - - if (!rc && itp->it_lock_mode) { - __u64 bits = 0; - - /* If we got a lock back and it has a LOOKUP bit set, - * make sure the dentry is marked as valid so we can find it. - * We don't need to care about actual hashing since other bits - * of kernel will deal with that later. - */ - ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); - if (bits & MDS_INODELOCK_LOOKUP) - d_lustre_revalidate(de); - - /* if DoM bit returned along with LAYOUT bit then there - * can be read-on-open data returned. - */ - if (bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) - ll_dom_finish_open(de->d_inode, req); - } + if (!rc && itp->it_lock_mode) + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL); out: ptlrpc_req_finished(req); @@ -643,7 +424,7 @@ static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, struct mdt_body *body; body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - och->och_open_handle = body->mbo_open_handle; + och->och_fh = body->mbo_handle; och->och_fid = body->mbo_fid1; och->och_lease_handle.cookie = it->it_lock_handle; och->och_magic = OBD_CLIENT_HANDLE_MAGIC; @@ -713,7 +494,7 @@ int ll_file_open(struct inode *inode, struct file *file) fd = ll_file_data_get(); if (fd == NULL) - GOTO(out_nofiledata, rc = -ENOMEM); + GOTO(out_openerr, rc = -ENOMEM); fd->fd_file = file; if (S_ISDIR(inode->i_mode)) @@ -733,13 +514,12 @@ int ll_file_open(struct inode *inode, struct file *file) if (file->f_flags & O_TRUNC) oit.it_flags |= FMODE_WRITE; - /* kernel only call f_op->open in dentry_open. filp_open calls - * dentry_open after call to open_namei that checks permissions. - * Only nfsd_open call dentry_open directly without checking - * permissions and because of that this code below is safe. - */ - if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) - oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; /* We do not want O_EXCL here, presumably we opened the file * already? XXX - NFS implications? */ @@ -883,7 +663,6 @@ int ll_file_open(struct inode *inode, struct file *file) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); } -out_nofiledata: if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { ptlrpc_req_finished(it->it_request); it_clear_disposition(it, DISP_ENQ_OPEN_REF); @@ -921,7 +700,7 @@ static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, * if it has an open lock in cache already. 
*/ static int ll_lease_och_acquire(struct inode *inode, struct file *file, - struct lustre_handle *old_open_handle) + struct lustre_handle *old_handle) { struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); @@ -954,7 +733,7 @@ static int ll_lease_och_acquire(struct inode *inode, struct file *file, *och_p = NULL; } - *old_open_handle = fd->fd_och->och_open_handle; + *old_handle = fd->fd_och->och_fh; EXIT; out_unlock: @@ -1015,7 +794,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, struct ll_sb_info *sbi = ll_i2sbi(inode); struct md_op_data *op_data; struct ptlrpc_request *req = NULL; - struct lustre_handle old_open_handle = { 0 }; + struct lustre_handle old_handle = { 0 }; struct obd_client_handle *och = NULL; int rc; int rc2; @@ -1028,7 +807,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) RETURN(ERR_PTR(-EPERM)); - rc = ll_lease_och_acquire(inode, file, &old_open_handle); + rc = ll_lease_och_acquire(inode, file, &old_handle); if (rc) RETURN(ERR_PTR(rc)); } @@ -1043,7 +822,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, GOTO(out, rc = PTR_ERR(op_data)); /* To tell the MDT this openhandle is from the same owner */ - op_data->op_open_handle = old_open_handle; + op_data->op_handle = old_handle; it.it_flags = fmode | open_flags; it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; @@ -1069,9 +848,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, GOTO(out_release_it, rc); LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); - rc = ll_och_fill(sbi->ll_md_exp, &it, och); - if (rc) - GOTO(out_release_it, rc); + ll_och_fill(sbi->ll_md_exp, &it, och); if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ GOTO(out_close, rc = -EOPNOTSUPP); @@ -1159,7 +936,7 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, if (rc == 0) GOTO(out_free_och, rc = -EINVAL); - /* Close the file and {swap,merge} layouts between inode & inode2. + /* Close the file and swap layouts between inode & inode2. * NB: lease lock handle is released in mdc_close_layout_swap_pack() * because we still need it to pack l_remote_handle to MDT. */ rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, @@ -1178,10 +955,8 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, * Release lease and close the file. * It will check if the lease has ever broken. */ -static int ll_lease_close_intent(struct obd_client_handle *och, - struct inode *inode, - bool *lease_broken, enum mds_op_bias bias, - void *data) +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) { struct ldlm_lock *lock; bool cancelled = true; @@ -1196,71 +971,19 @@ static int ll_lease_close_intent(struct obd_client_handle *och, LDLM_LOCK_PUT(lock); } - CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n", - PFID(&ll_i2info(inode)->lli_fid), cancelled, bias); + CDEBUG(D_INODE, "lease for "DFID" broken? 
%d\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled); - if (lease_broken != NULL) - *lease_broken = cancelled; - - if (!cancelled && !bias) + if (!cancelled) ldlm_cli_cancel(&och->och_lease_handle, 0); - if (cancelled) { /* no need to excute intent */ - bias = 0; - data = NULL; - } + if (lease_broken != NULL) + *lease_broken = cancelled; - rc = ll_close_inode_openhandle(inode, och, bias, data); + rc = ll_close_inode_openhandle(inode, och, 0, NULL); RETURN(rc); } -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken) -{ - return ll_lease_close_intent(och, inode, lease_broken, 0, NULL); -} - -/** - * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT - */ -static int ll_lease_file_resync(struct obd_client_handle *och, - struct inode *inode, unsigned long arg) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ll_ioc_lease_id ioc; - __u64 data_version_unused; - int rc; - ENTRY; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); - - if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg, - sizeof(ioc))) - RETURN(-EFAULT); - - /* before starting file resync, it's necessary to clean up page cache - * in client memory, otherwise once the layout version is increased, - * writing back cached data will be denied the OSTs. */ - rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH); - if (rc) - GOTO(out, rc); - - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_mirror_id = ioc.lil_mirror_id; - rc = md_file_resync(sbi->ll_md_exp, op_data); - if (rc) - GOTO(out, rc); - - EXIT; -out: - ll_finish_md_op_data(op_data); - return rc; -} - int ll_merge_attr(const struct lu_env *env, struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1300,14 +1023,11 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) ctime = inode->i_ctime.tv_sec; cl_object_attr_lock(obj); - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE)) - rc = -EINVAL; - else - rc = cl_object_attr_get(env, obj, attr); + rc = cl_object_attr_get(env, obj, attr); cl_object_attr_unlock(obj); if (rc != 0) - GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc)); + GOTO(out_size_unlock, rc); if (atime < attr->cat_atime) atime = attr->cat_atime; @@ -1334,32 +1054,6 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) RETURN(rc); } -/** - * Set designated mirror for I/O. - * - * So far only read, write, and truncated can support to issue I/O to - * designated mirror. 
- */ -void ll_io_set_mirror(struct cl_io *io, const struct file *file) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - - /* clear layout version for generic(non-resync) I/O in case it carries - * stale layout version due to I/O restart */ - io->ci_layout_version = 0; - - /* FLR: disable non-delay for designated mirror I/O because obviously - * only one mirror is available */ - if (fd->fd_designated_mirror > 0) { - io->ci_ndelay = 0; - io->ci_designated_mirror = fd->fd_designated_mirror; - io->ci_layout_version = fd->fd_layout_version; - } - - CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n", - file->f_path.dentry->d_name.name, io->ci_designated_mirror); -} - static bool file_is_noatime(const struct file *file) { const struct vfsmount *mnt = file->f_path.mnt; @@ -1387,17 +1081,20 @@ static bool file_is_noatime(const struct file *file) return false; } +static int ll_file_io_ptask(struct cfs_ptask *ptask); + static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) { struct inode *inode = file_inode(file); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - - io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; - io->ci_lock_no_expand = fd->ll_lock_no_expand; + memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter)); + init_sync_kiocb(&io->u.ci_rw.rw_iocb, file); + io->u.ci_rw.rw_file = file; + io->u.ci_rw.rw_ptask = ll_file_io_ptask; + io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK); if (iot == CIT_WRITE) { - io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); - io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || + io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND); + io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC || file->f_flags & O_DIRECT || IS_SYNC(inode)); } @@ -1410,12 +1107,94 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) io->ci_lockreq = CILR_MANDATORY; } io->ci_noatime = file_is_noatime(file); + if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO) + io->ci_pio = !io->u.ci_rw.rw_append; + else + io->ci_pio = 0; +} + +static int ll_file_io_ptask(struct cfs_ptask *ptask) +{ + struct cl_io_pt *pt = ptask->pt_cbdata; + struct file *file = pt->cip_file; + struct lu_env *env; + struct cl_io *io; + loff_t pos = pt->cip_pos; + int rc; + __u16 refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", + file_dentry(file)->d_name.name, + pt->cip_iot == CIT_READ ? 
"read" : "write", + pos, pos + pt->cip_count); + +restart: + io = vvp_env_thread_io(env); + ll_io_init(io, file, pt->cip_iot); + io->u.ci_rw.rw_iter = pt->cip_iter; + io->u.ci_rw.rw_iocb = pt->cip_iocb; + io->ci_pio = 0; /* It's already in parallel task */ + + rc = cl_io_rw_init(env, io, pt->cip_iot, pos, + pt->cip_count - pt->cip_result); + if (!rc) { + struct vvp_io *vio = vvp_env_io(env); + + vio->vui_io_subtype = IO_NORMAL; + vio->vui_fd = LUSTRE_FPRIVATE(file); + + ll_cl_add(file, env, io, LCC_RW); + rc = cl_io_loop(env, io); + ll_cl_remove(file, env); + } else { + /* cl_io_rw_init() handled IO */ + rc = io->ci_result; + } + + if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) { + if (io->ci_nob > 0) + io->ci_nob /= 2; + rc = -EIO; + } + + if (io->ci_nob > 0) { + pt->cip_result += io->ci_nob; + iov_iter_advance(&pt->cip_iter, io->ci_nob); + pos += io->ci_nob; + pt->cip_iocb.ki_pos = pos; +#ifdef HAVE_KIOCB_KI_LEFT + pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result; +#elif defined(HAVE_KI_NBYTES) + pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result; +#endif + } + + cl_io_fini(env, io); - /* FLR: only use non-delay I/O for read as there is only one - * avaliable mirror for write. */ - io->ci_ndelay = !(iot == CIT_WRITE); + if ((rc == 0 || rc == -ENODATA) && + pt->cip_result < pt->cip_count && + io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + pt->cip_iot == CIT_READ ? "read" : "write", + pos, pos + pt->cip_count - pt->cip_result, + pt->cip_result, rc); + goto restart; + } - ll_io_set_mirror(io, file); + CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + pt->cip_iot == CIT_READ ? "read" : "write", + pt->cip_result, rc); + + cl_env_put(env, &refcheck); + RETURN(pt->cip_result > 0 ? 0 : rc); } static ssize_t @@ -1423,43 +1202,45 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, struct file *file, enum cl_io_type iot, loff_t *ppos, size_t count) { + struct range_lock range; struct vvp_io *vio = vvp_env_io(env); struct inode *inode = file_inode(file); struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct range_lock range; struct cl_io *io; + loff_t pos = *ppos; ssize_t result = 0; int rc = 0; - unsigned retried = 0; - bool restarted = false; ENTRY; - CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", + CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", *ppos, count); + iot == CIT_READ ? 
"read" : "write", pos, pos + count); restart: io = vvp_env_thread_io(env); ll_io_init(io, file, iot); - io->ci_ndelay_tried = retried; + if (args->via_io_subtype == IO_NORMAL) { + io->u.ci_rw.rw_iter = *args->u.normal.via_iter; + io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb; + } else { + io->ci_pio = 0; + } - if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { + if (cl_io_rw_init(env, io, iot, pos, count) == 0) { bool range_locked = false; if (file->f_flags & O_APPEND) range_lock_init(&range, 0, LUSTRE_EOF); else - range_lock_init(&range, *ppos, *ppos + count - 1); + range_lock_init(&range, pos, pos + count - 1); vio->vui_fd = LUSTRE_FPRIVATE(file); vio->vui_io_subtype = args->via_io_subtype; switch (vio->vui_io_subtype) { case IO_NORMAL: - vio->vui_iter = args->u.normal.via_iter; - vio->vui_iocb = args->u.normal.via_iocb; /* Direct IO reads must also take range lock, * or multiple reads will try to work on the same pages * See LU-6227 for details. */ @@ -1485,7 +1266,16 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, } ll_cl_add(file, env, io, LCC_RW); + if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) && + !lli->lli_inode_locked) { + inode_lock(inode); + lli->lli_inode_locked = 1; + } rc = cl_io_loop(env, io); + if (lli->lli_inode_locked) { + lli->lli_inode_locked = 0; + inode_unlock(inode); + } ll_cl_remove(file, env); if (range_locked) { @@ -1501,29 +1291,38 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, if (io->ci_nob > 0) { result += io->ci_nob; count -= io->ci_nob; - *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */ - /* prepare IO restart */ - if (count > 0 && args->via_io_subtype == IO_NORMAL) - args->u.normal.via_iter = vio->vui_iter; + if (args->via_io_subtype == IO_NORMAL) { + iov_iter_advance(args->u.normal.via_iter, io->ci_nob); + + /* CLIO is too complicated. See LU-11069. */ + if (cl_io_is_append(io)) + pos = io->u.ci_rw.rw_iocb.ki_pos; + else + pos += io->ci_nob; + + args->u.normal.via_iocb->ki_pos = pos; + if (io->ci_pio) { +#ifdef HAVE_KIOCB_KI_LEFT + args->u.normal.via_iocb->ki_left = count; +#elif defined(HAVE_KI_NBYTES) + args->u.normal.via_iocb->ki_nbytes = count; +#endif + } + } else { + /* for splice */ + pos = io->u.ci_rw.rw_range.cir_pos; + } } out: cl_io_fini(env, io); - CDEBUG(D_VFSTRACE, - "%s: %d io complete with rc: %d, result: %zd, restart: %d\n", - file->f_path.dentry->d_name.name, - iot, rc, result, io->ci_need_restart); - if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) { CDEBUG(D_VFSTRACE, - "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", - *ppos, count, result, rc); - /* preserve the tried count for FLR */ - retried = io->ci_ndelay_tried; - restarted = true; + "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + pos, pos + count, result, rc); goto restart; } @@ -1547,7 +1346,11 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, } } - CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); + CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc); + + *ppos = pos; RETURN(result > 0 ? result : rc); } @@ -1588,7 +1391,8 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, * \retval - number of bytes have been read, or error code if error occurred. 
*/ static ssize_t -ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) +ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb, + struct iov_iter *iter) { ssize_t result; @@ -1600,7 +1404,9 @@ ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_filp->f_flags & O_DIRECT) return 0; + ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW); result = generic_file_read_iter(iocb, iter); + ll_cl_remove(iocb->ki_filp, env); /* If the first page is not in cache, generic_file_aio_read() will be * returned with -ENODATA. @@ -1622,101 +1428,34 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct lu_env *env; struct vvp_io_args *args; - struct file *file = iocb->ki_filp; ssize_t result; ssize_t rc2; __u16 refcheck; - if (!iov_iter_count(to)) - return 0; - - result = ll_do_fast_read(iocb, to); - if (result < 0 || iov_iter_count(to) == 0) - GOTO(out, result); - env = cl_env_get(&refcheck); if (IS_ERR(env)) return PTR_ERR(env); + result = ll_do_fast_read(env, iocb, to); + if (result < 0 || iov_iter_count(to) == 0) + GOTO(out, result); + args = ll_env_args(env, IO_NORMAL); args->u.normal.via_iter = to; args->u.normal.via_iocb = iocb; - rc2 = ll_file_io_generic(env, args, file, CIT_READ, + rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, &iocb->ki_pos, iov_iter_count(to)); if (rc2 > 0) result += rc2; else if (result == 0) result = rc2; - cl_env_put(env, &refcheck); out: - if (result > 0) - ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, - LUSTRE_FPRIVATE(file), iocb->ki_pos, result, - READ); - + cl_env_put(env, &refcheck); return result; } -/** - * Similar trick to ll_do_fast_read, this improves write speed for tiny writes. - * If a page is already in the page cache and dirty (and some other things - - * See ll_tiny_write_begin for the instantiation of these rules), then we can - * write to it without doing a full I/O, because Lustre already knows about it - * and will write it out. This saves a lot of processing time. - * - * All writes here are within one page, so exclusion is handled by the page - * lock on the vm page. We do not do tiny writes for writes which touch - * multiple pages because it's very unlikely multiple sequential pages are - * are already dirty. - * - * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common - * and are unlikely to be to already dirty pages. - * - * Attribute updates are important here, we do them in ll_tiny_write_end. - */ -static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) -{ - ssize_t count = iov_iter_count(iter); - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - bool lock_inode = !IS_NOSEC(inode); - ssize_t result = 0; - - ENTRY; - - /* Restrict writes to single page and < PAGE_SIZE. See comment at top - * of function for why. - */ - if (count >= PAGE_SIZE || - (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) - RETURN(0); - - if (unlikely(lock_inode)) - inode_lock(inode); - result = __generic_file_write_iter(iocb, iter); - - if (unlikely(lock_inode)) - inode_unlock(inode); - - /* If the page is not already dirty, ll_tiny_write_begin returns - * -ENODATA. We continue on to normal write. 
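Aside, not part of the patch: the removed tiny-write path above only short-circuits writes that are smaller than PAGE_SIZE, stay within a single page, and are not O_DIRECT/O_SYNC/O_APPEND. A sketch of that eligibility test, with all conditions taken from the hunks above; tiny_write_possible() is a hypothetical name:

	/* Sketch only: eligibility test used by the removed tiny-write path. */
	static bool tiny_write_possible(const struct kiocb *iocb, size_t count)
	{
		/* whole-page or larger writes go through the normal path */
		if (count >= PAGE_SIZE)
			return false;

		/* offset within the page plus length must stay inside one page */
		if (((iocb->ki_pos & (PAGE_SIZE - 1)) + count) > PAGE_SIZE)
			return false;

		/* direct, sync and append writes are excluded as well */
		if (iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND))
			return false;

		return true;
	}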
- */ - if (result == -ENODATA) - result = 0; - - if (result > 0) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES, - result); - ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); - } - - CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); - - RETURN(result); -} - /* * Write to a file (through the page cache). */ @@ -1724,30 +1463,9 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct vvp_io_args *args; struct lu_env *env; - ssize_t rc_tiny = 0, rc_normal; - struct file *file = iocb->ki_filp; + ssize_t result; __u16 refcheck; - ENTRY; - - if (!iov_iter_count(from)) - GOTO(out, rc_normal = 0); - - /* NB: we can't do direct IO for tiny writes because they use the page - * cache, we can't do sync writes because tiny writes can't flush - * pages, and we can't do append writes because we can't guarantee the - * required DLM locks are held to protect file size. - */ - if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) && - !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) - rc_tiny = ll_do_tiny_write(iocb, from); - - /* In case of error, go on and try normal write - Only stop if tiny - * write completed I/O. - */ - if (iov_iter_count(from) == 0) - GOTO(out, rc_normal = rc_tiny); - env = cl_env_get(&refcheck); if (IS_ERR(env)) return PTR_ERR(env); @@ -1756,25 +1474,10 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) args->u.normal.via_iter = from; args->u.normal.via_iocb = iocb; - rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE, - &iocb->ki_pos, iov_iter_count(from)); - - /* On success, combine bytes written. */ - if (rc_tiny >= 0 && rc_normal > 0) - rc_normal += rc_tiny; - /* On error, only return error from normal write if tiny write did not - * write any bytes. Otherwise return bytes written by tiny write. 
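Aside, not part of the patch: ll_file_read()/ll_file_write() above drive the *_iter entry points from plain read(2)/write(2) via a synchronous kiocb, and the byte count lands in a different kiocb field depending on kernel version. A sketch of that compat pattern; the ifdefs mirror the diff and kiocb_set_count() is a hypothetical helper:

	/* Sketch only: version-portable "remaining bytes" setup for a kiocb. */
	static void kiocb_set_count(struct kiocb *kiocb, size_t count)
	{
	#ifdef HAVE_KIOCB_KI_LEFT
		kiocb->ki_left = count;
	#elif defined(HAVE_KI_NBYTES)
		kiocb->ki_nbytes = count;
	#endif
		/* on current kernels neither field exists; the iov_iter alone
		 * carries the remaining byte count */
	}

	/* usage, following ll_file_read() above:
	 *	init_sync_kiocb(kiocb, file);
	 *	kiocb->ki_pos = *ppos;
	 *	kiocb_set_count(kiocb, count);
	 *	result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
	 *	*ppos = kiocb->ki_pos;
	 */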
- */ - else if (rc_tiny > 0) - rc_normal = rc_tiny; - + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); cl_env_put(env, &refcheck); -out: - if (rc_normal > 0) - ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, - LUSTRE_FPRIVATE(file), iocb->ki_pos, - rc_normal, WRITE); - RETURN(rc_normal); + return result; } #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER @@ -1821,9 +1524,6 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (result) RETURN(result); - if (!iov_count) - RETURN(0); - # ifdef HAVE_IOV_ITER_INIT_DIRECTION iov_iter_init(&to, READ, iov, nr_segs, iov_count); # else /* !HAVE_IOV_ITER_INIT_DIRECTION */ @@ -1838,26 +1538,30 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + struct lu_env *env; struct iovec iov = { .iov_base = buf, .iov_len = count }; - struct kiocb kiocb; + struct kiocb *kiocb; ssize_t result; - + __u16 refcheck; ENTRY; - if (!count) - RETURN(0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; + kiocb = &ll_env_info(env)->lti_kiocb; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb.ki_left = count; + kiocb->ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb.i_nbytes = count; + kiocb->ki_nbytes = count; #endif - result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - *ppos = kiocb.ki_pos; + result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; + cl_env_put(env, &refcheck); RETURN(result); } @@ -1877,9 +1581,6 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (result) RETURN(result); - if (!iov_count) - RETURN(0); - # ifdef HAVE_IOV_ITER_INIT_DIRECTION iov_iter_init(&from, WRITE, iov, nr_segs, iov_count); # else /* !HAVE_IOV_ITER_INIT_DIRECTION */ @@ -1894,27 +1595,31 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + struct lu_env *env; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - struct kiocb kiocb; - ssize_t result; - - ENTRY; + struct kiocb *kiocb; + ssize_t result; + __u16 refcheck; + ENTRY; - if (!count) - RETURN(0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; + kiocb = &ll_env_info(env)->lti_kiocb; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb.ki_left = count; + kiocb->ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb.ki_nbytes = count; + kiocb->ki_nbytes = count; #endif - result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - *ppos = kiocb.ki_pos; + result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; + cl_env_put(env, &refcheck); RETURN(result); } #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ @@ -1942,11 +1647,6 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); cl_env_put(env, &refcheck); - - if (result > 0) - ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid, - LUSTRE_FPRIVATE(in_file), *ppos, result, - READ); RETURN(result); } @@ -1960,12 +1660,6 @@ int ll_lov_setstripe_ea_info(struct inode *inode, struct 
dentry *dentry, int rc; ENTRY; - if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == - le32_to_cpu(LOV_MAGIC_MAGIC)) { - /* this code will only exist for big-endian systems */ - lustre_swab_lov_user_md(lum, 0); - } - ll_inode_size_lock(inode); rc = ll_intent_file_open(dentry, lum, lum_size, &oit); if (rc < 0) @@ -2028,14 +1722,13 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1)) GOTO(out, rc = -EPROTO); - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. - */ - if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) == - __swab32(LOV_MAGIC_MAGIC)) { - int stripe_count = 0; + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + int stripe_count; if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { @@ -2045,19 +1738,27 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, stripe_count = 0; } - lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); - - /* if function called for directory - we should - * avoid swab not existent lsm objects */ - if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v1 *)lmm)->lmm_objects, - stripe_count); - else if (lmm->lmm_magic == LOV_MAGIC_V3 && - S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v3 *)lmm)->lmm_objects, - stripe_count); + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { + lustre_swab_lov_user_md_v1( + (struct lov_user_md_v1 *)lmm); + if (S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lmm); + if (S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); + } else if (lmm->lmm_magic == + cpu_to_le32(LOV_MAGIC_COMP_V1)) { + lustre_swab_lov_comp_md_v1( + (struct lov_comp_md_v1 *)lmm); + } } out: @@ -2144,7 +1845,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, cl_lov_delay_create_clear(&file->f_flags); out: - OBD_FREE_LARGE(klum, lum_size); + OBD_FREE(klum, lum_size); RETURN(rc); } @@ -2187,10 +1888,6 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) struct cl_layout cl = { .cl_is_composite = false, }; - struct lu_extent ext = { - .e_start = 0, - .e_end = OBD_OBJECT_EOF, - }; env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -2198,8 +1895,7 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) rc = cl_object_layout_get(env, obj, &cl); if (!rc && cl.cl_is_composite) - rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, - &ext); + rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF); cl_env_put(env, &refcheck); if (rc) @@ -2293,9 +1989,7 @@ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) if (!och) GOTO(out, rc = -ENOMEM); - rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); - if (rc) - GOTO(out, rc); + ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); rc = ll_close_inode_openhandle(inode, och, 0, NULL); out: @@ -2411,8 
+2105,18 @@ int ll_fid2path(struct inode *inode, void __user *arg) RETURN(rc); } -static int -ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) { struct cl_object *obj = ll_i2info(inode)->lli_clob; struct lu_env *env; @@ -2422,12 +2126,11 @@ ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) ENTRY; - ioc->idv_version = 0; - ioc->idv_layout_version = UINT_MAX; - /* If no file object initialized, we consider its version is 0. */ - if (obj == NULL) + if (obj == NULL) { + *data_version = 0; RETURN(0); + } env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -2436,8 +2139,7 @@ ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) io = vvp_env_thread_io(env); io->ci_obj = obj; io->u.ci_data_version.dv_data_version = 0; - io->u.ci_data_version.dv_layout_version = UINT_MAX; - io->u.ci_data_version.dv_flags = ioc->idv_flags; + io->u.ci_data_version.dv_flags = flags; restart: if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) @@ -2445,8 +2147,7 @@ ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) else result = io->ci_result; - ioc->idv_version = io->u.ci_data_version.dv_data_version; - ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version; + *data_version = io->u.ci_data_version.dv_data_version; cl_io_fini(env, io); @@ -2458,29 +2159,6 @@ ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) RETURN(result); } -/* - * Read the data_version for inode. - * - * This value is computed using stripe object version on OST. - * Version is computed using server side locking. - * - * @param flags if do sync on the OST side; - * 0: no sync - * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs - * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs - */ -int ll_data_version(struct inode *inode, __u64 *data_version, int flags) -{ - struct ioc_data_version ioc = { .idv_flags = flags }; - int rc; - - rc = ll_ioc_data_version(inode, &ioc); - if (!rc) - *data_version = ioc.idv_version; - - return rc; -} - /* * Trigger a HSM release request for the provided inode. */ @@ -2510,15 +2188,9 @@ int ll_hsm_release(struct inode *inode) if (IS_ERR(env)) GOTO(out, rc = PTR_ERR(env)); - rc = ll_merge_attr(env, inode); + ll_merge_attr(env, inode); cl_env_put(env, &refcheck); - /* If error happen, we have the wrong size for a file. - * Don't release it. - */ - if (rc != 0) - GOTO(out, rc); - /* Release the file. * NB: lease lock handle is released in mdc_hsm_release_pack() because * we still need it to pack l_remote_handle to MDT. 
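Aside, not part of the patch: the ll_data_version() interface added above returns an OST-side data version whose flags are documented in the hunk (0 = no sync, LL_DV_RD_FLUSH = flush dirty pages, LL_DV_WR_FLUSH = drop cached pages). A sketch of the usual before/after comparison a caller makes; sample_file_changed() is a hypothetical wrapper:

	/* Sketch only: detecting intervening writes via data versions. */
	static bool sample_file_changed(struct inode *inode)
	{
		__u64 dv1 = 0, dv2 = 0;

		/* LL_DV_RD_FLUSH: flush dirty pages so the version covers them */
		if (ll_data_version(inode, &dv1, LL_DV_RD_FLUSH))
			return true;	/* be conservative on error */

		/* ... caller does its work here ... */

		if (ll_data_version(inode, &dv2, LL_DV_RD_FLUSH))
			return true;

		/* an unchanged version means no writes reached the OSTs meanwhile */
		return dv1 != dv2;
	}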
*/ @@ -2651,9 +2323,8 @@ static int ll_swap_layouts(struct file *file1, struct file *file2, int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) { - struct obd_export *exp = ll_i2mdexp(inode); - struct md_op_data *op_data; - int rc; + struct md_op_data *op_data; + int rc; ENTRY; /* Detect out-of range masks */ @@ -2666,20 +2337,18 @@ int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) !cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - if (!exp_connect_archive_id_array(exp)) { - /* Detect out-of range archive id */ - if ((hss->hss_valid & HSS_ARCHIVE_ID) && - (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE)) - RETURN(-EINVAL); - } + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE)) + RETURN(-EINVAL); op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, LUSTRE_OPC_ANY, hss); if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); - rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data), - op_data, NULL); + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), + sizeof(*op_data), op_data, NULL); ll_finish_md_op_data(op_data); @@ -2730,7 +2399,7 @@ static int ll_hsm_import(struct inode *inode, struct file *file, inode_lock(inode); - rc = ll_setattr_raw(file_dentry(file), attr, 0, true); + rc = ll_setattr_raw(file_dentry(file), attr, true); if (rc == -ENODATA) rc = 0; @@ -2758,7 +2427,7 @@ static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME, + ATTR_CTIME | ATTR_CTIME_SET, .ia_atime = { .tv_sec = lfu->lfu_atime_sec, .tv_nsec = lfu->lfu_atime_nsec, @@ -2782,197 +2451,12 @@ static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) RETURN(-EINVAL); inode_lock(inode); - rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET, - false); + rc = ll_setattr_raw(file_dentry(file), &ia, false); inode_unlock(inode); RETURN(rc); } -static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode) -{ - switch (mode) { - case MODE_READ_USER: - return CLM_READ; - case MODE_WRITE_USER: - return CLM_WRITE; - default: - return -EINVAL; - } -} - -static const char *const user_lockname[] = LOCK_MODE_NAMES; - -/* Used to allow the upper layers of the client to request an LDLM lock - * without doing an actual read or write. - * - * Used for ladvise lockahead to manually request specific locks. - * - * \param[in] file file this ladvise lock request is on - * \param[in] ladvise ladvise struct describing this lock request - * - * \retval 0 success, no detailed result available (sync requests - * and requests sent to the server [not handled locally] - * cannot return detailed results) - * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request, - * see definitions for details. 
- * \retval negative negative errno on error - */ -int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise) -{ - struct lu_env *env = NULL; - struct cl_io *io = NULL; - struct cl_lock *lock = NULL; - struct cl_lock_descr *descr = NULL; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - enum cl_lock_mode cl_mode; - off_t start = ladvise->lla_start; - off_t end = ladvise->lla_end; - int result; - __u16 refcheck; - - ENTRY; - - CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s " - "start=%llu, end=%llu\n", dentry->d_name.len, - dentry->d_name.name, dentry->d_inode, - user_lockname[ladvise->lla_lockahead_mode], (__u64) start, - (__u64) end); - - cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode); - if (cl_mode < 0) - GOTO(out, result = cl_mode); - - /* Get IO environment */ - result = cl_io_get(inode, &env, &io, &refcheck); - if (result <= 0) - GOTO(out, result); - - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result > 0) { - /* - * nothing to do for this io. This currently happens when - * stripe sub-object's are not yet created. - */ - result = io->ci_result; - } else if (result == 0) { - lock = vvp_env_lock(env); - descr = &lock->cll_descr; - - descr->cld_obj = io->ci_obj; - /* Convert byte offsets to pages */ - descr->cld_start = cl_index(io->ci_obj, start); - descr->cld_end = cl_index(io->ci_obj, end); - descr->cld_mode = cl_mode; - /* CEF_MUST is used because we do not want to convert a - * lockahead request to a lockless lock */ - descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND | - CEF_NONBLOCK; - - if (ladvise->lla_peradvice_flags & LF_ASYNC) - descr->cld_enq_flags |= CEF_SPECULATIVE; - - result = cl_lock_request(env, io, lock); - - /* On success, we need to release the lock */ - if (result >= 0) - cl_lock_release(env, lock); - } - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - - /* -ECANCELED indicates a matching lock with a different extent - * was already present, and -EEXIST indicates a matching lock - * on exactly the same extent was already present. - * We convert them to positive values for userspace to make - * recognizing true errors easier. - * Note we can only return these detailed results on async requests, - * as sync requests look the same as i/o requests for locking. 
*/ - if (result == -ECANCELED) - result = LLA_RESULT_DIFFERENT; - else if (result == -EEXIST) - result = LLA_RESULT_SAME; - -out: - RETURN(result); -} -static const char *const ladvise_names[] = LU_LADVISE_NAMES; - -static int ll_ladvise_sanity(struct inode *inode, - struct llapi_lu_ladvise *ladvise) -{ - enum lu_ladvise_type advice = ladvise->lla_advice; - /* Note the peradvice flags is a 32 bit field, so per advice flags must - * be in the first 32 bits of enum ladvise_flags */ - __u32 flags = ladvise->lla_peradvice_flags; - /* 3 lines at 80 characters per line, should be plenty */ - int rc = 0; - - if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) { - rc = -EINVAL; - CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized," - "last supported advice is %s (value '%d'): rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), advice, - ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc); - GOTO(out, rc); - } - - /* Per-advice checks */ - switch (advice) { - case LU_LADVISE_LOCKNOEXPAND: - if (flags & ~LF_LOCKNOEXPAND_MASK) { - rc = -EINVAL; - CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " - "rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), flags, - ladvise_names[advice], rc); - GOTO(out, rc); - } - break; - case LU_LADVISE_LOCKAHEAD: - /* Currently only READ and WRITE modes can be requested */ - if (ladvise->lla_lockahead_mode >= MODE_MAX_USER || - ladvise->lla_lockahead_mode == 0) { - rc = -EINVAL; - CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: " - "rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - ladvise->lla_lockahead_mode, - ladvise_names[advice], rc); - GOTO(out, rc); - } - /* fallthrough */ - case LU_LADVISE_WILLREAD: - case LU_LADVISE_DONTNEED: - default: - /* Note fall through above - These checks apply to all advices - * except LOCKNOEXPAND */ - if (flags & ~LF_DEFAULT_MASK) { - rc = -EINVAL; - CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " - "rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), flags, - ladvise_names[advice], rc); - GOTO(out, rc); - } - if (ladvise->lla_start >= ladvise->lla_end) { - rc = -EINVAL; - CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) " - "for %s: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - ladvise->lla_start, ladvise->lla_end, - ladvise_names[advice], rc); - GOTO(out, rc); - } - break; - } - -out: - return rc; -} -#undef ERRSIZE - /* * Give file access advices * @@ -3022,15 +2506,6 @@ static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, RETURN(rc); } -static int ll_lock_noexpand(struct file *file, int flags) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - - fd->ll_lock_no_expand = !(flags & LF_UNSET); - - return 0; -} - int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg) { @@ -3041,287 +2516,64 @@ int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, sizeof(fsxattr))) RETURN(-EFAULT); - fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags); - if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) - fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags); fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; - if (copy_to_user((struct fsxattr __user *)arg, - &fsxattr, sizeof(fsxattr))) - RETURN(-EFAULT); - - RETURN(0); -} - -int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa) -{ - /* - * Project Quota ID state is only allowed to change from within the init - * namespace. Enforce that restriction only if we are trying to change - * the quota ID state. 
Everything else is allowed in user namespaces. - */ - if (current_user_ns() == &init_user_ns) - return 0; - - if (ll_i2info(inode)->lli_projid != fa->fsx_projid) - return -EINVAL; - - if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) { - if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) - return -EINVAL; - } else { - if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) - return -EINVAL; - } - - return 0; -} - -int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, - unsigned long arg) -{ - - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; - struct fsxattr fsxattr; - struct cl_object *obj; - struct iattr *attr; - int flags; - - if (copy_from_user(&fsxattr, - (const struct fsxattr __user *)arg, - sizeof(fsxattr))) - RETURN(-EFAULT); - - rc = ll_ioctl_check_project(inode, &fsxattr); - if (rc) - RETURN(rc); - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); - - flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags); - op_data->op_attr_flags = ll_inode_to_ext_flags(flags); - if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT) - op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; - op_data->op_projid = fsxattr.fsx_projid; - op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; - rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, - 0, &req); - ptlrpc_req_finished(req); - if (rc) - GOTO(out_fsxattr, rc); - ll_update_inode_flags(inode, op_data->op_attr_flags); - obj = ll_i2info(inode)->lli_clob; - if (obj == NULL) - GOTO(out_fsxattr, rc); - - OBD_ALLOC_PTR(attr); - if (attr == NULL) - GOTO(out_fsxattr, rc = -ENOMEM); - - rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, - fsxattr.fsx_xflags); - OBD_FREE_PTR(attr); -out_fsxattr: - ll_finish_md_op_data(op_data); - RETURN(rc); -} - -static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc, - unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle *och = NULL; - struct split_param sp; - bool lease_broken; - fmode_t fmode = 0; - enum mds_op_bias bias = 0; - struct file *layout_file = NULL; - void *data = NULL; - size_t data_size = 0; - long rc; - ENTRY; - - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och != NULL) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - - if (och == NULL) - GOTO(out, rc = -ENOLCK); - - fmode = och->och_flags; - - switch (ioc->lil_flags) { - case LL_LEASE_RESYNC_DONE: - if (ioc->lil_count > IOC_IDS_MAX) - GOTO(out, rc = -EINVAL); - - data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]); - OBD_ALLOC(data, data_size); - if (!data) - GOTO(out, rc = -ENOMEM); - - if (copy_from_user(data, (void __user *)arg, data_size)) - GOTO(out, rc = -EFAULT); - - bias = MDS_CLOSE_RESYNC_DONE; - break; - case LL_LEASE_LAYOUT_MERGE: { - int fd; - - if (ioc->lil_count != 1) - GOTO(out, rc = -EINVAL); - - arg += sizeof(*ioc); - if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32))) - GOTO(out, rc = -EFAULT); - - layout_file = fget(fd); - if (!layout_file) - GOTO(out, rc = -EBADF); - - if ((file->f_flags & O_ACCMODE) == O_RDONLY || - (layout_file->f_flags & O_ACCMODE) == O_RDONLY) - GOTO(out, rc = -EPERM); - - data = file_inode(layout_file); - bias = MDS_CLOSE_LAYOUT_MERGE; - break; - } - case LL_LEASE_LAYOUT_SPLIT: { - int fdv; - int mirror_id; - - if (ioc->lil_count != 2) - GOTO(out, rc = -EINVAL); - - arg 
+= sizeof(*ioc); - if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32))) - GOTO(out, rc = -EFAULT); - - arg += sizeof(__u32); - if (copy_from_user(&mirror_id, (void __user *)arg, - sizeof(__u32))) - GOTO(out, rc = -EFAULT); - - layout_file = fget(fdv); - if (!layout_file) - GOTO(out, rc = -EBADF); - - sp.sp_inode = file_inode(layout_file); - sp.sp_mirror_id = (__u16)mirror_id; - data = &sp; - bias = MDS_CLOSE_LAYOUT_SPLIT; - break; - } - default: - /* without close intent */ - break; - } - - rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data); - if (rc < 0) - GOTO(out, rc); - - rc = ll_lease_och_release(inode, file); - if (rc < 0) - GOTO(out, rc); - - if (lease_broken) - fmode = 0; - EXIT; - -out: - switch (ioc->lil_flags) { - case LL_LEASE_RESYNC_DONE: - if (data) - OBD_FREE(data, data_size); - break; - case LL_LEASE_LAYOUT_MERGE: - case LL_LEASE_LAYOUT_SPLIT: - if (layout_file) - fput(layout_file); - break; - } + if (copy_to_user((struct fsxattr __user *)arg, + &fsxattr, sizeof(fsxattr))) + RETURN(-EFAULT); - if (!rc) - rc = ll_lease_type_from_fmode(fmode); - RETURN(rc); + RETURN(0); } -static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc, - unsigned long arg) +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) { - struct inode *inode = file_inode(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct obd_client_handle *och = NULL; - __u64 open_flags = 0; - bool lease_broken; - fmode_t fmode; - long rc; - ENTRY; - switch (ioc->lil_mode) { - case LL_LEASE_WRLCK: - if (!(file->f_mode & FMODE_WRITE)) - RETURN(-EPERM); - fmode = FMODE_WRITE; - break; - case LL_LEASE_RDLCK: - if (!(file->f_mode & FMODE_READ)) - RETURN(-EPERM); - fmode = FMODE_READ; - break; - case LL_LEASE_UNLCK: - RETURN(ll_file_unlock_lease(file, ioc, arg)); - default: - RETURN(-EINVAL); - } + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct fsxattr fsxattr; + struct cl_object *obj; - CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + /* only root could change project ID */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); - /* apply for lease */ - if (ioc->lil_flags & LL_LEASE_RESYNC) - open_flags = MDS_OPEN_RESYNC; - och = ll_lease_open(inode, file, fmode, open_flags); - if (IS_ERR(och)) - RETURN(PTR_ERR(och)); + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); - if (ioc->lil_flags & LL_LEASE_RESYNC) { - rc = ll_lease_file_resync(och, inode, arg); - if (rc) { - ll_lease_close(och, inode, NULL); - RETURN(rc); - } - rc = ll_layout_refresh(inode, &fd->fd_layout_version); - if (rc) { - ll_lease_close(och, inode, NULL); - RETURN(rc); - } - } + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + GOTO(out_fsxattr1, rc = -EFAULT); - rc = 0; - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och == NULL) { - fd->fd_lease_och = och; - och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (och != NULL) { - /* impossible now that only excl is supported for now */ - ll_lease_close(och, inode, &lease_broken); - rc = -EBUSY; + op_data->op_attr_flags = fsxattr.fsx_xflags; + op_data->op_projid = fsxattr.fsx_projid; + op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG); + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, + 0, &req); + ptlrpc_req_finished(req); + + obj = ll_i2info(inode)->lli_clob; + if (obj) { 
+ struct iattr *attr; + + inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags); + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out_fsxattr1, rc = -ENOMEM); + attr->ia_valid = ATTR_ATTR_FLAG; + rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags); + + OBD_FREE_PTR(attr); } +out_fsxattr1: + ll_finish_md_op_data(op_data); RETURN(rc); + + } static long @@ -3334,15 +2586,15 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", PFID(ll_inode2fid(inode)), inode, cmd); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - RETURN(-ENOTTY); + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); - switch (cmd) { - case LL_IOC_GETFLAGS: - /* Get the current value of the file flags */ + switch(cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ return put_user(fd->fd_flags, (int __user *)arg); case LL_IOC_SETFLAGS: case LL_IOC_CLRFLAGS: @@ -3395,6 +2647,9 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ll_inode_info *lli; struct obd_client_handle *och = NULL; + if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) + GOTO(out, rc = -EINVAL); + lli = ll_i2info(inode); mutex_lock(&lli->lli_och_mutex); if (fd->fd_lease_och != NULL) { @@ -3416,18 +2671,12 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_LOV_GETSTRIPE: case LL_IOC_LOV_GETSTRIPE_NEW: RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); - case FS_IOC_GETFLAGS: - case FS_IOC_SETFLAGS: - RETURN(ll_iocontrol(inode, file, cmd, arg)); - case FSFILT_IOC_GETVERSION: - case FS_IOC_GETVERSION: + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: RETURN(put_user(inode->i_generation, (int __user *)arg)); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. */ - case FS_IOC_SETVERSION: - RETURN(-ENOTSUPP); - case LL_IOC_GROUP_LOCK: RETURN(ll_get_grouplock(inode, file, arg)); case LL_IOC_GROUP_UNLOCK: @@ -3435,6 +2684,12 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case IOC_OBD_STATFS: RETURN(ll_obd_statfs(inode, (void __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
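Aside, not part of the patch: the FSGETXATTR/FSSETXATTR handlers above are a plain user/kernel round trip of struct fsxattr before the project ID and flags are validated and applied. A sketch of that round trip with the stock uaccess helpers; the helper names are hypothetical:

	#include <linux/fs.h>
	#include <linux/uaccess.h>

	/* Sketch only: copy the fsxattr request in from userspace. */
	static int fetch_fsxattr(unsigned long arg, struct fsxattr *fa)
	{
		if (copy_from_user(fa, (const struct fsxattr __user *)arg,
				   sizeof(*fa)))
			return -EFAULT;
		/* fa->fsx_projid / fa->fsx_xflags can now be validated */
		return 0;
	}

	/* Sketch only: publish the current attributes back to userspace. */
	static int publish_fsxattr(unsigned long arg, const struct fsxattr *fa)
	{
		if (copy_to_user((struct fsxattr __user *)arg, fa, sizeof(*fa)))
			return -EFAULT;
		return 0;
	}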
+ case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ case LL_IOC_FLUSHCTX: RETURN(ll_flush_ctx(inode)); case LL_IOC_PATH2FID: { @@ -3457,7 +2712,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(-EFAULT); idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; - rc = ll_ioc_data_version(inode, &idv); + rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags); if (rc == 0 && copy_to_user((char __user *)arg, &idv, sizeof(idv))) @@ -3551,18 +2806,71 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE_PTR(hca); RETURN(rc); } - case LL_IOC_SET_LEASE_OLD: { - struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg }; - - RETURN(ll_file_set_lease(file, &ioc, 0)); - } case LL_IOC_SET_LEASE: { - struct ll_ioc_lease ioc; + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + bool lease_broken; + fmode_t fmode; - if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc))) - RETURN(-EFAULT); + switch (arg) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + RETURN(-ENOLCK); + + fmode = och->och_flags; + rc = ll_lease_close(och, inode, &lease_broken); + if (rc < 0) + RETURN(rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + RETURN(rc); - RETURN(ll_file_set_lease(file, &ioc, arg)); + if (lease_broken) + fmode = 0; + + RETURN(ll_lease_type_from_fmode(fmode)); + default: + RETURN(-EINVAL); + } + + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + + /* apply for lease */ + och = ll_lease_open(inode, file, fmode, 0); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); + + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; + } + RETURN(rc); } case LL_IOC_GET_LEASE: { struct ll_inode_info *lli = ll_i2info(inode); @@ -3615,92 +2923,55 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(ll_file_futimes_3(file, &lfu)); } case LL_IOC_LADVISE: { - struct llapi_ladvise_hdr *k_ladvise_hdr; - struct llapi_ladvise_hdr __user *u_ladvise_hdr; + struct llapi_ladvise_hdr *ladvise_hdr; int i; int num_advise; - int alloc_size = sizeof(*k_ladvise_hdr); + int alloc_size = sizeof(*ladvise_hdr); rc = 0; - u_ladvise_hdr = (void __user *)arg; - OBD_ALLOC_PTR(k_ladvise_hdr); - if (k_ladvise_hdr == NULL) + OBD_ALLOC_PTR(ladvise_hdr); + if (ladvise_hdr == NULL) RETURN(-ENOMEM); - if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + if (copy_from_user(ladvise_hdr, + (const struct llapi_ladvise_hdr __user *)arg, + alloc_size)) GOTO(out_ladvise, rc = -EFAULT); - if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC || - k_ladvise_hdr->lah_count < 1) + if (ladvise_hdr->lah_magic != LADVISE_MAGIC || + ladvise_hdr->lah_count < 1) GOTO(out_ladvise, rc = -EINVAL); - num_advise = k_ladvise_hdr->lah_count; + num_advise = ladvise_hdr->lah_count; if (num_advise >= LAH_COUNT_MAX) GOTO(out_ladvise, rc = -EFBIG); - OBD_FREE_PTR(k_ladvise_hdr); - alloc_size = offsetof(typeof(*k_ladvise_hdr), + 
OBD_FREE_PTR(ladvise_hdr); + alloc_size = offsetof(typeof(*ladvise_hdr), lah_advise[num_advise]); - OBD_ALLOC(k_ladvise_hdr, alloc_size); - if (k_ladvise_hdr == NULL) + OBD_ALLOC(ladvise_hdr, alloc_size); + if (ladvise_hdr == NULL) RETURN(-ENOMEM); /* * TODO: submit multiple advices to one server in a single RPC */ - if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) + if (copy_from_user(ladvise_hdr, + (const struct llapi_ladvise_hdr __user *)arg, + alloc_size)) GOTO(out_ladvise, rc = -EFAULT); for (i = 0; i < num_advise; i++) { - struct llapi_lu_ladvise *k_ladvise = - &k_ladvise_hdr->lah_advise[i]; - struct llapi_lu_ladvise __user *u_ladvise = - &u_ladvise_hdr->lah_advise[i]; - - rc = ll_ladvise_sanity(inode, k_ladvise); + rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags, + &ladvise_hdr->lah_advise[i]); if (rc) - GOTO(out_ladvise, rc); - - switch (k_ladvise->lla_advice) { - case LU_LADVISE_LOCKNOEXPAND: - rc = ll_lock_noexpand(file, - k_ladvise->lla_peradvice_flags); - GOTO(out_ladvise, rc); - case LU_LADVISE_LOCKAHEAD: - - rc = ll_file_lock_ahead(file, k_ladvise); - - if (rc < 0) - GOTO(out_ladvise, rc); - - if (put_user(rc, - &u_ladvise->lla_lockahead_result)) - GOTO(out_ladvise, rc = -EFAULT); - break; - default: - rc = ll_ladvise(inode, file, - k_ladvise_hdr->lah_flags, - k_ladvise); - if (rc) - GOTO(out_ladvise, rc); break; - } - } out_ladvise: - OBD_FREE(k_ladvise_hdr, alloc_size); + OBD_FREE(ladvise_hdr, alloc_size); RETURN(rc); } - case LL_IOC_FLR_SET_MIRROR: { - /* mirror I/O must be direct to avoid polluting page cache - * by stale data. */ - if (!(file->f_flags & O_DIRECT)) - RETURN(-EINVAL); - - fd->fd_designated_mirror = (__u32)arg; - RETURN(0); - } case LL_IOC_FSGETXATTR: RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); case LL_IOC_FSSETXATTR: @@ -3889,6 +3160,7 @@ int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); + bool lock_inode; #elif defined(HAVE_FILE_FSYNC_2ARGS) int ll_fsync(struct file *file, int datasync) { @@ -3913,7 +3185,9 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) #ifdef HAVE_FILE_FSYNC_4ARGS rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - inode_lock(inode); + lock_inode = !lli->lli_inode_locked; + if (lock_inode) + inode_lock(inode); #else /* fsync's caller has already called _fdata{sync,write}, we want * that IO to finish before calling the osc and mdc sync methods */ @@ -3953,7 +3227,8 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) } #ifdef HAVE_FILE_FSYNC_4ARGS - inode_unlock(inode); + if (lock_inode) + inode_unlock(inode); #endif RETURN(rc); } @@ -4137,61 +3412,48 @@ int ll_get_fid_by_name(struct inode *parent, const char *name, RETURN(rc); } -int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, - const char *name) +int ll_migrate(struct inode *parent, struct file *file, int mdtidx, + const char *name, int namelen) { - struct dentry *dchild = NULL; - struct inode *child_inode = NULL; - struct md_op_data *op_data; + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; struct ptlrpc_request *request = NULL; struct obd_client_handle *och = NULL; - struct qstr qstr; - struct mdt_body *body; - __u64 data_version = 0; - size_t namelen = strlen(name); - int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic); - int rc; + struct qstr qstr; + struct mdt_body *body; + 
int rc; + __u64 data_version = 0; ENTRY; - CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n", - PFID(ll_inode2fid(parent)), name, - lum->lum_stripe_offset, lum->lum_stripe_count); + CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n", + name, PFID(ll_inode2fid(parent)), mdtidx); - if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && - lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) - lustre_swab_lmv_user_md(lum); + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); /* Get child FID first */ qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); qstr.name = name; qstr.len = namelen; dchild = d_lookup(file_dentry(file), &qstr); - if (dchild) { - if (dchild->d_inode) + if (dchild != NULL) { + if (dchild->d_inode != NULL) child_inode = igrab(dchild->d_inode); dput(dchild); } - if (!child_inode) { - rc = ll_get_fid_by_name(parent, name, namelen, NULL, - &child_inode); - if (rc) - RETURN(rc); + if (child_inode == NULL) { + rc = ll_get_fid_by_name(parent, name, namelen, + &op_data->op_fid3, &child_inode); + if (rc != 0) + GOTO(out_free, rc); } - if (!child_inode) - RETURN(-ENOENT); - - if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & - OBD_CONNECT2_DIR_MIGRATE)) { - if (le32_to_cpu(lum->lum_stripe_count) > 1 || - ll_i2info(child_inode)->lli_lsm_md) { - CERROR("%s: MDT doesn't support stripe directory " - "migration!\n", - ll_get_fsname(parent->i_sb, NULL, 0)); - GOTO(out_iput, rc = -EOPNOTSUPP); - } - } + if (child_inode == NULL) + GOTO(out_free, rc = -EINVAL); /* * lfs migrate command needs to be blocked on the client @@ -4201,11 +3463,6 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, if (child_inode == parent->i_sb->s_root->d_inode) GOTO(out_iput, rc = -EINVAL); - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, - child_inode->i_mode, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - GOTO(out_iput, rc = PTR_ERR(op_data)); - inode_lock(child_inode); op_data->op_fid3 = *ll_inode2fid(child_inode); if (!fid_is_sane(&op_data->op_fid3)) { @@ -4215,10 +3472,15 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, GOTO(out_unlock, rc = -EINVAL); } - op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA; - op_data->op_data = lum; - op_data->op_data_size = lumlen; + rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); + if (rc < 0) + GOTO(out_unlock, rc); + if (rc == mdtidx) { + CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name, + PFID(&op_data->op_fid3), mdtidx); + GOTO(out_unlock, rc = 0); + } again: if (S_ISREG(child_inode->i_mode)) { och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); @@ -4233,18 +3495,17 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, if (rc != 0) GOTO(out_close, rc); - op_data->op_open_handle = och->och_open_handle; + op_data->op_handle = och->och_fh; + op_data->op_data = och->och_mod; op_data->op_data_version = data_version; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_bias |= MDS_CLOSE_MIGRATE; - - spin_lock(&och->och_mod->mod_open_req->rq_lock); - och->och_mod->mod_open_req->rq_replay = 0; - spin_unlock(&och->och_mod->mod_open_req->rq_lock); + op_data->op_bias |= MDS_RENAME_MIGRATE; } - rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen, - name, namelen, &request); + op_data->op_mds = mdtidx; + op_data->op_cli_flags = CLI_MIGRATE; + rc = 
md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, + namelen, name, namelen, &request); if (rc == 0) { LASSERT(request != NULL); ll_update_times(request, parent); @@ -4254,11 +3515,12 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, /* If the server does release layout lock, then we cleanup * the client och here, otherwise release it in out_close: */ - if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + if (och != NULL && + body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { obd_mod_put(och->och_mod); md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, och); - och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; + och->och_fh.cookie = DEAD_HANDLE_MAGIC; OBD_FREE_PTR(och); och = NULL; } @@ -4274,15 +3536,16 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, goto again; out_close: - if (och) + if (och != NULL) /* close the file */ ll_lease_close(och, child_inode, NULL); - if (!rc) + if (rc == 0) clear_nlink(child_inode); out_unlock: inode_unlock(child_inode); - ll_finish_md_op_data(op_data); out_iput: iput(child_inode); +out_free: + ll_finish_md_op_data(op_data); RETURN(rc); } @@ -4323,7 +3586,7 @@ int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode) ldlm_lockname[mode]); flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; - for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) { + for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { policy.l_inodebits.bits = *bits & (1 << i); if (policy.l_inodebits.bits == 0) continue; @@ -4390,81 +3653,105 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) return rc; } -static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op) +static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) { - struct inode *parent; - struct inode *inode = dentry->d_inode; - struct obd_export *exp = ll_i2mdexp(inode); - struct lookup_intent oit = { - .it_op = op, - }; - struct ptlrpc_request *req = NULL; - struct md_op_data *op_data; - const char *name = NULL; - size_t namelen = 0; - int rc = 0; - ENTRY; + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *req = NULL; + struct obd_export *exp; + int rc = 0; + ENTRY; + + LASSERT(inode != NULL); CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); - if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) { - parent = dentry->d_parent->d_inode; - name = dentry->d_name.name; - namelen = dentry->d_name.len; - } else { - parent = inode; - } + exp = ll_i2mdexp(inode); + + /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. + * But under CMD case, it caused some lock issues, should be fixed + * with new CMD ibits lock. See bug 12718 */ + if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { + struct lookup_intent oit = { .it_op = IT_GETATTR }; + struct md_op_data *op_data; + + if (ibits == MDS_INODELOCK_LOOKUP) + oit.it_op = IT_LOOKUP; + + /* Call getattr by fid, so do not provide name at all. 
*/ + op_data = ll_prep_md_op_data(NULL, dentry->d_inode, + dentry->d_inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_intent_lock(exp, op_data, &oit, &req, + &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO (out, rc); + } - op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } - /* Call getattr by fid */ - if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) - op_data->op_flags = MF_GETATTR_BY_FID; - rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc < 0) { - rc = ll_inode_revalidate_fini(inode, rc); - GOTO(out, rc); - } + /* Unlinked? Unhash dentry, so it is not picked up later by + do_lookup() -> ll_revalidate_it(). We cannot use d_drop + here to preserve get_cwd functionality on 2.6. + Bug 10503 */ + if (!dentry->d_inode->i_nlink) { + ll_lock_dcache(inode); + d_lustre_invalidate(dentry, 0); + ll_unlock_dcache(inode); + } - rc = ll_revalidate_it_finish(req, &oit, dentry); - if (rc != 0) { - ll_intent_release(&oit); - GOTO(out, rc); - } + ll_lookup_finish_locks(&oit, dentry); + } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) { + struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + u64 valid = OBD_MD_FLGETATTR; + struct md_op_data *op_data; + int ealen = 0; - /* Unlinked? Unhash dentry, so it is not picked up later by - * do_lookup() -> ll_revalidate_it(). We cannot use d_drop - * here to preserve get_cwd functionality on 2.6. - * Bug 10503 */ - if (!dentry->d_inode->i_nlink) { - ll_lock_dcache(inode); - d_lustre_invalidate(dentry, 0); - ll_unlock_dcache(inode); - } + if (S_ISREG(inode->i_mode)) { + rc = ll_get_default_mdsize(sbi, &ealen); + if (rc) + RETURN(rc); + valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; + } - ll_lookup_finish_locks(&oit, dentry); -out: - ptlrpc_req_finished(req); + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, ealen, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); - return rc; + op_data->op_valid = valid; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + rc = ll_inode_revalidate_fini(inode, rc); + RETURN(rc); + } + + rc = ll_prep_inode(&inode, req, NULL, NULL); + } +out: + ptlrpc_req_finished(req); + return rc; } static int ll_merge_md_attr(struct inode *inode) { - struct ll_inode_info *lli = ll_i2info(inode); struct cl_attr attr = { 0 }; int rc; - LASSERT(lli->lli_lsm_md != NULL); - down_read(&lli->lli_lsm_sem); + LASSERT(ll_i2info(inode)->lli_lsm_md != NULL); rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, &attr, ll_md_blocking_ast); - up_read(&lli->lli_lsm_sem); if (rc != 0) RETURN(rc); @@ -4479,6 +3766,43 @@ static int ll_merge_md_attr(struct inode *inode) RETURN(0); } +static int +ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +{ + struct inode *inode = dentry->d_inode; + int rc; + ENTRY; + + rc = __ll_inode_revalidate(dentry, ibits); + if (rc != 0) + RETURN(rc); + + /* if object isn't regular file, don't validate size */ + if (!S_ISREG(inode->i_mode)) { + if (S_ISDIR(inode->i_mode) && + ll_i2info(inode)->lli_lsm_md != NULL) { + rc = ll_merge_md_attr(inode); + if (rc != 0) + RETURN(rc); + } + + inode->i_atime.tv_sec = 
ll_i2info(inode)->lli_atime; + inode->i_mtime.tv_sec = ll_i2info(inode)->lli_mtime; + inode->i_ctime.tv_sec = ll_i2info(inode)->lli_ctime; + } else { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. + * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING)) + rc = ll_glimpse_size(inode); + } + RETURN(rc); +} + static inline dev_t ll_compat_encode_dev(dev_t dev) { /* The compat_sys_*stat*() syscalls will fail unless the @@ -4494,49 +3818,24 @@ static inline dev_t ll_compat_encode_dev(dev_t dev) #ifdef HAVE_INODEOPS_ENHANCED_GETATTR int ll_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) + { struct dentry *de = path->dentry; #else int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) { #endif - struct inode *inode = de->d_inode; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); - - rc = ll_inode_revalidate(de, IT_GETATTR); - if (rc < 0) - RETURN(rc); + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int res = 0; - if (S_ISREG(inode->i_mode)) { - /* In case of restore, the MDT has the right size and has - * already send it back without granting the layout lock, - * inode is up-to-date so glimpse is useless. - * Also to glimpse we need the layout, in case of a running - * restore the MDT holds the layout lock so the glimpse will - * block up to the end of restore (getattr will block) - */ - if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) { - rc = ll_glimpse_size(inode); - if (rc < 0) - RETURN(rc); - } - } else { - /* If object isn't regular a file then don't validate size. */ - if (S_ISDIR(inode->i_mode) && - lli->lli_lsm_md != NULL) { - rc = ll_merge_md_attr(inode); - if (rc < 0) - RETURN(rc); - } + res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP); + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); - inode->i_atime.tv_sec = lli->lli_atime; - inode->i_mtime.tv_sec = lli->lli_mtime; - inode->i_ctime.tv_sec = lli->lli_ctime; - } + if (res) + return res; OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); @@ -4626,28 +3925,28 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) const char *name = NULL; char *value = NULL; size_t value_size = 0; - int rc = 0; + int rc; ENTRY; switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) + if (acl) { rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - break; + if (rc) + GOTO(out, rc); + } + break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) - rc = acl ? -EACCES : 0; - break; + GOTO(out, rc = acl ? -EACCES : 0); - default: - rc = -EINVAL; break; + default: + GOTO(out, rc = -EINVAL); } - if (rc) - return rc; if (acl) { value_size = posix_acl_xattr_size(acl->a_count); @@ -4662,16 +3961,16 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), value ? 
OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, - name, value, value_size, 0, 0, &req); + name, value, value_size, 0, 0, 0, &req); ptlrpc_req_finished(req); out_value: kfree(value); out: - if (rc) - forget_cached_acl(inode, type); - else + if (!rc) set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); RETURN(rc); } #endif /* CONFIG_FS_POSIX_ACL */ @@ -4740,7 +4039,8 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) * need to do it before permission check. */ if (inode == inode->i_sb->s_root->d_inode) { - rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); + rc = __ll_inode_revalidate(inode->i_sb->s_root, + MDS_INODELOCK_LOOKUP); if (rc) RETURN(rc); } @@ -4942,6 +4242,7 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req; + struct mdt_body *body; void *lvbdata; void *lmm; int lmmsize; @@ -4961,20 +4262,18 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) * layout here. Please note that we can't use the LVB buffer in * completion AST because it doesn't have a large enough buffer */ rc = ll_get_default_mdsize(sbi, &lmmsize); + if (rc == 0) + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), + OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, + lmmsize, 0, &req); if (rc < 0) RETURN(rc); - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR, - XATTR_NAME_LOV, lmmsize, &req); - if (rc < 0) { - if (rc == -ENODATA) - GOTO(out, rc = 0); /* empty layout */ - else - RETURN(rc); - } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); - lmmsize = rc; - rc = 0; + lmmsize = body->mbo_eadatasize; if (lmmsize == 0) /* empty layout */ GOTO(out, rc = 0); @@ -5205,20 +4504,19 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen) * Issue layout intent RPC indicating where in a file an IO is about to write. * * \param[in] inode file inode. - * \param[in] ext write range with start offset of fille in bytes where - * an IO is about to write, and exclusive end offset in - * bytes. + * \param[in] start start offset of fille in bytes where an IO is about to + * write. + * \param[in] end exclusive end offset in bytes of the write range. * * \retval 0 on success * \retval < 0 error code */ -int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, - struct lu_extent *ext) +int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end) { struct layout_intent intent = { - .li_opc = opc, - .li_extent.e_start = ext->e_start, - .li_extent.e_end = ext->e_end, + .li_opc = LAYOUT_INTENT_WRITE, + .li_start = start, + .li_end = end, }; int rc; ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c index ddbaa142514de..d34be28747bdd 100644 --- a/drivers/staging/lustrefsx/lustre/llite/glimpse.c +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -23,13 +23,14 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * glimpse code used by vvp (and other Lustre clients in the future). + * glimpse code shared between vvp and liblustre (and other Lustre clients in + * the future). 
* * Author: Nikita Danilov * Author: Oleg Drokin @@ -91,7 +92,7 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); /* NOTE: this looks like DLM lock request, but it may - * not be one. Due to CEF_GLIMPSE flag (translated + * not be one. Due to CEF_ASYNC flag (translated * to LDLM_FL_HAS_INTENT by osc), this is * glimpse request, that won't revoke any * conflicting DLM locks held. Instead, @@ -106,10 +107,14 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, *descr = whole_file; descr->cld_obj = clob; descr->cld_mode = CLM_READ; - descr->cld_enq_flags = CEF_GLIMPSE | CEF_MUST; + descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; if (agl) - descr->cld_enq_flags |= CEF_SPECULATIVE | CEF_NONBLOCK; + descr->cld_enq_flags |= CEF_AGL; /* + * CEF_ASYNC is used because glimpse sub-locks cannot + * deadlock (because they never conflict with other + * locks) and, hence, can be enqueued out-of-order. + * * CEF_MUST protects glimpse lock from conversion into * a lockless mode. */ @@ -135,20 +140,7 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, RETURN(result); } -/** - * Get an IO environment for special operations such as glimpse locks and - * manually requested locks (ladvise lockahead) - * - * \param[in] inode inode the operation is being performed on - * \param[out] envout thread specific execution environment - * \param[out] ioout client io description - * \param[out] refcheck reference check - * - * \retval 1 on success - * \retval 0 not a regular file, cannot get environment - * \retval negative negative errno on error - */ -int cl_io_get(struct inode *inode, struct lu_env **envout, +static int cl_io_get(struct inode *inode, struct lu_env **envout, struct cl_io **ioout, __u16 *refcheck) { struct lu_env *env; @@ -186,37 +178,31 @@ int cl_glimpse_size0(struct inode *inode, int agl) */ struct lu_env *env = NULL; struct cl_io *io = NULL; - __u16 refcheck; - int retried = 0; - int result; - - ENTRY; - - result = cl_io_get(inode, &env, &io, &refcheck); - if (result <= 0) - RETURN(result); - - do { - io->ci_ndelay_tried = retried++; - io->ci_ndelay = io->ci_verify_layout = 1; - result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj); - if (result > 0) { - /* - * nothing to do for this io. This currently happens - * when stripe sub-object's are not yet created. - */ - result = io->ci_result; - } else if (result == 0) { - result = cl_glimpse_lock(env, io, inode, io->ci_obj, - agl); - if (!agl && result == -EWOULDBLOCK) - io->ci_need_restart = 1; - } + __u16 refcheck; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result > 0) { + again: + io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. 
+ */ + result = io->ci_result; + else if (result == 0) + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); - cl_io_fini(env, io); - } while (unlikely(io->ci_need_restart)); - - cl_env_put(env, &refcheck); + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + cl_env_put(env, &refcheck); + } RETURN(result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c index 21a10ec551e44..a5fe1978c66a2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -68,7 +69,7 @@ __u16 cl_inode_fini_refcheck; static DEFINE_MUTEX(cl_inode_fini_guard); int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - enum op_xvalid xvalid, unsigned int attr_flags) + unsigned int attr_flags) { struct lu_env *env; struct cl_io *io; @@ -90,14 +91,10 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; io->u.ci_setattr.sa_attr_flags = attr_flags; - io->u.ci_setattr.sa_avalid = attr->ia_valid; - io->u.ci_setattr.sa_xvalid = xvalid; + io->u.ci_setattr.sa_valid = attr->ia_valid; io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); again: - if (attr->ia_valid & ATTR_FILE) - ll_io_set_mirror(io, attr->ia_file); - if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { struct vvp_io *vio = vvp_env_io(env); @@ -216,12 +213,12 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) if (unlikely(atomic_read(&header->loh_ref) != 1)) { struct lu_site *site = obj->co_lu.lo_dev->ld_site; - wait_queue_head_t *wq; + struct lu_site_bkt_data *bkt; - wq = lu_site_wq_from_fid(site, &header->loh_fid); + bkt = lu_site_bkt_from_fid(site, &header->loh_fid); init_waitqueue_entry(&waiter, current); - add_wait_queue(wq, &waiter); + add_wait_queue(&bkt->lsb_marche_funebre, &waiter); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -231,7 +228,7 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) } set_current_state(TASK_RUNNING); - remove_wait_queue(wq, &waiter); + remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); } cl_object_put(env, obj); diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c index 5869d949ff97b..ced348a36b42a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -23,13 +23,14 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * cl code used by vvp (and other Lustre clients in the future). + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). 
* */ #define DEBUG_SUBSYSTEM S_LLITE diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h index 4f94f91131a51..ce05c17a2231f 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,6 +33,7 @@ #ifndef LLITE_INTERNAL_H #define LLITE_INTERNAL_H #include +#include #include /* for s2sbi */ #include #include @@ -44,8 +45,8 @@ #include #include #include -#include +#include #include "vvp_internal.h" #include "range_lock.h" @@ -135,7 +136,8 @@ struct ll_inode_info { /* update atime from MDS no matter if it's older than * local inode atime. */ - unsigned int lli_update_atime:1; + unsigned int lli_update_atime:1, + lli_inode_locked:1; /* Try to make the d::member and f::member are aligned. Before using * these members, make clear whether it is directory or not. */ @@ -165,8 +167,6 @@ struct ll_inode_info { unsigned int lli_sa_enabled:1; /* generation for statahead */ unsigned int lli_sa_generation; - /* rw lock protects lli_lsm_md */ - struct rw_semaphore lli_lsm_sem; /* directory stripe information */ struct lmv_stripe_md *lli_lsm_md; /* default directory stripe offset. This is extracted @@ -179,8 +179,8 @@ struct ll_inode_info { /* for non-directory */ struct { - struct mutex lli_size_mutex; - char *lli_symlink_name; + struct mutex lli_size_mutex; + char *lli_symlink_name; /* * struct rw_semaphore { * signed long count; // align d.d_def_acl @@ -188,23 +188,23 @@ struct ll_inode_info { * struct list_head wait_list; * } */ - struct rw_semaphore lli_trunc_sem; - struct range_lock_tree lli_write_tree; + struct rw_semaphore lli_trunc_sem; + struct range_lock_tree lli_write_tree; - struct rw_semaphore lli_glimpse_sem; - ktime_t lli_glimpse_time; - struct list_head lli_agl_list; - __u64 lli_agl_index; + struct rw_semaphore lli_glimpse_sem; + cfs_time_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; /* for writepage() only to communicate to fsync */ - int lli_async_rc; + int lli_async_rc; /* - * Whenever a process try to read/write the file, the + * whenever a process try to read/write the file, the * jobid of the process will be saved here, and it'll * be packed into the write PRC when flush later. * - * So the read/write statistics for jobid will not be + * so the read/write statistics for jobid will not be * accurate if the file is shared by different jobs. 
*/ char lli_jobid[LUSTRE_JOBID_SIZE]; @@ -261,8 +261,6 @@ enum ll_file_flags { LLIF_FILE_RESTORING = 1, /* Xattr cache is attached to the file */ LLIF_XATTR_CACHE = 2, - /* Project inherit */ - LLIF_PROJECT_INHERIT = 3, }; static inline void ll_file_set_flag(struct ll_inode_info *lli, @@ -297,32 +295,12 @@ int ll_xattr_cache_get(struct inode *inode, size_t size, __u64 valid); -static inline bool obd_connect_has_secctx(struct obd_connect_data *data) -{ -#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) - return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && - data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; -#else - return false; -#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ -} - -static inline void obd_connect_set_secctx(struct obd_connect_data *data) -{ -#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) - data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; -#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ -} - int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, const char **secctx_name, void **secctx, __u32 *secctx_size); int ll_inode_init_security(struct dentry *dentry, struct inode *inode, struct inode *dir); -int ll_listsecurity(struct inode *inode, char *secctx_name, - size_t secctx_name_size); - /* * Locking to guarantee consistency of non-atomic updates to long long i_size, * consistency between file size and KMS. @@ -333,19 +311,18 @@ int ll_listsecurity(struct inode *inode, char *secctx_name, void ll_inode_size_lock(struct inode *inode); void ll_inode_size_unlock(struct inode *inode); +// FIXME: replace the name of this with LL_I to conform to kernel stuff +// static inline struct ll_inode_info *LL_I(struct inode *inode) static inline struct ll_inode_info *ll_i2info(struct inode *inode) { - return container_of(inode, struct ll_inode_info, lli_vfs_inode); + return container_of(inode, struct ll_inode_info, lli_vfs_inode); } -/* default to use at least 16M for fast read if possible */ -#define RA_REMAIN_WINDOW_MIN MiB_TO_PAGES(16UL) - /* default to about 64M of readahead on a given system. */ -#define SBI_DEFAULT_READAHEAD_MAX MiB_TO_PAGES(64UL) +#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT)) /* default to read-ahead full files smaller than 2MB on the second read */ -#define SBI_DEFAULT_READAHEAD_WHOLE_MAX MiB_TO_PAGES(2UL) +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT)) enum ra_stat { RA_STAT_HIT = 0, @@ -454,9 +431,7 @@ enum stats_track_type { * suppress_pings */ #define LL_SBI_FAST_READ 0x400000 /* fast read support */ #define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ -/* LL_SBI_PIO 0x1000000 parallel IO support, introduced in - 2.10, abandoned */ -#define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ +#define LL_SBI_PIO 0x1000000 /* parallel IO support */ #define LL_SBI_FLAGS { \ "nolck", \ @@ -484,7 +459,6 @@ enum stats_track_type { "fast_read", \ "file_secctx", \ "pio", \ - "tiny_write", \ } /* This is embedded into llite super-blocks to keep track of connect @@ -503,23 +477,20 @@ struct lustre_client_ocd { struct ll_sb_info { /* this protects pglist and ra_info. 
It isn't safe to * grab from interrupt contexts */ - spinlock_t ll_lock; - spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ - spinlock_t ll_process_lock; /* ll_rw_process_info */ - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_md_exp; - struct obd_export *ll_dt_exp; - struct obd_device *ll_md_obd; - struct obd_device *ll_dt_obd; - struct dentry *ll_debugfs_entry; - struct lu_fid ll_root_fid; /* root object fid */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct proc_dir_entry* ll_proc_root; + struct lu_fid ll_root_fid; /* root object fid */ int ll_flags; unsigned int ll_umounting:1, ll_xattr_cache_enabled:1, ll_xattr_cache_set:1, /* already set to 0/1 */ - ll_client_common_fill_super_succeeded:1, - ll_checksum_set:1; + ll_client_common_fill_super_succeeded:1; struct lustre_client_ocd ll_lco; @@ -570,12 +541,6 @@ struct ll_sb_info { /* st_blksize returned by stat(2), when non-zero */ unsigned int ll_stat_blksize; - - /* maximum relative age of cached statfs results */ - unsigned int ll_statfs_max_age; - - struct kset ll_kset; /* sysfs object */ - struct completion ll_kobj_unregister; }; /* @@ -680,19 +645,11 @@ struct ll_file_data { * true: failure is known, not report again. * false: unknown failure, should report. */ bool fd_write_failed; - bool ll_lock_no_expand; rwlock_t fd_lock; /* protect lcc list */ struct list_head fd_lccs; /* list of ll_cl_context */ - /* Used by mirrored file to lead IOs to a specific mirror, usually - * for mirror resync. 0 means default. */ - __u32 fd_designated_mirror; - /* The layout version when resync starts. Resync I/O should carry this - * layout version for verification to OST objects */ - __u32 fd_layout_version; }; -void llite_tunables_unregister(void); -int llite_tunables_register(void); +extern struct proc_dir_entry *proc_lustre_fs_root; static inline struct inode *ll_info2i(struct ll_inode_info *lli) { @@ -730,11 +687,6 @@ static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) return !!(sbi->ll_flags & LL_SBI_FAST_READ); } -static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) -{ - return !!(sbi->ll_flags & LL_SBI_TINY_WRITE); -} - void ll_ras_enter(struct file *f); /* llite/lcommon_misc.c */ @@ -745,9 +697,21 @@ int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, void cl_put_grouplock(struct ll_grouplock *lg); /* llite/lproc_llite.c */ -int ll_debugfs_register_super(struct super_block *sb, const char *name); -void ll_debugfs_unregister_super(struct super_block *sb); +#ifdef CONFIG_PROC_FS +int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb); +int lprocfs_ll_register_obd(struct super_block *sb, const char *obdname); +void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi); void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); +extern struct lprocfs_vars lprocfs_llite_obd_vars[]; +#else +static inline int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb) {return 0; } +static inline int lprocfs_ll_register_obd(struct super_block *sb, + const char *obdname) {return 0; } +static inline void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) {} +static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} +#endif enum { LPROC_LL_DIRTY_HITS, @@ -789,10 +753,6 @@ enum { }; /* llite/dir.c */ -enum 
get_default_layout_type { - GET_DEFAULT_LAYOUT_ROOT = 1, -}; - struct ll_dir_chain { }; @@ -835,8 +795,6 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode); int ll_writepage(struct page *page, struct writeback_control *wbc); int ll_writepages(struct address_space *, struct writeback_control *wbc); int ll_readpage(struct file *file, struct page *page); -int ll_io_read_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, struct file *file); void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); @@ -881,25 +839,8 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* CONFIG_FS_POSIX_ACL */ #endif - -static inline int ll_xflags_to_inode_flags(int xflags) -{ - return ((xflags & FS_XFLAG_SYNC) ? S_SYNC : 0) | - ((xflags & FS_XFLAG_NOATIME) ? S_NOATIME : 0) | - ((xflags & FS_XFLAG_APPEND) ? S_APPEND : 0) | - ((xflags & FS_XFLAG_IMMUTABLE) ? S_IMMUTABLE : 0); -} - -static inline int ll_inode_flags_to_xflags(int flags) -{ - return ((flags & S_SYNC) ? FS_XFLAG_SYNC : 0) | - ((flags & S_NOATIME) ? FS_XFLAG_NOATIME : 0) | - ((flags & S_APPEND) ? FS_XFLAG_APPEND : 0) | - ((flags & S_IMMUTABLE) ? FS_XFLAG_IMMUTABLE : 0); -} - -int ll_migrate(struct inode *parent, struct file *file, - struct lmv_user_md *lum, const char *name); +int ll_migrate(struct inode *parent, struct file *file, int mdtidx, + const char *name, int namelen); int ll_get_fid_by_name(struct inode *parent, const char *name, int namelen, struct lu_fid *fid, struct inode **inode); #ifdef HAVE_GENERIC_PERMISSION_4ARGS @@ -911,7 +852,6 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd); int ll_inode_permission(struct inode *inode, int mask); # endif #endif -int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa); int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg); int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, @@ -925,11 +865,9 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe_default(struct inode *inode, void **lmmp, - int *lmm_size, struct ptlrpc_request **request, - struct ptlrpc_request **root_request, u64 valid); -int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, u64 valid); +int ll_dir_getstripe(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + u64 valid); #ifdef HAVE_FILE_FSYNC_4ARGS int ll_fsync(struct file *file, loff_t start, loff_t end, int data); #elif defined(HAVE_FILE_FSYNC_2ARGS) @@ -942,7 +880,6 @@ int ll_fid2path(struct inode *inode, void __user *arg); int ll_data_version(struct inode *inode, __u64 *data_version, int flags); int ll_hsm_release(struct inode *inode); int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); -void ll_io_set_mirror(struct cl_io *io, const struct file *file); /* llite/dcache.c */ @@ -965,14 +902,12 @@ void ll_kill_super(struct super_block *sb); struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); void ll_dir_clear_lsm_md(struct inode *inode); void ll_clear_inode(struct inode *inode); -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, - enum op_xvalid xvalid, bool hsm_import); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); 
int ll_setattr(struct dentry *de, struct iattr *attr); int ll_statfs(struct dentry *de, struct kstatfs *sfs); -int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, - u32 flags); +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags); int ll_update_inode(struct inode *inode, struct lustre_md *md); -void ll_update_inode_flags(struct inode *inode, int ext_flags); int ll_read_inode2(struct inode *inode, void *opaque); void ll_delete_inode(struct inode *inode); int ll_iocontrol(struct inode *inode, struct file *file, @@ -992,6 +927,7 @@ int ll_obd_statfs(struct inode *inode, void __user *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); +int ll_process_config(struct lustre_cfg *lcfg); enum { LUSTRE_OPC_MKDIR = 0, @@ -1001,7 +937,6 @@ enum { LUSTRE_OPC_ANY = 5, }; -void ll_unlock_md_op_lsm(struct md_op_data *op_data); struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, const char *name, size_t namelen, @@ -1014,8 +949,6 @@ ssize_t ll_copy_user_md(const struct lov_user_md __user *md, struct lov_user_md **kbuf); void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); -void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req); - /* Compute expected user md size when passing in a md from user space */ static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) { @@ -1084,6 +1017,7 @@ struct ll_thread_info { struct iov_iter lti_iter; struct vvp_io_args lti_args; struct ra_io_arg lti_ria; + struct kiocb lti_kiocb; struct ll_cl_context lti_io_ctx; }; @@ -1298,18 +1232,11 @@ static inline int cl_glimpse_size(struct inode *inode) return cl_glimpse_size0(inode, 0); } -/* AGL is 'asychronous glimpse lock', which is a speculative lock taken as - * part of statahead */ static inline int cl_agl(struct inode *inode) { return cl_glimpse_size0(inode, 1); } -int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise); - -int cl_io_get(struct inode *inode, struct lu_env **envout, - struct cl_io **ioout, __u16 *refcheck); - static inline int ll_glimpse_size(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1317,7 +1244,7 @@ static inline int ll_glimpse_size(struct inode *inode) down_read(&lli->lli_glimpse_sem); rc = cl_glimpse_size(inode); - lli->lli_glimpse_time = ktime_get(); + lli->lli_glimpse_time = cfs_time_current(); up_read(&lli->lli_glimpse_sem); return rc; } @@ -1487,8 +1414,7 @@ static inline void d_lustre_revalidate(struct dentry *dentry) int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); int ll_layout_refresh(struct inode *inode, __u32 *gen); int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); -int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, - struct lu_extent *ext); +int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end); int ll_xattr_init(void); void ll_xattr_fini(void); @@ -1500,7 +1426,7 @@ int ll_getparent(struct file *file, struct getparent __user *arg); /* lcommon_cl.c */ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - enum op_xvalid xvalid, unsigned int attr_flags); + unsigned int attr_flags); extern struct lu_env *cl_inode_fini_env; extern __u16 cl_inode_fini_refcheck; diff --git 
a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 52bea6c96dc1a..644b1c4e26d47 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,18 +46,16 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include -#include +#include #ifdef HAVE_UAPI_LINUX_MOUNT_H #include #endif - #include #include #include #include -#include +#include #include #include #include @@ -88,7 +86,6 @@ static struct ll_sb_info *ll_init_sbi(void) spin_lock_init(&sbi->ll_pp_extent_lock); spin_lock_init(&sbi->ll_process_lock); sbi->ll_rw_stats_on = 0; - sbi->ll_statfs_max_age = OBD_STATFS_CACHE_SECONDS; si_meminfo(&si); pages = si.totalram - si.totalhigh; @@ -114,9 +111,6 @@ static struct ll_sb_info *ll_init_sbi(void) #ifdef ENABLE_CHECKSUM sbi->ll_flags |= LL_SBI_CHECKSUM; #endif -#ifdef ENABLE_FLOCK - sbi->ll_flags |= LL_SBI_FLOCK; -#endif #ifdef HAVE_LRU_RESIZE_SUPPORT sbi->ll_flags |= LL_SBI_LRU_RESIZE; @@ -139,7 +133,6 @@ static struct ll_sb_info *ll_init_sbi(void) atomic_set(&sbi->ll_agl_total, 0); sbi->ll_flags |= LL_SBI_AGL_ENABLED; sbi->ll_flags |= LL_SBI_FAST_READ; - sbi->ll_flags |= LL_SBI_TINY_WRITE; /* root squash */ sbi->ll_squash.rsi_uid = 0; @@ -167,23 +160,30 @@ static void ll_free_sbi(struct super_block *sb) EXIT; } +static inline int obd_connect_has_secctx(struct obd_connect_data *data) +{ + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +} + static int client_common_fill_super(struct super_block *sb, char *md, char *dt, struct vfsmount *mnt) { struct inode *root = NULL; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_statfs *osfs = NULL; - struct ptlrpc_request *request = NULL; - struct obd_connect_data *data = NULL; - struct obd_uuid *uuid; - struct md_op_data *op_data; - struct lustre_md lmd; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; u64 valid; - int size, err, checksum; + int size, err, checksum; + ENTRY; - ENTRY; - sbi->ll_md_obd = class_name2obd(md); - if (!sbi->ll_md_obd) { + obd = class_name2obd(md); + if (!obd) { CERROR("MD %s: not setup or attached\n", md); RETURN(-EINVAL); } @@ -198,18 +198,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, RETURN(-ENOMEM); } - /* pass client page size via ocd_grant_blkbits, the server should report - * back its backend blocksize for grant calculation purpose */ - data->ocd_grant_blkbits = PAGE_SHIFT; - - /* indicate MDT features supported by this client */ - data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | - OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | - OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| - OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + /* indicate the features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + 
OBD_CONNECT_ATTRFID | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | OBD_CONNECT_EINPROGRESS | @@ -220,20 +215,11 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | OBD_CONNECT_OPEN_BY_FID | OBD_CONNECT_DIR_STRIPE | - OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SUBTREE | - OBD_CONNECT_MULTIMODRPCS | - OBD_CONNECT_GRANT_PARAM | - OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2; - - data->ocd_connect_flags2 = OBD_CONNECT2_FLR | - OBD_CONNECT2_LOCK_CONVERT | - OBD_CONNECT2_DIR_MIGRATE | - OBD_CONNECT2_SUM_STATFS | - OBD_CONNECT2_ARCHIVE_ID_ARRAY | - OBD_CONNECT2_LSOM | - OBD_CONNECT2_ASYNC_DISCARD | - OBD_CONNECT2_GETATTR_PFID; + OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS; + + data->ocd_connect_flags2 = 0; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) @@ -244,8 +230,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_LARGE_ACL; #endif - data->ocd_cksum_types = obd_cksum_types_supported_client(); - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) /* flag mdc connection as lightweight, only used for test * purpose, use with care */ @@ -277,16 +261,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (sbi->ll_flags & LL_SBI_ALWAYS_PING) data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; - obd_connect_set_secctx(data); - -#if defined(CONFIG_SECURITY) - data->ocd_connect_flags2 |= OBD_CONNECT2_SELINUX_POLICY; -#endif +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ data->ocd_brw_size = MD_MAX_BRW_SIZE; - err = obd_connect(NULL, &sbi->ll_md_exp, sbi->ll_md_obd, - &sbi->ll_sb_uuid, data, NULL); + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " "recovery, of which this client is not a " @@ -312,7 +293,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, * can make sure the client can be mounted as long as MDT0 is * avaible */ err = obd_statfs(NULL, sbi->ll_md_exp, osfs, - ktime_get_seconds() - sbi->ll_statfs_max_age, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), OBD_STATFS_FOR_MDT0); if (err) GOTO(out_md_fid, err); @@ -399,8 +380,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } } - sbi->ll_dt_obd = class_name2obd(dt); - if (!sbi->ll_dt_obd) { + obd = class_name2obd(dt); + if (!obd) { CERROR("DT %s: not setup or attached\n", dt); GOTO(out_md_fid, err = -ENODEV); } @@ -409,7 +390,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, * back its backend blocksize for grant calculation purpose */ data->ocd_grant_blkbits = PAGE_SHIFT; - /* indicate OST features supported by this client */ data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | @@ -421,41 +401,23 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | - 
OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO | - OBD_CONNECT_FLAGS2 | OBD_CONNECT_GRANT_SHRINK; - -/* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it - * can interoperate with an older version of lockahead which was released prior - * to landing in master. This support will be dropped when 2.13 development - * starts. At the point, we should not just drop the connect flag (below), we - * should also remove the support in the code. - * - * Removing it means a few things: - * 1. Remove this section here - * 2. Remove CEF_NONBLOCK in ll_file_lockahead() - * 3. Remove function exp_connect_lockahead_old - * 4. Remove LDLM_FL_LOCKAHEAD_OLD_RESERVED in lustre_dlm_flags.h - * */ -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 50, 0) - data->ocd_connect_flags |= OBD_CONNECT_LOCKAHEAD_OLD; -#endif + OBD_CONNECT_BULK_MBITS; - data->ocd_connect_flags2 = OBD_CONNECT2_LOCKAHEAD; + data->ocd_connect_flags2 = 0; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; /* OBD_CONNECT_CKSUM should always be set, even if checksums are * disabled by default, because it can still be enabled on the - * fly via /sys. As a consequence, we still need to come to an - * agreement on the supported algorithms at connect time - */ + * fly via /proc. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time */ data->ocd_connect_flags |= OBD_CONNECT_CKSUM; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) data->ocd_cksum_types = OBD_CKSUM_ADLER; else - data->ocd_cksum_types = obd_cksum_types_supported_client(); + data->ocd_cksum_types = cksum_types_supported_client(); #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; @@ -468,13 +430,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, "ocd_grant: %d\n", data->ocd_connect_flags, data->ocd_version, data->ocd_grant); - sbi->ll_dt_obd->obd_upcall.onu_owner = &sbi->ll_lco; - sbi->ll_dt_obd->obd_upcall.onu_upcall = cl_ocd_update; + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = cl_ocd_update; data->ocd_brw_size = DT_MAX_BRW_SIZE; - err = obd_connect(NULL, &sbi->ll_dt_exp, sbi->ll_dt_obd, - &sbi->ll_sb_uuid, data, NULL); + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, + NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " "recovery, of which this client is not a " @@ -490,15 +452,10 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sbi->ll_dt_exp->exp_connect_data = *data; /* Don't change value if it was specified in the config log */ - if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) { + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) sbi->ll_ra_info.ra_max_read_ahead_whole_pages = max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX, (data->ocd_brw_size >> PAGE_SHIFT)); - if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages > - sbi->ll_ra_info.ra_max_pages_per_file) - sbi->ll_ra_info.ra_max_read_ahead_whole_pages = - sbi->ll_ra_info.ra_max_pages_per_file; - } err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, LUSTRE_SEQ_METADATA); @@ -589,15 +546,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } checksum = sbi->ll_flags & LL_SBI_CHECKSUM; - if (sbi->ll_checksum_set) { - err = obd_set_info_async(NULL, sbi->ll_dt_exp, - sizeof(KEY_CHECKSUM), KEY_CHECKSUM, - sizeof(checksum), &checksum, NULL); - if (err) { - 
CERROR("%s: Set checksum failed: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - GOTO(out_root, err); - } + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(checksum), &checksum, + NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); } cl_sb_init(sb); @@ -636,21 +591,14 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (osfs != NULL) OBD_FREE_PTR(osfs); - if (sbi->ll_dt_obd) { - err = sysfs_create_link(&sbi->ll_kset.kobj, - &sbi->ll_dt_obd->obd_kset.kobj, - sbi->ll_dt_obd->obd_type->typ_name); + if (sbi->ll_proc_root != NULL) { + err = lprocfs_ll_register_obd(sb, dt); if (err < 0) { CERROR("%s: could not register %s in llite: rc = %d\n", dt, ll_get_fsname(sb, NULL, 0), err); err = 0; } - } - - if (sbi->ll_md_obd) { - err = sysfs_create_link(&sbi->ll_kset.kobj, - &sbi->ll_md_obd->obd_kset.kobj, - sbi->ll_md_obd->obd_type->typ_name); + err = lprocfs_ll_register_obd(sb, md); if (err < 0) { CERROR("%s: could not register %s in llite: rc = %d\n", md, ll_get_fsname(sb, NULL, 0), err); @@ -667,13 +615,11 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, out_dt: obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; - sbi->ll_dt_obd = NULL; out_md_fid: obd_fid_fini(sbi->ll_md_exp->exp_obd); out_md: obd_disconnect(sbi->ll_md_exp); sbi->ll_md_exp = NULL; - sbi->ll_md_obd = NULL; out: if (data != NULL) OBD_FREE_PTR(data); @@ -765,7 +711,7 @@ static void client_common_put_super(struct super_block *sb) obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; - ll_debugfs_unregister_super(sb); + lprocfs_ll_unregister_mountpoint(sbi); obd_fid_fini(sbi->ll_md_exp->exp_obd); obd_disconnect(sbi->ll_md_exp); @@ -803,57 +749,56 @@ void ll_kill_super(struct super_block *sb) static inline int ll_set_opt(const char *opt, char *data, int fl) { - if (strncmp(opt, data, strlen(opt)) != 0) - return 0; - else - return fl; + if (strncmp(opt, data, strlen(opt)) != 0) + return(0); + else + return(fl); } /* non-client-specific mount options are parsed in lmd_parse */ -static int ll_options(char *options, struct ll_sb_info *sbi) +static int ll_options(char *options, int *flags) { - int tmp; - char *s1 = options, *s2; - int *flags = &sbi->ll_flags; - ENTRY; + int tmp; + char *s1 = options, *s2; + ENTRY; - if (!options) - RETURN(0); + if (!options) + RETURN(0); - CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); - while (*s1) { - CDEBUG(D_SUPER, "next opt=%s\n", s1); - tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); - if (tmp) { - *flags = (*flags & ~LL_SBI_LOCALFLOCK) | tmp; - goto next; - } - tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); - if (tmp) { - *flags = (*flags & ~LL_SBI_FLOCK) | tmp; - goto next; - } - tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags &= ~tmp; - goto next; - } + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = 
ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } tmp = ll_set_opt("context", s1, 1); if (tmp) goto next; @@ -877,18 +822,16 @@ static int ll_options(char *options, struct ll_sb_info *sbi) goto next; } - tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags |= tmp; - sbi->ll_checksum_set = 1; - goto next; - } - tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags &= ~tmp; - sbi->ll_checksum_set = 1; - goto next; - } + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + goto next; + } tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); if (tmp) { *flags |= tmp; @@ -975,24 +918,21 @@ void ll_lli_init(struct ll_inode_info *lli) lli->lli_opendir_pid = 0; lli->lli_sa_enabled = 0; lli->lli_def_stripe_offset = -1; - init_rwsem(&lli->lli_lsm_sem); } else { mutex_init(&lli->lli_size_mutex); lli->lli_symlink_name = NULL; init_rwsem(&lli->lli_trunc_sem); range_lock_tree_init(&lli->lli_write_tree); init_rwsem(&lli->lli_glimpse_sem); - lli->lli_glimpse_time = ktime_set(0, 0); + lli->lli_glimpse_time = 0; INIT_LIST_HEAD(&lli->lli_agl_list); lli->lli_agl_index = 0; lli->lli_async_rc = 0; } mutex_init(&lli->lli_layout_mutex); - memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); + memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE); } -#define MAX_STRING_SIZE 128 - #ifndef HAVE_SUPER_SETUP_BDI_NAME #define LSI_BDI_INITIALIZED 0x00400000 @@ -1001,6 +941,8 @@ void ll_lli_init(struct ll_inode_info *lli) # define BDI_CAP_MAP_COPY 0 #endif +#define MAX_STRING_SIZE 128 + static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
{ struct lustre_sb_info *lsi = s2lsi(sb); @@ -1031,79 +973,68 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) { struct lustre_profile *lprof = NULL; struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = NULL; + struct ll_sb_info *sbi; char *dt = NULL, *md = NULL; char *profilenm = get_profile_name(sb); struct config_llog_instance *cfg; /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ - const int instlen = 16 + 2; - unsigned long cfg_instance = ll_get_cfg_instance(sb); - char name[MAX_STRING_SIZE]; - int md_len = 0; - int dt_len = 0; - char *ptr; - int len; - int err; - + const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; + int md_len = 0; + int dt_len = 0; + int err; ENTRY; - /* for ASLR, to map between cfg_instance and hashed ptr */ - CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", - profilenm, cfg_instance, sb); - try_module_get(THIS_MODULE); + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); OBD_ALLOC_PTR(cfg); if (cfg == NULL) - GOTO(out_free_cfg, err = -ENOMEM); + RETURN(-ENOMEM); + + try_module_get(THIS_MODULE); /* client additional sb info */ lsi->lsi_llsbi = sbi = ll_init_sbi(); - if (!sbi) - GOTO(out_free_cfg, err = -ENOMEM); + if (!sbi) { + module_put(THIS_MODULE); + OBD_FREE_PTR(cfg); + RETURN(-ENOMEM); + } - err = ll_options(lsi->lsi_lmd->lmd_opts, sbi); + err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); if (err) - GOTO(out_free_cfg, err); + GOTO(out_free, err); - err = super_setup_bdi_name(sb, "lustre-%016lx", cfg_instance); + err = super_setup_bdi_name(sb, "lustre-%p", sb); if (err) - GOTO(out_free_cfg, err); + GOTO(out_free, err); #ifndef HAVE_DCACHE_LOCK /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ sb->s_d_op = &ll_d_ops; #endif - /* Get fsname */ - len = strlen(profilenm); - ptr = strrchr(profilenm, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; - /* Mount info */ - snprintf(name, MAX_STRING_SIZE, "%.*s-%016lx", len, - profilenm, cfg_instance); - - /* Call ll_debugfs_register_super() before lustre_process_log() - * so that "llite.*.*" params can be processed correctly. - */ - err = ll_debugfs_register_super(sb, name); - if (err < 0) { - CERROR("%s: could not register mountpoint in llite: rc = %d\n", - ll_get_fsname(sb, NULL, 0), err); - err = 0; + /* Call lprocfs_ll_register_mountpoint() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. */ + if (proc_lustre_fs_root != NULL) { + err = lprocfs_ll_register_mountpoint(proc_lustre_fs_root, sb); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: " + "rc = %d\n", ll_get_fsname(sb, NULL, 0), err); + err = 0; + } } - /* The cfg_instance is a value unique to this super, in case some - * joker tries to mount the same fs at two mount points. - */ - cfg->cfg_instance = cfg_instance; + /* Generate a string unique to this super, in case some joker tries + to mount the same fs at two mount points. 
+ Use the address of the super itself.*/ + cfg->cfg_instance = sb; cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; cfg->cfg_callback = class_config_llog_handler; cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; /* set up client obds */ err = lustre_process_log(sb, profilenm, cfg); if (err < 0) - GOTO(out_debugfs, err); + GOTO(out_proc, err); /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ lprof = class_get_profile(profilenm); @@ -1111,7 +1042,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" " read from the MGS. Does that filesystem " "exist?\n", profilenm); - GOTO(out_debugfs, err = -EINVAL); + GOTO(out_proc, err = -EINVAL); } CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, lprof->lp_md, lprof->lp_dt); @@ -1119,68 +1050,58 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) dt_len = strlen(lprof->lp_dt) + instlen + 2; OBD_ALLOC(dt, dt_len); if (!dt) - GOTO(out_profile, err = -ENOMEM); - snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); + GOTO(out_proc, err = -ENOMEM); + snprintf(dt, dt_len - 1, "%s-%p", lprof->lp_dt, cfg->cfg_instance); md_len = strlen(lprof->lp_md) + instlen + 2; OBD_ALLOC(md, md_len); if (!md) - GOTO(out_free_dt, err = -ENOMEM); - snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); + GOTO(out_proc, err = -ENOMEM); + snprintf(md, md_len - 1, "%s-%p", lprof->lp_md, cfg->cfg_instance); /* connections, registrations, sb setup */ err = client_common_fill_super(sb, md, dt, mnt); if (err < 0) - GOTO(out_free_md, err); + GOTO(out_proc, err); sbi->ll_client_common_fill_super_succeeded = 1; -out_free_md: +out_proc: + if (err < 0) + lprocfs_ll_unregister_mountpoint(sbi); +out_free: if (md) OBD_FREE(md, md_len); -out_free_dt: if (dt) OBD_FREE(dt, dt_len); -out_profile: - if (lprof) + if (lprof != NULL) class_put_profile(lprof); -out_debugfs: - if (err < 0) - ll_debugfs_unregister_super(sb); -out_free_cfg: - if (cfg) - OBD_FREE_PTR(cfg); - if (err) ll_put_super(sb); else if (sbi->ll_flags & LL_SBI_VERBOSE) LCONSOLE_WARN("Mounted %s\n", profilenm); + + OBD_FREE_PTR(cfg); RETURN(err); } /* ll_fill_super */ void ll_put_super(struct super_block *sb) { struct config_llog_instance cfg, params_cfg; - struct obd_device *obd; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); - unsigned long cfg_instance = ll_get_cfg_instance(sb); + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); long ccc_count; int next, force = 1, rc = 0; - ENTRY; - - if (!sbi) - GOTO(out_no_sbi, 0); + ENTRY; - /* Should replace instance_id with something better for ASLR */ - CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", - profilenm, cfg_instance, sb); + CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); - cfg.cfg_instance = cfg_instance; - lustre_end_log(sb, profilenm, &cfg); + cfg.cfg_instance = sb; + lustre_end_log(sb, profilenm, &cfg); - params_cfg.cfg_instance = cfg_instance; + params_cfg.cfg_instance = sb; lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); if (sbi->ll_md_exp) { @@ -1201,6 +1122,7 @@ void ll_put_super(struct super_block *sb) if (force == 0 && rc != -EINTR) LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); + /* We need to set force before the lov_disconnect in lustre_common_put_super, since l_d cleans up osc's as well. 
*/ if (force) { @@ -1236,7 +1158,7 @@ void ll_put_super(struct super_block *sb) ll_free_sbi(sb); lsi->lsi_llsbi = NULL; -out_no_sbi: + lustre_common_put_super(sb); cl_env_cache_purge(~0); @@ -1340,124 +1262,108 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) { struct lu_fid *fid; struct lmv_stripe_md *lsm = md->lmv; - struct ll_inode_info *lli = ll_i2info(inode); int i; LASSERT(lsm != NULL); - - CDEBUG(D_INODE, "%s: "DFID" set dir layout:\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid)); - lsm_md_dump(D_INODE, lsm); - /* XXX sigh, this lsm_root initialization should be in * LMV layer, but it needs ll_iget right now, so we * put this here right now. */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { fid = &lsm->lsm_md_oinfo[i].lmo_fid; LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL); - - if (!fid_is_sane(fid)) - continue; - /* Unfortunately ll_iget will call ll_update_inode, * where the initialization of slave inode is slightly * different, so it reset lsm_md to NULL to avoid * initializing lsm for slave inode. */ - lsm->lsm_md_oinfo[i].lmo_root = + /* For migrating inode, master stripe and master object will + * be same, so we only need assign this inode */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && i == 0) + lsm->lsm_md_oinfo[i].lmo_root = inode; + else + lsm->lsm_md_oinfo[i].lmo_root = ll_iget_anon_dir(inode->i_sb, fid, md); + if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); lsm->lsm_md_oinfo[i].lmo_root = NULL; - while (i-- > 0) { - iput(lsm->lsm_md_oinfo[i].lmo_root); - lsm->lsm_md_oinfo[i].lmo_root = NULL; - } return rc; } } - lli->lli_lsm_md = lsm; - return 0; } +static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, + const struct lmv_stripe_md *lsm_md2) +{ + return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && + lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && + lsm_md1->lsm_md_master_mdt_index == + lsm_md2->lsm_md_master_mdt_index && + lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && + lsm_md1->lsm_md_layout_version == + lsm_md2->lsm_md_layout_version && + strcmp(lsm_md1->lsm_md_pool_name, + lsm_md2->lsm_md_pool_name) == 0; +} + static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); struct lmv_stripe_md *lsm = md->lmv; - struct cl_attr *attr; - int rc = 0; - + int rc; ENTRY; LASSERT(S_ISDIR(inode->i_mode)); CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, PFID(ll_inode2fid(inode))); - /* - * no striped information from request, lustre_md from req does not - * include stripeEA, see ll_md_setattr() - */ - if (!lsm) - RETURN(0); + /* no striped information from request. */ + if (lsm == NULL) { + if (lli->lli_lsm_md == NULL) { + RETURN(0); + } else if (lli->lli_lsm_md->lsm_md_hash_type & + LMV_HASH_FLAG_MIGRATION) { + /* migration is done, the temporay MIGRATE layout has + * been removed */ + CDEBUG(D_INODE, DFID" finish migration.\n", + PFID(ll_inode2fid(inode))); + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + RETURN(0); + } else { + /* The lustre_md from req does not include stripeEA, + * see ll_md_setattr */ + RETURN(0); + } + } - /* - * normally dir layout doesn't change, only take read lock to check - * that to avoid blocking other MD operations. 
- */ - down_read(&lli->lli_lsm_sem); + /* set the directory layout */ + if (lli->lli_lsm_md == NULL) { + struct cl_attr *attr; - /* some concurrent lookup initialized lsm, and unchanged */ - if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) - GOTO(unlock, rc = 0); + rc = ll_init_lsm_md(inode, md); + if (rc != 0) + RETURN(rc); - /* if dir layout doesn't match, check whether version is increased, - * which means layout is changed, this happens in dir split/merge and - * lfsck. - */ - if (lli->lli_lsm_md && - lsm->lsm_md_layout_version <= - lli->lli_lsm_md->lsm_md_layout_version) { - CERROR("%s: "DFID" dir layout mismatch:\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid)); - lsm_md_dump(D_ERROR, lli->lli_lsm_md); - lsm_md_dump(D_ERROR, lsm); - GOTO(unlock, rc = -EINVAL); - } - - up_read(&lli->lli_lsm_sem); - down_write(&lli->lli_lsm_sem); - /* clear existing lsm */ - if (lli->lli_lsm_md) { - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - } + /* set md->lmv to NULL, so the following free lustre_md + * will not free this lsm */ + md->lmv = NULL; + lli->lli_lsm_md = lsm; - rc = ll_init_lsm_md(inode, md); - up_write(&lli->lli_lsm_sem); - if (rc) - RETURN(rc); - - /* set md->lmv to NULL, so the following free lustre_md will not free - * this lsm. - */ - md->lmv = NULL; + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, + ll_md_blocking_ast); + if (rc != 0) { + OBD_FREE_PTR(attr); + RETURN(rc); + } - /* md_merge_attr() may take long, since lsm is already set, switch to - * read lock. - */ - down_read(&lli->lli_lsm_sem); - OBD_ALLOC_PTR(attr); - if (!attr) - GOTO(unlock, rc = -ENOMEM); - - /* validate the lsm */ - rc = md_merge_attr(ll_i2mdexp(inode), lli->lli_lsm_md, attr, - ll_md_blocking_ast); - if (!rc) { if (md->body->mbo_valid & OBD_MD_FLNLINK) md->body->mbo_nlink = attr->cat_nlink; if (md->body->mbo_valid & OBD_MD_FLSIZE) @@ -1468,14 +1374,51 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) md->body->mbo_ctime = attr->cat_ctime; if (md->body->mbo_valid & OBD_MD_FLMTIME) md->body->mbo_mtime = attr->cat_mtime; + + OBD_FREE_PTR(attr); + + CDEBUG(D_INODE, "Set lsm %p magic %x to "DFID"\n", lsm, + lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); + RETURN(0); } - OBD_FREE_PTR(attr); - GOTO(unlock, rc); -unlock: - up_read(&lli->lli_lsm_sem); + /* Compare the old and new stripe information */ + if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { + struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; + int idx; + + CERROR("%s: inode "DFID"(%p)'s lmv layout mismatch (%p)/(%p)" + "magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d" + "hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), + inode, lsm, old_lsm, + lsm->lsm_md_magic, old_lsm->lsm_md_magic, + lsm->lsm_md_stripe_count, + old_lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + old_lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, old_lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version, + old_lsm->lsm_md_layout_version, + lsm->lsm_md_pool_name, + old_lsm->lsm_md_pool_name); + + for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { + CERROR("%s: sub FIDs in old lsm idx %d, old: "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), idx, + PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); + } + + for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { + CERROR("%s: sub FIDs in new lsm idx %d, new: "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), 
idx, + PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); + } - return rc; + RETURN(-EIO); + } + + RETURN(0); } void ll_clear_inode(struct inode *inode) @@ -1611,8 +1554,7 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) * * In case of HSMimport, we only set attr on MDS. */ -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, - enum op_xvalid xvalid, bool hsm_import) +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) { struct inode *inode = dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); @@ -1652,12 +1594,12 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, RETURN(-EPERM); } - /* We mark all of the fields "set" so MDS/OST does not re-set them */ - if (!(xvalid & OP_XVALID_CTIME_SET) && - (attr->ia_valid & ATTR_CTIME)) { + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(attr->ia_valid & ATTR_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { attr->ia_ctime = current_time(inode); - xvalid |= OP_XVALID_CTIME_SET; - } + attr->ia_valid |= ATTR_CTIME_SET; + } if (!(attr->ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) { attr->ia_atime = current_time(inode); @@ -1689,22 +1631,13 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, if (!hsm_import && attr->ia_valid & ATTR_SIZE) { /* If we are changing file size, file content is - * modified, flag it. - */ - xvalid |= OP_XVALID_OWNEROVERRIDE; + * modified, flag it. */ + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; op_data->op_bias |= MDS_DATA_MODIFIED; ll_file_clear_flag(lli, LLIF_DATA_MODIFIED); } - if (attr->ia_valid & ATTR_FILE) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(attr->ia_file); - - if (fd->fd_lease_och) - op_data->op_bias |= MDS_TRUNC_KEEP_LEASE; - } - op_data->op_attr = *attr; - op_data->op_xvalid = xvalid; rc = ll_md_setattr(dentry, op_data); if (rc) @@ -1713,17 +1646,17 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, if (!S_ISREG(inode->i_mode) || hsm_import) GOTO(out, rc = 0); - if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) || - xvalid & OP_XVALID_CTIME_SET) { + if (attr->ia_valid & (ATTR_SIZE | + ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME | ATTR_CTIME_SET)) { /* For truncate and utimes sending attributes to OSTs, setting * mtime/atime to the past will be performed under PW [0:EOF] * extent lock (new_size:EOF for truncate). It may seem * excessive to send mtime/atime updates to OSTs when not * setting times to past, but it is necessary due to possible - * time de-synchronization between MDT inode and OST objects - */ - rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0); + * time de-synchronization between MDT inode and OST objects */ + rc = cl_setattr_ost(lli->lli_clob, attr, 0); } /* If the file was restored, it needs to set dirty flag. 
@@ -1783,11 +1716,10 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, int ll_setattr(struct dentry *de, struct iattr *attr) { int mode = de->d_inode->i_mode; - enum op_xvalid xvalid = 0; if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) - xvalid |= OP_XVALID_OWNEROVERRIDE; + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == (ATTR_SIZE|ATTR_MODE)) && @@ -1808,60 +1740,61 @@ int ll_setattr(struct dentry *de, struct iattr *attr) !(attr->ia_valid & ATTR_KILL_SGID)) attr->ia_valid |= ATTR_KILL_SGID; - return ll_setattr_raw(de, attr, xvalid, false); + /* avoid polluted from ATTR_TIMES_SET, + * projid is not expected to be set here */ + attr->ia_valid &= ~MDS_ATTR_PROJID; + + return ll_setattr_raw(de, attr, false); } -int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, - u32 flags) +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) { - struct obd_statfs obd_osfs = { 0 }; - time64_t max_age; - int rc; - - ENTRY; - max_age = ktime_get_seconds() - sbi->ll_statfs_max_age; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs obd_osfs; + int rc; + ENTRY; - rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); - if (rc) - RETURN(rc); + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) { + CERROR("md_statfs fails: rc = %d\n", rc); + RETURN(rc); + } - osfs->os_type = LL_SUPER_MAGIC; + osfs->os_type = sb->s_magic; CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", - osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files); - - if (osfs->os_state & OS_STATE_SUM) - GOTO(out, rc); + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - flags |= OBD_STATFS_NODELAY; + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; - rc = obd_statfs(NULL, sbi->ll_dt_exp, &obd_osfs, max_age, flags); - if (rc) /* Possibly a filesystem with no OSTs. Report MDT totals. */ - GOTO(out, rc = 0); + rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + RETURN(rc); + } CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", - obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, - obd_osfs.os_files); - - osfs->os_bsize = obd_osfs.os_bsize; - osfs->os_blocks = obd_osfs.os_blocks; - osfs->os_bfree = obd_osfs.os_bfree; - osfs->os_bavail = obd_osfs.os_bavail; - - /* If we have _some_ OSTs, but don't have as many free objects on the - * OSTs as inodes on the MDTs, reduce the reported number of inodes - * to compensate, so that the "inodes in use" number is correct. - * This should be kept in sync with lod_statfs() behaviour. - */ - if (obd_osfs.os_files && obd_osfs.os_ffree < osfs->os_ffree) { - osfs->os_files = (osfs->os_files - osfs->os_ffree) + - obd_osfs.os_ffree; - osfs->os_ffree = obd_osfs.os_ffree; - } + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. 
+ */ + if (obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } -out: - RETURN(rc); + RETURN(rc); } int ll_statfs(struct dentry *de, struct kstatfs *sfs) { @@ -1873,10 +1806,12 @@ int ll_statfs(struct dentry *de, struct kstatfs *sfs) CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); - /* Some amount of caching on the client is allowed */ - rc = ll_statfs_internal(ll_s2sbi(sb), &osfs, OBD_STATFS_SUM); - if (rc) - return rc; + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; statfs_unpack(sfs, &osfs); @@ -1920,15 +1855,6 @@ void ll_inode_size_unlock(struct inode *inode) mutex_unlock(&lli->lli_size_mutex); } -void ll_update_inode_flags(struct inode *inode, int ext_flags) -{ - inode->i_flags = ll_ext_to_inode_flags(ext_flags); - if (ext_flags & LUSTRE_PROJINHERIT_FL) - ll_file_set_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); - else - ll_file_clear_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); -} - int ll_update_inode(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1986,7 +1912,7 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) /* Clear i_flags to remove S_NOSEC before permissions are updated */ if (body->mbo_valid & OBD_MD_FLFLAGS) - ll_update_inode_flags(inode, body->mbo_flags); + inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); if (body->mbo_valid & OBD_MD_FLMODE) inode->i_mode = (inode->i_mode & S_IFMT) | (body->mbo_mode & ~S_IFMT); @@ -2114,17 +2040,11 @@ void ll_delete_inode(struct inode *inode) unsigned long nrpages; ENTRY; - if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) /* It is last chance to write out dirty pages, - * otherwise we may lose data while umount. - * - * If i_nlink is 0 then just discard data. This is safe because - * local inode gets i_nlink 0 from server only for the last - * unlink, so that file is not opened somewhere else - */ - cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ? 
- CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1); - } + * otherwise we may lose data while umount */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); + truncate_inode_pages_final(mapping); /* Workaround for LU-118: Note nrpages may not be totally updated when @@ -2157,13 +2077,13 @@ void ll_delete_inode(struct inode *inode) int ll_iocontrol(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - int rc, flags = 0; - ENTRY; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; - switch (cmd) { - case FS_IOC_GETFLAGS: { + switch(cmd) { + case FSFILT_IOC_GETFLAGS: { struct mdt_body *body; struct md_op_data *op_data; @@ -2187,41 +2107,32 @@ int ll_iocontrol(struct inode *inode, struct file *file, flags = body->mbo_flags; - ptlrpc_req_finished(req); + ptlrpc_req_finished(req); RETURN(put_user(flags, (int __user *)arg)); - } - case FS_IOC_SETFLAGS: { + } + case FSFILT_IOC_SETFLAGS: { struct iattr *attr; struct md_op_data *op_data; struct cl_object *obj; - struct fsxattr fa = { 0 }; if (get_user(flags, (int __user *)arg)) RETURN(-EFAULT); - fa.fsx_projid = ll_i2info(inode)->lli_projid; - if (flags & LUSTRE_PROJINHERIT_FL) - fa.fsx_xflags = FS_XFLAG_PROJINHERIT; - - rc = ll_ioctl_check_project(inode, &fa); - if (rc) - RETURN(rc); - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); op_data->op_attr_flags = flags; - op_data->op_xvalid |= OP_XVALID_FLAGS; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); ll_finish_md_op_data(op_data); ptlrpc_req_finished(req); if (rc) RETURN(rc); - ll_update_inode_flags(inode, flags); + inode->i_flags = ll_ext_to_inode_flags(flags); obj = ll_i2info(inode)->lli_clob; if (obj == NULL) @@ -2231,7 +2142,8 @@ int ll_iocontrol(struct inode *inode, struct file *file, if (attr == NULL) RETURN(-ENOMEM); - rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, flags); + attr->ia_valid = ATTR_ATTR_FLAG; + rc = cl_setattr_ost(obj, attr, flags); OBD_FREE_PTR(attr); RETURN(rc); @@ -2379,7 +2291,7 @@ void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) } op_data->op_fid1 = body->mbo_fid1; - op_data->op_open_handle = body->mbo_open_handle; + op_data->op_handle = body->mbo_handle; op_data->op_mod_time = ktime_get_real_seconds(); md_close(exp, op_data, NULL, &close_req); ptlrpc_req_finished(close_req); @@ -2472,10 +2384,8 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, md_free_lustre_md(sbi->ll_md_exp, &md); cleanup: - if (rc != 0 && it != NULL && it->it_op & IT_OPEN) { - ll_intent_drop_lock(it); + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) ll_open_cleanup(sb != NULL ? sb : (*inode)->i_sb, req); - } return rc; } @@ -2523,21 +2433,30 @@ int ll_obd_statfs(struct inode *inode, void __user *arg) return rc; } -/* - * this is normally called in ll_fini_md_op_data(), but sometimes it needs to - * be called early to avoid deadlock. 
- */ -void ll_unlock_md_op_lsm(struct md_op_data *op_data) +int ll_process_config(struct lustre_cfg *lcfg) { - if (op_data->op_mea2_sem) { - up_read(op_data->op_mea2_sem); - op_data->op_mea2_sem = NULL; - } + struct super_block *sb; + unsigned long x; + int rc = 0; + char *ptr; - if (op_data->op_mea1_sem) { - up_read(op_data->op_mea1_sem); - op_data->op_mea1_sem = NULL; - } + /* The instance name contains the sb: lustre-client-aacfe000 */ + ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!ptr || !*(++ptr)) + return -EINVAL; + if (sscanf(ptr, "%lx", &x) != 1) + return -EINVAL; + sb = (struct super_block *)x; + /* This better be a real Lustre superblock! */ + LASSERT(s2lsi(sb)->lsi_lmd->lmd_magic == LMD_MAGIC); + + /* Note we have not called client_common_fill_super yet, so + proc fns must be able to handle that! */ + rc = class_process_proc_param(PARAM_LLITE, lprocfs_llite_obd_vars, + lcfg, sb); + if (rc > 0) + rc = 0; + return rc; } /* this function prepares md_op_data hint for passing it down to MD stack. */ @@ -2556,9 +2475,7 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, if (namelen > ll_i2sbi(i1)->ll_namelen) return ERR_PTR(-ENAMETOOLONG); - /* "/" is not valid name, but it's allowed */ - if (!lu_name_is_valid_2(name, namelen) && - strncmp("/", name, namelen) != 0) + if (!lu_name_is_valid_2(name, namelen)) return ERR_PTR(-EINVAL); } @@ -2571,10 +2488,7 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, ll_i2gids(op_data->op_suppgids, i1, i2); op_data->op_fid1 = *ll_inode2fid(i1); op_data->op_default_stripe_offset = -1; - if (S_ISDIR(i1->i_mode)) { - down_read(&ll_i2info(i1)->lli_lsm_sem); - op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; if (opc == LUSTRE_OPC_MKDIR) op_data->op_default_stripe_offset = @@ -2583,14 +2497,8 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, if (i2) { op_data->op_fid2 = *ll_inode2fid(i2); - if (S_ISDIR(i2->i_mode)) { - if (i2 != i1) { - down_read(&ll_i2info(i2)->lli_lsm_sem); - op_data->op_mea2_sem = - &ll_i2info(i2)->lli_lsm_sem; - } + if (S_ISDIR(i2->i_mode)) op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; - } } else { fid_zero(&op_data->op_fid2); } @@ -2604,14 +2512,15 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, op_data->op_name = name; op_data->op_namelen = namelen; op_data->op_mode = mode; - op_data->op_mod_time = ktime_get_real_seconds(); + op_data->op_mod_time = cfs_time_current_sec(); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - op_data->op_mds = 0; if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && filename_is_volatile(name, namelen, &op_data->op_mds)) { op_data->op_bias |= MDS_CREATE_VOLATILE; + } else { + op_data->op_mds = 0; } op_data->op_data = data; @@ -2620,10 +2529,9 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, void ll_finish_md_op_data(struct md_op_data *op_data) { - ll_unlock_md_op_lsm(op_data); ll_security_release_secctx(op_data->op_file_secctx, op_data->op_file_secctx_size); - OBD_FREE_PTR(op_data); + OBD_FREE_PTR(op_data); } #ifdef HAVE_SUPEROPS_USE_DENTRY @@ -2632,7 +2540,7 @@ int ll_show_options(struct seq_file *seq, struct dentry *dentry) int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif { - struct ll_sb_info *sbi; + struct ll_sb_info *sbi; #ifdef HAVE_SUPEROPS_USE_DENTRY LASSERT((seq != NULL) && (dentry != NULL)); @@ -2642,25 
+2550,20 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) sbi = ll_s2sbi(vfs->mnt_sb); #endif - if (sbi->ll_flags & LL_SBI_NOLCK) - seq_puts(seq, ",nolock"); + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); - /* "flock" is the default since 2.13, but it wasn't for many years, - * so it is still useful to print this to show it is enabled. - * Start to print "noflock" so it is now clear when flock is disabled. - */ - if (sbi->ll_flags & LL_SBI_FLOCK) - seq_puts(seq, ",flock"); - else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - seq_puts(seq, ",localflock"); - else - seq_puts(seq, ",noflock"); + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); - if (sbi->ll_flags & LL_SBI_USER_XATTR) - seq_puts(seq, ",user_xattr"); + if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - seq_puts(seq, ",lazystatfs"); + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); if (sbi->ll_flags & LL_SBI_USER_FID2PATH) seq_puts(seq, ",user_fid2path"); @@ -2668,7 +2571,7 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbi->ll_flags & LL_SBI_ALWAYS_PING) seq_puts(seq, ",always_ping"); - RETURN(0); + RETURN(0); } /** @@ -2786,12 +2689,12 @@ ssize_t ll_copy_user_md(const struct lov_user_md __user *md, if (lum_size < 0) RETURN(lum_size); - OBD_ALLOC_LARGE(*kbuf, lum_size); + OBD_ALLOC(*kbuf, lum_size); if (*kbuf == NULL) RETURN(-ENOMEM); if (copy_from_user(*kbuf, md, lum_size) != 0) { - OBD_FREE_LARGE(*kbuf, lum_size); + OBD_FREE(*kbuf, lum_size); RETURN(-EFAULT); } @@ -2819,7 +2722,7 @@ void ll_compute_rootsquash_state(struct ll_sb_info *sbi) matched = false; i = 0; while (LNetGetId(i++, &id) != -ENOENT) { - if (id.nid == LNET_NID_LO_0) + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) continue; if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { matched = true; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index 9be9bd690ee6d..e286c559c1f67 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -150,7 +150,7 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, int result; __u16 refcheck; sigset_t set; - struct inode *inode = NULL; + struct inode *inode; struct ll_inode_info *lli; ENTRY; @@ -222,16 +222,6 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); LASSERT(ergo(result == 0, PageLocked(vmpage))); - /* if page has been unmapped, presumably due to lock reclaim for - * concurrent usage, add some delay before retrying to prevent - * entering live-lock situation with competitors - */ - if (result == -ENODATA && inode != NULL) { - CDEBUG(D_MMAP, "delaying new page-fault for inode %p to " - "prevent live-lock\n", inode); - msleep(10); - } - return result; } @@ -393,12 +383,6 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) result |= VM_FAULT_LOCKED; } cfs_restore_sigs(set); - - if (vmf->page && result == VM_FAULT_LOCKED) - ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), - current->pid, LUSTRE_FPRIVATE(vma->vm_file), - cl_offset(NULL, vmf->page->index), PAGE_SIZE, - READ); return result; } @@ -455,11 +439,6 @@ static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, break; } - if (result == VM_FAULT_LOCKED) 
- ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), - current->pid, LUSTRE_FPRIVATE(vma->vm_file), - cl_offset(NULL, vmf->page->index), PAGE_SIZE, - WRITE); return result; } diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c index 2e207361dd908..c24f7f6498ba0 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c old mode 100644 new mode 100755 index 7f95090796d42..ee696ef0a4c79 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,124 +36,58 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include +#include #include #include #include "llite_internal.h" #include "vvp_internal.h" -static struct kobject *llite_kobj; -static struct dentry *llite_root; +struct proc_dir_entry *proc_lustre_fs_root; -int llite_tunables_register(void) -{ - int rc = 0; - - llite_kobj = class_setup_tunables("llite"); - if (IS_ERR(llite_kobj)) - return PTR_ERR(llite_kobj); - - llite_root = debugfs_create_dir("llite", debugfs_lustre_root); - if (IS_ERR_OR_NULL(llite_root)) { - rc = llite_root ? PTR_ERR(llite_root) : -ENOMEM; - llite_root = NULL; - kobject_put(llite_kobj); - llite_kobj = NULL; - } - - return rc; -} - -void llite_tunables_unregister(void) -{ - if (llite_kobj) { - kobject_put(llite_kobj); - llite_kobj = NULL; - } - - if (!IS_ERR_OR_NULL(llite_root)) { - debugfs_remove(llite_root); - llite_root = NULL; - } -} - -/* /lustre/llite mount point registration */ -static const struct file_operations ll_rw_extents_stats_fops; -static const struct file_operations ll_rw_extents_stats_pp_fops; -static const struct file_operations ll_rw_offset_stats_fops; - -/** - * ll_stats_pid_write() - Determine if stats collection should be enabled - * @buf: Buffer containing the data written - * @len: Number of bytes in the buffer - * - * Several proc files begin collecting stats when a value is written, and stop - * collecting when either '0' or 'disable' is written. This function checks the - * written value to see if collection should be enabled or disabled. - * - * Return: If '0' or 'disable' is provided, 0 is returned. If the text - * equivalent of a number is written, that number is returned. Otherwise, - * 1 is returned. Non-zero return values indicate collection should be enabled. 
- */ -static s64 ll_stats_pid_write(const char __user *buf, size_t len) -{ - unsigned long long value = 1; - char kernbuf[16]; - int rc; - - rc = kstrtoull_from_user(buf, len, 0, &value); - if (rc < 0 && len < sizeof(kernbuf)) { - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strncasecmp(kernbuf, "disable", 7) == 0) - value = 0; - } +#ifdef CONFIG_PROC_FS +/* /proc/lustre/llite mount point registration */ +static const struct proc_ops ll_rw_extents_stats_fops; +static const struct proc_ops ll_rw_extents_stats_pp_fops; +static const struct proc_ops ll_rw_offset_stats_fops; +static __s64 ll_stats_pid_write(struct file *file, + const char __user *buf, size_t len); - return value; -} - -static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_blksize_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; struct obd_statfs osfs; int rc; - rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); - if (rc) - return rc; - - return sprintf(buf, "%u\n", osfs.os_bsize); + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%u\n", osfs.os_bsize); + return rc; } -LUSTRE_RO_ATTR(blocksize); +LPROC_SEQ_FOPS_RO(ll_blksize); -static ssize_t stat_blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_stat_blksize_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + seq_printf(m, "%u\n", sbi->ll_stat_blksize); - return sprintf(buf, "%u\n", sbi->ll_stat_blksize); + return 0; } -static ssize_t stat_blocksize_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_stat_blksize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - unsigned int val; + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + __s64 val; int rc; - rc = kstrtouint(buffer, 10, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -164,135 +98,173 @@ static ssize_t stat_blocksize_store(struct kobject *kobj, return count; } -LUSTRE_RW_ATTR(stat_blocksize); +LPROC_SEQ_FOPS(ll_stat_blksize); -static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_kbytestotal_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; struct obd_statfs osfs; - u32 blk_size; - u64 result; int rc; - rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); - if (rc) - return rc; + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; - blk_size = osfs.os_bsize >> 10; - result = osfs.os_blocks; + while (blk_size >>= 1) + result <<= 1; - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); + seq_printf(m, "%llu\n", result); + } + return rc; } 
-LUSTRE_RO_ATTR(kbytestotal); +LPROC_SEQ_FOPS_RO(ll_kbytestotal); -static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_kbytesfree_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; struct obd_statfs osfs; - u32 blk_size; - u64 result; int rc; - rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); - if (rc) - return rc; - - blk_size = osfs.os_bsize >> 10; - result = osfs.os_bfree; + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; - while (blk_size >>= 1) - result <<= 1; + while (blk_size >>= 1) + result <<= 1; - return sprintf(buf, "%llu\n", result); + seq_printf(m, "%llu\n", result); + } + return rc; } -LUSTRE_RO_ATTR(kbytesfree); +LPROC_SEQ_FOPS_RO(ll_kbytesfree); -static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_kbytesavail_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; struct obd_statfs osfs; - u32 blk_size; - u64 result; int rc; - rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); - if (rc) - return rc; - - blk_size = osfs.os_bsize >> 10; - result = osfs.os_bavail; + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; - while (blk_size >>= 1) - result <<= 1; + while (blk_size >>= 1) + result <<= 1; - return sprintf(buf, "%llu\n", result); + seq_printf(m, "%llu\n", result); + } + return rc; } -LUSTRE_RO_ATTR(kbytesavail); +LPROC_SEQ_FOPS_RO(ll_kbytesavail); -static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_filestotal_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; struct obd_statfs osfs; int rc; - rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); - if (rc) - return rc; - - return sprintf(buf, "%llu\n", osfs.os_files); + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_files); + return rc; } -LUSTRE_RO_ATTR(filestotal); +LPROC_SEQ_FOPS_RO(ll_filestotal); -static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_filesfree_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; struct obd_statfs osfs; int rc; - rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); - if (rc) - return rc; + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + seq_printf(m, "%llu\n", osfs.os_ffree); + return rc; +} +LPROC_SEQ_FOPS_RO(ll_filesfree); + +static int ll_client_type_seq_show(struct seq_file *m, void *v) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + LASSERT(sbi != NULL); - return sprintf(buf, "%llu\n", osfs.os_ffree); + seq_puts(m, "local client\n"); + return 0; } 
-LUSTRE_RO_ATTR(filesfree); +LPROC_SEQ_FOPS_RO(ll_client_type); -static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_fstype_seq_show(struct seq_file *m, void *v) { - return sprintf(buf, "local client\n"); + struct super_block *sb = m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", sb->s_type->name); + return 0; } -LUSTRE_RO_ATTR(client_type); +LPROC_SEQ_FOPS_RO(ll_fstype); -static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_sb_uuid_seq_show(struct seq_file *m, void *v) { - return sprintf(buf, "lustre\n"); + struct super_block *sb = m->private; + + LASSERT(sb != NULL); + seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); + return 0; } -LUSTRE_RO_ATTR(fstype); +LPROC_SEQ_FOPS_RO(ll_sb_uuid); -static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_xattr_cache_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + + seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled); + return 0; +} + +static ssize_t ll_xattr_cache_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + __s64 val; + int rc; + + rc = lprocfs_str_to_s64(file, buffer, count, &val); + if (rc) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; - return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); + if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; } -LUSTRE_RO_ATTR(uuid); +LPROC_SEQ_FOPS(ll_xattr_cache); static int ll_site_stats_seq_show(struct seq_file *m, void *v) { @@ -304,21 +276,21 @@ static int ll_site_stats_seq_show(struct seq_file *m, void *v) */ return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); } - -LDEBUGFS_SEQ_FOPS_RO(ll_site_stats); +LPROC_SEQ_FOPS_RO(ll_site_stats); static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned long ra_max_mb; + long pages_number; + int mult; spin_lock(&sbi->ll_lock); - ra_max_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages); + pages_number = sbi->ll_ra_info.ra_max_pages; spin_unlock(&sbi->ll_lock); - seq_printf(m, "%lu\n", ra_max_mb); - return 0; + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); } static ssize_t @@ -328,43 +300,45 @@ ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - s64 ra_max_mb, pages_number; + __s64 pages_number; int rc; - rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_mb, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; - pages_number = round_up(ra_max_mb, 1024 * 1024) >> PAGE_SHIFT; + pages_number >>= PAGE_SHIFT; + if (pages_number < 0 || pages_number > cfs_totalram_pages() / 2) { /* 1/2 of RAM */ - CERROR("%s: can't set max_readahead_mb=%llu > %luMB\n", - ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), - PAGES_TO_MiB(cfs_totalram_pages())); + CERROR("%s: can't set max_readahead_mb=%lu > %luMB\n", + ll_get_fsname(sb, NULL, 0), + 
(unsigned long)pages_number >> (20 - PAGE_SHIFT), + cfs_totalram_pages() >> (20 - PAGE_SHIFT + 1)); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_pages = pages_number; spin_unlock(&sbi->ll_lock); - return count; } - -LDEBUGFS_SEQ_FOPS(ll_max_readahead_mb); +LPROC_SEQ_FOPS(ll_max_readahead_mb); static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned long ra_max_file_mb; + long pages_number; + int mult; spin_lock(&sbi->ll_lock); - ra_max_file_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; spin_unlock(&sbi->ll_lock); - seq_printf(m, "%lu\n", ra_max_file_mb); - return 0; + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); } static ssize_t @@ -375,43 +349,44 @@ ll_max_readahead_per_file_mb_seq_write(struct file *file, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - s64 ra_max_file_mb, pages_number; int rc; + __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_file_mb, - 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; - pages_number = round_up(ra_max_file_mb, 1024 * 1024) >> PAGE_SHIFT; + pages_number >>= PAGE_SHIFT; + if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { - CERROR("%s: can't set max_readahead_per_file_mb=%llu > max_read_ahead_mb=%lu\n", - ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), - PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); + CERROR("%s: can't set max_readahead_per_file_mb=%lu > " + "max_read_ahead_mb=%lu\n", ll_get_fsname(sb, NULL, 0), + (unsigned long)pages_number >> (20 - PAGE_SHIFT), + sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_SHIFT)); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_pages_per_file = pages_number; spin_unlock(&sbi->ll_lock); - return count; } - -LDEBUGFS_SEQ_FOPS(ll_max_readahead_per_file_mb); +LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb); static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned long ra_max_whole_mb; + long pages_number; + int mult; spin_lock(&sbi->ll_lock); - ra_max_whole_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages); + pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; spin_unlock(&sbi->ll_lock); - seq_printf(m, "%lu\n", ra_max_whole_mb); - return 0; + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, pages_number, mult); } static ssize_t @@ -422,50 +397,52 @@ ll_max_read_ahead_whole_mb_seq_write(struct file *file, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - s64 ra_max_whole_mb, pages_number; int rc; + __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_whole_mb, - 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; - pages_number = round_up(ra_max_whole_mb, 1024 * 1024) >> PAGE_SHIFT; + pages_number >>= PAGE_SHIFT; + /* Cap this at the current max readahead window size, the readahead - * algorithm does this anyway so it's pointless to set it larger. - */ + * algorithm does this anyway so it's pointless to set it larger. 
*/ if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { - CERROR("%s: can't set max_read_ahead_whole_mb=%llu > max_read_ahead_per_file_mb=%lu\n", - ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), - PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); + int pages_shift = 20 - PAGE_SHIFT; + CERROR("%s: can't set max_read_ahead_whole_mb=%lu > " + "max_read_ahead_per_file_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), + (unsigned long)pages_number >> pages_shift, + sbi->ll_ra_info.ra_max_pages_per_file >> pages_shift); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; spin_unlock(&sbi->ll_lock); - return count; } - -LDEBUGFS_SEQ_FOPS(ll_max_read_ahead_whole_mb); +LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb); static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); struct cl_client_cache *cache = sbi->ll_cache; + int shift = 20 - PAGE_SHIFT; long max_cached_mb; long unused_mb; - max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max); - unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left)); + max_cached_mb = cache->ccc_lru_max >> shift; + unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; seq_printf(m, "users: %d\n" - "max_cached_mb: %ld\n" - "used_mb: %ld\n" - "unused_mb: %ld\n" - "reclaim_count: %u\n", + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n", atomic_read(&cache->ccc_users), max_cached_mb, max_cached_mb - unused_mb, @@ -474,9 +451,9 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) return 0; } -static ssize_t ll_max_cached_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t +ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct super_block *sb = m->private; @@ -487,20 +464,21 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, long nrpages = 0; __u16 refcheck; __s64 pages_number; - int rc; + long rc; char kernbuf[128]; - ENTRY; + if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) RETURN(rc); @@ -509,7 +487,7 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, if (pages_number < 0 || pages_number > cfs_totalram_pages()) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - PAGES_TO_MiB(cfs_totalram_pages())); + cfs_totalram_pages() >> (20 - PAGE_SHIFT)); RETURN(-ERANGE); } /* Allow enough cache so clients can make well-formed RPCs */ @@ -527,7 +505,7 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, env = cl_env_get(&refcheck); if (IS_ERR(env)) - RETURN(PTR_ERR(env)); + RETURN(rc); diff = -diff; while (diff > 0) { @@ -580,225 +558,218 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, } return rc; } +LPROC_SEQ_FOPS(ll_max_cached_mb); -LDEBUGFS_SEQ_FOPS(ll_max_cached_mb); - -static ssize_t checksums_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_checksum_seq_show(struct seq_file *m, void *v) { - struct 
ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); + seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); + return 0; } -static ssize_t checksums_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t ll_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; - int tmp; + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); int rc; + __s64 val; if (!sbi->ll_dt_exp) /* Not set up yet */ return -EAGAIN; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val) sbi->ll_flags |= LL_SBI_CHECKSUM; else sbi->ll_flags &= ~LL_SBI_CHECKSUM; - tmp = val; rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(tmp), &tmp, NULL); + KEY_CHECKSUM, sizeof(val), &val, NULL); if (rc) CWARN("Failed to set OSC checksum flags: %d\n", rc); return count; } -LUSTRE_RW_ATTR(checksums); +LPROC_SEQ_FOPS(ll_checksum); -LUSTRE_ATTR(checksum_pages, 0644, checksums_show, checksums_store); - -static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, - enum stats_track_type type) +static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - - if (sbi->ll_stats_track_type == type) - return sprintf(buf, "%d\n", sbi->ll_stats_track_id); - else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) - return sprintf(buf, "0 (all)\n"); + struct super_block *sb = m->private; - return sprintf(buf, "untracked\n"); + if (ll_s2sbi(sb)->ll_stats_track_type == type) { + seq_printf(m, "%d\n", + ll_s2sbi(sb)->ll_stats_track_id); + } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) { + seq_puts(m, "0 (all)\n"); + } else { + seq_puts(m, "untracked\n"); + } + return 0; } -static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, - size_t count, enum stats_track_type type) +static int ll_wr_track_id(struct file *file, + const char __user *buffer, unsigned long count, + void *data, enum stats_track_type type) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - unsigned long pid; + struct super_block *sb = data; int rc; + __s64 pid; - rc = kstrtoul(buffer, 10, &pid); + rc = lprocfs_str_to_s64(file, buffer, count, &pid); if (rc) return rc; + if (pid > INT_MAX || pid < 0) + return -ERANGE; - sbi->ll_stats_track_id = pid; + ll_s2sbi(sb)->ll_stats_track_id = pid; if (pid == 0) - sbi->ll_stats_track_type = STATS_TRACK_ALL; + ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; else - sbi->ll_stats_track_type = type; - lprocfs_clear_stats(sbi->ll_stats); + ll_s2sbi(sb)->ll_stats_track_type = type; + lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); return count; } -static ssize_t stats_track_pid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_track_pid_seq_show(struct seq_file *m, void *v) { - return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); + return ll_rd_track_id(m, STATS_TRACK_PID); } -static ssize_t stats_track_pid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t 
ll_track_pid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); + struct seq_file *seq = file->private_data; + return ll_wr_track_id(file, buffer, count, seq->private, + STATS_TRACK_PID); } -LUSTRE_RW_ATTR(stats_track_pid); +LPROC_SEQ_FOPS(ll_track_pid); -static ssize_t stats_track_ppid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_track_ppid_seq_show(struct seq_file *m, void *v) { - return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); + return ll_rd_track_id(m, STATS_TRACK_PPID); } -static ssize_t stats_track_ppid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_track_ppid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); + struct seq_file *seq = file->private_data; + return ll_wr_track_id(file, buffer, count, seq->private, + STATS_TRACK_PPID); } -LUSTRE_RW_ATTR(stats_track_ppid); +LPROC_SEQ_FOPS(ll_track_ppid); -static ssize_t stats_track_gid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_track_gid_seq_show(struct seq_file *m, void *v) { - return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); + return ll_rd_track_id(m, STATS_TRACK_GID); } -static ssize_t stats_track_gid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_track_gid_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); + struct seq_file *seq = file->private_data; + return ll_wr_track_id(file, buffer, count, seq->private, + STATS_TRACK_GID); } -LUSTRE_RW_ATTR(stats_track_gid); +LPROC_SEQ_FOPS(ll_track_gid); -static ssize_t statahead_running_max_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_statahead_running_max_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return snprintf(buf, 16, "%u\n", sbi->ll_sa_running_max); + seq_printf(m, "%u\n", sbi->ll_sa_running_max); + return 0; } -static ssize_t statahead_running_max_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_statahead_running_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - unsigned long val; + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); int rc; + __s64 val; - rc = kstrtoul(buffer, 0, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - if (val <= LL_SA_RUNNING_MAX) { + if (val >= 0 || val <= LL_SA_RUNNING_MAX) sbi->ll_sa_running_max = val; - return count; - } - - CERROR("Bad statahead_running_max value %lu. Valid values " - "are in the range [0, %d]\n", val, LL_SA_RUNNING_MAX); + else + CERROR("%s: bad statahead_running_max value %lld. 
Valid values " + "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), + val, LL_SA_RUNNING_MAX); - return -ERANGE; + return count; } -LUSTRE_RW_ATTR(statahead_running_max); +LPROC_SEQ_FOPS(ll_statahead_running_max); -static ssize_t statahead_max_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_statahead_max_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return sprintf(buf, "%u\n", sbi->ll_sa_max); + seq_printf(m, "%u\n", sbi->ll_sa_max); + return 0; } -static ssize_t statahead_max_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_statahead_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - unsigned long val; + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); int rc; + __s64 val; - rc = kstrtoul(buffer, 0, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - if (val <= LL_SA_RPC_MAX) + if (val >= 0 && val <= LL_SA_RPC_MAX) sbi->ll_sa_max = val; else - CERROR("Bad statahead_max value %lu. Valid values are in the range [0, %d]\n", + CERROR("%s: bad statahead_max value %lld. Valid values are in " + "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), val, LL_SA_RPC_MAX); return count; } -LUSTRE_RW_ATTR(statahead_max); +LPROC_SEQ_FOPS(ll_statahead_max); -static ssize_t statahead_agl_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_statahead_agl_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); + seq_printf(m, "%u\n", + sbi->ll_flags & LL_SBI_AGL_ENABLED ? 
1 : 0); + return 0; } -static ssize_t statahead_agl_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_statahead_agl_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -809,7 +780,7 @@ static ssize_t statahead_agl_store(struct kobject *kobj, return count; } -LUSTRE_RW_ATTR(statahead_agl); +LPROC_SEQ_FOPS(ll_statahead_agl); static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) { @@ -817,37 +788,35 @@ static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) struct ll_sb_info *sbi = ll_s2sbi(sb); seq_printf(m, "statahead total: %u\n" - "statahead wrong: %u\n" - "agl total: %u\n", - atomic_read(&sbi->ll_sa_total), - atomic_read(&sbi->ll_sa_wrong), - atomic_read(&sbi->ll_agl_total)); + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); return 0; } +LPROC_SEQ_FOPS_RO(ll_statahead_stats); -LDEBUGFS_SEQ_FOPS_RO(ll_statahead_stats); - -static ssize_t lazystatfs_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_lazystatfs_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0); + seq_printf(m, "%u\n", + (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 
1 : 0); + return 0; } -static ssize_t lazystatfs_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_lazystatfs_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; + struct seq_file *m = file->private_data; + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -858,44 +827,12 @@ static ssize_t lazystatfs_store(struct kobject *kobj, return count; } -LUSTRE_RW_ATTR(lazystatfs); +LPROC_SEQ_FOPS(ll_lazystatfs); -static ssize_t statfs_max_age_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int ll_max_easize_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - - return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_statfs_max_age); -} - -static ssize_t statfs_max_age_store(struct kobject *kobj, - struct attribute *attr, const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - unsigned int val; - int rc; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - if (val > OBD_STATFS_CACHE_MAX_AGE) - return -EINVAL; - - sbi->ll_statfs_max_age = val; - - return count; -} -LUSTRE_RW_ATTR(statfs_max_age); - -static ssize_t max_easize_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); unsigned int ealen; int rc; @@ -903,9 +840,10 @@ static ssize_t max_easize_show(struct kobject *kobj, if (rc) return rc; - return sprintf(buf, "%u\n", ealen); + seq_printf(m, "%u\n", ealen); + return 0; } -LUSTRE_RO_ATTR(max_easize); +LPROC_SEQ_FOPS_RO(ll_max_easize); /** * Get default_easize. 
@@ -918,12 +856,10 @@ LUSTRE_RO_ATTR(max_easize); * \retval 0 on success * \retval negative negated errno on failure */ -static ssize_t default_easize_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_default_easize_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); unsigned int ealen; int rc; @@ -931,7 +867,8 @@ static ssize_t default_easize_show(struct kobject *kobj, if (rc) return rc; - return sprintf(buf, "%u\n", ealen); + seq_printf(m, "%u\n", ealen); + return 0; } /** @@ -950,22 +887,24 @@ static ssize_t default_easize_show(struct kobject *kobj, * \retval positive \a count on success * \retval negative negated errno on failure */ -static ssize_t default_easize_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - unsigned int val; +static ssize_t ll_default_easize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *unused) +{ + struct seq_file *seq = file->private_data; + struct super_block *sb = (struct super_block *)seq->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + __s64 val; int rc; if (count == 0) return 0; - rc = kstrtouint(buffer, 10, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; rc = ll_set_default_mdsize(sbi, val); if (rc) @@ -973,7 +912,7 @@ static ssize_t default_easize_store(struct kobject *kobj, return count; } -LUSTRE_RW_ATTR(default_easize); +LPROC_SEQ_FOPS(ll_default_easize); static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) { @@ -997,112 +936,74 @@ static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) seq_printf(m, "\b\n"); return 0; } +LPROC_SEQ_FOPS_RO(ll_sbi_flags); -LDEBUGFS_SEQ_FOPS_RO(ll_sbi_flags); - -static ssize_t xattr_cache_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_fast_read_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - - return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); -} - -static ssize_t xattr_cache_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; - int rc; - - rc = kstrtobool(buffer, &val); - if (rc) - return rc; - - if (val && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) - return -ENOTSUPP; - - sbi->ll_xattr_cache_enabled = val; - sbi->ll_xattr_cache_set = 1; - - return count; -} -LUSTRE_RW_ATTR(xattr_cache); - -static ssize_t tiny_write_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_TINY_WRITE)); + seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); + return 0; } -static ssize_t tiny_write_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t +ll_fast_read_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; + struct seq_file *m = 
file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; spin_lock(&sbi->ll_lock); - if (val) - sbi->ll_flags |= LL_SBI_TINY_WRITE; + if (val == 1) + sbi->ll_flags |= LL_SBI_FAST_READ; else - sbi->ll_flags &= ~LL_SBI_TINY_WRITE; + sbi->ll_flags &= ~LL_SBI_FAST_READ; spin_unlock(&sbi->ll_lock); return count; } -LUSTRE_RW_ATTR(tiny_write); +LPROC_SEQ_FOPS(ll_fast_read); -static ssize_t fast_read_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ll_pio_seq_show(struct seq_file *m, void *v) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); - return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); + seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_PIO)); + return 0; } -static ssize_t fast_read_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t ll_pio_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; spin_lock(&sbi->ll_lock); - if (val) - sbi->ll_flags |= LL_SBI_FAST_READ; + if (val == 1) + sbi->ll_flags |= LL_SBI_PIO; else - sbi->ll_flags &= ~LL_SBI_FAST_READ; + sbi->ll_flags &= ~LL_SBI_PIO; spin_unlock(&sbi->ll_lock); return count; } -LUSTRE_RW_ATTR(fast_read); +LPROC_SEQ_FOPS(ll_pio); static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) { @@ -1116,8 +1017,8 @@ static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) mb = (pages * PAGE_SIZE) >> 20; seq_printf(m, "unstable_check: %8d\n" - "unstable_pages: %12ld\n" - "unstable_mb: %8d\n", + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", cache->ccc_unstable_check, pages, mb); return 0; } @@ -1129,33 +1030,32 @@ static ssize_t ll_unstable_stats_seq_write(struct file *file, struct seq_file *seq = file->private_data; struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); char kernbuf[128]; - bool val; int rc; + __s64 val; if (count == 0) return 0; if (count >= sizeof(kernbuf)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - kernbuf; - rc = kstrtobool_from_user(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; /* borrow lru lock to set the value */ spin_lock(&sbi->ll_cache->ccc_lru_lock); - sbi->ll_cache->ccc_unstable_check = val; + sbi->ll_cache->ccc_unstable_check = !!val; spin_unlock(&sbi->ll_cache->ccc_lru_lock); return count; } - -LDEBUGFS_SEQ_FOPS(ll_unstable_stats); +LPROC_SEQ_FOPS(ll_unstable_stats); static int ll_root_squash_seq_show(struct seq_file *m, void *v) { @@ -1176,11 +1076,10 @@ static ssize_t ll_root_squash_seq_write(struct file *file, struct ll_sb_info *sbi = ll_s2sbi(sb); struct root_squash_info *squash = &sbi->ll_squash; - return lprocfs_wr_root_squash(buffer, count, squash, + return lprocfs_wr_root_squash(file, 
buffer, count, squash, ll_get_fsname(sb, NULL, 0)); } - -LDEBUGFS_SEQ_FOPS(ll_root_squash); +LPROC_SEQ_FOPS(ll_root_squash); static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) { @@ -1213,7 +1112,7 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, struct root_squash_info *squash = &sbi->ll_squash; int rc; - rc = lprocfs_wr_nosquash_nids(buffer, count, squash, + rc = lprocfs_wr_nosquash_nids(file, buffer, count, squash, ll_get_fsname(sb, NULL, 0)); if (rc < 0) return rc; @@ -1222,77 +1121,80 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, return rc; } +LPROC_SEQ_FOPS(ll_nosquash_nids); -LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); - -struct ldebugfs_vars lprocfs_llite_obd_vars[] = { +struct lprocfs_vars lprocfs_llite_obd_vars[] = { + { .name = "uuid", + .fops = &ll_sb_uuid_fops }, + { .name = "fstype", + .fops = &ll_fstype_fops }, { .name = "site", .fops = &ll_site_stats_fops }, - { .name = "max_read_ahead_mb", - .fops = &ll_max_readahead_mb_fops }, - { .name = "max_read_ahead_per_file_mb", - .fops = &ll_max_readahead_per_file_mb_fops }, - { .name = "max_read_ahead_whole_mb", - .fops = &ll_max_read_ahead_whole_mb_fops }, + { .name = "blocksize", + .fops = &ll_blksize_fops }, + { .name = "stat_blocksize", + .fops = &ll_stat_blksize_fops }, + { .name = "kbytestotal", + .fops = &ll_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &ll_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &ll_kbytesavail_fops }, + { .name = "filestotal", + .fops = &ll_filestotal_fops }, + { .name = "filesfree", + .fops = &ll_filesfree_fops }, + { .name = "client_type", + .fops = &ll_client_type_fops }, + { .name = "max_read_ahead_mb", + .fops = &ll_max_readahead_mb_fops }, + { .name = "max_read_ahead_per_file_mb", + .fops = &ll_max_readahead_per_file_mb_fops }, + { .name = "max_read_ahead_whole_mb", + .fops = &ll_max_read_ahead_whole_mb_fops }, { .name = "max_cached_mb", .fops = &ll_max_cached_mb_fops }, + { .name = "checksum_pages", + .fops = &ll_checksum_fops }, + { .name = "stats_track_pid", + .fops = &ll_track_pid_fops }, + { .name = "stats_track_ppid", + .fops = &ll_track_ppid_fops }, + { .name = "stats_track_gid", + .fops = &ll_track_gid_fops }, + { .name = "statahead_max", + .fops = &ll_statahead_max_fops }, + { .name = "statahead_running_max", + .fops = &ll_statahead_running_max_fops }, + { .name = "statahead_agl", + .fops = &ll_statahead_agl_fops }, { .name = "statahead_stats", .fops = &ll_statahead_stats_fops }, + { .name = "lazystatfs", + .fops = &ll_lazystatfs_fops }, + { .name = "max_easize", + .fops = &ll_max_easize_fops }, + { .name = "default_easize", + .fops = &ll_default_easize_fops }, + { .name = "sbi_flags", + .fops = &ll_sbi_flags_fops }, + { .name = "xattr_cache", + .fops = &ll_xattr_cache_fops }, { .name = "unstable_stats", .fops = &ll_unstable_stats_fops }, - { .name = "sbi_flags", - .fops = &ll_sbi_flags_fops }, { .name = "root_squash", .fops = &ll_root_squash_fops }, { .name = "nosquash_nids", .fops = &ll_nosquash_nids_fops }, + { .name = "fast_read", + .fops = &ll_fast_read_fops, }, + { .name = "pio", + .fops = &ll_pio_fops, }, { NULL } }; #define MAX_STRING_SIZE 128 -static struct attribute *llite_attrs[] = { - &lustre_attr_blocksize.attr, - &lustre_attr_stat_blocksize.attr, - &lustre_attr_kbytestotal.attr, - &lustre_attr_kbytesfree.attr, - &lustre_attr_kbytesavail.attr, - &lustre_attr_filestotal.attr, - &lustre_attr_filesfree.attr, - &lustre_attr_client_type.attr, - &lustre_attr_fstype.attr, - &lustre_attr_uuid.attr, - 
&lustre_attr_checksums.attr, - &lustre_attr_checksum_pages.attr, - &lustre_attr_stats_track_pid.attr, - &lustre_attr_stats_track_ppid.attr, - &lustre_attr_stats_track_gid.attr, - &lustre_attr_statahead_running_max.attr, - &lustre_attr_statahead_max.attr, - &lustre_attr_statahead_agl.attr, - &lustre_attr_lazystatfs.attr, - &lustre_attr_statfs_max_age.attr, - &lustre_attr_max_easize.attr, - &lustre_attr_default_easize.attr, - &lustre_attr_xattr_cache.attr, - &lustre_attr_fast_read.attr, - &lustre_attr_tiny_write.attr, - NULL, -}; - -static void llite_kobj_release(struct kobject *kobj) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - complete(&sbi->ll_kobj_unregister); -} - -static struct kobj_type llite_ktype = { - .default_attrs = llite_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = llite_kobj_release, -}; - static const struct llite_file_opcode { __u32 opcode; __u32 type; @@ -1378,45 +1280,60 @@ static const char *ra_stat_string[] = { [RA_STAT_FAILED_REACH_END] = "failed to reach end" }; -int ll_debugfs_register_super(struct super_block *sb, const char *name) +LPROC_SEQ_FOPS_RO_TYPE(llite, name); +LPROC_SEQ_FOPS_RO_TYPE(llite, uuid); + +int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb) { + struct lprocfs_vars lvars[2]; struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); - int err, id, rc; - + char name[MAX_STRING_SIZE + 1], *ptr; + int err, id, len, rc; ENTRY; - LASSERT(sbi); - if (IS_ERR_OR_NULL(llite_root)) - goto out_ll_kset; + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; + + LASSERT(sbi != NULL); + + /* Get fsname */ + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, + lsi->lsi_lmd->lmd_profile, sb); - sbi->ll_debugfs_entry = ldebugfs_register(name, llite_root, - lprocfs_llite_obd_vars, sb); - if (IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) { - err = sbi->ll_debugfs_entry ? 
PTR_ERR(sbi->ll_debugfs_entry) : - -ENOMEM; - sbi->ll_debugfs_entry = NULL; + sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); + if (IS_ERR(sbi->ll_proc_root)) { + err = PTR_ERR(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; RETURN(err); } - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "dump_page_cache",0444, - &vvp_dump_pgcache_file_ops, sbi); + rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, + &vvp_dump_pgcache_file_ops, sbi); if (rc) CWARN("Error adding the dump_page_cache file\n"); - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "extents_stats", 0644, - &ll_rw_extents_stats_fops, sbi); + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); if (rc) CWARN("Error adding the extent_stats file\n"); - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, - "extents_stats_per_process", 0644, - &ll_rw_extents_stats_pp_fops, sbi); + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", + 0644, &ll_rw_extents_stats_pp_fops, sbi); if (rc) CWARN("Error adding the extents_stats_per_process file\n"); - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "offset_stats", 0644, - &ll_rw_offset_stats_fops, sbi); + rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); if (rc) CWARN("Error adding the offset_stats file\n"); @@ -1424,13 +1341,11 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name) sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, LPROCFS_STATS_FLAG_NONE); if (sbi->ll_stats == NULL) - GOTO(out_debugfs, err = -ENOMEM); - + GOTO(out, err = -ENOMEM); /* do counter init */ for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { - u32 type = llite_opcode_table[id].type; + __u32 type = llite_opcode_table[id].type; void *ptr = NULL; - if (type & LPROCFS_TYPE_REGS) ptr = "regs"; else if (type & LPROCFS_TYPE_BYTES) @@ -1442,78 +1357,98 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name) (type & LPROCFS_CNTR_AVGMINMAX), llite_opcode_table[id].opname, ptr); } - - err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "stats", - sbi->ll_stats); + err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); if (err) - GOTO(out_stats, err); + GOTO(out, err); sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), LPROCFS_STATS_FLAG_NONE); if (sbi->ll_ra_stats == NULL) - GOTO(out_stats, err = -ENOMEM); + GOTO(out, err = -ENOMEM); for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) lprocfs_counter_init(sbi->ll_ra_stats, id, 0, ra_stat_string[id], "pages"); - - err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "read_ahead_stats", - sbi->ll_ra_stats); - if (err) - GOTO(out_ra_stats, err); - -out_ll_kset: - /* Yes we also register sysfs mount kset here as well */ - sbi->ll_kset.kobj.parent = llite_kobj; - sbi->ll_kset.kobj.ktype = &llite_ktype; - init_completion(&sbi->ll_kobj_unregister); - err = kobject_set_name(&sbi->ll_kset.kobj, "%s", name); + err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); if (err) - GOTO(out_ra_stats, err); + GOTO(out, err); - err = kset_register(&sbi->ll_kset); - if (err) - GOTO(out_ra_stats, err); - lsi->lsi_kobj = kobject_get(&sbi->ll_kset.kobj); - - RETURN(0); -out_ra_stats: - lprocfs_free_stats(&sbi->ll_ra_stats); -out_stats: - lprocfs_free_stats(&sbi->ll_stats); -out_debugfs: - ldebugfs_remove(&sbi->ll_debugfs_entry); + err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + if (err) + GOTO(out, err); +out: + if 
(err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } RETURN(err); } -void ll_debugfs_unregister_super(struct super_block *sb) +int lprocfs_ll_register_obd(struct super_block *sb, const char *obdname) { - struct lustre_sb_info *lsi = s2lsi(sb); + struct lprocfs_vars lvars[2]; struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct proc_dir_entry *dir; + char name[MAX_STRING_SIZE + 1]; + int err; + ENTRY; + + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; - if (!IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) - ldebugfs_remove(&sbi->ll_debugfs_entry); + LASSERT(sbi != NULL); + LASSERT(obdname != NULL); - if (sbi->ll_dt_obd) - sysfs_remove_link(&sbi->ll_kset.kobj, - sbi->ll_dt_obd->obd_type->typ_name); + obd = class_name2obd(obdname); - if (sbi->ll_md_obd) - sysfs_remove_link(&sbi->ll_kset.kobj, - sbi->ll_md_obd->obd_type->typ_name); + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); - kobject_put(lsi->lsi_kobj); + dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); + if (dir == NULL) + GOTO(out, err = -ENOMEM); + + snprintf(name, MAX_STRING_SIZE, "common_name"); + lvars[0].fops = &llite_name_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + GOTO(out, err); - kset_unregister(&sbi->ll_kset); - wait_for_completion(&sbi->ll_kobj_unregister); + snprintf(name, MAX_STRING_SIZE, "uuid"); + lvars[0].fops = &llite_uuid_fops; + err = lprocfs_add_vars(dir, lvars, obd); + if (err) + GOTO(out, err); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); +out: + if (err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } + RETURN(err); +} + +void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) +{ + if (sbi->ll_proc_root) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } } #undef MAX_STRING_SIZE +#define pct(a,b) (b ? a * 100 / b : 0) + static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, struct seq_file *seq, int which) { @@ -1537,14 +1472,14 @@ static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, w = pp_info->pp_w_hist.oh_buckets[i]; read_cum += r; write_cum += w; - end = BIT(i + LL_HIST_START - units); - seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u | " - "%14lu %4u %4u\n", start, *unitp, end, *unitp, + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | " + "%14lu %4lu %4lu\n", start, *unitp, end, *unitp, (i == LL_HIST_MAX - 1) ? 
'+' : ' ', r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), pct(write_cum, write_tot)); start = end; - if (start == BIT(10)) { + if (start == 1<<10) { start = 1; units += 10; unitp++; @@ -1599,7 +1534,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(buf, len); + value = ll_stats_pid_write(file, buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1616,7 +1551,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, return len; } -LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats_pp); +LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) { @@ -1657,7 +1592,7 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(buf, len); + value = ll_stats_pid_write(file, buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1674,8 +1609,7 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, return len; } - -LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats); +LPROC_SEQ_FOPS(ll_rw_extents_stats); void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct ll_file_data *file, loff_t pos, @@ -1712,15 +1646,15 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); } - for (i = 0; (count >= BIT(LL_HIST_START + i)) && - (i < (LL_HIST_MAX - 1)); i++); - if (rw == 0) { - io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; - } else { - io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; - } + for(i = 0; (count >= (1 << LL_HIST_START << i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } spin_unlock(&sbi->ll_pp_extent_lock); spin_lock(&sbi->ll_process_lock); @@ -1806,7 +1740,7 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { if (offset[i].rw_pid != 0) seq_printf(seq, - "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", offset[i].rw_op == READ ? 'R' : 'W', offset[i].rw_pid, offset[i].rw_range_start, @@ -1820,7 +1754,7 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { if (process[i].rw_pid != 0) seq_printf(seq, - "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", + "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", process[i].rw_op == READ ? 
'R' : 'W', process[i].rw_pid, process[i].rw_range_start, @@ -1847,7 +1781,7 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(buf, len); + value = ll_stats_pid_write(file, buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1866,4 +1800,43 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, return len; } -LDEBUGFS_SEQ_FOPS(ll_rw_offset_stats); +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. + */ +static __s64 ll_stats_pid_write(struct file *file, const char __user *buf, + size_t len) +{ + __s64 value = 1; + int rc; + char kernbuf[16]; + + rc = lprocfs_str_to_s64(file, buf, len, &value); + + if (rc < 0 && len < sizeof(kernbuf)) { + + if (lprocfs_copy_from_user(file, kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; +} + +LPROC_SEQ_FOPS(ll_rw_offset_stats); +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index dea41a48b589a..ae7101b1885f2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,6 +46,7 @@ #include #include #include +#include #include "llite_internal.h" static int ll_create_it(struct inode *dir, struct dentry *dentry, @@ -137,9 +138,6 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, inode_has_no_xattr(inode); unlock_new_inode(inode); } - } else if (is_bad_inode(inode)) { - iput(inode); - inode = ERR_PTR(-ESTALE); } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { rc = ll_update_inode(inode, md); CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", @@ -183,314 +181,168 @@ int ll_test_inode_by_fid(struct inode *inode, void *opaque) return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); } -static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) -{ - struct lu_env *env; - struct ll_inode_info *lli = ll_i2info(inode); - __u16 refcheck; - int rc; - ENTRY; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); - - /* reach MDC layer to flush data under the DoM ldlm lock */ - rc = cl_object_flush(env, lli->lli_clob, lock); - if (rc == -ENODATA) { - CDEBUG(D_INODE, "inode "DFID" layout has no DoM stripe\n", - PFID(ll_inode2fid(inode))); - /* most likely result of layout change, do nothing */ - rc = 0; - } - - cl_env_put(env, &refcheck); - RETURN(rc); -} - -static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) { - struct inode *inode = ll_inode_from_resource_lock(lock); - __u64 bits = to_cancel; + struct lustre_handle lockh; int rc; - ENTRY; - if (!inode) { - /* That means the inode is evicted most likely and may cause - * the skipping of lock cleanups below, so print the message - * about that in log. - */ - if (lock->l_resource->lr_lvb_inode) - LDLM_DEBUG(lock, - "can't take inode for the lock (%sevicted)\n", - lock->l_resource->lr_lvb_inode->i_state & - I_FREEING ? 
"" : "not "); - RETURN_EXIT; - } - - if (!fid_res_name_eq(ll_inode2fid(inode), - &lock->l_resource->lr_name)) { - LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", - PFID(ll_inode2fid(inode)), inode); - LBUG(); - } + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: { + struct inode *inode = ll_inode_from_resource_lock(lock); + __u64 bits = lock->l_policy_data.l_inodebits.bits; - if (bits & MDS_INODELOCK_XATTR) { - if (S_ISDIR(inode->i_mode)) - ll_i2info(inode)->lli_def_stripe_offset = -1; - ll_xattr_cache_destroy(inode); - bits &= ~MDS_INODELOCK_XATTR; - } + /* Inode is set to lock->l_resource->lr_lvb_inode + * for mdc - bug 24555 */ + LASSERT(lock->l_ast_data == NULL); - /* For OPEN locks we differentiate between lock modes - * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ - if (bits & MDS_INODELOCK_OPEN) - ll_have_md_lock(inode, &bits, lock->l_req_mode); + if (inode == NULL) + break; - if (bits & MDS_INODELOCK_OPEN) { - fmode_t fmode; + /* Invalidate all dentries associated with this inode */ + LASSERT(ldlm_is_canceling(lock)); - switch (lock->l_req_mode) { - case LCK_CW: - fmode = FMODE_WRITE; - break; - case LCK_PR: - fmode = FMODE_EXEC; - break; - case LCK_CR: - fmode = FMODE_READ; - break; - default: - LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); LBUG(); } - ll_md_real_close(inode, fmode); - - bits &= ~MDS_INODELOCK_OPEN; - } - - if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM | - MDS_INODELOCK_DOM)) - ll_have_md_lock(inode, &bits, LCK_MINMODE); + if (bits & MDS_INODELOCK_XATTR) { + if (S_ISDIR(inode->i_mode)) + ll_i2info(inode)->lli_def_stripe_offset = -1; + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } - if (bits & MDS_INODELOCK_DOM) { - rc = ll_dom_lock_cancel(inode, lock); - if (rc < 0) - CDEBUG(D_INODE, "cannot flush DoM data " - DFID": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); + LBUG(); + } - if (bits & MDS_INODELOCK_LAYOUT) { - struct cl_object_conf conf = { - .coc_opc = OBJECT_CONF_INVALIDATE, - .coc_inode = inode, - }; + ll_md_real_close(inode, fmode); - rc = ll_layout_conf(inode, &conf); - if (rc < 0) - CDEBUG(D_INODE, "cannot invalidate layout of " - DFID": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } + bits &= ~MDS_INODELOCK_OPEN; + } - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; + + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + 
CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } - lli->lli_update_atime = 1; - } + if (bits & MDS_INODELOCK_UPDATE) { + struct ll_inode_info *lli = ll_i2info(inode); + lli->lli_update_atime = 1; + } - if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); - CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " - "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), - lli, PFID(&lli->lli_pfid)); - truncate_inode_pages(inode->i_mapping, 0); + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); - if (unlikely(!fid_is_zero(&lli->lli_pfid))) { - struct inode *master_inode = NULL; - unsigned long hash; + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; - /* This is slave inode, since all of the child dentry - * is connected on the master inode, so we have to - * invalidate the negative children on master inode */ - CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", - PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid)); + /* This is slave inode, since all of the child + * dentry is connected on the master inode, so + * we have to invalidate the negative children + * on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), + PFID(&lli->lli_pfid)); - hash = cl_fid_build_ino(&lli->lli_pfid, + hash = cl_fid_build_ino(&lli->lli_pfid, ll_need_32bit_api(ll_i2sbi(inode))); - /* Do not lookup the inode with ilookup5, otherwise - * it will cause dead lock, - * 1. Client1 send chmod req to the MDT0, then on MDT0, - * it enqueues master and all of its slaves lock, - * (mdt_attr_set() -> mdt_lock_slaves()), after gets - * master and stripe0 lock, it will send the enqueue - * req (for stripe1) to MDT1, then MDT1 finds the lock - * has been granted to client2. Then MDT1 sends blocking - * ast to client2. - * 2. At the same time, client2 tries to unlink - * the striped dir (rm -rf striped_dir), and during - * lookup, it will hold the master inode of the striped - * directory, whose inode state is NEW, then tries to - * revalidate all of its slaves, (ll_prep_inode()-> - * ll_iget()->ll_read_inode2()-> ll_update_inode().). - * And it will be blocked on the server side because - * of 1. - * 3. Then the client get the blocking_ast req, cancel - * the lock, but being blocked if using ->ilookup5()), - * because master inode state is NEW. */ - master_inode = ilookup5_nowait(inode->i_sb, hash, - ll_test_inode_by_fid, + /* Do not lookup the inode with ilookup5, + * otherwise it will cause dead lock, + * + * 1. Client1 send chmod req to the MDT0, then + * on MDT0, it enqueues master and all of its + * slaves lock, (mdt_attr_set() -> + * mdt_lock_slaves()), after gets master and + * stripe0 lock, it will send the enqueue req + * (for stripe1) to MDT1, then MDT1 finds the + * lock has been granted to client2. Then MDT1 + * sends blocking ast to client2. + * + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and + * during lookup, it will hold the master inode + * of the striped directory, whose inode state + * is NEW, then tries to revalidate all of its + * slaves, (ll_prep_inode()->ll_iget()-> + * ll_read_inode2()-> ll_update_inode().). 
And + * it will be blocked on the server side because + * of 1. + * + * 3. Then the client get the blocking_ast req, + * cancel the lock, but being blocked if using + * ->ilookup5()), because master inode state is + * NEW. */ + master_inode = ilookup5_nowait(inode->i_sb, + hash, ll_test_inode_by_fid, (void *)&lli->lli_pfid); - if (master_inode) { - ll_invalidate_negative_children(master_inode); - iput(master_inode); + if (master_inode) { + ll_invalidate_negative_children( + master_inode); + iput(master_inode); + } + } else { + ll_invalidate_negative_children(inode); } - } else { - ll_invalidate_negative_children(inode); - } - } - - if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && - inode->i_sb->s_root != NULL && - inode != inode->i_sb->s_root->d_inode) - ll_invalidate_aliases(inode); - - if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) - forget_all_cached_acls(inode); - - iput(inode); - RETURN_EXIT; -} - -/* Check if the given lock may be downgraded instead of canceling and - * that convert is really needed. */ -int ll_md_need_convert(struct ldlm_lock *lock) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - struct inode *inode; - __u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits; - __u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted; - enum ldlm_mode mode = LCK_MINMODE; - - if (!lock->l_conn_export || - !exp_connect_lock_convert(lock->l_conn_export)) - return 0; - - if (!wanted || !bits || ldlm_is_cancel(lock)) - return 0; - - /* do not convert locks other than DOM for now */ - if (!((bits | wanted) & MDS_INODELOCK_DOM)) - return 0; - - /* We may have already remaining bits in some other lock so - * lock convert will leave us just extra lock for the same bit. - * Check if client has other lock with the same bits and the same - * or lower mode and don't convert if any. - */ - switch (lock->l_req_mode) { - case LCK_PR: - mode = LCK_PR; - /* fallthrough */ - case LCK_PW: - mode |= LCK_CR; - break; - case LCK_CW: - mode = LCK_CW; - /* fallthrough */ - case LCK_CR: - mode |= LCK_CR; - break; - default: - /* do not convert other modes */ - return 0; - } - - /* is lock is too old to be converted? */ - lock_res_and_lock(lock); - if (ktime_after(ktime_get(), - ktime_add(lock->l_last_used, - ktime_set(ns->ns_dirty_age_limit, 0)))) { - unlock_res_and_lock(lock); - return 0; - } - unlock_res_and_lock(lock); - - inode = ll_inode_from_resource_lock(lock); - ll_have_md_lock(inode, &bits, mode); - iput(inode); - return !!(bits); -} - -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *ld, - void *data, int flag) -{ - struct lustre_handle lockh; - int rc; - - ENTRY; - - switch (flag) { - case LDLM_CB_BLOCKING: - { - __u64 cancel_flags = LCF_ASYNC; - - /* if lock convert is not needed then still have to - * pass lock via ldlm_cli_convert() to keep all states - * correct, set cancel_bits to full lock bits to cause - * full cancel to happen. 
- */ - if (!ll_md_need_convert(lock)) { - lock_res_and_lock(lock); - lock->l_policy_data.l_inodebits.cancel_bits = - lock->l_policy_data.l_inodebits.bits; - unlock_res_and_lock(lock); } - rc = ldlm_cli_convert(lock, cancel_flags); - if (!rc) - RETURN(0); - /* continue with cancel otherwise */ - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, cancel_flags); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); - RETURN(rc); - } - break; - } - case LDLM_CB_CANCELING: - { - __u64 to_cancel = lock->l_policy_data.l_inodebits.bits; - /* Nothing to do for non-granted locks */ - if (!ldlm_is_granted(lock)) - break; + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + inode != inode->i_sb->s_root->d_inode) + ll_invalidate_aliases(inode); - /* If 'ld' is supplied then bits to be cancelled are passed - * implicitly by lock converting and cancel_bits from 'ld' - * should be used. Otherwise full cancel is being performed - * and lock inodebits are used. - * - * Note: we cannot rely on cancel_bits in lock itself at this - * moment because they can be changed by concurrent thread, - * so ldlm_cli_inodebits_convert() pass cancel bits implicitly - * in 'ld' parameter. - */ - if (ld) { - /* partial bits cancel allowed only during convert */ - LASSERT(ldlm_is_converting(lock)); - /* mask cancel bits by lock bits so only no any unused - * bits are passed to ll_lock_cancel_bits() - */ - to_cancel &= ld->l_policy_data.l_inodebits.cancel_bits; - } - ll_lock_cancel_bits(lock, to_cancel); + iput(inode); break; } default: @@ -610,8 +462,7 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) static int ll_lookup_it_finish(struct ptlrpc_request *request, struct lookup_intent *it, - struct inode *parent, struct dentry **de, - void *secctx, __u32 secctxlen, ktime_t kstart) + struct inode *parent, struct dentry **de) { struct inode *inode = NULL; __u64 bits = 0; @@ -624,56 +475,20 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, it->it_disposition); if (!it_disposition(it, DISP_LOOKUP_NEG)) { - struct req_capsule *pill = &request->rq_pill; - struct mdt_body *body = req_capsule_server_get(pill, - &RMF_MDT_BODY); - - rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); - if (rc) - RETURN(rc); - - ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); - /* OPEN can return data if lock has DoM+LAYOUT bits set */ - if (it->it_op & IT_OPEN && - bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) - ll_dom_finish_open(inode, request); - - /* We used to query real size from OSTs here, but actually - * this is not needed. For stat() calls size would be updated - * from subsequent do_revalidate()->ll_inode_revalidate_it() in - * 2.4 and - * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 - * Everybody else who needs correct file size would call - * ll_glimpse_size or some equivalent themselves anyway. - * Also see bug 7198. - */ - - /* If security context was returned by MDT, put it in - * inode now to save an extra getxattr from security hooks, - * and avoid deadlock. 
- */ - if (body->mbo_valid & OBD_MD_SECCTX) { - secctx = req_capsule_server_get(pill, &RMF_FILE_SECCTX); - secctxlen = req_capsule_get_size(pill, - &RMF_FILE_SECCTX, - RCL_SERVER); - - if (secctxlen) - CDEBUG(D_SEC, "server returned security context" - " for "DFID"\n", - PFID(ll_inode2fid(inode))); - } - - if (secctx && secctxlen) { - inode_lock(inode); - rc = security_inode_notifysecctx(inode, secctx, - secctxlen); - inode_unlock(inode); - if (rc) - CWARN("cannot set security context for " - DFID": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + + /* We used to query real size from OSTs here, but actually + this is not needed. For stat() calls size would be updated + from subsequent do_revalidate()->ll_inode_revalidate_it() in + 2.4 and + vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + Everybody else who needs correct file size would call + ll_glimpse_size or some equivalent themselves anyway. + Also see bug 7198. */ } /* Only hash *de if it is unhashed (new dentry). @@ -690,9 +505,9 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, if (bits & MDS_INODELOCK_LOOKUP) d_lustre_revalidate(*de); } else if (!it_disposition(it, DISP_OPEN_CREATE)) { - /* - * If file was created on the server, the dentry is revalidated - * in ll_create_it if the lock allows for it. + /* If file created on server, don't depend on parent UPDATE + * lock to unhide it. It is left hidden and next lookup can + * find it in ll_splice_alias. */ /* Check that parent has UPDATE lock. */ struct lookup_intent parent_it = { @@ -717,18 +532,11 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, } } - if (it_disposition(it, DISP_OPEN_CREATE)) { - ll_stats_ops_tally(ll_i2sbi(parent), LPROC_LL_MKNOD, - ktime_us_delta(ktime_get(), kstart)); - } - GOTO(out, rc = 0); out: - if (rc != 0 && it->it_op & IT_OPEN) { - ll_intent_drop_lock(it); + if (rc != 0 && it->it_op & IT_OPEN) ll_open_cleanup((*de)->d_sb, request); - } return rc; } @@ -737,16 +545,13 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, struct lookup_intent *it, void **secctx, __u32 *secctxlen) { - ktime_t kstart = ktime_get(); struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct dentry *save = dentry, *retval; struct ptlrpc_request *req = NULL; struct md_op_data *op_data = NULL; - __u32 opc; - int rc; - char secctx_name[XATTR_NAME_MAX + 1]; - - ENTRY; + __u32 opc; + int rc; + ENTRY; if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) RETURN(ERR_PTR(-ENAMETOOLONG)); @@ -794,32 +599,10 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, &op_data->op_file_secctx_size); if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); - if (secctx) + if (secctx != NULL) *secctx = op_data->op_file_secctx; - if (secctxlen) + if (secctxlen != NULL) *secctxlen = op_data->op_file_secctx_size; - } else { - if (secctx) - *secctx = NULL; - if (secctxlen) - *secctxlen = 0; - } - - /* ask for security context upon intent */ - if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN)) { - /* get name of security xattr to request to server */ - rc = ll_listsecurity(parent, secctx_name, - sizeof(secctx_name)); - if (rc < 0) { - CDEBUG(D_SEC, "cannot get security xattr name for " - DFID": rc = %d\n", - PFID(ll_inode2fid(parent)), rc); - } else if (rc > 0) { - op_data->op_file_secctx_name = secctx_name; - op_data->op_file_secctx_name_size = rc; - 
CDEBUG(D_SEC, "'%.*s' is security xattr for "DFID"\n", - rc, secctx_name, PFID(ll_inode2fid(parent))); - } } rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, @@ -853,15 +636,11 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); - /* dir layout may change */ - ll_unlock_md_op_lsm(op_data); - rc = ll_lookup_it_finish(req, it, parent, &dentry, - secctx ? *secctx : NULL, - secctxlen ? *secctxlen : 0, kstart); - if (rc != 0) { - ll_intent_release(it); - GOTO(out, retval = ERR_PTR(rc)); - } + rc = ll_lookup_it_finish(req, it, parent, &dentry); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } if ((it->it_op & IT_OPEN) && dentry->d_inode && !S_ISREG(dentry->d_inode->i_mode) && @@ -874,7 +653,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, out: if (op_data != NULL && !IS_ERR(op_data)) { - if (secctx && secctxlen) { + if (secctx != NULL && secctxlen != NULL) { /* caller needs sec ctx info, so reset it in op_data to * prevent it from being freed */ op_data->op_file_secctx = NULL; @@ -1199,7 +978,6 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, void *secctx, __u32 secctxlen) { struct inode *inode; - __u64 bits = 0; int rc = 0; ENTRY; @@ -1215,7 +993,8 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) RETURN(PTR_ERR(inode)); - if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && secctx) { + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && + secctx != NULL) { inode_lock(inode); /* must be done before d_instantiate, because it calls * security_d_instantiate, which means a getxattr if security @@ -1234,10 +1013,6 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, RETURN(rc); } - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); - if (bits & MDS_INODELOCK_LOOKUP) - d_lustre_revalidate(dentry); - RETURN(0); } @@ -1377,38 +1152,38 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, { struct qstr *name = &dchild->d_name; int err; - ENTRY; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p) mode %o dev %x\n", name->len, name->name, PFID(ll_inode2fid(dir)), dir, - mode, rdev); + mode, rdev); if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) mode &= ~current_umask(); - switch (mode & S_IFMT) { - case 0: - mode |= S_IFREG; - /* fallthrough */ - case S_IFREG: - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + /* Fall through */ + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), LUSTRE_OPC_MKNOD); - break; - case S_IFDIR: - err = -EPERM; - break; - default: - err = -EINVAL; - } + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); - RETURN(err); + RETURN(err); } #ifdef HAVE_IOP_ATOMIC_OPEN diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.c b/drivers/staging/lustrefsx/lustre/llite/range_lock.c index 7a4c9c4cb766a..56e129165c4be 100644 --- a/drivers/staging/lustrefsx/lustre/llite/range_lock.c +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.c @@ -33,11 +33,8 @@ * Author: Prakash Surya * Author: Bobi Jam */ -#ifdef HAVE_SCHED_HEADERS 
-#include -#endif #include "range_lock.h" -#include +#include /** * Initialize a range lock tree diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c b/drivers/staging/lustrefsx/lustre/llite/rw.c index a5f3f9c187d57..a00ccef398702 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -369,7 +369,7 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, io->ci_obj, ra.cra_end, page_idx); /* update read ahead RPC size. * NB: it's racy but doesn't matter */ - if (ras->ras_rpc_size != ra.cra_rpc_size && + if (ras->ras_rpc_size > ra.cra_rpc_size && ra.cra_rpc_size > 0) ras->ras_rpc_size = ra.cra_rpc_size; /* trim it to align with optimal RPC size */ @@ -714,10 +714,7 @@ static void ras_increase_window(struct inode *inode, wlen = min(ras->ras_window_len + ras->ras_rpc_size, ra->ra_max_pages_per_file); - if (wlen < ras->ras_rpc_size) - ras->ras_window_len = wlen; - else - ras->ras_window_len = ras_align(ras, wlen, NULL); + ras->ras_window_len = ras_align(ras, wlen, NULL); } } @@ -1077,7 +1074,7 @@ void ll_cl_remove(struct file *file, const struct lu_env *env) write_unlock(&fd->fd_lock); } -int ll_io_read_page(const struct lu_env *env, struct cl_io *io, +static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, struct cl_page *page, struct file *file) { struct inode *inode = vvp_object_inode(page->cp_obj); @@ -1085,7 +1082,6 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_readahead_state *ras = &fd->fd_ras; struct cl_2queue *queue = &io->ci_queue; - struct cl_sync_io *anchor = NULL; struct vvp_page *vpg; int rc = 0; bool uptodate; @@ -1113,10 +1109,6 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, cl_page_export(env, page, 1); cl_page_disown(env, io, page); } else { - anchor = &vvp_env_info(env)->vti_anchor; - cl_sync_io_init(anchor, 1, &cl_sync_io_end); - page->cp_sync_io = anchor; - cl_2queue_add(queue, page); } @@ -1137,30 +1129,10 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, task_io_account_read(PAGE_SIZE * count); } - - if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */ - rc = cl_sync_io_wait(env, anchor, 0); - - cl_page_assume(env, io, page); - cl_page_list_del(env, &queue->c2_qout, page); - - if (!PageUptodate(cl_page_vmpage(page))) { - /* Failed to read a mirror, discard this page so that - * new page can be created with new mirror. - * - * TODO: this is not needed after page reinit - * route is implemented */ - cl_page_discard(env, io, page); - } - cl_page_disown(env, io, page); - } - - /* TODO: discard all pages until page reinit route is implemented */ - cl_page_list_discard(env, io, &queue->c2_qin); - - /* Unlock unsent read pages in case of error. */ + /* + * Unlock unsent pages in case of error. 
+ */ cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); RETURN(rc); @@ -1171,25 +1143,24 @@ int ll_readpage(struct file *file, struct page *vmpage) struct inode *inode = file_inode(file); struct cl_object *clob = ll_i2info(inode)->lli_clob; struct ll_cl_context *lcc; - const struct lu_env *env = NULL; - struct cl_io *io = NULL; + const struct lu_env *env; + struct cl_io *io; struct cl_page *page; int result; ENTRY; lcc = ll_cl_find(file); - if (lcc != NULL) { - env = lcc->lcc_env; - io = lcc->lcc_io; + if (lcc == NULL) { + unlock_page(vmpage); + RETURN(-EIO); } + env = lcc->lcc_env; + io = lcc->lcc_io; if (io == NULL) { /* fast read */ struct inode *inode = file_inode(file); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_readahead_state *ras = &fd->fd_ras; - struct lu_env *local_env = NULL; - unsigned long fast_read_pages = - max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_size); struct vvp_page *vpg; result = -ENODATA; @@ -1202,16 +1173,11 @@ int ll_readpage(struct file *file, struct page *vmpage) RETURN(result); } - if (!env) { - local_env = cl_env_percpu_get(); - env = local_env; - } - vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); if (vpg->vpg_defer_uptodate) { enum ras_update_flags flags = LL_RAS_HIT; - if (lcc && lcc->lcc_type == LCC_MMAP) + if (lcc->lcc_type == LCC_MMAP) flags |= LL_RAS_MMAP; /* For fast read, it updates read ahead state only @@ -1226,7 +1192,7 @@ int ll_readpage(struct file *file, struct page *vmpage) * the case, we can't do fast IO because we will need * a cl_io to issue the RPC. */ if (ras->ras_window_start + ras->ras_window_len < - ras->ras_next_readahead + fast_read_pages) { + ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES) { /* export the page and skip io stack */ vpg->vpg_ra_used = 1; cl_page_export(env, page, 1); @@ -1234,14 +1200,8 @@ int ll_readpage(struct file *file, struct page *vmpage) } } - /* release page refcount before unlocking the page to ensure - * the object won't be destroyed in the calling path of - * cl_page_put(). Please see comment in ll_releasepage(). */ - cl_page_put(env, page); unlock_page(vmpage); - if (local_env) - cl_env_percpu_put(local_env); - + cl_page_put(env, page); RETURN(result); } @@ -1251,7 +1211,6 @@ int ll_readpage(struct file *file, struct page *vmpage) LASSERT(page->cp_type == CPT_CACHEABLE); if (likely(!PageUptodate(vmpage))) { cl_page_assume(env, io, page); - result = ll_io_read_page(env, io, page, file); } else { /* Page from a non-object file. */ @@ -1265,3 +1224,28 @@ int ll_readpage(struct file *file, struct page *vmpage) } RETURN(result); } + +int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, enum cl_req_type crt) +{ + struct cl_2queue *queue; + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + queue = &io->ci_queue; + cl_2queue_init_page(queue, page); + + result = cl_io_submit_sync(env, io, crt, queue, 0); + LASSERT(cl_page_is_owned(page, io)); + + if (crt == CRT_READ) + /* + * in CRT_WRITE case page is left locked even in case of + * error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + return result; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c index 9a1f0b6021baf..9cba2d0b5e8e3 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw26.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
* Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -577,83 +577,45 @@ ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, /** * Prepare partially written-to page for a write. - * @pg is owned when passed in and disowned when it returns non-zero result to - * the caller. */ static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg, struct file *file) + struct cl_page *pg) { struct cl_attr *attr = vvp_env_thread_attr(env); struct cl_object *obj = io->ci_obj; struct vvp_page *vpg = cl_object_page_slice(obj, pg); loff_t offset = cl_offset(obj, vvp_index(vpg)); int result; - ENTRY; cl_object_attr_lock(obj); result = cl_object_attr_get(env, obj, attr); cl_object_attr_unlock(obj); - if (result) { - cl_page_disown(env, io, pg); - GOTO(out, result); - } - - /* - * If are writing to a new page, no need to read old data. - * The extent locking will have updated the KMS, and for our - * purposes here we can treat it like i_size. - */ - if (attr->cat_kms <= offset) { - char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); - - memset(kaddr, 0, cl_page_size(obj)); - ll_kunmap_atomic(kaddr, KM_USER0); - GOTO(out, result = 0); - } - - if (vpg->vpg_defer_uptodate) { - vpg->vpg_ra_used = 1; - GOTO(out, result = 0); - } - - result = ll_io_read_page(env, io, pg, file); - if (result) - GOTO(out, result); + if (result == 0) { + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); - /* ll_io_read_page() disowns the page */ - result = cl_page_own(env, io, pg); - if (!result) { - if (!PageUptodate(cl_page_vmpage(pg))) { - cl_page_disown(env, io, pg); - result = -EIO; - } - } else if (result == -ENOENT) { - /* page was truncated */ - result = -EAGAIN; + memset(kaddr, 0, cl_page_size(obj)); + ll_kunmap_atomic(kaddr, KM_USER0); + } else if (vpg->vpg_defer_uptodate) + vpg->vpg_ra_used = 1; + else + result = ll_page_sync_io(env, io, pg, CRT_READ); } - EXIT; - -out: return result; } -static int ll_tiny_write_begin(struct page *vmpage) -{ - /* Page must be present, up to date, dirty, and not in writeback. 
*/ - if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || - PageWriteback(vmpage)) - return -ENODATA; - - return 0; -} - static int ll_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct ll_cl_context *lcc = NULL; + struct ll_cl_context *lcc; const struct lu_env *env = NULL; - struct cl_io *io = NULL; + struct cl_io *io; struct cl_page *page = NULL; struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; @@ -664,27 +626,17 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, int result = 0; ENTRY; - CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); + CDEBUG(D_PAGE, "Writing %lu of %d to %d bytes\n", index, from, len); lcc = ll_cl_find(file); if (lcc == NULL) { - vmpage = grab_cache_page_nowait(mapping, index); - result = ll_tiny_write_begin(vmpage); - GOTO(out, result); + io = NULL; + GOTO(out, result = -EIO); } env = lcc->lcc_env; io = lcc->lcc_io; - if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) { - /* direct IO failed because it couldn't clean up cached pages, - * this causes a problem for mirror write because the cached - * page may belong to another mirror, which will result in - * problem submitting the I/O. */ - GOTO(out, result = -EBUSY); - } - -again: /* To avoid deadlock, try to lock page first. */ vmpage = grab_cache_page_nowait(mapping, index); @@ -737,18 +689,13 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, /* TODO: can be optimized at OSC layer to check if it * is a lockless IO. In that case, it's not necessary * to read the data. */ - result = ll_prepare_partial_page(env, io, page, file); - if (result) { - /* vmpage should have been unlocked */ - put_page(vmpage); - vmpage = NULL; - - if (result == -EAGAIN) - goto again; - GOTO(out, result); - } + result = ll_prepare_partial_page(env, io, page); + if (result == 0) + SetPageUptodate(vmpage); } } + if (result < 0) + cl_page_unassume(env, io, page); EXIT; out: if (result < 0) { @@ -756,7 +703,6 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, unlock_page(vmpage); put_page(vmpage); } - /* On tiny_write failure, page and io are always null. */ if (!IS_ERR_OR_NULL(page)) { lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); @@ -770,47 +716,6 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, RETURN(result); } -static int ll_tiny_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *vmpage) -{ - struct cl_page *clpage = (struct cl_page *) vmpage->private; - loff_t kms = pos+copied; - loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; - __u16 refcheck; - struct lu_env *env = cl_env_get(&refcheck); - int rc = 0; - - ENTRY; - - if (IS_ERR(env)) { - rc = PTR_ERR(env); - goto out; - } - - /* This page is dirty in cache, so it should have a cl_page pointer - * set in vmpage->private. - */ - LASSERT(clpage != NULL); - - if (copied == 0) - goto out_env; - - /* Update the underlying size information in the OSC/LOV objects this - * page is part of. - */ - cl_page_touch(env, clpage, to); - -out_env: - cl_env_put(env, &refcheck); - -out: - /* Must return page unlocked. 
*/ - unlock_page(vmpage); - - RETURN(rc); -} - static int ll_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *vmpage, void *fsdata) @@ -827,14 +732,6 @@ static int ll_write_end(struct file *file, struct address_space *mapping, put_page(vmpage); - CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); - - if (lcc == NULL) { - result = ll_tiny_write_end(file, mapping, pos, len, copied, - vmpage); - GOTO(out, result); - } - LASSERT(lcc != NULL); env = lcc->lcc_env; page = lcc->lcc_page; @@ -864,7 +761,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping, if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) unplug = true; - CL_PAGE_DEBUG(D_VFSTRACE, env, page, + CL_PAGE_DEBUG(D_PAGE, env, page, "queued page: %d.\n", plist->pl_nr); } else { cl_page_disown(env, io, page); @@ -876,14 +773,11 @@ static int ll_write_end(struct file *file, struct address_space *mapping, /* page list is not contiguous now, commit it now */ unplug = true; } - if (unplug || io->u.ci_wr.wr_sync) + if (unplug || io->u.ci_rw.rw_sync) result = vvp_io_write_commit(env, io); if (result < 0) io->ci_result = result; - - -out: RETURN(result >= 0 ? copied : result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c index 397712909b3f4..5b2af025d28f9 100644 --- a/drivers/staging/lustrefsx/lustre/llite/statahead.c +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -330,58 +330,6 @@ __sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) return (index == sai->sai_index_wait); } -/* finish async stat RPC arguments */ -static void sa_fini_data(struct md_enqueue_info *minfo) -{ - ll_unlock_md_op_lsm(&minfo->mi_data); - iput(minfo->mi_dir); - OBD_FREE_PTR(minfo); -} - -static int ll_statahead_interpret(struct ptlrpc_request *req, - struct md_enqueue_info *minfo, int rc); - -/* - * prepare arguments for async stat RPC. - */ -static struct md_enqueue_info * -sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - struct ldlm_enqueue_info *einfo; - struct md_op_data *op_data; - - OBD_ALLOC_PTR(minfo); - if (minfo == NULL) - return ERR_PTR(-ENOMEM); - - op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, - entry->se_qstr.name, entry->se_qstr.len, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - OBD_FREE_PTR(minfo); - return (struct md_enqueue_info *)op_data; - } - - if (child == NULL) - op_data->op_fid2 = entry->se_fid; - - minfo->mi_it.it_op = IT_GETATTR; - minfo->mi_dir = igrab(dir); - minfo->mi_cb = ll_statahead_interpret; - minfo->mi_cbdata = entry; - - einfo = &minfo->mi_einfo; - einfo->ei_type = LDLM_IBITS; - einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); - einfo->ei_cb_bl = ll_md_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = NULL; - einfo->ei_cbdata = NULL; - - return minfo; -} - /* * release resources used in async stat RPC, update entry state and wakeup if * scanner process it waiting on this entry. 
@@ -398,7 +346,8 @@ sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) if (minfo) { entry->se_minfo = NULL; ll_intent_release(&minfo->mi_it); - sa_fini_data(minfo); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); } if (req) { @@ -544,11 +493,10 @@ static void ll_sai_put(struct ll_statahead_info *sai) static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) { struct ll_inode_info *lli = ll_i2info(inode); - u64 index = lli->lli_agl_index; - ktime_t expire; + __u64 index = lli->lli_agl_index; int rc; - ENTRY; + LASSERT(list_empty(&lli->lli_agl_list)); /* AGL maybe fall behind statahead with one entry */ @@ -591,9 +539,8 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) * relative rare. AGL can ignore such case, and it will not muchly * affect the performance. */ - expire = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); - if (ktime_to_ns(lli->lli_glimpse_time) && - ktime_before(expire, lli->lli_glimpse_time)) { + if (lli->lli_glimpse_time != 0 && + cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { up_write(&lli->lli_glimpse_sem); lli->lli_agl_index = 0; iput(inode); @@ -605,7 +552,7 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) cl_agl(inode); lli->lli_agl_index = 0; - lli->lli_glimpse_time = ktime_get(); + lli->lli_glimpse_time = cfs_time_current(); up_write(&lli->lli_glimpse_sem); CDEBUG(D_READA, "Handled (init) async glimpse: inode= " @@ -633,14 +580,14 @@ static void sa_instantiate(struct ll_statahead_info *sai, int rc = 0; ENTRY; - LASSERT(entry->se_handle != 0); + LASSERT(entry->se_handle != 0); - minfo = entry->se_minfo; - it = &minfo->mi_it; - req = entry->se_req; - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EFAULT); + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); child = entry->se_inode; if (child != NULL) { @@ -655,25 +602,25 @@ static void sa_instantiate(struct ll_statahead_info *sai, it->it_lock_handle = entry->se_handle; rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); - if (rc != 1) - GOTO(out, rc = -EAGAIN); + if (rc != 1) + GOTO(out, rc = -EAGAIN); - rc = ll_prep_inode(&child, req, dir->i_sb, it); - if (rc) - GOTO(out, rc); + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + GOTO(out, rc); CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", ll_get_fsname(child->i_sb, NULL, 0), entry->se_qstr.len, entry->se_qstr.name, PFID(ll_inode2fid(child)), child); - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); - entry->se_inode = child; + entry->se_inode = child; - if (agl_should_run(sai, child)) - ll_agl_add(sai, child, entry->se_index); + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); - EXIT; + EXIT; out: /* sa_make_ready() will drop ldlm ibits lock refcount by calling @@ -737,7 +684,8 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, if (rc != 0) { ll_intent_release(it); - sa_fini_data(minfo); + iput(dir); + OBD_FREE_PTR(minfo); } else { /* release ibits lock ASAP to avoid deadlock when statahead * thread enqueues lock on parent in readdir and another @@ -745,7 +693,6 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, * unlink. 
*/ handle = it->it_lock_handle; ll_intent_drop_lock(it); - ll_unlock_md_op_lsm(&minfo->mi_data); } spin_lock(&lli->lli_sa_lock); @@ -775,6 +722,53 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, RETURN(rc); } +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo) +{ + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); +} + +/* + * prepare arguments for async stat RPC. + */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (minfo == NULL) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (child == NULL) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + return minfo; +} + /* async stat for file not found in dcache */ static int sa_lookup(struct inode *dir, struct sa_entry *entry) { @@ -816,20 +810,22 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry, if (d_mountpoint(dentry)) RETURN(1); - minfo = sa_prep_data(dir, inode, entry); - if (IS_ERR(minfo)) - RETURN(PTR_ERR(minfo)); - entry->se_inode = igrab(inode); rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), NULL); if (rc == 1) { entry->se_handle = it.it_lock_handle; ll_intent_release(&it); - sa_fini_data(minfo); RETURN(1); } + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) { + entry->se_inode = NULL; + iput(inode); + RETURN(PTR_ERR(minfo)); + } + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); if (rc < 0) { entry->se_inode = NULL; @@ -926,7 +922,6 @@ static int ll_agl_thread(void *arg) list_del_init(&clli->lli_agl_list); spin_unlock(&plli->lli_agl_lock); ll_agl_trigger(&clli->lli_vfs_inode, sai); - cond_resched(); } else { spin_unlock(&plli->lli_agl_lock); } @@ -1004,7 +999,8 @@ static int ll_statahead_thread(void *arg) CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", sai, parent->d_name.len, parent->d_name.name); - OBD_ALLOC_PTR(op_data); + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); if (IS_ERR(op_data)) GOTO(out, rc = PTR_ERR(op_data)); @@ -1026,16 +1022,8 @@ static int ll_statahead_thread(void *arg) struct lu_dirpage *dp; struct lu_dirent *ent; - op_data = ll_prep_md_op_data(op_data, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - break; - } - sai->sai_in_readpage = 1; page = ll_get_dir_page(dir, op_data, pos, &chain); - ll_unlock_md_op_lsm(op_data); sai->sai_in_readpage = 0; if (IS_ERR(page)) { rc = PTR_ERR(page); @@ -1121,7 +1109,7 @@ static int ll_statahead_thread(void *arg) ll_agl_trigger(&clli->lli_vfs_inode, sai); - cond_resched(); + spin_lock(&lli->lli_agl_lock); } spin_unlock(&lli->lli_agl_lock); @@ -1610,6 +1598,7 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry) spin_lock(&lli->lli_sa_lock); lli->lli_sai = NULL; spin_unlock(&lli->lli_sa_lock); + 
atomic_dec(&ll_i2sbi(parent->d_inode)->ll_sa_running); rc = PTR_ERR(task); CERROR("can't start ll_sa thread, rc: %d\n", rc); GOTO(out, rc); diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c index 84e5de9ea8782..7118cce98561b 100644 --- a/drivers/staging/lustrefsx/lustre/llite/super25.c +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -95,8 +95,12 @@ struct super_operations lustre_super_operations = .show_options = ll_show_options, }; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); + static int __init lustre_init(void) { + struct proc_dir_entry *entry; struct lnet_process_id lnet_id; struct timespec64 ts; int i, rc, seed[2]; @@ -128,9 +132,15 @@ static int __init lustre_init(void) if (ll_file_data_slab == NULL) GOTO(out_cache, rc = -ENOMEM); - rc = llite_tunables_register(); - if (rc) + entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n", + rc); GOTO(out_cache, rc); + } + + proc_lustre_fs_root = entry; cfs_get_random_bytes(seed, sizeof(seed)); @@ -140,7 +150,7 @@ static int __init lustre_init(void) if (LNetGetId(i, &lnet_id) == -ENOENT) break; - if (lnet_id.nid != LNET_NID_LO_0) + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) seed[0] ^= LNET_NIDADDR(lnet_id.nid); } @@ -149,7 +159,7 @@ static int __init lustre_init(void) rc = vvp_global_init(); if (rc != 0) - GOTO(out_tunables, rc); + GOTO(out_proc, rc); cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, LCT_REMEMBER | LCT_NOREF); @@ -164,6 +174,7 @@ static int __init lustre_init(void) lustre_register_client_fill_super(ll_fill_super); lustre_register_kill_super_cb(ll_kill_super); + lustre_register_client_process_config(ll_process_config); RETURN(0); @@ -171,11 +182,15 @@ static int __init lustre_init(void) cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); out_vvp: vvp_global_fini(); -out_tunables: - llite_tunables_unregister(); +out_proc: + lprocfs_remove(&proc_lustre_fs_root); out_cache: - kmem_cache_destroy(ll_inode_cachep); - kmem_cache_destroy(ll_file_data_slab); + if (ll_inode_cachep != NULL) + kmem_cache_destroy(ll_inode_cachep); + + if (ll_file_data_slab != NULL) + kmem_cache_destroy(ll_file_data_slab); + return rc; } @@ -183,20 +198,14 @@ static void __exit lustre_exit(void) { lustre_register_client_fill_super(NULL); lustre_register_kill_super_cb(NULL); + lustre_register_client_process_config(NULL); - llite_tunables_unregister(); + lprocfs_remove(&proc_lustre_fs_root); ll_xattr_fini(); cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); vvp_global_fini(); -#ifdef HAVE_INODE_I_RCU - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. 
- */ - rcu_barrier(); -#endif kmem_cache_destroy(ll_inode_cachep); kmem_cache_destroy(ll_file_data_slab); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c index d36aed3919268..2f640635afea2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -53,6 +53,7 @@ */ static struct kmem_cache *ll_thread_kmem; +struct kmem_cache *vvp_lock_kmem; struct kmem_cache *vvp_object_kmem; static struct kmem_cache *vvp_session_kmem; static struct kmem_cache *vvp_thread_kmem; @@ -63,6 +64,11 @@ static struct lu_kmem_descr vvp_caches[] = { .ckd_name = "ll_thread_kmem", .ckd_size = sizeof(struct ll_thread_info), }, + { + .ckd_cache = &vvp_lock_kmem, + .ckd_name = "vvp_lock_kmem", + .ckd_size = sizeof(struct vvp_lock), + }, { .ckd_cache = &vvp_object_kmem, .ckd_name = "vvp_object_kmem", @@ -355,10 +361,26 @@ int cl_sb_fini(struct super_block *sb) /**************************************************************************** * - * debugfs/lustre/llite/$MNT/dump_page_cache + * /proc/fs/lustre/llite/$MNT/dump_page_cache * ****************************************************************************/ +/* + * To represent contents of a page cache as a byte stream, following + * information if encoded in 64bit offset: + * + * - file hash bucket in lu_site::ls_hash[] 28bits + * + * - how far file is from bucket head 4bits + * + * - page index 32bits + * + * First two data identify a file in the cache uniquely. + */ + +#define PGC_OBJ_SHIFT (32 + 4) +#define PGC_DEPTH_SHIFT (32) + struct vvp_pgcache_id { unsigned vpi_bucket; unsigned vpi_depth; @@ -368,18 +390,22 @@ struct vvp_pgcache_id { struct lu_object_header *vpi_obj; }; -struct vvp_seq_private { - struct ll_sb_info *vsp_sbi; - struct lu_env *vsp_env; - u16 vsp_refcheck; - struct cl_object *vsp_clob; - struct vvp_pgcache_id vvp_id; - /* - * prev_pos is the 'pos' of the last object returned - * by ->start of ->next. 
- */ - loff_t vvp_prev_pos; -}; +static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) +{ + CLASSERT(sizeof(pos) == sizeof(__u64)); + + id->vpi_index = pos & 0xffffffff; + id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; + id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT); +} + +static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) +{ + return + ((__u64)id->vpi_index) | + ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | + ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); +} static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *data) @@ -387,12 +413,12 @@ static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct vvp_pgcache_id *id = data; struct lu_object_header *hdr = cfs_hash_object(hs, hnode); - if (lu_object_is_dying(hdr)) - return 0; - if (id->vpi_curdep-- > 0) return 0; /* continue */ + if (lu_object_is_dying(hdr)) + return 1; + cfs_hash_get(hs, hnode); id->vpi_obj = hdr; return 1; @@ -404,7 +430,8 @@ static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, { LASSERT(lu_device_is_cl(dev)); - id->vpi_obj = NULL; + id->vpi_depth &= 0xf; + id->vpi_obj = NULL; id->vpi_curdep = id->vpi_depth; cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, @@ -418,42 +445,52 @@ static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, return lu2cl(lu_obj); } lu_object_put(env, lu_object_top(id->vpi_obj)); + + } else if (id->vpi_curdep > 0) { + id->vpi_depth = 0xf; } return NULL; } -static struct page *vvp_pgcache_current(struct vvp_seq_private *priv) +static loff_t vvp_pgcache_find(const struct lu_env *env, + struct lu_device *dev, loff_t pos) { - struct lu_device *dev = &priv->vsp_sbi->ll_cl->cd_lu_dev; + struct cl_object *clob; + struct lu_site *site; + struct vvp_pgcache_id id; - while (1) { - struct inode *inode; - struct page *vmpage; - int nr; - - if (!priv->vsp_clob) { - struct cl_object *clob; - - while ((clob = vvp_pgcache_obj(priv->vsp_env, dev, &priv->vvp_id)) == NULL && - ++(priv->vvp_id.vpi_bucket) < CFS_HASH_NHLIST(dev->ld_site->ls_obj_hash)) - priv->vvp_id.vpi_depth = 0; - if (!clob) - return NULL; - priv->vsp_clob = clob; - priv->vvp_id.vpi_index = 0; - } + site = dev->ld_site; + vvp_pgcache_id_unpack(pos, &id); - inode = vvp_object_inode(priv->vsp_clob); - nr = find_get_pages_contig(inode->i_mapping, priv->vvp_id.vpi_index, 1, &vmpage); - if (nr > 0) { - priv->vvp_id.vpi_index = vmpage->index; - return vmpage; + while (1) { + if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) + return ~0ULL; + clob = vvp_pgcache_obj(env, dev, &id); + if (clob != NULL) { + struct inode *inode = vvp_object_inode(clob); + struct page *vmpage; + int nr; + + nr = find_get_pages_contig(inode->i_mapping, + id.vpi_index, 1, &vmpage); + if (nr > 0) { + id.vpi_index = vmpage->index; + /* Cant support over 16T file */ + nr = !(vmpage->index > 0xffffffff); + put_page(vmpage); + } + + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + if (nr > 0) + return vvp_pgcache_id_pack(&id); } - lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); - cl_object_put(priv->vsp_env, priv->vsp_clob); - priv->vsp_clob = NULL; - priv->vvp_id.vpi_index = 0; - priv->vvp_id.vpi_depth++; + /* to the next object. 
*/ + ++id.vpi_depth; + id.vpi_depth &= 0xf; + if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) + return ~0ULL; + id.vpi_index = 0; } } @@ -495,72 +532,92 @@ static void vvp_pgcache_page_show(const struct lu_env *env, static int vvp_pgcache_show(struct seq_file *f, void *v) { - struct vvp_seq_private *priv = f->private; - struct page *vmpage = v; - struct cl_page *page; - - seq_printf(f, "%8lx@" DFID ": ", vmpage->index, - PFID(lu_object_fid(&priv->vsp_clob->co_lu))); - lock_page(vmpage); - page = cl_vmpage_page(vmpage, priv->vsp_clob); - unlock_page(vmpage); - put_page(vmpage); - - if (page) { - vvp_pgcache_page_show(priv->vsp_env, f, page); - cl_page_put(priv->vsp_env, page); - } else { - seq_puts(f, "missing\n"); - } - - return 0; -} - -static void vvp_pgcache_rewind(struct vvp_seq_private *priv) -{ - if (priv->vvp_prev_pos) { - memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); - priv->vvp_prev_pos = 0; - if (priv->vsp_clob) { - lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", - current); - cl_object_put(priv->vsp_env, priv->vsp_clob); - } - priv->vsp_clob = NULL; - } -} - -static struct page *vvp_pgcache_next_page(struct vvp_seq_private *priv) -{ - priv->vvp_id.vpi_index += 1; - return vvp_pgcache_current(priv); + loff_t pos; + struct ll_sb_info *sbi; + struct cl_object *clob; + struct lu_env *env; + struct vvp_pgcache_id id; + __u16 refcheck; + int result; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + pos = *(loff_t *) v; + vvp_pgcache_id_unpack(pos, &id); + sbi = f->private; + clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); + if (clob != NULL) { + struct inode *inode = vvp_object_inode(clob); + struct cl_page *page = NULL; + struct page *vmpage; + + result = find_get_pages_contig(inode->i_mapping, + id.vpi_index, 1, &vmpage); + if (result > 0) { + lock_page(vmpage); + page = cl_vmpage_page(vmpage, clob); + unlock_page(vmpage); + + put_page(vmpage); + } + + seq_printf(f, "%8x@"DFID": ", id.vpi_index, + PFID(lu_object_fid(&clob->co_lu))); + if (page != NULL) { + vvp_pgcache_page_show(env, f, page); + cl_page_put(env, page); + } else + seq_puts(f, "missing\n"); + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + } else + seq_printf(f, "%llx missing\n", pos); + cl_env_put(env, &refcheck); + result = 0; + } else + result = PTR_ERR(env); + return result; } static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) { - struct vvp_seq_private *priv = f->private; - - if (*pos == 0) { - vvp_pgcache_rewind(priv); - } else if (*pos == priv->vvp_prev_pos) { - /* Return the current item */; - } else { - WARN_ON(*pos != priv->vvp_prev_pos + 1); - priv->vvp_id.vpi_index += 1; - } + struct ll_sb_info *sbi; + struct lu_env *env; + __u16 refcheck; + + sbi = f->private; - priv->vvp_prev_pos = *pos; - return vvp_pgcache_current(priv); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) + pos = ERR_PTR(-EFBIG); + else { + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, + *pos); + if (*pos == ~0ULL) + pos = NULL; + } + cl_env_put(env, &refcheck); + } + return pos; } static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) { - struct vvp_seq_private *priv = f->private; + struct ll_sb_info *sbi; + struct lu_env *env; + __u16 refcheck; - WARN_ON(*pos != priv->vvp_prev_pos); - *pos += 1; - priv->vvp_prev_pos = *pos; - return vvp_pgcache_next_page(priv); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + *pos = 
vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); + if (*pos == ~0ULL) + pos = NULL; + cl_env_put(env, &refcheck); + } + return pos; } static void vvp_pgcache_stop(struct seq_file *f, void *v) @@ -577,44 +634,22 @@ static struct seq_operations vvp_pgcache_ops = { static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) { - struct vvp_seq_private *priv; - - priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); - if (!priv) - return -ENOMEM; - - priv->vsp_sbi = inode->i_private; - priv->vsp_env = cl_env_get(&priv->vsp_refcheck); - priv->vsp_clob = NULL; - memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); - if (IS_ERR(priv->vsp_env)) { - int err = PTR_ERR(priv->vsp_env); - - seq_release_private(inode, filp); - return err; + struct ll_sb_info *sbi = PDE_DATA(inode); + struct seq_file *seq; + int result; + + result = seq_open(filp, &vvp_pgcache_ops); + if (result == 0) { + seq = filp->private_data; + seq->private = sbi; } - - return 0; -} - -static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct vvp_seq_private *priv = seq->private; - - if (priv->vsp_clob) { - lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); - cl_object_put(priv->vsp_env, priv->vsp_clob); - } - - cl_env_put(priv->vsp_env, &priv->vsp_refcheck); - return seq_release_private(inode, file); + return result; } -const struct file_operations vvp_dump_pgcache_file_ops = { - .owner = THIS_MODULE, - .open = vvp_dump_pgcache_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = vvp_dump_pgcache_seq_release, +const struct proc_ops vvp_dump_pgcache_file_ops = { + PROC_OWNER(THIS_MODULE) + .proc_open = vvp_dump_pgcache_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h index 0fb9b51a8f618..9973d646ae703 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,6 +37,7 @@ #ifndef VVP_INTERNAL_H #define VVP_INTERNAL_H +#include #include enum obd_notify_event; @@ -60,13 +61,7 @@ struct vvp_io { /** super class */ struct cl_io_slice vui_cl; struct cl_io_lock_link vui_link; - /** - * I/O vector information to or from which read/write is going. - */ - struct iov_iter *vui_iter; - /** - * Total size for the left IO. - */ + /** Total size for the left IO. */ size_t vui_tot_count; union { @@ -93,7 +88,6 @@ struct vvp_io { * check that flags are from filemap_fault */ bool ft_flags_valid; - struct cl_page_list ft_queue; } fault; struct { struct pipe_inode_info *vui_pipe; @@ -117,7 +111,6 @@ struct vvp_io { * File descriptor against which IO is done. */ struct ll_file_data *vui_fd; - struct kiocb *vui_iocb; /* Readahead state. 
*/ pgoff_t vui_ra_start; @@ -131,6 +124,7 @@ extern struct lu_device_type vvp_device_type; extern struct lu_context_key vvp_session_key; extern struct lu_context_key vvp_thread_key; +extern struct kmem_cache *vvp_lock_kmem; extern struct kmem_cache *vvp_object_kmem; struct vvp_thread_info { @@ -138,7 +132,6 @@ struct vvp_thread_info { struct cl_lock_descr vti_descr; struct cl_io vti_io; struct cl_attr vti_attr; - struct cl_sync_io vti_anchor; }; static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) @@ -258,6 +251,10 @@ struct vvp_device { struct cl_device *vdv_next; }; +struct vvp_lock { + struct cl_lock_slice vlk_cl; +}; + static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) { return &vdv->vdv_cl.cd_lu_dev; @@ -296,6 +293,11 @@ static inline struct page *cl2vm_page(const struct cl_page_slice *slice) return cl2vvp_page(slice)->vpg_page; } +static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice) +{ + return container_of(slice, struct vvp_lock, vlk_cl); +} + #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK # define CLOBINVRNT(env, clob, expr) \ do { \ @@ -315,6 +317,8 @@ int lov_read_and_clear_async_rc(struct cl_object *clob); int vvp_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io); int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); int vvp_page_init(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index); struct lu_object *vvp_object_alloc(const struct lu_env *env, @@ -324,6 +328,6 @@ struct lu_object *vvp_object_alloc(const struct lu_env *env, int vvp_global_init(void); void vvp_global_fini(void); -extern const struct file_operations vvp_dump_pgcache_file_ops; +extern const struct proc_ops vvp_dump_pgcache_file_ops; #endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index 6d8070c5b8bfd..1bcadeb7cf0da 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -299,14 +299,12 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) struct cl_object *obj = io->ci_obj; struct vvp_io *vio = cl2vvp_io(env, ios); struct inode *inode = vvp_object_inode(obj); - __u32 gen = 0; int rc; - ENTRY; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " - "need write layout %d, restore needed %d\n", + "need write layout %d, restore needed %d\n", PFID(lu_object_fid(&obj->co_lu)), io->ci_ignore_layout, io->ci_verify_layout, vio->vui_layout_gen, io->ci_need_write_intent, @@ -323,40 +321,18 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) * block on layout lock held by the MDT * as MDT will not send new layout in lvb (see LU-3124) * we have to explicitly fetch it, all this will be done - * by ll_layout_refresh(). - * Even if ll_layout_restore() returns zero, it doesn't mean - * that restore has been successful. Therefore it sets - * ci_verify_layout so that it will check layout at the end - * of this function. 
+ * by ll_layout_refresh() */ - if (rc) { + if (rc == 0) { + io->ci_restore_needed = 0; + io->ci_need_restart = 1; + io->ci_verify_layout = 1; + } else { io->ci_restore_needed = 1; io->ci_need_restart = 0; io->ci_verify_layout = 0; io->ci_result = rc; - GOTO(out, rc); - } - - io->ci_restore_needed = 0; - - /* Even if ll_layout_restore() returns zero, it doesn't mean - * that restore has been successful. Therefore it should verify - * if there was layout change and restart I/O correspondingly. - */ - ll_layout_refresh(inode, &gen); - io->ci_need_restart = vio->vui_layout_gen != gen; - if (io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - DFID" layout changed from %d to %d.\n", - PFID(lu_object_fid(&obj->co_lu)), - vio->vui_layout_gen, gen); - /* today successful restore is the only possible - * case */ - /* restore was done, clear restoring state */ - ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), - LLIF_FILE_RESTORING); } - GOTO(out, 0); } /** @@ -364,29 +340,47 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) * RPC. */ if (io->ci_need_write_intent) { - enum layout_intent_opc opc = LAYOUT_INTENT_WRITE; + loff_t start = 0; + loff_t end = OBD_OBJECT_EOF; io->ci_need_write_intent = 0; LASSERT(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); - CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n", - PFID(lu_object_fid(&obj->co_lu)), io->ci_type, - PEXT(&io->ci_write_intent)); + if (io->ci_type == CIT_WRITE) { + if (!cl_io_is_append(io)) { + start = io->u.ci_rw.rw_range.cir_pos; + end = start + io->u.ci_rw.rw_range.cir_count; + } + } else if (cl_io_is_trunc(io)) { + /* for writes, e_end is endpos, the location of the file + * pointer after the write is completed, so it is not accessed. + * For truncate, 'end' is the size, and *is* acccessed. + * In other words, writes are [start, end), but truncate is + * [start, size], where both are included. So add 1 to the + * size when creating the write intent to account for this. 
+ */ + end = io->u.ci_setattr.sa_attr.lvb_size + 1; + } else { /* mkwrite */ + pgoff_t index = io->u.ci_fault.ft_index; - if (cl_io_is_trunc(io)) - opc = LAYOUT_INTENT_TRUNC; + start = cl_offset(io->ci_obj, index); + end = cl_offset(io->ci_obj, index + 1); + } - rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent); + CDEBUG(D_VFSTRACE, DFID" write layout, type %u [%llu, %llu)\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + start, end); + rc = ll_layout_write_intent(inode, start, end); io->ci_result = rc; if (!rc) io->ci_need_restart = 1; - GOTO(out, rc); } - if (!io->ci_need_restart && - !io->ci_ignore_layout && io->ci_verify_layout) { + if (!io->ci_ignore_layout && io->ci_verify_layout) { + __u32 gen = 0; + /* check layout version */ ll_layout_refresh(inode, &gen); io->ci_need_restart = vio->vui_layout_gen != gen; @@ -395,11 +389,13 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) DFID" layout changed from %d to %d.\n", PFID(lu_object_fid(&obj->co_lu)), vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), + LLIF_FILE_RESTORING); } - GOTO(out, 0); } -out: - EXIT; } static void vvp_io_fault_fini(const struct lu_env *env, @@ -430,8 +426,7 @@ static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) return CLM_READ; } -static int vvp_mmap_locks(const struct lu_env *env, - struct vvp_io *vio, struct cl_io *io) +static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) { struct vvp_thread_info *vti = vvp_env_info(env); struct mm_struct *mm = current->mm; @@ -450,18 +445,14 @@ static int vvp_mmap_locks(const struct lu_env *env, if (!cl_is_normalio(env, io)) RETURN(0); - /* nfs or loop back device write */ - if (vio->vui_iter == NULL) - RETURN(0); - /* No MM (e.g. NFS)? No vmas too. 
*/ if (mm == NULL) RETURN(0); - if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter)) + if (!iter_is_iovec(&io->u.ci_rw.rw_iter) && !iov_iter_is_kvec(&io->u.ci_rw.rw_iter)) RETURN(0); - for (i = *vio->vui_iter; + for (i = io->u.ci_rw.rw_iter; iov_iter_count(&i); iov_iter_advance(&i, iov.iov_len)) { iov = iov_iter_iovec(&i); @@ -537,38 +528,37 @@ static void vvp_io_advance(const struct lu_env *env, return; vio->vui_tot_count -= nob; - iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); -} - -static void vvp_io_update_iov(const struct lu_env *env, - struct vvp_io *vio, struct cl_io *io) -{ - size_t size = io->u.ci_rw.crw_count; - - if (!cl_is_normalio(env, io) || vio->vui_iter == NULL) - return; - - iov_iter_truncate(vio->vui_iter, size); + if (io->ci_pio) { + iov_iter_advance(&io->u.ci_rw.rw_iter, nob); + io->u.ci_rw.rw_iocb.ki_pos = io->u.ci_rw.rw_range.cir_pos; +#ifdef HAVE_KIOCB_KI_LEFT + io->u.ci_rw.rw_iocb.ki_left = vio->vui_tot_count; +#elif defined(HAVE_KI_NBYTES) + io->u.ci_rw.rw_iocb.ki_nbytes = vio->vui_tot_count; +#endif + } else { + /* It was truncated to stripe size in vvp_io_rw_lock() */ + iov_iter_reexpand(&io->u.ci_rw.rw_iter, vio->vui_tot_count); + } } static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, enum cl_lock_mode mode, loff_t start, loff_t end) { - struct vvp_io *vio = vvp_env_io(env); int result; int ast_flags = 0; LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); ENTRY; - vvp_io_update_iov(env, vio, io); + if (cl_is_normalio(env, io)) + iov_iter_truncate(&io->u.ci_rw.rw_iter, + io->u.ci_rw.rw_range.cir_count); - if (io->u.ci_rw.crw_nonblock) + if (io->u.ci_rw.rw_nonblock) ast_flags |= CEF_NONBLOCK; - if (io->ci_lock_no_expand) - ast_flags |= CEF_LOCK_NO_EXPAND; - result = vvp_mmap_locks(env, vio, io); + result = vvp_mmap_locks(env, io); if (result == 0) result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); @@ -579,13 +569,13 @@ static int vvp_io_read_lock(const struct lu_env *env, const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; - struct cl_io_rw_common *rd = &io->u.ci_rd.rd; - int result; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + int rc; ENTRY; - result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, - rd->crw_pos + rd->crw_count - 1); - RETURN(result); + rc = vvp_io_rw_lock(env, io, CLM_READ, range->cir_pos, + range->cir_pos + range->cir_count - 1); + RETURN(rc); } static int vvp_io_fault_lock(const struct lu_env *env, @@ -604,26 +594,27 @@ static int vvp_io_fault_lock(const struct lu_env *env, } static int vvp_io_write_lock(const struct lu_env *env, - const struct cl_io_slice *ios) + const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; loff_t start; loff_t end; + int rc; - if (io->u.ci_wr.wr_append) { + ENTRY; + if (io->u.ci_rw.rw_append) { start = 0; end = OBD_OBJECT_EOF; } else { - start = io->u.ci_wr.wr.crw_pos; - end = start + io->u.ci_wr.wr.crw_count - 1; + start = io->u.ci_rw.rw_range.cir_pos; + end = start + io->u.ci_rw.rw_range.cir_count - 1; } - - RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); + rc = vvp_io_rw_lock(env, io, CLM_WRITE, start, end); + RETURN(rc); } static int vvp_io_setattr_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) - { return 0; } @@ -640,12 +631,12 @@ static int vvp_io_setattr_lock(const struct lu_env *env, __u64 new_size; __u32 enqflags = 0; - if (cl_io_is_trunc(io)) { - new_size = io->u.ci_setattr.sa_attr.lvb_size; - if (new_size == 0) - enqflags = CEF_DISCARD_DATA; - } else { - unsigned int valid = 
io->u.ci_setattr.sa_avalid; + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + unsigned int valid = io->u.ci_setattr.sa_valid; if (!(valid & TIMES_SET_FLAGS)) return 0; @@ -694,16 +685,16 @@ static int vvp_io_setattr_time(const struct lu_env *env, int result; unsigned valid = CAT_CTIME; - cl_object_attr_lock(obj); - attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; - if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) { - attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; - valid |= CAT_ATIME; - } - if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) { - attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; - valid |= CAT_MTIME; - } + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } result = cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); @@ -725,7 +716,7 @@ static int vvp_io_setattr_start(const struct lu_env *env, inode_lock(inode); } - if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS) + if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS) return vvp_io_setattr_time(env, ios); return 0; @@ -773,36 +764,34 @@ static int vvp_io_read_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - loff_t pos = io->u.ci_rd.rd.crw_pos; - long cnt = io->u.ci_rd.rd.crw_count; - long tot = vio->vui_tot_count; - int exceed = 0; - int result; - ENTRY; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + loff_t pos = range->cir_pos; /* for generic_file_splice_read() only */ + size_t tot = vio->vui_tot_count; + int exceed = 0; + int result; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", file_dentry(file)->d_name.name, - pos, pos + cnt); + range->cir_pos, range->cir_pos + range->cir_count); if (vio->vui_io_subtype == IO_NORMAL) down_read(&lli->lli_trunc_sem); if (!can_populate_pages(env, io, inode)) - RETURN(0); + return 0; - /* Unless this is reading a sparse file, otherwise the lock has already - * been acquired so vvp_prep_size() is an empty op. 
*/ - result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); + result = vvp_prep_size(env, obj, io, range->cir_pos, tot, &exceed); if (result != 0) - RETURN(result); + return result; else if (exceed != 0) - GOTO(out, result); + goto out; LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "Read ino %lu, %lu bytes, offset %lld, size %llu\n", - inode->i_ino, cnt, pos, i_size_read(inode)); + inode->i_ino, range->cir_count, range->cir_pos, + i_size_read(inode)); /* turn off the kernel's read-ahead */ vio->vui_fd->fd_file->f_ra.ra_pages = 0; @@ -810,7 +799,7 @@ static int vvp_io_read_start(const struct lu_env *env, /* initialize read-ahead window once per syscall */ if (!vio->vui_ra_valid) { vio->vui_ra_valid = true; - vio->vui_ra_start = cl_index(obj, pos); + vio->vui_ra_start = cl_index(obj, range->cir_pos); vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); ll_ras_enter(file); } @@ -819,12 +808,17 @@ static int vvp_io_read_start(const struct lu_env *env, file_accessed(file); switch (vio->vui_io_subtype) { case IO_NORMAL: - LASSERT(vio->vui_iocb->ki_pos == pos); - result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); + LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, + "ki_pos %lld [%lld, %lld)\n", + io->u.ci_rw.rw_iocb.ki_pos, + range->cir_pos, range->cir_pos + range->cir_count); + result = generic_file_read_iter(&io->u.ci_rw.rw_iocb, + &io->u.ci_rw.rw_iter); break; case IO_SPLICE: result = generic_file_splice_read(file, &pos, - vio->u.splice.vui_pipe, cnt, + vio->u.splice.vui_pipe, + range->cir_count, vio->u.splice.vui_flags); /* LU-1109: do splice read stripe by stripe otherwise if it * may make nfsd stuck if this read occupied all internal pipe @@ -835,13 +829,14 @@ static int vvp_io_read_start(const struct lu_env *env, CERROR("Wrong IO type %u\n", vio->vui_io_subtype); LBUG(); } - GOTO(out, result); out: if (result >= 0) { - if (result < cnt) + if (result < range->cir_count) io->ci_continue = 0; io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, vio->vui_fd, + range->cir_pos, result, READ); result = 0; } @@ -897,7 +892,6 @@ static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, SetPageUptodate(cl_page_vmpage(page)); cl_page_disown(env, io, page); - /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -916,7 +910,6 @@ static void write_commit_callback(const struct lu_env *env, struct cl_io *io, cl_page_disown(env, io, page); - /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); cl_page_put(env, page); } @@ -1017,7 +1010,6 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) cl_page_disown(env, io, page); - /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -1035,14 +1027,10 @@ static int vvp_io_write_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + bool lock_inode = !lli->lli_inode_locked && + !IS_NOSEC(inode); ssize_t result = 0; - loff_t pos = io->u.ci_wr.wr.crw_pos; - size_t cnt = io->u.ci_wr.wr.crw_count; - bool lock_inode = !IS_NOSEC(inode); - size_t nob = io->ci_nob; - struct iov_iter iter; - size_t written = 0; - ENTRY; if (vio->vui_io_subtype == IO_NORMAL) @@ -1057,28 +1045,29 @@ static int vvp_io_write_start(const struct lu_env *env, * out-of-order writes. 
*/ ll_merge_attr(env, inode); - pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); - vio->vui_iocb->ki_pos = pos; + range->cir_pos = i_size_read(inode); + io->u.ci_rw.rw_iocb.ki_pos = range->cir_pos; } else { - LASSERTF(vio->vui_iocb->ki_pos == pos, + LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, "ki_pos %lld [%lld, %lld)\n", - vio->vui_iocb->ki_pos, - pos, pos + cnt); + io->u.ci_rw.rw_iocb.ki_pos, + range->cir_pos, range->cir_pos + range->cir_count); } CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", file_dentry(file)->d_name.name, - pos, pos + cnt); + range->cir_pos, range->cir_pos + range->cir_count); /* The maximum Lustre file size is variable, based on the OST maximum * object size and number of stripes. This needs another check in * addition to the VFS checks earlier. */ - if (pos + cnt > ll_file_maxbytes(inode)) { + if (range->cir_pos + range->cir_count > ll_file_maxbytes(inode)) { CDEBUG(D_INODE, "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", ll_get_fsname(inode->i_sb, NULL, 0), file_dentry(file)->d_name.name, - PFID(ll_inode2fid(inode)), pos + cnt, + PFID(ll_inode2fid(inode)), + range->cir_pos + range->cir_count, ll_file_maxbytes(inode)); RETURN(-EFBIG); } @@ -1090,85 +1079,52 @@ static int vvp_io_write_start(const struct lu_env *env, if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) RETURN(-EINVAL); - if (vio->vui_iter == NULL) { - /* from a temp io in ll_cl_init(). */ - result = 0; - } else { - /* - * When using the locked AIO function (generic_file_aio_write()) - * testing has shown the inode mutex to be a limiting factor - * with multi-threaded single shared file performance. To get - * around this, we now use the lockless version. To maintain - * consistency, proper locking to protect against writes, - * trucates, etc. is handled in the higher layers of lustre. - */ - lock_inode = !IS_NOSEC(inode); - iter = *vio->vui_iter; - - if (unlikely(lock_inode)) - inode_lock(inode); - result = __generic_file_write_iter(vio->vui_iocb, - vio->vui_iter); - if (unlikely(lock_inode)) - inode_unlock(inode); - - written = result; - if (result > 0 || result == -EIOCBQUEUED) + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. 
+ */ + if (lock_inode) + inode_lock(inode); + result = __generic_file_write_iter(&io->u.ci_rw.rw_iocb, + &io->u.ci_rw.rw_iter); + if (lock_inode) + inode_unlock(inode); + + if (result > 0 || result == -EIOCBQUEUED) #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS - result = generic_write_sync(vio->vui_iocb, result); + result = generic_write_sync(&io->u.ci_rw.rw_iocb, result); #else - { - ssize_t err; + { + ssize_t err; - err = generic_write_sync(vio->vui_iocb->ki_filp, pos, - result); - if (err < 0 && result > 0) - result = err; - } -#endif + err = generic_write_sync(io->u.ci_rw.rw_iocb.ki_filp, + range->cir_pos, result); + if (err < 0 && result > 0) + result = err; } +#endif if (result > 0) { result = vvp_io_write_commit(env, io); - /* Simulate short commit */ - if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) { - vio->u.write.vui_written >>= 1; - if (vio->u.write.vui_written > 0) - io->ci_need_restart = 1; - } if (vio->u.write.vui_written > 0) { result = vio->u.write.vui_written; CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", file_dentry(file)->d_name.name, io->ci_nob, result); io->ci_nob += result; - } else { - io->ci_continue = 0; } } - if (vio->vui_iocb->ki_pos != (pos + io->ci_nob - nob)) { - CDEBUG(D_VFSTRACE, "%s: write position mismatch: " - "ki_pos %lld vs. pos %lld, written %ld, commit %ld " - "rc %ld\n", - file_dentry(file)->d_name.name, - vio->vui_iocb->ki_pos, pos + io->ci_nob - nob, - written, io->ci_nob - nob, result); - /* - * Rewind ki_pos and vui_iter to where it has - * successfully committed. - */ - vio->vui_iocb->ki_pos = pos + io->ci_nob - nob; - iov_iter_advance(&iter, io->ci_nob - nob); - vio->vui_iter->iov = iter.iov; - vio->vui_iter->nr_segs = iter.nr_segs; - vio->vui_iter->iov_offset = iter.iov_offset; - vio->vui_iter->count = iter.count; - } if (result > 0) { ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); - if (result < cnt) + if (result < range->cir_count) io->ci_continue = 0; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + vio->vui_fd, range->cir_pos, result, WRITE); result = 0; } @@ -1323,7 +1279,7 @@ static int vvp_io_fault_start(const struct lu_env *env, if (fio->ft_mkwrite) { wait_on_page_writeback(vmpage); if (!PageDirty(vmpage)) { - struct cl_page_list *plist = &vio->u.fault.ft_queue; + struct cl_page_list *plist = &io->ci_queue.c2_qin; struct vvp_page *vpg = cl_object_page_slice(obj, page); int to = PAGE_SIZE; @@ -1335,34 +1291,13 @@ static int vvp_io_fault_start(const struct lu_env *env, /* size fixup */ if (last_index == vvp_index(vpg)) - to = ((size - 1) & ~PAGE_MASK) + 1; + to = size & ~PAGE_MASK; /* Do not set Dirty bit here so that in case IO is * started before the page is really made dirty, we * still have chance to detect it. 
*/ result = cl_io_commit_async(env, io, plist, 0, to, mkwrite_commit_callback); - /* Have overquota flag, trying sync write to check - * whether indeed out of quota */ - if (result == -EDQUOT) { - cl_page_get(page); - result = vvp_io_commit_sync(env, io, - plist, 0, to); - if (result >= 0) { - io->ci_noquota = 1; - cl_page_own(env, io, page); - cl_page_list_add(plist, page); - lu_ref_add(&page->cp_reference, - "cl_io", io); - result = cl_io_commit_async(env, io, - plist, 0, to, - mkwrite_commit_callback); - io->ci_noquota = 0; - } else { - cl_page_put(env, page); - } - } - LASSERT(cl_page_is_owned(page, io)); cl_page_list_fini(env, plist); @@ -1377,9 +1312,8 @@ static int vvp_io_fault_start(const struct lu_env *env, if (result == -EDQUOT) result = -ENOSPC; GOTO(out, result); - } else { + } else cl_page_disown(env, io, page); - } } } @@ -1488,9 +1422,6 @@ static const struct cl_io_operations vvp_io_ops = { .cio_start = vvp_io_fsync_start, .cio_fini = vvp_io_fini }, - [CIT_GLIMPSE] = { - .cio_fini = vvp_io_fini - }, [CIT_MISC] = { .cio_fini = vvp_io_fini }, @@ -1522,16 +1453,13 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, vio->vui_ra_valid = false; result = 0; if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { - size_t count; struct ll_inode_info *lli = ll_i2info(inode); - count = io->u.ci_rw.crw_count; + vio->vui_tot_count = io->u.ci_rw.rw_range.cir_count; /* "If nbyte is 0, read() will return 0 and have no other * results." -- Single Unix Spec */ - if (count == 0) + if (vio->vui_tot_count == 0) result = 1; - else - vio->vui_tot_count = count; /* for read/write, we store the jobid in the inode, and * it'll be fetched by osc when building RPC. @@ -1539,7 +1467,7 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, * it's not accurate if the file is shared by different * jobs. */ - lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid)); + lustre_get_jobid(lli->lli_jobid); } else if (io->ci_type == CIT_SETATTR) { if (!cl_io_is_trunc(io)) io->ci_lockreq = CILR_MANDATORY; @@ -1562,6 +1490,5 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, PFID(lu_object_fid(&obj->co_lu)), result); } - io->ci_result = result < 0 ? result : 0; RETURN(result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c b/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c new file mode 100644 index 0000000000000..651b8e128239d --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp lock functions. + * + */ + +static void vvp_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct vvp_lock *vlk = cl2vvp_lock(slice); + + OBD_SLAB_FREE_PTR(vlk, vvp_lock_kmem); +} + +static int vvp_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + CLOBINVRNT(env, slice->cls_obj, vvp_object_invariant(slice->cls_obj)); + + return 0; +} + +static const struct cl_lock_operations vvp_lock_ops = { + .clo_fini = vvp_lock_fini, + .clo_enqueue = vvp_lock_enqueue, +}; + +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *unused) +{ + struct vvp_lock *vlk; + int result; + + CLOBINVRNT(env, obj, vvp_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR_GFP(vlk, vvp_lock_kmem, GFP_NOFS); + if (vlk != NULL) { + cl_lock_slice_add(lock, &vlk->vlk_cl, obj, &vvp_lock_ops); + result = 0; + } else { + result = -ENOMEM; + } + + return result; +} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c index c3bf715667577..fd7211f60c61f 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -169,13 +169,6 @@ static int vvp_prune(const struct lu_env *env, struct cl_object *obj) } truncate_inode_pages(inode->i_mapping, 0); - if (inode->i_mapping->nrpages) { - CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n", - PFID(lu_object_fid(&obj->co_lu)), - inode->i_mapping->nrpages); - RETURN(-EIO); - } - RETURN(0); } @@ -205,25 +198,26 @@ static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, { struct inode *inode; struct obdo *oa; - u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; + u64 valid_flags = OBD_MD_FLTYPE; oa = attr->cra_oa; inode = vvp_object_inode(obj); if (attr->cra_type == CRT_WRITE) { - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLUID | OBD_MD_FLGID; obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); } obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) oa->o_parent_oid++; - memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, - sizeof(attr->cra_jobid)); + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, LUSTRE_JOBID_SIZE); } static const struct cl_object_operations vvp_ops = { .coo_page_init = vvp_page_init, + .coo_lock_init = vvp_lock_init, .coo_io_init = vvp_io_init, .coo_attr_get = vvp_attr_get, .coo_attr_update = vvp_attr_update, diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c index 0f4e2a9e83dac..47d48639ad43c 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -54,22 +54,16 @@ * */ -static void vvp_page_fini_common(struct vvp_page *vpg, struct pagevec *pvec) +static void vvp_page_fini_common(struct vvp_page *vpg) { struct page *vmpage = vpg->vpg_page; LASSERT(vmpage != NULL); - if (pvec) { - if (!pagevec_add(pvec, vmpage)) - pagevec_release(pvec); - } else { - put_page(vmpage); - } + put_page(vmpage); } static void vvp_page_fini(const struct lu_env *env, - struct cl_page_slice *slice, - struct pagevec *pvec) + struct cl_page_slice *slice) { struct vvp_page *vpg = cl2vvp_page(slice); struct page *vmpage = vpg->vpg_page; @@ -79,7 +73,7 @@ static void vvp_page_fini(const struct lu_env *env, * VPG_FREEING state. 
*/ LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); - vvp_page_fini_common(vpg, pvec); + vvp_page_fini_common(vpg); } static int vvp_page_own(const struct lu_env *env, @@ -150,7 +144,7 @@ static void vvp_page_discard(const struct lu_env *env, LASSERT(vmpage != NULL); LASSERT(PageLocked(vmpage)); - if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used && vmpage->mapping) + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used) ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); ll_invalidate_page(vmpage); @@ -160,12 +154,14 @@ static void vvp_page_delete(const struct lu_env *env, const struct cl_page_slice *slice) { struct page *vmpage = cl2vm_page(slice); + struct inode *inode = vmpage->mapping->host; + struct cl_object *obj = slice->cpl_obj; struct cl_page *page = slice->cpl_page; int refc; LASSERT(PageLocked(vmpage)); LASSERT((struct cl_page *)vmpage->private == page); - + LASSERT(inode == vvp_object_inode(obj)); /* Drop the reference count held in vvp_page_init */ refc = atomic_dec_return(&page->cp_ref); @@ -246,8 +242,8 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret else set_bit(AS_EIO, &inode->i_mapping->flags); - if ((ioret == -ESHUTDOWN || ioret == -EINTR || - ioret == -EIO) && obj->vob_discard_page_warned == 0) { + if ((ioret == -ESHUTDOWN || ioret == -EINTR) && + obj->vob_discard_page_warned == 0) { obj->vob_discard_page_warned = 1; ll_dirty_page_discard_warn(vmpage, ioret); } @@ -273,14 +269,8 @@ static void vvp_page_completion_read(const struct lu_env *env, if (ioret == 0) { if (!vpg->vpg_defer_uptodate) cl_page_export(env, page, 1); - } else if (vpg->vpg_defer_uptodate) { + } else { vpg->vpg_defer_uptodate = 0; - if (ioret == -EWOULDBLOCK) { - /* mirror read failed, it needs to destroy the page - * because subpage would be from wrong osc when trying - * to read from a new mirror */ - ll_invalidate_page(vmpage); - } } if (page->cp_sync_io == NULL) @@ -494,14 +484,13 @@ vvp_transient_page_completion(const struct lu_env *env, } static void vvp_transient_page_fini(const struct lu_env *env, - struct cl_page_slice *slice, - struct pagevec *pvec) + struct cl_page_slice *slice) { struct vvp_page *vpg = cl2vvp_page(slice); struct cl_page *clp = slice->cpl_page; struct vvp_object *clobj = cl2vvp(clp->cp_obj); - vvp_page_fini_common(vpg, pvec); + vvp_page_fini_common(vpg); atomic_dec(&clobj->vob_transient_pages); } diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index 7e6bfc0a51839..78c774ef738c4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,8 @@ #include #include +#include #include -#include #include "llite_internal.h" @@ -105,10 +105,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, int rc; ENTRY; - /* When setxattr() is called with a size of 0 the value is - * unconditionally replaced by "". When removexattr() is - * called we get a NULL value and XATTR_REPLACE for flags. 
*/ - if (!value && flags == XATTR_REPLACE) { + if (flags == XATTR_REPLACE) { ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); valid = OBD_MD_FLXATTRRM; } else { @@ -159,7 +156,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(-ENOMEM); rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, - pv, size, flags, ll_i2suppgid(inode), &req); + pv, size, 0, flags, ll_i2suppgid(inode), &req); kfree(fullname); if (rc) { if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { @@ -204,7 +201,7 @@ static int get_hsm_state(struct inode *inode, u32 *hus_states) return rc; } -static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t size) +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) { struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; struct lov_user_md *v1 = lump; @@ -219,12 +216,7 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t s return 0; if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { - if (size < sizeof(*comp_v1)) - return -ERANGE; - entry_count = comp_v1->lcm_entry_count; - if (size < offsetof(typeof(*comp_v1), lcm_entries[entry_count])) - return -ERANGE; is_composite = true; } @@ -232,10 +224,6 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t s if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { void *ptr = comp_v1; - if (comp_v1->lcm_entries[i].lcme_offset + sizeof(*v1) > - size) - return -ERANGE; - ptr += comp_v1->lcm_entries[i].lcme_offset; v1 = (struct lov_user_md *)ptr; } @@ -283,13 +271,7 @@ static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, if (!size && lump) lump = NULL; - if (size && size < sizeof(*lump)) { - /* ll_adjust_lum() or ll_lov_user_md_size() might access - * before size - just give up now. 
- */ - return -ERANGE; - } - rc = ll_adjust_lum(inode, lump, size); + rc = ll_adjust_lum(inode, lump); if (rc) return rc; @@ -351,11 +333,6 @@ static int ll_xattr_set(const struct xattr_handler *handler, return 0; } - if (strncmp(name, "lov.", 4) == 0 && - (__swab32(((struct lov_user_md *)value)->lmm_magic) & - le32_to_cpu(LOV_MAGIC_MASK)) == le32_to_cpu(LOV_MAGIC_MAGIC)) - lustre_swab_lov_user_md((struct lov_user_md *)value, 0); - return ll_xattr_set_common(handler, dentry, inode, name, value, size, flags); } @@ -366,6 +343,7 @@ int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; + struct mdt_body *body; void *xdata; int rc; ENTRY; @@ -392,25 +370,35 @@ int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, } } else { getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, - name, size, &req); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), + valid, name, NULL, 0, size, 0, &req); if (rc < 0) GOTO(out_xattr, rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + /* only detect the xattr size */ if (size == 0) - GOTO(out, rc); + GOTO(out, rc = body->mbo_eadatasize); - if (size < rc) + if (size < body->mbo_eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->mbo_eadatasize, (int)size); GOTO(out, rc = -ERANGE); + } + + if (body->mbo_eadatasize == 0) + GOTO(out, rc = -ENODATA); /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - rc); + body->mbo_eadatasize); if (!xdata) - GOTO(out, rc = -EPROTO); + GOTO(out, rc = -EFAULT); - memcpy(buffer, xdata, rc); + memcpy(buffer, xdata, body->mbo_eadatasize); + rc = body->mbo_eadatasize; } EXIT; @@ -523,37 +511,21 @@ static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) * recognizing layout gen as stripe offset when the * file is restored. See LU-2809. 
*/ - if ((((struct lov_mds_md *)buf)->lmm_magic & - __swab32(LOV_MAGIC_MAGIC)) == __swab32(LOV_MAGIC_MAGIC)) - lustre_swab_lov_user_md((struct lov_user_md *)buf, - cl.cl_size); - - switch (((struct lov_mds_md *)buf)->lmm_magic) { - case LOV_MAGIC_V1: - case LOV_MAGIC_V3: - case LOV_MAGIC_SPECIFIC: - ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; - break; - case LOV_MAGIC_COMP_V1: + if (((struct lov_mds_md *)buf)->lmm_magic == LOV_MAGIC_COMP_V1) goto out_env; - default: - CERROR("Invalid LOV magic %08x\n", - ((struct lov_mds_md *)buf)->lmm_magic); - GOTO(out_env, rc = -EINVAL); - } + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; out_env: cl_env_put(env, &refcheck); RETURN(rc); } else if (S_ISDIR(inode->i_mode)) { struct ptlrpc_request *req = NULL; - struct ptlrpc_request *root_req = NULL; struct lov_mds_md *lmm = NULL; int lmm_size = 0; - rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmm_size, - &req, &root_req, 0); + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, + &req, 0); if (rc < 0) GOTO(out_req, rc); @@ -568,8 +540,6 @@ static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) out_req: if (req) ptlrpc_req_finished(req); - if (root_req) - ptlrpc_req_finished(root_req); RETURN(rc); } else { diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr26.c b/drivers/staging/lustrefsx/lustre/llite/xattr26.c index 28772dd5a74a1..84e9b8bcbe915 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr26.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr26.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include "llite_internal.h" @@ -152,7 +152,7 @@ int ll_setxattr_common(struct inode *inode, const char *name, } rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, name, pv, - size, flags, ll_i2suppgid(inode), &req); + size, 0, flags, ll_i2suppgid(inode), &req); if (rc) { if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { LCONSOLE_INFO("Disabling user_xattr feature because " @@ -329,6 +329,7 @@ int ll_getxattr_common(struct inode *inode, const char *name, { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; + struct mdt_body *body; int xattr_type, rc; void *xdata; struct ll_inode_info *lli = ll_i2info(inode); @@ -404,25 +405,36 @@ int ll_getxattr_common(struct inode *inode, const char *name, } } else { getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, - name, size, &req); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), + valid, name, NULL, 0, size, 0, &req); + if (rc < 0) GOTO(out_xattr, rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + /* only detect the xattr size */ if (size == 0) - GOTO(out, rc); + GOTO(out, rc = body->mbo_eadatasize); - if (size < rc) + if (size < body->mbo_eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->mbo_eadatasize, (int)size); GOTO(out, rc = -ERANGE); + } + + if (body->mbo_eadatasize == 0) + GOTO(out, rc = -ENODATA); /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - rc); + body->mbo_eadatasize); if (!xdata) - GOTO(out, rc = -EPROTO); + GOTO(out, rc = -EFAULT); - memcpy(buffer, xdata, rc); + memcpy(buffer, xdata, body->mbo_eadatasize); + rc = 
body->mbo_eadatasize; } EXIT; diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c index f1022b0296f47..a001e5c2d8a7b 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. * * Author: Andrew Perepechko * @@ -37,6 +37,7 @@ #include #include #include +#include #include "llite_internal.h" /* If we ever have hundreds of extended attributes, we might want to consider diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c index 094266223b3bd..8f2e2e5cc1fa0 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -205,36 +205,3 @@ ll_inode_init_security(struct dentry *dentry, struct inode *inode, return err; } #endif /* HAVE_SECURITY_IINITSEC_CALLBACK */ - -/** - * Get security context xattr name used by policy. - * - * \retval >= 0 length of xattr name - * \retval < 0 failure to get security context xattr name - */ -int -ll_listsecurity(struct inode *inode, char *secctx_name, size_t secctx_name_size) -{ - int rc; - - if (!selinux_is_enabled()) - return 0; - -#ifdef HAVE_SECURITY_INODE_LISTSECURITY - rc = security_inode_listsecurity(inode, secctx_name, secctx_name_size); - if (rc >= secctx_name_size) - rc = -ERANGE; - else if (rc >= 0) - secctx_name[rc] = '\0'; - return rc; -#else /* !HAVE_SECURITY_INODE_LISTSECURITY */ - rc = sizeof(XATTR_NAME_SELINUX); - if (secctx_name && rc < secctx_name_size) { - memcpy(secctx_name, XATTR_NAME_SELINUX, rc); - secctx_name[rc] = '\0'; - } else { - rc = -ERANGE; - } - return rc; -#endif /* HAVE_SECURITY_INODE_LISTSECURITY */ -} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c index b439d87ae9348..b5ec306dcc224 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -40,6 +40,7 @@ #include #include +#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c index 24c616b4b6cd9..bb792e751e94f 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -54,8 +55,7 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, const struct lu_fid *parent_fid, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags, - const char *secctx_name, __u32 secctx_name_size) + __u64 extra_lock_flags) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -74,6 +74,13 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, LASSERT((body->mbo_valid & OBD_MD_MDS)); + /* + * Unfortunately, we have to lie to MDC/MDS to retrieve + * attributes llite needs and provideproper locking. + */ + if (it->it_op & IT_LOOKUP) + it->it_op = IT_GETATTR; + /* * We got LOOKUP lock, but we really need attrs. 
*/ @@ -108,16 +115,6 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", PFID(&body->mbo_fid1), tgt->ltd_idx); - /* ask for security context upon intent */ - if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && - secctx_name_size != 0 && secctx_name != NULL) { - op_data->op_file_secctx_name = secctx_name; - op_data->op_file_secctx_name_size = secctx_name_size; - CDEBUG(D_SEC, "'%.*s' is security xattr to fetch for " - DFID"\n", - secctx_name_size, secctx_name, PFID(&body->mbo_fid1)); - } - rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, extra_lock_flags); if (rc) @@ -156,14 +153,13 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_blocking_callback cb_blocking, int extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct ptlrpc_request *req = NULL; - struct mdt_body *body; - struct md_op_data *op_data; - int i; - int valid_stripe_count = 0; - int rc = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int i; + int rc = 0; ENTRY; @@ -189,9 +185,6 @@ int lmv_revalidate_slaves(struct obd_export *exp, fid = lsm->lsm_md_oinfo[i].lmo_fid; inode = lsm->lsm_md_oinfo[i].lmo_root; - if (!inode) - continue; - /* * Prepare op_data for revalidating. Note that @fid2 shluld be * defined otherwise it will go to server and take new lock @@ -200,14 +193,8 @@ int lmv_revalidate_slaves(struct obd_export *exp, memset(op_data, 0, sizeof(*op_data)); op_data->op_fid1 = fid; op_data->op_fid2 = fid; - /* shard revalidate only needs to fetch attributes and UPDATE - * lock, which is similar to the bottom half of remote object - * getattr, set this flag so that MDT skips checking whether - * it's remote object. - */ - op_data->op_bias = MDS_CROSS_REF; - tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); + tgt = lmv_locate_mds(lmv, op_data, &fid); if (IS_ERR(tgt)) GOTO(cleanup, rc = PTR_ERR(tgt)); @@ -221,12 +208,6 @@ int lmv_revalidate_slaves(struct obd_export *exp, rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, cb_blocking, extra_lock_flags); - if (rc == -ENOENT) { - /* skip stripe is not exists */ - rc = 0; - continue; - } - if (rc < 0) GOTO(cleanup, rc); @@ -262,22 +243,17 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_lock_decref(lockh, it.it_lock_mode); it.it_lock_mode = 0; } - - valid_stripe_count++; } cleanup: if (req != NULL) ptlrpc_req_finished(req); - /* if all stripes are invalid, return -ENOENT to notify user */ - if (!rc && !valid_stripe_count) - rc = -ENOENT; - OBD_FREE_PTR(op_data); RETURN(rc); } + /* * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) * may be split dir. 
@@ -288,58 +264,13 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - __u64 flags = it->it_flags; - int rc; - + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; ENTRY; - if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { - /* don't allow create under dir with bad hash */ - if (lmv_is_dir_bad_hash(op_data->op_mea1)) - RETURN(-EBADF); - - if (lmv_is_dir_migrating(op_data->op_mea1)) { - if (flags & O_EXCL) { - /* - * open(O_CREAT | O_EXCL) needs to check - * existing name, which should be done on both - * old and new layout, to avoid creating new - * file under old layout, check old layout on - * client side. - */ - tgt = lmv_locate_tgt(lmv, op_data, - &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - rc = md_getattr_name(tgt->ltd_exp, op_data, - reqp); - if (!rc) { - ptlrpc_req_finished(*reqp); - *reqp = NULL; - RETURN(-EEXIST); - } - - if (rc != -ENOENT) - RETURN(rc); - - op_data->op_post_migrate = true; - } else { - /* - * open(O_CREAT) will be sent to MDT in old - * layout first, to avoid creating new file - * under old layout, clear O_CREAT. - */ - it->it_flags &= ~O_CREAT; - } - } - } - -retry: if (it->it_flags & MDS_OPEN_BY_FID) { LASSERT(fid_is_sane(&op_data->op_fid2)); @@ -359,7 +290,7 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, LASSERT(fid_is_zero(&op_data->op_fid2)); LASSERT(op_data->op_name != NULL); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } @@ -390,21 +321,8 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, */ if ((it->it_disposition & DISP_LOOKUP_NEG) && !(it->it_disposition & DISP_OPEN_CREATE) && - !(it->it_disposition & DISP_OPEN_OPEN)) { - if (!(it->it_flags & MDS_OPEN_BY_FID) && - lmv_dir_retry_check_update(op_data)) { - ptlrpc_req_finished(*reqp); - it->it_request = NULL; - it->it_disposition = 0; - *reqp = NULL; - - it->it_flags = flags; - fid_zero(&op_data->op_fid2); - goto retry; - } - + !(it->it_disposition & DISP_OPEN_OPEN)) RETURN(rc); - } body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); if (body == NULL) @@ -413,9 +331,7 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, /* Not cross-ref case, just get out of here. 
*/ if (unlikely((body->mbo_valid & OBD_MD_MDS))) { rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, - cb_blocking, extra_lock_flags, - op_data->op_file_secctx_name, - op_data->op_file_secctx_name_size); + cb_blocking, extra_lock_flags); if (rc != 0) RETURN(rc); @@ -436,56 +352,42 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + struct lmv_stripe_md *lsm = op_data->op_mea1; + int rc = 0; ENTRY; -retry: - if (op_data->op_flags & MF_GETATTR_BY_FID) { - /* getattr by FID, replace fid1 with stripe FID, - * NB, don't replace if name is "/", because it may be a subtree - * mount, and if it's a striped directory, fid1 will be replaced - * to stripe FID by hash, while fid2 is master object FID, which - * will be treated as a remote object if the two FIDs are - * located on different MDTs, and LOOKUP lock can't be fetched. - */ - LASSERT(op_data->op_name); - if (op_data->op_namelen != 1 || - strncmp(op_data->op_name, "/", 1) != 0) { - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - } + /* If it returns ERR_PTR(-EBADFD) then it is an unknown hash type + * it will try all stripes to locate the object */ + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt) && (PTR_ERR(tgt) != -EBADFD)) + RETURN(PTR_ERR(tgt)); - /* name is used to locate stripe target, clear it here - * to avoid packing name in request, so that MDS knows - * it's getattr by FID. - */ - op_data->op_name = NULL; - op_data->op_namelen = 0; + /* Both migrating dir and unknown hash dir need to try + * all of sub-stripes */ + if (lsm != NULL && !lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { + struct lmv_oinfo *oinfo; - /* getattr request is sent to MDT where fid2 inode is */ - tgt = lmv_find_target(lmv, &op_data->op_fid2); - } else if (op_data->op_name) { - /* getattr by name */ - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (!fid_is_sane(&op_data->op_fid2)) - fid_zero(&op_data->op_fid2); - } else { - /* old way to getattr by FID, parent FID not packed */ - tgt = lmv_find_target(lmv, &op_data->op_fid1); + oinfo = &lsm->lsm_md_oinfo[0]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); } - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID - ", name='%s' -> mds #%u\n", + ", name='%s' -> mds #%u lsm=%p lsm_magic=%x\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), op_data->op_name ? op_data->op_name : "", - tgt->ltd_idx); + tgt->ltd_idx, lsm, lsm == NULL ? 
-1 : lsm->lsm_md_magic); op_data->op_bias &= ~MDS_CROSS_REF; @@ -505,14 +407,37 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } RETURN(rc); - } else if (it_disposition(it, DISP_LOOKUP_NEG) && - lmv_dir_retry_check_update(op_data)) { - ptlrpc_req_finished(*reqp); - it->it_request = NULL; - it->it_disposition = 0; - *reqp = NULL; + } else if (it_disposition(it, DISP_LOOKUP_NEG) && lsm != NULL && + lmv_need_try_all_stripes(lsm)) { + /* For migrating and unknown hash type directory, it will + * try to target the entry on other stripes */ + int stripe_index; + + for (stripe_index = 1; + stripe_index < lsm->lsm_md_stripe_count && + it_disposition(it, DISP_LOOKUP_NEG); stripe_index++) { + struct lmv_oinfo *oinfo; + + /* release the previous request */ + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + *reqp = NULL; - goto retry; + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + tgt = lmv_find_target(lmv, &oinfo->lmo_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "Try other stripes " DFID"\n", + PFID(&oinfo->lmo_fid)); + + op_data->op_fid1 = oinfo->lmo_fid; + it->it_disposition &= ~DISP_ENQ_COMPLETE; + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); + if (rc != 0) + RETURN(rc); + } } if (!it_has_reply_body(it)) @@ -529,9 +454,7 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, /* Not cross-ref case, just get out of here. */ if (unlikely((body->mbo_valid & OBD_MD_MDS))) { rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, - extra_lock_flags, - op_data->op_file_secctx_name, - op_data->op_file_secctx_name_size); + extra_lock_flags); if (rc != 0) RETURN(rc); body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h index 0ad743244e93e..8ef0631f3301a 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -33,6 +33,7 @@ #ifndef _LMV_INTERNAL_H_ #define _LMV_INTERNAL_H_ +#include #include #include @@ -58,9 +59,6 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_blocking_callback cb_blocking, int extra_lock_flags); -int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **preq); - static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) { return container_of0(lmv, struct obd_device, u.lmv); @@ -125,90 +123,39 @@ static inline int lmv_stripe_md_size(int stripe_count) return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); } -/* for file under migrating directory, return the target stripe info */ static inline const struct lmv_oinfo * lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, - int namelen, bool post_migrate) + int namelen) { - __u32 hash_type = lsm->lsm_md_hash_type; - __u32 stripe_count = lsm->lsm_md_stripe_count; int stripe_index; - if (hash_type & LMV_HASH_FLAG_MIGRATION) { - if (post_migrate) { - hash_type &= ~LMV_HASH_FLAG_MIGRATION; - stripe_count = lsm->lsm_md_migrate_offset; - } else { - hash_type = lsm->lsm_md_migrate_hash; - stripe_count -= lsm->lsm_md_migrate_offset; - } - } - - stripe_index = lmv_name_to_stripe_index(hash_type, stripe_count, + stripe_index = lmv_name_to_stripe_index(lsm->lsm_md_hash_type, + lsm->lsm_md_stripe_count, name, namelen); if (stripe_index < 0) return ERR_PTR(stripe_index); - if ((lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) && !post_migrate) 
- stripe_index += lsm->lsm_md_migrate_offset; - - if (stripe_index >= lsm->lsm_md_stripe_count) { - CERROR("stripe_index %d stripe_count %d hash_type %#x " - "migrate_offset %d migrate_hash %#x name %.*s\n", - stripe_index, lsm->lsm_md_stripe_count, - lsm->lsm_md_hash_type, lsm->lsm_md_migrate_offset, - lsm->lsm_md_migrate_hash, namelen, name); - return ERR_PTR(-EBADF); - } + LASSERTF(stripe_index < lsm->lsm_md_stripe_count, + "stripe_index = %d, stripe_count = %d hash_type = %x" + "name = %.*s\n", stripe_index, lsm->lsm_md_stripe_count, + lsm->lsm_md_hash_type, namelen, name); return &lsm->lsm_md_oinfo[stripe_index]; } -static inline bool lmv_is_dir_migrating(const struct lmv_stripe_md *lsm) +static inline bool lmv_need_try_all_stripes(const struct lmv_stripe_md *lsm) { - return lsm ? lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION : false; -} - -static inline bool lmv_is_dir_bad_hash(const struct lmv_stripe_md *lsm) -{ - if (!lsm) - return false; - - if (lmv_is_dir_migrating(lsm)) { - if (lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset > 1) - return !lmv_is_known_hash_type( - lsm->lsm_md_migrate_hash); - return false; - } - - return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type) || + lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; } -static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) -{ - const struct lmv_stripe_md *lsm = op_data->op_mea1; - - if (!lsm) - return false; - - if (lmv_is_dir_migrating(lsm) && !op_data->op_post_migrate) { - op_data->op_post_migrate = true; - return true; - } - - if (lmv_is_dir_bad_hash(lsm) && - op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { - op_data->op_stripe_index++; - return true; - } - - return false; -} - -struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, - struct md_op_data *op_data, - struct lu_fid *fid); +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid); /* lproc_lmv.c */ -int lmv_tunables_init(struct obd_device *obd); +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_lmv_obd_vars[]; +#endif +extern const struct proc_ops lmv_proc_target_fops; #endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c index 078f6e2a59aad..8b073a6d9846f 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,8 +31,7 @@ */ #define DEBUG_SUBSYSTEM S_LMV - -#include +#include #include #include #include @@ -46,6 +45,7 @@ #include #include +#include #include #include #include @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include "lmv_internal.h" @@ -213,24 +213,30 @@ static int lmv_connect(const struct lu_env *env, lmv->connected = 0; lmv->conn_data = *data; - lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", - &obd->obd_kset.kobj); - if (!lmv->lmv_tgts_kobj) { - CERROR("%s: cannot create /sys/fs/lustre/%s/%s/target_obds\n", - obd->obd_name, obd->obd_type->typ_name, obd->obd_name); + if (lmv->targets_proc_entry == NULL) { + lmv->targets_proc_entry = lprocfs_register("target_obds", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lmv->targets_proc_entry)) { + CERROR("%s: cannot register " + "/proc/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name); + lmv->targets_proc_entry = NULL; + } } rc = lmv_check_connect(obd); if (rc != 0) - GOTO(out_sysfs, rc); + GOTO(out_proc, rc); *pexp = exp; RETURN(rc); -out_sysfs: - if (lmv->lmv_tgts_kobj) - kobject_put(lmv->lmv_tgts_kobj); +out_proc: + if (lmv->targets_proc_entry != NULL) + lprocfs_remove(&lmv->targets_proc_entry); class_disconnect(exp); @@ -265,12 +271,10 @@ static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, for (i = 0; i < lmv->desc.ld_tgt_count; i++) { struct lmv_tgt_desc *tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL) { + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { CWARN("%s: NULL export for %d\n", obd->obd_name, i); continue; } - if (!tgt->ltd_active) - continue; rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); if (rc) { @@ -359,11 +363,23 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, atomic_read(&obd->obd_refcount)); - if (lmv->lmv_tgts_kobj) - /* Even if we failed to create the link, that's fine */ - rc = sysfs_create_link(lmv->lmv_tgts_kobj, - &mdc_obd->obd_kset.kobj, - mdc_obd->obd_name); + if (lmv->targets_proc_entry != NULL) { + struct proc_dir_entry *mdc_symlink; + + LASSERT(mdc_obd->obd_type != NULL); + LASSERT(mdc_obd->obd_type->typ_name != NULL); + mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, + lmv->targets_proc_entry, + "../../../%s/%s", + mdc_obd->obd_type->typ_name, + mdc_obd->obd_name); + if (mdc_symlink == NULL) { + CERROR("cannot register LMV target " + "/proc/fs/lustre/%s/%s/target_obds/%s\n", + obd->obd_type->typ_name, obd->obd_name, + mdc_obd->obd_name); + } + } RETURN(0); } @@ -399,7 +415,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, mutex_lock(&lmv->lmv_init_mutex); if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { tgt = lmv->tgts[index]; - CERROR("%s: UUID %s already assigned at LMV target index %d:" + CERROR("%s: UUID %s already assigned at LOV target index %d:" " rc = %d\n", obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); mutex_unlock(&lmv->lmv_init_mutex); @@ -568,9 +584,9 @@ static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_obd->obd_fail = obd->obd_fail; mdc_obd->obd_no_recov = obd->obd_no_recov; - if (lmv->lmv_tgts_kobj) - sysfs_remove_link(lmv->lmv_tgts_kobj, - mdc_obd->obd_name); + if (lmv->targets_proc_entry != NULL) + lprocfs_remove_proc_entry(mdc_obd->obd_name, + lmv->targets_proc_entry); } rc = obd_fid_fini(tgt->ltd_exp->exp_obd); @@ -613,8 +629,11 
@@ static int lmv_disconnect(struct obd_export *exp) lmv_disconnect_mdc(obd, lmv->tgts[i]); } - if (lmv->lmv_tgts_kobj) - kobject_put(lmv->lmv_tgts_kobj); + if (lmv->targets_proc_entry != NULL) + lprocfs_remove(&lmv->targets_proc_entry); + else + CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", + obd->obd_type->typ_name, obd->obd_name); out_local: /* @@ -662,8 +681,8 @@ static int lmv_fid2path(struct obd_export *exp, int len, void *karg, int len; ori_gf = (struct getinfo_fid2path *)karg; - if (strlen(ori_gf->gf_u.gf_path) + 1 + - strlen(gf->gf_u.gf_path) + 1 > ori_gf->gf_pathlen) + if (strlen(ori_gf->gf_u.gf_path) + + strlen(gf->gf_u.gf_path) > ori_gf->gf_pathlen) GOTO(out_fid2path, rc = -EOVERFLOW); ptr = ori_gf->gf_u.gf_path; @@ -800,42 +819,23 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, void __user *uarg) { struct lmv_obd *lmv = &obd->u.lmv; - struct file *filp; - __u32 i, j; - int err; - bool any_set = false; - struct kkuc_ct_data *kcd; - size_t kcd_size; - int rc = 0; + struct file *filp; + __u32 i, j; + int err; + bool any_set = false; + struct kkuc_ct_data kcd = { + .kcd_magic = KKUC_CT_DATA_MAGIC, + .kcd_archive = lk->lk_data, + }; + int rc = 0; ENTRY; filp = fget(lk->lk_wfd); if (!filp) RETURN(-EBADF); - if (lk->lk_flags & LK_FLG_DATANR) - kcd_size = offsetof(struct kkuc_ct_data, - kcd_archives[lk->lk_data_count]); - else - kcd_size = sizeof(*kcd); - - OBD_ALLOC(kcd, kcd_size); - if (kcd == NULL) - GOTO(err_fput, rc = -ENOMEM); - - kcd->kcd_nr_archives = lk->lk_data_count; - if (lk->lk_flags & LK_FLG_DATANR) { - kcd->kcd_magic = KKUC_CT_DATA_ARRAY_MAGIC; - if (lk->lk_data_count > 0) - memcpy(kcd->kcd_archives, lk->lk_data, - sizeof(*kcd->kcd_archives) * lk->lk_data_count); - } else { - kcd->kcd_magic = KKUC_CT_DATA_BITMAP_MAGIC; - } - rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, - lk->lk_group, kcd, kcd_size); - OBD_FREE(kcd, kcd_size); + lk->lk_group, &kcd, sizeof(kcd)); if (rc) GOTO(err_fput, rc); @@ -934,7 +934,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(-EFAULT); rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), 0); if (rc) RETURN(rc); @@ -1175,7 +1175,7 @@ static int lmv_placement_policy(struct obd_device *obd, * 1. See if the stripe offset is specified by lum. * 2. Then check if there is default stripe offset. * 3. Finally choose MDS by name hash if the parent - * is striped directory. (see lmv_locate_tgt()). */ + * is striped directory. (see lmv_locate_mds()). 
*/ if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { *mds = le32_to_cpu(lum->lum_stripe_offset); @@ -1287,11 +1287,16 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) spin_lock_init(&lmv->lmv_lock); mutex_init(&lmv->lmv_init_mutex); - rc = lmv_tunables_init(obd); +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_lmv_obd_vars; + lprocfs_obd_setup(obd); + lprocfs_alloc_md_stats(obd, 0); + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); if (rc) - CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", + CWARN("%s: error adding LMV target_obd file: rc = %d\n", obd->obd_name, rc); - +#endif rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, LUSTRE_CLI_FLD_HASH_DHT); if (rc) { @@ -1356,88 +1361,49 @@ static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) RETURN(rc); } -static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) -{ - int i; - - if (flags & OBD_STATFS_FOR_MDT0) - return 0; - - if (lmv->lmv_statfs_start || lmv->desc.ld_tgt_count == 1) - return lmv->lmv_statfs_start; - - /* choose initial MDT for this client */ - for (i = 0;; i++) { - struct lnet_process_id lnet_id; - if (LNetGetId(i, &lnet_id) == -ENOENT) - break; - - if (lnet_id.nid != LNET_NID_LO_0) { - /* We dont need a full 64-bit modulus, just enough - * to distribute the requests across MDTs evenly. - */ - lmv->lmv_statfs_start = - (u32)lnet_id.nid % lmv->desc.ld_tgt_count; - break; - } - } - - return lmv->lmv_statfs_start; -} - static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, time64_t max_age, __u32 flags) + struct obd_statfs *osfs, __u64 max_age, __u32 flags) { struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; struct obd_statfs *temp; int rc = 0; - __u32 i, idx; + __u32 i; ENTRY; OBD_ALLOC(temp, sizeof(*temp)); if (temp == NULL) RETURN(-ENOMEM); - /* distribute statfs among MDTs */ - idx = lmv_select_statfs_mdt(lmv, flags); - - for (i = 0; i < lmv->desc.ld_tgt_count; i++, idx++) { - idx = idx % lmv->desc.ld_tgt_count; - if (lmv->tgts[idx] == NULL || lmv->tgts[idx]->ltd_exp == NULL) + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) continue; - rc = obd_statfs(env, lmv->tgts[idx]->ltd_exp, temp, + rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, max_age, flags); if (rc) { - CERROR("%s: can't stat MDS #%d: rc = %d\n", - lmv->tgts[idx]->ltd_exp->exp_obd->obd_name, i, + CERROR("can't stat MDS #%d (%s), error %d\n", i, + lmv->tgts[i]->ltd_exp->exp_obd->obd_name, rc); GOTO(out_free_temp, rc); } - if (temp->os_state & OS_STATE_SUM || - flags == OBD_STATFS_FOR_MDT0) { - /* reset to the last aggregated values - * and don't sum with non-aggrated data */ - /* If the statfs is from mount, it needs to retrieve - * necessary information from MDT0. i.e. mount does - * not need the merged osfs from all of MDT. Also - * clients can be mounted as long as MDT0 is in - * service */ - *osfs = *temp; - break; - } - if (i == 0) { *osfs = *temp; - } else { - osfs->os_bavail += temp->os_bavail; - osfs->os_blocks += temp->os_blocks; - osfs->os_ffree += temp->os_ffree; - osfs->os_files += temp->os_files; - osfs->os_granted += temp->os_granted; - } + /* If the statfs is from mount, it will needs + * retrieve necessary information from MDT0. + * i.e. mount does not need the merged osfs + * from all of MDT. 
+ * And also clients can be mounted as long as + * MDT0 is in service*/ + if (flags & OBD_STATFS_FOR_MDT0) + GOTO(out_free_temp, rc); + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + } } EXIT; @@ -1459,8 +1425,9 @@ static int lmv_get_root(struct obd_export *exp, const char *fileset, } static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, size_t buf_size, - struct ptlrpc_request **req) + u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1472,16 +1439,17 @@ static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req); + rc = md_getxattr(tgt->ltd_exp, fid, valid, name, input, + input_size, output_size, flags, request); RETURN(rc); } static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - const void *value, size_t value_size, - unsigned int xattr_flags, u32 suppgid, - struct ptlrpc_request **req) + u64 valid, const char *name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1493,8 +1461,9 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, - value, value_size, xattr_flags, suppgid, req); + rc = md_setxattr(tgt->ltd_exp, fid, valid, name, input, + input_size, output_size, flags, suppgid, + request); RETURN(rc); } @@ -1563,93 +1532,81 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -struct lmv_tgt_desc* -__lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, u32 *mds, - bool post_migrate) +/** + * Choosing the MDT by name or FID in @op_data. + * For non-striped directory, it will locate MDT by fid. + * For striped-directory, it will locate MDT by name. And also + * it will reset op_fid1 with the FID of the choosen stripe. 
+ **/ +struct lmv_tgt_desc * +lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + u32 *mds) { - struct lmv_tgt_desc *tgt; - const struct lmv_oinfo *oinfo; - - if (lsm == NULL || namelen == 0) { - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return tgt; - - LASSERT(mds); - *mds = tgt->ltd_idx; - return tgt; - } + struct lmv_tgt_desc *tgt; + const struct lmv_oinfo *oinfo; if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { if (cfs_fail_val >= lsm->lsm_md_stripe_count) - return ERR_PTR(-EBADF); + RETURN(ERR_PTR(-EBADF)); oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; } else { - oinfo = lsm_name_to_stripe_info(lsm, name, namelen, - post_migrate); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); if (IS_ERR(oinfo)) - return ERR_CAST(oinfo); + RETURN(ERR_CAST(oinfo)); } if (fid != NULL) *fid = oinfo->lmo_fid; if (mds != NULL) *mds = oinfo->lmo_mds; - /* check stripe FID is sane */ - if (!fid_is_sane(&oinfo->lmo_fid)) - return ERR_PTR(-ENODEV); tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, PFID(&oinfo->lmo_fid)); - return tgt; } - /** - * Locate mdt by fid or name + * Locate mds by fid or name * - * For striped directory, it will locate the stripe by name hash, if hash_type - * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' - * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' - * indicates whether old or new layout is used to locate. + * For striped directory (lsm != NULL), it will locate the stripe + * by name hash (see lsm_name_to_stripe_info()). Note: if the hash_type + * is unknown, it will return -EBADFD, and lmv_intent_lookup might need + * walk through all of stripes to locate the entry. * * For normal direcotry, it will locate MDS by FID directly. - * - * \param[in] lmv LMV device - * \param[in/out] op_data client MD stack parameters, name, namelen etc, - * op_mds and op_fid1 will be updated if op_mea1 - * indicates fid1 represents a striped directory. - * \param[out] fid object FID used to locate MDS. + * \param[in] lmv LMV device + * \param[in] op_data client MD stack parameters, name, namelen + * mds_num etc. + * \param[in] fid object FID used to locate MDS. * * retval pointer to the lmv_tgt_desc if succeed. * ERR_PTR(errno) if failed. */ struct lmv_tgt_desc* -lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, +lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, struct lu_fid *fid) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_oinfo *oinfo; - struct lmv_tgt_desc *tgt; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tgt; /* During creating VOLATILE file, it should honor the mdt * index if the file under striped dir is being restored, see * ct_restore(). 
*/ if (op_data->op_bias & MDS_CREATE_VOLATILE && (int)op_data->op_mds != -1) { + int i; tgt = lmv_get_target(lmv, op_data->op_mds, NULL); if (IS_ERR(tgt)) return tgt; - if (lsm) { - int i; - + if (lsm != NULL) { /* refill the right parent fid */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lmv_oinfo *oinfo; + oinfo = &lsm->lsm_md_oinfo[i]; if (oinfo->lmo_mds == op_data->op_mds) { *fid = oinfo->lmo_fid; @@ -1660,21 +1617,22 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, if (i == lsm->lsm_md_stripe_count) *fid = lsm->lsm_md_oinfo[0].lmo_fid; } - } else if (lmv_is_dir_bad_hash(lsm)) { - LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); - oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; - *fid = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - } else { - tgt = __lmv_locate_tgt(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds, - op_data->op_post_migrate); + return tgt; } - return tgt; + if (lsm == NULL || op_data->op_namelen == 0) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + op_data->op_mds = tgt->ltd_idx; + return tgt; + } + + return lmv_locate_target_for_name(lmv, lsm, op_data->op_name, + op_data->op_namelen, fid, + &op_data->op_mds); } int lmv_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1691,33 +1649,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (!lmv->desc.ld_active_tgt_count) RETURN(-EIO); - if (lmv_is_dir_bad_hash(op_data->op_mea1)) - RETURN(-EBADF); - - if (lmv_is_dir_migrating(op_data->op_mea1)) { - /* - * if parent is migrating, create() needs to lookup existing - * name, to avoid creating new file under old layout of - * migrating directory, check old layout here. 
- */ - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } - - if (rc != -ENOENT) - RETURN(rc); - - op_data->op_post_migrate = true; - } - - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1728,7 +1660,6 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc) RETURN(rc); - if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { /* Send the create request to the MDT where the object * will be located */ @@ -1768,7 +1699,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1781,20 +1712,19 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, RETURN(rc); } -int +static int lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, struct ptlrpc_request **preq) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; - + struct ptlrpc_request *req = NULL; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; ENTRY; -retry: - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1803,28 +1733,31 @@ lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, PFID(&op_data->op_fid1), tgt->ltd_idx); rc = md_getattr_name(tgt->ltd_exp, op_data, preq); - if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { - ptlrpc_req_finished(*preq); - *preq = NULL; - goto retry; - } - - if (rc) + if (rc != 0) RETURN(rc); body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); LASSERT(body != NULL); if (body->mbo_valid & OBD_MD_MDS) { - op_data->op_fid1 = body->mbo_fid1; + struct lu_fid rid = body->mbo_fid1; + CDEBUG(D_INODE, "Request attrs for "DFID"\n", + PFID(&rid)); + + tgt = lmv_find_target(lmv, &rid); + if (IS_ERR(tgt)) { + ptlrpc_req_finished(*preq); + preq = NULL; + RETURN(PTR_ERR(tgt)); + } + + op_data->op_fid1 = rid; op_data->op_valid |= OBD_MD_FLCROSSREF; op_data->op_namelen = 0; op_data->op_name = NULL; - + rc = md_getattr_name(tgt->ltd_exp, op_data, &req); ptlrpc_req_finished(*preq); - *preq = NULL; - - goto retry; + *preq = req; } RETURN(rc); @@ -1894,40 +1827,19 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); + if (op_data->op_mea2 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea2; + const struct lmv_oinfo *oinfo; - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; - - /* - * avoid creating new file under old layout of migrating - * directory, check it here. 
- */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, false); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } + oinfo = lsm_name_to_stripe_info(lsm, op_data->op_name, + op_data->op_namelen); + if (IS_ERR(oinfo)) + RETURN(PTR_ERR(oinfo)); - if (rc != -ENOENT) - RETURN(rc); + op_data->op_fid2 = oinfo->lmo_fid; } - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, true); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1945,323 +1857,158 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, - const char *name, size_t namelen, - struct ptlrpc_request **request) +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *parent_tgt; - struct lmv_tgt_desc *sp_tgt; - struct lmv_tgt_desc *tp_tgt = NULL; - struct lmv_tgt_desc *child_tgt; - struct lmv_tgt_desc *tgt; - struct lu_fid target_fid; - int rc; - + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *src_tgt; + struct lmv_tgt_desc *tgt_tgt; + struct obd_export *target_exp; + struct mdt_body *body; + int rc; ENTRY; - LASSERT(op_data->op_cli_flags & CLI_MIGRATE); + LASSERT(oldlen != 0); - CDEBUG(D_INODE, "MIGRATE "DFID"/%.*s\n", - PFID(&op_data->op_fid1), (int)namelen, name); + CDEBUG(D_INODE, "RENAME %.*s in "DFID":%d to %.*s in "DFID":%d\n", + (int)oldlen, old, PFID(&op_data->op_fid1), + op_data->op_mea1 ? op_data->op_mea1->lsm_md_stripe_count : 0, + (int)newlen, new, PFID(&op_data->op_fid2), + op_data->op_mea2 ? op_data->op_mea2->lsm_md_stripe_count : 0); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - - parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(parent_tgt)) - RETURN(PTR_ERR(parent_tgt)); - - if (lsm) { - __u32 hash_type = lsm->lsm_md_hash_type; - __u32 stripe_count = lsm->lsm_md_stripe_count; - - /* - * old stripes are appended after new stripes for migrating - * directory. 
- */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { - hash_type = lsm->lsm_md_migrate_hash; - stripe_count -= lsm->lsm_md_migrate_offset; + if (op_data->op_cli_flags & CLI_MIGRATE) { + LASSERTF(fid_is_sane(&op_data->op_fid3), "invalid FID "DFID"\n", + PFID(&op_data->op_fid3)); + + if (op_data->op_mea1 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tmp; + + /* Fix the parent fid for striped dir */ + tmp = lmv_locate_target_for_name(lmv, lsm, old, + oldlen, + &op_data->op_fid1, + NULL); + if (IS_ERR(tmp)) + RETURN(PTR_ERR(tmp)); } - rc = lmv_name_to_stripe_index(hash_type, stripe_count, name, - namelen); - if (rc < 0) + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc != 0) RETURN(rc); - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) - rc += lsm->lsm_md_migrate_offset; - - /* save it in fid4 temporarily for early cancel */ - op_data->op_fid4 = lsm->lsm_md_oinfo[rc].lmo_fid; - sp_tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[rc].lmo_mds, - NULL); - if (IS_ERR(sp_tgt)) - RETURN(PTR_ERR(sp_tgt)); + src_tgt = lmv_find_target(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); - /* - * if parent is being migrated too, fill op_fid2 with target - * stripe fid, otherwise the target stripe is not created yet. - */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { - hash_type = lsm->lsm_md_hash_type & - ~LMV_HASH_FLAG_MIGRATION; - stripe_count = lsm->lsm_md_migrate_offset; - - rc = lmv_name_to_stripe_index(hash_type, stripe_count, - name, namelen); - if (rc < 0) - RETURN(rc); - - op_data->op_fid2 = lsm->lsm_md_oinfo[rc].lmo_fid; - tp_tgt = lmv_get_target(lmv, - lsm->lsm_md_oinfo[rc].lmo_mds, - NULL); - if (IS_ERR(tp_tgt)) - RETURN(PTR_ERR(tp_tgt)); - } + target_exp = src_tgt->ltd_exp; } else { - sp_tgt = parent_tgt; - } - - child_tgt = lmv_find_target(lmv, &op_data->op_fid3); - if (IS_ERR(child_tgt)) - RETURN(PTR_ERR(child_tgt)); + if (op_data->op_mea1 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea1; - if (!S_ISDIR(op_data->op_mode) && tp_tgt) - rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx); - else - rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); - if (rc) - RETURN(rc); - - /* - * for directory, send migrate request to the MDT where the object will - * be migrated to, because we can't create a striped directory remotely. - * - * otherwise, send to the MDT where source is located because regular - * file may open lease. - * - * NB. if MDT doesn't support DIR_MIGRATE, send to source MDT too for - * backward compatibility. - */ - if (S_ISDIR(op_data->op_mode) && - (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) { - tgt = lmv_find_target(lmv, &target_fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - } else { - tgt = child_tgt; - } + src_tgt = lmv_locate_target_for_name(lmv, lsm, old, + oldlen, + &op_data->op_fid1, + &op_data->op_mds); + } else { + src_tgt = lmv_find_target(lmv, &op_data->op_fid1); + } + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); - /* cancel UPDATE lock of parent master object */ - rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); - if (rc) - RETURN(rc); - /* cancel UPDATE lock of source parent */ - if (sp_tgt != parent_tgt) { - /* - * migrate RPC packs master object FID, because we can only pack - * two FIDs in reint RPC, but MDS needs to know both source - * parent and target parent, and it will obtain them from master - * FID and LMV, the other FID in RPC is kept for target. 
- * - * since this FID is not passed to MDC, cancel it anyway. - */ - rc = lmv_early_cancel(exp, sp_tgt, op_data, -1, LCK_EX, - MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID4); - if (rc) - RETURN(rc); + if (op_data->op_mea2 != NULL) { + struct lmv_stripe_md *lsm = op_data->op_mea2; - op_data->op_flags &= ~MF_MDC_CANCEL_FID4; - } - op_data->op_fid4 = target_fid; + tgt_tgt = lmv_locate_target_for_name(lmv, lsm, new, + newlen, + &op_data->op_fid2, + &op_data->op_mds); + } else { + tgt_tgt = lmv_find_target(lmv, &op_data->op_fid2); - /* cancel UPDATE locks of target parent */ - rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); - if (rc) - RETURN(rc); + } + if (IS_ERR(tgt_tgt)) + RETURN(PTR_ERR(tgt_tgt)); - /* cancel LOOKUP lock of source if source is remote object */ - if (child_tgt != sp_tgt) { - rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID3); - if (rc) - RETURN(rc); + target_exp = tgt_tgt->ltd_exp; } - /* cancel ELC locks of source */ - rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); - if (rc) - RETURN(rc); - - rc = md_rename(tgt->ltd_exp, op_data, name, namelen, NULL, 0, request); - - RETURN(rc); -} - -static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen, - struct ptlrpc_request **request) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *sp_tgt; - struct lmv_tgt_desc *tp_tgt = NULL; - struct lmv_tgt_desc *src_tgt = NULL; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; - - ENTRY; + /* + * LOOKUP lock on src child (fid3) should also be cancelled for + * src_tgt in mdc_rename. + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - LASSERT(oldlen != 0); + /* + * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its + * own target. + */ + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_UPDATE, + MF_MDC_CANCEL_FID2); - if (op_data->op_cli_flags & CLI_MIGRATE) { - rc = lmv_migrate(exp, op_data, old, oldlen, request); + if (rc != 0) RETURN(rc); - } - - op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); - op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); - op_data->op_cap = cfs_curproc_cap_pack(); - - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; + /* + * Cancel LOOKUP locks on source child (fid3) for parent tgt_tgt. 
+ */ + if (fid_is_sane(&op_data->op_fid3)) { + struct lmv_tgt_desc *tgt; - /* - * we avoid creating new file under old layout of migrating - * directory, if there is an existing file with new name under - * old layout, we can't unlink file in old layout and rename to - * new layout in one transaction, so return -EBUSY here.` - */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, - false); + tgt = lmv_find_target(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - op_data->op_name = new; - op_data->op_namelen = newlen; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - op_data->op_name = NULL; - op_data->op_namelen = 0; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EBUSY); - } - - if (rc != -ENOENT) + /* Cancel LOOKUP lock on its parent */ + rc = lmv_early_cancel(exp, tgt, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) RETURN(rc); - } - /* rename to new layout for migrating directory */ - tp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, true); - if (IS_ERR(tp_tgt)) - RETURN(PTR_ERR(tp_tgt)); - - /* Since the target child might be destroyed, and it might become - * orphan, and we can only check orphan on the local MDT right now, so - * we send rename request to the MDT where target child is located. If - * target child does not exist, then it will send the request to the - * target parent */ - if (fid_is_sane(&op_data->op_fid4)) { - tgt = lmv_find_target(lmv, &op_data->op_fid4); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - } else { - tgt = tp_tgt; + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); } - op_data->op_flags |= MF_MDC_CANCEL_FID4; - - /* cancel UPDATE locks of target parent */ - rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); - if (rc != 0) - RETURN(rc); - +retry_rename: + /* + * Cancel all the locks on tgt child (fid4). 
+ */ if (fid_is_sane(&op_data->op_fid4)) { - /* cancel LOOKUP lock of target on target parent */ - if (tgt != tp_tgt) { - rc = lmv_early_cancel(exp, tp_tgt, op_data, - tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID4); - if (rc != 0) - RETURN(rc); - } - } + struct lmv_tgt_desc *tgt; - if (fid_is_sane(&op_data->op_fid3)) { - src_tgt = lmv_find_target(lmv, &op_data->op_fid3); - if (IS_ERR(src_tgt)) - RETURN(PTR_ERR(src_tgt)); - - /* cancel ELC locks of source */ - rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_ELC, - MF_MDC_CANCEL_FID3); + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID4); if (rc != 0) RETURN(rc); - } -retry: - sp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea1, old, oldlen, - &op_data->op_fid1, &op_data->op_mds, - op_data->op_post_migrate); - if (IS_ERR(sp_tgt)) - RETURN(PTR_ERR(sp_tgt)); - - /* cancel UPDATE locks of source parent */ - rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); - if (rc != 0) - RETURN(rc); + tgt = lmv_find_target(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); - if (fid_is_sane(&op_data->op_fid3)) { - /* cancel LOOKUP lock of source on source parent */ - if (src_tgt != sp_tgt) { - rc = lmv_early_cancel(exp, sp_tgt, op_data, - tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID3); - if (rc != 0) - RETURN(rc); - } + /* Since the target child might be destroyed, and it might + * become orphan, and we can only check orphan on the local + * MDT right now, so we send rename request to the MDT where + * target child is located. If target child does not exist, + * then it will send the request to the target parent */ + target_exp = tgt->ltd_exp; } -rename: - CDEBUG(D_INODE, "RENAME "DFID"/%.*s to "DFID"/%.*s\n", - PFID(&op_data->op_fid1), (int)oldlen, old, - PFID(&op_data->op_fid2), (int)newlen, new); + rc = md_rename(target_exp, op_data, old, oldlen, new, newlen, + request); - rc = md_rename(tgt->ltd_exp, op_data, old, oldlen, new, newlen, - request); - if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { - ptlrpc_req_finished(*request); - *request = NULL; - goto retry; - } - - if (rc && rc != -EXDEV) + if (rc != 0 && rc != -EXDEV) RETURN(rc); body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); @@ -2272,28 +2019,13 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, if (likely(!(body->mbo_valid & OBD_MD_MDS))) RETURN(rc); - op_data->op_fid4 = body->mbo_fid1; + CDEBUG(D_INODE, "%s: try rename to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); + op_data->op_fid4 = body->mbo_fid1; ptlrpc_req_finished(*request); *request = NULL; - - tgt = lmv_find_target(lmv, &op_data->op_fid4); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - if (fid_is_sane(&op_data->op_fid4)) { - /* cancel LOOKUP lock of target on target parent */ - if (tgt != tp_tgt) { - rc = lmv_early_cancel(exp, tp_tgt, op_data, - tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID4); - if (rc != 0) - RETURN(rc); - } - } - - goto rename; + goto retry_rename; } static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, @@ -2305,9 +2037,8 @@ static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, int rc = 0; ENTRY; - CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", - PFID(&op_data->op_fid1), op_data->op_attr.ia_valid, - op_data->op_xvalid); + CDEBUG(D_INODE, "SETATTR for 
"DFID", valid 0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); op_data->op_flags |= MF_MDC_CANCEL_FID1; tgt = lmv_find_target(lmv, &op_data->op_fid1); @@ -2336,228 +2067,146 @@ static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } -struct stripe_dirent { - struct page *sd_page; - struct lu_dirpage *sd_dp; - struct lu_dirent *sd_ent; - bool sd_eof; -}; - -struct lmv_dir_ctxt { - struct lmv_obd *ldc_lmv; - struct md_op_data *ldc_op_data; - struct md_callback *ldc_cb_op; - __u64 ldc_hash; - int ldc_count; - struct stripe_dirent ldc_stripes[0]; -}; - -static inline void stripe_dirent_unload(struct stripe_dirent *stripe) -{ - if (stripe->sd_page) { - kunmap(stripe->sd_page); - put_page(stripe->sd_page); - stripe->sd_page = NULL; - stripe->sd_ent = NULL; - } -} - -static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt) -{ - int i; - - for (i = 0; i < ctxt->ldc_count; i++) - stripe_dirent_unload(&ctxt->ldc_stripes[i]); -} - -/* if @ent is dummy, or . .., get next */ -static struct lu_dirent *stripe_dirent_get(struct lmv_dir_ctxt *ctxt, - struct lu_dirent *ent, - int stripe_index) -{ - for (; ent; ent = lu_dirent_next(ent)) { - /* Skip dummy entry */ - if (le16_to_cpu(ent->lde_namelen) == 0) - continue; - - /* skip . and .. for other stripes */ - if (stripe_index && - (strncmp(ent->lde_name, ".", - le16_to_cpu(ent->lde_namelen)) == 0 || - strncmp(ent->lde_name, "..", - le16_to_cpu(ent->lde_namelen)) == 0)) - continue; - - if (le64_to_cpu(ent->lde_hash) >= ctxt->ldc_hash) - break; - } - - return ent; -} - -static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, - struct stripe_dirent *stripe, - int stripe_index) +/** + * Get current minimum entry from striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to @hash_offset, from all of sub-stripes, and it is + * only being called for striped directory. + * + * \param[in] exp export of LMV + * \param[in] op_data parameters transferred beween client MD stack + * stripe_information will be included in this + * parameter + * \param[in] cb_op ldlm callback being used in enqueue in + * mdc_read_page + * \param[in] hash_offset the hash value, which is used to locate + * minum(closet) dir entry + * \param[in|out] stripe_offset the caller use this to indicate the stripe + * index of last entry, so to avoid hash conflict + * between stripes. It will also be used to + * return the stripe index of current dir entry. 
+ * \param[in|out] entp the minum entry and it also is being used + * to input the last dir entry to resolve the + * hash conflict + * + * \param[out] ppage the page which holds the minum entry + * + * \retval = 0 get the entry successfully + * negative errno (< 0) does not get the entry + */ +static int lmv_get_min_striped_entry(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 hash_offset, int *stripe_offset, + struct lu_dirent **entp, + struct page **ppage) { - struct md_op_data *op_data = ctxt->ldc_op_data; - struct lmv_oinfo *oinfo; - struct lu_fid fid = op_data->op_fid1; - struct inode *inode = op_data->op_data; - struct lmv_tgt_desc *tgt; - struct lu_dirent *ent = stripe->sd_ent; - __u64 hash = ctxt->ldc_hash; - int rc = 0; - + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tgt; + int stripe_count; + struct lu_dirent *min_ent = NULL; + struct page *min_page = NULL; + int min_idx = 0; + int i; + int rc = 0; ENTRY; - LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]); - LASSERT(!ent); - - do { - if (stripe->sd_page) { - __u64 end = le64_to_cpu(stripe->sd_dp->ldp_hash_end); - - /* @hash should be the last dirent hash */ - LASSERTF(hash <= end, - "ctxt@%p stripe@%p hash %llx end %llx\n", - ctxt, stripe, hash, end); - /* unload last page */ - stripe_dirent_unload(stripe); - /* eof */ - if (end == MDS_DIR_END_OFF) { - stripe->sd_eof = true; - break; - } - hash = end; - } - - oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; - if (!oinfo->lmo_root) { - rc = -ENOENT; - break; - } + stripe_count = lsm->lsm_md_stripe_count; + for (i = 0; i < stripe_count; i++) { + struct lu_dirent *ent = NULL; + struct page *page = NULL; + struct lu_dirpage *dp; + __u64 stripe_hash = hash_offset; - tgt = lmv_get_target(ctxt->ldc_lmv, oinfo->lmo_mds, NULL); - if (IS_ERR(tgt)) { - rc = PTR_ERR(tgt); - break; - } + tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + /* op_data will be shared by each stripe, so we need + * reset these value for each stripe */ + op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid; + op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid; + op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root; +next: + rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash, + &page); + if (rc != 0) + GOTO(out, rc); - /* op_data is shared by stripes, reset after use */ - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_fid2 = oinfo->lmo_fid; - op_data->op_data = oinfo->lmo_root; + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; - rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_cb_op, hash, - &stripe->sd_page); + if (le64_to_cpu(ent->lde_hash) < hash_offset) + continue; - op_data->op_fid1 = fid; - op_data->op_fid2 = fid; - op_data->op_data = inode; + if (le64_to_cpu(ent->lde_hash) == hash_offset && + (*entp == ent || i < *stripe_offset)) + continue; - if (rc) + /* skip . and .. for other stripes */ + if (i != 0 && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; break; + } - stripe->sd_dp = page_address(stripe->sd_page); - ent = stripe_dirent_get(ctxt, lu_dirent_start(stripe->sd_dp), - stripe_index); - /* in case a page filled with ., .. 
and dummy, read next */ - } while (!ent); - - stripe->sd_ent = ent; - if (rc) { - LASSERT(!ent); - /* treat error as eof, so dir can be partially accessed */ - stripe->sd_eof = true; - LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, " - "directory is partially accessed!\n", - PFID(&ctxt->ldc_op_data->op_fid1), stripe_index, - rc); - } - - RETURN(ent); -} - -static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; - - rc = lmv_check_connect(obd); - if (rc != 0) - RETURN(rc); - - tgt = lmv_find_target(lmv, &data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - data->op_flags |= MF_MDC_CANCEL_FID1; - rc = md_file_resync(tgt->ltd_exp, data); - RETURN(rc); -} - -/** - * Get dirent with the closest hash for striped directory - * - * This function will search the dir entry, whose hash value is the - * closest(>=) to hash from all of sub-stripes, and it is only being called - * for striped directory. - * - * \param[in] ctxt dir read context - * - * \retval dirent get the entry successfully - * NULL does not get the entry, normally it means - * it reaches the end of the directory, while read - * stripe dirent error is ignored to allow partial - * access. - */ -static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) -{ - struct stripe_dirent *stripe; - struct lu_dirent *ent = NULL; - int i; - int min = -1; + if (ent == NULL) { + stripe_hash = le64_to_cpu(dp->ldp_hash_end); - /* TODO: optimize with k-way merge sort */ - for (i = 0; i < ctxt->ldc_count; i++) { - stripe = &ctxt->ldc_stripes[i]; - if (stripe->sd_eof) - continue; + kunmap(page); + put_page(page); + page = NULL; - if (!stripe->sd_ent) { - stripe_dirent_load(ctxt, stripe, i); - if (!stripe->sd_ent) { - LASSERT(stripe->sd_eof); + /* reach the end of current stripe, go to next stripe */ + if (stripe_hash == MDS_DIR_END_OFF) continue; - } + else + goto next; } - if (min == -1 || - le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) > - le64_to_cpu(stripe->sd_ent->lde_hash)) { - min = i; - if (le64_to_cpu(stripe->sd_ent->lde_hash) == - ctxt->ldc_hash) - break; + if (min_ent != NULL) { + if (le64_to_cpu(min_ent->lde_hash) > + le64_to_cpu(ent->lde_hash)) { + min_ent = ent; + kunmap(min_page); + put_page(min_page); + min_idx = i; + min_page = page; + } else { + kunmap(page); + put_page(page); + page = NULL; + } + } else { + min_ent = ent; + min_page = page; + min_idx = i; } } - if (min != -1) { - stripe = &ctxt->ldc_stripes[min]; - ent = stripe->sd_ent; - /* pop found dirent */ - stripe->sd_ent = stripe_dirent_get(ctxt, lu_dirent_next(ent), - min); +out: + if (*ppage != NULL) { + kunmap(*ppage); + put_page(*ppage); } - - return ent; + *stripe_offset = min_idx; + *entp = min_ent; + *ppage = min_page; + RETURN(rc); } /** - * Build dir entry page for striped directory + * Build dir entry page from a striped directory * * This function gets one entry by @offset from a striped directory. It will * read entries from all of stripes, and choose one closest to the required @@ -2566,11 +2215,12 @@ static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) * and .. in a directory. * 2. op_data will be shared by all of stripes, instead of allocating new * one, so need to restore before reusing. + * 3. release the entry page if that is not being chosen. 
* * \param[in] exp obd export refer to LMV * \param[in] op_data hold those MD parameters of read_entry * \param[in] cb_op ldlm callback being used in enqueue in mdc_read_entry - * \param[in] offset starting hash offset + * \param[out] ldp the entry being read * \param[out] ppage the page holding the entry. Note: because the entry * will be accessed in upper layer, so we need hold the * page until the usages of entry is finished, see @@ -2579,117 +2229,124 @@ static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) * retval =0 if get entry successfully * <0 cannot get entry */ -static int lmv_striped_read_page(struct obd_export *exp, +static int lmv_read_striped_page(struct obd_export *exp, struct md_op_data *op_data, struct md_callback *cb_op, __u64 offset, struct page **ppage) { - struct page *page = NULL; - struct lu_dirpage *dp; - void *start; - struct lu_dirent *ent; - struct lu_dirent *last_ent; - int stripe_count; - struct lmv_dir_ctxt *ctxt; - struct lu_dirent *next = NULL; - __u16 ent_size; - size_t left_bytes; - int rc = 0; + struct lu_fid master_fid = op_data->op_fid1; + struct inode *master_inode = op_data->op_data; + __u64 hash_offset = offset; + struct lu_dirpage *dp; + struct page *min_ent_page = NULL; + struct page *ent_page = NULL; + struct lu_dirent *ent; + void *area; + int ent_idx = 0; + struct lu_dirent *min_ent = NULL; + struct lu_dirent *last_ent; + size_t left_bytes; + int rc; ENTRY; /* Allocate a page and read entries from all of stripes and fill * the page by hash order */ - page = alloc_page(GFP_KERNEL); - if (!page) + ent_page = alloc_page(GFP_KERNEL); + if (ent_page == NULL) RETURN(-ENOMEM); /* Initialize the entry page */ - dp = kmap(page); + dp = kmap(ent_page); memset(dp, 0, sizeof(*dp)); dp->ldp_hash_start = cpu_to_le64(offset); + dp->ldp_flags |= LDF_COLLIDE; - start = dp + 1; + area = dp + 1; left_bytes = PAGE_SIZE - sizeof(*dp); - ent = start; + ent = area; last_ent = ent; + do { + __u16 ent_size; - /* initalize dir read context */ - stripe_count = op_data->op_mea1->lsm_md_stripe_count; - OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); - if (!ctxt) - GOTO(free_page, rc = -ENOMEM); - ctxt->ldc_lmv = &exp->exp_obd->u.lmv; - ctxt->ldc_op_data = op_data; - ctxt->ldc_cb_op = cb_op; - ctxt->ldc_hash = offset; - ctxt->ldc_count = stripe_count; - - while (1) { - next = lmv_dirent_next(ctxt); - - /* end of directory */ - if (!next) { - ctxt->ldc_hash = MDS_DIR_END_OFF; - break; + /* Find the minum entry from all sub-stripes */ + rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset, + &ent_idx, &min_ent, + &min_ent_page); + if (rc != 0) + GOTO(out, rc); + + /* If it can not get minum entry, it means it already reaches + * the end of this directory */ + if (min_ent == NULL) { + last_ent->lde_reclen = 0; + hash_offset = MDS_DIR_END_OFF; + GOTO(out, rc); } - ctxt->ldc_hash = le64_to_cpu(next->lde_hash); - ent_size = le16_to_cpu(next->lde_reclen); + ent_size = le16_to_cpu(min_ent->lde_reclen); - /* the last entry lde_reclen is 0, but it might not be the last - * one of this temporay dir page */ - if (!ent_size) + /* the last entry lde_reclen is 0, but it might not + * the end of this entry of this temporay entry */ + if (ent_size == 0) ent_size = lu_dirent_calc_size( - le16_to_cpu(next->lde_namelen), - le32_to_cpu(next->lde_attrs)); - /* page full */ - if (ent_size > left_bytes) - break; + le16_to_cpu(min_ent->lde_namelen), + le32_to_cpu(min_ent->lde_attrs)); + if (ent_size > left_bytes) { + last_ent->lde_reclen = 
cpu_to_le16(0); + hash_offset = le64_to_cpu(min_ent->lde_hash); + GOTO(out, rc); + } - memcpy(ent, next, ent_size); + memcpy(ent, min_ent, ent_size); /* Replace . with master FID and Replace .. with the parent FID * of master object */ if (strncmp(ent->lde_name, ".", le16_to_cpu(ent->lde_namelen)) == 0 && le16_to_cpu(ent->lde_namelen) == 1) - fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1); + fid_cpu_to_le(&ent->lde_fid, &master_fid); else if (strncmp(ent->lde_name, "..", le16_to_cpu(ent->lde_namelen)) == 0 && le16_to_cpu(ent->lde_namelen) == 2) fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); - CDEBUG(D_INODE, "entry %.*s hash %#llx\n", - le16_to_cpu(ent->lde_namelen), ent->lde_name, - le64_to_cpu(ent->lde_hash)); - left_bytes -= ent_size; ent->lde_reclen = cpu_to_le16(ent_size); last_ent = ent; ent = (void *)ent + ent_size; - }; - - last_ent->lde_reclen = 0; - - if (ent == start) - dp->ldp_flags |= LDF_EMPTY; - else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash)) - dp->ldp_flags |= LDF_COLLIDE; - dp->ldp_flags = cpu_to_le32(dp->ldp_flags); - dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash); - - put_lmv_dir_ctxt(ctxt); - OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + hash_offset = le64_to_cpu(min_ent->lde_hash); + if (hash_offset == MDS_DIR_END_OFF) { + last_ent->lde_reclen = 0; + break; + } + } while (1); +out: + if (min_ent_page != NULL) { + kunmap(min_ent_page); + put_page(min_ent_page); + } - *ppage = page; + if (unlikely(rc != 0)) { + __free_page(ent_page); + ent_page = NULL; + } else { + if (ent == area) + dp->ldp_flags |= LDF_EMPTY; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(hash_offset); + } - RETURN(0); + /* We do not want to allocate md_op_data during each + * dir entry reading, so op_data will be shared by every stripe, + * then we need to restore it back to original value before + * return to the upper layer */ + op_data->op_fid1 = master_fid; + op_data->op_fid2 = master_fid; + op_data->op_data = master_inode; -free_page: - kunmap(page); - __free_page(page); + *ppage = ent_page; - return rc; + RETURN(rc); } int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, @@ -2704,7 +2361,7 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, ENTRY; if (unlikely(lsm != NULL)) { - rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); + rc = lmv_read_striped_page(exp, op_data, cb_op, offset, ppage); RETURN(rc); } @@ -2742,34 +2399,68 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, * negative errno if failed. 
*/ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct lmv_tgt_desc *parent_tgt; - struct mdt_body *body; - int rc; - + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct lmv_tgt_desc *parent_tgt = NULL; + struct mdt_body *body; + int rc; + int stripe_index = 0; + struct lmv_stripe_md *lsm = op_data->op_mea1; ENTRY; +retry_unlink: + /* For striped dir, we need to locate the parent as well */ + if (lsm != NULL) { + struct lmv_tgt_desc *tmp; + + LASSERT(op_data->op_name != NULL && + op_data->op_namelen != 0); + + tmp = lmv_locate_target_for_name(lmv, lsm, + op_data->op_name, + op_data->op_namelen, + &op_data->op_fid1, + &op_data->op_mds); + + /* return -EBADFD means unknown hash type, might + * need try all sub-stripe here */ + if (IS_ERR(tmp) && PTR_ERR(tmp) != -EBADFD) + RETURN(PTR_ERR(tmp)); + + /* Note: both migrating dir and unknown hash dir need to + * try all of sub-stripes, so we need start search the + * name from stripe 0, but migrating dir is already handled + * inside lmv_locate_target_for_name(), so we only check + * unknown hash type directory here */ + if (!lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { + struct lmv_oinfo *oinfo; + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + } + } + +try_next_stripe: + /* Send unlink requests to the MDT where the child is located */ + if (likely(!fid_is_zero(&op_data->op_fid2))) + tgt = lmv_find_target(lmv, &op_data->op_fid2); + else if (lsm != NULL) + tgt = lmv_get_target(lmv, op_data->op_mds, NULL); + else + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); -retry: - parent_tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(parent_tgt)) - RETURN(PTR_ERR(parent_tgt)); - - if (likely(!fid_is_zero(&op_data->op_fid2))) { - tgt = lmv_find_target(lmv, &op_data->op_fid2); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - } else { - tgt = parent_tgt; - } - /* * If child's fid is given, cancel unused locks for it if it is from * another export than parent. @@ -2779,28 +2470,49 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, */ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - if (parent_tgt != tgt) + /* + * Cancel FULL locks on child (fid3). + */ + parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (parent_tgt != tgt) { rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); + } rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); - if (rc) + MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); + if (rc != 0) RETURN(rc); CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); rc = md_unlink(tgt->ltd_exp, op_data, request); - if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + if (rc != 0 && rc != -EREMOTE && rc != -ENOENT) + RETURN(rc); + + /* Try next stripe if it is needed. 
*/ + if (rc == -ENOENT && lsm != NULL && lmv_need_try_all_stripes(lsm)) { + struct lmv_oinfo *oinfo; + + stripe_index++; + if (stripe_index >= lsm->lsm_md_stripe_count) + RETURN(rc); + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + ptlrpc_req_finished(*request); *request = NULL; - goto retry; - } - if (rc != -EREMOTE) - RETURN(rc); + goto try_next_stripe; + } body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); if (body == NULL) @@ -2810,23 +2522,40 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, if (likely(!(body->mbo_valid & OBD_MD_MDS))) RETURN(rc); - /* This is a remote object, try remote MDT. */ + CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); + + /* This is a remote object, try remote MDT, Note: it may + * try more than 1 time here, Considering following case + * /mnt/lustre is root on MDT0, remote1 is on MDT1 + * 1. Initially A does not know where remote1 is, it send + * unlink RPC to MDT0, MDT0 return -EREMOTE, it will + * resend unlink RPC to MDT1 (retry 1st time). + * + * 2. During the unlink RPC in flight, + * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 + * and create new remote1, but on MDT0 + * + * 3. MDT1 get unlink RPC(from A), then do remote lock on + * /mnt/lustre, then lookup get fid of remote1, and find + * it is remote dir again, and replay -EREMOTE again. + * + * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). + * + * In theory, it might try unlimited time here, but it should + * be very rare case. */ op_data->op_fid2 = body->mbo_fid1; ptlrpc_req_finished(*request); *request = NULL; - tgt = lmv_find_target(lmv, &op_data->op_fid2); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - goto retry; + goto retry_unlink; } static int lmv_precleanup(struct obd_device *obd) { ENTRY; libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); - fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); + fld_client_proc_fini(&obd->u.lmv.lmv_fld); lprocfs_obd_cleanup(obd); lprocfs_free_md_stats(obd); RETURN(0); @@ -2902,96 +2631,6 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, RETURN(-EINVAL); } -static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, - int *__rcs, struct ptlrpc_request_set *_set) -{ - struct obd_device *obddev = class_exp2obd(exp); - struct ptlrpc_request_set *set = _set; - struct lmv_obd *lmv = &obddev->u.lmv; - int tgt_count = lmv->desc.ld_tgt_count; - struct fid_array *fat, **fas = NULL; - int i, rc, **rcs = NULL; - - if (!set) { - set = ptlrpc_prep_set(); - if (!set) - RETURN(-ENOMEM); - } - - /* split FIDs by targets */ - OBD_ALLOC(fas, sizeof(fas) * tgt_count); - if (fas == NULL) - GOTO(out, rc = -ENOMEM); - OBD_ALLOC(rcs, sizeof(int *) * tgt_count); - if (rcs == NULL) - GOTO(out_fas, rc = -ENOMEM); - - for (i = 0; i < fa->fa_nr; i++) { - unsigned int idx; - - rc = lmv_fld_lookup(lmv, &fa->fa_fids[i], &idx); - if (rc) { - CDEBUG(D_OTHER, "can't lookup "DFID": rc = %d\n", - PFID(&fa->fa_fids[i]), rc); - continue; - } - LASSERT(idx < tgt_count); - if (!fas[idx]) - OBD_ALLOC(fas[idx], offsetof(struct fid_array, - fa_fids[fa->fa_nr])); - if (!fas[idx]) - GOTO(out, rc = -ENOMEM); - if (!rcs[idx]) - OBD_ALLOC(rcs[idx], sizeof(int) * fa->fa_nr); - if (!rcs[idx]) - GOTO(out, rc = -ENOMEM); - - fat = fas[idx]; - fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; - } - - for (i = 0; i < tgt_count; i++) { - fat = fas[i]; - if (!fat || fat->fa_nr == 0) - 
continue; - rc = md_rmfid(lmv->tgts[i]->ltd_exp, fat, rcs[i], set); - } - - rc = ptlrpc_set_wait(NULL, set); - if (rc == 0) { - int j = 0; - for (i = 0; i < tgt_count; i++) { - fat = fas[i]; - if (!fat || fat->fa_nr == 0) - continue; - /* copy FIDs back */ - memcpy(fa->fa_fids + j, fat->fa_fids, - fat->fa_nr * sizeof(struct lu_fid)); - /* copy rcs back */ - memcpy(__rcs + j, rcs[i], fat->fa_nr * sizeof(**rcs)); - j += fat->fa_nr; - } - } - if (set != _set) - ptlrpc_set_destroy(set); - -out: - for (i = 0; i < tgt_count; i++) { - if (fas && fas[i]) - OBD_FREE(fas[i], offsetof(struct fid_array, - fa_fids[fa->fa_nr])); - if (rcs && rcs[i]) - OBD_FREE(rcs[i], sizeof(int) * fa->fa_nr); - } - if (rcs) - OBD_FREE(rcs, sizeof(int *) * tgt_count); -out_fas: - if (fas) - OBD_FREE(fas, sizeof(fas) * tgt_count); - - RETURN(rc); -} - /** * Asynchronously set by key a value associated with a LMV device. * @@ -3066,15 +2705,13 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, else lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); - lsm->lsm_md_migrate_offset = le32_to_cpu(lmm1->lmv_migrate_offset); - lsm->lsm_md_migrate_hash = le32_to_cpu(lmm1->lmv_migrate_hash); cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, sizeof(lsm->lsm_md_pool_name)); if (cplen >= sizeof(lsm->lsm_md_pool_name)) RETURN(-E2BIG); - CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %#x " + CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %d" "layout_version %d\n", lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, lsm->lsm_md_layout_version); @@ -3083,22 +2720,10 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, for (i = 0; i < stripe_count; i++) { fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, &lmm1->lmv_stripe_fids[i]); - /* - * set default value -1, so lmv_locate_tgt() knows this stripe - * target is not initialized. 
- */ - lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1; - if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) - continue; - rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, &lsm->lsm_md_oinfo[i].lmo_mds); - if (rc == -ENOENT) - continue; - - if (rc) + if (rc != 0) RETURN(rc); - CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); } @@ -3121,9 +2746,12 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, /* Free memmd */ if (lsm != NULL && lmm == NULL) { int i; - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - if (lsm->lsm_md_oinfo[i].lmo_root) + /* For migrating inode, the master stripe and master + * object will be the same, so do not need iput, see + * ll_update_lsm_md */ + if (!(lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && + i == 0) && lsm->lsm_md_oinfo[i].lmo_root != NULL) iput(lsm->lsm_md_oinfo[i].lmo_root); } lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); @@ -3335,34 +2963,35 @@ int lmv_clear_open_replay_data(struct obd_export *exp, int lmv_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo) { - struct md_op_data *op_data = &minfo->mi_data; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *ptgt; - struct lmv_tgt_desc *ctgt; - int rc; - + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt = NULL; + struct lmv_tgt_desc *ctgt = NULL; + int rc; ENTRY; if (!fid_is_sane(&op_data->op_fid2)) RETURN(-EINVAL); - ptgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + ptgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); if (IS_ERR(ptgt)) RETURN(PTR_ERR(ptgt)); - ctgt = lmv_find_target(lmv, &op_data->op_fid2); + ctgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); if (IS_ERR(ctgt)) RETURN(PTR_ERR(ctgt)); - /* remote object needs two RPCs to lookup and getattr, considering the - * complexity, don't support statahead for now. + /* + * if child is on remote MDT, we need 2 async RPCs to fetch both LOOKUP + * lock on parent, and UPDATE lock on child MDT, which makes all + * complicated. Considering remote dir is rare case, and not supporting + * it in statahead won't cause any issue, drop its support for now. 
*/ if (ptgt != ctgt) - RETURN(-EREMOTE); + RETURN(-ENOTSUPP); rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); - RETURN(rc); } @@ -3390,7 +3019,7 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, const struct lmv_oinfo *oinfo; LASSERT(lsm != NULL); - oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); if (IS_ERR(oinfo)) return PTR_ERR(oinfo); @@ -3467,9 +3096,6 @@ static int lmv_merge_attr(struct obd_export *exp, for (i = 0; i < lsm->lsm_md_stripe_count; i++) { struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; - if (!inode) - continue; - CDEBUG(D_INFO, "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", PFID(&lsm->lsm_md_oinfo[i].lmo_fid), @@ -3530,7 +3156,6 @@ struct md_ops lmv_md_ops = { .m_setattr = lmv_setattr, .m_setxattr = lmv_setxattr, .m_fsync = lmv_fsync, - .m_file_resync = lmv_file_resync, .m_read_page = lmv_read_page, .m_unlink = lmv_unlink, .m_init_ea_size = lmv_init_ea_size, @@ -3546,7 +3171,6 @@ struct md_ops lmv_md_ops = { .m_revalidate_lock = lmv_revalidate_lock, .m_get_fid_from_lsm = lmv_get_fid_from_lsm, .m_unpackmd = lmv_unpackmd, - .m_rmfid = lmv_rmfid, }; static int __init lmv_init(void) diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c index dc35e7d9d9e66..37c22a92de716 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,58 +32,63 @@ #define DEBUG_SUBSYSTEM S_CLASS +#include #include -#include +#include #include #include #include "lmv_internal.h" -static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) +#ifndef CONFIG_PROC_FS +static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +#else +static int lmv_numobd_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lmv_desc *desc; + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + LASSERT(dev != NULL); desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_tgt_count); + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; } -LUSTRE_RO_ATTR(numobd); +LPROC_SEQ_FOPS_RO(lmv_numobd); -static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lmv_activeobd_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lmv_desc *desc; + struct obd_device *dev = (struct obd_device *)m->private; + struct lmv_desc *desc; + LASSERT(dev != NULL); desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_active_tgt_count); + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; } -LUSTRE_RO_ATTR(activeobd); +LPROC_SEQ_FOPS_RO(lmv_activeobd); -static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lmv_desc *desc; + struct obd_device *dev = (struct obd_device*)m->private; + struct lmv_obd 
*lmv; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%s\n", desc->ld_uuid.uuid); + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); + return 0; } -LUSTRE_RO_ATTR(desc_uuid); +LPROC_SEQ_FOPS_RO(lmv_desc_uuid); -#ifdef CONFIG_PROC_FS static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; struct lmv_obd *lmv = &dev->u.lmv; while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos]) + if (lmv->tgts[*pos] != NULL) return lmv->tgts[*pos]; + ++*pos; } @@ -92,6 +97,7 @@ static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) static void lmv_tgt_seq_stop(struct seq_file *p, void *v) { + return; } static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) @@ -101,8 +107,9 @@ static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) ++*pos; while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos]) + if (lmv->tgts[*pos] != NULL) return lmv->tgts[*pos]; + ++*pos; } @@ -113,12 +120,10 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v) { struct lmv_tgt_desc *tgt = v; - if (!tgt) + if (tgt == NULL) return 0; - - seq_printf(p, "%u: %s %sACTIVE\n", - tgt->ltd_idx, tgt->ltd_uuid.uuid, - tgt->ltd_active ? "" : "IN"); + seq_printf(p, "%u: %s %sACTIVE\n", tgt->ltd_idx, + tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN"); return 0; } @@ -143,7 +148,21 @@ static int lmv_target_seq_open(struct inode *inode, struct file *file) return 0; } -static const struct proc_ops lmv_proc_target_fops = { +LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid); + +struct lprocfs_vars lprocfs_lmv_obd_vars[] = { + { .name = "numobd", + .fops = &lmv_numobd_fops }, + { .name = "activeobd", + .fops = &lmv_activeobd_fops }, + { .name = "uuid", + .fops = &lmv_uuid_fops }, + { .name = "desc_uuid", + .fops = &lmv_desc_uuid_fops }, + { NULL } +}; + +const struct proc_ops lmv_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lmv_target_seq_open, .proc_read = seq_read, @@ -151,39 +170,3 @@ static const struct proc_ops lmv_proc_target_fops = { .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ - -static struct attribute *lmv_attrs[] = { - &lustre_attr_activeobd.attr, - &lustre_attr_desc_uuid.attr, - &lustre_attr_numobd.attr, - NULL, -}; - -int lmv_tunables_init(struct obd_device *obd) -{ - int rc; - - obd->obd_ktype.default_attrs = lmv_attrs; - rc = lprocfs_obd_setup(obd, true); - if (rc) - goto out_failed; -#ifdef CONFIG_PROC_FS - rc = lprocfs_alloc_md_stats(obd, 0); - if (rc) { - lprocfs_obd_cleanup(obd); - goto out_failed; - } - - rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", - 0444, &lmv_proc_target_fops, obd); - if (rc) { - lprocfs_free_md_stats(obd); - lprocfs_obd_cleanup(obd); - CWARN("%s: error adding LMV target_obd file: rc = %d\n", - obd->obd_name, rc); - rc = 0; - } -#endif /* CONFIG_PROC_FS */ -out_failed: - return rc; -} diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile index dae11b1647cbe..e74389ed4c3e3 100644 --- a/drivers/staging/lustrefsx/lustre/lov/Makefile +++ b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_LUSTREFSX_FS) += lov.o lov-y := lov_dev.o lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o -lov-y += lov_request.o lovsub_dev.o lovsub_object.o -lov-y += lproc_lov.o +lov-y += lov_request.o lovsub_dev.o lovsub_lock.o lovsub_object.o +lov-y += lovsub_page.o lproc_lov.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules 
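
The lproc_lmv.c hunk above replaces the sysfs kobject attributes with the older procfs seq_file handlers (lmv_numobd_seq_show() and friends, wrapped by LPROC_SEQ_FOPS_RO()). For readers less familiar with that pattern, the following is a minimal, self-contained sketch of a read-only procfs file built on the same seq_file machinery. It is illustrative only: it assumes a kernel that provides struct proc_ops (as the surrounding code in this series already does), and the demo_* names are hypothetical, not part of Lustre.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Hypothetical value to expose; stands in for something like desc->ld_tgt_count. */
static unsigned int demo_count = 42;

/* The ->show() callback: same shape as lmv_numobd_seq_show(). */
static int demo_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%u\n", demo_count);
	return 0;
}

/* single_open() wires the show callback into the seq_file core. */
static int demo_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_proc_show, NULL);
}

static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_proc_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init demo_init(void)
{
	/* Creates a read-only /proc/demo_numobd entry. */
	proc_create("demo_numobd", 0444, NULL, &demo_proc_ops);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_numobd", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The LPROC_SEQ_FOPS_RO() macro used in the hunk generates essentially this open/read/release boilerplate from a single *_seq_show() function, which is why the patch only needs to supply the show callbacks plus an lprocfs_vars table.
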
diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h index 62ee46daed68f..0e84ab38e189a 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -81,6 +81,7 @@ struct lovsub_device; struct lovsub_object; +struct lovsub_lock; enum lov_device_flags { LOV_DEV_INITIALIZED = 1 << 0 @@ -90,12 +91,6 @@ enum lov_device_flags { * Upper half. */ -/* Data-on-MDT array item in lov_device::ld_md_tgts[] */ -struct lovdom_device { - struct cl_device *ldm_mdc; - int ldm_idx; -}; - struct lov_device { /* * XXX Locking of lov-private data is missing. @@ -106,13 +101,6 @@ struct lov_device { __u32 ld_target_nr; struct lovsub_device **ld_target; __u32 ld_flags; - - /* Data-on-MDT devices */ - __u32 ld_md_tgts_nr; - struct lovdom_device *ld_md_tgts; - struct obd_device *ld_lmv; - /* LU site for subdevices */ - struct lu_site ld_site; }; /** @@ -141,48 +129,15 @@ static inline char *llt2str(enum lov_layout_type llt) return ""; } -/** - * Return lov_layout_entry_type associated with a given composite layout - * entry. - */ -static inline __u32 lov_entry_type(struct lov_stripe_md_entry *lsme) -{ - if ((lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_RAID0) || - (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT)) - return lov_pattern(lsme->lsme_pattern); - return 0; -} - -struct lov_layout_entry; -struct lov_object; -struct lov_lock_sub; - -struct lov_comp_layout_entry_ops { - int (*lco_init)(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, unsigned int index, - const struct cl_object_conf *conf, - struct lov_layout_entry *lle); - void (*lco_fini)(const struct lu_env *env, - struct lov_layout_entry *lle); - int (*lco_getattr)(const struct lu_env *env, struct lov_object *obj, - unsigned int index, struct lov_layout_entry *lle, - struct cl_attr **attr); -}; - struct lov_layout_raid0 { unsigned lo_nr; - /** - * record the stripe no before the truncate size, used for setting OST - * object size for truncate. LU-14128. - */ - int lo_trunc_stripeno; /** * When this is true, lov_object::lo_attr contains * valid up to date attributes for a top-level * object. This field is reset to 0 when attributes of * any sub-object change. */ - bool lo_attr_valid; + int lo_attr_valid; /** * Array of sub-objects. Allocated when top-object is * created (lov_init_raid0()). @@ -210,38 +165,6 @@ struct lov_layout_raid0 { struct cl_attr lo_attr; }; -struct lov_layout_dom { - /* keep this always at first place so DOM layout entry - * can be addressed also as RAID0 after initialization. 
- */ - struct lov_layout_raid0 lo_dom_r0; - struct lovsub_object *lo_dom; - struct lov_oinfo *lo_loi; -}; - -struct lov_layout_entry { - __u32 lle_type; - unsigned int lle_valid:1; - struct lu_extent *lle_extent; - struct lov_stripe_md_entry *lle_lsme; - struct lov_comp_layout_entry_ops *lle_comp_ops; - union { - struct lov_layout_raid0 lle_raid0; - struct lov_layout_dom lle_dom; - }; -}; - -struct lov_mirror_entry { - unsigned short lre_mirror_id; - unsigned short lre_preferred:1, - lre_stale:1, /* set if any components is stale */ - lre_valid:1; /* set if at least one of components - * in this mirror is valid */ - unsigned short lre_start; /* index to lo_entries, start index of - * this mirror */ - unsigned short lre_end; /* end index of this mirror */ -}; - /** * lov-specific file state. * @@ -257,7 +180,7 @@ struct lov_mirror_entry { * function corresponding to the current layout type. */ struct lov_object { - struct cl_object lo_cl; + struct cl_object lo_cl; /** * Serializes object operations with transitions between layout types. * @@ -297,37 +220,13 @@ struct lov_object { } released; struct lov_layout_composite { /** - * flags of lov_comp_md_v1::lcm_flags. Mainly used - * by FLR. - */ - uint32_t lo_flags; - /** - * For FLR: index of preferred mirror to read. - * Preferred mirror is initialized by the preferred - * bit of lsme. It can be changed when the preferred - * is inaccessible. - * In order to make lov_lsm_entry() return the same - * mirror in the same IO context, it's only possible - * to change the preferred mirror when the - * lo_active_ios reaches zero. - */ - int lo_preferred_mirror; - /** - * For FLR: the lock to protect access to - * lo_preferred_mirror. + * Current valid entry count of lo_entries. */ - spinlock_t lo_write_lock; - /** - * For FLR: Number of (valid) mirrors. - */ - unsigned lo_mirror_count; - struct lov_mirror_entry *lo_mirrors; - /** - * Current entry count of lo_entries, include - * invalid entries. 
- */ - unsigned int lo_entry_count; - struct lov_layout_entry *lo_entries; + unsigned int lo_entry_count; + struct lov_layout_entry { + struct lu_extent lle_extent; + struct lov_layout_raid0 lle_raid0; + } *lo_entries; } composite; } u; /** @@ -337,80 +236,11 @@ struct lov_object { struct task_struct *lo_owner; }; -static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_type == LLT_COMP); - LASSERTF(i < lov->u.composite.lo_entry_count, - "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); - - return &lov->u.composite.lo_entries[i].lle_raid0; -} - -static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_lsm != NULL); - LASSERT(i < lov->lo_lsm->lsm_entry_count); - - return lov->lo_lsm->lsm_entries[i]; -} - -static inline unsigned lov_flr_state(const struct lov_object *lov) -{ - if (lov->lo_type != LLT_COMP) - return LCM_FL_NONE; - - return lov->u.composite.lo_flags & LCM_FL_FLR_MASK; -} - -static inline bool lov_is_flr(const struct lov_object *lov) -{ - return lov_flr_state(lov) != LCM_FL_NONE; -} - -static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_type == LLT_COMP); - LASSERTF(i < lov->u.composite.lo_entry_count, - "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); - - return &lov->u.composite.lo_entries[i]; -} - -#define lov_for_layout_entry(lov, entry, start, end) \ - for (entry = lov_entry(lov, start); \ - entry <= lov_entry(lov, end); entry++) - -#define lov_foreach_layout_entry(lov, entry) \ - lov_for_layout_entry(lov, entry, 0, \ - (lov)->u.composite.lo_entry_count - 1) - -#define lov_foreach_mirror_layout_entry(lov, entry, lre) \ - lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end) - -static inline struct lov_mirror_entry * -lov_mirror_entry(struct lov_object *lov, int i) -{ - LASSERT(i < lov->u.composite.lo_mirror_count); - return &lov->u.composite.lo_mirrors[i]; -} - -#define lov_foreach_mirror_entry(lov, lre) \ - for (lre = lov_mirror_entry(lov, 0); \ - lre <= lov_mirror_entry(lov, \ - lov->u.composite.lo_mirror_count - 1); \ - lre++) - -static inline unsigned -lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry) -{ - struct lov_layout_entry *first = &lov->u.composite.lo_entries[0]; - unsigned index = (unsigned)(entry - first); - - LASSERT(entry >= first); - LASSERT(index < lov->u.composite.lo_entry_count); - - return index; -} +#define lov_foreach_layout_entry(lov, entry) \ + for (entry = &lov->u.composite.lo_entries[0]; \ + entry < &lov->u.composite.lo_entries \ + [lov->u.composite.lo_entry_count]; \ + entry++) /** * State lov_lock keeps for each sub-lock. @@ -440,8 +270,6 @@ struct lov_page { struct cl_page_slice lps_cl; /** layout_entry + stripe index, composed using lov_comp_index() */ unsigned int lps_index; - /* the layout gen when this page was created */ - __u32 lps_layout_gen; }; /* @@ -460,6 +288,13 @@ struct lovsub_object { int lso_index; }; +/** + * Lock state at lovsub layer. + */ +struct lovsub_lock { + struct cl_lock_slice lss_cl; +}; + /** * Describe the environment settings for sublocks. */ @@ -468,6 +303,11 @@ struct lov_sublock_env { struct cl_io *lse_io; }; +struct lovsub_page { + struct cl_page_slice lsb_cl; +}; + + struct lov_thread_info { struct cl_object_conf lti_stripe_conf; struct lu_fid lti_fid; @@ -516,26 +356,6 @@ struct lov_io_sub { struct lov_io { /** super-class */ struct cl_io_slice lis_cl; - - /** - * FLR: index to lo_mirrors. 
Valid only if lov_is_flr() returns true. - * - * The mirror index of this io. Preserved over cl_io_init() - * if io->ci_ndelay_tried is greater than zero. - */ - int lis_mirror_index; - /** - * FLR: the layout gen when lis_mirror_index was cached. The - * mirror index makes sense only when the layout gen doesn't - * change. - */ - int lis_mirror_layout_gen; - - /** - * fields below this will be initialized in lov_io_init(). - */ - unsigned lis_preserved; - /** * Pointer to the object slice. This is a duplicate of * lov_io::lis_cl::cis_object. @@ -578,7 +398,6 @@ struct lov_io { * All sub-io's created in this lov_io. */ struct list_head lis_subios; - }; struct lov_session { @@ -597,6 +416,7 @@ extern struct kmem_cache *lov_object_kmem; extern struct kmem_cache *lov_thread_kmem; extern struct kmem_cache *lov_session_kmem; +extern struct kmem_cache *lovsub_lock_kmem; extern struct kmem_cache *lovsub_object_kmem; int lov_object_init (const struct lu_env *env, struct lu_object *obj, @@ -607,6 +427,8 @@ int lov_lock_init (const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, const struct cl_io *io); int lov_io_init (const struct lu_env *env, struct cl_object *obj, struct cl_io *io); +int lovsub_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, const struct cl_io *io); @@ -624,6 +446,8 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, int lov_page_init (const struct lu_env *env, struct cl_object *ob, struct cl_page *page, pgoff_t index); +int lovsub_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, pgoff_t index); int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index); int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, @@ -637,27 +461,11 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env, struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); int lov_page_stripe(const struct cl_page *page); -bool lov_page_is_empty(const struct cl_page *page); int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); -int lov_io_layout_at(struct lov_io *lio, __u64 offset); #define lov_foreach_target(lov, var) \ for (var = 0; var < lov_targets_nr(lov); ++var) -static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i) -{ - return &lov_lse(io->lis_object, i)->lsme_extent; -} - -/** - * For layout entries within @ext. - */ -#define lov_foreach_io_layout(ind, lio, ext) \ - for (ind = lov_io_layout_at(lio, (ext)->e_start); \ - ind >= 0 && \ - lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \ - ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end)) - /***************************************************************************** * * Type conversions. 
@@ -767,6 +575,22 @@ static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) return container_of0(obj, struct lovsub_object, lso_cl.co_lu); } +static inline struct lovsub_lock * +cl2lovsub_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lovsub_lock, lss_cl); +} + +static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + slice = cl_lock_at(lock, &lovsub_device_type); + LASSERT(slice != NULL); + return cl2lovsub_lock(slice); +} + static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) { LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); @@ -779,6 +603,13 @@ static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) return container_of0(slice, struct lov_page, lps_cl); } +static inline struct lovsub_page * +cl2lovsub_page(const struct cl_page_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lovsub_page, lsb_cl); +} + static inline struct lov_io *cl2lov_io(const struct lu_env *env, const struct cl_io_slice *ios) { @@ -803,6 +634,23 @@ static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) return info; } +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + /* lov_pack.c */ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, struct lov_stripe_md *lsm, struct lov_user_md __user *lump, diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c index 1faef7ad76afa..2506c39ec7296 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,37 +46,43 @@ struct kmem_cache *lov_object_kmem; struct kmem_cache *lov_thread_kmem; struct kmem_cache *lov_session_kmem; +struct kmem_cache *lovsub_lock_kmem; struct kmem_cache *lovsub_object_kmem; struct lu_kmem_descr lov_caches[] = { - { - .ckd_cache = &lov_lock_kmem, - .ckd_name = "lov_lock_kmem", - .ckd_size = sizeof(struct lov_lock) - }, - { - .ckd_cache = &lov_object_kmem, - .ckd_name = "lov_object_kmem", - .ckd_size = sizeof(struct lov_object) - }, - { - .ckd_cache = &lov_thread_kmem, - .ckd_name = "lov_thread_kmem", - .ckd_size = sizeof(struct lov_thread_info) - }, - { - .ckd_cache = &lov_session_kmem, - .ckd_name = "lov_session_kmem", - .ckd_size = sizeof(struct lov_session) - }, - { - .ckd_cache = &lovsub_object_kmem, - .ckd_name = "lovsub_object_kmem", - .ckd_size = sizeof(struct lovsub_object) - }, - { - .ckd_cache = NULL - } + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof (struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof (struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof (struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof (struct lov_session) + }, + { + .ckd_cache = &lovsub_lock_kmem, + .ckd_name = "lovsub_lock_kmem", + .ckd_size = sizeof (struct lovsub_lock) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof (struct lovsub_object) + }, + { + .ckd_cache = NULL + } }; /***************************************************************************** @@ -91,7 +97,7 @@ static void *lov_key_init(const struct lu_context *ctx, struct lov_thread_info *info; OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); - if (!info) + if (info == NULL) info = ERR_PTR(-ENOMEM); return info; } @@ -104,9 +110,9 @@ static void lov_key_fini(const struct lu_context *ctx, } struct lu_context_key lov_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = lov_key_init, - .lct_fini = lov_key_fini + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini }; static void *lov_session_key_init(const struct lu_context *ctx, @@ -115,180 +121,113 @@ static void *lov_session_key_init(const struct lu_context *ctx, struct lov_session *info; OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); - if (!info) + if (info == NULL) info = ERR_PTR(-ENOMEM); return info; } static void lov_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) + struct lu_context_key *key, void *data) { - struct lov_session *info = data; - - OBD_SLAB_FREE_PTR(info, lov_session_kmem); + struct lov_session *info = data; + OBD_SLAB_FREE_PTR(info, lov_session_kmem); } struct lu_context_key lov_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = lov_session_key_init, - .lct_fini = lov_session_key_fini + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini }; /* type constructor/destructor: lov_type_{init,fini,start,stop}() */ LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); - -static int lov_mdc_dev_init(const struct lu_env *env, struct lov_device *ld, - struct lu_device *mdc_dev, __u32 idx, __u32 nr) -{ - struct cl_device *cl; - - ENTRY; - cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, - mdc_dev); - if (IS_ERR(cl)) - RETURN(PTR_ERR(cl)); - - ld->ld_md_tgts[nr].ldm_mdc = 
cl; - ld->ld_md_tgts[nr].ldm_idx = idx; - RETURN(0); -} - static struct lu_device *lov_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - struct lov_device *ld = lu2lov_dev(d); - int i; - - LASSERT(ld->ld_lov != NULL); + int i; + struct lov_device *ld = lu2lov_dev(d); - if (ld->ld_lmv) { - class_decref(ld->ld_lmv, "lov", d); - ld->ld_lmv = NULL; - } + LASSERT(ld->ld_lov != NULL); + if (ld->ld_target == NULL) + RETURN(NULL); - if (ld->ld_md_tgts) { - for (i = 0; i < ld->ld_md_tgts_nr; i++) { - if (!ld->ld_md_tgts[i].ldm_mdc) - continue; + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; - cl_stack_fini(env, ld->ld_md_tgts[i].ldm_mdc); - ld->ld_md_tgts[i].ldm_mdc = NULL; - ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc = NULL; - } - } - - if (ld->ld_target) { - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; - - lsd = ld->ld_target[i]; - if (lsd) { - cl_stack_fini(env, lovsub2cl_dev(lsd)); - ld->ld_target[i] = NULL; - } - } - } - RETURN(NULL); + lsd = ld->ld_target[i]; + if (lsd != NULL) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + RETURN(NULL); } static int lov_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) + const char *name, struct lu_device *next) { - struct lov_device *ld = lu2lov_dev(d); - int i; - int rc = 0; - - /* check all added already MDC subdevices and initialize them */ - for (i = 0; i < ld->ld_md_tgts_nr; i++) { - struct obd_device *mdc; - __u32 idx; - - mdc = ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc; - idx = ld->ld_lov->lov_mdc_tgts[i].lmtd_index; - - if (!mdc) - continue; - - rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, i); - if (rc) { - CERROR("%s: failed to add MDC %s as target: rc = %d\n", - d->ld_obd->obd_name, - obd_uuid2str(&mdc->obd_uuid), rc); - GOTO(out_err, rc); - } - } - - if (!ld->ld_target) - RETURN(0); - - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; - struct cl_device *cl; - struct lov_tgt_desc *desc; - - desc = ld->ld_lov->lov_tgts[i]; - if (!desc) - continue; - - cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, - desc->ltd_obd->obd_lu_dev); - if (IS_ERR(cl)) - GOTO(out_err, rc = PTR_ERR(cl)); + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + LASSERT(d->ld_site != NULL); + if (ld->ld_target == NULL) + RETURN(rc); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (desc == NULL) + continue; + + cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) { + rc = PTR_ERR(cl); + break; + } + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } - lsd = cl2lovsub_dev(cl); - ld->ld_target[i] = lsd; - } - ld->ld_flags |= LOV_DEV_INITIALIZED; - RETURN(0); + if (rc) + lov_device_fini(env, d); + else + ld->ld_flags |= LOV_DEV_INITIALIZED; -out_err: - lu_device_fini(d); - RETURN(rc); + RETURN(rc); } /* Free the lov specific data created for the back end lu_device. 
*/ static struct lu_device *lov_device_free(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { struct lov_device *ld = lu2lov_dev(d); const int nr = ld->ld_target_nr; - lu_site_fini(&ld->ld_site); - cl_device_fini(lu2cl_dev(d)); - if (ld->ld_target) { + if (ld->ld_target != NULL) OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); - ld->ld_target = NULL; - } - if (ld->ld_md_tgts) { - OBD_FREE(ld->ld_md_tgts, - sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); - ld->ld_md_tgts = NULL; - } - /* free array of MDCs */ - if (ld->ld_lov->lov_mdc_tgts) { - OBD_FREE(ld->ld_lov->lov_mdc_tgts, - sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); - ld->ld_lov->lov_mdc_tgts = NULL; - } OBD_FREE_PTR(ld); return NULL; } static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) + __u32 index) { - struct lov_device *ld = lu2lov_dev(dev); - - ENTRY; + struct lov_device *ld = lu2lov_dev(dev); + ENTRY; - if (ld->ld_target[index]) { - cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); - ld->ld_target[index] = NULL; - } - EXIT; + if (ld->ld_target[index] != NULL) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; } static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) @@ -306,7 +245,7 @@ static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) const size_t sz = sizeof(newd[0]); OBD_ALLOC(newd, tgt_size * sz); - if (newd) { + if (newd != NULL) { if (sub_size > 0) { memcpy(newd, dev->ld_target, sub_size * sz); OBD_FREE(dev->ld_target, sub_size * sz); @@ -323,31 +262,32 @@ static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) } static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) + __u32 index) { - struct obd_device *obd = dev->ld_obd; - struct lov_device *ld = lu2lov_dev(dev); - struct lov_tgt_desc *tgt; - struct lovsub_device *lsd; - struct cl_device *cl; - int rc; - - ENTRY; - - lov_tgts_getref(obd); - - tgt = obd->u.lov.lov_tgts[index]; - LASSERT(tgt != NULL); - LASSERT(tgt->ltd_obd != NULL); + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + ENTRY; + + obd_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } - if (!tgt->ltd_obd->obd_set_up) { - CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); - RETURN(-EINVAL); - } + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + LASSERT(dev->ld_site != NULL); - rc = lov_expand_targets(env, ld); - if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { - cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, tgt->ltd_obd->obd_lu_dev); if (!IS_ERR(cl)) { lsd = cl2lovsub_dev(cl); @@ -359,239 +299,94 @@ static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, rc = PTR_ERR(cl); } } - - lov_tgts_putref(obd); - - RETURN(rc); -} - -/** - * Add new MDC target device in LOV. - * - * This function is part of the configuration log processing. It adds new MDC - * device to the MDC device array indexed by their indexes. 
- * - * \param[in] env execution environment - * \param[in] d LU device of LOV device - * \param[in] mdc MDC device to add - * \param[in] idx MDC device index - * - * \retval 0 if successful - * \retval negative value on error - */ -static int lov_add_mdc_target(const struct lu_env *env, struct lu_device *d, - struct obd_device *mdc, __u32 idx) -{ - struct lov_device *ld = lu2lov_dev(d); - struct obd_device *lov_obd = d->ld_obd; - struct obd_device *lmv_obd; - int next; - int rc = 0; - - ENTRY; - - LASSERT(mdc != NULL); - if (ld->ld_md_tgts_nr == LOV_MDC_TGT_MAX) { - /* - * If the maximum value of LOV_MDC_TGT_MAX will become too - * small then all MD target handling must be rewritten in LOD - * manner, check lod_add_device() and related functionality. - */ - CERROR("%s: cannot serve more than %d MDC devices\n", - lov_obd->obd_name, LOV_MDC_TGT_MAX); - RETURN(-ERANGE); - } - - /* - * grab FLD from lmv, do that here, when first MDC is added - * to be sure LMV is set up and can be found - */ - if (!ld->ld_lmv) { - next = 0; - while ((lmv_obd = class_devices_in_group(&lov_obd->obd_uuid, - &next)) != NULL) { - if ((strncmp(lmv_obd->obd_type->typ_name, - LUSTRE_LMV_NAME, - strlen(LUSTRE_LMV_NAME)) == 0)) - break; - } - if (!lmv_obd) { - CERROR("%s: cannot find LMV OBD by UUID (%s)\n", - lov_obd->obd_name, - obd_uuid2str(&lmv_obd->obd_uuid)); - RETURN(-ENODEV); - } - spin_lock(&lmv_obd->obd_dev_lock); - class_incref(lmv_obd, "lov", ld); - spin_unlock(&lmv_obd->obd_dev_lock); - ld->ld_lmv = lmv_obd; - } - - LASSERT(lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc == - NULL); - - if (ld->ld_flags & LOV_DEV_INITIALIZED) { - rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, - ld->ld_md_tgts_nr); - if (rc) { - CERROR("%s: failed to add MDC %s as target: rc = %d\n", - lov_obd->obd_name, obd_uuid2str(&mdc->obd_uuid), - rc); - RETURN(rc); - } - } - - lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc = mdc; - lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_index = idx; - ld->ld_md_tgts_nr++; - - RETURN(rc); + obd_putref(obd); + RETURN(rc); } static int lov_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) + struct lu_device *d, struct lustre_cfg *cfg) { - struct obd_device *obd = d->ld_obd; - int cmd; - int rc; - int gen; - u32 index; - - lov_tgts_getref(obd); - - cmd = cfg->lcfg_command; - - rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); - if (rc < 0) - GOTO(out, rc); - - switch (cmd) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - rc = lov_cl_add_target(env, d, index); - if (rc != 0) - lov_del_target(d->ld_obd, index, NULL, 0); - break; - case LCFG_LOV_DEL_OBD: - lov_cl_del_target(env, d, index); - break; - case LCFG_ADD_MDC: - { - struct obd_device *mdc; - struct obd_uuid tgt_uuid; - - /* - * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID - * 2:0 3:1 4:lustre-MDT0000-mdc_UUID - */ - if (LUSTRE_CFG_BUFLEN(cfg, 1) > sizeof(tgt_uuid.uuid)) - GOTO(out, rc = -EINVAL); - - obd_str2uuid(&tgt_uuid, lustre_cfg_buf(cfg, 1)); - - rc = kstrtou32(lustre_cfg_buf(cfg, 2), 10, &index); - if (rc) - GOTO(out, rc); - - mdc = class_find_client_obd(&tgt_uuid, LUSTRE_MDC_NAME, - &obd->obd_uuid); - if (!mdc) - GOTO(out, rc = -ENODEV); - rc = lov_add_mdc_target(env, d, mdc, index); - break; - } - } -out: - lov_tgts_putref(obd); - RETURN(rc); + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + __u32 index; + + obd_getref(obd); + + cmd = cfg->lcfg_command; + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if 
(rc == 0) { + switch(cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + } + } + obd_putref(obd); + RETURN(rc); } static const struct lu_device_operations lov_lu_ops = { - .ldo_object_alloc = lov_object_alloc, - .ldo_process_config = lov_process_config, + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, }; static struct lu_device *lov_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) + struct lu_device_type *t, + struct lustre_cfg *cfg) { - struct lu_device *d; - struct lov_device *ld; - struct obd_device *obd; - int rc; + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; - OBD_ALLOC_PTR(ld); - if (!ld) - RETURN(ERR_PTR(-ENOMEM)); + OBD_ALLOC_PTR(ld); + if (ld == NULL) + RETURN(ERR_PTR(-ENOMEM)); cl_device_init(&ld->ld_cl, t); d = lov2lu_dev(ld); d->ld_ops = &lov_lu_ops; - /* setup the LOV OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd != NULL); - rc = lov_setup(obd, cfg); - if (rc) - GOTO(out, rc); - - /* Alloc MDC devices array */ - /* XXX: need dynamic allocation at some moment */ - OBD_ALLOC(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); - if (!ld->ld_md_tgts) - GOTO(out, rc = -ENOMEM); - - ld->ld_md_tgts_nr = 0; - - ld->ld_lov = &obd->u.lov; - OBD_ALLOC(ld->ld_lov->lov_mdc_tgts, - sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); - if (!ld->ld_lov->lov_mdc_tgts) - GOTO(out_md_tgts, rc = -ENOMEM); - - rc = lu_site_init(&ld->ld_site, d); - if (rc != 0) - GOTO(out_mdc_tgts, rc); - - rc = lu_site_init_finish(&ld->ld_site); - if (rc != 0) - GOTO(out_site, rc); - - RETURN(d); -out_site: - lu_site_fini(&ld->ld_site); -out_mdc_tgts: - OBD_FREE(ld->ld_lov->lov_mdc_tgts, - sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); - ld->ld_lov->lov_mdc_tgts = NULL; -out_md_tgts: - OBD_FREE(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); - ld->ld_md_tgts = NULL; -out: - OBD_FREE_PTR(ld); + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) { + lov_device_free(env, d); + RETURN(ERR_PTR(rc)); + } - return ERR_PTR(rc); + ld->ld_lov = &obd->u.lov; + RETURN(d); } static const struct lu_device_type_operations lov_device_type_ops = { - .ldto_init = lov_type_init, - .ldto_fini = lov_type_fini, + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, - .ldto_start = lov_type_start, - .ldto_stop = lov_type_stop, + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, - .ldto_device_alloc = lov_device_alloc, - .ldto_device_free = lov_device_free, + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, - .ldto_device_init = lov_device_init, - .ldto_device_fini = lov_device_fini + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini }; struct lu_device_type lov_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOV_NAME, - .ldt_ops = &lov_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD }; /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c index 1d388637d0235..5b50b0a9294dc 100644 --- 
a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,6 +41,9 @@ #include #include +#include +#include + #include "lov_internal.h" static inline void @@ -50,10 +53,8 @@ lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) dst->e_end = le64_to_cpu(src->e_end); } -/* - * Find minimum stripe maxbytes value. For inactive or - * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. - */ +/* Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. */ static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) { struct obd_import *imp; @@ -63,12 +64,11 @@ static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) return maxbytes; imp = tgt->ltd_obd->u.cli.cl_import; - if (!imp) + if (imp == NULL) return maxbytes; spin_lock(&imp->imp_lock); - if ((imp->imp_state == LUSTRE_IMP_FULL || - imp->imp_state == LUSTRE_IMP_IDLE) && + if (imp->imp_state == LUSTRE_IMP_FULL && (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && imp->imp_connect_data.ocd_maxbytes > 0) maxbytes = imp->imp_connect_data.ocd_maxbytes; @@ -93,8 +93,7 @@ static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size, return -EINVAL; } - if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT && - lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { CERROR("bad striping pattern\n"); lov_dump_lmm_common(D_WARNING, lmm); return -EINVAL; @@ -185,7 +184,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); OBD_ALLOC_LARGE(lsme, lsme_size); - if (!lsme) + if (lsme == NULL) RETURN(ERR_PTR(-ENOMEM)); lsme->lsme_magic = magic; @@ -196,7 +195,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); - if (pool_name) { + if (pool_name != NULL) { size_t pool_name_len; pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, @@ -205,22 +204,12 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, GOTO(out_lsme, rc = -E2BIG); } - /* with Data-on-MDT set maxbytes to stripe size */ - if (lsme_is_dom(lsme)) { - if (maxbytes) { - lov_bytes = lsme->lsme_stripe_size; - goto out_dom1; - } else { - goto out_dom2; - } - } - for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi; struct lov_tgt_desc *ltd; OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); - if (!loi) + if (loi == NULL) GOTO(out_lsme, rc = -ENOMEM); lsme->lsme_oinfo[i] = loi; @@ -241,7 +230,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, } ltd = lov->lov_tgts[loi->loi_ost_idx]; - if (!ltd) { + if (ltd == NULL) { CERROR("%s: OST index %d missing\n", (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); lov_dump_lmm_v1(D_WARNING, lmm); @@ -253,21 +242,17 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, min_stripe_maxbytes = lov_bytes; } - if (maxbytes) { - if (min_stripe_maxbytes == 0) - min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + if 
(min_stripe_maxbytes == 0) + min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; - if (stripe_count == 0) - stripe_count = lov->desc.ld_tgt_count; + lov_bytes = min_stripe_maxbytes * stripe_count; - if (min_stripe_maxbytes <= LLONG_MAX / stripe_count) - lov_bytes = min_stripe_maxbytes * stripe_count; + if (maxbytes != NULL) { + if (lov_bytes < min_stripe_maxbytes) /* handle overflow */ + *maxbytes = MAX_LFS_FILESIZE; else - lov_bytes = MAX_LFS_FILESIZE; -out_dom1: - *maxbytes = min_t(loff_t, lov_bytes, MAX_LFS_FILESIZE); + *maxbytes = lov_bytes; } -out_dom2: return lsme; @@ -275,7 +260,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi = lsme->lsme_oinfo[i]; - if (loi) + if (loi != NULL) OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); } OBD_FREE_LARGE(lsme, lsme_size); @@ -308,7 +293,7 @@ lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); OBD_ALLOC(lsm, lsm_size); - if (!lsm) + if (lsm == NULL) GOTO(out_lsme, rc = -ENOMEM); atomic_set(&lsm->lsm_refc, 1); @@ -399,8 +384,7 @@ lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, unsigned int stripe_count; stripe_count = le16_to_cpu(lmm->lmm_stripe_count); - if (stripe_count == 0 && - lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT) + if (stripe_count == 0) RETURN(ERR_PTR(-EINVAL)); /* un-instantiated lmm contains no ost id info, i.e. lov_ost_data_v1 */ if (!inited) @@ -443,7 +427,7 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); OBD_ALLOC(lsm, lsm_size); - if (!lsm) + if (lsm == NULL) return ERR_PTR(-ENOMEM); atomic_set(&lsm->lsm_refc, 1); @@ -451,8 +435,6 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); lsm->lsm_entry_count = entry_count; - lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count); - lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags); lsm->lsm_is_released = true; lsm->lsm_maxbytes = LLONG_MIN; @@ -481,22 +463,16 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm->lsm_entries[i] = lsme; lsme->lsme_id = le32_to_cpu(lcme->lcme_id); lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); - if (lsme->lsme_flags & LCME_FL_NOSYNC) - lsme->lsme_timestamp = - le64_to_cpu(lcme->lcme_timestamp); lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); if (i == entry_count - 1) { lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + maxbytes; - /* - * the last component hasn't been defined, or - * lsm_maxbytes overflowed. - */ - if (!lsme_is_dom(lsme) && - (lsme->lsme_extent.e_end != LUSTRE_EOF || - lsm->lsm_maxbytes < - (loff_t)lsme->lsme_extent.e_start)) + /* the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
*/ + if (lsme->lsme_extent.e_end != LUSTRE_EOF || + lsm->lsm_maxbytes < + (loff_t)lsme->lsme_extent.e_start) lsm->lsm_maxbytes = MAX_LFS_FILESIZE; } } @@ -505,7 +481,7 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) out_lsm: for (i = 0; i < entry_count; i++) - if (lsm->lsm_entries[i]) + if (lsm->lsm_entries[i] != NULL) lsme_free(lsm->lsm_entries[i]); OBD_FREE(lsm, lsm_size); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h index a1cbea9a5c4d4..524b0a4eac681 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,7 +34,7 @@ #define LOV_INTERNAL_H #include -#include +#include /* If we are unable to get the maximum object size from the OST in * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using @@ -47,7 +47,6 @@ struct lov_stripe_md_entry { u32 lsme_magic; u32 lsme_flags; u32 lsme_pattern; - u64 lsme_timestamp; u32 lsme_stripe_size; u16 lsme_stripe_count; u16 lsme_layout_gen; @@ -55,11 +54,6 @@ struct lov_stripe_md_entry { struct lov_oinfo *lsme_oinfo[]; }; -static inline bool lsme_is_dom(struct lov_stripe_md_entry *lsme) -{ - return (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT); -} - static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst, struct lov_stripe_md_entry *src) { @@ -81,10 +75,8 @@ struct lov_stripe_md { struct ost_id lsm_oi; u32 lsm_magic; u32 lsm_layout_gen; - u16 lsm_flags; + u32 lsm_entry_count; bool lsm_is_released; - u16 lsm_mirror_count; - u16 lsm_entry_count; struct lov_stripe_md_entry *lsm_entries[]; }; @@ -127,7 +119,7 @@ static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) stripe_count = 0; size += sizeof(*lsme); - size += lov_mds_md_size(stripe_count, + size += lov_mds_md_size(lsme->lsme_stripe_count, lsme->lsme_magic); } @@ -195,22 +187,19 @@ void lsm_free(struct lov_stripe_md *lsm); }) #elif BITS_PER_LONG == 32 # define lov_do_div64(n, base) ({ \ - uint64_t __num = (n); \ uint64_t __rem; \ if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ - int __remainder; \ - LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), \ - "64 bit lov division %llu / %llu\n", \ - __num, (uint64_t)(base)); \ - __remainder = __num & (LOV_MIN_STRIPE_SIZE - 1); \ - __num >>= LOV_MIN_STRIPE_BITS; \ - __rem = do_div(__num, (base) >> LOV_MIN_STRIPE_BITS); \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ + "division %llu / %llu\n", (n), (uint64_t)(base)); \ + __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ + (n) >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ __rem <<= LOV_MIN_STRIPE_BITS; \ __rem += __remainder; \ } else { \ - __rem = do_div(__num, base); \ + __rem = do_div(n, base); \ } \ - (n) = __num; \ __rem; \ }) #endif @@ -257,7 +246,6 @@ int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, struct ost_lvb *lvb, __u64 *kms_place); /* lov_offset.c */ -loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index); u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, int stripeno); int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, @@ -276,8 
+264,6 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, int lov_fini_statfs_set(struct lov_request_set *set); /* lov_obd.c */ -void lov_tgts_getref(struct obd_device *obd); -void lov_tgts_putref(struct obd_device *obd); void lov_stripe_lock(struct lov_stripe_md *md); void lov_stripe_unlock(struct lov_stripe_md *md); void lov_fix_desc(struct lov_desc *desc); @@ -287,13 +273,13 @@ void lov_fix_desc_pattern(__u32 *val); void lov_fix_desc_qos_maxage(__u32 *val); __u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count); -int lov_connect_obd(struct obd_device *obd, u32 index, int activate, - struct obd_connect_data *data); +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data); int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - u32 *indexp, int *genp); -int lov_del_target(struct obd_device *obd, u32 index, - struct obd_uuid *uuidp, int gen); + __u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen); /* lov_pack.c */ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, @@ -312,13 +298,14 @@ void lsm_free_plain(struct lov_stripe_md *lsm); void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); /* lproc_lov.c */ -int lov_tunables_init(struct obd_device *obd); +extern const struct proc_ops lov_proc_target_fops; +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_lov_obd_vars[]; +#endif /* lov_cl.c */ extern struct lu_device_type lov_device_type; -#define LOV_MDC_TGT_MAX 256 - /* pools */ extern struct cfs_hash_ops pool_hash_operations; /* ost_pool methods */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c index c6eb7121b5db9..5544a9744b73e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_io.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,7 +56,7 @@ static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) OBD_ALLOC_PTR(sub); } - if (sub) { + if (sub != NULL) { INIT_LIST_HEAD(&sub->sub_list); INIT_LIST_HEAD(&sub->sub_linkage); sub->sub_subio_index = index; @@ -82,22 +82,13 @@ static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, cl_io_fini(sub->sub_env, &sub->sub_io); - if (sub->sub_env && !IS_ERR(sub->sub_env)) { + if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { cl_env_put(sub->sub_env, &sub->sub_refcheck); sub->sub_env = NULL; } EXIT; } -static inline bool -is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) -{ - struct lov_layout_composite *comp = &lov->u.composite; - struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; - - return (index >= lre->lre_start && index <= lre->lre_end); -} - static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, struct lov_io_sub *sub) { @@ -115,17 +106,10 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, !lov_r0(lov, index)->lo_sub[stripe])) RETURN(-EIO); - LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index), - DFID "iot = %d, index = %d, mirror = %d\n", - PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, - lio->lis_mirror_index); - /* obtain new environment */ sub->sub_env = cl_env_get(&sub->sub_refcheck); - if (IS_ERR(sub->sub_env)) { + if (IS_ERR(sub->sub_env)) result = PTR_ERR(sub->sub_env); - RETURN(result); - } sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); sub_io = &sub->sub_io; @@ -138,10 +122,7 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_type = io->ci_type; sub_io->ci_no_srvlock = io->ci_no_srvlock; sub_io->ci_noatime = io->ci_noatime; - sub_io->ci_lock_no_expand = io->ci_lock_no_expand; - sub_io->ci_ndelay = io->ci_ndelay; - sub_io->ci_layout_version = io->ci_layout_version; - sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; + sub_io->ci_pio = io->ci_pio; result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); @@ -168,7 +149,7 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, if (rc == 0) { sub = lov_sub_alloc(lio, index); - if (!sub) + if (sub == NULL) GOTO(out, rc = -ENOMEM); rc = lov_io_sub_init(env, lio, sub); @@ -183,8 +164,6 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, out: if (rc < 0) sub = ERR_PTR(rc); - else - sub->sub_io.ci_noquota = lio->lis_cl.cis_io->ci_noquota; RETURN(sub); } @@ -220,270 +199,9 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, RETURN(0); } -/** - * Decide if it will need write intent RPC - */ -static int lov_io_mirror_write_intent(struct lov_io *lio, - struct lov_object *obj, struct cl_io *io) -{ - struct lov_layout_composite *comp = &obj->u.composite; - struct lu_extent *ext = &io->ci_write_intent; - struct lov_mirror_entry *lre; - struct lov_mirror_entry *primary; - struct lov_layout_entry *lle; - size_t count = 0; - ENTRY; - - *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; - io->ci_need_write_intent = 0; - - if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || - cl_io_is_mkwrite(io))) - RETURN(0); - - /* - * FLR: check if it needs to send a write intent RPC to server. - * Writing to sync_pending file needs write intent RPC to change - * the file state back to write_pending, so that the layout version - * can be increased when the state changes to sync_pending at a later - * time. 
Otherwise there exists a chance that an evicted client may - * dirty the file data while resync client is working on it. - * Designated I/O is allowed for resync workload. - */ - if (lov_flr_state(obj) == LCM_FL_RDONLY || - (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && - io->ci_designated_mirror == 0)) { - io->ci_need_write_intent = 1; - RETURN(0); - } - - LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); - LASSERT(comp->lo_preferred_mirror >= 0); - - /* - * need to iterate all components to see if there are - * multiple components covering the writing component - */ - primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; - LASSERT(!primary->lre_stale); - lov_foreach_mirror_layout_entry(obj, lle, primary) { - LASSERT(lle->lle_valid); - if (!lu_extent_is_overlapped(ext, lle->lle_extent)) - continue; - - ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start); - ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end); - ++count; - } - if (count == 0) { - CERROR(DFID ": cannot find any valid components covering " - "file extent "DEXT", mirror: %d\n", - PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), - primary->lre_mirror_id); - RETURN(-EIO); - } - - count = 0; - lov_foreach_mirror_entry(obj, lre) { - if (lre == primary) - continue; - - lov_foreach_mirror_layout_entry(obj, lle, lre) { - if (!lle->lle_valid) - continue; - - if (lu_extent_is_overlapped(ext, lle->lle_extent)) { - ++count; - break; - } - } - } - - CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " - "modify file extent "DEXT", iot: %d\n", - PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); - - io->ci_need_write_intent = count > 0; - - RETURN(0); -} - -static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, - struct cl_io *io) -{ - struct lov_layout_composite *comp = &obj->u.composite; - int index; - int i; - int result; - ENTRY; - - if (!lov_is_flr(obj)) { - /* only locks/pages are manipulated for CIT_MISC op, no - * cl_io_loop() will be called, don't check/set mirror info. - */ - if (io->ci_type != CIT_MISC) { - LASSERT(comp->lo_preferred_mirror == 0); - lio->lis_mirror_index = comp->lo_preferred_mirror; - } - io->ci_ndelay = 0; - RETURN(0); - } - - /* transfer the layout version for verification */ - if (io->ci_layout_version == 0) - io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; - - /* find the corresponding mirror for designated mirror IO */ - if (io->ci_designated_mirror > 0) { - struct lov_mirror_entry *entry; - - LASSERT(!io->ci_ndelay); - - CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", - lov_flr_state(obj)); - - if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) && - (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { - /* - * For resync I/O, the ci_layout_version was the layout - * version when resync starts. If it doesn't match the - * current object layout version, it means the layout - * has been changed - */ - RETURN(-ESTALE); - } - - io->ci_layout_version |= LU_LAYOUT_RESYNC; - - index = 0; - lio->lis_mirror_index = -1; - lov_foreach_mirror_entry(obj, entry) { - if (entry->lre_mirror_id == - io->ci_designated_mirror) { - lio->lis_mirror_index = index; - break; - } - - index++; - } - - RETURN(lio->lis_mirror_index < 0 ? 
-EINVAL : 0); - } - - result = lov_io_mirror_write_intent(lio, obj, io); - if (result) - RETURN(result); - - if (io->ci_need_write_intent) { - CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", - PFID(lu_object_fid(lov2lu(obj))), - lio->lis_pos, lio->lis_endpos); - - if (cl_io_is_trunc(io)) { - /** - * for truncate, we uses [size, EOF) to judge whether - * a write intent needs to be send, but we need to - * restore the write extent to [0, size], in truncate, - * the byte in the size position is accessed. - */ - io->ci_write_intent.e_start = 0; - io->ci_write_intent.e_end = - io->u.ci_setattr.sa_attr.lvb_size + 1; - } - /* stop cl_io_init() loop */ - RETURN(1); - } - - if (io->ci_ndelay_tried == 0 || /* first time to try */ - /* reset the mirror index if layout has changed */ - lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { - lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen; - index = lio->lis_mirror_index = comp->lo_preferred_mirror; - } else { - index = lio->lis_mirror_index; - LASSERT(index >= 0); - - /* move mirror index to the next one */ - index = (index + 1) % comp->lo_mirror_count; - } - - for (i = 0; i < comp->lo_mirror_count; i++) { - struct lu_extent ext = { .e_start = lio->lis_pos, - .e_end = lio->lis_pos + 1 }; - struct lov_mirror_entry *lre; - struct lov_layout_entry *lle; - bool found = false; - - lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count]; - if (!lre->lre_valid) - continue; - - lov_foreach_mirror_layout_entry(obj, lle, lre) { - if (!lle->lle_valid) - continue; - - if (lu_extent_is_overlapped(&ext, lle->lle_extent)) { - found = true; - break; - } - } /* each component of the mirror */ - if (found) { - index = (index + i) % comp->lo_mirror_count; - break; - } - } /* each mirror */ - - if (i == comp->lo_mirror_count) { - CERROR(DFID": failed to find a component covering " - "I/O region at %llu\n", - PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos); - - dump_lsm(D_ERROR, obj->lo_lsm); - - RETURN(-EIO); - } - - CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, " - "have retried: %d, mirror count: %d\n", - PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj), - lio->lis_mirror_index, index, io->ci_ndelay_tried, - comp->lo_mirror_count); - - lio->lis_mirror_index = index; - - /* - * FLR: if all mirrors have been tried once, most likely the network - * of this client has been partitioned. We should relinquish CPU for - * a while before trying again. - */ - if (io->ci_ndelay && io->ci_ndelay_tried > 0 && - (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */ - if (signal_pending(current)) - RETURN(-EINTR); - - /** - * we'd set ci_tried_all_mirrors to turn off fast mirror - * switching for read after we've tried all mirrors several - * rounds. - */ - io->ci_tried_all_mirrors = io->ci_ndelay_tried % - (comp->lo_mirror_count * 4) == 0; - } - ++io->ci_ndelay_tried; - - CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", - io->ci_ndelay ? 
"non-" : ""); - - RETURN(0); -} - static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, struct cl_io *io) { - int index; - int result = 0; ENTRY; io->ci_result = 0; @@ -494,45 +212,42 @@ static int lov_io_slice_init(struct lov_io *lio, switch (io->ci_type) { case CIT_READ: case CIT_WRITE: - lio->lis_pos = io->u.ci_rw.crw_pos; - lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_pos = io->u.ci_rw.rw_range.cir_pos; + lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count; lio->lis_io_endpos = lio->lis_endpos; if (cl_io_is_append(io)) { LASSERT(io->ci_type == CIT_WRITE); - /* - * If there is LOV EA hole, then we may cannot locate - * the current file-tail exactly. - */ + /* If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. */ if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & LOV_PATTERN_F_HOLE)) - GOTO(out, result = -EIO); + RETURN(-EIO); lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; } break; - case CIT_SETATTR: - if (cl_io_is_trunc(io)) - lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; - else - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; case CIT_DATA_VERSION: lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; break; - case CIT_FAULT: { - pgoff_t index = io->u.ci_fault.ft_index; - - lio->lis_pos = cl_offset(io->ci_obj, index); - lio->lis_endpos = cl_offset(io->ci_obj, index + 1); - break; - } + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } case CIT_FSYNC: { lio->lis_pos = io->u.ci_fsync.fi_start; @@ -546,84 +261,16 @@ static int lov_io_slice_init(struct lov_io *lio, break; } - case CIT_GLIMPSE: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - - if (lov_flr_state(obj) == LCM_FL_RDONLY && - !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) - /* SoM is accurate, no need glimpse */ - GOTO(out, result = 1); - break; - - case CIT_MISC: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; - - default: - LBUG(); - } - - result = lov_io_mirror_init(lio, obj, io); - if (result) - GOTO(out, result); - - /* check if it needs to instantiate layout */ - if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || - (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) - GOTO(out, result = 0); - - /* - * for truncate, it only needs to instantiate the components - * before the truncated size. - */ - if (cl_io_is_trunc(io)) { - io->ci_write_intent.e_start = 0; - /* for writes, e_end is endpos, the location of the file - * pointer after the write is completed, so it is not accessed. - * For truncate, 'end' is the size, and *is* acccessed. - * In other words, writes are [start, end), but truncate is - * [start, size], where both are included. So add 1 to the - * size when creating the write intent to account for this. 
- */ - io->ci_write_intent.e_end = - io->u.ci_setattr.sa_attr.lvb_size + 1; - } else { - io->ci_write_intent.e_start = lio->lis_pos; - io->ci_write_intent.e_end = lio->lis_endpos; - } - - index = 0; - lov_foreach_io_layout(index, lio, &io->ci_write_intent) { - if (!lsm_entry_inited(obj->lo_lsm, index)) { - io->ci_need_write_intent = 1; - break; - } - } - - if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { - /* - * REINT_SYNC RPC has already tried to instantiate all of the - * components involved, obviously it didn't succeed. Skip this - * mirror for now. The server won't be able to figure out - * which mirror it should instantiate components - */ - CERROR(DFID": trying to instantiate components for designated " - "I/O, file state: %d\n", - PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; - io->ci_need_write_intent = 0; - GOTO(out, result = -EIO); - } - - if (io->ci_need_write_intent) - GOTO(out, result = 1); - - EXIT; + default: + LBUG(); + } -out: - return result; + RETURN(0); } static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) @@ -663,13 +310,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, int index = lov_comp_entry(sub->sub_subio_index); int stripe = lov_comp_stripe(sub->sub_subio_index); + io->ci_pio = parent->ci_pio; switch (io->ci_type) { case CIT_SETATTR: { io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; io->u.ci_setattr.sa_attr_flags = parent->u.ci_setattr.sa_attr_flags; - io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid; - io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid; + io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; io->u.ci_setattr.sa_stripe_index = stripe; io->u.ci_setattr.sa_parent_fid = parent->u.ci_setattr.sa_parent_fid; @@ -708,13 +355,16 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, } case CIT_READ: case CIT_WRITE: { - io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); - io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; + io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask; + io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter; + io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb; + io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file; + io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync; if (cl_io_is_append(parent)) { - io->u.ci_wr.wr_append = 1; + io->u.ci_rw.rw_append = 1; } else { - io->u.ci_rw.crw_pos = start; - io->u.ci_rw.crw_count = end - start; + io->u.ci_rw.rw_range.cir_pos = start; + io->u.ci_rw.rw_range.cir_count = end - start; } break; } @@ -726,8 +376,6 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; break; } - case CIT_GLIMPSE: - case CIT_MISC: default: break; } @@ -735,75 +383,63 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, static loff_t lov_offset_mod(loff_t val, int delta) { - if (val != OBD_OBJECT_EOF) - val += delta; - return val; + if (val != OBD_OBJECT_EOF) + val += delta; + return val; } -static int lov_io_add_sub(const struct lu_env *env, struct lov_io *lio, - struct lov_io_sub *sub, u64 start, u64 end) -{ - int rc; - - end = lov_offset_mod(end, 1); - lov_io_sub_inherit(sub, lio, start, end); - rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); - if (rc != 0) { - cl_io_iter_fini(sub->sub_env, &sub->sub_io); - return rc; - } - - list_add_tail(&sub->sub_linkage, &lio->lis_active); - - return rc; -} static int 
lov_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_io *lio = cl2lov_io(env, ios); struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - struct lov_io_sub *sub; + struct lov_io_sub *sub; + struct lov_layout_entry *le; struct lu_extent ext; int index; int rc = 0; - ENTRY; + ENTRY; ext.e_start = lio->lis_pos; ext.e_end = lio->lis_endpos; - lov_foreach_io_layout(index, lio, &ext) { - struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + index = 0; + lov_foreach_layout_entry(lio->lis_object, le) { struct lov_layout_raid0 *r0 = &le->lle_raid0; u64 start; u64 end; int stripe; - bool tested_trunc_stripe = false; - r0->lo_trunc_stripeno = -1; + index++; + if (!lu_extent_is_overlapped(&ext, &le->lle_extent)) + continue; CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", - index, lsm->lsm_entries[index]->lsme_flags); - if (!lsm_entry_inited(lsm, index)) { - /* - * Read from uninitialized components should return - * zero filled pages. - */ - continue; - } + index - 1, lsm->lsm_entries[index - 1]->lsme_flags); + if (!lsm_entry_inited(lsm, index - 1)) { + /* truncate IO will trigger write intent as well, and + * it's handled in lov_io_setattr_iter_init() */ + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) { + io->ci_need_write_intent = 1; + /* execute it in main thread */ + io->ci_pio = 0; + rc = -ENODATA; + break; + } - if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { - CERROR("I/O to invalid component: %d, mirror: %d\n", - index, lio->lis_mirror_index); - RETURN(-EIO); + /* Read from uninitialized components should return + * zero filled pages. */ + continue; } for (stripe = 0; stripe < r0->lo_nr; stripe++) { - if (!lov_stripe_intersects(lsm, index, stripe, + if (!lov_stripe_intersects(lsm, index - 1, stripe, &ext, &start, &end)) continue; - if (unlikely(!r0->lo_sub[stripe])) { + if (unlikely(r0->lo_sub[stripe] == NULL)) { if (ios->cis_io->ci_type == CIT_READ || ios->cis_io->ci_type == CIT_WRITE || ios->cis_io->ci_type == CIT_FAULT) @@ -812,79 +448,29 @@ static int lov_io_iter_init(const struct lu_env *env, continue; } - if (cl_io_is_trunc(ios->cis_io) && - !tested_trunc_stripe) { - int prev; - u64 tr_start; - - prev = (stripe == 0) ? r0->lo_nr - 1 : - stripe - 1; - /** - * Only involving previous stripe if the - * truncate in this component is at the - * beginning of this stripe. 
- */ - tested_trunc_stripe = true; - if (ext.e_start < lsm->lsm_entries[index]-> - lsme_extent.e_start) { - /* need previous stripe involvement */ - r0->lo_trunc_stripeno = prev; - } else { - tr_start = ext.e_start; - tr_start = lov_do_div64(tr_start, - stripe_width(lsm, index)); - /* tr_start %= stripe_swidth */ - if (tr_start == stripe * lsm-> - lsm_entries[index]-> - lsme_stripe_size) - r0->lo_trunc_stripeno = prev; - } - } - - /* if the last stripe is the trunc stripeno */ - if (r0->lo_trunc_stripeno == stripe) - r0->lo_trunc_stripeno = -1; - + end = lov_offset_mod(end, 1); sub = lov_sub_get(env, lio, - lov_comp_index(index, stripe)); - if (IS_ERR(sub)) - return PTR_ERR(sub); - - rc = lov_io_add_sub(env, lio, sub, start, end); - if (rc != 0) + lov_comp_index(index - 1, stripe)); + if (IS_ERR(sub)) { + rc = PTR_ERR(sub); break; - } - if (rc != 0) - break; - - if (r0->lo_trunc_stripeno != -1) { - stripe = r0->lo_trunc_stripeno; - if (unlikely(!r0->lo_sub[stripe])) { - r0->lo_trunc_stripeno = -1; - continue; - } - sub = lov_sub_get(env, lio, - lov_comp_index(index, stripe)); - if (IS_ERR(sub)) - return PTR_ERR(sub); - - /** - * the prev sub could be used by another truncate, we'd - * skip it. LU-14128 happends when expand truncate + - * read get wrong kms. - */ - if (!list_empty(&sub->sub_linkage)) { - r0->lo_trunc_stripeno = -1; - continue; } - (void)lov_stripe_intersects(lsm, index, stripe, &ext, - &start, &end); - rc = lov_io_add_sub(env, lio, sub, start, end); + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) + cl_io_iter_fini(sub->sub_env, &sub->sub_io); if (rc != 0) break; + CDEBUG(D_VFSTRACE, + "shrink stripe: {%d, %d} range: [%llu, %llu)\n", + index, stripe, start, end); + + list_add_tail(&sub->sub_linkage, &lio->lis_active); } + if (rc != 0) + break; } RETURN(rc); } @@ -892,10 +478,12 @@ static int lov_io_iter_init(const struct lu_env *env, static int lov_io_rw_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *io = ios->cis_io; + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; struct lov_stripe_md_entry *lse; - loff_t start = io->u.ci_rw.crw_pos; + struct cl_io_range *range = &io->u.ci_rw.rw_range; + loff_t start = range->cir_pos; loff_t next; int index; @@ -905,14 +493,14 @@ static int lov_io_rw_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(lov_io_iter_init(env, ios)); - index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos); + index = lov_lsm_entry(lsm, range->cir_pos); if (index < 0) { /* non-existing layout component */ if (io->ci_type == CIT_READ) { - /* - * TODO: it needs to detect the next component and - * then set the next pos - */ + /* TODO: it needs to detect the next component and + * then set the next pos */ io->ci_continue = 0; + /* execute it in main thread */ + io->ci_pio = 0; RETURN(lov_io_iter_init(env, ios)); } @@ -920,10 +508,6 @@ static int lov_io_rw_iter_init(const struct lu_env *env, RETURN(-ENODATA); } - if (!lov_entry(lio->lis_object, index)->lle_valid && - !io->ci_designated_mirror) - RETURN(io->ci_type == CIT_READ ? 
-EAGAIN : -EIO); - lse = lov_lse(lio->lis_object, index); next = MAX_LFS_FILESIZE; @@ -936,20 +520,37 @@ static int lov_io_rw_iter_init(const struct lu_env *env, next = MAX_LFS_FILESIZE; } - LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, - "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, + LASSERTF(range->cir_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", range->cir_pos, lse->lsme_extent.e_start, lse->lsme_extent.e_end); next = min_t(__u64, next, lse->lsme_extent.e_end); next = min_t(loff_t, next, lio->lis_io_endpos); - io->ci_continue = next < lio->lis_io_endpos; - io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; - lio->lis_pos = io->u.ci_rw.crw_pos; - lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + io->ci_continue = next < lio->lis_io_endpos; + range->cir_count = next - range->cir_pos; + lio->lis_pos = range->cir_pos; + lio->lis_endpos = range->cir_pos + range->cir_count; CDEBUG(D_VFSTRACE, - "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", - (__u64)start, lio->lis_pos, lio->lis_endpos, - (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); + "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n", + index, start, lio->lis_pos, lio->lis_endpos, + lio->lis_io_endpos, range->cir_count); + + if (!io->ci_continue) { + /* the last piece of IO, execute it in main thread */ + io->ci_pio = 0; + } + + if (io->ci_pio) { + /* it only splits IO here for parallel IO, + * there will be no actual IO going to occur, + * so it doesn't need to invoke lov_io_iter_init() + * to initialize sub IOs. */ + if (!lsm_entry_inited(lsm, index)) { + io->ci_need_write_intent = 1; + RETURN(-ENODATA); + } + RETURN(0); + } /* * XXX The following call should be optimized: we know, that @@ -963,14 +564,18 @@ static int lov_io_setattr_iter_init(const struct lu_env *env, { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *io = ios->cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; int index; ENTRY; if (cl_io_is_trunc(io) && lio->lis_pos > 0) { - index = lov_io_layout_at(lio, lio->lis_pos - 1); - /* no entry found for such offset */ - if (index < 0) + index = lov_lsm_entry(lsm, lio->lis_pos); + CDEBUG(D_VFSTRACE, "component[%d] flags %#x pos %llu\n", + index, lsm->lsm_entries[index]->lsme_flags, lio->lis_pos); + if (index > 0 && !lsm_entry_inited(lsm, index)) { + io->ci_need_write_intent = 1; RETURN(io->ci_result = -ENODATA); + } } RETURN(lov_io_iter_init(env, ios)); @@ -997,49 +602,49 @@ static int lov_io_call(const struct lu_env *env, struct lov_io *lio, static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); } static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); } static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) { - ENTRY; - /* - * It's possible that lov_io_start() wasn't called against this - * sub-io, either because previous sub-io failed, or upper layer - * completed IO. - */ - if (io->ci_state == CIS_IO_GOING) - cl_io_end(env, io); - else - io->ci_state = CIS_IO_FINISHED; - RETURN(0); + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. 
+ */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); } static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_iter_fini(env, io); - RETURN(0); + cl_io_iter_fini(env, io); + RETURN(0); } static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_unlock(env, io); - RETURN(0); + cl_io_unlock(env, io); + RETURN(0); } static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); - LASSERT(rc == 0); + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); } static void @@ -1047,18 +652,14 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *parent = lio->lis_cl.cis_io; - struct cl_data_version_io *pdv = &parent->u.ci_data_version; struct lov_io_sub *sub; ENTRY; list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; + lov_io_end_wrapper(env, &sub->sub_io); - lov_io_end_wrapper(sub->sub_env, &sub->sub_io); - - pdv->dv_data_version += sdv->dv_data_version; - if (pdv->dv_layout_version > sdv->dv_layout_version) - pdv->dv_layout_version = sdv->dv_layout_version; + parent->u.ci_data_version.dv_data_version += + sub->sub_io.u.ci_data_version.dv_data_version; if (parent->ci_result == 0) parent->ci_result = sub->sub_io.ci_result; @@ -1070,26 +671,26 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) static void lov_io_iter_fini(const struct lu_env *env, const struct cl_io_slice *ios) { - struct lov_io *lio = cl2lov_io(env, ios); - int rc; + struct lov_io *lio = cl2lov_io(env, ios); + int rc; - ENTRY; - rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); - LASSERT(rc == 0); + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); while (!list_empty(&lio->lis_active)) list_del_init(lio->lis_active.next); - EXIT; + EXIT; } static void lov_io_unlock(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - ENTRY; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); - LASSERT(rc == 0); - EXIT; + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; } static int lov_io_read_ahead(const struct lu_env *env, @@ -1111,18 +712,14 @@ static int lov_io_read_ahead(const struct lu_env *env, ENTRY; offset = cl_offset(obj, start); - index = lov_io_layout_at(lio, offset); + index = lov_lsm_entry(loo->lo_lsm, offset); if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) RETURN(-ENODATA); - /* avoid readahead to expand to stale components */ - if (!lov_entry(loo, index)->lle_valid) - RETURN(-EIO); - stripe = lov_stripe_number(loo->lo_lsm, index, offset); r0 = lov_r0(loo, index); - if (unlikely(!r0->lo_sub[stripe])) + if (unlikely(r0->lo_sub[stripe] == NULL)) RETURN(-EIO); sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); @@ -1153,7 +750,7 @@ static int lov_io_read_ahead(const struct lu_env *env, ra_end, stripe); /* boundary of current component */ - ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); + ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end); if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end) ra->cra_end = ra_end - 1; @@ -1197,37 +794,35 @@ static int lov_io_submit(const struct lu_env *env, struct 
lov_io_sub *sub; struct cl_page_list *plist = &lov_env_info(env)->lti_plist; struct cl_page *page; - struct cl_page *tmp; int index; int rc = 0; ENTRY; + if (lio->lis_nr_subios == 1) { + int idx = lio->lis_single_subio_index; + + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub == &lio->lis_single_subio); + rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, + crt, queue); + RETURN(rc); + } + cl_page_list_init(plist); while (qin->pl_nr > 0) { struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; - page = cl_page_list_first(qin); - if (lov_page_is_empty(page)) { - cl_page_list_move(&queue->c2_qout, qin, page); - - /* - * it could only be mirror read to get here therefore - * the pages will be transient. We don't care about - * the return code of cl_page_prep() at all. - */ - (void) cl_page_prep(env, ios->cis_io, page, crt); - cl_page_completion(env, page, crt, 0); - continue; - } - cl_2queue_init(cl2q); + + page = cl_page_list_first(qin); cl_page_list_move(&cl2q->c2_qin, qin, page); index = lov_page_index(page); - cl_page_list_for_each_safe(page, tmp, qin) { - /* this page is not on this stripe */ + while (qin->pl_nr > 0) { + page = cl_page_list_first(qin); if (index != lov_page_index(page)) - continue; + break; cl_page_list_move(&cl2q->c2_qin, qin, page); } @@ -1260,7 +855,7 @@ static int lov_io_commit_async(const struct lu_env *env, cl_commit_cbt cb) { struct cl_page_list *plist = &lov_env_info(env)->lti_plist; - struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_io_sub *sub; struct cl_page *page; int rc = 0; @@ -1269,8 +864,6 @@ static int lov_io_commit_async(const struct lu_env *env, if (lio->lis_nr_subios == 1) { int idx = lio->lis_single_subio_index; - LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); - sub = lov_sub_get(env, lio, idx); LASSERT(!IS_ERR(sub)); LASSERT(sub == &lio->lis_single_subio); @@ -1286,8 +879,6 @@ static int lov_io_commit_async(const struct lu_env *env, LASSERT(plist->pl_nr == 0); page = cl_page_list_first(queue); - LASSERT(!lov_page_is_empty(page)); - cl_page_list_move(plist, queue, page); index = lov_page_index(page); @@ -1366,25 +957,25 @@ static void lov_io_fsync_end(const struct lu_env *env, } static const struct cl_io_operations lov_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_WRITE] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, [CIT_SETATTR] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_setattr_iter_init, @@ -1395,23 +986,23 @@ static const struct cl_io_operations lov_io_ops = { .cio_end = lov_io_end }, [CIT_DATA_VERSION] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = 
lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_data_version_end, - }, - [CIT_FAULT] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_fault_start, - .cio_end = lov_io_end + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, [CIT_FSYNC] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_iter_init, @@ -1430,14 +1021,11 @@ static const struct cl_io_operations lov_io_ops = { .cio_start = lov_io_start, .cio_end = lov_io_end }, - [CIT_GLIMPSE] = { - .cio_fini = lov_io_fini, - }, [CIT_MISC] = { .cio_fini = lov_io_fini } }, - .cio_read_ahead = lov_io_read_ahead, + .cio_read_ahead = lov_io_read_ahead, .cio_submit = lov_io_submit, .cio_commit_async = lov_io_commit_async, }; @@ -1469,7 +1057,7 @@ static int lov_empty_io_submit(const struct lu_env *env, static void lov_empty_impossible(const struct lu_env *env, struct cl_io_slice *ios) { - LBUG(); + LBUG(); } #define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) @@ -1478,46 +1066,43 @@ static void lov_empty_impossible(const struct lu_env *env, * An io operation vector for files without stripes. */ static const struct cl_io_operations lov_empty_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_empty_io_fini, + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, #if 0 - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE #endif - }, - [CIT_WRITE] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_SETATTR] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_FAULT] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, [CIT_FSYNC] = { .cio_fini = lov_empty_io_fini }, [CIT_LADVISE] = { .cio_fini = lov_empty_io_fini }, - [CIT_GLIMPSE] = { - .cio_fini = 
lov_empty_io_fini - }, [CIT_MISC] = { .cio_fini = lov_empty_io_fini } @@ -1529,26 +1114,23 @@ static const struct cl_io_operations lov_empty_io_ops = { int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - struct lov_io *lio = lov_env_io(env); - struct lov_object *lov = cl2lov(obj); - int result; + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); ENTRY; - INIT_LIST_HEAD(&lio->lis_active); - result = lov_io_slice_init(lio, lov, io); - if (result) - GOTO(out, result); - - result = lov_io_subio_init(env, lio, io); - if (!result) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); - atomic_inc(&lov->lo_active_ios); + io->ci_result = lov_io_slice_init(lio, lov, io); + if (io->ci_result != 0) + RETURN(io->ci_result); + + if (io->ci_result == 0) { + io->ci_result = lov_io_subio_init(env, lio, io); + if (io->ci_result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } } - EXIT; -out: - io->ci_result = result < 0 ? result : 0; - return result; + RETURN(io->ci_result); } int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, @@ -1564,7 +1146,6 @@ int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, default: LBUG(); case CIT_MISC: - case CIT_GLIMPSE: case CIT_READ: result = 0; break; @@ -1608,7 +1189,6 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, LASSERTF(0, "invalid type %d\n", io->ci_type); result = -EOPNOTSUPP; break; - case CIT_GLIMPSE: case CIT_MISC: case CIT_FSYNC: case CIT_LADVISE: @@ -1616,8 +1196,7 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, result = 1; break; case CIT_SETATTR: - /* - * the truncate to 0 is managed by MDT: + /* the truncate to 0 is managed by MDT: * - in open, for open O_TRUNC * - in setattr, for truncate */ @@ -1644,45 +1223,4 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, io->ci_result = result < 0 ? result : 0; RETURN(result); } - -/** - * Return the index in composite:lo_entries by the file offset - */ -int lov_io_layout_at(struct lov_io *lio, __u64 offset) -{ - struct lov_object *lov = lio->lis_object; - struct lov_layout_composite *comp = &lov->u.composite; - int start_index = 0; - int end_index = comp->lo_entry_count - 1; - int i; - - LASSERT(lov->lo_type == LLT_COMP); - - /* This is actual file offset so nothing can cover eof. */ - if (offset == LUSTRE_EOF) - return -1; - - if (lov_is_flr(lov)) { - struct lov_mirror_entry *lre; - - LASSERT(lio->lis_mirror_index >= 0); - - lre = &comp->lo_mirrors[lio->lis_mirror_index]; - start_index = lre->lre_start; - end_index = lre->lre_end; - } - - for (i = start_index; i <= end_index; i++) { - struct lov_layout_entry *lle = lov_entry(lov, i); - - if ((offset >= lle->lle_extent->e_start && - offset < lle->lle_extent->e_end) || - (offset == OBD_OBJECT_EOF && - lle->lle_extent->e_end == OBD_OBJECT_EOF)) - return i; - } - - return -1; -} - /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c index 1b4a95876cc75..efa4cc11ea94e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,22 +52,22 @@ static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, const struct cl_lock *parent, struct lov_lock_sub *lls) { - struct lov_sublock_env *subenv; - struct lov_io *lio = lov_env_io(env); - struct cl_io *io = lio->lis_cl.cis_io; - struct lov_io_sub *sub; - - subenv = &lov_env_session(env)->ls_subenv; - - /* - * FIXME: We tend to use the subio's env & io to call the sublock - * lock operations because osc lock sometimes stores some control - * variables in thread's IO infomation(Now only lockless information). - * However, if the lock's host(object) is different from the object - * for current IO, we have no way to get the subenv and subio because - * they are not initialized at all. As a temp fix, in this case, - * we still borrow the parent's env to call sublock operations. - */ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO infomation(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { subenv->lse_env = env; subenv->lse_io = io; @@ -89,7 +89,6 @@ static int lov_sublock_init(const struct lu_env *env, { struct lov_sublock_env *subenv; int result; - ENTRY; subenv = lov_sublock_env_get(env, parent, lls); @@ -112,7 +111,6 @@ static int lov_sublock_init(const struct lu_env *env, * through already created sub-locks (possibly shared with other top-locks). */ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, - const struct cl_io *io, const struct cl_object *obj, struct cl_lock *lock) { @@ -135,18 +133,20 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1); nr = 0; - lov_foreach_io_layout(index, lov_env_io(env), &ext) { + for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); + index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { struct lov_layout_raid0 *r0 = lov_r0(lov, index); + /* assume lsm entries are sorted. 
*/ + if (!lu_extent_is_overlapped(&ext, + &lov_lse(lov, index)->lsme_extent)) + break; + for (i = 0; i < r0->lo_nr; i++) { - if (likely(r0->lo_sub[i])) {/* spare layout */ - if (lov_stripe_intersects(lov->lo_lsm, index, i, - &ext, &start, &end)) - nr++; - else if (cl_io_is_trunc(io) && - r0->lo_trunc_stripeno == i) - nr++; - } + if (likely(r0->lo_sub[i] != NULL) && /* spare layout */ + lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) + nr++; } } /** @@ -156,33 +156,28 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, */ OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); - if (!lovlck) + if (lovlck == NULL) RETURN(ERR_PTR(-ENOMEM)); lovlck->lls_nr = nr; nr = 0; - lov_foreach_io_layout(index, lov_env_io(env), &ext) { + for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); + index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { struct lov_layout_raid0 *r0 = lov_r0(lov, index); + /* assume lsm entries are sorted. */ + if (!lu_extent_is_overlapped(&ext, + &lov_lse(lov, index)->lsme_extent)) + break; for (i = 0; i < r0->lo_nr; ++i) { struct lov_lock_sub *lls = &lovlck->lls_sub[nr]; struct cl_lock_descr *descr = &lls->sub_lock.cll_descr; - bool intersect = false; - if (unlikely(!r0->lo_sub[i])) + if (unlikely(r0->lo_sub[i] == NULL) || + !lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) continue; - intersect = lov_stripe_intersects(lov->lo_lsm, index, i, - &ext, &start, &end); - if (intersect) - goto init_sublock; - - if (cl_io_is_trunc(io) && i == r0->lo_trunc_stripeno) - goto init_sublock; - - continue; - -init_sublock: LASSERT(descr->cld_obj == NULL); descr->cld_obj = lovsub2cl(r0->lo_sub[i]); descr->cld_start = cl_index(descr->cld_obj, start); @@ -249,10 +244,10 @@ static int lov_lock_enqueue(const struct lu_env *env, const struct cl_lock_slice *slice, struct cl_io *io, struct cl_sync_io *anchor) { - struct cl_lock *lock = slice->cls_lock; - struct lov_lock *lovlck = cl2lov_lock(slice); - int i; - int rc = 0; + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; ENTRY; @@ -279,16 +274,16 @@ static int lov_lock_enqueue(const struct lu_env *env, static void lov_lock_cancel(const struct lu_env *env, const struct cl_lock_slice *slice) { - struct cl_lock *lock = slice->cls_lock; + struct cl_lock *lock = slice->cls_lock; struct lov_lock *lovlck = cl2lov_lock(slice); int i; ENTRY; for (i = 0; i < lovlck->lls_nr; ++i) { - struct lov_lock_sub *lls = &lovlck->lls_sub[i]; - struct cl_lock *sublock = &lls->sub_lock; - struct lov_sublock_env *subenv; + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; if (!lls->sub_is_enqueued) continue; @@ -306,27 +301,27 @@ static void lov_lock_cancel(const struct lu_env *env, } static int lov_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) + lu_printer_t p, const struct cl_lock_slice *slice) { - struct lov_lock *lck = cl2lov_lock(slice); - int i; + struct lov_lock *lck = cl2lov_lock(slice); + int i; - (*p)(env, cookie, "%d\n", lck->lls_nr); - for (i = 0; i < lck->lls_nr; ++i) { - struct lov_lock_sub *sub; + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; - sub = &lck->lls_sub[i]; + sub = &lck->lls_sub[i]; (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); cl_lock_print(env, cookie, p, &sub->sub_lock); - } - return 0; + } + return 0; } 
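The hunk above rebuilds lov_lock_sub_init() around a two-pass walk: one loop counts the stripes whose extents intersect the lock, then lls_sub[] is sized once via OBD_ALLOC_LARGE(offsetof(struct lov_lock, lls_sub[nr])) and a second loop fills it with the same predicate. Below is a minimal, self-contained sketch of that count-then-allocate-then-fill pattern for illustration only; every name in it (toy_extent, toy_lock, overlaps, toy_lock_init) is hypothetical and not Lustre API.

/*
 * Sketch of the two-pass pattern: count matching entries first,
 * allocate the flexible array exactly once, then fill it using the
 * same predicate as the counting pass.
 */
#include <stdlib.h>

struct toy_extent { unsigned long start, end; };

struct toy_lock {
	int		  nr;		/* number of overlapping stripes */
	struct toy_extent sub[];	/* one entry per overlapping stripe */
};

static int overlaps(const struct toy_extent *a, const struct toy_extent *b)
{
	return a->start < b->end && b->start < a->end;
}

static struct toy_lock *toy_lock_init(const struct toy_extent *stripes,
				      int nr_stripes,
				      const struct toy_extent *ext)
{
	struct toy_lock *lck;
	int nr = 0, i;

	/* pass 1: count, so the array is sized exactly once */
	for (i = 0; i < nr_stripes; i++)
		if (overlaps(&stripes[i], ext))
			nr++;

	lck = malloc(sizeof(*lck) + nr * sizeof(lck->sub[0]));
	if (lck == NULL)
		return NULL;
	lck->nr = 0;

	/* pass 2: fill, reusing the predicate from pass 1 */
	for (i = 0; i < nr_stripes; i++)
		if (overlaps(&stripes[i], ext))
			lck->sub[lck->nr++] = stripes[i];

	return lck;
}

Counting before allocating keeps the sub-lock array exact and avoids growing it while entries are being attached, which appears to be the motivation for the single OBD_ALLOC_LARGE call in the hunk above.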
static const struct cl_lock_operations lov_lock_ops = { - .clo_fini = lov_lock_fini, - .clo_enqueue = lov_lock_enqueue, - .clo_cancel = lov_lock_cancel, - .clo_print = lov_lock_print + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print }; int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, @@ -336,7 +331,7 @@ int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, int result = 0; ENTRY; - lck = lov_lock_sub_init(env, io, obj, lock); + lck = lov_lock_sub_init(env, obj, lock); if (!IS_ERR(lck)) cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); else @@ -348,7 +343,6 @@ static void lov_empty_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) { struct lov_lock *lck = cl2lov_lock(slice); - OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); } @@ -373,7 +367,7 @@ int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, ENTRY; OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); - if (lck) { + if (lck != NULL) { cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); result = 0; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c index 8a6ced24ff522..de9e4298dd884 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c index b9c42313fe3ae..8cdd60fc90171 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,14 +40,16 @@ #define DEBUG_SUBSYSTEM S_LOV #include +#include + #include #include #include -#include +#include #include #include #include -#include +#include #include #include #include @@ -57,7 +59,7 @@ /* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. Any function that expects lov_tgts to remain stationary must take a ref. 
*/ -void lov_tgts_getref(struct obd_device *obd) +static void lov_getref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; @@ -70,7 +72,7 @@ void lov_tgts_getref(struct obd_device *obd) static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); -void lov_tgts_putref(struct obd_device *obd) +static void lov_putref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; @@ -100,21 +102,21 @@ void lov_tgts_putref(struct obd_device *obd) list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { list_del(&tgt->ltd_kill); - /* Disconnect */ - __lov_del_obd(obd, tgt); - } - } else { + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { mutex_unlock(&lov->lov_lock); - } + } } static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, - enum obd_notify_event ev); + enum obd_notify_event ev); static int lov_notify(struct obd_device *obd, struct obd_device *watched, enum obd_notify_event ev); -int lov_connect_obd(struct obd_device *obd, u32 index, int activate, - struct obd_connect_data *data) +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data) { struct lov_obd *lov = &obd->u.lov; struct obd_uuid *tgt_uuid; @@ -146,12 +148,12 @@ int lov_connect_obd(struct obd_device *obd, u32 index, int activate, */ imp = tgt_obd->u.cli.cl_import; - if (activate) { - tgt_obd->obd_no_recov = 0; - /* FIXME this is probably supposed to be - ptlrpc_set_import_active. Horrible naming. */ - ptlrpc_activate_import(imp, false); - } + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. */ + ptlrpc_activate_import(imp); + } rc = obd_register_observer(tgt_obd, obd); if (rc) { @@ -180,17 +182,26 @@ int lov_connect_obd(struct obd_device *obd, u32 index, int activate, CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? 
"":"in"); - if (lov->lov_tgts_kobj) { - /* Even if we failed, that's ok */ - rc = sysfs_create_link(lov->lov_tgts_kobj, - &tgt_obd->obd_kset.kobj, - tgt_obd->obd_name); - if (rc) { - CERROR("%s: can't register LOV target /sys/fs/lustre/%s/%s/target_obds/%s : rc = %d\n", - obd->obd_name, obd->obd_type->typ_name, - obd->obd_name, - lov->lov_tgts[index]->ltd_exp->exp_obd->obd_name, - rc); + if (lov->targets_proc_entry != NULL) { + struct proc_dir_entry *osc_symlink; + struct obd_device *osc_obd; + + osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; + + LASSERT(osc_obd != NULL); + LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(osc_obd->obd_type->typ_name != NULL); + + osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, + lov->targets_proc_entry, + "../../../%s/%s", + osc_obd->obd_type->typ_name, + osc_obd->obd_name); + if (osc_symlink == NULL) { + CERROR("cannot register LOV target " + "/proc/fs/lustre/%s/%s/target_obds/%s\n", + obd->obd_type->typ_name, obd->obd_name, + osc_obd->obd_name); } } RETURN(0); @@ -223,8 +234,17 @@ static int lov_connect(const struct lu_env *env, if (data) lov->lov_ocd = *data; - lov_tgts_getref(obd); + lov->targets_proc_entry = lprocfs_register("target_obds", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->targets_proc_entry)) { + CERROR("%s: cannot register " + "/proc/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); + lov->targets_proc_entry = NULL; + } + obd_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { tgt = lov->lov_tgts[i]; if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) @@ -247,10 +267,9 @@ static int lov_connect(const struct lu_env *env, obd->obd_name, rc); } } + obd_putref(obd); - lov_tgts_putref(obd); - - RETURN(0); + RETURN(0); } static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) @@ -271,10 +290,6 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) } if (osc_obd) { - if (lov->lov_tgts_kobj) - sysfs_remove_link(lov->lov_tgts_kobj, - osc_obd->obd_name); - /* Pass it on to our clients. * XXX This should be an argument to disconnect, * XXX not a back-door flag on the OBD. Ah well. @@ -303,39 +318,40 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) static int lov_disconnect(struct obd_export *exp) { - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; - u32 index; - int rc; + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i, rc; + ENTRY; - ENTRY; - if (!lov->lov_tgts) - goto out; - - /* Only disconnect the underlying layers on the final disconnect. */ - lov->lov_connects--; - if (lov->lov_connects != 0) { - /* why should there be more than 1 connect? */ - CWARN("%s: unexpected disconnect #%d\n", - obd->obd_name, lov->lov_connects); - goto out; - } + if (!lov->lov_tgts) + goto out; - /* hold another ref so lov_del_obd() doesn't spin in putref each time */ - lov_tgts_getref(obd); + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? 
*/ + CERROR("disconnect #%d\n", lov->lov_connects); + goto out; + } - for (index = 0; index < lov->desc.ld_tgt_count; index++) { - if (lov->lov_tgts[index] && lov->lov_tgts[index]->ltd_exp) { - /* Disconnection is the last we know about an OBD */ - lov_del_target(obd, index, NULL, - lov->lov_tgts[index]->ltd_gen); - } - } - lov_tgts_putref(obd); + /* Let's hold another reference so lov_del_obd doesn't spin through + putref every time */ + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { + /* Disconnection is the last we know about an obd */ + lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); + } + } + obd_putref(obd); + + if (lov->targets_proc_entry != NULL) + lprocfs_remove(&lov->targets_proc_entry); out: - rc = class_disconnect(exp); /* bz 9811 */ - RETURN(rc); + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); } /* Error codes: @@ -356,7 +372,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", lov, uuid->uuid, ev); - lov_tgts_getref(obd); + obd_getref(obd); for (index = 0; index < lov->desc.ld_tgt_count; index++) { tgt = lov->lov_tgts[index]; if (!tgt) @@ -431,7 +447,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, index, tgt->ltd_exp->exp_handle.h_cookie); out: - lov_tgts_putref(obd); + obd_putref(obd); RETURN(index); } @@ -481,37 +497,37 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, } static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, - u32 index, int gen, int active) + __u32 index, int gen, int active) { - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - struct obd_device *tgt_obd; - int rc; + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + ENTRY; - ENTRY; - CDEBUG(D_CONFIG, "uuid:%s idx:%u gen:%d active:%d\n", - uuidp->uuid, index, gen, active); + CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", + uuidp->uuid, index, gen, active); - if (gen <= 0) { - CERROR("%s: request to add '%s' with invalid generation: %d\n", - obd->obd_name, uuidp->uuid, gen); - RETURN(-EINVAL); - } + if (gen <= 0) { + CERROR("request to add OBD %s with invalid generation: %d\n", + uuidp->uuid, gen); + RETURN(-EINVAL); + } - tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, &obd->obd_uuid); - if (tgt_obd == NULL) - RETURN(-EINVAL); + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, + &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); mutex_lock(&lov->lov_lock); - if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { - tgt = lov->lov_tgts[index]; - rc = -EEXIST; - CERROR("%s: UUID %s already assigned at index %d: rc = %d\n", - obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, rc); + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + CERROR("UUID %s already assigned at LOV target index %d\n", + obd_uuid2str(&tgt->ltd_uuid), index); mutex_unlock(&lov->lov_lock); - RETURN(rc); - } + RETURN(-EEXIST); + } if (index >= lov->lov_tgt_size) { /* We need to reallocate the lov target array. 
*/ @@ -579,7 +595,7 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, RETURN(0); } - lov_tgts_getref(obd); + obd_getref(obd); rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); if (rc) @@ -602,17 +618,17 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); out: - if (rc) { - CERROR("%s: add failed, deleting %s: rc = %d\n", - obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), rc); + if (rc) { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); lov_del_target(obd, index, NULL, 0); - } - lov_tgts_putref(obd); - RETURN(rc); + } + obd_putref(obd); + RETURN(rc); } /* Schedule a target for deletion */ -int lov_del_target(struct obd_device *obd, u32 index, +int lov_del_target(struct obd_device *obd, __u32 index, struct obd_uuid *uuidp, int gen) { struct lov_obd *lov = &obd->u.lov; @@ -628,7 +644,7 @@ int lov_del_target(struct obd_device *obd, u32 index, /* to make sure there's no ongoing lov_notify() now */ down_write(&lov->lov_notify_lock); - lov_tgts_getref(obd); + obd_getref(obd); if (!lov->lov_tgts[index]) { CERROR("LOV target at index %d is not setup.\n", index); @@ -649,12 +665,12 @@ int lov_del_target(struct obd_device *obd, u32 index, lov->lov_tgts[index]->ltd_reap = 1; lov->lov_death_row++; - /* we really delete it from lov_tgts_putref() */ + /* we really delete it from obd_putref */ out: - lov_tgts_putref(obd); + obd_putref(obd); up_write(&lov->lov_notify_lock); - RETURN(rc); + RETURN(rc); } static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) @@ -731,6 +747,9 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { struct lov_desc *desc; struct lov_obd *lov = &obd->u.lov; +#ifdef CONFIG_PROC_FS + struct obd_type *type; +#endif int rc; ENTRY; @@ -784,12 +803,45 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(out, rc); - rc = lov_tunables_init(obd); - if (rc) - GOTO(out, rc); +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_lov_obd_vars; + /* If this is true then both client (lov) and server + * (lod) are on the same node. The lod layer if loaded + * first will register the lov proc directory. In that + * case obd->obd_type->typ_procroot will be not set. + * Instead we use type->typ_procsym as the parent. 
*/ + type = class_search_type(LUSTRE_LOD_NAME); + if (type != NULL && type->typ_procsym != NULL) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } else { + rc = lprocfs_obd_setup(obd); + } - lov->lov_tgts_kobj = kobject_create_and_add("target_obds", - &obd->obd_kset.kobj); + if (rc == 0) { + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lov_proc_target_fops, obd); + if (rc) + CWARN("Error adding the target_obd file\n"); + + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->lov_pool_proc_entry)) { + rc = PTR_ERR(lov->lov_pool_proc_entry); + CERROR("error %d setting up lprocfs for pools\n", rc); + lov->lov_pool_proc_entry = NULL; + } + } +#endif + RETURN(0); out: return rc; @@ -802,11 +854,6 @@ static int lov_cleanup(struct obd_device *obd) struct pool_desc *pool; ENTRY; - if (lov->lov_tgts_kobj) { - kobject_put(lov->lov_tgts_kobj); - lov->lov_tgts_kobj = NULL; - } - list_for_each_safe(pos, tmp, &lov->lov_pool_list) { pool = list_entry(pos, struct pool_desc, pool_list); /* free pool structs */ @@ -822,13 +869,14 @@ static int lov_cleanup(struct obd_device *obd) lprocfs_obd_cleanup(obd); if (lov->lov_tgts) { int i; - lov_tgts_getref(obd); + obd_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { if (!lov->lov_tgts[i]) continue; /* Inactive targets may never have connected */ - if (lov->lov_tgts[i]->ltd_active) + if (lov->lov_tgts[i]->ltd_active || + atomic_read(&lov->lov_refcount)) /* We should never get here - these * should have been removed in the * disconnect. 
*/ @@ -838,7 +886,7 @@ static int lov_cleanup(struct obd_device *obd) atomic_read(&lov->lov_refcount)); lov_del_target(obd, i, NULL, 0); } - lov_tgts_putref(obd); + obd_putref(obd); OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * lov->lov_tgt_size); lov->lov_tgt_size = 0; @@ -853,56 +901,50 @@ static int lov_cleanup(struct obd_device *obd) } int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - u32 *indexp, int *genp) + __u32 *indexp, int *genp) { - struct obd_uuid obd_uuid; - int cmd; - int rc = 0; - - ENTRY; - switch (cmd = lcfg->lcfg_command) { - case LCFG_ADD_MDC: - case LCFG_DEL_MDC: - break; - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - case LCFG_LOV_DEL_OBD: { - u32 index; - int gen; - - /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ - if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) - GOTO(out, rc = -EINVAL); - - obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); - - rc = kstrtou32(lustre_cfg_buf(lcfg, 2), 10, indexp); - if (rc) - GOTO(out, rc); - rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); - if (rc) - GOTO(out, rc); - index = *indexp; - gen = *genp; - if (cmd == LCFG_LOV_ADD_OBD) - rc = lov_add_target(obd, &obd_uuid, index, gen, 1); - else if (cmd == LCFG_LOV_ADD_INA) - rc = lov_add_target(obd, &obd_uuid, index, gen, 0); - else - rc = lov_del_target(obd, index, &obd_uuid, gen); + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + ENTRY; - GOTO(out, rc); - } - case LCFG_PARAM: { + switch(cmd = lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + __u32 index; + int gen; + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", indexp) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) + GOTO(out, rc = -EINVAL); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + GOTO(out, rc); + } + case LCFG_PARAM: { struct lov_desc *desc = &(obd->u.lov.desc); - ssize_t count; if (!desc) GOTO(out, rc = -EINVAL); - count = class_modify_config(lcfg, PARAM_LOV, - &obd->obd_kset.kobj); - GOTO(out, rc = count < 0 ? count : 0); + rc = class_process_proc_param(PARAM_LOV, obd->obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + GOTO(out, rc); } case LCFG_POOL_NEW: case LCFG_POOL_ADD: @@ -920,50 +962,84 @@ int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, RETURN(rc); } +static int +lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + atomic_set(&lovset->set_completes, 0); + + err = lov_fini_statfs_set(lovset); + RETURN(rc ? 
rc : err); +} + +static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + ENTRY; + + LASSERT(oinfo != NULL); + LASSERT(oinfo->oi_osfs != NULL); + + lov = &obd->u.lov; + rc = lov_prep_statfs_set(obd, oinfo, &set); + if (rc) + RETURN(rc); + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc) + break; + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_statfs_set(set); + RETURN(rc ? rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_statfs_interpret; + rqset->set_arg = (void *)set; + RETURN(0); +} + static int lov_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, time64_t max_age, __u32 flags) + struct obd_statfs *osfs, __u64 max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; + struct ptlrpc_request_set *set = NULL; struct obd_info oinfo = { .oi_osfs = osfs, .oi_flags = flags, }; - struct ptlrpc_request_set *rqset; - struct lov_request_set *set = NULL; - struct lov_request *req; int rc = 0; - int rc2; ENTRY; - rqset = ptlrpc_prep_set(); - if (rqset == NULL) + /* for obdclass we forbid using obd_statfs_rqset, but prefer using async + * statfs requests */ + set = ptlrpc_prep_set(); + if (set == NULL) RETURN(-ENOMEM); - rc = lov_prep_statfs_set(obd, &oinfo, &set); - if (rc < 0) - GOTO(out_rqset, rc); - - list_for_each_entry(req, &set->set_list, rq_link) { - rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, - &req->rq_oi, max_age, rqset); - if (rc < 0) - GOTO(out_set, rc); - } - - rc = ptlrpc_set_wait(env, rqset); - -out_set: - if (rc < 0) - atomic_set(&set->set_completes, 0); - - rc2 = lov_fini_statfs_set(set); + rc = lov_statfs_async(exp, &oinfo, max_age, set); if (rc == 0) - rc = rc2; + rc = ptlrpc_set_wait(set); -out_rqset: - ptlrpc_set_destroy(rqset); + ptlrpc_set_destroy(set); RETURN(rc); } @@ -971,39 +1047,35 @@ static int lov_statfs(const struct lu_env *env, struct obd_export *exp, static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void __user *uarg) { - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; - int i = 0, rc = 0, count = lov->desc.ld_tgt_count; - struct obd_uuid *uuidp; + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; + ENTRY; - ENTRY; - switch (cmd) { - case IOC_OBD_STATFS: { - struct obd_ioctl_data *data = karg; - struct obd_device *osc_obd; - struct obd_statfs stat_buf = {0}; - struct obd_import *imp; - __u32 index; + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; __u32 flags; - memcpy(&index, data->ioc_inlbuf2, sizeof(index)); - if (index >= count) - RETURN(-ENODEV); - - if (!lov->lov_tgts[index]) - /* Try again with the next index */ - RETURN(-EAGAIN); + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if ((index >= count)) + RETURN(-ENODEV); - osc_obd = 
class_exp2obd(lov->lov_tgts[index]->ltd_exp); - if (!osc_obd) - RETURN(-EINVAL); + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); + if (!lov->lov_tgts[index]->ltd_active) + RETURN(-ENODATA); - imp = osc_obd->u.cli.cl_import; - if (!lov->lov_tgts[index]->ltd_active && - imp->imp_state != LUSTRE_IMP_IDLE) - RETURN(-ENODATA); + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); - /* copy UUID */ + /* copy UUID */ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), min_t(unsigned long, data->ioc_plen2, sizeof(struct obd_uuid)))) @@ -1012,12 +1084,12 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); flags = flags & LL_STATFS_NODELAY ? OBD_STATFS_NODELAY : 0; - /* got statfs data */ - rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - flags); - if (rc) - RETURN(rc); + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + flags); + if (rc) + RETURN(rc); if (copy_to_user(data->ioc_pbuf1, &stat_buf, min_t(unsigned long, data->ioc_plen1, sizeof(struct obd_statfs)))) @@ -1130,11 +1202,12 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) continue; - /* ll_umount_begin() sets force on lov, pass to osc */ - osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); - osc_obd->obd_force = obd->obd_force; - err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, - len, karg, uarg); + /* ll_umount_begin() sets force flag but for lov, not + * osc. Let's pass it through */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); if (err) { if (lov->lov_tgts[i]->ltd_active) { CDEBUG(err == -ENOTTY ? 
@@ -1170,7 +1243,7 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, if (vallen == NULL || val == NULL) RETURN(-EFAULT); - lov_tgts_getref(obddev); + obd_getref(obddev); if (KEY_IS(KEY_MAX_EASIZE)) { u32 max_stripe_count = min_t(u32, ld->ld_active_tgt_count, @@ -1188,7 +1261,7 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, rc = -EINVAL; } - lov_tgts_putref(obddev); + obd_putref(obddev); RETURN(rc); } @@ -1201,71 +1274,58 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; struct lov_tgt_desc *tgt; - bool do_inactive = false, no_set = false; + int do_inactive = 0; + int no_set = 0; + u32 count; u32 i; int rc = 0; int err; + ENTRY; - ENTRY; - - if (set == NULL) { - no_set = true; - set = ptlrpc_prep_set(); - if (!set) - RETURN(-ENOMEM); - } + if (set == NULL) { + no_set = 1; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } - lov_tgts_getref(obddev); + obd_getref(obddev); + count = lov->desc.ld_tgt_count; if (KEY_IS(KEY_CHECKSUM)) { - do_inactive = true; + do_inactive = 1; } else if (KEY_IS(KEY_CACHE_SET)) { LASSERT(lov->lov_cache == NULL); lov->lov_cache = val; - do_inactive = true; + do_inactive = 1; cl_cache_incref(lov->lov_cache); } - for (i = 0; i < lov->desc.ld_tgt_count; i++) { + for (i = 0; i < count; i++) { tgt = lov->lov_tgts[i]; - /* OST was disconnected */ - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; + /* OST was disconnected */ + if (!tgt || !tgt->ltd_exp) + continue; - /* OST is inactive and we don't want inactive OSCs */ - if (!tgt->ltd_active && !do_inactive) - continue; + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, vallen, val, set); + if (!rc) + rc = err; + } - if (rc == 0) - rc = err; - } - - /* cycle through MDC target for Data-on-MDT */ - for (i = 0; i < LOV_MDC_TGT_MAX; i++) { - struct obd_device *mdc; - - mdc = lov->lov_mdc_tgts[i].lmtd_mdc; - if (mdc == NULL) - continue; - - err = obd_set_info_async(env, mdc->obd_self_export, - keylen, key, vallen, val, set); - if (rc == 0) - rc = err; - } - - lov_tgts_putref(obddev); - if (no_set) { - err = ptlrpc_set_wait(env, set); - if (rc == 0) - rc = err; - ptlrpc_set_destroy(set); - } - RETURN(rc); + obd_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(set); + if (!rc) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); } void lov_stripe_lock(struct lov_stripe_md *md) @@ -1303,7 +1363,7 @@ static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, } /* for lov tgt */ - lov_tgts_getref(obd); + obd_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; @@ -1335,7 +1395,7 @@ static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; } } - lov_tgts_putref(obd); + obd_putref(obd); if (oqctl->qc_cmd == Q_GETOQUOTA) { oqctl->qc_dqblk.dqb_curspace = curspace; @@ -1351,6 +1411,7 @@ static struct obd_ops lov_obd_ops = { .o_connect = lov_connect, .o_disconnect = lov_disconnect, .o_statfs = lov_statfs, + .o_statfs_async = lov_statfs_async, .o_iocontrol = lov_iocontrol, .o_get_info = lov_get_info, .o_set_info_async = lov_set_info_async, @@ -1359,6 +1420,8 @@ static struct obd_ops lov_obd_ops = { .o_pool_rem = lov_pool_remove, .o_pool_add = lov_pool_add, .o_pool_del = lov_pool_del, + .o_getref = lov_getref, + .o_putref = lov_putref, 
.o_quotactl = lov_quotactl, }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c index f9f3522806a47..c1cf76367697e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,8 +37,6 @@ #define DEBUG_SUBSYSTEM S_LOV -#include - #include "lov_cl_internal.h" static inline struct lov_device *lov_object_dev(struct lov_object *obj) @@ -76,8 +74,6 @@ struct lov_layout_operations { struct cl_object *obj, struct cl_io *io); int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr); - int (*llo_flush)(const struct lu_env *env, struct cl_object *obj, - struct ldlm_lock *lock); }; static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); @@ -93,40 +89,30 @@ static void lov_lsm_put(struct lov_stripe_md *lsm) * Lov object layout operations. * */ - -static struct cl_object *lov_sub_find(const struct lu_env *env, - struct cl_device *dev, - const struct lu_fid *fid, - const struct cl_object_conf *conf) +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) { - struct lu_object *o; - - ENTRY; - - o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); - LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); - RETURN(lu2cl(o)); + return 0; } -static int lov_page_slice_fixup(struct lov_object *lov, - struct cl_object *stripe) +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) { - struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); - struct cl_object *o; + struct lu_object *o; - if (stripe == NULL) - return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - - cfs_size_round(sizeof(struct lov_page)); - - cl_object_for_each(o, stripe) - o->co_slice_off += hdr->coh_page_bufsize; - - return cl_object_header(stripe)->coh_page_bufsize; + ENTRY; + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); } static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, - struct cl_object *subobj, struct lov_oinfo *oinfo, - int idx) + struct cl_object *subobj, struct lov_layout_raid0 *r0, + struct lov_oinfo *oinfo, int idx) { struct cl_object_header *hdr; struct cl_object_header *subhdr; @@ -146,7 +132,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, return -EIO; } - hdr = cl_object_header(lov2cl(lov)); + hdr = cl_object_header(lov2cl(lov)); subhdr = cl_object_header(subobj); CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID @@ -159,14 +145,13 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, spin_lock(&subhdr->coh_attr_guard); parent = subhdr->coh_parent; if (parent == NULL) { - struct lovsub_object *lso = cl2lovsub(subobj); - subhdr->coh_parent = hdr; spin_unlock(&subhdr->coh_attr_guard); subhdr->coh_nesting = hdr->coh_nesting + 1; lu_object_ref_add(&subobj->co_lu, "lov-parent", lov); - 
lso->lso_super = lov; - lso->lso_index = idx; + r0->lo_sub[stripe] = cl2lovsub(subobj); + r0->lo_sub[stripe]->lso_super = lov; + r0->lo_sub[stripe]->lso_index = idx; result = 0; } else { struct lu_object *old_obj; @@ -196,28 +181,42 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, return result; } +static int lov_page_slice_fixup(struct lov_object *lov, + struct cl_object *stripe) +{ + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + struct cl_object *o; + + if (stripe == NULL) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + + cl_object_for_each(o, stripe) + o->co_slice_off += hdr->coh_page_bufsize; + + return cl_object_header(stripe)->coh_page_bufsize; +} + static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, unsigned int index, - const struct cl_object_conf *conf, - struct lov_layout_entry *lle) + struct lov_object *lov, int index, + struct lov_layout_raid0 *r0) { - struct lov_layout_raid0 *r0 = &lle->lle_raid0; - struct lov_thread_info *lti = lov_env_info(env); - struct cl_object_conf *subconf = <i->lti_stripe_conf; - struct lu_fid *ofid = <i->lti_fid; - struct cl_object *stripe; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lu_fid *ofid = <i->lti_fid; + struct cl_object *stripe; struct lov_stripe_md_entry *lse = lov_lse(lov, index); int result; - int psz, sz; + int psz; int i; ENTRY; spin_lock_init(&r0->lo_sub_lock); r0->lo_nr = lse->lsme_stripe_count; - r0->lo_trunc_stripeno = -1; + LASSERT(r0->lo_nr <= lov_targets_nr(dev)); - OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); if (r0->lo_sub == NULL) GOTO(out, result = -ENOMEM); @@ -256,7 +255,7 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, if (IS_ERR(stripe)) GOTO(out, result = PTR_ERR(stripe)); - result = lov_init_sub(env, lov, stripe, oinfo, + result = lov_init_sub(env, lov, stripe, r0, oinfo, lov_comp_index(index, i)); if (result == -EAGAIN) { /* try again */ --i; @@ -265,9 +264,7 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, } if (result == 0) { - r0->lo_sub[i] = cl2lovsub(stripe); - - sz = lov_page_slice_fixup(lov, stripe); + int sz = lov_page_slice_fixup(lov, stripe); LASSERT(ergo(psz > 0, psz == sz)); psz = sz; } @@ -278,369 +275,16 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, RETURN(result); } -static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, - struct lov_layout_raid0 *r0, - struct lovsub_object *los, int idx) -{ - struct cl_object *sub; - struct lu_site *site; - wait_queue_head_t *wq; - wait_queue_entry_t *waiter; - - LASSERT(r0->lo_sub[idx] == los); - - sub = lovsub2cl(los); - site = sub->co_lu.lo_dev->ld_site; - wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); - - cl_object_kill(env, sub); - /* release a reference to the sub-object and ... */ - lu_object_ref_del(&sub->co_lu, "lov-parent", lov); - cl_object_put(env, sub); - - /* ... 
wait until it is actually destroyed---sub-object clears its - * ->lo_sub[] slot in lovsub_object_free() */ - if (r0->lo_sub[idx] == los) { - waiter = &lov_env_info(env)->lti_waiter; - init_waitqueue_entry(waiter, current); - add_wait_queue(wq, waiter); - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) { - /* this wait-queue is signaled at the end of - * lu_object_free(). */ - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock(&r0->lo_sub_lock); - if (r0->lo_sub[idx] == los) { - spin_unlock(&r0->lo_sub_lock); - schedule(); - } else { - spin_unlock(&r0->lo_sub_lock); - set_current_state(TASK_RUNNING); - break; - } - } - remove_wait_queue(wq, waiter); - } - LASSERT(r0->lo_sub[idx] == NULL); -} - -static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, - struct lov_layout_entry *lle) -{ - struct lov_layout_raid0 *r0 = &lle->lle_raid0; - - ENTRY; - - if (r0->lo_sub != NULL) { - int i; - - for (i = 0; i < r0->lo_nr; ++i) { - struct lovsub_object *los = r0->lo_sub[i]; - - if (los != NULL) { - cl_object_prune(env, &los->lso_cl); - /* - * If top-level object is to be evicted from - * the cache, so are its sub-objects. - */ - lov_subobject_kill(env, lov, r0, los, i); - } - } - } - - EXIT; -} - -static void lov_fini_raid0(const struct lu_env *env, - struct lov_layout_entry *lle) -{ - struct lov_layout_raid0 *r0 = &lle->lle_raid0; - - if (r0->lo_sub != NULL) { - OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); - r0->lo_sub = NULL; - } -} - -static int lov_print_raid0(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lov_layout_entry *lle) -{ - const struct lov_layout_raid0 *r0 = &lle->lle_raid0; - int i; - - for (i = 0; i < r0->lo_nr; ++i) { - struct lu_object *sub; - - if (r0->lo_sub[i] != NULL) { - sub = lovsub2lu(r0->lo_sub[i]); - lu_object_print(env, cookie, p, sub); - } else { - (*p)(env, cookie, "sub %d absent\n", i); - } - } - return 0; -} - -static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, - unsigned int index, struct lov_layout_entry *lle, - struct cl_attr **lov_attr) -{ - struct lov_layout_raid0 *r0 = &lle->lle_raid0; - struct lov_stripe_md *lsm = lov->lo_lsm; - struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; - struct cl_attr *attr = &r0->lo_attr; - __u64 kms = 0; - int result = 0; - - if (r0->lo_attr_valid) { - *lov_attr = attr; - return 0; - } - - memset(lvb, 0, sizeof(*lvb)); - - /* XXX: timestamps can be negative by sanity:test_39m, - * how can it be? */ - lvb->lvb_atime = LLONG_MIN; - lvb->lvb_ctime = LLONG_MIN; - lvb->lvb_mtime = LLONG_MIN; - - /* - * XXX that should be replaced with a loop over sub-objects, - * doing cl_object_attr_get() on them. But for now, let's - * reuse old lov code. - */ - - /* - * XXX take lsm spin-lock to keep lov_merge_lvb_kms() - * happy. It's not needed, because new code uses - * ->coh_attr_guard spin-lock to protect consistency of - * sub-object attributes. 
- */ - lov_stripe_lock(lsm); - result = lov_merge_lvb_kms(lsm, index, lvb, &kms); - lov_stripe_unlock(lsm); - if (result == 0) { - cl_lvb2attr(attr, lvb); - attr->cat_kms = kms; - r0->lo_attr_valid = 1; - *lov_attr = attr; - } - - return result; -} - -static struct lov_comp_layout_entry_ops raid0_ops = { - .lco_init = lov_init_raid0, - .lco_fini = lov_fini_raid0, - .lco_getattr = lov_attr_get_raid0, -}; - -static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov, - unsigned int index, struct lov_layout_entry *lle, - struct cl_attr **lov_attr) -{ - struct lov_layout_dom *dom = &lle->lle_dom; - struct lov_oinfo *loi = dom->lo_loi; - struct cl_attr *attr = &dom->lo_dom_r0.lo_attr; - - if (dom->lo_dom_r0.lo_attr_valid) { - *lov_attr = attr; - return 0; - } - - if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) - return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); - - cl_lvb2attr(attr, &loi->loi_lvb); - - /* DoM component size can be bigger than stripe size after - * client's setattr RPC, so do not count anything beyond - * component end. Alternatively, check that limit on server - * and do not allow size overflow there. */ - if (attr->cat_size > lle->lle_extent->e_end) - attr->cat_size = lle->lle_extent->e_end; - - attr->cat_kms = attr->cat_size; - - dom->lo_dom_r0.lo_attr_valid = 1; - *lov_attr = attr; - - return 0; -} - -/** - * Lookup FLD to get MDS index of the given DOM object FID. - * - * \param[in] ld LOV device - * \param[in] fid FID to lookup - * \param[out] nr index in MDC array to return back - * - * \retval 0 and \a mds filled with MDS index if successful - * \retval negative value on error - */ -static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid, - __u32 *nr) -{ - __u32 mds_idx; - int i, rc; - - ENTRY; - - rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid), - &mds_idx, LU_SEQ_RANGE_MDT, NULL); - if (rc) { - CERROR("%s: error while looking for mds number. Seq %#llx" - ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), - fid_seq(fid), rc); - RETURN(rc); - } - - CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", - mds_idx, PFID(fid)); - - /* find proper MDC device in the array */ - for (i = 0; i < ld->ld_md_tgts_nr; i++) { - if (ld->ld_md_tgts[i].ldm_mdc != NULL && - ld->ld_md_tgts[i].ldm_idx == mds_idx) - break; - } - - if (i == ld->ld_md_tgts_nr) { - CERROR("%s: cannot find corresponding MDC device for mds #%x " - "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), - mds_idx, PFID(fid)); - rc = -EINVAL; - } else { - *nr = i; - } - RETURN(rc); -} - -/** - * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object. - * - * Init the DOM object for the first time. It prepares also RAID0 entry - * for it to use in common methods with ordinary RAID0 layout entries. 
- * - * \param[in] env execution environment - * \param[in] dev LOV device - * \param[in] lov LOV object - * \param[in] index Composite layout entry index in LSM - * \param[in] lle Composite LOV layout entry - */ -static int lov_init_dom(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, unsigned int index, - const struct cl_object_conf *conf, - struct lov_layout_entry *lle) -{ - struct lov_thread_info *lti = lov_env_info(env); - struct lov_stripe_md_entry *lsme = lov_lse(lov, index); - struct cl_object *clo; - struct lu_object *o = lov2lu(lov); - const struct lu_fid *fid = lu_object_fid(o); - struct cl_device *mdcdev; - struct lov_oinfo *loi = NULL; - struct cl_object_conf *sconf = <i->lti_stripe_conf; - - int rc; - __u32 idx = 0; - - ENTRY; - - LASSERT(index == 0); - - /* find proper MDS device */ - rc = lov_fld_lookup(dev, fid, &idx); - if (rc) - RETURN(rc); - - LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL, - "LOV md target[%u] is NULL\n", idx); - - /* check lsm is DOM, more checks are needed */ - LASSERT(lsme->lsme_stripe_count == 0); - - /* - * Create lower cl_objects. - */ - mdcdev = dev->ld_md_tgts[idx].ldm_mdc; - - LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n"); - - /* DoM object has no oinfo in LSM entry, create it exclusively */ - OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); - if (loi == NULL) - RETURN(-ENOMEM); - - fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi); - - sconf->u.coc_oinfo = loi; -again: - clo = lov_sub_find(env, mdcdev, fid, sconf); - if (IS_ERR(clo)) - GOTO(out, rc = PTR_ERR(clo)); - - rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0)); - if (rc == -EAGAIN) /* try again */ - goto again; - else if (rc != 0) - GOTO(out, rc); - - lle->lle_dom.lo_dom = cl2lovsub(clo); - spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock); - lle->lle_dom.lo_dom_r0.lo_nr = 1; - lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom; - lle->lle_dom.lo_loi = loi; - - rc = lov_page_slice_fixup(lov, clo); - RETURN(rc); - -out: - if (loi != NULL) - OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab); - return rc; -} - -/** - * Implementation of lov_layout_operations::llo_fini for DOM object. - * - * Finish the DOM object and free related memory. 
- * - * \param[in] env execution environment - * \param[in] lov LOV object - * \param[in] state LOV layout state - */ -static void lov_fini_dom(const struct lu_env *env, - struct lov_layout_entry *lle) -{ - if (lle->lle_dom.lo_dom != NULL) - lle->lle_dom.lo_dom = NULL; - if (lle->lle_dom.lo_loi != NULL) - OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab); -} - -static struct lov_comp_layout_entry_ops dom_ops = { - .lco_init = lov_init_dom, - .lco_fini = lov_fini_dom, - .lco_getattr = lov_attr_get_dom, -}; - static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, struct lov_object *lov, struct lov_stripe_md *lsm, const struct cl_object_conf *conf, union lov_layout_state *state) { struct lov_layout_composite *comp = &state->composite; - struct lov_layout_entry *lle; - struct lov_mirror_entry *lre; unsigned int entry_count; unsigned int psz = 0; - unsigned int mirror_count; - int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK; int result = 0; - unsigned int seq; - int i, j; + int i; ENTRY; @@ -649,155 +293,36 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, lov->lo_lsm = lsm_addref(lsm); lov->lo_layout_invalid = true; - dump_lsm(D_INODE, lsm); - entry_count = lsm->lsm_entry_count; - - spin_lock_init(&comp->lo_write_lock); - comp->lo_flags = lsm->lsm_flags; - comp->lo_mirror_count = lsm->lsm_mirror_count + 1; - comp->lo_entry_count = lsm->lsm_entry_count; - comp->lo_preferred_mirror = -1; - - if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1)) - RETURN(-EINVAL); - - OBD_ALLOC(comp->lo_mirrors, - comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); - if (comp->lo_mirrors == NULL) - RETURN(-ENOMEM); + comp->lo_entry_count = entry_count; OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries)); if (comp->lo_entries == NULL) RETURN(-ENOMEM); - /* Initiate all entry types and extents data at first */ - for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) { - int mirror_id = 0; - - lle = &comp->lo_entries[i]; - - lle->lle_lsme = lsm->lsm_entries[i]; - lle->lle_type = lov_entry_type(lle->lle_lsme); - switch (lle->lle_type) { - case LOV_PATTERN_RAID0: - lle->lle_comp_ops = &raid0_ops; - break; - case LOV_PATTERN_MDT: - lle->lle_comp_ops = &dom_ops; - break; - default: - CERROR("%s: unknown composite layout entry type %i\n", - lov2obd(dev->ld_lov)->obd_name, - lsm->lsm_entries[i]->lsme_pattern); - dump_lsm(D_ERROR, lsm); - RETURN(-EIO); - } - - lle->lle_extent = &lle->lle_lsme->lsme_extent; - lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE); - - if (flr_state != LCM_FL_NONE) - mirror_id = mirror_id_of(lle->lle_lsme->lsme_id); - - lre = &comp->lo_mirrors[j]; - if (i > 0) { - if (mirror_id == lre->lre_mirror_id) { - lre->lre_valid |= lle->lle_valid; - lre->lre_stale |= !lle->lle_valid; - lre->lre_end = i; - continue; - } - - /* new mirror detected, assume that the mirrors - * are shorted in layout */ - ++mirror_count; - ++j; - if (j >= comp->lo_mirror_count) - break; - - lre = &comp->lo_mirrors[j]; - } - - /* entries must be sorted by mirrors */ - lre->lre_mirror_id = mirror_id; - lre->lre_start = lre->lre_end = i; - lre->lre_preferred = !!(lle->lle_lsme->lsme_flags & - LCME_FL_PREF_RD); - lre->lre_valid = lle->lle_valid; - lre->lre_stale = !lle->lle_valid; - } - - /* sanity check for FLR */ - if (mirror_count != comp->lo_mirror_count) { - CDEBUG(D_INODE, DFID - " doesn't have the # of mirrors it claims, %u/%u\n", - PFID(lu_object_fid(lov2lu(lov))), mirror_count, - comp->lo_mirror_count + 1); - - 
GOTO(out, result = -EINVAL); - } - - lov_foreach_layout_entry(lov, lle) { - int index = lov_layout_entry_index(lov, lle); + for (i = 0; i < entry_count; i++) { + struct lov_layout_entry *le = &comp->lo_entries[i]; + le->lle_extent = lsm->lsm_entries[i]->lsme_extent; /** * If the component has not been init-ed on MDS side, for * PFL layout, we'd know that the components beyond this one * will be dynamically init-ed later on file write/trunc ops. */ - if (!lsme_inited(lle->lle_lsme)) + if (!lsm_entry_inited(lsm, i)) continue; - result = lle->lle_comp_ops->lco_init(env, dev, lov, index, - conf, lle); + result = lov_init_raid0(env, dev, lov, i, &le->lle_raid0); if (result < 0) break; LASSERT(ergo(psz > 0, psz == result)); psz = result; } - if (psz > 0) - cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; - - /* decide the preferred mirror. It uses the hash value of lov_object - * so that different clients would use different mirrors for read. */ - mirror_count = 0; - seq = hash_long((unsigned long)lov, 8); - for (i = 0; i < comp->lo_mirror_count; i++) { - unsigned int idx = (i + seq) % comp->lo_mirror_count; - - lre = lov_mirror_entry(lov, idx); - if (lre->lre_stale) - continue; - - mirror_count++; /* valid mirror */ - - if (lre->lre_preferred || comp->lo_preferred_mirror < 0) - comp->lo_preferred_mirror = idx; - } - if (!mirror_count) { - CDEBUG(D_INODE, DFID - " doesn't have any valid mirrors\n", - PFID(lu_object_fid(lov2lu(lov)))); - - comp->lo_preferred_mirror = 0; - } - - LASSERT(comp->lo_preferred_mirror >= 0); - - EXIT; -out: - return result > 0 ? 0 : result; -} + cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; -static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, struct lov_stripe_md *lsm, - const struct cl_object_conf *conf, - union lov_layout_state *state) -{ - return 0; + return result > 0 ? 
0 : result; } static int lov_init_released(const struct lu_env *env, @@ -814,6 +339,43 @@ static int lov_init_released(const struct lu_env *env, return 0; } +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) { @@ -823,6 +385,77 @@ static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, return 0; } +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + struct lu_site_bkt_data *bkt; + wait_queue_entry_t *waiter; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_fini() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). */ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(&bkt->lsb_marche_funebre, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0) +{ + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. 
+ */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + static int lov_delete_composite(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) @@ -837,7 +470,7 @@ static int lov_delete_composite(const struct lu_env *env, lov_layout_wait(env, lov); if (comp->lo_entries) lov_foreach_layout_entry(lov, entry) - lov_delete_raid0(env, lov, entry); + lov_delete_raid0(env, lov, &entry->lle_raid0); RETURN(0); } @@ -848,6 +481,15 @@ static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); } +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_raid0 *r0) +{ + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + r0->lo_sub = NULL; + } +} + static void lov_fini_composite(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) @@ -859,21 +501,13 @@ static void lov_fini_composite(const struct lu_env *env, struct lov_layout_entry *entry; lov_foreach_layout_entry(lov, entry) - entry->lle_comp_ops->lco_fini(env, entry); + lov_fini_raid0(env, &entry->lle_raid0); OBD_FREE(comp->lo_entries, comp->lo_entry_count * sizeof(*comp->lo_entries)); comp->lo_entries = NULL; } - if (comp->lo_mirrors != NULL) { - OBD_FREE(comp->lo_mirrors, - comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); - comp->lo_mirrors = NULL; - } - - memset(comp, 0, sizeof(*comp)); - dump_lsm(D_INODE, lov->lo_lsm); lov_free_memmd(&lov->lo_lsm); @@ -896,6 +530,24 @@ static int lov_print_empty(const struct lu_env *env, void *cookie, return 0; } +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, struct lov_layout_raid0 *r0) +{ + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + static int lov_print_composite(const struct lu_env *env, void *cookie, lu_printer_t p, const struct lu_object *o) { @@ -911,15 +563,12 @@ static int lov_print_composite(const struct lu_env *env, void *cookie, for (i = 0; i < lsm->lsm_entry_count; i++) { struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; - struct lov_layout_entry *lle = lov_entry(lov, i); - (*p)(env, cookie, - DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n", + (*p)(env, cookie, DEXT ": { 0x%08X, %u, %u, %#x, %u, %u }\n", PEXT(&lse->lsme_extent), lse->lsme_magic, - lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen, - lse->lsme_flags, lse->lsme_stripe_count, - lse->lsme_stripe_size); - lov_print_raid0(env, cookie, p, lle); + lse->lsme_id, lse->lsme_layout_gen, lse->lsme_flags, + lse->lsme_stripe_count, lse->lsme_stripe_size); + lov_print_raid0(env, cookie, p, lov_r0(lov, i)); } return 0; @@ -953,6 +602,51 @@ static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, return 0; } +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_raid0 *r0) + +{ + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) + return 0; + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? 
*/ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + } + + return result; +} + static int lov_attr_get_composite(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr) @@ -960,34 +654,25 @@ static int lov_attr_get_composite(const struct lu_env *env, struct lov_object *lov = cl2lov(obj); struct lov_layout_entry *entry; int result = 0; + int index = 0; ENTRY; attr->cat_size = 0; attr->cat_blocks = 0; lov_foreach_layout_entry(lov, entry) { - struct cl_attr *lov_attr = NULL; - int index = lov_layout_entry_index(lov, entry); - - if (!entry->lle_valid) - continue; + struct lov_layout_raid0 *r0 = &entry->lle_raid0; + struct cl_attr *lov_attr = &r0->lo_attr; /* PFL: This component has not been init-ed. */ if (!lsm_entry_inited(lov->lo_lsm, index)) - continue; - - result = entry->lle_comp_ops->lco_getattr(env, lov, index, - entry, &lov_attr); - if (result < 0) - RETURN(result); + break; - if (lov_attr == NULL) - continue; + result = lov_attr_get_raid0(env, lov, index, r0); + if (result != 0) + break; - CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu " - "b=%llu\n", index - 1, lov_attr->cat_size, - lov_attr->cat_mtime, lov_attr->cat_atime, - lov_attr->cat_ctime, lov_attr->cat_blocks); + index++; /* merge results */ attr->cat_blocks += lov_attr->cat_blocks; @@ -1002,58 +687,29 @@ static int lov_attr_get_composite(const struct lu_env *env, if (attr->cat_mtime < lov_attr->cat_mtime) attr->cat_mtime = lov_attr->cat_mtime; } - - RETURN(0); -} - -static int lov_flush_composite(const struct lu_env *env, - struct cl_object *obj, - struct ldlm_lock *lock) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_layout_entry *lle; - int rc = -ENODATA; - - ENTRY; - - lov_foreach_layout_entry(lov, lle) { - if (!lsme_is_dom(lle->lle_lsme)) - continue; - rc = cl_object_flush(env, lovsub2cl(lle->lle_dom.lo_dom), lock); - break; - } - - RETURN(rc); -} - -static int lov_flush_empty(const struct lu_env *env, struct cl_object *obj, - struct ldlm_lock *lock) -{ - return 0; + RETURN(result); } const static struct lov_layout_operations lov_dispatch[] = { - [LLT_EMPTY] = { - .llo_init = lov_init_empty, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_empty, - .llo_print = lov_print_empty, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_empty, + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, .llo_getattr = lov_attr_get_empty, - .llo_flush = lov_flush_empty, - }, - [LLT_RELEASED] = { - .llo_init = lov_init_released, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_released, - .llo_print = lov_print_released, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_released, + }, 
+ [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, .llo_getattr = lov_attr_get_empty, - .llo_flush = lov_flush_empty, }, [LLT_COMP] = { .llo_init = lov_init_composite, @@ -1064,7 +720,6 @@ const static struct lov_layout_operations lov_dispatch[] = { .llo_lock_init = lov_lock_init_composite, .llo_io_init = lov_io_init_composite, .llo_getattr = lov_attr_get_composite, - .llo_flush = lov_flush_composite, }, }; @@ -1226,11 +881,12 @@ static int lov_layout_change(const struct lu_env *unused, CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", PFID(lu_object_fid(lov2lu(lov))), lov, llt); + lov->lo_type = LLT_EMPTY; + /* page bufsize fixup */ cl_object_header(&lov->lo_cl)->coh_page_bufsize -= lov_page_slice_fixup(lov, NULL); - lov->lo_type = llt; rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); if (rc != 0) { struct obd_device *obd = lov2obd(lov_dev->ld_lov); @@ -1240,10 +896,11 @@ static int lov_layout_change(const struct lu_env *unused, new_ops->llo_delete(env, lov, state); new_ops->llo_fini(env, lov, state); /* this file becomes an EMPTY file. */ - lov->lo_type = LLT_EMPTY; GOTO(out, rc); } + lov->lo_type = llt; + out: cl_env_put(env, &refcheck); RETURN(rc); @@ -1399,7 +1056,7 @@ int lov_page_init(const struct lu_env *env, struct cl_object *obj, int lov_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved); + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n", PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, @@ -1601,43 +1258,6 @@ struct fiemap_state { bool fs_enough; }; -static struct cl_object *lov_find_subobj(const struct lu_env *env, - struct lov_object *lov, - struct lov_stripe_md *lsm, - int index) -{ - struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); - struct lov_thread_info *lti = lov_env_info(env); - struct lu_fid *ofid = <i->lti_fid; - struct lov_oinfo *oinfo; - struct cl_device *subdev; - int entry = lov_comp_entry(index); - int stripe = lov_comp_stripe(index); - int ost_idx; - int rc; - struct cl_object *result; - - if (lov->lo_type != LLT_COMP) - GOTO(out, result = NULL); - - if (entry >= lsm->lsm_entry_count || - stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) - GOTO(out, result = NULL); - - oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; - ost_idx = oinfo->loi_ost_idx; - rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); - if (rc != 0) - GOTO(out, result = NULL); - - subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); - result = lov_sub_find(env, subdev, ofid, NULL); -out: - if (result == NULL) - result = ERR_PTR(-EINVAL); - return result; -} - int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, struct lov_stripe_md *lsm, struct fiemap *fiemap, size_t *buflen, struct ll_fiemap_info_key *fmkey, @@ -1678,7 +1298,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, if (lun_start == lun_end) return 0; - req_fm_len = obd_object_end - lun_start + 1; + req_fm_len = obd_object_end - lun_start; fs->fs_fm->fm_length = 0; len_mapped_single_call = 0; @@ -1721,7 +1341,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, fs->fs_fm->fm_mapped_extents = 1; fm_ext[0].fe_logical = lun_start; - fm_ext[0].fe_length = obd_object_end - lun_start + 
1; + fm_ext[0].fe_length = obd_object_end - lun_start; fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; goto inactive_tgt; @@ -1836,11 +1456,8 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, ENTRY; lsm = lov_lsm_addref(cl2lov(obj)); - if (lsm == NULL) { - /* no extent: there is no object for mapping */ - fiemap->fm_mapped_extents = 0; - return 0; - } + if (lsm == NULL) + RETURN(-ENODATA); if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { /** @@ -1854,10 +1471,6 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, GOTO(out_lsm, rc = -ENOTSUPP); } - /* No support for DOM layout yet. */ - if (lsme_is_dom(lsm->lsm_entries[0])) - GOTO(out_lsm, rc = -ENOTSUPP); - if (lsm->lsm_is_released) { if (fiemap->fm_start < fmkey->lfik_oa.o_size) { /** @@ -1924,7 +1537,6 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, if (start_entry == -1 || end_entry == -1) GOTO(out_fm_local, rc = -EINVAL); - /* TODO: rewrite it with lov_foreach_io_layout() */ for (entry = start_entry; entry <= end_entry; entry++) { lsme = lsm->lsm_entries[entry]; @@ -2054,13 +1666,6 @@ static loff_t lov_object_maxbytes(struct cl_object *obj) return maxbytes; } -static int lov_object_flush(const struct lu_env *env, struct cl_object *obj, - struct ldlm_lock *lock) -{ - return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_flush, true, env, obj, - lock); -} - static const struct cl_object_operations lov_ops = { .coo_page_init = lov_page_init, .coo_lock_init = lov_lock_init, @@ -2072,7 +1677,6 @@ static const struct cl_object_operations lov_ops = { .coo_layout_get = lov_object_layout_get, .coo_maxbytes = lov_object_maxbytes, .coo_fiemap = lov_object_fiemap, - .coo_object_flush = lov_object_flush }; static const struct lu_object_operations lov_lu_obj_ops = { diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c index de2e6c47da8ee..3ff0a38a7e263 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,15 +38,12 @@ #include "lov_internal.h" -loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +static loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) { struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; LASSERT(index < lsm->lsm_entry_count); - if (lsme_is_dom(entry)) - return (loff_t)entry->lsme_stripe_size; - return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; } @@ -58,11 +55,10 @@ u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, unsigned long stripe_size; loff_t swidth; loff_t lov_size; + ENTRY; - ENTRY; - - if (ost_size == 0) - RETURN(0); + if (ost_size == 0) + RETURN(0); swidth = stripe_width(lsm, index); @@ -73,7 +69,7 @@ u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, else lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; - RETURN(lov_size); + RETURN(lov_size); } /** @@ -90,8 +86,7 @@ pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, return offset >> PAGE_SHIFT; } -/* - * we have an offset in file backed by an lov and want to find out where +/* we have an offset in file backed by an lov and want to find out where * that offset lands in our given stripe of the file. for the easy * case where the offset is within the stripe, we just have to scale the * offset down to make it relative to the stripe instead of the lov. @@ -138,8 +133,7 @@ pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, * this function returns < 0 when the offset was "before" the stripe and * was moved forward to the start of the stripe in question; 0 when it * falls in the stripe and no shifting was done; > 0 when the offset - * was outside the stripe and was pulled back to its final byte. - */ + * was outside the stripe and was pulled back to its final byte. */ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, int stripeno, loff_t *obdoff) { @@ -147,12 +141,12 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, loff_t stripe_off; loff_t this_stripe; loff_t swidth; - int ret = 0; + int ret = 0; - if (lov_off == OBD_OBJECT_EOF) { - *obdoff = OBD_OBJECT_EOF; - return 0; - } + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } swidth = stripe_width(lsm, index); @@ -160,24 +154,23 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, stripe_off = lov_do_div64(lov_off, swidth); this_stripe = (loff_t)stripeno * ssize; - if (stripe_off < this_stripe) { - stripe_off = 0; - ret = -1; - } else { - stripe_off -= this_stripe; - - if (stripe_off >= ssize) { - stripe_off = ssize; - ret = 1; - } - } - - *obdoff = lov_off * ssize + stripe_off; - return ret; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; } -/* - * Given a whole-file size and a stripe number, give the file size which +/* Given a whole-file size and a stripe number, give the file size which * corresponds to the individual object of that stripe. 
* * This behaves basically in the same was as lov_stripe_offset, except that @@ -204,8 +197,8 @@ loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, loff_t this_stripe; loff_t swidth; - if (file_size == OBD_OBJECT_EOF) - return OBD_OBJECT_EOF; + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; swidth = stripe_width(lsm, index); @@ -213,39 +206,35 @@ loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, stripe_off = lov_do_div64(file_size, swidth); this_stripe = (loff_t)stripeno * ssize; - if (stripe_off < this_stripe) { - /* Move to end of previous stripe, or zero */ - if (file_size > 0) { - file_size--; - stripe_off = ssize; - } else { - stripe_off = 0; - } - } else { - stripe_off -= this_stripe; - - if (stripe_off >= ssize) { - /* Clamp to end of this stripe */ - stripe_off = ssize; - } - } - - return (file_size * ssize + stripe_off); + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); } -/* - * given an extent in an lov and a stripe, calculate the extent of the stripe +/* given an extent in an lov and a stripe, calculate the extent of the stripe * that is contained within the lov extent. this returns true if the given - * stripe does intersect with the lov extent. - * - * Closed interval [@obd_start, @obd_end] will be returned. - */ + * stripe does intersect with the lov extent. */ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, struct lu_extent *ext, u64 *obd_start, u64 *obd_end) { struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; u64 start, end; - int start_side, end_side; + int start_side, end_side; if (!lu_extent_is_overlapped(ext, &entry->lsme_extent)) return 0; @@ -261,28 +250,24 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n", start, end, start_side, *obd_start, *obd_end, end_side); - /* - * this stripe doesn't intersect the file extent when neither - * start or the end intersected the stripe and obd_start and - * obd_end got rounded up to the save value. - */ - if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) - return 0; - - /* - * as mentioned in the lov_stripe_offset commentary, end - * might have been shifted in the wrong direction. This - * happens when an end offset is before the stripe when viewed - * through the "mod stripe size" math. we detect it being shifted - * in the wrong direction and touch it up. - * interestingly, this can't underflow since end must be > start - * if we passed through the previous check. - * (should we assert for that somewhere?) - */ - if (end_side != 0) - (*obd_end)--; - - return 1; + /* this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. 
+ * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) */ + if (end_side != 0) + (*obd_end)--; + + return 1; } /* compute which stripe number "lov_off" will be written into */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c index 6fe3c2ff5bd5b..dd29ff51dcc1c 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,6 +38,9 @@ #define DEBUG_SUBSYSTEM S_LOV +#include +#include + #include #include #include @@ -50,16 +53,16 @@ void lov_dump_lmm_common(int level, void *lmmp) { struct lov_mds_md *lmm = lmmp; - struct ost_id oi; + struct ost_id oi; lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); - CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", - POSTID(&oi), le32_to_cpu(lmm->lmm_magic), - le32_to_cpu(lmm->lmm_pattern)); - CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", - le32_to_cpu(lmm->lmm_stripe_size), - le16_to_cpu(lmm->lmm_stripe_count), - le16_to_cpu(lmm->lmm_layout_gen)); + CDEBUG_LIMIT(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG_LIMIT(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); } static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, @@ -68,8 +71,9 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, int i; if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { - CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", - stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + CDEBUG_LIMIT(level, + "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); return; } @@ -77,22 +81,22 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, struct ost_id oi; ostid_le_to_cpu(&lod->l_ost_oi, &oi); - CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, - le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + CDEBUG_LIMIT(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); } } void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) { - lov_dump_lmm_common(level, lmm); - lov_dump_lmm_objects(level, lmm->lmm_objects, - le16_to_cpu(lmm->lmm_stripe_count)); + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); } void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) { lov_dump_lmm_common(level, lmm); - CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + CDEBUG_LIMIT(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); lov_dump_lmm_objects(level, lmm->lmm_objects, le16_to_cpu(lmm->lmm_stripe_count)); } @@ -110,8 +114,8 @@ void lov_dump_lmm(int level, void *lmm) lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); break; default: - CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", - magic, LOV_MAGIC_V1); + CDEBUG_LIMIT(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); lov_dump_lmm_common(level, lmm); break; } @@ -133,7 
+137,6 @@ ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, struct lov_ost_data_v1 *lmm_objects; size_t lmm_size; unsigned int i; - ENTRY; lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, @@ -144,8 +147,7 @@ ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, if (buf_size < lmm_size) RETURN(-ERANGE); - /* - * lmmv1 and lmmv3 point to the same struct and have the + /* lmmv1 and lmmv3 point to the same struct and have the * same first fields */ lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); @@ -193,7 +195,6 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, unsigned int offset; unsigned int size; unsigned int i; - ENTRY; if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) @@ -209,8 +210,6 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); lcmv1->lcm_size = cpu_to_le32(lmm_size); lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); - lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags); - lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count); lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; @@ -225,9 +224,6 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, lcme->lcme_id = cpu_to_le32(lsme->lsme_id); lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); - if (lsme->lsme_flags & LCME_FL_NOSYNC) - lcme->lcme_timestamp = - cpu_to_le64(lsme->lsme_timestamp); lcme->lcme_extent.e_start = cpu_to_le64(lsme->lsme_extent.e_start); lcme->lcme_extent.e_end = @@ -290,10 +286,8 @@ __u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) if (!stripe_count) stripe_count = 1; - /* - * stripe count is based on whether ldiskfs can handle - * larger EA sizes - */ + /* stripe count is based on whether ldiskfs can handle + * larger EA sizes */ if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && lov->lov_ocd.ocd_max_easize) max_stripes = lov_mds_md_max_stripe_count( @@ -319,8 +313,7 @@ int lov_free_memmd(struct lov_stripe_md **lsmp) return refc; } -/* - * Unpack LOV object metadata from disk storage. It is packed in LE byte +/* Unpack LOV object metadata from disk storage. It is packed in LE byte * order and is opaque to the networking layer. */ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, @@ -329,7 +322,6 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, const struct lsm_operations *op; struct lov_stripe_md *lsm; u32 magic; - ENTRY; if (buf_size < sizeof(magic)) @@ -337,7 +329,7 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, magic = le32_to_cpu(*(u32 *)buf); op = lsm_op_find(magic); - if (!op) + if (op == NULL) RETURN(ERR_PTR(-EINVAL)); lsm = op->lsm_unpackmd(lov, buf, buf_size); @@ -345,8 +337,7 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, RETURN(lsm); } -/* - * Retrieve object striping information. +/* Retrieve object striping information. * * @lump is a pointer to an in-core struct with lmm_ost_count indicating * the maximum number of OST indices which will fit in the user buffer. 
@@ -362,10 +353,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ struct lov_mds_md *lmmk, *lmm; struct lov_user_md_v1 lum; - size_t lmmk_size, lum_size = 0; - ssize_t lmm_size; - int rc = 0; - + size_t lmmk_size; + ssize_t lmm_size, lum_size = 0; + static bool printed; + int rc = 0; ENTRY; if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && @@ -375,10 +366,18 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, GOTO(out, rc = -EIO); } + if (!printed) { + LCONSOLE_WARN("%s: using old ioctl(LL_IOC_LOV_GETSTRIPE) on " + DFID", use llapi_layout_get_by_path()\n", + current->comm, + PFID(&obj->lo_cl.co_lu.lo_header->loh_fid)); + printed = true; + } + lmmk_size = lov_comp_md_size(lsm); OBD_ALLOC_LARGE(lmmk, lmmk_size); - if (!lmmk) + if (lmmk == NULL) GOTO(out, rc = -ENOMEM); lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); @@ -398,10 +397,8 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, } } - /* - * Legacy appication passes limited buffer, we need to figure out - * the user buffer size by the passed in lmm_stripe_count. - */ + /* Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. */ if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) GOTO(out_free, rc = -EFAULT); @@ -413,10 +410,8 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, if (lum_size != 0) { struct lov_mds_md *comp_md = lmmk; - /* - * Legacy app (ADIO for instance) treats the layout as V1/V3 - * blindly, we'd return a reasonable V1/V3 for them. - */ + /* Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. */ if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { struct lov_comp_md_v1 *comp_v1; struct cl_object *cl_obj; @@ -429,10 +424,8 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, cl_object_attr_get(env, cl_obj, &attr); cl_object_attr_unlock(cl_obj); - /* - * return the last instantiated component if file size - * is non-zero, otherwise, return the last component. - */ + /* return the last instantiated component if file size + * is non-zero, otherwise, return the last component.*/ comp_v1 = (struct lov_comp_md_v1 *)lmmk; i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; for (; i < comp_v1->lcm_entry_count; i++) { @@ -444,11 +437,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, i--; comp_md = (struct lov_mds_md *)((char *)comp_v1 + comp_v1->lcm_entries[i].lcme_offset); - lum_size = comp_v1->lcm_entries[i].lcme_size; } lmm = comp_md; - lmm_size = min(lum_size, lmmk_size); + lmm_size = lum_size; } else { lmm = lmmk; lmm_size = lmmk_size; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c index 34fbc66e47172..869c0b8478760 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_page.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,8 +56,8 @@ static int lov_comp_page_print(const struct lu_env *env, struct lov_page *lp = cl2lov_page(slice); return (*printer)(env, cookie, - LUSTRE_LOV_NAME"-page@%p, comp index: %x, gen: %u\n", - lp, lp->lps_index, lp->lps_layout_gen); + LUSTRE_LOV_NAME"-page@%p, comp index: %x\n", + lp, lp->lps_index); } static const struct cl_page_operations lov_comp_page_ops = { @@ -68,22 +68,21 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index) { struct lov_object *loo = cl2lov(obj); - struct lov_io *lio = lov_env_io(env); - struct cl_object *subobj; - struct cl_object *o; + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; struct lov_io_sub *sub; - struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_page *lpg = cl_object_page_slice(obj, page); struct lov_layout_raid0 *r0; - loff_t offset; - loff_t suboff; - int entry; - int stripe; - int rc; - + loff_t offset; + loff_t suboff; + int entry; + int stripe; + int rc; ENTRY; offset = cl_offset(obj, index); - entry = lov_io_layout_at(lio, offset); + entry = lov_lsm_entry(loo->lo_lsm, offset); if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { /* non-existing layout component */ lov_page_init_empty(env, obj, page, index); @@ -97,7 +96,6 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, LASSERT(rc == 0); lpg->lps_index = lov_comp_index(entry, stripe); - lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops); sub = lov_sub_get(env, lio, lpg->lps_index); @@ -107,7 +105,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, subobj = lovsub2cl(r0->lo_sub[stripe]); list_for_each_entry(o, &subobj->co_lu.lo_header->loh_layers, co_lu.lo_linkage) { - if (o->co_ops->coo_page_init) { + if (o->co_ops->coo_page_init != NULL) { rc = o->co_ops->coo_page_init(sub->sub_env, o, page, cl_index(subobj, suboff)); if (rc != 0) @@ -122,9 +120,9 @@ static int lov_empty_page_print(const struct lu_env *env, const struct cl_page_slice *slice, void *cookie, lu_printer_t printer) { - struct lov_page *lp = cl2lov_page(slice); + struct lov_page *lp = cl2lov_page(slice); - return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); } static const struct cl_page_operations lov_empty_page_ops = { @@ -136,10 +134,8 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, { struct lov_page *lpg = cl_object_page_slice(obj, page); void *addr; - ENTRY; - lpg->lps_index = ~0; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops); addr = kmap(page->cp_vmpage); memset(addr, 0, cl_page_size(obj)); @@ -148,14 +144,6 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, RETURN(0); } -bool lov_page_is_empty(const struct cl_page *page) -{ - const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type); - - LASSERT(slice != NULL); - return slice->cpl_ops == &lov_empty_page_ops; -} - /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c index 6173dbe1429ae..02b8899cb1b68 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -152,6 +152,7 @@ struct cfs_hash_ops pool_hash_operations = { }; #ifdef CONFIG_PROC_FS +/* ifdef needed for liblustre support */ /* * pool /proc seq_file methods */ @@ -181,11 +182,14 @@ static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) /* iterate to find a non empty entry */ prev_idx = iter->idx; + down_read(&pool_tgt_rw_sem(iter->pool)); iter->idx++; - if (iter->idx >= pool_tgt_count(iter->pool)) { + if (iter->idx == pool_tgt_count(iter->pool)) { iter->idx = prev_idx; /* we stay on the last entry */ + up_read(&pool_tgt_rw_sem(iter->pool)); return NULL; } + up_read(&pool_tgt_rw_sem(iter->pool)); (*pos)++; /* return != NULL to continue */ return iter; @@ -216,7 +220,6 @@ static void *pool_proc_start(struct seq_file *s, loff_t *pos) * we can free it at stop() */ /* /!\ do not forget to restore it to pool before freeing it */ s->private = iter; - down_read(&pool_tgt_rw_sem(pool)); if (*pos > 0) { loff_t i; void *ptr; @@ -238,7 +241,6 @@ static void pool_proc_stop(struct seq_file *s, void *v) * calling start() method (see seq_read() from fs/seq_file.c) * we have to free only if s->private is an iterator */ if ((iter) && (iter->magic == POOL_IT_MAGIC)) { - up_read(&pool_tgt_rw_sem(iter->pool)); /* we restore s->private so next call to pool_proc_start() * will work */ s->private = iter->pool; @@ -257,7 +259,9 @@ static int pool_proc_show(struct seq_file *s, void *v) LASSERT(iter->pool != NULL); LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + down_read(&pool_tgt_rw_sem(iter->pool)); tgt = pool_tgt(iter->pool, iter->idx); + up_read(&pool_tgt_rw_sem(iter->pool)); if (tgt) seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); @@ -283,7 +287,7 @@ static int pool_proc_open(struct inode *inode, struct file *file) return rc; } -const static struct proc_ops pool_proc_operations = { +static struct proc_ops pool_proc_operations = { .proc_open = pool_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -545,7 +549,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) /* search ost in lov array */ - lov_tgts_getref(obd); + obd_getref(obd); for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { if (!lov->lov_tgts[lov_idx]) continue; @@ -566,10 +570,9 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - lov_tgts_putref(obd); - lov_pool_putref(pool); - - return rc; + obd_putref(obd); + lov_pool_putref(pool); + return rc; } int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) @@ -589,7 +592,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) obd_str2uuid(&ost_uuid, ostname); - lov_tgts_getref(obd); + obd_getref(obd); /* search ost in lov array, to get index */ for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { if (!lov->lov_tgts[lov_idx]) @@ -611,8 +614,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - lov_tgts_putref(obd); - lov_pool_putref(pool); - - return rc; + obd_putref(obd); + lov_pool_putref(pool); + return rc; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c index 75e5c901fd91e..fe74af4b7f82d 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_request.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ 
-35,6 +35,8 @@ #include #include +#include + #include "lov_internal.h" static void lov_init_set(struct lov_request_set *set) @@ -49,7 +51,6 @@ static void lov_finish_set(struct lov_request_set *set) { struct list_head *pos, *n; struct lov_request *req; - ENTRY; LASSERT(set != NULL); @@ -57,7 +58,7 @@ static void lov_finish_set(struct lov_request_set *set) req = list_entry(pos, struct lov_request, rq_link); list_del_init(&req->rq_link); - if (req->rq_oi.oi_osfs) + if (req->rq_oi.oi_osfs != NULL) OBD_FREE_PTR(req->rq_oi.oi_osfs); OBD_FREE_PTR(req); @@ -79,18 +80,18 @@ static void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) { list_add_tail(&req->rq_link, &set->set_list); - set->set_count++; - req->rq_rqset = set; + set->set_count++; + req->rq_rqset = set; } static int lov_check_set(struct lov_obd *lov, int idx) { int rc = 0; - mutex_lock(&lov->lov_lock); - if (!lov->lov_tgts[idx] || lov->lov_tgts[idx]->ltd_active || - (lov->lov_tgts[idx]->ltd_exp && + if (lov->lov_tgts[idx] == NULL || + lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp != NULL && class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) rc = 1; @@ -98,8 +99,7 @@ static int lov_check_set(struct lov_obd *lov, int idx) return rc; } -/* - * Check if the OSC connection exists and is active. +/* Check if the OSC connection exists and is active. * If the OSC has not yet had a chance to connect to the OST the first time, * wait once for it to connect instead of returning an error. */ @@ -108,24 +108,19 @@ static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) wait_queue_head_t waitq; struct l_wait_info lwi; struct lov_tgt_desc *tgt; - struct obd_import *imp = NULL; int rc = 0; mutex_lock(&lov->lov_lock); tgt = lov->lov_tgts[ost_idx]; - if (unlikely(!tgt)) + if (unlikely(tgt == NULL)) GOTO(out, rc = 0); if (likely(tgt->ltd_active)) GOTO(out, rc = 1); - if (tgt->ltd_exp) - imp = class_exp2cliimp(tgt->ltd_exp); - if (imp && imp->imp_connect_tried) - GOTO(out, rc = 0); - if (imp && imp->imp_state == LUSTRE_IMP_IDLE) + if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) GOTO(out, rc = 0); mutex_unlock(&lov->lov_lock); @@ -147,20 +142,20 @@ static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) #define LOV_U64_MAX ((__u64)~0ULL) #define LOV_SUM_MAX(tot, add) \ - do { \ - if ((tot) + (add) < (tot)) \ - (tot) = LOV_U64_MAX; \ - else \ - (tot) += (add); \ - } while (0) + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while(0) static int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) { - ENTRY; + ENTRY; - if (success) { - __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, + if (success) { + __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, LOV_MAGIC, 0); if (osfs->os_files != LOV_U64_MAX) lov_do_div64(osfs->os_files, expected_stripes); @@ -169,7 +164,7 @@ lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) spin_lock(&obd->obd_osfs_lock); memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); - obd->obd_osfs_age = ktime_get_seconds(); + obd->obd_osfs_age = cfs_time_current_64(); spin_unlock(&obd->obd_osfs_lock); RETURN(0); } @@ -182,7 +177,7 @@ int lov_fini_statfs_set(struct lov_request_set *set) int rc = 0; ENTRY; - if (!set) + if (set == NULL) RETURN(0); if (atomic_read(&set->set_completes)) { @@ -199,91 +194,84 @@ static void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, int success) { - int shift = 0, quit 
= 0; - __u64 tmp; - - if (success == 0) { - memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); - } else { - if (osfs->os_bsize != lov_sfs->os_bsize) { - /* assume all block sizes are always powers of 2 */ - /* get the bits difference */ - tmp = osfs->os_bsize | lov_sfs->os_bsize; - for (shift = 0; shift <= 64; ++shift) { - if (tmp & 1) { - if (quit) - break; - quit = 1; - shift = 0; - } - tmp >>= 1; - } - } - - if (osfs->os_bsize < lov_sfs->os_bsize) { - osfs->os_bsize = lov_sfs->os_bsize; - - osfs->os_bfree >>= shift; - osfs->os_bavail >>= shift; - osfs->os_blocks >>= shift; - } else if (shift != 0) { - lov_sfs->os_bfree >>= shift; - lov_sfs->os_bavail >>= shift; - lov_sfs->os_blocks >>= shift; - } + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + else + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } #ifdef MIN_DF - /* - * Sandia requested that df (and so, statfs) only - * returned minimal available space on - * a single OST, so people would be able to - * write this much data guaranteed. - */ - if (osfs->os_bavail > lov_sfs->os_bavail) { - /* - * Presumably if new bavail is smaller, - * new bfree is bigger as well - */ - osfs->os_bfree = lov_sfs->os_bfree; - osfs->os_bavail = lov_sfs->os_bavail; - } + /* Sandia requested that df (and so, statfs) only + returned minimal available space on + a single OST, so people would be able to + write this much data guaranteed. */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* Presumably if new bavail is smaller, + new bfree is bigger as well */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } #else - osfs->os_bfree += lov_sfs->os_bfree; - osfs->os_bavail += lov_sfs->os_bavail; + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; #endif - osfs->os_blocks += lov_sfs->os_blocks; - /* - * XXX not sure about this one - depends on policy. - * - could be minimum if we always stripe on all OBDs - * (but that would be wrong for any other policy, - * if one of the OBDs has no more objects left) - * - could be sum if we stripe whole objects - * - could be average, just to give a nice number - * - * To give a "reasonable" (if not wholly accurate) - * number, we divide the total number of free objects - * by expected stripe count (watch out for overflow). - */ - LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); - LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); - } + osfs->os_blocks += lov_sfs->os_blocks; + /* XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). 
+ */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } } -/* - * The callback for osc_statfs_async that finilizes a request info when a - * response is received. - */ +/* The callback for osc_statfs_async that finilizes a request info when a + * response is received. */ static int cb_statfs_update(void *cookie, int rc) { - struct obd_info *oinfo = cookie; - struct lov_request *lovreq; - struct lov_request_set *set; - struct obd_statfs *osfs, *lov_sfs; - struct lov_obd *lov; - struct lov_tgt_desc *tgt; - struct obd_device *lovobd, *tgtobd; - int success; - - ENTRY; + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + ENTRY; lovreq = container_of(oinfo, struct lov_request, rq_oi); set = lovreq->rq_rqset; @@ -292,101 +280,91 @@ static int cb_statfs_update(void *cookie, int rc) osfs = set->set_oi->oi_osfs; lov_sfs = oinfo->oi_osfs; success = atomic_read(&set->set_success); - /* - * XXX: the same is done in lov_update_common_set, however - * lovset->set_exp is not initialized. - */ + /* XXX: the same is done in lov_update_common_set, however + lovset->set_exp is not initialized. */ lov_update_set(set, lovreq, rc); if (rc) GOTO(out, rc); - lov_tgts_getref(lovobd); - tgt = lov->lov_tgts[lovreq->rq_idx]; - if (!tgt || !tgt->ltd_active) - GOTO(out_update, rc); + obd_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); - tgtobd = class_exp2obd(tgt->ltd_exp); + tgtobd = class_exp2obd(tgt->ltd_exp); spin_lock(&tgtobd->obd_osfs_lock); memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) - tgtobd->obd_osfs_age = ktime_get_seconds(); + tgtobd->obd_osfs_age = cfs_time_current_64(); spin_unlock(&tgtobd->obd_osfs_lock); out_update: - lov_update_statfs(osfs, lov_sfs, success); - lov_tgts_putref(lovobd); + lov_update_statfs(osfs, lov_sfs, success); + obd_putref(lovobd); + out: RETURN(0); } int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, - struct lov_request_set **reqset) + struct lov_request_set **reqset) { - struct lov_request_set *set; - struct lov_obd *lov = &obd->u.lov; - int rc = 0, i; - - ENTRY; + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + ENTRY; - OBD_ALLOC(set, sizeof(*set)); - if (!set) - RETURN(-ENOMEM); - lov_init_set(set); + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); - set->set_obd = obd; - set->set_oi = oinfo; + set->set_obd = obd; + set->set_oi = oinfo; - /* We only get block data from the OBD */ - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - struct lov_tgt_desc *ltd = lov->lov_tgts[i]; + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { struct lov_request *req; - if (!ltd) { + if (lov->lov_tgts[i] == NULL || + (oinfo->oi_flags & OBD_STATFS_NODELAY && + !lov->lov_tgts[i]->ltd_active)) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } - /* - * skip targets that have been explicitely disabled by the - * administrator - */ - if (!ltd->ltd_exp) { + /* skip targets that have been explicitely disabled by the + * administrator */ + if (!lov->lov_tgts[i]->ltd_exp) { CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); continue; } - if (oinfo->oi_flags & OBD_STATFS_NODELAY && - 
class_exp2cliimp(ltd->ltd_exp)->imp_state != - LUSTRE_IMP_IDLE && !ltd->ltd_active) { - CDEBUG(D_HA, "lov idx %d inactive\n", i); - continue; - } - - if (!ltd->ltd_active) + if (!lov->lov_tgts[i]->ltd_active) lov_check_and_wait_active(lov, i); OBD_ALLOC(req, sizeof(*req)); - if (!req) + if (req == NULL) GOTO(out_set, rc = -ENOMEM); - OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); - if (!req->rq_oi.oi_osfs) { - OBD_FREE(req, sizeof(*req)); - GOTO(out_set, rc = -ENOMEM); - } - - req->rq_idx = i; - req->rq_oi.oi_cb_up = cb_statfs_update; - req->rq_oi.oi_flags = oinfo->oi_flags; - - lov_set_add_req(req, set); - } - if (!set->set_count) - GOTO(out_set, rc = -EIO); - *reqset = set; - RETURN(rc); + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (req->rq_oi.oi_osfs == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); out_set: - lov_fini_statfs_set(set); - RETURN(rc); + lov_fini_statfs_set(set); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c index 90a11e75393b9..0ada9b5b9ce53 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2015, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,33 +49,33 @@ */ static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) + const char *name, struct lu_device *next) { - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device_type *ldt; - int rc; - - ENTRY; - next->ld_site = d->ld_site; - ldt = next->ld_type; - LASSERT(ldt != NULL); - rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); - if (rc) { - next->ld_site = NULL; - RETURN(rc); - } - - lu_device_get(next); - lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); - lsd->acid_next = lu2cl_dev(next); - RETURN(rc); + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); } static struct lu_device *lovsub_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - struct lu_device *next; - struct lovsub_device *lsd; + struct lu_device *next; + struct lovsub_device *lsd; ENTRY; lsd = lu2lovsub_dev(d); @@ -87,8 +87,8 @@ static struct lu_device *lovsub_device_fini(const struct lu_env *env, static struct lu_device *lovsub_device_free(const struct lu_env *env, struct lu_device *d) { - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device *next = cl2lu_dev(lsd->acid_next); + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); if (atomic_read(&d->ld_ref) && d->ld_site) { LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, 
NULL); @@ -100,48 +100,48 @@ static struct lu_device *lovsub_device_free(const struct lu_env *env, } static const struct lu_device_operations lovsub_lu_ops = { - .ldo_object_alloc = lovsub_object_alloc, - .ldo_process_config = NULL, - .ldo_recovery_complete = NULL + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL }; static struct lu_device *lovsub_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) + struct lu_device_type *t, + struct lustre_cfg *cfg) { - struct lu_device *d; - struct lovsub_device *lsd; - - OBD_ALLOC_PTR(lsd); - if (lsd) { - int result; - - result = cl_device_init(&lsd->acid_cl, t); - if (result == 0) { - d = lovsub2lu_dev(lsd); - d->ld_ops = &lovsub_lu_ops; - } else - d = ERR_PTR(result); - } else - d = ERR_PTR(-ENOMEM); - return d; + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd != NULL) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; } static const struct lu_device_type_operations lovsub_device_type_ops = { - .ldto_device_alloc = lovsub_device_alloc, - .ldto_device_free = lovsub_device_free, + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, - .ldto_device_init = lovsub_device_init, - .ldto_device_fini = lovsub_device_fini + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini }; #define LUSTRE_LOVSUB_NAME "lovsub" struct lu_device_type lovsub_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOVSUB_NAME, - .ldt_ops = &lovsub_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c new file mode 100644 index 0000000000000..de8b5c72260d7 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOVSUB layer. 
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub lock operations. + * + */ + +static void lovsub_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lovsub_lock *lsl; + + ENTRY; + lsl = cl2lovsub_lock(slice); + OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); + EXIT; +} + +static const struct cl_lock_operations lovsub_lock_ops = { + .clo_fini = lovsub_lock_fini, +}; + +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lovsub_lock *lsk; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS); + if (lsk != NULL) { + cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c index d219356cb3ad3..1471de7915162 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,39 +49,37 @@ */ int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) + const struct lu_object_conf *conf) { - struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); - struct lu_object *below; - struct lu_device *under; - - int result; - - ENTRY; - under = &dev->acid_next->cd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); - if (below) { - lu_object_add(obj, below); - cl_object_page_init(lu2cl(obj), 0); - result = 0; - } else - result = -ENOMEM; - RETURN(result); + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); + result = 0; + } else + result = -ENOMEM; + RETURN(result); } static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) { - struct lovsub_object *los = lu2lovsub(obj); - struct lov_object *lov = los->lso_super; - - ENTRY; - - /* - * We can't assume lov was assigned here, because of the shadow - * object handling in lu_object_find. - */ - if (lov) { + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + ENTRY; + + /* We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. 
+ */ + if (lov != NULL) { int index = lov_comp_entry(los->lso_index); int stripe = lov_comp_stripe(los->lso_index); struct lov_layout_raid0 *r0 = lov_r0(lov, index); @@ -93,18 +91,18 @@ static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) spin_unlock(&r0->lo_sub_lock); } - lu_object_fini(obj); - lu_object_header_fini(&los->lso_header.coh_lu); - OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); - EXIT; + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; } static int lovsub_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) + lu_printer_t p, const struct lu_object *obj) { - struct lovsub_object *los = lu2lovsub(obj); + struct lovsub_object *los = lu2lovsub(obj); - return (*p)(env, cookie, "[%d]", los->lso_index); + return (*p)(env, cookie, "[%d]", los->lso_index); } static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, @@ -119,13 +117,13 @@ static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, } static int lovsub_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, - struct ost_lvb *lvb) + const struct cl_object *obj, + struct ost_lvb *lvb) { - struct lovsub_object *los = cl2lovsub(obj); + struct lovsub_object *los = cl2lovsub(obj); - ENTRY; - RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); } /** @@ -138,7 +136,6 @@ static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, { struct lovsub_object *subobj = cl2lovsub(obj); struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; - ENTRY; cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); @@ -154,18 +151,20 @@ static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, } static const struct cl_object_operations lovsub_ops = { + .coo_page_init = lovsub_page_init, + .coo_lock_init = lovsub_lock_init, .coo_attr_update = lovsub_attr_update, .coo_glimpse = lovsub_object_glimpse, .coo_req_attr_set = lovsub_req_attr_set }; static const struct lu_object_operations lovsub_lu_obj_ops = { - .loo_object_init = lovsub_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = lovsub_object_free, - .loo_object_print = lovsub_object_print, - .loo_object_invariant = NULL + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL }; struct lu_object *lovsub_object_alloc(const struct lu_env *env, @@ -177,7 +176,7 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env, ENTRY; OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); - if (los) { + if (los != NULL) { struct cl_object_header *hdr; obj = lovsub2lu(los); diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c new file mode 100644 index 0000000000000..c10a3dfa38c1e --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c @@ -0,0 +1,70 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub page operations. + * + */ + +static void lovsub_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ +} + +static const struct cl_page_operations lovsub_page_ops = { + .cpo_fini = lovsub_page_fini +}; + +int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, pgoff_t index) +{ + struct lovsub_page *lsb = cl_object_page_slice(obj, page); + ENTRY; + + cl_page_slice_add(page, &lsb->lsb_cl, obj, index, &lovsub_page_ops); + RETURN(0); +} + +/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c index f6eeebed9e2b0..41215c11998ef 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,9 +35,10 @@ #include #include #include -#include +#include #include "lov_internal.h" +#ifdef CONFIG_PROC_FS static int lov_stripesize_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = (struct obd_device *)m->private; @@ -56,12 +57,12 @@ static ssize_t lov_stripesize_seq_write(struct file *file, { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct lov_desc *desc; - s64 val; + __s64 val; int rc; LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; if (val < 0) @@ -74,135 +75,150 @@ static ssize_t lov_stripesize_seq_write(struct file *file, } LPROC_SEQ_FOPS(lov_stripesize); -static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; - return sprintf(buf, "%lld\n", desc->ld_default_stripe_offset); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%lld\n", desc->ld_default_stripe_offset); + return 0; } -static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) +static ssize_t lov_stripeoffset_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; - long val; + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + __s64 val; int rc; - rc = kstrtol(buf, 0, &val); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - if (val < -1 || val > LOV_MAX_STRIPE_COUNT) + if (val < -1) return -ERANGE; desc->ld_default_stripe_offset = val; return count; } -LUSTRE_RW_ATTR(stripeoffset); +LPROC_SEQ_FOPS(lov_stripeoffset); -static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lov_stripetype_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; + struct obd_device* dev = (struct obd_device*)m->private; + struct lov_desc *desc; - return sprintf(buf, "%u\n", desc->ld_pattern); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_pattern); + return 0; } -static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t lov_stripetype_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; - u32 pattern; - int rc; + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; + int pattern, rc; + __s64 val; - rc = kstrtouint(buffer, 0, &pattern); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < INT_MIN || val > INT_MAX) + return -ERANGE; + pattern = val; lov_fix_desc_pattern(&pattern); 
desc->ld_pattern = pattern; return count; } -LUSTRE_RW_ATTR(stripetype); +LPROC_SEQ_FOPS(lov_stripetype); -static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lov_stripecount_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; + struct obd_device *dev = (struct obd_device *)m->private; + struct lov_desc *desc; - return sprintf(buf, "%d\n", - (__s16)(desc->ld_default_stripe_count + 1) - 1); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); + return 0; } -static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t lov_stripecount_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; - int stripe_count; + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct lov_desc *desc; int rc; + __u32 stripe_count; + __s64 val; - rc = kstrtoint(buffer, 0, &stripe_count); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - - if (stripe_count < -1) + if (val < -1) return -ERANGE; + stripe_count = val; lov_fix_desc_stripe_count(&stripe_count); desc->ld_default_stripe_count = stripe_count; return count; } -LUSTRE_RW_ATTR(stripecount); +LPROC_SEQ_FOPS(lov_stripecount); -static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lov_numobd_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; + struct obd_device *dev = (struct obd_device*)m->private; + struct lov_desc *desc; - return sprintf(buf, "%u\n", desc->ld_tgt_count); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_tgt_count); + return 0; } -LUSTRE_RO_ATTR(numobd); +LPROC_SEQ_FOPS_RO(lov_numobd); -static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lov_activeobd_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; + struct obd_device* dev = (struct obd_device*)m->private; + struct lov_desc *desc; - return sprintf(buf, "%u\n", desc->ld_active_tgt_count); + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + seq_printf(m, "%u\n", desc->ld_active_tgt_count); + return 0; } -LUSTRE_RO_ATTR(activeobd); +LPROC_SEQ_FOPS_RO(lov_activeobd); -static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lov_desc *desc = &dev->u.lov.desc; + struct obd_device *dev = m->private; + struct lov_obd *lov; - return sprintf(buf, "%s\n", desc->ld_uuid.uuid); + LASSERT(dev != NULL); + lov = &dev->u.lov; + seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); + return 0; } -LUSTRE_RO_ATTR(desc_uuid); +LPROC_SEQ_FOPS_RO(lov_desc_uuid); -#ifdef CONFIG_PROC_FS static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; @@ -235,7 +251,6 @@ static void 
*lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) static int lov_tgt_seq_show(struct seq_file *p, void *v) { struct lov_tgt_desc *tgt = v; - seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid), tgt->ltd_active ? "" : "IN"); @@ -254,6 +269,10 @@ static int lov_target_seq_open(struct inode *inode, struct file *file) struct seq_file *seq; int rc; + rc = LPROCFS_ENTRY_CHECK(inode); + if (rc < 0) + return rc; + rc = seq_open(file, &lov_tgt_sops); if (rc) return rc; @@ -263,13 +282,47 @@ static int lov_target_seq_open(struct inode *inode, struct file *file) return 0; } +LPROC_SEQ_FOPS_RO_TYPE(lov, uuid); +LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, blksize); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail); + struct lprocfs_vars lprocfs_lov_obd_vars[] = { - { .name = "stripesize", - .fops = &lov_stripesize_fops }, + { .name = "uuid", + .fops = &lov_uuid_fops }, + { .name = "stripesize", + .fops = &lov_stripesize_fops }, + { .name = "stripeoffset", + .fops = &lov_stripeoffset_fops }, + { .name = "stripecount", + .fops = &lov_stripecount_fops }, + { .name = "stripetype", + .fops = &lov_stripetype_fops }, + { .name = "numobd", + .fops = &lov_numobd_fops }, + { .name = "activeobd", + .fops = &lov_activeobd_fops }, + { .name = "filestotal", + .fops = &lov_filestotal_fops }, + { .name = "filesfree", + .fops = &lov_filesfree_fops }, + { .name = "blocksize", + .fops = &lov_blksize_fops }, + { .name = "kbytestotal", + .fops = &lov_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &lov_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &lov_kbytesavail_fops }, + { .name = "desc_uuid", + .fops = &lov_desc_uuid_fops }, { NULL } }; -static const struct proc_ops lov_proc_target_fops = { +const struct proc_ops lov_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lov_target_seq_open, .proc_read = seq_read, @@ -277,68 +330,3 @@ static const struct proc_ops lov_proc_target_fops = { .proc_release = lprocfs_seq_release, }; #endif /* CONFIG_PROC_FS */ - -static struct attribute *lov_attrs[] = { - &lustre_attr_activeobd.attr, - &lustre_attr_numobd.attr, - &lustre_attr_desc_uuid.attr, - &lustre_attr_stripeoffset.attr, - &lustre_attr_stripetype.attr, - &lustre_attr_stripecount.attr, - NULL, -}; - -int lov_tunables_init(struct obd_device *obd) -{ - struct lov_obd *lov = &obd->u.lov; -#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) - struct obd_type *type; -#endif - int rc; - - obd->obd_vars = lprocfs_lov_obd_vars; -#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) - /* If this is true then both client (lov) and server - * (lod) are on the same node. The lod layer if loaded - * first will register the lov proc directory. In that - * case obd->obd_type->typ_procroot will be not set. - * Instead we use type->typ_procsym as the parent. 
- */ - type = class_search_type(LUSTRE_LOD_NAME); - if (type && type->typ_procsym) { - obd->obd_proc_entry = lprocfs_register(obd->obd_name, - type->typ_procsym, - obd->obd_vars, obd); - if (IS_ERR(obd->obd_proc_entry)) { - rc = PTR_ERR(obd->obd_proc_entry); - CERROR("error %d setting up lprocfs for %s\n", rc, - obd->obd_name); - obd->obd_proc_entry = NULL; - } - } -#endif - obd->obd_ktype.default_attrs = lov_attrs; - rc = lprocfs_obd_setup(obd, false); - if (rc) - GOTO(out, rc); - -#ifdef CONFIG_PROC_FS - rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444, - &lov_proc_target_fops, obd); - if (rc) - CWARN("%s: Error adding the target_obd file : rc %d\n", - obd->obd_name, rc); - - lov->lov_pool_proc_entry = lprocfs_register("pools", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lov->lov_pool_proc_entry)) { - rc = PTR_ERR(lov->lov_pool_proc_entry); - CERROR("%s: error setting up debugfs for pools : rc %d\n", - obd->obd_name, rc); - lov->lov_pool_proc_entry = NULL; - } -#endif /* CONFIG_FS_PROC */ -out: - return rc; -} diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile index 7c9329681bdf2..e13d6af6f9949 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/Makefile +++ b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_LUSTREFSX_FS) += mdc.o mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o -mdc-y += mdc_changelog.o mdc_dev.o +mdc-y += mdc_changelog.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c index 0c2e79a2a336d..57cd679138950 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,279 +31,126 @@ */ #define DEBUG_SUBSYSTEM S_CLASS +#include #include #include #include -#include -#include + #include "mdc_internal.h" -static ssize_t active_show(struct kobject *kobj, struct attribute *attr, - char *buf) +#ifdef CONFIG_PROC_FS +static int mdc_active_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - ssize_t len; + struct obd_device *dev = m->private; LPROCFS_CLIMP_CHECK(dev); - len = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); + seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); LPROCFS_CLIMP_EXIT(dev); - return len; + return 0; } -static ssize_t active_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t mdc_active_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - bool val; + struct obd_device *dev; int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + dev = ((struct seq_file *)file->private_data)->private; + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > 1) + return -ERANGE; /* opposite senses */ if (dev->u.cli.cl_import->imp_deactive == val) rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); else - CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + CDEBUG(D_CONFIG, "activate %llu: ignoring repeat request\n", val); return count; } -LUSTRE_RW_ATTR(active); - -static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - ssize_t len; - u32 max; - - max = obd_get_max_rpcs_in_flight(&dev->u.cli); - len = sprintf(buf, "%u\n", max); - - return len; -} - -static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - unsigned int val; - int rc; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); - if (rc) - count = rc; - - return count; -} -LUSTRE_RW_ATTR(max_rpcs_in_flight); - -static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - u16 max; - - max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); - return sprintf(buf, "%hu\n", max); -} - -static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - u16 val; - int rc; - - rc = kstrtou16(buffer, 10, &val); - if (rc) - return rc; - - rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); - if (rc) - count = rc; - - return count; -} -LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); +LPROC_SEQ_FOPS(mdc_active); -static int mdc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - unsigned long val; + __u32 max; - spin_lock(&cli->cl_loi_list_lock); - val = PAGES_TO_MiB(cli->cl_dirty_max_pages); - spin_unlock(&cli->cl_loi_list_lock); + max = 
obd_get_max_rpcs_in_flight(&dev->u.cli); + seq_printf(m, "%u\n", max); - seq_printf(m, "%lu\n", val); return 0; } -static ssize_t mdc_max_dirty_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct seq_file *sfl = file->private_data; - struct obd_device *dev = sfl->private; - struct client_obd *cli = &dev->u.cli; - s64 pages_number; + struct obd_device *dev; + __s64 val; int rc; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + dev = ((struct seq_file *)file->private_data)->private; + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - /* MB -> pages */ - pages_number = round_up(pages_number, 1024 * 1024) >> PAGE_SHIFT; - if (pages_number <= 0 || - pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || - pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ + if (val < 0 || val > UINT_MAX) return -ERANGE; - spin_lock(&cli->cl_loi_list_lock); - cli->cl_dirty_max_pages = pages_number; - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LPROC_SEQ_FOPS(mdc_max_dirty_mb); - -static ssize_t contention_seconds_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct osc_device *od = obd2osc_dev(obd); - - return sprintf(buf, "%lld\n", od->od_contention_time); -} - -static ssize_t contention_seconds_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct osc_device *od = obd2osc_dev(obd); - time64_t val; - int rc; - - rc = kstrtoll(buffer, 0, &val); + rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); if (rc) return rc; - od->od_contention_time = val; - return count; } -LUSTRE_RW_ATTR(contention_seconds); +LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight); -LUSTRE_ATTR(mds_conn_uuid, 0444, conn_uuid_show, NULL); -LUSTRE_RO_ATTR(conn_uuid); - -LUSTRE_RW_ATTR(ping); - -static int mdc_cached_mb_seq_show(struct seq_file *m, void *v) +static int mdc_max_mod_rpcs_in_flight_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - int shift = 20 - PAGE_SHIFT; + __u16 max; - seq_printf(m, "used_mb: %ld\n" - "busy_cnt: %ld\n" - "reclaim: %llu\n", - (atomic_long_read(&cli->cl_lru_in_list) + - atomic_long_read(&cli->cl_lru_busy)) >> shift, - atomic_long_read(&cli->cl_lru_busy), - cli->cl_lru_reclaim); + max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); + seq_printf(m, "%hu\n", max); return 0; } -/* shrink the number of caching pages to a specific number */ -static ssize_t -mdc_cached_mb_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t mdc_max_mod_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct seq_file *sfl = file->private_data; - struct obd_device *dev = sfl->private; - struct client_obd *cli = &dev->u.cli; - __s64 pages_number; - long rc; - char kernbuf[128]; - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; + struct obd_device *dev = + ((struct seq_file *)file->private_data)->private; + __s64 val; + int rc; - buffer += lprocfs_find_named_value(kernbuf, "used_mb:", 
&count) - - kernbuf; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - - if (pages_number < 0) + if (val < 0 || val > USHRT_MAX) return -ERANGE; - rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; - if (rc > 0) { - struct lu_env *env; - __u16 refcheck; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - (void)osc_lru_shrink(env, cli, rc, true); - cl_env_put(env, &refcheck); - } - } + rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; return count; } -LPROC_SEQ_FOPS(mdc_cached_mb); +LPROC_SEQ_FOPS(mdc_max_mod_rpcs_in_flight); -static int mdc_unstable_stats_seq_show(struct seq_file *m, void *v) +static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) { - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - long pages; - int mb; - - pages = atomic_long_read(&cli->cl_unstable_count); - mb = (pages * PAGE_SIZE) >> 20; + struct obd_device *dev = seq->private; - seq_printf(m, "unstable_pages: %20ld\n" - "unstable_mb: %10d\n", pages, mb); - return 0; + return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); } -LPROC_SEQ_FOPS_RO(mdc_unstable_stats); static ssize_t mdc_rpc_stats_seq_write(struct file *file, const char __user *buf, @@ -315,174 +162,22 @@ static ssize_t mdc_rpc_stats_seq_write(struct file *file, lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); - lprocfs_oh_clear(&cli->cl_read_rpc_hist); - lprocfs_oh_clear(&cli->cl_write_rpc_hist); - lprocfs_oh_clear(&cli->cl_read_page_hist); - lprocfs_oh_clear(&cli->cl_write_page_hist); - lprocfs_oh_clear(&cli->cl_read_offset_hist); - lprocfs_oh_clear(&cli->cl_write_offset_hist); - return len; } - -static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; - int i; - - obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); - - spin_lock(&cli->cl_loi_list_lock); - - seq_printf(seq, "\nread RPCs in flight: %d\n", - cli->cl_r_in_flight); - seq_printf(seq, "write RPCs in flight: %d\n", - cli->cl_w_in_flight); - seq_printf(seq, "pending write pages: %d\n", - atomic_read(&cli->cl_pending_w_pages)); - seq_printf(seq, "pending read pages: %d\n", - atomic_read(&cli->cl_pending_r_pages)); - - seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_printf(seq, "pages per rpc rpcs %% cum %% |"); - seq_printf(seq, " rpcs %% cum %%\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", - 1 << i, r, pct(r, read_tot), - pct(read_cum, read_tot), w, - pct(w, write_tot), - pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); - seq_printf(seq, " rpcs %% cum %%\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; - unsigned long w = 
cli->cl_write_rpc_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", - i, r, pct(r, read_tot), pct(read_cum, read_tot), w, - pct(w, write_tot), pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_printf(seq, "offset rpcs %% cum %% |"); - seq_printf(seq, " rpcs %% cum %%\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", - (i == 0) ? 0 : 1 << (i - 1), - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - spin_unlock(&cli->cl_loi_list_lock); - - return 0; -} LPROC_SEQ_FOPS(mdc_rpc_stats); -static int mdc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct obd_device *dev = seq->private; - struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; - - ktime_get_real_ts64(&now); - - seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", - (s64)now.tv_sec, now.tv_nsec); - seq_printf(seq, "lockless_write_bytes\t\t%llu\n", - stats->os_lockless_writes); - seq_printf(seq, "lockless_read_bytes\t\t%llu\n", - stats->os_lockless_reads); - seq_printf(seq, "lockless_truncate\t\t%llu\n", - stats->os_lockless_truncates); - return 0; -} - -static ssize_t mdc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; - - memset(stats, 0, sizeof(*stats)); - return len; -} -LPROC_SEQ_FOPS(mdc_stats); - -static int mdc_dom_min_repsize_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = m->private; - - seq_printf(m, "%u\n", dev->u.cli.cl_dom_min_inline_repsize); - - return 0; -} - -static ssize_t mdc_dom_min_repsize_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev; - unsigned int val; - int rc; - - dev = ((struct seq_file *)file->private_data)->private; - rc = kstrtouint_from_user(buffer, count, 0, &val); - if (rc) - return rc; - - if (val > MDC_DOM_MAX_INLINE_REPSIZE) - return -ERANGE; - - dev->u.cli.cl_dom_min_inline_repsize = val; - return count; -} -LPROC_SEQ_FOPS(mdc_dom_min_repsize); +LPROC_SEQ_FOPS_WO_TYPE(mdc, ping); +LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid); LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree); LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); LPROC_SEQ_FOPS_RO_TYPE(mdc, state); LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); @@ -490,16 +185,35 @@ LPROC_SEQ_FOPS_RW_TYPE(mdc, import); LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { .name = "uuid", + .fops = &mdc_uuid_fops }, + { .name = "ping", + .fops = &mdc_ping_fops, + 
.proc_mode = 0222 }, { .name = "connect_flags", .fops = &mdc_connect_flags_fops }, + { .name = "blocksize", + .fops = &mdc_blksize_fops }, + { .name = "kbytestotal", + .fops = &mdc_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &mdc_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &mdc_kbytesavail_fops }, + { .name = "filestotal", + .fops = &mdc_filestotal_fops }, + { .name = "filesfree", + .fops = &mdc_filesfree_fops }, { .name = "mds_server_uuid", .fops = &mdc_server_uuid_fops }, - { .name = "max_pages_per_rpc", - .fops = &mdc_obd_max_pages_per_rpc_fops }, - { .name = "max_dirty_mb", - .fops = &mdc_max_dirty_mb_fops }, - { .name = "mdc_cached_mb", - .fops = &mdc_cached_mb_fops }, + { .name = "mds_conn_uuid", + .fops = &mdc_conn_uuid_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_rpcs_in_flight", + .fops = &mdc_max_rpcs_in_flight_fops }, + { .name = "max_mod_rpcs_in_flight", + .fops = &mdc_max_mod_rpcs_in_flight_fops }, { .name = "timeouts", .fops = &mdc_timeouts_fops }, { .name = "import", @@ -510,53 +224,8 @@ struct lprocfs_vars lprocfs_mdc_obd_vars[] = { .fops = &mdc_pinger_recov_fops }, { .name = "rpc_stats", .fops = &mdc_rpc_stats_fops }, - { .name = "unstable_stats", - .fops = &mdc_unstable_stats_fops }, - { .name = "mdc_stats", - .fops = &mdc_stats_fops }, - { .name = "mdc_dom_min_repsize", - .fops = &mdc_dom_min_repsize_fops }, + { .name = "active", + .fops = &mdc_active_fops }, { NULL } }; - -static struct attribute *mdc_attrs[] = { - &lustre_attr_active.attr, - &lustre_attr_max_rpcs_in_flight.attr, - &lustre_attr_max_mod_rpcs_in_flight.attr, - &lustre_attr_contention_seconds.attr, - &lustre_attr_mds_conn_uuid.attr, - &lustre_attr_conn_uuid.attr, - &lustre_attr_ping.attr, - NULL, -}; - -int mdc_tunables_init(struct obd_device *obd) -{ - int rc; - - obd->obd_ktype.default_attrs = mdc_attrs; - obd->obd_vars = lprocfs_mdc_obd_vars; - - rc = lprocfs_obd_setup(obd, false); - if (rc) - goto out_failed; -#ifdef CONFIG_PROC_FS - rc = lprocfs_alloc_md_stats(obd, 0); - if (rc) { - lprocfs_obd_cleanup(obd); - goto out_failed; - } -#endif - rc = sptlrpc_lprocfs_cliobd_attach(obd); - if (rc) { -#ifdef CONFIG_PROC_FS - lprocfs_free_md_stats(obd); -#endif - lprocfs_obd_cleanup(obd); - goto out_failed; - } - ptlrpc_lprocfs_register_obd(obd); - -out_failed: - return rc; -} +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c index 1c8eb65110500..c99a3bacf24d6 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -23,8 +23,6 @@ * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies * Alternatives. * - * Copyright (c) 2017, Intel Corporation. 
- * * Author: Henri Doreau */ @@ -33,11 +31,9 @@ #include #include #include -#include -#include +#include #include -#include #include "mdc_internal.h" @@ -59,44 +55,38 @@ static LIST_HEAD(chlg_registered_devices); struct chlg_registered_dev { /* Device name of the form "changelog-{MDTNAME}" */ - char ced_name[32]; - /* changelog char device */ - struct cdev ced_cdev; - struct device *ced_device; + char ced_name[32]; + /* Misc device descriptor */ + struct miscdevice ced_misc; /* OBDs referencing this device (multiple mount point) */ - struct list_head ced_obds; + struct list_head ced_obds; /* Reference counter for proper deregistration */ - struct kref ced_refs; + struct kref ced_refs; /* Link within the global chlg_registered_devices */ - struct list_head ced_link; + struct list_head ced_link; }; struct chlg_reader_state { /* Shortcut to the corresponding OBD device */ - struct obd_device *crs_obd; - /* the corresponding chlg_registered_dev */ - struct chlg_registered_dev *crs_ced; + struct obd_device *crs_obd; /* Producer thread (if any) */ - struct task_struct *crs_prod_task; + struct task_struct *crs_prod_task; /* An error occurred that prevents from reading further */ - int crs_err; + bool crs_err; /* EOF, no more records available */ - bool crs_eof; + bool crs_eof; /* Desired start position */ - __u64 crs_start_offset; + __u64 crs_start_offset; /* Wait queue for the catalog processing thread */ - wait_queue_head_t crs_waitq_prod; + wait_queue_head_t crs_waitq_prod; /* Wait queue for the record copy threads */ - wait_queue_head_t crs_waitq_cons; + wait_queue_head_t crs_waitq_cons; /* Mutex protecting crs_rec_count and crs_rec_queue */ - struct mutex crs_lock; + struct mutex crs_lock; /* Number of item in the list */ - __u64 crs_rec_count; + __u64 crs_rec_count; /* List of prefetched enqueued_record::enq_linkage_items */ - struct list_head crs_rec_queue; - unsigned int crs_last_catidx; - unsigned int crs_last_idx; - bool crs_poll; + struct list_head crs_rec_queue; }; struct chlg_rec_entry { @@ -113,81 +103,6 @@ enum { CDEV_CHLG_MAX_PREFETCH = 1024, }; -static DEFINE_IDR(chlg_minor_idr); -static DEFINE_SPINLOCK(chlg_minor_lock); - -static int chlg_minor_alloc(int *pminor) -{ - void *minor_allocated = (void *)-1; - int minor; - - idr_preload(GFP_KERNEL); - spin_lock(&chlg_minor_lock); - minor = idr_alloc(&chlg_minor_idr, minor_allocated, 0, - MDC_CHANGELOG_DEV_COUNT, GFP_NOWAIT); - spin_unlock(&chlg_minor_lock); - idr_preload_end(); - - if (minor < 0) - return minor; - - *pminor = minor; - return 0; -} - -static void chlg_minor_free(int minor) -{ - spin_lock(&chlg_minor_lock); - idr_remove(&chlg_minor_idr, minor); - spin_unlock(&chlg_minor_lock); -} - -static void chlg_device_release(struct device *dev) -{ - struct chlg_registered_dev *entry = dev_get_drvdata(dev); - - chlg_minor_free(MINOR(entry->ced_cdev.dev)); - OBD_FREE_PTR(entry); -} - -/** - * Deregister a changelog character device whose refcount has reached zero. 
- */ -static void chlg_dev_clear(struct kref *kref) -{ - struct chlg_registered_dev *entry; - - ENTRY; - entry = container_of(kref, struct chlg_registered_dev, - ced_refs); - - list_del(&entry->ced_link); - cdev_del(&entry->ced_cdev); - device_destroy(mdc_changelog_class, entry->ced_cdev.dev); - EXIT; -} - -static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev) -{ - struct obd_device *obd; - - mutex_lock(&chlg_registered_dev_lock); - if (list_empty(&dev->ced_obds)) - return NULL; - - obd = list_first_entry(&dev->ced_obds, struct obd_device, - u.cli.cl_chg_dev_linkage); - class_incref(obd, "changelog", dev); - mutex_unlock(&chlg_registered_dev_lock); - return obd; -} - -static inline void chlg_obd_put(struct chlg_registered_dev *dev, - struct obd_device *obd) -{ - class_decref(obd, "changelog", dev); -} - /** * ChangeLog catalog processing callback invoked on each record. * If the current record is eligible to userland delivery, push @@ -207,6 +122,7 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, struct llog_changelog_rec *rec; struct chlg_reader_state *crs = data; struct chlg_rec_entry *enq; + struct l_wait_info lwi = { 0 }; size_t len; int rc; ENTRY; @@ -216,9 +132,6 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); - crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx; - crs->crs_last_idx = hdr->lrh_index; - if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { rc = -EINVAL; CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n", @@ -239,9 +152,9 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); - wait_event_interruptible(crs->crs_waitq_prod, - crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || - kthread_should_stop()); + l_wait_event(crs->crs_waitq_prod, + (crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()), &lwi); if (kthread_should_stop()) RETURN(LLOG_PROC_BREAK); @@ -284,23 +197,13 @@ static void enq_record_delete(struct chlg_rec_entry *rec) static int chlg_load(void *args) { struct chlg_reader_state *crs = args; - struct chlg_registered_dev *ced = crs->crs_ced; - struct obd_device *obd = NULL; + struct obd_device *obd = crs->crs_obd; struct llog_ctxt *ctx = NULL; struct llog_handle *llh = NULL; + struct l_wait_info lwi = { 0 }; int rc; ENTRY; - crs->crs_last_catidx = -1; - crs->crs_last_idx = 0; - -again: - obd = chlg_obd_get(ced); - if (obd == NULL) - RETURN(-ENODEV); - - crs->crs_obd = obd; - ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); if (ctx == NULL) GOTO(err_out, rc = -ENOENT); @@ -313,41 +216,24 @@ static int chlg_load(void *args) GOTO(err_out, rc); } - - rc = llog_init_handle(NULL, llh, - LLOG_F_IS_CAT | - LLOG_F_EXT_JOBID | - LLOG_F_EXT_EXTRA_FLAGS | - LLOG_F_EXT_X_UIDGID | - LLOG_F_EXT_X_NID | - LLOG_F_EXT_X_OMODE | - LLOG_F_EXT_X_XATTR, - NULL); + rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT|LLOG_F_EXT_JOBID, NULL); if (rc) { CERROR("%s: fail to init llog handle: rc = %d\n", obd->obd_name, rc); GOTO(err_out, rc); } - rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, - crs->crs_last_catidx, crs->crs_last_idx); + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, 0, 0); if (rc < 0) { CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); GOTO(err_out, rc); } - if (!kthread_should_stop() && crs->crs_poll) { - llog_cat_close(NULL, llh); - llog_ctxt_put(ctx); - class_decref(obd, 
"changelog", crs); - schedule_timeout_interruptible(HZ); - goto again; - } crs->crs_eof = true; err_out: if (rc < 0) - crs->crs_err = rc; + crs->crs_err = true; wake_up_all(&crs->crs_waitq_cons); @@ -357,9 +243,7 @@ static int chlg_load(void *args) if (ctx != NULL) llog_ctxt_put(ctx); - crs->crs_obd = NULL; - chlg_obd_put(ced, obd); - wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop()); + l_wait_event(crs->crs_waitq_prod, kthread_should_stop(), &lwi); RETURN(rc); } @@ -382,22 +266,17 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, struct chlg_reader_state *crs = file->private_data; struct chlg_rec_entry *rec; struct chlg_rec_entry *tmp; - size_t written_total = 0; - ssize_t rc; + struct l_wait_info lwi = { 0 }; + ssize_t written_total = 0; LIST_HEAD(consumed); ENTRY; - if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) { - if (crs->crs_err < 0) - RETURN(crs->crs_err); - else if (crs->crs_eof) - RETURN(0); - else - RETURN(-EAGAIN); - } + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) + RETURN(-EAGAIN); - rc = wait_event_interruptible(crs->crs_waitq_cons, - crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err); + l_wait_event(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err, + &lwi); mutex_lock(&crs->crs_lock); list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { @@ -405,7 +284,8 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, break; if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { - rc = -EFAULT; + if (written_total == 0) + written_total = -EFAULT; break; } @@ -419,19 +299,15 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, } mutex_unlock(&crs->crs_lock); - if (written_total > 0) { - rc = written_total; + if (written_total > 0) wake_up_all(&crs->crs_waitq_prod); - } else if (rc == 0) { - rc = crs->crs_err; - } list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) enq_record_delete(rec); *ppos = crs->crs_start_offset; - RETURN(rc); + RETURN(written_total); } /** @@ -516,23 +392,15 @@ static loff_t chlg_llseek(struct file *file, loff_t off, int whence) */ static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) { - struct obd_device *obd = NULL; + struct obd_device *obd = crs->crs_obd; struct changelog_setinfo cs = { .cs_recno = record, .cs_id = reader }; - int rc; - - obd = chlg_obd_get(crs->crs_ced); - if (obd == NULL) - return -ENODEV; - - rc = obd_set_info_async(NULL, obd->obd_self_export, - strlen(KEY_CHANGELOG_CLEAR), - KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); - chlg_obd_put(crs->crs_ced, obd); - return rc; + return obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); } /** Maximum changelog control command size */ @@ -581,6 +449,31 @@ static ssize_t chlg_write(struct file *file, const char __user *buff, return rc < 0 ? rc : count; } +/** + * Find the OBD device associated to a changelog character device. + * @param[in] cdev character device instance descriptor + * @return corresponding OBD device or NULL if none was found. 
+ */ +static struct obd_device *chlg_obd_get(dev_t cdev) +{ + int minor = MINOR(cdev); + struct obd_device *obd = NULL; + struct chlg_registered_dev *curr; + + mutex_lock(&chlg_registered_dev_lock); + list_for_each_entry(curr, &chlg_registered_devices, ced_link) { + if (curr->ced_misc.minor == minor) { + /* take the first available OBD device attached */ + obd = list_first_entry(&curr->ced_obds, + struct obd_device, + u.cli.cl_chg_dev_linkage); + break; + } + } + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + /** * Open handler, initialize internal CRS state and spawn prefetch thread if * needed. @@ -591,19 +484,19 @@ static ssize_t chlg_write(struct file *file, const char __user *buff, static int chlg_open(struct inode *inode, struct file *file) { struct chlg_reader_state *crs; - struct chlg_registered_dev *dev; + struct obd_device *obd = chlg_obd_get(inode->i_rdev); struct task_struct *task; int rc; ENTRY; - dev = container_of(inode->i_cdev, struct chlg_registered_dev, ced_cdev); + if (!obd) + RETURN(-ENODEV); OBD_ALLOC_PTR(crs); if (!crs) RETURN(-ENOMEM); - kref_get(&dev->ced_refs); - crs->crs_ced = dev; + crs->crs_obd = obd; crs->crs_err = false; crs->crs_eof = false; @@ -617,7 +510,7 @@ static int chlg_open(struct inode *inode, struct file *file) if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: cannot start changelog thread: rc = %d\n", - dev->ced_name, rc); + obd->obd_name, rc); GOTO(err_crs, rc); } crs->crs_prod_task = task; @@ -627,7 +520,6 @@ static int chlg_open(struct inode *inode, struct file *file) RETURN(0); err_crs: - kref_put(&dev->ced_refs, chlg_dev_clear); OBD_FREE_PTR(crs); return rc; } @@ -644,18 +536,15 @@ static int chlg_release(struct inode *inode, struct file *file) struct chlg_reader_state *crs = file->private_data; struct chlg_rec_entry *rec; struct chlg_rec_entry *tmp; - int rc = 0; if (crs->crs_prod_task) - rc = kthread_stop(crs->crs_prod_task); + kthread_stop(crs->crs_prod_task); list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) enq_record_delete(rec); - kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear); OBD_FREE_PTR(crs); - - return rc; + return 0; } /** @@ -683,23 +572,6 @@ static unsigned int chlg_poll(struct file *file, poll_table *wait) return mask; } -static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - int rc; - - struct chlg_reader_state *crs = file->private_data; - switch (cmd) { - case OBD_IOC_CHLG_POLL: - crs->crs_poll = !!arg; - rc = 0; - break; - default: - rc = -EINVAL; - break; - } - return rc; -} - static const struct file_operations chlg_fops = { .owner = THIS_MODULE, .llseek = chlg_llseek, @@ -708,18 +580,17 @@ static const struct file_operations chlg_fops = { .open = chlg_open, .release = chlg_release, .poll = chlg_poll, - .unlocked_ioctl = chlg_ioctl, }; /** * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" * and returns a name of the form: "changelog-testfs-MDT0000". 
*/ -static void get_target_name(char *name, size_t name_len, struct obd_device *obd) +static void get_chlg_name(char *name, size_t name_len, struct obd_device *obd) { int i; - snprintf(name, name_len, "%s", obd->obd_name); + snprintf(name, name_len, "changelog-%s", obd->obd_name); /* Find the 2nd '-' from the end and truncate on it */ for (i = 0; i < 2; i++) { @@ -781,16 +652,18 @@ int mdc_changelog_cdev_init(struct obd_device *obd) { struct chlg_registered_dev *exist; struct chlg_registered_dev *entry; - struct device *device; - dev_t dev; - int minor, rc; + int rc; ENTRY; OBD_ALLOC_PTR(entry); if (entry == NULL) RETURN(-ENOMEM); - get_target_name(entry->ced_name, sizeof(entry->ced_name), obd); + get_chlg_name(entry->ced_name, sizeof(entry->ced_name), obd); + + entry->ced_misc.minor = MISC_DYNAMIC_MINOR; + entry->ced_misc.name = entry->ced_name; + entry->ced_misc.fops = &chlg_fops; kref_init(&entry->ced_refs); INIT_LIST_HEAD(&entry->ced_obds); @@ -804,41 +677,15 @@ int mdc_changelog_cdev_init(struct obd_device *obd) GOTO(out_unlock, rc = 0); } - list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); - list_add_tail(&entry->ced_link, &chlg_registered_devices); - /* Register new character device */ - cdev_init(&entry->ced_cdev, &chlg_fops); - entry->ced_cdev.owner = THIS_MODULE; - - rc = chlg_minor_alloc(&minor); - if (rc) + rc = misc_register(&entry->ced_misc); + if (rc != 0) GOTO(out_unlock, rc); - dev = MKDEV(MAJOR(mdc_changelog_dev), minor); - rc = cdev_add(&entry->ced_cdev, dev, 1); - if (rc) - GOTO(out_minor, rc); - - device = device_create(mdc_changelog_class, NULL, dev, entry, "%s-%s", - MDC_CHANGELOG_DEV_NAME, entry->ced_name); - if (IS_ERR(device)) - GOTO(out_cdev, rc = PTR_ERR(device)); - - device->release = chlg_device_release; - entry->ced_device = device; + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); entry = NULL; /* prevent it from being freed below */ - GOTO(out_unlock, rc = 0); - -out_cdev: - cdev_del(&entry->ced_cdev); - -out_minor: - chlg_minor_free(minor); - - list_del_init(&obd->u.cli.cl_chg_dev_linkage); - list_del(&entry->ced_link); out_unlock: mutex_unlock(&chlg_registered_dev_lock); @@ -847,6 +694,23 @@ int mdc_changelog_cdev_init(struct obd_device *obd) RETURN(rc); } +/** + * Deregister a changelog character device whose refcount has reached zero. + */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry = container_of(kref, + struct chlg_registered_dev, + ced_refs); + ENTRY; + + LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); + list_del(&entry->ced_link); + misc_deregister(&entry->ced_misc); + OBD_FREE_PTR(entry); + EXIT; +} + /** * Release OBD, decrease reference count of the corresponding changelog device. */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c deleted file mode 100644 index 3606778434879..0000000000000 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c +++ /dev/null @@ -1,1564 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2017, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * Implementation of cl_device, cl_req for MDC layer. - * - * Author: Mikhail Pershin - */ - -#define DEBUG_SUBSYSTEM S_MDC - -#include -#include - -#include "mdc_internal.h" - -static void mdc_lock_build_policy(const struct lu_env *env, - union ldlm_policy_data *policy) -{ - memset(policy, 0, sizeof *policy); - policy->l_inodebits.bits = MDS_INODELOCK_DOM; -} - -int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) -{ - return osc_ldlm_glimpse_ast(dlmlock, data); -} - -static void mdc_lock_build_einfo(const struct lu_env *env, - const struct cl_lock *lock, - struct osc_object *osc, - struct ldlm_enqueue_info *einfo) -{ - einfo->ei_type = LDLM_IBITS; - einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); - einfo->ei_cb_bl = mdc_ldlm_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; - einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ -} - -static void mdc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb); - -static int mdc_set_dom_lock_data(struct ldlm_lock *lock, void *data) -{ - int set = 0; - - LASSERT(lock != NULL); - LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); - - lock_res_and_lock(lock); - - if (lock->l_ast_data == NULL) - lock->l_ast_data = data; - if (lock->l_ast_data == data) - set = 1; - - unlock_res_and_lock(lock); - - return set; -} - -int mdc_dom_lock_match(const struct lu_env *env, struct obd_export *exp, - struct ldlm_res_id *res_id, enum ldlm_type type, - union ldlm_policy_data *policy, enum ldlm_mode mode, - __u64 *flags, struct osc_object *obj, - struct lustre_handle *lockh, int unref) -{ - struct obd_device *obd = exp->exp_obd; - __u64 lflags = *flags; - enum ldlm_mode rc; - - ENTRY; - - rc = ldlm_lock_match(obd->obd_namespace, lflags, - res_id, type, policy, mode, lockh, unref); - if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) - RETURN(rc); - - if (obj != NULL) { - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - - LASSERT(lock != NULL); - if (mdc_set_dom_lock_data(lock, obj)) { - lock_res_and_lock(lock); - if (!ldlm_is_lvb_cached(lock)) { - LASSERT(lock->l_ast_data == obj); - mdc_lock_lvb_update(env, obj, lock, NULL); - ldlm_set_lvb_cached(lock); - } - unlock_res_and_lock(lock); - } else { - ldlm_lock_decref(lockh, rc); - rc = 0; - } - LDLM_LOCK_PUT(lock); - } - RETURN(rc); -} - -/** - * Finds an existing lock covering a page with given index. - * Copy of osc_obj_dlmlock_at_pgoff() but for DoM IBITS lock. 
- */ -struct ldlm_lock *mdc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags dap_flags) -{ - struct osc_thread_info *info = osc_env_info(env); - struct ldlm_res_id *resname = &info->oti_resname; - union ldlm_policy_data *policy = &info->oti_policy; - struct lustre_handle lockh; - struct ldlm_lock *lock = NULL; - enum ldlm_mode mode; - __u64 flags; - - ENTRY; - - fid_build_reg_res_name(lu_object_fid(osc2lu(obj)), resname); - mdc_lock_build_policy(env, policy); - - flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; - if (dap_flags & OSC_DAP_FL_TEST_LOCK) - flags |= LDLM_FL_TEST_LOCK; - -again: - /* Next, search for already existing extent locks that will cover us */ - /* If we're trying to read, we also search for an existing PW lock. The - * VFS and page cache already protect us locally, so lots of readers/ - * writers can share a single PW lock. */ - mode = mdc_dom_lock_match(env, osc_export(obj), resname, LDLM_IBITS, - policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, - obj, &lockh, - dap_flags & OSC_DAP_FL_CANCELING); - if (mode != 0) { - lock = ldlm_handle2lock(&lockh); - /* RACE: the lock is cancelled so let's try again */ - if (unlikely(lock == NULL)) - goto again; - } - - RETURN(lock); -} - -/** - * Check if page @page is covered by an extra lock or discard it. - */ -static int mdc_check_and_discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct osc_thread_info *info = osc_env_info(env); - struct osc_object *osc = cbdata; - pgoff_t index; - - index = osc_index(ops); - if (index >= info->oti_fn_index) { - struct ldlm_lock *tmp; - struct cl_page *page = ops->ops_cl.cpl_page; - - /* refresh non-overlapped index */ - tmp = mdc_dlmlock_at_pgoff(env, osc, index, - OSC_DAP_FL_TEST_LOCK); - if (tmp != NULL) { - info->oti_fn_index = CL_PAGE_EOF; - LDLM_LOCK_PUT(tmp); - } else if (cl_page_own(env, io, page) == 0) { - /* discard the page */ - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - } else { - LASSERT(page->cp_state == CPS_FREEING); - } - } - - info->oti_next_index = index + 1; - return CLP_GANG_OKAY; -} - -/** - * Discard pages protected by the given lock. This function traverses radix - * tree to find all covering pages and discard them. If a page is being covered - * by other locks, it should remain in cache. - * - * If error happens on any step, the process continues anyway (the reasoning - * behind this being that lock cancellation cannot be delayed indefinitely). - */ -static int mdc_lock_discard_pages(const struct lu_env *env, - struct osc_object *osc, - pgoff_t start, pgoff_t end, - bool discard) -{ - struct osc_thread_info *info = osc_env_info(env); - struct cl_io *io = &info->oti_io; - osc_page_gang_cbt cb; - int res; - int result; - - ENTRY; - - io->ci_obj = cl_object_top(osc2cl(osc)); - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result != 0) - GOTO(out, result); - - cb = discard ? 
osc_discard_cb : mdc_check_and_discard_cb; - info->oti_fn_index = info->oti_next_index = start; - do { - res = osc_page_gang_lookup(env, io, osc, info->oti_next_index, - end, cb, (void *)osc); - if (info->oti_next_index > end) - break; - - if (res == CLP_GANG_RESCHED) - cond_resched(); - } while (res != CLP_GANG_OKAY); -out: - cl_io_fini(env, io); - RETURN(result); -} - -static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end, enum cl_lock_mode mode, - bool discard) -{ - int result = 0; - int rc; - - ENTRY; - - if (mode == CLM_WRITE) { - result = osc_cache_writeback_range(env, obj, start, end, 1, - discard); - CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", - obj, start, end, result, - discard ? "discarded" : "written back"); - if (result > 0) - result = 0; - } - - rc = mdc_lock_discard_pages(env, obj, start, end, discard); - if (result == 0 && rc < 0) - result = rc; - - RETURN(result); -} - -void mdc_lock_lockless_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) -{ - struct osc_lock *ols = cl2osc_lock(slice); - struct osc_object *osc = cl2osc(slice->cls_obj); - struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; - int rc; - - LASSERT(ols->ols_dlmlock == NULL); - rc = mdc_lock_flush(env, osc, descr->cld_start, descr->cld_end, - descr->cld_mode, 0); - if (rc != 0) - CERROR("Pages for lockless lock %p were not purged(%d)\n", - ols, rc); - - osc_lock_wake_waiters(env, osc, ols); -} - -/** - * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock - * and ldlm_lock caches. - */ -static int mdc_dlm_blocking_ast0(const struct lu_env *env, - struct ldlm_lock *dlmlock, - int flag) -{ - struct cl_object *obj = NULL; - int result = 0; - bool discard; - enum cl_lock_mode mode = CLM_READ; - - ENTRY; - - LASSERT(flag == LDLM_CB_CANCELING); - LASSERT(dlmlock != NULL); - - lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { - dlmlock->l_ast_data = NULL; - unlock_res_and_lock(dlmlock); - RETURN(0); - } - - discard = ldlm_is_discard_data(dlmlock); - if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) - mode = CLM_WRITE; - - if (dlmlock->l_ast_data != NULL) { - obj = osc2cl(dlmlock->l_ast_data); - dlmlock->l_ast_data = NULL; - cl_object_get(obj); - } - unlock_res_and_lock(dlmlock); - - /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or - * the object has been destroyed. */ - if (obj != NULL) { - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - - /* Destroy pages covered by the extent of the DLM lock */ - result = mdc_lock_flush(env, cl2osc(obj), cl_index(obj, 0), - CL_PAGE_EOF, mode, discard); - /* Losing a lock, set KMS to 0. - * NB: assumed that DOM lock covers whole data on MDT. - */ - /* losing a lock, update kms */ - lock_res_and_lock(dlmlock); - cl_object_attr_lock(obj); - attr->cat_kms = 0; - cl_object_attr_update(env, obj, attr, CAT_KMS); - cl_object_attr_unlock(obj); - unlock_res_and_lock(dlmlock); - cl_object_put(env, obj); - } - RETURN(result); -} - -int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, - struct ldlm_lock_desc *new, void *data, int flag) -{ - int rc = 0; - - ENTRY; - - switch (flag) { - case LDLM_CB_BLOCKING: { - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (rc == -ENODATA) - rc = 0; - break; - } - case LDLM_CB_CANCELING: { - struct lu_env *env; - __u16 refcheck; - - /* - * This can be called in the context of outer IO, e.g., - * - * osc_enqueue_base()->... 
- * ->ldlm_prep_elc_req()->... - * ->ldlm_cancel_callback()->... - * ->osc_ldlm_blocking_ast() - * - * new environment has to be created to not corrupt outer - * context. - */ - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - rc = PTR_ERR(env); - break; - } - - rc = mdc_dlm_blocking_ast0(env, dlmlock, flag); - cl_env_put(env, &refcheck); - break; - } - default: - LBUG(); - } - RETURN(rc); -} - -/** - * Updates object attributes from a lock value block (lvb) received together - * with the DLM lock reply from the server. - * This can be optimized to not update attributes when lock is a result of a - * local match. - * - * Called under lock and resource spin-locks. - */ -void mdc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, - struct ldlm_lock *dlmlock, struct ost_lvb *lvb) -{ - struct cl_object *obj = osc2cl(osc); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | - CAT_SIZE; - unsigned int setkms = 0; - - ENTRY; - - if (lvb == NULL) { - LASSERT(dlmlock != NULL); - lvb = &dlmlock->l_ost_lvb; - } - cl_lvb2attr(attr, lvb); - - cl_object_attr_lock(obj); - if (dlmlock != NULL) { - __u64 size; - - check_res_locked(dlmlock->l_resource); - size = lvb->lvb_size; - - if (size >= oinfo->loi_kms) { - valid |= CAT_KMS; - attr->cat_kms = size; - setkms = 1; - } - } - - /* The size should not be less than the kms */ - if (attr->cat_size < oinfo->loi_kms) - attr->cat_size = oinfo->loi_kms; - - LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " - "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, - setkms ? "" : " leaving", - setkms ? attr->cat_kms : oinfo->loi_kms, - dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); - - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); - EXIT; -} - -static void mdc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, - struct lustre_handle *lockh) -{ - struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); - struct ldlm_lock *dlmlock; - - ENTRY; - - dlmlock = ldlm_handle2lock_long(lockh, 0); - LASSERT(dlmlock != NULL); - - /* lock reference taken by ldlm_handle2lock_long() is - * owned by osc_lock and released in osc_lock_detach() - */ - lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); - oscl->ols_has_ref = 1; - - LASSERT(oscl->ols_dlmlock == NULL); - oscl->ols_dlmlock = dlmlock; - - /* This may be a matched lock for glimpse request, do not hold - * lock reference in that case. */ - if (!oscl->ols_glimpse) { - /* hold a refc for non glimpse lock which will - * be released in osc_lock_cancel() */ - lustre_handle_copy(&oscl->ols_handle, lockh); - ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); - oscl->ols_hold = 1; - } - - /* Lock must have been granted. */ - lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { - struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; - - /* extend the lock extent, otherwise it will have problem when - * we decide whether to grant a lockless lock. 
*/ - descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); - descr->cld_start = cl_index(descr->cld_obj, 0); - descr->cld_end = CL_PAGE_EOF; - - /* no lvb update for matched lock */ - if (!ldlm_is_lvb_cached(dlmlock)) { - LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - LASSERT(osc == dlmlock->l_ast_data); - mdc_lock_lvb_update(env, osc, dlmlock, NULL); - ldlm_set_lvb_cached(dlmlock); - } - } - unlock_res_and_lock(dlmlock); - - LASSERT(oscl->ols_state != OLS_GRANTED); - oscl->ols_state = OLS_GRANTED; - EXIT; -} - -/** - * Lock upcall function that is executed either when a reply to ENQUEUE rpc is - * received from a server, or after osc_enqueue_base() matched a local DLM - * lock. - */ -static int mdc_lock_upcall(void *cookie, struct lustre_handle *lockh, - int errcode) -{ - struct osc_lock *oscl = cookie; - struct cl_lock_slice *slice = &oscl->ols_cl; - struct lu_env *env; - int rc; - - ENTRY; - - env = cl_env_percpu_get(); - /* should never happen, similar to osc_ldlm_blocking_ast(). */ - LASSERT(!IS_ERR(env)); - - rc = ldlm_error2errno(errcode); - if (oscl->ols_state == OLS_ENQUEUED) { - oscl->ols_state = OLS_UPCALL_RECEIVED; - } else if (oscl->ols_state == OLS_CANCELLED) { - rc = -EIO; - } else { - CERROR("Impossible state: %d\n", oscl->ols_state); - LBUG(); - } - - CDEBUG(D_INODE, "rc %d, err %d\n", rc, errcode); - if (rc == 0) - mdc_lock_granted(env, oscl, lockh); - - /* Error handling, some errors are tolerable. */ - if (oscl->ols_locklessable && rc == -EUSERS) { - /* This is a tolerable error, turn this lock into - * lockless lock. - */ - osc_object_set_contended(cl2osc(slice->cls_obj)); - LASSERT(slice->cls_ops != oscl->ols_lockless_ops); - - /* Change this lock to ldlmlock-less lock. */ - osc_lock_to_lockless(env, oscl, 1); - oscl->ols_state = OLS_GRANTED; - rc = 0; - } else if (oscl->ols_glimpse && rc == -ENAVAIL) { - LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - mdc_lock_lvb_update(env, cl2osc(slice->cls_obj), - NULL, &oscl->ols_lvb); - /* Hide the error. */ - rc = 0; - } - - if (oscl->ols_owner != NULL) - cl_sync_io_note(env, oscl->ols_owner, rc); - cl_env_percpu_put(env); - - RETURN(rc); -} - -int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb) -{ - struct mdt_body *body; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) - RETURN(-EPROTO); - - lvb->lvb_mtime = body->mbo_mtime; - lvb->lvb_atime = body->mbo_atime; - lvb->lvb_ctime = body->mbo_ctime; - lvb->lvb_blocks = body->mbo_dom_blocks; - lvb->lvb_size = body->mbo_dom_size; - - RETURN(0); -} - -int mdc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, - void *cookie, struct lustre_handle *lockh, - enum ldlm_mode mode, __u64 *flags, int errcode) -{ - struct osc_lock *ols = cookie; - struct ldlm_lock *lock; - int rc = 0; - - ENTRY; - - /* The request was created before ldlm_cli_enqueue call. */ - if (errcode == ELDLM_LOCK_ABORTED) { - struct ldlm_reply *rep; - - rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - LASSERT(rep != NULL); - - rep->lock_policy_res2 = - ptlrpc_status_ntoh(rep->lock_policy_res2); - if (rep->lock_policy_res2) - errcode = rep->lock_policy_res2; - - rc = mdc_fill_lvb(req, &ols->ols_lvb); - *flags |= LDLM_FL_LVB_READY; - } else if (errcode == ELDLM_OK) { - /* Callers have references, should be valid always */ - lock = ldlm_handle2lock(lockh); - LASSERT(lock); - - rc = mdc_fill_lvb(req, &lock->l_ost_lvb); - LDLM_LOCK_PUT(lock); - *flags |= LDLM_FL_LVB_READY; - } - - /* Call the update callback. 
*/ - rc = (*upcall)(cookie, lockh, rc < 0 ? rc : errcode); - - /* release the reference taken in ldlm_cli_enqueue() */ - if (errcode == ELDLM_LOCK_MATCHED) - errcode = ELDLM_OK; - if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) - ldlm_lock_decref(lockh, mode); - - RETURN(rc); -} - -int mdc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, - struct osc_enqueue_args *aa, int rc) -{ - struct ldlm_lock *lock; - struct lustre_handle *lockh = &aa->oa_lockh; - enum ldlm_mode mode = aa->oa_mode; - - ENTRY; - - LASSERT(!aa->oa_speculative); - - /* ldlm_cli_enqueue is holding a reference on the lock, so it must - * be valid. */ - lock = ldlm_handle2lock(lockh); - LASSERTF(lock != NULL, - "lockh %#llx, req %p, aa %p - client evicted?\n", - lockh->cookie, req, aa); - - /* Take an additional reference so that a blocking AST that - * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed - * to arrive after an upcall has been executed by - * osc_enqueue_fini(). */ - ldlm_lock_addref(lockh, mode); - - /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); - - /* Let CP AST to grant the lock first. */ - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); - - /* Complete obtaining the lock procedure. */ - rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, - aa->oa_mode, aa->oa_flags, NULL, 0, - lockh, rc); - /* Complete mdc stuff. */ - rc = mdc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, - aa->oa_flags, rc); - - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); - - ldlm_lock_decref(lockh, mode); - LDLM_LOCK_PUT(lock); - RETURN(rc); -} - -/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock - * from the 2nd OSC before a lock from the 1st one. This does not deadlock with - * other synchronous requests, however keeping some locks and trying to obtain - * others may take a considerable amount of time in a case of ost failure; and - * when other sync requests do not get released lock from a client, the client - * is excluded from the cluster -- such scenarious make the life difficult, so - * release locks just after they are obtained. */ -int mdc_enqueue_send(const struct lu_env *env, struct obd_export *exp, - struct ldlm_res_id *res_id, __u64 *flags, - union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, void *cookie, - struct ldlm_enqueue_info *einfo, int async) -{ - struct obd_device *obd = exp->exp_obd; - struct lustre_handle lockh = { 0 }; - struct ptlrpc_request *req = NULL; - struct ldlm_intent *lit; - enum ldlm_mode mode; - bool glimpse = *flags & LDLM_FL_HAS_INTENT; - __u64 match_flags = *flags; - int rc; - - ENTRY; - - mode = einfo->ei_mode; - if (einfo->ei_mode == LCK_PR) - mode |= LCK_PW; - - if (glimpse) - match_flags |= LDLM_FL_BLOCK_GRANTED; - /* DOM locking uses LDLM_FL_KMS_IGNORE to mark locks wich have no valid - * LVB information, e.g. canceled locks or locks of just pruned object, - * such locks should be skipped. - */ - mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, - einfo->ei_type, policy, mode, &lockh, 0); - if (mode) { - struct ldlm_lock *matched; - - if (*flags & LDLM_FL_TEST_LOCK) - RETURN(ELDLM_OK); - - matched = ldlm_handle2lock(&lockh); - - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GLIMPSE_DDOS)) - ldlm_set_kms_ignore(matched); - - if (mdc_set_dom_lock_data(matched, einfo->ei_cbdata)) { - *flags |= LDLM_FL_LVB_READY; - - /* We already have a lock, and it's referenced. 
*/ - (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); - - ldlm_lock_decref(&lockh, mode); - LDLM_LOCK_PUT(matched); - RETURN(ELDLM_OK); - } - ldlm_lock_decref(&lockh, mode); - LDLM_LOCK_PUT(matched); - } - - if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) - RETURN(-ENOLCK); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_INTENT); - if (req == NULL) - RETURN(-ENOMEM); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = glimpse ? IT_GLIMPSE : IT_BRW; - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); - ptlrpc_request_set_replen(req); - - /* users of mdc_enqueue() can pass this flag for ldlm_lock_match() */ - *flags &= ~LDLM_FL_BLOCK_GRANTED; - /* All MDC IO locks are intents */ - *flags |= LDLM_FL_HAS_INTENT; - rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, NULL, - 0, LVB_T_NONE, &lockh, async); - if (async) { - if (!rc) { - struct osc_enqueue_args *aa; - - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->oa_exp = exp; - aa->oa_mode = einfo->ei_mode; - aa->oa_type = einfo->ei_type; - lustre_handle_copy(&aa->oa_lockh, &lockh); - aa->oa_upcall = upcall; - aa->oa_cookie = cookie; - aa->oa_speculative = false; - aa->oa_flags = flags; - aa->oa_lvb = lvb; - - req->rq_interpret_reply = - (ptlrpc_interpterer_t)mdc_enqueue_interpret; - ptlrpcd_add_req(req); - } else { - ptlrpc_req_finished(req); - } - RETURN(rc); - } - - rc = mdc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, - flags, rc); - ptlrpc_req_finished(req); - RETURN(rc); -} - -/** - * Implementation of cl_lock_operations::clo_enqueue() method for osc - * layer. This initiates ldlm enqueue: - * - * - cancels conflicting locks early (osc_lock_enqueue_wait()); - * - * - calls osc_enqueue_base() to do actual enqueue. - * - * osc_enqueue_base() is supplied with an upcall function that is executed - * when lock is received either after a local cached ldlm lock is matched, or - * when a reply from the server is received. - * - * This function does not wait for the network communication to complete. 
- */ -static int mdc_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - struct osc_thread_info *info = osc_env_info(env); - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(slice->cls_obj); - struct osc_lock *oscl = cl2osc_lock(slice); - struct cl_lock *lock = slice->cls_lock; - struct ldlm_res_id *resname = &info->oti_resname; - union ldlm_policy_data *policy = &info->oti_policy; - osc_enqueue_upcall_f upcall = mdc_lock_upcall; - void *cookie = (void *)oscl; - bool async = false; - int result; - - ENTRY; - - LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), - "lock = %p, ols = %p\n", lock, oscl); - - if (oscl->ols_state == OLS_GRANTED) - RETURN(0); - - /* Lockahead is not supported on MDT yet */ - if (oscl->ols_flags & LDLM_FL_NO_EXPANSION) { - result = -EOPNOTSUPP; - RETURN(result); - } - - if (oscl->ols_flags & LDLM_FL_TEST_LOCK) - GOTO(enqueue_base, 0); - - if (oscl->ols_glimpse) { - LASSERT(equi(oscl->ols_speculative, anchor == NULL)); - async = true; - GOTO(enqueue_base, 0); - } - - result = osc_lock_enqueue_wait(env, osc, oscl); - if (result < 0) - GOTO(out, result); - - /* we can grant lockless lock right after all conflicting locks - * are canceled. */ - if (osc_lock_is_lockless(oscl)) { - oscl->ols_state = OLS_GRANTED; - oio->oi_lockless = 1; - RETURN(0); - } - -enqueue_base: - oscl->ols_state = OLS_ENQUEUED; - if (anchor != NULL) { - atomic_inc(&anchor->csi_sync_nr); - oscl->ols_owner = anchor; - } - - /** - * DLM lock's ast data must be osc_object; - * DLM's enqueue callback set to osc_lock_upcall() with cookie as - * osc_lock. - */ - fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); - mdc_lock_build_policy(env, policy); - LASSERT(!oscl->ols_speculative); - result = mdc_enqueue_send(env, osc_export(osc), resname, - &oscl->ols_flags, policy, - &oscl->ols_lvb, osc->oo_oinfo->loi_kms_valid, - upcall, cookie, &oscl->ols_einfo, async); - if (result == 0) { - if (osc_lock_is_lockless(oscl)) { - oio->oi_lockless = 1; - } else if (!async) { - LASSERT(oscl->ols_state == OLS_GRANTED); - LASSERT(oscl->ols_hold); - LASSERT(oscl->ols_dlmlock != NULL); - } - } -out: - if (result < 0) { - oscl->ols_state = OLS_CANCELLED; - osc_lock_wake_waiters(env, osc, oscl); - - if (anchor != NULL) - cl_sync_io_note(env, anchor, result); - } - RETURN(result); -} - -static const struct cl_lock_operations mdc_lock_lockless_ops = { - .clo_fini = osc_lock_fini, - .clo_enqueue = mdc_lock_enqueue, - .clo_cancel = mdc_lock_lockless_cancel, - .clo_print = osc_lock_print -}; - -static const struct cl_lock_operations mdc_lock_ops = { - .clo_fini = osc_lock_fini, - .clo_enqueue = mdc_lock_enqueue, - .clo_cancel = osc_lock_cancel, - .clo_print = osc_lock_print, -}; - -int mdc_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io) -{ - struct osc_lock *ols; - __u32 enqflags = lock->cll_descr.cld_enq_flags; - __u64 flags = osc_enq2ldlm_flags(enqflags); - - ENTRY; - - /* Ignore AGL for Data-on-MDT, stat returns size data */ - if ((enqflags & CEF_SPECULATIVE) != 0) - RETURN(0); - - OBD_SLAB_ALLOC_PTR_GFP(ols, osc_lock_kmem, GFP_NOFS); - if (unlikely(ols == NULL)) - RETURN(-ENOMEM); - - ols->ols_state = OLS_NEW; - spin_lock_init(&ols->ols_lock); - INIT_LIST_HEAD(&ols->ols_waiting_list); - INIT_LIST_HEAD(&ols->ols_wait_entry); - INIT_LIST_HEAD(&ols->ols_nextlock_oscobj); - ols->ols_lockless_ops = &mdc_lock_lockless_ops; - - ols->ols_flags 
= flags; - ols->ols_speculative = !!(enqflags & CEF_SPECULATIVE); - - if (ols->ols_flags & LDLM_FL_HAS_INTENT) { - ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; - ols->ols_glimpse = 1; - } - mdc_lock_build_einfo(env, lock, cl2osc(obj), &ols->ols_einfo); - - cl_lock_slice_add(lock, &ols->ols_cl, obj, &mdc_lock_ops); - - if (!(enqflags & CEF_MUST)) - osc_lock_to_lockless(env, ols, (enqflags & CEF_NEVER)); - if (ols->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) - ols->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; - - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) - osc_lock_set_writer(env, io, obj, ols); - - LDLM_DEBUG_NOLOCK("lock %p, mdc lock %p, flags %llx\n", - lock, ols, ols->ols_flags); - RETURN(0); -} - -/** - * IO operations. - * - * An implementation of cl_io_operations specific methods for MDC layer. - * - */ -static int mdc_async_upcall(void *a, int rc) -{ - struct osc_async_cbargs *args = a; - - args->opc_rc = rc; - complete(&args->opc_sync); - return 0; -} - -static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, - pgoff_t index, struct lustre_handle *lh) -{ - struct ldlm_lock *lock; - - /* find DOM lock protecting object */ - lock = mdc_dlmlock_at_pgoff(env, osc, index, - OSC_DAP_FL_TEST_LOCK | - OSC_DAP_FL_CANCELING); - if (lock == NULL) { - struct ldlm_resource *res; - struct ldlm_res_id *resname; - - resname = &osc_env_info(env)->oti_resname; - fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); - res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, - NULL, resname, LDLM_IBITS, 0); - ldlm_resource_dump(D_ERROR, res); - libcfs_debug_dumpstack(NULL); - return -ENOENT; - } else { - *lh = lock->l_remote_handle; - LDLM_LOCK_PUT(lock); - } - return 0; -} - -static int mdc_io_setattr_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct osc_io *oio = cl2osc_io(env, slice); - struct cl_object *obj = slice->cis_obj; - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct obdo *oa = &oio->oi_oa; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; - enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; - int rc; - - /* silently ignore non-truncate setattr for Data-on-MDT object */ - if (cl_io_is_trunc(io)) { - /* truncate cache dirty pages first */ - rc = osc_cache_truncate_start(env, cl2osc(obj), size, - &oio->oi_trunc); - if (rc < 0) - return rc; - } - - if (oio->oi_lockless == 0) { - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); - if (rc == 0) { - struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; - unsigned int cl_valid = 0; - - if (ia_avalid & ATTR_SIZE) { - attr->cat_size = size; - attr->cat_kms = size; - cl_valid = (CAT_SIZE | CAT_KMS); - } - if (ia_avalid & ATTR_MTIME_SET) { - attr->cat_mtime = lvb->lvb_mtime; - cl_valid |= CAT_MTIME; - } - if (ia_avalid & ATTR_ATIME_SET) { - attr->cat_atime = lvb->lvb_atime; - cl_valid |= CAT_ATIME; - } - if (ia_xvalid & OP_XVALID_CTIME_SET) { - attr->cat_ctime = lvb->lvb_ctime; - cl_valid |= CAT_CTIME; - } - rc = cl_object_attr_update(env, obj, attr, cl_valid); - } - cl_object_attr_unlock(obj); - if (rc < 0) - return rc; - } - - if (!(ia_avalid & ATTR_SIZE)) - return 0; - - memset(oa, 0, sizeof(*oa)); - oa->o_oi = loi->loi_oi; - oa->o_mtime = attr->cat_mtime; - oa->o_atime = attr->cat_atime; - oa->o_ctime = attr->cat_ctime; - - oa->o_size = size; - 
oa->o_blocks = OBD_OBJECT_EOF; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | - OBD_MD_FLCTIME | OBD_MD_FLMTIME | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS; - if (oio->oi_lockless) { - oa->o_flags = OBD_FL_SRVLOCK; - oa->o_valid |= OBD_MD_FLFLAGS; - } else { - rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, - &oa->o_handle); - if (!rc) - oa->o_valid |= OBD_MD_FLHANDLE; - } - - init_completion(&cbargs->opc_sync); - - rc = osc_punch_send(osc_export(cl2osc(obj)), oa, - mdc_async_upcall, cbargs); - cbargs->opc_rpc_sent = rc == 0; - return rc; -} - -static int mdc_io_read_ahead(const struct lu_env *env, - const struct cl_io_slice *ios, - pgoff_t start, struct cl_read_ahead *ra) -{ - struct osc_object *osc = cl2osc(ios->cis_obj); - struct ldlm_lock *dlmlock; - - ENTRY; - - dlmlock = mdc_dlmlock_at_pgoff(env, osc, start, 0); - if (dlmlock == NULL) - RETURN(-ENODATA); - - if (dlmlock->l_req_mode != LCK_PR) { - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - ldlm_lock_addref(&lockh, LCK_PR); - ldlm_lock_decref(&lockh, dlmlock->l_req_mode); - } - - ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; - ra->cra_end = CL_PAGE_EOF; - ra->cra_release = osc_read_ahead_release; - ra->cra_cbdata = dlmlock; - - RETURN(0); -} - -int mdc_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct cl_fsync_io *fio = &io->u.ci_fsync; - struct cl_object *obj = slice->cis_obj; - struct osc_object *osc = cl2osc(obj); - int result = 0; - - ENTRY; - - /* a MDC lock always covers whole object, do sync for whole - * possible range despite of supplied start/end values. - */ - result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0, - fio->fi_mode == CL_FSYNC_DISCARD); - if (result > 0) { - fio->fi_nr_written += result; - result = 0; - } - if (fio->fi_mode == CL_FSYNC_ALL) { - int rc; - - rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF); - if (result == 0) - result = rc; - /* Use OSC sync code because it is asynchronous. - * It is to be added into MDC and avoid the using of - * OST_SYNC at both MDC and MDT. 
- */ - rc = osc_fsync_ost(env, osc, fio); - if (result == 0) - result = rc; - } - - RETURN(result); -} - -struct mdc_data_version_args { - struct osc_io *dva_oio; -}; - -static int -mdc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, - void *arg, int rc) -{ - struct mdc_data_version_args *dva = arg; - struct osc_io *oio = dva->dva_oio; - const struct mdt_body *body; - - ENTRY; - if (rc < 0) - GOTO(out, rc); - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EPROTO); - - /* Prepare OBDO from mdt_body for CLIO */ - oio->oi_oa.o_valid = body->mbo_valid; - oio->oi_oa.o_flags = body->mbo_flags; - oio->oi_oa.o_data_version = body->mbo_version; - oio->oi_oa.o_layout_version = body->mbo_layout_gen; - EXIT; -out: - oio->oi_cbarg.opc_rc = rc; - complete(&oio->oi_cbarg.opc_sync); - return 0; -} - -static int mdc_io_data_version_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - struct osc_object *obj = cl2osc(slice->cis_obj); - struct obd_export *exp = osc_export(obj); - struct ptlrpc_request *req; - struct mdt_body *body; - struct mdc_data_version_args *dva; - int rc; - - ENTRY; - - memset(&oio->oi_oa, 0, sizeof(oio->oi_oa)); - oio->oi_oa.o_oi.oi_fid = *lu_object_fid(osc2lu(obj)); - oio->oi_oa.o_valid = OBD_MD_FLID; - - init_completion(&cbargs->opc_sync); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); - if (req == NULL) - RETURN(-ENOMEM); - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); - body->mbo_fid1 = *lu_object_fid(osc2lu(obj)); - body->mbo_valid = OBD_MD_FLID; - /* Indicate that data version is needed */ - body->mbo_valid |= OBD_MD_FLDATAVERSION; - body->mbo_flags = 0; - - if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { - body->mbo_valid |= OBD_MD_FLFLAGS; - body->mbo_flags |= OBD_FL_SRVLOCK; - if (dv->dv_flags & LL_DV_WR_FLUSH) - body->mbo_flags |= OBD_FL_FLUSH; - } - - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); - ptlrpc_request_set_replen(req); - - req->rq_interpret_reply = mdc_data_version_interpret; - CLASSERT(sizeof(*dva) <= sizeof(req->rq_async_args)); - dva = ptlrpc_req_async_args(req); - dva->dva_oio = oio; - - ptlrpcd_add_req(req); - - RETURN(0); -} - -static void mdc_io_data_version_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - - ENTRY; - wait_for_completion(&cbargs->opc_sync); - - if (cbargs->opc_rc != 0) { - slice->cis_io->ci_result = cbargs->opc_rc; - } else { - slice->cis_io->ci_result = 0; - if (!(oio->oi_oa.o_valid & - (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) - slice->cis_io->ci_result = -ENOTSUPP; - - if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) - dv->dv_layout_version = oio->oi_oa.o_layout_version; - if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) - dv->dv_data_version = oio->oi_oa.o_data_version; - } - - EXIT; -} - -static struct cl_io_operations mdc_io_ops = { - .op = { - [CIT_READ] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = 
osc_io_iter_fini, - .cio_start = osc_io_read_start, - }, - [CIT_WRITE] = { - .cio_iter_init = osc_io_write_iter_init, - .cio_iter_fini = osc_io_write_iter_fini, - .cio_start = osc_io_write_start, - .cio_end = osc_io_end, - }, - [CIT_SETATTR] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = mdc_io_setattr_start, - .cio_end = osc_io_setattr_end, - }, - [CIT_DATA_VERSION] = { - .cio_start = mdc_io_data_version_start, - .cio_end = mdc_io_data_version_end, - }, - [CIT_FAULT] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_fault_start, - .cio_end = osc_io_end, - }, - [CIT_FSYNC] = { - .cio_start = mdc_io_fsync_start, - .cio_end = osc_io_fsync_end, - }, - }, - .cio_read_ahead = mdc_io_read_ahead, - .cio_submit = osc_io_submit, - .cio_commit_async = osc_io_commit_async, -}; - -int mdc_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - struct osc_io *oio = osc_env_io(env); - - CL_IO_SLICE_CLEAN(oio, oi_cl); - cl_io_slice_add(io, &oio->oi_cl, obj, &mdc_io_ops); - return 0; -} - -static void mdc_build_res_name(struct osc_object *osc, - struct ldlm_res_id *resname) -{ - fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); -} - -/** - * Implementation of struct cl_req_operations::cro_attr_set() for MDC - * layer. MDC is responsible for struct obdo::o_id and struct obdo::o_seq - * fields. - */ -static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - u64 flags = attr->cra_flags; - - /* Copy object FID to cl_attr */ - attr->cra_oa->o_oi.oi_fid = *lu_object_fid(&obj->co_lu); - - if (flags & OBD_MD_FLGROUP) - attr->cra_oa->o_valid |= OBD_MD_FLGROUP; - - if (flags & OBD_MD_FLID) - attr->cra_oa->o_valid |= OBD_MD_FLID; - - if (flags & OBD_MD_FLHANDLE) { - struct osc_page *opg; - - opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); - if (!opg->ops_srvlock) { - int rc; - - rc = mdc_get_lock_handle(env, cl2osc(obj), - osc_index(opg), - &attr->cra_oa->o_handle); - if (rc) { - CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, - "uncovered page!\n"); - LBUG(); - } else { - attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; - } - } - } -} - -static int mdc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - - if (OST_LVB_IS_ERR(oinfo->loi_lvb.lvb_blocks)) - return OST_LVB_GET_ERR(oinfo->loi_lvb.lvb_blocks); - - return osc_attr_get(env, obj, attr); -} - -static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data) -{ - struct osc_object *osc = (struct osc_object *)data; - struct ost_lvb *lvb = &lock->l_ost_lvb; - struct lov_oinfo *oinfo; - ENTRY; - - if (lock->l_ast_data == data) { - lock->l_ast_data = NULL; - - LASSERT(osc != NULL); - LASSERT(osc->oo_oinfo != NULL); - LASSERT(lvb != NULL); - - /* Updates lvb in lock by the cached oinfo */ - oinfo = osc->oo_oinfo; - - LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " - "%llu %llu %llu by oinfo size %llu blocks %llu " - "[cma]time %llu %llu %llu", lvb->lvb_size, - lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, - lvb->lvb_atime, oinfo->loi_lvb.lvb_size, - oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, - oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); - LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); - - cl_object_attr_lock(&osc->oo_cl); - memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); - cl_object_attr_unlock(&osc->oo_cl); - ldlm_clear_lvb_cached(lock); - } - 
RETURN(LDLM_ITER_CONTINUE); -} - -int mdc_object_prune(const struct lu_env *env, struct cl_object *obj) -{ - struct osc_object *osc = cl2osc(obj); - struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; - - /* DLM locks don't hold a reference of osc_object so we have to - * clear it before the object is being destroyed. */ - osc_build_res_name(osc, resname); - ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, - mdc_object_ast_clear, osc); - return 0; -} - -static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj, - struct ldlm_lock *lock) -{ - /* if lock cancel is initiated from llite then it is combined - * lock with DOM bit and it may have no l_ast_data initialized yet, - * so init it here with given osc_object. - */ - mdc_set_dom_lock_data(lock, cl2osc(obj)); - RETURN(mdc_dlm_blocking_ast0(env, lock, LDLM_CB_CANCELING)); -} - -static const struct cl_object_operations mdc_ops = { - .coo_page_init = osc_page_init, - .coo_lock_init = mdc_lock_init, - .coo_io_init = mdc_io_init, - .coo_attr_get = mdc_attr_get, - .coo_attr_update = osc_attr_update, - .coo_glimpse = osc_object_glimpse, - .coo_req_attr_set = mdc_req_attr_set, - .coo_prune = mdc_object_prune, - .coo_object_flush = mdc_object_flush -}; - -static const struct osc_object_operations mdc_object_ops = { - .oto_build_res_name = mdc_build_res_name, - .oto_dlmlock_at_pgoff = mdc_dlmlock_at_pgoff, -}; - -static int mdc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct osc_object *osc = lu2osc(obj); - - if (osc->oo_initialized) - return 0; - - osc->oo_initialized = true; - - return osc_object_init(env, obj, conf); -} - -static void mdc_object_free(const struct lu_env *env, struct lu_object *obj) -{ - osc_object_free(env, obj); -} - -static const struct lu_object_operations mdc_lu_obj_ops = { - .loo_object_init = mdc_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = mdc_object_free, - .loo_object_print = osc_object_print, - .loo_object_invariant = NULL -}; - -struct lu_object *mdc_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct osc_object *osc; - struct lu_object *obj; - - OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); - if (osc != NULL) { - obj = osc2lu(osc); - lu_object_init(obj, NULL, dev); - osc->oo_cl.co_ops = &mdc_ops; - obj->lo_ops = &mdc_lu_obj_ops; - osc->oo_obj_ops = &mdc_object_ops; - osc->oo_initialized = false; - } else { - obj = NULL; - } - return obj; -} - -static int mdc_cl_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) -{ - return mdc_process_config(d->ld_obd, 0, cfg); -} - -const struct lu_device_operations mdc_lu_ops = { - .ldo_object_alloc = mdc_object_alloc, - .ldo_process_config = mdc_cl_process_config, - .ldo_recovery_complete = NULL, -}; - -static struct lu_device *mdc_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct lu_device *d; - struct osc_device *od; - struct obd_device *obd; - int rc; - - OBD_ALLOC_PTR(od); - if (od == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - cl_device_init(&od->od_cl, t); - d = osc2lu_dev(od); - d->ld_ops = &mdc_lu_ops; - - /* Setup MDC OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - if (obd == NULL) - RETURN(ERR_PTR(-ENODEV)); - - rc = mdc_setup(obd, cfg); - if (rc < 0) { - osc_device_free(env, d); - RETURN(ERR_PTR(rc)); - } - od->od_exp = obd->obd_self_export; 
- RETURN(d); -} - -static const struct lu_device_type_operations mdc_device_type_ops = { - .ldto_device_alloc = mdc_device_alloc, - .ldto_device_free = osc_device_free, - .ldto_device_init = osc_device_init, - .ldto_device_fini = osc_device_fini -}; - -struct lu_device_type mdc_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_MDC_NAME, - .ldt_ops = &mdc_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD -}; - -/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h index c0df4152bf80f..98773524caee9 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,9 @@ #include -int mdc_tunables_init(struct obd_device *obd); +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_mdc_obd_vars[]; +#endif void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, u64 valid, size_t ea_size, u32 suppgid, u32 flags); @@ -56,7 +58,6 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, const void *secctx, size_t secctx_size); -void mdc_file_sepol_pack(struct ptlrpc_request *req); void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); @@ -64,8 +65,6 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, size_t oldlen, const char *new, size_t newlen); -void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const char *name, size_t namelen); void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); /* mdc/mdc_locks.c */ @@ -96,8 +95,6 @@ int mdc_save_lovea(struct ptlrpc_request *req, /* mdc/mdc_request.c */ int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); -int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg); -int mdc_process_config(struct obd_device *obd, size_t len, void *buf); struct obd_client_handle; @@ -130,7 +127,6 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, void *ea, size_t ealen, struct ptlrpc_request **request); int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request); -int mdc_file_resync(struct obd_export *exp, struct md_op_data *data); int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, union ldlm_policy_data *policy, enum ldlm_mode mode, enum ldlm_cancel_flags flags, void *opaque); @@ -147,11 +143,6 @@ enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, enum ldlm_mode mode, struct lustre_handle *lockh); -#define MDC_CHANGELOG_DEV_COUNT LMV_MAX_STRIPE_COUNT -#define MDC_CHANGELOG_DEV_NAME "changelog" -extern struct class *mdc_changelog_class; -extern dev_t mdc_changelog_dev; - int mdc_changelog_cdev_init(struct obd_device *obd); void mdc_changelog_cdev_finish(struct obd_device *obd); @@ -172,15 +163,4 @@ static inline unsigned long hash_x_index(__u64 hash, int hash64) return ~0UL - 
(hash + !hash); } -/* mdc_dev.c */ -extern struct lu_device_type mdc_device_type; -int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, - struct ldlm_lock_desc *new, void *data, int flag); -int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); -int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb); - -/* the minimum inline repsize should be PAGE_SIZE at least */ -#define MDC_DOM_DEF_INLINE_REPSIZE max(8192UL, PAGE_SIZE) -#define MDC_DOM_MAX_INLINE_REPSIZE XATTR_SIZE_MAX - #endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c index dcc42508aca98..c93ec985f6581 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,6 +36,7 @@ # include #endif #include +#include #include #include #include @@ -147,22 +148,6 @@ void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, memcpy(buf, secctx, buf_size); } -void mdc_file_sepol_pack(struct ptlrpc_request *req) -{ - void *buf; - size_t buf_size; - - if (strlen(req->rq_sepol) == 0) - return; - - buf = req_capsule_client_get(&req->rq_pill, &RMF_SELINUX_POL); - buf_size = req_capsule_get_size(&req->rq_pill, &RMF_SELINUX_POL, - RCL_CLIENT); - - LASSERT(buf_size == strlen(req->rq_sepol) + 1); - snprintf(buf, strlen(req->rq_sepol) + 1, "%s", req->rq_sepol); -} - void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, const struct lu_fid *fid) { @@ -181,9 +166,9 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const void *data, size_t datalen, umode_t mode, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev) { - struct mdt_rec_create *rec; - char *tmp; - __u64 flags; + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); @@ -216,19 +201,13 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, mdc_file_secctx_pack(req, op_data->op_file_secctx_name, op_data->op_file_secctx, op_data->op_file_secctx_size); - - /* pack SELinux policy info if any */ - mdc_file_sepol_pack(req); } static inline __u64 mds_pack_open_flags(__u64 flags) { - __u64 cr_flags = (flags & MDS_OPEN_FL_INTERNAL); + __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | + MDS_OPEN_FL_INTERNAL)); - if (flags & FMODE_READ) - cr_flags |= MDS_FMODE_READ; - if (flags & FMODE_WRITE) - cr_flags |= MDS_FMODE_WRITE; if (flags & O_CREAT) cr_flags |= MDS_OPEN_CREAT; if (flags & O_EXCL) @@ -282,7 +261,7 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->cr_suppgid1 = op_data->op_suppgids[0]; rec->cr_suppgid2 = op_data->op_suppgids[1]; rec->cr_bias = op_data->op_bias; - rec->cr_open_handle_old = op_data->op_open_handle; + rec->cr_old_handle = op_data->op_handle; if (op_data->op_name) { mdc_pack_name(req, &RMF_NAME, op_data->op_name, @@ -295,9 +274,6 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, mdc_file_secctx_pack(req, op_data->op_file_secctx_name, op_data->op_file_secctx, op_data->op_file_secctx_size); - - /* pack SELinux policy info if any */ - mdc_file_sepol_pack(req); } 
if (lmm) { @@ -308,9 +284,8 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, set_mrc_cr_flags(rec, cr_flags); } -static inline u64 attr_pack(unsigned int ia_valid, enum op_xvalid ia_xvalid) -{ - u64 sa_valid = 0; +static inline __u64 attr_pack(unsigned int ia_valid) { + __u64 sa_valid = 0; if (ia_valid & ATTR_MODE) sa_valid |= MDS_ATTR_MODE; @@ -332,27 +307,23 @@ static inline u64 attr_pack(unsigned int ia_valid, enum op_xvalid ia_xvalid) sa_valid |= MDS_ATTR_MTIME_SET; if (ia_valid & ATTR_FORCE) sa_valid |= MDS_ATTR_FORCE; - if (ia_xvalid & OP_XVALID_FLAGS) - sa_valid |= MDS_ATTR_ATTR_FLAG; - if (ia_valid & ATTR_KILL_SUID) - sa_valid |= MDS_ATTR_KILL_SUID; - if (ia_valid & ATTR_KILL_SGID) - sa_valid |= MDS_ATTR_KILL_SGID; - if (ia_xvalid & OP_XVALID_CTIME_SET) - sa_valid |= MDS_ATTR_CTIME_SET; - if (ia_valid & ATTR_OPEN) - sa_valid |= MDS_ATTR_FROM_OPEN; - if (ia_xvalid & OP_XVALID_BLOCKS) - sa_valid |= MDS_ATTR_BLOCKS; - if (ia_xvalid & OP_XVALID_OWNEROVERRIDE) - /* NFSD hack (see bug 5781) */ - sa_valid |= MDS_OPEN_OWNEROVERRIDE; - if (ia_xvalid & OP_XVALID_PROJID) + if (ia_valid & ATTR_ATTR_FLAG) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_valid & ATTR_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_FROM_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_valid & ATTR_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_valid & MDS_OPEN_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_valid & MDS_ATTR_PROJID) sa_valid |= MDS_ATTR_PROJID; - if (ia_xvalid & OP_XVALID_LAZYSIZE) - sa_valid |= MDS_ATTR_LSIZE; - if (ia_xvalid & OP_XVALID_LAZYBLOCKS) - sa_valid |= MDS_ATTR_LBLOCKS; return sa_valid; } @@ -366,8 +337,7 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, rec->sa_suppgid = -1; rec->sa_fid = op_data->op_fid1; - rec->sa_valid = attr_pack(op_data->op_attr.ia_valid, - op_data->op_xvalid); + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); rec->sa_mode = op_data->op_attr.ia_mode; rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); @@ -391,7 +361,7 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, struct md_op_data *op_data) { - epoch->mio_open_handle = op_data->op_open_handle; + epoch->mio_handle = op_data->op_handle; epoch->mio_unused1 = 0; epoch->mio_unused2 = 0; epoch->mio_padding = 0; @@ -444,9 +414,6 @@ void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->ul_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); - - /* pack SELinux policy info if any */ - mdc_file_sepol_pack(req); } void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) @@ -469,19 +436,17 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->lk_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); - - /* pack SELinux policy info if any */ - mdc_file_sepol_pack(req); } -static void mdc_close_intent_pack(struct ptlrpc_request *req, +static void mdc_intent_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) { struct close_data *data; struct ldlm_lock *lock; enum mds_op_bias bias = op_data->op_bias; - if (!(bias & (MDS_CLOSE_INTENT | MDS_CLOSE_MIGRATE))) + if (!(bias 
& (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | + MDS_RENAME_MIGRATE))) return; data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); @@ -496,90 +461,44 @@ static void mdc_close_intent_pack(struct ptlrpc_request *req, data->cd_data_version = op_data->op_data_version; data->cd_fid = op_data->op_fid2; - - if (bias & MDS_CLOSE_LAYOUT_SPLIT) { - data->cd_mirror_id = op_data->op_mirror_id; - } else if (bias & MDS_CLOSE_RESYNC_DONE) { - struct close_data_resync_done *sync = &data->cd_resync; - - CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved)); - sync->resync_count = op_data->op_data_size / sizeof(__u32); - if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { - memcpy(sync->resync_ids_inline, op_data->op_data, - op_data->op_data_size); - } else { - size_t count = sync->resync_count; - - memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32), - op_data->op_data, count * sizeof(__u32)); - } - } } void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, size_t oldlen, const char *new, size_t newlen) { - struct mdt_rec_rename *rec; + struct mdt_rec_rename *rec; - CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - /* XXX do something about time, uid, gid */ - rec->rn_opcode = REINT_RENAME; - rec->rn_fsuid = op_data->op_fsuid; - rec->rn_fsgid = op_data->op_fsgid; - rec->rn_cap = op_data->op_cap; - rec->rn_suppgid1 = op_data->op_suppgids[0]; - rec->rn_suppgid2 = op_data->op_suppgids[1]; - rec->rn_fid1 = op_data->op_fid1; - rec->rn_fid2 = op_data->op_fid2; - rec->rn_time = op_data->op_mod_time; - rec->rn_mode = op_data->op_mode; - rec->rn_bias = op_data->op_bias; + /* XXX do something about time, uid, gid */ + rec->rn_opcode = op_data->op_cli_flags & CLI_MIGRATE ? 
+ REINT_MIGRATE : REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, old, oldlen); if (new != NULL) mdc_pack_name(req, &RMF_SYMTGT, new, newlen); - /* pack SELinux policy info if any */ - mdc_file_sepol_pack(req); -} - -void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const char *name, size_t namelen) -{ - struct mdt_rec_rename *rec; - char *ea; - - CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - rec->rn_opcode = REINT_MIGRATE; - rec->rn_fsuid = op_data->op_fsuid; - rec->rn_fsgid = op_data->op_fsgid; - rec->rn_cap = op_data->op_cap; - rec->rn_suppgid1 = op_data->op_suppgids[0]; - rec->rn_suppgid2 = op_data->op_suppgids[1]; - rec->rn_fid1 = op_data->op_fid1; - rec->rn_fid2 = op_data->op_fid4; - rec->rn_time = op_data->op_mod_time; - rec->rn_mode = op_data->op_mode; - rec->rn_bias = op_data->op_bias; - - mdc_pack_name(req, &RMF_NAME, name, namelen); - - if (op_data->op_bias & MDS_CLOSE_MIGRATE) { + if (op_data->op_cli_flags & CLI_MIGRATE && + op_data->op_bias & MDS_RENAME_MIGRATE) { struct mdt_ioepoch *epoch; - mdc_close_intent_pack(req, op_data); + mdc_intent_close_pack(req, op_data); epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); mdc_ioepoch_pack(epoch, op_data); } - - ea = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); - memcpy(ea, op_data->op_data, op_data->op_data_size); } void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, @@ -589,6 +508,8 @@ void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, &RMF_MDT_BODY); b->mbo_valid = valid; + if (op_data->op_bias & MDS_CHECK_SPLIT) + b->mbo_valid |= OBD_MD_FLCKSPLIT; if (op_data->op_bias & MDS_CROSS_REF) b->mbo_valid |= OBD_MD_FLCROSSREF; b->mbo_eadatasize = ea_size; @@ -626,5 +547,5 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->sa_valid &= ~MDS_ATTR_ATIME; mdc_ioepoch_pack(epoch, op_data); - mdc_close_intent_pack(req, op_data); + mdc_intent_close_pack(req, op_data); } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c index 1c1e54b87590f..cb809c2ce4b89 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,7 +43,6 @@ #include #include #include -#include #include "mdc_internal.h" @@ -245,7 +244,7 @@ int mdc_save_lovea(struct ptlrpc_request *req, static struct ptlrpc_request * mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, - struct md_op_data *op_data, __u32 acl_bufsize) + struct md_op_data *op_data) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); @@ -256,8 +255,6 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, int count = 0; enum ldlm_mode mode; int rc; - int repsize, repsize_estimate; - ENTRY; it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; @@ -266,12 +263,12 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, /* If inode is known, cancel conflicting OPEN locks. */ if (fid_is_sane(&op_data->op_fid2)) { if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ - if (it->it_flags & MDS_FMODE_WRITE) + if (it->it_flags & FMODE_WRITE) mode = LCK_EX; else mode = LCK_PR; } else { - if (it->it_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC)) + if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) mode = LCK_CW; #ifdef FMODE_EXEC else if (it->it_flags & FMODE_EXEC) @@ -303,32 +300,16 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); - if (cl_is_lov_delay_create(it->it_flags)) { - /* open(O_LOV_DELAY_CREATE) won't pack lmm */ - LASSERT(lmmsize == 0); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); - } else { - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, max(lmmsize, obddev->u.cli.cl_default_mds_easize)); - } req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, RCL_CLIENT, op_data->op_file_secctx_name != NULL ? - op_data->op_file_secctx_name_size : 0); + strlen(op_data->op_file_secctx_name) + 1 : 0); req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, op_data->op_file_secctx_size); - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? - strlen(req->rq_sepol) + 1 : 0); - rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); if (rc < 0) { ptlrpc_request_free(req); @@ -349,71 +330,10 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, obddev->u.cli.cl_max_mds_easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); - - if (!(it->it_op & IT_CREAT) && it->it_op & IT_OPEN && - req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, - RCL_CLIENT) && - op_data->op_file_secctx_name_size > 0 && - op_data->op_file_secctx_name != NULL) { - char *secctx_name; - - secctx_name = req_capsule_client_get(&req->rq_pill, - &RMF_FILE_SECCTX_NAME); - memcpy(secctx_name, op_data->op_file_secctx_name, - op_data->op_file_secctx_name_size); - req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, - RCL_SERVER, - obddev->u.cli.cl_max_mds_easize); - - CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", - op_data->op_file_secctx_name_size, - op_data->op_file_secctx_name); - - } else { - req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, - RCL_SERVER, 0); - } - - /** - * Inline buffer for possible data from Data-on-MDT files. 
- */ - req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, RCL_SERVER, - sizeof(struct niobuf_remote)); - ptlrpc_request_set_replen(req); - - /* Get real repbuf allocated size as rounded up power of 2 */ - repsize = size_roundup_power2(req->rq_replen + - lustre_msg_early_size()); - /* Estimate free space for DoM files in repbuf */ - repsize_estimate = repsize - (req->rq_replen - - obddev->u.cli.cl_max_mds_easize + - sizeof(struct lov_comp_md_v1) + - sizeof(struct lov_comp_md_entry_v1) + - lov_mds_md_size(0, LOV_MAGIC_V3)); - - if (repsize_estimate < obddev->u.cli.cl_dom_min_inline_repsize) { - repsize = obddev->u.cli.cl_dom_min_inline_repsize - - repsize_estimate + sizeof(struct niobuf_remote); - req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, - RCL_SERVER, - sizeof(struct niobuf_remote) + repsize); - ptlrpc_request_set_replen(req); - CDEBUG(D_INFO, "Increase repbuf by %d bytes, total: %d\n", - repsize, req->rq_replen); - repsize = size_roundup_power2(req->rq_replen + - lustre_msg_early_size()); - } - /* The only way to report real allocated repbuf size to the server - * is the lm_repsize but it must be set prior buffer allocation itself - * due to security reasons - it is part of buffer used in signature - * calculation (see LU-11414). Therefore the saved size is predicted - * value as rq_replen rounded to the next higher power of 2. - * Such estimation is safe. Though the final allocated buffer might - * be even larger, it is not possible to know that at this point. - */ - req->rq_reqmsg->lm_repsize = repsize; - return req; + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); + return req; } #define GA_DEFAULT_EA_NAME_LEN 20 @@ -429,7 +349,7 @@ mdc_intent_getxattr_pack(struct obd_export *exp, struct ldlm_intent *lit; int rc, count = 0; struct list_head cancels = LIST_HEAD_INIT(cancels); - u32 ea_vals_buf_size = GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM; + u32 min_buf_size = 0; ENTRY; @@ -438,16 +358,6 @@ mdc_intent_getxattr_pack(struct obd_export *exp, if (req == NULL) RETURN(ERR_PTR(-ENOMEM)); - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? - strlen(req->rq_sepol) + 1 : 0); - rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -457,8 +367,6 @@ mdc_intent_getxattr_pack(struct obd_export *exp, /* pack the intent */ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); lit->opc = IT_GETXATTR; - CDEBUG(D_INFO, "%s: get xattrs for "DFID"\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1)); #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) /* If the supplied buffer is too small then the server will @@ -470,25 +378,26 @@ mdc_intent_getxattr_pack(struct obd_export *exp, * of LU-9417 when it would be *more* likely to crash the * server. See LU-9856. 
*/ if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) - ea_vals_buf_size = max_t(u32, ea_vals_buf_size, - exp->exp_connect_data.ocd_max_easize); + min_buf_size = exp->exp_connect_data.ocd_max_easize; #endif /* pack the intended request */ mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - ea_vals_buf_size, -1, 0); - - /* get SELinux policy info if any */ - mdc_file_sepol_pack(req); + max_t(u32, min_buf_size, + GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM), + -1, 0); req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, - GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); + max_t(u32, min_buf_size, + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM)); req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, - ea_vals_buf_size); + max_t(u32, min_buf_size, + GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM)); req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, - sizeof(u32) * GA_DEFAULT_EA_NUM); + max_t(u32, min_buf_size, + sizeof(__u32) * GA_DEFAULT_EA_NUM)); req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); @@ -497,9 +406,46 @@ mdc_intent_getxattr_pack(struct obd_export *exp, RETURN(req); } -static struct ptlrpc_request * -mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, - struct md_op_data *op_data, __u32 acl_bufsize) +static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_UNLINK); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); @@ -509,38 +455,25 @@ mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, struct ldlm_intent *lit; int rc; __u32 easize; - bool have_secctx = false; ENTRY; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_GETATTR); - if (req == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - /* send name of security xattr to get upon intent */ - if (it->it_op & (IT_LOOKUP | IT_GETATTR) && - req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, - RCL_CLIENT) && - op_data->op_file_secctx_name_size > 0 && - op_data->op_file_secctx_name != NULL) { - have_secctx = true; - req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, - RCL_CLIENT, - op_data->op_file_secctx_name_size); - } + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - 
ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; if (obddev->u.cli.cl_default_mds_easize > 0) easize = obddev->u.cli.cl_default_mds_easize; @@ -551,27 +484,8 @@ mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); - - if (have_secctx) { - char *secctx_name; - - secctx_name = req_capsule_client_get(&req->rq_pill, - &RMF_FILE_SECCTX_NAME); - memcpy(secctx_name, op_data->op_file_secctx_name, - op_data->op_file_secctx_name_size); - - req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, - RCL_SERVER, easize); - - CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", - op_data->op_file_secctx_name_size, - op_data->op_file_secctx_name); - } else { - req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, - RCL_SERVER, 0); - } - + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); ptlrpc_request_set_replen(req); RETURN(req); } @@ -648,10 +562,8 @@ static int mdc_finish_enqueue(struct obd_export *exp, struct ldlm_request *lockreq; struct ldlm_reply *lockrep; struct ldlm_lock *lock; - struct mdt_body *body = NULL; void *lvb_data = NULL; __u32 lvb_len = 0; - ENTRY; LASSERT(rc >= 0); @@ -710,6 +622,8 @@ static int mdc_finish_enqueue(struct obd_export *exp, /* We know what to expect, so we do any byte flipping required here */ if (it_has_reply_body(it)) { + struct mdt_body *body; + body = req_capsule_server_get(pill, &RMF_MDT_BODY); if (body == NULL) { CERROR ("Can't swab mdt_body\n"); @@ -727,12 +641,6 @@ static int mdc_finish_enqueue(struct obd_export *exp, mdc_set_open_replay_data(NULL, NULL, it); } - if (it_disposition(it, DISP_OPEN_CREATE) && - !it_open_error(DISP_OPEN_CREATE, it)) { - lprocfs_counter_incr(exp->exp_obd->obd_md_stats, - LPROC_MD_CREATE); - } - if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { void *eadata; @@ -800,10 +708,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, * client still does this checking in case it's talking with an old * server. 
- Jinshan */ lock = ldlm_handle2lock(lockh); - if (lock == NULL) - RETURN(rc); - - if (ldlm_has_layout(lock) && lvb_data != NULL && + if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL && !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { void *lmm; @@ -811,9 +716,10 @@ static int mdc_finish_enqueue(struct obd_export *exp, ldlm_it2str(it->it_op), lvb_len); OBD_ALLOC_LARGE(lmm, lvb_len); - if (lmm == NULL) - GOTO(out_lock, rc = -ENOMEM); - + if (lmm == NULL) { + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } memcpy(lmm, lvb_data, lvb_len); /* install lvb_data */ @@ -828,24 +734,8 @@ static int mdc_finish_enqueue(struct obd_export *exp, if (lmm != NULL) OBD_FREE_LARGE(lmm, lvb_len); } - - if (ldlm_has_dom(lock)) { - LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); - - body = req_capsule_server_get(pill, &RMF_MDT_BODY); - if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) { - LDLM_ERROR(lock, "%s: DoM lock without size.", - exp->exp_obd->obd_name); - GOTO(out_lock, rc = -EPROTO); - } - - LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu", - ldlm_it2str(it->it_op), body->mbo_dom_size); - - rc = mdc_fill_lvb(req, &lock->l_ost_lvb); - } -out_lock: - LDLM_LOCK_PUT(lock); + if (lock != NULL) + LDLM_LOCK_PUT(lock); RETURN(rc); } @@ -874,8 +764,6 @@ static int mdc_enqueue_base(struct obd_export *exp, .l_inodebits = { MDS_INODELOCK_XATTR } }; int generation, resends = 0; struct ldlm_reply *lockrep; - struct obd_import *imp = class_exp2cliimp(exp); - __u32 acl_bufsize; enum lvb_type lvb_type = 0; int rc; ENTRY; @@ -888,37 +776,34 @@ static int mdc_enqueue_base(struct obd_export *exp, LASSERT(policy == NULL); saved_flags |= LDLM_FL_HAS_INTENT; - if (it->it_op & (IT_GETATTR | IT_READDIR)) + if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) policy = &update_policy; else if (it->it_op & IT_LAYOUT) policy = &layout_policy; - else if (it->it_op & IT_GETXATTR) + else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) policy = &getxattr_policy; else policy = &lookup_policy; } - generation = obddev->u.cli.cl_import->imp_generation; - if (!it || (it->it_op & (IT_OPEN | IT_CREAT))) - acl_bufsize = imp->imp_connect_data.ocd_max_easize; - else - acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; - + generation = obddev->u.cli.cl_import->imp_generation; resend: - flags = saved_flags; + flags = saved_flags; if (it == NULL) { /* The only way right now is FLOCK. */ LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", einfo->ei_type); res_id.name[3] = LDLM_FLOCK; } else if (it->it_op & IT_OPEN) { - req = mdc_intent_open_pack(exp, it, op_data, acl_bufsize); + req = mdc_intent_open_pack(exp, it, op_data); + } else if (it->it_op & IT_UNLINK) { + req = mdc_intent_unlink_pack(exp, it, op_data); } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { - req = mdc_intent_getattr_pack(exp, it, op_data, acl_bufsize); + req = mdc_intent_getattr_pack(exp, it, op_data); } else if (it->it_op & IT_READDIR) { req = mdc_enqueue_pack(exp, 0); } else if (it->it_op & IT_LAYOUT) { - if (!imp_connect_lvb_type(imp)) + if (!imp_connect_lvb_type(class_exp2cliimp(exp))) RETURN(-EOPNOTSUPP); req = mdc_intent_layout_pack(exp, it, op_data); lvb_type = LVB_T_LAYOUT; @@ -947,25 +832,18 @@ static int mdc_enqueue_base(struct obd_export *exp, rc = obd_get_request_slot(&obddev->u.cli); if (rc != 0) { mdc_put_mod_rpc_slot(req, it); - mdc_clear_replay_flag(req, 0); - ptlrpc_req_finished(req); - RETURN(rc); - } - } - - /* With Data-on-MDT the glimpse callback is needed too. 
- * It is set here in advance but not in mdc_finish_enqueue() - * to avoid possible races. It is safe to have glimpse handler - * for non-DOM locks and costs nothing.*/ - if (einfo->ei_cb_gl == NULL) - einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + RETURN(rc); + } + } - rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, 0, lvb_type, lockh, 0); - if (!it) { - /* For flock requests we immediatelly return without further - delay and let caller deal with the rest, since rest of - this function metadata processing makes no sense for flock + if (!it) { + /* For flock requests we immediatelly return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock requests anyway. But in case of problem during comms with Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we can not rely on caller and this mainly for F_UNLCKs @@ -1020,15 +898,6 @@ static int mdc_enqueue_base(struct obd_export *exp, } } - if ((int)lockrep->lock_policy_res2 == -ERANGE && - it->it_op & (IT_OPEN | IT_GETATTR | IT_LOOKUP) && - acl_bufsize != imp->imp_connect_data.ocd_max_easize) { - mdc_clear_replay_flag(req, -ERANGE); - ptlrpc_req_finished(req); - acl_bufsize = imp->imp_connect_data.ocd_max_easize; - goto resend; - } - rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); if (rc < 0) { if (lustre_handle_is_used(lockh)) { @@ -1202,6 +1071,7 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, * but for old MDTs (< 2.4), permission is covered * by LOOKUP lock, so it needs to match all bits here.*/ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM; break; case IT_READDIR: @@ -1268,7 +1138,6 @@ int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, .ei_mode = it_to_lock_mode(it), .ei_cb_bl = cb_blocking, .ei_cb_cp = ldlm_completion_ast, - .ei_cb_gl = mdc_ldlm_glimpse_ast, }; struct lustre_handle lockh; int rc = 0; @@ -1385,10 +1254,7 @@ int mdc_intent_getattr_async(struct obd_export *exp, PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); fid_build_reg_res_name(&op_data->op_fid1, &res_id); - /* If the MDT return -ERANGE because of large ACL, then the sponsor - * of the async getattr RPC will handle that by itself. */ - req = mdc_intent_getattr_pack(exp, it, op_data, - LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + req = mdc_intent_getattr_pack(exp, it, op_data); if (IS_ERR(req)) RETURN(PTR_ERR(req)); @@ -1398,13 +1264,6 @@ int mdc_intent_getattr_async(struct obd_export *exp, RETURN(rc); } - /* With Data-on-MDT the glimpse callback is needed too. - * It is set here in advance but not in mdc_finish_enqueue() - * to avoid possible races. It is safe to have glimpse handler - * for non-DOM locks and costs nothing.*/ - if (minfo->mi_einfo.ei_cb_gl == NULL) - minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast; - rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); if (rc < 0) { diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c index 096b20fd4847a..db2e665658746 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
* Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -201,16 +201,6 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, op_data->op_file_secctx_size); - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? - strlen(req->rq_sepol) + 1 : 0); - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -285,10 +275,9 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, MDS_INODELOCK_UPDATE); if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && (fid_is_sane(&op_data->op_fid3))) - /* don't cancel DoM lock which may cause data flush */ count += mdc_resource_get_unused(exp, &op_data->op_fid3, &cancels, LCK_EX, - MDS_INODELOCK_ELC); + MDS_INODELOCK_FULL); req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_UNLINK); if (req == NULL) { @@ -299,16 +288,6 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? - strlen(req->rq_sepol) + 1 : 0); - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -357,16 +336,6 @@ int mdc_link(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? 
- strlen(req->rq_sepol) + 1 : 0); - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -389,32 +358,31 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { struct list_head cancels = LIST_HEAD_INIT(cancels); - struct obd_device *obd = exp->exp_obd; - struct ptlrpc_request *req; - int count = 0, rc; - - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && - (fid_is_sane(&op_data->op_fid2))) - count += mdc_resource_get_unused(exp, &op_data->op_fid2, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && - (fid_is_sane(&op_data->op_fid3))) - count += mdc_resource_get_unused(exp, &op_data->op_fid3, - &cancels, LCK_EX, - MDS_INODELOCK_LOOKUP); - if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && - (fid_is_sane(&op_data->op_fid4))) - count += mdc_resource_get_unused(exp, &op_data->op_fid4, - &cancels, LCK_EX, - MDS_INODELOCK_ELC); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); req = ptlrpc_request_alloc(class_exp2cliimp(exp), op_data->op_cli_flags & CLI_MIGRATE ? @@ -424,21 +392,8 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(-ENOMEM); } - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); - if (op_data->op_cli_flags & CLI_MIGRATE) - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - op_data->op_data_size); - - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? 
- strlen(req->rq_sepol) + 1 : 0); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { @@ -446,76 +401,34 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } - if (exp_connect_cancelset(exp) && req) - ldlm_cli_cancel_list(&cancels, count, req, 0); - - if (op_data->op_cli_flags & CLI_MIGRATE) - mdc_migrate_pack(req, op_data, old, oldlen); - else - mdc_rename_pack(req, op_data, old, oldlen, new, newlen); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - - rc = mdc_reint(req, LUSTRE_IMP_FULL); - *request = req; - if (rc == -ERESTARTSYS) - rc = 0; - - RETURN(rc); -} + if (op_data->op_cli_flags & CLI_MIGRATE && op_data->op_data != NULL) { + struct md_open_data *mod = op_data->op_data; -int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data) -{ - struct list_head cancels = LIST_HEAD_INIT(cancels); - struct ptlrpc_request *req; - struct ldlm_lock *lock; - struct mdt_rec_resync *rec; - int count = 0, rc; - ENTRY; + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); - if (op_data->op_flags & MF_MDC_CANCEL_FID1 && - fid_is_sane(&op_data->op_fid1)) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_LAYOUT); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_REINT_RESYNC); - if (req == NULL) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - RETURN(-ENOMEM); + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); } - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); - CLASSERT(sizeof(*rec) == sizeof(struct mdt_rec_reint)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - rec->rs_opcode = REINT_RESYNC; - rec->rs_fsuid = op_data->op_fsuid; - rec->rs_fsgid = op_data->op_fsgid; - rec->rs_cap = op_data->op_cap; - rec->rs_fid = op_data->op_fid1; - rec->rs_bias = op_data->op_bias; - rec->rs_mirror_id = op_data->op_mirror_id; - - lock = ldlm_handle2lock(&op_data->op_lease_handle); - if (lock != NULL) { - rec->rs_lease_handle = lock->l_remote_handle; - LDLM_LOCK_PUT(lock); - } + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); ptlrpc_request_set_replen(req); rc = mdc_reint(req, LUSTRE_IMP_FULL); - if (rc == -ERESTARTSYS) - rc = 0; + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; - ptlrpc_req_finished(req); - RETURN(rc); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c index 5a29a285e5943..6c8da5866a8b9 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. 
* - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,23 +41,21 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include -#include +#include #include #include #include #include #include -#include +#include #include #include #include -#include +#include #include #include -#include #include "mdc_internal.h" @@ -193,34 +191,20 @@ static int mdc_getattr_common(struct obd_export *exp, RETURN(0); } -static void mdc_reset_acl_req(struct ptlrpc_request *req) -{ - spin_lock(&req->rq_early_free_lock); - sptlrpc_cli_free_repbuf(req); - req->rq_repbuf = NULL; - req->rq_repbuf_len = 0; - req->rq_repdata = NULL; - req->rq_reqdata_len = 0; - spin_unlock(&req->rq_early_free_lock); -} - static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct ptlrpc_request *req; - struct obd_import *imp = class_exp2cliimp(exp); - __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; - int rc; - ENTRY; + struct ptlrpc_request *req; + int rc; + ENTRY; /* Single MDS without an LMV case */ if (op_data->op_flags & MF_GET_MDT_IDX) { op_data->op_mds = 0; RETURN(0); } - - *request = NULL; - req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR); + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); if (req == NULL) RETURN(-ENOMEM); @@ -230,42 +214,33 @@ static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -again: mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, op_data->op_mode, -1, 0); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - ptlrpc_request_set_replen(req); - rc = mdc_getattr_common(exp, req); - if (rc) { - if (rc == -ERANGE && - acl_bufsize != imp->imp_connect_data.ocd_max_easize) { - acl_bufsize = imp->imp_connect_data.ocd_max_easize; - mdc_reset_acl_req(req); - goto again; - } - - ptlrpc_req_finished(req); - } else { - *request = req; - } + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); - RETURN(rc); + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); } static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct ptlrpc_request *req; - struct obd_import *imp = class_exp2cliimp(exp); - __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; - int rc; - ENTRY; + struct ptlrpc_request *req; + int rc; + ENTRY; - *request = NULL; - req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR_NAME); + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GETATTR_NAME); if (req == NULL) RETURN(-ENOMEM); @@ -278,6 +253,9 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + if (op_data->op_name) { char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == @@ -285,29 +263,18 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, memcpy(name, op_data->op_name, op_data->op_namelen); } -again: - mdc_pack_body(req, &op_data->op_fid1, 
op_data->op_valid, - op_data->op_mode, op_data->op_suppgids[0], 0); - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); - ptlrpc_request_set_replen(req); - - rc = mdc_getattr_common(exp, req); - if (rc) { - if (rc == -ERANGE && - acl_bufsize != imp->imp_connect_data.ocd_max_easize) { - acl_bufsize = imp->imp_connect_data.ocd_max_easize; - mdc_reset_acl_req(req); - goto again; - } - - ptlrpc_req_finished(req); - } else { - *request = req; - } + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + req->rq_import->imp_connect_data.ocd_max_easize); + ptlrpc_request_set_replen(req); - RETURN(rc); + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); } static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, @@ -327,25 +294,16 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, if (req == NULL) RETURN(-ENOMEM); - if (xattr_name) { - xattr_namelen = strlen(xattr_name) + 1; - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - xattr_namelen); - } - if (input_size) - LASSERT(input); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - input_size); - - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(req); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } - req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(req->rq_sepol) ? - strlen(req->rq_sepol) + 1 : 0); + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) { + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + } /* Flush local XATTR locks to get rid of a possible cancel RPC */ if (opcode == MDS_REINT && fid_is_sane(fid) && @@ -375,11 +333,11 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, } } - if (opcode == MDS_REINT) { - struct mdt_rec_setxattr *rec; + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; - CLASSERT(sizeof(struct mdt_rec_setxattr) == - sizeof(struct mdt_rec_reint)); + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); rec->sx_opcode = REINT_SETXATTR; rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); @@ -405,8 +363,6 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, memcpy(tmp, input, input_size); } - mdc_file_sepol_pack(req); - if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, output_size); @@ -429,77 +385,26 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, } static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - const void *value, size_t value_size, - unsigned int xattr_flags, u32 suppgid, - struct ptlrpc_request **req) + u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) { - LASSERT(obd_md_valid == OBD_MD_FLXATTR || - obd_md_valid == OBD_MD_FLXATTRRM); - return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, - fid, MDS_REINT, obd_md_valid, name, - value, 
value_size, 0, xattr_flags, suppgid, - req); + fid, MDS_REINT, valid, xattr_name, + input, input_size, output_size, flags, + suppgid, request); } static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, size_t buf_size, - struct ptlrpc_request **req) + u64 valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) { - struct mdt_body *body; - int rc; - - LASSERT(obd_md_valid == OBD_MD_FLXATTR || - obd_md_valid == OBD_MD_FLXATTRLS); - - CDEBUG(D_INFO, "%s: get xattr '%s' for "DFID"\n", - exp->exp_obd->obd_name, name, PFID(fid)); - rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, - obd_md_valid, name, NULL, 0, buf_size, 0, -1, - req); - if (rc < 0) - GOTO(out, rc); - - body = req_capsule_server_get(&(*req)->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EPROTO); - - /* only detect the xattr size */ - if (buf_size == 0) { - /* LU-11109: Older MDTs do not distinguish - * between nonexistent xattrs and zero length - * values in this case. Newer MDTs will return - * -ENODATA or set OBD_MD_FLXATTR. */ - GOTO(out, rc = body->mbo_eadatasize); - } - - if (body->mbo_eadatasize == 0) { - /* LU-11109: Newer MDTs set OBD_MD_FLXATTR on - * success so that we can distinguish between - * zero length value and nonexistent xattr. - * - * If OBD_MD_FLXATTR is not set then we keep - * the old behavior and return -ENODATA for - * getxattr() when mbo_eadatasize is 0. But - * -ENODATA only makes sense for getxattr() - * and not for listxattr(). */ - if (body->mbo_valid & OBD_MD_FLXATTR) - GOTO(out, rc = 0); - else if (obd_md_valid == OBD_MD_FLXATTR) - GOTO(out, rc = -ENODATA); - else - GOTO(out, rc = 0); - } - - GOTO(out, rc = body->mbo_eadatasize); -out: - if (rc < 0) { - ptlrpc_req_finished(*req); - *req = NULL; - } - - return rc; + return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, + fid, MDS_GETXATTR, valid, xattr_name, + input, input_size, output_size, flags, + -1, request); } #ifdef CONFIG_FS_POSIX_ACL @@ -647,41 +552,41 @@ int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) void mdc_replay_open(struct ptlrpc_request *req) { - struct md_open_data *mod = req->rq_cb_data; - struct ptlrpc_request *close_req; - struct obd_client_handle *och; - struct lustre_handle old_open_handle = { }; - struct mdt_body *body; - ENTRY; + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old; + struct mdt_body *body; + ENTRY; - if (mod == NULL) { - DEBUG_REQ(D_ERROR, req, - "Can't properly replay without open data."); - EXIT; - return; - } + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + EXIT; + return; + } - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body != NULL); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); spin_lock(&req->rq_lock); och = mod->mod_och; - if (och && och->och_open_handle.cookie) + if (och && och->och_fh.cookie) req->rq_early_free_repbuf = 1; else req->rq_early_free_repbuf = 0; spin_unlock(&req->rq_lock); if (req->rq_early_free_repbuf) { - struct lustre_handle *file_open_handle; + struct lustre_handle *file_fh; LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); - file_open_handle = &och->och_open_handle; + file_fh = &och->och_fh; CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", - file_open_handle->cookie, body->mbo_open_handle.cookie); 
- old_open_handle = *file_open_handle; - *file_open_handle = body->mbo_open_handle; + file_fh->cookie, body->mbo_handle.cookie); + old = *file_fh; + *file_fh = body->mbo_handle; } close_req = mod->mod_close_req; @@ -695,11 +600,10 @@ void mdc_replay_open(struct ptlrpc_request *req) LASSERT(epoch); if (req->rq_early_free_repbuf) - LASSERT(old_open_handle.cookie == - epoch->mio_open_handle.cookie); + LASSERT(!memcmp(&old, &epoch->mio_handle, sizeof(old))); DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); - epoch->mio_open_handle = body->mbo_open_handle; + epoch->mio_handle = body->mbo_handle; } EXIT; } @@ -781,20 +685,20 @@ int mdc_set_open_replay_data(struct obd_export *exp, open_req->rq_commit_cb = mdc_commit_open; open_req->rq_early_free_repbuf = 1; spin_unlock(&open_req->rq_lock); - } + } rec->cr_fid2 = body->mbo_fid1; - rec->cr_open_handle_old = body->mbo_open_handle; + rec->cr_ioepoch = body->mbo_ioepoch; + rec->cr_old_handle.cookie = body->mbo_handle.cookie; open_req->rq_replay_cb = mdc_replay_open; if (!fid_is_sane(&body->mbo_fid1)) { - DEBUG_REQ(D_ERROR, open_req, - "saving replay request with insane FID " DFID, - PFID(&body->mbo_fid1)); - LBUG(); - } + DEBUG_REQ(D_ERROR, open_req, "Saving replay request with " + "insane fid"); + LBUG(); + } - DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); - RETURN(0); + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); } static void mdc_free_open(struct md_open_data *mod) @@ -838,7 +742,7 @@ int mdc_clear_open_replay_data(struct obd_export *exp, spin_lock(&mod->mod_open_req->rq_lock); if (mod->mod_och) - mod->mod_och->och_open_handle.cookie = 0; + mod->mod_och->och_fh.cookie = 0; mod->mod_open_req->rq_early_free_repbuf = 0; spin_unlock(&mod->mod_open_req->rq_lock); mdc_free_open(mod); @@ -856,35 +760,23 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct req_format *req_fmt; - size_t u32_count = 0; int rc; int saved_rc = 0; ENTRY; - CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1), - op_data->op_bias); - - if (op_data->op_bias & MDS_CLOSE_INTENT) { - req_fmt = &RQF_MDS_CLOSE_INTENT; - if (op_data->op_bias & MDS_HSM_RELEASE) { - /* allocate a FID for volatile file */ - rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, - op_data); - if (rc < 0) { - CERROR("%s: "DFID" allocating FID: rc = %d\n", - obd->obd_name, PFID(&op_data->op_fid1), - rc); - /* save the errcode and proceed to close */ - saved_rc = rc; - } - } - if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) { - size_t count = op_data->op_data_size / sizeof(__u32); + if (op_data->op_bias & MDS_HSM_RELEASE) { + req_fmt = &RQF_MDS_INTENT_CLOSE; - if (count > INLINE_RESYNC_ARRAY_SIZE) - u32_count = count; + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("%s: "DFID" failed to allocate FID: %d\n", + obd->obd_name, PFID(&op_data->op_fid1), rc); + /* save the errcode and proceed to close */ + saved_rc = rc; } + } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) { + req_fmt = &RQF_MDS_INTENT_CLOSE; } else { req_fmt = &RQF_MDS_CLOSE; } @@ -922,10 +814,6 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, GOTO(out, rc = -ENOMEM); } - if (u32_count > 0) - req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, - u32_count * sizeof(__u32)); - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, 
MDS_CLOSE); if (rc) { ptlrpc_request_free(req); @@ -939,9 +827,6 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, req->rq_request_portal = MDS_READPAGE_PORTAL; ptlrpc_at_set_req_timeout(req); - if (!(exp_connect_flags2(exp) & OBD_CONNECT2_LSOM)) - op_data->op_xvalid &= ~(OP_XVALID_LAZYSIZE | - OP_XVALID_LAZYBLOCKS); mdc_close_pack(req, op_data); @@ -1225,12 +1110,12 @@ static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) int i; for (i = 0; i < cfs_pgs; i++) { - struct lu_dirpage *dp = kmap(pages[i]); - struct lu_dirpage *first = dp; - struct lu_dirent *end_dirent = NULL; - struct lu_dirent *ent; - __u64 hash_end = dp->ldp_hash_end; - __u32 flags = dp->ldp_flags; + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = le64_to_cpu(dp->ldp_hash_end); + __u32 flags = le32_to_cpu(dp->ldp_flags); while (--lu_pgs > 0) { ent = lu_dirent_start(dp); @@ -1245,8 +1130,8 @@ static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) break; /* Save the hash and flags of this lu_dirpage. */ - hash_end = dp->ldp_hash_end; - flags = dp->ldp_flags; + hash_end = le64_to_cpu(dp->ldp_hash_end); + flags = le32_to_cpu(dp->ldp_flags); /* Check if lu_dirpage contains no entries. */ if (end_dirent == NULL) @@ -1544,48 +1429,33 @@ static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, goto out_unlock; } + static int mdc_statfs(const struct lu_env *env, struct obd_export *exp, struct obd_statfs *osfs, - time64_t max_age, __u32 flags) + __u64 max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct req_format *fmt; - struct ptlrpc_request *req; - struct obd_statfs *msfs; - struct obd_import *imp = NULL; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + ENTRY; /* * Since the request might also come from lprocfs, so we need * sync this with client_disconnect_export Bug15684 */ down_read(&obd->u.cli.cl_sem); - if (obd->u.cli.cl_import) - imp = class_import_get(obd->u.cli.cl_import); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); up_read(&obd->u.cli.cl_sem); - if (!imp) - RETURN(-ENODEV); - - fmt = &RQF_MDS_STATFS; - if ((exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS) && - (flags & OBD_STATFS_SUM)) - fmt = &RQF_MDS_STATFS_NEW; - req = ptlrpc_request_alloc_pack(imp, fmt, LUSTRE_MDS_VERSION, - MDS_STATFS); - if (req == NULL) - GOTO(output, rc = -ENOMEM); + if (!imp) + RETURN(-ENODEV); - if ((flags & OBD_STATFS_SUM) && - (exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS)) { - /* request aggregated states */ - struct mdt_body *body; - - body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EPROTO); - body->mbo_valid = OBD_MD_FLAGSTATFS; - } + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); ptlrpc_request_set_replen(req); @@ -1701,53 +1571,29 @@ static int mdc_ioc_hsm_progress(struct obd_export *exp, ptlrpc_req_finished(req); return rc; } -/** - * Send hsm_ct_register to MDS - * - * \param[in] imp import - * \param[in] archive_count if in bitmap format, it is the bitmap, - * else it is the count of archive_ids - * \param[in] archives if in bitmap format, it is NULL, - * else it is archive_id lists - */ -static int 
mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archive_count, - __u32 *archives) + +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) { - struct ptlrpc_request *req; - __u32 *archive_array; - size_t archives_size; - int rc; + __u32 *archive_mask; + struct ptlrpc_request *req; + int rc; ENTRY; - req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_CT_REGISTER); + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_REGISTER); if (req == NULL) - RETURN(-ENOMEM); - - if (archives != NULL) - archives_size = sizeof(*archive_array) * archive_count; - else - archives_size = sizeof(archive_count); - - req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE, - RCL_CLIENT, archives_size); - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_CT_REGISTER); - if (rc) { - ptlrpc_request_free(req); - RETURN(-ENOMEM); - } + GOTO(out, rc = -ENOMEM); mdc_pack_body(req, NULL, 0, 0, -1, 0); - archive_array = req_capsule_client_get(&req->rq_pill, - &RMF_MDS_HSM_ARCHIVE); - if (archive_array == NULL) + /* Copy hsm_progress struct */ + archive_mask = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_mask == NULL) GOTO(out, rc = -EPROTO); - if (archives != NULL) - memcpy(archive_array, archives, archives_size); - else - *archive_array = archive_count; + *archive_mask = archives; ptlrpc_request_set_replen(req); @@ -2131,7 +1977,7 @@ static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, GOTO(out, rc = -EFAULT); rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), 0); if (rc != 0) GOTO(out, rc); @@ -2215,13 +2061,6 @@ static int mdc_get_info_rpc(struct obd_export *exp, RCL_SERVER, vallen); ptlrpc_request_set_replen(req); - /* if server failed to resolve FID, and OI scrub not able to fix it, it - * will return -EINPROGRESS, ptlrpc_queue_wait() will keep retrying, - * set request interruptible to avoid deadlock. 
- */ - if (KEY_IS(KEY_FID2PATH)) - req->rq_allow_intr = 1; - rc = ptlrpc_queue_wait(req); /* -EREMOTE means the get_info result is partial, and it needs to * continue on another MDT, see fid2path part in lmv_iocontrol */ @@ -2275,8 +2114,9 @@ static void lustre_swab_kuch(struct kuc_hdr *l) static int mdc_ioc_hsm_ct_start(struct obd_export *exp, struct lustre_kernelcomm *lk) { - struct obd_import *imp = class_exp2cliimp(exp); - int rc = 0; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 archive = lk->lk_data; + int rc = 0; if (lk->lk_group != KUC_GRP_HSM) { CERROR("Bad copytool group %d\n", lk->lk_group); @@ -2290,12 +2130,7 @@ static int mdc_ioc_hsm_ct_start(struct obd_export *exp, /* Unregister with the coordinator */ rc = mdc_ioc_hsm_ct_unregister(imp); } else { - __u32 *archives = NULL; - - if ((lk->lk_flags & LK_FLG_DATANR) && lk->lk_data_count > 0) - archives = lk->lk_data; - - rc = mdc_ioc_hsm_ct_register(imp, lk->lk_data_count, archives); + rc = mdc_ioc_hsm_ct_register(imp, archive); } return rc; @@ -2346,29 +2181,17 @@ static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, */ static int mdc_hsm_ct_reregister(void *data, void *cb_arg) { - struct obd_import *imp = (struct obd_import *)cb_arg; - struct kkuc_ct_data *kcd = data; - __u32 *archives = NULL; - int rc; + struct kkuc_ct_data *kcd = data; + struct obd_import *imp = (struct obd_import *)cb_arg; + int rc; - if (kcd == NULL || - (kcd->kcd_magic != KKUC_CT_DATA_ARRAY_MAGIC && - kcd->kcd_magic != KKUC_CT_DATA_BITMAP_MAGIC)) + if (kcd == NULL || kcd->kcd_magic != KKUC_CT_DATA_MAGIC) return -EPROTO; - if (kcd->kcd_magic == KKUC_CT_DATA_BITMAP_MAGIC) { - CDEBUG(D_HA, "%s: recover copytool registration to MDT " - "(archive=%#x)\n", imp->imp_obd->obd_name, - kcd->kcd_nr_archives); - } else { - CDEBUG(D_HA, "%s: recover copytool registration to MDT " - "(archive nr = %u)\n", - imp->imp_obd->obd_name, kcd->kcd_nr_archives); - if (kcd->kcd_nr_archives != 0) - archives = kcd->kcd_archives; - } + CDEBUG(D_HA, "%s: recover copytool registration to MDT (archive=%#x)\n", + imp->imp_obd->obd_name, kcd->kcd_archive); + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_archive); - rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_nr_archives, archives); /* ignore error if the copytool is already registered */ return (rc == -EEXIST) ? 
0 : rc; } @@ -2414,6 +2237,14 @@ static int mdc_set_info_async(const struct lu_env *env, keylen, key, vallen, val, set); RETURN(rc); } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(exp->exp_obd); + RETURN(0); + } + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } if (KEY_IS(KEY_CHANGELOG_CLEAR)) { rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, keylen, key, vallen, val, set); @@ -2432,8 +2263,8 @@ static int mdc_set_info_async(const struct lu_env *env, RETURN(0); } - rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set); - RETURN(rc); + CERROR("Unknown key %s\n", (char *)key); + RETURN(-EINVAL); } static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, @@ -2509,97 +2340,17 @@ static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } -struct mdc_rmfid_args { - int *mra_rcs; - int mra_nr; -}; - -int mdc_rmfid_interpret(const struct lu_env *env, struct ptlrpc_request *req, - void *args, int rc) -{ - struct mdc_rmfid_args *aa; - int *rcs, size; - ENTRY; - - if (!rc) { - aa = ptlrpc_req_async_args(req); - - size = req_capsule_get_size(&req->rq_pill, &RMF_RCS, - RCL_SERVER); - LASSERT(size == sizeof(int) * aa->mra_nr); - rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); - LASSERT(rcs); - LASSERT(aa->mra_rcs); - LASSERT(aa->mra_nr); - memcpy(aa->mra_rcs, rcs, size); - } - - RETURN(rc); -} - -static int mdc_rmfid(struct obd_export *exp, struct fid_array *fa, - int *rcs, struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req; - struct mdc_rmfid_args *aa; - struct mdt_body *b; - struct lu_fid *tmp; - int rc, flen; - ENTRY; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_RMFID); - if (req == NULL) - RETURN(-ENOMEM); - - flen = fa->fa_nr * sizeof(struct lu_fid); - req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, - RCL_CLIENT, flen); - req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, - RCL_SERVER, flen); - req_capsule_set_size(&req->rq_pill, &RMF_RCS, - RCL_SERVER, fa->fa_nr * sizeof(__u32)); - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_RMFID); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - tmp = req_capsule_client_get(&req->rq_pill, &RMF_FID_ARRAY); - memcpy(tmp, fa->fa_fids, flen); - - mdc_pack_body(req, NULL, 0, 0, -1, 0); - b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); - b->mbo_ctime = ktime_get_real_seconds(); - - ptlrpc_request_set_replen(req); - - LASSERT(rcs); - aa = ptlrpc_req_async_args(req); - aa->mra_rcs = rcs; - aa->mra_nr = fa->fa_nr; - req->rq_interpret_reply = mdc_rmfid_interpret; - - ptlrpc_set_add_req(set, req); - ptlrpc_check_set(NULL, set); - - RETURN(rc); -} - static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, enum obd_import_event event) { - struct client_obd *cli = &obd->u.cli; int rc = 0; LASSERT(imp->imp_obd == obd); switch (event) { - case IMP_EVENT_DISCON: - spin_lock(&cli->cl_loi_list_lock); - cli->cl_avail_grant = 0; - cli->cl_lost_grant = 0; - spin_unlock(&cli->cl_loi_list_lock); - break; - case IMP_EVENT_INACTIVE: + + case IMP_EVENT_INACTIVE: { + struct client_obd *cli = &obd->u.cli; /* * Flush current sequence to make client obtain new one * from server in case of disconnect/reconnect. 
@@ -2611,28 +2362,12 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); break; + } case IMP_EVENT_INVALIDATE: { struct ldlm_namespace *ns = obd->obd_namespace; - struct lu_env *env; - __u16 refcheck; ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - /* Reset grants. All pages go to failing rpcs due to - * the invalid import. - */ - osc_io_unplug(env, cli, NULL); - - cfs_hash_for_each_nolock(ns->ns_rs_hash, - osc_ldlm_resource_invalidate, - env, 0); - cl_env_put(env, &refcheck); - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - } else { - rc = PTR_ERR(env); - } break; } case IMP_EVENT_ACTIVE: @@ -2641,15 +2376,10 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, if (rc == 0) rc = mdc_kuc_reregister(imp); break; - case IMP_EVENT_OCD: { - struct obd_connect_data *ocd = &imp->imp_connect_data; - - if (OCD_HAS_FLAG(ocd, GRANT)) - osc_init_grant(cli, ocd); - + case IMP_EVENT_OCD: rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); break; - } + case IMP_EVENT_DISCON: case IMP_EVENT_DEACTIVATE: case IMP_EVENT_ACTIVATE: break; @@ -2698,12 +2428,6 @@ static int mdc_cancel_weight(struct ldlm_lock *lock) if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) RETURN(0); - /* Special case for DoM locks, cancel only unused and granted locks */ - if (ldlm_has_dom(lock) && - (lock->l_granted_mode != lock->l_req_mode || - osc_ldlm_weigh_ast(lock) != 0)) - RETURN(0); - RETURN(1); } @@ -2752,21 +2476,25 @@ static void mdc_llog_finish(struct obd_device *obd) EXIT; } -int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) { - int rc; - + int rc; ENTRY; - rc = osc_setup_common(obd, cfg); + rc = ptlrpcd_addref(); if (rc < 0) RETURN(rc); - rc = mdc_tunables_init(obd); - if (rc) - GOTO(err_osc_cleanup, rc); - - obd->u.cli.cl_dom_min_inline_repsize = MDC_DOM_DEF_INLINE_REPSIZE; + rc = client_obd_setup(obd, cfg); + if (rc) + GOTO(err_ptlrpcd_decref, rc); +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_mdc_obd_vars; + lprocfs_obd_setup(obd); + lprocfs_alloc_md_stats(obd, 0); +#endif + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); ns_register_cancel(obd->obd_namespace, mdc_cancel_weight); @@ -2776,26 +2504,25 @@ int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) if (rc) { CERROR("%s: failed to setup llogging subsystems: rc = %d\n", obd->obd_name, rc); - GOTO(err_llog_cleanup, rc); + GOTO(err_mdc_cleanup, rc); } rc = mdc_changelog_cdev_init(obd); if (rc) { CERROR("%s: failed to setup changelog char device: rc = %d\n", obd->obd_name, rc); - GOTO(err_changelog_cleanup, rc); + GOTO(err_mdc_cleanup, rc); } - RETURN(rc); + EXIT; +err_mdc_cleanup: + if (rc) + client_obd_cleanup(obd); -err_changelog_cleanup: - mdc_llog_finish(obd); -err_llog_cleanup: - lprocfs_free_md_stats(obd); - ptlrpc_lprocfs_unregister_obd(obd); -err_osc_cleanup: - osc_cleanup_common(obd); - return rc; +err_ptlrpcd_decref: + if (rc) + ptlrpcd_decref(); + return rc; } /* Initialize the default and maximum LOV EA sizes. 
This allows @@ -2826,7 +2553,6 @@ static int mdc_precleanup(struct obd_device *obd) { ENTRY; - osc_precleanup_common(obd); mdc_changelog_cdev_finish(obd); obd_cleanup_client_import(obd); @@ -2838,16 +2564,16 @@ static int mdc_precleanup(struct obd_device *obd) static int mdc_cleanup(struct obd_device *obd) { - return osc_cleanup_common(obd); + ptlrpcd_decref(); + + return client_obd_cleanup(obd); } -int mdc_process_config(struct obd_device *obd, size_t len, void *buf) +static int mdc_process_config(struct obd_device *obd, size_t len, void *buf) { - struct lustre_cfg *lcfg = buf; - size_t count = class_modify_config(lcfg, PARAM_MDC, - &obd->obd_kset.kobj); - - return count > 0 ? 0 : count; + struct lustre_cfg *lcfg = buf; + int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd); + return (rc > 0 ? 0: rc); } static struct obd_ops mdc_obd_ops = { @@ -2858,8 +2584,7 @@ static struct obd_ops mdc_obd_ops = { .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, - .o_reconnect = osc_reconnect, - .o_disconnect = osc_disconnect, + .o_disconnect = client_disconnect_export, .o_iocontrol = mdc_iocontrol, .o_set_info_async = mdc_set_info_async, .o_statfs = mdc_statfs, @@ -2875,69 +2600,42 @@ static struct obd_ops mdc_obd_ops = { static struct md_ops mdc_md_ops = { .m_get_root = mdc_get_root, - .m_null_inode = mdc_null_inode, - .m_close = mdc_close, - .m_create = mdc_create, - .m_enqueue = mdc_enqueue, - .m_getattr = mdc_getattr, - .m_getattr_name = mdc_getattr_name, - .m_intent_lock = mdc_intent_lock, - .m_link = mdc_link, - .m_rename = mdc_rename, - .m_setattr = mdc_setattr, - .m_setxattr = mdc_setxattr, - .m_getxattr = mdc_getxattr, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, .m_fsync = mdc_fsync, - .m_file_resync = mdc_file_resync, .m_read_page = mdc_read_page, - .m_unlink = mdc_unlink, - .m_cancel_unused = mdc_cancel_unused, - .m_init_ea_size = mdc_init_ea_size, - .m_set_lock_data = mdc_set_lock_data, - .m_lock_match = mdc_lock_match, - .m_get_lustre_md = mdc_get_lustre_md, - .m_free_lustre_md = mdc_free_lustre_md, - .m_set_open_replay_data = mdc_set_open_replay_data, - .m_clear_open_replay_data = mdc_clear_open_replay_data, - .m_intent_getattr_async = mdc_intent_getattr_async, - .m_revalidate_lock = mdc_revalidate_lock, - .m_rmfid = mdc_rmfid, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock }; -dev_t mdc_changelog_dev; -struct class *mdc_changelog_class; static int __init mdc_init(void) { - int rc = 0; - rc = alloc_chrdev_region(&mdc_changelog_dev, 0, - MDC_CHANGELOG_DEV_COUNT, - MDC_CHANGELOG_DEV_NAME); - if (rc) - return rc; - - mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); - if (IS_ERR(mdc_changelog_class)) { - rc = PTR_ERR(mdc_changelog_class); - goto out_dev; - } - - rc = 
class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, - LUSTRE_MDC_NAME, &mdc_device_type); - if (rc) - goto out_dev; - - return 0; - -out_dev: - unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); - return rc; + return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, + LUSTRE_MDC_NAME, NULL); } static void __exit mdc_exit(void) { - class_destroy(mdc_changelog_class); - unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); - class_unregister_type(LUSTRE_MDC_NAME); + class_unregister_type(LUSTRE_MDC_NAME); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c index f277d3e489e70..ab1985d9d9d24 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,26 +39,33 @@ #ifdef CONFIG_PROC_FS -LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); +LPROC_SEQ_FOPS_RO_TYPE(mgc, import); +LPROC_SEQ_FOPS_RO_TYPE(mgc, state); -LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, server_uuid); - -LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, import); - -LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, state); +LPROC_SEQ_FOPS_WO_TYPE(mgc, ping); static int mgc_ir_state_seq_show(struct seq_file *m, void *v) { return lprocfs_mgc_rd_ir_state(m, m->private); } +LPROC_SEQ_FOPS_RO(mgc_ir_state); -LDEBUGFS_SEQ_FOPS_RO(mgc_ir_state); - -struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { +struct lprocfs_vars lprocfs_mgc_obd_vars[] = { + { .name = "uuid", + .fops = &mgc_uuid_fops }, + { .name = "ping", + .fops = &mgc_ping_fops, + .proc_mode = 0222 }, { .name = "connect_flags", .fops = &mgc_connect_flags_fops }, { .name = "mgs_server_uuid", .fops = &mgc_server_uuid_fops }, + { .name = "mgs_conn_uuid", + .fops = &mgc_conn_uuid_fops }, { .name = "import", .fops = &mgc_import_fops }, { .name = "state", @@ -68,28 +75,3 @@ struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { { NULL } }; #endif /* CONFIG_PROC_FS */ - -LUSTRE_ATTR(mgs_conn_uuid, 0444, conn_uuid_show, NULL); -LUSTRE_RO_ATTR(conn_uuid); - -LUSTRE_RW_ATTR(ping); - -static struct attribute *mgc_attrs[] = { - &lustre_attr_mgs_conn_uuid.attr, - &lustre_attr_conn_uuid.attr, - &lustre_attr_ping.attr, - NULL, -}; - -int mgc_tunables_init(struct obd_device *obd) -{ - int rc; - - obd->obd_ktype.default_attrs = mgc_attrs; - obd->obd_debugfs_vars = ldebugfs_mgc_obd_vars; - rc = lprocfs_obd_setup(obd, true); - if (rc) - return rc; - - return sptlrpc_lprocfs_cliobd_attach(obd); -} diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h index 50a13ebf4d3ca..1a37720e901eb 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,13 +34,16 @@ #define _MGC_INTERNAL_H #include +#include #include #include #include #include -int mgc_tunables_init(struct obd_device *obd); +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_mgc_obd_vars[]; int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); +#endif /* CONFIG_PROC_FS */ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c index a495e75ad5b4f..a2a2bdd1f0732 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -118,7 +118,7 @@ EXPORT_SYMBOL(mgc_logname2resid); /********************** config llog list **********************/ static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list); -static DEFINE_SPINLOCK(config_list_lock); /* protects config_llog_list */ +static DEFINE_SPINLOCK(config_list_lock); /* Take a reference to a config log */ static int config_log_get(struct config_llog_data *cld) @@ -170,18 +170,18 @@ static struct config_llog_data *config_log_find(char *logname, struct config_llog_instance *cfg) { - struct config_llog_data *cld; - struct config_llog_data *found = NULL; - unsigned long cfg_instance; + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + void * instance; + ENTRY; - ENTRY; - LASSERT(logname != NULL); + LASSERT(logname != NULL); - cfg_instance = cfg ? cfg->cfg_instance : 0; + instance = cfg ? cfg->cfg_instance : NULL; spin_lock(&config_list_lock); list_for_each_entry(cld, &config_llog_list, cld_list_chain) { - /* check if cfg_instance is the one we want */ - if (cfg_instance != cld->cld_cfg.cfg_instance) + /* check if instance equals */ + if (instance != cld->cld_cfg.cfg_instance) continue; /* instance may be NULL, should check name */ @@ -207,8 +207,8 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, ENTRY; - CDEBUG(D_MGC, "do adding config log %s-%016lx\n", logname, - cfg ? cfg->cfg_instance : 0); + CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, + cfg ? cfg->cfg_instance : NULL); OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); if (!cld) @@ -253,49 +253,47 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, } static struct config_llog_data *config_recover_log_add(struct obd_device *obd, - char *fsname, - struct config_llog_instance *cfg, - struct super_block *sb) + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) { - struct config_llog_instance lcfg = *cfg; - struct lustre_sb_info *lsi = s2lsi(sb); - struct config_llog_data *cld; - char logname[32]; + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; if (IS_OST(lsi)) - return NULL; + return NULL; /* for osp-on-ost, see lustre_start_osp() */ if (IS_MDT(lsi) && lcfg.cfg_instance) return NULL; - /* We have to use different llog for clients and MDTs for DNE, - * where only clients are notified if one of DNE server restarts. 
- */ - LASSERT(strlen(fsname) < sizeof(logname) / 2); - strncpy(logname, fsname, sizeof(logname)); + /* we have to use different llog for clients and mdts for cmd + * where only clients are notified if one of cmd server restarts */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strcpy(logname, fsname); if (IS_SERVER(lsi)) { /* mdt */ - LASSERT(lcfg.cfg_instance == 0); - lcfg.cfg_instance = ll_get_cfg_instance(sb); - strncat(logname, "-mdtir", sizeof(logname)); - } else { - LASSERT(lcfg.cfg_instance != 0); - strncat(logname, "-cliir", sizeof(logname)); - } + LASSERT(lcfg.cfg_instance == NULL); + lcfg.cfg_instance = sb; + strcat(logname, "-mdtir"); + } else { + LASSERT(lcfg.cfg_instance != NULL); + strcat(logname, "-cliir"); + } - cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); - return cld; + cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); + return cld; } static struct config_llog_data *config_log_find_or_add(struct obd_device *obd, char *logname, struct super_block *sb, int type, struct config_llog_instance *cfg) { - struct config_llog_instance lcfg = *cfg; - struct config_llog_data *cld; + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; - /* Note class_config_llog_handler() depends on getting "obd" back */ - lcfg.cfg_instance = sb ? ll_get_cfg_instance(sb) : (unsigned long)obd; + lcfg.cfg_instance = sb != NULL ? (void *)sb : (void *)obd; cld = config_log_find(logname, &lcfg); if (unlikely(cld != NULL)) @@ -325,8 +323,7 @@ config_log_add(struct obd_device *obd, char *logname, bool locked = false; ENTRY; - CDEBUG(D_MGC, "add config log %s-%016lx\n", logname, - cfg->cfg_instance); + CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); /* * for each regular log, the depended sptlrpc log name is @@ -536,15 +533,16 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg) RETURN(rc); } +#ifdef CONFIG_PROC_FS int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) { struct obd_device *obd = data; struct obd_import *imp; struct obd_connect_data *ocd; struct config_llog_data *cld; - ENTRY; - LASSERT(obd); + + LASSERT(obd != NULL); LPROCFS_CLIMP_CHECK(obd); imp = obd->u.cli.cl_import; ocd = &imp->imp_connect_data; @@ -566,6 +564,7 @@ int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) LPROCFS_CLIMP_EXIT(obd); RETURN(0); } +#endif /* reenqueue any lost locks */ #define RQ_RUNNING 0x1 @@ -963,9 +962,11 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(err_cleanup, rc); } - rc = mgc_tunables_init(obd); - if (rc) - GOTO(err_sysfs, rc); +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_mgc_obd_vars; + lprocfs_obd_setup(obd); +#endif + sptlrpc_lprocfs_cliobd_attach(obd); if (atomic_inc_return(&mgc_count) == 1) { rq_state = 0; @@ -978,7 +979,7 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("%s: cannot start requeue thread: rc = %d; " "no more log updates\n", obd->obd_name, rc); - GOTO(err_sysfs, rc); + GOTO(err_cleanup, rc); } /* rc is the task_struct pointer of mgc_requeue_thread. 
*/ rc = 0; @@ -987,8 +988,6 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) RETURN(rc); -err_sysfs: - lprocfs_obd_cleanup(obd); err_cleanup: client_obd_cleanup(obd); err_decref: @@ -1405,34 +1404,34 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, __u64 max_version, void *data, int datalen, bool mne_swab) { - struct config_llog_instance *cfg = &cld->cld_cfg; - struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); - struct mgs_nidtbl_entry *entry; - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs bufs; - u64 prev_version = 0; - char *inst; - char *buf; - int bufsz; - int pos; - int rc = 0; - int off = 0; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; + ENTRY; - ENTRY; - LASSERT(cfg->cfg_instance != 0); - LASSERT(ll_get_cfg_instance(cfg->cfg_sb) == cfg->cfg_instance); + LASSERT(cfg->cfg_instance != NULL); + LASSERT(cfg->cfg_sb == cfg->cfg_instance); OBD_ALLOC(inst, PAGE_SIZE); if (inst == NULL) RETURN(-ENOMEM); if (!IS_SERVER(lsi)) { - pos = snprintf(inst, PAGE_SIZE, "%016lx", cfg->cfg_instance); + pos = snprintf(inst, PAGE_SIZE, "%p", cfg->cfg_instance); if (pos >= PAGE_SIZE) { OBD_FREE(inst, PAGE_SIZE); return -E2BIG; } - } else { + } else { LASSERT(IS_MDT(lsi)); rc = server_name2svname(lsi->lsi_svname, inst, NULL, PAGE_SIZE); @@ -1637,7 +1636,8 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; /* don't need to get local config */ - if (cld_is_nodemap(cld) && LNetIsPeerLocal(mgc_conn->c_peer.nid)) + if (cld_is_nodemap(cld) && + (LNET_NETTYP(LNET_NIDNET(mgc_conn->c_peer.nid)) == LOLND)) GOTO(out, rc = 0); /* allocate buffer for bulk transfer. @@ -1748,8 +1748,15 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, #ifdef HAVE_SERVER_SUPPORT /* config changed since first read RPC */ if (cld_is_nodemap(cld) && config_read_offset == 0) { + recent_nodemap = NULL; + nodemap_config_dealloc(new_config); + new_config = NULL; + CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); - GOTO(out, rc = -EAGAIN); + + /* setting eof to false, we request config again */ + eof = false; + GOTO(out, rc = 0); } #endif if (!eof) @@ -1757,7 +1764,13 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, GOTO(out, rc); } - mne_swab = ptlrpc_rep_need_swab(req); + mne_swab = !!ptlrpc_rep_need_swab(req); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* This import flag means the server did an extra swab of IR MNE + * records (fixed in LU-1252), reverse it here if needed. LU-1644 */ + if (unlikely(req->rq_import->imp_need_mne_swab)) + mne_swab = !mne_swab; +#endif /* When a nodemap config is received, we build a new nodemap config, * with new nodemap structs. 
We keep track of the most recently added @@ -2049,12 +2062,12 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) mutex_lock(&cld->cld_lock); if (cld->cld_stopping) { mutex_unlock(&cld->cld_lock); - RETURN(0); - } + RETURN(0); + } - OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); - CDEBUG(D_MGC, "Process log %s-%016lx from %d\n", cld->cld_logname, + CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); /* Get the cfg lock on the llog */ @@ -2096,11 +2109,6 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) goto restart; } else { mutex_lock(&cld->cld_lock); - /* unlock/lock mutex, so check stopping again */ - if (cld->cld_stopping) { - mutex_unlock(&cld->cld_lock); - RETURN(0); - } spin_lock(&config_list_lock); cld->cld_lostlock = 1; spin_unlock(&config_list_lock); @@ -2146,12 +2154,6 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) CERROR("Can't drop cfg lock: %d\n", rcl); } - /* requeue nodemap lock immediately if transfer was interrupted */ - if (cld_is_nodemap(cld) && rc == -EAGAIN) { - mgc_requeue_add(cld); - rc = 0; - } - RETURN(rc); } @@ -2210,6 +2212,11 @@ static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) break; } + /* COMPAT_146 */ + /* FIXME only set this for old logs! Right now this forces + us to always skip the "inside markers" check */ + cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; + rc = mgc_process_log(obd, cld); if (rc == 0 && cld->cld_recover != NULL) { if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> @@ -2280,7 +2287,7 @@ static struct obd_ops mgc_obd_ops = { static int __init mgc_init(void) { - return class_register_type(&mgc_obd_ops, NULL, false, NULL, + return class_register_type(&mgc_obd_ops, NULL, true, NULL, LUSTRE_MGC_NAME, NULL); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile index 6f470dd9a2fc0..57450ea2824c1 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/Makefile +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -1,14 +1,16 @@ obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o -obdclass-y := llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o -obdclass-y += class_obd.o debug.o genops.o llog_ioctl.o +obdclass-linux-objs := linux-module.o linux-obdo.o linux-sysctl.o +obdclass-linux-objs := $(addprefix linux/,$(obdclass-linux-objs)) + +obdclass-y := $(obdclass-linux-objs) +obdclass-y += llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o debug.o genops.o uuid.o llog_ioctl.o obdclass-y += lprocfs_status.o lprocfs_counters.o obdclass-y += lustre_handles.o lustre_peer.o local_storage.o -obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obd_sysfs.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obdclass-y += lu_object.o dt_object.o obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o -obdclass-y += linkea.o -obdclass-y += kernelcomm.o jobid.o -obdclass-y += integrity.o obd_cksum.o +obdclass-y += linkea.o kernelcomm.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/acl.c b/drivers/staging/lustrefsx/lustre/obdclass/acl.c index 599946f846ec3..77ea22644e27b 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/acl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/acl.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,22 +49,20 @@ #ifdef CONFIG_FS_POSIX_ACL static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, - posix_acl_xattr_entry *s) + posix_acl_xattr_entry *s) { - d->e_tag = le16_to_cpu(s->e_tag); - d->e_perm = le16_to_cpu(s->e_perm); - d->e_id = le32_to_cpu(s->e_id); + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); } -#if 0 -static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, - posix_acl_xattr_entry *s) +/*static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) { - d->e_tag = cpu_to_le16(s->e_tag); - d->e_perm = cpu_to_le16(s->e_perm); - d->e_id = cpu_to_le32(s->e_id); -} -#endif + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +}*/ /* * Check permission based on POSIX ACL. @@ -73,79 +71,80 @@ int lustre_posix_acl_permission(struct lu_ucred *mu, const struct lu_attr *la, int want, posix_acl_xattr_entry *entry, int count) { - posix_acl_xattr_entry *pa, *pe, *mask_obj; - posix_acl_xattr_entry ae, me; - int found = 0; + posix_acl_xattr_entry *pa, *pe, *mask_obj; + posix_acl_xattr_entry ae, me; + int found = 0; - if (count <= 0) - return -EACCES; + if (count <= 0) + return -EACCES; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { - lustre_posix_acl_le_to_cpu(&ae, pa); - switch (ae.e_tag) { - case ACL_USER_OBJ: - /* (May have been checked already) */ + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ if (la->la_uid == mu->uc_fsuid) goto check_perm; - break; - case ACL_USER: + break; + case ACL_USER: if (ae.e_id == mu->uc_fsuid) goto mask; - break; - case ACL_GROUP_OBJ: - if (lustre_in_group_p(mu, la->la_gid)) { - found = 1; - if ((ae.e_perm & want) == want) - goto mask; - } - break; - case ACL_GROUP: - if (lustre_in_group_p(mu, ae.e_id)) { - found = 1; - if ((ae.e_perm & want) == want) - goto mask; - } - break; - case ACL_MASK: - break; - case ACL_OTHER: - if (found) - return -EACCES; - goto check_perm; - default: - return -EIO; -} - } - return -EIO; + break; + case ACL_GROUP_OBJ: + if (lustre_in_group_p(mu, la->la_gid)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (lustre_in_group_p(mu, ae.e_id)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + else + goto check_perm; + default: + return -EIO; + } + } + return -EIO; mask: - for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { - lustre_posix_acl_le_to_cpu(&me, mask_obj); - if (me.e_tag == ACL_MASK) { - if ((ae.e_perm & me.e_perm & want) == want) - return 0; + for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { + lustre_posix_acl_le_to_cpu(&me, mask_obj); + if (me.e_tag == ACL_MASK) { + if ((ae.e_perm & me.e_perm & want) == want) + return 0; - return -EACCES; - } - } + return -EACCES; + } + } check_perm: - if ((ae.e_perm & want) == want) - return 0; + if ((ae.e_perm & want) == want) + return 0; - return -EACCES; + return -EACCES; } EXPORT_SYMBOL(lustre_posix_acl_permission); /* * Modify the ACL for the chmod. 
*/ -int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, u32 mode, - int count) +int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, __u32 mode, + int count) { posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { switch (le16_to_cpu(pa->e_tag)) { case ACL_USER_OBJ: pa->e_perm = cpu_to_le16((mode & S_IRWXU) >> 6); @@ -188,8 +187,8 @@ lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, int count) { posix_acl_xattr_entry *pa, *pe; - mode_t mode = 0; - int not_equiv = 0; + mode_t mode = 0; + int not_equiv = 0; for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { __u16 perm = le16_to_cpu(pa->e_perm); @@ -225,19 +224,19 @@ EXPORT_SYMBOL(lustre_posix_acl_equiv_mode); /* * Modify acl when creating a new object. */ -int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, u32 *pmode, - int count) +int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, __u32 *pmode, + int count) { - posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; - posix_acl_xattr_entry ae; - u32 mode = *pmode; + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + posix_acl_xattr_entry ae; + __u32 mode = *pmode; int not_equiv = 0; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { - lustre_posix_acl_le_to_cpu(&ae, pa); - switch (ae.e_tag) { - case ACL_USER_OBJ: - ae.e_perm &= (mode >> 6) | ~(0007); + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + ae.e_perm &= (mode >> 6) | ~S_IRWXO; pa->e_perm = cpu_to_le16(ae.e_perm); mode &= (ae.e_perm << 6) | ~S_IRWXU; break; @@ -245,39 +244,39 @@ int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, u32 *pmode, case ACL_GROUP: not_equiv = 1; break; - case ACL_GROUP_OBJ: + case ACL_GROUP_OBJ: group_obj = pa; - break; - case ACL_OTHER: - ae.e_perm &= mode | ~(0007); + break; + case ACL_OTHER: + ae.e_perm &= mode | ~S_IRWXO; pa->e_perm = cpu_to_le16(ae.e_perm); - mode &= ae.e_perm | ~(0007); - break; - case ACL_MASK: + mode &= ae.e_perm | ~S_IRWXO; + break; + case ACL_MASK: mask_obj = pa; not_equiv = 1; - break; + break; default: return -EIO; - } - } + } + } if (mask_obj) { ae.e_perm = le16_to_cpu(mask_obj->e_perm) & - ((mode >> 3) | ~(0007)); + ((mode >> 3) | ~S_IRWXO); mode &= (ae.e_perm << 3) | ~S_IRWXG; - mask_obj->e_perm = cpu_to_le16(ae.e_perm); + mask_obj->e_perm = cpu_to_le16(ae.e_perm); } else { if (!group_obj) return -EIO; ae.e_perm = le16_to_cpu(group_obj->e_perm) & - ((mode >> 3) | ~(0007)); + ((mode >> 3) | ~S_IRWXO); mode &= (ae.e_perm << 3) | ~S_IRWXG; - group_obj->e_perm = cpu_to_le16(ae.e_perm); + group_obj->e_perm = cpu_to_le16(ae.e_perm); } *pmode = (*pmode & ~S_IRWXUGO) | mode; - return not_equiv; + return not_equiv; } EXPORT_SYMBOL(lustre_posix_acl_create_masq); #endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h index 0c1276deb37bc..0f95caf310755 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c index 181ef89299b2d..fc22b2c89f17d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,7 @@ #include #include #include "cl_internal.h" +#include /***************************************************************************** * @@ -121,7 +122,6 @@ void cl_io_fini(const struct lu_env *env, struct cl_io *io) /* Check ignore layout change conf */ LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, !io->ci_need_restart)); - case CIT_GLIMPSE: break; case CIT_LADVISE: break; @@ -188,12 +188,9 @@ EXPORT_SYMBOL(cl_io_sub_init); int cl_io_init(const struct lu_env *env, struct cl_io *io, enum cl_io_type iot, struct cl_object *obj) { - LASSERT(obj == cl_object_top(obj)); + LASSERT(obj == cl_object_top(obj)); - /* clear I/O restart from previous instance */ - io->ci_need_restart = 0; - - return cl_io_init0(env, io, iot, obj); + return cl_io_init0(env, io, iot, obj); } EXPORT_SYMBOL(cl_io_init); @@ -203,24 +200,33 @@ EXPORT_SYMBOL(cl_io_init); * \pre iot == CIT_READ || iot == CIT_WRITE */ int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, loff_t pos, size_t count) + enum cl_io_type iot, loff_t pos, size_t count) { LINVRNT(iot == CIT_READ || iot == CIT_WRITE); LINVRNT(io->ci_obj != NULL); ENTRY; + if (cfs_ptengine_weight(cl_io_engine) < 2) + io->ci_pio = 0; + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, - "io range: %u [%llu, %llu) %u %u\n", - iot, (__u64)pos, (__u64)pos + count, - io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); - io->u.ci_rw.crw_pos = pos; - io->u.ci_rw.crw_count = count; + "io %s range: [%llu, %llu) %s %s %s %s\n", + iot == CIT_READ ? "read" : "write", + pos, pos + count, + io->u.ci_rw.rw_nonblock ? "nonblock" : "block", + io->u.ci_rw.rw_append ? "append" : "-", + io->u.ci_rw.rw_sync ? "sync" : "-", + io->ci_pio ? "pio" : "-"); + + io->u.ci_rw.rw_range.cir_pos = pos; + io->u.ci_rw.rw_range.cir_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); } EXPORT_SYMBOL(cl_io_rw_init); static int cl_lock_descr_sort(const struct cl_lock_descr *d0, - const struct cl_lock_descr *d1) + const struct cl_lock_descr *d1) { return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), lu_object_fid(&d1->cld_obj->co_lu)); @@ -464,25 +470,25 @@ EXPORT_SYMBOL(cl_io_iter_fini); */ void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) { - const struct cl_io_slice *scan; + const struct cl_io_slice *scan; - ENTRY; + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || - nob == 0); - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(cl_io_invariant(io)); + ENTRY; - io->u.ci_rw.crw_pos += nob; - io->u.ci_rw.crw_count -= nob; + io->u.ci_rw.rw_range.cir_pos += nob; + io->u.ci_rw.rw_range.cir_count -= nob; - /* layers have to be notified. */ + /* layers have to be notified. 
*/ list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) scan->cis_iop->op[io->ci_type].cio_advance(env, scan, nob); } - EXIT; + EXIT; } /** @@ -681,7 +687,6 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; struct cl_page *pg; int rc; - ENTRY; cl_page_list_for_each(pg, &queue->c2_qin) { LASSERT(pg->cp_sync_io == NULL); @@ -710,7 +715,7 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, cl_page_list_for_each(pg, &queue->c2_qin) pg->cp_sync_io = NULL; } - RETURN(rc); + return rc; } EXPORT_SYMBOL(cl_io_submit_sync); @@ -733,6 +738,53 @@ int cl_io_cancel(const struct lu_env *env, struct cl_io *io, return result; } +static +struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) +{ + struct cl_io_pt *pt; + int rc; + + OBD_ALLOC(pt, sizeof(*pt)); + if (pt == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + pt->cip_next = NULL; + init_sync_kiocb(&pt->cip_iocb, io->u.ci_rw.rw_file); + pt->cip_iocb.ki_pos = pos; +#ifdef HAVE_KIOCB_KI_LEFT + pt->cip_iocb.ki_left = count; +#elif defined(HAVE_KI_NBYTES) + pt->cip_iocb.ki_nbytes = count; +#endif + pt->cip_iter = io->u.ci_rw.rw_iter; + iov_iter_truncate(&pt->cip_iter, count); + pt->cip_file = io->u.ci_rw.rw_file; + pt->cip_iot = io->ci_type; + pt->cip_pos = pos; + pt->cip_count = count; + pt->cip_result = 0; + + rc = cfs_ptask_init(&pt->cip_task, io->u.ci_rw.rw_ptask, pt, + PTF_ORDERED | PTF_COMPLETE | + PTF_USER_MM | PTF_RETRY, smp_processor_id()); + if (rc) + GOTO(out_error, rc); + + CDEBUG(D_VFSTRACE, "submit %s range: [%llu, %llu)\n", + io->ci_type == CIT_READ ? "read" : "write", + pos, pos + count); + + rc = cfs_ptask_submit(&pt->cip_task, cl_io_engine); + if (rc) + GOTO(out_error, rc); + + RETURN(pt); + +out_error: + OBD_FREE(pt, sizeof(*pt)); + RETURN(ERR_PTR(rc)); +} + /** * Main io loop. * @@ -754,50 +806,124 @@ int cl_io_cancel(const struct lu_env *env, struct cl_io *io, */ int cl_io_loop(const struct lu_env *env, struct cl_io *io) { - int result = 0; + struct cl_io_pt *pt = NULL, *head = NULL; + struct cl_io_pt **tail = &head; + loff_t pos; + size_t count; + size_t last_chunk_count = 0; + bool short_io = false; + int rc = 0; + ENTRY; LINVRNT(cl_io_is_loopable(io)); - ENTRY; do { - size_t nob; - io->ci_continue = 0; - result = cl_io_iter_init(env, io); - if (result == 0) { - nob = io->ci_nob; - result = cl_io_lock(env, io); - if (result == 0) { - /* - * Notify layers that locks has been taken, - * and do actual i/o. - * - * - llite: kms, short read; - * - llite: generic_file_read(); - */ - result = cl_io_start(env, io); - /* - * Send any remaining pending - * io, etc. - * - ** - llite: ll_rw_stats_tally. - */ - cl_io_end(env, io); - cl_io_unlock(env, io); - cl_io_rw_advance(env, io, io->ci_nob - nob); + + rc = cl_io_iter_init(env, io); + if (rc) { + cl_io_iter_fini(env, io); + break; + } + + pos = io->u.ci_rw.rw_range.cir_pos; + count = io->u.ci_rw.rw_range.cir_count; + + if (io->ci_pio) { + /* submit this range for parallel execution */ + pt = cl_io_submit_pt(io, pos, count); + if (IS_ERR(pt)) { + cl_io_iter_fini(env, io); + rc = PTR_ERR(pt); + break; + } + + *tail = pt; + tail = &pt->cip_next; + } else { + size_t nob = io->ci_nob; + + CDEBUG(D_VFSTRACE, + "execute type %u range: [%llu, %llu) nob: %zu %s\n", + io->ci_type, pos, pos + count, nob, + io->ci_continue ? 
"continue" : "stop"); + + rc = cl_io_lock(env, io); + if (rc) { + cl_io_iter_fini(env, io); + break; } + + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + rc = cl_io_start(env, io); + + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + + count = io->ci_nob - nob; + last_chunk_count = count; } - cl_io_iter_fini(env, io); - } while (result == 0 && io->ci_continue); - if (result == -EWOULDBLOCK && io->ci_ndelay) { - io->ci_need_restart = 1; - result = 0; + cl_io_rw_advance(env, io, count); + cl_io_iter_fini(env, io); + } while (!rc && io->ci_continue); + + CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n", + io->ci_type, io->ci_nob, rc, + io->ci_continue ? "continue" : "stop"); + + while (head != NULL) { + int rc2; + + pt = head; + head = head->cip_next; + + rc2 = cfs_ptask_wait_for(&pt->cip_task); + LASSERTF(!rc2, "wait for task error: %d\n", rc2); + + rc2 = cfs_ptask_result(&pt->cip_task); + CDEBUG(D_VFSTRACE, + "done %s range: [%llu, %llu) ret: %zd, rc: %d\n", + pt->cip_iot == CIT_READ ? "read" : "write", + pt->cip_pos, pt->cip_pos + pt->cip_count, + pt->cip_result, rc2); + if (rc2) + rc = rc ? rc : rc2; + if (!short_io) { + if (!rc2) /* IO is done by this task successfully */ + io->ci_nob += pt->cip_result; + if (pt->cip_result < pt->cip_count) { + /* short IO happened. + * Not necessary to be an error */ + CDEBUG(D_VFSTRACE, + "incomplete range: [%llu, %llu) " + "last_chunk_count: %zu\n", + pt->cip_pos, + pt->cip_pos + pt->cip_count, + last_chunk_count); + io->ci_nob -= last_chunk_count; + short_io = true; + } + } + OBD_FREE(pt, sizeof(*pt)); } - if (result == 0) - result = io->ci_result; - RETURN(result < 0 ? result : 0); + CDEBUG(D_VFSTRACE, "return nob: %zu (%s io), rc: %d\n", + io->ci_nob, short_io ? "short" : "full", rc); + + RETURN(rc < 0 ? rc : io->ci_result); } EXPORT_SYMBOL(cl_io_loop); @@ -811,20 +937,20 @@ EXPORT_SYMBOL(cl_io_loop); * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() */ void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, - const struct cl_io_operations *ops) + struct cl_object *obj, + const struct cl_io_operations *ops) { struct list_head *linkage = &slice->cis_linkage; - LASSERT((linkage->prev == NULL && linkage->next == NULL) || + LASSERT((linkage->prev == NULL && linkage->next == NULL) || list_empty(linkage)); - ENTRY; + ENTRY; list_add_tail(linkage, &io->ci_layers); - slice->cis_io = io; - slice->cis_obj = obj; - slice->cis_iop = ops; - EXIT; + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; } EXPORT_SYMBOL(cl_io_slice_add); @@ -1019,7 +1145,6 @@ void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, cl_page_discard(env, io, page); EXIT; } -EXPORT_SYMBOL(cl_page_list_discard); /** * Initialize dual page queue. 
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c index 30c7186651dba..e92dbaf4fda68 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c @@ -111,10 +111,7 @@ int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, INIT_LIST_HEAD(&lock->cll_layers); list_for_each_entry(scan, &obj->co_lu.lo_header->loh_layers, co_lu.lo_linkage) { - if (scan->co_ops->coo_lock_init != NULL) - result = scan->co_ops->coo_lock_init(env, scan, lock, - io); - + result = scan->co_ops->coo_lock_init(env, scan, lock, io); if (result != 0) { cl_lock_fini(env, lock); break; @@ -170,8 +167,8 @@ EXPORT_SYMBOL(cl_lock_cancel); int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, struct cl_lock *lock, struct cl_sync_io *anchor) { - const struct cl_lock_slice *slice; - int rc = 0; + const struct cl_lock_slice *slice; + int rc = -ENOSYS; ENTRY; @@ -203,7 +200,7 @@ int cl_lock_request(const struct lu_env *env, struct cl_io *io, if (rc < 0) RETURN(rc); - if ((enq_flags & CEF_GLIMPSE) && !(enq_flags & CEF_SPECULATIVE)) { + if ((enq_flags & CEF_ASYNC) && !(enq_flags & CEF_AGL)) { anchor = &cl_env_info(env)->clt_anchor; cl_sync_io_init(anchor, 1, cl_sync_io_end); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c index 5aa59de91b53e..ddf97fc2cf057 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -422,24 +422,6 @@ loff_t cl_object_maxbytes(struct cl_object *obj) } EXPORT_SYMBOL(cl_object_maxbytes); -int cl_object_flush(const struct lu_env *env, struct cl_object *obj, - struct ldlm_lock *lock) -{ - struct lu_object_header *top = obj->co_lu.lo_header; - int rc = 0; - ENTRY; - - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_object_flush) { - rc = obj->co_ops->coo_object_flush(env, obj, lock); - if (rc) - break; - } - } - RETURN(rc); -} -EXPORT_SYMBOL(cl_object_flush); - /** * Helper function removing all object locks, and marking object for * deletion. All object pages must have been deleted at this point. @@ -568,16 +550,19 @@ EXPORT_SYMBOL(cl_site_stats_print); /** * The most efficient way is to store cl_env pointer in task specific - * structures. On Linux, it isn't easy to use task_struct->journal_info - * because Lustre code may call into other fs during memory reclaim, which - * has certain assumptions about journal_info. There are not currently any - * fields in task_struct that can be used for this purpose. + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: ony on RedHat kernel. + * - ... * \note As long as we use task_struct to store cl_env, we assume that once * called into Lustre, we'll never call into the other part of the kernel * which will use those fields in task_struct without explicitly exiting * Lustre. 
* - * Since there's no space in task_struct is available, hash will be used. + * If there's no space in task_struct is available, hash will be used. * bz20044, bz22683. */ @@ -610,20 +595,17 @@ struct cl_env { void *ce_debug; }; -static void cl_env_inc(enum cache_stats_item item) -{ #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING - atomic_inc(&cl_env_stats.cs_stats[item]); +#define CL_ENV_INC(counter) atomic_inc(&cl_env_stats.cs_stats[CS_##counter]) + +#define CL_ENV_DEC(counter) do { \ + LASSERT(atomic_read(&cl_env_stats.cs_stats[CS_##counter]) > 0); \ + atomic_dec(&cl_env_stats.cs_stats[CS_##counter]); \ +} while (0) +#else +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) #endif -} - -static void cl_env_dec(enum cache_stats_item item) -{ -#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING - LASSERT(atomic_read(&cl_env_stats.cs_stats[item]) > 0); - atomic_dec(&cl_env_stats.cs_stats[item]); -#endif -} static void cl_env_init0(struct cl_env *cle, void *debug) { @@ -633,7 +615,7 @@ static void cl_env_init0(struct cl_env *cle, void *debug) cle->ce_ref = 1; cle->ce_debug = debug; - cl_env_inc(CS_busy); + CL_ENV_INC(busy); } static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) @@ -663,8 +645,8 @@ static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) OBD_SLAB_FREE_PTR(cle, cl_env_kmem); env = ERR_PTR(rc); } else { - cl_env_inc(CS_create); - cl_env_inc(CS_total); + CL_ENV_INC(create); + CL_ENV_INC(total); } } else env = ERR_PTR(-ENOMEM); @@ -673,10 +655,10 @@ static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) static void cl_env_fini(struct cl_env *cle) { - cl_env_dec(CS_total); - lu_context_fini(&cle->ce_lu.le_ctx); - lu_context_fini(&cle->ce_ses); - OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); } static struct lu_env *cl_env_obtain(void *debug) @@ -832,15 +814,15 @@ void cl_env_put(struct lu_env *env, __u16 *refcheck) if (--cle->ce_ref == 0) { int cpu = get_cpu(); - cl_env_dec(CS_busy); - cle->ce_debug = NULL; - cl_env_exit(cle); - /* - * Don't bother to take a lock here. - * - * Return environment to the cache only when it was allocated - * with the standard tags. - */ + CL_ENV_DEC(busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. 
+ */ if (cl_envs[cpu].cec_count < cl_envs_cached_max && (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD && (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) { @@ -862,11 +844,13 @@ EXPORT_SYMBOL(cl_env_put); */ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) { + ENTRY; lvb->lvb_size = attr->cat_size; lvb->lvb_mtime = attr->cat_mtime; lvb->lvb_atime = attr->cat_atime; lvb->lvb_ctime = attr->cat_ctime; lvb->lvb_blocks = attr->cat_blocks; + EXIT; } /** @@ -876,11 +860,13 @@ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) */ void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) { + ENTRY; attr->cat_size = lvb->lvb_size; attr->cat_mtime = lvb->lvb_mtime; attr->cat_atime = lvb->lvb_atime; attr->cat_ctime = lvb->lvb_ctime; attr->cat_blocks = lvb->lvb_blocks; + EXIT; } EXPORT_SYMBOL(cl_lvb2attr); @@ -965,7 +951,7 @@ void cl_env_percpu_put(struct lu_env *env) cle->ce_ref--; LASSERT(cle->ce_ref == 0); - cl_env_dec(CS_busy); + CL_ENV_DEC(busy); cle->ce_debug = NULL; put_cpu(); @@ -1057,6 +1043,8 @@ static struct lu_kmem_descr cl_object_caches[] = { } }; +struct cfs_ptask_engine *cl_io_engine; + /** * Global initialization of cl-data. Create kmem caches, register * lu_context_key's, etc. @@ -1084,8 +1072,17 @@ int cl_global_init(void) if (result) /* no cl_env_percpu_fini on error */ GOTO(out_keys, result); + cl_io_engine = cfs_ptengine_init("clio", cpu_online_mask); + if (IS_ERR(cl_io_engine)) { + result = PTR_ERR(cl_io_engine); + cl_io_engine = NULL; + GOTO(out_percpu, result); + } + return 0; +out_percpu: + cl_env_percpu_fini(); out_keys: lu_context_key_degister(&cl_key); out_kmem: @@ -1101,6 +1098,8 @@ int cl_global_init(void) */ void cl_global_fini(void) { + cfs_ptengine_fini(cl_io_engine); + cl_io_engine = NULL; cl_env_percpu_fini(); lu_context_key_degister(&cl_key); lu_kmem_fini(cl_object_caches); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c index a1b1e130f31c6..74f9225ec1d59 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,37 +74,21 @@ static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); #endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ /* Disable page statistic by default due to huge performance penalty. 
*/ -static void cs_page_inc(const struct cl_object *obj, - enum cache_stats_item item) -{ -#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING - atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]); -#endif -} - -static void cs_page_dec(const struct cl_object *obj, - enum cache_stats_item item) -{ #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING - atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#define CS_PAGE_INC(o, item) \ + atomic_inc(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) +#define CS_PAGE_DEC(o, item) \ + atomic_dec(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) +#define CS_PAGESTATE_INC(o, state) \ + atomic_inc(&cl_object_site(o)->cs_pages_state[state]) +#define CS_PAGESTATE_DEC(o, state) \ + atomic_dec(&cl_object_site(o)->cs_pages_state[state]) +#else +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) #endif -} - -static void cs_pagestate_inc(const struct cl_object *obj, - enum cl_page_state state) -{ -#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING - atomic_inc(&cl_object_site(obj)->cs_pages_state[state]); -#endif -} - -static void cs_pagestate_dec(const struct cl_object *obj, - enum cl_page_state state) -{ -#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING - atomic_dec(&cl_object_site(obj)->cs_pages_state[state]); -#endif -} /** * Internal version of cl_page_get(). @@ -142,8 +126,7 @@ cl_page_at_trusted(const struct cl_page *page, RETURN(NULL); } -static void cl_page_free(const struct lu_env *env, struct cl_page *page, - struct pagevec *pvec) +static void cl_page_free(const struct lu_env *env, struct cl_page *page) { struct cl_object *obj = page->cp_obj; int pagesize = cl_object_header(obj)->coh_page_bufsize; @@ -160,10 +143,10 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page, struct cl_page_slice, cpl_linkage); list_del_init(page->cp_layers.next); if (unlikely(slice->cpl_ops->cpo_fini != NULL)) - slice->cpl_ops->cpo_fini(env, slice, pvec); + slice->cpl_ops->cpo_fini(env, slice); } - cs_page_dec(obj, CS_total); - cs_pagestate_dec(obj, page->cp_state); + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); cl_object_put(env, obj); lu_ref_fini(&page->cp_reference); @@ -213,16 +196,16 @@ struct cl_page *cl_page_alloc(const struct lu_env *env, ind); if (result != 0) { cl_page_delete0(env, page); - cl_page_free(env, page, NULL); + cl_page_free(env, page); page = ERR_PTR(result); break; } } } if (result == 0) { - cs_page_inc(o, CS_total); - cs_page_inc(o, CS_create); - cs_pagestate_dec(o, CPS_CACHED); + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); } } else { page = ERR_PTR(-ENOMEM); @@ -255,7 +238,7 @@ struct cl_page *cl_page_find(const struct lu_env *env, ENTRY; hdr = cl_object_header(o); - cs_page_inc(o, CS_lookup); + CS_PAGE_INC(o, lookup); CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); @@ -275,7 +258,7 @@ struct cl_page *cl_page_find(const struct lu_env *env, */ page = cl_vmpage_page(vmpage, o); if (page != NULL) { - cs_page_inc(o, CS_hit); + CS_PAGE_INC(o, hit); RETURN(page); } } @@ -345,8 +328,8 @@ static void cl_page_state_set0(const struct lu_env *env, PASSERT(env, page, page->cp_state == old); PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner != NULL)); - cs_pagestate_dec(page->cp_obj, page->cp_state); - cs_pagestate_inc(page->cp_obj, state); + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + 
CS_PAGESTATE_INC(page->cp_obj, state); cl_page_state_set_trust(page, state); EXIT; } @@ -374,13 +357,15 @@ void cl_page_get(struct cl_page *page) EXPORT_SYMBOL(cl_page_get); /** - * Releases a reference to a page, use the pagevec to release the pages - * in batch if provided. + * Releases a reference to a page. * - * Users need to do a final pagevec_release() to release any trailing pages. + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). */ -void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, - struct pagevec *pvec) +void cl_page_put(const struct lu_env *env, struct cl_page *page) { ENTRY; CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", @@ -396,26 +381,11 @@ void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, * Page is no longer reachable by other threads. Tear * it down. */ - cl_page_free(env, page, pvec); + cl_page_free(env, page); } EXIT; } -EXPORT_SYMBOL(cl_pagevec_put); - -/** - * Releases a reference to a page, wrapper to cl_pagevec_put - * - * When last reference is released, page is returned to the cache, unless it - * is in cl_page_state::CPS_FREEING state, in which case it is immediately - * destroyed. - * - * \see cl_object_put(), cl_lock_put(). - */ -void cl_page_put(const struct lu_env *env, struct cl_page *page) -{ - cl_pagevec_put(env, page, NULL); -} EXPORT_SYMBOL(cl_page_put); /** @@ -818,22 +788,6 @@ int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) } EXPORT_SYMBOL(cl_page_is_vmlocked); -void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, - size_t to) -{ - const struct cl_page_slice *slice; - - ENTRY; - - list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { - if (slice->cpl_ops->cpo_page_touch != NULL) - (*slice->cpl_ops->cpo_page_touch)(env, slice, to); - } - - EXIT; -} -EXPORT_SYMBOL(cl_page_touch); - static enum cl_page_state cl_req_type_state(enum cl_req_type crt) { ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c index 3cf9b86b2835a..b6576eb9b52e0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,16 +42,17 @@ #include #include -#include +#include #include #include #include +#include #include #ifdef HAVE_SERVER_SUPPORT # include # include #endif /* HAVE_SERVER_SUPPORT */ -#include +#include #include "llog_internal.h" #ifdef CONFIG_PROC_FS @@ -69,8 +70,6 @@ unsigned int obd_dump_on_timeout; EXPORT_SYMBOL(obd_dump_on_timeout); unsigned int obd_dump_on_eviction; EXPORT_SYMBOL(obd_dump_on_eviction); -unsigned int obd_lbug_on_eviction; -EXPORT_SYMBOL(obd_lbug_on_eviction); unsigned long obd_max_dirty_pages; EXPORT_SYMBOL(obd_max_dirty_pages); atomic_long_t obd_dirty_pages; @@ -98,11 +97,92 @@ EXPORT_SYMBOL(at_early_margin); int at_extra = 30; EXPORT_SYMBOL(at_extra); +atomic_long_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; + #ifdef CONFIG_PROC_FS struct lprocfs_stats *obd_memory = NULL; EXPORT_SYMBOL(obd_memory); #endif +char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; + +/* Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * TODO: + * It's better to cache the jobid for later use if there is any + * efficient way, the cl_env code probably could be reused for this + * purpose. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/api. Then, the jobid must be cached. + */ +int lustre_get_jobid(char *jobid) +{ + int jobid_len = LUSTRE_JOBID_SIZE; + char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 }; + int rc = 0; + ENTRY; + + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + GOTO(out, rc = 0); + + /* Whole node dedicated to single job */ + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); + GOTO(out, rc = 0); + } + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", + current_comm(), + from_kuid(&init_user_ns, current_fsuid())); + GOTO(out, rc = 0); + } + + rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len); + if (rc) { + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static bool printed; + if (unlikely(!printed)) { + LCONSOLE_ERROR_MSG(0x16b, "%s value too large " + "for JobID buffer (%d)\n", + obd_jobid_var, jobid_len); + printed = true; + } + } else { + CDEBUG((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "Get jobid for (%s) failed: rc = %d\n", + obd_jobid_var, rc); + } + } + +out: + if (rc != 0) + RETURN(rc); + + /* Only replace the job ID if it changed. 
*/ + if (strcmp(jobid, tmp_jobid) != 0) + memcpy(jobid, tmp_jobid, jobid_len); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_get_jobid); + static int class_resolve_dev_name(__u32 len, const char *name) { int rc; @@ -132,159 +212,6 @@ static int class_resolve_dev_name(__u32 len, const char *name) RETURN(rc); } -#define OBD_MAX_IOCTL_BUFFER 8192 - -static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) -{ - if (data->ioc_len > BIT(30)) { - CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen1 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen2 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen3 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen4 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { - CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { - CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { - CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { - CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { - CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { - CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); - return 1; - } - - if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { - CERROR("OBD ioctl: plen1 set but NULL pointer\n"); - return 1; - } - - if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { - CERROR("OBD ioctl: plen2 set but NULL pointer\n"); - return 1; - } - - if (obd_ioctl_packlen(data) > data->ioc_len) { - CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", - obd_ioctl_packlen(data), data->ioc_len); - return 1; - } - - return 0; -} - -/* buffer MUST be at least the size of obd_ioctl_hdr */ -int obd_ioctl_getdata(char **buf, int *len, void __user *arg) -{ - struct obd_ioctl_hdr hdr; - struct obd_ioctl_data *data; - int offset = 0; - - ENTRY; - if (copy_from_user(&hdr, arg, sizeof(hdr))) - RETURN(-EFAULT); - - if (hdr.ioc_version != OBD_IOCTL_VERSION) { - CERROR("Version mismatch kernel (%x) vs application (%x)\n", - OBD_IOCTL_VERSION, hdr.ioc_version); - RETURN(-EINVAL); - } - - if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { - CERROR("User buffer len %d exceeds %d max buffer\n", - hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); - RETURN(-EINVAL); - } - - if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { - CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); - RETURN(-EINVAL); - } - - /* When there are lots of processes calling vmalloc on multi-core - * system, the high lock contention will hurt performance badly, - * obdfilter-survey is an example, which relies on ioctl. So we'd - * better avoid vmalloc on ioctl path. 
LU-66 - */ - OBD_ALLOC_LARGE(*buf, hdr.ioc_len); - if (!*buf) { - CERROR("Cannot allocate control buffer of len %d\n", - hdr.ioc_len); - RETURN(-EINVAL); - } - *len = hdr.ioc_len; - data = (struct obd_ioctl_data *)*buf; - - if (copy_from_user(*buf, arg, hdr.ioc_len)) { - OBD_FREE_LARGE(*buf, hdr.ioc_len); - RETURN(-EFAULT); - } - - if (obd_ioctl_is_invalid(data)) { - CERROR("ioctl not correctly formatted\n"); - OBD_FREE_LARGE(*buf, hdr.ioc_len); - RETURN(-EINVAL); - } - - if (data->ioc_inllen1) { - data->ioc_inlbuf1 = &data->ioc_bulk[0]; - offset += cfs_size_round(data->ioc_inllen1); - } - - if (data->ioc_inllen2) { - data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen2); - } - - if (data->ioc_inllen3) { - data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen3); - } - - if (data->ioc_inllen4) - data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; - - RETURN(0); -} -EXPORT_SYMBOL(obd_ioctl_getdata); - int class_handle_ioctl(unsigned int cmd, unsigned long arg) { char *buf = NULL; @@ -500,57 +427,8 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) RETURN(err); } /* class_handle_ioctl */ -/* opening /dev/obd */ -static int obd_class_open(struct inode * inode, struct file * file) -{ - ENTRY; - try_module_get(THIS_MODULE); - RETURN(0); -} - -/* closing /dev/obd */ -static int obd_class_release(struct inode * inode, struct file * file) -{ - ENTRY; - - module_put(THIS_MODULE); - RETURN(0); -} - -/* to control /dev/obd */ -static long obd_class_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - int err = 0; - - ENTRY; - /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ - if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) - RETURN(err = -EACCES); - - if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ - RETURN(err = -ENOTTY); - - err = class_handle_ioctl(cmd, (unsigned long)arg); - - RETURN(err); -} - -/* declare character device */ -static struct file_operations obd_psdev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ - .open = obd_class_open, /* open */ - .release = obd_class_release, /* release */ -}; - -/* modules setup */ -struct miscdevice obd_psdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = OBD_DEV_NAME, - .fops = &obd_psdev_fops, -}; - +#define OBD_INIT_CHECK +#ifdef OBD_INIT_CHECK static int obd_init_checks(void) { __u64 u64val, div64val; @@ -616,6 +494,9 @@ static int obd_init_checks(void) return ret; } +#else +#define obd_init_checks() do {} while(0) +#endif static int __init obdclass_init(void) { @@ -732,6 +613,7 @@ static int __init obdclass_init(void) lu_global_fini(); cleanup_class_procfs: + obd_sysctl_clean(); class_procfs_clean(); cleanup_caches: @@ -801,6 +683,7 @@ static void __exit obdclass_exit(void) lu_global_fini(); obd_cleanup_caches(); + obd_sysctl_clean(); class_procfs_clean(); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c index 68952df7e1242..a48e7cbe7ec18 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #define DEBUG_SUBSYSTEM S_CLASS #include -#include +#include #include /* fid_be_to_cpu() */ #include @@ -53,13 +53,12 @@ LU_KEY_INIT(dt_global, struct dt_thread_info); LU_KEY_FINI(dt_global, struct dt_thread_info); struct lu_context_key dt_key = { - .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, - .lct_init = dt_global_key_init, - .lct_fini = dt_global_key_fini + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini }; -/* - * no lock is necessary to protect the list, because call-backs +/* no lock is necessary to protect the list, because call-backs * are added during system startup. Please refer to "struct dt_device". */ void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) @@ -75,7 +74,7 @@ void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) EXPORT_SYMBOL(dt_txn_callback_del); int dt_txn_hook_start(const struct lu_env *env, - struct dt_device *dev, struct thandle *th) + struct dt_device *dev, struct thandle *th) { int rc = 0; struct dt_txn_callback *cb; @@ -90,11 +89,9 @@ int dt_txn_hook_start(const struct lu_env *env, !(cb->dtc_tag & env->le_ctx.lc_tags)) continue; - /* - * Usually dt_txn_hook_start is called from bottom device, + /* Usually dt_txn_hook_start is called from bottom device, * and if the thandle has th_top, then we need use top - * thandle for the callback in the top thandle layer - */ + * thandle for the callback in the top thandle layer */ if (th->th_top != NULL) dtc_th = th->th_top; @@ -108,9 +105,9 @@ EXPORT_SYMBOL(dt_txn_hook_start); int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) { - struct dt_device *dev = th->th_dev; + struct dt_device *dev = th->th_dev; struct dt_txn_callback *cb; - int rc = 0; + int rc = 0; if (th->th_local) return 0; @@ -125,11 +122,9 @@ int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) !(cb->dtc_tag & env->le_ctx.lc_tags)) continue; - /* - * Usually dt_txn_hook_stop is called from bottom device, + /* Usually dt_txn_hook_stop is called from bottom device, * and if the thandle has th_top, then we need use top - * thandle for the callback in the top thandle layer - */ + * thandle for the callback in the top thandle layer */ if (th->th_top != NULL) dtc_th = th->th_top; @@ -150,53 +145,53 @@ EXPORT_SYMBOL(dt_device_init); void dt_device_fini(struct dt_device *dev) { - lu_device_fini(&dev->dd_lu_dev); + lu_device_fini(&dev->dd_lu_dev); } EXPORT_SYMBOL(dt_device_fini); int dt_object_init(struct dt_object *obj, - struct lu_object_header *h, struct lu_device *d) + struct lu_object_header *h, struct lu_device *d) { - return lu_object_init(&obj->do_lu, h, d); + return lu_object_init(&obj->do_lu, h, d); } EXPORT_SYMBOL(dt_object_init); void dt_object_fini(struct dt_object *obj) { - lu_object_fini(&obj->do_lu); + lu_object_fini(&obj->do_lu); } EXPORT_SYMBOL(dt_object_fini); int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) { - if (obj->do_index_ops == NULL) - obj->do_ops->do_index_try(env, obj, &dt_directory_features); - return obj->do_index_ops != NULL; + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; } EXPORT_SYMBOL(dt_try_as_dir); enum dt_format_type dt_mode_to_dft(__u32 mode) { - enum dt_format_type result; - - switch (mode & S_IFMT) { - case S_IFDIR: - result = DFT_DIR; - break; - case 
S_IFREG: - result = DFT_REGULAR; - break; - case S_IFLNK: - result = DFT_SYM; - break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - result = DFT_NODE; - break; - default: + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: LASSERTF(0, "invalid mode %o\n", mode); result = 0; /* Just for satisfying compiler. */ break; @@ -219,10 +214,8 @@ int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, } EXPORT_SYMBOL(dt_lookup_dir); -/* - * this differs from dt_locate by top_dev as parameter - * but not one from lu_site - */ +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ struct dt_object *dt_locate_at(const struct lu_env *env, struct dt_device *dev, const struct lu_fid *fid, @@ -243,7 +236,6 @@ struct dt_object *dt_locate_at(const struct lu_env *env, return container_of0(n, struct dt_object, do_lu); } - lu_object_put(env, lo); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(dt_locate_at); @@ -277,28 +269,28 @@ static int dt_find_entry(const struct lu_env *env, const char *entry, * path component to \a entry_func. */ int dt_path_parser(const struct lu_env *env, - char *path, dt_entry_func_t entry_func, - void *data) + char *path, dt_entry_func_t entry_func, + void *data) { - char *e; - int rc = 0; - - while (1) { - e = strsep(&path, "/"); - if (e == NULL) - break; - - if (e[0] == 0) { - if (!path || path[0] == '\0') - break; - continue; - } - rc = entry_func(env, e, data); - if (rc) - break; - } - - return rc; + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; } struct dt_object * @@ -306,50 +298,51 @@ dt_store_resolve(const struct lu_env *env, struct dt_device *dt, const char *path, struct lu_fid *fid) { struct dt_thread_info *info = dt_info(env); - struct dt_find_hint *dfh = &info->dti_dfh; - struct dt_object *obj; - int result; + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; - dfh->dfh_dt = dt; - dfh->dfh_fid = fid; + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); - result = dt->dd_ops->dt_root_get(env, dt, fid); - if (result == 0) { - obj = dt_locate(env, dt, fid); - if (!IS_ERR(obj)) { - dfh->dfh_o = obj; + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; result = dt_path_parser(env, info->dti_buf, dt_find_entry, dfh); - if (result != 0) - obj = ERR_PTR(result); - else - obj = dfh->dfh_o; - } - } else { - obj = ERR_PTR(result); - } - return obj; + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; } static struct dt_object *dt_reg_open(const struct lu_env *env, - struct dt_device *dt, - struct dt_object *p, - const char *name, - struct lu_fid *fid) + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) { - struct dt_object *o; - int result; + struct dt_object *o; + int result; - result = dt_lookup_dir(env, p, name, fid); - if (result == 0) - o = dt_locate(env, dt, fid); - else - o = ERR_PTR(result); + result = 
dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } + else + o = ERR_PTR(result); - return o; + return o; } /** @@ -376,47 +369,47 @@ struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, } struct dt_object *dt_find_or_create(const struct lu_env *env, - struct dt_device *dt, - const struct lu_fid *fid, - struct dt_object_format *dof, - struct lu_attr *at) + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) { - struct dt_object *dto; - struct thandle *th; - int rc; + struct dt_object *dto; + struct thandle *th; + int rc; - ENTRY; + ENTRY; - dto = dt_locate(env, dt, fid); - if (IS_ERR(dto)) - RETURN(dto); + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); - LASSERT(dto != NULL); - if (dt_object_exists(dto)) - RETURN(dto); + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); - th = dt_trans_create(env, dt); - if (IS_ERR(th)) - GOTO(out, rc = PTR_ERR(th)); + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); - rc = dt_declare_create(env, dto, at, NULL, dof, th); - if (rc) - GOTO(trans_stop, rc); + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); - rc = dt_trans_start_local(env, dt, th); - if (rc) - GOTO(trans_stop, rc); + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); - dt_write_lock(env, dto, 0); - if (dt_object_exists(dto)) - GOTO(unlock, rc = 0); + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); - CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); - rc = dt_create(env, dto, at, NULL, dof, th); - if (rc) + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) GOTO(unlock, rc); - LASSERT(dt_object_exists(dto)); + LASSERT(dt_object_exists(dto)); unlock: dt_write_unlock(env, dto); trans_stop: @@ -434,16 +427,16 @@ EXPORT_SYMBOL(dt_find_or_create); /* dt class init function. */ int dt_global_init(void) { - int result; + int result; - LU_CONTEXT_KEY_INIT(&dt_key); - result = lu_context_key_register(&dt_key); - return result; + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; } void dt_global_fini(void) { - lu_context_key_degister(&dt_key); + lu_context_key_degister(&dt_key); } /** @@ -458,7 +451,7 @@ void dt_global_fini(void) * \retval -ve errno on failure */ int dt_read(const struct lu_env *env, struct dt_object *dt, - struct lu_buf *buf, loff_t *pos) + struct lu_buf *buf, loff_t *pos) { LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); return dt->do_body_ops->dbo_read(env, dt, buf, pos); @@ -493,7 +486,7 @@ int dt_record_read(const struct lu_env *env, struct dt_object *dt, EXPORT_SYMBOL(dt_record_read); int dt_record_write(const struct lu_env *env, struct dt_object *dt, - const struct lu_buf *buf, loff_t *pos, struct thandle *th) + const struct lu_buf *buf, loff_t *pos, struct thandle *th) { ssize_t size; @@ -502,7 +495,7 @@ int dt_record_write(const struct lu_env *env, struct dt_object *dt, LASSERT(dt->do_body_ops); LASSERT(dt->do_body_ops->dbo_write); - size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th); + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, 1); if (size < 0) return size; return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; @@ -510,53 +503,53 @@ int dt_record_write(const struct lu_env *env, struct dt_object *dt, EXPORT_SYMBOL(dt_record_write); int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, - struct thandle *th) + struct thandle *th) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; - LASSERT(o); - vbuf.lb_buf = NULL; - vbuf.lb_len = sizeof(dt_obj_version_t); - return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); } EXPORT_SYMBOL(dt_declare_version_set); void dt_version_set(const struct lu_env *env, struct dt_object *o, - dt_obj_version_t version, struct thandle *th) + dt_obj_version_t version, struct thandle *th) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; - int rc; + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; - LASSERT(o); - vbuf.lb_buf = &version; - vbuf.lb_len = sizeof(version); + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); - if (rc < 0) - CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); - return; + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; } EXPORT_SYMBOL(dt_version_set); dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; - dt_obj_version_t version; - int rc; - - LASSERT(o); - vbuf.lb_buf = &version; - vbuf.lb_len = sizeof(version); + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); rc = dt_xattr_get(env, o, &vbuf, xname); - if (rc != sizeof(version)) { - CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); - version = 0; - } - return version; + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; } EXPORT_SYMBOL(dt_version_get); @@ -575,8 +568,8 @@ const struct dt_index_features dt_lfsck_layout_orphan_features = { .dif_flags = 0, .dif_keysize_min = sizeof(struct lu_fid), .dif_keysize_max = sizeof(struct lu_fid), - .dif_recsize_min = sizeof(struct lu_orphan_rec_v3), - .dif_recsize_max = sizeof(struct lu_orphan_rec_v3), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v2), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v2), .dif_ptrsize = 4 }; EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); @@ -649,10 +642,8 @@ const struct dt_index_features dt_nodemap_features = { }; EXPORT_SYMBOL(dt_nodemap_features); -/* - * helper function returning what dt_index_features structure should be used - * based on the FID sequence. This is used by OBD_IDX_READ RPC - */ +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. 
This is used by OBD_IDX_READ RPC */ static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, __u32 mode) { @@ -698,15 +689,11 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, size_t nob, const struct dt_it_ops *iops, struct dt_it *it, __u32 attr, void *arg) { - struct idx_info *ii = (struct idx_info *)arg; - struct lu_idxpage *lip = &lp->lp_idx; - char *entry; - __u64 hash; - __u16 hashsize = 0; - __u16 keysize = 0; - __u16 recsize; - int rc; - + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + size_t size; + int rc; ENTRY; if (nob < LIP_HDR_SIZE) @@ -717,12 +704,20 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, lip->lip_magic = LIP_MAGIC; nob -= LIP_HDR_SIZE; - /* client wants to the 64-bit hash value associated with each record */ - if (!(ii->ii_flags & II_FL_NOHASH)) - hashsize = sizeof(hash); + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize + ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); entry = lip->lip_entries; do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + __u16 keysize; + __u16 recsize; + /* fetch 64-bit hash value */ hash = iops->store(env, it); ii->ii_hash_end = hash; @@ -732,54 +727,56 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, GOTO(out, rc = 0); } - if (!(ii->ii_flags & II_FL_NOKEY)) { - keysize = iops->key_size(env, it); - if (!(ii->ii_flags & II_FL_VARKEY) && - keysize != ii->ii_keysize) { - CERROR("keysize mismatch %hu != %hu.\n", - keysize, ii->ii_keysize); + if (nob < size) { + if (lip->lip_nr == 0) GOTO(out, rc = -EINVAL); - } + GOTO(out, rc = 0); } - /* and finally the record */ - if (ii->ii_flags & II_FL_VARREC) - recsize = iops->rec_size(env, it, attr); + if (!(ii->ii_flags & II_FL_NOHASH)) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + if (ii->ii_flags & II_FL_VARKEY) + keysize = iops->key_size(env, it); else - recsize = ii->ii_recsize; + keysize = ii->ii_keysize; - if (nob < hashsize + keysize + recsize) { - if (lip->lip_nr == 0) - GOTO(out, rc = -E2BIG); - GOTO(out, rc = 0); + if (!(ii->ii_flags & II_FL_NOKEY)) { + /* then the key value */ + key = iops->key(env, it); + memcpy(tmp_entry, key, keysize); + tmp_entry += keysize; } - rc = iops->rec(env, it, - (struct dt_rec *)(entry + hashsize + keysize), - attr); - if (!rc) { - if (hashsize) - memcpy(entry, &hash, hashsize); - if (keysize) { - struct dt_key *key; - - key = iops->key(env, it); - memcpy(entry + hashsize, key, keysize); - } + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + GOTO(out, rc); + /* hash/key/record successfully copied! 
*/ lip->lip_nr++; if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) ii->ii_hash_start = hash; - entry += hashsize + keysize + recsize; - nob -= hashsize + keysize + recsize; - } else if (rc != -ESTALE) { - GOTO(out, rc); + + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); + else + recsize = ii->ii_recsize; + + entry = tmp_entry + recsize; + nob -= size; } /* move on to the next record */ do { rc = iops->next(env, it); } while (rc == -ESTALE); + } while (rc == 0); GOTO(out, rc); @@ -812,10 +809,10 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, const struct lu_rdpg *rdpg, dt_index_page_build_t filler, void *arg) { - struct dt_it *it; - const struct dt_it_ops *iops; - size_t pageidx, nob, nlupgs = 0; - int rc; + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; ENTRY; LASSERT(rdpg->rp_pages != NULL); @@ -856,15 +853,13 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, GOTO(out, rc); } - /* - * Fill containers one after the other. There might be multiple + /* Fill containers one after the other. There might be multiple * containers per physical page. * * At this point and across for-loop: * rc == 0 -> ok, proceed. * rc > 0 -> end of index. - * rc < 0 -> error. - */ + * rc < 0 -> error. */ for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { union lu_page *lp; int i; @@ -920,10 +915,8 @@ int dt_index_read(const struct lu_env *env, struct dt_device *dev, int rc; ENTRY; - /* - * rp_count shouldn't be null and should be a multiple of the container - * size - */ + /* rp_count shouldn't be null and should be a multiple of the container + * size */ if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) RETURN(-EFAULT); @@ -1084,221 +1077,3 @@ int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); #endif /* CONFIG_PROC_FS */ - -static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct lu_device *lu = dt2lu_dev(dt); - - if (!lu->ld_obd) - return -ENODEV; - - return sprintf(buf, "%s\n", lu->ld_obd->obd_uuid.uuid); -} -LUSTRE_RO_ATTR(uuid); - -static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct obd_statfs osfs; - int rc; - - rc = dt_statfs(NULL, dt, &osfs); - if (rc) - return rc; - - return sprintf(buf, "%u\n", (unsigned) osfs.os_bsize); -} -LUSTRE_RO_ATTR(blocksize); - -static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct obd_statfs osfs; - u32 blk_size; - u64 result; - int rc; - - rc = dt_statfs(NULL, dt, &osfs); - if (rc) - return rc; - - blk_size = osfs.os_bsize >> 10; - result = osfs.os_blocks; - - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); -} -LUSTRE_RO_ATTR(kbytestotal); - -static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct obd_statfs osfs; - u32 blk_size; - u64 result; - int rc; - - rc = dt_statfs(NULL, dt, &osfs); - if (rc) - return rc; - - blk_size = osfs.os_bsize >> 10; - result = osfs.os_bfree; - - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); -} 
-LUSTRE_RO_ATTR(kbytesfree); - -static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct obd_statfs osfs; - u32 blk_size; - u64 result; - int rc; - - rc = dt_statfs(NULL, dt, &osfs); - if (rc) - return rc; - - blk_size = osfs.os_bsize >> 10; - result = osfs.os_bavail; - - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); -} -LUSTRE_RO_ATTR(kbytesavail); - -static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct obd_statfs osfs; - int rc; - - rc = dt_statfs(NULL, dt, &osfs); - if (rc) - return rc; - - return sprintf(buf, "%llu\n", osfs.os_files); -} -LUSTRE_RO_ATTR(filestotal); - -static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - struct obd_statfs osfs; - int rc; - - rc = dt_statfs(NULL, dt, &osfs); - if (rc) - return rc; - - return sprintf(buf, "%llu\n", osfs.os_ffree); -} -LUSTRE_RO_ATTR(filesfree); - -static const struct attribute *dt_def_attrs[] = { - &lustre_attr_uuid.attr, - &lustre_attr_blocksize.attr, - &lustre_attr_kbytestotal.attr, - &lustre_attr_kbytesfree.attr, - &lustre_attr_kbytesavail.attr, - &lustre_attr_filestotal.attr, - &lustre_attr_filesfree.attr, - NULL, -}; - -static void dt_sysfs_release(struct kobject *kobj) -{ - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); - - complete(&dt->dd_kobj_unregister); -} - -int dt_tunables_fini(struct dt_device *dt) -{ - if (!dt) - return -EINVAL; - - if (!IS_ERR_OR_NULL(dt->dd_debugfs_entry)) - ldebugfs_remove(&dt->dd_debugfs_entry); - - if (dt->dd_def_attrs) - sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); - - kobject_put(&dt->dd_kobj); - wait_for_completion(&dt->dd_kobj_unregister); - - return 0; -} -EXPORT_SYMBOL(dt_tunables_fini); - -int dt_tunables_init(struct dt_device *dt, struct obd_type *type, - const char *name, struct ldebugfs_vars *list) -{ - int rc; - - dt->dd_ktype.sysfs_ops = &lustre_sysfs_ops; - dt->dd_ktype.release = dt_sysfs_release; - - init_completion(&dt->dd_kobj_unregister); - rc = kobject_init_and_add(&dt->dd_kobj, &dt->dd_ktype, type->typ_kobj, - "%s", name); - if (rc) - return rc; - - dt->dd_def_attrs = dt_def_attrs; - - rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); - if (rc) { - kobject_put(&dt->dd_kobj); - return rc; - } - - /* - * No need to register debugfs if no enteries. This allows us to - * choose between using dt_device or obd_device for debugfs. - */ - if (!list) - return rc; - - dt->dd_debugfs_entry = ldebugfs_register(name, - type->typ_debugfs_entry, - list, dt); - if (IS_ERR_OR_NULL(dt->dd_debugfs_entry)) { - rc = dt->dd_debugfs_entry ? PTR_ERR(dt->dd_debugfs_entry) - : -ENOMEM; - CERROR("%s: error %d setting up debugfs\n", - name, rc); - dt->dd_debugfs_entry = NULL; - sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); - kobject_put(&dt->dd_kobj); - return rc; - } - - return rc; -} -EXPORT_SYMBOL(dt_tunables_init); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c index bd9330daafd8a..2c8e4db905d01 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/genops.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,10 +38,8 @@ #define DEBUG_SUBSYSTEM S_CLASS #include -#include -#include +#include #include -#include #include #include #include @@ -52,9 +50,15 @@ DEFINE_RWLOCK(obd_dev_lock); static struct obd_device *obd_devs[MAX_OBD_DEVICES]; static struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +static struct kmem_cache *import_cachep; -static struct workqueue_struct *zombie_wq; +static LIST_HEAD(obd_zombie_imports); +static LIST_HEAD(obd_zombie_exports); +static DEFINE_SPINLOCK(obd_zombie_impexp_lock); +static void obd_zombie_impexp_notify(void); static void obd_zombie_export_add(struct obd_export *exp); static void obd_zombie_import_add(struct obd_import *imp); static void print_export_data(struct obd_export *exp, @@ -158,57 +162,18 @@ void class_put_type(struct obd_type *type) spin_unlock(&type->obd_type_lock); } -static void class_sysfs_release(struct kobject *kobj) -{ - OBD_FREE(kobj, sizeof(*kobj)); -} - -static struct kobj_type class_ktype = { - .sysfs_ops = &lustre_sysfs_ops, - .release = class_sysfs_release, -}; - -struct kobject *class_setup_tunables(const char *name) -{ - struct kobject *kobj; - int rc; - -#ifdef HAVE_SERVER_SUPPORT - kobj = kset_find_obj(lustre_kset, name); - if (kobj) - return kobj; -#endif - OBD_ALLOC(kobj, sizeof(*kobj)); - if (!kobj) - return ERR_PTR(-ENOMEM); - - kobj->kset = lustre_kset; - kobject_init(kobj, &class_ktype); - rc = kobject_add(kobj, &lustre_kset->kobj, "%s", name); - if (rc) { - kobject_put(kobj); - return ERR_PTR(rc); - } - return kobj; -} -EXPORT_SYMBOL(class_setup_tunables); - #define CLASS_MAX_NAME 1024 -int class_register_type(const struct obd_ops *dt_ops, - const struct md_ops *md_ops, - bool enable_proc, struct ldebugfs_vars *vars, +int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, + bool enable_proc, struct lprocfs_vars *vars, const char *name, struct lu_device_type *ldt) { - struct obd_type *type; -#ifdef HAVE_SERVER_SUPPORT - struct qstr dname; -#endif /* HAVE_SERVER_SUPPORT */ - int rc = 0; + struct obd_type *type; + int rc = 0; + ENTRY; - ENTRY; - /* sanity check */ - LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); if (class_search_type(name)) { CDEBUG(D_IOCTL, "Type %s already registered\n", name); @@ -240,7 +205,7 @@ int class_register_type(const struct obd_ops *dt_ops, if (enable_proc) { type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, - NULL, type); + vars, type); if (IS_ERR(type->typ_procroot)) { rc = PTR_ERR(type->typ_procroot); type->typ_procroot = NULL; @@ -248,57 +213,20 @@ int class_register_type(const struct obd_ops *dt_ops, } } #endif -#ifdef HAVE_SERVER_SUPPORT - dname.name = name; - dname.len = strlen(dname.name); - dname.hash = ll_full_name_hash(debugfs_lustre_root, dname.name, - dname.len); - type->typ_debugfs_entry = d_lookup(debugfs_lustre_root, &dname); - if (type->typ_debugfs_entry) { - dput(type->typ_debugfs_entry); - type->typ_sym_filter = true; - goto dir_exist; - } -#endif /* HAVE_SERVER_SUPPORT */ - - type->typ_debugfs_entry = ldebugfs_register(type->typ_name, - debugfs_lustre_root, - vars, type); - if (IS_ERR_OR_NULL(type->typ_debugfs_entry)) { - rc = type->typ_debugfs_entry ? 
PTR_ERR(type->typ_debugfs_entry) - : -ENOMEM; - type->typ_debugfs_entry = NULL; - GOTO(failed, rc); - } -#ifdef HAVE_SERVER_SUPPORT -dir_exist: -#endif - type->typ_kobj = class_setup_tunables(type->typ_name); - if (IS_ERR(type->typ_kobj)) - GOTO(failed, rc = PTR_ERR(type->typ_kobj)); - - if (ldt) { - type->typ_lu = ldt; - rc = lu_device_type_init(ldt); - if (rc) { - kobject_put(type->typ_kobj); - GOTO(failed, rc); - } - } + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + GOTO (failed, rc); + } spin_lock(&obd_types_lock); list_add(&type->typ_chain, &obd_types); spin_unlock(&obd_types_lock); - RETURN(0); + RETURN (0); failed: -#ifdef HAVE_SERVER_SUPPORT - if (type->typ_sym_filter) - type->typ_debugfs_entry = NULL; -#endif - if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) - ldebugfs_remove(&type->typ_debugfs_entry); if (type->typ_name != NULL) { #ifdef CONFIG_PROC_FS if (type->typ_procroot != NULL) @@ -334,8 +262,6 @@ int class_unregister_type(const char *name) RETURN(-EBUSY); } - kobject_put(type->typ_kobj); - /* we do not use type->typ_procroot as for compatibility purposes * other modules can share names (i.e. lod can use lov entry). so * we can't reference pointer as it can get invalided when another @@ -346,13 +272,6 @@ int class_unregister_type(const char *name) if (type->typ_procsym != NULL) lprocfs_remove(&type->typ_procsym); #endif -#ifdef HAVE_SERVER_SUPPORT - if (type->typ_sym_filter) - type->typ_debugfs_entry = NULL; -#endif - if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) - ldebugfs_remove(&type->typ_debugfs_entry); - if (type->typ_lu) lu_device_type_fini(type->typ_lu); @@ -372,20 +291,22 @@ EXPORT_SYMBOL(class_unregister_type); /** * Create a new obd device. * - * Allocate the new obd_device and initialize it. + * Find an empty slot in ::obd_devs[], create a new obd device in it. * * \param[in] type_name obd device type string. * \param[in] name obd device name. - * \param[in] uuid obd device UUID * - * \retval newdev pointer to created obd_device - * \retval ERR_PTR(errno) on error + * \retval NULL if create fails, otherwise return the obd device + * pointer created. 
*/ -struct obd_device *class_newdev(const char *type_name, const char *name, - const char *uuid) +struct obd_device *class_newdev(const char *type_name, const char *name) { + struct obd_device *result = NULL; struct obd_device *newdev; struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + bool retried = false; ENTRY; if (strlen(name) >= MAX_OBD_NAME) { @@ -400,197 +321,106 @@ struct obd_device *class_newdev(const char *type_name, const char *name, } newdev = obd_device_alloc(); - if (newdev == NULL) { - class_put_type(type); - RETURN(ERR_PTR(-ENOMEM)); - } + if (newdev == NULL) + GOTO(out_type, result = ERR_PTR(-ENOMEM)); + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); - strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); - newdev->obd_type = type; - newdev->obd_minor = -1; - - rwlock_init(&newdev->obd_pool_lock); - newdev->obd_pool_limit = 0; - newdev->obd_pool_slv = 0; - - INIT_LIST_HEAD(&newdev->obd_exports); - INIT_LIST_HEAD(&newdev->obd_unlinked_exports); - INIT_LIST_HEAD(&newdev->obd_delayed_exports); - INIT_LIST_HEAD(&newdev->obd_exports_timed); - INIT_LIST_HEAD(&newdev->obd_nid_stats); - spin_lock_init(&newdev->obd_nid_lock); - spin_lock_init(&newdev->obd_dev_lock); - mutex_init(&newdev->obd_dev_mutex); - spin_lock_init(&newdev->obd_osfs_lock); - /* newdev->obd_osfs_age must be set to a value in the distant - * past to guarantee a fresh statfs is fetched on mount. */ - newdev->obd_osfs_age = ktime_get_seconds() - 1000; - - /* XXX belongs in setup not attach */ - init_rwsem(&newdev->obd_observer_link_sem); - /* recovery data */ - spin_lock_init(&newdev->obd_recovery_task_lock); - init_waitqueue_head(&newdev->obd_next_transno_waitq); - init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); - INIT_LIST_HEAD(&newdev->obd_req_replay_queue); - INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); - INIT_LIST_HEAD(&newdev->obd_final_req_queue); - INIT_LIST_HEAD(&newdev->obd_evict_list); - INIT_LIST_HEAD(&newdev->obd_lwp_list); - - llog_group_init(&newdev->obd_olg); - /* Detach drops this */ - atomic_set(&newdev->obd_refcount, 1); - lu_ref_init(&newdev->obd_reference); - lu_ref_add(&newdev->obd_reference, "newdev", newdev); - - newdev->obd_conn_inprogress = 0; - - strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); - - CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", - newdev->obd_name, newdev); - - return newdev; -} -/** - * Free obd device. - * - * \param[in] obd obd_device to be freed - * - * \retval none - */ -void class_free_dev(struct obd_device *obd) -{ - struct obd_type *obd_type = obd->obd_type; + again: + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " - "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, - "obd %p != obd_devs[%d] %p\n", - obd, obd->obd_minor, obd_devs[obd->obd_minor]); - LASSERTF(atomic_read(&obd->obd_refcount) == 0, - "obd_refcount should be 0, not %d\n", - atomic_read(&obd->obd_refcount)); - LASSERT(obd_type != NULL); + if (obd && (strcmp(name, obd->obd_name) == 0)) { - CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", - obd->obd_name, obd->obd_type->typ_name); + if (!retried) { + write_unlock(&obd_dev_lock); - CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", - obd->obd_name, obd->obd_uuid.uuid); - if (obd->obd_stopping) { - int err; + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". 
+ */ + obd_zombie_barrier(); + retried = true; + goto again; + } - /* If we're not stopping, we were never set up */ - err = obd_cleanup(obd); - if (err) - CERROR("Cleanup %s returned %d\n", - obd->obd_name, err); - } + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0]='\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); - obd_device_free(obd); + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + GOTO(out, result = ERR_PTR(-EOVERFLOW)); + } - class_put_type(obd_type); -} + if (IS_ERR(result)) + GOTO(out, result); -/** - * Unregister obd device. - * - * Free slot in obd_dev[] used by \a obd. - * - * \param[in] new_obd obd_device to be unregistered - * - * \retval none - */ -void class_unregister_device(struct obd_device *obd) -{ - write_lock(&obd_dev_lock); - if (obd->obd_minor >= 0) { - LASSERT(obd_devs[obd->obd_minor] == obd); - obd_devs[obd->obd_minor] = NULL; - obd->obd_minor = -1; - } - write_unlock(&obd_dev_lock); + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + RETURN(result); +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; } -/** - * Register obd device. - * - * Find free slot in obd_devs[], fills it with \a new_obd. - * - * \param[in] new_obd obd_device to be registered - * - * \retval 0 success - * \retval -EEXIST device with this name is registered - * \retval -EOVERFLOW obd_devs[] is full - */ -int class_register_device(struct obd_device *new_obd) +void class_release_dev(struct obd_device *obd) { - int ret = 0; - int i; - int new_obd_minor = 0; - bool minor_assign = false; - bool retried = false; + struct obd_type *obd_type = obd->obd_type; -again: - write_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd != NULL && - (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { - - if (!retried) { - write_unlock(&obd_dev_lock); - - /* the obd_device could be waited to be - * destroyed by the "obd_zombie_impexp_thread". 
- */ - obd_zombie_barrier(); - retried = true; - goto again; - } - - CERROR("%s: already exists, won't add\n", - obd->obd_name); - /* in case we found a free slot before duplicate */ - minor_assign = false; - ret = -EEXIST; - break; - } - if (!minor_assign && obd == NULL) { - new_obd_minor = i; - minor_assign = true; - } - } + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); - if (minor_assign) { - new_obd->obd_minor = new_obd_minor; - LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " - "%p\n", new_obd_minor, obd_devs[new_obd_minor]); - obd_devs[new_obd_minor] = new_obd; - } else { - if (ret == 0) { - ret = -EOVERFLOW; - CERROR("%s: all %u/%u devices used, increase " - "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, - i, class_devno_max(), ret); - } - } + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; write_unlock(&obd_dev_lock); + obd_device_free(obd); - RETURN(ret); + class_put_type(obd_type); } -static int class_name2dev_nolock(const char *name) +int class_name2dev(const char *name) { int i; if (!name) return -1; + read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); @@ -599,29 +429,16 @@ static int class_name2dev_nolock(const char *name) out any references */ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); if (obd->obd_attached) { + read_unlock(&obd_dev_lock); return i; } break; } } - - return -1; -} - -int class_name2dev(const char *name) -{ - int i; - - if (!name) - return -1; - - read_lock(&obd_dev_lock); - i = class_name2dev_nolock(name); read_unlock(&obd_dev_lock); - return i; + return -1; } -EXPORT_SYMBOL(class_name2dev); struct obd_device *class_name2obd(const char *name) { @@ -633,33 +450,24 @@ struct obd_device *class_name2obd(const char *name) } EXPORT_SYMBOL(class_name2obd); -int class_uuid2dev_nolock(struct obd_uuid *uuid) +int class_uuid2dev(struct obd_uuid *uuid) { int i; + read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + read_unlock(&obd_dev_lock); return i; } } - - return -1; -} - -int class_uuid2dev(struct obd_uuid *uuid) -{ - int i; - - read_lock(&obd_dev_lock); - i = class_uuid2dev_nolock(uuid); read_unlock(&obd_dev_lock); - return i; + return -1; } -EXPORT_SYMBOL(class_uuid2dev); struct obd_device *class_uuid2obd(struct obd_uuid *uuid) { @@ -698,40 +506,6 @@ struct obd_device *class_num2obd(int num) return obd; } -/** - * Find obd in obd_dev[] by name or uuid. - * - * Increment obd's refcount if found. 
- * - * \param[in] str obd name or uuid - * - * \retval NULL if not found - * \retval target pointer to found obd_device - */ -struct obd_device *class_dev_by_str(const char *str) -{ - struct obd_device *target = NULL; - struct obd_uuid tgtuuid; - int rc; - - obd_str2uuid(&tgtuuid, str); - - read_lock(&obd_dev_lock); - rc = class_uuid2dev_nolock(&tgtuuid); - if (rc < 0) - rc = class_name2dev_nolock(str); - - if (rc >= 0) - target = class_num2obd(rc); - - if (target != NULL) - class_incref(target, "find", current); - read_unlock(&obd_dev_lock); - - RETURN(target); -} -EXPORT_SYMBOL(class_dev_by_str); - /** * Get obd devices count. Device in any * state are counted @@ -901,6 +675,14 @@ void obd_cleanup_caches(void) kmem_cache_destroy(obd_device_cachep); obd_device_cachep = NULL; } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } EXIT; } @@ -917,6 +699,19 @@ int obd_init_caches(void) if (!obd_device_cachep) GOTO(out, rc = -ENOMEM); + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + GOTO(out, rc = -ENOMEM); + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + GOTO(out, rc = -ENOMEM); + RETURN(0); out: obd_cleanup_caches(); @@ -953,6 +748,18 @@ struct obd_device *class_exp2obd(struct obd_export *exp) } EXPORT_SYMBOL(class_exp2obd); +struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} + struct obd_import *class_exp2cliimp(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; @@ -962,6 +769,14 @@ struct obd_import *class_exp2cliimp(struct obd_export *exp) } EXPORT_SYMBOL(class_exp2cliimp); +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} + /* Export management functions */ static void class_export_destroy(struct obd_export *exp) { @@ -983,10 +798,7 @@ static void class_export_destroy(struct obd_export *exp) LASSERT(list_empty(&exp->exp_req_replay_queue)); LASSERT(list_empty(&exp->exp_hp_rpcs)); obd_destroy_export(exp); - /* self export doesn't hold a reference to an obd, although it - * exists until freeing of the obd */ - if (exp != obd->obd_self_export) - class_decref(obd, "export", exp); + class_decref(obd, "export", exp); OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); EXIT; @@ -1019,46 +831,24 @@ void class_export_put(struct obd_export *exp) atomic_read(&exp->exp_refcount) - 1); if (atomic_dec_and_test(&exp->exp_refcount)) { - struct obd_device *obd = exp->exp_obd; - + LASSERT(!list_empty(&exp->exp_obd_chain)); + LASSERT(list_empty(&exp->exp_stale_list)); CDEBUG(D_IOCTL, "final put %p/%s\n", exp, exp->exp_client_uuid.uuid); /* release nid stat refererence */ lprocfs_exp_cleanup(exp); - if (exp == obd->obd_self_export) { - /* self export should be destroyed without - * zombie thread as it doesn't hold a - * reference to obd and doesn't hold any - * resources */ - class_export_destroy(exp); - /* self export is destroyed, no class - * references exist and it is safe to free - * obd */ - class_free_dev(obd); - } else { - 
LASSERT(!list_empty(&exp->exp_obd_chain)); - obd_zombie_export_add(exp); - } - + obd_zombie_export_add(exp); } } EXPORT_SYMBOL(class_export_put); -static void obd_zombie_exp_cull(struct work_struct *ws) -{ - struct obd_export *export; - - export = container_of(ws, struct obd_export, exp_zombie_work); - class_export_destroy(export); -} - /* Creates a new export, adds it to the hash table, and returns a * pointer to it. The refcount is 2: one for the hash reference, and * one for the pointer returned by this function. */ -struct obd_export *__class_new_export(struct obd_device *obd, - struct obd_uuid *cluuid, bool is_self) +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) { struct obd_export *export; struct cfs_hash *hash = NULL; @@ -1072,7 +862,6 @@ struct obd_export *__class_new_export(struct obd_device *obd, export->exp_conn_cnt = 0; export->exp_lock_hash = NULL; export->exp_flock_hash = NULL; - /* 2 = class_handle_hash + last */ atomic_set(&export->exp_refcount, 2); atomic_set(&export->exp_rpc_count, 0); atomic_set(&export->exp_cb_count, 0); @@ -1087,11 +876,11 @@ struct obd_export *__class_new_export(struct obd_device *obd, spin_lock_init(&export->exp_uncommitted_replies_lock); INIT_LIST_HEAD(&export->exp_uncommitted_replies); INIT_LIST_HEAD(&export->exp_req_replay_queue); - INIT_LIST_HEAD_RCU(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_handle.h_link); INIT_LIST_HEAD(&export->exp_hp_rpcs); INIT_LIST_HEAD(&export->exp_reg_rpcs); class_handle_hash(&export->exp_handle, &export_handle_ops); - export->exp_last_request_time = ktime_get_real_seconds(); + export->exp_last_request_time = cfs_time_current_sec(); spin_lock_init(&export->exp_lock); spin_lock_init(&export->exp_rpc_lock); INIT_HLIST_NODE(&export->exp_uuid_hash); @@ -1100,24 +889,23 @@ struct obd_export *__class_new_export(struct obd_device *obd, spin_lock_init(&export->exp_bl_list_lock); INIT_LIST_HEAD(&export->exp_bl_list); INIT_LIST_HEAD(&export->exp_stale_list); - INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); export->exp_sp_peer = LUSTRE_SP_ANY; export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; export->exp_client_uuid = *cluuid; obd_init_export(export); - if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { - spin_lock(&obd->obd_dev_lock); - /* shouldn't happen, but might race */ - if (obd->obd_stopping) - GOTO(exit_unlock, rc = -ENODEV); + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); - hash = cfs_hash_getref(obd->obd_uuid_hash); - if (hash == NULL) - GOTO(exit_unlock, rc = -ENODEV); - spin_unlock(&obd->obd_dev_lock); + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); if (rc != 0) { LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", @@ -1129,24 +917,17 @@ struct obd_export *__class_new_export(struct obd_device *obd, at_init(&export->exp_bl_lock_at, obd_timeout, 0); spin_lock(&obd->obd_dev_lock); if (obd->obd_stopping) { - if (hash) - cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); - GOTO(exit_unlock, rc = -ESHUTDOWN); + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ENODEV); } - if (!is_self) { - class_incref(obd, "export", export); - list_add_tail(&export->exp_obd_chain_timed, - &obd->obd_exports_timed); - list_add(&export->exp_obd_chain, 
&obd->obd_exports); - obd->obd_num_exports++; - } else { - INIT_LIST_HEAD(&export->exp_obd_chain_timed); - INIT_LIST_HEAD(&export->exp_obd_chain); - } + class_incref(obd, "export", export); + list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); + list_add_tail(&export->exp_obd_chain_timed, + &export->exp_obd->obd_exports_timed); + export->exp_obd->obd_num_exports++; spin_unlock(&obd->obd_dev_lock); - if (hash) - cfs_hash_putref(hash); + cfs_hash_putref(hash); RETURN(export); exit_unlock: @@ -1160,29 +941,12 @@ struct obd_export *__class_new_export(struct obd_device *obd, OBD_FREE_PTR(export); return ERR_PTR(rc); } - -struct obd_export *class_new_export(struct obd_device *obd, - struct obd_uuid *uuid) -{ - return __class_new_export(obd, uuid, false); -} EXPORT_SYMBOL(class_new_export); -struct obd_export *class_new_export_self(struct obd_device *obd, - struct obd_uuid *uuid) -{ - return __class_new_export(obd, uuid, true); -} - void class_unlink_export(struct obd_export *exp) { class_handle_unhash(&exp->exp_handle); - if (exp->exp_obd->obd_self_export == exp) { - class_export_put(exp); - return; - } - spin_lock(&exp->exp_obd->obd_dev_lock); /* delete an uuid-export hashitem from hashtables */ if (!hlist_unhashed(&exp->exp_uuid_hash)) @@ -1217,7 +981,7 @@ void class_unlink_export(struct obd_export *exp) EXPORT_SYMBOL(class_unlink_export); /* Import management functions */ -static void obd_zombie_import_free(struct obd_import *imp) +static void class_import_destroy(struct obd_import *imp) { ENTRY; @@ -1239,13 +1003,21 @@ static void obd_zombie_import_free(struct obd_import *imp) } LASSERT(imp->imp_sec == NULL); - LASSERTF(atomic_read(&imp->imp_reqs) == 0, "%s: imp_reqs = %d\n", - imp->imp_obd->obd_name, atomic_read(&imp->imp_reqs)); class_decref(imp->imp_obd, "import", imp); - OBD_FREE_PTR(imp); - EXIT; + OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); + EXIT; } +static void import_handle_addref(void *import) +{ + class_import_get(import); +} + +static struct portals_handle_ops import_handle_ops = { + .hop_addref = import_handle_addref, + .hop_free = NULL, +}; + struct obd_import *class_import_get(struct obd_import *import) { atomic_inc(&import->imp_refcount); @@ -1260,6 +1032,7 @@ void class_import_put(struct obd_import *imp) { ENTRY; + LASSERT(list_empty(&imp->imp_zombie_chain)); LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, @@ -1271,6 +1044,8 @@ void class_import_put(struct obd_import *imp) obd_zombie_import_add(imp); } + /* catch possible import put race */ + LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); EXIT; } EXPORT_SYMBOL(class_import_put); @@ -1287,14 +1062,6 @@ static void init_imp_at(struct imp_at *at) { } } -static void obd_zombie_imp_cull(struct work_struct *ws) -{ - struct obd_import *import; - - import = container_of(ws, struct obd_import, imp_zombie_work); - obd_zombie_import_free(import); -} - struct obd_import *class_new_import(struct obd_device *obd) { struct obd_import *imp; @@ -1305,6 +1072,7 @@ struct obd_import *class_new_import(struct obd_device *obd) return NULL; INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); INIT_LIST_HEAD(&imp->imp_replay_list); INIT_LIST_HEAD(&imp->imp_sending_list); INIT_LIST_HEAD(&imp->imp_delayed_list); @@ -1318,21 +1086,20 @@ struct obd_import *class_new_import(struct obd_device *obd) imp->imp_obd = class_incref(obd, "import", imp); mutex_init(&imp->imp_sec_mutex); init_waitqueue_head(&imp->imp_recovery_waitq); - 
INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); - if (curr_pid_ns && curr_pid_ns->child_reaper) + if (curr_pid_ns->child_reaper) imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; else imp->imp_sec_refpid = 1; atomic_set(&imp->imp_refcount, 2); atomic_set(&imp->imp_unregistering, 0); - atomic_set(&imp->imp_reqs, 0); atomic_set(&imp->imp_inflight, 0); atomic_set(&imp->imp_replay_inflight, 0); - init_waitqueue_head(&imp->imp_replay_waitq); atomic_set(&imp->imp_inval_count, 0); INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); init_imp_at(&imp->imp_at); /* the default magic is V2, will be used in connect RPC, and @@ -1348,6 +1115,8 @@ void class_destroy_import(struct obd_import *import) LASSERT(import != NULL); LASSERT(import != LP_POISON); + class_handle_unhash(&import->imp_handle); + spin_lock(&import->imp_lock); import->imp_generation++; spin_unlock(&import->imp_lock); @@ -1560,7 +1329,7 @@ static void class_disconnect_export_list(struct list_head *list, class_export_get(exp); CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " - "last request at %lld\n", + "last request at %ld\n", exp->exp_obd->obd_name, obd_export_nid2str(exp), exp, exp->exp_last_request_time); /* release one export reference anyway */ @@ -1630,12 +1399,13 @@ void class_disconnect_stale_exports(struct obd_device *obd, spin_unlock(&exp->exp_lock); list_move(&exp->exp_obd_chain, &work_list); - evicted++; - CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", - obd->obd_name, exp->exp_client_uuid.uuid, - obd_export_nid2str(exp)); - print_export_data(exp, "EVICTING", 0, D_HA); - } + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? 
"" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0, D_HA); + } spin_unlock(&obd->obd_dev_lock); if (evicted) @@ -1686,6 +1456,15 @@ void class_fail_export(struct obd_export *exp) } EXPORT_SYMBOL(class_fail_export); +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) { struct cfs_hash *nid_hash; @@ -1823,6 +1602,10 @@ void dump_exports(struct obd_device *obd, int locks, int debug_level) list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) print_export_data(exp, "DELAYED", locks, debug_level); spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks, debug_level); + spin_unlock(&obd_zombie_impexp_lock); } void obd_exports_barrier(struct obd_device *obd) @@ -1849,6 +1632,83 @@ void obd_exports_barrier(struct obd_device *obd) } EXPORT_SYMBOL(obd_exports_barrier); +/* Total amount of zombies to be destroyed */ +static int zombies_count = 0; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + ENTRY; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); + EXIT; +} + +static DECLARE_COMPLETION(obd_zombie_start); +static DECLARE_COMPLETION(obd_zombie_stop); +static unsigned long obd_zombie_flags; +static DECLARE_WAIT_QUEUE_HEAD(obd_zombie_waitq); +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + RETURN(rc); +} + /** * Add export to the obd_zombe thread and notify it. 
*/ @@ -1858,8 +1718,12 @@ static void obd_zombie_export_add(struct obd_export *exp) { LASSERT(!list_empty(&exp->exp_obd_chain)); list_del_init(&exp->exp_obd_chain); spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); - queue_work(zombie_wq, &exp->exp_zombie_work); + obd_zombie_impexp_notify(); } /** @@ -1867,8 +1731,40 @@ static void obd_zombie_export_add(struct obd_export *exp) { */ static void obd_zombie_import_add(struct obd_import *imp) { LASSERT(imp->imp_sec == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); - queue_work(zombie_wq, &imp->imp_zombie_work); + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zomebie_impexp_thread get this notification. + * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; } /** @@ -1876,7 +1772,12 @@ static void obd_zombie_import_add(struct obd_import *imp) { */ void obd_zombie_barrier(void) { - flush_workqueue(zombie_wq); + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); } EXPORT_SYMBOL(obd_zombie_barrier); @@ -1951,24 +1852,58 @@ void obd_stale_export_adjust(struct obd_export *exp) } EXPORT_SYMBOL(obd_stale_export_adjust); +/** + * destroy zombie export/import thread. + */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. 
+ */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + RETURN(0); +} + + /** * start destroy zombie import/export thread */ int obd_zombie_impexp_init(void) { - zombie_wq = alloc_workqueue("obd_zombid", 0, 0); - if (!zombie_wq) - return -ENOMEM; + struct task_struct *task; - return 0; -} + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + wait_for_completion(&obd_zombie_start); + RETURN(0); +} /** * stop destroy zombie import/export thread */ void obd_zombie_impexp_stop(void) { - destroy_workqueue(zombie_wq); + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); LASSERT(list_empty(&obd_stale_exports)); } @@ -2054,14 +1989,14 @@ int obd_get_request_slot(struct client_obd *cli) int rc; spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) { - cli->cl_rpcs_in_flight++; + if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_r_in_flight++; spin_unlock(&cli->cl_loi_list_lock); return 0; } init_waitqueue_head(&orsw.orsw_waitq); - list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters); + list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list); orsw.orsw_signaled = false; spin_unlock(&cli->cl_loi_list_lock); @@ -2077,7 +2012,7 @@ int obd_get_request_slot(struct client_obd *cli) if (rc != 0) { if (!orsw.orsw_signaled) { if (list_empty(&orsw.orsw_entry)) - cli->cl_rpcs_in_flight--; + cli->cl_r_in_flight--; else list_del(&orsw.orsw_entry); } @@ -2099,15 +2034,15 @@ void obd_put_request_slot(struct client_obd *cli) struct obd_request_slot_waiter *orsw; spin_lock(&cli->cl_loi_list_lock); - cli->cl_rpcs_in_flight--; + cli->cl_r_in_flight--; /* If there is free slot, wakeup the first waiter. */ - if (!list_empty(&cli->cl_flight_waiters) && - likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) { - orsw = list_entry(cli->cl_flight_waiters.next, + if (!list_empty(&cli->cl_loi_read_list) && + likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_entry(cli->cl_loi_read_list.next, struct obd_request_slot_waiter, orsw_entry); list_del_init(&orsw->orsw_entry); - cli->cl_rpcs_in_flight++; + cli->cl_r_in_flight++; wake_up(&orsw->orsw_waitq); } spin_unlock(&cli->cl_loi_list_lock); @@ -2126,21 +2061,20 @@ int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) __u32 old; int diff; int i; + char *typ_name; int rc; if (max > OBD_MAX_RIF_MAX || max < 1) return -ERANGE; - CDEBUG(D_INFO, "%s: max = %hu max_mod = %u rif = %u\n", - cli->cl_import->imp_obd->obd_name, max, - cli->cl_max_mod_rpcs_in_flight, cli->cl_max_rpcs_in_flight); - - if (strcmp(cli->cl_import->imp_obd->obd_type->typ_name, - LUSTRE_MDC_NAME) == 0) { + typ_name = cli->cl_import->imp_obd->obd_type->typ_name; + if (strcmp(typ_name, LUSTRE_MDC_NAME) == 0) { /* adjust max_mod_rpcs_in_flight to ensure it is always * strictly lower that max_rpcs_in_flight */ if (max < 2) { - CERROR("%s: cannot set mdc.*.max_rpcs_in_flight=1\n", + CERROR("%s: cannot set max_rpcs_in_flight to 1 " + "because it must be higher than " + "max_mod_rpcs_in_flight value", cli->cl_import->imp_obd->obd_name); return -ERANGE; } @@ -2154,19 +2088,17 @@ int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) spin_lock(&cli->cl_loi_list_lock); old = cli->cl_max_rpcs_in_flight; cli->cl_max_rpcs_in_flight = max; - client_adjust_max_dirty(cli); - diff = max - old; /* We increase the max_rpcs_in_flight, then wakeup some waiters. 
*/ for (i = 0; i < diff; i++) { - if (list_empty(&cli->cl_flight_waiters)) + if (list_empty(&cli->cl_loi_read_list)) break; - orsw = list_entry(cli->cl_flight_waiters.next, + orsw = list_entry(cli->cl_loi_read_list.next, struct obd_request_slot_waiter, orsw_entry); list_del_init(&orsw->orsw_entry); - cli->cl_rpcs_in_flight++; + cli->cl_r_in_flight++; wake_up(&orsw->orsw_waitq); } spin_unlock(&cli->cl_loi_list_lock); @@ -2183,50 +2115,32 @@ EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) { - struct obd_connect_data *ocd; + struct obd_connect_data *ocd; __u16 maxmodrpcs; __u16 prev; if (max > OBD_MAX_RIF_MAX || max < 1) return -ERANGE; - ocd = &cli->cl_import->imp_connect_data; - CDEBUG(D_INFO, "%s: max = %hu flags = %llx, max_mod = %u rif = %u\n", - cli->cl_import->imp_obd->obd_name, max, ocd->ocd_connect_flags, - ocd->ocd_maxmodrpcs, cli->cl_max_rpcs_in_flight); - - if (max == OBD_MAX_RIF_MAX) - max = OBD_MAX_RIF_MAX - 1; - - /* Cannot exceed or equal max_rpcs_in_flight. If we are asked to - * increase this value, also bump up max_rpcs_in_flight to match. - */ + /* cannot exceed or equal max_rpcs_in_flight */ if (max >= cli->cl_max_rpcs_in_flight) { - CDEBUG(D_INFO, - "%s: increasing max_rpcs_in_flight=%hu to allow larger max_mod_rpcs_in_flight=%u\n", - cli->cl_import->imp_obd->obd_name, max + 1, max); - obd_set_max_rpcs_in_flight(cli, max + 1); + CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " + "higher or equal to max_rpcs_in_flight value (%u)\n", + cli->cl_import->imp_obd->obd_name, + max, cli->cl_max_rpcs_in_flight); + return -ERANGE; } - /* cannot exceed max modify RPCs in flight supported by the server, - * but verify ocd_connect_flags is at least initialized first. If - * not, allow it and fix value later in ptlrpc_connect_set_flags(). - */ - if (!ocd->ocd_connect_flags) { - maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; - } else if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) { + /* cannot exceed max modify RPCs in flight supported by the server */ + ocd = &cli->cl_import->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) maxmodrpcs = ocd->ocd_maxmodrpcs; - if (maxmodrpcs == 0) { /* connection not finished yet */ - maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; - CDEBUG(D_INFO, - "%s: partial connect, assume maxmodrpcs=%hu\n", - cli->cl_import->imp_obd->obd_name, maxmodrpcs); - } - } else { + else maxmodrpcs = 1; - } if (max > maxmodrpcs) { - CERROR("%s: can't set max_mod_rpcs_in_flight=%hu higher than ocd_maxmodrpcs=%hu returned by the server at connection\n", + CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " + "higher than max_mod_rpcs_per_client value (%hu) " + "returned by the server at connection\n", cli->cl_import->imp_obd->obd_name, max, maxmodrpcs); return -ERANGE; @@ -2247,6 +2161,8 @@ int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) } EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); + +#define pct(a, b) (b ? 
a * 100 / b : 0) int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq) { @@ -2272,7 +2188,7 @@ int obd_mod_rpc_stats_seq_show(struct client_obd *cli, for (i = 0; i < OBD_HIST_MAX; i++) { unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; mod_cum += mod; - seq_printf(seq, "%d:\t\t%10lu %3u %3u\n", + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu\n", i, mod, pct(mod, mod_tot), pct(mod_cum, mod_tot)); if (mod_cum == mod_tot) @@ -2284,6 +2200,8 @@ int obd_mod_rpc_stats_seq_show(struct client_obd *cli, return 0; } EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); +#undef pct + /* The number of modify RPCs sent in parallel is limited * because the server has a finite number of slots per client to @@ -2325,7 +2243,7 @@ static inline bool obd_skip_mod_rpc_slot(const struct lookup_intent *it) if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || it->it_op == IT_READDIR || - (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) + (it->it_op == IT_LAYOUT && !(it->it_flags & FMODE_WRITE)))) return true; return false; } @@ -2379,9 +2297,8 @@ __u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, "opc %u, max %hu\n", cli->cl_import->imp_obd->obd_name, opc, max); - l_wait_event_exclusive(cli->cl_mod_rpcs_waitq, - obd_mod_rpc_slot_avail(cli, close_req), - &lwi); + l_wait_event(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, close_req), &lwi); } while (true); } EXPORT_SYMBOL(obd_get_mod_rpc_slot); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c index 1fcbb2a839f9d..b45c6d6a55357 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,6 +47,15 @@ #include #include +#define lustre_get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define lustre_put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + /* * groups_search() is copied from linux kernel! * A simple bsearch. 
@@ -101,12 +110,12 @@ EXPORT_SYMBOL(lustre_groups_from_list); /* a simple shell-metzner sort */ void lustre_groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; + int base, max, stride; + int gidsetsize = group_info->ngroups; - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; while (stride) { max = gidsetsize - stride; @@ -153,10 +162,9 @@ int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) if (!group_info) return 0; - atomic_inc(&group_info->usage); + lustre_get_group_info(group_info); rc = lustre_groups_search(group_info, grp); - if (atomic_dec_and_test(&group_info->usage)) - groups_free(group_info); + lustre_put_group_info(group_info); } return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c deleted file mode 100644 index 4a6d27aa6ae36..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2018, DataDirect Networks Storage. - * Author: Li Xi. 
- * - * General data integrity functions - */ -#include -#include -#include -#include -#include - -#if IS_ENABLED(CONFIG_CRC_T10DIF) -__u16 obd_dif_crc_fn(void *data, unsigned int len) -{ - return cpu_to_be16(crc_t10dif(data, len)); -} -EXPORT_SYMBOL(obd_dif_crc_fn); - -__u16 obd_dif_ip_fn(void *data, unsigned int len) -{ - return ip_compute_csum(data, len); -} -EXPORT_SYMBOL(obd_dif_ip_fn); - -int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, - __u32 offset, __u32 length, - __u16 *guard_start, int guard_number, - int *used_number, int sector_size, - obd_dif_csum_fn *fn) -{ - unsigned int i = offset; - unsigned int end = offset + length; - char *data_buf; - __u16 *guard_buf = guard_start; - unsigned int data_size; - int used = 0; - - data_buf = kmap(page) + offset; - while (i < end) { - if (used >= guard_number) { - CERROR("%s: unexpected used guard number of DIF %u/%u, " - "data length %u, sector size %u: rc = %d\n", - obd_name, used, guard_number, length, - sector_size, -E2BIG); - return -E2BIG; - } - data_size = min(round_up(i + 1, sector_size), end) - i; - *guard_buf = fn(data_buf, data_size); - guard_buf++; - data_buf += data_size; - i += data_size; - used++; - } - kunmap(page); - *used_number = used; - - return 0; -} -EXPORT_SYMBOL(obd_page_dif_generate_buffer); - -static int __obd_t10_performance_test(const char *obd_name, - enum cksum_types cksum_type, - struct page *data_page, - int repeat_number) -{ - unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); - struct ahash_request *req; - obd_dif_csum_fn *fn = NULL; - unsigned int bufsize; - unsigned char *buffer; - struct page *__page; - __u16 *guard_start; - int guard_number; - int used_number = 0; - int sector_size = 0; - __u32 cksum; - int rc = 0; - int rc2; - int used; - int i; - - obd_t10_cksum2dif(cksum_type, &fn, §or_size); - if (!fn) - return -EINVAL; - - __page = alloc_page(GFP_KERNEL); - if (__page == NULL) - return -ENOMEM; - - req = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", - obd_name, cfs_crypto_hash_name(cfs_alg), rc); - GOTO(out, rc); - } - - buffer = kmap(__page); - guard_start = (__u16 *)buffer; - guard_number = PAGE_SIZE / sizeof(*guard_start); - for (i = 0; i < repeat_number; i++) { - /* - * The left guard number should be able to hold checksums of a - * whole page - */ - rc = obd_page_dif_generate_buffer(obd_name, data_page, 0, - PAGE_SIZE, - guard_start + used_number, - guard_number - used_number, - &used, sector_size, fn); - if (rc) - break; - - used_number += used; - if (used_number == guard_number) { - cfs_crypto_hash_update_page(req, __page, 0, - used_number * sizeof(*guard_start)); - used_number = 0; - } - } - kunmap(__page); - if (rc) - GOTO(out_final, rc); - - if (used_number != 0) - cfs_crypto_hash_update_page(req, __page, 0, - used_number * sizeof(*guard_start)); - - bufsize = sizeof(cksum); -out_final: - rc2 = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); - rc = rc ? 
rc : rc2; -out: - __free_page(__page); - - return rc; -} - -/** - * Array of T10PI checksum algorithm speed in MByte per second - */ -static int obd_t10_cksum_speeds[OBD_T10_CKSUM_MAX]; - -static enum obd_t10_cksum_type -obd_t10_cksum2type(enum cksum_types cksum_type) -{ - switch (cksum_type) { - case OBD_CKSUM_T10IP512: - return OBD_T10_CKSUM_IP512; - case OBD_CKSUM_T10IP4K: - return OBD_T10_CKSUM_IP4K; - case OBD_CKSUM_T10CRC512: - return OBD_T10_CKSUM_CRC512; - case OBD_CKSUM_T10CRC4K: - return OBD_T10_CKSUM_CRC4K; - default: - return OBD_T10_CKSUM_UNKNOWN; - } -} - -static const char *obd_t10_cksum_name(enum obd_t10_cksum_type index) -{ - DECLARE_CKSUM_NAME; - - /* Need to skip "crc32", "adler", "crc32c", "reserved" */ - return cksum_name[3 + index]; -} - -/** - * Compute the speed of specified T10PI checksum type - * - * Run a speed test on the given T10PI checksum on buffer using a 1MB buffer - * size. This is a reasonable buffer size for Lustre RPCs, even if the actual - * RPC size is larger or smaller. - * - * The speed is stored internally in the obd_t10_cksum_speeds[] array, and - * is available through the obd_t10_cksum_speed() function. - * - * This function needs to stay the same as cfs_crypto_performance_test() so - * that the speeds are comparable. And this function should reflect the real - * cost of the checksum calculation. - * - * \param[in] obd_name name of the OBD device - * \param[in] cksum_type checksum type (OBD_CKSUM_T10*) - */ -static void obd_t10_performance_test(const char *obd_name, - enum cksum_types cksum_type) -{ - enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); - const int buf_len = max(PAGE_SIZE, 1048576UL); - unsigned long bcount; - unsigned long start; - unsigned long end; - struct page *page; - int rc = 0; - void *buf; - - page = alloc_page(GFP_KERNEL); - if (page == NULL) { - rc = -ENOMEM; - goto out; - } - - buf = kmap(page); - memset(buf, 0xAD, PAGE_SIZE); - kunmap(page); - - for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), - bcount = 0; time_before(jiffies, end) && rc == 0; bcount++) { - rc = __obd_t10_performance_test(obd_name, cksum_type, page, - buf_len / PAGE_SIZE); - if (rc) - break; - } - end = jiffies; - __free_page(page); -out: - if (rc) { - obd_t10_cksum_speeds[index] = rc; - CDEBUG(D_INFO, "%s: T10 checksum algorithm %s test error: " - "rc = %d\n", obd_name, obd_t10_cksum_name(index), rc); - } else { - unsigned long tmp; - - tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * - 1000) / (1024 * 1024); - obd_t10_cksum_speeds[index] = (int)tmp; - CDEBUG(D_CONFIG, "%s: T10 checksum algorithm %s speed = %d " - "MB/s\n", obd_name, obd_t10_cksum_name(index), - obd_t10_cksum_speeds[index]); - } -} -#endif /* CONFIG_CRC_T10DIF */ - -int obd_t10_cksum_speed(const char *obd_name, - enum cksum_types cksum_type) -{ -#if IS_ENABLED(CONFIG_CRC_T10DIF) - enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); - - if (unlikely(obd_t10_cksum_speeds[index] == 0)) { - static DEFINE_MUTEX(obd_t10_cksum_speed_mutex); - - mutex_lock(&obd_t10_cksum_speed_mutex); - if (obd_t10_cksum_speeds[index] == 0) - obd_t10_performance_test(obd_name, cksum_type); - mutex_unlock(&obd_t10_cksum_speed_mutex); - } - - return obd_t10_cksum_speeds[index]; -#else /* !CONFIG_CRC_T10DIF */ - return 0; -#endif /* !CONFIG_CRC_T10DIF */ -} -EXPORT_SYMBOL(obd_t10_cksum_speed); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c deleted file mode 100644 index 
b7a08d495b2ce..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c +++ /dev/null @@ -1,575 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2014, Intel Corporation. - * - * Copyright 2017 Cray Inc, all rights reserved. - * Author: Ben Evans. - * - * Store PID->JobID mappings - */ - -#define DEBUG_SUBSYSTEM S_RPC -#include -#ifdef HAVE_UIDGID_HEADER -#include -#endif -#include - -#include -#include -#include -#include - -static struct cfs_hash *jobid_hash; -static struct cfs_hash_ops jobid_hash_ops; -spinlock_t jobid_hash_lock; - -#define RESCAN_INTERVAL 30 -#define DELETE_INTERVAL 300 - -char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; -char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u"; - -/** - * Structure to store a single PID->JobID mapping - */ -struct jobid_pid_map { - struct hlist_node jp_hash; - time64_t jp_time; - spinlock_t jp_lock; /* protects jp_jobid */ - char jp_jobid[LUSTRE_JOBID_SIZE]; - unsigned int jp_joblen; - atomic_t jp_refcount; - pid_t jp_pid; -}; - -/* - * Get jobid of current process by reading the environment variable - * stored in between the "env_start" & "env_end" of task struct. - * - * If some job scheduler doesn't store jobid in the "env_start/end", - * then an upcall could be issued here to get the jobid by utilizing - * the userspace tools/API. Then, the jobid must be cached. - */ -int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) -{ - int rc; - - rc = cfs_get_environ(jobid_var, jobid, jobid_len); - if (!rc) - goto out; - - if (rc == -EOVERFLOW) { - /* For the PBS_JOBID and LOADL_STEP_ID keys (which are - * variable length strings instead of just numbers), it - * might make sense to keep the unique parts for JobID, - * instead of just returning an error. That means a - * larger temp buffer for cfs_get_environ(), then - * truncating the string at some separator to fit into - * the specified jobid_len. Fix later if needed. */ - static ktime_t printed; - - if (unlikely(ktime_to_ns(printed) == 0 || - ktime_after(ktime_get(), - ktime_add_ns(printed, - 3600*24*NSEC_PER_SEC)))) { - LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", - obd_jobid_var, *jobid_len); - printed = ktime_get(); - } - - rc = 0; - } else { - CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || - rc == -EDEADLK) ? 
D_INFO : D_ERROR, - "jobid: get '%s' failed: rc = %d\n", - obd_jobid_var, rc); - } - -out: - return rc; -} - -/* - * jobid_should_free_item - * - * Each item is checked to see if it should be released - * Removed from hash table by caller - * Actually freed in jobid_put_locked - * - * Returns 1 if item is to be freed, 0 if it is to be kept - */ - -static int jobid_should_free_item(void *obj, void *data) -{ - char *jobid = data; - struct jobid_pid_map *pidmap = obj; - int rc = 0; - - if (obj == NULL) - return 0; - - if (jobid == NULL) { - WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); - return 1; - } - - spin_lock(&pidmap->jp_lock); - /* prevent newly inserted items from deleting */ - if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) - rc = 1; - else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) - rc = 1; - else if (strcmp(pidmap->jp_jobid, jobid) == 0) - rc = 1; - spin_unlock(&pidmap->jp_lock); - - return rc; -} - -/* - * jobid_name_is_valid - * - * Checks if the jobid is a Lustre process - * - * Returns true if jobid is valid - * Returns false if jobid looks like it's a Lustre process - */ -static bool jobid_name_is_valid(char *jobid) -{ - const char *const lustre_reserved[] = { "ll_ping", "ptlrpc", - "ldlm", "ll_sa", NULL }; - int i; - - if (jobid[0] == '\0') - return false; - - for (i = 0; lustre_reserved[i] != NULL; i++) { - if (strncmp(jobid, lustre_reserved[i], - strlen(lustre_reserved[i])) == 0) - return false; - } - return true; -} - -/* - * jobid_get_from_cache() - * - * Returns contents of jobid_var from process environment for current PID. - * This will be cached for some time to avoid overhead scanning environment. - * - * Return: -ENOMEM if allocating a new pidmap fails - * -ENOENT if no entry could be found - * +ve string length for success (something was returned in jobid) - */ -static int jobid_get_from_cache(char *jobid, size_t joblen) -{ - static time64_t last_expire; - bool expire_cache = false; - pid_t pid = current_pid(); - struct jobid_pid_map *pidmap = NULL; - time64_t now = ktime_get_real_seconds(); - int rc = 0; - ENTRY; - - LASSERT(jobid_hash != NULL); - - /* scan hash periodically to remove old PID entries from cache */ - spin_lock(&jobid_hash_lock); - if (unlikely(last_expire + DELETE_INTERVAL <= now)) { - expire_cache = true; - last_expire = now; - } - spin_unlock(&jobid_hash_lock); - - if (expire_cache) - cfs_hash_cond_del(jobid_hash, jobid_should_free_item, - "intentionally_bad_jobid"); - - /* first try to find PID in the hash and use that value */ - pidmap = cfs_hash_lookup(jobid_hash, &pid); - if (pidmap == NULL) { - struct jobid_pid_map *pidmap2; - - OBD_ALLOC_PTR(pidmap); - if (pidmap == NULL) - GOTO(out, rc = -ENOMEM); - - pidmap->jp_pid = pid; - pidmap->jp_time = 0; - pidmap->jp_jobid[0] = '\0'; - spin_lock_init(&pidmap->jp_lock); - INIT_HLIST_NODE(&pidmap->jp_hash); - /* - * @pidmap might be reclaimed just after it is added into - * hash list, init @jp_refcount as 1 to make sure memory - * could be not freed during access. - */ - atomic_set(&pidmap->jp_refcount, 1); - - /* - * Add the newly created map to the hash, on key collision we - * lost a racing addition and must destroy our newly allocated - * map. The object which exists in the hash will be returned. 
- */ - pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, - &pidmap->jp_hash); - if (unlikely(pidmap != pidmap2)) { - CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n", - pid); - OBD_FREE_PTR(pidmap); - pidmap = pidmap2; - } - } - - /* - * If pidmap is old (this is always true for new entries) refresh it. - * If obd_jobid_var is not found, cache empty entry and try again - * later, to avoid repeat lookups for PID if obd_jobid_var missing. - */ - spin_lock(&pidmap->jp_lock); - if (pidmap->jp_time + RESCAN_INTERVAL <= now) { - char env_jobid[LUSTRE_JOBID_SIZE] = ""; - int env_len = sizeof(env_jobid); - - pidmap->jp_time = now; - - spin_unlock(&pidmap->jp_lock); - rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len); - - CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n", - pidmap->jp_pid, env_jobid); - spin_lock(&pidmap->jp_lock); - if (!rc) { - pidmap->jp_joblen = env_len; - strlcpy(pidmap->jp_jobid, env_jobid, - sizeof(pidmap->jp_jobid)); - rc = 0; - } else if (rc == -ENOENT) { - /* It might have been deleted, clear out old entry */ - pidmap->jp_joblen = 0; - pidmap->jp_jobid[0] = '\0'; - } - } - - /* - * Regardless of how pidmap was found, if it contains a valid entry - * use that for now. If there was a technical error (e.g. -ENOMEM) - * use the old cached value until it can be looked up again properly. - * If a cached missing entry was found, return -ENOENT. - */ - if (pidmap->jp_joblen) { - strlcpy(jobid, pidmap->jp_jobid, joblen); - joblen = pidmap->jp_joblen; - rc = 0; - } else if (!rc) { - rc = -ENOENT; - } - spin_unlock(&pidmap->jp_lock); - - cfs_hash_put(jobid_hash, &pidmap->jp_hash); - - EXIT; -out: - return rc < 0 ? rc : joblen; -} - -/* - * jobid_interpret_string() - * - * Interpret the jobfmt string to expand specified fields, like coredumps do: - * %e = executable - * %g = gid - * %h = hostname - * %j = jobid from environment - * %p = pid - * %u = uid - * - * Unknown escape strings are dropped. Other characters are copied through, - * excluding whitespace (to avoid making jobid parsing difficult). - * - * Return: -EOVERFLOW if the expanded string does not fit within @joblen - * 0 for success - */ -static int jobid_interpret_string(const char *jobfmt, char *jobid, - ssize_t joblen) -{ - char c; - - while ((c = *jobfmt++) && joblen > 1) { - char f; - int l; - - if (isspace(c)) /* Don't allow embedded spaces */ - continue; - - if (c != '%') { - *jobid = c; - joblen--; - jobid++; - continue; - } - - switch ((f = *jobfmt++)) { - case 'e': /* executable name */ - l = snprintf(jobid, joblen, "%s", current_comm()); - break; - case 'g': /* group ID */ - l = snprintf(jobid, joblen, "%u", - from_kgid(&init_user_ns, current_fsgid())); - break; - case 'h': /* hostname */ - l = snprintf(jobid, joblen, "%s", - init_utsname()->nodename); - break; - case 'j': /* jobid stored in process environment */ - l = jobid_get_from_cache(jobid, joblen); - if (l < 0) - l = 0; - break; - case 'p': /* process ID */ - l = snprintf(jobid, joblen, "%u", current_pid()); - break; - case 'u': /* user ID */ - l = snprintf(jobid, joblen, "%u", - from_kuid(&init_user_ns, current_fsuid())); - break; - case '\0': /* '%' at end of format string */ - l = 0; - goto out; - default: /* drop unknown %x format strings */ - l = 0; - break; - } - jobid += l; - joblen -= l; - } - /* - * This points at the end of the buffer, so long as jobid is always - * incremented the same amount as joblen is decremented. - */ -out: - jobid[joblen - 1] = '\0'; - - return joblen < 0 ? 
-EOVERFLOW : 0; -} - -/* - * Hash initialization, copied from server-side job stats bucket sizes - */ -#define HASH_JOBID_BKT_BITS 5 -#define HASH_JOBID_CUR_BITS 7 -#define HASH_JOBID_MAX_BITS 12 - -int jobid_cache_init(void) -{ - int rc = 0; - ENTRY; - - if (jobid_hash) - return 0; - - spin_lock_init(&jobid_hash_lock); - jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS, - HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, - 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, - &jobid_hash_ops, CFS_HASH_DEFAULT); - if (!jobid_hash) - rc = -ENOMEM; - - RETURN(rc); -} -EXPORT_SYMBOL(jobid_cache_init); - -void jobid_cache_fini(void) -{ - struct cfs_hash *tmp_hash; - ENTRY; - - spin_lock(&jobid_hash_lock); - tmp_hash = jobid_hash; - jobid_hash = NULL; - spin_unlock(&jobid_hash_lock); - - if (tmp_hash != NULL) { - cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); - cfs_hash_putref(tmp_hash); - } - - EXIT; -} -EXPORT_SYMBOL(jobid_cache_fini); - -/* - * Hash operations for pid<->jobid - */ -static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, - unsigned mask) -{ - return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); -} - -static void *jobid_key(struct hlist_node *hnode) -{ - struct jobid_pid_map *pidmap; - - pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); - return &pidmap->jp_pid; -} - -static int jobid_keycmp(const void *key, struct hlist_node *hnode) -{ - const pid_t *pid_key1; - const pid_t *pid_key2; - - LASSERT(key != NULL); - pid_key1 = (pid_t *)key; - pid_key2 = (pid_t *)jobid_key(hnode); - - return *pid_key1 == *pid_key2; -} - -static void *jobid_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct jobid_pid_map, jp_hash); -} - -static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct jobid_pid_map *pidmap; - - pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); - - atomic_inc(&pidmap->jp_refcount); -} - -static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct jobid_pid_map *pidmap; - - if (hnode == NULL) - return; - - pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); - LASSERT(atomic_read(&pidmap->jp_refcount) > 0); - if (atomic_dec_and_test(&pidmap->jp_refcount)) { - CDEBUG(D_INFO, "Freeing: %d->%s\n", - pidmap->jp_pid, pidmap->jp_jobid); - - OBD_FREE_PTR(pidmap); - } -} - -static struct cfs_hash_ops jobid_hash_ops = { - .hs_hash = jobid_hashfn, - .hs_keycmp = jobid_keycmp, - .hs_key = jobid_key, - .hs_object = jobid_object, - .hs_get = jobid_get, - .hs_put = jobid_put_locked, - .hs_put_locked = jobid_put_locked, -}; - -/** - * Generate the job identifier string for this process for tracking purposes. - * - * Fill in @jobid string based on the value of obd_jobid_var: - * JOBSTATS_DISABLE: none - * JOBSTATS_NODELOCAL: content of obd_jobid_node (jobid_interpret_string()) - * JOBSTATS_PROCNAME_UID: process name/UID - * anything else: look up obd_jobid_var in the processes environment - * - * Return -ve error number, 0 on success. 
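jobid_interpret_string() above expands coredump-style escapes (%e, %g, %h, %j, %p, %u) into the jobid buffer, dropping unknown escapes and embedded whitespace. A minimal userspace sketch of that expansion, handling only %e, %p and %u and taking the executable name as a parameter; the in-kernel code reads all values from the current task, so everything here is illustrative only.

#include <stdio.h>
#include <unistd.h>
#include <ctype.h>

static int expand_jobfmt(const char *fmt, const char *comm,
			 char *out, size_t outlen)
{
	size_t used = 0;
	char c;

	while ((c = *fmt++) && used + 1 < outlen) {
		int n = 0;

		if (isspace((unsigned char)c))	/* no embedded whitespace */
			continue;
		if (c != '%') {
			out[used++] = c;
			continue;
		}
		switch (*fmt++) {
		case 'e':	/* executable name */
			n = snprintf(out + used, outlen - used, "%s", comm);
			break;
		case 'p':	/* process ID */
			n = snprintf(out + used, outlen - used, "%d",
				     (int)getpid());
			break;
		case 'u':	/* user ID */
			n = snprintf(out + used, outlen - used, "%u",
				     (unsigned int)getuid());
			break;
		case '\0':	/* '%' at end of format string */
			fmt--;
			break;
		default:	/* unknown escapes are dropped */
			break;
		}
		if (n < 0 || (size_t)n >= outlen - used)
			return -1;	/* buffer too small, like -EOVERFLOW */
		used += n;
	}
	out[used] = '\0';
	return 0;
}

int main(void)
{
	char jobid[64];

	if (expand_jobfmt("%e.%u", "bash", jobid, sizeof(jobid)) == 0)
		printf("jobid: %s\n", jobid);	/* e.g. "bash.1000" */
	return 0;
}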
- */ -int lustre_get_jobid(char *jobid, size_t joblen) -{ - int rc = 0; - ENTRY; - - if (unlikely(joblen < 2)) { - if (joblen == 1) - jobid[0] = '\0'; - RETURN(-EINVAL); - } - - if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) { - /* Jobstats isn't enabled */ - memset(jobid, 0, joblen); - } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { - /* Whole node dedicated to single job */ - rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); - } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - rc = jobid_interpret_string("%e.%u", jobid, joblen); - } else if (jobid_name_is_valid(current_comm())) { - /* - * obd_jobid_var holds the jobid environment variable name. - * Skip initial check if obd_jobid_name already uses "%j", - * otherwise try just "%j" first, then fall back to whatever - * is in obd_jobid_name if obd_jobid_var is not found. - */ - rc = -EAGAIN; - if (!strnstr(obd_jobid_name, "%j", joblen)) - rc = jobid_get_from_cache(jobid, joblen); - - /* fall back to jobid_node if jobid_var not in environment */ - if (rc < 0) { - int rc2 = jobid_interpret_string(obd_jobid_name, - jobid, joblen); - if (!rc2) - rc = 0; - } - } - - RETURN(rc); -} -EXPORT_SYMBOL(lustre_get_jobid); - -/* - * lustre_jobid_clear - * - * Search cache for JobID given by @find_jobid. - * If any entries in the hash table match the value, they are removed - */ -void lustre_jobid_clear(const char *find_jobid) -{ - char jobid[LUSTRE_JOBID_SIZE]; - char *end; - - if (jobid_hash == NULL) - return; - - strlcpy(jobid, find_jobid, sizeof(jobid)); - /* trim \n off the end of the incoming jobid */ - end = strchr(jobid, '\n'); - if (end && *end == '\n') - *end = '\0'; - - CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); - cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); - - CDEBUG(D_INFO, "%d items remain in jobID table\n", - atomic_read(&jobid_hash->hs_count)); -} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c index 7afb9484a8a69..79d176dcd3d53 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -23,7 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,8 +35,7 @@ */ #define DEBUG_SUBSYSTEM S_CLASS - -#include +#define D_KUC D_OTHER #include #include @@ -74,7 +73,7 @@ int libcfs_kkuc_msg_put(struct file *filp, void *payload) if (rc < 0) CWARN("message send failed (%d)\n", rc); else - CDEBUG(D_HSM, "Sent message rc=%d, fp=%p\n", rc, filp); + CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); return rc; } @@ -143,7 +142,7 @@ int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, list_add(®->kr_chain, &kkuc_groups[group]); up_write(&kg_sem); - CDEBUG(D_HSM, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); return 0; } @@ -175,7 +174,7 @@ int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) if (obd_uuid_equals(uuid, ®->kr_uuid) && (uid == 0 || uid == reg->kr_uid)) { list_del(®->kr_chain); - CDEBUG(D_HSM, "Removed uid=%d fp=%p from group %d\n", + CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", reg->kr_uid, reg->kr_fp, group); if (reg->kr_fp != NULL) fput(reg->kr_fp); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c index cf17a50999f8d..a1bcc3d7de608 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -21,12 +21,13 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. * Use is subject to license terms. * * Author: Di Wang */ +#include #include #include @@ -143,11 +144,10 @@ int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, reclen = lname->ln_namelen + sizeof(struct link_ea_entry); if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { /* Use 32-bits to save the overflow time, although it will - * shrink the ktime_get_real_seconds() returned 64-bits value + * shrink the cfs_time_current_sec() returned 64-bits value * to 32-bits value, it is still quite large and can be used - * for about 140 years. That is enough. - */ - leh->leh_overflow_time = ktime_get_real_seconds(); + * for about 140 years. That is enough. */ + leh->leh_overflow_time = cfs_time_current_sec(); if (unlikely(leh->leh_overflow_time == 0)) leh->leh_overflow_time++; @@ -236,7 +236,7 @@ int linkea_overflow_shrink(struct linkea_data *ldata) if (unlikely(leh->leh_reccount == 0)) return 0; - leh->leh_overflow_time = ktime_get_real_seconds(); + leh->leh_overflow_time = cfs_time_current_sec(); if (unlikely(leh->leh_overflow_time == 0)) leh->leh_overflow_time++; ldata->ld_reclen = 0; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c new file mode 100644 index 0000000000000..dabbf58057caf --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,582 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2016, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > BIT(30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + ENTRY; + + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + 
RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + ENTRY; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef ENABLE_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) { + len = sprintf(buf, "LBUG\n"); + healthy = false; + } + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + 
LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + len = sprintf(buf, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_node)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_node); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + /* clear previous value */ + memset(obd_jobid_node, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_node, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_node[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_node[count - 1] = 0; + } + + return count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + 
atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct kobject *lustre_kobj; +EXPORT_SYMBOL_GPL(lustre_kobj); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + ENTRY; + + lustre_kobj = kobject_create_and_add("lustre", fs_kobj); + if (lustre_kobj == NULL) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(lustre_kobj, &lustre_attr_group); + if (rc) { + kobject_put(lustre_kobj); + goto out; + } + + rc = obd_sysctl_init(); + if (rc) { + kobject_put(lustre_kobj); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + if (IS_ERR_OR_NULL(debugfs_lustre_root)) { + rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) + : -ENOMEM; + debugfs_lustre_root = NULL; + kobject_put(lustre_kobj); + goto out; + } + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + if (IS_ERR_OR_NULL(file)) { + rc = file ? PTR_ERR(file) : -ENOMEM; + kobject_put(lustre_kobj); + goto out; + } + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + kobject_put(lustre_kobj); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + kobject_put(lustre_kobj); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 0000000000000..5f8e2b55d7258 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,157 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 0000000000000..e8016c77c7506 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,190 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + {__ATTR(name, 0644, \ + static_uintvalue_show, \ + static_uintvalue_store),\ + value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((cfs_totalram_pages() / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); + +#ifdef HAVE_SERVER_SUPPORT 
+LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static struct attribute *lustre_attrs[] = { + &lustre_sattr_timeout.u.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, +#endif + NULL, +}; + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +int obd_sysctl_init(void) +{ + return sysfs_create_group(lustre_kobj, &lustre_attr_group); +} + +void obd_sysctl_clean(void) +{ + sysfs_remove_group(lustre_kobj, &lustre_attr_group); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c index e9228b33339f3..61c9a1d1f4e8a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,10 +47,8 @@ #include #include #include -#include #include #include "llog_internal.h" - /* * Allocate a new log or catalog handle * Used inside llog_open(). 
@@ -65,7 +63,6 @@ static struct llog_handle *llog_alloc_handle(void) init_rwsem(&loghandle->lgh_lock); mutex_init(&loghandle->lgh_hdr_mutex); - init_rwsem(&loghandle->lgh_last_sem); INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); atomic_set(&loghandle->lgh_refcount, 1); @@ -92,30 +89,16 @@ static void llog_free_handle(struct llog_handle *loghandle) OBD_FREE_PTR(loghandle); } -struct llog_handle *llog_handle_get(struct llog_handle *loghandle) +void llog_handle_get(struct llog_handle *loghandle) { - if (atomic_inc_not_zero(&loghandle->lgh_refcount)) - return loghandle; - return NULL; + atomic_inc(&loghandle->lgh_refcount); } -int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle) +void llog_handle_put(struct llog_handle *loghandle) { - int rc = 0; - - if (atomic_dec_and_test(&loghandle->lgh_refcount)) { - struct llog_operations *lop; - - rc = llog_handle2ops(loghandle, &lop); - if (!rc) { - if (lop->lop_close) - rc = lop->lop_close(env, loghandle); - else - rc = -EOPNOTSUPP; - } + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) llog_free_handle(loghandle); - } - return rc; } static int llog_declare_destroy(const struct lu_env *env, @@ -152,7 +135,7 @@ int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, RETURN(-EOPNOTSUPP); LASSERT(handle->lgh_obj != NULL); - if (!llog_exist(handle)) + if (!dt_object_exists(handle->lgh_obj)) RETURN(0); rc = lop->lop_destroy(env, handle, th); @@ -181,14 +164,11 @@ int llog_destroy(const struct lu_env *env, struct llog_handle *handle) RETURN(rc); } - if (!llog_exist(handle)) + if (!dt_object_exists(handle->lgh_obj)) RETURN(0); dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); - if (unlikely(unlikely(dt->dd_rdonly))) - RETURN(-EROFS); - th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -216,21 +196,14 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, { struct llog_thread_info *lgi = llog_info(env); struct dt_device *dt; - struct llog_log_hdr *llh; + struct llog_log_hdr *llh = loghandle->lgh_hdr; struct thandle *th; - __u32 tmp_lgc_index; int rc; int rc1; bool subtract_count = false; ENTRY; - LASSERT(loghandle != NULL); - LASSERT(loghandle->lgh_ctxt != NULL); - LASSERT(loghandle->lgh_obj != NULL); - - llh = loghandle->lgh_hdr; - CDEBUG(D_RPCTRACE, "Canceling %d in log "DFID"\n", index, PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); @@ -239,10 +212,11 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, RETURN(-EINVAL); } - dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); - if (unlikely(unlikely(dt->dd_rdonly))) - RETURN(0); + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); th = dt_trans_create(env, dt); if (IS_ERR(th)) @@ -273,19 +247,12 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, loghandle->lgh_hdr->llh_count--; subtract_count = true; - - /* Since llog_process_thread use lgi_cookie, it`s better to save them - * and restore after using - */ - tmp_lgc_index = lgi->lgi_cookie.lgc_index; /* Pass this index to llog_osd_write_rec(), which will use the index * to only update the necesary bitmap. 
*/ lgi->lgi_cookie.lgc_index = index; /* update header */ rc = llog_write_rec(env, loghandle, &llh->llh_hdr, &lgi->lgi_cookie, LLOG_HEADER_IDX, th); - lgi->lgi_cookie.lgc_index = tmp_lgc_index; - if (rc != 0) GOTO(out_unlock, rc); @@ -304,7 +271,7 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, * be accessed anymore, let's return 0 for now, and * the orphan will be handled by LFSCK. */ CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", - loghandle2name(loghandle), + loghandle->lgh_ctxt->loc_obd->obd_name, PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); GOTO(out_unlock, rc = 0); } @@ -399,7 +366,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, (llh->llh_flags & LLOG_F_IS_CAT && flags & LLOG_F_IS_PLAIN))) { CERROR("%s: llog type is %s but initializing %s\n", - loghandle2name(handle), + handle->lgh_ctxt->loc_obd->obd_name, llh->llh_flags & LLOG_F_IS_CAT ? "catalog" : "plain", flags & LLOG_F_IS_CAT ? "catalog" : "plain"); @@ -419,7 +386,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, if (unlikely(uuid && !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { CERROR("%s: llog uuid mismatch: %s/%s\n", - loghandle2name(handle), + handle->lgh_ctxt->loc_obd->obd_name, (char *)uuid->uuid, (char *)llh->llh_tgtuuid.uuid); GOTO(out, rc = -EEXIST); @@ -432,8 +399,8 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, llh->llh_flags |= LLOG_F_IS_FIXSIZE; } else if (!(flags & LLOG_F_IS_PLAIN)) { CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", - loghandle2name(handle), flags, LLOG_F_IS_CAT, - LLOG_F_IS_PLAIN); + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); rc = -EINVAL; } llh->llh_flags |= fmt; @@ -446,37 +413,12 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, } EXPORT_SYMBOL(llog_init_handle); -int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec) -{ - int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len; - - if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { - CERROR("%s: record is too large: %d > %d\n", - loghandle2name(llh), rec->lrh_len, chunk_size); - return -EINVAL; - } - if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { - CERROR("%s: index is too high: %d\n", - loghandle2name(llh), rec->lrh_index); - return -EINVAL; - } - if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) { - CERROR("%s: magic %x is bad\n", - loghandle2name(llh), rec->lrh_type); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL(llog_verify_record); - static int llog_process_thread(void *arg) { struct llog_process_info *lpi = arg; struct llog_handle *loghandle = lpi->lpi_loghandle; struct llog_log_hdr *llh = loghandle->lgh_hdr; struct llog_process_cat_data *cd = lpi->lpi_catdata; - struct llog_thread_info *lti; char *buf; size_t chunk_size; __u64 cur_offset; @@ -484,15 +426,12 @@ static int llog_process_thread(void *arg) int saved_index = 0; int last_called_index = 0; bool repeated = false; - bool refresh_idx = false; ENTRY; if (llh == NULL) RETURN(-EINVAL); - lti = lpi->lpi_env == NULL ? 
NULL : llog_info(lpi->lpi_env); - cur_offset = chunk_size = llh->llh_hdr.lrh_len; /* expect chunk_size to be power of two */ LASSERT(is_power_of_2(chunk_size)); @@ -518,7 +457,6 @@ static int llog_process_thread(void *arg) unsigned int buf_offset = 0; bool partial_chunk; int lh_last_idx; - int synced_idx = 0; /* skip records not set in bitmap */ while (index <= last_index && @@ -536,8 +474,7 @@ static int llog_process_thread(void *arg) /* get the buf with our target record; avoid old garbage */ memset(buf, 0, chunk_size); /* the record index for outdated chunk data */ - /* it is safe to process buffer until saved lgh_last_idx */ - lh_last_idx = LLOG_HDR_TAIL(llh)->lrt_index; + lh_last_idx = loghandle->lgh_last_idx + 1; rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, index, &cur_offset, buf, chunk_size); if (repeated && rc) @@ -581,72 +518,60 @@ static int llog_process_thread(void *arg) CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", rec->lrh_type, rec->lrh_index); - if (index == (synced_idx + 1) && - synced_idx == LLOG_HDR_TAIL(llh)->lrt_index) - GOTO(out, rc = 0); - - if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && - cfs_fail_val == (unsigned int) - (loghandle->lgh_id.lgl_oi.oi.oi_id & - 0xFFFFFFFF)) { - OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); - } - /* the bitmap could be changed during processing * records from the chunk. For wrapped catalog * it means we can read deleted record and try to - * process it. Check this case and reread the chunk. - * It is safe to process to lh_last_idx, including - * lh_last_idx if it was synced. We can not do <= - * comparison, cause for wrapped catalog lgh_last_idx - * could be less than index. So we detect last index - * for processing as index == lh_last_idx+1. But when - * catalog is wrapped and full lgh_last_idx=llh_cat_idx, - * the first processing index is llh_cat_idx+1.The - * exception is !(lgh_last_idx == llh_cat_idx && - * index == llh_cat_idx + 1), and after simplification - * it turns to - * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index - * This exception is working for catalog only. - */ - - if ((index == lh_last_idx && synced_idx != index) || - (index == (lh_last_idx + 1) && - lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || - (rec->lrh_index == 0 && !repeated)) { - + * process it. Check this case and reread the chunk. */ + + /* for partial chunk the end of it is zeroed, check + * for index 0 to distinguish it. */ + if ((partial_chunk && rec->lrh_index == 0) || + (index == lh_last_idx && + lh_last_idx != (loghandle->lgh_last_idx + 1))) { + /* concurrent llog_add() might add new records + * while llog_processing, check this is not + * the case and re-read the current chunk + * otherwise. */ + int records; + /* lgh_last_idx could be less then index + * for catalog, if catalog is wrapped */ + if ((index > loghandle->lgh_last_idx && + !(loghandle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT)) || repeated || + (loghandle->lgh_obj != NULL && + dt_object_remote(loghandle->lgh_obj))) + GOTO(out, rc = 0); + /* <2 records means no more records + * if the last record we processed was + * the final one, then the underlying + * object might have been destroyed yet. + * we better don't access that.. 
*/ + mutex_lock(&loghandle->lgh_hdr_mutex); + records = loghandle->lgh_hdr->llh_count; + mutex_unlock(&loghandle->lgh_hdr_mutex); + if (records <= 1) + GOTO(out, rc = 0); + CDEBUG(D_OTHER, "Re-read last llog buffer for " + "new records, index %u, last %u\n", + index, loghandle->lgh_last_idx); /* save offset inside buffer for the re-read */ buf_offset = (char *)rec - (char *)buf; cur_offset = chunk_offset; repeated = true; - /* We need to be sure lgh_last_idx - * record was saved to disk - */ - down_read(&loghandle->lgh_last_sem); - synced_idx = LLOG_HDR_TAIL(llh)->lrt_index; - up_read(&loghandle->lgh_last_sem); - CDEBUG(D_OTHER, "synced_idx: %d\n", synced_idx); goto repeat; - } repeated = false; - rc = llog_verify_record(loghandle, rec); - if (rc) { - CERROR("%s: invalid record in llog "DFID - " record for index %d/%d: rc = %d\n", - loghandle2name(loghandle), + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CWARN("%s: invalid length %d in llog "DFID + "record for index %d/%d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rec->lrh_len, PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index, rc); - /* - * the block seem to be corrupted, let's try - * with the next one. reset rc to go to the - * next chunk. - */ - refresh_idx = true; - index = 0; - GOTO(repeat, rc = 0); + rec->lrh_index, index); + + GOTO(out, rc = -EINVAL); } if (rec->lrh_index < index) { @@ -656,22 +581,12 @@ static int llog_process_thread(void *arg) } if (rec->lrh_index != index) { - /* - * the last time we couldn't parse the block due - * to corruption, thus has no idea about the - * next index, take it from the block, once. - */ - if (refresh_idx) { - refresh_idx = false; - index = rec->lrh_index; - } else { - CERROR("%s: "DFID" Invalid record: index" - " %u but expected %u\n", - loghandle2name(loghandle), - PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - GOTO(out, rc = -ERANGE); - } + CERROR("%s: "DFID" Invalid record: index %u" + " but expected %u\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + GOTO(out, rc = -ERANGE); } CDEBUG(D_OTHER, @@ -679,44 +594,15 @@ static int llog_process_thread(void *arg) rec->lrh_index, rec->lrh_len, (int)(buf + chunk_size - (char *)rec)); - /* lgh_cur_offset is used only at llog_test_3 */ + loghandle->lgh_cur_idx = rec->lrh_index; loghandle->lgh_cur_offset = (char *)rec - (char *)buf + chunk_offset; /* if set, process the callback on this record */ if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { - struct llog_cookie *lgc; - __u64 tmp_off; - int tmp_idx; - - CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? 
- D_HA : D_OTHER), - "index: %d, lh_last_idx: %d " - "synced_idx: %d lgh_last_idx: %d\n", - index, lh_last_idx, synced_idx, - loghandle->lgh_last_idx); - - if (lti != NULL) { - lgc = <i->lgi_cookie; - /* store lu_env for recursive calls */ - tmp_off = lgc->lgc_offset; - tmp_idx = lgc->lgc_index; - - lgc->lgc_offset = (char *)rec - - (char *)buf + chunk_offset; - lgc->lgc_index = rec->lrh_index; - } - /* using lu_env for passing record offset to - * llog_write through various callbacks */ rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, lpi->lpi_cbdata); last_called_index = index; - - if (lti != NULL) { - lgc->lgc_offset = tmp_off; - lgc->lgc_index = tmp_idx; - } - if (rc == LLOG_PROC_BREAK) { GOTO(out, rc); } else if (rc == LLOG_DEL_RECORD) { @@ -741,11 +627,6 @@ static int llog_process_thread(void *arg) } out: - CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", - ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : "plain"), - POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, - index, llh->llh_count); - if (cd != NULL) cd->lpcd_last_idx = last_called_index; @@ -757,7 +638,7 @@ static int llog_process_thread(void *arg) * retry until the umount or abort recovery, see * lod_sub_recovery_thread() */ CERROR("%s retry remote llog process\n", - loghandle2name(loghandle)); + loghandle->lgh_ctxt->loc_obd->obd_name); rc = -EAGAIN; } else { /* something bad happened to the processing of a local @@ -766,7 +647,7 @@ static int llog_process_thread(void *arg) * discard any remaining bits in the header */ CERROR("%s: Local llog found corrupted #"DOSTID":%x" " %s index %d count %d\n", - loghandle2name(loghandle), + loghandle->lgh_ctxt->loc_obd->obd_name, POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, ((llh->llh_flags & LLOG_F_IS_CAT) ? 
"catalog" : @@ -806,8 +687,7 @@ static int llog_process_thread_daemonize(void *arg) * used outside of the kernel itself, because it calls * free_nsproxy() which is not exported by the kernel * (defined in kernel/nsproxy.c) */ - if (curr_ns) - atomic_dec(&curr_ns->count); + atomic_dec(&curr_ns->count); } task_unlock(lpi->lpi_reftask); @@ -862,7 +742,7 @@ int llog_process_or_fork(const struct lu_env *env, if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: cannot start thread: rc = %d\n", - loghandle2name(loghandle), rc); + loghandle->lgh_ctxt->loc_obd->obd_name, rc); GOTO(out_lpi, rc); } wait_for_completion(&lpi->lpi_completion); @@ -1099,11 +979,12 @@ int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, RETURN(-EPROTO); } else if (th == NULL) { CERROR("%s: missed transaction handle\n", - loghandle2name(handle)); + handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name); RETURN(-EPROTO); } else if (handle->lgh_hdr == NULL) { CERROR("%s: loghandle %p with no header\n", - loghandle2name(handle), handle); + handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name, + handle); RETURN(-EPROTO); } @@ -1192,9 +1073,6 @@ int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); - if (unlikely(unlikely(d->dd_rdonly))) - RETURN(-EROFS); - th = dt_trans_create(env, d); if (IS_ERR(th)) GOTO(out, rc = PTR_ERR(th)); @@ -1262,8 +1140,7 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, { struct dt_device *dt; struct thandle *th; - bool need_cookie; - int rc; + int rc; ENTRY; @@ -1273,9 +1150,6 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); - if (unlikely(unlikely(dt->dd_rdonly))) - RETURN(-EROFS); - th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -1289,21 +1163,8 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, if (rc) GOTO(out_trans, rc); - need_cookie = !(idx == LLOG_HEADER_IDX || idx == LLOG_NEXT_IDX); - down_write(&loghandle->lgh_lock); - if (need_cookie) { - struct llog_thread_info *lti = llog_info(env); - - /* cookie comes from llog_process_thread */ - rc = llog_write_rec(env, loghandle, rec, <i->lgi_cookie, - rec->lrh_index, th); - /* upper layer didn`t pass cookie so change rc */ - rc = (rc == 1 ? 
0 : rc); - } else { - rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); - } - + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); up_write(&loghandle->lgh_lock); out_trans: dt_trans_stop(env, dt, th); @@ -1350,7 +1211,20 @@ EXPORT_SYMBOL(llog_open); int llog_close(const struct lu_env *env, struct llog_handle *loghandle) { - return llog_handle_put(env, loghandle); + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + GOTO(out, rc); + if (lop->lop_close == NULL) + GOTO(out, rc = -EOPNOTSUPP); + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + RETURN(rc); } EXPORT_SYMBOL(llog_close); @@ -1474,9 +1348,8 @@ __u64 llog_size(const struct lu_env *env, struct llog_handle *llh) rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); if (rc) { - CERROR("%s: attr_get failed for "DFID": rc = %d\n", - loghandle2name(llh), PFID(&llh->lgh_id.lgl_oi.oi_fid), - rc); + CERROR("%s: attr_get failed, rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); return 0; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c index 91f029052585e..e85e08bbd10c6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -88,12 +88,13 @@ static int llog_cat_new_log(const struct lu_env *env, if (cathandle->lgh_name == NULL) { CWARN("%s: there are no more free slots in catalog " DFID":%x\n", - loghandle2name(loghandle), + loghandle->lgh_ctxt->loc_obd->obd_name, PFID(&cathandle->lgh_id.lgl_oi.oi_fid), cathandle->lgh_id.lgl_ogen); } else { CWARN("%s: there are no more free slots in " - "catalog %s\n", loghandle2name(loghandle), + "catalog %s\n", + loghandle->lgh_ctxt->loc_obd->obd_name, cathandle->lgh_name); } RETURN(-ENOSPC); @@ -152,7 +153,7 @@ static int llog_cat_new_log(const struct lu_env *env, GOTO(out, rc = 0); } else if (rc != 0) { CERROR("%s: can't create new plain llog in catalog: rc = %d\n", - loghandle2name(loghandle), rc); + loghandle->lgh_ctxt->loc_obd->obd_name, rc); GOTO(out, rc); } @@ -212,137 +213,11 @@ static int llog_cat_new_log(const struct lu_env *env, loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; /* this is to mimic full log, so another llog_cat_current_log() * can skip it and ask for another onet */ - loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(loghandle->lgh_hdr) + 1; + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) + 1; llog_trans_destroy(env, loghandle, th); - if (handle != NULL) - dt_trans_stop(env, dt, handle); RETURN(rc); } -static int llog_cat_refresh(const struct lu_env *env, - struct llog_handle *cathandle) -{ - struct llog_handle *loghandle; - int rc; - - down_write(&cathandle->lgh_lock); - list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, - u.phd.phd_entry) { - if (!llog_exist(loghandle)) - continue; - - rc = llog_read_header(env, loghandle, NULL); - if (rc) - goto unlock; - } - - rc = llog_read_header(env, cathandle, NULL); -unlock: - up_write(&loghandle->lgh_lock); - - return rc; -} - -/* - * prepare current/next log for catalog. 
- * - * if \a *ploghandle is NULL, open it, and declare create, NB, if \a - * *ploghandle is remote, create it synchronously here, see comments - * below. - * - * \a cathandle->lgh_lock is down_read-ed, it gets down_write-ed if \a - * *ploghandle has to be opened. - */ -static int llog_cat_prep_log(const struct lu_env *env, - struct llog_handle *cathandle, - struct llog_handle **ploghandle, - struct thandle *th) -{ - int rc; - int sem_upgraded; - -start: - rc = 0; - sem_upgraded = 0; - if (IS_ERR_OR_NULL(*ploghandle)) { - up_read(&cathandle->lgh_lock); - down_write(&cathandle->lgh_lock); - sem_upgraded = 1; - if (IS_ERR_OR_NULL(*ploghandle)) { - struct llog_handle *loghandle; - - rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, - NULL, NULL, LLOG_OPEN_NEW); - if (!rc) { - *ploghandle = loghandle; - list_add_tail(&loghandle->u.phd.phd_entry, - &cathandle->u.chd.chd_head); - } - } - if (rc) - GOTO(out, rc); - } - - rc = llog_exist(*ploghandle); - if (rc < 0) - GOTO(out, rc); - if (rc) - GOTO(out, rc = 0); - - if (dt_object_remote(cathandle->lgh_obj)) { - down_write_nested(&(*ploghandle)->lgh_lock, LLOGH_LOG); - if (!llog_exist(*ploghandle)) { - /* For remote operation, if we put the llog object - * creation in the current transaction, then the - * llog object will not be created on the remote - * target until the transaction stop, if other - * operations start before the transaction stop, - * and use the same llog object, will be dependent - * on the success of this transaction. So let's - * create the llog object synchronously here to - * remove the dependency. */ - rc = llog_cat_new_log(env, cathandle, *ploghandle, - NULL); - if (rc == -ESTALE) { - up_write(&(*ploghandle)->lgh_lock); - if (sem_upgraded) - up_write(&cathandle->lgh_lock); - else - up_read(&cathandle->lgh_lock); - - rc = llog_cat_refresh(env, cathandle); - down_read_nested(&cathandle->lgh_lock, - LLOGH_CAT); - if (rc) - return rc; - /* *ploghandle might become NULL, restart */ - goto start; - } - } - up_write(&(*ploghandle)->lgh_lock); - } else { - struct llog_thread_info *lgi = llog_info(env); - struct llog_logid_rec *lirec = &lgi->lgi_logid; - - rc = llog_declare_create(env, *ploghandle, th); - if (rc) - GOTO(out, rc); - - lirec->lid_hdr.lrh_len = sizeof(*lirec); - rc = llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, - th); - } - -out: - if (sem_upgraded) { - up_write(&cathandle->lgh_lock); - down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); - if (rc == 0) - goto start; - } - return rc; -} - /* Open an existent log handle and add it to the open list. * This log handle will be closed when all of the records in it are removed. 
* @@ -374,21 +249,14 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { if (cgl->lgl_ogen != logid->lgl_ogen) { CWARN("%s: log "DFID" generation %x != %x\n", - loghandle2name(loghandle), + loghandle->lgh_ctxt->loc_obd->obd_name, PFID(&logid->lgl_oi.oi_fid), cgl->lgl_ogen, logid->lgl_ogen); continue; } - *res = llog_handle_get(loghandle); - if (!*res) { - CERROR("%s: log "DFID" refcount is zero!\n", - loghandle2name(loghandle), - PFID(&logid->lgl_oi.oi_fid)); - continue; - } loghandle->u.phd.phd_cat_handle = cathandle; up_write(&cathandle->lgh_lock); - RETURN(rc); + GOTO(out, rc = 0); } } up_write(&cathandle->lgh_lock); @@ -397,20 +265,18 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, LLOG_OPEN_EXISTS); if (rc < 0) { CERROR("%s: error opening log id "DFID":%x: rc = %d\n", - loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid), - logid->lgl_ogen, rc); + cathandle->lgh_ctxt->loc_obd->obd_name, + PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen, rc); RETURN(rc); } rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | fmt, NULL); if (rc < 0) { llog_close(env, loghandle); - *res = NULL; + loghandle = NULL; RETURN(rc); } - *res = llog_handle_get(loghandle); - LASSERT(*res); down_write(&cathandle->lgh_lock); list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); up_write(&cathandle->lgh_lock); @@ -419,7 +285,11 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; loghandle->u.phd.phd_cookie.lgc_index = loghandle->lgh_hdr->llh_cat_idx; - RETURN(0); + EXIT; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; } int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) @@ -444,7 +314,8 @@ int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) if (rc) CERROR("%s: failure destroying log during " "cleanup: rc = %d\n", - loghandle2name(loghandle), rc); + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); index = loghandle->u.phd.phd_cookie.lgc_index; llog_cat_cleanup(env, cathandle, NULL, index); @@ -530,7 +401,7 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, * meet this situation. 
*/ if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { CERROR("%s: next log does not exist!\n", - loghandle2name(cathandle)); + cathandle->lgh_ctxt->loc_obd->obd_name); loghandle = ERR_PTR(-EIO); if (cathandle->u.chd.chd_next_log == NULL) { /* Store the error in chd_next_log, so @@ -554,6 +425,40 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, RETURN(loghandle); } +static int llog_cat_update_header(const struct lu_env *env, + struct llog_handle *cathandle) +{ + struct llog_handle *loghandle; + int rc; + ENTRY; + + /* refresh llog */ + down_write(&cathandle->lgh_lock); + if (!cathandle->lgh_stale) { + up_write(&cathandle->lgh_lock); + RETURN(0); + } + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + if (!llog_exist(loghandle)) + continue; + + rc = llog_read_header(env, loghandle, NULL); + if (rc != 0) { + up_write(&cathandle->lgh_lock); + GOTO(out, rc); + } + } + rc = llog_read_header(env, cathandle, NULL); + if (rc == 0) + cathandle->lgh_stale = 0; + up_write(&cathandle->lgh_lock); + if (rc != 0) + GOTO(out, rc); +out: + RETURN(rc); +} + /* Add a single record to the recovery log(s) using a catalog * Returns as llog_write_record * @@ -607,7 +512,7 @@ int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, if (retried++ == 0) GOTO(retry, rc); CERROR("%s: error on 2nd llog: rc = %d\n", - loghandle2name(cathandle), rc); + cathandle->lgh_ctxt->loc_obd->obd_name, rc); } RETURN(rc); @@ -618,43 +523,167 @@ int llog_cat_declare_add_rec(const struct lu_env *env, struct llog_handle *cathandle, struct llog_rec_hdr *rec, struct thandle *th) { - int rc; + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + struct llog_handle *loghandle, *next; + int rc = 0; ENTRY; -start: - down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); - rc = llog_cat_prep_log(env, cathandle, - &cathandle->u.chd.chd_current_log, th); + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } else if (cathandle->u.chd.chd_next_log == NULL || + IS_ERR(cathandle->u.chd.chd_next_log)) { + /* declare next plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_next_log == NULL || + IS_ERR(cathandle->u.chd.chd_next_log)) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_next_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } if (rc) - GOTO(unlock, rc); + GOTO(out, rc); - rc = llog_cat_prep_log(env, cathandle, &cathandle->u.chd.chd_next_log, - th); - if (rc) - GOTO(unlock, rc); + lirec->lid_hdr.lrh_len = sizeof(*lirec); + + if (!llog_exist(cathandle->u.chd.chd_current_log)) { + if (dt_object_remote(cathandle->lgh_obj)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this 
transaction. So let's + * create the llog object synchronously here to + * remove the dependency. */ +create_again: + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + if (cathandle->lgh_stale) { + up_write(&loghandle->lgh_lock); + up_read(&cathandle->lgh_lock); + GOTO(out, rc = -EIO); + } + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, + NULL); + if (rc == -ESTALE) + cathandle->lgh_stale = 1; + } + up_write(&loghandle->lgh_lock); + up_read(&cathandle->lgh_lock); + if (rc == -ESTALE) { + rc = llog_cat_update_header(env, cathandle); + if (rc != 0) + GOTO(out, rc); + goto create_again; + } else if (rc < 0) { + GOTO(out, rc); + } + } else { + rc = llog_declare_create(env, + cathandle->u.chd.chd_current_log, th); + if (rc) + GOTO(out, rc); + llog_declare_write_rec(env, cathandle, + &lirec->lid_hdr, -1, th); + } + } +write_again: + /* declare records in the llogs */ rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, rec, -1, th); - if (rc == -ESTALE && dt_object_remote(cathandle->lgh_obj)) { - up_read(&cathandle->lgh_lock); - rc = llog_cat_refresh(env, cathandle); - if (rc) - RETURN(rc); - goto start; + if (rc == -ESTALE) { + down_write(&cathandle->lgh_lock); + if (cathandle->lgh_stale) { + up_write(&cathandle->lgh_lock); + GOTO(out, rc = -EIO); + } + + cathandle->lgh_stale = 1; + up_write(&cathandle->lgh_lock); + rc = llog_cat_update_header(env, cathandle); + if (rc != 0) + GOTO(out, rc); + goto write_again; + } else if (rc < 0) { + GOTO(out, rc); } -#if 0 - /* - * XXX: we hope for declarations made for existing llog this might be - * not correct with some backends where declarations are expected - * against specific object like ZFS with full debugging enabled. - */ - rc = llog_declare_write_rec(env, cathandle->u.chd.chd_next_log, rec, -1, - th); -#endif -unlock: - up_read(&cathandle->lgh_lock); + next = cathandle->u.chd.chd_next_log; + if (!IS_ERR_OR_NULL(next)) { + if (!llog_exist(next)) { + if (dt_object_remote(cathandle->lgh_obj)) { + /* For remote operation, if we put the llog + * object creation in the current transaction, + * then the llog object will not be created on + * the remote target until the transaction stop, + * if other operations start before the + * transaction stop, and use the same llog + * object, will be dependent on the success of + * this transaction. So let's create the llog + * object synchronously here to remove the + * dependency. 
*/ + down_write_nested(&cathandle->lgh_lock, + LLOGH_CAT); + next = cathandle->u.chd.chd_next_log; + if (IS_ERR_OR_NULL(next)) { + /* Sigh, another thread just tried, + * let's fail as well */ + up_write(&cathandle->lgh_lock); + if (next == NULL) + rc = -EIO; + else + rc = PTR_ERR(next); + GOTO(out, rc); + } + + down_write_nested(&next->lgh_lock, LLOGH_LOG); + if (!llog_exist(next)) { + rc = llog_cat_new_log(env, cathandle, + next, NULL); + if (rc < 0) + cathandle->u.chd.chd_next_log = + ERR_PTR(rc); + } + up_write(&next->lgh_lock); + up_write(&cathandle->lgh_lock); + if (rc < 0) + GOTO(out, rc); + } else { + rc = llog_declare_create(env, next, th); + llog_declare_write_rec(env, cathandle, + &lirec->lid_hdr, -1, th); + } + } + /* XXX: we hope for declarations made for existing llog + * this might be not correct with some backends + * where declarations are expected against specific + * object like ZFS with full debugging enabled */ + /*llog_declare_write_rec(env, next, rec, -1, th);*/ + } +out: RETURN(rc); } EXPORT_SYMBOL(llog_cat_declare_add_rec); @@ -717,7 +746,8 @@ int llog_cat_cancel_records(const struct lu_env *env, rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); if (rc) { CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x" - ": rc = %d\n", loghandle2name(cathandle), + ": rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); failed++; continue; @@ -732,7 +762,8 @@ int llog_cat_cancel_records(const struct lu_env *env, */ lrc = -ENOENT; CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" - ": rc = %d\n", loghandle2name(cathandle), + ": rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, lrc); failed++; if (rc == 0) @@ -755,86 +786,68 @@ int llog_cat_cancel_records(const struct lu_env *env, if (rc == 0) rc = lrc; } - llog_handle_put(env, loghandle); + llog_handle_put(loghandle); } if (rc) CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", - loghandle2name(cathandle), failed, count, rc); + cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, + rc); RETURN(rc); } EXPORT_SYMBOL(llog_cat_cancel_records); -static int llog_cat_process_common(const struct lu_env *env, - struct llog_handle *cat_llh, - struct llog_rec_hdr *rec, - struct llog_handle **llhp) +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = container_of(rec, typeof(*lir), lid_hdr); + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; struct llog_log_hdr *hdr; int rc; ENTRY; - if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) { - rc = -EINVAL; - CWARN("%s: invalid record in catalog "DFID":%x: rc = %d\n", - loghandle2name(cat_llh), - PFID(&cat_llh->lgh_id.lgl_oi.oi_fid), - cat_llh->lgh_id.lgl_ogen, rc); - RETURN(rc); + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog "DFID"\n", - PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - le32_to_cpu(rec->lrh_index), - PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " + DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id); + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); if (rc) 
{ - /* After a server crash, a stub of index record in catlog could - * be kept, because plain log destroy + catlog index record - * deletion are not atomic. So we end up with an index but no - * actual record. Destroy the index and move on. */ - if (rc == -ENOENT || rc == -ESTALE) - rc = LLOG_DEL_RECORD; - else if (rc) - CWARN("%s: can't find llog handle "DFID":%x: rc = %d\n", - loghandle2name(cat_llh), - PFID(&lir->lid_id.lgl_oi.oi_fid), - lir->lid_id.lgl_ogen, rc); + CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* After a server crash, a stub of index + * record in catlog could be kept, because + * plain log destroy + catlog index record + * deletion are not atomic. So we end up with + * an index but no actual record. Destroy the + * index and move on. */ + rc = llog_cat_cleanup(env, cat_llh, NULL, + rec->lrh_index); + } RETURN(rc); } /* clean old empty llogs, do not consider current llog in use */ - /* ignore remote (lgh_obj == NULL) llogs */ - hdr = (*llhp)->lgh_hdr; + /* ignore remote (lgh_obj=NULL) llogs */ + hdr = llh->lgh_hdr; if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && hdr->llh_count == 1 && cat_llh->lgh_obj != NULL && - *llhp != cat_llh->u.chd.chd_current_log) { - rc = llog_destroy(env, *llhp); + llh != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, llh); if (rc) - CWARN("%s: can't destroy empty log "DFID": rc = %d\n", - loghandle2name((*llhp)), - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - rc = LLOG_DEL_PLAIN; + CERROR("%s: fail to destroy empty log: rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + GOTO(out, rc = LLOG_DEL_PLAIN); } - RETURN(rc); -} - -static int llog_cat_process_cb(const struct lu_env *env, - struct llog_handle *cat_llh, - struct llog_rec_hdr *rec, void *data) -{ - struct llog_process_data *d = data; - struct llog_handle *llh = NULL; - int rc; - - ENTRY; - rc = llog_cat_process_common(env, cat_llh, rec, &llh); - if (rc) - GOTO(out, rc); - if (rec->lrh_index < d->lpd_startcat) { /* Skip processing of the logs until startcat */ rc = 0; @@ -851,29 +864,13 @@ static int llog_cat_process_cb(const struct lu_env *env, rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, NULL, false); } - if (rc == -ENOENT && (cat_llh->lgh_hdr->llh_flags & LLOG_F_RM_ON_ERR)) { - /* - * plain llog is reported corrupted, so better to just remove - * it if the caller is fine with that. - */ - CERROR("%s: remove corrupted/missing llog "DFID"\n", - loghandle2name(cat_llh), - PFID(&llh->lgh_id.lgl_oi.oi_fid)); - rc = LLOG_DEL_PLAIN; - } out: /* The empty plain log was destroyed while processing */ - if (rc == LLOG_DEL_PLAIN) { + if (rc == LLOG_DEL_PLAIN) rc = llog_cat_cleanup(env, cat_llh, llh, llh->u.phd.phd_cookie.lgc_index); - } else if (rc == LLOG_DEL_RECORD) { - /* clear wrong catalog entry */ - rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); - } - - if (llh) - llog_handle_put(env, llh); + llog_handle_put(llh); RETURN(rc); } @@ -883,62 +880,43 @@ int llog_cat_process_or_fork(const struct lu_env *env, llog_cb_t cb, void *data, int startcat, int startidx, bool fork) { - struct llog_process_data d; - struct llog_log_hdr *llh = cat_llh->lgh_hdr; - int rc; - - ENTRY; + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; - LASSERT(llh->llh_flags & LLOG_F_IS_CAT); - d.lpd_data = data; - d.lpd_cb = cb; - d.lpd_startcat = (startcat == LLOG_CAT_FIRST ? 
0 : startcat); - d.lpd_startidx = startidx; + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && llh->llh_count > 1) { struct llog_process_cat_data cd; CWARN("%s: catlog "DFID" crosses index zero\n", - loghandle2name(cat_llh), + cat_llh->lgh_ctxt->loc_obd->obd_name, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - /*startcat = 0 is default value for general processing */ - if ((startcat != LLOG_CAT_FIRST && - startcat >= llh->llh_cat_idx) || !startcat) { - /* processing the catalog part at the end */ - cd.lpcd_first_idx = (startcat ? startcat : - llh->llh_cat_idx); - if (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS)) - cd.lpcd_last_idx = cfs_fail_val; - else - cd.lpcd_last_idx = 0; - rc = llog_process_or_fork(env, cat_llh, cat_cb, - &d, &cd, fork); - /* Reset the startcat becasue it has already reached - * catalog bottom. - */ - startcat = 0; - if (rc != 0) - RETURN(rc); - } - /* processing the catalog part at the begining */ - cd.lpcd_first_idx = (startcat == LLOG_CAT_FIRST) ? 0 : startcat; - /* Note, the processing will stop at the lgh_last_idx value, - * and it could be increased during processing. So records - * between current lgh_last_idx and lgh_last_idx in future - * would left unprocessed. - */ + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = 0; cd.lpcd_last_idx = cat_llh->lgh_last_idx; rc = llog_process_or_fork(env, cat_llh, cat_cb, &d, &cd, fork); - } else { + } else { rc = llog_process_or_fork(env, cat_llh, cat_cb, &d, NULL, fork); - } + } - RETURN(rc); + RETURN(rc); } -EXPORT_SYMBOL(llog_cat_process_or_fork); int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data, int startcat, int startidx) @@ -953,33 +931,39 @@ static int llog_cat_size_cb(const struct lu_env *env, struct llog_rec_hdr *rec, void *data) { struct llog_process_data *d = data; - struct llog_handle *llh = NULL; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; __u64 *cum_size = d->lpd_data; __u64 size; - int rc; ENTRY; - rc = llog_cat_process_common(env, cat_llh, rec, &llh); - - if (rc == LLOG_DEL_PLAIN) { - /* empty log was deleted, don't count it */ - rc = llog_cat_cleanup(env, cat_llh, llh, - llh->u.phd.phd_cookie.lgc_index); - } else if (rc == LLOG_DEL_RECORD) { - /* clear wrong catalog entry */ - rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); - } else { - size = llog_size(env, llh); - *cum_size += size; + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("%s: invalid record in catalog, rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, -EINVAL); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " + DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - CDEBUG(D_INFO, "Add llog entry "DFID" size=%llu, tot=%llu\n", - PFID(&llh->lgh_id.lgl_oi.oi_fid), size, *cum_size); + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CWARN("%s: cannot find handle for llog "DFID": rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + RETURN(0); } + size = llog_size(env, llh); + *cum_size += size; + + CDEBUG(D_INFO, "Add llog entry "DFID" size %llu\n", + PFID(&llh->lgh_id.lgl_oi.oi_fid), size); - if 
(llh != NULL) - llog_handle_put(env, llh); + llog_handle_put(llh); RETURN(0); + } __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) @@ -993,58 +977,65 @@ __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) } EXPORT_SYMBOL(llog_cat_size); -/* currently returns the number of "free" entries in catalog, - * ie the available entries for a new plain LLOG file creation, - * even if catalog has wrapped - */ -__u32 llog_cat_free_space(struct llog_handle *cat_llh) -{ - /* simulate almost full Catalog */ - if (OBD_FAIL_CHECK(OBD_FAIL_CAT_FREE_RECORDS)) - return cfs_fail_val; - - if (cat_llh->lgh_hdr->llh_count == 1) - return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1; - - if (cat_llh->lgh_last_idx > cat_llh->lgh_hdr->llh_cat_idx) - return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1 + - cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; - - /* catalog is presently wrapped */ - return cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; -} -EXPORT_SYMBOL(llog_cat_free_space); - static int llog_cat_reverse_process_cb(const struct lu_env *env, struct llog_handle *cat_llh, struct llog_rec_hdr *rec, void *data) { struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; struct llog_handle *llh; + struct llog_log_hdr *hdr; int rc; - ENTRY; - rc = llog_cat_process_common(env, cat_llh, rec, &llh); - - /* The empty plain log was destroyed while processing */ - if (rc == LLOG_DEL_PLAIN) { - rc = llog_cat_cleanup(env, cat_llh, llh, - llh->u.phd.phd_cookie.lgc_index); - } else if (rc == LLOG_DEL_RECORD) { - /* clear wrong catalog entry */ - rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); } - if (rc) + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " + DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* After a server crash, a stub of index + * record in catlog could be kept, because + * plain log destroy + catlog index record + * deletion are not atomic. So we end up with + * an index but no actual record. Destroy the + * index and move on. 
*/ + rc = llog_cat_cleanup(env, cat_llh, NULL, + rec->lrh_index); + } + RETURN(rc); + } + + /* clean old empty llogs, do not consider current llog in use */ + hdr = llh->lgh_hdr; + if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + hdr->llh_count == 1 && + llh != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, llh); + if (rc) + CERROR("%s: fail to destroy empty log: rc = %d\n", + llh->lgh_ctxt->loc_obd->obd_name, rc); + GOTO(out, rc = LLOG_DEL_PLAIN); + } rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); +out: /* The empty plain was destroyed while processing */ if (rc == LLOG_DEL_PLAIN) rc = llog_cat_cleanup(env, cat_llh, llh, llh->u.phd.phd_cookie.lgc_index); - llog_handle_put(env, llh); + llog_handle_put(llh); RETURN(rc); } @@ -1065,7 +1056,7 @@ int llog_cat_reverse_process(const struct lu_env *env, if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && llh->llh_count > 1) { CWARN("%s: catalog "DFID" crosses index zero\n", - loghandle2name(cat_llh), + cat_llh->lgh_ctxt->loc_obd->obd_name, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); cd.lpcd_first_idx = 0; @@ -1123,7 +1114,7 @@ static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) } } - CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", + CDEBUG(D_RPCTRACE, "catlog "DFID" first idx %u, last_idx %u\n", PFID(&cathandle->lgh_id.lgl_oi.oi_fid), llh->llh_cat_idx, cathandle->lgh_last_idx); } @@ -1136,13 +1127,11 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle *loghandle, int index) { int rc; - struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; LASSERT(index); if (loghandle != NULL) { /* remove destroyed llog from catalog list and * chd_current_log variable */ - fid = loghandle->lgh_id.lgl_oi.oi_fid; down_write(&cathandle->lgh_lock); if (cathandle->u.chd.chd_current_log == loghandle) cathandle->u.chd.chd_current_log = NULL; @@ -1161,9 +1150,7 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, llog_cat_set_first_idx(cathandle, index); rc = llog_cancel_rec(env, cathandle, index); if (rc == 0) - CDEBUG(D_HA, - "cancel plain log "DFID" at index %u of catalog "DFID"\n", - PFID(&fid), index, - PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, "cancel plain log at index %u of catalog "DFID"\n", + index, PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h index c42f13ea6824f..eb9526ad504d0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,8 +74,8 @@ static inline struct llog_thread_info *llog_info(const struct lu_env *env) int llog_info_init(void); void llog_info_fini(void); -struct llog_handle *llog_handle_get(struct llog_handle *loghandle); -int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle); +void llog_handle_get(struct llog_handle *loghandle); +void llog_handle_put(struct llog_handle *loghandle); int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle **res, struct llog_logid *logid); int class_config_dump_handler(const struct lu_env *env, @@ -92,9 +92,4 @@ static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) { return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); } -int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec); -static inline char *loghandle2name(const struct llog_handle *lgh) -{ - return lgh->lgh_ctxt->loc_obd->obd_name; -} #endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c index 276ffa8280c84..906e6e64ef4e6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,16 +33,14 @@ #define DEBUG_SUBSYSTEM S_LOG #include -#include +#include #include #include "llog_internal.h" static int str2logid(struct llog_logid *logid, char *str, int len) { - unsigned long long id, seq; - char *start, *end; - u32 ogen; - int rc; + char *start, *end, *endp; + __u64 id, seq; ENTRY; start = str; @@ -58,12 +56,10 @@ static int str2logid(struct llog_logid *logid, char *str, int len) } #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) - /* - * logids used to be input in the form "#id#seq:ogen" before they + /* logids used to be input in the form "#id#seq:ogen" before they * were changed over to accept the FID [seq:oid:ver] format. * This is accepted for compatibility reasons, though I doubt - * anyone is actually using this for anything. - */ + * anyone is actually using this for anything. 
*/ if (start[0] != '#') RETURN(-EINVAL); @@ -75,37 +71,34 @@ static int str2logid(struct llog_logid *logid, char *str, int len) RETURN(-EINVAL); *end = '\0'; - rc = kstrtoull(start, 0, &id); - if (rc) - RETURN(rc); - - start = ++end; - if (start - str >= len - 1) - RETURN(-EINVAL); + id = simple_strtoull(start, &endp, 0); + if (endp != end) + RETURN(-EINVAL); - end = strchr(start, '#'); - if (!end || end == start) - RETURN(-EINVAL); + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); - *end = '\0'; - rc = kstrtoull(start, 0, &seq); - if (rc) - RETURN(rc); + *end = '\0'; + seq = simple_strtoull(start, &endp, 0); + if (endp != end) + RETURN(-EINVAL); ostid_set_seq(&logid->lgl_oi, seq); if (ostid_set_id(&logid->lgl_oi, id)) RETURN(-EINVAL); start = ++end; - if (start - str >= len - 1) - RETURN(-EINVAL); - - rc = kstrtouint(start, 16, &ogen); - if (rc) + if (start - str >= len - 1) + RETURN(-EINVAL); + logid->lgl_ogen = simple_strtoul(start, &endp, 16); + if (*endp != '\0') RETURN(-EINVAL); - logid->lgl_ogen = ogen; - RETURN(0); + RETURN(0); #else RETURN(-EINVAL); #endif @@ -114,31 +107,29 @@ static int str2logid(struct llog_logid *logid, char *str, int len) static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct obd_ioctl_data *ioc_data = data; + struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; static int l, remains; static long from, to; - static char *out; - int cur_index; - int rc = 0; - - ENTRY; - if (ioc_data && ioc_data->ioc_inllen1 > 0) { - l = 0; - remains = ioc_data->ioc_inllen4 + - round_up(ioc_data->ioc_inllen1, 8) + - round_up(ioc_data->ioc_inllen2, 8) + - round_up(ioc_data->ioc_inllen3, 8); + static char *out; + char *endp; + int cur_index, rc = 0; - rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); - if (rc) - RETURN(rc); + ENTRY; - rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); - if (rc) - RETURN(rc); - - ioc_data->ioc_inllen1 = 0; - out = ioc_data->ioc_bulk; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + cfs_size_round(ioc_data->ioc_inllen1) + + cfs_size_round(ioc_data->ioc_inllen2) + + cfs_size_round(ioc_data->ioc_inllen3); + from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; } cur_index = rec->lrh_index; @@ -148,17 +139,17 @@ static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, RETURN(-LLOG_EEMPTY); if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *loghandle; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - l = snprintf(out, remains, - "[index]: %05d [type]: %02x [len]: %04d failed\n", - cur_index, rec->lrh_type, - rec->lrh_len); - } - if (handle->lgh_ctxt == NULL) - RETURN(-EOPNOTSUPP); + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); if (rc) { CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", @@ -167,16 +158,16 @@ static 
int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, RETURN(rc); } rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); - llog_handle_put(env, loghandle); + llog_handle_put(loghandle); } else { bool ok; - switch (rec->lrh_type) { - case OST_SZ_REC: - case MDS_UNLINK_REC: + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: case MDS_UNLINK64_REC: - case MDS_SETATTR64_REC: - case OBD_CFG_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: case LLOG_GEN_REC: case LLOG_HDR_MAGIC: ok = true; @@ -203,46 +194,43 @@ static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct obd_ioctl_data *ioc_data = data; + struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; static int l, remains; static long from, to; - static char *out; - int cur_index; - int rc; - - ENTRY; - if (ioc_data && ioc_data->ioc_inllen1 > 0) { - l = 0; - remains = ioc_data->ioc_inllen4 + - round_up(ioc_data->ioc_inllen1, 8) + - round_up(ioc_data->ioc_inllen2, 8) + - round_up(ioc_data->ioc_inllen3, 8); - - rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); - if (rc) - RETURN(rc); - - rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); - if (rc) - RETURN(rc); - - out = ioc_data->ioc_bulk; - ioc_data->ioc_inllen1 = 0; - } - - cur_index = rec->lrh_index; - if (cur_index < from) - RETURN(0); - if (to > 0 && cur_index > to) - RETURN(-LLOG_EEMPTY); - - if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); - } + static char *out; + char *endp; + int cur_index; + + ENTRY; + if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + cfs_size_round(ioc_data->ioc_inllen1) + + cfs_size_round(ioc_data->ioc_inllen2) + + cfs_size_round(ioc_data->ioc_inllen3); + from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } l = snprintf(out, remains, "[index]: %05d [logid]: "DFID":%x\n", @@ -259,21 +247,21 @@ static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, l = snprintf(out, remains, "[index]: %05d [type]: %02x [len]: %04d\n", cur_index, rec->lrh_type, rec->lrh_len); - } - out += l; - remains -= l; - if (remains <= 0) { - CERROR("not enough space for print log records\n"); - RETURN(-LLOG_EEMPTY); - } - - RETURN(0); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); } static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, struct llog_logid *logid) { - struct llog_handle *log; - int rc; + struct llog_handle *log; + int rc; ENTRY; @@ -292,7 +280,7 @@ static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, } llog_cat_cleanup(env, cat, log, 
log->u.phd.phd_cookie.lgc_index); out: - llog_handle_put(env, log); + llog_handle_put(log); RETURN(rc); } @@ -300,8 +288,8 @@ static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - int rc; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; ENTRY; if (rec->lrh_type != LLOG_LOGID_MAGIC) @@ -315,16 +303,15 @@ static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, struct obd_ioctl_data *data) { - struct llog_logid logid; - int rc = 0; - struct llog_handle *handle = NULL; - char *logname, start; + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname; ENTRY; logname = data->ioc_inlbuf1; - start = logname[0]; - if (start == '#' || start == '[') { + if (logname[0] == '#' || logname[0] == '[') { rc = str2logid(&logid, logname, data->ioc_inllen1); if (rc) RETURN(rc); @@ -332,8 +319,8 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, LLOG_OPEN_EXISTS); if (rc) RETURN(rc); - } else if (start == '$' || isalpha(start) || isdigit(start)) { - if (start == '$') + } else if (logname[0] == '$' || isalpha(logname[0])) { + if (logname[0] == '$') logname++; rc = llog_open(env, ctxt, &handle, NULL, logname, @@ -341,10 +328,7 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, if (rc) RETURN(rc); } else { - rc = -EINVAL; - CDEBUG(D_INFO, "%s: invalid log name '%s': rc = %d\n", - ctxt->loc_obd->obd_name, logname, rc); - RETURN(rc); + RETURN(-EINVAL); } rc = llog_init_handle(env, handle, 0, NULL); @@ -353,10 +337,10 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, switch (cmd) { case OBD_IOC_LLOG_INFO: { - int l; - int remains = data->ioc_inllen2 + + int l; + int remains = data->ioc_inllen2 + cfs_size_round(data->ioc_inllen1); - char *out = data->ioc_bulk; + char *out = data->ioc_bulk; l = snprintf(out, remains, "logid: "DFID":%x\n" @@ -398,12 +382,11 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, case OBD_IOC_LLOG_CANCEL: { struct llog_cookie cookie; struct llog_logid plain; - u32 lgc_index; + char *endp; - rc = kstrtouint(data->ioc_inlbuf3, 0, &lgc_index); - if (rc) - GOTO(out_close, rc); - cookie.lgc_index = lgc_index; + cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + GOTO(out_close, rc = -EINVAL); if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { rc = llog_cancel_rec(env, handle, cookie.lgc_index); @@ -470,11 +453,11 @@ int llog_catalog_list(const struct lu_env *env, struct dt_device *d, int count, struct obd_ioctl_data *data, const struct lu_fid *fid) { - int size, i; - struct llog_catid *idarray; - struct llog_logid *id; - char *out; - int l, remains, rc = 0; + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; ENTRY; @@ -497,28 +480,15 @@ int llog_catalog_list(const struct lu_env *env, struct dt_device *d, out = data->ioc_bulk; remains = data->ioc_inllen1; - /* OBD_FAIL: fetch the catalog records from the specified one */ - if (OBD_FAIL_CHECK(OBD_FAIL_CATLIST)) - data->ioc_count = cfs_fail_val - 1; - for (i = data->ioc_count; i < count; i++) { + for (i = 0; i < count; i++) { id = &idarray[i].lci_logid; l = snprintf(out, remains, "catalog_log: 
"DFID":%x\n", - PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); out += l; remains -= l; - if (remains <= 0) { - if (remains < 0) { - /* the print is not complete */ - remains += l; - data->ioc_bulk[out - data->ioc_bulk - l] = '\0'; - data->ioc_count = i; - } else { - data->ioc_count = i++; - } - goto out; - } + if (remains <= 0) + break; } - data->ioc_count = 0; out: OBD_FREE_LARGE(idarray, size); RETURN(rc); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c index 1d1f953992301..a5cdc6e184185 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -40,36 +40,36 @@ /* helper functions for calling the llog obd methods */ static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) { - struct llog_ctxt *ctxt; + struct llog_ctxt *ctxt; - OBD_ALLOC_PTR(ctxt); - if (!ctxt) - return NULL; + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; - ctxt->loc_obd = obd; + ctxt->loc_obd = obd; atomic_set(&ctxt->loc_refcount, 1); - return ctxt; + return ctxt; } static void llog_ctxt_destroy(struct llog_ctxt *ctxt) { - if (ctxt->loc_exp) { - class_export_put(ctxt->loc_exp); - ctxt->loc_exp = NULL; - } - if (ctxt->loc_imp) { - class_import_put(ctxt->loc_imp); - ctxt->loc_imp = NULL; - } - OBD_FREE_PTR(ctxt); + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); } int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct obd_llog_group *olg = ctxt->loc_olg; - struct obd_device *obd; - int rc = 0; + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; spin_lock(&olg->olg_lock); if (!atomic_dec_and_test(&ctxt->loc_refcount)) { @@ -84,18 +84,16 @@ int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) /* sync with llog ctxt user thread */ spin_unlock(&obd->obd_dev_lock); - /* - * obd->obd_starting is needed for the case of cleanup - * in error case while obd is starting up. - */ - LASSERTF(obd->obd_starting == 1 || - obd->obd_stopping == 1 || obd->obd_set_up == 0, - "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, - !!obd->obd_stopping, !!obd->obd_set_up); + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
*/ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); - /* cleanup the llog ctxt here */ - if (ctxt->loc_logops->lop_cleanup) - rc = ctxt->loc_logops->lop_cleanup(env, ctxt); + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); llog_ctxt_destroy(ctxt); wake_up(&olg->olg_waitq); @@ -105,40 +103,39 @@ EXPORT_SYMBOL(__llog_ctxt_put); int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); - struct obd_llog_group *olg; - int rc, idx; - - ENTRY; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + ENTRY; - LASSERT(ctxt != NULL); - LASSERT(ctxt != LP_POISON); + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); - olg = ctxt->loc_olg; - LASSERT(olg != NULL); - LASSERT(olg != LP_POISON); + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); - idx = ctxt->loc_idx; + idx = ctxt->loc_idx; /* - * Banlance the ctxt get when calling llog_cleanup() - */ + * Banlance the ctxt get when calling llog_cleanup() + */ LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); LASSERT(atomic_read(&ctxt->loc_refcount) > 1); - llog_ctxt_put(ctxt); + llog_ctxt_put(ctxt); /* * Try to free the ctxt. */ rc = __llog_ctxt_put(env, ctxt); - if (rc) - CERROR("Error %d while cleaning up ctxt %p\n", - rc, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); - l_wait_event(olg->olg_waitq, - llog_group_ctxt_null(olg, idx), &lwi); + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_cleanup); @@ -146,24 +143,23 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, struct obd_llog_group *olg, int index, struct obd_device *disk_obd, struct llog_operations *op) { - struct llog_ctxt *ctxt; - int rc = 0; - - ENTRY; + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; - if (index < 0 || index >= LLOG_MAX_CTXTS) - RETURN(-EINVAL); + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); - LASSERT(olg != NULL); + LASSERT(olg != NULL); - ctxt = llog_new_ctxt(obd); - if (!ctxt) - RETURN(-ENOMEM); + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); - ctxt->loc_obd = obd; - ctxt->loc_olg = olg; - ctxt->loc_idx = index; - ctxt->loc_logops = op; + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; mutex_init(&ctxt->loc_mutex); if (disk_obd != NULL) ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); @@ -173,11 +169,11 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; - rc = llog_group_set_ctxt(olg, ctxt, index); - if (rc) { - llog_ctxt_destroy(ctxt); - if (rc == -EEXIST) { - ctxt = llog_group_get_ctxt(olg, index); + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); if (ctxt) { CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", obd->obd_name, index); @@ -192,10 +188,10 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, LASSERT(ctxt->loc_logops == op); llog_ctxt_put(ctxt); } - rc = 0; - } - RETURN(rc); - } + rc = 0; + } + RETURN(rc); + } if (op->lop_setup) { if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) @@ -209,28 +205,48 @@ 
int llog_setup(const struct lu_env *env, struct obd_device *obd, obd->obd_name, index, op->lop_setup, rc); llog_group_clear_ctxt(olg, index); llog_ctxt_destroy(ctxt); - } else { - CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", - obd->obd_name, index); - ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; - } + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_setup); int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) { - int rc = 0; + int rc = 0; + ENTRY; - ENTRY; - if (ctxt && ctxt->loc_logops->lop_sync) - rc = ctxt->loc_logops->lop_sync(ctxt, exp, flags); + if (!ctxt) + RETURN(0); - RETURN(rc); + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + RETURN(rc); } EXPORT_SYMBOL(llog_sync); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_cookie *cookies, int flags) +{ + int rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, ctxt, cookies, flags); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cancel); + /* context key constructor/destructor: llog_key_init, llog_key_fini */ LU_KEY_INIT_FINI(llog, struct llog_thread_info); /* context key: llog_thread_key */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c index 55088d417146d..ffa1ad0149b25 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,8 +44,6 @@ #define DEBUG_SUBSYSTEM S_LOG -#include - #include #include #include @@ -126,7 +124,8 @@ static int llog_osd_create_new_object(const struct lu_env *env, static int llog_osd_exist(struct llog_handle *handle) { LASSERT(handle->lgh_obj); - return dt_object_exists(handle->lgh_obj) && !handle->lgh_destroyed; + return dt_object_exists(handle->lgh_obj) && + !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header); } static void *rec_tail(struct llog_rec_hdr *rec) @@ -363,7 +362,7 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, * the full llog record to write. This is * the beginning of buffer to write, the length * of buffer is stored in \a rec::lrh_len - * \param[in,out] reccookie pointer to the cookie to return back if needed. + * \param[out] reccookie pointer to the cookie to return back if needed. * It is used for further cancel of this llog * record. * \param[in] idx index of the llog record. If \a idx == -1 then @@ -491,26 +490,26 @@ static int llog_osd_write_rec(const struct lu_env *env, &lgi->lgi_off, th); RETURN(rc); - } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { - lgi->lgi_off = llh->llh_hdr.lrh_len + - (idx - 1) * reclen; - } else if (reccookie != NULL && reccookie->lgc_index > 0) { + } else if (loghandle->lgh_cur_idx > 0) { /** - * The lgc_offset can be used only if index is + * The lgh_cur_offset can be used only if index is * the same. 
*/ - if (idx != reccookie->lgc_index) { + if (idx != loghandle->lgh_cur_idx) { CERROR("%s: modify index mismatch %d %d\n", o->do_lu.lo_dev->ld_obd->obd_name, idx, - reccookie->lgc_index); + loghandle->lgh_cur_idx); RETURN(-EFAULT); } - lgi->lgi_off = reccookie->lgc_offset; + lgi->lgi_off = loghandle->lgh_cur_offset; CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " "len:%u offset %llu\n", PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, rec->lrh_len, (long long)lgi->lgi_off); + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; } else { /* This can be result of lgh_cur_idx is not set during * llog processing or llh_size is not set to proper @@ -591,7 +590,6 @@ static int llog_osd_write_rec(const struct lu_env *env, RETURN(-ENOSPC); } - down_write(&loghandle->lgh_last_sem); /* increment the last_idx along with llh_tail index, they should * be equal for a llog lifetime */ loghandle->lgh_last_idx++; @@ -675,12 +673,6 @@ static int llog_osd_write_rec(const struct lu_env *env, if (rc) GOTO(out, rc); - if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && - cfs_fail_val == (unsigned int)(loghandle->lgh_id.lgl_oi.oi.oi_id & - 0xFFFFFFFF)) { - OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); - msleep(1 * MSEC_PER_SEC); - } /* computed index can be used to determine offset for fixed-size * records. This also allows to handle Catalog wrap around case */ if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { @@ -701,8 +693,6 @@ static int llog_osd_write_rec(const struct lu_env *env, if (rc < 0) GOTO(out, rc); - up_write(&loghandle->lgh_last_sem); - CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n", PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, lgi->lgi_off); @@ -736,7 +726,6 @@ static int llog_osd_write_rec(const struct lu_env *env, } LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; - up_write(&loghandle->lgh_last_sem); RETURN(rc); } @@ -792,46 +781,19 @@ static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, * big enough to handle the remapped records. It is also assumed that records * of a block have the same format (i.e.: the same features enabled). * - * \param[in,out] hdr Header of the block of records to remap. - * \param[in,out] last_hdr Last header, don't read past this point. - * \param[in] flags Flags describing the fields to keep. - * \param[in] extra_flags Flags describing the extra fields to keep. + * \param[in,out] hdr Header of the block of records to remap. + * \param[in,out] last_hdr Last header, don't read past this point. + * \param[in] flags Flags describing the fields to keep. 
*/ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, struct llog_rec_hdr *last_hdr, - struct llog_handle *loghandle) + enum changelog_rec_flags flags) { - enum changelog_rec_flags flags = CLF_SUPPORTED; - enum changelog_rec_extra_flags extra_flags = CLFE_SUPPORTED; - - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_XATTR)) - extra_flags &= ~CLFE_XATTR; - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_OMODE)) - extra_flags &= ~CLFE_OPEN; - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_NID)) - extra_flags &= ~CLFE_NID; - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_UIDGID)) - extra_flags &= ~CLFE_UIDGID; - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_EXTRA_FLAGS)) - flags &= ~CLF_EXTRA_FLAGS; - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) - flags &= ~CLF_JOBID; - - if (flags == CLF_SUPPORTED && extra_flags == CLFE_SUPPORTED) - return; - if (hdr->lrh_type != CHANGELOG_REC) return; do { struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1); - enum changelog_rec_extra_flags xflag = CLFE_INVALID; - - if (flags & CLF_EXTRA_FLAGS && - rec->cr_flags & CLF_EXTRA_FLAGS) { - xflag = changelog_rec_extra_flags(rec)->cr_extra_flags & - extra_flags; - } if (unlikely(hdr->lrh_len == 0)) { /* It is corruption case, we cannot know the next rec, @@ -848,7 +810,7 @@ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, break; } - changelog_remap_rec(rec, rec->cr_flags & flags, xflag); + changelog_remap_rec(rec, rec->cr_flags & flags); hdr = llog_rec_hdr_next(hdr); /* Yield CPU to avoid soft-lockup if there are too many records * to be handled. */ @@ -902,7 +864,7 @@ static int llog_osd_next_block(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - LASSERT(llog_osd_exist(loghandle)); + LASSERT(dt_object_exists(o)); dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); @@ -966,25 +928,9 @@ static int llog_osd_next_block(const struct lu_env *env, rec = buf; if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) lustre_swab_llog_rec(rec); + tail = (struct llog_rec_tail *)((char *)buf + rc - sizeof(struct llog_rec_tail)); - - if (llog_verify_record(loghandle, rec)) { - /* - * the block seems corrupted. make a pad record so the - * caller can skip the block and try with the next one - */ - rec->lrh_len = rc; - rec->lrh_index = next_idx; - rec->lrh_type = LLOG_PAD_MAGIC; - - tail = rec_tail(rec); - tail->lrt_len = rc; - tail->lrt_index = next_idx; - - GOTO(out, rc = 0); - } - /* get the last record in block */ last_rec = (struct llog_rec_hdr *)((char *)buf + rc - tail->lrt_len); @@ -1023,7 +969,7 @@ static int llog_osd_next_block(const struct lu_env *env, /* sanity check that the start of the new buffer is no farther * than the record that we wanted. This shouldn't happen. 
*/ - if (next_idx && rec->lrh_index > next_idx) { + if (rec->lrh_index > next_idx) { if (!force_mini_rec && next_idx > last_idx) goto retry; @@ -1034,7 +980,9 @@ static int llog_osd_next_block(const struct lu_env *env, } /* Trim unsupported extensions for compat w/ older clients */ - changelog_block_trim_ext(rec, last_rec, loghandle); + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + changelog_block_trim_ext(rec, last_rec, + CLF_VERSION | CLF_RENAME); GOTO(out, rc = 0); @@ -1092,7 +1040,7 @@ static int llog_osd_prev_block(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - LASSERT(llog_osd_exist(loghandle)); + LASSERT(dt_object_exists(o)); dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); @@ -1169,7 +1117,9 @@ static int llog_osd_prev_block(const struct lu_env *env, } /* Trim unsupported extensions for compat w/ older clients */ - changelog_block_trim_ext(rec, last_rec, loghandle); + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + changelog_block_trim_ext(rec, last_rec, + CLF_VERSION | CLF_RENAME); GOTO(out, rc = 0); } @@ -1458,7 +1408,7 @@ llog_osd_regular_fid_add_name_entry(const struct lu_env *env, (struct dt_key *)name, th); } else { rc = dt_insert(env, dir, (struct dt_rec *)rec, - (struct dt_key *)name, th); + (struct dt_key *)name, th, 1); } dt_write_unlock(env, dir); @@ -1625,7 +1575,8 @@ static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, rec->rec_type = S_IFREG; dt_read_lock(env, llog_dir, 0); rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, - (struct dt_key *)res->lgh_name, th); + (struct dt_key *)res->lgh_name, + th, 1); dt_read_unlock(env, llog_dir); dt_object_put(env, llog_dir); if (rc) @@ -1815,7 +1766,7 @@ static int llog_osd_destroy(const struct lu_env *env, LASSERT(o != NULL); dt_write_lock(env, o, 0); - if (!llog_osd_exist(loghandle)) + if (!dt_object_exists(o)) GOTO(out_unlock, rc = 0); if (loghandle->lgh_name) { @@ -1841,7 +1792,6 @@ static int llog_osd_destroy(const struct lu_env *env, if (rc < 0) GOTO(out_unlock, rc); - loghandle->lgh_destroyed = true; if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); if (rc < 0) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c index c644efb64ac1f..3ab0b430fca14 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -58,9 +58,9 @@ static void print_llogd_body(struct llogd_body *d) void lustre_swab_lu_fid(struct lu_fid *fid) { - __swab64s(&fid->f_seq); - __swab32s(&fid->f_oid); - __swab32s(&fid->f_ver); + __swab64s (&fid->f_seq); + __swab32s (&fid->f_oid); + __swab32s (&fid->f_ver); } EXPORT_SYMBOL(lustre_swab_lu_fid); @@ -80,47 +80,47 @@ void lustre_swab_llog_id(struct llog_logid *log_id) { __swab64s(&log_id->lgl_oi.oi.oi_id); __swab64s(&log_id->lgl_oi.oi.oi_seq); - __swab32s(&log_id->lgl_ogen); + __swab32s(&log_id->lgl_ogen); } void lustre_swab_llogd_body (struct llogd_body *d) { - ENTRY; - print_llogd_body(d); + ENTRY; + print_llogd_body(d); lustre_swab_llog_id(&d->lgd_logid); - __swab32s(&d->lgd_ctxt_idx); - __swab32s(&d->lgd_llh_flags); - __swab32s(&d->lgd_index); - __swab32s(&d->lgd_saved_index); - __swab32s(&d->lgd_len); - __swab64s(&d->lgd_cur_offset); - print_llogd_body(d); - EXIT; + __swab32s (&d->lgd_ctxt_idx); + __swab32s (&d->lgd_llh_flags); + __swab32s (&d->lgd_index); + __swab32s (&d->lgd_saved_index); + __swab32s (&d->lgd_len); + __swab64s (&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; } EXPORT_SYMBOL(lustre_swab_llogd_body); void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) { - __swab64s(&d->lgdc_gen.mnt_cnt); - __swab64s(&d->lgdc_gen.conn_cnt); + __swab64s (&d->lgdc_gen.mnt_cnt); + __swab64s (&d->lgdc_gen.conn_cnt); lustre_swab_llog_id(&d->lgdc_logid); - __swab32s(&d->lgdc_ctxt_idx); + __swab32s (&d->lgdc_ctxt_idx); } EXPORT_SYMBOL(lustre_swab_llogd_conn_body); void lustre_swab_ll_fid(struct ll_fid *fid) { - __swab64s(&fid->id); - __swab32s(&fid->generation); - __swab32s(&fid->f_type); + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); } void lustre_swab_lu_seq_range(struct lu_seq_range *range) { - __swab64s(&range->lsr_start); - __swab64s(&range->lsr_end); - __swab32s(&range->lsr_index); - __swab32s(&range->lsr_flags); + __swab64s (&range->lsr_start); + __swab64s (&range->lsr_end); + __swab32s (&range->lsr_index); + __swab32s (&range->lsr_flags); } EXPORT_SYMBOL(lustre_swab_lu_seq_range); @@ -143,32 +143,32 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) { struct llog_rec_tail *tail = NULL; - __swab32s(&rec->lrh_len); - __swab32s(&rec->lrh_index); - __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); __swab32s(&rec->lrh_id); - switch (rec->lrh_type) { + switch (rec->lrh_type) { case OST_SZ_REC: { - struct llog_size_change_rec *lsc = - (struct llog_size_change_rec *)rec; + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; - lustre_swab_ll_fid(&lsc->lsc_fid); - __swab32s(&lsc->lsc_ioepoch); + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); tail = &lsc->lsc_tail; - break; - } + break; + } case MDS_UNLINK_REC: { - struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; - __swab64s(&lur->lur_oid); - __swab32s(&lur->lur_oseq); - __swab32s(&lur->lur_count); + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); tail = &lur->lur_tail; - break; - } + break; + } case MDS_UNLINK64_REC: { struct llog_unlink64_rec *lur = @@ -199,10 +199,8 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) lustre_swab_lu_fid(&rnm->cr_sfid); lustre_swab_lu_fid(&rnm->cr_spfid); } - /* - * Because the tail follows a variable-length structure we need - * to compute its location at runtime - */ + 
/* Because the tail follows a variable-length structure we need + * to compute its location at runtime */ tail = (struct llog_rec_tail *)((char *)&cr->cr + changelog_rec_size(&cr->cr) + cr->cr.cr_namelen); @@ -211,15 +209,14 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) case CHANGELOG_USER_REC: { - struct llog_changelog_user_rec *cur = - (struct llog_changelog_user_rec *)rec; + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec*)rec; - __swab32s(&cur->cur_id); - __swab64s(&cur->cur_endrec); - __swab32s(&cur->cur_time); + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); tail = &cur->cur_tail; - break; - } + break; + } case HSM_AGENT_REC: { struct llog_agent_req_rec *arr = @@ -233,10 +230,8 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) __swab64s(&arr->arr_hai.hai_extent.offset); __swab64s(&arr->arr_hai.hai_extent.length); __swab64s(&arr->arr_hai.hai_gid); - /* - * no swabing for opaque data - * hai_data[0]; - */ + /* no swabing for opaque data */ + /* hai_data[0]; */ break; } @@ -257,7 +252,6 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) (struct llog_setattr64_rec_v2 *)rec; __swab32s(&lsr2->lsr_projid); - __swab32s(&lsr2->lsr_layout_version); tail = &lsr2->lsr_tail; } else { tail = &lsr->lsr_tail; @@ -297,8 +291,8 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) tail = &lgr->lgr_tail; break; } - case LLOG_PAD_MAGIC: - break; + case LLOG_PAD_MAGIC: + break; case UPDATE_REC: { struct llog_update_record *lur = @@ -318,10 +312,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) update_records_size(record)); break; } - default: - CERROR("Unknown llog rec type %#x swabbing rec %p\n", - rec->lrh_type, rec); - } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } if (tail) { __swab32s(&tail->lrt_len); @@ -349,33 +343,31 @@ static void print_llog_hdr(struct llog_log_hdr *h) void lustre_swab_llog_hdr (struct llog_log_hdr *h) { - ENTRY; - print_llog_hdr(h); + ENTRY; + print_llog_hdr(h); lustre_swab_llog_rec(&h->llh_hdr); - print_llog_hdr(h); - EXIT; + print_llog_hdr(h); + EXIT; } EXPORT_SYMBOL(lustre_swab_llog_hdr); void print_lustre_cfg(struct lustre_cfg *lcfg) { - int i; + int i; + ENTRY; - ENTRY; + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; - if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ - return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); - CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); - CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); - - CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); - CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); - CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); - CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", - libcfs_nid2str(lcfg->lcfg_nid)); + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) @@ -385,48 +377,47 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) lustre_cfg_string(lcfg, i)); } - EXIT; + EXIT; } EXPORT_SYMBOL(print_lustre_cfg); void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) { - int i; - - ENTRY; - - __swab32s(&lcfg->lcfg_version); - - if 
(lcfg->lcfg_version != LUSTRE_CFG_VERSION) { - CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", - lcfg->lcfg_version, LUSTRE_CFG_VERSION); - EXIT; - return; - } - - __swab32s(&lcfg->lcfg_command); - __swab32s(&lcfg->lcfg_num); - __swab32s(&lcfg->lcfg_flags); - __swab64s(&lcfg->lcfg_nid); - __swab32s(&lcfg->lcfg_bufcount); - for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) - __swab32s(&lcfg->lcfg_buflens[i]); - - print_lustre_cfg(lcfg); - EXIT; - return; + int i; + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; } /* used only for compatibility with old on-disk cfg_marker data */ struct cfg_marker32 { - __u32 cm_step; - __u32 cm_flags; - __u32 cm_vers; - __u32 padding; - __u32 cm_createtime; - __u32 cm_canceltime; - char cm_tgtname[MTI_NAME_MAXLEN]; - char cm_comment[MTI_NAME_MAXLEN]; + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; }; #define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ @@ -434,51 +425,48 @@ struct cfg_marker32 { void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) { - struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; - - ENTRY; - - if (swab) { - __swab32s(&marker->cm_step); - __swab32s(&marker->cm_flags); - __swab32s(&marker->cm_vers); - } - if (size == sizeof(*cm32)) { - __u32 createtime, canceltime; - /* - * There was a problem with the original declaration of - * cfg_marker on 32-bit systems because it used time_t as - * a wire protocol structure, and didn't verify this in - * wirecheck. We now have to convert the offsets of the - * later fields in order to work on 32- and 64-bit systems. - * - * Fortunately, the cm_comment field has no functional use - * so can be sacrificed when converting the timestamp size. - * - * Overwrite fields from the end first, so they are not - * clobbered, and use memmove() instead of memcpy() because - * the source and target buffers overlap. 
bug 16771 - */ - createtime = cm32->cm_createtime; - canceltime = cm32->cm_canceltime; - memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); - marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; - memmove(marker->cm_tgtname, cm32->cm_tgtname, - sizeof(marker->cm_tgtname)); - if (swab) { - __swab32s(&createtime); - __swab32s(&canceltime); - } - marker->cm_createtime = createtime; - marker->cm_canceltime = canceltime; - CDEBUG(D_CONFIG, - "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", - marker->cm_tgtname); - } else if (swab) { - __swab64s(&marker->cm_createtime); - __swab64s(&marker->cm_canceltime); - } - - EXIT; - return; + struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " + "for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c index f1517ceef7198..27f52aa15078b 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,8 +39,6 @@ #include #include -#include -#include #include #include @@ -49,14 +47,15 @@ /* This is slightly more than the number of records that can fit into a * single llog file, because the llog_log_header takes up some of the * space in the first block that cannot be used for the bitmap. 
*/ -static int llog_test_recnum = (LLOG_MIN_CHUNK_SIZE * 8); +#define LLOG_TEST_RECNUM (LLOG_MIN_CHUNK_SIZE * 8) + static int llog_test_rand; static struct obd_uuid uuid = { .uuid = "test_uuid" }; static struct llog_logid cat_logid; struct llog_mini_rec { - struct llog_rec_hdr lmr_hdr; - struct llog_rec_tail lmr_tail; + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; } __attribute__((packed)); static int verify_handle(char *test, struct llog_handle *llh, int num_recs) @@ -102,8 +101,8 @@ static int verify_handle(char *test, struct llog_handle *llh, int num_recs) static int llog_test_1(const struct lu_env *env, struct obd_device *obd, char *name) { - struct llog_handle *llh; - struct llog_ctxt *ctxt; + struct llog_handle *llh; + struct llog_ctxt *ctxt; int rc; int rc2; @@ -149,11 +148,11 @@ static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_2(const struct lu_env *env, struct obd_device *obd, char *name, struct llog_handle **llh) { - struct llog_ctxt *ctxt; - struct llog_handle *lgh; - struct llog_logid logid; - int rc; - struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; ENTRY; @@ -192,7 +191,7 @@ static int llog_test_2(const struct lu_env *env, struct obd_device *obd, logid = lgh->lgh_id; lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + lmr.lmr_hdr.lrh_type = 0xf02f02; /* Check llog header values are correct after record add/cancel */ CWARN("2b: write 1 llog records, check llh_count\n"); @@ -302,10 +301,8 @@ static int test3_check_n_add_cb(const struct lu_env *env, } else { size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; - /* - * For variable size records the start offset is unknown, trust - * the first value and check others are consistent with it. - */ + /* For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. */ if (test_3_rec_off == 0) test_3_rec_off = lgh->lgh_cur_offset; @@ -340,10 +337,8 @@ static int test3_check_n_add_cb(const struct lu_env *env, if (rc < 0) CERROR("cb_test_3: cannot modify record while processing\n"); - /* - * Add new record to the llog at *last_rec position one by one to - * check that last block is re-read during processing - */ + /* Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing */ if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); if (rc < 0) @@ -409,8 +404,7 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; - /* - * Fill the llog with 64-bytes records, use 1023 records, + /* Fill the llog with 64-bytes records, use 1023 records, * so last chunk will be partially full. Don't change this * value until record size is changed. */ @@ -472,17 +466,14 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, CWARN("3b: write 566 variable size llog records\n"); - /* - * Drop llh_size to 0 to mark llog as variable-size and write - * header to make this change permanent. - */ + /* Drop llh_size to 0 to mark llog as variable-size and write + * header to make this change permanent. 
*/ llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE; llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX); hdr->lrh_type = OBD_CFG_REC; - /* - * there are 1025 64-bytes records in llog already, + /* there are 1025 64-bytes records in llog already, * the last chunk contains single record, i.e. 64 bytes. * Each pair of variable size records is 200 bytes, so * we will have the following distribution per chunks: @@ -575,15 +566,15 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, /* Test catalogue additions */ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *cath, *llh; - char name[10]; - int rc, rc2, i, buflen; - struct llog_mini_rec lmr; - struct llog_cookie cookie; - struct llog_ctxt *ctxt; - int num_recs = 0; - char *buf; - struct llog_rec_hdr *rec; + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; ENTRY; @@ -591,7 +582,7 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + lmr.lmr_hdr.lrh_type = 0xf00f00; sprintf(name, "%x", llog_test_rand + 1); CWARN("4a: create a catalog log with name: %s\n", name); @@ -624,18 +615,6 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* estimate the max number of record for the plain llog - * cause it depends on disk size - */ - llh = cath->u.chd.chd_current_log; - if (llh->lgh_max_size != 0) { - llog_test_recnum = (llh->lgh_max_size - - sizeof(struct llog_log_hdr)) / LLOG_MIN_REC_SIZE; - } - - if (llog_test_recnum >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) - llog_test_recnum = LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; - CWARN("4c: cancel 1 log record\n"); rc = llog_cat_cancel_records(env, cath, 1, &cookie); if (rc) { @@ -648,12 +627,12 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - CWARN("4d: write %d more log records\n", llog_test_recnum); - for (i = 0; i < llog_test_recnum; i++) { + CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("4d: write %d records failed at #%d: %d\n", - llog_test_recnum, i + 1, rc); + LLOG_TEST_RECNUM, i + 1, rc); GOTO(out, rc); } num_recs++; @@ -701,8 +680,8 @@ static int cat_counter; static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct lu_fid fid = {0}; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; if (rec->lrh_type != LLOG_LOGID_MAGIC) { CERROR("invalid record in catalog\n"); @@ -760,7 +739,7 @@ static int llog_cancel_rec_cb(const struct lu_env *env, llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); cancel_count++; - if (cancel_count == llog_test_recnum) + if (cancel_count == LLOG_TEST_RECNUM) RETURN(-LLOG_EEMPTY); RETURN(0); } @@ -768,11 +747,11 @@ static int llog_cancel_rec_cb(const struct lu_env *env, /* Test log and catalogue processing */ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - char name[10]; - int rc, rc2; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; + 
struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; ENTRY; @@ -780,7 +759,7 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + lmr.lmr_hdr.lrh_type = 0xf00f00; CWARN("5a: re-open catalog by id\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -807,7 +786,7 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - CWARN("5c: Cancel %d records, see one log zapped\n", llog_test_recnum); + CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); cancel_count = 0; rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { @@ -878,14 +857,14 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) static int llog_test_6(const struct lu_env *env, struct obd_device *obd, char *name) { - struct obd_device *mgc_obd; - struct llog_ctxt *ctxt; - struct obd_uuid *mgs_uuid; - struct obd_export *exp; - struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; - struct llog_handle *llh = NULL; - struct llog_ctxt *nctxt; - int rc, rc2; + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); LASSERT(ctxt); @@ -994,9 +973,9 @@ static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct llog_handle *llh; - int rc = 0, i, process_count; - int num_recs = 0; + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; ENTRY; @@ -1079,8 +1058,8 @@ static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) /* Test all llog records writing and processing */ static int llog_test_7(const struct lu_env *env, struct obd_device *obd) { - struct llog_ctxt *ctxt; - int rc; + struct llog_ctxt *ctxt; + int rc; ENTRY; @@ -1179,6 +1158,61 @@ static int llog_test_7(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } +static int llog_truncate(const struct lu_env *env, struct dt_object *o) +{ + struct lu_attr la; + struct thandle *th; + struct dt_device *d; + int rc; + ENTRY; + + LASSERT(o); + d = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(d); + + rc = dt_attr_get(env, o, &la); + if (rc) + RETURN(rc); + + CDEBUG(D_OTHER, "original size %llu\n", la.la_size); + rc = sizeof(struct llog_log_hdr) + sizeof(struct llog_mini_rec); + if (la.la_size < rc) { + CERROR("too small llog: %llu\n", la.la_size); + RETURN(0); + } + + /* drop 2 records */ + la.la_size = la.la_size - (sizeof(struct llog_mini_rec) * 2); + la.la_valid = LA_SIZE; + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_attr_set(env, o, &la, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(stop, rc); + + rc = dt_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + + rc = dt_attr_set(env, o, &la, th); + if (rc) + GOTO(stop, rc); + +stop: + dt_trans_stop(env, d, th); + + RETURN(rc); +} + static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *rec, void 
*data) { @@ -1188,13 +1222,13 @@ static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_8(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - char name[10]; - int rc, rc2, i; - int orig_counter; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; - struct dt_object *obj = NULL; + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2, i; + int orig_counter; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct dt_object *obj = NULL; ENTRY; @@ -1202,7 +1236,7 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + lmr.lmr_hdr.lrh_type = 0xf00f00; CWARN("8a: fill the first plain llog\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -1268,7 +1302,7 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) } } CWARN("8b: second llog "DFID"\n", - PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); + PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); rc2 = llog_cat_close(env, llh); if (rc2) { @@ -1278,10 +1312,8 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) GOTO(out_put, rc); } - /* Here was 8c: drop two records from the first plain llog - * llog_truncate was bad idea cause it creates a wrong state, - * lgh_last_idx is wrong and two records belongs to zeroed buffer - */ + CWARN("8c: drop two records from the first plain llog\n"); + llog_truncate(env, obj); CWARN("8d: count survived records\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -1303,9 +1335,9 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc); } - if (orig_counter + 200 != plain_counter) { + if (orig_counter + 200 - 2 != plain_counter) { CERROR("found %d records (expected %d)\n", plain_counter, - orig_counter + 200); + orig_counter + 200 - 2); rc = -EIO; } @@ -1328,9 +1360,9 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct llog_handle *llh; - struct lu_fid fid; - int rc = 0; + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; ENTRY; @@ -1365,8 +1397,8 @@ static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) /* Prepare different types of llog records for llog_reader test*/ static int llog_test_9(const struct lu_env *env, struct obd_device *obd) { - struct llog_ctxt *ctxt; - int rc; + struct llog_ctxt *ctxt; + int rc; ENTRY; @@ -1422,80 +1454,17 @@ static int llog_test_9(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } -struct llog_process_info { - struct llog_handle *lpi_loghandle; - llog_cb_t lpi_cb; - void *lpi_cbdata; - void *lpi_catdata; - int lpi_rc; - struct completion lpi_completion; - const struct lu_env *lpi_env; - struct task_struct *lpi_reftask; -}; - - -static int llog_test_process_thread(void *arg) -{ - struct llog_process_info *lpi = arg; - int rc; - - rc = llog_cat_process_or_fork(NULL, lpi->lpi_loghandle, lpi->lpi_cb, - NULL, lpi->lpi_cbdata, 1, 0, true); - - complete(&lpi->lpi_completion); - - lpi->lpi_rc = rc; - if (rc) - CWARN("10h: Error during catalog processing %d\n", rc); - return rc; -} - -static int cat_check_old_cb(const struct lu_env *env, struct llog_handle *llh, - struct llog_rec_hdr *rec, void *data) -{ - struct llog_logid_rec *lir = (struct 
llog_logid_rec *)rec; - struct lu_fid fid = {0}; - struct lu_fid *prev_fid = data; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); - } - - logid_to_fid(&lir->lid_id, &fid); - - CWARN("seeing record at index %d - "DFID" in log "DFID"\n", - rec->lrh_index, PFID(&fid), - PFID(lu_object_fid(&llh->lgh_obj->do_lu))); - - if (prev_fid->f_oid > fid.f_oid) { - CWARN("processing old record, fail\n"); - prev_fid->f_oid = 0xbad; - RETURN(-LLOG_EEMPTY); - } - - if (prev_fid->f_oid == 0) { - cfs_fail_loc = OBD_FAIL_ONCE | OBD_FAIL_LLOG_PROCESS_TIMEOUT; - cfs_fail_val = (unsigned int) (llh->lgh_id.lgl_oi.oi.oi_id & - 0xFFFFFFFF); - msleep(1 * MSEC_PER_SEC); - } - *prev_fid = fid; - - RETURN(0); -} - /* test catalog wrap around */ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *cath; - char name[10]; - int rc, rc2, i, enospc, eok; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; - struct lu_attr la; - __u64 cat_max_size; - struct dt_device *dt; + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; ENTRY; @@ -1503,7 +1472,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; + lmr.lmr_hdr.lrh_type = 0xf00f00; snprintf(name, sizeof(name), "%x", llog_test_rand + 2); CWARN("10a: create a catalog log with name: %s\n", name); @@ -1521,11 +1490,9 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) cat_logid = cath->lgh_id; dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10c: sync failed: %d\n", rc); @@ -1536,12 +1503,12 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; cfs_fail_val = 4; - CWARN("10b: write %d log records\n", llog_test_recnum); - for (i = 0; i < llog_test_recnum; i++) { + CWARN("10b: write %d log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("10b: write %d records failed at #%d: %d\n", - llog_test_recnum, i + 1, rc); + LLOG_TEST_RECNUM, i + 1, rc); GOTO(out, rc); } } @@ -1551,23 +1518,21 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10b: sync failed: %d\n", rc); GOTO(out, rc); } - CWARN("10c: write %d more log records\n", 2 * llog_test_recnum); - for (i = 0; i < 2 * llog_test_recnum; i++) { + CWARN("10c: write %d more log records\n", 2 * LLOG_TEST_RECNUM); + for (i = 0; i < 2 * LLOG_TEST_RECNUM; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("10c: write %d 
records failed at #%d: %d\n", - 2*llog_test_recnum, i + 1, rc); + 2*LLOG_TEST_RECNUM, i + 1, rc); GOTO(out, rc); } } @@ -1577,35 +1542,29 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10c: sync failed: %d\n", rc); GOTO(out, rc); } - /* - * fill last allocated plain LLOG and reach -ENOSPC condition - * because no slot available in Catalog - */ + /* fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog */ enospc = 0; eok = 0; - CWARN("10c: write %d more log records\n", llog_test_recnum); - for (i = 0; i < llog_test_recnum; i++) { + CWARN("10c: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10c: write %d records failed at #%d: %d\n", - llog_test_recnum, i + 1, rc); + LLOG_TEST_RECNUM, i + 1, rc); GOTO(out, rc); } - /* - * after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC - */ + /* after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC */ if (rc == -ENOSPC) { enospc++; } else { @@ -1614,7 +1573,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { CERROR("10c: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1636,19 +1595,15 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } cat_max_size = la.la_size; - /* - * cancel all 1st plain llog records to empty it, this will also cause - * its catalog entry to be freed for next forced wrap in 10e - */ - CWARN("10d: Cancel %d records, see one log zapped\n", llog_test_recnum); + /* cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e */ + CWARN("10d: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); - /* - * need to indicate error if for any reason llog_test_recnum is - * not reached - */ + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1671,11 +1626,9 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10d: sync failed: %d\n", rc); @@ -1684,18 +1637,16 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) enospc = 0; eok = 0; - CWARN("10e: write %d more log records\n", llog_test_recnum); - for (i = 0; i < llog_test_recnum; i++) { + CWARN("10e: write %d 
more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10e: write %d records failed at #%d: %d\n", - llog_test_recnum, i + 1, rc); + LLOG_TEST_RECNUM, i + 1, rc); GOTO(out, rc); } - /* - * after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC - */ + /* after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC */ if (rc == -ENOSPC) { enospc++; } else { @@ -1704,7 +1655,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { CERROR("10e: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1715,14 +1666,13 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10e: print the catalog entries.. we expect 4\n"); cat_counter = 0; - rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", - 0, 0, false); + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); if (rc) { - CERROR("10e: process with cat_print_cb failed: %d\n", rc); + CERROR("10d: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); } if (cat_counter != 4) { - CERROR("10e: %d entries in catalog\n", cat_counter); + CERROR("10d: %d entries in catalog\n", cat_counter); GOTO(out, rc = -EINVAL); } @@ -1752,30 +1702,24 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10e: sync failed: %d\n", rc); GOTO(out, rc); } - /* - * cancel more records to free one more slot in Catalog - * see if it is re-allocated when adding more records - */ - CWARN("10f: Cancel %d records, see one log zapped\n", llog_test_recnum); + /* cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records */ + CWARN("10f: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); - /* - * need to indicate error if for any reason llog_test_recnum is - * not reached - */ + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1783,8 +1727,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10f: print the catalog entries.. 
we expect 3\n"); cat_counter = 0; - rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", - 0, 0, false); + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); if (rc) { CERROR("10f: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1799,11 +1742,9 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10f: sync failed: %d\n", rc); @@ -1812,18 +1753,16 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) enospc = 0; eok = 0; - CWARN("10f: write %d more log records\n", llog_test_recnum); - for (i = 0; i < llog_test_recnum; i++) { + CWARN("10f: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10f: write %d records failed at #%d: %d\n", - llog_test_recnum, i + 1, rc); + LLOG_TEST_RECNUM, i + 1, rc); GOTO(out, rc); } - /* - * after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC - */ + /* after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC */ if (rc == -ENOSPC) { enospc++; } else { @@ -1832,7 +1771,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { CERROR("10f: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1867,11 +1806,9 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10f: sync failed: %d\n", rc); @@ -1880,18 +1817,16 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) /* will llh_cat_idx also successfully wrap ? */ - /* - * cancel all records in the plain LLOGs referenced by 2 last indexes in - * Catalog - */ + /* cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog */ /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason llog_test_recnum is + /* need to indicate error if for any reason LLOG_TEST_RECNUM is * not reached */ if (rc == 0) rc = -ERANGE; @@ -1900,8 +1835,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. 
we expect 3\n"); cat_counter = 0; - rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", - 0, 0, false); + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1916,11 +1850,9 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10g: sync failed: %d\n", rc); @@ -1928,15 +1860,13 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* - * need to indicate error if for any reason llog_test_recnum is - * not reached - */ + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1944,8 +1874,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. we expect 2\n"); cat_counter = 0; - rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", - 0, 0, false); + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1968,11 +1897,9 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - /* - * sync device to commit all recent LLOG changes to disk and avoid + /* sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs - */ + * particularly on low memory nodes or VMs */ rc = dt_sync(env, dt); if (rc) { CERROR("10g: sync failed: %d\n", rc); @@ -1980,15 +1907,13 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); + CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* - * need to indicate error if for any reason llog_test_recnum is - * not reached - */ + /* need to indicate error if for any reason LLOG_TEST_RECNUM is + * not reached */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1996,8 +1921,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. 
we expect 1\n"); cat_counter = 0; - rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", - 0, 0, false); + rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -2022,64 +1946,6 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); - /* - * catalog has only one valid entry other slots has outdated - * records. Trying to race the llog_thread_process with llog_add - * llog_thread_process read buffer and loop record on it. - * llog_add adds a record and mark a record in bitmap. - * llog_thread_process process record with old data. - */ - { - struct llog_process_info lpi; - struct lu_fid test_fid = {0}; - - lpi.lpi_loghandle = cath; - lpi.lpi_cb = cat_check_old_cb; - lpi.lpi_catdata = NULL; - lpi.lpi_cbdata = &test_fid; - init_completion(&lpi.lpi_completion); - - kthread_run(llog_test_process_thread, &lpi, "llog_test_process_thread"); - - msleep(1 * MSEC_PER_SEC / 2); - enospc = 0; - eok = 0; - CWARN("10h: write %d more log records\n", llog_test_recnum); - for (i = 0; i < llog_test_recnum; i++) { - rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); - if (rc && rc != -ENOSPC) { - CERROR("10h: write %d records failed at #%d: %d\n", - llog_test_recnum, i + 1, rc); - GOTO(out, rc); - } - /* - * after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC - */ - if (rc == -ENOSPC) { - enospc++; - } else { - enospc = 0; - eok++; - } - } - - if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { - CERROR("10h: all last records adds should have failed with" - " -ENOSPC\n"); - GOTO(out, rc = -EINVAL); - } - - CWARN("10h: wrote %d records then %d failed with ENOSPC\n", eok, - enospc); - - wait_for_completion(&lpi.lpi_completion); - - if (lpi.lpi_rc != 0) { - CERROR("10h: race happened, old record was processed\n"); - GOTO(out, rc = -EINVAL); - } - } out: cfs_fail_loc = 0; cfs_fail_val = 0; @@ -2096,17 +1962,15 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } -/* - * ------------------------------------------------------------------------- +/* ------------------------------------------------------------------------- * Tests above, boring obd functions below - * ------------------------------------------------------------------------- - */ + * ------------------------------------------------------------------------- */ static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - struct llog_ctxt *ctxt; - int rc, err; - char name[10]; + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; ENTRY; ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); @@ -2168,9 +2032,9 @@ static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) static int llog_test_cleanup(struct obd_device *obd) { - struct obd_device *tgt; - struct lu_env env; - int rc; + struct obd_device *tgt; + struct lu_env env; + int rc; ENTRY; @@ -2188,32 +2052,32 @@ static int llog_test_cleanup(struct obd_device *obd) static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { - struct obd_device *tgt; - struct llog_ctxt *ctxt; - struct dt_object *o; - struct lu_env env; - struct lu_context test_session; - int rc; - - ENTRY; - - if (lcfg->lcfg_bufcount < 2) { - CERROR("requires a TARGET OBD name\n"); - RETURN(-EINVAL); - } + struct obd_device *tgt; + struct 
llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } - if (lcfg->lcfg_buflens[1] < 1) { - CERROR("requires a TARGET OBD name\n"); - RETURN(-EINVAL); - } + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } - /* disk obd */ - tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); - if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { - CERROR("target device not attached or not set up (%s)\n", - lustre_cfg_string(lcfg, 1)); - RETURN(-EINVAL); - } + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); if (rc) @@ -2262,14 +2126,14 @@ static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) } static struct obd_ops llog_obd_ops = { - .o_owner = THIS_MODULE, - .o_setup = llog_test_setup, - .o_cleanup = llog_test_cleanup, + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, }; static int __init llog_test_init(void) { - return class_register_type(&llog_obd_ops, NULL, false, NULL, + return class_register_type(&llog_obd_ops, NULL, true, NULL, "llog_test", NULL); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c index 04c25ebd88274..89b227b0cfa09 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * lustre/obdclass/local_storage.c @@ -388,14 +388,14 @@ static struct dt_object *__local_file_create(const struct lu_env *env, rec->rec_fid = fid; /* Add "." and ".." 
for newly created dir */ rc = dt_insert(env, dto, (const struct dt_rec *)rec, - (const struct dt_key *)".", th); + (const struct dt_key *)".", th, 1); if (rc != 0) GOTO(destroy, rc); dt_ref_add(env, dto, th); rec->rec_fid = lu_object_fid(&parent->do_lu); rc = dt_insert(env, dto, (const struct dt_rec *)rec, - (const struct dt_key *)"..", th); + (const struct dt_key *)"..", th, 1); if (rc != 0) GOTO(destroy, rc); } @@ -404,7 +404,7 @@ static struct dt_object *__local_file_create(const struct lu_env *env, rec->rec_type = dto->do_lu.lo_header->loh_attr; dt_write_lock(env, parent, LOS_PARENT); rc = dt_insert(env, parent, (const struct dt_rec *)rec, - (const struct dt_key *)name, th); + (const struct dt_key *)name, th, 1); if (dti->dti_dof.dof_type == DFT_DIR) dt_ref_add(env, parent, th); dt_write_unlock(env, parent); @@ -684,7 +684,7 @@ int local_object_unlink(const struct lu_env *env, struct dt_device *dt, rec->rec_fid = &dti->dti_fid; rec->rec_type = dto->do_lu.lo_header->loh_attr; rc = dt_insert(env, parent, (const struct dt_rec *)rec, - (const struct dt_key *)name, th); + (const struct dt_key *)name, th, 1); GOTO(unlock, rc); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c index 37d749d199275..00395af273593 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -30,8 +30,10 @@ #define DEBUG_SUBSYSTEM S_CLASS + #include #include +#include #ifdef CONFIG_PROC_FS @@ -65,8 +67,8 @@ struct job_stat { struct hlist_node js_hash; /* hash struct for this jobid */ struct list_head js_list; /* on ojs_list, with ojs_lock */ atomic_t js_refcount; /* num users of this struct */ - char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/ - time64_t js_timestamp; /* seconds of most recent stat*/ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name */ + time_t js_timestamp; /* seconds of most recent stat*/ struct lprocfs_stats *js_stats; /* per-job statistics */ struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ }; @@ -162,7 +164,7 @@ static int job_cleanup_iter_callback(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *data) { - time64_t oldest_time = *((time64_t *)data); + time_t oldest_time = *((time_t *)data); struct job_stat *job; job = hlist_entry(hnode, struct job_stat, js_hash); @@ -191,8 +193,8 @@ static int job_cleanup_iter_callback(struct cfs_hash *hs, */ static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) { - time64_t now = ktime_get_real_seconds(); - time64_t oldest; + time_t now = cfs_time_current_sec(); + time_t oldest; if (likely(before >= 0)) { unsigned int cleanup_interval = stats->ojs_cleanup_interval; @@ -232,7 +234,7 @@ static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) write_lock(&stats->ojs_lock); stats->ojs_cleaning = false; - stats->ojs_last_cleanup = ktime_get_real_seconds(); + stats->ojs_last_cleanup = cfs_time_current_sec(); write_unlock(&stats->ojs_lock); } @@ -252,8 +254,8 @@ static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) jobs->ojs_cntr_init_fn(job->js_stats); - memcpy(job->js_jobid, jobid, sizeof(job->js_jobid)); - job->js_timestamp = ktime_get_real_seconds(); + memcpy(job->js_jobid, jobid, LUSTRE_JOBID_SIZE); + job->js_timestamp = cfs_time_current_sec(); job->js_jobstats = jobs; INIT_HLIST_NODE(&job->js_hash); INIT_LIST_HEAD(&job->js_list); @@ -313,7 +315,7 @@ int lprocfs_job_stats_log(struct 
obd_device *obd, char *jobid, found: LASSERT(stats == job->js_jobstats); - job->js_timestamp = ktime_get_real_seconds(); + job->js_timestamp = cfs_time_current_sec(); lprocfs_counter_add(job->js_stats, event, amount); job_putref(job); @@ -442,7 +444,7 @@ static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) } seq_putc(p, '\n'); - seq_printf(p, " %-16s %lld\n", "snapshot_time:", job->js_timestamp); + seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); s = job->js_stats; for (i = 0; i < s->ls_num; i++) { @@ -513,7 +515,7 @@ static ssize_t lprocfs_jobstats_seq_write(struct file *file, if (stats->ojs_hash == NULL) return -ENODEV; - if (copy_from_user(jobid, buf, len)) + if (lprocfs_copy_from_user(file, jobid, buf, len)) return -EFAULT; jobid[len] = 0; @@ -613,7 +615,7 @@ int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, stats->ojs_cntr_num = cntr_num; stats->ojs_cntr_init_fn = init_fn; stats->ojs_cleanup_interval = 600; /* 10 mins by default */ - stats->ojs_last_cleanup = ktime_get_real_seconds(); + stats->ojs_last_cleanup = cfs_time_current_sec(); entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, &lprocfs_jobstats_seq_fops); @@ -624,38 +626,45 @@ int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, RETURN(0); } EXPORT_SYMBOL(lprocfs_job_stats_init); -#endif /* CONFIG_PROC_FS*/ -ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_job_interval_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; struct obd_job_stats *stats; + if (obd == NULL) + return -ENODEV; + stats = &obd->u.obt.obt_jobstats; - return scnprintf(buf, PAGE_SIZE, "%d\n", stats->ojs_cleanup_interval); + seq_printf(m, "%d\n", stats->ojs_cleanup_interval); + return 0; } -EXPORT_SYMBOL(job_cleanup_interval_show); +EXPORT_SYMBOL(lprocfs_job_interval_seq_show); -ssize_t job_cleanup_interval_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) +ssize_t +lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd; struct obd_job_stats *stats; - unsigned int val; int rc; + __s64 val; + + obd = ((struct seq_file *)file->private_data)->private; + if (obd == NULL) + return -ENODEV; stats = &obd->u.obt.obt_jobstats; - rc = kstrtouint(buffer, 0, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > UINT_MAX) + return -ERANGE; stats->ojs_cleanup_interval = val; lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); return count; } -EXPORT_SYMBOL(job_cleanup_interval_store); +EXPORT_SYMBOL(lprocfs_job_interval_seq_write); +#endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index 7a365730746d6..f3d2efc8403ba 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,6 +38,7 @@ #include #include +#include #ifdef CONFIG_PROC_FS @@ -47,15 +48,52 @@ MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs #define MAX_STRING_SIZE 128 +static const struct file_operations lprocfs_kernel_dummy = {}; + +/* + * Awful hacks to mark procfs seq writes as going to kernel space. Used + * to be done with set_fs(KERNEL_DS), but that function is no more. + * This should only be called from class_process_proc_param(), which passes + * in a fake file structure. It should never, ever be used for anything else. + */ +void lprocfs_file_set_kernel(struct file *file) +{ + LASSERT(file->f_op == NULL); + file->f_op = &lprocfs_kernel_dummy; +} +EXPORT_SYMBOL(lprocfs_file_set_kernel); + +bool lprocfs_file_is_kernel(struct file *file) +{ + return (file->f_op == &lprocfs_kernel_dummy); +} +EXPORT_SYMBOL(lprocfs_file_is_kernel); + +unsigned long +lprocfs_copy_from_user(struct file *file, void *to, + const void __user *from, unsigned long n) +{ + unsigned long res; + + if (lprocfs_file_is_kernel(file)) { + memcpy(to, from, n); + res = 0; + } else + res = copy_from_user(to, from, n); + + return res; +} +EXPORT_SYMBOL(lprocfs_copy_from_user); + int lprocfs_single_release(struct inode *inode, struct file *file) { - return single_release(inode, file); + return single_release(inode, file); } EXPORT_SYMBOL(lprocfs_single_release); int lprocfs_seq_release(struct inode *inode, struct file *file) { - return seq_release(inode, file); + return seq_release(inode, file); } EXPORT_SYMBOL(lprocfs_seq_release); @@ -78,8 +116,8 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, struct proc_dir_entry *proc; umode_t mode; - if (!root || !name || !fops) - return ERR_PTR(-EINVAL); + if (root == NULL || name == NULL || fops == NULL) + return ERR_PTR(-EINVAL); mode = default_mode(fops); proc = proc_create_data(name, mode, root, fops, data); @@ -88,43 +126,42 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, name); return ERR_PTR(-ENOMEM); } - return proc; + return proc; } EXPORT_SYMBOL(lprocfs_add_simple); struct proc_dir_entry *lprocfs_add_symlink(const char *name, - struct proc_dir_entry *parent, - const char *format, ...) + struct proc_dir_entry *parent, const char *format, ...) 
{ - struct proc_dir_entry *entry; - char *dest; - va_list ap; + struct proc_dir_entry *entry; + char *dest; + va_list ap; - if (!parent || !format) - return NULL; + if (parent == NULL || format == NULL) + return NULL; - OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); - if (!dest) - return NULL; + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; - va_start(ap, format); - vsnprintf(dest, MAX_STRING_SIZE, format, ap); - va_end(ap); + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); - entry = proc_symlink(name, parent, dest); - if (!entry) + entry = proc_symlink(name, parent, dest); + if (entry == NULL) CERROR("LprocFS: Could not create symbolic link from " "%s to %s\n", name, dest); - OBD_FREE(dest, MAX_STRING_SIZE + 1); - return entry; + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; } EXPORT_SYMBOL(lprocfs_add_symlink); static const struct file_operations ldebugfs_empty_ops = { }; int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list, - void *data) + void *data) { if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) return -EINVAL; @@ -169,10 +206,10 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, void *data) { - if (!root || !list) + if (root == NULL || list == NULL) return -EINVAL; - while (list->name) { + while (list->name != NULL) { struct proc_dir_entry *proc; umode_t mode = 0; @@ -183,7 +220,7 @@ lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, proc = proc_create_data(list->name, mode, root, list->fops ?: &lprocfs_empty_ops, list->data ?: data); - if (!proc) + if (proc == NULL) return -ENOMEM; list++; } @@ -193,7 +230,7 @@ EXPORT_SYMBOL(lprocfs_add_vars); void ldebugfs_remove(struct dentry **entryp) { - debugfs_remove_recursive(*entryp); + debugfs_remove(*entryp); *entryp = NULL; } EXPORT_SYMBOL_GPL(ldebugfs_remove); @@ -211,38 +248,36 @@ static void lprocfs_remove_nolock(struct proc_dir_entry **proot) struct proc_dir_entry *parent; *proot = NULL; - if (!root || IS_ERR(root)) + if (root == NULL || IS_ERR(root)) return; - parent = root->parent; - LASSERT(parent != NULL); + parent = root->parent; + LASSERT(parent != NULL); - while (1) { - while (temp->subdir) - temp = temp->subdir; + while (1) { + while (temp->subdir != NULL) + temp = temp->subdir; - rm_entry = temp; - temp = temp->parent; + rm_entry = temp; + temp = temp->parent; - /* - * Memory corruption once caused this to fail, and - * without this LASSERT we would loop here forever. - */ - LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, - "0x%p %s/%s len %d\n", rm_entry, temp->name, - rm_entry->name, (int)strlen(rm_entry->name)); + /* Memory corruption once caused this to fail, and + without this LASSERT we would loop here forever. 
*/ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, temp->name, + rm_entry->name, (int)strlen(rm_entry->name)); - remove_proc_entry(rm_entry->name, temp); - if (temp == parent) - break; - } + remove_proc_entry(rm_entry->name, temp); + if (temp == parent) + break; + } } int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry *t = NULL; - struct proc_dir_entry **p; - int len, busy = 0; + struct proc_dir_entry *t = NULL; + struct proc_dir_entry **p; + int len, busy = 0; LASSERT(parent != NULL); len = strlen(name); @@ -341,10 +376,10 @@ lprocfs_register(const char *name, struct proc_dir_entry *parent, struct proc_dir_entry *newchild; newchild = proc_mkdir(name, parent); - if (!newchild) + if (newchild == NULL) return ERR_PTR(-ENOMEM); - if (list) { + if (list != NULL) { int rc = lprocfs_add_vars(newchild, list, data); if (rc) { lprocfs_remove(&newchild); @@ -356,6 +391,93 @@ lprocfs_register(const char *name, struct proc_dir_entry *parent, EXPORT_SYMBOL(lprocfs_register); /* Generic callbacks */ +int lprocfs_uint_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", *(unsigned int *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_uint_seq_show); + +int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1]; + char *end; + unsigned long tmp; + + if (count >= sizeof(dummy)) + return -EINVAL; + + if (count == 0) + return 0; + + if (lprocfs_copy_from_user(file, dummy, buffer, count)) + return -EFAULT; + + dummy[count] = 0; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +ssize_t lprocfs_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + int *data = ((struct seq_file *)file->private_data)->private; + int rc; + __s64 val = 0; + + rc = lprocfs_str_to_s64(file, buffer, count, &val); + if (rc < 0) + return rc; + + return lprocfs_wr_uint(file, buffer, count, data); +} +EXPORT_SYMBOL(lprocfs_uint_seq_write); + +int lprocfs_u64_seq_show(struct seq_file *m, void *data) +{ + LASSERT(data != NULL); + seq_printf(m, "%llu\n", *(__u64 *)data); + return 0; +} +EXPORT_SYMBOL(lprocfs_u64_seq_show); + +int lprocfs_atomic_seq_show(struct seq_file *m, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + seq_printf(m, "%d\n", atomic_read(atom)); + return 0; +} +EXPORT_SYMBOL(lprocfs_atomic_seq_show); + +ssize_t +lprocfs_atomic_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + atomic_t *atm = ((struct seq_file *)file->private_data)->private; + __s64 val = 0; + int rc; + + rc = lprocfs_str_to_s64(file, buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0 || val > INT_MAX) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_atomic_seq_write); + int lprocfs_uuid_seq_show(struct seq_file *m, void *data) { struct obd_device *obd = data; @@ -366,163 +488,114 @@ int lprocfs_uuid_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_uuid_seq_show); -static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_name_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *dev = data; - return sprintf(buf, "%s\n", obd->obd_uuid.uuid); + LASSERT(dev 
!= NULL); + seq_printf(m, "%s\n", dev->obd_name); + return 0; } -LUSTRE_RO_ATTR(uuid); +EXPORT_SYMBOL(lprocfs_name_seq_show); -static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_blksize_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_statfs osfs; - int rc; - - rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - OBD_STATFS_NODELAY); + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); if (!rc) - return sprintf(buf, "%u\n", osfs.os_bsize); - + seq_printf(m, "%u\n", osfs.os_bsize); return rc; } -LUSTRE_RO_ATTR(blocksize); +EXPORT_SYMBOL(lprocfs_blksize_seq_show); -static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_statfs osfs; - int rc; - - rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - OBD_STATFS_NODELAY); + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); if (!rc) { - u32 blk_size = osfs.os_bsize >> 10; - u64 result = osfs.os_blocks; + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; while (blk_size >>= 1) result <<= 1; - return sprintf(buf, "%llu\n", result); + seq_printf(m, "%llu\n", result); } - return rc; } -LUSTRE_RO_ATTR(kbytestotal); +EXPORT_SYMBOL(lprocfs_kbytestotal_seq_show); -static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_statfs osfs; - int rc; - - rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - OBD_STATFS_NODELAY); + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); if (!rc) { - u32 blk_size = osfs.os_bsize >> 10; - u64 result = osfs.os_bfree; + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; while (blk_size >>= 1) result <<= 1; - return sprintf(buf, "%llu\n", result); + seq_printf(m, "%llu\n", result); } - return rc; } -LUSTRE_RO_ATTR(kbytesfree); +EXPORT_SYMBOL(lprocfs_kbytesfree_seq_show); -static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_statfs osfs; - int rc; - - rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - OBD_STATFS_NODELAY); + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); if (!rc) { - u32 blk_size = osfs.os_bsize >> 10; - u64 result = osfs.os_bavail; + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; while 
(blk_size >>= 1) result <<= 1; - return sprintf(buf, "%llu\n", result); + seq_printf(m, "%llu\n", result); } - return rc; } -LUSTRE_RO_ATTR(kbytesavail); +EXPORT_SYMBOL(lprocfs_kbytesavail_seq_show); -static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_statfs osfs; - int rc; - - rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - OBD_STATFS_NODELAY); + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); if (!rc) - return sprintf(buf, "%llu\n", osfs.os_files); - + seq_printf(m, "%llu\n", osfs.os_files); return rc; } -LUSTRE_RO_ATTR(filestotal); +EXPORT_SYMBOL(lprocfs_filestotal_seq_show); -static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_statfs osfs; - int rc; - - rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, - OBD_STATFS_NODELAY); + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); if (!rc) - return sprintf(buf, "%llu\n", osfs.os_ffree); - + seq_printf(m, "%llu\n", osfs.os_ffree); return rc; } -LUSTRE_RO_ATTR(filesfree); - -ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct ptlrpc_connection *conn; - ssize_t count; - - LPROCFS_CLIMP_CHECK(obd); - conn = obd->u.cli.cl_import->imp_connection; - if (conn && obd->u.cli.cl_import) - count = sprintf(buf, "%s\n", conn->c_remote_uuid.uuid); - else - count = sprintf(buf, "%s\n", ""); - - LPROCFS_CLIMP_EXIT(obd); - return count; -} -EXPORT_SYMBOL(conn_uuid_show); +EXPORT_SYMBOL(lprocfs_filesfree_seq_show); int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) { @@ -543,6 +616,26 @@ int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_server_uuid_seq_show); +int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + int rc = 0; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); + else + seq_printf(m, "%s\n", ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_conn_uuid_seq_show); + /** add up per-cpu counters */ /** @@ -636,14 +729,14 @@ void lprocfs_stats_unlock(struct lprocfs_stats *stats, void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, struct lprocfs_counter *cnt) { - unsigned int num_entry; - struct lprocfs_counter *percpu_cntr; - int i; - unsigned long flags = 0; + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; memset(cnt, 0, sizeof(*cnt)); - if (!stats) { + if (stats == NULL) { /* set count to 1 to avoid divide-by-zero errs in callers */ cnt->lc_count = 1; return; @@ -654,7 +747,7 @@ void 
lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_entry; i++) { - if (!stats->ls_percpu[i]) + if (stats->ls_percpu[i] == NULL) continue; percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); @@ -670,6 +763,16 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); } +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag) \ + do { \ + if (imp->imp_##flag) { \ + seq_printf(m, "%s" #flag, first ? "" : ", "); \ + first = false; \ + } \ + } while (0) static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) { bool first = true; @@ -679,16 +782,19 @@ static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) first = false; } - flag2str(imp, invalid); - flag2str(imp, deactive); - flag2str(imp, replayable); - flag2str(imp, delayed_recovery); - flag2str(imp, vbr_failed); - flag2str(imp, pingable); - flag2str(imp, resend_replay); - flag2str(imp, no_pinger_recover); - flag2str(imp, connect_tried); + flag2str(invalid); + flag2str(deactive); + flag2str(replayable); + flag2str(delayed_recovery); + flag2str(no_lock_replay); + flag2str(vbr_failed); + flag2str(pingable); + flag2str(resend_replay); + flag2str(no_pinger_recover); + flag2str(need_mne_swab); + flag2str(connect_tried); } +#undef flag2str static const char *obd_connect_names[] = { /* flags names */ @@ -752,34 +858,17 @@ static const char *obd_connect_names[] = { "multi_mod_rpcs", "dir_stripe", "subtree", - "lockahead", + "lock_ahead", "bulk_mbits", "compact_obdo", "second_flags", /* flags2 names */ - "file_secctx", /* 0x01 */ - "lockaheadv2", /* 0x02 */ - "dir_migrate", /* 0x04 */ - "sum_statfs", /* 0x08 */ - "overstriping", /* 0x10 */ - "flr", /* 0x20 */ - "wbc", /* 0x40 */ - "lock_convert", /* 0x80 */ - "archive_id_array", /* 0x100 */ - "increasing_xid", /* 0x200 */ - "selinux_policy", /* 0x400 */ - "lsom", /* 0x800 */ - "pcc", /* 0x1000 */ - "unknown", /* 0x2000 */ - "async_discard", /* 0x4000 */ - "client_encryption", /* 0x8000 */ - "fidmap", /* 0x10000 */ - "getattr_pfid", /* 0x20000 */ + "file_secctx", NULL }; -void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, - const char *sep) +static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, + __u64 flags2, const char *sep) { bool first = true; __u64 mask; @@ -816,7 +905,6 @@ void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, first = false; } } -EXPORT_SYMBOL(obd_connect_seq_flags2str); int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, const char *sep) @@ -853,8 +941,8 @@ int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, } EXPORT_SYMBOL(obd_connect_flags2str); -void -obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) +static void obd_connect_data_seqprint(struct seq_file *m, + struct obd_connect_data *ocd) { __u64 flags; @@ -910,16 +998,16 @@ obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) int lprocfs_import_seq_show(struct seq_file *m, void *data) { - char nidstr[LNET_NIDSTR_SIZE]; - struct lprocfs_counter ret; - struct lprocfs_counter_header *header; - struct obd_device *obd = (struct obd_device *)data; - struct obd_import *imp; - struct obd_import_conn *conn; - struct obd_connect_data *ocd; - int j; - int k; - int rw = 0; + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter 
ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; LASSERT(obd != NULL); LPROCFS_CLIMP_CHECK(obd); @@ -953,7 +1041,7 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) seq_printf(m, "%s%s", j ? ", " : "", nidstr); j++; } - if (imp->imp_connection) + if (imp->imp_connection != NULL) libcfs_nid2str_r(imp->imp_connection->c_peer.nid, nidstr, sizeof(nidstr)); else @@ -962,16 +1050,14 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) " current_connection: %s\n" " connection_attempts: %u\n" " generation: %u\n" - " in-progress_invalidations: %u\n" - " idle: %lld sec\n", + " in-progress_invalidations: %u\n", nidstr, imp->imp_conn_cnt, imp->imp_generation, - atomic_read(&imp->imp_inval_count), - ktime_get_real_seconds() - imp->imp_last_reply_time); + atomic_read(&imp->imp_inval_count)); spin_unlock(&imp->imp_lock); - if (!obd->obd_svc_stats) + if (obd->obd_svc_stats == NULL) goto out_climp; header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; @@ -1153,83 +1239,14 @@ int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); -static const struct attribute *obd_def_uuid_attrs[] = { - &lustre_attr_uuid.attr, - NULL, -}; - -static const struct attribute *obd_def_attrs[] = { - &lustre_attr_blocksize.attr, - &lustre_attr_kbytestotal.attr, - &lustre_attr_kbytesfree.attr, - &lustre_attr_kbytesavail.attr, - &lustre_attr_filestotal.attr, - &lustre_attr_filesfree.attr, - &lustre_attr_uuid.attr, - NULL, -}; - -static void obd_sysfs_release(struct kobject *kobj) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - - complete(&obd->obd_kobj_unregister); -} - -int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +int +lprocfs_obd_setup(struct obd_device *obd) { - struct ldebugfs_vars *debugfs_vars = NULL; - int rc; - - if (!obd || obd->obd_magic != OBD_DEVICE_MAGIC) - return -ENODEV; - - rc = kobject_set_name(&obd->obd_kset.kobj, "%s", obd->obd_name); - if (rc) - return rc; - - obd->obd_ktype.sysfs_ops = &lustre_sysfs_ops; - obd->obd_ktype.release = obd_sysfs_release; - - obd->obd_kset.kobj.parent = obd->obd_type->typ_kobj; - obd->obd_kset.kobj.ktype = &obd->obd_ktype; - init_completion(&obd->obd_kobj_unregister); - rc = kset_register(&obd->obd_kset); - if (rc) - return rc; - - if (uuid_only) - obd->obd_attrs = obd_def_uuid_attrs; - else - obd->obd_attrs = obd_def_attrs; - - rc = sysfs_create_files(&obd->obd_kset.kobj, obd->obd_attrs); - if (rc) { - kset_unregister(&obd->obd_kset); - return rc; - } - - if (!obd->obd_type->typ_procroot) - debugfs_vars = obd->obd_debugfs_vars; - obd->obd_debugfs_entry = ldebugfs_register(obd->obd_name, - obd->obd_type->typ_debugfs_entry, - debugfs_vars, obd); - if (IS_ERR_OR_NULL(obd->obd_debugfs_entry)) { - rc = obd->obd_debugfs_entry ? 
PTR_ERR(obd->obd_debugfs_entry) - : -ENOMEM; - CERROR("error %d setting up debugfs for %s\n", - rc, obd->obd_name); - obd->obd_debugfs_entry = NULL; - - sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); - obd->obd_attrs = NULL; - kset_unregister(&obd->obd_kset); - return rc; - } + int rc = 0; - if (obd->obd_proc_entry || !obd->obd_type->typ_procroot) - GOTO(already_registered, rc); + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); obd->obd_proc_entry = lprocfs_register(obd->obd_name, obd->obd_type->typ_procroot, @@ -1238,66 +1255,42 @@ int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) rc = PTR_ERR(obd->obd_proc_entry); CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); obd->obd_proc_entry = NULL; - - ldebugfs_remove(&obd->obd_debugfs_entry); - sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); - obd->obd_attrs = NULL; - kset_unregister(&obd->obd_kset); - return rc; } -already_registered: return rc; } EXPORT_SYMBOL(lprocfs_obd_setup); int lprocfs_obd_cleanup(struct obd_device *obd) { - if (!obd) - return -EINVAL; - - if (obd->obd_proc_exports_entry) { - /* Should be no exports left */ - lprocfs_remove(&obd->obd_proc_exports_entry); - obd->obd_proc_exports_entry = NULL; - } - - if (obd->obd_proc_entry) { - lprocfs_remove(&obd->obd_proc_entry); - obd->obd_proc_entry = NULL; - } - - if (!IS_ERR_OR_NULL(obd->obd_debugfs_entry)) - ldebugfs_remove(&obd->obd_debugfs_entry); - - /* obd device never allocated a kset */ - if (!obd->obd_kset.kobj.state_initialized) - return 0; - - if (obd->obd_attrs) { - sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); - obd->obd_attrs = NULL; - } - - kset_unregister(&obd->obd_kset); - wait_for_completion(&obd->obd_kobj_unregister); - return 0; + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; } EXPORT_SYMBOL(lprocfs_obd_cleanup); int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) { - struct lprocfs_counter *cntr; - unsigned int percpusize; - int rc = -ENOMEM; - unsigned long flags = 0; - int i; + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; LASSERT(stats->ls_percpu[cpuid] == NULL); LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); percpusize = lprocfs_stats_counter_size(stats); LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); - if (stats->ls_percpu[cpuid]) { + if (stats->ls_percpu[cpuid] != NULL) { rc = 0; if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) @@ -1324,16 +1317,16 @@ int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) { - struct lprocfs_stats *stats; - unsigned int num_entry; - unsigned int percpusize = 0; - int i; + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; - if (num == 0) - return NULL; + if (num == 0) + return NULL; - if (lprocfs_no_percpu_stats != 0) - flags |= LPROCFS_STATS_FLAG_NOPERCPU; + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; if (flags & LPROCFS_STATS_FLAG_NOPERCPU) num_entry = 1; @@ -1342,7 +1335,7 @@ struct 
lprocfs_stats *lprocfs_alloc_stats(unsigned int num, /* alloc percpu pointers for all possible cpu slots */ LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); - if (!stats) + if (stats == NULL) return NULL; stats->ls_num = num; @@ -1352,14 +1345,14 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, /* alloc num of counter headers */ LIBCFS_ALLOC(stats->ls_cnt_header, stats->ls_num * sizeof(struct lprocfs_counter_header)); - if (!stats->ls_cnt_header) + if (stats->ls_cnt_header == NULL) goto fail; if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { /* contains only one set counters */ percpusize = lprocfs_stats_counter_size(stats); LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); - if (!stats->ls_percpu[0]) + if (stats->ls_percpu[0] == NULL) goto fail; stats->ls_biggest_alloc_num = 1; } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { @@ -1384,9 +1377,9 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) unsigned int percpusize; unsigned int i; - if (!stats || stats->ls_num == 0) - return; - *statsh = NULL; + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) num_entry = 1; @@ -1395,9 +1388,9 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) percpusize = lprocfs_stats_counter_size(stats); for (i = 0; i < num_entry; i++) - if (stats->ls_percpu[i]) + if (stats->ls_percpu[i] != NULL) LIBCFS_FREE(stats->ls_percpu[i], percpusize); - if (stats->ls_cnt_header) + if (stats->ls_cnt_header != NULL) LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * sizeof(struct lprocfs_counter_header)); LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); @@ -1432,16 +1425,16 @@ EXPORT_SYMBOL(lprocfs_stats_collector); void lprocfs_clear_stats(struct lprocfs_stats *stats) { - struct lprocfs_counter *percpu_cntr; - int i; - int j; - unsigned int num_entry; - unsigned long flags = 0; + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_entry; i++) { - if (!stats->ls_percpu[i]) + if (stats->ls_percpu[i] == NULL) continue; for (j = 0; j < stats->ls_num; j++) { percpu_cntr = lprocfs_stats_counter_get(stats, i, j); @@ -1463,12 +1456,12 @@ static ssize_t lprocfs_stats_seq_write(struct file *file, const char __user *buf, size_t len, loff_t *off) { - struct seq_file *seq = file->private_data; - struct lprocfs_stats *stats = seq->private; + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; - lprocfs_clear_stats(stats); + lprocfs_clear_stats(stats); - return len; + return len; } static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) @@ -1492,10 +1485,10 @@ static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) /* seq file export of one lprocfs counter */ static int lprocfs_stats_seq_show(struct seq_file *p, void *v) { - struct lprocfs_stats *stats = p->private; - struct lprocfs_counter_header *hdr; - struct lprocfs_counter ctr; - int idx = *(loff_t *)v; + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; if (idx == 0) { struct timespec64 now; @@ -1544,20 +1537,10 @@ static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) if (rc) return rc; seq = file->private_data; - seq->private = inode->i_private ? inode->i_private : PDE_DATA(inode); + seq->private = inode->i_private ? 
: PDE_DATA(inode); return 0; } -const struct file_operations ldebugfs_stats_seq_fops = { - .owner = THIS_MODULE, - .open = lprocfs_stats_seq_open, - .read = seq_read, - .write = lprocfs_stats_seq_write, - .llseek = seq_lseek, - .release = lprocfs_seq_release, -}; -EXPORT_SYMBOL(ldebugfs_stats_seq_fops); - static const struct proc_ops lprocfs_stats_seq_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lprocfs_stats_seq_open, @@ -1567,6 +1550,15 @@ static const struct proc_ops lprocfs_stats_seq_fops = { .proc_release = lprocfs_seq_release, }; +static const struct file_operations ldebugfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + int ldebugfs_register_stats(struct dentry *parent, const char *name, struct lprocfs_stats *stats) { @@ -1591,7 +1583,7 @@ int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, entry = proc_create_data(name, 0644, root, &lprocfs_stats_seq_fops, stats); - if (!entry) + if (entry == NULL) return -ENOMEM; return 0; } @@ -1600,11 +1592,11 @@ EXPORT_SYMBOL(lprocfs_register_stats); void lprocfs_counter_init(struct lprocfs_stats *stats, int index, unsigned conf, const char *name, const char *units) { - struct lprocfs_counter_header *header; - struct lprocfs_counter *percpu_cntr; - unsigned long flags = 0; - unsigned int i; - unsigned int num_cpu; + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; LASSERT(stats != NULL); @@ -1618,7 +1610,7 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_cpu; ++i) { - if (!stats->ls_percpu[i]) + if (stats->ls_percpu[i] == NULL) continue; percpu_cntr = lprocfs_stats_counter_get(stats, i, index); percpu_cntr->lc_count = 0; @@ -1633,23 +1625,49 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, } EXPORT_SYMBOL(lprocfs_counter_init); -static const char * const mps_stats[] = { - [LPROC_MD_CLOSE] = "close", - [LPROC_MD_CREATE] = "create", - [LPROC_MD_ENQUEUE] = "enqueue", - [LPROC_MD_GETATTR] = "getattr", - [LPROC_MD_INTENT_LOCK] = "intent_lock", - [LPROC_MD_LINK] = "link", - [LPROC_MD_RENAME] = "rename", - [LPROC_MD_SETATTR] = "setattr", - [LPROC_MD_FSYNC] = "fsync", - [LPROC_MD_READ_PAGE] = "read_page", - [LPROC_MD_UNLINK] = "unlink", - [LPROC_MD_SETXATTR] = "setxattr", - [LPROC_MD_GETXATTR] = "getxattr", - [LPROC_MD_INTENT_GETATTR_ASYNC] = "intent_getattr_async", - [LPROC_MD_REVALIDATE_LOCK] = "revalidate_lock", -}; +/* Note that we only init md counters for ops whose offset is less + * than NUM_MD_STATS. This is explained in a comment in the definition + * of struct md_ops. 
*/ +#define LPROCFS_MD_OP_INIT(base, stats, op) \ + do { \ + unsigned int _idx = base + MD_COUNTER_OFFSET(op); \ + \ + if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) { \ + LASSERT(_idx < stats->ls_num); \ + lprocfs_counter_init(stats, _idx, 0, #op, "reqs"); \ + } \ + } while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_root); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, fsync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, read_page); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, merge_attr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_private_stats) @@ -1658,8 +1676,11 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_stats; int rc, i; - /* - * TODO Ensure that this function is only used where + CLASSERT(offsetof(struct md_ops, MD_STATS_FIRST_OP) == 0); + CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) == 0); + CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) > 0); + + /* TODO Ensure that this function is only used where * appropriate by adding an assertion to the effect that * obd->obd_type->typ_md_ops is not NULL. We can't do this now * because mdt_procfs_init() uses this function to allocate @@ -1669,17 +1690,20 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, */ LASSERT(obd->obd_proc_entry != NULL); LASSERT(obd->obd_md_stats == NULL); + LASSERT(obd->obd_md_cntr_base == 0); - num_stats = ARRAY_SIZE(mps_stats) + num_private_stats; + num_stats = NUM_MD_STATS + num_private_stats; stats = lprocfs_alloc_stats(num_stats, 0); - if (!stats) + if (stats == NULL) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { - lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); - if (!stats->ls_cnt_header[i].lc_name) { - CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", - i); + lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op " + "operation at offset %d. 
Aborting.\n", + i - num_private_stats); LBUG(); } } @@ -1689,6 +1713,7 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, lprocfs_free_stats(&stats); } else { obd->obd_md_stats = stats; + obd->obd_md_cntr_base = num_private_stats; } return rc; @@ -1699,8 +1724,9 @@ void lprocfs_free_md_stats(struct obd_device *obd) { struct lprocfs_stats *stats = obd->obd_md_stats; - if (stats) { + if (stats != NULL) { obd->obd_md_stats = NULL; + obd->obd_md_cntr_base = 0; lprocfs_free_stats(&stats); } } @@ -1708,24 +1734,24 @@ EXPORT_SYMBOL(lprocfs_free_md_stats); void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { - lprocfs_counter_init(ldlm_stats, - LDLM_ENQUEUE - LDLM_FIRST_OPC, - 0, "ldlm_enqueue", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC, - 0, "ldlm_convert", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC, - 0, "ldlm_cancel", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_BL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_bl_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CP_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_cp_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_GL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_gl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); } EXPORT_SYMBOL(lprocfs_init_ldlm_stats); @@ -1736,7 +1762,7 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, { __s64 ret = 0; - if (!lc || !header) + if (lc == NULL || header == NULL) RETURN(0); switch (field) { @@ -1770,6 +1796,86 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, } EXPORT_SYMBOL(lprocfs_read_helper); +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1 ) { + /* only reserved 2 bits fraction */ + buffer[prtn++] ='0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. 
+ */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while(buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] ='\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) +{ + long decimal_val, frac_val; + + decimal_val = val / mult; + seq_printf(m, "%ld", decimal_val); + frac_val = val % mult; + + if (frac_val > 0) { + frac_val *= 100; + frac_val /= mult; + } + if (frac_val > 0) { + /* Three cases: x0, xx, 0x */ + if ((frac_val % 10) != 0) + seq_printf(m, ".%ld", frac_val); + else + seq_printf(m, ".%ld", frac_val / 10); + } + + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); + /* Obtains the conversion factor for the unit specified */ static int get_mult(char unit, __u64 *mult) { @@ -1780,19 +1886,19 @@ static int get_mult(char unit, __u64 *mult) case 'p': case 'P': units <<= 10; - /* fallthrough */ + /* Fall through */ case 't': case 'T': units <<= 10; - /* fallthrough */ + /* Fall through */ case 'g': case 'G': units <<= 10; - /* fallthrough */ + /* Fall through */ case 'm': case 'M': units <<= 10; - /* fallthrough */ + /* Fall through */ case 'k': case 'K': units <<= 10; @@ -1937,7 +2043,7 @@ static int str_to_u64_parse(char *buffer, unsigned long count, } /* the multiplier limits how large the value can be */ - wrap_indicator = div64_u64(wrap_indicator, mult); + wrap_indicator /= mult; if (strwhole) { rc = kstrtoull(strwhole, base, &whole); @@ -1988,7 +2094,8 @@ static int str_to_u64_parse(char *buffer, unsigned long count, * of the signed integer. */ static int str_to_s64_internal(const char __user *buffer, unsigned long count, - __s64 *val, __u64 def_mult, bool allow_units) + __s64 *val, __u64 def_mult, bool allow_units, + bool kernel_space) { char kernbuf[22]; __u64 tmp; @@ -2000,8 +2107,12 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; + if (kernel_space) { + memcpy(kernbuf, buffer, count); + } else { + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + } kernbuf[count] = '\0'; @@ -2027,6 +2138,29 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, return 0; } +/** + * Convert a user string into a signed 64 bit number. This function produces + * an error when the value parsed from the string underflows or + * overflows. This function accepts strings which contain digits and + * optionally a decimal or hex strings which are prefixed with "0x". + * + * \param[in] buffer string consisting of numbers and optionally a decimal + * \param[in] count buffer length + * \param[in] val if successful, the value represented by the string + * + * \retval 0 on success + * \retval negative number on error + */ +int lprocfs_str_to_s64(struct file *file, const char __user *buffer, + unsigned long count, __s64 *val) +{ + bool kernel_space; + + kernel_space = lprocfs_file_is_kernel(file); + return str_to_s64_internal(buffer, count, val, 1, false, kernel_space); +} +EXPORT_SYMBOL(lprocfs_str_to_s64); + /** * Convert a user string into a signed 64 bit number. 
This function produces * an error when the value parsed from the string times multiplier underflows or @@ -2044,11 +2178,12 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, * \retval 0 on success * \retval negative number on error */ -int lprocfs_str_with_units_to_s64(const char __user *buffer, +int lprocfs_str_with_units_to_s64(struct file *file, const char __user *buffer, unsigned long count, __s64 *val, char defunit) { __u64 mult = 1; int rc; + bool kernel_space; if (defunit != '1') { rc = get_mult(defunit, &mult); @@ -2056,7 +2191,10 @@ int lprocfs_str_with_units_to_s64(const char __user *buffer, return rc; } - return str_to_s64_internal(buffer, count, val, mult, true); + kernel_space = lprocfs_file_is_kernel(file); + + return str_to_s64_internal(buffer, count, val, mult, true, + kernel_space); } EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); @@ -2090,7 +2228,7 @@ char *lprocfs_find_named_value(const char *buffer, const char *name, /* there is no strnstr() in rhel5 and ubuntu kernels */ val = lprocfs_strnstr(buffer, name, buflen); - if (!val) + if (val == NULL) return (char *)buffer; val += strlen(name); /* skip prefix */ @@ -2137,7 +2275,7 @@ int lprocfs_seq_create(struct proc_dir_entry *parent, entry = proc_create_data(name, mode, parent, seq_fops, data); - if (!entry) + if (entry == NULL) RETURN(-ENOMEM); RETURN(0); @@ -2179,12 +2317,12 @@ EXPORT_SYMBOL(lprocfs_oh_tally_log2); unsigned long lprocfs_oh_sum(struct obd_histogram *oh) { - unsigned long ret = 0; - int i; + unsigned long ret = 0; + int i; - for (i = 0; i < OBD_HIST_MAX; i++) - ret += oh->oh_buckets[i]; - return ret; + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; } EXPORT_SYMBOL(lprocfs_oh_sum); @@ -2241,9 +2379,9 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, struct client_obd *cli = &dev->u.cli; struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; int chunk_mask, rc; - s64 val; + __s64 val; - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -2273,59 +2411,9 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, } EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); -ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct client_obd *cli = &dev->u.cli; - int rc; - - spin_lock(&cli->cl_loi_list_lock); - rc = sprintf(buf, "%d\n", cli->cl_max_short_io_bytes); - spin_unlock(&cli->cl_loi_list_lock); - return rc; -} -EXPORT_SYMBOL(short_io_bytes_show); - -/* Used to catch people who think they're specifying pages. 
*/ -#define MIN_SHORT_IO_BYTES 64U - -ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct client_obd *cli = &dev->u.cli; - u32 val; - int rc; - - LPROCFS_CLIMP_CHECK(dev); - - rc = kstrtouint(buffer, 0, &val); - if (rc) - GOTO(out, rc); - - if (val && (val < MIN_SHORT_IO_BYTES || val > OBD_MAX_SHORT_IO_BYTES)) - GOTO(out, rc = -ERANGE); - - rc = count; - - spin_lock(&cli->cl_loi_list_lock); - if (val > (cli->cl_max_pages_per_rpc << PAGE_SHIFT)) - rc = -ERANGE; - else - cli->cl_max_short_io_bytes = val; - spin_unlock(&cli->cl_loi_list_lock); - -out: - LPROCFS_CLIMP_EXIT(dev); - return rc; -} -EXPORT_SYMBOL(short_io_bytes_store); - -int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, - struct root_squash_info *squash, char *name) +int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, + char *name) { int rc; char kernbuf[64], *tmp, *errmsg; @@ -2336,7 +2424,7 @@ int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, errmsg = "string too long"; GOTO(failed_noprint, rc = -EINVAL); } - if (copy_from_user(kernbuf, buffer, count)) { + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed_noprint, rc = -EFAULT); } @@ -2344,7 +2432,7 @@ int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, /* look for uid gid separator */ tmp = strchr(kernbuf, ':'); - if (!tmp) { + if (tmp == NULL) { errmsg = "needs uid:gid format"; GOTO(failed, rc = -EINVAL); } @@ -2371,7 +2459,7 @@ int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, RETURN(count); failed: - if (tmp) { + if (tmp != NULL) { tmp--; *tmp = ':'; } @@ -2386,7 +2474,8 @@ int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, EXPORT_SYMBOL(lprocfs_wr_root_squash); -int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, +int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, + unsigned long count, struct root_squash_info *squash, char *name) { int rc; @@ -2402,11 +2491,11 @@ int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, } OBD_ALLOC(kernbuf, count + 1); - if (!kernbuf) { + if (kernbuf == NULL) { errmsg = "no memory"; GOTO(failed, rc = -ENOMEM); } - if (copy_from_user(kernbuf, buffer, count)) { + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed, rc = -EFAULT); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c index 4df66a941e535..6d78831dd37fe 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,57 +34,12 @@ #define DEBUG_SUBSYSTEM S_CLASS -#include -#include #include #include +#include #include -#define MAX_STRING_SIZE 128 - -struct dentry *ldebugfs_add_symlink(const char *name, const char *target, - const char *format, ...) 
-{ - struct dentry *entry = NULL; - struct dentry *parent; - struct qstr dname; - va_list ap; - char *dest; - - if (!target || !format) - return NULL; - - dname.name = target; - dname.len = strlen(dname.name); - dname.hash = ll_full_name_hash(debugfs_lustre_root, - dname.name, dname.len); - parent = d_lookup(debugfs_lustre_root, &dname); - if (!parent) - return NULL; - - OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); - if (!dest) - goto no_entry; - - va_start(ap, format); - vsnprintf(dest, MAX_STRING_SIZE, format, ap); - va_end(ap); - - entry = debugfs_create_symlink(name, parent, dest); - if (IS_ERR_OR_NULL(entry)) { - CERROR("LdebugFS: Could not create symbolic link from %s to %s\n", - name, dest); - entry = NULL; - } - - OBD_FREE(dest, MAX_STRING_SIZE + 1); -no_entry: - dput(parent); - return entry; -} -EXPORT_SYMBOL(ldebugfs_add_symlink); - #ifdef CONFIG_PROC_FS int lprocfs_evict_client_open(struct inode *inode, struct file *f) @@ -124,7 +79,7 @@ lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, * bytes into kbuf, to ensure that the string is NUL-terminated. * UUID_MAX should include a trailing NUL already. */ - if (copy_from_user(kbuf, buffer, + if (lprocfs_copy_from_user(file, kbuf, buffer, min_t(unsigned long, BUFLEN - 1, count))) { count = -EFAULT; goto out; @@ -149,108 +104,15 @@ EXPORT_SYMBOL(lprocfs_evict_client_seq_write); #undef BUFLEN -ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = data; - return scnprintf(buf, PAGE_SIZE, "%u\n", obd->obd_num_exports); -} -EXPORT_SYMBOL(num_exports_show); - -static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) -{ - bool first = true; - - flag2str(exp, failed); - flag2str(exp, in_recovery); - flag2str(exp, disconnected); - flag2str(exp, connecting); - - return 0; -} - -static int -lprocfs_exp_print_export_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *cb_data) -{ - struct seq_file *m = cb_data; - struct obd_export *exp = cfs_hash_object(hs, hnode); - struct obd_device *obd; - struct obd_connect_data *ocd; - - LASSERT(exp != NULL); - if (exp->exp_nid_stats == NULL) - goto out; - obd = exp->exp_obd; - ocd = &exp->exp_connect_data; - - seq_printf(m, "%s:\n" - " name: %s\n" - " client: %s\n" - " connect_flags: [ ", - obd_uuid2str(&exp->exp_client_uuid), - obd->obd_name, - obd_export_nid2str(exp)); - obd_connect_seq_flags2str(m, ocd->ocd_connect_flags, - ocd->ocd_connect_flags2, ", "); - seq_printf(m, " ]\n"); - obd_connect_data_seqprint(m, ocd); - seq_printf(m, " export_flags: [ "); - obd_export_flags2str(exp, m); - seq_printf(m, " ]\n"); - - if (obd->obd_type && - strcmp(obd->obd_type->typ_name, "obdfilter") == 0) { - struct filter_export_data *fed = &exp->exp_filter_data; - - seq_printf(m, " grant:\n"); - seq_printf(m, " granted: %ld\n", - fed->fed_ted.ted_grant); - seq_printf(m, " dirty: %ld\n", - fed->fed_ted.ted_dirty); - seq_printf(m, " pending: %ld\n", - fed->fed_ted.ted_pending); - } - -out: - return 0; -} - -/** - * RPC connections are composed of an import and an export. Using the - * lctl utility we can extract important information about the state. - * The lprocfs_exp_export_seq_show routine displays the state information - * for the export. 
- * - * \param[in] m seq file - * \param[in] data unused - * - * \retval 0 on success - * - * The format of the export state information is like: - * a793e354-49c0-aa11-8c4f-a4f2b1a1a92b: - * name: MGS - * client: 10.211.55.10@tcp - * connect_flags: [ version, barrier, adaptive_timeouts, ... ] - * connect_data: - * flags: 0x2000011005002020 - * instance: 0 - * target_version: 2.10.51.0 - * export_flags: [ ... ] - * - */ -static int lprocfs_exp_export_seq_show(struct seq_file *m, void *data) -{ - struct nid_stat *stats = m->private; - struct obd_device *obd = stats->nid_obd; - - cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, - lprocfs_exp_print_export_seq, m); + LASSERT(obd != NULL); + seq_printf(m, "%u\n", obd->obd_num_exports); return 0; } -LPROC_SEQ_FOPS_RO(lprocfs_exp_export); +EXPORT_SYMBOL(lprocfs_num_exports_seq_show); static void lprocfs_free_client_stats(struct nid_stat *client_stat) { @@ -397,30 +259,6 @@ int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) } LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); -int lprocfs_exp_print_fmd_count_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *cb_data) - -{ - struct obd_export *exp = cfs_hash_object(hs, hnode); - struct seq_file *m = cb_data; - struct tg_export_data *ted = &exp->exp_target_data; - - seq_printf(m, "%d\n", ted->ted_fmd_count); - - return 0; -} - -int lprocfs_exp_fmd_count_seq_show(struct seq_file *m, void *data) -{ - struct nid_stat *stats = m->private; - struct obd_device *obd = stats->nid_obd; - - cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, - lprocfs_exp_print_fmd_count_seq, m); - return 0; -} -LPROC_SEQ_FOPS_RO(lprocfs_exp_fmd_count); - int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) { seq_puts(m, "Write into this file to clear all nid stats and stale nid entries\n"); @@ -546,8 +384,7 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_nodemap_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("%s: error adding the nodemap file: rc = %d\n", - obd->obd_name, rc); + CWARN("Error adding the nodemap file: rc = %d\n", rc); GOTO(destroy_new_ns, rc); } @@ -555,8 +392,7 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_uuid_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("%s: error adding the NID stats file: rc = %d\n", - obd->obd_name, rc); + CWARN("Error adding the NID stats file: rc = %d\n", rc); GOTO(destroy_new_ns, rc); } @@ -564,17 +400,7 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_hash_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("%s: error adding the hash file: rc = %d\n", - obd->obd_name, rc); - GOTO(destroy_new_ns, rc); - } - - entry = lprocfs_add_simple(new_stat->nid_proc, "export", - new_stat, &lprocfs_exp_export_fops); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CWARN("%s: error adding the export file: rc = %d\n", - obd->obd_name, rc); + CWARN("Error adding the hash file: rc = %d\n", rc); GOTO(destroy_new_ns, rc); } @@ -582,16 +408,7 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_replydata_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("%s: error adding the reply_data file: rc = %d\n", - obd->obd_name, rc); - GOTO(destroy_new_ns, rc); - } - - entry = lprocfs_add_simple(new_stat->nid_proc, "fmd_count", new_stat, - &lprocfs_exp_fmd_count_fops); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CWARN("%s: error adding the fmd_count file: rc = %d\n", + CWARN("%s: Error adding the 
reply_data file: rc = %d\n", obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -632,24 +449,92 @@ int lprocfs_exp_cleanup(struct obd_export *exp) return 0; } -int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned int num_stats) +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); + + CLASSERT(NUM_OBD_STATS == OBD_COUNTER_OFFSET(putref) + 1); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) { struct lprocfs_stats *stats; - int rc; + unsigned int num_stats; + int rc, i; LASSERT(obd->obd_stats == NULL); LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + num_stats = NUM_OBD_STATS + num_private_stats; stats = lprocfs_alloc_stats(num_stats, 0); if (stats == NULL) return -ENOMEM; + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * , and that the corresponding line item + * 
LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. */ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op " + "operation at offset %d.\n", i - num_private_stats); + } rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); - if (rc < 0) + if (rc < 0) { lprocfs_free_stats(&stats); - else - obd->obd_stats = stats; - + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } return rc; } EXPORT_SYMBOL(lprocfs_alloc_obd_stats); @@ -684,7 +569,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) LASSERT(obd != NULL); seq_printf(m, "status: "); - if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { + if (obd->obd_max_recoverable_clients == 0) { seq_printf(m, "INACTIVE\n"); goto out; } @@ -700,9 +585,9 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds() - obd->obd_recovery_start); /* Number of clients that have completed recovery */ seq_printf(m, "completed_clients: %d/%d\n", - atomic_read(&obd->obd_max_recoverable_clients) - + obd->obd_max_recoverable_clients - obd->obd_stale_clients, - atomic_read(&obd->obd_max_recoverable_clients)); + obd->obd_max_recoverable_clients); seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); seq_printf(m, "last_transno: %lld\n", @@ -758,7 +643,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds())); seq_printf(m, "connected_clients: %d/%d\n", atomic_read(&obd->obd_connected_clients), - atomic_read(&obd->obd_max_recoverable_clients)); + obd->obd_max_recoverable_clients); /* Number of clients that have completed recovery */ seq_printf(m, "req_replay_clients: %d\n", atomic_read(&obd->obd_req_replay_clients)); @@ -778,25 +663,27 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); -ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; - return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_ir_factor); + LASSERT(obd != NULL); + seq_printf(m, "%d\n", obd->obd_recovery_ir_factor); + return 0; } -EXPORT_SYMBOL(ir_factor_show); +EXPORT_SYMBOL(lprocfs_ir_factor_seq_show); -ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +ssize_t +lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - int val; + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; int rc; + __s64 val; - rc = kstrtoint(buffer, 10, &val); + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; @@ -806,7 +693,7 @@ ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, obd->obd_recovery_ir_factor = val; return count; } -EXPORT_SYMBOL(ir_factor_store); +EXPORT_SYMBOL(lprocfs_ir_factor_seq_write); int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) { @@ -824,85 +711,93 @@ lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, { struct seq_file *m = file->private_data; struct obd_device *obd = m->private; - bool val; int rc; + __s64 val; LASSERT(obd != NULL); - rc = kstrtobool_from_user(buffer, 
count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - obd->obd_checksum_dump = val; + obd->obd_checksum_dump = !!val; return count; } EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); -ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; - return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_timeout); + LASSERT(obd != NULL); + seq_printf(m, "%llu\n", obd->obd_recovery_timeout); + return 0; } -EXPORT_SYMBOL(recovery_time_soft_show); +EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_show); -ssize_t recovery_time_soft_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) +ssize_t +lprocfs_recovery_time_soft_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - unsigned int val; + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; int rc; + __s64 val; - rc = kstrtouint(buffer, 0, &val); + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; obd->obd_recovery_timeout = val; return count; } -EXPORT_SYMBOL(recovery_time_soft_store); +EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_write); -ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; - return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_time_hard); + LASSERT(obd != NULL); + seq_printf(m, "%lld\n", obd->obd_recovery_time_hard); + return 0; } -EXPORT_SYMBOL(recovery_time_hard_show); +EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_show); -ssize_t recovery_time_hard_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) +ssize_t +lprocfs_recovery_time_hard_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - unsigned int val; + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; int rc; + __s64 val; - rc = kstrtouint(buffer, 0, &val); + LASSERT(obd != NULL); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; obd->obd_recovery_time_hard = val; return count; } -EXPORT_SYMBOL(recovery_time_hard_store); +EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_write); -ssize_t instance_show(struct kobject *kobj, struct attribute *attr, - char *buf) +int lprocfs_target_instance_seq_show(struct seq_file *m, void *data) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; struct obd_device_target *target = &obd->u.obt; + LASSERT(obd != NULL); LASSERT(target->obt_magic == OBT_MAGIC); - return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.obt.obt_instance); + seq_printf(m, "%u\n", obd->u.obt.obt_instance); + return 0; } -EXPORT_SYMBOL(instance_show); +EXPORT_SYMBOL(lprocfs_target_instance_seq_show); #endif /* CONFIG_PROC_FS*/ diff --git 
a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c index 42e880e8a3948..21a137bad0bae 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,8 +44,6 @@ #include #include #include /* hash_long() */ -#include -#include #include #include #include @@ -53,28 +51,6 @@ #include #include -struct lu_site_bkt_data { - /** - * LRU list, updated on each access to object. Protected by - * bucket lock of lu_site::ls_obj_hash. - * - * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are - * moved to the lu_site::ls_lru.prev (this is due to the non-existence - * of list_for_each_entry_safe_reverse()). - */ - struct list_head lsb_lru; - /** - * Wait-queue signaled when an object in this site is ultimately - * destroyed (lu_object_free()) or initialized (lu_object_start()). - * It is used by lu_object_find() to wait before re-trying when - * object in the process of destruction is found in the hash table; - * or wait object to be initialized by the allocator. - * - * \see htable_lookup(). - */ - wait_queue_head_t lsb_waitq; -}; - enum { LU_CACHE_PERCENT_MAX = 50, LU_CACHE_PERCENT_DEFAULT = 20 @@ -109,18 +85,6 @@ MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); static void lu_object_free(const struct lu_env *env, struct lu_object *o); static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); -wait_queue_head_t * -lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) -{ - struct cfs_hash_bd bd; - struct lu_site_bkt_data *bkt; - - cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); - bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); - return &bkt->lsb_waitq; -} -EXPORT_SYMBOL(lu_site_wq_from_fid); - /** * Decrease reference counter on object. If last reference is freed, return * object to the cache, unless lu_object_is_dying(o) holds. In the latter @@ -129,18 +93,22 @@ EXPORT_SYMBOL(lu_site_wq_from_fid); void lu_object_put(const struct lu_env *env, struct lu_object *o) { struct lu_site_bkt_data *bkt; - struct lu_object_header *top = o->lo_header; - struct lu_site *site = o->lo_dev->ld_site; - struct lu_object *orig = o; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; struct cfs_hash_bd bd; - const struct lu_fid *fid = lu_object_fid(o); - bool is_dying; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; /* * till we have full fids-on-OST implemented anonymous objects * are possible in OSP. such an object isn't listed in the site * so we should not remove it from the site. */ + fid = lu_object_fid(o); if (fid_is_zero(fid)) { LASSERT(top->loh_hash.next == NULL && top->loh_hash.pprev == NULL); @@ -158,19 +126,13 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o) cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); - is_dying = lu_object_is_dying(top); if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { - /* at this point the object reference is dropped and lock is - * not taken, so lu_object should not be touched because it - * can be freed by concurrent thread. 
Use local variable for - * check. - */ - if (is_dying) { + if (lu_object_is_dying(top)) { /* * somebody may be waiting for this, currently only * used for cl_object, see cl_object_put_last(). */ - wake_up_all(&bkt->lsb_waitq); + wake_up_all(&bkt->lsb_marche_funebre); } return; } @@ -184,17 +146,15 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o) o->lo_ops->loo_object_release(env, o); } - /* don't use local 'is_dying' here because if was taken without lock - * but here we need the latest actual value of it so check lu_object - * directly here. - */ if (!lu_object_is_dying(top) && (lu_object_exists(orig) || lu_object_is_cl(orig))) { LASSERT(list_empty(&top->loh_lru)); list_add_tail(&top->loh_lru, &bkt->lsb_lru); + bkt->lsb_lru_len++; percpu_counter_inc(&site->ls_lru_len_counter); - CDEBUG(D_INODE, "Add %p/%p to site lru. hash: %p, bkt: %p\n", - orig, top, site->ls_obj_hash, bkt); + CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, " + "lru_len: %ld\n", + o, site->ls_obj_hash, bkt, bkt->lsb_lru_len); cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); return; } @@ -253,6 +213,7 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o) list_del_init(&top->loh_lru); bkt = cfs_hash_bd_extra_get(obj_hash, &bd); + bkt->lsb_lru_len--; percpu_counter_dec(&site->ls_lru_len_counter); } cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); @@ -269,9 +230,17 @@ EXPORT_SYMBOL(lu_object_unhash); */ static struct lu_object *lu_object_alloc(const struct lu_env *env, struct lu_device *dev, - const struct lu_fid *f) + const struct lu_fid *f, + const struct lu_object_conf *conf) { + struct lu_object *scan; struct lu_object *top; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + ENTRY; /* * Create top-level object slice. This will also create @@ -279,36 +248,15 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, */ top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); if (top == NULL) - return ERR_PTR(-ENOMEM); + RETURN(ERR_PTR(-ENOMEM)); if (IS_ERR(top)) - return top; - /* - * This is the only place where object fid is assigned. It's constant - * after this point. - */ - top->lo_header->loh_fid = *f; - - return top; -} - -/** - * Initialize object. - * - * This is called after object hash insertion to avoid returning an object with - * stale attributes. - */ -static int lu_object_start(const struct lu_env *env, struct lu_device *dev, - struct lu_object *top, - const struct lu_object_conf *conf) -{ - struct lu_object *scan; - struct list_head *layers; - unsigned int init_mask = 0; - unsigned int init_flag; - int clean; - int result; - - layers = &top->lo_header->loh_layers; + RETURN(top); + /* + * This is the only place where object fid is assigned. It's constant + * after this point. 
+ */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; do { /* @@ -323,9 +271,10 @@ static int lu_object_start(const struct lu_env *env, struct lu_device *dev, clean = 0; scan->lo_header = top->lo_header; result = scan->lo_ops->loo_object_init(env, scan, conf); - if (result) - return result; - + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } init_mask |= init_flag; next: init_flag <<= 1; @@ -333,18 +282,17 @@ static int lu_object_start(const struct lu_env *env, struct lu_device *dev, } while (!clean); list_for_each_entry_reverse(scan, layers, lo_linkage) { - if (scan->lo_ops->loo_object_start != NULL) { - result = scan->lo_ops->loo_object_start(env, scan); - if (result) - return result; - } - } - - lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); - - set_bit(LU_OBJECT_INITED, &top->lo_header->loh_flags); + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + } + } - return 0; + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + RETURN(top); } /** @@ -352,15 +300,15 @@ static int lu_object_start(const struct lu_env *env, struct lu_device *dev, */ static void lu_object_free(const struct lu_env *env, struct lu_object *o) { - wait_queue_head_t *wq; + struct lu_site_bkt_data *bkt; struct lu_site *site; struct lu_object *scan; struct list_head *layers; struct list_head splice; - site = o->lo_dev->ld_site; - layers = &o->lo_header->loh_layers; - wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); /* * First call ->loo_object_delete() method to release all resources. */ @@ -389,8 +337,8 @@ static void lu_object_free(const struct lu_env *env, struct lu_object *o) o->lo_ops->loo_object_free(env, o); } - if (waitqueue_active(wq)) - wake_up_all(wq); + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); } /** @@ -451,6 +399,7 @@ int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, cfs_hash_bd_del_locked(s->ls_obj_hash, &bd2, &h->loh_hash); list_move(&h->loh_lru, &dispose); + bkt->lsb_lru_len--; percpu_counter_dec(&s->ls_lru_len_counter); if (did_sth == 0) did_sth = 1; @@ -642,6 +591,7 @@ static struct lu_object *htable_lookup(struct lu_site *s, const struct lu_fid *f, __u64 *version) { + struct lu_site_bkt_data *bkt; struct lu_object_header *h; struct hlist_node *hnode; __u64 ver = cfs_hash_bd_version_get(bd); @@ -650,6 +600,7 @@ static struct lu_object *htable_lookup(struct lu_site *s, return ERR_PTR(-ENOENT); *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); /* cfs_hash_bd_peek_locked is a somehow "internal" function * of cfs_hash, it doesn't add refcount on object. 
*/ hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); @@ -663,6 +614,7 @@ static struct lu_object *htable_lookup(struct lu_site *s, lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); if (!list_empty(&h->loh_lru)) { list_del_init(&h->loh_lru); + bkt->lsb_lru_len--; percpu_counter_dec(&s->ls_lru_len_counter); } return lu_object_top(h); @@ -705,6 +657,29 @@ static void lu_object_limit(const struct lu_env *env, MIN(size - nr, LU_CACHE_NR_MAX_ADJUST), 0); } +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + struct cfs_hash *hs; + struct cfs_hash_bd bd; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + + return o; +} + /** * Core logic of lu_object_find*() functions. * @@ -722,19 +697,7 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, struct lu_site *s; struct cfs_hash *hs; struct cfs_hash_bd bd; - struct lu_site_bkt_data *bkt; - struct l_wait_info lwi = { 0 }; __u64 version = 0; - int rc; - - ENTRY; - - /* FID is from disk or network, zero FID is meaningless, return error - * early to avoid assertion in lu_object_put. If a zero FID is wanted, - * it should be allocated via lu_object_anon(). - */ - if (fid_is_zero(f)) - RETURN(ERR_PTR(-EINVAL)); /* * This uses standard index maintenance protocol: @@ -753,99 +716,46 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, * It is unnecessary to perform lookup-alloc-lookup-insert, instead, * just alloc and insert directly. * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). */ + if (conf && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + s = dev->ld_site; hs = s->ls_obj_hash; - - if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_OBD_ZERO_NLINK_RACE))) - lu_site_purge(env, s, -1); - - cfs_hash_bd_get(hs, f, &bd); - bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); - if (!(conf && conf->loc_flags & LOC_F_NEW)) { - cfs_hash_bd_lock(hs, &bd, 1); - o = htable_lookup(s, &bd, f, &version); - cfs_hash_bd_unlock(hs, &bd, 1); - - if (!IS_ERR(o)) { - if (likely(lu_object_is_inited(o->lo_header))) - RETURN(o); - - l_wait_event(bkt->lsb_waitq, - lu_object_is_inited(o->lo_header) || - lu_object_is_dying(o->lo_header), &lwi); - - if (lu_object_is_dying(o->lo_header)) { - lu_object_put(env, o); - - RETURN(ERR_PTR(-ENOENT)); - } - - RETURN(o); - } - - if (PTR_ERR(o) != -ENOENT) - RETURN(o); - } + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT) + return o; /* - * Allocate new object, NB, object is unitialized in case object - * is changed between allocation and hash insertion, thus the object - * with stale attributes is returned. + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. 
*/ - o = lu_object_alloc(env, dev, f); - if (IS_ERR(o)) - RETURN(o); + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; LASSERT(lu_fid_eq(lu_object_fid(o), f)); - CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE); - cfs_hash_bd_lock(hs, &bd, 1); - if (conf && conf->loc_flags & LOC_F_NEW) - shadow = ERR_PTR(-ENOENT); - else - shadow = htable_lookup(s, &bd, f, &version); - if (likely(PTR_ERR(shadow) == -ENOENT)) { + shadow = htable_lookup(s, &bd, f, &version); + if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) { cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); cfs_hash_bd_unlock(hs, &bd, 1); - /* - * This may result in rather complicated operations, including - * fld queries, inode loading, etc. - */ - rc = lu_object_start(env, dev, o, conf); - if (rc) { - lu_object_put_nocache(env, o); - RETURN(ERR_PTR(rc)); - } - - wake_up_all(&bkt->lsb_waitq); - lu_object_limit(env, dev); - RETURN(o); + return o; } lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); cfs_hash_bd_unlock(hs, &bd, 1); lu_object_free(env, o); - - if (!(conf && conf->loc_flags & LOC_F_NEW) && - !lu_object_is_inited(shadow->lo_header)) { - l_wait_event(bkt->lsb_waitq, - lu_object_is_inited(shadow->lo_header) || - lu_object_is_dying(shadow->lo_header), &lwi); - - if (lu_object_is_dying(shadow->lo_header)) { - lu_object_put(env, shadow); - - RETURN(ERR_PTR(-ENOENT)); - } - } - - RETURN(shadow); + return shadow; } EXPORT_SYMBOL(lu_object_find_at); @@ -1132,7 +1042,7 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); INIT_LIST_HEAD(&bkt->lsb_lru); - init_waitqueue_head(&bkt->lsb_waitq); + init_waitqueue_head(&bkt->lsb_marche_funebre); } s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); @@ -1476,8 +1386,7 @@ static void key_fini(struct lu_context *ctx, int index) key->lct_fini(ctx, key, ctx->lc_value[index]); lu_ref_del(&key->lct_reference, "ctx", ctx); - if (atomic_dec_and_test(&key->lct_used)) - wake_up_var(&key->lct_used); + atomic_dec(&key->lct_used); LASSERT(key->lct_owner != NULL); if ((ctx->lc_tags & LCT_NOREF) == 0) { @@ -1498,23 +1407,29 @@ void lu_context_key_degister(struct lu_context_key *key) lu_context_key_quiesce(key); + write_lock(&lu_keys_guard); + ++key_set_version; key_fini(&lu_shrink_env.le_ctx, key->lct_index); /** * Wait until all transient contexts referencing this key have * run lu_context_key::lct_fini() method. */ - atomic_dec(&key->lct_used); - wait_var_event(&key->lct_used, atomic_read(&key->lct_used) == 0); - - write_lock(&lu_keys_guard); + while (atomic_read(&key->lct_used) > 1) { + write_unlock(&lu_keys_guard); + CDEBUG(D_INFO, "lu_context_key_degister: \"%s\" %p, %d\n", + key->lct_owner ? 
key->lct_owner->name : "", key, + atomic_read(&key->lct_used)); + schedule(); + write_lock(&lu_keys_guard); + } if (lu_keys[key->lct_index]) { lu_keys[key->lct_index] = NULL; lu_ref_fini(&key->lct_reference); } write_unlock(&lu_keys_guard); - LASSERTF(atomic_read(&key->lct_used) == 0, + LASSERTF(atomic_read(&key->lct_used) == 1, "key has instances: %d\n", atomic_read(&key->lct_used)); } @@ -1978,119 +1893,6 @@ int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, } EXPORT_SYMBOL(lu_env_refill_by_tags); -#ifdef HAVE_SERVER_SUPPORT -struct lu_env_item { - struct task_struct *lei_task; /* rhashtable key */ - struct rhash_head lei_linkage; - struct lu_env *lei_env; - struct rcu_head lei_rcu_head; -}; - -static const struct rhashtable_params lu_env_rhash_params = { - .key_len = sizeof(struct task_struct *), - .key_offset = offsetof(struct lu_env_item, lei_task), - .head_offset = offsetof(struct lu_env_item, lei_linkage), -}; - -struct rhashtable lu_env_rhash; - -struct lu_env_percpu { - struct task_struct *lep_task; - struct lu_env *lep_env ____cacheline_aligned_in_smp; -}; - -static struct lu_env_percpu lu_env_percpu[NR_CPUS]; - -int lu_env_add(struct lu_env *env) -{ - struct lu_env_item *lei, *old; - - LASSERT(env); - - OBD_ALLOC_PTR(lei); - if (!lei) - return -ENOMEM; - - lei->lei_task = current; - lei->lei_env = env; - - old = rhashtable_lookup_get_insert_fast(&lu_env_rhash, - &lei->lei_linkage, - lu_env_rhash_params); - LASSERT(!old); - - return 0; -} -EXPORT_SYMBOL(lu_env_add); - -static void lu_env_item_free(struct rcu_head *head) -{ - struct lu_env_item *lei; - - lei = container_of(head, struct lu_env_item, lei_rcu_head); - OBD_FREE_PTR(lei); -} - -void lu_env_remove(struct lu_env *env) -{ - struct lu_env_item *lei; - const void *task = current; - int i; - - for_each_possible_cpu(i) { - if (lu_env_percpu[i].lep_env == env) { - LASSERT(lu_env_percpu[i].lep_task == task); - lu_env_percpu[i].lep_task = NULL; - lu_env_percpu[i].lep_env = NULL; - } - } - - /* The rcu_lock is not taking in this case since the key - * used is the actual task_struct. This implies that each - * object is only removed by the owning thread, so there - * can never be a race on a particular object. 
- */ - lei = rhashtable_lookup_fast(&lu_env_rhash, &task, - lu_env_rhash_params); - if (lei && rhashtable_remove_fast(&lu_env_rhash, &lei->lei_linkage, - lu_env_rhash_params) == 0) - call_rcu(&lei->lei_rcu_head, lu_env_item_free); -} -EXPORT_SYMBOL(lu_env_remove); - -struct lu_env *lu_env_find(void) -{ - struct lu_env *env = NULL; - struct lu_env_item *lei; - const void *task = current; - int i = get_cpu(); - - if (lu_env_percpu[i].lep_task == current) { - env = lu_env_percpu[i].lep_env; - put_cpu(); - LASSERT(env); - return env; - } - - lei = rhashtable_lookup_fast(&lu_env_rhash, &task, - lu_env_rhash_params); - if (lei) { - env = lei->lei_env; - lu_env_percpu[i].lep_task = current; - lu_env_percpu[i].lep_env = env; - } - put_cpu(); - - return env; -} -EXPORT_SYMBOL(lu_env_find); -#define lu_env_rhash_init(rhash, params) rhashtable_init(rhash, params) -#define lu_env_rhash_destroy(rhash) rhashtable_destroy(rhash) -#else -#define lu_env_rhash_init(rhash, params) 0 -#define lu_env_rhash_destroy(rhash) do {} while (0) -#endif /* HAVE_SERVER_SUPPORT */ - static struct shrinker *lu_site_shrinker; typedef struct lu_site_stats{ @@ -2100,24 +1902,19 @@ typedef struct lu_site_stats{ unsigned lss_busy; } lu_site_stats_t; -static void lu_site_stats_get(const struct lu_site *s, +static void lu_site_stats_get(struct cfs_hash *hs, lu_site_stats_t *stats, int populated) { - struct cfs_hash *hs = s->ls_obj_hash; struct cfs_hash_bd bd; - unsigned int i; - /* - * percpu_counter_sum_positive() won't accept a const pointer - * as it does modify the struct by taking a spinlock - */ - struct lu_site *s2 = (struct lu_site *)s; + unsigned int i; - stats->lss_busy += cfs_hash_size_get(hs) - - percpu_counter_sum_positive(&s2->ls_lru_len_counter); cfs_hash_for_each_bucket(hs, &bd, i) { - struct hlist_head *hhead; + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += + cfs_hash_bd_count_get(&bd) - bkt->lsb_lru_len; stats->lss_total += cfs_hash_bd_count_get(&bd); stats->lss_max_search = max((int)stats->lss_max_search, cfs_hash_bd_depmax_get(&bd)); @@ -2306,7 +2103,7 @@ void lu_context_keys_dump(void) */ int lu_global_init(void) { - int result; + int result; DEF_SHRINKER_VAR(shvar, lu_cache_shrink, lu_cache_shrink_count, lu_cache_shrink_scan); @@ -2341,8 +2138,6 @@ int lu_global_init(void) if (lu_site_shrinker == NULL) return -ENOMEM; - result = lu_env_rhash_init(&lu_env_rhash, &lu_env_rhash_params); - return result; } @@ -2366,8 +2161,6 @@ void lu_global_fini(void) lu_env_fini(&lu_shrink_env); up_write(&lu_sites_guard); - lu_env_rhash_destroy(&lu_env_rhash); - lu_ref_global_fini(); } @@ -2392,7 +2185,7 @@ int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) lu_site_stats_t stats; memset(&stats, 0, sizeof(stats)); - lu_site_stats_get(s, &stats, 1); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", stats.lss_busy, @@ -2490,19 +2283,11 @@ struct lu_object *lu_object_anon(const struct lu_env *env, struct lu_device *dev, const struct lu_object_conf *conf) { - struct lu_fid fid; + struct lu_fid fid; struct lu_object *o; - int rc; fid_zero(&fid); - o = lu_object_alloc(env, dev, &fid); - if (!IS_ERR(o)) { - rc = lu_object_start(env, dev, o, conf); - if (rc) { - lu_object_free(env, o); - return ERR_PTR(rc); - } - } + o = lu_object_alloc(env, dev, &fid, conf); return o; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c 
b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c index e0a75791f1e6e..bef29033f30ee 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -65,14 +65,14 @@ static struct kmem_cache *lu_ref_link_kmem; static struct lu_kmem_descr lu_ref_caches[] = { - { - .ckd_cache = &lu_ref_link_kmem, - .ckd_name = "lu_ref_link_kmem", - .ckd_size = sizeof(struct lu_ref_link) - }, - { - .ckd_cache = NULL - } + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof (struct lu_ref_link) + }, + { + .ckd_cache = NULL + } }; /** @@ -90,18 +90,18 @@ static struct lu_ref lu_ref_marker = { void lu_ref_print(const struct lu_ref *ref) { - struct lu_ref_link *link; + struct lu_ref_link *link; - CERROR("lu_ref: %p %d %d %s:%d\n", - ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); list_for_each_entry(link, &ref->lf_list, ll_linkage) { - CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); - } + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } } static int lu_ref_is_marker(const struct lu_ref *ref) { - return ref == &lu_ref_marker; + return (ref == &lu_ref_marker); } void lu_ref_print_all(void) @@ -146,19 +146,19 @@ void lu_ref_fini(struct lu_ref *ref) EXPORT_SYMBOL(lu_ref_fini); static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, - int flags, - const char *scope, - const void *source) + int flags, + const char *scope, + const void *source) { - struct lu_ref_link *link; - - link = NULL; - if (lu_ref_link_kmem != NULL) { - OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); - if (link != NULL) { - link->ll_ref = ref; - link->ll_scope = scope; - link->ll_source = source; + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem != NULL) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link != NULL) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; spin_lock(&ref->lf_guard); list_add_tail(&link->ll_linkage, &ref->lf_list); ref->lf_refs++; @@ -207,10 +207,9 @@ void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, EXPORT_SYMBOL(lu_ref_add_atomic); static inline int lu_ref_link_eq(const struct lu_ref_link *link, - const char *scope, - const void *source) + const char *scope, const void *source) { - return link->ll_source == source && !strcmp(link->ll_scope, scope); + return link->ll_source == source && !strcmp(link->ll_scope, scope); } /** @@ -224,22 +223,22 @@ static unsigned lu_ref_chain_max_length = 127; static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, const void *source) { - struct lu_ref_link *link; - unsigned int iterations; + struct lu_ref_link *link; + unsigned iterations; - iterations = 0; + iterations = 0; list_for_each_entry(link, &ref->lf_list, ll_linkage) { - ++iterations; - if (lu_ref_link_eq(link, scope, source)) { - if (iterations > lu_ref_chain_max_length) { - CWARN("Long lu_ref chain %d \"%s\":%p\n", - iterations, scope, source); - lu_ref_chain_max_length = iterations * 3 / 2; - } - return link; - } - } - return NULL; + ++iterations; + if (lu_ref_link_eq(link, scope, 
source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; } void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) @@ -303,10 +302,10 @@ static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) { - struct lu_ref *ref = p; - struct lu_ref *next; + struct lu_ref *ref = p; + struct lu_ref *next; - LASSERT(seq->private == p); + LASSERT(seq->private == p); LASSERT(!list_empty(&ref->lf_linkage)); spin_lock(&lu_ref_refs_guard); @@ -323,7 +322,7 @@ static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) static void lu_ref_seq_stop(struct seq_file *seq, void *p) { - /* Nothing to do */ + /* Nothing to do */ } @@ -341,19 +340,19 @@ static int lu_ref_seq_show(struct seq_file *seq, void *p) /* print the entry */ spin_lock(&next->lf_guard); - seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", - next, next->lf_refs, next->lf_failed, - next->lf_func, next->lf_line); - if (next->lf_refs > 64) { - seq_puts(seq, " too many references, skip\n"); - } else { - struct lu_ref_link *link; - int i = 0; + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_printf(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; list_for_each_entry(link, &next->lf_list, ll_linkage) - seq_printf(seq, " #%d link: %s %p\n", - i++, link->ll_scope, link->ll_source); - } + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } spin_unlock(&next->lf_guard); spin_unlock(&lu_ref_refs_guard); @@ -361,10 +360,10 @@ static int lu_ref_seq_show(struct seq_file *seq, void *p) } static struct seq_operations lu_ref_seq_ops = { - .start = lu_ref_seq_start, - .stop = lu_ref_seq_stop, - .next = lu_ref_seq_next, - .show = lu_ref_seq_show + .start = lu_ref_seq_start, + .stop = lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show }; static int lu_ref_seq_open(struct inode *inode, struct file *file) @@ -381,16 +380,15 @@ static int lu_ref_seq_open(struct inode *inode, struct file *file) list_add(&marker->lf_linkage, &lu_ref_refs); spin_unlock(&lu_ref_refs_guard); - if (result == 0) { - struct seq_file *f = file->private_data; - - f->private = marker; - } else { - seq_release(inode, file); - } - } + if (result == 0) { + struct seq_file *f = file->private_data; + f->private = marker; + } else { + seq_release(inode, file); + } + } - return result; + return result; } static int lu_ref_seq_release(struct inode *inode, struct file *file) @@ -405,11 +403,11 @@ static int lu_ref_seq_release(struct inode *inode, struct file *file) } static struct file_operations lu_ref_dump_fops = { - .owner = THIS_MODULE, - .open = lu_ref_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lu_ref_seq_release + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release }; #endif /* CONFIG_PROC_FS */ @@ -421,26 +419,26 @@ int lu_ref_global_init(void) CDEBUG(D_CONSOLE, "lu_ref tracking is enabled. 
Performance isn't.\n"); - result = lu_kmem_init(lu_ref_caches); + result = lu_kmem_init(lu_ref_caches); #ifdef CONFIG_PROC_FS - if (result == 0) { - result = lprocfs_seq_create(proc_lustre_root, "lu_refs", - 0444, &lu_ref_dump_fops, NULL); - if (result) - lu_kmem_fini(lu_ref_caches); - } + if (result == 0) { + result = lprocfs_seq_create(proc_lustre_root, "lu_refs", + 0444, &lu_ref_dump_fops, NULL); + if (result) + lu_kmem_fini(lu_ref_caches); + } #endif /* CONFIG_PROC_FS */ - return result; + return result; } void lu_ref_global_fini(void) { #ifdef CONFIG_PROC_FS - lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); + lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); #endif /* CONFIG_PROC_FS */ - lu_kmem_fini(lu_ref_caches); + lu_kmem_fini(lu_ref_caches); } #endif /* USE_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c index 4161b2dabfd72..bd149ddf7a967 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,7 +46,7 @@ static __u64 handle_base; static DEFINE_SPINLOCK(handle_base_lock); static struct handle_bucket { - spinlock_t lock; + spinlock_t lock; struct list_head head; } *handle_hash; @@ -60,17 +60,16 @@ static struct handle_bucket { void class_handle_hash(struct portals_handle *h, struct portals_handle_ops *ops) { - struct handle_bucket *bucket; - - ENTRY; + struct handle_bucket *bucket; + ENTRY; - LASSERT(h != NULL); + LASSERT(h != NULL); LASSERT(list_empty(&h->h_link)); - /* - * This is fast, but simplistic cookie generation algorithm, it will - * need a re-do at some point in the future for security. - */ + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ spin_lock(&handle_base_lock); handle_base += HANDLE_INCR; @@ -105,12 +104,12 @@ static void class_handle_unhash_nolock(struct portals_handle *h) { if (list_empty(&h->h_link)) { CERROR("removing an already-removed handle (%#llx)\n", - h->h_cookie); - return; - } + h->h_cookie); + return; + } CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", - h, h->h_cookie); + h, h->h_cookie); spin_lock(&h->h_lock); if (h->h_in == 0) { @@ -151,24 +150,21 @@ EXPORT_SYMBOL(class_handle_hash_back); void *class_handle2object(__u64 cookie, const void *owner) { - struct handle_bucket *bucket; - struct portals_handle *h; - void *retval = NULL; - - ENTRY; + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + ENTRY; - LASSERT(handle_hash != NULL); + LASSERT(handle_hash != NULL); - /* - * Be careful when you want to change this code. See the - * rcu_read_lock() definition on top this file. - jxiong - */ - bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); - rcu_read_lock(); - list_for_each_entry_rcu(h, &bucket->head, h_link) { + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { if (h->h_cookie != cookie || h->h_owner != owner) - continue; + continue; spin_lock(&h->h_lock); if (likely(h->h_in != 0)) { @@ -201,15 +197,15 @@ EXPORT_SYMBOL(class_handle_free_cb); int class_handle_init(void) { - struct handle_bucket *bucket; + struct handle_bucket *bucket; struct timespec64 ts; - int seed[2]; + int seed[2]; - LASSERT(handle_hash == NULL); + LASSERT(handle_hash == NULL); - OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); - if (handle_hash == NULL) - return -ENOMEM; + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; bucket--) { @@ -222,10 +218,10 @@ int class_handle_init(void) ktime_get_ts64(&ts); cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); - cfs_get_random_bytes(&handle_base, sizeof(handle_base)); - LASSERT(handle_base != 0ULL); + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); - return 0; + return 0; } static int cleanup_all_handles(void) @@ -252,15 +248,14 @@ static int cleanup_all_handles(void) void class_handle_cleanup(void) { - int count; - - LASSERT(handle_hash != NULL); + int count; + LASSERT(handle_hash != NULL); - count = cleanup_all_handles(); + count = cleanup_all_handles(); - OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); - handle_hash = NULL; + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; - if (count != 0) - CERROR("handle_count at cleanup: %d\n", count); + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c index 535d78eac5578..95716e1ccac88 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2012, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -80,51 +80,51 @@ EXPORT_SYMBOL(lustre_uuid_to_peer); LNET will choose the best one. 
*/ int class_add_uuid(const char *uuid, __u64 nid) { - struct uuid_nid_data *data, *entry; - int found = 0; + struct uuid_nid_data *data, *entry; + int found = 0; - LASSERT(nid != 0); /* valid newconfig NID is never zero */ + LASSERT(nid != 0); /* valid newconfig NID is never zero */ - if (strlen(uuid) > UUID_MAX - 1) - return -EOVERFLOW; + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; - OBD_ALLOC_PTR(data); - if (data == NULL) - return -ENOMEM; + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; - obd_str2uuid(&data->un_uuid, uuid); - data->un_nids[0] = nid; - data->un_nid_count = 1; + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; spin_lock(&g_uuid_lock); list_for_each_entry(entry, &g_uuid_list, un_list) { - if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { - int i; - - found = 1; - for (i = 0; i < entry->un_nid_count; i++) - if (nid == entry->un_nids[i]) - break; - - if (i == entry->un_nid_count) { - LASSERT(entry->un_nid_count < NIDS_MAX); - entry->un_nids[entry->un_nid_count++] = nid; - } - break; - } - } - if (!found) + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) list_add(&data->un_list, &g_uuid_list); spin_unlock(&g_uuid_lock); - if (found) { - CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, - libcfs_nid2str(nid), entry->un_nid_count); - OBD_FREE(data, sizeof(*data)); - } else { - CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); - } - return 0; + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; } /* Delete the nids for one uuid if specified, otherwise delete all */ @@ -173,30 +173,29 @@ int class_del_uuid(const char *uuid) /* check if @nid exists in nid list of @uuid */ int class_check_uuid(struct obd_uuid *uuid, __u64 nid) { - struct uuid_nid_data *entry; - int found = 0; + struct uuid_nid_data *entry; + int found = 0; + ENTRY; - ENTRY; - - CDEBUG(D_INFO, "check if uuid %s has %s.\n", - obd_uuid2str(uuid), libcfs_nid2str(nid)); + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); spin_lock(&g_uuid_lock); list_for_each_entry(entry, &g_uuid_list, un_list) { - int i; + int i; - if (!obd_uuid_equals(&entry->un_uuid, uuid)) + if (!obd_uuid_equals(&entry->un_uuid, uuid)) continue; - /* found the uuid, check if it has @nid */ - for (i = 0; i < entry->un_nid_count; i++) { - if (entry->un_nids[i] == nid) { - found = 1; - break; - } - } - break; - } + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } spin_unlock(&g_uuid_lock); RETURN(found); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c index d0ca4f17b1cb3..85003937e7466 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -21,11 +21,14 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. * Use is subject to license terms. 
* * Author: Johann Lombardi */ + +#include + #include #include #include @@ -47,9 +50,9 @@ void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, /* If a field is added in struct lustre_mdt_attrs, zero it explicitly * and change the test below. */ - CLASSERT(sizeof(*lma) == - (offsetof(struct lustre_mdt_attrs, lma_self_fid) + - sizeof(lma->lma_self_fid))); + LASSERT(sizeof(*lma) == + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); } EXPORT_SYMBOL(lustre_lma_init); @@ -111,22 +114,6 @@ void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) } EXPORT_SYMBOL(lustre_loa_swab); -/** - * Swab, if needed, SOM structure which is stored on-disk in little-endian - * order. - * - * \param attrs - is a pointer to the SOM structure to be swabbed. - */ -void lustre_som_swab(struct lustre_som_attrs *attrs) -{ -#ifdef __BIG_ENDIAN - __swab16s(&attrs->lsa_valid); - __swab64s(&attrs->lsa_size); - __swab64s(&attrs->lsa_blocks); -#endif -} -EXPORT_SYMBOL(lustre_som_swab); - /** * Swab, if needed, HSM structure which is stored on-disk in little-endian * order. diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c deleted file mode 100644 index 16e6f12f8a05c..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2018, DataDirect Networks Storage. - * Author: Li Xi. 
- * - * Checksum functions - */ -#include -#include - -/* Server uses algos that perform at 50% or better of the Adler */ -enum cksum_types obd_cksum_types_supported_server(const char *obd_name) -{ - enum cksum_types ret = OBD_CKSUM_ADLER; - int base_speed; - - CDEBUG(D_INFO, "%s: checksum speed: crc %d, crc32c %d, adler %d, " - "t10ip512 %d, t10ip4k %d, t10crc512 %d, t10crc4k %d\n", - obd_name, - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)), - obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512), - obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K), - obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512), - obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K)); - - base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; - - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= - base_speed) - ret |= OBD_CKSUM_CRC32C; - - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= - base_speed) - ret |= OBD_CKSUM_CRC32; - - if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512) >= base_speed) - ret |= OBD_CKSUM_T10IP512; - - if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K) >= base_speed) - ret |= OBD_CKSUM_T10IP4K; - - if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512) >= base_speed) - ret |= OBD_CKSUM_T10CRC512; - - if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K) >= base_speed) - ret |= OBD_CKSUM_T10CRC4K; - - return ret; -} -EXPORT_SYMBOL(obd_cksum_types_supported_server); - -/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can - * only be a single checksum type per RPC. - * - * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask - * since they need to represent the full range of checksum algorithms that - * both the client and server can understand. - * - * In case of an unsupported types/flags we fall back to ADLER - * because that is supported by all clients since 1.8 - * - * In case multiple algorithms are supported the best one is used. 
*/ -u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type) -{ - unsigned int performance = 0, tmp; - u32 flag = OBD_FL_CKSUM_ADLER; - - if (cksum_type & OBD_CKSUM_CRC32) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32; - } - } - if (cksum_type & OBD_CKSUM_CRC32C) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32C; - } - } - if (cksum_type & OBD_CKSUM_ADLER) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_ADLER; - } - } - - if (cksum_type & OBD_CKSUM_T10IP512) { - tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_T10IP512; - } - } - - if (cksum_type & OBD_CKSUM_T10IP4K) { - tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_T10IP4K; - } - } - - if (cksum_type & OBD_CKSUM_T10CRC512) { - tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_T10CRC512; - } - } - - if (cksum_type & OBD_CKSUM_T10CRC4K) { - tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_T10CRC4K; - } - } - - if (unlikely(cksum_type && !(cksum_type & OBD_CKSUM_ALL))) - CWARN("%s: unknown cksum type %x\n", obd_name, cksum_type); - - return flag; -} -EXPORT_SYMBOL(obd_cksum_type_pack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index a5b5dcfe572fe..924322ef86e8c 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,15 +36,14 @@ #define DEBUG_SUBSYSTEM S_CLASS -#include #include #include #include #include -#include +#include #include -#include +#include #include #include "llog_internal.h" @@ -366,7 +365,6 @@ EXPORT_SYMBOL(lustre_cfg_string); */ int class_attach(struct lustre_cfg *lcfg) { - struct obd_export *exp; struct obd_device *obd = NULL; char *typename, *name, *uuid; int rc, len; @@ -383,54 +381,90 @@ int class_attach(struct lustre_cfg *lcfg) RETURN(-EINVAL); } name = lustre_cfg_string(lcfg, 0); + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { CERROR("No UUID passed!\n"); RETURN(-EINVAL); } + uuid = lustre_cfg_string(lcfg, 2); - uuid = lustre_cfg_string(lcfg, 2); - len = strlen(uuid); - if (len >= sizeof(obd->obd_uuid)) { - CERROR("%s: uuid must be < %d bytes long\n", - name, (int)sizeof(obd->obd_uuid)); - RETURN(-EINVAL); - } + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); - obd = class_newdev(typename, name, uuid); - if (IS_ERR(obd)) { /* Already exists or out of obds */ - rc = PTR_ERR(obd); + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; CERROR("Cannot create device %s of type %s : %d\n", name, typename, rc); - RETURN(rc); + GOTO(out, rc); } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08X != %08X\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, "%p obd_name %s != %s\n", obd, obd->obd_name, name); - exp = class_new_export_self(obd, &obd->obd_uuid); - if (IS_ERR(exp)) { - rc = PTR_ERR(exp); - class_free_dev(obd); - RETURN(rc); - } - - obd->obd_self_export = exp; - list_del_init(&exp->exp_obd_chain_timed); - class_export_put(exp); + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. 
*/ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + INIT_LIST_HEAD(&obd->obd_lwp_list); + + llog_group_init(&obd->obd_olg); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + GOTO(out, rc = -EINVAL); + } + memcpy(obd->obd_uuid.uuid, uuid, len); - rc = class_register_device(obd); - if (rc != 0) { - class_decref(obd, "newdev", obd); - RETURN(rc); - } + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); - obd->obd_attached = 1; - CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); - - RETURN(0); + RETURN(0); + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; } EXPORT_SYMBOL(class_attach); @@ -440,6 +474,7 @@ EXPORT_SYMBOL(class_attach); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { int err = 0; + struct obd_export *exp; ENTRY; LASSERT(obd != NULL); @@ -488,7 +523,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &uuid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_uuid_hash) - GOTO(err_exit, err = -ENOMEM); + GOTO(err_hash, err = -ENOMEM); /* create a nid-export lustre hash */ obd->obd_nid_hash = cfs_hash_create("NID_HASH", @@ -499,7 +534,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &nid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_nid_hash) - GOTO(err_exit, err = -ENOMEM); + GOTO(err_hash, err = -ENOMEM); /* create a nid-stats lustre hash */ obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", @@ -509,8 +544,8 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, &nid_stat_hash_ops, CFS_HASH_DEFAULT); - if (!obd->obd_nid_stats_hash) - GOTO(err_exit, err = -ENOMEM); + if (!obd->obd_nid_stats_hash) + GOTO(err_hash, err = -ENOMEM); /* create a client_generation-export lustre hash */ obd->obd_gen_hash = cfs_hash_create("UUID_HASH", @@ -521,13 +556,21 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &gen_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_gen_hash) - GOTO(err_exit, err = -ENOMEM); + GOTO(err_hash, err = -ENOMEM); - err = obd_setup(obd, lcfg); - if (err) - GOTO(err_exit, err); + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) + GOTO(err_hash, err = PTR_ERR(exp)); - obd->obd_set_up = 1; + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exp, err); + + obd->obd_set_up = 1; spin_lock(&obd->obd_dev_lock); /* cleanup drops this */ @@ -538,7 +581,12 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_name, obd->obd_uuid.uuid); RETURN(0); -err_exit: +err_exp: + if 
(obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: if (obd->obd_uuid_hash) { cfs_hash_putref(obd->obd_uuid_hash); obd->obd_uuid_hash = NULL; @@ -582,14 +630,10 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_attached = 0; spin_unlock(&obd->obd_dev_lock); - /* cleanup in progress. we don't like to find this device after now */ - class_unregister_device(obd); - CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", obd->obd_name, obd->obd_uuid.uuid); - class_decref(obd, "newdev", obd); - + class_decref(obd, "attach", obd); RETURN(0); } EXPORT_SYMBOL(class_detach); @@ -619,9 +663,6 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) } /* Leave this on forever */ obd->obd_stopping = 1; - /* function can't return error after that point, so clear setup flag - * as early as possible to avoid finding via obd_devs / hash */ - obd->obd_set_up = 0; spin_unlock(&obd->obd_dev_lock); /* wait for already-arrived-connections to finish. */ @@ -654,11 +695,17 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) LASSERT(obd->obd_self_export); - CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", - obd->obd_name, obd->obd_num_exports, - atomic_read(&obd->obd_refcount) - 2); - dump_exports(obd, 0, D_HA); - class_disconnect_exports(obd); + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcounf - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); + } /* Precleanup, we must make sure all exports get destroyed. 
*/ err = obd_precleanup(obd); @@ -710,27 +757,43 @@ EXPORT_SYMBOL(class_incref); void class_decref(struct obd_device *obd, const char *scope, const void *source) { - int last; - - CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, - atomic_read(&obd->obd_refcount), scope); + int err; + int refs; - LASSERT(obd->obd_num_exports >= 0); - last = atomic_dec_and_test(&obd->obd_refcount); + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); lu_ref_del(&obd->obd_reference, scope, source); - if (last) { - struct obd_export *exp; + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); - LASSERT(!obd->obd_attached); + if ((refs == 1) && obd->obd_stopping) { /* All exports have been destroyed; there should - * be no more in-progress ops by this point.*/ - exp = obd->obd_self_export; + be no more in-progress ops by this point.*/ - if (exp) { - exp->exp_flags |= exp_flags_from_obd(obd); - class_unlink_export(exp); + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); } + + class_release_dev(obd); } } EXPORT_SYMBOL(class_decref); @@ -806,7 +869,7 @@ static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) static LIST_HEAD(lustre_profile_list); static DEFINE_SPINLOCK(lustre_profile_list_lock); -struct lustre_profile *class_get_profile(const char *prof) +struct lustre_profile *class_get_profile(const char * prof) { struct lustre_profile *lprof; @@ -948,12 +1011,40 @@ void class_del_profiles(void) } EXPORT_SYMBOL(class_del_profiles); -/* We can't call lquota_process_config directly because - * it lives in a module that must be loaded after this one. - */ -#ifdef HAVE_SERVER_SUPPORT +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + ENTRY; + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + RETURN(-EINVAL); + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + RETURN(0); +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. 
*/ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; -#endif /* HAVE_SERVER_SUPPORT */ + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); /** * Rename the proc parameter in \a cfg with a new name \a new_name. @@ -1030,12 +1121,10 @@ struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, } EXPORT_SYMBOL(lustre_cfg_rename); -static ssize_t process_param2_config(struct lustre_cfg *lcfg) +static int process_param2_config(struct lustre_cfg *lcfg) { char *param = lustre_cfg_string(lcfg, 1); char *upcall = lustre_cfg_string(lcfg, 2); - struct kobject *kobj = NULL; - const char *subsys = param; char *argv[] = { [0] = "/usr/sbin/lctl", [1] = "set_param", @@ -1044,44 +1133,8 @@ static ssize_t process_param2_config(struct lustre_cfg *lcfg) }; ktime_t start; ktime_t end; - size_t len; - int rc; - + int rc; ENTRY; - print_lustre_cfg(lcfg); - - len = strcspn(param, ".="); - if (!len) - return -EINVAL; - - /* If we find '=' then its the top level sysfs directory */ - if (param[len] == '=') - return class_set_global(param); - - subsys = kstrndup(param, len, GFP_KERNEL); - if (!subsys) - return -ENOMEM; - - kobj = kset_find_obj(lustre_kset, subsys); - kfree(subsys); - if (kobj) { - char *value = param; - char *envp[3]; - int i; - - param = strsep(&value, "="); - envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s", param); - envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); - envp[2] = NULL; - - rc = kobject_uevent_env(kobj, KOBJ_CHANGE, envp); - for (i = 0; i < ARRAY_SIZE(envp); i++) - kfree(envp[i]); - - kobject_put(kobj); - - RETURN(rc); - } /* Add upcall processing here. Now only lctl is supported */ if (strcmp(upcall, LCTL_UPCALL) != 0) { @@ -1107,13 +1160,11 @@ static ssize_t process_param2_config(struct lustre_cfg *lcfg) RETURN(rc); } -#ifdef HAVE_SERVER_SUPPORT void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) { quota_process_config = qpc; } EXPORT_SYMBOL(lustre_register_quota_process_config); -#endif /* HAVE_SERVER_SUPPORT */ /** Process configuration commands given in lustre_cfg form. * These may come from direct calls (e.g. class_manual_cleanup) @@ -1200,51 +1251,29 @@ int class_process_config(struct lustre_cfg *lcfg) } case LCFG_PARAM: { char *tmp; - /* llite has no obd */ - if (class_match_param(lustre_cfg_string(lcfg, 1), - PARAM_LLITE, NULL) == 0) { - struct lustre_sb_info *lsi; - unsigned long addr; - ssize_t count; - - /* The instance name contains the sb: - * lustre-client-aacfe000 - */ - tmp = strrchr(lustre_cfg_string(lcfg, 0), '-'); - if (!tmp || !*(++tmp)) - GOTO(out, err = -EINVAL); - - if (sscanf(tmp, "%lx", &addr) != 1) - GOTO(out, err = -EINVAL); - - lsi = s2lsi((struct super_block *)addr); - /* This better be a real Lustre superblock! */ - LASSERT(lsi->lsi_lmd->lmd_magic == LMD_MAGIC); - - count = class_modify_config(lcfg, PARAM_LLITE, - lsi->lsi_kobj); - err = count < 0 ? 
count : 0; - GOTO(out, err); + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + GOTO(out, err); } else if ((class_match_param(lustre_cfg_string(lcfg, 1), PARAM_SYS, &tmp) == 0)) { /* Global param settings */ - err = class_set_global(tmp); + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); /* * Client or server should not fail to mount if * it hits an unknown configuration parameter. */ - if (err < 0) + if (err != 0) CWARN("Ignoring unknown param %s\n", tmp); GOTO(out, err = 0); -#ifdef HAVE_SERVER_SUPPORT } else if ((class_match_param(lustre_cfg_string(lcfg, 1), PARAM_QUOTA, &tmp) == 0) && quota_process_config) { err = (*quota_process_config)(lcfg); GOTO(out, err); -#endif /* HAVE_SERVER_SUPPORT */ } break; @@ -1265,6 +1294,7 @@ int class_process_config(struct lustre_cfg *lcfg) GOTO(out, err = -EINVAL); } + switch(lcfg->lcfg_command) { case LCFG_SETUP: { err = class_setup(obd, lcfg); @@ -1304,47 +1334,12 @@ int class_process_config(struct lustre_cfg *lcfg) err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); GOTO(out, err = 0); } - /* Process config log ADD_MDC record twice to add MDC also to LOV - * for Data-on-MDT: - * - * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1 - * 4:lustre-MDT0000-mdc_UUID - */ - case LCFG_ADD_MDC: { - struct obd_device *lov_obd; - char *clilmv; - - err = obd_process_config(obd, sizeof(*lcfg), lcfg); - if (err) - GOTO(out, err); - - /* make sure this is client LMV log entry */ - clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv"); - if (!clilmv) - GOTO(out, err); - - /* replace 'lmv' with 'lov' name to address LOV device and - * process llog record to add MDC there. */ - clilmv[4] = 'o'; - lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0)); - if (lov_obd == NULL) { - err = -ENOENT; - CERROR("%s: Cannot find LOV by %s name, rc = %d\n", - obd->obd_name, lustre_cfg_string(lcfg, 0), err); - } else { - err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg); - } - /* restore 'lmv' name */ - clilmv[4] = 'm'; - GOTO(out, err); - } - default: { - err = obd_process_config(obd, sizeof(*lcfg), lcfg); - GOTO(out, err); + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); } } - EXIT; out: if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { CWARN("Ignoring error %d on optional command %#x\n", err, @@ -1355,89 +1350,97 @@ int class_process_config(struct lustre_cfg *lcfg) } EXPORT_SYMBOL(class_process_config); -ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, - struct kobject *kobj) +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) { - struct kobj_type *typ; - ssize_t count = 0; - int i; + struct lprocfs_vars *var; + struct file fakefile = {}; + struct seq_file fake_seqfile; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + ENTRY; if (lcfg->lcfg_command != LCFG_PARAM) { CERROR("Unknown command: %d\n", lcfg->lcfg_command); - return -EINVAL; + RETURN(-EINVAL); } - typ = get_ktype(kobj); - if (!typ || !typ->default_attrs) - return -ENODEV; - - print_lustre_cfg(lcfg); - - /* - * e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt - * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar - * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 - */ + /* fake a seq file so that var->fops->proc_write can work... 
*/ + lprocfs_file_set_kernel(&fakefile); + fakefile.private_data = &fake_seqfile; + fake_seqfile.private = data; + /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ for (i = 1; i < lcfg->lcfg_bufcount; i++) { - struct attribute *attr; - size_t keylen; - char *value; - char *key; - int j; - key = lustre_cfg_buf(lcfg, i); /* Strip off prefix */ if (class_match_param(key, prefix, &key)) /* If the prefix doesn't match, return error so we - * can pass it down the stack - */ - return -EINVAL; - - value = strchr(key, '='); - if (!value || *(value + 1) == 0) { + * can pass it down the stack */ + RETURN(-ENOSYS); + sval = strchr(key, '='); + if (!sval || *(sval + 1) == 0) { CERROR("%s: can't parse param '%s' (missing '=')\n", lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, i)); - /* continue parsing other params */ + /* rc = -EINVAL; continue parsing other params */ continue; } - keylen = value - key; - value++; - - attr = NULL; - for (j = 0; typ->default_attrs[j]; j++) { - if (!strncmp(typ->default_attrs[j]->name, key, - keylen)) { - attr = typ->default_attrs[j]; + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, var->name, NULL) == 0 && + keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + + if (var->fops && var->fops->proc_write) { + rc = (var->fops->proc_write)(&fakefile, + sval, + vallen, + NULL); + } break; } + j++; } + if (!matched) { + /* It was upgraded from old MDT/OST device, + * ignore the obsolete "sec_level" parameter. */ + if (strncmp("sec_level", key, keylen) == 0) + continue; - if (!attr) { - char *envp[3]; - - envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s.%s.%.*s", - kobject_name(kobj->parent), - kobject_name(kobj), - (int) keylen, key); - envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); - envp[2] = NULL; - - if (kobject_uevent_env(kobj, KOBJ_CHANGE, envp)) { - CERROR("%s: failed to send uevent %s\n", - kobject_name(kobj), key); - } - - for (i = 0; i < ARRAY_SIZE(envp); i++) - kfree(envp[i]); + CERROR("%s: unknown config parameter '%s'\n", + lustre_cfg_string(lcfg, 0), + lustre_cfg_string(lcfg, i)); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("%s: error writing parameter '%s': rc = %d\n", + lustre_cfg_string(lcfg, 0), key, rc); + rc = 0; } else { - count += lustre_attr_store(kobj, attr, value, - strlen(value)); + CDEBUG(D_CONFIG, "%s: set parameter '%s'\n", + lustre_cfg_string(lcfg, 0), key); } } - return count; + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + RETURN(rc); } -EXPORT_SYMBOL(class_modify_config); +EXPORT_SYMBOL(class_process_proc_param); /* * Supplemental functions for config logs, it allocates lustre_cfg @@ -1539,11 +1542,12 @@ int class_config_llog_handler(const struct lu_env *env, } } /* A config command without a start marker before it is - * illegal - */ - if (!(cfg->cfg_flags & CFG_F_MARKER) && + illegal (post 146) */ + if (!(cfg->cfg_flags & CFG_F_COMPAT146) && + !(cfg->cfg_flags & CFG_F_MARKER) && (lcfg->lcfg_command != LCFG_MARKER)) { - CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", + CWARN("Config not inside markers, ignoring! 
" + "(inst: %p, uuid: %s, flags: %#x)\n", cfg->cfg_instance, cfg->cfg_uuid.uuid, cfg->cfg_flags); cfg->cfg_flags |= CFG_F_SKIP; @@ -1619,11 +1623,12 @@ int class_config_llog_handler(const struct lu_env *env, if (cfg->cfg_instance && lcfg->lcfg_command != LCFG_SPTLRPC_CONF && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { - inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + 16 + 4; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(cfg->cfg_instance) * 2 + 4; OBD_ALLOC(inst_name, inst_len); if (inst_name == NULL) GOTO(out, rc = -ENOMEM); - snprintf(inst_name, inst_len, "%s-%016lx", + snprintf(inst_name, inst_len, "%s-%p", lustre_cfg_string(lcfg, 0), cfg->cfg_instance); lustre_cfg_bufs_set_string(&bufs, 0, inst_name); @@ -1631,22 +1636,23 @@ int class_config_llog_handler(const struct lu_env *env, lcfg->lcfg_command, inst_name); } - /* override llog UUID for clients, to insure they are unique */ - if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_ATTACH) + /* we override the llog's uuid for clients, to insure they + are unique */ + if (cfg->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid); - /* - * sptlrpc config record, we expect 2 data segments: - * [0]: fs_name/target_name, - * [1]: rule string - * moving them to index [1] and [2], and insert MGC's - * obdname at index [0]. - */ + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { - /* After ASLR changes cfg_instance this needs fixing */ - /* "obd" is set in config_log_find_or_add() */ - struct obd_device *obd = (void *)cfg->cfg_instance; + struct obd_device *obd = cfg->cfg_instance; lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], bufs.lcfg_buflen[1]); @@ -1790,6 +1796,55 @@ int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, } EXPORT_SYMBOL(class_config_parse_llog); +static struct lcfg_type_data { + __u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +} lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } , }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", 
"pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + /** * Parse config record and output dump in supplied buffer. * diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c index 3c7a51ffd38a1..e3390507d900e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,10 +43,11 @@ #include #include +#include #include #include #include -#include +#include static int (*client_fill_super)(struct super_block *sb, struct vfsmount *mnt); @@ -219,7 +220,7 @@ int lustre_start_mgc(struct super_block *sb) struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; - struct obd_uuid *uuid = NULL; + struct obd_uuid *uuid; class_uuid_t uuidc; lnet_nid_t nid; char nidstr[LNET_NIDSTR_SIZE]; @@ -242,7 +243,7 @@ int lustre_start_mgc(struct super_block *sb) struct lnet_process_id id; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { - if (id.nid == LNET_NID_LO_0) + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) continue; nid = id.nid; i++; @@ -408,6 +409,7 @@ int lustre_start_mgc(struct super_block *sb) rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, niduuid, NULL, NULL); + OBD_FREE_PTR(uuid); if (rc) GOTO(out_free, rc); @@ -468,7 +470,7 @@ int lustre_start_mgc(struct super_block *sb) lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); @@ -483,8 +485,6 @@ int lustre_start_mgc(struct super_block *sb) out_free: mutex_unlock(&mgc_start_lock); - if (uuid) - OBD_FREE_PTR(uuid); if (data) OBD_FREE_PTR(data); if (mgcname) @@ -591,7 +591,7 @@ static struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) /* Default umount style */ lsi->lsi_flags = LSI_UMOUNT_FAILOVER; INIT_LIST_HEAD(&lsi->lsi_lwp_list); - mutex_init(&lsi->lsi_lwp_mutex); + spin_lock_init(&lsi->lsi_lwp_lock); RETURN(lsi); } @@ -1156,52 +1156,37 @@ static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) * make \a *endh point to the string starting with the delimiter. The commas * in expression list [...] will be skipped. 
* - * @buf a delimiter-separated string - * @endh a pointer to a pointer that will point to the string - * starting with the delimiter + * \param[in] buf a delimiter-separated string + * \param[in] endh a pointer to a pointer that will point to the string + * starting with the delimiter * - * RETURNS true if delimiter is found, false if delimiter is not found + * \retval 0 if delimiter is found + * \retval 1 if delimiter is not found */ -static bool lmd_find_delimiter(char *buf, char **endh) +static int lmd_find_delimiter(char *buf, char **endh) { char *c = buf; - size_t pos; - bool found; - - if (!buf) - return false; -try_again: - if (*c == ',' || *c == ':') - return true; - - pos = strcspn(c, "[:,]"); - if (!pos) - return false; - - /* Not a valid mount string */ - if (*c == ']') { - CWARN("invalid mount string format\n"); - return false; - } + int skip = 0; + + if (buf == NULL) + return 1; - c += pos; - if (*c == '[') { - c = strchr(c, ']'); + while (*c != '\0') { + if (*c == '[') + skip++; + else if (*c == ']') + skip--; - /* invalid mount string */ - if (!c) { - CWARN("invalid mount string format\n"); - return false; + if ((*c == ',' || *c == ':') && skip == 0) { + if (endh != NULL) + *endh = c; + return 0; } + c++; - goto try_again; } - found = *c != '\0'; - if (found && endh) - *endh = c; - - return found; + return 1; } /** @@ -1230,7 +1215,7 @@ static int lmd_parse_nidlist(char *buf, char **endh) if (*buf == ' ' || *buf == '/' || *buf == '\0') return 1; - if (!lmd_find_delimiter(buf, &endp)) + if (lmd_find_delimiter(buf, &endp) != 0) endp = buf + strlen(buf); tmp = *endp; @@ -1375,8 +1360,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) } else if (strncmp(s1, "param=", 6) == 0) { size_t length, params_length; char *tail = s1; - - if (lmd_find_delimiter(s1 + 6, &tail)) { + if (lmd_find_delimiter(s1 + 6, &tail) != 0) + length = strlen(s1); + else { char *param_str = tail + 1; int supplementary = 1; while (lmd_parse_nidlist(param_str, @@ -1384,8 +1370,6 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) supplementary = 0; } length = param_str - s1 - supplementary; - } else { - length = strlen(s1); } length -= 6; params_length = strlen(lmd->lmd_params); @@ -1414,15 +1398,6 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) rc = lmd_parse_network(lmd, s1 + 8); if (rc) goto invalid; - - /* check if LNet dynamic peer discovery is activated */ - if (LNetGetPeerDiscoveryStatus()) { - CERROR("LNet Dynamic Peer Discovery is enabled " - "on this node. 
'network' mount option " - "cannot be taken into account.\n"); - goto invalid; - } - clear++; } @@ -1501,8 +1476,6 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) s1 = options + strlen(options) - 1; while (s1 >= options && (*s1 == ',' || *s1 == ' ')) *s1-- = 0; - while (*options && (*options == ',' || *options == ' ')) - options++; if (*options != 0) { /* Freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); @@ -1675,12 +1648,7 @@ static struct file_system_type lustre_fs_type = { .get_sb = lustre_get_sb, #endif .kill_sb = lustre_kill_super, - .fs_flags = FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE | -#ifdef HAVE_SERVER_SUPPORT - FS_REQUIRES_DEV, -#else - 0, -#endif + .fs_flags = FS_REQUIRES_DEV | FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("lustre"); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c index b23a4ccf0bd9d..b1f59d8f6b303 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,11 +52,14 @@ #include #endif +#include +#include + #include #include -#include +#include #include -#include +#include #include #include @@ -508,7 +511,7 @@ struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) } snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); - mutex_lock(&lsi->lsi_lwp_mutex); + spin_lock(&lsi->lsi_lwp_lock); list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { char *ptr = strstr(lwp->obd_name, lwp_name); @@ -517,7 +520,7 @@ struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) break; } } - mutex_unlock(&lsi->lsi_lwp_mutex); + spin_unlock(&lsi->lsi_lwp_lock); err_lmi: server_put_mount(dev, false); @@ -678,9 +681,9 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); if (rc == 0) { obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; - mutex_lock(&lsi->lsi_lwp_mutex); + spin_lock(&lsi->lsi_lwp_lock); list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list); - mutex_unlock(&lsi->lsi_lwp_mutex); + spin_unlock(&lsi->lsi_lwp_lock); } else { CERROR("%s: connect failed: rc = %d\n", lwpname, rc); } @@ -936,7 +939,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) GOTO(out, rc = -ENOMEM); /* end log first */ - cfg->cfg_instance = ll_get_cfg_instance(sb); + cfg->cfg_instance = sb; rc = lustre_end_log(sb, logname, cfg); if (rc != 0 && rc != -ENOENT) GOTO(out, rc); @@ -948,7 +951,6 @@ static int lustre_disconnect_lwp(struct super_block *sb) if (bufs == NULL) GOTO(out, rc = -ENOMEM); - mutex_lock(&lsi->lsi_lwp_mutex); list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { struct lustre_cfg *lcfg; @@ -961,10 +963,8 @@ static int lustre_disconnect_lwp(struct super_block *sb) lustre_cfg_bufs_set_string(bufs, 1, NULL); OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); - if (!lcfg) { - rc = -ENOMEM; - break; - } + if (!lcfg) + GOTO(out, rc = -ENOMEM); lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs); /* Disconnect import first. 
NULL is passed for the '@env', @@ -979,7 +979,6 @@ static int lustre_disconnect_lwp(struct super_block *sb) rc1 = rc; } } - mutex_unlock(&lsi->lsi_lwp_mutex); GOTO(out, rc); @@ -1005,23 +1004,18 @@ static int lustre_stop_lwp(struct super_block *sb) int rc1 = 0; ENTRY; - mutex_lock(&lsi->lsi_lwp_mutex); while (!list_empty(&lsi->lsi_lwp_list)) { lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device, obd_lwp_list); list_del_init(&lwp->obd_lwp_list); lwp->obd_force = 1; - mutex_unlock(&lsi->lsi_lwp_mutex); - rc = class_manual_cleanup(lwp); if (rc != 0) { CERROR("%s: fail to stop LWP: rc = %d\n", lwp->obd_name, rc); rc1 = rc; } - mutex_lock(&lsi->lsi_lwp_mutex); } - mutex_unlock(&lsi->lsi_lwp_mutex); RETURN(rc1 != 0 ? rc1 : rc); } @@ -1057,7 +1051,7 @@ static int lustre_start_lwp(struct super_block *sb) GOTO(out, rc = -ENOMEM); cfg->cfg_callback = client_lwp_config_process; - cfg->cfg_instance = ll_get_cfg_instance(sb); + cfg->cfg_instance = sb; rc = lustre_process_log(sb, logname, cfg); /* need to remove config llog from mgc */ lsi->lsi_lwp_started = 1; @@ -1139,7 +1133,7 @@ static int server_lsi2mti(struct lustre_sb_info *lsi, mti->mti_nid_count = 0; while (LNetGetId(i++, &id) != -ENOENT) { - if (id.nid == LNET_NID_LO_0) + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) continue; /* server use --servicenode param, only allow specified @@ -1688,63 +1682,6 @@ static int server_statfs(struct dentry *dentry, struct kstatfs *buf) RETURN(0); } -#ifdef HAVE_SUPEROPS_USE_DENTRY -int server_show_options(struct seq_file *seq, struct dentry *dentry) -#else -int server_show_options(struct seq_file *seq, struct vfsmount *vfs) -#endif -{ - struct lustre_sb_info *lsi; - struct lustre_mount_data *lmd; - -#ifdef HAVE_SUPEROPS_USE_DENTRY - LASSERT(seq != NULL && dentry != NULL); - lsi = s2lsi(dentry->d_sb); -#else - LASSERT(seq != NULL && vfs != NULL); - lsi = s2lsi(vfs->mnt_sb); -#endif - - lmd = lsi->lsi_lmd; - seq_printf(seq, ",svname=%s", lmd->lmd_profile); - - if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) - seq_puts(seq, ",abort_recov"); - - if (lmd->lmd_flags & LMD_FLG_NOIR) - seq_puts(seq, ",noir"); - - if (lmd->lmd_flags & LMD_FLG_NOSVC) - seq_puts(seq, ",nosvc"); - - if (lmd->lmd_flags & LMD_FLG_NOMGS) - seq_puts(seq, ",nomgs"); - - if (lmd->lmd_flags & LMD_FLG_NOSCRUB) - seq_puts(seq, ",noscrub"); - if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) - seq_puts(seq, ",skip_lfsck"); - - if (lmd->lmd_flags & LMD_FLG_DEV_RDONLY) - seq_puts(seq, ",rdonly_dev"); - - if (lmd->lmd_flags & LMD_FLG_MGS) - seq_puts(seq, ",mgs"); - - if (lmd->lmd_mgs != NULL) - seq_printf(seq, ",mgsnode=%s", lmd->lmd_mgs); - - if (lmd->lmd_osd_type != NULL) - seq_printf(seq, ",osd=%s", lmd->lmd_osd_type); - - if (lmd->lmd_opts != NULL) { - seq_putc(seq, ','); - seq_puts(seq, lmd->lmd_opts); - } - - RETURN(0); -} - /** The operations we support directly on the superblock: * mount, umount, and df. 
*/ @@ -1752,7 +1689,6 @@ static struct super_operations server_ops = { .put_super = server_put_super, .umount_begin = server_umount_begin, /* umount -f */ .statfs = server_statfs, - .show_options = server_show_options, }; /* @@ -1780,43 +1716,6 @@ static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, return -EOPNOTSUPP; } -static bool is_cmd_supported(unsigned int command) -{ - switch (command) { - case FITRIM: - return true; - default: - return false; - } - - return false; -} - -static long server_ioctl(struct file *filp, unsigned int command, - unsigned long arg) -{ - struct file active_filp; - struct inode *inode = file_inode(filp); - struct lustre_sb_info *lsi = s2lsi(inode->i_sb); - struct super_block *dd_sb = dt_mnt_sb_get(lsi->lsi_dt_dev); - struct inode *active_inode; - int err = -EOPNOTSUPP; - - if (IS_ERR(dd_sb) || !is_cmd_supported(command)) - return err; - - active_inode = igrab(dd_sb->s_root->d_inode); - if (!active_inode) - return -EACCES; - - active_filp.f_inode = active_inode; - if (active_inode->i_fop && active_inode->i_fop->unlocked_ioctl) - err = active_inode->i_fop->unlocked_ioctl(&active_filp, - command, arg); - iput(active_inode); - return err; -} - static const struct inode_operations server_inode_operations = { #ifdef HAVE_IOP_XATTR .setxattr = lustre_setxattr, @@ -1825,10 +1724,6 @@ static const struct inode_operations server_inode_operations = { .listxattr = lustre_listxattr, }; -static const struct file_operations server_file_operations = { - .unlocked_ioctl = server_ioctl, -}; - #define log2(n) ffz(~(n)) #define LUSTRE_SUPER_MAGIC 0x0BD00BD1 @@ -1857,7 +1752,6 @@ static int server_fill_super_common(struct super_block *sb) /* apparently we need to be a directory for the mount to finish */ root->i_mode = S_IFDIR; root->i_op = &server_inode_operations; - root->i_fop = &server_file_operations; sb->s_root = d_make_root(root); if (!sb->s_root) { CERROR("%s: can't make root dentry\n", sb->s_id); @@ -1870,10 +1764,10 @@ static int server_fill_super_common(struct super_block *sb) static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) { struct lustre_mount_data *lmd = lsi->lsi_lmd; - struct obd_device *obd; - struct dt_device_param p; - char flagstr[20 + 1 + 10 + 1]; - int rc; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[16]; + int rc; ENTRY; CDEBUG(D_MOUNT, @@ -1883,7 +1777,7 @@ static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); strcat(lsi->lsi_osd_uuid, "_UUID"); - snprintf(flagstr, sizeof(flagstr), "%lu:%u", mflags, lmd->lmd_flags); + sprintf(flagstr, "%lu:%lu", mflags, (unsigned long) lmd->lmd_flags); obd = class_name2obd(lsi->lsi_osd_obdname); if (obd == NULL) { @@ -1946,10 +1840,8 @@ int server_fill_super(struct super_block *sb) OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); rc = lsi_prepare(lsi); - if (rc) { - lustre_put_lsi(sb); + if (rc) RETURN(rc); - } /* Start low level OSD */ rc = osd_start(lsi, sb->s_flags); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c deleted file mode 100644 index 53b0b3130b717..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2017, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/obd_sysfs.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -struct static_lustre_uintvalue_attr { - struct { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct attribute *attr, - char *buf); - ssize_t (*store)(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len); - } u; - int *value; -}; - -static ssize_t static_uintvalue_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - - return sprintf(buf, "%d\n", *lattr->value); -} - -static ssize_t static_uintvalue_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - unsigned int val; - int rc; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - *lattr->value = val; - - return count; -} - -#define LUSTRE_STATIC_UINT_ATTR(name, value) \ -static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ - { __ATTR(name, 0644, static_uintvalue_show, \ - static_uintvalue_store), value } - -LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); -LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); -LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); -LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); -LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); -LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); -LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); -LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); - -#ifdef HAVE_SERVER_SUPPORT -LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); -LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); -#endif - -static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", obd_memory_sum()); -} -LUSTRE_RO_ATTR(memused); - -static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, - 
char *buf) -{ - return sprintf(buf, "%llu\n", obd_memory_max()); -} -LUSTRE_RO_ATTR(memused_max); - -static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", - obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); -} - -static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - - if (val > ((cfs_totalram_pages() / 10) * 9)) { - /* Somebody wants to assign too much memory to dirty pages */ - return -EINVAL; - } - - if (val < 4 << (20 - PAGE_SHIFT)) { - /* Less than 4 Mb for dirty cache is also bad */ - return -EINVAL; - } - - obd_max_dirty_pages = val; - - return count; -} -LUSTRE_RW_ATTR(max_dirty_mb); - -static ssize_t version_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); -} - -static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ -#ifdef ENABLE_PINGER - const char *state = "on"; -#else - const char *state = "off"; -#endif - return sprintf(buf, "%s\n", state); -} - -/** - * Check all obd devices health - * - * \param kobj - * \param buf [in] - * - * \retval number of characters printed if healthy - */ -static ssize_t -health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - bool healthy = true; - size_t len = 0; - int i; - - if (libcfs_catastrophe) { - len = sprintf(buf, "LBUG\n"); - healthy = false; - } - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd; - - obd = class_num2obd(i); - if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) - continue; - - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - continue; - - class_incref(obd, __FUNCTION__, current); - read_unlock(&obd_dev_lock); - - if (obd_health_check(NULL, obd)) { - len = sprintf(buf, "device %s reported unhealthy\n", - obd->obd_name); - healthy = false; - } - class_decref(obd, __FUNCTION__, current); - read_lock(&obd_dev_lock); - } - read_unlock(&obd_dev_lock); - - if (healthy) - len = sprintf(buf, "healthy\n"); - else - len = sprintf(buf, "NOT HEALTHY\n"); - - return len; -} - -static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - int rc = 0; - - if (strlen(obd_jobid_var)) - rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); - return rc; -} - -static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) - return -EINVAL; - - memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); - - memcpy(obd_jobid_var, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_var[count - 1] == '\n') - obd_jobid_var[count - 1] = 0; - - return count; -} - -static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - int rc = 0; - - if (strlen(obd_jobid_name)) - rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_name); - return rc; -} - -static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - if (!count || count > LUSTRE_JOBID_SIZE) - return -EINVAL; - - if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0 && - !strchr(buffer, '%')) { - lustre_jobid_clear(buffer); - return count; - } - - /* clear previous value */ - 
memset(obd_jobid_name, 0, LUSTRE_JOBID_SIZE); - - memcpy(obd_jobid_name, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_name[count - 1] == '\n') { - /* Don't echo just a newline */ - if (count == 1) - return -EINVAL; - obd_jobid_name[count - 1] = 0; - } - - return count; -} - -/* Root for /sys/kernel/debug/lustre */ -struct dentry *debugfs_lustre_root; -EXPORT_SYMBOL_GPL(debugfs_lustre_root); - -#ifdef CONFIG_PROC_FS -/* Root for /proc/fs/lustre */ -struct proc_dir_entry *proc_lustre_root; -EXPORT_SYMBOL(proc_lustre_root); -#else -#define lprocfs_base NULL -#endif /* CONFIG_PROC_FS */ - -LUSTRE_RO_ATTR(version); -LUSTRE_RO_ATTR(pinger); -LUSTRE_RO_ATTR(health_check); -LUSTRE_RW_ATTR(jobid_var); -LUSTRE_RW_ATTR(jobid_name); - -static struct attribute *lustre_attrs[] = { - &lustre_attr_version.attr, - &lustre_attr_pinger.attr, - &lustre_attr_health_check.attr, - &lustre_attr_jobid_name.attr, - &lustre_attr_jobid_var.attr, - &lustre_sattr_timeout.u.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_sattr_debug_peer_on_timeout.u.attr, - &lustre_sattr_dump_on_timeout.u.attr, - &lustre_sattr_dump_on_eviction.u.attr, - &lustre_sattr_at_min.u.attr, - &lustre_sattr_at_max.u.attr, - &lustre_sattr_at_extra.u.attr, - &lustre_sattr_at_early_margin.u.attr, - &lustre_sattr_at_history.u.attr, - &lustre_attr_memused_max.attr, - &lustre_attr_memused.attr, -#ifdef HAVE_SERVER_SUPPORT - &lustre_sattr_ldlm_timeout.u.attr, - &lustre_sattr_bulk_timeout.u.attr, -#endif - &lustre_sattr_lbug_on_eviction.u.attr, - NULL, -}; - -static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) -{ - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static void obd_device_list_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - ++*pos; - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static int obd_device_list_seq_show(struct seq_file *p, void *v) -{ - loff_t index = *(loff_t *)v; - struct obd_device *obd = class_num2obd((int)index); - char *status; - - if (obd == NULL) - return 0; - - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - status = "ST"; - else if (obd->obd_inactive) - status = "IN"; - else if (obd->obd_set_up) - status = "UP"; - else if (obd->obd_attached) - status = "AT"; - else - status = "--"; - - seq_printf(p, "%3d %s %s %s %s %d\n", - (int)index, status, obd->obd_type->typ_name, - obd->obd_name, obd->obd_uuid.uuid, - atomic_read(&obd->obd_refcount)); - return 0; -} - -static const struct seq_operations obd_device_list_sops = { - .start = obd_device_list_seq_start, - .stop = obd_device_list_seq_stop, - .next = obd_device_list_seq_next, - .show = obd_device_list_seq_show, -}; - -static int obd_device_list_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int rc = seq_open(file, &obd_device_list_sops); - - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - return 0; -} - -static const struct file_operations obd_device_list_fops = { - .owner = THIS_MODULE, - .open = obd_device_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -struct kset *lustre_kset; -EXPORT_SYMBOL_GPL(lustre_kset); - -static struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -ssize_t class_set_global(const char *param) -{ - const char *value = strchr(param, '=') + 1; - size_t off = value - param - 1; - ssize_t count = -ENOENT; - int i; - - for (i = 
0; lustre_attrs[i]; i++) { - if (!strncmp(lustre_attrs[i]->name, param, off)) { - count = lustre_attr_store(&lustre_kset->kobj, - lustre_attrs[i], value, - strlen(value)); - break; - } - } - return count; -} - -int class_procfs_init(void) -{ - struct proc_dir_entry *entry; - struct dentry *file; - int rc = -ENOMEM; - - ENTRY; - - lustre_kset = kset_create_and_add("lustre", NULL, fs_kobj); - if (!lustre_kset) - goto out; - - /* Create the files associated with this kobject */ - rc = sysfs_create_group(&lustre_kset->kobj, &lustre_attr_group); - if (rc) { - kset_unregister(lustre_kset); - goto out; - } - - rc = jobid_cache_init(); - if (rc) { - kset_unregister(lustre_kset); - goto out; - } - - debugfs_lustre_root = debugfs_create_dir("lustre", NULL); - if (IS_ERR_OR_NULL(debugfs_lustre_root)) { - rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) - : -ENOMEM; - debugfs_lustre_root = NULL; - kset_unregister(lustre_kset); - goto out; - } - - file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, - &obd_device_list_fops); - if (IS_ERR_OR_NULL(file)) { - rc = file ? PTR_ERR(file) : -ENOMEM; - debugfs_remove(debugfs_lustre_root); - kset_unregister(lustre_kset); - goto out; - } - - entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); - debugfs_remove_recursive(debugfs_lustre_root); - kset_unregister(lustre_kset); - goto out; - } - - proc_lustre_root = entry; -out: - RETURN(rc); -} - -int class_procfs_clean(void) -{ - ENTRY; - - debugfs_remove_recursive(debugfs_lustre_root); - - debugfs_lustre_root = NULL; - jobid_cache_fini(); - - if (proc_lustre_root) - lprocfs_remove(&proc_lustre_root); - - sysfs_remove_group(&lustre_kset->kobj, &lustre_attr_group); - - kset_unregister(lustre_kset); - - RETURN(0); -} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c index 0367cfd1bef67..7d14851f799f0 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,14 +43,15 @@ # include #endif #include +#include #include void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) { - dst->o_parent_oid = fid_oid(parent); - dst->o_parent_seq = fid_seq(parent); - dst->o_parent_ver = fid_ver(parent); - dst->o_valid |= OBD_MD_FLPARENT | OBD_MD_FLFID; + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; } EXPORT_SYMBOL(obdo_set_parent_fid); @@ -61,10 +62,8 @@ void obdo_set_o_projid(struct obdo *dst, u32 projid) } EXPORT_SYMBOL(obdo_set_o_projid); -/* - * WARNING: the file systems must take care not to tinker with - * attributes they don't manage (such as blocks). - */ +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks). 
*/ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) { u64 newvalid = 0; @@ -74,40 +73,40 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) valid, (s64) src->i_mtime.tv_sec, (s64) src->i_ctime.tv_sec); - if (valid & OBD_MD_FLATIME) { + if (valid & OBD_MD_FLATIME) { dst->o_atime = src->i_atime.tv_sec; - newvalid |= OBD_MD_FLATIME; - } - if (valid & OBD_MD_FLMTIME) { + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { dst->o_mtime = src->i_mtime.tv_sec; - newvalid |= OBD_MD_FLMTIME; - } - if (valid & OBD_MD_FLCTIME) { + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { dst->o_ctime = src->i_ctime.tv_sec; - newvalid |= OBD_MD_FLCTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->o_size = i_size_read(src); - newvalid |= OBD_MD_FLSIZE; - } - if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = src->i_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ dst->o_blksize = 1U << src->i_blkbits; - newvalid |= OBD_MD_FLBLKSZ; - } - if (valid & OBD_MD_FLTYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (src->i_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (src->i_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } if (valid & OBD_MD_FLUID) { dst->o_uid = from_kuid(&init_user_ns, src->i_uid); newvalid |= OBD_MD_FLUID; @@ -127,39 +126,39 @@ EXPORT_SYMBOL(obdo_from_inode); void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) { CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", - POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); - if (valid & OBD_MD_FLATIME) - dst->o_atime = src->o_atime; - if (valid & OBD_MD_FLMTIME) - dst->o_mtime = src->o_mtime; - if (valid & OBD_MD_FLCTIME) - dst->o_ctime = src->o_ctime; - if (valid & OBD_MD_FLSIZE) - dst->o_size = src->o_size; - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ - dst->o_blocks = src->o_blocks; - if (valid & OBD_MD_FLBLKSZ) - dst->o_blksize = src->o_blksize; - if (valid & OBD_MD_FLTYPE) - dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); - if (valid & OBD_MD_FLMODE) - dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); - if (valid & OBD_MD_FLUID) - dst->o_uid = src->o_uid; - if (valid & OBD_MD_FLGID) - dst->o_gid = src->o_gid; - if (valid & OBD_MD_FLFLAGS) - dst->o_flags = src->o_flags; - if (valid & OBD_MD_FLFID) { - dst->o_parent_seq = src->o_parent_seq; - dst->o_parent_ver = src->o_parent_ver; - } - if (valid & OBD_MD_FLPARENT) - dst->o_parent_oid = src->o_parent_oid; - if (valid & OBD_MD_FLHANDLE) - dst->o_handle = src->o_handle; - - dst->o_valid |= valid; + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + 
if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; } EXPORT_SYMBOL(obdo_cpy_md); @@ -169,48 +168,39 @@ void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) ostid_set_seq_mdt0(&ioobj->ioo_oid); - /* - * Since 2.4 this does not contain o_mode in the low 16 bits. - * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs - */ + /* Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ ioobj->ioo_max_brw = 0; } EXPORT_SYMBOL(obdo_to_ioobj); -/* +/** * Create an obdo to send over the wire */ void lustre_set_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *wobdo, - const struct obdo *lobdo) + struct obdo *wobdo, + const struct obdo *lobdo) { *wobdo = *lobdo; if (ocd == NULL) return; - if (!(wobdo->o_valid & OBD_MD_FLUID)) - wobdo->o_uid = from_kuid(&init_user_ns, current_uid()); - if (!(wobdo->o_valid & OBD_MD_FLGID)) - wobdo->o_gid = from_kgid(&init_user_ns, current_gid()); - if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { - /* - * Currently OBD_FL_OSTID will only be used when 2.4 echo - * client communicate with pre-2.4 server - */ + /* Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server */ wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); } } EXPORT_SYMBOL(lustre_set_wire_obdo); -/* +/** * Create a local obdo from a wire based odbo */ void lustre_get_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *lobdo, - const struct obdo *wobdo) + struct obdo *lobdo, + const struct obdo *wobdo) { *lobdo = *wobdo; if (ocd == NULL) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c deleted file mode 100644 index 0f7f474f7fbb9..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2017, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/linux/linux-obdo.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include /* for PAGE_SIZE */ -#include - -/*FIXME: Just copy from obdo_from_inode*/ -void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) -{ - u64 newvalid = 0; - - if (valid & LA_ATIME) { - dst->o_atime = la->la_atime; - newvalid |= OBD_MD_FLATIME; - } - if (valid & LA_MTIME) { - dst->o_mtime = la->la_mtime; - newvalid |= OBD_MD_FLMTIME; - } - if (valid & LA_CTIME) { - dst->o_ctime = la->la_ctime; - newvalid |= OBD_MD_FLCTIME; - } - if (valid & LA_SIZE) { - dst->o_size = la->la_size; - newvalid |= OBD_MD_FLSIZE; - } - if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = la->la_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & LA_TYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (la->la_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & LA_MODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (la->la_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } - if (valid & LA_UID) { - dst->o_uid = la->la_uid; - newvalid |= OBD_MD_FLUID; - } - if (valid & LA_GID) { - dst->o_gid = la->la_gid; - newvalid |= OBD_MD_FLGID; - } - if (valid & LA_PROJID) { - dst->o_projid = la->la_projid; - newvalid |= OBD_MD_FLPROJID; - } - if (valid & LA_FLAGS) { - dst->o_flags = la->la_flags; - newvalid |= OBD_MD_FLFLAGS; - } - dst->o_valid |= newvalid; -} -EXPORT_SYMBOL(obdo_from_la); - -/*FIXME: Just copy from obdo_from_inode*/ -void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) -{ - u64 newvalid = 0; - - valid &= obdo->o_valid; - - if (valid & OBD_MD_FLATIME) { - dst->la_atime = obdo->o_atime; - newvalid |= LA_ATIME; - } - if (valid & OBD_MD_FLMTIME) { - dst->la_mtime = obdo->o_mtime; - newvalid |= LA_MTIME; - } - if (valid & OBD_MD_FLCTIME) { - dst->la_ctime = obdo->o_ctime; - newvalid |= LA_CTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->la_size = obdo->o_size; - newvalid |= LA_SIZE; - } - if (valid & OBD_MD_FLBLOCKS) { - dst->la_blocks = obdo->o_blocks; - newvalid |= LA_BLOCKS; - } - if (valid & OBD_MD_FLTYPE) { - dst->la_mode = (dst->la_mode & S_IALLUGO) | - (obdo->o_mode & S_IFMT); - newvalid |= LA_TYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->la_mode = (dst->la_mode & S_IFMT) | - (obdo->o_mode & S_IALLUGO); - newvalid |= LA_MODE; - } - if (valid & OBD_MD_FLUID) { - dst->la_uid = obdo->o_uid; - newvalid |= LA_UID; - } - if (valid & OBD_MD_FLGID) { - dst->la_gid = obdo->o_gid; - newvalid |= LA_GID; - } - if (valid & OBD_MD_FLPROJID) { - dst->la_projid = obdo->o_projid; - newvalid |= LA_PROJID; - } - if (valid & OBD_MD_FLFLAGS) { - dst->la_flags = obdo->o_flags; - newvalid |= LA_FLAGS; - } - dst->la_valid = newvalid; -} -EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c deleted file mode 
100644 index b2e93c6dcc408..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c +++ /dev/null @@ -1,1216 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2017, Intel Corporation. - */ -/* - * lustre/obdclass/scrub.c - * - * The OI scrub is used for checking and (re)building Object Index files - * that are usually backend special. Here are some general scrub related - * functions that can be shared by different backends for OI scrub. - * - * Author: Fan Yong - */ - -#define DEBUG_SUBSYSTEM S_LFSCK - -#include -#include -#include -#include - -static inline struct dt_device *scrub_obj2dev(struct dt_object *obj) -{ - return container_of0(obj->do_lu.lo_dev, struct dt_device, dd_lu_dev); -} - -static void scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) -{ - memcpy(des->sf_uuid, src->sf_uuid, 16); - des->sf_flags = le64_to_cpu(src->sf_flags); - des->sf_magic = le32_to_cpu(src->sf_magic); - des->sf_status = le16_to_cpu(src->sf_status); - des->sf_param = le16_to_cpu(src->sf_param); - des->sf_time_last_complete = - le64_to_cpu(src->sf_time_last_complete); - des->sf_time_latest_start = - le64_to_cpu(src->sf_time_latest_start); - des->sf_time_last_checkpoint = - le64_to_cpu(src->sf_time_last_checkpoint); - des->sf_pos_latest_start = - le64_to_cpu(src->sf_pos_latest_start); - des->sf_pos_last_checkpoint = - le64_to_cpu(src->sf_pos_last_checkpoint); - des->sf_pos_first_inconsistent = - le64_to_cpu(src->sf_pos_first_inconsistent); - des->sf_items_checked = - le64_to_cpu(src->sf_items_checked); - des->sf_items_updated = - le64_to_cpu(src->sf_items_updated); - des->sf_items_failed = - le64_to_cpu(src->sf_items_failed); - des->sf_items_updated_prior = - le64_to_cpu(src->sf_items_updated_prior); - des->sf_run_time = le32_to_cpu(src->sf_run_time); - des->sf_success_count = le32_to_cpu(src->sf_success_count); - des->sf_oi_count = le16_to_cpu(src->sf_oi_count); - des->sf_internal_flags = le16_to_cpu(src->sf_internal_flags); - memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); -} - -static void scrub_file_to_le(struct scrub_file *des, struct scrub_file *src) -{ - memcpy(des->sf_uuid, src->sf_uuid, 16); - des->sf_flags = cpu_to_le64(src->sf_flags); - des->sf_magic = cpu_to_le32(src->sf_magic); - des->sf_status = cpu_to_le16(src->sf_status); - des->sf_param = cpu_to_le16(src->sf_param); - des->sf_time_last_complete = - cpu_to_le64(src->sf_time_last_complete); - des->sf_time_latest_start = - cpu_to_le64(src->sf_time_latest_start); - des->sf_time_last_checkpoint = - cpu_to_le64(src->sf_time_last_checkpoint); - des->sf_pos_latest_start = - cpu_to_le64(src->sf_pos_latest_start); - des->sf_pos_last_checkpoint = - cpu_to_le64(src->sf_pos_last_checkpoint); - 
des->sf_pos_first_inconsistent = - cpu_to_le64(src->sf_pos_first_inconsistent); - des->sf_items_checked = - cpu_to_le64(src->sf_items_checked); - des->sf_items_updated = - cpu_to_le64(src->sf_items_updated); - des->sf_items_failed = - cpu_to_le64(src->sf_items_failed); - des->sf_items_updated_prior = - cpu_to_le64(src->sf_items_updated_prior); - des->sf_run_time = cpu_to_le32(src->sf_run_time); - des->sf_success_count = cpu_to_le32(src->sf_success_count); - des->sf_oi_count = cpu_to_le16(src->sf_oi_count); - des->sf_internal_flags = cpu_to_le16(src->sf_internal_flags); - memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); -} - -void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid) -{ - struct scrub_file *sf = &scrub->os_file; - - memset(sf, 0, sizeof(*sf)); - memcpy(sf->sf_uuid, uuid, 16); - sf->sf_magic = SCRUB_MAGIC_V1; - sf->sf_status = SS_INIT; -} -EXPORT_SYMBOL(scrub_file_init); - -void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags) -{ - struct scrub_file *sf = &scrub->os_file; - - CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " - "%#llx, add flags = %#llx\n", - scrub->os_name, sf->sf_flags, flags); - - memcpy(sf->sf_uuid, uuid, 16); - sf->sf_status = SS_INIT; - sf->sf_flags |= flags; - sf->sf_flags &= ~SF_AUTO; - sf->sf_run_time = 0; - sf->sf_time_latest_start = 0; - sf->sf_time_last_checkpoint = 0; - sf->sf_pos_latest_start = 0; - sf->sf_pos_last_checkpoint = 0; - sf->sf_pos_first_inconsistent = 0; - sf->sf_items_checked = 0; - sf->sf_items_updated = 0; - sf->sf_items_failed = 0; - sf->sf_items_noscrub = 0; - sf->sf_items_igif = 0; - if (!scrub->os_in_join) - sf->sf_items_updated_prior = 0; -} -EXPORT_SYMBOL(scrub_file_reset); - -int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub) -{ - struct scrub_file *sf = &scrub->os_file; - struct lu_buf buf = { - .lb_buf = &scrub->os_file_disk, - .lb_len = sizeof(scrub->os_file_disk) - }; - loff_t pos = 0; - int rc; - - rc = dt_read(env, scrub->os_obj, &buf, &pos); - /* failure */ - if (rc < 0) { - CERROR("%s: fail to load scrub file: rc = %d\n", - scrub->os_name, rc); - return rc; - } - - /* empty */ - if (!rc) - return -ENOENT; - - /* corrupted */ - if (rc < buf.lb_len) { - CDEBUG(D_LFSCK, "%s: fail to load scrub file, " - "expected = %d: rc = %d\n", - scrub->os_name, (int)buf.lb_len, rc); - return -EFAULT; - } - - scrub_file_to_cpu(sf, &scrub->os_file_disk); - if (sf->sf_magic != SCRUB_MAGIC_V1) { - CDEBUG(D_LFSCK, "%s: invalid scrub magic 0x%x != 0x%x\n", - scrub->os_name, sf->sf_magic, SCRUB_MAGIC_V1); - return -EFAULT; - } - - return 0; -} -EXPORT_SYMBOL(scrub_file_load); - -int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub) -{ - struct scrub_file *sf = &scrub->os_file_disk; - struct dt_object *obj = scrub->os_obj; - struct dt_device *dev = scrub_obj2dev(obj); - struct lu_buf buf = { - .lb_buf = sf, - .lb_len = sizeof(*sf) - }; - struct thandle *th; - loff_t pos = 0; - int rc; - ENTRY; - - /* Skip store under rdonly mode. 
*/ - if (dev->dd_rdonly) - RETURN(0); - - scrub_file_to_le(sf, &scrub->os_file); - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - GOTO(log, rc = PTR_ERR(th)); - - rc = dt_declare_record_write(env, obj, &buf, pos, th); - if (rc) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - rc = dt_record_write(env, obj, &buf, &pos, th); - - GOTO(stop, rc); - -stop: - dt_trans_stop(env, dev, th); - -log: - if (rc) - CERROR("%s: store scrub file: rc = %d\n", - scrub->os_name, rc); - else - CDEBUG(D_LFSCK, "%s: store scrub file: rc = %d\n", - scrub->os_name, rc); - - scrub->os_time_last_checkpoint = ktime_get_seconds(); - scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + - SCRUB_CHECKPOINT_INTERVAL; - return rc; -} -EXPORT_SYMBOL(scrub_file_store); - -int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub) -{ - struct scrub_file *sf = &scrub->os_file; - time64_t now = ktime_get_seconds(); - int rc; - - if (likely(now < scrub->os_time_next_checkpoint || - scrub->os_new_checked == 0)) - return 0; - - CDEBUG(D_LFSCK, "%s: OI scrub checkpoint at pos %llu\n", - scrub->os_name, scrub->os_pos_current); - - down_write(&scrub->os_rwsem); - sf->sf_items_checked += scrub->os_new_checked; - scrub->os_new_checked = 0; - sf->sf_pos_last_checkpoint = scrub->os_pos_current; - sf->sf_time_last_checkpoint = ktime_get_real_seconds(); - sf->sf_run_time += now - scrub->os_time_last_checkpoint; - rc = scrub_file_store(env, scrub); - up_write(&scrub->os_rwsem); - - return rc; -} -EXPORT_SYMBOL(scrub_checkpoint); - -int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, - void *data, __u32 flags) -{ - struct ptlrpc_thread *thread = &scrub->os_thread; - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - int rc; - ENTRY; - -again: - /* os_lock: sync status between stop and scrub thread */ - spin_lock(&scrub->os_lock); - if (thread_is_running(thread)) { - spin_unlock(&scrub->os_lock); - RETURN(-EALREADY); - } - - if (unlikely(thread_is_stopping(thread))) { - spin_unlock(&scrub->os_lock); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopped(thread), - &lwi); - goto again; - } - spin_unlock(&scrub->os_lock); - - if (scrub->os_file.sf_status == SS_COMPLETED) { - if (!(flags & SS_SET_FAILOUT)) - flags |= SS_CLEAR_FAILOUT; - - if (!(flags & SS_SET_DRYRUN)) - flags |= SS_CLEAR_DRYRUN; - - flags |= SS_RESET; - } - - scrub->os_start_flags = flags; - thread_set_flags(thread, 0); - task = kthread_run(threadfn, data, "OI_scrub"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("%s: cannot start iteration thread: rc = %d\n", - scrub->os_name, rc); - RETURN(rc); - } - - l_wait_event(thread->t_ctl_waitq, - thread_is_running(thread) || thread_is_stopped(thread), - &lwi); - - RETURN(0); -} -EXPORT_SYMBOL(scrub_start); - -void scrub_stop(struct lustre_scrub *scrub) -{ - struct ptlrpc_thread *thread = &scrub->os_thread; - struct l_wait_info lwi = { 0 }; - - /* os_lock: sync status between stop and scrub thread */ - spin_lock(&scrub->os_lock); - if (!thread_is_init(thread) && !thread_is_stopped(thread)) { - thread_set_flags(thread, SVC_STOPPING); - spin_unlock(&scrub->os_lock); - wake_up_all(&thread->t_ctl_waitq); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopped(thread), - &lwi); - /* Do not skip the last lock/unlock, which can guarantee that - * the caller cannot return until the OI scrub thread exit. 
*/ - spin_lock(&scrub->os_lock); - } - spin_unlock(&scrub->os_lock); -} -EXPORT_SYMBOL(scrub_stop); - -const char *scrub_status_names[] = { - "init", - "scanning", - "completed", - "failed", - "stopped", - "paused", - "crashed", - NULL -}; - -const char *scrub_flags_names[] = { - "recreated", - "inconsistent", - "auto", - "upgrade", - NULL -}; - -const char *scrub_param_names[] = { - "failout", - "dryrun", - NULL -}; - -static void scrub_bits_dump(struct seq_file *m, int bits, const char *names[], - const char *prefix) -{ - int flag; - int i; - - seq_printf(m, "%s:%c", prefix, bits != 0 ? ' ' : '\n'); - - for (i = 0, flag = 1; bits != 0; i++, flag = 1 << i) { - if (flag & bits) { - bits &= ~flag; - seq_printf(m, "%s%c", names[i], - bits != 0 ? ',' : '\n'); - } - } -} - -static void scrub_time_dump(struct seq_file *m, time64_t time, - const char *prefix) -{ - if (time != 0) - seq_printf(m, "%s: %llu seconds\n", prefix, - ktime_get_real_seconds() - time); - else - seq_printf(m, "%s: N/A\n", prefix); -} - -static void scrub_pos_dump(struct seq_file *m, __u64 pos, const char *prefix) -{ - if (pos != 0) - seq_printf(m, "%s: %llu\n", prefix, pos); - else - seq_printf(m, "%s: N/A\n", prefix); -} - -void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub) -{ - struct scrub_file *sf = &scrub->os_file; - u64 checked; - s64 speed; - - down_read(&scrub->os_rwsem); - seq_printf(m, "name: OI_scrub\n" - "magic: 0x%x\n" - "oi_files: %d\n" - "status: %s\n", - sf->sf_magic, (int)sf->sf_oi_count, - scrub_status_names[sf->sf_status]); - - scrub_bits_dump(m, sf->sf_flags, scrub_flags_names, "flags"); - - scrub_bits_dump(m, sf->sf_param, scrub_param_names, "param"); - - scrub_time_dump(m, sf->sf_time_last_complete, - "time_since_last_completed"); - - scrub_time_dump(m, sf->sf_time_latest_start, - "time_since_latest_start"); - - scrub_time_dump(m, sf->sf_time_last_checkpoint, - "time_since_last_checkpoint"); - - scrub_pos_dump(m, sf->sf_pos_latest_start, - "latest_start_position"); - - scrub_pos_dump(m, sf->sf_pos_last_checkpoint, - "last_checkpoint_position"); - - scrub_pos_dump(m, sf->sf_pos_first_inconsistent, - "first_failure_position"); - - checked = sf->sf_items_checked + scrub->os_new_checked; - seq_printf(m, "checked: %llu\n" - "%s: %llu\n" - "failed: %llu\n" - "prior_%s: %llu\n" - "noscrub: %llu\n" - "igif: %llu\n" - "success_count: %u\n", - checked, - sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", - sf->sf_items_updated, sf->sf_items_failed, - sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", - sf->sf_items_updated_prior, sf->sf_items_noscrub, - sf->sf_items_igif, sf->sf_success_count); - - speed = checked; - if (thread_is_running(&scrub->os_thread)) { - s64 new_checked = scrub->os_new_checked; - time64_t duration; - time64_t rtime; - - /* Since the time resolution is in seconds for new system - * or small devices it ismore likely that duration will be - * zero which will lead to inaccurate results. - */ - duration = ktime_get_seconds() - - scrub->os_time_last_checkpoint; - if (duration != 0) - new_checked = div_s64(new_checked, duration); - - rtime = sf->sf_run_time + duration; - if (rtime != 0) - speed = div_s64(speed, rtime); - - seq_printf(m, "run_time: %lld seconds\n" - "average_speed: %lld objects/sec\n" - "real-time_speed: %lld objects/sec\n" - "current_position: %llu\n" - "scrub_in_prior: %s\n" - "scrub_full_speed: %s\n" - "partial_scan: %s\n", - rtime, speed, new_checked, - scrub->os_pos_current, - scrub->os_in_prior ? "yes" : "no", - scrub->os_full_speed ? 
"yes" : "no", - scrub->os_partial_scan ? "yes" : "no"); - } else { - if (sf->sf_run_time != 0) - speed = div_s64(speed, sf->sf_run_time); - seq_printf(m, "run_time: %ld seconds\n" - "average_speed: %lld objects/sec\n" - "real-time_speed: N/A\n" - "current_position: N/A\n", - sf->sf_run_time, speed); - } - - up_read(&scrub->os_rwsem); -} -EXPORT_SYMBOL(scrub_dump); - -int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, - const struct lu_fid *cfid, __u64 child, - const char *name, int namelen) -{ - struct lustre_index_restore_unit *liru; - int len = sizeof(*liru) + namelen + 1; - - OBD_ALLOC(liru, len); - if (!liru) - return -ENOMEM; - - INIT_LIST_HEAD(&liru->liru_link); - liru->liru_pfid = *pfid; - liru->liru_cfid = *cfid; - liru->liru_clid = child; - liru->liru_len = len; - memcpy(liru->liru_name, name, namelen); - liru->liru_name[namelen] = 0; - list_add_tail(&liru->liru_link, head); - - return 0; -} -EXPORT_SYMBOL(lustre_liru_new); - -int lustre_index_register(struct dt_device *dev, const char *devname, - struct list_head *head, spinlock_t *lock, int *guard, - const struct lu_fid *fid, - __u32 keysize, __u32 recsize) -{ - struct lustre_index_backup_unit *libu, *pos; - int rc = 0; - ENTRY; - - if (dev->dd_rdonly || *guard) - RETURN(1); - - OBD_ALLOC_PTR(libu); - if (!libu) - RETURN(-ENOMEM); - - INIT_LIST_HEAD(&libu->libu_link); - libu->libu_keysize = keysize; - libu->libu_recsize = recsize; - libu->libu_fid = *fid; - - spin_lock(lock); - if (unlikely(*guard)) { - spin_unlock(lock); - OBD_FREE_PTR(libu); - - RETURN(1); - } - - list_for_each_entry_reverse(pos, head, libu_link) { - rc = lu_fid_cmp(&pos->libu_fid, fid); - if (rc < 0) { - list_add(&libu->libu_link, &pos->libu_link); - spin_unlock(lock); - - RETURN(0); - } - - if (!rc) { - /* Registered already. But the former registered one - * has different keysize/recsize. It may because that - * the former values are from disk and corrupted, then - * replace it with new values. */ - if (unlikely(keysize != pos->libu_keysize || - recsize != pos->libu_recsize)) { - CWARN("%s: the index "DFID" has registered " - "with %u/%u, may be invalid, replace " - "with %u/%u\n", - devname, PFID(fid), pos->libu_keysize, - pos->libu_recsize, keysize, recsize); - - pos->libu_keysize = keysize; - pos->libu_recsize = recsize; - } else { - rc = 1; - } - - spin_unlock(lock); - OBD_FREE_PTR(libu); - - RETURN(rc); - } - } - - list_add(&libu->libu_link, head); - spin_unlock(lock); - - RETURN(0); -} -EXPORT_SYMBOL(lustre_index_register); - -static void lustre_index_degister(struct list_head *head, spinlock_t *lock, - const struct lu_fid *fid) -{ - struct lustre_index_backup_unit *libu; - int rc = -ENOENT; - - spin_lock(lock); - list_for_each_entry_reverse(libu, head, libu_link) { - rc = lu_fid_cmp(&libu->libu_fid, fid); - /* NOT registered. 
*/ - if (rc < 0) - break; - - if (!rc) { - list_del(&libu->libu_link); - break; - } - } - spin_unlock(lock); - - if (!rc) - OBD_FREE_PTR(libu); -} - -static void -lustre_index_backup_make_header(struct lustre_index_backup_header *header, - __u32 keysize, __u32 recsize, - const struct lu_fid *fid, __u32 count) -{ - memset(header, 0, sizeof(*header)); - header->libh_magic = cpu_to_le32(INDEX_BACKUP_MAGIC_V1); - header->libh_count = cpu_to_le32(count); - header->libh_keysize = cpu_to_le32(keysize); - header->libh_recsize = cpu_to_le32(recsize); - fid_cpu_to_le(&header->libh_owner, fid); -} - -static int lustre_index_backup_body(const struct lu_env *env, - struct dt_object *obj, loff_t *pos, - void *buf, int bufsize) -{ - struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); - struct thandle *th; - struct lu_buf lbuf = { - .lb_buf = buf, - .lb_len = bufsize - }; - int rc; - ENTRY; - - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); - - rc = dt_declare_record_write(env, obj, &lbuf, *pos, th); - if (rc) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - rc = dt_record_write(env, obj, &lbuf, pos, th); - - GOTO(stop, rc); - -stop: - dt_trans_stop(env, dev, th); - return rc; -} - -static int lustre_index_backup_header(const struct lu_env *env, - struct dt_object *obj, - const struct lu_fid *tgt_fid, - __u32 keysize, __u32 recsize, - void *buf, int bufsize, int count) -{ - struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); - struct lustre_index_backup_header *header = buf; - struct lu_attr *la = buf; - struct thandle *th; - struct lu_buf lbuf = { - .lb_buf = header, - .lb_len = sizeof(*header) - }; - loff_t size = sizeof(*header) + (keysize + recsize) * count; - loff_t pos = 0; - int rc; - bool punch = false; - ENTRY; - - LASSERT(sizeof(*la) <= bufsize); - LASSERT(sizeof(*header) <= bufsize); - - rc = dt_attr_get(env, obj, la); - if (rc) - RETURN(rc); - - if (la->la_size > size) - punch = true; - - lustre_index_backup_make_header(header, keysize, recsize, - tgt_fid, count); - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); - - rc = dt_declare_record_write(env, obj, &lbuf, pos, th); - if (rc) - GOTO(stop, rc); - - if (punch) { - rc = dt_declare_punch(env, obj, size, OBD_OBJECT_EOF, th); - if (rc) - GOTO(stop, rc); - } - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - rc = dt_record_write(env, obj, &lbuf, &pos, th); - if (!rc && punch) - rc = dt_punch(env, obj, size, OBD_OBJECT_EOF, th); - - GOTO(stop, rc); - -stop: - dt_trans_stop(env, dev, th); - return rc; -} - -static int lustre_index_update_lma(const struct lu_env *env, - struct dt_object *obj, - void *buf, int bufsize) -{ - struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); - struct lustre_mdt_attrs *lma = buf; - struct lu_buf lbuf = { - .lb_buf = lma, - .lb_len = sizeof(struct lustre_ost_attrs) - }; - struct thandle *th; - int fl = LU_XATTR_REPLACE; - int rc; - ENTRY; - - LASSERT(bufsize >= lbuf.lb_len); - - rc = dt_xattr_get(env, obj, &lbuf, XATTR_NAME_LMA); - if (unlikely(rc == -ENODATA)) { - fl = LU_XATTR_CREATE; - lustre_lma_init(lma, lu_object_fid(&obj->do_lu), - LMAC_IDX_BACKUP, 0); - rc = sizeof(*lma); - } else if (rc < sizeof(*lma)) { - RETURN(rc < 0 ? 
rc : -EFAULT); - } else { - lustre_lma_swab(lma); - if (lma->lma_compat & LMAC_IDX_BACKUP) - RETURN(0); - - lma->lma_compat |= LMAC_IDX_BACKUP; - } - - lustre_lma_swab(lma); - lbuf.lb_len = rc; - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - RETURN(rc); - - rc = dt_declare_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); - if (rc) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - rc = dt_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); - - GOTO(stop, rc); - -stop: - dt_trans_stop(env, dev, th); - return rc; -} - -static int lustre_index_backup_one(const struct lu_env *env, - struct local_oid_storage *los, - struct dt_object *parent, - struct lustre_index_backup_unit *libu, - char *buf, int bufsize) -{ - struct dt_device *dev = scrub_obj2dev(parent); - struct dt_object *tgt_obj = NULL; - struct dt_object *bak_obj = NULL; - const struct dt_it_ops *iops; - struct dt_it *di; - loff_t pos = sizeof(struct lustre_index_backup_header); - int count = 0; - int size = 0; - int rc; - ENTRY; - - tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, - &libu->libu_fid, NULL)); - if (IS_ERR_OR_NULL(tgt_obj)) - GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); - - if (!dt_object_exists(tgt_obj)) - GOTO(out, rc = 0); - - if (!tgt_obj->do_index_ops) { - struct dt_index_features feat; - - feat.dif_flags = DT_IND_UPDATE; - feat.dif_keysize_min = libu->libu_keysize; - feat.dif_keysize_max = libu->libu_keysize; - feat.dif_recsize_min = libu->libu_recsize; - feat.dif_recsize_max = libu->libu_recsize; - feat.dif_ptrsize = 4; - rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, &feat); - if (rc) - GOTO(out, rc); - } - - lustre_fid2lbx(buf, &libu->libu_fid, bufsize); - bak_obj = local_file_find_or_create(env, los, parent, buf, - S_IFREG | S_IRUGO | S_IWUSR); - if (IS_ERR_OR_NULL(bak_obj)) - GOTO(out, rc = bak_obj ? 
PTR_ERR(bak_obj) : -ENOENT); - - iops = &tgt_obj->do_index_ops->dio_it; - di = iops->init(env, tgt_obj, 0); - if (IS_ERR(di)) - GOTO(out, rc = PTR_ERR(di)); - - rc = iops->load(env, di, 0); - if (!rc) - rc = iops->next(env, di); - else if (rc > 0) - rc = 0; - - while (!rc) { - void *key; - void *rec; - - key = iops->key(env, di); - memcpy(&buf[size], key, libu->libu_keysize); - size += libu->libu_keysize; - rec = &buf[size]; - rc = iops->rec(env, di, rec, 0); - if (rc) - GOTO(fini, rc); - - size += libu->libu_recsize; - count++; - if (size + libu->libu_keysize + libu->libu_recsize > bufsize) { - rc = lustre_index_backup_body(env, bak_obj, &pos, - buf, size); - if (rc) - GOTO(fini, rc); - - size = 0; - } - - rc = iops->next(env, di); - } - - if (rc >= 0 && size > 0) - rc = lustre_index_backup_body(env, bak_obj, &pos, buf, size); - - if (rc < 0) - GOTO(fini, rc); - - rc = lustre_index_backup_header(env, bak_obj, &libu->libu_fid, - libu->libu_keysize, libu->libu_recsize, - buf, bufsize, count); - if (!rc) - rc = lustre_index_update_lma(env, tgt_obj, buf, bufsize); - - if (!rc && OBD_FAIL_CHECK(OBD_FAIL_OSD_INDEX_CRASH)) { - LASSERT(bufsize >= 512); - - pos = 0; - memset(buf, 0, 512); - lustre_index_backup_body(env, tgt_obj, &pos, buf, 512); - } - - GOTO(fini, rc); - -fini: - iops->fini(env, di); -out: - if (!IS_ERR_OR_NULL(tgt_obj)) - dt_object_put_nocache(env, tgt_obj); - if (!IS_ERR_OR_NULL(bak_obj)) - dt_object_put_nocache(env, bak_obj); - return rc; -} - -void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, - const char *devname, struct list_head *head, - spinlock_t *lock, int *guard, bool backup) -{ - struct lustre_index_backup_unit *libu; - struct local_oid_storage *los = NULL; - struct dt_object *parent = NULL; - char *buf = NULL; - struct lu_fid fid; - int rc; - ENTRY; - - if (dev->dd_rdonly || *guard) - RETURN_EXIT; - - spin_lock(lock); - *guard = 1; - spin_unlock(lock); - - if (list_empty(head)) - RETURN_EXIT; - - /* Handle kinds of failures during mount process. */ - if (!dev->dd_lu_dev.ld_site || !dev->dd_lu_dev.ld_site->ls_top_dev) - backup = false; - - if (backup) { - OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE); - if (!buf) { - backup = false; - goto scan; - } - - lu_local_obj_fid(&fid, INDEX_BACKUP_OID); - parent = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, - &fid, NULL)); - if (IS_ERR_OR_NULL(parent)) { - CERROR("%s: failed to locate backup dir: rc = %ld\n", - devname, parent ? 
PTR_ERR(parent) : -ENOENT); - backup = false; - goto scan; - } - - lu_local_name_obj_fid(&fid, 1); - rc = local_oid_storage_init(env, dev, &fid, &los); - if (rc) { - CERROR("%s: failed to init local storage: rc = %d\n", - devname, rc); - backup = false; - } - } - -scan: - spin_lock(lock); - while (!list_empty(head)) { - libu = list_entry(head->next, - struct lustre_index_backup_unit, libu_link); - list_del_init(&libu->libu_link); - spin_unlock(lock); - - if (backup) { - rc = lustre_index_backup_one(env, los, parent, libu, - buf, INDEX_BACKUP_BUFSIZE); - CDEBUG(D_WARNING, "%s: backup index "DFID": rc = %d\n", - devname, PFID(&libu->libu_fid), rc); - } - - OBD_FREE_PTR(libu); - spin_lock(lock); - } - spin_unlock(lock); - - if (los) - local_oid_storage_fini(env, los); - if (parent) - dt_object_put_nocache(env, parent); - if (buf) - OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE); - - EXIT; -} -EXPORT_SYMBOL(lustre_index_backup); - -int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, - const struct lu_fid *parent_fid, - const struct lu_fid *tgt_fid, - const struct lu_fid *bak_fid, const char *name, - struct list_head *head, spinlock_t *lock, - char *buf, int bufsize) -{ - struct dt_object *parent_obj = NULL; - struct dt_object *tgt_obj = NULL; - struct dt_object *bak_obj = NULL; - struct lustre_index_backup_header *header; - struct dt_index_features *feat; - struct dt_object_format *dof; - struct lu_attr *la; - struct thandle *th; - struct lu_object_conf conf; - struct dt_insert_rec ent; - struct lu_buf lbuf; - struct lu_fid tfid; - loff_t pos = 0; - __u32 keysize; - __u32 recsize; - __u32 pairsize; - int count; - int rc; - bool registered = false; - ENTRY; - - LASSERT(bufsize >= sizeof(*la) + sizeof(*dof) + - sizeof(*feat) + sizeof(*header)); - - memset(buf, 0, bufsize); - la = (struct lu_attr *)buf; - dof = (void *)la + sizeof(*la); - feat = (void *)dof + sizeof(*dof); - header = (void *)feat + sizeof(*feat); - lbuf.lb_buf = header; - lbuf.lb_len = sizeof(*header); - - tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, - tgt_fid, NULL)); - if (IS_ERR_OR_NULL(tgt_obj)) - GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); - - bak_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, - bak_fid, NULL)); - if (IS_ERR_OR_NULL(bak_obj)) - GOTO(out, rc = bak_obj ? PTR_ERR(bak_obj) : -ENOENT); - - if (!dt_object_exists(bak_obj)) - GOTO(out, rc = -ENOENT); - - parent_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, - parent_fid, NULL)); - if (IS_ERR_OR_NULL(parent_obj)) - GOTO(out, rc = parent_obj ? PTR_ERR(parent_obj) : -ENOENT); - - LASSERT(dt_object_exists(parent_obj)); - - if (unlikely(!dt_try_as_dir(env, parent_obj))) - GOTO(out, rc = -ENOTDIR); - - rc = dt_attr_get(env, tgt_obj, la); - if (rc) - GOTO(out, rc); - - rc = dt_record_read(env, bak_obj, &lbuf, &pos); - if (rc) - GOTO(out, rc); - - if (le32_to_cpu(header->libh_magic) != INDEX_BACKUP_MAGIC_V1) - GOTO(out, rc = -EINVAL); - - fid_le_to_cpu(&tfid, &header->libh_owner); - if (unlikely(!lu_fid_eq(tgt_fid, &tfid))) - GOTO(out, rc = -EINVAL); - - keysize = le32_to_cpu(header->libh_keysize); - recsize = le32_to_cpu(header->libh_recsize); - pairsize = keysize + recsize; - - memset(feat, 0, sizeof(*feat)); - feat->dif_flags = DT_IND_UPDATE; - feat->dif_keysize_min = feat->dif_keysize_max = keysize; - feat->dif_recsize_min = feat->dif_recsize_max = recsize; - feat->dif_ptrsize = 4; - - /* T1: remove old name entry and destroy old index. 
*/ - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - GOTO(out, rc = PTR_ERR(th)); - - rc = dt_declare_delete(env, parent_obj, - (const struct dt_key *)name, th); - if (rc) - GOTO(stop, rc); - - rc = dt_declare_destroy(env, tgt_obj, th); - if (rc) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - rc = dt_delete(env, parent_obj, (const struct dt_key *)name, th); - if (rc) - GOTO(stop, rc); - - dt_write_lock(env, tgt_obj, 0); - rc = dt_destroy(env, tgt_obj, th); - dt_write_unlock(env, tgt_obj); - dt_trans_stop(env, dev, th); - if (rc) - GOTO(out, rc); - - la->la_valid = LA_MODE | LA_UID | LA_GID; - conf.loc_flags = LOC_F_NEW; - dof->u.dof_idx.di_feat = feat; - dof->dof_type = DFT_INDEX; - ent.rec_type = S_IFREG; - ent.rec_fid = tgt_fid; - - /* Drop cache before re-create it. */ - dt_object_put_nocache(env, tgt_obj); - tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, - tgt_fid, &conf)); - if (IS_ERR_OR_NULL(tgt_obj)) - GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); - - LASSERT(!dt_object_exists(tgt_obj)); - - /* T2: create new index and insert new name entry. */ - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - GOTO(out, rc = PTR_ERR(th)); - - rc = dt_declare_create(env, tgt_obj, la, NULL, dof, th); - if (rc) - GOTO(stop, rc); - - rc = dt_declare_insert(env, parent_obj, (const struct dt_rec *)&ent, - (const struct dt_key *)name, th); - if (rc) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - dt_write_lock(env, tgt_obj, 0); - rc = dt_create(env, tgt_obj, la, NULL, dof, th); - dt_write_unlock(env, tgt_obj); - if (rc) - GOTO(stop, rc); - - rc = dt_insert(env, parent_obj, (const struct dt_rec *)&ent, - (const struct dt_key *)name, th); - dt_trans_stop(env, dev, th); - /* Some index name may has been inserted by OSD - * automatically when create the index object. */ - if (unlikely(rc == -EEXIST)) - rc = 0; - if (rc) - GOTO(out, rc); - - /* The new index will register via index_try. */ - rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, feat); - if (rc) - GOTO(out, rc); - - registered = true; - count = le32_to_cpu(header->libh_count); - while (!rc && count > 0) { - int size = pairsize * count; - int items = count; - int i; - - if (size > bufsize) { - items = bufsize / pairsize; - size = pairsize * items; - } - - lbuf.lb_buf = buf; - lbuf.lb_len = size; - rc = dt_record_read(env, bak_obj, &lbuf, &pos); - for (i = 0; i < items && !rc; i++) { - void *key = &buf[i * pairsize]; - void *rec = &buf[i * pairsize + keysize]; - - /* Tn: restore the records. */ - th = dt_trans_create(env, dev); - if (!th) - GOTO(out, rc = -ENOMEM); - - rc = dt_declare_insert(env, tgt_obj, rec, key, th); - if (rc) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc) - GOTO(stop, rc); - - rc = dt_insert(env, tgt_obj, rec, key, th); - if (unlikely(rc == -EEXIST)) - rc = 0; - - dt_trans_stop(env, dev, th); - } - - count -= items; - } - - GOTO(out, rc); - -stop: - dt_trans_stop(env, dev, th); - if (rc && registered) - /* Degister the index to avoid overwriting the backup. 
*/ - lustre_index_degister(head, lock, tgt_fid); - -out: - if (!IS_ERR_OR_NULL(tgt_obj)) - dt_object_put_nocache(env, tgt_obj); - if (!IS_ERR_OR_NULL(bak_obj)) - dt_object_put_nocache(env, bak_obj); - if (!IS_ERR_OR_NULL(parent_obj)) - dt_object_put_nocache(env, parent_obj); - return rc; -} -EXPORT_SYMBOL(lustre_index_restore); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c index 9c52f8094e9fe..2a36051e52356 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -46,28 +46,28 @@ void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) { - memset(osfs, 0, sizeof(*osfs)); - osfs->os_type = sfs->f_type; - osfs->os_blocks = sfs->f_blocks; - osfs->os_bfree = sfs->f_bfree; - osfs->os_bavail = sfs->f_bavail; - osfs->os_files = sfs->f_files; - osfs->os_ffree = sfs->f_ffree; - osfs->os_bsize = sfs->f_bsize; - osfs->os_namelen = sfs->f_namelen; + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; } EXPORT_SYMBOL(statfs_pack); void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) { - memset(sfs, 0, sizeof(*sfs)); - sfs->f_type = osfs->os_type; - sfs->f_blocks = osfs->os_blocks; - sfs->f_bfree = osfs->os_bfree; - sfs->f_bavail = osfs->os_bavail; - sfs->f_files = osfs->os_files; - sfs->f_ffree = osfs->os_ffree; - sfs->f_bsize = osfs->os_bsize; - sfs->f_namelen = osfs->os_namelen; + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; } EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c index 5622410784d7a..2112733e50c54 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -35,8 +35,9 @@ */ #define DEBUG_SUBSYSTEM S_SEC +#include #include -#include +#include #include static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, @@ -114,14 +115,14 @@ static inline void put_entry(struct upcall_cache *cache, static int check_unlink_entry(struct upcall_cache *cache, struct upcall_cache_entry *entry) { - time64_t now = ktime_get_seconds(); - - if (UC_CACHE_IS_VALID(entry) && now < entry->ue_expire) + if (UC_CACHE_IS_VALID(entry) && + cfs_time_before(cfs_time_current(), entry->ue_expire)) return 0; if (UC_CACHE_IS_ACQUIRING(entry)) { if (entry->ue_acquire_expire == 0 || - now < entry->ue_acquire_expire) + cfs_time_before(cfs_time_current(), + entry->ue_acquire_expire)) return 0; UC_CACHE_SET_EXPIRED(entry); @@ -197,8 +198,8 @@ struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, spin_unlock(&cache->uc_lock); rc = refresh_entry(cache, entry); spin_lock(&cache->uc_lock); - entry->ue_acquire_expire = ktime_get_seconds() + - cache->uc_acquire_expire; + entry->ue_acquire_expire = + cfs_time_shift(cache->uc_acquire_expire); if (rc < 0) { UC_CACHE_CLEAR_ACQUIRING(entry); UC_CACHE_SET_INVALID(entry); @@ -339,7 +340,7 @@ int 
upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, if (rc) GOTO(out, rc); - entry->ue_expire = ktime_get_seconds() + cache->uc_entry_expire; + entry->ue_expire = cfs_time_shift(cache->uc_entry_expire); UC_CACHE_SET_VALID(entry); CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", cache->uc_name, entry, entry->ue_key); @@ -399,10 +400,10 @@ void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) if (found) { CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " - "cur %lld, ex %lld/%lld\n", + "cur %lu, ex %ld/%ld\n", cache->uc_name, entry, entry->ue_key, atomic_read(&entry->ue_refcount), entry->ue_flags, - ktime_get_real_seconds(), entry->ue_acquire_expire, + cfs_time_current_sec(), entry->ue_acquire_expire, entry->ue_expire); UC_CACHE_SET_EXPIRED(entry); if (!atomic_read(&entry->ue_refcount)) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/uuid.c b/drivers/staging/lustrefsx/lustre/obdclass/uuid.c new file mode 100644 index 0000000000000..cc0092687511b --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/uuid.c @@ -0,0 +1,78 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +static inline size_t consume(size_t nob, __u8 **ptr) +{ + size_t value; + + LASSERT(nob <= sizeof(value)); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +static void uuid_unpack(class_uuid_t in, __u16 *uu, size_t nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c index 0f97a830f9b37..de7fd77920392 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -59,21 +59,6 @@ enum { LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1 }; -struct echo_srv_device { - struct lu_device esd_dev; - struct lu_target esd_lut; -}; - -static inline struct echo_srv_device *echo_srv_dev(struct lu_device *d) -{ - return container_of0(d, struct echo_srv_device, esd_dev); -} - -static inline struct obd_device *echo_srv_obd(struct echo_srv_device *esd) -{ - return esd->esd_dev.ld_obd; -} - static int echo_connect(const struct lu_env *env, struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, @@ -130,6 +115,115 @@ static u64 echo_next_id(struct obd_device *obddev) return id; } +static int echo_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + + if (!(oa->o_mode & S_IFMT)) { + CERROR("echo obd: no type!\n"); + return -ENOENT; + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("invalid o_valid %#llx\n", oa->o_valid); + return -EINVAL; + } + + ostid_set_seq_echo(&oa->o_oi); + if (ostid_set_id(&oa->o_oi, echo_next_id(obd))) { + CERROR("Bad %llu to set " DOSTID "\n", + echo_next_id(obd), POSTID(&oa->o_oi)); + return -EINVAL; + } + oa->o_valid = OBD_MD_FLID; + + return 0; +} + +static int echo_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static int echo_getattr(const 
struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + u64 id = ostid_id(&oa->o_oi); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); + RETURN(-EINVAL); + } + + obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid); + ostid_set_seq_echo(&oa->o_oi); + if (ostid_set_id(&oa->o_oi, id)) { + CERROR("Bad %llu to set " DOSTID "\n", + id, POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static int echo_setattr(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa) +{ + struct obd_device *obd = class_exp2obd(exp); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); + RETURN(-EINVAL); + } + + obd->u.echo.eo_oa = *oa; + + RETURN(0); +} + static void echo_page_debug_setup(struct page *page, int rw, u64 id, __u64 offset, int len) @@ -454,317 +548,41 @@ static int echo_commitrw(const struct lu_env *env, int cmd, LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); static struct lprocfs_vars lprocfs_echo_obd_vars[] = { - { .name = "uuid", - .fops = &echo_uuid_fops }, + { .name = "uuid", + .fops = &echo_uuid_fops }, { NULL } }; -struct obd_ops echo_obd_ops = { - .o_owner = THIS_MODULE, - .o_connect = echo_connect, - .o_disconnect = echo_disconnect, - .o_init_export = echo_init_export, - .o_destroy_export = echo_destroy_export, - .o_preprw = echo_preprw, - .o_commitrw = echo_commitrw, -}; - -/** - * Echo Server request handler for OST_CREATE RPC. - * - * This is part of request processing. Its simulates the object - * creation on OST. - * - * \param[in] tsi target session environment for this request - * - * \retval 0 if successful - * \retval negative value on error - */ -static int esd_create_hdl(struct tgt_session_info *tsi) -{ - const struct obdo *oa = &tsi->tsi_ost_body->oa; - struct obd_device *obd = tsi->tsi_exp->exp_obd; - struct ost_body *repbody; - struct obdo *rep_oa; - - ENTRY; - - repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); - if (repbody == NULL) - RETURN(-ENOMEM); - - if (!(oa->o_mode & S_IFMT)) { - CERROR("%s: no type is set in obdo!\n", - tsi->tsi_exp->exp_obd->obd_name); - RETURN(-ENOENT); - } - - if (!(oa->o_valid & OBD_MD_FLTYPE)) { - CERROR("%s: invalid o_valid in obdo: %#llx\n", - tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); - RETURN(-EINVAL); - } - - rep_oa = &repbody->oa; - - if (!fid_seq_is_echo(ostid_seq(&oa->o_oi))) { - CERROR("%s: invalid seq %#llx\n", - tsi->tsi_exp->exp_obd->obd_name, ostid_seq(&oa->o_oi)); - return -EINVAL; - } - - ostid_set_seq_echo(&rep_oa->o_oi); - ostid_set_id(&rep_oa->o_oi, echo_next_id(obd)); - - CDEBUG(D_INFO, "%s: Create object "DOSTID"\n", - tsi->tsi_exp->exp_obd->obd_name, POSTID(&rep_oa->o_oi)); - - rep_oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; - - RETURN(0); -} - -/** - * Echo Server request handler for OST_DESTROY RPC. - * - * This is Echo Server part of request handling. It simulates the objects - * destroy on OST. 
- * - * \param[in] tsi target session environment for this request - * - * \retval 0 if successful - * \retval negative value on error - */ -static int esd_destroy_hdl(struct tgt_session_info *tsi) -{ - const struct obdo *oa = &tsi->tsi_ost_body->oa; - struct obd_device *obd = tsi->tsi_exp->exp_obd; - struct ost_body *repbody; - u64 oid; - - ENTRY; - - oid = ostid_id(&oa->o_oi); - LASSERT(oid != 0); - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("%s: obdo missing FLID valid flag: %#llx\n", - tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); - RETURN(-EINVAL); - } - - repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); - - if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || - ostid_id(&oa->o_oi) < ECHO_INIT_OID) { - CERROR("%s: bad objid to destroy: "DOSTID"\n", - tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); - RETURN(-EINVAL); - } - - CDEBUG(D_INFO, "%s: Destroy object "DOSTID"\n", - tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); - - repbody->oa.o_oi = oa->o_oi; - RETURN(0); -} - -/** - * Echo Server request handler for OST_GETATTR RPC. - * - * This is Echo Server part of request handling. It returns an object - * attributes to the client. All objects have the same attributes in - * Echo Server. - * - * \param[in] tsi target session environment for this request - * - * \retval 0 if successful - * \retval negative value on error - */ -static int esd_getattr_hdl(struct tgt_session_info *tsi) -{ - const struct obdo *oa = &tsi->tsi_ost_body->oa; - struct obd_device *obd = tsi->tsi_exp->exp_obd; - struct ost_body *repbody; - - ENTRY; - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("%s: obdo missing FLID valid flag: %#llx\n", - tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); - RETURN(-EINVAL); - } - - repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); - if (repbody == NULL) - RETURN(-ENOMEM); - - repbody->oa.o_oi = oa->o_oi; - repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - obdo_cpy_md(&repbody->oa, &obd->u.echo.eo_oa, oa->o_valid); - - repbody->oa.o_valid |= OBD_MD_FLFLAGS; - repbody->oa.o_flags = OBD_FL_FLUSH; - - RETURN(0); -} - -/** - * Echo Server request handler for OST_SETATTR RPC. - * - * This is Echo Server part of request handling. It sets common - * attributes from request to the Echo Server objects. - * - * \param[in] tsi target session environment for this request - * - * \retval 0 if successful - * \retval negative value on error - */ -static int esd_setattr_hdl(struct tgt_session_info *tsi) -{ - struct ost_body *body = tsi->tsi_ost_body; - struct obd_device *obd = tsi->tsi_exp->exp_obd; - struct ost_body *repbody; - - ENTRY; - - if (!(body->oa.o_valid & OBD_MD_FLID)) { - CERROR("%s: obdo missing FLID valid flag: %#llx\n", - tsi->tsi_exp->exp_obd->obd_name, - body->oa.o_valid); - RETURN(-EINVAL); - } - - repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); - if (repbody == NULL) - RETURN(-ENOMEM); - - repbody->oa.o_oi = body->oa.o_oi; - repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - obd->u.echo.eo_oa = body->oa; - - RETURN(0); -} - -#define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET -#define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET -#define OST_BRW_READ OST_READ -#define OST_BRW_WRITE OST_WRITE - -/** - * Table of Echo Server specific request handlers - * - * This table contains all opcodes accepted by Echo Server and - * specifies handlers for them. The tgt_request_handler() - * uses such table from each target to process incoming - * requests. 
- */ -static struct tgt_handler esd_tgt_handlers[] = { -TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_CONNECT, tgt_connect, - &RQF_CONNECT, LUSTRE_OBD_VERSION), -TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_DISCONNECT, tgt_disconnect, - &RQF_OST_DISCONNECT, LUSTRE_OBD_VERSION), -TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_GETATTR, esd_getattr_hdl), -TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR, OST_SETATTR, - esd_setattr_hdl), -TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_CREATE, esd_create_hdl), -TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_DESTROY, esd_destroy_hdl), -TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read), -TGT_OST_HDL(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write), -}; - -static struct tgt_opc_slice esd_common_slice[] = { - { - .tos_opc_start = OST_FIRST_OPC, - .tos_opc_end = OST_LAST_OPC, - .tos_hs = esd_tgt_handlers - }, - { - .tos_opc_start = OBD_FIRST_OPC, - .tos_opc_end = OBD_LAST_OPC, - .tos_hs = tgt_obd_handlers - }, - { - .tos_opc_start = LDLM_FIRST_OPC, - .tos_opc_end = LDLM_LAST_OPC, - .tos_hs = tgt_dlm_handlers - }, - { - .tos_opc_start = SEC_FIRST_OPC, - .tos_opc_end = SEC_LAST_OPC, - .tos_hs = tgt_sec_ctx_handlers - }, - { - .tos_hs = NULL - } -}; - -/** - * lu_device_operations matrix for ECHO SRV device is NULL, - * this device is just serving incoming requests immediately - * without building a stack of lu_devices. - */ -static struct lu_device_operations echo_srv_lu_ops = { 0 }; - -/** - * Initialize Echo Server device with parameters in the config log \a cfg. - * - * This is the main starting point of Echo Server initialization. It fills all - * parameters with their initial values and starts Echo Server. - * - * \param[in] env execution environment - * \param[in] m Echo Server device - * \param[in] ldt LU device type of Echo Server - * \param[in] cfg configuration log - * - * \retval 0 if successful - * \retval negative value on error - */ -static int echo_srv_init0(const struct lu_env *env, - struct echo_srv_device *esd, - struct lu_device_type *ldt, struct lustre_cfg *cfg) +static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { - const char *dev = lustre_cfg_string(cfg, 0); - struct obd_device *obd; - char ns_name[48]; - int rc; - + int rc; + __u64 lock_flags = 0; + struct ldlm_res_id res_id = {.name = {1}}; + char ns_name[48]; ENTRY; - obd = class_name2obd(dev); - if (obd == NULL) { - CERROR("Cannot find obd with name %s\n", dev); - RETURN(-ENODEV); - } - + obd->u.echo.eo_obt.obt_magic = OBT_MAGIC; spin_lock_init(&obd->u.echo.eo_lock); - obd->u.echo.eo_lastino = ECHO_INIT_OID; - - esd->esd_dev.ld_ops = &echo_srv_lu_ops; - esd->esd_dev.ld_obd = obd; - /* set this lu_device to obd, because error handling need it */ - obd->obd_lu_dev = &esd->esd_dev; - - /* No connection accepted until configurations will finish */ - spin_lock(&obd->obd_dev_lock); - obd->obd_no_conn = 1; - spin_unlock(&obd->obd_dev_lock); - - /* non-replayable target */ - obd->obd_replayable = 0; + obd->u.echo.eo_lastino = ECHO_INIT_OID; + + sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (obd->obd_namespace == NULL) { + LBUG(); + RETURN(-ENOMEM); + } - snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid); - obd->obd_namespace = ldlm_namespace_new(obd, ns_name, - LDLM_NAMESPACE_SERVER, - LDLM_NAMESPACE_MODEST, - LDLM_NS_TYPE_OST); - if (obd->obd_namespace == NULL) - RETURN(-ENOMEM); + rc = 
ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN, + NULL, LCK_NL, &lock_flags, NULL, + ldlm_completion_ast, NULL, NULL, 0, + LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock); + LASSERT (rc == ELDLM_OK); obd->obd_vars = lprocfs_echo_obd_vars; - if (!lprocfs_obd_setup(obd, true) && + if (lprocfs_obd_setup(obd) == 0 && lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, LPROCFS_CNTR_AVGMINMAX, @@ -776,158 +594,48 @@ static int echo_srv_init0(const struct lu_env *env, ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, "echo_ldlm_cb_client", &obd->obd_ldlm_client); - - rc = tgt_init(env, &esd->esd_lut, obd, NULL, esd_common_slice, - OBD_FAIL_OST_ALL_REQUEST_NET, - OBD_FAIL_OST_ALL_REPLY_NET); - if (rc) - GOTO(err_out, rc); - - spin_lock(&obd->obd_dev_lock); - obd->obd_no_conn = 0; - spin_unlock(&obd->obd_dev_lock); - - RETURN(0); - -err_out: - ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); - obd->obd_namespace = NULL; - - lprocfs_obd_cleanup(obd); - lprocfs_free_obd_stats(obd); - RETURN(rc); + RETURN(0); } -/** - * Stop the Echo Server device. - * - * This function stops the Echo Server device and all its subsystems. - * This is the end of Echo Server lifecycle. - * - * \param[in] env execution environment - * \param[in] esd ESD device - */ -static void echo_srv_fini(const struct lu_env *env, - struct echo_srv_device *esd) +static int echo_cleanup(struct obd_device *obd) { - struct obd_device *obd = echo_srv_obd(esd); - struct lu_device *d = &esd->esd_dev; int leaked; - ENTRY; - class_disconnect_exports(obd); - if (obd->obd_namespace != NULL) - ldlm_namespace_free_prior(obd->obd_namespace, NULL, - obd->obd_force); - - obd_exports_barrier(obd); - obd_zombie_barrier(); + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); - tgt_fini(env, &esd->esd_lut); + ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL); - if (obd->obd_namespace != NULL) { - ldlm_namespace_free_post(obd->obd_namespace); - obd->obd_namespace = NULL; - } + /* XXX Bug 3413; wait for a bit to ensure the BL callback has + * happened before calling ldlm_namespace_free() */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); - lprocfs_obd_cleanup(obd); - lprocfs_free_obd_stats(obd); + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; leaked = atomic_read(&obd->u.echo.eo_prep); if (leaked != 0) CERROR("%d prep/commitrw pages leaked\n", leaked); - LASSERT(atomic_read(&d->ld_ref) == 0); - EXIT; -} - -/** - * Implementation of lu_device_type_operations::ldto_device_fini. - * - * Finalize device. Dual to echo_srv_device_init(). It is called from - * obd_precleanup() and stops the current device. - * - * \param[in] env execution environment - * \param[in] d LU device of ESD - * - * \retval NULL - */ -static struct lu_device *echo_srv_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - ENTRY; - echo_srv_fini(env, echo_srv_dev(d)); - RETURN(NULL); -} - -/** - * Implementation of lu_device_type_operations::ldto_device_free. - * - * Free Echo Server device. Dual to echo_srv_device_alloc(). 
- * - * \param[in] env execution environment - * \param[in] d LU device of ESD - * - * \retval NULL - */ -static struct lu_device *echo_srv_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct echo_srv_device *esd = echo_srv_dev(d); - - lu_device_fini(&esd->esd_dev); - OBD_FREE_PTR(esd); - RETURN(NULL); -} - -/** - * Implementation of lu_device_type_operations::ldto_device_alloc. - * - * This function allocates the new Echo Server device. It is called from - * obd_setup() if OBD device had lu_device_type defined. - * - * \param[in] env execution environment - * \param[in] t lu_device_type of ESD device - * \param[in] cfg configuration log - * - * \retval pointer to the lu_device of just allocated OFD - * \retval ERR_PTR of return value on error - */ -static struct lu_device *echo_srv_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct echo_srv_device *esd; - struct lu_device *l; - int rc; - - OBD_ALLOC_PTR(esd); - if (esd == NULL) - return ERR_PTR(-ENOMEM); - - l = &esd->esd_dev; - lu_device_init(l, t); - rc = echo_srv_init0(env, esd, t, cfg); - if (rc != 0) { - echo_srv_device_free(env, l); - l = ERR_PTR(rc); - } - - return l; + RETURN(0); } -static const struct lu_device_type_operations echo_srv_type_ops = { - .ldto_device_alloc = echo_srv_device_alloc, - .ldto_device_free = echo_srv_device_free, - .ldto_device_fini = echo_srv_device_fini -}; - -struct lu_device_type echo_srv_type = { - .ldt_tags = LU_DEVICE_DT, - .ldt_name = LUSTRE_ECHO_NAME, - .ldt_ops = &echo_srv_type_ops, - .ldt_ctx_tags = LCT_DT_THREAD, +struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_create = echo_create, + .o_destroy = echo_destroy, + .o_getattr = echo_getattr, + .o_setattr = echo_setattr, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, + .o_setup = echo_setup, + .o_cleanup = echo_cleanup }; void echo_persistent_pages_fini(void) diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c index 53620c7e19c37..26065b110e592 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #ifdef HAVE_SERVER_SUPPORT # include @@ -328,8 +328,7 @@ static void echo_page_completion(const struct lu_env *env, } static void echo_page_fini(const struct lu_env *env, - struct cl_page_slice *slice, - struct pagevec *pvec) + struct cl_page_slice *slice) { struct echo_object *eco = cl2echo_obj(slice->cpl_obj); ENTRY; @@ -507,18 +506,11 @@ static int echo_object_init(const struct lu_env *env, struct lu_object *obj, RETURN(0); } -static void echo_object_delete(const struct lu_env *env, struct lu_object *obj) +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) { - struct echo_object *eco = cl2echo_obj(lu2cl(obj)); - struct echo_client_obd *ec; - - ENTRY; - - /* object delete called unconditolally - layer init or not */ - if (eco->eo_dev == NULL) - return; - - ec = eco->eo_dev->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + ENTRY; LASSERT(atomic_read(&eco->eo_npages) == 0); @@ -526,18 +518,11 @@ static void echo_object_delete(const struct lu_env *env, struct lu_object *obj) list_del_init(&eco->eo_obj_chain); spin_unlock(&ec->ec_lock); + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + if (eco->eo_oinfo != NULL) OBD_FREE_PTR(eco->eo_oinfo); -} - -static void echo_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct echo_object *eco = cl2echo_obj(lu2cl(obj)); - - ENTRY; - - lu_object_fini(obj); - lu_object_header_fini(obj->lo_header); OBD_SLAB_FREE_PTR(eco, echo_object_kmem); EXIT; @@ -552,12 +537,12 @@ static int echo_object_print(const struct lu_env *env, void *cookie, } static const struct lu_object_operations echo_lu_obj_ops = { - .loo_object_init = echo_object_init, - .loo_object_delete = echo_object_delete, - .loo_object_release = NULL, - .loo_object_free = echo_object_free, - .loo_object_print = echo_object_print, - .loo_object_invariant = NULL + .loo_object_init = echo_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL }; /** @} echo_lu_ops */ @@ -977,18 +962,19 @@ static struct lu_device *echo_device_alloc(const struct lu_env *env, CERROR("Cleanup obd device %s error(%d)\n", obd->obd_name, rc2); } - /* fallthrough */ + /* Fall through */ case 3: echo_site_fini(env, ed); - /* fallthrough */ + /* Fall through */ case 2: cl_device_fini(&ed->ed_cl); - /* fallthrough */ + /* Fall through */ case 1: OBD_FREE_PTR(ed); - /* fallthrough */ + /* Fall through */ case 0: + /* Fall through */ default: break; } @@ -1728,7 +1714,7 @@ static int echo_create_md_object(const struct lu_env *env, memset(spec, 0, sizeof(*spec)); echo_set_lmm_size(env, ld, ma); if (stripe_count != 0) { - spec->sp_cr_flags |= MDS_FMODE_WRITE; + spec->sp_cr_flags |= FMODE_WRITE; if (stripe_count != -1) { if (S_ISDIR(mode)) { struct lmv_user_md *lmu; @@ -1756,7 +1742,7 @@ static int echo_create_md_object(const struct lu_env *env, ma->ma_attr.la_mode = mode; ma->ma_attr.la_valid = LA_CTIME | LA_MODE; - ma->ma_attr.la_ctime = ktime_get_real_seconds(); + ma->ma_attr.la_ctime = cfs_time_current_64(); if (name != NULL) { lname->ln_name = name; @@ -2099,7 +2085,7 @@ static int echo_destroy_object(const struct lu_env *env, memset(ma, 0, sizeof(*ma)); ma->ma_attr.la_mode = mode; ma->ma_attr.la_valid = LA_CTIME; - ma->ma_attr.la_ctime = 
ktime_get_real_seconds(); + ma->ma_attr.la_ctime = cfs_time_current_64(); ma->ma_need = MA_INODE; ma->ma_valid = 0; @@ -2593,11 +2579,11 @@ static int echo_client_prep_commit(const struct lu_env *env, u64 offset, u64 count, u64 batch, int async) { - struct obd_ioobj ioo; - struct niobuf_local *lnb; - struct niobuf_remote rnb; - u64 off; - u64 npages, tot_pages, apc; + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; int i, ret = 0, brw_flags = 0; ENTRY; @@ -2608,7 +2594,7 @@ static int echo_client_prep_commit(const struct lu_env *env, apc = npages = batch >> PAGE_SHIFT; tot_pages = count >> PAGE_SHIFT; - OBD_ALLOC_LARGE(lnb, apc * sizeof(struct niobuf_local)); + OBD_ALLOC(lnb, apc * sizeof(struct niobuf_local)); if (lnb == NULL) RETURN(-ENOMEM); @@ -2674,7 +2660,7 @@ static int echo_client_prep_commit(const struct lu_env *env, } out: - OBD_FREE_LARGE(lnb, apc * sizeof(struct niobuf_local)); + OBD_FREE(lnb, apc * sizeof(struct niobuf_local)); RETURN(ret); } @@ -2776,9 +2762,6 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = lu_env_init(env, LCT_DT_THREAD); if (rc) GOTO(out_alloc, rc = -ENOMEM); - lu_env_add(env); - if (rc) - GOTO(out_env_fini, rc = -ENOMEM); #ifdef HAVE_SERVER_SUPPORT env->le_ses = &echo_session; @@ -2920,8 +2903,6 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, lu_context_fini(env->le_ses); out_env: #endif - lu_env_remove(env); -out_env_fini: lu_env_fini(env); out_alloc: OBD_FREE_PTR(env); @@ -3091,15 +3072,15 @@ static int __init obdecho_init(void) goto failed_0; rc = class_register_type(&echo_obd_ops, NULL, true, NULL, - LUSTRE_ECHO_NAME, &echo_srv_type); + LUSTRE_ECHO_NAME, NULL); if (rc != 0) goto failed_1; # endif rc = lu_kmem_init(echo_caches); if (rc == 0) { - rc = class_register_type(&echo_client_obd_ops, NULL, false, - NULL, LUSTRE_ECHO_CLIENT_NAME, + rc = class_register_type(&echo_client_obd_ops, NULL, true, NULL, + LUSTRE_ECHO_CLIENT_NAME, &echo_device_type); if (rc) lu_kmem_fini(echo_caches); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h index 469d68e94f02f..8c72c40ebb767 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -45,7 +45,6 @@ #ifdef HAVE_SERVER_SUPPORT extern struct obd_ops echo_obd_ops; -extern struct lu_device_type echo_srv_type; int echo_persistent_pages_init(void); void echo_persistent_pages_fini(void); #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c index ab8cfca3601eb..d6123c61af113 100644 --- a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,78 +37,69 @@ #include #include #include -#include - #include "osc_internal.h" -static ssize_t active_show(struct kobject *kobj, struct attribute *attr, - char *buf) +#ifdef CONFIG_PROC_FS +static int osc_active_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - int rc; + struct obd_device *dev = m->private; LPROCFS_CLIMP_CHECK(dev); - rc = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); + seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); LPROCFS_CLIMP_EXIT(dev); - return rc; + return 0; } -static ssize_t active_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t osc_active_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - bool val; + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > 1) + return -ERANGE; /* opposite senses */ if (dev->u.cli.cl_import->imp_deactive == val) rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); else - CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", - (unsigned int)val); + CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", + (int)val); return count; } -LUSTRE_RW_ATTR(active); +LPROC_SEQ_FOPS(osc_active); -static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *dev = m->private; struct client_obd *cli = &dev->u.cli; - ssize_t len; spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); + seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); spin_unlock(&cli->cl_loi_list_lock); - return len; + return 0; } -static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; - int adding, added, req_count; - unsigned int val; int rc; + int adding, added, req_count; + __s64 val; - rc = kstrtouint(buffer, 0, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - - if (val == 0 || val > OSC_MAX_RIF_MAX) + if (val < 1 || val > OSC_MAX_RIF_MAX) return -ERANGE; LPROCFS_CLIMP_CHECK(dev); @@ -135,42 +126,41 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, LPROCFS_CLIMP_EXIT(dev); return count; } -LUSTRE_RW_ATTR(max_rpcs_in_flight); +LPROC_SEQ_FOPS(osc_max_rpcs_in_flight); -static ssize_t max_dirty_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct client_obd *cli = &dev->u.cli; - unsigned long val; + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long val; + int mult; 
spin_lock(&cli->cl_loi_list_lock); - val = PAGES_TO_MiB(cli->cl_dirty_max_pages); + val = cli->cl_dirty_max_pages; spin_unlock(&cli->cl_loi_list_lock); - return sprintf(buf, "%lu\n", val); + mult = 1 << (20 - PAGE_SHIFT); + return lprocfs_seq_read_frac_helper(m, val, mult); } -static ssize_t max_dirty_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; - unsigned long pages_number, max_dirty_mb; int rc; + __s64 pages_number; - rc = kstrtoul(buffer, 10, &max_dirty_mb); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; - pages_number = MiB_TO_PAGES(max_dirty_mb); + pages_number >>= PAGE_SHIFT; - if (pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + if (pages_number <= 0 || + pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; @@ -181,12 +171,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, return count; } -LUSTRE_RW_ATTR(max_dirty_mb); - -LUSTRE_ATTR(ost_conn_uuid, 0444, conn_uuid_show, NULL); -LUSTRE_RO_ATTR(conn_uuid); - -LUSTRE_RW_ATTR(ping); +LPROC_SEQ_FOPS(osc_max_dirty_mb); static int osc_cached_mb_seq_show(struct seq_file *m, void *v) { @@ -206,9 +191,9 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v) } /* shrink the number of caching pages to a specific number */ -static ssize_t osc_cached_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t +osc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; @@ -219,13 +204,14 @@ static ssize_t osc_cached_mb_seq_write(struct file *file, if (count >= sizeof(kernbuf)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, + &pages_number, 'M'); if (rc) return rc; @@ -248,25 +234,19 @@ static ssize_t osc_cached_mb_seq_write(struct file *file, return count; } - LPROC_SEQ_FOPS(osc_cached_mb); -static ssize_t cur_dirty_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *dev = m->private; struct client_obd *cli = &dev->u.cli; - ssize_t len; spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); + seq_printf(m, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); spin_unlock(&cli->cl_loi_list_lock); - - return len; + return 0; } -LUSTRE_RO_ATTR(cur_dirty_bytes); +LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes); static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) { @@ -285,17 +265,17 @@ static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, { struct obd_device *obd = ((struct 
seq_file *)file->private_data)->private; struct client_obd *cli = &obd->u.cli; - s64 val; - int rc; + int rc; + __s64 val; if (obd == NULL) return 0; - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); if (rc) return rc; if (val < 0) - return val; + return -ERANGE; /* this is only for shrinking grant */ spin_lock(&cli->cl_loi_list_lock); @@ -310,89 +290,102 @@ static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) rc = osc_shrink_grant_to_target(cli, val); LPROCFS_CLIMP_EXIT(obd); - - return rc ? rc : count; + if (rc) + return rc; + return count; } LPROC_SEQ_FOPS(osc_cur_grant_bytes); -static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + + spin_lock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", cli->cl_lost_grant); + spin_unlock(&cli->cl_loi_list_lock); + return 0; +} +LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); + +static int osc_cur_dirty_grant_bytes_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *dev = m->private; struct client_obd *cli = &dev->u.cli; - ssize_t len; spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_lost_grant); + seq_printf(m, "%lu\n", cli->cl_dirty_grant); spin_unlock(&cli->cl_loi_list_lock); - return len; + return 0; } -LUSTRE_RO_ATTR(cur_lost_grant_bytes); +LPROC_SEQ_FOPS_RO(osc_cur_dirty_grant_bytes); -static ssize_t grant_shrink_interval_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; - return sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); + if (obd == NULL) + return 0; + seq_printf(m, "%d\n", + obd->u.cli.cl_grant_shrink_interval); + return 0; } -static ssize_t grant_shrink_interval_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - unsigned int val; + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; int rc; + __s64 val; - rc = kstrtouint(buffer, 0, &val); + if (obd == NULL) + return 0; + + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - if (val == 0) + if (val <= 0 || val > INT_MAX) return -ERANGE; obd->u.cli.cl_grant_shrink_interval = val; return count; } -LUSTRE_RW_ATTR(grant_shrink_interval); +LPROC_SEQ_FOPS(osc_grant_shrink_interval); -static ssize_t checksums_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_checksum_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; - return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); + if (obd == NULL) + return 0; + + seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 
1 : 0); + return 0; } -static ssize_t checksums_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_checksum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - bool val; + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + if (obd == NULL) + return 0; + + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - obd->u.cli.cl_checksum = val; + obd->u.cli.cl_checksum = !!val; return count; } -LUSTRE_RW_ATTR(checksums); +LPROC_SEQ_FOPS(osc_checksum); static int osc_checksum_type_seq_show(struct seq_file *m, void *v) { @@ -429,7 +422,7 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, if (count > sizeof(kernbuf) - 1) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; if (count > 0 && kernbuf[count - 1] == '\n') kernbuf[count - 1] = '\0'; @@ -448,147 +441,139 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, } LPROC_SEQ_FOPS(osc_checksum_type); -static ssize_t resend_count_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_resend_count_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; - return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); + seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends)); + return 0; } -static ssize_t resend_count_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_resend_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - unsigned int val; + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; int rc; + __s64 val; - rc = kstrtouint(buffer, 10, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > INT_MAX) + return -EINVAL; + atomic_set(&obd->u.cli.cl_resends, val); return count; } -LUSTRE_RW_ATTR(resend_count); +LPROC_SEQ_FOPS(osc_resend_count); -static ssize_t checksum_dump_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_checksum_dump_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); + struct obd_device *obd = m->private; + + if (obd == NULL) + return 0; - return sprintf(buf, "%d\n", obd->u.cli.cl_checksum_dump ? 1 : 0); + seq_printf(m, "%d\n", obd->u.cli.cl_checksum_dump ? 1 : 0); + return 0; } -static ssize_t checksum_dump_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_checksum_dump_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - bool val; + struct obd_device *obd; int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + obd = ((struct seq_file *)file->private_data)->private; + if (obd == NULL) + return 0; + + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; - obd->u.cli.cl_checksum_dump = val; + obd->u.cli.cl_checksum_dump = (val ? 
1 : 0); return count; } -LUSTRE_RW_ATTR(checksum_dump); +LPROC_SEQ_FOPS(osc_checksum_dump); -static ssize_t contention_seconds_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_contention_seconds_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); - return sprintf(buf, "%lld\n", od->od_contention_time); + seq_printf(m, "%u\n", od->od_contention_time); + return 0; } -static ssize_t contention_seconds_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_contention_seconds_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct osc_device *od = obd2osc_dev(obd); - unsigned int val; + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); int rc; + __s64 val; - rc = kstrtouint(buffer, 0, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; od->od_contention_time = val; return count; } -LUSTRE_RW_ATTR(contention_seconds); +LPROC_SEQ_FOPS(osc_contention_seconds); -static ssize_t lockless_truncate_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = m->private; + struct osc_device *od = obd2osc_dev(obd); - return sprintf(buf, "%u\n", od->od_lockless_truncate); + seq_printf(m, "%u\n", od->od_lockless_truncate); + return 0; } -static ssize_t lockless_truncate_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t osc_lockless_truncate_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct osc_device *od = obd2osc_dev(obd); - bool val; + struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct osc_device *od = obd2osc_dev(obd); int rc; + __s64 val; - rc = kstrtobool(buffer, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc) return rc; + if (val < 0) + return -ERANGE; - od->od_lockless_truncate = val; + od->od_lockless_truncate = !!val; return count; } -LUSTRE_RW_ATTR(lockless_truncate); +LPROC_SEQ_FOPS(osc_lockless_truncate); -static ssize_t destroys_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - - return sprintf(buf, "%u\n", - atomic_read(&obd->u.cli.cl_destroy_in_flight)); + struct obd_device *obd = m->private; + seq_printf(m, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); + return 0; } -LUSTRE_RO_ATTR(destroys_in_flight); +LPROC_SEQ_FOPS_RO(osc_destroys_in_flight); LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); -LUSTRE_RW_ATTR(short_io_bytes); - -#ifdef CONFIG_PROC_FS static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = m->private; @@ -606,154 +591,84 @@ 
static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) } LPROC_SEQ_FOPS_RO(osc_unstable_stats); -static ssize_t idle_timeout_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct client_obd *cli = &obd->u.cli; - int ret; - - LPROCFS_CLIMP_CHECK(obd); - ret = sprintf(buf, "%u\n", cli->cl_import->imp_idle_timeout); - LPROCFS_CLIMP_EXIT(obd); - - return ret; -} - -static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct client_obd *cli = &dev->u.cli; - struct ptlrpc_request *req; - unsigned int idle_debug = 0; - unsigned int val; - int rc; - - if (strncmp(buffer, "debug", 5) == 0) { - idle_debug = D_CONSOLE; - } else if (strncmp(buffer, "nodebug", 6) == 0) { - idle_debug = D_HA; - } else { - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - if (val > CONNECTION_SWITCH_MAX) - return -ERANGE; - } - - LPROCFS_CLIMP_CHECK(dev); - if (idle_debug) { - cli->cl_import->imp_idle_debug = idle_debug; - } else { - if (!val) { - /* initiate the connection if it's in IDLE state */ - req = ptlrpc_request_alloc(cli->cl_import, - &RQF_OST_STATFS); - if (req != NULL) - ptlrpc_req_finished(req); - } - cli->cl_import->imp_idle_timeout = val; - } - LPROCFS_CLIMP_EXIT(dev); - - return count; -} -LUSTRE_RW_ATTR(idle_timeout); - -static ssize_t idle_connect_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct client_obd *cli = &dev->u.cli; - struct ptlrpc_request *req; - - LPROCFS_CLIMP_CHECK(dev); - /* to initiate the connection if it's in IDLE state */ - req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_STATFS); - if (req) - ptlrpc_req_finished(req); - ptlrpc_pinger_force(cli->cl_import); - LPROCFS_CLIMP_EXIT(dev); - - return count; -} -LUSTRE_WO_ATTR(idle_connect); - -static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_import *imp; - ssize_t len; - - LPROCFS_CLIMP_CHECK(obd); - imp = obd->u.cli.cl_import; - len = snprintf(buf, PAGE_SIZE, "%d\n", - !imp->imp_grant_shrink_disabled && - OCD_HAS_FLAG(&imp->imp_connect_data, GRANT_SHRINK)); - LPROCFS_CLIMP_EXIT(obd); - - return len; -} - -static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct obd_import *imp; - bool val; - int rc; - - if (dev == NULL) - return 0; - - rc = kstrtobool(buffer, &val); - if (rc) - return rc; - - LPROCFS_CLIMP_CHECK(dev); - - imp = dev->u.cli.cl_import; - spin_lock(&imp->imp_lock); - imp->imp_grant_shrink_disabled = !val; - spin_unlock(&imp->imp_lock); - - LPROCFS_CLIMP_EXIT(dev); - - return count; -} -LUSTRE_RW_ATTR(grant_shrink); - +LPROC_SEQ_FOPS_RO_TYPE(osc, uuid); LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); +LPROC_SEQ_FOPS_RO_TYPE(osc, blksize); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree); +LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail); +LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal); +LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree); LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); +LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); 
LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); LPROC_SEQ_FOPS_RO_TYPE(osc, state); +LPROC_SEQ_FOPS_WO_TYPE(osc, ping); + LPROC_SEQ_FOPS_RW_TYPE(osc, import); LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { .name = "uuid", + .fops = &osc_uuid_fops }, + { .name = "ping", + .fops = &osc_ping_fops, + .proc_mode = 0222 }, { .name = "connect_flags", .fops = &osc_connect_flags_fops }, + { .name = "blocksize", + .fops = &osc_blksize_fops }, + { .name = "kbytestotal", + .fops = &osc_kbytestotal_fops }, + { .name = "kbytesfree", + .fops = &osc_kbytesfree_fops }, + { .name = "kbytesavail", + .fops = &osc_kbytesavail_fops }, + { .name = "filestotal", + .fops = &osc_filestotal_fops }, + { .name = "filesfree", + .fops = &osc_filesfree_fops }, { .name = "ost_server_uuid", .fops = &osc_server_uuid_fops }, - { .name = "max_pages_per_rpc", - .fops = &osc_obd_max_pages_per_rpc_fops }, + { .name = "ost_conn_uuid", + .fops = &osc_conn_uuid_fops }, + { .name = "active", + .fops = &osc_active_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, + { .name = "max_rpcs_in_flight", + .fops = &osc_max_rpcs_in_flight_fops }, + { .name = "destroys_in_flight", + .fops = &osc_destroys_in_flight_fops }, + { .name = "max_dirty_mb", + .fops = &osc_max_dirty_mb_fops }, { .name = "osc_cached_mb", .fops = &osc_cached_mb_fops }, - { .name = "cur_grant_bytes", - .fops = &osc_cur_grant_bytes_fops }, + { .name = "cur_dirty_bytes", + .fops = &osc_cur_dirty_bytes_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, + { .name = "cur_lost_grant_bytes", + .fops = &osc_cur_lost_grant_bytes_fops }, + { .name = "cur_dirty_grant_bytes", + .fops = &osc_cur_dirty_grant_bytes_fops }, + { .name = "grant_shrink_interval", + .fops = &osc_grant_shrink_interval_fops }, + { .name = "checksums", + .fops = &osc_checksum_fops }, { .name = "checksum_type", .fops = &osc_checksum_type_fops }, + { .name = "checksum_dump", + .fops = &osc_checksum_dump_fops }, + { .name = "resend_count", + .fops = &osc_resend_count_fops }, { .name = "timeouts", .fops = &osc_timeouts_fops }, + { .name = "contention_seconds", + .fops = &osc_contention_seconds_fops }, + { .name = "lockless_truncate", + .fops = &osc_lockless_truncate_fops }, { .name = "import", .fops = &osc_import_fops }, { .name = "state", @@ -765,6 +680,8 @@ struct lprocfs_vars lprocfs_osc_obd_vars[] = { { NULL } }; +#define pct(a,b) (b ? a * 100 / b : 0) + static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) { struct timespec64 now; @@ -803,7 +720,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", 1 << i, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), @@ -826,7 +743,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", i, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), @@ -849,10 +766,10 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", - (i == 0) ? 
0 : 1 << (i - 1), - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); if (read_cum == read_tot && write_cum == write_tot) break; } @@ -861,6 +778,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) return 0; } +#undef pct static ssize_t osc_rpc_stats_seq_write(struct file *file, const char __user *buf, @@ -914,7 +832,7 @@ static ssize_t osc_stats_seq_write(struct file *file, LPROC_SEQ_FOPS(osc_stats); -int lprocfs_osc_attach_seqstat(struct obd_device *dev) +int lproc_osc_attach_seqstat(struct obd_device *dev) { int rc; @@ -927,77 +845,3 @@ int lprocfs_osc_attach_seqstat(struct obd_device *dev) return rc; } #endif /* CONFIG_PROC_FS */ - -static struct attribute *osc_attrs[] = { - &lustre_attr_active.attr, - &lustre_attr_checksums.attr, - &lustre_attr_checksum_dump.attr, - &lustre_attr_contention_seconds.attr, - &lustre_attr_cur_dirty_bytes.attr, - &lustre_attr_cur_lost_grant_bytes.attr, - &lustre_attr_destroys_in_flight.attr, - &lustre_attr_grant_shrink_interval.attr, - &lustre_attr_lockless_truncate.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_attr_max_rpcs_in_flight.attr, - &lustre_attr_short_io_bytes.attr, - &lustre_attr_resend_count.attr, - &lustre_attr_ost_conn_uuid.attr, - &lustre_attr_conn_uuid.attr, - &lustre_attr_ping.attr, - &lustre_attr_idle_timeout.attr, - &lustre_attr_idle_connect.attr, - &lustre_attr_grant_shrink.attr, - NULL, -}; - -int osc_tunables_init(struct obd_device *obd) -{ -#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) - struct obd_type *type; -#endif - int rc; - - obd->obd_vars = lprocfs_osc_obd_vars; -#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) - /* If this is true then both client (osc) and server (osp) are on the - * same node. The osp layer if loaded first will register the osc proc - * directory. In that case this obd_device will be attached its proc - * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. - */ - type = class_search_type(LUSTRE_OSP_NAME); - if (type && type->typ_procsym) { - obd->obd_proc_entry = lprocfs_register(obd->obd_name, - type->typ_procsym, - obd->obd_vars, obd); - if (IS_ERR(obd->obd_proc_entry)) { - rc = PTR_ERR(obd->obd_proc_entry); - CERROR("error %d setting up lprocfs for %s\n", rc, - obd->obd_name); - obd->obd_proc_entry = NULL; - } - } -#endif - obd->obd_ktype.default_attrs = osc_attrs; - rc = lprocfs_obd_setup(obd, false); - if (rc) - return rc; -#ifdef CONFIG_PROC_FS - /* If the basic OSC proc tree construction succeeded then - * lets do the rest. - */ - rc = lprocfs_osc_attach_seqstat(obd); - if (rc) - goto obd_cleanup; - -#endif /* CONFIG_PROC_FS */ - rc = sptlrpc_lprocfs_cliobd_attach(obd); - if (rc) - goto obd_cleanup; - - ptlrpc_lprocfs_register_obd(obd); -obd_cleanup: - if (rc) - lprocfs_obd_cleanup(obd); - return rc; -} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c index 5652e74222bea..178340e255ac9 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
* */ /* @@ -37,9 +37,7 @@ #define DEBUG_SUBSYSTEM S_OSC -#include -#include - +#include "osc_cl_internal.h" #include "osc_internal.h" static int extent_debug; /* set it to be true for more debug */ @@ -216,7 +214,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, GOTO(out, rc = 60); if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) GOTO(out, rc = 65); - /* fallthrough */ + /* Fall through */ default: if (atomic_read(&ext->oe_users) > 0) GOTO(out, rc = 70); @@ -228,9 +226,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, if (ext->oe_sync && ext->oe_grants > 0) GOTO(out, rc = 90); - if (ext->oe_dlmlock != NULL && - ext->oe_dlmlock->l_resource->lr_type == LDLM_EXTENT && - !ldlm_is_failed(ext->oe_dlmlock)) { + if (ext->oe_dlmlock != NULL && !ldlm_is_failed(ext->oe_dlmlock)) { struct ldlm_extent *extent; extent = &ext->oe_dlmlock->l_policy_data.l_extent; @@ -596,10 +592,7 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) if (grant > 0) osc_unreserve_grant(cli, 0, grant); - if (ext->oe_hp) - list_move_tail(&ext->oe_link, - &obj->oo_hp_exts); - else if (ext->oe_urgent) + if (ext->oe_urgent) list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); else if (ext->oe_nr_pages == ext->oe_mppr) { @@ -704,7 +697,7 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); - if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) + if (chunk > ext_chk_end + 1) break; /* if covering by different locks, no chance to match */ @@ -981,7 +974,6 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, struct client_obd *cli = osc_cli(obj); struct osc_async_page *oap; struct osc_async_page *tmp; - struct pagevec *pvec; int pages_in_chunk = 0; int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; @@ -1003,11 +995,9 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, if (IS_ERR(env)) RETURN(PTR_ERR(env)); - io = osc_env_thread_io(env); + io = &osc_env_info(env)->oti_io; io->ci_obj = cl_object_top(osc2cl(obj)); io->ci_ignore_layout = 1; - pvec = &osc_env_info(env)->oti_pagevec; - ll_pagevec_init(pvec, 0); rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); if (rc < 0) GOTO(out, rc); @@ -1045,13 +1035,11 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, } lu_ref_del(&page->cp_reference, "truncate", current); - cl_pagevec_put(env, page, pvec); + cl_page_put(env, page); --ext->oe_nr_pages; ++nr_pages; } - pagevec_release(pvec); - EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, ext->oe_nr_pages == 0), ext, "trunc_index %lu, partial %d\n", trunc_index, partial); @@ -1296,7 +1284,7 @@ static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, ENTRY; result = cl_page_make_ready(env, page, CRT_WRITE); if (result == 0) - opg->ops_submit_time = ktime_get(); + opg->ops_submit_time = cfs_time_current(); RETURN(result); } @@ -1307,6 +1295,7 @@ static int osc_refresh_count(const struct lu_env *env, pgoff_t index = osc_index(oap2osc(oap)); struct cl_object *obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int result; loff_t kms; @@ -1352,7 +1341,7 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, /* Clear opg->ops_transfer_pinned before VM lock is released. 
*/ opg->ops_transfer_pinned = 0; - opg->ops_submit_time = ktime_set(0, 0); + opg->ops_submit_time = 0; srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; /* statistic */ @@ -1403,6 +1392,7 @@ static void osc_consume_write_grant(struct client_obd *cli, { assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + atomic_long_inc(&obd_dirty_pages); cli->cl_dirty_pages++; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", @@ -1426,6 +1416,11 @@ static void osc_release_write_grant(struct client_obd *cli, pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_long_dec(&obd_dirty_pages); cli->cl_dirty_pages--; + if (pga->flag & OBD_BRW_NOCACHE) { + pga->flag &= ~OBD_BRW_NOCACHE; + atomic_long_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit--; + } EXIT; } @@ -1531,7 +1526,7 @@ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) */ static int osc_enter_cache_try(struct client_obd *cli, struct osc_async_page *oap, - int bytes) + int bytes, int transient) { int rc; @@ -1541,38 +1536,31 @@ static int osc_enter_cache_try(struct client_obd *cli, if (rc < 0) return 0; - if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { - if (atomic_long_add_return(1, &obd_dirty_pages) <= - obd_max_dirty_pages) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - return 1; + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && + 1 + atomic_long_read(&obd_dirty_pages) <= obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit++; + atomic_long_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; } - atomic_long_dec(&obd_dirty_pages); + rc = 1; + } else { + __osc_unreserve_grant(cli, bytes, bytes); + rc = 0; } - __osc_unreserve_grant(cli, bytes, bytes); - return 0; + return rc; } -/* Following two inlines exist to pass code fragments - * to wait_event_idle_exclusive_timeout_cmd(). Passing - * code fragments as macro args can look confusing, so - * we provide inlines to encapsulate them. - */ -static inline void cli_unlock_and_unplug(const struct lu_env *env, - struct client_obd *cli, - struct osc_async_page *oap) +static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) { + int rc; + spin_lock(&cli->cl_loi_list_lock); + rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - osc_io_unplug_async(env, cli, NULL); - CDEBUG(D_CACHE, - "%s: sleeping for cache space for %p\n", - cli_name(cli), oap); + return rc; } -static inline void cli_lock_after_unplug(struct client_obd *cli) -{ - spin_lock(&cli->cl_loi_list_lock); -} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1583,24 +1571,16 @@ static inline void cli_lock_after_unplug(struct client_obd *cli) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - int rc = -EDQUOT; - int remain; - bool entered = false; - /* We cannot wait for a long time here since we are holding ldlm lock - * across the actual IO. If no requests complete fast (e.g. due to - * overloaded OST that takes a long time to process everything, we'd - * get evicted if we wait for a normal obd_timeout or some such. 
- * So we try to wait half the time it would take the client to be - * evicted by server which is half obd_timeout when AT is off - * or at least ldlm_enqueue_min with AT on. - * See LU-13131 */ - unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : - ldlm_enqueue_min / 2); - + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + struct osc_cache_waiter ocw; + struct l_wait_info lwi; + int rc = -EDQUOT; ENTRY; + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); spin_lock(&cli->cl_loi_list_lock); @@ -1614,40 +1594,76 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* - * We can wait here for two reasons: too many dirty pages in cache, or + /* Hopefully normal case - cache space and write credits available */ + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + GOTO(out, rc = 0); + } + + /* We can get here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition (other than success) is no avail grants - * and no dirty pages caching, that really means there is no space - * on the OST. - */ - remain = wait_event_idle_exclusive_timeout_cmd( - cli->cl_cache_waiters, - (entered = osc_enter_cache_try(cli, oap, bytes)) || - (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), - timeout, - cli_unlock_and_unplug(env, cli, oap), - cli_lock_after_unplug(cli)); - - if (entered) { - if (remain == timeout) - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - else - OSC_DUMP_GRANT(D_CACHE, cli, - "finally got grant space\n"); - wake_up(&cli->cl_cache_waiters); - rc = 0; - } else if (remain == 0) { + * The exiting condition is no avail grants and no dirty pages caching, + * that really means there is no space on the OST. 
*/ + init_waitqueue_head(&ocw.ocw_waitq); + ocw.ocw_oap = oap; + ocw.ocw_grant = bytes; + while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { + list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); + ocw.ocw_rc = 0; + spin_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug_async(env, cli, NULL); + + CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", + cli_name(cli), &ocw, oap); + + rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); + + spin_lock(&cli->cl_loi_list_lock); + + if (rc < 0) { + /* l_wait_event is interrupted by signal or timed out */ + list_del_init(&ocw.ocw_entry); + break; + } + LASSERT(list_empty(&ocw.ocw_entry)); + rc = ocw.ocw_rc; + + if (rc != -EDQUOT) + break; + if (osc_enter_cache_try(cli, oap, bytes, 0)) { + rc = 0; + break; + } + } + + switch (rc) { + case 0: + OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); + break; + case -ETIMEDOUT: OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - } else { + rc = -EDQUOT; + break; + case -EINTR: + /* Ensures restartability - LU-3581 */ + OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); + rc = -ERESTARTSYS; + break; + case -EDQUOT: OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - wake_up_all(&cli->cl_cache_waiters); + break; + default: + CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " + "due to %d, fall back to sync i/o\n", + cli_name(cli), &ocw, rc); + break; } EXIT; out: @@ -1655,6 +1671,41 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, RETURN(rc); } +/* caller must hold loi_list_lock */ +void osc_wake_cache_waiters(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct osc_cache_waiter *ocw; + + ENTRY; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); + list_del_init(&ocw->ocw_entry); + + ocw->ocw_rc = -EDQUOT; + /* we can't dirty more */ + if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) || + (1 + atomic_long_read(&obd_dirty_pages) > + obd_max_dirty_pages)) { + CDEBUG(D_CACHE, "no dirty room: dirty: %ld " + "osc max %ld, sys max %ld\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages, + obd_max_dirty_pages); + goto wakeup; + } + + if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + ocw->ocw_rc = 0; +wakeup: + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", + ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } + + EXIT; +} + static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = !!list_empty(&osc->oo_hp_exts); @@ -1694,9 +1745,8 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. - */ - if (waitqueue_active(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. 
*/ + if (!list_empty(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -1918,7 +1968,6 @@ static int try_to_add_extent_for_io(struct client_obd *cli, if (tmp->oe_srvlock != ext->oe_srvlock || !tmp->oe_grants != !ext->oe_grants || - tmp->oe_ndelay != ext->oe_ndelay || tmp->oe_no_merge || ext->oe_no_merge) RETURN(0); @@ -1994,6 +2043,7 @@ static unsigned int get_write_extents(struct osc_object *obj, while (!list_empty(&obj->oo_hp_exts)) { ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, oe_link); + LASSERT(ext->oe_state == OES_CACHE); if (!try_to_add_extent_for_io(cli, ext, &data)) return data.erd_page_count; EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); @@ -2179,9 +2229,8 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold - */ - if (waitqueue_active(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold */ + if (!list_empty(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); @@ -2212,12 +2261,7 @@ __must_hold(&cli->cl_loi_list_lock) OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - /* even if we have reached our max in flight RPCs, we still - * allow all high-priority RPCs through to prevent their - * starvation and leading to server evicting us for not - * writing out pages in a timely manner LU-13131 */ - if (osc_max_rpc_in_flight(cli, osc) && - list_empty(&osc->oo_hp_exts)) { + if (osc_max_rpc_in_flight(cli, osc)) { __osc_list_maint(cli, osc); break; } @@ -2272,8 +2316,8 @@ __must_hold(&cli->cl_loi_list_lock) } } -int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc, int async) +static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) { int rc = 0; @@ -2291,7 +2335,18 @@ int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, } return rc; } -EXPORT_SYMBOL(osc_io_unplug0); + +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} + +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, struct page *page, loff_t offset) @@ -2311,6 +2366,9 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap->oap_obj_off = offset; LASSERT(!(offset & ~PAGE_MASK)); + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags = OBD_BRW_NOQUOTA; + INIT_LIST_HEAD(&oap->oap_pending_item); INIT_LIST_HEAD(&oap->oap_rpc_item); @@ -2319,7 +2377,6 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap, page, oap->oap_obj_off); RETURN(0); } -EXPORT_SYMBOL(osc_prep_async_page); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, struct osc_page *ops) @@ -2350,7 +2407,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before the page is queued. */ brw_flags |= ops->ops_srvlock ? 
OBD_BRW_SRVLOCK : 0; - if (oio->oi_cap_sys_resource || io->ci_noquota) { + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } @@ -2406,7 +2463,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* it doesn't need any grant to dirty this page */ spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants); + rc = osc_enter_cache_try(cli, oap, grants, 0); spin_unlock(&cli->cl_loi_list_lock); if (rc == 0) { /* try failed */ grants = 0; @@ -2483,11 +2540,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, ++ext->oe_nr_pages; list_add_tail(&oap->oap_pending_item, &ext->oe_pages); osc_object_unlock(osc); - - if (!ext->oe_layout_version) - ext->oe_layout_version = io->ci_layout_version; } - RETURN(rc); } @@ -2673,9 +2726,8 @@ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) RETURN(rc); } -int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, - struct osc_object *obj, struct list_head *list, - int brw_flags) +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags) { struct client_obd *cli = osc_cli(obj); struct osc_extent *ext; @@ -2713,7 +2765,7 @@ int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, RETURN(-ENOMEM); } - ext->oe_rw = !!(brw_flags & OBD_BRW_READ); + ext->oe_rw = !!(cmd & OBD_BRW_READ); ext->oe_sync = 1; ext->oe_no_merge = !can_merge; ext->oe_urgent = 1; @@ -2721,52 +2773,15 @@ int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, ext->oe_end = ext->oe_max_end = end; ext->oe_obj = obj; ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); - ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); - if (brw_flags & OBD_BRW_NOCACHE && !ext->oe_rw) { /* direct io write */ - int grants; - int ppc; - - ppc = 1 << (cli->cl_chunkbits - PAGE_SHIFT); - grants = cli->cl_grant_extent_tax; - grants += (1 << cli->cl_chunkbits) * - ((page_count + ppc - 1) / ppc); - - spin_lock(&cli->cl_loi_list_lock); - if (osc_reserve_grant(cli, grants) == 0) { - list_for_each_entry(oap, list, oap_pending_item) { - osc_consume_write_grant(cli, - &oap->oap_brw_page); - atomic_long_inc(&obd_dirty_pages); - } - __osc_unreserve_grant(cli, grants, 0); - ext->oe_grants = grants; - } - spin_unlock(&cli->cl_loi_list_lock); - } ext->oe_nr_pages = page_count; ext->oe_mppr = mppr; list_splice_init(list, &ext->oe_pages); - ext->oe_layout_version = io->ci_layout_version; osc_object_lock(obj); /* Reuse the initial refcount for RPC, don't drop it */ osc_extent_state_set(ext, OES_LOCK_DONE); - if (!ext->oe_rw) { /* write */ - if (!ext->oe_srvlock) { - /* The most likely case here is from lack of grants - * so we are either out of quota or out of space. - * Since this means we are holding locks across - * potentially multi-striped IO, we must send out - * everything out instantly to avoid prolonged - * waits resulting in lock eviction (likely since - * the extended wait in osc_cache_enter() did not - * yield any additional grant due to a timeout. 
- * LU-13131 */ - ext->oe_hp = 1; - list_add_tail(&ext->oe_link, &obj->oo_hp_exts); - } else { - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); - } + if (cmd & OBD_BRW_WRITE) { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); osc_update_pending(obj, OBD_BRW_WRITE, page_count); } else { list_add_tail(&ext->oe_link, &obj->oo_reading_exts); @@ -2904,7 +2919,6 @@ int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, } RETURN(result); } -EXPORT_SYMBOL(osc_cache_truncate_start); /** * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. @@ -2991,7 +3005,6 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, OSC_IO_DEBUG(obj, "sync file range.\n"); RETURN(result); } -EXPORT_SYMBOL(osc_cache_wait_range); /** * Called to write out a range of osc object. @@ -3031,7 +3044,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, EASSERT(!ext->oe_hp, ext); ext->oe_hp = 1; list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent && !ext->oe_hp) { + } else if (!ext->oe_urgent) { ext->oe_urgent = 1; list = &obj->oo_urgent_exts; } @@ -3039,25 +3052,10 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, list_move_tail(&ext->oe_link, list); unplug = true; } else { - struct client_obd *cli = osc_cli(obj); - int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; - pgoff_t align_by = (1 << pcc_bits); - pgoff_t a_start = round_down(start, align_by); - pgoff_t a_end = round_up(end, align_by); - - /* overflow case */ - if (end && !a_end) - a_end = CL_PAGE_EOF; /* the only discarder is lock cancelling, so - * [start, end], aligned by chunk size, must - * contain this extent */ - LASSERTF(ext->oe_start >= a_start && - ext->oe_end <= a_end, - "ext [%lu, %lu] reg [%lu, %lu] " - "orig [%lu %lu] align %lu bits " - "%d\n", ext->oe_start, ext->oe_end, - a_start, a_end, start, end, - align_by, pcc_bits); + * [start, end] must contain this extent */ + EASSERT(ext->oe_start >= start && + ext->oe_max_end <= end, ext); osc_extent_state_set(ext, OES_LOCKING); ext->oe_owner = current; list_move_tail(&ext->oe_link, @@ -3123,7 +3121,6 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); RETURN(result); } -EXPORT_SYMBOL(osc_cache_writeback_range); /** * Returns a list of pages by a given [start, end] of \a obj. @@ -3142,7 +3139,6 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, osc_page_gang_cbt cb, void *cbdata) { struct osc_page *ops; - struct pagevec *pagevec; void **pvec; pgoff_t idx; unsigned int nr; @@ -3154,8 +3150,6 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, idx = start; pvec = osc_env_info(env)->oti_pvec; - pagevec = &osc_env_info(env)->oti_pagevec; - ll_pagevec_init(pagevec, 0); spin_lock(&osc->oo_tree_lock); while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, idx, OTI_PVEC_SIZE)) > 0) { @@ -3202,10 +3196,8 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, page = ops->ops_cl.cpl_page; lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_pagevec_put(env, page, pagevec); + cl_page_put(env, page); } - pagevec_release(pagevec); - if (nr < OTI_PVEC_SIZE || end_of_region) break; @@ -3221,7 +3213,6 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, spin_unlock(&osc->oo_tree_lock); RETURN(res); } -EXPORT_SYMBOL(osc_page_gang_lookup); /** * Check if page @page is covered by an extra lock or discard it. 
@@ -3264,8 +3255,8 @@ static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, return CLP_GANG_OKAY; } -int osc_discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) { struct osc_thread_info *info = osc_env_info(env); struct cl_page *page = ops->ops_cl.cpl_page; @@ -3287,7 +3278,6 @@ int osc_discard_cb(const struct lu_env *env, struct cl_io *io, return CLP_GANG_OKAY; } -EXPORT_SYMBOL(osc_discard_cb); /** * Discard pages protected by the given lock. This function traverses radix @@ -3301,7 +3291,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, pgoff_t start, pgoff_t end, bool discard) { struct osc_thread_info *info = osc_env_info(env); - struct cl_io *io = osc_env_thread_io(env); + struct cl_io *io = &info->oti_io; osc_page_gang_cbt cb; int res; int result; @@ -3314,7 +3304,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, if (result != 0) GOTO(out, result); - cb = discard ? osc_discard_cb : check_and_discard_cb; + cb = discard ? discard_cb : check_and_discard_cb; info->oti_fn_index = info->oti_next_index = start; do { res = osc_page_gang_lookup(env, io, osc, diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_osc.h b/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h similarity index 52% rename from drivers/staging/lustrefsx/lustre/include/lustre_osc.h rename to drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h index f865036f897cf..7e6cbc017dfde 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_osc.h +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h @@ -23,99 +23,35 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. */ /* - * lustre/include/lustre_osc.h - * - * OSC layer structures and methods common for both OSC and MDC. + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. * - * This file contains OSC interfaces used by OSC and MDC. Most of them - * were just moved from lustre/osc/osc_cl_internal.h for Data-on-MDT - * purposes. + * Internal interfaces of OSC layer. 
* * Author: Nikita Danilov * Author: Jinshan Xiong - * Author: Mikhail Pershin */ -#ifndef LUSTRE_OSC_H -#define LUSTRE_OSC_H +#ifndef OSC_CL_INTERNAL_H +#define OSC_CL_INTERNAL_H #include #include +/* osc_build_res_name() */ #include +#include "osc_internal.h" /** \defgroup osc osc * @{ */ -struct osc_quota_info { - /** linkage for quota hash table */ - struct hlist_node oqi_hash; - __u32 oqi_id; -}; - -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - page is added to an rpc */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - to give the caller a chance to update - or cancel the size of the io */ - ASYNC_HP = 0x10, -}; - -struct osc_async_page { - int oap_magic; - unsigned short oap_cmd; - unsigned short oap_interrupted:1; - - struct list_head oap_pending_item; - struct list_head oap_rpc_item; - - loff_t oap_obj_off; - unsigned oap_page_off; - enum async_flags oap_async_flags; - - struct brw_page oap_brw_page; - - struct ptlrpc_request *oap_request; - struct client_obd *oap_cli; - struct osc_object *oap_obj; - - spinlock_t oap_lock; -}; - -#define oap_page oap_brw_page.pg -#define oap_count oap_brw_page.count -#define oap_brw_flags oap_brw_page.flag - -static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) -{ - return container_of(pga, struct osc_async_page, oap_brw_page); -} - -struct osc_device { - struct cl_device od_cl; - struct obd_export *od_exp; - - /* Write stats is actually protected by client_obd's lock. */ - struct osc_stats { - uint64_t os_lockless_writes; /* by bytes */ - uint64_t os_lockless_reads; /* by bytes */ - uint64_t os_lockless_truncates; /* by times */ - } od_stats; - - /* configuration item(s) */ - time64_t od_contention_time; - int od_lockless_truncate; -}; - struct osc_extent; /** @@ -127,9 +63,7 @@ struct osc_io { /** true if this io is lockless. */ unsigned int oi_lockless:1, /** true if this io is counted as active IO */ - oi_is_active:1, - /** true if this io has CAP_SYS_RESOURCE */ - oi_cap_sys_resource:1; + oi_is_active:1; /** how many LRU pages are reserved for this IO */ unsigned long oi_lru_reserved; @@ -144,8 +78,8 @@ struct osc_io { struct obdo oi_oa; struct osc_async_cbargs { bool opc_rpc_sent; - int opc_rc; - struct completion opc_sync; + int opc_rc; + struct completion opc_sync; } oi_cbarg; }; @@ -153,7 +87,7 @@ struct osc_io { * State maintained by osc layer for the duration of a system call. */ struct osc_session { - struct osc_io os_io; + struct osc_io os_io; }; #define OTI_PVEC_SIZE 256 @@ -165,7 +99,6 @@ struct osc_thread_info { struct lustre_handle oti_handle; struct cl_page_list oti_plist; struct cl_io oti_io; - struct pagevec oti_pagevec; void *oti_pvec[OTI_PVEC_SIZE]; /** * Fields used by cl_lock_discard_pages(). 
@@ -177,88 +110,21 @@ struct osc_thread_info { struct lu_buf oti_ladvise_buf; }; -static inline __u64 osc_enq2ldlm_flags(__u32 enqflags) -{ - __u64 result = 0; - - CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags); - - LASSERT((enqflags & ~CEF_MASK) == 0); - - if (enqflags & CEF_NONBLOCK) - result |= LDLM_FL_BLOCK_NOWAIT; - if (enqflags & CEF_GLIMPSE) - result |= LDLM_FL_HAS_INTENT; - if (enqflags & CEF_DISCARD_DATA) - result |= LDLM_FL_AST_DISCARD_DATA; - if (enqflags & CEF_PEEK) - result |= LDLM_FL_TEST_LOCK; - if (enqflags & CEF_LOCK_MATCH) - result |= LDLM_FL_MATCH_LOCK; - if (enqflags & CEF_LOCK_NO_EXPAND) - result |= LDLM_FL_NO_EXPANSION; - if (enqflags & CEF_SPECULATIVE) - result |= LDLM_FL_SPECULATIVE; - return result; -} - -typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, - int rc); - -struct osc_enqueue_args { - struct obd_export *oa_exp; - enum ldlm_type oa_type; - enum ldlm_mode oa_mode; - __u64 *oa_flags; - osc_enqueue_upcall_f oa_upcall; - void *oa_cookie; - struct ost_lvb *oa_lvb; - struct lustre_handle oa_lockh; - bool oa_speculative; -}; - -/** - * Bit flags for osc_dlm_lock_at_pageoff(). - */ -enum osc_dap_flags { - /** - * Just check if the desired lock exists, it won't hold reference - * count on lock. - */ - OSC_DAP_FL_TEST_LOCK = 1 << 0, - /** - * Return the lock even if it is being canceled. - */ - OSC_DAP_FL_CANCELING = 1 << 1 -}; - -/* - * The set of operations which are different for MDC and OSC objects - */ -struct osc_object_operations { - void (*oto_build_res_name)(struct osc_object *osc, - struct ldlm_res_id *resname); - struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env, - struct osc_object *obj, - pgoff_t index, - enum osc_dap_flags dap_flags); -}; - struct osc_object { - struct cl_object oo_cl; - struct lov_oinfo *oo_oinfo; - /** - * True if locking against this stripe got -EUSERS. - */ - int oo_contended; - ktime_t oo_contention_time; + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + cfs_time_t oo_contention_time; #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK - /** - * IO context used for invariant checks in osc_lock_has_pages(). - */ - struct cl_io oo_debug_io; - /** Serialization object for osc_object::oo_debug_io. */ - struct mutex oo_debug_mutex; + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; #endif /** * used by the osc to keep track of what objects to build into rpcs. @@ -272,7 +138,7 @@ struct osc_object { /** * extent is a red black tree to manage (async) dirty pages. */ - struct rb_root oo_root; + struct rb_root oo_root; /** * Manage write(dirty) extents. */ @@ -282,12 +148,12 @@ struct osc_object { struct list_head oo_reading_exts; - atomic_t oo_nr_reads; - atomic_t oo_nr_writes; + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; /** Protect extent tree. Will be used to protect * oo_{read|write}_pages soon. 
*/ - spinlock_t oo_lock; + spinlock_t oo_lock; /** * Radix tree for caching pages @@ -303,25 +169,8 @@ struct osc_object { /** number of active IOs of this object */ atomic_t oo_nr_ios; wait_queue_head_t oo_io_waitq; - - const struct osc_object_operations *oo_obj_ops; - bool oo_initialized; }; -static inline void osc_build_res_name(struct osc_object *osc, - struct ldlm_res_id *resname) -{ - return osc->oo_obj_ops->oto_build_res_name(osc, resname); -} - -static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, - pgoff_t index, - enum osc_dap_flags flags) -{ - return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags); -} - static inline void osc_object_lock(struct osc_object *obj) { spin_lock(&obj->oo_lock); @@ -351,27 +200,15 @@ static inline int osc_object_is_locked(struct osc_object *obj) #endif } -static inline void osc_object_set_contended(struct osc_object *obj) -{ - obj->oo_contention_time = ktime_get(); - /* mb(); */ - obj->oo_contended = 1; -} - -static inline void osc_object_clear_contended(struct osc_object *obj) -{ - obj->oo_contended = 0; -} - /* * Lock "micro-states" for osc layer. */ enum osc_lock_state { - OLS_NEW, - OLS_ENQUEUED, - OLS_UPCALL_RECEIVED, - OLS_GRANTED, - OLS_CANCELLED + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_CANCELLED }; /** @@ -434,68 +271,55 @@ struct osc_lock { /** DLM flags with which osc_lock::ols_lock was enqueued */ __u64 ols_flags; /** osc_lock::ols_lock handle */ - struct lustre_handle ols_handle; + struct lustre_handle ols_handle; struct ldlm_enqueue_info ols_einfo; - enum osc_lock_state ols_state; + enum osc_lock_state ols_state; /** lock value block */ struct ost_lvb ols_lvb; - /** Lockless operations to be used by lockless lock */ - const struct cl_lock_operations *ols_lockless_ops; - /** - * true, if ldlm_lock_addref() was called against - * osc_lock::ols_lock. This is used for sanity checking. - * - * \see osc_lock::ols_has_ref - */ - unsigned ols_hold :1, - /** - * this is much like osc_lock::ols_hold, except that this bit is - * cleared _after_ reference in released in osc_lock_unuse(). This - * fine distinction is needed because: - * - * - if ldlm lock still has a reference, osc_ast_data_get() needs - * to return associated cl_lock (so that a flag is needed that is - * cleared after ldlm_lock_decref() returned), and - * - * - ldlm_lock_decref() can invoke blocking ast (for a - * LDLM_FL_CBPENDING lock), and osc_lock functions like - * osc_lock_cancel() called from there need to know whether to - * release lock reference (so that a flag is needed that is - * cleared before ldlm_lock_decref() is called). - */ - ols_has_ref:1, - /** - * inherit the lockless attribute from top level cl_io. - * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. - */ - ols_locklessable:1, - /** - * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat - * the EVAVAIL error as torerable, this will make upper logic happy - * to wait all glimpse locks to each OSTs to be completed. - * Glimpse lock converts to normal lock if the server lock is granted. - * Glimpse lock should be destroyed immediately after use. - */ - ols_glimpse:1, - /** - * For async glimpse lock. 
- */ - ols_agl:1, - /** - * for speculative locks - asynchronous glimpse locks and ladvise - * lockahead manual lock requests - * - * Used to tell osc layer to not wait for the ldlm reply from the - * server, so the osc lock will be short lived - It only exists to - * create the ldlm request and is not updated on request completion. - */ - ols_speculative:1; + + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as torerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is + * granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1; }; -static inline int osc_lock_is_lockless(const struct osc_lock *ols) -{ - return (ols->ols_cl.cls_ops == ols->ols_lockless_ops); -} /** * Page state private for osc layer. @@ -524,7 +348,7 @@ struct osc_page { /** * in LRU? */ - ops_in_lru:1, + ops_in_lru:1, /** * Set if the page must be transferred with OBD_BRW_SRVLOCK. */ @@ -540,19 +364,7 @@ struct osc_page { /** * Submit time - the time when the page is starting RPC. For debugging. 
*/ - ktime_t ops_submit_time; -}; - -struct osc_brw_async_args { - struct obdo *aa_oa; - int aa_requested_nob; - int aa_nio_count; - u32 aa_page_count; - s32 aa_resends; - struct brw_page **aa_ppga; - struct client_obd *aa_cli; - struct list_head aa_oaps; - struct list_head aa_exts; + cfs_time_t ops_submit_time; }; extern struct kmem_cache *osc_lock_kmem; @@ -560,27 +372,32 @@ extern struct kmem_cache *osc_object_kmem; extern struct kmem_cache *osc_thread_kmem; extern struct kmem_cache *osc_session_kmem; extern struct kmem_cache *osc_extent_kmem; -extern struct kmem_cache *osc_quota_kmem; -extern struct kmem_cache *osc_obdo_kmem; +extern struct lu_device_type osc_device_type; extern struct lu_context_key osc_key; extern struct lu_context_key osc_session_key; #define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) -/* osc_page.c */ +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int osc_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); int osc_page_init(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t ind); -void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj, - pgoff_t start, pgoff_t end); + +void osc_index2policy(union ldlm_policy_data *policy, + const struct cl_object *obj, pgoff_t start, pgoff_t end); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); + void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); void osc_page_submit(const struct lu_env *env, struct osc_page *opg, enum cl_req_type crt, int brw_flags); -int lru_queue_work(const struct lu_env *env, void *data); -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force); - -/* osc_cache.c */ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, u32 async_flags); @@ -594,9 +411,8 @@ int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, struct osc_page *ops); int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, struct osc_page *ops); -int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, - struct osc_object *obj, struct list_head *list, - int brw_flags); +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags); int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, __u64 size, struct osc_extent **extp); void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); @@ -604,161 +420,59 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end, int hp, int discard); int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end); -int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc, int async); -static inline void osc_wake_cache_waiters(struct client_obd *cli) -{ - wake_up(&cli->cl_cache_waiters); -} +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc); +int lru_queue_work(const struct lu_env *env, void *data); -static inline int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, - struct osc_object *osc) 
-{ - return osc_io_unplug0(env, cli, osc, 1); -} +void osc_object_set_contended (struct osc_object *obj); +void osc_object_clear_contended(struct osc_object *obj); +int osc_object_is_contended (struct osc_object *obj); -static inline void osc_io_unplug(const struct lu_env *env, - struct client_obd *cli, - struct osc_object *osc) -{ - (void)osc_io_unplug0(env, cli, osc, 0); -} - -typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, - struct osc_page *, void *); -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata); -int osc_discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata); - -/* osc_dev.c */ -int osc_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next); -struct lu_device *osc_device_fini(const struct lu_env *env, - struct lu_device *d); -struct lu_device *osc_device_free(const struct lu_env *env, - struct lu_device *d); - -/* osc_object.c */ -int osc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf); -void osc_object_free(const struct lu_env *env, struct lu_object *obj); -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb); -int osc_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj); -int osc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr); -int osc_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid); -int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, - struct ost_lvb *lvb); -int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); -int osc_object_is_contended(struct osc_object *obj); -int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj, - ldlm_iterator_t iter, void *data); -int osc_object_prune(const struct lu_env *env, struct cl_object *obj); - -/* osc_request.c */ -void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd); -int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg); -int osc_precleanup_common(struct obd_device *obd); -int osc_cleanup_common(struct obd_device *obd); -int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, void *val, - struct ptlrpc_request_set *set); -int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg); -int osc_reconnect(const struct lu_env *env, struct obd_export *exp, - struct obd_device *obd, struct obd_uuid *cluuid, - struct obd_connect_data *data, void *localdata); -int osc_disconnect(struct obd_export *exp); -int osc_punch_send(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie); - -/* osc_io.c */ -int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue); -int osc_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *qin, int from, int to, - cl_commit_cbt cb); -int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios); -void osc_io_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios); -int osc_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios); -void osc_io_write_iter_fini(const 
struct lu_env *env, - const struct cl_io_slice *ios); -int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios); -void osc_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *slice); -int osc_io_read_start(const struct lu_env *env, - const struct cl_io_slice *slice); -int osc_io_write_start(const struct lu_env *env, - const struct cl_io_slice *slice); -void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice); -int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, - struct cl_fsync_io *fio); -void osc_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *slice); -void osc_read_ahead_release(const struct lu_env *env, void *cbdata); - -/* osc_lock.c */ -void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols, - int force); -void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, - struct osc_lock *oscl); -int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, - struct osc_lock *oscl); -void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, - struct cl_object *obj, struct osc_lock *oscl); -int osc_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice); -void osc_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice); -void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); -int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); -unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); +int osc_lock_is_lockless (const struct osc_lock *olck); /***************************************************************************** * - * Accessors and type conversions. + * Accessors. * */ + static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) { - struct osc_thread_info *info; + struct osc_thread_info *info; - info = lu_context_key_get(&env->le_ctx, &osc_key); - LASSERT(info != NULL); - return info; + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; } static inline struct osc_session *osc_env_session(const struct lu_env *env) { - struct osc_session *ses; + struct osc_session *ses; - ses = lu_context_key_get(env->le_ses, &osc_session_key); - LASSERT(ses != NULL); - return ses; + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; } static inline struct osc_io *osc_env_io(const struct lu_env *env) { - return &osc_env_session(env)->os_io; + return &osc_env_session(env)->os_io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; } static inline struct osc_device *lu2osc_dev(const struct lu_device *d) { - return container_of0(d, struct osc_device, od_cl.cd_lu_dev); + LINVRNT(d->ld_type == &osc_device_type); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); } static inline struct obd_export *osc_export(const struct osc_object *obj) { - return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; } static inline struct client_obd *osc_cli(const struct osc_object *obj) @@ -768,7 +482,8 @@ static inline struct client_obd *osc_cli(const struct osc_object *obj) static inline struct osc_object *cl2osc(const struct cl_object *obj) { - return container_of0(obj, struct osc_object, oo_cl); + LINVRNT(osc_is_object(&obj->co_lu)); + return container_of0(obj, struct osc_object, oo_cl); } static inline struct cl_object *osc2cl(const 
struct osc_object *obj) @@ -776,36 +491,6 @@ static inline struct cl_object *osc2cl(const struct osc_object *obj) return (struct cl_object *)&obj->oo_cl; } -static inline struct osc_device *obd2osc_dev(const struct obd_device *d) -{ - return container_of0(d->obd_lu_dev, struct osc_device, - od_cl.cd_lu_dev); -} - -static inline struct lu_device *osc2lu_dev(struct osc_device *osc) -{ - return &osc->od_cl.cd_lu_dev; -} - -static inline struct lu_object *osc2lu(struct osc_object *osc) -{ - return &osc->oo_cl.co_lu; -} - -static inline struct osc_object *lu2osc(const struct lu_object *obj) -{ - return container_of0(obj, struct osc_object, oo_cl.co_lu); -} - -static inline struct osc_io *cl2osc_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); - - LINVRNT(oio == osc_env_io(env)); - return oio; -} - static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) { LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); @@ -828,7 +513,8 @@ static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) { - return container_of0(slice, struct osc_page, ops_cl); + LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct osc_page, ops_cl); } static inline struct osc_page *oap2osc(struct osc_async_page *oap) @@ -863,12 +549,18 @@ osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) { - return container_of0(slice, struct osc_lock, ols_cl); + LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); } static inline int osc_io_srvlock(struct osc_io *oio) { - return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); } enum osc_extent_state { @@ -934,9 +626,7 @@ struct osc_extent { oe_hp:1, /** this extent should be written back asap. set if one of pages is * called by page WB daemon, or sync write or reading requests. */ - oe_urgent:1, - /** Non-delay RPC should be used for this extent. */ - oe_ndelay:1; + oe_urgent:1; /** how many grants allocated for this extent. * Grant allocated for this extent. There is no grant allocated * for reading extents and sync write extents. 
*/ @@ -970,10 +660,20 @@ struct osc_extent { int oe_rc; /** max pages per rpc when this extent was created */ unsigned int oe_mppr; - /** FLR: layout version when this osc_extent is publised */ - __u32 oe_layout_version; }; +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); + +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard_pages); + +typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + struct osc_page *, void *); +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata); /** @} osc */ -#endif /* LUSTRE_OSC_H */ +#endif /* OSC_CL_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c index cbddab5c0f319..c06a5deb339b7 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,24 +38,19 @@ /* class_name2obd() */ #include -#include -#include "osc_internal.h" +#include "osc_cl_internal.h" -/** \addtogroup osc - * @{ +/** \addtogroup osc + * @{ */ struct kmem_cache *osc_lock_kmem; -EXPORT_SYMBOL(osc_lock_kmem); struct kmem_cache *osc_object_kmem; -EXPORT_SYMBOL(osc_object_kmem); - struct kmem_cache *osc_thread_kmem; struct kmem_cache *osc_session_kmem; struct kmem_cache *osc_extent_kmem; struct kmem_cache *osc_quota_kmem; -struct kmem_cache *osc_obdo_kmem; struct lu_kmem_descr osc_caches[] = { { @@ -89,15 +84,21 @@ struct lu_kmem_descr osc_caches[] = { .ckd_size = sizeof(struct osc_quota_info) }, { - .ckd_cache = &osc_obdo_kmem, - .ckd_name = "osc_obdo_kmem", - .ckd_size = sizeof(struct obdo) - }, - { - .ckd_cache = NULL - } + .ckd_cache = NULL + } }; +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + /***************************************************************************** * * Osc device and device type functions. @@ -129,7 +130,6 @@ struct lu_context_key osc_key = { .lct_init = osc_key_init, .lct_fini = osc_key_fini }; -EXPORT_SYMBOL(osc_key); static void *osc_session_init(const struct lu_context *ctx, struct lu_context_key *key) @@ -154,7 +154,6 @@ struct lu_context_key osc_session_key = { .lct_init = osc_session_init, .lct_fini = osc_session_fini }; -EXPORT_SYMBOL(osc_session_key); /* type constructor/destructor: osc_type_{init,fini,start,stop}(). 
*/ LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); @@ -172,30 +171,27 @@ static const struct lu_device_operations osc_lu_ops = { .ldo_recovery_complete = NULL }; -int osc_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) +static int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) { RETURN(0); } -EXPORT_SYMBOL(osc_device_init); -struct lu_device *osc_device_fini(const struct lu_env *env, - struct lu_device *d) +static struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) { return NULL; } -EXPORT_SYMBOL(osc_device_fini); -struct lu_device *osc_device_free(const struct lu_env *env, - struct lu_device *d) +static struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) { - struct osc_device *od = lu2osc_dev(d); + struct osc_device *od = lu2osc_dev(d); - cl_device_fini(lu2cl_dev(d)); - OBD_FREE_PTR(od); - return NULL; + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; } -EXPORT_SYMBOL(osc_device_free); static struct lu_device *osc_device_alloc(const struct lu_env *env, struct lu_device_type *t, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h index 519a4d1f4b57e..24766263514a6 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,45 +35,93 @@ #define OAP_MAGIC 8675309 -#include -#include - extern atomic_t osc_pool_req_count; extern unsigned int osc_reqpool_maxreqcount; extern struct ptlrpc_request_pool *osc_rq_pool; +struct lu_env; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return (struct osc_async_page *)container_of(pga, struct osc_async_page, + oap_brw_page); +} + +struct osc_cache_waiter { + struct list_head ocw_entry; + wait_queue_head_t ocw_waitq; + struct osc_async_page *ocw_oap; + int ocw_grant; + int ocw_rc; +}; + +void osc_wake_cache_waiters(struct client_obd *cli); int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); void osc_update_next_shrink(struct client_obd *cli); -int lru_queue_work(const struct lu_env *env, void *data); -int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, - int sent, int rc); -int 
osc_extent_release(const struct lu_env *env, struct osc_extent *ext); -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, bool discard); + +/* + * cl integration. + */ +#include extern struct ptlrpc_request_set *PTLRPCD_SET; -void osc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb); +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, void *cookie, struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, - bool speculative); + struct ptlrpc_request_set *rqset, int async, int agl); -int osc_match_base(const struct lu_env *env, struct obd_export *exp, - struct ldlm_res_id *res_id, enum ldlm_type type, - union ldlm_policy_data *policy, enum ldlm_mode mode, - __u64 *flags, struct osc_object *obj, +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, void *data, struct lustre_handle *lockh, int unref); int osc_setattr_async(struct obd_export *exp, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie, struct ptlrpc_request_set *rqset); +int osc_punch_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); int osc_sync_base(struct osc_object *obj, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie, struct ptlrpc_request_set *rqset); @@ -84,6 +132,8 @@ int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, struct list_head *ext_list, int cmd); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); @@ -94,36 +144,15 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); int osc_cleanup(struct obd_device *obd); int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); -int osc_tunables_init(struct obd_device *obd); +#ifdef CONFIG_PROC_FS +extern struct lprocfs_vars lprocfs_osc_obd_vars[]; +int lproc_osc_attach_seqstat(struct obd_device *dev); +#else +static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} +#endif extern struct lu_device_type osc_device_type; -static inline struct cl_io *osc_env_thread_io(const struct lu_env *env) -{ - struct cl_io *io = &osc_env_info(env)->oti_io; - - memset(io, 0, sizeof(*io)); - return io; -} - -static inline int osc_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &osc_device_type; -} - -static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) -{ - return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); -} - -int osc_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); -int osc_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); - 
static inline int osc_recoverable_error(int rc) { return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || @@ -145,13 +174,41 @@ static inline char *cli_name(struct client_obd *cli) ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + int od_contention_time; + int od_lockless_truncate; +}; + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); +} + +extern struct kmem_cache *osc_quota_kmem; +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + u32 oqi_id; +}; + struct osc_async_args { struct obd_info *aa_oi; }; int osc_quota_setup(struct obd_device *obd); int osc_quota_cleanup(struct obd_device *obd); -int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], u64 valid, u32 flags); int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); int osc_quotactl(struct obd_device *unused, struct obd_export *exp, @@ -159,14 +216,24 @@ int osc_quotactl(struct obd_device *unused, struct obd_export *exp, void osc_inc_unstable_pages(struct ptlrpc_request *req); void osc_dec_unstable_pages(struct ptlrpc_request *req); bool osc_over_unstable_soft_limit(struct client_obd *cli); -void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, - pgoff_t idx, size_t to); - -struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, - pgoff_t index, - enum osc_dap_flags flags); - +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = 1 << 0, + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = 1 << 1 +}; +struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags flags); +void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa); int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); /** osc shrink list to link all osc client obd */ @@ -178,14 +245,4 @@ extern unsigned long osc_cache_shrink_count(struct shrinker *sk, extern unsigned long osc_cache_shrink_scan(struct shrinker *sk, struct shrink_control *sc); -static inline void osc_set_io_portal(struct ptlrpc_request *req) -{ - struct obd_import *imp = req->rq_import; - - /* Distinguish OSC from MDC here to use OST or MDS portal */ - if (OCD_HAS_FLAG(&imp->imp_connect_data, IBITS)) - req->rq_request_portal = MDS_IO_PORTAL; - else - req->rq_request_portal = OST_IO_PORTAL; -} #endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c index 4a51b9912d72f..38fe2532829fd 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_io.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. 
+ * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,14 +38,27 @@ #define DEBUG_SUBSYSTEM S_OSC #include -#include -#include "osc_internal.h" +#include "osc_cl_internal.h" -/** \addtogroup osc - * @{ +/** \addtogroup osc + * @{ */ +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + LINVRNT(oio == osc_env_io(env)); + return oio; +} + /***************************************************************************** * * io operations. @@ -56,7 +69,8 @@ static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) { } -void osc_read_ahead_release(const struct lu_env *env, void *cbdata) +static void osc_read_ahead_release(const struct lu_env *env, + void *cbdata) { struct ldlm_lock *dlmlock = cbdata; struct lustre_handle lockh; @@ -65,7 +79,6 @@ void osc_read_ahead_release(const struct lu_env *env, void *cbdata) ldlm_lock_decref(&lockh, LCK_PR); LDLM_LOCK_PUT(dlmlock); } -EXPORT_SYMBOL(osc_read_ahead_release); static int osc_io_read_ahead(const struct lu_env *env, const struct cl_io_slice *ios, @@ -104,8 +117,9 @@ static int osc_io_read_ahead(const struct lu_env *env, * or, if page is already submitted, changes osc flags through * osc_set_async_flags(). */ -int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) +static int osc_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) { struct cl_page *page; struct cl_page *tmp; @@ -119,6 +133,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, struct cl_page_list *qout = &queue->c2_qout; unsigned int queued = 0; int result = 0; + int cmd; int brw_flags; unsigned int max_pages; @@ -130,14 +145,8 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, cli = osc_cli(osc); max_pages = cli->cl_max_pages_per_rpc; + cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; - brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; - if (crt == CRT_READ && ios->cis_io->ci_ndelay) - brw_flags |= OBD_BRW_NDELAY; - - page = cl_page_list_first(qin); - if (page->cp_type == CPT_TRANSIENT) - brw_flags |= OBD_BRW_NOCACHE; /* * NOTE: here @page is a top-level page. This is done to avoid @@ -191,7 +200,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, if (++queued == max_pages) { queued = 0; - result = osc_queue_sync_pages(env, io, osc, &list, + result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); if (result < 0) break; @@ -199,7 +208,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, } if (queued > 0) - result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); + result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); /* Update c/mtime for sync write. LU-7310 */ if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) { @@ -215,31 +224,36 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); return qout->pl_nr > 0 ? 
0 : result; } -EXPORT_SYMBOL(osc_io_submit); /** - * This is called to update the attributes when modifying a specific page, - * both when making new pages and when doing updates to existing cached pages. + * This is called when a page is accessed within file in a way that creates + * new page, if one were missing (i.e., if there were a hole at that place in + * the file, or accessed page is beyond the current file size). * * Expand stripe KMS if necessary. */ -void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, - pgoff_t idx, size_t to) +static void osc_page_touch_at(const struct lu_env *env, + struct cl_object *obj, pgoff_t idx, size_t to) { - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int valid; - __u64 kms; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; - ENTRY; + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; - /* offset within stripe */ - kms = cl_offset(obj, idx) + to; - - cl_object_attr_lock(obj); + cl_object_attr_lock(obj); + /* + * XXX old code used + * + * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); + * + * here + */ CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", - kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, - loi->loi_lvb.lvb_size); + kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); valid = CAT_MTIME | CAT_CTIME; @@ -253,14 +267,12 @@ void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, } cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); - - EXIT; } -int osc_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *qin, int from, int to, - cl_commit_cbt cb) +static int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb) { struct cl_io *io = ios->cis_io; struct osc_io *oio = cl2osc_io(env, ios); @@ -294,9 +306,6 @@ int osc_io_commit_async(const struct lu_env *env, opg = osc_cl_page_osc(page, osc); oap = &opg->ops_oap; - LASSERTF(osc == oap->oap_obj, - "obj mismatch: %p / %p\n", osc, oap->oap_obj); - if (!list_empty(&oap->oap_rpc_item)) { CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", oap, opg); @@ -332,47 +341,29 @@ int osc_io_commit_async(const struct lu_env *env, CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); RETURN(result); } -EXPORT_SYMBOL(osc_io_commit_async); - -static bool osc_import_not_healthy(struct obd_import *imp) -{ - return imp->imp_invalid || imp->imp_deactive || - !(imp->imp_state == LUSTRE_IMP_FULL || - imp->imp_state == LUSTRE_IMP_IDLE); -} -int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) +static int osc_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) { struct osc_object *osc = cl2osc(ios->cis_obj); struct obd_import *imp = osc_cli(osc)->cl_import; - struct osc_io *oio = osc_env_io(env); int rc = -EIO; - ENTRY; spin_lock(&imp->imp_lock); - /** - * check whether this OSC device is available for non-delay read, - * fast switching mirror if we haven't tried all mirrors. 
- */
-	if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay &&
-	    !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) {
-		rc = -EWOULDBLOCK;
-	} else if (likely(!imp->imp_invalid)) {
+	if (likely(!imp->imp_invalid)) {
+		struct osc_io *oio = osc_env_io(env);
+
 		atomic_inc(&osc->oo_nr_ios);
 		oio->oi_is_active = 1;
 		rc = 0;
 	}
 	spin_unlock(&imp->imp_lock);
-	if (cfs_capable(CFS_CAP_SYS_RESOURCE))
-		oio->oi_cap_sys_resource = 1;
-
-	RETURN(rc);
+	return rc;
 }
-EXPORT_SYMBOL(osc_io_iter_init);
-int osc_io_write_iter_init(const struct lu_env *env,
-			   const struct cl_io_slice *ios)
+static int osc_io_write_iter_init(const struct lu_env *env,
+				  const struct cl_io_slice *ios)
 {
 	struct cl_io *io = ios->cis_io;
 	struct osc_io *oio = osc_env_io(env);
@@ -383,18 +374,17 @@ int osc_io_write_iter_init(const struct lu_env *env,
 	if (cl_io_is_append(io))
 		RETURN(osc_io_iter_init(env, ios));
-	npages = io->u.ci_rw.crw_count >> PAGE_SHIFT;
-	if (io->u.ci_rw.crw_pos & ~PAGE_MASK)
+	npages = io->u.ci_rw.rw_range.cir_count >> PAGE_SHIFT;
+	if (io->u.ci_rw.rw_range.cir_pos & ~PAGE_MASK)
 		++npages;
 	oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages);
 	RETURN(osc_io_iter_init(env, ios));
 }
-EXPORT_SYMBOL(osc_io_write_iter_init);
-void osc_io_iter_fini(const struct lu_env *env,
-		      const struct cl_io_slice *ios)
+static void osc_io_iter_fini(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
 {
 	struct osc_io *oio = osc_env_io(env);
@@ -407,10 +397,9 @@ void osc_io_iter_fini(const struct lu_env *env,
 		wake_up_all(&osc->oo_io_waitq);
 	}
 }
-EXPORT_SYMBOL(osc_io_iter_fini);
-void osc_io_write_iter_fini(const struct lu_env *env,
-			    const struct cl_io_slice *ios)
+static void osc_io_write_iter_fini(const struct lu_env *env,
+				   const struct cl_io_slice *ios)
 {
 	struct osc_io *oio = osc_env_io(env);
 	struct osc_object *osc = cl2osc(ios->cis_obj);
@@ -423,9 +412,9 @@ void osc_io_write_iter_fini(const struct lu_env *env,
 	osc_io_iter_fini(env, ios);
 }
-EXPORT_SYMBOL(osc_io_write_iter_fini);
-int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios)
+static int osc_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
 {
 	struct cl_io *io;
 	struct cl_fault_io *fio;
@@ -445,8 +434,6 @@ int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios)
 		  fio->ft_index, fio->ft_nob);
 	RETURN(0);
 }
-EXPORT_SYMBOL(osc_io_fault_start);
-
 static int osc_async_upcall(void *a, int rc)
 {
@@ -510,11 +497,10 @@ static int osc_io_setattr_start(const struct lu_env *env,
 	struct obdo *oa = &oio->oi_oa;
 	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
 	__u64 size = io->u.ci_setattr.sa_attr.lvb_size;
-	unsigned int ia_avalid = io->u.ci_setattr.sa_avalid;
-	enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid;
-	int result = 0;
-
+	unsigned int ia_valid = io->u.ci_setattr.sa_valid;
+	int result = 0;
 	ENTRY;
+
 	/* truncate cache dirty pages first */
 	if (cl_io_is_trunc(io))
 		result = osc_cache_truncate_start(env, cl2osc(obj), size,
@@ -527,20 +513,19 @@ static int osc_io_setattr_start(const struct lu_env *env,
 		struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
 		unsigned int cl_valid = 0;
-		if (ia_avalid & ATTR_SIZE) {
-			attr->cat_size = size;
-			attr->cat_kms = size;
+		if (ia_valid & ATTR_SIZE) {
+			attr->cat_size = attr->cat_kms = size;
 			cl_valid = (CAT_SIZE | CAT_KMS);
 		}
-		if (ia_avalid & ATTR_MTIME_SET) {
+		if (ia_valid & ATTR_MTIME_SET) {
 			attr->cat_mtime = lvb->lvb_mtime;
 			cl_valid |= CAT_MTIME;
 		}
-		if (ia_avalid & ATTR_ATIME_SET) {
+		if (ia_valid & ATTR_ATIME_SET) {
 			attr->cat_atime = lvb->lvb_atime;
 			cl_valid |= CAT_ATIME;
 		}
-		if (ia_xvalid & OP_XVALID_CTIME_SET) {
+		if (ia_valid & ATTR_CTIME_SET) {
 			attr->cat_ctime = lvb->lvb_ctime;
 			cl_valid |= CAT_CTIME;
 		}
@@ -557,47 +542,42 @@ static int osc_io_setattr_start(const struct lu_env *env,
 		oa->o_layout = io->u.ci_setattr.sa_layout;
 		oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLOSTLAYOUT;
-		if (ia_avalid & ATTR_CTIME) {
+		if (ia_valid & ATTR_CTIME) {
 			oa->o_valid |= OBD_MD_FLCTIME;
 			oa->o_ctime = attr->cat_ctime;
 		}
-		if (ia_avalid & ATTR_ATIME) {
+		if (ia_valid & ATTR_ATIME) {
 			oa->o_valid |= OBD_MD_FLATIME;
 			oa->o_atime = attr->cat_atime;
 		}
-		if (ia_avalid & ATTR_MTIME) {
+		if (ia_valid & ATTR_MTIME) {
 			oa->o_valid |= OBD_MD_FLMTIME;
 			oa->o_mtime = attr->cat_mtime;
 		}
-		if (ia_avalid & ATTR_SIZE) {
-			oa->o_size = size;
-			oa->o_blocks = OBD_OBJECT_EOF;
-			oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
-
-			if (oio->oi_lockless) {
-				oa->o_flags = OBD_FL_SRVLOCK;
-				oa->o_valid |= OBD_MD_FLFLAGS;
-			}
-
-			if (io->ci_layout_version > 0) {
-				/* verify layout version */
-				oa->o_valid |= OBD_MD_LAYOUT_VERSION;
-				oa->o_layout_version = io->ci_layout_version;
-			}
-		} else {
-			LASSERT(oio->oi_lockless == 0);
-		}
+		if (ia_valid & ATTR_SIZE) {
+			oa->o_size = size;
+			oa->o_blocks = OBD_OBJECT_EOF;
+			oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+			if (oio->oi_lockless) {
+				oa->o_flags = OBD_FL_SRVLOCK;
+				oa->o_valid |= OBD_MD_FLFLAGS;
+			}
+		} else {
+			LASSERT(oio->oi_lockless == 0);
+		}
-		if (ia_xvalid & OP_XVALID_FLAGS) {
+		if (ia_valid & ATTR_ATTR_FLAG) {
 			oa->o_flags = io->u.ci_setattr.sa_attr_flags;
 			oa->o_valid |= OBD_MD_FLFLAGS;
 		}
 		init_completion(&cbargs->opc_sync);
-		if (ia_avalid & ATTR_SIZE)
-			result = osc_punch_send(osc_export(cl2osc(obj)),
-						oa, osc_async_upcall, cbargs);
+		if (ia_valid & ATTR_SIZE)
+			result = osc_punch_base(osc_export(cl2osc(obj)),
+						oa, osc_async_upcall,
+						cbargs, PTLRPCD_SET);
 		else
 			result = osc_setattr_async(osc_export(cl2osc(obj)),
 						   oa, osc_async_upcall,
@@ -609,50 +589,37 @@ static int osc_io_setattr_start(const struct lu_env *env,
 	RETURN(result);
 }
-void osc_io_setattr_end(const struct lu_env *env,
-			const struct cl_io_slice *slice)
+static void osc_io_setattr_end(const struct lu_env *env,
+			       const struct cl_io_slice *slice)
 {
 	struct cl_io *io = slice->cis_io;
 	struct osc_io *oio = cl2osc_io(env, slice);
 	struct cl_object *obj = slice->cis_obj;
 	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
-	struct cl_attr *attr = &osc_env_info(env)->oti_attr;
-	struct obdo *oa = &oio->oi_oa;
-	unsigned int cl_valid = 0;
-	int result = 0;
+	int result = 0;
 	if (cbargs->opc_rpc_sent) {
 		wait_for_completion(&cbargs->opc_sync);
 		result = io->ci_result = cbargs->opc_rc;
 	}
-
-	if (result == 0) {
-		if (oio->oi_lockless) {
-			/* lockless truncate */
-			struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
-
-			LASSERT(cl_io_is_trunc(io));
-			/* XXX: Need a lock. */
-			osd->od_stats.os_lockless_truncates++;
-		}
-	}
+	if (result == 0) {
+		if (oio->oi_lockless) {
+			/* lockless truncate */
+			struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+			LASSERT(cl_io_is_trunc(io));
+			/* XXX: Need a lock.
*/ + osd->od_stats.os_lockless_truncates++; + } + } if (cl_io_is_trunc(io)) { __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - cl_object_attr_lock(obj); - if (oa->o_valid & OBD_MD_FLBLOCKS) { - attr->cat_blocks = oa->o_blocks; - cl_valid |= CAT_BLOCKS; - } - - cl_object_attr_update(env, obj, attr, cl_valid); - cl_object_attr_unlock(obj); osc_trunc_check(env, io, oio, size); osc_cache_truncate_end(env, oio->oi_trunc); oio->oi_trunc = NULL; } } -EXPORT_SYMBOL(osc_io_setattr_end); struct osc_data_version_args { struct osc_io *dva_oio; @@ -749,23 +716,18 @@ static void osc_io_data_version_end(const struct lu_env *env, if (cbargs->opc_rc != 0) { slice->cis_io->ci_result = cbargs->opc_rc; + } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) { + slice->cis_io->ci_result = -EOPNOTSUPP; } else { + dv->dv_data_version = oio->oi_oa.o_data_version; slice->cis_io->ci_result = 0; - if (!(oio->oi_oa.o_valid & - (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) - slice->cis_io->ci_result = -ENOTSUPP; - - if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) - dv->dv_layout_version = oio->oi_oa.o_layout_version; - if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) - dv->dv_data_version = oio->oi_oa.o_data_version; } EXIT; } -int osc_io_read_start(const struct lu_env *env, - const struct cl_io_slice *slice) +static int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_object *obj = slice->cis_obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -781,10 +743,9 @@ int osc_io_read_start(const struct lu_env *env, RETURN(rc); } -EXPORT_SYMBOL(osc_io_read_start); -int osc_io_write_start(const struct lu_env *env, - const struct cl_io_slice *slice) +static int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_object *obj = slice->cis_obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -799,10 +760,9 @@ int osc_io_write_start(const struct lu_env *env, RETURN(rc); } -EXPORT_SYMBOL(osc_io_write_start); -int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, - struct cl_fsync_io *fio) +static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) { struct osc_io *oio = osc_env_io(env); struct obdo *oa = &oio->oi_oa; @@ -827,10 +787,9 @@ int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); RETURN(rc); } -EXPORT_SYMBOL(osc_fsync_ost); -int osc_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *slice) +static int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_io *io = slice->cis_io; struct cl_fsync_io *fio = &io->u.ci_fsync; @@ -869,8 +828,8 @@ int osc_io_fsync_start(const struct lu_env *env, RETURN(result); } -void osc_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *slice) +static void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; struct cl_object *obj = slice->cis_obj; @@ -890,7 +849,6 @@ void osc_io_fsync_end(const struct lu_env *env, } slice->cis_io->ci_result = result; } -EXPORT_SYMBOL(osc_io_fsync_end); static int osc_io_ladvise_start(const struct lu_env *env, const struct cl_io_slice *slice) @@ -962,7 +920,8 @@ static void osc_io_ladvise_end(const struct lu_env *env, slice->cis_io->ci_result = result; } -void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) +static void osc_io_end(const struct lu_env 
*env, + const struct cl_io_slice *slice) { struct osc_io *oio = cl2osc_io(env, slice); @@ -971,7 +930,6 @@ void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) oio->oi_active = NULL; } } -EXPORT_SYMBOL(osc_io_end); static const struct cl_io_operations osc_io_ops = { .op = { diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c index dd956fd8532b2..6d53b5b80c580 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,16 +37,32 @@ #define DEBUG_SUBSYSTEM S_OSC +#include /* fid_build_reg_res_name() */ #include -#include -#include "osc_internal.h" +#include "osc_cl_internal.h" /** \addtogroup osc * @{ */ +/***************************************************************************** + * + * Type conversions. + * + */ + +static const struct cl_lock_operations osc_lock_ops; +static const struct cl_lock_operations osc_lock_lockless_ops; +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force); + +int osc_lock_is_lockless(const struct osc_lock *olck) +{ + return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); +} + /** * Returns a weak pointer to the ldlm lock identified by a handle. Returned * pointer cannot be dereferenced, as lock is not protected from concurrent @@ -106,7 +122,7 @@ static int osc_lock_invariant(struct osc_lock *ols) if (! ergo(ols->ols_state == OLS_GRANTED, olock != NULL && - ldlm_is_granted(olock) && + olock->l_req_mode == olock->l_granted_mode && ols->ols_hold)) return 0; return 1; @@ -118,7 +134,8 @@ static int osc_lock_invariant(struct osc_lock *ols) * */ -void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +static void osc_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) { struct osc_lock *ols = cl2osc_lock(slice); @@ -127,7 +144,6 @@ void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); } -EXPORT_SYMBOL(osc_lock_fini); static void osc_lock_build_policy(const struct lu_env *env, const struct cl_lock *lock, @@ -139,22 +155,44 @@ static void osc_lock_build_policy(const struct lu_env *env, policy->l_extent.gid = d->cld_gid; } +static __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_ASYNC) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + return result; +} + /** * Updates object attributes from a lock value block (lvb) received together * with the DLM lock reply from the server. Copy of osc_update_enqueue() * logic. * + * This can be optimized to not update attributes when lock is a result of a + * local match. + * * Called under lock and resource spin-locks. 
*/ -void osc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb) +static void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) { - struct cl_object *obj = osc2cl(osc); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned valid, setkms = 0; + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid; ENTRY; @@ -179,23 +217,19 @@ void osc_lock_lvb_update(const struct lu_env *env, if (size > dlmlock->l_policy_data.l_extent.end) size = dlmlock->l_policy_data.l_extent.end + 1; if (size >= oinfo->loi_kms) { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu" + ", kms=%llu", lvb->lvb_size, size); valid |= CAT_KMS; attr->cat_kms = size; - setkms = 1; + } else { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=" + "%llu; leaving kms=%llu, end=%llu", + lvb->lvb_size, oinfo->loi_kms, + dlmlock->l_policy_data.l_extent.end); } ldlm_lock_allow_match_locked(dlmlock); } - /* The size should not be less than the kms */ - if (attr->cat_size < oinfo->loi_kms) - attr->cat_size = oinfo->loi_kms; - - LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " - "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, - setkms ? "" : " leaving", - setkms ? attr->cat_kms : oinfo->loi_kms, - dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); - cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); @@ -203,9 +237,8 @@ void osc_lock_lvb_update(const struct lu_env *env, } static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, - struct lustre_handle *lockh) + struct lustre_handle *lockh, bool lvb_update) { - struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); struct ldlm_lock *dlmlock; dlmlock = ldlm_handle2lock_long(lockh, 0); @@ -232,7 +265,7 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, /* Lock must have been granted. */ lock_res_and_lock(dlmlock); - if (ldlm_is_granted(dlmlock)) { + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; @@ -244,11 +277,10 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, descr->cld_gid = ext->gid; /* no lvb update for matched lock */ - if (!ldlm_is_lvb_cached(dlmlock)) { + if (lvb_update) { LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - LASSERT(osc == dlmlock->l_ast_data); - osc_lock_lvb_update(env, osc, dlmlock, NULL); - ldlm_set_lvb_cached(dlmlock); + osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj), + dlmlock, NULL); } LINVRNT(osc_lock_invariant(oscl)); } @@ -288,7 +320,7 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, } if (rc == 0) - osc_lock_granted(env, oscl, lockh); + osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK); /* Error handling, some errors are tolerable. */ if (oscl->ols_locklessable && rc == -EUSERS) { @@ -296,7 +328,7 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, * lockless lock. */ osc_object_set_contended(cl2osc(slice->cls_obj)); - LASSERT(slice->cls_ops != oscl->ols_lockless_ops); + LASSERT(slice->cls_ops == &osc_lock_ops); /* Change this lock to ldlmlock-less lock. 
*/ osc_lock_to_lockless(env, oscl, 1); @@ -308,8 +340,6 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, NULL, &oscl->ols_lvb); /* Hide the error. */ rc = 0; - } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) { - rc = -EWOULDBLOCK; } if (oscl->ols_owner != NULL) @@ -319,9 +349,8 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, RETURN(rc); } -static int osc_lock_upcall_speculative(void *cookie, - struct lustre_handle *lockh, - int errcode) +static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, + int errcode) { struct osc_object *osc = cookie; struct ldlm_lock *dlmlock; @@ -342,10 +371,9 @@ static int osc_lock_upcall_speculative(void *cookie, LASSERT(dlmlock != NULL); lock_res_and_lock(dlmlock); - LASSERT(ldlm_is_granted(dlmlock)); + LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); - /* there is no osc_lock associated with speculative locks - * thus no need to set LDLM_FL_LVB_CACHED */ + /* there is no osc_lock associated with AGL lock */ osc_lock_lvb_update(env, osc, dlmlock, NULL); unlock_res_and_lock(dlmlock); @@ -381,12 +409,7 @@ static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, rc = 0; } - /* - * Do not try to match other locks with CLM_WRITE since we already - * know there're none - */ - rc2 = osc_lock_discard_pages(env, obj, start, end, - mode == CLM_WRITE || discard); + rc2 = osc_lock_discard_pages(env, obj, start, end, discard); if (rc == 0 && rc2 < 0) rc = rc2; @@ -411,7 +434,7 @@ static int osc_dlm_blocking_ast0(const struct lu_env *env, LASSERT(flag == LDLM_CB_CANCELING); lock_res_and_lock(dlmlock); - if (!ldlm_is_granted(dlmlock)) { + if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { dlmlock->l_ast_data = NULL; unlock_res_and_lock(dlmlock); RETURN(0); @@ -551,17 +574,13 @@ static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, RETURN(result); } -int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) { struct ptlrpc_request *req = data; struct lu_env *env; struct ost_lvb *lvb; struct req_capsule *cap; struct cl_object *obj = NULL; - struct ldlm_resource *res = dlmlock->l_resource; - struct ldlm_match_data matchdata = { 0 }; - union ldlm_policy_data policy; - enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR; int result; __u16 refcheck; @@ -573,40 +592,13 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) if (IS_ERR(env)) GOTO(out, result = PTR_ERR(env)); - policy.l_extent.start = 0; - policy.l_extent.end = LUSTRE_EOF; - - matchdata.lmd_mode = &mode; - matchdata.lmd_policy = &policy; - matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING; - matchdata.lmd_unref = 1; - matchdata.lmd_has_ast_data = true; - - LDLM_LOCK_GET(dlmlock); - - /* If any dlmlock has l_ast_data set, we must find it or we risk - * missing a size update done under a different lock. 
- */ - while (dlmlock) { - lock_res_and_lock(dlmlock); - if (dlmlock->l_ast_data) { - obj = osc2cl(dlmlock->l_ast_data); - cl_object_get(obj); - } - unlock_res_and_lock(dlmlock); - LDLM_LOCK_RELEASE(dlmlock); - - dlmlock = NULL; - if (obj == NULL && res->lr_type == LDLM_EXTENT) { - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA)) - break; - - lock_res(res); - dlmlock = search_itree(res, &matchdata); - unlock_res(res); - } + lock_res_and_lock(dlmlock); + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); } + unlock_res_and_lock(dlmlock); if (obj != NULL) { /* Do not grab the mutex of cl_lock for glimpse. @@ -644,15 +636,15 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) req->rq_status = result; RETURN(result); } -EXPORT_SYMBOL(osc_ldlm_glimpse_ast); static int weigh_cb(const struct lu_env *env, struct cl_io *io, struct osc_page *ops, void *cbdata) { struct cl_page *page = ops->ops_cl.cpl_page; - if (cl_page_is_vmlocked(env, page) || PageDirty(page->cp_vmpage) || - PageWriteback(page->cp_vmpage)) + if (cl_page_is_vmlocked(env, page) + || PageDirty(page->cp_vmpage) || PageWriteback(page->cp_vmpage) + ) return CLP_GANG_ABORT; *(pgoff_t *)cbdata = osc_index(ops) + 1; @@ -661,13 +653,12 @@ static int weigh_cb(const struct lu_env *env, struct cl_io *io, static unsigned long osc_lock_weight(const struct lu_env *env, struct osc_object *oscobj, - loff_t start, loff_t end) + struct ldlm_extent *extent) { - struct cl_io *io = osc_env_thread_io(env); + struct cl_io *io = &osc_env_info(env)->oti_io; struct cl_object *obj = cl_object_top(&oscobj->oo_cl); - pgoff_t page_index; - int result; - + pgoff_t page_index; + int result; ENTRY; io->ci_obj = obj; @@ -676,10 +667,11 @@ static unsigned long osc_lock_weight(const struct lu_env *env, if (result != 0) RETURN(result); - page_index = cl_index(obj, start); + page_index = cl_index(obj, extent->start); do { result = osc_page_gang_lookup(env, io, oscobj, - page_index, cl_index(obj, end), + page_index, + cl_index(obj, extent->end), weigh_cb, (void *)&page_index); if (result == CLP_GANG_ABORT) break; @@ -696,13 +688,12 @@ static unsigned long osc_lock_weight(const struct lu_env *env, */ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) { - struct lu_env *env; - struct osc_object *obj; - struct osc_lock *oscl; - unsigned long weight; - bool found = false; - __u16 refcheck; - + struct lu_env *env; + struct osc_object *obj; + struct osc_lock *oscl; + unsigned long weight; + bool found = false; + __u16 refcheck; ENTRY; might_sleep(); @@ -718,9 +709,7 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) /* Mostly because lack of memory, do not eliminate this lock */ RETURN(1); - LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT || - dlmlock->l_resource->lr_type == LDLM_IBITS); - + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); lock_res_and_lock(dlmlock); obj = dlmlock->l_ast_data; if (obj) @@ -732,10 +721,9 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) spin_lock(&obj->oo_ol_spin); list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { - if (oscl->ols_dlmlock == dlmlock) { - found = true; - break; - } + if (oscl->ols_dlmlock != NULL && oscl->ols_dlmlock != dlmlock) + continue; + found = true; } spin_unlock(&obj->oo_ol_spin); if (found) { @@ -745,18 +733,7 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) GOTO(out, weight = 1); } - if (dlmlock->l_resource->lr_type == LDLM_EXTENT) - weight = osc_lock_weight(env, obj, - 
dlmlock->l_policy_data.l_extent.start, - dlmlock->l_policy_data.l_extent.end); - else if (ldlm_has_dom(dlmlock)) - weight = osc_lock_weight(env, obj, 0, OBD_OBJECT_EOF); - /* The DOM bit can be cancelled at any time; in that case, we know - * there are no pages, so just return weight of 0 - */ - else - weight = 0; - + weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent); EXIT; out: @@ -766,7 +743,6 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) cl_env_put(env, &refcheck); return weight; } -EXPORT_SYMBOL(osc_ldlm_weigh_ast); static void osc_lock_build_einfo(const struct lu_env *env, const struct cl_lock *lock, @@ -793,46 +769,46 @@ static void osc_lock_build_einfo(const struct lu_env *env, * Additional policy can be implemented here, e.g., never do lockless-io * for large extents. */ -void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force) +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) { - struct cl_lock_slice *slice = &ols->ols_cl; - struct osc_io *oio = osc_env_io(env); - struct cl_io *io = oio->oi_cl.cis_io; - struct cl_object *obj = slice->cls_obj; - struct osc_object *oob = cl2osc(obj); - const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - struct obd_connect_data *ocd; - - LASSERT(ols->ols_state == OLS_NEW || - ols->ols_state == OLS_UPCALL_RECEIVED); - - if (force) { - ols->ols_locklessable = 1; - slice->cls_ops = ols->ols_lockless_ops; - } else { - LASSERT(io->ci_lockreq == CILR_MANDATORY || - io->ci_lockreq == CILR_MAYBE || - io->ci_lockreq == CILR_NEVER); - - ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; - ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && - (io->ci_lockreq == CILR_MAYBE) && - (ocd->ocd_connect_flags & - OBD_CONNECT_SRVLOCK); - if (io->ci_lockreq == CILR_NEVER || - /* lockless IO */ - (ols->ols_locklessable && osc_object_is_contended(oob)) || - /* lockless truncate */ - (cl_io_is_trunc(io) && osd->od_lockless_truncate && - (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK))) { - ols->ols_locklessable = 1; - slice->cls_ops = ols->ols_lockless_ops; - } - } - LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); + struct cl_lock_slice *slice = &ols->ols_cl; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } else { + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && + osd->od_lockless_truncate)) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); } -EXPORT_SYMBOL(osc_lock_to_lockless); static bool osc_lock_compatible(const struct osc_lock *qing, const struct osc_lock *qed) 
@@ -840,7 +816,7 @@ static bool osc_lock_compatible(const struct osc_lock *qing, struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr; struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; - if (qed->ols_glimpse || qed->ols_speculative) + if (qed->ols_glimpse) return true; if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) @@ -857,8 +833,9 @@ static bool osc_lock_compatible(const struct osc_lock *qing, return false; } -void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, - struct osc_lock *oscl) +static void osc_lock_wake_waiters(const struct lu_env *env, + struct osc_object *osc, + struct osc_lock *oscl) { spin_lock(&osc->oo_ol_spin); list_del_init(&oscl->ols_nextlock_oscobj); @@ -876,16 +853,14 @@ void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, } spin_unlock(&oscl->ols_lock); } -EXPORT_SYMBOL(osc_lock_wake_waiters); -int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, - struct osc_lock *oscl) +static int osc_lock_enqueue_wait(const struct lu_env *env, + struct osc_object *obj, struct osc_lock *oscl) { struct osc_lock *tmp_oscl; struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; int rc = 0; - ENTRY; spin_lock(&obj->oo_ol_spin); @@ -936,7 +911,6 @@ int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, RETURN(rc); } -EXPORT_SYMBOL(osc_lock_enqueue_wait); /** * Implementation of cl_lock_operations::clo_enqueue() method for osc @@ -960,7 +934,6 @@ static int osc_lock_enqueue(const struct lu_env *env, struct osc_io *oio = osc_env_io(env); struct osc_object *osc = cl2osc(slice->cls_obj); struct osc_lock *oscl = cl2osc_lock(slice); - struct obd_export *exp = osc_export(osc); struct cl_lock *lock = slice->cls_lock; struct ldlm_res_id *resname = &info->oti_resname; union ldlm_policy_data *policy = &info->oti_policy; @@ -977,22 +950,11 @@ static int osc_lock_enqueue(const struct lu_env *env, if (oscl->ols_state == OLS_GRANTED) RETURN(0); - if ((oscl->ols_flags & LDLM_FL_NO_EXPANSION) && - !(exp_connect_lockahead_old(exp) || exp_connect_lockahead(exp))) { - result = -EOPNOTSUPP; - CERROR("%s: server does not support lockahead/locknoexpand:" - "rc = %d\n", exp->exp_obd->obd_name, result); - RETURN(result); - } - if (oscl->ols_flags & LDLM_FL_TEST_LOCK) GOTO(enqueue_base, 0); - /* For glimpse and/or speculative locks, do not wait for reply from - * server on LDLM request */ - if (oscl->ols_glimpse || oscl->ols_speculative) { - /* Speculative and glimpse locks do not have an anchor */ - LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_agl, anchor == NULL)); async = true; GOTO(enqueue_base, 0); } @@ -1018,30 +980,25 @@ static int osc_lock_enqueue(const struct lu_env *env, /** * DLM lock's ast data must be osc_object; - * if glimpse or speculative lock, async of osc_enqueue_base() - * must be true - * - * For non-speculative locks: + * if glimpse or AGL lock, async of osc_enqueue_base() must be true, * DLM's enqueue callback set to osc_lock_upcall() with cookie as * osc_lock. 
- * For speculative locks: - * osc_lock_upcall_speculative & cookie is the osc object, since - * there is no osc_lock */ ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); osc_lock_build_policy(env, lock, policy); - if (oscl->ols_speculative) { + if (oscl->ols_agl) { oscl->ols_einfo.ei_cbdata = NULL; /* hold a reference for callback */ cl_object_get(osc2cl(osc)); - upcall = osc_lock_upcall_speculative; + upcall = osc_lock_upcall_agl; cookie = osc; } - result = osc_enqueue_base(exp, resname, &oscl->ols_flags, + result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags, policy, &oscl->ols_lvb, + osc->oo_oinfo->loi_kms_valid, upcall, cookie, &oscl->ols_einfo, PTLRPCD_SET, async, - oscl->ols_speculative); + oscl->ols_agl); if (result == 0) { if (osc_lock_is_lockless(oscl)) { oio->oi_lockless = 1; @@ -1050,12 +1007,9 @@ static int osc_lock_enqueue(const struct lu_env *env, LASSERT(oscl->ols_hold); LASSERT(oscl->ols_dlmlock != NULL); } - } else if (oscl->ols_speculative) { + } else if (oscl->ols_agl) { cl_object_put(env, osc2cl(osc)); - if (oscl->ols_glimpse) { - /* hide error for AGL request */ - result = 0; - } + result = 0; } out: @@ -1113,8 +1067,8 @@ static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) * * - cancels ldlm lock (ldlm_cli_cancel()). */ -void osc_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) +static void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) { struct osc_object *obj = cl2osc(slice->cls_obj); struct osc_lock *oscl = cl2osc_lock(slice); @@ -1130,10 +1084,9 @@ void osc_lock_cancel(const struct lu_env *env, osc_lock_wake_waiters(env, obj, oscl); EXIT; } -EXPORT_SYMBOL(osc_lock_cancel); -int osc_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) +static int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) { struct osc_lock *lock = cl2osc_lock(slice); @@ -1143,7 +1096,6 @@ int osc_lock_print(const struct lu_env *env, void *cookie, osc_lvb_print(env, cookie, p, &lock->ols_lvb); return 0; } -EXPORT_SYMBOL(osc_lock_print); static const struct cl_lock_operations osc_lock_ops = { .clo_fini = osc_lock_fini, @@ -1177,8 +1129,9 @@ static const struct cl_lock_operations osc_lock_lockless_ops = { .clo_print = osc_lock_print }; -void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, - struct cl_object *obj, struct osc_lock *oscl) +static void osc_lock_set_writer(const struct lu_env *env, + const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl) { struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; pgoff_t io_start; @@ -1188,9 +1141,9 @@ void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, return; if (likely(io->ci_type == CIT_WRITE)) { - io_start = cl_index(obj, io->u.ci_rw.crw_pos); - io_end = cl_index(obj, io->u.ci_rw.crw_pos + - io->u.ci_rw.crw_count - 1); + io_start = cl_index(obj, io->u.ci_rw.rw_range.cir_pos); + io_end = cl_index(obj, io->u.ci_rw.rw_range.cir_pos + + io->u.ci_rw.rw_range.cir_count - 1); } else { LASSERT(cl_io_is_mkwrite(io)); io_start = io_end = io->u.ci_fault.ft_index; @@ -1206,7 +1159,6 @@ void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, oio->oi_write_osclock = oscl; } } -EXPORT_SYMBOL(osc_lock_set_writer); int osc_lock_init(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, @@ -1224,23 +1176,15 @@ int osc_lock_init(const struct lu_env 
*env, INIT_LIST_HEAD(&oscl->ols_waiting_list); INIT_LIST_HEAD(&oscl->ols_wait_entry); INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); - oscl->ols_lockless_ops = &osc_lock_lockless_ops; - - /* Speculative lock requests must be either no_expand or glimpse - * request (CEF_GLIMPSE). non-glimpse no_expand speculative extent - * locks will break ofd_intent_cb. (see comment there)*/ - LASSERT(ergo((enqflags & CEF_SPECULATIVE) != 0, - (enqflags & (CEF_LOCK_NO_EXPAND | CEF_GLIMPSE)) != 0)); oscl->ols_flags = osc_enq2ldlm_flags(enqflags); - oscl->ols_speculative = !!(enqflags & CEF_SPECULATIVE); - + oscl->ols_agl = !!(enqflags & CEF_AGL); + if (oscl->ols_agl) + oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT; if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; oscl->ols_glimpse = 1; } - if (io->ci_ndelay && cl_object_same(io->ci_obj, obj)) - oscl->ols_flags |= LDLM_FL_NDELAY; osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); @@ -1264,10 +1208,9 @@ int osc_lock_init(const struct lu_env *env, * Finds an existing lock covering given index and optionally different from a * given \a except lock. */ -struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, - pgoff_t index, - enum osc_dap_flags dap_flags) +struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) { struct osc_thread_info *info = osc_env_info(env); struct ldlm_res_id *resname = &info->oti_resname; @@ -1291,9 +1234,9 @@ struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, * with a uniq gid and it conflicts with all other lock modes too */ again: - mode = osc_match_base(env, osc_export(obj), resname, LDLM_EXTENT, - policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, - obj, &lockh, dap_flags & OSC_DAP_FL_CANCELING); + mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, + LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh, + dap_flags & OSC_DAP_FL_CANCELING); if (mode != 0) { lock = ldlm_handle2lock(&lockh); /* RACE: the lock is cancelled so let's try again */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c index a99747cecf011..052f8bc90525c 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_object.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,9 +36,8 @@ */ #define DEBUG_SUBSYSTEM S_OSC -#include -#include "osc_internal.h" +#include "osc_cl_internal.h" /** \addtogroup osc * @{ @@ -46,27 +45,34 @@ /***************************************************************************** * - * Object operations. + * Type conversions. 
* */ -static void osc_obj_build_res_name(struct osc_object *osc, - struct ldlm_res_id *resname) + +static struct lu_object *osc2lu(struct osc_object *osc) { - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + return &osc->oo_cl.co_lu; } -static const struct osc_object_operations osc_object_ops = { - .oto_build_res_name = osc_obj_build_res_name, - .oto_dlmlock_at_pgoff = osc_obj_dlmlock_at_pgoff, -}; +static struct osc_object *lu2osc(const struct lu_object *obj) +{ + LINVRNT(osc_is_object(obj)); + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +/***************************************************************************** + * + * Object operations. + * + */ -int osc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) +static int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) { struct osc_object *osc = lu2osc(obj); const struct cl_object_conf *cconf = lu2cl_conf(conf); - osc->oo_oinfo = cconf->u.coc_oinfo; + osc->oo_oinfo = cconf->u.coc_oinfo; #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK mutex_init(&osc->oo_debug_mutex); #endif @@ -90,15 +96,12 @@ int osc_object_init(const struct lu_env *env, struct lu_object *obj, atomic_set(&osc->oo_nr_ios, 0); init_waitqueue_head(&osc->oo_io_waitq); - LASSERT(osc->oo_obj_ops != NULL); - cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); return 0; } -EXPORT_SYMBOL(osc_object_init); -void osc_object_free(const struct lu_env *env, struct lu_object *obj) +static void osc_object_free(const struct lu_env *env, struct lu_object *obj) { struct osc_object *osc = lu2osc(obj); @@ -120,24 +123,22 @@ void osc_object_free(const struct lu_env *env, struct lu_object *obj) lu_object_fini(obj); OBD_SLAB_FREE_PTR(osc, osc_object_kmem); } -EXPORT_SYMBOL(osc_object_free); int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb) + lu_printer_t p, const struct ost_lvb *lvb) { return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " "ctime: %llu blocks: %llu", lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); } -EXPORT_SYMBOL(osc_lvb_print); -int osc_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) +static int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) { - struct osc_object *osc = lu2osc(obj); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct osc_async_rc *ar = &oinfo->loi_ar; + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; (*p)(env, cookie, "id: "DOSTID" " "idx: %d gen: %d kms_valid: %u kms %llu " @@ -148,22 +149,20 @@ int osc_object_print(const struct lu_env *env, void *cookie, osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); return 0; } -EXPORT_SYMBOL(osc_object_print); -int osc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) +static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) { - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - cl_lvb2attr(attr, &oinfo->loi_lvb); - attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; - return 0; + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; + return 0; } -EXPORT_SYMBOL(osc_attr_get); -int osc_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) { struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; struct ost_lvb *lvb = &oinfo->loi_lvb; @@ -185,66 +184,39 @@ int osc_attr_update(const struct lu_env *env, struct cl_object *obj, } return 0; } -EXPORT_SYMBOL(osc_attr_update); -int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, - struct ost_lvb *lvb) +static int osc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) { - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - lvb->lvb_size = oinfo->loi_kms; - lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; - return 0; + ENTRY; + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + RETURN(0); } -EXPORT_SYMBOL(osc_object_glimpse); static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) { - struct osc_object *osc = (struct osc_object *)data; - struct ost_lvb *lvb = lock->l_lvb_data; - struct lov_oinfo *oinfo; ENTRY; - if (lock->l_ast_data == data) { + if (lock->l_ast_data == data) lock->l_ast_data = NULL; - - LASSERT(osc != NULL); - LASSERT(osc->oo_oinfo != NULL); - LASSERT(lvb != NULL); - - /* Updates lvb in lock by the cached oinfo */ - oinfo = osc->oo_oinfo; - - LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " - "%llu %llu %llu by oinfo size %llu blocks %llu " - "[cma]time %llu %llu %llu", lvb->lvb_size, - lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, - lvb->lvb_atime, oinfo->loi_lvb.lvb_size, - oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, - oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); - LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); - - cl_object_attr_lock(&osc->oo_cl); - memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); - cl_object_attr_unlock(&osc->oo_cl); - ldlm_clear_lvb_cached(lock); - } RETURN(LDLM_ITER_CONTINUE); } -int osc_object_prune(const struct lu_env *env, struct cl_object *obj) +static int osc_object_prune(const struct lu_env *env, struct cl_object *obj) { - struct osc_object *osc = cl2osc(obj); - struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; /* DLM locks don't hold a reference of osc_object so we have to * clear it before the object is being destroyed. 
*/ - osc_build_res_name(osc, resname); + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, osc_object_ast_clear, osc); return 0; } -EXPORT_SYMBOL(osc_object_prune); static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, struct ll_fiemap_info_key *fmkey, @@ -331,11 +303,24 @@ static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, RETURN(rc); } +void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = cfs_time_current(); + /* mb(); */ + obj->oo_contended = 1; +} + +void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + int osc_object_is_contended(struct osc_object *obj) { - struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); - time64_t osc_contention_time = dev->od_contention_time; - ktime_t retry_time; + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + int osc_contention_time = dev->od_contention_time; + cfs_time_t cur_time = cfs_time_current(); + cfs_time_t retry_time; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) return 1; @@ -343,19 +328,18 @@ int osc_object_is_contended(struct osc_object *obj) if (!obj->oo_contended) return 0; - /* - * I like copy-paste. the code is copied from - * ll_file_is_contended. - */ - retry_time = ktime_add_ns(obj->oo_contention_time, - osc_contention_time * NSEC_PER_SEC); - if (ktime_after(ktime_get(), retry_time)) { - osc_object_clear_contended(obj); - return 0; - } - return 1; + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = cfs_time_add(obj->oo_contention_time, + cfs_time_seconds(osc_contention_time)); + if (cfs_time_after(cur_time, retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; } -EXPORT_SYMBOL(osc_object_is_contended); /** * Implementation of struct cl_object_operations::coo_req_attr_set() for osc @@ -468,7 +452,6 @@ struct lu_object *osc_object_alloc(const struct lu_env *env, lu_object_init(obj, NULL, dev); osc->oo_cl.co_ops = &osc_ops; obj->lo_ops = &osc_lu_obj_ops; - osc->oo_obj_ops = &osc_object_ops; } else obj = NULL; return obj; @@ -495,5 +478,5 @@ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) RETURN(0); } -EXPORT_SYMBOL(osc_object_invalidate); + /** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c index a37c185772a00..c89d11333357d 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_page.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,9 +36,8 @@ */ #define DEBUG_SUBSYSTEM S_OSC -#include -#include "osc_internal.h" +#include "osc_cl_internal.h" static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); @@ -119,12 +118,12 @@ static const char *osc_list(struct list_head *head) return list_empty(head) ? 
"-" : "+"; } -static inline s64 osc_submit_duration(struct osc_page *opg) +static inline cfs_time_t osc_submit_duration(struct osc_page *opg) { - if (ktime_to_ns(opg->ops_submit_time) == 0) - return 0; + if (opg->ops_submit_time == 0) + return 0; - return ktime_ms_delta(ktime_get(), opg->ops_submit_time); + return (cfs_time_current() - opg->ops_submit_time); } static int osc_page_print(const struct lu_env *env, @@ -139,8 +138,8 @@ static int osc_page_print(const struct lu_env *env, return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " "1< %#x %d %u %s %s > " "2< %lld %u %u %#x %#x | %p %p %p > " - "3< %d %lld %d > " - "4< %d %d %d %lu %c | %s %s %s %s > " + "3< %d %lu %d > " + "4< %d %d %d %lu %s | %s %s %s %s > " "5< %s %s %s %s | %d %s | %d %s %s>\n", opg, osc_index(opg), /* 1 */ @@ -159,7 +158,7 @@ static int osc_page_print(const struct lu_env *env, cli->cl_r_in_flight, cli->cl_w_in_flight, cli->cl_max_rpcs_in_flight, cli->cl_avail_grant, - waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-', + osc_list(&cli->cl_cache_waiters), osc_list(&cli->cl_loi_ready_list), osc_list(&cli->cl_loi_hp_ready_list), osc_list(&cli->cl_loi_write_list), @@ -255,22 +254,12 @@ static int osc_page_flush(const struct lu_env *env, RETURN(rc); } -static void osc_page_touch(const struct lu_env *env, - const struct cl_page_slice *slice, size_t to) -{ - struct osc_page *opg = cl2osc_page(slice); - struct cl_object *obj = opg->ops_cl.cpl_obj; - - osc_page_touch_at(env, obj, osc_index(opg), to); -} - static const struct cl_page_operations osc_page_ops = { .cpo_print = osc_page_print, .cpo_delete = osc_page_delete, .cpo_clip = osc_page_clip, .cpo_cancel = osc_page_cancel, - .cpo_flush = osc_page_flush, - .cpo_page_touch = osc_page_touch, + .cpo_flush = osc_page_flush }; int osc_page_init(const struct lu_env *env, struct cl_object *obj, @@ -318,7 +307,6 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj, return result; } -EXPORT_SYMBOL(osc_page_init); /** * Helper function called by osc_io_submit() for every page in an immediate @@ -327,7 +315,6 @@ EXPORT_SYMBOL(osc_page_init); void osc_page_submit(const struct lu_env *env, struct osc_page *opg, enum cl_req_type crt, int brw_flags) { - struct osc_io *oio = osc_env_io(env); struct osc_async_page *oap = &opg->ops_oap; LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " @@ -340,12 +327,12 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg, oap->oap_count = opg->ops_to - opg->ops_from; oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; - if (oio->oi_cap_sys_resource) { + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { oap->oap_brw_flags |= OBD_BRW_NOQUOTA; oap->oap_cmd |= OBD_BRW_NOQUOTA; } - opg->ops_submit_time = ktime_get(); + opg->ops_submit_time = cfs_time_current(); osc_page_transfer_get(opg, "transfer\0imm"); osc_page_transfer_add(env, opg, crt); } @@ -529,22 +516,19 @@ static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) static void discard_pagevec(const struct lu_env *env, struct cl_io *io, struct cl_page **pvec, int max_index) { - struct pagevec *pagevec = &osc_env_info(env)->oti_pagevec; - int i; + int i; - ll_pagevec_init(pagevec, 0); - for (i = 0; i < max_index; i++) { - struct cl_page *page = pvec[i]; + for (i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; LASSERT(cl_page_is_owned(page, io)); cl_page_delete(env, page); cl_page_discard(env, io, page); cl_page_disown(env, io, page); - cl_pagevec_put(env, page, pagevec); + cl_page_put(env, page); - pvec[i] = NULL; - } - 
pagevec_release(pagevec); + pvec[i] = NULL; + } } /** @@ -604,7 +588,7 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; - io = osc_env_thread_io(env); + io = &osc_env_info(env)->oti_io; spin_lock(&cli->cl_lru_list_lock); if (force) @@ -706,7 +690,6 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } RETURN(count > 0 ? count : rc); } -EXPORT_SYMBOL(osc_lru_shrink); /** * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least @@ -799,7 +782,6 @@ static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); struct osc_io *oio = osc_env_io(env); int rc = 0; - ENTRY; if (cli->cl_cache == NULL) /* shall not be in LRU */ @@ -905,27 +887,17 @@ void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) #endif static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, - struct osc_brw_async_args *aa, int factor) { - int page_count; + int page_count = desc->bd_iov_count; void *zone = NULL; int count = 0; int i; - if (desc != NULL) { - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - page_count = desc->bd_iov_count; - } else { - page_count = aa->aa_page_count; - } + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); for (i = 0; i < page_count; i++) { - void *pz; - if (desc) - pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); - else - pz = page_zone(aa->aa_ppga[i]->pg); + void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); if (likely(pz == zone)) { ++count; @@ -944,16 +916,14 @@ static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, mod_zone_page_state(zone, NR_WRITEBACK, factor * count); } -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, - struct osc_brw_async_args *aa) +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) { - unstable_page_accounting(desc, aa, 1); + unstable_page_accounting(desc, 1); } -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, - struct osc_brw_async_args *aa) +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) { - unstable_page_accounting(desc, aa, -1); + unstable_page_accounting(desc, -1); } /** @@ -970,19 +940,12 @@ static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, void osc_dec_unstable_pages(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - int page_count; + int page_count = desc->bd_iov_count; long unstable_count; - if (desc) - page_count = desc->bd_iov_count; - else - page_count = aa->aa_page_count; - LASSERT(page_count >= 0); - - dec_unstable_page_accounting(desc, aa); + dec_unstable_page_accounting(desc); unstable_count = atomic_long_sub_return(page_count, &cli->cl_unstable_count); @@ -1004,20 +967,14 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req) void osc_inc_unstable_pages(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - long page_count; + long page_count = desc->bd_iov_count; /* No unstable page tracking */ if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) return; - if (desc) - page_count = desc->bd_iov_count; - else - page_count = aa->aa_page_count; - - 
add_unstable_page_accounting(desc, aa); + add_unstable_page_accounting(desc); atomic_long_add(page_count, &cli->cl_unstable_count); atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c index a0aaae784515a..7dcbbd79a5de0 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c @@ -23,14 +23,12 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. * * Code originally extracted from quota directory */ #include -#include - #include "osc_internal.h" static inline struct osc_quota_info *osc_oqi_alloc(u32 id) @@ -96,7 +94,7 @@ static inline u32 fl_quota_flag(int qtype) } } -int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], u64 valid, u32 flags) { int type; @@ -107,17 +105,6 @@ int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], if ((valid & (OBD_MD_FLALLQUOTA)) == 0) RETURN(0); - mutex_lock(&cli->cl_quota_mutex); - /* still mark the quots is running out for the old request, because it - * could be processed after the new request at OST, the side effect is - * the following request will be processed synchronously, but it will - * not break the quota enforcement. */ - if (cli->cl_quota_last_xid > xid && !(flags & OBD_FL_NO_QUOTA_ALL)) - GOTO(out_unlock, rc); - - if (cli->cl_quota_last_xid < xid) - cli->cl_quota_last_xid = xid; - for (type = 0; type < LL_MAXQUOTAS; type++) { struct osc_quota_info *oqi; @@ -164,8 +151,6 @@ int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], } } -out_unlock: - mutex_unlock(&cli->cl_quota_mutex); RETURN(rc); } @@ -245,8 +230,6 @@ int osc_quota_setup(struct obd_device *obd) int i, type; ENTRY; - mutex_init(&cli->cl_quota_mutex); - for (type = 0; type < LL_MAXQUOTAS; type++) { cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", HASH_QUOTA_CUR_BITS, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c index 80695d5805915..b50f4d6ee5019 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_request.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,21 +32,24 @@ #define DEBUG_SUBSYSTEM S_OSC -#include +#include + +#include + #include #include #include #include #include -#include +#include #include #include -#include +#include #include #include #include -#include +#include "osc_cl_internal.h" #include "osc_internal.h" atomic_t osc_pool_req_count; @@ -57,8 +60,17 @@ struct ptlrpc_request_pool *osc_rq_pool; static unsigned int osc_reqpool_mem_max = 5; module_param(osc_reqpool_mem_max, uint, 0444); -static int osc_idle_timeout = 20; -module_param(osc_idle_timeout, uint, 0644); +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; +}; #define osc_grant_args osc_brw_async_args @@ -81,6 +93,18 @@ struct osc_ladvise_args { void *la_cookie; }; +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + unsigned int oa_agl:1; +}; + static void osc_release_ppga(struct brw_page **ppga, size_t count); static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *data, int rc); @@ -386,34 +410,31 @@ static int osc_create(const struct lu_env *env, struct obd_export *exp, RETURN(rc); } -int osc_punch_send(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie) +int osc_punch_base(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) { - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - struct obd_import *imp = class_exp2cliimp(exp); - struct ost_body *body; - int rc; - - ENTRY; - - req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH); - if (req == NULL) - RETURN(-ENOMEM); - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); - if (rc < 0) { - ptlrpc_request_free(req); - RETURN(rc); - } + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct ost_body *body; + int rc; + ENTRY; - osc_set_io_portal(req); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); - ptlrpc_at_set_req_timeout(req); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - - lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa); + LASSERT(body); + lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); ptlrpc_request_set_replen(req); @@ -423,12 +444,13 @@ int osc_punch_send(struct obd_export *exp, struct obdo *oa, sa->sa_oa = oa; sa->sa_upcall = upcall; sa->sa_cookie = cookie; - - ptlrpcd_add_req(req); + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req); + else + ptlrpc_set_add_req(rqset, req); RETURN(0); } -EXPORT_SYMBOL(osc_punch_send); static int osc_sync_interpret(const struct lu_env *env, struct ptlrpc_request *req, @@ -651,18 +673,21 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_dirty = cli->cl_dirty_grant; else oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; - if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) { - CERROR("dirty %lu > dirty_max %lu\n", - 
cli->cl_dirty_pages, + if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > + cli->cl_dirty_max_pages)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty_pages, cli->cl_dirty_transit, cli->cl_dirty_max_pages); oa->o_undirty = 0; - } else if (unlikely(atomic_long_read(&obd_dirty_pages) > + } else if (unlikely(atomic_long_read(&obd_dirty_pages) - + atomic_long_read(&obd_dirty_transit_pages) > (long)(obd_max_dirty_pages + 1))) { /* The atomic_read() allowing the atomic_inc() are * not covered by a lock thus they may safely race and trip * this CERROR() unless we add in a small fudge factor (+1). */ - CERROR("%s: dirty %ld > system dirty_max %ld\n", + CERROR("%s: dirty %ld - %ld > system dirty_max %ld\n", cli_name(cli), atomic_long_read(&obd_dirty_pages), + atomic_long_read(&obd_dirty_transit_pages), obd_max_dirty_pages); oa->o_undirty = 0; } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > @@ -691,33 +716,23 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, /* Do not ask for more than OBD_MAX_GRANT - a margin for server * to add extent tax, etc. */ - oa->o_undirty = min(undirty, OBD_MAX_GRANT & - ~(PTLRPC_MAX_BRW_SIZE * 4UL)); + oa->o_undirty = min(undirty, OBD_MAX_GRANT - + (PTLRPC_MAX_BRW_PAGES << PAGE_SHIFT)*4UL); } oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; - /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */ - if (cli->cl_lost_grant > INT_MAX) { - CDEBUG(D_CACHE, - "%s: avoided o_dropped overflow: cl_lost_grant %lu\n", - cli_name(cli), cli->cl_lost_grant); - oa->o_dropped = INT_MAX; - } else { - oa->o_dropped = cli->cl_lost_grant; - } - cli->cl_lost_grant -= oa->o_dropped; + oa->o_dropped = cli->cl_lost_grant; + cli->cl_lost_grant = 0; spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu" - " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty, - oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant); + CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", + oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); } void osc_update_next_shrink(struct client_obd *cli) { - cli->cl_next_shrink_grant = ktime_get_seconds() + - cli->cl_grant_shrink_interval; - - CDEBUG(D_CACHE, "next time %lld to shrink grant\n", - cli->cl_next_shrink_grant); + cli->cl_next_shrink_grant = + cfs_time_shift(cli->cl_grant_shrink_interval); + CDEBUG(D_CACHE, "next time %ld to shrink grant \n", + cli->cl_next_shrink_grant); } static void __osc_update_grant(struct client_obd *cli, u64 grant) @@ -735,36 +750,30 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body) } } -/** - * grant thread data for shrinking space. 
- */ -struct grant_thread_data { - struct list_head gtd_clients; - struct mutex gtd_mutex; - unsigned long gtd_stopped:1; -}; -static struct grant_thread_data client_gtd; +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set); static int osc_shrink_grant_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *aa, int rc) + struct ptlrpc_request *req, + void *aa, int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; - struct ost_body *body; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; + struct ost_body *body; - if (rc != 0) { - __osc_update_grant(cli, oa->o_grant); - GOTO(out, rc); - } + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + GOTO(out, rc); + } - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - osc_update_grant(cli, body); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); out: - OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); - oa = NULL; - return rc; + OBDO_FREE(oa); + return rc; } static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) @@ -824,11 +833,6 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) osc_announce_cached(cli, &body->oa, 0); spin_lock(&cli->cl_loi_list_lock); - if (target_bytes >= cli->cl_avail_grant) { - /* available grant has changed since target calculation */ - spin_unlock(&cli->cl_loi_list_lock); - GOTO(out_free, rc = 0); - } body->oa.o_grant = cli->cl_avail_grant - target_bytes; cli->cl_avail_grant = target_bytes; spin_unlock(&cli->cl_loi_list_lock); @@ -844,25 +848,20 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) sizeof(*body), body, NULL); if (rc != 0) __osc_update_grant(cli, body->oa.o_grant); -out_free: OBD_FREE_PTR(body); RETURN(rc); } static int osc_should_shrink_grant(struct client_obd *client) { - time64_t next_shrink = client->cl_next_shrink_grant; - - if (client->cl_import == NULL) - return 0; + cfs_time_t time = cfs_time_current(); + cfs_time_t next_shrink = client->cl_next_shrink_grant; - if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) || - client->cl_import->imp_grant_shrink_disabled) { - osc_update_next_shrink(client); - return 0; - } + if ((client->cl_import->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_GRANT_SHRINK) == 0) + return 0; - if (ktime_get_seconds() >= next_shrink - 5) { + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { /* Get the current RPC size directly, instead of going via: * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) * Keep comment here so that it can be found by searching. 
*/ @@ -877,88 +876,41 @@ static int osc_should_shrink_grant(struct client_obd *client) return 0; } -#define GRANT_SHRINK_RPC_BATCH 100 - -static struct delayed_work work; - -static void osc_grant_work_handler(struct work_struct *data) +static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) { - struct client_obd *cli; - int rpc_sent; - bool init_next_shrink = true; - time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL; - - rpc_sent = 0; - mutex_lock(&client_gtd.gtd_mutex); - list_for_each_entry(cli, &client_gtd.gtd_clients, - cl_grant_chain) { - if (rpc_sent < GRANT_SHRINK_RPC_BATCH && - osc_should_shrink_grant(cli)) { - osc_shrink_grant(cli); - rpc_sent++; - } + struct client_obd *client; - if (!init_next_shrink) { - if (cli->cl_next_shrink_grant < next_shrink && - cli->cl_next_shrink_grant > ktime_get_seconds()) - next_shrink = cli->cl_next_shrink_grant; - } else { - init_next_shrink = false; - next_shrink = cli->cl_next_shrink_grant; - } + list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { + if (osc_should_shrink_grant(client)) + osc_shrink_grant(client); } - mutex_unlock(&client_gtd.gtd_mutex); - - if (client_gtd.gtd_stopped == 1) - return; - - if (next_shrink > ktime_get_seconds()) - schedule_delayed_work(&work, msecs_to_jiffies( - (next_shrink - ktime_get_seconds()) * - MSEC_PER_SEC)); - else - schedule_work(&work.work); -} - -/** - * Start grant thread for returing grant to server for idle clients. - */ -static int osc_start_grant_work(void) -{ - client_gtd.gtd_stopped = 0; - mutex_init(&client_gtd.gtd_mutex); - INIT_LIST_HEAD(&client_gtd.gtd_clients); - - INIT_DELAYED_WORK(&work, osc_grant_work_handler); - schedule_work(&work.work); - return 0; } -static void osc_stop_grant_work(void) +static int osc_add_shrink_grant(struct client_obd *client) { - client_gtd.gtd_stopped = 1; - cancel_delayed_work_sync(&work); -} + int rc; -static void osc_add_grant_list(struct client_obd *client) -{ - mutex_lock(&client_gtd.gtd_mutex); - list_add(&client->cl_grant_chain, &client_gtd.gtd_clients); - mutex_unlock(&client_gtd.gtd_mutex); + rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, + TIMEOUT_GRANT, + osc_grant_shrink_grant_cb, NULL, + &client->cl_grant_shrink_list); + if (rc) { + CERROR("add grant client %s error %d\n", cli_name(client), rc); + return rc; + } + CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client)); + osc_update_next_shrink(client); + return 0; } -static void osc_del_grant_list(struct client_obd *client) +static int osc_del_shrink_grant(struct client_obd *client) { - if (list_empty(&client->cl_grant_chain)) - return; - - mutex_lock(&client_gtd.gtd_mutex); - list_del_init(&client->cl_grant_chain); - mutex_unlock(&client_gtd.gtd_mutex); + return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, + TIMEOUT_GRANT); } -void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) { /* * ocd_grant is the total grant amount we're expect to hold: if we've @@ -972,19 +924,12 @@ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) spin_lock(&cli->cl_loi_list_lock); cli->cl_avail_grant = ocd->ocd_grant; if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { - unsigned long consumed = cli->cl_reserved_grant; - + cli->cl_avail_grant -= cli->cl_reserved_grant; if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) - consumed += cli->cl_dirty_grant; + cli->cl_avail_grant -= cli->cl_dirty_grant; else - consumed += 
cli->cl_dirty_pages << PAGE_SHIFT; - if (cli->cl_avail_grant < consumed) { - CERROR("%s: granted %ld but already consumed %ld\n", - cli_name(cli), cli->cl_avail_grant, consumed); - cli->cl_avail_grant = 0; - } else { - cli->cl_avail_grant -= consumed; - } + cli->cl_avail_grant -= + cli->cl_dirty_pages << PAGE_SHIFT; } if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { @@ -1018,10 +963,10 @@ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, cli->cl_max_extent_pages); - if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain)) - osc_add_grant_list(cli); + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); } -EXPORT_SYMBOL(osc_init_grant); /* We assume that the reason this OSC got a short read is because it read * beyond the end of a stripe file; i.e. lustre is reading a sparse file @@ -1088,8 +1033,8 @@ static int check_write_rcs(struct ptlrpc_request *req, return(-EPROTO); } } - if (req->rq_bulk != NULL && - req->rq_bulk->bd_nob_transferred != requested_nob) { + + if (req->rq_bulk->bd_nob_transferred != requested_nob) { CERROR("Unexpected # bytes transferred: %d (requested %d)\n", req->rq_bulk->bd_nob_transferred, requested_nob); return(-EPROTO); @@ -1101,9 +1046,9 @@ static int check_write_rcs(struct ptlrpc_request *req, static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) { if (p1->flag != p2->flag) { - unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_SYNC | - OBD_BRW_ASYNC | OBD_BRW_NOQUOTA | - OBD_BRW_SOFT_SYNC); + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | + OBD_BRW_SYNC | OBD_BRW_ASYNC | + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); /* warn if we try to combine flags that we don't know to be * safe to combine */ @@ -1118,128 +1063,23 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) return (p1->off + p1->count == p2->off); } -#if IS_ENABLED(CONFIG_CRC_T10DIF) -static int osc_checksum_bulk_t10pi(const char *obd_name, int nob, - size_t pg_count, struct brw_page **pga, - int opc, obd_dif_csum_fn *fn, - int sector_size, - u32 *check_sum) -{ - struct ahash_request *req; - /* Used Adler as the default checksum type on top of DIF tags */ - unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); - struct page *__page; - unsigned char *buffer; - __u16 *guard_start; - unsigned int bufsize; - int guard_number; - int used_number = 0; - int used; - u32 cksum; - int rc = 0; - int i = 0; - - LASSERT(pg_count > 0); - - __page = alloc_page(GFP_KERNEL); - if (__page == NULL) - return -ENOMEM; - - req = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", - obd_name, cfs_crypto_hash_name(cfs_alg), rc); - GOTO(out, rc); - } - - buffer = kmap(__page); - guard_start = (__u16 *)buffer; - guard_number = PAGE_SIZE / sizeof(*guard_start); - while (nob > 0 && pg_count > 0) { - unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; - - /* corrupt the data before we compute the checksum, to - * simulate an OST->client data error */ - if (unlikely(i == 0 && opc == OST_READ && - OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) { - unsigned char *ptr = kmap(pga[i]->pg); - int off = pga[i]->off & ~PAGE_MASK; - - memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); - kunmap(pga[i]->pg); - } - - /* - * The left guard number should be able to hold checksums of a - * whole page - */ - rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg, - pga[i]->off & ~PAGE_MASK, - count, - guard_start + used_number, - guard_number - used_number, - &used, sector_size, - fn); - if (rc) - break; - - used_number += used; - if (used_number == guard_number) { - cfs_crypto_hash_update_page(req, __page, 0, - used_number * sizeof(*guard_start)); - used_number = 0; - } - - nob -= pga[i]->count; - pg_count--; - i++; - } - kunmap(__page); - if (rc) - GOTO(out, rc); - - if (used_number != 0) - cfs_crypto_hash_update_page(req, __page, 0, - used_number * sizeof(*guard_start)); - - bufsize = sizeof(cksum); - cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); - - /* For sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo */ - if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - cksum++; - - *check_sum = cksum; -out: - __free_page(__page); - return rc; -} -#else /* !CONFIG_CRC_T10DIF */ -#define obd_dif_ip_fn NULL -#define obd_dif_crc_fn NULL -#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum) \ - -EOPNOTSUPP -#endif /* CONFIG_CRC_T10DIF */ - -static int osc_checksum_bulk(int nob, size_t pg_count, +static u32 osc_checksum_bulk(int nob, size_t pg_count, struct brw_page **pga, int opc, - enum cksum_types cksum_type, - u32 *cksum) + cksum_type_t cksum_type) { + u32 cksum; int i = 0; - struct ahash_request *req; + struct cfs_crypto_hash_desc *hdesc; unsigned int bufsize; unsigned char cfs_alg = cksum_obd2cfs(cksum_type); LASSERT(pg_count > 0); - req = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(req)) { + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { CERROR("Unable to initialize checksum hash %s\n", cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(req); + return PTR_ERR(hdesc); } while (nob > 0 && pg_count > 0) { @@ -1255,7 +1095,7 @@ static int osc_checksum_bulk(int nob, size_t pg_count, memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); kunmap(pga[i]->pg); } - cfs_crypto_hash_update_page(req, pga[i]->pg, + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, pga[i]->off & ~PAGE_MASK, count); LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", @@ -1266,38 +1106,15 @@ static int osc_checksum_bulk(int nob, size_t pg_count, i++; } - bufsize = sizeof(*cksum); - cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + bufsize = sizeof(cksum); + cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); /* For sending we only compute the wrong checksum instead * of corrupting the data so it is still correct on a redo */ if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - (*cksum)++; - - return 0; -} - -static int osc_checksum_bulk_rw(const char *obd_name, - enum cksum_types cksum_type, - int nob, size_t pg_count, - struct brw_page **pga, int opc, - u32 *check_sum) -{ - obd_dif_csum_fn *fn = NULL; - int sector_size = 0; - int rc; - - ENTRY; - obd_t10_cksum2dif(cksum_type, &fn, §or_size); - - if (fn) - rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga, - opc, fn, 
sector_size, check_sum); - else - rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type, - check_sum); + cksum++; - RETURN(rc); + return cksum; } static int @@ -1310,12 +1127,10 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, struct ost_body *body; struct obd_ioobj *ioobj; struct niobuf_remote *niobuf; - int niocount, i, requested_nob, opc, rc, short_io_size = 0; + int niocount, i, requested_nob, opc, rc; struct osc_brw_async_args *aa; struct req_capsule *pill; struct brw_page *pg_prev; - void *short_io_buf; - const char *obd_name = cli->cl_import->imp_obd->obd_name; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) @@ -1346,38 +1161,17 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, niocount * sizeof(*niobuf)); - for (i = 0; i < page_count; i++) - short_io_size += pga[i]->count; - - /* Check if read/write is small enough to be a short io. */ - if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 || - !imp_connect_shortio(cli->cl_import)) - short_io_size = 0; - - req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT, - opc == OST_READ ? 0 : short_io_size); - if (opc == OST_READ) - req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER, - short_io_size); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); if (rc) { ptlrpc_request_free(req); RETURN(rc); } - osc_set_io_portal(req); - - ptlrpc_at_set_req_timeout(req); + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own * retry logic */ req->rq_no_retry_einprogress = 1; - if (short_io_size != 0) { - desc = NULL; - short_io_buf = NULL; - goto no_bulk; - } - desc = ptlrpc_prep_bulk_imp(req, page_count, cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : @@ -1389,7 +1183,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, if (desc == NULL) GOTO(out, rc = -ENOMEM); /* NB request now owns desc and will free it when it gets freed */ -no_bulk: + body = req_capsule_client_get(pill, &RMF_OST_BODY); ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); @@ -1397,15 +1191,6 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid() - * and from_kgid(), because they are asynchronous. Fortunately, variable - * oa contains valid o_uid and o_gid in these two operations. - * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658. - * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking - * other process logic */ - body->oa.o_uid = oa->o_uid; - body->oa.o_gid = oa->o_gid; - obdo_to_ioobj(oa, ioobj); ioobj->ioo_bufcnt = niocount; /* The high bits of ioo_max_brw tells server _maximum_ number of bulks @@ -1413,26 +1198,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, * when the RPC is finally sent in ptlrpc_register_bulk(). It sends * "max - 1" for old client compatibility sending "0", and also so the * the actual maximum is a power-of-two number, not one less. 
LU-1431 */ - if (desc != NULL) - ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); - else /* short io */ - ioobj_max_brw_set(ioobj, 0); - - if (short_io_size != 0) { - if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { - body->oa.o_valid |= OBD_MD_FLFLAGS; - body->oa.o_flags = 0; - } - body->oa.o_flags |= OBD_FL_SHORT_IO; - CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n", - short_io_size); - if (opc == OST_WRITE) { - short_io_buf = req_capsule_client_get(pill, - &RMF_SHORT_IO); - LASSERT(short_io_buf != NULL); - } - } - + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); LASSERT(page_count > 0); pg_prev = pga[0]; for (requested_nob = i = 0; i < page_count; i++, niobuf++) { @@ -1457,19 +1223,9 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, pg_prev->pg->index, pg_prev->off); LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == (pg->flag & OBD_BRW_SRVLOCK)); - if (short_io_size != 0 && opc == OST_WRITE) { - unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0); - - LASSERT(short_io_size >= requested_nob + pg->count); - memcpy(short_io_buf + requested_nob, - ptr + poff, - pg->count); - ll_kunmap_atomic(ptr, KM_USER0); - } else if (short_io_size == 0) { - desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, - pg->count); - } - requested_nob += pg->count; + + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count); + requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { niobuf--; @@ -1505,31 +1261,22 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { /* store cl_cksum_type in a local variable since * it can be changed via lprocfs */ - enum cksum_types cksum_type = cli->cl_cksum_type; + cksum_type_t cksum_type = cli->cl_cksum_type; if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; - body->oa.o_flags |= obd_cksum_type_pack(obd_name, - cksum_type); + body->oa.o_flags |= cksum_type_pack(cksum_type); body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - - rc = osc_checksum_bulk_rw(obd_name, cksum_type, - requested_nob, page_count, - pga, OST_WRITE, - &body->oa.o_cksum); - if (rc < 0) { - CDEBUG(D_PAGE, "failed to checksum, rc = %d\n", - rc); - GOTO(out, rc); - } + body->oa.o_cksum = osc_checksum_bulk(requested_nob, + page_count, pga, + OST_WRITE, + cksum_type); CDEBUG(D_PAGE, "checksum at write origin: %x\n", body->oa.o_cksum); - /* save this in 'oa', too, for later checking */ oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - oa->o_flags |= obd_cksum_type_pack(obd_name, - cksum_type); + oa->o_flags |= cksum_type_pack(cksum_type); } else { /* clear out the checksum flag, in case this is a * resend but cl_checksum is no longer set. 
b=11238 */ @@ -1544,27 +1291,26 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; - body->oa.o_flags |= obd_cksum_type_pack(obd_name, - cli->cl_cksum_type); + body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - } + } /* Client cksum has been already copied to wire obdo in previous * lustre_set_wire_obdo(), and in the case a bulk-read is being * resent due to cksum error, this will allow Server to * check+dump pages on its side */ } - ptlrpc_request_set_replen(req); + ptlrpc_request_set_replen(req); - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oa = oa; - aa->aa_requested_nob = requested_nob; - aa->aa_nio_count = niocount; - aa->aa_page_count = page_count; - aa->aa_resends = 0; - aa->aa_ppga = pga; - aa->aa_cli = cli; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; INIT_LIST_HEAD(&aa->aa_oaps); *reqp = req; @@ -1643,17 +1389,13 @@ static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, } static int -check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, - __u32 client_cksum, __u32 server_cksum, - struct osc_brw_async_args *aa) -{ - const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name; - enum cksum_types cksum_type; - obd_dif_csum_fn *fn = NULL; - int sector_size = 0; - __u32 new_cksum; - char *msg; - int rc; +check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + __u32 new_cksum; + char *msg; + cksum_type_t cksum_type; if (server_cksum == client_cksum) { CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); @@ -1664,43 +1406,12 @@ check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, server_cksum, client_cksum); - cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); - - switch (cksum_type) { - case OBD_CKSUM_T10IP512: - fn = obd_dif_ip_fn; - sector_size = 512; - break; - case OBD_CKSUM_T10IP4K: - fn = obd_dif_ip_fn; - sector_size = 4096; - break; - case OBD_CKSUM_T10CRC512: - fn = obd_dif_crc_fn; - sector_size = 512; - break; - case OBD_CKSUM_T10CRC4K: - fn = obd_dif_crc_fn; - sector_size = 4096; - break; - default: - break; - } - - if (fn) - rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob, - aa->aa_page_count, aa->aa_ppga, - OST_WRITE, fn, sector_size, - &new_cksum); - else - rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, - aa->aa_ppga, OST_WRITE, cksum_type, - &new_cksum); + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
+ oa->o_flags : 0); + new_cksum = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type); - if (rc < 0) - msg = "failed to calculate the client write checksum"; - else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags)) + if (cksum_type != cksum_type_unpack(aa->aa_oa->o_flags)) msg = "the server did not use the checksum type specified in " "the original request - likely a protocol problem"; else if (new_cksum == server_cksum) @@ -1716,15 +1427,15 @@ check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, DFID " object "DOSTID" extent [%llu-%llu], original " "client csum %x (type %x), server csum %x (type %x)," " client csum now %x\n", - obd_name, msg, libcfs_nid2str(peer->nid), + aa->aa_cli->cl_import->imp_obd->obd_name, + msg, libcfs_nid2str(peer->nid), oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, POSTID(&oa->o_oi), aa->aa_ppga[0]->off, aa->aa_ppga[aa->aa_page_count - 1]->off + aa->aa_ppga[aa->aa_page_count-1]->count - 1, - client_cksum, - obd_cksum_type_unpack(aa->aa_oa->o_flags), + client_cksum, cksum_type_unpack(aa->aa_oa->o_flags), server_cksum, cksum_type, new_cksum); return 1; } @@ -1732,12 +1443,11 @@ check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, /* Note rc enters this function as number of bytes transferred */ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) { - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; - struct client_obd *cli = aa->aa_cli; - const char *obd_name = cli->cl_import->imp_obd->obd_name; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; const struct lnet_process_id *peer = - &req->rq_import->imp_connection->c_peer; - struct ost_body *body; + &req->rq_import->imp_connection->c_peer; + struct client_obd *cli = aa->aa_cli; + struct ost_body *body; u32 client_cksum = 0; ENTRY; @@ -1762,7 +1472,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) CDEBUG(D_QUOTA, "setdq for [%u %u %u] with valid %#llx, flags %x\n", body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, body->oa.o_valid, body->oa.o_flags); - osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, + osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags); } @@ -1779,9 +1489,9 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) CERROR("Unexpected +ve rc %d\n", rc); RETURN(-EPROTO); } + LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); - if (req->rq_bulk != NULL && - sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) RETURN(-EAGAIN); if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && @@ -1796,14 +1506,8 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) /* The rest of this function executes only for OST_READs */ - if (req->rq_bulk == NULL) { - rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO, - RCL_SERVER); - LASSERT(rc == req->rq_status); - } else { - /* if unwrap_bulk failed, return -EAGAIN to retry */ - rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); - } + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); if (rc < 0) GOTO(out, rc = -EAGAIN); @@ -1813,41 +1517,12 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) RETURN(-EPROTO); } - if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) { + if (rc != 
req->rq_bulk->bd_nob_transferred) { CERROR ("Unexpected rc %d (%d transferred)\n", rc, req->rq_bulk->bd_nob_transferred); return (-EPROTO); } - if (req->rq_bulk == NULL) { - /* short io */ - int nob, pg_count, i = 0; - unsigned char *buf; - - CDEBUG(D_CACHE, "Using short io read, size %d\n", rc); - pg_count = aa->aa_page_count; - buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO, - rc); - nob = rc; - while (nob > 0 && pg_count > 0) { - unsigned char *ptr; - int count = aa->aa_ppga[i]->count > nob ? - nob : aa->aa_ppga[i]->count; - - CDEBUG(D_CACHE, "page %p count %d\n", - aa->aa_ppga[i]->pg, count); - ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0); - memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf, - count); - ll_kunmap_atomic((void *) ptr, KM_USER0); - - buf += count; - nob -= count; - i++; - pg_count--; - } - } - if (rc < aa->aa_requested_nob) handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); @@ -1856,19 +1531,15 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) u32 server_cksum = body->oa.o_cksum; char *via = ""; char *router = ""; - enum cksum_types cksum_type; - u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ? - body->oa.o_flags : 0; - - cksum_type = obd_cksum_type_unpack(o_flags); - rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc, - aa->aa_page_count, aa->aa_ppga, - OST_READ, &client_cksum); - if (rc < 0) - GOTO(out, rc); - - if (req->rq_bulk != NULL && - peer->nid != req->rq_bulk->bd_sender) { + cksum_type_t cksum_type; + + cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? + body->oa.o_flags : 0); + client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, + aa->aa_ppga, OST_READ, + cksum_type); + + if (peer->nid != req->rq_bulk->bd_sender) { via = " via "; router = libcfs_nid2str(req->rq_bulk->bd_sender); } @@ -1888,7 +1559,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) "%s%s%s inode "DFID" object "DOSTID " extent [%llu-%llu], client %x, " "server %x, cksum_type %x\n", - obd_name, + req->rq_import->imp_obd->obd_name, libcfs_nid2str(peer->nid), via, router, clbody->oa.o_valid & OBD_MD_FLFID ? @@ -2042,14 +1713,13 @@ static int brw_interpret(const struct lu_env *env, struct osc_extent *ext; struct osc_extent *tmp; struct client_obd *cli = aa->aa_cli; - unsigned long transferred = 0; ENTRY; rc = osc_brw_fini_request(req, rc); CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); /* When server return -EINPROGRESS, client should always retry * regardless of the number of times the bulk was resent already. */ - if (osc_recoverable_error(rc) && !req->rq_no_delay) { + if (osc_recoverable_error(rc)) { if (req->rq_import_generation != req->rq_import->imp_generation) { CDEBUG(D_HA, "%s: resend cross eviction for object: " @@ -2123,26 +1793,20 @@ static int brw_interpret(const struct lu_env *env, cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); } - OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); - aa->aa_oa = NULL; + OBDO_FREE(aa->aa_oa); if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) osc_inc_unstable_pages(req); list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 1, - rc && req->rq_no_delay ? -EWOULDBLOCK : rc); + osc_extent_finish(env, ext, 1, rc); } LASSERT(list_empty(&aa->aa_exts)); LASSERT(list_empty(&aa->aa_oaps)); - transferred = (req->rq_bulk == NULL ? 
/* short io */ - aa->aa_requested_nob : - req->rq_bulk->bd_nob_transferred); - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, transferred); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); spin_lock(&cli->cl_loi_list_lock); /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters @@ -2200,11 +1864,9 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, int page_count = 0; bool soft_sync = false; bool interrupted = false; - bool ndelay = false; int i; int grant = 0; int rc; - __u32 layout_version = 0; struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); struct ost_body *body; ENTRY; @@ -2216,7 +1878,6 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, mem_tight |= ext->oe_memalloc; grant += ext->oe_grants; page_count += ext->oe_nr_pages; - layout_version = MAX(layout_version, ext->oe_layout_version); if (obj == NULL) obj = ext->oe_obj; } @@ -2229,7 +1890,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, if (pga == NULL) GOTO(out, rc = -ENOMEM); - OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + OBDO_ALLOC(oa); if (oa == NULL) GOTO(out, rc = -ENOMEM); @@ -2259,8 +1920,6 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, if (oap->oap_interrupted) interrupted = true; } - if (ext->oe_ndelay) - ndelay = true; } /* first page in the list */ @@ -2274,16 +1933,8 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, crattr->cra_oa = oa; cl_req_attr_set(env, osc2cl(obj), crattr); - if (cmd == OBD_BRW_WRITE) { + if (cmd == OBD_BRW_WRITE) oa->o_grant_used = grant; - if (layout_version > 0) { - CDEBUG(D_LAYOUT, DFID": write with layout version %u\n", - PFID(&oa->o_oi.oi_fid), layout_version); - - oa->o_layout_version = layout_version; - oa->o_valid |= OBD_MD_LAYOUT_VERSION; - } - } sort_brw_pages(pga, page_count); rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); @@ -2298,12 +1949,6 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, oap->oap_request = ptlrpc_request_addref(req); if (interrupted && !req->rq_intr) ptlrpc_mark_interrupted(req); - if (ndelay) { - req->rq_no_resend = req->rq_no_delay = 1; - /* probably set a shorter timeout value. - * to handle ETIMEDOUT in brw_interpret() correctly. */ - /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ - } /* Need to update the timestamps after the request is built in case * we race with setattr (locally or in queue at OST). If OST gets @@ -2312,7 +1957,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, * way to do this in a single call. 
bug 10150 */ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); crattr->cra_oa = &body->oa; - crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; + crattr->cra_flags = OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME; cl_req_attr_set(env, osc2cl(obj), crattr); lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); @@ -2357,7 +2002,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, LASSERT(req == NULL); if (oa) - OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); + OBDO_FREE(oa); if (pga) OBD_FREE(pga, sizeof(*pga) * page_count); /* this should happen rarely and is pretty bad, it makes the @@ -2390,10 +2035,10 @@ static int osc_set_lock_data(struct ldlm_lock *lock, void *data) return set; } -int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, - void *cookie, struct lustre_handle *lockh, - enum ldlm_mode mode, __u64 *flags, bool speculative, - int errcode) +static int osc_enqueue_fini(struct ptlrpc_request *req, + osc_enqueue_upcall_f upcall, void *cookie, + struct lustre_handle *lockh, enum ldlm_mode mode, + __u64 *flags, int agl, int errcode) { bool intent = *flags & LDLM_FL_HAS_INTENT; int rc; @@ -2410,7 +2055,7 @@ int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, ptlrpc_status_ntoh(rep->lock_policy_res1); if (rep->lock_policy_res1) errcode = rep->lock_policy_res1; - if (!speculative) + if (!agl) *flags |= LDLM_FL_LVB_READY; } else if (errcode == ELDLM_OK) { *flags |= LDLM_FL_LVB_READY; @@ -2425,11 +2070,12 @@ int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) ldlm_lock_decref(lockh, mode); - RETURN(rc); + RETURN(rc); } -int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, - struct osc_enqueue_args *aa, int rc) +static int osc_enqueue_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) { struct ldlm_lock *lock; struct lustre_handle *lockh = &aa->oa_lockh; @@ -2459,7 +2105,7 @@ int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, /* Let CP AST to grant the lock first. */ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); - if (aa->oa_speculative) { + if (aa->oa_agl) { LASSERT(aa->oa_lvb == NULL); LASSERT(aa->oa_flags == NULL); aa->oa_flags = &flags; @@ -2471,9 +2117,9 @@ int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, lockh, rc); /* Complete osc stuff. */ rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, - aa->oa_flags, aa->oa_speculative, rc); + aa->oa_flags, aa->oa_agl, rc); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); ldlm_lock_decref(lockh, mode); LDLM_LOCK_PUT(lock); @@ -2491,10 +2137,10 @@ struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; * release locks just after they are obtained. 
*/ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, - void *cookie, struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, - bool speculative) + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, int agl) { struct obd_device *obd = exp->exp_obd; struct lustre_handle lockh = { 0 }; @@ -2510,6 +2156,15 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; policy->l_extent.end |= ~PAGE_MASK; + /* + * kms is not valid when either object is completely fresh (so that no + * locks are cached), or object was evicted. In the latter case cached + * lock cannot be used, because it would prime inode state with + * potentially stale LVB. + */ + if (!kms_valid) + goto no_match; + /* Next, search for already existing extent locks that will cover us */ /* If we're trying to read, we also search for an existing PW lock. The * VFS and page cache already protect us locally, so lots of readers/ @@ -2525,10 +2180,7 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, mode = einfo->ei_mode; if (einfo->ei_mode == LCK_PR) mode |= LCK_PW; - /* Normal lock requests must wait for the LVB to be ready before - * matching a lock; speculative lock requests do not need to, - * because they will not actually use the lock. */ - if (!speculative) + if (agl == 0) match_flags |= LDLM_FL_LVB_READY; if (intent != 0) match_flags |= LDLM_FL_BLOCK_GRANTED; @@ -2541,22 +2193,13 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, RETURN(ELDLM_OK); matched = ldlm_handle2lock(&lockh); - if (speculative) { - /* This DLM lock request is speculative, and does not - * have an associated IO request. Therefore if there - * is already a DLM lock, it wll just inform the - * caller to cancel the request for this stripe.*/ - lock_res_and_lock(matched); - if (ldlm_extent_equal(&policy->l_extent, - &matched->l_policy_data.l_extent)) - rc = -EEXIST; - else - rc = -ECANCELED; - unlock_res_and_lock(matched); - + if (agl) { + /* AGL enqueues DLM locks speculatively. Therefore if + * it already exists a DLM lock, it wll just inform the + * caller to cancel the AGL process for this stripe. 
*/ ldlm_lock_decref(&lockh, mode); LDLM_LOCK_PUT(matched); - RETURN(rc); + RETURN(-ECANCELED); } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { *flags |= LDLM_FL_LVB_READY; @@ -2572,6 +2215,7 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, } } +no_match: if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) RETURN(-ENOLCK); @@ -2602,20 +2246,20 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, struct osc_enqueue_args *aa; CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); aa = ptlrpc_req_async_args(req); - aa->oa_exp = exp; - aa->oa_mode = einfo->ei_mode; - aa->oa_type = einfo->ei_type; + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; lustre_handle_copy(&aa->oa_lockh, &lockh); - aa->oa_upcall = upcall; - aa->oa_cookie = cookie; - aa->oa_speculative = speculative; - if (!speculative) { + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_agl = !!agl; + if (!agl) { aa->oa_flags = flags; aa->oa_lvb = lvb; } else { - /* speculative locks are essentially to enqueue - * a DLM lock in advance, so we don't care - * about the result of the enqueue. */ + /* AGL is essentially to enqueue an DLM lock + * in advance, so we don't care about the + * result of AGL enqueue. */ aa->oa_lvb = NULL; aa->oa_flags = NULL; } @@ -2633,17 +2277,16 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, } rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, - flags, speculative, rc); + flags, agl, rc); if (intent) ptlrpc_req_finished(req); RETURN(rc); } -int osc_match_base(const struct lu_env *env, struct obd_export *exp, - struct ldlm_res_id *res_id, enum ldlm_type type, - union ldlm_policy_data *policy, enum ldlm_mode mode, - __u64 *flags, struct osc_object *obj, +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + enum ldlm_type type, union ldlm_policy_data *policy, + enum ldlm_mode mode, __u64 *flags, void *data, struct lustre_handle *lockh, int unref) { struct obd_device *obd = exp->exp_obd; @@ -2671,19 +2314,11 @@ int osc_match_base(const struct lu_env *env, struct obd_export *exp, if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) RETURN(rc); - if (obj != NULL) { + if (data != NULL) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); LASSERT(lock != NULL); - if (osc_set_lock_data(lock, obj)) { - lock_res_and_lock(lock); - if (!ldlm_is_lvb_cached(lock)) { - LASSERT(lock->l_ast_data == obj); - osc_lock_lvb_update(env, obj, lock, NULL); - ldlm_set_lvb_cached(lock); - } - unlock_res_and_lock(lock); - } else { + if (!osc_set_lock_data(lock, data)) { ldlm_lock_decref(lockh, rc); rc = 0; } @@ -2726,13 +2361,13 @@ static int osc_statfs_interpret(const struct lu_env *env, } static int osc_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, time64_t max_age, + struct obd_info *oinfo, __u64 max_age, struct ptlrpc_request_set *rqset) { struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct osc_async_args *aa; - int rc; + int rc; ENTRY; /* We could possibly pass max_age in the request (as an absolute @@ -2750,35 +2385,34 @@ static int osc_statfs_async(struct obd_export *exp, ptlrpc_request_free(req); RETURN(rc); } - ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); - if (oinfo->oi_flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for 
avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; - ptlrpc_set_add_req(rqset, req); - RETURN(0); + ptlrpc_set_add_req(rqset, req); + RETURN(0); } static int osc_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, time64_t max_age, __u32 flags) + struct obd_statfs *osfs, __u64 max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct obd_statfs *msfs; - struct ptlrpc_request *req; - struct obd_import *imp = NULL; - int rc; - ENTRY; - + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + ENTRY; /*Since the request might also come from lprocfs, so we need *sync this with client_disconnect_export Bug15684*/ @@ -2789,48 +2423,49 @@ static int osc_statfs(const struct lu_env *env, struct obd_export *exp, if (!imp) RETURN(-ENODEV); - /* We could possibly pass max_age in the request (as an absolute - * timestamp or a "seconds.usec ago") so the target can avoid doing - * extra calls into the filesystem if that isn't necessary (e.g. - * during mount that would help a bit). Having relative timestamps - * is not so great if request processing is slow, while absolute - * timestamps are not ideal because they need time synchronization. */ - req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
*/ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); - class_import_put(imp); + class_import_put(imp); - if (req == NULL) - RETURN(-ENOMEM); + if (req == NULL) + RETURN(-ENOMEM); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); - if (flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } - rc = ptlrpc_queue_wait(req); - if (rc) - GOTO(out, rc); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); - msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); - if (msfs == NULL) - GOTO(out, rc = -EPROTO); + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + GOTO(out, rc = -EPROTO); + } - *osfs = *msfs; + *osfs = *msfs; - EXIT; -out: - ptlrpc_req_finished(req); - return rc; + EXIT; + out: + ptlrpc_req_finished(req); + return rc; } static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, @@ -2870,9 +2505,10 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, return err; } -int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, void *val, - struct ptlrpc_request_set *set) +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, + u32 vallen, void *val, + struct ptlrpc_request_set *set) { struct ptlrpc_request *req; struct obd_device *obd = exp->exp_obd; @@ -2959,23 +2595,23 @@ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? 
&RMF_OST_BODY : &RMF_SETINFO_VAL); - memcpy(tmp, val, vallen); + memcpy(tmp, val, vallen); if (KEY_IS(KEY_GRANT_SHRINK)) { - struct osc_grant_args *aa; - struct obdo *oa; - - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); - if (!oa) { - ptlrpc_req_finished(req); - RETURN(-ENOMEM); - } - *oa = ((struct ost_body *)val)->oa; - aa->aa_oa = oa; - req->rq_interpret_reply = osc_shrink_grant_interpret; - } + struct osc_grant_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBDO_ALLOC(oa); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } ptlrpc_request_set_replen(req); if (!KEY_IS(KEY_GRANT_SHRINK)) { @@ -2988,27 +2624,25 @@ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, RETURN(0); } -EXPORT_SYMBOL(osc_set_info_async); -int osc_reconnect(const struct lu_env *env, struct obd_export *exp, - struct obd_device *obd, struct obd_uuid *cluuid, - struct obd_connect_data *data, void *localdata) +static int osc_reconnect(const struct lu_env *env, + struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) { - struct client_obd *cli = &obd->u.cli; + struct client_obd *cli = &obd->u.cli; - if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { - long lost_grant; + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; long grant; spin_lock(&cli->cl_loi_list_lock); grant = cli->cl_avail_grant + cli->cl_reserved_grant; - if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) { - /* restore ocd_grant_blkbits as client page bits */ - data->ocd_grant_blkbits = PAGE_SHIFT; + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) grant += cli->cl_dirty_grant; - } else { + else grant += cli->cl_dirty_pages << PAGE_SHIFT; - } data->ocd_grant = grant ? : 2 * cli_brw_size(obd); lost_grant = cli->cl_lost_grant; cli->cl_lost_grant = 0; @@ -3021,36 +2655,37 @@ int osc_reconnect(const struct lu_env *env, struct obd_export *exp, RETURN(0); } -EXPORT_SYMBOL(osc_reconnect); -int osc_disconnect(struct obd_export *exp) +static int osc_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); int rc; - rc = client_disconnect_export(exp); - /** - * Initially we put del_shrink_grant before disconnect_export, but it - * causes the following problem if setup (connect) and cleanup - * (disconnect) are tangled together. - * connect p1 disconnect p2 - * ptlrpc_connect_import - * ............... class_manual_cleanup - * osc_disconnect - * del_shrink_grant - * ptlrpc_connect_interrupt - * osc_init_grant - * add this client to shrink list - * cleanup_osc - * Bang! grant shrink thread trigger the shrink. BUG18662 - */ - osc_del_grant_list(&obd->u.cli); - return rc; -} -EXPORT_SYMBOL(osc_disconnect); - -int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... 
class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * init_grant_shrink + * add this client to shrink list + * cleanup_osc + * Bang! pinger trigger the shrink. + * So the osc should be disconnected from the shrink list, after we + * are sure the import has been destroyed. BUG18662 + */ + if (obd->u.cli.cl_import == NULL) + osc_del_shrink_grant(&obd->u.cli); + return rc; +} + +static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, + struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) { struct lu_env *env = arg; struct ldlm_resource *res = cfs_hash_object(hs, hnode); @@ -3079,7 +2714,6 @@ int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, RETURN(0); } -EXPORT_SYMBOL(osc_ldlm_resource_invalidate); static int osc_import_event(struct obd_device *obd, struct obd_import *imp, @@ -3170,7 +2804,7 @@ static int osc_cancel_weight(struct ldlm_lock *lock) * Cancel all unused and granted extent lock. */ if (lock->l_resource->lr_type == LDLM_EXTENT && - ldlm_is_granted(lock) && + lock->l_granted_mode == lock->l_req_mode && osc_ldlm_weigh_ast(lock) == 0) RETURN(1); @@ -3187,12 +2821,15 @@ static int brw_queue_work(const struct lu_env *env, void *data) RETURN(0); } -int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { struct client_obd *cli = &obd->u.cli; - void *handler; - int rc; - + struct obd_type *type; + void *handler; + int rc; + int adding; + int added; + int req_count; ENTRY; rc = ptlrpcd_addref(); @@ -3203,10 +2840,9 @@ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(out_ptlrpcd, rc); - handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); if (IS_ERR(handler)) - GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); + GOTO(out_client_setup, rc = PTR_ERR(handler)); cli->cl_writeback_work = handler; handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); @@ -3219,43 +2855,36 @@ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(out_ptlrpcd_work, rc); cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; - osc_update_next_shrink(cli); - RETURN(rc); - -out_ptlrpcd_work: - if (cli->cl_writeback_work != NULL) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - if (cli->cl_lru_work != NULL) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; +#ifdef CONFIG_PROC_FS + obd->obd_vars = lprocfs_osc_obd_vars; +#endif + /* If this is true then both client (osc) and server (osp) are on the + * same node. The osp layer if loaded first will register the osc proc + * directory. In that case this obd_device will be attached its proc + * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. 
*/ + type = class_search_type(LUSTRE_OSP_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } else { + rc = lprocfs_obd_setup(obd); } - client_obd_cleanup(obd); -out_ptlrpcd: - ptlrpcd_decref(); - RETURN(rc); -} -EXPORT_SYMBOL(osc_setup_common); - -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct client_obd *cli = &obd->u.cli; - int adding; - int added; - int req_count; - int rc; - - ENTRY; - - rc = osc_setup_common(obd, lcfg); - if (rc < 0) - RETURN(rc); - rc = osc_tunables_init(obd); - if (rc) - RETURN(rc); + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. */ + if (rc == 0) { + lproc_osc_attach_seqstat(obd); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + } /* * We try to control the total number of requests with a upper limit @@ -3272,18 +2901,32 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) atomic_add(added, &osc_pool_req_count); } + INIT_LIST_HEAD(&cli->cl_grant_shrink_list); ns_register_cancel(obd->obd_namespace, osc_cancel_weight); spin_lock(&osc_shrink_lock); list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); spin_unlock(&osc_shrink_lock); - cli->cl_import->imp_idle_timeout = osc_idle_timeout; - cli->cl_import->imp_idle_debug = D_HA; RETURN(0); + +out_ptlrpcd_work: + if (cli->cl_writeback_work != NULL) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + if (cli->cl_lru_work != NULL) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } +out_client_setup: + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); } -int osc_precleanup_common(struct obd_device *obd) +static int osc_precleanup(struct obd_device *obd) { struct client_obd *cli = &obd->u.cli; ENTRY; @@ -3309,21 +2952,11 @@ int osc_precleanup_common(struct obd_device *obd) } obd_cleanup_client_import(obd); - RETURN(0); -} -EXPORT_SYMBOL(osc_precleanup_common); - -static int osc_precleanup(struct obd_device *obd) -{ - ENTRY; - - osc_precleanup_common(obd); - ptlrpc_lprocfs_unregister_obd(obd); RETURN(0); } -int osc_cleanup_common(struct obd_device *obd) +int osc_cleanup(struct obd_device *obd) { struct client_obd *cli = &obd->u.cli; int rc; @@ -3353,13 +2986,11 @@ int osc_cleanup_common(struct obd_device *obd) ptlrpcd_decref(); RETURN(rc); } -EXPORT_SYMBOL(osc_cleanup_common); int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) { - ssize_t count = class_modify_config(lcfg, PARAM_OSC, - &obd->obd_kset.kobj); - return count > 0 ? 0 : count; + int rc = class_process_proc_param(PARAM_OSC, obd->obd_vars, lcfg, obd); + return rc > 0 ? 
0: rc; } static int osc_process_config(struct obd_device *obd, size_t len, void *buf) @@ -3371,7 +3002,7 @@ static struct obd_ops osc_obd_ops = { .o_owner = THIS_MODULE, .o_setup = osc_setup, .o_precleanup = osc_precleanup, - .o_cleanup = osc_cleanup_common, + .o_cleanup = osc_cleanup, .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, @@ -3464,28 +3095,19 @@ static int __init osc_init(void) osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, ptlrpc_add_rqs_to_pool); - if (osc_rq_pool == NULL) - GOTO(out_type, rc = -ENOMEM); - - rc = osc_start_grant_work(); - if (rc != 0) - GOTO(out_req_pool, rc); - - RETURN(rc); - -out_req_pool: - ptlrpc_free_rq_pool(osc_rq_pool); + if (osc_rq_pool != NULL) + GOTO(out, rc); + rc = -ENOMEM; out_type: class_unregister_type(LUSTRE_OSC_NAME); out_kmem: lu_kmem_fini(osc_caches); - +out: RETURN(rc); } static void __exit osc_exit(void) { - osc_stop_grant_work(); remove_shrinker(osc_cache_shrinker); class_unregister_type(LUSTRE_OSC_NAME); lu_kmem_fini(osc_caches); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c index b9888d92b1fd8..9642a5644009f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,7 +34,6 @@ #define DEBUG_SUBSYSTEM S_RPC -#include #include #include #include @@ -127,12 +126,6 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, (ptlrpc_is_bulk_desc_kvec(type) && ops->add_iov_frag != NULL)); - if (max_brw > PTLRPC_BULK_OPS_COUNT) - RETURN(NULL); - - if (nfrags > LNET_MAX_IOV * max_brw) - RETURN(NULL); - OBD_ALLOC_PTR(desc); if (desc == NULL) return NULL; @@ -155,7 +148,6 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, desc->bd_portal = portal; desc->bd_type = type; desc->bd_md_count = 0; - desc->bd_nob_last = LNET_MTU; desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *) ops; LASSERT(max_brw > 0); desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); @@ -222,15 +214,7 @@ void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); kiov = &BD_GET_KIOV(desc, desc->bd_iov_count); - if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || - ((desc->bd_nob_last + len) > LNET_MTU)) { - desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; - desc->bd_md_count++; - desc->bd_nob_last = 0; - LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); - } - desc->bd_nob_last += len; desc->bd_nob += len; if (pin) @@ -256,15 +240,7 @@ int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type)); iovec = &BD_GET_KVEC(desc, desc->bd_iov_count); - if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || - ((desc->bd_nob_last + len) > LNET_MTU)) { - desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; - desc->bd_md_count++; - desc->bd_nob_last = 0; - LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); - } - desc->bd_nob_last += len; desc->bd_nob += len; iovec->iov_base = frag; @@ -282,7 +258,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ - 
LASSERT(desc->bd_refs == 0); /* network hands off */ + LASSERT(desc->bd_md_count == 0); /* network hands off */ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); LASSERT(desc->bd_frag_ops != NULL); @@ -377,7 +353,7 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) /* Adjust expected network latency */ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - timeout_t service_timeout) + unsigned int service_time) { unsigned int nl, oldnl; struct imp_at *at; @@ -385,9 +361,8 @@ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, LASSERT(req->rq_import); - if (service_timeout > now - req->rq_sent + 3) { - /* - * b=16408, however, this can also happen if early reply + if (service_time > now - req->rq_sent + 3) { + /* bz16408, however, this can also happen if early reply * is lost and client RPC is expired and resent, early reply * or reply of original RPC can still be fit in reply buffer * of resent RPC, now client is measuring time from the @@ -397,13 +372,13 @@ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? D_ADAPTTO : D_WARNING, "Reported service time %u > total measured time %lld\n", - service_timeout, now - req->rq_sent); + service_time, now - req->rq_sent); return; } /* Network latency is total time less server processing time */ nl = max_t(int, now - req->rq_sent - - service_timeout, 0) + 1; /* st rounding */ + service_time, 0) + 1; /* st rounding */ at = &req->rq_import->imp_at; oldnl = at_measured(&at->iat_net_latency, nl); @@ -444,7 +419,6 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) __must_hold(&req->rq_lock) { struct ptlrpc_request *early_req; - timeout_t service_timeout; time64_t olddl; int rc; @@ -474,8 +448,8 @@ __must_hold(&req->rq_lock) lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); /* Network latency can be adjusted, it is pure network delays */ - service_timeout = lustre_msg_get_service_timeout(early_req->rq_repmsg); - ptlrpc_at_adj_net_latency(req, service_timeout); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(early_req->rq_repmsg)); sptlrpc_cli_finish_early_reply(early_req); @@ -803,7 +777,6 @@ int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, LASSERT(!request->rq_pool); sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); out_free: - atomic_dec(&imp->imp_reqs); class_import_put(imp); return rc; @@ -872,7 +845,6 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, LASSERT(imp->imp_client != LP_POISON); request->rq_import = class_import_get(imp); - atomic_inc(&imp->imp_reqs); } else { CERROR("request allocation out of memory\n"); } @@ -880,33 +852,6 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, return request; } -static int ptlrpc_reconnect_if_idle(struct obd_import *imp) -{ - int rc; - - /* - * initiate connection if needed when the import has been - * referenced by the new request to avoid races with disconnect. - * serialize this check against conditional state=IDLE - * in ptlrpc_disconnect_idle_interpret() - */ - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_IDLE) { - imp->imp_generation++; - imp->imp_initiated_at = imp->imp_generation; - imp->imp_state = LUSTRE_IMP_NEW; - - /* connect_import_locked releases imp_lock */ - rc = ptlrpc_connect_import_locked(imp); - if (rc) - return rc; - ptlrpc_pinger_add_import(imp); - } else { - spin_unlock(&imp->imp_lock); - } - return 0; -} - /** * Helper function for creating a request. 
* Calls __ptlrpc_request_alloc to allocate new request sturcture and inits @@ -918,21 +863,11 @@ ptlrpc_request_alloc_internal(struct obd_import *imp, struct ptlrpc_request_pool * pool, const struct req_format *format) { - struct ptlrpc_request *request; - - request = __ptlrpc_request_alloc(imp, pool); - if (request == NULL) - return NULL; + struct ptlrpc_request *request; - /* don't make expensive check for idling connection - * if it's already connected */ - if (unlikely(imp->imp_state != LUSTRE_IMP_FULL)) { - if (ptlrpc_reconnect_if_idle(imp) < 0) { - atomic_dec(&imp->imp_reqs); - ptlrpc_request_free(request); - return NULL; - } - } + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; req_capsule_init(&request->rq_pill, request, RCL_CLIENT); req_capsule_set(&request->rq_pill, format); @@ -1021,6 +956,7 @@ struct ptlrpc_request_set *ptlrpc_prep_set(void) atomic_set(&set->set_remaining, 0); spin_lock_init(&set->set_new_req_lock); INIT_LIST_HEAD(&set->set_new_requests); + INIT_LIST_HEAD(&set->set_cblist); set->set_max_inflight = UINT_MAX; set->set_producer = NULL; set->set_producer_arg = NULL; @@ -1115,6 +1051,27 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *set) } EXPORT_SYMBOL(ptlrpc_set_destroy); +/** + * Add a callback function \a fn to the set. + * This function would be called when all requests on this set are completed. + * The function will be passed \a data argument. + */ +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data) +{ + struct ptlrpc_set_cbdata *cbdata; + + OBD_ALLOC_PTR(cbdata); + if (cbdata == NULL) + RETURN(-ENOMEM); + + cbdata->psc_interpret = fn; + cbdata->psc_data = data; + list_add_tail(&cbdata->psc_item, &set->set_cblist); + + RETURN(0); +} + /** * Add a new request to the general purpose request set. * Assumes request reference from the caller. @@ -1122,7 +1079,6 @@ EXPORT_SYMBOL(ptlrpc_set_destroy); void ptlrpc_set_add_req(struct ptlrpc_request_set *set, struct ptlrpc_request *req) { - LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE); LASSERT(list_empty(&req->rq_set_chain)); if (req->rq_allow_intr) @@ -1132,7 +1088,7 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set, list_add_tail(&req->rq_set_chain, &set->set_requests); req->rq_set = set; atomic_inc(&set->set_remaining); - req->rq_queued_time = ktime_get_seconds(); + req->rq_queued_time = cfs_time_current(); if (req->rq_reqmsg != NULL) lustre_msg_set_jobid(req->rq_reqmsg, NULL); @@ -1163,7 +1119,7 @@ void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, * The set takes over the caller's request reference. 
*/ req->rq_set = set; - req->rq_queued_time = ktime_get_seconds(); + req->rq_queued_time = cfs_time_current(); list_add_tail(&req->rq_set_chain, &set->set_new_requests); count = atomic_inc_return(&set->set_new_count); spin_unlock(&set->set_new_req_lock); @@ -1199,19 +1155,17 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, LASSERT (status != NULL); *status = 0; - if (req->rq_ctx_init || req->rq_ctx_fini) { - /* always allow ctx init/fini rpc go through */ - } else if (imp->imp_state == LUSTRE_IMP_NEW) { - DEBUG_REQ(D_ERROR, req, "Uninitialized import."); - *status = -EIO; + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { - unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg); - - /* pings or MDS-equivalent STATFS may safely race with umount */ - DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ? + /* pings may safely race with umount */ + DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? D_HA : D_ERROR, req, "IMP_CLOSED "); *status = -EIO; - } else if (ptlrpc_send_limit_expired(req)) { + } else if (ptlrpc_send_limit_expired(req)) { /* probably doesn't need to be a D_ERROR after initial testing*/ DEBUG_REQ(D_HA, req, "send limit expired "); *status = -ETIMEDOUT; @@ -1234,9 +1188,7 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, if (atomic_read(&imp->imp_inval_count) != 0) { DEBUG_REQ(D_ERROR, req, "invalidate in flight"); *status = -EIO; - } else if (req->rq_no_delay && - imp->imp_generation != imp->imp_initiated_at) { - /* ignore nodelay for requests initiating connections */ + } else if (req->rq_no_delay) { *status = -EWOULDBLOCK; } else if (req->rq_allow_replay && (imp->imp_state == LUSTRE_IMP_REPLAY || @@ -1261,12 +1213,16 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, * \retval false if no message should be printed * \retval true if console message should be printed */ -static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) +static bool ptlrpc_console_allow(struct ptlrpc_request *req) { + __u32 opc; + LASSERT(req->rq_reqmsg != NULL); + opc = lustre_msg_get_opc(req->rq_reqmsg); /* Suppress particular reconnect errors which are to be expected. */ if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + int err; /* Suppress timed out reconnect requests */ if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || @@ -1276,20 +1232,12 @@ static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) /* Suppress most unavailable/again reconnect requests, but * print occasionally so it is clear client is trying to * connect to a server where no target is running. 
*/ + err = lustre_msg_get_status(req->rq_repmsg); if ((err == -ENODEV || err == -EAGAIN) && req->rq_import->imp_conn_cnt % 30 != 20) return false; } - if (opc == LDLM_ENQUEUE && err == -EAGAIN) - /* -EAGAIN is normal when using POSIX flocks */ - return false; - - if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) && - (req->rq_xid & 0xf) != 10) - /* Suppress most ping requests, they may fail occasionally */ - return false; - return true; } @@ -1308,7 +1256,9 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) lnet_nid_t nid = imp->imp_connection->c_peer.nid; __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - if (ptlrpc_console_allow(req, opc, err)) + /* -EAGAIN is normal when using POSIX flocks */ + if (ptlrpc_console_allow(req) && + !(opc == LDLM_ENQUEUE && err == -EAGAIN)) LCONSOLE_ERROR_MSG(0x11, "%s: operation %s to node %s " "failed: rc = %d\n", imp->imp_obd->obd_name, @@ -1479,8 +1429,8 @@ static int after_reply(struct ptlrpc_request *req) if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(req->rq_repmsg)); rc = ptlrpc_check_status(req); imp->imp_connect_error = rc; @@ -1607,7 +1557,8 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) req->rq_waiting = 1; spin_unlock(&req->rq_lock); - DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)", + DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " + "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), ptlrpc_import_state_name(req->rq_send_state), ptlrpc_import_state_name(imp->imp_state)); LASSERT(list_empty(&req->rq_list)); @@ -1665,7 +1616,8 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) " %s:%s:%d:%llu:%s:%d\n", current_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg)); + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); rc = ptl_send_rpc(req, 0); if (rc == -ENOMEM) { @@ -1919,11 +1871,8 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_unlock(&imp->imp_lock); GOTO(interpret, req->rq_status); } - /* ignore on just initiated connections */ if (ptlrpc_no_resend(req) && - !req->rq_wait_ctx && - imp->imp_generation != - imp->imp_initiated_at) { + !req->rq_wait_ctx) { req->rq_status = -ENOTCONN; ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); @@ -2094,7 +2043,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - obd_import_nid2str(imp), + libcfs_nid2str(imp->imp_connection->c_peer.nid), lustre_msg_get_opc(req->rq_reqmsg)); spin_lock(&imp->imp_lock); @@ -2151,7 +2100,6 @@ EXPORT_SYMBOL(ptlrpc_check_set); int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) { struct obd_import *imp = req->rq_import; - unsigned int debug_mask = D_RPCTRACE; int rc = 0; ENTRY; @@ -2159,15 +2107,12 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) req->rq_timedout = 1; spin_unlock(&req->rq_lock); - if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), - lustre_msg_get_status(req->rq_reqmsg))) - debug_mask = D_WARNING; - DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", 
- req->rq_net_err ? "failed due to network error" : - ((req->rq_real_sent == 0 || + DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || req->rq_real_sent < req->rq_sent || req->rq_real_sent >= req->rq_deadline) ? - "timed out for sent delay" : "timed out for slow reply"), + "timed out for sent delay" : "timed out for slow reply"), (s64)req->rq_sent, (s64)req->rq_real_sent); if (imp != NULL && obd_debug_peer_on_timeout) @@ -2308,7 +2253,7 @@ static void ptlrpc_interrupted_set(void *data) /** * Get the smallest timeout in the set; this does NOT set a timeout. */ -time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) { struct list_head *tmp; time64_t now = ktime_get_real_seconds(); @@ -2361,14 +2306,13 @@ time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) * error or otherwise be interrupted). * Returns 0 on success or error code otherwise. */ -int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) +int ptlrpc_set_wait(struct ptlrpc_request_set *set) { - struct list_head *tmp; - struct ptlrpc_request *req; - struct l_wait_info lwi; - time64_t timeout; - int rc; - ENTRY; + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + int rc, timeout; + ENTRY; if (set->set_producer) (void)ptlrpc_set_producer(set); @@ -2383,13 +2327,13 @@ int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) if (list_empty(&set->set_requests)) RETURN(0); - do { - timeout = ptlrpc_set_next_timeout(set); + do { + timeout = ptlrpc_set_next_timeout(set); /* wait until all complete, interrupted, or an in-flight * req times out */ - CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n", - set, timeout); + CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", + set, timeout); if ((timeout == 0 && !signal_pending(current)) || set->set_allow_intr) @@ -2410,8 +2354,7 @@ int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? 
timeout : 1), ptlrpc_expired_set, set); - rc = l_wait_event(set->set_waitq, - ptlrpc_check_set(NULL, set), &lwi); + rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); /* LU-769 - if we ignored the signal because it was already * pending when we started, we need to handle it now or we risk @@ -2462,7 +2405,25 @@ int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) rc = req->rq_status; } - RETURN(rc); + if (set->set_interpret != NULL) { + int (*interpreter)(struct ptlrpc_request_set *set,void *,int) = + set->set_interpret; + rc = interpreter (set, set->set_arg, rc); + } else { + struct ptlrpc_set_cbdata *cbdata, *n; + int err; + + list_for_each_entry_safe(cbdata, n, + &set->set_cblist, psc_item) { + list_del_init(&cbdata->psc_item); + err = cbdata->psc_interpret(set, cbdata->psc_data, rc); + if (err && !rc) + rc = err; + OBD_FREE_PTR(cbdata); + } + } + + RETURN(rc); } EXPORT_SYMBOL(ptlrpc_set_wait); @@ -2512,13 +2473,9 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) sptlrpc_cli_free_repbuf(request); if (request->rq_import != NULL) { - if (!ptlrpcd_check_work(request)) { - LASSERT(atomic_read(&request->rq_import->imp_reqs) > 0); - atomic_dec(&request->rq_import->imp_reqs); - } - class_import_put(request->rq_import); - request->rq_import = NULL; - } + class_import_put(request->rq_import); + request->rq_import = NULL; + } if (request->rq_bulk != NULL) ptlrpc_free_bulk(request->rq_bulk); @@ -2722,11 +2679,8 @@ void ptlrpc_request_committed(struct ptlrpc_request *req, int force) return; } - if (force || req->rq_transno <= imp->imp_peer_committed_transno) { - if (imp->imp_replay_cursor == &req->rq_replay_list) - imp->imp_replay_cursor = req->rq_replay_list.next; + if (force || req->rq_transno <= imp->imp_peer_committed_transno) ptlrpc_free_request(req); - } spin_unlock(&imp->imp_lock); } @@ -2838,7 +2792,7 @@ void ptlrpc_cleanup_client(struct obd_import *imp) */ void ptlrpc_resend_req(struct ptlrpc_request *req) { - DEBUG_REQ(D_HA, req, "going to resend"); + DEBUG_REQ(D_HA, req, "going to resend"); spin_lock(&req->rq_lock); /* Request got reply but linked to the import list still. 
@@ -2849,13 +2803,14 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) return; } - req->rq_status = -EAGAIN; + lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); + req->rq_status = -EAGAIN; - req->rq_resend = 1; - req->rq_net_err = 0; - req->rq_timedout = 0; + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; - ptlrpc_client_wake_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); } @@ -2965,13 +2920,13 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) /* for distributed debugging */ lustre_msg_set_status(req->rq_reqmsg, current_pid()); - /* add a ref for the set (see comment in ptlrpc_set_add_req) */ - ptlrpc_request_addref(req); - ptlrpc_set_add_req(set, req); - rc = ptlrpc_set_wait(NULL, set); - ptlrpc_set_destroy(set); + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(ptlrpc_queue_wait); @@ -3011,6 +2966,7 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); spin_lock(&imp->imp_lock); imp->imp_vbr_failed = 1; + imp->imp_no_lock_replay = 1; spin_unlock(&imp->imp_lock); lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); } else { @@ -3024,6 +2980,9 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, } spin_lock(&imp->imp_lock); + /** if replays by version then gap occur on server, no trust to locks */ + if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) + imp->imp_no_lock_replay = 1; imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); spin_unlock(&imp->imp_lock); LASSERT(imp->imp_last_replay_transno); @@ -3122,15 +3081,14 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, */ int ptlrpc_replay_req(struct ptlrpc_request *req) { - struct ptlrpc_replay_async_args *aa; - - ENTRY; + struct ptlrpc_replay_async_args *aa; + ENTRY; - LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - memset(aa, 0, sizeof(*aa)); + LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof *aa); /* Prepare request to be resent with ptlrpcd */ aa->praa_old_state = req->rq_send_state; @@ -3146,8 +3104,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) /* Tell server the net_latency, so the server can calculate how long * it should wait for next replay */ - lustre_msg_set_service_timeout(req->rq_reqmsg, - ptlrpc_at_get_net_latency(req)); + lustre_msg_set_service_time(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); DEBUG_REQ(D_HA, req, "REPLAY"); atomic_inc(&req->rq_import->imp_replay_inflight); @@ -3168,12 +3126,11 @@ void ptlrpc_abort_inflight(struct obd_import *imp) struct list_head *tmp, *n; ENTRY; - /* - * Make sure that no new requests get processed for this import. + /* Make sure that no new requests get processed for this import. * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing * this flag and then putting requests on sending_list or delayed_list. */ - assert_spin_locked(&imp->imp_lock); + spin_lock(&imp->imp_lock); /* XXX locking? Maybe we should remove each request with the list * locked? 
Also, how do we know if the requests on the list are @@ -3215,6 +3172,8 @@ void ptlrpc_abort_inflight(struct obd_import *imp) if (imp->imp_replayable) ptlrpc_free_committed(imp); + spin_unlock(&imp->imp_lock); + EXIT; } @@ -3354,7 +3313,8 @@ void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req) /* For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so * that server can infer the number of bulks that were prepared, * see LU-1431 */ - req->rq_mbits += bd->bd_md_count - 1; + req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) / + LNET_MAX_IOV) - 1; /* Set rq_xid as rq_mbits to indicate the final bulk for the old * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. @@ -3482,7 +3442,7 @@ void *ptlrpcd_alloc_work(struct obd_import *imp, req->rq_no_delay = req->rq_no_resend = 1; req->rq_pill.rc_fmt = (void *)&worker_format; - CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); + CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args)); args = ptlrpc_req_async_args(req); args->cb = cb; args->cbdata = cbdata; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c index a3d31a853244c..fb302c70d08be 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -26,10 +26,9 @@ */ #include -#include +#include #ifdef LUSTRE_TRANSLATE_ERRNOS -#include /* * The two translation tables below must define a one-to-one mapping between @@ -186,20 +185,7 @@ static int lustre_errno_hton_mapping[] = { [ESERVERFAULT] = LUSTRE_ESERVERFAULT, [EBADTYPE] = LUSTRE_EBADTYPE, [EJUKEBOX] = LUSTRE_EJUKEBOX, - [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, - - /* - * The ELDLM errors are Lustre specific errors whose ranges - * lie in the middle of the above system errors. The ELDLM - * numbers must be preserved to avoid LU-9793. - */ - [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, - [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, - [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, - [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, - [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, - [ELDLM_NAMESPACE_EXISTS]= ELDLM_NAMESPACE_EXISTS, - [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED }; static int lustre_errno_ntoh_mapping[] = { @@ -345,20 +331,7 @@ static int lustre_errno_ntoh_mapping[] = { [LUSTRE_ESERVERFAULT] = ESERVERFAULT, [LUSTRE_EBADTYPE] = EBADTYPE, [LUSTRE_EJUKEBOX] = EJUKEBOX, - [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, - - /* - * The ELDLM errors are Lustre specific errors whose ranges - * lie in the middle of the above system errors. The ELDLM - * numbers must be preserved to avoid LU-9793. - */ - [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, - [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, - [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, - [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, - [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, - [ELDLM_NAMESPACE_EXISTS] = ELDLM_NAMESPACE_EXISTS, - [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED }; unsigned int lustre_errno_hton(unsigned int h) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c index 6c713b22b94ae..28533cca19a32 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. 
+ * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,11 +56,6 @@ void request_out_callback(struct lnet_event *ev) DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); - /* Do not update imp_next_ping for connection request */ - if (lustre_msg_get_opc(req->rq_reqmsg) != - req->rq_import->imp_connect_op) - ptlrpc_pinger_sending_on_import(req->rq_import); - sptlrpc_request_out_callback(req); spin_lock(&req->rq_lock); @@ -166,13 +161,12 @@ void reply_in_callback(struct lnet_event *ev) ev->mlength, ev->offset, req->rq_replen); } - if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) - req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); out_wake: - /* NB don't unlock till after wakeup; req can disappear under us - * since we don't have our own ref */ - ptlrpc_client_wake_req(req); + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); EXIT; } @@ -206,8 +200,8 @@ void client_bulk_callback(struct lnet_event *ev) spin_lock(&desc->bd_lock); req = desc->bd_req; - LASSERT(desc->bd_refs > 0); - desc->bd_refs--; + LASSERT(desc->bd_md_count > 0); + desc->bd_md_count--; if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { desc->bd_nob_transferred += ev->mlength; @@ -224,7 +218,7 @@ void client_bulk_callback(struct lnet_event *ev) /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ - if (desc->bd_refs == 0) + if (desc->bd_md_count == 0) ptlrpc_client_wake_req(desc->bd_req); spin_unlock(&desc->bd_lock); @@ -456,7 +450,7 @@ void server_bulk_callback(struct lnet_event *ev) spin_lock(&desc->bd_lock); - LASSERT(desc->bd_refs > 0); + LASSERT(desc->bd_md_count > 0); if ((ev->type == LNET_EVENT_ACK || ev->type == LNET_EVENT_REPLY) && @@ -472,9 +466,9 @@ void server_bulk_callback(struct lnet_event *ev) desc->bd_failure = 1; if (ev->unlinked) { - desc->bd_refs--; + desc->bd_md_count--; /* This is the last callback no matter what... */ - if (desc->bd_refs == 0) + if (desc->bd_md_count == 0) wake_up(&desc->bd_waitq); } @@ -506,14 +500,14 @@ static void ptlrpc_master_callback(struct lnet_event *ev) int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct lnet_process_id *peer, lnet_nid_t *self) { - int best_dist = 0; - __u32 best_order = 0; - int count = 0; - int rc = -ENOENT; - int dist; - __u32 order; - lnet_nid_t dst_nid; - lnet_nid_t src_nid; + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; peer->pid = LNET_PID_LUSTRE; @@ -528,7 +522,7 @@ int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, continue; if (dist == 0) { /* local! 
use loopback LND */ - peer->nid = *self = LNET_NID_LO_0; + peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); rc = 0; break; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h index a5f203e215389..a5bbaea6065d3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h @@ -21,16 +21,10 @@ struct gss_api_mech; -typedef int (*digest_hash)( - struct ahash_request *req, rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs); - /* The mechanism-independent gss-api context: */ struct gss_ctx { - struct gss_api_mech *mech_type; - void *internal_ctx_id; - digest_hash hash_func; + struct gss_api_mech *mech_type; + void *internal_ctx_id; }; #define GSS_C_NO_BUFFER ((rawobj_t) 0) @@ -50,7 +44,7 @@ __u32 lgss_copy_reverse_context( struct gss_ctx **ctx_new); __u32 lgss_inquire_context( struct gss_ctx *ctx, - time64_t *endtime); + unsigned long *endtime); __u32 lgss_get_mic( struct gss_ctx *ctx, int msgcnt, @@ -125,7 +119,7 @@ struct gss_api_ops { struct gss_ctx *ctx_new); __u32 (*gss_inquire_context)( struct gss_ctx *ctx, - time64_t *endtime); + unsigned long *endtime); __u32 (*gss_get_mic)( struct gss_ctx *ctx, int msgcnt, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c index 041dd12dac593..3f703372d272f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c index 70d4711c67a96..d1fa9200452ba 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -59,85 +60,82 @@ static int ctx_init_pack_request(struct obd_import *imp, - struct ptlrpc_request *req, - int lustre_srv, - uid_t uid, gid_t gid, - long token_size, - char __user *token) + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) { - struct lustre_msg *msg = req->rq_reqbuf; - struct gss_sec *gsec; - struct gss_header *ghdr; - struct ptlrpc_user_desc *pud; - __u32 *p, size, offset = 2; - rawobj_t obj; - - LASSERT(msg->lm_bufcount <= 4); - LASSERT(req->rq_cli_ctx); - LASSERT(req->rq_cli_ctx->cc_sec); - - /* gss hdr */ - ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); - ghdr->gh_version = PTLRPC_GSS_VERSION; - ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; - ghdr->gh_flags = 0; - ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; - ghdr->gh_seq = 0; - ghdr->gh_svc = SPTLRPC_SVC_NULL; - ghdr->gh_handle.len = 0; - - /* fix the user desc */ - if (req->rq_pack_udesc) { - ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; - - pud = lustre_msg_buf(msg, offset, sizeof(*pud)); - LASSERT(pud); - pud->pud_uid = pud->pud_fsuid = uid; - pud->pud_gid = pud->pud_fsgid = gid; - pud->pud_cap = 0; - pud->pud_ngroups = 0; - offset++; - } + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = 
lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } - /* new clients are expected to set KCSUM flag */ - ghdr->gh_flags |= LUSTRE_GSS_PACK_KCSUM; - - /* security payload */ - p = lustre_msg_buf(msg, offset, 0); - size = msg->lm_buflens[offset]; - LASSERT(p); - - /* 1. lustre svc type */ - LASSERT(size > 4); - *p++ = cpu_to_le32(lustre_srv); - size -= 4; - - /* 2. target uuid */ - obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; - obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; - if (rawobj_serialize(&obj, &p, &size)) - LBUG(); - - /* 3. reverse context handle. actually only needed by root user, - * but we send it anyway. */ - gsec = sec2gsec(req->rq_cli_ctx->cc_sec); - obj.len = sizeof(gsec->gs_rvs_hdl); - obj.data = (__u8 *) &gsec->gs_rvs_hdl; - if (rawobj_serialize(&obj, &p, &size)) - LBUG(); - - /* 4. now the token */ - LASSERT(size >= (sizeof(__u32) + token_size)); - *p++ = cpu_to_le32(((__u32) token_size)); + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); if (copy_from_user(p, token, token_size)) { - CERROR("can't copy token\n"); - return -EFAULT; - } - size -= sizeof(__u32) + cfs_size_round4(token_size); + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + cfs_size_round4(token_size); - req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, - msg->lm_buflens[offset] - size, 0); - return 0; + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; } static diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c index 7be412d2d4a72..17fd9cf3c00c1 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -55,12 +55,12 @@ #include "gss_internal.h" #include "gss_crypto.h" -int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, +int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, const int alg_mode) { int rc; - kb->kb_tfm = crypto_alloc_sync_skcipher(alg_name, alg_mode, 0); + kb->kb_tfm = crypto_alloc_blkcipher(alg_name, alg_mode, 0); if (IS_ERR(kb->kb_tfm)) { rc = PTR_ERR(kb->kb_tfm); kb->kb_tfm = NULL; @@ -69,8 +69,8 @@ int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, return rc; } - rc = crypto_sync_skcipher_setkey(kb->kb_tfm, kb->kb_key.data, - kb->kb_key.len); + rc = crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); if (rc) { CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, kb->kb_key.len, rc); @@ -84,7 +84,7 @@ void gss_keyblock_free(struct gss_keyblock *kb) { rawobj_free(&kb->kb_key); if (kb->kb_tfm) - crypto_free_sync_skcipher(kb->kb_tfm); + crypto_free_blkcipher(kb->kb_tfm); } int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) @@ -226,76 +226,86 @@ void gss_teardown_sgtable(struct sg_table *sgt) sg_free_table(sgt); } -int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, - const void *iv, const void *in, void *out, size_t length) +int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, + const void *in, void *out, size_t length) { + struct blkcipher_desc desc; struct scatterlist sg; struct sg_table sg_out; __u8 local_iv[16] = {0}; __u32 ret = -EINVAL; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(tfm); + desc.tfm = tfm; + desc.info = local_iv; + desc.flags = 0; - if (length % crypto_sync_skcipher_blocksize(tfm) != 0) { + if (length % crypto_blkcipher_blocksize(tfm) != 0) { CERROR("output length %zu mismatch blocksize %d\n", - length, crypto_sync_skcipher_blocksize(tfm)); + length, crypto_blkcipher_blocksize(tfm)); goto out; } - if (crypto_sync_skcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { - CERROR("iv size too large %d\n", - crypto_sync_skcipher_ivsize(tfm)); + if (crypto_blkcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", crypto_blkcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm)); - if (in != out) - memmove(out, in, length); + memcpy(out, in, length); ret = gss_setup_sgtable(&sg_out, &sg, out, length); if (ret != 0) goto out; - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, length, local_iv); - if (decrypt) - ret = 
crypto_skcipher_decrypt_iv(req, &sg, &sg, length); + ret = crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length); else - ret = crypto_skcipher_encrypt_iv(req, &sg, &sg, length); + ret = crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length); - skcipher_request_zero(req); gss_teardown_sgtable(&sg_out); out: return ret; } -int gss_digest_hash(struct ahash_request *req, - rawobj_t *hdr, int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs) +int gss_digest_hmac(struct crypto_hash *tfm, + rawobj_t *key, + rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) { + struct hash_desc desc = { + .tfm = tfm, + .flags = 0, + }; struct scatterlist sg[1]; struct sg_table sgt; - int rc = 0; int i; + int rc; + + rc = crypto_hash_setkey(tfm, key->data, key->len); + if (rc) + return rc; + + rc = crypto_hash_init(&desc); + if (rc) + return rc; for (i = 0; i < msgcnt; i++) { if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); + if (rc != 0) + return rc; + rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) return rc; - ahash_request_set_crypt(req, sg, NULL, msgs[i].len); - rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); - if (rc) - return rc; } for (i = 0; i < iovcnt; i++) { @@ -305,50 +315,59 @@ int gss_digest_hash(struct ahash_request *req, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, iovs[i].kiov_offset); - - ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); - rc = crypto_ahash_update(req); + rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); if (rc) return rc; } if (hdr) { - rc = gss_setup_sgtable(&sgt, sg, hdr->data, hdr->len); + rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); + if (rc != 0) + return rc; + rc = crypto_hash_update(&desc, sg, sizeof(hdr->len)); if (rc) return rc; - ahash_request_set_crypt(req, sg, NULL, hdr->len); - rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); - if (rc) - return rc; } - return rc; + return crypto_hash_final(&desc, cksum->data); } -int gss_digest_hash_compat(struct ahash_request *req, - rawobj_t *hdr, int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs) +int gss_digest_norm(struct crypto_hash *tfm, + struct gss_keyblock *kb, + rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) { + struct hash_desc desc; struct scatterlist sg[1]; struct sg_table sgt; - int rc = 0; - int i; + int i; + int rc; + + LASSERT(kb->kb_tfm); + desc.tfm = tfm; + desc.flags = 0; + + rc = crypto_hash_init(&desc); + if (rc) + return rc; for (i = 0; i < msgcnt; i++) { if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); - if (rc) + if (rc != 0) return rc; - ahash_request_set_crypt(req, sg, NULL, msgs[i].len); - rc = crypto_ahash_update(req); - gss_teardown_sgtable(&sgt); + rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) return rc; + + gss_teardown_sgtable(&sgt); } for (i = 0; i < iovcnt; i++) { @@ -358,26 +377,29 @@ int gss_digest_hash_compat(struct ahash_request *req, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, iovs[i].kiov_offset); - - ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); - rc = crypto_ahash_update(req); + rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); if (rc) return rc; } if (hdr) { - rc = gss_setup_sgtable(&sgt, sg, &(hdr->len), sizeof(hdr->len)); - if (rc) + rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); + if (rc != 0) return rc; - ahash_request_set_crypt(req, sg, 
NULL, sizeof(hdr->len)); - rc = crypto_ahash_update(req); - gss_teardown_sgtable(&sgt); + rc = crypto_hash_update(&desc, sg, sizeof(*hdr)); if (rc) return rc; + + gss_teardown_sgtable(&sgt); } - return rc; + rc = crypto_hash_final(&desc, cksum->data); + if (rc) + return rc; + + return gss_crypt_generic(kb->kb_tfm, 0, NULL, cksum->data, + cksum->data, cksum->len); } int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) @@ -400,10 +422,11 @@ int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) return 0; } -int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, +int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, int enc) { + struct blkcipher_desc desc; struct scatterlist src; struct scatterlist dst; struct sg_table sg_dst; @@ -411,13 +434,12 @@ int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, __u8 *buf; __u32 datalen = 0; int i, rc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - ENTRY; buf = outobj->data; - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); + desc.tfm = tfm; + desc.info = iv; + desc.flags = 0; for (i = 0; i < inobj_cnt; i++) { LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); @@ -434,30 +456,35 @@ int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, RETURN(rc); } - skcipher_request_set_crypt(req, &src, &dst, src.length, iv); - if (!iv) - skcipher_request_set_crypt_iv(req); - - if (enc) - rc = crypto_skcipher_encrypt_iv(req, &dst, &src, - src.length); - else - rc = crypto_skcipher_decrypt_iv(req, &dst, &src, - src.length); + if (iv) { + if (enc) + rc = crypto_blkcipher_encrypt_iv(&desc, &dst, + &src, + src.length); + else + rc = crypto_blkcipher_decrypt_iv(&desc, &dst, + &src, + src.length); + } else { + if (enc) + rc = crypto_blkcipher_encrypt(&desc, &dst, &src, + src.length); + else + rc = crypto_blkcipher_decrypt(&desc, &dst, &src, + src.length); + } gss_teardown_sgtable(&sg_src); gss_teardown_sgtable(&sg_dst); if (rc) { CERROR("encrypt error %d\n", rc); - skcipher_request_zero(req); RETURN(rc); } datalen += inobjs[i].len; buf += inobjs[i].len; } - skcipher_request_zero(req); outobj->len = datalen; RETURN(0); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h index 7ed680a4c8430..ad15cdedd66d5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -1,79 +1,14 @@ #ifndef PTLRPC_GSS_CRYPTO_H #define PTLRPC_GSS_CRYPTO_H -#include - #include "gss_internal.h" -#include - -/* - * linux v4.19-rc2-66-gb350bee5ea0f - * crypto: skcipher - Introduce crypto_sync_skcipher - * - * crypto_sync_skcipher will replace crypto_blkcipher so start using - * crypto_sync_skcipher and provide wrappers for older kernels - */ -#ifdef SYNC_SKCIPHER_REQUEST_ON_STACK - -#define crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ - crypto_skcipher_encrypt((desc)) - -#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ - crypto_skcipher_decrypt((desc)) - -#define skcipher_request_set_crypt_iv(d) - -#else /* ! 
SYNC_SKCIPHER_REQUEST_ON_STACK */ - -#define crypto_sync_skcipher crypto_blkcipher - -#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ - struct blkcipher_desc __##name##_obj, *name = (void *)&__##name##_obj - -#define skcipher_request_set_sync_tfm(d, _tfm) \ - do { (d)->tfm = _tfm; } while (0) - -#define skcipher_request_set_callback(d, f, c, data) \ - do { (d)->flags = f; } while (0) - -#define skcipher_request_set_crypt(d, src, dst, cryptlen, iv) \ - do { (d)->info = iv; } while (0) - -#define skcipher_request_set_crypt_iv(d) \ - do { (d)->info = crypto_blkcipher_crt((d)->tfm)->iv; } while (0) - -#define crypto_sync_skcipher_blocksize(tfm) \ - crypto_blkcipher_blocksize((tfm)) - -#define crypto_sync_skcipher_setkey(tfm, key, keylen) \ - crypto_blkcipher_setkey((tfm), (key), (keylen)) - -#define crypto_alloc_sync_skcipher(name, type, mask) \ - crypto_alloc_blkcipher((name), (type), (mask)) - -#define crypto_free_sync_skcipher(tfm) \ - crypto_free_blkcipher((tfm)) - -#define crypto_sync_skcipher_ivsize(tfm) \ - crypto_blkcipher_ivsize((tfm)) - -#define crypto_skcipher_encrypt_iv(desc, dst, src, len) \ - crypto_blkcipher_encrypt_iv((desc), (dst), (src), (len)) - -#define crypto_skcipher_decrypt_iv(desc, dst, src, len) \ - crypto_blkcipher_decrypt_iv((desc), (dst), (src), (len)) - -#define skcipher_request_zero(req) /* nop */ - -#endif /* SYNC_SKCIPHER_REQUEST_ON_STACK */ - struct gss_keyblock { - rawobj_t kb_key; - struct crypto_sync_skcipher *kb_tfm; + rawobj_t kb_key; + struct crypto_blkcipher *kb_tfm; }; -int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, +int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, const int alg_mode); void gss_keyblock_free(struct gss_keyblock *kb); int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); @@ -84,15 +19,16 @@ int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, const void *buf, unsigned int buf_len); void gss_teardown_sgtable(struct sg_table *sgt); -int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, - const void *iv, const void *in, void *out, size_t length); -int gss_digest_hash(struct ahash_request *req, rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs); -int gss_digest_hash_compat(struct ahash_request *req, - rawobj_t *hdr, int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs); +int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, + const void *in, void *out, size_t length); +int gss_digest_hmac(struct crypto_hash *tfm, rawobj_t *key, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum); +int gss_digest_norm(struct crypto_hash *tfm, struct gss_keyblock *kb, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, rawobj_t *cksum); int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); -int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, +int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, int enc); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c index 23506f89d67c2..3c4e63b992bee 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -50,6 +50,7 @@ #include #include #include +#include 
#include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h index c49a54021688f..eb86ba1627103 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -11,8 +11,7 @@ #ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ #define __PTLRPC_GSS_GSS_INTERNAL_H_ -#include -#include +#include #include /* @@ -73,16 +72,17 @@ int buffer_extract_bytes(const void **buf, __u32 *buflen, */ #define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ -static inline time64_t gss_round_ctx_expiry(time64_t expiry, - unsigned long sec_flags) +static inline +unsigned long gss_round_ctx_expiry(unsigned long expiry, + unsigned long sec_flags) { - if (sec_flags & PTLRPC_SEC_FL_REVERSE) - return expiry; + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; - if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) - return expiry - __TIMEOUT_DELTA; + if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; - return expiry; + return expiry; } /* @@ -117,9 +117,8 @@ enum ptlrpc_gss_tgt { }; enum ptlrpc_gss_header_flags { - LUSTRE_GSS_PACK_BULK = 1, - LUSTRE_GSS_PACK_USER = 2, - LUSTRE_GSS_PACK_KCSUM = 4, + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, }; static inline @@ -287,9 +286,9 @@ struct gss_cli_ctx { }; struct gss_cli_ctx_keyring { - struct gss_cli_ctx gck_base; - struct key *gck_key; - struct timer_list gck_timer; + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list *gck_timer; }; struct gss_sec { @@ -358,14 +357,6 @@ static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); } -#ifdef HAVE_CACHE_HASH_SPINLOCK -# define sunrpc_cache_lookup(c, i, h) sunrpc_cache_lookup_rcu((c), (i), (h)) -# define cache_read_lock(cdetail) spin_lock(&((cdetail)->hash_lock)) -# define cache_read_unlock(cdetail) spin_unlock(&((cdetail)->hash_lock)) -#else /* ! 
HAVE_CACHE_HASH_SPINLOCK */ -# define cache_read_lock(cdetail) read_lock(&((cdetail)->hash_lock)) -# define cache_read_unlock(cdetail) read_unlock(&((cdetail)->hash_lock)) -#endif #define GSS_CTX_INIT_MAX_LEN (1024) @@ -518,7 +509,6 @@ void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); int __init gss_init_svc_upcall(void); void gss_exit_svc_upcall(void); -extern unsigned int krb5_allow_old_client_csum; /* lproc_gss.c */ void gss_stat_oos_record_cli(int behind); @@ -564,13 +554,4 @@ void __dbg_memdump(char *name, void *ptr, int size) OBD_FREE(buf, bufsize); } -static inline unsigned int ll_read_key_usage(struct key *key) -{ -#ifdef HAVE_KEY_USAGE_REFCOUNT - return refcount_read(&key->usage); -#else - return atomic_read(&key->usage); -#endif -} - #endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c index 15bf99427489b..81aad1ffea6e2 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include @@ -60,10 +60,6 @@ #include "gss_internal.h" #include "gss_api.h" -#ifdef HAVE_GET_REQUEST_KEY_AUTH -#include -#endif - static struct ptlrpc_sec_policy gss_policy_keyring; static struct ptlrpc_ctx_ops gss_keyring_ctxops; static struct key_type gss_key_type; @@ -86,6 +82,45 @@ static int sec_install_rctx_kr(struct ptlrpc_sec *sec, * internal helpers * ****************************************/ +#define DUMP_PROCESS_KEYRINGS(tsk) \ +{ \ + CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): " \ + "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n", \ + tsk->comm, tsk->pid, tsk->uid, tsk->fsuid, \ + tsk->parent->comm, tsk->parent->pid, \ + tsk->parent->uid, tsk->parent->fsuid, \ + tsk->request_key_auth ? \ + tsk->request_key_auth->serial : 0, \ + key_cred(tsk)->thread_keyring ? \ + key_cred(tsk)->thread_keyring->serial : 0, \ + key_tgcred(tsk)->process_keyring ? \ + key_tgcred(tsk)->process_keyring->serial : 0, \ + key_tgcred(tsk)->session_keyring ? \ + key_tgcred(tsk)->session_keyring->serial : 0, \ + key_cred(tsk)->user->uid_keyring ? \ + key_cred(tsk)->user->uid_keyring->serial : 0, \ + key_cred(tsk)->user->session_keyring ? \ + key_cred(tsk)->user->session_keyring->serial : 0, \ + key_cred(tsk)->jit_keyring \ + ); \ +} + +#define DUMP_KEY(key) \ +{ \ + CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n", \ + key, key->serial, atomic_read(&key->usage), \ + key->uid, key->gid, \ + key->description ? 
key->description : "n/a" \ + ); \ +} + +#define key_cred(tsk) ((tsk)->cred) +#ifdef HAVE_CRED_TGCRED +#define key_tgcred(tsk) ((tsk)->cred->tgcred) +#else +#define key_tgcred(tsk) key_cred(tsk) +#endif + static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) { #ifdef HAVE_KEYRING_UPCALL_SERIALIZED @@ -105,12 +140,10 @@ static inline void key_revoke_locked(struct key *key) set_bit(KEY_FLAG_REVOKED, &key->flags); } -static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) +static void ctx_upcall_timeout_kr(unsigned long data) { - struct gss_cli_ctx_keyring *gctx_kr = cfs_from_timer(gctx_kr, - data, gck_timer); - struct ptlrpc_cli_ctx *ctx = &(gctx_kr->gck_base.gc_base); - struct key *key = gctx_kr->gck_key; + struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data; + struct key *key = ctx2gctx_keyring(ctx)->gck_key; CWARN("ctx %p, key %p\n", ctx, key); @@ -120,18 +153,22 @@ static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) key_revoke_locked(key); } -static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, time64_t timeout) +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout) { struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); - struct timer_list *timer = &gctx_kr->gck_timer; + struct timer_list *timer = gctx_kr->gck_timer; LASSERT(timer); - CDEBUG(D_SEC, "ctx %p: start timer %llds\n", ctx, timeout); + CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout); + timeout = msecs_to_jiffies(timeout * MSEC_PER_SEC) + + cfs_time_current(); + + init_timer(timer); + timer->expires = timeout; + timer->data = (unsigned long ) ctx; + timer->function = ctx_upcall_timeout_kr; - cfs_timer_setup(timer, ctx_upcall_timeout_kr, - (unsigned long)gctx_kr, 0); - timer->expires = cfs_time_seconds(timeout) + jiffies; add_timer(timer); } @@ -142,34 +179,47 @@ static void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) { struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); - struct timer_list *timer = &gctx_kr->gck_timer; + struct timer_list *timer = gctx_kr->gck_timer; + + if (timer == NULL) + return; CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); + gctx_kr->gck_timer = NULL; + del_singleshot_timer_sync(timer); + + OBD_FREE_PTR(timer); } static struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, struct vfs_cred *vcred) { - struct ptlrpc_cli_ctx *ctx; - struct gss_cli_ctx_keyring *gctx_kr; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; - OBD_ALLOC_PTR(gctx_kr); - if (gctx_kr == NULL) - return NULL; + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; - cfs_timer_setup(&gctx_kr->gck_timer, NULL, 0, 0); + OBD_ALLOC_PTR(gctx_kr->gck_timer); + if (gctx_kr->gck_timer == NULL) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } + init_timer(gctx_kr->gck_timer); - ctx = &gctx_kr->gck_base.gc_base; + ctx = &gctx_kr->gck_base.gc_base; - if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { - OBD_FREE_PTR(gctx_kr); - return NULL; - } + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr->gck_timer); + OBD_FREE_PTR(gctx_kr); + return NULL; + } - ctx->cc_expire = ktime_get_real_seconds() + KEYRING_UPCALL_TIMEOUT; + ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT; clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); atomic_inc(&ctx->cc_refcount); /* for the caller */ @@ -191,6 +241,7 @@ static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) LASSERT(gctx_kr->gck_key == NULL); ctx_clear_timer_kr(ctx); + LASSERT(gctx_kr->gck_timer == NULL); if 
(gss_cli_ctx_fini_common(sec, ctx)) return; @@ -337,7 +388,7 @@ static int key_set_payload(struct key *key, unsigned int index, static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) { LASSERT(atomic_read(&ctx->cc_refcount) > 0); - LASSERT(ll_read_key_usage(key) > 0); + LASSERT(atomic_read(&key->usage) > 0); LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); LASSERT(!key_get_payload(key, 0)); @@ -510,17 +561,17 @@ void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *new_ctx, struct key *key) { - struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); - struct hlist_node __maybe_unused *hnode; - struct ptlrpc_cli_ctx *ctx; - time64_t now; - + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *hnode; + struct ptlrpc_cli_ctx *ctx; + cfs_time_t now; ENTRY; - LASSERT(sec_is_reverse(sec)); + + LASSERT(sec_is_reverse(sec)); spin_lock(&sec->ps_lock); - now = ktime_get_real_seconds(); + now = cfs_time_current_sec(); /* set all existing ctxs short expiry */ cfs_hlist_for_each_entry(ctx, hnode, &gsec_kr->gsk_clist, cc_cache) { @@ -615,104 +666,39 @@ static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) return 0; } -/* - * kernel 5.3: commit 0f44e4d976f96c6439da0d6717238efa4b91196e - * keys: Move the user and user-session keyrings to the user_namespace - * - * When lookup_user_key is available use the kernel API rather than directly - * accessing the uid_keyring and session_keyring via the current process - * credentials. - */ -#ifdef HAVE_LOOKUP_USER_KEY - -/* from Linux security/keys/internal.h: */ -#ifndef KEY_LOOKUP_FOR_UNLINK -#define KEY_LOOKUP_FOR_UNLINK 0x04 -#endif - -static struct key *_user_key(key_serial_t id) -{ - key_ref_t ref; - - might_sleep(); - ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); - if (IS_ERR(ref)) - return NULL; - return key_ref_to_ptr(ref); -} - -static inline struct key *get_user_session_keyring(const struct cred *cred) -{ - return _user_key(KEY_SPEC_USER_SESSION_KEYRING); -} - -static inline struct key *get_user_keyring(const struct cred *cred) -{ - return _user_key(KEY_SPEC_USER_KEYRING); -} -#else -static inline struct key *get_user_session_keyring(const struct cred *cred) -{ - return key_get(cred->user->session_keyring); -} - -static inline struct key *get_user_keyring(const struct cred *cred) -{ - return key_get(cred->user->uid_keyring); -} -#endif - /* * unlink request key from it's ring, which is linked during request_key(). * sadly, we have to 'guess' which keyring it's linked to. * - * FIXME this code is fragile, it depends on how request_key() is implemented. + * FIXME this code is fragile, depend on how request_key_link() is implemented. 
*/ static void request_key_unlink(struct key *key) { - const struct cred *cred = current_cred(); - struct key *ring = NULL; + struct task_struct *tsk = current; + struct key *ring; - switch (cred->jit_keyring) { + switch (key_cred(tsk)->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: - case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: -#ifdef HAVE_GET_REQUEST_KEY_AUTH - if (cred->request_key_auth) { - struct request_key_auth *rka; - struct key *authkey = cred->request_key_auth; - - down_read(&authkey->sem); - rka = get_request_key_auth(authkey); - if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) - ring = key_get(rka->dest_keyring); - up_read(&authkey->sem); - if (ring) - break; - } -#endif - /* fall through */ case KEY_REQKEY_DEFL_THREAD_KEYRING: - ring = key_get(cred->thread_keyring); + ring = key_get(key_cred(tsk)->thread_keyring); if (ring) break; - /* fallthrough */ case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ring = key_get(cred->process_keyring); + ring = key_get(key_tgcred(tsk)->process_keyring); if (ring) break; - /* fallthrough */ case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); - ring = key_get(rcu_dereference(cred->session_keyring)); + ring = key_get(rcu_dereference(key_tgcred(tsk) + ->session_keyring)); rcu_read_unlock(); if (ring) break; - /* fallthrough */ case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - ring = get_user_session_keyring(cred); + ring = key_get(key_cred(tsk)->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - ring = get_user_keyring(cred); + ring = key_get(key_cred(tsk)->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: @@ -877,7 +863,7 @@ struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, if (likely(ctx)) { LASSERT(atomic_read(&ctx->cc_refcount) >= 1); LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); - LASSERT(ll_read_key_usage(key) >= 2); + LASSERT(atomic_read(&key->usage) >= 2); /* simply take a ref and return. it's upper layer's * responsibility to detect & replace dead ctx. */ @@ -1081,13 +1067,13 @@ void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) static int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) { - struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); - struct hlist_node __maybe_unused *pos, *next; - struct ptlrpc_cli_ctx *ctx; - struct gss_cli_ctx *gctx; - time64_t now = ktime_get_real_seconds(); - + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time_t now = cfs_time_current_sec(); ENTRY; + spin_lock(&sec->ps_lock); cfs_hlist_for_each_entry_safe(ctx, pos, next, &gsec_kr->gsk_clist, cc_cache) { @@ -1107,8 +1093,9 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) snprintf(mech, sizeof(mech), "N/A"); mech[sizeof(mech) - 1] = '\0'; - seq_printf(seq, - "%p: uid %u, ref %d, expire %lld(%+lld), fl %s, seq %d, win %u, key %08x(ref %d), hdl %#llx:%#llx, mech: %s\n", + seq_printf(seq, "%p: uid %u, ref %d, expire %lu(%+ld), fl %s, " + "seq %d, win %u, key %08x(ref %d), " + "hdl %#llx:%#llx, mech: %s\n", ctx, ctx->cc_vcred.vc_uid, atomic_read(&ctx->cc_refcount), ctx->cc_expire, @@ -1117,7 +1104,7 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) atomic_read(&gctx->gc_seq), gctx->gc_win, key ? key->serial : 0, - key ? ll_read_key_usage(key) : 0, + key ? 
atomic_read(&key->usage) : 0, gss_handle_to_u64(&gctx->gc_handle), gss_handle_to_u64(&gctx->gc_svc_handle), mech); @@ -1134,16 +1121,8 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) static int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) { - /* upcall is already on the way */ - struct gss_cli_ctx *gctx = ctx ? ctx2gctx(ctx) : NULL; - - /* record latest sequence number in buddy svcctx */ - if (gctx && !rawobj_empty(&gctx->gc_svc_handle) && - sec_is_reverse(gctx->gc_base.cc_sec)) { - return gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, - (__u32)atomic_read(&gctx->gc_seq)); - } - return 0; + /* upcall is already on the way */ + return 0; } static @@ -1346,15 +1325,15 @@ int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) * the session keyring is created upon upcall, and don't change all * the way until upcall finished, so rcu lock is not needed here. */ - LASSERT(current_cred()->session_keyring); + LASSERT(key_tgcred(current)->session_keyring); lockdep_off(); - rc = key_link(current_cred()->session_keyring, key); + rc = key_link(key_tgcred(current)->session_keyring, key); lockdep_on(); if (unlikely(rc)) { CERROR("failed to link key %08x to keyring %08x: %d\n", key->serial, - current_cred()->session_keyring->serial, rc); + key_tgcred(current)->session_keyring->serial, rc); RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h index 611160458d9b1..97ad55e3025c0 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -80,7 +80,7 @@ struct krb5_ctx { kc_cfx:1, kc_seed_init:1, kc_have_acceptor_subkey:1; - time64_t kc_endtime; + __s32 kc_endtime; __u8 kc_seed[16]; __u64 kc_seq_send; __u64 kc_seq_recv; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c index bd3a94ba162b3..000d7a8e87b47 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -94,20 +95,18 @@ static struct krb5_enctype enctypes[] = { .ke_hash_size = 16, .ke_conf_size = 8, }, -#ifdef HAVE_DES3_SUPPORT [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ .ke_dispname = "des3-hmac-sha1", .ke_enc_name = "cbc(des3_ede)", - .ke_hash_name = "sha1", + .ke_hash_name = "hmac(sha1)", .ke_hash_size = 20, .ke_conf_size = 8, .ke_hash_hmac = 1, }, -#endif [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ .ke_dispname = "aes128-cts-hmac-sha1-96", .ke_enc_name = "cbc(aes)", - .ke_hash_name = "sha1", + .ke_hash_name = "hmac(sha1)", .ke_hash_size = 12, .ke_conf_size = 16, .ke_hash_hmac = 1, @@ -115,7 +114,7 @@ static struct krb5_enctype enctypes[] = { [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ .ke_dispname = "aes256-cts-hmac-sha1-96", .ke_enc_name = "cbc(aes)", - .ke_hash_name = "sha1", + .ke_hash_name = "hmac(sha1)", .ke_hash_size = 12, .ke_conf_size = 16, .ke_hash_hmac = 1, @@ -123,31 +122,33 @@ static struct krb5_enctype enctypes[] = { [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ .ke_dispname = "arcfour-hmac-md5", .ke_enc_name = "ecb(arc4)", - .ke_hash_name = "md5", + .ke_hash_name = "hmac(md5)", .ke_hash_size = 16, .ke_conf_size = 8, .ke_hash_hmac = 1, } }; +#define MAX_ENCTYPES sizeof(enctypes)/sizeof(struct krb5_enctype) + static const char * enctype2str(__u32 
enctype) { - if (enctype < ARRAY_SIZE(enctypes) && enctypes[enctype].ke_dispname) - return enctypes[enctype].ke_dispname; + if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; - return "unknown"; + return "unknown"; } static int krb5_init_keys(struct krb5_ctx *kctx) { - struct krb5_enctype *ke; + struct krb5_enctype *ke; - if (kctx->kc_enctype >= ARRAY_SIZE(enctypes) || - enctypes[kctx->kc_enctype].ke_hash_size == 0) { - CERROR("unsupported enctype %x\n", kctx->kc_enctype); - return -1; - } + if (kctx->kc_enctype >= MAX_ENCTYPES || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } ke = &enctypes[kctx->kc_enctype]; @@ -196,13 +197,8 @@ __u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) goto out_err; - /* end time. While kc_endtime might be 64 bit the krb5 API - * still uses 32 bits. To delay the 2038 bug see the incoming - * value as a u32 which give us until 2106. See the link for details: - * - * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html - */ - if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) + /* end time */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) goto out_err; /* seq send */ @@ -266,13 +262,8 @@ __u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) { unsigned int tmp_uint, keysize; - /* end time. While kc_endtime might be 64 bit the krb5 API - * still uses 32 bits. To delay the 2038 bug see the incoming - * value as a u32 which give us until 2106. See the link for details: - * - * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html - */ - if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) + /* end time */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) goto out_err; /* flags */ @@ -420,11 +411,11 @@ __u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, static __u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, - time64_t *endtime) + unsigned long *endtime) { struct krb5_ctx *kctx = gctx->internal_ctx_id; - *endtime = kctx->kc_endtime; + *endtime = (unsigned long)((__u32) kctx->kc_endtime); return GSS_S_COMPLETE; } @@ -447,66 +438,41 @@ __s32 krb5_make_checksum(__u32 enctype, struct krb5_header *khdr, int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum, - digest_hash hash_func) + rawobj_t *cksum) { - struct krb5_enctype *ke = &enctypes[enctype]; - struct ahash_request *req = NULL; - enum cfs_crypto_hash_alg hash_algo; - rawobj_t hdr; - int rc; - - hash_algo = cfs_crypto_hash_alg(ke->ke_hash_name); - - /* For the cbc(des) case we want md5 instead of hmac(md5) */ - if (strcmp(ke->ke_enc_name, "cbc(des)")) - req = cfs_crypto_hash_init(hash_algo, kb->kb_key.data, - kb->kb_key.len); - else - req = cfs_crypto_hash_init(hash_algo, NULL, 0); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - CERROR("failed to alloc hash %s : rc = %d\n", - ke->ke_hash_name, rc); - goto out_no_hash; - } + struct krb5_enctype *ke = &enctypes[enctype]; + struct crypto_hash *tfm; + rawobj_t hdr; + __u32 code = GSS_S_FAILURE; + int rc; + + if (!(tfm = crypto_alloc_hash(ke->ke_hash_name, 0, 0))) { + CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name); + return GSS_S_FAILURE; + } - cksum->len = cfs_crypto_hash_digestsize(hash_algo); - OBD_ALLOC_LARGE(cksum->data, cksum->len); - if (!cksum->data) { - cksum->len = 0; - rc = -ENOMEM; - goto out_free_hash; - } + 
cksum->len = crypto_hash_digestsize(tfm); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + goto out_tfm; + } hdr.data = (__u8 *)khdr; hdr.len = sizeof(*khdr); - if (!hash_func) { - rc = -EPROTO; - CERROR("hash function for %s undefined\n", - ke->ke_hash_name); - goto out_free_hash; - } - rc = hash_func(req, &hdr, msgcnt, msgs, iovcnt, iovs); - if (rc) - goto out_free_hash; - - if (!ke->ke_hash_hmac) { - LASSERT(kb->kb_tfm); - - cfs_crypto_hash_final(req, cksum->data, &cksum->len); - rc = gss_crypt_generic(kb->kb_tfm, 0, NULL, - cksum->data, cksum->data, - cksum->len); - goto out_no_hash; - } + if (ke->ke_hash_hmac) + rc = gss_digest_hmac(tfm, &kb->kb_key, + &hdr, msgcnt, msgs, iovcnt, iovs, cksum); + else + rc = gss_digest_norm(tfm, kb, + &hdr, msgcnt, msgs, iovcnt, iovs, cksum); -out_free_hash: - if (req) - cfs_crypto_hash_final(req, cksum->data, &cksum->len); -out_no_hash: - return rc ? GSS_S_FAILURE : GSS_S_COMPLETE; + if (rc == 0) + code = GSS_S_COMPLETE; +out_tfm: + crypto_free_hash(tfm); + return code; } static void fill_krb5_header(struct krb5_ctx *kctx, @@ -579,118 +545,118 @@ static __u32 verify_krb5_header(struct krb5_ctx *kctx, static __u32 gss_get_mic_kerberos(struct gss_ctx *gctx, - int msgcnt, - rawobj_t *msgs, - int iovcnt, - lnet_kiov_t *iovs, - rawobj_t *token) + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - rawobj_t cksum = RAWOBJ_EMPTY; - u32 major; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 0); + fill_krb5_header(kctx, khdr, 0); - /* checksum */ - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, khdr, - msgcnt, msgs, iovcnt, iovs, &cksum, - gctx->hash_func)) - GOTO(out_free_cksum, major = GSS_S_FAILURE); + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) + return GSS_S_FAILURE; - LASSERT(cksum.len >= ke->ke_hash_size); - LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); - memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - - token->len = sizeof(*khdr) + ke->ke_hash_size; - major = GSS_S_COMPLETE; -out_free_cksum: - rawobj_free(&cksum); - return major; + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + rawobj_free(&cksum); + return GSS_S_COMPLETE; } static __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, - int msgcnt, - rawobj_t *msgs, - int iovcnt, - lnet_kiov_t *iovs, - rawobj_t *token) + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - rawobj_t cksum = RAWOBJ_EMPTY; - u32 major; - - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype 
*ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + __u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 0); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - goto out; - } + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - if (token->len < sizeof(*khdr) + ke->ke_hash_size) { - CERROR("short signature: %u, require %d\n", - token->len, (int) sizeof(*khdr) + ke->ke_hash_size); - GOTO(out, major = GSS_S_FAILURE); - } + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + return GSS_S_FAILURE; + } + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) { + CERROR("failed to make checksum\n"); + return GSS_S_FAILURE; + } - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, iovcnt, iovs, &cksum, - gctx->hash_func)) - GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } - LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - GOTO(out_free_cksum, major = GSS_S_BAD_SIG); - } - major = GSS_S_COMPLETE; -out_free_cksum: - rawobj_free(&cksum); -out: - return major; + rawobj_free(&cksum); + return GSS_S_COMPLETE; } /* * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. 
*/ static -int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, - struct krb5_header *khdr, - char *confounder, - struct ptlrpc_bulk_desc *desc, - rawobj_t *cipher, - int adj_nob) +int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) { - __u8 local_iv[16] = {0}; - struct scatterlist src, dst; - struct sg_table sg_src, sg_dst; - int blocksize, i, rc, nob = 0; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_iov_count); LASSERT(GET_ENC_KIOV(desc)); - blocksize = crypto_sync_skcipher_blocksize(tfm); - LASSERT(blocksize > 1); - LASSERT(cipher->len == blocksize + sizeof(*khdr)); + blocksize = crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; - /* encrypt confounder */ + /* encrypt confounder */ rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); if (rc != 0) return rc; @@ -700,24 +666,20 @@ int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, gss_teardown_sgtable(&sg_src); return rc; } - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, - blocksize, local_iv); - rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, + sg_src.sgl, blocksize); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); - if (rc) { - CERROR("error to encrypt confounder: %d\n", rc); - skcipher_request_zero(req); - return rc; - } + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + return rc; + } - /* encrypt clear pages */ - for (i = 0; i < desc->bd_iov_count; i++) { + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { sg_init_table(&src, 1); sg_set_page(&src, BD_GET_KIOV(desc, i).kiov_page, (BD_GET_KIOV(desc, i).kiov_len + @@ -733,36 +695,28 @@ int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, BD_GET_ENC_KIOV(desc, i).kiov_offset = dst.offset; BD_GET_ENC_KIOV(desc, i).kiov_len = dst.length; - skcipher_request_set_crypt(req, &src, &dst, - src.length, local_iv); - rc = crypto_skcipher_encrypt_iv(req, &dst, &src, src.length); - if (rc) { - CERROR("error to encrypt page: %d\n", rc); - skcipher_request_zero(req); - return rc; - } - } + rc = crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + return rc; + } + } - /* encrypt krb5 header */ + /* encrypt krb5 header */ rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); - if (rc != 0) { - skcipher_request_zero(req); + if (rc != 0) return rc; - } rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, sizeof(*khdr)); if (rc != 0) { gss_teardown_sgtable(&sg_src); - skcipher_request_zero(req); return rc; } - skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, - sizeof(*khdr), local_iv); - rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, - sizeof(*khdr)); - skcipher_request_zero(req); + rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); @@ -797,35 
+751,39 @@ int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, * should have been done by prep_bulk(). */ static -int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, - struct krb5_header *khdr, - struct ptlrpc_bulk_desc *desc, - rawobj_t *cipher, - rawobj_t *plain, - int adj_nob) +int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) { - __u8 local_iv[16] = {0}; - struct scatterlist src, dst; - struct sg_table sg_src, sg_dst; - int ct_nob = 0, pt_nob = 0; - int blocksize, i, rc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_iov_count); LASSERT(GET_ENC_KIOV(desc)); - LASSERT(desc->bd_nob_transferred); + LASSERT(desc->bd_nob_transferred); - blocksize = crypto_sync_skcipher_blocksize(tfm); - LASSERT(blocksize > 1); - LASSERT(cipher->len == blocksize + sizeof(*khdr)); + blocksize = crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); - if (desc->bd_nob_transferred % blocksize) { - CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); - return -EPROTO; - } + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } - /* decrypt head (confounder) */ + /* decrypt head (confounder) */ rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); if (rc != 0) return rc; @@ -836,31 +794,27 @@ int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, return rc; } - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, - blocksize, local_iv); - - rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); + rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, + sg_src.sgl, blocksize); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); - if (rc) { - CERROR("error to decrypt confounder: %d\n", rc); - skcipher_request_zero(req); - return rc; - } + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + return rc; + } for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; i++) { - if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize != 0 || - BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize != 0) { + if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize + != 0 || + BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize + != 0) { CERROR("page %d: odd offset %u len %u, blocksize %d\n", i, BD_GET_ENC_KIOV(desc, i).kiov_offset, BD_GET_ENC_KIOV(desc, i).kiov_len, blocksize); - skcipher_request_zero(req); return -EFAULT; } @@ -897,14 +851,12 @@ int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, sg_assign_page(&dst, BD_GET_KIOV(desc, i).kiov_page); - skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, - src.length, local_iv); - rc = crypto_skcipher_decrypt_iv(req, &dst, &src, src.length); - if (rc) { - CERROR("error to decrypt page: %d\n", rc); - skcipher_request_zero(req); - return rc; - } + rc = crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + return rc; + } if (BD_GET_KIOV(desc, 
i).kiov_len % blocksize != 0) { memcpy(page_address(BD_GET_KIOV(desc, i).kiov_page) + @@ -919,26 +871,24 @@ int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, pt_nob += BD_GET_KIOV(desc, i).kiov_len; } - if (unlikely(ct_nob != desc->bd_nob_transferred)) { - CERROR("%d cipher text transferred but only %d decrypted\n", - desc->bd_nob_transferred, ct_nob); - skcipher_request_zero(req); - return -EFAULT; - } + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + return -EFAULT; + } - if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { - CERROR("%d plain text expected but only %d received\n", - desc->bd_nob, pt_nob); - skcipher_request_zero(req); - return -EFAULT; - } + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + return -EFAULT; + } /* if needed, clear up the rest unused iovs */ if (adj_nob) while (i < desc->bd_iov_count) BD_GET_KIOV(desc, i++).kiov_len = 0; - /* decrypt tail (krb5 header) */ + /* decrypt tail (krb5 header) */ rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, sizeof(*khdr)); if (rc != 0) @@ -951,170 +901,166 @@ int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, return rc; } - skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, - src.length, local_iv); - rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, - sizeof(*khdr)); + rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + gss_teardown_sgtable(&sg_src); gss_teardown_sgtable(&sg_dst); - skcipher_request_zero(req); - if (rc) { - CERROR("error to decrypt tail: %d\n", rc); - return rc; - } + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } - if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { - CERROR("krb5 header doesn't match\n"); - return -EACCES; - } + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } - return 0; + return 0; } static __u32 gss_wrap_kerberos(struct gss_ctx *gctx, - rawobj_t *gsshdr, - rawobj_t *msg, - int msg_buflen, - rawobj_t *token) + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[3], cipher; - __u8 conf[GSS_MAX_CIPHER_BLOCK]; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; __u8 local_iv[16] = {0}; - u32 major; - int rc = 0; - - LASSERT(ke); - LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); - LASSERT(kctx->kc_keye.kb_tfm == NULL || - ke->ke_conf_size >= - crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm)); - - /* - * final token format: - * --------------------------------------------------- - * | krb5 header | cipher text | checksum (16 bytes) | - * --------------------------------------------------- - */ - - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * 
--------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 1); + fill_krb5_header(kctx, khdr, 1); - /* generate confounder */ - cfs_get_random_bytes(conf, ke->ke_conf_size); + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); - /* get encryption blocksize. note kc_keye might not associated with - * a tfm, currently only for arcfour-hmac */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_sync_skcipher_blocksize( - kctx->kc_keye.kb_tfm); - } - LASSERT(blocksize <= ke->ke_conf_size); + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); /* padding the message */ if (gss_add_padding(msg, msg_buflen, blocksize)) return GSS_S_FAILURE; - /* - * clear text layout for checksum: - * ------------------------------------------------------ - * | confounder | gss header | clear msgs | krb5 header | - * ------------------------------------------------------ - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; - data_desc[1].data = gsshdr->data; - data_desc[1].len = gsshdr->len; - data_desc[2].data = msg->data; - data_desc[2].len = msg->len; - - /* compute checksum */ - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, data_desc, 0, NULL, &cksum, - gctx->hash_func)) - GOTO(out_free_cksum, major = GSS_S_FAILURE); - LASSERT(cksum.len >= ke->ke_hash_size); - - /* - * clear text layout for encryption: - * ----------------------------------------- - * | confounder | clear msgs | krb5 header | - * ----------------------------------------- - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; - data_desc[1].data = msg->data; - data_desc[1].len = msg->len; - data_desc[2].data = (__u8 *) khdr; - data_desc[2].len = sizeof(*khdr); - - /* cipher text will be directly inplace */ + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ cipher.data = (__u8 *)(khdr + 1); - 
cipher.len = token->len - sizeof(*khdr); - LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - rawobj_t arc4_keye = RAWOBJ_EMPTY; - struct crypto_sync_skcipher *arc4_tfm; + rawobj_t arc4_keye; + struct crypto_blkcipher *arc4_tfm; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, 0, NULL, &arc4_keye, - gctx->hash_func)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { CERROR("failed to obtain arc4 enc key\n"); - GOTO(arc4_out_key, rc = -EACCES); + GOTO(arc4_out, rc = -EACCES); } - arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); if (IS_ERR(arc4_tfm)) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); GOTO(arc4_out_key, rc = -EACCES); } - if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, - arc4_keye.len)) { - CERROR("failed to set arc4 key, len %d\n", - arc4_keye.len); - GOTO(arc4_out_tfm, rc = -EACCES); - } + if (crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, &cipher, 1); arc4_out_tfm: - crypto_free_sync_skcipher(arc4_tfm); + crypto_free_blkcipher(arc4_tfm); arc4_out_key: - rawobj_free(&arc4_keye); - } else { + rawobj_free(&arc4_keye); +arc4_out: + do {} while(0); /* just to avoid compile warning */ + } else { rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, data_desc, &cipher, 1); - } + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); - if (rc) - GOTO(out_free_cksum, major = GSS_S_FAILURE); - - /* fill in checksum */ - LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); - memcpy((char *)(khdr + 1) + cipher.len, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - - /* final token length */ - token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; - major = GSS_S_COMPLETE; -out_free_cksum: - rawobj_free(&cksum); - return major; + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; } static @@ -1129,7 +1075,7 @@ __u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, LASSERT(GET_ENC_KIOV(desc)); LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); for (i = 0; i < desc->bd_iov_count; i++) { LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page); @@ -1155,377 +1101,375 @@ __u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, static __u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - rawobj_t *token, int adj_nob) + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksz; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[1], cipher; - __u8 conf[GSS_MAX_CIPHER_BLOCK]; - int rc = 0; - u32 major; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct 
krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(ke); - LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); - - /* - * final token format: - * -------------------------------------------------- - * | krb5 header | head/tail cipher text | checksum | - * -------------------------------------------------- - */ - - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 1); + fill_krb5_header(kctx, khdr, 1); - /* generate confounder */ - cfs_get_random_bytes(conf, ke->ke_conf_size); + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); - /* get encryption blocksize. note kc_keye might not associated with - * a tfm, currently only for arcfour-hmac */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksz = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); - } + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } - /* - * we assume the size of krb5_header (16 bytes) must be n * blocksize. - * the bulk token size would be exactly (sizeof(krb5_header) + - * blocksize + sizeof(krb5_header) + hashsize) - */ - LASSERT(blocksz <= ke->ke_conf_size); - LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); - LASSERT(token->len >= sizeof(*khdr) + blocksz + sizeof(*khdr) + 16); - - /* - * clear text layout for checksum: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. 
+ * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksize <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; /* compute checksum */ if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, khdr, 1, data_desc, desc->bd_iov_count, GET_KIOV(desc), - &cksum, gctx->hash_func)) - GOTO(out_free_cksum, major = GSS_S_FAILURE); + &cksum)) + return GSS_S_FAILURE; LASSERT(cksum.len >= ke->ke_hash_size); - /* - * clear text layout for encryption: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - * | | | - * ---------- (cipher pages) | - * result token: | | - * ------------------------------------------- - * | krb5 header | cipher text | cipher text | - * ------------------------------------------- - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; cipher.data = (__u8 *)(khdr + 1); - cipher.len = blocksz + sizeof(*khdr); + cipher.len = blocksize + sizeof(*khdr); - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LBUG(); - rc = 0; - } else { - rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, - conf, desc, &cipher, adj_nob); - } - if (rc) - GOTO(out_free_cksum, major = GSS_S_FAILURE); - - /* fill in checksum */ - LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); - memcpy((char *)(khdr + 1) + cipher.len, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - - /* final token length */ - token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; - major = GSS_S_COMPLETE; -out_free_cksum: - rawobj_free(&cksum); - return major; + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; } static __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, - rawobj_t *gsshdr, - rawobj_t *token, - rawobj_t *msg) + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - unsigned char *tmpbuf; - int blocksz, bodysize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t cipher_in, plain_out; - rawobj_t hash_objs[3]; - int rc = 0; - __u32 
major; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksize, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; __u8 local_iv[16] = {0}; - LASSERT(ke); + LASSERT(ke); - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 1); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - /* block size */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksz = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); - } + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } - /* expected token layout: - * ---------------------------------------- - * | krb5 header | cipher text | checksum | - * ---------------------------------------- - */ - bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; - if (bodysize % blocksz) { - CERROR("odd bodysize %d\n", bodysize); - return GSS_S_DEFECTIVE_TOKEN; - } + if (bodysize % blocksize) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } - if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { - CERROR("incomplete token: bodysize %d\n", bodysize); - return GSS_S_DEFECTIVE_TOKEN; - } + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } - if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { - CERROR("buffer too small: %u, require %d\n", - msg->len, bodysize - ke->ke_conf_size); - return GSS_S_FAILURE; - } + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } - /* decrypting */ - OBD_ALLOC_LARGE(tmpbuf, bodysize); - if (!tmpbuf) - return GSS_S_FAILURE; + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; - major = GSS_S_FAILURE; + major = GSS_S_FAILURE; cipher_in.data = (__u8 *)(khdr + 1); - cipher_in.len = bodysize; - plain_out.data = tmpbuf; - plain_out.len = bodysize; + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { rawobj_t arc4_keye; - struct crypto_sync_skcipher *arc4_tfm; + struct crypto_blkcipher *arc4_tfm; cksum.data = token->data + token->len - ke->ke_hash_size; cksum.len = ke->ke_hash_size; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, 0, NULL, &arc4_keye, - gctx->hash_func)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { 
CERROR("failed to obtain arc4 enc key\n"); GOTO(arc4_out, rc = -EACCES); } - arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); + arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); if (IS_ERR(arc4_tfm)) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); GOTO(arc4_out_key, rc = -EACCES); } - if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, - arc4_keye.len)) { - CERROR("failed to set arc4 key, len %d\n", - arc4_keye.len); - GOTO(arc4_out_tfm, rc = -EACCES); - } + if (crypto_blkcipher_setkey(arc4_tfm, + arc4_keye.data, arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, &plain_out, 0); arc4_out_tfm: - crypto_free_sync_skcipher(arc4_tfm); + crypto_free_blkcipher(arc4_tfm); arc4_out_key: - rawobj_free(&arc4_keye); + rawobj_free(&arc4_keye); arc4_out: - cksum = RAWOBJ_EMPTY; - } else { + cksum = RAWOBJ_EMPTY; + } else { rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, &cipher_in, &plain_out, 0); - } - - if (rc != 0) { - CERROR("error decrypt\n"); - goto out_free; - } - LASSERT(plain_out.len == bodysize); - - /* expected clear text layout: - * ----------------------------------------- - * | confounder | clear msgs | krb5 header | - * ----------------------------------------- - */ - - /* verify krb5 header in token is not modified */ - if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), - sizeof(*khdr))) { - CERROR("decrypted krb5 header mismatch\n"); - goto out_free; - } + } - /* verify checksum, compose clear text as layout: - * ------------------------------------------------------ - * | confounder | gss header | clear msgs | krb5 header | - * ------------------------------------------------------ - */ - hash_objs[0].len = ke->ke_conf_size; - hash_objs[0].data = plain_out.data; - hash_objs[1].len = gsshdr->len; - hash_objs[1].data = gsshdr->data; - hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); - hash_objs[2].data = plain_out.data + ke->ke_conf_size; - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, hash_objs, 0, NULL, &cksum, - gctx->hash_func)) - goto out_free; + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); + + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } - LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp((char *)(khdr + 1) + bodysize, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - goto out_free; - } + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum)) + goto out_free; + + LASSERT(cksum.len >= 
ke->ke_hash_size); + if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } - msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); - memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); - major = GSS_S_COMPLETE; + major = GSS_S_COMPLETE; out_free: - OBD_FREE_LARGE(tmpbuf, bodysize); - rawobj_free(&cksum); - return major; + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; } static __u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - rawobj_t *token, int adj_nob) + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksz; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t cipher, plain; - rawobj_t data_desc[1]; - int rc; - __u32 major; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(ke); + LASSERT(ke); - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 1); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - /* block size */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksz = 1; - LBUG(); - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); - } - LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } - /* - * token format is expected as: - * ----------------------------------------------- - * | krb5 header | head/tail cipher text | cksum | - * ----------------------------------------------- - */ - if (token->len < sizeof(*khdr) + blocksz + sizeof(*khdr) + - ke->ke_hash_size) { - CERROR("short token size: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; - cipher.data = (__u8 *) (khdr + 1); - cipher.len = blocksz + sizeof(*khdr); - 
plain.data = cipher.data; - plain.len = cipher.len; - - rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, - desc, &cipher, &plain, adj_nob); - if (rc) - return GSS_S_DEFECTIVE_TOKEN; - - /* - * verify checksum, compose clear text as layout: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - */ - data_desc[0].data = plain.data; - data_desc[0].len = blocksz; + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksize; if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, khdr, 1, data_desc, desc->bd_iov_count, GET_KIOV(desc), - &cksum, gctx->hash_func)) + &cksum)) return GSS_S_FAILURE; LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp(plain.data + blocksz + sizeof(*khdr), - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - rawobj_free(&cksum); - return GSS_S_BAD_SIG; - } + if (memcmp(plain.data + blocksize + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } - rawobj_free(&cksum); - return GSS_S_COMPLETE; + rawobj_free(&cksum); + return GSS_S_COMPLETE; } int gss_display_kerberos(struct gss_ctx *ctx, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c index 3ee125f1070bf..be66ffde266d4 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,6 @@ #include "gss_err.h" #include "gss_internal.h" #include "gss_api.h" -#include "gss_crypto.h" static struct list_head registered_mechs = LIST_HEAD_INIT(registered_mechs); static DEFINE_SPINLOCK(registered_mechs_lock); @@ -69,7 +69,7 @@ int lgss_mech_register(struct gss_api_mech *gm) spin_lock(®istered_mechs_lock); list_add(&gm->gm_list, ®istered_mechs); spin_unlock(®istered_mechs_lock); - CDEBUG(D_SEC, "register %s mechanism\n", gm->gm_name); + CWARN("Register %s mechanism\n", gm->gm_name); return 0; } @@ -78,7 +78,7 @@ void lgss_mech_unregister(struct gss_api_mech *gm) spin_lock(®istered_mechs_lock); list_del(&gm->gm_list); spin_unlock(®istered_mechs_lock); - CDEBUG(D_SEC, "Unregister %s mechanism\n", gm->gm_name); + CWARN("Unregister %s mechanism\n", gm->gm_name); } @@ -148,52 +148,50 @@ __u32 lgss_import_sec_context(rawobj_t *input_token, struct gss_api_mech *mech, struct gss_ctx **ctx_id) { - OBD_ALLOC_PTR(*ctx_id); - if (*ctx_id == NULL) - return GSS_S_FAILURE; + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; - (*ctx_id)->mech_type = lgss_mech_get(mech); - (*ctx_id)->hash_func = gss_digest_hash; + (*ctx_id)->mech_type = lgss_mech_get(mech); - LASSERT(mech); - LASSERT(mech->gm_ops); - LASSERT(mech->gm_ops->gss_import_sec_context); - return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); } __u32 
lgss_copy_reverse_context(struct gss_ctx *ctx_id, - struct gss_ctx **ctx_id_new) + struct gss_ctx **ctx_id_new) { - struct gss_api_mech *mech = ctx_id->mech_type; - __u32 major; + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; - LASSERT(mech); + LASSERT(mech); - OBD_ALLOC_PTR(*ctx_id_new); - if (*ctx_id_new == NULL) - return GSS_S_FAILURE; + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; - (*ctx_id_new)->mech_type = lgss_mech_get(mech); - (*ctx_id_new)->hash_func = ctx_id->hash_func; + (*ctx_id_new)->mech_type = lgss_mech_get(mech); - LASSERT(mech); - LASSERT(mech->gm_ops); - LASSERT(mech->gm_ops->gss_copy_reverse_context); + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); - major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); - if (major != GSS_S_COMPLETE) { - lgss_mech_put(mech); - OBD_FREE_PTR(*ctx_id_new); - *ctx_id_new = NULL; - } - return major; + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; } /* * this interface is much simplified, currently we only need endtime. */ __u32 lgss_inquire_context(struct gss_ctx *context_handle, - time64_t *endtime) + unsigned long *endtime) { LASSERT(context_handle); LASSERT(context_handle->mech_type); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c index 1e946f8ba2aff..fddd3ed3443c1 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -92,10 +92,10 @@ __u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, static __u32 gss_inquire_context_null(struct gss_ctx *gss_context, - time64_t *endtime) + unsigned long *endtime) { /* quick timeout for testing purposes */ - *endtime = ktime_get_real_seconds() + 60; + *endtime = cfs_time_current_sec() + 60; return GSS_S_COMPLETE; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c index 5e1e7caa1aae6..016d455040972 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -62,7 +62,7 @@ struct rpc_clnt; /* for rpc_pipefs */ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c index 69e92bcb28311..fd1b071d6f549 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "gss_err.h" #include "gss_crypto.h" @@ -61,14 +62,14 @@ #define SK_IV_REV_START (1ULL << 63) struct sk_ctx { - enum cfs_crypto_crypt_alg sc_crypt; - enum cfs_crypto_hash_alg sc_hmac; - __u32 sc_expire; - __u32 sc_host_random; - __u32 sc_peer_random; - atomic64_t sc_iv; - rawobj_t sc_hmac_key; - struct gss_keyblock sc_session_kb; + __u16 sc_hmac; + __u16 sc_crypt; + __u32 sc_expire; + __u32 sc_host_random; + __u32 sc_peer_random; + atomic64_t sc_iv; + rawobj_t sc_hmac_key; + struct gss_keyblock sc_session_kb; }; struct sk_hdr { @@ -87,6 +88,24 @@ struct sk_wire { rawobj_t skw_hmac; }; +static struct sk_crypt_type sk_crypt_types[] = { + [SK_CRYPT_AES256_CTR] = { + .sct_name = 
"ctr(aes)", + .sct_bytes = 32, + }, +}; + +static struct sk_hmac_type sk_hmac_types[] = { + [SK_HMAC_SHA256] = { + .sht_name = "hmac(sha256)", + .sht_bytes = 32, + }, + [SK_HMAC_SHA512] = { + .sht_name = "hmac(sha512)", + .sht_bytes = 64, + }, +}; + static inline unsigned long sk_block_mask(unsigned long len, int blocksize) { return (len + blocksize - 1) & (~(blocksize - 1)); @@ -129,18 +148,22 @@ void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv) memcpy(iv, &ctr, sizeof(ctr)); } +static int sk_init_keys(struct sk_ctx *skc) +{ + return gss_keyblock_init(&skc->sc_session_kb, + sk_crypt_types[skc->sc_crypt].sct_name, 0); +} + static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) { char *ptr = inbuf->data; char *end = inbuf->data + inbuf->len; - char sk_hmac[CRYPTO_MAX_ALG_NAME]; - char sk_crypt[CRYPTO_MAX_ALG_NAME]; - u32 tmp; + __u32 tmp; /* see sk_serialize_kctx() for format from userspace side */ /* 1. Version */ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { - CERROR("Failed to read shared key interface version\n"); + CERROR("Failed to read shared key interface version"); return -1; } if (tmp != SK_INTERFACE_VERSION) { @@ -149,55 +172,49 @@ static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) } /* 2. HMAC type */ - if (gss_get_bytes(&ptr, end, &sk_hmac, sizeof(sk_hmac))) { - CERROR("Failed to read HMAC algorithm type\n"); + if (gss_get_bytes(&ptr, end, &skc->sc_hmac, sizeof(skc->sc_hmac))) { + CERROR("Failed to read HMAC algorithm type"); return -1; } - - skc->sc_hmac = cfs_crypto_hash_alg(sk_hmac); - if (skc->sc_hmac != CFS_HASH_ALG_NULL && - skc->sc_hmac != CFS_HASH_ALG_SHA256 && - skc->sc_hmac != CFS_HASH_ALG_SHA512) { - CERROR("Invalid hmac type: %s\n", sk_hmac); + if (skc->sc_hmac <= SK_HMAC_EMPTY || skc->sc_hmac >= SK_HMAC_MAX) { + CERROR("Invalid hmac type: %d\n", skc->sc_hmac); return -1; } /* 3. crypt type */ - if (gss_get_bytes(&ptr, end, &sk_crypt, sizeof(sk_crypt))) { - CERROR("Failed to read crypt algorithm type\n"); + if (gss_get_bytes(&ptr, end, &skc->sc_crypt, sizeof(skc->sc_crypt))) { + CERROR("Failed to read crypt algorithm type"); return -1; } - - skc->sc_crypt = cfs_crypto_crypt_alg(sk_crypt); - if (skc->sc_crypt == CFS_CRYPT_ALG_UNKNOWN) { - CERROR("Invalid crypt type: %s\n", sk_crypt); + if (skc->sc_crypt <= SK_CRYPT_EMPTY || skc->sc_crypt >= SK_CRYPT_MAX) { + CERROR("Invalid crypt type: %d\n", skc->sc_crypt); return -1; } /* 4. expiration time */ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { - CERROR("Failed to read context expiration time\n"); + CERROR("Failed to read context expiration time"); return -1; } - skc->sc_expire = tmp + ktime_get_real_seconds(); + skc->sc_expire = tmp + cfs_time_current_sec(); /* 5. host random is used as nonce for encryption */ if (gss_get_bytes(&ptr, end, &skc->sc_host_random, sizeof(skc->sc_host_random))) { - CERROR("Failed to read host random\n"); + CERROR("Failed to read host random "); return -1; } /* 6. peer random is used as nonce for decryption */ if (gss_get_bytes(&ptr, end, &skc->sc_peer_random, sizeof(skc->sc_peer_random))) { - CERROR("Failed to read peer random\n"); + CERROR("Failed to read peer random "); return -1; } /* 7. HMAC key */ if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) { - CERROR("Failed to read HMAC key\n"); + CERROR("Failed to read HMAC key"); return -1; } if (skc->sc_hmac_key.len <= SK_MIN_SIZE) { @@ -208,7 +225,7 @@ static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) /* 8. 
Session key, can be empty if not using privacy mode */ if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) { - CERROR("Failed to read session key\n"); + CERROR("Failed to read session key"); return -1; } @@ -246,14 +263,13 @@ __u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context) /* Only privacy mode needs to initialize keys */ if (skc->sc_session_kb.kb_key.len > 0) { privacy = true; - if (gss_keyblock_init(&skc->sc_session_kb, - cfs_crypto_crypt_name(skc->sc_crypt), 0)) + if (sk_init_keys(skc)) goto out_err; } gss_context->internal_ctx_id = skc; CDEBUG(D_SEC, "successfully imported sk%s context\n", - privacy ? " (with privacy)" : ""); + privacy ? "pi" : "i"); return GSS_S_COMPLETE; @@ -288,9 +304,7 @@ __u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, /* Only privacy mode needs to initialize keys */ if (skc_new->sc_session_kb.kb_key.len > 0) - if (gss_keyblock_init(&skc_new->sc_session_kb, - cfs_crypto_crypt_name(skc_new->sc_crypt), - 0)) + if (sk_init_keys(skc_new)) goto out_err; gss_context_new->internal_ctx_id = skc_new; @@ -305,7 +319,7 @@ __u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, static __u32 gss_inquire_context_sk(struct gss_ctx *gss_context, - time64_t *endtime) + unsigned long *endtime) { struct sk_ctx *skc = gss_context->internal_ctx_id; @@ -314,32 +328,24 @@ __u32 gss_inquire_context_sk(struct gss_ctx *gss_context, } static -u32 sk_make_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, int msg_count, - rawobj_t *msgs, int iov_count, lnet_kiov_t *iovs, - rawobj_t *token, digest_hash hash_func) +__u32 sk_make_hmac(char *alg_name, rawobj_t *key, int msg_count, rawobj_t *msgs, + int iov_count, lnet_kiov_t *iovs, rawobj_t *token) { - struct ahash_request *req; - int rc2, rc; - - req = cfs_crypto_hash_init(algo, key->data, key->len); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - goto out_init_failed; - } + struct crypto_hash *tfm; + int rc; + tfm = crypto_alloc_hash(alg_name, 0, 0); + if (IS_ERR(tfm)) + return GSS_S_FAILURE; - if (hash_func) - rc2 = hash_func(req, NULL, msg_count, msgs, iov_count, - iovs); - else - rc2 = gss_digest_hash(req, NULL, msg_count, msgs, iov_count, - iovs); + rc = GSS_S_FAILURE; + LASSERT(token->len >= crypto_hash_digestsize(tfm)); + if (!gss_digest_hmac(tfm, key, NULL, msg_count, msgs, iov_count, iovs, + token)) + rc = GSS_S_COMPLETE; - rc = cfs_crypto_hash_final(req, token->data, &token->len); - if (!rc && rc2) - rc = rc2; -out_init_failed: - return rc ? 
GSS_S_FAILURE : GSS_S_COMPLETE; + crypto_free_hash(tfm); + return rc; } static @@ -351,22 +357,20 @@ __u32 gss_get_mic_sk(struct gss_ctx *gss_context, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - - return sk_make_hmac(skc->sc_hmac, + return sk_make_hmac(sk_hmac_types[skc->sc_hmac].sht_name, &skc->sc_hmac_key, message_count, messages, - iov_count, iovs, token, gss_context->hash_func); + iov_count, iovs, token); } static -u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, - int message_count, rawobj_t *messages, - int iov_count, lnet_kiov_t *iovs, - rawobj_t *token, digest_hash hash_func) +__u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, + rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token) { rawobj_t checksum = RAWOBJ_EMPTY; __u32 rc = GSS_S_FAILURE; - checksum.len = cfs_crypto_hash_digestsize(algo); + checksum.len = sht->sht_bytes; if (token->len < checksum.len) { CDEBUG(D_SEC, "Token received too short, expected %d " "received %d\n", token->len, checksum.len); @@ -377,9 +381,8 @@ u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, if (!checksum.data) return rc; - if (sk_make_hmac(algo, key, message_count, - messages, iov_count, iovs, &checksum, - hash_func)) { + if (sk_make_hmac(sht->sht_name, key, message_count, messages, + iov_count, iovs, &checksum)) { CDEBUG(D_SEC, "Failed to create checksum to validate\n"); goto cleanup; } @@ -402,19 +405,23 @@ u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, * to decrypt up to the number of bytes actually specified from the sender * (bd_nob) otherwise the calulated HMAC will be incorrect. */ static -u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, - int msgcnt, rawobj_t *msgs, int iovcnt, - lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) +__u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) { rawobj_t checksum = RAWOBJ_EMPTY; - struct ahash_request *req; + struct crypto_hash *tfm; + struct hash_desc desc = { + .tfm = NULL, + .flags = 0, + }; struct scatterlist sg[1]; - int rc = 0; struct sg_table sgt; int bytes; int i; + int rc = GSS_S_FAILURE; - checksum.len = cfs_crypto_hash_digestsize(sc_hmac); + checksum.len = sht->sht_bytes; if (token->len < checksum.len) { CDEBUG(D_SEC, "Token received too short, expected %d " "received %d\n", token->len, checksum.len); @@ -423,24 +430,33 @@ u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, OBD_ALLOC_LARGE(checksum.data, checksum.len); if (!checksum.data) - return GSS_S_FAILURE; + return rc; - req = cfs_crypto_hash_init(sc_hmac, key->data, key->len); - if (IS_ERR(req)) { - rc = GSS_S_FAILURE; + tfm = crypto_alloc_hash(sht->sht_name, 0, 0); + if (IS_ERR(tfm)) goto cleanup; - } + + desc.tfm = tfm; + + LASSERT(token->len >= crypto_hash_digestsize(tfm)); + + rc = crypto_hash_setkey(tfm, key->data, key->len); + if (rc) + goto hash_cleanup; + + rc = crypto_hash_init(&desc); + if (rc) + goto hash_cleanup; for (i = 0; i < msgcnt; i++) { - if (!msgs[i].len) + if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); if (rc != 0) goto hash_cleanup; - ahash_request_set_crypt(req, sg, NULL, msgs[i].len); - rc = crypto_ahash_update(req); + rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) { gss_teardown_sgtable(&sgt); goto hash_cleanup; @@ -459,21 +475,22 @@ u32 sk_verify_bulk_hmac(enum 
cfs_crypto_hash_alg sc_hmac, rawobj_t *key, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, bytes, iovs[i].kiov_offset); - ahash_request_set_crypt(req, sg, NULL, bytes); - rc = crypto_ahash_update(req); + rc = crypto_hash_update(&desc, sg, bytes); if (rc) goto hash_cleanup; } -hash_cleanup: - cfs_crypto_hash_final(req, checksum.data, &checksum.len); - if (rc) - goto cleanup; + crypto_hash_final(&desc, checksum.data); - if (memcmp(token->data, checksum.data, checksum.len)) + if (memcmp(token->data, checksum.data, checksum.len)) { rc = GSS_S_BAD_SIG; - else - rc = GSS_S_COMPLETE; + goto hash_cleanup; + } + + rc = GSS_S_COMPLETE; + +hash_cleanup: + crypto_free_hash(tfm); cleanup: OBD_FREE_LARGE(checksum.data, checksum.len); @@ -490,10 +507,8 @@ __u32 gss_verify_mic_sk(struct gss_ctx *gss_context, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - - return sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, - message_count, messages, iov_count, iovs, token, - gss_context->hash_func); + return sk_verify_hmac(&sk_hmac_types[skc->sc_hmac], &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token); } static @@ -502,7 +517,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; struct sk_wire skw; struct sk_hdr skh; rawobj_t msgbufs[3]; @@ -511,7 +526,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, LASSERT(skc->sc_session_kb.kb_tfm); - blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); if (gss_add_padding(message, message_buffer_length, blocksize)) return GSS_S_FAILURE; @@ -526,7 +541,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, &skw.skw_cipher, 1)) return GSS_S_FAILURE; @@ -537,10 +552,9 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, msgbufs[2] = skw.skw_cipher; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht_bytes; - if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, - 3, msgbufs, 0, NULL, &skw.skw_hmac, - gss_context->hash_func)) + skw.skw_hmac.len = sht->sht_bytes; + if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 3, msgbufs, 0, + NULL, &skw.skw_hmac)) return GSS_S_FAILURE; token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; @@ -553,7 +567,7 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, rawobj_t *token, rawobj_t *message) { struct sk_ctx *skc = gss_context->internal_ctx_id; - size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; struct sk_wire skw; struct sk_hdr *skh; rawobj_t msgbufs[3]; @@ -563,17 +577,17 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, LASSERT(skc->sc_session_kb.kb_tfm); - if (token->len < sizeof(skh) + sht_bytes) + if (token->len < sizeof(skh) + sht->sht_bytes) return GSS_S_DEFECTIVE_TOKEN; skw.skw_header.data = token->data; skw.skw_header.len = 
sizeof(struct sk_hdr); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht_bytes; + skw.skw_hmac.len = sht->sht_bytes; - blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); if (skw.skw_cipher.len % blocksize != 0) return GSS_S_DEFECTIVE_TOKEN; @@ -586,8 +600,8 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, msgbufs[0] = skw.skw_header; msgbufs[1] = *gss_header; msgbufs[2] = skw.skw_cipher; - rc = sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, 3, msgbufs, - 0, NULL, &skw.skw_hmac, gss_context->hash_func); + rc = sk_verify_hmac(sht, &skc->sc_hmac_key, 3, msgbufs, 0, NULL, + &skw.skw_hmac); if (rc) return rc; @@ -609,7 +623,7 @@ __u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, int i; LASSERT(skc->sc_session_kb.kb_tfm); - blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); for (i = 0; i < desc->bd_iov_count; i++) { if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { @@ -627,26 +641,27 @@ __u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, return GSS_S_COMPLETE; } -static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, +static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, int adj_nob) { + struct blkcipher_desc cdesc = { + .tfm = tfm, + .info = iv, + .flags = 0, + }; struct scatterlist ptxt; struct scatterlist ctxt; int blocksize; int i; int rc; int nob = 0; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - blocksize = crypto_sync_skcipher_blocksize(tfm); + blocksize = crypto_blkcipher_blocksize(tfm); sg_init_table(&ptxt, 1); sg_init_table(&ctxt, 1); - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - for (i = 0; i < desc->bd_iov_count; i++) { sg_set_page(&ptxt, BD_GET_KIOV(desc, i).kiov_page, sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, @@ -660,15 +675,13 @@ static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, BD_GET_ENC_KIOV(desc, i).kiov_offset = ctxt.offset; BD_GET_ENC_KIOV(desc, i).kiov_len = ctxt.length; - skcipher_request_set_crypt(req, &ptxt, &ctxt, ptxt.length, iv); - rc = crypto_skcipher_encrypt_iv(req, &ctxt, &ptxt, ptxt.length); + rc = crypto_blkcipher_encrypt_iv(&cdesc, &ctxt, &ptxt, + ptxt.length); if (rc) { CERROR("failed to encrypt page: %d\n", rc); - skcipher_request_zero(req); return rc; } } - skcipher_request_zero(req); if (adj_nob) desc->bd_nob = nob; @@ -676,10 +689,15 @@ static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, return 0; } -static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, +static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, int adj_nob) { + struct blkcipher_desc cdesc = { + .tfm = tfm, + .info = iv, + .flags = 0, + }; struct scatterlist ptxt; struct scatterlist ctxt; int blocksize; @@ -687,21 +705,17 @@ static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, int rc; int pnob = 0; int cnob = 0; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); sg_init_table(&ptxt, 1); sg_init_table(&ctxt, 1); - blocksize = crypto_sync_skcipher_blocksize(tfm); + blocksize = 
crypto_blkcipher_blocksize(tfm); if (desc->bd_nob_transferred % blocksize != 0) { CERROR("Transfer not a multiple of block size: %d\n", desc->bd_nob_transferred); return GSS_S_DEFECTIVE_TOKEN; } - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; i++) { lnet_kiov_t *piov = &BD_GET_KIOV(desc, i); @@ -710,7 +724,6 @@ static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, if (ciov->kiov_offset % blocksize != 0 || ciov->kiov_len % blocksize != 0) { CERROR("Invalid bulk descriptor vector\n"); - skcipher_request_zero(req); return GSS_S_DEFECTIVE_TOKEN; } @@ -734,7 +747,6 @@ static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, if (ciov->kiov_len + cnob > desc->bd_nob_transferred || piov->kiov_len > ciov->kiov_len) { CERROR("Invalid decrypted length\n"); - skcipher_request_zero(req); return GSS_S_FAILURE; } } @@ -753,11 +765,10 @@ static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, if (piov->kiov_len % blocksize == 0) sg_assign_page(&ptxt, piov->kiov_page); - skcipher_request_set_crypt(req, &ctxt, &ptxt, ptxt.length, iv); - rc = crypto_skcipher_decrypt_iv(req, &ptxt, &ctxt, ptxt.length); + rc = crypto_blkcipher_decrypt_iv(&cdesc, &ptxt, &ctxt, + ctxt.length); if (rc) { CERROR("Decryption failed for page: %d\n", rc); - skcipher_request_zero(req); return GSS_S_FAILURE; } @@ -772,7 +783,6 @@ static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, cnob += ciov->kiov_len; pnob += piov->kiov_len; } - skcipher_request_zero(req); /* if needed, clear up the rest unused iovs */ if (adj_nob) @@ -800,7 +810,7 @@ __u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, int adj_nob) { struct sk_ctx *skc = gss_context->internal_ctx_id; - size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; struct sk_wire skw; struct sk_hdr skh; __u8 local_iv[SK_IV_SIZE]; @@ -817,16 +827,15 @@ __u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, desc, &skw.skw_cipher, adj_nob)) return GSS_S_FAILURE; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht_bytes; - if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, &skw.skw_cipher, - desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac, - gss_context->hash_func)) + skw.skw_hmac.len = sht->sht_bytes; + if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac)) return GSS_S_FAILURE; return GSS_S_COMPLETE; @@ -838,7 +847,7 @@ __u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, rawobj_t *token, int adj_nob) { struct sk_ctx *skc = gss_context->internal_ctx_id; - size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); + struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; struct sk_wire skw; struct sk_hdr *skh; __u8 local_iv[SK_IV_SIZE]; @@ -846,25 +855,25 @@ __u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, LASSERT(skc->sc_session_kb.kb_tfm); - if (token->len < sizeof(skh) + sht_bytes) + if (token->len < sizeof(skh) + sht->sht_bytes) return GSS_S_DEFECTIVE_TOKEN; skw.skw_header.data = token->data; 
skw.skw_header.len = sizeof(struct sk_hdr); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht_bytes; + skw.skw_hmac.len = sht->sht_bytes; skh = (struct sk_hdr *)skw.skw_header.data; rc = sk_verify_header(skh); if (rc != GSS_S_COMPLETE) return rc; - rc = sk_verify_bulk_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, - &skw.skw_cipher, desc->bd_iov_count, - GET_ENC_KIOV(desc), desc->bd_nob, - &skw.skw_hmac); + rc = sk_verify_bulk_hmac(&sk_hmac_types[skc->sc_hmac], + &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), + desc->bd_nob, &skw.skw_hmac); if (rc) return rc; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c index 2202e3f56f8c5..4798711dbe983 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -68,15 +69,12 @@ #include "gss_err.h" #include "gss_internal.h" #include "gss_api.h" -#include "gss_crypto.h" #define GSS_SVC_UPCALL_TIMEOUT (20) static spinlock_t __ctx_index_lock; static __u64 __ctx_index; -unsigned int krb5_allow_old_client_csum; - __u64 gss_get_next_ctx_index(void) { __u64 idx; @@ -162,18 +160,6 @@ static struct cache_detail rsi_cache; static struct rsi *rsi_update(struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct rsi *item); -#ifdef HAVE_CACHE_DETAIL_WRITERS -static inline int channel_users(struct cache_detail *cd) -{ - return atomic_read(&cd->writers); -} -#else -static inline int channel_users(struct cache_detail *cd) -{ - return atomic_read(&cd->readers); -} -#endif - static inline int rsi_hash(struct rsi *item) { return hash_mem((char *)item->in_handle.data, item->in_handle.len, @@ -313,9 +299,10 @@ static struct cache_head *rsi_alloc(void) static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { char *buf = mesg; + char *ep; int len; struct rsi rsii, *rsip = NULL; - time64_t expiry; + time_t expiry; int status = -EINVAL; ENTRY; @@ -354,21 +341,18 @@ static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) if (len <= 0) goto out; - /* major */ - status = kstrtoint(buf, 10, &rsii.major_status); - if (status) - goto out; - - /* minor */ - len = qword_get(&mesg, buf, mlen); - if (len <= 0) { - status = -EINVAL; - goto out; - } + /* major */ + rsii.major_status = simple_strtol(buf, &ep, 10); + if (*ep) + goto out; - status = kstrtoint(buf, 10, &rsii.minor_status); - if (status) - goto out; + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtol(buf, &ep, 10); + if (*ep) + goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); @@ -560,7 +544,7 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) char *buf = mesg; int len, rv, tmp_int; struct rsc rsci, *rscp = NULL; - time64_t expiry; + time_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; @@ -665,7 +649,8 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) /* currently the expiry time passed down from user-space * is invalid, here we retrive it from mech. 
*/ - if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, + (unsigned long *)&ctx_expiry)) { CERROR("unable to get expire time, drop it\n"); goto out; } @@ -735,6 +720,85 @@ static struct rsc *rsc_update(struct rsc *new, struct rsc *old) * rsc cache flush * ****************************************/ +typedef int rsc_entry_match(struct rsc *rscp, long data); + +static void rsc_flush(rsc_entry_match *match, long data) +{ +#ifdef HAVE_CACHE_HEAD_HLIST + struct cache_head *ch = NULL; + struct hlist_head *head; +#else + struct cache_head **ch; +#endif + struct rsc *rscp; + int n; + ENTRY; + + write_lock(&rsc_cache.hash_lock); + for (n = 0; n < RSC_HASHMAX; n++) { +#ifdef HAVE_CACHE_HEAD_HLIST + head = &rsc_cache.hash_table[n]; + hlist_for_each_entry(ch, head, cache_list) { + rscp = container_of(ch, struct rsc, h); +#else + for (ch = &rsc_cache.hash_table[n]; *ch;) { + rscp = container_of(*ch, struct rsc, h); +#endif + + if (!match(rscp, data)) { +#ifndef HAVE_CACHE_HEAD_HLIST + ch = &((*ch)->next); +#endif + continue; + } + + /* it seems simply set NEGATIVE doesn't work */ +#ifdef HAVE_CACHE_HEAD_HLIST + hlist_del_init(&ch->cache_list); +#else + *ch = (*ch)->next; + rscp->h.next = NULL; +#endif + cache_get(&rscp->h); + set_bit(CACHE_NEGATIVE, &rscp->h.flags); + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + rsc_cache.entries--; + } + } + write_unlock(&rsc_cache.hash_lock); + EXIT; +} + +static int match_uid(struct rsc *rscp, long uid) +{ + if ((int) uid == -1) + return 1; + return ((int) rscp->ctx.gsc_uid == (int) uid); +} + +static int match_target(struct rsc *rscp, long target) +{ + return (rscp->target == (struct obd_device *) target); +} + +static inline void rsc_flush_uid(int uid) +{ + if (uid == -1) + CWARN("flush all gss contexts...\n"); + + rsc_flush(match_uid, (long) uid); +} + +static inline void rsc_flush_target(struct obd_device *target) +{ + rsc_flush(match_target, (long) target); +} + +void gss_secsvc_flush(struct obd_device *target) +{ + rsc_flush_target(target); +} + static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) { struct rsc rsci; @@ -758,7 +822,7 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, struct gss_cli_ctx *gctx) { struct rsc rsci, *rscp = NULL; - time64_t ctx_expiry; + unsigned long ctx_expiry; __u32 major; int rc; ENTRY; @@ -782,7 +846,7 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, CERROR("unable to get expire time, drop it\n"); GOTO(out, rc = -EINVAL); } - rsci.h.expiry_time = ctx_expiry; + rsci.h.expiry_time = (time_t) ctx_expiry; switch (imp->imp_obd->u.cli.cl_sp_to) { case LUSTRE_SP_MDT: @@ -793,13 +857,6 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, break; case LUSTRE_SP_CLI: rsci.ctx.gsc_usr_root = 1; - break; - case LUSTRE_SP_MGS: - /* by convention, all 3 set to 1 means MGS */ - rsci.ctx.gsc_usr_mds = 1; - rsci.ctx.gsc_usr_oss = 1; - rsci.ctx.gsc_usr_root = 1; - break; default: break; } @@ -827,15 +884,15 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) { - const time64_t expire = 20; - struct rsc *rscp; + const cfs_time_t expire = 20; + struct rsc *rscp; rscp = gss_svc_searchbyctx(handle); if (rscp) { CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", &rscp->ctx, rscp); - rscp->h.expiry_time = ktime_get_real_seconds() + expire; + rscp->h.expiry_time = cfs_time_current_sec() + expire; COMPAT_RSC_PUT(&rscp->h, &rsc_cache); } return 0; @@ -889,11 +946,7 @@ int 
gss_svc_upcall_handle_init(struct ptlrpc_request *req, memset(&rsikey, 0, sizeof(rsikey)); rsikey.lustre_svc = lustre_svc; - /* In case of MR, rq_peer is not the NID from which request is received, - * but primary NID of peer. - * So we need rq_source, which contains the NID actually in use. - */ - rsikey.nid = (__u64) req->rq_source.nid; + rsikey.nid = (__u64) req->rq_peer.nid; nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, sizeof(rsikey.nm_name)); @@ -938,11 +991,11 @@ int gss_svc_upcall_handle_init(struct ptlrpc_request *req, if (first_check) { first_check = 0; - cache_read_lock(&rsi_cache); + read_lock(&rsi_cache.hash_lock); valid = test_bit(CACHE_VALID, &rsip->h.flags); if (valid == 0) set_current_state(TASK_INTERRUPTIBLE); - cache_read_unlock(&rsi_cache); + read_unlock(&rsi_cache.hash_lock); if (valid == 0) { unsigned long jiffies; @@ -991,20 +1044,6 @@ int gss_svc_upcall_handle_init(struct ptlrpc_request *req, grctx->src_ctx = &rsci->ctx; } - if (gw->gw_flags & LUSTRE_GSS_PACK_KCSUM) { - grctx->src_ctx->gsc_mechctx->hash_func = gss_digest_hash; - } else if (!strcmp(grctx->src_ctx->gsc_mechctx->mech_type->gm_name, - "krb5") && - !krb5_allow_old_client_csum) { - CWARN("%s: deny connection from '%s' due to missing 'krb_csum' feature, set 'sptlrpc.gss.krb5_allow_old_client_csum=1' to allow, but recommend client upgrade: rc = %d\n", - target->obd_name, libcfs_nid2str(req->rq_peer.nid), - -EPROTO); - GOTO(out, rc = SECSVC_DROP); - } else { - grctx->src_ctx->gsc_mechctx->hash_func = - gss_digest_hash_compat; - } - if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { CERROR("failed duplicate reverse handle\n"); GOTO(out, rc); @@ -1133,18 +1172,17 @@ int __init gss_init_svc_upcall(void) /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open * the init upcall channel, otherwise there's big chance that the first * upcall issued before the channel be opened thus nfsv4 cache code will - * drop the request directly, thus lead to unnecessary recovery time. - * Here we wait at minimum 1.5 seconds. - */ + * drop the request direclty, thus lead to unnecessary recovery time. + * here we wait at miximum 1.5 seconds. 
*/ for (i = 0; i < 6; i++) { - if (channel_users(&rsi_cache) > 0) + if (atomic_read(&rsi_cache.readers) > 0) break; set_current_state(TASK_UNINTERRUPTIBLE); - LASSERT(msecs_to_jiffies(MSEC_PER_SEC / 4) > 0); + LASSERT(msecs_to_jiffies(MSEC_PER_SEC) >= 4); schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC / 4)); } - if (channel_users(&rsi_cache) == 0) + if (atomic_read(&rsi_cache.readers) == 0) CWARN("Init channel is not opened by lsvcgssd, following " "request might be dropped until lsvcgssd is active\n"); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c index 1335ffd466ff3..610f0b38c8d4f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -132,29 +133,7 @@ static const struct file_operations gss_proc_secinit = { .write = gss_proc_write_secinit, }; -int sptlrpc_krb5_allow_old_client_csum_seq_show(struct seq_file *m, void *data) -{ - seq_printf(m, "%u\n", krb5_allow_old_client_csum); - return 0; -} - -ssize_t sptlrpc_krb5_allow_old_client_csum_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - bool val; - int rc; - - rc = kstrtobool_from_user(buffer, count, &val); - if (rc) - return rc; - - krb5_allow_old_client_csum = val; - return count; -} -LPROC_SEQ_FOPS(sptlrpc_krb5_allow_old_client_csum); - -static struct ldebugfs_vars gss_debugfs_vars[] = { +static struct lprocfs_vars gss_lprocfs_vars[] = { { .name = "replays", .fops = &gss_proc_oos_fops }, { .name = "init_channel", @@ -163,12 +142,6 @@ static struct ldebugfs_vars gss_debugfs_vars[] = { { NULL } }; -static struct lprocfs_vars gss_lprocfs_vars[] = { - { .name = "krb5_allow_old_client_csum", - .fops = &sptlrpc_krb5_allow_old_client_csum_fops }, - { NULL } -}; - /* * for userspace helper lgss_keyring. * @@ -186,14 +159,14 @@ static ssize_t gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { - unsigned int val; int rc; + __s64 val; - rc = kstrtouint_from_user(buffer, count, 0, &val); + rc = lprocfs_str_to_s64(buffer, count, &val); if (rc < 0) return rc; - if (val > 4) + if (val < 0 || val > 4) return -ERANGE; gss_lk_debug_level = val; @@ -202,7 +175,7 @@ gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, } LPROC_SEQ_FOPS(gss_lk_proc_dl); -static struct ldebugfs_vars gss_lk_debugfs_vars[] = { +static struct lprocfs_vars gss_lk_lprocfs_vars[] = { { .name = "debug_level", .fops = &gss_lk_proc_dl_fops }, { NULL } @@ -236,7 +209,7 @@ int gss_init_lproc(void) } gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root, - gss_lk_ldebugfs_vars, NULL); + gss_lk_lprocfs_vars, NULL); if (IS_ERR(gss_proc_lk)) { rc = PTR_ERR(gss_proc_lk); gss_proc_lk = NULL; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c index 845bfbca44d51..bee52f3751356 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -3,7 +3,7 @@ * * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
* * Author: Eric Mei */ @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -308,11 +309,11 @@ int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) if (!ctx->cc_early_expire) clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); - CWARN("ctx %p(%u->%s) get expired: %lld(%+llds)\n", + CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n", ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), ctx->cc_expire, ctx->cc_expire == 0 ? 0 : - ctx->cc_expire - ktime_get_real_seconds()); + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); sptlrpc_cli_ctx_wakeup(ctx); return 1; @@ -335,7 +336,7 @@ int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) return 0; /* check real expiration */ - if (ctx->cc_expire > ktime_get_real_seconds()) + if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec())) return 0; cli_ctx_expire(ctx); @@ -344,8 +345,8 @@ int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) { - struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; - time64_t ctx_expiry; + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + unsigned long ctx_expiry; if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { CERROR("ctx %p(%u): unable to inquire, expire it now\n", @@ -364,17 +365,17 @@ void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) if (sec_is_reverse(ctx->cc_sec)) { CWARN("server installed reverse ctx %p idx %#llx, " - "expiry %lld(%+llds)\n", ctx, + "expiry %lu(%+lds)\n", ctx, gss_handle_to_u64(&gctx->gc_handle), ctx->cc_expire, - ctx->cc_expire - ktime_get_real_seconds()); + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); } else { CWARN("client refreshed ctx %p idx %#llx (%u->%s), " - "expiry %lld(%+llds)\n", ctx, + "expiry %lu(%+lds)\n", ctx, gss_handle_to_u64(&gctx->gc_handle), ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), ctx->cc_expire, - ctx->cc_expire - ktime_get_real_seconds()); + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); /* install reverse svc ctx for root context */ if (ctx->cc_vcred.vc_uid == 0) @@ -1102,9 +1103,6 @@ int gss_sec_create_common(struct gss_sec *gsec, sec->ps_import = class_import_get(imp); spin_lock_init(&sec->ps_lock); INIT_LIST_HEAD(&sec->ps_gc_list); - sec->ps_sepol_mtime = ktime_set(0, 0); - sec->ps_sepol_checknext = ktime_set(0, 0); - sec->ps_sepol[0] = '\0'; if (!svcctx) { sec->ps_gc_interval = GSS_GC_INTERVAL; @@ -2057,17 +2055,16 @@ int gss_svc_handle_init(struct ptlrpc_request *req, if (rc != SECSVC_OK) RETURN(rc); - if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || - grctx->src_ctx->gsc_usr_root) - CWARN("create svc ctx %p: user from %s authenticated as %s\n", - grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), - grctx->src_ctx->gsc_usr_root ? "root" : - (grctx->src_ctx->gsc_usr_mds ? "mds" : - (grctx->src_ctx->gsc_usr_oss ? "oss" : "null"))); - else - CWARN("create svc ctx %p: accept user %u from %s\n", - grctx->src_ctx, grctx->src_ctx->gsc_uid, - libcfs_nid2str(req->rq_peer.nid)); + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? 
"oss" : "root")); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { if (reqbuf->lm_bufcount < 4) { diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c index 46d92bf4ed2d0..827a989f1e139 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,10 +56,10 @@ struct ptlrpc_connect_async_args { /** * Updates import \a imp current state to provided \a state value - * Helper function. + * Helper function. Must be called under imp_lock. */ -static void import_set_state_nolock(struct obd_import *imp, - enum lustre_imp_state state) +static void __import_set_state(struct obd_import *imp, + enum lustre_imp_state state) { switch (state) { case LUSTRE_IMP_CLOSED: @@ -72,20 +72,7 @@ static void import_set_state_nolock(struct obd_import *imp, break; default: imp->imp_replay_state = LUSTRE_IMP_REPLAY; - break; } - - /* A CLOSED import should remain so. */ - if (imp->imp_state == LUSTRE_IMP_CLOSED) - return; - - if (imp->imp_state != LUSTRE_IMP_NEW) { - CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", - imp, obd2cli_tgt(imp->imp_obd), - ptlrpc_import_state_name(imp->imp_state), - ptlrpc_import_state_name(state)); - } - imp->imp_state = state; imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = @@ -94,17 +81,28 @@ static void import_set_state_nolock(struct obd_import *imp, IMP_STATE_HIST_LEN; } -static void import_set_state(struct obd_import *imp, - enum lustre_imp_state new_state) -{ - spin_lock(&imp->imp_lock); - import_set_state_nolock(imp, new_state); - spin_unlock(&imp->imp_lock); -} +/* A CLOSED import should remain so. */ +#define IMPORT_SET_STATE_NOLOCK(imp, state) \ +do { \ + if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ + imp, obd2cli_tgt(imp->imp_obd), \ + ptlrpc_import_state_name(imp->imp_state), \ + ptlrpc_import_state_name(state)); \ + __import_set_state(imp, state); \ + } \ +} while(0) + +#define IMPORT_SET_STATE(imp, state) \ +do { \ + spin_lock(&imp->imp_lock); \ + IMPORT_SET_STATE_NOLOCK(imp, state); \ + spin_unlock(&imp->imp_lock); \ +} while(0) void ptlrpc_import_enter_resend(struct obd_import *imp) { - import_set_state(imp, LUSTRE_IMP_RECOVER); + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); } EXPORT_SYMBOL(ptlrpc_import_enter_resend); @@ -148,21 +146,6 @@ void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) *uuid_len -= strlen(UUID_STR); } -/* Must be called with imp_lock held! */ -static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) -{ - ENTRY; - - assert_spin_locked(&imp->imp_lock); - CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); - imp->imp_invalid = 1; - imp->imp_generation++; - - ptlrpc_abort_inflight(imp); - - EXIT; -} - /** * Returns true if import was FULL, false if import was already not * connected. 
@@ -173,10 +156,8 @@ static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) * bulk requests) and if one has already caused a reconnection * (increasing the import->conn_cnt) the older failure should * not also cause a reconnection. If zero it forces a reconnect. - * @invalid - set import invalid flag */ -int ptlrpc_set_import_discon(struct obd_import *imp, - __u32 conn_cnt, bool invalid) +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) { int rc = 0; @@ -186,43 +167,31 @@ int ptlrpc_set_import_discon(struct obd_import *imp, (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { char *target_start; int target_len; - bool inact = false; deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); - import_set_state_nolock(imp, LUSTRE_IMP_DISCON); if (imp->imp_replayable) { LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " "lost; in progress operations using this " "service will wait for recovery to complete\n", imp->imp_obd->obd_name, target_len, target_start, - obd_import_nid2str(imp)); - } else { - LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " - "%.*s (at %s) was lost; in progress " - "operations using this service will fail\n", - imp->imp_obd->obd_name, target_len, target_start, - obd_import_nid2str(imp)); - if (invalid) { - CDEBUG(D_HA, "import %s@%s for %s not " - "replayable, auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import_nolock(imp); - inact = true; - } - } + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); spin_unlock(&imp->imp_lock); if (obd_dump_on_timeout) libcfs_debug_dumplog(); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); - - if (inact) - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); rc = 1; } else { spin_unlock(&imp->imp_lock); @@ -237,6 +206,23 @@ int ptlrpc_set_import_discon(struct obd_import *imp, return rc; } +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) +{ + ENTRY; + assert_spin_locked(&imp->imp_lock); + + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + + ptlrpc_abort_inflight(imp); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + EXIT; +} + /* * This acts as a barrier; all existing requests are rejected, and * no new requests will be accepted until the import is valid again. 
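ptlrpc_deactivate_and_unlock_import() added above is a lock-handoff helper: it must be entered with imp_lock held, marks the import invalid and bumps imp_generation under that lock, then drops the lock itself before aborting in-flight requests and raising IMP_EVENT_INACTIVE, work that is better done without a spinlock held. A minimal sketch of the idiom, using hypothetical demo_* names rather than the real Lustre structures:

#include <linux/spinlock.h>

struct demo_import {
        spinlock_t      di_lock;
        int             di_invalid;
        int             di_generation;
};

/* Stand-ins for ptlrpc_abort_inflight() and obd_import_event(). */
static void demo_abort_inflight(struct demo_import *di) { }
static void demo_notify_inactive(struct demo_import *di) { }

/* Enter with di_lock held; returns with it released. */
static void demo_deactivate_and_unlock(struct demo_import *di)
{
        assert_spin_locked(&di->di_lock);

        di->di_invalid = 1;
        di->di_generation++;
        spin_unlock(&di->di_lock);

        demo_abort_inflight(di);
        demo_notify_inactive(di);
}

/* Mirrors how ptlrpc_deactivate_import() uses the helper in the next hunk. */
static void demo_deactivate(struct demo_import *di)
{
        spin_lock(&di->di_lock);
        demo_deactivate_and_unlock(di);
}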
@@ -244,17 +230,14 @@ int ptlrpc_set_import_discon(struct obd_import *imp, void ptlrpc_deactivate_import(struct obd_import *imp) { spin_lock(&imp->imp_lock); - ptlrpc_deactivate_import_nolock(imp); - spin_unlock(&imp->imp_lock); - - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + ptlrpc_deactivate_and_unlock_import(imp); } EXPORT_SYMBOL(ptlrpc_deactivate_import); -static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, - time64_t now) +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) { - time64_t dl; + long dl; if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || (req->rq_phase == RQ_PHASE_BULK) || @@ -275,12 +258,12 @@ static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, return dl - now; } -static time64_t ptlrpc_inflight_timeout(struct obd_import *imp) +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) { time64_t now = ktime_get_real_seconds(); struct list_head *tmp, *n; struct ptlrpc_request *req; - time64_t timeout = 0; + unsigned int timeout = 0; spin_lock(&imp->imp_lock); list_for_each_safe(tmp, n, &imp->imp_sending_list) { @@ -302,7 +285,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) struct list_head *tmp, *n; struct ptlrpc_request *req; struct l_wait_info lwi; - time64_t timeout; + unsigned int timeout; int rc; atomic_inc(&imp->imp_inval_count); @@ -322,35 +305,30 @@ void ptlrpc_invalidate_import(struct obd_import *imp) * unlink. We can't do anything before that because there is really * no guarantee that some rdma transfer is not in progress right now. */ do { - long timeout_jiffies; - /* Calculate max timeout for waiting on rpcs to error * out. Use obd_timeout if calculated value is smaller - * than it. - */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { - timeout = ptlrpc_inflight_timeout(imp); - timeout += div_u64(timeout, 3); - - if (timeout == 0) - timeout = obd_timeout; - } else { - /* decrease the interval to increase race condition */ - timeout = 1; - } + * than it. */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; - CDEBUG(D_RPCTRACE, "Sleeping %llds for inflight to error out\n", - timeout); + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n", + timeout); /* Wait for all requests to error out and call completion * callbacks. Cap it at obd_timeout -- these should all - * have been locally cancelled by ptlrpc_abort_inflight. - */ - timeout_jiffies = max_t(long, cfs_time_seconds(timeout), 1); - lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, - (timeout > 1) ? cfs_time_seconds(1) : - cfs_time_seconds(1) / 2, - NULL, NULL); + * have been locally cancelled by ptlrpc_abort_inflight. 
*/ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), &lwi); @@ -418,23 +396,17 @@ void ptlrpc_invalidate_import(struct obd_import *imp) EXPORT_SYMBOL(ptlrpc_invalidate_import); /* unset imp_invalid */ -void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full) +void ptlrpc_activate_import(struct obd_import *imp) { struct obd_device *obd = imp->imp_obd; spin_lock(&imp->imp_lock); if (imp->imp_deactive != 0) { - LASSERT(imp->imp_state != LUSTRE_IMP_FULL); - if (imp->imp_state != LUSTRE_IMP_DISCON) - import_set_state_nolock(imp, LUSTRE_IMP_DISCON); spin_unlock(&imp->imp_lock); return; } - if (set_state_full) - import_set_state_nolock(imp, LUSTRE_IMP_FULL); imp->imp_invalid = 0; - spin_unlock(&imp->imp_lock); obd_import_event(obd, imp, IMP_EVENT_ACTIVE); } @@ -456,36 +428,45 @@ EXPORT_SYMBOL(ptlrpc_pinger_force); void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) { - ENTRY; + ENTRY; - LASSERT(!imp->imp_dlm_fake); + LASSERT(!imp->imp_dlm_fake); - if (ptlrpc_set_import_discon(imp, conn_cnt, true)) - ptlrpc_pinger_force(imp); + if (ptlrpc_set_import_discon(imp, conn_cnt)) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, " + "auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + ptlrpc_pinger_force(imp); + } EXIT; } int ptlrpc_reconnect_import(struct obd_import *imp) { #ifdef ENABLE_PINGER - long timeout_jiffies = cfs_time_seconds(obd_timeout); struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); int rc; ptlrpc_pinger_force(imp); CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", - obd2cli_tgt(imp->imp_obd), obd_timeout); + obd2cli_tgt(imp->imp_obd), secs); - lwi = LWI_TIMEOUT(timeout_jiffies, NULL, NULL); + lwi = LWI_TIMEOUT(secs, NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(imp->imp_state)); return rc; #else - ptlrpc_set_import_discon(imp, 0, false); + ptlrpc_set_import_discon(imp, 0); /* Force a new connect attempt */ ptlrpc_invalidate_import(imp); /* Do a fresh connect next time by zeroing the handle */ @@ -506,7 +487,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp) /* Allow reconnect attempts */ imp->imp_obd->obd_no_recov = 0; /* Remove 'invalid' flag */ - ptlrpc_activate_import(imp, false); + ptlrpc_activate_import(imp); /* Attempt a new connect */ ptlrpc_recover_import(imp, NULL, 0); return 0; @@ -537,7 +518,7 @@ static int import_select_connection(struct obd_import *imp) } list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { - CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n", + CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n", imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); @@ -545,7 +526,8 @@ static int import_select_connection(struct obd_import *imp) /* If we have not tried this connection since the last successful attempt, go with this one */ if ((conn->oic_last_attempt == 0) || - conn->oic_last_attempt <= imp->imp_last_success_conn) { + cfs_time_beforeq_64(conn->oic_last_attempt, + imp->imp_last_success_conn)) { imp_conn = conn; tried_all = 0; break; @@ -556,7 +538,8 @@ static int 
import_select_connection(struct obd_import *imp) least recently used */ if (!imp_conn) imp_conn = conn; - else if (imp_conn->oic_last_attempt > conn->oic_last_attempt) + else if (cfs_time_before_64(conn->oic_last_attempt, + imp_conn->oic_last_attempt)) imp_conn = conn; } @@ -585,7 +568,7 @@ static int import_select_connection(struct obd_import *imp) "to %ds\n", imp->imp_obd->obd_name, at_get(at)); } - imp_conn->oic_last_attempt = ktime_get_seconds(); + imp_conn->oic_last_attempt = cfs_time_current_64(); /* switch connection, don't mind if it's same as the current one */ if (imp->imp_connection) @@ -656,41 +639,29 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) return 0; } -int ptlrpc_connect_import(struct obd_import *imp) -{ - spin_lock(&imp->imp_lock); - return ptlrpc_connect_import_locked(imp); -} - /** * Attempt to (re)connect import \a imp. This includes all preparations, * initializing CONNECT RPC request and passing it to ptlrpcd for * actual sending. - * - * Assumes imp->imp_lock is held, and releases it. - * * Returns 0 on success or error code. */ -int ptlrpc_connect_import_locked(struct obd_import *imp) +int ptlrpc_connect_import(struct obd_import *imp) { struct obd_device *obd = imp->imp_obd; int initial_connect = 0; int set_transno = 0; __u64 committed_before_reconnect = 0; struct ptlrpc_request *request; - struct obd_connect_data ocd; char *bufs[] = { NULL, obd2cli_tgt(imp->imp_obd), obd->obd_uuid.uuid, (char *)&imp->imp_dlm_handle, - (char *)&ocd, - NULL }; + (char *)&imp->imp_connect_data }; struct ptlrpc_connect_async_args *aa; int rc; ENTRY; - assert_spin_locked(&imp->imp_lock); - + spin_lock(&imp->imp_lock); if (imp->imp_state == LUSTRE_IMP_CLOSED) { spin_unlock(&imp->imp_lock); CERROR("can't connect to a closed import\n"); @@ -707,7 +678,7 @@ int ptlrpc_connect_import_locked(struct obd_import *imp) RETURN(-EALREADY); } - import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); imp->imp_conn_cnt++; imp->imp_resend_replay = 0; @@ -731,16 +702,15 @@ int ptlrpc_connect_import_locked(struct obd_import *imp) /* Reset connect flags to the originally requested flags, in case * the server is updated on-the-fly we will get the new features. */ - ocd = imp->imp_connect_data; - ocd.ocd_connect_flags = imp->imp_connect_flags_orig; - ocd.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; + imp->imp_connect_data.ocd_connect_flags2 = imp->imp_connect_flags2_orig; /* Reset ocd_version each time so the server knows the exact versions */ - ocd.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, - &obd->obd_uuid, &ocd, NULL); + &obd->obd_uuid, &imp->imp_connect_data, NULL); if (rc) GOTO(out, rc); @@ -748,19 +718,6 @@ int ptlrpc_connect_import_locked(struct obd_import *imp) if (request == NULL) GOTO(out, rc = -ENOMEM); - /* get SELinux policy info if any */ - rc = sptlrpc_get_sepol(request); - if (rc < 0) { - ptlrpc_request_free(request); - GOTO(out, rc); - } - - bufs[5] = request->rq_sepol; - - req_capsule_set_size(&request->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, - strlen(request->rq_sepol) ? 
- strlen(request->rq_sepol) + 1 : 0); - rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, imp->imp_connect_op, bufs, NULL); if (rc) { @@ -770,8 +727,8 @@ int ptlrpc_connect_import_locked(struct obd_import *imp) /* Report the rpc service time to the server so that it knows how long * to wait for clients to join recovery */ - lustre_msg_set_service_timeout(request->rq_reqmsg, - at_timeout2est(request->rq_timeout)); + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); /* The amount of time we give the server to process the connect req. * import_select_connection will increase the net latency on @@ -814,7 +771,7 @@ int ptlrpc_connect_import_locked(struct obd_import *imp) rc = 0; out: if (rc != 0) - import_set_state(imp, LUSTRE_IMP_DISCON); + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); RETURN(rc); } @@ -838,9 +795,9 @@ static int ptlrpc_busy_reconnect(int rc) } static int ptlrpc_connect_set_flags(struct obd_import *imp, - struct obd_connect_data *ocd, - __u64 old_connect_flags, - struct obd_export *exp, int init_connect) + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) { static bool warned; struct client_obd *cli = &imp->imp_obd->u.cli; @@ -854,6 +811,7 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, spin_unlock(&imp->imp_lock); + if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && (ocd->ocd_version > LUSTRE_VERSION_CODE + LUSTRE_VERSION_OFFSET_WARN || @@ -864,7 +822,7 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, const char *older = "older than client. " "Consider upgrading server"; const char *newer = "newer than client. " - "Consider upgrading client"; + "Consider recompiling application"; LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) " "is much %s (%s)\n", @@ -878,18 +836,37 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, warned = true; } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* Check if server has LU-1252 fix applied to not always swab + * the IR MNE entries. Do this only once per connection. This + * fixup is version-limited, because we don't want to carry the + * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we + * need interop with unpatched 2.2 servers. For newer servers, + * the client will do MNE swabbing only as needed. LU-1644 */ + if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && + OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && + strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) == 0)) + imp->imp_need_mne_swab = 1; + else /* clear if server was upgraded since last connect */ + imp->imp_need_mne_swab = 0; +#endif + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { /* We sent to the server ocd_cksum_types with bits set * for algorithms we understand. 
The server masked off * the checksum types it doesn't support */ if ((ocd->ocd_cksum_types & - obd_cksum_types_supported_client()) == 0) { + cksum_types_supported_client()) == 0) { LCONSOLE_ERROR("The negotiation of the checksum " "alogrithm to use with server %s " "failed (%x/%x)\n", obd2cli_tgt(imp->imp_obd), ocd->ocd_cksum_types, - obd_cksum_types_supported_client()); + cksum_types_supported_client()); return -EPROTO; } else { cli->cl_supp_cksum_types = ocd->ocd_cksum_types; @@ -899,8 +876,7 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, * Enforce ADLER for backward compatibility*/ cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; } - cli->cl_cksum_type = obd_cksum_type_select(imp->imp_obd->obd_name, - cli->cl_supp_cksum_types); + cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) cli->cl_max_pages_per_rpc = @@ -929,17 +905,13 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, * this leads to losing user settings done before such as * disable lru_resize, etc. */ if (old_connect_flags != exp_connect_flags(exp) || init_connect) { - struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; - __u64 changed_flags; - - changed_flags = - ns->ns_connect_flags ^ ns->ns_orig_connect_flags; CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " "flags: %#llx\n", imp->imp_obd->obd_name, ocd->ocd_connect_flags); - ns->ns_connect_flags = (ns->ns_connect_flags & changed_flags) | - (ocd->ocd_connect_flags & ~changed_flags); - ns->ns_orig_connect_flags = ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_connect_flags = + ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_orig_connect_flags = + ocd->ocd_connect_flags; } if (ocd->ocd_connect_flags & OBD_CONNECT_AT) @@ -1005,7 +977,6 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, struct obd_import *imp = request->rq_import; struct lustre_handle old_hdl; __u64 old_connect_flags; - timeout_t service_timeout; int msg_flags; struct obd_connect_data *ocd; struct obd_export *exp = NULL; @@ -1020,25 +991,11 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, } if (rc) { - struct ptlrpc_request *free_req; - struct ptlrpc_request *tmp; - - /* abort all delayed requests initiated connection */ - list_for_each_entry_safe(free_req, tmp, &imp->imp_delayed_list, - rq_list) { - spin_lock(&free_req->rq_lock); - if (free_req->rq_no_resend) { - free_req->rq_err = 1; - free_req->rq_status = -EIO; - ptlrpc_client_wake_req(free_req); - } - spin_unlock(&free_req->rq_lock); - } - /* if this reconnect to busy export - not need select new target * for connecting*/ imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); spin_unlock(&imp->imp_lock); + ptlrpc_maybe_ping_import_soon(imp); GOTO(out, rc); } @@ -1138,11 +1095,10 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_obd->obd_self_export->exp_connect_data = *ocd; /* The net statistics after (re-)connect is not valid anymore, - * because may reflect other routing, etc. - */ - service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg); + * because may reflect other routing, etc. 
*/ at_reinit(&imp->imp_at.iat_net_latency, 0, 0); - ptlrpc_at_adj_net_latency(request, service_timeout); + ptlrpc_at_adj_net_latency(request, + lustre_msg_get_service_time(request->rq_repmsg)); /* Import flags should be updated before waking import at FULL state */ rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, @@ -1159,10 +1115,12 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, spin_lock(&imp->imp_lock); if (msg_flags & MSG_CONNECT_REPLAYABLE) { imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); CDEBUG(D_HA, "connected to replayable target: %s\n", obd2cli_tgt(imp->imp_obd)); } else { imp->imp_replayable = 0; + spin_unlock(&imp->imp_lock); } /* if applies, adjust the imp->imp_msg_magic here @@ -1177,11 +1135,10 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, if (msg_flags & MSG_CONNECT_RECOVERING) { CDEBUG(D_HA, "connect to %s during recovery\n", obd2cli_tgt(imp->imp_obd)); - import_set_state_nolock(imp, LUSTRE_IMP_REPLAY_LOCKS); - spin_unlock(&imp->imp_lock); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); } else { - spin_unlock(&imp->imp_lock); - ptlrpc_activate_import(imp, true); + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); } GOTO(finish, rc = 0); @@ -1239,7 +1196,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); if (!(MSG_CONNECT_RECOVERING & msg_flags)) { - import_set_state(imp, LUSTRE_IMP_EVICTED); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); GOTO(finish, rc = 0); } @@ -1252,7 +1209,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, if (imp->imp_invalid) { CDEBUG(D_HA, "%s: reconnected but import is invalid; " "marking evicted\n", imp->imp_obd->obd_name); - import_set_state(imp, LUSTRE_IMP_EVICTED); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); } else if (MSG_CONNECT_RECOVERING & msg_flags) { CDEBUG(D_HA, "%s: reconnected to %s during replay\n", imp->imp_obd->obd_name, @@ -1262,9 +1219,9 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_resend_replay = 1; spin_unlock(&imp->imp_lock); - import_set_state(imp, imp->imp_replay_state); + IMPORT_SET_STATE(imp, imp->imp_replay_state); } else { - import_set_state(imp, LUSTRE_IMP_RECOVER); + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); } } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { LASSERT(imp->imp_replayable); @@ -1272,13 +1229,13 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); imp->imp_last_replay_transno = 0; imp->imp_replay_cursor = &imp->imp_committed_list; - import_set_state(imp, LUSTRE_IMP_REPLAY); - } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" " not set: %x)", imp->imp_obd->obd_name, msg_flags); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); - import_set_state(imp, LUSTRE_IMP_EVICTED); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); } /* Sanity checks for a reconnected import. 
*/ @@ -1315,45 +1272,40 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, } out: - if (exp != NULL) - class_export_put(exp); - spin_lock(&imp->imp_lock); imp->imp_connected = 0; imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); - if (rc != 0) { - bool inact = false; - time64_t now = ktime_get_seconds(); - time64_t next_connect; - - import_set_state_nolock(imp, LUSTRE_IMP_DISCON); - if (rc == -EACCES) { - /* - * Give up trying to reconnect - * EACCES means client has no permission for connection - */ - imp->imp_obd->obd_no_recov = 1; - ptlrpc_deactivate_import_nolock(imp); - inact = true; - } else if (rc == -EPROTO) { - struct obd_connect_data *ocd; - - /* reply message might not be ready */ - if (request->rq_repmsg == NULL) { - spin_unlock(&imp->imp_lock); - RETURN(-EPROTO); - } + if (exp != NULL) + class_export_put(exp); + + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import(imp); + } - ocd = req_capsule_server_get(&request->rq_pill, - &RMF_CONNECT_DATA); - /* Servers are not supposed to refuse connections from - * clients based on version, only connection feature - * flags. We should never see this from llite, but it - * may be useful for debugging in the future. */ - if (ocd && - (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) + RETURN(-EPROTO); + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + /* Actually servers are only supposed to refuse + connection from liblustre clients, so we should + never see this from VFS context */ LCONSOLE_ERROR_MSG(0x16a, "Server %s version " "(%d.%d.%d.%d)" " refused connection from this client " @@ -1365,59 +1317,17 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, OBD_OCD_VERSION_PATCH(ocd->ocd_version), OBD_OCD_VERSION_FIX(ocd->ocd_version), LUSTRE_VERSION_STRING); - ptlrpc_deactivate_import_nolock(imp); - import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); - inact = true; - } - } else if (rc == -ENODEV || rc == -ETIMEDOUT) { - /* ENODEV means there is no service, force reconnection - * to a pair if attempt happen ptlrpc_next_reconnect - * before now. ETIMEDOUT could be set during network - * error and do not guarantee request deadline happened. - */ - struct obd_import_conn *conn; - time64_t reconnect_time; - - /* Same as ptlrpc_next_reconnect, but in past */ - reconnect_time = now - INITIAL_CONNECT_TIMEOUT; - list_for_each_entry(conn, &imp->imp_conn_list, - oic_item) { - if (conn->oic_last_attempt <= reconnect_time) { - imp->imp_force_verify = 1; - break; - } - } - } - - next_connect = imp->imp_conn_current->oic_last_attempt + - (request->rq_deadline - request->rq_sent); - spin_unlock(&imp->imp_lock); - - if (inact) - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); - - if (rc == -EPROTO) - RETURN(rc); - - /* adjust imp_next_ping to request deadline + 1 and reschedule - * a pinger if import lost processing during CONNECTING or far - * away from request deadline. It could happen when connection - * was initiated outside of pinger, like - * ptlrpc_set_import_discon(). 
- */ - if (!imp->imp_force_verify && (imp->imp_next_ping <= now || - imp->imp_next_ping > next_connect)) { - imp->imp_next_ping = max(now, next_connect) + 1; - ptlrpc_pinger_wake_up(); - } + ptlrpc_deactivate_import(imp); + IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); + } + RETURN(-EPROTO); + } ptlrpc_maybe_ping_import_soon(imp); CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", obd2cli_tgt(imp->imp_obd), (char *)imp->imp_connection->c_remote_uuid.uuid, rc); - } else { - spin_unlock(&imp->imp_lock); } wake_up_all(&imp->imp_recovery_waitq); @@ -1466,8 +1376,8 @@ static int signal_completed_replay(struct obd_import *imp) if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) RETURN(0); - if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1)) - RETURN(0); + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + atomic_inc(&imp->imp_replay_inflight); req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, OBD_PING); @@ -1512,7 +1422,7 @@ static int ptlrpc_invalidate_import_thread(void *data) libcfs_debug_dumplog(); } - import_set_state(imp, LUSTRE_IMP_RECOVER); + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); ptlrpc_import_recovery_state_machine(imp); class_import_put(imp); @@ -1548,8 +1458,6 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) ENTRY; if (imp->imp_state == LUSTRE_IMP_EVICTED) { - struct task_struct *task; - deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); /* Don't care about MGC eviction */ @@ -1560,7 +1468,6 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) "using this service will fail.\n", imp->imp_obd->obd_name, target_len, target_start); - LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction"); } CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", obd2cli_tgt(imp->imp_obd), @@ -1570,22 +1477,24 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) imp->imp_vbr_failed = 0; spin_unlock(&imp->imp_lock); + { + struct task_struct *task; /* bug 17802: XXX client_disconnect_export vs connect request * race. if client is evicted at this time then we start * invalidate thread without reference to import and import can * be freed at same time. 
*/ class_import_get(imp); task = kthread_run(ptlrpc_invalidate_import_thread, imp, - "ll_imp_inval"); + "ll_imp_inval"); if (IS_ERR(task)) { class_import_put(imp); + CERROR("error starting invalidate thread: %d\n", rc); rc = PTR_ERR(task); - CERROR("%s: can't start invalidate thread: rc = %d\n", - imp->imp_obd->obd_name, rc); } else { rc = 0; } RETURN(rc); + } } if (imp->imp_state == LUSTRE_IMP_REPLAY) { @@ -1594,7 +1503,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) rc = ptlrpc_replay_next(imp, &inflight); if (inflight == 0 && atomic_read(&imp->imp_replay_inflight) == 0) { - import_set_state(imp, LUSTRE_IMP_REPLAY_LOCKS); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); rc = ldlm_replay_locks(imp); if (rc) GOTO(out, rc); @@ -1604,7 +1513,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { if (atomic_read(&imp->imp_replay_inflight) == 0) { - import_set_state(imp, LUSTRE_IMP_REPLAY_WAIT); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); rc = signal_completed_replay(imp); if (rc) GOTO(out, rc); @@ -1613,28 +1522,24 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { if (atomic_read(&imp->imp_replay_inflight) == 0) { - import_set_state(imp, LUSTRE_IMP_RECOVER); + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); } } - if (imp->imp_state == LUSTRE_IMP_RECOVER) { + if (imp->imp_state == LUSTRE_IMP_RECOVER) { struct ptlrpc_connection *conn = imp->imp_connection; - rc = ptlrpc_resend(imp); - if (rc) - GOTO(out, rc); - ptlrpc_activate_import(imp, true); - - CDEBUG_LIMIT(imp->imp_was_idle ? - imp->imp_idle_debug : D_CONSOLE, - "%s: Connection restored to %s (at %s)\n", - imp->imp_obd->obd_name, - obd_uuid2str(&conn->c_remote_uuid), - obd_import_nid2str(imp)); - spin_lock(&imp->imp_lock); - imp->imp_was_idle = 0; - spin_unlock(&imp->imp_lock); - } + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + + LCONSOLE_INFO("%s: Connection restored to %s (at %s)\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } if (imp->imp_state == LUSTRE_IMP_FULL) { wake_up_all(&imp->imp_recovery_waitq); @@ -1645,12 +1550,15 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) RETURN(rc); } -static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) { struct ptlrpc_request *req; int rq_opc, rc = 0; ENTRY; + if (imp->imp_obd->obd_force) + GOTO(set_state, rc); + switch (imp->imp_connect_op) { case OST_CONNECT: rq_opc = OST_DISCONNECT; @@ -1667,67 +1575,26 @@ static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) "(connect_op %d): rc = %d\n", imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), imp->imp_connect_op, rc); - RETURN(ERR_PTR(rc)); + RETURN(rc); } - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, - LUSTRE_OBD_VERSION, rq_opc); - if (req == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - /* We are disconnecting, do not retry a failed DISCONNECT rpc if - * it fails. We can get through the above with a down server - * if the client doesn't know the server is gone yet. */ - req->rq_no_resend = 1; - - /* We want client umounts to happen quickly, no matter the - server state... 
*/ - req->rq_timeout = min_t(timeout_t, req->rq_timeout, - INITIAL_CONNECT_TIMEOUT); - - import_set_state(imp, LUSTRE_IMP_CONNECTING); - req->rq_send_state = LUSTRE_IMP_CONNECTING; - ptlrpc_request_set_replen(req); - - RETURN(req); -} - -int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) -{ - struct ptlrpc_request *req; - int rc = 0; - ENTRY; - - if (imp->imp_obd->obd_force) - GOTO(set_state, rc); + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + cfs_duration_t timeout; - /* probably the import has been disconnected already being idle */ - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_IDLE) - GOTO(out, rc); - spin_unlock(&imp->imp_lock); - - if (ptlrpc_import_in_recovery(imp)) { - struct l_wait_info lwi; - long timeout_jiffies; - time64_t timeout; - - if (AT_OFF) { - if (imp->imp_server_timeout) - timeout = obd_timeout >> 1; - else - timeout = obd_timeout; - } else { - u32 req_portal; - int idx; - - req_portal = imp->imp_client->cli_request_portal; - idx = import_at_get_index(imp, req_portal); - timeout = at_get(&imp->imp_at.iat_service_estimate[idx]); + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = cfs_time_seconds(obd_timeout / 2); + else + timeout = cfs_time_seconds(obd_timeout); + } else { + int idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal); + timeout = cfs_time_seconds( + at_get(&imp->imp_at.iat_service_estimate[idx])); } - timeout_jiffies = cfs_time_seconds(timeout); - lwi = LWI_TIMEOUT_INTR(max_t(long, timeout_jiffies, 1), + lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); @@ -1739,19 +1606,33 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) GOTO(out, rc); spin_unlock(&imp->imp_lock); - req = ptlrpc_disconnect_prep_req(imp); - if (IS_ERR(req)) - GOTO(set_state, rc = PTR_ERR(req)); - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req) { + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... 
*/ + req->rq_timeout = min_t(int, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } set_state: spin_lock(&imp->imp_lock); out: if (noclose) - import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); else - import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); spin_unlock(&imp->imp_lock); @@ -1761,115 +1642,15 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) } EXPORT_SYMBOL(ptlrpc_disconnect_import); -static void ptlrpc_reset_reqs_generation(struct obd_import *imp) -{ - struct ptlrpc_request *old, *tmp; - - /* tag all resendable requests generated before disconnection - * notice this code is part of disconnect-at-idle path only */ - list_for_each_entry_safe(old, tmp, &imp->imp_delayed_list, - rq_list) { - spin_lock(&old->rq_lock); - if (old->rq_import_generation == imp->imp_generation - 1 && - !old->rq_no_resend) - old->rq_import_generation = imp->imp_generation; - spin_unlock(&old->rq_lock); - } -} - -static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *data, int rc) -{ - struct obd_import *imp = req->rq_import; - int connect = 0; - - DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d ", - atomic_read(&imp->imp_inflight), - atomic_read(&imp->imp_refcount), rc); - - spin_lock(&imp->imp_lock); - /* DISCONNECT reply can be late and another connection can just - * be initiated. so we have to abort disconnection. */ - if (req->rq_import_generation == imp->imp_generation && - imp->imp_state != LUSTRE_IMP_CLOSED) { - LASSERTF(imp->imp_state == LUSTRE_IMP_CONNECTING, - "%s\n", ptlrpc_import_state_name(imp->imp_state)); - memset(&imp->imp_remote_handle, 0, - sizeof(imp->imp_remote_handle)); - /* take our DISCONNECT into account */ - if (atomic_read(&imp->imp_reqs) > 1) { - imp->imp_generation++; - imp->imp_initiated_at = imp->imp_generation; - import_set_state_nolock(imp, LUSTRE_IMP_NEW); - ptlrpc_reset_reqs_generation(imp); - connect = 1; - } else { - /* do not expose transient IDLE state */ - import_set_state_nolock(imp, LUSTRE_IMP_IDLE); - } - } - - if (connect) { - rc = ptlrpc_connect_import_locked(imp); - if (rc >= 0) - ptlrpc_pinger_add_import(imp); - } else { - spin_unlock(&imp->imp_lock); - } - - return 0; -} - -int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) -{ - struct ptlrpc_request *req; - ENTRY; - - if (imp->imp_obd->obd_force) - RETURN(0); - - if (ptlrpc_import_in_recovery(imp)) - RETURN(0); - - spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_FULL) { - spin_unlock(&imp->imp_lock); - RETURN(0); - } - spin_unlock(&imp->imp_lock); - - req = ptlrpc_disconnect_prep_req(imp); - if (IS_ERR(req)) - RETURN(PTR_ERR(req)); - - CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", - imp->imp_obd->obd_name, - ktime_get_real_seconds() - imp->imp_last_reply_time); - - /* don't make noise at reconnection */ - spin_lock(&imp->imp_lock); - imp->imp_was_idle = 1; - spin_unlock(&imp->imp_lock); - - req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; - ptlrpcd_add_req(req); - - RETURN(0); -} -EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); - void ptlrpc_cleanup_imp(struct obd_import *imp) { ENTRY; spin_lock(&imp->imp_lock); - - 
import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); imp->imp_generation++; - ptlrpc_abort_inflight(imp); - spin_unlock(&imp->imp_lock); + ptlrpc_abort_inflight(imp); EXIT; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c index 7db9465a3569f..d720645bafc16 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,16 +46,18 @@ #include +#include + #include #include #include +#include #include #include /* struct ptlrpc_request, lustre_msg* */ #include #include -#include /* * RQFs (see below) refer to two struct req_msg_field arrays describing the @@ -88,6 +90,11 @@ static const struct req_msg_field *mgs_config_read_server[] = { &RMF_MGS_CONFIG_RES }; +static const struct req_msg_field *log_cancel_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LOGCOOKIES +}; + static const struct req_msg_field *mdt_body_only[] = { &RMF_PTLRPC_BODY, &RMF_MDT_BODY @@ -130,13 +137,12 @@ static const struct req_msg_field *mdt_close_client[] = { &RMF_CAPA1 }; -static const struct req_msg_field *mdt_close_intent_client[] = { +static const struct req_msg_field *mdt_intent_close_client[] = { &RMF_PTLRPC_BODY, &RMF_MDT_EPOCH, &RMF_REC_REINT, &RMF_CAPA1, - &RMF_CLOSE_DATA, - &RMF_U32 + &RMF_CLOSE_DATA }; static const struct req_msg_field *obd_statfs_server[] = { @@ -212,8 +218,7 @@ static const struct req_msg_field *mds_reint_create_acl_client[] = { &RMF_EADATA, &RMF_DLM_REQ, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX, - &RMF_SELINUX_POL + &RMF_FILE_SECCTX }; static const struct req_msg_field *mds_reint_create_sym_client[] = { @@ -224,8 +229,7 @@ static const struct req_msg_field *mds_reint_create_sym_client[] = { &RMF_SYMTGT, &RMF_DLM_REQ, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX, - &RMF_SELINUX_POL + &RMF_FILE_SECCTX }; static const struct req_msg_field *mds_reint_open_client[] = { @@ -236,8 +240,7 @@ static const struct req_msg_field *mds_reint_open_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX, - &RMF_SELINUX_POL + &RMF_FILE_SECCTX }; static const struct req_msg_field *mds_reint_open_server[] = { @@ -250,33 +253,30 @@ static const struct req_msg_field *mds_reint_open_server[] = { }; static const struct req_msg_field *mds_reint_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_DLM_REQ, - &RMF_SELINUX_POL + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ }; static const struct req_msg_field *mds_reint_link_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_DLM_REQ, - &RMF_SELINUX_POL + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ }; static const struct req_msg_field *mds_reint_rename_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_SYMTGT, - &RMF_DLM_REQ, - &RMF_SELINUX_POL + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ }; static const struct req_msg_field *mds_reint_migrate_client[] = { @@ -287,10 +287,8 @@ static const struct req_msg_field 
*mds_reint_migrate_client[] = { &RMF_NAME, &RMF_SYMTGT, &RMF_DLM_REQ, - &RMF_SELINUX_POL, &RMF_MDT_EPOCH, - &RMF_CLOSE_DATA, - &RMF_EADATA + &RMF_CLOSE_DATA }; static const struct req_msg_field *mds_last_unlink_server[] = { @@ -318,13 +316,6 @@ static const struct req_msg_field *mds_reint_setxattr_client[] = { &RMF_CAPA1, &RMF_NAME, &RMF_EADATA, - &RMF_DLM_REQ, - &RMF_SELINUX_POL -}; - -static const struct req_msg_field *mds_reint_resync[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, &RMF_DLM_REQ }; @@ -337,28 +328,12 @@ static const struct req_msg_field *mdt_swap_layouts[] = { &RMF_DLM_REQ }; -static const struct req_msg_field *mds_rmfid_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_FID_ARRAY, - &RMF_CAPA1, - &RMF_CAPA2, -}; - -static const struct req_msg_field *mds_rmfid_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_FID_ARRAY, - &RMF_RCS, -}; - static const struct req_msg_field *obd_connect_client[] = { - &RMF_PTLRPC_BODY, - &RMF_TGTUUID, - &RMF_CLUUID, - &RMF_CONN, - &RMF_CONNECT_DATA, - &RMF_SELINUX_POL + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA }; static const struct req_msg_field *obd_connect_server[] = { @@ -450,37 +425,32 @@ static const struct req_msg_field *ldlm_intent_layout_client[] = { &RMF_LAYOUT_INTENT, &RMF_EADATA /* for new layout to be set up */ }; - static const struct req_msg_field *ldlm_intent_open_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NIOBUF_INLINE, - &RMF_FILE_SECCTX + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 }; static const struct req_msg_field *ldlm_intent_getattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ - &RMF_CAPA1, - &RMF_NAME, - &RMF_FILE_SECCTX_NAME + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME }; static const struct req_msg_field *ldlm_intent_getattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_FILE_SECCTX + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1 }; static const struct req_msg_field *ldlm_intent_create_client[] = { @@ -492,8 +462,7 @@ static const struct req_msg_field *ldlm_intent_create_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX, - &RMF_SELINUX_POL + &RMF_FILE_SECCTX }; static const struct req_msg_field *ldlm_intent_open_client[] = { @@ -506,8 +475,16 @@ static const struct req_msg_field *ldlm_intent_open_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX, - &RMF_SELINUX_POL + &RMF_FILE_SECCTX +}; + +static const struct req_msg_field *ldlm_intent_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ + &RMF_CAPA1, + &RMF_NAME }; static const struct req_msg_field *ldlm_intent_getxattr_client[] = { @@ -516,7 +493,6 @@ static const struct req_msg_field *ldlm_intent_getxattr_client[] = { &RMF_LDLM_INTENT, &RMF_MDT_BODY, &RMF_CAPA1, - &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_getxattr_server[] = { @@ -537,12 +513,11 @@ static const struct req_msg_field *mds_get_root_client[] = { }; static const struct req_msg_field *mds_getxattr_client[] = { - &RMF_PTLRPC_BODY, - 
&RMF_MDT_BODY, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA, - &RMF_SELINUX_POL + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA }; static const struct req_msg_field *mds_getxattr_server[] = { @@ -596,6 +571,11 @@ static const struct req_msg_field *llog_log_hdr_only[] = { &RMF_LLOG_LOG_HDR }; +static const struct req_msg_field *llogd_conn_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_CONN_BODY +}; + static const struct req_msg_field *llog_origin_handle_next_block_server[] = { &RMF_PTLRPC_BODY, &RMF_LLOGD_BODY, @@ -632,18 +612,16 @@ static const struct req_msg_field *ost_destroy_client[] = { static const struct req_msg_field *ost_brw_client[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_OBD_IOOBJ, - &RMF_NIOBUF_REMOTE, - &RMF_CAPA1, - &RMF_SHORT_IO + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1 }; static const struct req_msg_field *ost_brw_read_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_SHORT_IO + &RMF_PTLRPC_BODY, + &RMF_OST_BODY }; static const struct req_msg_field *ost_brw_write_server[] = { @@ -751,45 +729,43 @@ static const struct req_msg_field *obd_lfsck_reply[] = { }; static struct req_format *req_formats[] = { - &RQF_OBD_PING, - &RQF_OBD_SET_INFO, + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, &RQF_OBD_IDX_READ, - &RQF_SEC_CTX, - &RQF_MGS_TARGET_REG, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) - &RQF_MGS_SET_INFO, + &RQF_MGS_SET_INFO, #endif - &RQF_MGS_CONFIG_READ, - &RQF_SEQ_QUERY, - &RQF_FLD_QUERY, + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, &RQF_FLD_READ, - &RQF_MDS_CONNECT, - &RQF_MDS_DISCONNECT, - &RQF_MDS_GET_INFO, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, &RQF_MDS_GET_ROOT, - &RQF_MDS_STATFS, - &RQF_MDS_STATFS_NEW, - &RQF_MDS_GETATTR, - &RQF_MDS_GETATTR_NAME, - &RQF_MDS_GETXATTR, - &RQF_MDS_SYNC, - &RQF_MDS_CLOSE, - &RQF_MDS_CLOSE_INTENT, + &RQF_MDS_STATFS, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_INTENT_CLOSE, &RQF_MDS_READPAGE, &RQF_MDS_REINT, &RQF_MDS_REINT_CREATE, &RQF_MDS_REINT_CREATE_ACL, - &RQF_MDS_REINT_CREATE_SLAVE, - &RQF_MDS_REINT_CREATE_SYM, - &RQF_MDS_REINT_OPEN, - &RQF_MDS_REINT_UNLINK, - &RQF_MDS_REINT_LINK, - &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, &RQF_MDS_REINT_MIGRATE, - &RQF_MDS_REINT_SETATTR, - &RQF_MDS_REINT_SETXATTR, - &RQF_MDS_REINT_RESYNC, - &RQF_MDS_QUOTACTL, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_QUOTACTL, &RQF_MDS_HSM_PROGRESS, &RQF_MDS_HSM_CT_REGISTER, &RQF_MDS_HSM_CT_UNREGISTER, @@ -798,23 +774,22 @@ static struct req_format *req_formats[] = { &RQF_MDS_HSM_ACTION, &RQF_MDS_HSM_REQUEST, &RQF_MDS_SWAP_LAYOUTS, - &RQF_MDS_RMFID, &RQF_OUT_UPDATE, - &RQF_OST_CONNECT, - &RQF_OST_DISCONNECT, - &RQF_OST_QUOTACTL, - &RQF_OST_GETATTR, - &RQF_OST_SETATTR, - &RQF_OST_CREATE, - &RQF_OST_PUNCH, - &RQF_OST_SYNC, - &RQF_OST_DESTROY, - &RQF_OST_BRW_READ, - &RQF_OST_BRW_WRITE, - &RQF_OST_STATFS, - &RQF_OST_SET_GRANT_INFO, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, &RQF_OST_GET_INFO, - &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_ID, 
&RQF_OST_GET_INFO_LAST_FID, &RQF_OST_SET_INFO_LAST_FID, &RQF_OST_GET_INFO_FIEMAP, @@ -824,23 +799,27 @@ static struct req_format *req_formats[] = { &RQF_LDLM_CONVERT, &RQF_LDLM_CANCEL, &RQF_LDLM_CALLBACK, - &RQF_LDLM_CP_CALLBACK, - &RQF_LDLM_BL_CALLBACK, - &RQF_LDLM_GL_CALLBACK, - &RQF_LDLM_GL_CALLBACK_DESC, - &RQF_LDLM_INTENT, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_DESC_CALLBACK, + &RQF_LDLM_INTENT, &RQF_LDLM_INTENT_BASIC, - &RQF_LDLM_INTENT_LAYOUT, - &RQF_LDLM_INTENT_GETATTR, - &RQF_LDLM_INTENT_OPEN, - &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_UNLINK, &RQF_LDLM_INTENT_GETXATTR, &RQF_LDLM_INTENT_QUOTA, &RQF_QUOTA_DQACQ, - &RQF_LLOG_ORIGIN_HANDLE_CREATE, - &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_LOG_CANCEL, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_LLOG_ORIGIN_CONNECT, &RQF_CONNECT, &RQF_LFSCK_NOTIFY, &RQF_LFSCK_QUERY, @@ -922,8 +901,8 @@ struct req_msg_field RMF_MGS_CONFIG_RES = EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); struct req_msg_field RMF_U32 = - DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY, - sizeof(__u32), lustre_swab_generic_32s, NULL); + DEFINE_MSGF("generic u32", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); EXPORT_SYMBOL(RMF_U32); struct req_msg_field RMF_SETINFO_VAL = @@ -1009,10 +988,6 @@ struct req_msg_field RMF_NAME = DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); EXPORT_SYMBOL(RMF_NAME); -struct req_msg_field RMF_FID_ARRAY = - DEFINE_MSGF("fid_array", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_FID_ARRAY); - struct req_msg_field RMF_SYMTGT = DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); EXPORT_SYMBOL(RMF_SYMTGT); @@ -1036,7 +1011,7 @@ struct req_msg_field RMF_FILE_SECCTX_NAME = EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); struct req_msg_field RMF_FILE_SECCTX = - DEFINE_MSGF("file_secctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); + DEFINE_MSGF("file_secctx", 0, -1, NULL, NULL); EXPORT_SYMBOL(RMF_FILE_SECCTX); struct req_msg_field RMF_LLOGD_BODY = @@ -1123,11 +1098,13 @@ struct req_msg_field RMF_LOGCOOKIES = EXPORT_SYMBOL(RMF_LOGCOOKIES); struct req_msg_field RMF_CAPA1 = - DEFINE_MSGF("capa", 0, 0, NULL, NULL); + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); EXPORT_SYMBOL(RMF_CAPA1); struct req_msg_field RMF_CAPA2 = - DEFINE_MSGF("capa", 0, 0, NULL, NULL); + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); EXPORT_SYMBOL(RMF_CAPA2); struct req_msg_field RMF_LAYOUT_INTENT = @@ -1136,10 +1113,6 @@ struct req_msg_field RMF_LAYOUT_INTENT = NULL); EXPORT_SYMBOL(RMF_LAYOUT_INTENT); -struct req_msg_field RMF_SELINUX_POL = - DEFINE_MSGF("selinux_pol", RMF_F_STRING, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_SELINUX_POL); - /* * OST request field. 
*/ @@ -1160,15 +1133,9 @@ struct req_msg_field RMF_NIOBUF_REMOTE = dump_rniobuf); EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); -struct req_msg_field RMF_NIOBUF_INLINE = - DEFINE_MSGF("niobuf_inline", RMF_F_NO_SIZE_CHECK, - sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, - dump_rniobuf); -EXPORT_SYMBOL(RMF_NIOBUF_INLINE); - struct req_msg_field RMF_RCS = - DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32), - lustre_swab_generic_32s, dump_rcs); + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); EXPORT_SYMBOL(RMF_RCS); struct req_msg_field RMF_EAVALS_LENS = @@ -1192,8 +1159,8 @@ struct req_msg_field RMF_OST_ID = EXPORT_SYMBOL(RMF_OST_ID); struct req_msg_field RMF_FIEMAP_KEY = - DEFINE_MSGF("fiemap_key", 0, sizeof(struct ll_fiemap_info_key), - lustre_swab_fiemap_info_key, NULL); + DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap, NULL); EXPORT_SYMBOL(RMF_FIEMAP_KEY); struct req_msg_field RMF_FIEMAP_VAL = @@ -1204,9 +1171,6 @@ struct req_msg_field RMF_IDX_INFO = DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), lustre_swab_idx_info, NULL); EXPORT_SYMBOL(RMF_IDX_INFO); -struct req_msg_field RMF_SHORT_IO = - DEFINE_MSGF("short_io", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_SHORT_IO); struct req_msg_field RMF_HSM_USER_STATE = DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), lustre_swab_hsm_user_state, NULL); @@ -1234,7 +1198,7 @@ struct req_msg_field RMF_MDS_HSM_USER_ITEM = EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); struct req_msg_field RMF_MDS_HSM_ARCHIVE = - DEFINE_MSGF("hsm_archive", RMF_F_STRUCT_ARRAY, + DEFINE_MSGF("hsm_archive", 0, sizeof(__u32), lustre_swab_generic_32s, NULL); EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); @@ -1380,6 +1344,10 @@ struct req_format RQF_FLD_READ = DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server); EXPORT_SYMBOL(RQF_FLD_READ); +struct req_format RQF_LOG_CANCEL = + DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); +EXPORT_SYMBOL(RQF_LOG_CANCEL); + struct req_format RQF_MDS_QUOTACTL = DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); EXPORT_SYMBOL(RQF_MDS_QUOTACTL); @@ -1403,13 +1371,9 @@ struct req_format RQF_MDS_GET_ROOT = EXPORT_SYMBOL(RQF_MDS_GET_ROOT); struct req_format RQF_MDS_STATFS = - DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); EXPORT_SYMBOL(RQF_MDS_STATFS); -struct req_format RQF_MDS_STATFS_NEW = - DEFINE_REQ_FMT0("MDS_STATFS_NEW", mdt_body_only, obd_statfs_server); -EXPORT_SYMBOL(RQF_MDS_STATFS_NEW); - struct req_format RQF_MDS_SYNC = DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_SYNC); @@ -1487,10 +1451,6 @@ struct req_format RQF_MDS_REINT_SETXATTR = mds_reint_setxattr_client, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); -struct req_format RQF_MDS_REINT_RESYNC = - DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC); - struct req_format RQF_MDS_CONNECT = DEFINE_REQ_FMT0("MDS_CONNECT", obd_connect_client, obd_connect_server); @@ -1546,10 +1506,10 @@ struct req_format RQF_LDLM_GL_CALLBACK = ldlm_gl_callback_server); EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); -struct req_format RQF_LDLM_GL_CALLBACK_DESC = +struct req_format RQF_LDLM_GL_DESC_CALLBACK = DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, ldlm_gl_callback_server); -EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK_DESC); +EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); struct req_format RQF_LDLM_INTENT_BASIC = 
DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", @@ -1562,7 +1522,7 @@ struct req_format RQF_LDLM_INTENT = EXPORT_SYMBOL(RQF_LDLM_INTENT); struct req_format RQF_LDLM_INTENT_LAYOUT = - DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT", + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", ldlm_intent_layout_client, ldlm_enqueue_lvb_server); EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); @@ -1581,6 +1541,11 @@ struct req_format RQF_LDLM_INTENT_CREATE = ldlm_intent_create_client, ldlm_intent_getattr_server); EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); +struct req_format RQF_LDLM_INTENT_UNLINK = + DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", + ldlm_intent_unlink_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); + struct req_format RQF_LDLM_INTENT_GETXATTR = DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", ldlm_intent_getxattr_client, @@ -1592,10 +1557,10 @@ struct req_format RQF_MDS_CLOSE = mdt_close_client, mds_last_unlink_server); EXPORT_SYMBOL(RQF_MDS_CLOSE); -struct req_format RQF_MDS_CLOSE_INTENT = - DEFINE_REQ_FMT0("MDS_CLOSE_INTENT", - mdt_close_intent_client, mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_CLOSE_INTENT); +struct req_format RQF_MDS_INTENT_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_intent_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE); struct req_format RQF_MDS_READPAGE = DEFINE_REQ_FMT0("MDS_READPAGE", @@ -1636,16 +1601,16 @@ struct req_format RQF_MDS_SWAP_LAYOUTS = mdt_swap_layouts, empty); EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); -struct req_format RQF_MDS_RMFID = - DEFINE_REQ_FMT0("MDS_RMFID", mds_rmfid_client, - mds_rmfid_server); -EXPORT_SYMBOL(RQF_MDS_RMFID); - struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", llog_origin_handle_create_client, llogd_body_only); EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); +struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", + llogd_body_only, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); + struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", llogd_body_only, llog_origin_handle_next_block_server); @@ -1661,6 +1626,10 @@ struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = llogd_body_only, llog_log_hdr_only); EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); +struct req_format RQF_LLOG_ORIGIN_CONNECT = + DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); + struct req_format RQF_CONNECT = DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); EXPORT_SYMBOL(RQF_CONNECT); @@ -2371,13 +2340,12 @@ __u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, if (size == 0) return size; - for (; i < fmt->rf_fields[loc].nr; ++i) - if (fmt->rf_fields[loc].d[i]->rmf_size != -1) - size += cfs_size_round(fmt->rf_fields[loc].d[i]-> - rmf_size); - return size; + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; } -EXPORT_SYMBOL(req_capsule_fmt_size); /** * Changes the format of an RPC. 
@@ -2571,46 +2539,3 @@ int req_capsule_server_grow(struct req_capsule *pill, return 0; } EXPORT_SYMBOL(req_capsule_server_grow); - -int req_check_sepol(struct req_capsule *pill) -{ - int rc = 0; -#ifdef HAVE_SERVER_SUPPORT - struct obd_export *export; - struct lu_nodemap *nm = NULL; - const char *sepol = NULL; - const char *nm_sepol = NULL; - - if (!pill->rc_req) - return -EPROTO; - - export = pill->rc_req->rq_export; - if (!export || !exp_connect_sepol(export) || - !req_capsule_has_field(pill, &RMF_SELINUX_POL, RCL_CLIENT)) - goto nm; - - if (req_capsule_get_size(pill, &RMF_SELINUX_POL, RCL_CLIENT) == 0) - goto nm; - - sepol = req_capsule_client_get(pill, &RMF_SELINUX_POL); - CDEBUG(D_SEC, "retrieved sepol %s\n", sepol); - -nm: - if (export) { - nm = nodemap_get_from_exp(export); - if (!IS_ERR_OR_NULL(nm)) { - nm_sepol = nodemap_get_sepol(nm); - if (nm_sepol && nm_sepol[0]) - if (sepol == NULL || - strcmp(sepol, nm_sepol) != 0) - rc = -EACCES; - } - } - - if (!IS_ERR_OR_NULL(nm)) - nodemap_putref(nm); -#endif - - return rc; -} -EXPORT_SYMBOL(req_check_sepol); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c index 0f149b692362c..a39db55028dc5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c @@ -136,6 +136,41 @@ static int llog_client_open(const struct lu_env *env, return rc; } +static int llog_client_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_DESTROY); + if (req == NULL) + GOTO(err_exit, rc =-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, + body->lgd_llh_flags); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + RETURN(rc); +} + + static int llog_client_next_block(const struct lu_env *env, struct llog_handle *loghandle, int *cur_idx, int next_idx, @@ -333,6 +368,7 @@ struct llog_operations llog_client_ops = { .lop_prev_block = llog_client_prev_block, .lop_read_header = llog_client_read_header, .lop_open = llog_client_open, + .lop_destroy = llog_client_destroy, .lop_close = llog_client_close, }; EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c index ca91a1c9491ac..4864b499120df 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -111,6 +111,45 @@ int llog_origin_handle_open(struct ptlrpc_request *req) return rc; } +int llog_origin_handle_destroy(struct ptlrpc_request *req) +{ + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(err_serious(-EFAULT)); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc < 0) + RETURN(err_serious(-ENOMEM)); + + if 
(ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", + req->rq_export->exp_obd->obd_name, body->lgd_llh_flags); + + if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { + CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", + req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); + RETURN(-EPROTO); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL); + llog_ctxt_put(ctxt); + RETURN(rc); +} + int llog_origin_handle_next_block(struct ptlrpc_request *req) { struct llog_handle *loghandle; @@ -285,3 +324,15 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req) llog_ctxt_put(ctxt); return rc; } + +int llog_origin_handle_close(struct ptlrpc_request *req) +{ + int rc; + + ENTRY; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(err_serious(-ENOMEM)); + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index df178e0a02c82..933183a83dbb3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include "ptlrpc_internal.h" @@ -95,7 +96,6 @@ static struct ll_rpc_opcode { { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, - { MDS_RMFID, "mds_rmfid" }, { LDLM_ENQUEUE, "ldlm_enqueue" }, { LDLM_CONVERT, "ldlm_convert" }, { LDLM_CANCEL, "ldlm_cancel" }, @@ -110,17 +110,17 @@ static struct ll_rpc_opcode { { MGS_TARGET_DEL, "mgs_target_del" }, { MGS_SET_INFO, "mgs_set_info" }, { MGS_CONFIG_READ, "mgs_config_read" }, - { OBD_PING, "obd_ping" }, - { 401, /* was OBD_LOG_CANCEL */ "llog_cancel" }, - { 402, /* was OBD_QC_CALLBACK */ "obd_quota_callback" }, - { OBD_IDX_READ, "dt_index_read" }, + { OBD_PING, "obd_ping" }, + { OBD_LOG_CANCEL, "llog_cancel" }, + { OBD_QC_CALLBACK, "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, - { 504, /*LLOG_ORIGIN_HANDLE_WRITE_REC*/"llog_origin_handle_write_rec" }, - { 505, /* was LLOG_ORIGIN_HANDLE_CLOSE */ "llog_origin_handle_close" }, - { 506, /* was LLOG_ORIGIN_CONNECT */ "llog_origin_connect" }, - { 507, /* was LLOG_CATINFO */ "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, + { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, + { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, + { LLOG_CATINFO, "llog_catinfo" }, { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, { QUOTA_DQACQ, "quota_acquire" }, @@ -140,21 +140,20 @@ static struct ll_eopcode { __u32 opcode; const char *opname; } ll_eopcode_table[EXTRA_LAST_OPC] = { - { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, - { LDLM_PLAIN_ENQUEUE, 
"ldlm_plain_enqueue" }, - { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, - { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, - { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, - { MDS_REINT_SETATTR, "mds_reint_setattr" }, - { MDS_REINT_CREATE, "mds_reint_create" }, - { MDS_REINT_LINK, "mds_reint_link" }, - { MDS_REINT_UNLINK, "mds_reint_unlink" }, - { MDS_REINT_RENAME, "mds_reint_rename" }, - { MDS_REINT_OPEN, "mds_reint_open" }, - { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, - { MDS_REINT_RESYNC, "mds_reint_resync" }, - { BRW_READ_BYTES, "read_bytes" }, - { BRW_WRITE_BYTES, "write_bytes" }, + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, }; const char *ll_opcode2str(__u32 opcode) @@ -195,33 +194,32 @@ static const char *ll_eopcode2str(__u32 opcode) return ll_eopcode_table[opcode].opname; } -static void -ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, - struct dentry **debugfs_root_ret, - struct lprocfs_stats **stats_ret) +#ifdef CONFIG_PROC_FS +static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, + char *name, struct proc_dir_entry **procroot_ret, + struct lprocfs_stats **stats_ret) { - struct dentry *svc_debugfs_entry; + struct proc_dir_entry *svc_procroot; struct lprocfs_stats *svc_stats; int i, rc; unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV; - LASSERT(!*debugfs_root_ret); - LASSERT(!*stats_ret); + LASSERT(*procroot_ret == NULL); + LASSERT(*stats_ret == NULL); - svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, - 0); - if (!svc_stats) + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,0); + if (svc_stats == NULL) return; if (dir) { - svc_debugfs_entry = ldebugfs_register(dir, root, NULL, NULL); - if (IS_ERR(svc_debugfs_entry)) { + svc_procroot = lprocfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_procroot)) { lprocfs_free_stats(&svc_stats); return; } } else { - svc_debugfs_entry = root; + svc_procroot = root; } lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, @@ -237,7 +235,7 @@ ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, for (i = 0; i < EXTRA_LAST_OPC; i++) { char *units; - switch (i) { + switch(i) { case BRW_WRITE_BYTES: case BRW_READ_BYTES: units = "bytes"; @@ -257,14 +255,14 @@ ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, ll_opcode2str(opcode), "usec"); } - rc = ldebugfs_register_stats(svc_debugfs_entry, name, svc_stats); + rc = lprocfs_register_stats(svc_procroot, name, svc_stats); if (rc < 0) { if (dir) - ldebugfs_remove(&svc_debugfs_entry); + lprocfs_remove(&svc_procroot); lprocfs_free_stats(&svc_stats); } else { if (dir) - *debugfs_root_ret = svc_debugfs_entry; + *procroot_ret = svc_procroot; *stats_ret = svc_stats; } } @@ -283,9 +281,7 @@ ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) seq_printf(m, "%d\n", total); return 0; } - - -LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); 
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); static int ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) @@ -309,12 +305,11 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, { struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - unsigned long long val; - unsigned long long limit; int bufpages; + __s64 val; int rc; - rc = kstrtoull_from_user(buffer, count, 0, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; @@ -323,15 +318,10 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, /* This sanity check is more of an insanity check; we can still * hose a kernel by allowing the request history to grow too - * far. The roundup to the next power of two is an empirical way - * to take care that request buffer is allocated in Slab and thus - * will be upgraded */ - bufpages = (roundup_pow_of_two(svc->srv_buf_size) + PAGE_SIZE - 1) >> + * far. */ + bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - limit = cfs_totalram_pages() / (2 * bufpages); - /* do not allow history to consume more than half max number of rqbds */ - if ((svc->srv_nrqbds_max == 0 && val > limit) || - (svc->srv_nrqbds_max != 0 && val > svc->srv_nrqbds_max / 2)) + if (val > cfs_totalram_pages() / (2 * bufpages)) return -ERANGE; spin_lock(&svc->srv_lock); @@ -346,64 +336,28 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, return count; } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); static int -ptlrpc_lprocfs_req_buffers_max_seq_show(struct seq_file *m, void *n) +ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n) { struct ptlrpc_service *svc = m->private; - seq_printf(m, "%d\n", svc->srv_nrqbds_max); + seq_printf(m, "%d\n", + svc->srv_nthrs_cpt_init * svc->srv_ncpts); return 0; } static ssize_t -ptlrpc_lprocfs_req_buffers_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ptlrpc_lprocfs_threads_min_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - int val; - int rc; + __s64 val; + int rc = lprocfs_str_to_s64(file, buffer, count, &val); - rc = kstrtoint_from_user(buffer, count, 0, &val); - if (rc < 0) - return rc; - - if (val < svc->srv_nbuf_per_group && val != 0) - return -ERANGE; - - spin_lock(&svc->srv_lock); - - svc->srv_nrqbds_max = (uint)val; - - spin_unlock(&svc->srv_lock); - - return count; -} - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffers_max); - -static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - - return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); -} - -static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; @@ -422,43 +376,44 @@ static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, return count; } -LUSTRE_RW_ATTR(threads_min); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min); -static ssize_t threads_started_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int +ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void 
*n) { - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - struct ptlrpc_service_part *svcpt; - int total = 0; - int i; + struct ptlrpc_service *svc = m->private; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; ptlrpc_service_for_each_part(svcpt, i, svc) total += svcpt->scp_nthrs_running; - return sprintf(buf, "%d\n", total); + seq_printf(m, "%d\n", total); + return 0; } -LUSTRE_RO_ATTR(threads_started); +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started); -static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static int +ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n) { - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); + struct ptlrpc_service *svc = m->private; - return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); + seq_printf(m, "%d\n", + svc->srv_nthrs_cpt_limit * svc->srv_ncpts); + return 0; } -static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t +ptlrpc_lprocfs_threads_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - unsigned long val; - int rc; + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; + __s64 val; + int rc = lprocfs_str_to_s64(file, buffer, count, &val); - rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; @@ -477,7 +432,7 @@ static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, return count; } -LUSTRE_RW_ATTR(threads_max); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max); /** * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. @@ -517,7 +472,7 @@ void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, LASSERT(info != NULL); assert_spin_locked(&policy->pol_nrs->nrs_lock); - CLASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); + LASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); @@ -600,39 +555,20 @@ static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) * sanity-check the values we get. */ } else { - if (strncmp(infos[pol_idx].pi_name, - tmp.pi_name, - NRS_POL_NAME_MAX) != 0) { - spin_unlock(&nrs->nrs_lock); - rc = -EINVAL; - CERROR("%s: failed to check pi_name: rc = %d\n", - svc->srv_thread_name, rc); - GOTO(out, rc); - } - if (strncmp(infos[pol_idx].pi_arg, - tmp.pi_arg, - sizeof(tmp.pi_arg)) != 0) { - spin_unlock(&nrs->nrs_lock); - rc = -EINVAL; - CERROR("%s: failed to check pi_arg: rc = %d\n", - svc->srv_thread_name, rc); - GOTO(out, rc); - } + LASSERT(strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) == 0); + LASSERT(strncmp(infos[pol_idx].pi_arg, + tmp.pi_arg, + sizeof(tmp.pi_arg)) == 0); /** - * Not checking ptlrpc_nrs_pol_info::pi_state, + * Not asserting ptlrpc_nrs_pol_info::pi_state, * because it may be different between * instances of the same policy in different * service partitions. 
*/ - - if (infos[pol_idx].pi_fallback != - tmp.pi_fallback) { - spin_unlock(&nrs->nrs_lock); - rc = -EINVAL; - CERROR("%s: failed to check pi_fallback: rc = %d\n", - svc->srv_thread_name, rc); - GOTO(out, rc); - } + LASSERT(infos[pol_idx].pi_fallback == + tmp.pi_fallback); } infos[pol_idx].pi_req_queued += tmp.pi_req_queued; @@ -756,7 +692,7 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, */ cmd_copy = cmd; - if (copy_from_user(cmd, buffer, count)) + if (lprocfs_copy_from_user(file, cmd, buffer, count)) GOTO(out, rc = -EFAULT); cmd[count] = '\0'; @@ -811,8 +747,7 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, RETURN(rc < 0 ? rc : count); } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); /** @} nrs */ @@ -932,12 +867,10 @@ ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) if (i > cpt) /* make up the lowest position for this CPT */ *pos = PTLRPC_REQ_CPT2POS(svc, i); - mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, PTLRPC_REQ_POS2SEQ(svc, *pos)); spin_unlock(&svcpt->scp_lock); - mutex_unlock(&svcpt->scp_mutex); if (rc == 0) { *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); srhi->srhi_idx = i; @@ -979,11 +912,9 @@ ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); } - mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); spin_unlock(&svcpt->scp_lock); - mutex_unlock(&svcpt->scp_mutex); if (rc == 0) { *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); srhi->srhi_idx = i; @@ -1037,7 +968,6 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) svcpt = svc->srv_parts[srhi->srhi_idx]; - mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); @@ -1078,8 +1008,6 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) } spin_unlock(&svcpt->scp_lock); - mutex_unlock(&svcpt->scp_mutex); - return rc; } @@ -1104,7 +1032,7 @@ ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) return rc; seqf = file->private_data; - seqf->private = inode->i_private; + seqf->private = PDE_DATA(inode); return 0; } @@ -1138,130 +1066,98 @@ static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) return 0; } +LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); -LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); - -static ssize_t high_priority_ratio_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v) { - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - - return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); + struct ptlrpc_service *svc = m->private; + seq_printf(m, "%d\n", svc->srv_hpreq_ratio); + return 0; } -static ssize_t high_priority_ratio_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) +static ssize_t +ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); + struct seq_file *m = file->private_data; + struct ptlrpc_service *svc = m->private; int rc; - unsigned long val; + __s64 val; - rc = kstrtoul(buffer, 10, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc 
< 0) return rc; + if (val < 0 || val > INT_MAX) + return -ERANGE; + spin_lock(&svc->srv_lock); svc->srv_hpreq_ratio = val; spin_unlock(&svc->srv_lock); return count; } -LUSTRE_RW_ATTR(high_priority_ratio); - -static struct attribute *ptlrpc_svc_attrs[] = { - &lustre_attr_threads_min.attr, - &lustre_attr_threads_started.attr, - &lustre_attr_threads_max.attr, - &lustre_attr_high_priority_ratio.attr, - NULL, -}; - -static void ptlrpc_sysfs_svc_release(struct kobject *kobj) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - - complete(&svc->srv_kobj_unregister); -} - -static struct kobj_type ptlrpc_svc_ktype = { - .default_attrs = ptlrpc_svc_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = ptlrpc_sysfs_svc_release, -}; - -void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) -{ - /* Let's see if we had a chance at initialization first */ - if (svc->srv_kobj.kset) { - kobject_put(&svc->srv_kobj); - wait_for_completion(&svc->srv_kobj_unregister); - } -} - -int ptlrpc_sysfs_register_service(struct kset *parent, - struct ptlrpc_service *svc) -{ - svc->srv_kobj.kset = parent; - init_completion(&svc->srv_kobj_unregister); - return kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, - &parent->kobj, "%s", svc->srv_name); -} +LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio); -void ptlrpc_ldebugfs_register_service(struct dentry *entry, - struct ptlrpc_service *svc) +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, + struct ptlrpc_service *svc) { - struct ldebugfs_vars ldebugfs_vars[] = { + struct lprocfs_vars lproc_vars[] = { + { .name = "high_priority_ratio", + .fops = &ptlrpc_lprocfs_hp_ratio_fops, + .data = svc }, { .name = "req_buffer_history_len", .fops = &ptlrpc_lprocfs_req_history_len_fops, .data = svc }, { .name = "req_buffer_history_max", .fops = &ptlrpc_lprocfs_req_history_max_fops, .data = svc }, + { .name = "threads_min", + .fops = &ptlrpc_lprocfs_threads_min_fops, + .data = svc }, + { .name = "threads_max", + .fops = &ptlrpc_lprocfs_threads_max_fops, + .data = svc }, + { .name = "threads_started", + .fops = &ptlrpc_lprocfs_threads_started_fops, + .data = svc }, { .name = "timeouts", .fops = &ptlrpc_lprocfs_timeouts_fops, .data = svc }, { .name = "nrs_policies", .fops = &ptlrpc_lprocfs_nrs_fops, .data = svc }, - { .name = "req_buffers_max", - .fops = &ptlrpc_lprocfs_req_buffers_max_fops, - .data = svc }, { NULL } }; - static struct file_operations req_history_fops = { - .owner = THIS_MODULE, - .open = ptlrpc_lprocfs_svc_req_history_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lprocfs_seq_release, + static struct proc_ops req_history_fops = { + PROC_OWNER(THIS_MODULE) + .proc_open = ptlrpc_lprocfs_svc_req_history_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lprocfs_seq_release, }; int rc; - ptlrpc_ldebugfs_register(entry, svc->srv_name, "stats", - &svc->srv_debugfs_entry, &svc->srv_stats); - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + ptlrpc_lprocfs_register(entry, svc->srv_name, + "stats", &svc->srv_procroot, + &svc->srv_stats); + if (svc->srv_procroot == NULL) return; - ldebugfs_add_vars(svc->srv_debugfs_entry, ldebugfs_vars, NULL); + lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); - rc = ldebugfs_seq_create(svc->srv_debugfs_entry, "req_history", - 0400, &req_history_fops, svc); + rc = lprocfs_seq_create(svc->srv_procroot, "req_history", + 0400, &req_history_fops, svc); if (rc) CWARN("Error adding the req_history file\n"); } void ptlrpc_lprocfs_register_obd(struct 
obd_device *obddev) { - ptlrpc_ldebugfs_register(obddev->obd_debugfs_entry, NULL, "stats", - &obddev->obd_svc_debugfs_entry, - &obddev->obd_svc_stats); + ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", + &obddev->obd_svc_procroot, + &obddev->obd_svc_stats); } EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); @@ -1309,8 +1205,8 @@ EXPORT_SYMBOL(ptlrpc_lprocfs_brw); void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) { - if (!IS_ERR_OR_NULL(svc->srv_debugfs_entry)) - ldebugfs_remove(&svc->srv_debugfs_entry); + if (svc->srv_procroot != NULL) + lprocfs_remove(&svc->srv_procroot); if (svc->srv_stats) lprocfs_free_stats(&svc->srv_stats); @@ -1323,53 +1219,48 @@ void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) */ lprocfs_obd_cleanup(obd); - if (!IS_ERR_OR_NULL(obd->obd_svc_debugfs_entry)) - ldebugfs_remove(&obd->obd_svc_debugfs_entry); + if (obd->obd_svc_procroot) + lprocfs_remove(&obd->obd_svc_procroot); if (obd->obd_svc_stats) lprocfs_free_stats(&obd->obd_svc_stats); } EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); -ssize_t ping_show(struct kobject *kobj, struct attribute *attr, - char *buffer) +ssize_t +lprocfs_ping_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct ptlrpc_request *req; - int rc; - + struct seq_file *m = file->private_data; + struct obd_device *obd = m->private; + struct ptlrpc_request *req; + int rc; ENTRY; + LPROCFS_CLIMP_CHECK(obd); req = ptlrpc_prep_ping(obd->u.cli.cl_import); LPROCFS_CLIMP_EXIT(obd); - if (!req) + if (req == NULL) RETURN(-ENOMEM); req->rq_send_state = LUSTRE_IMP_FULL; rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); + ptlrpc_req_finished(req); + if (rc >= 0) + RETURN(count); RETURN(rc); } -EXPORT_SYMBOL(ping_show); - -/* kept for older verison of tools. */ -ssize_t ping_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - return ping_show(kobj, attr, (char *)buffer); -} -EXPORT_SYMBOL(ping_store); +EXPORT_SYMBOL(lprocfs_ping_seq_write); /* Write the connection UUID to this file to attempt to connect to that node. * The connection UUID is a node's primary NID. For example, * "echo connection=192.168.0.1@tcp0::instance > .../import". 
*/ ssize_t -ldebugfs_import_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +lprocfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct obd_device *obd = m->private; @@ -1388,7 +1279,7 @@ ldebugfs_import_seq_write(struct file *file, const char __user *buffer, if (kbuf == NULL) return -ENOMEM; - if (copy_from_user(kbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kbuf, buffer, count)) GOTO(out, count = -EFAULT); kbuf[count] = 0; @@ -1400,14 +1291,14 @@ ldebugfs_import_seq_write(struct file *file, const char __user *buffer, uuid = kbuf + prefix_len; ptr = strstr(uuid, "::"); if (ptr) { - u32 inst; - int rc; + __u32 inst; + char *endptr; *ptr = 0; do_reconn = 0; ptr += 2; /* Skip :: */ - rc = kstrtouint(ptr, 10, &inst); - if (rc) { + inst = simple_strtol(ptr, &endptr, 10); + if (*endptr) { CERROR("config: wrong instance # %s\n", ptr); } else if (inst != imp->imp_connect_data.ocd_instance) { CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted " @@ -1429,7 +1320,7 @@ ldebugfs_import_seq_write(struct file *file, const char __user *buffer, OBD_FREE(kbuf, count + 1); return count; } -EXPORT_SYMBOL(ldebugfs_import_seq_write); +EXPORT_SYMBOL(lprocfs_import_seq_write); int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) { @@ -1451,13 +1342,16 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, struct obd_device *obd = m->private; struct client_obd *cli = &obd->u.cli; struct obd_import *imp = cli->cl_import; - bool val; int rc; + __s64 val; - rc = kstrtobool_from_user(buffer, count, &val); + rc = lprocfs_str_to_s64(file, buffer, count, &val); if (rc < 0) return rc; + if (val != 0 && val != 1) + return -ERANGE; + LPROCFS_CLIMP_CHECK(obd); spin_lock(&imp->imp_lock); imp->imp_no_pinger_recover = !val; @@ -1466,3 +1360,5 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, return count; } EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); + +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c index f6e0f57e2c785..999869000c35b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -167,6 +167,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) RETURN(0); /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_md_count == 0); LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); @@ -189,7 +190,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); total_md = desc->bd_req->rq_mbits - mbits + 1; - desc->bd_refs = total_md; + desc->bd_md_count = total_md; desc->bd_failure = 0; md.user_ptr = &desc->bd_cbid; @@ -230,7 +231,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) desc->bd_portal, mbits, 0, 0); else rc = LNetGet(self_nid, desc->bd_mds[posted_md], - peer_id, desc->bd_portal, mbits, 0, false); + peer_id, desc->bd_portal, mbits, 0); posted_md++; if (rc != 0) { @@ -247,9 +248,9 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) * event this creates will signal completion with failure, * so we return SUCCESS here! */ spin_lock(&desc->bd_lock); - desc->bd_refs -= total_md - posted_md; + desc->bd_md_count -= total_md - posted_md; spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_refs >= 0); + LASSERT(desc->bd_md_count >= 0); mdunlink_iterate_helper(desc->bd_mds, posted_md); RETURN(0); @@ -326,6 +327,7 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) /* NB no locking required until desc is on the network */ LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_count == 0); LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); LASSERT(desc->bd_req != NULL); @@ -347,9 +349,9 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); LASSERT(desc->bd_cbid.cbid_arg == desc); - total_md = desc->bd_md_count; + total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; /* rq_mbits is matchbits of the final bulk */ - mbits = req->rq_mbits - desc->bd_md_count + 1; + mbits = req->rq_mbits - total_md + 1; LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), "first mbits = x%llu, last mbits = x%llu\n", @@ -362,25 +364,19 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) desc->bd_registered = 1; desc->bd_last_mbits = mbits; - desc->bd_refs = total_md; + desc->bd_md_count = total_md; md.user_ptr = &desc->bd_cbid; md.eq_handle = ptlrpc_eq_h; md.threshold = 1; /* PUT or GET */ - for (posted_md = 0; posted_md < desc->bd_md_count; - posted_md++, mbits++) { + for (posted_md = 0; posted_md < total_md; posted_md++, mbits++) { md.options = PTLRPC_MD_OPTIONS | (ptlrpc_is_bulk_op_get(desc->bd_type) ? 
LNET_MD_OP_GET : LNET_MD_OP_PUT); ptlrpc_fill_bulk_md(&md, desc, posted_md); - if (posted_md > 0 && posted_md + 1 == desc->bd_md_count && - OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) { - rc = -ENOMEM; - } else { - rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, + rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, LNET_UNLINK, LNET_INS_AFTER, &me_h); - } if (rc != 0) { CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", desc->bd_import->imp_obd->obd_name, mbits, @@ -404,26 +400,24 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) if (rc != 0) { LASSERT(rc == -ENOMEM); spin_lock(&desc->bd_lock); - desc->bd_refs -= total_md - posted_md; + desc->bd_md_count -= total_md - posted_md; spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_refs >= 0); + LASSERT(desc->bd_md_count >= 0); mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); req->rq_status = -ENOMEM; - desc->bd_registered = 0; RETURN(-ENOMEM); } spin_lock(&desc->bd_lock); /* Holler if peer manages to touch buffers before he knows the mbits */ - if (desc->bd_refs != total_md) + if (desc->bd_md_count != total_md) CWARN("%s: Peer %s touched %d buffers while I registered\n", desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), - total_md - desc->bd_refs); + total_md - desc->bd_md_count); spin_unlock(&desc->bd_lock); - CDEBUG(D_NET, - "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", - desc->bd_refs, + CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, " + "mbits x%#llx-%#llx, portal %u\n", desc->bd_md_count, ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", desc->bd_iov_count, desc->bd_nob, desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); @@ -498,11 +492,9 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) { struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; struct ptlrpc_service *svc = svcpt->scp_service; - timeout_t service_timeout; + int service_time = max_t(int, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1); - service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() - - req->rq_arrival_time.tv_sec, 1, - (AT_OFF ? obd_timeout * 3 / 2 : at_max)); if (!(flags & PTLRPC_REPLY_EARLY) && (req->rq_type != PTL_RPC_MSG_ERR) && (req->rq_reqmsg != NULL) && @@ -511,8 +503,7 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { /* early replies, errors and recovery requests don't count * toward our service time estimate */ - int oldse = at_measured(&svcpt->scp_at_estimate, - service_timeout); + int oldse = at_measured(&svcpt->scp_at_estimate, service_time); if (oldse != 0) { DEBUG_REQ(D_ADAPTTO, req, @@ -522,7 +513,7 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) } } /* Report actual service time for client latency calc */ - lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout); + lustre_msg_set_service_time(req->rq_repmsg, service_time); /* Report service time estimate for future client reqs, but report 0 * (to be ignored by client) if it's an error reply during recovery. 
* b=15815 @@ -789,8 +780,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_resend_cb != NULL) request->rq_resend_cb(request, &request->rq_async_args); } - if (request->rq_memalloc) - mpflag = cfs_memory_pressure_get_and_set(); + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); rc = sptlrpc_cli_wrap_request(request); if (rc) @@ -800,7 +791,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_bulk != NULL) { rc = ptlrpc_register_bulk (request); if (rc != 0) - GOTO(cleanup_bulk, rc); + GOTO(out, rc); /* * All the mds in the request will have the same cpt * encoded in the cookie. So we can just get the first @@ -822,13 +813,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) spin_lock(&request->rq_lock); request->rq_err = 1; spin_unlock(&request->rq_lock); - request->rq_status = rc; - GOTO(cleanup_bulk, rc); - } - } else { - request->rq_repdata = NULL; - request->rq_repmsg = NULL; - } + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ connection->c_peer, request->rq_xid, 0, @@ -902,6 +893,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_deadline = request->rq_sent + request->rq_timeout + ptlrpc_at_get_net_latency(request); + ptlrpc_pinger_sending_on_import(imp); + DEBUG_REQ(D_INFO, request, "send flg=%x", lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, @@ -919,20 +912,18 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) GOTO(out, rc); cleanup_me: - /* MEUnlink is safe; the PUT didn't even get off the ground, and - * nobody apart from the PUT's target has the right nid+XID to - * access the reply buffer. */ - rc2 = LNetMEUnlink(reply_me_h); - LASSERT (rc2 == 0); - /* UNLINKED callback called synchronously */ - LASSERT(!request->rq_receiving_reply); + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); cleanup_bulk: - /* We do sync unlink here as there was no real transfer here so - * the chance to have long unlink to sluggish net is smaller here. */ + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. 
*/ ptlrpc_unregister_bulk(request, 0); - if (request->rq_bulk != NULL) - request->rq_bulk->bd_registered = 0; out: if (rc == -ENOMEM) { /* set rq_sent so that this request is treated @@ -953,10 +944,7 @@ EXPORT_SYMBOL(ptl_send_rpc); int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) { struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; - static struct lnet_process_id match_id = { - .nid = LNET_NID_ANY, - .pid = LNET_PID_ANY - }; + static struct lnet_process_id match_id = {LNET_NID_ANY, LNET_PID_ANY}; int rc; struct lnet_md md; struct lnet_handle_me me_h; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h index 6d6b9d7a04541..851bdc0dc354a 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -22,7 +22,7 @@ /* * Copyright (C) 2013, Trustees of Indiana University * - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2014, Intel Corporation. * * Author: Joshua Walgenbach */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c index 94d21d42f87df..7423e981d9e37 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ @@ -610,8 +610,10 @@ static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); } +#ifdef CONFIG_PROC_FS + /** - * debugfs interface + * lprocfs interface */ /** @@ -716,7 +718,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -729,9 +731,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, &count_copy); if (val != kernbuf) { - rc = kstrtol(val, 10, &quantum_reg); - if (rc) - return rc; + quantum_reg = simple_strtol(val, NULL, 10); queue |= PTLRPC_NRS_QUEUE_REG; } @@ -747,9 +747,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (!nrs_svc_has_hp(svc)) return -ENODEV; - rc = kstrtol(val, 10, &quantum_hp); - if (rc) - return rc; + quantum_hp = simple_strtol(val, NULL, 10); queue |= PTLRPC_NRS_QUEUE_HP; } @@ -759,9 +757,10 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, * value */ if (queue == 0) { - rc = kstrtol(kernbuf, 10, &quantum_reg); - if (rc) - return rc; + if (!isdigit(kernbuf[0])) + return -EINVAL; + + quantum_reg = simple_strtol(kernbuf, NULL, 10); queue = PTLRPC_NRS_QUEUE_REG; @@ -809,8 +808,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); /** * Initializes a CRR-N policy's lprocfs interface for service \a svc @@ -822,19 +820,34 @@ LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); */ static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) { - struct ldebugfs_vars nrs_crrn_lprocfs_vars[] = { + struct lprocfs_vars nrs_crrn_lprocfs_vars[] = { { .name = "nrs_crrn_quantum", .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, .data = svc }, { NULL } }; - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + if (svc->srv_procroot == NULL) return 0; - return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_crrn_lprocfs_vars, NULL); + return lprocfs_add_vars(svc->srv_procroot, nrs_crrn_lprocfs_vars, NULL); +} + +/** + * Cleans up a CRR-N policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + */ +static void nrs_crrn_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_crrn_quantum", svc->srv_procroot); } +#endif /* CONFIG_PROC_FS */ + /** * CRR-N policy operations */ @@ -848,7 +861,10 @@ static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { .op_req_enqueue = nrs_crrn_req_add, .op_req_dequeue = nrs_crrn_req_del, .op_req_stop = nrs_crrn_req_stop, +#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_crrn_lprocfs_init, + .op_lprocfs_fini = nrs_crrn_lprocfs_fini, +#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c index c8a1e6637d261..403b74efe6415 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -362,9 +362,11 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, } /** - * debugfs interface + * lprocfs interface */ +#ifdef CONFIG_PROC_FS + /* nrs_delay_min and nrs_delay_max are bounded by these values */ #define LPROCFS_NRS_DELAY_LOWER_BOUND 0 #define LPROCFS_NRS_DELAY_UPPER_BOUND 65535 @@ -417,7 +419,7 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, * Helper for delay's seq_write functions. 
*/ static ssize_t -lprocfs_nrs_delay_seq_write_common(const char __user *buffer, +lprocfs_nrs_delay_seq_write_common(struct file *file, const char __user *buffer, unsigned int bufsize, size_t count, const char *var_name, unsigned int min_val, unsigned int max_val, @@ -441,7 +443,7 @@ lprocfs_nrs_delay_seq_write_common(const char __user *buffer, if (kernbuf == NULL) return -ENOMEM; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) GOTO(free_kernbuf, rc = -EFAULT); tmpsize = strlen("reg_") + strlen(var_name) + 1; @@ -596,7 +598,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(buffer, + return lprocfs_nrs_delay_seq_write_common(file, buffer, LPROCFS_NRS_DELAY_MIN_SIZE, count, LPROCFS_NRS_DELAY_MIN_NAME, @@ -605,7 +607,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_MIN, false); } -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); /** * Retrieves the value of the maximum delay for delay policy instances on both @@ -679,7 +681,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(buffer, + return lprocfs_nrs_delay_seq_write_common(file, buffer, LPROCFS_NRS_DELAY_MAX_SIZE, count, LPROCFS_NRS_DELAY_MAX_NAME, @@ -688,7 +690,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_MAX, false); } -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); /** * Retrieves the value of the percentage of requests which should be delayed @@ -763,7 +765,7 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(buffer, + return lprocfs_nrs_delay_seq_write_common(file, buffer, LPROCFS_NRS_DELAY_PCT_SIZE, count, LPROCFS_NRS_DELAY_PCT_NAME, @@ -772,12 +774,11 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_PCT, false); } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) { - struct ldebugfs_vars nrs_delay_lprocfs_vars[] = { + struct lprocfs_vars nrs_delay_lprocfs_vars[] = { { .name = "nrs_delay_min", .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, .data = svc }, @@ -790,13 +791,25 @@ static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + if (svc->srv_procroot == NULL) return 0; - return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_delay_lprocfs_vars, - NULL); + return lprocfs_add_vars(svc->srv_procroot, nrs_delay_lprocfs_vars, + NULL); } +static void nrs_delay_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_delay_min", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_delay_max", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_delay_pct", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + /** * Delay policy operations */ @@ -809,7 +822,10 @@ static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { .op_req_enqueue = nrs_delay_req_add, .op_req_dequeue = 
nrs_delay_req_del, .op_req_stop = nrs_delay_req_stop, +#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_delay_lprocfs_init, + .op_lprocfs_fini = nrs_delay_lprocfs_fini, +#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c index 8b8e092dd8209..96c3a6593d2dd 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "ptlrpc_internal.h" @@ -1160,9 +1161,11 @@ static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy, } /** - * debugfs interface + * lprocfs interface */ +#ifdef CONFIG_PROC_FS + /** * This allows to bundle the policy name into the lprocfs_vars::data pointer * so that lprocfs read/write functions can be used by both the ORR and TRR @@ -1294,7 +1297,7 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1307,9 +1310,8 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, &count_copy); if (val != kernbuf) { - rc = kstrtol(val, 10, &quantum_reg); - if (rc) - return rc; + quantum_reg = simple_strtol(val, NULL, 10); + queue |= PTLRPC_NRS_QUEUE_REG; } @@ -1324,9 +1326,7 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (!nrs_svc_has_hp(svc)) return -ENODEV; - rc = kstrtol(val, 10, &quantum_hp); - if (rc) - return rc; + quantum_hp = simple_strtol(val, NULL, 10); queue |= PTLRPC_NRS_QUEUE_HP; } @@ -1336,9 +1336,10 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, * value */ if (queue == 0) { - rc = kstrtol(kernbuf, 10, &quantum_reg); - if (rc) - return rc; + if (!isdigit(kernbuf[0])) + return -EINVAL; + + quantum_reg = simple_strtol(kernbuf, NULL, 10); queue = PTLRPC_NRS_QUEUE_REG; @@ -1386,8 +1387,7 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); #define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:" #define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:" @@ -1512,7 +1512,7 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1607,8 +1607,7 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); #define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:" #define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:" @@ -1773,7 +1772,7 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1859,14 +1858,13 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) { int i; - struct ldebugfs_vars nrs_orr_lprocfs_vars[] = { + struct lprocfs_vars nrs_orr_lprocfs_vars[] = { { .name = "nrs_orr_quantum", .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, { .name = "nrs_orr_offset_type", @@ -1876,7 +1874,7 @@ static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + if (svc->srv_procroot == NULL) return 0; lprocfs_orr_data.svc = svc; @@ -1884,10 +1882,21 @@ static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++) nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data; - return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_orr_lprocfs_vars, - NULL); + return lprocfs_add_vars(svc->srv_procroot, nrs_orr_lprocfs_vars, NULL); +} + +static void nrs_orr_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_orr_quantum", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_orr_offset_type", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_orr_supported", svc->srv_procroot); } +#endif /* CONFIG_PROC_FS */ + static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { .op_policy_init = nrs_orr_init, .op_policy_start = nrs_orr_start, @@ -1899,7 +1908,10 @@ static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { .op_req_enqueue = nrs_orr_req_add, .op_req_dequeue = nrs_orr_req_del, .op_req_stop = nrs_orr_req_stop, +#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_orr_lprocfs_init, + .op_lprocfs_fini = nrs_orr_lprocfs_fini, +#endif }; struct ptlrpc_nrs_pol_conf nrs_conf_orr = { @@ -1914,11 +1926,14 @@ struct ptlrpc_nrs_pol_conf nrs_conf_orr = { * * TRR reuses much of the functions and data structures of ORR */ + +#ifdef CONFIG_PROC_FS + static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) { int i; - struct ldebugfs_vars nrs_trr_lprocfs_vars[] = { + struct lprocfs_vars nrs_trr_lprocfs_vars[] = { { .name = "nrs_trr_quantum", .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, { .name = "nrs_trr_offset_type", @@ -1928,7 +1943,7 @@ static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + if (svc->srv_procroot == NULL) return 0; lprocfs_trr_data.svc = svc; @@ -1936,10 +1951,21 @@ static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) for (i = 0; i < ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; - return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_trr_lprocfs_vars, - NULL); + return lprocfs_add_vars(svc->srv_procroot, nrs_trr_lprocfs_vars, NULL); } +static void nrs_trr_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + 
lprocfs_remove_proc_entry("nrs_trr_quantum", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_trr_offset_type", svc->srv_procroot); + lprocfs_remove_proc_entry("nrs_trr_supported", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + /** * Reuse much of the ORR functionality for TRR. */ @@ -1954,7 +1980,10 @@ static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { .op_req_enqueue = nrs_orr_req_add, .op_req_dequeue = nrs_orr_req_del, .op_req_stop = nrs_orr_req_stop, +#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_trr_lprocfs_init, + .op_lprocfs_fini = nrs_trr_lprocfs_fini, +#endif }; struct ptlrpc_nrs_pol_conf nrs_conf_trr = { diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c index 07710bdb7bfd9..a81485554013b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -42,7 +42,6 @@ #include #include #include -#include #include "ptlrpc_internal.h" /** @@ -301,7 +300,6 @@ nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name)); rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; - rule->tr_flags = start->u.tc_start.ts_rule_flags; rule->tr_nsecs = NSEC_PER_SEC; do_div(rule->tr_nsecs, rule->tr_rpc_rate); rule->tr_depth = tbf_depth; @@ -523,9 +521,11 @@ tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) cli1 = container_of(e1, struct nrs_tbf_client, tc_node); cli2 = container_of(e2, struct nrs_tbf_client, tc_node); - if (cli1->tc_deadline < cli2->tc_deadline) + if (cli1->tc_check_time + cli1->tc_nsecs < + cli2->tc_check_time + cli2->tc_nsecs) return 1; - else if (cli1->tc_deadline > cli2->tc_deadline) + else if (cli1->tc_check_time + cli1->tc_nsecs > + cli2->tc_check_time + cli2->tc_nsecs) return 0; if (cli1->tc_check_time < cli2->tc_check_time) @@ -570,7 +570,7 @@ static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode) return cli->tc_jobid; } -static void *nrs_tbf_hop_object(struct hlist_node *hnode) +static void *nrs_tbf_jobid_hop_object(struct hlist_node *hnode) { return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); } @@ -609,7 +609,7 @@ static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = { .hs_hash = nrs_tbf_jobid_hop_hash, .hs_keycmp = nrs_tbf_jobid_hop_keycmp, .hs_key = nrs_tbf_jobid_hop_key, - .hs_object = nrs_tbf_hop_object, + .hs_object = nrs_tbf_jobid_hop_object, .hs_get = nrs_tbf_jobid_hop_get, .hs_put = nrs_tbf_jobid_hop_put, .hs_put_locked = nrs_tbf_jobid_hop_put, @@ -1071,6 +1071,11 @@ static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) return &cli->tc_nid; } +static void *nrs_tbf_nid_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) { struct nrs_tbf_client *cli = hlist_entry(hnode, @@ -1106,7 +1111,7 @@ static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { .hs_hash = nrs_tbf_nid_hop_hash, .hs_keycmp = nrs_tbf_nid_hop_keycmp, .hs_key = nrs_tbf_nid_hop_key, - .hs_object = nrs_tbf_hop_object, + .hs_object = nrs_tbf_nid_hop_object, .hs_get = nrs_tbf_nid_hop_get, .hs_put = nrs_tbf_nid_hop_put, .hs_put_locked = nrs_tbf_nid_hop_put, @@ -1302,6 +1307,11 @@ static void *nrs_tbf_hop_key(struct hlist_node *hnode) return cli->tc_key; } +static void *nrs_tbf_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node 
*hnode) { struct nrs_tbf_client *cli = hlist_entry(hnode, @@ -1405,263 +1415,23 @@ nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd, return cli; } -/** - * ONLY opcode presented in this function will be checked in - * nrs_tbf_id_cli_set(). That means, we can add or remove an - * opcode to enable or disable requests handled in nrs_tbf - */ -static struct req_format *req_fmt(__u32 opcode) -{ - switch (opcode) { - case OST_GETATTR: - return &RQF_OST_GETATTR; - case OST_SETATTR: - return &RQF_OST_SETATTR; - case OST_READ: - return &RQF_OST_BRW_READ; - case OST_WRITE: - return &RQF_OST_BRW_WRITE; - /* FIXME: OST_CREATE and OST_DESTROY comes from MDS - * in most case. Should they be removed? */ - case OST_CREATE: - return &RQF_OST_CREATE; - case OST_DESTROY: - return &RQF_OST_DESTROY; - case OST_PUNCH: - return &RQF_OST_PUNCH; - case OST_SYNC: - return &RQF_OST_SYNC; - case OST_LADVISE: - return &RQF_OST_LADVISE; - case MDS_GETATTR: - return &RQF_MDS_GETATTR; - case MDS_GETATTR_NAME: - return &RQF_MDS_GETATTR_NAME; - /* close is skipped to avoid LDLM cancel slowness */ -#if 0 - case MDS_CLOSE: - return &RQF_MDS_CLOSE; -#endif - case MDS_REINT: - return &RQF_MDS_REINT; - case MDS_READPAGE: - return &RQF_MDS_READPAGE; - case MDS_GET_ROOT: - return &RQF_MDS_GET_ROOT; - case MDS_STATFS: - return &RQF_MDS_STATFS; - case MDS_SYNC: - return &RQF_MDS_SYNC; - case MDS_QUOTACTL: - return &RQF_MDS_QUOTACTL; - case MDS_GETXATTR: - return &RQF_MDS_GETXATTR; - case MDS_GET_INFO: - return &RQF_MDS_GET_INFO; - /* HSM op is skipped */ -#if 0 - case MDS_HSM_STATE_GET: - return &RQF_MDS_HSM_STATE_GET; - case MDS_HSM_STATE_SET: - return &RQF_MDS_HSM_STATE_SET; - case MDS_HSM_ACTION: - return &RQF_MDS_HSM_ACTION; - case MDS_HSM_CT_REGISTER: - return &RQF_MDS_HSM_CT_REGISTER; - case MDS_HSM_CT_UNREGISTER: - return &RQF_MDS_HSM_CT_UNREGISTER; -#endif - case MDS_SWAP_LAYOUTS: - return &RQF_MDS_SWAP_LAYOUTS; - case LDLM_ENQUEUE: - return &RQF_LDLM_ENQUEUE; - default: - return NULL; - } -} - -static struct req_format *intent_req_fmt(__u32 it_opc) -{ - if (it_opc & (IT_OPEN | IT_CREAT)) - return &RQF_LDLM_INTENT_OPEN; - else if (it_opc & (IT_GETATTR | IT_LOOKUP)) - return &RQF_LDLM_INTENT_GETATTR; - else if (it_opc & IT_GETXATTR) - return &RQF_LDLM_INTENT_GETXATTR; - else if (it_opc & (IT_GLIMPSE | IT_BRW)) - return &RQF_LDLM_INTENT; - else - return NULL; -} - -static int ost_tbf_id_cli_set(struct ptlrpc_request *req, - struct tbf_id *id) -{ - struct ost_body *body; - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - if (body != NULL) { - id->ti_uid = body->oa.o_uid; - id->ti_gid = body->oa.o_gid; - return 0; - } - - return -EINVAL; -} - -static void unpack_ugid_from_mdt_body(struct ptlrpc_request *req, - struct tbf_id *id) -{ - struct mdt_body *b = req_capsule_client_get(&req->rq_pill, - &RMF_MDT_BODY); - LASSERT(b != NULL); - - /* TODO: nodemaping feature converts {ug}id from individual - * clients to the actual ones of the file system. Some work - * may be needed to fix this. 
*/ - id->ti_uid = b->mbo_uid; - id->ti_gid = b->mbo_gid; -} - -static void unpack_ugid_from_mdt_rec_reint(struct ptlrpc_request *req, - struct tbf_id *id) -{ - struct mdt_rec_reint *rec; - - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - LASSERT(rec != NULL); - - /* use the fs{ug}id as {ug}id of the process */ - id->ti_uid = rec->rr_fsuid; - id->ti_gid = rec->rr_fsgid; -} - -static int mdt_tbf_id_cli_set(struct ptlrpc_request *req, - struct tbf_id *id) -{ - u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - int rc = 0; - - switch (opc) { - case MDS_GETATTR: - case MDS_GETATTR_NAME: - case MDS_GET_ROOT: - case MDS_READPAGE: - case MDS_SYNC: - case MDS_GETXATTR: - case MDS_HSM_STATE_GET ... MDS_SWAP_LAYOUTS: - unpack_ugid_from_mdt_body(req, id); - break; - case MDS_CLOSE: - case MDS_REINT: - unpack_ugid_from_mdt_rec_reint(req, id); - break; - default: - rc = -EINVAL; - break; - } - return rc; -} - -static int ldlm_tbf_id_cli_set(struct ptlrpc_request *req, - struct tbf_id *id) -{ - struct ldlm_intent *lit; - struct req_format *fmt; - - if (req->rq_reqmsg->lm_bufcount <= DLM_INTENT_IT_OFF) - return -EINVAL; - - req_capsule_extend(&req->rq_pill, &RQF_LDLM_INTENT_BASIC); - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - if (lit == NULL) - return -EINVAL; - - fmt = intent_req_fmt(lit->opc); - if (fmt == NULL) - return -EINVAL; - - req_capsule_extend(&req->rq_pill, fmt); - - if (lit->opc & (IT_GETXATTR | IT_GETATTR | IT_LOOKUP)) - unpack_ugid_from_mdt_body(req, id); - else if (lit->opc & (IT_OPEN | IT_OPEN | IT_GLIMPSE | IT_BRW)) - unpack_ugid_from_mdt_rec_reint(req, id); - else - return -EINVAL; - return 0; -} - -static int nrs_tbf_id_cli_set(struct ptlrpc_request *req, struct tbf_id *id, - enum nrs_tbf_flag ti_type) -{ - u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - struct req_format *fmt = req_fmt(opc); - bool fmt_unset = false; - int rc; - - memset(id, 0, sizeof(struct tbf_id)); - id->ti_type = ti_type; - - if (fmt == NULL) - return -EINVAL; - req_capsule_init(&req->rq_pill, req, RCL_SERVER); - if (req->rq_pill.rc_fmt == NULL) { - req_capsule_set(&req->rq_pill, fmt); - fmt_unset = true; - } - - if (opc < OST_LAST_OPC) - rc = ost_tbf_id_cli_set(req, id); - else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC) - rc = mdt_tbf_id_cli_set(req, id); - else if (opc == LDLM_ENQUEUE) - rc = ldlm_tbf_id_cli_set(req, id); - else - rc = -EINVAL; - - /* restore it to the initialized state */ - if (fmt_unset) - req->rq_pill.rc_fmt = NULL; - return rc; -} - -static inline void nrs_tbf_cli_gen_key(struct nrs_tbf_client *cli, - struct ptlrpc_request *req, - char *keystr, size_t keystr_sz) -{ - const char *jobid; - u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - struct tbf_id id; - - nrs_tbf_id_cli_set(req, &id, NRS_TBF_FLAG_UID | NRS_TBF_FLAG_GID); - jobid = lustre_msg_get_jobid(req->rq_reqmsg); - if (jobid == NULL) - jobid = NRS_TBF_JOBID_NULL; - - snprintf(keystr, keystr_sz, "%s_%s_%d_%u_%u", jobid, - libcfs_nid2str(req->rq_peer.nid), opc, id.ti_uid, - id.ti_gid); - - if (cli) { - INIT_LIST_HEAD(&cli->tc_lru); - strlcpy(cli->tc_key, keystr, sizeof(cli->tc_key)); - strlcpy(cli->tc_jobid, jobid, sizeof(cli->tc_jobid)); - cli->tc_nid = req->rq_peer.nid; - cli->tc_opcode = opc; - cli->tc_id = id; - } -} - static struct nrs_tbf_client * nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req) { struct nrs_tbf_client *cli; struct cfs_hash *hs = head->th_cli_hash; struct cfs_hash_bd bd; - char keystr[NRS_TBF_KEY_LEN]; + char keystr[NRS_TBF_KEY_LEN] = { '\0' }; + const 
char *jobid; + __u32 opc; - nrs_tbf_cli_gen_key(NULL, req, keystr, sizeof(keystr)); + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + opc = lustre_msg_get_opc(req->rq_reqmsg); + snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, + libcfs_nid2str(req->rq_peer.nid), opc); + LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1); cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr); cfs_hash_bd_unlock(hs, &bd, 1); @@ -1736,19 +1506,22 @@ nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, struct ptlrpc_request *req) { char keystr[NRS_TBF_KEY_LEN]; + const char *jobid; + __u32 opc; - nrs_tbf_cli_gen_key(cli, req, keystr, sizeof(keystr)); -} - -static void -nrs_tbf_id_list_free(struct list_head *uid_list) -{ - struct nrs_tbf_id *nti_id, *n; + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + opc = lustre_msg_get_opc(req->rq_reqmsg); + snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, + libcfs_nid2str(req->rq_peer.nid), opc); - list_for_each_entry_safe(nti_id, n, uid_list, nti_linkage) { - list_del_init(&nti_id->nti_linkage); - OBD_FREE_PTR(nti_id); - } + LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); + INIT_LIST_HEAD(&cli->tc_lru); + memcpy(cli->tc_key, keystr, strlen(keystr)); + memcpy(cli->tc_jobid, jobid, strlen(jobid)); + cli->tc_nid = req->rq_peer.nid; + cli->tc_opcode = opc; } static void @@ -1766,10 +1539,6 @@ nrs_tbf_expression_free(struct nrs_tbf_expression *expr) case NRS_TBF_FIELD_OPCODE: CFS_FREE_BITMAP(expr->te_opcodes); break; - case NRS_TBF_FIELD_UID: - case NRS_TBF_FIELD_GID: - nrs_tbf_id_list_free(&expr->te_cond); - break; default: LBUG(); } @@ -1829,9 +1598,6 @@ nrs_tbf_check_field(struct cfs_lstr *field, char *str) static int nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr); -static int -nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, - enum nrs_tbf_flag tif); static int nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) @@ -1871,23 +1637,8 @@ nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) &expr->te_opcodes) < 0) GOTO(out, rc = -EINVAL); expr->te_field = NRS_TBF_FIELD_OPCODE; - } else if (nrs_tbf_check_field(&field, "uid")) { - if (nrs_tbf_id_list_parse(src->ls_str, - src->ls_len, - &expr->te_cond, - NRS_TBF_FLAG_UID) < 0) - GOTO(out, rc = -EINVAL); - expr->te_field = NRS_TBF_FIELD_UID; - } else if (nrs_tbf_check_field(&field, "gid")) { - if (nrs_tbf_id_list_parse(src->ls_str, - src->ls_len, - &expr->te_cond, - NRS_TBF_FLAG_GID) < 0) - GOTO(out, rc = -EINVAL); - expr->te_field = NRS_TBF_FIELD_GID; - } else { + } else GOTO(out, rc = -EINVAL); - } list_add_tail(&expr->te_linkage, cond_list); return 0; @@ -1968,9 +1719,6 @@ nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) return rc; } -static int -nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id); - static int nrs_tbf_expression_match(struct nrs_tbf_expression *expr, struct nrs_tbf_rule *rule, @@ -1983,9 +1731,6 @@ nrs_tbf_expression_match(struct nrs_tbf_expression *expr, return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); case NRS_TBF_FIELD_OPCODE: return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); - case NRS_TBF_FIELD_UID: - case NRS_TBF_FIELD_GID: - return nrs_tbf_id_list_match(&expr->te_cond, cli->tc_id); default: return 0; } @@ -2123,6 +1868,11 @@ static void *nrs_tbf_opcode_hop_key(struct hlist_node *hnode) return &cli->tc_opcode; } +static 
void *nrs_tbf_opcode_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); +} + static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) { @@ -2161,7 +1911,7 @@ static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { .hs_hash = nrs_tbf_opcode_hop_hash, .hs_keycmp = nrs_tbf_opcode_hop_keycmp, .hs_key = nrs_tbf_opcode_hop_key, - .hs_object = nrs_tbf_hop_object, + .hs_object = nrs_tbf_opcode_hop_object, .hs_get = nrs_tbf_opcode_hop_get, .hs_put = nrs_tbf_opcode_hop_put, .hs_put_locked = nrs_tbf_opcode_hop_put, @@ -2377,340 +2127,6 @@ struct nrs_tbf_ops nrs_tbf_opcode_ops = { .o_rule_fini = nrs_tbf_opcode_rule_fini, }; -static unsigned nrs_tbf_id_hop_hash(struct cfs_hash *hs, const void *key, - unsigned mask) -{ - return cfs_hash_djb2_hash(key, sizeof(struct tbf_id), mask); -} - -static int nrs_tbf_id_hop_keycmp(const void *key, struct hlist_node *hnode) -{ - const struct tbf_id *opc = key; - enum nrs_tbf_flag ntf; - struct nrs_tbf_client *cli = hlist_entry(hnode, struct nrs_tbf_client, - tc_hnode); - ntf = opc->ti_type & cli->tc_id.ti_type; - if ((ntf & NRS_TBF_FLAG_UID) && opc->ti_uid != cli->tc_id.ti_uid) - return 0; - - if ((ntf & NRS_TBF_FLAG_GID) && opc->ti_gid != cli->tc_id.ti_gid) - return 0; - - return 1; -} - -static void *nrs_tbf_id_hop_key(struct hlist_node *hnode) -{ - struct nrs_tbf_client *cli = hlist_entry(hnode, - struct nrs_tbf_client, - tc_hnode); - return &cli->tc_id; -} - -static void nrs_tbf_id_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct nrs_tbf_client *cli = hlist_entry(hnode, - struct nrs_tbf_client, - tc_hnode); - - atomic_inc(&cli->tc_ref); -} - -static void nrs_tbf_id_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct nrs_tbf_client *cli = hlist_entry(hnode, - struct nrs_tbf_client, - tc_hnode); - - atomic_dec(&cli->tc_ref); -} - -static void -nrs_tbf_id_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) - -{ - struct nrs_tbf_client *cli = hlist_entry(hnode, - struct nrs_tbf_client, - tc_hnode); - - LASSERT(atomic_read(&cli->tc_ref) == 0); - nrs_tbf_cli_fini(cli); -} - -static struct cfs_hash_ops nrs_tbf_id_hash_ops = { - .hs_hash = nrs_tbf_id_hop_hash, - .hs_keycmp = nrs_tbf_id_hop_keycmp, - .hs_key = nrs_tbf_id_hop_key, - .hs_object = nrs_tbf_hop_object, - .hs_get = nrs_tbf_id_hop_get, - .hs_put = nrs_tbf_id_hop_put, - .hs_put_locked = nrs_tbf_id_hop_put, - .hs_exit = nrs_tbf_id_hop_exit, -}; - -static int -nrs_tbf_id_startup(struct ptlrpc_nrs_policy *policy, - struct nrs_tbf_head *head) -{ - struct nrs_tbf_cmd start; - int rc; - - head->th_cli_hash = cfs_hash_create("nrs_tbf_id_hash", - NRS_TBF_NID_BITS, - NRS_TBF_NID_BITS, - NRS_TBF_NID_BKT_BITS, 0, - CFS_HASH_MIN_THETA, - CFS_HASH_MAX_THETA, - &nrs_tbf_id_hash_ops, - CFS_HASH_RW_BKTLOCK); - if (head->th_cli_hash == NULL) - return -ENOMEM; - - memset(&start, 0, sizeof(start)); - start.u.tc_start.ts_ids_str = "*"; - start.u.tc_start.ts_rpc_rate = tbf_rate; - start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; - start.tc_name = NRS_TBF_DEFAULT_RULE; - INIT_LIST_HEAD(&start.u.tc_start.ts_ids); - rc = nrs_tbf_rule_start(policy, head, &start); - if (rc) { - cfs_hash_putref(head->th_cli_hash); - head->th_cli_hash = NULL; - } - - return rc; -} - -static struct nrs_tbf_client * -nrs_tbf_id_cli_find(struct nrs_tbf_head *head, - struct ptlrpc_request *req) -{ - struct tbf_id id; - - LASSERT(head->th_type_flag == NRS_TBF_FLAG_UID || - head->th_type_flag == NRS_TBF_FLAG_GID); - - nrs_tbf_id_cli_set(req, &id, 
head->th_type_flag); - return cfs_hash_lookup(head->th_cli_hash, &id); -} - -static struct nrs_tbf_client * -nrs_tbf_id_cli_findadd(struct nrs_tbf_head *head, - struct nrs_tbf_client *cli) -{ - return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_id, - &cli->tc_hnode); -} - -static void -nrs_tbf_uid_cli_init(struct nrs_tbf_client *cli, - struct ptlrpc_request *req) -{ - nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_UID); -} - -static void -nrs_tbf_gid_cli_init(struct nrs_tbf_client *cli, - struct ptlrpc_request *req) -{ - nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_GID); -} - -static int -nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id) -{ - struct nrs_tbf_id *nti_id; - enum nrs_tbf_flag flag; - - list_for_each_entry(nti_id, id_list, nti_linkage) { - flag = id.ti_type & nti_id->nti_id.ti_type; - if (!flag) - continue; - - if ((flag & NRS_TBF_FLAG_UID) && - (id.ti_uid != nti_id->nti_id.ti_uid)) - continue; - - if ((flag & NRS_TBF_FLAG_GID) && - (id.ti_gid != nti_id->nti_id.ti_gid)) - continue; - - return 1; - } - return 0; -} - -static int -nrs_tbf_id_rule_match(struct nrs_tbf_rule *rule, - struct nrs_tbf_client *cli) -{ - return nrs_tbf_id_list_match(&rule->tr_ids, cli->tc_id); -} - -static void nrs_tbf_id_cmd_fini(struct nrs_tbf_cmd *cmd) -{ - nrs_tbf_id_list_free(&cmd->u.tc_start.ts_ids); - - if (cmd->u.tc_start.ts_ids_str) - OBD_FREE(cmd->u.tc_start.ts_ids_str, - strlen(cmd->u.tc_start.ts_ids_str) + 1); -} - -static int -nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, - enum nrs_tbf_flag tif) -{ - struct cfs_lstr src; - struct cfs_lstr res; - int rc = 0; - struct tbf_id id = { 0 }; - ENTRY; - - if (tif != NRS_TBF_FLAG_UID && tif != NRS_TBF_FLAG_GID) - RETURN(-EINVAL); - - src.ls_str = str; - src.ls_len = len; - INIT_LIST_HEAD(id_list); - while (src.ls_str) { - struct nrs_tbf_id *nti_id; - - if (cfs_gettok(&src, ' ', &res) == 0) - GOTO(out, rc = -EINVAL); - - id.ti_type = tif; - if (tif == NRS_TBF_FLAG_UID) { - if (!cfs_str2num_check(res.ls_str, res.ls_len, - &id.ti_uid, 0, (u32)~0U)) - GOTO(out, rc = -EINVAL); - } else { - if (!cfs_str2num_check(res.ls_str, res.ls_len, - &id.ti_gid, 0, (u32)~0U)) - GOTO(out, rc = -EINVAL); - } - - OBD_ALLOC_PTR(nti_id); - if (nti_id == NULL) - GOTO(out, rc = -ENOMEM); - - nti_id->nti_id = id; - list_add_tail(&nti_id->nti_linkage, id_list); - } -out: - if (rc) - nrs_tbf_id_list_free(id_list); - RETURN(rc); -} - -static int nrs_tbf_ug_id_parse(struct nrs_tbf_cmd *cmd, char *id) -{ - struct cfs_lstr src; - int rc; - enum nrs_tbf_flag tif; - - tif = cmd->u.tc_start.ts_valid_type; - - src.ls_str = id; - src.ls_len = strlen(id); - - rc = nrs_tbf_check_id_value(&src, - tif == NRS_TBF_FLAG_UID ? 
"uid" : "gid"); - if (rc) - return rc; - - OBD_ALLOC(cmd->u.tc_start.ts_ids_str, src.ls_len + 1); - if (cmd->u.tc_start.ts_ids_str == NULL) - return -ENOMEM; - - strlcpy(cmd->u.tc_start.ts_ids_str, src.ls_str, src.ls_len + 1); - - rc = nrs_tbf_id_list_parse(cmd->u.tc_start.ts_ids_str, - strlen(cmd->u.tc_start.ts_ids_str), - &cmd->u.tc_start.ts_ids, tif); - if (rc) - nrs_tbf_id_cmd_fini(cmd); - - return rc; -} - -static int -nrs_tbf_id_rule_init(struct ptlrpc_nrs_policy *policy, - struct nrs_tbf_rule *rule, - struct nrs_tbf_cmd *start) -{ - struct nrs_tbf_head *head = rule->tr_head; - int rc = 0; - enum nrs_tbf_flag tif = head->th_type_flag; - int ids_len = strlen(start->u.tc_start.ts_ids_str) + 1; - - LASSERT(start->u.tc_start.ts_ids_str); - INIT_LIST_HEAD(&rule->tr_ids); - - OBD_ALLOC(rule->tr_ids_str, ids_len); - if (rule->tr_ids_str == NULL) - return -ENOMEM; - - strlcpy(rule->tr_ids_str, start->u.tc_start.ts_ids_str, - ids_len); - - if (!list_empty(&start->u.tc_start.ts_ids)) { - rc = nrs_tbf_id_list_parse(rule->tr_ids_str, - strlen(rule->tr_ids_str), - &rule->tr_ids, tif); - if (rc) - CERROR("%ss {%s} illegal\n", - tif == NRS_TBF_FLAG_UID ? "uid" : "gid", - rule->tr_ids_str); - } - if (rc) { - OBD_FREE(rule->tr_ids_str, ids_len); - rule->tr_ids_str = NULL; - } - return rc; -} - -static int -nrs_tbf_id_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) -{ - seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, - rule->tr_ids_str, rule->tr_rpc_rate, - atomic_read(&rule->tr_ref) - 1); - return 0; -} - -static void nrs_tbf_id_rule_fini(struct nrs_tbf_rule *rule) -{ - nrs_tbf_id_list_free(&rule->tr_ids); - if (rule->tr_ids_str != NULL) - OBD_FREE(rule->tr_ids_str, strlen(rule->tr_ids_str) + 1); -} - -struct nrs_tbf_ops nrs_tbf_uid_ops = { - .o_name = NRS_TBF_TYPE_UID, - .o_startup = nrs_tbf_id_startup, - .o_cli_find = nrs_tbf_id_cli_find, - .o_cli_findadd = nrs_tbf_id_cli_findadd, - .o_cli_put = nrs_tbf_nid_cli_put, - .o_cli_init = nrs_tbf_uid_cli_init, - .o_rule_init = nrs_tbf_id_rule_init, - .o_rule_dump = nrs_tbf_id_rule_dump, - .o_rule_match = nrs_tbf_id_rule_match, - .o_rule_fini = nrs_tbf_id_rule_fini, -}; - -struct nrs_tbf_ops nrs_tbf_gid_ops = { - .o_name = NRS_TBF_TYPE_GID, - .o_startup = nrs_tbf_id_startup, - .o_cli_find = nrs_tbf_id_cli_find, - .o_cli_findadd = nrs_tbf_id_cli_findadd, - .o_cli_put = nrs_tbf_nid_cli_put, - .o_cli_init = nrs_tbf_gid_cli_init, - .o_rule_init = nrs_tbf_id_rule_init, - .o_rule_dump = nrs_tbf_id_rule_dump, - .o_rule_match = nrs_tbf_id_rule_match, - .o_rule_fini = nrs_tbf_id_rule_fini, -}; - static struct nrs_tbf_type nrs_tbf_types[] = { { .ntt_name = NRS_TBF_TYPE_JOBID, @@ -2732,16 +2148,6 @@ static struct nrs_tbf_type nrs_tbf_types[] = { .ntt_flag = NRS_TBF_FLAG_GENERIC, .ntt_ops = &nrs_tbf_generic_ops, }, - { - .ntt_name = NRS_TBF_TYPE_UID, - .ntt_flag = NRS_TBF_FLAG_UID, - .ntt_ops = &nrs_tbf_uid_ops, - }, - { - .ntt_name = NRS_TBF_TYPE_GID, - .ntt_flag = NRS_TBF_FLAG_GID, - .ntt_ops = &nrs_tbf_gid_ops, - }, }; /** @@ -3070,12 +2476,10 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request, nr_u.tbf.tr_list); } else { - struct nrs_tbf_rule *rule = cli->tc_rule; __u64 now = ktime_to_ns(ktime_get()); __u64 passed; __u64 ntoken; __u64 deadline; - __u64 old_resid = 0; deadline = cli->tc_check_time + cli->tc_nsecs; @@ -3083,19 +2487,9 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, passed = now - cli->tc_check_time; ntoken = passed * cli->tc_rpc_rate; 
do_div(ntoken, NSEC_PER_SEC); - ntoken += cli->tc_ntoken; - if (rule->tr_flags & NTRS_REALTIME) { - LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); - old_resid = cli->tc_nsecs_resid; - cli->tc_nsecs_resid += passed % cli->tc_nsecs; - if (cli->tc_nsecs_resid > cli->tc_nsecs) { - ntoken++; - cli->tc_nsecs_resid -= cli->tc_nsecs; - } - } else if (ntoken > cli->tc_depth) + if (ntoken > cli->tc_depth) ntoken = cli->tc_depth; - if (ntoken > 0) { struct ptlrpc_request *req; nrq = list_entry(cli->tc_list.next, @@ -3113,8 +2507,6 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, &cli->tc_node); cli->tc_in_heap = false; } else { - if (!(rule->tr_flags & NTRS_REALTIME)) - cli->tc_deadline = now + cli->tc_nsecs; cfs_binheap_relocate(head->th_binheap, &cli->tc_node); } @@ -3128,15 +2520,6 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, } else { ktime_t time; - if (rule->tr_flags & NTRS_REALTIME) { - cli->tc_deadline = deadline; - cli->tc_nsecs_resid = old_resid; - cfs_binheap_relocate(head->th_binheap, - &cli->tc_node); - if (node != cfs_binheap_root(head->th_binheap)) - return nrs_tbf_req_get(policy, - peek, force); - } policy->pol_nrs->nrs_throttling = 1; head->th_deadline = deadline; time = ktime_set(0, 0); @@ -3172,7 +2555,6 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head, th_res); if (list_empty(&cli->tc_list)) { LASSERT(!cli->tc_in_heap); - cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node); if (rc == 0) { cli->tc_in_heap = true; @@ -3180,7 +2562,8 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, list_add_tail(&nrq->nr_u.tbf.tr_list, &cli->tc_list); if (policy->pol_nrs->nrs_throttling) { - __u64 deadline = cli->tc_deadline; + __u64 deadline = cli->tc_check_time + + cli->tc_nsecs; if ((head->th_deadline > deadline) && (hrtimer_try_to_cancel(&head->th_timer) >= 0)) { @@ -3266,8 +2649,10 @@ static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy, nrq->nr_u.tbf.tr_sequence); } +#ifdef CONFIG_PROC_FS + /** - * debugfs interface + * lprocfs interface */ /** @@ -3334,7 +2719,6 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data) static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) { int rc; - ENTRY; switch (cmd->u.tc_start.ts_valid_type) { case NRS_TBF_FLAG_JOBID: @@ -3349,41 +2733,24 @@ static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) case NRS_TBF_FLAG_GENERIC: rc = nrs_tbf_generic_parse(cmd, token); break; - case NRS_TBF_FLAG_UID: - case NRS_TBF_FLAG_GID: - rc = nrs_tbf_ug_id_parse(cmd, token); - break; default: RETURN(-EINVAL); } - RETURN(rc); + return rc; } static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) { if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { - switch (cmd->u.tc_start.ts_valid_type) { - case NRS_TBF_FLAG_JOBID: + if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_JOBID) nrs_tbf_jobid_cmd_fini(cmd); - break; - case NRS_TBF_FLAG_NID: + else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_NID) nrs_tbf_nid_cmd_fini(cmd); - break; - case NRS_TBF_FLAG_OPCODE: + else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_OPCODE) nrs_tbf_opcode_cmd_fini(cmd); - break; - case NRS_TBF_FLAG_GENERIC: + else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_GENERIC) nrs_tbf_generic_cmd_fini(cmd); - break; - case NRS_TBF_FLAG_UID: - case NRS_TBF_FLAG_GID: - nrs_tbf_id_cmd_fini(cmd); - break; - default: - CWARN("unknown NRS_TBF_FLAGS:0x%x\n", - cmd->u.tc_start.ts_valid_type); - } } 
} @@ -3437,15 +2804,6 @@ nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) cmd->u.tc_change.tc_next_name = val; else return -EINVAL; - } else if (strcmp(key, "realtime") == 0) { - unsigned long realtime; - - rc = kstrtoul(val, 10, &realtime); - if (rc) - return rc; - - if (realtime > 0) - cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; } else { return -EINVAL; } @@ -3607,7 +2965,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) GOTO(out_free_kernbuff, rc = -EINVAL); - if (copy_from_user(kernbuf, buffer, count)) + if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) GOTO(out_free_kernbuff, rc = -EFAULT); val = kernbuf; @@ -3655,8 +3013,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, out: return rc ? rc : count; } - -LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); +LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); /** * Initializes a TBF policy's lprocfs interface for service \a svc @@ -3668,20 +3025,34 @@ LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); */ static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) { - struct ldebugfs_vars nrs_tbf_lprocfs_vars[] = { + struct lprocfs_vars nrs_tbf_lprocfs_vars[] = { { .name = "nrs_tbf_rule", .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, .data = svc }, { NULL } }; - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + if (svc->srv_procroot == NULL) return 0; - return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_tbf_lprocfs_vars, - NULL); + return lprocfs_add_vars(svc->srv_procroot, nrs_tbf_lprocfs_vars, NULL); } +/** + * Cleans up a TBF policy's lprocfs interface for service \a svc + * + * \param[in] svc the service + */ +static void nrs_tbf_lprocfs_fini(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot == NULL) + return; + + lprocfs_remove_proc_entry("nrs_tbf_rule", svc->srv_procroot); +} + +#endif /* CONFIG_PROC_FS */ + /** * TBF policy operations */ @@ -3695,7 +3066,10 @@ static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { .op_req_enqueue = nrs_tbf_req_add, .op_req_dequeue = nrs_tbf_req_del, .op_req_stop = nrs_tbf_req_stop, +#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_tbf_lprocfs_init, + .op_lprocfs_fini = nrs_tbf_lprocfs_fini, +#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c index 5e2d384435fbb..3e97aa6332ed3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,6 +42,8 @@ #include +#include + #include #include #include @@ -60,15 +62,13 @@ static inline __u32 lustre_msg_hdr_size_v2(__u32 count) __u32 lustre_msg_hdr_size(__u32 magic, __u32 count) { - LASSERT(count > 0); - - switch (magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_hdr_size_v2(count); - default: - LASSERTF(0, "incorrect message magic: %08x\n", magic); + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); return 0; - } + } } void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, @@ -80,26 +80,25 @@ void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, lustre_set_rep_swabbed(req, index); } -bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - __u32 index) +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index) { - if (inout) - return (ptlrpc_req_need_swab(req) && - !lustre_req_swabbed(req, index)); - - return (ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index)); + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + else + return (ptlrpc_rep_need_swab(req) && + !lustre_rep_swabbed(req, index)); } static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, - enum lustre_msg_version version) + __u32 version) { - enum lustre_msg_version ver = lustre_msg_get_version(msg); - - return (ver & LUSTRE_VERSION_MASK) != version; + __u32 ver = lustre_msg_get_version(msg); + return (ver & LUSTRE_VERSION_MASK) != version; } -int lustre_msg_check_version(struct lustre_msg *msg, - enum lustre_msg_version version) +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) { #define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 switch (msg->lm_magic) { @@ -137,14 +136,13 @@ EXPORT_SYMBOL(lustre_msg_early_size); __u32 lustre_msg_size_v2(int count, __u32 *lengths) { __u32 size; - int i; + int i; - LASSERT(count > 0); - size = lustre_msg_hdr_size_v2(count); - for (i = 0; i < count; i++) - size += cfs_size_round(lengths[i]); + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); - return size; + return size; } EXPORT_SYMBOL(lustre_msg_size_v2); @@ -187,25 +185,22 @@ __u32 lustre_packed_msg_size(struct lustre_msg *msg) return 0; } } -EXPORT_SYMBOL(lustre_packed_msg_size); void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, - char **bufs) + char **bufs) { - char *ptr; - int i; - - LASSERT(count > 0); + char *ptr; + int i; - msg->lm_bufcount = count; - /* XXX: lm_secflvr uninitialized here */ - msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; - for (i = 0; i < count; i++) - msg->lm_buflens[i] = lens[i]; + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; - if (bufs == NULL) - return; + if (bufs == NULL) + return; ptr = (char *)msg + lustre_msg_hdr_size_v2(count); for (i = 0; i < count; i++) { @@ -332,25 +327,24 @@ void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) } int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - __u32 *lens, char **bufs, int flags) + __u32 *lens, char **bufs, int flags) { - struct ptlrpc_reply_state *rs; - int msg_len, rc; - ENTRY; + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; - LASSERT(req->rq_reply_state == NULL); - LASSERT(count > 0); + 
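The size and init hunks above all rely on the same packing rule: a lustre_msg_v2 begins with a header sized for lm_bufcount length slots, every buffer is padded out to an 8-byte boundary (cfs_size_round), and buffer n therefore starts at the header size plus the rounded lengths of buffers 0..n-1. A standalone sketch of that arithmetic follows; the 8-u32 header layout and the sample lengths are simplifications assumed here, not the exact on-wire definition.

#include <stdio.h>
#include <stdint.h>

/* round a length up to the next 8-byte boundary, like cfs_size_round() */
static uint32_t size_round(uint32_t len)
{
	return (len + 7) & ~7U;
}

/* simplified stand-in for lustre_msg_hdr_size_v2(): eight fixed u32 fields
 * followed by one u32 length slot per buffer, rounded to 8 bytes */
static uint32_t hdr_size(uint32_t count)
{
	return size_round(8 * 4 + count * 4);
}

/* total packed size: header plus each buffer rounded up */
static uint32_t msg_size(uint32_t count, const uint32_t *lens)
{
	uint32_t size = hdr_size(count);
	uint32_t i;

	for (i = 0; i < count; i++)
		size += size_round(lens[i]);
	return size;
}

/* byte offset of buffer n inside the packed message */
static uint32_t buf_offset(uint32_t count, const uint32_t *lens, uint32_t n)
{
	uint32_t off = hdr_size(count);
	uint32_t i;

	for (i = 0; i < n; i++)
		off += size_round(lens[i]);
	return off;
}

int main(void)
{
	uint32_t lens[] = { 152, 13, 40 };   /* e.g. body plus two payloads */
	uint32_t n;

	printf("total size: %u\n", msg_size(3, lens));
	for (n = 0; n < 3; n++)
		printf("buffer %u at offset %u\n", n, buf_offset(3, lens, n));
	return 0;
}

With the sample lengths {152, 13, 40} the sketch reports a 256-byte message with buffers at offsets 48, 200 and 216, illustrating why both the packer and lustre_msg_buf_v2() must walk the rounded lengths identically.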
LASSERT(req->rq_reply_state == NULL); - if ((flags & LPRFL_EARLY_REPLY) == 0) { + if ((flags & LPRFL_EARLY_REPLY) == 0) { spin_lock(&req->rq_lock); req->rq_packed_final = 1; spin_unlock(&req->rq_lock); - } + } - msg_len = lustre_msg_size_v2(count, lens); - rc = sptlrpc_svc_alloc_rs(req, msg_len); - if (rc) - RETURN(rc); + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); rs = req->rq_reply_state; atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ @@ -362,16 +356,16 @@ int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, INIT_LIST_HEAD(&rs->rs_list); spin_lock_init(&rs->rs_lock); - req->rq_replen = msg_len; - req->rq_reply_state = rs; - req->rq_repmsg = rs->rs_msg; + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; - lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); - lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); - PTLRPC_RS_DEBUG_LRU_ADD(rs); + PTLRPC_RS_DEBUG_LRU_ADD(rs); - RETURN(0); + RETURN(0); } EXPORT_SYMBOL(lustre_pack_reply_v2); @@ -415,29 +409,28 @@ void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) { __u32 i, offset, buflen, bufcount; - LASSERT(m != NULL); - LASSERT(m->lm_bufcount > 0); + LASSERT(m != NULL); - bufcount = m->lm_bufcount; - if (unlikely(n >= bufcount)) { - CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", - m, n, bufcount); - return NULL; - } + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } - buflen = m->lm_buflens[n]; - if (unlikely(buflen < min_size)) { - CERROR("msg %p buffer[%d] size %d too small " - "(required %d, opc=%d)\n", m, n, buflen, min_size, - n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); - return NULL; - } + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); + return NULL; + } - offset = lustre_msg_hdr_size_v2(bufcount); - for (i = 0; i < n; i++) - offset += cfs_size_round(m->lm_buflens[i]); + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); - return (char *)m + offset; + return (char *)m + offset; } void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size) @@ -530,60 +523,52 @@ void lustre_free_reply_state(struct ptlrpc_reply_state *rs) static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) { - int swabbed, required_len, i, buflen; + int swabbed, required_len, i; - /* Now we know the sender speaks my language. */ - required_len = lustre_msg_hdr_size_v2(0); - if (len < required_len) { - /* can't even look inside the message */ - CERROR("message length %d too small for lustre_msg\n", len); - return -EINVAL; - } + /* Now we know the sender speaks my language. 
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } - swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } - if (swabbed) { - __swab32s(&m->lm_magic); - __swab32s(&m->lm_bufcount); - __swab32s(&m->lm_secflvr); - __swab32s(&m->lm_repsize); - __swab32s(&m->lm_cksum); - __swab32s(&m->lm_flags); - CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); - CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); - } + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR ("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } - if (m->lm_bufcount == 0 || m->lm_bufcount > PTLRPC_MAX_BUFCOUNT) { - CERROR("message bufcount %d is not valid\n", m->lm_bufcount); - return -EINVAL; - } - required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); - if (len < required_len) { - /* didn't receive all the buffer lengths */ - CERROR("message length %d too small for %d buflens\n", - len, m->lm_bufcount); - return -EINVAL; - } + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + required_len += cfs_size_round(m->lm_buflens[i]); + } - for (i = 0; i < m->lm_bufcount; i++) { - if (swabbed) - __swab32s(&m->lm_buflens[i]); - buflen = cfs_size_round(m->lm_buflens[i]); - if (buflen < 0 || buflen > PTLRPC_MAX_BUFLEN) { - CERROR("buffer %d length %d is not valid\n", i, buflen); - return -EINVAL; - } - required_len += buflen; - } - if (len < required_len || required_len > PTLRPC_MAX_BUFLEN) { - CERROR("len: %d, required_len %d, bufcount: %d\n", - len, required_len, m->lm_bufcount); - for (i = 0; i < m->lm_bufcount; i++) - CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); - return -EINVAL; - } + if (len < required_len) { + CERROR("len: %d, required_len %d\n", len, required_len); + CERROR("bufcount: %d\n", m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } - return swabbed; + return swabbed; } int __lustre_unpack_msg(struct lustre_msg *m, int len) @@ -772,11 +757,6 @@ char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) "msg %p buffer[%d] len %d\n", m, index, blen); return NULL; } - if (blen > PTLRPC_MAX_BUFLEN) { - CERROR("buffer length of msg %p buffer[%d] is invalid(%d)\n", - m, index, blen); - return NULL; - } if (max_len == 0) { if (slen != blen - 1) { @@ -822,7 +802,7 @@ static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) sizeof(struct ptlrpc_body_v2)); } -enum lustre_msghdr lustre_msghdr_get_flags(struct lustre_msg *msg) +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: @@ -856,7 +836,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* fallthrough */ + /* Fall through */ default: /* flags might be printed in debug code while message * uninitialized */ @@ -900,8 +880,7 @@ void lustre_msg_clear_flags(struct lustre_msg *msg, 
__u32 flags) case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_flags &= ~flags; - + pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); return; } default: @@ -920,7 +899,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* fallthrough */ + /* Fall through */ default: return 0; } @@ -976,7 +955,7 @@ __u32 lustre_msg_get_type(struct lustre_msg *msg) } EXPORT_SYMBOL(lustre_msg_get_type); -enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg) +__u32 lustre_msg_get_version(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -1125,7 +1104,7 @@ int lustre_msg_get_status(struct lustre_msg *msg) return pb->pb_status; CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* fallthrough */ + /* Fall through */ default: /* status might be printed in debug code while message * uninitialized */ @@ -1235,12 +1214,11 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg) } } -timeout_t lustre_msg_get_timeout(struct lustre_msg *msg) +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - if (pb == NULL) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1253,12 +1231,11 @@ timeout_t lustre_msg_get_timeout(struct lustre_msg *msg) } } -timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg) +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - if (pb == NULL) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1488,13 +1465,11 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) } } -void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout) +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERT(timeout >= 0); LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); pb->pb_timeout = timeout; return; @@ -1504,16 +1479,13 @@ void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout) } } -void lustre_msg_set_service_timeout(struct lustre_msg *msg, - timeout_t service_timeout) +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERT(service_timeout >= 0); LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_service_time = service_timeout; + pb->pb_service_time = service_time; return; } default: @@ -1539,9 +1511,9 @@ void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); if (jobid != NULL) - memcpy(pb->pb_jobid, jobid, sizeof(pb->pb_jobid)); + memcpy(pb->pb_jobid, jobid, LUSTRE_JOBID_SIZE); else if (pb->pb_jobid[0] == '\0') - lustre_get_jobid(pb->pb_jobid, sizeof(pb->pb_jobid)); + lustre_get_jobid(pb->pb_jobid); return; } default: @@ -1646,40 +1618,39 @@ EXPORT_SYMBOL(do_set_info_async); /* byte flipping routines for all wire types declared in * lustre_idl.h implemented here. 
*/ -void lustre_swab_ptlrpc_body(struct ptlrpc_body *body) -{ - __swab32s(&body->pb_type); - __swab32s(&body->pb_version); - __swab32s(&body->pb_opc); - __swab32s(&body->pb_status); - __swab64s(&body->pb_last_xid); - __swab16s(&body->pb_tag); - CLASSERT(offsetof(typeof(*body), pb_padding0) != 0); - CLASSERT(offsetof(typeof(*body), pb_padding1) != 0); - __swab64s(&body->pb_last_committed); - __swab64s(&body->pb_transno); - __swab32s(&body->pb_flags); - __swab32s(&body->pb_op_flags); - __swab32s(&body->pb_conn_cnt); - __swab32s(&body->pb_timeout); - __swab32s(&body->pb_service_time); - __swab32s(&body->pb_limit); - __swab64s(&body->pb_slv); - __swab64s(&body->pb_pre_versions[0]); - __swab64s(&body->pb_pre_versions[1]); - __swab64s(&body->pb_pre_versions[2]); - __swab64s(&body->pb_pre_versions[3]); - __swab64s(&body->pb_mbits); - CLASSERT(offsetof(typeof(*body), pb_padding64_0) != 0); - CLASSERT(offsetof(typeof(*body), pb_padding64_1) != 0); - CLASSERT(offsetof(typeof(*body), pb_padding64_2) != 0); +void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) +{ + __swab32s (&b->pb_type); + __swab32s (&b->pb_version); + __swab32s (&b->pb_opc); + __swab32s (&b->pb_status); + __swab64s (&b->pb_last_xid); + __swab16s (&b->pb_tag); + __swab64s (&b->pb_last_committed); + __swab64s (&b->pb_transno); + __swab32s (&b->pb_flags); + __swab32s (&b->pb_op_flags); + __swab32s (&b->pb_conn_cnt); + __swab32s (&b->pb_timeout); + __swab32s (&b->pb_service_time); + __swab32s (&b->pb_limit); + __swab64s (&b->pb_slv); + __swab64s (&b->pb_pre_versions[0]); + __swab64s (&b->pb_pre_versions[1]); + __swab64s (&b->pb_pre_versions[2]); + __swab64s (&b->pb_pre_versions[3]); + __swab64s(&b->pb_mbits); + CLASSERT(offsetof(typeof(*b), pb_padding0) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding1) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding64_0) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding64_1) != 0); + CLASSERT(offsetof(typeof(*b), pb_padding64_2) != 0); /* While we need to maintain compatibility between * clients and servers without ptlrpc_body_v2 (< 2.3) * do not swab any fields beyond pb_jobid, as we are * using this swab function for both ptlrpc_body * and ptlrpc_body_v2. 
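The swab routines being rewritten in these hunks all follow one wire-format convention: every integer field of an on-wire struct is byte-swapped in place when the peer's endianness differs, ASCII fields such as pb_jobid are never swapped, and padding fields only get compile-time offset asserts. A self-contained sketch of that pattern follows; the struct below is a made-up three-field stand-in, not the real ptlrpc_body layout.

#include <stdint.h>
#include <stdio.h>

/* byte-swap helpers in the spirit of __swab32s()/__swab64s(): swab in place */
static void swab32s(uint32_t *x)
{
	*x = ((*x & 0x000000ffU) << 24) | ((*x & 0x0000ff00U) << 8) |
	     ((*x & 0x00ff0000U) >> 8)  | ((*x & 0xff000000U) >> 24);
}

static void swab64s(uint64_t *x)
{
	uint64_t v = *x;

	v = ((v & 0x00000000ffffffffULL) << 32) | (v >> 32);
	v = ((v & 0x0000ffff0000ffffULL) << 16) |
	    ((v >> 16) & 0x0000ffff0000ffffULL);
	v = ((v & 0x00ff00ff00ff00ffULL) << 8) |
	    ((v >> 8) & 0x00ff00ff00ff00ffULL);
	*x = v;
}

struct wire_body {              /* tiny stand-in for an on-wire struct */
	uint32_t pb_type;
	uint32_t pb_opc;
	uint64_t pb_transno;
	char     pb_jobid[8];   /* ASCII, never swabbed */
};

static void swab_wire_body(struct wire_body *b)
{
	swab32s(&b->pb_type);
	swab32s(&b->pb_opc);
	swab64s(&b->pb_transno);
	/* pb_jobid is a string: byte order does not apply */
}

int main(void)
{
	struct wire_body b = { .pb_type = 0x12345678, .pb_opc = 400,
			       .pb_transno = 0x1122334455667788ULL,
			       .pb_jobid = "job" };

	swab_wire_body(&b);
	printf("type %#x opc %#x transno %#llx jobid %s\n",
	       (unsigned)b.pb_type, (unsigned)b.pb_opc,
	       (unsigned long long)b.pb_transno, b.pb_jobid);
	return 0;
}

Running it shows each integer reversed byte-for-byte while the jobid string is untouched, which is the same transformation the swab functions apply to recover native byte order from a foreign-endian peer.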
*/ - /* pb_jobid is an ASCII string and should not be swabbed */ - CLASSERT(offsetof(typeof(*body), pb_jobid) != 0); + CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); } void lustre_swab_connect(struct obd_connect_data *ocd) @@ -1759,7 +1730,7 @@ void lustre_swab_obdo (struct obdo *o) __swab32s(&o->o_stripe_idx); __swab32s(&o->o_parent_ver); lustre_swab_ost_layout(&o->o_layout); - __swab32s(&o->o_layout_version); + CLASSERT(offsetof(typeof(*o), o_padding_3) != 0); __swab32s(&o->o_uid_h); __swab32s(&o->o_gid_h); __swab64s(&o->o_data_version); @@ -1773,26 +1744,26 @@ EXPORT_SYMBOL(lustre_swab_obdo); void lustre_swab_obd_statfs (struct obd_statfs *os) { - __swab64s(&os->os_type); - __swab64s(&os->os_blocks); - __swab64s(&os->os_bfree); - __swab64s(&os->os_bavail); - __swab64s(&os->os_files); - __swab64s(&os->os_ffree); - /* no need to swab os_fsid */ - __swab32s(&os->os_bsize); - __swab32s(&os->os_namelen); - __swab64s(&os->os_maxbytes); - __swab32s(&os->os_state); - __swab32s(&os->os_fprecreated); - __swab32s(&os->os_granted); - CLASSERT(offsetof(typeof(*os), os_spare3) != 0); - CLASSERT(offsetof(typeof(*os), os_spare4) != 0); - CLASSERT(offsetof(typeof(*os), os_spare5) != 0); - CLASSERT(offsetof(typeof(*os), os_spare6) != 0); - CLASSERT(offsetof(typeof(*os), os_spare7) != 0); - CLASSERT(offsetof(typeof(*os), os_spare8) != 0); - CLASSERT(offsetof(typeof(*os), os_spare9) != 0); + __swab64s (&os->os_type); + __swab64s (&os->os_blocks); + __swab64s (&os->os_bfree); + __swab64s (&os->os_bavail); + __swab64s (&os->os_files); + __swab64s (&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s (&os->os_bsize); + __swab32s (&os->os_namelen); + __swab64s (&os->os_maxbytes); + __swab32s (&os->os_state); + CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); + CLASSERT(offsetof(typeof(*os), os_spare2) != 0); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); } void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) @@ -1897,7 +1868,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab64s(&b->mbo_atime); __swab64s(&b->mbo_ctime); __swab64s(&b->mbo_blocks); - __swab64s(&b->mbo_version); + __swab64s(&b->mbo_ioepoch); __swab64s(&b->mbo_t_state); __swab32s(&b->mbo_fsuid); __swab32s(&b->mbo_fsgid); @@ -1908,7 +1879,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab32s(&b->mbo_flags); __swab32s(&b->mbo_rdev); __swab32s(&b->mbo_nlink); - __swab32s(&b->mbo_layout_gen); + CLASSERT(offsetof(typeof(*b), mbo_unused2) != 0); __swab32s(&b->mbo_suppgid); __swab32s(&b->mbo_eadatasize); __swab32s(&b->mbo_aclsize); @@ -1917,8 +1888,8 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab32s(&b->mbo_uid_h); __swab32s(&b->mbo_gid_h); __swab32s(&b->mbo_projid); - __swab64s(&b->mbo_dom_size); - __swab64s(&b->mbo_dom_blocks); + CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0); + CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0); @@ -1926,7 +1897,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) { - /* mio_open_handle is opaque */ + /* mio_handle is opaque */ CLASSERT(offsetof(typeof(*b), mio_unused1) != 
0); CLASSERT(offsetof(typeof(*b), mio_unused2) != 0); CLASSERT(offsetof(typeof(*b), mio_padding) != 0); @@ -1934,39 +1905,38 @@ void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) void lustre_swab_mgs_target_info(struct mgs_target_info *mti) { - int i; - - __swab32s(&mti->mti_lustre_ver); - __swab32s(&mti->mti_stripe_index); - __swab32s(&mti->mti_config_ver); - __swab32s(&mti->mti_flags); - __swab32s(&mti->mti_instance); - __swab32s(&mti->mti_nid_count); - CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); - for (i = 0; i < MTI_NIDS_MAX; i++) - __swab64s(&mti->mti_nids[i]); + int i; + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); } void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) { __u8 i; - __swab64s(&entry->mne_version); - __swab32s(&entry->mne_instance); - __swab32s(&entry->mne_index); - __swab32s(&entry->mne_length); - - /* mne_nid_(count|type) must be one byte size because we're gonna - * access it w/o swapping. */ - CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); - CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); - - /* remove this assertion if ipv6 is supported. */ - LASSERT(entry->mne_nid_type == 0); - for (i = 0; i < entry->mne_nid_count; i++) { - CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); - __swab64s(&entry->u.nids[i]); - } + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. 
*/ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } } EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); @@ -2033,32 +2003,21 @@ static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) __swab32s(&fm_extent->fe_device); } -static void lustre_swab_fiemap_hdr(struct fiemap *fiemap) -{ - __swab64s(&fiemap->fm_start); - __swab64s(&fiemap->fm_length); - __swab32s(&fiemap->fm_flags); - __swab32s(&fiemap->fm_mapped_extents); - __swab32s(&fiemap->fm_extent_count); - __swab32s(&fiemap->fm_reserved); -} - void lustre_swab_fiemap(struct fiemap *fiemap) { __u32 i; - lustre_swab_fiemap_hdr(fiemap); + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); for (i = 0; i < fiemap->fm_mapped_extents; i++) lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); } -void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info) -{ - lustre_swab_obdo(&fiemap_info->lfik_oa); - lustre_swab_fiemap_hdr(&fiemap_info->lfik_fiemap); -} - void lustre_swab_idx_info(struct idx_info *ii) { __swab32s(&ii->ii_magic); @@ -2106,7 +2065,6 @@ void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) __swab32s(&rr->rr_flags); __swab32s(&rr->rr_flags_h); __swab32s(&rr->rr_umask); - __swab16s(&rr->rr_mirror_id); CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); }; @@ -2161,37 +2119,14 @@ void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) } EXPORT_SYMBOL(lustre_swab_lmv_mds_md); -void lustre_swab_lmv_user_md_objects(struct lmv_user_mds_data *lmd, - int stripe_count) -{ - int i; - - for (i = 0; i < stripe_count; i++) - __swab32s(&(lmd[i].lum_mds)); -} -EXPORT_SYMBOL(lustre_swab_lmv_user_md_objects); - - void lustre_swab_lmv_user_md(struct lmv_user_md *lum) { - __u32 count = lum->lum_stripe_count; - __swab32s(&lum->lum_magic); __swab32s(&lum->lum_stripe_count); __swab32s(&lum->lum_stripe_offset); __swab32s(&lum->lum_hash_type); __swab32s(&lum->lum_type); CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); - switch (lum->lum_magic) { - case LMV_USER_MAGIC_SPECIFIC: - count = lum->lum_stripe_count; - /* fallthrough */ - case __swab32(LMV_USER_MAGIC_SPECIFIC): - lustre_swab_lmv_user_md_objects(lum->lum_objects, count); - break; - default: - break; - } } EXPORT_SYMBOL(lustre_swab_lmv_user_md); @@ -2251,7 +2186,6 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); - CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count); for (i = 0; i < comp_v1->lcm_entry_count; i++) { struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; @@ -2260,9 +2194,6 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, CDEBUG(lvl, "\tentry %d:\n", i); CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); - if (ent->lcme_flags & LCME_FL_NOSYNC) - CDEBUG(lvl, "\tlcme_timestamp: %llu\n", - ent->lcme_timestamp); CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", ent->lcme_extent.e_start); CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", @@ -2336,7 +2267,6 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) __swab32s(&lum->lcm_layout_gen); __swab16s(&lum->lcm_flags); 
__swab16s(&lum->lcm_entry_count); - __swab16s(&lum->lcm_mirror_count); CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0); CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0); @@ -2351,13 +2281,11 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) } __swab32s(&ent->lcme_id); __swab32s(&ent->lcme_flags); - __swab64s(&ent->lcme_timestamp); __swab64s(&ent->lcme_extent.e_start); __swab64s(&ent->lcme_extent.e_end); __swab32s(&ent->lcme_offset); __swab32s(&ent->lcme_size); - __swab32s(&ent->lcme_layout_gen); - CLASSERT(offsetof(typeof(*ent), lcme_padding_1) != 0); + CLASSERT(offsetof(typeof(*ent), lcme_padding) != 0); v1 = (struct lov_user_md_v1 *)((char *)lum + off); stripe_count = v1->lmm_stripe_count; @@ -2386,83 +2314,6 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) } EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1); -void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, - int stripe_count) -{ - int i; - ENTRY; - for (i = 0; i < stripe_count; i++) { - lustre_swab_ost_id(&(lod[i].l_ost_oi)); - __swab32s(&(lod[i].l_ost_gen)); - __swab32s(&(lod[i].l_ost_idx)); - } - EXIT; -} -EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); - -void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size) -{ - struct lov_user_md_v1 *v1; - struct lov_user_md_v3 *v3; - __u16 stripe_count; - ENTRY; - - CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); - switch (lum->lmm_magic) { - case __swab32(LOV_MAGIC_V1): - case LOV_USER_MAGIC_V1: - { - v1 = (struct lov_user_md_v1 *)lum; - stripe_count = v1->lmm_stripe_count; - - if (lum->lmm_magic != LOV_USER_MAGIC_V1) - __swab16s(&stripe_count); - - lustre_swab_lov_user_md_v1(v1); - if (size > sizeof(*v1)) - lustre_swab_lov_user_md_objects(v1->lmm_objects, - stripe_count); - - break; - } - case __swab32(LOV_MAGIC_V3): - case LOV_USER_MAGIC_V3: - { - v3 = (struct lov_user_md_v3 *)lum; - stripe_count = v3->lmm_stripe_count; - - if (lum->lmm_magic != LOV_USER_MAGIC_V3) - __swab16s(&stripe_count); - - lustre_swab_lov_user_md_v3(v3); - if (size > sizeof(*v3)) - lustre_swab_lov_user_md_objects(v3->lmm_objects, - stripe_count); - break; - } - case __swab32(LOV_USER_MAGIC_SPECIFIC): - case LOV_USER_MAGIC_SPECIFIC: - { - v3 = (struct lov_user_md_v3 *)lum; - stripe_count = v3->lmm_stripe_count; - - if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC) - __swab16s(&stripe_count); - - lustre_swab_lov_user_md_v3(v3); - lustre_swab_lov_user_md_objects(v3->lmm_objects, stripe_count); - break; - } - case __swab32(LOV_MAGIC_COMP_V1): - case LOV_USER_MAGIC_COMP_V1: - lustre_swab_lov_comp_md_v1((struct lov_comp_md_v1 *)lum); - break; - default: - CDEBUG(D_IOCTL, "Invalid LOV magic %08x\n", lum->lmm_magic); - } -} -EXPORT_SYMBOL(lustre_swab_lov_user_md); - void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) { ENTRY; @@ -2477,6 +2328,20 @@ void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) } EXPORT_SYMBOL(lustre_swab_lov_mds_md); +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + ENTRY; + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) { int i; @@ -2570,51 +2435,54 @@ void dump_obdo(struct obdo *oa) if (valid & OBD_MD_FLFID) CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", oa->o_parent_seq); - if (valid & OBD_MD_FLSIZE) + if (valid & OBD_MD_FLSIZE) CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", 
oa->o_size); - if (valid & OBD_MD_FLMTIME) + if (valid & OBD_MD_FLMTIME) CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); - if (valid & OBD_MD_FLATIME) + if (valid & OBD_MD_FLATIME) CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); - if (valid & OBD_MD_FLCTIME) + if (valid & OBD_MD_FLCTIME) CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); - if (valid & OBD_MD_FLGRANT) + if (valid & OBD_MD_FLGRANT) CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); - if (valid & OBD_MD_FLBLKSZ) - CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); - if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) - CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", - oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | - (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); - if (valid & OBD_MD_FLFLAGS) - CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); - if (valid & OBD_MD_FLNLINK) - CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); - else if (valid & OBD_MD_FLCKSUM) - CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", - oa->o_nlink); - if (valid & OBD_MD_FLPARENT) - CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", - oa->o_parent_oid); - if (valid & OBD_MD_FLFID) { - CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", - oa->o_stripe_idx); - CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", - oa->o_parent_ver); - } - if (valid & OBD_MD_FLHANDLE) + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLGENER) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLEPOCH) + CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", + oa->o_ioepoch); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", - oa->o_handle.cookie); + oa->o_handle.cookie); } void dump_ost_body(struct ost_body *ob) @@ -2761,17 +2629,12 @@ void lustre_swab_hsm_user_item(struct hsm_user_item *hui) lustre_swab_hsm_extent(&hui->hui_extent); } -void lustre_swab_lu_extent(struct lu_extent *le) -{ - __swab64s(&le->e_start); - __swab64s(&le->e_end); -} - void lustre_swab_layout_intent(struct layout_intent *li) { __swab32s(&li->li_opc); __swab32s(&li->li_flags); - lustre_swab_lu_extent(&li->li_extent); + __swab64s(&li->li_start); + __swab64s(&li->li_end); } void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) @@ -2883,19 +2746,6 @@ void lustre_swab_close_data(struct close_data *cd) __swab64s(&cd->cd_data_version); } -void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync) -{ - int i; - - __swab32s(&resync->resync_count); - /* after swab, resync_count must in CPU endian */ - if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { - for (i = 0; i < resync->resync_count; i++) - __swab32s(&resync->resync_ids_inline[i]); - } -} -EXPORT_SYMBOL(lustre_swab_close_data_resync_done); - void lustre_swab_lfsck_request(struct lfsck_request *lr) { __swab32s(&lr->lr_event); @@ -2947,18 +2797,6 @@ void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) } EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); -void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent) -{ - lustre_swab_lu_fid(&ent->loe_key); - lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); - lustre_swab_ost_layout(&ent->loe_rec.lor_layout); - __swab32s(&ent->loe_rec.lor_layout_version); - __swab32s(&ent->loe_rec.lor_range); - CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_1) != 0); - CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_2) != 0); -} -EXPORT_SYMBOL(lustre_swab_orphan_ent_v3); - void lustre_swab_ladvise(struct lu_ladvise *ladvise) { __swab16s(&ladvise->lla_advice); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c index d0c8fa7a1e6ac..51e17e2c2b459 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c @@ -44,8 +44,6 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, int mdidx) { - unsigned int start = desc->bd_mds_off[mdidx]; - CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); LASSERT(mdidx < desc->bd_md_max_brw); @@ -53,34 +51,23 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, 
LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | LNET_MD_PHYS))); - /* just send a lnet header */ - if (mdidx >= desc->bd_md_count) { - if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) - md->options |= LNET_MD_KIOV; - else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) - md->options |= LNET_MD_IOVEC; - md->length = 0; - md->start = NULL; - return; - } - - if (mdidx == (desc->bd_md_count - 1)) - md->length = desc->bd_iov_count - start; - else - md->length = desc->bd_mds_off[mdidx + 1] - start; + md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV); + md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) { md->options |= LNET_MD_KIOV; if (GET_ENC_KIOV(desc)) - md->start = &BD_GET_ENC_KIOV(desc, start); + md->start = &BD_GET_ENC_KIOV(desc, mdidx * + LNET_MAX_IOV); else - md->start = &BD_GET_KIOV(desc, start); + md->start = &BD_GET_KIOV(desc, mdidx * LNET_MAX_IOV); } else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) { md->options |= LNET_MD_IOVEC; if (GET_ENC_KVEC(desc)) - md->start = &BD_GET_ENC_KVEC(desc, start); + md->start = &BD_GET_ENC_KVEC(desc, mdidx * + LNET_MAX_IOV); else - md->start = &BD_GET_KVEC(desc, start); + md->start = &BD_GET_KVEC(desc, mdidx * LNET_MAX_IOV); } } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c index d965c0838d8d5..15fb0965241eb 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c @@ -37,7 +37,6 @@ #define DEBUG_SUBSYSTEM S_RPC #include -#include #include #include #include "ptlrpc_internal.h" @@ -49,6 +48,8 @@ MODULE_PARM_DESC(suppress_pings, "Suppress pings"); struct mutex pinger_mutex; static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports); +static struct list_head timeout_list = + LIST_HEAD_INIT(timeout_list); int ptlrpc_pinger_suppress_pings() { @@ -90,51 +91,11 @@ int ptlrpc_obd_ping(struct obd_device *obd) } EXPORT_SYMBOL(ptlrpc_obd_ping); -static bool ptlrpc_check_import_is_idle(struct obd_import *imp) -{ - struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; - time64_t now; - - if (!imp->imp_idle_timeout) - return false; - - if (atomic_read(&imp->imp_reqs) > 0) - return false; - - /* any lock increases ns_bref being a resource holder */ - if (ns && atomic_read(&ns->ns_bref) > 0) - return false; - - now = ktime_get_real_seconds(); - if (now - imp->imp_last_reply_time < imp->imp_idle_timeout) - return false; - - return true; -} - -static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) -{ -#ifdef CONFIG_LUSTRE_FS_PINGER - time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; - - if (imp->imp_state == LUSTRE_IMP_DISCON) { - time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN, - AT_OFF ? 0 : - at_get(&imp->imp_at.iat_net_latency)); - time = min(time, dtime); - } - imp->imp_next_ping = ktime_get_seconds() + time; -#endif /* CONFIG_LUSTRE_FS_PINGER */ -} - static int ptlrpc_ping(struct obd_import *imp) { struct ptlrpc_request *req; ENTRY; - if (ptlrpc_check_import_is_idle(imp)) - RETURN(ptlrpc_disconnect_and_idle_import(imp)); - req = ptlrpc_prep_ping(imp); if (req == NULL) { CERROR("OOM trying to ping %s->%s\n", @@ -145,20 +106,28 @@ static int ptlrpc_ping(struct obd_import *imp) DEBUG_REQ(D_INFO, req, "pinging %s->%s", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); - /* Updating imp_next_ping early, it allows pinger_check_timeout to - * see an actual time for next awake. 
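The ptlrpc_fill_bulk_md() change above returns to fixed-size slicing: a bulk descriptor's I/O vector is carved into memory descriptors of at most LNET_MAX_IOV entries, with fragment mdidx starting at entry mdidx * LNET_MAX_IOV. A small sketch of that slicing follows; the LNET_MAX_IOV value of 256 and the 600-entry example are assumptions for illustration only.

#include <stdio.h>

#define LNET_MAX_IOV 256   /* assumed max iov entries per memory descriptor */

/* For fragment mdidx of a descriptor with iov_count entries, return the
 * starting entry and store the fragment length in *len; return -1 when the
 * fragment index is past the end of the vector. */
static int md_fragment(int iov_count, int mdidx, int *len)
{
	int start = mdidx * LNET_MAX_IOV;

	if (start >= iov_count) {
		*len = 0;
		return -1;
	}
	*len = iov_count - start;
	if (*len > LNET_MAX_IOV)
		*len = LNET_MAX_IOV;
	return start;
}

int main(void)
{
	int iov_count = 600;   /* e.g. a 600-page bulk transfer */
	int mdidx;

	for (mdidx = 0; ; mdidx++) {
		int len, start = md_fragment(iov_count, mdidx, &len);

		if (start < 0)
			break;
		printf("md %d: entries [%d, %d)\n", mdidx, start, start + len);
	}
	return 0;
}

The demo splits 600 entries into fragments [0,256), [256,512) and [512,600), matching the min/max clamping in the restored code.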
request_out_callback update - * happens at another thread, and ptlrpc_pinger_main may sleep - * already. - */ - ptlrpc_update_next_ping(imp, 0); ptlrpcd_add_req(req); RETURN(0); } +static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ +#ifdef ENABLE_PINGER + int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + int dtime = max_t(int, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = cfs_time_shift(time); +#endif /* ENABLE_PINGER */ +} + void ptlrpc_ping_import_soon(struct obd_import *imp) { - imp->imp_next_ping = ktime_get_seconds(); + imp->imp_next_ping = cfs_time_current(); } static inline int imp_is_deactive(struct obd_import *imp) @@ -167,36 +136,34 @@ static inline int imp_is_deactive(struct obd_import *imp) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); } -static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp) +static inline int ptlrpc_next_reconnect(struct obd_import *imp) { - return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT; + if (imp->imp_server_timeout) + return cfs_time_shift(obd_timeout / 2); + else + return cfs_time_shift(obd_timeout); } -static s32 pinger_check_timeout(time64_t time) +static cfs_duration_t pinger_check_timeout(cfs_time_t time) { - s32 timeout = PING_INTERVAL; - s32 next_timeout; - time64_t now; - struct list_head *iter; - struct obd_import *imp; + struct timeout_item *item; + cfs_time_t timeout = PING_INTERVAL; + /* This list is sorted in increasing timeout order */ mutex_lock(&pinger_mutex); - now = ktime_get_seconds(); - /* Process imports to find a nearest next ping */ - list_for_each(iter, &pinger_imports) { - imp = list_entry(iter, struct obd_import, imp_pinger_chain); - if (!imp->imp_pingable || imp->imp_next_ping < now) - continue; - next_timeout = imp->imp_next_ping - now; - /* make sure imp_next_ping in the future from time */ - if (next_timeout > (now - time) && timeout > next_timeout) - timeout = next_timeout; + list_for_each_entry(item, &timeout_list, ti_chain) { + int ti_timeout = item->ti_timeout; + if (timeout > ti_timeout) + timeout = ti_timeout; + break; } mutex_unlock(&pinger_mutex); - return timeout - (now - time); + return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), + cfs_time_current()); } + static bool ir_up; void ptlrpc_pinger_ir_up(void) @@ -214,7 +181,7 @@ void ptlrpc_pinger_ir_down(void) EXPORT_SYMBOL(ptlrpc_pinger_ir_down); static void ptlrpc_pinger_process_import(struct obd_import *imp, - time64_t this_ping) + unsigned long this_ping) { int level; int force; @@ -233,13 +200,16 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, imp->imp_force_verify = 0; - if (imp->imp_next_ping - 5 >= this_ping && !force) { + if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && + !force) { spin_unlock(&imp->imp_lock); return; } imp->imp_force_next_verify = 0; + spin_unlock(&imp->imp_lock); + CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, "%s->%s: level %s/%u " "force %u force_next %u deactive %u pingable %u suppress %u\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), @@ -249,91 +219,130 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { /* wait for a while before trying recovery again */ imp->imp_next_ping = ptlrpc_next_reconnect(imp); - spin_unlock(&imp->imp_lock); if (!imp->imp_no_pinger_recover) ptlrpc_initiate_recovery(imp); - } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || - imp_is_deactive(imp)) { + } else if (level != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { CDEBUG(D_HA, "%s->%s: not pinging (in recovery " "or recovery disabled: %s)\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(level)); - if (force) + if (force) { + spin_lock(&imp->imp_lock); imp->imp_force_verify = 1; - spin_unlock(&imp->imp_lock); + spin_unlock(&imp->imp_lock); + } } else if ((imp->imp_pingable && !suppress) || force_next || force) { - spin_unlock(&imp->imp_lock); ptlrpc_ping(imp); - } else { - spin_unlock(&imp->imp_lock); } } -static struct workqueue_struct *pinger_wq; -static void ptlrpc_pinger_main(struct work_struct *ws); -static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); - -static void ptlrpc_pinger_main(struct work_struct *ws) +static int ptlrpc_pinger_main(void *arg) { - time64_t this_ping, time_after_ping; - s32 time_to_next_wake; - struct obd_import *imp; - struct list_head *iter; + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + ENTRY; - do { - this_ping = ktime_get_seconds(); + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + /* And now, loop forever, pinging as needed. */ + while (1) { + cfs_time_t this_ping = cfs_time_current(); + struct l_wait_info lwi; + cfs_duration_t time_to_next_wake; + struct timeout_item *item; + struct list_head *iter; mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) + item->ti_cb(item, item->ti_cb_data); list_for_each(iter, &pinger_imports) { - imp = list_entry(iter, struct obd_import, - imp_pinger_chain); - - ptlrpc_pinger_process_import(imp, this_ping); - /* obd_timeout might have changed */ - if (imp->imp_pingable && imp->imp_next_ping && - imp->imp_next_ping > this_ping + PING_INTERVAL) - ptlrpc_update_next_ping(imp, 0); - } + struct obd_import *imp = list_entry(iter, + struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + cfs_time_after(imp->imp_next_ping, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL)))) + ptlrpc_update_next_ping(imp, 0); + } mutex_unlock(&pinger_mutex); + /* update memory usage info */ + obd_update_maxusage(); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + say .01 second after this. + ptlrpc_pinger_sending_on_import will then set the + next ping time to next_ping + .01 sec, which means + we will SKIP the next ping at next_ping, and the + ping will get sent 2 timeouts from now! Beware. 
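Both variants of pinger_check_timeout() in this hunk answer the same question: how long the pinger thread may sleep before the earliest pending ping, never longer than PING_INTERVAL, measured against the moment the current ping round started. A simplified sketch of the per-import variant being removed follows; the struct, the PING_INTERVAL value of 25 s and the sample times are assumptions for the demo.

#include <stdio.h>
#include <stdint.h>

#define PING_INTERVAL 25   /* demo value; the real interval derives from obd_timeout */

struct import {
	int64_t imp_next_ping;   /* absolute time (s) of the next scheduled ping */
	int     imp_pingable;
};

/* Return how many seconds the pinger may sleep, given 'time' (when this ping
 * round started) and 'now', bounded above by PING_INTERVAL. */
static int64_t pinger_check_timeout(const struct import *imps, int n,
				    int64_t time, int64_t now)
{
	int64_t timeout = PING_INTERVAL;
	int i;

	for (i = 0; i < n; i++) {
		const struct import *imp = &imps[i];
		int64_t next;

		if (!imp->imp_pingable || imp->imp_next_ping < now)
			continue;
		next = imp->imp_next_ping - now;
		/* only keep pings still in the future relative to 'time' */
		if (next > now - time && next < timeout)
			timeout = next;
	}
	return timeout - (now - time);
}

int main(void)
{
	struct import imps[] = {
		{ .imp_next_ping = 107, .imp_pingable = 1 },
		{ .imp_next_ping = 112, .imp_pingable = 1 },
		{ .imp_next_ping = 103, .imp_pingable = 0 },   /* not pingable */
	};

	printf("sleep for %lld s\n",
	       (long long)pinger_check_timeout(imps, 3, 100, 102));
	return 0;
}

With the sample data the nearest pingable import is due 5 s after 'now', and the function reports 3 s because 2 s of the round have already elapsed, mirroring the subtraction in the removed code.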
*/ + CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (%ld)\n", + time_to_next_wake, + cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL))); + if (time_to_next_wake > 0) { + lwi = LWI_TIMEOUT(max_t(cfs_duration_t, + time_to_next_wake, + cfs_time_seconds(1)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { + EXIT; + break; + } else { + /* woken after adding import to reset timer */ + thread_test_and_clear_flags(thread, SVC_EVENT); + } + } + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); - time_after_ping = ktime_get_seconds(); - /* update memory usage info */ - obd_update_maxusage(); - - if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL) - CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n", - this_ping, time_after_ping, ktime_get_seconds()); - - /* Wait until the next ping time, or until we're stopped. */ - time_to_next_wake = pinger_check_timeout(this_ping); - /* The ping sent by ptlrpc_send_rpc may get sent out - * say .01 second after this. - * ptlrpc_pinger_sending_on_import will then set the - * next ping time to next_ping + .01 sec, which means - * we will SKIP the next ping at next_ping, and the - * ping will get sent 2 timeouts from now! Beware. */ - CDEBUG(D_INFO, "next wakeup in %d (%lld)\n", - time_to_next_wake, this_ping + PING_INTERVAL); - } while (time_to_next_wake <= 0); - - queue_delayed_work(pinger_wq, &ping_work, - cfs_time_seconds(max(time_to_next_wake, 1))); + CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); + return 0; } +static struct ptlrpc_thread pinger_thread; + int ptlrpc_start_pinger(void) { -#ifdef ENABLE_PINGER - if (pinger_wq) - return -EALREADY; + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + int rc; +#ifndef ENABLE_PINGER + return 0; +#endif + ENTRY; + + if (!thread_is_init(&pinger_thread) && + !thread_is_stopped(&pinger_thread)) + RETURN(-EALREADY); + + init_waitqueue_head(&pinger_thread.t_ctl_waitq); + + strcpy(pinger_thread.t_name, "ll_ping"); - pinger_wq = alloc_workqueue("ptlrpc_pinger", 0, 1); - if (!pinger_wq) { - CERROR("cannot start pinger workqueue\n"); - return -ENOMEM; + task = kthread_run(ptlrpc_pinger_main, &pinger_thread, + pinger_thread.t_name); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("cannot start pinger thread: rc = %d\n", rc); + RETURN(rc); } - queue_delayed_work(pinger_wq, &ping_work, 0); + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_running(&pinger_thread), &lwi); if (suppress_pings) CWARN("Pings will be suppressed at the request of the " @@ -341,21 +350,32 @@ int ptlrpc_start_pinger(void) "additional requirements described in the manual. 
" "(Search for the \"suppress_pings\" kernel module " "parameter.)\n"); -#endif - return 0; + + RETURN(0); } +int ptlrpc_pinger_remove_timeouts(void); + int ptlrpc_stop_pinger(void) { -#ifdef ENABLE_PINGER - if (!pinger_wq) - return -EALREADY; - - cancel_delayed_work_sync(&ping_work); - destroy_workqueue(pinger_wq); - pinger_wq = NULL; -#endif + struct l_wait_info lwi = { 0 }; +#ifndef ENABLE_PINGER return 0; +#endif + ENTRY; + + if (thread_is_init(&pinger_thread) || + thread_is_stopped(&pinger_thread)) + RETURN(-EALREADY); + + ptlrpc_pinger_remove_timeouts(); + + thread_set_flags(&pinger_thread, SVC_STOPPING); + wake_up(&pinger_thread.t_ctl_waitq); + + l_wait_event(pinger_thread.t_ctl_waitq, + thread_is_stopped(&pinger_thread), &lwi); + RETURN(0); } void ptlrpc_pinger_sending_on_import(struct obd_import *imp) @@ -420,10 +440,129 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) } EXPORT_SYMBOL(ptlrpc_pinger_del_import); +/** + * Register a timeout callback to the pinger list, and the callback will + * be called when timeout happens. + */ +static struct timeout_item *ptlrpc_new_timeout(int time, + enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *ti; + + OBD_ALLOC_PTR(ti); + if (!ti) + return(NULL); + + INIT_LIST_HEAD(&ti->ti_obd_list); + INIT_LIST_HEAD(&ti->ti_chain); + ti->ti_timeout = time; + ti->ti_event = event; + ti->ti_cb = cb; + ti->ti_cb_data = data; + + return ti; +} + +/** + * Register timeout event on the the pinger thread. + * Note: the timeout list is an sorted list with increased timeout value. + */ +static struct timeout_item* +ptlrpc_pinger_register_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *item, *tmp; + + LASSERT(mutex_is_locked(&pinger_mutex)); + + list_for_each_entry(item, &timeout_list, ti_chain) + if (item->ti_event == event) + goto out; + + item = ptlrpc_new_timeout(time, event, cb, data); + if (item) { + list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { + if (tmp->ti_timeout < time) { + list_add(&item->ti_chain, &tmp->ti_chain); + goto out; + } + } + list_add(&item->ti_chain, &timeout_list); + } +out: + return item; +} + +/* Add a client_obd to the timeout event list, when timeout(@time) + * happens, the callback(@cb) will be called. 
+ */ +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list) +{ + struct timeout_item *ti; + + mutex_lock(&pinger_mutex); + ti = ptlrpc_pinger_register_timeout(time, event, cb, data); + if (!ti) { + mutex_unlock(&pinger_mutex); + return (-EINVAL); + } + list_add(obd_list, &ti->ti_obd_list); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_add_timeout_client); + +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event) +{ + struct timeout_item *ti = NULL, *item; + + if (list_empty(obd_list)) + return 0; + mutex_lock(&pinger_mutex); + list_del_init(obd_list); + /** + * If there are no obd attached to the timeout event + * list, remove this timeout event from the pinger + */ + list_for_each_entry(item, &timeout_list, ti_chain) { + if (item->ti_event == event) { + ti = item; + break; + } + } + LASSERTF(ti != NULL, "ti is NULL !\n"); + if (list_empty(&ti->ti_obd_list)) { + list_del(&ti->ti_chain); + OBD_FREE_PTR(ti); + } + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_del_timeout_client); + +int ptlrpc_pinger_remove_timeouts(void) +{ + struct timeout_item *item, *tmp; + + mutex_lock(&pinger_mutex); + list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { + LASSERT(list_empty(&item->ti_obd_list)); + list_del(&item->ti_chain); + OBD_FREE_PTR(item); + } + mutex_unlock(&pinger_mutex); + return 0; +} + void ptlrpc_pinger_wake_up() { #ifdef ENABLE_PINGER - mod_delayed_work(pinger_wq, &ping_work, 0); + thread_add_flags(&pinger_thread, SVC_EVENT); + wake_up(&pinger_thread.t_ctl_waitq); #endif } @@ -461,12 +600,12 @@ int ping_evictor_wake(struct obd_export *exp) static int ping_evictor_main(void *arg) { - struct obd_device *obd; - struct obd_export *exp; - struct l_wait_info lwi = { 0 }; - time64_t expire_time; + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + ENTRY; - ENTRY; unshare_fs_struct(); CDEBUG(D_HA, "Starting Ping Evictor\n"); @@ -487,9 +626,9 @@ static int ping_evictor_main(void *arg) obd_evict_list); spin_unlock(&pet_lock); - expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT; + expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; - CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n", + CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", obd->obd_name, expire_time); /* Exports can't be deleted out of the list while we hold @@ -505,19 +644,19 @@ static int ping_evictor_main(void *arg) class_export_get(exp); spin_unlock(&obd->obd_dev_lock); LCONSOLE_WARN("%s: haven't heard from client %s" - " (at %s) in %lld seconds. I think" + " (at %s) in %ld seconds. I think" " it's dead, and I am evicting" - " it. exp %p, cur %lld expire %lld" - " last %lld\n", + " it. 
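ptlrpc_pinger_register_timeout() above keeps timeout_list ordered by increasing ti_timeout and reuses an existing item when the event is already registered, which is why it scans for the event first and then walks the list in reverse to find the insertion point. A compact userspace sketch of the same sorted-insert idea using a singly linked list; the helper names and the integer event codes are illustrative, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

/* A timeout item, modelled loosely on struct timeout_item: the pinger keeps
 * these sorted by increasing ti_timeout and runs each callback per loop. */
struct timeout_item {
	int ti_timeout;                 /* seconds */
	int ti_event;                   /* which subsystem registered it */
	struct timeout_item *ti_next;
};

/* Insert a new item while keeping the list sorted by increasing timeout;
 * if the event is already registered, return the existing item instead. */
static struct timeout_item *register_timeout(struct timeout_item **head,
					     int time, int event)
{
	struct timeout_item **pos, *ti;

	for (ti = *head; ti != NULL; ti = ti->ti_next)
		if (ti->ti_event == event)
			return ti;

	ti = calloc(1, sizeof(*ti));
	if (ti == NULL)
		return NULL;
	ti->ti_timeout = time;
	ti->ti_event = event;

	for (pos = head; *pos != NULL && (*pos)->ti_timeout < time;
	     pos = &(*pos)->ti_next)
		;                       /* advance to the first larger timeout */
	ti->ti_next = *pos;
	*pos = ti;
	return ti;
}

int main(void)
{
	struct timeout_item *head = NULL, *ti;

	register_timeout(&head, 30, 1);
	register_timeout(&head, 10, 2);
	register_timeout(&head, 20, 3);
	register_timeout(&head, 20, 3);         /* duplicate event: reused */

	for (ti = head; ti != NULL; ti = ti->ti_next)
		printf("event %d fires every %d s\n", ti->ti_event,
		       ti->ti_timeout);
	return 0;
}

The demo prints the events in 10/20/30 order regardless of registration order, which is the invariant the restored pinger code depends on when it picks the head of the list as the shortest timeout.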
exp %p, cur %ld expire %ld" + " last %ld\n", obd->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), - ktime_get_real_seconds() - - exp->exp_last_request_time, - exp, ktime_get_real_seconds(), - expire_time, - exp->exp_last_request_time); - CDEBUG(D_HA, "Last request was at %lld\n", + (long)(cfs_time_current_sec() - + exp->exp_last_request_time), + exp, (long)cfs_time_current_sec(), + (long)expire_time, + (long)exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %ld\n", exp->exp_last_request_time); class_fail_export(exp); class_export_put(exp); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h index 41b9a268d52a6..cfd1de5bb3d45 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -69,7 +69,7 @@ int ptlrpcd_start(struct ptlrpcd_ctl *pc); /* client.c */ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - timeout_t service_timeout); + unsigned int service_time); struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, enum ptlrpc_bulk_op_type type, unsigned portal, @@ -83,7 +83,7 @@ void ptlrpc_init_xid(void); void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, struct ptlrpc_request *req); int ptlrpc_expired_set(void *data); -time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); void ptlrpc_resend_req(struct ptlrpc_request *request); void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req); void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); @@ -97,8 +97,7 @@ void ptlrpc_exit_portals(void); void ptlrpc_request_handle_notconn(struct ptlrpc_request *); void lustre_assert_wire_constants(void); int ptlrpc_import_in_recovery(struct obd_import *imp); -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt, - bool invalid); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); void ptlrpc_handle_failed_import(struct obd_import *imp); int ptlrpc_replay_next(struct obd_import *imp, int *inflight); void ptlrpc_initiate_recovery(struct obd_import *imp); @@ -106,18 +105,15 @@ void ptlrpc_initiate_recovery(struct obd_import *imp); int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); -int ptlrpc_sysfs_register_service(struct kset *parent, - struct ptlrpc_service *svc); -void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc); - -void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry, - struct ptlrpc_service *svc); #ifdef CONFIG_PROC_FS +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, + struct ptlrpc_service *svc); void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, long q_usec, long work_usec); #else +#define ptlrpc_lprocfs_register_service(params...) do{}while(0) #define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) #define ptlrpc_lprocfs_rpc_sent(params...) 
do{}while(0) #define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c index b98d082660628..0532c4d22d8bd 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -212,7 +212,7 @@ void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) LASSERT(req->rq_phase == RQ_PHASE_NEW); req->rq_set = new; - req->rq_queued_time = ktime_get_seconds(); + req->rq_queued_time = cfs_time_current(); } spin_lock(&new->set_new_req_lock); @@ -476,7 +476,7 @@ static int ptlrpcd(void *arg) */ do { struct l_wait_info lwi; - time64_t timeout; + int timeout; timeout = ptlrpc_set_next_timeout(set); lwi = LWI_TIMEOUT(cfs_time_seconds(timeout), @@ -503,11 +503,11 @@ static int ptlrpcd(void *arg) */ } while (exit < 2); - /* - * Wait for inflight requests to drain. - */ + /* + * Wait for inflight requests to drain. + */ if (!list_empty(&set->set_requests)) - ptlrpc_set_wait(&env, set); + ptlrpc_set_wait(set); lu_context_fini(&env.le_ctx); lu_context_fini(env.le_ses); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c index c923ab9386901..aacb929beae23 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -228,22 +228,30 @@ void ptlrpc_wake_delayed(struct obd_import *imp) void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) { - struct obd_import *imp = failed_req->rq_import; - int conn = lustre_msg_get_conn_cnt(failed_req->rq_reqmsg); - ENTRY; - - CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); + struct obd_import *imp = failed_req->rq_import; + ENTRY; - if (ptlrpc_set_import_discon(imp, conn, true)) { - /* to control recovery via lctl {disable|enable}_recovery */ - if (imp->imp_deactive == 0) - ptlrpc_connect_import(imp); - } + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, + lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, " + "auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } - /* Wait for recovery to complete and resend. If evicted, then - this request will be errored out later.*/ + /* Wait for recovery to complete and resend. 
If evicted, then + this request will be errored out later.*/ spin_lock(&failed_req->rq_lock); if (!failed_req->rq_no_resend) failed_req->rq_resend = 1; @@ -253,7 +261,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) } /** - * Administratively active/deactive a client. + * Administratively active/deactive a client. * This should only be called by the ioctl interface, currently * - the lctl deactivate and activate commands * - echo 0/1 >> /proc/osc/XXX/active @@ -312,21 +320,21 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) atomic_read(&imp->imp_inval_count)) rc = -EINVAL; spin_unlock(&imp->imp_lock); - if (rc) - GOTO(out, rc); + if (rc) + GOTO(out, rc); - /* force import to be disconnected. */ - ptlrpc_set_import_discon(imp, 0, false); + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0); - if (new_uuid) { - struct obd_uuid uuid; + if (new_uuid) { + struct obd_uuid uuid; - /* intruct import to use new uuid */ - obd_str2uuid(&uuid, new_uuid); - rc = import_set_conn_priority(imp, &uuid); - if (rc) - GOTO(out, rc); - } + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } /* Check if reconnect is already in progress */ spin_lock(&imp->imp_lock); @@ -346,9 +354,9 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) if (!async) { struct l_wait_info lwi; - long secs = cfs_time_seconds(obd_timeout); + int secs = cfs_time_seconds(obd_timeout); - CDEBUG(D_HA, "%s: recovery started, waiting %lu seconds\n", + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", obd2cli_tgt(imp->imp_obd), secs); lwi = LWI_TIMEOUT(secs, NULL, NULL); @@ -369,8 +377,9 @@ int ptlrpc_import_in_recovery(struct obd_import *imp) int in_recovery = 1; spin_lock(&imp->imp_lock); - if (imp->imp_state <= LUSTRE_IMP_DISCON || - imp->imp_state >= LUSTRE_IMP_FULL || + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED || + imp->imp_state == LUSTRE_IMP_DISCON || imp->imp_obd->obd_no_recov) in_recovery = 0; spin_unlock(&imp->imp_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c index 78c07fcefec3a..92d39ece51d16 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,10 +43,6 @@ #include #include -#ifdef HAVE_LINUX_SELINUX_IS_ENABLED -#include -#endif - #include #include #include @@ -58,10 +54,6 @@ #include "ptlrpc_internal.h" -static int send_sepol; -module_param(send_sepol, int, 0644); -MODULE_PARM_DESC(send_sepol, "Client sends SELinux policy status"); - /*********************************************** * policy registers * ***********************************************/ @@ -410,12 +402,11 @@ static int import_sec_validate_get(struct obd_import *imp, } *sec = sptlrpc_import_sec_ref(imp); + /* Only output an error when the import is still active */ if (*sec == NULL) { - /* Only output an error when the import is still active */ - if (!test_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(&imp->imp_zombie_work))) + if (list_empty(&imp->imp_zombie_chain)) CERROR("import %p (%s) with no sec\n", - imp, ptlrpc_import_state_name(imp->imp_state)); + imp, ptlrpc_import_state_name(imp->imp_state)); return -EACCES; } @@ -718,12 +709,12 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) sptlrpc_sec_put(sec); if (cli_ctx_is_eternal(ctx)) - RETURN(0); + RETURN(0); if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { - if (ctx->cc_ops->refresh) - ctx->cc_ops->refresh(ctx); - } + LASSERT(ctx->cc_ops->refresh); + ctx->cc_ops->refresh(ctx); + } LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); LASSERT(ctx->cc_ops->validate); @@ -845,30 +836,7 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) RETURN(rc); } - goto again; -} - -/* Bring ptlrpc_sec context up-to-date */ -int sptlrpc_export_update_ctx(struct obd_export *exp) -{ - struct obd_import *imp = exp ? exp->exp_imp_reverse : NULL; - struct ptlrpc_sec *sec = NULL; - struct ptlrpc_cli_ctx *ctx = NULL; - int rc = 0; - - if (imp) - sec = sptlrpc_import_sec_ref(imp); - if (sec) { - ctx = get_my_ctx(sec); - sptlrpc_sec_put(sec); - } - - if (ctx) { - if (ctx->cc_ops->refresh) - rc = ctx->cc_ops->refresh(ctx); - sptlrpc_cli_ctx_put(ctx, 1); - } - return rc; + goto again; } /** @@ -1758,7 +1726,6 @@ void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) req->rq_repmsg = NULL; EXIT; } -EXPORT_SYMBOL(sptlrpc_cli_free_repbuf); int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, struct ptlrpc_cli_ctx *ctx) @@ -1780,128 +1747,6 @@ int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, return policy->sp_sops->install_rctx(imp, ctx); } -/* Get SELinux policy info from userspace */ -static int sepol_helper(struct obd_import *imp) -{ - char mtime_str[21] = { 0 }, mode_str[2] = { 0 }; - char *argv[] = { - [0] = "/usr/sbin/l_getsepol", - [1] = "-o", - [2] = NULL, /* obd type */ - [3] = "-n", - [4] = NULL, /* obd name */ - [5] = "-t", - [6] = mtime_str, /* policy mtime */ - [7] = "-m", - [8] = mode_str, /* enforcing mode */ - [9] = NULL - }; - char *envp[] = { - [0] = "HOME=/", - [1] = "PATH=/sbin:/usr/sbin", - [2] = NULL - }; - signed short ret; - int rc = 0; - - if (imp == NULL || imp->imp_obd == NULL || - imp->imp_obd->obd_type == NULL) { - rc = -EINVAL; - } else { - argv[2] = imp->imp_obd->obd_type->typ_name; - argv[4] = imp->imp_obd->obd_name; - spin_lock(&imp->imp_sec->ps_lock); - if (ktime_to_ns(imp->imp_sec->ps_sepol_mtime) == 0 && - imp->imp_sec->ps_sepol[0] == '\0') { - /* ps_sepol has not been initialized */ - argv[5] = NULL; - argv[7] = NULL; - } else { - time64_t mtime_ms; - - mtime_ms = ktime_to_ms(imp->imp_sec->ps_sepol_mtime); - snprintf(mtime_str, sizeof(mtime_str), "%lld", - mtime_ms 
/ MSEC_PER_SEC); - mode_str[0] = imp->imp_sec->ps_sepol[0]; - } - spin_unlock(&imp->imp_sec->ps_lock); - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - rc = ret>>8; - } - - return rc; -} - -static inline int sptlrpc_sepol_needs_check(struct ptlrpc_sec *imp_sec) -{ - ktime_t checknext; - - if (send_sepol == 0 || !selinux_is_enabled()) - return 0; - - if (send_sepol == -1) - /* send_sepol == -1 means fetch sepol status every time */ - return 1; - - spin_lock(&imp_sec->ps_lock); - checknext = imp_sec->ps_sepol_checknext; - spin_unlock(&imp_sec->ps_lock); - - /* next check is too far in time, please update */ - if (ktime_after(checknext, - ktime_add(ktime_get(), ktime_set(send_sepol, 0)))) - goto setnext; - - if (ktime_before(ktime_get(), checknext)) - /* too early to fetch sepol status */ - return 0; - -setnext: - /* define new sepol_checknext time */ - spin_lock(&imp_sec->ps_lock); - imp_sec->ps_sepol_checknext = ktime_add(ktime_get(), - ktime_set(send_sepol, 0)); - spin_unlock(&imp_sec->ps_lock); - - return 1; -} - -int sptlrpc_get_sepol(struct ptlrpc_request *req) -{ - struct ptlrpc_sec *imp_sec = req->rq_import->imp_sec; - int rc = 0; - - ENTRY; - - (req->rq_sepol)[0] = '\0'; - -#ifndef HAVE_SELINUX - if (unlikely(send_sepol != 0)) - CDEBUG(D_SEC, "Client cannot report SELinux status, " - "it was not built against libselinux.\n"); - RETURN(0); -#endif - - if (send_sepol == 0 || !selinux_is_enabled()) - RETURN(0); - - if (imp_sec == NULL) - RETURN(-EINVAL); - - /* Retrieve SELinux status info */ - if (sptlrpc_sepol_needs_check(imp_sec)) - rc = sepol_helper(req->rq_import); - if (likely(rc == 0)) { - spin_lock(&imp_sec->ps_lock); - memcpy(req->rq_sepol, imp_sec->ps_sepol, - sizeof(req->rq_sepol)); - spin_unlock(&imp_sec->ps_lock); - } - - RETURN(rc); -} -EXPORT_SYMBOL(sptlrpc_get_sepol); - /**************************************** * server side security * ****************************************/ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c index 216c2f2a0820b..42841f0c0aaf1 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,7 @@ #define DEBUG_SUBSYSTEM S_SEC -#include +#include #include #include @@ -114,7 +114,7 @@ static struct ptlrpc_enc_page_pool { unsigned long epp_st_missings; /* # of cache missing */ unsigned long epp_st_lowfree; /* lowest free pages reached */ unsigned int epp_st_max_wqlen; /* highest waitqueue length */ - ktime_t epp_st_max_wait; /* in nanoseconds */ + cfs_time_t epp_st_max_wait; /* in jeffies */ unsigned long epp_st_outofmem; /* # of out of mem requests */ /* * pointers to pools, may be vmalloc'd @@ -143,8 +143,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "total pages: %lu\n" "total free: %lu\n" "idle index: %lu/100\n" - "last shrink: %llds\n" - "last access: %llds\n" + "last shrink: %lds\n" + "last access: %lds\n" "max pages reached: %lu\n" "grows: %u\n" "grows failure: %u\n" @@ -153,7 +153,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "cache missing: %lu\n" "low free mark: %lu\n" "max waitqueue depth: %u\n" - "max wait time ms: %lld\n" + "max wait time: %ld/%lu\n" "out of mem: %lu\n", cfs_totalram_pages(), PAGES_PER_POOL, page_pools.epp_max_pages, @@ -161,8 +161,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_total_pages, page_pools.epp_free_pages, page_pools.epp_idle_idx, - ktime_get_seconds() - page_pools.epp_last_shrink, - ktime_get_seconds() - page_pools.epp_last_access, + (long)(ktime_get_seconds() - page_pools.epp_last_shrink), + (long)(ktime_get_seconds() - page_pools.epp_last_access), page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, @@ -171,7 +171,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_st_missings, page_pools.epp_st_lowfree, page_pools.epp_st_max_wqlen, - ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_max_wait, + msecs_to_jiffies(MSEC_PER_SEC), page_pools.epp_st_outofmem); spin_unlock(&page_pools.epp_lock); @@ -233,7 +234,7 @@ static unsigned long enc_pools_shrink_count(struct shrinker *s, * if no pool access for a long time, we consider it's fully idle. * a little race here is fine. */ - if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > CACHE_QUIESCENT_PERIOD)) { spin_lock(&page_pools.epp_lock); page_pools.epp_idle_idx = IDLE_IDX_MAX; @@ -264,7 +265,7 @@ static unsigned long enc_pools_shrink_scan(struct shrinker *s, (long)sc->nr_to_scan, page_pools.epp_free_pages); page_pools.epp_st_shrinks++; - page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_shrink = ktime_get_real_seconds(); } spin_unlock(&page_pools.epp_lock); @@ -272,7 +273,7 @@ static unsigned long enc_pools_shrink_scan(struct shrinker *s, * if no pool access for a long time, we consider it's fully idle. * a little race here is fine. 
*/ - if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > CACHE_QUIESCENT_PERIOD)) { spin_lock(&page_pools.epp_lock); page_pools.epp_idle_idx = IDLE_IDX_MAX; @@ -541,11 +542,11 @@ EXPORT_SYMBOL(pool_is_at_full_capacity); int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) { wait_queue_entry_t waitlink; - unsigned long this_idle = -1; - u64 tick_ns = 0; - time64_t now; - int p_idx, g_idx; - int i; + unsigned long this_idle = -1; + cfs_time_t tick = 0; + long now; + int p_idx, g_idx; + int i; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); LASSERT(desc->bd_iov_count > 0); @@ -565,8 +566,8 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) page_pools.epp_st_access++; again: if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { - if (tick_ns == 0) - tick_ns = ktime_get_ns(); + if (tick == 0) + tick = cfs_time_current(); now = ktime_get_real_seconds(); @@ -624,13 +625,12 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) goto again; } - /* record max wait time */ - if (unlikely(tick_ns)) { - ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); - - if (ktime_after(tick, page_pools.epp_st_max_wait)) - page_pools.epp_st_max_wait = tick; - } + /* record max wait time */ + if (unlikely(tick != 0)) { + tick = cfs_time_current() - tick; + if (tick > page_pools.epp_st_max_wait) + page_pools.epp_st_max_wait = tick; + } /* proceed with rest of allocation */ page_pools.epp_free_pages -= desc->bd_iov_count; @@ -664,7 +664,7 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) this_idle) / (IDLE_IDX_WEIGHT + 1); - page_pools.epp_last_access = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_real_seconds(); spin_unlock(&page_pools.epp_lock); return 0; @@ -789,8 +789,8 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_growing = 0; page_pools.epp_idle_idx = 0; - page_pools.epp_last_shrink = ktime_get_seconds(); - page_pools.epp_last_access = ktime_get_seconds(); + page_pools.epp_last_shrink = ktime_get_real_seconds(); + page_pools.epp_last_access = ktime_get_real_seconds(); spin_lock_init(&page_pools.epp_lock); page_pools.epp_total_pages = 0; @@ -804,7 +804,7 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_st_missings = 0; page_pools.epp_st_lowfree = 0; page_pools.epp_st_max_wqlen = 0; - page_pools.epp_st_max_wait = ktime_set(0, 0); + page_pools.epp_st_max_wait = 0; page_pools.epp_st_outofmem = 0; enc_pools_alloc(); @@ -838,12 +838,13 @@ void sptlrpc_enc_pool_fini(void) if (page_pools.epp_st_access > 0) { CDEBUG(D_SEC, - "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%lu, out of mem %lu\n", page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, page_pools.epp_st_shrinks, page_pools.epp_st_access, page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, - ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_max_wait, + msecs_to_jiffies(MSEC_PER_SEC), page_pools.epp_st_outofmem); } } @@ -916,7 +917,7 @@ EXPORT_SYMBOL(bulk_sec_desc_unpack); int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, void *buf, int buflen) { - struct ahash_request *req; + struct cfs_crypto_hash_desc *hdesc; int hashsize; unsigned int bufsize; int i, err; @@ -925,17 +926,17 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, 
LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); LASSERT(buflen >= 4); - req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); - if (IS_ERR(req)) { + hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(hdesc)) { CERROR("Unable to initialize checksum hash %s\n", cfs_crypto_hash_name(cfs_hash_alg_id[alg])); - return PTR_ERR(req); + return PTR_ERR(hdesc); } hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); for (i = 0; i < desc->bd_iov_count; i++) { - cfs_crypto_hash_update_page(req, + cfs_crypto_hash_update_page(hdesc, BD_GET_KIOV(desc, i).kiov_page, BD_GET_KIOV(desc, i).kiov_offset & ~PAGE_MASK, @@ -948,11 +949,11 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, bufsize = sizeof(hashbuf); LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", bufsize, hashsize); - err = cfs_crypto_hash_final(req, hashbuf, &bufsize); + err = cfs_crypto_hash_final(hdesc, hashbuf, &bufsize); memcpy(buf, hashbuf, buflen); } else { bufsize = buflen; - err = cfs_crypto_hash_final(req, buf, &bufsize); + err = cfs_crypto_hash_final(hdesc, buf, &bufsize); } return err; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c index b661ff8696530..550abeafceea0 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include "ptlrpc_internal.h" diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c index dc9f38c7036ba..766b21d10c20c 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, Intel Corporation. 
*/ #define DEBUG_SUBSYSTEM S_FILTER diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c index 042a632390cfe..f8ec60b1adb01 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c @@ -36,7 +36,7 @@ #define DEBUG_SUBSYSTEM S_SEC -#include +#include #include #include @@ -48,6 +48,7 @@ #define SEC_GC_INTERVAL (30 * 60) + static struct mutex sec_gc_mutex; static spinlock_t sec_gc_list_lock; static struct list_head sec_gc_list; @@ -55,8 +56,10 @@ static struct list_head sec_gc_list; static spinlock_t sec_gc_ctx_list_lock; static struct list_head sec_gc_ctx_list; +static struct ptlrpc_thread sec_gc_thread; static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) { LASSERT(sec->ps_policy->sp_cops->gc_ctx); @@ -95,9 +98,6 @@ void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); } -static void sec_gc_main(struct work_struct *ws); -static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); - void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) { LASSERT(list_empty(&ctx->cc_gc_chain)); @@ -108,7 +108,8 @@ void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); spin_unlock(&sec_gc_ctx_list_lock); - mod_delayed_work(system_wq, &sec_gc_work, 0); + thread_add_flags(&sec_gc_thread, SVC_SIGNAL); + wake_up(&sec_gc_thread.t_ctl_waitq); } EXPORT_SYMBOL(sptlrpc_gc_add_ctx); @@ -155,41 +156,68 @@ static void sec_do_gc(struct ptlrpc_sec *sec) sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; } -static void sec_gc_main(struct work_struct *ws) +static int sec_gc_main(void *arg) { - struct ptlrpc_sec *sec; + struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; + struct l_wait_info lwi; + + unshare_fs_struct(); + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); - sec_process_ctx_list(); + while (1) { + struct ptlrpc_sec *sec; + + thread_clear_flags(thread, SVC_SIGNAL); + sec_process_ctx_list(); again: - /* go through sec list do gc. - * FIXME here we iterate through the whole list each time which - * is not optimal. we perhaps want to use balanced binary tree - * to trace each sec as order of expiry time. - * another issue here is we wakeup as fixed interval instead of - * according to each sec's expiry time - */ - mutex_lock(&sec_gc_mutex); - list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { - /* if someone is waiting to be deleted, let it - * proceed as soon as possible. - */ - if (atomic_read(&sec_gc_wait_del)) { - CDEBUG(D_SEC, "deletion pending, start over\n"); - mutex_unlock(&sec_gc_mutex); - goto again; + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. 
*/ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + + lwi = LWI_TIMEOUT(msecs_to_jiffies(SEC_GC_INTERVAL * + MSEC_PER_SEC), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_signal(thread), + &lwi); - sec_do_gc(sec); + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; } - mutex_unlock(&sec_gc_mutex); - /* check ctx list again before sleep */ - sec_process_ctx_list(); - schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + return 0; } int sptlrpc_gc_init(void) { + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + mutex_init(&sec_gc_mutex); spin_lock_init(&sec_gc_list_lock); spin_lock_init(&sec_gc_ctx_list_lock); @@ -197,11 +225,28 @@ int sptlrpc_gc_init(void) INIT_LIST_HEAD(&sec_gc_list); INIT_LIST_HEAD(&sec_gc_ctx_list); - schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); + /* initialize thread control */ + memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); + init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); + + task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); + if (IS_ERR(task)) { + CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); + return PTR_ERR(task); + } + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_running(&sec_gc_thread), &lwi); return 0; } void sptlrpc_gc_fini(void) { - cancel_delayed_work_sync(&sec_gc_work); + struct l_wait_info lwi = { 0 }; + + thread_set_flags(&sec_gc_thread, SVC_STOPPING); + wake_up(&sec_gc_thread.t_ctl_waitq); + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_stopped(&sec_gc_thread), &lwi); } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c index 4f8efe44aa678..96acb183270e4 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -110,8 +110,7 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) out: return 0; } - -LDEBUGFS_SEQ_FOPS_RO(sptlrpc_info_lprocfs); +LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) { @@ -137,81 +136,11 @@ static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) out: return 0; } - -LDEBUGFS_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); - -static ssize_t -ldebugfs_sptlrpc_sepol_seq_write(struct file *file, const char __user *buffer, - size_t count, void *data) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - struct obd_import *imp = cli->cl_import; - struct sepol_downcall_data *param; - int size = sizeof(*param); - int rc = 0; - - if (count < size) { - CERROR("%s: invalid data count = %lu, size = %d\n", - dev->obd_name, (unsigned long) count, size); - return -EINVAL; - } - - OBD_ALLOC(param, size); - if (param == NULL) - return -ENOMEM; - - if (copy_from_user(param, buffer, size)) { - CERROR("%s: bad sepol data\n", dev->obd_name); - GOTO(out, rc = -EFAULT); - } - - if (param->sdd_magic != SEPOL_DOWNCALL_MAGIC) { - CERROR("%s: sepol downcall bad params\n", - dev->obd_name); - GOTO(out, rc = -EINVAL); - } - - if (param->sdd_sepol_len == 0 || - param->sdd_sepol_len >= sizeof(imp->imp_sec->ps_sepol)) { - 
CERROR("%s: invalid sepol data returned\n", - dev->obd_name); - GOTO(out, rc = -EINVAL); - } - rc = param->sdd_sepol_len; /* save sdd_sepol_len */ - OBD_FREE(param, size); - size = offsetof(struct sepol_downcall_data, - sdd_sepol[rc]); - - /* alloc again with real size */ - rc = 0; - OBD_ALLOC(param, size); - if (param == NULL) - return -ENOMEM; - - if (copy_from_user(param, buffer, size)) { - CERROR("%s: bad sepol data\n", dev->obd_name); - GOTO(out, rc = -EFAULT); - } - - spin_lock(&imp->imp_sec->ps_lock); - snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", - param->sdd_sepol); - imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); - spin_unlock(&imp->imp_sec->ps_lock); - -out: - if (param != NULL) - OBD_FREE(param, size); - - return rc ? rc : count; -} -LDEBUGFS_FOPS_WR_ONLY(srpc, sptlrpc_sepol); +LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) { - int rc; + int rc; if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && @@ -223,31 +152,23 @@ int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) return -EINVAL; } - rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_info", 0444, - &sptlrpc_info_lprocfs_fops, dev); - if (rc) { - CERROR("create proc entry srpc_info for %s: %d\n", - dev->obd_name, rc); - return rc; - } - - rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_contexts", - 0444, &sptlrpc_ctxs_lprocfs_fops, dev); - if (rc) { - CERROR("create proc entry srpc_contexts for %s: %d\n", - dev->obd_name, rc); - return rc; - } + rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } - rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_sepol", - 0200, &srpc_sptlrpc_sepol_fops, dev); - if (rc) { - CERROR("create proc entry srpc_sepol for %s: %d\n", - dev->obd_name, rc); - return rc; - } + rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, + &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } - return 0; + return 0; } EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c index a17a4e182233e..52af519a291d7 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -63,7 +63,14 @@ void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) static inline enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) { - return (msg->lm_secflvr >> 24) & 0xFF; + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; } static @@ -363,9 +370,11 @@ int null_authorize(struct ptlrpc_request *req) } static struct ptlrpc_ctx_ops null_ctx_ops = { - .sign = null_ctx_sign, - .verify = null_ctx_verify, + .refresh = null_ctx_refresh, + .sign = null_ctx_sign, + .verify = null_ctx_verify, }; + static struct ptlrpc_sec_cops null_sec_cops = { .create_sec = null_create_sec, .destroy_sec = null_destroy_sec, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c index dea70d160b54e..a0f192cecf633 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -215,12 +215,12 @@ int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) static int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_repdata; - struct plain_header *phdr; - __u32 cksum; - bool swabbed; + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + int swabbed; + ENTRY; - ENTRY; if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); RETURN(-EPROTO); @@ -723,15 +723,16 @@ static struct ptlrpc_svc_ctx plain_svc_ctx = { .sc_policy = &plain_policy, }; -static int plain_accept(struct ptlrpc_request *req) +static +int plain_accept(struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_reqbuf; - struct plain_header *phdr; - bool swabbed; + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + int swabbed; + ENTRY; - ENTRY; - LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == - SPTLRPC_POLICY_PLAIN); + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c index 6373c36865f3d..6e3172cdeb5a7 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2017, Intel Corporation. + * Copyright (c) 2010, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,15 +31,13 @@ */ #define DEBUG_SUBSYSTEM S_RPC - #include #include #include #include #include -#include +#include #include "ptlrpc_internal.h" -#include /* The following are visible and mutable through /sys/module/ptlrpc */ int test_req_buffer_pressure = 0; @@ -141,9 +139,7 @@ ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) for (i = 0; i < svc->srv_nbuf_per_group; i++) { /* NB: another thread might have recycled enough rqbds, we * need to make sure it wouldn't over-allocate, see LU-1212. */ - if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group || - (svc->srv_nrqbds_max != 0 && - svcpt->scp_nrqbds_total > svc->srv_nrqbds_max)) + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) break; rqbd = ptlrpc_alloc_rqbd(svcpt); @@ -483,7 +479,7 @@ static void ptlrpc_at_timer(cfs_timer_cb_arg_t data) svcpt = cfs_from_timer(svcpt, data, scp_at_timer); svcpt->scp_at_check = 1; - svcpt->scp_at_checktime = ktime_get(); + svcpt->scp_at_checktime = cfs_time_current(); wake_up(&svcpt->scp_waitq); } @@ -606,7 +602,6 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, /* rqbd and incoming request queue */ spin_lock_init(&svcpt->scp_lock); - mutex_init(&svcpt->scp_mutex); INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); INIT_LIST_HEAD(&svcpt->scp_req_incoming); @@ -688,8 +683,7 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, */ struct ptlrpc_service * ptlrpc_register_service(struct ptlrpc_service_conf *conf, - struct kset *parent, - struct dentry *debugfs_entry) + struct proc_dir_entry *proc_entry) { struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; struct ptlrpc_service *service; @@ -711,13 +705,7 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, if (cptable == NULL) cptable = cfs_cpt_table; - if (conf->psc_thr.tc_cpu_bind > 1) { - CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n", - conf->psc_name, conf->psc_thr.tc_cpu_bind); - RETURN(ERR_PTR(-EINVAL)); - } - - if (!cconf->cc_affinity) { + if (!conf->psc_thr.tc_cpu_affinity) { ncpts = 1; } else { ncpts = cfs_cpt_number(cptable); @@ -756,7 +744,6 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, service->srv_cptable = cptable; service->srv_cpts = cpts; service->srv_ncpts = ncpts; - service->srv_cpt_bind = conf->psc_thr.tc_cpu_bind; service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) @@ -771,9 +758,6 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, /* buffer configuration */ service->srv_nbuf_per_group = test_req_buffer_pressure ? 1 : conf->psc_buf.bc_nbufs; - /* do not limit max number of rqbds by default */ - service->srv_nrqbds_max = 0; - service->srv_max_req_size = conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD; service->srv_buf_size = conf->psc_buf.bc_buf_size; @@ -792,7 +776,7 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, service->srv_ops = conf->psc_ops; for (i = 0; i < ncpts; i++) { - if (!cconf->cc_affinity) + if (!conf->psc_thr.tc_cpu_affinity) cpt = CFS_CPT_ANY; else cpt = cpts != NULL ? 
cpts[i] : i; @@ -816,14 +800,8 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, list_add(&service->srv_list, &ptlrpc_all_services); mutex_unlock(&ptlrpc_all_services_mutex); - if (parent) { - rc = ptlrpc_sysfs_register_service(parent, service); - if (rc) - GOTO(failed, rc); - } - - if (debugfs_entry != NULL) - ptlrpc_ldebugfs_register_service(debugfs_entry, service); + if (proc_entry != NULL) + ptlrpc_lprocfs_register_service(proc_entry, service); rc = ptlrpc_service_nrs_setup(service); if (rc != 0) @@ -961,10 +939,8 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req) */ LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); if (svcpt->scp_nrqbds_posted >= - svc->srv_nbuf_per_group || - (svc->srv_nrqbds_max != 0 && - svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) || - test_req_buffer_pressure) { + svc->srv_nbuf_per_group && + !test_req_buffer_pressure) { /* like in ptlrpc_free_rqbd() */ svcpt->scp_nrqbds_total--; OBD_FREE_LARGE(rqbd->rqbd_buffer, @@ -1001,18 +977,18 @@ void ptlrpc_request_change_export(struct ptlrpc_request *req, if (req->rq_export != NULL) { LASSERT(!list_empty(&req->rq_exp_list)); /* remove rq_exp_list from last export */ - spin_lock(&req->rq_export->exp_rpc_lock); + spin_lock_bh(&req->rq_export->exp_rpc_lock); list_del_init(&req->rq_exp_list); - spin_unlock(&req->rq_export->exp_rpc_lock); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); /* export has one reference already, so it`s safe to * add req to export queue here and get another * reference for request later */ - spin_lock(&export->exp_rpc_lock); + spin_lock_bh(&export->exp_rpc_lock); if (req->rq_ops != NULL) /* hp request */ list_add(&req->rq_exp_list, &export->exp_hp_rpcs); else list_add(&req->rq_exp_list, &export->exp_reg_rpcs); - spin_unlock(&export->exp_rpc_lock); + spin_unlock_bh(&export->exp_rpc_lock); class_export_rpc_dec(req->rq_export); class_export_put(req->rq_export); @@ -1065,10 +1041,10 @@ static void ptlrpc_server_finish_active_request( * This function is only called when some export receives a message (i.e., * the network is up.) */ -void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) +void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) { - struct obd_export *oldest_exp; - time64_t oldest_time, new_time; + struct obd_export *oldest_exp; + time_t oldest_time, new_time; ENTRY; @@ -1081,7 +1057,7 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) will make it to the top of the list. */ /* Do not pay attention on 1sec or smaller renewals. */ - new_time = ktime_get_real_seconds() + extra_delay; + new_time = cfs_time_current_sec() + extra_delay; if (exp->exp_last_request_time + 1 /*second */ >= new_time) RETURN_EXIT; @@ -1112,35 +1088,33 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) return; } - /* Note - racing to start/reset the obd_eviction timer is safe */ - if (exp->exp_obd->obd_eviction_timer == 0) { - /* Check if the oldest entry is expired. */ - if (ktime_get_real_seconds() > - oldest_time + PING_EVICT_TIMEOUT + extra_delay) { - /* We need a second timer, in case the net was down and - * it just came back. Since the pinger may skip every - * other PING_INTERVAL (see note in ptlrpc_pinger_main), - * we better wait for 3. 
- */ - exp->exp_obd->obd_eviction_timer = - ktime_get_real_seconds() + 3 * PING_INTERVAL; - CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n", - exp->exp_obd->obd_name, - obd_export_nid2str(oldest_exp), oldest_time); - } - } else { - if (ktime_get_real_seconds() > - (exp->exp_obd->obd_eviction_timer + extra_delay)) { - /* The evictor won't evict anyone who we've heard from - * recently, so we don't have to check before we start - * it. - */ - if (!ping_evictor_wake(exp)) - exp->exp_obd->obd_eviction_timer = 0; - } - } + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = + cfs_time_current_sec() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (cfs_time_current_sec() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } - EXIT; + EXIT; } /** @@ -1192,7 +1166,7 @@ static int ptlrpc_check_req(struct ptlrpc_request *req) static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) { struct ptlrpc_at_array *array = &svcpt->scp_at_array; - time64_t next; + __s32 next; if (array->paa_count == 0) { del_timer(&svcpt->scp_at_timer); @@ -1200,14 +1174,13 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) } /* Set timer for closest deadline */ - next = array->paa_deadline - ktime_get_real_seconds() - - at_early_margin; + next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - + at_early_margin); if (next <= 0) { ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); } else { - mod_timer(&svcpt->scp_at_timer, - jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC)); - CDEBUG(D_INFO, "armed %s at %+llds\n", + mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); + CDEBUG(D_INFO, "armed %s at %+ds\n", svcpt->scp_service->srv_name, next); } } @@ -1459,16 +1432,16 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) __u32 index, count; time64_t deadline; time64_t now = ktime_get_real_seconds(); - s64 delay; - int first, counter = 0; + cfs_duration_t delay; + int first, counter = 0; + ENTRY; - ENTRY; spin_lock(&svcpt->scp_at_lock); if (svcpt->scp_at_check == 0) { spin_unlock(&svcpt->scp_at_lock); RETURN(0); } - delay = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime); + delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); svcpt->scp_at_check = 0; if (array->paa_count == 0) { @@ -1504,18 +1477,14 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) break; } + ptlrpc_at_remove_timed(rq); /** * ptlrpc_server_drop_request() may drop * refcount to 0 already. 
Let's check this and * don't add entry to work_list */ - if (likely(atomic_inc_not_zero(&rq->rq_refcount))) { - ptlrpc_at_remove_timed(rq); + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) list_add(&rq->rq_timed_list, &work_list); - } else { - ptlrpc_at_remove_timed(rq); - } - counter++; } @@ -1536,7 +1505,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) LCONSOLE_WARN("%s: This server is not able to keep up with " "request traffic (cpu-bound).\n", svcpt->scp_service->srv_name); - CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%lld\n", + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n", counter, svcpt->scp_nreqs_incoming, svcpt->scp_nreqs_active, at_get(&svcpt->scp_at_estimate), delay); @@ -1560,14 +1529,18 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) /* Check if we are already handling earlier incarnation of this request. * Called under &req->rq_export->exp_rpc_lock locked */ -static struct ptlrpc_request* -ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) { struct ptlrpc_request *tmp = NULL; if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) || (atomic_read(&req->rq_export->exp_rpc_count) == 0)) - return NULL; + return 0; + + /* bulk request are aborted upon reconnect, don't try to + * find a match */ + if (req->rq_bulk_write || req->rq_bulk_read) + return 0; /* This list should not be longer than max_requests in * flights on the client, so it is not all that long. @@ -1585,12 +1558,12 @@ ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) if (tmp->rq_xid == req->rq_xid) goto found; } - return NULL; + return 0; found: DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); DEBUG_REQ(D_HA, tmp, "Request being processed"); - return tmp; + return -EBUSY; } /** @@ -1644,9 +1617,9 @@ static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) if (req->rq_ops && req->rq_ops->hpreq_fini) req->rq_ops->hpreq_fini(req); - spin_lock(&req->rq_export->exp_rpc_lock); + spin_lock_bh(&req->rq_export->exp_rpc_lock); list_del_init(&req->rq_exp_list); - spin_unlock(&req->rq_export->exp_rpc_lock); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); } EXIT; } @@ -1680,7 +1653,6 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, { int rc; bool hp; - struct ptlrpc_request *orig; ENTRY; rc = ptlrpc_server_hpreq_init(svcpt, req); @@ -1690,43 +1662,18 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, hp = rc > 0; ptlrpc_nrs_req_initialize(svcpt, req, hp); - while (req->rq_export != NULL) { + if (req->rq_export != NULL) { struct obd_export *exp = req->rq_export; /* do search for duplicated xid and the adding to the list * atomically */ spin_lock_bh(&exp->exp_rpc_lock); - orig = ptlrpc_server_check_resend_in_progress(req); - if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) { - spin_unlock_bh(&exp->exp_rpc_lock); - - OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); - msleep(4 * MSEC_PER_SEC); - continue; - } - if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) { - bool linked; - + rc = ptlrpc_server_check_resend_in_progress(req); + if (rc < 0) { spin_unlock_bh(&exp->exp_rpc_lock); - /* - * When the client resend request and the server has - * the previous copy of it, we need to update deadlines, - * to be sure that the client and the server have equal - * request deadlines. 
- */ - - spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); - linked = orig->rq_at_linked; - if (likely(linked)) - ptlrpc_at_remove_timed(orig); - spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); - orig->rq_deadline = req->rq_deadline; - if (likely(linked)) - ptlrpc_at_add_timed(orig); - ptlrpc_server_drop_request(orig); ptlrpc_nrs_req_finalize(req); - RETURN(-EBUSY); + RETURN(rc); } if (hp || req->rq_ops != NULL) @@ -1734,7 +1681,6 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, else list_add(&req->rq_exp_list, &exp->exp_reg_rpcs); spin_unlock_bh(&exp->exp_rpc_lock); - break; } /* the current thread is not the processing thread for this request @@ -2118,8 +2064,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, if (unlikely(ptlrpc_check_req(request))) goto put_conn; ptlrpc_update_export_timer(request->rq_export, - div_u64(timediff_usecs, - USEC_PER_SEC / 2)); + timediff_usecs >> 19); } /* Discard requests queued for longer than the deadline. @@ -2206,7 +2151,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, DEBUG_REQ(D_ADAPTTO, request, "sent %d early replies before finishing in %llds", request->rq_early_count, - div_u64(arrived_usecs, USEC_PER_SEC)); + arrived_usecs / USEC_PER_SEC); } ptlrpc_server_finish_active_request(svcpt, request); @@ -2294,7 +2239,7 @@ ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) while (nlocks-- > 0) { lock = ack_locks[nlocks]; - ldlm_lock_mode_downgrade(lock, LCK_COS); + ldlm_lock_downgrade(lock, LCK_COS); LDLM_LOCK_PUT(lock); } RETURN(0); @@ -2508,39 +2453,40 @@ static int ptlrpc_main(void *arg) thread->t_pid = current_pid(); unshare_fs_struct(); - if (svc->srv_cpt_bind) { - rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); - if (rc != 0) { - CWARN("%s: failed to bind %s on CPT %d\n", - svc->srv_name, thread->t_name, svcpt->scp_cpt); - } + /* NB: we will call cfs_cpt_bind() for all threads, because we + * might want to run lustre server only on a subset of system CPUs, + * in that case ->scp_cpt is CFS_CPT_ANY */ + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); } ginfo = groups_alloc(0); - if (!ginfo) - GOTO(out, rc = -ENOMEM); + if (!ginfo) { + rc = -ENOMEM; + goto out; + } set_current_groups(ginfo); put_group_info(ginfo); if (svc->srv_ops.so_thr_init != NULL) { rc = svc->srv_ops.so_thr_init(thread); + if (rc) + goto out; + } - if (rc) - GOTO(out, rc); - } - - OBD_ALLOC_PTR(env); - if (env == NULL) - GOTO(out_srv_fini, rc = -ENOMEM); - rc = lu_env_add(env); - if (rc) - GOTO(out_env, rc); + OBD_ALLOC_PTR(env); + if (env == NULL) { + rc = -ENOMEM; + goto out_srv_fini; + } - rc = lu_context_init(&env->le_ctx, - svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); - if (rc) - GOTO(out_env_remove, rc); + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + goto out_srv_fini; thread->t_env = env; env->le_ctx.lc_thread = thread; @@ -2553,13 +2499,15 @@ static int ptlrpc_main(void *arg) CERROR("Failed to post rqbd for %s on CPT %d: %d\n", svc->srv_name, svcpt->scp_cpt, rc); - GOTO(out_ctx_fini, rc); + goto out_srv_fini; } /* Alloc reply state structure for this one */ OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); - if (!rs) - GOTO(out_ctx_fini, rc = -ENOMEM); + if (!rs) { + rc = -ENOMEM; + goto out_srv_fini; + } spin_lock(&svcpt->scp_lock); @@ -2605,9 +2553,6 @@ static int ptlrpc_main(void *arg) /* reset le_ses to initial state */ env->le_ses = NULL; 
- /* Refill the context before execution to make sure - * all thread keys are allocated */ - lu_env_refill(env); /* Process all incoming reqs before handling any */ if (ptlrpc_server_request_incoming(svcpt)) { lu_context_enter(&env->le_ctx); @@ -2643,18 +2588,17 @@ static int ptlrpc_main(void *arg) lc_watchdog_delete(thread->t_watchdog); thread->t_watchdog = NULL; -out_ctx_fini: - lu_context_fini(&env->le_ctx); -out_env_remove: - lu_env_remove(env); -out_env: - OBD_FREE_PTR(env); out_srv_fini: /* * deconstruct service specific state created by ptlrpc_start_thread() */ if (svc->srv_ops.so_thr_done != NULL) svc->srv_ops.so_thr_done(thread); + + if (env != NULL) { + lu_context_fini(&env->le_ctx); + OBD_FREE_PTR(env); + } out: CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", thread, thread->t_pid, thread->t_id, rc); @@ -2700,13 +2644,8 @@ static int ptlrpc_hr_main(void *arg) struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; struct list_head replies; - struct lu_env *env; int rc; - OBD_ALLOC_PTR(env); - if (env == NULL) - RETURN(-ENOMEM); - INIT_LIST_HEAD(&replies); unshare_fs_struct(); @@ -2720,15 +2659,6 @@ static int ptlrpc_hr_main(void *arg) threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); } - rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD | - LCT_REMEMBER | LCT_NOREF); - if (rc) - GOTO(out_env, rc); - - rc = lu_env_add(env); - if (rc) - GOTO(out_ctx_fini, rc); - atomic_inc(&hrp->hrp_nstarted); wake_up(&ptlrpc_hr.hr_waitq); @@ -2742,22 +2672,13 @@ static int ptlrpc_hr_main(void *arg) struct ptlrpc_reply_state, rs_list); list_del_init(&rs->rs_list); - /* refill keys if needed */ - lu_env_refill(env); - lu_context_enter(&env->le_ctx); ptlrpc_handle_rs(rs); - lu_context_exit(&env->le_ctx); } } atomic_inc(&hrp->hrp_nstopped); wake_up(&ptlrpc_hr.hr_waitq); - lu_env_remove(env); -out_ctx_fini: - lu_context_fini(&env->le_ctx); -out_env: - OBD_FREE_PTR(env); return 0; } @@ -3322,7 +3243,6 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) ptlrpc_service_nrs_cleanup(service); ptlrpc_lprocfs_unregister_service(service); - ptlrpc_sysfs_unregister_service(service); ptlrpc_service_free(service); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c index 7f9fb09ee4ffd..3a9daf899c26e 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,5 @@ #include #include #include +#include #include -#include -#include - diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c index c0c7b0c5f1f05..94828872d70ac 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,16 +40,15 @@ #include #include #include +#include #include -#include -#include - - void lustre_assert_wire_constants(void) { - /* Wire protocol assertions generated by 'wirecheck' - * (make -C lustre/utils newwiretest) - */ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + * running on Linux centss05 2.6.32.431.29.2.el6_lustre #1 SMP Tue Sep 23 16:06:38 CDT 2014 x + * with gcc version 4.4.7 20120313 (Red Hat 4.4.7-4) (GCC) */ + /* Constants... */ LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", @@ -175,9 +174,7 @@ void lustre_assert_wire_constants(void) (long long)MDS_HSM_CT_UNREGISTER); LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", (long long)MDS_SWAP_LAYOUTS); - LASSERTF(MDS_RMFID == 62, "found %lld\n", - (long long)MDS_RMFID); - LASSERTF(MDS_LAST_OPC == 63, "found %lld\n", + LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", (long long)MDS_LAST_OPC); LASSERTF(REINT_SETATTR == 1, "found %lld\n", (long long)REINT_SETATTR); @@ -197,7 +194,7 @@ void lustre_assert_wire_constants(void) (long long)REINT_RMENTRY); LASSERTF(REINT_MIGRATE == 9, "found %lld\n", (long long)REINT_MIGRATE); - LASSERTF(REINT_MAX == 11, "found %lld\n", + LASSERTF(REINT_MAX == 10, "found %lld\n", (long long)REINT_MAX); LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)DISP_IT_EXECD); @@ -255,14 +252,9 @@ void lustre_assert_wire_constants(void) (long long)MDS_ATTR_FROM_OPEN); LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", (long long)MDS_ATTR_BLOCKS); + LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", (long long)MDS_ATTR_PROJID); - LASSERTF(MDS_ATTR_LSIZE == 0x0000000000020000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_LSIZE); - LASSERTF(MDS_ATTR_LBLOCKS == 0x0000000000040000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_LBLOCKS); - LASSERTF(MDS_ATTR_OVERRIDE == 0x0000000002000000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_OVERRIDE); LASSERTF(FLD_QUERY == 900, "found %lld\n", (long long)FLD_QUERY); LASSERTF(FLD_READ == 901, "found %lld\n", @@ -347,6 +339,10 @@ void lustre_assert_wire_constants(void) CLASSERT(LQUOTA_RES_DT == 2); LASSERTF(OBD_PING == 400, "found %lld\n", (long long)OBD_PING); + LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", + (long long)OBD_LOG_CANCEL); + LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", + (long long)OBD_QC_CALLBACK); LASSERTF(OBD_IDX_READ == 403, "found %lld\n", (long long)OBD_IDX_READ); LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", @@ -369,8 +365,6 @@ void lustre_assert_wire_constants(void) (long long)MGS_TARGET_DEL); LASSERTF(MGS_SET_INFO == 255, "found %lld\n", (long long)MGS_SET_INFO); - LASSERTF(MGS_CONFIG_READ == 256, "found %lld\n", - (long long)MGS_CONFIG_READ); LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", (long long)MGS_LAST_OPC); LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", @@ -506,30 +500,6 @@ void lustre_assert_wire_constants(void) (long long)OUT_PUNCH); LASSERTF(OUT_READ == 15, "found %lld\n", (long long)OUT_READ); - LASSERTF(OUT_NOOP == 16, "found %lld\n", - (long long)OUT_NOOP); - LASSERTF(OUT_XATTR_LIST == 17, "found %lld\n", - (long long)OUT_XATTR_LIST); - - /* Checks for struct lustre_som_attrs */ - LASSERTF((int)sizeof(struct lustre_som_attrs) == 24, "found %lld\n", - (long long)(int)sizeof(struct lustre_som_attrs)); - LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_valid) == 0, "found %lld\n", - (long long)(int)offsetof(struct lustre_som_attrs, lsa_valid)); - 
LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid)); - LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_reserved) == 2, "found %lld\n", - (long long)(int)offsetof(struct lustre_som_attrs, lsa_reserved)); - LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved) == 6, "found %lld\n", - (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved)); - LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_size) == 8, "found %lld\n", - (long long)(int)offsetof(struct lustre_som_attrs, lsa_size)); - LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_size) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_size)); - LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_blocks) == 16, "found %lld\n", - (long long)(int)offsetof(struct lustre_som_attrs, lsa_blocks)); - LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks)); /* Checks for struct hsm_attrs */ LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", @@ -686,78 +656,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", (long long)(int)sizeof(union lu_page)); - /* Checks for struct lu_ladvise */ - LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", - (long long)(int)sizeof(struct lu_ladvise)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_advice)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value1)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value2)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_start)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_end)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value3)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value4)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); - LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", - (long 
long)LU_LADVISE_WILLREAD); - LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", - (long long)LU_LADVISE_DONTNEED); - - /* Checks for struct ladvise_hdr */ - LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", - (long long)(int)sizeof(struct ladvise_hdr)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_count)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); - LASSERTF(LF_ASYNC == 1, "found %lld\n", - (long long)LF_ASYNC); - LASSERTF(LADVISE_MAGIC == 450829536, "found %lld\n", - (long long)LADVISE_MAGIC); - /* Checks for struct lustre_handle */ LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", (long long)(int)sizeof(struct lustre_handle)); @@ -805,10 +703,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); - LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0bd00bd3UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_MSG_MAGIC_V2); - LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xd30bd00bUL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_MSG_MAGIC_V2_SWABBED); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2_SWABBED); /* Checks for struct ptlrpc_body */ LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", @@ -1023,30 +921,42 @@ void lustre_assert_wire_constants(void) (long long)DLM_REPLY_REC_OFF); LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", (long 
long)MSG_PTLRPC_HEADER_OFF); - LASSERTF(PTLRPC_MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n", - (unsigned)PTLRPC_MSG_VERSION); - LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_VERSION_MASK); - LASSERTF(LUSTRE_OBD_VERSION == 0x00010000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_OBD_VERSION); - LASSERTF(LUSTRE_MDS_VERSION == 0x00020000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_MDS_VERSION); - LASSERTF(LUSTRE_OST_VERSION == 0x00030000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_OST_VERSION); - LASSERTF(LUSTRE_DLM_VERSION == 0x00040000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_DLM_VERSION); - LASSERTF(LUSTRE_LOG_VERSION == 0x00050000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_LOG_VERSION); - LASSERTF(LUSTRE_MGS_VERSION == 0x00060000UL, "found 0x%.8xUL\n", - (unsigned)LUSTRE_MGS_VERSION); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", + PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", + LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", + LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", + LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", + LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", + LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", + LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", + LUSTRE_MGS_VERSION); LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", (long long)MSGHDR_AT_SUPPORT); LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)MSG_OP_FLAG_MASK); + LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", + (long long)MSG_OP_FLAG_SHIFT); + LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", + (unsigned)MSG_GEN_FLAG_MASK); + LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_LAST_REPLAY); LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)MSG_RESENT); LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)MSG_REPLAY); + LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_DELAY_REPLAY); + LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_VERSION_REPLAY); LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", (unsigned)MSG_REQ_REPLAY_DONE); LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", @@ -1061,6 +971,8 @@ void lustre_assert_wire_constants(void) (unsigned)MSG_CONNECT_LIBCLIENT); LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_ASYNC); LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", (unsigned)MSG_CONNECT_NEXT_VER); LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", @@ -1317,8 +1229,8 @@ void lustre_assert_wire_constants(void) OBD_CONNECT_DIR_STRIPE); LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", OBD_CONNECT_SUBTREE); - LASSERTF(OBD_CONNECT_LOCKAHEAD_OLD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LOCKAHEAD_OLD); + LASSERTF(OBD_CONNECT_LOCK_AHEAD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOCK_AHEAD); LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", 
OBD_CONNECT_BULK_MBITS); LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", @@ -1327,48 +1239,12 @@ void lustre_assert_wire_constants(void) OBD_CONNECT_FLAGS2); LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", OBD_CONNECT2_FILE_SECCTX); - LASSERTF(OBD_CONNECT2_LOCKAHEAD == 0x2ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_LOCKAHEAD); - LASSERTF(OBD_CONNECT2_DIR_MIGRATE == 0x4ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_DIR_MIGRATE); - LASSERTF(OBD_CONNECT2_FLR == 0x20ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_FLR); - LASSERTF(OBD_CONNECT2_WBC_INTENTS == 0x40ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_WBC_INTENTS); - LASSERTF(OBD_CONNECT2_LOCK_CONVERT == 0x80ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_LOCK_CONVERT); - LASSERTF(OBD_CONNECT2_ARCHIVE_ID_ARRAY == 0x100ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_ARCHIVE_ID_ARRAY); - LASSERTF(OBD_CONNECT2_SELINUX_POLICY == 0x400ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_SELINUX_POLICY); - LASSERTF(OBD_CONNECT2_LSOM == 0x800ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_LSOM); - LASSERTF(OBD_CONNECT2_ASYNC_DISCARD == 0x4000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_ASYNC_DISCARD); - LASSERTF(OBD_CONNECT2_ENCRYPT == 0x8000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_ENCRYPT); - LASSERTF(OBD_CONNECT2_FIDMAP== 0x10000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_FIDMAP); - LASSERTF(OBD_CONNECT2_GETATTR_PFID== 0x20000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT2_GETATTR_PFID); LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_CRC32); LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_ADLER); LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_CRC32C); - LASSERTF(OBD_CKSUM_RESERVED == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_RESERVED); - LASSERTF(OBD_CKSUM_T10IP512 == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_T10IP512); - LASSERTF(OBD_CKSUM_T10IP4K == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_T10IP4K); - LASSERTF(OBD_CKSUM_T10CRC512 == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_T10CRC512); - LASSERTF(OBD_CKSUM_T10CRC4K == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_T10CRC4K); - LASSERTF(OBD_CKSUM_T10_TOP == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_T10_TOP); /* Checks for struct ost_layout */ LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", @@ -1485,10 +1361,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obdo, o_layout)); LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_layout)); - LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_layout_version)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_layout_version)); + LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_3)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_3)); LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", (long long)(int)offsetof(struct obdo, o_uid_h)); LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", @@ -1543,8 +1419,8 @@ void lustre_assert_wire_constants(void) OBD_MD_FLFLAGS); LASSERTF(OBD_MD_FLNLINK == 
(0x00002000ULL), "found 0x%.16llxULL\n", OBD_MD_FLNLINK); - LASSERTF(OBD_MD_FLPARENT == (0x00004000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLPARENT); + LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGENER); LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", OBD_MD_FLRDEV); LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", @@ -1555,10 +1431,14 @@ void lustre_assert_wire_constants(void) OBD_MD_FLHANDLE); LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLQOS); LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLGROUP); LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLFID); + LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEPOCH); LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLGRANT); LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", @@ -1571,6 +1451,8 @@ void lustre_assert_wire_constants(void) OBD_MD_FLMODEASIZE); LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", OBD_MD_MDS); + LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", + OBD_MD_REINT); LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", OBD_MD_MEA); LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n", @@ -1583,6 +1465,12 @@ void lustre_assert_wire_constants(void) OBD_MD_FLXATTRRM); LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLACL); + LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMDSCAPA); + LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSSCAPA); + LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSPLIT); LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLCROSSREF); LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", @@ -1595,6 +1483,7 @@ void lustre_assert_wire_constants(void) OBD_MD_DEFAULT_MEA); LASSERTF(OBD_MD_FLOSTLAYOUT == (0x0080000000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLOSTLAYOUT); + LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLPROJID); CLASSERT(OBD_FL_INLINEDATA == 0x00000001); @@ -1611,10 +1500,7 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); - CLASSERT(OBD_FL_CKSUM_T10IP512 == 0x00005000); - CLASSERT(OBD_FL_CKSUM_T10IP4K == 0x00006000); - CLASSERT(OBD_FL_CKSUM_T10CRC512 == 0x00007000); - CLASSERT(OBD_FL_CKSUM_T10CRC4K == 0x00008000); + CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); CLASSERT(OBD_FL_MMAP == 0x00040000); @@ -1713,8 +1599,8 @@ void lustre_assert_wire_constants(void) (unsigned)LOV_PATTERN_RAID0); LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)LOV_PATTERN_RAID1); - LASSERTF(LOV_PATTERN_MDT == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_MDT); + LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_FIRST); LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", (unsigned)LOV_PATTERN_CMOBD); @@ -1741,22 +1627,12 @@ void lustre_assert_wire_constants(void) 
(long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size)); LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size)); - LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen) == 32, "found %lld\n", - (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen)); - LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen)); - LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp) == 36, "found %lld\n", - (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp)); - LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp)); - LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1) == 44, "found %lld\n", - (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1)); - LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding)); LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n", (unsigned)LCME_FL_INIT); - LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n", - (unsigned)LCME_FL_NEG); /* Checks for struct lov_comp_md_v1 */ LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n", @@ -1781,13 +1657,9 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count)); LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count)); - LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n", - (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count)); - LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count)); - LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n", + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n", (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1)); - LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n", + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1)); LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n", (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2)); @@ -1798,14 +1670,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0])); CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0)); - LASSERTF(LCM_FL_NONE == 0, "found %lld\n", - (long long)LCM_FL_NONE); - LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n", - (long long)LCM_FL_RDONLY); - 
LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n", - (long long)LCM_FL_WRITE_PENDING); - LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n", - (long long)LCM_FL_SYNC_PENDING); /* Checks for struct lmv_mds_md_v1 */ LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", @@ -1830,17 +1694,13 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset) == 20, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash) == 24, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 28, "found %lld\n", + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding1)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 24, "found %lld\n", (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 4, "found %lld\n", + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 8, "found %lld\n", (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); @@ -1881,10 +1741,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_bavail)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); - LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_files)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_files)); LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_ffree)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", @@ -1901,10 +1757,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_namelen)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); - LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_maxbytes)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes)); LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", (long 
long)(int)offsetof(struct obd_statfs, os_state)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", @@ -1913,10 +1765,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); - LASSERTF((int)offsetof(struct obd_statfs, os_granted) == 112, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_granted)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_granted) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_granted)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare2)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_spare3)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", @@ -1945,20 +1797,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_spare9)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); - LASSERTF(OS_STATE_DEGRADED == 0x1, "found %lld\n", - (long long)OS_STATE_DEGRADED); - LASSERTF(OS_STATE_READONLY == 0x2, "found %lld\n", - (long long)OS_STATE_READONLY); - LASSERTF(OS_STATE_NOPRECREATE == 0x4, "found %lld\n", - (long long)OS_STATE_NOPRECREATE); - LASSERTF(OS_STATE_ENOSPC == 0x20, "found %lld\n", - (long long)OS_STATE_ENOSPC); - LASSERTF(OS_STATE_ENOINO == 0x40, "found %lld\n", - (long long)OS_STATE_ENOINO); - LASSERTF(OS_STATE_SUM == 0x100, "found %lld\n", - (long long)OS_STATE_SUM); - LASSERTF(OS_STATE_NONROT == 0x200, "found %lld\n", - (long long)OS_STATE_NONROT); /* Checks for struct obd_ioobj */ LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", @@ -2289,33 +2127,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); - LASSERTF(MDS_CROSS_REF == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)MDS_CROSS_REF); - LASSERTF(MDS_PERM_BYPASS == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned)MDS_PERM_BYPASS); - LASSERTF(MDS_QUOTA_IGNORE == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)MDS_QUOTA_IGNORE); - LASSERTF(MDS_KEEP_ORPHAN == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned)MDS_KEEP_ORPHAN); - LASSERTF(MDS_RECOV_OPEN == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)MDS_RECOV_OPEN); - LASSERTF(MDS_DATA_MODIFIED == 0x00000200UL, "found 0x%.8xUL\n", - (unsigned)MDS_DATA_MODIFIED); - LASSERTF(MDS_CREATE_VOLATILE == 0x00000400UL, "found 0x%.8xUL\n", - (unsigned)MDS_CREATE_VOLATILE); - LASSERTF(MDS_OWNEROVERRIDE == 0x00000800UL, "found 0x%.8xUL\n", - (unsigned)MDS_OWNEROVERRIDE); - LASSERTF(MDS_HSM_RELEASE == 0x00001000UL, "found 0x%.8xUL\n", - (unsigned)MDS_HSM_RELEASE); - LASSERTF(MDS_CLOSE_LAYOUT_SWAP == 0x00004000UL, "found 0x%.8xUL\n", - (unsigned)MDS_CLOSE_LAYOUT_SWAP); - LASSERTF(MDS_CLOSE_LAYOUT_MERGE == 0x00008000UL, "found 0x%.8xUL\n", - (unsigned)MDS_CLOSE_LAYOUT_MERGE); - LASSERTF(MDS_CLOSE_RESYNC_DONE == 0x00010000UL, "found 0x%.8xUL\n", - (unsigned)MDS_CLOSE_RESYNC_DONE); - LASSERTF(MDS_CLOSE_LAYOUT_SPLIT == 
0x00020000UL, "found 0x%.8xUL\n", - (unsigned)MDS_CLOSE_LAYOUT_SPLIT); - /* Checks for struct mdt_body */ LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", (long long)(int)sizeof(struct mdt_body)); @@ -2327,10 +2138,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_fid2)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); - LASSERTF((int)offsetof(struct mdt_body, mbo_open_handle) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_open_handle)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_open_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_open_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_handle)); LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_valid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", @@ -2355,10 +2166,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_blocks)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); - LASSERTF((int)offsetof(struct mdt_body, mbo_version) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_version)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_version) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_version)); LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_t_state)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", @@ -2399,10 +2206,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_nlink)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); - LASSERTF((int)offsetof(struct mdt_body, mbo_layout_gen) == 140, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_layout_gen)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_layout_gen) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_layout_gen)); + LASSERTF((int)offsetof(struct mdt_body, mbo_unused2) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_unused2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused2)); LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", @@ -2435,14 +2242,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_projid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_projid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_dom_size) == 176, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_dom_size)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_size) == 8, "found %lld\n", - (long 
long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_size)); - LASSERTF((int)offsetof(struct mdt_body, mbo_dom_blocks) == 184, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_dom_blocks)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_6) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_6)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_6)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_7) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_7)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_7)); LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", @@ -2461,6 +2268,8 @@ void lustre_assert_wire_constants(void) MDS_FMODE_EXEC); LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", + MDS_OPEN_CROSS); LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", MDS_OPEN_CREAT); LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", @@ -2523,20 +2332,14 @@ void lustre_assert_wire_constants(void) MDS_INODELOCK_OPEN); LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", MDS_INODELOCK_LAYOUT); - LASSERTF(MDS_INODELOCK_PERM == 0x000010, "found 0x%.8x\n", - MDS_INODELOCK_PERM); - LASSERTF(MDS_INODELOCK_XATTR == 0x000020, "found 0x%.8x\n", - MDS_INODELOCK_XATTR); - LASSERTF(MDS_INODELOCK_DOM == 0x000040, "found 0x%.8x\n", - MDS_INODELOCK_DOM); /* Checks for struct mdt_ioepoch */ LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", (long long)(int)sizeof(struct mdt_ioepoch)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_open_handle) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_open_handle)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_handle)); LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", @@ -2705,10 +2508,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_open_handle_old) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_open_handle_old)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old)); + 
LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", (long long)(int)offsetof(struct mdt_rec_create, cr_time)); LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", @@ -3142,102 +2945,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); - /* Checks for struct mdt_rec_resync */ - LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_resync)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found 
%lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_fid)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_bias)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n", - (long long)(int)offsetof(struct 
mdt_rec_resync, rs_padding8)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_mirror_id) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_mirror_id)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id) == 2, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id)); - LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 134, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9)); - LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 2, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9)); - /* Checks for struct mdt_rec_reint */ LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", (long long)(int)sizeof(struct mdt_rec_reint)); @@ -3329,13 +3036,9 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mirror_id) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_reint, rr_mirror_id)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id) == 2, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 134, "found %lld\n", + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 2, "found %lld\n", + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); /* Checks for struct lmv_desc */ @@ -3461,16 +3164,12 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); /* Checks for struct ldlm_inodebits */ - LASSERTF((int)sizeof(struct ldlm_inodebits) == 16, "found %lld\n", + LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", (long long)(int)sizeof(struct ldlm_inodebits)); LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", (long long)(int)offsetof(struct ldlm_inodebits, bits)); LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); - LASSERTF((int)offsetof(struct ldlm_inodebits, try_bits) == 8, "found %lld\n", - (long long)(int)offsetof(struct ldlm_inodebits, try_bits)); - LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->try_bits) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ldlm_inodebits *)0)->try_bits)); /* Checks for struct ldlm_flock_wire */ LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", @@ -3513,14 +3212,24 @@ void lustre_assert_wire_constants(void) (long long)IT_GETATTR); LASSERTF(IT_LOOKUP == 16, "found %lld\n", (long long)IT_LOOKUP); + LASSERTF(IT_UNLINK == 32, "found %lld\n", + (long long)IT_UNLINK); + LASSERTF(IT_TRUNC == 64, "found %lld\n", + (long long)IT_TRUNC); LASSERTF(IT_GETXATTR == 128, "found %lld\n", (long long)IT_GETXATTR); + LASSERTF(IT_EXEC == 256, "found %lld\n", + (long long)IT_EXEC); + LASSERTF(IT_PIN == 512, "found %lld\n", + (long long)IT_PIN); 
LASSERTF(IT_LAYOUT == 1024, "found %lld\n", (long long)IT_LAYOUT); LASSERTF(IT_QUOTA_DQACQ == 2048, "found %lld\n", (long long)IT_QUOTA_DQACQ); LASSERTF(IT_QUOTA_CONN == 4096, "found %lld\n", (long long)IT_QUOTA_CONN); + LASSERTF(IT_SETXATTR == 8192, "found %lld\n", + (long long)IT_SETXATTR); /* Checks for struct ldlm_resource_desc */ LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", @@ -3993,14 +3702,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); - LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", - (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); - LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", - (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n", (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid)); LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_tail) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail)); /* Checks for struct llog_size_change_rec */ LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", @@ -4129,10 +3838,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); - LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_time) == 20, "found %lld\n", - (long long)(int)offsetof(struct llog_changelog_user_rec, cur_time)); - LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time) == 4, "found %lld\n", - (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", @@ -4240,7 +3949,12 @@ void lustre_assert_wire_constants(void) CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); + CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); + CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); + CLASSERT(LLOG_ORIGIN_CONNECT == 506); + CLASSERT(LLOG_CATINFO == 507); CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); + CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); CLASSERT(LLOG_FIRST_OPC == 501); CLASSERT(LLOG_LAST_OPC == 510); CLASSERT(LLOG_CONFIG_ORIG_CTXT 
== 0); @@ -4712,10 +4426,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct layout_intent, li_flags)); LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); - LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_extent)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_extent)); + LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_start)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_start)); + LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_end)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_end)); LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", (long long)LAYOUT_INTENT_ACCESS); LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", @@ -5371,14 +5089,12 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lfsck_request, lr_padding_3)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n", (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3)); -#ifdef HAVE_SERVER_SUPPORT LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_SCRUB); LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_LAYOUT); LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_NAMESPACE); -#endif LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n", (long long)LE_LASTID_REBUILDING); LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n", @@ -5441,7 +5157,7 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct update_params *)0)->up_params)); /* Checks for struct update_op */ - LASSERTF((int)sizeof(struct update_op) == 20, "found %lld\n", + LASSERTF((int)sizeof(struct update_op) == 24, "found %lld\n", (long long)(int)sizeof(struct update_op)); LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n", (long long)(int)offsetof(struct update_op, uop_fid)); @@ -5510,145 +5226,75 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n", (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec)); - /* Checks for struct lustre_cfg */ - LASSERTF((int)sizeof(struct lustre_cfg) == 32, "found %lld\n", - (long long)(int)sizeof(struct lustre_cfg)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_version) == 0, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_version)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_version)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_command) == 4, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_command)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_command) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_command)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_num) == 8, "found %lld\n", - (long 
long)(int)offsetof(struct lustre_cfg, lcfg_num)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_num) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_num)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_flags) == 12, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_flags)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_flags)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nid) == 16, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_nid)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nid)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nal) == 24, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_nal)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nal) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nal)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_bufcount) == 28, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_bufcount)); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount)); - LASSERTF((int)offsetof(struct lustre_cfg, lcfg_buflens[0]) == 32, "found %lld\n", - (long long)(int)offsetof(struct lustre_cfg, lcfg_buflens[0])); - LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0])); - LASSERTF(LCFG_ATTACH == 0x000cf001UL, "found 0x%.8xUL\n", - (unsigned)LCFG_ATTACH); - LASSERTF(LCFG_DETACH == 0x000cf002UL, "found 0x%.8xUL\n", - (unsigned)LCFG_DETACH); - LASSERTF(LCFG_SETUP == 0x000cf003UL, "found 0x%.8xUL\n", - (unsigned)LCFG_SETUP); - LASSERTF(LCFG_CLEANUP == 0x000cf004UL, "found 0x%.8xUL\n", - (unsigned)LCFG_CLEANUP); - LASSERTF(LCFG_ADD_UUID == 0x000cf005UL, "found 0x%.8xUL\n", - (unsigned)LCFG_ADD_UUID); - LASSERTF(LCFG_DEL_UUID == 0x000cf006UL, "found 0x%.8xUL\n", - (unsigned)LCFG_DEL_UUID); - LASSERTF(LCFG_MOUNTOPT == 0x000cf007UL, "found 0x%.8xUL\n", - (unsigned)LCFG_MOUNTOPT); - LASSERTF(LCFG_DEL_MOUNTOPT == 0x000cf008UL, "found 0x%.8xUL\n", - (unsigned)LCFG_DEL_MOUNTOPT); - LASSERTF(LCFG_SET_TIMEOUT == 0x000cf009UL, "found 0x%.8xUL\n", - (unsigned)LCFG_SET_TIMEOUT); - LASSERTF(LCFG_SET_UPCALL == 0x000cf00aUL, "found 0x%.8xUL\n", - (unsigned)LCFG_SET_UPCALL); - LASSERTF(LCFG_ADD_CONN == 0x000cf00bUL, "found 0x%.8xUL\n", - (unsigned)LCFG_ADD_CONN); - LASSERTF(LCFG_DEL_CONN == 0x000cf00cUL, "found 0x%.8xUL\n", - (unsigned)LCFG_DEL_CONN); - LASSERTF(LCFG_LOV_ADD_OBD == 0x000cf00dUL, "found 0x%.8xUL\n", - (unsigned)LCFG_LOV_ADD_OBD); - LASSERTF(LCFG_LOV_DEL_OBD == 0x000cf00eUL, "found 0x%.8xUL\n", - (unsigned)LCFG_LOV_DEL_OBD); - LASSERTF(LCFG_PARAM == 0x000cf00fUL, "found 0x%.8xUL\n", - (unsigned)LCFG_PARAM); - LASSERTF(LCFG_MARKER == 0x000cf010UL, "found 0x%.8xUL\n", - (unsigned)LCFG_MARKER); - LASSERTF(LCFG_LOG_START == 0x000ce011UL, "found 0x%.8xUL\n", - (unsigned)LCFG_LOG_START); - LASSERTF(LCFG_LOG_END == 0x000ce012UL, "found 0x%.8xUL\n", - (unsigned)LCFG_LOG_END); - LASSERTF(LCFG_LOV_ADD_INA == 0x000ce013UL, "found 0x%.8xUL\n", - (unsigned)LCFG_LOV_ADD_INA); - LASSERTF(LCFG_ADD_MDC == 0x000cf014UL, "found 0x%.8xUL\n", - (unsigned)LCFG_ADD_MDC); - LASSERTF(LCFG_DEL_MDC == 0x000cf015UL, "found 0x%.8xUL\n", - (unsigned)LCFG_DEL_MDC); - 
LASSERTF(LCFG_SPTLRPC_CONF == 0x000ce016UL, "found 0x%.8xUL\n", - (unsigned)LCFG_SPTLRPC_CONF); - LASSERTF(LCFG_POOL_NEW == 0x000ce020UL, "found 0x%.8xUL\n", - (unsigned)LCFG_POOL_NEW); - LASSERTF(LCFG_POOL_ADD == 0x000ce021UL, "found 0x%.8xUL\n", - (unsigned)LCFG_POOL_ADD); - LASSERTF(LCFG_POOL_REM == 0x000ce022UL, "found 0x%.8xUL\n", - (unsigned)LCFG_POOL_REM); - LASSERTF(LCFG_POOL_DEL == 0x000ce023UL, "found 0x%.8xUL\n", - (unsigned)LCFG_POOL_DEL); - LASSERTF(LCFG_SET_LDLM_TIMEOUT == 0x000ce030UL, "found 0x%.8xUL\n", - (unsigned)LCFG_SET_LDLM_TIMEOUT); - LASSERTF(LCFG_PRE_CLEANUP == 0x000cf031UL, "found 0x%.8xUL\n", - (unsigned)LCFG_PRE_CLEANUP); - LASSERTF(LCFG_SET_PARAM == 0x000ce032UL, "found 0x%.8xUL\n", - (unsigned)LCFG_SET_PARAM); - LASSERTF(LCFG_NODEMAP_ADD == 0x000ce040UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ADD); - LASSERTF(LCFG_NODEMAP_DEL == 0x000ce041UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_DEL); - LASSERTF(LCFG_NODEMAP_ADD_RANGE == 0x000ce042UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ADD_RANGE); - LASSERTF(LCFG_NODEMAP_DEL_RANGE == 0x000ce043UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_DEL_RANGE); - LASSERTF(LCFG_NODEMAP_ADD_UIDMAP == 0x000ce044UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ADD_UIDMAP); - LASSERTF(LCFG_NODEMAP_DEL_UIDMAP == 0x000ce045UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_DEL_UIDMAP); - LASSERTF(LCFG_NODEMAP_ADD_GIDMAP == 0x000ce046UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ADD_GIDMAP); - LASSERTF(LCFG_NODEMAP_DEL_GIDMAP == 0x000ce047UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_DEL_GIDMAP); - LASSERTF(LCFG_NODEMAP_ACTIVATE == 0x000ce048UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ACTIVATE); - LASSERTF(LCFG_NODEMAP_ADMIN == 0x000ce049UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ADMIN); - LASSERTF(LCFG_NODEMAP_TRUSTED == 0x000ce050UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_TRUSTED); - LASSERTF(LCFG_NODEMAP_SQUASH_UID == 0x000ce051UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_SQUASH_UID); - LASSERTF(LCFG_NODEMAP_SQUASH_GID == 0x000ce052UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_SQUASH_GID); - LASSERTF(LCFG_NODEMAP_ADD_SHKEY == 0x000ce053UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_ADD_SHKEY); - LASSERTF(LCFG_NODEMAP_DEL_SHKEY == 0x000ce054UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_DEL_SHKEY); - LASSERTF(LCFG_NODEMAP_TEST_NID == 0x000ce055UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_TEST_NID); - LASSERTF(LCFG_NODEMAP_TEST_ID == 0x000ce056UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_TEST_ID); - LASSERTF(LCFG_NODEMAP_SET_FILESET == 0x000ce057UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_SET_FILESET); - LASSERTF(LCFG_NODEMAP_DENY_UNKNOWN == 0x000ce058UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_DENY_UNKNOWN); - LASSERTF(LCFG_NODEMAP_MAP_MODE == 0x000ce059UL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_MAP_MODE); - LASSERTF(LCFG_NODEMAP_AUDIT_MODE == 0x000ce05aUL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_AUDIT_MODE); - LASSERTF(LCFG_NODEMAP_SET_SEPOL == 0x000ce05bUL, "found 0x%.8xUL\n", - (unsigned)LCFG_NODEMAP_SET_SEPOL); - LASSERTF(PORTALS_CFG_TYPE == 1, "found %lld\n", - (long long)PORTALS_CFG_TYPE); - LASSERTF(LUSTRE_CFG_TYPE == 123, "found %lld\n", - (long long)LUSTRE_CFG_TYPE); + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); 
+ LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + + /* Checks for struct ladvise_hdr */ + LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n", + LADVISE_MAGIC); + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + 
LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LF_ASYNC); } diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c index 54b3e567b3605..6145e0e37a711 100644 --- a/drivers/staging/lustrefsx/lustre/target/barrier.c +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2016, Intel Corporation. * * lustre/target/barrier.c * @@ -35,11 +35,12 @@ #include +#include #include #include #include #include -#include +#include static LIST_HEAD(barrier_instance_list); static DEFINE_SPINLOCK(barrier_instance_lock); @@ -52,7 +53,7 @@ struct barrier_instance { rwlock_t bi_rwlock; struct percpu_counter bi_writers; atomic_t bi_ref; - time64_t bi_deadline; + time_t bi_deadline; __u32 bi_status; }; @@ -172,7 +173,7 @@ static void barrier_set(struct barrier_instance *barrier, __u32 status) static int barrier_freeze(const struct lu_env *env, struct barrier_instance *barrier, bool phase1) { - time64_t left; + int left; int rc = 0; __s64 inflight = 0; ENTRY; @@ -194,7 +195,7 @@ static int barrier_freeze(const struct lu_env *env, LASSERT(barrier->bi_deadline != 0); - left = barrier->bi_deadline - ktime_get_real_seconds(); + left = barrier->bi_deadline - cfs_time_current_sec(); if (left <= 0) RETURN(1); @@ -213,7 +214,8 @@ static int barrier_freeze(const struct lu_env *env, if (rc) RETURN(rc); - if (ktime_get_real_seconds() > barrier->bi_deadline) + if (cfs_time_beforeq(barrier->bi_deadline, + cfs_time_current_sec())) RETURN(1); } @@ -250,7 +252,7 @@ bool barrier_entry(struct dt_device *key) if (likely(barrier->bi_status != BS_FREEZING_P1 && barrier->bi_status != BS_FREEZING_P2 && barrier->bi_status != BS_FROZEN) || - ktime_get_real_seconds() > barrier->bi_deadline) { + cfs_time_beforeq(barrier->bi_deadline, cfs_time_current_sec())) { percpu_counter_inc(&barrier->bi_writers); entered = true; } @@ -290,7 +292,7 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) ENTRY; /* glimpse on barrier locks always packs a glimpse descriptor */ - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_DESC_CALLBACK); desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); if (!desc) GOTO(out, rc = -EPROTO); @@ -324,8 +326,8 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) GOTO(fini, rc = -EINVAL); - barrier->bi_deadline = ktime_get_real_seconds() + - desc->lgbd_timeout; + barrier->bi_deadline = cfs_time_current_sec() + + desc->lgbd_timeout; rc = barrier_freeze(&env, barrier, desc->lgbd_status == BS_FREEZING_P1); break; @@ -356,7 +358,7 @@ int barrier_handler(struct dt_device 
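Editorial aside (illustrative only, not part of the patch): the LASSERTF checks for struct lu_ladvise and struct ladvise_hdr above pin the wire layout of these structures, so any change to field order, size, or padding is caught at once instead of surfacing as a peer that can no longer decode the RPC. A minimal userspace analogue of the same technique, using C11 _Static_assert with the sizes and offsets taken from the assertions above (the struct itself is a stand-in, not the Lustre definition):

#include <stddef.h>
#include <stdint.h>

struct demo_ladvise {			/* hypothetical stand-in for struct lu_ladvise */
	uint16_t lla_advice;
	uint16_t lla_value1;
	uint32_t lla_value2;
	uint64_t lla_start;
	uint64_t lla_end;
	uint32_t lla_value3;
	uint32_t lla_value4;
};

/* On the usual ABIs these hold; a layout change breaks the build, which is
 * the compile-time counterpart of the run-time LASSERTF wire checks. */
_Static_assert(sizeof(struct demo_ladvise) == 32, "wire size changed");
_Static_assert(offsetof(struct demo_ladvise, lla_start) == 8, "lla_start moved");
_Static_assert(offsetof(struct demo_ladvise, lla_end) == 16, "lla_end moved");
_Static_assert(offsetof(struct demo_ladvise, lla_value4) == 28, "lla_value4 moved");

int main(void) { return 0; }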
*key, struct ptlrpc_request *req) lvb->lvb_index = barrier_dev_idx(barrier); CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " - "deadline %lld: rc = %d\n", barrier_barrier2name(barrier), + "deadline %lu: rc = %d\n", barrier_barrier2name(barrier), lvb->lvb_status, barrier->bi_deadline, rc); barrier_instance_put(barrier); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c b/drivers/staging/lustrefsx/lustre/target/out_handler.c index a238f588e0cd1..c342ae41f95c0 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_handler.c +++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. * * lustre/target/out_handler.c * @@ -52,7 +52,7 @@ static void out_reconstruct(const struct lu_env *env, struct dt_device *dt, struct object_update_reply *reply, int index) { - CDEBUG(D_HA, "%s: fork reply reply %p index %d: rc = %d\n", + CDEBUG(D_INFO, "%s: fork reply reply %p index %d: rc = %d\n", dt_obd_name(dt), reply, index, 0); object_update_result_insert(reply, NULL, 0, index, 0); @@ -65,10 +65,16 @@ typedef void (*out_reconstruct_t)(const struct lu_env *env, struct object_update_reply *reply, int index); -static inline bool out_check_resent(struct ptlrpc_request *req) +static inline int out_check_resent(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *obj, + struct ptlrpc_request *req, + out_reconstruct_t reconstruct, + struct object_update_reply *reply, + int index) { if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) - return false; + return 0; if (req_xid_is_last(req)) { struct lsd_client_data *lcd; @@ -84,12 +90,14 @@ static inline bool out_check_resent(struct ptlrpc_request *req) lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); lustre_msg_set_status(req->rq_repmsg, req->rq_status); - DEBUG_REQ(D_HA, req, "reconstruct resent RPC"); - return true; + DEBUG_REQ(D_RPCTRACE, req, "restoring resent RPC"); + + reconstruct(env, dt, obj, reply, index); + return 1; } - DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld", - req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); - return false; + DEBUG_REQ(D_HA, req, "no reply for RESENT req (have %lld)", + req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); + return 0; } static int out_create(struct tgt_session_info *tsi) @@ -281,62 +289,10 @@ static int out_xattr_get(struct tgt_session_info *tsi) } else if (lbuf->lb_buf) { lbuf->lb_len = rc; } - CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d\n", - tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), - name, rc); - - GOTO(out, rc); - -out: - object_update_result_insert(reply, lbuf->lb_buf, lbuf->lb_len, idx, rc); - RETURN(0); -} - -static int out_xattr_list(struct tgt_session_info *tsi) -{ - const struct lu_env *env = tsi->tsi_env; - struct tgt_thread_info *tti = tgt_th_info(env); - struct lu_buf *lbuf = &tti->tti_buf; - struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; - struct dt_object *obj = tti->tti_u.update.tti_dt_object; - struct object_update_result *update_result; - int idx = tti->tti_u.update.tti_update_reply_index; - int rc; - - ENTRY; - - if (!lu_object_exists(&obj->do_lu)) { - set_bit(LU_OBJECT_HEARD_BANSHEE, - &obj->do_lu.lo_header->loh_flags); - RETURN(-ENOENT); - } - - update_result = object_update_result_get(reply, 0, NULL); - if (!update_result) { - rc = -EPROTO; - CERROR("%s: empty buf for xattr list: rc = %d\n", - tgt_name(tsi->tsi_tgt), 
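Editorial aside (a sketch of the idea, not the patch's code): out_check_resent() above decides whether a resent update RPC must be re-executed or answered from saved per-client state, so a modification is never applied twice. Stripped of the Lustre types, the decision reduces to roughly this (all names here are invented for the example):

#include <stdbool.h>
#include <stdint.h>

struct demo_client_state {
	uint64_t last_xid;	/* xid of the last request already executed */
	int	 last_status;	/* status returned for that xid */
};

/* Returns true when the saved reply was restored and the request must not be
 * executed again; false when normal processing should continue. */
bool demo_check_resent(const struct demo_client_state *cs, uint64_t xid,
		       bool resent_flag, int *status_out)
{
	if (!resent_flag)
		return false;

	if (xid == cs->last_xid) {
		*status_out = cs->last_status;	/* replay the recorded outcome */
		return true;
	}

	/* Resent request with an unexpected xid: fall through and let the
	 * caller process it as a fresh request (the code above logs this). */
	return false;
}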
rc); - RETURN(rc); - } - - lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; - lbuf->lb_buf = update_result->our_data; - if (lbuf->lb_len == 0) - lbuf->lb_buf = 0; - - dt_read_lock(env, obj, MOR_TGT_CHILD); - rc = dt_xattr_list(env, obj, lbuf); - dt_read_unlock(env, obj); - if (rc <= 0) { - lbuf->lb_len = 0; - if (unlikely(!rc)) - rc = -ENODATA; - } else if (lbuf->lb_buf) { - lbuf->lb_len = rc; - } - CDEBUG(D_INFO, "%s: "DFID" list xattr len %d\n", - tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), rc); + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d: rc = %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), + name, (int)lbuf->lb_len, rc); /* Since we directly use update_result->our_data as the lbuf->lb_buf, * then use NULL for result_insert to avoid unnecessary memory copy. */ @@ -803,8 +759,6 @@ static struct tgt_handler out_update_ops[] = { DEF_OUT_HNDL(OUT_WRITE, "out_write", MUTABOR | HABEO_REFERO, out_write), DEF_OUT_HNDL(OUT_READ, "out_read", HABEO_REFERO, out_read), DEF_OUT_HNDL(OUT_NOOP, "out_noop", HABEO_REFERO, out_noop), - DEF_OUT_HNDL(OUT_XATTR_LIST, "out_xattr_list", HABEO_REFERO, - out_xattr_list), }; static struct tgt_handler *out_handler_find(__u32 opc) @@ -963,8 +917,6 @@ int out_handle(struct tgt_session_info *tsi) int rc1 = 0; int ouh_size, reply_size; int updates; - bool need_reconstruct; - ENTRY; req_capsule_set(pill, &RQF_OUT_UPDATE); @@ -1102,8 +1054,6 @@ int out_handle(struct tgt_session_info *tsi) tti->tti_u.update.tti_update_reply = reply; tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); - need_reconstruct = out_check_resent(pill->rc_req); - /* Walk through updates in the request to execute them */ for (i = 0; i < update_buf_count; i++) { struct tgt_handler *h; @@ -1151,19 +1101,12 @@ int out_handle(struct tgt_session_info *tsi) /* Check resend case only for modifying RPC */ if (h->th_flags & MUTABOR) { - /* sanity check for last XID changing */ - if (unlikely(!need_reconstruct && - req_xid_is_last(pill->rc_req))) { - DEBUG_REQ(D_ERROR, pill->rc_req, - "unexpected last XID change"); - GOTO(next, rc = -EINVAL); - } + struct ptlrpc_request *req = tgt_ses_req(tsi); - if (need_reconstruct) { - out_reconstruct(env, dt, dt_obj, reply, - reply_index); + if (out_check_resent(env, dt, dt_obj, req, + out_reconstruct, reply, + reply_index)) GOTO(next, rc = 0); - } if (dt->dd_rdonly) GOTO(next, rc = -EROFS); @@ -1172,10 +1115,6 @@ int out_handle(struct tgt_session_info *tsi) /* start transaction for modification RPC only */ if (h->th_flags & MUTABOR && current_batchid == -1) { current_batchid = update->ou_batchid; - - if (reply_index == 0) - CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); - rc = out_tx_start(env, dt, ta, tsi->tsi_exp); if (rc != 0) GOTO(next, rc); diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c index e8ebf95f4786c..c267ed20bf485 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_lib.c +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2017, Intel Corporation. + * Copyright (c) 2014, 2015, Intel Corporation. 
*/ /* * lustre/target/out_lib.c @@ -53,7 +53,6 @@ const char *update_op_str(__u16 opc) [OUT_ATTR_GET] = "attr_get", [OUT_XATTR_SET] = "xattr_set", [OUT_XATTR_GET] = "xattr_get", - [OUT_XATTR_LIST] = "xattr_list", [OUT_INDEX_LOOKUP] = "lookup", [OUT_INDEX_INSERT] = "insert", [OUT_INDEX_DELETE] = "delete", @@ -103,7 +102,7 @@ int out_update_header_pack(const struct lu_env *env, unsigned int i; size_t update_size; - if (reply_size >= LNET_MTU) + if (((reply_size + 7) >> 3) >= 1ULL << 16) return -EINVAL; /* Check whether the packing exceeding the maxima update length */ @@ -405,15 +404,6 @@ int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, } EXPORT_SYMBOL(out_xattr_get_pack); -int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, - size_t *max_update_size, const struct lu_fid *fid, - const int bufsize) -{ - return out_update_pack(env, update, max_update_size, OUT_XATTR_LIST, - fid, 0, NULL, NULL, bufsize); -} -EXPORT_SYMBOL(out_xattr_list_pack); - int out_read_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_size, const struct lu_fid *fid, size_t size, loff_t pos) @@ -598,10 +588,6 @@ int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, struct tx_arg *arg; int rc; - /* LU-13653: ignore quota for DNE directory creation */ - if (dof->dof_type == DFT_DIR) - th->th_ignore_quota = 1; - rc = dt_declare_create(env, obj, attr, NULL, dof, th); if (rc != 0) return rc; @@ -671,10 +657,6 @@ int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, if (rc != 0) return rc; - if (attr->la_valid & LA_FLAGS && - attr->la_flags & LUSTRE_SET_SYNC_FL) - th->th_sync |= 1; - arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, file, line); if (IS_ERR(arg)) @@ -815,7 +797,8 @@ static int out_tx_xattr_set_exec(const struct lu_env *env, lu_buf_free(&tbuf); if (update) { - leh->leh_overflow_time = ktime_get_real_seconds(); + leh->leh_overflow_time = + cfs_time_current_sec(); if (unlikely(!leh->leh_overflow_time)) leh->leh_overflow_time++; } @@ -1077,7 +1060,7 @@ static int out_obj_index_insert(const struct lu_env *env, return -ENOTDIR; dt_write_lock(env, dt_obj, MOR_TGT_CHILD); - rc = dt_insert(env, dt_obj, rec, key, th); + rc = dt_insert(env, dt_obj, rec, key, th, 0); dt_write_unlock(env, dt_obj); return rc; diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c deleted file mode 100644 index afbf668e38a70..0000000000000 --- a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c +++ /dev/null @@ -1,363 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved - * Use is subject to license terms. 
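Editorial aside (illustrative pattern only): update_op_str() above maps OUT_* opcodes to printable names with a designated-initializer array, which keeps the table sparse-safe and easy to extend. A standalone version of the same pattern, with made-up opcodes:

#include <stdio.h>

enum demo_op { DEMO_OP_CREATE = 1, DEMO_OP_DESTROY, DEMO_OP_ATTR_SET, DEMO_OP_LAST };

const char *demo_op_str(unsigned int opc)
{
	static const char *names[DEMO_OP_LAST] = {
		[DEMO_OP_CREATE]   = "create",
		[DEMO_OP_DESTROY]  = "destroy",
		[DEMO_OP_ATTR_SET] = "attr_set",
	};

	/* Out-of-range or unnamed opcodes fall back to a fixed string. */
	if (opc >= DEMO_OP_LAST || names[opc] == NULL)
		return "unknown";
	return names[opc];
}

int main(void)
{
	printf("%s\n", demo_op_str(DEMO_OP_DESTROY));
	return 0;
}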
- * - * Copyright (c) 2012, 2014, Intel Corporation. - * - * Copyright (c) 2019, DDN Storage Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * lustre/target/tgt_fmd.c - * - * This file provides functions to handle Filter Modification Data (FMD). - * The FMD is responsible for file attributes to be applied in - * Transaction ID (XID) order, so older requests can't re-write newer - * attributes. - * - * FMD is organized as per-client list and identified by FID of object. Each - * FMD stores FID of object and the highest received XID of modification - * request for this object. - * - * FMD can expire if there are no updates for a long time to keep the list - * reasonably small. - * - * Author: Andreas Dilger - * Author: Mike Pershin - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include - -#include "tgt_internal.h" - -/** - * Drop FMD reference and free it if reference drops to zero. - * - * Must be called with ted_fmd_lock held. - * - * \param[in] exp OBD export - * \param[in] fmd FMD to put - */ -static inline void tgt_fmd_put_nolock(struct obd_export *exp, - struct tgt_fmd_data *fmd) -{ - struct tg_export_data *ted = &exp->exp_target_data; - - assert_spin_locked(&ted->ted_fmd_lock); - if (--fmd->fmd_refcount == 0) { - ted->ted_fmd_count--; - list_del(&fmd->fmd_list); - OBD_SLAB_FREE_PTR(fmd, tgt_fmd_kmem); - } -} - -/** - * Wrapper to drop FMD reference with ted_fmd_lock held. - * - * \param[in] exp OBD export - * \param[in] fmd FMD to put - */ -void tgt_fmd_put(struct obd_export *exp, struct tgt_fmd_data *fmd) -{ - struct tg_export_data *ted = &exp->exp_target_data; - - spin_lock(&ted->ted_fmd_lock); - tgt_fmd_put_nolock(exp, fmd); /* caller reference */ - spin_unlock(&ted->ted_fmd_lock); -} - -/** - * Expire FMD entries. - * - * Expire entries from the FMD list if there are too many - * of them or they are too old. - * - * This function must be called with ted_fmd_lock held. - * - * The \a keep FMD is not to be expired in any case. This parameter is used - * by ofd_fmd_find_nolock() to prohibit a FMD that was just found from - * expiring. - * - * \param[in] exp OBD export - * \param[in] keep FMD to keep always - */ -static void tgt_fmd_expire_nolock(struct obd_export *exp, - struct tgt_fmd_data *keep) -{ - struct tg_export_data *ted = &exp->exp_target_data; - struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; - time64_t now = ktime_get_seconds(); - struct tgt_fmd_data *fmd, *tmp; - - list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { - if (fmd == keep) - break; - - if (now < fmd->fmd_expire && - ted->ted_fmd_count < lut->lut_fmd_max_num) - break; - - list_del_init(&fmd->fmd_list); - tgt_fmd_put_nolock(exp, fmd); /* list reference */ - } -} - -/** - * Expire FMD entries. - * - * This is a wrapper to call ofd_fmd_expire_nolock() with the required lock. - * - * \param[in] exp OBD export - */ -void tgt_fmd_expire(struct obd_export *exp) -{ - struct tg_export_data *ted = &exp->exp_target_data; - - spin_lock(&ted->ted_fmd_lock); - tgt_fmd_expire_nolock(exp, NULL); - spin_unlock(&ted->ted_fmd_lock); -} - -/** - * Find FMD by specified FID. - * - * Function finds FMD entry by FID in the tg_export_data::ted_fmd_list. - * - * Caller must hold tg_export_data::ted_fmd_lock and take FMD reference. 
- * - * \param[in] exp OBD export - * \param[in] fid FID of FMD to find - * - * \retval struct tgt_fmd_data found by FID - * \retval NULL is FMD is not found - */ -static struct tgt_fmd_data *tgt_fmd_find_nolock(struct obd_export *exp, - const struct lu_fid *fid) -{ - struct tg_export_data *ted = &exp->exp_target_data; - struct tgt_fmd_data *found = NULL, *fmd; - struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; - time64_t now = ktime_get_seconds(); - - assert_spin_locked(&ted->ted_fmd_lock); - - list_for_each_entry_reverse(fmd, &ted->ted_fmd_list, fmd_list) { - if (lu_fid_eq(&fmd->fmd_fid, fid)) { - found = fmd; - list_move_tail(&fmd->fmd_list, &ted->ted_fmd_list); - fmd->fmd_expire = now + lut->lut_fmd_max_age; - break; - } - } - - tgt_fmd_expire_nolock(exp, found); - - return found; -} - -/** - * Find FMD by specified FID with locking. - * - * Wrapper to the ofd_fmd_find_nolock() with correct locks. - * - * \param[in] exp OBD export - * \param[in] fid FID of FMD to find - * - * \retval struct tgt_fmd_data found by FID - * \retval NULL indicates FMD is not found - */ -struct tgt_fmd_data *tgt_fmd_find(struct obd_export *exp, - const struct lu_fid *fid) -{ - struct tg_export_data *ted = &exp->exp_target_data; - struct tgt_fmd_data *fmd; - - spin_lock(&ted->ted_fmd_lock); - fmd = tgt_fmd_find_nolock(exp, fid); - if (fmd) - fmd->fmd_refcount++; /* caller reference */ - spin_unlock(&ted->ted_fmd_lock); - - return fmd; -} - -/** - * Find FMD by FID or create a new one if none is found. - * - * It is possible for this function to return NULL under memory pressure, - * or if the passed FID is zero (which will only cause old entries to expire). - * Currently this is not fatal because any FMD state is transient and - * may also be freed when it gets sufficiently old. - * - * \param[in] exp OBD export - * \param[in] fid FID of FMD to find - * - * \retval struct tgt_fmd_data found by FID - * \retval NULL indicates FMD is not found - */ -struct tgt_fmd_data *tgt_fmd_get(struct obd_export *exp, - const struct lu_fid *fid) -{ - struct tg_export_data *ted = &exp->exp_target_data; - struct tgt_fmd_data *found = NULL, *fmd_new = NULL; - - OBD_SLAB_ALLOC_PTR(fmd_new, tgt_fmd_kmem); - - spin_lock(&ted->ted_fmd_lock); - found = tgt_fmd_find_nolock(exp, fid); - if (fmd_new) { - if (!found) { - list_add_tail(&fmd_new->fmd_list, &ted->ted_fmd_list); - fmd_new->fmd_fid = *fid; - fmd_new->fmd_refcount++; /* list reference */ - found = fmd_new; - ted->ted_fmd_count++; - } else { - OBD_SLAB_FREE_PTR(fmd_new, tgt_fmd_kmem); - } - } - if (found) { - found->fmd_refcount++; /* caller reference */ - found->fmd_expire = ktime_get_seconds() + - class_exp2tgt(exp)->lut_fmd_max_age; - } else { - LCONSOLE_WARN("%s: cannot allocate FMD for "DFID - ", timestamps may be out of sync\n", - exp->exp_obd->obd_name, PFID(fid)); - } - spin_unlock(&ted->ted_fmd_lock); - - return found; -} - -#ifdef DO_FMD_DROP -/** - * Drop FMD list reference so it will disappear when last reference is dropped - * to zero. - * - * This function is called from ofd_destroy() and may only affect - * the one client that is doing the unlink and at worst we have an stale entry - * referencing an object that should never be used again. - * - * NB: this function is used only if DO_FMD_DROP is defined. It is not - * currently defined, so FMD drop doesn't happen and FMD are dropped only - * when expired. 
- * - * \param[in] exp OBD export - * \param[in] fid FID of FMD to drop - */ -void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid) -{ - struct tg_export_data *ted = &exp->exp_target_data; - struct tgt_fmd_data *fmd = NULL; - - spin_lock(&ted->ted_fmd_lock); - fmd = tgt_fmd_find_nolock(exp, fid); - if (fmd) { - list_del_init(&fmd->fmd_list); - tgt_fmd_put_nolock(exp, fmd); - } - spin_unlock(&ted->ted_fmd_lock); -} -EXPORT_SYMBOL(tgt_fmd_drop); -#endif - -/** - * Remove all entries from FMD list. - * - * Cleanup function to free all FMD enries on the given export. - * - * \param[in] exp OBD export - */ -void tgt_fmd_cleanup(struct obd_export *exp) -{ - struct tg_export_data *ted = &exp->exp_target_data; - struct tgt_fmd_data *fmd = NULL, *tmp; - - spin_lock(&ted->ted_fmd_lock); - list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { - list_del_init(&fmd->fmd_list); - if (fmd->fmd_refcount > 1) { - CDEBUG(D_INFO, - "fmd %p still referenced (refcount = %d)\n", - fmd, fmd->fmd_refcount); - } - tgt_fmd_put_nolock(exp, fmd); - } - spin_unlock(&ted->ted_fmd_lock); - LASSERT(list_empty(&exp->exp_target_data.ted_fmd_list)); -} - -/** - * Update FMD with the latest request XID. - * - * Save a new setattr/punch XID in FMD if exists. - * - * \param[in] exp OBD export - * \param[in] fid FID of FMD to find - * \param[in] xid request XID - */ -void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) -{ - struct tgt_fmd_data *fmd; - - fmd = tgt_fmd_get(exp, fid); - if (fmd) { - if (fmd->fmd_mactime_xid < xid) - fmd->fmd_mactime_xid = xid; - tgt_fmd_put(exp, fmd); - } -} -EXPORT_SYMBOL(tgt_fmd_update); - -/** - * Chech that time can be updated by the request with given XID. - * - * Check FMD XID if exists to be less than supplied XID - * - * \param[in] exp OBD export - * \param[in] fid FID of FMD to find - * \param[in] xid request XID - * - * \retval true if FMD has no greater XID, so time attr can be updated - */ -bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) -{ - struct tgt_fmd_data *fmd; - bool can_update = true; - - fmd = tgt_fmd_find(exp, fid); - if (fmd) { - can_update = fmd->fmd_mactime_xid < xid; - tgt_fmd_put(exp, fmd); - } - - return can_update; -} -EXPORT_SYMBOL(tgt_fmd_check); - diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c index 3c5eec062cb4e..083e40020f1fc 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
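Editorial aside (a sketch following the description in the deleted tgt_fmd.c above, with invented names): the point of FMD is to remember the highest modification XID seen per object so that a delayed or resent older request cannot overwrite attributes written by a newer one. The core check/update pair reduces to:

#include <stdbool.h>
#include <stdint.h>

struct demo_fmd {
	uint64_t fid_key;	/* identifies the object this entry tracks */
	uint64_t mactime_xid;	/* highest setattr/punch XID applied so far */
};

/* Record a newer XID; an older XID leaves the entry untouched. */
void demo_fmd_update(struct demo_fmd *fmd, uint64_t xid)
{
	if (fmd->mactime_xid < xid)
		fmd->mactime_xid = xid;
}

/* True when this request may update timestamps, i.e. no newer modification
 * has been recorded for the object. */
bool demo_fmd_check(const struct demo_fmd *fmd, uint64_t xid)
{
	return fmd->mactime_xid < xid;
}

The real code additionally keeps these entries on a per-export list with reference counting and age-based expiry, as the removed comments describe.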
*/ /* * lustre/target/tgt_grant.c @@ -71,7 +71,7 @@ * Author: Johann Lombardi */ -#define DEBUG_SUBSYSTEM S_CLASS +#define DEBUG_SUBSYSTEM S_FILTER #include #include @@ -138,6 +138,11 @@ static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, struct tg_export_data *ted = &exp->exp_target_data; int level = D_CACHE; + if (exp->exp_obd->obd_self_export == exp) + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", exp->exp_obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) level = D_ERROR; CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", @@ -183,7 +188,6 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) struct lu_target *lut = obd->u.obt.obt_lut; struct tg_grants_data *tgd = &lut->lut_tgd; struct obd_export *exp; - struct tg_export_data *ted; u64 maxsize; u64 tot_dirty = 0; u64 tot_pending = 0; @@ -205,15 +209,6 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) spin_lock(&obd->obd_dev_lock); spin_lock(&tgd->tgd_grant_lock); - exp = obd->obd_self_export; - ted = &exp->exp_target_data; - CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " - "%ld\n", obd->obd_name, ted->ted_grant, - ted->ted_pending, ted->ted_dirty); - tot_granted += ted->ted_grant + ted->ted_pending; - tot_pending += ted->ted_pending; - tot_dirty += ted->ted_dirty; - list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, &tot_granted, maxsize); @@ -280,14 +275,14 @@ EXPORT_SYMBOL(tgt_grant_sanity_check); * \retval negative value on error */ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, - struct obd_statfs *osfs, time64_t max_age, int *from_cache) + struct obd_statfs *osfs, __u64 max_age, int *from_cache) { struct tg_grants_data *tgd = &lut->lut_tgd; int rc = 0; ENTRY; spin_lock(&tgd->tgd_osfs_lock); - if (tgd->tgd_osfs_age < max_age || max_age == 0) { + if (cfs_time_before_64(tgd->tgd_osfs_age, max_age) || max_age == 0) { u64 unstable; /* statfs data are too old, get up-to-date one. 
@@ -313,8 +308,6 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, if (unlikely(rc)) GOTO(out, rc); - osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); - spin_lock(&tgd->tgd_grant_lock); spin_lock(&tgd->tgd_osfs_lock); /* calculate how much space was written while we released the @@ -344,7 +337,7 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, /* finally udpate cached statfs data */ tgd->tgd_osfs = *osfs; - tgd->tgd_osfs_age = ktime_get_seconds(); + tgd->tgd_osfs_age = cfs_time_current_64(); tgd->tgd_statfs_inflight--; /* stop tracking */ if (tgd->tgd_statfs_inflight == 0) @@ -390,13 +383,13 @@ static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, struct tg_grants_data *tgd = &lut->lut_tgd; struct tgt_thread_info *tti; struct obd_statfs *osfs; - time64_t max_age; - int rc; + __u64 max_age; + int rc; if (force) max_age = 0; /* get fresh statfs data */ else - max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS; + max_age = cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS); tti = tgt_th_info(env); osfs = &tti->tti_u.osfs; @@ -435,7 +428,6 @@ static u64 tgt_grant_space_left(struct obd_export *exp) u64 left; u64 avail; u64 unstable; - u64 reserved; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -446,8 +438,7 @@ static u64 tgt_grant_space_left(struct obd_export *exp) unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ spin_unlock(&tgd->tgd_osfs_lock); - reserved = left * tgd->tgd_reserved_pcnt / 100; - tot_granted = tgd->tgd_tot_granted + reserved; + tot_granted = tgd->tgd_tot_granted; if (left < tot_granted) { int mask = (left + unstable < @@ -499,7 +490,8 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, struct tg_export_data *ted = &exp->exp_target_data; struct obd_device *obd = exp->exp_obd; struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - long long dirty, dropped; + long dirty; + long dropped; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -523,19 +515,10 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, /* inflate grant counters if required */ if (!exp_grant_param_supp(exp)) { - u64 tmp; oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant); oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty); - /* inflation can bump client's wish to >4GB which doesn't fit - * 32bit o_undirty, limit that .. 
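Editorial aside (illustrative sketch only, names invented): tgt_statfs_internal() and tgt_grant_statfs() above cache the backend statfs result and refresh it only when the cached copy is older than a cutoff (max_age == 0 forces a refresh), so per-RPC grant math does not hit the filesystem every time. The caching decision alone looks like:

#include <stdint.h>
#include <time.h>

struct demo_statfs_cache {
	uint64_t free_bytes;	/* cached result */
	int64_t  age;		/* timestamp (seconds) of the cached result */
};

uint64_t demo_cached_free(struct demo_statfs_cache *c, int64_t max_age,
			  uint64_t (*query_backend)(void))
{
	if (c->age < max_age || max_age == 0) {	/* stale, or refresh forced */
		c->free_bytes = query_backend();
		c->age = (int64_t)time(NULL);
	}
	return c->free_bytes;
}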
*/ - tmp = tgt_grant_inflate(tgd, oa->o_undirty); - if (tmp >= OBD_MAX_GRANT) - tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); - oa->o_undirty = tmp; - tmp = tgt_grant_inflate(tgd, oa->o_dropped); - if (tmp >= OBD_MAX_GRANT) - tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); - oa->o_dropped = tmp; + oa->o_dropped = tgt_grant_inflate(tgd, (u64)oa->o_dropped); + oa->o_undirty = tgt_grant_inflate(tgd, oa->o_undirty); } dirty = oa->o_dirty; @@ -550,13 +533,13 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, tgd->tgd_tot_dirty += dirty - ted->ted_dirty; if (ted->ted_grant < dropped) { CDEBUG(D_CACHE, - "%s: cli %s/%p reports %llu dropped > grant %lu\n", + "%s: cli %s/%p reports %lu dropped > grant %lu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, ted->ted_grant); dropped = 0; } if (tgd->tgd_tot_granted < dropped) { - CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n", + CERROR("%s: cli %s/%p reports %lu dropped > tot_grant %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, tgd->tgd_tot_granted); dropped = 0; @@ -605,14 +588,6 @@ static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa, grant_shrink = oa->o_grant; - if (ted->ted_grant < grant_shrink) { - CDEBUG(D_CACHE, - "%s: cli %s/%p wants %lu shrinked > grant %lu\n", - obd->obd_name, exp->exp_client_uuid.uuid, exp, - grant_shrink, ted->ted_grant); - grant_shrink = ted->ted_grant; - } - ted->ted_grant -= grant_shrink; tgd->tgd_tot_granted -= grant_shrink; @@ -884,7 +859,6 @@ static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp, * have * \param[in] left remaining free space with granted space taken * out - * \param[in] chunk grant allocation unit * \param[in] conservative if set to true, the server should be cautious * and limit how much space is granted back to the * client. Otherwise, the server should try hard to @@ -903,9 +877,6 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, ENTRY; - if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT)) - RETURN(0); - /* When tgd_grant_compat_disable is set, we don't grant any space to * clients not supporting OBD_CONNECT_GRANT_PARAM. 
* Otherwise, space granted to such a client is inflated since it @@ -957,19 +928,18 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, * client would like to have by more than grants for 2 full * RPCs */ - if (want + chunk <= ted->ted_grant) - RETURN(0); if (ted->ted_grant + grant > want + chunk) grant = want + chunk - ted->ted_grant; tgd->tgd_tot_granted += grant; ted->ted_grant += grant; - if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) { + if (ted->ted_grant < 0) { CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, ted->ted_grant, want, curgrant); spin_unlock(&tgd->tgd_grant_lock); + LBUG(); } CDEBUG(D_CACHE, @@ -1083,51 +1053,28 @@ EXPORT_SYMBOL(tgt_grant_connect); void tgt_grant_discard(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; - struct lu_target *lut = class_exp2tgt(exp); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; struct tg_export_data *ted = &exp->exp_target_data; - struct tg_grants_data *tgd; - - if (!lut) - return; - tgd = &lut->lut_tgd; spin_lock(&tgd->tgd_grant_lock); - if (unlikely(tgd->tgd_tot_granted < ted->ted_grant || - tgd->tgd_tot_dirty < ted->ted_dirty)) { - struct obd_export *e; - u64 ttg = 0; - u64 ttd = 0; - - list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) { - LASSERT(exp != e); - ttg += e->exp_target_data.ted_grant; - ttg += e->exp_target_data.ted_pending; - ttd += e->exp_target_data.ted_dirty; - } - if (tgd->tgd_tot_granted < ted->ted_grant) - CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu", - obd->obd_name, exp->exp_client_uuid.uuid, exp, - tgd->tgd_tot_granted, ted->ted_grant, ttg); - if (tgd->tgd_tot_dirty < ted->ted_dirty) - CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu", - obd->obd_name, exp->exp_client_uuid.uuid, exp, - tgd->tgd_tot_dirty, ted->ted_dirty, ttd); - tgd->tgd_tot_granted = ttg; - tgd->tgd_tot_dirty = ttd; - } else { - tgd->tgd_tot_granted -= ted->ted_grant; - tgd->tgd_tot_dirty -= ted->ted_dirty; - } + LASSERTF(tgd->tgd_tot_granted >= ted->ted_grant, + "%s: tot_granted %llu cli %s/%p ted_grant %ld\n", + obd->obd_name, tgd->tgd_tot_granted, + exp->exp_client_uuid.uuid, exp, ted->ted_grant); + tgd->tgd_tot_granted -= ted->ted_grant; ted->ted_grant = 0; - ted->ted_dirty = 0; - - if (tgd->tgd_tot_pending < ted->ted_pending) { - CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n", - obd->obd_name, tgd->tgd_tot_pending, - exp->exp_client_uuid.uuid, exp, ted->ted_pending); - } + LASSERTF(tgd->tgd_tot_pending >= ted->ted_pending, + "%s: tot_pending %llu cli %s/%p ted_pending %ld\n", + obd->obd_name, tgd->tgd_tot_pending, + exp->exp_client_uuid.uuid, exp, ted->ted_pending); /* tgd_tot_pending is handled in tgt_grant_commit as bulk * commmits */ + LASSERTF(tgd->tgd_tot_dirty >= ted->ted_dirty, + "%s: tot_dirty %llu cli %s/%p ted_dirty %ld\n", + obd->obd_name, tgd->tgd_tot_dirty, + exp->exp_client_uuid.uuid, exp, ted->ted_dirty); + tgd->tgd_tot_dirty -= ted->ted_dirty; + ted->ted_dirty = 0; spin_unlock(&tgd->tgd_grant_lock); } EXPORT_SYMBOL(tgt_grant_discard); @@ -1562,131 +1509,3 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, RETURN(rc); } EXPORT_SYMBOL(tgt_grant_commit_cb_add); - -/** - * Show estimate of total amount of dirty data on clients. 
- * - * @kobj kobject embedded in obd_device - * @attr unused - * @buf buf used by sysfs to print out data - * - * Return: 0 on success - * negative value on error - */ -ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct tg_grants_data *tgd; - - tgd = &obd->u.obt.obt_lut->lut_tgd; - return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty); -} -EXPORT_SYMBOL(tot_dirty_show); - -/** - * Show total amount of space granted to clients. - * - * @kobj kobject embedded in obd_device - * @attr unused - * @buf buf used by sysfs to print out data - * - * Return: 0 on success - * negative value on error - */ -ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct tg_grants_data *tgd; - - tgd = &obd->u.obt.obt_lut->lut_tgd; - return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted); -} -EXPORT_SYMBOL(tot_granted_show); - -/** - * Show total amount of space used by IO in progress. - * - * @kobj kobject embedded in obd_device - * @attr unused - * @buf buf used by sysfs to print out data - * - * Return: 0 on success - * negative value on error - */ -ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct tg_grants_data *tgd; - - tgd = &obd->u.obt.obt_lut->lut_tgd; - return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending); -} -EXPORT_SYMBOL(tot_pending_show); - -/** - * Show if grants compatibility mode is disabled. - * - * When tgd_grant_compat_disable is set, we don't grant any space to clients - * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such - * a client is inflated since it consumes PAGE_SIZE of grant space per - * block, (i.e. typically 4kB units), but underlaying file system might have - * block size bigger than page size, e.g. ZFS. See LU-2049 for details. - * - * @kobj kobject embedded in obd_device - * @attr unused - * @buf buf used by sysfs to print out data - * - * Return: string length of @buf output on success - */ -ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - - return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable); -} -EXPORT_SYMBOL(grant_compat_disable_show); - -/** - * Change grant compatibility mode. - * - * Setting tgd_grant_compat_disable prohibit any space granting to clients - * not supporting OBD_CONNECT_GRANT_PARAM. See details above. 
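Editorial aside (illustrative arithmetic, constants are examples): the grant_compat_disable comment above explains that clients without OBD_CONNECT_GRANT_PARAM account grant in PAGE_SIZE units, so the server inflates their numbers on the assumption that every client page may occupy a full backend block, which can be much larger (e.g. on ZFS). In round numbers:

#include <stdint.h>

#define DEMO_PAGE_SHIFT  12U	/* 4 KiB client pages */
#define DEMO_BLOCK_SHIFT 17U	/* e.g. 128 KiB backend blocks */

/* Worst-case inflation: each page of client-side accounting is assumed to
 * land in its own backend block. */
uint64_t demo_grant_inflate(uint64_t client_bytes)
{
	uint64_t pages = (client_bytes + (1ULL << DEMO_PAGE_SHIFT) - 1)
			 >> DEMO_PAGE_SHIFT;

	return pages << DEMO_BLOCK_SHIFT;
}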
- * - * @kobj kobject embedded in obd_device - * @attr unused - * @buffer string which represents mode - * 1: disable compatibility mode - * 0: enable compatibility mode - * @count @buffer length - * - * Return: @count on success - * negative number on error - */ -ssize_t grant_compat_disable_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - bool val; - int rc; - - rc = kstrtobool(buffer, &val); - if (rc) - return rc; - - tgd->tgd_grant_compat_disable = val; - - return count; -} -EXPORT_SYMBOL(grant_compat_disable_store); diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c index 2ec6d01e60d91..d2113af69436b 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2017, Intel Corporation. + * Copyright (c) 2013, 2016, Intel Corporation. */ /* * lustre/target/tgt_handler.c @@ -343,13 +343,10 @@ static int tgt_request_preprocess(struct tgt_session_info *tsi, dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ); if (dlm_req != NULL) { - union ldlm_wire_policy_data *policy = - &dlm_req->lock_desc.l_policy_data; - if (unlikely(dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS && - (policy->l_inodebits.bits | - policy->l_inodebits.try_bits) == 0)) { + dlm_req->lock_desc.l_policy_data.\ + l_inodebits.bits == 0)) { /* * Lock without inodebits makes no sense and * will oops later in ldlm. If client miss to @@ -434,20 +431,6 @@ static int tgt_handle_request0(struct tgt_session_info *tsi, &RMF_ACL, RCL_SERVER, LUSTRE_POSIX_ACL_MAX_SIZE_OLD); - if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO, - RCL_SERVER)) { - struct niobuf_remote *remote_nb = - req_capsule_client_get(tsi->tsi_pill, - &RMF_NIOBUF_REMOTE); - struct ost_body *body = tsi->tsi_ost_body; - - req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO, - RCL_SERVER, - (body->oa.o_valid & OBD_MD_FLFLAGS && - body->oa.o_flags & OBD_FL_SHORT_IO) ? - remote_nb[0].rnb_len : 0); - } - rc = req_capsule_server_pack(tsi->tsi_pill); } @@ -613,14 +596,8 @@ static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) /* opcode was not found in slice */ if (unlikely(s->tos_hs == NULL)) { - static bool printed; - - /* don't print error messages for known unhandled RPCs */ - if (opc != OST_FALLOCATE && opc != OST_SEEK && !printed) { - CERROR("%s: no handler for opcode 0x%x from %s\n", - tgt_name(tgt), opc, libcfs_id2str(req->rq_peer)); - printed = true; - } + CERROR("%s: no handlers for opcode 0x%x\n", tgt_name(tgt), + opc); RETURN(ERR_PTR(-ENOTSUPP)); } @@ -668,19 +645,6 @@ static int process_req_last_xid(struct ptlrpc_request *req) RETURN(-EPROTO); } - /* The "last_xid" is the minimum xid among unreplied requests, - * if the request is from the previous connection, its xid can - * still be larger than "exp_last_xid", then the above check of - * xid is not enough to determine whether the request is delayed. 
- * - * For example, if some replay request was delayed and caused - * timeout at client and the replay is restarted, the delayed - * replay request will have the larger xid than "exp_last_xid" - */ - if (req->rq_export->exp_conn_cnt > - lustre_msg_get_conn_cnt(req->rq_reqmsg)) - RETURN(-ESTALE); - /* try to release in-memory reply data */ if (tgt_is_multimodrpcs_client(req->rq_export)) { tgt_handle_received_xid(req->rq_export, @@ -707,18 +671,8 @@ int tgt_request_handle(struct ptlrpc_request *req) bool is_connect = false; ENTRY; - if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { - if (cfs_fail_val == 0 && - lustre_msg_get_opc(msg) != OBD_PING && - lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) { - struct l_wait_info lwi = { 0 }; - - cfs_fail_val = 1; - cfs_race_state = 0; - l_wait_event(cfs_race_waitq, (cfs_race_state == 1), - &lwi); - } - } + /* Refill the context, to make sure all thread keys are allocated */ + lu_env_refill(req->rq_svc_thread->t_env); req_capsule_init(&req->rq_pill, req, RCL_SERVER); tsi->tsi_pill = &req->rq_pill; @@ -882,9 +836,9 @@ EXPORT_SYMBOL(tgt_counter_incr); int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp) { - struct lu_target *tgt = class_exp2tgt(exp); - struct sptlrpc_flavor flvr; - int rc = 0; + struct lu_target *tgt = class_exp2tgt(exp); + struct sptlrpc_flavor flvr; + int rc = 0; LASSERT(tgt); LASSERT(tgt->lut_obd); @@ -909,13 +863,13 @@ int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp exp->exp_sp_peer = req->rq_sp_from; exp->exp_flvr = flvr; - /* when on mgs, if no restriction is set, or if the client - * NID is on the local node, allow any flavor - */ + /* when on mgs, if no restriction is set, or if client + * is loopback, allow any flavor */ if ((strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MGS_NAME) == 0) && (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL || - LNetIsPeerLocal(exp->exp_connection->c_peer.nid))) + LNET_NETTYP(LNET_NIDNET(exp->exp_connection->c_peer.nid)) + == LOLND)) exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && @@ -995,19 +949,9 @@ int tgt_connect(struct tgt_session_info *tsi) reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); spin_lock(&tsi->tsi_exp->exp_lock); *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; - if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2) - *exp_connect_flags2_ptr(tsi->tsi_exp) = - reply->ocd_connect_flags2; tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; spin_unlock(&tsi->tsi_exp->exp_lock); - if (strcmp(tsi->tsi_exp->exp_obd->obd_type->typ_name, - LUSTRE_MDT_NAME) == 0) { - rc = req_check_sepol(tsi->tsi_pill); - if (rc) - GOTO(out, rc); - } - RETURN(0); out: obd_disconnect(class_export_get(tsi->tsi_exp)); @@ -1021,8 +965,6 @@ int tgt_disconnect(struct tgt_session_info *tsi) ENTRY; - OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DISCONNECT_DELAY, cfs_fail_val); - rc = target_handle_disconnect(tgt_ses_req(tsi)); if (rc) RETURN(err_serious(rc)); @@ -1040,16 +982,7 @@ int tgt_obd_ping(struct tgt_session_info *tsi) ENTRY; - /* The target-specific part of OBD_PING request handling. - * It controls Filter Modification Data (FMD) expiration each time - * PING is received. - * - * Valid only for replayable targets, e.g. 
MDT and OFD - */ - if (tsi->tsi_exp->exp_obd->obd_replayable) - tgt_fmd_expire(tsi->tsi_exp); - - rc = req_capsule_server_pack(tsi->tsi_pill); + rc = target_handle_ping(tgt_ses_req(tsi)); if (rc) RETURN(err_serious(rc)); @@ -1219,6 +1152,7 @@ static int tgt_obd_idx_read(struct tgt_session_info *tsi) struct tgt_handler tgt_obd_handlers[] = { TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), +TGT_OBD_HDL_VAR(0, OBD_LOG_CANCEL, tgt_obd_log_cancel), TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) }; EXPORT_SYMBOL(tgt_obd_handlers); @@ -1282,8 +1216,8 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, if (flag == LDLM_CB_CANCELING && (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && - (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_ALWAYS || - (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING && + (tgt->lut_sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL || + (tgt->lut_sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL && ldlm_is_cbpending(lock))) && ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || lock->l_resource->lr_type == LDLM_EXTENT)) { @@ -1292,7 +1226,7 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, rc = lu_env_init(&env, LCT_DT_THREAD); if (unlikely(rc != 0)) - GOTO(err, rc); + RETURN(rc); ost_fid_from_resid(&fid, &lock->l_resource->lr_name, tgt->lut_lsd.lsd_osd_index); @@ -1323,7 +1257,7 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, err_env: lu_env_fini(&env); } -err: + rc = ldlm_server_blocking_ast(lock, desc, data, flag); RETURN(rc); } @@ -1395,7 +1329,7 @@ int tgt_cp_callback(struct tgt_session_info *tsi) /* generic LDLM target handler */ struct tgt_handler tgt_dlm_handlers[] = { TGT_DLM_HDL (HABEO_CLAVIS, LDLM_ENQUEUE, tgt_enqueue), -TGT_DLM_HDL (HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL_VAR(HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) }; @@ -1416,6 +1350,30 @@ int tgt_llog_open(struct tgt_session_info *tsi) } EXPORT_SYMBOL(tgt_llog_open); +int tgt_llog_close(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_close(tgt_ses_req(tsi)); + + RETURN(rc); +} +EXPORT_SYMBOL(tgt_llog_close); + + +int tgt_llog_destroy(struct tgt_session_info *tsi) +{ + int rc; + + ENTRY; + + rc = llog_origin_handle_destroy(tgt_ses_req(tsi)); + + RETURN(rc); +} + int tgt_llog_read_header(struct tgt_session_info *tsi) { int rc; @@ -1458,6 +1416,8 @@ TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), +TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_DESTROY, tgt_llog_destroy), +TGT_LLOG_HDL_VAR(0, LLOG_ORIGIN_HANDLE_CLOSE, tgt_llog_close), }; EXPORT_SYMBOL(tgt_llog_handlers); @@ -1607,48 +1567,13 @@ void tgt_io_thread_done(struct ptlrpc_thread *thread) EXIT; } EXPORT_SYMBOL(tgt_io_thread_done); - -/** - * Helper function for getting Data-on-MDT file server DLM lock - * if asked by client. 
- */ -int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - struct lustre_handle *lh, int mode, __u64 *flags) -{ - union ldlm_policy_data policy = { - .l_inodebits.bits = MDS_INODELOCK_DOM, - }; - int rc; - - ENTRY; - - LASSERT(lh != NULL); - LASSERT(ns != NULL); - LASSERT(!lustre_handle_is_used(lh)); - - rc = ldlm_cli_enqueue_local(NULL, ns, res_id, LDLM_IBITS, &policy, mode, - flags, ldlm_blocking_ast, - ldlm_completion_ast, ldlm_glimpse_ast, - NULL, 0, LVB_T_NONE, NULL, lh); - - RETURN(rc == ELDLM_OK ? 0 : -EIO); -} -EXPORT_SYMBOL(tgt_mdt_data_lock); - -void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode) -{ - LASSERT(lustre_handle_is_used(lh)); - ldlm_lock_decref(lh, mode); -} -EXPORT_SYMBOL(tgt_mdt_data_unlock); - /** * Helper function for getting server side [start, start+count] DLM lock * if asked by client. */ -int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, - struct ldlm_res_id *res_id, __u64 start, __u64 end, - struct lustre_handle *lh, int mode, __u64 *flags) +int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + __u64 start, __u64 end, struct lustre_handle *lh, + int mode, __u64 *flags) { union ldlm_policy_data policy; int rc; @@ -1671,8 +1596,8 @@ int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, else policy.l_extent.end = end | ~PAGE_MASK; - rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_EXTENT, &policy, - mode, flags, ldlm_blocking_ast, + rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_EXTENT, &policy, mode, + flags, ldlm_blocking_ast, ldlm_completion_ast, ldlm_glimpse_ast, NULL, 0, LVB_T_NONE, NULL, lh); RETURN(rc == ELDLM_OK ? 0 : -EIO); @@ -1686,16 +1611,13 @@ void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode) } EXPORT_SYMBOL(tgt_extent_unlock); -static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, - struct ldlm_res_id *res_id, struct obd_ioobj *obj, - struct niobuf_remote *nb, struct lustre_handle *lh, - enum ldlm_mode mode) +int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct obd_ioobj *obj, struct niobuf_remote *nb, + struct lustre_handle *lh, enum ldlm_mode mode) { - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; __u64 flags = 0; int nrbufs = obj->ioo_bufcnt; int i; - int rc; ENTRY; @@ -1712,19 +1634,14 @@ static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) RETURN(-EFAULT); - /* MDT IO for data-on-mdt */ - if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS) - rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags); - else - rc = tgt_extent_lock(env, ns, res_id, nb[0].rnb_offset, - nb[nrbufs - 1].rnb_offset + - nb[nrbufs - 1].rnb_len - 1, - lh, mode, &flags); - RETURN(rc); + RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, + lh, mode, &flags)); } -static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, - struct lustre_handle *lh, enum ldlm_mode mode) +void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) { ENTRY; @@ -1737,82 +1654,86 @@ static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, tgt_extent_unlock(lh, mode); EXIT; } -static int tgt_checksum_niobuf(struct lu_target *tgt, - struct niobuf_local *local_nb, int npages, - int opc, enum cksum_types cksum_type, - __u32 *cksum) + +static __u32 tgt_checksum_bulk(struct lu_target 
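Editorial aside (sketch of the rounding only, constants invented): tgt_extent_lock() above widens the requested byte range to whole pages before enqueueing the DLM extent lock — the visible hunk shows the end being rounded up with end | ~PAGE_MASK, and the start is rounded down in the same way — so sub-page I/O to the same page always contends on the same lock extent:

#include <stdint.h>

#define DEMO_PAGE_SIZE 4096ULL
#define DEMO_PAGE_MASK (~(DEMO_PAGE_SIZE - 1))

void demo_page_align_extent(uint64_t start, uint64_t end,
			    uint64_t *lock_start, uint64_t *lock_end)
{
	*lock_start = start & DEMO_PAGE_MASK;		/* round down to page start */
	*lock_end   = end | (DEMO_PAGE_SIZE - 1);	/* round up to page end */
}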
*tgt, + struct ptlrpc_bulk_desc *desc, int opc, + cksum_type_t cksum_type) { - struct ahash_request *req; + struct cfs_crypto_hash_desc *hdesc; unsigned int bufsize; int i, err; unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + __u32 cksum; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - req = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(req)) { + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { CERROR("%s: unable to initialize checksum hash %s\n", tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(req); + return PTR_ERR(hdesc); } CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); - for (i = 0; i < npages; i++) { + for (i = 0; i < desc->bd_iov_count; i++) { /* corrupt the data before we compute the checksum, to * simulate a client->OST data error */ if (i == 0 && opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { - int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; - int len = local_nb[i].lnb_len; + int off = BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK; + int len = BD_GET_KIOV(desc, i).kiov_len; struct page *np = tgt_page_to_corrupt; + char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off; if (np) { - char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, - KM_USER0); - char *ptr2 = page_address(np); + char *ptr2 = kmap(np) + off; - memcpy(ptr2 + off, ptr + off, len); - memcpy(ptr2 + off, "bad3", min(4, len)); - ll_kunmap_atomic(ptr, KM_USER0); + memcpy(ptr2, ptr, len); + memcpy(ptr2, "bad3", min(4, len)); + kunmap(np); /* LU-8376 to preserve original index for * display in dump_all_bulk_pages() */ - np->index = i; + np->index = BD_GET_KIOV(desc, + i).kiov_page->index; - cfs_crypto_hash_update_page(req, np, off, - len); - continue; + BD_GET_KIOV(desc, i).kiov_page = np; } else { CERROR("%s: can't alloc page for corruption\n", tgt_name(tgt)); } } - cfs_crypto_hash_update_page(req, local_nb[i].lnb_page, - local_nb[i].lnb_page_offset & ~PAGE_MASK, - local_nb[i].lnb_len); + cfs_crypto_hash_update_page(hdesc, + BD_GET_KIOV(desc, i).kiov_page, + BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK, + BD_GET_KIOV(desc, i).kiov_len); /* corrupt the data after we compute the checksum, to * simulate an OST->client data error */ if (i == 0 && opc == OST_READ && OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { - int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; - int len = local_nb[i].lnb_len; + int off = BD_GET_KIOV(desc, i).kiov_offset + & ~PAGE_MASK; + int len = BD_GET_KIOV(desc, i).kiov_len; struct page *np = tgt_page_to_corrupt; + char *ptr = + kmap(BD_GET_KIOV(desc, i).kiov_page) + off; if (np) { - char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, - KM_USER0); - char *ptr2 = page_address(np); + char *ptr2 = kmap(np) + off; - memcpy(ptr2 + off, ptr + off, len); - memcpy(ptr2 + off, "bad4", min(4, len)); - ll_kunmap_atomic(ptr, KM_USER0); + memcpy(ptr2, ptr, len); + memcpy(ptr2, "bad4", min(4, len)); + kunmap(np); /* LU-8376 to preserve original index for * display in dump_all_bulk_pages() */ - np->index = i; + np->index = BD_GET_KIOV(desc, + i).kiov_page->index; - cfs_crypto_hash_update_page(req, np, off, - len); - continue; + BD_GET_KIOV(desc, i).kiov_page = np; } else { CERROR("%s: can't alloc page for corruption\n", tgt_name(tgt)); @@ -1820,17 +1741,17 @@ static int tgt_checksum_niobuf(struct lu_target *tgt, } } - bufsize = sizeof(*cksum); - err = cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); + bufsize = sizeof(cksum); + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); - 
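Editorial aside (standalone sketch, with a trivial stand-in for the kernel crypto hash API): tgt_checksum_bulk() above walks every kiov fragment of the bulk descriptor and feeds each (page, offset, length) piece into one running hash, so client and server can compare a single checksum per bulk RPC. The traversal pattern, independent of the hash actually used:

#include <stddef.h>
#include <stdint.h>

struct demo_frag {
	const unsigned char *buf;	/* start of this fragment */
	size_t len;			/* bytes in this fragment */
};

uint32_t demo_bulk_checksum(const struct demo_frag *frags, size_t count)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < count; i++)
		for (size_t j = 0; j < frags[i].len; j++)
			sum = sum * 31u + frags[i].buf[j];	/* stand-in for crc32c/adler */

	return sum;
}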
return 0; + return cksum; } char dbgcksum_file_name[PATH_MAX]; static void dump_all_bulk_pages(struct obdo *oa, int count, - struct niobuf_local *local_nb, - __u32 server_cksum, __u32 client_cksum) + lnet_kiov_t *iov, __u32 server_cksum, + __u32 client_cksum) { struct file *filp; int rc, i; @@ -1847,9 +1768,9 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, - local_nb[0].lnb_file_offset, - local_nb[count-1].lnb_file_offset + - local_nb[count-1].lnb_len - 1, client_cksum, server_cksum); + (__u64)iov[0].kiov_page->index << PAGE_SHIFT, + ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) + + iov[count - 1].kiov_len - 1, client_cksum, server_cksum); filp = filp_open(dbgcksum_file_name, O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); if (IS_ERR(filp)) { @@ -1865,8 +1786,8 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, } for (i = 0; i < count; i++) { - len = local_nb[i].lnb_len; - buf = kmap(local_nb[i].lnb_page); + len = iov[i].kiov_len; + buf = kmap(iov[i].kiov_page); while (len != 0) { rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); if (rc < 0) { @@ -1879,7 +1800,7 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, CDEBUG(D_INFO, "%s: wrote %d bytes\n", dbgcksum_file_name, rc); } - kunmap(local_nb[i].lnb_page); + kunmap(iov[i].kiov_page); } rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); @@ -1889,15 +1810,13 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, return; } -static int check_read_checksum(struct niobuf_local *local_nb, int npages, - struct obd_export *exp, struct obdo *oa, - const struct lnet_process_id *peer, +static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, + const lnet_process_id_t *peer, __u32 client_cksum, __u32 server_cksum, - enum cksum_types server_cksum_type) + cksum_type_t server_cksum_type) { char *msg; - enum cksum_types cksum_type; - loff_t start, end; + cksum_type_t cksum_type; /* unlikely to happen and only if resend does not occur due to cksum * control failure on Client */ @@ -1907,12 +1826,13 @@ static int check_read_checksum(struct niobuf_local *local_nb, int npages, return 0; } - if (exp->exp_obd->obd_checksum_dump) - dump_all_bulk_pages(oa, npages, local_nb, server_cksum, + if (desc->bd_export->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, desc->bd_iov_count, + &BD_GET_KIOV(desc, 0), server_cksum, client_cksum); - cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); if (cksum_type != server_cksum_type) msg = "the server may have not used the checksum type specified" @@ -1920,237 +1840,24 @@ static int check_read_checksum(struct niobuf_local *local_nb, int npages, else msg = "should have changed on the client or in transit"; - start = local_nb[0].lnb_file_offset; - end = local_nb[npages-1].lnb_file_offset + - local_nb[npages-1].lnb_len - 1; - LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " DFID " object "DOSTID" extent [%llu-%llu], client returned csum" " %x (type %x), server csum %x (type %x)\n", - exp->exp_obd->obd_name, + desc->bd_export->exp_obd->obd_name, msg, libcfs_nid2str(peer->nid), oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, POSTID(&oa->o_oi), - start, end, client_cksum, cksum_type, server_cksum, - server_cksum_type); - + (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT, + ((__u64)BD_GET_KIOV(desc, + desc->bd_iov_count - 1).kiov_page->index + << PAGE_SHIFT) + + BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1, + client_cksum, cksum_type, server_cksum, server_cksum_type); return 1; } -static int tgt_pages2shortio(struct niobuf_local *local, int npages, - unsigned char *buf, int size) -{ - int i, off, len, copied = size; - char *ptr; - - for (i = 0; i < npages; i++) { - off = local[i].lnb_page_offset & ~PAGE_MASK; - len = local[i].lnb_len; - - CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", - i, off, len, size); - if (len > size) - return -EINVAL; - - ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); - memcpy(buf + off, ptr, len); - ll_kunmap_atomic(ptr, KM_USER0); - buf += len; - size -= len; - } - return copied - size; -} - -static int tgt_checksum_niobuf_t10pi(struct lu_target *tgt, - struct niobuf_local *local_nb, - int npages, int opc, - obd_dif_csum_fn *fn, - int sector_size, - u32 *check_sum) -{ - enum cksum_types t10_cksum_type = tgt->lut_dt_conf.ddp_t10_cksum_type; - unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); - const char *obd_name = tgt->lut_obd->obd_name; - struct ahash_request *req; - unsigned int bufsize; - unsigned char *buffer; - struct page *__page; - __u16 *guard_start; - int guard_number; - int used_number = 0; - __u32 cksum; - int rc = 0; - int used; - int i; - - __page = alloc_page(GFP_KERNEL); - if (__page == NULL) - return -ENOMEM; - - req = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(req)) { - CERROR("%s: unable to initialize checksum hash %s\n", - tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(req); - } - - buffer = kmap(__page); - guard_start = (__u16 *)buffer; - guard_number = PAGE_SIZE / sizeof(*guard_start); - for (i = 0; i < npages; i++) { - /* corrupt the data before we compute the checksum, to - * simulate a client->OST data error */ - if (i == 0 && opc == OST_WRITE && - OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { - int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; - int len = local_nb[i].lnb_len; - struct page *np = tgt_page_to_corrupt; - - if (np) { - char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, - KM_USER0); - char *ptr2 = page_address(np); - - memcpy(ptr2 + off, ptr + off, len); - memcpy(ptr2 + off, "bad3", min(4, len)); - ll_kunmap_atomic(ptr, KM_USER0); - - /* LU-8376 to preserve original index for - * display in dump_all_bulk_pages() */ - np->index = i; - - cfs_crypto_hash_update_page(req, np, off, - len); - continue; - } else { - CERROR("%s: can't alloc page for corruption\n", - tgt_name(tgt)); - } - } - - /* - * The left guard number should be able to hold checksums of a - * whole page - */ - if (t10_cksum_type && opc == OST_READ && - local_nb[i].lnb_guard_disk) { - used = DIV_ROUND_UP(local_nb[i].lnb_len, sector_size); - if (used > (guard_number - used_number)) { - rc = -E2BIG; - break; - } - memcpy(guard_start + used_number, - local_nb[i].lnb_guards, - used * sizeof(*local_nb[i].lnb_guards)); - } else { - rc = obd_page_dif_generate_buffer(obd_name, - local_nb[i].lnb_page, - local_nb[i].lnb_page_offset & ~PAGE_MASK, - local_nb[i].lnb_len, guard_start + used_number, - guard_number - used_number, &used, sector_size, - fn); - if (rc) - break; - } - - LASSERT(used <= MAX_GUARD_NUMBER); - /* - * If disk support T10PI checksum, copy guards to local_nb. 
- * If the write is partial page, do not use the guards for bio - * submission since the data might not be full-sector. The bio - * guards will be generated later based on the full sectors. If - * the sector size is 512B rather than 4 KB, or the page size - * is larger than 4KB, this might drop some useful guards for - * partial page write, but it will only add minimal extra time - * of checksum calculation. - */ - if (t10_cksum_type && opc == OST_WRITE && - local_nb[i].lnb_len == PAGE_SIZE) { - local_nb[i].lnb_guard_rpc = 1; - memcpy(local_nb[i].lnb_guards, - guard_start + used_number, - used * sizeof(*local_nb[i].lnb_guards)); - } - - used_number += used; - if (used_number == guard_number) { - cfs_crypto_hash_update_page(req, __page, 0, - used_number * sizeof(*guard_start)); - used_number = 0; - } - - /* corrupt the data after we compute the checksum, to - * simulate an OST->client data error */ - if (unlikely(i == 0 && opc == OST_READ && - OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND))) { - int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; - int len = local_nb[i].lnb_len; - struct page *np = tgt_page_to_corrupt; - - if (np) { - char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, - KM_USER0); - char *ptr2 = page_address(np); - - memcpy(ptr2 + off, ptr + off, len); - memcpy(ptr2 + off, "bad4", min(4, len)); - ll_kunmap_atomic(ptr, KM_USER0); - - /* LU-8376 to preserve original index for - * display in dump_all_bulk_pages() */ - np->index = i; - - cfs_crypto_hash_update_page(req, np, off, - len); - continue; - } else { - CERROR("%s: can't alloc page for corruption\n", - tgt_name(tgt)); - } - } - } - kunmap(__page); - if (rc) - GOTO(out, rc); - - if (used_number != 0) - cfs_crypto_hash_update_page(req, __page, 0, - used_number * sizeof(*guard_start)); - - bufsize = sizeof(cksum); - rc = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); - - if (rc == 0) - *check_sum = cksum; -out: - __free_page(__page); - return rc; -} - -static int tgt_checksum_niobuf_rw(struct lu_target *tgt, - enum cksum_types cksum_type, - struct niobuf_local *local_nb, - int npages, int opc, u32 *check_sum) -{ - obd_dif_csum_fn *fn = NULL; - int sector_size = 0; - int rc; - - ENTRY; - obd_t10_cksum2dif(cksum_type, &fn, §or_size); - - if (fn) - rc = tgt_checksum_niobuf_t10pi(tgt, local_nb, npages, - opc, fn, sector_size, - check_sum); - else - rc = tgt_checksum_niobuf(tgt, local_nb, npages, opc, - cksum_type, check_sum); - RETURN(rc); -} - int tgt_brw_read(struct tgt_session_info *tsi) { struct ptlrpc_request *req = tgt_ses_req(tsi); @@ -2162,15 +1869,12 @@ int tgt_brw_read(struct tgt_session_info *tsi) struct ost_body *body, *repbody; struct l_wait_info lwi; struct lustre_handle lockh = { 0 }; - int npages, nob = 0, rc, i, no_reply = 0, - npages_read; + int npages, nob = 0, rc, i, no_reply = 0; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; - const char *obd_name = exp->exp_obd->obd_name; ENTRY; - if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && - ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { CERROR("%s: deny read request from %s to portal %u\n", tgt_name(tsi->tsi_tgt), obd_export_nid2str(req->rq_export), @@ -2213,8 +1917,8 @@ int tgt_brw_read(struct tgt_session_info *tsi) local_nb = tbc->local; - rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, - &lockh, LCK_PR); + rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, + remote_nb, &lockh, LCK_PR); if (rc != 0) RETURN(rc); @@ 
-2232,17 +1936,6 @@ int tgt_brw_read(struct tgt_session_info *tsi) GOTO(out_lock, rc = -ETIMEDOUT); } - /* - * Because we already sync grant info with client when - * reconnect, grant info will be cleared for resent req, - * otherwise, outdated grant count in the rpc would de-sync - * grant counters in case of shrink - */ - if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { - DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); - body->oa.o_valid &= ~OBD_MD_FLGRANT; - } - repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); repbody->oa = body->oa; @@ -2252,42 +1945,33 @@ int tgt_brw_read(struct tgt_session_info *tsi) if (rc != 0) GOTO(out_lock, rc); - if (body->oa.o_valid & OBD_MD_FLFLAGS && - body->oa.o_flags & OBD_FL_SHORT_IO) { - desc = NULL; - } else { - desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), - PTLRPC_BULK_PUT_SOURCE | - PTLRPC_BULK_BUF_KIOV, - OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_nopin_ops); - if (desc == NULL) - GOTO(out_commitrw, rc = -ENOMEM); - } + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); nob = 0; - npages_read = npages; for (i = 0; i < npages; i++) { int page_rc = local_nb[i].lnb_rc; if (page_rc < 0) { rc = page_rc; - npages_read = i; break; } nob += page_rc; - if (page_rc != 0 && desc != NULL) { /* some data! */ + if (page_rc != 0) { /* some data! */ LASSERT(local_nb[i].lnb_page != NULL); desc->bd_frag_ops->add_kiov_frag (desc, local_nb[i].lnb_page, - local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_page_offset, page_rc); } if (page_rc != local_nb[i].lnb_len) { /* short read */ - local_nb[i].lnb_len = page_rc; - npages_read = i + (page_rc != 0 ? 1 : 0); /* All subsequent pages should be 0 */ while (++i < npages) LASSERT(local_nb[i].lnb_rc == 0); @@ -2299,19 +1983,14 @@ int tgt_brw_read(struct tgt_session_info *tsi) rc = -E2BIG; if (body->oa.o_valid & OBD_MD_FLCKSUM) { - u32 flag = body->oa.o_valid & OBD_MD_FLFLAGS ? - body->oa.o_flags : 0; - enum cksum_types cksum_type = obd_cksum_type_unpack(flag); + cksum_type_t cksum_type = + cksum_type_unpack(body->oa.o_valid & OBD_MD_FLFLAGS ? 
+ body->oa.o_flags : 0); - repbody->oa.o_flags = obd_cksum_type_pack(obd_name, - cksum_type); + repbody->oa.o_flags = cksum_type_pack(cksum_type); repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - - rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, - local_nb, npages_read, OST_READ, - &repbody->oa.o_cksum); - if (rc < 0) - GOTO(out_commitrw, rc); + repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, + OST_READ, cksum_type); CDEBUG(D_PAGE, "checksum at read origin: %x\n", repbody->oa.o_cksum); @@ -2320,46 +1999,21 @@ int tgt_brw_read(struct tgt_session_info *tsi) * zero-cksum case) */ if ((body->oa.o_valid & OBD_MD_FLFLAGS) && (body->oa.o_flags & OBD_FL_RECOV_RESEND)) - check_read_checksum(local_nb, npages_read, exp, - &body->oa, &req->rq_peer, + check_read_checksum(desc, &body->oa, &req->rq_peer, body->oa.o_cksum, repbody->oa.o_cksum, cksum_type); } else { repbody->oa.o_valid = 0; } - if (body->oa.o_valid & OBD_MD_FLGRANT) - repbody->oa.o_valid |= OBD_MD_FLGRANT; /* We're finishing using body->oa as an input variable */ /* Check if client was evicted while we were doing i/o before touching * network */ - if (rc == 0) { - if (body->oa.o_valid & OBD_MD_FLFLAGS && - body->oa.o_flags & OBD_FL_SHORT_IO) { - unsigned char *short_io_buf; - int short_io_size; - - short_io_buf = req_capsule_server_get(&req->rq_pill, - &RMF_SHORT_IO); - short_io_size = req_capsule_get_size(&req->rq_pill, - &RMF_SHORT_IO, - RCL_SERVER); - rc = tgt_pages2shortio(local_nb, npages_read, - short_io_buf, short_io_size); - if (rc >= 0) - req_capsule_shrink(&req->rq_pill, - &RMF_SHORT_IO, rc, - RCL_SERVER); - rc = rc > 0 ? 0 : rc; - } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) { - rc = target_bulk_io(exp, desc, &lwi); - } + if (likely(rc == 0 && + !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) && + !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) { + rc = target_bulk_io(exp, desc, &lwi); no_reply = rc != 0; - } else { - if (body->oa.o_valid & OBD_MD_FLFLAGS && - body->oa.o_flags & OBD_FL_SHORT_IO) - req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0, - RCL_SERVER); } out_commitrw: @@ -2382,15 +2036,13 @@ int tgt_brw_read(struct tgt_session_info *tsi) ptlrpc_req_drop_rs(req); LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " "client will retry: rc %d\n", - obd_name, + exp->exp_obd->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), rc); } /* send a bulk after reply to simulate a network delay or reordering - * by a router - Note that !desc implies short io, so there is no bulk - * to reorder. */ - if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && - desc) { + * by a router */ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) { wait_queue_head_t waitq; struct l_wait_info lwi1; @@ -2407,32 +2059,6 @@ int tgt_brw_read(struct tgt_session_info *tsi) } EXPORT_SYMBOL(tgt_brw_read); -static int tgt_shortio2pages(struct niobuf_local *local, int npages, - unsigned char *buf, unsigned int size) -{ - int i, off, len; - char *ptr; - - for (i = 0; i < npages; i++) { - off = local[i].lnb_page_offset & ~PAGE_MASK; - len = local[i].lnb_len; - - if (len == 0) - continue; - - CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", - i, off, len, size); - ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); - if (ptr == NULL) - return -EINVAL; - memcpy(ptr + off, buf, len < size ? 
len : size); - ll_kunmap_atomic(ptr, KM_USER0); - buf += len; - size -= len; - } - return 0; -} - static void tgt_warn_on_cksum(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc, struct niobuf_local *local_nb, int npages, @@ -2447,13 +2073,14 @@ static void tgt_warn_on_cksum(struct ptlrpc_request *req, body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); LASSERT(body != NULL); - if (desc && req->rq_peer.nid != desc->bd_sender) { + if (req->rq_peer.nid != desc->bd_sender) { via = " via "; router = libcfs_nid2str(desc->bd_sender); } if (exp->exp_obd->obd_checksum_dump) - dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum, + dump_all_bulk_pages(&body->oa, desc->bd_iov_count, + &BD_GET_KIOV(desc, 0), server_cksum, client_cksum); if (mmap) { @@ -2494,16 +2121,14 @@ int tgt_brw_write(struct tgt_session_info *tsi) __u32 *rcs; int objcount, niocount, npages; int rc, i, j; - enum cksum_types cksum_type = OBD_CKSUM_CRC32; + cksum_type_t cksum_type = OBD_CKSUM_CRC32; bool no_reply = false, mmap; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; bool wait_sync = false; - const char *obd_name = exp->exp_obd->obd_name; ENTRY; - if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && - ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { CERROR("%s: deny write request from %s to portal %u\n", tgt_name(tsi->tsi_tgt), obd_export_nid2str(req->rq_export), @@ -2527,9 +2152,6 @@ int tgt_brw_write(struct tgt_session_info *tsi) CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? cfs_fail_val : (obd_timeout + 1) / 4); - /* Delay write commit to show stale size information */ - CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val); - /* There must be big cache in current thread to process this request * if it is NULL then something went wrong and it wasn't allocated, * report -ENOMEM in that case */ @@ -2570,8 +2192,8 @@ int tgt_brw_write(struct tgt_session_info *tsi) local_nb = tbc->local; - rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, - &lockh, LCK_PW); + rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, + remote_nb, &lockh, LCK_PW); if (rc != 0) GOTO(out, rc); @@ -2608,46 +2230,26 @@ int tgt_brw_write(struct tgt_session_info *tsi) objcount, ioo, remote_nb, &npages, local_nb); if (rc < 0) GOTO(out_lock, rc); - if (body->oa.o_valid & OBD_MD_FLFLAGS && - body->oa.o_flags & OBD_FL_SHORT_IO) { - unsigned int short_io_size; - unsigned char *short_io_buf; - - short_io_size = req_capsule_get_size(&req->rq_pill, - &RMF_SHORT_IO, - RCL_CLIENT); - short_io_buf = req_capsule_client_get(&req->rq_pill, - &RMF_SHORT_IO); - CDEBUG(D_INFO, "Client use short io for data transfer," - " size = %d\n", short_io_size); - - /* Copy short io buf to pages */ - rc = tgt_shortio2pages(local_nb, npages, short_io_buf, - short_io_size); - desc = NULL; - } else { - desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), - PTLRPC_BULK_GET_SINK | - PTLRPC_BULK_BUF_KIOV, - OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_nopin_ops); - if (desc == NULL) - GOTO(skip_transfer, rc = -ENOMEM); - - /* NB Having prepped, we must commit... 
*/ - for (i = 0; i < npages; i++) - desc->bd_frag_ops->add_kiov_frag(desc, - local_nb[i].lnb_page, - local_nb[i].lnb_page_offset & ~PAGE_MASK, - local_nb[i].lnb_len); - - rc = sptlrpc_svc_prep_bulk(req, desc); - if (rc != 0) - GOTO(skip_transfer, rc); - rc = target_bulk_io(exp, desc, &lwi); - } + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... */ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset, + local_nb[i].lnb_len); + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); + + rc = target_bulk_io(exp, desc, &lwi); no_reply = rc != 0; skip_transfer: @@ -2655,19 +2257,13 @@ int tgt_brw_write(struct tgt_session_info *tsi) static int cksum_counter; if (body->oa.o_valid & OBD_MD_FLFLAGS) - cksum_type = obd_cksum_type_unpack(body->oa.o_flags); + cksum_type = cksum_type_unpack(body->oa.o_flags); repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; - repbody->oa.o_flags |= obd_cksum_type_pack(obd_name, - cksum_type); - - rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, - local_nb, npages, OST_WRITE, - &repbody->oa.o_cksum); - if (rc < 0) - GOTO(out_commitrw, rc); - + repbody->oa.o_flags |= cksum_type_pack(cksum_type); + repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, + OST_WRITE, cksum_type); cksum_counter++; if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { @@ -2686,7 +2282,6 @@ int tgt_brw_write(struct tgt_session_info *tsi) } } -out_commitrw: /* Must commit after prep above in all cases */ rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo, remote_nb, npages, local_nb, rc); @@ -2742,7 +2337,7 @@ int tgt_brw_write(struct tgt_session_info *tsi) if (!exp->exp_obd->obd_no_transno) LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," " client will retry: rc = %d\n", - obd_name, + exp->exp_obd->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), rc); } diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h index ac7c3c17feb9d..981e2ab9f9ade 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * lustre/target/tgt_internal.h @@ -35,6 +35,7 @@ #define _TG_INTERNAL_H #include +#include #include #include #include @@ -287,19 +288,4 @@ int top_trans_create_tmt(const struct lu_env *env, void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); void barrier_init(void); void barrier_fini(void); - -/* FMD tracking data */ -struct tgt_fmd_data { - struct list_head fmd_list; /* linked to tgt_fmd_list */ - struct lu_fid fmd_fid; /* FID being written to */ - __u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */ - time64_t fmd_expire; /* time when the fmd should expire */ - int fmd_refcount; /* reference counter - list holds 1 */ -}; - -/* tgt_fmd.c */ -extern struct kmem_cache *tgt_fmd_kmem; -void tgt_fmd_expire(struct obd_export *exp); -void tgt_fmd_cleanup(struct obd_export *exp); - #endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c index 0d2fde1be1bc3..c7aecdf2171ea 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2017, Intel Corporation. + * Copyright (c) 2011, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -148,13 +148,6 @@ static int tgt_clear_reply_slot(struct lu_target *lut, int idx) int chunk; int b; - if (lut->lut_obd->obd_stopping) - /* - * in case of failover keep the bit set in order to - * avoid overwriting slots in reply_data which might - * be required by resent rpcs - */ - return 0; chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; b = idx % LUT_REPLY_SLOTS_PER_CHUNK; @@ -395,8 +388,6 @@ int tgt_client_alloc(struct obd_export *exp) spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); - spin_lock_init(&exp->exp_target_data.ted_fmd_lock); - INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); if (exp->exp_target_data.ted_lcd == NULL) @@ -420,8 +411,6 @@ void tgt_client_free(struct obd_export *exp) LASSERT(exp != exp->exp_obd->obd_self_export); - tgt_fmd_cleanup(exp); - /* free reply data */ mutex_lock(&ted->ted_lcd_lock); list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { @@ -844,7 +833,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt) * - there is no client to recover or the recovery was aborted */ if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && - (atomic_read(&tgt->lut_obd->obd_max_recoverable_clients) == 0 || + (tgt->lut_obd->obd_max_recoverable_clients == 0 || tgt->lut_obd->obd_abort_recovery)) tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; @@ -1528,7 +1517,7 @@ static int tgt_clients_data_init(const struct lu_env *env, exp->exp_connecting = 0; exp->exp_in_recovery = 0; spin_unlock(&exp->exp_lock); - atomic_inc(&obd->obd_max_recoverable_clients); + obd->obd_max_recoverable_clients++; if (tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && @@ -1900,6 +1889,7 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) unsigned long reply_data_size; int rc; struct lsd_reply_header *lrh = NULL; + struct lsd_client_data *lcd = NULL; struct tg_reply_data *trd = NULL; int idx; loff_t off; @@ -1948,6 +1938,10 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) if (hash == NULL) 
GOTO(out, rc = -ENODEV); + OBD_ALLOC_PTR(lcd); + if (lcd == NULL) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC_PTR(trd); if (trd == NULL) GOTO(out, rc = -ENOMEM); @@ -1999,13 +1993,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) /* update export last committed transation */ exp->exp_last_committed = max(exp->exp_last_committed, lrd->lrd_transno); - /* Update lcd_last_transno as well for check in - * tgt_release_reply_data() or the latest client - * transno can be lost. - */ - ted->ted_lcd->lcd_last_transno = - max(ted->ted_lcd->lcd_last_transno, - exp->exp_last_committed); mutex_unlock(&ted->ted_lcd_lock); class_export_put(exp); @@ -2037,6 +2024,8 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) out: if (hash != NULL) cfs_hash_putref(hash); + if (lcd != NULL) + OBD_FREE_PTR(lcd); if (trd != NULL) OBD_FREE_PTR(trd); if (lrh != NULL) diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c index ce158941f9c06..12f9fdc1c2138 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_main.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2017, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * lustre/target/tgt_main.c @@ -37,243 +37,6 @@ #include "tgt_internal.h" #include "../ptlrpc/ptlrpc_internal.h" -/* This must be longer than the longest string below */ -#define SYNC_STATES_MAXLEN 16 -static char *sync_lock_cancel_states[] = { - [SYNC_LOCK_CANCEL_NEVER] = "never", - [SYNC_LOCK_CANCEL_BLOCKING] = "blocking", - [SYNC_LOCK_CANCEL_ALWAYS] = "always", -}; - -/** - * Show policy for handling dirty data under a lock being cancelled. - * - * \param[in] kobj sysfs kobject - * \param[in] attr sysfs attribute - * \param[in] buf buffer for data - * - * \retval 0 and buffer filled with data on success - * \retval negative value on error - */ -ssize_t sync_lock_cancel_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lu_target *tgt = obd->u.obt.obt_lut; - - return sprintf(buf, "%s\n", - sync_lock_cancel_states[tgt->lut_sync_lock_cancel]); -} -EXPORT_SYMBOL(sync_lock_cancel_show); - -/** - * Change policy for handling dirty data under a lock being cancelled. - * - * This variable defines what action target takes upon lock cancel - * There are three possible modes: - * 1) never - never do sync upon lock cancel. This can lead to data - * inconsistencies if both the OST and client crash while writing a file - * that is also concurrently being read by another client. In these cases, - * this may allow the file data to "rewind" to an earlier state. - * 2) blocking - do sync only if there is blocking lock, e.g. 
if another - * client is trying to access this same object - * 3) always - do sync always - * - * \param[in] kobj kobject - * \param[in] attr attribute to show - * \param[in] buf buffer for data - * \param[in] count buffer size - * - * \retval \a count on success - * \retval negative value on error - */ -ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lu_target *tgt = obd->u.obt.obt_lut; - int val = -1; - enum tgt_sync_lock_cancel slc; - - if (count == 0 || count >= SYNC_STATES_MAXLEN) - return -EINVAL; - - for (slc = 0; slc < ARRAY_SIZE(sync_lock_cancel_states); slc++) { - if (strcmp(buffer, sync_lock_cancel_states[slc]) == 0) { - val = slc; - break; - } - } - - /* Legacy numeric codes */ - if (val == -1) { - int rc = kstrtoint(buffer, 0, &val); - if (rc) - return rc; - } - - if (val < 0 || val > 2) - return -EINVAL; - - spin_lock(&tgt->lut_flags_lock); - tgt->lut_sync_lock_cancel = val; - spin_unlock(&tgt->lut_flags_lock); - return count; -} -EXPORT_SYMBOL(sync_lock_cancel_store); -LUSTRE_RW_ATTR(sync_lock_cancel); - -/** - * Show maximum number of Filter Modification Data (FMD) maintained. - * - * \param[in] kobj kobject - * \param[in] attr attribute to show - * \param[in] buf buffer for data - * - * \retval 0 and buffer filled with data on success - * \retval negative value on error - */ -ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lu_target *lut = obd->u.obt.obt_lut; - - return sprintf(buf, "%u\n", lut->lut_fmd_max_num); -} - -/** - * Change number of FMDs maintained by target. - * - * This defines how large the list of FMDs can be. - * - * \param[in] kobj kobject - * \param[in] attr attribute to show - * \param[in] buf buffer for data - * \param[in] count buffer size - * - * \retval \a count on success - * \retval negative value on error - */ -ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lu_target *lut = obd->u.obt.obt_lut; - int val, rc; - - rc = kstrtoint(buffer, 0, &val); - if (rc) - return rc; - - if (val < 1 || val > 65536) - return -EINVAL; - - lut->lut_fmd_max_num = val; - - return count; -} -LUSTRE_RW_ATTR(tgt_fmd_count); - -/** - * Show the maximum age of FMD data in seconds. - * - * \param[in] kobj kobject - * \param[in] attr attribute to show - * \param[in] buf buffer for data - * - * \retval 0 and buffer filled with data on success - * \retval negative value on error - */ -ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lu_target *lut = obd->u.obt.obt_lut; - - return sprintf(buf, "%lld\n", lut->lut_fmd_max_age); -} - -/** - * Set the maximum age of FMD data in seconds. - * - * This defines how long FMD data stays in the FMD list. 
- * - * \param[in] kobj kobject - * \param[in] attr attribute to show - * \param[in] buf buffer for data - * \param[in] count buffer size - * - * \retval \a count on success - * \retval negative number on error - */ -ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kset.kobj); - struct lu_target *lut = obd->u.obt.obt_lut; - time64_t val; - int rc; - - rc = kstrtoll(buffer, 0, &val); - if (rc) - return rc; - - if (val < 1 || val > 65536) /* ~ 18 hour max */ - return -EINVAL; - - lut->lut_fmd_max_age = val; - - return count; -} -LUSTRE_RW_ATTR(tgt_fmd_seconds); - -/* These two aliases are old names and kept for compatibility, they were - * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'. - * This change was made in Lustre 2.13, so these aliases can be removed - * when back compatibility is not needed with any Lustre version prior 2.13 - */ -static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count, - 0644, tgt_fmd_count_show, tgt_fmd_count_store); -static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds, - 0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store); - -static const struct attribute *tgt_attrs[] = { - &lustre_attr_sync_lock_cancel.attr, - &lustre_attr_tgt_fmd_count.attr, - &lustre_attr_tgt_fmd_seconds.attr, - &tgt_fmd_count_compat.attr, - &tgt_fmd_seconds_compat.attr, - NULL, -}; - -int tgt_tunables_init(struct lu_target *lut) -{ - int rc; - - rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs); - if (!rc) - lut->lut_attrs = tgt_attrs; - return rc; -} -EXPORT_SYMBOL(tgt_tunables_init); - -void tgt_tunables_fini(struct lu_target *lut) -{ - if (lut->lut_attrs) { - sysfs_remove_files(&lut->lut_obd->obd_kset.kobj, - lut->lut_attrs); - lut->lut_attrs = NULL; - } -} -EXPORT_SYMBOL(tgt_tunables_fini); - /* * Save cross-MDT lock in lut_slc_locks. 
* @@ -389,8 +152,6 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, struct lu_attr attr; struct lu_fid fid; struct dt_object *o; - struct tg_grants_data *tgd = &lut->lut_tgd; - struct obd_statfs *osfs; int i, rc = 0; ENTRY; @@ -418,7 +179,7 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); spin_lock_init(&lut->lut_flags_lock); - lut->lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER; + lut->lut_sync_lock_cancel = NEVER_SYNC_ON_CANCEL; spin_lock_init(&lut->lut_slc_locks_guard); INIT_LIST_HEAD(&lut->lut_slc_locks); @@ -427,38 +188,6 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (!obd->obd_replayable) RETURN(0); - /* initialize grant and statfs data in target */ - dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); - - /* statfs data */ - spin_lock_init(&tgd->tgd_osfs_lock); - tgd->tgd_osfs_age = ktime_get_seconds() - 1000; - tgd->tgd_osfs_unstable = 0; - tgd->tgd_statfs_inflight = 0; - tgd->tgd_osfs_inflight = 0; - - /* grant data */ - spin_lock_init(&tgd->tgd_grant_lock); - tgd->tgd_tot_dirty = 0; - tgd->tgd_tot_granted = 0; - tgd->tgd_tot_pending = 0; - tgd->tgd_grant_compat_disable = 0; - - /* populate cached statfs data */ - osfs = &tgt_th_info(env)->tti_u.osfs; - rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); - if (rc != 0) { - CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut), - rc); - GOTO(out, rc); - } - if (!is_power_of_2(osfs->os_bsize)) { - CERROR("%s: blocksize (%d) is not a power of 2\n", - tgt_name(lut), osfs->os_bsize); - GOTO(out, rc = -EPROTO); - } - tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; - spin_lock_init(&lut->lut_translock); spin_lock_init(&lut->lut_client_bitmap_lock); @@ -496,11 +225,6 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; - lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT; - lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT; - - atomic_set(&lut->lut_sync_count, 0); - /* reply_data is supported by MDT targets only for now */ if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) RETURN(0); @@ -530,6 +254,8 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (rc < 0) GOTO(out, rc); + atomic_set(&lut->lut_sync_count, 0); + RETURN(0); out: @@ -611,44 +337,8 @@ void tgt_fini(const struct lu_env *env, struct lu_target *lut) } EXPORT_SYMBOL(tgt_fini); -static struct kmem_cache *tgt_thread_kmem; -static struct kmem_cache *tgt_session_kmem; -struct kmem_cache *tgt_fmd_kmem; - -static struct lu_kmem_descr tgt_caches[] = { - { - .ckd_cache = &tgt_thread_kmem, - .ckd_name = "tgt_thread_kmem", - .ckd_size = sizeof(struct tgt_thread_info), - }, - { - .ckd_cache = &tgt_session_kmem, - .ckd_name = "tgt_session_kmem", - .ckd_size = sizeof(struct tgt_session_info) - }, - { - .ckd_cache = &tgt_fmd_kmem, - .ckd_name = "tgt_fmd_cache", - .ckd_size = sizeof(struct tgt_fmd_data) - }, - { - .ckd_cache = NULL - } -}; - - /* context key constructor/destructor: tg_key_init, tg_key_fini */ -static void *tgt_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct tgt_thread_info *thread; - - OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS); - if (thread == NULL) - return ERR_PTR(-ENOMEM); - - return thread; -} +LU_KEY_INIT(tgt, struct tgt_thread_info); static void tgt_key_fini(const struct lu_context *ctx, struct lu_context_key *key, void *data) @@ -665,7 +355,7 @@ static void tgt_key_fini(const 
struct lu_context *ctx, if (args->ta_args != NULL) OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * args->ta_alloc_args); - OBD_SLAB_FREE_PTR(info, tgt_thread_kmem); + OBD_FREE_PTR(info); } static void tgt_key_exit(const struct lu_context *ctx, @@ -687,25 +377,8 @@ struct lu_context_key tgt_thread_key = { LU_KEY_INIT_GENERIC(tgt); -static void *tgt_ses_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct tgt_session_info *session; - - OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS); - if (session == NULL) - return ERR_PTR(-ENOMEM); - - return session; -} - -static void tgt_ses_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct tgt_session_info *session = data; - - OBD_SLAB_FREE_PTR(session, tgt_session_kmem); -} +/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */ +LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info); /* context key: tgt_session_key */ struct lu_context_key tgt_session_key = { @@ -728,13 +401,8 @@ struct page *tgt_page_to_corrupt; int tgt_mod_init(void) { - int result; ENTRY; - result = lu_kmem_init(tgt_caches); - if (result != 0) - RETURN(result); - tgt_page_to_corrupt = alloc_page(GFP_KERNEL); tgt_key_init_generic(&tgt_thread_key, NULL); @@ -758,7 +426,5 @@ void tgt_mod_exit(void) lu_context_key_degister(&tgt_thread_key); lu_context_key_degister(&tgt_session_key); update_info_fini(); - - lu_kmem_fini(tgt_caches); } diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c index 5fb706c5090a5..a36d554525507 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_records.c +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2015, Intel Corporation. */ /* diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c index ac47105a633b9..3769d09d19282 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_recovery.c +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2015, 2016, Intel Corporation. */ /* diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c index b8150fa5c694c..6c3e41438347c 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_trans.c +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2015, 2016, Intel Corporation. 
*/ /* * lustre/target/update_trans.c @@ -82,11 +82,9 @@ static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { struct sub_thandle_cookie *stc; - CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d " - "result %d sub_th %p\n", + CDEBUG(mask, "st %p obd %s committed %d stopped %d sub_th %p\n", st, st->st_dt->dd_lu_dev.ld_obd->obd_name, - st->st_committed, st->st_started, st->st_stopped, - st->st_result, st->st_sub_th); + st->st_committed, st->st_stopped, st->st_sub_th); list_for_each_entry(stc, &st->st_cookie_list, stc_list) { CDEBUG(mask, " cookie "DFID".%u\n", @@ -528,7 +526,6 @@ static void sub_trans_stop_cb(struct lu_env *env, struct top_multiple_thandle *tmt = cb->dcb_data; ENTRY; - spin_lock(&tmt->tmt_sub_lock); list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { if (st->st_stopped) continue; @@ -539,7 +536,6 @@ static void sub_trans_stop_cb(struct lu_env *env, break; } } - spin_unlock(&tmt->tmt_sub_lock); wake_up(&tmt->tmt_stop_waitq); RETURN_EXIT; @@ -1020,8 +1016,6 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, sub_trans_commit_cb_internal(tmt, master_st->st_sub_th, rc); if (rc < 0) { - CERROR("%s: stop trans failed: rc = %d\n", - master_dev->dd_lu_dev.ld_obd->obd_name, rc); th->th_result = rc; GOTO(stop_other_trans, rc); } else if (tur != NULL && tur->tur_update_records != NULL) { @@ -1059,9 +1053,6 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, rc = sub_updates_write(env, lur, st); if (rc < 0) { - CERROR("%s: write updates failed: rc = %d\n", - st->st_dt->dd_lu_dev.ld_obd->obd_name, - rc); th->th_result = rc; break; } @@ -1081,12 +1072,8 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, st->st_sub_th->th_result = th->th_result; rc = dt_trans_stop(env, st->st_sub_th->th_dev, st->st_sub_th); - if (rc < 0) { - CERROR("%s: stop trans failed: rc = %d\n", - st->st_dt->dd_lu_dev.ld_obd->obd_name, rc); - if (th->th_result == 0) - th->th_result = rc; - } + if (unlikely(rc < 0 && th->th_result == 0)) + th->th_result = rc; } rc = top_trans_wait_result(top_th); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index bc1b3326d7a40..aa1343bf5a36d 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -23,9 +23,15 @@ /* extened attributes for ldiskfs */ #undef CONFIG_LDISKFS_FS_XATTR +/* Max LNET payload */ +#undef CONFIG_LNET_MAX_PAYLOAD + /* enable invariant checking */ #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK +/* IOCTL Buffer Size */ +#undef CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + /* kernel has cpu affinity support */ #undef CPU_AFFINITY @@ -50,15 +56,9 @@ /* do data checksums */ #undef ENABLE_CHECKSUM -/* enable flock by default */ -#undef ENABLE_FLOCK - /* Use the Pinger */ #undef ENABLE_PINGER -/* aes-sha2 is supported by krb5 */ -#undef HAVE_AES_SHA2_SUPPORT - /* Define to 1 if you have the header file. 
*/ #undef HAVE_ASM_TYPES_H @@ -77,12 +77,6 @@ /* 'bio_integrity_enabled' is available */ #undef HAVE_BIO_INTEGRITY_ENABLED -/* kernel has bio_integrity_prep_fn */ -#undef HAVE_BIO_INTEGRITY_PREP_FN - -/* bio_integrity_payload.bip_iter exist */ -#undef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD - /* 'bi_bdev' is available */ #undef HAVE_BI_BDEV @@ -107,18 +101,9 @@ /* blk_queue_max_segments is defined */ #undef HAVE_BLK_QUEUE_MAX_SEGMENTS -/* kernel hash_64() is broken */ -#undef HAVE_BROKEN_HASH_64 - /* kernel has struct bvec_iter */ #undef HAVE_BVEC_ITER -/* struct cache_detail has writers */ -#undef HAVE_CACHE_DETAIL_WRITERS - -/* if cache_detail->hash_lock is a spinlock */ -#undef HAVE_CACHE_HASH_SPINLOCK - /* cache_head has hlist cache_list */ #undef HAVE_CACHE_HEAD_HLIST @@ -131,24 +116,24 @@ /* kernel has clean_bdev_aliases */ #undef HAVE_CLEAN_BDEV_ALIASES -/* 'clear_and_wake_up_bit' is available */ -#undef HAVE_CLEAR_AND_WAKE_UP_BIT - /* have clear_inode */ #undef HAVE_CLEAR_INODE /* compat rdma found */ #undef HAVE_COMPAT_RDMA +/* cpumap_print_to_pagebuf is available */ +#undef HAVE_CPUMASK_PRINT_TO_PAGEBUF + /* kernel compiled with CRC32 functions */ #undef HAVE_CRC32 +/* struct cred has member tgcred */ +#undef HAVE_CRED_TGCRED + /* crypto hash helper functions are available */ #undef HAVE_CRYPTO_HASH_HELPERS -/* 'CRYPTO_MAX_ALG_NAME' is 128 */ -#undef HAVE_CRYPTO_MAX_ALG_NAME_128 - /* current_time() has replaced CURRENT_TIME */ #undef HAVE_CURRENT_TIME @@ -167,9 +152,6 @@ /* dentry_open uses struct path as first argument */ #undef HAVE_DENTRY_OPEN_USE_PATH -/* DES3 enctype is supported by krb5 */ -#undef HAVE_DES3_SUPPORT - /* direct_IO need 2 arguments */ #undef HAVE_DIRECTIO_2ARGS @@ -251,9 +233,6 @@ /* d_delete first parameter declared is not const */ #undef HAVE_D_DELETE_CONST -/* d_hash_and_lookup is exported by the kernel */ -#undef HAVE_D_HASH_AND_LOOKUP - /* have d_make_root */ #undef HAVE_D_MAKE_ROOT @@ -341,15 +320,15 @@ /* Define to 1 if you have the `gethostbyname' function. */ #undef HAVE_GETHOSTBYNAME -/* get_request_key_auth() is available */ -#undef HAVE_GET_REQUEST_KEY_AUTH - /* get_user_pages takes 6 arguments */ #undef HAVE_GET_USER_PAGES_6ARG /* get_user_pages takes gup_flags in arguments */ #undef HAVE_GET_USER_PAGES_GUP_FLAGS +/* get_user_pages takes gup_flags in arguments with 7 args */ +#undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS + /* struct group_info has member gid */ #undef HAVE_GROUP_INFO_GID @@ -362,9 +341,6 @@ /* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ #undef HAVE_GSS_KRB5_CCACHE_NAME -/* '__rhashtable_insert_fast()' returns int */ -#undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT - /* Define this if you have Heimdal Kerberos libraries */ #undef HAVE_HEIMDAL @@ -413,9 +389,6 @@ /* if ib_sg_dma_address wrapper exists */ #undef HAVE_IB_SG_DMA_ADDRESS -/* INIT_LIST_HEAD_RCU exists */ -#undef HAVE_INIT_LIST_HEAD_RCU - /* inode_operations .getattr member function can gather advance stats */ #undef HAVE_INODEOPS_ENHANCED_GETATTR @@ -440,15 +413,6 @@ /* inode_operations->permission has two args */ #undef HAVE_INODE_PERMISION_2ARGS -/* inode times are using timespec64 */ -#undef HAVE_INODE_TIMESPEC64 - -/* blk_integrity.interval exist */ -#undef HAVE_INTERVAL_BLK_INTEGRITY - -/* blk_integrity.interval_exp exist */ -#undef HAVE_INTERVAL_EXP_BLK_INTEGRITY - /* Define to 1 if you have the header file. 
*/ #undef HAVE_INTTYPES_H @@ -458,9 +422,6 @@ /* have in_compat_syscall */ #undef HAVE_IN_COMPAT_SYSCALL -/* 'in_dev_for_each_ifa_rtnl' is defined */ -#undef HAVE_IN_DEV_FOR_EACH_IFA_RTNL - /* inode_operations->rename need flags as argument */ #undef HAVE_IOPS_RENAME_WITH_FLAGS @@ -500,27 +461,18 @@ /* is_sxid is defined */ #undef HAVE_IS_SXID -/* 'iterate_shared' is available */ -#undef HAVE_ITERATE_SHARED - /* struct address_space has i_pages */ #undef HAVE_I_PAGES /* i_uid_read is present */ #undef HAVE_I_UID_READ -/* kallsyms_lookup_name is exported by kernel */ -#undef HAVE_KALLSYMS_LOOKUP_NAME +/* jiffies_to_timespec64() is available */ +#undef HAVE_JIFFIES_TO_TIMESPEC64 /* kernel_locked is defined */ #undef HAVE_KERNEL_LOCKED -/* 'kernel_param_[un]lock' is available */ -#undef HAVE_KERNEL_PARAM_LOCK - -/* 'struct kernel_param_ops' is available */ -#undef HAVE_KERNEL_PARAM_OPS - /* kernel_setsockopt still in use */ #undef HAVE_KERNEL_SETSOCKOPT @@ -539,9 +491,6 @@ /* key_type->instantiate has two args */ #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS -/* key.usage is of type refcount_t */ -#undef HAVE_KEY_USAGE_REFCOUNT - /* ki_left exist */ #undef HAVE_KIOCB_KI_LEFT @@ -570,15 +519,12 @@ available */ #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS -/* kset_find_obj is exported by the kernel */ -#undef HAVE_KSET_FIND_OBJ - -/* kernel has kstrtobool_from_user */ -#undef HAVE_KSTRTOBOOL_FROM_USER - /* kernel has kstrtoul */ #undef HAVE_KSTRTOUL +/* kernel has ksys_close */ +#undef HAVE_KSYS_CLOSE + /* kthread_worker found */ #undef HAVE_KTHREAD_WORK @@ -606,9 +552,6 @@ /* 'ktime_get_ts64' is available */ #undef HAVE_KTIME_GET_TS64 -/* 'ktime_ms_delta' is available */ -#undef HAVE_KTIME_MS_DELTA - /* 'ktime_to_timespec64' is available */ #undef HAVE_KTIME_TO_TIMESPEC64 @@ -636,12 +579,21 @@ /* readline library is available */ #undef HAVE_LIBREADLINE -/* linux/rhashtable.h is present */ -#undef HAVE_LINUX_RHASHTABLE_H +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_RANDOM_H /* if linux/selinux.h exists */ #undef HAVE_LINUX_SELINUX_IS_ENABLED +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_TYPES_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_UNISTD_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_LINUX_VERSION_H + /* lock_manager_operations has lm_compare_owner */ #undef HAVE_LM_COMPARE_OWNER @@ -651,9 +603,6 @@ /* kernel has locks_lock_file_wait */ #undef HAVE_LOCKS_LOCK_FILE_WAIT -/* lookup_user_key() is available */ -#undef HAVE_LOOKUP_USER_KEY - /* kernel has LOOP_CTL_GET_FREE */ #undef HAVE_LOOP_CTL_GET_FREE @@ -682,9 +631,6 @@ /* kernel module loading is possible */ #undef HAVE_MODULE_LOADING_SUPPORT -/* locking module param is supported */ -#undef HAVE_MODULE_PARAM_LOCKING - /* Define to 1 if you have the `name_to_handle_at' function. */ #undef HAVE_NAME_TO_HANDLE_AT @@ -694,24 +640,15 @@ /* cancel_dirty_page with one arguement is available */ #undef HAVE_NEW_CANCEL_DIRTY_PAGE -/* DEFINE_TIMER uses only 2 arguements */ -#undef HAVE_NEW_DEFINE_TIMER - /* 'kernel_write' aligns with read/write helpers */ #undef HAVE_NEW_KERNEL_WRITE /* NR_UNSTABLE_NFS is still in use. 
*/ #undef HAVE_NR_UNSTABLE_NFS -/* ns_to_timespec64() is available */ -#undef HAVE_NS_TO_TIMESPEC64 - /* with oldsize */ #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE -/* openssl-devel is present */ -#undef HAVE_OPENSSL_GETSEPOL - /* OpenSSL HMAC functions needed for SSK */ #undef HAVE_OPENSSL_SSK @@ -736,9 +673,6 @@ /* posix_acl_valid takes struct user_namespace */ #undef HAVE_POSIX_ACL_VALID_USER_NS -/* 'prepare_to_wait_event' is available */ -#undef HAVE_PREPARE_TO_WAIT_EVENT - /* struct proc_ops exists */ #undef HAVE_PROC_OPS @@ -751,18 +685,12 @@ /* inode->i_nlink is protected from direct modification */ #undef HAVE_PROTECT_I_NLINK -/* 'PTR_ERR_OR_ZERO' exist */ -#undef HAVE_PTR_ERR_OR_ZERO - /* have quota64 */ #undef HAVE_QUOTA64 /* radix_tree_exceptional_entry exist */ #undef HAVE_RADIX_EXCEPTION_ENTRY -/* rdma_connect_locked is defined */ -#undef HAVE_RDMA_CONNECT_LOCKED - /* rdma_create_id wants 4 args */ #undef HAVE_RDMA_CREATE_ID_4ARG @@ -772,24 +700,15 @@ /* rdma_reject has 4 arguments */ #undef HAVE_RDMA_REJECT_4ARGS +/* reinit_completion is exist */ +#undef HAVE_REINIT_COMPLETION + /* kernel export remove_from_page_cache */ #undef HAVE_REMOVE_FROM_PAGE_CACHE /* remove_proc_subtree is defined */ #undef HAVE_REMOVE_PROC_SUBTREE -/* rhashtable_lookup() is available */ -#undef HAVE_RHASHTABLE_LOOKUP - -/* rhashtable_lookup_get_insert_fast() is available */ -#undef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST - -/* struct rhltable exist */ -#undef HAVE_RHLTABLE - -/* save_stack_trace_tsk is exported */ -#undef HAVE_SAVE_STACK_TRACE_TSK - /* Have sa_spill_alloc in ZFS */ #undef HAVE_SA_SPILL_ALLOC @@ -814,9 +733,6 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ #undef HAVE_SECURITY_IINITSEC_QSTR -/* security_inode_listsecurity() is available/exported */ -#undef HAVE_SECURITY_INODE_LISTSECURITY - /* security_release_secctx has 1 arg. */ #undef HAVE_SEC_RELEASE_SECCTX_1ARG @@ -860,27 +776,36 @@ /* Have spa_maxblocksize in ZFS */ #undef HAVE_SPA_MAXBLOCKSIZE +/* spinlock_t is defined */ +#undef HAVE_SPINLOCK_T + /* struct stacktrace_ops exists */ #undef HAVE_STACKTRACE_OPS /* stacktrace_ops.warning is exist */ #undef HAVE_STACKTRACE_WARNING +/* stack_trace_print() exists */ +#undef HAVE_STACK_TRACE_PRINT + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the header file. */ #undef HAVE_STDLIB_H -/* stringhash.h is present */ -#undef HAVE_STRINGHASH - /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H +/* Define to 1 if you have the `strlcat' function. */ +#undef HAVE_STRLCAT + +/* Define to 1 if you have the `strlcpy' function. */ +#undef HAVE_STRLCPY + /* Define to 1 if you have the `strnlen' function. */ #undef HAVE_STRNLEN @@ -908,6 +833,9 @@ /* ctl_table has ctl_name field */ #undef HAVE_SYSCTL_CTLNAME +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_IOCTL_H + /* Define to 1 if you have . 
*/ #undef HAVE_SYS_QUOTA_H @@ -938,6 +866,9 @@ /* 'timespec64_to_ktime' is available */ #undef HAVE_TIMESPEC64_TO_KTIME +/* have_time_t */ +#undef HAVE_TIME_T + /* topology_sibling_cpumask is available */ #undef HAVE_TOPOLOGY_SIBLING_CPUMASK @@ -986,18 +917,9 @@ /* 'struct vm_operations' remove struct vm_area_struct argument */ #undef HAVE_VM_OPS_USE_VM_FAULT_ONLY -/* wait_bit.h is present */ -#undef HAVE_WAIT_BIT_HEADER_H - /* 'wait_queue_entry_t' is available */ #undef HAVE_WAIT_QUEUE_ENTRY -/* linux wait_queue_head_t list_head is name head */ -#undef HAVE_WAIT_QUEUE_ENTRY_LIST - -/* 'wait_var_event' is available */ -#undef HAVE_WAIT_VAR_EVENT - /* flags field exist */ #undef HAVE_XATTR_HANDLER_FLAGS @@ -1022,18 +944,9 @@ /* Have zap_remove_by_dnode() in ZFS */ #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE -/* Have inode_timespec_t */ -#undef HAVE_ZFS_INODE_TIMESPEC - -/* Have multihost protection in ZFS */ -#undef HAVE_ZFS_MULTIHOST - /* Enable zfs osd */ #undef HAVE_ZFS_OSD -/* Have zfs_refcount_add */ -#undef HAVE_ZFS_REFCOUNT_ADD - /* __add_wait_queue_exclusive exists */ #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE @@ -1083,9 +996,6 @@ /* need pclmulqdq based crc32 */ #undef NEED_CRC32_ACCEL -/* 'ktime_get_ns' is not available */ -#undef NEED_KTIME_GET_NS - /* 'ktime_get_real_ns' is not available */ #undef NEED_KTIME_GET_REAL_NS @@ -1116,6 +1026,9 @@ /* name of parallel fsck program */ #undef PFSCK +/* proc handler methods use __user */ +#undef PROC_HANDLER_USE_USER_ATTR + /* enable randomly alloc failure */ #undef RANDOM_FAIL_ALLOC From 74df982204aa97f7776471b8455581800380dd6f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 18 Apr 2022 16:49:13 +0000 Subject: [PATCH 380/737] svm: fix backport of "KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it" The backport of edcfe5405811 ("KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it") had a problem that caused crashes for AMD/SVM, because of an unconditional reference to the nested part of the svm structure. Fix this by making it conditional on is_guest_mode Signed-off-by: Frank van der Linden --- arch/x86/kvm/svm/svm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0c1e265409eba..352d8300263c9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1077,7 +1077,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); - svm->nested.hsave->control.tsc_offset = vcpu->arch.l1_tsc_offset; + if (is_guest_mode(&svm->vcpu)) { + svm->nested.hsave->control.tsc_offset = + vcpu->arch.l1_tsc_offset; + } svm->vmcb->control.tsc_offset = offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } From e5891082d5e80ce94a504f53e71f9c26cc27eb53 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Dec 2020 19:04:52 -0800 Subject: [PATCH 381/737] mm/filemap/c: break generic_file_buffered_read up into multiple functions Patch series "generic_file_buffered_read() improvements", v2. generic_file_buffered_read() has turned into a real monstrosity to work with. And it's a major performance improvement, for both small random and large sequential reads. On my test box, 4k buffered random reads go from ~150k to ~250k iops, and the improvements to big sequential reads are even bigger. This incorporates the fix for IOCB_WAITQ handling that Jens just posted as well, also factors out lock_page_for_iocb() to improve handling of the various iocb flags. 
This patch (of 2): This is prep work for changing generic_file_buffered_read() to use find_get_pages_contig() to batch up all the pagecache lookups. This patch should be functionally identical to the existing code and changes as little as of the flow control as possible. More refactoring could be done, this patch is intended to be relatively minimal. Link: https://lkml.kernel.org/r/20201025212949.602194-1-kent.overstreet@gmail.com Link: https://lkml.kernel.org/r/20201025212949.602194-2-kent.overstreet@gmail.com Signed-off-by: Kent Overstreet Cc: Matthew Wilcox (Oracle) Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 723ef24b9b379e59facc65de8c065c8b89d479cd) --- mm/filemap.c | 483 ++++++++++++++++++++++++++++----------------------- 1 file changed, 261 insertions(+), 222 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3a983bc1a71c9..c585586c4b5d8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2170,6 +2170,234 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } +static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) +{ + if (iocb->ki_flags & IOCB_WAITQ) + return lock_page_async(page, iocb->ki_waitq); + else if (iocb->ki_flags & IOCB_NOWAIT) + return trylock_page(page) ? 0 : -EAGAIN; + else + return lock_page_killable(page); +} + +static int generic_file_buffered_read_page_ok(struct kiocb *iocb, + struct iov_iter *iter, + struct page *page) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct file_ra_state *ra = &iocb->ki_filp->f_ra; + unsigned int offset = iocb->ki_pos & ~PAGE_MASK; + unsigned int bytes, copied; + loff_t isize, end_offset; + + BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index); + + /* + * i_size must be checked after we know the page is Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "bytes", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + + isize = i_size_read(inode); + if (unlikely(iocb->ki_pos >= isize)) + return 1; + + end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + + bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset); + + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + + copied = copy_page_to_iter(page, offset, bytes, iter); + + iocb->ki_pos += copied; + + /* + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. + */ + if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) + mark_page_accessed(page); + + ra->prev_pos = iocb->ki_pos; + + if (copied < bytes) + return -EFAULT; + + return !iov_iter_count(iter) || iocb->ki_pos == isize; +} + +static struct page * +generic_file_buffered_read_readpage(struct kiocb *iocb, + struct file *filp, + struct address_space *mapping, + struct page *page) +{ + struct file_ra_state *ra = &filp->f_ra; + int error; + + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EAGAIN); + } + + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. 
+ * PG_error will be set again if readpage fails. + */ + ClearPageError(page); + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) { + put_page(page); + return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; + } + + if (!PageUptodate(page)) { + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_mapping_pages got it + */ + unlock_page(page); + put_page(page); + return NULL; + } + unlock_page(page); + shrink_readahead_size_eio(ra); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + } + + return page; +} + +static struct page * +generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, + struct file *filp, + struct iov_iter *iter, + struct page *page, + loff_t pos, loff_t count) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + int error; + + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + if (iocb->ki_flags & IOCB_WAITQ) { + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + error = wait_on_page_locked_killable(page); + } + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (PageUptodate(page)) + return page; + + if (inode->i_blkbits == PAGE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iov_iter_is_pipe(iter))) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, + pos & ~PAGE_MASK, count)) + goto page_not_up_to_date_locked; + unlock_page(page); + return page; + +page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + +page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + put_page(page); + return NULL; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static struct page * +generic_file_buffered_read_no_cached_page(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + struct page *page; + int error; + + if (iocb->ki_flags & IOCB_NOIO) + return ERR_PTR(-EAGAIN); + + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + page = page_cache_alloc(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error) { + put_page(page); + return error != -EEXIST ? 
ERR_PTR(error) : NULL; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2193,23 +2421,15 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct file_ra_state *ra = &filp->f_ra; - loff_t *ppos = &iocb->ki_pos; - pgoff_t index; + size_t orig_count = iov_iter_count(iter); pgoff_t last_index; - pgoff_t prev_index; - unsigned long offset; /* offset into pagecache page */ - unsigned int prev_offset; int error = 0; - if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - index = *ppos >> PAGE_SHIFT; - prev_index = ra->prev_pos >> PAGE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; + last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; /* * If we've already successfully copied some data, then we @@ -2220,10 +2440,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, iocb->ki_flags |= IOCB_NOWAIT; for (;;) { + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; struct page *page; - pgoff_t end_index; - loff_t isize; - unsigned long nr, ret; cond_resched(); find_page: @@ -2232,6 +2450,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, goto out; } + /* + * We can't return -EIOCBQUEUED once we've done some work, so + * ensure we don't block: + */ + if ((iocb->ki_flags & IOCB_WAITQ) && + (written + orig_count - iov_iter_count(iter))) + iocb->ki_flags |= IOCB_NOWAIT; + page = find_get_page(mapping, index); if (!page) { if (iocb->ki_flags & IOCB_NOIO) @@ -2240,8 +2466,15 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, ra, filp, index, last_index - index); page = find_get_page(mapping, index); - if (unlikely(page == NULL)) - goto no_cached_page; + if (unlikely(page == NULL)) { + page = generic_file_buffered_read_no_cached_page(iocb, iter); + if (!page) + goto find_page; + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto out; + } + } } if (PageReadahead(page)) { if (iocb->ki_flags & IOCB_NOIO) { @@ -2253,231 +2486,37 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, index, last_index - index); } if (!PageUptodate(page)) { - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. - */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = wait_on_page_locked_async(page, - iocb->ki_waitq); - } else { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - error = wait_on_page_locked_killable(page); - } - if (unlikely(error)) - goto readpage_error; - if (PageUptodate(page)) - goto page_ok; - - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? 
*/ - if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - offset, iter->count)) - goto page_not_up_to_date_locked; - unlock_page(page); - } -page_ok: - /* - * i_size must be checked after we know the page is Uptodate. - * - * Checking i_size after the check allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ - - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) { - put_page(page); - goto out; - } - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_MASK) + 1; - if (nr <= offset) { + if (iocb->ki_flags & IOCB_NOWAIT) { put_page(page); + error = -EAGAIN; goto out; } - } - nr = nr - offset; - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); - - /* - * When a sequential read accesses a page several times, - * only mark it as accessed the first time. - */ - if (prev_index != index || offset != prev_offset) - mark_page_accessed(page); - prev_index = index; - - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... - */ - - ret = copy_page_to_iter(page, offset, nr, iter); - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - prev_offset = offset; - - put_page(page); - written += ret; - if (!iov_iter_count(iter)) - goto out; - if (ret < nr) { - error = -EFAULT; - goto out; - } - continue; - -page_not_up_to_date: - /* Get exclusive access to the page ... */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } - if (unlikely(error)) - goto readpage_error; - -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - continue; - } - - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - goto page_ok; - } - -readpage: - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { - unlock_page(page); - put_page(page); - goto would_block; - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. - */ - ClearPageError(page); - /* Start the actual read. The read will unlock the page. 
*/ - error = mapping->a_ops->readpage(filp, page); - - if (unlikely(error)) { - if (error == AOP_TRUNCATED_PAGE) { - put_page(page); - error = 0; + page = generic_file_buffered_read_pagenotuptodate(iocb, + filp, iter, page, iocb->ki_pos, iter->count); + if (!page) goto find_page; + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto out; } - goto readpage_error; - } - - if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } - - if (unlikely(error)) - goto readpage_error; - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - goto find_page; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - error = -EIO; - goto readpage_error; - } - unlock_page(page); } - goto page_ok; - -readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ + error = generic_file_buffered_read_page_ok(iocb, iter, page); put_page(page); - goto out; -no_cached_page: - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ - page = page_cache_alloc(mapping); - if (!page) { - error = -ENOMEM; - goto out; - } - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { - put_page(page); - if (error == -EEXIST) { + if (error > 0) error = 0; - goto find_page; - } goto out; } - goto readpage; } would_block: error = -EAGAIN; out: - ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_SHIFT; - ra->prev_pos |= prev_offset; - - *ppos = ((loff_t)index << PAGE_SHIFT) + offset; file_accessed(filp); + written += orig_count - iov_iter_count(iter); + return written ? written : error; } EXPORT_SYMBOL_GPL(generic_file_buffered_read); From 238ceda1df24002b79f9068ae6ee1316b46d0cd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Dec 2020 19:04:56 -0800 Subject: [PATCH 382/737] mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig Convert generic_file_buffered_read() to get pages to read from in batches, and then copy data to userspace from many pages at once - in particular, we now don't touch any cachelines that might be contended while we're in the loop to copy data to userspace. This is is a performance improvement on workloads that do buffered reads with large blocksizes, and a very large performance improvement if that file is also being accessed concurrently by different threads. On smaller reads (512 bytes), there's a very small performance improvement (1%, within the margin of error). 
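
Roughly, the batched flow boils down to the following simplified sketch (illustrative only: the helper name is invented here, and the real generic_file_buffered_read() additionally handles readahead, pages that are not yet uptodate, the i_size check and the IOCB_* flags):

	#include <linux/kernel.h>	/* min_t() */
	#include <linux/pagemap.h>	/* find_get_pages_contig(), put_page() */
	#include <linux/pagevec.h>	/* PAGEVEC_SIZE */
	#include <linux/uio.h>		/* struct iov_iter, copy_page_to_iter() */

	/* One batched pagecache lookup, then copy out of every page found. */
	static ssize_t copy_batch_to_iter(struct address_space *mapping,
					  loff_t *ppos, struct iov_iter *iter)
	{
		struct page *pages[PAGEVEC_SIZE];
		pgoff_t index = *ppos >> PAGE_SHIFT;
		unsigned int i, nr;
		ssize_t copied = 0;

		nr = find_get_pages_contig(mapping, index, PAGEVEC_SIZE, pages);

		for (i = 0; i < nr && iov_iter_count(iter); i++) {
			unsigned int offset = *ppos & ~PAGE_MASK;
			unsigned int bytes = min_t(size_t, iov_iter_count(iter),
						   PAGE_SIZE - offset);
			size_t n = copy_page_to_iter(pages[i], offset, bytes, iter);

			*ppos += n;
			copied += n;
			if (n < bytes)
				break;	/* faulted while copying to userspace */
		}

		/* find_get_pages_contig() took a reference on each page */
		for (i = 0; i < nr; i++)
			put_page(pages[i]);

		return copied;
	}

Because the lookups are all done up front, the copy loop itself only touches the pages being read, which is what keeps contended mapping cachelines out of the hot path when several threads read the same file.
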
akpm: kernel test robot found a 32% speedup on one test: https://lkml.kernel.org/r/20201030081456.GY31092@shao2-debian Link: https://lkml.kernel.org/r/20201025212949.602194-3-kent.overstreet@gmail.com Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 06c0444290cecf04c89c62e6d448b8461507d247) --- mm/filemap.c | 313 ++++++++++++++++++++++++++++----------------------- 1 file changed, 175 insertions(+), 138 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c585586c4b5d8..684c16303bfe2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2180,67 +2180,6 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) return lock_page_killable(page); } -static int generic_file_buffered_read_page_ok(struct kiocb *iocb, - struct iov_iter *iter, - struct page *page) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct file_ra_state *ra = &iocb->ki_filp->f_ra; - unsigned int offset = iocb->ki_pos & ~PAGE_MASK; - unsigned int bytes, copied; - loff_t isize, end_offset; - - BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index); - - /* - * i_size must be checked after we know the page is Uptodate. - * - * Checking i_size after the check allows us to calculate - * the correct value for "bytes", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ - - isize = i_size_read(inode); - if (unlikely(iocb->ki_pos >= isize)) - return 1; - - end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - - bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset); - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); - - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... - */ - - copied = copy_page_to_iter(page, offset, bytes, iter); - - iocb->ki_pos += copied; - - /* - * When a sequential read accesses a page several times, - * only mark it as accessed the first time. 
- */ - if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(page); - - ra->prev_pos = iocb->ki_pos; - - if (copied < bytes) - return -EFAULT; - - return !iov_iter_count(iter) || iocb->ki_pos == isize; -} - static struct page * generic_file_buffered_read_readpage(struct kiocb *iocb, struct file *filp, @@ -2398,6 +2337,92 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb, return generic_file_buffered_read_readpage(iocb, filp, mapping, page); } +static int generic_file_buffered_read_get_pages(struct kiocb *iocb, + struct iov_iter *iter, + struct page **pages, + unsigned int nr) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + int i, j, nr_got, err = 0; + + nr = min_t(unsigned long, last_index - index, nr); +find_page: + if (fatal_signal_pending(current)) + return -EINTR; + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + + page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); + err = PTR_ERR_OR_ZERO(pages[0]); + if (!IS_ERR_OR_NULL(pages[0])) + nr_got = 1; +got_pages: + for (i = 0; i < nr_got; i++) { + struct page *page = pages[i]; + pgoff_t pg_index = index + i; + loff_t pg_pos = max(iocb->ki_pos, + (loff_t) pg_index << PAGE_SHIFT); + loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; + + if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + page_cache_async_readahead(mapping, ra, filp, page, + pg_index, last_index - pg_index); + } + + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_NOWAIT) || + ((iocb->ki_flags & IOCB_WAITQ) && i)) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + + page = generic_file_buffered_read_pagenotuptodate(iocb, + filp, iter, page, pg_pos, pg_count); + if (IS_ERR_OR_NULL(page)) { + for (j = i + 1; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = PTR_ERR_OR_ZERO(page); + break; + } + } + } + + if (likely(nr_got)) + return nr_got; + if (err) + return err; + /* + * No pages and no error means we raced and should retry: + */ + goto find_page; +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2418,104 +2443,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; + struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct file_ra_state *ra = &filp->f_ra; - size_t orig_count = iov_iter_count(iter); - pgoff_t last_index; - int error = 0; + struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; + unsigned int nr_pages = min_t(unsigned int, 512, + ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (iocb->ki_pos >> PAGE_SHIFT)); + int i, pg_nr, error = 0; + bool writably_mapped; + loff_t isize, end_offset; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - last_index = (iocb->ki_pos 
+ iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + if (nr_pages > ARRAY_SIZE(pages_onstack)) + pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - /* - * If we've already successfully copied some data, then we - * can no longer safely return -EIOCBQUEUED. Hence mark - * an async read NOWAIT at that point. - */ - if (written && (iocb->ki_flags & IOCB_WAITQ)) - iocb->ki_flags |= IOCB_NOWAIT; - - for (;;) { - pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; - struct page *page; + if (!pages) { + pages = pages_onstack; + nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); + } + do { cond_resched(); -find_page: - if (fatal_signal_pending(current)) { - error = -EINTR; - goto out; - } /* - * We can't return -EIOCBQUEUED once we've done some work, so - * ensure we don't block: + * If we've already successfully copied some data, then we + * can no longer safely return -EIOCBQUEUED. Hence mark + * an async read NOWAIT at that point. */ - if ((iocb->ki_flags & IOCB_WAITQ) && - (written + orig_count - iov_iter_count(iter))) + if ((iocb->ki_flags & IOCB_WAITQ) && written) iocb->ki_flags |= IOCB_NOWAIT; - page = find_get_page(mapping, index); - if (!page) { - if (iocb->ki_flags & IOCB_NOIO) - goto would_block; - page_cache_sync_readahead(mapping, - ra, filp, - index, last_index - index); - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) { - page = generic_file_buffered_read_no_cached_page(iocb, iter); - if (!page) - goto find_page; - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto out; - } - } - } - if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { - put_page(page); - goto out; - } - page_cache_async_readahead(mapping, - ra, filp, page, - index, last_index - index); - } - if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - error = -EAGAIN; - goto out; - } - page = generic_file_buffered_read_pagenotuptodate(iocb, - filp, iter, page, iocb->ki_pos, iter->count); - if (!page) - goto find_page; - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto out; - } + i = 0; + pg_nr = generic_file_buffered_read_get_pages(iocb, iter, + pages, nr_pages); + if (pg_nr < 0) { + error = pg_nr; + break; } - error = generic_file_buffered_read_page_ok(iocb, iter, page); - put_page(page); + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + if (unlikely(iocb->ki_pos >= isize)) + goto put_pages; - if (error) { - if (error > 0) - error = 0; - goto out; + end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + + while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > + (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) + put_page(pages[--pg_nr]); + + /* + * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: + */ + writably_mapped = mapping_writably_mapped(mapping); + + /* + * When a sequential read accesses a page several times, only + * mark it as accessed the first time. 
+ */ + if (iocb->ki_pos >> PAGE_SHIFT != + ra->prev_pos >> PAGE_SHIFT) + mark_page_accessed(pages[0]); + for (i = 1; i < pg_nr; i++) + mark_page_accessed(pages[i]); + + for (i = 0; i < pg_nr; i++) { + unsigned int offset = iocb->ki_pos & ~PAGE_MASK; + unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, + PAGE_SIZE - offset); + unsigned int copied; + + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (writably_mapped) + flush_dcache_page(pages[i]); + + copied = copy_page_to_iter(pages[i], offset, bytes, iter); + + written += copied; + iocb->ki_pos += copied; + ra->prev_pos = iocb->ki_pos; + + if (copied < bytes) { + error = -EFAULT; + break; + } } - } +put_pages: + for (i = 0; i < pg_nr; i++) + put_page(pages[i]); + } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); -would_block: - error = -EAGAIN; -out: file_accessed(filp); - written += orig_count - iov_iter_count(iter); + + if (pages != pages_onstack) + kfree(pages); return written ? written : error; } From 78c62970a37e5de1ce2df5ba47ad37ae27e6c3b3 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 28 Apr 2022 16:19:03 -0700 Subject: [PATCH 383/737] ENA: Update to v2.7.1 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.7.1 release notes **Bug Fixes** * Fix NUMA node update rate ## r2.7.0 release notes **New Features** * Add AF XDP with zero-copy support * Add devlink tool support * Add Dynamic RX Buffers (DRB) feature **Bug Fixes** * Fix Toepltiz initial value change after changing RSS key * Fix compilation errors on RHEL 8 and on some old kernel version * Fix several bugs in XDP infrastructure **Minor Changes** * Cosmetic code changes * Add support for (upcoming) kernel 5.17 * Removing some dead code and redundant checks Signed-off-by: Suraj Jitindar Singh --- drivers/amazon/net/ena/Makefile | 4 +- drivers/amazon/net/ena/ena_admin_defs.h | 6 +- drivers/amazon/net/ena/ena_com.c | 32 +- drivers/amazon/net/ena/ena_com.h | 1 - drivers/amazon/net/ena/ena_devlink.c | 304 +++++++ drivers/amazon/net/ena/ena_devlink.h | 45 + drivers/amazon/net/ena/ena_ethtool.c | 41 +- drivers/amazon/net/ena/ena_lpc.c | 10 +- drivers/amazon/net/ena/ena_netdev.c | 1044 ++++++----------------- drivers/amazon/net/ena/ena_netdev.h | 181 ++-- drivers/amazon/net/ena/ena_xdp.c | 978 +++++++++++++++++++++ drivers/amazon/net/ena/ena_xdp.h | 221 +++++ drivers/amazon/net/ena/kcompat.h | 67 +- 13 files changed, 2048 insertions(+), 886 deletions(-) create mode 100644 drivers/amazon/net/ena/ena_devlink.c create mode 100644 drivers/amazon/net/ena/ena_devlink.h create mode 100644 drivers/amazon/net/ena/ena_xdp.c create mode 100644 drivers/amazon/net/ena/ena_xdp.h diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 2595641267d20..aa212758c796c 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o -ena-y := ena_netdev.o ena_com.o ena_eth_com.o ena_ethtool.o net_dim.o \ - dim.o ena_lpc.o +ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_xdp.o dim.o ena_devlink.o \ + net_dim.o ena_com.o ena_eth_com.o ena-$(CONFIG_SYSFS) += ena_sysfs.o diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index be5ca30976279..7f2595f2545cb 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -883,7 +883,9 @@ struct 
ena_admin_host_info { * 2 : interrupt_moderation * 3 : rx_buf_mirroring * 4 : rss_configurable_function_key - * 31:5 : reserved + * 5 : reserved + * 6 : rx_page_reuse + * 31:7 : reserved */ u32 driver_supported_features; }; @@ -1208,6 +1210,8 @@ struct ena_admin_ena_mmio_req_read_less_resp { #define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3) #define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4 #define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6 +#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK BIT(6) /* feature_rss_ind_table */ #define ENA_ADMIN_FEATURE_RSS_IND_TABLE_ONE_ENTRY_UPDATE_MASK BIT(0) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 2a250dce55e2c..f9dbcf24a753b 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -1283,9 +1283,6 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev, (uintptr_t)cmd_completion.sq_doorbell_offset); if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - io_sq->header_addr = (u8 __iomem *)((uintptr_t)ena_dev->mem_bar - + cmd_completion.llq_headers_offset); - io_sq->desc_addr.pbuf_dev_addr = (u8 __iomem *)((uintptr_t)ena_dev->mem_bar + cmd_completion.llq_descriptors_offset); @@ -2411,29 +2408,18 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, return -EOPNOTSUPP; } - switch (func) { - case ENA_ADMIN_TOEPLITZ: - if (key) { - if (key_len != sizeof(hash_key->key)) { - netdev_err(ena_dev->net_device, - "key len (%u) doesn't equal the supported size (%zu)\n", - key_len, sizeof(hash_key->key)); - return -EINVAL; - } - memcpy(hash_key->key, key, key_len); - rss->hash_init_val = init_val; - hash_key->key_parts = key_len / sizeof(hash_key->key[0]); + if ((func == ENA_ADMIN_TOEPLITZ) && key) { + if (key_len != sizeof(hash_key->key)) { + netdev_err(ena_dev->net_device, + "key len (%u) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; } - break; - case ENA_ADMIN_CRC32: - rss->hash_init_val = init_val; - break; - default: - netdev_err(ena_dev->net_device, "Invalid hash function (%d)\n", - func); - return -EINVAL; + memcpy(hash_key->key, key, key_len); + hash_key->key_parts = key_len / sizeof(hash_key->key[0]); } + rss->hash_init_val = init_val; old_func = rss->hash_func; rss->hash_func = func; rc = ena_com_set_hash_function(ena_dev); diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 795bd714778d9..6b085c54685f6 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -160,7 +160,6 @@ struct ena_com_io_sq { void *bus; u32 __iomem *db_addr; - u8 __iomem *header_addr; enum queue_direction direction; enum ena_admin_placement_policy_type mem_queue_type; diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c new file mode 100644 index 0000000000000..68b02270786c7 --- /dev/null +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include "ena_devlink.h" +#ifdef ENA_DEVLINK_SUPPORT + +static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); + +enum ena_devlink_param_id { + ENA_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, +}; + +static const struct devlink_param ena_devlink_params[] = { + DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + "large_llq_header", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ena_devlink_llq_header_validate), +}; + +static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + bool value = val.vbool; + + if (!value) + return 0; + + if (adapter->ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { + NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support LLQ"); + return -EOPNOTSUPP; + } + + if (!adapter->large_llq_header_supported) { + NL_SET_ERR_MSG_MOD(extack, "Instance doesn't support large LLQ"); + return -EOPNOTSUPP; + } + + return 0; +} + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER +/* Determines if ena_devlink_register has been called. + * Prefer to check if the driver enabled reloading capabilities, but fallback + * to check if driver configured 'dev' devlink attribute for older kernels. + */ +bool ena_is_devlink_params_registered(struct devlink *devlink) +{ +#if defined(ENA_DEVLINK_RELOAD_ENABLING_REQUIRED) + return devlink->reload_enabled; +#elif !defined(ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC) + return devlink->dev; +#endif +} + +#endif +void ena_devlink_params_get(struct devlink *devlink) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + union devlink_param_value val; + int err; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; +#endif + err = devlink_param_driverinit_value_get(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + &val); + if (err) { + netdev_err(adapter->netdev, "Failed to query LLQ header size param\n"); + return; + } + + adapter->large_llq_header_enabled = val.vbool; +} + +void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) +{ + union devlink_param_value value; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; + +#endif + value.vbool = false; + devlink_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); +} + +static int ena_devlink_reload_down(struct devlink *devlink, +#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT + bool netns_change, +#endif +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + enum devlink_reload_action action, + enum devlink_reload_limit limit, +#endif + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + +#ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT + if (netns_change) { + NL_SET_ERR_MSG_MOD(extack, "Namespace change is not supported"); + return -EOPNOTSUPP; + } + +#endif +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { + NL_SET_ERR_MSG_MOD(extack, "Action is not supported"); + return -EOPNOTSUPP; + } + + if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) { + NL_SET_ERR_MSG_MOD(extack, "Driver reload doesn't support 
limitations"); + return -EOPNOTSUPP; + } + +#endif + rtnl_lock(); + ena_destroy_device(adapter, false); + rtnl_unlock(); + + return 0; +} + +static int ena_devlink_reload_up(struct devlink *devlink, +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, +#endif + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + int err = 0; + +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + if (action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { + NL_SET_ERR_MSG_MOD(extack, "Action is not supported"); + return -EOPNOTSUPP; + } + + if (limit != DEVLINK_RELOAD_LIMIT_UNSPEC) { + NL_SET_ERR_MSG_MOD(extack, "Driver reload doesn't support limitations"); + return -EOPNOTSUPP; + } + +#endif + rtnl_lock(); + /* Check that no other routine initialized the device (e.g. + * ena_fw_reset_device()). Also we're under devlink_mutex here, + * so devink (and ena_adapter with it) isn't freed under our + * feet. + */ + if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) + err = ena_restore_device(adapter); + rtnl_unlock(); + +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + if (!err) + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); + +#endif + return err; +} +#ifndef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED + +static int ena_devlink_reload(struct devlink *devlink, struct netlink_ext_ack *extack) +{ + /* This function always succeeds when called from this function */ + ena_devlink_reload_down(devlink, extack); + + return ena_devlink_reload_up(devlink, extack); +} + +#endif + +static const struct devlink_ops ena_devlink_ops = { +#ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), +#endif +#ifdef ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED + .reload_down = ena_devlink_reload_down, + .reload_up = ena_devlink_reload_up, +#else + .reload = ena_devlink_reload, +#endif +}; + +static int ena_devlink_configure_params(struct devlink *devlink) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + union devlink_param_value value; + int rc; + + rc = devlink_params_register(devlink, ena_devlink_params, + ARRAY_SIZE(ena_devlink_params)); + if (rc) { + netdev_err(adapter->netdev, "Failed to register devlink params\n"); + return rc; + } + + value.vbool = adapter->large_llq_header_enabled; + devlink_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); + +#ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED + devlink_set_features(devlink, DEVLINK_F_RELOAD); + +#endif +#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED + devlink_reload_enable(devlink); + +#endif + return 0; +} + +struct devlink *ena_devlink_alloc(struct ena_adapter *adapter) +{ +#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC + struct device *dev = &adapter->pdev->dev; +#endif + struct devlink *devlink; + +#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC + devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *), dev); +#else + devlink = devlink_alloc(&ena_devlink_ops, sizeof(struct ena_adapter *)); +#endif + if (!devlink) { + netdev_err(adapter->netdev, "Failed to allocate devlink struct\n"); + return NULL; + } + + ENA_DEVLINK_PRIV(devlink) = adapter; + adapter->devlink = devlink; + +#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + if (ena_devlink_configure_params(devlink)) + goto free_devlink; + + return devlink; +free_devlink: + devlink_free(devlink); + + return NULL; +#else + return devlink; +#endif +} + +static void 
ena_devlink_configure_params_clean(struct devlink *devlink) +{ +#ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED + devlink_reload_disable(devlink); + +#endif + devlink_params_unregister(devlink, ena_devlink_params, + ARRAY_SIZE(ena_devlink_params)); +} + +void ena_devlink_free(struct devlink *devlink) +{ +#ifndef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + ena_devlink_configure_params_clean(devlink); + +#endif + devlink_free(devlink); +} + +void ena_devlink_register(struct devlink *devlink, struct device *dev) +{ +#ifdef ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC + devlink_register(devlink); +#else + devlink_register(devlink, dev); +#endif +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + ena_devlink_configure_params(devlink); +#endif +} + +void ena_devlink_unregister(struct devlink *devlink) +{ +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + ena_devlink_configure_params_clean(devlink); +#endif + devlink_unregister(devlink); +} +#endif /* ENA_DEVLINK_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_devlink.h b/drivers/amazon/net/ena/ena_devlink.h new file mode 100644 index 0000000000000..8a047654b2f52 --- /dev/null +++ b/drivers/amazon/net/ena/ena_devlink.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef DEVLINK_H +#define DEVLINK_H + +#include "ena_netdev.h" +#ifndef ENA_NO_DEVLINK_HEADERS +#include +#endif + +#ifdef ENA_DEVLINK_SUPPORT + +#define ENA_DEVLINK_PRIV(devlink) \ + (*(struct ena_adapter **) devlink_priv(devlink)) + +struct devlink *ena_devlink_alloc(struct ena_adapter *adapter); +void ena_devlink_free(struct devlink *devlink); +void ena_devlink_register(struct devlink *devlink, struct device *dev); +void ena_devlink_unregister(struct devlink *devlink); +void ena_devlink_params_get(struct devlink *devlink); +void ena_devlink_disable_large_llq_header_param(struct devlink *devlink); + +#else /* ENA_DEVLINK_SUPPORT */ + +#ifdef ENA_NO_DEVLINK_HEADERS +struct devlink {}; +#endif + +/* Return a value of 1 so the caller wouldn't think the function failed (returned NULL) */ +static inline struct devlink *ena_devlink_alloc(struct ena_adapter *adapter) +{ + return (struct devlink *)1; +} +static inline void ena_devlink_free(struct devlink *devlink) { } +static inline void ena_devlink_register(struct devlink *devlink, struct device *dev) { }; +static inline void ena_devlink_unregister(struct devlink *devlink) { } +static inline void ena_devlink_params_get(struct devlink *devlink) { } +static inline void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) { } + +#endif /* ENA_DEVLINK_SUPPORT */ + +#endif /* DEVLINK_H */ diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index a3ff6fca628ec..4c387d0b6be35 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -7,6 +7,7 @@ #include #include "ena_netdev.h" +#include "ena_xdp.h" struct ena_stats { char name[ETH_GSTRING_LEN]; @@ -74,6 +75,10 @@ static const struct ena_stats ena_stats_tx_strings[] = { ENA_STAT_TX_ENTRY(llq_buffer_copy), ENA_STAT_TX_ENTRY(missed_tx), ENA_STAT_TX_ENTRY(unmask_interrupt), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_TX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_TX_ENTRY(xsk_wakeup_request), +#endif /* ENA_AF_XDP_SUPPORT */ }; static const struct ena_stats ena_stats_rx_strings[] = { @@ -82,7 +87,7 @@ static const struct ena_stats ena_stats_rx_strings[] = { ENA_STAT_RX_ENTRY(rx_copybreak_pkt), ENA_STAT_RX_ENTRY(csum_good), 
ENA_STAT_RX_ENTRY(refil_partial), - ENA_STAT_RX_ENTRY(bad_csum), + ENA_STAT_RX_ENTRY(csum_bad), ENA_STAT_RX_ENTRY(page_alloc_fail), ENA_STAT_RX_ENTRY(skb_alloc_fail), ENA_STAT_RX_ENTRY(dma_mapping_err), @@ -106,6 +111,10 @@ static const struct ena_stats ena_stats_rx_strings[] = { ENA_STAT_RX_ENTRY(lpc_warm_up), ENA_STAT_RX_ENTRY(lpc_full), ENA_STAT_RX_ENTRY(lpc_wrong_numa), +#ifdef ENA_AF_XDP_SUPPORT + ENA_STAT_RX_ENTRY(xsk_need_wakeup_set), + ENA_STAT_RX_ENTRY(zc_queue_pkt_copy), +#endif /* ENA_AF_XDP_SUPPORT */ }; static const struct ena_stats ena_stats_ena_com_strings[] = { @@ -415,7 +424,13 @@ static int ena_get_settings(struct net_device *netdev, #endif static int ena_get_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else struct ethtool_coalesce *coalesce) +#endif { struct ena_adapter *adapter = netdev_priv(net_dev); struct ena_com_dev *ena_dev = adapter->ena_dev; @@ -460,7 +475,13 @@ static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter * } static int ena_set_coalesce(struct net_device *net_dev, +#ifdef ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED + struct ethtool_coalesce *coalesce, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +#else struct ethtool_coalesce *coalesce) +#endif { struct ena_adapter *adapter = netdev_priv(net_dev); struct ena_com_dev *ena_dev = adapter->ena_dev; @@ -522,7 +543,13 @@ static void ena_get_drvinfo(struct net_device *dev, } static void ena_get_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else struct ethtool_ringparam *ring) +#endif { struct ena_adapter *adapter = netdev_priv(netdev); @@ -533,7 +560,13 @@ static void ena_get_ringparam(struct net_device *netdev, } static int ena_set_ringparam(struct net_device *netdev, +#ifdef ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +#else struct ethtool_ringparam *ring) +#endif { struct ena_adapter *adapter = netdev_priv(netdev); u32 new_tx_size, new_rx_size; @@ -859,6 +892,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) { struct ena_adapter *adapter = netdev_priv(netdev); + enum ena_admin_hash_functions ena_func; int rc; rc = ena_indirection_table_get(adapter, indir); @@ -985,6 +1019,11 @@ static int ena_set_channels(struct net_device *netdev, if (count > adapter->max_num_io_queues) return -EINVAL; + if (count != adapter->num_io_queues && ena_is_zc_q_exist(adapter)) { + netdev_err(adapter->netdev, + "Changing channel count not supported with xsk pool loaded\n"); + return -EOPNOTSUPP; + } return ena_update_queue_count(adapter, count); } diff --git a/drivers/amazon/net/ena/ena_lpc.c b/drivers/amazon/net/ena/ena_lpc.c index 7e9c9aa1166d9..64c3d2d24f398 100644 --- a/drivers/amazon/net/ena/ena_lpc.c +++ b/drivers/amazon/net/ena/ena_lpc.c @@ -3,18 +3,10 @@ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "ena_lpc.h" +#include "ena_xdp.h" static void ena_free_ring_page_cache(struct ena_ring *rx_ring); -/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ -static void ena_increase_stat(u64 *statp, u64 cnt, - struct u64_stats_sync *syncp) -{ - u64_stats_update_begin(syncp); - (*statp) += cnt; - u64_stats_update_end(syncp); -} - static void ena_put_unmap_cache_page(struct ena_ring *rx_ring, struct ena_page *ena_page) { dma_unmap_page(rx_ring->dev, ena_page->dma_addr, ENA_PAGE_SIZE, diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 5c96ec35a74fd..36421b2684d3e 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -23,14 +23,14 @@ #include #include "ena_netdev.h" -#ifdef ENA_XDP_SUPPORT -#include -#endif /* ENA_XDP_SUPPORT */ #include "ena_pci_id_tbl.h" #include "ena_sysfs.h" +#include "ena_xdp.h" #include "ena_lpc.h" +#include "ena_devlink.h" + static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); @@ -84,60 +84,11 @@ MODULE_DEVICE_TABLE(pci, ena_pci_tbl); static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); -static void ena_destroy_device(struct ena_adapter *adapter, bool graceful); -static int ena_restore_device(struct ena_adapter *adapter); static void ena_calc_io_queue_size(struct ena_adapter *adapter, struct ena_com_dev_get_features_ctx *get_feat_ctx); static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, struct net_device *netdev); -#ifdef ENA_XDP_SUPPORT -static void ena_init_io_rings(struct ena_adapter *adapter, - int first_index, int count); -static void ena_init_napi_in_range(struct ena_adapter *adapter, int first_index, - int count); -static void ena_del_napi_in_range(struct ena_adapter *adapter, int first_index, - int count); -static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid); -static int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, - int first_index, - int count); -static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid); -static void ena_free_tx_resources(struct ena_adapter *adapter, int qid); -static int ena_clean_xdp_irq(struct ena_ring *xdp_ring, u32 budget); -static void ena_destroy_all_tx_queues(struct ena_adapter *adapter); -static void ena_free_all_io_tx_resources(struct ena_adapter *adapter); -static void ena_napi_disable_in_range(struct ena_adapter *adapter, - int first_index, int count); -static void ena_napi_enable_in_range(struct ena_adapter *adapter, - int first_index, int count); -static int ena_up(struct ena_adapter *adapter); -static void ena_down(struct ena_adapter *adapter); -static void ena_unmask_interrupt(struct ena_ring *tx_ring, - struct ena_ring *rx_ring); -static void ena_update_ring_numa_node(struct ena_ring *tx_ring, - struct ena_ring *rx_ring); -static void ena_unmap_tx_buff(struct ena_ring *tx_ring, - struct ena_tx_buffer *tx_info); -static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, - int first_index, int count); -#endif /* ENA_XDP_SUPPORT */ - -/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ -static void ena_increase_stat(u64 *statp, u64 cnt, - struct u64_stats_sync *syncp) -{ - u64_stats_update_begin(syncp); - (*statp) += cnt; - u64_stats_update_end(syncp); -} - -static void ena_ring_tx_doorbell(struct ena_ring *tx_ring) -{ - 
ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); - ena_increase_stat(&tx_ring->tx_stats.doorbells, 1, &tx_ring->syncp); -} - #ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) #else @@ -153,7 +104,7 @@ static void ena_tx_timeout(struct net_device *dev) if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return; - adapter->reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; + ena_reset_device(adapter, ENA_REGS_RESET_OS_NETDEV_WD); ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); netif_err(adapter, tx_err, dev, "Transmit time out\n"); @@ -193,19 +144,18 @@ static int ena_change_mtu(struct net_device *dev, int new_mtu) return ret; } -static int ena_xmit_common(struct net_device *dev, - struct ena_ring *ring, - struct ena_tx_buffer *tx_info, - struct ena_com_tx_ctx *ena_tx_ctx, - u16 next_to_use, - u32 bytes) +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes) { - struct ena_adapter *adapter = netdev_priv(dev); int rc, nb_hw_desc; if (unlikely(ena_com_is_doorbell_needed(ring->ena_com_io_sq, ena_tx_ctx))) { - netif_dbg(adapter, tx_queued, dev, + netif_dbg(adapter, tx_queued, adapter->netdev, "llq tx max burst size of queue %d achieved, writing doorbell to send burst\n", ring->qid); ena_ring_tx_doorbell(ring); @@ -220,15 +170,13 @@ static int ena_xmit_common(struct net_device *dev, * ena_com_prepare_tx() are fatal and therefore require a device reset. */ if (unlikely(rc)) { - netif_err(adapter, tx_queued, dev, + netif_err(adapter, tx_queued, adapter->netdev, "Failed to prepare tx bufs\n"); ena_increase_stat(&ring->tx_stats.prepare_ctx_err, 1, &ring->syncp); - if (rc != -ENOMEM) { - adapter->reset_reason = - ENA_REGS_RESET_DRIVER_INVALID_STATE; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); - } + if (rc != -ENOMEM) + ena_reset_device(adapter, + ENA_REGS_RESET_DRIVER_INVALID_STATE); return rc; } @@ -238,6 +186,7 @@ static int ena_xmit_common(struct net_device *dev, u64_stats_update_end(&ring->syncp); tx_info->tx_descs = nb_hw_desc; + tx_info->total_tx_size = bytes; tx_info->last_jiffies = jiffies; tx_info->print_once = 0; @@ -246,486 +195,6 @@ static int ena_xmit_common(struct net_device *dev, return 0; } -#ifdef ENA_XDP_SUPPORT -/* This is the XDP napi callback. XDP queues use a separate napi callback - * than Rx/Tx queues. 
- */ -static int ena_xdp_io_poll(struct napi_struct *napi, int budget) -{ - struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); - u32 xdp_work_done, xdp_budget; - struct ena_ring *xdp_ring; - int napi_comp_call = 0; - int ret; - - xdp_ring = ena_napi->xdp_ring; - - xdp_budget = budget; - - if (!test_bit(ENA_FLAG_DEV_UP, &xdp_ring->adapter->flags) || - test_bit(ENA_FLAG_TRIGGER_RESET, &xdp_ring->adapter->flags)) { - napi_complete_done(napi, 0); - return 0; - } - - xdp_work_done = ena_clean_xdp_irq(xdp_ring, xdp_budget); - - /* If the device is about to reset or down, avoid unmask - * the interrupt and return 0 so NAPI won't reschedule - */ - if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &xdp_ring->adapter->flags))) { - napi_complete_done(napi, 0); - ret = 0; - } else if (xdp_budget > xdp_work_done) { - napi_comp_call = 1; - if (napi_complete_done(napi, xdp_work_done)) - ena_unmask_interrupt(xdp_ring, NULL); - ena_update_ring_numa_node(xdp_ring, NULL); - ret = xdp_work_done; - } else { - ret = xdp_budget; - } - - u64_stats_update_begin(&xdp_ring->syncp); - xdp_ring->tx_stats.napi_comp += napi_comp_call; - xdp_ring->tx_stats.tx_poll++; - u64_stats_update_end(&xdp_ring->syncp); - xdp_ring->tx_stats.last_napi_jiffies = jiffies; - - return ret; -} - -static int ena_xdp_tx_map_frame(struct ena_ring *xdp_ring, - struct ena_tx_buffer *tx_info, - struct xdp_frame *xdpf, - struct ena_com_tx_ctx *ena_tx_ctx) -{ - struct ena_adapter *adapter = xdp_ring->adapter; - struct ena_com_buf *ena_buf; - int push_len = 0; - dma_addr_t dma; - void *data; - u32 size; - - tx_info->xdpf = xdpf; - data = tx_info->xdpf->data; - size = tx_info->xdpf->len; - - if (xdp_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { - /* Designate part of the packet for LLQ */ - push_len = min_t(u32, size, xdp_ring->tx_max_header_size); - - ena_tx_ctx->push_header = data; - - size -= push_len; - data += push_len; - } - - ena_tx_ctx->header_len = push_len; - - if (size > 0) { - dma = dma_map_single(xdp_ring->dev, - data, - size, - DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(xdp_ring->dev, dma))) - goto error_report_dma_error; - - tx_info->map_linear_data = 0; - - ena_buf = tx_info->bufs; - ena_buf->paddr = dma; - ena_buf->len = size; - - ena_tx_ctx->ena_bufs = ena_buf; - ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; - } - - return 0; - -error_report_dma_error: - ena_increase_stat(&xdp_ring->tx_stats.dma_mapping_err, 1, - &xdp_ring->syncp); - netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); - - return -EINVAL; -} - -static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, - struct net_device *dev, - struct xdp_frame *xdpf, - int flags) -{ - struct ena_com_tx_ctx ena_tx_ctx = {}; - struct ena_tx_buffer *tx_info; - u16 next_to_use, req_id; - int rc; - - next_to_use = xdp_ring->next_to_use; - req_id = xdp_ring->free_ids[next_to_use]; - tx_info = &xdp_ring->tx_buffer_info[req_id]; - tx_info->num_of_bufs = 0; - - rc = ena_xdp_tx_map_frame(xdp_ring, tx_info, xdpf, &ena_tx_ctx); - if (unlikely(rc)) - return rc; - - ena_tx_ctx.req_id = req_id; - - rc = ena_xmit_common(dev, - xdp_ring, - tx_info, - &ena_tx_ctx, - next_to_use, - xdpf->len); - if (rc) - goto error_unmap_dma; - - /* trigger the dma engine. ena_ring_tx_doorbell() - * calls a memory barrier inside it. 
- */ - if (flags & XDP_XMIT_FLUSH) - ena_ring_tx_doorbell(xdp_ring); - - return rc; - -error_unmap_dma: - ena_unmap_tx_buff(xdp_ring, tx_info); - tx_info->xdpf = NULL; - return rc; -} - -static int ena_xdp_xmit(struct net_device *dev, int n, - struct xdp_frame **frames, u32 flags) -{ - struct ena_adapter *adapter = netdev_priv(dev); - struct ena_ring *xdp_ring; - int qid, i, nxmit = 0; - - if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) - return -EINVAL; - - if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) - return -ENETDOWN; - - /* We assume that all rings have the same XDP program */ - if (!READ_ONCE(adapter->rx_ring->xdp_bpf_prog)) - return -ENXIO; - - qid = smp_processor_id() % adapter->xdp_num_queues; - qid += adapter->xdp_first_ring; - xdp_ring = &adapter->tx_ring[qid]; - - /* Other CPU ids might try to send thorugh this queue */ - spin_lock(&xdp_ring->xdp_tx_lock); - - for (i = 0; i < n; i++) { - if (ena_xdp_xmit_frame(xdp_ring, dev, frames[i], 0)) - break; - nxmit++; - } - - /* Ring doorbell to make device aware of the packets */ - if (flags & XDP_XMIT_FLUSH) - ena_ring_tx_doorbell(xdp_ring); - - spin_unlock(&xdp_ring->xdp_tx_lock); - -#ifndef ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY - for (i = nxmit; unlikely(i < n); i++) - xdp_return_frame(frames[i]); - -#endif - /* Return number of packets sent */ - return nxmit; -} - -static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) -{ - struct bpf_prog *xdp_prog; - struct ena_ring *xdp_ring; - u32 verdict = XDP_PASS; - struct xdp_frame *xdpf; - u64 *xdp_stat; - - rcu_read_lock(); - xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); - - if (!xdp_prog) - goto out; - - verdict = bpf_prog_run_xdp(xdp_prog, xdp); - - switch (verdict) { - case XDP_TX: -#ifdef XDP_CONVERT_TO_FRAME_NAME_CHANGED - xdpf = xdp_convert_buff_to_frame(xdp); -#else - xdpf = convert_to_xdp_frame(xdp); -#endif - if (unlikely(!xdpf)) { - trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); - xdp_stat = &rx_ring->rx_stats.xdp_aborted; - verdict = XDP_ABORTED; - break; - } - - /* Find xmit queue */ - xdp_ring = rx_ring->xdp_ring; - - /* The XDP queues are shared between XDP_TX and XDP_REDIRECT */ - spin_lock(&xdp_ring->xdp_tx_lock); - - if (ena_xdp_xmit_frame(xdp_ring, rx_ring->netdev, xdpf, - XDP_XMIT_FLUSH)) - xdp_return_frame(xdpf); - - spin_unlock(&xdp_ring->xdp_tx_lock); - xdp_stat = &rx_ring->rx_stats.xdp_tx; - break; - case XDP_REDIRECT: - if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) { - xdp_stat = &rx_ring->rx_stats.xdp_redirect; - break; - } - trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); - xdp_stat = &rx_ring->rx_stats.xdp_aborted; - verdict = XDP_ABORTED; - break; - case XDP_ABORTED: - trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); - xdp_stat = &rx_ring->rx_stats.xdp_aborted; - break; - case XDP_DROP: - xdp_stat = &rx_ring->rx_stats.xdp_drop; - break; - case XDP_PASS: - xdp_stat = &rx_ring->rx_stats.xdp_pass; - break; - default: - bpf_warn_invalid_xdp_action(verdict); - xdp_stat = &rx_ring->rx_stats.xdp_invalid; - } - - ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); -out: - rcu_read_unlock(); - - return verdict; -} - -static void ena_init_all_xdp_queues(struct ena_adapter *adapter) -{ - adapter->xdp_first_ring = adapter->num_io_queues; - adapter->xdp_num_queues = adapter->num_io_queues; - - ena_init_io_rings(adapter, - adapter->xdp_first_ring, - adapter->xdp_num_queues); -} - -static int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) -{ - int rc = 0; - - rc = 
ena_setup_tx_resources_in_range(adapter, adapter->xdp_first_ring, - adapter->xdp_num_queues); - if (rc) - goto setup_err; - - rc = ena_create_io_tx_queues_in_range(adapter, - adapter->xdp_first_ring, - adapter->xdp_num_queues); - if (rc) - goto create_err; - - return 0; - -create_err: - ena_free_all_io_tx_resources(adapter); -setup_err: - return rc; -} - -/* Provides a way for both kernel and bpf-prog to know - * more about the RX-queue a given XDP frame arrived on. - */ -static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) -{ - int rc; - -#ifdef AF_XDP_BUSY_POLL_SUPPORTED - rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0); -#else - rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); -#endif - - if (rc) { - netif_err(rx_ring->adapter, ifup, rx_ring->netdev, - "Failed to register xdp rx queue info. RX queue num %d rc: %d\n", - rx_ring->qid, rc); - goto err; - } - - rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, - NULL); - - if (rc) { - netif_err(rx_ring->adapter, ifup, rx_ring->netdev, - "Failed to register xdp rx queue info memory model. RX queue num %d rc: %d\n", - rx_ring->qid, rc); - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - } - -err: - return rc; -} - -static void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) -{ - xdp_rxq_info_unreg_mem_model(&rx_ring->xdp_rxq); - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); -} - -static void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, - struct bpf_prog *prog, - int first, int count) -{ - struct ena_ring *rx_ring; - int i = 0; - - for (i = first; i < count; i++) { - rx_ring = &adapter->rx_ring[i]; - xchg(&rx_ring->xdp_bpf_prog, prog); - if (prog) { - ena_xdp_register_rxq_info(rx_ring); - rx_ring->rx_headroom = XDP_PACKET_HEADROOM; - } else { - ena_xdp_unregister_rxq_info(rx_ring); - rx_ring->rx_headroom = NET_SKB_PAD; - } - } -} - -static void ena_xdp_exchange_program(struct ena_adapter *adapter, - struct bpf_prog *prog) -{ - struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); - - ena_xdp_exchange_program_rx_in_range(adapter, - prog, - 0, - adapter->num_io_queues); - - if (old_bpf_prog) - bpf_prog_put(old_bpf_prog); -} - -static int ena_destroy_and_free_all_xdp_queues(struct ena_adapter *adapter) -{ - bool was_up; - int rc; - - was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); - - if (was_up) - ena_down(adapter); - - adapter->xdp_first_ring = 0; - adapter->xdp_num_queues = 0; - ena_xdp_exchange_program(adapter, NULL); - if (was_up) { - rc = ena_up(adapter); - if (rc) - return rc; - } - return 0; -} - -static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) -{ - struct ena_adapter *adapter = netdev_priv(netdev); - struct bpf_prog *prog = bpf->prog; - struct bpf_prog *old_bpf_prog; - int rc, prev_mtu; - bool is_up; - - is_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); - rc = ena_xdp_allowed(adapter); - if (rc == ENA_XDP_ALLOWED) { - old_bpf_prog = adapter->xdp_bpf_prog; - if (prog) { - if (!is_up) { - ena_init_all_xdp_queues(adapter); - } else if (!old_bpf_prog) { - ena_down(adapter); - ena_init_all_xdp_queues(adapter); - } - ena_xdp_exchange_program(adapter, prog); - - if (is_up && !old_bpf_prog) { - rc = ena_up(adapter); - if (rc) - return rc; - } - } else if (old_bpf_prog) { - rc = ena_destroy_and_free_all_xdp_queues(adapter); - if (rc) - return rc; - } - - prev_mtu = netdev->max_mtu; - netdev->max_mtu = prog ? 
ENA_XDP_MAX_MTU : adapter->max_mtu; - - if (!old_bpf_prog) - netif_info(adapter, drv, adapter->netdev, - "XDP program is set, changing the max_mtu from %d to %d", - prev_mtu, netdev->max_mtu); - - } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) { - netif_err(adapter, drv, adapter->netdev, - "Failed to set xdp program, the current MTU (%d) is larger than the maximum allowed MTU (%lu) while xdp is on", - netdev->mtu, ENA_XDP_MAX_MTU); - NL_SET_ERR_MSG_MOD(bpf->extack, - "Failed to set xdp program, the current MTU is larger than the maximum allowed MTU. Check the dmesg for more info"); - return -EINVAL; - } else if (rc == ENA_XDP_NO_ENOUGH_QUEUES) { - netif_err(adapter, drv, adapter->netdev, - "Failed to set xdp program, the Rx/Tx channel count should be at most half of the maximum allowed channel count. The current queue count (%d), the maximal queue count (%d)\n", - adapter->num_io_queues, adapter->max_num_io_queues); - NL_SET_ERR_MSG_MOD(bpf->extack, - "Failed to set xdp program, there is no enough space for allocating XDP queues, Check the dmesg for more info"); - return -EINVAL; - } - - return 0; -} - -/* This is the main xdp callback, it's used by the kernel to set/unset the xdp - * program as well as to query the current xdp program id. - */ -static int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) -{ -#ifndef ENA_XDP_QUERY_IN_KERNEL - struct ena_adapter *adapter = netdev_priv(netdev); - -#endif /* ENA_XDP_QUERY_IN_KERNEL */ - switch (bpf->command) { - case XDP_SETUP_PROG: - return ena_xdp_set(netdev, bpf); -#ifndef ENA_XDP_QUERY_IN_KERNEL - case XDP_QUERY_PROG: - bpf->prog_id = adapter->xdp_bpf_prog ? - adapter->xdp_bpf_prog->aux->id : 0; - break; -#endif - default: - return -EINVAL; - } - return 0; -} -#endif /* ENA_XDP_SUPPORT */ - static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) { #ifdef CONFIG_RFS_ACCEL @@ -766,12 +235,13 @@ static void ena_init_io_rings_common(struct ena_adapter *adapter, ring->ena_dev = adapter->ena_dev; ring->per_napi_packets = 0; ring->cpu = 0; + ring->numa_node = 0; ring->no_interrupt_event_cnt = 0; u64_stats_init(&ring->syncp); } -static void ena_init_io_rings(struct ena_adapter *adapter, - int first_index, int count) +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count) { struct ena_com_dev *ena_dev; struct ena_ring *txr, *rxr; @@ -874,6 +344,7 @@ static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; tx_ring->cpu = ena_irq->cpu; + tx_ring->numa_node = node; return 0; err_push_buf_intermediate_buf: @@ -906,9 +377,8 @@ static void ena_free_tx_resources(struct ena_adapter *adapter, int qid) tx_ring->push_buf_intermediate_buf = NULL; } -static int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, - int first_index, - int count) +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count) { int i, rc = 0; @@ -931,7 +401,7 @@ static int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, return rc; } -static void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, +void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, int first_index, int count) { int i; @@ -945,7 +415,7 @@ static void ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, * * Free all transmit software resources */ -static void ena_free_all_io_tx_resources(struct ena_adapter *adapter) +void ena_free_all_io_tx_resources(struct ena_adapter *adapter) { 
ena_free_all_io_tx_resources_in_range(adapter, 0, @@ -1009,6 +479,7 @@ static int ena_setup_rx_resources(struct ena_adapter *adapter, rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; rx_ring->cpu = ena_irq->cpu; + rx_ring->numa_node = node; return 0; } @@ -1111,12 +582,32 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, int tailroom; /* restore page offset value in case it has been changed by device */ - rx_info->page_offset = headroom; + rx_info->buf_offset = headroom; /* if previous allocated page is not used */ if (unlikely(rx_info->page)) return 0; + tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + ena_buf = &rx_info->ena_buf; + +#ifdef ENA_AF_XDP_SUPPORT + if (unlikely(ENA_IS_XSK_RING(rx_ring))) { + struct xdp_buff *xdp; + + xdp = xsk_buff_alloc(rx_ring->xsk_pool); + if (!xdp) + return -ENOMEM; + + ena_buf->paddr = xsk_buff_xdp_get_dma(xdp); + ena_buf->len = xsk_pool_get_rx_frame_size(rx_ring->xsk_pool); + + rx_info->xdp = xdp; + + return 0; + } +#endif /* ENA_AF_XDP_SUPPORT */ + /* We handle DMA here */ page = ena_lpc_get_page(rx_ring, &dma, &rx_info->is_lpc_page); if (unlikely(IS_ERR(page))) @@ -1125,11 +616,9 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "Allocate page %p, rx_info %p\n", page, rx_info); - tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - rx_info->page = page; rx_info->dma_addr = dma; - ena_buf = &rx_info->ena_buf; + rx_info->page_offset = 0; ena_buf->paddr = dma + headroom; ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom; @@ -1165,7 +654,7 @@ static void ena_free_rx_page(struct ena_ring *rx_ring, rx_info->page = NULL; } -static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) { u16 next_to_use, req_id; u32 i; @@ -1182,9 +671,10 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_buffer(rx_ring, rx_info); if (unlikely(rc < 0)) { - netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, - "Failed to allocate buffer for rx queue %d\n", - rx_ring->qid); + if (!ENA_IS_XSK_RING(rx_ring)) + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate buffer for rx queue %d\n", + rx_ring->qid); break; } rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq, @@ -1203,9 +693,10 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) if (unlikely(i < num)) { ena_increase_stat(&rx_ring->rx_stats.refil_partial, 1, &rx_ring->syncp); - netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, - "Refilled rx qid %d with only %d buffers (from %d)\n", - rx_ring->qid, i, num); + if (!ENA_IS_XSK_RING(rx_ring)) + netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, + "Refilled rx qid %d with only %d buffers (from %d)\n", + rx_ring->qid, i, num); } /* ena_com_write_sq_doorbell issues a wmb() */ @@ -1223,6 +714,11 @@ static void ena_free_rx_bufs(struct ena_adapter *adapter, struct ena_ring *rx_ring = &adapter->rx_ring[qid]; u32 i; + if (ENA_IS_XSK_RING(rx_ring)) { + ena_xdp_free_rx_bufs_zc(adapter, qid); + return; + } + for (i = 0; i < rx_ring->ring_size; i++) { struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; @@ -1259,8 +755,8 @@ static void ena_free_all_rx_bufs(struct ena_adapter *adapter) ena_free_rx_bufs(adapter, i); } -static void ena_unmap_tx_buff(struct ena_ring *tx_ring, - struct ena_tx_buffer *tx_info) +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info) { struct ena_com_buf *ena_buf; u32 cnt; @@ -1329,6 
+825,10 @@ static void ena_free_all_tx_bufs(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { tx_ring = &adapter->tx_ring[i]; + if (ENA_IS_XSK_RING(tx_ring)) { + ena_xdp_free_tx_bufs_zc(tx_ring); + continue; + } ena_free_tx_bufs(tx_ring); } } @@ -1352,6 +852,7 @@ static void ena_destroy_all_rx_queues(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues; i++) { ena_qid = ENA_IO_RXQ_IDX(i); cancel_work_sync(&adapter->ena_napi[i].dim.work); + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); ena_com_destroy_io_queue(adapter->ena_dev, ena_qid); } } @@ -1362,8 +863,8 @@ static void ena_destroy_all_io_queues(struct ena_adapter *adapter) ena_destroy_all_rx_queues(adapter); } -static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, - struct ena_tx_buffer *tx_info, bool is_xdp) +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp) { if (tx_info) netif_err(ring->adapter, @@ -1379,10 +880,8 @@ static int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, req_id, ring->qid); ena_increase_stat(&ring->tx_stats.bad_req_id, 1, &ring->syncp); + ena_reset_device(ring->adapter, ENA_REGS_RESET_INV_TX_REQ_ID); - /* Trigger device reset */ - ring->adapter->reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID; - set_bit(ENA_FLAG_TRIGGER_RESET, &ring->adapter->flags); return -EFAULT; } @@ -1397,19 +896,6 @@ static int validate_tx_req_id(struct ena_ring *tx_ring, u16 req_id) return handle_invalid_req_id(tx_ring, req_id, tx_info, false); } -#ifdef ENA_XDP_SUPPORT -static int validate_xdp_req_id(struct ena_ring *xdp_ring, u16 req_id) -{ - struct ena_tx_buffer *tx_info; - - tx_info = &xdp_ring->tx_buffer_info[req_id]; - if (likely(tx_info->xdpf)) - return 0; - - return handle_invalid_req_id(xdp_ring, req_id, tx_info, true); -} -#endif /* ENA_XDP_SUPPORT */ - static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) { struct netdev_queue *txq; @@ -1457,7 +943,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) "tx_poll: q %d skb %p completed\n", tx_ring->qid, skb); - tx_bytes += skb->len; + tx_bytes += tx_info->total_tx_size; dev_kfree_skb(skb); tx_pkts++; total_done += tx_info->tx_descs; @@ -1502,21 +988,21 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) return tx_pkts; } -static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag) +static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, u16 len) { struct sk_buff *skb; -#ifdef ENA_LINEAR_FRAG_SUPPORTED +#ifdef ENA_LINEAR_FRAG_SUPPORTED if (!first_frag) - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, - rx_ring->rx_copybreak); + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); else - skb = build_skb(first_frag, ENA_PAGE_SIZE); + skb = build_skb(first_frag, len); #else - u32 linear_size = max_t(u32, ENA_SKB_PULL_MIN_LEN, rx_ring->rx_copybreak); - - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, - linear_size); + if (!first_frag) + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + else + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, + ENA_SKB_PULL_MIN_LEN); #endif if (unlikely(!skb)) { @@ -1526,25 +1012,48 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag) netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "Failed to allocate skb. first_frag %s\n", first_frag ? 
"provided" : "not provided"); - return NULL; } return skb; } +static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, + u16 buf_len, u16 len) +{ + struct ena_com_buf *ena_buf = &rx_info->ena_buf; + + /* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer + * for data + headroom + tailroom + */ + if (SKB_DATA_ALIGN(len) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) { + page_ref_inc(rx_info->page); + rx_info->page_offset += buf_len; + ena_buf->paddr += buf_len; + ena_buf->len -= buf_len; + return true; + } + + return false; +} + static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, struct ena_com_rx_buf_info *ena_bufs, u32 descs, u16 *next_to_clean) { + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + bool is_xdp_loaded = ena_xdp_present_ring(rx_ring); struct ena_rx_buffer *rx_info; struct ena_adapter *adapter; + int page_offset, pkt_offset; u16 len, req_id, buf = 0; + bool reuse_rx_buf_page; struct sk_buff *skb; - void *page_addr; - u32 page_offset; - void *data_addr; + void *buf_addr; + int buf_offset; + u16 buf_len; #ifndef ENA_LINEAR_FRAG_SUPPORTED + void *data_addr; u16 hlen; #endif @@ -1558,9 +1067,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, netif_err(adapter, rx_err, rx_ring->netdev, "Page is NULL. qid %u req_id %u\n", rx_ring->qid, req_id); ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); - adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; - smp_mb__before_atomic(); - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); return NULL; } @@ -1568,34 +1075,31 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, "rx_info %p page %p\n", rx_info, rx_info->page); - /* save virt address of first buffer */ - page_addr = page_address(rx_info->page); + buf_offset = rx_info->buf_offset; page_offset = rx_info->page_offset; - data_addr = page_addr + page_offset; - - prefetch(data_addr); + buf_addr = page_address(rx_info->page) + page_offset; if (len <= rx_ring->rx_copybreak) { - skb = ena_alloc_skb(rx_ring, NULL); + skb = ena_alloc_skb(rx_ring, NULL, len); if (unlikely(!skb)) return NULL; - netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, - "RX allocated small packet. len %d. data_len %d\n", - skb->len, skb->data_len); + pkt_offset = buf_offset - rx_ring->rx_headroom; /* sync this buffer for CPU use */ dma_sync_single_for_cpu(rx_ring->dev, - dma_unmap_addr(&rx_info->ena_buf, paddr), + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, len, DMA_FROM_DEVICE); - skb_copy_to_linear_data(skb, data_addr, len); + skb_copy_to_linear_data(skb, buf_addr + buf_offset, len); dma_sync_single_for_device(rx_ring->dev, - dma_unmap_addr(&rx_info->ena_buf, paddr), + dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, len, DMA_FROM_DEVICE); skb_put(skb, len); + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "RX allocated small packet. 
len %d.\n", skb->len); #ifdef ENA_BUSY_POLL_SUPPORT skb_mark_napi_id(skb, rx_ring->napi); #endif @@ -1606,17 +1110,26 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, return skb; } - ena_unmap_rx_buff(rx_ring, rx_info); + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); - skb = ena_alloc_skb(rx_ring, page_addr); + /* If XDP isn't loaded try to reuse part of the RX buffer */ + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len); + + if (!reuse_rx_buf_page) + ena_unmap_rx_buff(rx_ring, rx_info); + + skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) return NULL; #ifdef ENA_LINEAR_FRAG_SUPPORTED /* Populate skb's linear part */ - skb_reserve(skb, page_offset); + skb_reserve(skb, buf_offset); skb_put(skb, len); #else + data_addr = buf_addr + buf_offset; + /* GRO expects us to have the ethernet header in the linear part. * Copy the first ENA_SKB_PULL_MIN_LEN bytes because it is more * efficient. @@ -1625,7 +1138,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, memcpy(__skb_put(skb, hlen), data_addr, hlen); if (hlen < len) skb_add_rx_frag(skb, 0, rx_info->page, - page_offset + hlen, len - hlen, ENA_PAGE_SIZE); + page_offset + buf_offset + hlen, + len - hlen, buf_len); #endif skb->protocol = eth_type_trans(skb, rx_ring->netdev); @@ -1634,7 +1148,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, "RX skb updated. len %d. data_len %d\n", skb->len, skb->data_len); - rx_info->page = NULL; + if (!reuse_rx_buf_page) + rx_info->page = NULL; rx_ring->free_ids[*next_to_clean] = req_id; *next_to_clean = @@ -1649,10 +1164,19 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, rx_info = &rx_ring->rx_buffer_info[req_id]; - ena_unmap_rx_buff(rx_ring, rx_info); + /* rx_info->buf_offset includes rx_ring->rx_headroom */ + buf_offset = rx_info->buf_offset; + buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + page_offset = rx_info->page_offset; + + reuse_rx_buf_page = !is_xdp_loaded && + ena_try_rx_buf_page_reuse(rx_info, buf_len, len); + + if (!reuse_rx_buf_page) + ena_unmap_rx_buff(rx_ring, rx_info); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, - rx_info->page_offset, len, ENA_PAGE_SIZE); + page_offset + buf_offset, len, buf_len); } while (1); @@ -1668,9 +1192,9 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, * @ena_rx_ctx: received packet context/metadata * @skb: skb currently being received and modified */ -static void ena_rx_checksum(struct ena_ring *rx_ring, - struct ena_com_rx_ctx *ena_rx_ctx, - struct sk_buff *skb) +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) { /* Rx csum disabled */ if (unlikely(!(rx_ring->netdev->features & NETIF_F_RXCSUM))) { @@ -1689,7 +1213,7 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, (ena_rx_ctx->l3_csum_err))) { /* ipv4 checksum error */ skb->ip_summed = CHECKSUM_NONE; - ena_increase_stat(&rx_ring->rx_stats.bad_csum, 1, + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX IPv4 header checksum error\n"); @@ -1701,7 +1225,7 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, (ena_rx_ctx->l4_proto == ENA_ETH_IO_L4_PROTO_UDP))) { if (unlikely(ena_rx_ctx->l4_csum_err)) { /* TCP/UDP checksum error */ - ena_increase_stat(&rx_ring->rx_stats.bad_csum, 1, + ena_increase_stat(&rx_ring->rx_stats.csum_bad, 1, &rx_ring->syncp); netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev, "RX L4 
checksum error\n"); @@ -1725,9 +1249,9 @@ static void ena_rx_checksum(struct ena_ring *rx_ring, } -static void ena_set_rx_hash(struct ena_ring *rx_ring, - struct ena_com_rx_ctx *ena_rx_ctx, - struct sk_buff *skb) +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb) { #ifdef NETIF_F_RXHASH enum pkt_hash_types hash_type; @@ -1757,24 +1281,25 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; xdp_prepare_buff(xdp, page_address(rx_info->page), - rx_info->page_offset, + rx_info->buf_offset, rx_ring->ena_bufs[0].len, false); /* If for some reason we received a bigger packet than * we expect, then we simply drop it */ if (unlikely(rx_ring->ena_bufs[0].len > ENA_XDP_MAX_MTU)) - return XDP_DROP; + return ENA_XDP_DROP; ret = ena_xdp_execute(rx_ring, xdp); /* The xdp program might expand the headers */ - if (ret == XDP_PASS) { - rx_info->page_offset = xdp->data - xdp->data_hard_start; + if (ret == ENA_XDP_PASS) { + rx_info->buf_offset = xdp->data - xdp->data_hard_start; rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data; } return ret; } + #endif /* ENA_XDP_SUPPORT */ /* ena_clean_rx_irq - Cleanup RX irq * @rx_ring: RX ring to clean @@ -1815,7 +1340,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, do { #ifdef ENA_XDP_SUPPORT - xdp_verdict = XDP_PASS; + xdp_verdict = ENA_XDP_PASS; skb = NULL; #endif /* ENA_XDP_SUPPORT */ ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; @@ -1833,7 +1358,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, /* First descriptor might have an offset set by the device */ rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - rx_info->page_offset += ena_rx_ctx.pkt_offset; + rx_info->buf_offset += ena_rx_ctx.pkt_offset; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", @@ -1845,7 +1370,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp); /* allocate skb and fill it */ - if (xdp_verdict == XDP_PASS) + if (xdp_verdict == ENA_XDP_PASS) skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, @@ -1868,7 +1393,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, /* Packets was passed for transmission, unmap it * from RX side. 
*/ - if (xdp_verdict == XDP_TX || xdp_verdict == XDP_REDIRECT) { + if (xdp_verdict & ENA_XDP_FORWARDED) { ena_unmap_rx_buff(rx_ring, &rx_ring->rx_buffer_info[req_id]); rx_ring->rx_buffer_info[req_id].page = NULL; @@ -1876,8 +1401,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, #endif /* ENA_XDP_SUPPORT */ } #ifdef ENA_XDP_SUPPORT - if (xdp_verdict != XDP_PASS) { + if (xdp_verdict != ENA_XDP_PASS) { xdp_flags |= xdp_verdict; + total_len += ena_rx_ctx.ena_bufs[0].len; res_budget--; continue; } @@ -1930,7 +1456,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, } #ifdef ENA_XDP_SUPPORT - if (xdp_flags & XDP_REDIRECT) + if (xdp_flags & ENA_XDP_REDIRECT) xdp_do_flush_map(); #endif @@ -1942,15 +1468,12 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, if (rc == -ENOSPC) { ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); - adapter->reset_reason = ENA_REGS_RESET_TOO_MANY_RX_DESCS; + ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); } else { ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); - adapter->reset_reason = ENA_REGS_RESET_INV_RX_REQ_ID; + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); } - - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); - return 0; } @@ -1985,8 +1508,8 @@ static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) rx_ring->per_napi_packets = 0; } -static void ena_unmask_interrupt(struct ena_ring *tx_ring, - struct ena_ring *rx_ring) +void ena_unmask_interrupt(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) { struct ena_eth_io_intr_reg intr_reg; u32 rx_interval = 0; @@ -2017,8 +1540,8 @@ static void ena_unmask_interrupt(struct ena_ring *tx_ring, ena_com_unmask_intr(tx_ring->ena_com_io_cq, &intr_reg); } -static void ena_update_ring_numa_node(struct ena_ring *tx_ring, - struct ena_ring *rx_ring) +void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring) { int cpu = get_cpu(); int numa_node; @@ -2027,89 +1550,32 @@ static void ena_update_ring_numa_node(struct ena_ring *tx_ring, if (likely(tx_ring->cpu == cpu)) goto out; + tx_ring->cpu = cpu; + if (rx_ring) + rx_ring->cpu = cpu; + numa_node = cpu_to_node(cpu); + + if (likely(tx_ring->numa_node == numa_node)) + goto out; + put_cpu(); if (numa_node != NUMA_NO_NODE) { ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); - if (rx_ring) + tx_ring->numa_node = numa_node; + if (rx_ring) { + rx_ring->numa_node = numa_node; ena_com_update_numa_node(rx_ring->ena_com_io_cq, numa_node); + } } - tx_ring->cpu = cpu; - if (rx_ring) - rx_ring->cpu = cpu; - return; out: put_cpu(); } -#ifdef ENA_XDP_SUPPORT -static int ena_clean_xdp_irq(struct ena_ring *xdp_ring, u32 budget) -{ - u32 total_done = 0; - u16 next_to_clean; - u32 tx_bytes = 0; - int tx_pkts = 0; - u16 req_id; - int rc; - - if (unlikely(!xdp_ring)) - return 0; - next_to_clean = xdp_ring->next_to_clean; - - while (tx_pkts < budget) { - struct ena_tx_buffer *tx_info; - struct xdp_frame *xdpf; - - rc = ena_com_tx_comp_req_id_get(xdp_ring->ena_com_io_cq, - &req_id); - if (rc) { - if (unlikely(rc == -EINVAL)) - handle_invalid_req_id(xdp_ring, req_id, NULL, - true); - break; - } - - /* validate that the request id points to a valid skb */ - rc = validate_xdp_req_id(xdp_ring, req_id); - if (rc) - break; - - tx_info = &xdp_ring->tx_buffer_info[req_id]; - xdpf = tx_info->xdpf; - - tx_info->xdpf = NULL; - tx_info->last_jiffies = 0; - ena_unmap_tx_buff(xdp_ring, 
tx_info); - - netif_dbg(xdp_ring->adapter, tx_done, xdp_ring->netdev, - "tx_poll: q %d skb %p completed\n", xdp_ring->qid, - xdpf); - - tx_bytes += xdpf->len; - tx_pkts++; - total_done += tx_info->tx_descs; - - xdp_return_frame(xdpf); - xdp_ring->free_ids[next_to_clean] = req_id; - next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, - xdp_ring->ring_size); - } - - xdp_ring->next_to_clean = next_to_clean; - ena_com_comp_ack(xdp_ring->ena_com_io_sq, total_done); - ena_com_update_dev_comp_head(xdp_ring->ena_com_io_cq); - - netif_dbg(xdp_ring->adapter, tx_done, xdp_ring->netdev, - "tx_poll: q %d done. total pkts: %d\n", - xdp_ring->qid, tx_pkts); - - return tx_pkts; -} -#endif /* ENA_XDP_SUPPORT */ static int ena_io_poll(struct napi_struct *napi, int budget) { @@ -2172,11 +1638,10 @@ static int ena_io_poll(struct napi_struct *napi, int budget) if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) ena_adjust_adaptive_rx_intr_moderation(ena_napi); + ena_update_ring_numa_node(tx_ring, rx_ring); ena_unmask_interrupt(tx_ring, rx_ring); } - ena_update_ring_numa_node(tx_ring, rx_ring); - ret = rx_work_done; } else { ret = budget; @@ -2271,6 +1736,7 @@ static int ena_enable_msix(struct ena_adapter *adapter) "Failed to enable MSI-X. irq_cnt %d\n", irq_cnt); #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) vfree(adapter->msix_entries); + adapter->msix_entries = NULL; #endif return -ENOSPC; } @@ -2476,8 +1942,8 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter, netif_napi_del(&adapter->ena_napi[i].napi); #ifdef ENA_XDP_SUPPORT - WARN_ON(!ENA_IS_XDP_INDEX(adapter, i) && - adapter->ena_napi[i].xdp_ring); + WARN_ON(ENA_IS_XDP_INDEX(adapter, i) && + adapter->ena_napi[i].rx_ring); #endif /* ENA_XDP_SUPPORT */ } #ifdef ENA_BUSY_POLL_SUPPORT @@ -2491,31 +1957,36 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, int first_index, int count) { int i; + int (*napi_handler)(struct napi_struct *napi, int budget); for (i = first_index; i < first_index + count; i++) { struct ena_napi *napi = &adapter->ena_napi[i]; + struct ena_ring *rx_ring, *tx_ring; - netif_napi_add(adapter->netdev, - &napi->napi, + memset(napi, 0, sizeof(*napi)); + + rx_ring = &adapter->rx_ring[i]; + tx_ring = &adapter->tx_ring[i]; + + napi_handler = ena_io_poll; #ifdef ENA_XDP_SUPPORT - ENA_IS_XDP_INDEX(adapter, i) ? 
ena_xdp_io_poll : ena_io_poll, -#else - ena_io_poll, + if (ENA_IS_XDP_INDEX(adapter, i) || ENA_IS_XSK_RING(rx_ring)) + napi_handler = ena_xdp_io_poll; #endif /* ENA_XDP_SUPPORT */ + + netif_napi_add(adapter->netdev, + &napi->napi, + napi_handler, ENA_NAPI_BUDGET); #ifdef ENA_BUSY_POLL_SUPPORT napi_hash_add(&adapter->ena_napi[i].napi); #endif /* ENA_BUSY_POLL_SUPPORT */ - if (!ENA_IS_XDP_INDEX(adapter, i)) { - napi->rx_ring = &adapter->rx_ring[i]; - napi->tx_ring = &adapter->tx_ring[i]; - } else { -#ifdef ENA_XDP_SUPPORT - napi->xdp_ring = &adapter->tx_ring[i]; -#endif /* ENA_XDP_SUPPORT */ - } + if (!ENA_IS_XDP_INDEX(adapter, i)) + napi->rx_ring = rx_ring; + + napi->tx_ring = tx_ring; napi->qid = i; } } @@ -2647,7 +2118,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) ctx.mem_queue_type = ena_dev->tx_mem_queue_type; ctx.msix_vector = msix_vector; ctx.queue_size = tx_ring->ring_size; - ctx.numa_node = cpu_to_node(tx_ring->cpu); + ctx.numa_node = tx_ring->numa_node; rc = ena_com_create_io_queue(ena_dev, &ctx); if (rc) { @@ -2672,8 +2143,8 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) return rc; } -static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, - int first_index, int count) +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count) { struct ena_com_dev *ena_dev = adapter->ena_dev; int rc, i; @@ -2715,7 +2186,7 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; ctx.msix_vector = msix_vector; ctx.queue_size = rx_ring->ring_size; - ctx.numa_node = cpu_to_node(rx_ring->cpu); + ctx.numa_node = rx_ring->numa_node; rc = ena_com_create_io_queue(ena_dev, &ctx); if (rc) { @@ -2753,12 +2224,15 @@ static int ena_create_all_io_rx_queues(struct ena_adapter *adapter) if (rc) goto create_err; INIT_WORK(&adapter->ena_napi[i].dim.work, ena_dim_work); + + ena_xdp_register_rxq_info(&adapter->rx_ring[i]); } return 0; create_err: while (i--) { + ena_xdp_unregister_rxq_info(&adapter->rx_ring[i]); cancel_work_sync(&adapter->ena_napi[i].dim.work); ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i)); } @@ -2889,7 +2363,7 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter) } } -static int ena_up(struct ena_adapter *adapter) +int ena_up(struct ena_adapter *adapter) { int io_queue_count, rc, i; @@ -2960,7 +2434,7 @@ static int ena_up(struct ena_adapter *adapter) return rc; } -static void ena_down(struct ena_adapter *adapter) +void ena_down(struct ena_adapter *adapter) { int io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; @@ -3392,7 +2866,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) /* set flags and meta data */ ena_tx_csum(&ena_tx_ctx, skb, tx_ring->disable_meta_caching); - rc = ena_xmit_common(dev, + rc = ena_xmit_common(adapter, tx_ring, tx_info, &ena_tx_ctx, @@ -3556,7 +3030,8 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK | ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK | - ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK; + ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK | + ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK; rc = ena_com_set_host_attributes(ena_dev); if (rc) { @@ -3809,6 +3284,9 @@ static const struct net_device_ops ena_netdev_ops = { #ifdef ENA_XDP_SUPPORT .ndo_bpf = ena_xdp, .ndo_xdp_xmit = ena_xdp_xmit, 
+#ifdef ENA_AF_XDP_SUPPORT + .ndo_xsk_wakeup = ena_xdp_xsk_wakeup, +#endif /* ENA_AF_XDP_SUPPORT */ #endif /* ENA_XDP_SUPPORT */ }; @@ -3839,12 +3317,20 @@ static void set_default_llq_configurations(struct ena_adapter *adapter, struct ena_llq_configurations *llq_config, struct ena_admin_feature_llq_desc *llq) { + struct ena_com_dev *ena_dev = adapter->ena_dev; + llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER; llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY; llq_config->llq_num_decs_before_header = ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2; + adapter->large_llq_header_supported = + !!(ena_dev->supported_features & (1 << ENA_ADMIN_LLQ)); + adapter->large_llq_header_supported &= + !!(llq->entry_size_ctrl_supported & + ENA_ADMIN_LIST_ENTRY_SIZE_256B); + if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && - adapter->large_llq_header) { + adapter->large_llq_header_enabled) { llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; llq_config->llq_ring_entry_size_value = 256; } else { @@ -3969,6 +3455,8 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, } #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0) */ + ena_devlink_params_get(adapter->devlink); + /* ENA admin level init */ rc = ena_com_admin_init(ena_dev, &aenq_handlers); if (rc) { @@ -4067,7 +3555,7 @@ static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter) return rc; } -static void ena_destroy_device(struct ena_adapter *adapter, bool graceful) +void ena_destroy_device(struct ena_adapter *adapter, bool graceful) { struct net_device *netdev = adapter->netdev; struct ena_com_dev *ena_dev = adapter->ena_dev; @@ -4114,7 +3602,7 @@ static void ena_destroy_device(struct ena_adapter *adapter, bool graceful) clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); } -static int ena_restore_device(struct ena_adapter *adapter) +int ena_restore_device(struct ena_adapter *adapter) { struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = adapter->ena_dev; @@ -4221,9 +3709,8 @@ static int check_for_rx_interrupt_queue(struct ena_adapter *adapter, netif_err(adapter, rx_err, adapter->netdev, "Potential MSIX issue on Rx side Queue = %d. Reset the device\n", rx_ring->qid); - adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; - smp_mb__before_atomic(); - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); return -EIO; } @@ -4260,9 +3747,7 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, netif_err(adapter, tx_err, adapter->netdev, "Potential MSIX issue on Tx side Queue = %d. Reset the device\n", tx_ring->qid); - adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT; - smp_mb__before_atomic(); - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + ena_reset_device(adapter, ENA_REGS_RESET_MISS_INTERRUPT); return -EIO; } @@ -4288,9 +3773,7 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, "The number of lost tx completions is above the threshold (%d > %d). 
Reset the device\n", missed_tx, adapter->missing_tx_completion_threshold); - adapter->reset_reason = - ENA_REGS_RESET_MISS_TX_CMPL; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + ena_reset_device(adapter, ENA_REGS_RESET_MISS_TX_CMPL); rc = -EIO; } @@ -4373,6 +3856,12 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues; i++) { rx_ring = &adapter->rx_ring[i]; + /* If using UMEM, app might not provide RX buffers and the ring + * can be empty + */ + if (ENA_IS_XSK_RING(rx_ring)) + continue; + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); if (unlikely(refill_required == (rx_ring->ring_size - 1))) { rx_ring->empty_rx_queue++; @@ -4411,8 +3900,7 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) "Keep alive watchdog timeout.\n"); ena_increase_stat(&adapter->dev_stats.wd_expired, 1, &adapter->syncp); - adapter->reset_reason = ENA_REGS_RESET_KEEP_ALIVE_TO; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + ena_reset_device(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO); } } @@ -4423,8 +3911,7 @@ static void check_for_admin_com_state(struct ena_adapter *adapter) "ENA admin queue is not in running state!\n"); ena_increase_stat(&adapter->dev_stats.admin_q_pause, 1, &adapter->syncp); - adapter->reset_reason = ENA_REGS_RESET_ADMIN_TO; - set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); + ena_reset_device(adapter, ENA_REGS_RESET_ADMIN_TO); } } @@ -4574,11 +4061,9 @@ static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK) dev_features |= NETIF_F_IP_CSUM; -#ifdef NETIF_F_IPV6_CSUM if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK) dev_features |= NETIF_F_IPV6_CSUM; -#endif /* NETIF_F_IPV6_CSUM */ if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK) dev_features |= NETIF_F_TSO; @@ -4628,7 +4113,7 @@ static void ena_set_conf_feat_params(struct ena_adapter *adapter, ether_addr_copy(adapter->mac_addr, netdev->dev_addr); } else { ether_addr_copy(adapter->mac_addr, feat->dev_attr.mac_addr); - ether_addr_copy(netdev->dev_addr, adapter->mac_addr); + eth_hw_addr_set(netdev, adapter->mac_addr); } /* Set offload features */ @@ -4728,9 +4213,9 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, max_queue_ext->max_tx_sq_depth); adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queue_ext->max_per_packet_tx_descs); + max_queue_ext->max_per_packet_tx_descs); adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queue_ext->max_per_packet_rx_descs); + max_queue_ext->max_per_packet_rx_descs); } else { struct ena_admin_queue_feature_desc *max_queues = &get_feat_ctx->max_queues; @@ -4746,9 +4231,9 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, max_queues->max_sq_depth); adapter->max_tx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queues->max_packet_tx_descs); + max_queues->max_packet_tx_descs); adapter->max_rx_sgl_size = min_t(u16, ENA_PKT_MAX_BUFS, - max_queues->max_packet_rx_descs); + max_queues->max_packet_rx_descs); } max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); @@ -4758,7 +4243,7 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, * and therefore divide the queue size by 2, leaving the amount * of memory used by the queues unchanged. 
*/ - if (adapter->large_llq_header) { + if (adapter->large_llq_header_enabled) { if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { max_tx_queue_size /= 2; @@ -4767,7 +4252,8 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, } else { dev_err(&adapter->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); - adapter->large_llq_header = false; + adapter->large_llq_header_enabled = false; + ena_devlink_disable_large_llq_header_param(adapter->devlink); } } @@ -4802,6 +4288,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct ena_adapter *adapter; struct net_device *netdev; static int adapters_found; + struct devlink *devlink; u32 max_num_io_queues; bool wd_state; int bars, rc; @@ -4884,12 +4371,18 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, adapter); - adapter->large_llq_header = !!force_large_llq_header; + adapter->large_llq_header_enabled = !!force_large_llq_header; + + devlink = ena_devlink_alloc(adapter); + if (!devlink) { + netdev_err(netdev, "ena_devlink_alloc failed\n"); + goto err_netdev_destroy; + } rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); if (rc) { dev_err(&pdev->dev, "ENA LLQ bar mapping failed\n"); - goto err_netdev_destroy; + goto err_devlink_destroy; } rc = ena_device_init(adapter, pdev, &get_feat_ctx, &wd_state); @@ -4897,7 +4390,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_err(&pdev->dev, "ENA device init failed\n"); if (rc == -ETIME) rc = -EPROBE_DEFER; - goto err_netdev_destroy; + goto err_devlink_destroy; } /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. @@ -5020,6 +4513,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapters_found++; + ena_devlink_register(devlink, &pdev->dev); + return 0; err_rss: @@ -5038,6 +4533,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_device_destroy: ena_com_delete_host_info(ena_dev); ena_com_admin_destroy(ena_dev); +err_devlink_destroy: + ena_devlink_free(devlink); err_netdev_destroy: free_netdev(netdev); err_free_region: @@ -5064,17 +4561,22 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) struct ena_adapter *adapter = pci_get_drvdata(pdev); struct ena_com_dev *ena_dev; struct net_device *netdev; + struct devlink *devlink; ena_dev = adapter->ena_dev; netdev = adapter->netdev; + devlink = adapter->devlink; + ena_devlink_unregister(devlink); + ena_devlink_free(devlink); + #ifdef CONFIG_RFS_ACCEL if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { free_irq_cpu_rmap(netdev->rx_cpu_rmap); netdev->rx_cpu_rmap = NULL; } -#endif /* CONFIG_RFS_ACCEL */ +#endif /* CONFIG_RFS_ACCEL */ /* Make sure timer and reset routine won't be called after * freeing device resources. 
*/ diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index bdc8f9f07c79c..b6a2332d986bb 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -25,7 +25,7 @@ #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 6 +#define DRV_MODULE_GEN_MINOR 7 #define DRV_MODULE_GEN_SUBMINOR 1 #define DRV_MODULE_NAME "ena" @@ -62,6 +62,8 @@ #define ENA_DEFAULT_RING_SIZE (1024) #define ENA_MIN_RING_SIZE (256) +#define ENA_MIN_RX_BUF_SIZE (2048) + #define ENA_MIN_NUM_IO_QUEUES (1) #define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2) @@ -118,28 +120,6 @@ #define ENA_MMIO_DISABLE_REG_READ BIT(0) -/* The max MTU size is configured to be the ethernet frame size without - * the overhead of the ethernet header, which can have a VLAN header, and - * a frame check sequence (FCS). - * The buffer size we share with the device is defined to be ENA_PAGE_SIZE - */ - -#ifdef ENA_XDP_SUPPORT -#ifdef XDP_HAS_FRAME_SZ -#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ - VLAN_HLEN - XDP_PACKET_HEADROOM - \ - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -#else -#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ - VLAN_HLEN - XDP_PACKET_HEADROOM) -#endif - -#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ - ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) -#else -#define ENA_IS_XDP_INDEX(adapter, index) (false) -#endif /* ENA_XDP_SUPPORT */ - struct ena_page_cache; struct ena_irq { @@ -157,15 +137,20 @@ struct ena_napi { struct napi_struct napi; struct ena_ring *tx_ring; struct ena_ring *rx_ring; -#ifdef ENA_XDP_SUPPORT - struct ena_ring *xdp_ring; -#endif /* ENA_XDP_SUPPORT */ u32 qid; struct dim dim; }; struct ena_tx_buffer { - struct sk_buff *skb; + union { + struct sk_buff *skb; +#ifdef ENA_XDP_SUPPORT + /* XDP buffer structure which is used for sending packets in + * the xdp queues + */ + struct xdp_frame *xdpf; +#endif /* ENA_XDP_SUPPORT */ + }; /* num of ena desc for this specific skb * (includes data desc and metadata desc) */ @@ -173,18 +158,14 @@ struct ena_tx_buffer { /* num of buffers used by this skb */ u32 num_of_bufs; -#ifdef ENA_XDP_SUPPORT - /* XDP buffer structure which is used for sending packets in - * the xdp queues - */ - struct xdp_frame *xdpf; -#endif /* ENA_XDP_SUPPORT */ + /* Total size of all buffers */ + u32 total_tx_size; /* Indicate if bufs[0] map the linear data of the skb. 
*/ u8 map_linear_data; /* Used for detect missing tx packets to limit the number of prints */ - u32 print_once; + u8 print_once; /* Save the last jiffies to detect missing tx packets * * sets to non zero value on ena_start_xmit and set to zero on @@ -200,9 +181,18 @@ struct ena_tx_buffer { struct ena_rx_buffer { struct sk_buff *skb; - struct page *page; - dma_addr_t dma_addr; + union { + struct { + struct page *page; + dma_addr_t dma_addr; + }; +#ifdef ENA_XDP_SUPPORT + /* XSK pool buffer */ + struct xdp_buff *xdp; +#endif + }; u32 page_offset; + u32 buf_offset; struct ena_com_buf ena_buf; bool is_lpc_page; } ____cacheline_aligned; @@ -224,6 +214,10 @@ struct ena_stats_tx { u64 missed_tx; u64 unmask_interrupt; u64 last_napi_jiffies; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 xsk_wakeup_request; +#endif /* ENA_AF_XDP_SUPPORT */ }; struct ena_stats_rx { @@ -232,7 +226,7 @@ struct ena_stats_rx { u64 rx_copybreak_pkt; u64 csum_good; u64 refil_partial; - u64 bad_csum; + u64 csum_bad; u64 page_alloc_fail; u64 skb_alloc_fail; u64 dma_mapping_err; @@ -256,6 +250,10 @@ struct ena_stats_rx { u64 lpc_warm_up; u64 lpc_full; u64 lpc_wrong_numa; +#ifdef ENA_AF_XDP_SUPPORT + u64 xsk_need_wakeup_set; + u64 zc_queue_pkt_copy; +#endif /* ENA_AF_XDP_SUPPORT */ }; struct ena_ring { @@ -287,7 +285,10 @@ struct ena_ring { * which traffic should be redirected from this rx ring. */ struct ena_ring *xdp_ring; -#endif +#ifdef ENA_AF_XDP_SUPPORT + struct xsk_buff_pool *xsk_pool; +#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_SUPPORT */ u16 next_to_use; u16 next_to_clean; @@ -304,9 +305,11 @@ struct ena_ring { bool disable_meta_caching; u16 no_interrupt_event_cnt; - /* cpu for TPH */ + /* cpu and NUMA for TPH */ int cpu; - /* number of tx/rx_buffer_info's entries */ + int numa_node; + + /* number of tx/rx_buffer_info's entries */ int ring_size; enum ena_admin_placement_policy_type tx_mem_queue_type; @@ -364,7 +367,9 @@ struct ena_adapter { struct net_device *netdev; struct pci_dev *pdev; - /* rx packets that shorter that this len will be copied to the skb + struct devlink *devlink; + + /* rx packets that are shorter than this len will be copied to the skb * header */ u32 rx_copybreak; @@ -391,7 +396,13 @@ struct ena_adapter { u32 msg_enable; - bool large_llq_header; + /* The flag is used for two purposes: + * 1. Indicates that large LLQ has been requested. + * 2. Indicates whether large LLQ is set or not after device + * initialization / configuration. 
+ */ + bool large_llq_header_enabled; + bool large_llq_header_supported; u16 max_tx_sgl_size; u16 max_rx_sgl_size; @@ -528,42 +539,15 @@ static inline bool ena_bp_disable(struct ena_ring *rx_ring) } #endif /* ENA_BUSY_POLL_SUPPORT */ -#ifdef ENA_XDP_SUPPORT -enum ena_xdp_errors_t { - ENA_XDP_ALLOWED = 0, - ENA_XDP_CURRENT_MTU_TOO_LARGE, - ENA_XDP_NO_ENOUGH_QUEUES, -}; - -static inline bool ena_xdp_present(struct ena_adapter *adapter) -{ - return !!adapter->xdp_bpf_prog; -} - -static inline bool ena_xdp_present_ring(struct ena_ring *ring) -{ - return !!ring->xdp_bpf_prog; -} - -static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, - u32 queues) +static inline void ena_reset_device(struct ena_adapter *adapter, + enum ena_regs_reset_reason_types reset_reason) { - return 2 * queues <= adapter->max_num_io_queues; + adapter->reset_reason = reset_reason; + /* Make sure reset reason is set before triggering the reset */ + smp_mb__before_atomic(); + set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); } -static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) -{ - enum ena_xdp_errors_t rc = ENA_XDP_ALLOWED; - - if (adapter->netdev->mtu > ENA_XDP_MAX_MTU) - rc = ENA_XDP_CURRENT_MTU_TOO_LARGE; - else if (!ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) - rc = ENA_XDP_NO_ENOUGH_QUEUES; - - return rc; -} -#endif /* ENA_XDP_SUPPORT */ - /* Allocate a page and DMA map it * @rx_ring: The IO queue pair which requests the allocation * @@ -572,4 +556,51 @@ static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) */ struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma); +void ena_destroy_device(struct ena_adapter *adapter, bool graceful); +int ena_restore_device(struct ena_adapter *adapter); +int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, + struct ena_tx_buffer *tx_info, bool is_xdp); + +/* Increase a stat by cnt while holding syncp seqlock on 32bit machines */ +static inline void ena_increase_stat(u64 *statp, u64 cnt, + struct u64_stats_sync *syncp) +{ + u64_stats_update_begin(syncp); + (*statp) += cnt; + u64_stats_update_end(syncp); +} + +static inline void ena_ring_tx_doorbell(struct ena_ring *tx_ring) +{ + ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq); + ena_increase_stat(&tx_ring->tx_stats.doorbells, 1, &tx_ring->syncp); +} + +int ena_xmit_common(struct ena_adapter *adapter, + struct ena_ring *ring, + struct ena_tx_buffer *tx_info, + struct ena_com_tx_ctx *ena_tx_ctx, + u16 next_to_use, + u32 bytes); +void ena_unmap_tx_buff(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info); +void ena_init_io_rings(struct ena_adapter *adapter, + int first_index, int count); +int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, + int first_index, int count); +int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count); +void ena_free_all_io_tx_resources(struct ena_adapter *adapter); +void ena_down(struct ena_adapter *adapter); +int ena_up(struct ena_adapter *adapter); +void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring); +void ena_update_ring_numa_node(struct ena_ring *tx_ring, + struct ena_ring *rx_ring); +void ena_rx_checksum(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb); +void ena_set_rx_hash(struct ena_ring *rx_ring, + struct ena_com_rx_ctx *ena_rx_ctx, + struct sk_buff *skb); +int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num); #endif /* !(ENA_H) */ diff --git 
a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c new file mode 100644 index 0000000000000..d06c0f50998af --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -0,0 +1,978 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "ena_xdp.h" +#ifdef ENA_XDP_SUPPORT + +static int validate_xdp_req_id(struct ena_ring *tx_ring, u16 req_id) +{ + struct ena_tx_buffer *tx_info; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + if (likely(tx_info->total_tx_size)) + return 0; + + return handle_invalid_req_id(tx_ring, req_id, tx_info, true); +} + +static int ena_xdp_tx_map_frame(struct ena_ring *tx_ring, + struct ena_tx_buffer *tx_info, + struct xdp_frame *xdpf, + struct ena_com_tx_ctx *ena_tx_ctx) +{ + struct ena_adapter *adapter = tx_ring->adapter; + struct ena_com_buf *ena_buf; + int push_len = 0; + dma_addr_t dma; + void *data; + u32 size; + + tx_info->xdpf = xdpf; + data = tx_info->xdpf->data; + size = tx_info->xdpf->len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + + ena_tx_ctx->push_header = data; + + size -= push_len; + data += push_len; + } + + ena_tx_ctx->header_len = push_len; + + if (size > 0) { + dma = dma_map_single(tx_ring->dev, + data, + size, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) + goto error_report_dma_error; + + tx_info->map_linear_data = 0; + + ena_buf = tx_info->bufs; + ena_buf->paddr = dma; + ena_buf->len = size; + + ena_tx_ctx->ena_bufs = ena_buf; + ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; + } + + return 0; + +error_report_dma_error: + ena_increase_stat(&tx_ring->tx_stats.dma_mapping_err, 1, + &tx_ring->syncp); + netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); + + return -EINVAL; +} + +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf, + int flags) +{ + struct ena_com_tx_ctx ena_tx_ctx = {}; + struct ena_tx_buffer *tx_info; + u16 next_to_use, req_id; + int rc; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + tx_info->num_of_bufs = 0; + + rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); + if (unlikely(rc)) + return rc; + + ena_tx_ctx.req_id = req_id; + + rc = ena_xmit_common(adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + xdpf->len); + if (rc) + goto error_unmap_dma; + + /* trigger the dma engine. ena_ring_tx_doorbell() + * calls a memory barrier inside it. 
+ */ + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(tx_ring); + + return rc; + +error_unmap_dma: + ena_unmap_tx_buff(tx_ring, tx_info); + tx_info->xdpf = NULL; + return rc; +} + +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(dev); + struct ena_ring *tx_ring; + int qid, i, nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + /* We assume that all rings have the same XDP program */ + if (!READ_ONCE(adapter->rx_ring->xdp_bpf_prog)) + return -ENXIO; + + qid = smp_processor_id() % adapter->xdp_num_queues; + qid += adapter->xdp_first_ring; + tx_ring = &adapter->tx_ring[qid]; + + /* Other CPU ids might try to send thorugh this queue */ + spin_lock(&tx_ring->xdp_tx_lock); + + for (i = 0; i < n; i++) { + if (ena_xdp_xmit_frame(tx_ring, adapter, frames[i], 0)) + break; + nxmit++; + } + + /* Ring doorbell to make device aware of the packets */ + if (flags & XDP_XMIT_FLUSH) + ena_ring_tx_doorbell(tx_ring); + + spin_unlock(&tx_ring->xdp_tx_lock); + +#ifndef ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY + for (i = nxmit; unlikely(i < n); i++) + xdp_return_frame(frames[i]); + +#endif + /* Return number of packets sent */ + return nxmit; +} + +static void ena_init_all_xdp_queues(struct ena_adapter *adapter) +{ + adapter->xdp_first_ring = adapter->num_io_queues; + adapter->xdp_num_queues = adapter->num_io_queues; + + ena_init_io_rings(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); +} + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) +{ + int rc = 0; + + rc = ena_setup_tx_resources_in_range(adapter, adapter->xdp_first_ring, + adapter->xdp_num_queues); + if (rc) + goto setup_err; + + rc = ena_create_io_tx_queues_in_range(adapter, + adapter->xdp_first_ring, + adapter->xdp_num_queues); + if (rc) + goto create_err; + + return 0; + +create_err: + ena_free_all_io_tx_resources(adapter); +setup_err: + return rc; +} + +/* Provides a way for both kernel and bpf-prog to know + * more about the RX-queue a given XDP frame arrived on. + */ +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + int rc; + +#ifdef AF_XDP_BUSY_POLL_SUPPORTED + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, + rx_ring->napi->napi_id < 0); +#else + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); +#endif + + netif_dbg(rx_ring->adapter, ifup, rx_ring->netdev, "Registering RX info for queue %d", + rx_ring->qid); + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info. RX queue num %d rc: %d\n", + rx_ring->qid, rc); + goto err; + } + + if (ENA_IS_XSK_RING(rx_ring)) { + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); + xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq); + } else { + rc = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, + NULL); + } + + if (rc) { + netif_err(rx_ring->adapter, ifup, rx_ring->netdev, + "Failed to register xdp rx queue info memory model. 
RX queue num %d rc: %d\n", + rx_ring->qid, rc); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } + +err: + return rc; +} + +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int i, xsk_frames = 0; + + for (i = 0; i < tx_ring->ring_size; i++) { + struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; + + if (tx_info->last_jiffies) + xsk_frames++; + + tx_info->last_jiffies = 0; + } + + if (xsk_frames) + xsk_tx_completed(xsk_pool, xsk_frames); +} + +void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) +{ + struct ena_ring *rx_ring = &adapter->rx_ring[qid]; + int i = 0; + + for (i = 0; i < rx_ring->ring_size; i++) { + struct ena_rx_buffer *rx_info = &rx_ring->rx_buffer_info[i]; + + if (rx_info->xdp) + xsk_buff_free(rx_info->xdp); + + rx_info->xdp = NULL; + } +} + +#endif /* ENA_AF_XDP_SUPPORT */ +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) +{ + netif_dbg(rx_ring->adapter, ifdown, rx_ring->netdev, + "Unregistering RX info for queue %d", + rx_ring->qid); + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp_rxq); + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +} + +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count) +{ + struct bpf_prog *old_bpf_prog; + struct ena_ring *rx_ring; + int i = 0; + + for (i = first; i < count; i++) { + rx_ring = &adapter->rx_ring[i]; + old_bpf_prog = xchg(&rx_ring->xdp_bpf_prog, prog); + + if (!old_bpf_prog && prog) { + rx_ring->rx_headroom = XDP_PACKET_HEADROOM; + } else if (old_bpf_prog && !prog) { + rx_ring->rx_headroom = NET_SKB_PAD; + } + } +} + +static void ena_xdp_exchange_program(struct ena_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_bpf_prog = xchg(&adapter->xdp_bpf_prog, prog); + + ena_xdp_exchange_program_rx_in_range(adapter, + prog, + 0, + adapter->num_io_queues); + + if (old_bpf_prog) + bpf_prog_put(old_bpf_prog); +} + +static int ena_destroy_and_free_all_xdp_queues(struct ena_adapter *adapter) +{ + bool was_up; + int rc; + + was_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + + if (was_up) + ena_down(adapter); + + adapter->xdp_first_ring = 0; + adapter->xdp_num_queues = 0; + ena_xdp_exchange_program(adapter, NULL); + if (was_up) { + rc = ena_up(adapter); + if (rc) + return rc; + } + return 0; +} + +static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct bpf_prog *prog = bpf->prog; + struct bpf_prog *old_bpf_prog; + int rc, prev_mtu; + bool is_up; + + is_up = test_bit(ENA_FLAG_DEV_UP, &adapter->flags); + rc = ena_xdp_allowed(adapter); + if (rc == ENA_XDP_ALLOWED) { + old_bpf_prog = adapter->xdp_bpf_prog; + if (prog) { + if (!is_up) { + ena_init_all_xdp_queues(adapter); + } else if (!old_bpf_prog) { + ena_down(adapter); + ena_init_all_xdp_queues(adapter); + } + ena_xdp_exchange_program(adapter, prog); + + netif_dbg(adapter, drv, adapter->netdev, "Set a new XDP program\n"); + + if (is_up && !old_bpf_prog) { + rc = ena_up(adapter); + if (rc) + return rc; + } + } else if (old_bpf_prog) { + netif_dbg(adapter, drv, adapter->netdev, + "Removing XDP program\n"); + + rc = ena_destroy_and_free_all_xdp_queues(adapter); + if (rc) + return rc; + } + + prev_mtu = netdev->max_mtu; + netdev->max_mtu = prog ? 
ENA_XDP_MAX_MTU : adapter->max_mtu; + + if (!old_bpf_prog) + netif_info(adapter, drv, adapter->netdev, + "XDP program is set, changing the max_mtu from %d to %d", + prev_mtu, netdev->max_mtu); + + } else if (rc == ENA_XDP_CURRENT_MTU_TOO_LARGE) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the current MTU (%d) is larger than the maximum allowed MTU (%lu) while xdp is on", + netdev->mtu, ENA_XDP_MAX_MTU); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, the current MTU is larger than the maximum allowed MTU. Check the dmesg for more info"); + return -EINVAL; + } else if (rc == ENA_XDP_NO_ENOUGH_QUEUES) { + netif_err(adapter, drv, adapter->netdev, + "Failed to set xdp program, the Rx/Tx channel count should be at most half of the maximum allowed channel count. The current queue count (%d), the maximal queue count (%d)\n", + adapter->num_io_queues, adapter->max_num_io_queues); + NL_SET_ERR_MSG_MOD(bpf->extack, + "Failed to set xdp program, there is no enough space for allocating XDP queues, Check the dmesg for more info"); + return -EINVAL; + } + + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_is_xsk_pool_params_allowed(struct xsk_buff_pool *pool) +{ + return xsk_pool_get_headroom(pool) == 0 && + xsk_pool_get_chunk_size(pool) == ENA_PAGE_SIZE; +} + +static int ena_xsk_pool_enable(struct ena_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct ena_ring *rx_ring, *tx_ring; + bool dev_was_up = false; + int err; + + if (!ena_xdp_legal_queue_count(adapter, qid)) { + netdev_err(adapter->netdev, + "Max qid for XSK pool is %d (received %d)\n", + adapter->max_num_io_queues >> 1, qid); + return -EINVAL; + } + + if (ena_is_xsk_pool_params_allowed(pool)) + return -EINVAL; + + rx_ring = &adapter->rx_ring[qid]; + tx_ring = &adapter->tx_ring[qid]; + + err = xsk_pool_dma_map(pool, adapter->ena_dev->dmadev, 0); + if (err) { + ena_increase_stat(&rx_ring->rx_stats.dma_mapping_err, 1, + &rx_ring->syncp); + netif_err(adapter, drv, adapter->netdev, + "Failed to DMA map XSK pool for qid %d\n", qid); + return err; + } + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + dev_was_up = true; + ena_down(adapter); + } + + rx_ring->xsk_pool = tx_ring->xsk_pool = pool; + + netif_dbg(adapter, drv, adapter->netdev, + "Setting XSK pool for queue %d\n", qid); + + return dev_was_up ? ena_up(adapter) : 0; +} + +static int ena_xsk_pool_disable(struct ena_adapter *adapter, + u16 qid) +{ + struct ena_ring *rx_ring, *tx_ring; + bool dev_was_up = false; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + rx_ring = &adapter->rx_ring[qid]; + tx_ring = &adapter->tx_ring[qid]; + + /* XSK pool isn't attached to this ring */ + if (!rx_ring->xsk_pool) + return 0; + + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) { + dev_was_up = true; + ena_down(adapter); + } + + xsk_pool_dma_unmap(rx_ring->xsk_pool, 0); + + rx_ring->xsk_pool = tx_ring->xsk_pool = NULL; + + netif_dbg(adapter, drv, adapter->netdev, + "Removing XSK pool for queue %d\n", qid); + + return dev_was_up ? ena_up(adapter) : 0; +} + +static int ena_xsk_pool_setup(struct ena_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + return pool ? ena_xsk_pool_enable(adapter, pool, qid) : + ena_xsk_pool_disable(adapter, qid); +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the main xdp callback, it's used by the kernel to set/unset the xdp + * program as well as to query the current xdp program id. 
+ */ +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) +{ +#if !defined(ENA_XDP_QUERY_IN_KERNEL) || defined(ENA_AF_XDP_SUPPORT) + struct ena_adapter *adapter = netdev_priv(netdev); + +#endif /* ENA_XDP_QUERY_IN_KERNEL || ENA_AF_XDP_SUPPORT */ + switch (bpf->command) { + case XDP_SETUP_PROG: + return ena_xdp_set(netdev, bpf); +#ifdef ENA_AF_XDP_SUPPORT + case XDP_SETUP_XSK_POOL: + return ena_xsk_pool_setup(adapter, bpf->xsk.pool, bpf->xsk.queue_id); +#endif /* ENA_AF_XDP_SUPPORT */ +#ifndef ENA_XDP_QUERY_IN_KERNEL + case XDP_QUERY_PROG: + bpf->prog_id = adapter->xdp_bpf_prog ? + adapter->xdp_bpf_prog->aux->id : 0; + break; +#endif + default: + return -EINVAL; + } + return 0; +} + +#ifdef ENA_AF_XDP_SUPPORT +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + struct ena_ring *tx_ring; + struct napi_struct *napi; + + if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + return -ENETDOWN; + + if (qid >= adapter->num_io_queues) + return -EINVAL; + + if (!adapter->xdp_bpf_prog) + return -ENXIO; + + tx_ring = &adapter->tx_ring[qid]; + + if (!ENA_IS_XSK_RING(tx_ring)) + return -ENXIO; + + ena_increase_stat(&tx_ring->tx_stats.xsk_wakeup_request, 1, + &tx_ring->syncp); + + napi = tx_ring->napi; + + napi_schedule(napi); + + return 0; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) +{ + + bool is_zc_q = ENA_IS_XSK_RING(tx_ring); + u32 total_done = 0; + u16 next_to_clean; + bool needs_wakeup; + u32 tx_bytes = 0; + int tx_pkts = 0; + u16 req_id; + int rc; + + if (unlikely(!tx_ring)) + return 0; + next_to_clean = tx_ring->next_to_clean; + + while (tx_pkts < budget) { + struct ena_tx_buffer *tx_info; + struct xdp_frame *xdpf; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, + &req_id); + if (rc) { + if (unlikely(rc == -EINVAL)) + handle_invalid_req_id(tx_ring, req_id, NULL, + true); + break; + } + + /* validate that the request id points to a valid xdp_frame */ + rc = validate_xdp_req_id(tx_ring, req_id); + if (rc) + break; + + tx_info = &tx_ring->tx_buffer_info[req_id]; + + tx_info->last_jiffies = 0; + + if (!is_zc_q) { + xdpf = tx_info->xdpf; + tx_info->xdpf = NULL; + ena_unmap_tx_buff(tx_ring, tx_info); + xdp_return_frame(xdpf); + } + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d pkt #%d req_id %d\n", tx_ring->qid, tx_pkts, req_id); + + tx_bytes += tx_info->total_tx_size; + tx_pkts++; + total_done += tx_info->tx_descs; + + tx_info->total_tx_size = 0; + + tx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = ENA_TX_RING_IDX_NEXT(next_to_clean, + tx_ring->ring_size); + } + + tx_ring->next_to_clean = next_to_clean; + ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); + ena_com_update_dev_comp_head(tx_ring->ena_com_io_cq); + + netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, + "tx_poll: q %d done. 
total pkts: %d\n", + tx_ring->qid, tx_pkts); + + needs_wakeup = tx_pkts < budget; +#ifdef ENA_AF_XDP_SUPPORT + if (is_zc_q) { + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + + if (tx_pkts) + xsk_tx_completed(xsk_pool, tx_pkts); + + if (xsk_uses_need_wakeup(xsk_pool)) { + if (needs_wakeup) + xsk_set_tx_need_wakeup(xsk_pool); + else + xsk_clear_tx_need_wakeup(xsk_pool); + } + } +#endif /* ENA_AF_XDP_SUPPORT */ + + return needs_wakeup; +} + +#ifdef ENA_AF_XDP_SUPPORT +static bool ena_xdp_xmit_irq_zc(struct ena_ring *tx_ring, + struct napi_struct *napi, + int budget) +{ + struct xsk_buff_pool *xsk_pool = tx_ring->xsk_pool; + int size, rc, push_len = 0, work_done = 0; + struct ena_tx_buffer *tx_info; + struct ena_com_buf *ena_buf; + u16 next_to_use, req_id; + bool need_wakeup = true; + struct xdp_desc desc; + dma_addr_t dma; + + while (likely(work_done < budget)) { + struct ena_com_tx_ctx ena_tx_ctx = {}; + + /* We assume the maximum number of descriptors, which is two + * (meta data included) + */ + if (unlikely(!ena_com_sq_have_enough_space(tx_ring->ena_com_io_sq, 2))) + break; + + if (!xsk_tx_peek_desc(xsk_pool, &desc)) + break; + + next_to_use = tx_ring->next_to_use; + req_id = tx_ring->free_ids[next_to_use]; + tx_info = &tx_ring->tx_buffer_info[req_id]; + + size = desc.len; + + if (tx_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, tx_ring->tx_max_header_size); + ena_tx_ctx.push_header = xsk_buff_raw_get_data(xsk_pool, desc.addr); + ena_tx_ctx.header_len = push_len; + + size -= push_len; + if (!size) + goto xmit_desc; + } + + /* Pass the rest of the descriptor as a DMA address. Assuming + * single page descriptor. + */ + dma = xsk_buff_raw_get_dma(xsk_pool, desc.addr); + ena_buf = tx_info->bufs; + ena_buf->paddr = dma + push_len; + ena_buf->len = size; + + ena_tx_ctx.ena_bufs = ena_buf; + ena_tx_ctx.num_bufs = 1; + +xmit_desc: + ena_tx_ctx.req_id = req_id; + + netif_dbg(tx_ring->adapter, tx_queued, tx_ring->netdev, + "Queueing zc packet on q %d, %s DMA part (req-id %d)\n", + tx_ring->qid, ena_tx_ctx.num_bufs ? 
"with" : "without", req_id); + + rc = ena_xmit_common(tx_ring->adapter, + tx_ring, + tx_info, + &ena_tx_ctx, + next_to_use, + desc.len); + if (rc) + break; + + work_done++; + } + + if (work_done) { + xsk_tx_release(xsk_pool); + ena_ring_tx_doorbell(tx_ring); + } + + if (work_done == budget) { + need_wakeup = false; + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_clear_tx_need_wakeup(xsk_pool); + } + + return need_wakeup; +} + +static struct sk_buff *ena_xdp_rx_skb_zc(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 headroom, data_len; + struct sk_buff *skb; + void *data_addr; + + /* Assuming single-page packets for XDP */ + headroom = xdp->data - xdp->data_hard_start; + data_len = xdp->data_end - xdp->data; + data_addr = xdp->data; + + /* allocate a skb to store the frags */ + skb = __napi_alloc_skb(rx_ring->napi, + headroom + data_len, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) { + ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, + &rx_ring->syncp); + netif_err(rx_ring->adapter, rx_err, rx_ring->netdev, + "Failed to allocate skb in zc queue %d\n", rx_ring->qid); + return NULL; + } + + skb_reserve(skb, headroom); + memcpy(__skb_put(skb, data_len), data_addr, data_len); + + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + + return skb; +} + +static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, + struct napi_struct *napi, + int budget) +{ + int i, refill_required, work_done, refill_threshold, pkt_copy; + u16 next_to_clean = rx_ring->next_to_clean; + int xdp_verdict, req_id, rc, total_len; + struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; + bool xdp_prog_present; + struct xdp_buff *xdp; + struct sk_buff *skb; + u32 xdp_flags = 0; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "%s qid %d\n", __func__, rx_ring->qid); + + ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; + ena_rx_ctx.max_bufs = rx_ring->sgl_size; + + xdp_prog_present = ena_xdp_present_ring(rx_ring); + + work_done = 0; + total_len = 0; + pkt_copy = 0; + + do { + xdp_verdict = ENA_XDP_PASS; + + /* Poll a packet from HW */ + rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, + rx_ring->ena_com_io_sq, + &ena_rx_ctx); + if (unlikely(rc)) + break; + + /* Polled all RX packets */ + if (unlikely(ena_rx_ctx.descs == 0)) + break; + + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, + "rx_poll: q %d got packet from ena. 
descs #: %d l3 proto %d l4 proto %d hash: %x\n", + rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, + ena_rx_ctx.l4_proto, ena_rx_ctx.hash); + + /* First descriptor might have an offset set by the device */ + rx_info = &rx_ring->rx_buffer_info[ena_rx_ctx.ena_bufs[0].req_id]; + xdp = rx_info->xdp; + xdp->data += ena_rx_ctx.pkt_offset; + xdp->data_end = xdp->data + ena_rx_ctx.ena_bufs[0].len; + xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + + /* Don't process several descriptors, not blocked by HW + * (regardless of MTU) + */ + if (unlikely(ena_rx_ctx.descs > 1)) { + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + xdp_verdict = ENA_XDP_DROP; + goto skip_xdp_prog; + } + + if (likely(xdp_prog_present)) + xdp_verdict = ena_xdp_execute(rx_ring, xdp); + +skip_xdp_prog: + /* Note that there can be several descriptors, since device + * might not honor MTU + */ + for (i = 0; i < ena_rx_ctx.descs; i++) { + req_id = rx_ring->ena_bufs[i].req_id; + rx_ring->free_ids[next_to_clean] = req_id; + next_to_clean = + ENA_RX_RING_IDX_NEXT(next_to_clean, + rx_ring->ring_size); + } + + if (likely(xdp_verdict)) { + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + xdp_flags |= xdp_verdict; + + /* Mark buffer as consumed when it is redirected */ + if (likely(xdp_verdict & ENA_XDP_FORWARDED)) + rx_info->xdp = NULL; + + continue; + } + + /* XDP PASS */ + skb = ena_xdp_rx_skb_zc(rx_ring, xdp); + if (unlikely(!skb)) { + rc = -ENOMEM; + break; + } + + pkt_copy++; + work_done++; + total_len += ena_rx_ctx.ena_bufs[0].len; + ena_rx_checksum(rx_ring, &ena_rx_ctx, skb); + ena_set_rx_hash(rx_ring, &ena_rx_ctx, skb); + skb_record_rx_queue(skb, rx_ring->qid); + napi_gro_receive(napi, skb); + + } while (likely(work_done <= budget)); + + rx_ring->per_napi_packets += work_done; + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->rx_stats.bytes += total_len; + rx_ring->rx_stats.cnt += work_done; + rx_ring->rx_stats.zc_queue_pkt_copy += pkt_copy; + u64_stats_update_end(&rx_ring->syncp); + + rx_ring->next_to_clean = next_to_clean; + + if (xdp_flags & ENA_XDP_REDIRECT) + xdp_do_flush_map(); + + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); + refill_threshold = + min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, + ENA_RX_REFILL_THRESH_PACKET); + /* Optimization, try to batch new rx buffers */ + if (refill_required > refill_threshold) { + ena_com_update_dev_comp_head(rx_ring->ena_com_io_cq); + ena_refill_rx_bufs(rx_ring, refill_required); + } + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (likely(rc || work_done < budget)) { + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + ena_increase_stat(&rx_ring->rx_stats.xsk_need_wakeup_set, 1, + &rx_ring->syncp); + } else { + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + } + } + + if (unlikely(rc)) { + struct ena_adapter *adapter = netdev_priv(rx_ring->netdev); + + if (rc == -ENOSPC) { + ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, + &rx_ring->syncp); + ena_reset_device(adapter, + ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EIO) { + ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, + &rx_ring->syncp); + ena_reset_device(adapter, ENA_REGS_RESET_INV_RX_REQ_ID); + } + + return 0; + } + + return work_done; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +/* This is the XDP napi callback. XDP queues use a separate napi callback + * than Rx/Tx queues. 
+ */ +int ena_xdp_io_poll(struct napi_struct *napi, int budget) +{ + struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi); + struct ena_ring *rx_ring, *tx_ring; + bool needs_wakeup = true; + u32 rx_work_done = 0; + int ret; + + rx_ring = ena_napi->rx_ring; + tx_ring = ena_napi->tx_ring; + + if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) || + test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) { + napi_complete_done(napi, 0); + return 0; + } + + needs_wakeup &= ena_clean_xdp_irq(tx_ring, budget); + +#ifdef ENA_AF_XDP_SUPPORT + if (!ENA_IS_XSK_RING(tx_ring)) + goto polling_done; + + needs_wakeup &= ena_xdp_xmit_irq_zc(tx_ring, napi, budget); + + rx_work_done = ena_xdp_clean_rx_irq_zc(rx_ring, napi, budget); + needs_wakeup &= rx_work_done < budget; + +polling_done: +#endif /* ENA_AF_XDP_SUPPORT */ + /* If the device is about to reset or down, avoid unmask + * the interrupt and return 0 so NAPI won't reschedule + */ + if (unlikely(!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags))) { + napi_complete_done(napi, 0); + ret = 0; + } else if (needs_wakeup) { + ena_increase_stat(&tx_ring->tx_stats.napi_comp, 1, + &tx_ring->syncp); + if (napi_complete_done(napi, rx_work_done) && + READ_ONCE(ena_napi->interrupts_masked)) { + smp_rmb(); /* make sure interrupts_masked is read */ + WRITE_ONCE(ena_napi->interrupts_masked, false); + ena_unmask_interrupt(tx_ring, NULL); + } + + ena_update_ring_numa_node(tx_ring, NULL); + ret = rx_work_done; + } else { + ret = budget; + } + + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->tx_stats.tx_poll++; + u64_stats_update_end(&tx_ring->syncp); + tx_ring->tx_stats.last_napi_jiffies = jiffies; + + return ret; +} +#endif /* ENA_XDP_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h new file mode 100644 index 0000000000000..b15d9cb0d25f1 --- /dev/null +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef ENA_XDP_H +#define ENA_XDP_H + +#include "ena_netdev.h" +#ifdef ENA_XDP_SUPPORT +#include +#ifdef ENA_AF_XDP_SUPPORT +#include +#endif /* ENA_AF_XDP_SUPPORT */ + +#ifdef ENA_AF_XDP_SUPPORT +#define ENA_IS_XSK_RING(ring) (!!(ring)->xsk_pool) +#endif /* ENA_AF_XDP_SUPPORT */ + +/* The max MTU size is configured to be the ethernet frame size without + * the overhead of the ethernet header, which can have a VLAN header, and + * a frame check sequence (FCS). 
+ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE + */ +#ifdef XDP_HAS_FRAME_SZ +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#else +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM) +#endif + +#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ + ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) + +enum ENA_XDP_ACTIONS { + ENA_XDP_PASS = 0, + ENA_XDP_TX = BIT(0), + ENA_XDP_REDIRECT = BIT(1), + ENA_XDP_DROP = BIT(2) +}; +#define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) + +int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); +void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, + struct bpf_prog *prog, + int first, int count); +int ena_xdp_io_poll(struct napi_struct *napi, int budget); +int ena_xdp_xmit_frame(struct ena_ring *tx_ring, + struct ena_adapter *adapter, + struct xdp_frame *xdpf, + int flags); +int ena_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags); +int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf); +int ena_xdp_register_rxq_info(struct ena_ring *rx_ring); +void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring); +#ifdef ENA_AF_XDP_SUPPORT +void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring); +void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid); +int ena_xdp_xsk_wakeup(struct net_device *netdev, u32 qid, u32 flags); +#endif + +enum ena_xdp_errors_t { + ENA_XDP_ALLOWED = 0, + ENA_XDP_CURRENT_MTU_TOO_LARGE, + ENA_XDP_NO_ENOUGH_QUEUES, +}; + +static inline bool ena_xdp_present(struct ena_adapter *adapter) +{ + return !!adapter->xdp_bpf_prog; +} + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return !!ring->xdp_bpf_prog; +} + +static inline bool ena_xdp_legal_queue_count(struct ena_adapter *adapter, + u32 queues) +{ + return 2 * queues <= adapter->max_num_io_queues; +} + +static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) +{ + enum ena_xdp_errors_t rc = ENA_XDP_ALLOWED; + + if (adapter->netdev->mtu > ENA_XDP_MAX_MTU) + rc = ENA_XDP_CURRENT_MTU_TOO_LARGE; + else if (!ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + rc = ENA_XDP_NO_ENOUGH_QUEUES; + + return rc; +} + +#ifdef ENA_AF_XDP_SUPPORT +static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_io_queues; i++) + if (ENA_IS_XSK_RING(&adapter->rx_ring[i])) + return true; + + return false; +} + +#endif /* ENA_AF_XDP_SUPPORT */ +static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) +{ + u32 verdict = ENA_XDP_PASS; + struct bpf_prog *xdp_prog; + struct ena_ring *xdp_ring; + struct xdp_frame *xdpf; + u64 *xdp_stat; + + xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); + + verdict = bpf_prog_run_xdp(xdp_prog, xdp); + + switch (verdict) { + case XDP_TX: +#ifdef XDP_CONVERT_TO_FRAME_NAME_CHANGED + xdpf = xdp_convert_buff_to_frame(xdp); +#else + xdpf = convert_to_xdp_frame(xdp); +#endif + if (unlikely(!xdpf)) { + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + } + + /* Find xmit queue */ + xdp_ring = rx_ring->xdp_ring; + + /* The XDP queues are shared between XDP_TX and XDP_REDIRECT */ + spin_lock(&xdp_ring->xdp_tx_lock); + + if (ena_xdp_xmit_frame(xdp_ring, 
rx_ring->adapter, xdpf, + XDP_XMIT_FLUSH)) + xdp_return_frame(xdpf); + + spin_unlock(&xdp_ring->xdp_tx_lock); + xdp_stat = &rx_ring->rx_stats.xdp_tx; + verdict = ENA_XDP_TX; + break; + case XDP_REDIRECT: + if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) { + xdp_stat = &rx_ring->rx_stats.xdp_redirect; + verdict = ENA_XDP_REDIRECT; + break; + } + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); + xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; + break; + case XDP_DROP: + xdp_stat = &rx_ring->rx_stats.xdp_drop; + verdict = ENA_XDP_DROP; + break; + case XDP_PASS: + xdp_stat = &rx_ring->rx_stats.xdp_pass; + verdict = ENA_XDP_PASS; + break; + default: + bpf_warn_invalid_xdp_action(verdict); + xdp_stat = &rx_ring->rx_stats.xdp_invalid; + verdict = ENA_XDP_DROP; + } + + ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); + + return verdict; +} +#else /* ENA_XDP_SUPPORT */ + +#define ENA_IS_XDP_INDEX(adapter, index) (false) + +static inline bool ena_xdp_present_ring(struct ena_ring *ring) +{ + return false; +} + +static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) +{ + return 0; +} + +static inline void ena_xdp_unregister_rxq_info(struct ena_ring *rx_ring) {} + +#endif /* ENA_XDP_SUPPORT */ +#ifndef ENA_AF_XDP_SUPPORT /* stabs for AF XDP code */ + +/* Define (or override if it's defined) these enum and function to make sure + * that the code that uses them would always compile. If AF XDP isn't supported, it + * won't be used anyway. + */ +#define MEM_TYPE_XSK_BUFF_POOL 0 +#define xsk_pool_set_rxq_info(pool, rxq) + +static inline void ena_xdp_free_tx_bufs_zc(struct ena_ring *tx_ring) {} +static inline void ena_xdp_free_rx_bufs_zc(struct ena_adapter *adapter, u32 qid) {} + +#define ENA_IS_XSK_RING(ring) false + +static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) +{ + return false; +} +#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* ENA_XDP_H */ diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index c82567e4529db..f6f930e2bf19e 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -696,11 +696,52 @@ do { \ #define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER #endif +#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0) +#define ENA_DEVLINK_SUPPORT +#endif + +#if !defined(CONFIG_NET_DEVLINK) && !defined(CONFIG_NET_DEVLINK_MODULE) && !defined(CONFIG_MAY_USE_DEVLINK) +#define ENA_NO_DEVLINK_HEADERS +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || \ + (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) +#define ENA_DEVLINK_RELOAD_UP_DOWN_SUPPORTED +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0)) +#define ENA_DEVLINK_RELOAD_ENABLING_REQUIRED +#endif + +#if defined(CONFIG_NET_DEVLINK) && \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0) || \ + (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) +#define ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT +#endif + +#if defined(CONFIG_NET_DEVLINK) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +#define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC +#endif + +#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) +#define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) +#define ENA_DEVLINK_CONFIGURE_AFTER_REGISTER +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) && \ !(RHEL_RELEASE_CODE && ((RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)) && \ - (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ - !defined(UBUNTU_VERSION_CODE) && \ - !defined(UEK3_RELEASE) + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6, 6)))) && \ + !defined(UBUNTU_VERSION_CODE) && \ + !defined(UEK3_RELEASE) && (!defined(DEBIAN_VERSION) || DEBIAN_VERSION != 8) #define DO_ONCE(func, ...) \ ({ \ @@ -843,4 +884,24 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) + +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + memcpy(dev->dev_addr, addr, ETH_ALEN); +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +#define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) +#define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE +#endif + +#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#define ENA_AF_XDP_SUPPORT +#endif #endif /* _KCOMPAT_H_ */ From 49e4c47685296ee2d566fd711137b6999063cd97 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Fri, 20 May 2022 19:04:51 +0000 Subject: [PATCH 384/737] lustre: update to AmazonFSxLustreClient v2.10.8-11 Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/config.h | 11 +- .../lustre/include/lustre/lustre_idl.h | 18 +- .../lustrefsx/lustre/include/lustre_compat.h | 103 +++++++----- .../lustrefsx/lustre/include/lustre_disk.h | 7 +- .../lustrefsx/lustre/include/obd_class.h | 17 +- drivers/staging/lustrefsx/lustre/llite/file.c | 26 ++- .../lustrefsx/lustre/llite/llite_internal.h | 35 ++-- .../lustrefsx/lustre/llite/llite_lib.c | 52 +++--- .../lustrefsx/lustre/llite/lproc_llite.c | 5 +- .../staging/lustrefsx/lustre/llite/namei.c | 38 +++-- .../staging/lustrefsx/lustre/llite/xattr.c | 46 ++++-- .../lustrefsx/lustre/llite/xattr_security.c | 4 +- .../lustrefsx/lustre/mgc/mgc_internal.h | 16 +- .../lustrefsx/lustre/mgc/mgc_request.c | 154 +++++++++--------- .../lustrefsx/lustre/obdclass/obd_config.c | 35 ++-- .../lustre/obdclass/obd_mount_server.c | 4 +- .../lustrefsx/lustre/ptlrpc/wiretest.c | 12 +- drivers/staging/lustrefsx/undef.h | 3 + 18 files changed, 339 insertions(+), 247 deletions(-) diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index fce8b057480b6..f4d6ee0ba3c8a 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -892,6 +892,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 +/* 'inode_operations' members have user namespace argument */ +/* #undef HAVE_USER_NAMESPACE_ARG */ + /* kernel has vfs_rename with 5 args */ /* #undef HAVE_VFS_RENAME_5ARGS */ @@ -981,7 +984,7 @@ #define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.10.8-10" +#define LUSTRE_VERSION_STRING "2.10.8-11" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -1014,7 +1017,7 @@ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. 
*/ -#define PACKAGE_STRING "Lustre 2.10.8-10" +#define PACKAGE_STRING "Lustre 2.10.8-11" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -1023,7 +1026,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.10.8-10" +#define PACKAGE_VERSION "2.10.8-11" /* name of parallel fsck program */ #define PFSCK "fsck" @@ -1067,7 +1070,7 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.10.8-10" +#define VERSION "2.10.8-11" /* zfs fix version */ /* #undef ZFS_FIX */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h index f2c850c0f1848..9840237e4e046 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h @@ -2448,20 +2448,20 @@ struct mgs_nidtbl_entry { } u; }; -enum { - CONFIG_T_CONFIG = 0, - CONFIG_T_SPTLRPC = 1, - CONFIG_T_RECOVER = 2, - CONFIG_T_PARAMS = 3, - CONFIG_T_NODEMAP = 4, - CONFIG_T_BARRIER = 5, - CONFIG_T_MAX +enum mgs_cfg_type { + MGS_CFG_T_CONFIG = 0, + MGS_CFG_T_SPTLRPC = 1, + MGS_CFG_T_RECOVER = 2, + MGS_CFG_T_PARAMS = 3, + MGS_CFG_T_NODEMAP = 4, + MGS_CFG_T_BARRIER = 5, + MGS_CFG_T_MAX }; struct mgs_config_body { char mcb_name[MTI_NAME_MAXLEN]; /* logname */ __u64 mcb_offset; /* next index of config log to request */ - __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ + __u16 mcb_type; /* type of log: MGS_CFG_T_[CONFIG|RECOVER] */ __u8 mcb_nm_cur_pass; __u8 mcb_bits; /* bits unit size of config log */ __u32 mcb_units; /* # of units for bulk transfer */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 441f737170daa..2b14937780e6a 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -373,10 +373,12 @@ static inline struct inode *file_inode(const struct file *file) #define ll_vfs_rename(a, b, c, d) vfs_rename(a, b, c, d) #endif -#ifdef HAVE_VFS_UNLINK_3ARGS -#define ll_vfs_unlink(a, b) vfs_unlink(a, b, NULL) +#ifdef HAVE_USER_NAMESPACE_ARG +#define vfs_unlink(ns, dir, de) vfs_unlink(ns, dir, de, NULL) +#elif defined HAVE_VFS_UNLINK_3ARGS +#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de, NULL) #else -#define ll_vfs_unlink(a, b) vfs_unlink(a, b) +#define vfs_unlink(ns, dir, de) vfs_unlink(dir, de) #endif #ifndef HAVE_INODE_LOCK @@ -469,45 +471,9 @@ int ll_removexattr(struct dentry *dentry, const char *name); #endif /* ! 
HAVE_XATTR_HANDLER_FLAGS */ #endif /* HAVE_IOP_XATTR */ -#ifndef HAVE_VFS_SETXATTR -const struct xattr_handler *get_xattr_type(const char *name); - -#ifdef HAVE_XATTR_HANDLER_FLAGS -static inline int -__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - const struct xattr_handler *handler; - int rc; - - handler = get_xattr_type(name); - if (!handler) - return -ENXIO; - -#if defined(HAVE_XATTR_HANDLER_INODE_PARAM) - rc = handler->set(handler, dentry, inode, name, value, size, - XATTR_CREATE); -#elif defined(HAVE_XATTR_HANDLER_SIMPLIFIED) - rc = handler->set(handler, dentry, name, value, size, XATTR_CREATE); -#else - rc = handler->set(dentry, name, value, size, XATTR_CREATE, - handler->flags); -#endif /* !HAVE_XATTR_HANDLER_INODE_PARAM */ - return rc; -} -#else /* !HAVE_XATTR_HANDLER_FLAGS */ -static inline int -__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return ll_setxattr(dentry, name, value, size, flags); -} -#endif /* HAVE_XATTR_HANDLER_FLAGS */ -#endif /* HAVE_VFS_SETXATTR */ - #ifdef HAVE_IOP_SET_ACL #ifdef CONFIG_FS_POSIX_ACL -#ifndef HAVE_POSIX_ACL_UPDATE_MODE +#if !defined(HAVE_USER_NAMESPACE_ARG) && !defined(HAVE_POSIX_ACL_UPDATE_MODE) static inline int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -723,4 +689,61 @@ static inline void ll_security_release_secctx(char *secdata, u32 seclen) #endif } +static inline int ll_vfs_getxattr(struct dentry *dentry, struct inode *inode, + const char *name, + void *value, size_t size) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_getxattr(&init_user_ns, dentry, name, value, size); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_getxattr(dentry, inode, name, value, size); +#else + if (unlikely(!inode->i_op->getxattr)) + return -ENODATA; + + return inode->i_op->getxattr(dentry, name, value, size); +#endif +} + +static inline int ll_vfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, + const void *value, size_t size, int flags) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_setxattr(&init_user_ns, dentry, name, value, size, flags); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_setxattr(dentry, inode, name, value, size, flags); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->setxattr(dentry, name, value, size, flags); +#endif +} + +static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, + const char *name) +{ +#ifdef HAVE_USER_NAMESPACE_ARG + return vfs_removexattr(&init_user_ns, dentry, name); +#elif defined(HAVE_VFS_SETXATTR) + return __vfs_removexattr(dentry, name); +#else + if (unlikely(!inode->i_op->setxattr)) + return -EOPNOTSUPP; + + return inode->i_op->removexattr(dentry, name); +#endif +} + +#ifndef HAVE_USER_NAMESPACE_ARG +#define posix_acl_update_mode(ns, inode, mode, acl) \ + posix_acl_update_mode(inode, mode, acl) +#define notify_change(ns, de, attr, inode) notify_change(de, attr, inode) +#define inode_owner_or_capable(ns, inode) inode_owner_or_capable(inode) +#define vfs_create(ns, dir, de, mode, ex) vfs_create(dir, de, mode, ex) +#define vfs_mkdir(ns, dir, de, mode) vfs_mkdir(dir, de, mode) +#define ll_set_acl(ns, inode, acl, type) ll_set_acl(inode, acl, type) +#endif + #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h index 
9b20b7ba8f09e..763e682f2d2b2 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -353,8 +353,11 @@ int server_mti_print(const char *title, struct mgs_target_info *mti); void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd); # endif -int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); -int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, int type); +/* mgc_request.c */ +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); +int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type); /** @} disk */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index da40a4e38f91b..437a700807142 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -169,7 +169,7 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { - void *cfg_instance; + unsigned long cfg_instance; struct super_block *cfg_sb; struct obd_uuid cfg_uuid; llog_cb_t cfg_callback; @@ -181,6 +181,19 @@ struct config_llog_instance { int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, char *name, struct config_llog_instance *cfg); +/** + * Generate a unique configuration instance for this mount + * + * Temporary hack to bypass ASLR in 4.15+ kernels, a better fix soon. + * For now, use the same value as before - the superblock pointer value. + * + * Using the client UUID would be an option, but it needs more testing. + */ +static inline unsigned long ll_get_cfg_instance(struct super_block *sb) +{ + return (unsigned long)sb; +} + #define CONFIG_SUB_SPTLRPC 0x01 #define CONFIG_SUB_RECOVER 0x02 #define CONFIG_SUB_PARAMS 0x04 @@ -224,7 +237,7 @@ struct config_llog_data { struct config_llog_data *cld_barrier;/* barrier log (for MDT only) */ struct obd_export *cld_mgcexp; struct mutex cld_lock; - int cld_type; + enum mgs_cfg_type cld_type; unsigned int cld_stopping:1, /* we were told to stop * watching */ cld_lostlock:1; /* lock not requeued */ diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 04cc72f451861..65d57dbe70b42 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -904,8 +904,8 @@ static int ll_check_swap_layouts_validity(struct inode *inode1, if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) return -EINVAL; - if (inode_permission(inode1, MAY_WRITE) || - inode_permission(inode2, MAY_WRITE)) + if (inode_permission(&init_user_ns, inode1, MAY_WRITE) || + inode_permission(&init_user_ns, inode2, MAY_WRITE)) return -EPERM; if (inode1->i_sb != inode2->i_sb) @@ -3815,8 +3815,8 @@ static inline dev_t ll_compat_encode_dev(dev_t dev) return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff); } -#ifdef HAVE_INODEOPS_ENHANCED_GETATTR -int ll_getattr(const struct path *path, struct kstat *stat, +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -3918,7 +3918,7 @@ struct posix_acl *ll_get_acl(struct inode *inode, int type) #ifdef HAVE_IOP_SET_ACL #ifdef CONFIG_FS_POSIX_ACL -int ll_set_acl(struct inode 
*inode, struct posix_acl *acl, int type) +int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; @@ -3932,7 +3932,7 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + rc = posix_acl_update_mode(mnt_userns, inode, &inode->i_mode, &acl); if (rc) GOTO(out, rc); } @@ -3976,6 +3976,7 @@ int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) #endif /* CONFIG_FS_POSIX_ACL */ #endif /* HAVE_IOP_SET_ACL */ +#ifndef HAVE_USER_NAMESPACE_ARG #ifndef HAVE_GENERIC_PERMISSION_2ARGS static int # ifdef HAVE_GENERIC_PERMISSION_4ARGS @@ -4007,16 +4008,9 @@ ll_check_acl(struct inode *inode, int mask) # endif /* CONFIG_FS_POSIX_ACL */ } #endif /* HAVE_GENERIC_PERMISSION_2ARGS */ +#endif /* HAVE_USER_NAMESPACE_ARG */ -#ifdef HAVE_GENERIC_PERMISSION_4ARGS -int ll_inode_permission(struct inode *inode, int mask, unsigned int flags) -#else -# ifdef HAVE_INODE_PERMISION_2ARGS -int ll_inode_permission(struct inode *inode, int mask) -# else -int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) -# endif -#endif +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { int rc = 0; struct ll_sb_info *sbi; @@ -4077,7 +4071,7 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) } ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1); - rc = ll_generic_permission(inode, mask, flags, ll_check_acl); + rc = generic_permission(mnt_userns, inode, mask); /* restore current process's credentials and FS capability */ if (squash_id) { revert_creds(old_cred); diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h index ce05c17a2231f..bfc4f8bfd7bea 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -236,6 +236,17 @@ struct ll_inode_info { struct list_head lli_xattrs; /* ll_xattr_entry->xe_list */ }; +#ifndef HAVE_USER_NAMESPACE_ARG +#define inode_permission(ns, inode, mask) inode_permission(inode, mask) +#define generic_permission(ns, inode, mask) generic_permission(inode, mask) +#define simple_setattr(ns, de, iattr) simple_setattr(de, iattr) +#define ll_inode_permission(ns, inode, mask) ll_inode_permission(inode, mask) +#ifdef HAVE_INODEOPS_ENHANCED_GETATTR +#define ll_getattr(ns, path, stat, mask, fl) ll_getattr(path, stat, mask, fl) +#endif /* HAVE_INODEOPS_ENHANCED_GETATTR */ +#define ll_setattr(ns, de, attr) ll_setattr(de, attr) +#endif + static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) { __u32 gen; @@ -824,16 +835,17 @@ int ll_md_real_close(struct inode *inode, fmode_t fmode); extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct ll_file_data *file, loff_t pos, size_t count, int rw); -#ifdef HAVE_INODEOPS_ENHANCED_GETATTR -int ll_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); #else int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); -#endif +#endif /* HAVE_USER_NAMESPACE_ARG */ struct 
posix_acl *ll_get_acl(struct inode *inode, int type); #ifdef HAVE_IOP_SET_ACL #ifdef CONFIG_FS_POSIX_ACL -int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); #else /* !CONFIG_FS_POSIX_ACL */ #define ll_set_acl NULL #endif /* CONFIG_FS_POSIX_ACL */ @@ -843,15 +855,7 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, const char *name, int namelen); int ll_get_fid_by_name(struct inode *parent, const char *name, int namelen, struct lu_fid *fid, struct inode **inode); -#ifdef HAVE_GENERIC_PERMISSION_4ARGS -int ll_inode_permission(struct inode *inode, int mask, unsigned int flags); -#else -# ifndef HAVE_INODE_PERMISION_2ARGS -int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd); -# else -int ll_inode_permission(struct inode *inode, int mask); -# endif -#endif +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg); int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, @@ -903,7 +907,8 @@ struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); void ll_dir_clear_lsm_md(struct inode *inode); void ll_clear_inode(struct inode *inode); int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); -int ll_setattr(struct dentry *de, struct iattr *attr); +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr); int ll_statfs(struct dentry *de, struct kstatfs *sfs); int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, __u64 max_age, __u32 flags); diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 644b1c4e26d47..04256c2600083 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -978,13 +978,16 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) char *profilenm = get_profile_name(sb); struct config_llog_instance *cfg; /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ - const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; + const int instlen = 16 + 2; + unsigned long cfg_instance = ll_get_cfg_instance(sb); int md_len = 0; int dt_len = 0; int err; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + /* for ASLR, to map between cfg_instance and hashed ptr */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); OBD_ALLOC_PTR(cfg); if (cfg == NULL) @@ -1004,7 +1007,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) if (err) GOTO(out_free, err); - err = super_setup_bdi_name(sb, "lustre-%p", sb); + err = super_setup_bdi_name(sb, "lustre-%016lx", cfg_instance); if (err) GOTO(out_free, err); @@ -1024,10 +1027,10 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) } } - /* Generate a string unique to this super, in case some joker tries - to mount the same fs at two mount points. - Use the address of the super itself.*/ - cfg->cfg_instance = sb; + /* The cfg_instance is a value unique to this super, in case some + * joker tries to mount the same fs at two mount points. 
+ */ + cfg->cfg_instance = cfg_instance; cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; cfg->cfg_callback = class_config_llog_handler; cfg->cfg_sub_clds = CONFIG_SUB_CLIENT; @@ -1051,13 +1054,13 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) OBD_ALLOC(dt, dt_len); if (!dt) GOTO(out_proc, err = -ENOMEM); - snprintf(dt, dt_len - 1, "%s-%p", lprof->lp_dt, cfg->cfg_instance); + snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); md_len = strlen(lprof->lp_md) + instlen + 2; OBD_ALLOC(md, md_len); if (!md) GOTO(out_proc, err = -ENOMEM); - snprintf(md, md_len - 1, "%s-%p", lprof->lp_md, cfg->cfg_instance); + snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); /* connections, registrations, sb setup */ err = client_common_fill_super(sb, md, dt, mnt); @@ -1088,20 +1091,24 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) void ll_put_super(struct super_block *sb) { struct config_llog_instance cfg, params_cfg; - struct obd_device *obd; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + unsigned long cfg_instance = ll_get_cfg_instance(sb); long ccc_count; int next, force = 1, rc = 0; - ENTRY; + ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); - cfg.cfg_instance = sb; - lustre_end_log(sb, profilenm, &cfg); + /* Should replace instance_id with something better for ASLR */ + CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", + profilenm, cfg_instance, sb); - params_cfg.cfg_instance = sb; + cfg.cfg_instance = cfg_instance; + lustre_end_log(sb, profilenm, &cfg); + + params_cfg.cfg_instance = cfg_instance; lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); if (sbi->ll_md_exp) { @@ -1122,7 +1129,6 @@ void ll_put_super(struct super_block *sb) if (force == 0 && rc != -EINTR) LASSERTF(ccc_count == 0, "count: %li\n", ccc_count); - /* We need to set force before the lov_disconnect in lustre_common_put_super, since l_d cleans up osc's as well. 
*/ if (force) { @@ -1506,7 +1512,8 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) !S_ISDIR(inode->i_mode)) { ia_valid = op_data->op_attr.ia_valid; op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; - rc = simple_setattr(dentry, &op_data->op_attr); + rc = simple_setattr(&init_user_ns, dentry, + &op_data->op_attr); op_data->op_attr.ia_valid = ia_valid; } } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { @@ -1528,7 +1535,7 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); if (S_ISREG(inode->i_mode)) inode_lock(inode); - rc = simple_setattr(dentry, &op_data->op_attr); + rc = simple_setattr(&init_user_ns, dentry, &op_data->op_attr); if (S_ISREG(inode->i_mode)) inode_unlock(inode); op_data->op_attr.ia_valid = ia_valid; @@ -1713,7 +1720,8 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) return rc; } -int ll_setattr(struct dentry *de, struct iattr *attr) +int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *attr) { int mode = de->d_inode->i_mode; diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c index ee696ef0a4c79..6ed67697eb455 100755 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -1289,6 +1289,7 @@ int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, struct lprocfs_vars lvars[2]; struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned long cfg_instance = ll_get_cfg_instance(sb); char name[MAX_STRING_SIZE + 1], *ptr; int err, id, len, rc; ENTRY; @@ -1307,8 +1308,8 @@ int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, len -= 7; /* Mount info */ - snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, - lsi->lsi_lmd->lmd_profile, sb); + snprintf(name, MAX_STRING_SIZE, "%.*s-%016lx", len, + lsi->lsi_lmd->lmd_profile, cfg_instance); sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); if (IS_ERR(sbi->ll_proc_root)) { diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index ae7101b1885f2..3ff57049caa9c 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -49,6 +49,18 @@ #include #include "llite_internal.h" +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_create_nd(ns, dir, de, mode, ex) ll_create_nd(dir, de, mode, ex) +#define ll_mkdir(ns, dir, dch, mode) ll_mkdir(dir, dch, mode) +#define ll_mknod(ns, dir, dch, mode, rd) ll_mknod(dir, dch, mode, rd) +#ifdef HAVE_IOPS_RENAME_WITH_FLAGS +#define ll_rename(ns, src, sdc, tgt, tdc, fl) ll_rename(src, sdc, tgt, tdc, fl) +#else +#define ll_rename(ns, src, sdc, tgt, tdc) ll_rename(src, sdc, tgt, tdc) +#endif /* HAVE_IOPS_RENAME_WITH_FLAGS */ +#define ll_symlink(nd, dir, dch, old) ll_symlink(dir, dch, old) +#endif + static int ll_create_it(struct inode *dir, struct dentry *dentry, struct lookup_intent *it, void *secctx, __u32 secctxlen); @@ -683,7 +695,8 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, * to proceed with lookup. 
LU-4185 */ if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && - (inode_permission(parent, MAY_WRITE | MAY_EXEC) == 0)) + (inode_permission(&init_user_ns, + parent, MAY_WRITE | MAY_EXEC) == 0)) return NULL; if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) @@ -1147,8 +1160,8 @@ static int ll_new_node(struct inode *dir, struct dentry *dchild, return err; } -static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, - dev_t rdev) +static int ll_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, umode_t mode, dev_t rdev) { struct qstr *name = &dchild->d_name; int err; @@ -1190,7 +1203,8 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, /* * Plain create. Intent create is handled in atomic_open. */ -static int ll_create_nd(struct inode *dir, struct dentry *dentry, +static int ll_create_nd(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int rc; @@ -1204,7 +1218,7 @@ static int ll_create_nd(struct inode *dir, struct dentry *dentry, /* Using mknod(2) to create a regular file is designed to not recognize * volatile file name, so we use ll_mknod() here. */ - rc = ll_mknod(dir, dentry, mode, 0); + rc = ll_mknod(mnt_userns, dir, dentry, mode, 0); ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); @@ -1265,8 +1279,8 @@ static int ll_create_nd(struct inode *dir, struct dentry *dentry, } #endif /* HAVE_IOP_ATOMIC_OPEN */ -static int ll_symlink(struct inode *dir, struct dentry *dchild, - const char *oldpath) +static int ll_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, const char *oldpath) { struct qstr *name = &dchild->d_name; int err; @@ -1318,7 +1332,8 @@ static int ll_link(struct dentry *old_dentry, struct inode *dir, RETURN(err); } -static int ll_mkdir(struct inode *dir, struct dentry *dchild, ll_umode_t mode) +static int ll_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dchild, umode_t mode) { struct qstr *name = &dchild->d_name; int err; @@ -1449,9 +1464,10 @@ static int ll_unlink(struct inode *dir, struct dentry *dchild) RETURN(rc); } -static int ll_rename(struct inode *src, struct dentry *src_dchild, +static int ll_rename(struct user_namespace *mnt_userns, + struct inode *src, struct dentry *src_dchild, struct inode *tgt, struct dentry *tgt_dchild -#ifdef HAVE_IOPS_RENAME_WITH_FLAGS +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) , unsigned int flags #endif ) @@ -1464,7 +1480,7 @@ static int ll_rename(struct inode *src, struct dentry *src_dchild, int err; ENTRY; -#ifdef HAVE_IOPS_RENAME_WITH_FLAGS +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_IOPS_RENAME_WITH_FLAGS) if (flags) return -EINVAL; #endif diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index 78c774ef738c4..ae0ee171ac4e7 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -92,7 +92,13 @@ static int xattr_type_filter(struct ll_sb_info *sbi, return 0; } +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set_common(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set_common(hd, de, inode, name, value, size, flags) +#endif + static int ll_xattr_set_common(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -124,8 
+130,9 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, if ((handler->flags == XATTR_ACL_ACCESS_T || handler->flags == XATTR_ACL_DEFAULT_T) && -#ifdef HAVE_INODE_OWNER_OR_CAPABLE - !inode_owner_or_capable(inode)) +/* Test for older kernels that was cleaned up in LU-12477 and LU-10092 */ +#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODE_OWNER_OR_CAPABLE) + !inode_owner_or_capable(mnt_userns, inode)) #else !is_owner_or_cap(inode)) #endif @@ -305,7 +312,13 @@ static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, return rc; } +#ifndef HAVE_USER_NAMESPACE_ARG +#define ll_xattr_set(hd, ns, de, inode, name, value, size, flags) \ + ll_xattr_set(hd, de, inode, name, value, size, flags) +#endif + static int ll_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -333,8 +346,8 @@ static int ll_xattr_set(const struct xattr_handler *handler, return 0; } - return ll_xattr_set_common(handler, dentry, inode, name, value, size, - flags); + return ll_xattr_set_common(handler, mnt_userns, dentry, inode, name, + value, size, flags); } int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, @@ -662,7 +675,8 @@ static int ll_xattr_set_4_3(const struct xattr_handler *handler, size, flags); } -#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) const struct xattr_handler *get_xattr_handler(int handler_flag) { int i = 0; @@ -708,7 +722,7 @@ static int ll_xattr_set_common_3_11(struct dentry *dentry, const char *name, if (!handler) return -ENXIO; - return ll_xattr_set_common(handler, dentry, dentry->d_inode, name, + return ll_xattr_set_common(handler, NULL, dentry, dentry->d_inode, name, value, size, flags); } @@ -721,7 +735,7 @@ static int ll_xattr_set_3_11(struct dentry *dentry, const char *name, if (!handler) return -ENXIO; - return ll_xattr_set(handler, dentry, dentry->d_inode, name, value, + return ll_xattr_set(handler, NULL, dentry, dentry->d_inode, name, value, size, flags); } #endif @@ -732,7 +746,8 @@ static const struct xattr_handler ll_user_xattr_handler = { #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_common_4_3, .set = ll_xattr_set_common_4_3, -#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) .get = ll_xattr_get_common_3_11, .set = ll_xattr_set_common_3_11, #else @@ -747,7 +762,8 @@ static const struct xattr_handler ll_trusted_xattr_handler = { #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_4_3, .set = ll_xattr_set_4_3, -#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) .get = ll_xattr_get_3_11, .set = ll_xattr_set_3_11, #else @@ -762,7 +778,8 @@ static const struct xattr_handler ll_security_xattr_handler = { #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_common_4_3, .set = ll_xattr_set_common_4_3, -#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) .get = ll_xattr_get_common_3_11, .set = ll_xattr_set_common_3_11, #else @@ -781,7 +798,8 @@ static const struct xattr_handler ll_acl_access_xattr_handler = { #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_common_4_3, .set = ll_xattr_set_common_4_3, 
-#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) .get = ll_xattr_get_common_3_11, .set = ll_xattr_set_common_3_11, #else @@ -800,7 +818,8 @@ static const struct xattr_handler ll_acl_default_xattr_handler = { #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_common_4_3, .set = ll_xattr_set_common_4_3, -#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) .get = ll_xattr_get_common_3_11, .set = ll_xattr_set_common_3_11, #else @@ -815,7 +834,8 @@ static const struct xattr_handler ll_lustre_xattr_handler = { #if defined(HAVE_XATTR_HANDLER_SIMPLIFIED) .get = ll_xattr_get_4_3, .set = ll_xattr_set_4_3, -#elif !defined(HAVE_XATTR_HANDLER_INODE_PARAM) +#elif !defined(HAVE_USER_NAMESPACE_ARG) && \ +!defined(HAVE_XATTR_HANDLER_INODE_PARAM) .get = ll_xattr_get_3_11, .set = ll_xattr_set_3_11, #else diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c index 8f2e2e5cc1fa0..d7e9ec8473ae4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -124,8 +124,8 @@ ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, break; } - err = __vfs_setxattr(dentry, inode, full_name, xattr->value, - xattr->value_len, XATTR_CREATE); + err = ll_vfs_setxattr(dentry, inode, full_name, xattr->value, + xattr->value_len, XATTR_CREATE); kfree(full_name); if (err < 0) break; diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h index 1a37720e901eb..50b4b602e17eb 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -47,24 +47,24 @@ int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); -static inline int cld_is_sptlrpc(struct config_llog_data *cld) +static inline bool cld_is_sptlrpc(struct config_llog_data *cld) { - return cld->cld_type == CONFIG_T_SPTLRPC; + return cld->cld_type == MGS_CFG_T_SPTLRPC; } -static inline int cld_is_recover(struct config_llog_data *cld) +static inline bool cld_is_recover(struct config_llog_data *cld) { - return cld->cld_type == CONFIG_T_RECOVER; + return cld->cld_type == MGS_CFG_T_RECOVER; } -static inline int cld_is_nodemap(struct config_llog_data *cld) +static inline bool cld_is_nodemap(struct config_llog_data *cld) { - return cld->cld_type == CONFIG_T_NODEMAP; + return cld->cld_type == MGS_CFG_T_NODEMAP; } -static inline int cld_is_barrier(struct config_llog_data *cld) +static inline bool cld_is_barrier(struct config_llog_data *cld) { - return cld->cld_type == CONFIG_T_BARRIER; + return cld->cld_type == MGS_CFG_T_BARRIER; } #endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c index a2a2bdd1f0732..2bd0f39dbed4b 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -53,7 +53,7 @@ #include "mgc_internal.h" static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, - int type) + enum mgs_cfg_type type) { __u64 resname = 0; @@ -72,14 +72,14 @@ static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, res_id->name[0] = cpu_to_le64(resname); /* XXX: 
unfortunately, sptlprc and config llog share one lock */ switch(type) { - case CONFIG_T_CONFIG: - case CONFIG_T_SPTLRPC: + case MGS_CFG_T_CONFIG: + case MGS_CFG_T_SPTLRPC: resname = 0; break; - case CONFIG_T_RECOVER: - case CONFIG_T_PARAMS: - case CONFIG_T_NODEMAP: - case CONFIG_T_BARRIER: + case MGS_CFG_T_RECOVER: + case MGS_CFG_T_PARAMS: + case MGS_CFG_T_NODEMAP: + case MGS_CFG_T_BARRIER: resname = type; break; default: @@ -91,7 +91,8 @@ static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, return 0; } -int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) { /* fsname is at most 8 chars long, maybe contain "-". * e.g. "lustre", "SUN-000" */ @@ -99,7 +100,8 @@ int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) } EXPORT_SYMBOL(mgc_fsname2resid); -int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type) +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, + enum mgs_cfg_type type) { char *name_end; int len; @@ -170,18 +172,18 @@ static struct config_llog_data *config_log_find(char *logname, struct config_llog_instance *cfg) { - struct config_llog_data *cld; - struct config_llog_data *found = NULL; - void * instance; - ENTRY; + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + unsigned long cfg_instance; - LASSERT(logname != NULL); + ENTRY; + LASSERT(logname != NULL); - instance = cfg ? cfg->cfg_instance : NULL; + cfg_instance = cfg ? cfg->cfg_instance : 0; spin_lock(&config_list_lock); list_for_each_entry(cld, &config_llog_list, cld_list_chain) { - /* check if instance equals */ - if (instance != cld->cld_cfg.cfg_instance) + /* check if cfg_instance is the one we want */ + if (cfg_instance != cld->cld_cfg.cfg_instance) continue; /* instance may be NULL, should check name */ @@ -198,7 +200,7 @@ struct config_llog_data *config_log_find(char *logname, static struct config_llog_data *do_config_log_add(struct obd_device *obd, char *logname, - int type, + enum mgs_cfg_type type, struct config_llog_instance *cfg, struct super_block *sb) { @@ -207,8 +209,8 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, ENTRY; - CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, - cfg ? cfg->cfg_instance : NULL); + CDEBUG(D_MGC, "do adding config log %s-%016lx\n", logname, + cfg ? 
cfg->cfg_instance : 0); OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); if (!cld) @@ -253,47 +255,50 @@ struct config_llog_data *do_config_log_add(struct obd_device *obd, } static struct config_llog_data *config_recover_log_add(struct obd_device *obd, - char *fsname, - struct config_llog_instance *cfg, - struct super_block *sb) + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) { - struct config_llog_instance lcfg = *cfg; - struct lustre_sb_info *lsi = s2lsi(sb); - struct config_llog_data *cld; - char logname[32]; + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; if (IS_OST(lsi)) - return NULL; + return NULL; /* for osp-on-ost, see lustre_start_osp() */ if (IS_MDT(lsi) && lcfg.cfg_instance) return NULL; - /* we have to use different llog for clients and mdts for cmd - * where only clients are notified if one of cmd server restarts */ - LASSERT(strlen(fsname) < sizeof(logname) / 2); - strcpy(logname, fsname); + /* We have to use different llog for clients and MDTs for DNE, + * where only clients are notified if one of DNE server restarts. + */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strncpy(logname, fsname, sizeof(logname)); if (IS_SERVER(lsi)) { /* mdt */ - LASSERT(lcfg.cfg_instance == NULL); - lcfg.cfg_instance = sb; - strcat(logname, "-mdtir"); - } else { - LASSERT(lcfg.cfg_instance != NULL); - strcat(logname, "-cliir"); - } + LASSERT(lcfg.cfg_instance == 0); + lcfg.cfg_instance = ll_get_cfg_instance(sb); + strncat(logname, "-mdtir", sizeof(logname)); + } else { + LASSERT(lcfg.cfg_instance != 0); + strncat(logname, "-cliir", sizeof(logname)); + } - cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); - return cld; + cld = do_config_log_add(obd, logname, MGS_CFG_T_RECOVER, &lcfg, sb); + return cld; } -static struct config_llog_data *config_log_find_or_add(struct obd_device *obd, - char *logname, struct super_block *sb, int type, - struct config_llog_instance *cfg) +static struct config_llog_data * +config_log_find_or_add(struct obd_device *obd, char *logname, + struct super_block *sb, enum mgs_cfg_type type, + struct config_llog_instance *cfg) { - struct config_llog_instance lcfg = *cfg; - struct config_llog_data *cld; + struct config_llog_instance lcfg = *cfg; + struct config_llog_data *cld; - lcfg.cfg_instance = sb != NULL ? (void *)sb : (void *)obd; + /* Note class_config_llog_handler() depends on getting "obd" back */ + lcfg.cfg_instance = sb ? 
ll_get_cfg_instance(sb) : (unsigned long)obd; cld = config_log_find(logname, &lcfg); if (unlikely(cld != NULL)) @@ -323,7 +328,8 @@ config_log_add(struct obd_device *obd, char *logname, bool locked = false; ENTRY; - CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); + CDEBUG(D_MGC, "add config log %s-%016lx\n", logname, + cfg->cfg_instance); /* * for each regular log, the depended sptlrpc log name is @@ -340,7 +346,7 @@ config_log_add(struct obd_device *obd, char *logname, if (cfg->cfg_sub_clds & CONFIG_SUB_SPTLRPC) { sptlrpc_cld = config_log_find_or_add(obd, seclogname, NULL, - CONFIG_T_SPTLRPC, cfg); + MGS_CFG_T_SPTLRPC, cfg); if (IS_ERR(sptlrpc_cld)) { CERROR("%s: can't create sptlrpc log %s: rc = %ld\n", obd->obd_name, seclogname, PTR_ERR(sptlrpc_cld)); @@ -350,7 +356,7 @@ config_log_add(struct obd_device *obd, char *logname, if (!IS_MGS(lsi) && cfg->cfg_sub_clds & CONFIG_SUB_NODEMAP) { nodemap_cld = config_log_find_or_add(obd, LUSTRE_NODEMAP_NAME, - NULL, CONFIG_T_NODEMAP, + NULL, MGS_CFG_T_NODEMAP, cfg); if (IS_ERR(nodemap_cld)) { rc = PTR_ERR(nodemap_cld); @@ -362,7 +368,7 @@ config_log_add(struct obd_device *obd, char *logname, if (cfg->cfg_sub_clds & CONFIG_SUB_PARAMS) { params_cld = config_log_find_or_add(obd, PARAMS_FILENAME, sb, - CONFIG_T_PARAMS, cfg); + MGS_CFG_T_PARAMS, cfg); if (IS_ERR(params_cld)) { rc = PTR_ERR(params_cld); CERROR("%s: can't create params log: rc = %d\n", @@ -375,7 +381,7 @@ config_log_add(struct obd_device *obd, char *logname, snprintf(seclogname + (ptr - logname), sizeof(seclogname) - 1, "-%s", BARRIER_FILENAME); barrier_cld = config_log_find_or_add(obd, seclogname, sb, - CONFIG_T_BARRIER, cfg); + MGS_CFG_T_BARRIER, cfg); if (IS_ERR(barrier_cld)) { rc = PTR_ERR(barrier_cld); CERROR("%s: can't create barrier log: rc = %d\n", @@ -384,7 +390,7 @@ config_log_add(struct obd_device *obd, char *logname, } } - cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb); + cld = do_config_log_add(obd, logname, MGS_CFG_T_CONFIG, cfg, sb); if (IS_ERR(cld)) { rc = PTR_ERR(cld); CERROR("%s: can't create log: rc = %d\n", @@ -1404,34 +1410,34 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, __u64 max_version, void *data, int datalen, bool mne_swab) { - struct config_llog_instance *cfg = &cld->cld_cfg; - struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); - struct mgs_nidtbl_entry *entry; - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs bufs; - u64 prev_version = 0; - char *inst; - char *buf; - int bufsz; - int pos; - int rc = 0; - int off = 0; - ENTRY; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; - LASSERT(cfg->cfg_instance != NULL); - LASSERT(cfg->cfg_sb == cfg->cfg_instance); + ENTRY; + LASSERT(cfg->cfg_instance != 0); + LASSERT(ll_get_cfg_instance(cfg->cfg_sb) == cfg->cfg_instance); OBD_ALLOC(inst, PAGE_SIZE); if (inst == NULL) RETURN(-ENOMEM); if (!IS_SERVER(lsi)) { - pos = snprintf(inst, PAGE_SIZE, "%p", cfg->cfg_instance); + pos = snprintf(inst, PAGE_SIZE, "%016lx", cfg->cfg_instance); if (pos >= PAGE_SIZE) { OBD_FREE(inst, PAGE_SIZE); return -E2BIG; } - } else { + } else { LASSERT(IS_MDT(lsi)); rc = server_name2svname(lsi->lsi_svname, inst, NULL, PAGE_SIZE); @@ -2062,12 +2068,12 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) 
mutex_lock(&cld->cld_lock); if (cld->cld_stopping) { mutex_unlock(&cld->cld_lock); - RETURN(0); - } + RETURN(0); + } - OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); - CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, + CDEBUG(D_MGC, "Process log %s-%016lx from %d\n", cld->cld_logname, cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); /* Get the cfg lock on the llog */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 924322ef86e8c..9f5fffd48bd61 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -869,7 +869,7 @@ static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) static LIST_HEAD(lustre_profile_list); static DEFINE_SPINLOCK(lustre_profile_list_lock); -struct lustre_profile *class_get_profile(const char * prof) +struct lustre_profile *class_get_profile(const char *prof) { struct lustre_profile *lprof; @@ -1546,8 +1546,7 @@ int class_config_llog_handler(const struct lu_env *env, if (!(cfg->cfg_flags & CFG_F_COMPAT146) && !(cfg->cfg_flags & CFG_F_MARKER) && (lcfg->lcfg_command != LCFG_MARKER)) { - CWARN("Config not inside markers, ignoring! " - "(inst: %p, uuid: %s, flags: %#x)\n", + CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", cfg->cfg_instance, cfg->cfg_uuid.uuid, cfg->cfg_flags); cfg->cfg_flags |= CFG_F_SKIP; @@ -1623,12 +1622,11 @@ int class_config_llog_handler(const struct lu_env *env, if (cfg->cfg_instance && lcfg->lcfg_command != LCFG_SPTLRPC_CONF && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { - inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + - sizeof(cfg->cfg_instance) * 2 + 4; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + 16 + 4; OBD_ALLOC(inst_name, inst_len); if (inst_name == NULL) GOTO(out, rc = -ENOMEM); - snprintf(inst_name, inst_len, "%s-%p", + snprintf(inst_name, inst_len, "%s-%016lx", lustre_cfg_string(lcfg, 0), cfg->cfg_instance); lustre_cfg_bufs_set_string(&bufs, 0, inst_name); @@ -1636,23 +1634,22 @@ int class_config_llog_handler(const struct lu_env *env, lcfg->lcfg_command, inst_name); } - /* we override the llog's uuid for clients, to insure they - are unique */ - if (cfg->cfg_instance != NULL && - lcfg->lcfg_command == LCFG_ATTACH) { + /* override llog UUID for clients, to insure they are unique */ + if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_ATTACH) lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid); - } - /* - * sptlrpc config record, we expect 2 data segments: - * [0]: fs_name/target_name, - * [1]: rule string - * moving them to index [1] and [2], and insert MGC's - * obdname at index [0]. - */ + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. 
+ */ if (cfg->cfg_instance && lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { - struct obd_device *obd = cfg->cfg_instance; + /* After ASLR changes cfg_instance this needs fixing */ + /* "obd" is set in config_log_find_or_add() */ + struct obd_device *obd = (void *)cfg->cfg_instance; lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], bufs.lcfg_buflen[1]); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c index b1f59d8f6b303..6bec75198e190 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -939,7 +939,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) GOTO(out, rc = -ENOMEM); /* end log first */ - cfg->cfg_instance = sb; + cfg->cfg_instance = ll_get_cfg_instance(sb); rc = lustre_end_log(sb, logname, cfg); if (rc != 0 && rc != -ENOENT) GOTO(out, rc); @@ -1051,7 +1051,7 @@ static int lustre_start_lwp(struct super_block *sb) GOTO(out, rc = -ENOMEM); cfg->cfg_callback = client_lwp_config_process; - cfg->cfg_instance = sb; + cfg->cfg_instance = ll_get_cfg_instance(sb); rc = lustre_process_log(sb, logname, cfg); /* need to remove config llog from mgc */ lsi->lsi_lwp_started = 1; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c index 94828872d70ac..3fdaf4e78ff65 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -4164,12 +4164,12 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mgs_config_body, mcb_units)); LASSERTF((int)sizeof(((struct mgs_config_body *)0)->mcb_units) == 4, "found %lld\n", (long long)(int)sizeof(((struct mgs_config_body *)0)->mcb_units)); - CLASSERT(CONFIG_T_CONFIG == 0); - CLASSERT(CONFIG_T_SPTLRPC == 1); - CLASSERT(CONFIG_T_RECOVER == 2); - CLASSERT(CONFIG_T_PARAMS == 3); - CLASSERT(CONFIG_T_NODEMAP == 4); - CLASSERT(CONFIG_T_BARRIER == 5); + CLASSERT(MGS_CFG_T_CONFIG == 0); + CLASSERT(MGS_CFG_T_SPTLRPC == 1); + CLASSERT(MGS_CFG_T_RECOVER == 2); + CLASSERT(MGS_CFG_T_PARAMS == 3); + CLASSERT(MGS_CFG_T_NODEMAP == 4); + CLASSERT(MGS_CFG_T_BARRIER == 5); /* Checks for struct mgs_config_res */ LASSERTF((int)sizeof(struct mgs_config_res) == 16, "found %lld\n", diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index aa1343bf5a36d..4d27f6dfa46fb 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -890,6 +890,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H +/* 'inode_operations' members have user namespace argument */ +#undef HAVE_USER_NAMESPACE_ARG + /* kernel has vfs_rename with 5 args */ #undef HAVE_VFS_RENAME_5ARGS From 06c3a02841df6a69e50ccfb0e0dfeefde7aa5ec4 Mon Sep 17 00:00:00 2001 From: Tighe Barris Date: Wed, 25 May 2022 16:43:55 +0000 Subject: [PATCH 385/737] Correct read overflow in page touching DMA ops binding To force a page into residence, a read operation is performed on behalf of devices without an IOMMU. This functionality is required to facilitate memory overcommitted hosts. Commit 25d4ce2 ("Introduce page touching DMA ops binding") initially introduced this logic by invoking a '__raw_readl' function. This function can however read past the bounds of memory mapped for DMA. Instead, it is replaced with '__raw_readb'. This limits the length of memory read to a byte, and prevents reading past the range of mapped memory. 
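As a rough illustration (a minimal user-space sketch in plain C with a hypothetical 4096-byte PAGE_SIZE, not the kernel code), the arithmetic below shows why a 4-byte read at the last page-aligned offset can end past a mapping whose length is not a multiple of 4, while a 1-byte read always stays in bounds:

  #include <stdio.h>

  #define PAGE_SIZE 4096UL

  int main(void)
  {
          unsigned long size = PAGE_SIZE + 1;  /* mapping ends 1 byte into its last page */
          unsigned long off;

          for (off = 0; off < size; off += PAGE_SIZE) {
                  unsigned long readl_end = off + 4;  /* __raw_readl touches 4 bytes */
                  unsigned long readb_end = off + 1;  /* __raw_readb touches 1 byte  */

                  printf("offset %lu: readl ends at %lu (%s), readb ends at %lu (in bounds)\n",
                         off, readl_end,
                         readl_end > size ? "past end of mapping" : "in bounds",
                         readb_end);
          }
          return 0;
  }

For off = PAGE_SIZE the 4-byte read ends 3 bytes past the 4097-byte mapping, which is exactly the overrun the switch to '__raw_readb' avoids.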
Fixes: 25d4ce2 ("Introduce page touching DMA ops binding") Signed-off-by: Tighe Barris Cc-Team: kaos-brimstone Cc-Team: ec2-memo --- kernel/dma/page_touching.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/page_touching.c b/kernel/dma/page_touching.c index a9bb7901d769e..c5ffb90a40a51 100644 --- a/kernel/dma/page_touching.c +++ b/kernel/dma/page_touching.c @@ -36,7 +36,7 @@ #include /* - * A wrapper around dma_direct which does a readl on the memory being mapped + * A wrapper around dma_direct which does a readb on the memory being mapped * for DMA to ensure that it becomes resident. * Useful when running in a memory overcommit environment with lazy allocation * and free page reporting. @@ -56,7 +56,7 @@ static void touch_each_page(void *start_addr, size_t size) int addr_offset; for (addr_offset = 0; addr_offset < size; addr_offset += PAGE_SIZE) - __raw_readl((char *)start_addr + addr_offset); + __raw_readb((char *)start_addr + addr_offset); } static void *page_touching_dma_alloc(struct device *dev, size_t size, From ea34c726da1f3b3bfa6ef084d7a30ad398595c58 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Aug 2021 11:18:44 +0100 Subject: [PATCH 386/737] iov_iter: track truncated size [ Upstream commit 2112ff5ce0c1128fe7b4d19cfe7f2b8ce5b595fa ] Remember how many bytes were truncated and reverted back. Because not reexpanded iterators don't always work well with reverting, we may need to know that to reexpand ourselves when needed. Signed-off-by: Pavel Begunkov Signed-off-by: Al Viro --- include/linux/uio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 06ddb12f3b649..4b19d7dd003d5 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -54,6 +54,7 @@ struct iov_iter { unsigned int start_head; }; }; + size_t truncated; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) @@ -271,8 +272,10 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ - if (i->count > count) + if (i->count > count) { + i->truncated += i->count - count; i->count = count; + } } /* @@ -281,6 +284,7 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { + i->truncated -= count - i->count; i->count = count; } From 194c5f38b61778bff0aaaf0bfb0cf870d19d9fab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:05:49 +0000 Subject: [PATCH 387/737] bpf: Generalize check_ctx_reg for reuse with other types [ Upstream commit be80a1d3f9dbe5aee79a325964f7037fe2d92f30 ] Generalize the check_ctx_reg() helper function into a more generic named one so that it can be reused for other register types as well to check whether their offset is non-zero. No functional change. 
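For reference, a condensed sketch of the check being generalized (the stand-in names 'reg_sketch' and 'check_ptr_off_sketch' are illustrative only, not the verifier's actual structures): a pointer-typed register is rejected when a fixed offset has been added to it, or when its variable offset is not a known constant.

  #include <errno.h>
  #include <stdbool.h>

  struct reg_sketch {
          int  off;               /* fixed offset added to the pointer */
          bool var_off_is_const;  /* variable offset is a known constant */
  };

  /* Mirrors the shape of check_ptr_off_reg(): only unmodified pointers pass. */
  int check_ptr_off_sketch(const struct reg_sketch *reg)
  {
          if (reg->off)
                  return -EACCES;  /* "dereference of modified ... ptr ... disallowed" */
          if (!reg->var_off_is_const)
                  return -EACCES;  /* "variable ... access var_off=... disallowed" */
          return 0;
  }

  int main(void)
  {
          struct reg_sketch modified = { .off = 8, .var_off_is_const = true };
          struct reg_sketch clean    = { .off = 0, .var_off_is_const = true };

          return (check_ptr_off_sketch(&modified) == -EACCES &&
                  check_ptr_off_sketch(&clean) == 0) ? 0 : 1;
  }

The later patches in this series reuse the same helper for additional pointer types, which is why the new name drops the 'ctx' reference.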
Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov [5.10 - Adjust context] Signed-off-by: Suraj Jitindar Singh --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/btf.c | 2 +- kernel/bpf/verifier.c | 21 +++++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d47f127fcf6e9..b0343efecce5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -497,8 +497,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5e13f6a8b9b73..7d4a7b0b7deec 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5217,7 +5217,7 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ptr_off_reg(env, ®[i + 1], i + 1)) goto out; continue; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e0ff32adabc4..2100ee350fe0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3467,16 +3467,16 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. 
*/ if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3484,7 +3484,8 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } @@ -3922,7 +3923,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; @@ -4657,7 +4658,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; } @@ -8405,7 +8406,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; From d07434af8866ea255712260bd2b70e11b9e71527 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Jan 2022 13:58:36 +0000 Subject: [PATCH 388/737] bpf: Mark PTR_TO_FUNC register initially with zero offset [ Upstream commit d400a6cf1c8a57cdf10f35220ead3284320d85ff ] Similar as with other pointer types where we use ldimm64, clear the register content to zero first, and then populate the PTR_TO_FUNC type and subprogno number. Currently this is not done, and leads to reuse of stale register tracking data. Given for special ldimm64 cases we always clear the register offset, make it common for all cases, so it won't be forgotten in future. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2100ee350fe0c..c004b9225b14e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8284,9 +8284,13 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. 
+ */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: @@ -8304,7 +8308,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { From 31ccfc7694e0eb8e73e7b10f0eba98c8b7be351b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:40:40 +0000 Subject: [PATCH 389/737] bpf: Generally fix helper register offset check [ Upstream commit 6788ab23508bddb0a9d88e104284922cb2c22b77 ] Right now the assertion on check_ptr_off_reg() is only enforced for register types PTR_TO_CTX (and open coded also for PTR_TO_BTF_ID), however, this is insufficient since many other PTR_TO_* register types such as PTR_TO_FUNC do not handle/expect register offsets when passed to helper functions. Given this can slip-through easily when adding new types, make this an explicit allow-list and reject all other current and future types by default if this is encountered. Also, extend check_ptr_off_reg() to handle PTR_TO_BTF_ID as well instead of duplicating it. For PTR_TO_BTF_ID, reg->off is used for BTF to match expected BTF ids if struct offset is used. This part still needs to be allowed, but the dynamic off from the tnum must be rejected. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c004b9225b14e..3c9c089a64769 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3467,14 +3467,15 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form. 
*/ - if (reg->off) { + if (!fixed_off_ok && reg->off) { verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", reg_type_str(env, reg->type), regno, reg->off); return -EACCES; @@ -3492,6 +3493,12 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4597,12 +4604,6 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, kernel_type_name(*arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } return 0; @@ -4657,10 +4658,26 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ptr_off_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + break; + /* All the rest must be rejected: */ + default: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; } skip_type_check: From be89cde571c1252fa2a804bae872a940d976bb7a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Jan 2022 14:43:41 +0000 Subject: [PATCH 390/737] bpf: Fix out of bounds access for ringbuf helpers [ Upstream commit 64620e0a1e712a778095bd35cbb277dc2259281f ] Both bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument. They both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. Meaning, after a NULL check in the code, the verifier will promote the register type in the non-NULL branch to a PTR_TO_MEM and in the NULL branch to a known zero scalar. Generally, pointer arithmetic on PTR_TO_MEM is allowed, so the latter could have an offset. The ARG_PTR_TO_ALLOC_MEM expects a PTR_TO_MEM register type. However, the non- zero result from bpf_ringbuf_reserve() must be fed into either bpf_ringbuf_submit() or bpf_ringbuf_discard() but with the original offset given it will then read out the struct bpf_ringbuf_hdr mapping. The verifier missed to enforce a zero offset, so that out of bounds access can be triggered which could be used to escalate privileges if unprivileged BPF was enabled (disabled by default in kernel). Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: (SecCoder Security Lab) Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c9c089a64769..4a15e47b9f3a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4670,9 +4670,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. 
+ */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; break; /* All the rest must be rejected: */ default: +force_off_check: err = __check_ptr_off_reg(env, reg, regno, type == PTR_TO_BTF_ID); if (err < 0) From ddbb1601dff54a1e1d68280ee3b977e55ce031fe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jan 2022 11:11:30 +0000 Subject: [PATCH 391/737] bpf: Fix ringbuf memory type confusion when passing to helpers [ Upsteam commit a672b2e36a648afb04ad3bda93b6bda947a479a5 ] The bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument, and thus both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now only enforce a register type of PTR_TO_MEM. This can lead to potential type confusion since it would allow other PTR_TO_MEM memory to be passed into the two sinks which did not come from bpf_ringbuf_reserve(). Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that: - bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC - bpf_ringbuf_submit() and bpf_ringbuf_discard() only take PTR_TO_MEM | MEM_ALLOC but not plain PTR_TO_MEM arguments via ARG_PTR_TO_ALLOC_MEM - however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain PTR_TO_MEM to populate the memory area when they use ARG_PTR_TO_{UNINIT_,}MEM in their func proto description Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/verifier.c | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 629f219b601d4..98a59a5110a8c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,7 +273,12 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_RDONLY, + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. + */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, }; /* Max number of base types. */ @@ -352,7 +357,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. 
Its purpose is to ensure the enum is diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a15e47b9f3a6..f58b8506ddf33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -523,6 +523,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, if (type & MEM_RDONLY) strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -4481,6 +4483,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, + PTR_TO_MEM | MEM_ALLOC, PTR_TO_BUF, }, }; @@ -4497,7 +4500,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -4667,6 +4670,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: From 9391dbf9c02a2be11c00d1a3a9f72307f7a59ab8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Jan 2022 11:33:34 -0800 Subject: [PATCH 392/737] bpf, selftests: Add verifier test for mem_or_null register with offset. 
[ Upstream commit ca796fe66f7fceff17679ee6cc5fe4b4023de44d ] Add a new test case with mem_or_null typed register with off > 0 to ensure it gets rejected by the verifier: # ./test_verifier 1011 #1009/u check with invalid reg offset 0 OK #1009/p check with invalid reg offset 0 OK Summary: 2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/spill_fill.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 0b943897aaf6c..baccfa341516e 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -58,6 +58,34 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check with invalid reg offset 0", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* add invalid offset to memory or NULL */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + /* should not be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R0 pointer arithmetic on alloc_mem_or_null prohibited", +}, { "check corrupted spill/fill", .insns = { From d94e2d89930c470e82b333bfc60c8eafb084968d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 13:44:18 +0000 Subject: [PATCH 393/737] bpf, selftests: Add various ringbuf tests with invalid offset [ Upstream commit 722e4db3ae0d52b2e3801280afbe19cf2d188e91 ] Assert that the verifier is rejecting invalid offsets on the ringbuf entries: # ./test_verifier | grep ring #947/u ringbuf: invalid reservation offset 1 OK #947/p ringbuf: invalid reservation offset 1 OK #948/u ringbuf: invalid reservation offset 2 OK #948/p ringbuf: invalid reservation offset 2 OK Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- .../testing/selftests/bpf/verifier/ringbuf.c | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/ringbuf.c diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c new file mode 100644 index 0000000000000..e26dccd188c22 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -0,0 +1,64 @@ +{ + "ringbuf: invalid reservation offset 1", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back 
in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xcafe), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "dereference of modified alloc_mem ptr R1", +}, +{ + "ringbuf: invalid reservation offset 2", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0xcafe), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R7 min value is outside of the allowed memory range", +}, From afa7ee22743638877af7ec908d9af320cb6fa86d Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Wed, 25 May 2022 15:36:38 -0700 Subject: [PATCH 394/737] mm/migrate: Don't drop mapping lock in unmap_and_move_huge_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In unmap_and_move_huge_page() when unmapping a shared huge page the lock is taken, dropped again and then reacquired in remove_migration_ptes(). This can lead to a deadlock when a hugepage is being migrated while also being accessed and a process with it mapped is exiting. Leading to the following stack traces being observed. 
Process Accessing Huge Page: (Process A) Call trace:  __switch_to+0xc0/0xec  __schedule+0x27c/0x6e0  schedule+0x54/0xe0  io_schedule+0x48/0x68  wait_on_page_bit_common+0x158/0x440  put_and_wait_on_page_locked+0x60/0x80  __migration_entry_wait+0x148/0x164  migration_entry_wait_huge+0x78/0x84  hugetlb_fault+0x464/0x594  handle_mm_fault+0x1b0/0x240  do_page_fault+0x154/0x420  do_translation_fault+0xbc/0xe0  do_mem_abort+0x4c/0xb8  el0_da+0x3c/0x50  el0_sync_handler+0xe0/0x120 Process Migrating the Huge Page: (Process B) Call trace:  __switch_to+0xc0/0xec  __schedule+0x27c/0x6e0  schedule+0x54/0xe0  rwsem_down_read_slowpath+0x190/0x5c0  down_read+0x68/0x80  rmap_walk_file+0x1a4/0x280  rmap_walk+0x58/0x88  unmap_and_move_huge_page+0x1e8/0x3c0  migrate_pages+0x9c/0x4a4  do_migrate_range.isra.0+0x24c/0x41c  offline_pages+0x374/0x460  memory_subsys_offline+0x104/0x140  device_offline+0x100/0x13c  offline_and_remove_memory+0x88/0xd4  remove_store+0x80/0xf0  dev_attr_store+0x24/0x40  sysfs_kf_write+0x50/0x60  kernfs_fop_write_iter+0x134/0x1c4  new_sync_write+0xf0/0x190  vfs_write+0x22c/0x2c0  ksys_write+0x74/0x100  __arm64_sys_write+0x28/0x40  el0_svc_common.constprop.0+0x80/0x1cc  do_el0_svc+0x30/0x98  el0_svc+0x20/0x3c  el0_sync_handler+0x9c/0x120 Process Exiting: (Process C) Call trace:  __switch_to+0x80/0xa8  __schedule+0x27c/0x6e0  schedule+0x54/0xe0  rwsem_down_write_slowpath+0x320/0x950  down_write+0x7c/0x8c  unlink_file_vma+0x3c/0xd0  free_pgtables+0xa0/0x140  exit_mmap+0xe4/0x1a0  __mmput+0x44/0x194  mmput+0x6c/0x80  exit_mm+0x178/0x240  do_exit+0x1ac/0x440  do_group_exit+0x44/0xac  __wake_up_parent+0x0/0x3c  el0_svc_common.constprop.0+0x88/0x248  do_el0_svc+0x30/0x98  el0_svc+0x20/0x3c  el0_sync_handler+0x9c/0x120 Process A takes the mapping lock and then is waiting on the page lock. Process B has the page lock and is waiting on the mapping lock. Process C is waiting on the mapping lock. Remove this deadlock potential by having Process B hold onto the mapping lock (which it initially takes with a trylock()) rather than dropping it then trying to take it again. This means that if it can't get it Process A and C can still make progress. NOTE: Upstream this is being fixed by a rework of the code by the following series: https://lore.kernel.org/linux-mm/20220508183420.18488-2-mike.kravetz@oracle.com/ Signed-off-by: Suraj Jitindar Singh --- mm/migrate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index fcb7eb6a6ecae..c0e00735df37a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1282,6 +1282,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = -EAGAIN; int page_was_mapped = 0; + bool mapping_locked = false; struct page *new_hpage; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -1332,7 +1333,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - bool mapping_locked = false; enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; if (!PageAnon(hpage)) { @@ -1352,17 +1352,17 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, ttu); page_was_mapped = 1; - - if (mapping_locked) - i_mmap_unlock_write(mapping); } if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, mode); - if (page_was_mapped) + if (page_was_mapped) { remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); + rc == MIGRATEPAGE_SUCCESS ? 
new_hpage : hpage, mapping_locked); + if (mapping_locked) + i_mmap_unlock_write(mapping); + } unlock_put_anon: unlock_page(new_hpage); From 4c761d59d9c70c0518c404824013df94dc8eb491 Mon Sep 17 00:00:00 2001 From: Hailey Date: Fri, 3 Jun 2022 17:18:58 +0000 Subject: [PATCH 395/737] enable rfc4106(gcm(aes)) for fips This alogrithim works with no additional changes required and has been requested by a customer, so enable it --- crypto/testmgr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 51c99630c61ae..eef9142bfec93 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5234,6 +5234,7 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "rfc4106(gcm(aes))", .generic_driver = "rfc4106(gcm_base(ctr(aes-generic),ghash-generic))", .test = alg_test_aead, + .fips_allowed = 1, .suite = { .aead = { ____VECS(aes_gcm_rfc4106_tv_template), From f506c14100e2420b4b86c1b413bb7fb0b43fc77b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 8 Feb 2022 09:43:33 +0000 Subject: [PATCH 396/737] sched/fair: Improve consistency of allowed NUMA balance calculations There are inconsistencies when determining if a NUMA imbalance is allowed that should be corrected. o allow_numa_imbalance changes types and is not always examining the destination group so both the type should be corrected as well as the naming. o find_idlest_group uses the sched_domain's weight instead of the group weight which is different to find_busiest_group o find_busiest_group uses the source group instead of the destination which is different to task_numa_find_cpu o Both find_idlest_group and find_busiest_group should account for the number of running tasks if a move was allowed to be consistent with task_numa_find_cpu Fixes: 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes") Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20220208094334.16379-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96b6fe8cc35bf..7435fbea26868 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9092,9 +9092,10 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +static inline bool +allow_numa_imbalance(unsigned int running, unsigned int weight) { - return (dst_running < (dst_weight >> 2)); + return (running < (weight >> 2)); } /* @@ -9224,12 +9225,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. 
*/ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight)) return NULL; } @@ -9448,7 +9450,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, local->group_weight); } return; From 016a499ab7cc5e2277d72f8063c3926e462cf55e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 8 Feb 2022 09:43:34 +0000 Subject: [PATCH 397/737] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes") allowed an imbalance between NUMA nodes such that communicating tasks would not be pulled apart by the load balancer. This works fine when there is a 1:1 relationship between LLC and node but can be suboptimal for multiple LLCs if independent tasks prematurely use CPUs sharing cache. Zen* has multiple LLCs per node with local memory channels and due to the allowed imbalance, it's far harder to tune some workloads to run optimally than it is on hardware that has 1 LLC per node. This patch allows an imbalance to exist up to the point where LLCs should be balanced between nodes. On a Zen3 machine running STREAM parallelised with OMP to have on instance per LLC the results and without binding, the results are 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v6 MB/sec copy-16 162596.94 ( 0.00%) 580559.74 ( 257.05%) MB/sec scale-16 136901.28 ( 0.00%) 374450.52 ( 173.52%) MB/sec add-16 157300.70 ( 0.00%) 564113.76 ( 258.62%) MB/sec triad-16 151446.88 ( 0.00%) 564304.24 ( 272.61%) STREAM can use directives to force the spread if the OpenMP is new enough but that doesn't help if an application uses threads and it's not known in advance how many threads will be created. Coremark is a CPU and cache intensive benchmark parallelised with threads. When running with 1 thread per core, the vanilla kernel allows threads to contend on cache. With the patch; 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v5 Min Score-16 368239.36 ( 0.00%) 389816.06 ( 5.86%) Hmean Score-16 388607.33 ( 0.00%) 427877.08 * 10.11%* Max Score-16 408945.69 ( 0.00%) 481022.17 ( 17.62%) Stddev Score-16 15247.04 ( 0.00%) 24966.82 ( -63.75%) CoeffVar Score-16 3.92 ( 0.00%) 5.82 ( -48.48%) It can also make a big difference for semi-realistic workloads like specjbb which can execute arbitrary numbers of threads without advance knowledge of how they should be placed. Even in cases where the average performance is neutral, the results are more stable. 
5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v6 Hmean tput-1 71631.55 ( 0.00%) 73065.57 ( 2.00%) Hmean tput-8 582758.78 ( 0.00%) 556777.23 ( -4.46%) Hmean tput-16 1020372.75 ( 0.00%) 1009995.26 ( -1.02%) Hmean tput-24 1416430.67 ( 0.00%) 1398700.11 ( -1.25%) Hmean tput-32 1687702.72 ( 0.00%) 1671357.04 ( -0.97%) Hmean tput-40 1798094.90 ( 0.00%) 2015616.46 * 12.10%* Hmean tput-48 1972731.77 ( 0.00%) 2333233.72 ( 18.27%) Hmean tput-56 2386872.38 ( 0.00%) 2759483.38 ( 15.61%) Hmean tput-64 2909475.33 ( 0.00%) 2925074.69 ( 0.54%) Hmean tput-72 2585071.36 ( 0.00%) 2962443.97 ( 14.60%) Hmean tput-80 2994387.24 ( 0.00%) 3015980.59 ( 0.72%) Hmean tput-88 3061408.57 ( 0.00%) 3010296.16 ( -1.67%) Hmean tput-96 3052394.82 ( 0.00%) 2784743.41 ( -8.77%) Hmean tput-104 2997814.76 ( 0.00%) 2758184.50 ( -7.99%) Hmean tput-112 2955353.29 ( 0.00%) 2859705.09 ( -3.24%) Hmean tput-120 2889770.71 ( 0.00%) 2764478.46 ( -4.34%) Hmean tput-128 2871713.84 ( 0.00%) 2750136.73 ( -4.23%) Stddev tput-1 5325.93 ( 0.00%) 2002.53 ( 62.40%) Stddev tput-8 6630.54 ( 0.00%) 10905.00 ( -64.47%) Stddev tput-16 25608.58 ( 0.00%) 6851.16 ( 73.25%) Stddev tput-24 12117.69 ( 0.00%) 4227.79 ( 65.11%) Stddev tput-32 27577.16 ( 0.00%) 8761.05 ( 68.23%) Stddev tput-40 59505.86 ( 0.00%) 2048.49 ( 96.56%) Stddev tput-48 168330.30 ( 0.00%) 93058.08 ( 44.72%) Stddev tput-56 219540.39 ( 0.00%) 30687.02 ( 86.02%) Stddev tput-64 121750.35 ( 0.00%) 9617.36 ( 92.10%) Stddev tput-72 223387.05 ( 0.00%) 34081.13 ( 84.74%) Stddev tput-80 128198.46 ( 0.00%) 22565.19 ( 82.40%) Stddev tput-88 136665.36 ( 0.00%) 27905.97 ( 79.58%) Stddev tput-96 111925.81 ( 0.00%) 99615.79 ( 11.00%) Stddev tput-104 146455.96 ( 0.00%) 28861.98 ( 80.29%) Stddev tput-112 88740.49 ( 0.00%) 58288.23 ( 34.32%) Stddev tput-120 186384.86 ( 0.00%) 45812.03 ( 75.42%) Stddev tput-128 78761.09 ( 0.00%) 57418.48 ( 27.10%) Similarly, for embarassingly parallel problems like NPB-ep, there are improvements due to better spreading across LLC when the machine is not fully utilised. vanilla sched-numaimb-v6 Min ep.D 31.79 ( 0.00%) 26.11 ( 17.87%) Amean ep.D 31.86 ( 0.00%) 26.17 * 17.86%* Stddev ep.D 0.07 ( 0.00%) 0.05 ( 24.41%) CoeffVar ep.D 0.22 ( 0.00%) 0.20 ( 7.97%) Max ep.D 31.93 ( 0.00%) 26.21 ( 17.91%) Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. 
Shenoy Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20220208094334.16379-3-mgorman@techsingularity.net --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 22 +++++++------- kernel/sched/topology.c | 53 ++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9ef7bf686a9f7..bf1c656c3be0a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -86,6 +86,7 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7435fbea26868..7469c812623ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1546,6 +1546,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1561,7 +1562,7 @@ static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1942,7 +1943,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -2007,8 +2008,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -9092,10 +9095,9 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool -allow_numa_imbalance(unsigned int running, unsigned int weight) +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) { - return (running < (weight >> 2)); + return running <= imb_numa_nr; } /* @@ -9231,7 +9233,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * allowed. If there is a real need of migration, * periodic load balance will take care of it. 
*/ - if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -9336,9 +9338,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd #define NUMA_IMBALANCE_MIN 2 static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) + int dst_running, int imb_numa_nr) { - if (!allow_numa_imbalance(dst_running, dst_weight)) + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) return imbalance; /* @@ -9450,7 +9452,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - local->sum_nr_running + 1, local->group_weight); + local->sum_nr_running + 1, env->sd->imb_numa_nr); } return; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ff2c6d3ba6c79..94f1e6299aa19 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2035,6 +2035,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain *top, *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 25% of the node. This is an + * arbitrary cutoff based on SMT-2 to balance + * between memory bandwidth and avoiding + * premature sharing of HT resources and SMT-4 + * or SMT-8 *may* benefit from a different + * cutoff. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 2; + else + imb = nr_llcs; + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top = sd; + top_p = top->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top = top->parent; + top_p = top->parent; + } + imb_span = top_p ? 
top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) From 37bee5286a6f99da58819dbfb56c253c4f0c3fa2 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Mon, 27 Jun 2022 16:53:28 -0700 Subject: [PATCH 398/737] ENA: Update to v2.7.3 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.7.3 release notes **Changes** * Make AF XDP native support experimental * Update supported distributions documentation ## r2.7.2 release notes **Bug Fixes** * Fix compilation for SLES 15 SP3 * Fix compilation for RHEL 8.6 * Fix wrong value check in copybreak sysfs code **Minor Changes** * Provide more information on TX timeouts * Use the same interrupt moderation value for both TX and RX In XDP TX/REDIRECT channels Signed-off-by: Suraj Jitindar Singh --- drivers/amazon/net/ena/Makefile | 4 + drivers/amazon/net/ena/ena_com.c | 5 - drivers/amazon/net/ena/ena_com.h | 5 +- drivers/amazon/net/ena/ena_eth_com.h | 25 ----- drivers/amazon/net/ena/ena_ethtool.c | 2 +- drivers/amazon/net/ena/ena_netdev.c | 134 ++++++++++++++++++++----- drivers/amazon/net/ena/ena_netdev.h | 2 +- drivers/amazon/net/ena/ena_regs_defs.h | 1 + drivers/amazon/net/ena/ena_sysfs.c | 7 +- drivers/amazon/net/ena/ena_xdp.c | 5 +- drivers/amazon/net/ena/kcompat.h | 31 ++++-- 11 files changed, 149 insertions(+), 72 deletions(-) diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index aa212758c796c..37106200d6c19 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -10,3 +10,7 @@ ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_xdp.o dim.o ena_devlink.o \ net_dim.o ena_com.o ena_eth_com.o ena-$(CONFIG_SYSFS) += ena_sysfs.o + +ifdef TEST_AF_XDP + ccflags-y += -DENA_TEST_AF_XDP +endif diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index f9dbcf24a753b..07d5d5eb5676b 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -1426,11 +1426,6 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev, io_cq->unmask_reg = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + cmd_completion.cq_interrupt_unmask_register_offset); - if (cmd_completion.cq_head_db_register_offset) - io_cq->cq_head_db_reg = - (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + - cmd_completion.cq_head_db_register_offset); - if (cmd_completion.numa_node_register_offset) io_cq->numa_node_cfg_reg = (u32 __iomem *)((uintptr_t)ena_dev->reg_bar + diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 6b085c54685f6..555cb822bbb1c 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -110,8 +110,6 @@ struct ena_com_io_cq { /* Interrupt unmask register */ u32 __iomem *unmask_reg; - /* The completion queue head doorbell register */ - u32 __iomem *cq_head_db_reg; /* numa configuration register (for TPH) */ u32 __iomem *numa_node_cfg_reg; @@ -119,7 +117,7 @@ struct ena_com_io_cq { /* The value to write to the above register to unmask * the interrupt of this queue */ - u32 msix_vector; + u32 msix_vector ____cacheline_aligned; enum queue_direction direction; @@ -135,7 +133,6 @@ struct ena_com_io_cq { /* Device queue index */ u16 idx; u16 head; - u16 last_head_update; u8 phase; u8 cdesc_entry_size_in_bytes; diff --git a/drivers/amazon/net/ena/ena_eth_com.h 
b/drivers/amazon/net/ena/ena_eth_com.h index 689313ee25a80..91207c657b73a 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -8,9 +8,6 @@ #include "ena_com.h" -/* head update threshold in units of (queue size / ENA_COMP_HEAD_THRESH) */ -#define ENA_COMP_HEAD_THRESH 4 - struct ena_com_tx_ctx { struct ena_com_tx_meta ena_meta; struct ena_com_buf *ena_bufs; @@ -168,28 +165,6 @@ static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq) return 0; } -static inline int ena_com_update_dev_comp_head(struct ena_com_io_cq *io_cq) -{ - u16 unreported_comp, head; - bool need_update; - - if (unlikely(io_cq->cq_head_db_reg)) { - head = io_cq->head; - unreported_comp = head - io_cq->last_head_update; - need_update = unreported_comp > (io_cq->q_depth / ENA_COMP_HEAD_THRESH); - - if (unlikely(need_update)) { - netdev_dbg(ena_com_io_cq_to_ena_dev(io_cq)->net_device, - "Write completion queue doorbell for queue %d: head: %d\n", - io_cq->qid, head); - writel(head, io_cq->cq_head_db_reg); - io_cq->last_head_update = head; - } - } - - return 0; -} - static inline void ena_com_update_numa_node(struct ena_com_io_cq *io_cq, u8 numa_node) { diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 4c387d0b6be35..b3bf0836a2c5b 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -1060,7 +1060,7 @@ static int ena_set_tunable(struct net_device *netdev, switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: len = *(u32 *)data; - if (len > adapter->netdev->mtu) { + if (len > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) { ret = -EINVAL; break; } diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 36421b2684d3e..096677a2e8bfb 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -89,27 +89,85 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, struct net_device *netdev); -#ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) -#else -static void ena_tx_timeout(struct net_device *dev) -#endif { + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_OS_NETDEV_WD; struct ena_adapter *adapter = netdev_priv(dev); + unsigned int time_since_last_napi, threshold; + struct ena_ring *tx_ring; + int napi_scheduled; + + if (txqueue >= adapter->num_io_queues) { + netdev_err(dev, "TX timeout on invalid queue %u\n", txqueue); + goto schedule_reset; + } + + threshold = jiffies_to_usecs(dev->watchdog_timeo); + tx_ring = &adapter->tx_ring[txqueue]; + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + napi_scheduled = !!(tx_ring->napi->state & NAPIF_STATE_SCHED); + + netdev_err(dev, + "TX q %d is paused for too long (threshold %u). Time since last napi %u usec. 
napi scheduled: %d\n", + txqueue, + threshold, + time_since_last_napi, + napi_scheduled); + + if (threshold < time_since_last_napi && napi_scheduled) { + netdev_err(dev, + "napi handler hasn't been called for a long time but is scheduled\n"); + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + } +schedule_reset: /* Change the state of the device to trigger reset * Check that we are not in the middle or a trigger already */ + if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) + return; + + ena_reset_device(adapter, reset_reason); + ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); +} +#ifndef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER +/* This function is called by the kernel's watchdog and indicates that the queue + * has been closed longer than dev->watchdog_timeo value allows. + * In older kernels the called function doesn't contain the id of the queue + * that's been closed for too long. This helper function retrieves this + * information + */ +static void ena_find_and_timeout_queue(struct net_device *dev) +{ + struct ena_adapter *adapter = netdev_priv(dev); + unsigned long trans_start; + struct netdev_queue *txq; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + trans_start = txq->trans_start; + if (netif_xmit_stopped(txq) && + time_after(jiffies, (trans_start + dev->watchdog_timeo))) { + ena_tx_timeout(dev, i); + return; + } + } + + netdev_warn(dev, "timeout was called, but no offending queue was found\n"); + + /* Change the state of the device to trigger reset + * Check that we are not in the middle or a trigger already + */ if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) return; ena_reset_device(adapter, ENA_REGS_RESET_OS_NETDEV_WD); ena_increase_stat(&adapter->dev_stats.tx_timeout, 1, &adapter->syncp); - - netif_err(adapter, tx_err, dev, "Transmit time out\n"); } +#endif static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu) { int i; @@ -955,7 +1013,6 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) tx_ring->next_to_clean = next_to_clean; ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); - ena_com_update_dev_comp_head(tx_ring->ena_com_io_cq); if (tx_ring->enable_bql) netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); @@ -1450,10 +1507,8 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ENA_RX_REFILL_THRESH_PACKET); /* Optimization, try to batch new rx buffers */ - if (refill_required > refill_threshold) { - ena_com_update_dev_comp_head(rx_ring->ena_com_io_cq); + if (refill_required > refill_threshold) ena_refill_rx_bufs(rx_ring, refill_required); - } #ifdef ENA_XDP_SUPPORT if (xdp_flags & ENA_XDP_REDIRECT) @@ -1512,7 +1567,11 @@ void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring) { struct ena_eth_io_intr_reg intr_reg; +#ifdef ENA_XDP_SUPPORT + u32 rx_interval = tx_ring->smoothed_interval; +#else u32 rx_interval = 0; +#endif /* Rx ring can be NULL when for XDP tx queues which don't have an * accompanying rx_ring pair. 
*/ @@ -3271,7 +3330,11 @@ static const struct net_device_ops ena_netdev_ops = { #else .ndo_get_stats = ena_get_stats, #endif +#ifdef HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER .ndo_tx_timeout = ena_tx_timeout, +#else + .ndo_tx_timeout = ena_find_and_timeout_queue, +#endif .ndo_change_mtu = ena_change_mtu, .ndo_set_mac_address = NULL, #ifdef HAVE_SET_RX_MODE @@ -3284,9 +3347,9 @@ static const struct net_device_ops ena_netdev_ops = { #ifdef ENA_XDP_SUPPORT .ndo_bpf = ena_xdp, .ndo_xdp_xmit = ena_xdp_xmit, -#ifdef ENA_AF_XDP_SUPPORT +#if defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) .ndo_xsk_wakeup = ena_xdp_xsk_wakeup, -#endif /* ENA_AF_XDP_SUPPORT */ +#endif /* defined(ENA_TEST_AF_XDP) && defined(ENA_AF_XDP_SUPPORT) */ #endif /* ENA_XDP_SUPPORT */ }; @@ -3395,6 +3458,7 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, bool *wd_state) { struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; struct ena_llq_configurations llq_config; netdev_features_t prev_netdev_features; struct device *dev = &pdev->dev; @@ -3502,7 +3566,7 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq, &llq_config); if (rc) { - dev_err(dev, "ENA device init failed\n"); + netdev_err(netdev, "Cannot set queues placement policy rc= %d\n", rc); goto err_admin_init; } @@ -3721,14 +3785,18 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct ena_ring *tx_ring) { struct ena_napi *ena_napi = container_of(tx_ring->napi, struct ena_napi, napi); + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; unsigned int time_since_last_napi; unsigned int missing_tx_comp_to; bool is_tx_comp_time_expired; struct ena_tx_buffer *tx_buf; unsigned long last_jiffies; + int napi_scheduled; u32 missed_tx = 0; int i, rc = 0; + missing_tx_comp_to = jiffies_to_msecs(adapter->missing_tx_completion_to); + for (i = 0; i < tx_ring->ring_size; i++) { tx_buf = &tx_ring->tx_buffer_info[i]; last_jiffies = tx_buf->last_jiffies; @@ -3755,25 +3823,45 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, adapter->missing_tx_completion_to); if (unlikely(is_tx_comp_time_expired)) { - if (!tx_buf->print_once) { - time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); - missing_tx_comp_to = jiffies_to_msecs(adapter->missing_tx_completion_to); - netif_notice(adapter, tx_err, adapter->netdev, - "Found a Tx that wasn't completed on time, qid %d, index %d. %u usecs have passed since last napi execution. Missing Tx timeout value %u msecs\n", - tx_ring->qid, i, time_since_last_napi, missing_tx_comp_to); + + time_since_last_napi = jiffies_to_usecs(jiffies - tx_ring->tx_stats.last_napi_jiffies); + napi_scheduled = !!(ena_napi->napi.state & NAPIF_STATE_SCHED); + + if (missing_tx_comp_to < time_since_last_napi && napi_scheduled) { + /* We suspect napi isn't called because the + * bottom half is not run. Require a bigger + * timeout for these cases + */ + if (!time_is_before_jiffies(last_jiffies + + 2 * adapter->missing_tx_completion_to)) + continue; + + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } - tx_buf->print_once = 1; missed_tx++; + + if (tx_buf->print_once) + continue; + + netif_notice(adapter, tx_err, adapter->netdev, + "TX hasn't completed, qid %d, index %d. 
%u usecs from last napi execution, napi scheduled: %d\n", + tx_ring->qid, i, time_since_last_napi, napi_scheduled); + + tx_buf->print_once = 1; } } if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) { netif_err(adapter, tx_err, adapter->netdev, - "The number of lost tx completions is above the threshold (%d > %d). Reset the device\n", + "Lost TX completions are above the threshold (%d > %d). Completion transmission timeout: %u.\n", missed_tx, - adapter->missing_tx_completion_threshold); - ena_reset_device(adapter, ENA_REGS_RESET_MISS_TX_CMPL); + adapter->missing_tx_completion_threshold, + missing_tx_comp_to); + netif_err(adapter, tx_err, adapter->netdev, + "Resetting the device\n"); + + ena_reset_device(adapter, reset_reason); rc = -EIO; } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index b6a2332d986bb..be7e50da737ab 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -26,7 +26,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 7 -#define DRV_MODULE_GEN_SUBMINOR 1 +#define DRV_MODULE_GEN_SUBMINOR 3 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index 568b26185fe9d..8ca6f795a0fd0 100755 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -21,6 +21,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_USER_TRIGGER = 12, ENA_REGS_RESET_GENERIC = 13, ENA_REGS_RESET_MISS_INTERRUPT = 14, + ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, ENA_REGS_RESET_LAST, }; diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c index 53b8d84ddcc36..0c3451b60a2fe 100755 --- a/drivers/amazon/net/ena/ena_sysfs.c +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -26,7 +26,7 @@ static ssize_t ena_store_rx_copybreak(struct device *dev, if (err < 0) return err; - if (len > adapter->netdev->mtu) + if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) return -EINVAL; rtnl_lock(); @@ -41,12 +41,15 @@ static ssize_t ena_store_rx_copybreak(struct device *dev, return len; } +#define ENA_RX_COPYBREAK_STR_MAX_LEN 7 + static ssize_t ena_show_rx_copybreak(struct device *dev, struct device_attribute *attr, char *buf) { struct ena_adapter *adapter = dev_get_drvdata(dev); - return sprintf(buf, "%d\n", adapter->rx_copybreak); + return snprintf(buf, ENA_RX_COPYBREAK_STR_MAX_LEN, "%d\n", + adapter->rx_copybreak); } static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c index d06c0f50998af..9296be230fd38 100644 --- a/drivers/amazon/net/ena/ena_xdp.c +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -611,7 +611,6 @@ static bool ena_clean_xdp_irq(struct ena_ring *tx_ring, u32 budget) tx_ring->next_to_clean = next_to_clean; ena_com_comp_ack(tx_ring->ena_com_io_sq, total_done); - ena_com_update_dev_comp_head(tx_ring->ena_com_io_cq); netif_dbg(tx_ring->adapter, tx_done, tx_ring->netdev, "tx_poll: q %d done. 
total pkts: %d\n", @@ -877,10 +876,8 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, ENA_RX_REFILL_THRESH_PACKET); /* Optimization, try to batch new rx buffers */ - if (refill_required > refill_threshold) { - ena_com_update_dev_comp_head(rx_ring->ena_com_io_cq); + if (refill_required > refill_threshold) ena_refill_rx_bufs(rx_ring, refill_required); - } if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { if (likely(rc || work_done < budget)) { diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index f6f930e2bf19e..38af7b173de14 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -680,7 +680,8 @@ do { \ #if defined(CONFIG_BPF) && LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) #define ENA_XDP_SUPPORT -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5,8,0)) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) #define XDP_HAS_FRAME_SZ #define XDP_CONVERT_TO_FRAME_NAME_CHANGED #endif @@ -692,7 +693,9 @@ do { \ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) || \ - (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 3))) + (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 3))) || \ + (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 3) + #define HAVE_NDO_TX_TIMEOUT_STUCK_QUEUE_PARAMETER #endif @@ -884,16 +887,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) - +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { memcpy(dev->dev_addr, addr, ETH_ALEN); } +#endif -#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) */ - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) #define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED #endif @@ -904,4 +907,18 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) #if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) #define ENA_AF_XDP_SUPPORT #endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0) +/* kernels older than 3.3.0 didn't have this function and + * used netif_tx_queue_stopped() for the same purpose + */ +static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) +{ + return netif_tx_queue_stopped(dev_queue); +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +#define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) +#endif #endif /* _KCOMPAT_H_ */ From dfef1f826bf6a7c6061af2359869df3bdba6e837 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 2 Aug 2022 01:41:16 +0000 Subject: [PATCH 399/737] ENA: Update to v2.7.4 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.7.4 release notes **Bug Fixes** * Fix remaining space check in DRB Signed-off-by: Shaoying Xu --- drivers/amazon/net/ena/ena_netdev.c | 16 ++++++++-------- drivers/amazon/net/ena/ena_netdev.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 096677a2e8bfb..55c3c141c81a0 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ 
b/drivers/amazon/net/ena/ena_netdev.c @@ -1074,15 +1074,15 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, return skb; } -static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, - u16 buf_len, u16 len) +static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, u16 buf_len, + u16 len, int pkt_offset) { struct ena_com_buf *ena_buf = &rx_info->ena_buf; /* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer - * for data + headroom + tailroom + * for data + headroom + tailroom. */ - if (SKB_DATA_ALIGN(len) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) { + if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) { page_ref_inc(rx_info->page); rx_info->page_offset += buf_len; ena_buf->paddr += buf_len; @@ -1133,6 +1133,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, rx_info, rx_info->page); buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; page_offset = rx_info->page_offset; buf_addr = page_address(rx_info->page) + page_offset; @@ -1141,8 +1142,6 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, if (unlikely(!skb)) return NULL; - pkt_offset = buf_offset - rx_ring->rx_headroom; - /* sync this buffer for CPU use */ dma_sync_single_for_cpu(rx_ring->dev, dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset, @@ -1171,7 +1170,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, /* If XDP isn't loaded try to reuse part of the RX buffer */ reuse_rx_buf_page = !is_xdp_loaded && - ena_try_rx_buf_page_reuse(rx_info, buf_len, len); + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); if (!reuse_rx_buf_page) ena_unmap_rx_buff(rx_ring, rx_info); @@ -1223,11 +1222,12 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, /* rx_info->buf_offset includes rx_ring->rx_headroom */ buf_offset = rx_info->buf_offset; + pkt_offset = buf_offset - rx_ring->rx_headroom; buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); page_offset = rx_info->page_offset; reuse_rx_buf_page = !is_xdp_loaded && - ena_try_rx_buf_page_reuse(rx_info, buf_len, len); + ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); if (!reuse_rx_buf_page) ena_unmap_rx_buff(rx_ring, rx_info); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index be7e50da737ab..e19cd75be698b 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -26,7 +26,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 7 -#define DRV_MODULE_GEN_SUBMINOR 3 +#define DRV_MODULE_GEN_SUBMINOR 4 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION From f181d9b720bf3ba366b94b1b45f619e2c58ca7be Mon Sep 17 00:00:00 2001 From: Oleg Kiselev Date: Wed, 15 Jun 2022 05:32:59 +0000 Subject: [PATCH 400/737] ext4: reduce computation of overhead during resize This patch avoids doing an O(n**2)-complexity walk through every flex group. Instead, it uses the already computed overhead information for the newly allocated space, and simply adds it to the previously calculated overhead stored in the superblock. This drastically reduces the time taken to resize very large bigalloc filesystems (from 3+ hours for a 64TB fs down to milliseconds). 
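The arithmetic behind the change, as a minimal standalone C sketch (the structure and helper names below are illustrative stand-ins, not the ext4 symbols used in the diff that follows): a full recomputation walks every group on each resize, while the incremental approach only folds the overhead of the newly added capacity into the cached total.

/* Illustrative sketch: full recomputation vs. incremental accounting of
 * filesystem overhead on resize.  'fs_info', 'group_overhead' and the
 * field names are hypothetical, chosen only to mirror the idea.
 */
struct fs_info {
	unsigned long nr_groups;
	unsigned long overhead;		/* cached total overhead */
};

/* O(nr_groups) on every resize: walk all groups and add up their overhead */
static unsigned long recompute_overhead(const struct fs_info *fi,
					unsigned long (*group_overhead)(unsigned long group))
{
	unsigned long i, total = 0;

	for (i = 0; i < fi->nr_groups; i++)
		total += group_overhead(i);
	return total;
}

/* O(1): the newly added capacity contributes (added_blocks - added_free_blocks)
 * of overhead, so fold just that delta into the previously computed value,
 * which is what the bigalloc path in the diff below does.
 */
static void add_resize_overhead(struct fs_info *fi,
				unsigned long added_blocks,
				unsigned long added_free_blocks)
{
	fi->overhead += added_blocks - added_free_blocks;
}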
Signed-off-by: Oleg Kiselev --- fs/ext4/resize.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 51cebc1990eb1..bb4ef2fe3bd2f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,6 +1360,16 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, return err; } +static void ext4_add_overhead(struct super_block *sb, + const ext4_fsblk_t overhead) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sbi->s_overhead += overhead; + smp_wmb(); +} + /* * ext4_update_super() updates the super block so that the newly added * groups can be seen by the filesystem. @@ -1458,9 +1468,17 @@ static void ext4_update_super(struct super_block *sb, } /* - * Update the fs overhead information + * Update the fs overhead information. + * + * For bigalloc, if the superblock already has a properly calculated + * overhead, update it with a value based on numbers already computed + * above for the newly allocated capacity. */ - ext4_calculate_overhead(sb); + if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0)) + ext4_add_overhead(sb, + EXT4_NUM_B2C(sbi, blocks_count - free_blocks)); + else + ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); if (test_opt(sb, DEBUG)) From b589d4a227533bd2c1a85853d71fa8bb585b1140 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 5 Aug 2022 14:53:06 -0700 Subject: [PATCH 401/737] Mitigate unbalanced RETs on vmexit via serialising wrmsr The primary mitigation for "unbalanced" RETs causing guest-directed speculation on processors using eIBRS is the call;int3 sequence below: call 1f int3 1: lea 0x8(%rsp), %rsp lfence The int3 instruction acts as a speculation barrier, and lfence prevents execution of the next RET until that call has retired. However wrmsr is also a speculation barrier and can be used in place of int3. If we ensure that a serialising write (via wrmsr) occurs within a balanced call/ret, followed by an lfence, this also works as a mitigation. To do this always perform a wrmsr if SPEC_CTRL_IBRS is set in vmx_spec_ctrl_restore_host(). The call to native_wrmsrl() to perform the write, followed by lfence within barrier_nospec(), completes the sequence. In general benchmarks this appears to have a significantly lower performance impact compared to the call;int3 sequence. Signed-off-by: Samuel Mendoza-Jonas --- arch/x86/kvm/vmx/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c1f433afe83d6..16271591264e2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6752,9 +6752,12 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, * For legacy IBRS, the IBRS bit always needs to be written after * transitioning from a less privileged predictor mode, regardless of * whether the guest/host values differ. + * + * For eIBRS affected by Post Barrier RSB Predictions a serialising + * instruction (wrmsr) must be executed. */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || - vmx->spec_ctrl != hostval) + vmx->spec_ctrl != hostval || (hostval & SPEC_CTRL_IBRS)) native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); From 60377e058db4746a194e6f01b05d5e6e8767e2c7 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:34 -0800 Subject: [PATCH 402/737] mm/damon: unified access_check function naming rules Patch series "mm/damon: Do some small changes", v4. 
This patch (of 4): In damon/paddr.c file, two functions names start with underscore, static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, struct damon_region *r) static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, struct damon_region *r) In damon/vaddr.c file, there are also two functions with the same function, static void damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) static void damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) It makes sense to keep consistent, and it is not easy to be confused with the function that call them. Link: https://lkml.kernel.org/r/cover.1636989871.git.xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/529054aed932a42b9c09fc9977ad4574b9e7b0bd.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 20a9a9d69eb19..73c5d1aafda6c 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -410,7 +410,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -429,7 +429,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -515,7 +515,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -551,7 +551,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); From b456a32b4ac408e301a9275d3b292eea4309e1dd Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:37 -0800 Subject: [PATCH 403/737] mm/damon: add 'age' of region tracepoint support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Damon, we can get age information by analyzing the nr_access change, But short time sampling is not effective, we have to obtain enough data for analysis through long time trace, this also means that we need to consume more cpu resources and storage space. Now the region add a new 'age' variable, we only need to get the change of age value through a little time trace, for example, age has been increasing to 141, but nr_access shows a value of 0 at the same time, Through this,we can conclude that the region has a very low nr_access value for a long time. 
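For instance, a consumer of the extended tracepoint can spot such regions from a single aggregation record. The sketch below is a hypothetical userspace snippet, assuming only the record payload format added by this patch ("target_id=... nr_regions=... start-end: nr_accesses age"); it is not part of the kernel change and the sample numbers are made up.

#include <stdio.h>

int main(void)
{
	/* example payload of one damon_aggregated record (made-up numbers) */
	const char *payload = "target_id=42 nr_regions=10 4096-16384: 0 141";
	unsigned long target_id, start, end;
	unsigned int nr_regions, nr_accesses, age;

	if (sscanf(payload, "target_id=%lu nr_regions=%u %lu-%lu: %u %u",
		   &target_id, &nr_regions, &start, &end,
		   &nr_accesses, &age) == 6 &&
	    nr_accesses == 0 && age > 100)
		printf("region %lu-%lu: cold for %u aggregation intervals\n",
		       start, end, age);
	return 0;
}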
Link: https://lkml.kernel.org/r/b9def1262af95e0dc1d0caea447886434db01161.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/damon.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 2f422f4f1fb9e..99ffa601e3511 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -22,6 +22,7 @@ TRACE_EVENT(damon_aggregated, __field(unsigned long, start) __field(unsigned long, end) __field(unsigned int, nr_accesses) + __field(unsigned int, age) ), TP_fast_assign( @@ -30,11 +31,13 @@ TRACE_EVENT(damon_aggregated, __entry->start = r->ar.start; __entry->end = r->ar.end; __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; ), - TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u", __entry->target_id, __entry->nr_regions, - __entry->start, __entry->end, __entry->nr_accesses) + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age) ); #endif /* _TRACE_DAMON_H */ From c0f5854c84539af2a7f6911fa1dae644768a814f Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:40 -0800 Subject: [PATCH 404/737] mm/damon/core: use abs() instead of diff_of() In kernel, we can use abs(a - b) to get the absolute value, So there is no need to redefine a new one. Link: https://lkml.kernel.org/r/b24e7b82d9efa90daf150d62dea171e19390ad0b.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: Muchun Song Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index e924978952025..04b8df7fd9e95 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -750,8 +750,6 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -765,13 +763,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { - if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else r->age++; if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else From 141c56284cc13c07e5de6bf44ee844188c0752df Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:44 -0800 Subject: [PATCH 405/737] mm/damon: remove some unneeded function definitions in damon.h In damon.h some func definitions about VA & PA can only be used in its own file, so there no need to define in the header file, and the header file will look cleaner. If other files later need these functions, the prototypes can be added to damon.h at that time. 
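The pattern that makes these declarations unnecessary, shown as a minimal hypothetical sketch (the type and callback names are illustrative; DAMON's real callback table is struct damon_primitive): each address-space backend keeps its operations static and only exports the single *_set_primitives() helper that wires them into the monitoring context.

/* Illustrative sketch of the registration pattern, with made-up names. */
struct my_primitive {
	void (*prepare_access_checks)(void *ctx);
	unsigned int (*check_accesses)(void *ctx);
};

struct my_ctx {
	struct my_primitive primitive;
};

/* backend-internal helpers: static, so no header prototype is needed */
static void backend_prepare_access_checks(void *ctx)
{
	/* arm the access bits for the next sampling interval */
}

static unsigned int backend_check_accesses(void *ctx)
{
	return 0;	/* max nr_accesses seen; 0 in this stub */
}

/* the only symbol the header has to declare */
void backend_set_primitives(struct my_ctx *ctx)
{
	ctx->primitive.prepare_access_checks = backend_prepare_access_checks;
	ctx->primitive.check_accesses = backend_check_accesses;
}

This is why only the *_target_valid() and *_set_primitives() declarations are kept in damon.h by the diff below.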
[sj@kernel.org: remove unnecessary function prototype position changes] Link: https://lkml.kernel.org/r/20211118114827.20052-1-sj@kernel.org Link: https://lkml.kernel.org/r/45fd5b3ef6cce8e28dbc1c92f9dc845ccfc949d7.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 21 --------------------- mm/damon/paddr.c | 11 ++++++----- mm/damon/vaddr.c | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d4be3cc987f..1d1be348f506e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -461,34 +461,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); -int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR - -/* Monitoring primitives for the physical memory address space */ -void damon_pa_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); -int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a496d6f203d64..4318134cbc4c5 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, damon_pa_mkold(r->sampling_addr); } -void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -213,7 +213,7 @@ bool damon_pa_target_valid(void *t) return true; } -int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, +static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { unsigned long addr; @@ -246,8 +246,9 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return 0; } -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + 
struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 73c5d1aafda6c..a9d3b4d96e294 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -272,7 +272,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -292,7 +292,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. */ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -356,7 +357,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -418,7 +419,7 @@ static void __damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -539,7 +540,7 @@ static void __damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -603,7 +604,7 @@ static int damos_madvise(struct damon_target *target, struct damon_region *r, } #endif /* CONFIG_ADVISE_SYSCALLS */ -int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, +static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { int madv_action; @@ -633,8 +634,9 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return damos_madvise(t, r, madv_action); } -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { From 0a01541a162101125c711937ebc75f83e1d4bad5 Mon Sep 17 00:00:00 2001 From: Yihao Han Date: Fri, 14 Jan 2022 14:09:47 -0800 Subject: [PATCH 406/737] mm/damon/vaddr: remove swap_ranges() and replace it with swap() Remove 'swap_ranges()' and replace it with the macro 'swap()' defined in 'include/linux/minmax.h' to simplify code and improve efficiency Link: https://lkml.kernel.org/r/20211111115355.2808-1-hanyihao@vivo.com Signed-off-by: Yihao Han Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a9d3b4d96e294..78ff2bcb66eb3 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -98,16 +98,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by 
two biggest unmapped regions * @@ -146,9 +136,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -159,7 +149,7 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); From eacb4295ddd7381e3e2c45806999a7448039d244 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:50 -0800 Subject: [PATCH 407/737] mm/damon/schemes: add the validity judgment of thresholds In dbgfs "schemes" interface, i do some test like this: # cd /sys/kernel/debug/damon # echo "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" > schemes # cat schemes # 2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3 0 0 There have some unreasonable places, i set the valules of these variables " , , " as "<2, 1>, <2, 1>, <10, 1>, <1, 2, 3>. So there add a validity judgment for these thresholds value. Link: https://lkml.kernel.org/r/d78360e52158d786fcbf20bc62c96785742e76d3.1637239568.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index ad65436756aff..bf36a2756cfb1 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -213,6 +213,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if (!damos_action_valid(action)) goto fail; + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, min_age, max_age, action, "a, &wmarks); From a1b78f36e1642310cb99836cdab2f4b167fc4844 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:53 -0800 Subject: [PATCH 408/737] mm/damon: move damon_rand() definition into damon.h damon_rand() is called in three files:damon/core.c, damon/ paddr.c, damon/vaddr.c, i think there is no need to redefine this twice, So move it to damon.h will be a good choice. Link: https://lkml.kernel.org/r/20211202075859.51341-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ mm/damon/core.c | 4 ---- mm/damon/prmtv-common.h | 4 ---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d1be348f506e..3e91a597a1aad 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,12 +11,16 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). 
diff --git a/mm/damon/core.c b/mm/damon/core.c index 04b8df7fd9e95..61e844d15b13a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -23,9 +22,6 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 61f27037603e1..e790cb5f8fe05 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -6,10 +6,6 @@ */ #include -#include - -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) struct page *damon_get_page(unsigned long pfn); From 7306cbe06fc3696307f7ee0b7047219a603d8662 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:56 -0800 Subject: [PATCH 409/737] mm/damon: modify damon_rand() macro to static inline function damon_rand() cannot be implemented as a macro. Example: damon_rand(a++, b); The value of 'a' will be incremented twice, This is obviously unreasonable, So there fix it. Link: https://lkml.kernel.org/r/110ffcd4e420c86c42b41ce2bc9f0fe6a4f32cd3.1638795127.git.xhao@linux.alibaba.com Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions") Signed-off-by: Xin Hao Reported-by: Andrew Morton Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e91a597a1aad..e2c8152985b7d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -19,7 +19,10 @@ #define DAMOS_MAX_SCORE (99) /* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} /** * struct damon_addr_range - Represents an address region of [@start, @end). From bcb62cf0420bb99f10993a58e1b4f87c4e285a6b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:09:59 -0800 Subject: [PATCH 410/737] mm/damon: convert macro functions to static inline functions Patch series "mm/damon: Misc cleanups". This patchset contains miscellaneous cleanups for DAMON's macro functions and documentation. This patch (of 6): This commit converts macro functions in DAMON to static inline functions, for better type checking, code documentation, etc[1]. 
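A compact illustration of the two macro hazards addressed here and in the damon_rand() change above, namely double evaluation of arguments and the lack of type checking (a standalone example, not taken from the kernel sources):

#include <stdio.h>

#define MAX_MACRO(a, b) ((a) > (b) ? (a) : (b))

static inline int max_inline(int a, int b)
{
	return a > b ? a : b;
}

int main(void)
{
	int x = 1;

	/* the macro expands 'x++' twice when it is the larger argument */
	printf("macro: %d\n", MAX_MACRO(x++, 0));	/* x is now 3 */

	x = 1;
	/* the inline function evaluates each argument exactly once and
	 * its parameters are type-checked like any other function call
	 */
	printf("inline: %d\n", max_inline(x++, 0));	/* x is now 2 */

	return 0;
}

The compiler still inlines such trivial helpers, so the conversion is about type safety and readability rather than generated code.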
[1] https://lore.kernel.org/linux-mm/20211202151213.6ec830863342220da4141bc5@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211209131806.19317-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211209131806.19317-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 18 ++++++++++++------ mm/damon/core.c | 5 ++++- mm/damon/vaddr.c | 6 ++++-- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e2c8152985b7d..2dbc1f545da20 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -399,14 +399,20 @@ struct damon_ctx { struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} -#define damon_last_region(t) \ - (list_last_entry(&t->regions_list, struct damon_region, list)) +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 61e844d15b13a..4515cf82c433c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -729,7 +729,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} /* * Merge two adjacent regions into one region diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 78ff2bcb66eb3..68d9e4134816d 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -26,8 +26,10 @@ * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task((struct pid *)t->id, PIDTYPE_PID); +} /* * Get the mm_struct of the given target From b3e7537bfc6c98f762acfc77e865159ae2437120 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:02 -0800 Subject: [PATCH 411/737] Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks DAMOS features including time/space quota limits and watermarks are not described in the DAMON debugfs interface document. This commit updates the document for the features. 
Link: https://lkml.kernel.org/r/20211209131806.19317-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 123 +++++++++++++++---- 1 file changed, 98 insertions(+), 25 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index ed96bbf0daffc..1ab9b714fca20 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -131,24 +131,38 @@ Schemes For usual DAMON-based data access aware memory management optimizations, users would simply want the system to apply a memory management action to a memory -region of a specific size having a specific access frequency for a specific -time. DAMON receives such formalized operation schemes from the user and -applies those to the target processes. It also counts the total number and -size of regions that each scheme is applied. This statistics can be used for -online analysis or tuning of the schemes. +region of a specific access pattern. DAMON receives such formalized operation +schemes from the user and applies those to the target processes. Users can get and set the schemes by reading from and writing to ``schemes`` debugfs file. Reading the file also shows the statistics of each scheme. To -the file, each of the schemes should be represented in each line in below form: +the file, each of the schemes should be represented in each line in below +form:: - min-size max-size min-acc max-acc min-age max-age action + -Note that the ranges are closed interval. Bytes for the size of regions -(``min-size`` and ``max-size``), number of monitored accesses per aggregate -interval for access frequency (``min-acc`` and ``max-acc``), number of -aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a -predefined integer for memory management actions should be used. The supported -numbers and their meanings are as below. +You can disable schemes by simply writing an empty string to the file. + +Target Access Pattern +~~~~~~~~~~~~~~~~~~~~~ + +The ```` is constructed with three ranges in below +form:: + + min-size max-size min-acc max-acc min-age max-age + +Specifically, bytes for the size of regions (``min-size`` and ``max-size``), +number of monitored accesses per aggregate interval for access frequency +(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of +regions (``min-age`` and ``max-age``) are specified. Note that the ranges are +closed interval. + +Action +~~~~~~ + +The ```` is a predefined integer for memory management actions, which +DAMON will apply to the regions having the target access pattern. The +supported numbers and their meanings are as below. - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - 1: Call ``madvise()`` for the region with ``MADV_COLD`` @@ -157,20 +171,79 @@ numbers and their meanings are as below. - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - 5: Do nothing but count the statistics -You can disable schemes by simply writing an empty string to the file. For -example, below commands applies a scheme saying "If a memory region of size in -[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate -interval in [10, 20], page out the region", check the entered scheme again, and -finally remove the scheme. 
:: +Quota +~~~~~ - # cd /damon - # echo "4096 8192 0 5 10 20 2" > schemes - # cat schemes - 4096 8192 0 5 10 20 2 0 0 - # echo > schemes +Optimal ``target access pattern`` for each ``action`` is workload dependent, so +not easy to find. Worse yet, setting a scheme of some action too aggressive +can cause severe overhead. To avoid such overhead, users can limit time and +size quota for the scheme via the ```` in below form:: + + + +This makes DAMON to try to use only up to ```` milliseconds for applying +the action to memory regions of the ``target access pattern`` within the +```` milliseconds, and to apply the action to only up to +```` bytes of memory regions within the ````. Setting both +```` and ```` zero disables the quota limits. + +When the quota limit is expected to be exceeded, DAMON prioritizes found memory +regions of the ``target access pattern`` based on their size, access frequency, +and age. For personalized prioritization, users can set the weights for the +three properties in ```` in below form:: + + + +Watermarks +~~~~~~~~~~ -The last two integers in the 4th line of above example is the total number and -the total size of the regions that the scheme is applied. +Some schemes would need to run based on current value of the system's specific +metrics like free memory ratio. For such cases, users can specify watermarks +for the condition.:: + + + +```` is a predefined integer for the metric to be checked. The +supported numbers and their meanings are as below. + + - 0: Ignore the watermarks + - 1: System's free memory rate (per thousand) + +The value of the metric is checked every ```` microseconds. + +If the value is higher than ```` or lower than ````, the +scheme is deactivated. If the value is lower than ````, the scheme +is activated. + +Statistics +~~~~~~~~~~ + +It also counts the total number and bytes of regions that each scheme is +applied. This statistics can be used for online analysis or tuning of the +schemes. + +The statistics can be shown by reading the ``schemes`` file. Reading the file +will show each scheme you entered in each line, and the two numbers for the +statistics will be added at the end of each line. + +Example +~~~~~~~ + +Below commands applies a scheme saying "If a memory region of size in [4KiB, +8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region. For the paging out, use only up to +10ms per second, and also don't page out more than 1GiB per second. Under the +limitation, page out memory regions having longer age first. Also, check the +free memory rate of the system every 5 seconds, start the monitoring and paging +out when the free memory rate becomes lower than 50%, but stop it if the free +memory rate becomes larger than 60%, or lower than 30%".:: + + # cd /damon + # scheme="4096 8192 0 5 10 20 2" # target access pattern and action + # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas + # scheme+=" 0 0 100" # prioritization weights + # scheme+=" 1 5000000 600 500 300" # watermarks + # echo "$scheme" > schemes Turning On/Off From 7013a5b678b08637d5a011333b4133905fddce3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:05 -0800 Subject: [PATCH 412/737] Docs/admin-guide/mm/damon/usage: remove redundant information DAMON usage document mentions DAMON user space tool and programming interface twice. This commit integrates those and remove unnecessary part. 
Link: https://lkml.kernel.org/r/20211209131806.19317-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 44 ++++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1ab9b714fca20..24137312f6011 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -7,30 +7,30 @@ Detailed Usages DAMON provides below three interfaces for different users. - *DAMON user space tool.* - This is for privileged people such as system administrators who want a - just-working human-friendly interface. Using this, users can use the DAMON’s - major features in a human-friendly way. It may not be highly tuned for - special cases, though. It supports both virtual and physical address spaces - monitoring. + `This `_ is for privileged people such as + system administrators who want a just-working human-friendly interface. + Using this, users can use the DAMON’s major features in a human-friendly way. + It may not be highly tuned for special cases, though. It supports both + virtual and physical address spaces monitoring. For more detail, please + refer to its `usage document + `_. - *debugfs interface.* - This is for privileged user space programmers who want more optimized use of - DAMON. Using this, users can use DAMON’s major features by reading - from and writing to special debugfs files. Therefore, you can write and use - your personalized DAMON debugfs wrapper programs that reads/writes the - debugfs files instead of you. The DAMON user space tool is also a reference - implementation of such programs. It supports both virtual and physical - address spaces monitoring. + :ref:`This ` is for privileged user space programmers who + want more optimized use of DAMON. Using this, users can use DAMON’s major + features by reading from and writing to special debugfs files. Therefore, + you can write and use your personalized DAMON debugfs wrapper programs that + reads/writes the debugfs files instead of you. The `DAMON user space tool + `_ is one example of such programs. It + supports both virtual and physical address spaces monitoring. - *Kernel Space Programming Interface.* - This is for kernel space programmers. Using this, users can utilize every - feature of DAMON most flexibly and efficiently by writing kernel space - DAMON application programs for you. You can even extend DAMON for various - address spaces. - -Nevertheless, you could write your own user space tool using the debugfs -interface. A reference implementation is available at -https://github.com/awslabs/damo. If you are a kernel programmer, you could -refer to :doc:`/vm/damon/api` for the kernel space programming interface. For -the reason, this document describes only the debugfs interface + :doc:`This ` is for kernel space programmers. Using this, + users can utilize every feature of DAMON most flexibly and efficiently by + writing kernel space DAMON application programs for you. You can even extend + DAMON for various address spaces. For detail, please refer to the interface + :doc:`document `. + + +.. 
_debugfs_interface: debugfs Interface ================= From 3d6e8cd5ccc01ac77e4e75446f306135d5762f28 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:08 -0800 Subject: [PATCH 413/737] Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning To get detailed monitoring results from the user space, users need to use the damon_aggregated tracepoint. This commit adds a brief mention of it at the beginning of the usage document. Link: https://lkml.kernel.org/r/20211209131806.19317-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 24137312f6011..846c85bf4b9dc 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -21,7 +21,10 @@ DAMON provides below three interfaces for different users. you can write and use your personalized DAMON debugfs wrapper programs that reads/writes the debugfs files instead of you. The `DAMON user space tool `_ is one example of such programs. It - supports both virtual and physical address spaces monitoring. + supports both virtual and physical address spaces monitoring. Note that this + interface provides only simple :ref:`statistics ` for the + monitoring results. For detailed monitoring results, DAMON provides a + :ref:`tracepoint `. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -215,6 +218,8 @@ If the value is higher than ```` or lower than ````, the scheme is deactivated. If the value is lower than ````, the scheme is activated. +.. _damos_stats: + Statistics ~~~~~~~~~~ @@ -268,6 +273,8 @@ the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. +.. _tracepoint: + Tracepoint for Monitoring Results ================================= From 4a7bea306c1f835d0de7fd159cc003a68f4188cd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:11 -0800 Subject: [PATCH 414/737] Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts The DAMON debugfs usage document is missing descriptions for 'kdamond_pid', 'mk_contexts', and 'rm_contexts' debugfs files. This commit adds those. Link: https://lkml.kernel.org/r/20211209131806.19317-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 52 ++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 846c85bf4b9dc..cb614c84ba9e9 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -38,9 +38,9 @@ DAMON provides below three interfaces for different users. debugfs Interface ================= -DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, -``schemes`` and ``monitor_on`` under its debugfs directory, -``/damon/``. 
+DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and +``rm_contexts`` under its debugfs directory, ``/damon/``. Attributes @@ -273,6 +273,52 @@ the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. +Monitoring Thread PID +--------------------- + +DAMON does requested monitoring with a kernel thread called ``kdamond``. You +can get the pid of the thread by reading the ``kdamond_pid`` file. When the +monitoring is turned off, reading the file returns ``none``. :: + + # cd /damon + # cat monitor_on + off + # cat kdamond_pid + none + # echo on > monitor_on + # cat kdamond_pid + 18594 + + +Using Multiple Monitoring Threads +--------------------------------- + +One ``kdamond`` thread is created for each monitoring context. You can create +and remove monitoring contexts for multiple ``kdamond`` required use case using +the ``mk_contexts`` and ``rm_contexts`` files. + +Writing the name of the new context to the ``mk_contexts`` file creates a +directory of the name on the DAMON debugfs directory. The directory will have +DAMON debugfs files for the context. :: + + # cd /damon + # ls foo + # ls: cannot access 'foo': No such file or directory + # echo foo > mk_contexts + # ls foo + # attrs init_regions kdamond_pid schemes target_ids + +If the context is not needed anymore, you can remove it and the corresponding +directory by putting the name of the context to the ``rm_contexts`` file. :: + + # echo foo > rm_contexts + # ls foo + # ls: cannot access 'foo': No such file or directory + +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the +root directory only. + + .. _tracepoint: Tracepoint for Monitoring Results From 74fe6ee2cbb2172b7f7313b0f36988ff6ec56864 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:14 -0800 Subject: [PATCH 415/737] mm/damon: remove a mistakenly added comment for a future feature Due to a mistake in patches reordering, a comment for a future feature called 'arbitrary monitoring target support'[1], which is still under development, has added. Because it only introduces confusion and we don't have a plan to post the patches soon, this commit removes the mistakenly added part. [1] https://lore.kernel.org/linux-mm/20201215115448.25633-3-sjpark@amazon.com/ Link: https://lkml.kernel.org/r/20211209131806.19317-7-sj@kernel.org Fixes: 1f366e421c8f ("mm/damon/core: implement DAMON-based Operation Schemes (DAMOS)") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2dbc1f545da20..97f4a224e9502 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -281,7 +281,7 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. + * to the region. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. 
From 8ef8058af1312ba82d8781dd7427242ac0533b3a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:17 -0800 Subject: [PATCH 416/737] mm/damon/schemes: account scheme actions that successfully applied Patch series "mm/damon/schemes: Extend stats for better online analysis and tuning". To help online access pattern analysis and tuning of DAMON-based Operation Schemes (DAMOS), DAMOS provides simple statistics for each scheme. Introduction of DAMOS time/space quota further made the tuning easier by making the risk management easier. However, that also made understanding of the working schemes a little bit more difficult. For an example, progress of a given scheme can now be throttled by not only the aggressiveness of the target access pattern, but also the time/space quotas. So, when a scheme is showing unexpectedly slow progress, it's difficult to know by what the progress of the scheme is throttled, with currently provided statistics. This patchset extends the statistics to contain some metrics that can be helpful for such online schemes analysis and tuning (patches 1-2), exports those to users (patches 3 and 5), and add documents (patches 4 and 6). This patch (of 6): DAMON-based operation schemes (DAMOS) stats provide only the number and the amount of regions that the action of the scheme has tried to be applied. Because the action could be failed for some reasons, the currently provided information is sometimes not useful or convenient enough for schemes profiling and tuning. To improve this situation, this commit extends the DAMOS stats to provide the number and the amount of regions that the action has successfully applied. Link: https://lkml.kernel.org/r/20211210150016.35349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211210150016.35349-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 28 +++++++++++++++++++++------- mm/damon/core.c | 13 ++++++++----- mm/damon/dbgfs.c | 2 +- mm/damon/paddr.c | 13 +++++++------ mm/damon/vaddr.c | 30 ++++++++++++++++-------------- 5 files changed, 53 insertions(+), 33 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 97f4a224e9502..e0ad3d9aaeedb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -192,6 +192,20 @@ struct damos_watermarks { bool activated; }; +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -203,8 +217,7 @@ struct damos_watermarks { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. - * @stat_count: Total number of regions that this scheme is applied. - * @stat_sz: Total size of regions that this scheme is applied. + * @stat: Statistics of this scheme. * @list: List head for siblings. 
* * For each aggregation interval, DAMON finds regions which fit in the @@ -235,8 +248,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; - unsigned long stat_count; - unsigned long stat_sz; + struct damos_stat stat; struct list_head list; }; @@ -281,7 +293,8 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. + * to the region and return bytes of the region that the action is successfully + * applied. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -295,8 +308,9 @@ struct damon_primitive { int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 4515cf82c433c..d745bf28509ff 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -102,8 +102,7 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; - scheme->stat_count = 0; - scheme->stat_sz = 0; + scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); scheme->quota.ms = quota->ms; @@ -574,6 +573,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -627,7 +627,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->primitive.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -641,8 +641,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, r->age = 0; update_stat: - s->stat_count++; - s->stat_sz += sz; + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index bf36a2756cfb1..9318b52d0b462 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -117,7 +117,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat_count, s->stat_sz); + s->stat.nr_tried, s->stat.sz_tried); if (!rc) return -ENOMEM; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 4318134cbc4c5..5e8244f65a1a2 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t) return true; } -static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { - unsigned long addr; + 
unsigned long addr, applied; LIST_HEAD(page_list); if (scheme->action != DAMOS_PAGEOUT) - return -EINVAL; + return 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -241,9 +242,9 @@ static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, put_page(page); } } - reclaim_pages(&page_list); + applied = reclaim_pages(&page_list); cond_resched(); - return 0; + return applied * PAGE_SIZE; } static int damon_pa_scheme_score(struct damon_ctx *context, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 68d9e4134816d..a10df3fd3d024 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -572,32 +572,34 @@ bool damon_va_target_valid(void *target) } #ifndef CONFIG_ADVISE_SYSCALLS -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { - return -EINVAL; + return 0; } #else -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { struct mm_struct *mm; - int ret = -ENOMEM; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; mm = damon_get_mm(target); if (!mm) - goto out; + return 0; - ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), - PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); -out: - return ret; + + return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ -static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { int madv_action; @@ -620,7 +622,7 @@ static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - return -EINVAL; + return 0; } return damos_madvise(t, r, madv_action); From 14b60ddfd3a8ebf90cbf7569c84b7ee8d60b6ccb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:20 -0800 Subject: [PATCH 417/737] mm/damon/schemes: account how many times quota limit has exceeded If the time/space quotas of a given DAMON-based operation scheme is too small, the scheme could show unexpectedly slow progress. However, there is no good way to notice the case in runtime. This commit extends the DAMOS stat to provide how many times the quota limits exceeded so that the users can easily notice the case and tune the scheme. Link: https://lkml.kernel.org/r/20211210150016.35349-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/core.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index e0ad3d9aaeedb..af648388e7596 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -198,12 +198,14 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. 
*/ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long qt_exceeds; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index d745bf28509ff..d5120b326e1b6 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -693,6 +693,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; From 7f5007df1732d927d166525f7630eb11eca077d3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:23 -0800 Subject: [PATCH 418/737] mm/damon/reclaim: provide reclamation statistics This implements new DAMON_RECLAIM parameters for statistics reporting. Those can be used for understanding how DAMON_RECLAIM is working, and for tuning the other parameters. Link: https://lkml.kernel.org/r/20211210150016.35349-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/reclaim.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index dc1485044eaf7..bc476cef688e8 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); +/* + * Number of memory regions that tried to be reclaimed. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of memory regions that tried to be reclaimed. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that successfully be reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of memory regions that successfully be reclaimed. 
+ */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have exceeded + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + static struct damon_ctx *ctx; static struct damon_target *target; @@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + static int __init damon_reclaim_init(void) { ctx = damon_new_ctx(); @@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void) return -ENOMEM; damon_pa_set_primitives(ctx); + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; /* 4242 means nothing but fun */ target = damon_new_target(4242); From 045791031c3f0a3d7aef40e7a526c5d80c7ce21b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:26 -0800 Subject: [PATCH 419/737] Docs/admin-guide/mm/damon/reclaim: document statistics parameters This adds descriptions for the DAMON_RECLAIM statistics parameters. Link: https://lkml.kernel.org/r/20211210150016.35349-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/mm/damon/reclaim.rst | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index fb9def3a73559..0af51a9705b10 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -208,6 +208,31 @@ PID of the DAMON thread. If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, -1. +nr_reclaim_tried_regions +------------------------ + +Number of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +bytes_reclaim_tried_regions +--------------------------- + +Total bytes of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +nr_reclaimed_regions +-------------------- + +Number of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +bytes_reclaimed_regions +----------------------- + +Total bytes of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +nr_quota_exceeds +---------------- + +Number of times that the time/space quota limits have exceeded. + Example ======= From 73b62a87a3263443196f425bde883582d5a89c06 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:29 -0800 Subject: [PATCH 420/737] mm/damon/dbgfs: support all DAMOS stats Currently, DAMON debugfs interface is not supporting DAMON-based Operation Schemes (DAMOS) stats for schemes successfully applied regions and time/space quota limit exceeds. This adds the support. 
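[Editorial illustration, not part of the upstream patch] With this change the statistics are appended as the last five fields of each line of the ``schemes`` debugfs file, in the order nr_tried, sz_tried, nr_applied, sz_applied, qt_exceeds, so they can be picked out with ordinary shell tools. A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug and a scheme has already been installed:

    # cd /sys/kernel/debug/damon
    # cat schemes
    # awk '{ n = NF; print $(n-4), $(n-3), $(n-2), $(n-1), $n }' schemes   # the five stat fields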
Link: https://lkml.kernel.org/r/20211210150016.35349-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 9318b52d0b462..751c7b8356848 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried); + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); if (!rc) return -ENOMEM; From 9094d3862a9013e76528d10610ca3c1cce0103f1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:32 -0800 Subject: [PATCH 421/737] Docs/admin-guide/mm/damon/usage: update for schemes statistics This updates DAMON debugfs interface for statistics of schemes successfully applied regions and time/space quota limit exceeds counts. Link: https://lkml.kernel.org/r/20211210150016.35349-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index cb614c84ba9e9..59b84904a8543 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -223,12 +223,13 @@ is activated. Statistics ~~~~~~~~~~ -It also counts the total number and bytes of regions that each scheme is -applied. This statistics can be used for online analysis or tuning of the -schemes. +It also counts the total number and bytes of regions that each scheme is tried +to be applied, the two numbers for the regions that each scheme is successfully +applied, and the total number of the quota limit exceeds. This statistics can +be used for online analysis or tuning of the schemes. The statistics can be shown by reading the ``schemes`` file. Reading the file -will show each scheme you entered in each line, and the two numbers for the +will show each scheme you entered in each line, and the five numbers for the statistics will be added at the end of each line. Example From 68ae54e9a605a8f4d7995f5127920e4ac6a20892 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:10:35 -0800 Subject: [PATCH 422/737] mm/damon: add access checking for hugetlb pages The process's VMAs can be mapped by hugetlb page, but now the DAMON did not implement the access checking for hugetlb pte, so we can not get the actual access count like below if a process VMAs were mapped by hugetlb. 
damon_aggregated: target_id=18446614368406014464 nr_regions=12 4194304-5476352: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662370467840-140662372970496: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662372970496-140662375460864: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662375460864-140662377951232: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662377951232-140662380449792: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662380449792-140662382944256: 0 545 ...... Thus this patch adds hugetlb access checking support, with this patch we can see below VMA mapped by hugetlb access count. damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296486649856-140296489914368: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296489914368-140296492978176: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296492978176-140296495439872: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296495439872-140296498311168: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296498311168-140296501198848: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296501198848-140296504320000: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296504320000-140296507568128: 1 2 ...... [baolin.wang@linux.alibaba.com: fix unused var warning] Link: https://lkml.kernel.org/r/1aaf9c11-0d8e-b92d-5c92-46e50a6e8d4e@linux.alibaba.com [baolin.wang@linux.alibaba.com: v3] Link: https://lkml.kernel.org/r/486927ecaaaecf2e3a7fbe0378ec6e1c58b50747.1640852276.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/6afcbd1fda5f9c7c24f320d26a98188c727ceec3.1639623751.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Cc: Mike Kravetz Cc: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a10df3fd3d024..ee465b3806127 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -388,8 +388,65 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + if (!page) + return; + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + huge_ptep_set_access_flags(vma, addr, pte, entry, + vma->vm_flags & VM_WRITE); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL 
+#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -484,8 +541,47 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + if (!page) + goto out; + + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, From 0171f7e1b63ce9e4ab23b9f704cbaac2bb56ff26 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jan 2022 14:10:38 -0800 Subject: [PATCH 423/737] mm/damon: move the implementation of damon_insert_region to damon.h Usually, inline function is declared static since it should sit between storage and type. And implement it in a header file if used by multiple files. And this change also fixes compile issue when backport damon to 5.10. 
mm/damon/vaddr.c: In function `damon_va_evenly_split_region': ./include/linux/damon.h:425:13: error: inlining failed in call to `always_inline' `damon_insert_region': function body not available 425 | inline void damon_insert_region(struct damon_region *r, | ^~~~~~~~~~~~~~~~~~~ mm/damon/vaddr.c:86:3: note: called from here 86 | damon_insert_region(n, r, next, t); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20211223085703.6142-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++-- mm/damon/core.c | 11 ----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index af648388e7596..5e1e3a128b77a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -451,9 +451,18 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); diff --git a/mm/damon/core.c b/mm/damon/core.c index d5120b326e1b6..6482d510dcbe3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -49,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -/* - * Add a region between two other regions - */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); From 52e26fcbee3bb0c290f1267e75712085cf144e0f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:41 -0800 Subject: [PATCH 424/737] mm/damon/dbgfs: remove an unnecessary variable Patch series "mm/damon: Hide unnecessary information disclosures". DAMON is exposing some unnecessary information including kernel pointer in kernel log and tracepoint. This patchset hides such information. The first patch is only for a trivial cleanup, though. This patch (of 4): This commit removes a unnecessarily used variable in dbgfs_target_ids_write(). 
Link: https://lkml.kernel.org/r/20211229131016.23641-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211229131016.23641-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 751c7b8356848..5b899601e56c3 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -364,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; struct damon_target *t, *next_t; bool id_is_pid = true; - char *kbuf, *nrs; + char *kbuf; unsigned long *targets; ssize_t nr_targets; ssize_t ret; @@ -374,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; /* target id is meaningless here, but we set it just for fun */ scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, count, &nr_targets); + targets = str_to_target_ids(kbuf, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; From 3204ccf297185e096357d7fa4e5973781fc33a88 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:44 -0800 Subject: [PATCH 425/737] mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging Failure of 'damon_va_three_regions()' is logged using 'pr_err()'. But, the function can fail in legal situations. To avoid making users be surprised and to keep the kernel clean, this makes the log to be printed using 'pr_debug()'. Link: https://lkml.kernel.org/r/20211229131016.23641-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ee465b3806127..223829655d64b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -238,7 +238,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, int i; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + pr_debug("Failed to get three regions of target %lu\n", t->id); return; } From 1f0008885dc9467b149c68d324c223d4e8139824 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:47 -0800 Subject: [PATCH 426/737] mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log The failure log message for 'damon_va_three_regions()' prints the target id, which is a 'struct pid' pointer in the case. To avoid exposing the kernel pointer via the log, this makes the log to use the index of the target in the context's targets list instead. 
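[Editorial illustration, not part of the upstream patch] Because this failure message is now emitted with pr_debug(), it is silent by default and has to be switched on through dynamic debug before it can be observed. A minimal sketch, assuming a kernel built with CONFIG_DYNAMIC_DEBUG and debugfs mounted at /sys/kernel/debug:

    # echo 'file vaddr.c +p' > /sys/kernel/debug/dynamic_debug/control
    # dmesg | grep 'three regions'    # shows the index-based failure message, if any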
Link: https://lkml.kernel.org/r/20211229131016.23641-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 223829655d64b..89b6468da2b9b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -232,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_debug("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } From c20aba21becf19b0d228a3034947fea6c715dea7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:50 -0800 Subject: [PATCH 427/737] mm/damon: hide kernel pointer from tracepoint event DAMON's virtual address spaces monitoring primitive uses 'struct pid *' of the target process as its monitoring target id. The kernel address is exposed as-is to the user space via the DAMON tracepoint, 'damon_aggregated'. Though primarily only privileged users are allowed to access that, it would be better to avoid unnecessarily exposing kernel pointers so. Because the trace result is only required to be able to distinguish each target, we aren't need to use the pointer as-is. This makes the tracepoint to use the index of the target in the context's targets list as its id in the tracepoint, to hide the kernel space address. 
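[Editorial illustration, not part of the upstream patch] For consumers of the event nothing changes procedurally; records can still be captured through tracefs (or perf) as before, only the ``target_id`` field now carries the target's index in the context rather than a kernel address. A minimal sketch, assuming tracefs is mounted at /sys/kernel/tracing and the event is registered under the ``damon`` trace system:

    # cd /sys/kernel/tracing
    # echo 1 > events/damon/damon_aggregated/enable
    # echo on > /sys/kernel/debug/damon/monitor_on
    # cat trace_pipe                  # one record per region per aggregation interval
    # echo 0 > events/damon/damon_aggregated/enable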
Link: https://lkml.kernel.org/r/20211229131016.23641-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/damon.h | 8 ++++---- mm/damon/core.c | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 99ffa601e3511..c79f1d4c39afe 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -11,10 +11,10 @@ TRACE_EVENT(damon_aggregated, - TP_PROTO(struct damon_target *t, struct damon_region *r, - unsigned int nr_regions), + TP_PROTO(struct damon_target *t, unsigned int target_id, + struct damon_region *r, unsigned int nr_regions), - TP_ARGS(t, r, nr_regions), + TP_ARGS(t, target_id, r, nr_regions), TP_STRUCT__entry( __field(unsigned long, target_id) @@ -26,7 +26,7 @@ TRACE_EVENT(damon_aggregated, ), TP_fast_assign( - __entry->target_id = t->id; + __entry->target_id = target_id; __entry->nr_regions = nr_regions; __entry->start = r->ar.start; __entry->end = r->ar.end; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6482d510dcbe3..1dd153c31c9e2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -514,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } From b25a5310e95646d61cf8389d750e30747b458cc8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Mar 2022 16:15:28 +0800 Subject: [PATCH 428/737] mm/damon: minor cleanup for damon_pa_young if need_lock is true but folio_trylock fails, we should return false instead of NULL to match the return value type exactly. No functional change intended. Signed-off-by: Miaohe Lin Reviewed-by: SeongJae Park Signed-off-by: Matthew Wilcox (Oracle) --- mm/damon/paddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5e8244f65a1a2..a6c5b24fba7a9 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -156,7 +156,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) need_lock = !PageAnon(page) || PageKsm(page); if (need_lock && !trylock_page(page)) { put_page(page); - return NULL; + return false; } rmap_walk(page, &rwc); From 182e41d80c9da75119d7da2052baa14e24027cad Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:31 -0700 Subject: [PATCH 429/737] mm/damon/dbgfs/init_regions: use target index instead of target id Patch series "Remove the type-unclear target id concept". DAMON asks each monitoring target ('struct damon_target') to have one 'unsigned long' integer called 'id', which should be unique among the targets of same monitoring context. Meaning of it is, however, totally up to the monitoring primitives that registered to the monitoring context. For example, the virtual address spaces monitoring primitives treats the id as a 'struct pid' pointer. This makes the code flexible but ugly, not well-documented, and type-unsafe[1]. Also, identification of each target can be done via its index. For the reason, this patchset removes the concept and uses clear type definition. 
[1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ This patch (of 4): Target id is a 'unsigned long' data, which can be interpreted differently by each monitoring primitives. For example, it means 'struct pid *' for the virtual address spaces monitoring, while it means nothing but an integer to be displayed to debugfs interface users for the physical address space monitoring. It's flexible but makes code ugly and type-unsafe[1]. To be prepared for eventual removal of the concept, this commit removes a use case of the concept in 'init_regions' debugfs file handling. In detail, this commit replaces use of the id with the index of each target in the context's targets list. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211230100723.2238-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211230100723.2238-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs-test.h | 20 ++++++++++---------- mm/damon/dbgfs.c | 25 ++++++++++++------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 86b9f9528231e..00bff058fe08f 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -113,19 +113,19 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); unsigned long ids[] = {1, 2, 3}; - /* Each line represents one region in `` `` */ - char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", - "2 10 20\n", - "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", ""}; /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", - "2 10 20\n", - "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", ""}; - char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ - "2 10 20\n 2 14 26\n", /* regions overlap */ - "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ char *input, *expect; int i, rc; char buf[256]; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 5b899601e56c3..3f65af04e4e60 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -440,18 +440,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) { struct damon_target *t; struct damon_region *r; + int target_idx = 0; int written = 0; int rc; damon_for_each_target(t, c) { damon_for_each_region(r, t) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %lu\n", - t->id, r->ar.start, r->ar.end); + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); if (!rc) return -ENOMEM; written += rc; } + target_idx++; } return written; } @@ -485,22 +487,19 @@ static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, return len; } -static int add_init_region(struct damon_ctx *c, - unsigned long target_id, struct damon_addr_range *ar) +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) { struct damon_target *t; 
struct damon_region *r, *prev; - unsigned long id; + unsigned long idx = 0; int rc = -EINVAL; if (ar->start >= ar->end) return -EINVAL; damon_for_each_target(t, c) { - id = t->id; - if (targetid_is_pid(c)) - id = (unsigned long)pid_vnr((struct pid *)id); - if (id == target_id) { + if (idx++ == target_idx) { r = damon_new_region(ar->start, ar->end); if (!r) return -ENOMEM; @@ -523,7 +522,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) struct damon_target *t; struct damon_region *r, *next; int pos = 0, parsed, ret; - unsigned long target_id; + int target_idx; struct damon_addr_range ar; int err; @@ -533,11 +532,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) } while (pos < len) { - ret = sscanf(&str[pos], "%lu %lu %lu%n", - &target_id, &ar.start, &ar.end, &parsed); + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); if (ret != 3) break; - err = add_init_region(c, target_id, &ar); + err = add_init_region(c, target_idx, &ar); if (err) goto fail; pos += parsed; From 3fb76901c1f92623a7546861fce1695da3af8e4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:34 -0700 Subject: [PATCH 430/737] Docs/admin-guide/mm/damon/usage: update for changed initail_regions file input A previous commit made init_regions debugfs file to use target index instead of target id for specifying the target of the init regions. This commit updates the usage document to reflect the change. Link: https://lkml.kernel.org/r/20211230100723.2238-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 59b84904a8543..1e06435b8ff67 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -108,19 +108,23 @@ In such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the ``init_regions`` file. Each line of the input should represent one region in below form.:: - + -The ``target id`` should already in ``target_ids`` file, and the regions should -be passed in address order. For example, below commands will set a couple of -address ranges, ``1-100`` and ``100-200`` as the initial monitoring target -region of process 42, and another couple of address ranges, ``20-40`` and -``50-100`` as that of process 4242.:: +The ``target idx`` should be the index of the target in ``target_ids`` file, +starting from ``0``, and the regions should be passed in address order. For +example, below commands will set a couple of address ranges, ``1-100`` and +``100-200`` as the initial monitoring target region of pid 42, which is the +first one (index ``0``) in ``target_ids``, and another couple of address +ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one +(index ``1``) in ``target_ids``.:: # cd /damon - # echo "42 1 100 - 42 100 200 - 4242 20 40 - 4242 50 100" > init_regions + # cat target_ids + 42 4242 + # echo "0 1 100 + 0 100 200 + 1 20 40 + 1 50 100" > init_regions Note that this sets the initial monitoring target regions only. 
In case of virtual memory monitoring, DAMON will automatically updates the boundary of the From 318162e5a9b81bafea981e6f69d8e463d80ca08b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:37 -0700 Subject: [PATCH 431/737] mm/damon/core: move damon_set_targets() into dbgfs damon_set_targets() function is defined in the core for general use cases, but called from only dbgfs. Also, because the function is for general use cases, dbgfs does additional handling of pid type target id case. To make the situation simpler, this commit moves the function into dbgfs and makes it to do the pid type case handling on its own. Link: https://lkml.kernel.org/r/20211230100723.2238-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 -- mm/damon/core-test.h | 5 +++- mm/damon/core.c | 32 -------------------------- mm/damon/dbgfs-test.h | 14 ++++++------ mm/damon/dbgfs.c | 53 +++++++++++++++++++++++++++++++++---------- 5 files changed, 52 insertions(+), 54 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e1e3a128b77a..bd021af5db3d1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,8 +484,6 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 7008c3735e99f..4a6141ddd6fcf 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -86,7 +86,10 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; - damon_set_targets(ctx, target_ids, 3); + for (it = 0; it < 3; it++) { + t = damon_new_target(target_ids[it]); + damon_add_target(ctx, t); + } it = 0; damon_for_each_target(t, ctx) { diff --git a/mm/damon/core.c b/mm/damon/core.c index 1dd153c31c9e2..3fef5c667a31d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -245,38 +245,6 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } -/** - * damon_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids - * - * This function should not be called while the kdamond is running. - * - * Return: 0 on success, negative error code otherwise. - */ -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_destroy_targets(ctx); - - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); - if (!t) { - /* The caller should do cleanup of the ids itself */ - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - return -ENOMEM; - } - damon_add_target(ctx, t); - } - - return 0; -} - /** * damon_set_attrs() - Set attributes for the monitoring. 
* @ctx: monitoring context diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 00bff058fe08f..c1c988b607bc9 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -86,23 +86,23 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) ctx->primitive.target_valid = NULL; ctx->primitive.cleanup = NULL; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, ids, 3); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + dbgfs_set_targets(ctx, (unsigned long []){1, 2}, 2); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - damon_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, (unsigned long []){2}, 1); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -130,7 +130,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, ids, 3); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,7 +158,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); damon_destroy_ctx(ctx); } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 3f65af04e4e60..58867b9666350 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -358,11 +358,48 @@ static void dbgfs_put_pids(unsigned long *ids, int nr_ids) put_pid((struct pid *)ids[i]); } +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +static int dbgfs_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (targetid_is_pid(ctx)) + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (targetid_is_pid(ctx)) + dbgfs_put_pids(ids, nr_ids); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - struct damon_target *t, *next_t; bool id_is_pid = true; char *kbuf; unsigned long *targets; @@ -407,11 +444,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, } /* remove previously set targets */ - damon_for_each_target_safe(t, next_t, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); - damon_destroy_target(t); - } + dbgfs_set_targets(ctx, NULL, 0); /* Configure the context for the address space type */ if (id_is_pid) @@ -419,13 +452,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = damon_set_targets(ctx, targets, nr_targets); - if (ret) { - if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); - } else { + ret = dbgfs_set_targets(ctx, targets, nr_targets); + if (!ret) ret = count; - } unlock_out: mutex_unlock(&ctx->kdamond_lock); From 2a8a5a1cdc2733f57a6b3e730eb73939dd8efa6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:40 -0700 Subject: [PATCH 432/737] mm/damon: remove the target id concept DAMON asks each monitoring target ('struct damon_target') to have one 'unsigned long' integer called 'id', which should be unique among the targets of same monitoring context. Meaning of it is, however, totally up to the monitoring primitives that registered to the monitoring context. For example, the virtual address spaces monitoring primitives treats the id as a 'struct pid' pointer. This makes the code flexible, but ugly, not well-documented, and type-unsafe[1]. Also, identification of each target can be done via its index. For the reason, this commit removes the concept and uses clear type definition. For now, only 'struct pid' pointer is used for the virtual address spaces monitoring. If DAMON is extended in future so that we need to put another identifier field in the struct, we will use a union for such primitives-dependent fields and document which primitives are using which type. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211230100723.2238-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 11 ++- mm/damon/core-test.h | 18 +++-- mm/damon/core.c | 4 +- mm/damon/dbgfs-test.h | 63 ++++++----------- mm/damon/dbgfs.c | 152 +++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 3 +- mm/damon/vaddr-test.h | 6 +- mm/damon/vaddr.c | 4 +- 8 files changed, 133 insertions(+), 128 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index bd021af5db3d1..7c1d915b35875 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -60,19 +60,18 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. 
- * @id: Unique identifier for this target. + * @pid: The PID of the virtual address space to monitor. * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The - * @id of each target should be unique among the targets of the context. For - * example, in the virtual address monitoring context, it could be a pidfd or - * an address of an mm_struct. + * @pid should be set for appropriate address space monitoring primitives + * including the virtual address spaces monitoring primitives. */ struct damon_target { - unsigned long id; + struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; @@ -475,7 +474,7 @@ struct damos *damon_new_scheme( void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -struct damon_target *damon_new_target(unsigned long id); +struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 4a6141ddd6fcf..b4085deb9fa05 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); - t = damon_new_target(42); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); @@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; - t = damon_new_target(42); - KUNIT_EXPECT_EQ(test, 42ul, t->id); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); @@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test) static void damon_test_aggregate(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long target_ids[] = {1, 2, 3}; unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; @@ -87,7 +85,7 @@ static void damon_test_aggregate(struct kunit *test) int it, ir; for (it = 0; it < 3; it++) { - t = damon_new_target(target_ids[it]); + t = damon_new_target(); damon_add_target(ctx, t); } @@ -125,7 +123,7 @@ static void damon_test_split_at(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); damon_split_region_at(c, t, r, 25); @@ -146,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test) struct damon_region *r, *r2, *r3; int i; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; damon_add_region(r, t); @@ -194,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test) unsigned long eaddrs[] = {112, 130, 156, 170, 230}; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; @@ -218,14 +216,14 @@ static void damon_test_split_regions_of(struct 
kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); damon_split_regions_of(c, t, 4); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3fef5c667a31d..bf495236d741b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -144,7 +144,7 @@ void damon_destroy_scheme(struct damos *s) * * Returns the pointer to the new struct if success, or NULL otherwise */ -struct damon_target *damon_new_target(unsigned long id) +struct damon_target *damon_new_target(void) { struct damon_target *t; @@ -152,7 +152,7 @@ struct damon_target *damon_new_target(unsigned long id) if (!t) return NULL; - t->id = id; + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index c1c988b607bc9..0d3a14c00acfb 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -12,66 +12,58 @@ #include -static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +static void damon_dbgfs_test_str_to_ints(struct kunit *test) { char *question; - unsigned long *answers; - unsigned long expected[] = {12, 35, 46}; + int *answers; + int expected[] = {12, 35, 46}; ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = ""; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strlen(question), - 
&nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); } @@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) static void damon_dbgfs_test_set_targets(struct kunit *test) { struct damon_ctx *ctx = dbgfs_new_ctx(); - unsigned long ids[] = {1, 2, 3}; char buf[64]; - /* Make DAMON consider target id as plain number */ - ctx->primitive.target_valid = NULL; - ctx->primitive.cleanup = NULL; + /* Make DAMON consider target has no pid */ + ctx->primitive = (struct damon_primitive){}; - dbgfs_set_targets(ctx, ids, 3); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - dbgfs_set_targets(ctx, (unsigned long []){1, 2}, 2); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - - dbgfs_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, 1, NULL); sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -112,7 +94,6 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long ids[] = {1, 2, 3}; /* Each line represents one region in `` `` */ char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", "1 10 20\n", @@ -130,7 +111,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; - dbgfs_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,12 +139,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); damon_destroy_ctx(ctx); } static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_str_to_ints), KUNIT_CASE(damon_dbgfs_test_set_targets), KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 58867b9666350..78ff645433c64 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,7 +275,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, return ret; } -static inline bool targetid_is_pid(const struct damon_ctx *ctx) +static inline bool target_has_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid; } @@ -283,17 +283,19 @@ static inline bool targetid_is_pid(const struct damon_ctx *ctx) static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; - unsigned long id; + int id; int written = 0; int rc; damon_for_each_target(t, ctx) { - id = t->id; - if (targetid_is_pid(ctx)) + if (target_has_pid(ctx)) /* Show pid numbers to debugfs users */ - id = (unsigned long)pid_vnr((struct pid *)id); + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; - rc = scnprintf(&buf[written], len - written, "%lu ", id); + rc = scnprintf(&buf[written], len - written, 
"%d ", id); if (!rc) return -ENOMEM; written += rc; @@ -321,75 +323,114 @@ static ssize_t dbgfs_target_ids_read(struct file *file, } /* - * Converts a string into an array of unsigned long integers + * Converts a string into an integers array * - * Returns an array of unsigned long integers if the conversion success, or - * NULL otherwise. + * Returns an array of integers array if the conversion success, or NULL + * otherwise. */ -static unsigned long *str_to_target_ids(const char *str, ssize_t len, - ssize_t *nr_ids) +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) { - unsigned long *ids; - const int max_nr_ids = 32; - unsigned long id; + int *array; + const int max_nr_ints = 32; + int nr; int pos = 0, parsed, ret; - *nr_ids = 0; - ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); - if (!ids) + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) return NULL; - while (*nr_ids < max_nr_ids && pos < len) { - ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); pos += parsed; if (ret != 1) break; - ids[*nr_ids] = id; - *nr_ids += 1; + array[*nr_ints] = nr; + *nr_ints += 1; } - return ids; + return array; } -static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +static void dbgfs_put_pids(struct pid **pids, int nr_pids) { int i; - for (i = 0; i < nr_ids; i++) - put_pid((struct pid *)ids[i]); + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. + */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; } /* * dbgfs_set_targets() - Set monitoring targets. * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) * - * This function should not be called while the kdamond is running. + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. On + * failure, reference counts of all pids in @pids are decremented. * * Return: 0 on success, negative error code otherwise. 
*/ -static int dbgfs_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) { ssize_t i; struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); + if (target_has_pid(ctx)) + put_pid(t->pid); damon_destroy_target(t); } - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); if (!t) { - /* The caller should do cleanup of the ids itself */ damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); - if (targetid_is_pid(ctx)) - dbgfs_put_pids(ids, nr_ids); + if (target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } + if (target_has_pid(ctx)) + t->pid = pids[i]; damon_add_target(ctx, t); } @@ -402,10 +443,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; bool id_is_pid = true; char *kbuf; - unsigned long *targets; + struct pid **target_pids = NULL; ssize_t nr_targets; ssize_t ret; - int i; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -413,38 +453,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; - /* target id is meaningless here, but we set it just for fun */ - scnprintf(kbuf, count, "42 "); - } - - targets = str_to_target_ids(kbuf, count, &nr_targets); - if (!targets) { - ret = -ENOMEM; - goto out; + nr_targets = 1; } if (id_is_pid) { - for (i = 0; i < nr_targets; i++) { - targets[i] = (unsigned long)find_get_pid( - (int)targets[i]); - if (!targets[i]) { - dbgfs_put_pids(targets, i); - ret = -EINVAL; - goto free_targets_out; - } + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; } } mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); + dbgfs_put_pids(target_pids, nr_targets); ret = -EBUSY; goto unlock_out; } /* remove previously set targets */ - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); /* Configure the context for the address space type */ if (id_is_pid) @@ -452,14 +481,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = dbgfs_set_targets(ctx, targets, nr_targets); + ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); -free_targets_out: - kfree(targets); + kfree(target_pids); out: kfree(kbuf); return ret; @@ -688,12 +716,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!targetid_is_pid(ctx)) + if (!target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { - put_pid((struct pid *)t->id); + put_pid(t->pid); damon_destroy_target(t); } mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc476cef688e8..29da37192e4a0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -387,8 +387,7 @@ static int __init damon_reclaim_init(void) damon_pa_set_primitives(ctx); ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - /* 4242 means nothing but fun */ - target = damon_new_target(4242); + target = damon_new_target(); if (!target) { damon_destroy_ctx(ctx); return -ENOMEM; diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 6a1b9272ea123..f0d0ba591792c 
100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_region *r; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); damon_add_region(r, t); @@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test) static void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); @@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test, static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 89b6468da2b9b..f98edb90a873c 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -23,12 +23,12 @@ #endif /* - * 't->id' should be the pointer to the relevant 'struct pid' having reference + * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ static inline struct task_struct *damon_get_task_struct(struct damon_target *t) { - return get_pid_task((struct pid *)t->id, PIDTYPE_PID); + return get_pid_task(t->pid, PIDTYPE_PID); } /* From fc0fb349f1ff3bf4f88ad0ec1de52db27c72af8a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 22 Mar 2022 14:48:43 -0700 Subject: [PATCH 433/737] mm/damon: remove redundant page validation It will never get a NULL page by pte_page() as discussed in thread [1], thus remove the redundant page validation to fix below Smatch static checker warning. mm/damon/vaddr.c:405 damon_hugetlb_mkold() warn: 'page' can't be NULL. [1] https://lore.kernel.org/linux-mm/20220106091200.GA14564@kili/ Link: https://lkml.kernel.org/r/6d32f7d201b8970d53f51b6c5717d472aed2987c.1642386715.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reported-by: Dan Carpenter Reviewed-by: SeongJae Park Acked-by: David Rientjes Acked-by: Souptick Joarder Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f98edb90a873c..6d3454dd3204b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -402,9 +402,6 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, pte_t entry = huge_ptep_get(pte); struct page *page = pte_page(entry); - if (!page) - return; - get_page(page); if (pte_young(entry)) { @@ -564,9 +561,6 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, goto out; page = pte_page(entry); - if (!page) - goto out; - get_page(page); if (pte_young(entry) || !page_is_idle(page) || From 1b202fdd73f8f6887562fc1835e00e26f7beffda Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:46 -0700 Subject: [PATCH 434/737] mm/damon: rename damon_primitives to damon_operations Patch series "Allow DAMON user code independent of monitoring primitives". 
In-kernel DAMON user code is required to configure the monitoring context (struct damon_ctx) with proper monitoring primitives (struct damon_primitive). This makes the user code dependent to all supporting monitoring primitives. For example, DAMON debugfs interface depends on both DAMON_VADDR and DAMON_PADDR, though some users have interest in only one use case. As more monitoring primitives are introduced, the problem will be bigger. To minimize such unnecessary dependency, this patchset makes monitoring primitives can be registered by the implemnting code and later dynamically searched and selected by the user code. In addition to that, this patchset renames monitoring primitives to monitoring operations, which is more easy to intuitively understand what it means and how it would be structed. This patch (of 8): DAMON has a set of callback functions called monitoring primitives and let it can be configured with various implementations for easy extension for different address spaces and usages. However, the word 'primitive' is not so explicit. Meanwhile, many other structs resembles similar purpose calls themselves 'operations'. To make the code easier to be understood, this commit renames 'damon_primitives' to 'damon_operations' before it is too late to rename. Link: https://lkml.kernel.org/r/20220215184603.1479-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220215184603.1479-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Xin Hao Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 48 ++++++++--------- mm/damon/Kconfig | 12 ++--- mm/damon/Makefile | 4 +- mm/damon/core.c | 65 ++++++++++++----------- mm/damon/dbgfs-test.h | 2 +- mm/damon/dbgfs.c | 10 ++-- mm/damon/{prmtv-common.c => ops-common.c} | 2 +- mm/damon/{prmtv-common.h => ops-common.h} | 0 mm/damon/paddr.c | 22 ++++---- mm/damon/reclaim.c | 2 +- mm/damon/vaddr-test.h | 2 +- mm/damon/vaddr.c | 22 ++++---- 12 files changed, 96 insertions(+), 95 deletions(-) rename mm/damon/{prmtv-common.c => ops-common.c} (99%) rename mm/damon/{prmtv-common.h => ops-common.h} (100%) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7c1d915b35875..00baeb42c18e2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -67,8 +67,8 @@ struct damon_region { * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The - * @pid should be set for appropriate address space monitoring primitives - * including the virtual address spaces monitoring primitives. + * @pid should be set for appropriate &struct damon_operations including the + * virtual address spaces monitoring operations. */ struct damon_target { struct pid *pid; @@ -120,9 +120,9 @@ enum damos_action { * uses smaller one as the effective quota. * * For selecting regions within the quota, DAMON prioritizes current scheme's - * target memory regions using the &struct damon_primitive->get_scheme_score. + * target memory regions using the &struct damon_operations->get_scheme_score. * You could customize the prioritization logic by setting &weight_sz, - * &weight_nr_accesses, and &weight_age, because monitoring primitives are + * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. */ struct damos_quota { @@ -256,10 +256,10 @@ struct damos { struct damon_ctx; /** - * struct damon_primitive - Monitoring primitives for given use cases. 
+ * struct damon_operations - Monitoring operations for given use cases. * - * @init: Initialize primitive-internal data structures. - * @update: Update primitive-internal data structures. + * @init: Initialize operations-related data structures. + * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. @@ -269,18 +269,18 @@ struct damon_ctx; * @cleanup: Clean up the context. * * DAMON can be extended for various address spaces and usages. For this, - * users should register the low level primitives for their target address - * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * users should register the low level operations for their target address + * space and usecase via the &damon_ctx.ops. Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * the monitoring, @update after each &damon_ctx.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each * &damon_ctx.aggr_interval. * - * @init should initialize primitive-internal data structures. For example, + * @init should initialize operations-related data structures. For example, * this could be used to construct proper monitoring target regions and link * those to @damon_ctx.adaptive_targets. - * @update should update the primitive-internal data structures. For example, + * @update should update the operations-related data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be * prepared for the next access check. @@ -300,7 +300,7 @@ struct damon_ctx; * monitoring. * @cleanup is called from @kdamond just before its termination. */ -struct damon_primitive { +struct damon_operations { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); @@ -354,15 +354,15 @@ struct damon_callback { * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. - * @primitive_update_interval: The time between monitoring primitive updates. + * @ops_update_interval: The time between monitoring operations updates. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to * each region) for @aggr_interval time. DAMON also checks whether the target * memory regions need update (e.g., by ``mmap()`` calls from the application, * in case of virtual memory monitoring) and applies the changes for each - * @primitive_update_interval. All time intervals are in micro-seconds. - * Please refer to &struct damon_primitive and &struct damon_callback for more + * @ops_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_operations and &struct damon_callback for more * detail. * * @kdamond: Kernel thread who does the monitoring. @@ -374,7 +374,7 @@ struct damon_callback { * * Once started, the monitoring thread runs until explicitly required to be * terminated or every monitoring target is invalid. 
The validity of the - * targets is checked via the &damon_primitive.target_valid of @primitive. The + * targets is checked via the &damon_operations.target_valid of @ops. The * termination can also be explicitly requested by writing non-zero to * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. * Therefore, users can know whether the monitoring is ongoing or terminated by @@ -384,7 +384,7 @@ struct damon_callback { * Note that the monitoring thread protects only @kdamond and @kdamond_stop via * @kdamond_lock. Accesses to other fields must be protected by themselves. * - * @primitive: Set of monitoring primitives for given use cases. + * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * * @min_nr_regions: The minimum number of adaptive monitoring regions. @@ -395,17 +395,17 @@ struct damon_callback { struct damon_ctx { unsigned long sample_interval; unsigned long aggr_interval; - unsigned long primitive_update_interval; + unsigned long ops_update_interval; /* private: internal use only */ struct timespec64 last_aggregation; - struct timespec64 last_primitive_update; + struct timespec64 last_ops_update; /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; - struct damon_primitive primitive; + struct damon_operations ops; struct damon_callback callback; unsigned long min_nr_regions; @@ -484,7 +484,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long aggr_int, unsigned long ops_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); @@ -497,12 +497,12 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #ifdef CONFIG_DAMON_VADDR bool damon_va_target_valid(void *t); -void damon_va_set_primitives(struct damon_ctx *ctx); +void damon_va_set_operations(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR bool damon_pa_target_valid(void *t); -void damon_pa_set_primitives(struct damon_ctx *ctx); +void damon_pa_set_operations(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5bcf05851ad07..01bad77ad7ae6 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -25,27 +25,27 @@ config DAMON_KUNIT_TEST If unsure, say N. config DAMON_VADDR - bool "Data access monitoring primitives for virtual address spaces" + bool "Data access monitoring operations for virtual address spaces" depends on DAMON && MMU select PAGE_IDLE_FLAG help - This builds the default data access monitoring primitives for DAMON + This builds the default data access monitoring operations for DAMON that work for virtual address spaces. config DAMON_PADDR - bool "Data access monitoring primitives for the physical address space" + bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU select PAGE_IDLE_FLAG help - This builds the default data access monitoring primitives for DAMON + This builds the default data access monitoring operations for DAMON that works for the physical address space. 
config DAMON_VADDR_KUNIT_TEST - bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + bool "Test for DAMON operations" if !KUNIT_ALL_TESTS depends on DAMON_VADDR && KUNIT=y default KUNIT_ALL_TESTS help - This builds the DAMON virtual addresses primitives Kunit test suite. + This builds the DAMON virtual addresses operations Kunit test suite. For more information on KUnit and unit tests in general, please refer to the KUnit documentation. diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7d5ac377a2bb..03931472991a4 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o -obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o +obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/core.c b/mm/damon/core.c index bf495236d741b..be93fb1c34735 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -204,10 +204,10 @@ struct damon_ctx *damon_new_ctx(void) ctx->sample_interval = 5 * 1000; ctx->aggr_interval = 100 * 1000; - ctx->primitive_update_interval = 60 * 1000 * 1000; + ctx->ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); - ctx->last_primitive_update = ctx->last_aggregation; + ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); @@ -224,8 +224,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next_t; - if (ctx->primitive.cleanup) { - ctx->primitive.cleanup(ctx); + if (ctx->ops.cleanup) { + ctx->ops.cleanup(ctx); return; } @@ -250,7 +250,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * @ctx: monitoring context * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations - * @primitive_upd_int: time interval between monitoring primitive updates + * @ops_upd_int: time interval between monitoring operations updates * @min_nr_reg: minimal number of regions * @max_nr_reg: maximum number of regions * @@ -260,7 +260,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * Return: 0 on success, negative error code otherwise. 
*/ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long aggr_int, unsigned long ops_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg) { if (min_nr_reg < 3) @@ -270,7 +270,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; - ctx->primitive_update_interval = primitive_upd_int; + ctx->ops_update_interval = ops_upd_int; ctx->min_nr_regions = min_nr_reg; ctx->max_nr_regions = max_nr_reg; @@ -516,10 +516,10 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, { bool ret = __damos_valid_target(r, s); - if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + if (!ret || !s->quota.esz || !c->ops.get_scheme_score) return ret; - return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; + return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -576,7 +576,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Apply the scheme */ - if (c->primitive.apply_scheme) { + if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, @@ -586,7 +586,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -660,7 +660,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damos_set_effective_quota(quota); } - if (!c->primitive.get_scheme_score) + if (!c->ops.get_scheme_score) continue; /* Fill up the score histogram */ @@ -669,7 +669,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; - score = c->primitive.get_scheme_score( + score = c->ops.get_scheme_score( c, t, r, s); quota->histogram[score] += r->ar.end - r->ar.start; @@ -848,14 +848,15 @@ static void kdamond_split_regions(struct damon_ctx *ctx) } /* - * Check whether it is time to check and apply the target monitoring regions + * Check whether it is time to check and apply the operations-related data + * structures. * * Returns true if it is. 
*/ -static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +static bool kdamond_need_update_operations(struct damon_ctx *ctx) { - return damon_check_reset_time_interval(&ctx->last_primitive_update, - ctx->primitive_update_interval); + return damon_check_reset_time_interval(&ctx->last_ops_update, + ctx->ops_update_interval); } /* @@ -873,11 +874,11 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (kthread_should_stop()) return true; - if (!ctx->primitive.target_valid) + if (!ctx->ops.target_valid) return false; damon_for_each_target(t, ctx) { - if (ctx->primitive.target_valid(t)) + if (ctx->ops.target_valid(t)) return false; } @@ -976,8 +977,8 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); - if (ctx->primitive.init) - ctx->primitive.init(ctx); + if (ctx->ops.init) + ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) done = true; @@ -987,16 +988,16 @@ static int kdamond_fn(void *data) if (kdamond_wait_activation(ctx)) continue; - if (ctx->primitive.prepare_access_checks) - ctx->primitive.prepare_access_checks(ctx); + if (ctx->ops.prepare_access_checks) + ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) done = true; kdamond_usleep(ctx->sample_interval); - if (ctx->primitive.check_accesses) - max_nr_accesses = ctx->primitive.check_accesses(ctx); + if (ctx->ops.check_accesses) + max_nr_accesses = ctx->ops.check_accesses(ctx); if (kdamond_aggregate_interval_passed(ctx)) { kdamond_merge_regions(ctx, @@ -1008,13 +1009,13 @@ static int kdamond_fn(void *data) kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); - if (ctx->primitive.reset_aggregated) - ctx->primitive.reset_aggregated(ctx); + if (ctx->ops.reset_aggregated) + ctx->ops.reset_aggregated(ctx); } - if (kdamond_need_update_primitive(ctx)) { - if (ctx->primitive.update) - ctx->primitive.update(ctx); + if (kdamond_need_update_operations(ctx)) { + if (ctx->ops.update) + ctx->ops.update(ctx); sz_limit = damon_region_sz_limit(ctx); } } @@ -1025,8 +1026,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_terminate) ctx->callback.before_terminate(ctx); - if (ctx->primitive.cleanup) - ctx->primitive.cleanup(ctx); + if (ctx->ops.cleanup) + ctx->ops.cleanup(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 0d3a14c00acfb..8f7f325950559 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -74,7 +74,7 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) char buf[64]; /* Make DAMON consider target has no pid */ - ctx->primitive = (struct damon_primitive){}; + ctx->ops = (struct damon_operations){}; dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 78ff645433c64..719278a8cc5eb 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -56,7 +56,7 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", ctx->sample_interval, ctx->aggr_interval, - ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->ops_update_interval, ctx->min_nr_regions, ctx->max_nr_regions); mutex_unlock(&ctx->kdamond_lock); @@ -277,7 +277,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, static inline bool target_has_pid(const struct damon_ctx *ctx) { - return 
ctx->primitive.target_valid == damon_va_target_valid; + return ctx->ops.target_valid == damon_va_target_valid; } static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) @@ -477,9 +477,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, /* Configure the context for the address space type */ if (id_is_pid) - damon_va_set_primitives(ctx); + damon_va_set_operations(ctx); else - damon_pa_set_primitives(ctx); + damon_pa_set_operations(ctx); ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) @@ -735,7 +735,7 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - damon_va_set_primitives(ctx); + damon_va_set_operations(ctx); ctx->callback.before_terminate = dbgfs_before_terminate; return ctx; } diff --git a/mm/damon/prmtv-common.c b/mm/damon/ops-common.c similarity index 99% rename from mm/damon/prmtv-common.c rename to mm/damon/ops-common.c index 92a04f5831d6b..e346cc10d1439 100644 --- a/mm/damon/prmtv-common.c +++ b/mm/damon/ops-common.c @@ -10,7 +10,7 @@ #include #include -#include "prmtv-common.h" +#include "ops-common.h" /* * Get an online page for a pfn if it's in the LRU list. Otherwise, returns diff --git a/mm/damon/prmtv-common.h b/mm/damon/ops-common.h similarity index 100% rename from mm/damon/prmtv-common.h rename to mm/damon/ops-common.h diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a6c5b24fba7a9..fbeb9a3236c5d 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -14,7 +14,7 @@ #include #include "../internal.h" -#include "prmtv-common.h" +#include "ops-common.h" static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) @@ -261,15 +261,15 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_pa_set_primitives(struct damon_ctx *ctx) +void damon_pa_set_operations(struct damon_ctx *ctx) { - ctx->primitive.init = NULL; - ctx->primitive.update = NULL; - ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks; - ctx->primitive.check_accesses = damon_pa_check_accesses; - ctx->primitive.reset_aggregated = NULL; - ctx->primitive.target_valid = damon_pa_target_valid; - ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = damon_pa_apply_scheme; - ctx->primitive.get_scheme_score = damon_pa_scheme_score; + ctx->ops.init = NULL; + ctx->ops.update = NULL; + ctx->ops.prepare_access_checks = damon_pa_prepare_access_checks; + ctx->ops.check_accesses = damon_pa_check_accesses; + ctx->ops.reset_aggregated = NULL; + ctx->ops.target_valid = damon_pa_target_valid; + ctx->ops.cleanup = NULL; + ctx->ops.apply_scheme = damon_pa_apply_scheme; + ctx->ops.get_scheme_score = damon_pa_scheme_score; } diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 29da37192e4a0..3c93095c793c4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -384,7 +384,7 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - damon_pa_set_primitives(ctx); + damon_pa_set_operations(ctx); ctx->callback.after_aggregation = damon_reclaim_after_aggregation; target = damon_new_target(); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index f0d0ba591792c..1a55bb6c36c3d 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -314,7 +314,7 @@ static struct kunit_case damon_test_cases[] = { }; static struct kunit_suite damon_test_suite = { - .name = "damon-primitives", + .name = "damon-operations", .test_cases = damon_test_cases, }; kunit_test_suite(damon_test_suite); diff --git a/mm/damon/vaddr.c 
b/mm/damon/vaddr.c index 6d3454dd3204b..c0eb32025f9ba 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -15,7 +15,7 @@ #include #include -#include "prmtv-common.h" +#include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION @@ -739,17 +739,17 @@ static int damon_va_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_va_set_primitives(struct damon_ctx *ctx) +void damon_va_set_operations(struct damon_ctx *ctx) { - ctx->primitive.init = damon_va_init; - ctx->primitive.update = damon_va_update; - ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; - ctx->primitive.check_accesses = damon_va_check_accesses; - ctx->primitive.reset_aggregated = NULL; - ctx->primitive.target_valid = damon_va_target_valid; - ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = damon_va_apply_scheme; - ctx->primitive.get_scheme_score = damon_va_scheme_score; + ctx->ops.init = damon_va_init; + ctx->ops.update = damon_va_update; + ctx->ops.prepare_access_checks = damon_va_prepare_access_checks; + ctx->ops.check_accesses = damon_va_check_accesses; + ctx->ops.reset_aggregated = NULL; + ctx->ops.target_valid = damon_va_target_valid; + ctx->ops.cleanup = NULL; + ctx->ops.apply_scheme = damon_va_apply_scheme; + ctx->ops.get_scheme_score = damon_va_scheme_score; } #include "vaddr-test.h" From 4acdcf5c4bc8a9891c15ca9c0d0e9ba64027903c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:49 -0700 Subject: [PATCH 435/737] mm/damon: let monitoring operations can be registered and selected In-kernel DAMON user code like DAMON debugfs interface should set 'struct damon_operations' of its 'struct damon_ctx' on its own. Therefore, the client code should depend on all supporting monitoring operations implementations that it could use. For example, DAMON debugfs interface depends on both vaddr and paddr, while some of the users are not always interested in both. To minimize such unnecessary dependencies, this commit makes the monitoring operations can be registered by implementing code and then dynamically selected by the user code without build-time dependency. Link: https://lkml.kernel.org/r/20220215184603.1479-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 18 ++++++++++++ mm/damon/core.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 00baeb42c18e2..076da277b249e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -253,11 +253,24 @@ struct damos { struct list_head list; }; +/** + * enum damon_ops_id - Identifier for each monitoring operations implementation + * + * @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces + * @DAMON_OPS_PADDR: Monitoring operations for the physical address space + */ +enum damon_ops_id { + DAMON_OPS_VADDR, + DAMON_OPS_PADDR, + NR_DAMON_OPS, +}; + struct damon_ctx; /** * struct damon_operations - Monitoring operations for given use cases. * + * @id: Identifier of this operations set. * @init: Initialize operations-related data structures. * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. @@ -277,6 +290,8 @@ struct damon_ctx; * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each * &damon_ctx.aggr_interval. 
* + * Each &struct damon_operations instance having valid @id can be registered + * via damon_register_ops() and selected by damon_select_ops() later. * @init should initialize operations-related data structures. For example, * this could be used to construct proper monitoring target regions and link * those to @damon_ctx.adaptive_targets. @@ -301,6 +316,7 @@ struct damon_ctx; * @cleanup is called from @kdamond just before its termination. */ struct damon_operations { + enum damon_ops_id id; void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); @@ -489,6 +505,8 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); +int damon_register_ops(struct damon_operations *ops); +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index be93fb1c34735..82e0a4620c4fe 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -25,6 +25,72 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +static DEFINE_MUTEX(damon_ops_lock); +static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; + +/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ +static bool damon_registered_ops_id(enum damon_ops_id id) +{ + struct damon_operations empty_ops = {}; + + if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops))) + return false; + return true; +} + +/** + * damon_register_ops() - Register a monitoring operations set to DAMON. + * @ops: monitoring operations set to register. + * + * This function registers a monitoring operations set of valid &struct + * damon_operations->id so that others can find and use them later. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_register_ops(struct damon_operations *ops) +{ + int err = 0; + + if (ops->id >= NR_DAMON_OPS) + return -EINVAL; + mutex_lock(&damon_ops_lock); + /* Fail for already registered ops */ + if (damon_registered_ops_id(ops->id)) { + err = -EINVAL; + goto out; + } + damon_registered_ops[ops->id] = *ops; +out: + mutex_unlock(&damon_ops_lock); + return err; +} + +/** + * damon_select_ops() - Select a monitoring operations to use with the context. + * @ctx: monitoring context to use the operations. + * @id: id of the registered monitoring operations to select. + * + * This function finds registered monitoring operations set of @id and make + * @ctx to use it. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) +{ + int err = 0; + + if (id >= NR_DAMON_OPS) + return -EINVAL; + + mutex_lock(&damon_ops_lock); + if (!damon_registered_ops_id(id)) + err = -EINVAL; + else + ctx->ops = damon_registered_ops[id]; + mutex_unlock(&damon_ops_lock); + return err; +} + /* * Construct a damon_region struct * From 2fe90ac3fad89173f1959b54c0225e1dc3dc57e2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:52 -0700 Subject: [PATCH 436/737] mm/damon/paddr,vaddr: register themselves to DAMON in subsys_initcall This commit makes the monitoring operations for the physical address space and virtual address spaces register themselves to DAMON in the subsys_initcall step. 
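To make the registration pattern concrete, below is a minimal sketch of how some other backend could plug in the same way. Every name containing "foo" is hypothetical and would also need its own entry in enum damon_ops_id; the real registrations for the virtual and physical address spaces are in the paddr.c and vaddr.c hunks that follow.

    /* Hypothetical example only: register a third operations set at boot. */
    static int __init damon_foo_initcall(void)
    {
        struct damon_operations ops = {
            .id = DAMON_OPS_FOO,    /* hypothetical id; must be < NR_DAMON_OPS */
            .init = damon_foo_init,
            .update = damon_foo_update,
            .prepare_access_checks = damon_foo_prepare_access_checks,
            .check_accesses = damon_foo_check_accesses,
            .target_valid = damon_foo_target_valid,
            /* callbacks left NULL are simply skipped by kdamond */
        };

        return damon_register_ops(&ops);
    }
    subsys_initcall(damon_foo_initcall);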
Later, in-kernel DAMON user code can use them via damon_select_ops() without have to unnecessarily depend on all possible monitoring operations implementations. Link: https://lkml.kernel.org/r/20220215184603.1479-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/vaddr.c | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index fbeb9a3236c5d..416038bc53633 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -273,3 +273,23 @@ void damon_pa_set_operations(struct damon_ctx *ctx) ctx->ops.apply_scheme = damon_pa_apply_scheme; ctx->ops.get_scheme_score = damon_pa_scheme_score; } + +static int __init damon_pa_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_PADDR, + .init = NULL, + .update = NULL, + .prepare_access_checks = damon_pa_prepare_access_checks, + .check_accesses = damon_pa_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_pa_target_valid, + .cleanup = NULL, + .apply_scheme = damon_pa_apply_scheme, + .get_scheme_score = damon_pa_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_pa_initcall); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c0eb32025f9ba..87475ba37bec9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -752,4 +752,24 @@ void damon_va_set_operations(struct damon_ctx *ctx) ctx->ops.get_scheme_score = damon_va_scheme_score; } +static int __init damon_va_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_VADDR, + .init = damon_va_init, + .update = damon_va_update, + .prepare_access_checks = damon_va_prepare_access_checks, + .check_accesses = damon_va_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_va_target_valid, + .cleanup = NULL, + .apply_scheme = damon_va_apply_scheme, + .get_scheme_score = damon_va_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_va_initcall); + #include "vaddr-test.h" From 967f6671bf474e08dc8f07db532a06b182bd6b5f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:55 -0700 Subject: [PATCH 437/737] mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations() This commit makes DAMON_RECLAIM to select the registered monitoring operations for the physical address space instead of setting it on its own. This allows DAMON_RECLAIM be independent of DAMON_PADDR, but leave the dependency as is, because it's the only one monitoring operations it use, and therefore it makes no sense to build DAMON_RECLAIM without DAMON_PADDR. 
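In other words, the pattern an in-kernel user is expected to follow after this change is roughly the sketch below. It is a simplified version of what the reclaim hunk further down does; the context is freed on failure here just to keep the example self-contained.

    struct damon_ctx *ctx = damon_new_ctx();

    if (!ctx)
        return -ENOMEM;
    /* Pick the registered physical address space operations. */
    if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
        damon_destroy_ctx(ctx);
        return -EINVAL;
    }
    /* From here on, ctx->ops holds the paddr implementation. */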
Link: https://lkml.kernel.org/r/20220215184603.1479-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c93095c793c4..b53d9c22fad15 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -384,7 +384,9 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - damon_pa_set_operations(ctx); + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + return -EINVAL; + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; target = damon_new_target(); From 622882215723b94ac2b35380044626ab6bf29ccc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:48:58 -0700 Subject: [PATCH 438/737] mm/damon/dbgfs: use damon_select_ops() instead of damon_{v,p}a_set_operations() This commit makes DAMON debugfs interface to select the registered monitoring operations for the physical address space or virtual address spaces depending on user requests instead of setting it on its own. Note that DAMON debugfs interface is still dependent to DAMON_VADDR with this change, because it is also using its symbol, 'damon_va_target_valid'. Link: https://lkml.kernel.org/r/20220215184603.1479-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 719278a8cc5eb..8bf9e38b60f47 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -474,12 +474,18 @@ static ssize_t dbgfs_target_ids_write(struct file *file, /* remove previously set targets */ dbgfs_set_targets(ctx, 0, NULL); + if (!nr_targets) { + ret = count; + goto unlock_out; + } /* Configure the context for the address space type */ if (id_is_pid) - damon_va_set_operations(ctx); + ret = damon_select_ops(ctx, DAMON_OPS_VADDR); else - damon_pa_set_operations(ctx); + ret = damon_select_ops(ctx, DAMON_OPS_PADDR); + if (ret) + goto unlock_out; ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) @@ -735,7 +741,11 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - damon_va_set_operations(ctx); + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && damon_select_ops(ctx, + DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return NULL; + } ctx->callback.before_terminate = dbgfs_before_terminate; return ctx; } From ae6069054d0fd36005ada05bd5cbd7a952d601d1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:01 -0700 Subject: [PATCH 439/737] mm/damon/dbgfs: use operations id for knowing if the target has pid DAMON debugfs interface depends on monitoring operations for virtual address spaces because it knows if the target has pid or not by seeing if the context is configured to use one of the virtual address space monitoring operation functions. We can replace that check with 'enum damon_ops_id' now, to make it independent. This commit makes the change. 
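Spelled out, the dependency being removed is a symbol-level one: naming damon_va_target_valid in the comparison forces the debugfs code to be built together with the vaddr operations. A side-by-side sketch of the check before and after (the actual change is the one-line hunk below):

    /* before: compares a function pointer, so dbgfs must see the vaddr symbol */
    return ctx->ops.target_valid == damon_va_target_valid;

    /* after: compares only the numeric operations id */
    return ctx->ops.id == DAMON_OPS_VADDR;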
Link: https://lkml.kernel.org/r/20220215184603.1479-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 8bf9e38b60f47..05b574cbcea81 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -277,7 +277,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, static inline bool target_has_pid(const struct damon_ctx *ctx) { - return ctx->ops.target_valid == damon_va_target_valid; + return ctx->ops.id == DAMON_OPS_VADDR; } static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) @@ -741,8 +741,8 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - if (damon_select_ops(ctx, DAMON_OPS_VADDR) && damon_select_ops(ctx, - DAMON_OPS_PADDR)) { + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && + damon_select_ops(ctx, DAMON_OPS_PADDR)) { damon_destroy_ctx(ctx); return NULL; } From 202f1cf9bc4427c4514a85dcf218bdc43fe881a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:04 -0700 Subject: [PATCH 440/737] mm/damon/dbgfs-test: fix is_target_id() change DAMON kunit tests for DAMON debugfs interface fails because it still assumes setting empty monitoring operations makes DAMON debugfs interface believe the target of the context don't have pid. This commit fixes the kunit test fails by explicitly setting the context's monitoring operations with the operations for the physical address space, which let debugfs knows the target will not have pid. Link: https://lkml.kernel.org/r/20220215184603.1479-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs-test.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 8f7f325950559..0bb0d532b1590 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -74,7 +74,7 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) char buf[64]; /* Make DAMON consider target has no pid */ - ctx->ops = (struct damon_operations){}; + damon_select_ops(ctx, DAMON_OPS_PADDR); dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); @@ -111,6 +111,8 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; + damon_select_ops(ctx, DAMON_OPS_PADDR); + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ From ec42f90aa6a70647b96ebf7d5bfd606488efe5fb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:07 -0700 Subject: [PATCH 441/737] mm/damon/paddr,vaddr: remove damon_{p,v}a_{target_valid,set_operations}() Because DAMON debugfs interface and DAMON-based proactive reclaim are now using monitoring operations via registration mechanism, damon_{p,v}a_{target_valid,set_operations}() functions have no user. This commit clean them up. 
Link: https://lkml.kernel.org/r/20220215184603.1479-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ---------- mm/damon/paddr.c | 20 +------------------- mm/damon/vaddr.c | 15 +-------------- 3 files changed, 2 insertions(+), 43 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 076da277b249e..49c4a11ecf200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -513,14 +513,4 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ -#ifdef CONFIG_DAMON_VADDR -bool damon_va_target_valid(void *t); -void damon_va_set_operations(struct damon_ctx *ctx); -#endif /* CONFIG_DAMON_VADDR */ - -#ifdef CONFIG_DAMON_PADDR -bool damon_pa_target_valid(void *t); -void damon_pa_set_operations(struct damon_ctx *ctx); -#endif /* CONFIG_DAMON_PADDR */ - #endif /* _DAMON_H */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 416038bc53633..ffcd6047fea85 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -208,11 +208,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -bool damon_pa_target_valid(void *t) -{ - return true; -} - static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -261,19 +256,6 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_pa_set_operations(struct damon_ctx *ctx) -{ - ctx->ops.init = NULL; - ctx->ops.update = NULL; - ctx->ops.prepare_access_checks = damon_pa_prepare_access_checks; - ctx->ops.check_accesses = damon_pa_check_accesses; - ctx->ops.reset_aggregated = NULL; - ctx->ops.target_valid = damon_pa_target_valid; - ctx->ops.cleanup = NULL; - ctx->ops.apply_scheme = damon_pa_apply_scheme; - ctx->ops.get_scheme_score = damon_pa_scheme_score; -} - static int __init damon_pa_initcall(void) { struct damon_operations ops = { @@ -283,7 +265,7 @@ static int __init damon_pa_initcall(void) .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, .reset_aggregated = NULL, - .target_valid = damon_pa_target_valid, + .target_valid = NULL, .cleanup = NULL, .apply_scheme = damon_pa_apply_scheme, .get_scheme_score = damon_pa_scheme_score, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 87475ba37bec9..b2ec0aa1ff451 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -653,7 +653,7 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(void *target) { struct damon_target *t = target; struct task_struct *task; @@ -739,19 +739,6 @@ static int damon_va_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_va_set_operations(struct damon_ctx *ctx) -{ - ctx->ops.init = damon_va_init; - ctx->ops.update = damon_va_update; - ctx->ops.prepare_access_checks = damon_va_prepare_access_checks; - ctx->ops.check_accesses = damon_va_check_accesses; - ctx->ops.reset_aggregated = NULL; - ctx->ops.target_valid = damon_va_target_valid; - ctx->ops.cleanup = NULL; - ctx->ops.apply_scheme = damon_va_apply_scheme; - ctx->ops.get_scheme_score = damon_va_scheme_score; -} - static int __init damon_va_initcall(void) { struct damon_operations ops = { From 1f48b8917b4ef29122448ddb079985a26be4ce60 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Tue, 
22 Mar 2022 14:49:09 -0700 Subject: [PATCH 442/737] mm/damon: remove unnecessary CONFIG_DAMON option In mm/Makefile has: obj-$(CONFIG_DAMON) += damon/ So that we don't need 'obj-$(CONFIG_DAMON) :=' in mm/damon/Makefile, delete it from mm/damon/Makefile. Link: https://lkml.kernel.org/r/20220221065255.19991-1-tangmeng@uniontech.com Signed-off-by: tangmeng Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 03931472991a4..aebbf6c14c51f 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_DAMON) := core.o +obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o From b112960dc40678f9758d246bc20fde545f6a42e6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:18 -0700 Subject: [PATCH 443/737] Docs/damon: update outdated term 'regions update interval' Before DAMON is merged in the mainline, the concept of 'regions update interval' has generalized to be used as the time interval for update of any monitoring operations related data structure, but the document has not updated properly. This commit updates the document for better consistency. Link: https://lkml.kernel.org/r/20220222170100.17068-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 6 +++--- Documentation/vm/damon/design.rst | 12 +++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1e06435b8ff67..b6ec650873b2f 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -47,7 +47,7 @@ Attributes ---------- Users can get and set the ``sampling interval``, ``aggregation interval``, -``regions update interval``, and min/max number of monitoring target regions by +``update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. To know about the monitoring attributes in detail, please refer to the :doc:`/vm/damon/design`. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and @@ -128,8 +128,8 @@ ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one Note that this sets the initial monitoring target regions only. In case of virtual memory monitoring, DAMON will automatically updates the boundary of the -regions after one ``regions update interval``. Therefore, users should set the -``regions update interval`` large enough in this case, if they don't want the +regions after one ``update interval``. Therefore, users should set the +``update interval`` large enough in this case, if they don't want the update. diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst index 210f0f50efd81..60b2c22d4e104 100644 --- a/Documentation/vm/damon/design.rst +++ b/Documentation/vm/damon/design.rst @@ -94,8 +94,8 @@ Address Space Independent Core Mechanisms Below four sections describe each of the DAMON core mechanisms and the five monitoring attributes, ``sampling interval``, ``aggregation interval``, -``regions update interval``, ``minimum number of regions``, and ``maximum -number of regions``. 
+``update interval``, ``minimum number of regions``, and ``maximum number of +regions``. Access Frequency Monitoring @@ -168,6 +168,8 @@ The monitoring target address range could dynamically changed. For example, virtual memory could be dynamically mapped and unmapped. Physical memory could be hot-plugged. -As the changes could be quite frequent in some cases, DAMON checks the dynamic -memory mapping changes and applies it to the abstracted target area only for -each of a user-specified time interval (``regions update interval``). +As the changes could be quite frequent in some cases, DAMON allows the +monitoring operations to check dynamic changes including memory mapping changes +and applies it to monitoring operations-related data structures such as the +abstracted monitoring target memory area only for each of a user-specified time +interval (``update interval``). From 54e86fc098803b0d8e45f8e08c501cce5ddfa667 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:21 -0700 Subject: [PATCH 444/737] mm/damon/core: allow non-exclusive DAMON start/stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce DAMON sysfs interface", v3. Introduction ============ DAMON's debugfs-based user interface (DAMON_DBGFS) served very well, so far. However, it unnecessarily depends on debugfs, while DAMON is not aimed to be used for only debugging. Also, the interface receives multiple values via one file. For example, schemes file receives 18 values. As a result, it is inefficient, hard to be used, and difficult to be extended. Especially, keeping backward compatibility of user space tools is getting only challenging. It would be better to implement another reliable and flexible interface and deprecate DAMON_DBGFS in long term. For the reason, this patchset introduces a sysfs-based new user interface of DAMON. The idea of the new interface is, using directory hierarchies and having one dedicated file for each value. For a short example, users can do the virtual address monitoring via the interface as below: # cd /sys/kernel/mm/damon/admin/ # echo 1 > kdamonds/nr_kdamonds # echo 1 > kdamonds/0/contexts/nr_contexts # echo vaddr > kdamonds/0/contexts/0/operations # echo 1 > kdamonds/0/contexts/0/targets/nr_targets # echo $(pidof ) > kdamonds/0/contexts/0/targets/0/pid_target # echo on > kdamonds/0/state A brief representation of the files hierarchy of DAMON sysfs interface is as below. Childs are represented with indentation, directories are having '/' suffix, and files in each directory are separated by comma. /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ │ regions/nr_regions │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ schemes/nr_schemes │ │ │ │ │ │ 0/action │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... Detailed usage of the files will be described in the final Documentation patch of this patchset. 
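For readers who prefer driving these files programmatically rather than from a shell, a minimal user-space sketch that performs the same sequence as the commands above could look as follows. The sysfs paths are the ones listed in the hierarchy; the write_file() helper and the hard-coded PID (1234) are only illustrative assumptions, not part of this patchset:

    #include <stdio.h>

    /* Illustrative only: write one value to one sysfs file. */
    static int write_file(const char *path, const char *val)
    {
    	FILE *f = fopen(path, "w");

    	if (!f)
    		return -1;
    	if (fputs(val, f) == EOF) {
    		fclose(f);
    		return -1;
    	}
    	return fclose(f);
    }

    int main(void)
    {
    	const char *root = "/sys/kernel/mm/damon/admin/kdamonds";
    	char path[256];

    	snprintf(path, sizeof(path), "%s/nr_kdamonds", root);
    	write_file(path, "1");
    	snprintf(path, sizeof(path), "%s/0/contexts/nr_contexts", root);
    	write_file(path, "1");
    	snprintf(path, sizeof(path), "%s/0/contexts/0/operations", root);
    	write_file(path, "vaddr");
    	snprintf(path, sizeof(path), "%s/0/contexts/0/targets/nr_targets", root);
    	write_file(path, "1");
    	snprintf(path, sizeof(path), "%s/0/contexts/0/targets/0/pid_target", root);
    	write_file(path, "1234");	/* PID of the monitored workload */
    	snprintf(path, sizeof(path), "%s/0/state", root);
    	return write_file(path, "on");
    }

After the final write, the PID of the started kdamond worker thread can be read back from 'kdamonds/0/pid'.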
Main Difference Between DAMON_DBGFS and DAMON_SYSFS --------------------------------------------------- At the moment, DAMON_DBGFS and DAMON_SYSFS provide the same features. One important difference between them is their exclusiveness. DAMON_DBGFS works in an exclusive manner, so that no two DAMON worker threads (kdamonds) in the system can run concurrently and interfere with each other. For that reason, DAMON_DBGFS asks users to construct all monitoring contexts and start them at once. It's not a big problem but makes the operation a little bit complex and inflexible. For more flexible usage, DAMON_SYSFS moves the responsibility of preventing any possible interference to the admins and works in a non-exclusive manner. That is, users can configure and start contexts one by one. Note that DAMON respects both exclusive groups and non-exclusive groups of contexts, in a manner similar to that of reader-writer locks. That is, if any exclusive monitoring contexts (e.g., contexts started via DAMON_DBGFS) are running, DAMON_SYSFS does not start new contexts, and vice versa. Future Plan of DAMON_DBGFS Deprecation ====================================== Once this patchset is merged, DAMON_DBGFS development will be frozen. That is, we will maintain it to work as it does now so that no users will be broken. But it will not be extended to provide any new feature of DAMON. The support will be continued only until the next LTS release. After that, we will drop DAMON_DBGFS. User-space Tooling Compatibility -------------------------------- As DAMON_SYSFS provides all features of DAMON_DBGFS, all user space tooling can move to DAMON_SYSFS. As we will continue supporting DAMON_DBGFS until the next LTS kernel release, user space tools would have enough time to move to DAMON_SYSFS. The official user space tool, damo[1], already supports both DAMON_SYSFS and DAMON_DBGFS. Both correctness tests[2] and performance tests[3] of DAMON using DAMON_SYSFS also passed. [1] https://github.com/awslabs/damo [2] https://github.com/awslabs/damon-tests/tree/master/corr [3] https://github.com/awslabs/damon-tests/tree/master/perf Sequence of Patches =================== The first two patches (patches 1-2) make core changes for DAMON_SYSFS. The first one (patch 1) allows non-exclusive DAMON contexts so that DAMON_SYSFS can work in non-exclusive mode, while the second one (patch 2) adds the number of values for each DAMON enum type so that DAMON API users can safely iterate the enums. The third patch (patch 3) implements a basic sysfs stub for virtual address space monitoring. Note that it implements only the sysfs files; DAMON is not linked yet. The fourth patch (patch 4) links DAMON_SYSFS to DAMON so that users can control DAMON using the sysfs files. The following six patches (patches 5-10) implement the other DAMON features that DAMON_DBGFS supports, one by one (physical address space monitoring, DAMON-based operation schemes, schemes quotas, schemes prioritization weights, schemes watermarks, and schemes stats). The following patch (patch 11) adds a simple selftest for DAMON_SYSFS, and the final one (patch 12) documents DAMON_SYSFS. This patch (of 12): To avoid interference between DAMON contexts monitoring overlapping memory regions, damon_start() works in an exclusive manner. That is, damon_start() does nothing but fails if any context that was started by another instance of the function is still running. This makes its usage a little bit restrictive. However, admins could be aware of each DAMON usage and address such interferences on their own in some cases.
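What a selectable non-exclusive mode needs is a small arbitration check at start time. A condensed sketch of that check, using the nr_running_ctxs and running_exclusive_ctxs state which the hunk further below introduces (locking and the per-context start loop are elided, so this is an approximation rather than the actual implementation):

    #include <errno.h>	/* for EBUSY; the kernel code returns -EBUSY */
    #include <stdbool.h>

    static int nr_running_ctxs;		/* contexts currently being monitored */
    static bool running_exclusive_ctxs;	/* true if the running group is exclusive */

    int damon_start_sketch(int nr_ctxs, bool exclusive)
    {
    	/*
    	 * An exclusive group cannot start while anything else is running,
    	 * and a non-exclusive group cannot start while an exclusive group
    	 * is running.
    	 */
    	if ((exclusive && nr_running_ctxs) ||
    			(!exclusive && running_exclusive_ctxs))
    		return -EBUSY;

    	nr_running_ctxs += nr_ctxs;	/* the real code starts one kdamond per context */
    	if (exclusive && nr_running_ctxs)
    		running_exclusive_ctxs = true;
    	return 0;
    }

When the last kdamond of the group exits, the real code clears running_exclusive_ctxs again, as the kdamond_fn() hunk below shows.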
This commit hence implements non-exclusive mode of the function and allows the callers to select the mode. Note that the exclusive groups and non-exclusive groups of contexts will respect each other in a manner similar to that of reader-writer locks. Therefore, this commit will not cause any behavioral change to the exclusive groups. Link: https://lkml.kernel.org/r/20220228081314.5770-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220228081314.5770-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: David Rientjes Cc: Xin Hao Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- mm/damon/core.c | 23 +++++++++++++++-------- mm/damon/dbgfs.c | 2 +- mm/damon/reclaim.c | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 49c4a11ecf200..f8e99e47d7472 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -508,7 +508,7 @@ int damon_nr_running_ctxs(void); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); -int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 82e0a4620c4fe..c1e0fed4e877f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -24,6 +24,7 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +static bool running_exclusive_ctxs; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; @@ -434,22 +435,25 @@ static int __damon_start(struct damon_ctx *ctx) * damon_start() - Starts the monitorings for a given group of contexts. * @ctxs: an array of the pointers for contexts to start monitoring * @nr_ctxs: size of @ctxs + * @exclusive: exclusiveness of this contexts group * * This function starts a group of monitoring threads for a group of monitoring * contexts. One thread per each context is created and run in parallel. The - * caller should handle synchronization between the threads by itself. If a - * group of threads that created by other 'damon_start()' call is currently - * running, this function does nothing but returns -EBUSY. + * caller should handle synchronization between the threads by itself. If + * @exclusive is true and a group of threads that created by other + * 'damon_start()' call is currently running, this function does nothing but + * returns -EBUSY. * * Return: 0 on success, negative error code otherwise. */ -int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) { int i; int err = 0; mutex_lock(&damon_lock); - if (nr_running_ctxs) { + if ((exclusive && nr_running_ctxs) || + (!exclusive && running_exclusive_ctxs)) { mutex_unlock(&damon_lock); return -EBUSY; } @@ -460,13 +464,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs) break; nr_running_ctxs++; } + if (exclusive && nr_running_ctxs) + running_exclusive_ctxs = true; mutex_unlock(&damon_lock); return err; } /* - * __damon_stop() - Stops monitoring of given context. + * __damon_stop() - Stops monitoring of a given context. * @ctx: monitoring context * * Return: 0 on success, negative error code otherwise. 
@@ -504,9 +510,8 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) /* nr_running_ctxs is decremented in kdamond_fn */ err = __damon_stop(ctxs[i]); if (err) - return err; + break; } - return err; } @@ -1102,6 +1107,8 @@ static int kdamond_fn(void *data) mutex_lock(&damon_lock); nr_running_ctxs--; + if (!nr_running_ctxs && running_exclusive_ctxs) + running_exclusive_ctxs = false; mutex_unlock(&damon_lock); return 0; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 05b574cbcea81..a0dab8b5e45f2 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -967,7 +967,7 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } } - ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); } else if (!strncmp(kbuf, "off", count)) { ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); } else { diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index b53d9c22fad15..e34c4d0c4d939 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -330,7 +330,7 @@ static int damon_reclaim_turn(bool on) if (err) goto free_scheme_out; - err = damon_start(&ctx, 1); + err = damon_start(&ctx, 1, true); if (!err) { kdamond_pid = ctx->kdamond->pid; return 0; From b0c093af421d11aa9efacc760aa3aa20970edf73 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:24 -0700 Subject: [PATCH 445/737] mm/damon/core: add number of each enum type values This commit declares the number of legal values for each DAMON enum types to make traversals of such DAMON enum types easy and safe. Link: https://lkml.kernel.org/r/20220228081314.5770-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index f8e99e47d7472..f23cbfa4248d4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -87,6 +87,7 @@ struct damon_target { * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_STAT: Do nothing but count the stat. + * @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ enum damos_action { DAMOS_WILLNEED, @@ -95,6 +96,7 @@ enum damos_action { DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, DAMOS_STAT, /* Do nothing but only record the stat */ + NR_DAMOS_ACTIONS, }; /** @@ -157,10 +159,12 @@ struct damos_quota { * * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + * @NR_DAMOS_WMARK_METRICS: Total number of DAMOS watermark metrics */ enum damos_wmark_metric { DAMOS_WMARK_NONE, DAMOS_WMARK_FREE_MEM_RATE, + NR_DAMOS_WMARK_METRICS, }; /** From cb8d484ff0a3f914dcfcd208982f5cdb64dbdab0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:27 -0700 Subject: [PATCH 446/737] mm/damon: implement a minimal stub for sysfs-based DAMON interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DAMON's debugfs-based user interface served very well, so far. However, it unnecessarily depends on debugfs, while DAMON is not aimed to be used for only debugging. Also, the interface receives multiple values via one file. For example, schemes file receives 18 values separated by white spaces. As a result, it is ineffient, hard to be used, and difficult to be extended. 
In particular, keeping backward compatibility for user space tools is becoming ever more challenging. It would be better to implement another reliable and flexible interface and deprecate the debugfs interface in the long term. To this end, this commit implements a stub of a part of the new user interface of DAMON using sysfs. Specifically, this commit implements the sysfs control parts for virtual address space monitoring. More specifically, the idea of the new interface is to use directory hierarchies and to have one file for one value. The hierarchy that this commit introduces is as below. In the figure below, parent-child relations are represented with indentation, each directory has a ``/`` suffix, and files in each directory are separated by commas (","). /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... Writing a number ('N') to each 'nr' file makes directories named '0' to 'N-1' in the directory of the 'nr' file. That's all this commit does. Writing proper values to relevant files will construct the DAMON contexts, and writing a special keyword, 'on', to 'state' files for each kdamond will ask DAMON to start the constructed contexts. As a short example, the commands below could be used for monitoring the virtual address space of a given workload: # cd /sys/kernel/mm/damon/admin/ # echo 1 > kdamonds/nr_kdamonds # echo 1 > kdamonds/0/contexts/nr_contexts # echo vaddr > kdamonds/0/contexts/0/operations # echo 1 > kdamonds/0/contexts/0/targets/nr_targets # echo $(pidof ) > kdamonds/0/contexts/0/targets/0/pid_target # echo on > kdamonds/0/state Please note that this commit implements only the sysfs stub, as mentioned above. This commit doesn't implement the special keywords for 'state' files. The following commits will do that. [jiapeng.chong@linux.alibaba.com: fix missing error code in damon_sysfs_attrs_add_dirs()] Link: https://lkml.kernel.org/r/20220302111120.24984-1-jiapeng.chong@linux.alibaba.com Link: https://lkml.kernel.org/r/20220228081314.5770-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Jiapeng Chong Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 7 + mm/damon/Makefile | 1 + mm/damon/sysfs.c | 1084 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1092 insertions(+) create mode 100644 mm/damon/sysfs.c diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 01bad77ad7ae6..9b559c76d6dd1 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -52,6 +52,13 @@ config DAMON_VADDR_KUNIT_TEST If unsure, say N. +config DAMON_SYSFS + bool "DAMON sysfs interface" + depends on DAMON && SYSFS + help + This builds the sysfs interface for DAMON. The user space can use + the interface for arbitrary data access monitoring.
+ config DAMON_DBGFS bool "DAMON debugfs interface" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS diff --git a/mm/damon/Makefile b/mm/damon/Makefile index aebbf6c14c51f..dbf7190b4144a 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,5 +3,6 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c new file mode 100644 index 0000000000000..4455e4bef88db --- /dev/null +++ b/mm/damon/sysfs.c @@ -0,0 +1,1084 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(damon_sysfs_lock); + +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return -EINVAL; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return -EINVAL; + + range->max = max; + return count; +} + +static void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +static struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + +/* + * target directory + */ + +struct damon_sysfs_target { + struct kobject kobj; + int pid; +}; + +static struct damon_sysfs_target *damon_sysfs_target_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL); +} + +static ssize_t pid_target_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct 
damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + + return sysfs_emit(buf, "%d\n", target->pid); +} + +static ssize_t pid_target_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + int err = kstrtoint(buf, 0, &target->pid); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_target_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_target, kobj)); +} + +static struct kobj_attribute damon_sysfs_target_pid_attr = + __ATTR_RW_MODE(pid_target, 0600); + +static struct attribute *damon_sysfs_target_attrs[] = { + &damon_sysfs_target_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_target); + +static struct kobj_type damon_sysfs_target_ktype = { + .release = damon_sysfs_target_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_target_groups, +}; + +/* + * targets directory + */ + +struct damon_sysfs_targets { + struct kobject kobj; + struct damon_sysfs_target **targets_arr; + int nr; +}; + +static struct damon_sysfs_targets *damon_sysfs_targets_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_targets), GFP_KERNEL); +} + +static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets) +{ + struct damon_sysfs_target **targets_arr = targets->targets_arr; + int i; + + for (i = 0; i < targets->nr; i++) + kobject_put(&targets_arr[i]->kobj); + targets->nr = 0; + kfree(targets_arr); + targets->targets_arr = NULL; +} + +static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets, + int nr_targets) +{ + struct damon_sysfs_target **targets_arr, *target; + int err, i; + + damon_sysfs_targets_rm_dirs(targets); + if (!nr_targets) + return 0; + + targets_arr = kmalloc_array(nr_targets, sizeof(*targets_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!targets_arr) + return -ENOMEM; + targets->targets_arr = targets_arr; + + for (i = 0; i < nr_targets; i++) { + target = damon_sysfs_target_alloc(); + if (!target) { + damon_sysfs_targets_rm_dirs(targets); + return -ENOMEM; + } + + err = kobject_init_and_add(&target->kobj, + &damon_sysfs_target_ktype, &targets->kobj, + "%d", i); + if (err) + goto out; + + targets_arr[i] = target; + targets->nr++; + } + return 0; + +out: + damon_sysfs_targets_rm_dirs(targets); + kobject_put(&target->kobj); + return err; +} + +static ssize_t nr_targets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_targets *targets = container_of(kobj, + struct damon_sysfs_targets, kobj); + + return sysfs_emit(buf, "%d\n", targets->nr); +} + +static ssize_t nr_targets_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_targets *targets = container_of(kobj, + struct damon_sysfs_targets, kobj); + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_targets_add_dirs(targets, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_targets_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_targets, kobj)); +} + +static struct kobj_attribute damon_sysfs_targets_nr_attr = + __ATTR_RW_MODE(nr_targets, 0600); + +static struct attribute *damon_sysfs_targets_attrs[] = { + &damon_sysfs_targets_nr_attr.attr, + NULL, +}; 
+ATTRIBUTE_GROUPS(damon_sysfs_targets); + +static struct kobj_type damon_sysfs_targets_ktype = { + .release = damon_sysfs_targets_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_targets_groups, +}; + +/* + * intervals directory + */ + +struct damon_sysfs_intervals { + struct kobject kobj; + unsigned long sample_us; + unsigned long aggr_us; + unsigned long update_us; +}; + +static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( + unsigned long sample_us, unsigned long aggr_us, + unsigned long update_us) +{ + struct damon_sysfs_intervals *intervals = kmalloc(sizeof(*intervals), + GFP_KERNEL); + + if (!intervals) + return NULL; + + intervals->kobj = (struct kobject){}; + intervals->sample_us = sample_us; + intervals->aggr_us = aggr_us; + intervals->update_us = update_us; + return intervals; +} + +static ssize_t sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->sample_us); +} + +static ssize_t sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return -EINVAL; + + intervals->sample_us = us; + return count; +} + +static ssize_t aggr_us_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->aggr_us); +} + +static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return -EINVAL; + + intervals->aggr_us = us; + return count; +} + +static ssize_t update_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->update_us); +} + +static ssize_t update_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return -EINVAL; + + intervals->update_us = us; + return count; +} + +static void damon_sysfs_intervals_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_intervals, kobj)); +} + +static struct kobj_attribute damon_sysfs_intervals_sample_us_attr = + __ATTR_RW_MODE(sample_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_aggr_us_attr = + __ATTR_RW_MODE(aggr_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_update_us_attr = + __ATTR_RW_MODE(update_us, 0600); + +static struct attribute *damon_sysfs_intervals_attrs[] = { + &damon_sysfs_intervals_sample_us_attr.attr, + &damon_sysfs_intervals_aggr_us_attr.attr, + &damon_sysfs_intervals_update_us_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_intervals); + +static struct kobj_type damon_sysfs_intervals_ktype = { + .release = damon_sysfs_intervals_release, + .sysfs_ops = &kobj_sysfs_ops, + 
.default_groups = damon_sysfs_intervals_groups, +}; + +/* + * monitoring_attrs directory + */ + +struct damon_sysfs_attrs { + struct kobject kobj; + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; +}; + +static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void) +{ + struct damon_sysfs_attrs *attrs = kmalloc(sizeof(*attrs), GFP_KERNEL); + + if (!attrs) + return NULL; + attrs->kobj = (struct kobject){}; + return attrs; +} + +static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) +{ + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; + int err; + + intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000); + if (!intervals) + return -ENOMEM; + + err = kobject_init_and_add(&intervals->kobj, + &damon_sysfs_intervals_ktype, &attrs->kobj, + "intervals"); + if (err) + goto put_intervals_out; + attrs->intervals = intervals; + + nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); + if (!nr_regions_range) { + err = -ENOMEM; + goto put_intervals_out; + } + + err = kobject_init_and_add(&nr_regions_range->kobj, + &damon_sysfs_ul_range_ktype, &attrs->kobj, + "nr_regions"); + if (err) + goto put_nr_regions_intervals_out; + attrs->nr_regions_range = nr_regions_range; + return 0; + +put_nr_regions_intervals_out: + kobject_put(&nr_regions_range->kobj); + attrs->nr_regions_range = NULL; +put_intervals_out: + kobject_put(&intervals->kobj); + attrs->intervals = NULL; + return err; +} + +static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) +{ + kobject_put(&attrs->nr_regions_range->kobj); + kobject_put(&attrs->intervals->kobj); +} + +static void damon_sysfs_attrs_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_attrs, kobj)); +} + +static struct attribute *damon_sysfs_attrs_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_attrs); + +static struct kobj_type damon_sysfs_attrs_ktype = { + .release = damon_sysfs_attrs_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_attrs_groups, +}; + +/* + * context directory + */ + +/* This should match with enum damon_ops_id */ +static const char * const damon_sysfs_ops_strs[] = { + "vaddr", + "paddr", +}; + +struct damon_sysfs_context { + struct kobject kobj; + enum damon_ops_id ops_id; + struct damon_sysfs_attrs *attrs; + struct damon_sysfs_targets *targets; +}; + +static struct damon_sysfs_context *damon_sysfs_context_alloc( + enum damon_ops_id ops_id) +{ + struct damon_sysfs_context *context = kmalloc(sizeof(*context), + GFP_KERNEL); + + if (!context) + return NULL; + context->kobj = (struct kobject){}; + context->ops_id = ops_id; + return context; +} + +static int damon_sysfs_context_set_attrs(struct damon_sysfs_context *context) +{ + struct damon_sysfs_attrs *attrs = damon_sysfs_attrs_alloc(); + int err; + + if (!attrs) + return -ENOMEM; + err = kobject_init_and_add(&attrs->kobj, &damon_sysfs_attrs_ktype, + &context->kobj, "monitoring_attrs"); + if (err) + goto out; + err = damon_sysfs_attrs_add_dirs(attrs); + if (err) + goto out; + context->attrs = attrs; + return 0; + +out: + kobject_put(&attrs->kobj); + return err; +} + +static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context) +{ + struct damon_sysfs_targets *targets = damon_sysfs_targets_alloc(); + int err; + + if (!targets) + return -ENOMEM; + err = kobject_init_and_add(&targets->kobj, &damon_sysfs_targets_ktype, + &context->kobj, "targets"); + if (err) { + kobject_put(&targets->kobj); + return err; + } + 
context->targets = targets; + return 0; +} + +static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) +{ + int err; + + err = damon_sysfs_context_set_attrs(context); + if (err) + return err; + + err = damon_sysfs_context_set_targets(context); + if (err) + goto put_attrs_out; + return 0; + +put_attrs_out: + kobject_put(&context->attrs->kobj); + context->attrs = NULL; + return err; +} + +static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) +{ + damon_sysfs_attrs_rm_dirs(context->attrs); + kobject_put(&context->attrs->kobj); + damon_sysfs_targets_rm_dirs(context->targets); + kobject_put(&context->targets->kobj); +} + +static ssize_t operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]); +} + +static ssize_t operations_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + enum damon_ops_id id; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { + /* Support only vaddr */ + if (id != DAMON_OPS_VADDR) + return -EINVAL; + context->ops_id = id; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_context_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_context, kobj)); +} + +static struct kobj_attribute damon_sysfs_context_operations_attr = + __ATTR_RW_MODE(operations, 0600); + +static struct attribute *damon_sysfs_context_attrs[] = { + &damon_sysfs_context_operations_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_context); + +static struct kobj_type damon_sysfs_context_ktype = { + .release = damon_sysfs_context_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_context_groups, +}; + +/* + * contexts directory + */ + +struct damon_sysfs_contexts { + struct kobject kobj; + struct damon_sysfs_context **contexts_arr; + int nr; +}; + +static struct damon_sysfs_contexts *damon_sysfs_contexts_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_contexts), GFP_KERNEL); +} + +static void damon_sysfs_contexts_rm_dirs(struct damon_sysfs_contexts *contexts) +{ + struct damon_sysfs_context **contexts_arr = contexts->contexts_arr; + int i; + + for (i = 0; i < contexts->nr; i++) { + damon_sysfs_context_rm_dirs(contexts_arr[i]); + kobject_put(&contexts_arr[i]->kobj); + } + contexts->nr = 0; + kfree(contexts_arr); + contexts->contexts_arr = NULL; +} + +static int damon_sysfs_contexts_add_dirs(struct damon_sysfs_contexts *contexts, + int nr_contexts) +{ + struct damon_sysfs_context **contexts_arr, *context; + int err, i; + + damon_sysfs_contexts_rm_dirs(contexts); + if (!nr_contexts) + return 0; + + contexts_arr = kmalloc_array(nr_contexts, sizeof(*contexts_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!contexts_arr) + return -ENOMEM; + contexts->contexts_arr = contexts_arr; + + for (i = 0; i < nr_contexts; i++) { + context = damon_sysfs_context_alloc(DAMON_OPS_VADDR); + if (!context) { + damon_sysfs_contexts_rm_dirs(contexts); + return -ENOMEM; + } + + err = kobject_init_and_add(&context->kobj, + &damon_sysfs_context_ktype, &contexts->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_context_add_dirs(context); + if (err) + goto out; + + contexts_arr[i] = context; + contexts->nr++; + } + return 0; + +out: + 
damon_sysfs_contexts_rm_dirs(contexts); + kobject_put(&context->kobj); + return err; +} + +static ssize_t nr_contexts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_contexts *contexts = container_of(kobj, + struct damon_sysfs_contexts, kobj); + + return sysfs_emit(buf, "%d\n", contexts->nr); +} + +static ssize_t nr_contexts_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_contexts *contexts = container_of(kobj, + struct damon_sysfs_contexts, kobj); + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + /* TODO: support multiple contexts per kdamond */ + if (nr < 0 || 1 < nr) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_contexts_add_dirs(contexts, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_contexts_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_contexts, kobj)); +} + +static struct kobj_attribute damon_sysfs_contexts_nr_attr + = __ATTR_RW_MODE(nr_contexts, 0600); + +static struct attribute *damon_sysfs_contexts_attrs[] = { + &damon_sysfs_contexts_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_contexts); + +static struct kobj_type damon_sysfs_contexts_ktype = { + .release = damon_sysfs_contexts_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_contexts_groups, +}; + +/* + * kdamond directory + */ + +struct damon_sysfs_kdamond { + struct kobject kobj; + struct damon_sysfs_contexts *contexts; + struct damon_ctx *damon_ctx; +}; + +static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamond), GFP_KERNEL); +} + +static int damon_sysfs_kdamond_add_dirs(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_sysfs_contexts *contexts; + int err; + + contexts = damon_sysfs_contexts_alloc(); + if (!contexts) + return -ENOMEM; + + err = kobject_init_and_add(&contexts->kobj, + &damon_sysfs_contexts_ktype, &kdamond->kobj, + "contexts"); + if (err) { + kobject_put(&contexts->kobj); + return err; + } + kdamond->contexts = contexts; + + return err; +} + +static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) +{ + damon_sysfs_contexts_rm_dirs(kdamond->contexts); + kobject_put(&kdamond->contexts->kobj); +} + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return -EINVAL; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return -EINVAL; +} + +static ssize_t pid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return -EINVAL; +} + +static void damon_sysfs_kdamond_release(struct kobject *kobj) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kfree(container_of(kobj, struct damon_sysfs_kdamond, kobj)); +} + +static struct kobj_attribute damon_sysfs_kdamond_state_attr = + __ATTR_RW_MODE(state, 0600); + +static struct kobj_attribute damon_sysfs_kdamond_pid_attr = + __ATTR_RO_MODE(pid, 0400); + +static struct attribute *damon_sysfs_kdamond_attrs[] = { + &damon_sysfs_kdamond_state_attr.attr, + &damon_sysfs_kdamond_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamond); + +static struct kobj_type damon_sysfs_kdamond_ktype = { + .release = 
damon_sysfs_kdamond_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamond_groups, +}; + +/* + * kdamonds directory + */ + +struct damon_sysfs_kdamonds { + struct kobject kobj; + struct damon_sysfs_kdamond **kdamonds_arr; + int nr; +}; + +static struct damon_sysfs_kdamonds *damon_sysfs_kdamonds_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamonds), GFP_KERNEL); +} + +static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr = kdamonds->kdamonds_arr; + int i; + + for (i = 0; i < kdamonds->nr; i++) { + damon_sysfs_kdamond_rm_dirs(kdamonds_arr[i]); + kobject_put(&kdamonds_arr[i]->kobj); + } + kdamonds->nr = 0; + kfree(kdamonds_arr); + kdamonds->kdamonds_arr = NULL; +} + +static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, + int nr_kdamonds) +{ + int nr_running_ctxs = 0; + int i; + + for (i = 0; i < nr_kdamonds; i++) { + struct damon_ctx *ctx = kdamonds[i]->damon_ctx; + + if (!ctx) + continue; + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + nr_running_ctxs++; + mutex_unlock(&ctx->kdamond_lock); + } + return nr_running_ctxs; +} + +static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, + int nr_kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; + int err, i; + + if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + return -EBUSY; + + damon_sysfs_kdamonds_rm_dirs(kdamonds); + if (!nr_kdamonds) + return 0; + + kdamonds_arr = kmalloc_array(nr_kdamonds, sizeof(*kdamonds_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!kdamonds_arr) + return -ENOMEM; + kdamonds->kdamonds_arr = kdamonds_arr; + + for (i = 0; i < nr_kdamonds; i++) { + kdamond = damon_sysfs_kdamond_alloc(); + if (!kdamond) { + damon_sysfs_kdamonds_rm_dirs(kdamonds); + return -ENOMEM; + } + + err = kobject_init_and_add(&kdamond->kobj, + &damon_sysfs_kdamond_ktype, &kdamonds->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_kdamond_add_dirs(kdamond); + if (err) + goto out; + + kdamonds_arr[i] = kdamond; + kdamonds->nr++; + } + return 0; + +out: + damon_sysfs_kdamonds_rm_dirs(kdamonds); + kobject_put(&kdamond->kobj); + return err; +} + +static ssize_t nr_kdamonds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, + struct damon_sysfs_kdamonds, kobj); + + return sysfs_emit(buf, "%d\n", kdamonds->nr); +} + +static ssize_t nr_kdamonds_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, + struct damon_sysfs_kdamonds, kobj); + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_kdamonds_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_kdamonds, kobj)); +} + +static struct kobj_attribute damon_sysfs_kdamonds_nr_attr = + __ATTR_RW_MODE(nr_kdamonds, 0600); + +static struct attribute *damon_sysfs_kdamonds_attrs[] = { + &damon_sysfs_kdamonds_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamonds); + +static struct kobj_type damon_sysfs_kdamonds_ktype = { + .release = damon_sysfs_kdamonds_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = 
damon_sysfs_kdamonds_groups, +}; + +/* + * damon user interface directory + */ + +struct damon_sysfs_ui_dir { + struct kobject kobj; + struct damon_sysfs_kdamonds *kdamonds; +}; + +static struct damon_sysfs_ui_dir *damon_sysfs_ui_dir_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_ui_dir), GFP_KERNEL); +} + +static int damon_sysfs_ui_dir_add_dirs(struct damon_sysfs_ui_dir *ui_dir) +{ + struct damon_sysfs_kdamonds *kdamonds; + int err; + + kdamonds = damon_sysfs_kdamonds_alloc(); + if (!kdamonds) + return -ENOMEM; + + err = kobject_init_and_add(&kdamonds->kobj, + &damon_sysfs_kdamonds_ktype, &ui_dir->kobj, + "kdamonds"); + if (err) { + kobject_put(&kdamonds->kobj); + return err; + } + ui_dir->kdamonds = kdamonds; + return err; +} + +static void damon_sysfs_ui_dir_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ui_dir, kobj)); +} + +static struct attribute *damon_sysfs_ui_dir_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ui_dir); + +static struct kobj_type damon_sysfs_ui_dir_ktype = { + .release = damon_sysfs_ui_dir_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ui_dir_groups, +}; + +static int __init damon_sysfs_init(void) +{ + struct kobject *damon_sysfs_root; + struct damon_sysfs_ui_dir *admin; + int err; + + damon_sysfs_root = kobject_create_and_add("damon", mm_kobj); + if (!damon_sysfs_root) + return -ENOMEM; + + admin = damon_sysfs_ui_dir_alloc(); + if (!admin) { + kobject_put(damon_sysfs_root); + return -ENOMEM; + } + err = kobject_init_and_add(&admin->kobj, &damon_sysfs_ui_dir_ktype, + damon_sysfs_root, "admin"); + if (err) + goto out; + err = damon_sysfs_ui_dir_add_dirs(admin); + if (err) + goto out; + return 0; + +out: + kobject_put(&admin->kobj); + kobject_put(damon_sysfs_root); + return err; +} +subsys_initcall(damon_sysfs_init); From 1bb11520ab51d31731eba83f48526d0a1fd9577a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:30 -0700 Subject: [PATCH 447/737] mm/damon/sysfs: link DAMON for virtual address spaces monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit links the DAMON sysfs interface to DAMON so that users can control DAMON via the interface. In detail, this commit makes writing 'on' to 'state' file constructs DAMON contexts based on values that users have written to relevant sysfs files and start the context. It supports only virtual address spaces monitoring at the moment, though. The files hierarchy of DAMON sysfs interface after this commit is shown below. In the below figure, parents-children relations are represented with indentations, each directory is having ``/`` suffix, and files in each directory are separated by comma (","). /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... The usage is straightforward. Writing a number ('N') to each 'nr_*' file makes directories named '0' to 'N-1'. Users can construct DAMON contexts by writing proper values to the files in the straightforward manner and start each kdamond by writing 'on' to 'kdamonds//state'. 
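Condensed, the 'on' handling added by this patch translates the sysfs values into a struct damon_ctx and hands it to damon_start() in non-exclusive mode. Roughly, with locking and error unwinding stripped from the real damon_sysfs_turn_damon_on() in the hunk below, the flow is:

    /*
     * Rough sketch of the state_store("on") path; the actual functions in the
     * hunk below also take damon_sysfs_lock and undo partial work on failure.
     */
    static int turn_damon_on_sketch(struct damon_sysfs_kdamond *kdamond)
    {
    	struct damon_sysfs_context *sys_ctx = kdamond->contexts->contexts_arr[0];
    	struct damon_ctx *ctx = damon_new_ctx();

    	if (!ctx)
    		return -ENOMEM;
    	damon_select_ops(ctx, sys_ctx->ops_id);		/* 'operations' file */
    	damon_sysfs_set_attrs(ctx, sys_ctx->attrs);	/* monitoring_attrs/ */
    	damon_sysfs_set_targets(ctx, sys_ctx->targets);	/* targets/.../pid_target */
    	ctx->callback.before_terminate = damon_sysfs_before_terminate;

    	kdamond->damon_ctx = ctx;
    	/* non-exclusive start, so kdamonds can be managed one by one */
    	return damon_start(&ctx, 1, false);
    }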
Link: https://lkml.kernel.org/r/20220228081314.5770-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 3 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 4455e4bef88db..39b2b8d828190 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -808,22 +808,208 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) kobject_put(&kdamond->contexts->kobj); } +static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return -EINVAL; + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx = kdamond->damon_ctx; + bool running; + + if (!ctx) + running = false; + else + running = damon_sysfs_ctx_running(ctx); + + return sysfs_emit(buf, "%s\n", running ? "on" : "off"); +} + +static int damon_sysfs_set_attrs(struct damon_ctx *ctx, + struct damon_sysfs_attrs *sys_attrs) +{ + struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; + struct damon_sysfs_ul_range *sys_nr_regions = + sys_attrs->nr_regions_range; + + return damon_set_attrs(ctx, sys_intervals->sample_us, + sys_intervals->aggr_us, sys_intervals->update_us, + sys_nr_regions->min, sys_nr_regions->max); +} + +static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (ctx->ops.id == DAMON_OPS_VADDR) + put_pid(t->pid); + damon_destroy_target(t); + } +} + +static int damon_sysfs_set_targets(struct damon_ctx *ctx, + struct damon_sysfs_targets *sysfs_targets) +{ + int i; + + for (i = 0; i < sysfs_targets->nr; i++) { + struct damon_sysfs_target *sys_target = + sysfs_targets->targets_arr[i]; + struct damon_target *t = damon_new_target(); + + if (!t) { + damon_sysfs_destroy_targets(ctx); + return -ENOMEM; + } + if (ctx->ops.id == DAMON_OPS_VADDR) { + t->pid = find_get_pid(sys_target->pid); + if (!t->pid) { + damon_sysfs_destroy_targets(ctx); + return -EINVAL; + } + } + damon_add_target(ctx, t); + } + return 0; +} + +static void damon_sysfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (ctx->ops.id != DAMON_OPS_VADDR) + return; + + mutex_lock(&ctx->kdamond_lock); + damon_for_each_target_safe(t, next, ctx) { + put_pid(t->pid); + damon_destroy_target(t); + } + mutex_unlock(&ctx->kdamond_lock); +} + +static struct damon_ctx *damon_sysfs_build_ctx( + struct damon_sysfs_context *sys_ctx) +{ + struct damon_ctx *ctx = damon_new_ctx(); + int err; + + if (!ctx) + return ERR_PTR(-ENOMEM); + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + goto out; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + goto out; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + goto out; + + ctx->callback.before_terminate = damon_sysfs_before_terminate; + return ctx; + +out: + damon_destroy_ctx(ctx); + return ERR_PTR(err); +} + +static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx; + int err; + + if (kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx)) + 
return -EBUSY; + /* TODO: support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kdamond->damon_ctx = NULL; + + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + err = damon_start(&ctx, 1, false); + if (err) { + damon_destroy_ctx(ctx); + return err; + } + kdamond->damon_ctx = ctx; + return err; +} + +static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) +{ + if (!kdamond->damon_ctx) + return -EINVAL; + return damon_stop(&kdamond->damon_ctx, 1); + /* + * To allow users show final monitoring results of already turned-off + * DAMON, we free kdamond->damon_ctx in next + * damon_sysfs_turn_damon_on(), or kdamonds_nr_store() + */ } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return -EINVAL; + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + ssize_t ret; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + if (sysfs_streq(buf, "on")) + ret = damon_sysfs_turn_damon_on(kdamond); + else if (sysfs_streq(buf, "off")) + ret = damon_sysfs_turn_damon_off(kdamond); + else + ret = -EINVAL; + mutex_unlock(&damon_sysfs_lock); + if (!ret) + ret = count; + return ret; } static ssize_t pid_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return -EINVAL; + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx; + int pid; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + ctx = kdamond->damon_ctx; + if (!ctx) { + pid = -1; + goto out; + } + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) + pid = -1; + else + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); +out: + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%d\n", pid); } static void damon_sysfs_kdamond_release(struct kobject *kobj) From eee88ff3873101084ddf6f89bd64e9cc1f0771c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:34 -0700 Subject: [PATCH 448/737] mm/damon/sysfs: support the physical address space monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes DAMON sysfs interface supports the physical address space monitoring. Specifically, this commit adds support of the initial monitoring regions set feature by adding 'regions' directory under each target directory and makes context operations file to receive 'paddr' in addition to 'vaddr'. As a result, the files hierarchy becomes as below: /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ │ regions/nr_regions <- NEW DIRECTORY │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... 
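The per-target 'regions' directories boil down to a check-and-insert loop when the context is built: each start/end pair becomes one damon_region, the range must be well formed, and each region must not overlap the previously inserted one. A condensed sketch of the damon_sysfs_set_regions() logic added below (error unwinding of already-built targets elided):

    /* Condensed from damon_sysfs_set_regions() in this patch. */
    static int set_regions_sketch(struct damon_target *t,
    		struct damon_sysfs_regions *sysfs_regions)
    {
    	int i;

    	for (i = 0; i < sysfs_regions->nr; i++) {
    		struct damon_sysfs_region *sys_region =
    			sysfs_regions->regions_arr[i];
    		struct damon_region *r, *prev;

    		if (sys_region->start > sys_region->end)
    			return -EINVAL;	/* malformed range */
    		r = damon_new_region(sys_region->start, sys_region->end);
    		if (!r)
    			return -ENOMEM;
    		damon_add_region(r, t);
    		if (damon_nr_regions(t) > 1) {
    			prev = damon_prev_region(r);
    			if (prev->ar.end > r->ar.start) {
    				damon_destroy_region(r, t);
    				return -EINVAL;	/* overlaps the previous one */
    			}
    		}
    	}
    	return 0;
    }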
Link: https://lkml.kernel.org/r/20220228081314.5770-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 276 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 271 insertions(+), 5 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 39b2b8d828190..7d5f2c992345c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -113,12 +113,220 @@ static struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; +/* + * init region directory + */ + +struct damon_sysfs_region { + struct kobject kobj; + unsigned long start; + unsigned long end; +}; + +static struct damon_sysfs_region *damon_sysfs_region_alloc( + unsigned long start, + unsigned long end) +{ + struct damon_sysfs_region *region = kmalloc(sizeof(*region), + GFP_KERNEL); + + if (!region) + return NULL; + region->kobj = (struct kobject){}; + region->start = start; + region->end = end; + return region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->start); +} + +static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, ®ion->start); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->end); +} + +static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, ®ion->end); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_region_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_region, kobj)); +} + +static struct kobj_attribute damon_sysfs_region_start_attr = + __ATTR_RW_MODE(start, 0600); + +static struct kobj_attribute damon_sysfs_region_end_attr = + __ATTR_RW_MODE(end, 0600); + +static struct attribute *damon_sysfs_region_attrs[] = { + &damon_sysfs_region_start_attr.attr, + &damon_sysfs_region_end_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_region); + +static struct kobj_type damon_sysfs_region_ktype = { + .release = damon_sysfs_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_region_groups, +}; + +/* + * init_regions directory + */ + +struct damon_sysfs_regions { + struct kobject kobj; + struct damon_sysfs_region **regions_arr; + int nr; +}; + +static struct damon_sysfs_regions *damon_sysfs_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_regions), GFP_KERNEL); +} + +static void damon_sysfs_regions_rm_dirs(struct damon_sysfs_regions *regions) +{ + struct damon_sysfs_region **regions_arr = regions->regions_arr; + int i; + + for (i = 0; i < regions->nr; i++) + kobject_put(®ions_arr[i]->kobj); + regions->nr = 0; + kfree(regions_arr); + regions->regions_arr = NULL; +} + +static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, + int 
nr_regions) +{ + struct damon_sysfs_region **regions_arr, *region; + int err, i; + + damon_sysfs_regions_rm_dirs(regions); + if (!nr_regions) + return 0; + + regions_arr = kmalloc_array(nr_regions, sizeof(*regions_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!regions_arr) + return -ENOMEM; + regions->regions_arr = regions_arr; + + for (i = 0; i < nr_regions; i++) { + region = damon_sysfs_region_alloc(0, 0); + if (!region) { + damon_sysfs_regions_rm_dirs(regions); + return -ENOMEM; + } + + err = kobject_init_and_add(®ion->kobj, + &damon_sysfs_region_ktype, ®ions->kobj, + "%d", i); + if (err) { + kobject_put(®ion->kobj); + damon_sysfs_regions_rm_dirs(regions); + return err; + } + + regions_arr[i] = region; + regions->nr++; + } + return 0; +} + +static ssize_t nr_regions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_regions *regions = container_of(kobj, + struct damon_sysfs_regions, kobj); + + return sysfs_emit(buf, "%d\n", regions->nr); +} + +static ssize_t nr_regions_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_regions *regions = container_of(kobj, + struct damon_sysfs_regions, kobj); + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_regions_add_dirs(regions, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_regions, kobj)); +} + +static struct kobj_attribute damon_sysfs_regions_nr_attr = + __ATTR_RW_MODE(nr_regions, 0600); + +static struct attribute *damon_sysfs_regions_attrs[] = { + &damon_sysfs_regions_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_regions); + +static struct kobj_type damon_sysfs_regions_ktype = { + .release = damon_sysfs_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_regions_groups, +}; + /* * target directory */ struct damon_sysfs_target { struct kobject kobj; + struct damon_sysfs_regions *regions; int pid; }; @@ -127,6 +335,29 @@ static struct damon_sysfs_target *damon_sysfs_target_alloc(void) return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL); } +static int damon_sysfs_target_add_dirs(struct damon_sysfs_target *target) +{ + struct damon_sysfs_regions *regions = damon_sysfs_regions_alloc(); + int err; + + if (!regions) + return -ENOMEM; + + err = kobject_init_and_add(®ions->kobj, &damon_sysfs_regions_ktype, + &target->kobj, "regions"); + if (err) + kobject_put(®ions->kobj); + else + target->regions = regions; + return err; +} + +static void damon_sysfs_target_rm_dirs(struct damon_sysfs_target *target) +{ + damon_sysfs_regions_rm_dirs(target->regions); + kobject_put(&target->regions->kobj); +} + static ssize_t pid_target_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -188,8 +419,10 @@ static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets) struct damon_sysfs_target **targets_arr = targets->targets_arr; int i; - for (i = 0; i < targets->nr; i++) + for (i = 0; i < targets->nr; i++) { + damon_sysfs_target_rm_dirs(targets_arr[i]); kobject_put(&targets_arr[i]->kobj); + } targets->nr = 0; kfree(targets_arr); targets->targets_arr = NULL; @@ -224,6 +457,10 @@ static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets, if (err) goto out; + err = 
damon_sysfs_target_add_dirs(target); + if (err) + goto out; + targets_arr[i] = target; targets->nr++; } @@ -610,9 +847,6 @@ static ssize_t operations_store(struct kobject *kobj, for (id = 0; id < NR_DAMON_OPS; id++) { if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { - /* Support only vaddr */ - if (id != DAMON_OPS_VADDR) - return -EINVAL; context->ops_id = id; return count; } @@ -857,10 +1091,37 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) } } +static int damon_sysfs_set_regions(struct damon_target *t, + struct damon_sysfs_regions *sysfs_regions) +{ + int i; + + for (i = 0; i < sysfs_regions->nr; i++) { + struct damon_sysfs_region *sys_region = + sysfs_regions->regions_arr[i]; + struct damon_region *prev, *r; + + if (sys_region->start > sys_region->end) + return -EINVAL; + r = damon_new_region(sys_region->start, sys_region->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + } + return 0; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { - int i; + int i, err; for (i = 0; i < sysfs_targets->nr; i++) { struct damon_sysfs_target *sys_target = @@ -879,6 +1140,11 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, } } damon_add_target(ctx, t); + err = damon_sysfs_set_regions(t, sys_target->regions); + if (err) { + damon_sysfs_destroy_targets(ctx); + return err; + } } return 0; } From c8b3154285a0301f329b4d27e89f0f3b3771285a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:37 -0700 Subject: [PATCH 449/737] mm/damon/sysfs: support DAMON-based Operation Schemes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes DAMON sysfs interface supports the DAMON-based operation schemes (DAMOS) feature. Specifically, this commit adds 'schemes' directory under each context direcotry, and makes kdamond 'state' file writing respects the contents in the directory. Note that this commit doesn't support all features of DAMOS but only the target access pattern and action feature. Supports for quotas, prioritization, watermarks will follow. As a result, the files hierarchy becomes as below: /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ │ regions/nr_regions │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ schemes/nr_schemes <- NEW DIRECTORY │ │ │ │ │ │ 0/action │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... 
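For reference, the new directory is driven with plain shell writes. Below is
only an illustrative sketch against the hierarchy above; the kdamond, context
and scheme indexes plus the access pattern numbers are made-up examples, not
values taken from this patch:

    # assumes kdamonds/0/contexts/0 was already set up as in the earlier
    # patches of this series; every number below is arbitrary
    cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes
    echo 1 > nr_schemes                      # creates the '0' scheme directory
    echo pageout > 0/action                  # one of damon_sysfs_damos_action_strs[]
    echo 4096 > 0/access_pattern/sz/min      # region size bounds, in bytes
    echo 8192 > 0/access_pattern/sz/max
    echo 0 > 0/access_pattern/nr_accesses/min
    echo 5 > 0/access_pattern/nr_accesses/max
    echo 10 > 0/access_pattern/age/min
    echo 20 > 0/access_pattern/age/max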
Link: https://lkml.kernel.org/r/20220228081314.5770-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 410 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 410 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7d5f2c992345c..812c3a3b0624a 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -113,6 +113,347 @@ static struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + 
"hugepage", + "nohugepage", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + return 0; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL 
| __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +static struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + /* * init region directory */ @@ -748,6 +1089,7 @@ struct damon_sysfs_context { enum damon_ops_id ops_id; struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; + struct damon_sysfs_schemes *schemes; }; static struct damon_sysfs_context *damon_sysfs_context_alloc( @@ -802,6 +1144,23 @@ static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context) return 0; } +static int damon_sysfs_context_set_schemes(struct damon_sysfs_context *context) +{ + struct damon_sysfs_schemes *schemes = damon_sysfs_schemes_alloc(); + int err; + + if (!schemes) + return -ENOMEM; + err = kobject_init_and_add(&schemes->kobj, &damon_sysfs_schemes_ktype, + &context->kobj, "schemes"); + if (err) { + kobject_put(&schemes->kobj); + return err; + } + context->schemes = schemes; + return 0; +} + static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) { int err; @@ -813,8 +1172,15 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) err = damon_sysfs_context_set_targets(context); if (err) goto put_attrs_out; + + err = damon_sysfs_context_set_schemes(context); + if (err) + goto put_targets_attrs_out; return 0; +put_targets_attrs_out: + kobject_put(&context->targets->kobj); + context->targets = NULL; put_attrs_out: kobject_put(&context->attrs->kobj); context->attrs = NULL; @@ -827,6 +1193,8 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) kobject_put(&context->attrs->kobj); damon_sysfs_targets_rm_dirs(context->targets); kobject_put(&context->targets->kobj); + damon_sysfs_schemes_rm_dirs(context->schemes); + 
kobject_put(&context->schemes->kobj); } static ssize_t operations_show(struct kobject *kobj, @@ -1149,6 +1517,45 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *pattern = + sysfs_scheme->access_pattern; + struct damos_quota quota = (struct damos_quota){}; + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_NONE, + .interval = 0, + .high = 0, + .mid = 0, + .low = 0, + }; + + return damon_new_scheme(pattern->sz->min, pattern->sz->max, + pattern->nr_accesses->min, pattern->nr_accesses->max, + pattern->age->min, pattern->age->max, + sysfs_scheme->action, "a, &wmarks); +} + +static int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; @@ -1180,6 +1587,9 @@ static struct damon_ctx *damon_sysfs_build_ctx( if (err) goto out; err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + goto out; + err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); if (err) goto out; From cbc941365d7b5736fac21690b3ae2b66889a33c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:40 -0700 Subject: [PATCH 450/737] mm/damon/sysfs: support DAMOS quotas MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes DAMON sysfs interface supports the DAMOS quotas feature. Specifically, this commit adds 'quotas' directory under each scheme directory and makes kdamond 'state' file writing respects the contents in the directory. As a result, the files hierarchy becomes as below: /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ │ regions/nr_regions │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ schemes/nr_schemes │ │ │ │ │ │ 0/action │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms <- NEW DIRECTORY │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... 
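The new files are plain integer knobs. A minimal usage sketch, assuming the
scheme directory created in the previous patch already exists and using
arbitrary example values:

    cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/quotas
    echo 100 > ms                    # time quota for applying the action, in milliseconds
    echo $((1 << 30)) > bytes        # size quota for the action target, in bytes
    echo 1000 > reset_interval_ms    # interval after which the charged quota is reset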
Link: https://lkml.kernel.org/r/20220228081314.5770-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 812c3a3b0624a..21c2663142b6c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -113,6 +113,113 @@ static struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + /* * access_pattern directory */ @@ -220,6 +327,7 @@ struct damon_sysfs_scheme { struct kobject kobj; 
enum damos_action action; struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; }; /* This should match with enum damos_action */ @@ -270,6 +378,25 @@ static int damon_sysfs_scheme_set_access_pattern( return err; } +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put("as->kobj); + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -277,13 +404,22 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_access_pattern(scheme); if (err) return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; return 0; + +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; } static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) { damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); + kobject_put(&scheme->quotas->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1522,7 +1658,15 @@ static struct damos *damon_sysfs_mk_scheme( { struct damon_sysfs_access_pattern *pattern = sysfs_scheme->access_pattern; - struct damos_quota quota = (struct damos_quota){}; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = 1000, + .weight_nr_accesses = 1000, + .weight_age = 1000, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_NONE, .interval = 0, From f447a287227450168ba64f74da497fa9dd3628ea Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:43 -0700 Subject: [PATCH 451/737] mm/damon/sysfs: support schemes prioritization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes DAMON sysfs interface supports the DAMOS' regions prioritization weights feature under quotas limitation. Specifically, this commit adds 'weights' directory under each scheme directory and makes kdamond 'state' file writing respects the contents in the directory. /sys/kernel/mm/damon/admin │ kdamonds/nr │ │ 0/state,pid │ │ │ contexts/nr │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr │ │ │ │ │ │ 0/pid │ │ │ │ │ │ │ regions/nr │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ schemes/nr │ │ │ │ │ │ 0/action │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms │ │ │ │ │ │ │ │ weights/ <- NEW DIRECTORY │ │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... 
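The weights are per-thousand values that DAMOS uses to prioritize regions
while staying under the quota. A usage sketch with arbitrary numbers:

    cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/quotas/weights
    echo 600 > sz_permil             # weight of the region size
    echo 300 > nr_accesses_permil    # weight of the access frequency
    echo 100 > age_permil            # weight of the region age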
Link: https://lkml.kernel.org/r/20220228081314.5770-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 3 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 21c2663142b6c..03ff364c15544 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -113,12 +113,130 @@ static struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; 
+ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + /* * quotas directory */ struct damon_sysfs_quotas { struct kobject kobj; + struct damon_sysfs_weights *weights; unsigned long ms; unsigned long sz; unsigned long reset_interval_ms; @@ -129,6 +247,29 @@ static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); } +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -387,6 +528,9 @@ static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) return -ENOMEM; err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); if (err) goto out; scheme->quotas = quotas; @@ -419,6 +563,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) { damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); } @@ -1659,13 +1804,14 @@ static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_access_pattern *pattern = sysfs_scheme->access_pattern; struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damos_quota quota = { .ms = sysfs_quotas->ms, .sz = sysfs_quotas->sz, .reset_interval = sysfs_quotas->reset_interval_ms, - .weight_sz = 1000, - .weight_nr_accesses = 1000, - .weight_age = 1000, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_NONE, From 1725fe039f9234d5fff1f2ed996b2b6f8bcbc284 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:46 -0700 Subject: [PATCH 452/737] mm/damon/sysfs: support DAMOS watermarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes DAMON sysfs interface supports the DAMOS watermarks feature. Specifically, this commit adds 'watermarks' directory under each scheme directory and makes kdamond 'state' file writing respects the contents in the directory. As a result, the files hierarchy becomes as below: /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ │ regions/nr_regions │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... 
│ │ │ │ │ schemes/nr_schemes │ │ │ │ │ │ 0/action │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ │ quotas/ms,sz,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/ <- NEW DIRECTORY │ │ │ │ │ │ │ │ metric,interval_us,high,mid,lo │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... [sj@kernel.org: fix out-of-bound array access for wmark_metric_strs[]] Link: https://lkml.kernel.org/r/20220301185619.2904-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220228081314.5770-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 220 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 215 insertions(+), 5 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 03ff364c15544..8a8f8c15470fa 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -113,6 +113,189 @@ static struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char 
*buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + /* * scheme/weights directory */ @@ -469,6 +652,7 @@ struct damon_sysfs_scheme { enum damos_action action; struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; }; /* This should match with enum damos_action */ @@ -541,6 +725,24 @@ static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype, &scheme->kobj, + 
"watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -551,8 +753,14 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_quotas(scheme); if (err) goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; return 0; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; put_access_pattern_out: kobject_put(&scheme->access_pattern->kobj); scheme->access_pattern = NULL; @@ -565,6 +773,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->access_pattern->kobj); damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1805,6 +2014,7 @@ static struct damos *damon_sysfs_mk_scheme( sysfs_scheme->access_pattern; struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; struct damos_quota quota = { .ms = sysfs_quotas->ms, .sz = sysfs_quotas->sz, @@ -1814,11 +2024,11 @@ static struct damos *damon_sysfs_mk_scheme( .weight_age = sysfs_weights->age, }; struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_NONE, - .interval = 0, - .high = 0, - .mid = 0, - .low = 0, + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, }; return damon_new_scheme(pattern->sz->min, pattern->sz->max, From 430cb81f55f320575b672e8b2e574cbe599fc037 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:49 -0700 Subject: [PATCH 453/737] mm/damon/sysfs: support DAMOS stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes DAMON sysfs interface supports the DAMOS stats feature. Specifically, this commit adds 'stats' directory under each scheme directory, and update the contents of the files under the directory according to the latest monitoring results, when the user writes special keyword, 'update_schemes_stats' to the 'state' file of the kdamond. As a result, the files hierarchy becomes as below: /sys/kernel/mm/damon/admin │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts │ │ │ │ 0/operations │ │ │ │ │ monitoring_attrs/intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max │ │ │ │ │ targets/nr_targets │ │ │ │ │ │ 0/pid_target │ │ │ │ │ │ │ regions/nr_regions │ │ │ │ │ │ │ │ 0/start,end │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ schemes/nr_schemes │ │ │ │ │ │ 0/action │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ │ quotas/ms,sz,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low │ │ │ │ │ │ │ stats/ <- NEW DIRECTORY │ │ │ │ │ │ │ │ nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... 
Link: https://lkml.kernel.org/r/20220228081314.5770-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 8a8f8c15470fa..58bcd2f5b02ae 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -113,6 +113,105 @@ static struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + /* * watermarks directory */ @@ -653,6 +752,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; struct 
damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; }; /* This should match with enum damos_action */ @@ -743,6 +843,22 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -756,8 +872,14 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; return 0; +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; put_quotas_access_pattern_out: kobject_put(&scheme->quotas->kobj); scheme->quotas = NULL; @@ -774,6 +896,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -2141,6 +2264,31 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } +static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damos *scheme; + int schemes_idx = 0; + + if (!ctx) + return -EINVAL; + mutex_lock(&ctx->kdamond_lock); + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_schemes *sysfs_schemes; + struct damon_sysfs_stats *sysfs_stats; + + sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } + mutex_unlock(&ctx->kdamond_lock); + return 0; +} + static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -2154,6 +2302,8 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, ret = damon_sysfs_turn_damon_on(kdamond); else if (sysfs_streq(buf, "off")) ret = damon_sysfs_turn_damon_off(kdamond); + else if (sysfs_streq(buf, "update_schemes_stats")) + ret = damon_sysfs_update_schemes_stats(kdamond); else ret = -EINVAL; mutex_unlock(&damon_sysfs_lock); From 8e1761ee197835f7ee19ba317f702e0990999936 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:52 -0700 Subject: [PATCH 454/737] selftests/damon: add a test for DAMON sysfs interface This commit adds a selftest for DAMON sysfs interface. It tests the functionality of 'nr' files and existence of files in each directory of the hierarchy. 
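The test needs root and a kernel built with the DAMON sysfs interface
(CONFIG_DAMON_SYSFS). It can be run directly or through the usual kselftest
harness; both invocations below are illustrative only:

    cd tools/testing/selftests/damon
    sudo ./sysfs.sh
    # or, from the top of the kernel source tree:
    sudo make -C tools/testing/selftests TARGETS=damon run_tests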
Link: https://lkml.kernel.org/r/20220228081314.5770-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/Makefile | 1 + tools/testing/selftests/damon/sysfs.sh | 306 +++++++++++++++++++++++++ 2 files changed, 307 insertions(+) create mode 100644 tools/testing/selftests/damon/sysfs.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 937d36ae9a69c..0470c5f3e6906 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,5 +6,6 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh new file mode 100644 index 0000000000000..2e3ae77cb6db7 --- /dev/null +++ b/tools/testing/selftests/damon/sysfs.sh @@ -0,0 +1,306 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest frmework requirement - SKIP code is 4. +ksft_skip=4 + +ensure_write_succ() +{ + file=$1 + content=$2 + reason=$3 + + if ! echo "$content" > "$file" + then + echo "writing $content to $file failed" + echo "expected success because $reason" + exit 1 + fi +} + +ensure_write_fail() +{ + file=$1 + content=$2 + reason=$3 + + if echo "$content" > "$file" + then + echo "writing $content to $file succeed ($fail_reason)" + echo "expected failure because $reason" + exit 1 + fi +} + +ensure_dir() +{ + dir=$1 + to_ensure=$2 + if [ "$to_ensure" = "exist" ] && [ ! -d "$dir" ] + then + echo "$dir dir is expected but not found" + exit 1 + elif [ "$to_ensure" = "not_exist" ] && [ -d "$dir" ] + then + echo "$dir dir is not expected but found" + exit 1 + fi +} + +ensure_file() +{ + file=$1 + to_ensure=$2 + permission=$3 + if [ "$to_ensure" = "exist" ] + then + if [ ! -f "$file" ] + then + echo "$file is expected but not found" + exit 1 + fi + perm=$(stat -c "%a" "$file") + if [ ! 
"$perm" = "$permission" ] + then + echo "$file permission: expected $permission but $perm" + exit 1 + fi + elif [ "$to_ensure" = "not_exist" ] && [ -f "$dir" ] + then + echo "$file is not expected but found" + exit 1 + fi +} + +test_range() +{ + range_dir=$1 + ensure_dir "$range_dir" "exist" + ensure_file "$range_dir/min" "exist" 600 + ensure_file "$range_dir/max" "exist" 600 +} + +test_stats() +{ + stats_dir=$1 + ensure_dir "$stats_dir" "exist" + for f in nr_tried sz_tried nr_applied sz_applied qt_exceeds + do + ensure_file "$stats_dir/$f" "exist" "400" + done +} + +test_watermarks() +{ + watermarks_dir=$1 + ensure_dir "$watermarks_dir" "exist" + ensure_file "$watermarks_dir/metric" "exist" "600" + ensure_file "$watermarks_dir/interval_us" "exist" "600" + ensure_file "$watermarks_dir/high" "exist" "600" + ensure_file "$watermarks_dir/mid" "exist" "600" + ensure_file "$watermarks_dir/low" "exist" "600" +} + +test_weights() +{ + weights_dir=$1 + ensure_dir "$weights_dir" "exist" + ensure_file "$weights_dir/sz_permil" "exist" "600" + ensure_file "$weights_dir/nr_accesses_permil" "exist" "600" + ensure_file "$weights_dir/age_permil" "exist" "600" +} + +test_quotas() +{ + quotas_dir=$1 + ensure_dir "$quotas_dir" "exist" + ensure_file "$quotas_dir/ms" "exist" 600 + ensure_file "$quotas_dir/bytes" "exist" 600 + ensure_file "$quotas_dir/reset_interval_ms" "exist" 600 + test_weights "$quotas_dir/weights" +} + +test_access_pattern() +{ + access_pattern_dir=$1 + ensure_dir "$access_pattern_dir" "exist" + test_range "$access_pattern_dir/age" + test_range "$access_pattern_dir/nr_accesses" + test_range "$access_pattern_dir/sz" +} + +test_scheme() +{ + scheme_dir=$1 + ensure_dir "$scheme_dir" "exist" + ensure_file "$scheme_dir/action" "exist" "600" + test_access_pattern "$scheme_dir/access_pattern" + test_quotas "$scheme_dir/quotas" + test_watermarks "$scheme_dir/watermarks" + test_stats "$scheme_dir/stats" +} + +test_schemes() +{ + schemes_dir=$1 + ensure_dir "$schemes_dir" "exist" + ensure_file "$schemes_dir/nr_schemes" "exist" 600 + + ensure_write_succ "$schemes_dir/nr_schemes" "1" "valid input" + test_scheme "$schemes_dir/0" + + ensure_write_succ "$schemes_dir/nr_schemes" "2" "valid input" + test_scheme "$schemes_dir/0" + test_scheme "$schemes_dir/1" + + ensure_write_succ "$schemes_dir/nr_schemes" "0" "valid input" + ensure_dir "$schemes_dir/0" "not_exist" + ensure_dir "$schemes_dir/1" "not_exist" +} + +test_region() +{ + region_dir=$1 + ensure_dir "$region_dir" "exist" + ensure_file "$region_dir/start" "exist" 600 + ensure_file "$region_dir/end" "exist" 600 +} + +test_regions() +{ + regions_dir=$1 + ensure_dir "$regions_dir" "exist" + ensure_file "$regions_dir/nr_regions" "exist" 600 + + ensure_write_succ "$regions_dir/nr_regions" "1" "valid input" + test_region "$regions_dir/0" + + ensure_write_succ "$regions_dir/nr_regions" "2" "valid input" + test_region "$regions_dir/0" + test_region "$regions_dir/1" + + ensure_write_succ "$regions_dir/nr_regions" "0" "valid input" + ensure_dir "$regions_dir/0" "not_exist" + ensure_dir "$regions_dir/1" "not_exist" +} + +test_target() +{ + target_dir=$1 + ensure_dir "$target_dir" "exist" + ensure_file "$target_dir/pid_target" "exist" "600" + test_regions "$target_dir/regions" +} + +test_targets() +{ + targets_dir=$1 + ensure_dir "$targets_dir" "exist" + ensure_file "$targets_dir/nr_targets" "exist" 600 + + ensure_write_succ "$targets_dir/nr_targets" "1" "valid input" + test_target "$targets_dir/0" + + ensure_write_succ "$targets_dir/nr_targets" "2" "valid input" 
+ test_target "$targets_dir/0" + test_target "$targets_dir/1" + + ensure_write_succ "$targets_dir/nr_targets" "0" "valid input" + ensure_dir "$targets_dir/0" "not_exist" + ensure_dir "$targets_dir/1" "not_exist" +} + +test_intervals() +{ + intervals_dir=$1 + ensure_dir "$intervals_dir" "exist" + ensure_file "$intervals_dir/aggr_us" "exist" "600" + ensure_file "$intervals_dir/sample_us" "exist" "600" + ensure_file "$intervals_dir/update_us" "exist" "600" +} + +test_monitoring_attrs() +{ + monitoring_attrs_dir=$1 + ensure_dir "$monitoring_attrs_dir" "exist" + test_intervals "$monitoring_attrs_dir/intervals" + test_range "$monitoring_attrs_dir/nr_regions" +} + +test_context() +{ + context_dir=$1 + ensure_dir "$context_dir" "exist" + ensure_file "$context_dir/operations" "exist" 600 + test_monitoring_attrs "$context_dir/monitoring_attrs" + test_targets "$context_dir/targets" + test_schemes "$context_dir/schemes" +} + +test_contexts() +{ + contexts_dir=$1 + ensure_dir "$contexts_dir" "exist" + ensure_file "$contexts_dir/nr_contexts" "exist" 600 + + ensure_write_succ "$contexts_dir/nr_contexts" "1" "valid input" + test_context "$contexts_dir/0" + + ensure_write_fail "$contexts_dir/nr_contexts" "2" "only 0/1 are supported" + test_context "$contexts_dir/0" + + ensure_write_succ "$contexts_dir/nr_contexts" "0" "valid input" + ensure_dir "$contexts_dir/0" "not_exist" +} + +test_kdamond() +{ + kdamond_dir=$1 + ensure_dir "$kdamond_dir" "exist" + ensure_file "$kdamond_dir/state" "exist" "600" + ensure_file "$kdamond_dir/pid" "exist" 400 + test_contexts "$kdamond_dir/contexts" +} + +test_kdamonds() +{ + kdamonds_dir=$1 + ensure_dir "$kdamonds_dir" "exist" + + ensure_file "$kdamonds_dir/nr_kdamonds" "exist" "600" + + ensure_write_succ "$kdamonds_dir/nr_kdamonds" "1" "valid input" + test_kdamond "$kdamonds_dir/0" + + ensure_write_succ "$kdamonds_dir/nr_kdamonds" "2" "valid input" + test_kdamond "$kdamonds_dir/0" + test_kdamond "$kdamonds_dir/1" + + ensure_write_succ "$kdamonds_dir/nr_kdamonds" "0" "valid input" + ensure_dir "$kdamonds_dir/0" "not_exist" + ensure_dir "$kdamonds_dir/1" "not_exist" +} + +test_damon_sysfs() +{ + damon_sysfs=$1 + if [ ! -d "$damon_sysfs" ] + then + echo "$damon_sysfs not found" + exit $ksft_skip + fi + + test_kdamonds "$damon_sysfs/kdamonds" +} + +check_dependencies() +{ + if [ $EUID -ne 0 ] + then + echo "Run as root" + exit $ksft_skip + fi +} + +check_dependencies +test_damon_sysfs "/sys/kernel/mm/damon/admin" From 430f9c6e7527a6a14024d3a190d6121417362073 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:55 -0700 Subject: [PATCH 455/737] Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface This commit adds detailed usage of DAMON sysfs interface in the admin-guide document for DAMON. Link: https://lkml.kernel.org/r/20220228081314.5770-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 350 ++++++++++++++++++- 1 file changed, 344 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index b6ec650873b2f..592ea9a508812 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -4,7 +4,7 @@ Detailed Usages =============== -DAMON provides below three interfaces for different users. 
+DAMON provides below interfaces for different users. - *DAMON user space tool.* `This `_ is for privileged people such as @@ -14,17 +14,21 @@ DAMON provides below three interfaces for different users. virtual and physical address spaces monitoring. For more detail, please refer to its `usage document `_. -- *debugfs interface.* - :ref:`This ` is for privileged user space programmers who +- *sysfs interface.* + :ref:`This ` is for privileged user space programmers who want more optimized use of DAMON. Using this, users can use DAMON’s major - features by reading from and writing to special debugfs files. Therefore, - you can write and use your personalized DAMON debugfs wrapper programs that - reads/writes the debugfs files instead of you. The `DAMON user space tool + features by reading from and writing to special sysfs files. Therefore, + you can write and use your personalized DAMON sysfs wrapper programs that + reads/writes the sysfs files instead of you. The `DAMON user space tool `_ is one example of such programs. It supports both virtual and physical address spaces monitoring. Note that this interface provides only simple :ref:`statistics ` for the monitoring results. For detailed monitoring results, DAMON provides a :ref:`tracepoint `. +- *debugfs interface.* + :ref:`This ` is almost identical to :ref:`sysfs interface + `. This will be removed after next LTS kernel is released, + so users should move to the :ref:`sysfs interface `. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -32,6 +36,340 @@ DAMON provides below three interfaces for different users. DAMON for various address spaces. For detail, please refer to the interface :doc:`document `. +.. _sysfs_interface: + +sysfs Interface +=============== + +DAMON sysfs interface is built when ``CONFIG_DAMON_SYSFS`` is defined. It +creates multiple directories and files under its sysfs directory, +``/kernel/mm/damon/``. You can control DAMON by writing to and reading +from the files under the directory. + +For a short example, users can monitor the virtual address space of a given +workload as below. :: + + # cd /sys/kernel/mm/damon/admin/ + # echo 1 > kdamonds/nr && echo 1 > kdamonds/0/contexts/nr + # echo vaddr > kdamonds/0/contexts/0/operations + # echo 1 > kdamonds/0/contexts/0/targets/nr + # echo $(pidof ) > kdamonds/0/contexts/0/targets/0/pid + # echo on > kdamonds/0/state + +Files Hierarchy +--------------- + +The files hierarchy of DAMON sysfs interface is shown below. In the below +figure, parents-children relations are represented with indentations, each +directory is having ``/`` suffix, and files in each directory are separated by +comma (","). :: + + /sys/kernel/mm/damon/admin + │ kdamonds/nr_kdamonds + │ │ 0/state,pid + │ │ │ contexts/nr_contexts + │ │ │ │ 0/operations + │ │ │ │ │ monitoring_attrs/ + │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us + │ │ │ │ │ │ nr_regions/min,max + │ │ │ │ │ targets/nr_targets + │ │ │ │ │ │ 0/pid_target + │ │ │ │ │ │ │ regions/nr_regions + │ │ │ │ │ │ │ │ 0/start,end + │ │ │ │ │ │ │ │ ... + │ │ │ │ │ │ ... 
+ │ │ │ │ │ schemes/nr_schemes + │ │ │ │ │ │ 0/action + │ │ │ │ │ │ │ access_pattern/ + │ │ │ │ │ │ │ │ sz/min,max + │ │ │ │ │ │ │ │ nr_accesses/min,max + │ │ │ │ │ │ │ │ age/min,max + │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms + │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil + │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low + │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds + │ │ │ │ │ │ ... + │ │ │ │ ... + │ │ ... + +Root +---- + +The root of the DAMON sysfs interface is ``/kernel/mm/damon/``, and it +has one directory named ``admin``. The directory contains the files for +privileged user space programs' control of DAMON. User space tools or deamons +having the root permission could use this directory. + +kdamonds/ +--------- + +The monitoring-related information including request specifications and results +are called DAMON context. DAMON executes each context with a kernel thread +called kdamond, and multiple kdamonds could run in parallel. + +Under the ``admin`` directory, one directory, ``kdamonds``, which has files for +controlling the kdamonds exist. In the beginning, this directory has only one +file, ``nr_kdamonds``. Writing a number (``N``) to the file creates the number +of child directories named ``0`` to ``N-1``. Each directory represents each +kdamond. + +kdamonds// +------------- + +In each kdamond directory, two files (``state`` and ``pid``) and one directory +(``contexts``) exist. + +Reading ``state`` returns ``on`` if the kdamond is currently running, or +``off`` if it is not running. Writing ``on`` or ``off`` makes the kdamond be +in the state. Writing ``update_schemes_stats`` to ``state`` file updates the +contents of stats files for each DAMON-based operation scheme of the kdamond. +For details of the stats, please refer to :ref:`stats section +`. + +If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. + +``contexts`` directory contains files for controlling the monitoring contexts +that this kdamond will execute. + +kdamonds//contexts/ +---------------------- + +In the beginning, this directory has only one file, ``nr_contexts``. Writing a +number (``N``) to the file creates the number of child directories named as +``0`` to ``N-1``. Each directory represents each monitoring context. At the +moment, only one context per kdamond is supported, so only ``0`` or ``1`` can +be written to the file. + +contexts// +------------- + +In each context directory, one file (``operations``) and three directories +(``monitoring_attrs``, ``targets``, and ``schemes``) exist. + +DAMON supports multiple types of monitoring operations, including those for +virtual address space and the physical address space. You can set and get what +type of monitoring operations DAMON will use for the context by writing one of +below keywords to, and reading from the file. + + - vaddr: Monitor virtual address spaces of specific processes + - paddr: Monitor the physical address space of the system + +contexts//monitoring_attrs/ +------------------------------ + +Files for specifying attributes of the monitoring including required quality +and efficiency of the monitoring are in ``monitoring_attrs`` directory. +Specifically, two directories, ``intervals`` and ``nr_regions`` exist in this +directory. + +Under ``intervals`` directory, three files for DAMON's sampling interval +(``sample_us``), aggregation interval (``aggr_us``), and update interval +(``update_us``) exist. 
You can set and get the values in micro-seconds by +writing to and reading from the files. + +Under ``nr_regions`` directory, two files for the lower-bound and upper-bound +of DAMON's monitoring regions (``min`` and ``max``, respectively), which +controls the monitoring overhead, exist. You can set and get the values by +writing to and rading from the files. + +For more details about the intervals and monitoring regions range, please refer +to the Design document (:doc:`/vm/damon/design`). + +contexts//targets/ +--------------------- + +In the beginning, this directory has only one file, ``nr_targets``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each monitoring target. + +targets// +------------ + +In each target directory, one file (``pid_target``) and one directory +(``regions``) exist. + +If you wrote ``vaddr`` to the ``contexts//operations``, each target should +be a process. You can specify the process to DAMON by writing the pid of the +process to the ``pid_target`` file. + +targets//regions +------------------- + +When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to +the ``contexts//operations`` file), DAMON automatically sets and updates the +monitoring target regions so that entire memory mappings of target processes +can be covered. However, users could want to set the initial monitoring region +to specific address ranges. + +In contrast, DAMON do not automatically sets and updates the monitoring target +regions when ``paddr`` monitoring operations set is being used (``paddr`` is +written to the ``contexts//operations``). Therefore, users should set the +monitoring target regions by themselves in the case. + +For such cases, users can explicitly set the initial monitoring target regions +as they want, by writing proper values to the files under this directory. + +In the beginning, this directory has only one file, ``nr_regions``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each initial monitoring target region. + +regions// +------------ + +In each region directory, you will find two files (``start`` and ``end``). You +can set and get the start and end addresses of the initial monitoring target +region by writing to and reading from the files, respectively. + +contexts//schemes/ +--------------------- + +For usual DAMON-based data access aware memory management optimizations, users +would normally want the system to apply a memory management action to a memory +region of a specific access pattern. DAMON receives such formalized operation +schemes from the user and applies those to the target memory regions. Users +can get and set the schemes by reading from and writing to files under this +directory. + +In the beginning, this directory has only one file, ``nr_schemes``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each DAMON-based operation scheme. + +schemes// +------------ + +In each scheme directory, four directories (``access_pattern``, ``quotas``, +``watermarks``, and ``stats``) and one file (``action``) exist. + +The ``action`` file is for setting and getting what action you want to apply to +memory regions having specific access pattern of the interest. The keywords +that can be written to and read from the file and their meaning are as below. 
+ + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED`` + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD`` + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - ``stat``: Do nothing but count the statistics + +schemes//access_pattern/ +--------------------------- + +The target access pattern of each DAMON-based operation scheme is constructed +with three ranges including the size of the region in bytes, number of +monitored accesses per aggregate interval, and number of aggregated intervals +for the age of the region. + +Under the ``access_pattern`` directory, three directories (``sz``, +``nr_accesses``, and ``age``) each having two files (``min`` and ``max``) +exist. You can set and get the access pattern for the given scheme by writing +to and reading from the ``min`` and ``max`` files under ``sz``, +``nr_accesses``, and ``age`` directories, respectively. + +schemes//quotas/ +------------------- + +Optimal ``target access pattern`` for each ``action`` is workload dependent, so +not easy to find. Worse yet, setting a scheme of some action too aggressive +can cause severe overhead. To avoid such overhead, users can limit time and +size quota for each scheme. In detail, users can ask DAMON to try to use only +up to specific time (``time quota``) for applying the action, and to apply the +action to only up to specific amount (``size quota``) of memory regions having +the target access pattern within a given time interval (``reset interval``). + +When the quota limit is expected to be exceeded, DAMON prioritizes found memory +regions of the ``target access pattern`` based on their size, access frequency, +and age. For personalized prioritization, users can set the weights for the +three properties. + +Under ``quotas`` directory, three files (``ms``, ``bytes``, +``reset_interval_ms``) and one directory (``weights``) having three files +(``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) in it exist. + +You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and +``reset interval`` in milliseconds by writing the values to the three files, +respectively. You can also set the prioritization weights for size, access +frequency, and age in per-thousand unit by writing the values to the three +files under the ``weights`` directory. + +schemes//watermarks/ +----------------------- + +To allow easy activation and deactivation of each scheme based on system +status, DAMON provides a feature called watermarks. The feature receives five +values called ``metric``, ``interval``, ``high``, ``mid``, and ``low``. The +``metric`` is the system metric such as free memory ratio that can be measured. +If the metric value of the system is higher than the value in ``high`` or lower +than ``low`` at the memoent, the scheme is deactivated. If the value is lower +than ``mid``, the scheme is activated. + +Under the watermarks directory, five files (``metric``, ``interval_us``, +``high``, ``mid``, and ``low``) for setting each value exist. You can set and +get the five values by writing to the files, respectively. + +Keywords and meanings of those that can be written to the ``metric`` file are +as below. + + - none: Ignore the watermarks + - free_mem_rate: System's free memory rate (per thousand) + +The ``interval`` should written in microseconds unit. + +.. 
_sysfs_schemes_stats: + +schemes//stats/ +------------------ + +DAMON counts the total number and bytes of regions that each scheme is tried to +be applied, the two numbers for the regions that each scheme is successfully +applied, and the total number of the quota limit exceeds. This statistics can +be used for online analysis or tuning of the schemes. + +The statistics can be retrieved by reading the files under ``stats`` directory +(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and +``qt_exceeds``), respectively. The files are not updated in real time, so you +should ask DAMON sysfs interface to updte the content of the files for the +stats by writing a special keyword, ``update_schemes_stats`` to the relevant +``kdamonds//state`` file. + +Example +~~~~~~~ + +Below commands applies a scheme saying "If a memory region of size in [4KiB, +8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region. For the paging out, use only up to +10ms per second, and also don't page out more than 1GiB per second. Under the +limitation, page out memory regions having longer age first. Also, check the +free memory rate of the system every 5 seconds, start the monitoring and paging +out when the free memory rate becomes lower than 50%, but stop it if the free +memory rate becomes larger than 60%, or lower than 30%". :: + + # cd /kernel/mm/damon/admin + # # populate directories + # echo 1 > kdamonds/nr_kdamonds; echo 1 > kdamonds/0/contexts/nr_contexts; + # echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes + # cd kdamonds/0/contexts/0/schemes/0 + # # set the basic access pattern and the action + # echo 4096 > access_patterns/sz/min + # echo 8192 > access_patterns/sz/max + # echo 0 > access_patterns/nr_accesses/min + # echo 5 > access_patterns/nr_accesses/max + # echo 10 > access_patterns/age/min + # echo 20 > access_patterns/age/max + # echo pageout > action + # # set quotas + # echo 10 > quotas/ms + # echo $((1024*1024*1024)) > quotas/bytes + # echo 1000 > quotas/reset_interval_ms + # # set watermark + # echo free_mem_rate > watermarks/metric + # echo 5000000 > watermarks/interval_us + # echo 600 > watermarks/high + # echo 500 > watermarks/mid + # echo 300 > watermarks/low + +Please note that it's highly recommended to use user space tools like `damo +`_ rather than manually reading and writing +the files as above. Above is only for an example. .. _debugfs_interface: From eda3c29a298168604f0f0312d30f7cad5ed5c426 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 22 Mar 2022 14:50:00 -0700 Subject: [PATCH 456/737] mm/damon/sysfs: remove repeat container_of() in damon_sysfs_kdamond_release() In damon_sysfs_kdamond_release(), we have use container_of() to get "kdamond" pointer, so there no need to get it once again. 
Link: https://lkml.kernel.org/r/20220303075314.22502-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 58bcd2f5b02ae..48e434cd43d8e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2345,7 +2345,7 @@ static void damon_sysfs_kdamond_release(struct kobject *kobj) if (kdamond->damon_ctx) damon_destroy_ctx(kdamond->damon_ctx); - kfree(container_of(kobj, struct damon_sysfs_kdamond, kobj)); + kfree(kdamond); } static struct kobj_attribute damon_sysfs_kdamond_state_attr = From 5c80a2c9df924e6b573dbd6e19e5c557f69d4c04 Mon Sep 17 00:00:00 2001 From: Jonghyeon Kim Date: Fri, 1 Apr 2022 11:28:57 -0700 Subject: [PATCH 457/737] mm/damon: prevent activated scheme from sleeping by deactivated schemes In the DAMON, the minimum wait time of the schemes decides whether the kernel wakes up 'kdamon_fn()'. But since the minimum wait time is initialized to zero, there are corner cases against the original objective. For example, if we have several schemes for one target, and if the wait time of the first scheme is zero, the minimum wait time will set zero, which means 'kdamond_fn()' should wake up to apply this scheme. However, in the following scheme, wait time can be set to non-zero. Thus, the mininum wait time will be set to non-zero, which can cause sleeping this interval for 'kdamon_fn()' due to one deactivated last scheme. This commit prevents making DAMON monitoring inactive state due to other deactivated schemes. Link: https://lkml.kernel.org/r/20220330105302.32114-1-tome01@ajou.ac.kr Signed-off-by: Jonghyeon Kim Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c1e0fed4e877f..5ce8d7c867f04 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1019,12 +1019,15 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) struct damos *s; unsigned long wait_time; unsigned long min_wait_time = 0; + bool init_wait_time = false; while (!kdamond_need_stop(ctx)) { damon_for_each_scheme(s, ctx) { wait_time = damos_wmark_wait_us(s); - if (!min_wait_time || wait_time < min_wait_time) + if (!init_wait_time || wait_time < min_wait_time) { + init_wait_time = true; min_wait_time = wait_time; + } } if (!min_wait_time) return 0; From f3bc9ffe46342ba1123baec1e4de9b215509e54b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:58 -0700 Subject: [PATCH 458/737] Docs/ABI/testing: add DAMON sysfs interface ABI document This commit adds DAMON sysfs interface ABI document under Documentation/ABI/testing. 
Link: https://lkml.kernel.org/r/20220228081314.5770-14-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Shuah Khan Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-kernel-mm-damon | 274 ++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 275 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-damon diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon new file mode 100644 index 0000000000000..9e282065cbcf8 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -0,0 +1,274 @@ +what: /sys/kernel/mm/damon/ +Date: Mar 2022 +Contact: SeongJae Park +Description: Interface for Data Access MONitoring (DAMON). Contains files + for controlling DAMON. For more details on DAMON itself, + please refer to Documentation/admin-guide/mm/damon/index.rst. + +What: /sys/kernel/mm/damon/admin/ +Date: Mar 2022 +Contact: SeongJae Park +Description: Interface for privileged users of DAMON. Contains files for + controlling DAMON that aimed to be used by privileged users. + +What: /sys/kernel/mm/damon/admin/kdamonds/nr_kdamonds +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON worker thread (kdamond) + named '0' to 'N-1' under the kdamonds/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//state +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing 'on' or 'off' to this file makes the kdamond starts or + stops, respectively. Reading the file returns the keywords + based on the current status. Writing 'update_schemes_stats' to + the file updates contents of schemes stats files of the + kdamond. + +What: /sys/kernel/mm/damon/admin/kdamonds//pid +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the pid of the kdamond if it is + running. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts/nr_contexts +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON context named '0' to + 'N-1' under the contexts/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//operations +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a keyword for a monitoring operations set ('vaddr' for + virtual address spaces monitoring, and 'paddr' for the physical + address space monitoring) to this file makes the context to use + the operations set. Reading the file returns the keyword for + the operations set the context is set to use. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the sampling interval of the + DAMON context in microseconds as the value. Reading this file + returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/aggr_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the aggregation interval of + the DAMON context in microseconds as the value. Reading this + file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/update_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the update interval of the + DAMON context in microseconds as the value. 
Reading this file + returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/nr_regions/min + +WDate: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the minimum number of + monitoring regions of the DAMON context as the value. Reading + this file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/nr_regions/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a value to this file sets the maximum number of + monitoring regions of the DAMON context as the value. Reading + this file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets/nr_targets +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON target of the context + named '0' to 'N-1' under the contexts/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//pid_target +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the pid of + the target process if the context is for virtual address spaces + monitoring, respectively. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions/nr_regions +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting each DAMON target memory region of the + context named '0' to 'N-1' under the regions/ directory. In + case of the virtual address space monitoring, DAMON + automatically sets the target memory region based on the target + processes' mappings. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions//start +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the start + address of the monitoring region. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//targets//regions//end +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the end + address of the monitoring region. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes/nr_schemes +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for controlling each DAMON-based operation scheme + of the context named '0' to 'N-1' under the schemes/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//action +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the action + of the scheme. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/sz/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the mimimum + size of the scheme's target regions in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/sz/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the maximum + size of the scheme's target regions in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/nr_accesses/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the manimum + 'nr_accesses' of the scheme's target regions. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/nr_accesses/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the maximum + 'nr_accesses' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/age/min +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the minimum + 'age' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/age/max +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the maximum + 'age' of the scheme's target regions. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/ms +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the time + quota of the scheme in milliseconds. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/bytes +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the size + quota of the scheme in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/reset_interval_ms +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the quotas + charge reset interval of the scheme in milliseconds. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/sz_permil +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + under-quota limit regions prioritization weight for 'size' in + permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/nr_accesses_permil +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + under-quota limit regions prioritization weight for + 'nr_accesses' in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/age_permil +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + under-quota limit regions prioritization weight for 'age' in + permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/metric +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the metric + of the watermarks for the scheme. The writable/readable + keywords for this file are 'none' for disabling the watermarks + feature, or 'free_mem_rate' for the system's global free memory + rate in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/interval_us +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the metric + check interval of the watermarks for the scheme in + microseconds. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/high +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the high + watermark of the scheme in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/mid +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the mid + watermark of the scheme in permil. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//watermarks/low +Date: Mar 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the low + watermark of the scheme in permil. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the number of regions that the action + of the scheme has tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/sz_tried +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the total size of regions that the + action of the scheme has tried to be applied in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_applied +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the number of regions that the action + of the scheme has successfully applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/sz_applied +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the total size of regions that the + action of the scheme has successfully applied in bytes. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/qt_exceeds +Date: Mar 2022 +Contact: SeongJae Park +Description: Reading this file returns the number of the exceed events of + the scheme's quotas. diff --git a/MAINTAINERS b/MAINTAINERS index e6e4093db8823..5d3220850dfc6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4876,6 +4876,7 @@ DATA ACCESS MONITOR M: SeongJae Park L: linux-mm@kvack.org S: Maintained +F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ F: Documentation/vm/damon/ F: include/linux/damon.h From 9928e5416d5d620c4f70772d200765ced0fd5b37 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: [PATCH 459/737] damon: vaddr-test: tweak code to make the logic clearer Move these two lines into the damon_for_each_region loop, it is always for testing the last region. And also avoid to use a list iterator 'r' outside the loop which is considered harmful[1]. [1]: https://lkml.org/lkml/2022/2/17/1032 Link: https://lkml.kernel.org/r/20220328115252.31675-1-xiam0nd.tong@gmail.com Signed-off-by: Xiaomeng Tong Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1a55bb6c36c3d..5431da4fe9d41 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -281,14 +281,16 @@ static void damon_test_split_evenly_succ(struct kunit *test, KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); damon_for_each_region(r, t) { - if (i == nr_pieces - 1) + if (i == nr_pieces - 1) { + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); break; + } KUNIT_EXPECT_EQ(test, r->ar.start, start + i++ * expected_width); KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); } - KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width); - KUNIT_EXPECT_EQ(test, r->ar.end, end); damon_free_target(t); } From dcc1ba4ab8bc8fb4cd4f3b8a1f507de72eaaaf3c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: [PATCH 460/737] mm/damon/core-test: add a kunit test case for ops registration This commit adds a simple kunit test case for DAMON operations registration feature. 
Link: https://lkml.kernel.org/r/20220419122225.290518-1-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index b4085deb9fa05..573669566f846 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -232,6 +232,41 @@ static void damon_test_split_regions_of(struct kunit *test) damon_destroy_ctx(c); } +static void damon_test_ops_registration(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_operations ops, bak; + + /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0); + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0); + + /* Double-registration is prohibited */ + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + ops.id = DAMON_OPS_PADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + + /* Unknown ops id cannot be registered */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, NR_DAMON_OPS), -EINVAL); + + /* Registration should success after unregistration */ + mutex_lock(&damon_ops_lock); + bak = damon_registered_ops[DAMON_OPS_VADDR]; + damon_registered_ops[DAMON_OPS_VADDR] = (struct damon_operations){}; + mutex_unlock(&damon_ops_lock); + + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), 0); + + mutex_lock(&damon_ops_lock); + damon_registered_ops[DAMON_OPS_VADDR] = bak; + mutex_unlock(&damon_ops_lock); + + /* Check double-registration failure again */ + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -240,6 +275,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_two), KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), + KUNIT_CASE(damon_test_ops_registration), {}, }; From dbdb53808b20f47e3cad3893c4d0770376ed3abf Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: [PATCH 461/737] mm/damon: remove unnecessary type castings Remove unnecessary void* type castings. Link: https://lkml.kernel.org/r/20220421153056.8474-1-yuzhe@nfschina.com Signed-off-by: Yu Zhe Cc: SeongJae Park Cc: liqiong Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 5ce8d7c867f04..5fe42e47c57bd 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1042,7 +1042,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) */ static int kdamond_fn(void *data) { - struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_ctx *ctx = data; struct damon_target *t; struct damon_region *r, *next; unsigned int max_nr_accesses = 0; From d16016df4f59af6faf9ca36d213e550d3f42a40c Mon Sep 17 00:00:00 2001 From: Hailong Tu Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: [PATCH 462/737] mm/damon/reclaim: fix the timer always stays active The timer stays active even if the reclaim mechanism is never enabled. It is unnecessary overhead can be completely avoided by using module_param_cb() for enabled flag. 
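For reference, a minimal sketch of how the runtime toggle can be exercised once this change is applied. The module parameter path below follows the usual /sys/module layout for a module named damon_reclaim; that path is an assumption, not something stated in this patch, so adjust it to the module name used by your build:

    # # read the current state of the knob (assumed path)
    # cat /sys/module/damon_reclaim/parameters/enabled
    N
    # # enabling schedules damon_reclaim_timer immediately via enabled_store()
    # echo Y > /sys/module/damon_reclaim/parameters/enabled
    # # disabling lets the timer stop re-arming itself
    # echo N > /sys/module/damon_reclaim/parameters/enabled
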
Link: https://lkml.kernel.org/r/20220421125910.1052459-1-tuhailong@gmail.com Signed-off-by: Hailong Tu Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e34c4d0c4d939..75cfd96a6060f 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -28,7 +28,6 @@ * this. */ static bool enabled __read_mostly; -module_param(enabled, bool, 0600); /* * Time threshold for cold memory regions identification in microseconds. @@ -358,11 +357,35 @@ static void damon_reclaim_timer_fn(struct work_struct *work) enabled = last_enabled; } - schedule_delayed_work(&damon_reclaim_timer, + if (enabled) + schedule_delayed_work(&damon_reclaim_timer, msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (enabled) + schedule_delayed_work(&damon_reclaim_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_RECLAIM (default: disabled)"); + static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; From 16cc855b1852e5813298f5f69c4b9a9ef9f7deb0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: [PATCH 463/737] mm/damon/core: add a function for damon_operations registration checks Patch series "mm/damon: allow users know which monitoring ops are available". DAMON users can configure it for vaious address spaces including virtual address spaces and the physical address space by setting its monitoring operations set with appropriate one for their purpose. However, there is no celan and simple way to know exactly which monitoring operations sets are available on the currently running kernel. This patchset adds functions for the purpose on DAMON's kernel API ('damon_is_registered_ops()') and sysfs interface ('avail_operations' file under each context directory). This patch (of 4): To know if a specific 'damon_operations' is registered, users need to check the kernel config or try 'damon_select_ops()' with the ops of the question, and then see if it successes. In the latter case, the user should also revert the change. To make the process simple and convenient, this commit adds a function for checking if a specific 'damon_operations' is registered or not. 
Link: https://lkml.kernel.org/r/20220426203843.45238-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426203843.45238-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 24 +++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f23cbfa4248d4..73ff0e2d2a4db 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -509,6 +509,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); +bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); diff --git a/mm/damon/core.c b/mm/damon/core.c index 5fe42e47c57bd..997cf7b17779d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -30,7 +30,7 @@ static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ -static bool damon_registered_ops_id(enum damon_ops_id id) +static bool __damon_is_registered_ops(enum damon_ops_id id) { struct damon_operations empty_ops = {}; @@ -39,6 +39,24 @@ static bool damon_registered_ops_id(enum damon_ops_id id) return true; } +/** + * damon_is_registered_ops() - Check if a given damon_operations is registered. + * @id: Id of the damon_operations to check if registered. + * + * Return: true if the ops is set, false otherwise. + */ +bool damon_is_registered_ops(enum damon_ops_id id) +{ + bool registered; + + if (id >= NR_DAMON_OPS) + return false; + mutex_lock(&damon_ops_lock); + registered = __damon_is_registered_ops(id); + mutex_unlock(&damon_ops_lock); + return registered; +} + /** * damon_register_ops() - Register a monitoring operations set to DAMON. * @ops: monitoring operations set to register. @@ -56,7 +74,7 @@ int damon_register_ops(struct damon_operations *ops) return -EINVAL; mutex_lock(&damon_ops_lock); /* Fail for already registered ops */ - if (damon_registered_ops_id(ops->id)) { + if (__damon_is_registered_ops(ops->id)) { err = -EINVAL; goto out; } @@ -84,7 +102,7 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return -EINVAL; mutex_lock(&damon_ops_lock); - if (!damon_registered_ops_id(id)) + if (!__damon_is_registered_ops(id)) err = -EINVAL; else ctx->ops = damon_registered_ops[id]; From 5d0cb80f324cdd59dc5ceda7e6919dc650ba0313 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: [PATCH 464/737] mm/damon/sysfs: add a file for listing available monitoring ops DAMON programming interface users can know if specific monitoring ops set is registered or not using 'damon_is_registered_ops()', but there is no such method for the user space. To help the case, this commit adds a new DAMON sysfs file called 'avail_operations' under each context directory for listing available monitoring ops. Reading the file will list each registered monitoring ops on each line. 
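As a usage sketch (not part of this patch), reading the new file through the admin directory documented later in this series could look like the following; the example output assumes a kernel with only the vaddr and paddr operations sets built in:

    # cd /sys/kernel/mm/damon/admin/
    # echo 1 > kdamonds/nr_kdamonds
    # echo 1 > kdamonds/0/contexts/nr_contexts
    # cat kdamonds/0/contexts/0/avail_operations
    vaddr
    paddr
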
Link: https://lkml.kernel.org/r/20220426203843.45238-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 48e434cd43d8e..6ad6364780b8d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1810,6 +1810,21 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) kobject_put(&context->schemes->kobj); } +static ssize_t avail_operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + enum damon_ops_id id; + int len = 0; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (!damon_is_registered_ops(id)) + continue; + len += sysfs_emit_at(buf, len, "%s\n", + damon_sysfs_ops_strs[id]); + } + return len; +} + static ssize_t operations_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1840,10 +1855,14 @@ static void damon_sysfs_context_release(struct kobject *kobj) kfree(container_of(kobj, struct damon_sysfs_context, kobj)); } +static struct kobj_attribute damon_sysfs_context_avail_operations_attr = + __ATTR_RO_MODE(avail_operations, 0400); + static struct kobj_attribute damon_sysfs_context_operations_attr = __ATTR_RW_MODE(operations, 0600); static struct attribute *damon_sysfs_context_attrs[] = { + &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, NULL, }; From f7e549bd3f640f3e4794dfc97cab8d659bd0355e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: [PATCH 465/737] selftets/damon/sysfs: test existence and permission of avail_operations This commit adds a selftest test case for ensuring the existence and the permission (read-only) of the 'avail_oprations' DAMON sysfs file. Link: https://lkml.kernel.org/r/20220426203843.45238-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 2e3ae77cb6db7..89592c64462f8 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -231,6 +231,7 @@ test_context() { context_dir=$1 ensure_dir "$context_dir" "exist" + ensure_file "$context_dir/avail_operations" "exit" 400 ensure_file "$context_dir/operations" "exist" 600 test_monitoring_attrs "$context_dir/monitoring_attrs" test_targets "$context_dir/targets" From ee024cc76c78a219f1dd9022c94a430b94e26294 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: [PATCH 466/737] Docs/{ABI,admin-guide}/damon: document 'avail_operations' sysfs file This commit updates the DAMON ABI and usage documents for the new sysfs file, 'avail_operations'. 
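To exercise the selftest added earlier in this series together with the documented file, one option is the standard kselftest entry points; the invocations below are the common kselftest pattern and are assumptions rather than something taken from these patches:

    # # run the DAMON selftests through the kselftest harness (as root)
    # make -C tools/testing/selftests TARGETS=damon run_tests
    # # or run the sysfs test script directly
    # cd tools/testing/selftests/damon && ./sysfs.sh
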
Link: https://lkml.kernel.org/r/20220426203843.45238-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 10 +++++++++- Documentation/admin-guide/mm/damon/usage.rst | 18 ++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 9e282065cbcf8..d724b8a12228b 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -40,6 +40,12 @@ Description: Writing a number 'N' to this file creates the number of directories for controlling each DAMON context named '0' to 'N-1' under the contexts/ directory. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//avail_operations +Date: Apr 2022 +Contact: SeongJae Park +Description: Reading this file returns the available monitoring operations + sets on the currently running kernel. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//operations Date: Mar 2022 Contact: SeongJae Park @@ -47,7 +53,9 @@ Description: Writing a keyword for a monitoring operations set ('vaddr' for virtual address spaces monitoring, and 'paddr' for the physical address space monitoring) to this file makes the context to use the operations set. Reading the file returns the keyword for - the operations set the context is set to use. + the operations set the context is set to use. Note that only + the operations sets that listed in 'avail_operations' file are + valid inputs. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us Date: Mar 2022 diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 592ea9a508812..af6ffaea567bb 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -68,7 +68,7 @@ comma (","). :: │ kdamonds/nr_kdamonds │ │ 0/state,pid │ │ │ contexts/nr_contexts - │ │ │ │ 0/operations + │ │ │ │ 0/avail_operations,operations │ │ │ │ │ monitoring_attrs/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ nr_regions/min,max @@ -143,17 +143,23 @@ be written to the file. contexts// ------------- -In each context directory, one file (``operations``) and three directories -(``monitoring_attrs``, ``targets``, and ``schemes``) exist. +In each context directory, two files (``avail_operations`` and ``operations``) +and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) +exist. DAMON supports multiple types of monitoring operations, including those for -virtual address space and the physical address space. You can set and get what -type of monitoring operations DAMON will use for the context by writing one of -below keywords to, and reading from the file. +virtual address space and the physical address space. You can get the list of +available monitoring operations set on the currently running kernel by reading +``avail_operations`` file. Based on the kernel configuration, the file will +list some or all of below keywords. - vaddr: Monitor virtual address spaces of specific processes - paddr: Monitor the physical address space of the system +You can set and get what type of monitoring operations DAMON will use for the +context by writing one of the keywords listed in ``avail_operations`` file and +reading from the ``operations`` file. 
+ contexts//monitoring_attrs/ ------------------------------ From dc1ee5d3e7ea54a7573a1eb25ec5b4b6dbfa693a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: [PATCH 467/737] mm/damon/vaddr: register a damon_operations for fixed virtual address ranges monitoring Patch series "support fixed virtual address ranges monitoring". The monitoring operations set for virtual address spaces automatically updates the monitoring target regions to cover entire mappings of the virtual address spaces as much as possible. Some users could have more information about their programs than kernel and therefore have interest in not entire regions but only specific regions. For such cases, the automatic monitoring target regions updates are only unnecessary overhead or distractions. This patchset adds supports for the use case on DAMON's kernel API (DAMON_OPS_FVADDR) and sysfs interface ('fvaddr' keyword for 'operations' sysfs file). This patch (of 3): The monitoring operations set for virtual address spaces automatically updates the monitoring target regions to cover entire mappings of the virtual address spaces as much as possible. Some users could have more information about their programs than kernel and therefore have interest in not entire regions but only specific regions. For such cases, the automatic monitoring target regions updates are only unnecessary overheads or distractions. For such cases, DAMON's API users can simply set the '->init()' and '->update()' of the DAMON context's '->ops' NULL, and set the target monitoring regions when creating the context. But, that would be a dirty hack. Worse yet, the hack is unavailable for DAMON user space interface users. To support the use case in a clean way that can easily exported to the user space, this commit adds another monitoring operations set called 'fvaddr', which is same to 'vaddr' but does not automatically update the monitoring regions. Instead, it will only respect the virtual address regions which have explicitly passed at the initial context creation. Note that this commit leave sysfs interface not supporting the feature yet. The support will be made in a following commit. 
Link: https://lkml.kernel.org/r/20220426231750.48822-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426231750.48822-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ mm/damon/sysfs.c | 4 ++++ mm/damon/vaddr.c | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 73ff0e2d2a4db..09a5d0d02c000 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -261,10 +261,13 @@ struct damos { * enum damon_ops_id - Identifier for each monitoring operations implementation * * @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces + * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual + * address spaces * @DAMON_OPS_PADDR: Monitoring operations for the physical address space */ enum damon_ops_id { DAMON_OPS_VADDR, + DAMON_OPS_FVADDR, DAMON_OPS_PADDR, NR_DAMON_OPS, }; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6ad6364780b8d..719a286d378f2 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1694,6 +1694,7 @@ static struct kobj_type damon_sysfs_attrs_ktype = { /* This should match with enum damon_ops_id */ static const char * const damon_sysfs_ops_strs[] = { "vaddr", + "unsupported", /* fvaddr is not supported by sysfs yet */ "paddr", }; @@ -1843,6 +1844,9 @@ static ssize_t operations_store(struct kobject *kobj, for (id = 0; id < NR_DAMON_OPS; id++) { if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { + /* fvaddr is not supported by sysfs yet */ + if (id == DAMON_OPS_FVADDR) + return -EINVAL; context->ops_id = id; return count; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b2ec0aa1ff451..5ba82ab4943bc 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -753,8 +753,19 @@ static int __init damon_va_initcall(void) .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, }; - - return damon_register_ops(&ops); + /* ops for fixed virtual address ranges */ + struct damon_operations ops_fvaddr = ops; + int err; + + /* Don't set the monitoring target regions for the entire mapping */ + ops_fvaddr.id = DAMON_OPS_FVADDR; + ops_fvaddr.init = NULL; + ops_fvaddr.update = NULL; + + err = damon_register_ops(&ops); + if (err) + return err; + return damon_register_ops(&ops_fvaddr); }; subsys_initcall(damon_va_initcall); From d09a378d24586cc55321597433faef985602b3ba Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: [PATCH 468/737] mm/damon/sysfs: support fixed virtual address ranges monitoring This commit makes DAMON sysfs interface to support the fixed virtual address ranges monitoring. After this commit, writing 'fvaddr' to the 'operations' DAMON sysfs file makes DAMON uses the monitoring operations set for fixed virtual address ranges, so that users can monitor accesses to only interested virtual address ranges. 
[sj@kernel.org: fix pid leak under fvaddr ops use case] Link: https://lkml.kernel.org/r/20220503220531.45913-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426231750.48822-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 719a286d378f2..f753bb4051017 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1694,7 +1694,7 @@ static struct kobj_type damon_sysfs_attrs_ktype = { /* This should match with enum damon_ops_id */ static const char * const damon_sysfs_ops_strs[] = { "vaddr", - "unsupported", /* fvaddr is not supported by sysfs yet */ + "fvaddr", "paddr", }; @@ -1844,9 +1844,6 @@ static ssize_t operations_store(struct kobject *kobj, for (id = 0; id < NR_DAMON_OPS; id++) { if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { - /* fvaddr is not supported by sysfs yet */ - if (id == DAMON_OPS_FVADDR) - return -EINVAL; context->ops_id = id; return count; } @@ -2089,7 +2086,8 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (ctx->ops.id == DAMON_OPS_VADDR) + if (ctx->ops.id == DAMON_OPS_VADDR || + ctx->ops.id == DAMON_OPS_FVADDR) put_pid(t->pid); damon_destroy_target(t); } @@ -2136,7 +2134,8 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_sysfs_destroy_targets(ctx); return -ENOMEM; } - if (ctx->ops.id == DAMON_OPS_VADDR) { + if (ctx->ops.id == DAMON_OPS_VADDR || + ctx->ops.id == DAMON_OPS_FVADDR) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) { damon_sysfs_destroy_targets(ctx); @@ -2206,7 +2205,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (ctx->ops.id != DAMON_OPS_VADDR) + if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) return; mutex_lock(&ctx->kdamond_lock); From 459fcd133c1f275fb4f0148138fbff9173315164 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: [PATCH 469/737] Docs/{ABI,admin-guide}/damon: update for fixed virtual address ranges monitoring This commit documents the user space support of the newly added monitoring operations set for fixed virtual address ranges monitoring, namely 'fvaddr', on the ABI and usage documents for DAMON. Link: https://lkml.kernel.org/r/20220426231750.48822-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 14 ++++++++------ Documentation/admin-guide/mm/damon/usage.rst | 14 +++++++++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index d724b8a12228b..fab97ea225691 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -50,12 +50,14 @@ What: /sys/kernel/mm/damon/admin/kdamonds//contexts//operations Date: Mar 2022 Contact: SeongJae Park Description: Writing a keyword for a monitoring operations set ('vaddr' for - virtual address spaces monitoring, and 'paddr' for the physical - address space monitoring) to this file makes the context to use - the operations set. Reading the file returns the keyword for - the operations set the context is set to use. Note that only - the operations sets that listed in 'avail_operations' file are - valid inputs. 
+ virtual address spaces monitoring, 'fvaddr' for fixed virtual + address ranges monitoring, and 'paddr' for the physical address + space monitoring) to this file makes the context to use the + operations set. Reading the file returns the keyword for the + operations set the context is set to use. + + Note that only the operations sets that listed in + 'avail_operations' file are valid inputs. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us Date: Mar 2022 diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index af6ffaea567bb..9c67311a79d8c 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -154,8 +154,13 @@ available monitoring operations set on the currently running kernel by reading list some or all of below keywords. - vaddr: Monitor virtual address spaces of specific processes + - fvaddr: Monitor fixed virtual address ranges - paddr: Monitor the physical address space of the system +Please refer to :ref:`regions sysfs directory ` for detailed +differences between the operations sets in terms of the monitoring target +regions. + You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and reading from the ``operations`` file. @@ -198,6 +203,8 @@ If you wrote ``vaddr`` to the ``contexts//operations``, each target should be a process. You can specify the process to DAMON by writing the pid of the process to the ``pid_target`` file. +.. _sysfs_regions: + targets//regions ------------------- @@ -208,9 +215,10 @@ can be covered. However, users could want to set the initial monitoring region to specific address ranges. In contrast, DAMON do not automatically sets and updates the monitoring target -regions when ``paddr`` monitoring operations set is being used (``paddr`` is -written to the ``contexts//operations``). Therefore, users should set the -monitoring target regions by themselves in the case. +regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used +(``fvaddr`` or ``paddr`` have written to the ``contexts//operations``). +Therefore, users should set the monitoring target regions by themselves in the +cases. For such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the files under this directory. From 4a62c93d7f293108903289caca6613b81bfb5428 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: [PATCH 470/737] mm/damon/core: add a new callback for watermarks checks Patch series "mm/damon: Support online tuning". Effects of DAMON and DAMON-based Operation Schemes highly depends on the configurations. Wrong configurations could even result in unexpected efficiency degradations. For finding a best configuration, repeating incremental configuration changes and results measurements, in other words, online tuning, could be helpful. Nevertheless, DAMON kernel API supports only restrictive online tuning. Worse yet, the sysfs-based DAMON user interface doesn't support online tuning at all. DAMON_RECLAIM also doesn't support online tuning. This patchset makes the DAMON kernel API, DAMON sysfs interface, and DAMON_RECLAIM supports online tuning. Sequence of patches ------------------- First two patches enhance DAMON online tuning for kernel API users. 
Specifically, patch 1 let kernel API users to be able to do DAMON online tuning without a restriction, and patch 2 makes error handling easier. Following seven patches (patches 3-9) refactor code for better readability and easier reuse of code fragments that will be useful for online tuning support. Patch 10 introduces DAMON callback based user request handling structure for DAMON sysfs interface, and patch 11 enables DAMON online tuning via DAMON sysfs interface. Documentation patch (patch 12) for usage of it follows. Patch 13 enables online tuning of DAMON_RECLAIM and finally patch 14 documents the DAMON_RECLAIM online tuning usage. This patch (of 14): For updating input parameters for running DAMON contexts, DAMON kernel API users can use the contexts' callbacks, as it is the safe place for context internal data accesses. When the context has DAMON-based operation schemes and all schemes are deactivated due to their watermarks, however, DAMON does nothing but only watermarks checks. As a result, no callbacks will be called back, and therefore the kernel API users cannot update the input parameters including monitoring attributes, DAMON-based operation schemes, and watermarks. To let users easily update such DAMON input parameters in such a case, this commit adds a new callback, 'after_wmarks_check()'. It will be called after each watermarks check. Users can do the online input parameters update in the callback even under the schemes deactivated case. Link: https://lkml.kernel.org/r/20220429160606.127307-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ mm/damon/core.c | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 09a5d0d02c000..6cb5ab5d8e9d5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -343,6 +343,7 @@ struct damon_operations { * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. + * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_terminate: Called before terminating the monitoring. @@ -353,6 +354,11 @@ struct damon_operations { * respectively. Therefore, those are good places for installing and cleaning * @private. * + * The monitoring thread calls @after_wmarks_check after each DAMON-based + * operation schemes' watermarks check. If users need to make changes to the + * attributes of the monitoring context while it's deactivated due to the + * watermarks, this is the good place to do. + * * The monitoring thread calls @after_sampling and @after_aggregation for each * of the sampling intervals and aggregation intervals, respectively. 
* Therefore, users can safely access the monitoring results without additional @@ -365,6 +371,7 @@ struct damon_callback { void *private; int (*before_start)(struct damon_ctx *context); + int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); void (*before_terminate)(struct damon_ctx *context); diff --git a/mm/damon/core.c b/mm/damon/core.c index 997cf7b17779d..44fe7e452a1ef 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1051,6 +1051,10 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) return 0; kdamond_usleep(min_wait_time); + + if (ctx->callback.after_wmarks_check && + ctx->callback.after_wmarks_check(ctx)) + break; } return -EBUSY; } @@ -1077,8 +1081,10 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) + if (kdamond_wait_activation(ctx)) { + done = true; continue; + } if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); From 60308444960f4b2efcbfe65be0a4d928f988ba1e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: [PATCH 471/737] mm/damon/core: finish kdamond as soon as any callback returns an error When 'after_sampling()' or 'after_aggregation()' DAMON callbacks return an error, kdamond continues the remaining loop once. It makes no much sense to run the remaining part while something wrong already happened. The context might be corrupted or having invalid data. This commit therefore makes kdamond skips the remaining works and immediately finish in the cases. Link: https://lkml.kernel.org/r/20220429160606.127307-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 44fe7e452a1ef..b6daaff37bece 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1089,8 +1089,10 @@ static int kdamond_fn(void *data) if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) + ctx->callback.after_sampling(ctx)) { done = true; + continue; + } kdamond_usleep(ctx->sample_interval); @@ -1102,8 +1104,10 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) + ctx->callback.after_aggregation(ctx)) { done = true; + continue; + } kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); From 9ecd7a7d524149d257221e764213f56cbdb8863a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: [PATCH 472/737] mm/damon/vaddr: generalize damon_va_apply_three_regions() 'damon_va_apply_three_regions()' is for adjusting address ranges to fit in three discontiguous ranges. The function can be generalized for arbitrary number of discontiguous ranges and reused for future usage, such as arbitrary online regions update. For such future usage, this commit introduces a generalized version of the function called 'damon_set_regions()'. 
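As an illustration only (not part of this change), a future caller could use the generalized function roughly as below, once it is exposed beyond vaddr as done later in this series. The target pointer 't' and the two address ranges are assumptions made up for the example:

    struct damon_addr_range ranges[2] = {
            { .start = 0x100000, .end = 0x200000 },
            { .start = 0x800000, .end = 0x900000 },
    };
    int err;

    /* 't' is a struct damon_target that has already been set up */
    err = damon_set_regions(t, ranges, 2);
    if (err)
            return err;     /* e.g. -ENOMEM if a new region allocation failed */

As with the three-regions case, the function aligns the given ranges to DAMON_MIN_REGION internally.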
Link: https://lkml.kernel.org/r/20220429160606.127307-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 66 ++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 5ba82ab4943bc..26e9ad80f9ea5 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -297,59 +297,77 @@ static bool damon_intersect(struct damon_region *r, } /* - * Update damon regions for the three big regions of the given target + * damon_set_regions() - Set regions of a target for given address ranges. + * @t: the given target. + * @ranges: array of new monitoring target ranges. + * @nr_ranges: length of @ranges. * - * t the given target - * bregions the three big regions of the target + * This function adds new regions to, or modify existing regions of a + * monitoring target to fit in specific ranges. + * + * Return: 0 if success, or negative error code otherwise. */ -static void damon_va_apply_three_regions(struct damon_target *t, - struct damon_addr_range bregions[3]) +static int damon_set_regions(struct damon_target *t, + struct damon_addr_range *ranges, unsigned int nr_ranges) { struct damon_region *r, *next; unsigned int i; - /* Remove regions which are not in the three big regions now */ + /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { - for (i = 0; i < 3; i++) { - if (damon_intersect(r, &bregions[i])) + for (i = 0; i < nr_ranges; i++) { + if (damon_intersect(r, &ranges[i])) break; } - if (i == 3) + if (i == nr_ranges) damon_destroy_region(r, t); } - /* Adjust intersecting regions to fit with the three big regions */ - for (i = 0; i < 3; i++) { - struct damon_region *first = NULL, *last; - struct damon_region *newr; - struct damon_addr_range *br; + /* Add new regions or resize existing regions to fit in the ranges */ + for (i = 0; i < nr_ranges; i++) { + struct damon_region *first = NULL, *last, *newr; + struct damon_addr_range *range; - br = &bregions[i]; - /* Get the first and last regions which intersects with br */ + range = &ranges[i]; + /* Get the first/last regions intersecting with the range */ damon_for_each_region(r, t) { - if (damon_intersect(r, br)) { + if (damon_intersect(r, range)) { if (!first) first = r; last = r; } - if (r->ar.start >= br->end) + if (r->ar.start >= range->end) break; } if (!first) { - /* no damon_region intersects with this big region */ + /* no region intersects with this range */ newr = damon_new_region( - ALIGN_DOWN(br->start, + ALIGN_DOWN(range->start, DAMON_MIN_REGION), - ALIGN(br->end, DAMON_MIN_REGION)); + ALIGN(range->end, DAMON_MIN_REGION)); if (!newr) - continue; + return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { - first->ar.start = ALIGN_DOWN(br->start, + /* resize intersecting regions to fit in this range */ + first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); - last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); } } + return 0; +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + damon_set_regions(t, bregions, 3); } /* From 82dc4f8fbe3a87bdcbddff177ba1ba8822df3ed2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: [PATCH 473/737] 
mm/damon/vaddr: move 'damon_set_regions()' to core This commit moves 'damon_set_regions()' from vaddr to core, as it is aimed to be used by not only 'vaddr' but also other parts of DAMON. Link: https://lkml.kernel.org/r/20220429160606.127307-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 73 +++++++++++++++++++++++++++++++++++++++++++ mm/damon/vaddr.c | 73 ------------------------------------------- 3 files changed, 75 insertions(+), 73 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 6cb5ab5d8e9d5..d1e6ee28a2fff 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -494,6 +494,8 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges); struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, diff --git a/mm/damon/core.c b/mm/damon/core.c index b6daaff37bece..7d25dc582fe34 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -157,6 +157,79 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * damon_set_regions() - Set regions of a target for given address ranges. + * @t: the given target. + * @ranges: array of new monitoring target ranges. + * @nr_ranges: length of @ranges. + * + * This function adds new regions to, or modify existing regions of a + * monitoring target to fit in specific ranges. + * + * Return: 0 if success, or negative error code otherwise. 
+ */ +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges) +{ + struct damon_region *r, *next; + unsigned int i; + + /* Remove regions which are not in the new ranges */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < nr_ranges; i++) { + if (damon_intersect(r, &ranges[i])) + break; + } + if (i == nr_ranges) + damon_destroy_region(r, t); + } + + /* Add new regions or resize existing regions to fit in the ranges */ + for (i = 0; i < nr_ranges; i++) { + struct damon_region *first = NULL, *last, *newr; + struct damon_addr_range *range; + + range = &ranges[i]; + /* Get the first/last regions intersecting with the range */ + damon_for_each_region(r, t) { + if (damon_intersect(r, range)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= range->end) + break; + } + if (!first) { + /* no region intersects with this range */ + newr = damon_new_region( + ALIGN_DOWN(range->start, + DAMON_MIN_REGION), + ALIGN(range->end, DAMON_MIN_REGION)); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + /* resize intersecting regions to fit in this range */ + first->ar.start = ALIGN_DOWN(range->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + } + } + return 0; +} + struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 26e9ad80f9ea5..fcb44210204a0 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -285,79 +285,6 @@ static void damon_va_init(struct damon_ctx *ctx) * Functions for the dynamic monitoring target regions update */ -/* - * Check whether a region is intersecting an address range - * - * Returns true if it is. - */ -static bool damon_intersect(struct damon_region *r, - struct damon_addr_range *re) -{ - return !(r->ar.end <= re->start || re->end <= r->ar.start); -} - -/* - * damon_set_regions() - Set regions of a target for given address ranges. - * @t: the given target. - * @ranges: array of new monitoring target ranges. - * @nr_ranges: length of @ranges. - * - * This function adds new regions to, or modify existing regions of a - * monitoring target to fit in specific ranges. - * - * Return: 0 if success, or negative error code otherwise. 
- */ -static int damon_set_regions(struct damon_target *t, - struct damon_addr_range *ranges, unsigned int nr_ranges) -{ - struct damon_region *r, *next; - unsigned int i; - - /* Remove regions which are not in the new ranges */ - damon_for_each_region_safe(r, next, t) { - for (i = 0; i < nr_ranges; i++) { - if (damon_intersect(r, &ranges[i])) - break; - } - if (i == nr_ranges) - damon_destroy_region(r, t); - } - - /* Add new regions or resize existing regions to fit in the ranges */ - for (i = 0; i < nr_ranges; i++) { - struct damon_region *first = NULL, *last, *newr; - struct damon_addr_range *range; - - range = &ranges[i]; - /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { - if (damon_intersect(r, range)) { - if (!first) - first = r; - last = r; - } - if (r->ar.start >= range->end) - break; - } - if (!first) { - /* no region intersects with this range */ - newr = damon_new_region( - ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); - if (!newr) - return -ENOMEM; - damon_insert_region(newr, damon_prev_region(r), r, t); - } else { - /* resize intersecting regions to fit in this range */ - first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); - } - } - return 0; -} - /* * Update damon regions for the three big regions of the given target * From a8f8fb4560847d55cdbcd7249875c45e455190ab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: [PATCH 474/737] mm/damon/vaddr: remove damon_va_apply_three_regions() 'damon_va_apply_three_regions()' is just a wrapper of its general version, 'damon_set_regions()'. This commit replaces the wrapper calls to directly call the general version. Link: https://lkml.kernel.org/r/20220429160606.127307-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 6 +++--- mm/damon/vaddr.c | 18 +----------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 5431da4fe9d41..d4f55f3491007 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -109,7 +109,7 @@ static struct damon_region *__nth_region_of(struct damon_target *t, int idx) } /* - * Test 'damon_va_apply_three_regions()' + * Test 'damon_set_regions()' * * test kunit object * regions an array containing start/end addresses of current @@ -124,7 +124,7 @@ static struct damon_region *__nth_region_of(struct damon_target *t, int idx) * the change, DAMON periodically reads the mappings, simplifies it to the * three regions, and updates the monitoring target regions to fit in the three * regions. The update of current target regions is the role of - * 'damon_va_apply_three_regions()'. + * 'damon_set_regions()'. 
* * This test passes the given target regions and the new three regions that * need to be applied to the function and check whether it updates the regions @@ -145,7 +145,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_va_apply_three_regions(t, three_regions); + damon_set_regions(t, three_regions, 3); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index fcb44210204a0..77f326323e747 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -281,22 +281,6 @@ static void damon_va_init(struct damon_ctx *ctx) } } -/* - * Functions for the dynamic monitoring target regions update - */ - -/* - * Update damon regions for the three big regions of the given target - * - * t the given target - * bregions the three big regions of the target - */ -static void damon_va_apply_three_regions(struct damon_target *t, - struct damon_addr_range bregions[3]) -{ - damon_set_regions(t, bregions, 3); -} - /* * Update regions for current memory mappings */ @@ -308,7 +292,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_va_apply_three_regions(t, three_regions); + damon_set_regions(t, three_regions, 3); } } From f09130af35444f07e34cb36613ea1397928fd791 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: [PATCH 475/737] mm/damon/sysfs: prohibit multiple physical address space monitoring targets Having multiple targets for physical address space monitoring makes no sense. This commit prohibits such a ridiculous DAMON context setup my making the DAMON context build function to check and return an error for the case. Link: https://lkml.kernel.org/r/20220429160606.127307-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f753bb4051017..3d2da791ebd84 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2125,6 +2125,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, { int i, err; + /* Multiple physical address space monitoring targets makes no sense */ + if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) + return -EINVAL; + for (i = 0; i < sysfs_targets->nr; i++) { struct damon_sysfs_target *sys_target = sysfs_targets->targets_arr[i]; From ca166fc129e3cd91ee833a5e6648ae694282942e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: [PATCH 476/737] mm/damon/sysfs: move targets setup code to a separated function This commit separates DAMON sysfs interface's monitoring context targets setup code to a new function for better readability. 
Link: https://lkml.kernel.org/r/20220429160606.127307-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 49 +++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 3d2da791ebd84..a9d4f93899034 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2120,6 +2120,31 @@ static int damon_sysfs_set_regions(struct damon_target *t, return 0; } +static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, + struct damon_ctx *ctx) +{ + struct damon_target *t = damon_new_target(); + int err = -EINVAL; + + if (!t) + return -ENOMEM; + if (ctx->ops.id == DAMON_OPS_VADDR || + ctx->ops.id == DAMON_OPS_FVADDR) { + t->pid = find_get_pid(sys_target->pid); + if (!t->pid) + goto destroy_targets_out; + } + damon_add_target(ctx, t); + err = damon_sysfs_set_regions(t, sys_target->regions); + if (err) + goto destroy_targets_out; + return 0; + +destroy_targets_out: + damon_sysfs_destroy_targets(ctx); + return err; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { @@ -2130,28 +2155,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return -EINVAL; for (i = 0; i < sysfs_targets->nr; i++) { - struct damon_sysfs_target *sys_target = - sysfs_targets->targets_arr[i]; - struct damon_target *t = damon_new_target(); - - if (!t) { - damon_sysfs_destroy_targets(ctx); - return -ENOMEM; - } - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) { - t->pid = find_get_pid(sys_target->pid); - if (!t->pid) { - damon_sysfs_destroy_targets(ctx); - return -EINVAL; - } - } - damon_add_target(ctx, t); - err = damon_sysfs_set_regions(t, sys_target->regions); - if (err) { - damon_sysfs_destroy_targets(ctx); + err = damon_sysfs_add_target( + sysfs_targets->targets_arr[i], ctx); + if (err) return err; - } } return 0; } From 7c5ecc690ea5a206b03542893f6291ad848717ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: [PATCH 477/737] mm/damon/sysfs: reuse damon_set_regions() for regions setting 'damon_set_regions()' is general enough so that it can also be used for only creating regions. This commit makes DAMON sysfs interface to reuse the function rather keeping two implementations for a same purpose. 
Link: https://lkml.kernel.org/r/20220429160606.127307-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a9d4f93899034..b85efe2bad783 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2096,28 +2096,31 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions) { - int i; + struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, + sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); + int i, err = -EINVAL; + if (!ranges) + return -ENOMEM; for (i = 0; i < sysfs_regions->nr; i++) { struct damon_sysfs_region *sys_region = sysfs_regions->regions_arr[i]; - struct damon_region *prev, *r; if (sys_region->start > sys_region->end) - return -EINVAL; - r = damon_new_region(sys_region->start, sys_region->end); - if (!r) - return -ENOMEM; - damon_add_region(r, t); - if (damon_nr_regions(t) > 1) { - prev = damon_prev_region(r); - if (prev->ar.end > r->ar.start) { - damon_destroy_region(r, t); - return -EINVAL; - } - } + goto out; + + ranges[i].start = sys_region->start; + ranges[i].end = sys_region->end; + if (i == 0) + continue; + if (ranges[i - 1].end > ranges[i].start) + goto out; } - return 0; + err = damon_set_regions(t, ranges, sysfs_regions->nr); +out: + kfree(ranges); + return err; + } static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, From 25db5c1664b27454d9608048fb99a2c9f5e3eda1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: [PATCH 478/737] mm/damon/sysfs: use enum for 'state' input handling DAMON sysfs 'state' file handling code is using string literals in both 'state_show()' and 'state_store()'. This makes the code error prone and inflexible for future extensions. To improve the situation, this commit defines possible input strings and 'enum' for identifying each input keyword only once, and refactors the code to reuse those. Link: https://lkml.kernel.org/r/20220429160606.127307-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 72 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index b85efe2bad783..446374fb112b7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2053,6 +2053,32 @@ static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) return running; } +/* + * enum damon_sysfs_cmd - Commands for a specific kdamond. + */ +enum damon_sysfs_cmd { + /* @DAMON_SYSFS_CMD_ON: Turn the kdamond on. */ + DAMON_SYSFS_CMD_ON, + /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */ + DAMON_SYSFS_CMD_OFF, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs + * files. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. + */ + NR_DAMON_SYSFS_CMDS, +}; + +/* Should match with enum damon_sysfs_cmd */ +static const char * const damon_sysfs_cmd_strs[] = { + "on", + "off", + "update_schemes_stats", +}; + static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2066,7 +2092,9 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, else running = damon_sysfs_ctx_running(ctx); - return sysfs_emit(buf, "%s\n", running ? 
"on" : "off"); + return sysfs_emit(buf, "%s\n", running ? + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); } static int damon_sysfs_set_attrs(struct damon_ctx *ctx, @@ -2325,23 +2353,47 @@ static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +/* + * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. + * @cmd: The command to handle. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * This function handles a DAMON sysfs command for a kdamond. + * + * Return: 0 on success, negative error code otherwise. + */ +static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, + struct damon_sysfs_kdamond *kdamond) +{ + switch (cmd) { + case DAMON_SYSFS_CMD_ON: + return damon_sysfs_turn_damon_on(kdamond); + case DAMON_SYSFS_CMD_OFF: + return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + return damon_sysfs_update_schemes_stats(kdamond); + default: + break; + } + return -EINVAL; +} + static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - ssize_t ret; + enum damon_sysfs_cmd cmd; + ssize_t ret = -EINVAL; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; - if (sysfs_streq(buf, "on")) - ret = damon_sysfs_turn_damon_on(kdamond); - else if (sysfs_streq(buf, "off")) - ret = damon_sysfs_turn_damon_off(kdamond); - else if (sysfs_streq(buf, "update_schemes_stats")) - ret = damon_sysfs_update_schemes_stats(kdamond); - else - ret = -EINVAL; + for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) { + if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd])) { + ret = damon_sysfs_handle_cmd(cmd, kdamond); + break; + } + } mutex_unlock(&damon_sysfs_lock); if (!ret) ret = count; From f76a1899a2c5b0edc31ee7ba2e4c51f2692c160b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: [PATCH 479/737] mm/damon/sysfs: update schemes stat in the kdamond context Only '->kdamond' and '->kdamond_stop' are protected by 'kdamond_lock' of 'struct damon_ctx'. All other DAMON context internal data items are recommended to be accessed in DAMON callbacks, or under some additional synchronizations. But, DAMON sysfs is accessing the schemes stat under 'kdamond_lock'. It makes no big issue as the read values are not used anywhere inside kernel, but would better to be fixed. This commit moves the reads to DAMON callback context, as supposed to be used for the purpose. Link: https://lkml.kernel.org/r/20220429160606.127307-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 161 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 135 insertions(+), 26 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 446374fb112b7..f181f1d2e0138 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2079,6 +2079,25 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_stats", }; +/* + * struct damon_sysfs_cmd_request - A request to the DAMON callback. + * @cmd: The command that needs to be handled by the callback. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This structure represents a sysfs command request that need to access some + * DAMON context-internal data. 
Because DAMON context-internal data can be + * safely accessed from DAMON callbacks without additional synchronization, the + * request will be handled by the DAMON callback. None-``NULL`` @kdamond means + * the request is valid. + */ +struct damon_sysfs_cmd_request { + enum damon_sysfs_cmd cmd; + struct damon_sysfs_kdamond *kdamond; +}; + +/* Current DAMON callback request. Protected by damon_sysfs_lock. */ +static struct damon_sysfs_cmd_request damon_sysfs_cmd_request; + static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2258,6 +2277,70 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes stats of specific kdamond and update the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damon_sysfs_schemes *sysfs_schemes; + struct damos *scheme; + int schemes_idx = 0; + + if (!ctx) + return -EINVAL; + sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } + return 0; +} + +/* + * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. + * @c: The DAMON context of the callback. + * + * This function is periodically called back from the kdamond thread for @c. + * Then, it checks if there is a waiting DAMON sysfs request and handles it. + */ +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +{ + struct damon_sysfs_kdamond *kdamond; + int err = 0; + + /* avoid deadlock due to concurrent state_store('off') */ + if (!mutex_trylock(&damon_sysfs_lock)) + return 0; + kdamond = damon_sysfs_cmd_request.kdamond; + if (!kdamond || kdamond->damon_ctx != c) + goto out; + switch (damon_sysfs_cmd_request.cmd) { + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + err = damon_sysfs_upd_schemes_stats(kdamond); + break; + default: + break; + } + /* Mark the request as invalid now. 
*/ + damon_sysfs_cmd_request.kdamond = NULL; +out: + mutex_unlock(&damon_sysfs_lock); + return err; +} + static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx) { @@ -2280,6 +2363,8 @@ static struct damon_ctx *damon_sysfs_build_ctx( if (err) goto out; + ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; + ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; @@ -2296,6 +2381,8 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) if (kdamond->damon_ctx && damon_sysfs_ctx_running(kdamond->damon_ctx)) return -EBUSY; + if (damon_sysfs_cmd_request.kdamond == kdamond) + return -EBUSY; /* TODO: support multiple contexts per kdamond */ if (kdamond->contexts->nr != 1) return -EINVAL; @@ -2328,29 +2415,11 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } -static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static inline bool damon_sysfs_kdamond_running( + struct damon_sysfs_kdamond *kdamond) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damos *scheme; - int schemes_idx = 0; - - if (!ctx) - return -EINVAL; - mutex_lock(&ctx->kdamond_lock); - damon_for_each_scheme(scheme, ctx) { - struct damon_sysfs_schemes *sysfs_schemes; - struct damon_sysfs_stats *sysfs_stats; - - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; - sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; - sysfs_stats->nr_tried = scheme->stat.nr_tried; - sysfs_stats->sz_tried = scheme->stat.sz_tried; - sysfs_stats->nr_applied = scheme->stat.nr_applied; - sysfs_stats->sz_applied = scheme->stat.sz_applied; - sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; - } - mutex_unlock(&ctx->kdamond_lock); - return 0; + return kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx); } /* @@ -2358,24 +2427,58 @@ static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) * @cmd: The command to handle. * @kdamond: The kobject wrapper for the associated kdamond. * - * This function handles a DAMON sysfs command for a kdamond. + * This function handles a DAMON sysfs command for a kdamond. For commands + * that need to access running DAMON context-internal data, it requests + * handling of the command to the DAMON callback + * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled, + * or the context is completed. * * Return: 0 on success, negative error code otherwise. 
*/ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { + bool need_wait = true; + + /* Handle commands that doesn't access DAMON context-internal data */ switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: - return damon_sysfs_update_schemes_stats(kdamond); default: break; } - return -EINVAL; + + /* Pass the command to DAMON callback for safe DAMON context access */ + if (damon_sysfs_cmd_request.kdamond) + return -EBUSY; + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + damon_sysfs_cmd_request.cmd = cmd; + damon_sysfs_cmd_request.kdamond = kdamond; + + /* + * wait until damon_sysfs_cmd_request_callback() handles the request + * from kdamond context + */ + mutex_unlock(&damon_sysfs_lock); + while (need_wait) { + schedule_timeout_idle(msecs_to_jiffies(100)); + if (!mutex_trylock(&damon_sysfs_lock)) + continue; + if (!damon_sysfs_cmd_request.kdamond) { + /* damon_sysfs_cmd_request_callback() handled */ + need_wait = false; + } else if (!damon_sysfs_kdamond_running(kdamond)) { + /* kdamond has already finished */ + need_wait = false; + damon_sysfs_cmd_request.kdamond = NULL; + } + mutex_unlock(&damon_sysfs_lock); + } + mutex_lock(&damon_sysfs_lock); + return 0; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -2512,6 +2615,12 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; + for (i = 0; i < kdamonds->nr; i++) { + if (damon_sysfs_cmd_request.kdamond == + kdamonds->kdamonds_arr[i]) + return -EBUSY; + } + damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; From 2415fc0ce78dace1f48a7a63933af30025cb468c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: [PATCH 480/737] mm/damon/sysfs: support online inputs update Currently, DAMON sysfs interface doesn't provide a way for adjusting DAMON input parameters while it is turned on. Therefore, users who want to reconfigure DAMON need to stop DAMON and restart. This means all the monitoring results that accumulated so far, which could be useful, should be flushed. This would be inefficient for many cases. For an example, let's suppose a sysadmin was running a DAMON-based Operation Scheme to find memory regions not accessed for more than 5 mins and page out the regions. If it turns out the 5 mins threshold was too long and therefore the sysadmin wants to reduce it to 4 mins, the sysadmin should turn off DAMON, restart it, and wait for at least 4 more minutes so that DAMON can find the cold memory regions, even though DAMON was knowing there are regions that not accessed for 4 mins at the time of shutdown. This commit makes DAMON sysfs interface to support online DAMON input parameters updates by adding a new input keyword for the 'state' DAMON sysfs file, 'commit'. Writing the keyword to the 'state' file while the corresponding kdamond is running makes the kdamond to read the sysfs file values again and update the DAMON context. 
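As an illustration only (not part of this change), an online tuning session via the sysfs interface could look like the below. The kdamond/context indices and the tuned value are assumptions made up for the example; the file paths follow the DAMON sysfs ABI document:

    # cd /sys/kernel/mm/damon/admin/kdamonds/0
    # echo 10000 > contexts/0/monitoring_attrs/intervals/sample_us
    # echo commit > state    <<< the running kdamond re-reads and applies the inputs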
Link: https://lkml.kernel.org/r/20220429160606.127307-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 99 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 9 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f181f1d2e0138..09f9e8ca3d1fa 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2061,6 +2061,8 @@ enum damon_sysfs_cmd { DAMON_SYSFS_CMD_ON, /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */ DAMON_SYSFS_CMD_OFF, + /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */ + DAMON_SYSFS_CMD_COMMIT, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs * files. @@ -2076,6 +2078,7 @@ enum damon_sysfs_cmd { static const char * const damon_sysfs_cmd_strs[] = { "on", "off", + "commit", "update_schemes_stats", }; @@ -2195,6 +2198,39 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, return err; } +/* + * Search a target in a context that corresponds to the sysfs target input. + * + * Return: pointer to the target if found, NULL if not found, or negative + * error code if the search failed. + */ +static struct damon_target *damon_sysfs_existing_target( + struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) +{ + struct pid *pid; + struct damon_target *t; + + if (ctx->ops.id == DAMON_OPS_PADDR) { + /* Up to only one target for paddr could exist */ + damon_for_each_target(t, ctx) + return t; + return NULL; + } + + /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */ + pid = find_get_pid(sys_target->pid); + if (!pid) + return ERR_PTR(-EINVAL); + damon_for_each_target(t, ctx) { + if (t->pid == pid) { + put_pid(pid); + return t; + } + } + put_pid(pid); + return NULL; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { @@ -2205,8 +2241,15 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return -EINVAL; for (i = 0; i < sysfs_targets->nr; i++) { - err = damon_sysfs_add_target( - sysfs_targets->targets_arr[i], ctx); + struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; + struct damon_target *t = damon_sysfs_existing_target(st, ctx); + + if (IS_ERR(t)) + return PTR_ERR(t); + if (!t) + err = damon_sysfs_add_target(st, ctx); + else + err = damon_sysfs_set_regions(t, st->regions); if (err) return err; } @@ -2309,6 +2352,48 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +static inline bool damon_sysfs_kdamond_running( + struct damon_sysfs_kdamond *kdamond) +{ + return kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx); +} + +/* + * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * If the sysfs input is wrong, the kdamond will be terminated. 
+ */ +static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damon_sysfs_context *sys_ctx; + int err = 0; + + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + /* TODO: Support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + sys_ctx = kdamond->contexts->contexts_arr[0]; + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + return err; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + return err; + err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); + if (err) + return err; + return err; +} + /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. @@ -2331,6 +2416,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: err = damon_sysfs_upd_schemes_stats(kdamond); break; + case DAMON_SYSFS_CMD_COMMIT: + err = damon_sysfs_commit_input(kdamond); + break; default: break; } @@ -2415,13 +2503,6 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } -static inline bool damon_sysfs_kdamond_running( - struct damon_sysfs_kdamond *kdamond) -{ - return kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx); -} - /* * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. * @cmd: The command to handle. From 5210cecb530e59f431a2ffdf87d0c2951e6359f3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: [PATCH 481/737] Docs/{ABI,admin-guide}/damon: Update for 'state' sysfs file input keyword, 'commit' This commit documents the newly added 'state' sysfs file input keyword, 'commit', which allows online tuning of DAMON contexts. Link: https://lkml.kernel.org/r/20220429160606.127307-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 ++++--- Documentation/admin-guide/mm/damon/usage.rst | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index fab97ea225691..08b9df3235609 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -23,9 +23,10 @@ Date: Mar 2022 Contact: SeongJae Park Description: Writing 'on' or 'off' to this file makes the kdamond starts or stops, respectively. Reading the file returns the keywords - based on the current status. Writing 'update_schemes_stats' to - the file updates contents of schemes stats files of the - kdamond. + based on the current status. Writing 'commit' to this file + makes the kdamond reads the user inputs in the sysfs files + except 'state' again. Writing 'update_schemes_stats' to the + file updates contents of schemes stats files of the kdamond. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9c67311a79d8c..1bb7b72414b24 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -121,10 +121,11 @@ In each kdamond directory, two files (``state`` and ``pid``) and one directory Reading ``state`` returns ``on`` if the kdamond is currently running, or ``off`` if it is not running. 
Writing ``on`` or ``off`` makes the kdamond be -in the state. Writing ``update_schemes_stats`` to ``state`` file updates the -contents of stats files for each DAMON-based operation scheme of the kdamond. -For details of the stats, please refer to :ref:`stats section -`. +in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the +user inputs in the sysfs files except ``state`` file again. Writing +``update_schemes_stats`` to ``state`` file updates the contents of stats files +for each DAMON-based operation scheme of the kdamond. For details of the +stats, please refer to :ref:`stats section `. If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. From e1da5dfe322303aa5754d55ca8ed5df7633cf5b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: [PATCH 482/737] mm/damon/reclaim: support online inputs update DAMON_RECLAIM reads the user input parameters only when it starts. To allow more efficient online tuning, this commit implements a new input parameter called 'commit_inputs'. Writing true to the parameter makes DAMON_RECLAIM reads the input parameters again. Link: https://lkml.kernel.org/r/20220429160606.127307-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 95 ++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 75cfd96a6060f..f37c5d4b27faf 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -29,6 +29,18 @@ */ static bool enabled __read_mostly; +/* + * Make DAMON_RECLAIM reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_RECLAIM is running are not applied + * by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values + * of parametrs except ``enabled`` again. Once the re-reading is done, this + * parameter is set as ``N``. If invalid parameters are found while the + * re-reading, DAMON_RECLAIM will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + /* * Time threshold for cold memory regions identification in microseconds. 
* @@ -289,57 +301,56 @@ static struct damos *damon_reclaim_new_scheme(void) return scheme; } -static int damon_reclaim_turn(bool on) +static int damon_reclaim_apply_parameters(void) { - struct damon_region *region; struct damos *scheme; - int err; - - if (!on) { - err = damon_stop(&ctx, 1); - if (!err) - kdamond_pid = -1; - return err; - } + struct damon_addr_range addr_range; + int err = 0; err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, min_nr_regions, max_nr_regions); if (err) return err; + /* Will be freed by next 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) + return -ENOMEM; + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + return err; + if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && !get_monitoring_region(&monitor_region_start, &monitor_region_end)) return -EINVAL; - /* DAMON will free this on its own when finish monitoring */ - region = damon_new_region(monitor_region_start, monitor_region_end); - if (!region) - return -ENOMEM; - damon_add_region(region, target); + addr_range.start = monitor_region_start; + addr_range.end = monitor_region_end; + return damon_set_regions(target, &addr_range, 1); +} - /* Will be freed by 'damon_set_schemes()' below */ - scheme = damon_reclaim_new_scheme(); - if (!scheme) { - err = -ENOMEM; - goto free_region_out; +static int damon_reclaim_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; } - err = damon_set_schemes(ctx, &scheme, 1); + + err = damon_reclaim_apply_parameters(); if (err) - goto free_scheme_out; + return err; err = damon_start(&ctx, 1, true); - if (!err) { - kdamond_pid = ctx->kdamond->pid; - return 0; - } - -free_scheme_out: - damon_destroy_scheme(scheme); -free_region_out: - damon_destroy_region(region, target); - return err; + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; } #define ENABLE_CHECK_INTERVAL_MS 1000 @@ -389,6 +400,7 @@ MODULE_PARM_DESC(enabled, static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; + int err = 0; /* update the stats parameter */ damon_for_each_scheme(s, c) { @@ -398,7 +410,23 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) bytes_reclaimed_regions = s->stat.sz_applied; nr_quota_exceeds = s->stat.qt_exceeds; } - return 0; + + if (commit_inputs) { + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + } + return err; +} + +static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) +{ + int err = 0; + + if (commit_inputs) { + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + } + return err; } static int __init damon_reclaim_init(void) @@ -410,6 +438,7 @@ static int __init damon_reclaim_init(void) if (damon_select_ops(ctx, DAMON_OPS_PADDR)) return -EINVAL; + ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; target = damon_new_target(); From d1c7c17a765fe44aa7d36df243ad0e5ebdb8916a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:56 -0700 Subject: [PATCH 483/737] Docs/admin-guide/mm/damon/reclaim: document 'commit_inputs' parameter This commit documents the new DAMON_RECLAIM parameter, 'commit_inputs' in its usage document. 
Link: https://lkml.kernel.org/r/20220429160606.127307-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 0af51a9705b10..46306f1f34b1a 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -66,6 +66,17 @@ Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do no real monitoring and reclamation due to the watermarks-based activation condition. Refer to below descriptions for the watermarks parameter for this. +commit_inputs +------------- + +Make DAMON_RECLAIM reads the input parameters again, except ``enabled``. + +Input parameters that updated while DAMON_RECLAIM is running are not applied +by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values +of parametrs except ``enabled`` again. Once the re-reading is done, this +parameter is set as ``N``. If invalid parameters are found while the +re-reading, DAMON_RECLAIM will be disabled. + min_age ------- From 0cd6ae5e09d70392dbd95a07eb933f0f71ee6c58 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 12 May 2022 20:23:07 -0700 Subject: [PATCH 484/737] mm/damon/reclaim: use resource_size function on resource object Fix the following coccicheck warnings: ./mm/damon/reclaim.c:241:30-33: WARNING: Suspicious code. resource_size is maybe missing with res. Link: https://lkml.kernel.org/r/20220507032512.129598-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jiapeng Chong Reported-by: Abaci Robot Reviewed-by: SeongJae Park Cc: "Boehme, Markus" Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index f37c5d4b27faf..8efbfb24f3a1e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -238,7 +238,7 @@ static int walk_system_ram(struct resource *res, void *arg) { struct damon_reclaim_ram_walk_arg *a = arg; - if (a->end - a->start < res->end - res->start) { + if (a->end - a->start < resource_size(res)) { a->start = res->start; a->end = res->end; } From d09f032bb499f42b22f2b507665b9135046207f3 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Fri, 13 May 2022 16:48:57 -0700 Subject: [PATCH 485/737] mm/damon: add documentation for Enum value Fix the warning - "Enum value 'NR_DAMON_OPS' not described in enum 'damon_ops_id'" generated by the command "make pdfdocs" Link: https://lkml.kernel.org/r/20220508073316.141401-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index d1e6ee28a2fff..7c62da31ce4b5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -264,6 +264,7 @@ struct damos { * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual * address spaces * @DAMON_OPS_PADDR: Monitoring operations for the physical address space + * @NR_DAMON_OPS: Number of monitoring operations implementations */ enum damon_ops_id { DAMON_OPS_VADDR, From 0742b50a84589c04cbe1ec030bc9ce5aa5be8d22 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 17 May 2022 22:51:20 +0800 Subject: [PATCH 486/737] mm: damon: use HPAGE_PMD_SIZE Use HPAGE_PMD_SIZE instead of open coding. 
Link: https://lkml.kernel.org/r/20220517145120.118523-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 3 +-- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index e346cc10d1439..10ef20b2003f5 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -73,8 +73,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) } #ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, - addr + ((1UL) << HPAGE_PMD_SHIFT))) + if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE)) referenced = true; #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index ffcd6047fea85..208fb369f22eb 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -112,7 +112,7 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, result->accessed = pmd_young(*pvmw.pmd) || !page_is_idle(page) || mmu_notifier_test_young(vma->vm_mm, addr); - result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + result->page_sz = HPAGE_PMD_SIZE; #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 77f326323e747..59e1653799f83 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -442,7 +442,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, if (pmd_young(*pmd) || !page_is_idle(page) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + *priv->page_sz = HPAGE_PMD_SIZE; priv->young = true; } put_page(page); From 0761883db94c005dadd2ed952a1683147e3ed2ba Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 4 Jun 2022 19:50:51 +0000 Subject: [PATCH 487/737] mm/damon/reclaim: schedule 'damon_reclaim_timer' only after 'system_wq' is initialized Commit 059342d1dd4e ("mm/damon/reclaim: fix the timer always stays active") made DAMON_RECLAIM's 'enabled' parameter store callback, 'enabled_store()', to schedule 'damon_reclaim_timer'. The scheduling uses 'system_wq', which is initialized in 'workqueue_init_early()'. As kernel parameters parsing function ('parse_args()') is called before 'workqueue_init_early()', 'enabled_store()' can be executed before 'workqueue_init_early()' and end up accessing the uninitialized 'system_wq'. As a result, the booting hang[1]. This commit fixes the issue by checking if the initialization is done before scheduling the timer. 
[1] https://lkml.kernel.org/20220604192222.1488-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20220604195051.1589-1-sj@kernel.org Fixes: 059342d1dd4e ("mm/damon/reclaim: fix the timer always stays active") Signed-off-by: SeongJae Park Reported-by: Greg White Cc: Hailong Tu Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 8efbfb24f3a1e..4b07c29effe97 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -374,6 +374,8 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static bool damon_reclaim_initialized; + static int enabled_store(const char *val, const struct kernel_param *kp) { @@ -382,6 +384,10 @@ static int enabled_store(const char *val, if (rc < 0) return rc; + /* system_wq might not initialized yet */ + if (!damon_reclaim_initialized) + return rc; + if (enabled) schedule_delayed_work(&damon_reclaim_timer, 0); @@ -449,6 +455,8 @@ static int __init damon_reclaim_init(void) damon_add_target(ctx, target); schedule_delayed_work(&damon_reclaim_timer, 0); + + damon_reclaim_initialized = true; return 0; } From 2d48967ecdef3c1b4abca520933fca98691d7be0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 Jun 2022 10:34:42 +0800 Subject: [PATCH 488/737] mm/damon: use set_huge_pte_at() to make huge pte old The huge_ptep_set_access_flags() can not make the huge pte old according to the discussion [1], that means we will always mornitor the young state of the hugetlb though we stopped accessing the hugetlb, as a result DAMON will get inaccurate accessing statistics. So changing to use set_huge_pte_at() to make the huge pte old to fix this issue. [1] https://lore.kernel.org/all/Yqy97gXI4Nqb7dYo@arm.com/ Link: https://lkml.kernel.org/r/1655692482-28797-1-git-send-email-baolin.wang@linux.alibaba.com Fixes: 49f4203aae06 ("mm/damon: add access checking for hugetlb pages") Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 59e1653799f83..3c7b9d6dca95d 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -336,8 +336,7 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, if (pte_young(entry)) { referenced = true; entry = pte_mkold(entry); - huge_ptep_set_access_flags(vma, addr, pte, entry, - vma->vm_flags & VM_WRITE); + set_huge_pte_at(mm, addr, pte, entry); } #ifdef CONFIG_MMU_NOTIFIER From b378c690e7d5883b9f8f83fce862fd2d31fb0065 Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 14 Jul 2022 14:37:46 +0800 Subject: [PATCH 489/737] mm/damon/reclaim: fix potential memory leak in damon_reclaim_init() damon_reclaim_init() allocates a memory chunk for ctx with damon_new_ctx(). When damon_select_ops() fails, ctx is not released, which will lead to a memory leak. We should release the ctx with damon_destroy_ctx() when damon_select_ops() fails to fix the memory leak. 
Link: https://lkml.kernel.org/r/20220714063746.2343549-1-niejianglei2021@163.com Fixes: 4d69c3457821 ("mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations()") Signed-off-by: Jianglei Nie Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 4b07c29effe97..0b3c7396cb90a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -441,8 +441,10 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); return -EINVAL; + } ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; From 0c76a520a159def9ee396806a713e9ec24adc59a Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 21 Aug 2022 18:08:53 +0000 Subject: [PATCH 490/737] mm/damon/dbgfs: avoid duplicate context directory creation When user tries to create a DAMON context via the DAMON debugfs interface with a name of an already existing context, the context directory creation fails but a new context is created and added in the internal data structure, due to absence of the directory creation success check. As a result, memory could leak and DAMON cannot be turned on. An example test case is as below: # cd /sys/kernel/debug/damon/ # echo "off" > monitor_on # echo paddr > target_ids # echo "abc" > mk_context # echo "abc" > mk_context # echo $$ > abc/target_ids # echo "on" > monitor_on <<< fails Return value of 'debugfs_create_dir()' is expected to be ignored in general, but this is an exceptional case as DAMON feature is depending on the debugfs functionality and it has the potential duplicate name issue. This commit therefore fixes the issue by checking the directory creation failure and immediately return the error in the case. Link: https://lkml.kernel.org/r/20220821180853.2400-1-sj@kernel.org Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: Badari Pulavarty Signed-off-by: SeongJae Park Cc: [ 5.15.x] Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index a0dab8b5e45f2..53ba8b1e619ca 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -787,6 +787,9 @@ static int dbgfs_mk_context(char *name) return -ENOENT; new_dir = debugfs_create_dir(name, root); + /* Below check is required for a potential duplicated name case */ + if (IS_ERR(new_dir)) + return PTR_ERR(new_dir); dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; new_ctx = dbgfs_new_ctx(); From 72ab7cd22d6b0a786d18d0b9f5104affb8f5c07c Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 30 Aug 2022 14:03:27 -0700 Subject: [PATCH 491/737] Revert "x86/speculation: Add RSB VM Exit protections" This reverts commit 509c2c9fe75ea7493eebbb6bb2f711f37530ae19. Commit "1a072f13b2dc Mitigate unbalanced RETs on vmexit via serialising wrmsr" addresses this with less performance impact. 
[ Hailmo: Resolved conflicts when rebasing onto 5.10.190 and adding SRSO and GDS support ] Signed-off-by: Suraj Jitindar Singh --- Documentation/admin-guide/hw-vuln/spectre.rst | 8 -- arch/x86/include/asm/cpufeatures.h | 2 - arch/x86/include/asm/msr-index.h | 4 - arch/x86/include/asm/nospec-branch.h | 7 +- arch/x86/kernel/cpu/bugs.c | 86 +++++-------------- arch/x86/kernel/cpu/common.c | 12 +-- arch/x86/kvm/vmx/vmenter.S | 8 +- arch/x86/kvm/x86.c | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 1 - tools/arch/x86/include/asm/msr-index.h | 4 - 10 files changed, 32 insertions(+), 102 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 0fba3758d0da8..52b5ef785420e 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -422,14 +422,6 @@ The possible values in this file are: 'RSB filling' Protection of RSB on context switch enabled ============= =========================================== - - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status: - - =========================== ======================================================= - 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled - 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable - 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB - =========================== ======================================================= - Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5a54c3685a066..5c9488cd662c6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -298,7 +298,6 @@ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ #define X86_FEATURE_MSR_TSX_CTRL (11*32+18) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ @@ -446,7 +445,6 @@ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_GDS X86_BUG(29) /* CPU is affected by Gather Data Sampling */ /* BUG word 2 */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 202a52e42a368..cdeaa099ad0e7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -153,10 +153,6 @@ * are restricted to targets in * kernel. */ -#define ARCH_CAP_PBRSB_NO BIT(24) /* - * Not susceptible to Post-Barrier - * Return Stack Buffer Predictions. 
- */ #define ARCH_CAP_GDS_CTRL BIT(25) /* * CPU is vulnerable to Gather * Data Sampling (GDS) and diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 99fbce2c1c7c1..ec556ae20545c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -147,10 +147,9 @@ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS) - ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \ - __stringify(__FILL_ONE_RETURN), \ftr2 +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr .Lskip_rsb_\@: .endm diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d31639e3ce282..035b4ba4e5feb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1422,53 +1422,6 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) } } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) -{ - /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: - * - * 1) RSB underflow - * - * 2) Poisoned RSB entry - * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. - * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. - * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. - */ - switch (mode) { - case SPECTRE_V2_NONE: - return; - - case SPECTRE_V2_EIBRS_LFENCE: - case SPECTRE_V2_EIBRS: - if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); - pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); - } - return; - - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_RETPOLINE: - case SPECTRE_V2_LFENCE: - case SPECTRE_V2_IBRS: - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); - return; - } - - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1619,7 +1572,28 @@ static void __init spectre_v2_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + /* + * Similar to context switches, there are two types of RSB attacks + * after vmexit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS, on the other hand, has RSB-poisoning protections, so it + * doesn't need RSB clearing after vmexit. 
+ */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) || + boot_cpu_has(X86_FEATURE_KERNEL_IBRS)) + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -2536,19 +2510,6 @@ static char *ibpb_state(void) return ""; } -static char *pbrsb_eibrs_state(void) -{ - if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || - boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; - else - return ", PBRSB-eIBRS: Vulnerable"; - } else { - return ", PBRSB-eIBRS: Not affected"; - } -} - static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2561,13 +2522,12 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sprintf(buf, "%s%s%s%s%s%s%s\n", + return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - pbrsb_eibrs_state(), spectre_v2_module_string()); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1f2360309120..5cf7db6dc2f01 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1036,7 +1036,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) -#define NO_EIBRS_PBRSB BIT(10) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1082,7 +1081,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1092,9 +1091,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), /* AMD Family 0xf - 0x12 */ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), @@ -1287,11 +1284,6 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } - if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && - !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) - setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); - /* * Check if CPU is vulnerable to GDS. If running in a virtual machine on * an affected processor, the VMM may have disabled the use of GATHER by diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 982138bebb70f..857fa0fc49faf 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -197,13 +197,11 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) * entries and (in some cases) RSB underflow. 
* * eIBRS has its own protection against poisoned RSB, so it doesn't - * need the RSB filling sequence. But it does need to be enabled, and a - * single call to retire, before the first unbalanced RET. + * need the RSB filling sequence. But it does need to be enabled + * before the first unbalanced RET. */ - FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ - X86_FEATURE_RSB_VMEXIT_LITE - + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT pop %_ASM_ARG2 /* @flags */ pop %_ASM_ARG1 /* @vmx */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dc03f6ebae306..2f15ae110f42a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1391,7 +1391,7 @@ static unsigned int num_msr_based_features; ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ - ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO) + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_GDS_NO) static u64 kvm_get_arch_capabilities(void) { diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 2ae4d74ee73b4..2866fa3501800 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -295,7 +295,6 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 8fb9256768134..f9a4a35093769 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -148,10 +148,6 @@ * are restricted to targets in * kernel. */ -#define ARCH_CAP_PBRSB_NO BIT(24) /* - * Not susceptible to Post-Barrier - * Return Stack Buffer Predictions. - */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* From 1ab2cfd750bd3016dd246e7f5596f54281032617 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 13 Sep 2022 21:28:28 -0700 Subject: [PATCH 492/737] DOWNSTREAM ONLY: Revert "Makefile: link with -z noexecstack --no-warn-rwx-segments" This reverts commit 8f4f2c9b98e40c699329680f9ca4f7c3a7f949ea. This causes arm64 debug builds to fail with: *** ERROR: No build ID note found in /builddir/build/BUILDROOT/kernel-5.15.63-32.131.amzn2.aarch64/usr/lib/debug/lib/modules/5.15.63-32.131.amzn2.aarch64/vmlinux This is due to the notes section which contains the build id being missing from the linux elf. Revert this commit until this can be remedied. 
Signed-off-by: Suraj Jitindar Singh --- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index bd2f457703634..112c3473c71b3 100644 --- a/Makefile +++ b/Makefile @@ -1001,9 +1001,6 @@ KBUILD_CFLAGS += $(KCFLAGS) KBUILD_LDFLAGS_MODULE += --build-id=sha1 LDFLAGS_vmlinux += --build-id=sha1 -KBUILD_LDFLAGS += -z noexecstack -KBUILD_LDFLAGS += $(call ld-option,--no-warn-rwx-segments) - ifeq ($(CONFIG_STRIP_ASM_SYMS),y) LDFLAGS_vmlinux += $(call ld-option, -X,) endif From 43155b9f49999cecbc8e7c92af92acebc76259bb Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 13 Sep 2022 21:29:02 -0700 Subject: [PATCH 493/737] ENA: Update to v2.8.0 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.8.0 release notes **Notes** * The driver is now dependent on the ptp module for loading See README for more details. **New Features** * Add support for PTP HW clock * Add support for SRD metrics Feature's enablement and documentation would be in future release **Bug Fixes** * Fix potential sign extension issue * Reduce memory footprint of some structs * Fix updating rx_copybreak issue * Fix xdp drops handling due to multibuf packets * Handle ena_calc_io_queue_size() possible errors * Destroy correct amount of xdp queues upon failure **Minor Changes** * Remove wide LLQ comment on supported versions * Backport uapi/bpf.h inclusion * Add a counter for driver's reset failures * Take xdp packets stats into account in ena_get_stats64() * Make queue stats code cleaner by removing if block * Remove redundant empty line * Remove confusing comment * Remove flag reading code duplication * Replace ENA local ENA_NAPI_BUDGET to global NAPI_POLL_WEIGHT * Change default print level for netif_ prints * Relocate skb_tx_timestamp() to improve time stamping accuracy * Backport bpf_warn_invalid_xdp_action() change * Fix incorrect indentation using spaces * Driver now compiles with Linux kernel 5.19 Signed-off-by: Suraj Jitindar Singh --- drivers/amazon/net/ena/Makefile | 8 +- drivers/amazon/net/ena/ena_admin_defs.h | 99 ++++++++++ drivers/amazon/net/ena/ena_com.c | 244 ++++++++++++++++++++++- drivers/amazon/net/ena/ena_com.h | 94 ++++++++- drivers/amazon/net/ena/ena_eth_com.h | 2 +- drivers/amazon/net/ena/ena_ethtool.c | 217 +++++++++++++++------ drivers/amazon/net/ena/ena_netdev.c | 130 +++++++++---- drivers/amazon/net/ena/ena_netdev.h | 20 +- drivers/amazon/net/ena/ena_phc.c | 246 ++++++++++++++++++++++++ drivers/amazon/net/ena/ena_phc.h | 43 +++++ drivers/amazon/net/ena/ena_regs_defs.h | 8 + drivers/amazon/net/ena/ena_sysfs.c | 26 ++- drivers/amazon/net/ena/ena_xdp.c | 10 +- drivers/amazon/net/ena/ena_xdp.h | 12 +- drivers/amazon/net/ena/kcompat.h | 63 ++++++ 15 files changed, 1095 insertions(+), 127 deletions(-) create mode 100644 drivers/amazon/net/ena/ena_phc.c create mode 100644 drivers/amazon/net/ena/ena_phc.h diff --git a/drivers/amazon/net/ena/Makefile b/drivers/amazon/net/ena/Makefile index 37106200d6c19..b61366782d8d6 100644 --- a/drivers/amazon/net/ena/Makefile +++ b/drivers/amazon/net/ena/Makefile @@ -6,11 +6,15 @@ obj-$(CONFIG_AMAZON_ENA_ETHERNET) += ena.o -ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_xdp.o dim.o ena_devlink.o \ - net_dim.o ena_com.o ena_eth_com.o +ena-y := ena_netdev.o ena_ethtool.o ena_lpc.o ena_phc.o ena_xdp.o dim.o \ + ena_devlink.o net_dim.o ena_com.o ena_eth_com.o ena-$(CONFIG_SYSFS) += ena_sysfs.o ifdef TEST_AF_XDP ccflags-y += -DENA_TEST_AF_XDP endif + +ifdef ENA_PHC_INCLUDE + ccflags-y += -DENA_PHC_INCLUDE +endif diff --git 
a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index 7f2595f2545cb..a52f588445039 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -50,12 +50,15 @@ enum ena_admin_aq_feature_id { ENA_ADMIN_AENQ_CONFIG = 26, ENA_ADMIN_LINK_CONFIG = 27, ENA_ADMIN_HOST_ATTR_CONFIG = 28, + ENA_ADMIN_PHC_CONFIG = 29, ENA_ADMIN_FEATURES_OPCODE_NUM = 32, }; /* device capabilities */ enum ena_admin_aq_caps_id { ENA_ADMIN_ENI_STATS = 0, + /* ENA SRD customer metrics */ + ENA_ADMIN_ENA_SRD_INFO = 1, }; enum ena_admin_placement_policy_type { @@ -104,6 +107,8 @@ enum ena_admin_get_stats_type { ENA_ADMIN_GET_STATS_TYPE_EXTENDED = 1, /* extra HW stats for specific network interface */ ENA_ADMIN_GET_STATS_TYPE_ENI = 2, + /* extra HW stats for ENA SRD */ + ENA_ADMIN_GET_STATS_TYPE_ENA_SRD = 3, }; enum ena_admin_get_stats_scope { @@ -111,6 +116,20 @@ enum ena_admin_get_stats_scope { ENA_ADMIN_ETH_TRAFFIC = 1, }; +enum ena_admin_get_phc_type { + ENA_ADMIN_PHC_TYPE_READLESS = 0, +}; + +/* ENA SRD configuration for ENI */ +enum ena_admin_ena_srd_flags { + /* Feature enabled */ + ENA_ADMIN_ENA_SRD_ENABLED = BIT(0), + /* UDP support enabled */ + ENA_ADMIN_ENA_SRD_UDP_ENABLED = BIT(1), + /* Bypass Rx UDP ordering */ + ENA_ADMIN_ENA_SRD_UDP_ORDERING_BYPASS_ENABLED = BIT(2), +}; + struct ena_admin_aq_common_desc { /* 11:0 : command_id * 15:12 : reserved12 @@ -424,6 +443,32 @@ struct ena_admin_eni_stats { u64 linklocal_allowance_exceeded; }; +struct ena_admin_ena_srd_stats { + /* Number of packets transmitted over ENA SRD */ + u64 ena_srd_tx_pkts; + + /* Number of packets transmitted or could have been + * transmitted over ENA SRD + */ + u64 ena_srd_eligible_tx_pkts; + + /* Number of packets received over ENA SRD */ + u64 ena_srd_rx_pkts; + + /* Percentage of the ENA SRD resources that is in use */ + u64 ena_srd_resource_utilization; +}; + +/* ENA SRD Statistics Command */ +struct ena_admin_ena_srd_info { + /* ENA SRD configuration bitmap. See ena_admin_ena_srd_flags for + * details + */ + u64 flags; + + struct ena_admin_ena_srd_stats ena_srd_stats; +}; + struct ena_admin_acq_get_stats_resp { struct ena_admin_acq_common_desc acq_common_desc; @@ -433,6 +478,8 @@ struct ena_admin_acq_get_stats_resp { struct ena_admin_basic_stats basic_stats; struct ena_admin_eni_stats eni_stats; + + struct ena_admin_ena_srd_info ena_srd_info; } u; }; @@ -970,6 +1017,43 @@ struct ena_admin_queue_ext_feature_desc { }; }; +struct ena_admin_feature_phc_desc { + /* PHC type as defined in enum ena_admin_get_phc_type, + * used only for GET command. + */ + u8 type; + + /* Reserved - MBZ */ + u8 reserved1[3]; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR, + * used only for GET command. + */ + u32 doorbell_offset; + + /* Max time for valid PHC retrieval, passing this threshold will + * fail the get-time request and block PHC requests for + * block_timeout_usec, used only for GET command. + */ + u32 expire_timeout_usec; + + /* PHC requests block period, blocking starts if PHC request expired + * in order to prevent floods on busy device, + * used only for GET command. + */ + u32 block_timeout_usec; + + /* Shared PHC physical address (ena_admin_phc_resp), + * used only for SET command. + */ + struct ena_common_mem_addr output_address; + + /* Shared PHC Size (ena_admin_phc_resp), + * used only for SET command. 
+ */ + u32 output_length; +}; + struct ena_admin_get_feat_resp { struct ena_admin_acq_common_desc acq_common_desc; @@ -1000,6 +1084,8 @@ struct ena_admin_get_feat_resp { struct ena_admin_ena_hw_hints hw_hints; + struct ena_admin_feature_phc_desc phc; + struct ena_admin_get_extra_properties_strings_desc extra_properties_strings; struct ena_admin_get_extra_properties_flags_desc extra_properties_flags; @@ -1036,6 +1122,9 @@ struct ena_admin_set_feat_cmd { /* LLQ configuration */ struct ena_admin_feature_llq_desc llq; + + /* PHC configuration */ + struct ena_admin_feature_phc_desc phc; } u; }; @@ -1114,6 +1203,16 @@ struct ena_admin_ena_mmio_req_read_less_resp { u32 reg_val; }; +struct ena_admin_phc_resp { + u16 req_id; + + u8 reserved1[6]; + + u64 timestamp; + + u8 reserved2[48]; +}; + /* aq_common_desc */ #define ENA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) #define ENA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 07d5d5eb5676b..520dad1e549af 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -39,6 +39,12 @@ #define ENA_MAX_ADMIN_POLL_US 5000 +/* PHC definitions */ +#define ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC 20 +#define ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC 1000 +#define ENA_PHC_TIMESTAMP_ERROR 0xFFFFFFFFFFFFFFFF +#define ENA_PHC_REQ_ID_OFFSET 0xDEAD + /*****************************************************************************/ /*****************************************************************************/ /*****************************************************************************/ @@ -361,7 +367,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; io_sq->bounce_buf_ctrl.next_to_use = 0; - size = io_sq->bounce_buf_ctrl.buffer_size * + size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * io_sq->bounce_buf_ctrl.buffers_num; dev_node = dev_to_node(ena_dev->dmadev); @@ -1724,6 +1730,220 @@ void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, ena_dev->admin_queue.auto_polling = polling; } +bool ena_com_phc_supported(struct ena_com_dev *ena_dev) +{ + return ena_com_check_supported_feature_id(ena_dev, ENA_ADMIN_PHC_CONFIG); +} + +int ena_com_phc_init(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + + memset(phc, 0x0, sizeof(*phc)); + + /* Allocate shared mem used PHC timestamp retrieved from device */ + phc->virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + &phc->phys_addr, GFP_KERNEL); + if (unlikely(!phc->virt_addr)) + return -ENOMEM; + + spin_lock_init(&phc->lock); + + phc->virt_addr->req_id = 0; + phc->virt_addr->timestamp = 0; + + return 0; +} + +int ena_com_phc_config(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + struct ena_admin_get_feat_resp get_feat_resp; + struct ena_admin_set_feat_resp set_feat_resp; + struct ena_admin_set_feat_cmd set_feat_cmd; + int ret = 0; + + /* Get device PHC default configuration */ + ret = ena_com_get_feature(ena_dev, &get_feat_resp, ENA_ADMIN_PHC_CONFIG, 0); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to get PHC feature configuration, error: %d\n", + ret); + return ret; + } + + /* Suporting only readless PHC retrieval */ + if (get_feat_resp.u.phc.type != ENA_ADMIN_PHC_TYPE_READLESS) { + netdev_err(ena_dev->net_device, + "Unsupprted PHC type, error: %d\n", -EOPNOTSUPP); + return -EOPNOTSUPP; + } + + /* Update PHC doorbell offset according to device value, 
used to write req_id to PHC bar */ + phc->doorbell_offset = get_feat_resp.u.phc.doorbell_offset; + + /* Update PHC expire timeout according to device or default driver value */ + phc->expire_timeout_usec = (get_feat_resp.u.phc.expire_timeout_usec) ? + get_feat_resp.u.phc.expire_timeout_usec : + ENA_PHC_DEFAULT_EXPIRE_TIMEOUT_USEC; + + /* Update PHC block timeout according to device or default driver value */ + phc->block_timeout_usec = (get_feat_resp.u.phc.block_timeout_usec) ? + get_feat_resp.u.phc.block_timeout_usec : + ENA_PHC_DEFAULT_BLOCK_TIMEOUT_USEC; + + /* Sanity check - expire timeout must not be above skip timeout */ + if (phc->expire_timeout_usec > phc->block_timeout_usec) + phc->expire_timeout_usec = phc->block_timeout_usec; + + /* Prepare PHC feature command with PHC output address */ + memset(&set_feat_cmd, 0x0, sizeof(set_feat_cmd)); + set_feat_cmd.aq_common_descriptor.opcode = ENA_ADMIN_SET_FEATURE; + set_feat_cmd.feat_common.feature_id = ENA_ADMIN_PHC_CONFIG; + set_feat_cmd.u.phc.output_length = sizeof(*phc->virt_addr); + ret = ena_com_mem_addr_set(ena_dev, &set_feat_cmd.u.phc.output_address, phc->phys_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed setting PHC output address, error: %d\n", + ret); + return ret; + } + + /* Send PHC feature command to the device */ + ret = ena_com_execute_admin_command(&ena_dev->admin_queue, + (struct ena_admin_aq_entry *)&set_feat_cmd, + sizeof(set_feat_cmd), + (struct ena_admin_acq_entry *)&set_feat_resp, + sizeof(set_feat_resp)); + + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, + "Failed to enable PHC, error: %d\n", ret); + return ret; + } + + phc->enabled = true; + netdev_dbg(ena_dev->net_device, "PHC is enabled\n"); + + return ret; +} + +void ena_com_phc_destroy(struct ena_com_dev *ena_dev) +{ + struct ena_com_phc_info *phc = &ena_dev->phc; + + phc->enabled = false; + + /* In case PHC is not supported by the device, silently exiting */ + if (!phc->virt_addr) + return; + + dma_free_coherent(ena_dev->dmadev, sizeof(*phc->virt_addr), + phc->virt_addr, phc->phys_addr); + phc->virt_addr = NULL; +} + +int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) +{ + volatile struct ena_admin_phc_resp *read_resp = ena_dev->phc.virt_addr; + struct ena_com_phc_info *phc = &ena_dev->phc; + ktime_t initial_time = ktime_set(0, 0); + static ktime_t start_time; + unsigned long flags = 0; + ktime_t expire_time; + ktime_t block_time; + int ret = 0; + + if (!phc->enabled) { + netdev_err(ena_dev->net_device, "PHC feature is not enabled\n"); + return -EOPNOTSUPP; + } + + spin_lock_irqsave(&phc->lock, flags); + + /* Check if PHC is in blocked state */ + if (unlikely(ktime_compare(start_time, initial_time))) { + /* Check if blocking time expired */ + block_time = ktime_add_us(start_time, phc->block_timeout_usec); + if (!ktime_after(ktime_get(), block_time)) { + /* PHC is still in blocked state, skip PHC request */ + phc->stats.phc_skp++; + ret = -EBUSY; + goto skip; + } + + /* PHC is in active state, update statistics according to req_id and timestamp */ + if ((READ_ONCE(read_resp->req_id) != phc->req_id) || + (read_resp->timestamp == ENA_PHC_TIMESTAMP_ERROR)) { + /* Device didn't update req_id during blocking time or timestamp is invalid, + * this indicates on a device error + */ + phc->stats.phc_err++; + } else { + /* Device updated req_id during blocking time with valid timestamp */ + phc->stats.phc_exp++; + } + } + + /* Setting relative timeouts */ + start_time = ktime_get(); + block_time = 
ktime_add_us(start_time, phc->block_timeout_usec); + expire_time = ktime_add_us(start_time, phc->expire_timeout_usec); + + /* We expect the device to return this req_id once the new PHC timestamp is updated */ + phc->req_id++; + + /* Initialize PHC shared memory with different req_id value to be able to identify once the + * device changes it to req_id + */ + read_resp->req_id = phc->req_id + ENA_PHC_REQ_ID_OFFSET; + + /* Writing req_id to PHC bar */ + writel(phc->req_id, ena_dev->reg_bar + phc->doorbell_offset); + + /* Stalling until the device updates req_id */ + while (1) { + if (unlikely(ktime_after(ktime_get(), expire_time))) { + /* Gave up waiting for updated req_id, PHC enters into + * blocked state until passing blocking time + */ + ret = -EBUSY; + break; + } + + /* Check if req_id was updated by the device */ + if (READ_ONCE(read_resp->req_id) != phc->req_id) { + /* req_id was not updated by the device, check again on next loop */ + continue; + } + + /* req_id was updated which indicates that PHC timestamp was updated too */ + *timestamp = read_resp->timestamp; + + /* PHC timestamp validty check */ + if (unlikely(*timestamp == ENA_PHC_TIMESTAMP_ERROR)) { + /* Retrieved invalid PHC timestamp, PHC enters into + * blocked state until passing blocking time + */ + ret = -EBUSY; + break; + } + + /* Retrieved valid PHC timestamp */ + phc->stats.phc_cnt++; + + /* This indicates PHC state is active */ + start_time = initial_time; + break; + } + +skip: + spin_unlock_irqrestore(&phc->lock, flags); + + return ret; +} + int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev) { struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read; @@ -2243,6 +2463,28 @@ int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, return ret; } +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info) +{ + struct ena_com_stats_ctx ctx; + int ret; + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + netdev_err(ena_dev->net_device, + "Capability %d isn't supported\n", + ENA_ADMIN_ENA_SRD_INFO); + return -EOPNOTSUPP; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_ENA_SRD); + if (likely(ret == 0)) + memcpy(info, &ctx.get_resp.u.ena_srd_info, + sizeof(ctx.get_resp.u.ena_srd_info)); + + return ret; +} + int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, struct ena_admin_basic_stats *stats) { diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 555cb822bbb1c..ab17ba125ca3c 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -211,6 +211,13 @@ struct ena_com_stats_admin { u64 no_completion; }; +struct ena_com_stats_phc { + u64 phc_cnt; + u64 phc_exp; + u64 phc_skp; + u64 phc_err; +}; + struct ena_com_admin_queue { void *q_dmadev; void *bus; @@ -263,6 +270,45 @@ struct ena_com_mmio_read { spinlock_t lock; }; +/* PTP hardware clock (PHC) MMIO read data info */ +struct ena_com_phc_info { + /* Internal PHC statistics */ + struct ena_com_stats_phc stats; + + /* PHC shared memory - virtual address */ + struct ena_admin_phc_resp *virt_addr; + + /* Spin lock to ensure a single outstanding PHC read */ + spinlock_t lock; + + /* PHC doorbell address as an offset to PCIe MMIO REG BAR */ + u32 doorbell_offset; + + /* Shared memory read expire timeout (usec) + * Max time for valid PHC retrieval, passing this threshold will fail the get time request + * and block new PHC requests for block_timeout_usec in order to prevent floods on busy + 
* device + */ + u32 expire_timeout_usec; + + /* Shared memory read abort timeout (usec) + * PHC requests block period, blocking starts once PHC request expired in order to prevent + * floods on busy device, any PHC requests during block period will be skipped + */ + u32 block_timeout_usec; + + /* Request id sent to the device */ + u16 req_id; + + /* True if PHC is enabled */ + bool enabled; + + /* PHC shared memory - memory handle */ + + /* PHC shared memory - physical address */ + dma_addr_t phys_addr; +}; + struct ena_rss { /* Indirect table */ u16 *host_rss_ind_tbl; @@ -310,7 +356,10 @@ struct ena_com_dev { u16 stats_func; /* Selected function for extended statistic dump */ u16 stats_queue; /* Selected queue for extended statistic dump */ + u32 ena_min_poll_delay_us; + struct ena_com_mmio_read mmio_read; + struct ena_com_phc_info phc; struct ena_rss rss; u32 supported_features; @@ -330,8 +379,6 @@ struct ena_com_dev { struct ena_intr_moder_entry *intr_moder_tbl; struct ena_com_llq_info llq_info; - - u32 ena_min_poll_delay_us; }; struct ena_com_dev_get_features_ctx { @@ -376,6 +423,40 @@ struct ena_aenq_handlers { */ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); +/* ena_com_phc_init - Allocate and initialize PHC feature + * @ena_dev: ENA communication layer struct + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_init(struct ena_com_dev *ena_dev); + +/* ena_com_phc_supported - Return if PHC feature is supported by the device + * @ena_dev: ENA communication layer struct + * @note: This method must be called after getting supported features + * @return - supported or not + */ +bool ena_com_phc_supported(struct ena_com_dev *ena_dev); + +/* ena_com_phc_config - Configure PHC feature + * @ena_dev: ENA communication layer struct + * Configure PHC feature in driver and device + * @note: This method assumes PHC is supported by the device + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_config(struct ena_com_dev *ena_dev); + +/* ena_com_phc_destroy - Destroy PHC feature + * @ena_dev: ENA communication layer struct + */ +void ena_com_phc_destroy(struct ena_com_dev *ena_dev); + +/* ena_com_phc_get - Retrieve PHC timestamp + * @ena_dev: ENA communication layer struct + * @timestamp: Retrieve PHC timestamp + * @return - 0 on success, negative value on failure + */ +int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp); + /* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism * @ena_dev: ENA communication layer struct * @readless_supported: readless mode (enable/disable) @@ -612,6 +693,15 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, struct ena_admin_eni_stats *stats); +/* ena_com_get_ena_srd_info - Get ENA SRD network interface statistics + * @ena_dev: ENA communication layer struct + * @info: ena srd stats and flags + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, + struct ena_admin_ena_srd_info *info); + /* ena_com_set_dev_mtu - Configure the device mtu. 
* @ena_dev: ENA communication layer struct * @mtu: mtu value diff --git a/drivers/amazon/net/ena/ena_eth_com.h b/drivers/amazon/net/ena/ena_eth_com.h index 91207c657b73a..028270a069d86 100644 --- a/drivers/amazon/net/ena/ena_eth_com.h +++ b/drivers/amazon/net/ena/ena_eth_com.h @@ -42,7 +42,7 @@ struct ena_com_rx_ctx { bool frag; u32 hash; u16 descs; - int max_bufs; + u16 max_bufs; u8 pkt_offset; }; diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index b3bf0836a2c5b..08f7ee8fc151c 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -5,20 +5,27 @@ #include #include +#include #include "ena_netdev.h" #include "ena_xdp.h" +#include "ena_phc.h" struct ena_stats { char name[ETH_GSTRING_LEN]; int stat_offset; }; -#define ENA_STAT_ENA_COM_ENTRY(stat) { \ +#define ENA_STAT_ENA_COM_ADMIN_ENTRY(stat) { \ .name = #stat, \ .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \ } +#define ENA_STAT_ENA_COM_PHC_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_com_stats_phc, stat) / sizeof(u64) \ +} + #define ENA_STAT_ENTRY(stat, stat_type) { \ .name = #stat, \ .stat_offset = offsetof(struct ena_stats_##stat_type, stat) / sizeof(u64) \ @@ -41,6 +48,14 @@ struct ena_stats { #define ENA_STAT_ENI_ENTRY(stat) \ ENA_STAT_HW_ENTRY(stat, eni_stats) +#define ENA_STAT_ENA_SRD_ENTRY(stat) \ + ENA_STAT_HW_ENTRY(stat, ena_srd_stats) + +#define ENA_STAT_ENA_SRD_MODE_ENTRY(stat) { \ + .name = #stat, \ + .stat_offset = offsetof(struct ena_admin_ena_srd_info, flags) / sizeof(u64) \ +} + static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(tx_timeout), ENA_STAT_GLOBAL_ENTRY(suspend), @@ -49,6 +64,7 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(interface_up), ENA_STAT_GLOBAL_ENTRY(interface_down), ENA_STAT_GLOBAL_ENTRY(admin_q_pause), + ENA_STAT_GLOBAL_ENTRY(reset_fail), }; static const struct ena_stats ena_stats_eni_strings[] = { @@ -59,6 +75,14 @@ static const struct ena_stats ena_stats_eni_strings[] = { ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded), }; +static const struct ena_stats ena_srd_info_strings[] = { + ENA_STAT_ENA_SRD_MODE_ENTRY(ena_srd_mode), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_tx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_eligible_tx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_rx_pkts), + ENA_STAT_ENA_SRD_ENTRY(ena_srd_resource_utilization) +}; + static const struct ena_stats ena_stats_tx_strings[] = { ENA_STAT_TX_ENTRY(cnt), ENA_STAT_TX_ENTRY(bytes), @@ -117,19 +141,28 @@ static const struct ena_stats ena_stats_rx_strings[] = { #endif /* ENA_AF_XDP_SUPPORT */ }; -static const struct ena_stats ena_stats_ena_com_strings[] = { - ENA_STAT_ENA_COM_ENTRY(aborted_cmd), - ENA_STAT_ENA_COM_ENTRY(submitted_cmd), - ENA_STAT_ENA_COM_ENTRY(completed_cmd), - ENA_STAT_ENA_COM_ENTRY(out_of_space), - ENA_STAT_ENA_COM_ENTRY(no_completion), +static const struct ena_stats ena_stats_ena_com_admin_strings[] = { + ENA_STAT_ENA_COM_ADMIN_ENTRY(aborted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(submitted_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(completed_cmd), + ENA_STAT_ENA_COM_ADMIN_ENTRY(out_of_space), + ENA_STAT_ENA_COM_ADMIN_ENTRY(no_completion), +}; + +static const struct ena_stats ena_stats_ena_com_phc_strings[] = { + ENA_STAT_ENA_COM_PHC_ENTRY(phc_cnt), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_exp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_skp), + ENA_STAT_ENA_COM_PHC_ENTRY(phc_err), }; #define ENA_STATS_ARRAY_GLOBAL ARRAY_SIZE(ena_stats_global_strings) #define 
ENA_STATS_ARRAY_TX ARRAY_SIZE(ena_stats_tx_strings) #define ENA_STATS_ARRAY_RX ARRAY_SIZE(ena_stats_rx_strings) -#define ENA_STATS_ARRAY_ENA_COM ARRAY_SIZE(ena_stats_ena_com_strings) -#define ENA_STATS_ARRAY_ENI(adapter) ARRAY_SIZE(ena_stats_eni_strings) +#define ENA_STATS_ARRAY_ENA_COM_ADMIN ARRAY_SIZE(ena_stats_ena_com_admin_strings) +#define ENA_STATS_ARRAY_ENA_COM_PHC ARRAY_SIZE(ena_stats_ena_com_phc_strings) +#define ENA_STATS_ARRAY_ENI ARRAY_SIZE(ena_stats_eni_strings) +#define ENA_STATS_ARRAY_ENA_SRD ARRAY_SIZE(ena_srd_info_strings) static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { #define ENA_PRIV_FLAGS_LPC BIT(0) @@ -185,14 +218,14 @@ static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) } } -static void ena_dev_admin_queue_stats(struct ena_adapter *adapter, u64 **data) +static void ena_com_admin_queue_stats(struct ena_adapter *adapter, u64 **data) { const struct ena_stats *ena_stats; u64 *ptr; int i; - for (i = 0; i < ENA_STATS_ARRAY_ENA_COM; i++) { - ena_stats = &ena_stats_ena_com_strings[i]; + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; ptr = (u64 *)&adapter->ena_dev->admin_queue.stats + ena_stats->stat_offset; @@ -201,9 +234,22 @@ static void ena_dev_admin_queue_stats(struct ena_adapter *adapter, u64 **data) } } +static void ena_com_phc_stats(struct ena_adapter *adapter, u64 **data) +{ + const struct ena_stats *ena_stats; + u64 *ptr; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ptr = (u64 *)&adapter->ena_dev->phc.stats + ena_stats->stat_offset; + *(*data)++ = *ptr; + } +} + static void ena_get_stats(struct ena_adapter *adapter, u64 *data, - bool eni_stats_needed) + bool hw_stats_needed) { const struct ena_stats *ena_stats; u64 *ptr; @@ -217,20 +263,46 @@ static void ena_get_stats(struct ena_adapter *adapter, ena_safe_update_stat(ptr, data++, &adapter->syncp); } - if (eni_stats_needed) { - ena_update_hw_stats(adapter); - for (i = 0; i < ENA_STATS_ARRAY_ENI(adapter); i++) { - ena_stats = &ena_stats_eni_strings[i]; + if (hw_stats_needed) { + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); + /* Updating regardless of rc - once we told ethtool how many stats we have + * it will print that much stats. 
We can't leave holes in the stats + */ + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; - ptr = (u64 *)&adapter->eni_stats + - ena_stats->stat_offset; + ptr = (u64 *)&adapter->eni_stats + + ena_stats->stat_offset; + ena_safe_update_stat(ptr, data++, &adapter->syncp); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); + /* Get ENA SRD mode */ + ptr = (u64 *)&adapter->ena_srd_info; ena_safe_update_stat(ptr, data++, &adapter->syncp); + for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + /* Wrapped within an outer struct - need to accommodate an + * additional offset of the ENA SRD mode that was already processed + */ + ptr = (u64 *)&adapter->ena_srd_info + + ena_stats->stat_offset + 1; + + ena_safe_update_stat(ptr, data++, &adapter->syncp); + } } } ena_queue_stats(adapter, &data); - ena_dev_admin_queue_stats(adapter, &data); + ena_com_admin_queue_stats(adapter, &data); + + if (ena_phc_enabled(adapter)) { + ena_com_phc_stats(adapter, &data); + } } static void ena_get_ethtool_stats(struct net_device *netdev, @@ -238,23 +310,41 @@ static void ena_get_ethtool_stats(struct net_device *netdev, u64 *data) { struct ena_adapter *adapter = netdev_priv(netdev); - struct ena_com_dev *dev = adapter->ena_dev; - ena_get_stats(adapter, data, ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)); + ena_get_stats(adapter, data, true); } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) +static int ena_get_ts_info(struct net_device *netdev, struct ethtool_ts_info *info) +{ + struct ena_adapter *adapter = netdev_priv(netdev); + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + info->phc_index = ena_phc_get_index(adapter); + + return 0; +} + +#endif static int ena_get_sw_stats_count(struct ena_adapter *adapter) { - return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) - + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX - + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; + int count = adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX + + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM_ADMIN; + + if (ena_phc_enabled(adapter)) + count += ENA_STATS_ARRAY_ENA_COM_PHC; + + return count; } static int ena_get_hw_stats_count(struct ena_adapter *adapter) { - bool supported = ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS); - - return ENA_STATS_ARRAY_ENI(adapter) * supported; + return ENA_STATS_ARRAY_ENI * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS) + + ENA_STATS_ARRAY_ENA_SRD * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); } int ena_get_sset_count(struct net_device *netdev, int sset) @@ -290,37 +380,47 @@ static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) ena_stats->name); } - if (!is_xdp) { - /* RX stats, in XDP there isn't a RX queue - * counterpart - */ - for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { - ena_stats = &ena_stats_rx_strings[j]; + /* In XDP there isn't an RX queue counterpart */ + if (is_xdp) + continue; - ethtool_sprintf(data, - "queue_%u_rx_%s", i, - ena_stats->name); - } + for (j = 0; j < ENA_STATS_ARRAY_RX; j++) { + ena_stats = &ena_stats_rx_strings[j]; + + ethtool_sprintf(data, + "queue_%u_rx_%s", i, + ena_stats->name); } } } -static void ena_com_dev_strings(u8 **data) +static void ena_com_admin_strings(u8 **data) { const struct ena_stats 
*ena_stats; int i; - for (i = 0; i < ENA_STATS_ARRAY_ENA_COM; i++) { - ena_stats = &ena_stats_ena_com_strings[i]; + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_ADMIN; i++) { + ena_stats = &ena_stats_ena_com_admin_strings[i]; ethtool_sprintf(data, "ena_admin_q_%s", ena_stats->name); } } +static void ena_com_phc_strings(u8 **data) +{ + const struct ena_stats *ena_stats; + int i; + + for (i = 0; i < ENA_STATS_ARRAY_ENA_COM_PHC; i++) { + ena_stats = &ena_stats_ena_com_phc_strings[i]; + ethtool_sprintf(data, "%s", ena_stats->name); + } +} + static void ena_get_strings(struct ena_adapter *adapter, u8 *data, - bool eni_stats_needed) + bool hw_stats_needed) { const struct ena_stats *ena_stats; int i; @@ -330,15 +430,27 @@ static void ena_get_strings(struct ena_adapter *adapter, ethtool_sprintf(&data, ena_stats->name); } - if (eni_stats_needed) { - for (i = 0; i < ENA_STATS_ARRAY_ENI(adapter); i++) { - ena_stats = &ena_stats_eni_strings[i]; - ethtool_sprintf(&data, ena_stats->name); + if (hw_stats_needed) { + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + ethtool_sprintf(&data, ena_stats->name); + } + } + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + ethtool_sprintf(&data, ena_stats->name); + } } } ena_queue_strings(adapter, &data); - ena_com_dev_strings(&data); + ena_com_admin_strings(&data); + + if (ena_phc_enabled(adapter)) { + ena_com_phc_strings(&data); + } } static void ena_get_ethtool_strings(struct net_device *netdev, @@ -346,11 +458,10 @@ static void ena_get_ethtool_strings(struct net_device *netdev, u8 *data) { struct ena_adapter *adapter = netdev_priv(netdev); - struct ena_com_dev *dev = adapter->ena_dev; switch (sset) { case ETH_SS_STATS: - ena_get_strings(adapter, data, ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)); + ena_get_strings(adapter, data, true); break; case ETH_SS_PRIV_FLAGS: memcpy(data, ena_priv_flags_strings, sizeof(ena_priv_flags_strings)); @@ -1060,11 +1171,7 @@ static int ena_set_tunable(struct net_device *netdev, switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: len = *(u32 *)data; - if (len > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) { - ret = -EINVAL; - break; - } - adapter->rx_copybreak = len; + ret = ena_set_rx_copybreak(adapter, len); break; default: ret = -EINVAL; @@ -1141,7 +1248,7 @@ static const struct ethtool_ops ena_ethtool_ops = { .set_tunable = ena_set_tunable, #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) - .get_ts_info = ethtool_op_get_ts_info, + .get_ts_info = ena_get_ts_info, #endif .get_priv_flags = ena_get_priv_flags, .set_priv_flags = ena_set_priv_flags, diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 55c3c141c81a0..fbb96d864d8c3 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -29,6 +29,7 @@ #include "ena_lpc.h" +#include "ena_phc.h" #include "ena_devlink.h" static char version[] = DEVICE_NAME " v" DRV_MODULE_GENERATION "\n"; @@ -43,10 +44,8 @@ MODULE_VERSION(DRV_MODULE_GENERATION); #define ENA_MAX_RINGS min_t(unsigned int, ENA_MAX_NUM_IO_QUEUES, num_possible_cpus()) -#define ENA_NAPI_BUDGET 64 - #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ - NETIF_MSG_TX_DONE | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) + NETIF_MSG_IFDOWN | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) #ifndef ENA_LINEAR_FRAG_SUPPORTED #define 
ENA_SKB_PULL_MIN_LEN 64 @@ -84,7 +83,7 @@ MODULE_DEVICE_TABLE(pci, ena_pci_tbl); static int ena_rss_init_default(struct ena_adapter *adapter); static void check_for_admin_com_state(struct ena_adapter *adapter); -static void ena_calc_io_queue_size(struct ena_adapter *adapter, +static int ena_calc_io_queue_size(struct ena_adapter *adapter, struct ena_com_dev_get_features_ctx *get_feat_ctx); static void ena_set_dev_offloads(struct ena_com_dev_get_features_ctx *feat, struct net_device *netdev); @@ -1060,7 +1059,7 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, else skb = netdev_alloc_skb_ip_align(rx_ring->netdev, ENA_SKB_PULL_MIN_LEN); -#endif +#endif /* ENA_LINEAR_FRAG_SUPPORTED */ if (unlikely(!skb)) { ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1, @@ -1088,7 +1087,7 @@ static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, u16 buf_len ena_buf->paddr += buf_len; ena_buf->len -= buf_len; return true; - } + } return false; } @@ -1331,20 +1330,24 @@ void ena_set_rx_hash(struct ena_ring *rx_ring, } #ifdef ENA_XDP_SUPPORT -static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) +static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp, u16 num_descs) { struct ena_rx_buffer *rx_info; int ret; + /* XDP multi-buffer packets not supported */ + if (unlikely(num_descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped multi-buffer packets. RX packets must be < %lu\n", + ENA_XDP_MAX_MTU); + ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); + return ENA_XDP_DROP; + } + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; xdp_prepare_buff(xdp, page_address(rx_info->page), rx_info->buf_offset, rx_ring->ena_bufs[0].len, false); - /* If for some reason we received a bigger packet than - * we expect, then we simply drop it - */ - if (unlikely(rx_ring->ena_bufs[0].len > ENA_XDP_MAX_MTU)) - return ENA_XDP_DROP; ret = ena_xdp_execute(rx_ring, xdp); @@ -1424,7 +1427,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, #ifdef ENA_XDP_SUPPORT if (ena_xdp_present_ring(rx_ring)) - xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp); + xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp, ena_rx_ctx.descs); /* allocate skb and fill it */ if (xdp_verdict == ENA_XDP_PASS) @@ -2036,7 +2039,7 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, netif_napi_add(adapter->netdev, &napi->napi, napi_handler, - ENA_NAPI_BUDGET); + NAPI_POLL_WEIGHT); #ifdef ENA_BUSY_POLL_SUPPORT napi_hash_add(&adapter->ena_napi[i].napi); @@ -2497,7 +2500,7 @@ void ena_down(struct ena_adapter *adapter) { int io_queue_count = adapter->num_io_queues + adapter->xdp_num_queues; - netif_info(adapter, ifdown, adapter->netdev, "%s\n", __func__); + netif_dbg(adapter, ifdown, adapter->netdev, "%s\n", __func__); clear_bit(ENA_FLAG_DEV_UP, &adapter->flags); @@ -2510,8 +2513,6 @@ void ena_down(struct ena_adapter *adapter) /* After this point the napi handler won't enable the tx queue */ ena_napi_disable_in_range(adapter, 0, io_queue_count); - /* After destroy the queue there won't be any new interrupts */ - if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) { int rc; @@ -2653,6 +2654,24 @@ int ena_update_queue_sizes(struct ena_adapter *adapter, return dev_was_up ? 
ena_up(adapter) : 0; } +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak) +{ + struct ena_ring *rx_ring; + int i; + + if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) + return -EINVAL; + + adapter->rx_copybreak = rx_copybreak; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + rx_ring->rx_copybreak = rx_copybreak; + } + + return 0; +} + int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count) { struct ena_com_dev *ena_dev = adapter->ena_dev; @@ -2902,8 +2921,6 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(rc)) goto error_drop_packet; - skb_tx_timestamp(skb); - next_to_use = tx_ring->next_to_use; req_id = tx_ring->free_ids[next_to_use]; tx_info = &tx_ring->tx_buffer_info[req_id]; @@ -2968,6 +2985,8 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) } } + skb_tx_timestamp(skb); + #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) #ifdef HAVE_NETDEV_XMIT_MORE if (netif_xmit_stopped(txq) || !netdev_xmit_more()) @@ -3146,19 +3165,6 @@ static void ena_config_debug_area(struct ena_adapter *adapter) ena_com_delete_debug_area(adapter->ena_dev); } -int ena_update_hw_stats(struct ena_adapter *adapter) -{ - int rc; - - rc = ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); - if (rc) { - netdev_err(adapter->netdev, "Failed to get ENI stats\n"); - return rc; - } - - return 0; -} - #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)) #ifdef NDO_GET_STATS_64_V2 static void ena_get_stats64(struct net_device *netdev, @@ -3170,6 +3176,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, { struct ena_adapter *adapter = netdev_priv(netdev); struct ena_ring *rx_ring, *tx_ring; + u64 xdp_rx_drops = 0; unsigned int start; u64 rx_drops; u64 tx_drops; @@ -3182,7 +3189,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, return NULL; #endif - for (i = 0; i < adapter->num_io_queues; i++) { + for (i = 0; i < adapter->num_io_queues + adapter->xdp_num_queues; i++) { u64 bytes, packets; tx_ring = &adapter->tx_ring[i]; @@ -3196,12 +3203,17 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, stats->tx_packets += packets; stats->tx_bytes += bytes; + /* In XDP there isn't an RX queue counterpart */ + if (ENA_IS_XDP_INDEX(adapter, i)) + continue; + rx_ring = &adapter->rx_ring[i]; do { start = u64_stats_fetch_begin_irq(&rx_ring->syncp); packets = rx_ring->rx_stats.cnt; bytes = rx_ring->rx_stats.bytes; + xdp_rx_drops += ena_ring_xdp_drops_cnt(rx_ring); } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); stats->rx_packets += packets; @@ -3214,7 +3226,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, tx_drops = adapter->dev_stats.tx_drops; } while (u64_stats_fetch_retry_irq(&adapter->syncp, start)); - stats->rx_dropped = rx_drops; + stats->rx_dropped = rx_drops + xdp_rx_drops; stats->tx_dropped = tx_drops; stats->multicast = 0; @@ -3570,15 +3582,26 @@ static int ena_device_init(struct ena_adapter *adapter, struct pci_dev *pdev, goto err_admin_init; } - ena_calc_io_queue_size(adapter, get_feat_ctx); + rc = ena_calc_io_queue_size(adapter, get_feat_ctx); + if (unlikely(rc)) + goto err_admin_init; /* Turned on features shouldn't change due to reset. 
*/ prev_netdev_features = adapter->netdev->features; ena_set_dev_offloads(get_feat_ctx, adapter->netdev); adapter->netdev->features = prev_netdev_features; + + rc = ena_phc_init(adapter); + if (unlikely(rc && (rc != -EOPNOTSUPP))) { + netdev_err(netdev, "Failed initiating PHC, error: %d\n", rc); + goto err_admin_init; + } + return 0; err_admin_init: + ena_com_abort_admin_commands(ena_dev); + ena_com_wait_for_abort_completion(ena_dev); ena_com_delete_host_info(ena_dev); ena_com_admin_destroy(ena_dev); err_mmio_read_less: @@ -3619,14 +3642,15 @@ static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter) return rc; } -void ena_destroy_device(struct ena_adapter *adapter, bool graceful) +int ena_destroy_device(struct ena_adapter *adapter, bool graceful) { struct net_device *netdev = adapter->netdev; struct ena_com_dev *ena_dev = adapter->ena_dev; bool dev_up; + int rc = 0; if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) - return; + return 0; netif_carrier_off(netdev); @@ -3638,14 +3662,14 @@ void ena_destroy_device(struct ena_adapter *adapter, bool graceful) if (!graceful) ena_com_set_admin_running_state(ena_dev, false); - if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + if (dev_up) ena_down(adapter); /* Stop the device from sending AENQ events (in case reset flag is set * and device is up, ena_down() already reset the device. */ if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) - ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); + rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason); ena_free_mgmnt_irq(adapter); @@ -3657,6 +3681,8 @@ void ena_destroy_device(struct ena_adapter *adapter, bool graceful) ena_com_admin_destroy(ena_dev); + ena_phc_destroy(adapter); + ena_com_mmio_reg_read_request_destroy(ena_dev); /* return reset reason to default value */ @@ -3664,6 +3690,8 @@ void ena_destroy_device(struct ena_adapter *adapter, bool graceful) clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags); clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); + + return rc; } int ena_restore_device(struct ena_adapter *adapter) @@ -3727,6 +3755,7 @@ int ena_restore_device(struct ena_adapter *adapter) ena_com_wait_for_abort_completion(ena_dev); ena_com_admin_destroy(ena_dev); ena_com_dev_reset(ena_dev, ENA_REGS_RESET_DRIVER_INVALID_STATE); + ena_phc_destroy(adapter); ena_com_mmio_reg_read_request_destroy(ena_dev); err: clear_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags); @@ -3739,14 +3768,17 @@ int ena_restore_device(struct ena_adapter *adapter) static void ena_fw_reset_device(struct work_struct *work) { + int rc = 0; + struct ena_adapter *adapter = container_of(work, struct ena_adapter, reset_task); rtnl_lock(); if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { - ena_destroy_device(adapter, false); - ena_restore_device(adapter); + rc |= ena_destroy_device(adapter, false); + rc |= ena_restore_device(adapter); + adapter->dev_stats.reset_fail += !!rc; dev_err(&adapter->pdev->dev, "Device reset completed successfully, Driver info: %s\n", @@ -4267,7 +4299,7 @@ static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev) } -static void ena_calc_io_queue_size(struct ena_adapter *adapter, +static int ena_calc_io_queue_size(struct ena_adapter *adapter, struct ena_com_dev_get_features_ctx *get_feat_ctx) { struct ena_admin_feature_llq_desc *llq = &get_feat_ctx->llq; @@ -4327,6 +4359,18 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, max_tx_queue_size = rounddown_pow_of_two(max_tx_queue_size); 
max_rx_queue_size = rounddown_pow_of_two(max_rx_queue_size); + if (max_tx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max TX queue size: %d < minimum: %d\n", + max_tx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + + if (max_rx_queue_size < ENA_MIN_RING_SIZE) { + netdev_err(adapter->netdev, "Device max RX queue size: %d < minimum: %d\n", + max_rx_queue_size, ENA_MIN_RING_SIZE); + return -EFAULT; + } + /* When forcing large headers, we multiply the entry size by 2, * and therefore divide the queue size by 2, leaving the amount * of memory used by the queues unchanged. @@ -4357,6 +4401,8 @@ static void ena_calc_io_queue_size(struct ena_adapter *adapter, adapter->max_rx_ring_size = max_rx_queue_size; adapter->requested_tx_ring_size = tx_queue_size; adapter->requested_rx_ring_size = rx_queue_size; + + return 0; } /* ena_probe - Device Initialization Routine diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index e19cd75be698b..7b373cf6545e9 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -19,14 +19,17 @@ #include #include #include +#ifdef HAS_BPF_HEADER +#include +#endif #include #include "ena_com.h" #include "ena_eth_com.h" #define DRV_MODULE_GEN_MAJOR 2 -#define DRV_MODULE_GEN_MINOR 7 -#define DRV_MODULE_GEN_SUBMINOR 4 +#define DRV_MODULE_GEN_MINOR 8 +#define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -122,6 +125,8 @@ struct ena_page_cache; +struct ena_phc_info; + struct ena_irq { irq_handler_t handler; void *data; @@ -349,6 +354,7 @@ struct ena_stats_dev { u64 admin_q_pause; u64 rx_drops; u64 tx_drops; + u64 reset_fail; }; enum ena_flags_t { @@ -439,6 +445,7 @@ struct ena_adapter { struct u64_stats_sync syncp; struct ena_stats_dev dev_stats; struct ena_admin_eni_stats eni_stats; + struct ena_admin_ena_srd_info ena_srd_info; /* last queue index that was checked for uncompleted tx packets */ u32 last_monitored_tx_qid; @@ -450,6 +457,8 @@ struct ena_adapter { #endif u32 xdp_first_ring; u32 xdp_num_queues; + + struct ena_phc_info *phc_info; }; void ena_set_ethtool_ops(struct net_device *netdev); @@ -458,7 +467,6 @@ void ena_dump_stats_to_dmesg(struct ena_adapter *adapter); void ena_dump_stats_to_buf(struct ena_adapter *adapter, u8 *buf); -int ena_update_hw_stats(struct ena_adapter *adapter); int ena_set_lpc_state(struct ena_adapter *adapter, bool enabled); @@ -468,6 +476,8 @@ int ena_update_queue_sizes(struct ena_adapter *adapter, int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); + int ena_get_sset_count(struct net_device *netdev, int sset); #ifdef ENA_BUSY_POLL_SUPPORT static inline void ena_bp_init_lock(struct ena_ring *rx_ring) @@ -556,7 +566,7 @@ static inline void ena_reset_device(struct ena_adapter *adapter, */ struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma); -void ena_destroy_device(struct ena_adapter *adapter, bool graceful); +int ena_destroy_device(struct ena_adapter *adapter, bool graceful); int ena_restore_device(struct ena_adapter *adapter); int handle_invalid_req_id(struct ena_ring *ring, u16 req_id, struct ena_tx_buffer *tx_info, bool is_xdp); @@ -590,6 +600,8 @@ int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, int first_index, int count); int ena_setup_tx_resources_in_range(struct ena_adapter *adapter, int first_index, int count); +void 
ena_free_all_io_tx_resources_in_range(struct ena_adapter *adapter, + int first_index, int count); void ena_free_all_io_tx_resources(struct ena_adapter *adapter); void ena_down(struct ena_adapter *adapter); int ena_up(struct ena_adapter *adapter); diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c new file mode 100644 index 0000000000000..46e21d3202a1b --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "ena_phc.h" + +#ifdef ENA_PHC_SUPPORT + +static int ena_phc_adjfreq(struct ptp_clock_info *clock_info, s32 ppb) +{ + return -EOPNOTSUPP; +} + +static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ena_phc_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED +static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + ptp_read_system_prets(sts); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + ptp_read_system_postts(sts); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_gettime64(struct ptp_clock_info *clock_info, struct timespec64 *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + *ts = ns_to_timespec64(timestamp_nsec); + + return rc; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +static int ena_phc_settime64(struct ptp_clock_info *clock_info, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +#else /* ENA_PHC_SUPPORT_GETTIME64 */ +static int ena_phc_gettime(struct ptp_clock_info *clock_info, struct timespec *ts) +{ + struct ena_phc_info *phc_info = container_of(clock_info, struct ena_phc_info, clock_info); + unsigned long flags; + u64 timestamp_nsec; + u32 remainder; + int rc; + + spin_lock_irqsave(&phc_info->lock, flags); + + rc = ena_com_phc_get(phc_info->adapter->ena_dev, ×tamp_nsec); + + spin_unlock_irqrestore(&phc_info->lock, flags); + + ts->tv_sec = div_u64_rem(timestamp_nsec, NSEC_PER_SEC, &remainder); + ts->tv_nsec = remainder; + + return rc; +} + +static int ena_phc_settime(struct ptp_clock_info *clock_info, const struct timespec *ts) +{ + return -EOPNOTSUPP; +} + +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +static struct ptp_clock_info ena_ptp_clock_info = { + .owner = THIS_MODULE, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .pps = 0, + .adjfreq = ena_phc_adjfreq, + .adjtime = ena_phc_adjtime, +#ifdef ENA_PHC_SUPPORT_GETTIME64 +#ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED + .gettimex64 = ena_phc_gettimex64, +#else /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .gettime64 = ena_phc_gettime64, +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + .settime64 = ena_phc_settime64, +#else /* 
ENA_PHC_SUPPORT_GETTIME64 */ + .gettime = ena_phc_gettime, + .settime = ena_phc_settime, +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + .enable = ena_phc_enable, +}; + +static int ena_phc_register(struct ena_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct ptp_clock_info *clock_info; + struct ena_phc_info *phc_info; + int rc = 0; + + phc_info = adapter->phc_info; + clock_info = &phc_info->clock_info; + + phc_info->adapter = adapter; + + spin_lock_init(&phc_info->lock); + + /* Fill the ptp_clock_info struct and register PTP clock */ + *clock_info = ena_ptp_clock_info; + snprintf(clock_info->name, + sizeof(clock_info->name), + "ena-ptp-%02x", + PCI_SLOT(pdev->devfn)); + + phc_info->clock = ptp_clock_register(clock_info, &pdev->dev); + if (IS_ERR(phc_info->clock)) { + rc = PTR_ERR(phc_info->clock); + netdev_err(adapter->netdev, "Failed registering ptp clock, error: %d\n", rc); + phc_info->clock = NULL; + } + + return rc; +} + +bool ena_phc_enabled(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->clock); +} + +static void ena_phc_unregister(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (ena_phc_enabled(adapter)) + ptp_clock_unregister(phc_info->clock); +} + +int ena_phc_init(struct ena_adapter *adapter) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + struct net_device *netdev = adapter->netdev; + int rc = -EOPNOTSUPP; + + /* Validate phc feature is supported in the device */ + if (!ena_com_phc_supported(ena_dev)) { + netdev_dbg(netdev, "PHC feature is not supported\n"); + goto err_ena_com_phc_init; + } + + /* Allocate and initialize device specific PHC info */ + rc = ena_com_phc_init(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to init phc, error: %d\n", rc); + goto err_ena_com_phc_init; + } + + /* Configure PHC feature in driver and device */ + rc = ena_com_phc_config(ena_dev); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to config phc, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + /* Allocate and initialize driver specific PHC info */ + adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); + if (unlikely(!adapter->phc_info)) { + rc = -ENOMEM; + netdev_err(netdev, "Failed to alloc phc_info, error: %d\n", rc); + goto err_ena_com_phc_config; + } + + /* Register to PTP class driver */ + rc = ena_phc_register(adapter); + if (unlikely(rc)) { + netdev_err(netdev, "Failed to register phc, error: %d\n", rc); + goto err_ena_phc_register; + } + + return 0; + +err_ena_phc_register: + vfree(adapter->phc_info); + adapter->phc_info = NULL; +err_ena_com_phc_config: + ena_com_phc_destroy(ena_dev); +err_ena_com_phc_init: + return rc; +} + +void ena_phc_destroy(struct ena_adapter *adapter) +{ + ena_phc_unregister(adapter); + + if (likely(adapter->phc_info)) { + vfree(adapter->phc_info); + adapter->phc_info = NULL; + } + + ena_com_phc_destroy(adapter->ena_dev); +} + +int ena_phc_get_index(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (ena_phc_enabled(adapter)) + return ptp_clock_index(phc_info->clock); + + return -1; +} + +#endif /* ENA_PHC_SUPPORT */ diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h new file mode 100644 index 0000000000000..f08ff473bd1e4 --- /dev/null +++ b/drivers/amazon/net/ena/ena_phc.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef ENA_PHC_H +#define ENA_PHC_H + +#include "ena_netdev.h" + +#ifdef ENA_PHC_SUPPORT + +#include + +struct ena_phc_info { + /* PTP hardware capabilities */ + struct ptp_clock_info clock_info; + + /* Registered PTP clock device */ + struct ptp_clock *clock; + + /* Adapter specific private data structure */ + struct ena_adapter *adapter; + + /* PHC lock */ + spinlock_t lock; +}; + +bool ena_phc_enabled(struct ena_adapter *adapter); +int ena_phc_get_index(struct ena_adapter *adapter); +int ena_phc_init(struct ena_adapter *adapter); +void ena_phc_destroy(struct ena_adapter *adapter); + +#else /* ENA_PHC_SUPPORT */ + +static inline bool ena_phc_enabled(struct ena_adapter *adapter) {return false; } +static inline int ena_phc_get_index(struct ena_adapter *adapter) { return -1; } +static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_destroy(struct ena_adapter *adapter) { } + +#endif /* ENA_PHC_SUPPORT */ + +#endif /* ENA_PHC_H */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index 8ca6f795a0fd0..ded18aa5162bc 100755 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -53,6 +53,11 @@ enum ena_regs_reset_reason_types { #define ENA_REGS_MMIO_RESP_HI_OFF 0x64 #define ENA_REGS_RSS_IND_ENTRY_UPDATE_OFF 0x68 +/* phc_registers offsets */ + +/* 100 base */ +#define ENA_REGS_PHC_DB_OFF 0x100 + /* version register */ #define ENA_REGS_VERSION_MINOR_VERSION_MASK 0xff #define ENA_REGS_VERSION_MAJOR_VERSION_SHIFT 8 @@ -129,4 +134,7 @@ enum ena_regs_reset_reason_types { #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 +/* phc_db_req_id register */ +#define ENA_REGS_PHC_DB_REQ_ID_MASK 0xffff + #endif /* _ENA_REGS_H_ */ diff --git a/drivers/amazon/net/ena/ena_sysfs.c b/drivers/amazon/net/ena/ena_sysfs.c index 0c3451b60a2fe..98e1f7ecd0f09 100755 --- a/drivers/amazon/net/ena/ena_sysfs.c +++ b/drivers/amazon/net/ena/ena_sysfs.c @@ -19,26 +19,23 @@ static ssize_t ena_store_rx_copybreak(struct device *dev, { struct ena_adapter *adapter = dev_get_drvdata(dev); unsigned long rx_copybreak; - struct ena_ring *rx_ring; - int err, i; + int rc; - err = kstrtoul(buf, 10, &rx_copybreak); - if (err < 0) - return err; - - if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) - return -EINVAL; + rc = kstrtoul(buf, 10, &rx_copybreak); + if (rc < 0) + goto exit; rtnl_lock(); - adapter->rx_copybreak = rx_copybreak; - - for (i = 0; i < adapter->num_io_queues; i++) { - rx_ring = &adapter->rx_ring[i]; - rx_ring->rx_copybreak = rx_copybreak; - } + rc = ena_set_rx_copybreak(adapter, rx_copybreak); + if (rc) + goto unlock; rtnl_unlock(); return len; +unlock: + rtnl_unlock(); +exit: + return rc; } #define ENA_RX_COPYBREAK_STR_MAX_LEN 7 @@ -55,7 +52,6 @@ static ssize_t ena_show_rx_copybreak(struct device *dev, static DEVICE_ATTR(rx_copybreak, S_IRUGO | S_IWUSR, ena_show_rx_copybreak, ena_store_rx_copybreak); - /****************************************************************************** *****************************************************************************/ int ena_sysfs_init(struct device *dev) diff --git a/drivers/amazon/net/ena/ena_xdp.c b/drivers/amazon/net/ena/ena_xdp.c index 9296be230fd38..4d8c1709598de 100644 --- a/drivers/amazon/net/ena/ena_xdp.c +++ b/drivers/amazon/net/ena/ena_xdp.c @@ -190,7 +190,8 @@ int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter) return 0; create_err: - 
ena_free_all_io_tx_resources(adapter); + ena_free_all_io_tx_resources_in_range(adapter, adapter->xdp_first_ring, + adapter->xdp_num_queues); setup_err: return rc; } @@ -806,10 +807,11 @@ static int ena_xdp_clean_rx_irq_zc(struct ena_ring *rx_ring, xdp->data_end = xdp->data + ena_rx_ctx.ena_bufs[0].len; xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); - /* Don't process several descriptors, not blocked by HW - * (regardless of MTU) - */ + /* XDP multi-buffer packets not supported */ if (unlikely(ena_rx_ctx.descs > 1)) { + netdev_err_once(rx_ring->adapter->netdev, + "xdp: dropped multi-buffer packets. RX packets must be < %lu\n", + ENA_XDP_MAX_MTU); ena_increase_stat(&rx_ring->rx_stats.xdp_drop, 1, &rx_ring->syncp); xdp_verdict = ENA_XDP_DROP; goto skip_xdp_prog; diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h index b15d9cb0d25f1..f6b60c0e5d7c6 100644 --- a/drivers/amazon/net/ena/ena_xdp.h +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -96,6 +96,11 @@ static inline enum ena_xdp_errors_t ena_xdp_allowed(struct ena_adapter *adapter) return rc; } +static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) +{ + return rx_ring->rx_stats.xdp_drop; +} + #ifdef ENA_AF_XDP_SUPPORT static inline bool ena_is_zc_q_exist(struct ena_adapter *adapter) { @@ -173,7 +178,7 @@ static inline int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp verdict = ENA_XDP_PASS; break; default: - bpf_warn_invalid_xdp_action(verdict); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_invalid; verdict = ENA_XDP_DROP; } @@ -191,6 +196,11 @@ static inline bool ena_xdp_present_ring(struct ena_ring *ring) return false; } +static inline u64 ena_ring_xdp_drops_cnt(struct ena_ring *rx_ring) +{ + return 0; +} + static inline int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) { return 0; diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 38af7b173de14..fd7e80d0347ba 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -839,6 +839,10 @@ static inline int numa_mem_id(void) #define fallthrough do {} while (0) /* fallthrough */ #endif +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) #define AF_XDP_BUSY_POLL_SUPPORTED #endif @@ -921,4 +925,63 @@ static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue) #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) #define NAPIF_STATE_SCHED BIT(NAPI_STATE_SCHED) #endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) +#define bpf_warn_invalid_xdp_action(netdev, xdp_prog, verdict) \ + bpf_warn_invalid_xdp_action(verdict) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#define HAS_BPF_HEADER +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)))) +static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) +{ + if (cmp1.tv64 < cmp2.tv64) + return -1; + if (cmp1.tv64 > cmp2.tv64) + return 1; + return 0; +} +#endif + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 16, 0)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 0)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(7, 1)))) +static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) +{ + return ktime_compare(cmp1, cmp2) > 0; +} +#endif + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) 
+ +#if defined(ENA_PHC_INCLUDE) && ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4))) +#define ENA_PHC_SUPPORT +#endif /* ENA_PHC_SUPPORT */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 1, 0)) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2)) +#define ENA_PHC_SUPPORT_GETTIME64 +#endif /* ENA_PHC_SUPPORT_GETTIME64 */ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) || \ + (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7)) && \ + (RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(8, 0))) +#define ENA_PHC_SUPPORT_GETTIME64_EXTENDED +#endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4)))) +#define ptp_clock_register(info, parent) ptp_clock_register(info) +#endif + +#endif /* CONFIG_PTP_1588_CLOCK */ + #endif /* _KCOMPAT_H_ */ From 42d7b8f6af39263c573c5b3885679b9c7e491a28 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Tue, 13 Sep 2022 20:07:05 +0000 Subject: [PATCH 494/737] lustre: update to AmazonFSxLustreClient v2.12.8-fsx4 Signed-off-by: Shaoying Xu --- drivers/staging/lustrefsx/Makefile.rules | 1 + drivers/staging/lustrefsx/config.h | 233 +- .../lustrefsx/libcfs/include/libcfs/bitmap.h | 3 + .../lustrefsx/libcfs/include/libcfs/curproc.h | 4 + .../lustrefsx/libcfs/include/libcfs/libcfs.h | 78 +- .../libcfs/include/libcfs/libcfs_cpu.h | 102 +- .../libcfs/include/libcfs/libcfs_crypto.h | 113 +- .../libcfs/include/libcfs/libcfs_debug.h | 171 +- .../libcfs/include/libcfs/libcfs_fail.h | 40 +- .../libcfs/include/libcfs/libcfs_prim.h | 1 + .../libcfs/include/libcfs/libcfs_private.h | 11 +- .../libcfs/include/libcfs/libcfs_ptask.h | 121 - .../libcfs/include/libcfs/libcfs_string.h | 2 +- .../libcfs/include/libcfs/libcfs_time.h | 81 - .../libcfs/include/libcfs/linux/libcfs.h | 150 - .../libcfs/include/libcfs/linux/linux-cpu.h | 60 +- .../include/libcfs/linux/linux-crypto.h | 5 - .../libcfs/include/libcfs/linux/linux-fs.h | 21 +- .../libcfs/include/libcfs/linux/linux-hash.h | 247 ++ .../libcfs/include/libcfs/linux/linux-mem.h | 8 - .../libcfs/include/libcfs/linux/linux-misc.h | 68 +- .../libcfs/include/libcfs/linux/linux-time.h | 164 +- .../libcfs/include/libcfs/linux/linux-wait.h | 568 +++ .../libcfs/include/libcfs/util/hash.h | 103 + .../libcfs/include/libcfs/util/ioctl.h | 4 +- .../libcfs/include/libcfs/util/parser.h | 4 +- .../libcfs/include/libcfs/util/string.h | 11 +- .../staging/lustrefsx/libcfs/libcfs/Makefile | 3 +- .../staging/lustrefsx/libcfs/libcfs/debug.c | 387 +- .../lustrefsx/libcfs/libcfs/libcfs_cpu.c | 1180 +++++- .../lustrefsx/libcfs/libcfs/libcfs_mem.c | 13 +- .../lustrefsx/libcfs/libcfs/libcfs_ptask.c | 478 --- .../lustrefsx/libcfs/libcfs/libcfs_string.c | 50 +- .../lustrefsx/libcfs/libcfs/linux/linux-cpu.c | 1178 ------ .../libcfs/libcfs/linux/linux-crypto-adler.c | 2 + .../libcfs/libcfs/linux/linux-crypto-crc32.c | 3 + .../libcfs/linux/linux-crypto-crc32c-pclmul.c | 3 + .../libcfs/linux/linux-crypto-crc32pclmul.c | 4 + .../libcfs/libcfs/linux/linux-crypto.c | 70 +- .../libcfs/libcfs/linux/linux-curproc.c | 23 +- .../libcfs/libcfs/linux/linux-debug.c | 57 +- .../libcfs/libcfs/linux/linux-hash.c | 57 + .../libcfs/libcfs/linux/linux-module.c | 4 +- .../libcfs/libcfs/linux/linux-prim.c | 60 +- .../libcfs/libcfs/linux/linux-tracefile.c | 1 + .../libcfs/libcfs/linux/linux-wait.c | 115 + 
.../staging/lustrefsx/libcfs/libcfs/module.c | 552 ++- .../lustrefsx/libcfs/libcfs/tracefile.c | 89 +- .../lustrefsx/libcfs/libcfs/tracefile.h | 3 +- .../lustrefsx/libcfs/libcfs/util/l_ioctl.c | 4 +- .../lustrefsx/libcfs/libcfs/util/nidstrings.c | 6 +- .../lustrefsx/libcfs/libcfs/util/param.c | 10 +- .../lustrefsx/libcfs/libcfs/util/parser.c | 67 +- .../lustrefsx/libcfs/libcfs/util/string.c | 122 +- .../lustrefsx/libcfs/libcfs/watchdog.c | 7 +- .../lustrefsx/libcfs/libcfs/workitem.c | 9 +- .../staging/lustrefsx/lnet/include/cyaml.h | 2 +- .../staging/lustrefsx/lnet/include/lnet/api.h | 8 +- .../lustrefsx/lnet/include/lnet/lib-lnet.h | 279 +- .../lustrefsx/lnet/include/lnet/lib-types.h | 582 ++- .../lustrefsx/lnet/include/lnet/socklnd.h | 14 +- .../include/uapi/linux/lnet/libcfs_debug.h | 151 + .../include/uapi/linux/lnet}/libcfs_ioctl.h | 24 +- .../lib-dlc.h => uapi/linux/lnet/lnet-dlc.h} | 77 +- .../types.h => uapi/linux/lnet/lnet-types.h} | 179 +- .../include/{ => uapi/linux}/lnet/lnetctl.h | 36 +- .../include/{ => uapi/linux}/lnet/lnetst.h | 10 +- .../include/{ => uapi/linux}/lnet/nidstr.h | 14 +- .../lnet.h => uapi/linux/lnet/socklnd.h} | 24 +- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c | 930 ++--- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h | 426 ++- .../lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c | 1039 +++--- .../lnet/klnds/o2iblnd/o2iblnd_modparams.c | 84 +- .../lustrefsx/lnet/klnds/socklnd/socklnd.c | 782 ++-- .../lustrefsx/lnet/klnds/socklnd/socklnd.h | 333 +- .../lustrefsx/lnet/klnds/socklnd/socklnd_cb.c | 1109 +++--- .../lnet/klnds/socklnd/socklnd_lib.c | 116 +- .../lnet/klnds/socklnd/socklnd_modparams.c | 4 +- .../lnet/klnds/socklnd/socklnd_proto.c | 87 +- .../staging/lustrefsx/lnet/lnet/acceptor.c | 10 +- drivers/staging/lustrefsx/lnet/lnet/api-ni.c | 1678 +++++++-- drivers/staging/lustrefsx/lnet/lnet/config.c | 66 +- drivers/staging/lustrefsx/lnet/lnet/lib-eq.c | 2 - drivers/staging/lustrefsx/lnet/lnet/lib-md.c | 2 +- .../staging/lustrefsx/lnet/lnet/lib-move.c | 3014 +++++++++++++--- drivers/staging/lustrefsx/lnet/lnet/lib-msg.c | 699 +++- drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c | 2 +- .../staging/lustrefsx/lnet/lnet/lib-socket.c | 28 +- drivers/staging/lustrefsx/lnet/lnet/lo.c | 2 + drivers/staging/lustrefsx/lnet/lnet/module.c | 51 +- .../staging/lustrefsx/lnet/lnet/net_fault.c | 198 +- .../staging/lustrefsx/lnet/lnet/nidstrings.c | 4 +- drivers/staging/lustrefsx/lnet/lnet/peer.c | 3172 +++++++++++++++-- drivers/staging/lustrefsx/lnet/lnet/router.c | 436 ++- .../staging/lustrefsx/lnet/lnet/router_proc.c | 107 +- .../lustrefsx/lnet/selftest/brw_test.c | 107 +- .../staging/lustrefsx/lnet/selftest/conctl.c | 784 ++-- .../staging/lustrefsx/lnet/selftest/conrpc.c | 335 +- .../staging/lustrefsx/lnet/selftest/conrpc.h | 46 +- .../staging/lustrefsx/lnet/selftest/console.c | 444 +-- .../staging/lustrefsx/lnet/selftest/console.h | 69 +- .../lustrefsx/lnet/selftest/framework.c | 412 +-- .../staging/lustrefsx/lnet/selftest/module.c | 129 +- .../lustrefsx/lnet/selftest/ping_test.c | 52 +- drivers/staging/lustrefsx/lnet/selftest/rpc.c | 1218 +++---- drivers/staging/lustrefsx/lnet/selftest/rpc.h | 182 +- .../lustrefsx/lnet/selftest/selftest.h | 246 +- .../staging/lustrefsx/lnet/selftest/timer.c | 33 +- .../staging/lustrefsx/lnet/selftest/timer.h | 10 +- .../lustrefsx/lustre/fid/fid_handler.c | 64 +- .../lustrefsx/lustre/fid/fid_internal.h | 11 +- .../staging/lustrefsx/lustre/fid/fid_lib.c | 1 - .../lustrefsx/lustre/fid/fid_request.c | 114 +- .../staging/lustrefsx/lustre/fid/fid_store.c 
| 2 +- .../staging/lustrefsx/lustre/fid/lproc_fid.c | 212 +- .../staging/lustrefsx/lustre/fld/fld_cache.c | 23 +- .../lustrefsx/lustre/fld/fld_handler.c | 149 +- .../staging/lustrefsx/lustre/fld/fld_index.c | 133 +- .../lustrefsx/lustre/fld/fld_internal.h | 18 +- .../lustrefsx/lustre/fld/fld_request.c | 418 ++- .../staging/lustrefsx/lustre/fld/lproc_fld.c | 65 +- .../lustrefsx/lustre/include/cl_object.h | 214 +- .../lustrefsx/lustre/include/dt_object.h | 88 +- .../lustrefsx/lustre/include/llog_swab.h | 2 +- .../lustrefsx/lustre/include/lprocfs_status.h | 315 +- .../lustrefsx/lustre/include/lu_object.h | 198 +- .../lustrefsx/lustre/include/lu_target.h | 69 +- .../lustre/include/lustre/ll_fiemap.h | 38 +- .../include/lustre/lustre_barrier_user.h | 53 +- .../lustre/include/lustre/lustre_lfsck_user.h | 214 +- .../lustre/include/lustre/lustre_user.h | 1639 +-------- .../lustre/include/lustre/lustreapi.h | 627 ++-- .../lustrefsx/lustre/include/lustre_acl.h | 2 +- .../lustrefsx/lustre/include/lustre_barrier.h | 2 +- .../lustrefsx/lustre/include/lustre_compat.h | 191 +- .../lustrefsx/lustre/include/lustre_disk.h | 12 +- .../lustrefsx/lustre/include/lustre_dlm.h | 311 +- .../lustre/include/lustre_dlm_flags.h | 73 +- .../lustrefsx/lustre/include/lustre_eacl.h | 2 +- .../include/{lustre => }/lustre_errno.h | 0 .../lustrefsx/lustre/include/lustre_export.h | 153 +- .../lustrefsx/lustre/include/lustre_fid.h | 42 +- .../lustrefsx/lustre/include/lustre_fld.h | 19 +- .../lustrefsx/lustre/include/lustre_ha.h | 2 +- .../lustrefsx/lustre/include/lustre_idmap.h | 2 +- .../lustrefsx/lustre/include/lustre_import.h | 102 +- .../lustre/include/lustre_kernelcomm.h | 4 +- .../lustrefsx/lustre/include/lustre_lfsck.h | 10 +- .../lustrefsx/lustre/include/lustre_lib.h | 38 +- .../lustrefsx/lustre/include/lustre_linkea.h | 2 +- .../lustrefsx/lustre/include/lustre_lmv.h | 43 +- .../lustrefsx/lustre/include/lustre_log.h | 17 +- .../lustrefsx/lustre/include/lustre_mdc.h | 111 +- .../lustrefsx/lustre/include/lustre_mds.h | 29 +- .../lustrefsx/lustre/include/lustre_net.h | 209 +- .../lustrefsx/lustre/include/lustre_nodemap.h | 13 +- .../lustrefsx/lustre/include/lustre_nrs_tbf.h | 64 +- .../lustrefsx/lustre/include/lustre_obdo.h | 2 +- .../lustre_osc.h} | 598 +++- .../lustre/include/lustre_patchless_compat.h | 20 - .../lustrefsx/lustre/include/lustre_quota.h | 29 +- .../lustre/include/lustre_req_layout.h | 20 +- .../lustrefsx/lustre/include/lustre_scrub.h | 375 ++ .../lustrefsx/lustre/include/lustre_sec.h | 21 +- .../lustrefsx/lustre/include/lustre_swab.h | 8 +- .../lustrefsx/lustre/include/lustre_update.h | 5 +- .../lustrefsx/lustre/include/md_object.h | 287 +- .../staging/lustrefsx/lustre/include/obd.h | 213 +- .../lustrefsx/lustre/include/obd_cksum.h | 153 +- .../lustrefsx/lustre/include/obd_class.h | 1254 ++++--- .../lustrefsx/lustre/include/obd_support.h | 130 +- .../lustrefsx/lustre/include/obj_update.h | 2 +- .../lustrefsx/lustre/include/seq_range.h | 2 +- .../uapi/linux/lustre/lustre_barrier_user.h | 74 + .../uapi/linux/{ => lustre}/lustre_cfg.h | 67 +- .../uapi/linux/{ => lustre}/lustre_disk.h | 36 +- .../uapi/linux/{ => lustre}/lustre_fid.h | 7 +- .../include/uapi/linux/lustre/lustre_fiemap.h | 72 + .../{ => uapi/linux}/lustre/lustre_idl.h | 1022 +++--- .../uapi/linux/{ => lustre}/lustre_ioctl.h | 27 +- .../linux/lustre/lustre_kernelcomm.h} | 15 +- .../uapi/linux/lustre/lustre_lfsck_user.h | 238 ++ .../{ => uapi/linux/lustre}/lustre_log_user.h | 3 +- .../uapi/linux/{ => lustre}/lustre_ostid.h | 14 +- .../uapi/linux/{ 
=> lustre}/lustre_param.h | 0 .../include/uapi/linux/lustre/lustre_user.h | 2378 ++++++++++++ .../{ => uapi/linux/lustre}/lustre_ver.h | 6 - .../lustrefsx/lustre/include/upcall_cache.h | 10 +- .../lustrefsx/lustre/ldlm/interval_tree.c | 7 +- .../lustrefsx/lustre/ldlm/ldlm_extent.c | 182 +- .../lustrefsx/lustre/ldlm/ldlm_flock.c | 39 +- .../lustrefsx/lustre/ldlm/ldlm_inodebits.c | 441 ++- .../lustrefsx/lustre/ldlm/ldlm_internal.h | 91 +- .../staging/lustrefsx/lustre/ldlm/ldlm_lib.c | 715 ++-- .../staging/lustrefsx/lustre/ldlm/ldlm_lock.c | 893 +++-- .../lustrefsx/lustre/ldlm/ldlm_lockd.c | 1227 ++++--- .../lustrefsx/lustre/ldlm/ldlm_plain.c | 36 +- .../staging/lustrefsx/lustre/ldlm/ldlm_pool.c | 374 +- .../lustrefsx/lustre/ldlm/ldlm_request.c | 791 ++-- .../lustrefsx/lustre/ldlm/ldlm_resource.c | 438 ++- .../staging/lustrefsx/lustre/llite/Makefile | 2 +- .../staging/lustrefsx/lustre/llite/dcache.c | 27 +- drivers/staging/lustrefsx/lustre/llite/dir.c | 957 +++-- drivers/staging/lustrefsx/lustre/llite/file.c | 2155 +++++++---- .../staging/lustrefsx/lustre/llite/glimpse.c | 84 +- .../lustrefsx/lustre/llite/lcommon_cl.c | 19 +- .../lustrefsx/lustre/llite/lcommon_misc.c | 5 +- .../lustrefsx/lustre/llite/llite_internal.h | 240 +- .../lustrefsx/lustre/llite/llite_lib.c | 902 +++-- .../lustrefsx/lustre/llite/llite_mmap.c | 23 +- .../lustrefsx/lustre/llite/llite_nfs.c | 2 +- .../lustrefsx/lustre/llite/lproc_llite.c | 1370 +++---- .../staging/lustrefsx/lustre/llite/namei.c | 621 +++- .../lustrefsx/lustre/llite/range_lock.c | 5 +- drivers/staging/lustrefsx/lustre/llite/rw.c | 100 +- drivers/staging/lustrefsx/lustre/llite/rw26.c | 162 +- .../lustrefsx/lustre/llite/statahead.c | 181 +- .../staging/lustrefsx/lustre/llite/super25.c | 52 +- .../staging/lustrefsx/lustre/llite/vvp_dev.c | 311 +- .../lustrefsx/lustre/llite/vvp_internal.h | 28 +- .../staging/lustrefsx/lustre/llite/vvp_io.c | 393 +- .../staging/lustrefsx/lustre/llite/vvp_lock.c | 86 - .../lustrefsx/lustre/llite/vvp_object.c | 18 +- .../staging/lustrefsx/lustre/llite/vvp_page.c | 39 +- .../staging/lustrefsx/lustre/llite/xattr.c | 96 +- .../staging/lustrefsx/lustre/llite/xattr26.c | 32 +- .../lustrefsx/lustre/llite/xattr_cache.c | 3 +- .../lustrefsx/lustre/llite/xattr_security.c | 33 + .../staging/lustrefsx/lustre/lmv/lmv_fld.c | 1 - .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 245 +- .../lustrefsx/lustre/lmv/lmv_internal.h | 89 +- .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 1622 +++++---- .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 121 +- drivers/staging/lustrefsx/lustre/lov/Makefile | 4 +- .../lustrefsx/lustre/lov/lov_cl_internal.h | 296 +- .../staging/lustrefsx/lustre/lov/lov_dev.c | 581 ++- drivers/staging/lustrefsx/lustre/lov/lov_ea.c | 84 +- .../lustrefsx/lustre/lov/lov_internal.h | 53 +- drivers/staging/lustrefsx/lustre/lov/lov_io.c | 990 +++-- .../staging/lustrefsx/lustre/lov/lov_lock.c | 128 +- .../staging/lustrefsx/lustre/lov/lov_merge.c | 2 +- .../staging/lustrefsx/lustre/lov/lov_obd.c | 615 ++-- .../staging/lustrefsx/lustre/lov/lov_object.c | 949 +++-- .../staging/lustrefsx/lustre/lov/lov_offset.c | 153 +- .../staging/lustrefsx/lustre/lov/lov_pack.c | 106 +- .../staging/lustrefsx/lustre/lov/lov_page.c | 44 +- .../staging/lustrefsx/lustre/lov/lov_pool.c | 32 +- .../lustrefsx/lustre/lov/lov_request.c | 308 +- .../staging/lustrefsx/lustre/lov/lovsub_dev.c | 108 +- .../lustrefsx/lustre/lov/lovsub_lock.c | 82 - .../lustrefsx/lustre/lov/lovsub_object.c | 95 +- .../lustrefsx/lustre/lov/lovsub_page.c | 70 - 
.../staging/lustrefsx/lustre/lov/lproc_lov.c | 272 +- drivers/staging/lustrefsx/lustre/mdc/Makefile | 2 +- .../staging/lustrefsx/lustre/mdc/lproc_mdc.c | 511 ++- .../lustrefsx/lustre/mdc/mdc_changelog.c | 342 +- .../staging/lustrefsx/lustre/mdc/mdc_dev.c | 1564 ++++++++ .../lustrefsx/lustre/mdc/mdc_internal.h | 28 +- .../staging/lustrefsx/lustre/mdc/mdc_lib.c | 185 +- .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 353 +- .../staging/lustrefsx/lustre/mdc/mdc_reint.c | 183 +- .../lustrefsx/lustre/mdc/mdc_request.c | 770 ++-- .../staging/lustrefsx/lustre/mgc/lproc_mgc.c | 52 +- .../lustrefsx/lustre/mgc/mgc_internal.h | 7 +- .../lustrefsx/lustre/mgc/mgc_request.c | 60 +- .../lustrefsx/lustre/obdclass/Makefile | 13 +- .../staging/lustrefsx/lustre/obdclass/acl.c | 183 +- .../lustrefsx/lustre/obdclass/cl_internal.h | 2 +- .../staging/lustrefsx/lustre/obdclass/cl_io.c | 263 +- .../lustrefsx/lustre/obdclass/cl_lock.c | 11 +- .../lustrefsx/lustre/obdclass/cl_object.c | 105 +- .../lustrefsx/lustre/obdclass/cl_page.c | 114 +- .../lustrefsx/lustre/obdclass/class_obd.c | 301 +- .../lustrefsx/lustre/obdclass/dt_object.c | 671 ++-- .../lustrefsx/lustre/obdclass/genops.c | 931 ++--- .../staging/lustrefsx/lustre/obdclass/idmap.c | 26 +- .../lustrefsx/lustre/obdclass/integrity.c | 277 ++ .../staging/lustrefsx/lustre/obdclass/jobid.c | 575 +++ .../lustrefsx/lustre/obdclass/kernelcomm.c | 11 +- .../lustrefsx/lustre/obdclass/linkea.c | 12 +- .../lustre/obdclass/linux/linux-module.c | 582 --- .../lustre/obdclass/linux/linux-obdo.c | 157 - .../lustre/obdclass/linux/linux-sysctl.c | 190 - .../staging/lustrefsx/lustre/obdclass/llog.c | 311 +- .../lustrefsx/lustre/obdclass/llog_cat.c | 689 ++-- .../lustrefsx/lustre/obdclass/llog_internal.h | 11 +- .../lustrefsx/lustre/obdclass/llog_ioctl.c | 296 +- .../lustrefsx/lustre/obdclass/llog_obd.c | 182 +- .../lustrefsx/lustre/obdclass/llog_osd.c | 112 +- .../lustrefsx/lustre/obdclass/llog_swab.c | 306 +- .../lustrefsx/lustre/obdclass/llog_test.c | 630 ++-- .../lustrefsx/lustre/obdclass/local_storage.c | 10 +- .../lustre/obdclass/lprocfs_jobstats.c | 63 +- .../lustre/obdclass/lprocfs_status.c | 1108 +++--- .../lustre/obdclass/lprocfs_status_server.c | 405 ++- .../lustrefsx/lustre/obdclass/lu_object.c | 461 ++- .../lustrefsx/lustre/obdclass/lu_ref.c | 172 +- .../lustre/obdclass/lustre_handles.c | 85 +- .../lustrefsx/lustre/obdclass/lustre_peer.c | 105 +- .../lustrefsx/lustre/obdclass/md_attrs.c | 27 +- .../lustrefsx/lustre/obdclass/obd_cksum.c | 149 + .../lustrefsx/lustre/obdclass/obd_config.c | 548 ++- .../lustrefsx/lustre/obdclass/obd_mount.c | 100 +- .../lustre/obdclass/obd_mount_server.c | 146 +- .../lustrefsx/lustre/obdclass/obd_sysfs.c | 535 +++ .../staging/lustrefsx/lustre/obdclass/obdo.c | 172 +- .../lustrefsx/lustre/obdclass/obdo_server.c | 156 + .../staging/lustrefsx/lustre/obdclass/scrub.c | 1216 +++++++ .../lustrefsx/lustre/obdclass/statfs_pack.c | 36 +- .../lustrefsx/lustre/obdclass/upcall_cache.c | 21 +- .../staging/lustrefsx/lustre/obdclass/uuid.c | 78 - .../staging/lustrefsx/lustre/obdecho/echo.c | 614 +++- .../lustrefsx/lustre/obdecho/echo_client.c | 91 +- .../lustrefsx/lustre/obdecho/echo_internal.h | 1 + .../staging/lustrefsx/lustre/osc/lproc_osc.c | 676 ++-- .../staging/lustrefsx/lustre/osc/osc_cache.c | 366 +- .../staging/lustrefsx/lustre/osc/osc_dev.c | 58 +- .../lustrefsx/lustre/osc/osc_internal.h | 191 +- drivers/staging/lustrefsx/lustre/osc/osc_io.c | 290 +- .../staging/lustrefsx/lustre/osc/osc_lock.c | 401 ++- 
.../staging/lustrefsx/lustre/osc/osc_object.c | 167 +- .../staging/lustrefsx/lustre/osc/osc_page.c | 103 +- .../staging/lustrefsx/lustre/osc/osc_quota.c | 21 +- .../lustrefsx/lustre/osc/osc_request.c | 1248 ++++--- .../staging/lustrefsx/lustre/ptlrpc/Makefile | 2 +- .../staging/lustrefsx/lustre/ptlrpc/client.c | 308 +- .../staging/lustrefsx/lustre/ptlrpc/errno.c | 33 +- .../staging/lustrefsx/lustre/ptlrpc/events.c | 46 +- .../lustrefsx/lustre/ptlrpc/gss/gss_api.h | 14 +- .../lustrefsx/lustre/ptlrpc/gss/gss_bulk.c | 1 - .../lustre/ptlrpc/gss/gss_cli_upcall.c | 148 +- .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.c | 177 +- .../lustrefsx/lustre/ptlrpc/gss/gss_crypto.h | 88 +- .../lustre/ptlrpc/gss/gss_generic_token.c | 1 - .../lustre/ptlrpc/gss/gss_internal.h | 47 +- .../lustrefsx/lustre/ptlrpc/gss/gss_keyring.c | 249 +- .../lustrefsx/lustre/ptlrpc/gss/gss_krb5.h | 2 +- .../lustre/ptlrpc/gss/gss_krb5_mech.c | 1268 +++---- .../lustre/ptlrpc/gss/gss_mech_switch.c | 62 +- .../lustre/ptlrpc/gss/gss_null_mech.c | 4 +- .../lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c | 2 +- .../lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c | 309 +- .../lustre/ptlrpc/gss/gss_svc_upcall.c | 180 +- .../lustrefsx/lustre/ptlrpc/gss/lproc_gss.c | 41 +- .../lustrefsx/lustre/ptlrpc/gss/sec_gss.c | 47 +- .../staging/lustrefsx/lustre/ptlrpc/import.c | 735 ++-- .../staging/lustrefsx/lustre/ptlrpc/layout.c | 447 ++- .../lustrefsx/lustre/ptlrpc/llog_client.c | 36 - .../lustrefsx/lustre/ptlrpc/llog_server.c | 51 - .../lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c | 449 ++- .../staging/lustrefsx/lustre/ptlrpc/niobuf.c | 100 +- .../lustre/ptlrpc/nodemap_internal.h | 2 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_crr.c | 50 +- .../lustrefsx/lustre/ptlrpc/nrs_delay.c | 44 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_orr.c | 85 +- .../staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c | 790 +++- .../lustrefsx/lustre/ptlrpc/pack_generic.c | 700 ++-- .../staging/lustrefsx/lustre/ptlrpc/pers.c | 29 +- .../staging/lustrefsx/lustre/ptlrpc/pinger.c | 447 +-- .../lustrefsx/lustre/ptlrpc/ptlrpc_internal.h | 18 +- .../staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c | 14 +- .../staging/lustrefsx/lustre/ptlrpc/recover.c | 73 +- drivers/staging/lustrefsx/lustre/ptlrpc/sec.c | 173 +- .../lustrefsx/lustre/ptlrpc/sec_bulk.c | 79 +- .../lustrefsx/lustre/ptlrpc/sec_config.c | 4 +- .../staging/lustrefsx/lustre/ptlrpc/sec_ctx.c | 2 +- .../staging/lustrefsx/lustre/ptlrpc/sec_gc.c | 107 +- .../lustrefsx/lustre/ptlrpc/sec_lproc.c | 115 +- .../lustrefsx/lustre/ptlrpc/sec_null.c | 17 +- .../lustrefsx/lustre/ptlrpc/sec_plain.c | 27 +- .../staging/lustrefsx/lustre/ptlrpc/service.c | 296 +- .../staging/lustrefsx/lustre/ptlrpc/wirehdr.c | 6 +- .../lustrefsx/lustre/ptlrpc/wiretest.c | 798 +++-- .../staging/lustrefsx/lustre/target/barrier.c | 24 +- .../lustrefsx/lustre/target/out_handler.c | 109 +- .../staging/lustrefsx/lustre/target/out_lib.c | 27 +- .../staging/lustrefsx/lustre/target/tgt_fmd.c | 363 ++ .../lustrefsx/lustre/target/tgt_grant.c | 257 +- .../lustrefsx/lustre/target/tgt_handler.c | 775 +++- .../lustrefsx/lustre/target/tgt_internal.h | 18 +- .../lustrefsx/lustre/target/tgt_lastrcvd.c | 31 +- .../lustrefsx/lustre/target/tgt_main.c | 350 +- .../lustrefsx/lustre/target/update_records.c | 2 +- .../lustrefsx/lustre/target/update_recovery.c | 2 +- .../lustrefsx/lustre/target/update_trans.c | 23 +- drivers/staging/lustrefsx/undef.h | 213 +- 386 files changed, 58347 insertions(+), 33625 deletions(-) delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h delete mode 
100644 drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h delete mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h create mode 100644 drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h delete mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c delete mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c create mode 100644 drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c create mode 100644 drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h rename drivers/staging/lustrefsx/{libcfs/include/libcfs => lnet/include/uapi/linux/lnet}/libcfs_ioctl.h (88%) rename drivers/staging/lustrefsx/lnet/include/{lnet/lib-dlc.h => uapi/linux/lnet/lnet-dlc.h} (76%) rename drivers/staging/lustrefsx/lnet/include/{lnet/types.h => uapi/linux/lnet/lnet-types.h} (85%) rename drivers/staging/lustrefsx/lnet/include/{ => uapi/linux}/lnet/lnetctl.h (76%) rename drivers/staging/lustrefsx/lnet/include/{ => uapi/linux}/lnet/lnetst.h (99%) rename drivers/staging/lustrefsx/lnet/include/{ => uapi/linux}/lnet/nidstr.h (93%) rename drivers/staging/lustrefsx/lnet/include/{lnet/lnet.h => uapi/linux/lnet/socklnd.h} (74%) rename drivers/staging/lustrefsx/lustre/include/{lustre => }/lustre_errno.h (100%) rename drivers/staging/lustrefsx/lustre/{osc/osc_cl_internal.h => include/lustre_osc.h} (52%) create mode 100644 drivers/staging/lustrefsx/lustre/include/lustre_scrub.h create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_cfg.h (77%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_disk.h (85%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_fid.h (97%) create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h rename drivers/staging/lustrefsx/lustre/include/{ => uapi/linux}/lustre/lustre_idl.h (81%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_ioctl.h (93%) rename drivers/staging/lustrefsx/lustre/include/{uapi_kernelcomm.h => uapi/linux/lustre/lustre_kernelcomm.h} (88%) create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h rename drivers/staging/lustrefsx/lustre/include/{ => uapi/linux/lustre}/lustre_log_user.h (97%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_ostid.h (95%) rename drivers/staging/lustrefsx/lustre/include/uapi/linux/{ => lustre}/lustre_param.h (100%) create mode 100644 drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h rename drivers/staging/lustrefsx/lustre/include/{ => uapi/linux/lustre}/lustre_ver.h (83%) mode change 100755 => 100644 drivers/staging/lustrefsx/lustre/llite/lproc_llite.c delete mode 100644 drivers/staging/lustrefsx/lustre/llite/vvp_lock.c delete mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c delete mode 100644 drivers/staging/lustrefsx/lustre/lov/lovsub_page.c create mode 100644 drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/integrity.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/jobid.c delete mode 100644 
drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/scrub.c delete mode 100644 drivers/staging/lustrefsx/lustre/obdclass/uuid.c create mode 100644 drivers/staging/lustrefsx/lustre/target/tgt_fmd.c diff --git a/drivers/staging/lustrefsx/Makefile.rules b/drivers/staging/lustrefsx/Makefile.rules index a0d56e80f2ce7..ce56ffa5576a0 100644 --- a/drivers/staging/lustrefsx/Makefile.rules +++ b/drivers/staging/lustrefsx/Makefile.rules @@ -3,4 +3,5 @@ ccflags-y += -include $(srctree)/drivers/staging/lustrefsx/config.h ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/libcfs/include ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lnet/include ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include +ccflags-y += -I$(srctree)/drivers/staging/lustrefsx/lustre/include/uapi ccflags-y += -Wno-format-truncation -Werror diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index f4d6ee0ba3c8a..98806290010a0 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -25,17 +25,11 @@ /* extened attributes for ldiskfs */ /* #undef CONFIG_LDISKFS_FS_XATTR */ -/* Max LNET payload */ -#define CONFIG_LNET_MAX_PAYLOAD LNET_MTU - /* enable invariant checking */ /* #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ -/* IOCTL Buffer Size */ -#define CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER 8192 - /* kernel has cpu affinity support */ -/* #undef CPU_AFFINITY */ +#define CPU_AFFINITY 1 /* both i_dentry/d_alias uses list */ /* #undef DATA_FOR_LLITE_IS_LIST */ @@ -58,9 +52,15 @@ /* do data checksums */ #define ENABLE_CHECKSUM 1 +/* enable flock by default */ +#define ENABLE_FLOCK 1 + /* Use the Pinger */ #define ENABLE_PINGER 1 +/* aes-sha2 is supported by krb5 */ +/* #undef HAVE_AES_SHA2_SUPPORT */ + /* Define to 1 if you have the header file. 
*/ #define HAVE_ASM_TYPES_H 1 @@ -79,6 +79,12 @@ /* 'bio_integrity_enabled' is available */ /* #undef HAVE_BIO_INTEGRITY_ENABLED */ +/* kernel has bio_integrity_prep_fn */ +/* #undef HAVE_BIO_INTEGRITY_PREP_FN */ + +/* bio_integrity_payload.bip_iter exist */ +#define HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD 1 + /* 'bi_bdev' is available */ /* #undef HAVE_BI_BDEV */ @@ -103,9 +109,18 @@ /* blk_queue_max_segments is defined */ #define HAVE_BLK_QUEUE_MAX_SEGMENTS 1 +/* kernel hash_64() is broken */ +/* #undef HAVE_BROKEN_HASH_64 */ + /* kernel has struct bvec_iter */ #define HAVE_BVEC_ITER 1 +/* struct cache_detail has writers */ +#define HAVE_CACHE_DETAIL_WRITERS 1 + +/* if cache_detail->hash_lock is a spinlock */ +#define HAVE_CACHE_HASH_SPINLOCK 1 + /* cache_head has hlist cache_list */ #define HAVE_CACHE_HEAD_HLIST 1 @@ -118,24 +133,27 @@ /* kernel has clean_bdev_aliases */ /* #undef HAVE_CLEAN_BDEV_ALIASES */ +/* 'clear_and_wake_up_bit' is available */ +#define HAVE_CLEAR_AND_WAKE_UP_BIT 1 + /* have clear_inode */ #define HAVE_CLEAR_INODE 1 /* compat rdma found */ /* #undef HAVE_COMPAT_RDMA */ -/* cpumap_print_to_pagebuf is available */ -#define HAVE_CPUMASK_PRINT_TO_PAGEBUF 1 +/* 'cpu_read_lock' exist */ +#define HAVE_CPUS_READ_LOCK 1 /* kernel compiled with CRC32 functions */ #define HAVE_CRC32 1 -/* struct cred has member tgcred */ -/* #undef HAVE_CRED_TGCRED */ - /* crypto hash helper functions are available */ #define HAVE_CRYPTO_HASH_HELPERS 1 +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#define HAVE_CRYPTO_MAX_ALG_NAME_128 1 + /* current_time() has replaced CURRENT_TIME */ #define HAVE_CURRENT_TIME 1 @@ -154,6 +172,9 @@ /* dentry_open uses struct path as first argument */ #define HAVE_DENTRY_OPEN_USE_PATH 1 +/* DES3 enctype is supported by krb5 */ +/* #undef HAVE_DES3_SUPPORT */ + /* direct_IO need 2 arguments */ #define HAVE_DIRECTIO_2ARGS 1 @@ -235,6 +256,9 @@ /* d_delete first parameter declared is not const */ #define HAVE_D_DELETE_CONST const +/* d_hash_and_lookup is exported by the kernel */ +#define HAVE_D_HASH_AND_LOOKUP 1 + /* have d_make_root */ #define HAVE_D_MAKE_ROOT 1 @@ -322,15 +346,18 @@ /* Define to 1 if you have the `gethostbyname' function. 
*/ #define HAVE_GETHOSTBYNAME 1 +/* 'get_acl' has a rcu argument */ +/* #undef HAVE_GET_ACL_RCU_ARG */ + +/* get_request_key_auth() is available */ +#define HAVE_GET_REQUEST_KEY_AUTH 1 + /* get_user_pages takes 6 arguments */ /* #undef HAVE_GET_USER_PAGES_6ARG */ /* get_user_pages takes gup_flags in arguments */ #define HAVE_GET_USER_PAGES_GUP_FLAGS 1 -/* get_user_pages takes gup_flags in arguments with 7 args */ -/* #undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS */ - /* struct group_info has member gid */ #define HAVE_GROUP_INFO_GID 1 @@ -343,6 +370,9 @@ /* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ /* #undef HAVE_GSS_KRB5_CCACHE_NAME */ +/* '__rhashtable_insert_fast()' returns int */ +/* #undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT */ + /* Define this if you have Heimdal Kerberos libraries */ /* #undef HAVE_HEIMDAL */ @@ -391,6 +421,9 @@ /* if ib_sg_dma_address wrapper exists */ /* #undef HAVE_IB_SG_DMA_ADDRESS */ +/* INIT_LIST_HEAD_RCU exists */ +#define HAVE_INIT_LIST_HEAD_RCU 1 + /* inode_operations .getattr member function can gather advance stats */ #define HAVE_INODEOPS_ENHANCED_GETATTR 1 @@ -415,6 +448,15 @@ /* inode_operations->permission has two args */ #define HAVE_INODE_PERMISION_2ARGS 1 +/* inode times are using timespec64 */ +#define HAVE_INODE_TIMESPEC64 1 + +/* blk_integrity.interval exist */ +/* #undef HAVE_INTERVAL_BLK_INTEGRITY */ + +/* blk_integrity.interval_exp exist */ +#define HAVE_INTERVAL_EXP_BLK_INTEGRITY 1 + /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 @@ -424,6 +466,9 @@ /* have in_compat_syscall */ #define HAVE_IN_COMPAT_SYSCALL 1 +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#define HAVE_IN_DEV_FOR_EACH_IFA_RTNL 1 + /* inode_operations->rename need flags as argument */ #define HAVE_IOPS_RENAME_WITH_FLAGS 1 @@ -445,7 +490,7 @@ /* inode_operations has {get,set,remove}xattr members */ /* #undef HAVE_IOP_XATTR */ -/* if iov_iter has member type */ +/* if iov_iter has member iter_type */ #define HAVE_IOV_ITER_HAS_TYPE_MEMBER 1 /* iov_iter_init handles directional tag */ @@ -463,18 +508,27 @@ /* is_sxid is defined */ #define HAVE_IS_SXID 1 +/* 'iterate_shared' is available */ +#define HAVE_ITERATE_SHARED 1 + /* struct address_space has i_pages */ #define HAVE_I_PAGES 1 /* i_uid_read is present */ #define HAVE_I_UID_READ 1 -/* jiffies_to_timespec64() is available */ -#define HAVE_JIFFIES_TO_TIMESPEC64 1 +/* kallsyms_lookup_name is exported by kernel */ +/* #undef HAVE_KALLSYMS_LOOKUP_NAME */ /* kernel_locked is defined */ /* #undef HAVE_KERNEL_LOCKED */ +/* 'kernel_param_[un]lock' is available */ +#define HAVE_KERNEL_PARAM_LOCK 1 + +/* 'struct kernel_param_ops' is available */ +#define HAVE_KERNEL_PARAM_OPS 1 + /* kernel_setsockopt still in use */ /* #undef HAVE_KERNEL_SETSOCKOPT */ @@ -493,6 +547,9 @@ /* key_type->instantiate has two args */ #define HAVE_KEY_TYPE_INSTANTIATE_2ARGS 1 +/* key.usage is of type refcount_t */ +#define HAVE_KEY_USAGE_REFCOUNT 1 + /* ki_left exist */ /* #undef HAVE_KIOCB_KI_LEFT */ @@ -521,12 +578,15 @@ available */ /* #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS */ +/* kset_find_obj is exported by the kernel */ +#define HAVE_KSET_FIND_OBJ 1 + +/* kernel has kstrtobool_from_user */ +#define HAVE_KSTRTOBOOL_FROM_USER 1 + /* kernel has kstrtoul */ #define HAVE_KSTRTOUL 1 -/* kernel has ksys_close */ -#define HAVE_KSYS_CLOSE 1 - /* kthread_worker found */ /* #undef HAVE_KTHREAD_WORK */ @@ -554,6 +614,9 @@ /* 'ktime_get_ts64' is available */ #define HAVE_KTIME_GET_TS64 1 +/* 
'ktime_ms_delta' is available */ +#define HAVE_KTIME_MS_DELTA 1 + /* 'ktime_to_timespec64' is available */ #define HAVE_KTIME_TO_TIMESPEC64 1 @@ -581,20 +644,14 @@ /* readline library is available */ /* #undef HAVE_LIBREADLINE */ -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_RANDOM_H 1 +/* linux/rhashtable.h is present */ +#define HAVE_LINUX_RHASHTABLE_H 1 /* if linux/selinux.h exists */ /* #undef HAVE_LINUX_SELINUX_IS_ENABLED */ -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_UNISTD_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_LINUX_VERSION_H 1 +/* linux/stdarg.h is present */ +/* #undef HAVE_LINUX_STDARG_HEADER */ /* lock_manager_operations has lm_compare_owner */ /* #undef HAVE_LM_COMPARE_OWNER */ @@ -605,6 +662,9 @@ /* kernel has locks_lock_file_wait */ #define HAVE_LOCKS_LOCK_FILE_WAIT 1 +/* lookup_user_key() is available */ +#define HAVE_LOOKUP_USER_KEY 1 + /* kernel has LOOP_CTL_GET_FREE */ #define HAVE_LOOP_CTL_GET_FREE 1 @@ -633,6 +693,9 @@ /* kernel module loading is possible */ #define HAVE_MODULE_LOADING_SUPPORT 1 +/* locking module param is supported */ +/* #undef HAVE_MODULE_PARAM_LOCKING */ + /* Define to 1 if you have the `name_to_handle_at' function. */ #define HAVE_NAME_TO_HANDLE_AT 1 @@ -642,21 +705,36 @@ /* cancel_dirty_page with one arguement is available */ #define HAVE_NEW_CANCEL_DIRTY_PAGE 1 +/* DEFINE_TIMER uses only 2 arguements */ +#define HAVE_NEW_DEFINE_TIMER 1 + /* 'kernel_write' aligns with read/write helpers */ #define HAVE_NEW_KERNEL_WRITE 1 /* NR_UNSTABLE_NFS is still in use. */ /* #undef HAVE_NR_UNSTABLE_NFS */ +/* ns_to_timespec64() is available */ +#define HAVE_NS_TO_TIMESPEC64 1 + /* with oldsize */ /* #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE */ +/* openssl-devel is present */ +/* #undef HAVE_OPENSSL_GETSEPOL */ + /* OpenSSL HMAC functions needed for SSK */ /* #undef HAVE_OPENSSL_SSK */ /* 'pagevec_init' takes one parameter */ #define HAVE_PAGEVEC_INIT_ONE_PARAM 1 +/* linux/panic_notifier.h is present */ +/* #undef HAVE_PANIC_NOTIFIER_H */ + +/* 'param_set_uint_minmax' is available */ +/* #undef HAVE_PARAM_SET_UINT_MINMAX */ + /* have PCLMULQDQ instruction */ #define HAVE_PCLMULQDQ 1 @@ -675,6 +753,9 @@ /* posix_acl_valid takes struct user_namespace */ #define HAVE_POSIX_ACL_VALID_USER_NS 1 +/* 'prepare_to_wait_event' is available */ +#define HAVE_PREPARE_TO_WAIT_EVENT 1 + /* struct proc_ops exists */ #define HAVE_PROC_OPS 1 @@ -687,12 +768,18 @@ /* inode->i_nlink is protected from direct modification */ #define HAVE_PROTECT_I_NLINK 1 +/* 'PTR_ERR_OR_ZERO' exist */ +#define HAVE_PTR_ERR_OR_ZERO 1 + /* have quota64 */ #define HAVE_QUOTA64 1 /* radix_tree_exceptional_entry exist */ /* #undef HAVE_RADIX_EXCEPTION_ENTRY */ +/* rdma_connect_locked is defined */ +#define HAVE_RDMA_CONNECT_LOCKED 1 + /* rdma_create_id wants 4 args */ /* #undef HAVE_RDMA_CREATE_ID_4ARG */ @@ -702,15 +789,24 @@ /* rdma_reject has 4 arguments */ #define HAVE_RDMA_REJECT_4ARGS 1 -/* reinit_completion is exist */ -#define HAVE_REINIT_COMPLETION 1 - /* kernel export remove_from_page_cache */ /* #undef HAVE_REMOVE_FROM_PAGE_CACHE */ /* remove_proc_subtree is defined */ #define HAVE_REMOVE_PROC_SUBTREE 1 +/* rhashtable_lookup() is available */ +#define HAVE_RHASHTABLE_LOOKUP 1 + +/* rhashtable_lookup_get_insert_fast() is available */ +#define HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST 1 + +/* struct rhltable exist */ +#define HAVE_RHLTABLE 1 + +/* 
save_stack_trace_tsk is exported */ +/* #undef HAVE_SAVE_STACK_TRACE_TSK */ + /* Have sa_spill_alloc in ZFS */ /* #undef HAVE_SA_SPILL_ALLOC */ @@ -735,6 +831,9 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ /* #undef HAVE_SECURITY_IINITSEC_QSTR */ +/* security_inode_listsecurity() is available/exported */ +#define HAVE_SECURITY_INODE_LISTSECURITY 1 + /* security_release_secctx has 1 arg. */ /* #undef HAVE_SEC_RELEASE_SECCTX_1ARG */ @@ -778,36 +877,27 @@ /* Have spa_maxblocksize in ZFS */ /* #undef HAVE_SPA_MAXBLOCKSIZE */ -/* spinlock_t is defined */ -/* #undef HAVE_SPINLOCK_T */ - /* struct stacktrace_ops exists */ /* #undef HAVE_STACKTRACE_OPS */ /* stacktrace_ops.warning is exist */ /* #undef HAVE_STACKTRACE_WARNING */ -/* stack_trace_print() exists */ -#define HAVE_STACK_TRACE_PRINT 1 - /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 +/* stringhash.h is present */ +#define HAVE_STRINGHASH 1 + /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 -/* Define to 1 if you have the `strlcat' function. */ -/* #undef HAVE_STRLCAT */ - -/* Define to 1 if you have the `strlcpy' function. */ -/* #undef HAVE_STRLCPY */ - /* Define to 1 if you have the `strnlen' function. */ #define HAVE_STRNLEN 1 @@ -835,9 +925,6 @@ /* ctl_table has ctl_name field */ /* #undef HAVE_SYSCTL_CTLNAME */ -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IOCTL_H 1 - /* Define to 1 if you have . */ #define HAVE_SYS_QUOTA_H 1 @@ -847,6 +934,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 +/* task_is_running() is defined */ +/* #undef HAVE_TASK_IS_RUNNING */ + /* tcp_sendpage use socket as first parameter */ /* #undef HAVE_TCP_SENDPAGE_USE_SOCKET */ @@ -868,9 +958,6 @@ /* 'timespec64_to_ktime' is available */ #define HAVE_TIMESPEC64_TO_KTIME 1 -/* have_time_t */ -/* #undef HAVE_TIME_T */ - /* topology_sibling_cpumask is available */ #define HAVE_TOPOLOGY_SIBLING_CPUMASK 1 @@ -922,9 +1009,18 @@ /* 'struct vm_operations' remove struct vm_area_struct argument */ #define HAVE_VM_OPS_USE_VM_FAULT_ONLY 1 +/* wait_bit.h is present */ +#define HAVE_WAIT_BIT_HEADER_H 1 + /* 'wait_queue_entry_t' is available */ #define HAVE_WAIT_QUEUE_ENTRY 1 +/* linux wait_queue_head_t list_head is name head */ +#define HAVE_WAIT_QUEUE_ENTRY_LIST 1 + +/* 'wait_var_event' is available */ +#define HAVE_WAIT_VAR_EVENT 1 + /* flags field exist */ #define HAVE_XATTR_HANDLER_FLAGS 1 @@ -949,9 +1045,18 @@ /* Have zap_remove_by_dnode() in ZFS */ /* #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE */ +/* Have inode_timespec_t */ +/* #undef HAVE_ZFS_INODE_TIMESPEC */ + +/* Have multihost protection in ZFS */ +/* #undef HAVE_ZFS_MULTIHOST */ + /* Enable zfs osd */ /* #undef HAVE_ZFS_OSD */ +/* Have zfs_refcount_add */ +/* #undef HAVE_ZFS_REFCOUNT_ADD */ + /* __add_wait_queue_exclusive exists */ /* #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE */ @@ -978,13 +1083,13 @@ #define LUSTRE_MAJOR 2 /* Second number in the Lustre version */ -#define LUSTRE_MINOR 10 +#define LUSTRE_MINOR 12 /* Third number in the Lustre version */ #define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.10.8-11" +#define LUSTRE_VERSION_STRING "2.12.8" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -1001,6 +1106,9 @@ /* need pclmulqdq based crc32 */ /* #undef NEED_CRC32_ACCEL */ +/* 'ktime_get_ns' is not 
available */ +/* #undef NEED_KTIME_GET_NS */ + /* 'ktime_get_real_ns' is not available */ /* #undef NEED_KTIME_GET_REAL_NS */ @@ -1017,7 +1125,7 @@ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Lustre 2.10.8-11" +#define PACKAGE_STRING "Lustre 2.12.8" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -1026,14 +1134,11 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.10.8-11" +#define PACKAGE_VERSION "2.12.8" /* name of parallel fsck program */ #define PFSCK "fsck" -/* proc handler methods use __user */ -/* #undef PROC_HANDLER_USE_USER_ATTR */ - /* enable randomly alloc failure */ #define RANDOM_FAIL_ALLOC 1 @@ -1070,16 +1175,16 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.10.8-11" +#define VERSION "2.12.8" /* zfs fix version */ -/* #undef ZFS_FIX */ +#define ZFS_FIX 0 /* zfs major version */ -/* #undef ZFS_MAJOR */ +#define ZFS_MAJOR /* zfs minor version */ -/* #undef ZFS_MINOR */ +#define ZFS_MINOR /* zfs patch version */ -/* #undef ZFS_PATCH */ +#define ZFS_PATCH diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h index 28472601ed4df..1763da296244d 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -32,6 +32,9 @@ #ifndef _LIBCFS_BITMAP_H_ #define _LIBCFS_BITMAP_H_ +#include +#include + struct cfs_bitmap { unsigned int size; unsigned long data[0]; diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h index e9e0cc2109034..0f00c7219e75d 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/curproc.h @@ -39,6 +39,10 @@ #ifndef __LIBCFS_CURPROC_H__ #define __LIBCFS_CURPROC_H__ +/* check if task is running in compat mode.*/ +#define current_pid() (current->pid) +#define current_comm() (current->comm) + typedef __u32 cfs_cap_t; #define CFS_CAP_CHOWN 0 diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h index f01170c6e1d97..9ae7b8405a94b 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,28 @@ #ifndef __LIBCFS_LIBCFS_H__ #define __LIBCFS_LIBCFS_H__ -#ifdef __KERNEL__ -# include -# include "curproc.h" +#include +#include -#define LIBCFS_VERSION "0.5.0" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "curproc.h" + +#define LIBCFS_VERSION "0.7.1" #define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2))) #define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) @@ -82,15 +99,19 @@ void lc_watchdog_delete(struct lc_watchdog *lcw); #endif #endif +typedef s32 timeout_t; + /* need both kernel and user-land acceptor */ #define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 #define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 -/* - * Drop into debugger, if possible. Implementation is provided by platform. - */ - -void cfs_enter_debugger(void); +extern struct blocking_notifier_head libcfs_ioctl_list; +static inline int notifier_from_ioctl_errno(int err) +{ + if (err == -EINVAL) + return NOTIFY_OK; + return notifier_from_errno(err) | NOTIFY_STOP_MASK; +} /* * Defined by platform @@ -111,21 +132,6 @@ unsigned int cfs_rand(void); /* seed the generator */ void cfs_srand(unsigned int, unsigned int); void cfs_get_random_bytes(void *buf, int size); -#endif /* __KERNEL__ */ - -#include -#ifdef __KERNEL__ -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); int libcfs_ioctl(unsigned long cmd, void __user *uparam); @@ -139,12 +145,30 @@ static inline void *__container_of(const void *ptr, unsigned long shift) return (char *)ptr - shift; } -#define container_of0(ptr, type, member) \ +#define container_of0(ptr, type, member) \ ((type *)__container_of((ptr), offsetof(type, member))) -#endif /* __KERNEL__ */ +struct lnet_debugfs_symlink_def { + const char *name; + const char *target; +}; + +void lnet_insert_debugfs(struct ctl_table *table); +void lnet_remove_debugfs(struct ctl_table *table); + +/* helper for sysctl handlers */ +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)); +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); /* atomic-context safe vfree */ +#ifdef HAVE_LIBCFS_VFREE_ATOMIC void libcfs_vfree_atomic(const void *addr); +#else +#define libcfs_vfree_atomic(ptr) vfree(ptr) +#endif #endif /* _LIBCFS_LIBCFS_H_ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h index 9fd28ce749cfe..4620dcc08cf80 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_cpu.h @@ -13,17 +13,12 @@ * General Public License version 2 for more details (a copy is included * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA - * * GPL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
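The libcfs.h hunk above replaces the old direct ioctl hooks with a blocking notifier chain: handlers register on libcfs_ioctl_list and return notifier_from_ioctl_errno(), so -EINVAL means "not my command, keep walking the chain" while any other result stops the walk and carries the errno back to the caller. A minimal sketch of a handler written against that contract; the command value and all demo_* names are invented, and the patched libcfs headers plus <linux/notifier.h> are assumed to be included:

#include <linux/notifier.h>
#include <linux/errno.h>

#define DEMO_IOC_PING	0x1234		/* hypothetical ioctl command */

static int demo_ioctl_handler(struct notifier_block *nb,
			      unsigned long cmd, void *vdata)
{
	int rc;

	if (cmd == DEMO_IOC_PING)
		rc = 0;		/* handled: chain stops, 0 is returned */
	else
		rc = -EINVAL;	/* unknown command: let other handlers try */

	return notifier_from_ioctl_errno(rc);
}

static struct notifier_block demo_ioctl_nb = {
	.notifier_call	= demo_ioctl_handler,
};

/* registration, e.g. from module init:
 *	blocking_notifier_chain_register(&libcfs_ioctl_list, &demo_ioctl_nb);
 */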
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,16 +42,16 @@ * * Example: if there are 8 cores on the system, while creating a CPT * with cpu_npartitions=4: - * core[0, 1] = partition[0], core[2, 3] = partition[1] - * core[4, 5] = partition[2], core[6, 7] = partition[3] + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] * * cpu_npartitions=1: - * core[0, 1, ... 7] = partition[0] + * core[0, 1, ... 7] = partition[0] * * . User can also specify CPU partitions by string pattern * * Examples: cpu_partitions="0[0,1], 1[2,3]" - * cpu_partitions="N 0[0-3], 1[4-8]" + * cpu_partitions="N 0[0-3], 1[4-8]" * * The first character "N" means following numbers are numa ID * @@ -76,21 +71,56 @@ #ifndef __LIBCFS_CPU_H__ #define __LIBCFS_CPU_H__ -#ifndef HAVE_LIBCFS_CPT +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SMP + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* NUMA distance between CPTs */ + unsigned int *cpt_distance; + /* spread rotor for NUMA allocator */ + unsigned int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; +}; +#endif /* CONFIG_SMP */ +/** descriptor for CPU partitions */ struct cfs_cpt_table { +#ifdef CONFIG_SMP + /* spread rotor for NUMA allocator */ + unsigned int ctb_spread_rotor; + /* maximum NUMA distance between all nodes in table */ + unsigned int ctb_distance; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* shadow HW node to CPU partition ID */ + int *ctb_node2cpt; /* # of CPU partitions */ - int ctb_nparts; - /* cpu mask */ - cpumask_t ctb_mask; - /* node mask */ - nodemask_t ctb_nodemask; - /* version */ - __u64 ctb_version; + int ctb_nparts; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +#else + nodemask_t ctb_nodemask; +#endif /* CONFIG_SMP */ + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; }; -#endif /* !HAVE_LIBCFS_CPT */ - /* any CPU partition */ #define CFS_CPT_ANY (-1) @@ -117,7 +147,7 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len); */ int cfs_cpt_number(struct cfs_cpt_table *cptab); /** - * return number of HW cores or hypter-threadings in a CPU partition \a cpt + * return number of HW cores or hyper-threadings in a CPU partition \a cpt */ int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); /** @@ -147,13 +177,13 @@ int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node); /** * NUMA distance between \a cpt1 and \a cpt2 in \a cptab */ -unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); /** * bind current thread on a CPU-partition \a cpt of \a cptab */ int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); /** - * add \a cpu to CPU partion @cpt of \a cptab, return 1 for success, + * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, * otherwise 0 is returned */ int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); @@ -165,7 +195,6 @@ void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); * add all cpus in \a mask to CPU partition \a cpt * return 1 if successfully set all CPUs, otherwise return 0 */ - int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int 
cpt, const cpumask_t *mask); /** @@ -203,15 +232,15 @@ int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); /* * allocate per-cpu-partition data, returned value is an array of pointers, * variable can be indexed by CPU ID. - * cptab != NULL: size of array is number of CPU partitions - * cptab == NULL: size of array is number of HW cores + * cptab != NULL: size of array is number of CPU partitions + * cptab == NULL: size of array is number of HW cores */ void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); /* - * destory per-cpu-partition variable + * destroy per-cpu-partition variable */ -void cfs_percpt_free(void *vars); -int cfs_percpt_number(void *vars); +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); #define cfs_percpt_for_each(var, i, vars) \ for (i = 0; i < cfs_percpt_number(vars) && \ @@ -260,16 +289,17 @@ void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); /* lock private lock \a index of \a pcl */ void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); + /* unlock private lock \a index of \a pcl */ void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); -#define CFS_PERCPT_LOCK_KEYS 256 +#define CFS_PERCPT_LOCK_KEYS 256 /* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ #define cfs_percpt_lock_alloc(cptab) \ ({ \ - static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ - struct cfs_percpt_lock *___lk; \ + static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ + struct cfs_percpt_lock *___lk; \ \ if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ ___lk = cfs_percpt_lock_create(cptab, NULL); \ @@ -338,14 +368,6 @@ cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, #define cfs_cpt_for_each(i, cptab) \ for (i = 0; i < cfs_cpt_number(cptab); i++) -#ifndef __read_mostly -# define __read_mostly -#endif - -#ifndef ____cacheline_aligned -#define ____cacheline_aligned -#endif - int cfs_cpu_init(void); void cfs_cpu_fini(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h index ea9234abc7f76..8271306ce6019 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_crypto.h @@ -38,6 +38,12 @@ struct cfs_crypto_hash_type { unsigned int cht_size; /**< hash digest size */ }; +struct cfs_crypto_crypt_type { + char *cct_name; /**< crypto algorithm name, equal to + * format name for crypto api */ + unsigned int cct_size; /**< crypto key size */ +}; + enum cfs_crypto_hash_alg { CFS_HASH_ALG_NULL = 0, CFS_HASH_ALG_ADLER32, @@ -54,6 +60,13 @@ enum cfs_crypto_hash_alg { CFS_HASH_ALG_UNKNOWN = 0xff }; +enum cfs_crypto_crypt_alg { + CFS_CRYPT_ALG_NULL = 0, + CFS_CRYPT_ALG_AES256_CTR, + CFS_CRYPT_ALG_MAX, + CFS_CRYPT_ALG_UNKNOWN = 0xff +}; + static struct cfs_crypto_hash_type hash_types[] = { [CFS_HASH_ALG_NULL] = { .cht_name = "null", @@ -107,6 +120,17 @@ static struct cfs_crypto_hash_type hash_types[] = { } }; +static struct cfs_crypto_crypt_type crypt_types[] = { + [CFS_CRYPT_ALG_NULL] = { + .cct_name = "null", + .cct_size = 0 + }, + [CFS_CRYPT_ALG_AES256_CTR] = { + .cct_name = "ctr(aes)", + .cct_size = 32 + } +}; + /* Maximum size of hash_types[].cht_size */ #define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 @@ -188,24 +212,103 @@ static inline unsigned char cfs_crypto_hash_alg(const char *algname) return CFS_HASH_ALG_UNKNOWN; } +/** + * Return crypt algorithm information for the specified 
algorithm identifier + * + * Crypt information includes algorithm name, key size. + * + * \retval cfs_crypto_crupt_type for valid ID (CFS_CRYPT_ALG_*) + * \retval NULL for unknown algorithm identifier + */ +static inline const struct +cfs_crypto_crypt_type *cfs_crypto_crypt_type( + enum cfs_crypto_crypt_alg crypt_alg) +{ + struct cfs_crypto_crypt_type *ct; + + if (crypt_alg < CFS_CRYPT_ALG_MAX) { + ct = &crypt_types[crypt_alg]; + if (ct->cct_name != NULL) + return ct; + } + return NULL; +} + +/** + * Return crypt name for crypt algorithm identifier + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval string name of known crypt algorithm + * \retval "unknown" if hash algorithm is unknown + */ +static inline const +char *cfs_crypto_crypt_name(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct) + return ct->cct_name; + + return "unknown"; +} + + +/** + * Return key size for crypto algorithm type + * + * \param[in] crypt_alg crypt alrgorithm id (CFS_CRYPT_ALG_*) + * + * \retval crypt algorithm key size in bytes + * \retval 0 if crypt algorithm type is unknown + */ +static inline +unsigned int cfs_crypto_crypt_keysize(enum cfs_crypto_crypt_alg crypt_alg) +{ + const struct cfs_crypto_crypt_type *ct; + + ct = cfs_crypto_crypt_type(crypt_alg); + if (ct != NULL) + return ct->cct_size; + + return 0; +} + +/** + * Find crypto algorithm ID for the specified algorithm name + * + * \retval crypto algorithm ID for valid ID (CFS_CRYPT_ALG_*) + * \retval CFS_CRYPT_ALG_UNKNOWN for unknown algorithm name + */ +static inline unsigned char cfs_crypto_crypt_alg(const char *algname) +{ + enum cfs_crypto_crypt_alg crypt_alg; + + for (crypt_alg = 0; crypt_alg < CFS_CRYPT_ALG_MAX; crypt_alg++) + if (strcmp(crypt_types[crypt_alg].cct_name, algname) == 0) + return crypt_alg; + + return CFS_CRYPT_ALG_UNKNOWN; +} + int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, const void *buf, unsigned int buf_len, unsigned char *key, unsigned int key_len, unsigned char *hash, unsigned int *hash_len); /* cfs crypto hash descriptor */ -struct cfs_crypto_hash_desc; struct page; -struct cfs_crypto_hash_desc * +struct ahash_request * cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, unsigned char *key, unsigned int key_len); -int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, +int cfs_crypto_hash_update_page(struct ahash_request *req, struct page *page, unsigned int offset, unsigned int len); -int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, +int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, unsigned int buf_len); -int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, +int cfs_crypto_hash_final(struct ahash_request *req, unsigned char *hash, unsigned int *hash_len); int cfs_crypto_register(void); void cfs_crypto_unregister(void); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h index 2eb6b7aa57d9c..ac89d2cb60b55 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_debug.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
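The crypt_types[] table and its helpers added to libcfs_crypto.h above give a name/ID/key-size mapping analogous to the existing hash_types[] handling. A small usage sketch, assuming the patched libcfs_crypto.h is included; demo_pick_cipher() is an invented function, and the only non-null entry in the table is CFS_CRYPT_ALG_AES256_CTR ("ctr(aes)", 32-byte key):

static int demo_pick_cipher(void)
{
	enum cfs_crypto_crypt_alg alg;
	unsigned int keysize;

	/* map the crypto-API format name back to a libcfs identifier */
	alg = cfs_crypto_crypt_alg("ctr(aes)");
	if (alg == CFS_CRYPT_ALG_UNKNOWN)
		return -EOPNOTSUPP;

	/* 32 bytes for CFS_CRYPT_ALG_AES256_CTR per crypt_types[] */
	keysize = cfs_crypto_crypt_keysize(alg);

	pr_info("using %s with a %u-byte key\n",
		cfs_crypto_crypt_name(alg), keysize);
	return 0;
}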
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,6 +38,10 @@ #ifndef __LIBCFS_DEBUG_H__ #define __LIBCFS_DEBUG_H__ +#include +#include +#include + /* * Debugging */ @@ -60,112 +64,6 @@ int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); extern unsigned int libcfs_catastrophe; extern unsigned int libcfs_panic_on_lbug; -/** - * Format for debug message headers - */ -struct ptldebug_header { - __u32 ph_len; - __u32 ph_flags; - __u32 ph_subsys; - __u32 ph_mask; - __u16 ph_cpu_id; - __u16 ph_type; - /* time_t overflow in 2106 */ - __u32 ph_sec; - __u64 ph_usec; - __u32 ph_stack; - __u32 ph_pid; - __u32 ph_extern_pid; - __u32 ph_line_num; -} __attribute__((packed)); - - -#define PH_FLAG_FIRST_RECORD 1 - -/* Debugging subsystems (32 bits, non-overlapping) */ -#define S_UNDEFINED 0x00000001 -#define S_MDC 0x00000002 -#define S_MDS 0x00000004 -#define S_OSC 0x00000008 -#define S_OST 0x00000010 -#define S_CLASS 0x00000020 -#define S_LOG 0x00000040 -#define S_LLITE 0x00000080 -#define S_RPC 0x00000100 -#define S_MGMT 0x00000200 -#define S_LNET 0x00000400 -#define S_LND 0x00000800 /* ALL LNDs */ -#define S_PINGER 0x00001000 -#define S_FILTER 0x00002000 -/* unused */ -#define S_ECHO 0x00008000 -#define S_LDLM 0x00010000 -#define S_LOV 0x00020000 -#define S_LQUOTA 0x00040000 -#define S_OSD 0x00080000 -#define S_LFSCK 0x00100000 -#define S_SNAPSHOT 0x00200000 -/* unused */ -#define S_LMV 0x00800000 /* b_new_cmd */ -/* unused */ -#define S_SEC 0x02000000 /* upcall cache */ -#define S_GSS 0x04000000 /* b_new_cmd */ -/* unused */ -#define S_MGC 0x10000000 -#define S_MGS 0x20000000 -#define S_FID 0x40000000 /* b_new_cmd */ -#define S_FLD 0x80000000 /* b_new_cmd */ - -#define LIBCFS_DEBUG_SUBSYS_NAMES { \ - "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ - "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "", \ - "echo", "ldlm", "lov", "lquota", "osd", "lfsck", "snapshot", "",\ - "lmv", "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL } - -/* Debugging masks (32 bits, non-overlapping) */ -#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ -#define D_INODE 0x00000002 -#define D_SUPER 0x00000004 -#define D_EXT2 0x00000008 /* anything from ext2_debug */ -#define D_MALLOC 0x00000010 /* print malloc, free information */ -#define D_CACHE 0x00000020 /* cache-related items */ -#define D_INFO 0x00000040 /* general information */ -#define D_IOCTL 0x00000080 /* ioctl related information */ -#define D_NETERROR 0x00000100 /* network errors */ -#define D_NET 0x00000200 /* network communications */ -#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ -#define D_BUFFS 0x00000800 -#define D_OTHER 0x00001000 -#define D_DENTRY 0x00002000 -#define D_NETTRACE 0x00004000 -#define D_PAGE 0x00008000 /* bulk page handling */ -#define D_DLMTRACE 0x00010000 -#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ -#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ -#define D_HA 0x00080000 /* recovery and failover */ -#define D_RPCTRACE 0x00100000 /* for distributed debugging */ -#define D_VFSTRACE 0x00200000 -#define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 -#define D_CONFIG 0x01000000 -#define D_CONSOLE 0x02000000 -#define D_QUOTA 0x04000000 -#define D_SEC 0x08000000 -#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ -#define D_HSM 0x20000000 -#define D_SNAPSHOT 0x40000000 /* snapshot */ -#define D_LAYOUT 0x80000000 - -#define LIBCFS_DEBUG_MASKS_NAMES { \ - "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ - "ioctl", "neterror", "net", "warning", "buffs", "other", \ - "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ - "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ - "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ - NULL } - -#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) - #ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED #endif @@ -207,9 +105,38 @@ do { \ .msg_cdls = (cdls) }; \ dataname.msg_mask = (mask); -#ifdef __KERNEL__ +#ifdef CDEBUG_ENABLED -# ifdef CDEBUG_ENABLED +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ /** * Filters out logging messages based on mask and subsystem. @@ -251,22 +178,6 @@ static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) # warning "CDEBUG IS DISABLED. THIS SHOULD NEVER BE DONE FOR PRODUCTION!" # endif /* CDEBUG_ENABLED */ -#else /* !__KERNEL__ */ -static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) -{ - return 0; -} -# define CDEBUG(mask, format, ...) \ -do { \ - if (((mask) & D_CANTMASK) != 0) \ - fprintf(stderr, "(%s:%d:%s()) " format, \ - __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__);\ -} while (0) - -# define CDEBUG_LIMIT CDEBUG - -#endif /* __KERNEL__ */ - /* * Lustre Error Checksum: calculates checksum * of Hex number by XORing each bit. @@ -288,7 +199,7 @@ do { \ #define LCONSOLE_EMERG(format, ...) 
CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) -#if defined(CDEBUG_ENTRY_EXIT) && defined(__KERNEL__) +#if defined(CDEBUG_ENTRY_EXIT) void libcfs_log_goto(struct libcfs_debug_msg_data *goto_data, const char *label, long rc); @@ -341,7 +252,7 @@ do { \ # define ENTRY CDEBUG(D_TRACE, "Process entered\n") # define EXIT CDEBUG(D_TRACE, "Process leaving\n") -#else /* !CDEBUG_ENTRY_EXIT || !__KERNEL__ */ +#else /* !CDEBUG_ENTRY_EXIT */ # define GOTO(label, rc) \ do { \ @@ -353,7 +264,7 @@ do { \ # define ENTRY do { } while (0) # define EXIT do { } while (0) -#endif /* CDEBUG_ENTRY_EXIT && __KERNEL__ */ +#endif /* CDEBUG_ENTRY_EXIT */ #define RETURN_EXIT \ do { \ @@ -370,15 +281,15 @@ extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, va_list args, const char *format2, ...) __attribute__ ((format (printf, 4, 5))); -#ifdef __KERNEL__ /* other external symbols that tracefile provides: */ extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, const char __user *usr_buffer, int usr_buffer_nob); extern int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, const char *knl_buffer, char *append); -#endif /* __KERNEL__ */ #define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" +void cfs_debug_init(void); + #endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h index 2af5149be8f69..203e470df88d0 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_fail.h @@ -165,7 +165,7 @@ static inline void cfs_race(__u32 id) CERROR("cfs_race id %x sleeping\n", id); rc = wait_event_interruptible(cfs_race_waitq, cfs_race_state != 0); - CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); } else { CERROR("cfs_fail_race id %x waking\n", id); cfs_race_state = 1; @@ -175,4 +175,42 @@ static inline void cfs_race(__u32 id) } #define CFS_RACE(id) cfs_race(id) +/** + * Wait on race. + * + * The first thread that calls this with a matching fail_loc is put to sleep, + * but subseqent callers of this won't sleep. Until another thread that calls + * cfs_race_wakeup(), the first thread will be woken up and continue. + */ +static inline void cfs_race_wait(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + rc = wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0); + CERROR("cfs_fail_race id %x awake: rc=%d\n", id, rc); + } + } +} +#define CFS_RACE_WAIT(id) cfs_race_wait(id) + +/** + * Wake up the thread that is waiting on the matching fail_loc. 
+ */ +static inline void cfs_race_wakeup(__u32 id) +{ + if (CFS_FAIL_PRECHECK(id)) { + if (likely(!__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE_WAKEUP(id) cfs_race_wakeup(id) + #endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h index 16bda0c460ebf..1001362e75cd0 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_prim.h @@ -36,6 +36,7 @@ #ifndef __LIBCFS_PRIM_H__ #define __LIBCFS_PRIM_H__ +#include #include /* diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h index ebcdc990203b2..9a242839fd843 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_private.h @@ -42,6 +42,9 @@ # define DEBUG_SUBSYSTEM S_UNDEFINED #endif +#include +#include + #ifdef LIBCFS_DEBUG /* @@ -213,8 +216,14 @@ do { \ #define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS) +#ifdef LLIST_HEAD void init_libcfs_vfree_atomic(void); void exit_libcfs_vfree_atomic(void); +#define HAVE_LIBCFS_VFREE_ATOMIC +#else +#define init_libcfs_vfree_atomic() do {} while(0) +#define exit_libcfs_vfree_atomic() do {} while(0) +#endif #define LIBCFS_FREE(ptr, size) \ do { \ @@ -228,7 +237,7 @@ do { \ CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ s, (ptr), libcfs_kmem_read()); \ if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - libcfs_vfree_atomic(ptr); \ + libcfs_vfree_atomic(ptr); \ else \ kfree(ptr); \ } while (0) diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h deleted file mode 100644 index ca40551dfc678..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ptask.h +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef __LIBCFS_PTASK_H__ -#define __LIBCFS_PTASK_H__ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Unconditionaly disable PADATA. - * - * Padata is needed for PIO client feature. This feature is disabled by default - * and was removed from Lustre code during 2.13 development (2b0a34fe43bf). - * Instead of adapting the code to Linux 5.4+ change, just disable it. 
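The cfs_race_wait()/cfs_race_wakeup() pair added to libcfs_fail.h a few hunks above splits the old CFS_RACE() handshake in two, so a test can park one thread on a fail_loc and release it from a precise point in another thread. A sketch of that usage, assuming the fail_loc has been armed through the usual fail_loc tunable and the patched libcfs_fail.h is included; OBD_FAIL_DEMO_RACE and the demo_* functions are invented for illustration:

#define OBD_FAIL_DEMO_RACE	0x0f10	/* hypothetical fail_loc value */

static void demo_thread_a(void)
{
	/* the first caller with the matching, armed fail_loc sleeps here ... */
	CFS_RACE_WAIT(OBD_FAIL_DEMO_RACE);
	/* ... and resumes only after demo_thread_b() ran CFS_RACE_WAKEUP() */
}

static void demo_thread_b(void)
{
	/* reach the interesting state first, then release thread A */
	CFS_RACE_WAKEUP(OBD_FAIL_DEMO_RACE);
}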
- */ -#undef CONFIG_PADATA - -#ifdef CONFIG_PADATA -#include -#else -struct padata_priv {}; -struct padata_instance {}; -#endif - -#define PTF_COMPLETE BIT(0) -#define PTF_AUTOFREE BIT(1) -#define PTF_ORDERED BIT(2) -#define PTF_USER_MM BIT(3) -#define PTF_ATOMIC BIT(4) -#define PTF_RETRY BIT(5) - -struct cfs_ptask_engine { - struct padata_instance *pte_pinst; - struct workqueue_struct *pte_wq; - struct notifier_block pte_notifier; - int pte_weight; -}; - -struct cfs_ptask; -typedef int (*cfs_ptask_cb_t)(struct cfs_ptask *); - -struct cfs_ptask { - struct padata_priv pt_padata; - struct completion pt_completion; - struct mm_struct *pt_mm; - unsigned int pt_flags; - int pt_cbcpu; - cfs_ptask_cb_t pt_cbfunc; - void *pt_cbdata; - int pt_result; -}; - -static inline -struct padata_priv *cfs_ptask2padata(struct cfs_ptask *ptask) -{ - return &ptask->pt_padata; -} - -static inline -struct cfs_ptask *cfs_padata2ptask(struct padata_priv *padata) -{ - return container_of(padata, struct cfs_ptask, pt_padata); -} - -static inline -bool cfs_ptask_need_complete(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_COMPLETE; -} - -static inline -bool cfs_ptask_is_autofree(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_AUTOFREE; -} - -static inline -bool cfs_ptask_is_ordered(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_ORDERED; -} - -static inline -bool cfs_ptask_use_user_mm(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_USER_MM; -} - -static inline -bool cfs_ptask_is_atomic(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_ATOMIC; -} - -static inline -bool cfs_ptask_is_retry(struct cfs_ptask *ptask) -{ - return ptask->pt_flags & PTF_RETRY; -} - -static inline -int cfs_ptask_result(struct cfs_ptask *ptask) -{ - return ptask->pt_result; -} - -struct cfs_ptask_engine *cfs_ptengine_init(const char *, const struct cpumask *); -void cfs_ptengine_fini(struct cfs_ptask_engine *); -int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *, const struct cpumask *); -int cfs_ptengine_weight(struct cfs_ptask_engine *); - -int cfs_ptask_submit(struct cfs_ptask *, struct cfs_ptask_engine *); -int cfs_ptask_wait_for(struct cfs_ptask *); -int cfs_ptask_init(struct cfs_ptask *, cfs_ptask_cb_t, void *, - unsigned int, int); - -#endif /* __LIBCFS_PTASK_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h index 3c34071d35774..4d9dbde91e8a0 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_string.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h deleted file mode 100644 index 68947c9792296..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_time.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_time.h - * - * Time functions. - * - */ - -#ifndef __LIBCFS_TIME_H__ -#define __LIBCFS_TIME_H__ - -/* - * generic time manipulation functions. - */ - -static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) -{ - return (cfs_time_t)(t + d); -} - -static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) -{ - return (cfs_time_t)(t1 - t2); -} - -static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2) -{ - return cfs_time_before(t2, t1); -} - -static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2) -{ - return cfs_time_beforeq(t2, t1); -} - -static inline cfs_time_t cfs_time_shift(int seconds) -{ - return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); -} - -#define CFS_TICK 1 - -/* - * return valid time-out based on user supplied one. Currently we only check - * that time-out is not shorted than allowed. - */ -static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout) -{ - if (timeout < CFS_TICK) - timeout = CFS_TICK; - return timeout; -} - -#endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h deleted file mode 100644 index 0f67a87096c0a..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/libcfs.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LIBCFS_LINUX_LIBCFS_H__ -#define __LIBCFS_LINUX_LIBCFS_H__ - -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
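With libcfs_time.h deleted above, the cfs_time_shift()/cfs_time_before() style of deadline handling falls back to the kernel's native jiffies helpers; only cfs_time_seconds() survives, redefined later in linux-time.h. A minimal before/after sketch, with demo_deadline_passed() and handle_timeout() as invented names:

#include <linux/jiffies.h>
#include <linux/types.h>

/* was: cfs_time_before(cfs_time_current(), timeout) on a cfs_time_t */
static bool demo_deadline_passed(unsigned long deadline)
{
	return time_after(jiffies, deadline);
}

/* caller side, replacing cfs_time_shift(30):
 *	unsigned long deadline = jiffies + cfs_time_seconds(30);
 *	...
 *	if (demo_deadline_passed(deadline))
 *		handle_timeout();
 */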
-#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_SCHED_HEADERS -#include -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#if !defined(__x86_64__) -# ifdef __ia64__ -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_dwarf_cfa() & \ - (THREAD_SIZE - 1))) -# else -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_frame_address(0) & \ - (THREAD_SIZE - 1))) -# endif /* __ia64__ */ - -#define __CHECK_STACK(msgdata, mask, cdls) \ -do { \ - if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ - LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ - libcfs_stack = CDEBUG_STACK(); \ - libcfs_debug_msg(msgdata, \ - "maximum lustre stack %lu\n", \ - CDEBUG_STACK()); \ - (msgdata)->msg_mask = mask; \ - (msgdata)->msg_cdls = cdls; \ - dump_stack(); \ - /*panic("LBUG");*/ \ - } \ -} while (0) -#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) -#else /* __x86_64__ */ -#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0) -#define CDEBUG_STACK() (0L) -#endif /* __x86_64__ */ - -/** - * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) - * - * Implementation is in linux-curproc.c - */ -#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm) - -/* helper for sysctl handlers */ -int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, - loff_t pos, void __user *buffer, int len)); - -#ifndef WITH_WATCHDOG -#define WITH_WATCHDOG -#endif - -/* - * Macros to access common characteristics of "current" UNIX process. - */ -#define current_pid() (current->pid) -#define current_comm() (current->comm) - -/* check if task is running in compat mode.*/ -int current_is_32bit(void); - -#endif /* _LINUX_LIBCFS_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h index a46e252466026..918f8daa8f4ca 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-cpu.h @@ -23,7 +23,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,63 +39,15 @@ #ifndef __LIBCFS_LINUX_CPU_H__ #define __LIBCFS_LINUX_CPU_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
-#endif - #include -#include -#include -#include - -#ifdef CONFIG_SMP - -#define HAVE_LIBCFS_CPT - -/** virtual processing unit */ -struct cfs_cpu_partition { - /* CPUs mask for this partition */ - cpumask_t *cpt_cpumask; - /* nodes mask for this partition */ - nodemask_t *cpt_nodemask; - /* NUMA distance between CPTs */ - unsigned *cpt_distance; - /* spread rotor for NUMA allocator */ - int cpt_spread_rotor; - /* NUMA node if cpt_nodemask is empty */ - int cpt_node; -}; - -/** descriptor for CPU partitions */ -struct cfs_cpt_table { - /* spread rotor for NUMA allocator */ - int ctb_spread_rotor; - /* maximum NUMA distance between all nodes in table */ - unsigned ctb_distance; - /* # of CPU partitions */ - int ctb_nparts; - /* partitions tables */ - struct cfs_cpu_partition *ctb_parts; - /* shadow HW CPU to CPU partition ID */ - int *ctb_cpu2cpt; - /* all cpus in this partition table */ - cpumask_t *ctb_cpumask; - /* shadow HW node to CPU partition ID */ - int *ctb_node2cpt; - /* all nodes in this partition table */ - nodemask_t *ctb_nodemask; -}; - -void cfs_cpu_core_siblings(int cpu, cpumask_t *mask); - -#endif /* CONFIG_SMP */ #ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK # define topology_sibling_cpumask(cpu) topology_thread_cpumask(cpu) #endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */ +#ifndef HAVE_CPUS_READ_LOCK +# define cpus_read_lock get_online_cpus +# define cpus_read_unlock put_online_cpus +#endif + #endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h index a9c15a66ab207..6346c59e516e7 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-crypto.h @@ -26,11 +26,6 @@ * Copyright 2012 Xyratex Technology Limited */ -/* Added in v4.15-rc4 (commit a208fa8f3303) */ -#ifndef CRYPTO_ALG_OPTIONAL_KEY -#define CRYPTO_ALG_OPTIONAL_KEY 0x00004000 -#endif - /** * Linux crypto hash specific functions. */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h index dbc84de172146..dd86d1947466b 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-fs.h @@ -37,14 +37,6 @@ #ifndef __LIBCFS_LINUX_CFS_FS_H__ #define __LIBCFS_LINUX_CFS_FS_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
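The linux-cpu.h hunk above maps cpus_read_lock()/cpus_read_unlock() onto get_online_cpus()/put_online_cpus() when the newer names are missing, so hotplug-protected sections can be written once for both old and new kernels. A short sketch of such a section; demo_count_online() is an invented function:

#include <linux/cpu.h>
#include <linux/cpumask.h>

static int demo_count_online(void)
{
	int cpu, n = 0;

	cpus_read_lock();		/* get_online_cpus() on older kernels */
	for_each_online_cpu(cpu)
		n++;
	cpus_read_unlock();		/* put_online_cpus() on older kernels */

	return n;
}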
-#endif - #include #include #include @@ -58,6 +50,10 @@ static inline struct dentry *file_dentry(const struct file *file) } #endif +#ifndef QSTR_INIT +#define QSTR_INIT(n, l) { .len = l, .name = n } +#endif + #if defined(HAVE_FILE_FSYNC_4ARGS) || defined(HAVE_FILE_FSYNC_2ARGS) #define ll_vfs_fsync_range(fp, start, end, datasync) \ vfs_fsync_range(fp, start, end, datasync) @@ -66,15 +62,6 @@ static inline struct dentry *file_dentry(const struct file *file) vfs_fsync_range(fp, file_dentry(fp), start, end, datasync) #endif -#define flock_type(fl) ((fl)->fl_type) -#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0) -#define flock_pid(fl) ((fl)->fl_pid) -#define flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while (0) -#define flock_start(fl) ((fl)->fl_start) -#define flock_set_start(fl, st) do { (fl)->fl_start = (st); } while (0) -#define flock_end(fl) ((fl)->fl_end) -#define flock_set_end(fl, end) do { (fl)->fl_end = (end); } while (0) - #ifndef IFSHIFT #define IFSHIFT 12 #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h new file mode 100644 index 0000000000000..2721655306bbe --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-hash.h @@ -0,0 +1,247 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef __LIBCFS_LINUX_HASH_H__ +#define __LIBCFS_LINUX_HASH_H__ + +#include + +u64 cfs_hashlen_string(const void *salt, const char *name); + +#ifndef hashlen_hash +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#endif + +#ifndef HAVE_STRINGHASH +#ifndef hashlen_create +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +#endif +#endif /* !HAVE_STRINGHASH */ + +#ifdef HAVE_LINUX_RHASHTABLE_H +#include + +#ifndef HAVE_RHLTABLE +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct rhltable { + struct rhashtable ht; +}; + +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + +static inline int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params) +{ + return rhashtable_init(&hlt->ht, params); +} + +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable *ht = &hlt->ht; + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? 
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} + +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ +#ifdef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + return __rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params); +#else + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params)); +#endif +} + +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return rhashtable_remove_fast(&hlt->ht, &list->rhead, params); +} + +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + rhltable_free_and_destroy(hlt, NULL, NULL); +} + +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + rhashtable_walk_init(&hlt->ht, iter); +} +#endif /* !HAVE_RHLTABLE */ + +#ifdef HAVE_BROKEN_HASH_64 + +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + +static inline u32 cfs_hash_32(u32 val, unsigned int bits) +{ + /* High bits are more random, so use them. */ + return (val * GOLDEN_RATIO_32) >> (32 - bits); +} + +static __always_inline u32 cfs_hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return cfs_hash_32(((u32)val ^ ((val >> 32) * GOLDEN_RATIO_32)), bits); +#endif +} +#else + +#define cfs_hash_32 hash_32 +#define cfs_hash_64 hash_64 + +#endif /* HAVE_BROKEN_HASH_64 */ + +#ifndef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key; + void *ret; + int rc; + + rc = rhashtable_lookup_insert_fast(ht, obj, params); + switch (rc) { + case -EEXIST: + key = rht_obj(ht, obj); + ret = rhashtable_lookup_fast(ht, key, params); + break; + case 0: + ret = NULL; + break; + default: + ret = ERR_PTR(rc); + break; + } + return ret; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST */ + +#ifndef HAVE_RHASHTABLE_LOOKUP +/* + * The function rhashtable_lookup() and rhashtable_lookup_fast() + * are almost the same except rhashtable_lookup() doesn't + * take the RCU read lock. Since this is the case and only + * SLES12 SP3 lacks rhashtable_lookup() just duplicate the + * SLES12 SP3 rhashtable_lookup_fast() minus the RCU read lock. 
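The compat rhashtable_lookup_get_insert_fast() above documents a three-way return: the existing object on a duplicate key, NULL when the new entry was inserted, or an ERR_PTR on failure. A sketch of an insert-or-lookup helper built on that contract; struct demo_obj, demo_params and demo_insert_or_get() are invented for illustration:

#include <linux/rhashtable.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/err.h>

struct demo_obj {
	u32			id;	/* hash key */
	struct rhash_head	node;
};

static const struct rhashtable_params demo_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct demo_obj, id),
	.head_offset	= offsetof(struct demo_obj, node),
};

/* Returns the object now present in @ht for obj->id: either @obj itself
 * (fresh insert) or the previously inserted duplicate; ERR_PTR on error. */
static struct demo_obj *demo_insert_or_get(struct rhashtable *ht,
					   struct demo_obj *obj)
{
	struct demo_obj *old;

	old = rhashtable_lookup_get_insert_fast(ht, &obj->node, demo_params);
	if (IS_ERR(old))
		return old;	/* e.g. -ENOMEM */
	if (old)
		return old;	/* key already present, @obj not inserted */
	return obj;		/* NULL means @obj was inserted */
}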
+ */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + const struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + rht_for_each_rcu(he, tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return rht_obj(ht, he); + } + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} +#endif /* !HAVE_RHASHTABLE_LOOKUP */ +#else +#define rhashtable_init(ht, param) 0 +#define rhashtable_destroy(ht) do {} while (0) +#endif /* HAVE_LINUX_RHASHTABLE_H */ + +#endif /* __LIBCFS_LINUX_HASH_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h index f08d623bd8a84..81e79dbf24852 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-mem.h @@ -37,14 +37,6 @@ #ifndef __LIBCFS_LINUX_CFS_MEM_H__ #define __LIBCFS_LINUX_CFS_MEM_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. -#endif - #include #include #include diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h index 754f183050485..a55697b2cfbfe 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-misc.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,7 +34,10 @@ #define __LIBCFS_LINUX_MISC_H__ #include +#include +#include #include +#include #ifdef HAVE_SYSCTL_CTLNAME #define INIT_CTL_NAME .ctl_name = CTL_UNNUMBERED, @@ -60,8 +63,8 @@ #endif #endif /* HAVE_IOV_ITER_TYPE */ -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -bool selinux_is_enabled(void); +#ifndef HAVE_MODULE_PARAM_LOCKING +static DEFINE_MUTEX(param_lock); #endif #ifndef HAVE_UIDGID_HEADER @@ -128,13 +131,41 @@ static inline bool gid_valid(kgid_t gid) int cfs_get_environ(const char *key, char *value, int *val_len); -#ifndef HAVE_WAIT_QUEUE_ENTRY -#define wait_queue_entry_t wait_queue_t -#endif - int cfs_kernel_write(struct file *filp, const void *buf, size_t count, loff_t *pos); +/* + * For RHEL6 struct kernel_parm_ops doesn't exist. Also + * the arguments for .set and .get take different + * parameters which is handled below + */ +#ifdef HAVE_KERNEL_PARAM_OPS +#define cfs_kernel_param_arg_t const struct kernel_param +#else +#define cfs_kernel_param_arg_t struct kernel_param_ops +#define kernel_param_ops kernel_param +#endif /* ! 
HAVE_KERNEL_PARAM_OPS */ + +#ifndef HAVE_KERNEL_PARAM_LOCK +static inline void kernel_param_unlock(struct module *mod) +{ +#ifndef HAVE_MODULE_PARAM_LOCKING + mutex_unlock(¶m_lock); +#else + __kernel_param_unlock(); +#endif +} + +static inline void kernel_param_lock(struct module *mod) +{ +#ifndef HAVE_MODULE_PARAM_LOCKING + mutex_lock(¶m_lock); +#else + __kernel_param_lock(); +#endif +} +#endif /* ! HAVE_KERNEL_PARAM_LOCK */ + #ifndef HAVE_KSTRTOUL static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) { @@ -147,4 +178,27 @@ static inline int kstrtoul(const char *s, unsigned int base, unsigned long *res) } #endif /* !HAVE_KSTRTOUL */ +#ifndef HAVE_KSTRTOBOOL_FROM_USER + +#define kstrtobool strtobool + +int kstrtobool_from_user(const char __user *s, size_t count, bool *res); +#endif + +#ifndef HAVE_TASK_IS_RUNNING +#define task_is_running(task) (task->state == TASK_RUNNING) +#endif + +#ifdef HAVE_KALLSYMS_LOOKUP_NAME +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return (void *)kallsyms_lookup_name(name); +} +#else +static inline void *cfs_kallsyms_lookup_name(const char *name) +{ + return NULL; +} +#endif + #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h index a805ff9aedf84..3934635dcd322 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-time.h @@ -39,54 +39,13 @@ #ifndef __LIBCFS_LINUX_LINUX_TIME_H__ #define __LIBCFS_LINUX_LINUX_TIME_H__ -#ifndef __LIBCFS_LIBCFS_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifndef __KERNEL__ -#error This include is only for kernel use. -#endif - /* Portable time API */ - -/* - * Platform provides three opaque data-types: - * - * cfs_time_t represents point in time. This is internal kernel - * time rather than "wall clock". This time bears no - * relation to gettimeofday(). - * - * cfs_duration_t represents time interval with resolution of internal - * platform clock - * - * cfs_time_t cfs_time_current(void); - * cfs_time_t cfs_time_add (cfs_time_t, cfs_duration_t); - * cfs_duration_t cfs_time_sub (cfs_time_t, cfs_time_t); - * int cfs_impl_time_before (cfs_time_t, cfs_time_t); - * int cfs_impl_time_before_eq(cfs_time_t, cfs_time_t); - * - * cfs_duration_t cfs_duration_build(int64_t); - * - * time_t cfs_duration_sec (cfs_duration_t); - * void cfs_duration_usec(cfs_duration_t, struct timeval *); - * void cfs_duration_nsec(cfs_duration_t, struct timespec *); - * - * CFS_TIME_FORMAT - * CFS_DURATION_FORMAT - * - */ - -#define ONE_BILLION ((u_int64_t)1000000000) -#define ONE_MILLION 1000000 - -#ifndef __KERNEL__ -#error This include is only for kernel use. 
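The linux-misc.h compat above hides two differences between kernels: cfs_kernel_param_arg_t papers over the changed type handed to module-parameter .set/.get handlers, and kernel_param_lock()/kernel_param_unlock() fall back to a local mutex or the old __kernel_param_lock() when the per-module API is missing. A sketch of a setter written against those shims; demo_limit and demo_limit_set() are invented names:

#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int demo_limit = 64;	/* hypothetical tunable */

static int demo_limit_set(const char *val, cfs_kernel_param_arg_t *kp)
{
	unsigned long num;
	int rc;

	rc = kstrtoul(val, 0, &num);
	if (rc || num == 0 || num > 4096)
		return -EINVAL;

	kernel_param_lock(THIS_MODULE);		/* compat shim where needed */
	demo_limit = num;
	kernel_param_unlock(THIS_MODULE);

	return 0;
}

/* hook-up, e.g.:
 *	module_param_call(demo_limit, demo_limit_set, param_get_uint,
 *			  &demo_limit, 0644);
 */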
-#endif - +#include #include #include #include #include +#include #include #include #include @@ -94,10 +53,6 @@ /* * Generic kernel stuff */ - -typedef unsigned long cfs_time_t; /* jiffies */ -typedef long cfs_duration_t; - #ifndef HAVE_TIMESPEC64 typedef __s64 time64_t; @@ -143,22 +98,23 @@ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts6 #endif /* HAVE_TIMESPEC64 */ -#ifndef HAVE_TIME_T -typedef __kernel_old_time_t time_t; -#endif - -#ifndef HAVE_JIFFIES_TO_TIMESPEC64 -static inline void -jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) +#ifndef HAVE_NS_TO_TIMESPEC64 +static inline struct timespec64 ns_to_timespec64(const s64 nsec) { - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_nsec = rem; + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; } #endif @@ -207,6 +163,13 @@ time64_t ktime_get_real_seconds(void); time64_t ktime_get_seconds(void); #endif /* HAVE_KTIME_GET_SECONDS */ +#ifdef NEED_KTIME_GET_NS +static inline u64 ktime_get_ns(void) +{ + return ktime_to_ns(ktime_get()); +} +#endif /* NEED_KTIME_GET_NS */ + #ifdef NEED_KTIME_GET_REAL_NS static inline u64 ktime_get_real_ns(void) { @@ -214,6 +177,13 @@ static inline u64 ktime_get_real_ns(void) } #endif /* NEED_KTIME_GET_REAL_NS */ +#ifndef HAVE_KTIME_MS_DELTA +static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) +{ + return ktime_to_ms(ktime_sub(later, earlier)); +} +#endif /* HAVE_KTIME_MS_DELTA */ + #ifndef HAVE_KTIME_TO_TIMESPEC64 static inline struct timespec64 ktime_to_timespec64(ktime_t kt) { @@ -242,79 +212,39 @@ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) } #endif -static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) +static inline unsigned long cfs_time_seconds(time64_t seconds) { - return time_before(t1, t2); + return nsecs_to_jiffies(seconds * NSEC_PER_SEC); } -static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) -{ - return time_before_eq(t1, t2); -} +#ifdef HAVE_NEW_DEFINE_TIMER +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE struct timer_list * +# endif -static inline cfs_time_t cfs_time_current(void) -{ - return jiffies; -} - -static inline time_t cfs_time_current_sec(void) -{ - return ktime_get_real_seconds(); -} - -static inline cfs_duration_t cfs_time_seconds(int seconds) -{ - return ((cfs_duration_t)seconds) * msecs_to_jiffies(MSEC_PER_SEC); -} - -static inline time_t cfs_duration_sec(cfs_duration_t d) -{ - return d / msecs_to_jiffies(MSEC_PER_SEC); -} - -#define cfs_time_current_64 get_jiffies_64 - -static inline __u64 cfs_time_add_64(__u64 t, __u64 d) -{ - return t + d; -} - -static inline __u64 cfs_time_shift_64(int seconds) -{ - return cfs_time_add_64(cfs_time_current_64(), - cfs_time_seconds(seconds)); -} - -static inline int cfs_time_before_64(__u64 t1, __u64 t2) -{ - return (__s64)t2 - (__s64)t1 > 0; -} +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + DEFINE_TIMER((_name), (_function)) +#else +# ifndef TIMER_DATA_TYPE +# define TIMER_DATA_TYPE unsigned long +# endif -static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) -{ - return (__s64)t2 - (__s64)t1 >= 0; -} +#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ + 
DEFINE_TIMER((_name), (_function), (_expires), (_data)) +#endif -/* - * One jiffy - */ -#define CFS_DURATION_T "%ld" #ifdef HAVE_TIMER_SETUP #define cfs_timer_cb_arg_t struct timer_list * #define cfs_from_timer(var, callback_timer, timer_fieldname) \ from_timer(var, callback_timer, timer_fieldname) #define cfs_timer_setup(timer, callback, data, flags) \ timer_setup((timer), (callback), (flags)) -#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ - DEFINE_TIMER((_name), (_function)) #define cfs_timer_cb_arg(var, timer_fieldname) (&(var)->timer_fieldname) #else #define cfs_timer_cb_arg_t unsigned long #define cfs_from_timer(var, data, timer_fieldname) (typeof(var))(data) #define cfs_timer_setup(timer, callback, data, flags) \ setup_timer((timer), (callback), (data)) -#define CFS_DEFINE_TIMER(_name, _function, _expires, _data) \ - DEFINE_TIMER((_name), (_function), (_expires), (_data)) #define cfs_timer_cb_arg(var, timer_fieldname) (cfs_timer_cb_arg_t)(var) #endif diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h new file mode 100644 index 0000000000000..fd154ba0f049f --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/linux/linux-wait.h @@ -0,0 +1,568 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBCFS_LINUX_WAIT_BIT_H +#define __LIBCFS_LINUX_WAIT_BIT_H + +/* Make sure we can see if we have TASK_NOLOAD */ +#include +/* + * Linux wait-bit related types and methods: + */ +#ifdef HAVE_WAIT_BIT_HEADER_H +#include +#endif +#include + +#ifndef HAVE_WAIT_QUEUE_ENTRY +#define wait_queue_entry_t wait_queue_t +#endif + +#ifndef HAVE_WAIT_BIT_HEADER_H +struct wait_bit_queue_entry { + struct wait_bit_key key; + wait_queue_entry_t wq_entry; +}; + +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + +#endif /* ! HAVE_WAIT_BIT_HEADER_H */ + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT +extern long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state); +#endif + +/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce + * so let's define our own ___wait_cond_timeout1 + */ + +#define ___wait_cond_timeout1(condition) \ +({ \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ +}) + +#ifndef HAVE_CLEAR_AND_WAKE_UP_BIT +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. */ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} +#endif /* ! 
HAVE_CLEAR_AND_WAKE_UP_BIT */ + +#ifndef HAVE_WAIT_VAR_EVENT +extern void __init wait_bit_init(void); +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, + void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + wait_queue_head_t *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout1(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) +#endif /* ! HAVE_WAIT_VAR_EVENT */ + +/* + * prepare_to_wait_event() does not support an exclusive + * lifo wait. + * However it will not relink the wait_queue_entry if + * it is already linked. So we link to the head of the + * queue here, and it will stay there. + */ +static inline void prepare_to_wait_exclusive_head( + wait_queue_head_t *waitq, wait_queue_entry_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&(waitq->lock), flags); +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + if (list_empty(&link->entry)) +#else + if (list_empty(&link->task_list)) +#endif + __add_wait_queue_exclusive(waitq, link); + spin_unlock_irqrestore(&((waitq)->lock), flags); +} + +#ifndef ___wait_event +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout1() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. 
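 *
 * Editor's illustration (not part of the original patch): with the
 * ___wait_cond_timeout1() wrapper defined earlier in this header, a
 * timeout-style caller passes the remaining jiffies in as "ret"; the
 * wrapper evaluates to "cond || !__ret" and forces __ret to 1 when the
 * condition turns true just as the budget reaches zero, so callers can
 * treat 0 as "timed out" and >= 1 as "condition satisfied".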
+ */
+
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \
+({ \
+ __label__ __out; \
+ wait_queue_entry_t __wq_entry; \
+ long __ret = ret; /* explicit shadow */ \
+ \
+ init_wait(&__wq_entry); \
+ if (exclusive) \
+ __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \
+ for (;;) { \
+ long __int = prepare_to_wait_event(&wq_head, \
+ &__wq_entry, state); \
+ \
+ if (condition) \
+ break; \
+ \
+ if (___wait_is_interruptible(state) && __int) { \
+ __ret = __int; \
+ goto __out; \
+ } \
+ \
+ cmd; \
+ } \
+ finish_wait(&wq_head, &__wq_entry); \
+__out: __ret; \
+})
+#endif
+
+#ifndef TASK_NOLOAD
+
+#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd) \
+({ \
+ wait_queue_entry_t __wq_entry; \
+ unsigned long flags; \
+ long __ret = ret; /* explicit shadow */ \
+ sigset_t __blocked; \
+ \
+ __blocked = cfs_block_sigsinv(0); \
+ init_wait(&__wq_entry); \
+ if (exclusive) \
+ __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \
+ for (;;) { \
+ prepare_to_wait_event(&wq_head, \
+ &__wq_entry, \
+ TASK_INTERRUPTIBLE); \
+ \
+ if (condition) \
+ break; \
+ /* We have to do this here because some signals */ \
+ /* are not blockable - ie from strace(1). */ \
+ /* In these cases we want to schedule_timeout() */ \
+ /* again, because we don't want that to return */ \
+ /* -EINTR when the RPC actually succeeded. */ \
+ /* the recalc_sigpending() below will deliver the */ \
+ /* signal properly. */ \
+ if (signal_pending(current)) { \
+ spin_lock_irqsave(&current->sighand->siglock, \
+ flags); \
+ clear_tsk_thread_flag(current, TIF_SIGPENDING); \
+ spin_unlock_irqrestore(&current->sighand->siglock,\
+ flags); \
+ } \
+ cmd; \
+ } \
+ finish_wait(&wq_head, &__wq_entry); \
+ cfs_restore_sigs(__blocked); \
+ __ret; \
+})
+
+#define wait_event_idle(wq_head, condition) \
+do { \
+ might_sleep(); \
+ if (!(condition)) \
+ ___wait_event_idle(wq_head, condition, 0, 0, schedule());\
+} while (0)
+
+#define wait_event_idle_exclusive(wq_head, condition) \
+do { \
+ might_sleep(); \
+ if (!(condition)) \
+ ___wait_event_idle(wq_head, condition, 1, 0, schedule());\
+} while (0)
+
+#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\
+ ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \
+ 1, timeout, \
+ __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout1(condition)) \
+ __ret = __wait_event_idle_exclusive_timeout( \
+ wq_head, condition, timeout); \
+ __ret; \
+})
+
+#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \
+ timeout, cmd1, cmd2) \
+ ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \
+ 1, timeout, \
+ cmd1; __ret = schedule_timeout(__ret); cmd2)
+
+#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\
+ cmd1, cmd2) \
+({ \
+ long __ret = timeout; \
+ if (!___wait_cond_timeout1(condition)) \
+ __ret = __wait_event_idle_exclusive_timeout_cmd( \
+ wq_head, condition, timeout, cmd1, cmd2); \
+ __ret; \
+})
+
+#define __wait_event_idle_timeout(wq_head, condition, timeout) \
+ ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition), \
+ 0, timeout, \
+ __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_timeout(wq_head, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout1(condition)) \
+ __ret = __wait_event_idle_timeout(wq_head, condition, \
+ timeout); \
+ __ret; \
+})
+
+#else /* TASK_IDLE */
+#ifndef wait_event_idle
+/**
+ * 
wait_event_idle - wait for a condition without contributing to system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive +/** + * wait_event_idle_exclusive - wait for a condition without contributing to + * system load + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + */ +#define wait_event_idle_exclusive(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, \ + schedule()); \ +} while (0) +#endif +#ifndef wait_event_idle_exclusive_timeout +/** + * wait_event_idle_exclusive_timeout - sleep without load until a condition + * becomes true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag + * set thus if other processes wait on the same list, when this + * process is woken further processes are not considered. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
+ */ +#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) +#endif +#ifndef wait_event_idle_exclusive_timeout_cmd +#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition, \ + timeout, cmd1, cmd2) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 1, timeout, \ + cmd1; __ret = schedule_timeout(__ret); cmd2) + +#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\ + cmd1, cmd2) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_exclusive_timeout_cmd( \ + wq_head, condition, timeout, cmd1, cmd2); \ + __ret; \ +}) +#endif + +#ifndef wait_event_idle_timeout + +#define __wait_event_idle_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout1(condition), \ + TASK_IDLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_idle_timeout - sleep without load until a condition becomes + * true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_IDLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. 
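 *
 * Example (editor's sketch, not from the original patch; "waitq" and
 * "done" are hypothetical):
 *
 *	long rc = wait_event_idle_timeout(waitq, atomic_read(&done) != 0,
 *					  cfs_time_seconds(30));
 *	if (rc == 0)
 *		CERROR("timed out after 30s\n");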
+ */ +#define wait_event_idle_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_timeout(wq_head, condition, \ + timeout); \ + __ret; \ +}) +#endif +#endif /* TASK_IDLE */ + +/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */ +#ifdef TASK_NOLOAD + +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\ + \ + if (condition) \ + break; \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#else +#define ___wait_event_lifo(wq_head, condition, ret, cmd) \ +({ \ + wait_queue_entry_t __wq_entry; \ + unsigned long flags; \ + long __ret = ret; /* explicit shadow */ \ + sigset_t __blocked; \ + \ + __blocked = cfs_block_sigsinv(0); \ + init_wait(&__wq_entry); \ + __wq_entry.flags = WQ_FLAG_EXCLUSIVE; \ + for (;;) { \ + prepare_to_wait_exclusive_head(&wq_head, &__wq_entry); \ + prepare_to_wait_event(&wq_head, &__wq_entry, \ + TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + /* See justification in ___wait_event_idle */ \ + if (signal_pending(current)) { \ + spin_lock_irqsave(¤t->sighand->siglock, \ + flags); \ + clear_tsk_thread_flag(current, TIF_SIGPENDING); \ + spin_unlock_irqrestore(¤t->sighand->siglock,\ + flags); \ + } \ + cmd; \ + } \ + cfs_restore_sigs(__blocked); \ + finish_wait(&wq_head, &__wq_entry); \ + __ret; \ +}) +#endif + +#define wait_event_idle_exclusive_lifo(wq_head, condition) \ +do { \ + might_sleep(); \ + if (!(condition)) \ + ___wait_event_lifo(wq_head, condition, 0, schedule()); \ +} while (0) + +#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout) \ + ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition), \ + timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout1(condition)) \ + __ret = __wait_event_idle_lifo_timeout(wq_head, \ + condition, \ + timeout); \ + __ret; \ +}) + +/* l_wait_event_abortable() is a bit like wait_event_killable() + * except there is a fixed set of signals which will abort: + * LUSTRE_FATAL_SIGS + */ +#define LUSTRE_FATAL_SIGS \ + (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \ + sigmask(SIGQUIT) | sigmask(SIGALRM)) + +#define l_wait_event_abortable(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_timeout(wq, condition, timeout) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#define l_wait_event_abortable_exclusive(wq, condition) \ +({ \ + sigset_t __new_blocked, __old_blocked; \ + int __ret = 0; \ + siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS); \ + 
sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ + __ret = wait_event_interruptible_exclusive(wq, condition); \ + sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ + __ret; \ +}) + +#endif /* __LICBFS_LINUX_WAIT_BIT_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h new file mode 100644 index 0000000000000..45818dddedd94 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/hash.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H +/* Fast hashing routine for ints, longs and pointers. + (C) 2002 Nadia Yvette Chambers, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ + +#include + +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL + +#if __BITS_PER_LONG == 32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define hash_long(val, bits) hash_32(val, bits) +#elif __BITS_PER_LONG == 64 +#define hash_long(val, bits) hash_64(val, bits) +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#else +#error Wordsize not 32 or 64 +#endif + +static __always_inline __u64 hash_64(__u64 val, unsigned int bits) +{ + __u64 hash = val; + + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + __u64 n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + + /* High bits are more random, so use them. */ + return hash >> (64 - bits); +} + +static inline __u32 hash_32(__u32 val, unsigned int bits) +{ + /* On some cpus multiply is faster, on others gcc will do shifts */ + __u32 hash = val * GOLDEN_RATIO_PRIME_32; + + /* High bits are more random, so use them. 
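 * (Editor's aside, illustrative: taking the top bits means hash_32(key, 8)
 * always lands in [0, 255], so it can index a 256-bucket table directly.
 * hash_64() above uses the same multiply-then-take-high-bits scheme, but
 * builds the multiplication by GOLDEN_RATIO_PRIME_64 out of the
 * shift-and-add sequence shown, for CPUs where a 64-bit multiply is slow.)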
*/ + return hash >> (32 - bits); +} + +static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} + +static inline __u32 hash32_ptr(const void *ptr) +{ + unsigned long val = (unsigned long)ptr; + +#if __BITS_PER_LONG == 64 + val ^= (val >> 32); +#endif + return (__u32)val; +} + +#endif /* _LINUX_HASH_H */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h index 600bf27b607b4..a42e0c5fe4568 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,7 +43,7 @@ /* Sparse annotation. */ #define __user -#include +#include #define LIBCFS_IOC_INIT(data) \ do { \ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h index 2fb2db7c651dd..7bae8393a1916 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/parser.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -107,7 +107,7 @@ char *Parser_strarg(char *inp, const char *prompt, const char *deft, int Parser_arg2int(const char *inp, long *result, int base); /* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size(int *sizep, char *str); +int Parser_size(unsigned long *sizep, char *str); /* Convert a string boolean to an int; "enable" -> 1 */ int Parser_bool(int *b, char *str); diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h index 72414f0c8003a..065829b7161d6 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/util/string.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,14 +44,6 @@ #include #include -#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcpy(char *tgt, const char *src, size_t tgt_len); -#endif - -#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcat(char *tgt, const char *src, size_t tgt_len); -#endif - /** * Structure to represent NULL-less strings. 
*/ @@ -93,5 +85,6 @@ int cfs_ip_addr_parse(char *str, int len, struct list_head *list); int cfs_ip_addr_range_gen(__u32 *ip_list, int count, struct list_head *ip_addr_expr); int cfs_ip_addr_match(__u32 addr, struct list_head *list); +int cfs_abs_path(const char *request_path, char **resolved_path); #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile index a487ba0329342..04b9fafaae920 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/Makefile +++ b/drivers/staging/lustrefsx/libcfs/libcfs/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_LUSTREFSX_LIBCFS) += libcfs.o libcfs-linux-objs := linux-tracefile.o linux-debug.o linux-prim.o -libcfs-linux-objs += linux-cpu.o linux-curproc.o linux-module.o +libcfs-linux-objs += linux-curproc.o linux-module.o linux-hash.o linux-wait.o libcfs-linux-objs += linux-crypto.o linux-crypto-adler.o libcfs-linux-objs += linux-crypto-crc32.o @@ -10,7 +10,6 @@ libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) libcfs-all-objs := debug.o fail.o module.o tracefile.o watchdog.o libcfs-all-objs += libcfs_string.o hash.o prng.o workitem.o libcfs-all-objs += libcfs_cpu.o libcfs_mem.o libcfs_lock.o heap.o -libcfs-all-objs += libcfs_ptask.o libcfs-y += $(libcfs-linux-objs) $(libcfs-all-objs) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c index a4aede1e3be08..65e5b4669d0d2 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/debug.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,6 +37,7 @@ # define DEBUG_SUBSYSTEM S_LNET +#include #include #include #include "tracefile.h" @@ -54,8 +55,63 @@ module_param(libcfs_debug, int, 0644); MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); EXPORT_SYMBOL(libcfs_debug); +static int libcfs_param_debug_mb_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + +/* + * RHEL6 does not support any kind of locking so we have to provide + * our own + */ +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_lock(THIS_MODULE); +#endif + if (!*((unsigned int *)kp->arg)) { + *((unsigned int *)kp->arg) = num; + +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_unlock(THIS_MODULE); +#endif + return 0; + } + + rc = cfs_trace_set_debug_mb(num); + + if (!rc) + *((unsigned int *)kp->arg) = cfs_trace_get_debug_mb(); + +#if !defined(HAVE_MODULE_PARAM_LOCKING) && !defined(HAVE_KERNEL_PARAM_LOCK) + kernel_param_unlock(THIS_MODULE); +#endif + return rc; +} + +/* + * While debug_mb setting look like unsigned int, in fact + * it needs quite a bunch of extra processing, so we define special + * debug_mb parameter type with corresponding methods to handle this case + */ +static struct kernel_param_ops param_ops_debug_mb = { + .set = libcfs_param_debug_mb_set, + .get = param_get_uint, +}; + +#define param_check_debug_mb(name, p) \ + __param_check(name, p, unsigned int) + static unsigned int libcfs_debug_mb; -module_param(libcfs_debug_mb, uint, 0644); +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_debug_mb, debug_mb, 0644); +#else +module_param_call(libcfs_debug_mb, libcfs_param_debug_mb_set, param_get_uint, + ¶m_ops_debug_mb, 0644); +#endif MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); unsigned int libcfs_printk = D_CANTMASK; @@ -66,16 +122,125 @@ unsigned int libcfs_console_ratelimit = 1; module_param(libcfs_console_ratelimit, uint, 0644); MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); +static int param_set_delay_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + long min, long max) +{ + long d; + int sec; + int rc; + + rc = kstrtoint(val, 0, &sec); + if (rc) + return -EINVAL; + + /* The sysfs setting is in centiseconds */ + d = cfs_time_seconds(sec) / 100; + if (d < min || d > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = d; + + return 0; +} + +static int param_get_delay(char *buffer, cfs_kernel_param_arg_t *kp) +{ + unsigned int d = *(unsigned int *)kp->arg; + + return sprintf(buffer, "%lu", jiffies_to_msecs(d * 10) / MSEC_PER_SEC); +} + unsigned int libcfs_console_max_delay; -module_param(libcfs_console_max_delay, uint, 0644); +unsigned int libcfs_console_min_delay; + +static int param_set_console_max_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + libcfs_console_min_delay, INT_MAX); +} + +static struct kernel_param_ops param_ops_console_max_delay = { + .set = param_set_console_max_delay, + .get = param_get_delay, +}; + +#define param_check_console_max_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_max_delay, console_max_delay, 0644); +#else +module_param_call(libcfs_console_max_delay, param_set_console_max_delay, + param_get_delay, ¶m_ops_console_max_delay, 0644); +#endif 
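/*
 * Editor's note (illustrative, not part of the patch): module_param() with
 * a custom type name works by token pasting, so the "debug_mb" and
 * "console_max_delay" declarations above resolve to &param_ops_debug_mb
 * and &param_ops_console_max_delay, while the matching param_check_*()
 * macros keep the type checking that plain "uint" would have provided.
 * The setters then run whenever the parameter is written, e.g.
 * (hypothetical sysfs path):
 *
 *	echo 50 > /sys/module/libcfs/parameters/libcfs_console_max_delay
 */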
MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); -unsigned int libcfs_console_min_delay; -module_param(libcfs_console_min_delay, uint, 0644); +static int param_set_console_min_delay(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_delay_minmax(val, kp, + 1, libcfs_console_max_delay); +} + +static struct kernel_param_ops param_ops_console_min_delay = { + .set = param_set_console_min_delay, + .get = param_get_delay, +}; + +#define param_check_console_min_delay(name, p) \ + __param_check(name, p, unsigned int) + +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_min_delay, console_min_delay, 0644); +#else +module_param_call(libcfs_console_min_delay, param_set_console_min_delay, + param_get_delay, ¶m_ops_console_min_delay, 0644); +#endif MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); +#ifndef HAVE_PARAM_SET_UINT_MINMAX +static int param_set_uint_minmax(const char *val, + cfs_kernel_param_arg_t *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + + ret = kstrtouint(val, 0, &num); + if (ret < 0 || num < min || num > max) + return -EINVAL; + + *((unsigned int *)kp->arg) = num; + return 0; +} +#endif + +static int param_set_uintpos(const char *val, + cfs_kernel_param_arg_t *kp) +{ + return param_set_uint_minmax(val, kp, 1, -1); +} + +static struct kernel_param_ops param_ops_uintpos = { + .set = param_set_uintpos, + .get = param_get_uint, +}; + +#define param_check_uintpos(name, p) \ + __param_check(name, p, unsigned int) + unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; -module_param(libcfs_console_backoff, uint, 0644); +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(libcfs_console_backoff, uintpos, 0644); +#else +module_param_call(libcfs_console_backoff, param_set_uintpos, param_get_uint, + ¶m_ops_uintpos, 0644); +#endif MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); unsigned int libcfs_debug_binary = 1; @@ -101,15 +266,17 @@ char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; EXPORT_SYMBOL(libcfs_debug_file_path_arr); /* We need to pass a pointer here, but elsewhere this must be a const */ -static char *libcfs_debug_file_path; +static char *libcfs_debug_file_path = LIBCFS_DEBUG_FILE_PATH_DEFAULT; module_param(libcfs_debug_file_path, charp, 0644); MODULE_PARM_DESC(libcfs_debug_file_path, "Path for dumping debug logs, set 'NONE' to prevent log dumping"); int libcfs_panic_in_progress; -/* libcfs_debug_token2mask() expects the returned - * string in lower-case */ +/* + * libcfs_debug_token2mask() expects the returned + * string in lower-case + */ static const char *libcfs_debug_subsys2str(int subsys) { static const char *libcfs_debug_subsystems[] = LIBCFS_DEBUG_SUBSYS_NAMES; @@ -120,8 +287,10 @@ static const char *libcfs_debug_subsys2str(int subsys) return libcfs_debug_subsystems[subsys]; } -/* libcfs_debug_token2mask() expects the returned - * string in lower-case */ +/* + * libcfs_debug_token2mask() expects the returned + * string in lower-case + */ static const char *libcfs_debug_dbg2str(int debug) { static const char *libcfs_debug_masks[] = LIBCFS_DEBUG_MASKS_NAMES; @@ -135,79 +304,78 @@ static const char *libcfs_debug_dbg2str(int debug) int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) { - const char *(*fn)(int bit) = is_subsys ? 
libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int len = 0; - const char *token; - int i; - - if (mask == 0) { /* "0" */ - if (size > 0) - str[0] = '0'; - len = 1; - } else { /* space-separated tokens */ - for (i = 0; i < 32; i++) { - if ((mask & (1 << i)) == 0) - continue; - - token = fn(i); - if (token == NULL) /* unused bit */ - continue; - - if (len > 0) { /* separator? */ - if (len < size) - str[len] = ' '; - len++; - } - - while (*token != 0) { - if (len < size) - str[len] = *token; - token++; - len++; - } - } - } - - /* terminate 'str' */ - if (len < size) - str[len] = 0; - else - str[size - 1] = 0; - - return len; + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; } int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) { - const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int m = 0; - int matched; - int n; - int t; - - /* Allow a number for backwards compatibility */ - - for (n = strlen(str); n > 0; n--) - if (!isspace(str[n-1])) - break; - matched = n; - - if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 && - matched == n) { - /* don't print warning for lctl set_param debug=0 or -1 */ - if (m != 0 && m != -1) - CWARN("You are trying to use a numerical value for the " - "mask - this will be deprecated in a future " - "release.\n"); - *mask = m; - return 0; - } - - return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, - 0xffffffff); + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + + t = sscanf(str, "%i%n", &m, &matched); + if (t >= 1 && matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the " + "mask - this will be deprecated in a future " + "release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 
0 : D_CANTMASK, + 0xffffffff); } /** @@ -248,11 +416,14 @@ void libcfs_debug_dumplog(void) { wait_queue_entry_t wait; struct task_struct *dumper; + ENTRY; - /* we're being careful to ensure that the kernel thread is + /* + * we're being careful to ensure that the kernel thread is * able to set our state to running as it exits before we - * get to schedule() */ + * get to schedule() + */ init_waitqueue_entry(&wait, current); set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&debug_ctlwq, &wait); @@ -274,7 +445,7 @@ EXPORT_SYMBOL(libcfs_debug_dumplog); int libcfs_debug_init(unsigned long bufsize) { - int rc = 0; + int rc = 0; unsigned int max = libcfs_debug_mb; init_waitqueue_head(&debug_ctlwq); @@ -292,55 +463,65 @@ int libcfs_debug_init(unsigned long bufsize) sizeof(libcfs_debug_file_path_arr)); } - /* If libcfs_debug_mb is set to an invalid value or uninitialized - * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ + /* + * If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES + */ if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { max = TCD_MAX_PAGES; } else { max = (max / num_possible_cpus()); max = (max << (20 - PAGE_SHIFT)); } - rc = cfs_tracefile_init(max); - - if (rc == 0) - libcfs_register_panic_notifier(); - return rc; + rc = cfs_tracefile_init(max); + if (rc) + return rc; + + libcfs_register_panic_notifier(); + kernel_param_lock(THIS_MODULE); + libcfs_debug_mb = cfs_trace_get_debug_mb(); + kernel_param_unlock(THIS_MODULE); + return rc; } int libcfs_debug_cleanup(void) { - libcfs_unregister_panic_notifier(); - cfs_tracefile_exit(); - return 0; + libcfs_unregister_panic_notifier(); + kernel_param_lock(THIS_MODULE); + cfs_tracefile_exit(); + kernel_param_unlock(THIS_MODULE); + return 0; } int libcfs_debug_clear_buffer(void) { - cfs_trace_flush_pages(); - return 0; + cfs_trace_flush_pages(); + return 0; } -/* Debug markers, although printed by S_LNET - * should not be be marked as such. */ +/* + * Debug markers, although printed by S_LNET + * should not be be marked as such. + */ #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_UNDEFINED int libcfs_debug_mark_buffer(const char *text) { - CDEBUG(D_TRACE,"***************************************************\n"); - LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); - CDEBUG(D_TRACE,"***************************************************\n"); + CDEBUG(D_TRACE, "**************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE, "**************************************************\n"); - return 0; + return 0; } #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_LNET long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) { - libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", - rc, rc, rc); - return rc; + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; } EXPORT_SYMBOL(libcfs_log_return); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c index 209333edf6b5b..fff5a2217c6f5 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_cpu.c @@ -13,16 +13,11 @@ * General Public License version 2 for more details (a copy is included * in the LICENSE file that accompanied this code). 
* - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA - * * GPL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,42 +30,193 @@ #define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include #include /** Global CPU partition table */ -struct cfs_cpt_table *cfs_cpt_table __read_mostly = NULL; +struct cfs_cpt_table *cfs_cpt_table __read_mostly; EXPORT_SYMBOL(cfs_cpt_table); -#ifndef HAVE_LIBCFS_CPT - -#define CFS_CPU_VERSION_MAGIC 0xbabecafe +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); -#define CFS_CPT_DISTANCE 1 /* Arbitrary positive value */ +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. + * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = "N"; +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); +#ifdef CONFIG_SMP struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) { struct cfs_cpt_table *cptab; + int i; - if (ncpt != 1) { - CERROR("Can't support cpu partition number %d\n", ncpt); + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) return NULL; - } - LIBCFS_ALLOC(cptab, sizeof(*cptab)); - if (cptab != NULL) { - cptab->ctb_version = CFS_CPU_VERSION_MAGIC; - cpu_set(0, cptab->ctb_cpumask); - node_set(0, cptab->ctb_nodemask); - cptab->ctb_nparts = ncpt; + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + if (!cptab->ctb_cpumask) + goto failed_alloc_cpumask; + + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (!cptab->ctb_nodemask) + goto failed_alloc_nodemask; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + if (!cptab->ctb_cpu2cpt) + goto failed_alloc_cpu2cpt; + + memset(cptab->ctb_cpu2cpt, -1, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + if (!cptab->ctb_node2cpt) + goto failed_alloc_node2cpt; + + memset(cptab->ctb_node2cpt, -1, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (!cptab->ctb_parts) + goto failed_alloc_ctb_parts; + + memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0])); + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + if (!part->cpt_cpumask) + goto failed_setting_ctb_parts; + + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (!part->cpt_nodemask) + goto failed_setting_ctb_parts; + + 
LIBCFS_ALLOC(part->cpt_distance, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); + if (!part->cpt_distance) + goto failed_setting_ctb_parts; + + memset(part->cpt_distance, -1, + cptab->ctb_nparts * sizeof(part->cpt_distance[0])); } return cptab; + +failed_setting_ctb_parts: + while (i-- >= 0) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } +failed_alloc_ctb_parts: + if (cptab->ctb_node2cpt) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } +failed_alloc_node2cpt: + if (cptab->ctb_cpu2cpt) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } +failed_alloc_cpu2cpt: + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); +failed_alloc_nodemask: + if (cptab->ctb_cpumask) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); +failed_alloc_cpumask: + LIBCFS_FREE(cptab, sizeof(*cptab)); + return NULL; } EXPORT_SYMBOL(cfs_cpt_table_alloc); void cfs_cpt_table_free(struct cfs_cpt_table *cptab) { - LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); + int i; + + if (cptab->ctb_cpu2cpt) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); + } + + if (cptab->ctb_node2cpt) { + LIBCFS_FREE(cptab->ctb_node2cpt, + nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); + } + + for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + + if (part->cpt_distance) { + LIBCFS_FREE(part->cpt_distance, + cptab->ctb_nparts * + sizeof(part->cpt_distance[0])); + } + } + + if (cptab->ctb_parts) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); LIBCFS_FREE(cptab, sizeof(*cptab)); } @@ -78,80 +224,346 @@ EXPORT_SYMBOL(cfs_cpt_table_free); int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) { - int rc = 0; + char *tmp = buf; + int rc; + int i; + int j; - rc = snprintf(buf, len, "%d\t: %d\n", 0, 0); - len -= rc; - if (len <= 0) - return -EFBIG; + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto err; - return rc; + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, " %d", j); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; } EXPORT_SYMBOL(cfs_cpt_table_print); int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) { - int rc = 0; + char *tmp = buf; + int rc; + int i; + int j; - rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, CFS_CPT_DISTANCE); - len -= rc; - if (len <= 0) - return -EFBIG; + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len <= 0) + goto 
err; - return rc; + rc = snprintf(tmp, len, "%d\t:", i); + len -= rc; + + if (len <= 0) + goto err; + + tmp += rc; + for (j = 0; j < cptab->ctb_nparts; j++) { + rc = snprintf(tmp, len, " %d:%d", j, + cptab->ctb_parts[i].cpt_distance[j]); + len -= rc; + if (len <= 0) + goto err; + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + return tmp - buf; +err: + return -E2BIG; } EXPORT_SYMBOL(cfs_cpt_distance_print); int cfs_cpt_number(struct cfs_cpt_table *cptab) { - return 1; + return cptab->ctb_nparts; } EXPORT_SYMBOL(cfs_cpt_number); int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) { - return 1; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_weight(cptab->ctb_cpumask) : + cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); } EXPORT_SYMBOL(cfs_cpt_weight); int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) { - return 1; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpumask_any_and(cptab->ctb_cpumask, + cpu_online_mask) < nr_cpu_ids : + cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, + cpu_online_mask) < nr_cpu_ids; } EXPORT_SYMBOL(cfs_cpt_online); cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) { - return &cptab->ctb_mask; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; } EXPORT_SYMBOL(cfs_cpt_cpumask); nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) { - return &cptab->ctb_nodemask; + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; } EXPORT_SYMBOL(cfs_cpt_nodemask); -unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) +unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) { - return CFS_CPT_DISTANCE; + LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); + LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); + + if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) + return cptab->ctb_distance; + + return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; } EXPORT_SYMBOL(cfs_cpt_distance); +/* + * Calculate the maximum NUMA distance between all nodes in the + * from_mask and all nodes in the to_mask. 
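 * For example (editor's illustration, not part of the patch): on a
 * two-node system whose SLIT reports node_distance() of 10 locally and
 * 21 across nodes, the maximum over {0} x {0, 1} is 21.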
+ */ +static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, + nodemask_t *to_mask) +{ + unsigned int maximum; + unsigned int distance; + int from; + int to; + + maximum = 0; + for_each_node_mask(from, *from_mask) { + for_each_node_mask(to, *to_mask) { + distance = node_distance(from, to); + if (maximum < distance) + maximum = distance; + } + } + return maximum; +} + +static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cptab->ctb_cpu2cpt[cpu] = cpt; + + cpumask_set_cpu(cpu, cptab->ctb_cpumask); + cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); +} + +static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); + cpumask_clear_cpu(cpu, cptab->ctb_cpumask); + + cptab->ctb_cpu2cpt[cpu] = -1; +} + +static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part; + + if (!node_isset(node, *cptab->ctb_nodemask)) { + unsigned int dist; + + /* first time node is added to the CPT table */ + node_set(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = cpt; + + dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + cptab->ctb_distance = dist; + } + + part = &cptab->ctb_parts[cpt]; + if (!node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* first time node is added to this CPT */ + node_set(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } +} + +static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int cpu; + + for_each_cpu(cpu, part->cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { + int cpt2; + + /* No more CPUs in the node for this CPT. */ + node_clear(node, *part->cpt_nodemask); + for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { + struct cfs_cpu_partition *part2; + unsigned int dist; + + part2 = &cptab->ctb_parts[cpt2]; + if (node_isset(node, *part2->cpt_nodemask)) + cptab->ctb_node2cpt[node] = cpt2; + + dist = cfs_cpt_distance_calculate(part->cpt_nodemask, + part2->cpt_nodemask); + part->cpt_distance[cpt2] = dist; + dist = cfs_cpt_distance_calculate(part2->cpt_nodemask, + part->cpt_nodemask); + part2->cpt_distance[cpt] = dist; + } + } + + for_each_cpu(cpu, cptab->ctb_cpumask) { + /* this CPT-table has other CPUs belonging to this node? */ + if (cpu_to_node(cpu) == node) + break; + } + + if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { + /* No more CPUs in the table for this node. 
*/ + node_clear(node, *cptab->ctb_nodemask); + cptab->ctb_node2cpt[node] = -1; + cptab->ctb_distance = + cfs_cpt_distance_calculate(cptab->ctb_nodemask, + cptab->ctb_nodemask); + } +} + int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + return 1; } EXPORT_SYMBOL(cfs_cpt_set_cpu); void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= nr_cpu_ids) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, + "Try to unset cpu %d which is not in CPT-table %p\n", + cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in CPU partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); + + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); } EXPORT_SYMBOL(cfs_cpt_unset_cpu); int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask) { + int cpu; + + if (!cpumask_weight(mask) || + cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { + CDEBUG(D_INFO, + "No online CPU is found in the CPU mask for CPU partition %d\n", + cpt); + return 0; + } + + for_each_cpu(cpu, mask) { + cfs_cpt_add_cpu(cptab, cpt, cpu); + cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); + } + return 1; } EXPORT_SYMBOL(cfs_cpt_set_cpumask); @@ -159,23 +571,65 @@ EXPORT_SYMBOL(cfs_cpt_set_cpumask); void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask) { + int cpu; + + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } } EXPORT_SYMBOL(cfs_cpt_unset_cpumask); int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) { + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_add_cpu(cptab, cpt, cpu); + + cfs_cpt_add_node(cptab, cpt, node); + return 1; } EXPORT_SYMBOL(cfs_cpt_set_node); void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) { + const cpumask_t *mask; + int cpu; + + if (node < 0 || node >= nr_node_ids) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + mask = cpumask_of_node(node); + + for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); + + cfs_cpt_del_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_node); int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); + return 1; } EXPORT_SYMBOL(cfs_cpt_set_nodemask); @@ -183,42 +637,674 @@ EXPORT_SYMBOL(cfs_cpt_set_nodemask); void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { + int node; + + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_nodemask); int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) { - return 0; + nodemask_t *mask; + int weight; + unsigned int rotor; + int node = 0; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; + } + + weight = nodes_weight(*mask); + if (weight > 0) { + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (!rotor--) + return node; + } + } + + return node; } EXPORT_SYMBOL(cfs_cpt_spread_node); int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) { - return 0; + int cpu; + int cpt; + + preempt_disable(); + cpu = smp_processor_id(); + cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0 && remap) { + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID + */ + cpt = cpu % cptab->ctb_nparts; + } + preempt_enable(); + return cpt; } EXPORT_SYMBOL(cfs_cpt_current); int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) { - return 0; + LASSERT(cpu >= 0 && cpu < nr_cpu_ids); + + return cptab->ctb_cpu2cpt[cpu]; } EXPORT_SYMBOL(cfs_cpt_of_cpu); int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) { - return 0; + if (node < 0 || node > nr_node_ids) + return CFS_CPT_ANY; + + return cptab->ctb_node2cpt[node]; } EXPORT_SYMBOL(cfs_cpt_of_node); int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) { + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (!cpumask_intersects(cpumask, cpu_online_mask)) { + CDEBUG(D_INFO, + "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", + cpt); + return -ENODEV; + } + + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) + continue; + + rc = set_cpus_allowed_ptr(current, cpumask); + set_mems_allowed(*nodemask); + if (!rc) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ return 0; } EXPORT_SYMBOL(cfs_cpt_bind); +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. 
+ */ +static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node_mask, int number) +{ + cpumask_t *socket_mask = NULL; + cpumask_t *core_mask = NULL; + int rc = 0; + int cpu; + int i; + + LASSERT(number > 0); + + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket_mask, cpumask_size()); + LIBCFS_ALLOC(core_mask, cpumask_size()); + if (!socket_mask || !core_mask) { + rc = -ENOMEM; + goto out; + } + + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + + /* get cpumask for cores in the same socket */ + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { + /* get cpumask for hts in the same core */ + cpumask_and(core_mask, topology_sibling_cpumask(cpu), + node_mask); + + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); + + if (!cpu_online(i)) + continue; + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (!--number) + goto out; + } + cpu = cpumask_first(socket_mask); + } + } + +out: + if (core_mask) + LIBCFS_FREE(core_mask, cpumask_size()); + if (socket_mask) + LIBCFS_FREE(socket_mask, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4 + +static int cfs_cpt_num_estimate(void) +{ + int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); + int ncpu = num_online_cpus(); + int ncpt = 1; + + if (ncpu > CPT_WEIGHT_MIN) + for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++) + ; /* nothing */ + +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory + */ + ncpt = min(2, ncpt); +#endif + while (ncpu % ncpt) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *node_mask = NULL; + int cpt = 0; + int node; + int num; + int rem; + int rc = 0; + + num = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = num; + + if (ncpt > num_online_cpus()) { + rc = -EINVAL; + CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n", + ncpt, num_online_cpus(), rc); + goto failed; + } + + if (ncpt > 4 * num) { + CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", + ncpt, num); + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + rc = -ENOMEM; + goto failed; + } + + LIBCFS_ALLOC(node_mask, cpumask_size()); + if (!node_mask) { + CERROR("Failed to allocate scratch cpumask\n"); + rc = -ENOMEM; + goto failed; + } + + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); + + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + (rem > 0) + num - ncpu); + if (rc < 0) { + rc = -EINVAL; + goto failed_mask; + } + + ncpu = cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { + cpt++; + rem--; + } + } + } + + LIBCFS_FREE(node_mask, cpumask_size()); + 
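	/*
	 * Worked example of the distribution above (numbers hypothetical):
	 * with 10 online CPUs and ncpt = 3, num = 3 and rem = 1, so the
	 * first partition is grown to num + 1 = 4 CPUs and the remaining
	 * two partitions get 3 CPUs each.  When no partition count is
	 * requested, cfs_cpt_num_estimate() aims to give each partition
	 * about two cores worth of sibling threads (2 * nthr CPUs) while
	 * dividing the online CPUs evenly, e.g. 32 CPUs with 2 threads per
	 * core default to 8 partitions of 4 CPUs.
	 */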
+ return cptab; + +failed_mask: + if (node_mask) + LIBCFS_FREE(node_mask, cpumask_size()); +failed: + CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); + + if (cptab) + cfs_cpt_table_free(cptab); + + return ERR_PTR(rc); +} + +static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) +{ + struct cfs_cpt_table *cptab; + char *pattern_dup; + char *bracket; + char *str; + int node = 0; + int ncpt = 0; + int cpt = 0; + int high; + int rc; + int c; + int i; + + pattern_dup = kstrdup(pattern, GFP_KERNEL); + if (!pattern_dup) { + CERROR("Failed to duplicate pattern '%s'\n", pattern); + return ERR_PTR(-ENOMEM); + } + + str = cfs_trimwhite(pattern_dup); + if (*str == 'n' || *str == 'N') { + str++; /* skip 'N' char */ + node = 1; /* NUMA pattern */ + if (*str == '\0') { + node = -1; + for_each_online_node(i) { + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } + if (ncpt == 1) { /* single NUMA node */ + kfree(pattern_dup); + return cfs_cpt_table_create(cpu_npartitions); + } + } + } + + if (!ncpt) { /* scanning bracket which is mark of partition */ + bracket = str; + while ((bracket = strchr(bracket, '['))) { + bracket++; + ncpt++; + } + } + + if (!ncpt || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (!cptab) { + CERROR("Failed to allocate CPU partition table\n"); + rc = -ENOMEM; + goto err_free_str; + } + + if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ + for_each_online_node(i) { + if (cpumask_empty(cpumask_of_node(i))) + continue; + + rc = cfs_cpt_set_node(cptab, cpt++, i); + if (!rc) { + rc = -EINVAL; + goto err_free_table; + } + } + kfree(pattern_dup); + return cptab; + } + + high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; + + for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + int n; + + bracket = strchr(str, '['); + if (!bracket) { + if (*str) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } else if (c != ncpt) { + CERROR("Expect %d partitions but found %d\n", + ncpt, c); + rc = -EINVAL; + goto err_free_table; + } + break; + } + + if (sscanf(str, "%d%n", &cpt, &n) < 1) { + CERROR("Invalid CPU pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + rc = -EINVAL; + goto err_free_table; + } + + if (cfs_cpt_weight(cptab, cpt)) { + CERROR("Partition %d has already been set.\n", cpt); + rc = -EPERM; + goto err_free_table; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern '%s'\n", str); + rc = -EINVAL; + goto err_free_table; + } + + bracket = strchr(str, ']'); + if (!bracket) { + CERROR("Missing right bracket for partition %d in '%s'\n", + cpt, str); + rc = -EINVAL; + goto err_free_table; + } + + rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, + &el); + if (rc) { + CERROR("Can't parse number range in '%s'\n", str); + rc = -ERANGE; + goto err_free_table; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride) + continue; + + rc = node ? 
cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + rc = -EINVAL; + goto err_free_table; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + rc = -ENODEV; + goto err_free_table; + } + + str = cfs_trimwhite(bracket + 1); + } + + kfree(pattern_dup); + return cptab; + +err_free_table: + cfs_cpt_table_free(cptab); +err_free_str: + kfree(pattern_dup); + return ERR_PTR(rc); +} + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE +static enum cpuhp_state lustre_cpu_online; + +static int cfs_cpu_online(unsigned int cpu) +{ + return 0; +} +#endif + +static int cfs_cpu_dead(unsigned int cpu) +{ + bool warn; + + /* if all HTs in a core are offline, it may break affinity */ + warn = cpumask_any_and(topology_sibling_cpumask(cpu), + cpu_online_mask) >= nr_cpu_ids; + CDEBUG(warn ? D_WARNING : D_INFO, + "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", + cpu); + return 0; +} + +#ifndef HAVE_HOTPLUG_STATE_MACHINE +static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + default: + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { + CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", + cpu, action); + break; + } + + cfs_cpu_dead(cpu); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + +void cfs_cpu_fini(void) +{ + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ +} + +int cfs_cpu_init(void) +{ + int ret; + + LASSERT(!cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, + "fs/lustre/cfe:dead", NULL, + cfs_cpu_dead); + if (ret < 0) + goto failed_cpu_dead; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/lustre/cfe:online", + cfs_cpu_online, NULL); + if (ret < 0) + goto failed_cpu_online; + + lustre_cpu_online = ret; +#else + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + + cpus_read_lock(); + if (*cpu_pattern) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab from pattern '%s'\n", + cpu_pattern); + ret = PTR_ERR(cfs_cpt_table); + goto failed_alloc_table; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (IS_ERR(cfs_cpt_table)) { + CERROR("Failed to create cptab with npartitions %d\n", + cpu_npartitions); + ret = PTR_ERR(cfs_cpt_table); + goto failed_alloc_table; + } + } + + cpus_read_unlock(); + + LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", + num_online_nodes(), num_online_cpus(), + cfs_cpt_number(cfs_cpt_table)); + return 0; + +failed_alloc_table: + cpus_read_unlock(); + + if (!IS_ERR_OR_NULL(cfs_cpt_table)) + 
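	/*
	 * For reference, the cpu_pattern strings consumed above follow the
	 * syntax documented for the module parameter: "0[0,1,2,3] 1[4,5,6,7]"
	 * places CPUs 0-3 in partition 0 and CPUs 4-7 in partition 1, a
	 * leading 'N' as in "N 0[0,1] 1[2,3]" makes the bracketed numbers
	 * NUMA node IDs instead of CPU IDs, and a bare "N" builds the table
	 * from the NUMA topology.  A non-empty cpu_pattern takes precedence
	 * over cpu_npartitions, so a hypothetical modprobe line such as
	 * "options libcfs cpu_pattern=0[0,2] 1[1,3]" would cause any
	 * cpu_npartitions setting to be ignored.
	 */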
cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU +#ifdef HAVE_HOTPLUG_STATE_MACHINE + if (lustre_cpu_online > 0) + cpuhp_remove_state_nocalls(lustre_cpu_online); +failed_cpu_online: + cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); +failed_cpu_dead: +#else + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ +#endif /* CONFIG_HOTPLUG_CPU */ + return ret; +} + +#else /* ! CONFIG_SMP */ + +struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (!cptab) + return NULL; + + cpumask_set_cpu(0, cptab->ctb_cpumask); + node_set(0, cptab->ctb_nodemask); + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + int rc; + + rc = snprintf(buf, len, "0\t: 0:1\n"); + len -= rc; + if (len <= 0) + return -EFBIG; + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_distance_print); + void cfs_cpu_fini(void) { - if (cfs_cpt_table != NULL) { + if (cfs_cpt_table) { cfs_cpt_table_free(cfs_cpt_table); cfs_cpt_table = NULL; } @@ -228,7 +1314,7 @@ int cfs_cpu_init(void) { cfs_cpt_table = cfs_cpt_table_alloc(1); - return cfs_cpt_table != NULL ? 0 : -1; + return cfs_cpt_table ? 0 : -1; } -#endif /* HAVE_LIBCFS_CPT */ +#endif /* !CONFIG_SMP */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c index 2f401e74a7dd7..5f85219101eb0 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_mem.c @@ -33,7 +33,6 @@ #define DEBUG_SUBSYSTEM S_LNET -#include #include struct cfs_var_array { @@ -172,9 +171,12 @@ cfs_array_alloc(int count, unsigned int size) } EXPORT_SYMBOL(cfs_array_alloc); +#ifdef HAVE_LIBCFS_VFREE_ATOMIC +#include /* * This is opencoding of vfree_atomic from Linux kernel added in 4.10 with - * minimum changes needed to work on older kernels too. + * minimum changes needed to work on some older kernels too. + * For RHEL6, just use vfree() directly since it is missing too much code. 
*/ #ifndef raw_cpu_ptr @@ -183,12 +185,12 @@ EXPORT_SYMBOL(cfs_array_alloc); #ifndef llist_for_each_safe #define llist_for_each_safe(pos, n, node) \ - for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) #endif struct vfree_deferred { - struct llist_head list; - struct work_struct wq; + struct llist_head list; + struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); @@ -230,3 +232,4 @@ void __exit exit_libcfs_vfree_atomic(void) { flush_scheduled_work(); } +#endif /* HAVE_LIBCFS_VFREE_ATOMIC */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c deleted file mode 100644 index 9786288cbad50..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_ptask.c +++ /dev/null @@ -1,478 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_UNDEFINED - -#include -#include - -/** - * This API based on Linux kernel padada API which is used to perform - * encryption and decryption on large numbers of packets without - * reordering those packets. - * - * It was adopted for general use in Lustre for parallelization of - * various functionality. - * - * The first step in using it is to set up a cfs_ptask structure to - * control of how this task are to be run: - * - * #include - * - * int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, - * void *cbdata, unsigned int flags, int cpu); - * - * The cbfunc function with cbdata argument will be called in the process - * of getting the task done. The cpu specifies which CPU will be used for - * the final callback when the task is done. - * - * The submission of task is done with: - * - * int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine); - * - * The task is submitted to the engine for execution. - * - * In order to wait for result of task execution you should call: - * - * int cfs_ptask_wait_for(struct cfs_ptask *ptask); - * - * The tasks with flag PTF_ORDERED are executed in parallel but complete - * into submission order. So, waiting for last ordered task you can be sure - * that all previous tasks were done before this task complete. - */ - -#ifndef HAVE_REINIT_COMPLETION -/** - * reinit_completion - reinitialize a completion structure - * @x: pointer to completion structure that is to be reinitialized - * - * This inline function should be used to reinitialize a completion - * structure so it can be reused. This is especially important after - * complete_all() is used. 
- */ -static inline void reinit_completion(struct completion *x) -{ - x->done = 0; -} -#endif - -#ifndef HAVE_CPUMASK_PRINT_TO_PAGEBUF -static inline void cpumap_print_to_pagebuf(bool unused, char *buf, - const struct cpumask *mask) -{ - cpulist_scnprintf(buf, PAGE_SIZE, mask); -} -#endif - -#ifdef CONFIG_PADATA -static void cfs_ptask_complete(struct padata_priv *padata) -{ - struct cfs_ptask *ptask = cfs_padata2ptask(padata); - - if (cfs_ptask_need_complete(ptask)) { - if (cfs_ptask_is_ordered(ptask)) - complete(&ptask->pt_completion); - } else if (cfs_ptask_is_autofree(ptask)) { - kfree(ptask); - } -} - -static void cfs_ptask_execute(struct padata_priv *padata) -{ - struct cfs_ptask *ptask = cfs_padata2ptask(padata); - bool bh_enabled = false; - - if (!cfs_ptask_is_atomic(ptask)) { - local_bh_enable(); - bh_enabled = true; - } - - if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - kthread_use_mm(ptask->pt_mm); - } - - if (ptask->pt_cbfunc != NULL) - ptask->pt_result = ptask->pt_cbfunc(ptask); - else - ptask->pt_result = -ENOSYS; - - if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - kthread_unuse_mm(ptask->pt_mm); - mmput(ptask->pt_mm); - ptask->pt_mm = NULL; - } - - if (cfs_ptask_need_complete(ptask) && !cfs_ptask_is_ordered(ptask)) - complete(&ptask->pt_completion); - - if (bh_enabled) - local_bh_disable(); - - padata_do_serial(padata); -} - -static int cfs_do_parallel(struct cfs_ptask_engine *engine, - struct padata_priv *padata) -{ - struct cfs_ptask *ptask = cfs_padata2ptask(padata); - int rc; - - if (cfs_ptask_need_complete(ptask)) - reinit_completion(&ptask->pt_completion); - - if (cfs_ptask_use_user_mm(ptask)) { - ptask->pt_mm = get_task_mm(current); - } - ptask->pt_result = -EINPROGRESS; - -retry: - rc = padata_do_parallel(engine->pte_pinst, padata, ptask->pt_cbcpu); - if (rc == -EBUSY && cfs_ptask_is_retry(ptask)) { - /* too many tasks already in queue */ - schedule_timeout_uninterruptible(1); - goto retry; - } - - if (rc) { - if (cfs_ptask_use_user_mm(ptask) && ptask->pt_mm != NULL) { - mmput(ptask->pt_mm); - ptask->pt_mm = NULL; - } - ptask->pt_result = rc; - } - - return rc; -} - -/** - * This function submit initialized task for async execution - * in engine with specified id. - */ -int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) -{ - struct padata_priv *padata = cfs_ptask2padata(ptask); - - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - memset(padata, 0, sizeof(*padata)); - - padata->parallel = cfs_ptask_execute; - padata->serial = cfs_ptask_complete; - - return cfs_do_parallel(engine, padata); -} - -#else /* !CONFIG_PADATA */ - -/** - * If CONFIG_PADATA is not defined this function just execute - * the initialized task in current thread. (emulate async execution) - */ -int cfs_ptask_submit(struct cfs_ptask *ptask, struct cfs_ptask_engine *engine) -{ - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - if (ptask->pt_cbfunc != NULL) - ptask->pt_result = ptask->pt_cbfunc(ptask); - else - ptask->pt_result = -ENOSYS; - - if (cfs_ptask_need_complete(ptask)) - complete(&ptask->pt_completion); - else if (cfs_ptask_is_autofree(ptask)) - kfree(ptask); - - return 0; -} -#endif /* CONFIG_PADATA */ - -EXPORT_SYMBOL(cfs_ptask_submit); - -/** - * This function waits when task complete async execution. - * The tasks with flag PTF_ORDERED are executed in parallel but completes - * into submission order. So, waiting for last ordered task you can be sure - * that all previous tasks were done before this task complete. 
- */ -int cfs_ptask_wait_for(struct cfs_ptask *ptask) -{ - if (!cfs_ptask_need_complete(ptask)) - return -EINVAL; - - wait_for_completion(&ptask->pt_completion); - - return 0; -} -EXPORT_SYMBOL(cfs_ptask_wait_for); - -/** - * This function initialize internal members of task and prepare it for - * async execution. - */ -int cfs_ptask_init(struct cfs_ptask *ptask, cfs_ptask_cb_t cbfunc, void *cbdata, - unsigned int flags, int cpu) -{ - memset(ptask, 0, sizeof(*ptask)); - - ptask->pt_flags = flags; - ptask->pt_cbcpu = cpu; - ptask->pt_mm = NULL; /* will be set in cfs_do_parallel() */ - ptask->pt_cbfunc = cbfunc; - ptask->pt_cbdata = cbdata; - ptask->pt_result = -EAGAIN; - - if (cfs_ptask_need_complete(ptask)) { - if (cfs_ptask_is_autofree(ptask)) - return -EINVAL; - - init_completion(&ptask->pt_completion); - } - - if (cfs_ptask_is_atomic(ptask) && cfs_ptask_use_user_mm(ptask)) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL(cfs_ptask_init); - -/** - * This function set the mask of allowed CPUs for parallel execution - * for engine with specified id. - */ -int cfs_ptengine_set_cpumask(struct cfs_ptask_engine *engine, - const struct cpumask *cpumask) -{ - int rc = 0; - -#ifdef CONFIG_PADATA - cpumask_var_t serial_mask; - cpumask_var_t parallel_mask; - - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - if (!alloc_cpumask_var(&serial_mask, GFP_KERNEL)) - return -ENOMEM; - - if (!alloc_cpumask_var(¶llel_mask, GFP_KERNEL)) { - free_cpumask_var(serial_mask); - return -ENOMEM; - } - - cpumask_copy(parallel_mask, cpumask); - cpumask_copy(serial_mask, cpu_online_mask); - - rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_PARALLEL, - parallel_mask); - free_cpumask_var(parallel_mask); - if (rc) - goto out_failed_mask; - - rc = padata_set_cpumask(engine->pte_pinst, PADATA_CPU_SERIAL, - serial_mask); -out_failed_mask: - free_cpumask_var(serial_mask); -#endif /* CONFIG_PADATA */ - - return rc; -} -EXPORT_SYMBOL(cfs_ptengine_set_cpumask); - -/** - * This function returns the count of allowed CPUs for parallel execution - * for engine with specified id. 
- */ -int cfs_ptengine_weight(struct cfs_ptask_engine *engine) -{ - if (IS_ERR_OR_NULL(engine)) - return -EINVAL; - - return engine->pte_weight; -} -EXPORT_SYMBOL(cfs_ptengine_weight); - -#ifdef CONFIG_PADATA -static int cfs_ptask_cpumask_change_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct padata_cpumask *padata_cpumask = data; - struct cfs_ptask_engine *engine; - - engine = container_of(self, struct cfs_ptask_engine, pte_notifier); - - if (val & PADATA_CPU_PARALLEL) - engine->pte_weight = cpumask_weight(padata_cpumask->pcpu); - - return 0; -} - -static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, - const char *name, - const struct cpumask *cpumask) -{ - cpumask_var_t all_mask; - cpumask_var_t par_mask; - unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE; - int rc; - - get_online_cpus(); - - engine->pte_wq = alloc_workqueue(name, wq_flags, 1); - if (engine->pte_wq == NULL) - GOTO(err, rc = -ENOMEM); - - if (!alloc_cpumask_var(&all_mask, GFP_KERNEL)) - GOTO(err_destroy_workqueue, rc = -ENOMEM); - - if (!alloc_cpumask_var(&par_mask, GFP_KERNEL)) - GOTO(err_free_all_mask, rc = -ENOMEM); - - cpumask_copy(par_mask, cpumask); - if (cpumask_empty(par_mask) || - cpumask_equal(par_mask, cpu_online_mask)) { - cpumask_copy(all_mask, cpu_online_mask); - cpumask_clear(par_mask); - while (!cpumask_empty(all_mask)) { - int cpu = cpumask_first(all_mask); - - cpumask_set_cpu(cpu, par_mask); - cpumask_andnot(all_mask, all_mask, - topology_sibling_cpumask(cpu)); - } - } - - cpumask_copy(all_mask, cpu_online_mask); - - { - char *pa_mask_buff, *cb_mask_buff; - - pa_mask_buff = (char *)__get_free_page(GFP_KERNEL); - if (pa_mask_buff == NULL) - GOTO(err_free_par_mask, rc = -ENOMEM); - - cb_mask_buff = (char *)__get_free_page(GFP_KERNEL); - if (cb_mask_buff == NULL) { - free_page((unsigned long)pa_mask_buff); - GOTO(err_free_par_mask, rc = -ENOMEM); - } - - cpumap_print_to_pagebuf(true, pa_mask_buff, par_mask); - pa_mask_buff[PAGE_SIZE - 1] = '\0'; - cpumap_print_to_pagebuf(true, cb_mask_buff, all_mask); - cb_mask_buff[PAGE_SIZE - 1] = '\0'; - - CDEBUG(D_INFO, "%s weight=%u plist='%s' cblist='%s'\n", - name, cpumask_weight(par_mask), - pa_mask_buff, cb_mask_buff); - - free_page((unsigned long)cb_mask_buff); - free_page((unsigned long)pa_mask_buff); - } - - engine->pte_weight = cpumask_weight(par_mask); - engine->pte_pinst = padata_alloc_possible(engine->pte_wq); - if (engine->pte_pinst == NULL) - GOTO(err_free_par_mask, rc = -ENOMEM); - - engine->pte_notifier.notifier_call = cfs_ptask_cpumask_change_notify; - rc = padata_register_cpumask_notifier(engine->pte_pinst, - &engine->pte_notifier); - if (rc) - GOTO(err_free_padata, rc); - - rc = cfs_ptengine_set_cpumask(engine, par_mask); - if (rc) - GOTO(err_unregister, rc); - - rc = padata_start(engine->pte_pinst); - if (rc) - GOTO(err_unregister, rc); - - free_cpumask_var(par_mask); - free_cpumask_var(all_mask); - - put_online_cpus(); - return 0; - -err_unregister: - padata_unregister_cpumask_notifier(engine->pte_pinst, - &engine->pte_notifier); -err_free_padata: - padata_free(engine->pte_pinst); -err_free_par_mask: - free_cpumask_var(par_mask); -err_free_all_mask: - free_cpumask_var(all_mask); -err_destroy_workqueue: - destroy_workqueue(engine->pte_wq); -err: - put_online_cpus(); - return rc; -} - -static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) -{ - padata_stop(engine->pte_pinst); - padata_unregister_cpumask_notifier(engine->pte_pinst, - &engine->pte_notifier); - 
padata_free(engine->pte_pinst); - destroy_workqueue(engine->pte_wq); -} - -#else /* !CONFIG_PADATA */ - -static int cfs_ptengine_padata_init(struct cfs_ptask_engine *engine, - const char *name, - const struct cpumask *cpumask) -{ - engine->pte_weight = 1; - - return 0; -} - -static void cfs_ptengine_padata_fini(struct cfs_ptask_engine *engine) -{ -} -#endif /* CONFIG_PADATA */ - -struct cfs_ptask_engine *cfs_ptengine_init(const char *name, - const struct cpumask *cpumask) -{ - struct cfs_ptask_engine *engine; - int rc; - - engine = kzalloc(sizeof(*engine), GFP_KERNEL); - if (engine == NULL) - GOTO(err, rc = -ENOMEM); - - rc = cfs_ptengine_padata_init(engine, name, cpumask); - if (rc) - GOTO(err_free_engine, rc); - - return engine; - -err_free_engine: - kfree(engine); -err: - return ERR_PTR(rc); -} -EXPORT_SYMBOL(cfs_ptengine_init); - -void cfs_ptengine_fini(struct cfs_ptask_engine *engine) -{ - if (IS_ERR_OR_NULL(engine)) - return; - - cfs_ptengine_padata_fini(engine); - kfree(engine); -} -EXPORT_SYMBOL(cfs_ptengine_fini); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c index 04e1dd56dd430..b460df3c4d9bc 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/libcfs_string.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,9 @@ * Author: Nathan Rutman */ +#include #include +#include char *cfs_strrstr(const char *haystack, const char *needle) { @@ -252,17 +254,47 @@ int cfs_str2num_check(char *str, int nob, unsigned *num, unsigned min, unsigned max) { - char *endp; - - *num = simple_strtoul(str, &endp, 0); - if (endp == str) - return 0; + bool all_numbers = true; + char *endp, cache; + int len; + int rc; + + endp = strim(str); + /** + * kstrouint can only handle strings composed + * of only numbers. We need to scan the string + * passed in for the first non-digit character + * and end the string at that location. If we + * don't find any non-digit character we still + * need to place a '\0' at position len since + * we are not interested in the rest of the + * string which is longer than len in size. + * After we are done the character at the + * position we placed '\0' must be restored. + */ + len = min((int)strlen(endp), nob); + for (; endp < str + len; endp++) { + if (!isxdigit(*endp) && *endp != '-' && + *endp != '+') { + all_numbers = false; + break; + } + } - for (; endp < str + nob; endp++) { - if (!isspace(*endp)) - return 0; + /* Eat trailing space */ + if (!all_numbers && isspace(*endp)) { + all_numbers = true; + endp--; } + cache = *endp; + *endp = '\0'; + + rc = kstrtouint(str, 0, num); + *endp = cache; + if (rc || !all_numbers) + return 0; + return (*num >= min && *num <= max); } EXPORT_SYMBOL(cfs_str2num_check); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c deleted file mode 100644 index b7d6193425b41..0000000000000 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-cpu.c +++ /dev/null @@ -1,1178 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
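/*
 * A usage sketch for the reworked cfs_str2num_check() above (strings and
 * bounds are hypothetical): the helper trims the input, rejects anything
 * after the digits, converts with kstrtouint() and range-checks the value,
 * returning non-zero only on success.  The buffer must be writable because
 * the helper temporarily NUL-terminates it in place:
 *
 *	char val_ok[] = "128", too_big[] = "512", junk[] = "128k";
 *	unsigned int val;
 *
 *	cfs_str2num_check(val_ok, 3, &val, 1, 256);   returns 1, val == 128
 *	cfs_str2num_check(too_big, 3, &val, 1, 256);  returns 0, out of range
 *	cfs_str2num_check(junk, 4, &val, 1, 256);     returns 0, trailing 'k'
 */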
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA - * - * GPL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2012, 2016, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Author: liang@whamcloud.com - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include - -#ifdef CONFIG_SMP - -/** - * modparam for setting number of partitions - * - * 0 : estimate best value based on cores or NUMA nodes - * 1 : disable multiple partitions - * >1 : specify number of partitions - */ -static int cpu_npartitions; -module_param(cpu_npartitions, int, 0444); -MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); - -/** - * modparam for setting CPU partitions patterns: - * - * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, - * number in bracket is processor ID (core or HT) - * - * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket - * are NUMA node ID, number before bracket is CPU partition ID. 
- * - * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology - * - * NB: If user specified cpu_pattern, cpu_npartitions will be ignored - */ -static char *cpu_pattern = "N"; -module_param(cpu_pattern, charp, 0444); -MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); - -void cfs_cpt_table_free(struct cfs_cpt_table *cptab) -{ - int i; - - if (cptab->ctb_cpu2cpt != NULL) { - LIBCFS_FREE(cptab->ctb_cpu2cpt, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - } - - if (cptab->ctb_node2cpt != NULL) { - LIBCFS_FREE(cptab->ctb_node2cpt, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - } - - for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - if (part->cpt_nodemask != NULL) { - LIBCFS_FREE(part->cpt_nodemask, - sizeof(*part->cpt_nodemask)); - } - - if (part->cpt_cpumask != NULL) - LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); - - if (part->cpt_distance) { - LIBCFS_FREE(part->cpt_distance, - cptab->ctb_nparts * - sizeof(part->cpt_distance[0])); - } - } - - if (cptab->ctb_parts != NULL) { - LIBCFS_FREE(cptab->ctb_parts, - cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); - } - - if (cptab->ctb_nodemask != NULL) - LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); - if (cptab->ctb_cpumask != NULL) - LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); - - LIBCFS_FREE(cptab, sizeof(*cptab)); -} -EXPORT_SYMBOL(cfs_cpt_table_free); - -struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt) -{ - struct cfs_cpt_table *cptab; - int i; - - LIBCFS_ALLOC(cptab, sizeof(*cptab)); - if (cptab == NULL) - return NULL; - - cptab->ctb_nparts = ncpt; - - LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); - LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); - - if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) - goto failed; - - LIBCFS_ALLOC(cptab->ctb_cpu2cpt, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - if (cptab->ctb_cpu2cpt == NULL) - goto failed; - - memset(cptab->ctb_cpu2cpt, -1, - nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0])); - - LIBCFS_ALLOC(cptab->ctb_node2cpt, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - if (cptab->ctb_node2cpt == NULL) - goto failed; - - memset(cptab->ctb_node2cpt, -1, - nr_node_ids * sizeof(cptab->ctb_node2cpt[0])); - - LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); - if (cptab->ctb_parts == NULL) - goto failed; - - for (i = 0; i < ncpt; i++) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); - if (!part->cpt_cpumask) - goto failed; - - LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); - if (!part->cpt_nodemask) - goto failed; - - LIBCFS_ALLOC(part->cpt_distance, - cptab->ctb_nparts * sizeof(part->cpt_distance[0])); - if (!part->cpt_distance) - goto failed; - } - - return cptab; - -failed: - cfs_cpt_table_free(cptab); - return NULL; -} -EXPORT_SYMBOL(cfs_cpt_table_alloc); - -int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) -{ - char *tmp = buf; - int rc; - int i; - int j; - - for (i = 0; i < cptab->ctb_nparts; i++) { - if (len <= 0) - goto err; - - rc = snprintf(tmp, len, "%d\t:", i); - len -= rc; - - if (len <= 0) - goto err; - - tmp += rc; - for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { - rc = snprintf(tmp, len, " %d", j); - len -= rc; - if (len <= 0) - goto err; - tmp += rc; - } - - *tmp = '\n'; - tmp++; - len--; - } - - return tmp - buf; - -err: - return -E2BIG; -} -EXPORT_SYMBOL(cfs_cpt_table_print); - -int 
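/*
 * For reference, the format produced above is one line per partition,
 * "<cpt>\t: <cpu> <cpu> ...", so a two-partition table over eight CPUs
 * (layout hypothetical) would read:
 *
 *	0	: 0 1 2 3
 *	1	: 4 5 6 7
 *
 * while cfs_cpt_distance_print() below emits "<cpt>\t: <cpt>:<distance>"
 * pairs per partition, matching the "0\t: 0:1\n" line printed by the
 * uniprocessor stub.
 */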
cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) -{ - char *tmp = buf; - int rc; - int i; - int j; - - for (i = 0; i < cptab->ctb_nparts; i++) { - if (len <= 0) - goto err; - - rc = snprintf(tmp, len, "%d\t:", i); - len -= rc; - - if (len <= 0) - goto err; - - tmp += rc; - for (j = 0; j < cptab->ctb_nparts; j++) { - rc = snprintf(tmp, len, " %d:%d", - j, cptab->ctb_parts[i].cpt_distance[j]); - len -= rc; - if (len <= 0) - goto err; - tmp += rc; - } - - *tmp = '\n'; - tmp++; - len--; - } - - return tmp - buf; - -err: - return -E2BIG; -} -EXPORT_SYMBOL(cfs_cpt_distance_print); - -int cfs_cpt_number(struct cfs_cpt_table *cptab) -{ - return cptab->ctb_nparts; -} -EXPORT_SYMBOL(cfs_cpt_number); - -int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cpumask_weight(cptab->ctb_cpumask) : - cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); -} -EXPORT_SYMBOL(cfs_cpt_weight); - -int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cpumask_any_and(cptab->ctb_cpumask, - cpu_online_mask) < nr_cpu_ids : - cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, - cpu_online_mask) < nr_cpu_ids; -} -EXPORT_SYMBOL(cfs_cpt_online); - -cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; -} -EXPORT_SYMBOL(cfs_cpt_cpumask); - -nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; -} -EXPORT_SYMBOL(cfs_cpt_nodemask); - -unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) -{ - LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts)); - LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts)); - - if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY) - return cptab->ctb_distance; - - return cptab->ctb_parts[cpt1].cpt_distance[cpt2]; -} -EXPORT_SYMBOL(cfs_cpt_distance); - -/* - * Calculate the maximum NUMA distance between all nodes in the - * from_mask and all nodes in the to_mask. 
- */ -static unsigned cfs_cpt_distance_calculate(nodemask_t *from_mask, - nodemask_t *to_mask) -{ - unsigned maximum; - unsigned distance; - int to; - int from; - - maximum = 0; - for_each_node_mask(from, *from_mask) { - for_each_node_mask(to, *to_mask) { - distance = node_distance(from, to); - if (maximum < distance) - maximum = distance; - } - } - return maximum; -} - -static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - cptab->ctb_cpu2cpt[cpu] = cpt; - - cpumask_set_cpu(cpu, cptab->ctb_cpumask); - cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); -} - -static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); - cpumask_clear_cpu(cpu, cptab->ctb_cpumask); - - cptab->ctb_cpu2cpt[cpu] = -1; -} - -static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - int cpt2; - struct cfs_cpu_partition *part; - struct cfs_cpu_partition *part2; - - if (!node_isset(node, *cptab->ctb_nodemask)) { - /* first time node is added to the CPT table */ - node_set(node, *cptab->ctb_nodemask); - cptab->ctb_node2cpt[node] = cpt; - cptab->ctb_distance = cfs_cpt_distance_calculate( - cptab->ctb_nodemask, - cptab->ctb_nodemask); - } - - part = &cptab->ctb_parts[cpt]; - if (!node_isset(node, *part->cpt_nodemask)) { - /* first time node is added to this CPT */ - node_set(node, *part->cpt_nodemask); - for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { - part2 = &cptab->ctb_parts[cpt2]; - part->cpt_distance[cpt2] = cfs_cpt_distance_calculate( - part->cpt_nodemask, - part2->cpt_nodemask); - part2->cpt_distance[cpt] = cfs_cpt_distance_calculate( - part2->cpt_nodemask, - part->cpt_nodemask); - } - } -} - -static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - int cpu; - int cpt2; - struct cfs_cpu_partition *part; - struct cfs_cpu_partition *part2; - - part = &cptab->ctb_parts[cpt]; - - for_each_cpu(cpu, part->cpt_cpumask) { - /* this CPT has other CPU belonging to this node? */ - if (cpu_to_node(cpu) == node) - break; - } - - if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) { - /* No more CPUs in the node for this CPT. */ - node_clear(node, *part->cpt_nodemask); - for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) { - part2 = &cptab->ctb_parts[cpt2]; - if (node_isset(node, *part2->cpt_nodemask)) - cptab->ctb_node2cpt[node] = cpt2; - part->cpt_distance[cpt2] = cfs_cpt_distance_calculate( - part->cpt_nodemask, - part2->cpt_nodemask); - part2->cpt_distance[cpt] = cfs_cpt_distance_calculate( - part2->cpt_nodemask, - part->cpt_nodemask); - } - } - - for_each_cpu(cpu, cptab->ctb_cpumask) { - /* this CPT-table has other CPUs belonging to this node? */ - if (cpu_to_node(cpu) == node) - break; - } - - if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) { - /* No more CPUs in the table for this node. 
*/ - node_clear(node, *cptab->ctb_nodemask); - cptab->ctb_node2cpt[node] = -1; - cptab->ctb_distance = - cfs_cpt_distance_calculate(cptab->ctb_nodemask, - cptab->ctb_nodemask); - } -} - -int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); - - if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { - CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); - return 0; - } - - if (cptab->ctb_cpu2cpt[cpu] != -1) { - CDEBUG(D_INFO, "CPU %d is already in partition %d\n", - cpu, cptab->ctb_cpu2cpt[cpu]); - return 0; - } - - if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { - CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); - return 0; - } - if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { - CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", - cpu, cptab->ctb_cpu2cpt[cpu]); - return 0; - } - - cfs_cpt_add_cpu(cptab, cpt, cpu); - cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_cpu); - -void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - if (cpu < 0 || cpu >= nr_cpu_ids) { - CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); - return; - } - - if (cpt == CFS_CPT_ANY) { - /* caller doesn't know the partition ID */ - cpt = cptab->ctb_cpu2cpt[cpu]; - if (cpt < 0) { /* not set in this CPT-table */ - CDEBUG(D_INFO, "Try to unset cpu %d which is " - "not in CPT-table %p\n", cpt, cptab); - return; - } - - } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { - CDEBUG(D_INFO, "CPU %d is not in CPU partition %d\n", cpu, cpt); - return; - } - - LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); - LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); - - cfs_cpt_del_cpu(cptab, cpt, cpu); - cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); -} -EXPORT_SYMBOL(cfs_cpt_unset_cpu); - -int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, - const cpumask_t *mask) -{ - int cpu; - - if (cpumask_weight(mask) == 0 || - cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { - CDEBUG(D_INFO, "No online CPU is found in the CPU mask " - "for CPU partition %d\n", cpt); - return 0; - } - - for_each_cpu(cpu, mask) { - cfs_cpt_add_cpu(cptab, cpt, cpu); - cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); - } - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_cpumask); - -void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, - const cpumask_t *mask) -{ - int cpu; - - for_each_cpu(cpu, mask) { - cfs_cpt_del_cpu(cptab, cpt, cpu); - cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); - } -} -EXPORT_SYMBOL(cfs_cpt_unset_cpumask); - -int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - const cpumask_t *mask; - int cpu; - - if (node < 0 || node >= nr_node_ids) { - CDEBUG(D_INFO, - "Invalid NUMA id %d for CPU partition %d\n", node, cpt); - return 0; - } - - mask = cpumask_of_node(node); - - for_each_cpu(cpu, mask) - cfs_cpt_add_cpu(cptab, cpt, cpu); - - cfs_cpt_add_node(cptab, cpt, node); - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_node); - -void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - const cpumask_t *mask; - int cpu; - - if (node < 0 || node >= nr_node_ids) { - CDEBUG(D_INFO, - "Invalid NUMA id %d for CPU partition %d\n", node, cpt); - return; - } - - mask = cpumask_of_node(node); - - for_each_cpu(cpu, mask) - cfs_cpt_del_cpu(cptab, cpt, cpu); - - cfs_cpt_del_node(cptab, cpt, node); -} -EXPORT_SYMBOL(cfs_cpt_unset_node); - -int 
cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, - const nodemask_t *mask) -{ - int node; - - for_each_node_mask(node, *mask) - cfs_cpt_set_node(cptab, cpt, node); - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_nodemask); - -void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, - const nodemask_t *mask) -{ - int node; - - for_each_node_mask(node, *mask) - cfs_cpt_unset_node(cptab, cpt, node); -} -EXPORT_SYMBOL(cfs_cpt_unset_nodemask); - -int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) -{ - nodemask_t *mask; - int weight; - int rotor; - int node = 0; - - /* convert CPU partition ID to HW node id */ - - if (cpt < 0 || cpt >= cptab->ctb_nparts) { - mask = cptab->ctb_nodemask; - rotor = cptab->ctb_spread_rotor++; - } else { - mask = cptab->ctb_parts[cpt].cpt_nodemask; - rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; - node = cptab->ctb_parts[cpt].cpt_node; - } - - weight = nodes_weight(*mask); - if (weight > 0) { - rotor %= weight; - - for_each_node_mask(node, *mask) { - if (rotor-- == 0) - return node; - } - } - - return node; -} -EXPORT_SYMBOL(cfs_cpt_spread_node); - -int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) -{ - int cpu = smp_processor_id(); - int cpt = cptab->ctb_cpu2cpt[cpu]; - - if (cpt < 0) { - if (!remap) - return cpt; - - /* don't return negative value for safety of upper layer, - * instead we shadow the unknown cpu to a valid partition ID */ - cpt = cpu % cptab->ctb_nparts; - } - - return cpt; -} -EXPORT_SYMBOL(cfs_cpt_current); - -int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) -{ - LASSERT(cpu >= 0 && cpu < nr_cpu_ids); - - return cptab->ctb_cpu2cpt[cpu]; -} -EXPORT_SYMBOL(cfs_cpt_of_cpu); - -int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node) -{ - if (node < 0 || node > nr_node_ids) - return CFS_CPT_ANY; - - return cptab->ctb_node2cpt[node]; -} -EXPORT_SYMBOL(cfs_cpt_of_node); - -int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) -{ - nodemask_t *nodemask; - cpumask_t *cpumask; - int cpu; - int rc; - - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - if (cpt == CFS_CPT_ANY) { - cpumask = cptab->ctb_cpumask; - nodemask = cptab->ctb_nodemask; - } else { - cpumask = cptab->ctb_parts[cpt].cpt_cpumask; - nodemask = cptab->ctb_parts[cpt].cpt_nodemask; - } - - if (!cpumask_intersects(cpumask, cpu_online_mask)) { - CDEBUG(D_INFO, "No online CPU found in CPU partition %d, did " - "someone do CPU hotplug on system? You might need to " - "reload Lustre modules to keep system working well.\n", - cpt); - return -ENODEV; - } - - for_each_online_cpu(cpu) { - if (cpumask_test_cpu(cpu, cpumask)) - continue; - - rc = set_cpus_allowed_ptr(current, cpumask); - set_mems_allowed(*nodemask); - if (rc == 0) - schedule(); /* switch to allowed CPU */ - - return rc; - } - - /* don't need to set affinity because all online CPUs are covered */ - return 0; -} -EXPORT_SYMBOL(cfs_cpt_bind); - -/** - * Choose max to \a number CPUs from \a node and set them in \a cpt. - * We always prefer to choose CPU in the same core/socket. 
- */ -static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, - cpumask_t *node_mask, int number) -{ - cpumask_t *socket_mask = NULL; - cpumask_t *core_mask = NULL; - int rc = 0; - int cpu; - int i; - - LASSERT(number > 0); - - if (number >= cpumask_weight(node_mask)) { - while (!cpumask_empty(node_mask)) { - cpu = cpumask_first(node_mask); - cpumask_clear_cpu(cpu, node_mask); - - if (!cpu_online(cpu)) - continue; - - rc = cfs_cpt_set_cpu(cptab, cpt, cpu); - if (!rc) - return -EINVAL; - } - return 0; - } - - /* allocate scratch buffer */ - LIBCFS_ALLOC(socket_mask, cpumask_size()); - LIBCFS_ALLOC(core_mask, cpumask_size()); - if (socket_mask == NULL || core_mask == NULL) { - rc = -ENOMEM; - goto out; - } - - while (!cpumask_empty(node_mask)) { - cpu = cpumask_first(node_mask); - - /* get cpumask for cores in the same socket */ - cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); - while (!cpumask_empty(socket_mask)) { - /* get cpumask for hts in the same core */ - cpumask_and(core_mask, - topology_sibling_cpumask(cpu), node_mask); - - for_each_cpu(i, core_mask) { - cpumask_clear_cpu(i, socket_mask); - cpumask_clear_cpu(i, node_mask); - - if (!cpu_online(i)) - continue; - - rc = cfs_cpt_set_cpu(cptab, cpt, i); - if (!rc) { - rc = -EINVAL; - goto out; - } - - if (--number == 0) - goto out; - } - cpu = cpumask_first(socket_mask); - } - } - -out: - if (core_mask != NULL) - LIBCFS_FREE(core_mask, cpumask_size()); - if (socket_mask != NULL) - LIBCFS_FREE(socket_mask, cpumask_size()); - return rc; -} - -#define CPT_WEIGHT_MIN 4 - -static int cfs_cpt_num_estimate(void) -{ - int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id())); - int ncpu = num_online_cpus(); - int ncpt = 1; - - if (ncpu > CPT_WEIGHT_MIN) - for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++); - /* nothing */ - -#if (BITS_PER_LONG == 32) - /* config many CPU partitions on 32-bit system could consume - * too much memory */ - ncpt = min(2, ncpt); -#endif - while (ncpu % ncpt != 0) - ncpt--; /* worst case is 1 */ - - return ncpt; -} - -static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) -{ - struct cfs_cpt_table *cptab = NULL; - cpumask_t *node_mask = NULL; - int cpt = 0; - int node; - int num; - int rem; - int rc = 0; - - num = cfs_cpt_num_estimate(); - if (ncpt <= 0) - ncpt = num; - - if (ncpt > num_online_cpus() || ncpt > 4 * num) { - CWARN("CPU partition number %d is larger than suggested " - "value (%d), your system may have performance " - "issue or run out of memory while under pressure\n", - ncpt, num); - } - - cptab = cfs_cpt_table_alloc(ncpt); - if (cptab == NULL) { - CERROR("Failed to allocate CPU map(%d)\n", ncpt); - rc = -ENOMEM; - goto failed; - } - - LIBCFS_ALLOC(node_mask, cpumask_size()); - if (node_mask == NULL) { - CERROR("Failed to allocate scratch cpumask\n"); - rc = -ENOMEM; - goto failed; - } - - num = num_online_cpus() / ncpt; - rem = num_online_cpus() % ncpt; - for_each_online_node(node) { - cpumask_copy(node_mask, cpumask_of_node(node)); - - while (cpt < ncpt && !cpumask_empty(node_mask)) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; - int ncpu = cpumask_weight(part->cpt_cpumask); - - rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, - num - ncpu); - if (rc < 0) { - rc = -EINVAL; - goto failed; - } - - ncpu = cpumask_weight(part->cpt_cpumask); - if (ncpu == num + !!(rem > 0)) { - cpt++; - rem--; - } - } - } - - LIBCFS_FREE(node_mask, cpumask_size()); - return cptab; - -failed: - CERROR("Failed (rc=%d) to setup CPU partition table with %d " - 
"partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", - rc, ncpt, num_online_nodes(), num_online_cpus()); - - if (node_mask != NULL) - LIBCFS_FREE(node_mask, cpumask_size()); - - if (cptab != NULL) - cfs_cpt_table_free(cptab); - - return ERR_PTR(rc); -} - -static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) -{ - struct cfs_cpt_table *cptab; - char *pattern_dup; - char *bracket; - char *str; - int node = 0; - int ncpt = 0; - int cpt = 0; - int high; - int rc; - int c; - int i; - - pattern_dup = kstrdup(pattern, GFP_KERNEL); - if (pattern_dup == NULL) { - CERROR("Failed to duplicate pattern '%s'\n", pattern); - return ERR_PTR(-ENOMEM); - } - - str = cfs_trimwhite(pattern_dup); - if (*str == 'n' || *str == 'N') { - str++; /* skip 'N' char */ - node = 1; /* NUMA pattern */ - if (*str == '\0') { - node = -1; - for_each_online_node(i) { - if (!cpumask_empty(cpumask_of_node(i))) - ncpt++; - } - if (ncpt == 1) { /* single NUMA node */ - kfree(pattern_dup); - return cfs_cpt_table_create(cpu_npartitions); - } - } - } - - if (ncpt == 0) { /* scanning bracket which is mark of partition */ - bracket = str; - while ((bracket = strchr(bracket, '['))) { - bracket++; - ncpt++; - } - } - - if (ncpt == 0 || - (node && ncpt > num_online_nodes()) || - (!node && ncpt > num_online_cpus())) { - CERROR("Invalid pattern '%s', or too many partitions %d\n", - pattern_dup, ncpt); - rc = -EINVAL; - goto err_free_str; - } - - cptab = cfs_cpt_table_alloc(ncpt); - if (cptab == NULL) { - CERROR("Failed to allocate CPU partition table\n"); - rc = -ENOMEM; - goto err_free_str; - } - - if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ - for_each_online_node(i) { - if (cpumask_empty(cpumask_of_node(i))) - continue; - - rc = cfs_cpt_set_node(cptab, cpt++, i); - if (!rc) { - rc = -EINVAL; - goto err_free_table; - } - } - kfree(pattern_dup); - return cptab; - } - - high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; - - for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) { - struct cfs_range_expr *range; - struct cfs_expr_list *el; - int n; - - bracket = strchr(str, '['); - if (bracket == NULL) { - if (*str != 0) { - CERROR("Invalid pattern '%s'\n", str); - rc = -EINVAL; - goto err_free_table; - } else if (c != ncpt) { - CERROR("Expect %d partitions but found %d\n", - ncpt, c); - rc = -EINVAL; - goto err_free_table; - } - break; - } - - if (sscanf(str, "%d%n", &cpt, &n) < 1) { - CERROR("Invalid CPU pattern '%s'\n", str); - rc = -EINVAL; - goto err_free_table; - } - - if (cpt < 0 || cpt >= ncpt) { - CERROR("Invalid partition id %d, total partitions %d\n", - cpt, ncpt); - rc = -EINVAL; - goto err_free_table; - } - - if (cfs_cpt_weight(cptab, cpt) != 0) { - CERROR("Partition %d has already been set.\n", cpt); - rc = -EPERM; - goto err_free_table; - } - - str = cfs_trimwhite(str + n); - if (str != bracket) { - CERROR("Invalid pattern '%s'\n", str); - rc = -EINVAL; - goto err_free_table; - } - - bracket = strchr(str, ']'); - if (bracket == NULL) { - CERROR("Missing right bracket for partition " - "%d in '%s'\n", cpt, str); - rc = -EINVAL; - goto err_free_table; - } - - rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high, - &el); - if (rc) { - CERROR("Can't parse number range in '%s'\n", str); - rc = -ERANGE; - goto err_free_table; - } - - list_for_each_entry(range, &el->el_exprs, re_link) { - for (i = range->re_lo; i <= range->re_hi; i++) { - if ((i - range->re_lo) % range->re_stride != 0) - continue; - - rc = node ? 
cfs_cpt_set_node(cptab, cpt, i) - : cfs_cpt_set_cpu(cptab, cpt, i); - if (!rc) { - cfs_expr_list_free(el); - rc = -EINVAL; - goto err_free_table; - } - } - } - - cfs_expr_list_free(el); - - if (!cfs_cpt_online(cptab, cpt)) { - CERROR("No online CPU is found on partition %d\n", cpt); - rc = -ENODEV; - goto err_free_table; - } - - str = cfs_trimwhite(bracket + 1); - } - - kfree(pattern_dup); - return cptab; - -err_free_table: - cfs_cpt_table_free(cptab); -err_free_str: - kfree(pattern_dup); - return ERR_PTR(rc); -} - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE -static enum cpuhp_state lustre_cpu_online; - -static int cfs_cpu_online(unsigned int cpu) -{ - return 0; -} -#endif - -static int cfs_cpu_dead(unsigned int cpu) -{ - bool warn; - - /* if all HTs in a core are offline, it may break affinity */ - warn = cpumask_any_and(topology_sibling_cpumask(cpu), - cpu_online_mask) >= nr_cpu_ids; - CDEBUG(warn ? D_WARNING : D_INFO, - "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", - cpu); - return 0; -} - -#ifndef HAVE_HOTPLUG_STATE_MACHINE -static int cfs_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - default: - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) { - CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n", - cpu, action); - break; - } - - cfs_cpu_dead(cpu); - } - - return NOTIFY_OK; -} - -static struct notifier_block cfs_cpu_notifier = { - .notifier_call = cfs_cpu_notify, - .priority = 0 -}; -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ - -void cfs_cpu_fini(void) -{ - if (!IS_ERR_OR_NULL(cfs_cpt_table)) - cfs_cpt_table_free(cfs_cpt_table); - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE - if (lustre_cpu_online > 0) - cpuhp_remove_state_nocalls(lustre_cpu_online); - cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); -#else - unregister_hotcpu_notifier(&cfs_cpu_notifier); -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ -} - -int cfs_cpu_init(void) -{ - int ret = -EINVAL; - - LASSERT(!cfs_cpt_table); - -#ifdef CONFIG_HOTPLUG_CPU -#ifdef HAVE_HOTPLUG_STATE_MACHINE - ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, - "fs/lustre/cfe:dead", NULL, - cfs_cpu_dead); - if (ret < 0) - goto failed; - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "fs/lustre/cfe:online", - cfs_cpu_online, NULL); - if (ret < 0) - goto failed; - lustre_cpu_online = ret; -#else - register_hotcpu_notifier(&cfs_cpu_notifier); -#endif /* !HAVE_HOTPLUG_STATE_MACHINE */ -#endif /* CONFIG_HOTPLUG_CPU */ - ret = -EINVAL; - - get_online_cpus(); - if (*cpu_pattern != 0) { - cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); - if (IS_ERR(cfs_cpt_table)) { - CERROR("Failed to create cptab from pattern '%s'\n", - cpu_pattern); - ret = PTR_ERR(cfs_cpt_table); - goto failed; - } - - } else { - cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); - if (IS_ERR(cfs_cpt_table)) { - CERROR("Failed to create cptab with npartitions %d\n", - cpu_npartitions); - ret = PTR_ERR(cfs_cpt_table); - goto failed; - } - } - put_online_cpus(); - - LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n", - num_online_nodes(), num_online_cpus(), - cfs_cpt_number(cfs_cpt_table)); - return 0; - -failed: - put_online_cpus(); - cfs_cpu_fini(); - return ret; -} - -#endif diff --git 
a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c index 0f507d555e603..7a19a5803ee8c 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-adler.c @@ -116,7 +116,9 @@ static struct shash_alg alg = { .cra_name = "adler32", .cra_driver_name = "adler32-zlib", .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c index c20e5e9a8194b..c794e670ecfd9 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32.c @@ -65,6 +65,7 @@ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; + *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -128,7 +129,9 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-table", .cra_priority = 100, +#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c index 5262f071b8a7a..566ba882ede82 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -63,6 +63,7 @@ static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; + *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -131,7 +132,9 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-pclmul", .cra_priority = 150, +#ifdef CRYPTO_ALG_OPTIONAL_KEY .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c index 4ad3b7c310037..8d4cb640681f8 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -102,6 +102,7 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, if (keylen != sizeof(u32)) return -EINVAL; + *mctx = le32_to_cpup((__le32 *)key); return 0; } @@ -167,6 +168,9 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-pclmul", .cra_priority = 200, +#ifdef CRYPTO_ALG_OPTIONAL_KEY + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, +#endif .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = NULL, diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c index 1991a86a49598..dce1734a4d500 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-crypto.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -77,13 +78,27 @@ static int 
cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, int err = 0; *type = cfs_crypto_hash_type(hash_alg); - - if (*type == NULL) { + if (!*type) { CWARN("Unsupported hash algorithm id = %d, max id is %d\n", hash_alg, CFS_HASH_ALG_MAX); return -EINVAL; } - tfm = crypto_alloc_ahash((*type)->cht_name, 0, CRYPTO_ALG_ASYNC); + + /* Keys are only supported for the hmac version */ + if (key && key_len > 0) { + char *algo_name; + + algo_name = kasprintf(GFP_KERNEL, "hmac(%s)", + (*type)->cht_name); + if (!algo_name) + return -ENOMEM; + + tfm = crypto_alloc_ahash(algo_name, 0, CRYPTO_ALG_ASYNC); + kfree(algo_name); + } else { + tfm = crypto_alloc_ahash((*type)->cht_name, 0, + CRYPTO_ALG_ASYNC); + } if (IS_ERR(tfm)) { CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", (*type)->cht_name); @@ -94,8 +109,7 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, if (!*req) { CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", (*type)->cht_name); - crypto_free_ahash(tfm); - return -ENOMEM; + GOTO(out_free_tfm, err = -ENOMEM); } ahash_request_set_callback(*req, 0, NULL, NULL); @@ -106,12 +120,8 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_setkey(tfm, (unsigned char *)&((*type)->cht_key), (*type)->cht_size); - - if (err != 0) { - ahash_request_free(*req); - crypto_free_ahash(tfm); - return err; - } + if (err) + GOTO(out_free_req, err); CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), @@ -119,7 +129,9 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_init(*req); if (err) { +out_free_req: ahash_request_free(*req); +out_free_tfm: crypto_free_ahash(tfm); } return err; @@ -195,10 +207,10 @@ EXPORT_SYMBOL(cfs_crypto_hash_digest); * use default initial value * \param[in] key_len length of \a key in bytes * - * \retval pointer to descriptor of hash instance + * \retval pointer to ahash request * \retval ERR_PTR(errno) in case of error */ -struct cfs_crypto_hash_desc * +struct ahash_request * cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, unsigned char *key, unsigned int key_len) { @@ -209,14 +221,14 @@ struct cfs_crypto_hash_desc * err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); if (err) return ERR_PTR(err); - return (struct cfs_crypto_hash_desc *)req; + return req; } EXPORT_SYMBOL(cfs_crypto_hash_init); /** * Update hash digest computed on data within the given \a page * - * \param[in] hdesc hash state descriptor + * \param[in] req ahash request * \param[in] page data page on which to compute the hash * \param[in] offset offset within \a page at which to start hash * \param[in] len length of data on which to compute hash @@ -224,11 +236,10 @@ EXPORT_SYMBOL(cfs_crypto_hash_init); * \retval 0 for success * \retval negative errno on failure */ -int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, +int cfs_crypto_hash_update_page(struct ahash_request *req, struct page *page, unsigned int offset, unsigned int len) { - struct ahash_request *req = (void *)hdesc; struct scatterlist sl; sg_init_table(&sl, 1); @@ -242,17 +253,16 @@ EXPORT_SYMBOL(cfs_crypto_hash_update_page); /** * Update hash digest computed on the specified data * - * \param[in] hdesc hash state descriptor + * \param[in] req ahash request * \param[in] buf data buffer on which to compute the hash * \param[in] buf_len length of \buf on which to compute hash * * \retval 0 for success * \retval negative errno on failure */ -int 
cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, +int cfs_crypto_hash_update(struct ahash_request *req, const void *buf, unsigned int buf_len) { - struct ahash_request *req = (void *)hdesc; struct scatterlist sl; sg_init_one(&sl, (void *)buf, buf_len); @@ -265,7 +275,7 @@ EXPORT_SYMBOL(cfs_crypto_hash_update); /** * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor * - * \param[in] hdesc hash descriptor + * \param[in] req ahash request * \param[out] hash pointer to hash buffer to store hash digest * \param[in,out] hash_len pointer to hash buffer size, if \a hash == NULL * or hash_len == NULL only free \a hdesc instead @@ -275,10 +285,9 @@ EXPORT_SYMBOL(cfs_crypto_hash_update); * \retval -EOVERFLOW if hash_len is too small for the hash digest * \retval negative errno for other errors from lower layers */ -int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, +int cfs_crypto_hash_final(struct ahash_request *req, unsigned char *hash, unsigned int *hash_len) { - struct ahash_request *req = (void *)hdesc; int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); int err; @@ -313,6 +322,9 @@ EXPORT_SYMBOL(cfs_crypto_hash_final); * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and * is available through the cfs_crypto_hash_speed() function. * + * This function needs to stay the same as obd_t10_performance_test() so that + * the speeds are comparable. + * * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) * \param[in] buf data buffer on which to compute the hash * \param[in] buf_len length of \buf on which to compute hash @@ -340,23 +352,23 @@ static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), bcount = 0; time_before(jiffies, end) && err == 0; bcount++) { - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; int i; - hdesc = cfs_crypto_hash_init(hash_alg, NULL, 0); - if (IS_ERR(hdesc)) { - err = PTR_ERR(hdesc); + req = cfs_crypto_hash_init(hash_alg, NULL, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); break; } for (i = 0; i < buf_len / PAGE_SIZE; i++) { - err = cfs_crypto_hash_update_page(hdesc, page, 0, + err = cfs_crypto_hash_update_page(req, page, 0, PAGE_SIZE); if (err != 0) break; } - err = cfs_crypto_hash_final(hdesc, hash, &hash_len); + err = cfs_crypto_hash_final(req, hash, &hash_len); if (err != 0) break; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c index cd00d0ae5717f..799c40ea638ec 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-curproc.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,8 +37,12 @@ */ #include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif #include - +#include #include #include @@ -149,9 +153,7 @@ static int cfs_access_process_vm(struct task_struct *tsk, int bytes, rc, offset; void *maddr; -#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS) - rc = get_user_pages(tsk, mm, addr, 1, write ? FOLL_WRITE : 0, &page, &vma); -#elif defined(HAVE_GET_USER_PAGES_GUP_FLAGS) +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) rc = get_user_pages(addr, 1, write ? 
FOLL_WRITE : 0, &page, &vma); #elif defined(HAVE_GET_USER_PAGES_6ARG) rc = get_user_pages(addr, 1, write, 1, &page, &vma); @@ -254,15 +256,22 @@ int cfs_get_environ(const char *key, char *value, int *val_len) entry = env_start; entry_len = env_end - env_start; + CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); /* Key length + length of '=' */ if (entry_len > key_len + 1 && + entry[key_len] == '=' && !memcmp(entry, key, key_len)) { entry += key_len + 1; entry_len -= key_len + 1; - /* The 'value' buffer passed in is too small.*/ - if (entry_len >= *val_len) + + /* The 'value' buffer passed in is too small. + * Copy what fits, but return -EOVERFLOW. */ + if (entry_len >= *val_len) { + memcpy(value, entry, *val_len); + value[*val_len - 1] = 0; GOTO(out, rc = -EOVERFLOW); + } memcpy(value, entry, entry_len); *val_len = entry_len; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c index 048b2f34df5ba..5bb4f08ecefd7 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-debug.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,6 @@ #include #include -#include #include #include #include @@ -51,6 +50,9 @@ # define DEBUG_SUBSYSTEM S_LNET #include +#ifdef HAVE_PANIC_NOTIFIER_H +#include +#endif #include "tracefile.h" @@ -115,6 +117,28 @@ EXPORT_SYMBOL(lbug_with_loc); #ifdef CONFIG_STACKTRACE +#ifndef HAVE_SAVE_STACK_TRACE_TSK +#define save_stack_trace_tsk(tsk, trace) \ +do { \ + if (tsk == current) \ + save_stack_trace(trace); \ + else \ + pr_info("No stack, save_stack_trace_tsk() not exported\n"); \ +} while (0) +#endif + +static void cfs_print_stack_trace(unsigned long *entries, unsigned int nr) +{ + unsigned int i; + + /* Prefer %pB for backtraced symbolic names since it was added in: + * Linux v2.6.38-6557-g0f77a8d37825 + * vsprintf: Introduce %pB format specifier + */ + for (i = 0; i < nr; i++) + pr_info("[<0>] %pB\n", (void *)entries[i]); +} + #define MAX_ST_ENTRIES 100 static DEFINE_SPINLOCK(st_lock); @@ -130,11 +154,20 @@ typedef unsigned int (stack_trace_save_tsk_t)(struct task_struct *task, static stack_trace_save_tsk_t *task_dump_stack; #endif -static void libcfs_call_trace(struct task_struct *tsk) +void __init cfs_debug_init(void) { #ifdef CONFIG_ARCH_STACKWALK + task_dump_stack = (void *) + cfs_kallsyms_lookup_name("stack_trace_save_tsk"); + +#endif +} + +static void libcfs_call_trace(struct task_struct *tsk) +{ static unsigned long entries[MAX_ST_ENTRIES]; - unsigned int i, nr_entries; +#ifdef CONFIG_ARCH_STACKWALK + unsigned int nr_entries; if (!task_dump_stack) task_dump_stack = (stack_trace_save_tsk_t *) @@ -146,13 +179,11 @@ static void libcfs_call_trace(struct task_struct *tsk) pr_info("Call Trace TBD:\n"); if (task_dump_stack) { nr_entries = task_dump_stack(tsk, entries, MAX_ST_ENTRIES, 0); - for (i = 0; i < nr_entries; i++) - pr_info("[<0>] %pB\n", (void *)entries[i]); + cfs_print_stack_trace(entries, nr_entries); } spin_unlock(&st_lock); #else struct stack_trace trace; - static unsigned long entries[MAX_ST_ENTRIES]; trace.nr_entries = 0; trace.max_entries = MAX_ST_ENTRIES; @@ -164,11 +195,7 @@ static void libcfs_call_trace(struct task_struct *tsk) 
init_utsname()->release, init_utsname()->version); pr_info("Call Trace:\n"); save_stack_trace_tsk(tsk, &trace); -#ifdef HAVE_STACK_TRACE_PRINT - stack_trace_print(trace.entries, trace.nr_entries, 0); -#else - print_stack_trace(&trace, 0); -#endif + cfs_print_stack_trace(trace.entries, trace.nr_entries); spin_unlock(&st_lock); #endif } @@ -270,12 +297,6 @@ void libcfs_debug_dumpstack(struct task_struct *tsk) } EXPORT_SYMBOL(libcfs_debug_dumpstack); -struct task_struct *libcfs_current(void) -{ - CWARN("current task struct is %p\n", current); - return current; -} - static int panic_notifier(struct notifier_block *self, unsigned long unused1, void *unused2) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c new file mode 100644 index 0000000000000..e4e67c20cee5d --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-hash.c @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#ifdef HAVE_STRINGHASH +#include +#else +#include +#endif +#include + +#include + +/* Return the "hash_len" (hash and length) of a null-terminated string */ +/* The kernel equivalent is in fs/namei.c but for some strange reason + * RHEL7.5 stuck it in dax/super.c instead. This placement never existed + * upstream so to make life easier we just have the equavilent + */ +u64 cfs_hashlen_string(const void *salt, const char *name) +{ +#ifdef HAVE_FULL_NAME_HASH_3ARGS + unsigned long hash = init_name_hash(salt); +#else + unsigned long hash = init_name_hash(); +#endif + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(cfs_hashlen_string); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c index 839f9324ac5ca..7300af8018c69 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-module.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,7 +32,9 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include +#include #include static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c index 4b73ed6e79a93..2ee18be5e59a6 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-prim.c @@ -36,13 +36,18 @@ #include #include #include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif #include -#include #if defined(CONFIG_KGDB) #include #endif +#include + #ifndef HAVE_KTIME_GET_TS64 void ktime_get_ts64(struct timespec64 *ts) { @@ -97,17 +102,17 @@ time64_t ktime_get_seconds(void) EXPORT_SYMBOL(ktime_get_seconds); #endif /* HAVE_KTIME_GET_SECONDS */ -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -static char **cfs_lsm_names; +static int (*cfs_apply_workqueue_attrs_t)(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); -bool selinux_is_enabled(void) +int cfs_apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) { - if (cfs_lsm_names) - return !!strstr("selinux", *cfs_lsm_names); - return false; + if (cfs_apply_workqueue_attrs_t) + return cfs_apply_workqueue_attrs_t(wq, attrs); + return 0; } -EXPORT_SYMBOL(selinux_is_enabled); -#endif +EXPORT_SYMBOL_GPL(cfs_apply_workqueue_attrs); int cfs_kernel_write(struct file *filp, const void *buf, size_t count, loff_t *pos) @@ -127,6 +132,43 @@ int cfs_kernel_write(struct file *filp, const void *buf, size_t count, } EXPORT_SYMBOL(cfs_kernel_write); +#ifndef HAVE_KSET_FIND_OBJ +struct kobject *kset_find_obj(struct kset *kset, const char *name) +{ + struct kobject *ret = NULL; + struct kobject *k; + + spin_lock(&kset->list_lock); + + list_for_each_entry(k, &kset->list, entry) { + if (kobject_name(k) && !strcmp(kobject_name(k), name)) { + if (kref_get_unless_zero(&k->kref)) + ret = k; + break; + } + } + + spin_unlock(&kset->list_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kset_find_obj); +#endif + +#ifndef HAVE_KSTRTOBOOL_FROM_USER +int kstrtobool_from_user(const char __user *s, size_t count, bool *res) +{ + /* Longest string needed to differentiate, newline, terminator */ + char buf[4]; + + count = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, s, count)) + return -EFAULT; + buf[count] = '\0'; + return strtobool(buf, res); +} +EXPORT_SYMBOL(kstrtobool_from_user); +#endif /* !HAVE_KSTRTOBOOL_FROM_USER */ + sigset_t cfs_block_allsigs(void) { diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c index e0fd4c0de04f1..9685296266f04 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-tracefile.c @@ -33,6 +33,7 @@ #define DEBUG_SUBSYSTEM S_LNET #define LUSTRE_TRACEFILE_PRIVATE +#include #include #include "tracefile.h" diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c new file mode 100644 index 0000000000000..5843d808bc332 --- /dev/null +++ b/drivers/staging/lustrefsx/libcfs/libcfs/linux/linux-wait.c @@ -0,0 +1,115 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + +#ifndef HAVE_PREPARE_TO_WAIT_EVENT + +#define 
__add_wait_queue_entry_tail __add_wait_queue_tail + +long prepare_to_wait_event(wait_queue_head_t *wq_head, + wait_queue_entry_t *wq_entry, int state) +{ + unsigned long flags; + long ret = 0; + + spin_lock_irqsave(&wq_head->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same wq_head->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. + */ + list_del_init(&wq_entry->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); + else + __add_wait_queue(wq_head, wq_entry); + } + set_current_state(state); + } + spin_unlock_irqrestore(&wq_head->lock, flags); + + return ret; +} +EXPORT_SYMBOL(prepare_to_wait_event); +#endif /* !HAVE_PREPARE_TO_WAIT_EVENT */ + +#ifndef HAVE_WAIT_VAR_EVENT + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *__var_waitqueue(void *p) +{ + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(wait_queue_entry_t *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, + int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, +#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), +#else + .task_list = LIST_HEAD_INIT(wbq_entry->wq_entry.task_list), +#endif + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} +#endif /* ! HAVE_WAIT_VAR_EVENT */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/module.c b/drivers/staging/lustrefsx/libcfs/libcfs/module.c index f832a6fd02bce..08f5a1c1a5655 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/module.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/module.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,52 +46,21 @@ #include #include -#include +#include #include #define DEBUG_SUBSYSTEM S_LNET #include #include +#include #include #include "tracefile.h" -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *lnet_table_header; -#endif - -static DECLARE_RWSEM(ioctl_list_sem); -static LIST_HEAD(ioctl_list); - -int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) -{ - int rc = 0; - - down_write(&ioctl_list_sem); - if (!list_empty(&hand->item)) - rc = -EBUSY; - else - list_add_tail(&hand->item, &ioctl_list); - up_write(&ioctl_list_sem); - - return rc; -} -EXPORT_SYMBOL(libcfs_register_ioctl); - -int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) -{ - int rc = 0; - - down_write(&ioctl_list_sem); - if (list_empty(&hand->item)) - rc = -ENOENT; - else - list_del_init(&hand->item); - up_write(&ioctl_list_sem); +static struct dentry *lnet_debugfs_root; - return rc; -} -EXPORT_SYMBOL(libcfs_deregister_ioctl); +BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); +EXPORT_SYMBOL(libcfs_ioctl_list); int libcfs_ioctl(unsigned long cmd, void __user *uparam) { @@ -133,35 +102,27 @@ int libcfs_ioctl(unsigned long cmd, void __user *uparam) libcfs_debug_mark_buffer(data->ioc_inlbuf1); break; - default: { - struct libcfs_ioctl_handler *hand; - - err = -EINVAL; - down_read(&ioctl_list_sem); - list_for_each_entry(hand, &ioctl_list, item) { - err = hand->handle_ioctl(cmd, hdr); - if (err == -EINVAL) - continue; - - if (err == 0) { - if (copy_to_user(uparam, hdr, hdr->ioc_len)) - err = -EFAULT; - } - break; - } - up_read(&ioctl_list_sem); - break; } + default: + err = blocking_notifier_call_chain(&libcfs_ioctl_list, + cmd, hdr); + if (!(err & NOTIFY_STOP_MASK)) + /* No-one claimed the ioctl */ + err = -EINVAL; + else + err = notifier_to_errno(err); + if (copy_to_user(uparam, hdr, hdr->ioc_len) && !err) + err = -EFAULT; + break; } out: LIBCFS_FREE(hdr, hdr->ioc_len); RETURN(err); } -int -lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, loff_t pos, - void __user *buffer, int len)) +int lprocfs_call_handler(void *data, int write, loff_t *ppos, + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)) { int rc = handler(data, write, *ppos, buffer, *lenp); @@ -219,9 +180,8 @@ static int __proc_dobitmasks(void *data, int write, return rc; } -static int -proc_dobitmasks(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_dobitmasks(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_dobitmasks); @@ -239,9 +199,8 @@ static int __proc_dump_kernel(void *data, int write, return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); } -static int -proc_dump_kernel(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_dump_kernel(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_dump_kernel); @@ -263,156 +222,133 @@ static int __proc_daemon_file(void *data, int write, return cfs_trace_daemon_command_usrstr(buffer, nob); } -static int -proc_daemon_file(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_daemon_file(struct ctl_table *table, 
int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_daemon_file); } -static int __proc_debug_mb(void *data, int write, - loff_t pos, void __user *buffer, int nob) +static int libcfs_force_lbug(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { - if (!write) { - char tmpstr[32]; - int len = snprintf(tmpstr, sizeof(tmpstr), "%d", - cfs_trace_get_debug_mb()); - - if (pos >= len) - return 0; - - return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, - "\n"); - } - - return cfs_trace_set_debug_mb_usrstr(buffer, nob); -} - -static int -proc_debug_mb(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_debug_mb); + if (write) + LBUG(); + return 0; } -static int -proc_console_max_delay_cs(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_fail_loc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int rc, max_delay_cs; - struct ctl_table dummy = *table; - cfs_duration_t d; - - dummy.data = &max_delay_cs; - dummy.proc_handler = &proc_dointvec; + int rc; + long old_fail_loc = cfs_fail_loc; - if (!write) { /* read */ - max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - return rc; + if (!*lenp || *ppos) { + *lenp = 0; + return 0; } - /* write */ - max_delay_cs = 0; - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - if (rc < 0) - return rc; - if (max_delay_cs <= 0) - return -EINVAL; - - d = cfs_time_seconds(max_delay_cs) / 100; - if (d == 0 || d < libcfs_console_min_delay) - return -EINVAL; - libcfs_console_max_delay = d; - - return rc; -} - -static int -proc_console_min_delay_cs(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int rc, min_delay_cs; - struct ctl_table dummy = *table; - cfs_duration_t d; - - dummy.data = &min_delay_cs; - dummy.proc_handler = &proc_dointvec; + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); - if (!write) { /* read */ - min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - return rc; + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + rc = kstrtoul(kbuf, 0, &cfs_fail_loc); + kfree(kbuf); + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%lu\n", cfs_fail_loc); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } } - /* write */ - min_delay_cs = 0; - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - if (rc < 0) - return rc; - if (min_delay_cs <= 0) - return -EINVAL; - - d = cfs_time_seconds(min_delay_cs) / 100; - if (d == 0 || d > libcfs_console_max_delay) - return -EINVAL; - libcfs_console_min_delay = d; - + if (old_fail_loc != cfs_fail_loc) { + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } return rc; } -static int -proc_console_backoff(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +int debugfs_doint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int rc, backoff; - struct ctl_table dummy = *table; - - dummy.data = &backoff; - dummy.proc_handler = &proc_dointvec; + int rc; - if (!write) { /* read */ - backoff = libcfs_console_backoff; - rc = proc_dointvec(&dummy, 
write, buffer, lenp, ppos); - return rc; + if (!*lenp || *ppos) { + *lenp = 0; + return 0; } - /* write */ - backoff = 0; - rc = proc_dointvec(&dummy, write, buffer, lenp, ppos); - if (rc < 0) - return rc; - - if (backoff <= 0) - return -EINVAL; - - libcfs_console_backoff = backoff; + if (write) { + char *kbuf = memdup_user_nul(buffer, *lenp); + int val; + + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + rc = kstrtoint(kbuf, 0, &val); + kfree(kbuf); + if (!rc) { + if (table->extra1 && val < *(int *)table->extra1) + val = *(int *)table->extra1; + if (table->extra2 && val > *(int *)table->extra2) + val = *(int *)table->extra2; + *(int *)table->data = val; + } + *ppos += *lenp; + } else { + char kbuf[64/3+3]; + + rc = scnprintf(kbuf, sizeof(kbuf), "%u\n", *(int *)table->data); + if (copy_to_user(buffer, kbuf, rc)) + rc = -EFAULT; + else { + *lenp = rc; + *ppos += rc; + } + } return rc; } +EXPORT_SYMBOL(debugfs_doint); -static int -libcfs_force_lbug(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int debugfs_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write) - LBUG(); - return 0; -} - -static int -proc_fail_loc(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int rc; - long old_fail_loc = cfs_fail_loc; + int len = *lenp; + char *kbuf = table->data; - rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (old_fail_loc != cfs_fail_loc) - wake_up(&cfs_race_waitq); - return rc; + if (!len || *ppos) { + *lenp = 0; + return 0; + } + if (len > table->maxlen) + len = table->maxlen; + if (write) { + if (copy_from_user(kbuf, buffer, len)) + return -EFAULT; + memset(kbuf+len, 0, table->maxlen - len); + *ppos = *lenp; + } else { + len = strnlen(kbuf, len); + if (copy_to_user(buffer, kbuf, len)) + return -EFAULT; + if (len < *lenp) { + if (copy_to_user(buffer+len, "\n", 1)) + return -EFAULT; + len += 1; + } + *ppos += len; + *lenp -= len; + } + return len; } static int __proc_cpt_table(void *data, int write, @@ -456,9 +392,8 @@ static int __proc_cpt_table(void *data, int write, return rc; } -static int -proc_cpt_table(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_cpt_table); @@ -505,19 +440,14 @@ static int __proc_cpt_distance(void *data, int write, return rc; } -static int -proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_cpt_distance(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, __proc_cpt_distance); } static struct ctl_table lnet_table[] = { - /* - * NB No .strategy entries have been provided since sysctl(8) prefers - * to go via /proc for portability. 
- */ { INIT_CTL_NAME .procname = "debug", @@ -542,43 +472,6 @@ static struct ctl_table lnet_table[] = { .mode = 0644, .proc_handler = &proc_dobitmasks, }, - { - INIT_CTL_NAME - .procname = "console_ratelimit", - .data = &libcfs_console_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "console_max_delay_centisecs", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_console_max_delay_cs - }, - { - INIT_CTL_NAME - .procname = "console_min_delay_centisecs", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_console_min_delay_cs - }, - { - INIT_CTL_NAME - .procname = "console_backoff", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_console_backoff - }, - { - INIT_CTL_NAME - .procname = "debug_path", - .data = libcfs_debug_file_path_arr, - .maxlen = sizeof(libcfs_debug_file_path_arr), - .mode = 0644, - .proc_handler = &proc_dostring, - }, { INIT_CTL_NAME .procname = "cpu_partition_table", @@ -599,7 +492,7 @@ static struct ctl_table lnet_table[] = { .data = lnet_debug_log_upcall, .maxlen = sizeof(lnet_debug_log_upcall), .mode = 0644, - .proc_handler = &proc_dostring, + .proc_handler = &debugfs_dostring, }, { INIT_CTL_NAME @@ -607,7 +500,7 @@ static struct ctl_table lnet_table[] = { .data = (int *)&libcfs_kmemory.counter, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &debugfs_doint, }, { INIT_CTL_NAME @@ -615,15 +508,7 @@ static struct ctl_table lnet_table[] = { .data = &libcfs_catastrophe, .maxlen = sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - INIT_CTL_NAME - .procname = "panic_on_lbug", - .data = &libcfs_panic_on_lbug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &debugfs_doint, }, { INIT_CTL_NAME @@ -639,19 +524,13 @@ static struct ctl_table lnet_table[] = { .maxlen = 256, .proc_handler = &proc_daemon_file, }, - { - INIT_CTL_NAME - .procname = "debug_mb", - .mode = 0644, - .proc_handler = &proc_debug_mb, - }, { INIT_CTL_NAME .procname = "watchdog_ratelimit", .data = &libcfs_watchdog_ratelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &debugfs_doint, .extra1 = &min_watchdog_ratelimit, .extra2 = &max_watchdog_ratelimit, }, @@ -677,7 +556,7 @@ static struct ctl_table lnet_table[] = { .data = &cfs_fail_val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = &debugfs_doint }, { INIT_CTL_NAME @@ -685,55 +564,154 @@ static struct ctl_table lnet_table[] = { .data = &cfs_fail_err, .maxlen = sizeof(cfs_fail_err), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &debugfs_doint, }, { } }; -#ifdef CONFIG_SYSCTL -static struct ctl_table top_table[] = { - { - INIT_CTL_NAME - .procname = "lnet", - .mode = 0555, - .data = NULL, - .maxlen = 0, - .child = lnet_table, - }, - { .procname = NULL } +static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { + { .name = "console_ratelimit", + .target = "../../../module/libcfs/parameters/libcfs_console_ratelimit" }, + { .name = "debug_path", + .target = "../../../module/libcfs/parameters/libcfs_debug_file_path" }, + { .name = "panic_on_lbug", + .target = "../../../module/libcfs/parameters/libcfs_panic_on_lbug" }, + { .name = "console_backoff", + .target = "../../../module/libcfs/parameters/libcfs_console_backoff" }, + { .name = "debug_mb", + .target = "../../../module/libcfs/parameters/libcfs_debug_mb" }, + { 
.name = "console_min_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_min_delay" }, + { .name = "console_max_delay_centisecs", + .target = "../../../module/libcfs/parameters/libcfs_console_max_delay" }, + { .name = NULL }, }; -#endif -static int insert_proc(void) +static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header == NULL) - lnet_table_header = register_sysctl_table(top_table); -#endif - return 0; + struct ctl_table *table = filp->private_data; + ssize_t rc = -EINVAL; + + if (table) { + rc = table->proc_handler(table, 0, buf, &count, ppos); + if (!rc) + rc = count; + } + + return rc; +} + +static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ctl_table *table = filp->private_data; + ssize_t rc = -EINVAL; + + if (table) { + rc = table->proc_handler(table, 1, (void __user *)buf, &count, + ppos); + if (!rc) + rc = count; + } + + return rc; +} + +static const struct file_operations lnet_debugfs_file_operations_rw = { + .open = simple_open, + .read = lnet_debugfs_read, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_ro = { + .open = simple_open, + .read = lnet_debugfs_read, + .llseek = default_llseek, +}; + +static const struct file_operations lnet_debugfs_file_operations_wo = { + .open = simple_open, + .write = lnet_debugfs_write, + .llseek = default_llseek, +}; + +static const struct file_operations *lnet_debugfs_fops_select(umode_t mode) +{ + if (!(mode & S_IWUGO)) + return &lnet_debugfs_file_operations_ro; + + if (!(mode & S_IRUGO)) + return &lnet_debugfs_file_operations_wo; + + return &lnet_debugfs_file_operations_rw; } -static void remove_proc(void) +void lnet_insert_debugfs(struct ctl_table *table) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header != NULL) - unregister_sysctl_table(lnet_table_header); + if (!lnet_debugfs_root) + lnet_debugfs_root = debugfs_create_dir("lnet", NULL); - lnet_table_header = NULL; + /* Even if we cannot create, just ignore it altogether) */ + if (IS_ERR_OR_NULL(lnet_debugfs_root)) + return; + + /* We don't save the dentry returned in next two calls, because + * we don't call debugfs_remove() but rather remove_recursive() + */ + for (; table && table->procname; table++) + debugfs_create_file(table->procname, table->mode, + lnet_debugfs_root, table, + lnet_debugfs_fops_select(table->mode)); +} +EXPORT_SYMBOL_GPL(lnet_insert_debugfs); + +static void lnet_insert_debugfs_links( + const struct lnet_debugfs_symlink_def *symlinks) +{ + for (; symlinks && symlinks->name; symlinks++) + debugfs_create_symlink(symlinks->name, lnet_debugfs_root, + symlinks->target); +} + +void lnet_remove_debugfs(struct ctl_table *table) +{ +#ifndef HAVE_D_HASH_AND_LOOKUP + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + return; #endif + + for (; table && table->procname; table++) { + struct qstr dname = QSTR_INIT(table->procname, + strlen(table->procname)); + struct dentry *dentry; + + dentry = d_hash_and_lookup(lnet_debugfs_root, &dname); + debugfs_remove(dentry); + } } +EXPORT_SYMBOL_GPL(lnet_remove_debugfs); static int __init libcfs_init(void) { int rc; + +#ifndef HAVE_WAIT_VAR_EVENT + wait_bit_init(); +#endif init_libcfs_vfree_atomic(); + rc = libcfs_debug_init(5 * 1024 * 1024); if (rc < 0) { printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); return (rc); } + cfs_debug_init(); + 
rc = cfs_cpu_init(); if (rc != 0) goto cleanup_debug; @@ -765,17 +743,12 @@ static int __init libcfs_init(void) goto cleanup_wi; } - - rc = insert_proc(); - if (rc) { - CERROR("insert_proc: error %d\n", rc); - goto cleanup_crypto; - } + lnet_insert_debugfs(lnet_table); + if (!IS_ERR_OR_NULL(lnet_debugfs_root)) + lnet_insert_debugfs_links(lnet_debugfs_symlinks); CDEBUG (D_OTHER, "portals setup OK\n"); return 0; -cleanup_crypto: - cfs_crypto_unregister(); cleanup_wi: cfs_wi_shutdown(); cleanup_deregister: @@ -791,7 +764,11 @@ static void __exit libcfs_exit(void) { int rc; - remove_proc(); + /* Remove everthing */ + if (lnet_debugfs_root) { + debugfs_remove_recursive(lnet_debugfs_root); + lnet_debugfs_root = NULL; + } CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", atomic_read(&libcfs_kmemory)); @@ -816,6 +793,7 @@ static void __exit libcfs_exit(void) if (rc) printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", rc); + exit_libcfs_vfree_atomic(); } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c index ac762726fa5ce..f9d96d12f2555 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,8 +40,12 @@ #define LUSTRE_TRACEFILE_PRIVATE #include "tracefile.h" +#include +#include #include -#include +#include +#include +#include #include /* XXX move things up to the top, comment */ @@ -390,34 +394,34 @@ int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, return 1; } - if (cdls != NULL) { - if (libcfs_console_ratelimit && - cdls->cdls_next != 0 && /* not first time ever */ - !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { - /* skipping a console message */ - cdls->cdls_count++; - if (tcd != NULL) - cfs_trace_put_tcd(tcd); - return 1; - } + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + time_before(jiffies, cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } - if (cfs_time_after(cfs_time_current(), cdls->cdls_next + - libcfs_console_max_delay - + cfs_time_seconds(10))) { - /* last timeout was a long time ago */ - cdls->cdls_delay /= libcfs_console_backoff * 4; - } else { - cdls->cdls_delay *= libcfs_console_backoff; - } + if (time_after(jiffies, cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + } if (cdls->cdls_delay < libcfs_console_min_delay) cdls->cdls_delay = libcfs_console_min_delay; else if (cdls->cdls_delay > libcfs_console_max_delay) cdls->cdls_delay = libcfs_console_max_delay; - /* ensure cdls_next is never zero after it's been seen */ - cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; - } + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; + } if (tcd != NULL) { cfs_print_to_console(&header, mask, string_buf, needed, file, @@ -737,12 +741,8 @@ int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, if (usr_buffer_nob > knl_buffer_nob) 
return -EOVERFLOW; -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_from_user(knl_buffer, usr_buffer, usr_buffer_nob)) return -EFAULT; -#else - memcpy(knl_buffer, usr_buffer, usr_buffer_nob); -#endif nob = strnlen(knl_buffer, usr_buffer_nob); while (nob-- >= 0) /* strip trailing whitespace */ @@ -771,20 +771,12 @@ int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, if (nob > usr_buffer_nob) nob = usr_buffer_nob; -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer, knl_buffer, nob)) return -EFAULT; -#else - memcpy(usr_buffer, knl_buffer, nob); -#endif if (append != NULL && nob < usr_buffer_nob) { -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(usr_buffer + nob, append, 1)) return -EFAULT; -#else - memcpy(usr_buffer + nob, append, 1); -#endif nob++; } @@ -841,13 +833,16 @@ int cfs_trace_daemon_command(char *str) cfs_tracefile_write_lock(); memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); - } else if (strncmp(str, "size=", 5) == 0) { - cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); - if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) - cfs_tracefile_size = CFS_TRACEFILE_SIZE; - else - cfs_tracefile_size <<= 20; + } else if (strncmp(str, "size=", 5) == 0) { + unsigned long tmp; + rc = kstrtoul(str + 5, 10, &tmp); + if (!rc) { + if (tmp < 10 || tmp > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size = tmp << 20; + } } else if (strlen(str) >= sizeof(cfs_tracefile)) { rc = -ENAMETOOLONG; } else if (str[0] != '/') { @@ -920,18 +915,6 @@ int cfs_trace_set_debug_mb(int mb) return 0; } -int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob) -{ - char str[32]; - int rc; - - rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); - if (rc < 0) - return rc; - - return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); -} - int cfs_trace_get_debug_mb(void) { int i; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h index 2f5dc4f272783..c6ca34d4fb08e 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h +++ b/drivers/staging/lustrefsx/libcfs/libcfs/tracefile.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,7 +82,6 @@ int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_daemon_command(char *str); int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_set_debug_mb(int mb); -int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob); int cfs_trace_get_debug_mb(void); extern void libcfs_debug_dumplog_internal(void *arg); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c index c3d5556ab1557..f1676aa8f7a4d 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/l_ioctl.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001, 2002 Cluster File Systems, Inc. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
* * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -35,7 +35,7 @@ #include #include -#include +#include struct ioc_dev { const char *dev_name; diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c index 04a33bdef4c4c..246d420354217 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/nidstrings.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,8 +44,8 @@ #include #include -#include -#include +#include +#include #ifdef HAVE_NETDB_H # include #endif diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c index 9facce6bfa975..18fe84dc53f6a 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/param.c @@ -64,10 +64,10 @@ int cfs_get_param_paths(glob_t *paths, const char *pattern, ...) { - char path[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}/," - "/proc/{fs,sys}/{lnet,lustre}/}"; + char topdir[PATH_MAX] = "{/sys/{fs,kernel/debug}/{lnet,lustre}," + "/proc/{fs,sys}/{lnet,lustre}}"; static bool test_mounted = false; - size_t len = strlen(path); + char path[PATH_MAX]; char buf[PATH_MAX]; struct statfs statfsbuf; va_list args; @@ -127,9 +127,9 @@ cfs_get_param_paths(glob_t *paths, const char *pattern, ...) errno = EINVAL; return -1; } - len += rc; - if (strlcat(path, buf, sizeof(path)) != len) { + if (snprintf(path, sizeof(path), "%s/%s", topdir, buf) >= + sizeof(path)) { errno = E2BIG; return -1; } diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c index 9afdaa07f8883..861f97a3c51e6 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/parser.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001 Cluster File Systems, Inc. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
* * This file is part of Lustre, http://www.sf.net/projects/lustre/ * @@ -36,7 +36,7 @@ #include #include -#include +#include static command_t * top_level; /* Top level of commands, initialized by * InitParser */ @@ -768,40 +768,41 @@ int Parser_arg2int(const char *inp, long *result, int base) } /* Convert human readable size string to and int; "1k" -> 1000 */ -int Parser_size (int *sizep, char *str) { - int size; - char mod[32]; +int Parser_size(unsigned long *sizep, char *str) +{ + unsigned long size; + char mod[32]; - switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { - default: - return (-1); + switch (sscanf(str, "%lu%1[gGmMkK]", &size, mod)) { + default: + return -1; - case 1: - *sizep = size; - return (0); + case 1: + *sizep = size; + return 0; - case 2: - switch (*mod) { - case 'g': - case 'G': - *sizep = size << 30; - return (0); - - case 'm': - case 'M': - *sizep = size << 20; - return (0); - - case 'k': - case 'K': - *sizep = size << 10; - return (0); - - default: - *sizep = size; - return (0); - } - } + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return 0; + + case 'm': + case 'M': + *sizep = size << 20; + return 0; + + case 'k': + case 'K': + *sizep = size << 10; + return 0; + + default: + *sizep = size; + return 0; + } + } } /* Convert a string boolean to an int; "enable" -> 1 */ diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c index 9078500020bb9..2c1a24cacebb2 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/util/string.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,46 +41,10 @@ #include #include #include +#include +#include #include -/* - * According manual of strlcpy() and strlcat() the functions should return - * the total length of the string they tried to create. For strlcpy() that - * means the length of src. For strlcat() that means the initial length of - * dst plus the length of src. So, the function strnlen() cannot be used - * otherwise the return value will be wrong. - */ -#ifndef HAVE_STRLCPY /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcpy(char *dst, const char *src, size_t size) -{ - size_t ret = strlen(src); - - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - memcpy(dst, src, len); - dst[len] = '\0'; - } - return ret; -} -#endif - -#ifndef HAVE_STRLCAT /* not in glibc for RHEL 5.x, remove when obsolete */ -size_t strlcat(char *dst, const char *src, size_t size) -{ - size_t dsize = strlen(dst); - size_t len = strlen(src); - size_t ret = dsize + len; - - dst += dsize; - size -= dsize; - if (len >= size) - len = size-1; - memcpy(dst, src, len); - dst[len] = '\0'; - return ret; -} -#endif - /** * Extracts tokens from strings. * @@ -480,3 +444,83 @@ cfs_expr_list_free_list(struct list_head *list) cfs_expr_list_free(el); } } + +/** + * cfs_abs_path() - Get the absolute path of a relative path + * @request_path: The relative path to be resolved + * @resolved_path: Set to the resolved absolute path + * + * Returns the canonicalized absolute pathname. This function is a wrapper to + * realpath, but will work even if the target file does not exist. All + * directories in the path must exist. 
+ * + * Return: On success, 0 is returned and resolved_path points to an allocated + * string containing the absolute pathname. On error, errno is set + * appropriately, -errno is returned, and resolved_path points to NULL. + */ +int cfs_abs_path(const char *request_path, char **resolved_path) +{ + char buf[PATH_MAX + 1] = ""; + char *path; + char *ptr; + int len; + int rc = 0; + const char *fmt; + + path = malloc(sizeof(buf)); + if (path == NULL) + return -ENOMEM; + + if (request_path[0] != '/') { + if (getcwd(path, sizeof(buf) - 1) == NULL) { + rc = -errno; + goto out; + } + len = snprintf(buf, sizeof(buf), "%s/%s", path, request_path); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + /* skip duplicate leading '/' */ + len = snprintf(buf, sizeof(buf), "%s", + request_path + strspn(request_path, "/") - 1); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + + /* if filename not in root directory, call realpath for parent path */ + ptr = strrchr(buf, '/'); + if (ptr != buf) { + *ptr = '\0'; + if (path != realpath(buf, path)) { + rc = -errno; + goto out; + } + /* add the filename back */ + len = strlen(path); + fmt = (path[len - 1] == '/') ? "%s" : "/%s"; + len = snprintf(path + len, sizeof(buf) - len, fmt, ptr + 1); + if (len >= sizeof(buf) - len) { + rc = -ENAMETOOLONG; + goto out; + } + } else { + len = snprintf(path, sizeof(buf), "%s", buf); + if (len >= sizeof(buf)) { + rc = -ENAMETOOLONG; + goto out; + } + } + +out: + if (rc == 0) { + *resolved_path = path; + } else { + *resolved_path = NULL; + free(path); + } + return rc; +} diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c index f9e4de58b8ed2..dd451dd807bc1 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/watchdog.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,6 +40,10 @@ #include #include "tracefile.h" +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + struct lc_watchdog { spinlock_t lcw_lock; /* check or change lcw_list */ int lcw_refcount; /* must hold lcw_pending_timers_lock */ @@ -331,6 +335,7 @@ static void lcw_dispatch_stop(void) wake_up(&lcw_event_waitq); wait_for_completion(&lcw_stop_completion); + clear_bit(LCW_FLAG_STOP, &lcw_flags); CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c index fb4fd643ee0c0..f370ffab81677 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/workitem.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -313,10 +313,9 @@ cfs_wi_sched_destroy(struct cfs_wi_sched *sched) int i = 2; while (sched->ws_nthreads > 0) { - CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET, - "waiting for %d threads of WI sched[%s] to " - "terminate\n", sched->ws_nthreads, - sched->ws_name); + CDEBUG(is_power_of_2(++i / 20) ? 
D_WARNING : D_NET, + "waiting %us for %d %s worker threads to exit\n", + i / 20, sched->ws_nthreads, sched->ws_name); spin_unlock(&cfs_wi_data.wi_glock); set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/drivers/staging/lustrefsx/lnet/include/cyaml.h b/drivers/staging/lustrefsx/lnet/include/cyaml.h index c9c21c750a45d..1537dbd19ed0c 100644 --- a/drivers/staging/lustrefsx/lnet/include/cyaml.h +++ b/drivers/staging/lustrefsx/lnet/include/cyaml.h @@ -18,7 +18,7 @@ * * LGPL HEADER END * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. * * Author: * Amir Shehata diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/api.h b/drivers/staging/lustrefsx/lnet/include/lnet/api.h index 84c6bd0039632..1ce4a0056829d 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/api.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/api.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2016, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ # error This include is only for kernel use. #endif -#include +#include /** \defgroup lnet_init_fini Initialization and cleanup * The LNet must be properly initialized before any LNet calls can be made. @@ -198,7 +198,8 @@ int LNetGet(lnet_nid_t self, struct lnet_process_id target_in, unsigned int portal_in, __u64 match_bits_in, - unsigned int offset_in); + unsigned int offset_in, + bool recovery); /** @} lnet_data */ @@ -210,6 +211,7 @@ int LNetSetLazyPortal(int portal); int LNetClearLazyPortal(int portal); int LNetCtl(unsigned int cmd, void *arg); void LNetDebugPeer(struct lnet_process_id id); +int LNetGetPeerDiscoveryStatus(void); /** @} lnet_misc */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h index c905eda43b5b8..3115757aea5d6 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-lnet.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,11 +39,13 @@ #include -#include #include #include -#include #include +#include +#include +#include +#include extern struct lnet the_lnet; /* THE network */ @@ -69,6 +71,10 @@ extern struct lnet the_lnet; /* THE network */ /** exclusive lock */ #define LNET_LOCK_EX CFS_PERCPT_LOCK_EX +/* default timeout */ +#define DEFAULT_PEER_TIMEOUT 180 +#define LNET_LND_DEFAULT_TIMEOUT 5 + #ifdef HAVE_KERN_SOCK_GETNAME_2ARGS #define lnet_kernel_getpeername(sock, addr, addrlen) \ kernel_getpeername(sock, addr) @@ -81,6 +87,10 @@ extern struct lnet the_lnet; /* THE network */ kernel_getsockname(sock, addr, addrlen) #endif +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + static inline int lnet_is_route_alive(struct lnet_route *route) { if (!route->lr_gateway->lpni_alive) @@ -389,10 +399,40 @@ lnet_handle2me(struct lnet_handle_me *handle) return lh_entry(lh, struct lnet_me, me_lh); } +static inline void +lnet_peer_net_addref_locked(struct lnet_peer_net *lpn) +{ + atomic_inc(&lpn->lpn_refcount); +} + +extern void lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn); + +static inline void +lnet_peer_net_decref_locked(struct lnet_peer_net *lpn) +{ + if (atomic_dec_and_test(&lpn->lpn_refcount)) + lnet_destroy_peer_net_locked(lpn); +} + +static inline void +lnet_peer_addref_locked(struct lnet_peer *lp) +{ + atomic_inc(&lp->lp_refcount); +} + +extern void lnet_destroy_peer_locked(struct lnet_peer *lp); + +static inline void +lnet_peer_decref_locked(struct lnet_peer *lp) +{ + if (atomic_dec_and_test(&lp->lp_refcount)) + lnet_destroy_peer_locked(lp); +} + static inline void lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp) { - LASSERT (atomic_read(&lp->lpni_refcount) > 0); + LASSERT(atomic_read(&lp->lpni_refcount) > 0); atomic_inc(&lp->lpni_refcount); } @@ -401,9 +441,8 @@ extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp); static inline void lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp) { - LASSERT (atomic_read(&lp->lpni_refcount) > 0); - atomic_dec(&lp->lpni_refcount); - if (atomic_read(&lp->lpni_refcount) == 0) + LASSERT(atomic_read(&lp->lpni_refcount) > 0); + if (atomic_dec_and_test(&lp->lpni_refcount)) lnet_destroy_peer_ni_locked(lp); } @@ -465,6 +504,26 @@ lnet_msg_free(struct lnet_msg *msg) LIBCFS_FREE(msg, sizeof(*msg)); } +static inline struct lnet_rsp_tracker * +lnet_rspt_alloc(int cpt) +{ + struct lnet_rsp_tracker *rspt; + LIBCFS_ALLOC(rspt, sizeof(*rspt)); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc++; + lnet_net_unlock(cpt); + return rspt; +} + +static inline void +lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) +{ + LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->lct_health.lch_rst_alloc--; + lnet_net_unlock(cpt); +} + void lnet_ni_free(struct lnet_ni *ni); void lnet_net_free(struct lnet_net *net); @@ -502,19 +561,26 @@ extern struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); extern struct lnet_ni *lnet_nid2ni_addref(lnet_nid_t nid); extern struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); extern struct lnet_ni *lnet_net2ni_addref(__u32 net); -bool lnet_is_ni_healthy_locked(struct lnet_ni *ni); struct lnet_net *lnet_get_net_locked(__u32 net_id); int lnet_lib_init(void); void lnet_lib_exit(void); +extern unsigned lnet_transaction_timeout; +extern unsigned lnet_retry_count; extern unsigned int lnet_numa_range; +extern unsigned int lnet_health_sensitivity; +extern unsigned int 
lnet_recovery_interval; +extern unsigned int lnet_peer_discovery_disabled; +extern unsigned int lnet_drop_asym_route; extern int portal_rotor; +void lnet_mt_event_handler(struct lnet_event *event); + int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive, - cfs_time_t when); + time64_t when); void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - cfs_time_t when); + time64_t when); int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, unsigned int priority); int lnet_check_routes(void); @@ -527,24 +593,15 @@ struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev); struct lnet_ni *lnet_get_ni_idx_locked(int idx); -struct libcfs_ioctl_handler { - struct list_head item; - int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); -}; - -#define DECLARE_IOCTL_HANDLER(ident, func) \ - static struct libcfs_ioctl_handler ident = { \ - .item = LIST_HEAD_INIT(ident.item), \ - .handle_ioctl = func \ - } - -extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); -extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, struct libcfs_ioctl_hdr __user *uparam); +extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep, + struct lnet_process_id __user *ids); +extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all); +extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni); -void lnet_proc_init(void); -void lnet_proc_fini(void); +void lnet_router_debugfs_init(void); +void lnet_router_debugfs_fini(void); int lnet_rtrpools_alloc(int im_a_router); void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); int lnet_rtrpools_adjust(int tiny, int small, int large); @@ -564,7 +621,6 @@ int lnet_islocalnet(__u32 net); void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, unsigned int offset, unsigned int mlen); -void lnet_msg_detach_md(struct lnet_msg *msg, int status); void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); void lnet_msg_commit(struct lnet_msg *msg, int cpt); @@ -575,11 +631,15 @@ void lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, unsigned int offset, unsigned int len); int lnet_send(lnet_nid_t nid, struct lnet_msg *msg, lnet_nid_t rtr_nid); +int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, + void *user_ptr, struct lnet_handle_eq eqh, bool recovery); void lnet_return_tx_credits_locked(struct lnet_msg *msg); void lnet_return_rx_credits_locked(struct lnet_msg *msg); void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); +struct list_head **lnet_create_array_of_queues(void); + /* portals functions */ /* portals attributes */ static inline int @@ -644,16 +704,22 @@ void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen); +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg); struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *get_msg); void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, unsigned int len); +void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); +void lnet_clean_zombie_rstqs(void); void lnet_finalize(struct 
lnet_msg *msg, int rc); +bool lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus); +void lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni); void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, - unsigned int nob); + unsigned int nob, __u32 msg_type); void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); void lnet_recv_delayed_msg_list(struct list_head *head); @@ -662,6 +728,7 @@ void lnet_msg_container_cleanup(struct lnet_msg_container *container); void lnet_msg_containers_destroy(void); int lnet_msg_containers_create(void); +char *lnet_health_error2str(enum lnet_msg_hstatus hstatus); char *lnet_msgtyp2str(int type); void lnet_print_hdr(struct lnet_hdr *hdr); int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); @@ -672,7 +739,7 @@ int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); int lnet_fault_init(void); void lnet_fault_fini(void); -bool lnet_drop_rule_match(struct lnet_hdr *hdr); +bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus); int lnet_delay_rule_add(struct lnet_fault_attr *attr); int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); @@ -684,6 +751,7 @@ bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); /** @} lnet_fault_simulation */ +void lnet_counters_get_common(struct lnet_counters_common *common); void lnet_counters_get(struct lnet_counters *counters); void lnet_counters_reset(void); @@ -763,6 +831,7 @@ void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); struct page *lnet_kvaddr_to_page(unsigned long vaddr); int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); +unsigned int lnet_get_lnd_timeout(void); void lnet_register_lnd(struct lnet_lnd *lnd); void lnet_unregister_lnd(struct lnet_lnd *lnd); @@ -801,10 +870,45 @@ int lnet_sock_connect(struct socket **sockp, int *fatal, int lnet_peers_start_down(void); int lnet_peer_buffer_credits(struct lnet_net *net); -int lnet_router_checker_start(void); -void lnet_router_checker_stop(void); +int lnet_monitor_thr_start(void); +void lnet_monitor_thr_stop(void); + +bool lnet_router_checker_active(void); +void lnet_check_routers(void); +int lnet_router_pre_mt_start(void); +void lnet_router_post_mt_start(void); +void lnet_prune_rc_data(int wait_unlink); +void lnet_router_cleanup(void); void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net); -void lnet_swap_pinginfo(struct lnet_ping_info *info); +void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); + +int lnet_ping_info_validate(struct lnet_ping_info *pinfo); +struct lnet_ping_buffer *lnet_ping_buffer_alloc(int nnis, gfp_t gfp); +void lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf); + +static inline void lnet_ping_buffer_addref(struct lnet_ping_buffer *pbuf) +{ + atomic_inc(&pbuf->pb_refcnt); +} + +static inline void lnet_ping_buffer_decref(struct lnet_ping_buffer *pbuf) +{ + if (atomic_dec_and_test(&pbuf->pb_refcnt)) + lnet_ping_buffer_free(pbuf); +} + +static inline int lnet_ping_buffer_numref(struct lnet_ping_buffer *pbuf) +{ + return atomic_read(&pbuf->pb_refcnt); +} + +static inline int lnet_push_target_resize_needed(void) +{ + return the_lnet.ln_push_target->pb_nnis < the_lnet.ln_push_target_nnis; +} + +int lnet_push_target_resize(void); +void lnet_peer_push_event(struct lnet_event *ev); int lnet_parse_ip2nets(char **networksp, char *ip2nets); int lnet_parse_routes(char *route_str, int *im_a_router); @@ -819,94 +923,115 @@ __u32 lnet_get_dlc_seq_locked(void); struct 
lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, struct lnet_peer_ni *prev); -struct lnet_peer *lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt); -struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); +struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, + int cpt); struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); +struct lnet_peer *lnet_find_peer(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid); +int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); +int lnet_peer_discovery_start(void); +void lnet_peer_discovery_stop(void); +void lnet_push_update_to_peers(int force); void lnet_peer_tables_cleanup(struct lnet_net *net); void lnet_peer_uninit(void); int lnet_peer_tables_create(void); void lnet_debug_peer(lnet_nid_t nid); struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id); -bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, - struct lnet_ni *ni); -int lnet_add_peer_ni_to_peer(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); -int lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid); -int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, - bool *mr, - struct lnet_peer_ni_credit_info __user *peer_ni_info, - struct lnet_ioctl_element_stats __user *peer_ni_stats); +bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); +int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, char alivness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, __u32 *peer_tx_qnob); +int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats); - -static inline __u32 -lnet_get_num_peer_nis(struct lnet_peer *peer) +static inline struct lnet_peer_net * +lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id) { - struct lnet_peer_net *lpn; - struct lnet_peer_ni *lpni; - __u32 count = 0; + struct lnet_peer_net *peer_net; - list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_on_peer_list) - list_for_each_entry(lpni, &lpn->lpn_peer_nis, - lpni_on_peer_net_list) - count++; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } - return count; + return NULL; } -static inline bool -lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni) +static inline void +lnet_peer_set_alive(struct lnet_peer_ni *lp) { - return lpni->lpni_healthy; + lp->lpni_last_alive = ktime_get_seconds(); + lp->lpni_last_query = lp->lpni_last_alive; + if (!lp->lpni_alive) + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); } -static inline void -lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health) +static inline bool +lnet_peer_is_multi_rail(struct lnet_peer *lp) { - lpni->lpni_healthy = health; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + return true; + return false; } static inline bool -lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net) +lnet_peer_ni_is_configured(struct 
lnet_peer_ni *lpni) { - struct lnet_peer_ni *lpni; - - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_on_peer_net_list) { - if (lnet_is_peer_ni_healthy_locked(lpni)) - return true; - } - + if (lpni->lpni_peer_net->lpn_peer->lp_state & LNET_PEER_CONFIGURED) + return true; return false; } static inline bool -lnet_is_peer_healthy_locked(struct lnet_peer *peer) +lnet_peer_ni_is_primary(struct lnet_peer_ni *lpni) { - struct lnet_peer_net *peer_net; + return lpni->lpni_nid == lpni->lpni_peer_net->lpn_peer->lp_primary_nid; +} - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { - if (lnet_is_peer_net_healthy_locked(peer_net)) - return true; - } +bool lnet_peer_is_uptodate(struct lnet_peer *lp); +bool lnet_peer_is_uptodate_locked(struct lnet_peer *lp); +bool lnet_is_discovery_disabled(struct lnet_peer *lp); +static inline bool +lnet_peer_needs_push(struct lnet_peer *lp) +{ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) + return false; + if (lp->lp_state & LNET_PEER_FORCE_PUSH) + return true; + if (lp->lp_state & LNET_PEER_NO_DISCOVERY) + return false; + /* if discovery is not enabled then no need to push */ + if (lnet_peer_discovery_disabled) + return false; + if (lp->lp_node_seqno < atomic_read(&the_lnet.ln_ping_target_seqno)) + return true; return false; } static inline void -lnet_peer_set_alive(struct lnet_peer_ni *lp) +lnet_inc_healthv(atomic_t *healthv) { - lp->lpni_last_alive = lp->lpni_last_query = cfs_time_current(); - if (!lp->lpni_alive) - lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); + atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE); } +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type); + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type); + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats); + #endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h index 9b8af0e45a4c8..496a1b0fe0f93 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/lib-types.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,26 +44,56 @@ #include #include +#include #include -#include +#include +#include /* Max payload size */ -#ifndef CONFIG_LNET_MAX_PAYLOAD -# error "CONFIG_LNET_MAX_PAYLOAD must be defined in config.h" -#endif +#define LNET_MAX_PAYLOAD LNET_MTU -#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD -#if (LNET_MAX_PAYLOAD < LNET_MTU) -# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" -#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) -# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" -#endif +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. 
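+ * Local NI health (ni_healthv) is decremented on NI-specific send failures
+ * and incremented again on successful sends; lnet_inc_healthv() caps the
+ * increment at this maximum. Peer NIs track a corresponding lpni_healthv.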
+ */ +#define LNET_MAX_HEALTH_VALUE 1000 /* forward refs */ struct lnet_libmd; -typedef struct lnet_msg { +enum lnet_msg_hstatus { + LNET_MSG_STATUS_OK = 0, + LNET_MSG_STATUS_LOCAL_INTERRUPT, + LNET_MSG_STATUS_LOCAL_DROPPED, + LNET_MSG_STATUS_LOCAL_ABORTED, + LNET_MSG_STATUS_LOCAL_NO_ROUTE, + LNET_MSG_STATUS_LOCAL_ERROR, + LNET_MSG_STATUS_LOCAL_TIMEOUT, + LNET_MSG_STATUS_REMOTE_ERROR, + LNET_MSG_STATUS_REMOTE_DROPPED, + LNET_MSG_STATUS_REMOTE_TIMEOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* nid of next hop */ + lnet_nid_t rspt_next_hop_nid; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; +}; + +struct lnet_msg { struct list_head msg_activelist; struct list_head msg_list; /* Q for credits/MD */ @@ -74,6 +104,28 @@ typedef struct lnet_msg { lnet_nid_t msg_from; __u32 msg_type; + /* + * hold parameters in case message is with held due + * to discovery + */ + lnet_nid_t msg_src_nid_param; + lnet_nid_t msg_rtr_nid_param; + + /* + * Deadline for the message after which it will be finalized if it + * has not completed. + */ + ktime_t msg_deadline; + + /* The message health status. */ + enum lnet_msg_hstatus msg_health_status; + /* This is a recovery message */ + bool msg_recovery; + /* the number of times a transmission has been retried */ + int msg_retry_count; + /* flag to indicate that we do not want to resend this message */ + bool msg_no_resend; + /* committed for sending */ unsigned int msg_tx_committed:1; /* CPT # this message committed for sending */ @@ -120,17 +172,17 @@ typedef struct lnet_msg { struct lnet_event msg_ev; struct lnet_hdr msg_hdr; -} lnet_msg_t; +}; -typedef struct lnet_libhandle { +struct lnet_libhandle { struct list_head lh_hash_chain; __u64 lh_cookie; -} lnet_libhandle_t; +}; #define lh_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) -typedef struct lnet_eq { +struct lnet_eq { struct list_head eq_list; struct lnet_libhandle eq_lh; unsigned long eq_enq_seq; @@ -139,9 +191,9 @@ typedef struct lnet_eq { lnet_eq_handler_t eq_callback; struct lnet_event *eq_events; int **eq_refs; /* percpt refcount for EQ */ -} lnet_eq_t; +}; -typedef struct lnet_me { +struct lnet_me { struct list_head me_list; struct lnet_libhandle me_lh; struct lnet_process_id me_match_id; @@ -151,40 +203,41 @@ typedef struct lnet_me { __u64 me_ignore_bits; enum lnet_unlink me_unlink; struct lnet_libmd *me_md; -} lnet_me_t; - -typedef struct lnet_libmd { - struct list_head md_list; - struct lnet_libhandle md_lh; - struct lnet_me *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - unsigned int md_niov; /* # frags at end of struct */ - void *md_user_ptr; - struct lnet_eq *md_eq; - struct lnet_handle_md md_bulk_handle; +}; + +struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_rsp_tracker *md_rspt_ptr; + struct lnet_eq *md_eq; + struct lnet_handle_md md_bulk_handle; union { - struct 
kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; } md_iov; -} lnet_libmd_t; +}; #define LNET_MD_FLAG_ZOMBIE (1 << 0) #define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) #define LNET_MD_FLAG_ABORTED (1 << 2) -typedef struct lnet_test_peer { +struct lnet_test_peer { /* info about peers we are trying to fail */ struct list_head tp_list; /* ln_test_peers */ lnet_nid_t tp_nid; /* matching nid */ unsigned int tp_threshold; /* # failures to simulate */ -} lnet_test_peer_t; +}; #define LNET_COOKIE_TYPE_MD 1 #define LNET_COOKIE_TYPE_ME 2 @@ -195,7 +248,7 @@ typedef struct lnet_test_peer { struct lnet_ni; /* forward ref */ struct socket; -typedef struct lnet_lnd { +struct lnet_lnd { /* fields managed by portals */ struct list_head lnd_list; /* stash in the LND table */ int lnd_refcount; /* # active instances */ @@ -249,17 +302,11 @@ typedef struct lnet_lnd { void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); /* query of peer aliveness */ - void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time64_t *when); /* accept a new connection */ int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); -} lnd_t; - -typedef struct lnet_ni_status { - lnet_nid_t ns_nid; - __u32 ns_status; - __u32 ns_unused; -} WIRE_ATTR lnet_ni_status_t; +}; struct lnet_tx_queue { int tq_credits; /* # tx credits free */ @@ -280,22 +327,51 @@ enum lnet_net_state { }; enum lnet_ni_state { - /* set when NI block is allocated */ + /* initial state when NI is created */ LNET_NI_STATE_INIT = 0, - /* set when NI is started successfully */ + /* set when NI is brought up */ LNET_NI_STATE_ACTIVE, - /* set when LND notifies NI failed */ - LNET_NI_STATE_FAILED, - /* set when LND notifies NI degraded */ - LNET_NI_STATE_DEGRADED, - /* set when shuttding down NI */ - LNET_NI_STATE_DELETING + /* set when NI is being shutdown */ + LNET_NI_STATE_DELETING, +}; + +#define LNET_NI_RECOVERY_PENDING BIT(0) +#define LNET_NI_RECOVERY_FAILED BIT(1) + +enum lnet_stats_type { + LNET_STATS_TYPE_SEND = 0, + LNET_STATS_TYPE_RECV, + LNET_STATS_TYPE_DROP +}; + +struct lnet_comm_count { + atomic_t co_get_count; + atomic_t co_put_count; + atomic_t co_reply_count; + atomic_t co_ack_count; + atomic_t co_hello_count; }; struct lnet_element_stats { - atomic_t send_count; - atomic_t recv_count; - atomic_t drop_count; + struct lnet_comm_count el_send_stats; + struct lnet_comm_count el_recv_stats; + struct lnet_comm_count el_drop_stats; +}; + +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; }; struct lnet_net { @@ -342,12 +418,15 @@ struct lnet_net { enum lnet_net_state net_state; }; -typedef struct lnet_ni { +struct lnet_ni { /* chain on the lnet_net structure */ struct list_head ni_netlist; - /* chain on net_ni_cpt */ - struct list_head ni_cptlist; + /* chain on the recovery queue */ + struct list_head ni_recovery; + + /* MD handle for recovery ping */ + struct lnet_handle_md ni_ping_mdh; spinlock_t ni_lock; @@ -373,7 +452,7 @@ typedef struct lnet_ni { int **ni_refs; /* when I was last alive */ - long ni_last_alive; + time64_t ni_last_alive; /* pointer to parent network */ struct 
lnet_net *ni_net; @@ -381,9 +460,12 @@ typedef struct lnet_ni { /* my health status */ struct lnet_ni_status *ni_status; - /* NI FSM */ + /* NI FSM. Protected by lnet_ni_lock() */ enum lnet_ni_state ni_state; + /* Recovery state. Protected by lnet_ni_lock() */ + __u32 ni_recovery_state; + /* per NI LND tunables */ struct lnet_lnd_tunables ni_lnd_tunables; @@ -392,6 +474,7 @@ typedef struct lnet_ni { /* NI statistics */ struct lnet_element_stats ni_stats; + struct lnet_health_local_stats ni_hstats; /* physical device CPT */ int ni_dev_cpt; @@ -399,50 +482,69 @@ typedef struct lnet_ni { /* sequence number used to round robin over nis within a net */ __u32 ni_seq; + /* + * health value + * initialized to LNET_MAX_HEALTH_VALUE + * Value is decremented every time we fail to send a message over + * this NI because of a NI specific failure. + * Value is incremented if we successfully send a message. + */ + atomic_t ni_healthv; + + /* + * Set to 1 by the LND when it receives an event telling it the device + * has gone into a fatal state. Set to 0 when the LND receives an + * even telling it the device is back online. + */ + atomic_t ni_fatal_error_on; + /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured */ - char *ni_interfaces[LNET_NUM_INTERFACES]; + char *ni_interfaces[LNET_INTERFACES_NUM]; struct net *ni_net_ns; /* original net namespace */ -} lnet_ni_t; +}; #define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL -/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x - * of old LNet, so there shouldn't be any compatibility issue */ -#define LNET_PING_FEAT_INVAL (0) /* no feature */ -#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ -#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ -#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +/* + * Descriptor of a ping info buffer: keep a separate indicator of the + * size and a reference count. The type is used both as a source and + * sink of data, so we need to keep some information outside of the + * area that may be overwritten by network data. 
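+ * pb_nnis records how many lnet_ni_status entries the buffer was sized for,
+ * and pb_refcnt is managed via lnet_ping_buffer_addref()/decref(); the
+ * buffer is freed when the last reference is dropped.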
+ */ +struct lnet_ping_buffer { + int pb_nnis; + atomic_t pb_refcnt; + struct lnet_ping_info pb_info; +}; -#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ - LNET_PING_FEAT_NI_STATUS) +#define LNET_PING_BUFFER_SIZE(NNIDS) \ + offsetof(struct lnet_ping_buffer, pb_info.pi_ni[NNIDS]) +#define LNET_PING_BUFFER_LONI(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_nid) +#define LNET_PING_BUFFER_SEQNO(PBUF) ((PBUF)->pb_info.pi_ni[0].ns_status) -typedef struct lnet_ping_info { - __u32 pi_magic; - __u32 pi_features; - lnet_pid_t pi_pid; - __u32 pi_nnis; - struct lnet_ni_status pi_ni[0]; -} WIRE_ATTR lnet_ping_info_t; +#define LNET_PING_INFO_TO_BUFFER(PINFO) \ + container_of((PINFO), struct lnet_ping_buffer, pb_info) /* router checker data, per router */ -#define LNET_MAX_RTR_NIS 16 -#define LNET_PINGINFO_SIZE offsetof(struct lnet_ping_info, pi_ni[LNET_MAX_RTR_NIS]) -typedef struct lnet_rc_data { +struct lnet_rc_data { /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ struct list_head rcd_list; struct lnet_handle_md rcd_mdh; /* ping buffer MD */ struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ - struct lnet_ping_info *rcd_pinginfo; /* ping buffer */ -} lnet_rc_data_t; + struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */ + int rcd_nnis; /* desired size of buffer */ +}; struct lnet_peer_ni { - /* chain on peer_net */ - struct list_head lpni_on_peer_net_list; + /* chain on lpn_peer_nis */ + struct list_head lpni_peer_nis; /* chain on remote peer list */ struct list_head lpni_on_remote_peer_ni_list; + /* chain on recovery queue */ + struct list_head lpni_recovery; /* chain on peer hash */ struct list_head lpni_hashlist; /* messages blocking for tx credits */ @@ -455,6 +557,7 @@ struct lnet_peer_ni { struct lnet_peer_net *lpni_peer_net; /* statistics kept on each peer NI */ struct lnet_element_stats lpni_stats; + struct lnet_health_remote_stats lpni_hstats; /* spin lock protecting credits and lpni_txq / lpni_rtrq */ spinlock_t lpni_lock; /* # tx credits available */ @@ -480,23 +583,29 @@ struct lnet_peer_ni { /* # times router went dead<->alive. 
Protected with lpni_lock */ int lpni_alive_count; /* time of last aliveness news */ - cfs_time_t lpni_timestamp; + time64_t lpni_timestamp; /* time of last ping attempt */ - cfs_time_t lpni_ping_timestamp; + time64_t lpni_ping_timestamp; /* != 0 if ping reply expected */ - cfs_time_t lpni_ping_deadline; + time64_t lpni_ping_deadline; /* when I was last alive */ - cfs_time_t lpni_last_alive; + time64_t lpni_last_alive; /* when lpni_ni was queried last time */ - cfs_time_t lpni_last_query; + time64_t lpni_last_query; /* network peer is on */ struct lnet_net *lpni_net; /* peer's NID */ lnet_nid_t lpni_nid; /* # refs */ atomic_t lpni_refcount; + /* health value for the peer */ + atomic_t lpni_healthv; + /* recovery ping mdh */ + struct lnet_handle_md lpni_recovery_ping_mdh; /* CPT this peer attached on */ int lpni_cpt; + /* state flags -- protected by lpni_lock */ + unsigned lpni_state; /* # refs from lnet_route_t::lr_gateway */ int lpni_rtr_refcount; /* sequence number used to round robin over peer nis within a net */ @@ -509,31 +618,148 @@ struct lnet_peer_ni { unsigned int lpni_ping_feats; /* routes on this peer */ struct list_head lpni_routes; - /* array of preferred local nids */ - lnet_nid_t *lpni_pref_nids; + /* preferred local nids: if only one, use lpni_pref.nid */ + union lpni_pref { + lnet_nid_t nid; + lnet_nid_t *nids; + } lpni_pref; /* number of preferred NIDs in lnpi_pref_nids */ __u32 lpni_pref_nnids; /* router checker state */ struct lnet_rc_data *lpni_rcd; }; +/* Preferred path added due to traffic on non-MR peer_ni */ +#define LNET_PEER_NI_NON_MR_PREF (1 << 0) +/* peer is being recovered. */ +#define LNET_PEER_NI_RECOVERY_PENDING (1 << 1) +/* recovery ping failed */ +#define LNET_PEER_NI_RECOVERY_FAILED (1 << 2) +/* peer is being deleted */ +#define LNET_PEER_NI_DELETING (1 << 3) + struct lnet_peer { - /* chain on global peer list */ - struct list_head lp_on_lnet_peer_list; + /* chain on pt_peer_list */ + struct list_head lp_peer_list; /* list of peer nets */ struct list_head lp_peer_nets; + /* list of messages pending discovery*/ + struct list_head lp_dc_pendq; + /* primary NID of the peer */ lnet_nid_t lp_primary_nid; - /* peer is Multi-Rail enabled peer */ - bool lp_multi_rail; + /* source NID to use during discovery */ + lnet_nid_t lp_disc_src_nid; + + /* CPT of peer_table */ + int lp_cpt; + + /* number of NIDs on this peer */ + int lp_nnis; + + /* reference count */ + atomic_t lp_refcount; + + /* lock protecting peer state flags */ + spinlock_t lp_lock; + + /* peer state flags */ + unsigned lp_state; + + /* buffer for data pushed by peer */ + struct lnet_ping_buffer *lp_data; + + /* MD handle for ping in progress */ + struct lnet_handle_md lp_ping_mdh; + + /* MD handle for push in progress */ + struct lnet_handle_md lp_push_mdh; + + /* number of NIDs for sizing push data */ + int lp_data_nnis; + + /* NI config sequence number of peer */ + __u32 lp_peer_seqno; + + /* Local NI config sequence number acked by peer */ + __u32 lp_node_seqno; + + /* Local NI config sequence number sent to peer */ + __u32 lp_node_seqno_sent; + + /* Ping error encountered during discovery. */ + int lp_ping_error; + + /* Push error encountered during discovery. */ + int lp_push_error; + + /* Error encountered during discovery. 
*/ + int lp_dc_error; + + /* time it was put on the ln_dc_working queue */ + time64_t lp_last_queued; + + /* link on discovery-related lists */ + struct list_head lp_dc_list; + + /* tasks waiting on discovery of this peer */ + wait_queue_head_t lp_dc_waitq; }; +/* + * The status flags in lp_state. Their semantics have chosen so that + * lp_state can be zero-initialized. + * + * A peer is marked MULTI_RAIL in two cases: it was configured using DLC + * as multi-rail aware, or the LNET_PING_FEAT_MULTI_RAIL bit was set. + * + * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was + * NOT set when the peer was pinged by discovery. + */ +#define LNET_PEER_MULTI_RAIL (1 << 0) /* Multi-rail aware */ +#define LNET_PEER_NO_DISCOVERY (1 << 1) /* Peer disabled discovery */ +/* + * A peer is marked CONFIGURED if it was configured by DLC. + * + * In addition, a peer is marked DISCOVERED if it has fully passed + * through Peer Discovery. + * + * When Peer Discovery is disabled, the discovery thread will mark + * peers REDISCOVER to indicate that they should be re-examined if + * discovery is (re)enabled on the node. + * + * A peer that was created as the result of inbound traffic will not + * be marked at all. + */ +#define LNET_PEER_CONFIGURED (1 << 2) /* Configured via DLC */ +#define LNET_PEER_DISCOVERED (1 << 3) /* Peer was discovered */ +#define LNET_PEER_REDISCOVER (1 << 4) /* Discovery was disabled */ +/* + * A peer is marked DISCOVERING when discovery is in progress. + * The other flags below correspond to stages of discovery. + */ +#define LNET_PEER_DISCOVERING (1 << 5) /* Discovering */ +#define LNET_PEER_DATA_PRESENT (1 << 6) /* Remote peer data present */ +#define LNET_PEER_NIDS_UPTODATE (1 << 7) /* Remote peer info uptodate */ +#define LNET_PEER_PING_SENT (1 << 8) /* Waiting for REPLY to Ping */ +#define LNET_PEER_PUSH_SENT (1 << 9) /* Waiting for ACK of Push */ +#define LNET_PEER_PING_FAILED (1 << 10) /* Ping send failure */ +#define LNET_PEER_PUSH_FAILED (1 << 11) /* Push send failure */ +/* + * A ping can be forced as a way to fix up state, or as a manual + * intervention by an admin. + * A push can be forced in circumstances that would normally not + * allow for one to happen. + */ +#define LNET_PEER_FORCE_PING (1 << 12) /* Forced Ping */ +#define LNET_PEER_FORCE_PUSH (1 << 13) /* Forced Push */ + struct lnet_peer_net { - /* chain on peer block */ - struct list_head lpn_on_peer_list; + /* chain on lp_peer_nets */ + struct list_head lpn_peer_nets; /* list of peer_nis on this network */ struct list_head lpn_peer_nis; @@ -543,19 +769,38 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; + + /* reference count */ + atomic_t lpn_refcount; }; /* peer hash size */ #define LNET_PEER_HASH_BITS 9 #define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) -/* peer hash table */ +/* + * peer hash table - one per CPT + * + * protected by lnet_net_lock/EX for update + * pt_version + * pt_number + * pt_hash[...] 
+ * pt_peer_list + * pt_peers + * protected by pt_zombie_lock: + * pt_zombie_list + * pt_zombies + * + * pt_zombie lock nests inside lnet_net_lock + */ struct lnet_peer_table { int pt_version; /* /proc validity stamp */ - atomic_t pt_number; /* # peers extant */ + int pt_number; /* # peers_ni extant */ struct list_head *pt_hash; /* NID->peer hash */ - struct list_head pt_zombie_list; /* zombie peers */ - int pt_zombies; /* # zombie peers */ + struct list_head pt_peer_list; /* peers */ + int pt_peers; /* # peers */ + struct list_head pt_zombie_list; /* zombie peer_ni */ + int pt_zombies; /* # zombie peers_ni */ spinlock_t pt_zombie_lock; /* protect list and count */ }; @@ -566,7 +811,7 @@ struct lnet_peer_table { ((lp)->lpni_net) && \ (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) -typedef struct lnet_route { +struct lnet_route { struct list_head lr_list; /* chain on net */ struct list_head lr_gwlist; /* chain on gateway */ struct lnet_peer_ni *lr_gateway; /* router node */ @@ -575,27 +820,29 @@ typedef struct lnet_route { unsigned int lr_downis; /* number of down NIs */ __u32 lr_hops; /* how far I am */ unsigned int lr_priority; /* route priority */ -} lnet_route_t; +}; #define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) #define LNET_REMOTE_NETS_HASH_MAX (1U << 16) #define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) -typedef struct lnet_remotenet { +struct lnet_remotenet { /* chain on ln_remote_nets_hash */ struct list_head lrn_list; /* routes to me */ struct list_head lrn_routes; /* my net number */ __u32 lrn_net; -} lnet_remotenet_t; +}; /** lnet message has credit and can be submitted to lnd for send/receive */ #define LNET_CREDIT_OK 0 /** lnet message is waiting for credit */ #define LNET_CREDIT_WAIT 1 +/** lnet message is waiting for discovery */ +#define LNET_DC_WAIT 2 -typedef struct lnet_rtrbufpool { +struct lnet_rtrbufpool { /* my free buffer pool */ struct list_head rbp_bufs; /* messages blocking for a buffer */ @@ -610,13 +857,13 @@ typedef struct lnet_rtrbufpool { int rbp_credits; /* low water mark */ int rbp_mincredits; -} lnet_rtrbufpool_t; +}; -typedef struct lnet_rtrbuf { +struct lnet_rtrbuf { struct list_head rb_list; /* chain on rbp_bufs */ struct lnet_rtrbufpool *rb_pool; /* owning pool */ lnet_kiov_t rb_kiov[0]; /* the buffer space */ -} lnet_rtrbuf_t; +}; #define LNET_PEER_HASHSIZE 503 /* prime! */ @@ -686,7 +933,7 @@ struct lnet_match_table { /* dispatch routed PUT message by hashing source NID for wildcard portals */ #define LNET_PTL_ROTOR_HASH_RT 3 -typedef struct lnet_portal { +struct lnet_portal { spinlock_t ptl_lock; unsigned int ptl_index; /* portal ID, reserved */ /* flags on this portal: lazy, unique... 
*/ @@ -703,7 +950,7 @@ typedef struct lnet_portal { int ptl_mt_nmaps; /* array of active entries' cpu-partition-id */ int ptl_mt_maps[0]; -} lnet_portal_t; +}; #define LNET_LH_HASH_BITS 12 #define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) @@ -724,22 +971,31 @@ struct lnet_msg_container { int msc_nfinalizers; /* msgs waiting to complete finalizing */ struct list_head msc_finalizing; + /* msgs waiting to be resent */ + struct list_head msc_resending; struct list_head msc_active; /* active message list */ /* threads doing finalization */ void **msc_finalizers; + /* threads doing resends */ + void **msc_resenders; }; +/* Peer Discovery states */ +#define LNET_DC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_DC_STATE_RUNNING 1 /* started up OK */ +#define LNET_DC_STATE_STOPPING 2 /* telling thread to stop */ + /* Router Checker states */ -#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ -#define LNET_RC_STATE_RUNNING 1 /* started up OK */ -#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ +#define LNET_MT_STATE_SHUTDOWN 0 /* not started */ +#define LNET_MT_STATE_RUNNING 1 /* started up OK */ +#define LNET_MT_STATE_STOPPING 2 /* telling thread to stop */ /* LNet states */ #define LNET_STATE_SHUTDOWN 0 /* not started */ #define LNET_STATE_RUNNING 1 /* started up OK */ #define LNET_STATE_STOPPING 2 /* telling thread to stop */ -typedef struct lnet { +struct lnet { /* CPU partition table of LNet */ struct cfs_cpt_table *ln_cpt_table; /* number of CPTs in ln_cpt_table */ @@ -770,8 +1026,6 @@ typedef struct lnet { struct lnet_msg_container **ln_msg_containers; struct lnet_counters **ln_counters; struct lnet_peer_table **ln_peer_tables; - /* list of configured or discovered peers */ - struct list_head ln_peers; /* list of peer nis not on a local network */ struct list_head ln_remote_peer_ni_list; /* failure simulation */ @@ -784,6 +1038,10 @@ typedef struct lnet { struct lnet_ni *ln_loni; /* network zombie list */ struct list_head ln_net_zombie; + /* resend messages list */ + struct list_head ln_msg_resend; + /* spin lock to protect the msg resend list */ + spinlock_t ln_msg_resend_lock; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; @@ -796,12 +1054,46 @@ typedef struct lnet { /* percpt router buffer pools */ struct lnet_rtrbufpool **ln_rtrpools; + /* + * Ping target / Push source + * + * The ping target and push source share a single buffer. The + * ln_ping_target is protected against concurrent updates by + * ln_api_mutex. + */ struct lnet_handle_md ln_ping_target_md; struct lnet_handle_eq ln_ping_target_eq; - struct lnet_ping_info *ln_ping_info; + struct lnet_ping_buffer *ln_ping_target; + atomic_t ln_ping_target_seqno; - /* router checker startup/shutdown state */ - int ln_rc_state; + /* + * Push Target + * + * ln_push_nnis contains the desired size of the push target. + * The lnet_net_lock is used to handle update races. The old + * buffer may linger a while after it has been unlinked, in + * which case the event handler cleans up. 
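+ * lnet_push_target_resize_needed() reports when the current buffer holds
+ * fewer NIs than ln_push_target_nnis asks for; see lnet_push_target_resize()
+ * and lnet_peer_push_event() in lib-lnet.h.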
+ */ + struct lnet_handle_eq ln_push_target_eq; + struct lnet_handle_md ln_push_target_md; + struct lnet_ping_buffer *ln_push_target; + int ln_push_target_nnis; + + /* discovery event queue handle */ + struct lnet_handle_eq ln_dc_eqh; + /* discovery requests */ + struct list_head ln_dc_request; + /* discovery working list */ + struct list_head ln_dc_working; + /* discovery expired list */ + struct list_head ln_dc_expired; + /* discovery thread wait queue */ + wait_queue_head_t ln_dc_waitq; + /* discovery startup/shutdown state */ + int ln_dc_state; + + /* monitor thread startup/shutdown state */ + int ln_mt_state; /* router checker's event queue */ struct lnet_handle_eq ln_rc_eqh; /* rcd still pending on net */ @@ -809,7 +1101,7 @@ typedef struct lnet { /* rcd ready for free */ struct list_head ln_rcd_zombie; /* serialise startup/shutdown */ - struct semaphore ln_rc_signal; + struct semaphore ln_mt_signal; struct mutex ln_api_mutex; struct mutex ln_lnd_mutex; @@ -837,10 +1129,36 @@ typedef struct lnet { */ bool ln_nis_from_mod_params; - /* waitq for router checker. As long as there are no routes in - * the list, the router checker will sleep on this queue. when - * routes are added the thread will wake up */ - wait_queue_head_t ln_rc_waitq; -} lnet_t; + /* + * waitq for the monitor thread. The monitor thread takes care of + * checking routes, timedout messages and resending messages. + */ + wait_queue_head_t ln_mt_waitq; + + /* per-cpt resend queues */ + struct list_head **ln_mt_resendqs; + /* local NIs to recover */ + struct list_head ln_mt_localNIRecovq; + /* local NIs to recover */ + struct list_head ln_mt_peerNIRecovq; + /* + * An array of queues for GET/PUT waiting for REPLY/ACK respectively. + * There are CPT number of queues. Since response trackers will be + * added on the fast path we can't afford to grab the exclusive + * net lock to protect these queues. The CPT will be calculated + * based on the mdh cookie. + */ + struct list_head **ln_mt_rstq; + /* + * A response tracker becomes a zombie when the associated MD is queued + * for unlink before the response tracker is detached from the MD. An + * entry on a zombie list can be freed when either the remaining + * operations on the MD complete or when LNet has shut down. + */ + struct list_head **ln_mt_zombie_rstqs; + /* recovery eq handler */ + struct lnet_handle_eq ln_mt_eqh; + +}; #endif diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h index 843d35c06105a..e2c19f2a4ed35 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/include/lnet/socklnd.h @@ -28,22 +28,12 @@ * Lustre is a trademark of Sun Microsystems, Inc. 
* * lnet/include/lnet/socklnd.h - * - * #defines shared between socknal implementation and utilities */ #ifndef __LNET_LNET_SOCKLND_H__ #define __LNET_LNET_SOCKLND_H__ -#include - -#define SOCKLND_CONN_NONE (-1) -#define SOCKLND_CONN_ANY 0 -#define SOCKLND_CONN_CONTROL 1 -#define SOCKLND_CONN_BULK_IN 2 -#define SOCKLND_CONN_BULK_OUT 3 -#define SOCKLND_CONN_NTYPES 4 - -#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN +#include +#include struct ksock_hello_msg { __u32 kshm_magic; /* magic number of socklnd message */ diff --git a/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h new file mode 100644 index 0000000000000..2672fe7ae103d --- /dev/null +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_debug.h @@ -0,0 +1,151 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2014, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __UAPI_LIBCFS_DEBUG_H__ +#define __UAPI_LIBCFS_DEBUG_H__ + +#include + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + /* time_t overflow in 2106 */ + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +#define S_LIBCFS 0x00004000 +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +#define S_LFSCK 0x00100000 +#define S_SNAPSHOT 0x00200000 +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ + +#define LIBCFS_DEBUG_SUBSYS_NAMES { \ + "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ + "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ + "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ + "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ + "fid", "fld", NULL } + +/* Debugging masks (32 bits, non-overlapping) */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +#define D_HSM 0x20000000 +#define D_SNAPSHOT 0x40000000 /* snapshot */ +#define D_LAYOUT 0x80000000 + +#define LIBCFS_DEBUG_MASKS_NAMES { \ + "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ + "ioctl", "neterror", "net", "warning", "buffs", "other", \ + "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ + "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ + "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ + NULL } + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h similarity index 88% rename from drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h index 6b79096f761a0..cdac10f572408 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/libcfs_ioctl.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/libcfs_ioctl.h @@ -23,21 +23,19 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * libcfs/include/libcfs/libcfs_ioctl.h - * * Low-level ioctl data structures. Kernel ioctl functions declared here, * and user space functions are in libcfs/util/ioctl.h. 
* */ -#ifndef __LIBCFS_IOCTL_H__ -#define __LIBCFS_IOCTL_H__ +#ifndef __UAPI_LIBCFS_IOCTL_H__ +#define __UAPI_LIBCFS_IOCTL_H__ #include #include @@ -77,8 +75,7 @@ struct libcfs_ioctl_data { char ioc_bulk[0]; }; -struct libcfs_debug_ioctl_data -{ +struct libcfs_debug_ioctl_data { struct libcfs_ioctl_hdr hdr; unsigned int subs; unsigned int debug; @@ -105,7 +102,7 @@ struct libcfs_debug_ioctl_data #define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) -/* IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) */ +#define IOC_LIBCFS_PING_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) /* lnd ioctls */ @@ -116,7 +113,7 @@ struct libcfs_debug_ioctl_data #define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) -/* ioctl 77 is free for use */ +#define IOC_LIBCFS_DISCOVER _IOWR('e', 77, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) @@ -148,8 +145,13 @@ struct libcfs_debug_ioctl_data #define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE) #define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE) #define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_MAX_NR 99 +#define IOC_LIBCFS_GET_PEER_LIST _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE) +#define IOC_LIBCFS_MAX_NR 104 extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data); -#endif /* __LIBCFS_IOCTL_H__ */ +#endif /* __UAPI_LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h similarity index 76% rename from drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h index 4141f7c492c22..f10cbc3309176 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lib-dlc.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-dlc.h @@ -20,21 +20,32 @@ * */ /* - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * Author: Amir Shehata */ -#ifndef LNET_DLC_H -#define LNET_DLC_H +#ifndef __UAPI_LNET_DLC_H_ +#define __UAPI_LNET_DLC_H_ -#include -#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. 
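+ * The #ifdef __KERNEL__ block below therefore selects different include
+ * paths for kernel and userspace builds of this header.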
+ */ +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif #define MAX_NUM_SHOW_ENTRIES 32 #define LNET_MAX_STR_LEN 128 #define LNET_MAX_SHOW_NUM_CPT 128 +#define LNET_MAX_SHOW_NUM_NID 128 #define LNET_UNDEFINED_HOPS ((__u32) -1) /* @@ -81,7 +92,7 @@ struct lnet_ioctl_config_lnd_tunables { }; struct lnet_ioctl_net_config { - char ni_interfaces[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; + char ni_interfaces[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; __u32 ni_status; __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; char cfg_bulk[0]; @@ -111,8 +122,8 @@ struct lnet_ioctl_ping_data { __u32 ping_count; __u32 ping_flags; bool mr_info; - lnet_process_id_t ping_id; - lnet_process_id_t __user *ping_buf; + struct lnet_process_id ping_id; + struct lnet_process_id __user *ping_buf; }; struct lnet_ioctl_config_data { @@ -163,6 +174,31 @@ struct lnet_ioctl_element_stats { __u32 iel_drop_count; }; +enum lnet_health_type { + LNET_HEALTH_TYPE_LOCAL_NI = 0, + LNET_HEALTH_TYPE_PEER_NI, +}; + +struct lnet_ioctl_local_ni_hstats { + struct libcfs_ioctl_hdr hlni_hdr; + lnet_nid_t hlni_nid; + __u32 hlni_local_interrupt; + __u32 hlni_local_dropped; + __u32 hlni_local_aborted; + __u32 hlni_local_no_route; + __u32 hlni_local_timeout; + __u32 hlni_local_error; + __s32 hlni_health_value; +}; + +struct lnet_ioctl_peer_ni_hstats { + __u32 hlpni_remote_dropped; + __u32 hlpni_remote_timeout; + __u32 hlpni_remote_error; + __u32 hlpni_network_timeout; + __s32 hlpni_health_value; +}; + struct lnet_ioctl_element_msg_stats { struct libcfs_ioctl_hdr im_hdr; __u32 im_idx; @@ -184,7 +220,7 @@ struct lnet_ioctl_element_msg_stats { struct lnet_ioctl_config_ni { struct libcfs_ioctl_hdr lic_cfg_hdr; lnet_nid_t lic_nid; - char lic_ni_intf[LNET_NUM_INTERFACES][LNET_MAX_STR_LEN]; + char lic_ni_intf[LNET_INTERFACES_NUM][LNET_MAX_STR_LEN]; char lic_legacy_ip2nets[LNET_MAX_STR_LEN]; __u32 lic_cpts[LNET_MAX_SHOW_NUM_CPT]; __u32 lic_ncpts; @@ -230,9 +266,24 @@ struct lnet_ioctl_peer_cfg { void __user *prcfg_bulk; }; -struct lnet_ioctl_numa_range { - struct libcfs_ioctl_hdr nr_hdr; - __u32 nr_range; +struct lnet_ioctl_reset_health_cfg { + struct libcfs_ioctl_hdr rh_hdr; + enum lnet_health_type rh_type; + bool rh_all; + int rh_value; + lnet_nid_t rh_nid; +}; + +struct lnet_ioctl_recovery_list { + struct libcfs_ioctl_hdr rlst_hdr; + enum lnet_health_type rlst_type; + int rlst_num_nids; + lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID]; +}; + +struct lnet_ioctl_set_value { + struct libcfs_ioctl_hdr sv_hdr; + __u32 sv_value; }; struct lnet_ioctl_lnet_stats { @@ -240,4 +291,4 @@ struct lnet_ioctl_lnet_stats { struct lnet_counters st_cntrs; }; -#endif /* LNET_DLC_H */ +#endif /* _LNET_DLC_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/types.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h similarity index 85% rename from drivers/staging/lustrefsx/lnet/include/lnet/types.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h index e4bfe3d4951dd..1f7828c8c9c15 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/types.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnet-types.h @@ -23,15 +23,15 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. 
*/ -#ifndef __LNET_TYPES_H__ -#define __LNET_TYPES_H__ +#ifndef __UAPI_LNET_TYPES_H__ +#define __UAPI_LNET_TYPES_H__ /** \addtogroup lnet * @{ */ @@ -107,30 +107,33 @@ static inline __u32 LNET_MKNET(__u32 type, __u32 num) return (type << 16) | num; } +/** The lolnd NID (i.e. myself) */ +#define LNET_NID_LO_0 LNET_MKNID(LNET_MKNET(LOLND, 0), 0) + #define WIRE_ATTR __attribute__((packed)) /* Packed version of struct lnet_process_id to transfer via network */ -typedef struct lnet_process_id_packed { +struct lnet_process_id_packed { lnet_nid_t nid; lnet_pid_t pid; /* node id / process id */ -} WIRE_ATTR lnet_process_id_packed; +} WIRE_ATTR; /* The wire handle's interface cookie only matches one network interface in * one epoch (i.e. new cookie when the interface restarts or the node * reboots). The object cookie only matches one object on that interface * during that object's lifetime (i.e. no cookie re-use). */ -typedef struct lnet_handle_wire { +struct lnet_handle_wire { __u64 wh_interface_cookie; __u64 wh_object_cookie; -} WIRE_ATTR lnet_handle_wire_t; +} WIRE_ATTR; -typedef enum lnet_msg_type { +enum lnet_msg_type { LNET_MSG_ACK = 0, LNET_MSG_PUT, LNET_MSG_GET, LNET_MSG_REPLY, LNET_MSG_HELLO, -} lnet_msg_type_t; +}; /* The variant fields of the portals message header are aligned on an 8 * byte boundary in the message header. Note that all types used in these @@ -167,7 +170,7 @@ struct lnet_hello { __u32 type; } WIRE_ATTR; -typedef struct lnet_hdr { +struct lnet_hdr { lnet_nid_t dest_nid; lnet_nid_t src_nid; lnet_pid_t dest_pid; @@ -182,7 +185,7 @@ typedef struct lnet_hdr { struct lnet_reply reply; struct lnet_hello hello; } msg; -} WIRE_ATTR lnet_hdr_t; +} WIRE_ATTR; /* A HELLO message contains a magic number and protocol version * code in the header's dest_nid, the peer's NID in the src_nid, and @@ -193,11 +196,11 @@ typedef struct lnet_hdr { * exchange HELLO messages when a connection is first established. Individual * LNDs can put whatever else they fancy in lnet_hdr::msg. 
*/ -typedef struct lnet_magicversion { +struct lnet_magicversion { __u32 magic; /* LNET_PROTO_TCP_MAGIC */ __u16 version_major; /* increment on incompatible change */ __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR lnet_magic_version_t; +} WIRE_ATTR; /* PROTO MAGIC for LNDs */ #define LNET_PROTO_IB_MAGIC 0x0be91b91 @@ -215,39 +218,109 @@ typedef struct lnet_magicversion { #define LNET_PROTO_TCP_VERSION_MINOR 0 /* Acceptor connection request */ -typedef struct lnet_acceptor_connreq { +struct lnet_acceptor_connreq { __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ __u32 acr_version; /* protocol version */ __u64 acr_nid; /* target NID */ -} WIRE_ATTR lnet_acceptor_connreq_t; +} WIRE_ATTR; #define LNET_PROTO_ACCEPTOR_VERSION 1 -typedef struct lnet_counters { - __u32 msgs_alloc; - __u32 msgs_max; - __u32 errors; - __u32 send_count; - __u32 recv_count; - __u32 route_count; - __u32 drop_count; - __u64 send_length; - __u64 recv_length; - __u64 route_length; - __u64 drop_length; -} WIRE_ATTR lnet_counters_t; +struct lnet_counters_common { + __u32 lcc_msgs_alloc; + __u32 lcc_msgs_max; + __u32 lcc_errors; + __u32 lcc_send_count; + __u32 lcc_recv_count; + __u32 lcc_route_count; + __u32 lcc_drop_count; + __u64 lcc_send_length; + __u64 lcc_recv_length; + __u64 lcc_route_length; + __u64 lcc_drop_length; +} WIRE_ATTR; + +struct lnet_counters_health { + __u32 lch_rst_alloc; + __u32 lch_resend_count; + __u32 lch_response_timeout_count; + __u32 lch_local_interrupt_count; + __u32 lch_local_dropped_count; + __u32 lch_local_aborted_count; + __u32 lch_local_no_route_count; + __u32 lch_local_timeout_count; + __u32 lch_local_error_count; + __u32 lch_remote_dropped_count; + __u32 lch_remote_error_count; + __u32 lch_remote_timeout_count; + __u32 lch_network_timeout_count; +}; + +struct lnet_counters { + struct lnet_counters_common lct_common; + struct lnet_counters_health lct_health; +}; #define LNET_NI_STATUS_UP 0x15aac0de #define LNET_NI_STATUS_DOWN 0xdeadface #define LNET_NI_STATUS_INVALID 0x00000000 +struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR; + +/* + * NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue + */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ +#define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ + +/* + * All ping feature bits fit to hit the wire. + * In lnet_assert_wire_constants() this is compared against its open-coded + * value, and in lnet_ping_target_update() it is used to verify that no + * unknown bits have been set. + * New feature bits can be added, just be aware that this does change the + * over-the-wire protocol. 
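+ * LNET_PING_FEAT_MULTI_RAIL and LNET_PING_FEAT_DISCOVERY are the bits that
+ * peer discovery maps to the LNET_PEER_MULTI_RAIL and LNET_PEER_NO_DISCOVERY
+ * state flags described in lib-types.h.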
+ */ +#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS | \ + LNET_PING_FEAT_RTE_DISABLED | \ + LNET_PING_FEAT_MULTI_RAIL | \ + LNET_PING_FEAT_DISCOVERY) + +struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[0]; +} WIRE_ATTR; + +#define LNET_PING_INFO_SIZE(NNIDS) \ + offsetof(struct lnet_ping_info, pi_ni[NNIDS]) +#define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) +#define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) + /* * This is a hard-coded limit on the number of interfaces supported by * the interface bonding implemented by the ksocknal LND. It must be * defined here because it is used in LNet data structures that are * common to all LNDs. */ -#define LNET_NUM_INTERFACES 16 +#define LNET_INTERFACES_NUM 16 + +/* The minimum number of interfaces per node supported by LNet. */ +#define LNET_INTERFACES_MIN 16 +/* The default - arbitrary - value of the lnet_max_interfaces tunable. */ +#define LNET_INTERFACES_MAX_DEFAULT 200 /** * Objects maintained by the LNet are accessed through handles. Handle types @@ -258,9 +331,9 @@ typedef struct lnet_counters { */ #define LNET_WIRE_HANDLE_COOKIE_NONE (-1) -typedef struct lnet_handle_eq { +struct lnet_handle_eq { __u64 cookie; -} lnet_handle_eq_t; +}; /** * Invalidate eq handle \a h. @@ -280,9 +353,9 @@ static inline int LNetEQHandleIsInvalid(struct lnet_handle_eq h) return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); } -typedef struct lnet_handle_md { +struct lnet_handle_md { __u64 cookie; -} lnet_handle_md_t; +}; /** * Invalidate md handle \a h. @@ -302,19 +375,19 @@ static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h) return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); } -typedef struct lnet_handle_me { +struct lnet_handle_me { __u64 cookie; -} lnet_handle_me_t; +}; /** * Global process ID. */ -typedef struct lnet_process_id { +struct lnet_process_id { /** node id */ lnet_nid_t nid; /** process id */ lnet_pid_t pid; -} lnet_process_id_t; +}; /** @} lnet_addr */ /** \addtogroup lnet_me @@ -324,10 +397,10 @@ typedef struct lnet_process_id { * Specifies whether the match entry or memory descriptor should be unlinked * automatically (LNET_UNLINK) or not (LNET_RETAIN). */ -typedef enum lnet_unlink { +enum lnet_unlink { LNET_RETAIN = 0, LNET_UNLINK -} lnet_unlink_t; +}; /** * Values of the type enum lnet_ins_pos are used to control where a new match @@ -336,14 +409,14 @@ typedef enum lnet_unlink { * LNET_INS_AFTER is used to insert the new entry after the current entry * or after the last item in the list. */ -typedef enum lnet_ins_pos { +enum lnet_ins_pos { /** insert ME before current position or head of the list */ LNET_INS_BEFORE, /** insert ME after current position or tail of the list */ LNET_INS_AFTER, /** attach ME at tail of local CPU partition ME list */ LNET_INS_LOCAL -} lnet_ins_pos; +}; /** @} lnet_me */ @@ -354,7 +427,7 @@ typedef enum lnet_ins_pos { * Defines the visible parts of a memory descriptor. Values of this type * are used to initialize memory descriptors. */ -typedef struct lnet_md { +struct lnet_md { /** * Specify the memory region associated with the memory descriptor. * If the options field has: @@ -458,7 +531,7 @@ typedef struct lnet_md { * if the LNET_MD_BULK_HANDLE option is set. */ struct lnet_handle_md bulk_handle; -} lnet_md_t; +}; /* Max Transfer Unit (minimum supported everywhere). * CAVEAT EMPTOR, with multinet (i.e. 
routers forwarding between networks) @@ -466,9 +539,6 @@ typedef struct lnet_md { #define LNET_MTU_BITS 20 #define LNET_MTU (1 << LNET_MTU_BITS) -/** limit on the number of fragments in discontiguous MDs */ -#define LNET_MAX_IOV 256 - /** * Options for the MD structure. See struct lnet_md::options. */ @@ -520,7 +590,7 @@ typedef struct { /** * Six types of events can be logged in an event queue. */ -typedef enum lnet_event_kind { +enum lnet_event_kind { /** An incoming GET operation has completed on the MD. */ LNET_EVENT_GET = 1, /** @@ -556,14 +626,14 @@ typedef enum lnet_event_kind { * \see LNetMDUnlink */ LNET_EVENT_UNLINK, -} lnet_event_kind_t; +}; #define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) /** * Information about an event on a MD. */ -typedef struct lnet_event { +struct lnet_event { /** The identifier (nid, pid) of the target. */ struct lnet_process_id target; /** The identifier (nid, pid) of the initiator. */ @@ -608,6 +678,11 @@ typedef struct lnet_event { * \see LNetPut */ __u64 hdr_data; + /** + * The message type, to ensure a handler for LNET_EVENT_SEND can + * distinguish between LNET_MSG_GET and LNET_MSG_PUT. + */ + __u32 msg_type; /** * Indicates the completion status of the operation. It's 0 for * successful operations, otherwise it's an error code. @@ -632,7 +707,7 @@ typedef struct lnet_event { * to each event. */ volatile unsigned long sequence; -} lnet_event_t; +}; /** * Event queue handler function type. @@ -659,12 +734,12 @@ typedef void (*lnet_eq_handler_t)(struct lnet_event *event); * \see struct lnet_md::options for the discussion on LNET_MD_ACK_DISABLE * by which acknowledgments can be disabled for a MD. */ -typedef enum lnet_ack_req { +enum lnet_ack_req { /** Request an acknowledgment */ LNET_ACK_REQ, /** Request that no acknowledgment should be generated. */ LNET_NOACK_REQ -} lnet_ack_req_t; +}; /** @} lnet_data */ /** @} lnet */ diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h similarity index 76% rename from drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h index 4328135c5ec72..cb4f153e377d1 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnetctl.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetctl.h @@ -17,12 +17,23 @@ * header for lnet ioctl */ /* - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ -#ifndef _LNETCTL_H_ -#define _LNETCTL_H_ +#ifndef __UAPI_LNETCTL_H_ +#define __UAPI_LNETCTL_H_ -#include +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. 
+ */ +#ifdef __KERNEL__ +# include +#else +# include +#endif + +#include /** \addtogroup lnet_fault_simulation * @{ */ @@ -43,6 +54,19 @@ enum { #define LNET_GET_BIT (1 << 2) #define LNET_REPLY_BIT (1 << 3) +#define HSTATUS_END 11 +#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) +#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) +#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) +#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) +#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) +#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) +#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) +#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) +#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) +#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) +#define HSTATUS_RANDOM 0xffffffff + /** ioctl parameter for LNet fault simulation */ struct lnet_fault_attr { /** @@ -80,6 +104,10 @@ struct lnet_fault_attr { * with da_rate */ __u32 da_interval; + /** error type mask */ + __u32 da_health_error_mask; + /** randomize error generation */ + bool da_random; } drop; /** message latency simulation */ struct { diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h similarity index 99% rename from drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h index 7071039d9aa38..ca871cac02b7b 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnetst.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/lnetst.h @@ -29,13 +29,13 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * lnet/include/lnet/lnetst.h - * * Author: Liang Zhen */ -#ifndef __LNET_ST_H__ -#define __LNET_ST_H__ +#ifndef __UAPI_LNET_ST_H__ +#define __UAPI_LNET_ST_H__ + +#include #define LST_FEAT_NONE (0) #define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ @@ -67,7 +67,7 @@ struct lst_sid { lnet_nid_t ses_nid; /* nid of console node */ - __u64 ses_stamp; /* time stamp */ + __s64 ses_stamp; /* time stamp in milliseconds */ }; /*** session id */ extern struct lst_sid LST_INVALID_SID; diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h similarity index 93% rename from drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h index be14a1dfcf71d..c41b9158ecd7d 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/nidstr.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/nidstr.h @@ -23,11 +23,21 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ #ifndef _LNET_NIDSTRINGS_H #define _LNET_NIDSTRINGS_H -#include + +#include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif /** * Lustre Network Driver types. 
diff --git a/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h similarity index 74% rename from drivers/staging/lustrefsx/lnet/include/lnet/lnet.h rename to drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h index 54061f593496e..6453e053fa99d 100644 --- a/drivers/staging/lustrefsx/lnet/include/lnet/lnet.h +++ b/drivers/staging/lustrefsx/lnet/include/uapi/linux/lnet/socklnd.h @@ -22,25 +22,23 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. - * - * Copyright (c) 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LNET_H__ -#define __LNET_H__ - -/* - * lnet.h * - * User application interface file + * #defines shared between socknal implementation and utilities */ +#ifndef __UAPI_LNET_SOCKLND_H__ +#define __UAPI_LNET_SOCKLND_H__ + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 -#include -#include -#include +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN #endif diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c index 90645f6388ea6..68b83585dc300 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,11 +35,13 @@ */ #include +#include + #include "o2iblnd.h" static struct lnet_lnd the_o2iblnd; -kib_data_t kiblnd_data; +struct kib_data kiblnd_data; static __u32 kiblnd_cksum (void *ptr, int nob) @@ -96,41 +98,40 @@ kiblnd_msgtype2str(int type) static int kiblnd_msgtype2size(int type) { - const int hdr_size = offsetof(kib_msg_t, ibm_u); + const int hdr_size = offsetof(struct kib_msg, ibm_u); switch (type) { case IBLND_MSG_CONNREQ: case IBLND_MSG_CONNACK: - return hdr_size + sizeof(kib_connparams_t); + return hdr_size + sizeof(struct kib_connparams); case IBLND_MSG_NOOP: return hdr_size; case IBLND_MSG_IMMEDIATE: - return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); + return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); case IBLND_MSG_PUT_REQ: - return hdr_size + sizeof(kib_putreq_msg_t); + return hdr_size + sizeof(struct kib_putreq_msg); case IBLND_MSG_PUT_ACK: - return hdr_size + sizeof(kib_putack_msg_t); + return hdr_size + sizeof(struct kib_putack_msg); case IBLND_MSG_GET_REQ: - return hdr_size + sizeof(kib_get_msg_t); + return hdr_size + sizeof(struct kib_get_msg); case IBLND_MSG_PUT_NAK: case IBLND_MSG_PUT_DONE: case IBLND_MSG_GET_DONE: - return hdr_size + sizeof(kib_completion_msg_t); + return hdr_size + sizeof(struct kib_completion_msg); default: return -1; } } -static int -kiblnd_unpack_rd(kib_msg_t *msg, int flip) +static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) { - kib_rdma_desc_t *rd; + struct kib_rdma_desc *rd; int nob; int n; int i; @@ -155,7 +156,7 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip) return 1; } - nob = offsetof (kib_msg_t, ibm_u) + + nob = offsetof(struct kib_msg, ibm_u) + kiblnd_rd_msg_size(rd, msg->ibm_type, n); if (msg->ibm_nob < nob) { @@ -175,11 +176,10 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip) return 0; } -void -kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp) +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) { - kib_net_t *net = ni->ni_data; + struct kib_net *net = ni->ni_data; /* CAVEAT EMPTOR! all message fields not set here should have been * initialised previously. 
*/ @@ -200,10 +200,9 @@ kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, } } -int -kiblnd_unpack_msg(kib_msg_t *msg, int nob) +int kiblnd_unpack_msg(struct kib_msg *msg, int nob) { - const int hdr_size = offsetof(kib_msg_t, ibm_u); + const int hdr_size = offsetof(struct kib_msg, ibm_u); __u32 msg_cksum; __u16 version; int msg_nob; @@ -313,12 +312,13 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob) } int -kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) +kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, + lnet_nid_t nid) { - kib_peer_ni_t *peer_ni; - kib_net_t *net = ni->ni_data; - int cpt = lnet_cpt_of_nid(nid, ni); - unsigned long flags; + struct kib_peer_ni *peer_ni; + struct kib_net *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid, ni); + unsigned long flags; LASSERT(net != NULL); LASSERT(nid != LNET_NID_ANY); @@ -333,7 +333,7 @@ kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) peer_ni->ibp_nid = nid; peer_ni->ibp_error = 0; peer_ni->ibp_last_alive = 0; - peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni); + peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */ @@ -356,9 +356,9 @@ kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid) } void -kiblnd_destroy_peer (kib_peer_ni_t *peer_ni) +kiblnd_destroy_peer(struct kib_peer_ni *peer_ni) { - kib_net_t *net = peer_ni->ibp_ni->ni_data; + struct kib_net *net = peer_ni->ibp_ni->ni_data; LASSERT(net != NULL); LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0); @@ -375,18 +375,18 @@ kiblnd_destroy_peer (kib_peer_ni_t *peer_ni) atomic_dec(&net->ibn_npeers); } -kib_peer_ni_t * +struct kib_peer_ni * kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) { /* the caller is responsible for accounting the additional reference * that this creates */ struct list_head *peer_list = kiblnd_nid2peerlist(nid); struct list_head *tmp; - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; list_for_each(tmp, peer_list) { - peer_ni = list_entry(tmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(tmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); /* @@ -409,7 +409,7 @@ kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid) } void -kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni) +kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni) { LASSERT(list_empty(&peer_ni->ibp_conns)); @@ -423,7 +423,7 @@ static int kiblnd_get_peer_info(struct lnet_ni *ni, int index, lnet_nid_t *nidp, int *count) { - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; struct list_head *ptmp; int i; unsigned long flags; @@ -434,7 +434,7 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -457,17 +457,17 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, } static void -kiblnd_del_peer_locked (kib_peer_ni_t *peer_ni) +kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) { - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + struct kib_conn *conn; if (list_empty(&peer_ni->ibp_conns)) { kiblnd_unlink_peer_locked(peer_ni); } else { list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = 
list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); kiblnd_close_conn_locked(conn, 0); } @@ -483,7 +483,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *ptmp; struct list_head *pnxt; - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; int lo; int hi; int i; @@ -501,7 +501,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -524,17 +524,17 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - kiblnd_txlist_done(&zombies, -EIO); + kiblnd_txlist_done(&zombies, -EIO, LNET_MSG_STATUS_LOCAL_ERROR); return rc; } -static kib_conn_t * +static struct kib_conn * kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) { - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; struct list_head *ptmp; - kib_conn_t *conn; + struct kib_conn *conn; struct list_head *ctmp; int i; unsigned long flags; @@ -544,7 +544,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -554,7 +554,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) if (index-- > 0) continue; - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); kiblnd_conn_addref(conn); read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -568,7 +568,7 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) } static void -kiblnd_debug_rx (kib_rx_t *rx) +kiblnd_debug_rx(struct kib_rx *rx) { CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", rx, rx->rx_status, rx->rx_msg->ibm_type, @@ -576,19 +576,19 @@ kiblnd_debug_rx (kib_rx_t *rx) } static void -kiblnd_debug_tx (kib_tx_t *tx) +kiblnd_debug_tx(struct kib_tx *tx) { - CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld " "cookie %#llx msg %s%s type %x cred %d\n", tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, - tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie, tx->tx_lntmsg[0] == NULL ? "-" : "!", tx->tx_lntmsg[1] == NULL ? 
"-" : "!", tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); } void -kiblnd_debug_conn (kib_conn_t *conn) +kiblnd_debug_conn(struct kib_conn *conn) { struct list_head *tmp; int i; @@ -606,27 +606,27 @@ kiblnd_debug_conn (kib_conn_t *conn) CDEBUG(D_CONSOLE, " early_rxs:\n"); list_for_each(tmp, &conn->ibc_early_rxs) - kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list)); CDEBUG(D_CONSOLE, " tx_noops:\n"); list_for_each(tmp, &conn->ibc_tx_noops) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); list_for_each(tmp, &conn->ibc_tx_queue_nocred) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " tx_queue:\n"); list_for_each(tmp, &conn->ibc_tx_queue) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " active_txs:\n"); list_for_each(tmp, &conn->ibc_active_txs) - kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list)); CDEBUG(D_CONSOLE, " rxs:\n"); for (i = 0; i < IBLND_RX_MSGS(conn); i++) @@ -672,7 +672,7 @@ kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) } static int -kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) { cpumask_t *mask; int vectors; @@ -734,15 +734,32 @@ static unsigned int kiblnd_send_wrs(struct kib_conn *conn) * One WR for the LNet message * And ibc_max_frags for the transfer WRs */ - unsigned int ret = 1 + conn->ibc_max_frags; + int ret; + int multiplier = 1 + conn->ibc_max_frags; + enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps; + + /* FastReg needs two extra WRs for map and invalidate */ + if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) + multiplier += 2; /* account for a maximum of ibc_queue_depth in-flight transfers */ - ret *= conn->ibc_queue_depth; - return ret; + ret = multiplier * conn->ibc_queue_depth; + + if (ret > conn->ibc_hdev->ibh_max_qp_wr) { + CDEBUG(D_NET, "peer_credits %u will result in send work " + "request size %d larger than maximum %d device " + "can handle\n", conn->ibc_queue_depth, ret, + conn->ibc_hdev->ibh_max_qp_wr); + conn->ibc_queue_depth = + conn->ibc_hdev->ibh_max_qp_wr / multiplier; + } + + /* don't go beyond the maximum the device can handle */ + return min(ret, conn->ibc_hdev->ibh_max_qp_wr); } -kib_conn_t * -kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, +struct kib_conn * +kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, int state, int version) { /* CAVEAT EMPTOR: @@ -753,14 +770,14 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, * to destroy 'cmid' here since I'm called from the CM which still has * its ref on 'cmid'). 
*/ rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_net_t *net = peer_ni->ibp_ni->ni_data; - kib_dev_t *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; struct ib_qp_init_attr *init_qp_attr; struct kib_sched_info *sched; #ifdef HAVE_IB_CQ_INIT_ATTR struct ib_cq_init_attr cq_attr = {}; #endif - kib_conn_t *conn; + struct kib_conn *conn; struct ib_cq *cq; unsigned long flags; int cpt; @@ -815,6 +832,7 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD(&conn->ibc_active_txs); + INIT_LIST_HEAD(&conn->ibc_zombie_txs); spin_lock_init(&conn->ibc_lock); LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, @@ -853,7 +871,7 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, write_unlock_irqrestore(glock, flags); LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, - IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed_2; @@ -879,6 +897,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, kiblnd_get_completion_vector(conn, cpt)); #endif if (IS_ERR(cq)) { + /* + * on MLX-5 (possibly MLX-4 as well) this error could be + * hit if the concurrent_sends and/or peer_tx_credits is set + * too high. Or due to an MLX-5 bug which tries to + * allocate 256kb via kmalloc for WR cookie array + */ CERROR("Failed to create CQ with %d CQEs: %ld\n", IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); goto failed_2; @@ -900,20 +924,14 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, init_qp_attr->qp_type = IB_QPT_RC; init_qp_attr->send_cq = cq; init_qp_attr->recv_cq = cq; + /* + * kiblnd_send_wrs() can change the connection's queue depth if + * the maximum work requests for the device is maxed out + */ + init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - conn->ibc_sched = sched; - - do { - init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - - rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); - if (!rc || conn->ibc_queue_depth < 2) - break; - - conn->ibc_queue_depth--; - } while (rc); - + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); if (rc) { CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " "send_sge: %d, recv_sge: %d\n", @@ -924,6 +942,8 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, goto failed_2; } + conn->ibc_sched = sched; + if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth) CWARN("peer %s - queue depth reduced from %u to %u" " to allow for qp creation\n", @@ -976,7 +996,8 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, return conn; failed_2: - kiblnd_destroy_conn(conn, true); + kiblnd_destroy_conn(conn); + LIBCFS_FREE(conn, sizeof(*conn)); failed_1: LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); failed_0: @@ -984,10 +1005,10 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, } void -kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) +kiblnd_destroy_conn(struct kib_conn *conn) { struct rdma_cm_id *cmid = conn->ibc_cmid; - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_peer_ni *peer_ni = conn->ibc_peer; LASSERT (!in_interrupt()); LASSERT (atomic_read(&conn->ibc_refcount) == 0); @@ -1021,12 +1042,15 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool 
free_conn) if (conn->ibc_cq) ib_destroy_cq(conn->ibc_cq); + kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED, + LNET_MSG_STATUS_OK); + if (conn->ibc_rx_pages != NULL) kiblnd_unmap_rx_descs(conn); if (conn->ibc_rxs != NULL) { LIBCFS_FREE(conn->ibc_rxs, - IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); + IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); } if (conn->ibc_connvars != NULL) @@ -1037,27 +1061,24 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) /* See CAVEAT EMPTOR above in kiblnd_create_conn */ if (conn->ibc_state != IBLND_CONN_INIT) { - kib_net_t *net = peer_ni->ibp_ni->ni_data; + struct kib_net *net = peer_ni->ibp_ni->ni_data; kiblnd_peer_decref(peer_ni); rdma_destroy_id(cmid); atomic_dec(&net->ibn_nconns); } - - if (free_conn) - LIBCFS_FREE(conn, sizeof(*conn)); } int -kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) +kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) { - kib_conn_t *conn; + struct kib_conn *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); CDEBUG(D_NET, "Closing conn -> %s, " "version: %x, reason: %d\n", @@ -1072,16 +1093,16 @@ kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) } int -kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, +kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, int version, __u64 incarnation) { - kib_conn_t *conn; + struct kib_conn *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); if (conn->ibc_version == version && conn->ibc_incarnation == incarnation) @@ -1103,7 +1124,7 @@ kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) { - kib_peer_ni_t *peer_ni; + struct kib_peer_ni *peer_ni; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -1124,7 +1145,7 @@ kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); LASSERT(!kiblnd_peer_idle(peer_ni)); if (peer_ni->ibp_ni != ni) @@ -1169,7 +1190,7 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) break; } case IOC_LIBCFS_GET_CONN: { - kib_conn_t *conn; + struct kib_conn *conn; rc = 0; conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); @@ -1201,13 +1222,13 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) } static void -kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) +kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) { - cfs_time_t last_alive = 0; - cfs_time_t now = cfs_time_current(); - rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_peer_ni_t *peer_ni; - unsigned long flags; + time64_t last_alive = 0; + time64_t now = ktime_get_seconds(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni; + unsigned long flags; read_lock_irqsave(glock, flags); @@ -1225,14 +1246,14 @@ kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) if (peer_ni == NULL) kiblnd_launch_tx(ni, NULL, nid); - CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago\n", + CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n", 
libcfs_nid2str(nid), peer_ni, - last_alive ? cfs_duration_sec(now - last_alive) : -1); + last_alive ? now - last_alive : -1); return; } static void -kiblnd_free_pages(kib_pages_t *p) +kiblnd_free_pages(struct kib_pages *p) { int npages = p->ibp_npages; int i; @@ -1242,23 +1263,23 @@ kiblnd_free_pages(kib_pages_t *p) __free_page(p->ibp_pages[i]); } - LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); } int -kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) +kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) { - kib_pages_t *p; - int i; + struct kib_pages *p; + int i; LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, - offsetof(kib_pages_t, ibp_pages[npages])); + offsetof(struct kib_pages, ibp_pages[npages])); if (p == NULL) { CERROR("Can't allocate descriptor for %d pages\n", npages); return -ENOMEM; } - memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); p->ibp_npages = npages; for (i = 0; i < npages; i++) { @@ -1276,9 +1297,9 @@ kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) } void -kiblnd_unmap_rx_descs(kib_conn_t *conn) +kiblnd_unmap_rx_descs(struct kib_conn *conn) { - kib_rx_t *rx; + struct kib_rx *rx; int i; LASSERT (conn->ibc_rxs != NULL); @@ -1301,9 +1322,9 @@ kiblnd_unmap_rx_descs(kib_conn_t *conn) } void -kiblnd_map_rx_descs(kib_conn_t *conn) +kiblnd_map_rx_descs(struct kib_conn *conn) { - kib_rx_t *rx; + struct kib_rx *rx; struct page *pg; int pg_off; int ipg; @@ -1314,7 +1335,7 @@ kiblnd_map_rx_descs(kib_conn_t *conn) rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, @@ -1340,11 +1361,11 @@ kiblnd_map_rx_descs(kib_conn_t *conn) } static void -kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) +kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) { - kib_hca_dev_t *hdev = tpo->tpo_hdev; - kib_tx_t *tx; - int i; + struct kib_hca_dev *hdev = tpo->tpo_hdev; + struct kib_tx *tx; + int i; LASSERT (tpo->tpo_pool.po_allocated == 0); @@ -1363,10 +1384,10 @@ kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) tpo->tpo_hdev = NULL; } -static kib_hca_dev_t * -kiblnd_current_hdev(kib_dev_t *dev) +static struct kib_hca_dev * +kiblnd_current_hdev(struct kib_dev *dev) { - kib_hca_dev_t *hdev; + struct kib_hca_dev *hdev; unsigned long flags; int i = 0; @@ -1391,14 +1412,14 @@ kiblnd_current_hdev(kib_dev_t *dev) } static void -kiblnd_map_tx_pool(kib_tx_pool_t *tpo) -{ - kib_pages_t *txpgs = tpo->tpo_tx_pages; - kib_pool_t *pool = &tpo->tpo_pool; - kib_net_t *net = pool->po_owner->ps_net; - kib_dev_t *dev; - struct page *page; - kib_tx_t *tx; +kiblnd_map_tx_pool(struct kib_tx_pool *tpo) +{ + struct kib_pages *txpgs = tpo->tpo_tx_pages; + struct kib_pool *pool = &tpo->tpo_pool; + struct kib_net *net = pool->po_owner->ps_net; + struct kib_dev *dev; + struct page *page; + struct kib_tx *tx; int page_offset; int ipage; int i; @@ -1419,8 +1440,8 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo) page = txpgs->ibp_pages[ipage]; tx = &tpo->tpo_tx_descs[i]; - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); + tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + + page_offset); tx->tx_msgaddr = kiblnd_dma_map_single(tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, @@ -1443,39 +1464,14 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo) } } -#ifdef HAVE_IB_GET_DMA_MR -struct 
ib_mr * -kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, - int negotiated_nfrags) -{ - kib_net_t *net = ni->ni_data; - kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int mod; - __u16 nfrags; - - tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; - mod = tunables->lnd_map_on_demand; - nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod; - - LASSERT(hdev->ibh_mrs != NULL); - - if (mod > 0 && nfrags <= rd->rd_nfrags) - return NULL; - - return hdev->ibh_mrs; -} -#endif - static void -kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) +kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) { LASSERT(fpo->fpo_map_count == 0); #ifdef HAVE_FMR_POOL_API - if (fpo->fpo_is_fmr) { - if (fpo->fmr.fpo_fmr_pool) - ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); + if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) { + ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); } else #endif /* HAVE_FMR_POOL_API */ { @@ -1506,7 +1502,7 @@ kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo) static void kiblnd_destroy_fmr_pool_list(struct list_head *head) { - kib_fmr_pool_t *fpo, *tmp; + struct kib_fmr_pool *fpo, *tmp; list_for_each_entry_safe(fpo, tmp, head, fpo_list) { list_del(&fpo->fpo_list); @@ -1533,10 +1529,11 @@ kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, } #ifdef HAVE_FMR_POOL_API -static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) +static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo) { struct ib_fmr_pool_param param = { - .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .max_pages_per_fmr = LNET_MAX_IOV, .page_shift = PAGE_SHIFT, .access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE), @@ -1556,16 +1553,23 @@ static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) else CERROR("FMRs are not supported\n"); } + fpo->fpo_is_fmr = true; return rc; } #endif /* HAVE_FMR_POOL_API */ -static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) +static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool *fpo, + enum kib_dev_caps dev_caps) { struct kib_fast_reg_descriptor *frd, *tmp; int i, rc; +#ifdef HAVE_FMR_POOL_API + fpo->fpo_is_fmr = false; +#endif + INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); fpo->fast_reg.fpo_pool_size = 0; for (i = 0; i < fps->fps_pool_size; i++) { @@ -1580,7 +1584,7 @@ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) #ifndef HAVE_IB_MAP_MR_SG frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev, - LNET_MAX_PAYLOAD/PAGE_SIZE); + LNET_MAX_IOV); if (IS_ERR(frd->frd_frpl)) { rc = PTR_ERR(frd->frd_frpl); CERROR("Failed to allocate ib_fast_reg_page_list: %d\n", @@ -1592,11 +1596,28 @@ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) #ifdef HAVE_IB_ALLOC_FAST_REG_MR frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd, - LNET_MAX_PAYLOAD/PAGE_SIZE); + LNET_MAX_IOV); #else + /* + * it is expected to get here if this is an MLX-5 card. + * MLX-4 cards will always use FMR and MLX-5 cards will + * always use fast_reg. It turns out that some MLX-5 cards + * (possibly due to older FW versions) do not natively support + * gaps. So we will need to track them here. + */ frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, - IB_MR_TYPE_MEM_REG, - LNET_MAX_PAYLOAD/PAGE_SIZE); +#ifdef IB_MR_TYPE_SG_GAPS + ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) ? 
+ IB_MR_TYPE_SG_GAPS : + IB_MR_TYPE_MEM_REG, +#else + IB_MR_TYPE_MEM_REG, +#endif + LNET_MAX_IOV); + if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) && + (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT)) + CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n"); #endif if (IS_ERR(frd->frd_mr)) { rc = PTR_ERR(frd->frd_mr); @@ -1639,79 +1660,32 @@ static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo) return rc; } -static int -kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) +static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, + struct kib_fmr_pool **pp_fpo) { - struct ib_device_attr *dev_attr; - kib_dev_t *dev = fps->fps_net->ibn_dev; - kib_fmr_pool_t *fpo; + struct kib_dev *dev = fps->fps_net->ibn_dev; + struct kib_fmr_pool *fpo; int rc; -#ifndef HAVE_IB_DEVICE_ATTRS - dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL); - if (!dev_attr) - return -ENOMEM; -#endif - LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); if (!fpo) { - rc = -ENOMEM; - goto out_dev_attr; + return -ENOMEM; } + memset(fpo, 0, sizeof(*fpo)); fpo->fpo_hdev = kiblnd_current_hdev(dev); -#ifdef HAVE_IB_DEVICE_ATTRS - dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; -#else - rc = ib_query_device(fpo->fpo_hdev->ibh_ibdev, dev_attr); - if (rc) { - CERROR("Query device failed for %s: %d\n", - fpo->fpo_hdev->ibh_ibdev->name, rc); - goto out_dev_attr; - } -#endif - -#ifdef HAVE_FMR_POOL_API - /* Check for FMR or FastReg support */ - fpo->fpo_is_fmr = 0; -#ifdef HAVE_IB_DEVICE_OPS - if (fpo->fpo_hdev->ibh_ibdev->ops.alloc_fmr && - fpo->fpo_hdev->ibh_ibdev->ops.dealloc_fmr && - fpo->fpo_hdev->ibh_ibdev->ops.map_phys_fmr && - fpo->fpo_hdev->ibh_ibdev->ops.unmap_fmr) { -#else - if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && - fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && - fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && - fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { -#endif - LCONSOLE_INFO("Using FMR for registration\n"); - fpo->fpo_is_fmr = 1; - } else -#endif /* HAVE_FMR_POOL_API */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - LCONSOLE_INFO("Using FastReg for registration\n"); - } else { - rc = -ENOSYS; - LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); - goto out_dev_attr; - } - #ifdef HAVE_FMR_POOL_API - if (fpo->fpo_is_fmr) + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) rc = kiblnd_alloc_fmr_pool(fps, fpo); else #endif /* HAVE_FMR_POOL_API */ - rc = kiblnd_alloc_freg_pool(fps, fpo); + rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps); if (rc) goto out_fpo; -#ifndef HAVE_IB_DEVICE_ATTRS - kfree(dev_attr); -#endif - fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); - fpo->fpo_owner = fps; + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + fpo->fpo_owner = fps; *pp_fpo = fpo; return 0; @@ -1719,17 +1693,11 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) out_fpo: kiblnd_hdev_decref(fpo->fpo_hdev); LIBCFS_FREE(fpo, sizeof(*fpo)); - -out_dev_attr: -#ifndef HAVE_IB_DEVICE_ATTRS - kfree(dev_attr); -#endif - return rc; } static void -kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) +kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies) { if (fps->fps_net == NULL) /* intialized? 
*/ return; @@ -1737,8 +1705,10 @@ kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) spin_lock(&fps->fps_lock); while (!list_empty(&fps->fps_pool_list)) { - kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, - kib_fmr_pool_t, fpo_list); + struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, + struct kib_fmr_pool, + fpo_list); + fpo->fpo_failed = 1; list_del(&fpo->fpo_list); if (fpo->fpo_map_count == 0) @@ -1751,7 +1721,7 @@ kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) } static void -kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) +kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) { if (fps->fps_net != NULL) { /* initialized? */ kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); @@ -1760,14 +1730,14 @@ kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) } static int -kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, - kib_net_t *net, +kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, + struct kib_net *net, struct lnet_ioctl_config_o2iblnd_tunables *tunables) { - kib_fmr_pool_t *fpo; - int rc; + struct kib_fmr_pool *fpo; + int rc; - memset(fps, 0, sizeof(kib_fmr_poolset_t)); + memset(fps, 0, sizeof(struct kib_fmr_poolset)); fps->fps_net = net; fps->fps_cpt = cpt; @@ -1788,20 +1758,20 @@ kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, } static int -kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now) +kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, time64_t now) { if (fpo->fpo_map_count != 0) /* still in use */ return 0; if (fpo->fpo_failed) return 1; - return cfs_time_aftereq(now, fpo->fpo_deadline); + return now >= fpo->fpo_deadline; } #if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG) static int -kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) +kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) { - kib_hca_dev_t *hdev; + struct kib_hca_dev *hdev; __u64 *pages = tx->tx_pages; int npages; int size; @@ -1822,13 +1792,13 @@ kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd) #endif void -kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) +kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) { - struct list_head zombies = LIST_HEAD_INIT(zombies); - kib_fmr_pool_t *fpo = fmr->fmr_pool; - kib_fmr_poolset_t *fps; - cfs_time_t now = cfs_time_current(); - kib_fmr_pool_t *tmp; + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct kib_fmr_pool *fpo = fmr->fmr_pool; + struct kib_fmr_poolset *fps; + time64_t now = ktime_get_seconds(); + struct kib_fmr_pool *tmp; if (!fpo) return; @@ -1853,10 +1823,11 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) if (frd) { frd->frd_valid = false; + frd->frd_posted = false; + fmr->fmr_frd = NULL; spin_lock(&fps->fps_lock); list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); spin_unlock(&fps->fps_lock); - fmr->fmr_frd = NULL; } } fmr->fmr_pool = NULL; @@ -1880,11 +1851,11 @@ kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) kiblnd_destroy_fmr_pool_list(&zombies); } -int -kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, - __u32 nob, __u64 iov, kib_fmr_t *fmr, bool *is_fastreg) +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr) { - kib_fmr_pool_t *fpo; + struct kib_fmr_pool *fpo; __u64 version; bool is_rx = (rd != tx->tx_rd); #ifdef HAVE_FMR_POOL_API @@ -1898,7 +1869,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, 
spin_lock(&fps->fps_lock); version = fps->fps_version; list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { - fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; fpo->fpo_map_count++; #ifdef HAVE_FMR_POOL_API @@ -1906,7 +1877,6 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, if (fpo->fpo_is_fmr) { struct ib_pool_fmr *pfmr; - *is_fastreg = 0; spin_unlock(&fps->fps_lock); if (!tx_pages_mapped) { @@ -1928,7 +1898,6 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, } else #endif /* HAVE_FMR_POOL_API */ { - *is_fastreg = 1; if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { struct kib_fast_reg_descriptor *frd; #ifdef HAVE_IB_MAP_MR_SG @@ -1970,14 +1939,14 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, #ifdef HAVE_IB_MAP_MR_SG #ifdef HAVE_IB_MAP_MR_SG_5ARGS n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, NULL, PAGE_SIZE); + rd->rd_nfrags, NULL, PAGE_SIZE); #else n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, PAGE_SIZE); + rd->rd_nfrags, PAGE_SIZE); #endif /* HAVE_IB_MAP_MR_SG_5ARGS */ - if (unlikely(n != tx->tx_nfrags)) { + if (unlikely(n != rd->rd_nfrags)) { CERROR("Failed to map mr %d/%d " - "elements\n", n, tx->tx_nfrags); + "elements\n", n, rd->rd_nfrags); return n < 0 ? n : -EINVAL; } @@ -2024,6 +1993,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; fmr->fmr_frd = frd; fmr->fmr_pool = fpo; + frd->frd_posted = false; return 0; } spin_unlock(&fps->fps_lock); @@ -2053,7 +2023,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, } - if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) { + if (ktime_get_seconds() < fps->fps_next_retry) { /* someone failed recently */ spin_unlock(&fps->fps_lock); return -EAGAIN; @@ -2070,7 +2040,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, fps->fps_version++; list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); } else { - fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + fps->fps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; } spin_unlock(&fps->fps_lock); @@ -2078,7 +2048,7 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd, } static void -kiblnd_fini_pool(kib_pool_t *pool) +kiblnd_fini_pool(struct kib_pool *pool) { LASSERT(list_empty(&pool->po_free_list)); LASSERT(pool->po_allocated == 0); @@ -2087,24 +2057,24 @@ kiblnd_fini_pool(kib_pool_t *pool) } static void -kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) { CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); - memset(pool, 0, sizeof(kib_pool_t)); + memset(pool, 0, sizeof(struct kib_pool)); INIT_LIST_HEAD(&pool->po_free_list); - pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); - pool->po_owner = ps; - pool->po_size = size; + pool->po_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE; + pool->po_owner = ps; + pool->po_size = size; } static void kiblnd_destroy_pool_list(struct list_head *head) { - kib_pool_t *pool; + struct kib_pool *pool; while (!list_empty(head)) { - pool = list_entry(head->next, kib_pool_t, po_list); + pool = list_entry(head->next, struct kib_pool, po_list); list_del(&pool->po_list); LASSERT(pool->po_owner != NULL); @@ -2113,15 +2083,16 @@ kiblnd_destroy_pool_list(struct list_head *head) } static void 
-kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) +kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) { if (ps->ps_net == NULL) /* intialized? */ return; spin_lock(&ps->ps_lock); while (!list_empty(&ps->ps_pool_list)) { - kib_pool_t *po = list_entry(ps->ps_pool_list.next, - kib_pool_t, po_list); + struct kib_pool *po = list_entry(ps->ps_pool_list.next, + struct kib_pool, po_list); + po->po_failed = 1; list_del(&po->po_list); if (po->po_allocated == 0) @@ -2133,7 +2104,7 @@ kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) } static void -kiblnd_fini_poolset(kib_poolset_t *ps) +kiblnd_fini_poolset(struct kib_poolset *ps) { if (ps->ps_net != NULL) { /* initialized? */ kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); @@ -2142,17 +2113,17 @@ kiblnd_fini_poolset(kib_poolset_t *ps) } static int -kiblnd_init_poolset(kib_poolset_t *ps, int cpt, - kib_net_t *net, char *name, int size, +kiblnd_init_poolset(struct kib_poolset *ps, int cpt, + struct kib_net *net, char *name, int size, kib_ps_pool_create_t po_create, kib_ps_pool_destroy_t po_destroy, kib_ps_node_init_t nd_init, kib_ps_node_fini_t nd_fini) { - kib_pool_t *pool; - int rc; + struct kib_pool *pool; + int rc; - memset(ps, 0, sizeof(kib_poolset_t)); + memset(ps, 0, sizeof(struct kib_poolset)); ps->ps_cpt = cpt; ps->ps_net = net; @@ -2178,22 +2149,22 @@ kiblnd_init_poolset(kib_poolset_t *ps, int cpt, } static int -kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now) +kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now) { if (pool->po_allocated != 0) /* still in use */ return 0; if (pool->po_failed) return 1; - return cfs_time_aftereq(now, pool->po_deadline); + return now >= pool->po_deadline; } void -kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) { struct list_head zombies = LIST_HEAD_INIT(zombies); - kib_poolset_t *ps = pool->po_owner; - kib_pool_t *tmp; - cfs_time_t now = cfs_time_current(); + struct kib_poolset *ps = pool->po_owner; + struct kib_pool *tmp; + time64_t now = ktime_get_seconds(); spin_lock(&ps->ps_lock); @@ -2219,14 +2190,14 @@ kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) } struct list_head * -kiblnd_pool_alloc_node(kib_poolset_t *ps) +kiblnd_pool_alloc_node(struct kib_poolset *ps) { struct list_head *node; - kib_pool_t *pool; + struct kib_pool *pool; int rc; unsigned int interval = 1; - cfs_time_t time_before; - unsigned int trips = 0; + ktime_t time_before; + unsigned int trips = 0; again: spin_lock(&ps->ps_lock); @@ -2235,7 +2206,8 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) continue; pool->po_allocated++; - pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_deadline = ktime_get_seconds() + + IBLND_POOL_DEADLINE; node = pool->po_free_list.next; list_del(node); @@ -2265,7 +2237,7 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) goto again; } - if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) { + if (ktime_get_seconds() < ps->ps_next_retry) { /* someone failed recently */ spin_unlock(&ps->ps_lock); return NULL; @@ -2275,17 +2247,17 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) spin_unlock(&ps->ps_lock); CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); - time_before = cfs_time_current(); + time_before = ktime_get(); rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); - CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete", - cfs_time_current() - time_before); + CDEBUG(D_NET, "ps_pool_create took %lld ms to 
complete", + ktime_ms_delta(ktime_get(), time_before)); spin_lock(&ps->ps_lock); ps->ps_increasing = 0; if (rc == 0) { list_add_tail(&pool->po_list, &ps->ps_pool_list); } else { - ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + ps->ps_next_retry = ktime_get_seconds() + IBLND_POOL_RETRY; CERROR("Can't allocate new %s pool because out of memory\n", ps->ps_name); } @@ -2295,10 +2267,11 @@ kiblnd_pool_alloc_node(kib_poolset_t *ps) } static void -kiblnd_destroy_tx_pool(kib_pool_t *pool) +kiblnd_destroy_tx_pool(struct kib_pool *pool) { - kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); - int i; + struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, + tpo_pool); + int i; LASSERT (pool->po_allocated == 0); @@ -2311,7 +2284,7 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool) goto out; for (i = 0; i < pool->po_size; i++) { - kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; list_del(&tx->tx_list); @@ -2333,15 +2306,15 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool) sizeof(*tx->tx_sge)); if (tx->tx_rd != NULL) LIBCFS_FREE(tx->tx_rd, - offsetof(kib_rdma_desc_t, + offsetof(struct kib_rdma_desc, rd_frags[IBLND_MAX_RDMA_FRAGS])); } LIBCFS_FREE(tpo->tpo_tx_descs, - pool->po_size * sizeof(kib_tx_t)); + pool->po_size * sizeof(struct kib_tx)); out: kiblnd_fini_pool(pool); - LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); } static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) @@ -2356,12 +2329,12 @@ static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts) } static int -kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) +kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po) { int i; int npg; - kib_pool_t *pool; - kib_tx_pool_t *tpo; + struct kib_pool *pool; + struct kib_tx_pool *tpo; LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); if (tpo == NULL) { @@ -2377,22 +2350,22 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { CERROR("Can't allocate tx pages: %d\n", npg); - LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool)); return -ENOMEM; } LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, - size * sizeof(kib_tx_t)); + size * sizeof(struct kib_tx)); if (tpo->tpo_tx_descs == NULL) { CERROR("Can't allocate %d tx descriptors\n", size); ps->ps_pool_destroy(pool); return -ENOMEM; } - memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); for (i = 0; i < size; i++) { - kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + struct kib_tx *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; tx->tx_pool = tpo; @@ -2425,7 +2398,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) break; LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, - offsetof(kib_rdma_desc_t, + offsetof(struct kib_rdma_desc, rd_frags[IBLND_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) break; @@ -2442,23 +2415,24 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) } static void -kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) { - kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, - tps_poolset); - kib_tx_t *tx = list_entry(node, 
kib_tx_t, tx_list); + struct kib_tx_poolset *tps = container_of(pool->po_owner, + struct kib_tx_poolset, + tps_poolset); + struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); tx->tx_cookie = tps->tps_next_tx_cookie++; } static void -kiblnd_net_fini_pools(kib_net_t *net) +kiblnd_net_fini_pools(struct kib_net *net) { int i; cfs_cpt_for_each(i, lnet_cpt_table()) { - kib_tx_poolset_t *tps; - kib_fmr_poolset_t *fps; + struct kib_tx_poolset *tps; + struct kib_fmr_poolset *fps; if (net->ibn_tx_ps != NULL) { tps = net->ibn_tx_ps[i]; @@ -2483,7 +2457,7 @@ kiblnd_net_fini_pools(kib_net_t *net) } static int -kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, +kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts, int ncpts) { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2498,7 +2472,12 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, #ifdef HAVE_IB_GET_DMA_MR read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - if (tunables->lnd_map_on_demand == 0) { + /* + * if lnd_map_on_demand is zero then we have effectively disabled + * FMR or FastReg and we're using global memory regions + * exclusively. + */ + if (!tunables->lnd_map_on_demand) { read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); goto create_tx_pool; @@ -2523,7 +2502,7 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, * FMR pool and map-on-demand if premapping failed */ net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(kib_fmr_poolset_t)); + sizeof(struct kib_fmr_poolset)); if (net->ibn_fmr_ps == NULL) { CERROR("Failed to allocate FMR pool array\n"); rc = -ENOMEM; @@ -2548,7 +2527,7 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, create_tx_pool: #endif net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(kib_tx_poolset_t)); + sizeof(struct kib_tx_poolset)); if (net->ibn_tx_ps == NULL) { CERROR("Failed to allocate tx pool array\n"); rc = -ENOMEM; @@ -2578,52 +2557,87 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts, } static int -kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) +kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) { + struct ib_device_attr *dev_attr; + int rc = 0; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + #ifndef HAVE_IB_DEVICE_ATTRS - struct ib_device_attr *attr; - int rc; + LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr)); + if (dev_attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, dev_attr); + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + goto out_clean_attr; + } +#else + dev_attr = &hdev->ibh_ibdev->attrs; #endif - /* It's safe to assume a HCA can handle a page size - * matching that of the native system */ - hdev->ibh_page_shift = PAGE_SHIFT; - hdev->ibh_page_size = 1 << PAGE_SHIFT; - hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + hdev->ibh_mr_size = dev_attr->max_mr_size; + hdev->ibh_max_qp_wr = dev_attr->max_qp_wr; -#ifdef HAVE_IB_DEVICE_ATTRS - hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; + /* Setup device Memory Registration capabilities */ +#ifdef HAVE_FMR_POOL_API +#ifdef HAVE_IB_DEVICE_OPS + if (hdev->ibh_ibdev->ops.alloc_fmr && + hdev->ibh_ibdev->ops.dealloc_fmr && + hdev->ibh_ibdev->ops.map_phys_fmr && + hdev->ibh_ibdev->ops.unmap_fmr) { #else - 
LIBCFS_ALLOC(attr, sizeof(*attr)); - if (attr == NULL) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - rc = ib_query_device(hdev->ibh_ibdev, attr); - if (rc == 0) - hdev->ibh_mr_size = attr->max_mr_size; + if (hdev->ibh_ibdev->alloc_fmr && + hdev->ibh_ibdev->dealloc_fmr && + hdev->ibh_ibdev->map_phys_fmr && + hdev->ibh_ibdev->unmap_fmr) { +#endif + LCONSOLE_INFO("Using FMR for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED; + } else +#endif /* HAVE_FMR_POOL_API */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + LCONSOLE_INFO("Using FastReg for registration\n"); + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED; +#ifndef HAVE_IB_ALLOC_FAST_REG_MR +#ifdef IB_DEVICE_SG_GAPS_REG + if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT; +#endif +#endif + } else { + rc = -ENOSYS; + } - LIBCFS_FREE(attr, sizeof(*attr)); + if (rc == 0 && hdev->ibh_mr_size == ~0ULL) + hdev->ibh_mr_shift = 64; + else if (rc != 0) + rc = -EINVAL; - if (rc != 0) { - CERROR("Failed to query IB device: %d\n", rc); - return rc; - } +#ifndef HAVE_IB_DEVICE_ATTRS +out_clean_attr: + LIBCFS_FREE(dev_attr, sizeof(*dev_attr)); #endif - if (hdev->ibh_mr_size == ~0ULL) { - hdev->ibh_mr_shift = 64; - return 0; - } - - CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); - return -EINVAL; + if (rc == -ENOSYS) + CERROR("IB device does not support FMRs nor FastRegs, can't " + "register memory: %d\n", rc); + else if (rc == -EINVAL) + CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); + return rc; } #ifdef HAVE_IB_GET_DMA_MR static void -kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) +kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) { if (hdev->ibh_mrs == NULL) return; @@ -2635,7 +2649,7 @@ kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) #endif void -kiblnd_hdev_destroy(kib_hca_dev_t *hdev) +kiblnd_hdev_destroy(struct kib_hca_dev *hdev) { #ifdef HAVE_IB_GET_DMA_MR kiblnd_hdev_cleanup_mrs(hdev); @@ -2652,17 +2666,12 @@ kiblnd_hdev_destroy(kib_hca_dev_t *hdev) #ifdef HAVE_IB_GET_DMA_MR static int -kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) +kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) { struct ib_mr *mr; - int rc; int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; - rc = kiblnd_hdev_get_attr(hdev); - if (rc != 0) - return rc; - mr = ib_get_dma_mr(hdev->ibh_pd, acflags); if (IS_ERR(mr)) { CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr)); @@ -2683,7 +2692,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_dev_need_failover(kib_dev_t *dev, struct net *ns) +kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns) { struct rdma_cm_id *cmid; struct sockaddr_in srcaddr; @@ -2735,16 +2744,16 @@ kiblnd_dev_need_failover(kib_dev_t *dev, struct net *ns) } int -kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) +kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) { struct list_head zombie_tpo = LIST_HEAD_INIT(zombie_tpo); struct list_head zombie_ppo = LIST_HEAD_INIT(zombie_ppo); struct list_head zombie_fpo = LIST_HEAD_INIT(zombie_fpo); struct rdma_cm_id *cmid = NULL; - kib_hca_dev_t *hdev = NULL; - kib_hca_dev_t *old; + struct kib_hca_dev *hdev = NULL; + struct kib_hca_dev *old; struct ib_pd *pd; - kib_net_t *net; + struct kib_net *net; struct sockaddr_in addr; unsigned long flags; int rc = 0; @@ -2776,7 +2785,7 @@ kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) } cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, 
RDMA_PS_TCP, - IB_QPT_RC); + IB_QPT_RC); if (IS_ERR(cmid)) { rc = PTR_ERR(cmid); CERROR("Failed to create cmid for failover: %d\n", rc); @@ -2830,16 +2839,16 @@ kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) goto out; } -#ifdef HAVE_IB_GET_DMA_MR - rc = kiblnd_hdev_setup_mrs(hdev); + rc = kiblnd_hdev_get_attr(hdev); if (rc != 0) { - CERROR("Can't setup device: %d\n", rc); + CERROR("Can't get device attributes: %d\n", rc); goto out; } -#else - rc = kiblnd_hdev_get_attr(hdev); + +#ifdef HAVE_IB_GET_DMA_MR + rc = kiblnd_hdev_setup_mrs(hdev); if (rc != 0) { - CERROR("Can't get device attributes: %d\n", rc); + CERROR("Can't setup device: %d\n", rc); goto out; } #endif @@ -2881,9 +2890,9 @@ kiblnd_dev_failover(kib_dev_t *dev, struct net *ns) } void -kiblnd_destroy_dev (kib_dev_t *dev) +kiblnd_destroy_dev(struct kib_dev *dev) { - LASSERT (dev->ibd_nnets == 0); + LASSERT(dev->ibd_nnets == 0); LASSERT(list_empty(&dev->ibd_nets)); list_del(&dev->ibd_fail_list); @@ -2944,7 +2953,7 @@ kiblnd_base_shutdown(void) schedule_timeout(cfs_time_seconds(1)); } - /* fall through */ + fallthrough; case IBLND_INIT_NOTHING: break; @@ -2969,7 +2978,7 @@ kiblnd_base_shutdown(void) static void kiblnd_shutdown(struct lnet_ni *ni) { - kib_net_t *net = ni->ni_data; + struct kib_net *net = ni->ni_data; rwlock_t *g_lock = &kiblnd_data.kib_global_lock; int i; unsigned long flags; @@ -3015,7 +3024,7 @@ kiblnd_shutdown(struct lnet_ni *ni) list_del(&net->ibn_list); write_unlock_irqrestore(g_lock, flags); - /* fall through */ + fallthrough; case IBLND_INIT_NOTHING: LASSERT (atomic_read(&net->ibn_nconns) == 0); @@ -3175,7 +3184,8 @@ kiblnd_start_schedulers(struct kib_sched_info *sched) return rc; } -static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) +static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts, + int ncpts) { int cpt; int rc; @@ -3187,7 +3197,7 @@ static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) cpt = (cpts == NULL) ? 
i : cpts[i]; sched = kiblnd_data.kib_scheds[cpt]; - if (sched->ibs_nthreads > 0) + if (!newdev && sched->ibs_nthreads > 0) continue; rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); @@ -3200,38 +3210,80 @@ static int kiblnd_dev_start_threads(kib_dev_t *dev, __u32 *cpts, int ncpts) return 0; } +static struct kib_dev * +kiblnd_dev_search(char *ifname) +{ + struct kib_dev *alias = NULL; + struct kib_dev *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + static int kiblnd_startup(struct lnet_ni *ni) { - char *ifname; + char *ifname = NULL; struct lnet_inetdev *ifaces = NULL; - kib_dev_t *ibdev = NULL; - kib_net_t *net; - unsigned long flags; - int rc; + struct kib_dev *ibdev = NULL; + struct kib_net *net = NULL; + unsigned long flags; + int rc; int i; + bool newdev; - LASSERT (ni->ni_net->net_lnd == &the_o2iblnd); + LASSERT(ni->ni_net->net_lnd == &the_o2iblnd); - if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { rc = kiblnd_base_startup(ni->ni_net_ns); - if (rc != 0) - return rc; - } + if (rc != 0) + return rc; + } - LIBCFS_ALLOC(net, sizeof(*net)); - ni->ni_data = net; - if (net == NULL) - goto failed; + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) { + rc = -ENOMEM; + goto failed; + } net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; kiblnd_tunables_setup(ni); + /* + * ni_interfaces is only to support legacy pre Multi-Rail + * tcp bonding for ksocklnd. Multi-Rail wants each secondary + * IP to be treated as an unique 'struct ni' interfaces instead. 
+ */ if (ni->ni_interfaces[0] != NULL) { /* Use the IPoIB interface specified in 'networks=' */ if (ni->ni_interfaces[1] != NULL) { CERROR("ko2iblnd: Multiple interfaces not supported\n"); + rc = -EINVAL; goto failed; } @@ -3240,10 +3292,11 @@ kiblnd_startup(struct lnet_ni *ni) ifname = *kiblnd_tunables.kib_default_ipif; } - if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { - CERROR("IPoIB interface name too long: %s\n", ifname); - goto failed; - } + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + rc = -E2BIG; + goto failed; + } rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); if (rc < 0) @@ -3260,63 +3313,70 @@ kiblnd_startup(struct lnet_ni *ni) goto failed; } - LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); - if (!ibdev) { - rc = -ENOMEM; - goto failed; - } + ibdev = kiblnd_dev_search(ifname); + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) { + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (!ibdev) { + rc = -ENOMEM; + goto failed; + } - ibdev->ibd_ifip = ifaces[i].li_ipaddr; - strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, - sizeof(ibdev->ibd_ifname)); - ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); + ibdev->ibd_ifip = ifaces[i].li_ipaddr; + strlcpy(ibdev->ibd_ifname, ifaces[i].li_name, + sizeof(ibdev->ibd_ifname)); + ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER); - INIT_LIST_HEAD(&ibdev->ibd_nets); - INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ - INIT_LIST_HEAD(&ibdev->ibd_fail_list); + INIT_LIST_HEAD(&ibdev->ibd_nets); + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&ibdev->ibd_fail_list); - /* initialize the device */ - rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); - if (rc) { - CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); - goto failed; - } + /* initialize the device */ + rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns); + if (rc) { + CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc); + goto failed; + } - list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs); + } net->ibn_dev = ibdev; ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); ni->ni_dev_cpt = ifaces[i].li_cpt; - rc = kiblnd_dev_start_threads(ibdev, ni->ni_cpts, ni->ni_ncpts); + rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts); if (rc != 0) goto failed; rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); - if (rc != 0) { - CERROR("Failed to initialize NI pools: %d\n", rc); - goto failed; - } + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); ibdev->ibd_nnets++; list_add_tail(&net->ibn_list, &ibdev->ibd_nets); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - net->ibn_init = IBLND_INIT_ALL; + net->ibn_init = IBLND_INIT_ALL; - return 0; + return 0; failed: if (net != NULL && net->ibn_dev == NULL && ibdev != NULL) - kiblnd_destroy_dev(ibdev); + kiblnd_destroy_dev(ibdev); kfree(ifaces); - kiblnd_shutdown(ni); + kiblnd_shutdown(ni); - CDEBUG(D_NET, "kiblnd_startup failed\n"); - return -ENETDOWN; + CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n", + ifname ? 
ifname : "", rc); + + return -ENETDOWN; } static struct lnet_lnd the_o2iblnd = { @@ -3338,11 +3398,11 @@ static int __init ko2iblnd_init(void) { int rc; - CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); - CLASSERT(offsetof(kib_msg_t, + CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE); + CLASSERT(offsetof(struct kib_msg, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= IBLND_MSG_SIZE); - CLASSERT(offsetof(kib_msg_t, + CLASSERT(offsetof(struct kib_msg, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <= IBLND_MSG_SIZE); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h index 7a9a1c3de16a4..3e24405c2c31e 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -78,7 +78,6 @@ #define DEBUG_SUBSYSTEM S_LND #include -#include #include #define IBLND_PEER_HASH_SIZE 101 /* # peer_ni lists */ @@ -88,13 +87,12 @@ #define IBLND_N_SCHED 2 #define IBLND_N_SCHED_HIGH 4 -typedef struct -{ +struct kib_tunables { int *kib_dev_failover; /* HCA failover */ unsigned int *kib_service; /* IB service number */ int *kib_min_reconnect_interval; /* first failed connection retry... */ int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ - int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_cksum; /* checksum struct kib_msg? */ int *kib_timeout; /* comms timeout (seconds) */ int *kib_keepalive; /* keepalive timeout (seconds) */ int *kib_ntx; /* # tx descs */ @@ -107,32 +105,32 @@ typedef struct /* # threads on each CPT */ int *kib_nscheds; int *kib_wrq_sge; /* # sg elements per wrq */ -} kib_tunables_t; + int *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */ +}; -extern kib_tunables_t kiblnd_tunables; +extern struct kib_tunables kiblnd_tunables; #define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ #define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ #define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ -#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ +#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ /* when eagerly to return credits */ -#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \ +#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn->ibc_version) == IBLND_MSG_VERSION_1 ? \ IBLND_CREDIT_HIGHWATER_V1 : \ - t->lnd_peercredits_hiw) + min(t->lnd_peercredits_hiw, (__u32)conn->ibc_queue_depth - 1)) #ifdef HAVE_RDMA_CREATE_ID_5ARG -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(ns, cb, \ - dev, ps, \ - qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id((ns) ? 
(ns) : &init_net, cb, dev, ps, qpt) #else # ifdef HAVE_RDMA_CREATE_ID_4ARG -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ - ps, qpt) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps, qpt) # else -# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) rdma_create_id(cb, dev, \ - ps) +# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ + rdma_create_id(cb, dev, ps) # endif #endif @@ -162,7 +160,7 @@ extern kib_tunables_t kiblnd_tunables; #define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) /* 2 = LNet msg + Transfer chain */ -#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c)) struct kib_hca_dev; @@ -173,8 +171,15 @@ struct kib_hca_dev; #define KIB_IFNAME_SIZE 256 #endif -typedef struct -{ +enum kib_dev_caps { + IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0), + IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1), +#ifdef HAVE_FMR_POOL_API + IBLND_DEV_CAPS_FMR_ENABLED = BIT(2), +#endif +}; + +struct kib_dev { struct list_head ibd_list; /* chain on kib_devs */ struct list_head ibd_fail_list; /* chain on kib_failed_devs */ __u32 ibd_ifip; /* IPoIB interface IP */ @@ -182,7 +187,7 @@ typedef struct char ibd_ifname[KIB_IFNAME_SIZE]; int ibd_nnets; /* # nets extant */ - cfs_time_t ibd_next_failover; + time64_t ibd_next_failover; /* # failover failures */ int ibd_failed_failover; /* failover in progress */ @@ -191,10 +196,10 @@ typedef struct unsigned int ibd_can_failover; struct list_head ibd_nets; struct kib_hca_dev *ibd_hdev; -} kib_dev_t; + enum kib_dev_caps ibd_dev_caps; +}; -typedef struct kib_hca_dev -{ +struct kib_hca_dev { struct rdma_cm_id *ibh_cmid; /* listener cmid */ struct ib_device *ibh_ibdev; /* IB device */ int ibh_page_shift; /* page shift of current HCA */ @@ -202,24 +207,24 @@ typedef struct kib_hca_dev __u64 ibh_page_mask; /* page mask of current HCA */ int ibh_mr_shift; /* bits shift of max MR size */ __u64 ibh_mr_size; /* size of MR */ + int ibh_max_qp_wr; /* maximum work requests size */ #ifdef HAVE_IB_GET_DMA_MR struct ib_mr *ibh_mrs; /* global MR */ #endif struct ib_pd *ibh_pd; /* PD */ - kib_dev_t *ibh_dev; /* owner */ + struct kib_dev *ibh_dev; /* owner */ atomic_t ibh_ref; /* refcount */ -} kib_hca_dev_t; +}; /** # of seconds to keep pool alive */ #define IBLND_POOL_DEADLINE 300 /** # of seconds to retry if allocation failed */ #define IBLND_POOL_RETRY 1 -typedef struct -{ +struct kib_pages { int ibp_npages; /* # pages */ struct page *ibp_pages[0]; /* page array */ -} kib_pages_t; +}; struct kib_pool; struct kib_poolset; @@ -234,8 +239,7 @@ struct kib_net; #define IBLND_POOL_NAME_LEN 32 -typedef struct kib_poolset -{ +struct kib_poolset { /* serialize */ spinlock_t ps_lock; /* network it belongs to */ @@ -247,7 +251,7 @@ typedef struct kib_poolset /* failed pool list */ struct list_head ps_failed_pool_list; /* time stamp for retry if failed to allocate */ - cfs_time_t ps_next_retry; + time64_t ps_next_retry; /* is allocating new pool */ int ps_increasing; /* new pool size */ @@ -263,40 +267,38 @@ typedef struct kib_poolset kib_ps_node_init_t ps_node_init; /* finalize node */ kib_ps_node_fini_t ps_node_fini; -} kib_poolset_t; +}; -typedef struct kib_pool -{ +struct kib_pool { /* chain on pool list */ struct list_head po_list; /* pre-allocated node */ struct list_head po_free_list; /* pool_set of this pool */ - kib_poolset_t *po_owner; + struct kib_poolset *po_owner; /* deadline of this pool */ - cfs_time_t po_deadline; + time64_t 
po_deadline; /* # of elements in use */ int po_allocated; /* pool is created on failed HCA */ int po_failed; /* # of pre-allocated elements */ int po_size; -} kib_pool_t; +}; -typedef struct { - kib_poolset_t tps_poolset; /* pool-set */ +struct kib_tx_poolset { + struct kib_poolset tps_poolset; /* pool-set */ __u64 tps_next_tx_cookie; /* cookie of TX */ -} kib_tx_poolset_t; +}; -typedef struct { - kib_pool_t tpo_pool; /* pool */ +struct kib_tx_pool { + struct kib_pool tpo_pool; /* pool */ struct kib_hca_dev *tpo_hdev; /* device for this pool */ struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ - kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ -} kib_tx_pool_t; + struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ +}; -typedef struct -{ +struct kib_fmr_poolset { spinlock_t fps_lock; /* serialize */ struct kib_net *fps_net; /* IB network */ struct list_head fps_pool_list; /* FMR pool list */ @@ -309,8 +311,8 @@ typedef struct /* is allocating new pool */ int fps_increasing; /* time stamp for retry if failed to allocate */ - cfs_time_t fps_next_retry; -} kib_fmr_poolset_t; + time64_t fps_next_retry; +}; #ifndef HAVE_IB_RDMA_WR struct ib_rdma_wr { @@ -329,13 +331,13 @@ struct kib_fast_reg_descriptor { /* For fast registration */ #endif struct ib_mr *frd_mr; bool frd_valid; + bool frd_posted; }; -typedef struct -{ +struct kib_fmr_pool { struct list_head fpo_list; /* chain on pool list */ struct kib_hca_dev *fpo_hdev; /* device for this pool */ - kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ #ifdef HAVE_FMR_POOL_API union { struct { @@ -348,25 +350,24 @@ typedef struct } fast_reg; #ifdef HAVE_FMR_POOL_API }; - int fpo_is_fmr; + bool fpo_is_fmr; /* True if FMR pools allocated */ #endif - cfs_time_t fpo_deadline; /* deadline of this pool */ + time64_t fpo_deadline; /* deadline of this pool */ int fpo_failed; /* fmr pool is failed */ int fpo_map_count; /* # of mapped FMR */ -} kib_fmr_pool_t; +}; -typedef struct { - kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +struct kib_fmr { + struct kib_fmr_pool *fmr_pool; /* pool of FMR */ #ifdef HAVE_FMR_POOL_API struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ #endif /* HAVE_FMR_POOL_API */ struct kib_fast_reg_descriptor *fmr_frd; u32 fmr_key; -} kib_fmr_t; +}; -typedef struct kib_net -{ - /* chain on kib_dev_t::ibd_nets */ +struct kib_net { + /* chain on struct kib_dev::ibd_nets */ struct list_head ibn_list; __u64 ibn_incarnation;/* my epoch */ int ibn_init; /* initialisation state */ @@ -375,11 +376,11 @@ typedef struct kib_net atomic_t ibn_npeers; /* # peers extant */ atomic_t ibn_nconns; /* # connections extant */ - kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ - kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ + struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ - kib_dev_t *ibn_dev; /* underlying IB device */ -} kib_net_t; + struct kib_dev *ibn_dev; /* underlying IB device */ +}; #define KIB_THREAD_SHIFT 16 #define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) @@ -400,8 +401,7 @@ struct kib_sched_info { int ibs_cpt; /* CPT id */ }; -typedef struct -{ +struct kib_data { int kib_init; /* initialisation state */ int kib_shutdown; /* shut down? */ struct list_head kib_devs; /* IB devices extant */ @@ -430,14 +430,14 @@ typedef struct * The second that peers are pulled out from \a kib_reconn_wait * for reconnection. 
*/ - unsigned int kib_reconn_sec; + time64_t kib_reconn_sec; /* connection daemon sleeps here */ wait_queue_head_t kib_connd_waitq; spinlock_t kib_connd_lock; /* serialise */ struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ /* percpt data for schedulers */ struct kib_sched_info **kib_scheds; -} kib_data_t; +}; #define IBLND_INIT_NOTHING 0 #define IBLND_INIT_DATA 1 @@ -448,60 +448,51 @@ typedef struct * These are sent in sender's byte order (i.e. receiver flips). */ -typedef struct kib_connparams -{ +struct kib_connparams { __u16 ibcp_queue_depth; __u16 ibcp_max_frags; __u32 ibcp_max_msg_size; -} WIRE_ATTR kib_connparams_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_immediate_msg { struct lnet_hdr ibim_hdr; /* portals header */ char ibim_payload[0];/* piggy-backed payload */ -} WIRE_ATTR kib_immediate_msg_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_rdma_frag { __u32 rf_nob; /* # bytes this frag */ __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */ -} WIRE_ATTR kib_rdma_frag_t; +} WIRE_ATTR; -typedef struct -{ - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrags; /* # fragments */ - kib_rdma_frag_t rd_frags[0]; /* buffer frags */ -} WIRE_ATTR kib_rdma_desc_t; +struct kib_rdma_desc { + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + struct kib_rdma_frag rd_frags[0]; /* buffer frags */ +} WIRE_ATTR; -typedef struct -{ +struct kib_putreq_msg { struct lnet_hdr ibprm_hdr; /* portals header */ __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kib_putreq_msg_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_putack_msg { __u64 ibpam_src_cookie; /* reflected completion cookie */ __u64 ibpam_dst_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR kib_putack_msg_t; + struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR; -typedef struct -{ +struct kib_get_msg { struct lnet_hdr ibgm_hdr; /* portals header */ __u64 ibgm_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ -} WIRE_ATTR kib_get_msg_t; + struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR; -typedef struct -{ +struct kib_completion_msg { __u64 ibcm_cookie; /* opaque completion cookie */ __s32 ibcm_status; /* < 0 failure: >= 0 length */ -} WIRE_ATTR kib_completion_msg_t; +} WIRE_ATTR; -typedef struct -{ +struct kib_msg { /* First 2 fields fixed FOR ALL TIME */ __u32 ibm_magic; /* I'm an ibnal message */ __u16 ibm_version; /* this is my version number */ @@ -516,14 +507,14 @@ typedef struct __u64 ibm_dststamp; /* destination's incarnation */ union { - kib_connparams_t connparams; - kib_immediate_msg_t immediate; - kib_putreq_msg_t putreq; - kib_putack_msg_t putack; - kib_get_msg_t get; - kib_completion_msg_t completion; + struct kib_connparams connparams; + struct kib_immediate_msg immediate; + struct kib_putreq_msg putreq; + struct kib_putack_msg putack; + struct kib_get_msg get; + struct kib_completion_msg completion; } WIRE_ATTR ibm_u; -} WIRE_ATTR kib_msg_t; +} WIRE_ATTR; #define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ @@ -542,14 +533,14 @@ typedef struct #define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ #define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ -typedef struct { +struct kib_rej { __u32 ibr_magic; /* sender's magic */ __u16 ibr_version; /* sender's version */ __u8 ibr_why; /* reject reason */ __u8 ibr_padding; /* padding */ __u64 ibr_incarnation; /* incarnation of peer_ni */ - kib_connparams_t ibr_cp; /* connection parameters 
*/ -} WIRE_ATTR kib_rej_t; + struct kib_connparams ibr_cp; /* connection parameters */ +} WIRE_ATTR; /* connection rejection reasons */ #define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ @@ -567,8 +558,7 @@ typedef struct { /***********************************************************************/ -typedef struct kib_rx /* receive message */ -{ +struct kib_rx { /* receive message */ /* queue for attention */ struct list_head rx_list; /* owning conn */ @@ -578,7 +568,7 @@ typedef struct kib_rx /* receive message */ /* completion status */ enum ib_wc_status rx_status; /* message buffer (host vaddr) */ - kib_msg_t *rx_msg; + struct kib_msg *rx_msg; /* message buffer (I/O addr) */ __u64 rx_msgaddr; /* for dma_unmap_single() */ @@ -587,19 +577,18 @@ typedef struct kib_rx /* receive message */ struct ib_recv_wr rx_wrq; /* ...and its memory */ struct ib_sge rx_sge; -} kib_rx_t; +}; #define IBLND_POSTRX_DONT_POST 0 /* don't post */ #define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ #define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer_ni back 1 credit */ #define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ -typedef struct kib_tx /* transmit message */ -{ +struct kib_tx { /* transmit message */ /* queue on idle_txs ibc_tx_queue etc. */ struct list_head tx_list; /* pool I'm from */ - kib_tx_pool_t *tx_pool; + struct kib_tx_pool *tx_pool; /* owning conn */ struct kib_conn *tx_conn; /* # tx callbacks outstanding */ @@ -610,14 +599,16 @@ typedef struct kib_tx /* transmit message */ short tx_waiting; /* LNET completion status */ int tx_status; + /* health status of the transmit */ + enum lnet_msg_hstatus tx_hstatus; /* completion deadline */ - unsigned long tx_deadline; + ktime_t tx_deadline; /* completion cookie */ __u64 tx_cookie; /* lnet msgs to finalize on completion */ struct lnet_msg *tx_lntmsg[2]; /* message buffer (host vaddr) */ - kib_msg_t *tx_msg; + struct kib_msg *tx_msg; /* message buffer (I/O addr) */ __u64 tx_msgaddr; /* for dma_unmap_single() */ @@ -633,33 +624,33 @@ typedef struct kib_tx /* transmit message */ /* ...and their memory */ struct ib_sge *tx_sge; /* rdma descriptor */ - kib_rdma_desc_t *tx_rd; + struct kib_rdma_desc *tx_rd; /* # entries in... 
*/ int tx_nfrags; /* dma_map_sg descriptor */ struct scatterlist *tx_frags; /* rdma phys page addrs */ __u64 *tx_pages; + /* gaps in fragments */ + bool tx_gaps; /* FMR */ - kib_fmr_t fmr; + struct kib_fmr tx_fmr; /* dma direction */ int tx_dmadir; -} kib_tx_t; +}; -typedef struct kib_connvars -{ +struct kib_connvars { /* connection-in-progress variables */ - kib_msg_t cv_msg; -} kib_connvars_t; + struct kib_msg cv_msg; +}; -typedef struct kib_conn -{ +struct kib_conn { /* scheduler information */ struct kib_sched_info *ibc_sched; /* owning peer_ni */ - struct kib_peer *ibc_peer; + struct kib_peer_ni *ibc_peer; /* HCA bound on */ - kib_hca_dev_t *ibc_hdev; + struct kib_hca_dev *ibc_hdev; /* stash on peer_ni's conn list */ struct list_head ibc_list; /* schedule for attention */ @@ -697,7 +688,7 @@ typedef struct kib_conn /* CQ callback fired */ unsigned int ibc_ready:1; /* time of last send */ - unsigned long ibc_last_send; + ktime_t ibc_last_send; /** link chain for kiblnd_check_conns only */ struct list_head ibc_connd_list; /** rxs completed before ESTABLISHED */ @@ -712,12 +703,14 @@ typedef struct kib_conn struct list_head ibc_tx_queue_rsrvd; /* active tx awaiting completion */ struct list_head ibc_active_txs; + /* zombie tx awaiting done */ + struct list_head ibc_zombie_txs; /* serialise */ spinlock_t ibc_lock; /* the rx descs */ - kib_rx_t *ibc_rxs; + struct kib_rx *ibc_rxs; /* premapped rx msg pages */ - kib_pages_t *ibc_rx_pages; + struct kib_pages *ibc_rx_pages; /* CM id */ struct rdma_cm_id *ibc_cmid; @@ -725,8 +718,8 @@ typedef struct kib_conn struct ib_cq *ibc_cq; /* in-progress connection state */ - kib_connvars_t *ibc_connvars; -} kib_conn_t; + struct kib_connvars *ibc_connvars; +}; #define IBLND_CONN_INIT 0 /* being initialised */ #define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ @@ -735,8 +728,7 @@ typedef struct kib_conn #define IBLND_CONN_CLOSING 4 /* being closed */ #define IBLND_CONN_DISCONNECTED 5 /* disconnected */ -typedef struct kib_peer -{ +struct kib_peer_ni { /* stash on global peer_ni list */ struct list_head ibp_list; /* who's on the other end(s) */ @@ -751,8 +743,8 @@ typedef struct kib_peer struct list_head ibp_tx_queue; /* incarnation of peer_ni */ __u64 ibp_incarnation; - /* when (in jiffies) I was last alive */ - cfs_time_t ibp_last_alive; + /* when (in seconds) I was last alive */ + time64_t ibp_last_alive; /* # users */ atomic_t ibp_refcount; /* version of peer_ni */ @@ -767,13 +759,15 @@ typedef struct kib_peer unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; + /* number of total active retries */ + unsigned int ibp_retries; /* errno on closing this peer_ni */ int ibp_error; /* max map_on_demand */ __u16 ibp_max_frags; /* max_peer_credits */ __u16 ibp_queue_depth; -} kib_peer_ni_t; +}; #ifndef HAVE_IB_INC_RKEY /** @@ -788,32 +782,12 @@ static inline u32 ib_inc_rkey(u32 rkey) } #endif -extern kib_data_t kiblnd_data; +extern struct kib_data kiblnd_data; -extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); +extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); -/* max # of fragments configured by user */ -static inline int -kiblnd_cfg_rdma_frags(struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int mod; - - tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; - mod = tunables->lnd_map_on_demand; - return mod != 0 ? 
mod : IBLND_MAX_RDMA_FRAGS; -} - -static inline int -kiblnd_rdma_frags(int version, struct lnet_ni *ni) -{ - return version == IBLND_MSG_VERSION_1 ? - IBLND_MAX_RDMA_FRAGS : - kiblnd_cfg_rdma_frags(ni); -} - static inline int kiblnd_concurrent_sends(int version, struct lnet_ni *ni) { @@ -835,14 +809,14 @@ kiblnd_concurrent_sends(int version, struct lnet_ni *ni) } static inline void -kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) +kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) { LASSERT(atomic_read(&hdev->ibh_ref) > 0); atomic_inc(&hdev->ibh_ref); } static inline void -kiblnd_hdev_decref(kib_hca_dev_t *hdev) +kiblnd_hdev_decref(struct kib_hca_dev *hdev) { LASSERT(atomic_read(&hdev->ibh_ref) > 0); if (atomic_dec_and_test(&hdev->ibh_ref)) @@ -850,7 +824,7 @@ kiblnd_hdev_decref(kib_hca_dev_t *hdev) } static inline int -kiblnd_dev_can_failover(kib_dev_t *dev) +kiblnd_dev_can_failover(struct kib_dev *dev) { if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ return 0; @@ -906,7 +880,7 @@ do { \ } while (0) static inline bool -kiblnd_peer_connecting(kib_peer_ni_t *peer_ni) +kiblnd_peer_connecting(struct kib_peer_ni *peer_ni) { return peer_ni->ibp_connecting != 0 || peer_ni->ibp_reconnecting != 0 || @@ -914,7 +888,7 @@ kiblnd_peer_connecting(kib_peer_ni_t *peer_ni) } static inline bool -kiblnd_peer_idle(kib_peer_ni_t *peer_ni) +kiblnd_peer_idle(struct kib_peer_ni *peer_ni) { return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns); } @@ -929,14 +903,14 @@ kiblnd_nid2peerlist (lnet_nid_t nid) } static inline int -kiblnd_peer_active (kib_peer_ni_t *peer_ni) +kiblnd_peer_active(struct kib_peer_ni *peer_ni) { /* Am I in the peer_ni hash table? */ return !list_empty(&peer_ni->ibp_list); } static inline struct kib_conn * -kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) +kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni) { struct list_head *next; @@ -954,16 +928,17 @@ kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) } static inline int -kiblnd_send_keepalive(kib_conn_t *conn) +kiblnd_send_keepalive(struct kib_conn *conn) { + s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC; + return (*kiblnd_tunables.kib_keepalive > 0) && - cfs_time_after(jiffies, conn->ibc_last_send + - msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * - MSEC_PER_SEC)); + ktime_after(ktime_get(), + ktime_add_ns(conn->ibc_last_send, keepalive_ns)); } static inline int -kiblnd_need_noop(kib_conn_t *conn) +kiblnd_need_noop(struct kib_conn *conn) { struct lnet_ni *ni = conn->ibc_peer->ibp_ni; struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -972,7 +947,7 @@ kiblnd_need_noop(kib_conn_t *conn) tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; if (conn->ibc_outstanding_credits < - IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && + IBLND_CREDITS_HIGHWATER(tunables, conn) && !kiblnd_send_keepalive(conn)) return 0; /* No need to send NOOP */ @@ -999,14 +974,14 @@ kiblnd_need_noop(kib_conn_t *conn) } static inline void -kiblnd_abort_receives(kib_conn_t *conn) +kiblnd_abort_receives(struct kib_conn *conn) { ib_modify_qp(conn->ibc_cmid->qp, &kiblnd_data.kib_error_qpa, IB_QP_STATE); } static inline const char * -kiblnd_queue2str(kib_conn_t *conn, struct list_head *q) +kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) { if (q == &conn->ibc_tx_queue) return "tx_queue"; @@ -1057,21 +1032,21 @@ kiblnd_wreqid2type (__u64 wreqid) } static inline void -kiblnd_set_conn_state (kib_conn_t *conn, int state) +kiblnd_set_conn_state(struct kib_conn *conn, int state) { conn->ibc_state = 
state; smp_mb(); } static inline void -kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) { msg->ibm_type = type; - msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; + msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; } static inline int -kiblnd_rd_size (kib_rdma_desc_t *rd) +kiblnd_rd_size(struct kib_rdma_desc *rd) { int i; int size; @@ -1083,25 +1058,25 @@ kiblnd_rd_size (kib_rdma_desc_t *rd) } static inline __u64 -kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) +kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) { return rd->rd_frags[index].rf_addr; } static inline __u32 -kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) +kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) { return rd->rd_frags[index].rf_nob; } static inline __u32 -kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) +kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) { return rd->rd_key; } static inline int -kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) +kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) { if (nob < rd->rd_frags[index].rf_nob) { rd->rd_frags[index].rf_addr += nob; @@ -1114,14 +1089,14 @@ kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) } static inline int -kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) +kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) { LASSERT (msgtype == IBLND_MSG_GET_REQ || msgtype == IBLND_MSG_PUT_ACK); return msgtype == IBLND_MSG_GET_REQ ? - offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : - offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : + offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); } static inline __u64 @@ -1179,6 +1154,10 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, return ib_sg_dma_len(dev, sg); } +#ifndef HAVE_RDMA_CONNECT_LOCKED +#define rdma_connect_locked(cmid, cpp) rdma_connect(cmid, cpp) +#endif + /* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly * right because OFED1.2 defines it as const, to use it we have to add * (void *) cast to overcome "const" */ @@ -1186,19 +1165,16 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, #define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) #define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) -#ifdef HAVE_IB_GET_DMA_MR -struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, - int negotiated_nfrags); -#endif -void kiblnd_map_rx_descs(kib_conn_t *conn); -void kiblnd_unmap_rx_descs(kib_conn_t *conn); -void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); -struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); +void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs); +void kiblnd_map_rx_descs(struct kib_conn *conn); +void kiblnd_unmap_rx_descs(struct kib_conn *conn); +void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); -int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, - kib_rdma_desc_t *rd, __u32 nob, __u64 iov, - kib_fmr_t *fmr, bool *is_fastreg); -void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); +int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob, u64 iov, + struct kib_fmr *fmr); +void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); int kiblnd_tunables_setup(struct lnet_ni 
*ni); int kiblnd_tunables_init(void); @@ -1208,43 +1184,45 @@ int kiblnd_scheduler(void *arg); int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); int kiblnd_failover_thread (void *arg); -int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); +int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event); int kiblnd_translate_mtu(int value); -int kiblnd_dev_failover(kib_dev_t *dev, struct net *ns); -int kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, +int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns); +int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, lnet_nid_t nid); -void kiblnd_destroy_peer (kib_peer_ni_t *peer); -bool kiblnd_reconnect_peer(kib_peer_ni_t *peer); -void kiblnd_destroy_dev (kib_dev_t *dev); -void kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni); -kib_peer_ni_t *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); -int kiblnd_close_stale_conns_locked (kib_peer_ni_t *peer_ni, - int version, __u64 incarnation); -int kiblnd_close_peer_conns_locked (kib_peer_ni_t *peer_ni, int why); - -kib_conn_t *kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid, - int state, int version); -void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn); -void kiblnd_close_conn (kib_conn_t *conn, int error); -void kiblnd_close_conn_locked (kib_conn_t *conn, int error); - -void kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid); -void kiblnd_txlist_done(struct list_head *txlist, int status); +void kiblnd_destroy_peer(struct kib_peer_ni *peer); +bool kiblnd_reconnect_peer(struct kib_peer_ni *peer); +void kiblnd_destroy_dev(struct kib_dev *dev); +void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni); +struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid); +int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, + int version, u64 incarnation); +int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why); + +struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni, + struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn(struct kib_conn *conn); +void kiblnd_close_conn(struct kib_conn *conn, int error); +void kiblnd_close_conn_locked(struct kib_conn *conn, int error); + +void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); +void kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus); void kiblnd_qp_event(struct ib_event *event, void *arg); void kiblnd_cq_event(struct ib_event *event, void *arg); void kiblnd_cq_completion(struct ib_cq *cq, void *arg); -void kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version, +void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, int credits, lnet_nid_t dstnid, __u64 dststamp); -int kiblnd_unpack_msg(kib_msg_t *msg, int nob); -int kiblnd_post_rx (kib_rx_t *rx, int credit); +int kiblnd_unpack_msg(struct kib_msg *msg, int nob); +int kiblnd_post_rx(struct kib_rx *rx, int credit); -int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c 
b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c index 4b896a52d3bb4..72079ead79c4d 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,20 +38,21 @@ #define MAX_CONN_RACES_BEFORE_ABORT 20 -static void kiblnd_peer_alive(kib_peer_ni_t *peer_ni); -static void kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error); -static void kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, +static void kiblnd_peer_alive(struct kib_peer_ni *peer_ni); +static void kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error); +static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, int body_nob); -static int kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, - int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie); -static void kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn); -static void kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn); +static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie); +static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); +static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_unmap_tx(kib_tx_t *tx); -static void kiblnd_check_sends_locked(kib_conn_t *conn); +static void kiblnd_unmap_tx(struct kib_tx *tx); +static void kiblnd_check_sends_locked(struct kib_conn *conn); void -kiblnd_tx_done(kib_tx_t *tx) +kiblnd_tx_done(struct kib_tx *tx) { struct lnet_msg *lntmsg[2]; int rc; @@ -85,39 +86,46 @@ kiblnd_tx_done(kib_tx_t *tx) if (lntmsg[i] == NULL) continue; + /* propagate health status to LNet for requests */ + if (i == 0 && lntmsg[i]) + lntmsg[i]->msg_health_status = tx->tx_hstatus; + lnet_finalize(lntmsg[i], rc); } } void -kiblnd_txlist_done(struct list_head *txlist, int status) +kiblnd_txlist_done(struct list_head *txlist, int status, + enum lnet_msg_hstatus hstatus) { - kib_tx_t *tx; + struct kib_tx *tx; while (!list_empty(txlist)) { - tx = list_entry(txlist->next, kib_tx_t, tx_list); + tx = list_entry(txlist->next, struct kib_tx, tx_list); list_del(&tx->tx_list); /* complete now */ tx->tx_waiting = 0; tx->tx_status = status; + if (hstatus != LNET_MSG_STATUS_OK) + tx->tx_hstatus = hstatus; kiblnd_tx_done(tx); } } -static kib_tx_t * +static struct kib_tx * kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) { - kib_net_t *net = (kib_net_t *)ni->ni_data; - struct list_head *node; - kib_tx_t *tx; - kib_tx_poolset_t *tps; + struct kib_net *net = ni->ni_data; + struct list_head *node; + struct kib_tx *tx; + struct kib_tx_poolset *tps; tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)]; node = kiblnd_pool_alloc_node(&tps->tps_poolset); if (node == NULL) return NULL; - tx = container_of(node, kib_tx_t, tx_list); + tx = container_of(node, struct kib_tx, tx_list); LASSERT (tx->tx_nwrq == 0); LASSERT (!tx->tx_queued); @@ -129,15 +137,18 @@ kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) LASSERT (tx->tx_lntmsg[1] == NULL); LASSERT (tx->tx_nfrags == 0); + tx->tx_gaps = false; + tx->tx_hstatus = LNET_MSG_STATUS_OK; + return tx; } static void -kiblnd_drop_rx(kib_rx_t *rx) 
+kiblnd_drop_rx(struct kib_rx *rx) { - kib_conn_t *conn = rx->rx_conn; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; + struct kib_conn *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; spin_lock_irqsave(&sched->ibs_lock, flags); LASSERT(conn->ibc_nrx > 0); @@ -148,15 +159,15 @@ kiblnd_drop_rx(kib_rx_t *rx) } int -kiblnd_post_rx (kib_rx_t *rx, int credit) +kiblnd_post_rx(struct kib_rx *rx, int credit) { - kib_conn_t *conn = rx->rx_conn; - kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; - struct ib_recv_wr *bad_wrq = NULL; + struct kib_conn *conn = rx->rx_conn; + struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; #ifdef HAVE_IB_GET_DMA_MR - struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; + struct ib_mr *mr = conn->ibc_hdev->ibh_mrs; #endif - int rc; + int rc; LASSERT (net != NULL); LASSERT (!in_interrupt()); @@ -229,13 +240,13 @@ kiblnd_post_rx (kib_rx_t *rx, int credit) return rc; } -static kib_tx_t * -kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +static struct kib_tx * +kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, u64 cookie) { struct list_head *tmp; list_for_each(tmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list); LASSERT(!tx->tx_queued); LASSERT(tx->tx_sending != 0 || tx->tx_waiting); @@ -255,11 +266,11 @@ kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) } static void -kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, u64 cookie) { - kib_tx_t *tx; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int idle; + struct kib_tx *tx; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + int idle; spin_lock(&conn->ibc_lock); @@ -268,23 +279,24 @@ kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) spin_unlock(&conn->ibc_lock); CWARN("Unmatched completion type %x cookie %#llx from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_close_conn(conn, -EPROTO); - return; - } + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } - if (tx->tx_status == 0) { /* success so far */ - if (status < 0) { /* failed? */ - tx->tx_status = status; - } else if (txtype == IBLND_MSG_GET_REQ) { - lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); - } - } + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + tx->tx_status = status; + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } - tx->tx_waiting = 0; + tx->tx_waiting = 0; - idle = !tx->tx_queued && (tx->tx_sending == 0); - if (idle) + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) list_del(&tx->tx_list); spin_unlock(&conn->ibc_lock); @@ -294,10 +306,10 @@ kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) } static void -kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) +kiblnd_send_completion(struct kib_conn *conn, int type, int status, u64 cookie) { - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); if (tx == NULL) { CERROR("Can't get tx for completion %x for %s\n", @@ -307,19 +319,19 @@ kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) tx->tx_msg->ibm_u.completion.ibcm_status = status; tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); kiblnd_queue_tx(tx, conn); } static void -kiblnd_handle_rx (kib_rx_t *rx) +kiblnd_handle_rx(struct kib_rx *rx) { - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; int credits = msg->ibm_credits; - kib_tx_t *tx; + struct kib_tx *tx; int rc = 0; int rc2; int post_credit; @@ -474,14 +486,14 @@ kiblnd_handle_rx (kib_rx_t *rx) } static void -kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) +kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) { - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - kib_net_t *net = ni->ni_data; - int rc; - int err = -EIO; + struct kib_msg *msg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct lnet_ni *ni = conn->ibc_peer->ibp_ni; + struct kib_net *net = ni->ni_data; + int rc; + int err = -EIO; LASSERT (net != NULL); LASSERT (rx->rx_nob < 0); /* was posted */ @@ -545,47 +557,112 @@ kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) } static int -kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob) +kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, + struct kib_rdma_desc *rd, u32 nob) { - kib_hca_dev_t *hdev; - kib_fmr_poolset_t *fps; + struct kib_hca_dev *hdev; + struct kib_dev *dev; + struct kib_fmr_poolset *fps; int cpt; int rc; - bool is_fastreg = 0; + int i; LASSERT(tx->tx_pool != NULL); LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + dev = net->ibn_dev; hdev = tx->tx_pool->tpo_hdev; cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + /* + * If we're dealing with FastReg, but the device doesn't + * support GAPS and the tx has GAPS, then there is no real point + * in trying to map the memory, because it'll just fail. So + * preemptively fail with an appropriate message + */ + if ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) && + !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) && + tx->tx_gaps) { + CERROR("Using FastReg with no GAPS support, but tx has gaps. 
" + "Try setting use_fastreg_gaps to 1\n"); + return -EPROTONOSUPPORT; + } + +#ifdef HAVE_FMR_POOL_API + /* + * FMR does not support gaps but the tx has gaps then + * we should make sure that the number of fragments we'll be sending + * over fits within the number of fragments negotiated on the + * connection, otherwise, we won't be able to RDMA the data. + * We need to maintain the number of fragments negotiation on the + * connection for backwards compatibility. + */ + if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) { + if (tx->tx_conn && + tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) { + CERROR("TX number of frags (%d) is <= than connection" + " number of frags (%d). Consider setting peer's" + " map_on_demand to 256\n", tx->tx_nfrags, + tx->tx_conn->ibc_max_frags); + return -EFBIG; + } + } +#endif + fps = net->ibn_fmr_ps[cpt]; - rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr, &is_fastreg); + rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr); if (rc != 0) { - CERROR("Can't map %u pages: %d\n", nob, rc); + CERROR("Can't map %u bytes (%u/%u)s: %d\n", nob, + tx->tx_nfrags, rd->rd_nfrags, rc); return rc; } - /* If rd is not tx_rd, it's going to get sent to a peer_ni, who will need - * the rkey */ - rd->rd_key = tx->fmr.fmr_key; - if (!is_fastreg) - rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; - rd->rd_frags[0].rf_nob = nob; - rd->rd_nfrags = 1; + /* + * If rd is not tx_rd, it's going to get sent to a peer_ni, who will + * need the rkey + */ + rd->rd_key = tx->tx_fmr.fmr_key; + /* + * for FastReg or FMR with no gaps we can accumulate all + * the fragments in one FastReg or FMR fragment. + */ + if ( +#ifdef HAVE_FMR_POOL_API + ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + && !tx->tx_gaps) || +#endif + (dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)) { + /* FMR requires zero based address */ +#ifdef HAVE_FMR_POOL_API + if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; +#endif + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + } else { + /* + * We're transmitting with gaps using FMR. + * We'll need to use multiple fragments and identify the + * zero based address of each fragment. + */ + for (i = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift; + } + } return 0; } static void -kiblnd_unmap_tx(kib_tx_t *tx) +kiblnd_unmap_tx(struct kib_tx *tx) { if ( #ifdef HAVE_FMR_POOL_API - tx->fmr.fmr_pfmr || + tx->tx_fmr.fmr_pfmr || #endif - tx->fmr.fmr_frd) - kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); + tx->tx_fmr.fmr_frd) + kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); if (tx->tx_nfrags != 0) { kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, @@ -594,13 +671,46 @@ kiblnd_unmap_tx(kib_tx_t *tx) } } -static int -kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) +#ifdef HAVE_IB_GET_DMA_MR +static struct ib_mr * +kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) { - kib_net_t *net = ni->ni_data; - kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; + + tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + + /* + * if map-on-demand is turned on and the device supports + * either FMR or FastReg then use that. Otherwise use global + * memory regions. 
If that's not available either, then you're + * dead in the water and fail the operation. + */ + if (tunables->lnd_map_on_demand && + (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED +#ifdef HAVE_FMR_POOL_API + || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED +#endif + )) + return NULL; + + /* + * hdev->ibh_mrs can be NULL. This case is dealt with gracefully + * in the call chain. The mapping will fail with appropriate error + * message. + */ + return hdev->ibh_mrs; +} +#endif + +static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nfrags) +{ + struct kib_net *net = ni->ni_data; + struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; #ifdef HAVE_IB_GET_DMA_MR - struct ib_mr *mr = NULL; + struct ib_mr *mr = NULL; #endif __u32 nob; int i; @@ -622,9 +732,7 @@ kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) } #ifdef HAVE_IB_GET_DMA_MR - mr = kiblnd_find_rd_dma_mr(ni, rd, - (tx->tx_conn != NULL) ? - tx->tx_conn->ibc_max_frags : -1); + mr = kiblnd_find_rd_dma_mr(ni, rd); if (mr != NULL) { /* found pre-mapping MR */ rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; @@ -638,17 +746,17 @@ kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) return -EINVAL; } - -static int -kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, - unsigned int niov, struct kvec *iov, int offset, int nob) +static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, unsigned int niov, + struct kvec *iov, int offset, int nob) { - kib_net_t *net = ni->ni_data; - struct page *page; + struct kib_net *net = ni->ni_data; + struct page *page; struct scatterlist *sg; unsigned long vaddr; int fragnob; int page_offset; + unsigned int max_niov; LASSERT (nob > 0); LASSERT (niov > 0); @@ -661,6 +769,8 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, LASSERT (niov > 0); } + max_niov = niov; + sg = tx->tx_frags; do { LASSERT(niov > 0); @@ -676,6 +786,20 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, fragnob = min((int)(iov->iov_len - offset), nob); fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. 
+ */ + if ((fragnob < (int)PAGE_SIZE - page_offset) && + (niov < max_niov) && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d iovs with %d nob left\n", + fragnob, (int)PAGE_SIZE - page_offset, niov, + nob); + tx->tx_gaps = true; + } + sg_set_page(sg, page, fragnob, page_offset); sg = sg_next(sg); if (!sg) { @@ -696,32 +820,49 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); } -static int -kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) +static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, + struct kib_rdma_desc *rd, int nkiov, + lnet_kiov_t *kiov, int offset, int nob) { - kib_net_t *net = ni->ni_data; - struct scatterlist *sg; - int fragnob; + struct kib_net *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + int max_nkiov; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + max_nkiov = nkiov; - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (net != NULL); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } + sg = tx->tx_frags; + do { + LASSERT(nkiov > 0); - sg = tx->tx_frags; - do { - LASSERT (nkiov > 0); + fragnob = min((int)(kiov->kiov_len - offset), nob); - fragnob = min((int)(kiov->kiov_len - offset), nob); + /* + * We're allowed to start at a non-aligned page offset in + * the first fragment and end at a non-aligned page offset + * in the last fragment. + */ + if ((fragnob < (int)(kiov->kiov_len - offset)) && + nkiov < max_nkiov && nob > fragnob) { + CDEBUG(D_NET, "fragnob %d < available page %d: with" + " remaining %d kiovs with %d nob left\n", + fragnob, (int)(kiov->kiov_len - offset), + nkiov, nob); + tx->tx_gaps = true; + } sg_set_page(sg, kiov->kiov_page, fragnob, kiov->kiov_offset + offset); @@ -731,22 +872,23 @@ kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, return -EFAULT; } - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); } static int -kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit) +kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) __must_hold(&conn->ibc_lock) { - kib_msg_t *msg = tx->tx_msg; - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_msg *msg = tx->tx_msg; + struct kib_peer_ni *peer_ni = conn->ibc_peer; struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; int ver = conn->ibc_version; int rc; int done; @@ -764,11 +906,11 @@ __must_hold(&conn->ibc_lock) if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) { - /* tx completions outstanding... */ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(peer_ni->ibp_nid)); - return -EAGAIN; - } + /* tx completions outstanding... 
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer_ni->ibp_nid)); + return -EAGAIN; + } if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ CDEBUG(D_NET, "%s: no credits\n", @@ -796,6 +938,7 @@ __must_hold(&conn->ibc_lock) * kiblnd_check_sends_locked will queue NOOP again when * posted NOOPs complete */ spin_unlock(&conn->ibc_lock); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); spin_lock(&conn->ibc_lock); CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", @@ -830,11 +973,10 @@ __must_hold(&conn->ibc_lock) /* close_conn will launch failover */ rc = -ENETDOWN; } else { - struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd; struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; struct ib_send_wr *wr = &tx->tx_wrq[0].wr; - if (frd != NULL) { + if (frd != NULL && !frd->frd_posted) { if (!frd->frd_valid) { wr = &frd->frd_inv_wr.wr; wr->next = &frd->frd_fastreg_wr.wr; @@ -850,18 +992,24 @@ __must_hold(&conn->ibc_lock) libcfs_nid2str(conn->ibc_peer->ibp_nid)); bad = NULL; + if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) + rc = -EINVAL; + else #ifdef HAVE_IB_POST_SEND_RECV_CONST - rc = ib_post_send(conn->ibc_cmid->qp, wr, - (const struct ib_send_wr **)&bad); + rc = ib_post_send(conn->ibc_cmid->qp, wr, + (const struct ib_send_wr **)&bad); #else - rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); #endif } - conn->ibc_last_send = jiffies; + conn->ibc_last_send = ktime_get(); - if (rc == 0) - return 0; + if (rc == 0) { + if (frd != NULL) + frd->frd_posted = true; + return 0; + } /* NB credits are transferred in the actual * message, which can only be the last work item */ @@ -899,11 +1047,11 @@ __must_hold(&conn->ibc_lock) } static void -kiblnd_check_sends_locked(kib_conn_t *conn) +kiblnd_check_sends_locked(struct kib_conn *conn) { - int ver = conn->ibc_version; + int ver = conn->ibc_version; struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - kib_tx_t *tx; + struct kib_tx *tx; /* Don't send anything until after the connection is established */ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { @@ -921,7 +1069,7 @@ kiblnd_check_sends_locked(kib_conn_t *conn) while (conn->ibc_reserved_credits > 0 && !list_empty(&conn->ibc_tx_queue_rsrvd)) { tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); conn->ibc_reserved_credits--; @@ -945,16 +1093,16 @@ kiblnd_check_sends_locked(kib_conn_t *conn) if (!list_empty(&conn->ibc_tx_queue_nocred)) { credit = 0; tx = list_entry(conn->ibc_tx_queue_nocred.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); } else if (!list_empty(&conn->ibc_tx_noops)) { LASSERT (!IBLND_OOB_CAPABLE(ver)); credit = 1; tx = list_entry(conn->ibc_tx_noops.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); } else if (!list_empty(&conn->ibc_tx_queue)) { credit = 1; tx = list_entry(conn->ibc_tx_queue.next, - kib_tx_t, tx_list); + struct kib_tx, tx_list); } else break; @@ -964,26 +1112,30 @@ kiblnd_check_sends_locked(kib_conn_t *conn) } static void -kiblnd_tx_complete (kib_tx_t *tx, int status) +kiblnd_tx_complete(struct kib_tx *tx, int status) { - int failed = (status != IB_WC_SUCCESS); - kib_conn_t *conn = tx->tx_conn; - int idle; + int failed = (status != IB_WC_SUCCESS); + struct kib_conn *conn = tx->tx_conn; + int idle; - LASSERT (tx->tx_sending > 0); + if (tx->tx_sending <= 0) { + CERROR("Received an event on a freed tx: %p status %d\n", + tx, tx->tx_status); + 
return; + } - if (failed) { - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) CNETERR("Tx -> %s cookie %#llx" - " sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, - status); + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); - kiblnd_close_conn(conn, -EIO); - } else { - kiblnd_peer_alive(conn->ibc_peer); - } + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } spin_lock(&conn->ibc_lock); @@ -996,6 +1148,7 @@ kiblnd_tx_complete (kib_tx_t *tx, int status) conn->ibc_noops_posted--; if (failed) { + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; tx->tx_waiting = 0; /* don't wait for peer_ni */ tx->tx_status = -EIO; } @@ -1014,12 +1167,13 @@ kiblnd_tx_complete (kib_tx_t *tx, int status) } static void -kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob) +kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, + int body_nob) { - kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; struct ib_sge *sge = &tx->tx_msgsge; struct ib_rdma_wr *wrq; - int nob = offsetof(kib_msg_t, ibm_u) + body_nob; + int nob = offsetof(struct kib_msg, ibm_u) + body_nob; #ifdef HAVE_IB_GET_DMA_MR struct ib_mr *mr = hdev->ibh_mrs; #endif @@ -1055,11 +1209,11 @@ kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob) } static int -kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, - int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) +kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, + int resid, struct kib_rdma_desc *dstrd, u64 dstcookie) { - kib_msg_t *ibmsg = tx->tx_msg; - kib_rdma_desc_t *srcrd = tx->tx_rd; + struct kib_msg *ibmsg = tx->tx_msg; + struct kib_rdma_desc *srcrd = tx->tx_rd; struct ib_rdma_wr *wrq = NULL; struct ib_sge *sge; int rc = resid; @@ -1147,24 +1301,39 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, ibmsg->ibm_u.completion.ibcm_status = rc; ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, - type, sizeof (kib_completion_msg_t)); + type, sizeof(struct kib_completion_msg)); return rc; } static void -kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) +kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) { struct list_head *q; + s64 timeout_ns; LASSERT(tx->tx_nwrq > 0); /* work items set up */ LASSERT(!tx->tx_queued); /* not queued for sending already */ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) { + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + if (tx->tx_conn != NULL) { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT(tx->tx_conn == conn); + LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + tx->tx_conn = NULL; + kiblnd_conn_decref(conn); + } + list_add(&tx->tx_list, &conn->ibc_zombie_txs); + + return; + } + + timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC; tx->tx_queued = 1; - tx->tx_deadline = jiffies + - msecs_to_jiffies(*kiblnd_tunables.kib_timeout * - MSEC_PER_SEC); + tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); if (tx->tx_conn == NULL) { kiblnd_conn_addref(conn); @@ -1208,7 +1377,7 @@ kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn) } static void -kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +kiblnd_queue_tx(struct 
kib_tx *tx, struct kib_conn *conn) { spin_lock(&conn->ibc_lock); kiblnd_queue_tx_locked(tx, conn); @@ -1254,14 +1423,14 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, } static void -kiblnd_connect_peer (kib_peer_ni_t *peer_ni) +kiblnd_connect_peer(struct kib_peer_ni *peer_ni) { struct rdma_cm_id *cmid; - kib_dev_t *dev; - kib_net_t *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; struct sockaddr_in srcaddr; struct sockaddr_in dstaddr; - int rc; + int rc; LASSERT (net != NULL); LASSERT (peer_ni->ibp_connecting > 0); @@ -1289,21 +1458,21 @@ kiblnd_connect_peer (kib_peer_ni_t *peer_ni) kiblnd_peer_addref(peer_ni); /* cmid's ref */ - if (*kiblnd_tunables.kib_use_priv_port) { - rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } else { - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } - if (rc != 0) { - /* Can't initiate address resolution: */ - CERROR("Can't resolve addr for %s: %d\n", - libcfs_nid2str(peer_ni->ibp_nid), rc); - goto failed2; - } + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + lnet_get_lnd_timeout() * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + lnet_get_lnd_timeout() * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer_ni->ibp_nid), rc); + goto failed2; + } return; @@ -1317,7 +1486,7 @@ kiblnd_connect_peer (kib_peer_ni_t *peer_ni) } bool -kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) +kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni) { rwlock_t *glock = &kiblnd_data.kib_global_lock; char *reason = NULL; @@ -1363,17 +1532,18 @@ kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) CWARN("Abort reconnection of %s: %s\n", libcfs_nid2str(peer_ni->ibp_nid), reason); - kiblnd_txlist_done(&txs, -ECONNABORTED); + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ABORTED); return false; } void -kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) +kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) { - kib_peer_ni_t *peer_ni; - kib_peer_ni_t *peer2; - kib_conn_t *conn; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; unsigned long flags; int rc; int i; @@ -1438,6 +1608,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) if (tx != NULL) { tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); } return; @@ -1475,7 +1646,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0); if (tx != NULL) list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); @@ -1503,9 +1674,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; - kib_msg_t *ibmsg; - kib_rdma_desc_t *rd; - kib_tx_t *tx; + struct kib_msg *ibmsg; + struct 
kib_rdma_desc *rd; + struct kib_tx *tx; int nob; int rc; @@ -1536,7 +1707,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) break; /* send IMMEDIATE */ /* is the REPLY message too small for RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ @@ -1562,11 +1733,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (rc != 0) { CERROR("Can't setup GET sink for %s: %d\n", libcfs_nid2str(target.nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); return -EIO; } - nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[rd->rd_nfrags]); + nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; ibmsg->ibm_u.get.ibgm_hdr = *hdr; @@ -1588,7 +1760,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) case LNET_MSG_REPLY: case LNET_MSG_PUT: /* Is the payload small enough not to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ @@ -1618,7 +1790,8 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) ibmsg = tx->tx_msg; ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, + sizeof(struct kib_putreq_msg)); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ @@ -1626,10 +1799,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) return 0; } - /* send IMMEDIATE */ - - LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBLND_MSG_SIZE); + /* send IMMEDIATE */ + LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); tx = kiblnd_get_idle_tx(ni, target.nid); if (tx == NULL) { @@ -1643,16 +1815,16 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (payload_kiov != NULL) lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), payload_niov, payload_kiov, payload_offset, payload_nob); else lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), payload_niov, payload_iov, payload_offset, payload_nob); - nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ @@ -1661,7 +1833,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) } static void -kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) +kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) { struct lnet_process_id target = lntmsg->msg_target; unsigned int niov = lntmsg->msg_niov; @@ -1669,7 +1841,7 @@ kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) lnet_kiov_t *kiov = lntmsg->msg_kiov; unsigned int offset = lntmsg->msg_offset; 
unsigned int nob = lntmsg->msg_len; - kib_tx_t *tx; + struct kib_tx *tx; int rc; tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); @@ -1716,9 +1888,11 @@ kiblnd_reply(struct lnet_ni *ni, kib_rx_t *rx, struct lnet_msg *lntmsg) kiblnd_queue_tx(tx, rx->rx_conn); return; - failed_1: + +failed_1: + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); - failed_0: +failed_0: lnet_finalize(lntmsg, -EIO); } @@ -1727,10 +1901,10 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - kib_tx_t *tx; + struct kib_rx *rx = private; + struct kib_msg *rxmsg = rx->rx_msg; + struct kib_conn *conn = rx->rx_conn; + struct kib_tx *tx; __u64 ibprm_cookie; int nob; int post_credit = IBLND_POSTRX_PEER_CREDIT; @@ -1746,7 +1920,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, LBUG(); case IBLND_MSG_IMMEDIATE: - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); if (nob > rx->rx_nob) { CERROR ("Immediate message from %s too big: %d(%d)\n", libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), @@ -1758,19 +1932,19 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, if (kiov != NULL) lnet_copy_flat2kiov(niov, kiov, offset, IBLND_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), mlen); else lnet_copy_flat2iov(niov, iov, offset, IBLND_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), mlen); lnet_finalize(lntmsg, 0); break; case IBLND_MSG_PUT_REQ: { - kib_msg_t *txmsg; - kib_rdma_desc_t *rd; + struct kib_msg *txmsg; + struct kib_rdma_desc *rd; ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; if (mlen == 0) { @@ -1800,6 +1974,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, if (rc != 0) { CERROR("Can't setup PUT sink for %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; kiblnd_tx_done(tx); /* tell peer_ni it's over */ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, @@ -1807,7 +1982,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, break; } - nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[rd->rd_nfrags]); + nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); txmsg->ibm_u.putack.ibpam_src_cookie = ibprm_cookie; txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; @@ -1858,18 +2033,18 @@ kiblnd_thread_fini (void) } static void -kiblnd_peer_alive (kib_peer_ni_t *peer_ni) +kiblnd_peer_alive(struct kib_peer_ni *peer_ni) { - /* This is racy, but everyone's only writing cfs_time_current() */ - peer_ni->ibp_last_alive = cfs_time_current(); + /* This is racy, but everyone's only writing ktime_get_seconds() */ + peer_ni->ibp_last_alive = ktime_get_seconds(); smp_mb(); } static void -kiblnd_peer_notify (kib_peer_ni_t *peer_ni) +kiblnd_peer_notify(struct kib_peer_ni *peer_ni) { int error = 0; - cfs_time_t last_alive = 0; + time64_t last_alive = 0; unsigned long flags; read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -1889,7 +2064,7 @@ kiblnd_peer_notify (kib_peer_ni_t *peer_ni) } void -kiblnd_close_conn_locked (kib_conn_t *conn, int 
error) +kiblnd_close_conn_locked(struct kib_conn *conn, int error) { /* This just does the immediate housekeeping. 'error' is zero for a * normal shutdown which can happen only after the connection has been @@ -1897,9 +2072,9 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) * connection to be finished off by the connd. Otherwise the connd is * already dealing with it (either to set it up or tear it down). * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_ni_t *peer_ni = conn->ibc_peer; - kib_dev_t *dev; - unsigned long flags; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_dev *dev; + unsigned long flags; LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -1929,7 +2104,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); } - dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev; + dev = ((struct kib_net *)peer_ni->ibp_ni->ni_data)->ibn_dev; if (peer_ni->ibp_next_conn == conn) /* clear next_conn so it won't be used */ peer_ni->ibp_next_conn = NULL; @@ -1962,7 +2137,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) } void -kiblnd_close_conn(kib_conn_t *conn, int error) +kiblnd_close_conn(struct kib_conn *conn, int error) { unsigned long flags; @@ -1974,10 +2149,10 @@ kiblnd_close_conn(kib_conn_t *conn, int error) } static void -kiblnd_handle_early_rxs(kib_conn_t *conn) +kiblnd_handle_early_rxs(struct kib_conn *conn) { - unsigned long flags; - kib_rx_t *rx; + unsigned long flags; + struct kib_rx *rx; LASSERT(!in_interrupt()); LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -1985,7 +2160,7 @@ kiblnd_handle_early_rxs(kib_conn_t *conn) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); while (!list_empty(&conn->ibc_early_rxs)) { rx = list_entry(conn->ibc_early_rxs.next, - kib_rx_t, rx_list); + struct kib_rx, rx_list); list_del(&rx->rx_list); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -1996,30 +2171,52 @@ kiblnd_handle_early_rxs(kib_conn_t *conn) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); } -static void -kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +void +kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) { struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *tmp; struct list_head *nxt; - kib_tx_t *tx; + struct kib_tx *tx; spin_lock(&conn->ibc_lock); list_for_each_safe(tmp, nxt, txs) { - tx = list_entry(tmp, kib_tx_t, tx_list); + tx = list_entry(tmp, struct kib_tx, tx_list); if (txs == &conn->ibc_active_txs) { LASSERT(!tx->tx_queued); LASSERT(tx->tx_waiting || tx->tx_sending != 0); + if (conn->ibc_comms_error == -ETIMEDOUT) { + if (tx->tx_waiting && !tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_REMOTE_TIMEOUT; + else if (tx->tx_sending) + tx->tx_hstatus = + LNET_MSG_STATUS_NETWORK_TIMEOUT; + } } else { LASSERT(tx->tx_queued); + if (conn->ibc_comms_error == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; } tx->tx_status = -ECONNABORTED; tx->tx_waiting = 0; + /* + * TODO: This makes an assumption that + * kiblnd_tx_complete() will be called for each tx. If + * that event is dropped we could end up with stale + * connections floating around. We'd like to deal with + * that in a better way. + * + * Also that means we can exceed the timeout by many + * seconds. 
+ */ if (tx->tx_sending == 0) { tx->tx_queued = 0; list_del(&tx->tx_list); @@ -2029,22 +2226,28 @@ kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) spin_unlock(&conn->ibc_lock); - kiblnd_txlist_done(&zombies, -ECONNABORTED); + /* + * aborting transmits occurs when finalizing the connection. + * The connection is finalized on error. + * Passing LNET_MSG_STATUS_OK to txlist_done() will not + * override the value already set in tx->tx_hstatus above. + */ + kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); } static void -kiblnd_finalise_conn (kib_conn_t *conn) +kiblnd_finalise_conn(struct kib_conn *conn) { LASSERT (!in_interrupt()); LASSERT (conn->ibc_state > IBLND_CONN_INIT); - kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); - /* abort_receives moves QP state to IB_QPS_ERR. This is only required * for connections that didn't get as far as being connected, because * rdma_disconnect() does this for free. */ kiblnd_abort_receives(conn); + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + /* Complete all tx descs not waiting for sends to complete. * NB we should be safe from RDMA now that the QP has changed state */ @@ -2058,7 +2261,8 @@ kiblnd_finalise_conn (kib_conn_t *conn) } static void -kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) +kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, + int error) { struct list_head zombies = LIST_HEAD_INIT(zombies); unsigned long flags; @@ -2086,8 +2290,7 @@ kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) peer_ni->ibp_reconnected = 0; if (list_empty(&peer_ni->ibp_conns)) { /* Take peer_ni's blocked transmits to complete with error */ - list_add(&zombies, &peer_ni->ibp_tx_queue); - list_del_init(&peer_ni->ibp_tx_queue); + list_splice_init(&peer_ni->ibp_tx_queue, &zombies); if (kiblnd_peer_active(peer_ni)) kiblnd_unlink_peer_locked(peer_ni); @@ -2108,14 +2311,15 @@ kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error) CNETERR("Deleting messages for %s: connection failed\n", libcfs_nid2str(peer_ni->ibp_nid)); - kiblnd_txlist_done(&zombies, -EHOSTUNREACH); + kiblnd_txlist_done(&zombies, error, + LNET_MSG_STATUS_LOCAL_DROPPED); } static void -kiblnd_connreq_done(kib_conn_t *conn, int status) +kiblnd_connreq_done(struct kib_conn *conn, int status) { - kib_peer_ni_t *peer_ni = conn->ibc_peer; - kib_tx_t *tx; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct kib_tx *tx; struct list_head txs; unsigned long flags; int active; @@ -2132,20 +2336,23 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && peer_ni->ibp_accepting > 0)); - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status != 0) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer_ni, active, status); - kiblnd_finalise_conn(conn); - return; - } + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } - /* connection established */ + /* connection established */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - conn->ibc_last_send = jiffies; + /* reset retry count */ + peer_ni->ibp_retries = 0; + + conn->ibc_last_send = ktime_get(); kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); kiblnd_peer_alive(peer_ni); @@ -2183,7 +2390,8 @@ 
kiblnd_connreq_done(kib_conn_t *conn, int status) kiblnd_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - kiblnd_txlist_done(&txs, -ECONNABORTED); + kiblnd_txlist_done(&txs, -ECONNABORTED, + LNET_MSG_STATUS_LOCAL_ERROR); return; } @@ -2203,7 +2411,7 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) */ spin_lock(&conn->ibc_lock); while (!list_empty(&txs)) { - tx = list_entry(txs.next, kib_tx_t, tx_list); + tx = list_entry(txs.next, struct kib_tx, tx_list); list_del(&tx->tx_list); kiblnd_queue_tx_locked(tx, conn); @@ -2217,7 +2425,7 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) } static void -kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) +kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) { int rc; @@ -2235,17 +2443,17 @@ static int kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) { rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - kib_msg_t *reqmsg = priv; - kib_msg_t *ackmsg; - kib_dev_t *ibdev; - kib_peer_ni_t *peer_ni; - kib_peer_ni_t *peer2; - kib_conn_t *conn; - struct lnet_ni *ni = NULL; - kib_net_t *net = NULL; + struct kib_msg *reqmsg = priv; + struct kib_msg *ackmsg; + struct kib_dev *ibdev; + struct kib_peer_ni *peer_ni; + struct kib_peer_ni *peer2; + struct kib_conn *conn; + struct lnet_ni *ni = NULL; + struct kib_net *net = NULL; lnet_nid_t nid; struct rdma_conn_param cp; - kib_rej_t rej; + struct kib_rej rej; int version = IBLND_MSG_VERSION; unsigned long flags; int rc; @@ -2253,8 +2461,8 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) LASSERT (!in_interrupt()); /* cmid inherits 'context' from the corresponding listener id */ - ibdev = (kib_dev_t *)cmid->context; - LASSERT (ibdev != NULL); + ibdev = cmid->context; + LASSERT(ibdev); memset(&rej, 0, sizeof(rej)); rej.ibr_magic = IBLND_MSG_MAGIC; @@ -2270,7 +2478,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + if (priv_nob < offsetof(struct kib_msg, ibm_type)) { CERROR("Short connection request\n"); goto failed; } @@ -2303,7 +2511,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) ni = lnet_nid2ni_addref(reqmsg->ibm_dstnid); if (ni != NULL) { - net = (kib_net_t *)ni->ni_data; + net = (struct kib_net *)ni->ni_data; rej.ibr_incarnation = net->ibn_incarnation; } @@ -2352,26 +2560,26 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } if (reqmsg->ibm_u.connparams.ibcp_max_frags > - kiblnd_rdma_frags(version, ni)) { + IBLND_MAX_RDMA_FRAGS) { CWARN("Can't accept conn from %s (version %x): " "max_frags %d too large (%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, - kiblnd_rdma_frags(version, ni)); + IBLND_MAX_RDMA_FRAGS); if (version >= IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; goto failed; } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < - kiblnd_rdma_frags(version, ni) && + IBLND_MAX_RDMA_FRAGS && net->ibn_fmr_ps == NULL) { CWARN("Can't accept conn from %s (version %x): " "max_frags %d incompatible without FMR pool " "(%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, - kiblnd_rdma_frags(version, ni)); + IBLND_MAX_RDMA_FRAGS); if (version == IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; @@ -2545,7 +2753,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (ni != NULL) { rej.ibr_cp.ibcp_queue_depth = 
kiblnd_msg_queue_size(version, ni); - rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); + rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; lnet_ni_decref(ni); } @@ -2556,11 +2764,11 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } static void -kiblnd_check_reconnect(kib_conn_t *conn, int version, - __u64 incarnation, int why, kib_connparams_t *cp) +kiblnd_check_reconnect(struct kib_conn *conn, int version, + u64 incarnation, int why, struct kib_connparams *cp) { rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_peer_ni *peer_ni = conn->ibc_peer; char *reason; int msg_size = IBLND_MSG_SIZE; int frag_num = -1; @@ -2592,10 +2800,15 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, goto out; } - switch (why) { - default: - reason = "Unknown"; - break; + if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { + reason = "retry count exceeded due to no listener"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; case IBLND_REJECT_RDMA_FRAGS: { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2605,10 +2818,16 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, goto out; } tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; +#ifdef HAVE_IB_GET_DMA_MR + /* + * This check only makes sense if the kernel supports global + * memory registration. Otherwise, map_on_demand will never == 0 + */ if (!tunables->lnd_map_on_demand) { reason = "map_on_demand must be enabled"; goto out; } +#endif if (conn->ibc_max_frags <= frag_num) { reason = "unsupported max frags"; goto out; @@ -2670,9 +2889,9 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, } static void -kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) +kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) { - kib_peer_ni_t *peer_ni = conn->ibc_peer; + struct kib_peer_ni *peer_ni = conn->ibc_peer; LASSERT (!in_interrupt()); LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); @@ -2684,17 +2903,18 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) break; case IB_CM_REJ_INVALID_SERVICE_ID: + peer_ni->ibp_retries++; kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, IBLND_REJECT_INVALID_SRV_ID, NULL); - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - *kiblnd_tunables.kib_service); - break; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; case IB_CM_REJ_CONSUMER_DEFINED: - if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { - kib_rej_t *rej = priv; - kib_connparams_t *cp = NULL; + if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { + struct kib_rej *rej = priv; + struct kib_connparams *cp = NULL; int flip = 0; __u64 incarnation = -1; @@ -2707,7 +2927,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) * it rejected me then upgrade to V2, I have no idea * about the upgrading and try to reconnect with V1, * in this case upgraded V2 can find out I'm trying to - * talk to the old guy and reject me(incarnation is -1). + * talk to the old guy and reject me(incarnation is -1). 
*/ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || @@ -2717,7 +2937,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) flip = 1; } - if (priv_nob >= sizeof(kib_rej_t) && + if (priv_nob >= sizeof(struct kib_rej) && rej->ibr_version > IBLND_MSG_VERSION_1) { /* priv_nob is always 148 in current version * of OFED, so we still need to check version. @@ -2786,7 +3006,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) } break; } - /* fall through */ + fallthrough; default: CNETERR("%s rejected: reason %d, size %d\n", libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); @@ -2797,12 +3017,12 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) } static void -kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) +kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) { - kib_peer_ni_t *peer_ni = conn->ibc_peer; - struct lnet_ni *ni = peer_ni->ibp_ni; - kib_net_t *net = ni->ni_data; - kib_msg_t *msg = priv; + struct kib_peer_ni *peer_ni = conn->ibc_peer; + struct lnet_ni *ni = peer_ni->ibp_ni; + struct kib_net *net = ni->ni_data; + struct kib_msg *msg = priv; int ver = conn->ibc_version; int rc = kiblnd_unpack_msg(msg, priv_nob); unsigned long flags; @@ -2898,12 +3118,12 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) } static int -kiblnd_active_connect (struct rdma_cm_id *cmid) +kiblnd_active_connect(struct rdma_cm_id *cmid) { - kib_peer_ni_t *peer_ni = (kib_peer_ni_t *)cmid->context; - kib_conn_t *conn; - kib_msg_t *msg; - struct rdma_conn_param cp; + struct kib_peer_ni *peer_ni = cmid->context; + struct kib_conn *conn; + struct kib_msg *msg; + struct rdma_conn_param cp; int version; __u64 incarnation; unsigned long flags; @@ -2951,8 +3171,7 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) LASSERT(cmid->context == (void *)conn); LASSERT(conn->ibc_cmid == cmid); - - rc = rdma_connect(cmid, &cp); + rc = rdma_connect_locked(cmid, &cp); if (rc != 0) { CERROR("Can't connect to %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), rc); @@ -2966,9 +3185,9 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) int kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) { - kib_peer_ni_t *peer_ni; - kib_conn_t *conn; - int rc; + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + int rc; switch (event->event) { default: @@ -2978,14 +3197,14 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) case RDMA_CM_EVENT_CONNECT_REQUEST: /* destroy cmid on failure */ - rc = kiblnd_passive_connect(cmid, + rc = kiblnd_passive_connect(cmid, (void *)KIBLND_CONN_PARAM(event), KIBLND_CONN_PARAM_LEN(event)); CDEBUG(D_NET, "connreq: %d\n", rc); return rc; - + case RDMA_CM_EVENT_ADDR_ERROR: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CNETERR("%s: ADDR ERROR %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); @@ -2993,7 +3212,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return -EHOSTUNREACH; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ADDR_RESOLVED: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CDEBUG(D_NET,"%s Addr resolved: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); @@ -3002,12 +3221,12 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) CNETERR("Can't resolve address for %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); rc = event->status; - } else { - rc = 
rdma_resolve_route( - cmid, *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_route( + cmid, lnet_get_lnd_timeout() * 1000); if (rc == 0) { - kib_net_t *net = peer_ni->ibp_ni->ni_data; - kib_dev_t *dev = net->ibn_dev; + struct kib_net *net = peer_ni->ibp_ni->ni_data; + struct kib_dev *dev = net->ibn_dev; CDEBUG(D_NET, "%s: connection bound to "\ "%s:%pI4h:%s\n", @@ -3027,7 +3246,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return rc; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ROUTE_ERROR: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CNETERR("%s: ROUTE ERROR %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH); @@ -3035,7 +3254,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return -EHOSTUNREACH; /* rc != 0 destroys cmid */ case RDMA_CM_EVENT_ROUTE_RESOLVED: - peer_ni = (kib_peer_ni_t *)cmid->context; + peer_ni = cmid->context; CDEBUG(D_NET,"%s Route resolved: %d\n", libcfs_nid2str(peer_ni->ibp_nid), event->status); @@ -3047,9 +3266,9 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) kiblnd_peer_connect_failed(peer_ni, 1, event->status); kiblnd_peer_decref(peer_ni); return event->status; /* rc != 0 destroys cmid */ - + case RDMA_CM_EVENT_UNREACHABLE: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); CNETERR("%s: UNREACHABLE %d\n", @@ -3059,7 +3278,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_CONNECT_ERROR: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); CNETERR("%s: CONNECT ERROR %d\n", @@ -3069,7 +3288,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_REJECTED: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; switch (conn->ibc_state) { default: LBUG(); @@ -3091,7 +3310,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_ESTABLISHED: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; switch (conn->ibc_state) { default: LBUG(); @@ -3118,7 +3337,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) return 0; case RDMA_CM_EVENT_DISCONNECTED: - conn = (kib_conn_t *)cmid->context; + conn = cmid->context; if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { CERROR("%s DISCONNECTED\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); @@ -3145,13 +3364,13 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } static int -kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) +kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) { - kib_tx_t *tx; + struct kib_tx *tx; struct list_head *ttmp; list_for_each(ttmp, txs) { - tx = list_entry(ttmp, kib_tx_t, tx_list); + tx = list_entry(ttmp, struct kib_tx, tx_list); if (txs != &conn->ibc_active_txs) { LASSERT(tx->tx_queued); @@ -3160,10 +3379,11 @@ kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) LASSERT(tx->tx_waiting || tx->tx_sending != 0); } - if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { - CERROR("Timed out tx: %s, %lu seconds\n", + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CERROR("Timed out tx: %s, %lld seconds\n", kiblnd_queue2str(conn, txs), - 
cfs_duration_sec(jiffies - tx->tx_deadline)); + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); return 1; } } @@ -3172,7 +3392,7 @@ kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) } static int -kiblnd_conn_timed_out_locked(kib_conn_t *conn) +kiblnd_conn_timed_out_locked(struct kib_conn *conn) { return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || @@ -3189,9 +3409,9 @@ kiblnd_check_conns (int idx) struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs); struct list_head *peers = &kiblnd_data.kib_peers[idx]; struct list_head *ptmp; - kib_peer_ni_t *peer_ni; - kib_conn_t *conn; - kib_tx_t *tx, *tx_tmp; + struct kib_peer_ni *peer_ni; + struct kib_conn *conn; + struct kib_tx *tx, *tx_tmp; struct list_head *ctmp; unsigned long flags; @@ -3201,14 +3421,15 @@ kiblnd_check_conns (int idx) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); list_for_each(ptmp, peers) { - peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); /* Check tx_deadline */ list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { - if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { - CWARN("Timed out tx for %s: %lu seconds\n", + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CWARN("Timed out tx for %s: %lld seconds\n", libcfs_nid2str(peer_ni->ibp_nid), - cfs_duration_sec(jiffies - tx->tx_deadline)); + ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC); list_move(&tx->tx_list, &timedout_txs); } } @@ -3217,7 +3438,7 @@ kiblnd_check_conns (int idx) int timedout; int sendnoop; - conn = list_entry(ctmp, kib_conn_t, ibc_list); + conn = list_entry(ctmp, struct kib_conn, ibc_list); LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); @@ -3231,11 +3452,10 @@ kiblnd_check_conns (int idx) } if (timedout) { - CERROR("Timed out RDMA with %s (%lu): " + CERROR("Timed out RDMA with %s (%lld): " "c: %u, oc: %u, rc: %u\n", libcfs_nid2str(peer_ni->ibp_nid), - cfs_duration_sec(cfs_time_current() - - peer_ni->ibp_last_alive), + ktime_get_seconds() - peer_ni->ibp_last_alive, conn->ibc_credits, conn->ibc_outstanding_credits, conn->ibc_reserved_credits); @@ -3253,14 +3473,15 @@ kiblnd_check_conns (int idx) write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); if (!list_empty(&timedout_txs)) - kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT); + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT, + LNET_MSG_STATUS_LOCAL_TIMEOUT); /* Handle timeout by closing the whole * connection. We can only be sure RDMA activity * has ceased once the QP has been modified. */ while (!list_empty(&closes)) { conn = list_entry(closes.next, - kib_conn_t, ibc_connd_list); + struct kib_conn, ibc_connd_list); list_del(&conn->ibc_connd_list); kiblnd_close_conn(conn, -ETIMEDOUT); kiblnd_conn_decref(conn); @@ -3271,7 +3492,7 @@ kiblnd_check_conns (int idx) * free to do it last time... 
*/ while (!list_empty(&checksends)) { conn = list_entry(checksends.next, - kib_conn_t, ibc_connd_list); + struct kib_conn, ibc_connd_list); list_del(&conn->ibc_connd_list); spin_lock(&conn->ibc_lock); @@ -3283,7 +3504,7 @@ kiblnd_check_conns (int idx) } static void -kiblnd_disconnect_conn (kib_conn_t *conn) +kiblnd_disconnect_conn(struct kib_conn *conn) { LASSERT (!in_interrupt()); LASSERT (current == kiblnd_data.kib_connd); @@ -3312,7 +3533,7 @@ kiblnd_connd (void *arg) spinlock_t *lock= &kiblnd_data.kib_connd_lock; wait_queue_entry_t wait; unsigned long flags; - kib_conn_t *conn; + struct kib_conn *conn; int timeout; int i; int dropped_lock; @@ -3332,10 +3553,10 @@ kiblnd_connd (void *arg) dropped_lock = 0; if (!list_empty(&kiblnd_data.kib_connd_zombies)) { - kib_peer_ni_t *peer_ni = NULL; + struct kib_peer_ni *peer_ni = NULL; conn = list_entry(kiblnd_data.kib_connd_zombies.next, - kib_conn_t, ibc_list); + struct kib_conn, ibc_list); list_del(&conn->ibc_list); if (conn->ibc_reconnect) { peer_ni = conn->ibc_peer; @@ -3345,11 +3566,13 @@ kiblnd_connd (void *arg) spin_unlock_irqrestore(lock, flags); dropped_lock = 1; - kiblnd_destroy_conn(conn, !peer_ni); + kiblnd_destroy_conn(conn); spin_lock_irqsave(lock, flags); - if (!peer_ni) + if (!peer_ni) { + LIBCFS_FREE(conn, sizeof(*conn)); continue; + } conn->ibc_peer = peer_ni; if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE) @@ -3362,7 +3585,7 @@ kiblnd_connd (void *arg) if (!list_empty(&kiblnd_data.kib_connd_conns)) { conn = list_entry(kiblnd_data.kib_connd_conns.next, - kib_conn_t, ibc_list); + struct kib_conn, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); @@ -3375,7 +3598,8 @@ kiblnd_connd (void *arg) } while (reconn < KIB_RECONN_BREAK) { - if (kiblnd_data.kib_reconn_sec != ktime_get_real_seconds()) { + if (kiblnd_data.kib_reconn_sec != + ktime_get_real_seconds()) { kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); list_splice_init(&kiblnd_data.kib_reconn_wait, &kiblnd_data.kib_reconn_list); @@ -3385,7 +3609,7 @@ kiblnd_connd (void *arg) break; conn = list_entry(kiblnd_data.kib_reconn_list.next, - kib_conn_t, ibc_list); + struct kib_conn, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); @@ -3404,6 +3628,7 @@ kiblnd_connd (void *arg) const int n = 4; const int p = 1; int chunk = kiblnd_data.kib_peer_hash_size; + unsigned int lnd_timeout; spin_unlock_irqrestore(lock, flags); dropped_lock = 1; @@ -3416,11 +3641,11 @@ kiblnd_connd (void *arg) * connection within (n+1)/n times the timeout * interval. 
*/ - if (*kiblnd_tunables.kib_timeout > n * p) - chunk = (chunk * n * p) / - *kiblnd_tunables.kib_timeout; - if (chunk == 0) - chunk = 1; + lnd_timeout = lnet_get_lnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; for (i = 0; i < chunk; i++) { kiblnd_check_conns(peer_index); @@ -3456,23 +3681,36 @@ kiblnd_connd (void *arg) void kiblnd_qp_event(struct ib_event *event, void *arg) { - kib_conn_t *conn = arg; + struct kib_conn *conn = arg; - switch (event->event) { - case IB_EVENT_COMM_EST: - CDEBUG(D_NET, "%s established\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* We received a packet but connection isn't established * probably handshake packet was lost, so free to * force make connection established */ rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); - return; + return; - default: - CERROR("%s: Async QP event type %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); - return; - } + case IB_EVENT_PORT_ERR: + case IB_EVENT_DEVICE_FATAL: + CERROR("Fatal device error for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); + return; + + case IB_EVENT_PORT_ACTIVE: + CERROR("Port reactivated for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } } static void @@ -3518,9 +3756,9 @@ kiblnd_cq_completion(struct ib_cq *cq, void *arg) * consuming my CQ I could be called after all completions have * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 * and this CQ is about to be destroyed so I NOOP. */ - kib_conn_t *conn = (kib_conn_t *)arg; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; + struct kib_conn *conn = arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; LASSERT(cq == conn->ibc_cq); @@ -3545,7 +3783,7 @@ kiblnd_cq_completion(struct ib_cq *cq, void *arg) void kiblnd_cq_event(struct ib_event *event, void *arg) { - kib_conn_t *conn = arg; + struct kib_conn *conn = arg; CERROR("%s: async CQ event type %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); @@ -3556,7 +3794,7 @@ kiblnd_scheduler(void *arg) { long id = (long)arg; struct kib_sched_info *sched; - kib_conn_t *conn; + struct kib_conn *conn; wait_queue_entry_t wait; unsigned long flags; struct ib_wc wc; @@ -3594,7 +3832,7 @@ kiblnd_scheduler(void *arg) if (!list_empty(&sched->ibs_conns)) { conn = list_entry(sched->ibs_conns.next, - kib_conn_t, ibc_sched_list); + struct kib_conn, ibc_sched_list); /* take over kib_sched_conns' ref on conn... 
*/ LASSERT(conn->ibc_scheduled); list_del(&conn->ibc_sched_list); @@ -3698,7 +3936,7 @@ int kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; - kib_dev_t *dev; + struct kib_dev *dev; struct net *ns = arg; wait_queue_entry_t wait; unsigned long flags; @@ -3717,8 +3955,7 @@ kiblnd_failover_thread(void *arg) list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, ibd_fail_list) { - if (cfs_time_before(cfs_time_current(), - dev->ibd_next_failover)) + if (ktime_get_seconds() < dev->ibd_next_failover) continue; do_failover = 1; break; @@ -3736,13 +3973,13 @@ kiblnd_failover_thread(void *arg) LASSERT (dev->ibd_failover); dev->ibd_failover = 0; if (rc >= 0) { /* Device is OK or failover succeed */ - dev->ibd_next_failover = cfs_time_shift(3); + dev->ibd_next_failover = ktime_get_seconds() + 3; continue; } /* failed to failover, retry later */ - dev->ibd_next_failover = - cfs_time_shift(min(dev->ibd_failed_failover, 10)); + dev->ibd_next_failover = ktime_get_seconds() + + min(dev->ibd_failed_failover, 10); if (kiblnd_dev_can_failover(dev)) { list_add_tail(&dev->ibd_fail_list, &kiblnd_data.kib_failed_devs); diff --git a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 72cb50ecd14f5..39f9a620d04a4 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/drivers/staging/lustrefsx/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,7 +82,7 @@ static int peer_buffer_credits = 0; module_param(peer_buffer_credits, int, 0444); MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); -static int peer_timeout = 180; +static int peer_timeout = DEFAULT_PEER_TIMEOUT; module_param(peer_timeout, int, 0444); MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); @@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); static int retry_count = 5; module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); static int rnr_retry_count = 6; module_param(rnr_retry_count, int, 0644); @@ -110,16 +110,46 @@ static int concurrent_sends; module_param(concurrent_sends, int, 0444); MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); +static int use_fastreg_gaps; +module_param(use_fastreg_gaps, int, 0444); +MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop"); + +/* + * map_on_demand is a flag used to determine if we can use FMR or FastReg. + * This is applicable for kernels which support global memory regions. For + * later kernels this flag is always enabled, since we will always either + * use FMR or FastReg + * For kernels which support global memory regions map_on_demand defaults + * to 0 which means we will be using global memory regions exclusively. + * If it is set to a value other than 0, then we will behave as follows: + * 1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. 
Attempt to transmit using global memory regions only if + * map-on-demand is not turned on, otherwise use FMR or FastReg + * 5. In case of transmitting tx with GAPS over FMR we will need to + * transmit it with multiple fragments. Look at the comments in + * kiblnd_fmr_map_tx() for an explanation of the behavior. + * + * For later kernels we default map_on_demand to 1 and not allow + * it to be set to 0, since there is no longer support for global memory + * regions. Behavior: + * 1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS + * 2. Create FMR/FastReg pools + * 3. Negotiate the supported number of fragments per connection + * 4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of + * the behavior when transmit with GAPS verses contiguous. + */ #ifdef HAVE_IB_GET_DMA_MR #define IBLND_DEFAULT_MAP_ON_DEMAND 0 -#define IBLND_MIN_MAP_ON_DEMAND 0 +#define MOD_STR "map on demand" #else -#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS -#define IBLND_MIN_MAP_ON_DEMAND 1 +#define IBLND_DEFAULT_MAP_ON_DEMAND 1 +#define MOD_STR "map on demand (obsolete)" #endif static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; module_param(map_on_demand, int, 0444); -MODULE_PARM_DESC(map_on_demand, "map on demand"); +MODULE_PARM_DESC(map_on_demand, MOD_STR); /* NB: this value is shared by all CPTs, it can grow at runtime */ static int fmr_pool_size = 512; @@ -156,7 +186,7 @@ static unsigned int wrq_sge = 2; module_param(wrq_sge, uint, 0444); MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request"); -kib_tunables_t kiblnd_tunables = { +struct kib_tunables kiblnd_tunables = { .kib_dev_failover = &dev_failover, .kib_service = &service, .kib_cksum = &cksum, @@ -170,6 +200,7 @@ kib_tunables_t kiblnd_tunables = { .kib_use_priv_port = &use_privileged_port, .kib_nscheds = &nscheds, .kib_wrq_sge = &wrq_sge, + .kib_use_fastreg_gaps = &use_fastreg_gaps, }; static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; @@ -236,6 +267,15 @@ kiblnd_tunables_setup(struct lnet_ni *ni) net_tunables->lct_peer_tx_credits = net_tunables->lct_max_tx_credits; +#ifndef HAVE_IB_GET_DMA_MR + /* + * For kernels which do not support global memory regions, always + * enable map_on_demand + */ + if (tunables->lnd_map_on_demand == 0) + tunables->lnd_map_on_demand = 1; +#endif + if (!tunables->lnd_peercredits_hiw) tunables->lnd_peercredits_hiw = peer_credits_hiw; @@ -245,30 +285,8 @@ kiblnd_tunables_setup(struct lnet_ni *ni) if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits) tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1; - if (tunables->lnd_map_on_demand < IBLND_MIN_MAP_ON_DEMAND || - tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { - /* Use the default */ - CWARN("Invalid map_on_demand (%d), expects %d - %d. 
Using default of %d\n", - tunables->lnd_map_on_demand, IBLND_MIN_MAP_ON_DEMAND, - IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND); - tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; - } - - if (tunables->lnd_map_on_demand == 1) { - /* don't make sense to create map if only one fragment */ - tunables->lnd_map_on_demand = 2; - } - - if (tunables->lnd_concurrent_sends == 0) { - if (tunables->lnd_map_on_demand > 0 && - tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { - tunables->lnd_concurrent_sends = - net_tunables->lct_peer_tx_credits * 2; - } else { - tunables->lnd_concurrent_sends = - net_tunables->lct_peer_tx_credits; - } - } + if (tunables->lnd_concurrent_sends == 0) + tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits; if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2) tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2; @@ -303,7 +321,7 @@ int kiblnd_tunables_init(void) { default_tunables.lnd_version = CURRENT_LND_VERSION; - default_tunables.lnd_peercredits_hiw = peer_credits_hiw, + default_tunables.lnd_peercredits_hiw = peer_credits_hiw; default_tunables.lnd_map_on_demand = map_on_demand; default_tunables.lnd_concurrent_sends = concurrent_sends; default_tunables.lnd_fmr_pool_size = fmr_pool_size; diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c index d0b8756143580..32dda0a5769b3 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,17 +41,17 @@ #include static struct lnet_lnd the_ksocklnd; -ksock_nal_data_t ksocknal_data; +struct ksock_nal_data ksocknal_data; -static ksock_interface_t * +static struct ksock_interface * ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) { - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; int i; - ksock_interface_t *iface; + struct ksock_interface *iface; for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(i < LNET_NUM_INTERFACES); + LASSERT(i < LNET_INTERFACES_NUM); iface = &net->ksnn_interfaces[i]; if (iface->ksni_ipaddr == ip) @@ -61,10 +61,10 @@ ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) return NULL; } -static ksock_route_t * -ksocknal_create_route (__u32 ipaddr, int port) +static struct ksock_route * +ksocknal_create_route(__u32 ipaddr, int port) { - ksock_route_t *route; + struct ksock_route *route; LIBCFS_ALLOC (route, sizeof (*route)); if (route == NULL) @@ -86,7 +86,7 @@ ksocknal_create_route (__u32 ipaddr, int port) } void -ksocknal_destroy_route (ksock_route_t *route) +ksocknal_destroy_route(struct ksock_route *route) { LASSERT (atomic_read(&route->ksnr_refcount) == 0); @@ -97,12 +97,12 @@ ksocknal_destroy_route (ksock_route_t *route) } static int -ksocknal_create_peer(ksock_peer_ni_t **peerp, struct lnet_ni *ni, +ksocknal_create_peer(struct ksock_peer_ni **peerp, struct lnet_ni *ni, struct lnet_process_id id) { - int cpt = lnet_cpt_of_nid(id.nid, ni); - ksock_net_t *net = ni->ni_data; - ksock_peer_ni_t *peer_ni; + int cpt = lnet_cpt_of_nid(id.nid, ni); + struct ksock_net *net = ni->ni_data; + struct ksock_peer_ni *peer_ni; LASSERT(id.nid != LNET_NID_ANY); LASSERT(id.pid != LNET_PID_ANY); @@ -146,9 +146,9 @@ ksocknal_create_peer(ksock_peer_ni_t **peerp, struct lnet_ni *ni, } void -ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni) +ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni) { - ksock_net_t *net = peer_ni->ksnp_ni->ni_data; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; CDEBUG (D_NET, "peer_ni %s %p deleted\n", libcfs_id2str(peer_ni->ksnp_id), peer_ni); @@ -171,16 +171,15 @@ ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni) spin_unlock_bh(&net->ksnn_lock); } -ksock_peer_ni_t * +struct ksock_peer_ni * ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) { struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); struct list_head *tmp; - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; list_for_each(tmp, peer_list) { - - peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); LASSERT(!peer_ni->ksnp_closing); @@ -199,10 +198,10 @@ ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) return NULL; } -ksock_peer_ni_t * +struct ksock_peer_ni * ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) { - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; read_lock(&ksocknal_data.ksnd_global_lock); peer_ni = ksocknal_find_peer_locked(ni, id); @@ -214,14 +213,14 @@ ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) } static void -ksocknal_unlink_peer_locked(ksock_peer_ni_t *peer_ni) +ksocknal_unlink_peer_locked(struct ksock_peer_ni *peer_ni) { int i; __u32 ip; - ksock_interface_t *iface; + struct ksock_interface *iface; for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) { - LASSERT(i < LNET_NUM_INTERFACES); + LASSERT(i < LNET_INTERFACES_NUM); ip = peer_ni->ksnp_passive_ips[i]; iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip); @@ -250,19 
+249,19 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, int *port, int *conn_count, int *share_count) { - ksock_peer_ni_t *peer_ni; - struct list_head *ptmp; - ksock_route_t *route; - struct list_head *rtmp; - int i; - int j; - int rc = -ENOENT; + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct ksock_route *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -300,7 +299,7 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, if (index-- > 0) continue; - route = list_entry(rtmp, ksock_route_t, + route = list_entry(rtmp, struct ksock_route, ksnr_list); *id = peer_ni->ksnp_id; @@ -320,11 +319,11 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, } static void -ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +ksocknal_associate_route_conn_locked(struct ksock_route *route, struct ksock_conn *conn) { - ksock_peer_ni_t *peer_ni = route->ksnr_peer; - int type = conn->ksnc_type; - ksock_interface_t *iface; + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + int type = conn->ksnc_type; + struct ksock_interface *iface; conn->ksnc_route = route; ksocknal_route_addref(route); @@ -364,11 +363,11 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) } static void -ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) +ksocknal_add_route_locked(struct ksock_peer_ni *peer_ni, struct ksock_route *route) { struct list_head *tmp; - ksock_conn_t *conn; - ksock_route_t *route2; + struct ksock_conn *conn; + struct ksock_route *route2; LASSERT(!peer_ni->ksnp_closing); LASSERT(route->ksnr_peer == NULL); @@ -378,7 +377,7 @@ ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) /* LASSERT(unique) */ list_for_each(tmp, &peer_ni->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); + route2 = list_entry(tmp, struct ksock_route, ksnr_list); if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { CERROR("Duplicate route %s %pI4h\n", @@ -394,7 +393,7 @@ ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) list_add_tail(&route->ksnr_list, &peer_ni->ksnp_routes); list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn->ksnc_ipaddr != route->ksnr_ipaddr) continue; @@ -405,19 +404,19 @@ ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route) } static void -ksocknal_del_route_locked (ksock_route_t *route) +ksocknal_del_route_locked(struct ksock_route *route) { - ksock_peer_ni_t *peer_ni = route->ksnr_peer; - ksock_interface_t *iface; - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; + struct ksock_peer_ni *peer_ni = route->ksnr_peer; + struct ksock_interface *iface; + struct ksock_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; LASSERT(!route->ksnr_deleted); /* Close associated conns */ list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); if (conn->ksnc_route != route) continue; @@ -449,11 +448,11 @@ 
ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, int port) { struct list_head *tmp; - ksock_peer_ni_t *peer_ni; - ksock_peer_ni_t *peer2; - ksock_route_t *route; - ksock_route_t *route2; - int rc; + struct ksock_peer_ni *peer_ni; + struct ksock_peer_ni *peer2; + struct ksock_route *route; + struct ksock_route *route2; + int rc; if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY) @@ -473,7 +472,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, write_lock_bh(&ksocknal_data.ksnd_global_lock); /* always called with a ref on ni, so shutdown can't have started */ - LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); peer2 = ksocknal_find_peer_locked(ni, id); if (peer2 != NULL) { @@ -487,7 +486,7 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, route2 = NULL; list_for_each(tmp, &peer_ni->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); + route2 = list_entry(tmp, struct ksock_route, ksnr_list); if (route2->ksnr_ipaddr == ipaddr) break; @@ -508,13 +507,13 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, } static void -ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) +ksocknal_del_peer_locked(struct ksock_peer_ni *peer_ni, __u32 ip) { - ksock_conn_t *conn; - ksock_route_t *route; + struct ksock_conn *conn; + struct ksock_route *route; struct list_head *tmp; struct list_head *nxt; - int nshared; + int nshared; LASSERT(!peer_ni->ksnp_closing); @@ -522,7 +521,7 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) ksocknal_peer_addref(peer_ni); list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); /* no match */ if (!(ip == 0 || route->ksnr_ipaddr == ip)) @@ -535,7 +534,7 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) nshared = 0; list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); nshared += route->ksnr_share_count; } @@ -544,7 +543,7 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) * left */ list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); /* we should only be removing auto-entries */ LASSERT(route->ksnr_share_count == 0); @@ -552,27 +551,27 @@ ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); ksocknal_close_conn_locked(conn, 0); } } ksocknal_peer_decref(peer_ni); - /* NB peer_ni unlinks itself when last conn/route is removed */ + /* NB peer_ni unlinks itself when last conn/route is removed */ } static int ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) { - struct list_head zombies = LIST_HEAD_INIT(zombies); + struct list_head zombies = LIST_HEAD_INIT(zombies); struct list_head *ptmp; struct list_head *pnxt; - ksock_peer_ni_t *peer_ni; - int lo; - int hi; - int i; - int rc = -ENOENT; + struct ksock_peer_ni *peer_ni; + int lo; + int hi; + int i; + int rc = -ENOENT; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -588,7 +587,7 @@ ksocknal_del_peer(struct lnet_ni *ni, struct 
lnet_process_id id, __u32 ip) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -625,20 +624,20 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) return rc; } -static ksock_conn_t * +static struct ksock_conn * ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) { - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; struct list_head *ptmp; - ksock_conn_t *conn; + struct ksock_conn *conn; struct list_head *ctmp; - int i; + int i; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); LASSERT(!peer_ni->ksnp_closing); @@ -649,7 +648,7 @@ ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) if (index-- > 0) continue; - conn = list_entry(ctmp, ksock_conn_t, + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); ksocknal_conn_addref(conn); read_unlock(&ksocknal_data. \ @@ -663,50 +662,37 @@ ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) return NULL; } -static ksock_sched_t * +static struct ksock_sched * ksocknal_choose_scheduler_locked(unsigned int cpt) { - struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; - ksock_sched_t *sched; - int i; + struct ksock_sched *sched = ksocknal_data.ksnd_schedulers[cpt]; + int i; - if (info->ksi_nthreads == 0) { - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - if (info->ksi_nthreads > 0) { + if (sched->kss_nthreads == 0) { + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + if (sched->kss_nthreads > 0) { CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n", - cpt, info->ksi_cpt); - goto select_sched; + cpt, sched->kss_cpt); + return sched; } } return NULL; } -select_sched: - sched = &info->ksi_scheds[0]; - /* - * NB: it's safe so far, but info->ksi_nthreads could be changed - * at runtime when we have dynamic LNet configuration, then we - * need to take care of this. 
- */ - for (i = 1; i < info->ksi_nthreads; i++) { - if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) - sched = &info->ksi_scheds[i]; - } - return sched; } static int ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) { - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; int i; int nip; read_lock(&ksocknal_data.ksnd_global_lock); nip = net->ksnn_ninterfaces; - LASSERT(nip <= LNET_NUM_INTERFACES); + LASSERT(nip <= LNET_INTERFACES_NUM); /* * Only offer interfaces for additional connections if I have @@ -727,14 +713,14 @@ ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) } static int -ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) +ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) { - int best_netmatch = 0; - int best_xor = 0; - int best = -1; - int this_xor; - int this_netmatch; - int i; + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; for (i = 0; i < nips; i++) { if (ips[i] == 0) @@ -759,21 +745,21 @@ ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) } static int -ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) +ksocknal_select_ips(struct ksock_peer_ni *peer_ni, __u32 *peerips, int n_peerips) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - ksock_net_t *net = peer_ni->ksnp_ni->ni_data; - ksock_interface_t *iface; - ksock_interface_t *best_iface; - int n_ips; - int i; - int j; - int k; - __u32 ip; - __u32 xor; - int this_netmatch; - int best_netmatch; - int best_npeers; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct ksock_net *net = peer_ni->ksnp_ni->ni_data; + struct ksock_interface *iface; + struct ksock_interface *best_iface; + int n_ips; + int i; + int j; + int k; + u32 ip; + u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; /* CAVEAT EMPTOR: We do all our interface matching with an * exclusive hold of global lock at IRQ priority. 
We're only @@ -785,8 +771,8 @@ ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) write_lock_bh(global_lock); - LASSERT(n_peerips <= LNET_NUM_INTERFACES); - LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); + LASSERT(n_peerips <= LNET_INTERFACES_NUM); + LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); /* Only match interfaces for additional connections * if I have > 1 interface */ @@ -865,17 +851,17 @@ ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips) } static void -ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, +ksocknal_create_routes(struct ksock_peer_ni *peer_ni, int port, __u32 *peer_ipaddrs, int npeer_ipaddrs) { - ksock_route_t *newroute = NULL; + struct ksock_route *newroute = NULL; rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; struct lnet_ni *ni = peer_ni->ksnp_ni; - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; struct list_head *rtmp; - ksock_route_t *route; - ksock_interface_t *iface; - ksock_interface_t *best_iface; + struct ksock_route *route; + struct ksock_interface *iface; + struct ksock_interface *best_iface; int best_netmatch; int this_netmatch; int best_nroutes; @@ -896,7 +882,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, return; } - LASSERT(npeer_ipaddrs <= LNET_NUM_INTERFACES); + LASSERT(npeer_ipaddrs <= LNET_INTERFACES_NUM); for (i = 0; i < npeer_ipaddrs; i++) { if (newroute != NULL) { @@ -919,7 +905,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, /* Already got a route? */ route = NULL; list_for_each(rtmp, &peer_ni->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); + route = list_entry(rtmp, struct ksock_route, ksnr_list); if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) break; @@ -933,7 +919,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, best_nroutes = 0; best_netmatch = 0; - LASSERT(net->ksnn_ninterfaces <= LNET_NUM_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_INTERFACES_NUM); /* Select interface to connect from */ for (j = 0; j < net->ksnn_ninterfaces; j++) { @@ -941,7 +927,7 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, /* Using this interface already? 
*/ list_for_each(rtmp, &peer_ni->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, + route = list_entry(rtmp, struct ksock_route, ksnr_list); if (route->ksnr_myipaddr == iface->ksni_ipaddr) @@ -985,10 +971,10 @@ ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port, int ksocknal_accept(struct lnet_ni *ni, struct socket *sock) { - ksock_connreq_t *cr; - int rc; - __u32 peer_ip; - int peer_port; + struct ksock_connreq *cr; + int rc; + u32 peer_ip; + int peer_port; rc = lnet_sock_getaddr(sock, true, &peer_ip, &peer_port); LASSERT(rc == 0); /* we succeeded before */ @@ -1014,9 +1000,9 @@ ksocknal_accept(struct lnet_ni *ni, struct socket *sock) } static int -ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr) +ksocknal_connecting(struct ksock_peer_ni *peer_ni, __u32 ipaddr) { - ksock_route_t *route; + struct ksock_route *route; list_for_each_entry(route, &peer_ni->ksnp_routes, ksnr_list) { if (route->ksnr_ipaddr == ipaddr) @@ -1026,27 +1012,27 @@ ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr) } int -ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, +ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, struct socket *sock, int type) { - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct list_head zombies = LIST_HEAD_INIT(zombies); + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + struct list_head zombies = LIST_HEAD_INIT(zombies); struct lnet_process_id peerid; - struct list_head *tmp; - __u64 incarnation; - ksock_conn_t *conn; - ksock_conn_t *conn2; - ksock_peer_ni_t *peer_ni = NULL; - ksock_peer_ni_t *peer2; - ksock_sched_t *sched; + struct list_head *tmp; + u64 incarnation; + struct ksock_conn *conn; + struct ksock_conn *conn2; + struct ksock_peer_ni *peer_ni = NULL; + struct ksock_peer_ni *peer2; + struct ksock_sched *sched; struct ksock_hello_msg *hello; - int cpt; - ksock_tx_t *tx; - ksock_tx_t *txtmp; - int rc; - int rc2; - int active; - char *warn = NULL; + int cpt; + struct ksock_tx *tx; + struct ksock_tx *txtmp; + int rc; + int rc2; + int active; + char *warn = NULL; active = (route != NULL); @@ -1078,7 +1064,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, atomic_set (&conn->ksnc_tx_nob, 0); LIBCFS_ALLOC(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_NUM_INTERFACES])); + kshm_ips[LNET_INTERFACES_NUM])); if (hello == NULL) { rc = -ENOMEM; goto failed_1; @@ -1148,7 +1134,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, write_lock_bh(global_lock); /* called with a ref on ni, so shutdown can't have started */ - LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + LASSERT(((struct ksock_net *) ni->ni_data)->ksnn_shutdown == 0); peer2 = ksocknal_find_peer_locked(ni, peerid); if (peer2 == NULL) { @@ -1224,7 +1210,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, * loopback connection */ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { list_for_each(tmp, &peer_ni->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || conn2->ksnc_myipaddr != conn->ksnc_myipaddr || @@ -1258,7 +1244,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, * by routes in my peer_ni to match my own route entries so I don't * continually create duplicate routes. 
*/ list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); if (route->ksnr_ipaddr != conn->ksnc_ipaddr) continue; @@ -1268,7 +1254,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, } conn->ksnc_peer = peer_ni; /* conn takes my ref on peer_ni */ - peer_ni->ksnp_last_alive = cfs_time_current(); + peer_ni->ksnp_last_alive = ktime_get_seconds(); peer_ni->ksnp_send_keepalive = 0; peer_ni->ksnp_error = 0; @@ -1281,14 +1267,15 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, * The cpt might have changed if we ended up selecting a non cpt * native scheduler. So use the scheduler's cpt instead. */ - cpt = sched->kss_info->ksi_cpt; + cpt = sched->kss_cpt; sched->kss_nconns++; conn->ksnc_scheduler = sched; - conn->ksnc_tx_last_post = cfs_time_current(); + conn->ksnc_tx_last_post = ktime_get_seconds(); /* Set the deadline for the outgoing HELLO to drain */ conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); smp_mb(); /* order with adding to peer_ni's conn list */ list_add(&conn->ksnc_list, &peer_ni->ksnp_conns); @@ -1319,11 +1306,10 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, */ CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d" - " incarnation:%lld sched[%d:%d]\n", + " incarnation:%lld sched[%d]\n", libcfs_id2str(peerid), conn->ksnc_proto->pro_version, &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, - conn->ksnc_port, incarnation, cpt, - (int)(sched - &sched->kss_info->ksi_scheds[0])); + conn->ksnc_port, incarnation, cpt); if (active) { /* additional routes after interface exchange? */ @@ -1336,7 +1322,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, } LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_NUM_INTERFACES])); + kshm_ips[LNET_INTERFACES_NUM])); /* setup the socket AFTER I've received hello (it disables * SO_LINGER). I might call back to the acceptor who may want @@ -1420,7 +1406,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, failed_1: if (hello != NULL) LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg, - kshm_ips[LNET_NUM_INTERFACES])); + kshm_ips[LNET_INTERFACES_NUM])); LIBCFS_FREE(conn, sizeof(*conn)); @@ -1430,15 +1416,15 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, } void -ksocknal_close_conn_locked (ksock_conn_t *conn, int error) +ksocknal_close_conn_locked(struct ksock_conn *conn, int error) { /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. 
* Caller holds ksnd_global_lock exclusively in irq context */ - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_route_t *route; - ksock_conn_t *conn2; - struct list_head *tmp; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_route *route; + struct ksock_conn *conn2; + struct list_head *tmp; LASSERT(peer_ni->ksnp_error == 0); LASSERT(!conn->ksnc_closing); @@ -1455,7 +1441,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) conn2 = NULL; list_for_each(tmp, &peer_ni->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn2->ksnc_route == route && conn2->ksnc_type == conn->ksnc_type) @@ -1475,7 +1461,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) /* No more connections to this peer_ni */ if (!list_empty(&peer_ni->ksnp_tx_queue)) { - ksock_tx_t *tx; + struct ksock_tx *tx; LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); @@ -1513,10 +1499,10 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) } void -ksocknal_peer_failed (ksock_peer_ni_t *peer_ni) +ksocknal_peer_failed(struct ksock_peer_ni *peer_ni) { - int notify = 0; - cfs_time_t last_alive = 0; + int notify = 0; + time64_t last_alive = 0; /* There has been a connection failure or comms error; but I'll only * tell LNET I think the peer_ni is dead if it's to another kernel and @@ -1540,12 +1526,12 @@ ksocknal_peer_failed (ksock_peer_ni_t *peer_ni) } void -ksocknal_finalize_zcreq(ksock_conn_t *conn) +ksocknal_finalize_zcreq(struct ksock_conn *conn) { - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_tx_t *tx; - ksock_tx_t *tmp; - struct list_head zlist = LIST_HEAD_INIT(zlist); + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); /* NB safe to finalize TXs because closing of socket will * abort all buffered data */ @@ -1568,7 +1554,7 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn) spin_unlock(&peer_ni->ksnp_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); list_del(&tx->tx_zc_list); ksocknal_tx_decref(tx); @@ -1576,15 +1562,15 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn) } void -ksocknal_terminate_conn(ksock_conn_t *conn) +ksocknal_terminate_conn(struct ksock_conn *conn) { /* This gets called by the reaper (guaranteed thread context) to * disengage the socket from its callbacks and close it. * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. 
*/ - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_sched_t *sched = conn->ksnc_scheduler; - int failed = 0; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_sched *sched = conn->ksnc_scheduler; + int failed = 0; LASSERT(conn->ksnc_closing); @@ -1637,10 +1623,9 @@ ksocknal_terminate_conn(ksock_conn_t *conn) } void -ksocknal_queue_zombie_conn (ksock_conn_t *conn) +ksocknal_queue_zombie_conn(struct ksock_conn *conn) { /* Queue the conn for the reaper to destroy */ - LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -1651,9 +1636,9 @@ ksocknal_queue_zombie_conn (ksock_conn_t *conn) } void -ksocknal_destroy_conn (ksock_conn_t *conn) +ksocknal_destroy_conn(struct ksock_conn *conn) { - cfs_time_t last_rcv; + time64_t last_rcv; /* Final coup-de-grace of the reaper */ CDEBUG (D_NET, "connection %p\n", conn); @@ -1670,16 +1655,18 @@ ksocknal_destroy_conn (ksock_conn_t *conn) switch (conn->ksnc_rx_state) { case SOCKNAL_RX_LNET_PAYLOAD: last_rcv = conn->ksnc_rx_deadline - - cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + lnet_get_lnd_timeout(); CERROR("Completing partial receive from %s[%d], " "ip %pI4h:%d, with error, wanted: %d, left: %d, " - "last alive is %ld secs ago\n", + "last alive is %lld secs ago\n", libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, &conn->ksnc_ipaddr, conn->ksnc_port, conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, - cfs_duration_sec(cfs_time_sub(cfs_time_current(), - last_rcv))); - lnet_finalize(conn->ksnc_cookie, -EIO); + ktime_get_seconds() - last_rcv); + if (conn->ksnc_lnet_msg) + conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, -EIO); break; case SOCKNAL_RX_LNET_HEADER: if (conn->ksnc_rx_started) @@ -1715,15 +1702,15 @@ ksocknal_destroy_conn (ksock_conn_t *conn) } int -ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int why) +ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr, int why) { - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; + struct ksock_conn *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); if (ipaddr == 0 || conn->ksnc_ipaddr == ipaddr) { @@ -1736,11 +1723,11 @@ ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int wh } int -ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) +ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) { - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - __u32 ipaddr = conn->ksnc_ipaddr; - int count; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + u32 ipaddr = conn->ksnc_ipaddr; + int count; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1754,13 +1741,13 @@ ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) { - ksock_peer_ni_t *peer_ni; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; + struct ksock_peer_ni *peer_ni; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1774,7 +1761,7 @@ ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, 
&ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); if (!((id.nid == LNET_NID_ANY || id.nid == peer_ni->ksnp_id.nid) && (id.pid == LNET_PID_ANY || id.pid == peer_ni->ksnp_id.pid))) @@ -1818,12 +1805,12 @@ ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive) } void -ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) +ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) { int connect = 1; - cfs_time_t last_alive = 0; - cfs_time_t now = cfs_time_current(); - ksock_peer_ni_t *peer_ni = NULL; + time64_t last_alive = 0; + time64_t now = ktime_get_seconds(); + struct ksock_peer_ni *peer_ni = NULL; rwlock_t *glock = &ksocknal_data.ksnd_global_lock; struct lnet_process_id id = { .nid = nid, @@ -1832,20 +1819,20 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) read_lock(glock); - peer_ni = ksocknal_find_peer_locked(ni, id); - if (peer_ni != NULL) { - struct list_head *tmp; - ksock_conn_t *conn; - int bufnob; + peer_ni = ksocknal_find_peer_locked(ni, id); + if (peer_ni != NULL) { + struct list_head *tmp; + struct ksock_conn *conn; + int bufnob; list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); bufnob = conn->ksnc_sock->sk->sk_wmem_queued; if (bufnob < conn->ksnc_tx_bufnob) { /* something got ACKed */ - conn->ksnc_tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); peer_ni->ksnp_last_alive = now; conn->ksnc_tx_bufnob = bufnob; } @@ -1861,9 +1848,9 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) if (last_alive != 0) *when = last_alive; - CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago, connect %d\n", + CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago, connect %d\n", libcfs_nid2str(nid), peer_ni, - last_alive ? cfs_duration_sec(now - last_alive) : -1, + last_alive ? 
now - last_alive : -1, connect); if (!connect) @@ -1882,12 +1869,12 @@ ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when) } static void -ksocknal_push_peer (ksock_peer_ni_t *peer_ni) +ksocknal_push_peer(struct ksock_peer_ni *peer_ni) { - int index; - int i; - struct list_head *tmp; - ksock_conn_t *conn; + int index; + int i; + struct list_head *tmp; + struct ksock_conn *conn; for (index = 0; ; index++) { read_lock(&ksocknal_data.ksnd_global_lock); @@ -1897,8 +1884,8 @@ ksocknal_push_peer (ksock_peer_ni_t *peer_ni) list_for_each(tmp, &peer_ni->ksnp_conns) { if (i++ == index) { - conn = list_entry(tmp, ksock_conn_t, - ksnc_list); + conn = list_entry(tmp, struct ksock_conn, + ksnc_list); ksocknal_conn_addref(conn); break; } @@ -1934,7 +1921,7 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) int peer_off; /* searching offset in peer_ni hash table */ for (peer_off = 0; ; peer_off++) { - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; int i = 0; read_lock(&ksocknal_data.ksnd_global_lock); @@ -1966,15 +1953,15 @@ ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) static int ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) { - ksock_net_t *net = ni->ni_data; - ksock_interface_t *iface; + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; int rc; int i; int j; struct list_head *ptmp; - ksock_peer_ni_t *peer_ni; + struct ksock_peer_ni *peer_ni; struct list_head *rtmp; - ksock_route_t *route; + struct ksock_route *route; if (ipaddress == 0 || netmask == 0) @@ -1986,7 +1973,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) if (iface != NULL) { /* silently ignore dups */ rc = 0; - } else if (net->ksnn_ninterfaces == LNET_NUM_INTERFACES) { + } else if (net->ksnn_ninterfaces == LNET_INTERFACES_NUM) { rc = -ENOSPC; } else { iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; @@ -1998,7 +1985,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(ptmp, ksock_peer_ni_t, + peer_ni = list_entry(ptmp, struct ksock_peer_ni, ksnp_list); for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) @@ -2007,7 +1994,7 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) list_for_each(rtmp, &peer_ni->ksnp_routes) { route = list_entry(rtmp, - ksock_route_t, + struct ksock_route, ksnr_list); if (route->ksnr_myipaddr == ipaddress) @@ -2026,14 +2013,14 @@ ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) } static void -ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) +ksocknal_peer_del_interface_locked(struct ksock_peer_ni *peer_ni, __u32 ipaddr) { - struct list_head *tmp; - struct list_head *nxt; - ksock_route_t *route; - ksock_conn_t *conn; - int i; - int j; + struct list_head *tmp; + struct list_head *nxt; + struct ksock_route *route; + struct ksock_conn *conn; + int i; + int j; for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) if (peer_ni->ksnp_passive_ips[i] == ipaddr) { @@ -2045,7 +2032,7 @@ ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); if (route->ksnr_myipaddr != ipaddr) continue; @@ -2059,7 +2046,7 @@ ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 
ipaddr) } list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); if (conn->ksnc_myipaddr == ipaddr) ksocknal_close_conn_locked (conn, 0); @@ -2069,14 +2056,14 @@ ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr) static int ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) { - ksock_net_t *net = ni->ni_data; - int rc = -ENOENT; - struct list_head *tmp; - struct list_head *nxt; - ksock_peer_ni_t *peer_ni; - __u32 this_ip; - int i; - int j; + struct ksock_net *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + struct ksock_peer_ni *peer_ni; + u32 this_ip; + int i; + int j; write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -2097,9 +2084,9 @@ ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { list_for_each_safe(tmp, nxt, - &ksocknal_data.ksnd_peers[j]) { - peer_ni = list_entry(tmp, ksock_peer_ni_t, - ksnp_list); + &ksocknal_data.ksnd_peers[j]) { + peer_ni = list_entry(tmp, struct ksock_peer_ni, + ksnp_list); if (peer_ni->ksnp_ni != ni) continue; @@ -2123,8 +2110,8 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) switch(cmd) { case IOC_LIBCFS_GET_INTERFACE: { - ksock_net_t *net = ni->ni_data; - ksock_interface_t *iface; + struct ksock_net *net = ni->ni_data; + struct ksock_interface *iface; read_lock(&ksocknal_data.ksnd_global_lock); @@ -2193,7 +2180,7 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) int txmem; int rxmem; int nagle; - ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); + struct ksock_conn *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); if (conn == NULL) return -ENOENT; @@ -2207,7 +2194,7 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) data->ioc_u32[1] = conn->ksnc_port; data->ioc_u32[2] = conn->ksnc_myipaddr; data->ioc_u32[3] = conn->ksnc_type; - data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_cpt; data->ioc_u32[5] = rxmem; data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; ksocknal_conn_decref(conn); @@ -2246,19 +2233,8 @@ ksocknal_free_buffers (void) { LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); - if (ksocknal_data.ksnd_sched_info != NULL) { - struct ksock_sched_info *info; - int i; - - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - if (info->ksi_scheds != NULL) { - LIBCFS_FREE(info->ksi_scheds, - info->ksi_nthreads_max * - sizeof(info->ksi_scheds[0])); - } - } - cfs_percpt_free(ksocknal_data.ksnd_sched_info); - } + if (ksocknal_data.ksnd_schedulers != NULL) + cfs_percpt_free(ksocknal_data.ksnd_schedulers); LIBCFS_FREE (ksocknal_data.ksnd_peers, sizeof(struct list_head) * @@ -2267,15 +2243,15 @@ ksocknal_free_buffers (void) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - struct list_head zlist; - ksock_tx_t *tx; + struct list_head zlist; + struct ksock_tx *tx; list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); list_del_init(&ksocknal_data.ksnd_idle_noop_txs); spin_unlock(&ksocknal_data.ksnd_tx_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, ksock_tx_t, tx_list); + tx = list_entry(zlist.next, struct ksock_tx, tx_list); list_del(&tx->tx_list); LIBCFS_FREE(tx, tx->tx_desc_size); } @@ -2287,26 +2263,23 @@ ksocknal_free_buffers (void) static void ksocknal_base_shutdown(void) { - struct ksock_sched_info *info; 
- ksock_sched_t *sched; - int i; - int j; + struct ksock_sched *sched; + int i; CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&libcfs_kmemory)); LASSERT (ksocknal_data.ksnd_nnets == 0); - switch (ksocknal_data.ksnd_init) { - default: - LASSERT (0); - /* Fall through */ + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + fallthrough; - case SOCKNAL_INIT_ALL: - case SOCKNAL_INIT_DATA: - LASSERT (ksocknal_data.ksnd_peers != NULL); - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT(ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); - } LASSERT(list_empty(&ksocknal_data.ksnd_nets)); LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); @@ -2314,23 +2287,14 @@ ksocknal_base_shutdown(void) LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); - if (ksocknal_data.ksnd_sched_info != NULL) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (info->ksi_scheds == NULL) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) { - sched = &info->ksi_scheds[j]; - LASSERT(list_empty(&sched->\ - kss_tx_conns)); - LASSERT(list_empty(&sched->\ - kss_rx_conns)); - LASSERT(list_empty(&sched-> \ - kss_zombie_noop_txs)); - LASSERT(sched->kss_nconns == 0); - } + LASSERT(list_empty(&sched->kss_tx_conns)); + LASSERT(list_empty(&sched->kss_rx_conns)); + LASSERT(list_empty(&sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); } } @@ -2339,17 +2303,10 @@ ksocknal_base_shutdown(void) wake_up_all(&ksocknal_data.ksnd_connd_waitq); wake_up_all(&ksocknal_data.ksnd_reaper_waitq); - if (ksocknal_data.ksnd_sched_info != NULL) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (info->ksi_scheds == NULL) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { - sched = &info->ksi_scheds[j]; + if (ksocknal_data.ksnd_schedulers != NULL) { + cfs_percpt_for_each(sched, i, + ksocknal_data.ksnd_schedulers) wake_up_all(&sched->kss_waitq); - } - } } i = 4; @@ -2382,9 +2339,9 @@ ksocknal_base_shutdown(void) static int ksocknal_base_startup(void) { - struct ksock_sched_info *info; - int rc; - int i; + struct ksock_sched *sched; + int rc; + int i; LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); LASSERT (ksocknal_data.ksnd_nnets == 0); @@ -2424,50 +2381,43 @@ ksocknal_base_startup(void) ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; try_module_get(THIS_MODULE); - ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*info)); - if (ksocknal_data.ksnd_sched_info == NULL) + /* Create a scheduler block per available CPT */ + ksocknal_data.ksnd_schedulers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (ksocknal_data.ksnd_schedulers == NULL) goto failed; - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - ksock_sched_t *sched; - int nthrs; + cfs_percpt_for_each(sched, i, ksocknal_data.ksnd_schedulers) { + int nthrs; + /* + * make sure not to allocate more threads than there are + * cores/CPUs in teh CPT + */ nthrs = cfs_cpt_weight(lnet_cpt_table(), i); if (*ksocknal_tunables.ksnd_nscheds > 0) { nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); } else { - /* max to half of CPUs, assume another half should be - * reserved for upper layer modules */ + /* + * max to 
half of CPUs, assume another half should be + * reserved for upper layer modules + */ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); } - info->ksi_nthreads_max = nthrs; - info->ksi_cpt = i; - - if (nthrs != 0) { - LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, - info->ksi_nthreads_max * - sizeof(*sched)); - if (info->ksi_scheds == NULL) - goto failed; - - for (; nthrs > 0; nthrs--) { - sched = &info->ksi_scheds[nthrs - 1]; - - sched->kss_info = info; - spin_lock_init(&sched->kss_lock); - INIT_LIST_HEAD(&sched->kss_rx_conns); - INIT_LIST_HEAD(&sched->kss_tx_conns); - INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); - init_waitqueue_head(&sched->kss_waitq); - } - } + sched->kss_nthreads_max = nthrs; + sched->kss_cpt = i; + + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); } ksocknal_data.ksnd_connd_starting = 0; ksocknal_data.ksnd_connd_failed_stamp = 0; - ksocknal_data.ksnd_connd_starting_stamp = cfs_time_current_sec(); + ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); /* must have at least 2 connds to remain responsive to accepts while * connecting */ if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) @@ -2517,15 +2467,15 @@ ksocknal_base_startup(void) static void ksocknal_debug_peerhash(struct lnet_ni *ni) { - ksock_peer_ni_t *peer_ni = NULL; - struct list_head *tmp; - int i; + struct ksock_peer_ni *peer_ni = NULL; + struct list_head *tmp; + int i; read_lock(&ksocknal_data.ksnd_global_lock); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { - peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list); + peer_ni = list_entry(tmp, struct ksock_peer_ni, ksnp_list); if (peer_ni->ksnp_ni == ni) break; @@ -2534,8 +2484,8 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) } if (peer_ni != NULL) { - ksock_route_t *route; - ksock_conn_t *conn; + struct ksock_route *route; + struct ksock_conn *conn; CWARN ("Active peer_ni on shutdown: %s, ref %d, scnt %d, " "closing %d, accepting %d, err %d, zcookie %llu, " @@ -2548,7 +2498,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) !list_empty(&peer_ni->ksnp_zc_req_list)); list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " "del %d\n", atomic_read(&route->ksnr_refcount), route->ksnr_scheduled, route->ksnr_connecting, @@ -2556,7 +2506,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) } list_for_each(tmp, &peer_ni->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); + conn = list_entry(tmp, struct ksock_conn, ksnc_list); CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", atomic_read(&conn->ksnc_conn_refcount), atomic_read(&conn->ksnc_sock_refcount), @@ -2571,7 +2521,7 @@ ksocknal_debug_peerhash(struct lnet_ni *ni) void ksocknal_shutdown(struct lnet_ni *ni) { - ksock_net_t *net = ni->ni_data; + struct ksock_net *net = ni->ni_data; struct lnet_process_id anyid = { .nid = LNET_NID_ANY, .pid = LNET_PID_ANY, @@ -2621,17 +2571,17 @@ ksocknal_shutdown(struct lnet_ni *ni) } static int -ksocknal_search_new_ipif(ksock_net_t *net) +ksocknal_search_new_ipif(struct ksock_net *net) { - int new_ipif = 0; - int i; + int new_ipif = 0; + int i; for (i = 0; i < net->ksnn_ninterfaces; i++) { - char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; - char *colon = strchr(ifnam, 
':'); - int found = 0; - ksock_net_t *tmp; - int j; + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + struct ksock_net *tmp; + int j; if (colon != NULL) /* ignore alias device */ *colon = 0; @@ -2663,36 +2613,35 @@ ksocknal_search_new_ipif(ksock_net_t *net) } static int -ksocknal_start_schedulers(struct ksock_sched_info *info) +ksocknal_start_schedulers(struct ksock_sched *sched) { int nthrs; int rc = 0; int i; - if (info->ksi_nthreads == 0) { + if (sched->kss_nthreads == 0) { if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = info->ksi_nthreads_max; + nthrs = sched->kss_nthreads_max; } else { nthrs = cfs_cpt_weight(lnet_cpt_table(), - info->ksi_cpt); + sched->kss_cpt); nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); } - nthrs = min(nthrs, info->ksi_nthreads_max); + nthrs = min(nthrs, sched->kss_nthreads_max); } else { - LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + LASSERT(sched->kss_nthreads <= sched->kss_nthreads_max); /* increase two threads if there is new interface */ - nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + nthrs = min(2, sched->kss_nthreads_max - sched->kss_nthreads); } for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - ksock_sched_t *sched; - id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + long id; + char name[20]; + + id = KSOCK_THREAD_ID(sched->kss_cpt, sched->kss_nthreads + i); snprintf(name, sizeof(name), "socknal_sd%02d_%02d", - info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + sched->kss_cpt, (int)KSOCK_THREAD_SID(id)); rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id, name); @@ -2700,35 +2649,35 @@ ksocknal_start_schedulers(struct ksock_sched_info *info) continue; CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - info->ksi_cpt, info->ksi_nthreads + i, rc); + sched->kss_cpt, (int) KSOCK_THREAD_SID(id), rc); break; } - info->ksi_nthreads += i; + sched->kss_nthreads += i; return rc; } static int -ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) { - int newif = ksocknal_search_new_ipif(net); - int rc; - int i; + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table())) return -EINVAL; for (i = 0; i < ncpts; i++) { - struct ksock_sched_info *info; + struct ksock_sched *sched; int cpt = (cpts == NULL) ? 
i : cpts[i]; LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); - info = ksocknal_data.ksnd_sched_info[cpt]; + sched = ksocknal_data.ksnd_schedulers[cpt]; - if (!newif && info->ksi_nthreads > 0) + if (!newif && sched->kss_nthreads > 0) continue; - rc = ksocknal_start_schedulers(info); + rc = ksocknal_start_schedulers(sched); if (rc != 0) return rc; } @@ -2738,8 +2687,9 @@ ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) int ksocknal_startup(struct lnet_ni *ni) { - ksock_net_t *net; - ksock_interface_t *ksi = NULL; + struct ksock_net *net; + struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables; + struct ksock_interface *ksi = NULL; struct lnet_inetdev *ifaces = NULL; int i = 0; int rc; @@ -2759,18 +2709,28 @@ ksocknal_startup(struct lnet_ni *ni) spin_lock_init(&net->ksnn_lock); net->ksnn_incarnation = ktime_get_real_ns(); ni->ni_data = net; - if (!ni->ni_net->net_tunables_set) { - ni->ni_net->net_tunables.lct_peer_timeout = + net_tunables = &ni->ni_net->net_tunables; + + if (net_tunables->lct_peer_timeout == -1) + net_tunables->lct_peer_timeout = *ksocknal_tunables.ksnd_peertimeout; - ni->ni_net->net_tunables.lct_max_tx_credits = + + if (net_tunables->lct_max_tx_credits == -1) + net_tunables->lct_max_tx_credits = *ksocknal_tunables.ksnd_credits; - ni->ni_net->net_tunables.lct_peer_tx_credits = + + if (net_tunables->lct_peer_tx_credits == -1) + net_tunables->lct_peer_tx_credits = *ksocknal_tunables.ksnd_peertxcredits; - ni->ni_net->net_tunables.lct_peer_rtr_credits = - *ksocknal_tunables.ksnd_peerrtrcredits; - ni->ni_net->net_tunables_set = true; - } + if (net_tunables->lct_peer_tx_credits > + net_tunables->lct_max_tx_credits) + net_tunables->lct_peer_tx_credits = + net_tunables->lct_max_tx_credits; + + if (net_tunables->lct_peer_rtr_credits == -1) + net_tunables->lct_peer_rtr_credits = + *ksocknal_tunables.ksnd_peerrtrcredits; rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns); if (rc < 0) @@ -2797,13 +2757,13 @@ ksocknal_startup(struct lnet_ni *ni) * should exist. Each IP alias should be mapped to * each 'struct net_ni'. */ - for (i = 0; i < LNET_NUM_INTERFACES; i++) { + for (i = 0; i < LNET_INTERFACES_NUM; i++) { int j; if (!ni->ni_interfaces[i]) break; - for (j = 0; j < LNET_NUM_INTERFACES; j++) { + for (j = 0; j < LNET_INTERFACES_NUM; j++) { if (i != j && ni->ni_interfaces[j] && strcmp(ni->ni_interfaces[i], ni->ni_interfaces[j]) == 0) { diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h index 12d6cb83ef4ac..cbc40f7347d4d 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Author: Zach Brown * Author: Peter J. 
Braam @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -50,13 +51,9 @@ #include #include -#include -#include #include #include -#include - #ifdef HAVE_TCP_SENDPAGE_USE_SOCKET # define cfs_tcp_sendpage(sk, page, offset, size, flags) \ tcp_sendpage((sk)->sk_socket, page, offset, size, flags) @@ -65,6 +62,8 @@ tcp_sendpage(sk, page, offset, size, flags) #endif /* HAVE_TCP_SENDPAGE_USE_SOCKET */ +#include + #ifndef NETIF_F_CSUM_MASK # define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM #endif @@ -76,7 +75,7 @@ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer_ni lists */ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ #define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ -#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ +#define SOCKNAL_ENOMEM_RETRY 1 /* seconds between retries */ #define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ #define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ @@ -91,33 +90,25 @@ # define SOCKNAL_RISK_KMAP_DEADLOCK 1 #endif -struct ksock_sched_info; - -typedef struct /* per scheduler state */ -{ - spinlock_t kss_lock; /* serialise */ - struct list_head kss_rx_conns; /* conn waiting to be read */ +/* per scheduler state */ +struct ksock_sched { + /* serialise */ + spinlock_t kss_lock; /* conn waiting to be written */ - struct list_head kss_tx_conns; + struct list_head kss_rx_conns; + struct list_head kss_tx_conns; /* zombie noop tx list */ - struct list_head kss_zombie_noop_txs; - wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + struct list_head kss_zombie_noop_txs; + /* where scheduler sleeps */ + wait_queue_head_t kss_waitq; /* # connections assigned to this scheduler */ - int kss_nconns; - struct ksock_sched_info *kss_info; /* owner of it */ -#if !SOCKNAL_SINGLE_FRAG_RX - struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; -#endif -#if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX - struct kvec kss_scratch_iov[LNET_MAX_IOV]; -#endif -} ksock_sched_t; - -struct ksock_sched_info { - int ksi_nthreads_max; /* max allowed threads */ - int ksi_nthreads; /* number of threads */ - int ksi_cpt; /* CPT id */ - ksock_sched_t *ksi_scheds; /* array of schedulers */ + int kss_nconns; + /* max allowed threads */ + int kss_nthreads_max; + /* number of threads */ + int kss_nthreads; + /* CPT id */ + int kss_cpt; }; #define KSOCK_CPT_SHIFT 16 @@ -125,17 +116,15 @@ struct ksock_sched_info { #define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) #define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) -typedef struct /* in-use interface */ -{ +struct ksock_interface { /* in-use interface */ __u32 ksni_ipaddr; /* interface's IP address */ __u32 ksni_netmask; /* interface's network mask */ int ksni_nroutes; /* # routes using (active) */ int ksni_npeers; /* # peers using (passive) */ char ksni_name[IFNAMSIZ]; /* interface name */ -} ksock_interface_t; +}; -typedef struct -{ +struct ksock_tunables { /* "stuck" socket timeout (seconds) */ int *ksnd_timeout; /* # scheduler threads in each pool while starting */ @@ -175,26 +164,24 @@ typedef struct #if SOCKNAL_VERSION_DEBUG int *ksnd_protocol; /* protocol version */ #endif -} ksock_tunables_t; +}; -typedef struct -{ +struct ksock_net { __u64 ksnn_incarnation; /* my epoch */ spinlock_t ksnn_lock; /* serialise */ struct list_head ksnn_list; /* chain on global list */ int ksnn_npeers; /* # peers */ int ksnn_shutdown; /* shutting down? 
*/ int ksnn_ninterfaces; /* IP interfaces */ - ksock_interface_t ksnn_interfaces[LNET_NUM_INTERFACES]; -} ksock_net_t; + struct ksock_interface ksnn_interfaces[LNET_INTERFACES_NUM]; +}; /** connd timeout */ #define SOCKNAL_CONND_TIMEOUT 120 /** reserved thread for accepting & creating new connd */ #define SOCKNAL_CONND_RESV 1 -typedef struct -{ +struct ksock_nal_data { int ksnd_init; /* initialisation state */ int ksnd_nnets; /* # networks set up */ struct list_head ksnd_nets; /* list of nets */ @@ -207,7 +194,7 @@ typedef struct int ksnd_nthreads; /* # live threads */ int ksnd_shuttingdown; /* tell threads to exit */ /* schedulers information */ - struct ksock_sched_info **ksnd_sched_info; + struct ksock_sched **ksnd_schedulers; atomic_t ksnd_nactive_txs; /* #active txs */ @@ -220,7 +207,7 @@ typedef struct /* reaper sleeps here */ wait_queue_head_t ksnd_reaper_waitq; /* when reaper will wake */ - cfs_time_t ksnd_reaper_waketime; + time64_t ksnd_reaper_waketime; /* serialise */ spinlock_t ksnd_reaper_lock; @@ -237,11 +224,11 @@ typedef struct /* # connds connecting */ int ksnd_connd_connecting; /** time stamp of the last failed connecting attempt */ - long ksnd_connd_failed_stamp; + time64_t ksnd_connd_failed_stamp; /** # starting connd */ unsigned ksnd_connd_starting; /** time stamp of the last starting connd */ - long ksnd_connd_starting_stamp; + time64_t ksnd_connd_starting_stamp; /** # running connd */ unsigned ksnd_connd_running; /* serialise */ @@ -251,8 +238,7 @@ typedef struct struct list_head ksnd_idle_noop_txs; /* serialise, g_lock unsafe */ spinlock_t ksnd_tx_lock; - -} ksock_nal_data_t; +}; #define SOCKNAL_INIT_NOTHING 0 #define SOCKNAL_INIT_DATA 1 @@ -272,8 +258,7 @@ struct ksock_peer; /* forward ref */ struct ksock_route; /* forward ref */ struct ksock_proto; /* forward ref */ -typedef struct /* transmit packet */ -{ +struct ksock_tx { /* transmit packet */ struct list_head tx_list; /* queue on conn for transmission etc */ struct list_head tx_zc_list; /* queue on peer_ni for ZC request */ atomic_t tx_refcount; /* tx reference count */ @@ -289,9 +274,10 @@ typedef struct /* transmit packet */ lnet_kiov_t *tx_kiov; /* packet page frags */ struct ksock_conn *tx_conn; /* owning conn */ struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() */ - cfs_time_t tx_deadline; /* when (in jiffies) tx times out */ - struct ksock_msg tx_msg; /* socklnd message buffer */ + time64_t tx_deadline; /* when (in secs) tx times out */ + struct ksock_msg tx_msg; /* socklnd message buffer */ int tx_desc_size; /* size of this descriptor */ + enum lnet_msg_hstatus tx_hstatus; /* health status of tx */ union { struct { struct kvec iov; /* virt hdr */ @@ -301,18 +287,18 @@ typedef struct /* transmit packet */ struct kvec iov[1]; /* virt hdr + payload */ } virt; } tx_frags; -} ksock_tx_t; +}; -#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(struct ksock_tx, tx_frags.paged.kiov[0])) -/* network zero copy callback descriptor embedded in ksock_tx_t */ +/* network zero copy callback descriptor embedded in struct ksock_tx */ /* space for the rx frag descriptors; we either read a single contiguous * header, or up to LNET_MAX_IOV frags of payload of either type. 
*/ -typedef union { - struct kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; -} ksock_rxiovspace_t; +union ksock_rxiovspace { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +}; #define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ #define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ @@ -321,17 +307,16 @@ typedef union { #define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ #define SOCKNAL_RX_SLOP 6 /* skipping body */ -typedef struct ksock_conn -{ - struct ksock_peer *ksnc_peer; /* owning peer_ni */ - struct ksock_route *ksnc_route; /* owning route */ +struct ksock_conn { + struct ksock_peer_ni *ksnc_peer; /* owning peer_ni */ + struct ksock_route *ksnc_route; /* owning route */ struct list_head ksnc_list; /* stash on peer_ni's conn list */ struct socket *ksnc_sock; /* actual socket */ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ void *ksnc_saved_write_space; /* socket's original write_space() callback */ atomic_t ksnc_conn_refcount; /* conn refcount */ atomic_t ksnc_sock_refcount; /* sock refcount */ - ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + struct ksock_sched *ksnc_scheduler; /* who schedules this connection */ __u32 ksnc_myipaddr; /* my IP */ __u32 ksnc_ipaddr; /* peer_ni's IP */ int ksnc_port; /* peer_ni's port */ @@ -346,7 +331,7 @@ typedef struct ksock_conn /* where I enq waiting input or a forwarding descriptor */ struct list_head ksnc_rx_list; - cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */ + time64_t ksnc_rx_deadline; /* when (in seconds) receive times out */ __u8 ksnc_rx_started; /* started receiving a message */ __u8 ksnc_rx_ready; /* data ready to read */ __u8 ksnc_rx_scheduled;/* being progressed */ @@ -357,9 +342,9 @@ typedef struct ksock_conn struct kvec *ksnc_rx_iov; /* the kvec frags */ int ksnc_rx_nkiov; /* # page frags */ lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ - ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + union ksock_rxiovspace ksnc_rx_iov_space;/* space for frag descriptors */ __u32 ksnc_rx_csum; /* partial checksum for incoming data */ - void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + struct lnet_msg *ksnc_lnet_msg; /* rx lnet_finalize arg*/ struct ksock_msg ksnc_msg; /* incoming message buffer: * V2.x message takes the * whole struct @@ -373,9 +358,9 @@ typedef struct ksock_conn /* packets waiting to be sent */ struct list_head ksnc_tx_queue; /* next TX that can carry a LNet message or ZC-ACK */ - ksock_tx_t *ksnc_tx_carrier; - /* when (in jiffies) tx times out */ - cfs_time_t ksnc_tx_deadline; + struct ksock_tx *ksnc_tx_carrier; + /* when (in seconds) tx times out */ + time64_t ksnc_tx_deadline; /* send buffer marker */ int ksnc_tx_bufnob; /* # bytes queued */ @@ -385,17 +370,16 @@ typedef struct ksock_conn /* being progressed */ int ksnc_tx_scheduled; /* time stamp of the last posted TX */ - cfs_time_t ksnc_tx_last_post; -} ksock_conn_t; + time64_t ksnc_tx_last_post; +}; -typedef struct ksock_route -{ +struct ksock_route { struct list_head ksnr_list; /* chain on peer_ni route list */ struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ - struct ksock_peer *ksnr_peer; /* owning peer_ni */ + struct ksock_peer_ni *ksnr_peer; /* owning peer_ni */ atomic_t ksnr_refcount; /* # users */ - cfs_time_t ksnr_timeout; /* when (in jiffies) reconnection can happen next */ - cfs_duration_t ksnr_retry_interval; /* how long between retries */ + time64_t 
ksnr_timeout; /* when (in secs) reconnection can happen next */ + time64_t ksnr_retry_interval; /* how long between retries */ __u32 ksnr_myipaddr; /* my IP */ __u32 ksnr_ipaddr; /* IP address to connect to */ int ksnr_port; /* port to connect to */ @@ -405,14 +389,13 @@ typedef struct ksock_route unsigned int ksnr_deleted:1; /* been removed from peer_ni? */ unsigned int ksnr_share_count; /* created explicitly? */ int ksnr_conn_count; /* # conns established by this route */ -} ksock_route_t; +}; #define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ -typedef struct ksock_peer -{ +struct ksock_peer_ni { struct list_head ksnp_list; /* stash on global peer_ni list */ - cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ + time64_t ksnp_last_alive;/* when (in seconds) I was last alive */ struct lnet_process_id ksnp_id; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_sharecount; /* lconf usage counter */ @@ -428,50 +411,48 @@ typedef struct ksock_peer spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ /* zero copy requests wait for ACK */ struct list_head ksnp_zc_req_list; - cfs_time_t ksnp_send_keepalive; /* time to send keepalive */ + time64_t ksnp_send_keepalive; /* time to send keepalive */ struct lnet_ni *ksnp_ni; /* which network */ int ksnp_n_passive_ips; /* # of... */ - __u32 ksnp_passive_ips[LNET_NUM_INTERFACES]; /* preferred local interfaces */ -} ksock_peer_ni_t; + __u32 ksnp_passive_ips[LNET_INTERFACES_NUM]; /* preferred local interfaces */ +}; -typedef struct ksock_connreq -{ +struct ksock_connreq { /* stash on ksnd_connd_connreqs */ struct list_head ksncr_list; /* chosen NI */ struct lnet_ni *ksncr_ni; /* accepted socket */ struct socket *ksncr_sock; -} ksock_connreq_t; +}; -extern ksock_nal_data_t ksocknal_data; -extern ksock_tunables_t ksocknal_tunables; +extern struct ksock_nal_data ksocknal_data; +extern struct ksock_tunables ksocknal_tunables; #define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ #define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ #define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ -typedef struct ksock_proto -{ +struct ksock_proto { int pro_version; /* version number of protocol */ - int (*pro_send_hello)(ksock_conn_t *, struct ksock_hello_msg *); /* handshake function */ - int (*pro_recv_hello)(ksock_conn_t *, struct ksock_hello_msg *, int);/* handshake function */ - void (*pro_pack)(ksock_tx_t *); /* message pack */ + int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); /* handshake function */ + int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);/* handshake function */ + void (*pro_pack)(struct ksock_tx *); /* message pack */ void (*pro_unpack)(struct ksock_msg *); /* message unpack */ - ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ - int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ - int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ - int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ - int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(struct ksock_conn 
*, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); /* msg type matches the connection type: * return value: * return MATCH_NO : no * return MATCH_YES : matching type * return MATCH_MAY : can be backup */ -} ksock_proto_t; +}; -extern ksock_proto_t ksocknal_protocol_v1x; -extern ksock_proto_t ksocknal_protocol_v2x; -extern ksock_proto_t ksocknal_protocol_v3x; +extern struct ksock_proto ksocknal_protocol_v1x; +extern struct ksock_proto ksocknal_protocol_v2x; +extern struct ksock_proto ksocknal_protocol_v3x; #define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR #define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR @@ -513,27 +494,27 @@ ksocknal_nid2peerlist (lnet_nid_t nid) } static inline void -ksocknal_conn_addref (ksock_conn_t *conn) +ksocknal_conn_addref(struct ksock_conn *conn) { - LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); atomic_inc(&conn->ksnc_conn_refcount); } -extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn); -extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); +extern void ksocknal_queue_zombie_conn(struct ksock_conn *conn); +extern void ksocknal_finalize_zcreq(struct ksock_conn *conn); static inline void -ksocknal_conn_decref (ksock_conn_t *conn) +ksocknal_conn_decref(struct ksock_conn *conn) { - LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) ksocknal_queue_zombie_conn(conn); } static inline int -ksocknal_connsock_addref (ksock_conn_t *conn) +ksocknal_connsock_addref(struct ksock_conn *conn) { - int rc = -ESHUTDOWN; + int rc = -ESHUTDOWN; read_lock(&ksocknal_data.ksnd_global_lock); if (!conn->ksnc_closing) { @@ -547,9 +528,9 @@ ksocknal_connsock_addref (ksock_conn_t *conn) } static inline void -ksocknal_connsock_decref (ksock_conn_t *conn) +ksocknal_connsock_decref(struct ksock_conn *conn) { - LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { LASSERT (conn->ksnc_closing); sock_release(conn->ksnc_sock); @@ -559,55 +540,55 @@ ksocknal_connsock_decref (ksock_conn_t *conn) } static inline void -ksocknal_tx_addref (ksock_tx_t *tx) +ksocknal_tx_addref(struct ksock_tx *tx) { - LASSERT (atomic_read(&tx->tx_refcount) > 0); + LASSERT(atomic_read(&tx->tx_refcount) > 0); atomic_inc(&tx->tx_refcount); } -extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx); -extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int error); +extern void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int error); static inline void -ksocknal_tx_decref (ksock_tx_t *tx) +ksocknal_tx_decref(struct ksock_tx *tx) { - LASSERT (atomic_read(&tx->tx_refcount) > 0); + LASSERT(atomic_read(&tx->tx_refcount) > 0); if (atomic_dec_and_test(&tx->tx_refcount)) ksocknal_tx_done(NULL, tx, 0); } static inline void -ksocknal_route_addref (ksock_route_t *route) +ksocknal_route_addref(struct ksock_route *route) { - LASSERT (atomic_read(&route->ksnr_refcount) > 0); + LASSERT(atomic_read(&route->ksnr_refcount) > 0); atomic_inc(&route->ksnr_refcount); } -extern void ksocknal_destroy_route (ksock_route_t *route); +extern void ksocknal_destroy_route(struct ksock_route *route); static inline 
void -ksocknal_route_decref (ksock_route_t *route) +ksocknal_route_decref(struct ksock_route *route) { - LASSERT (atomic_read (&route->ksnr_refcount) > 0); + LASSERT(atomic_read(&route->ksnr_refcount) > 0); if (atomic_dec_and_test(&route->ksnr_refcount)) ksocknal_destroy_route (route); } static inline void -ksocknal_peer_addref (ksock_peer_ni_t *peer_ni) +ksocknal_peer_addref(struct ksock_peer_ni *peer_ni) { - LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); + LASSERT(atomic_read(&peer_ni->ksnp_refcount) > 0); atomic_inc(&peer_ni->ksnp_refcount); } -extern void ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni); +extern void ksocknal_destroy_peer(struct ksock_peer_ni *peer_ni); static inline void -ksocknal_peer_decref (ksock_peer_ni_t *peer_ni) +ksocknal_peer_decref(struct ksock_peer_ni *peer_ni) { LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0); if (atomic_dec_and_test(&peer_ni->ksnp_refcount)) - ksocknal_destroy_peer (peer_ni); + ksocknal_destroy_peer(peer_ni); } int ksocknal_startup(struct lnet_ni *ni); @@ -622,73 +603,77 @@ int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, int port); -ksock_peer_ni_t *ksocknal_find_peer_locked(struct lnet_ni *ni, +struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id); -ksock_peer_ni_t *ksocknal_find_peer(struct lnet_ni *ni, +struct ksock_peer_ni *ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id); -extern void ksocknal_peer_failed (ksock_peer_ni_t *peer_ni); -extern int ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, +extern void ksocknal_peer_failed(struct ksock_peer_ni *peer_ni); +extern int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, struct socket *sock, int type); -extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); -extern void ksocknal_terminate_conn (ksock_conn_t *conn); -extern void ksocknal_destroy_conn (ksock_conn_t *conn); -extern int ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, - __u32 ipaddr, int why); -extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); +extern void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); +extern void ksocknal_terminate_conn(struct ksock_conn *conn); +extern void ksocknal_destroy_conn(struct ksock_conn *conn); +extern int ksocknal_close_peer_conns_locked(struct ksock_peer_ni *peer_ni, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); -extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, - ksock_tx_t *tx, int nonblk); +extern struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, + struct ksock_tx *tx, int nonblk); -extern int ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, +extern int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, struct lnet_process_id id); -extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); -extern void ksocknal_free_tx (ksock_tx_t *tx); -extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); -extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); -extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); +extern struct ksock_tx *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(struct ksock_tx *tx); +extern struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern 
void ksocknal_next_tx_carrier(struct ksock_conn *conn); +extern void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn); extern void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error); extern void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); -extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); +extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when); extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); -extern void ksocknal_thread_fini (void); -extern void ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni); -extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni); -extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni); -extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); -extern int ksocknal_scheduler (void *arg); -extern int ksocknal_connd (void *arg); -extern int ksocknal_reaper (void *arg); -int ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, +extern void ksocknal_thread_fini(void); +extern void ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni); +extern struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni); +extern int ksocknal_new_packet(struct ksock_conn *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, lnet_nid_t peer_nid, struct ksock_hello_msg *hello); -int ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, +int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, struct ksock_hello_msg *hello, struct lnet_process_id *id, __u64 *incarnation); -extern void ksocknal_read_callback(ksock_conn_t *conn); -extern void ksocknal_write_callback(ksock_conn_t *conn); +extern void ksocknal_read_callback(struct ksock_conn *conn); +extern void ksocknal_write_callback(struct ksock_conn *conn); -extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); -extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); -extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); +extern int ksocknal_lib_zc_capable(struct ksock_conn *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); extern void ksocknal_lib_reset_callback(struct socket *sock, - ksock_conn_t *conn); -extern void ksocknal_lib_push_conn(ksock_conn_t *conn); -extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); + struct ksock_conn *conn); +extern void ksocknal_lib_push_conn(struct ksock_conn *conn); +extern int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); extern int ksocknal_lib_setup_sock(struct socket *so); -extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); -extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); -extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); -extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); -extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); -extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, +extern int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec 
*scratch_iov); +extern int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov); +extern void ksocknal_lib_eager_ack(struct ksock_conn *conn); +extern int ksocknal_lib_recv_iov(struct ksock_conn *conn, + struct kvec *scratchiov); +extern int ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov); +extern int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle); extern int ksocknal_tunables_init(void); -extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); +extern void ksocknal_lib_csum_tx(struct ksock_tx *tx); -extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_memory_pressure(struct ksock_conn *conn); extern int ksocknal_lib_bind_thread_to_cpu(int id); #endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c index 83c6a2da2f4ae..69e275e18adde 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_cb.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Author: Zach Brown * Author: Peter J. Braam @@ -26,10 +26,10 @@ #include "socklnd.h" -ksock_tx_t * +struct ksock_tx * ksocknal_alloc_tx(int type, int size) { - ksock_tx_t *tx = NULL; + struct ksock_tx *tx = NULL; if (type == KSOCK_MSG_NOOP) { LASSERT(size == KSOCK_NOOP_TX_SIZE); @@ -38,8 +38,8 @@ ksocknal_alloc_tx(int type, int size) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ - next, ksock_tx_t, tx_list); + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, + struct ksock_tx, tx_list); LASSERT(tx->tx_desc_size == size); list_del(&tx->tx_list); } @@ -57,6 +57,7 @@ ksocknal_alloc_tx(int type, int size) tx->tx_zc_aborted = 0; tx->tx_zc_capable = 0; tx->tx_zc_checked = 0; + tx->tx_hstatus = LNET_MSG_STATUS_OK; tx->tx_desc_size = size; atomic_inc(&ksocknal_data.ksnd_nactive_txs); @@ -64,10 +65,10 @@ ksocknal_alloc_tx(int type, int size) return tx; } -ksock_tx_t * +struct ksock_tx * ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) { - ksock_tx_t *tx; + struct ksock_tx *tx; tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); if (tx == NULL) { @@ -93,7 +94,7 @@ ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) void -ksocknal_free_tx (ksock_tx_t *tx) +ksocknal_free_tx(struct ksock_tx *tx) { atomic_dec(&ksocknal_data.ksnd_nactive_txs); @@ -110,82 +111,85 @@ ksocknal_free_tx (ksock_tx_t *tx) } static int -ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { struct kvec *iov = tx->tx_iov; - int nob; - int rc; + int nob; + int rc; - LASSERT (tx->tx_niov > 0); + LASSERT(tx->tx_niov > 0); - /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ - rc = ksocknal_lib_send_iov(conn, tx); + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx, scratch_iov); - if (rc <= 0) /* sent nothing? */ - return (rc); + if (rc <= 0) /* sent nothing? 
*/ + return rc; - nob = rc; - LASSERT (nob <= tx->tx_resid); - tx->tx_resid -= nob; + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; - /* "consume" iov */ - do { - LASSERT (tx->tx_niov > 0); + /* "consume" iov */ + do { + LASSERT(tx->tx_niov > 0); - if (nob < (int) iov->iov_len) { + if (nob < (int) iov->iov_len) { iov->iov_base += nob; - iov->iov_len -= nob; - return (rc); - } + iov->iov_len -= nob; + return rc; + } - nob -= iov->iov_len; - tx->tx_iov = ++iov; - tx->tx_niov--; - } while (nob != 0); + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); - return (rc); + return rc; } static int -ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { - lnet_kiov_t *kiov = tx->tx_kiov; - int nob; - int rc; + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; - LASSERT (tx->tx_niov == 0); - LASSERT (tx->tx_nkiov > 0); + LASSERT(tx->tx_niov == 0); + LASSERT(tx->tx_nkiov > 0); - /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ - rc = ksocknal_lib_send_kiov(conn, tx); + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx, scratch_iov); - if (rc <= 0) /* sent nothing? */ - return (rc); + if (rc <= 0) /* sent nothing? */ + return rc; - nob = rc; - LASSERT (nob <= tx->tx_resid); - tx->tx_resid -= nob; + nob = rc; + LASSERT(nob <= tx->tx_resid); + tx->tx_resid -= nob; - /* "consume" kiov */ - do { - LASSERT(tx->tx_nkiov > 0); + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); - if (nob < (int)kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return rc; - } + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } - nob -= (int)kiov->kiov_len; - tx->tx_kiov = ++kiov; - tx->tx_nkiov--; - } while (nob != 0); + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); - return (rc); + return rc; } static int -ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { int rc; int bufnob; @@ -197,214 +201,223 @@ ksocknal_transmit(ksock_conn_t *conn, ksock_tx_t *tx) LASSERT(tx->tx_resid != 0); - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - return (-ESHUTDOWN); - } + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } - do { - if (ksocknal_data.ksnd_enomem_tx > 0) { - /* testing... */ - ksocknal_data.ksnd_enomem_tx--; - rc = -EAGAIN; - } else if (tx->tx_niov != 0) { - rc = ksocknal_send_iov (conn, tx); - } else { - rc = ksocknal_send_kiov (conn, tx); - } + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov(conn, tx, scratch_iov); + } else { + rc = ksocknal_send_kiov(conn, tx, scratch_iov); + } bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - if (rc > 0) /* sent something? */ - conn->ksnc_tx_bufnob += rc; /* account it */ + if (rc > 0) /* sent something? 
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ if (bufnob < conn->ksnc_tx_bufnob) { /* allocated send buffer bytes < computed; infer * something got ACKed */ - conn->ksnc_tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); conn->ksnc_tx_bufnob = bufnob; smp_mb(); } if (rc <= 0) { /* Didn't write anything? */ + /* some stacks return 0 instead of -EAGAIN */ + if (rc == 0) + rc = -EAGAIN; - if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ - rc = -EAGAIN; - - /* Check if EAGAIN is due to memory pressure */ - if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) - rc = -ENOMEM; + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; - break; - } + break; + } - /* socket's wmem_queued now includes 'rc' bytes */ + /* socket's wmem_queued now includes 'rc' bytes */ atomic_sub (rc, &conn->ksnc_tx_nob); - rc = 0; + rc = 0; - } while (tx->tx_resid != 0); + } while (tx->tx_resid != 0); - ksocknal_connsock_decref(conn); - return (rc); + ksocknal_connsock_decref(conn); + return rc; } static int -ksocknal_recv_iov (ksock_conn_t *conn) +ksocknal_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) { struct kvec *iov = conn->ksnc_rx_iov; - int nob; - int rc; + int nob; + int rc; - LASSERT (conn->ksnc_rx_niov > 0); + LASSERT(conn->ksnc_rx_niov > 0); /* Never touch conn->ksnc_rx_iov or change connection - * status inside ksocknal_lib_recv_iov */ - rc = ksocknal_lib_recv_iov(conn); + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn, scratchiov); - if (rc <= 0) - return (rc); + if (rc <= 0) + return rc; - /* received something... */ - nob = rc; + /* received something... */ + nob = rc; - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); - conn->ksnc_rx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); smp_mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - do { - LASSERT (conn->ksnc_rx_niov > 0); + do { + LASSERT(conn->ksnc_rx_niov > 0); - if (nob < (int)iov->iov_len) { - iov->iov_len -= nob; + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; iov->iov_base += nob; - return (-EAGAIN); - } + return -EAGAIN; + } - nob -= iov->iov_len; - conn->ksnc_rx_iov = ++iov; - conn->ksnc_rx_niov--; - } while (nob != 0); + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); - return (rc); + return rc; } static int -ksocknal_recv_kiov (ksock_conn_t *conn) +ksocknal_recv_kiov(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) { - lnet_kiov_t *kiov = conn->ksnc_rx_kiov; - int nob; - int rc; - LASSERT (conn->ksnc_rx_nkiov > 0); + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT(conn->ksnc_rx_nkiov > 0); /* Never touch conn->ksnc_rx_kiov or change connection - * status inside ksocknal_lib_recv_iov */ - rc = ksocknal_lib_recv_kiov(conn); + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn, rx_scratch_pgs, scratch_iov); - if (rc <= 0) - return (rc); + if (rc <= 0) + return rc; - /* received something... */ - nob = rc; + /* received something... 
*/ + nob = rc; - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); - conn->ksnc_rx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); + conn->ksnc_rx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); smp_mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - do { - LASSERT (conn->ksnc_rx_nkiov > 0); + do { + LASSERT(conn->ksnc_rx_nkiov > 0); - if (nob < (int) kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return -EAGAIN; - } + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } - nob -= kiov->kiov_len; - conn->ksnc_rx_kiov = ++kiov; - conn->ksnc_rx_nkiov--; - } while (nob != 0); + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); - return 1; + return 1; } static int -ksocknal_receive (ksock_conn_t *conn) +ksocknal_receive(struct ksock_conn *conn, struct page **rx_scratch_pgs, + struct kvec *scratch_iov) { - /* Return 1 on success, 0 on EOF, < 0 on error. - * Caller checks ksnc_rx_nob_wanted to determine - * progress/completion. */ - int rc; - ENTRY; + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; if (ksocknal_data.ksnd_stall_rx != 0) { set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); } - rc = ksocknal_connsock_addref(conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - return (-ESHUTDOWN); - } + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + return -ESHUTDOWN; + } - for (;;) { - if (conn->ksnc_rx_niov != 0) - rc = ksocknal_recv_iov (conn); - else - rc = ksocknal_recv_kiov (conn); - - if (rc <= 0) { - /* error/EOF or partial receive */ - if (rc == -EAGAIN) { - rc = 1; - } else if (rc == 0 && conn->ksnc_rx_started) { - /* EOF in the middle of a message */ - rc = -EPROTO; - } - break; - } + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov(conn, scratch_iov); + else + rc = ksocknal_recv_kiov(conn, rx_scratch_pgs, + scratch_iov); - /* Completed a fragment */ + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } - if (conn->ksnc_rx_nob_wanted == 0) { - rc = 1; - break; - } - } + /* Completed a fragment */ - ksocknal_connsock_decref(conn); - RETURN (rc); + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN(rc); } void -ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) +ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) { struct lnet_msg *lnetmsg = tx->tx_lnetmsg; + enum lnet_msg_hstatus hstatus = tx->tx_hstatus; ENTRY; LASSERT(ni != NULL || tx->tx_conn != NULL); - if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { rc = -EIO; + if (hstatus == LNET_MSG_STATUS_OK) + hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } if (tx->tx_conn != NULL) ksocknal_conn_decref(tx->tx_conn); ksocknal_free_tx(tx); - if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + if (lnetmsg != NULL) { /* KSOCK_MSG_NOOP go without lnetmsg */ + lnetmsg->msg_health_status = hstatus; lnet_finalize(lnetmsg, rc); + } EXIT; } @@ 
-412,10 +425,10 @@ ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) { - ksock_tx_t *tx; + struct ksock_tx *tx; while (!list_empty(txlist)) { - tx = list_entry(txlist->next, ksock_tx_t, tx_list); + tx = list_entry(txlist->next, struct ksock_tx, tx_list); if (error && tx->tx_lnetmsg != NULL) { CNETERR("Deleting packet type %d len %d %s->%s\n", @@ -429,16 +442,34 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) list_del(&tx->tx_list); + if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { + if (error == -ETIMEDOUT) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; + else if (error == -ENETDOWN || + error == -EHOSTUNREACH || + error == -ENETUNREACH || + error == -ECONNREFUSED || + error == -ECONNRESET) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + /* + * for all other errors we don't want to + * retransmit + */ + else if (error) + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + LASSERT(atomic_read(&tx->tx_refcount) == 1); ksocknal_tx_done(ni, tx, error); } } static void -ksocknal_check_zc_req(ksock_tx_t *tx) +ksocknal_check_zc_req(struct ksock_tx *tx) { - ksock_conn_t *conn = tx->tx_conn; - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; + struct ksock_conn *conn = tx->tx_conn; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx * to ksnp_zc_req_list if some fragment of this message should be sent @@ -463,8 +494,8 @@ ksocknal_check_zc_req(ksock_tx_t *tx) spin_lock(&peer_ni->ksnp_lock); /* ZC_REQ is going to be pinned to the peer_ni */ - tx->tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + tx->tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); @@ -479,9 +510,9 @@ ksocknal_check_zc_req(ksock_tx_t *tx) } static void -ksocknal_uncheck_zc_req(ksock_tx_t *tx) +ksocknal_uncheck_zc_req(struct ksock_tx *tx) { - ksock_peer_ni_t *peer_ni = tx->tx_conn->ksnc_peer; + struct ksock_peer_ni *peer_ni = tx->tx_conn->ksnc_peer; LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); LASSERT(tx->tx_zc_capable); @@ -505,85 +536,111 @@ ksocknal_uncheck_zc_req(ksock_tx_t *tx) } static int -ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratch_iov) { - int rc; + int rc; + bool error_sim = false; + + if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { + error_sim = true; + rc = -EINVAL; + goto simulate_error; + } - if (tx->tx_zc_capable && !tx->tx_zc_checked) - ksocknal_check_zc_req(tx); + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); - rc = ksocknal_transmit (conn, tx); + rc = ksocknal_transmit(conn, tx, scratch_iov); - CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); - if (tx->tx_resid == 0) { - /* Sent everything OK */ - LASSERT (rc == 0); + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT(rc == 0); - return (0); - } + return 0; + } - if (rc == -EAGAIN) - return (rc); + if (rc == -EAGAIN) + return rc; - if (rc == -ENOMEM) { - static int counter; + if (rc == -ENOMEM) { + static int counter; - counter++; /* exponential backoff warnings */ - if ((counter & (-counter)) == counter) - CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u 
allocated)\n", counter, conn, atomic_read(&libcfs_kmemory)); - /* Queue on ksnd_enomem_conns for retry after a timeout */ + /* Queue on ksnd_enomem_conns for retry after a timeout */ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - /* enomem list takes over scheduler's ref... */ - LASSERT (conn->ksnc_tx_scheduled); + /* enomem list takes over scheduler's ref... */ + LASSERT(conn->ksnc_tx_scheduled); list_add_tail(&conn->ksnc_tx_list, - &ksocknal_data.ksnd_enomem_conns); - if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), - SOCKNAL_ENOMEM_RETRY), - ksocknal_data.ksnd_reaper_waketime)) + &ksocknal_data.ksnd_enomem_conns); + if (ktime_get_seconds() + SOCKNAL_ENOMEM_RETRY < + ksocknal_data.ksnd_reaper_waketime) wake_up(&ksocknal_data.ksnd_reaper_waitq); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; return (rc); } - /* Actual error */ - LASSERT (rc < 0); +simulate_error: - if (!conn->ksnc_closing) { - switch (rc) { - case -ECONNRESET: + /* Actual error */ + LASSERT(rc < 0); + + if (!error_sim) { + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + if (rc == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: LCONSOLE_WARN("Host %pI4h reset our connection " - "while we were sending data; it may have " - "rebooted.\n", + "while we were sending data; it may have " + "rebooted.\n", &conn->ksnc_ipaddr); - break; - default: - LCONSOLE_WARN("There was an unexpected network error " + break; + default: + LCONSOLE_WARN("There was an unexpected network error " "while writing to %pI4h: %d.\n", &conn->ksnc_ipaddr, rc); - break; - } + break; + } CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", conn, rc, libcfs_id2str(conn->ksnc_peer->ksnp_id), &conn->ksnc_ipaddr, conn->ksnc_port); - } + } - if (tx->tx_zc_checked) - ksocknal_uncheck_zc_req(tx); + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings (conn, - (conn->ksnc_closing) ? 0 : rc); + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings(conn, + (conn->ksnc_closing) ? 
0 : rc); - return (rc); + return rc; } static void -ksocknal_launch_connection_locked (ksock_route_t *route) +ksocknal_launch_connection_locked(struct ksock_route *route) { /* called holding write lock on ksnd_global_lock */ @@ -605,9 +662,9 @@ ksocknal_launch_connection_locked (ksock_route_t *route) } void -ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni) +ksocknal_launch_all_connections_locked(struct ksock_peer_ni *peer_ni) { - ksock_route_t *route; + struct ksock_route *route; /* called holding write lock on ksnd_global_lock */ for (;;) { @@ -620,21 +677,22 @@ ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni) } } -ksock_conn_t * -ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) +struct ksock_conn * +ksocknal_find_conn_locked(struct ksock_peer_ni *peer_ni, struct ksock_tx *tx, int nonblk) { struct list_head *tmp; - ksock_conn_t *conn; - ksock_conn_t *typed = NULL; - ksock_conn_t *fallback = NULL; - int tnob = 0; - int fnob = 0; + struct ksock_conn *conn; + struct ksock_conn *typed = NULL; + struct ksock_conn *fallback = NULL; + int tnob = 0; + int fnob = 0; list_for_each(tmp, &peer_ni->ksnp_conns) { - ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); - int nob = atomic_read(&c->ksnc_tx_nob) + - c->ksnc_sock->sk->sk_wmem_queued; - int rc; + struct ksock_conn *c = list_entry(tmp, struct ksock_conn, + ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; LASSERT (!c->ksnc_closing); LASSERT (c->ksnc_proto != NULL && @@ -651,7 +709,7 @@ ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) case SOCKNAL_MATCH_YES: /* typed connection */ if (typed == NULL || tnob > nob || (tnob == nob && *ksocknal_tunables.ksnd_round_robin && - cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed->ksnc_tx_last_post > c->ksnc_tx_last_post)) { typed = c; tnob = nob; } @@ -660,7 +718,7 @@ ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) case SOCKNAL_MATCH_MAY: /* fallback connection */ if (fallback == NULL || fnob > nob || (fnob == nob && *ksocknal_tunables.ksnd_round_robin && - cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback->ksnc_tx_last_post > c->ksnc_tx_last_post)) { fallback = c; fnob = nob; } @@ -672,13 +730,13 @@ ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk) conn = (typed != NULL) ? 
typed : fallback; if (conn != NULL) - conn->ksnc_tx_last_post = cfs_time_current(); + conn->ksnc_tx_last_post = ktime_get_seconds(); return conn; } void -ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) { conn->ksnc_proto->pro_pack(tx); @@ -688,12 +746,12 @@ ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) } void -ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) { - ksock_sched_t *sched = conn->ksnc_scheduler; - struct ksock_msg *msg = &tx->tx_msg; - ksock_tx_t *ztx = NULL; - int bufnob = 0; + struct ksock_sched *sched = conn->ksnc_scheduler; + struct ksock_msg *msg = &tx->tx_msg; + struct ksock_tx *ztx = NULL; + int bufnob = 0; /* called holding global lock (read or irq-write) and caller may * not have dropped this lock between finding conn and calling me, @@ -729,10 +787,10 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { /* First packet starts the timeout */ - conn->ksnc_tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ - conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_peer->ksnp_last_alive = ktime_get_seconds(); conn->ksnc_tx_bufnob = 0; smp_mb(); /* order with adding to tx_queue */ } @@ -775,15 +833,15 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) } -ksock_route_t * -ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) +struct ksock_route * +ksocknal_find_connectable_route_locked(struct ksock_peer_ni *peer_ni) { - cfs_time_t now = cfs_time_current(); - struct list_head *tmp; - ksock_route_t *route; + time64_t now = ktime_get_seconds(); + struct list_head *tmp; + struct ksock_route *route; list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); @@ -795,14 +853,14 @@ ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) continue; if (!(route->ksnr_retry_interval == 0 || /* first attempt */ - cfs_time_aftereq(now, route->ksnr_timeout))) { + now >= route->ksnr_timeout)) { CDEBUG(D_NET, "Too soon to retry route %pI4h " - "(cnted %d, interval %ld, %ld secs later)\n", + "(cnted %d, interval %lld, %lld secs later)\n", &route->ksnr_ipaddr, route->ksnr_connected, route->ksnr_retry_interval, - cfs_duration_sec(route->ksnr_timeout - now)); + route->ksnr_timeout - now); continue; } @@ -812,14 +870,14 @@ ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni) return (NULL); } -ksock_route_t * -ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni) +struct ksock_route * +ksocknal_find_connecting_route_locked(struct ksock_peer_ni *peer_ni) { - struct list_head *tmp; - ksock_route_t *route; + struct list_head *tmp; + struct ksock_route *route; list_for_each(tmp, &peer_ni->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); + route = list_entry(tmp, struct ksock_route, ksnr_list); LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); @@ -831,14 +889,14 @@ ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni) } int -ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, +ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, struct lnet_process_id id) { 
- ksock_peer_ni_t *peer_ni; - ksock_conn_t *conn; - rwlock_t *g_lock; - int retry; - int rc; + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + rwlock_t *g_lock; + int retry; + int rc; LASSERT (tx->tx_conn == NULL); @@ -906,8 +964,8 @@ ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, if (peer_ni->ksnp_accepting > 0 || ksocknal_find_connecting_route_locked (peer_ni) != NULL) { /* the message is going to be pinned to the peer_ni */ - tx->tx_deadline = - cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + tx->tx_deadline = ktime_get_seconds() + + lnet_get_lnd_timeout(); /* Queue the message until a connection is established */ list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue); @@ -919,6 +977,7 @@ ksocknal_launch_packet(struct lnet_ni *ni, ksock_tx_t *tx, /* NB Routes may be ignored if connections to them failed recently */ CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR; return (-EHOSTUNREACH); } @@ -933,7 +992,7 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; - ksock_tx_t *tx; + struct ksock_tx *tx; int desc_size; int rc; @@ -950,10 +1009,10 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) LASSERT (!in_interrupt ()); if (payload_iov != NULL) - desc_size = offsetof(ksock_tx_t, + desc_size = offsetof(struct ksock_tx, tx_frags.virt.iov[1 + payload_niov]); else - desc_size = offsetof(ksock_tx_t, + desc_size = offsetof(struct ksock_tx, tx_frags.paged.kiov[payload_niov]); if (lntmsg->msg_vmflush) @@ -1003,6 +1062,7 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) if (rc == 0) return (0); + lntmsg->msg_health_status = tx->tx_hstatus; ksocknal_free_tx(tx); return (-EIO); } @@ -1030,13 +1090,12 @@ ksocknal_thread_fini (void) } int -ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) { static char ksocknal_slop_buffer[4096]; - - int nob; - unsigned int niov; - int skipped; + int nob; + unsigned int niov; + int skipped; LASSERT(conn->ksnc_proto != NULL); @@ -1112,7 +1171,9 @@ ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) } static int -ksocknal_process_receive (ksock_conn_t *conn) +ksocknal_process_receive(struct ksock_conn *conn, + struct page **rx_scratch_pgs, + struct kvec *scratch_iov) { struct lnet_hdr *lhdr; struct lnet_process_id *id; @@ -1122,13 +1183,14 @@ ksocknal_process_receive (ksock_conn_t *conn) /* NB: sched lock NOT held */ /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); again: - if (conn->ksnc_rx_nob_wanted != 0) { - rc = ksocknal_receive(conn); + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn, rx_scratch_pgs, + scratch_iov); if (rc <= 0) { struct lnet_process_id ksnp_id; @@ -1294,14 +1356,17 @@ ksocknal_process_receive (ksock_conn_t *conn) le64_to_cpu(lhdr->src_nid) != id->nid); } - lnet_finalize(conn->ksnc_cookie, rc); + if (rc && conn->ksnc_lnet_msg) + 
conn->ksnc_lnet_msg->msg_health_status = + LNET_MSG_STATUS_REMOTE_ERROR; + lnet_finalize(conn->ksnc_lnet_msg, rc); if (rc != 0) { ksocknal_new_packet(conn, 0); ksocknal_close_conn_and_siblings (conn, rc); return (-EPROTO); } - /* Fall through */ + fallthrough; case SOCKNAL_RX_SLOP: /* starting new packet? */ @@ -1324,15 +1389,15 @@ ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { - ksock_conn_t *conn = (ksock_conn_t *)private; - ksock_sched_t *sched = conn->ksnc_scheduler; + struct ksock_conn *conn = private; + struct ksock_sched *sched = conn->ksnc_scheduler; LASSERT (mlen <= rlen); LASSERT (niov <= LNET_MAX_IOV); - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_wanted = mlen; - conn->ksnc_rx_nob_left = rlen; + conn->ksnc_lnet_msg = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; if (mlen == 0 || iov != NULL) { conn->ksnc_rx_nkiov = 0; @@ -1378,7 +1443,7 @@ ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } static inline int -ksocknal_sched_cansleep(ksock_sched_t *sched) +ksocknal_sched_cansleep(struct ksock_sched *sched) { int rc; @@ -1394,154 +1459,169 @@ ksocknal_sched_cansleep(ksock_sched_t *sched) int ksocknal_scheduler(void *arg) { - struct ksock_sched_info *info; - ksock_sched_t *sched; - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; - int nloops = 0; - long id = (long)arg; + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; + int nloops = 0; + long id = (long)arg; + struct page **rx_scratch_pgs; + struct kvec *scratch_iov; + + sched = ksocknal_data.ksnd_schedulers[KSOCK_THREAD_CPT(id)]; + + LIBCFS_CPT_ALLOC(rx_scratch_pgs, lnet_cpt_table(), sched->kss_cpt, + sizeof(*rx_scratch_pgs) * LNET_MAX_IOV); + if (!rx_scratch_pgs) { + CERROR("Unable to allocate scratch pages\n"); + return -ENOMEM; + } - info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + LIBCFS_CPT_ALLOC(scratch_iov, lnet_cpt_table(), sched->kss_cpt, + sizeof(*scratch_iov) * LNET_MAX_IOV); + if (!scratch_iov) { + CERROR("Unable to allocate scratch iov\n"); + return -ENOMEM; + } cfs_block_allsigs(); - rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + rc = cfs_cpt_bind(lnet_cpt_table(), sched->kss_cpt); if (rc != 0) { CWARN("Can't set CPU partition affinity to %d: %d\n", - info->ksi_cpt, rc); + sched->kss_cpt, rc); } spin_lock_bh(&sched->kss_lock); - while (!ksocknal_data.ksnd_shuttingdown) { - int did_something = 0; + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; - /* Ensure I progress everything semi-fairly */ + /* Ensure I progress everything semi-fairly */ if (!list_empty(&sched->kss_rx_conns)) { conn = list_entry(sched->kss_rx_conns.next, - ksock_conn_t, ksnc_rx_list); + struct ksock_conn, ksnc_rx_list); list_del(&conn->ksnc_rx_list); - LASSERT(conn->ksnc_rx_scheduled); - LASSERT(conn->ksnc_rx_ready); + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); - /* clear rx_ready in case receive isn't complete. - * Do it BEFORE we call process_recv, since - * data_ready can set it any time after we release - * kss_lock. */ - conn->ksnc_rx_ready = 0; + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. 
*/ + conn->ksnc_rx_ready = 0; spin_unlock_bh(&sched->kss_lock); - rc = ksocknal_process_receive(conn); + rc = ksocknal_process_receive(conn, rx_scratch_pgs, + scratch_iov); spin_lock_bh(&sched->kss_lock); - /* I'm the only one that can clear this flag */ - LASSERT(conn->ksnc_rx_scheduled); + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); - /* Did process_receive get everything it wanted? */ - if (rc == 0) - conn->ksnc_rx_ready = 1; - - if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { - /* Conn blocked waiting for ksocknal_recv() - * I change its state (under lock) to signal - * it can be rescheduled */ - conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; - } else if (conn->ksnc_rx_ready) { - /* reschedule for rx */ + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ list_add_tail(&conn->ksnc_rx_list, - &sched->kss_rx_conns); - } else { - conn->ksnc_rx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } - did_something = 1; - } + did_something = 1; + } if (!list_empty(&sched->kss_tx_conns)) { struct list_head zlist = LIST_HEAD_INIT(zlist); if (!list_empty(&sched->kss_zombie_noop_txs)) { list_add(&zlist, - &sched->kss_zombie_noop_txs); + &sched->kss_zombie_noop_txs); list_del_init(&sched->kss_zombie_noop_txs); - } + } conn = list_entry(sched->kss_tx_conns.next, - ksock_conn_t, ksnc_tx_list); + struct ksock_conn, ksnc_tx_list); list_del(&conn->ksnc_tx_list); - LASSERT(conn->ksnc_tx_scheduled); - LASSERT(conn->ksnc_tx_ready); + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); LASSERT(!list_empty(&conn->ksnc_tx_queue)); tx = list_entry(conn->ksnc_tx_queue.next, - ksock_tx_t, tx_list); + struct ksock_tx, tx_list); - if (conn->ksnc_tx_carrier == tx) - ksocknal_next_tx_carrier(conn); + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); - /* dequeue now so empty list => more to send */ + /* dequeue now so empty list => more to send */ list_del(&tx->tx_list); - /* Clear tx_ready in case send isn't complete. Do - * it BEFORE we call process_transmit, since - * write_space can set it any time after we release - * kss_lock. */ - conn->ksnc_tx_ready = 0; + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. 
*/ + conn->ksnc_tx_ready = 0; spin_unlock_bh(&sched->kss_lock); if (!list_empty(&zlist)) { /* free zombie noop txs, it's fast because - * noop txs are just put in freelist */ - ksocknal_txlist_done(NULL, &zlist, 0); - } + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } - rc = ksocknal_process_transmit(conn, tx); + rc = ksocknal_process_transmit(conn, tx, scratch_iov); - if (rc == -ENOMEM || rc == -EAGAIN) { - /* Incomplete send: replace tx on HEAD of tx_queue */ + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ spin_lock_bh(&sched->kss_lock); list_add(&tx->tx_list, - &conn->ksnc_tx_queue); + &conn->ksnc_tx_queue); } else { /* Complete send; tx -ref */ ksocknal_tx_decref(tx); spin_lock_bh(&sched->kss_lock); - /* assume space for more */ - conn->ksnc_tx_ready = 1; - } + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } - if (rc == -ENOMEM) { - /* Do nothing; after a short timeout, this - * conn will be reposted on kss_tx_conns. */ - } else if (conn->ksnc_tx_ready && + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && !list_empty(&conn->ksnc_tx_queue)) { - /* reschedule for tx */ + /* reschedule for tx */ list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - } else { - conn->ksnc_tx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } - did_something = 1; - } - if (!did_something || /* nothing to do */ - ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ spin_unlock_bh(&sched->kss_lock); - nloops = 0; + nloops = 0; - if (!did_something) { /* wait for something to do */ + if (!did_something) { /* wait for something to do */ rc = wait_event_interruptible_exclusive( sched->kss_waitq, !ksocknal_sched_cansleep(sched)); @@ -1555,6 +1635,10 @@ int ksocknal_scheduler(void *arg) } spin_unlock_bh(&sched->kss_lock); + LIBCFS_FREE(rx_scratch_pgs, sizeof(*rx_scratch_pgs) * + LNET_MAX_IOV); + LIBCFS_FREE(scratch_iov, sizeof(*scratch_iov) * + LNET_MAX_IOV); ksocknal_thread_fini(); return 0; } @@ -1563,9 +1647,9 @@ int ksocknal_scheduler(void *arg) * Add connection to kss_rx_conns of scheduler * and wakeup the scheduler. */ -void ksocknal_read_callback (ksock_conn_t *conn) +void ksocknal_read_callback(struct ksock_conn *conn) { - ksock_sched_t *sched; + struct ksock_sched *sched; ENTRY; sched = conn->ksnc_scheduler; @@ -1592,9 +1676,9 @@ void ksocknal_read_callback (ksock_conn_t *conn) * Add connection to kss_tx_conns of scheduler * and wakeup the scheduler. 
*/ -void ksocknal_write_callback(ksock_conn_t *conn) +void ksocknal_write_callback(struct ksock_conn *conn) { - ksock_sched_t *sched; + struct ksock_sched *sched; ENTRY; sched = conn->ksnc_scheduler; @@ -1618,7 +1702,7 @@ void ksocknal_write_callback(ksock_conn_t *conn) EXIT; } -static ksock_proto_t * +static struct ksock_proto * ksocknal_parse_proto_version (struct ksock_hello_msg *hello) { __u32 version = 0; @@ -1663,13 +1747,13 @@ ksocknal_parse_proto_version (struct ksock_hello_msg *hello) } int -ksocknal_send_hello(struct lnet_ni *ni, ksock_conn_t *conn, +ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, lnet_nid_t peer_nid, struct ksock_hello_msg *hello) { /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - ksock_net_t *net = (ksock_net_t *)ni->ni_data; + struct ksock_net *net = (struct ksock_net *)ni->ni_data; - LASSERT(hello->kshm_nips <= LNET_NUM_INTERFACES); + LASSERT(hello->kshm_nips <= LNET_INTERFACES_NUM); /* rely on caller to hold a ref on socket so it wouldn't disappear */ LASSERT(conn->ksnc_proto != NULL); @@ -1702,7 +1786,7 @@ ksocknal_invert_type(int type) } int -ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, +ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, struct ksock_hello_msg *hello, struct lnet_process_id *peerid, __u64 *incarnation) @@ -1717,13 +1801,13 @@ ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, int timeout; int proto_match; int rc; - ksock_proto_t *proto; - struct lnet_process_id recv_id; + struct ksock_proto *proto; + struct lnet_process_id recv_id; /* socket type set on active connections - not set on passive */ LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); - timeout = active ? *ksocknal_tunables.ksnd_timeout : + timeout = active ? lnet_get_lnd_timeout() : lnet_acceptor_timeout(); rc = lnet_sock_read(sock, &hello->kshm_magic, @@ -1847,19 +1931,18 @@ ksocknal_recv_hello(struct lnet_ni *ni, ksock_conn_t *conn, } static int -ksocknal_connect (ksock_route_t *route) +ksocknal_connect(struct ksock_route *route) { - struct list_head zombies = LIST_HEAD_INIT(zombies); - ksock_peer_ni_t *peer_ni = route->ksnr_peer; + struct list_head zombies = LIST_HEAD_INIT(zombies); + struct ksock_peer_ni *peer_ni = route->ksnr_peer; int type; int wanted; struct socket *sock; - cfs_time_t deadline; + time64_t deadline; int retry_later = 0; int rc = 0; - deadline = cfs_time_add(cfs_time_current(), - cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); write_lock_bh(&ksocknal_data.ksnd_global_lock); @@ -1903,7 +1986,7 @@ ksocknal_connect (ksock_route_t *route) write_unlock_bh(&ksocknal_data.ksnd_global_lock); - if (cfs_time_aftereq(cfs_time_current(), deadline)) { + if (ktime_get_seconds() >= deadline) { rc = -ETIMEDOUT; lnet_connect_console_error(rc, peer_ni->ksnp_id.nid, route->ksnr_ipaddr, @@ -1911,12 +1994,12 @@ ksocknal_connect (ksock_route_t *route) goto failed; } - rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, - route->ksnr_myipaddr, + rc = lnet_connect(&sock, peer_ni->ksnp_id.nid, + route->ksnr_myipaddr, route->ksnr_ipaddr, route->ksnr_port, peer_ni->ksnp_ni->ni_net_ns); - if (rc != 0) - goto failed; + if (rc != 0) + goto failed; rc = ksocknal_create_conn(peer_ni->ksnp_ni, route, sock, type); if (rc < 0) { @@ -1949,10 +2032,9 @@ ksocknal_connect (ksock_route_t *route) * attempt to connect if we lost conn race, * but the race is resolved quickly usually, * so min_reconnectms should be good heuristic */ - route->ksnr_retry_interval = - 
cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; - route->ksnr_timeout = cfs_time_add(cfs_time_current(), - route->ksnr_retry_interval); + route->ksnr_retry_interval = *ksocknal_tunables.ksnd_min_reconnectms / 1000; + route->ksnr_timeout = ktime_get_seconds() + + route->ksnr_retry_interval; } ksocknal_launch_connection_locked(route); @@ -1970,26 +2052,25 @@ ksocknal_connect (ksock_route_t *route) /* This is a retry rather than a new connection */ route->ksnr_retry_interval *= 2; route->ksnr_retry_interval = - MAX(route->ksnr_retry_interval, - cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + max_t(time64_t, route->ksnr_retry_interval, + *ksocknal_tunables.ksnd_min_reconnectms / 1000); route->ksnr_retry_interval = - MIN(route->ksnr_retry_interval, - cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + min_t(time64_t, route->ksnr_retry_interval, + *ksocknal_tunables.ksnd_max_reconnectms / 1000); - LASSERT (route->ksnr_retry_interval != 0); - route->ksnr_timeout = cfs_time_add(cfs_time_current(), - route->ksnr_retry_interval); + LASSERT(route->ksnr_retry_interval); + route->ksnr_timeout = ktime_get_seconds() + route->ksnr_retry_interval; if (!list_empty(&peer_ni->ksnp_tx_queue) && peer_ni->ksnp_accepting == 0 && ksocknal_find_connecting_route_locked(peer_ni) == NULL) { - ksock_conn_t *conn; + struct ksock_conn *conn; /* ksnp_tx_queue is queued on a conn on successful * connection for V1.x and V2.x */ if (!list_empty(&peer_ni->ksnp_conns)) { conn = list_entry(peer_ni->ksnp_conns.next, - ksock_conn_t, ksnc_list); + struct ksock_conn, ksnc_list); LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); } @@ -2012,7 +2093,7 @@ ksocknal_connect (ksock_route_t *route) * running out of resource. */ static int -ksocknal_connd_check_start(long sec, long *timeout) +ksocknal_connd_check_start(time64_t sec, long *timeout) { char name[16]; int rc; @@ -2062,7 +2143,7 @@ ksocknal_connd_check_start(long sec, long *timeout) /* we tried ... */ LASSERT(ksocknal_data.ksnd_connd_starting > 0); ksocknal_data.ksnd_connd_starting--; - ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec(); + ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); return 1; } @@ -2074,7 +2155,7 @@ ksocknal_connd_check_start(long sec, long *timeout) * again to recheck these conditions. 
*/ static int -ksocknal_connd_check_stop(long sec, long *timeout) +ksocknal_connd_check_stop(time64_t sec, long *timeout) { int val; @@ -2110,38 +2191,36 @@ ksocknal_connd_check_stop(long sec, long *timeout) /* Go through connd_routes queue looking for a route that we can process * right now, @timeout_p can be updated if we need to come back later */ -static ksock_route_t * +static struct ksock_route * ksocknal_connd_get_route_locked(signed long *timeout_p) { - ksock_route_t *route; - cfs_time_t now; - - now = cfs_time_current(); + time64_t now = ktime_get_seconds(); + struct ksock_route *route; /* connd_routes can contain both pending and ordinary routes */ list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, ksnr_connd_list) { if (route->ksnr_retry_interval == 0 || - cfs_time_aftereq(now, route->ksnr_timeout)) + now >= route->ksnr_timeout) return route; if (*timeout_p == MAX_SCHEDULE_TIMEOUT || - (int)*timeout_p > (int)(route->ksnr_timeout - now)) - *timeout_p = (int)(route->ksnr_timeout - now); + *timeout_p > cfs_time_seconds(route->ksnr_timeout - now)) + *timeout_p = cfs_time_seconds(route->ksnr_timeout - now); } return NULL; } int -ksocknal_connd (void *arg) +ksocknal_connd(void *arg) { - spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; - ksock_connreq_t *cr; + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + struct ksock_connreq *cr; wait_queue_entry_t wait; - int nloops = 0; - int cons_retry = 0; + int nloops = 0; + int cons_retry = 0; cfs_block_allsigs(); @@ -2154,8 +2233,8 @@ ksocknal_connd (void *arg) ksocknal_data.ksnd_connd_running++; while (!ksocknal_data.ksnd_shuttingdown) { - ksock_route_t *route = NULL; - long sec = cfs_time_current_sec(); + struct ksock_route *route = NULL; + time64_t sec = ktime_get_real_seconds(); long timeout = MAX_SCHEDULE_TIMEOUT; int dropped_lock = 0; @@ -2172,8 +2251,8 @@ ksocknal_connd (void *arg) if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { /* Connection accepted by the listener */ - cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\ - next, ksock_connreq_t, ksncr_list); + cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, + struct ksock_connreq, ksncr_list); list_del(&cr->ksncr_list); spin_unlock_bh(connd_lock); @@ -2247,16 +2326,18 @@ ksocknal_connd (void *arg) return 0; } -static ksock_conn_t * -ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) +static struct ksock_conn * +ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) { /* We're called with a shared lock on ksnd_global_lock */ - ksock_conn_t *conn; - struct list_head *ctmp; + struct ksock_conn *conn; + struct list_head *ctmp; + struct ksock_tx *tx; list_for_each(ctmp, &peer_ni->ksnp_conns) { - int error; - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + int error; + + conn = list_entry(ctmp, struct ksock_conn, ksnc_list); /* Don't need the {get,put}connsock dance to deref ksnc_sock */ LASSERT (!conn->ksnc_closing); @@ -2296,8 +2377,7 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) } if (conn->ksnc_rx_started && - cfs_time_aftereq(cfs_time_current(), - conn->ksnc_rx_deadline)) { + ktime_get_seconds() >= conn->ksnc_rx_deadline) { /* Timed out incomplete incoming message */ ksocknal_conn_addref(conn); CNETERR("Timeout receiving from %s (%pI4h:%d), " @@ -2313,11 +2393,14 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) if ((!list_empty(&conn->ksnc_tx_queue) || conn->ksnc_sock->sk->sk_wmem_queued != 0) && - cfs_time_aftereq(cfs_time_current(), - conn->ksnc_tx_deadline)) { + ktime_get_seconds() >= conn->ksnc_tx_deadline) { /* Timed out messages queued for sending or * buffered in the socket's send buffer */ ksocknal_conn_addref(conn); + list_for_each_entry(tx, &conn->ksnc_tx_queue, + tx_list) + tx->tx_hstatus = + LNET_MSG_STATUS_LOCAL_TIMEOUT; CNETERR("Timeout sending data to %s (%pI4h:%d) " "the network or that node may be down.\n", libcfs_id2str(peer_ni->ksnp_id), @@ -2330,21 +2413,22 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) } static inline void -ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) +ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) { - ksock_tx_t *tx; - struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); + struct ksock_tx *tx; + struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); write_lock_bh(&ksocknal_data.ksnd_global_lock); while (!list_empty(&peer_ni->ksnp_tx_queue)) { tx = list_entry(peer_ni->ksnp_tx_queue.next, - ksock_tx_t, tx_list); + struct ksock_tx, tx_list); - if (!cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) + if (ktime_get_seconds() < tx->tx_deadline) break; + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &stale_txs); } @@ -2355,12 +2439,12 @@ ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) } static int -ksocknal_send_keepalive_locked(ksock_peer_ni_t *peer_ni) +ksocknal_send_keepalive_locked(struct ksock_peer_ni *peer_ni) __must_hold(&ksocknal_data.ksnd_global_lock) { - ksock_sched_t *sched; - ksock_conn_t *conn; - ksock_tx_t *tx; + struct ksock_sched *sched; + struct ksock_conn *conn; + struct ksock_tx *tx; /* last_alive will be updated by create_conn */ if (list_empty(&peer_ni->ksnp_conns)) @@ -2370,18 +2454,16 @@ __must_hold(&ksocknal_data.ksnd_global_lock) return 0; if (*ksocknal_tunables.ksnd_keepalive <= 0 || - cfs_time_before(cfs_time_current(), - cfs_time_add(peer_ni->ksnp_last_alive, - cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) + ktime_get_seconds() < peer_ni->ksnp_last_alive + + *ksocknal_tunables.ksnd_keepalive) return 0; - if (cfs_time_before(cfs_time_current(), - 
peer_ni->ksnp_send_keepalive)) + if (ktime_get_seconds() < peer_ni->ksnp_send_keepalive) return 0; /* retry 10 secs later, so we wouldn't put pressure * on this peer_ni if we failed to send keepalive this time */ - peer_ni->ksnp_send_keepalive = cfs_time_shift(10); + peer_ni->ksnp_send_keepalive = ktime_get_seconds() + 10; conn = ksocknal_find_conn_locked(peer_ni, NULL, 1); if (conn != NULL) { @@ -2419,12 +2501,12 @@ __must_hold(&ksocknal_data.ksnd_global_lock) static void -ksocknal_check_peer_timeouts (int idx) +ksocknal_check_peer_timeouts(int idx) { - struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; - ksock_peer_ni_t *peer_ni; - ksock_conn_t *conn; - ksock_tx_t *tx; + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + struct ksock_peer_ni *peer_ni; + struct ksock_conn *conn; + struct ksock_tx *tx; again: /* NB. We expect to have a look at all the peers and not find any @@ -2433,10 +2515,10 @@ ksocknal_check_peer_timeouts (int idx) read_lock(&ksocknal_data.ksnd_global_lock); list_for_each_entry(peer_ni, peers, ksnp_list) { - ksock_tx_t *tx_stale; - cfs_time_t deadline = 0; - int resid = 0; - int n = 0; + struct ksock_tx *tx_stale; + time64_t deadline = 0; + int resid = 0; + int n = 0; if (ksocknal_send_keepalive_locked(peer_ni) != 0) { read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2460,13 +2542,11 @@ ksocknal_check_peer_timeouts (int idx) /* we can't process stale txs right here because we're * holding only shared lock */ if (!list_empty(&peer_ni->ksnp_tx_queue)) { - ksock_tx_t *tx = - list_entry(peer_ni->ksnp_tx_queue.next, - ksock_tx_t, tx_list); - - if (cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) { + struct ksock_tx *tx; + tx = list_entry(peer_ni->ksnp_tx_queue.next, + struct ksock_tx, tx_list); + if (ktime_get_seconds() >= tx->tx_deadline) { ksocknal_peer_addref(peer_ni); read_unlock(&ksocknal_data.ksnd_global_lock); @@ -2483,8 +2563,7 @@ ksocknal_check_peer_timeouts (int idx) tx_stale = NULL; spin_lock(&peer_ni->ksnp_lock); list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) { - if (!cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) + if (ktime_get_seconds() < tx->tx_deadline) break; /* ignore the TX if connection is being closed */ if (tx->tx_conn->ksnc_closing) @@ -2508,10 +2587,10 @@ ksocknal_check_peer_timeouts (int idx) read_unlock(&ksocknal_data.ksnd_global_lock); CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the " - "oldest(%p) timed out %ld secs ago, " + "oldest(%p) timed out %lld secs ago, " "resid: %d, wmem: %d\n", n, libcfs_nid2str(peer_ni->ksnp_id.nid), tx_stale, - cfs_duration_sec(cfs_time_current() - deadline), + ktime_get_seconds() - deadline, resid, conn->ksnc_sock->sk->sk_wmem_queued); ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); @@ -2525,14 +2604,14 @@ ksocknal_check_peer_timeouts (int idx) int ksocknal_reaper(void *arg) { wait_queue_entry_t wait; - ksock_conn_t *conn; - ksock_sched_t *sched; - struct list_head enomem_conns; - int nenomem_conns; - cfs_duration_t timeout; - int i; - int peer_index = 0; - cfs_time_t deadline = cfs_time_current(); + struct ksock_conn *conn; + struct ksock_sched *sched; + struct list_head enomem_conns; + int nenomem_conns; + time64_t timeout; + int i; + int peer_index = 0; + time64_t deadline = ktime_get_seconds(); cfs_block_allsigs (); @@ -2542,11 +2621,9 @@ int ksocknal_reaper(void *arg) spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); while (!ksocknal_data.ksnd_shuttingdown) { - if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { - conn = 
list_entry(ksocknal_data. \ - ksnd_deathrow_conns.next, - ksock_conn_t, ksnc_list); + conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, + struct ksock_conn, ksnc_list); list_del(&conn->ksnc_list); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -2559,8 +2636,8 @@ int ksocknal_reaper(void *arg) } if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { - conn = list_entry(ksocknal_data.ksnd_zombie_conns.\ - next, ksock_conn_t, ksnc_list); + conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, + struct ksock_conn, ksnc_list); list_del(&conn->ksnc_list); spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); @@ -2583,7 +2660,7 @@ int ksocknal_reaper(void *arg) nenomem_conns = 0; while (!list_empty(&enomem_conns)) { conn = list_entry(enomem_conns.next, - ksock_conn_t, ksnc_tx_list); + struct ksock_conn, ksnc_tx_list); list_del(&conn->ksnc_tx_list); sched = conn->ksnc_scheduler; @@ -2601,11 +2678,11 @@ int ksocknal_reaper(void *arg) } /* careful with the jiffy wrap... */ - while ((timeout = cfs_time_sub(deadline, - cfs_time_current())) <= 0) { + while ((timeout = deadline - ktime_get_seconds()) <= 0) { const int n = 4; const int p = 1; int chunk = ksocknal_data.ksnd_peer_hash_size; + unsigned int lnd_timeout; /* Time to check for timeouts on a few more peers: I do * checks every 'p' seconds on a proportion of the peer_ni @@ -2614,11 +2691,11 @@ int ksocknal_reaper(void *arg) * timeout on any connection within (n+1)/n times the * timeout interval. */ - if (*ksocknal_tunables.ksnd_timeout > n * p) - chunk = (chunk * n * p) / - *ksocknal_tunables.ksnd_timeout; - if (chunk == 0) - chunk = 1; + lnd_timeout = lnet_get_lnd_timeout(); + if (lnd_timeout > n * p) + chunk = (chunk * n * p) / lnd_timeout; + if (chunk == 0) + chunk = 1; for (i = 0; i < chunk; i++) { ksocknal_check_peer_timeouts (peer_index); @@ -2626,7 +2703,7 @@ int ksocknal_reaper(void *arg) ksocknal_data.ksnd_peer_hash_size; } - deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + deadline += p; } if (nenomem_conns != 0) { @@ -2635,16 +2712,16 @@ int ksocknal_reaper(void *arg) * if any go back on my enomem list. */ timeout = SOCKNAL_ENOMEM_RETRY; } - ksocknal_data.ksnd_reaper_waketime = - cfs_time_add(cfs_time_current(), timeout); + ksocknal_data.ksnd_reaper_waketime = ktime_get_seconds() + + timeout; - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); if (!ksocknal_data.ksnd_shuttingdown && list_empty(&ksocknal_data.ksnd_deathrow_conns) && list_empty(&ksocknal_data.ksnd_zombie_conns)) - schedule_timeout(timeout); + schedule_timeout(cfs_time_seconds(timeout)); set_current_state(TASK_RUNNING); remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c index 91a9cf05e2ad8..72f2bd526613e 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,11 @@ #include "socklnd.h" int -ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) +ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) { int rc = lnet_sock_getaddr(conn->ksnc_sock, true, - &conn->ksnc_ipaddr, - &conn->ksnc_port); + &conn->ksnc_ipaddr, + &conn->ksnc_port); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ LASSERT (!conn->ksnc_closing); @@ -58,7 +58,7 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) } int -ksocknal_lib_zc_capable(ksock_conn_t *conn) +ksocknal_lib_zc_capable(struct ksock_conn *conn) { int caps = conn->ksnc_sock->sk->sk_route_caps; @@ -71,7 +71,8 @@ ksocknal_lib_zc_capable(ksock_conn_t *conn) } int -ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) { struct socket *sock = conn->ksnc_sock; int nob; @@ -92,7 +93,6 @@ ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) struct kvec *scratchiov = &scratch; unsigned int niov = 1; #else - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -113,41 +113,42 @@ ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx) } int -ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) +ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx, + struct kvec *scratchiov) { - struct socket *sock = conn->ksnc_sock; - lnet_kiov_t *kiov = tx->tx_kiov; - int rc; - int nob; + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; - /* Not NOOP message */ - LASSERT (tx->tx_lnetmsg != NULL); + /* Not NOOP message */ + LASSERT(tx->tx_lnetmsg != NULL); - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. */ - if (tx->tx_msg.ksm_zc_cookies[0] != 0) { - /* Zero copy is enabled */ - struct sock *sk = sock->sk; - struct page *page = kiov->kiov_page; - int offset = kiov->kiov_offset; - int fragsize = kiov->kiov_len; - int msgflg = MSG_DONTWAIT; + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; - CDEBUG(D_NET, "page %p + offset %x for %d\n", - page, offset, kiov->kiov_len); + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); if (!list_empty(&conn->ksnc_tx_queue) || - fragsize < tx->tx_resid) - msgflg |= MSG_MORE; - - if (sk->sk_prot->sendpage != NULL) { - rc = sk->sk_prot->sendpage(sk, page, - offset, fragsize, msgflg); - } else { - rc = cfs_tcp_sendpage(sk, page, offset, fragsize, - msgflg); - } - } else { + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct kvec scratch; struct kvec *scratchiov = &scratch; @@ -156,7 +157,6 @@ ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." 
#endif - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -181,7 +181,7 @@ ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx) } void -ksocknal_lib_eager_ack (ksock_conn_t *conn) +ksocknal_lib_eager_ack(struct ksock_conn *conn) { struct socket *sock = conn->ksnc_sock; @@ -194,14 +194,13 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn) } int -ksocknal_lib_recv_iov (ksock_conn_t *conn) +ksocknal_lib_recv_iov(struct ksock_conn *conn, struct kvec *scratchiov) { #if SOCKNAL_SINGLE_FRAG_RX struct kvec scratch; struct kvec *scratchiov = &scratch; unsigned int niov = 1; #else - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct kvec *iov = conn->ksnc_rx_iov; @@ -299,7 +298,8 @@ ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, } int -ksocknal_lib_recv_kiov (ksock_conn_t *conn) +ksocknal_lib_recv_kiov(struct ksock_conn *conn, struct page **pages, + struct kvec *scratchiov) { #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK struct kvec scratch; @@ -310,8 +310,6 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; - struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; @@ -378,7 +376,7 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) } void -ksocknal_lib_csum_tx(ksock_tx_t *tx) +ksocknal_lib_csum_tx(struct ksock_tx *tx) { int i; __u32 csum; @@ -417,7 +415,7 @@ ksocknal_lib_csum_tx(ksock_tx_t *tx) } int -ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle) { struct socket *sock = conn->ksnc_sock; struct tcp_sock *tp = tcp_sk(sock->sk); @@ -548,12 +546,12 @@ ksocknal_lib_setup_sock (struct socket *sock) } void -ksocknal_lib_push_conn (ksock_conn_t *conn) +ksocknal_lib_push_conn(struct ksock_conn *conn) { - struct sock *sk; - struct tcp_sock *tp; - int nonagle; - int rc; + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int rc; rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ @@ -576,8 +574,8 @@ ksocknal_lib_push_conn (ksock_conn_t *conn) ksocknal_connsock_decref(conn); } -extern void ksocknal_read_callback (ksock_conn_t *conn); -extern void ksocknal_write_callback (ksock_conn_t *conn); +void ksocknal_read_callback(struct ksock_conn *conn); +void ksocknal_write_callback(struct ksock_conn *conn); /* * socket call back in Linux */ @@ -588,7 +586,7 @@ ksocknal_data_ready(struct sock *sk) ksocknal_data_ready(struct sock *sk, int n) #endif { - ksock_conn_t *conn; + struct ksock_conn *conn; ENTRY; /* interleave correctly with closing sockets... 
*/ @@ -614,7 +612,7 @@ ksocknal_data_ready(struct sock *sk, int n) static void ksocknal_write_space (struct sock *sk) { - ksock_conn_t *conn; + struct ksock_conn *conn; int wspace; int min_wpace; @@ -657,14 +655,14 @@ ksocknal_write_space (struct sock *sk) } void -ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) { conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; conn->ksnc_saved_write_space = sock->sk->sk_write_space; } void -ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) { sock->sk->sk_user_data = conn; sock->sk->sk_data_ready = ksocknal_data_ready; @@ -673,7 +671,7 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) } void -ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) { /* Remove conn's network callbacks. * NB I _have_ to restore the callback, rather than storing a noop, @@ -690,10 +688,10 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) } int -ksocknal_lib_memory_pressure(ksock_conn_t *conn) +ksocknal_lib_memory_pressure(struct ksock_conn *conn) { int rc = 0; - ksock_sched_t *sched; + struct ksock_sched *sched; sched = conn->ksnc_scheduler; spin_lock_bh(&sched->kss_lock); diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c index 6495703626094..df9d96e6e4cfc 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_modparams.c @@ -37,7 +37,7 @@ static int peer_buffer_credits; module_param(peer_buffer_credits, int, 0444); MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); -static int peer_timeout = 180; +static int peer_timeout = DEFAULT_PEER_TIMEOUT; module_param(peer_timeout, int, 0444); MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); @@ -155,7 +155,7 @@ module_param(protocol, int, 0644); MODULE_PARM_DESC(protocol, "protocol version"); #endif -ksock_tunables_t ksocknal_tunables; +struct ksock_tunables ksocknal_tunables; int ksocknal_tunables_init(void) { diff --git a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c index 42dff10fdb563..6dd648a2299cc 100644 --- a/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c +++ b/drivers/staging/lustrefsx/lnet/klnds/socklnd/socklnd_proto.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * * Author: Zach Brown * Author: Peter J. 
Braam @@ -41,8 +41,8 @@ * pro_match_tx() : Called holding glock */ -static ksock_tx_t * -ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) +static struct ksock_tx * +ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) { /* V1.x, just enqueue it */ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); @@ -50,9 +50,9 @@ ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) } void -ksocknal_next_tx_carrier(ksock_conn_t *conn) +ksocknal_next_tx_carrier(struct ksock_conn *conn) { - ksock_tx_t *tx = conn->ksnc_tx_carrier; + struct ksock_tx *tx = conn->ksnc_tx_carrier; /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ LASSERT(!list_empty(&conn->ksnc_tx_queue)); @@ -64,17 +64,17 @@ ksocknal_next_tx_carrier(ksock_conn_t *conn) conn->ksnc_tx_carrier = NULL; } else { conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, - ksock_tx_t, tx_list); + struct ksock_tx, tx_list); LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); } } static int -ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, - ksock_tx_t *tx_ack, __u64 cookie) +ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) { - ksock_tx_t *tx = conn->ksnc_tx_carrier; + struct ksock_tx *tx = conn->ksnc_tx_carrier; LASSERT (tx_ack == NULL || tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); @@ -117,10 +117,10 @@ ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, return 1; } -static ksock_tx_t * -ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) +static struct ksock_tx * +ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) { - ksock_tx_t *tx = conn->ksnc_tx_carrier; + struct ksock_tx *tx = conn->ksnc_tx_carrier; /* * Enqueue tx_msg: @@ -154,10 +154,10 @@ ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) } static int -ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, - ksock_tx_t *tx_ack, __u64 cookie) +ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, + struct ksock_tx *tx_ack, __u64 cookie) { - ksock_tx_t *tx; + struct ksock_tx *tx; if (conn->ksnc_type != SOCKLND_CONN_ACK) return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); @@ -271,7 +271,7 @@ ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, } static int -ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) { int nob; @@ -315,7 +315,7 @@ ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) } static int -ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) { int nob; @@ -359,18 +359,18 @@ ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) /* (Sink) handle incoming ZC request from sender */ static int -ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) { - ksock_peer_ni_t *peer_ni = c->ksnc_peer; - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; + struct ksock_peer_ni *peer_ni = c->ksnc_peer; + struct ksock_conn *conn; + struct ksock_tx *tx; + int rc; read_lock(&ksocknal_data.ksnd_global_lock); conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote); if (conn != NULL) { - ksock_sched_t *sched = conn->ksnc_scheduler; + struct ksock_sched *sched = conn->ksnc_scheduler; LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); @@ -402,13 +402,13 @@ ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) /* (Sender) handle ZC_ACK from sink */ 
static int -ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) { - ksock_peer_ni_t *peer_ni = conn->ksnc_peer; - ksock_tx_t *tx; - ksock_tx_t *tmp; - struct list_head zlist = LIST_HEAD_INIT(zlist); - int count; + struct ksock_peer_ni *peer_ni = conn->ksnc_peer; + struct ksock_tx *tx; + struct ksock_tx *tmp; + struct list_head zlist = LIST_HEAD_INIT(zlist); + int count; if (cookie1 == 0) cookie1 = cookie2; @@ -440,7 +440,7 @@ ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) spin_unlock(&peer_ni->ksnp_lock); while (!list_empty(&zlist)) { - tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + tx = list_entry(zlist.next, struct ksock_tx, tx_zc_list); list_del(&tx->tx_zc_list); ksocknal_tx_decref(tx); } @@ -449,7 +449,7 @@ ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) } static int -ksocknal_send_hello_v1 (ksock_conn_t *conn, struct ksock_hello_msg *hello) +ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) { struct socket *sock = conn->ksnc_sock; struct lnet_hdr *hdr; @@ -524,10 +524,10 @@ ksocknal_send_hello_v1 (ksock_conn_t *conn, struct ksock_hello_msg *hello) } static int -ksocknal_send_hello_v2 (ksock_conn_t *conn, struct ksock_hello_msg *hello) +ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) { - struct socket *sock = conn->ksnc_sock; - int rc; + struct socket *sock = conn->ksnc_sock; + int rc; hello->kshm_magic = LNET_PROTO_MAGIC; hello->kshm_version = conn->ksnc_proto->pro_version; @@ -567,7 +567,8 @@ ksocknal_send_hello_v2 (ksock_conn_t *conn, struct ksock_hello_msg *hello) } static int -ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int timeout) +ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, + int timeout) { struct socket *sock = conn->ksnc_sock; struct lnet_hdr *hdr; @@ -607,7 +608,7 @@ ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int tim hello->kshm_nips = le32_to_cpu (hdr->payload_length) / sizeof (__u32); - if (hello->kshm_nips > LNET_NUM_INTERFACES) { + if (hello->kshm_nips > LNET_INTERFACES_NUM) { CERROR("Bad nips %d from ip %pI4h\n", hello->kshm_nips, &conn->ksnc_ipaddr); rc = -EPROTO; @@ -643,7 +644,7 @@ ksocknal_recv_hello_v1(ksock_conn_t *conn, struct ksock_hello_msg *hello,int tim } static int -ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, +ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, int timeout) { struct socket *sock = conn->ksnc_sock; @@ -677,7 +678,7 @@ ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, __swab32s(&hello->kshm_nips); } - if (hello->kshm_nips > LNET_NUM_INTERFACES) { + if (hello->kshm_nips > LNET_INTERFACES_NUM) { CERROR("Bad nips %d from ip %pI4h\n", hello->kshm_nips, &conn->ksnc_ipaddr); return -EPROTO; @@ -710,7 +711,7 @@ ksocknal_recv_hello_v2(ksock_conn_t *conn, struct ksock_hello_msg *hello, } static void -ksocknal_pack_msg_v1(ksock_tx_t *tx) +ksocknal_pack_msg_v1(struct ksock_tx *tx) { /* V1.x has no KSOCK_MSG_NOOP */ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); @@ -724,7 +725,7 @@ ksocknal_pack_msg_v1(ksock_tx_t *tx) } static void -ksocknal_pack_msg_v2(ksock_tx_t *tx) +ksocknal_pack_msg_v2(struct ksock_tx *tx) { tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; @@ -757,7 +758,7 @@ ksocknal_unpack_msg_v2(struct ksock_msg *msg) return; /* Do nothing */ } -ksock_proto_t 
ksocknal_protocol_v1x = +struct ksock_proto ksocknal_protocol_v1x = { .pro_version = KSOCK_PROTO_V1, .pro_send_hello = ksocknal_send_hello_v1, @@ -771,7 +772,7 @@ ksock_proto_t ksocknal_protocol_v1x = .pro_match_tx = ksocknal_match_tx }; -ksock_proto_t ksocknal_protocol_v2x = +struct ksock_proto ksocknal_protocol_v2x = { .pro_version = KSOCK_PROTO_V2, .pro_send_hello = ksocknal_send_hello_v2, @@ -785,7 +786,7 @@ ksock_proto_t ksocknal_protocol_v2x = .pro_match_tx = ksocknal_match_tx }; -ksock_proto_t ksocknal_protocol_v3x = +struct ksock_proto ksocknal_protocol_v3x = { .pro_version = KSOCK_PROTO_V3, .pro_send_hello = ksocknal_send_hello_v2, diff --git a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c index 8d3d6030d7d31..5be1dd88a6b2f 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/acceptor.c +++ b/drivers/staging/lustrefsx/lnet/lnet/acceptor.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,7 +32,6 @@ #define DEBUG_SUBSYSTEM S_LNET -#include #include #include #include @@ -481,14 +480,15 @@ lnet_acceptor_start(void) if (lnet_count_acceptor_nets() == 0) /* not required */ return 0; - - lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + if (current->nsproxy && current->nsproxy->net_ns) + lnet_acceptor_state.pta_ns = current->nsproxy->net_ns; + else + lnet_acceptor_state.pta_ns = &init_net; task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, "acceptor_%03ld", secure); if (IS_ERR(task)) { rc2 = PTR_ERR(task); CERROR("Can't start acceptor thread: %ld\n", rc2); - return -ESRCH; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c index c70e26680b447..24e7d7aa59cd0 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/api-ni.c +++ b/drivers/staging/lustrefsx/lnet/lnet/api-ni.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,14 +31,25 @@ */ #define DEBUG_SUBSYSTEM S_LNET + +#include #include #include +#include +#include #include #define D_LNI D_CONSOLE -struct lnet the_lnet; /* THE state of the network */ +/* + * initialize ln_api_mutex statically, since it needs to be used in + * discovery_set callback. That module parameter callback can be called + * before module init completes. The mutex needs to be ready for use then. + */ +struct lnet the_lnet = { + .ln_api_mutex = __MUTEX_INITIALIZER(the_lnet.ln_api_mutex), +}; /* THE state of the network */ EXPORT_SYMBOL(the_lnet); static char *ip2nets = ""; @@ -60,13 +71,157 @@ MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); static int use_tcp_bonding = false; module_param(use_tcp_bonding, int, 0444); MODULE_PARM_DESC(use_tcp_bonding, - "Set to 1 to use socklnd bonding. 0 to use Multi-Rail"); + "use_tcp_bonding parameter has been deprecated"); unsigned int lnet_numa_range = 0; module_param(lnet_numa_range, uint, 0444); MODULE_PARM_DESC(lnet_numa_range, "NUMA range to consider during Multi-Rail selection"); +/* + * lnet_health_sensitivity determines by how much we decrement the health + * value on sending error. 
The value defaults to 100, which means health + * interface health is decremented by 100 points every failure. + */ +unsigned int lnet_health_sensitivity = 100; +static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_health_sensitivity = { + .set = sensitivity_set, + .get = param_get_int, +}; +#define param_check_health_sensitivity(name, p) \ + __param_check(name, p, int) +module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int, + &lnet_health_sensitivity, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_health_sensitivity, + "Value to decrement the health value by on error"); + +/* + * lnet_recovery_interval determines how often we should perform recovery + * on unhealthy interfaces. + */ +unsigned int lnet_recovery_interval = 1; +static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_recovery_interval = { + .set = recovery_interval_set, + .get = param_get_int, +}; +#define param_check_recovery_interval(name, p) \ + __param_check(name, p, int) +module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int, + &lnet_recovery_interval, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_recovery_interval, + "Interval to recover unhealthy interfaces in seconds"); + +static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT; +static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_interfaces_max = { + .set = intf_max_set, + .get = param_get_int, +}; + +#define param_check_interfaces_max(name, p) \ + __param_check(name, p, int) + +module_param(lnet_interfaces_max, interfaces_max, 0644); +#else +module_param_call(lnet_interfaces_max, intf_max_set, param_get_int, + &lnet_interfaces_max, 0644); +#endif +MODULE_PARM_DESC(lnet_interfaces_max, + "Maximum number of interfaces in a node."); + +unsigned lnet_peer_discovery_disabled = 0; +static int discovery_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_discovery_disabled = { + .set = discovery_set, + .get = param_get_int, +}; + +#define param_check_discovery_disabled(name, p) \ + __param_check(name, p, int) +module_param(lnet_peer_discovery_disabled, discovery_disabled, 0644); +#else +module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int, + &lnet_peer_discovery_disabled, 0644); +#endif +MODULE_PARM_DESC(lnet_peer_discovery_disabled, + "Set to 1 to disable peer discovery on this node."); + +unsigned int lnet_drop_asym_route; +static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp); + +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_drop_asym_route = { + .set = drop_asym_route_set, + .get = param_get_int, +}; + +#define param_check_drop_asym_route(name, p) \ + __param_check(name, p, int) +module_param(lnet_drop_asym_route, drop_asym_route, 0644); +#else +module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, + &lnet_drop_asym_route, 0644); +#endif +MODULE_PARM_DESC(lnet_drop_asym_route, + "Set to 1 to drop asymmetrical route messages."); + +#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 +#define 
LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 50 + +unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; +static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_transaction_timeout = { + .set = transaction_to_set, + .get = param_get_int, +}; + +#define param_check_transaction_timeout(name, p) \ + __param_check(name, p, int) +module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, + &lnet_transaction_timeout, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_transaction_timeout, + "Maximum number of seconds to wait for a peer response."); + +#define LNET_RETRY_COUNT_HEALTH_DEFAULT 2 +unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; +static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); +#ifdef HAVE_KERNEL_PARAM_OPS +static struct kernel_param_ops param_ops_retry_count = { + .set = retry_count_set, + .get = param_get_int, +}; + +#define param_check_retry_count(name, p) \ + __param_check(name, p, int) +module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR); +#else +module_param_call(lnet_retry_count, retry_count_set, param_get_int, + &lnet_retry_count, S_IRUGO|S_IWUSR); +#endif +MODULE_PARM_DESC(lnet_retry_count, + "Maximum number of times to retry transmitting a message"); + + +unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT; + /* * This sequence number keeps track of how many times DLC was used to * update the local NIs. It is incremented when a NI is added or @@ -79,6 +234,282 @@ static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); static int lnet_ping(struct lnet_process_id id, signed long timeout, struct lnet_process_id __user *ids, int n_ids); +static int lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids); + +static int +sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *sensitivity = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value > LNET_MAX_HEALTH_VALUE) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid health value. Maximum: %d value = %lu\n", + LNET_MAX_HEALTH_VALUE, value); + return -EINVAL; + } + + /* + * if we're turning on health then use the health timeout + * defaults. + */ + if (*sensitivity == 0 && value != 0) { + lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; + lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; + /* + * if we're turning off health then use the no health timeout + * default. 
+ */ + } else if (*sensitivity != 0 && value == 0) { + lnet_transaction_timeout = + LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; + lnet_retry_count = 0; + } + + *sensitivity = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *interval = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n"); + return rc; + } + + if (value < 1) { + CERROR("lnet_recovery_interval must be at least 1 second\n"); + return -EINVAL; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + *interval = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +discovery_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *discovery = (unsigned *)kp->arg; + unsigned long value; + struct lnet_ping_buffer *pbuf; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_peer_discovery_disabled'\n"); + return rc; + } + + value = (value) ? 1 : 0; + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *discovery) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *discovery = value; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + /* tell peers that discovery setting has changed */ + lnet_net_lock(LNET_LOCK_EX); + pbuf = the_lnet.ln_ping_target; + if (value) + pbuf->pb_info.pi_features &= ~LNET_PING_FEAT_DISCOVERY; + else + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + lnet_net_unlock(LNET_LOCK_EX); + + lnet_push_update_to_peers(1); + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int *drop_asym_route = (unsigned int *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for " + "'lnet_drop_asym_route'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value == *drop_asym_route) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *drop_asym_route = value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *transaction_to = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_transaction_timeout'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (value < lnet_retry_count || value == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_transaction_timeout (%lu). 
" + "Has to be greater than lnet_retry_count (%u)\n", + value, lnet_retry_count); + return -EINVAL; + } + + if (value == *transaction_to) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + *transaction_to = value; + if (lnet_retry_count == 0) + lnet_lnd_timeout = value; + else + lnet_lnd_timeout = value / lnet_retry_count; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned *retry_count = (unsigned *)kp->arg; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_retry_count'\n"); + return rc; + } + + /* + * The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + + if (lnet_health_sensitivity == 0) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Can not set retry_count when health feature is turned off\n"); + return -EINVAL; + } + + if (value > lnet_transaction_timeout) { + mutex_unlock(&the_lnet.ln_api_mutex); + CERROR("Invalid value for lnet_retry_count (%lu). " + "Has to be smaller than lnet_transaction_timeout (%u)\n", + value, lnet_transaction_timeout); + return -EINVAL; + } + + *retry_count = value; + + if (value == 0) + lnet_lnd_timeout = lnet_transaction_timeout; + else + lnet_lnd_timeout = lnet_transaction_timeout / value; + + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int +intf_max_set(const char *val, cfs_kernel_param_arg_t *kp) +{ + int value, rc; + + rc = kstrtoint(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_interfaces_max'\n"); + return rc; + } + + if (value < LNET_INTERFACES_MIN) { + CWARN("max interfaces provided are too small, setting to %d\n", + LNET_INTERFACES_MAX_DEFAULT); + value = LNET_INTERFACES_MAX_DEFAULT; + } + + *(int *)kp->arg = value; + + return 0; +} + static char * lnet_get_routes(void) { @@ -112,10 +543,10 @@ static void lnet_init_locks(void) { spin_lock_init(&the_lnet.ln_eq_wait_lock); + spin_lock_init(&the_lnet.ln_msg_resend_lock); init_waitqueue_head(&the_lnet.ln_eq_waitq); - init_waitqueue_head(&the_lnet.ln_rc_waitq); + init_waitqueue_head(&the_lnet.ln_mt_waitq); mutex_init(&the_lnet.ln_lnd_mutex); - mutex_init(&the_lnet.ln_api_mutex); } static void @@ -326,6 +757,43 @@ static void lnet_assert_wire_constants(void) CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.incarnation) == 8); CLASSERT((int)offsetof(struct lnet_hdr, msg.hello.type) == 40); CLASSERT((int)sizeof(((struct lnet_hdr *)0)->msg.hello.type) == 4); + + /* Checks for struct lnet_ni_status and related constants */ + CLASSERT(LNET_NI_STATUS_INVALID == 0x00000000); + CLASSERT(LNET_NI_STATUS_UP == 0x15aac0de); + CLASSERT(LNET_NI_STATUS_DOWN == 0xdeadface); + + /* Checks for struct lnet_ni_status */ + CLASSERT((int)sizeof(struct lnet_ni_status) == 16); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_nid) == 0); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) == 8); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_status) == 8); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_status) == 4); + CLASSERT((int)offsetof(struct lnet_ni_status, ns_unused) == 12); + CLASSERT((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) == 4); + + /* Checks for struct lnet_ping_info and related constants */ + CLASSERT(LNET_PROTO_PING_MAGIC == 0x70696E67); + CLASSERT(LNET_PING_FEAT_INVAL == 0); + CLASSERT(LNET_PING_FEAT_BASE == 1); + 
CLASSERT(LNET_PING_FEAT_NI_STATUS == 2); + CLASSERT(LNET_PING_FEAT_RTE_DISABLED == 4); + CLASSERT(LNET_PING_FEAT_MULTI_RAIL == 8); + CLASSERT(LNET_PING_FEAT_DISCOVERY == 16); + CLASSERT(LNET_PING_FEAT_BITS == 31); + + /* Checks for struct lnet_ping_info */ + CLASSERT((int)sizeof(struct lnet_ping_info) == 16); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_magic) == 0); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_magic) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_features) == 4); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_features) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_pid) == 8); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_pid) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_nnis) == 12); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_nnis) == 4); + CLASSERT((int)offsetof(struct lnet_ping_info, pi_ni) == 16); + CLASSERT((int)sizeof(((struct lnet_ping_info *)0)->pi_ni) == 0); } static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) @@ -343,6 +811,13 @@ static struct lnet_lnd *lnet_find_lnd_by_type(__u32 type) return NULL; } +unsigned int +lnet_get_lnd_timeout(void) +{ + return lnet_lnd_timeout; +} +EXPORT_SYMBOL(lnet_get_lnd_timeout); + void lnet_register_lnd(struct lnet_lnd *lnd) { @@ -375,29 +850,71 @@ lnet_unregister_lnd(struct lnet_lnd *lnd) } EXPORT_SYMBOL(lnet_unregister_lnd); +void +lnet_counters_get_common(struct lnet_counters_common *common) +{ + struct lnet_counters *ctr; + int i; + + memset(common, 0, sizeof(*common)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + common->lcc_msgs_max += ctr->lct_common.lcc_msgs_max; + common->lcc_msgs_alloc += ctr->lct_common.lcc_msgs_alloc; + common->lcc_errors += ctr->lct_common.lcc_errors; + common->lcc_send_count += ctr->lct_common.lcc_send_count; + common->lcc_recv_count += ctr->lct_common.lcc_recv_count; + common->lcc_route_count += ctr->lct_common.lcc_route_count; + common->lcc_drop_count += ctr->lct_common.lcc_drop_count; + common->lcc_send_length += ctr->lct_common.lcc_send_length; + common->lcc_recv_length += ctr->lct_common.lcc_recv_length; + common->lcc_route_length += ctr->lct_common.lcc_route_length; + common->lcc_drop_length += ctr->lct_common.lcc_drop_length; + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get_common); + void lnet_counters_get(struct lnet_counters *counters) { struct lnet_counters *ctr; + struct lnet_counters_health *health = &counters->lct_health; int i; memset(counters, 0, sizeof(*counters)); + lnet_counters_get_common(&counters->lct_common); + lnet_net_lock(LNET_LOCK_EX); cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { - counters->msgs_max += ctr->msgs_max; - counters->msgs_alloc += ctr->msgs_alloc; - counters->errors += ctr->errors; - counters->send_count += ctr->send_count; - counters->recv_count += ctr->recv_count; - counters->route_count += ctr->route_count; - counters->drop_count += ctr->drop_count; - counters->send_length += ctr->send_length; - counters->recv_length += ctr->recv_length; - counters->route_length += ctr->route_length; - counters->drop_length += ctr->drop_length; - + health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc; + health->lch_resend_count += ctr->lct_health.lch_resend_count; + health->lch_response_timeout_count += + ctr->lct_health.lch_response_timeout_count; + health->lch_local_interrupt_count += + ctr->lct_health.lch_local_interrupt_count; + health->lch_local_dropped_count += + 
ctr->lct_health.lch_local_dropped_count; + health->lch_local_aborted_count += + ctr->lct_health.lch_local_aborted_count; + health->lch_local_no_route_count += + ctr->lct_health.lch_local_no_route_count; + health->lch_local_timeout_count += + ctr->lct_health.lch_local_timeout_count; + health->lch_local_error_count += + ctr->lct_health.lch_local_error_count; + health->lch_remote_dropped_count += + ctr->lct_health.lch_remote_dropped_count; + health->lch_remote_error_count += + ctr->lct_health.lch_remote_error_count; + health->lch_remote_timeout_count += + ctr->lct_health.lch_remote_timeout_count; + health->lch_network_timeout_count += + ctr->lct_health.lch_network_timeout_count; } lnet_net_unlock(LNET_LOCK_EX); } @@ -582,6 +1099,26 @@ lnet_res_lh_initialize(struct lnet_res_container *rec, list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); } +struct list_head ** +lnet_create_array_of_queues(void) +{ + struct list_head **qs; + struct list_head *q; + int i; + + qs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct list_head)); + if (!qs) { + CERROR("Failed to allocate queues\n"); + return NULL; + } + + cfs_percpt_for_each(q, i, qs) + INIT_LIST_HEAD(q); + + return qs; +} + static int lnet_unprepare(void); static int @@ -604,12 +1141,18 @@ lnet_prepare(lnet_pid_t requested_pid) the_lnet.ln_pid = requested_pid; INIT_LIST_HEAD(&the_lnet.ln_test_peers); - INIT_LIST_HEAD(&the_lnet.ln_peers); INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list); INIT_LIST_HEAD(&the_lnet.ln_nets); INIT_LIST_HEAD(&the_lnet.ln_routers); INIT_LIST_HEAD(&the_lnet.ln_drop_rules); INIT_LIST_HEAD(&the_lnet.ln_delay_rules); + INIT_LIST_HEAD(&the_lnet.ln_dc_request); + INIT_LIST_HEAD(&the_lnet.ln_dc_working); + INIT_LIST_HEAD(&the_lnet.ln_dc_expired); + INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq); + INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); + init_waitqueue_head(&the_lnet.ln_dc_waitq); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); rc = lnet_descriptor_setup(); if (rc != 0) @@ -668,6 +1211,12 @@ lnet_prepare(lnet_pid_t requested_pid) goto failed; } + the_lnet.ln_mt_zombie_rstqs = lnet_create_array_of_queues(); + if (!the_lnet.ln_mt_zombie_rstqs) { + rc = -ENOMEM; + goto failed; + } + return 0; failed: @@ -678,6 +1227,8 @@ lnet_prepare(lnet_pid_t requested_pid) static int lnet_unprepare (void) { + int rc; + /* NB no LNET_LOCK since this is the last reference. 
All LND instances * have shut down already, so it is safe to unlink and free all * descriptors, even those that appear committed to a network op (eg MD @@ -689,6 +1240,17 @@ lnet_unprepare (void) LASSERT(list_empty(&the_lnet.ln_test_peers)); LASSERT(list_empty(&the_lnet.ln_nets)); + if (the_lnet.ln_mt_zombie_rstqs) { + lnet_clean_zombie_rstqs(); + the_lnet.ln_mt_zombie_rstqs = NULL; + } + + if (!LNetEQHandleIsInvalid(the_lnet.ln_mt_eqh)) { + rc = LNetEQFree(the_lnet.ln_mt_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + LASSERT(rc == 0); + } + lnet_portals_destroy(); if (the_lnet.ln_md_containers != NULL) { @@ -854,16 +1416,6 @@ lnet_islocalnet(__u32 net_id) return local; } -bool -lnet_is_ni_healthy_locked(struct lnet_ni *ni) -{ - if (ni->ni_state == LNET_NI_STATE_ACTIVE || - ni->ni_state == LNET_NI_STATE_DEGRADED) - return true; - - return false; -} - struct lnet_ni * lnet_nid2ni_locked(lnet_nid_t nid, int cpt) { @@ -931,25 +1483,45 @@ lnet_count_acceptor_nets(void) return count; } -static struct lnet_ping_info * -lnet_ping_info_create(int num_ni) +struct lnet_ping_buffer * +lnet_ping_buffer_alloc(int nnis, gfp_t gfp) +{ + struct lnet_ping_buffer *pbuf; + + LIBCFS_ALLOC_GFP(pbuf, LNET_PING_BUFFER_SIZE(nnis), gfp); + if (pbuf) { + pbuf->pb_nnis = nnis; + atomic_set(&pbuf->pb_refcnt, 1); + } + + return pbuf; +} + +void +lnet_ping_buffer_free(struct lnet_ping_buffer *pbuf) { - struct lnet_ping_info *ping_info; - unsigned int infosz; + LASSERT(lnet_ping_buffer_numref(pbuf) == 0); + LIBCFS_FREE(pbuf, LNET_PING_BUFFER_SIZE(pbuf->pb_nnis)); +} - infosz = offsetof(struct lnet_ping_info, pi_ni[num_ni]); - LIBCFS_ALLOC(ping_info, infosz); - if (ping_info == NULL) { - CERROR("Can't allocate ping info[%d]\n", num_ni); +static struct lnet_ping_buffer * +lnet_ping_target_create(int nnis) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) { + CERROR("Can't allocate ping source [%d]\n", nnis); return NULL; } - ping_info->pi_nnis = num_ni; - ping_info->pi_pid = the_lnet.ln_pid; - ping_info->pi_magic = LNET_PROTO_PING_MAGIC; - ping_info->pi_features = LNET_PING_FEAT_NI_STATUS; + pbuf->pb_info.pi_nnis = nnis; + pbuf->pb_info.pi_pid = the_lnet.ln_pid; + pbuf->pb_info.pi_magic = LNET_PROTO_PING_MAGIC; + pbuf->pb_info.pi_features = + LNET_PING_FEAT_NI_STATUS | LNET_PING_FEAT_MULTI_RAIL; - return ping_info; + return pbuf; } static inline int @@ -995,16 +1567,25 @@ lnet_get_ni_count(void) return count; } -static inline void -lnet_ping_info_free(struct lnet_ping_info *pinfo) +int +lnet_ping_info_validate(struct lnet_ping_info *pinfo) { - LIBCFS_FREE(pinfo, - offsetof(struct lnet_ping_info, - pi_ni[pinfo->pi_nnis])); + if (!pinfo) + return -EINVAL; + if (pinfo->pi_magic != LNET_PROTO_PING_MAGIC) + return -EPROTO; + if (!(pinfo->pi_features & LNET_PING_FEAT_NI_STATUS)) + return -EPROTO; + /* Loopback is guaranteed to be present */ + if (pinfo->pi_nnis < 1 || pinfo->pi_nnis > lnet_interfaces_max) + return -ERANGE; + if (LNET_PING_INFO_LONI(pinfo) != LNET_NID_LO_0) + return -EPROTO; + return 0; } static void -lnet_ping_info_destroy(void) +lnet_ping_target_destroy(void) { struct lnet_net *net; struct lnet_ni *ni; @@ -1019,25 +1600,25 @@ lnet_ping_info_destroy(void) } } - lnet_ping_info_free(the_lnet.ln_ping_info); - the_lnet.ln_ping_info = NULL; + lnet_ping_buffer_decref(the_lnet.ln_ping_target); + the_lnet.ln_ping_target = NULL; lnet_net_unlock(LNET_LOCK_EX); } static void -lnet_ping_event_handler(struct lnet_event *event) 
+lnet_ping_target_event_handler(struct lnet_event *event) { - struct lnet_ping_info *pinfo = event->md.user_ptr; + struct lnet_ping_buffer *pbuf = event->md.user_ptr; if (event->unlinked) - pinfo->pi_features = LNET_PING_FEAT_INVAL; + lnet_ping_buffer_decref(pbuf); } static int -lnet_ping_info_setup(struct lnet_ping_info **ppinfo, - struct lnet_handle_md *md_handle, - int ni_count, bool set_eq) +lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf, + struct lnet_handle_md *ping_mdh, + int ni_count, bool set_eq) { struct lnet_process_id id = { .nid = LNET_NID_ANY, @@ -1048,72 +1629,76 @@ lnet_ping_info_setup(struct lnet_ping_info **ppinfo, int rc, rc2; if (set_eq) { - rc = LNetEQAlloc(0, lnet_ping_event_handler, + rc = LNetEQAlloc(0, lnet_ping_target_event_handler, &the_lnet.ln_ping_target_eq); if (rc != 0) { - CERROR("Can't allocate ping EQ: %d\n", rc); + CERROR("Can't allocate ping buffer EQ: %d\n", rc); return rc; } } - *ppinfo = lnet_ping_info_create(ni_count); - if (*ppinfo == NULL) { + *ppbuf = lnet_ping_target_create(ni_count); + if (*ppbuf == NULL) { rc = -ENOMEM; - goto failed_0; + goto fail_free_eq; } + /* Ping target ME/MD */ rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, LNET_PROTO_PING_MATCHBITS, 0, LNET_UNLINK, LNET_INS_AFTER, &me_handle); if (rc != 0) { - CERROR("Can't create ping ME: %d\n", rc); - goto failed_1; + CERROR("Can't create ping target ME: %d\n", rc); + goto fail_decref_ping_buffer; } /* initialize md content */ - md.start = *ppinfo; - md.length = offsetof(struct lnet_ping_info, - pi_ni[(*ppinfo)->pi_nnis]); + md.start = &(*ppbuf)->pb_info; + md.length = LNET_PING_INFO_SIZE((*ppbuf)->pb_nnis); md.threshold = LNET_MD_THRESH_INF; md.max_size = 0; md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | LNET_MD_MANAGE_REMOTE; - md.user_ptr = NULL; md.eq_handle = the_lnet.ln_ping_target_eq; - md.user_ptr = *ppinfo; + md.user_ptr = *ppbuf; - rc = LNetMDAttach(me_handle, md, LNET_RETAIN, md_handle); + rc = LNetMDAttach(me_handle, md, LNET_RETAIN, ping_mdh); if (rc != 0) { - CERROR("Can't attach ping MD: %d\n", rc); - goto failed_2; + CERROR("Can't attach ping target MD: %d\n", rc); + goto fail_unlink_ping_me; } + lnet_ping_buffer_addref(*ppbuf); return 0; -failed_2: +fail_unlink_ping_me: rc2 = LNetMEUnlink(me_handle); LASSERT(rc2 == 0); -failed_1: - lnet_ping_info_free(*ppinfo); - *ppinfo = NULL; -failed_0: - if (set_eq) - LNetEQFree(the_lnet.ln_ping_target_eq); +fail_decref_ping_buffer: + LASSERT(lnet_ping_buffer_numref(*ppbuf) == 1); + lnet_ping_buffer_decref(*ppbuf); + *ppbuf = NULL; +fail_free_eq: + if (set_eq) { + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc2 == 0); + } return rc; } static void -lnet_ping_md_unlink(struct lnet_ping_info *pinfo, struct lnet_handle_md *md_handle) +lnet_ping_md_unlink(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md *ping_mdh) { sigset_t blocked = cfs_block_allsigs(); - LNetMDUnlink(*md_handle); - LNetInvalidateMDHandle(md_handle); + LNetMDUnlink(*ping_mdh); + LNetInvalidateMDHandle(ping_mdh); - /* NB md could be busy; this just starts the unlink */ - while (pinfo->pi_features != LNET_PING_FEAT_INVAL) { - CDEBUG(D_NET, "Still waiting for ping MD to unlink\n"); + /* NB the MD could be busy; this just starts the unlink */ + while (lnet_ping_buffer_numref(pbuf) > 1) { + CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); } @@ -1122,77 +1707,241 @@ lnet_ping_md_unlink(struct lnet_ping_info *pinfo, struct lnet_handle_md *md_hand } static 
void -lnet_ping_info_install_locked(struct lnet_ping_info *ping_info) +lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) { - int i; struct lnet_ni *ni; struct lnet_net *net; struct lnet_ni_status *ns; + int i; + int rc; i = 0; list_for_each_entry(net, &the_lnet.ln_nets, net_list) { list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { - LASSERT(i < ping_info->pi_nnis); + LASSERT(i < pbuf->pb_nnis); - ns = &ping_info->pi_ni[i]; + ns = &pbuf->pb_info.pi_ni[i]; ns->ns_nid = ni->ni_nid; lnet_ni_lock(ni); ns->ns_status = (ni->ni_status != NULL) ? - ni->ni_status->ns_status : + ni->ni_status->ns_status : LNET_NI_STATUS_UP; ni->ni_status = ns; lnet_ni_unlock(ni); i++; } - } + /* + * We (ab)use the ns_status of the loopback interface to + * transmit the sequence number. The first interface listed + * must be the loopback interface. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + LCONSOLE_EMERG("Invalid ping target: %d\n", rc); + LBUG(); + } + LNET_PING_BUFFER_SEQNO(pbuf) = + atomic_inc_return(&the_lnet.ln_ping_target_seqno); } static void -lnet_ping_target_update(struct lnet_ping_info *pinfo, - struct lnet_handle_md md_handle) +lnet_ping_target_update(struct lnet_ping_buffer *pbuf, + struct lnet_handle_md ping_mdh) { - struct lnet_ping_info *old_pinfo = NULL; - struct lnet_handle_md old_md; + struct lnet_ping_buffer *old_pbuf = NULL; + struct lnet_handle_md old_ping_md; /* switch the NIs to point to the new ping info created */ lnet_net_lock(LNET_LOCK_EX); if (!the_lnet.ln_routing) - pinfo->pi_features |= LNET_PING_FEAT_RTE_DISABLED; - lnet_ping_info_install_locked(pinfo); + pbuf->pb_info.pi_features |= LNET_PING_FEAT_RTE_DISABLED; + if (!lnet_peer_discovery_disabled) + pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; + + /* Ensure only known feature bits have been set. */ + LASSERT(pbuf->pb_info.pi_features & LNET_PING_FEAT_BITS); + LASSERT(!(pbuf->pb_info.pi_features & ~LNET_PING_FEAT_BITS)); + + lnet_ping_target_install_locked(pbuf); + + if (the_lnet.ln_ping_target) { + old_pbuf = the_lnet.ln_ping_target; + old_ping_md = the_lnet.ln_ping_target_md; + } + the_lnet.ln_ping_target_md = ping_mdh; + the_lnet.ln_ping_target = pbuf; + + lnet_net_unlock(LNET_LOCK_EX); + + if (old_pbuf) { + /* unlink and free the old ping info */ + lnet_ping_md_unlink(old_pbuf, &old_ping_md); + lnet_ping_buffer_decref(old_pbuf); + } + + lnet_push_update_to_peers(0); +} + +static void +lnet_ping_target_fini(void) +{ + int rc; + + lnet_ping_md_unlink(the_lnet.ln_ping_target, + &the_lnet.ln_ping_target_md); + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT(rc == 0); + + lnet_ping_target_destroy(); +} + +/* Resize the push target. 
*/ +int lnet_push_target_resize(void) +{ + struct lnet_process_id id = { LNET_NID_ANY, LNET_PID_ANY }; + struct lnet_md md = { NULL }; + struct lnet_handle_me meh; + struct lnet_handle_md mdh; + struct lnet_handle_md old_mdh; + struct lnet_ping_buffer *pbuf; + struct lnet_ping_buffer *old_pbuf; + int nnis = the_lnet.ln_push_target_nnis; + int rc; + + if (nnis <= 0) { + rc = -EINVAL; + goto fail_return; + } +again: + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = -ENOMEM; + goto fail_return; + } + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc) { + CERROR("Can't create push target ME: %d\n", rc); + goto fail_decref_pbuf; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_PUT | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = pbuf; + md.eq_handle = the_lnet.ln_push_target_eq; - if (the_lnet.ln_ping_info != NULL) { - old_pinfo = the_lnet.ln_ping_info; - old_md = the_lnet.ln_ping_target_md; + rc = LNetMDAttach(meh, md, LNET_RETAIN, &mdh); + if (rc) { + CERROR("Can't attach push MD: %d\n", rc); + goto fail_unlink_meh; } - the_lnet.ln_ping_target_md = md_handle; - the_lnet.ln_ping_info = pinfo; + lnet_ping_buffer_addref(pbuf); + lnet_net_lock(LNET_LOCK_EX); + old_pbuf = the_lnet.ln_push_target; + old_mdh = the_lnet.ln_push_target_md; + the_lnet.ln_push_target = pbuf; + the_lnet.ln_push_target_md = mdh; lnet_net_unlock(LNET_LOCK_EX); - if (old_pinfo != NULL) { - /* unlink the old ping info */ - lnet_ping_md_unlink(old_pinfo, &old_md); - lnet_ping_info_free(old_pinfo); + if (old_pbuf) { + LNetMDUnlink(old_mdh); + lnet_ping_buffer_decref(old_pbuf); + } + + if (nnis < the_lnet.ln_push_target_nnis) + goto again; + + CDEBUG(D_NET, "nnis %d success\n", nnis); + + return 0; + +fail_unlink_meh: + LNetMEUnlink(meh); +fail_decref_pbuf: + lnet_ping_buffer_decref(pbuf); +fail_return: + CDEBUG(D_NET, "nnis %d error %d\n", nnis, rc); + return rc; +} + +static void lnet_push_target_event_handler(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + lnet_peer_push_event(ev); + if (ev->unlinked) + lnet_ping_buffer_decref(pbuf); +} + +/* Initialize the push target. */ +static int lnet_push_target_init(void) +{ + int rc; + + if (the_lnet.ln_push_target) + return -EALREADY; + + rc = LNetEQAlloc(0, lnet_push_target_event_handler, + &the_lnet.ln_push_target_eq); + if (rc) { + CERROR("Can't allocated push target EQ: %d\n", rc); + return rc; + } + + /* Start at the required minimum, we'll enlarge if required. */ + the_lnet.ln_push_target_nnis = LNET_INTERFACES_MIN; + + rc = lnet_push_target_resize(); + + if (rc) { + LNetEQFree(the_lnet.ln_push_target_eq); + LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); } + + return rc; } -static void -lnet_ping_target_fini(void) +/* Clean up the push target. */ +static void lnet_push_target_fini(void) { - int rc; + if (!the_lnet.ln_push_target) + return; - lnet_ping_md_unlink(the_lnet.ln_ping_info, - &the_lnet.ln_ping_target_md); + /* Unlink and invalidate to prevent new references. */ + LNetMDUnlink(the_lnet.ln_push_target_md); + LNetInvalidateMDHandle(&the_lnet.ln_push_target_md); - rc = LNetEQFree(the_lnet.ln_ping_target_eq); - LASSERT(rc == 0); + /* Wait for the unlink to complete. 
*/ + while (lnet_ping_buffer_numref(the_lnet.ln_push_target) > 1) { + CDEBUG(D_NET, "Still waiting for ping data MD to unlink\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } - lnet_ping_info_destroy(); + lnet_ping_buffer_decref(the_lnet.ln_push_target); + the_lnet.ln_push_target = NULL; + the_lnet.ln_push_target_nnis = 0; + + LNetEQFree(the_lnet.ln_push_target_eq); + LNetInvalidateEQHandle(&the_lnet.ln_push_target_eq); } static int @@ -1215,11 +1964,6 @@ lnet_ni_tq_credits(struct lnet_ni *ni) static void lnet_ni_unlink_locked(struct lnet_ni *ni) { - if (!list_empty(&ni->ni_cptlist)) { - list_del_init(&ni->ni_cptlist); - lnet_ni_decref_locked(ni, 0); - } - /* move it to zombie list and nobody can find it anymore */ LASSERT(!list_empty(&ni->ni_netlist)); list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie); @@ -1258,7 +2002,13 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) } if (!list_empty(&ni->ni_netlist)) { + /* Unlock mutex while waiting to allow other + * threads to read the LNet state and fall through + * to avoid deadlock + */ lnet_net_unlock(LNET_LOCK_EX); + mutex_unlock(&the_lnet.ln_api_mutex); + ++i; if ((i & (-i)) == i) { CDEBUG(D_WARNING, @@ -1267,6 +2017,8 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) } set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); + + mutex_lock(&the_lnet.ln_api_mutex); lnet_net_lock(LNET_LOCK_EX); continue; } @@ -1296,7 +2048,9 @@ lnet_shutdown_lndni(struct lnet_ni *ni) struct lnet_net *net = ni->ni_net; lnet_net_lock(LNET_LOCK_EX); + lnet_ni_lock(ni); ni->ni_state = LNET_NI_STATE_DELETING; + lnet_ni_unlock(ni); lnet_ni_unlink_locked(ni); lnet_incr_dlc_seq(); lnet_net_unlock(LNET_LOCK_EX); @@ -1350,6 +2104,10 @@ static void lnet_shutdown_lndnets(void) { struct lnet_net *net; + struct list_head resend; + struct lnet_msg *msg, *tmp; + + INIT_LIST_HEAD(&resend); /* NB called holding the global mutex */ @@ -1385,6 +2143,16 @@ lnet_shutdown_lndnets(void) lnet_shutdown_lndnet(net); } + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ECANCELED); + } + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_state = LNET_STATE_SHUTDOWN; lnet_net_unlock(LNET_LOCK_EX); @@ -1418,7 +2186,9 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) goto failed0; } + lnet_ni_lock(ni); ni->ni_state = LNET_NI_STATE_ACTIVE; + lnet_ni_unlock(ni); /* We keep a reference on the loopback net through the loopback NI */ if (net->net_lnd->lnd_type == LOLND) { @@ -1453,6 +2223,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) atomic_set(&ni->ni_tx_credits, lnet_ni_tq_credits(ni) * ni->ni_ncpts); + atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", libcfs_nid2str(ni->ni_nid), @@ -1496,8 +2267,6 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) { lnd_type = LNET_NETTYP(net->net_id); - LASSERT(libcfs_isknown_lnd(lnd_type)); - mutex_lock(&the_lnet.ln_lnd_mutex); lnd = lnet_find_lnd_by_type(lnd_type); @@ -1576,7 +2345,7 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) * up is actually unique. if it's not fail. 
*/ if (!lnet_ni_unique_net(&net_l->net_ni_list, ni->ni_interfaces[0])) { - rc = -EINVAL; + rc = -EEXIST; goto failed1; } @@ -1701,8 +2470,6 @@ int lnet_lib_init(void) lnet_assert_wire_constants(); - memset(&the_lnet, 0, sizeof(the_lnet)); - /* refer to global cfs_cpt_table for now */ the_lnet.ln_cpt_table = cfs_cpt_table; the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); @@ -1730,6 +2497,7 @@ int lnet_lib_init(void) INIT_LIST_HEAD(&the_lnet.ln_lnds); INIT_LIST_HEAD(&the_lnet.ln_net_zombie); INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_msg_resend); INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); /* The hash table size is the number of bits it takes to express the set @@ -1786,8 +2554,8 @@ LNetNIInit(lnet_pid_t requested_pid) int im_a_router = 0; int rc; int ni_count; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; struct list_head net_head; struct lnet_net *net; @@ -1822,6 +2590,9 @@ LNetNIInit(lnet_pid_t requested_pid) goto err_empty_list; } + if (use_tcp_bonding) + CWARN("'use_tcp_bonding' option has been deprecated. See LU-13641\n"); + /* If LNet is being initialized via DLC it is possible * that the user requests not to load module parameters (ones which * are supported by DLC) on initialization. Therefore, make sure not @@ -1862,23 +2633,41 @@ LNetNIInit(lnet_pid_t requested_pid) the_lnet.ln_refcount = 1; /* Now I may use my own API functions... */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, ni_count, true); + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, ni_count, true); if (rc != 0) goto err_acceptor_stop; - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); + + rc = LNetEQAlloc(0, lnet_mt_event_handler, &the_lnet.ln_mt_eqh); + if (rc != 0) { + CERROR("Can't allocate monitor thread EQ: %d\n", rc); + goto err_stop_ping; + } - rc = lnet_router_checker_start(); + rc = lnet_monitor_thr_start(); if (rc != 0) goto err_stop_ping; + rc = lnet_push_target_init(); + if (rc != 0) + goto err_stop_monitor_thr; + + rc = lnet_peer_discovery_start(); + if (rc != 0) + goto err_destroy_push_target; + lnet_fault_init(); - lnet_proc_init(); + lnet_router_debugfs_init(); mutex_unlock(&the_lnet.ln_api_mutex); return 0; +err_destroy_push_target: + lnet_push_target_fini(); +err_stop_monitor_thr: + lnet_monitor_thr_stop(); err_stop_ping: lnet_ping_target_fini(); err_acceptor_stop: @@ -1927,8 +2716,10 @@ LNetNIFini() lnet_fault_fini(); - lnet_proc_fini(); - lnet_router_checker_stop(); + lnet_router_debugfs_fini(); + lnet_peer_discovery_stop(); + lnet_push_target_fini(); + lnet_monitor_thr_stop(); lnet_ping_target_fini(); /* Teardown fns that use my own API functions BEFORE here */ @@ -1976,15 +2767,22 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, } cfg_ni->lic_nid = ni->ni_nid; - cfg_ni->lic_status = ni->ni_status->ns_status; + if (ni->ni_nid == LNET_NID_LO_0) + cfg_ni->lic_status = LNET_NI_STATUS_UP; + else + cfg_ni->lic_status = ni->ni_status->ns_status; cfg_ni->lic_tcp_bonding = use_tcp_bonding; cfg_ni->lic_dev_cpt = ni->ni_dev_cpt; memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn)); if (stats) { - stats->iel_send_count = atomic_read(&ni->ni_stats.send_count); - stats->iel_recv_count = atomic_read(&ni->ni_stats.recv_count); + stats->iel_send_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_SEND); + stats->iel_recv_count = lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_RECV); + stats->iel_drop_count = 
lnet_sum_stats(&ni->ni_stats, + LNET_STATS_TYPE_DROP); } /* @@ -2061,7 +2859,10 @@ lnet_fill_ni_info_legacy(struct lnet_ni *ni, config->cfg_config_u.cfg_net.net_peer_rtr_credits = ni->ni_net->net_tunables.lct_peer_rtr_credits; - net_config->ni_status = ni->ni_status->ns_status; + if (ni->ni_nid == LNET_NID_LO_0) + net_config->ni_status = LNET_NI_STATUS_UP; + else + net_config->ni_status = ni->ni_status->ns_status; if (ni->ni_cpts) { int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); @@ -2119,10 +2920,17 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) struct lnet_ni *ni; struct lnet_net *net = mynet; + /* + * It is possible that the net has been cleaned out while there is + * a message being sent. This function accessed the net without + * checking if the list is empty + */ if (prev == NULL) { if (net == NULL) net = list_entry(the_lnet.ln_nets.next, struct lnet_net, net_list); + if (list_empty(&net->net_ni_list)) + return NULL; ni = list_entry(net->net_ni_list.next, struct lnet_ni, ni_netlist); @@ -2144,6 +2952,8 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) /* get the next net */ net = list_entry(prev->ni_net->net_list.next, struct lnet_net, net_list); + if (list_empty(&net->net_ni_list)) + return NULL; /* get the ni on it */ ni = list_entry(net->net_ni_list.next, struct lnet_ni, ni_netlist); @@ -2151,6 +2961,9 @@ lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev) return ni; } + if (list_empty(&prev->ni_netlist)) + return NULL; + /* there are more nis left */ ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist); @@ -2208,12 +3021,35 @@ lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni, return rc; } +int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats) +{ + struct lnet_ni *ni; + int cpt; + int rc = -ENOENT; + + if (!msg_stats) + return -EINVAL; + + cpt = lnet_net_lock_current(); + + ni = lnet_get_ni_idx_locked(msg_stats->im_idx); + + if (ni) { + lnet_usr_translate_stats(msg_stats, &ni->ni_stats); + rc = 0; + } + + lnet_net_unlock(cpt); + + return rc; +} + static int lnet_add_net_common(struct lnet_net *net, struct lnet_ioctl_config_lnd_tunables *tun) { __u32 net_id; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; int rc; struct lnet_remotenet *rnet; int net_ni_count; @@ -2235,7 +3071,7 @@ static int lnet_add_net_common(struct lnet_net *net, /* * make sure you calculate the correct number of slots in the ping - * info. Since the ping info is a flattened list of all the NIs, + * buffer. Since the ping info is a flattened list of all the NIs, * we should allocate enough slots to accomodate the number of NIs * which will be added. 
* @@ -2244,9 +3080,9 @@ static int lnet_add_net_common(struct lnet_net *net, */ net_ni_count = lnet_get_net_ni_count_pre(net); - rc = lnet_ping_info_setup(&pinfo, &md_handle, - net_ni_count + lnet_get_ni_count(), - false); + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + net_ni_count + lnet_get_ni_count(), + false); if (rc < 0) { lnet_net_free(net); return rc; @@ -2297,13 +3133,13 @@ static int lnet_add_net_common(struct lnet_net *net, lnet_peer_net_added(net); lnet_net_unlock(LNET_LOCK_EX); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); return 0; failed: - lnet_ping_md_unlink(pinfo, &md_handle); - lnet_ping_info_free(pinfo); + lnet_ping_md_unlink(pbuf, &ping_mdh); + lnet_ping_buffer_decref(pbuf); return rc; } @@ -2351,7 +3187,7 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) struct lnet_ni *ni; struct lnet_ioctl_config_lnd_tunables *tun = NULL; int rc, i; - __u32 net_id; + __u32 net_id, lnd_type; /* get the tunables if they are available */ if (conf->lic_cfg_hdr.ioc_len >= @@ -2365,6 +3201,12 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) tun); net_id = LNET_NIDNET(conf->lic_nid); + lnd_type = LNET_NETTYP(net_id); + + if (!libcfs_isknown_lnd(lnd_type)) { + CERROR("No valid net and lnd information provided\n"); + return -EINVAL; + } net = lnet_net_alloc(net_id, NULL); if (!net) @@ -2394,8 +3236,8 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) struct lnet_net *net; struct lnet_ni *ni; __u32 net_id = LNET_NIDNET(conf->lic_nid); - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; int rc; int net_count; __u32 addr; @@ -2413,7 +3255,7 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) CERROR("net %s not found\n", libcfs_net2str(net_id)); rc = -ENOENT; - goto net_unlock; + goto unlock_net; } addr = LNET_NIDADDR(conf->lic_nid); @@ -2424,28 +3266,28 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, lnet_get_ni_count() - net_count, false); if (rc != 0) - goto out; + goto unlock_api_mutex; lnet_shutdown_lndnet(net); if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); - goto out; + goto unlock_api_mutex; } ni = lnet_nid2ni_locked(conf->lic_nid, 0); if (!ni) { - CERROR("nid %s not found \n", + CERROR("nid %s not found\n", libcfs_nid2str(conf->lic_nid)); rc = -ENOENT; - goto net_unlock; + goto unlock_net; } net_count = lnet_get_net_ni_count_locked(net); @@ -2453,27 +3295,27 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, lnet_get_ni_count() - 1, false); if (rc != 0) - goto out; + goto unlock_api_mutex; lnet_shutdown_lndni(ni); if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); /* check if the net is empty and remove it if it is */ if (net_count == 1) lnet_shutdown_lndnet(net); - goto out; + goto unlock_api_mutex; -net_unlock: +unlock_net: lnet_net_unlock(0); -out: +unlock_api_mutex: mutex_unlock(&the_lnet.ln_api_mutex); return rc; @@ -2541,8 +3383,8 @@ int lnet_dyn_del_net(__u32 
net_id) { struct lnet_net *net; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; + struct lnet_ping_buffer *pbuf; + struct lnet_handle_md ping_mdh; int rc; int net_ni_count; @@ -2556,6 +3398,7 @@ lnet_dyn_del_net(__u32 net_id) net = lnet_get_net_locked(net_id); if (net == NULL) { + lnet_net_unlock(0); rc = -EINVAL; goto out; } @@ -2565,8 +3408,8 @@ lnet_dyn_del_net(__u32 net_id) lnet_net_unlock(0); /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, - lnet_get_ni_count() - net_ni_count, false); + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + lnet_get_ni_count() - net_ni_count, false); if (rc != 0) goto out; @@ -2575,7 +3418,7 @@ lnet_dyn_del_net(__u32 net_id) if (lnet_count_acceptor_nets() == 0) lnet_acceptor_stop(); - lnet_ping_target_update(pinfo, md_handle); + lnet_ping_target_update(pbuf, ping_mdh); out: mutex_unlock(&the_lnet.ln_api_mutex); @@ -2593,6 +3436,102 @@ __u32 lnet_get_dlc_seq_locked(void) return atomic_read(&lnet_dlc_seq_no); } +static void +lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_net *net; + struct lnet_ni *ni; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid || all) { + atomic_set(&ni->ni_healthv, value); + if (list_empty(&ni->ni_recovery) && + value < LNET_MAX_HEALTH_VALUE) { + CERROR("manually adding local NI %s to recovery\n", + libcfs_nid2str(ni->ni_nid)); + list_add_tail(&ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(ni, 0); + } + if (!all) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + +static int +lnet_get_local_ni_hstats(struct lnet_ioctl_local_ni_hstats *stats) +{ + int cpt, rc = 0; + struct lnet_ni *ni; + lnet_nid_t nid = stats->hlni_nid; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + + if (!ni) { + rc = -ENOENT; + goto unlock; + } + + stats->hlni_local_interrupt = atomic_read(&ni->ni_hstats.hlt_local_interrupt); + stats->hlni_local_dropped = atomic_read(&ni->ni_hstats.hlt_local_dropped); + stats->hlni_local_aborted = atomic_read(&ni->ni_hstats.hlt_local_aborted); + stats->hlni_local_no_route = atomic_read(&ni->ni_hstats.hlt_local_no_route); + stats->hlni_local_timeout = atomic_read(&ni->ni_hstats.hlt_local_timeout); + stats->hlni_local_error = atomic_read(&ni->ni_hstats.hlt_local_error); + stats->hlni_health_value = atomic_read(&ni->ni_healthv); + +unlock: + lnet_net_unlock(cpt); + + return rc; +} + +static int +lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_ni *ni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) { + list->rlst_nid_array[i] = ni->ni_nid; + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + +static int +lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list) +{ + struct lnet_peer_ni *lpni; + int i = 0; + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) { + list->rlst_nid_array[i] = lpni->lpni_nid; + i++; + if (i >= LNET_MAX_SHOW_NUM_NID) + break; + } + lnet_net_unlock(LNET_LOCK_EX); + list->rlst_num_nids = i; + + return 0; +} + /** * LNet ioctl handler. 
* @@ -2674,9 +3613,10 @@ LNetCtl(unsigned int cmd, void *arg) __u32 tun_size; cfg_ni = arg; + /* get the tunables if they are available */ if (cfg_ni->lic_cfg_hdr.ioc_len < - sizeof(*cfg_ni) + sizeof(*stats)+ sizeof(*tun)) + sizeof(*cfg_ni) + sizeof(*stats) + sizeof(*tun)) return -EINVAL; stats = (struct lnet_ioctl_element_stats *) @@ -2693,6 +3633,19 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } + case IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS: { + struct lnet_ioctl_element_msg_stats *msg_stats = arg; + + if (msg_stats->im_hdr.ioc_len != sizeof(*msg_stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_ni_stats(msg_stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + case IOC_LIBCFS_GET_NET: { size_t total = sizeof(*config) + sizeof(struct lnet_ioctl_net_config); @@ -2753,22 +3706,22 @@ LNetCtl(unsigned int cmd, void *arg) return rc; case IOC_LIBCFS_SET_NUMA_RANGE: { - struct lnet_ioctl_numa_range *numa; + struct lnet_ioctl_set_value *numa; numa = arg; - if (numa->nr_hdr.ioc_len != sizeof(*numa)) + if (numa->sv_hdr.ioc_len != sizeof(*numa)) return -EINVAL; - mutex_lock(&the_lnet.ln_api_mutex); - lnet_numa_range = numa->nr_range; - mutex_unlock(&the_lnet.ln_api_mutex); + lnet_net_lock(LNET_LOCK_EX); + lnet_numa_range = numa->sv_value; + lnet_net_unlock(LNET_LOCK_EX); return 0; } case IOC_LIBCFS_GET_NUMA_RANGE: { - struct lnet_ioctl_numa_range *numa; + struct lnet_ioctl_set_value *numa; numa = arg; - if (numa->nr_hdr.ioc_len != sizeof(*numa)) + if (numa->sv_hdr.ioc_len != sizeof(*numa)) return -EINVAL; - numa->nr_range = lnet_numa_range; + numa->sv_value = lnet_numa_range; return 0; } @@ -2789,6 +3742,33 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } + case IOC_LIBCFS_GET_LOCAL_HSTATS: { + struct lnet_ioctl_local_ni_hstats *stats = arg; + + if (stats->hlni_hdr.ioc_len < sizeof(*stats)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_local_ni_hstats(stats); + mutex_unlock(&the_lnet.ln_api_mutex); + + return rc; + } + + case IOC_LIBCFS_GET_RECOVERY_QUEUE: { + struct lnet_ioctl_recovery_list *list = arg; + if (list->rlst_hdr.ioc_len < sizeof(*list)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) + rc = lnet_get_local_ni_recovery_list(list); + else + rc = lnet_get_peer_ni_recovery_list(list); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + case IOC_LIBCFS_ADD_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; @@ -2796,9 +3776,9 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_add_peer_ni_to_peer(cfg->prcfg_prim_nid, - cfg->prcfg_cfg_nid, - cfg->prcfg_mr); + rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid, + cfg->prcfg_mr); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } @@ -2810,8 +3790,8 @@ LNetCtl(unsigned int cmd, void *arg) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_del_peer_ni_from_peer(cfg->prcfg_prim_nid, - cfg->prcfg_cfg_nid); + rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, + cfg->prcfg_cfg_nid); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } @@ -2840,30 +3820,65 @@ LNetCtl(unsigned int cmd, void *arg) case IOC_LIBCFS_GET_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; - struct lnet_peer_ni_credit_info __user *lpni_cri; - struct lnet_ioctl_element_stats __user *lpni_stats; - size_t usr_size = sizeof(*lpni_cri) + sizeof(*lpni_stats); - if ((cfg->prcfg_hdr.ioc_len != sizeof(*cfg)) || - (cfg->prcfg_size != usr_size)) + if (cfg->prcfg_hdr.ioc_len < 
sizeof(*cfg)) return -EINVAL; - lpni_cri = cfg->prcfg_bulk; - lpni_stats = cfg->prcfg_bulk + sizeof(*lpni_cri); + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_get_peer_info(cfg, + (void __user *)cfg->prcfg_bulk); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + + case IOC_LIBCFS_GET_PEER_LIST: { + struct lnet_ioctl_peer_cfg *cfg = arg; + + if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_get_peer_info(cfg->prcfg_count, &cfg->prcfg_prim_nid, - &cfg->prcfg_cfg_nid, &cfg->prcfg_mr, - lpni_cri, lpni_stats); + rc = lnet_get_peer_list(&cfg->prcfg_count, &cfg->prcfg_size, + (struct lnet_process_id __user *)cfg->prcfg_bulk); mutex_unlock(&the_lnet.ln_api_mutex); return rc; } - case IOC_LIBCFS_NOTIFY_ROUTER: + case IOC_LIBCFS_SET_HEALHV: { + struct lnet_ioctl_reset_health_cfg *cfg = arg; + int value; + if (cfg->rh_hdr.ioc_len < sizeof(*cfg)) + return -EINVAL; + if (cfg->rh_value < 0 || + cfg->rh_value > LNET_MAX_HEALTH_VALUE) + value = LNET_MAX_HEALTH_VALUE; + else + value = cfg->rh_value; + CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n", + value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ? + "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all); + mutex_lock(&the_lnet.ln_api_mutex); + if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) + lnet_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + else + lnet_peer_ni_set_healthv(cfg->rh_nid, value, + cfg->rh_all); + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + + case IOC_LIBCFS_NOTIFY_ROUTER: { + time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0]; + + /* The deadline passed in by the user should be some time in + * seconds in the future since the UNIX epoch. We have to map + * that deadline to the wall clock. 
+ */ + deadline += ktime_get_seconds(); return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, - cfs_time_current() - - cfs_time_seconds(cfs_time_current_sec() - - (time_t)data->ioc_u64[0])); + deadline); + } case IOC_LIBCFS_LNET_DIST: rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); @@ -2888,24 +3903,77 @@ LNetCtl(unsigned int cmd, void *arg) id.nid = data->ioc_nid; id.pid = data->ioc_u32[0]; - /* Don't block longer than 2 minutes */ - if (data->ioc_u32[1] > 120 * MSEC_PER_SEC) - return -EINVAL; - - /* If timestamp is negative then disable timeout */ - if ((s32)data->ioc_u32[1] < 0) - timeout = MAX_SCHEDULE_TIMEOUT; + /* If timeout is negative then set default of 3 minutes */ + if (((s32)data->ioc_u32[1] <= 0) || + data->ioc_u32[1] > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); else timeout = msecs_to_jiffies(data->ioc_u32[1]); rc = lnet_ping(id, timeout, data->ioc_pbuf1, data->ioc_plen1 / sizeof(struct lnet_process_id)); + if (rc < 0) return rc; + data->ioc_count = rc; return 0; } + case IOC_LIBCFS_PING_PEER: { + struct lnet_ioctl_ping_data *ping = arg; + struct lnet_peer *lp; + signed long timeout; + + /* If timeout is negative then set default of 3 minutes */ + if (((s32)ping->op_param) <= 0 || + ping->op_param > (DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC)) + timeout = msecs_to_jiffies(DEFAULT_PEER_TIMEOUT * MSEC_PER_SEC); + else + timeout = msecs_to_jiffies(ping->op_param); + + rc = lnet_ping(ping->ping_id, timeout, + ping->ping_buf, + ping->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer(ping->ping_id.nid); + if (lp) { + ping->ping_id.nid = lp->lp_primary_nid; + ping->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + ping->ping_count = rc; + return 0; + } + + case IOC_LIBCFS_DISCOVER: { + struct lnet_ioctl_ping_data *discover = arg; + struct lnet_peer *lp; + + rc = lnet_discover(discover->ping_id, discover->op_param, + discover->ping_buf, + discover->ping_count); + if (rc < 0) + return rc; + + mutex_lock(&the_lnet.ln_api_mutex); + lp = lnet_find_peer(discover->ping_id.nid); + if (lp) { + discover->ping_id.nid = lp->lp_primary_nid; + discover->mr_info = lnet_peer_is_multi_rail(lp); + lnet_peer_decref_locked(lp); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + discover->ping_count = rc; + return 0; + } + default: ni = lnet_net2ni_addref(data->ioc_net); if (ni == NULL) @@ -3005,43 +4073,47 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, struct lnet_handle_md mdh; struct lnet_event event; struct lnet_md md = { NULL }; - int which; - int unlinked = 0; - int replied = 0; + int which; + int unlinked = 0; + int replied = 0; const signed long a_long_time = msecs_to_jiffies(60 * MSEC_PER_SEC); - int infosz; - struct lnet_ping_info *info; + struct lnet_ping_buffer *pbuf; struct lnet_process_id tmpid; - int i; - int nob; - int rc; - int rc2; - sigset_t blocked; - - infosz = offsetof(struct lnet_ping_info, pi_ni[n_ids]); + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; /* n_ids limit is arbitrary */ - if (n_ids <= 0 || n_ids > 20 || id.nid == LNET_NID_ANY) + if (n_ids <= 0 || id.nid == LNET_NID_ANY) return -EINVAL; + /* + * if the user buffer has more space than the lnet_interfaces_max + * then only fill it up to lnet_interfaces_max + */ + if (n_ids > lnet_interfaces_max) + n_ids = lnet_interfaces_max; + if (id.pid == LNET_PID_ANY) id.pid = LNET_PID_LUSTRE; - 
LIBCFS_ALLOC(info, infosz); - if (info == NULL) + pbuf = lnet_ping_buffer_alloc(n_ids, GFP_NOFS); + if (!pbuf) return -ENOMEM; /* NB 2 events max (including any unlink event) */ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); if (rc != 0) { CERROR("Can't allocate EQ: %d\n", rc); - goto out_0; + goto fail_ping_buffer_decref; } /* initialize md content */ - md.start = info; - md.length = infosz; - md.threshold = 2; /*GET/REPLY*/ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(n_ids); + md.threshold = 2; /* GET/REPLY */ md.max_size = 0; md.options = LNET_MD_TRUNCATE; md.user_ptr = NULL; @@ -3050,16 +4122,15 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, rc = LNetMDBind(md, LNET_UNLINK, &mdh); if (rc != 0) { CERROR("Can't bind MD: %d\n", rc); - goto out_1; + goto fail_free_eq; } rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); + LNET_PROTO_PING_MATCHBITS, 0, false); if (rc != 0) { /* Don't CERROR; this could be deliberate! */ - rc2 = LNetMDUnlink(mdh); LASSERT(rc2 == 0); @@ -3107,7 +4178,6 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, replied = 1; rc = event.mlength; } - } while (rc2 <= 0 || !event.unlinked); if (!replied) { @@ -3115,68 +4185,170 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout, CWARN("%s: Unexpected rc >= 0 but no reply!\n", libcfs_id2str(id)); rc = -EIO; - goto out_1; + goto fail_free_eq; } nob = rc; - LASSERT(nob >= 0 && nob <= infosz); + LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids)); - rc = -EPROTO; /* if I can't parse... */ + rc = -EPROTO; /* if I can't parse... */ if (nob < 8) { - /* can't check magic/version */ CERROR("%s: ping info too short %d\n", libcfs_id2str(id), nob); - goto out_1; + goto fail_free_eq; } - if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { - lnet_swap_pinginfo(info); - } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(pbuf); + } else if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { CERROR("%s: Unexpected magic %08x\n", - libcfs_id2str(id), info->pi_magic); - goto out_1; + libcfs_id2str(id), pbuf->pb_info.pi_magic); + goto fail_free_eq; } - if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { CERROR("%s: ping w/o NI status: 0x%x\n", - libcfs_id2str(id), info->pi_features); - goto out_1; + libcfs_id2str(id), pbuf->pb_info.pi_features); + goto fail_free_eq; } - if (nob < offsetof(struct lnet_ping_info, pi_ni[0])) { - CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), - nob, (int)offsetof(struct lnet_ping_info, pi_ni[0])); - goto out_1; + if (nob < LNET_PING_INFO_SIZE(0)) { + CERROR("%s: Short reply %d(%d min)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(0)); + goto fail_free_eq; } - if (info->pi_nnis < n_ids) - n_ids = info->pi_nnis; + if (pbuf->pb_info.pi_nnis < n_ids) + n_ids = pbuf->pb_info.pi_nnis; - if (nob < offsetof(struct lnet_ping_info, pi_ni[n_ids])) { - CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), - nob, (int)offsetof(struct lnet_ping_info, pi_ni[n_ids])); - goto out_1; + if (nob < LNET_PING_INFO_SIZE(n_ids)) { + CERROR("%s: Short reply %d(%d expected)\n", + libcfs_id2str(id), + nob, (int)LNET_PING_INFO_SIZE(n_ids)); + goto fail_free_eq; } - rc = -EFAULT; /* If I SEGV... */ + rc = -EFAULT; /* if I segv in copy_to_user()... 
*/ memset(&tmpid, 0, sizeof(tmpid)); for (i = 0; i < n_ids; i++) { - tmpid.pid = info->pi_pid; - tmpid.nid = info->pi_ni[i].ns_nid; + tmpid.pid = pbuf->pb_info.pi_pid; + tmpid.nid = pbuf->pb_info.pi_ni[i].ns_nid; if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) - goto out_1; + goto fail_free_eq; } - rc = info->pi_nnis; + rc = pbuf->pb_info.pi_nnis; - out_1: + fail_free_eq: rc2 = LNetEQFree(eqh); if (rc2 != 0) CERROR("rc2 %d\n", rc2); LASSERT(rc2 == 0); - out_0: - LIBCFS_FREE(info, infosz); + fail_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); + return rc; +} + +static int +lnet_discover(struct lnet_process_id id, __u32 force, + struct lnet_process_id __user *ids, int n_ids) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *p; + struct lnet_peer *lp; + struct lnet_process_id *buf; + int cpt; + int i; + int rc; + int max_intf = lnet_interfaces_max; + size_t buf_size; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY) + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LNET_PID_LUSTRE; + + /* + * if the user buffer has more space than the max_intf + * then only fill it up to max_intf + */ + if (n_ids > max_intf) + n_ids = max_intf; + + buf_size = n_ids * sizeof(*buf); + + LIBCFS_ALLOC(buf, buf_size); + if (!buf) + return -ENOMEM; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(id.nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out; + } + + /* + * Clearing the NIDS_UPTODATE flag ensures the peer will + * be discovered, provided discovery has not been disabled. + */ + lp = lpni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + /* If the force flag is set, force a PING and PUSH as well. */ + if (force) + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + + /* Peer may have changed. */ + lp = lpni->lpni_peer_net->lpn_peer; + if (lp->lp_nnis < n_ids) + n_ids = lp->lp_nnis; + + i = 0; + p = NULL; + while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { + buf[i].pid = id.pid; + buf[i].nid = p->lpni_nid; + if (++i >= n_ids) + break; + } + + lnet_net_unlock(cpt); + + rc = -EFAULT; + if (copy_to_user(ids, buf, n_ids * sizeof(*buf))) + goto out_relock; + rc = n_ids; +out_relock: + lnet_net_lock(cpt); +out_decref: + lnet_peer_ni_decref_locked(lpni); +out: + lnet_net_unlock(cpt); + + LIBCFS_FREE(buf, buf_size); + return rc; } + +/** + * Retrieve peer discovery status. + * + * \retval 1 if lnet_peer_discovery_disabled is 0 + * \retval 0 if lnet_peer_discovery_disabled is 1 + */ +int +LNetGetPeerDiscoveryStatus(void) +{ + return !lnet_peer_discovery_disabled; +} +EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus); diff --git a/drivers/staging/lustrefsx/lnet/lnet/config.c b/drivers/staging/lustrefsx/lnet/lnet/config.c index 2f90e90849ac3..741711af0813f 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/config.c +++ b/drivers/staging/lustrefsx/lnet/lnet/config.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,6 +32,8 @@ #define DEBUG_SUBSYSTEM S_LNET +#include +#include #include #include #include @@ -123,10 +125,10 @@ lnet_ni_unique_net(struct list_head *nilist, char *iface) /* check that the NI is unique to the interfaces with in the same NI. * This is only a consideration if use_tcp_bonding is set */ static bool -lnet_ni_unique_ni(char *iface_list[LNET_NUM_INTERFACES], char *iface) +lnet_ni_unique_ni(char *iface_list[LNET_INTERFACES_NUM], char *iface) { int i; - for (i = 0; i < LNET_NUM_INTERFACES; i++) { + for (i = 0; i < LNET_INTERFACES_NUM; i++) { if (iface_list[i] != NULL && strncmp(iface_list[i], iface, strlen(iface)) == 0) return false; @@ -309,7 +311,7 @@ lnet_ni_free(struct lnet_ni *ni) if (ni->ni_cpts != NULL) cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); - for (i = 0; i < LNET_NUM_INTERFACES && + for (i = 0; i < LNET_INTERFACES_NUM && ni->ni_interfaces[i] != NULL; i++) { LIBCFS_FREE(ni->ni_interfaces[i], strlen(ni->ni_interfaces[i]) + 1); @@ -409,11 +411,11 @@ lnet_ni_add_interface(struct lnet_ni *ni, char *iface) * can free the tokens at the end of the function. * The newly allocated ni_interfaces[] can be * freed when freeing the NI */ - while (niface < LNET_NUM_INTERFACES && + while (niface < LNET_INTERFACES_NUM && ni->ni_interfaces[niface] != NULL) niface++; - if (niface >= LNET_NUM_INTERFACES) { + if (niface >= LNET_INTERFACES_NUM) { LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " "for net %s\n", libcfs_net2str(LNET_NIDNET(ni->ni_nid))); @@ -456,8 +458,9 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface) } spin_lock_init(&ni->ni_lock); - INIT_LIST_HEAD(&ni->ni_cptlist); INIT_LIST_HEAD(&ni->ni_netlist); + INIT_LIST_HEAD(&ni->ni_recovery); + LNetInvalidateMDHandle(&ni->ni_ping_mdh); ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*ni->ni_refs[0])); if (ni->ni_refs == NULL) @@ -476,12 +479,12 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface) ni->ni_nid = LNET_MKNID(net->net_id, 0); /* Store net namespace in which current ni is being created */ - if (current->nsproxy->net_ns != NULL) + if (current->nsproxy && current->nsproxy->net_ns) ni->ni_net_ns = get_net(current->nsproxy->net_ns); else - ni->ni_net_ns = NULL; + ni->ni_net_ns = get_net(&init_net); - ni->ni_last_alive = cfs_time_current_sec(); + ni->ni_last_alive = ktime_get_real_seconds(); ni->ni_state = LNET_NI_STATE_INIT; list_add_tail(&ni->ni_netlist, &net->net_ni_added); @@ -1121,26 +1124,26 @@ lnet_parse_priority(char *str, unsigned int *priority, char **token) } static int -lnet_parse_route (char *str, int *im_a_router) +lnet_parse_route(char *str, int *im_a_router) { /* static scratch buffer OK (single threaded) */ - static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; - struct list_head nets; - struct list_head gateways; + struct list_head nets; + struct list_head gateways; struct list_head *tmp1; struct list_head *tmp2; - __u32 net; - lnet_nid_t nid; - struct lnet_text_buf *ltb; - int rc; - char *sep; - char *token = str; - int ntokens = 0; - int myrc = -1; - __u32 hops; - int got_hops = 0; - unsigned int priority = 0; + __u32 net; + lnet_nid_t nid; + struct lnet_text_buf *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + __u32 hops; + int got_hops = 0; + unsigned int priority = 0; INIT_LIST_HEAD(&gateways); INIT_LIST_HEAD(&nets); @@ -1214,8 +1217,7 @@ lnet_parse_route (char *str, int *im_a_router) goto token_error; nid = libcfs_str2nid(ltb->ltb_text); 
- if (nid == LNET_NID_ANY || - LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + if (nid == LNET_NID_ANY || nid == LNET_NID_LO_0) goto token_error; } } @@ -1603,11 +1605,12 @@ lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) } /* * kernel 5.3: commit ef11db3310e272d3d8dbe8739e0770820dd20e52 + * kernel 4.18.0-193.el8: * added in_dev_for_each_ifa_rtnl and in_dev_for_each_ifa_rcu * and removed for_ifa and endfor_ifa. * Use the _rntl variant as the current locking is rtnl. */ -#ifdef in_dev_for_each_ifa_rtnl +#ifdef HAVE_IN_DEV_FOR_EACH_IFA_RTNL #define DECLARE_CONST_IN_IFADDR(ifa) const struct in_ifaddr *ifa #define endfor_ifa(in_dev) #else @@ -1653,7 +1656,7 @@ int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns) if (nip >= nalloc) { struct lnet_inetdev *tmp; - nalloc += LNET_NUM_INTERFACES; + nalloc += LNET_INTERFACES_NUM; tmp = krealloc(ifaces, nalloc * sizeof(*tmp), GFP_KERNEL); if (!tmp) { @@ -1697,7 +1700,10 @@ lnet_parse_ip2nets (char **networksp, char *ip2nets) int rc; int i; - nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + if (current->nsproxy && current->nsproxy->net_ns) + nip = lnet_inet_enumerate(&ifaces, current->nsproxy->net_ns); + else + nip = lnet_inet_enumerate(&ifaces, &init_net); if (nip < 0) { if (nip != -ENOENT) { LCONSOLE_ERROR_MSG(0x117, diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c index 3bca6b77539a6..354c9768a3a1d 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-eq.c @@ -159,8 +159,6 @@ LNetEQFree(struct lnet_handle_eq eqh) int size = 0; int i; - LASSERT(the_lnet.ln_refcount > 0); - lnet_res_lock(LNET_LOCK_EX); /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do * both EQ lookup and poll event with only lnet_eq_wait_lock */ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c index a3d0487063cbd..9bf890c9477b6 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-md.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-md.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c index b60106f949b69..f52621c56b3de 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-move.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-move.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,6 +36,8 @@ #define DEBUG_SUBSYSTEM S_LNET +#include + #include #include #include @@ -44,6 +46,119 @@ static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); +struct lnet_send_data { + struct lnet_ni *sd_best_ni; + struct lnet_peer_ni *sd_best_lpni; + struct lnet_peer_ni *sd_final_dst_lpni; + struct lnet_peer *sd_peer; + struct lnet_peer *sd_gw_peer; + struct lnet_peer_ni *sd_gw_lpni; + struct lnet_peer_net *sd_peer_net; + struct lnet_msg *sd_msg; + lnet_nid_t sd_dst_nid; + lnet_nid_t sd_src_nid; + lnet_nid_t sd_rtr_nid; + int sd_cpt; + int sd_md_cpt; + __u32 sd_send_case; +}; + +static inline struct lnet_comm_count * +get_stats_counts(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + switch (stats_type) { + case LNET_STATS_TYPE_SEND: + return &stats->el_send_stats; + case LNET_STATS_TYPE_RECV: + return &stats->el_recv_stats; + case LNET_STATS_TYPE_DROP: + return &stats->el_drop_stats; + default: + CERROR("Unknown stats type\n"); + } + + return NULL; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, + enum lnet_msg_type msg_type, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return; + + switch (msg_type) { + case LNET_MSG_ACK: + atomic_inc(&counts->co_ack_count); + break; + case LNET_MSG_PUT: + atomic_inc(&counts->co_put_count); + break; + case LNET_MSG_GET: + atomic_inc(&counts->co_get_count); + break; + case LNET_MSG_REPLY: + atomic_inc(&counts->co_reply_count); + break; + case LNET_MSG_HELLO: + atomic_inc(&counts->co_hello_count); + break; + default: + CERROR("There is a BUG in the code. Unknown message type\n"); + break; + } +} + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return 0; + + return (atomic_read(&counts->co_ack_count) + + atomic_read(&counts->co_put_count) + + atomic_read(&counts->co_get_count) + + atomic_read(&counts->co_reply_count) + + atomic_read(&counts->co_hello_count)); +} + +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, + struct lnet_comm_count *counts) +{ + msg_stats->ico_get_count = atomic_read(&counts->co_get_count); + msg_stats->ico_put_count = atomic_read(&counts->co_put_count); + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); +} + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats) +{ + struct lnet_comm_count *counts; + + LASSERT(msg_stats); + LASSERT(stats); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); + if (!counts) + return; + assign_stats(&msg_stats->im_send_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); + if (!counts) + return; + assign_stats(&msg_stats->im_recv_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); + if (!counts) + return; + assign_stats(&msg_stats->im_drop_stats, counts); +} + int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) { @@ -630,25 +745,29 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); msg->msg_hdr.type = cpu_to_le32(type); + /* dest_nid will be 
overwritten by lnet_select_pathway() */ + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); /* src_nid will be set later */ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); msg->msg_hdr.payload_length = cpu_to_le32(len); } -static void +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; - int rc; + void *priv = msg->msg_private; + int rc; - LASSERT (!in_interrupt ()); - LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT(!in_interrupt()); + LASSERT(ni->ni_nid == LNET_NID_LO_0 || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); - if (rc < 0) + if (rc < 0) { + msg->msg_no_resend = true; lnet_finalize(msg, rc); + } } static int @@ -686,7 +805,7 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) static void lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) { - cfs_time_t last_alive = 0; + time64_t last_alive = 0; int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); LASSERT(lnet_peer_aliveness_enabled(lp)); @@ -696,7 +815,7 @@ lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive); lnet_net_lock(cpt); - lp->lpni_last_query = cfs_time_current(); + lp->lpni_last_query = ktime_get_seconds(); if (last_alive != 0) /* NI has updated timestamp */ lp->lpni_last_alive = last_alive; @@ -704,10 +823,10 @@ lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) /* NB: always called with lnet_net_lock held */ static inline int -lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) +lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now) { - int alive; - cfs_time_t deadline; + int alive; + time64_t deadline; LASSERT (lnet_peer_aliveness_enabled(lp)); @@ -717,16 +836,14 @@ lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) */ spin_lock(&lp->lpni_lock); if (!lp->lpni_alive && lp->lpni_alive_count > 0 && - cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) { + lp->lpni_timestamp >= lp->lpni_last_alive) { spin_unlock(&lp->lpni_lock); return 0; } - deadline = - cfs_time_add(lp->lpni_last_alive, - cfs_time_seconds(lp->lpni_net->net_tunables. - lct_peer_timeout)); - alive = cfs_time_after(deadline, now); + deadline = lp->lpni_last_alive + + lp->lpni_net->net_tunables.lct_peer_timeout; + alive = deadline > now; /* * Update obsolete lp_alive except for routers assumed to be dead @@ -748,9 +865,10 @@ lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the lnet_net_lock */ static int -lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) +lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp, + struct lnet_msg *msg) { - cfs_time_t now = cfs_time_current(); + time64_t now = ktime_get_seconds(); if (!lnet_peer_aliveness_enabled(lp)) return -ENODEV; @@ -758,23 +876,29 @@ lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) if (lnet_peer_is_alive(lp, now)) return 1; + /* + * If we're resending a message, let's attempt to send it even if + * the peer is down to fulfill our resend quota on the message + */ + if (msg->msg_retry_count > 0) + return 1; + /* * Peer appears dead, but we should avoid frequent NI queries (at * most once per lnet_queryinterval seconds). 
*/ if (lp->lpni_last_query != 0) { static const int lnet_queryinterval = 1; + time64_t next_query; - cfs_time_t next_query = - cfs_time_add(lp->lpni_last_query, - cfs_time_seconds(lnet_queryinterval)); + next_query = lp->lpni_last_query + lnet_queryinterval; - if (cfs_time_before(now, next_query)) { + if (now < next_query) { if (lp->lpni_alive) CWARN("Unexpected aliveness of peer %s: " - "%d < %d (%d/%d)\n", + "%lld < %lld (%d/%d)\n", libcfs_nid2str(lp->lpni_nid), - (int)now, (int)next_query, + now, next_query, lnet_queryinterval, lp->lpni_net->net_tunables.lct_peer_timeout); return 0; @@ -814,20 +938,28 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); + /* can't get here if we're sending to the loopback interface */ + LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && - lnet_peer_alive_locked(ni, lp) == 0) { - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_peer_alive_locked(ni, lp, msg) == 0) { + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + msg->msg_len; lnet_net_unlock(cpt); if (msg->msg_txpeer) - atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count); + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); if (msg->msg_txni) - atomic_inc(&msg->msg_txni->ni_stats.drop_count); + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; if (do_send) lnet_finalize(msg, -EHOSTUNREACH); @@ -842,8 +974,12 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " "called on the MD/ME.\n", libcfs_id2str(msg->msg_target)); - if (do_send) + if (do_send) { + msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_id2str(msg->msg_target)); lnet_finalize(msg, -ECANCELED); + } lnet_net_lock(cpt); return -ECANCELED; @@ -888,6 +1024,15 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) } } + if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && + lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { + msg->msg_tx_delayed = 1; + return LNET_CREDIT_WAIT; + } + + /* unset the tx_delay flag as we're going to send it now */ + msg->msg_tx_delayed = 0; + if (do_send) { lnet_net_unlock(cpt); lnet_ni_send(ni, msg); @@ -983,6 +1128,9 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) msg->msg_niov = rbp->rbp_npages; msg->msg_kiov = &rb->rb_kiov[0]; + /* unset the msg-rx_delayed flag since we're receiving the message */ + msg->msg_rx_delayed = 0; + if (do_recv) { int cpt = msg->msg_rx_cpt; @@ -1082,15 +1230,6 @@ lnet_return_tx_credits_locked(struct lnet_msg *msg) } if (txpeer != NULL) { - /* - * TODO: - * Once the patch for the health comes in we need to set - * the health of the peer ni to bad when we fail to send - * a message. 
- * int status = msg->msg_ev.status; - * if (status != 0) - * lnet_set_peer_ni_health_locked(txpeer, false) - */ msg->msg_txpeer = NULL; lnet_peer_ni_decref_locked(txpeer); } @@ -1122,6 +1261,8 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; lnet_finalize(msg, -ECANCELED); } @@ -1268,7 +1409,7 @@ lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) } static struct lnet_peer_ni * -lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, +lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, lnet_nid_t rtr_nid) { struct lnet_remotenet *rnet; @@ -1282,7 +1423,7 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, /* If @rtr_nid is not LNET_NID_ANY, return the gateway with * rtr_nid nid, otherwise find the best gateway I can use */ - rnet = lnet_find_rnet_locked(LNET_NIDNET(target)); + rnet = lnet_find_rnet_locked(remote_net); if (rnet == NULL) return NULL; @@ -1327,30 +1468,42 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, } static struct lnet_ni * -lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, + struct lnet_peer *peer, struct lnet_peer_net *peer_net, int md_cpt) { - struct lnet_ni *ni = NULL, *best_ni = cur_ni; + struct lnet_ni *ni = NULL; unsigned int shortest_distance; int best_credits; + int best_healthv; + + /* + * If there is no peer_ni that we can send to on this network, + * then there is no point in looking for a new best_ni here. + */ + if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL)) + return best_ni; if (best_ni == NULL) { shortest_distance = UINT_MAX; best_credits = INT_MIN; + best_healthv = 0; } else { shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); + best_healthv = atomic_read(&best_ni->ni_healthv); } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { unsigned int distance; int ni_credits; - - if (!lnet_is_ni_healthy_locked(ni)) - continue; + int ni_healthv; + int ni_fatal; ni_credits = atomic_read(&ni->ni_tx_credits); + ni_healthv = atomic_read(&ni->ni_healthv); + ni_fatal = atomic_read(&ni->ni_fatal_error_on); /* * calculate the distance from the CPT on which @@ -1361,6 +1514,12 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, md_cpt, ni->ni_dev_cpt); + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", + libcfs_nid2str(ni->ni_nid), ni_credits, distance, + ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) + : "not seleced", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0); + /* * All distances smaller than the NUMA range * are treated equally. @@ -1369,383 +1528,242 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni, distance = lnet_numa_range; /* - * Select on shorter distance, then available + * Select on health, shorter distance, available * credits, then round-robin. 
*/ - if (distance > shortest_distance) { + if (ni_fatal) { + continue; + } else if (ni_healthv < best_healthv) { + continue; + } else if (ni_healthv > best_healthv) { + best_healthv = ni_healthv; + /* + * If we're going to prefer this ni because it's + * the healthiest, then we should set the + * shortest_distance in the algorithm in case + * there are multiple NIs with the same health but + * different distances. + */ + if (distance < shortest_distance) + shortest_distance = distance; + } else if (distance > shortest_distance) { continue; } else if (distance < shortest_distance) { shortest_distance = distance; } else if (ni_credits < best_credits) { continue; } else if (ni_credits == best_credits) { - if (best_ni && (best_ni)->ni_seq <= ni->ni_seq) + if (best_ni && best_ni->ni_seq <= ni->ni_seq) continue; } best_ni = ni; best_credits = ni_credits; } + CDEBUG(D_NET, "selected best_ni %s\n", + (best_ni) ? libcfs_nid2str(best_ni->ni_nid) : "no selection"); + return best_ni; } -static int -lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, - struct lnet_msg *msg, lnet_nid_t rtr_nid) +/* + * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, + * because such traffic is required to perform discovery. We therefore + * exclude all GET and PUT on that portal. We also exclude all ACK and + * REPLY traffic, but that is because the portal is not tracked in the + * message structure for these message types. We could restrict this + * further by also checking for LNET_PROTO_PING_MATCHBITS. + */ +static bool +lnet_msg_discovery(struct lnet_msg *msg) { - struct lnet_ni *best_ni; - struct lnet_peer_ni *best_lpni; - struct lnet_peer_ni *best_gw; - struct lnet_peer_ni *lpni; - struct lnet_peer_ni *final_dst; - struct lnet_peer *peer; - struct lnet_peer_net *peer_net; - struct lnet_net *local_net; - __u32 seq; - int cpt, cpt2, rc; - bool routing; - bool routing2; - bool ni_is_pref; - bool preferred; - bool local_found; - int best_lpni_credits; - int md_cpt; + if (msg->msg_type == LNET_MSG_PUT) { + if (msg->msg_hdr.msg.put.ptl_index != LNET_RESERVED_PORTAL) + return true; + } else if (msg->msg_type == LNET_MSG_GET) { + if (msg->msg_hdr.msg.get.ptl_index != LNET_RESERVED_PORTAL) + return true; + } + return false; +} - /* - * get an initial CPT to use for locking. The idea here is not to - * serialize the calls to select_pathway, so that as many - * operations can run concurrently as possible. To do that we use - * the CPT where this call is being executed. Later on when we - * determine the CPT to use in lnet_message_commit, we switch the - * lock and check if there was any configuration change. If none, - * then we proceed, if there is, then we restart the operation. 
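/*
 * Editor's note (illustrative aside, not part of this patch): the chain
 * above orders candidate NIs by fatal-error flag, then health, then NUMA
 * distance, then available credits, then round-robin sequence.  A
 * simplified standalone comparator for that ordering (struct and field
 * names are hypothetical; the real code also refreshes the shortest
 * distance when a healthier NI wins):
 */
#include <stdbool.h>

struct ni_cand {
	bool fatal;		/* fatal error reported on the interface */
	int  healthv;		/* higher is healthier */
	unsigned int distance;	/* CPT distance, clamped to the NUMA range */
	int  credits;		/* available tx credits */
	int  seq;		/* lower == used less recently */
};

/* return true if 'cand' should replace the current 'best' */
static bool ni_better(const struct ni_cand *cand, const struct ni_cand *best)
{
	if (cand->fatal)
		return false;
	if (cand->healthv != best->healthv)
		return cand->healthv > best->healthv;
	if (cand->distance != best->distance)
		return cand->distance < best->distance;
	if (cand->credits != best->credits)
		return cand->credits > best->credits;
	return cand->seq < best->seq;	/* round-robin tie break */
}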
- */ - cpt = lnet_net_lock_current(); +#define SRC_SPEC 0x0001 +#define SRC_ANY 0x0002 +#define LOCAL_DST 0x0004 +#define REMOTE_DST 0x0008 +#define MR_DST 0x0010 +#define NMR_DST 0x0020 +#define SND_RESP 0x0040 + +/* The following to defines are used for return codes */ +#define REPEAT_SEND 0x1000 +#define PASS_THROUGH 0x2000 + +/* The different cases lnet_select pathway needs to handle */ +#define SRC_SPEC_LOCAL_MR_DST (SRC_SPEC | LOCAL_DST | MR_DST) +#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST) +#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST) +#define SRC_SPEC_ROUTER_NMR_DST (SRC_SPEC | REMOTE_DST | NMR_DST) +#define SRC_ANY_LOCAL_MR_DST (SRC_ANY | LOCAL_DST | MR_DST) +#define SRC_ANY_ROUTER_MR_DST (SRC_ANY | REMOTE_DST | MR_DST) +#define SRC_ANY_LOCAL_NMR_DST (SRC_ANY | LOCAL_DST | NMR_DST) +#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST) - md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); - if (md_cpt == CFS_CPT_ANY) - md_cpt = cpt; +static int +lnet_handle_lo_send(struct lnet_send_data *sd) +{ + struct lnet_msg *msg = sd->sd_msg; + int cpt = sd->sd_cpt; -again: - best_ni = NULL; - best_lpni = NULL; - best_gw = NULL; - final_dst = NULL; - local_net = NULL; - routing = false; - routing2 = false; - local_found = false; - - seq = lnet_get_dlc_seq_locked(); - - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - lnet_net_unlock(cpt); - return -ESHUTDOWN; - } + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(the_lnet.ln_loni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = + cpu_to_le64(the_lnet.ln_loni->ni_nid); + msg->msg_target.nid = the_lnet.ln_loni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = the_lnet.ln_loni; - peer = lnet_find_or_create_peer_locked(dst_nid, cpt); - if (IS_ERR(peer)) { - lnet_net_unlock(cpt); - return PTR_ERR(peer); - } + return LNET_CREDIT_OK; +} - /* If peer is not healthy then can not send anything to it */ - if (!lnet_is_peer_healthy_locked(peer)) { - lnet_net_unlock(cpt); - return -EHOSTUNREACH; - } +static int +lnet_handle_send(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = sd->sd_best_ni; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni; + struct lnet_msg *msg = sd->sd_msg; + int cpt2; + __u32 send_case = sd->sd_send_case; + int rc; + __u32 routing = send_case & REMOTE_DST; + struct lnet_rsp_tracker *rspt; - if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) { - lnet_net_unlock(cpt); - CERROR("peer %s is declared to be non MR capable, " - "yet configured with more than one NID\n", - libcfs_nid2str(dst_nid)); - return -EINVAL; - } + /* + * Increment sequence number of the selected peer so that we + * pick the next one in Round Robin. + */ + best_lpni->lpni_seq++; /* - * STEP 1: first jab at determining best_ni - * if src_nid is explicitly specified, then best_ni is already - * pre-determiend for us. Otherwise we need to select the best - * one to use later on + * grab a reference on the peer_ni so it sticks around even if + * we need to drop and relock the lnet_net_lock below. 
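/*
 * Editor's note (illustrative aside, not part of this patch): the defines
 * above classify every send into one of eight cases by OR-ing a source
 * bit, a destination-locality bit and an MR/NMR bit; the handler is then
 * picked by a single switch.  A standalone model of how the bits compose
 * (macro names here are local to the sketch, mirroring the real ones):
 */
#include <stdio.h>

#define S_SPEC   0x0001
#define S_ANY    0x0002
#define D_LOCAL  0x0004
#define D_REMOTE 0x0008
#define P_MR     0x0010
#define P_NMR    0x0020

static unsigned int classify(int src_given, int dst_is_local, int peer_is_mr)
{
	unsigned int c = 0;

	c |= src_given    ? S_SPEC  : S_ANY;
	c |= dst_is_local ? D_LOCAL : D_REMOTE;
	c |= peer_is_mr   ? P_MR    : P_NMR;
	return c;
}

int main(void)
{
	/* e.g. unspecified source, routed destination, multi-rail peer */
	printf("case bits: 0x%04x\n", classify(0, 0, 1));
	return 0;
}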
*/ - if (src_nid != LNET_NID_ANY) { - best_ni = lnet_nid2ni_locked(src_nid, cpt); - if (!best_ni) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("Can't send to %s: src %s is not a " - "local nid\n", libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EINVAL; - } - } + lnet_peer_ni_addref_locked(best_lpni); - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK || - !peer->lp_multi_rail || - best_ni) { - /* - * for replies we want to respond on the same peer_ni we - * received the message on if possible. If not, then pick - * a peer_ni to send to - * - * if the peer is non-multi-rail then you want to send to - * the dst_nid provided as well. - * - * If the best_ni has already been determined, IE the - * src_nid has been specified, then use the - * destination_nid provided as well, since we're - * continuing a series of related messages for the same - * RPC. - * - * It is expected to find the lpni using dst_nid, since we - * created it earlier. - */ - best_lpni = lnet_find_peer_ni_locked(dst_nid); - if (best_lpni) + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * yet, and it may have vanished. + */ + cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + if (sd->sd_cpt != cpt2) { + __u32 seq = lnet_get_dlc_seq_locked(); + lnet_net_unlock(sd->sd_cpt); + sd->sd_cpt = cpt2; + lnet_net_lock(sd->sd_cpt); + if (seq != lnet_get_dlc_seq_locked()) { lnet_peer_ni_decref_locked(best_lpni); - - if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) { - /* - * this lpni is not on a local network so we need - * to route this reply. - */ - best_gw = lnet_find_route_locked(NULL, - best_lpni->lpni_nid, - rtr_nid); - if (best_gw) { - /* - * RULE: Each node considers only the next-hop - * - * We're going to route the message, so change the peer to - * the router. - */ - LASSERT(best_gw->lpni_peer_net); - LASSERT(best_gw->lpni_peer_net->lpn_peer); - peer = best_gw->lpni_peer_net->lpn_peer; - - /* - * if the router is not multi-rail then use the best_gw - * found to send the message to - */ - if (!peer->lp_multi_rail) - best_lpni = best_gw; - else - best_lpni = NULL; - - routing = true; - } else { - best_lpni = NULL; - } - } else if (!best_lpni) { - lnet_net_unlock(cpt); - CERROR("unable to send msg_type %d to " - "originating %s. Destination NID not in DB\n", - msg->msg_type, libcfs_nid2str(dst_nid)); - return -EINVAL; + return REPEAT_SEND; } } /* - * if the peer is not MR capable, then we should always send to it - * using the first NI in the NET we determined. 
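/*
 * Editor's note (illustrative aside, not part of this patch): the hunk
 * above re-locks on the CPT of the chosen peer NI and restarts the whole
 * selection if the configuration sequence changed while unlocked.  A
 * simplified standalone model of that optimistic re-lock pattern (the
 * mutexes and sequence counter are hypothetical stand-ins):
 */
#include <pthread.h>

#define REPEAT_SELECTION 1

static pthread_mutex_t cpt_lock[2] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
static unsigned long config_seq;	/* bumped on configuration change */

/* caller holds cpt_lock[*cur_cpt]; returns non-zero if it must restart */
static int switch_cpt(int *cur_cpt, int want_cpt)
{
	unsigned long seq;

	if (*cur_cpt == want_cpt)
		return 0;

	seq = config_seq;			/* sampled under the old lock */
	pthread_mutex_unlock(&cpt_lock[*cur_cpt]);
	*cur_cpt = want_cpt;
	pthread_mutex_lock(&cpt_lock[*cur_cpt]);

	/* configuration changed while we were unlocked */
	return seq != config_seq ? REPEAT_SELECTION : 0;
}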
+ * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions */ - if (!peer->lp_multi_rail) { - if (!best_lpni) { - lnet_net_unlock(cpt); - CERROR("no route to %s\n", - libcfs_nid2str(dst_nid)); - return -EHOSTUNREACH; - } - - /* best ni could be set because src_nid was provided */ - if (!best_ni) { - best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt); - if (!best_ni) { - lnet_net_unlock(cpt); - CERROR("no path to %s from net %s\n", - libcfs_nid2str(best_lpni->lpni_nid), - libcfs_net2str(best_lpni->lpni_net->net_id)); - return -EHOSTUNREACH; - } - } - } + msg->msg_txpeer = best_lpni; + msg->msg_txni = best_ni; /* - * if we already found a best_ni because src_nid is specified and - * best_lpni because we are replying to a message then just send - * the message + * grab a reference for the best_ni since now it's in use in this + * send. The reference will be dropped in lnet_finalize() */ - if (best_ni && best_lpni) - goto send; + lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt); /* - * If we already found a best_ni because src_nid is specified then - * pick the peer then send the message + * Always set the target.nid to the best peer picked. Either the + * NID will be one of the peer NIDs selected, or the same NID as + * what was originally set in the target or it will be the NID of + * a router if this message should be routed */ - if (best_ni) - goto pick_peer; + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; /* - * pick the best_ni by going through all the possible networks of - * that peer and see which local NI is best suited to talk to that - * peer. - * - * Locally connected networks will always be preferred over - * a routed network. If there are only routed paths to the peer, - * then the best route is chosen. If all routes are equal then - * they are used in round robin. + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { - if (!lnet_is_peer_net_healthy_locked(peer_net)) - continue; - - local_net = lnet_get_net_locked(peer_net->lpn_net_id); - if (!local_net && !routing && !local_found) { - struct lnet_peer_ni *net_gw; - - lpni = list_entry(peer_net->lpn_peer_nis.next, - struct lnet_peer_ni, - lpni_on_peer_net_list); - - net_gw = lnet_find_route_locked(NULL, - lpni->lpni_nid, - rtr_nid); - if (!net_gw) - continue; - - if (best_gw) { - /* - * lnet_find_route_locked() call - * will return the best_Gw on the - * lpni->lpni_nid network. - * However, best_gw and net_gw can - * be on different networks. - * Therefore need to compare them - * to pick the better of either. - */ - if (lnet_compare_peers(best_gw, net_gw) > 0) - continue; - if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq) - continue; - } - best_gw = net_gw; - final_dst = lpni; - - routing2 = true; - } else { - best_gw = NULL; - final_dst = NULL; - routing2 = false; - local_found = true; - } - - /* - * a gw on this network is found, but there could be - * other better gateways on other networks. So don't pick - * the best_ni until we determine the best_gw. - */ - if (best_gw) - continue; + lnet_msg_commit(msg, sd->sd_cpt); - /* if no local_net found continue */ - if (!local_net) - continue; + /* + * If we are routing the message then we keep the src_nid that was + * set by the originator. If we are not routing then we are the + * originator and set it here. 
+ */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; /* - * Iterate through the NIs in this local Net and select - * the NI to send from. The selection is determined by - * these 3 criterion in the following priority: - * 1. NUMA - * 2. NI available credits - * 3. Round Robin + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + * + * final_dst_lpni is set at the beginning of the + * lnet_select_pathway() function and is never changed. + * It's safe to use it here. */ - best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt); - } - - if (!best_ni && !best_gw) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("No local ni found to send from to %s\n", - libcfs_nid2str(dst_nid)); - return -EINVAL; - } - - if (!best_ni) { - best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt); - LASSERT(best_gw && best_ni); - + msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid); + } else { /* - * We're going to route the message, so change the peer to - * the router. + * if we're not routing set the dest_nid to the best peer + * ni NID that we picked earlier in the algorithm. */ - LASSERT(best_gw->lpni_peer_net); - LASSERT(best_gw->lpni_peer_net->lpn_peer); - best_gw->lpni_gw_seq++; - peer = best_gw->lpni_peer_net->lpn_peer; + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); } /* - * Now that we selected the NI to use increment its sequence - * number so the Round Robin algorithm will detect that it has - * been used and pick the next NI. - */ - best_ni->ni_seq++; - -pick_peer: - /* - * At this point the best_ni is on a local network on which - * the peer has a peer_ni as well + * if we have response tracker block update it with the next hop + * nid */ - peer_net = lnet_peer_get_net_locked(peer, - best_ni->ni_net->net_id); - /* - * peer_net is not available or the src_nid is explicitly defined - * and the peer_net for that src_nid is unhealthy. find a route to - * the destination nid. - */ - if (!peer_net || - (src_nid != LNET_NID_ANY && - !lnet_is_peer_net_healthy_locked(peer_net))) { - best_gw = lnet_find_route_locked(best_ni->ni_net, - dst_nid, - rtr_nid); - /* - * if no route is found for that network then - * move onto the next peer_ni in the peer - */ - if (!best_gw) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("No route to peer from %s\n", - libcfs_nid2str(best_ni->ni_nid)); - return -EHOSTUNREACH; + if (msg->msg_md) { + rspt = msg->msg_md->md_rspt_ptr; + if (rspt) { + rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid; + CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", + libcfs_nid2str(rspt->rspt_next_hop_nid)); } + } - CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(best_gw->lpni_nid), - lnet_msgtyp2str(msg->msg_type), msg->msg_len); + rc = lnet_post_send_locked(msg, 0); - routing2 = true; - /* - * RULE: Each node considers only the next-hop - * - * We're going to route the message, so change the peer to - * the router. 
- */ - LASSERT(best_gw->lpni_peer_net); - LASSERT(best_gw->lpni_peer_net->lpn_peer); - peer = best_gw->lpni_peer_net->lpn_peer; - } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { - /* - * this peer_net is unhealthy but we still have an opportunity - * to find another peer_net that we can use - */ - __u32 net_id = peer_net->lpn_net_id; - LCONSOLE_WARN("peer net %s unhealthy\n", - libcfs_net2str(net_id)); - goto again; - } + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nid2str(sd->sd_src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); + + return rc; +} +static struct lnet_peer_ni * +lnet_select_peer_ni(struct lnet_send_data *sd, struct lnet_peer *peer, + struct lnet_peer_net *peer_net) +{ /* * Look at the peer NIs for the destination peer that connect * to the chosen net. If a peer_ni is preferred when using the @@ -1754,24 +1772,45 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, * the available transmit credits are used. If the transmit * credits are equal, we round-robin over the peer_ni. */ - lpni = NULL; - best_lpni_credits = INT_MIN; - preferred = false; - best_lpni = NULL; + struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_ni *best_ni = sd->sd_best_ni; + lnet_nid_t dst_nid = sd->sd_dst_nid; + int best_lpni_credits = INT_MIN; + bool preferred = false; + bool ni_is_pref; + int best_lpni_healthv = 0; + int lpni_healthv; + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* - * if this peer ni is not healthy just skip it, no point in - * examining it further + * if the best_ni we've chosen aleady has this lpni + * preferred, then let's use it */ - if (!lnet_is_peer_ni_healthy_locked(lpni)) - continue; - ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + best_ni->ni_nid); - /* if this is a preferred peer use it */ - if (!preferred && ni_is_pref) { - preferred = true; - } else if (preferred && !ni_is_pref) { - /* + lpni_healthv = atomic_read(&lpni->lpni_healthv); + + CDEBUG(D_NET, "%s ni_is_pref = %d\n", + libcfs_nid2str(best_ni->ni_nid), ni_is_pref); + + if (best_lpni) + CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", + libcfs_nid2str(lpni->lpni_nid), + lpni->lpni_txcredits, best_lpni_credits, + lpni->lpni_seq, best_lpni->lpni_seq); + + /* pick the healthiest peer ni */ + if (lpni_healthv < best_lpni_healthv) { + continue; + } else if (lpni_healthv > best_lpni_healthv) { + best_lpni_healthv = lpni_healthv; + /* if this is a preferred peer use it */ + } else if (!preferred && ni_is_pref) { + preferred = true; + } else if (preferred && !ni_is_pref) { + /* * this is not the preferred peer so let's ignore * it. */ @@ -1804,174 +1843,1924 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, if (!best_lpni) { __u32 net_id = (peer_net) ? 
peer_net->lpn_net_id : LNET_NIDNET(dst_nid); - lnet_net_unlock(cpt); - LCONSOLE_WARN("no peer_ni found on peer net %s\n", + CDEBUG(D_NET, "no peer_ni found on peer net %s\n", libcfs_net2str(net_id)); + return NULL; + } + + CDEBUG(D_NET, "sd_best_lpni = %s\n", + libcfs_nid2str(best_lpni->lpni_nid)); + + return best_lpni; +} + +/* + * Prerequisite: the best_ni should already be set in the sd + */ +static inline struct lnet_peer_ni * +lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, + __u32 net_id) +{ + struct lnet_peer_net *peer_net; + + /* + * The gateway is Multi-Rail capable so now we must select the + * proper peer_ni + */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + + if (!peer_net) { + CERROR("gateway peer %s has no NI on net %s\n", + libcfs_nid2str(peer->lp_primary_nid), + libcfs_net2str(net_id)); + return NULL; + } + + return lnet_select_peer_ni(sd, peer, peer_net); +} + +static inline void +lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +{ + if (sd->sd_send_case & NMR_DST && + sd->sd_msg->msg_type != LNET_MSG_REPLY && + sd->sd_msg->msg_type != LNET_MSG_ACK && + sd->sd_best_lpni->lpni_pref_nnids == 0) { + CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", + libcfs_nid2str(sd->sd_best_ni->ni_nid), + libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, + sd->sd_best_ni->ni_nid); + } +} + +/* + * Source Specified + * Local Destination + * non-mr peer + * + * use the source and destination NIDs as the pathway + */ +static int +lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) +{ + /* the destination lpni is set before we get here. */ + + /* find local NI */ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + /* + * the preferred NID will only be set for NMR peers + */ + lnet_set_non_mr_pref_nid(sd); + + return lnet_handle_send(sd); +} + +/* + * Source Specified + * Local Destination + * MR Peer + * + * Don't run the selection algorithm on the peer NIs. By specifying the + * local NID, we're also saying that we should always use the destination NID + * provided. This handles the case where we should be using the same + * destination NID for the all the messages which belong to the same RPC + * request. + */ +static int +lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) +{ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + if (sd->sd_best_lpni && + sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) + return lnet_handle_lo_send(sd); + else if (sd->sd_best_lpni) + return lnet_handle_send(sd); + + CERROR("can't send to %s. no NI on %s\n", + libcfs_nid2str(sd->sd_dst_nid), + libcfs_net2str(sd->sd_best_ni->ni_net->net_id)); + + return -EHOSTUNREACH; +} + +struct lnet_ni * +lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, + struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + int cpt, + bool incr_seq) +{ + struct lnet_net *local_net; + struct lnet_ni *best_ni; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) + return NULL; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. 
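/*
 * Editor's note (illustrative aside, not part of this patch): for non-MR
 * peers the first source NID used for a fresh request is recorded as the
 * "preferred" NID, so later messages to that peer keep going out of the
 * same interface.  A standalone model of that rule (types and the zero
 * "no preference" value are hypothetical):
 */
#include <stdint.h>

typedef uint64_t nid_t;

struct nmr_peer {
	nid_t pref_src_nid;	/* 0 == no preference recorded yet */
};

static nid_t pick_src_for_nmr(struct nmr_peer *p, nid_t chosen_src,
			      int is_response)
{
	/* responses (ACK/REPLY) never establish a new preference */
	if (p->pref_src_nid == 0 && !is_response)
		p->pref_src_nid = chosen_src;

	return p->pref_src_nid ? p->pref_src_nid : chosen_src;
}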
The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. Round Robin + */ + best_ni = lnet_get_best_ni(local_net, cur_best_ni, + peer, peer_net, cpt); + + if (incr_seq && best_ni) + best_ni->ni_seq++; + + return best_ni; +} + +static int +lnet_handle_find_routed_path(struct lnet_send_data *sd, + lnet_nid_t dst_nid, + struct lnet_peer_ni **gw_lpni, + struct lnet_peer **gw_peer) +{ + struct lnet_peer_ni *gw; + lnet_nid_t src_nid = sd->sd_src_nid; + + gw = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid), + sd->sd_rtr_nid); + if (!gw) { + CERROR("no route to %s from %s\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); return -EHOSTUNREACH; } + /* get the peer of the gw_ni */ + LASSERT(gw->lpni_peer_net); + LASSERT(gw->lpni_peer_net->lpn_peer); + + *gw_peer = gw->lpni_peer_net->lpn_peer; + + if (!sd->sd_best_ni) + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, *gw_peer, + gw->lpni_peer_net, + sd->sd_md_cpt, + true); + + if (!sd->sd_best_ni) { + CERROR("Internal Error. Expected local ni on %s " + "but non found :%s\n", + libcfs_net2str(gw->lpni_peer_net->lpn_net_id), + libcfs_nid2str(sd->sd_src_nid)); + return -EFAULT; + } + + /* + * if gw is MR let's find its best peer_ni + */ + if (lnet_peer_is_multi_rail(*gw_peer)) { + gw = lnet_find_best_lpni_on_net(sd, *gw_peer, + sd->sd_best_ni->ni_net->net_id); + /* + * We've already verified that the gw has an NI on that + * desired net, but we're not finding it. Something is + * wrong. + */ + if (!gw) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EFAULT; + } + } + + *gw_lpni = gw; + + return 0; +} + +/* + * Handle two cases: + * + * Case 1: + * Source specified + * Remote destination + * Non-MR destination + * + * Case 2: + * Source specified + * Remote destination + * MR destination + * + * The handling of these two cases is similar. Even though the destination + * can be MR or non-MR, we'll deal directly with the router. + */ +static int +lnet_handle_spec_router_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* find local NI */ + sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt); + if (!sd->sd_best_ni) { + CERROR("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(sd->sd_dst_nid), + libcfs_nid2str(sd->sd_src_nid)); + return -EINVAL; + } + + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + if (sd->sd_send_case & NMR_DST) + /* + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd); + + /* + * We're going to send to the gw found so let's set its + * info + */ + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +struct lnet_ni * +lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) +{ + struct lnet_peer_net *peer_net = NULL; + struct lnet_ni *best_ni = NULL; + + /* + * The peer can have multiple interfaces, some of them can be on + * the local network and others on a routed network. We should + * prefer the local network. However if the local network is not + * available then we need to try the routed network + */ + + /* go through all the peer nets and find the best_ni */ + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + /* + * The peer's list of nets can contain non-local nets. 
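/*
 * Editor's note (illustrative aside, not part of this patch): the routed
 * path helper above resolves a gateway for the destination's remote
 * network and then sends from a local NI on the network it shares with
 * that gateway.  A toy standalone model of that lookup order (the route
 * and NI tables are hypothetical):
 */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t net_id_t;

struct route    { net_id_t remote_net; net_id_t gw_net; };
struct local_ni { net_id_t net; const char *name; };

static const struct local_ni *
pick_ni_for_remote(net_id_t remote, const struct route *routes, size_t nr,
		   const struct local_ni *nis, size_t nni)
{
	size_t i, j;

	for (i = 0; i < nr; i++) {
		if (routes[i].remote_net != remote)
			continue;
		/* gateway found: send from an NI on the gateway's net */
		for (j = 0; j < nni; j++)
			if (nis[j].net == routes[i].gw_net)
				return &nis[j];
	}
	return NULL;	/* no route; the real code returns -EHOSTUNREACH */
}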
We + * want to only examine the local ones. + */ + if (!lnet_get_net_locked(peer_net->lpn_net_id)) + continue; + best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, + peer_net, md_cpt, false); + } + + if (best_ni) + /* increment sequence number so we can round robin */ + best_ni->ni_seq++; + + return best_ni; +} + +static struct lnet_ni * +lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_net *peer_net; + struct lnet_peer *peer = sd->sd_peer; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + struct lnet_peer_ni *lpni; + int cpt = sd->sd_cpt; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + + /* Get the target peer_ni */ + peer_net = lnet_peer_get_net_locked(peer, + LNET_NIDNET(best_lpni->lpni_nid)); + LASSERT(peer_net != NULL); + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni->lpni_pref_nnids == 0) + continue; + LASSERT(lpni->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked( + lpni->lpni_pref.nid, cpt); + break; + } + + return best_ni; +} + +/* Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set */ +static int +lnet_select_preferred_best_ni(struct lnet_send_data *sd) +{ + struct lnet_ni *best_ni = NULL; + struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; + + /* + * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + + best_ni = lnet_find_existing_preferred_best_ni(sd); + + /* if best_ni is still not set just pick one */ + if (!best_ni) { + best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt, true); + /* If there is no best_ni we don't have a route */ + if (!best_ni) { + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + } + + sd->sd_best_ni = best_ni; + + /* Set preferred NI if necessary. */ + lnet_set_non_mr_pref_nid(sd); + + return 0; +} + + +/* + * Source not specified + * Local destination + * Non-MR Peer + * + * always use the same source NID for NMR peers + * If we've talked to that peer before then we already have a preferred + * source NI associated with it. Otherwise, we select a preferred local NI + * and store it in the peer + */ +static int +lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + + /* sd->sd_best_lpni is already set to the final destination */ + + /* + * At this point we should've created the peer ni and peer. If we + * can't find it, then something went wrong. Instead of assert + * output a relevant message and fail the send + */ + if (!sd->sd_best_lpni) { + CERROR("Internal fault. Unable to send msg %s to %s. 
" + "NID not known\n", + lnet_msgtyp2str(sd->sd_msg->msg_type), + libcfs_nid2str(sd->sd_dst_nid)); + return -EFAULT; + } + + rc = lnet_select_preferred_best_ni(sd); + if (!rc) + rc = lnet_handle_send(sd); + + return rc; +} + +static int +lnet_handle_any_mr_dsta(struct lnet_send_data *sd) +{ + /* + * NOTE we've already handled the remote peer case. So we only + * need to worry about the local case here. + * + * if we're sending a response, ACK or reply, we need to send it + * to the destination NID given to us. At this point we already + * have the peer_ni we're suppose to send to, so just find the + * best_ni on the peer net and use that. Since we're sending to an + * MR peer then we can just run the selection algorithm on our + * local NIs and pick the best one. + */ + if (sd->sd_send_case & SND_RESP) { + sd->sd_best_ni = + lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt, true); + + if (!sd->sd_best_ni) { + /* + * We're not going to deal with not able to send + * a response to the provided final destination + */ + CERROR("Can't send response to %s. " + "No local NI available\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + return lnet_handle_send(sd); + } + + /* + * If we get here that means we're sending a fresh request, PUT or + * GET, so we need to run our standard selection algorithm. + * First find the best local interface that's on any of the peer's + * networks. + */ + sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, + sd->sd_md_cpt); + if (sd->sd_best_ni) { + sd->sd_best_lpni = + lnet_find_best_lpni_on_net(sd, sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); + + /* + * if we're successful in selecting a peer_ni on the local + * network, then send to it. Otherwise fall through and + * try and see if we can reach it over another routed + * network + */ + if (sd->sd_best_lpni && + sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_lo_send(sd); + } else if (sd->sd_best_lpni) { + /* + * in case we initially started with a routed + * destination, let's reset to local + */ + sd->sd_send_case &= ~REMOTE_DST; + sd->sd_send_case |= LOCAL_DST; + return lnet_handle_send(sd); + } + + CERROR("Internal Error. Expected to have a best_lpni: " + "%s -> %s\n", + libcfs_nid2str(sd->sd_src_nid), + libcfs_nid2str(sd->sd_dst_nid)); + + return -EFAULT; + } + + /* + * Peer doesn't have a local network. Let's see if there is + * a remote network we can reach it on. + */ + return PASS_THROUGH; +} + +/* + * Case 1: + * Source NID not specified + * Local destination + * MR peer + * + * Case 2: + * Source NID not speified + * Remote destination + * MR peer + * + * In both of these cases if we're sending a response, ACK or REPLY, then + * we need to send to the destination NID provided. + * + * In the remote case let's deal with MR routers. 
+ * + */ + +static int +lnet_handle_any_mr_dst(struct lnet_send_data *sd) +{ + int rc = 0; + struct lnet_peer *gw_peer = NULL; + struct lnet_peer_ni *gw_lpni = NULL; + + /* + * handle sending a response to a remote peer here so we don't + * have to worry about it if we hit lnet_handle_any_mr_dsta() + */ + if (sd->sd_send_case & REMOTE_DST && + sd->sd_send_case & SND_RESP) { + struct lnet_peer_ni *gw; + struct lnet_peer *gw_peer; + + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw, + &gw_peer); + if (rc < 0) { + CERROR("Can't send response to %s. " + "No route available\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + sd->sd_best_lpni = gw; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); + } + + /* + * Even though the NID for the peer might not be on a local network, + * since the peer is MR there could be other interfaces on the + * local network. In that case we'd still like to prefer the local + * network over the routed network. If we're unable to do that + * then we select the best router among the different routed networks, + * and if the router is MR then we can deal with it as such. + */ + rc = lnet_handle_any_mr_dsta(sd); + if (rc != PASS_THROUGH) + return rc; + + /* + * TODO; One possible enhancement is to run the selection + * algorithm on the peer. However for remote peers the credits are + * not decremented, so we'll be basically going over the peer NIs + * in round robin. An MR router will run the selection algorithm + * on the next-hop interfaces. + */ + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + sd->sd_send_case &= ~LOCAL_DST; + sd->sd_send_case |= REMOTE_DST; + + sd->sd_peer = gw_peer; + sd->sd_best_lpni = gw_lpni; + + return lnet_handle_send(sd); +} + +/* + * Source not specified + * Remote destination + * Non-MR peer + * + * Must send to the specified peer NID using the same source NID that + * we've used before. If it's the first time to talk to that peer then + * find the source NI and assign it as preferred to that peer + */ +static int +lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) +{ + int rc; + struct lnet_peer_ni *gw_lpni = NULL; + struct lnet_peer *gw_peer = NULL; + + /* + * Let's set if we have a preferred NI to talk to this NMR peer + */ + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + + /* + * find the router and that'll find the best NI if we didn't find + * it already. + */ + rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, + &gw_peer); + if (rc < 0) + return rc; + + /* + * set the best_ni we've chosen as the preferred one for + * this peer + */ + lnet_set_non_mr_pref_nid(sd); + + /* we'll be sending to the gw */ + sd->sd_best_lpni = gw_lpni; + sd->sd_peer = gw_peer; + + return lnet_handle_send(sd); +} + +static int +lnet_handle_send_case_locked(struct lnet_send_data *sd) +{ + /* + * turn off the SND_RESP bit. + * It will be checked in the case handling + */ + __u32 send_case = sd->sd_send_case &= ~SND_RESP ; + + CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n", + (send_case & SRC_SPEC) ? "Specified: " : "ANY", + (send_case & SRC_SPEC) ? libcfs_nid2str(sd->sd_src_nid) : "", + (send_case & MR_DST) ? "MR: " : "NMR: ", + libcfs_nid2str(sd->sd_dst_nid), + (send_case & LOCAL_DST) ? 
"local" : "routed"); + + switch (send_case) { + /* + * For all cases where the source is specified, we should always + * use the destination NID, whether it's an MR destination or not, + * since we're continuing a series of related messages for the + * same RPC + */ + case SRC_SPEC_LOCAL_NMR_DST: + return lnet_handle_spec_local_nmr_dst(sd); + case SRC_SPEC_LOCAL_MR_DST: + return lnet_handle_spec_local_mr_dst(sd); + case SRC_SPEC_ROUTER_NMR_DST: + case SRC_SPEC_ROUTER_MR_DST: + return lnet_handle_spec_router_dst(sd); + case SRC_ANY_LOCAL_NMR_DST: + return lnet_handle_any_local_nmr_dst(sd); + case SRC_ANY_LOCAL_MR_DST: + case SRC_ANY_ROUTER_MR_DST: + return lnet_handle_any_mr_dst(sd); + case SRC_ANY_ROUTER_NMR_DST: + return lnet_handle_any_router_nmr_dst(sd); + default: + CERROR("Unknown send case\n"); + return -1; + } +} + +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + + memset(&send_data, 0, sizeof(send_data)); + + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration change. If none, + * then we proceed, if there is, then we restart the operation. + */ + cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + +again: + + /* + * If we're being asked to send to the loopback interface, there + * is no need to go through any selection. We can just shortcut + * the entire process and send over lolnd + */ + send_data.sd_msg = msg; + send_data.sd_cpt = cpt; + if (dst_nid == LNET_NID_LO_0) { + rc = lnet_handle_lo_send(&send_data); + lnet_net_unlock(cpt); + return rc; + } + + /* + * find an existing peer_ni, or create one and mark it as having been + * created due to network traffic. This call will create the + * peer->peer_net->peer_ni tree. + */ + lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + lnet_net_unlock(cpt); + return PTR_ERR(lpni); + } + + /* + * Cache the original src_nid. If we need to resend the message + * then we'll need to know whether the src_nid was originally + * specified for this message. If it was originally specified, + * then we need to keep using the same src_nid since it's + * continuing the same sequence of messages. + */ + msg->msg_src_nid_param = src_nid; + + /* + * Now that we have a peer_ni, check if we want to discover + * the peer. Traffic to the LNET_RESERVED_PORTAL should not + * trigger discovery. + */ + peer = lpni->lpni_peer_net->lpn_peer; + if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) { + lnet_nid_t primary_nid; + rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + return rc; + } + /* The peer may have changed. 
*/ + peer = lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lnet_peer_is_uptodate_locked(peer)) { + spin_unlock(&peer->lp_lock); + } else { + /* queue message and return */ + msg->msg_rtr_nid_param = rtr_nid; + msg->msg_sending = 0; + list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); + primary_nid = peer->lp_primary_nid; + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "%s pending discovery\n", + libcfs_nid2str(primary_nid)); + + return LNET_DC_WAIT; + } + } + lnet_peer_ni_decref_locked(lpni); + peer = lpni->lpni_peer_net->lpn_peer; + + /* + * Identify the different send cases + */ + if (src_nid == LNET_NID_ANY) + send_case |= SRC_ANY; + else + send_case |= SRC_SPEC; + + if (lnet_get_net_locked(LNET_NIDNET(dst_nid))) + send_case |= LOCAL_DST; + else + send_case |= REMOTE_DST; + + /* + * if this is a non-MR peer or if we're recovering a peer ni then + * let's consider this an NMR case so we can hit the destination + * NID. + */ + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + send_case |= NMR_DST; + else + send_case |= MR_DST; + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK) + send_case |= SND_RESP; + + /* assign parameters to the send_data */ + send_data.sd_rtr_nid = rtr_nid; + send_data.sd_src_nid = src_nid; + send_data.sd_dst_nid = dst_nid; + send_data.sd_best_lpni = lpni; + /* + * keep a pointer to the final destination in case we're going to + * route, so we'll need to access it later + */ + send_data.sd_final_dst_lpni = lpni; + send_data.sd_peer = peer; + send_data.sd_md_cpt = md_cpt; + send_data.sd_send_case = send_case; + + rc = lnet_handle_send_case_locked(&send_data); + + /* + * Update the local cpt since send_data.sd_cpt might've been + * updated as a result of calling lnet_handle_send_case_locked(). + */ + cpt = send_data.sd_cpt; + + if (rc == REPEAT_SEND) + goto again; + + lnet_net_unlock(cpt); + + return rc; +} + +int +lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; + + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT(msg->msg_txpeer == NULL); + LASSERT(msg->msg_txni == NULL); + LASSERT(!msg->msg_sending); + LASSERT(!msg->msg_target_is_router); + LASSERT(!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); + if (rc < 0) + return rc; + + if (rc == LNET_CREDIT_OK) + lnet_ni_send(msg->msg_txni, msg); + + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ + return 0; +} + +enum lnet_mt_event_type { + MT_TYPE_LOCAL_NI = 0, + MT_TYPE_PEER_NI +}; + +struct lnet_mt_event_info { + enum lnet_mt_event_type mt_type; + lnet_nid_t mt_nid; +}; + +/* called with res_lock held */ +void +lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) +{ + struct lnet_rsp_tracker *rspt; + + /* + * msg has a refcount on the MD so the MD is not going away. + * The rspt queue for the cpt is protected by + * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie. 
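/*
 * Editor's note (illustrative aside, not part of this patch): while peer
 * discovery is still in progress, the message is parked on the peer's
 * pending-discovery queue and the caller gets a "wait" return instead of
 * a send.  A minimal standalone model of that gate (names and the singly
 * linked queue are hypothetical):
 */
#include <stdbool.h>

#define DC_WAIT  1	/* caller waits; message resent after discovery */
#define SEND_NOW 0

struct pending_msg {
	struct pending_msg *next;
};

struct disc_peer {
	bool		    up_to_date;	/* discovery already completed? */
	struct pending_msg *dc_pendq;	/* pending-discovery queue */
};

static int send_or_queue(struct disc_peer *p, struct pending_msg *m)
{
	if (p->up_to_date)
		return SEND_NOW;

	/* park the message until discovery of this peer finishes */
	m->next = p->dc_pendq;
	p->dc_pendq = m;
	return DC_WAIT;
}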
+ */ + if (!md->md_rspt_ptr) + return; + + rspt = md->md_rspt_ptr; + + /* debug code */ + LASSERT(rspt->rspt_cpt == cpt); + + md->md_rspt_ptr = NULL; + + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + /* + * The monitor thread has invalidated this handle because the + * response timed out, but it failed to lookup the MD. That + * means this response tracker is on the zombie list. We can + * safely remove it under the resource lock (held by caller) and + * free the response tracker block. + */ + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, cpt); + } else { + /* + * invalidate the handle to indicate that a response has been + * received, which will then lead the monitor thread to clean up + * the rspt block. + */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + } +} + +void +lnet_clean_zombie_rstqs(void) +{ + struct lnet_rsp_tracker *rspt, *tmp; + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + list_for_each_entry_safe(rspt, tmp, + the_lnet.ln_mt_zombie_rstqs[i], + rspt_on_list) { + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + } + } + + cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); +} + +static void +lnet_finalize_expired_responses(void) +{ + struct lnet_libmd *md; + struct list_head local_queue; + struct lnet_rsp_tracker *rspt, *tmp; + ktime_t now; + int i; + + if (the_lnet.ln_mt_rstq == NULL) + return; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + INIT_LIST_HEAD(&local_queue); + + lnet_net_lock(i); + if (!the_lnet.ln_mt_rstq[i]) { + lnet_net_unlock(i); + continue; + } + list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); + lnet_net_unlock(i); + + now = ktime_get(); + + list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { + /* + * The rspt mdh will be invalidated when a response + * is received or whenever we want to discard the + * block the monitor thread will walk the queue + * and clean up any rsts with an invalid mdh. + * The monitor thread will walk the queue until + * the first unexpired rspt block. This means that + * some rspt blocks which received their + * corresponding responses will linger in the + * queue until they are cleaned up eventually. + */ + lnet_res_lock(i); + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + lnet_res_unlock(i); + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + continue; + } + + if (ktime_compare(now, rspt->rspt_deadline) >= 0 || + the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { + struct lnet_peer_ni *lpni; + lnet_nid_t nid; + + md = lnet_handle2md(&rspt->rspt_mdh); + if (!md) { + /* MD has been queued for unlink, but + * rspt hasn't been detached (Note we've + * checked above that the rspt_mdh is + * valid). Since we cannot lookup the MD + * we're unable to detach the rspt + * ourselves. Thus, move the rspt to the + * zombie list where we'll wait for + * either: + * 1. The remaining operations on the + * MD to complete. In this case the + * final operation will result in + * lnet_msg_detach_md()-> + * lnet_detach_rsp_tracker() where + * we will clean up this response + * tracker. + * 2. LNet to shutdown. In this case + * we'll wait until after all LND Nets + * have shutdown and then we can + * safely free any remaining response + * tracker blocks on the zombie list. + * Note: We need to hold the resource + * lock when adding to the zombie list + * because we may have concurrent access + * with lnet_detach_rsp_tracker(). 
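/*
 * Editor's note (illustrative aside, not part of this patch): the tracker
 * logic above is a two-party hand-off in which the first side to finish
 * only invalidates the stored handle and the second side frees the block.
 * A simplified standalone model of that protocol (the lock, flag and
 * allocator are hypothetical stand-ins for the resource lock and MD
 * handle state):
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct rsp_tracker {
	bool handle_valid;	/* cleared by the first party to finish */
};

static pthread_mutex_t res_lock = PTHREAD_MUTEX_INITIALIZER;

/* called once by the message finalizer and once by the monitor thread */
static void put_tracker(struct rsp_tracker *t)
{
	bool free_it;

	pthread_mutex_lock(&res_lock);
	if (t->handle_valid) {
		t->handle_valid = false;	/* other side will free it */
		free_it = false;
	} else {
		free_it = true;			/* we are the second party */
	}
	pthread_mutex_unlock(&res_lock);

	if (free_it)
		free(t);
}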
+ */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + list_move(&rspt->rspt_on_list, + the_lnet.ln_mt_zombie_rstqs[i]); + lnet_res_unlock(i); + continue; + } + LASSERT(md->md_rspt_ptr == rspt); + md->md_rspt_ptr = NULL; + lnet_res_unlock(i); + + LNetMDUnlink(rspt->rspt_mdh); + + nid = rspt->rspt_next_hop_nid; + + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + + /* If we're shutting down we just want to clean + * up the rspt blocks + */ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + continue; + + lnet_net_lock(i); + the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; + lnet_net_unlock(i); + + CDEBUG(D_NET, + "Response timeout: md = %p: nid = %s\n", + md, libcfs_nid2str(nid)); + + /* + * If there is a timeout on the response + * from the next hop decrement its health + * value so that we don't use it + */ + lnet_net_lock(0); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lnet_handle_remote_failure_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(0); + } else { + lnet_res_unlock(i); + break; + } + } + + if (!list_empty(&local_queue)) { + lnet_net_lock(i); + list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); + lnet_net_unlock(i); + } + } +} + +static void +lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) +{ + struct lnet_msg *msg; + + while (!list_empty(resendq)) { + struct lnet_peer_ni *lpni; + + msg = list_entry(resendq->next, struct lnet_msg, + msg_list); + + list_del_init(&msg->msg_list); + + lpni = lnet_find_peer_ni_locked(msg->msg_hdr.dest_nid); + if (!lpni) { + lnet_net_unlock(cpt); + CERROR("Expected that a peer is already created for %s\n", + libcfs_nid2str(msg->msg_hdr.dest_nid)); + msg->msg_no_resend = true; + lnet_finalize(msg, -EFAULT); + lnet_net_lock(cpt); + } else { + struct lnet_peer *peer; + int rc; + lnet_nid_t src_nid = LNET_NID_ANY; + + /* + * if this message is not being routed and the + * peer is non-MR then we must use the same + * src_nid that was used in the original send. + * Otherwise if we're routing the message (IE + * we're a router) then we can use any of our + * local interfaces. It doesn't matter to the + * final destination. + */ + peer = lpni->lpni_peer_net->lpn_peer; + if (!msg->msg_routing && + !lnet_peer_is_multi_rail(peer)) + src_nid = le64_to_cpu(msg->msg_hdr.src_nid); + + /* + * If we originally specified a src NID, then we + * must attempt to reuse it in the resend as well. 
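/*
 * Editor's note (illustrative aside, not part of this patch): the expiry
 * walk above relies on the per-CPT queue being in arrival order, so it
 * can stop at the first tracker whose deadline has not passed.  A
 * standalone model of that walk (list layout and types are hypothetical;
 * the real code also unlinks the MD and degrades the next hop's health):
 */
#include <stdint.h>
#include <stdio.h>

struct tracker {
	int64_t deadline;	/* absolute deadline in seconds */
	struct tracker *next;
};

/* returns the new head; expired entries are only reported in this model */
static struct tracker *expire_front(struct tracker *head, int64_t now)
{
	while (head && now >= head->deadline) {
		struct tracker *expired = head;

		head = head->next;
		printf("response timed out (deadline %lld)\n",
		       (long long)expired->deadline);
	}
	return head;	/* first unexpired entry; queue stays ordered */
}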
+ */ + if (msg->msg_src_nid_param != LNET_NID_ANY) + src_nid = msg->msg_src_nid_param; + lnet_peer_ni_decref_locked(lpni); + + lnet_net_unlock(cpt); + CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", + libcfs_nid2str(src_nid), + libcfs_id2str(msg->msg_target), + lnet_msgtyp2str(msg->msg_type), + msg->msg_recovery, + msg->msg_retry_count); + rc = lnet_send(src_nid, msg, LNET_NID_ANY); + if (rc) { + CERROR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + msg->msg_no_resend = true; + lnet_finalize(msg, rc); + } + lnet_net_lock(cpt); + if (!rc) + the_lnet.ln_counters[cpt]->lct_health.lch_resend_count++; + } + } +} + +static void +lnet_resend_pending_msgs(void) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + lnet_resend_pending_msgs_locked(the_lnet.ln_mt_resendqs[i], i); + lnet_net_unlock(i); + } +} + +/* called with cpt and ni_lock held */ +static void +lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING || + force) { + recovery_mdh = ni->ni_ping_mdh; + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + } + lnet_ni_unlock(ni); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + lnet_ni_lock(ni); +} + +static void +lnet_recover_local_nis(void) +{ + struct lnet_mt_event_info *ev_info; + struct list_head processed_list; + struct list_head local_queue; + struct lnet_handle_md mdh; + struct lnet_ni *tmp; + struct lnet_ni *ni; + lnet_nid_t nid; + int healthv; + int rc; + + INIT_LIST_HEAD(&local_queue); + INIT_LIST_HEAD(&processed_list); + + /* + * splice the recovery queue on a local queue. We will iterate + * through the local queue and update it as needed. Once we're + * done with the traversal, we'll splice the local queue back on + * the head of the ln_mt_localNIRecovq. Any newly added local NIs + * will be traversed in the next iteration. + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_localNIRecovq, + &local_queue); + lnet_net_unlock(0); + + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { + /* + * if an NI is being deleted or it is now healthy, there + * is no need to keep it around in the recovery queue. + * The monitor thread is the only thread responsible for + * removing the NI from the recovery queue. + * Multiple threads can be adding NIs to the recovery + * queue. + */ + healthv = atomic_read(&ni->ni_healthv); + + lnet_net_lock(0); + lnet_ni_lock(ni); + if (ni->ni_state != LNET_NI_STATE_ACTIVE || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&ni->ni_recovery); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + continue; + } + + /* + * if the local NI failed recovery we must unlink the md. + * But we want to keep the local_ni on the recovery queue + * so we can continue the attempts to recover it. 
+ */ + if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) { + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; + } + + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + + CDEBUG(D_NET, "attempting to recover local ni: %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_lock(ni); + if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { + ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nid2str(ni->ni_nid)); + lnet_ni_lock(ni); + ni->ni_recovery_state &= + ~LNET_NI_RECOVERY_PENDING; + lnet_ni_unlock(ni); + continue; + } + + mdh = ni->ni_ping_mdh; + /* + * Invalidate the ni mdh in case it's deleted. + * We'll unlink the mdh in this case below. + */ + LNetInvalidateMDHandle(&ni->ni_ping_mdh); + nid = ni->ni_nid; + + /* + * remove the NI from the local queue and drop the + * reference count to it while we're recovering + * it. The reason for that, is that the NI could + * be deleted, and the way the code is structured + * is if we don't drop the NI, then the deletion + * code will enter a loop waiting for the + * reference count to be removed while holding the + * ln_mutex_lock(). When we look up the peer to + * send to in lnet_select_pathway() we will try to + * lock the ln_mutex_lock() as well, leading to + * a deadlock. By dropping the refcount and + * removing it from the list, we allow for the NI + * to be removed, then we use the cached NID to + * look it up again. If it's gone, then we just + * continue examining the rest of the queue. + */ + lnet_net_lock(0); + list_del_init(&ni->ni_recovery); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_LOCAL_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_eqh, true); + /* lookup the nid again */ + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (!ni) { + /* + * the NI has been deleted when we dropped + * the ref count + */ + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } + /* + * Same note as in lnet_recover_peer_nis(). When + * we're sending the ping, the NI is free to be + * deleted or manipulated. By this point it + * could've been added back on the recovery queue, + * and a refcount taken on it. + * So we can't just add it blindly again or we'll + * corrupt the queue. We must check under lock if + * it's not on any list and if not then add it + * to the processed list, which will eventually be + * spliced back on to the recovery queue. + */ + ni->ni_ping_mdh = mdh; + if (list_empty(&ni->ni_recovery)) { + list_add_tail(&ni->ni_recovery, &processed_list); + lnet_ni_addref_locked(ni, 0); + } + lnet_net_unlock(0); + + lnet_ni_lock(ni); + if (rc) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + } + lnet_ni_unlock(ni); + } + + /* + * put back the remaining NIs on the ln_mt_localNIRecovq to be + * reexamined in the next iteration. 
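/*
 * Editor's note (illustrative aside, not part of this patch): both
 * recovery loops splice the shared queue onto a private list under the
 * lock, work on the private list unlocked, and splice the survivors back
 * afterwards so entries added in the meantime are picked up on the next
 * pass.  A compact standalone model of that pattern (the lock, list and
 * health check are hypothetical):
 */
#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; };

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_q;

static int still_unhealthy(struct node *n) { (void)n; return 1; }

static void recovery_pass(void)
{
	struct node *local, *keep = NULL, *n;

	pthread_mutex_lock(&q_lock);
	local = shared_q;		/* splice the whole queue out */
	shared_q = NULL;
	pthread_mutex_unlock(&q_lock);

	while ((n = local) != NULL) {
		local = n->next;
		if (!still_unhealthy(n))
			continue;	/* recovered: drop from the queue */
		n->next = keep;		/* keep it for the next iteration */
		keep = n;
	}

	pthread_mutex_lock(&q_lock);
	while ((n = keep) != NULL) {	/* splice survivors back */
		keep = n->next;
		n->next = shared_q;
		shared_q = n;
	}
	pthread_mutex_unlock(&q_lock);
}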
+ */ + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq); + lnet_net_unlock(0); +} + +static int +lnet_resendqs_create(void) +{ + struct list_head **resendqs; + resendqs = lnet_create_array_of_queues(); + + if (!resendqs) + return -ENOMEM; + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_resendqs = resendqs; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +static void +lnet_clean_local_ni_recoveryq(void) +{ + struct lnet_ni *ni; + + /* This is only called when the monitor thread has stopped */ + lnet_net_lock(0); + + while (!list_empty(&the_lnet.ln_mt_localNIRecovq)) { + ni = list_entry(the_lnet.ln_mt_localNIRecovq.next, + struct lnet_ni, ni_recovery); + list_del_init(&ni->ni_recovery); + lnet_ni_lock(ni); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + lnet_ni_unlock(ni); + lnet_ni_decref_locked(ni, 0); + } + + lnet_net_unlock(0); +} + +static void +lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, + bool force) +{ + struct lnet_handle_md recovery_mdh; + + LNetInvalidateMDHandle(&recovery_mdh); + + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { + recovery_mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + } + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(cpt); + if (!LNetMDHandleIsInvalid(recovery_mdh)) + LNetMDUnlink(recovery_mdh); + lnet_net_lock(cpt); + spin_lock(&lpni->lpni_lock); +} + +static void +lnet_clean_peer_ni_recoveryq(void) +{ + struct lnet_peer_ni *lpni, *tmp; + + lnet_net_lock(LNET_LOCK_EX); + + list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_mt_peerNIRecovq, + lpni_recovery) { + list_del_init(&lpni->lpni_recovery); + spin_lock(&lpni->lpni_lock); + lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + +static void +lnet_clean_resendqs(void) +{ + struct lnet_msg *msg, *tmp; + struct list_head msgs; + int i; + + INIT_LIST_HEAD(&msgs); + + cfs_cpt_for_each(i, lnet_cpt_table()) { + lnet_net_lock(i); + list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); + lnet_net_unlock(i); + list_for_each_entry_safe(msg, tmp, &msgs, msg_list) { + list_del_init(&msg->msg_list); + msg->msg_no_resend = true; + lnet_finalize(msg, -ESHUTDOWN); + } + } + + cfs_percpt_free(the_lnet.ln_mt_resendqs); +} + +static void +lnet_recover_peer_nis(void) +{ + struct lnet_mt_event_info *ev_info; + struct list_head processed_list; + struct list_head local_queue; + struct lnet_handle_md mdh; + struct lnet_peer_ni *lpni; + struct lnet_peer_ni *tmp; + lnet_nid_t nid; + int healthv; + int rc; + + INIT_LIST_HEAD(&local_queue); + INIT_LIST_HEAD(&processed_list); + + /* + * Always use cpt 0 for locking across all interactions with + * ln_mt_peerNIRecovq + */ + lnet_net_lock(0); + list_splice_init(&the_lnet.ln_mt_peerNIRecovq, + &local_queue); + lnet_net_unlock(0); + + list_for_each_entry_safe(lpni, tmp, &local_queue, + lpni_recovery) { + /* + * The same protection strategy is used here as is in the + * local recovery case. 
+ */ + lnet_net_lock(0); + healthv = atomic_read(&lpni->lpni_healthv); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_DELETING || + healthv == LNET_MAX_HEALTH_VALUE) { + list_del_init(&lpni->lpni_recovery); + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + continue; + } + + /* + * If the peer NI has failed recovery we must unlink the + * md. But we want to keep the peer ni on the recovery + * queue so we can try to continue recovering it + */ + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; + } + + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(0); -send: - /* Shortcut for loopback. */ - if (best_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_ni_addref_locked(best_ni, cpt); - msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); - msg->msg_target.nid = best_ni->ni_nid; - lnet_msg_commit(msg, cpt); - msg->msg_txni = best_ni; - lnet_net_unlock(cpt); + /* + * NOTE: we're racing with peer deletion from user space. + * It's possible that a peer is deleted after we check its + * state. In this case the recovery can create a new peer + */ + spin_lock(&lpni->lpni_lock); + if (!(lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) && + !(lpni->lpni_state & LNET_PEER_NI_DELETING)) { + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + + LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); + if (!ev_info) { + CERROR("out of memory. Can't recover %s\n", + libcfs_nid2str(lpni->lpni_nid)); + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + spin_unlock(&lpni->lpni_lock); + continue; + } + + /* look at the comments in lnet_recover_local_nis() */ + mdh = lpni->lpni_recovery_ping_mdh; + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + nid = lpni->lpni_nid; + lnet_net_lock(0); + list_del_init(&lpni->lpni_recovery); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + ev_info->mt_type = MT_TYPE_PEER_NI; + ev_info->mt_nid = nid; + rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, + ev_info, the_lnet.ln_mt_eqh, true); + lnet_net_lock(0); + /* + * lnet_find_peer_ni_locked() grabs a refcount for + * us. No need to take it explicitly. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(0); + LNetMDUnlink(mdh); + continue; + } - return LNET_CREDIT_OK; + lpni->lpni_recovery_ping_mdh = mdh; + /* + * While we're unlocked the lpni could've been + * readded on the recovery queue. In this case we + * don't need to add it to the local queue, since + * it's already on there and the thread that added + * it would've incremented the refcount on the + * peer, which means we need to decref the refcount + * that was implicitly grabbed by find_peer_ni_locked. + * Otherwise, if the lpni is still not on + * the recovery queue, then we'll add it to the + * processed list. 
+ */ + if (list_empty(&lpni->lpni_recovery)) + list_add_tail(&lpni->lpni_recovery, &processed_list); + else + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + + spin_lock(&lpni->lpni_lock); + if (rc) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + } + spin_unlock(&lpni->lpni_lock); } - routing = routing || routing2; + list_splice_init(&processed_list, &local_queue); + lnet_net_lock(0); + list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq); + lnet_net_unlock(0); +} - /* - * Increment sequence number of the peer selected so that we - * pick the next one in Round Robin. - */ - best_lpni->lpni_seq++; +static int +lnet_monitor_thread(void *arg) +{ + time64_t recovery_timeout = 0; + time64_t rsp_timeout = 0; + int interval; + time64_t now; /* - * grab a reference on the peer_ni so it sticks around even if - * we need to drop and relock the lnet_net_lock below. + * The monitor thread takes care of the following: + * 1. Checks the aliveness of routers + * 2. Checks if there are messages on the resend queue to resend + * them. + * 3. Check if there are any NIs on the local recovery queue and + * pings them + * 4. Checks if there are any NIs on the remote recovery queue + * and pings them. */ - lnet_peer_ni_addref_locked(best_lpni); + cfs_block_allsigs(); - /* - * Use lnet_cpt_of_nid() to determine the CPT used to commit the - * message. This ensures that we get a CPT that is correct for - * the NI when the NI has been restricted to a subset of all CPTs. - * If the selected CPT differs from the one currently locked, we - * must unlock and relock the lnet_net_lock(), and then check whether - * the configuration has changed. We don't have a hold on the best_ni - * yet, and it may have vanished. - */ - cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); - if (cpt != cpt2) { - lnet_net_unlock(cpt); - cpt = cpt2; - lnet_net_lock(cpt); - if (seq != lnet_get_dlc_seq_locked()) { - lnet_peer_ni_decref_locked(best_lpni); - goto again; + while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { + now = ktime_get_real_seconds(); + + if (lnet_router_checker_active()) + lnet_check_routers(); + + lnet_resend_pending_msgs(); + + if (now >= rsp_timeout) { + lnet_finalize_expired_responses(); + rsp_timeout = now + (lnet_transaction_timeout / 2); } + + if (now >= recovery_timeout) { + lnet_recover_local_nis(); + lnet_recover_peer_nis(); + recovery_timeout = now + lnet_recovery_interval; + } + + /* + * TODO do we need to check if we should sleep without + * timeout? Technically, an active system will always + * have messages in flight so this check will always + * evaluate to false. And on an idle system do we care + * if we wake up every 1 second? Although, we've seen + * cases where we get a complaint that an idle thread + * is waking up unnecessarily. + */ + interval = min(lnet_recovery_interval, + lnet_transaction_timeout / 2); + wait_event_interruptible_timeout(the_lnet.ln_mt_waitq, + false, + cfs_time_seconds(interval)); } - /* - * store the best_lpni in the message right away to avoid having - * to do the same operation under different conditions - */ - msg->msg_txpeer = best_lpni; - msg->msg_txni = best_ni; + /* clean up the router checker */ + lnet_prune_rc_data(1); - /* - * grab a reference for the best_ni since now it's in use in this - * send. 
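For orientation, the cadence of lnet_monitor_thread() above reduces to two deadlines and one sleep interval; a minimal sketch of that arithmetic (the helper name is illustrative, the parameters stand for the lnet_transaction_timeout and lnet_recovery_interval module settings used in the loop):

/* Illustrative only: expired responses are reaped every
 * lnet_transaction_timeout / 2 seconds, recovery pings run every
 * lnet_recovery_interval seconds, and the thread sleeps for the
 * smaller of the two so neither deadline slips by more than one
 * sleep period.
 */
static unsigned int
mt_sleep_interval_example(unsigned int transaction_timeout,
                          unsigned int recovery_interval)
{
        unsigned int rsp_interval = transaction_timeout / 2;

        return recovery_interval < rsp_interval ?
               recovery_interval : rsp_interval;
}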
the reference will need to be dropped when the message is - * finished in lnet_finalize() - */ - lnet_ni_addref_locked(msg->msg_txni, cpt); + /* Shutting down */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); - /* - * Always set the target.nid to the best peer picked. Either the - * nid will be one of the preconfigured NIDs, or the same NID as - * what was originally set in the target or it will be the NID of - * a router if this message should be routed - */ - msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + /* signal that the monitor thread is exiting */ + up(&the_lnet.ln_mt_signal); - /* - * lnet_msg_commit assigns the correct cpt to the message, which - * is used to decrement the correct refcount on the ni when it's - * time to return the credits - */ - lnet_msg_commit(msg, cpt); + return 0; +} - /* - * If we are routing the message then we don't need to overwrite - * the src_nid since it would've been set at the origin. Otherwise - * we are the originator so we need to set it. - */ - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); +/* + * lnet_send_ping + * Sends a ping. + * Returns == 0 if success + * Returns > 0 if LNetMDBind or prior fails + * Returns < 0 if LNetGet fails + */ +int +lnet_send_ping(lnet_nid_t dest_nid, + struct lnet_handle_md *mdh, int nnis, + void *user_data, struct lnet_handle_eq eqh, bool recovery) +{ + struct lnet_md md = { NULL }; + struct lnet_process_id id; + struct lnet_ping_buffer *pbuf; + int rc; - if (routing) { - msg->msg_target_is_router = 1; - msg->msg_target.pid = LNET_PID_LUSTRE; + if (dest_nid == LNET_NID_ANY) { + rc = -EHOSTUNREACH; + goto fail_error; + } + + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (!pbuf) { + rc = ENOMEM; + goto fail_error; + } + + /* initialize md content */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(nnis); + md.threshold = 2; /* GET/REPLY */ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = user_data; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind MD: %d\n", rc); + rc = -rc; /* change the rc to positive */ + goto fail_error; + } + id.pid = LNET_PID_LUSTRE; + id.nid = dest_nid; + + rc = LNetGet(LNET_NID_ANY, *mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, recovery); + + if (rc) + goto fail_unlink_md; + + return 0; + +fail_unlink_md: + LNetMDUnlink(*mdh); + LNetInvalidateMDHandle(mdh); +fail_error: + return rc; +} + +static void +lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, + int status, bool unlink_event) +{ + lnet_nid_t nid = ev_info->mt_nid; + + if (ev_info->mt_type == MT_TYPE_LOCAL_NI) { + struct lnet_ni *ni; + + lnet_net_lock(0); + ni = lnet_nid2ni_locked(nid, 0); + if (!ni) { + lnet_net_unlock(0); + return; + } + lnet_ni_lock(ni); + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (status) + ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; + lnet_ni_unlock(ni); + lnet_net_unlock(0); + + if (status != 0) { + CERROR("local NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); + return; + } /* - * since we're routing we want to ensure that the - * msg_hdr.dest_nid is set to the final destination. When - * the router receives this message it knows how to route - * it. + * need to increment healthv for the ni here, because in + * the lnet_finalize() path we don't have access to this + * NI. 
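A minimal caller sketch for lnet_send_ping(), mirroring the two recovery loops above; the helper name is hypothetical, and ev_info is assumed to have been allocated with LIBCFS_ALLOC() since lnet_mt_event_handler() frees it with LIBCFS_FREE() once the MD is unlinked:

static int
send_recovery_ping_example(lnet_nid_t nid, struct lnet_handle_md *mdh,
                           struct lnet_mt_event_info *ev_info)
{
        int rc;

        ev_info->mt_type = MT_TYPE_LOCAL_NI;    /* or MT_TYPE_PEER_NI */
        ev_info->mt_nid = nid;

        /* final argument true: mark this a recovery ping, which
         * lnet_attempt_msg_resend() will never requeue
         */
        rc = lnet_send_ping(nid, mdh, LNET_INTERFACES_MIN, ev_info,
                            the_lnet.ln_mt_eqh, true);
        if (rc)
                /* nothing is in flight; the caller only clears its
                 * *_RECOVERY_PENDING state, as the loops above do
                 */
                return rc;

        /* on success the caller stores *mdh back (ni_ping_mdh or
         * lpni_recovery_ping_mdh) so a stuck ping can be unlinked
         * later; REPLY/SEND/UNLINK events are delivered to
         * lnet_mt_event_handler() via the_lnet.ln_mt_eqh
         */
        return 0;
}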
And in order to get access to it, we'll need to + * carry forward too much information. + * In the peer case, it'll naturally be incremented */ - msg->msg_hdr.dest_nid = - cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid); + if (!unlink_event) + lnet_inc_healthv(&ni->ni_healthv); } else { - /* - * if we're not routing set the dest_nid to the best peer - * ni that we picked earlier in the algorithm. - */ - msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + struct lnet_peer_ni *lpni; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(cpt); + return; + } + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (status) + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; + spin_unlock(&lpni->lpni_lock); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + + if (status != 0) + CERROR("peer NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); } +} - rc = lnet_post_send_locked(msg, 0); +void +lnet_mt_event_handler(struct lnet_event *event) +{ + struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_ping_buffer *pbuf; + + /* TODO: remove assert */ + LASSERT(event->type == LNET_EVENT_REPLY || + event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_UNLINK); + + CDEBUG(D_NET, "Received event: %d status: %d\n", event->type, + event->status); + + switch (event->type) { + case LNET_EVENT_UNLINK: + CDEBUG(D_NET, "%s recovery ping unlinked\n", + libcfs_nid2str(ev_info->mt_nid)); + fallthrough; + case LNET_EVENT_REPLY: + lnet_handle_recovery_reply(ev_info, event->status, + event->type == LNET_EVENT_UNLINK); + break; + case LNET_EVENT_SEND: + CDEBUG(D_NET, "%s recovery message sent %s:%d\n", + libcfs_nid2str(ev_info->mt_nid), + (event->status) ? 
"unsuccessfully" : + "successfully", event->status); + break; + default: + CERROR("Unexpected event: %d\n", event->type); + break; + } + if (event->unlinked) { + LIBCFS_FREE(ev_info, sizeof(*ev_info)); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + lnet_ping_buffer_decref(pbuf); + } +} - if (!rc) - CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n", - libcfs_nid2str(msg->msg_hdr.src_nid), - libcfs_nid2str(msg->msg_txni->ni_nid), - libcfs_nid2str(src_nid), - libcfs_nid2str(msg->msg_hdr.dest_nid), - libcfs_nid2str(dst_nid), - libcfs_nid2str(msg->msg_txpeer->lpni_nid), - lnet_msgtyp2str(msg->msg_type)); +static int +lnet_rsp_tracker_create(void) +{ + struct list_head **rstqs; + rstqs = lnet_create_array_of_queues(); - lnet_net_unlock(cpt); + if (!rstqs) + return -ENOMEM; - return rc; + the_lnet.ln_mt_rstq = rstqs; + + return 0; } -int -lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) +static void +lnet_rsp_tracker_clean(void) { - lnet_nid_t dst_nid = msg->msg_target.nid; - int rc; + lnet_finalize_expired_responses(); - /* - * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, - * but we might want to use pre-determined router for ACK/REPLY - * in the future - */ - /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ - LASSERT (msg->msg_txpeer == NULL); - LASSERT (!msg->msg_sending); - LASSERT (!msg->msg_target_is_router); - LASSERT (!msg->msg_receiving); + cfs_percpt_free(the_lnet.ln_mt_rstq); + the_lnet.ln_mt_rstq = NULL; +} - msg->msg_sending = 1; +int lnet_monitor_thr_start(void) +{ + int rc = 0; + struct task_struct *task; - LASSERT(!msg->msg_tx_committed); + if (the_lnet.ln_mt_state != LNET_MT_STATE_SHUTDOWN) + return -EALREADY; - rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); - if (rc < 0) + rc = lnet_resendqs_create(); + if (rc) return rc; - if (rc == LNET_CREDIT_OK) - lnet_ni_send(msg->msg_txni, msg); + rc = lnet_rsp_tracker_create(); + if (rc) + goto clean_queues; + + /* Pre monitor thread start processing */ + rc = lnet_router_pre_mt_start(); + if (rc) + goto free_mem; + + sema_init(&the_lnet.ln_mt_signal, 0); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start monitor thread: %d\n", rc); + goto clean_thread; + } + + /* post monitor thread start processing */ + lnet_router_post_mt_start(); - /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ return 0; + +clean_thread: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + /* block until event callback signals exit */ + down(&the_lnet.ln_mt_signal); + /* clean up */ + lnet_router_cleanup(); +free_mem: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + return rc; +clean_queues: + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + return rc; +} + +void lnet_monitor_thr_stop(void) +{ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + + /* tell the 
monitor thread that we're shutting down */ + wake_up(&the_lnet.ln_mt_waitq); + + /* block until monitor thread signals that it's done */ + down(&the_lnet.ln_mt_signal); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); + + /* perform cleanup tasks */ + lnet_router_cleanup(); + lnet_rsp_tracker_clean(); + lnet_clean_local_ni_recoveryq(); + lnet_clean_peer_ni_recoveryq(); + lnet_clean_resendqs(); + + return; } void -lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob) +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, + __u32 msg_type) { lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_incr_stats(&ni->ni_stats, msg_type, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += nob; lnet_net_unlock(cpt); lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); @@ -2043,7 +3832,7 @@ lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) ready_delay = true; goto again; } - /* fall through */ + fallthrough; case LNET_MATCHMD_DROP: CNETERR("Dropping PUT from %s portal %d match %llu" @@ -2128,13 +3917,13 @@ lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) static int lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) { - void *private = msg->msg_private; - struct lnet_hdr *hdr = &msg->msg_hdr; + void *private = msg->msg_private; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int rlength; - int mlength; - int cpt; + struct lnet_libmd *md; + int rlength; + int mlength; + int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); lnet_res_lock(cpt); @@ -2195,10 +3984,10 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) static int lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) { - struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int cpt; + struct lnet_libmd *md; + int cpt; src.nid = hdr->src_nid; src.pid = hdr->src_pid; @@ -2405,11 +4194,12 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, for_me = (ni->ni_nid == dest_nid); cpt = lnet_cpt_of_nid(from_nid, ni); - CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s\n", + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", libcfs_nid2str(dest_nid), libcfs_nid2str(ni->ni_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type)); + lnet_msgtyp2str(type), + (for_me) ? 
"for me" : "routed"); switch (type) { case LNET_MSG_ACK: @@ -2446,10 +4236,10 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (the_lnet.ln_routing && - ni->ni_last_alive != cfs_time_current_sec()) { + ni->ni_last_alive != ktime_get_real_seconds()) { /* NB: so far here is the only place to set NI status to "up */ lnet_ni_lock(ni); - ni->ni_last_alive = cfs_time_current_sec(); + ni->ni_last_alive = ktime_get_real_seconds(); if (ni->ni_status != NULL && ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) ni->ni_status->ns_status = LNET_NI_STATUS_UP; @@ -2513,7 +4303,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (!list_empty(&the_lnet.ln_drop_rules) && - lnet_drop_rule_match(hdr)) { + lnet_drop_rule_match(hdr, NULL)) { CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" "silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), @@ -2521,6 +4311,52 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } + if (lnet_drop_asym_route && for_me && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + struct lnet_net *net; + struct lnet_remotenet *rnet; + bool found = true; + + /* we are dealing with a routed message, + * so see if route to reach src_nid goes through from_nid + */ + lnet_net_lock(cpt); + net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); + if (!net) { + lnet_net_unlock(cpt); + CERROR("net %s not found\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + return -EPROTO; + } + + rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); + if (rnet) { + struct lnet_peer_ni *gw = NULL; + struct lnet_route *route; + + list_for_each_entry(route, &rnet->lrn_routes, lr_list) { + found = false; + gw = route->lr_gateway; + if (gw->lpni_net != net) + continue; + if (gw->lpni_nid == from_nid) { + found = true; + break; + } + } + } + lnet_net_unlock(cpt); + if (!found) { + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); + goto drop; + } + } msg = lnet_msg_alloc(); if (msg == NULL) { @@ -2558,7 +4394,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } lnet_net_lock(cpt); - lpni = lnet_nid2peerni_locked(from_nid, cpt); + lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); CERROR("%s, src %s: Dropping %s " @@ -2625,7 +4461,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, lnet_finalize(msg, rc); drop: - lnet_drop_message(ni, cpt, private, payload_length); + lnet_drop_message(ni, cpt, private, payload_length, type); return 0; } EXPORT_SYMBOL(lnet_parse); @@ -2661,7 +4497,10 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason) * until that's done */ lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, - msg->msg_private, msg->msg_len); + msg->msg_private, msg->msg_len, + msg->msg_type); + + msg->msg_no_resend = true; /* * NB: message will not generate event because w/o attached MD, * but we still should give error code so lnet_msg_decommit() @@ -2704,6 +4543,54 @@ lnet_recv_delayed_msg_list(struct list_head *head) } } +static void +lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, + struct lnet_libmd *md, struct lnet_handle_md mdh) +{ + s64 timeout_ns; + bool new_entry = true; + struct lnet_rsp_tracker *local_rspt; + + /* + * MD has a refcount taken by 
message so it's not going away. + * The MD however can be looked up. We need to secure the access + * to the md_rspt_ptr by taking the res_lock. + * The rspt can be accessed without protection up to when it gets + * added to the list. + */ + + lnet_res_lock(cpt); + local_rspt = md->md_rspt_ptr; + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + if (local_rspt != NULL) { + /* + * we already have an rspt attached to the md, so we'll + * update the deadline on that one. + */ + LIBCFS_FREE(rspt, sizeof(*rspt)); + new_entry = false; + } else { + /* new md */ + rspt->rspt_mdh = mdh; + rspt->rspt_cpt = cpt; + /* store the rspt so we can access it when we get the REPLY */ + md->md_rspt_ptr = rspt; + local_rspt = rspt; + } + local_rspt->rspt_deadline = ktime_add_ns(ktime_get(), timeout_ns); + + /* + * add to the list of tracked responses. It's added to tail of the + * list in order to expire all the older entries first. + */ + lnet_net_lock(cpt); + if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) + list_del_init(&local_rspt->rspt_on_list); + list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + lnet_net_unlock(cpt); + lnet_res_unlock(cpt); +} + /** * Initiate an asynchronous PUT operation. * @@ -2754,10 +4641,11 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, __u64 match_bits, unsigned int offset, __u64 hdr_data) { - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + struct lnet_rsp_tracker *rspt = NULL; LASSERT(the_lnet.ln_refcount > 0); @@ -2777,6 +4665,17 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, msg->msg_vmflush = !!memory_pressure_get(); cpt = lnet_cpt_of_cookie(mdh.cookie); + + if (ack == LNET_ACK_REQ) { + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping PUT to %s: ENOMEM on response tracker\n", + libcfs_id2str(target)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + } + lnet_res_lock(cpt); md = lnet_handle2md(&mdh); @@ -2789,6 +4688,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); + LIBCFS_FREE(rspt, sizeof(*rspt)); lnet_msg_free(msg); return -ENOENT; } @@ -2821,10 +4721,14 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, lnet_build_msg_event(msg, LNET_EVENT_SEND); + if (ack == LNET_ACK_REQ) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + rc = lnet_send(self, msg, LNET_NID_ANY); if (rc != 0) { CNETERR("Error sending PUT to %s: %d\n", libcfs_id2str(target), rc); + msg->msg_no_resend = true; lnet_finalize(msg, rc); } @@ -2901,8 +4805,10 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) cpt = lnet_cpt_of_nid(peer_id.nid, ni); lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_count++; + the_lnet.ln_counters[cpt]->lct_common.lcc_drop_length += + getmd->md_length; lnet_net_unlock(cpt); if (msg != NULL) @@ -2953,12 +4859,13 @@ EXPORT_SYMBOL(lnet_set_reply_msg_len); int LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, struct lnet_process_id target, unsigned int portal, - __u64 match_bits, unsigned int offset) + __u64 match_bits, unsigned int offset, bool recovery) { - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; + struct lnet_msg *msg; + struct 
lnet_libmd *md; + struct lnet_rsp_tracker *rspt; + int cpt; + int rc; LASSERT(the_lnet.ln_refcount > 0); @@ -2971,13 +4878,24 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, } msg = lnet_msg_alloc(); - if (msg == NULL) { + if (!msg) { CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", libcfs_id2str(target)); return -ENOMEM; } cpt = lnet_cpt_of_cookie(mdh.cookie); + + rspt = lnet_rspt_alloc(cpt); + if (!rspt) { + CERROR("Dropping GET to %s: ENOMEM on response tracker\n", + libcfs_id2str(target)); + return -ENOMEM; + } + INIT_LIST_HEAD(&rspt->rspt_on_list); + + msg->msg_recovery = recovery; + lnet_res_lock(cpt); md = lnet_handle2md(&mdh); @@ -2992,6 +4910,7 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_res_unlock(cpt); lnet_msg_free(msg); + LIBCFS_FREE(rspt, sizeof(*rspt)); return -ENOENT; } @@ -3016,10 +4935,13 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + rc = lnet_send(self, msg, LNET_NID_ANY); if (rc < 0) { CNETERR("Error sending GET to %s: %d\n", libcfs_id2str(target), rc); + msg->msg_no_resend = true; lnet_finalize(msg, rc); } @@ -3045,14 +4967,14 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -3068,7 +4990,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + if (dstnid == LNET_NID_LO_0) *orderp = 0; else *orderp = 1; @@ -3083,9 +5005,9 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) * current net namespace. * If not, assign order above 0xffff0000, * to make this ni not a priority. */ - if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) - order += 0xffff0000; - + if (current->nsproxy && + !net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; if (srcnidp != NULL) *srcnidp = ni->ni_nid; if (orderp != NULL) diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c index 1b90855375a20..959c370d2d4da 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-msg.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
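Before the lib-msg.c hunks, a brief recap of the response-tracker flow that LNetPut() (with LNET_ACK_REQ) and LNetGet() now follow; the expiry test is only a sketch of the comparison lnet_finalize_expired_responses() is expected to make against rspt_deadline:

/* 1. lnet_rspt_alloc(cpt) before taking lnet_res_lock()
 * 2. lnet_attach_rsp_tracker(rspt, cpt, md, mdh) once the MD is known
 *    to be valid; this stamps rspt_deadline at
 *    now + lnet_transaction_timeout and queues the tracker on
 *    the_lnet.ln_mt_rstq[cpt]
 * 3. the monitor thread periodically reaps expired trackers via
 *    lnet_finalize_expired_responses()
 *
 * Illustrative expiry test (same ktime arithmetic as above):
 */
static bool
rspt_expired_example(const struct lnet_rsp_tracker *rspt)
{
        return ktime_compare(ktime_get(), rspt->rspt_deadline) >= 0;
}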
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -65,6 +65,7 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) LASSERT(!msg->msg_routing); ev->type = ev_type; + ev->msg_type = msg->msg_type; if (ev_type == LNET_EVENT_SEND) { /* event for active message */ @@ -75,7 +76,6 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) ev->source.nid = LNET_NID_ANY; ev->source.pid = the_lnet.ln_pid; ev->sender = LNET_NID_ANY; - } else { /* event for passive message */ ev->target.pid = hdr->dest_pid; @@ -142,14 +142,18 @@ void lnet_msg_commit(struct lnet_msg *msg, int cpt) { struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; - struct lnet_counters *counters = the_lnet.ln_counters[cpt]; + struct lnet_counters_common *common; + s64 timeout_ns; + + /* set the message deadline */ + timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; + msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); /* routed message can be committed for both receiving and sending */ LASSERT(!msg->msg_tx_committed); if (msg->msg_sending) { LASSERT(!msg->msg_receiving); - msg->msg_tx_cpt = cpt; msg->msg_tx_committed = 1; if (msg->msg_rx_committed) { /* routed message REPLY */ @@ -163,33 +167,35 @@ lnet_msg_commit(struct lnet_msg *msg, int cpt) } LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; - list_add(&msg->msg_activelist, &container->msc_active); + list_add_tail(&msg->msg_activelist, &container->msc_active); - counters->msgs_alloc++; - if (counters->msgs_alloc > counters->msgs_max) - counters->msgs_max = counters->msgs_alloc; + common = &the_lnet.ln_counters[cpt]->lct_common; + common->lcc_msgs_alloc++; + if (common->lcc_msgs_alloc > common->lcc_msgs_max) + common->lcc_msgs_max = common->lcc_msgs_alloc; } static void lnet_msg_decommit_tx(struct lnet_msg *msg, int status) { - struct lnet_counters *counters; + struct lnet_counters_common *common; struct lnet_event *ev = &msg->msg_ev; LASSERT(msg->msg_tx_committed); if (status != 0) goto out; - counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common); switch (ev->type) { default: /* routed message */ LASSERT(msg->msg_routing); LASSERT(msg->msg_rx_committed); LASSERT(ev->type == 0); - counters->route_length += msg->msg_len; - counters->route_count++; + common->lcc_route_length += msg->msg_len; + common->lcc_route_count++; goto incr_stats; case LNET_EVENT_PUT: @@ -203,7 +209,7 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) case LNET_EVENT_SEND: LASSERT(!msg->msg_rx_committed); if (msg->msg_type == LNET_MSG_PUT) - counters->send_length += msg->msg_len; + common->lcc_send_length += msg->msg_len; break; case LNET_EVENT_GET: @@ -215,13 +221,17 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) break; } - counters->send_count++; + common->lcc_send_count++; incr_stats: if (msg->msg_txpeer) - atomic_inc(&msg->msg_txpeer->lpni_stats.send_count); + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); if (msg->msg_txni) - atomic_inc(&msg->msg_txni->ni_stats.send_count); + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_SEND); out: lnet_return_tx_credits_locked(msg); msg->msg_tx_committed = 0; @@ -230,7 +240,7 @@ lnet_msg_decommit_tx(struct lnet_msg *msg, int status) static void lnet_msg_decommit_rx(struct lnet_msg *msg, int status) { - struct lnet_counters *counters; + struct lnet_counters_common *common; struct lnet_event *ev = &msg->msg_ev; 
LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ @@ -239,7 +249,7 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) if (status != 0) goto out; - counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common); switch (ev->type) { default: LASSERT(ev->type == 0); @@ -257,7 +267,7 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ LASSERT(msg->msg_type == LNET_MSG_REPLY || msg->msg_type == LNET_MSG_GET); - counters->send_length += msg->msg_wanted; + common->lcc_send_length += msg->msg_wanted; break; case LNET_EVENT_PUT: @@ -272,15 +282,19 @@ lnet_msg_decommit_rx(struct lnet_msg *msg, int status) break; } - counters->recv_count++; + common->lcc_recv_count++; incr_stats: if (msg->msg_rxpeer) - atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count); + lnet_incr_stats(&msg->msg_rxpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); if (msg->msg_rxni) - atomic_inc(&msg->msg_rxni->ni_stats.recv_count); + lnet_incr_stats(&msg->msg_rxni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_RECV); if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) - counters->recv_length += msg->msg_wanted; + common->lcc_recv_length += msg->msg_wanted; out: lnet_return_rx_credits_locked(msg); @@ -313,7 +327,7 @@ lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) list_del(&msg->msg_activelist); msg->msg_onactivelist = 0; - the_lnet.ln_counters[cpt2]->msgs_alloc--; + the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--; if (cpt2 != cpt) { lnet_net_unlock(cpt2); @@ -349,29 +363,6 @@ lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, lnet_md_deconstruct(md, &msg->msg_ev.md); } -void -lnet_msg_detach_md(struct lnet_msg *msg, int status) -{ - struct lnet_libmd *md = msg->msg_md; - int unlink; - - /* Now it's safe to drop my caller's ref */ - md->md_refcount--; - LASSERT(md->md_refcount >= 0); - - unlink = lnet_md_unlinkable(md); - if (md->md_eq != NULL) { - msg->msg_ev.status = status; - msg->msg_ev.unlinked = unlink; - lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); - } - - if (unlink) - lnet_md_unlink(md); - - msg->msg_md = NULL; -} - static int lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) { @@ -448,14 +439,549 @@ lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) return 0; } +static void +lnet_dec_healthv_locked(atomic_t *healthv) +{ + int h = atomic_read(healthv); + + if (h < lnet_health_sensitivity) { + atomic_set(healthv, 0); + } else { + h -= lnet_health_sensitivity; + atomic_set(healthv, h); + } +} + +static void +lnet_handle_local_failure(struct lnet_msg *msg) +{ + struct lnet_ni *local_ni; + + local_ni = msg->msg_txni; + + /* + * the lnet_net_lock(0) is used to protect the addref on the ni + * and the recovery queue. + */ + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + + lnet_dec_healthv_locked(&local_ni->ni_healthv); + /* + * add the NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. In this case, there is no reason to + * invoke recovery + */ + if (list_empty(&local_ni->ni_recovery) && + atomic_read(&local_ni->ni_healthv) < LNET_MAX_HEALTH_VALUE) { + CDEBUG(D_NET, "ni %s added to recovery queue. 
Health = %d\n", + libcfs_nid2str(local_ni->ni_nid), + atomic_read(&local_ni->ni_healthv)); + list_add_tail(&local_ni->ni_recovery, + &the_lnet.ln_mt_localNIRecovq); + lnet_ni_addref_locked(local_ni, 0); + } + lnet_net_unlock(0); +} + +void +lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_dec_healthv_locked(&lpni->lpni_healthv); + /* + * add the peer NI to the recovery queue if it's not already there + * and it's health value is actually below the maximum. It's + * possible that the sensitivity might be set to 0, and the health + * value will not be reduced. In this case, there is no reason to + * invoke recovery + */ + lnet_peer_ni_add_to_recoveryq_locked(lpni); +} + +static void +lnet_handle_remote_failure(struct lnet_peer_ni *lpni) +{ + /* lpni could be NULL if we're in the LOLND case */ + if (!lpni) + return; + + lnet_net_lock(0); + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(0); + return; + } + lnet_handle_remote_failure_locked(lpni); + lnet_net_unlock(0); +} + +static void +lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus) +{ + struct lnet_ni *ni = msg->msg_txni; + struct lnet_peer_ni *lpni = msg->msg_txpeer; + struct lnet_counters_health *health; + + health = &the_lnet.ln_counters[0]->lct_health; + + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + atomic_inc(&ni->ni_hstats.hlt_local_interrupt); + health->lch_local_interrupt_count++; + break; + case LNET_MSG_STATUS_LOCAL_DROPPED: + atomic_inc(&ni->ni_hstats.hlt_local_dropped); + health->lch_local_dropped_count++; + break; + case LNET_MSG_STATUS_LOCAL_ABORTED: + atomic_inc(&ni->ni_hstats.hlt_local_aborted); + health->lch_local_aborted_count++; + break; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + atomic_inc(&ni->ni_hstats.hlt_local_no_route); + health->lch_local_no_route_count++; + break; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + atomic_inc(&ni->ni_hstats.hlt_local_timeout); + health->lch_local_timeout_count++; + break; + case LNET_MSG_STATUS_LOCAL_ERROR: + atomic_inc(&ni->ni_hstats.hlt_local_error); + health->lch_local_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_DROPPED: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped); + health->lch_remote_dropped_count++; + break; + case LNET_MSG_STATUS_REMOTE_ERROR: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_error); + health->lch_remote_error_count++; + break; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout); + health->lch_remote_timeout_count++; + break; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + if (lpni) + atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); + health->lch_network_timeout_count++; + break; + case LNET_MSG_STATUS_OK: + break; + default: + LBUG(); + } +} + +static void +lnet_resend_msg_locked(struct lnet_msg *msg) +{ + msg->msg_retry_count++; + + /* + * remove message from the active list and reset it to prepare + * for a resend. Two exceptions to this + * + * 1. the router case. When a message is being routed it is + * committed for rx when received and committed for tx when + * forwarded. We don't want to remove it from the active list, since + * code which handles receiving expects it to remain on the active + * list. + * + * 2. The REPLY case. Reply messages use the same message + * structure for the GET that was received. 
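To make the health bookkeeping above concrete, a worked example with illustrative values (lnet_health_sensitivity = 100, LNET_MAX_HEALTH_VALUE = 1000; lnet_inc_healthv() is assumed to step the value back up toward the maximum):

/*
 * start:              ni_healthv = 1000, NI not on ln_mt_localNIRecovq
 * a send fails:       lnet_dec_healthv_locked() -> 900, and
 *                     lnet_handle_local_failure() queues the NI for
 *                     recovery
 * each successful send or recovery ping: lnet_inc_healthv() raises
 *                     the value again
 * value back at 1000: lnet_recover_local_nis() unlinks the ping MD
 *                     and drops the NI from the recovery queue
 *
 * With lnet_health_sensitivity == 0 the decrement subtracts nothing,
 * the value never drops below the maximum, and recovery is
 * effectively disabled, as the comments above note.
 */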
+ */ + if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) { + list_del_init(&msg->msg_activelist); + msg->msg_onactivelist = 0; + } + /* + * The msg_target.nid which was originally set + * when calling LNetGet() or LNetPut() might've + * been overwritten if we're routing this message. + * Call lnet_msg_decommit_tx() to return the credit + * this message consumed. The message will + * consume another credit when it gets resent. + */ + msg->msg_target.nid = msg->msg_hdr.dest_nid; + lnet_msg_decommit_tx(msg, -EAGAIN); + msg->msg_sending = 0; + msg->msg_receiving = 0; + msg->msg_target_is_router = 0; + + CDEBUG(D_NET, "%s->%s:%s:%s - queuing msg (%p) for resend\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(msg->msg_health_status), msg); + + list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]); + + wake_up(&the_lnet.ln_mt_waitq); +} + +int +lnet_check_finalize_recursion_locked(struct lnet_msg *msg, + struct list_head *containerq, + int nworkers, void **workers) +{ + int my_slot = -1; + int i; + + list_add_tail(&msg->msg_list, containerq); + + for (i = 0; i < nworkers; i++) { + if (workers[i] == current) + break; + + if (my_slot < 0 && workers[i] == NULL) + my_slot = i; + } + + if (i < nworkers || my_slot < 0) + return -1; + + workers[my_slot] = current; + + return my_slot; +} + +int +lnet_attempt_msg_resend(struct lnet_msg *msg) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + + /* we can only resend tx_committed messages */ + LASSERT(msg->msg_tx_committed); + + /* don't resend recovery messages */ + if (msg->msg_recovery) { + CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* + * if we explicitly indicated we don't want to resend then just + * return + */ + if (msg->msg_no_resend) { + CDEBUG(D_NET, "msg %s->%s requested no resend. retry# %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + /* check if the message has exceeded the number of retries */ + if (msg->msg_retry_count >= lnet_retry_count) { + CNETERR("msg %s->%s exceeded retry count %d\n", + libcfs_nid2str(msg->msg_from), + libcfs_nid2str(msg->msg_target.nid), + msg->msg_retry_count); + return -ENOTRECOVERABLE; + } + + cpt = msg->msg_tx_cpt; + lnet_net_lock(cpt); + + /* check again under lock */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + container = the_lnet.ln_msg_containers[cpt]; + my_slot = + lnet_check_finalize_recursion_locked(msg, + &container->msc_resending, + container->msc_nfinalizers, + container->msc_resenders); + + /* enough threads are resending */ + if (my_slot == -1) { + lnet_net_unlock(cpt); + return 0; + } + + while (!list_empty(&container->msc_resending)) { + msg = list_entry(container->msc_resending.next, + struct lnet_msg, msg_list); + list_del(&msg->msg_list); + + /* + * resending the message will require us to call + * lnet_msg_decommit_tx() which will return the credit + * which this message holds. This could trigger another + * queued message to be sent. If that message fails and + * requires a resend we will recurse. + * But since at this point the slot is taken, the message + * will be queued in the container and dealt with + * later. This breaks the recursion. 
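The finalizer/resender arrays deserve a small usage sketch, since lnet_finalize() and lnet_attempt_msg_resend() lean on the same idiom; a minimal version, assuming the caller holds lnet_net_lock(cpt) as both real callers do (helper name is hypothetical):

static void
drain_queue_example(struct lnet_msg *msg, struct list_head *queue,
                    int nworkers, void **workers)
{
        int my_slot;

        /* always queue the message; claim a slot only if this thread
         * is not already draining and a slot is free
         */
        my_slot = lnet_check_finalize_recursion_locked(msg, queue,
                                                       nworkers, workers);
        if (my_slot < 0)
                /* someone else, possibly this thread recursively,
                 * will process the queued message
                 */
                return;

        while (!list_empty(queue)) {
                msg = list_entry(queue->next, struct lnet_msg, msg_list);
                list_del_init(&msg->msg_list);
                /* ... resend or complete msg; the real loops drop and
                 * retake the net lock while doing so ...
                 */
        }

        /* release the slot so this thread can take on new work later */
        workers[my_slot] = NULL;
}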
+ */ + lnet_resend_msg_locked(msg); + } + + /* + * msc_resenders is an array of process pointers. Each entry holds + * a pointer to the current process operating on the message. An + * array entry is created per CPT. If the array slot is already + * set, then it means that there is a thread on the CPT currently + * resending a message. + * Once the thread finishes clear the slot to enable the thread to + * take on more resend work. + */ + container->msc_resenders[my_slot] = NULL; + lnet_net_unlock(cpt); + + return 0; +} + +/* + * Do a health check on the message: + * return -1 if we're not going to handle the error or + * if we've reached the maximum number of retries. + * success case will return -1 as well + * return 0 if it the message is requeued for send + */ +static int +lnet_health_check(struct lnet_msg *msg) +{ + enum lnet_msg_hstatus hstatus = msg->msg_health_status; + bool lo = false; + + /* if we're shutting down no point in handling health. */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return -1; + + LASSERT(msg->msg_txni); + + /* + * if we're sending to the LOLND then the msg_txpeer will not be + * set. So no need to sanity check it. + */ + if (msg->msg_txni->ni_nid != LNET_NID_LO_0) + LASSERT(msg->msg_txpeer); + else + lo = true; + + if (hstatus != LNET_MSG_STATUS_OK && + ktime_compare(ktime_get(), msg->msg_deadline) >= 0) + return -1; + + /* + * stats are only incremented for errors so avoid wasting time + * incrementing statistics if there is no error. + */ + if (hstatus != LNET_MSG_STATUS_OK) { + lnet_net_lock(0); + lnet_incr_hstats(msg, hstatus); + lnet_net_unlock(0); + } + + CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", + libcfs_nid2str(msg->msg_txni->ni_nid), + (lo) ? "self" : libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(hstatus)); + + switch (hstatus) { + case LNET_MSG_STATUS_OK: + lnet_inc_healthv(&msg->msg_txni->ni_healthv); + /* + * It's possible msg_txpeer is NULL in the LOLND + * case. + */ + if (msg->msg_txpeer) + lnet_inc_healthv(&msg->msg_txpeer->lpni_healthv); + + /* we can finalize this message */ + return -1; + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + case LNET_MSG_STATUS_LOCAL_DROPPED: + case LNET_MSG_STATUS_LOCAL_ABORTED: + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + lnet_handle_local_failure(msg); + /* add to the re-send queue */ + return lnet_attempt_msg_resend(msg); + + /* + * These errors will not trigger a resend so simply + * finalize the message + */ + case LNET_MSG_STATUS_LOCAL_ERROR: + lnet_handle_local_failure(msg); + return -1; + + /* + * TODO: since the remote dropped the message we can + * attempt a resend safely. 
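Putting the resend limits together, a worked example with illustrative settings (lnet_retry_count = 3, lnet_transaction_timeout = 50 seconds):

/*
 * A PUT committed at t = 0 gets msg_deadline = t + 50s in
 * lnet_msg_commit(). If it keeps failing with, say,
 * LNET_MSG_STATUS_REMOTE_DROPPED, lnet_attempt_msg_resend() requeues
 * it at most 3 times (msg_retry_count values 0, 1 and 2 pass the
 * check); after that, or as soon as ktime_get() reaches msg_deadline,
 * the next failure is simply finalized with its error. Recovery pings
 * and messages flagged msg_no_resend are never requeued at all.
 */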
+ */ + case LNET_MSG_STATUS_REMOTE_DROPPED: + lnet_handle_remote_failure(msg->msg_txpeer); + return lnet_attempt_msg_resend(msg); + + case LNET_MSG_STATUS_REMOTE_ERROR: + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + lnet_handle_remote_failure(msg->msg_txpeer); + return -1; + default: + LBUG(); + } + + /* no resend is needed */ + return -1; +} + +static void +lnet_msg_detach_md(struct lnet_msg *msg, int cpt, int status) +{ + struct lnet_libmd *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink || (md->md_refcount == 0 && + md->md_threshold == LNET_MD_THRESH_INF)) + lnet_detach_rsp_tracker(md, cpt); + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static bool +lnet_is_health_check(struct lnet_msg *msg) +{ + bool hc; + int status = msg->msg_ev.status; + + if ((!msg->msg_tx_committed && !msg->msg_rx_committed) || + !msg->msg_onactivelist) { + CDEBUG(D_NET, "msg %p not committed for send or receive\n", + msg); + return false; + } + + if ((msg->msg_tx_committed && !msg->msg_txpeer) || + (msg->msg_rx_committed && !msg->msg_rxpeer)) { + CDEBUG(D_NET, "msg %p failed too early to retry and send\n", + msg); + return false; + } + + /* + * perform a health check for any message committed for transmit + */ + hc = msg->msg_tx_committed; + + /* Check for status inconsistencies */ + if (hc && + ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) || + (status && msg->msg_health_status == LNET_MSG_STATUS_OK))) { + CDEBUG(D_NET, "Msg %p is in inconsistent state, don't perform health " + "checking (%d, %d)\n", msg, status, + msg->msg_health_status); + hc = false; + } + + CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n", + hc, status, msg->msg_health_status); + + return hc; +} + +char * +lnet_health_error2str(enum lnet_msg_hstatus hstatus) +{ + switch (hstatus) { + case LNET_MSG_STATUS_LOCAL_INTERRUPT: + return "LOCAL_INTERRUPT"; + case LNET_MSG_STATUS_LOCAL_DROPPED: + return "LOCAL_DROPPED"; + case LNET_MSG_STATUS_LOCAL_ABORTED: + return "LOCAL_ABORTED"; + case LNET_MSG_STATUS_LOCAL_NO_ROUTE: + return "LOCAL_NO_ROUTE"; + case LNET_MSG_STATUS_LOCAL_TIMEOUT: + return "LOCAL_TIMEOUT"; + case LNET_MSG_STATUS_LOCAL_ERROR: + return "LOCAL_ERROR"; + case LNET_MSG_STATUS_REMOTE_DROPPED: + return "REMOTE_DROPPED"; + case LNET_MSG_STATUS_REMOTE_ERROR: + return "REMOTE_ERROR"; + case LNET_MSG_STATUS_REMOTE_TIMEOUT: + return "REMOTE_TIMEOUT"; + case LNET_MSG_STATUS_NETWORK_TIMEOUT: + return "NETWORK_TIMEOUT"; + case LNET_MSG_STATUS_OK: + return "OK"; + default: + return ""; + } +} + +bool +lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus) +{ + if (!msg) + return false; + + if (list_empty(&the_lnet.ln_drop_rules)) + return false; + + /* match only health rules */ + if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus)) + return false; + + CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(*hstatus)); + + return true; +} +EXPORT_SYMBOL(lnet_send_error_simulation); + void lnet_finalize(struct lnet_msg *msg, int status) { - struct lnet_msg_container *container; - int my_slot; - int cpt; - int rc; - 
int i; + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; LASSERT(!in_interrupt()); @@ -464,16 +990,35 @@ lnet_finalize(struct lnet_msg *msg, int status) msg->msg_ev.status = status; + if (lnet_is_health_check(msg)) { + /* + * Check the health status of the message. If it has one + * of the errors that we're supposed to handle, and it has + * not timed out, then + * 1. Decrement the appropriate health_value + * 2. queue the message on the resend queue + + * if the message send is success, timed out or failed in the + * health check for any reason then we'll just finalize the + * message. Otherwise just return since the message has been + * put on the resend queue. + */ + if (!lnet_health_check(msg)) + return; + } + + /* + * We're not going to resend this message so detach its MD and invoke + * the appropriate callbacks + */ if (msg->msg_md != NULL) { cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); - lnet_res_lock(cpt); - lnet_msg_detach_md(msg, status); + lnet_msg_detach_md(msg, cpt, status); lnet_res_unlock(cpt); } - again: - rc = 0; +again: if (!msg->msg_tx_committed && !msg->msg_rx_committed) { /* not committed to network yet */ LASSERT(!msg->msg_onactivelist); @@ -490,32 +1035,26 @@ lnet_finalize(struct lnet_msg *msg, int status) lnet_net_lock(cpt); container = the_lnet.ln_msg_containers[cpt]; - list_add_tail(&msg->msg_list, &container->msc_finalizing); /* Recursion breaker. Don't complete the message here if I am (or * enough other threads are) already completing messages */ + my_slot = lnet_check_finalize_recursion_locked(msg, + &container->msc_finalizing, + container->msc_nfinalizers, + container->msc_finalizers); - my_slot = -1; - for (i = 0; i < container->msc_nfinalizers; i++) { - if (container->msc_finalizers[i] == current) - break; - - if (my_slot < 0 && container->msc_finalizers[i] == NULL) - my_slot = i; - } - - if (i < container->msc_nfinalizers || my_slot < 0) { + /* enough threads are resending */ + if (my_slot == -1) { lnet_net_unlock(cpt); return; } - container->msc_finalizers[my_slot] = current; - + rc = 0; while (!list_empty(&container->msc_finalizing)) { msg = list_entry(container->msc_finalizing.next, struct lnet_msg, msg_list); - list_del(&msg->msg_list); + list_del_init(&msg->msg_list); /* NB drops and regains the lnet lock if it actually does * anything, so my finalizing friends can chomp along too */ @@ -553,7 +1092,7 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) struct lnet_msg, msg_activelist); LASSERT(msg->msg_onactivelist); msg->msg_onactivelist = 0; - list_del(&msg->msg_activelist); + list_del_init(&msg->msg_activelist); lnet_msg_free(msg); count++; } @@ -567,6 +1106,13 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) sizeof(*container->msc_finalizers)); container->msc_finalizers = NULL; } + + if (container->msc_resenders != NULL) { + LIBCFS_FREE(container->msc_resenders, + container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + container->msc_resenders = NULL; + } container->msc_init = 0; } @@ -579,6 +1125,7 @@ lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) INIT_LIST_HEAD(&container->msc_active); INIT_LIST_HEAD(&container->msc_finalizing); + INIT_LIST_HEAD(&container->msc_resending); /* number of CPUs */ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); @@ -595,6 +1142,16 @@ lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) return -ENOMEM; } + LIBCFS_CPT_ALLOC(container->msc_resenders, lnet_cpt_table(), cpt, 
+ container->msc_nfinalizers * + sizeof(*container->msc_resenders)); + + if (container->msc_resenders == NULL) { + CERROR("Failed to allocate message resenders\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + return rc; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c index 3773ed9e2436c..75a352dec6ff8 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-ptl.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c index 973587a2a1dc5..ba330c6d2af1c 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lib-socket.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, 2016, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,9 +40,9 @@ #include #include +#include #include #include -#include #include /* @@ -66,20 +66,6 @@ #define SO_RCVTIMEO SO_RCVTIMEO_OLD #endif -static int -lnet_sock_create_kern(struct socket **sock, struct net *ns) -{ - int rc; - -#ifdef HAVE_SOCK_CREATE_KERN_USE_NET - rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, sock); -#else - rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, sock); -#endif - - return rc; -} - int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) { @@ -186,13 +172,17 @@ lnet_sock_create(struct socket **sockp, int *fatal, __u32 local_ip, int local_port, struct net *ns) { struct sockaddr_in locaddr; - struct socket *sock; - int rc; + struct socket *sock; + int rc; /* All errors are fatal except bind failure if the port is in use */ *fatal = 1; - rc = lnet_sock_create_kern(&sock, ns); +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, PF_INET, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(PF_INET, SOCK_STREAM, 0, &sock); +#endif *sockp = sock; if (rc != 0) { CERROR("Can't create socket: %d\n", rc); diff --git a/drivers/staging/lustrefsx/lnet/lnet/lo.c b/drivers/staging/lustrefsx/lnet/lnet/lo.c index eaa06fb41631d..a11ecddb08349 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/lo.c +++ b/drivers/staging/lustrefsx/lnet/lnet/lo.c @@ -22,6 +22,8 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lnet/lnet/module.c b/drivers/staging/lustrefsx/lnet/lnet/module.c index a7190dd79d002..676f7345ca576 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/module.c +++ b/drivers/staging/lustrefsx/lnet/lnet/module.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,8 +31,9 @@ */ #define DEBUG_SUBSYSTEM S_LNET + #include -#include +#include static int config_on_load = 0; module_param(config_on_load, int, 0444); @@ -171,36 +172,45 @@ lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr) } static int -lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) +lnet_ioctl(struct notifier_block *nb, + unsigned long cmd, void *vdata) { - int rc; + struct libcfs_ioctl_hdr *hdr = vdata; + int rc; switch (cmd) { case IOC_LIBCFS_CONFIGURE: { struct libcfs_ioctl_data *data = (struct libcfs_ioctl_data *)hdr; - if (data->ioc_hdr.ioc_len < sizeof(*data)) - return -EINVAL; - - the_lnet.ln_nis_from_mod_params = data->ioc_flags; - return lnet_configure(NULL); + if (data->ioc_hdr.ioc_len < sizeof(*data)) { + rc = -EINVAL; + } else { + the_lnet.ln_nis_from_mod_params = data->ioc_flags; + rc = lnet_configure(NULL); + } + break; } case IOC_LIBCFS_UNCONFIGURE: - return lnet_unconfigure(); + rc = lnet_unconfigure(); + break; case IOC_LIBCFS_ADD_NET: - return lnet_dyn_configure_net(hdr); + rc = lnet_dyn_configure_net(hdr); + break; case IOC_LIBCFS_DEL_NET: - return lnet_dyn_unconfigure_net(hdr); + rc = lnet_dyn_unconfigure_net(hdr); + break; case IOC_LIBCFS_ADD_LOCAL_NI: - return lnet_dyn_configure_ni(hdr); + rc = lnet_dyn_configure_ni(hdr); + break; case IOC_LIBCFS_DEL_LOCAL_NI: - return lnet_dyn_unconfigure_ni(hdr); + rc = lnet_dyn_unconfigure_ni(hdr); + break; default: /* Passing LNET_PID_ANY only gives me a ref if the net is up @@ -211,11 +221,14 @@ lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) rc = LNetCtl(cmd, hdr); LNetNIFini(); } - return rc; + break; } + return notifier_from_ioctl_errno(rc); } -DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); +static struct notifier_block lnet_ioctl_handler = { + .notifier_call = lnet_ioctl, +}; static int __init lnet_init(void) { @@ -230,7 +243,8 @@ static int __init lnet_init(void) RETURN(rc); } - rc = libcfs_register_ioctl(&lnet_ioctl_handler); + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lnet_ioctl_handler); LASSERT(rc == 0); if (config_on_load) { @@ -246,7 +260,8 @@ static void __exit lnet_exit(void) { int rc; - rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lnet_ioctl_handler); LASSERT(rc == 0); lnet_lib_exit(); diff --git a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c index b3d5b907a827b..4013ac47ab096 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/net_fault.c +++ b/drivers/staging/lustrefsx/lnet/lnet/net_fault.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
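The module.c hunk above swaps the old DECLARE_IOCTL_HANDLER registration for the libcfs blocking notifier chain; the same pattern, reduced to a skeleton for clarity (names ending in _example are hypothetical):

static int
foo_ioctl_example(struct notifier_block *nb, unsigned long cmd, void *vdata)
{
        struct libcfs_ioctl_hdr *hdr = vdata;
        int rc = -EINVAL;

        /* dispatch on cmd and fill in rc; hdr carries the ioctl body */
        (void)hdr;

        /* encode the errno in the notifier return value, as
         * lnet_ioctl() does above
         */
        return notifier_from_ioctl_errno(rc);
}

static struct notifier_block foo_ioctl_example_handler = {
        .notifier_call = foo_ioctl_example,
};

/* registered and unregistered from module init/exit with:
 *      blocking_notifier_chain_register(&libcfs_ioctl_list,
 *                                       &foo_ioctl_example_handler);
 *      blocking_notifier_chain_unregister(&libcfs_ioctl_list,
 *                                         &foo_ioctl_example_handler);
 */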
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,7 @@ #define DEBUG_SUBSYSTEM S_LNET #include -#include +#include #define LNET_MSG_MASK (LNET_PUT_BIT | LNET_ACK_BIT | \ LNET_GET_BIT | LNET_REPLY_BIT) @@ -57,9 +57,9 @@ struct lnet_drop_rule { /** * seconds to drop the next message, it's exclusive with dr_drop_at */ - cfs_time_t dr_drop_time; + time64_t dr_drop_time; /** baseline to caculate dr_drop_time */ - cfs_time_t dr_time_base; + time64_t dr_time_base; /** statistic of dropped messages */ struct lnet_fault_stat dr_stat; }; @@ -170,9 +170,9 @@ lnet_drop_rule_add(struct lnet_fault_attr *attr) rule->dr_attr = *attr; if (attr->u.drop.da_interval != 0) { - rule->dr_time_base = cfs_time_shift(attr->u.drop.da_interval); - rule->dr_drop_time = cfs_time_shift(cfs_rand() % - attr->u.drop.da_interval); + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; + rule->dr_drop_time = ktime_get_seconds() + + cfs_rand() % attr->u.drop.da_interval; } else { rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; } @@ -283,10 +283,9 @@ lnet_drop_rule_reset(void) if (attr->u.drop.da_rate != 0) { rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; } else { - rule->dr_drop_time = cfs_time_shift(cfs_rand() % - attr->u.drop.da_interval); - rule->dr_time_base = cfs_time_shift(attr->u.drop. - da_interval); + rule->dr_drop_time = ktime_get_seconds() + + cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; } spin_unlock(&rule->dr_lock); } @@ -295,13 +294,58 @@ lnet_drop_rule_reset(void) EXIT; } +static void +lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) +{ + unsigned int random; + int choice; + int delta; + int best_delta; + int i; + + /* assign a random failure */ + random = cfs_rand(); + choice = random % (LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); + if (choice == 0) + choice++; + + if (mask == HSTATUS_RANDOM) { + *hstatus = choice; + return; + } + + if (mask & (1 << choice)) { + *hstatus = choice; + return; + } + + /* round to the closest ON bit */ + i = HSTATUS_END; + best_delta = HSTATUS_END; + while (i > 0) { + if (mask & (1 << i)) { + delta = choice - i; + if (delta < 0) + delta *= -1; + if (delta < best_delta) { + best_delta = delta; + choice = i; + } + } + i--; + } + + *hstatus = choice; +} + /** * check source/destination NID, portal, message type and drop rate, * decide whether should drop this message or not */ static bool drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, - lnet_nid_t dst, unsigned int type, unsigned int portal) + lnet_nid_t dst, unsigned int type, unsigned int portal, + enum lnet_msg_hstatus *hstatus) { struct lnet_fault_attr *attr = &rule->dr_attr; bool drop; @@ -309,24 +353,36 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; + /* + * if we're trying to match a health status error but it hasn't + * been set in the rule, then don't match + */ + if ((hstatus && !attr->u.drop.da_health_error_mask) || + (!hstatus && attr->u.drop.da_health_error_mask)) + return false; + /* match this rule, check drop rate now */ spin_lock(&rule->dr_lock); - if (rule->dr_drop_time != 0) { /* time based drop */ - cfs_time_t now = cfs_time_current(); + if (attr->u.drop.da_random) { + int value = cfs_rand() % attr->u.drop.da_interval; + if (value >= (attr->u.drop.da_interval / 2)) + drop = true; + else + drop = false; + } else if (rule->dr_drop_time != 0) { /* time based drop */ + time64_t now = 
ktime_get_seconds(); rule->dr_stat.fs_count++; - drop = cfs_time_aftereq(now, rule->dr_drop_time); + drop = now >= rule->dr_drop_time; if (drop) { - if (cfs_time_after(now, rule->dr_time_base)) + if (now > rule->dr_time_base) rule->dr_time_base = now; rule->dr_drop_time = rule->dr_time_base + - cfs_time_seconds(cfs_rand() % - attr->u.drop.da_interval); - rule->dr_time_base += cfs_time_seconds(attr->u.drop. - da_interval); + cfs_rand() % attr->u.drop.da_interval; + rule->dr_time_base += attr->u.drop.da_interval; - CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %ld\n", + CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dr_drop_time); @@ -347,6 +403,9 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, } if (drop) { /* drop this message, update counters */ + if (hstatus) + lnet_fault_match_health(hstatus, + attr->u.drop.da_health_error_mask); lnet_fault_stat_inc(&rule->dr_stat, type); rule->dr_stat.u.drop.ds_dropped++; } @@ -359,15 +418,15 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, * Check if message from \a src to \a dst can match any existed drop rule */ bool -lnet_drop_rule_match(struct lnet_hdr *hdr) +lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus) { - struct lnet_drop_rule *rule; - lnet_nid_t src = le64_to_cpu(hdr->src_nid); - lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); - unsigned int typ = le32_to_cpu(hdr->type); - unsigned int ptl = -1; - bool drop = false; - int cpt; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + struct lnet_drop_rule *rule; + unsigned int ptl = -1; + bool drop = false; + int cpt; /* NB: if Portal is specified, then only PUT and GET will be * filtered by drop rule */ @@ -378,12 +437,13 @@ lnet_drop_rule_match(struct lnet_hdr *hdr) cpt = lnet_net_lock_current(); list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { - drop = drop_rule_match(rule, src, dst, typ, ptl); + drop = drop_rule_match(rule, src, dst, typ, ptl, + hstatus); if (drop) break; } - lnet_net_unlock(cpt); + return drop; } @@ -412,9 +472,9 @@ struct lnet_delay_rule { /** * seconds to delay the next message, it's exclusive with dl_delay_at */ - cfs_time_t dl_delay_time; + time64_t dl_delay_time; /** baseline to caculate dl_delay_time */ - cfs_time_t dl_time_base; + time64_t dl_time_base; /** jiffies to send the next delayed message */ unsigned long dl_msg_send; /** delayed message list */ @@ -444,13 +504,6 @@ struct delay_daemon_data { static struct delay_daemon_data delay_dd; -static cfs_time_t -round_timeout(cfs_time_t timeout) -{ - return cfs_time_seconds((unsigned int) - cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); -} - static void delay_rule_decref(struct lnet_delay_rule *rule) { @@ -472,8 +525,9 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, lnet_nid_t dst, unsigned int type, unsigned int portal, struct lnet_msg *msg) { - struct lnet_fault_attr *attr = &rule->dl_attr; - bool delay; + struct lnet_fault_attr *attr = &rule->dl_attr; + bool delay; + time64_t now = ktime_get_seconds(); if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; @@ -481,21 +535,17 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, /* match this rule, check delay rate now */ spin_lock(&rule->dl_lock); if (rule->dl_delay_time != 0) { /* time based delay */ - cfs_time_t now = cfs_time_current(); - rule->dl_stat.fs_count++; - delay = 
cfs_time_aftereq(now, rule->dl_delay_time); + delay = now >= rule->dl_delay_time; if (delay) { - if (cfs_time_after(now, rule->dl_time_base)) + if (now > rule->dl_time_base) rule->dl_time_base = now; rule->dl_delay_time = rule->dl_time_base + - cfs_time_seconds(cfs_rand() % - attr->u.delay.la_interval); - rule->dl_time_base += cfs_time_seconds(attr->u.delay. - la_interval); + cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base += attr->u.delay.la_interval; - CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %ld\n", + CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dl_delay_time); @@ -526,11 +576,11 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, rule->dl_stat.u.delay.ls_delayed++; list_add_tail(&msg->msg_list, &rule->dl_msg_list); - msg->msg_delay_send = round_timeout( - cfs_time_shift(attr->u.delay.la_latency)); + msg->msg_delay_send = now + attr->u.delay.la_latency; if (rule->dl_msg_send == -1) { rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); + mod_timer(&rule->dl_timer, + jiffies + cfs_time_seconds(attr->u.delay.la_latency)); } spin_unlock(&rule->dl_lock); @@ -574,7 +624,7 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, { struct lnet_msg *msg; struct lnet_msg *tmp; - unsigned long now = cfs_time_current(); + time64_t now = ktime_get_seconds(); if (!all && rule->dl_msg_send > now) return; @@ -598,7 +648,9 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, msg = list_entry(rule->dl_msg_list.next, struct lnet_msg, msg_list); rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); + mod_timer(&rule->dl_timer, + jiffies + + cfs_time_seconds(msg->msg_delay_send - now)); } spin_unlock(&rule->dl_lock); } @@ -614,6 +666,20 @@ delayed_msg_process(struct list_head *msg_list, bool drop) int rc; msg = list_entry(msg_list->next, struct lnet_msg, msg_list); + + if (msg->msg_sending) { + /* Delayed send */ + list_del_init(&msg->msg_list); + ni = msg->msg_txni; + CDEBUG(D_NET, "TRACE: msg %p %s -> %s : %s\n", msg, + libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); + lnet_ni_send(ni, msg); + continue; + } + + /* Delayed receive */ LASSERT(msg->msg_rxpeer != NULL); LASSERT(msg->msg_rxni != NULL); @@ -638,7 +704,7 @@ delayed_msg_process(struct list_head *msg_list, bool drop) case LNET_CREDIT_OK: lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, msg->msg_len, msg->msg_len); - /* Fall through */ + fallthrough; case LNET_CREDIT_WAIT: continue; default: /* failures */ @@ -646,7 +712,8 @@ delayed_msg_process(struct list_head *msg_list, bool drop) } } - lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len); + lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len, + msg->msg_type); lnet_finalize(msg, rc); } } @@ -782,9 +849,10 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr) rule->dl_attr = *attr; if (attr->u.delay.la_interval != 0) { - rule->dl_time_base = cfs_time_shift(attr->u.delay.la_interval); - rule->dl_delay_time = cfs_time_shift(cfs_rand() % - attr->u.delay.la_interval); + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; + rule->dl_delay_time = ktime_get_seconds() + + cfs_rand() % attr->u.delay.la_interval; } else { rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; } @@ -935,10 +1003,10 @@ lnet_delay_rule_reset(void) if (attr->u.delay.la_rate != 0) { rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; } else { 
- rule->dl_delay_time = cfs_time_shift(cfs_rand() % - attr->u.delay.la_interval); - rule->dl_time_base = cfs_time_shift(attr->u.delay. - la_interval); + rule->dl_delay_time = ktime_get_seconds() + + cfs_rand() % attr->u.delay.la_interval; + rule->dl_time_base = ktime_get_seconds() + + attr->u.delay.la_interval; } spin_unlock(&rule->dl_lock); } diff --git a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c index 5122a2e6b5d81..fe3add7b9701c 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c +++ b/drivers/staging/lustrefsx/lnet/lnet/nidstrings.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,7 @@ #define DEBUG_SUBSYSTEM S_LNET #include -#include +#include /* max value for numeric network address */ #define MAX_NUMERIC_VALUE 0xffffffff diff --git a/drivers/staging/lustrefsx/lnet/lnet/peer.c b/drivers/staging/lustrefsx/lnet/lnet/peer.c index 612af87d47692..c2d64d140702e 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/peer.c +++ b/drivers/staging/lustrefsx/lnet/lnet/peer.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,8 +34,19 @@ #define DEBUG_SUBSYSTEM S_LNET +#include +#ifdef HAVE_SCHED_HEADERS +#include +#endif +#include + #include -#include +#include + +/* Value indicating that recovery needs to re-check a peer immediately. */ +#define LNET_REDISCOVER_PEER (1) + +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); static void lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) @@ -127,6 +138,8 @@ lnet_peer_tables_create(void) spin_lock_init(&ptable->pt_zombie_lock); INIT_LIST_HEAD(&ptable->pt_zombie_list); + INIT_LIST_HEAD(&ptable->pt_peer_list); + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) INIT_LIST_HEAD(&hash[j]); ptable->pt_hash = hash; /* sign of initialization */ @@ -152,17 +165,19 @@ lnet_peer_ni_alloc(lnet_nid_t nid) INIT_LIST_HEAD(&lpni->lpni_rtrq); INIT_LIST_HEAD(&lpni->lpni_routes); INIT_LIST_HEAD(&lpni->lpni_hashlist); - INIT_LIST_HEAD(&lpni->lpni_on_peer_net_list); + INIT_LIST_HEAD(&lpni->lpni_peer_nis); + INIT_LIST_HEAD(&lpni->lpni_recovery); INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); spin_lock_init(&lpni->lpni_lock); lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ - lpni->lpni_last_alive = cfs_time_current(); /* assumes alive */ + lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */ lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; lpni->lpni_nid = nid; lpni->lpni_cpt = cpt; - lnet_set_peer_ni_health_locked(lpni, true); + atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); net = lnet_get_net_locked(LNET_NIDNET(nid)); lpni->lpni_net = net; @@ -184,7 +199,7 @@ lnet_peer_ni_alloc(lnet_nid_t nid) &the_lnet.ln_remote_peer_ni_list); } - /* TODO: update flags */ + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); return lpni; } @@ -198,13 +213,32 @@ lnet_peer_net_alloc(__u32 net_id) if (!lpn) return NULL; - INIT_LIST_HEAD(&lpn->lpn_on_peer_list); + INIT_LIST_HEAD(&lpn->lpn_peer_nets); INIT_LIST_HEAD(&lpn->lpn_peer_nis); lpn->lpn_net_id = net_id; + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + return lpn; } +void +lnet_destroy_peer_net_locked(struct lnet_peer_net *lpn) +{ + struct lnet_peer *lp; + + CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); + + LASSERT(atomic_read(&lpn->lpn_refcount) == 0); + LASSERT(list_empty(&lpn->lpn_peer_nis)); + LASSERT(list_empty(&lpn->lpn_peer_nets)); + lp = lpn->lpn_peer; + lpn->lpn_peer = NULL; + LIBCFS_FREE(lpn, sizeof(*lpn)); + + lnet_peer_decref_locked(lp); +} + static struct lnet_peer * lnet_peer_alloc(lnet_nid_t nid) { @@ -214,47 +248,118 @@ lnet_peer_alloc(lnet_nid_t nid) if (!lp) return NULL; - INIT_LIST_HEAD(&lp->lp_on_lnet_peer_list); + INIT_LIST_HEAD(&lp->lp_peer_list); INIT_LIST_HEAD(&lp->lp_peer_nets); + INIT_LIST_HEAD(&lp->lp_dc_list); + INIT_LIST_HEAD(&lp->lp_dc_pendq); + init_waitqueue_head(&lp->lp_dc_waitq); + spin_lock_init(&lp->lp_lock); lp->lp_primary_nid = nid; + lp->lp_disc_src_nid = LNET_NID_ANY; + + /* + * Turn off discovery for loopback peer. If you're creating a peer + * for the loopback interface then that was initiated when we + * attempted to send a message over the loopback. There is no need + * to ever use a different interface when sending messages to + * myself. + */ + if (nid == LNET_NID_LO_0) + lp->lp_state = LNET_PEER_NO_DISCOVERY; + lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); - /* TODO: update flags */ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); return lp; } +void +lnet_destroy_peer_locked(struct lnet_peer *lp) +{ + CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); + + LASSERT(atomic_read(&lp->lp_refcount) == 0); + LASSERT(list_empty(&lp->lp_peer_nets)); + LASSERT(list_empty(&lp->lp_peer_list)); + LASSERT(list_empty(&lp->lp_dc_list)); + + if (lp->lp_data) + lnet_ping_buffer_decref(lp->lp_data); + + /* + * if there are messages still on the pending queue, then make + * sure to queue them on the ln_msg_resend list so they can be + * resent at a later point if the discovery thread is still + * running. + * If the discovery thread has stopped, then the wakeup will be a + * no-op, and it is expected the lnet_shutdown_lndnets() will + * eventually be called, which will traverse this list and + * finalize the messages on the list. + * We can not resend them now because we're holding the cpt lock. 
+ * Releasing the lock can cause an inconsistent state + */ + spin_lock(&the_lnet.ln_msg_resend_lock); + spin_lock(&lp->lp_lock); + list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); + spin_unlock(&lp->lp_lock); + spin_unlock(&the_lnet.ln_msg_resend_lock); + wake_up(&the_lnet.ln_dc_waitq); + + LIBCFS_FREE(lp, sizeof(*lp)); +} +/* + * Detach a peer_ni from its peer_net. If this was the last peer_ni on + * that peer_net, detach the peer_net from the peer. + * + * Call with lnet_net_lock/EX held + */ static void -lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) +lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) { - struct lnet_peer_net *peer_net; - struct lnet_peer *peer; + struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + struct lnet_peer *lp; - /* TODO: could the below situation happen? accessing an already - * destroyed peer? */ - if (lpni->lpni_peer_net == NULL || - lpni->lpni_peer_net->lpn_peer == NULL) - return; + /* + * Belts and suspenders: gracefully handle teardown of a + * partially connected peer_ni. + */ + lpn = lpni->lpni_peer_net; - peer_net = lpni->lpni_peer_net; - peer = lpni->lpni_peer_net->lpn_peer; + list_del_init(&lpni->lpni_peer_nis); + /* + * If there are no lpni's left, we detach lpn from + * lp_peer_nets, so it cannot be found anymore. + */ + if (list_empty(&lpn->lpn_peer_nis)) + list_del_init(&lpn->lpn_peer_nets); - list_del_init(&lpni->lpni_on_peer_net_list); - lpni->lpni_peer_net = NULL; + /* Update peer NID count. */ + lp = lpn->lpn_peer; + lp->lp_nnis--; - /* if peer_net is empty, then remove it from the peer */ - if (list_empty(&peer_net->lpn_peer_nis)) { - list_del_init(&peer_net->lpn_on_peer_list); - peer_net->lpn_peer = NULL; - LIBCFS_FREE(peer_net, sizeof(*peer_net)); - - /* if the peer is empty then remove it from the - * the_lnet.ln_peers */ - if (list_empty(&peer->lp_peer_nets)) { - list_del_init(&peer->lp_on_lnet_peer_list); - LIBCFS_FREE(peer, sizeof(*peer)); - } + /* + * If there are no more peer nets, make the peer unfindable + * via the peer_tables. + * + * Otherwise, if the peer is DISCOVERED, tell discovery to + * take another look at it. This is a no-op if discovery for + * this peer did the detaching. + */ + if (list_empty(&lp->lp_peer_nets)) { + list_del_init(&lp->lp_peer_list); + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + ptable->pt_peers--; + } else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + /* Discovery isn't running, nothing to do here. */ + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + lnet_peer_queue_for_discovery(lp); + wake_up(&the_lnet.ln_dc_waitq); } + CDEBUG(D_NET, "peer %s NID %s\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(lpni->lpni_nid)); } /* called with lnet_net_lock LNET_LOCK_EX held */ @@ -275,10 +380,18 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) /* remove peer ni from the hash list. */ list_del_init(&lpni->lpni_hashlist); + /* + * indicate the peer is being deleted so the monitor thread can + * remove it from the recovery queue. + */ + spin_lock(&lpni->lpni_lock); + lpni->lpni_state |= LNET_PEER_NI_DELETING; + spin_unlock(&lpni->lpni_lock); + /* decrement the ref count on the peer table */ ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - LASSERT(atomic_read(&ptable->pt_number) > 0); - atomic_dec(&ptable->pt_number); + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; /* * The peer_ni can no longer be found with a lookup. 
But there @@ -287,7 +400,7 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) * * The last reference may be lost in a place where the * lnet_net_lock locks only a single cpt, and that cpt may not - * be lpni->lpni_cpt. So the zombie list of this peer_table + * be lpni->lpni_cpt. So the zombie list of lnet_peer_table * has its own lock. */ spin_lock(&ptable->pt_zombie_lock); @@ -295,10 +408,10 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies++; spin_unlock(&ptable->pt_zombie_lock); - /* no need to keep this peer on the hierarchy anymore */ - lnet_try_destroy_peer_hierarchy_locked(lpni); + /* no need to keep this peer_ni on the hierarchy anymore */ + lnet_peer_detach_peer_ni_locked(lpni); - /* decrement reference on peer */ + /* remove hashlist reference on peer_ni */ lnet_peer_ni_decref_locked(lpni); return 0; @@ -326,6 +439,8 @@ lnet_peer_del_locked(struct lnet_peer *peer) struct lnet_peer_ni *lpni = NULL, *lpni2; int rc = 0, rc2 = 0; + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid)); + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); while (lpni != NULL) { lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); @@ -338,6 +453,71 @@ lnet_peer_del_locked(struct lnet_peer *peer) return rc2; } +static int +lnet_peer_del(struct lnet_peer *peer) +{ + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_del_locked(peer); + lnet_net_unlock(LNET_LOCK_EX); + + return 0; +} + +/* + * Delete a NID from a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC deletion from DLC-configured peer. + * -ENOENT: No lnet_peer_ni corresponding to the nid. + * -ECHILD: The lnet_peer_ni isn't connected to the peer. + * -EBUSY: The lnet_peer_ni is the primary, and not the only peer_ni. + */ +static int +lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = lp->lp_primary_nid; + int rc = 0; + + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lnet_peer_ni_decref_locked(lpni); + if (lp != lpni->lpni_peer_net->lpn_peer) { + rc = -ECHILD; + goto out; + } + + /* + * This function only allows deletion of the primary NID if it + * is the only NID. 
+ */ + if (nid == lp->lp_primary_nid && lp->lp_nnis != 1) { + rc = -EBUSY; + goto out; + } + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_peer_ni_del_locked(lpni); + + lnet_net_unlock(LNET_LOCK_EX); + +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nid2str(primary_nid), libcfs_nid2str(nid), flags, rc); + + return rc; +} + static void lnet_peer_table_cleanup_locked(struct lnet_net *net, struct lnet_peer_table *ptable) @@ -424,8 +604,8 @@ lnet_peer_table_del_rtrs_locked(struct lnet_net *net, void lnet_peer_tables_cleanup(struct lnet_net *net) { - int i; - struct lnet_peer_table *ptable; + int i; + struct lnet_peer_table *ptable; LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); /* If just deleting the peers for a NI, get rid of any routes these @@ -482,42 +662,24 @@ lnet_find_peer_ni_locked(lnet_nid_t nid) } struct lnet_peer * -lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt) +lnet_find_peer(lnet_nid_t nid) { struct lnet_peer_ni *lpni; - struct lnet_peer *lp; + struct lnet_peer *lp = NULL; + int cpt; - lpni = lnet_find_peer_ni_locked(dst_nid); - if (!lpni) { - lpni = lnet_nid2peerni_locked(dst_nid, cpt); - if (IS_ERR(lpni)) - return ERR_CAST(lpni); + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_addref_locked(lp); + lnet_peer_ni_decref_locked(lpni); } - - lp = lpni->lpni_peer_net->lpn_peer; - lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); return lp; } -struct lnet_peer_ni * -lnet_get_peer_ni_idx_locked(int idx, struct lnet_peer_net **lpn, - struct lnet_peer **lp) -{ - struct lnet_peer_ni *lpni; - - list_for_each_entry((*lp), &the_lnet.ln_peers, lp_on_lnet_peer_list) { - list_for_each_entry((*lpn), &((*lp)->lp_peer_nets), lpn_on_peer_list) { - list_for_each_entry(lpni, &((*lpn)->lpn_peer_nis), - lpni_on_peer_net_list) - if (idx-- == 0) - return lpni; - } - } - - return NULL; -} - struct lnet_peer_ni * lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, @@ -527,18 +689,21 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *net = peer_net; if (!prev) { - if (!net) + if (!net) { + if (list_empty(&peer->lp_peer_nets)) + return NULL; + net = list_entry(peer->lp_peer_nets.next, struct lnet_peer_net, - lpn_on_peer_list); + lpn_peer_nets); + } lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, - lpni_on_peer_net_list); + lpni_peer_nis); return lpni; } - if (prev->lpni_on_peer_net_list.next == - &prev->lpni_peer_net->lpn_peer_nis) { + if (prev->lpni_peer_nis.next == &prev->lpni_peer_net->lpn_peer_nis) { /* * if you reached the end of the peer ni list and the peer * net is specified then there are no more peer nis in that @@ -551,428 +716,915 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, * we reached the end of this net ni list. move to the * next net */ - if (prev->lpni_peer_net->lpn_on_peer_list.next == + if (prev->lpni_peer_net->lpn_peer_nets.next == &peer->lp_peer_nets) /* no more nets and no more NIs. 
*/ return NULL; /* get the next net */ - net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next, + net = list_entry(prev->lpni_peer_net->lpn_peer_nets.next, struct lnet_peer_net, - lpn_on_peer_list); + lpn_peer_nets); /* get the ni on it */ lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, - lpni_on_peer_net_list); + lpni_peer_nis); return lpni; } /* there are more nis left */ - lpni = list_entry(prev->lpni_on_peer_net_list.next, - struct lnet_peer_ni, lpni_on_peer_net_list); + lpni = list_entry(prev->lpni_peer_nis.next, + struct lnet_peer_ni, lpni_peer_nis); return lpni; } +/* Call with the ln_api_mutex held */ +int lnet_get_peer_list(u32 *countp, u32 *sizep, struct lnet_process_id __user *ids) +{ + struct lnet_process_id id; + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + __u32 count = 0; + __u32 size = 0; + int lncpt; + int cpt; + __u32 i; + int rc; + + rc = -ESHUTDOWN; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + goto done; + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Count the number of peers, and return E2BIG if the buffer + * is too small. We'll also return the desired size. + */ + rc = -E2BIG; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + count += ptable->pt_peers; + } + size = count * sizeof(*ids); + if (size > *sizep) + goto done; + + /* + * Walk the peer lists and copy out the primary nids. + * This is safe because the peer lists are only modified + * while the ln_api_mutex is held. So we don't need to + * hold the lnet_net_lock as well, and can therefore + * directly call copy_to_user(). + */ + rc = -EFAULT; + memset(&id, 0, sizeof(id)); + id.pid = LNET_PID_LUSTRE; + i = 0; + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (i >= count) + goto done; + id.nid = lp->lp_primary_nid; + if (copy_to_user(&ids[i], &id, sizeof(id))) + goto done; + i++; + } + } + rc = 0; +done: + *countp = count; + *sizep = size; + return rc; +} + +/* + * Start pushes to peers that need to be updated for a configuration + * change on this node. + */ +void +lnet_push_update_to_peers(int force) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + int lncpt; + int cpt; + + lnet_net_lock(LNET_LOCK_EX); + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + if (force) { + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + lp->lp_state |= LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + } + if (lnet_peer_needs_push(lp)) + lnet_peer_queue_for_discovery(lp); + } + } + lnet_net_unlock(LNET_LOCK_EX); + wake_up(&the_lnet.ln_dc_waitq); +} + +/* + * Test whether a ni is a preferred ni for this peer_ni, e.g, whether + * this is a preferred point-to-point path. Call with lnet_net_lock in + * shared mmode. 
+ */ bool -lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) +lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid) { int i; + if (lpni->lpni_pref_nnids == 0) + return false; + if (lpni->lpni_pref_nnids == 1) + return lpni->lpni_pref.nid == nid; for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref_nids[i] == ni->ni_nid) + if (lpni->lpni_pref.nids[i] == nid) return true; } return false; } -lnet_nid_t -lnet_peer_primary_nid_locked(lnet_nid_t nid) +/* + * Set a single ni as preferred, provided no preferred ni is already + * defined. Only to be used for non-multi-rail peer_ni. + */ +int +lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = nid; + int rc = 0; - lpni = lnet_find_peer_ni_locked(nid); - if (lpni) { - primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; - lnet_peer_ni_decref_locked(lpni); + spin_lock(&lpni->lpni_lock); + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + } else if (lpni->lpni_pref_nnids > 0) { + rc = -EPERM; + } else if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + lpni->lpni_pref_nnids = 1; + lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; } + spin_unlock(&lpni->lpni_lock); - return primary_nid; + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), libcfs_nid2str(nid), rc); + return rc; } -lnet_nid_t -LNetPrimaryNID(lnet_nid_t nid) +/* + * Clear the preferred NID from a non-multi-rail peer_ni, provided + * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). + */ +int +lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) { - struct lnet_peer_ni *lpni; - lnet_nid_t primary_nid = nid; int rc = 0; - int cpt; - cpt = lnet_net_lock_current(); - lpni = lnet_nid2peerni_locked(nid, cpt); - if (IS_ERR(lpni)) { - rc = PTR_ERR(lpni); - goto out_unlock; + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { + lpni->lpni_pref_nnids = 0; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + } else if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + } else { + rc = -EPERM; } - primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; - lnet_peer_ni_decref_locked(lpni); -out_unlock: - lnet_net_unlock(cpt); + spin_unlock(&lpni->lpni_lock); - CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), - libcfs_nid2str(primary_nid), rc); - return primary_nid; + CDEBUG(D_NET, "peer %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), rc); + return rc; } -EXPORT_SYMBOL(LNetPrimaryNID); -struct lnet_peer_net * -lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +/* + * Clear the preferred NIDs from a non-multi-rail peer. 
+ */ +void +lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) { - struct lnet_peer_net *peer_net; - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { - if (peer_net->lpn_net_id == net_id) - return peer_net; - } - return NULL; + struct lnet_peer_ni *lpni = NULL; + + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lnet_peer_ni_clr_non_mr_pref_nid(lpni); } -static int -lnet_peer_setup_hierarchy(struct lnet_peer *lp, struct lnet_peer_ni *lpni, - lnet_nid_t nid) +int +lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - struct lnet_peer_net *lpn = NULL; - struct lnet_peer_table *ptable; - __u32 net_id = LNET_NIDNET(nid); + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i; + int rc = 0; - /* - * Create the peer_ni, peer_net, and peer if they don't exist - * yet. - */ - if (lp) { - lpn = lnet_peer_get_net_locked(lp, net_id); - } else { - lp = lnet_peer_alloc(nid); - if (!lp) - goto out_enomem; + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; } - if (!lpn) { - lpn = lnet_peer_net_alloc(net_id); - if (!lpn) - goto out_maybe_free_lp; + if (lpni->lpni_pref_nnids == 1 && lpni->lpni_pref.nid == nid) { + rc = -EEXIST; + goto out; } - if (!lpni) { - lpni = lnet_peer_ni_alloc(nid); - if (!lpni) - goto out_maybe_free_lpn; + /* A non-MR node may have only one preferred NI per peer_ni */ + if (lpni->lpni_pref_nnids > 0) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; + } + } + + if (lpni->lpni_pref_nnids != 0) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] == nid) { + LIBCFS_FREE(nids, size); + rc = -EEXIST; + goto out; + } + nids[i] = lpni->lpni_pref.nids[i]; + } + nids[i] = nid; } - /* Install the new peer_ni */ lnet_net_lock(LNET_LOCK_EX); - /* Add peer_ni to global peer table hash, if necessary. */ - if (list_empty(&lpni->lpni_hashlist)) { - ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - list_add_tail(&lpni->lpni_hashlist, - &ptable->pt_hash[lnet_nid2peerhash(nid)]); - ptable->pt_version++; - atomic_inc(&ptable->pt_number); - atomic_inc(&lpni->lpni_refcount); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + } else { + oldnids = lpni->lpni_pref.nids; + lpni->lpni_pref.nids = nids; } + lpni->lpni_pref_nnids++; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); - /* Detach the peer_ni from an existing peer, if necessary. 
*/ - if (lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer != lp) - lnet_try_destroy_peer_hierarchy_locked(lpni); + if (oldnids) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); + } +out: + if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + } + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); + return rc; +} - /* Add peer_ni to peer_net */ - lpni->lpni_peer_net = lpn; - list_add_tail(&lpni->lpni_on_peer_net_list, &lpn->lpn_peer_nis); +int +lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i, j; + int rc = 0; - /* Add peer_net to peer */ - if (!lpn->lpn_peer) { - lpn->lpn_peer = lp; - list_add_tail(&lpn->lpn_on_peer_list, &lp->lp_peer_nets); + if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + goto out; } - /* Add peer to global peer list */ - if (list_empty(&lp->lp_on_lnet_peer_list)) - list_add_tail(&lp->lp_on_lnet_peer_list, &the_lnet.ln_peers); - lnet_net_unlock(LNET_LOCK_EX); + if (lpni->lpni_pref_nnids == 1) { + if (lpni->lpni_pref.nid != nid) { + rc = -ENOENT; + goto out; + } + } else if (lpni->lpni_pref_nnids == 2) { + if (lpni->lpni_pref.nids[0] != nid && + lpni->lpni_pref.nids[1] != nid) { + rc = -ENOENT; + goto out; + } + } else { + size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] != nid) + continue; + nids[j++] = lpni->lpni_pref.nids[i]; + } + /* Check if we actually removed a nid. */ + if (j == lpni->lpni_pref_nnids) { + LIBCFS_FREE(nids, size); + rc = -ENOENT; + goto out; + } + } - return 0; + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 1) { + lpni->lpni_pref.nid = LNET_NID_ANY; + } else if (lpni->lpni_pref_nnids == 2) { + oldnids = lpni->lpni_pref.nids; + if (oldnids[0] == nid) + lpni->lpni_pref.nid = oldnids[1]; + else + lpni->lpni_pref.nid = oldnids[2]; + } else { + oldnids = lpni->lpni_pref.nids; + lpni->lpni_pref.nids = nids; + } + lpni->lpni_pref_nnids--; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); -out_maybe_free_lpn: - if (list_empty(&lpn->lpn_on_peer_list)) - LIBCFS_FREE(lpn, sizeof(*lpn)); -out_maybe_free_lp: - if (list_empty(&lp->lp_on_lnet_peer_list)) - LIBCFS_FREE(lp, sizeof(*lp)); -out_enomem: - return -ENOMEM; + if (oldnids) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); + } +out: + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); + return rc; } -static int -lnet_add_prim_lpni(lnet_nid_t nid) +lnet_nid_t +lnet_peer_primary_nid_locked(lnet_nid_t nid) { - int rc = 0; - struct lnet_peer *peer; struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; - LASSERT(nid != LNET_NID_ANY); - - /* - * lookup the NID and its peer - * if the peer doesn't exist, create it. - * if this is a non-MR peer then change its state to MR and exit. - * if this is an MR peer and it's a primary NI: NO-OP. - * if this is an MR peer and it's not a primary NI. 
Operation not - * allowed. - * - * The adding and deleting of peer nis is being serialized through - * the api_mutex. So we can look up peers with the mutex locked - * safely. Only when we need to change the ptable, do we need to - * exclusively lock the lnet_net_lock() - */ lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { - rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); - if (rc != 0) - return rc; - lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); } - LASSERT(lpni); + return primary_nid; +} - lnet_peer_ni_decref_locked(lpni); +bool +lnet_is_discovery_disabled_locked(struct lnet_peer *lp) +{ + if (lnet_peer_discovery_disabled) + return true; - peer = lpni->lpni_peer_net->lpn_peer; + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) || + (lp->lp_state & LNET_PEER_NO_DISCOVERY)) { + return true; + } - /* - * If we found a lpni with the same nid as the NID we're trying to - * create, then we're trying to create an already existing lpni - * that belongs to a different peer - */ - if (peer->lp_primary_nid != nid) - return -EEXIST; + return false; +} - /* - * if we found an lpni that is not a multi-rail, which could occur - * if lpni is already created as a non-mr lpni or we just created - * it, then make sure you indicate that this lpni is a primary mr - * capable peer. - * - * TODO: update flags if necessary - */ - if (!peer->lp_multi_rail && peer->lp_primary_nid == nid) - peer->lp_multi_rail = true; +/* + * Peer Discovery + */ +bool +lnet_is_discovery_disabled(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + rc = lnet_is_discovery_disabled_locked(lp); + spin_unlock(&lp->lp_lock); return rc; } +lnet_nid_t +LNetPrimaryNID(lnet_nid_t nid) +{ + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + int rc = 0; + int cpt; + + if (nid == LNET_NID_LO_0) + return LNET_NID_LO_0; + + cpt = lnet_net_lock_current(); + lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { + rc = PTR_ERR(lpni); + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; + + while (!lnet_peer_is_uptodate(lp)) { + spin_lock(&lp->lp_lock); + /* force a full discovery cycle */ + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + rc = lnet_discover_peer_locked(lpni, cpt, true); + if (rc) + goto out_decref; + lp = lpni->lpni_peer_net->lpn_peer; + + /* Only try once if discovery is disabled */ + if (lnet_is_discovery_disabled(lp)) + break; + } + primary_nid = lp->lp_primary_nid; +out_decref: + lnet_peer_ni_decref_locked(lpni); +out_unlock: + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid), + libcfs_nid2str(primary_nid), rc); + return primary_nid; +} +EXPORT_SYMBOL(LNetPrimaryNID); + +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + +/* + * Attach a peer_ni to a peer_net and peer. This function assumes + * peer_ni is not already attached to the peer_net/peer. The peer_ni + * may be attached to a different peer, in which case it will be + * properly detached first. The whole operation is done atomically. + * + * Always returns 0. 
This is the last function called from functions + * that do return an int, so returning 0 here allows the compiler to + * do a tail call. + */ static int -lnet_add_peer_ni_to_prim_lpni(lnet_nid_t prim_nid, lnet_nid_t nid) +lnet_peer_attach_peer_ni(struct lnet_peer *lp, + struct lnet_peer_net *lpn, + struct lnet_peer_ni *lpni, + unsigned flags) { - struct lnet_peer *peer, *primary_peer; - struct lnet_peer_ni *lpni = NULL, *klpni = NULL; + struct lnet_peer_table *ptable; + + /* Install the new peer_ni */ + lnet_net_lock(LNET_LOCK_EX); + /* Add peer_ni to global peer table hash, if necessary. */ + if (list_empty(&lpni->lpni_hashlist)) { + int hash = lnet_nid2peerhash(lpni->lpni_nid); - LASSERT(prim_nid != LNET_NID_ANY && nid != LNET_NID_ANY); + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); + ptable->pt_version++; + ptable->pt_number++; + /* This is the 1st refcount on lpni. */ + atomic_inc(&lpni->lpni_refcount); + } - /* - * key nid must be created by this point. If not then this - * operation is not permitted - */ - klpni = lnet_find_peer_ni_locked(prim_nid); - if (!klpni) - return -ENOENT; + /* Detach the peer_ni from an existing peer, if necessary. */ + if (lpni->lpni_peer_net) { + LASSERT(lpni->lpni_peer_net != lpn); + LASSERT(lpni->lpni_peer_net->lpn_peer != lp); + lnet_peer_detach_peer_ni_locked(lpni); + lnet_peer_net_decref_locked(lpni->lpni_peer_net); + lpni->lpni_peer_net = NULL; + } + + /* Add peer_ni to peer_net */ + lpni->lpni_peer_net = lpn; + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_peer_net_addref_locked(lpn); + + /* Add peer_net to peer */ + if (!lpn->lpn_peer) { + lpn->lpn_peer = lp; + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + lnet_peer_addref_locked(lp); + } + + /* Add peer to global peer list, if necessary */ + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + if (list_empty(&lp->lp_peer_list)) { + list_add_tail(&lp->lp_peer_list, &ptable->pt_peer_list); + ptable->pt_peers++; + } + + + /* Update peer state */ + spin_lock(&lp->lp_lock); + if (flags & LNET_PEER_CONFIGURED) { + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) + lp->lp_state |= LNET_PEER_CONFIGURED; + } + if (flags & LNET_PEER_MULTI_RAIL) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } + spin_unlock(&lp->lp_lock); + + lp->lp_nnis++; + lnet_net_unlock(LNET_LOCK_EX); + + CDEBUG(D_NET, "peer %s NID %s flags %#x\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(lpni->lpni_nid), flags); - lnet_peer_ni_decref_locked(klpni); + return 0; +} + +/* + * Create a new peer, with nid as its primary nid. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_add(lnet_nid_t nid, unsigned flags) +{ + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int rc = 0; - primary_peer = klpni->lpni_peer_net->lpn_peer; + LASSERT(nid != LNET_NID_ANY); + /* + * No need for the lnet_net_lock here, because the + * lnet_api_mutex is held. + */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { + /* A peer with this NID already exists. */ + lp = lpni->lpni_peer_net->lpn_peer; lnet_peer_ni_decref_locked(lpni); - - peer = lpni->lpni_peer_net->lpn_peer; /* - * lpni already exists in the system but it belongs to - * a different peer. We can't re-added it + * This is an error if the peer was configured and the + * primary NID differs or an attempt is made to change + * the Multi-Rail flag. 
Otherwise the assumption is + * that an existing peer is being modified. */ - if (peer->lp_primary_nid != prim_nid && peer->lp_multi_rail) { - CERROR("Cannot add NID %s owned by peer %s to peer %s\n", - libcfs_nid2str(lpni->lpni_nid), - libcfs_nid2str(peer->lp_primary_nid), - libcfs_nid2str(prim_nid)); - return -EEXIST; - } else if (peer->lp_primary_nid == prim_nid) { - /* - * found a peer_ni that is already part of the - * peer. This is a no-op operation. - */ - return 0; + if (lp->lp_state & LNET_PEER_CONFIGURED) { + if (lp->lp_primary_nid != nid) + rc = -EEXIST; + else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) + rc = -EPERM; + goto out; } + /* Delete and recreate as a configured peer. */ + lnet_peer_del(lp); + } - /* - * TODO: else if (peer->lp_primary_nid != prim_nid && - * !peer->lp_multi_rail) - * peer is not an MR peer and it will be moved in the next - * step to klpni, so update its flags accordingly. - * lnet_move_peer_ni() - */ - - /* - * TODO: call lnet_update_peer() from here to update the - * flags. This is the case when the lpni you're trying to - * add is already part of the peer. This could've been - * added by the DD previously, so go ahead and do any - * updates to the state if necessary - */ + /* Create peer, peer_net, and peer_ni. */ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; - } + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); - /* - * When we get here we either have found an existing lpni, which - * we can switch to the new peer. Or we need to create one and - * add it to the new peer - */ - return lnet_peer_setup_hierarchy(primary_peer, lpni, nid); +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s NID flags %#x: %d\n", + libcfs_nid2str(nid), flags, rc); + return rc; } /* - * lpni creation initiated due to traffic either sending or receiving. + * Add a NID to a peer. Call with ln_api_mutex held. + * + * Error codes: + * -EPERM: Non-DLC addition to a DLC-configured peer. + * -EEXIST: The NID was configured by DLC for a different peer. + * -ENOMEM: Out of memory. + * -ENOTUNIQ: Adding a second peer NID on a single network on a + * non-multi-rail peer. */ static int -lnet_peer_ni_traffic_add(lnet_nid_t nid) +lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) { + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; int rc = 0; - if (nid == LNET_NID_ANY) - return -EINVAL; + LASSERT(lp); + LASSERT(nid != LNET_NID_ANY); + + /* A configured peer can only be updated through configuration. */ + if (!(flags & LNET_PEER_CONFIGURED)) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + rc = -EPERM; + goto out; + } + } + + /* + * The MULTI_RAIL flag can be set but not cleared, because + * that would leave the peer struct in an invalid state. 
+ */ + if (flags & LNET_PEER_MULTI_RAIL) { + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + spin_unlock(&lp->lp_lock); + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + rc = -EPERM; + goto out; + } - /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { /* - * TODO: lnet_update_primary_nid() but not all of it - * only indicate if we're converting this to MR capable - * Can happen due to DD + * A peer_ni already exists. This is only a problem if + * it is not connected to this peer and was configured + * by DLC. */ lnet_peer_ni_decref_locked(lpni); + if (lpni->lpni_peer_net->lpn_peer == lp) + goto out; + if (lnet_peer_ni_is_configured(lpni)) { + rc = -EEXIST; + goto out; + } + /* If this is the primary NID, destroy the peer. */ + if (lnet_peer_ni_is_primary(lpni)) { + lnet_peer_del(lpni->lpni_peer_net->lpn_peer); + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) { + rc = -ENOMEM; + goto out; + } + } } else { - rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) { + rc = -ENOMEM; + goto out; + } + } + + /* + * Get the peer_net. Check that we're not adding a second + * peer_ni on a peer_net of a non-multi-rail peer. + */ + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); + if (!lpn) { + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) { + rc = -ENOMEM; + goto out_free_lpni; + } + } else if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -ENOTUNIQ; + goto out_free_lpni; } + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpni: + /* If the peer_ni was allocated above its peer_net pointer is NULL */ + if (!lpni->lpni_peer_net) + LIBCFS_FREE(lpni, sizeof(*lpni)); +out: + CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), + flags, rc); return rc; +} + +/* + * Update the primary NID of a peer, if possible. + * + * Call with the lnet_api_mutex held. + */ +static int +lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) +{ + lnet_nid_t old = lp->lp_primary_nid; + int rc = 0; + if (lp->lp_primary_nid == nid) + goto out; + rc = lnet_peer_add_nid(lp, nid, flags); + if (rc) + goto out; + lp->lp_primary_nid = nid; +out: + CDEBUG(D_NET, "peer %s NID %s: %d\n", + libcfs_nid2str(old), libcfs_nid2str(nid), rc); + return rc; } +/* + * lpni creation initiated due to traffic either sending or receiving. + */ static int -lnet_peer_ni_add_non_mr(lnet_nid_t nid) +lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) { + struct lnet_peer *lp; + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; + unsigned flags = 0; + int rc = 0; + + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; + } + /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); if (lpni) { - CERROR("Cannot add %s as non-mr when it already exists\n", - libcfs_nid2str(nid)); + /* + * We must have raced with another thread. Since we + * know next to nothing about a peer_ni created by + * traffic, we just assume everything is ok and + * return. + */ lnet_peer_ni_decref_locked(lpni); - return -EEXIST; + goto out; } - return lnet_peer_setup_hierarchy(NULL, NULL, nid); + /* Create peer, peer_net, and peer_ni. 
*/ + rc = -ENOMEM; + lp = lnet_peer_alloc(nid); + if (!lp) + goto out; + lpn = lnet_peer_net_alloc(LNET_NIDNET(nid)); + if (!lpn) + goto out_free_lp; + lpni = lnet_peer_ni_alloc(nid); + if (!lpni) + goto out_free_lpn; + if (pref != LNET_NID_ANY) + lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); + + return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); + +out_free_lpn: + LIBCFS_FREE(lpn, sizeof(*lpn)); +out_free_lp: + LIBCFS_FREE(lp, sizeof(*lp)); +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(nid), rc); + return rc; } /* + * Implementation of IOC_LIBCFS_ADD_PEER_NI. + * * This API handles the following combinations: - * Create a primary NI if only the prim_nid is provided - * Create or add an lpni to a primary NI. Primary NI must've already - * been created - * Create a non-MR peer. + * Create a peer with its primary NI if only the prim_nid is provided + * Add a NID to a peer identified by the prim_nid. The peer identified + * by the prim_nid must already exist. + * The peer being created may be non-MR. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being created/modified/deleted by a different thread. */ int -lnet_add_peer_ni_to_peer(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) +lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr) { + struct lnet_peer *lp = NULL; + struct lnet_peer_ni *lpni; + unsigned flags; + + /* The prim_nid must always be specified */ + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + + flags = LNET_PEER_CONFIGURED; + if (mr) + flags |= LNET_PEER_MULTI_RAIL; + /* - * Caller trying to setup an MR like peer hierarchy but - * specifying it to be non-MR. This is not allowed. + * If nid isn't specified, we must create a new peer with + * prim_nid as its primary nid. */ - if (prim_nid != LNET_NID_ANY && - nid != LNET_NID_ANY && !mr) - return -EPERM; + if (nid == LNET_NID_ANY) + return lnet_peer_add(prim_nid, flags); + + /* Look up the prim_nid, which must exist. */ + lpni = lnet_find_peer_ni_locked(prim_nid); + if (!lpni) + return -ENOENT; + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; - /* Add the primary NID of a peer */ - if (prim_nid != LNET_NID_ANY && - nid == LNET_NID_ANY && mr) - return lnet_add_prim_lpni(prim_nid); + /* Peer must have been configured. */ + if (!(lp->lp_state & LNET_PEER_CONFIGURED)) { + CDEBUG(D_NET, "peer %s was not configured\n", + libcfs_nid2str(prim_nid)); + return -ENOENT; + } - /* Add a NID to an existing peer */ - if (prim_nid != LNET_NID_ANY && - nid != LNET_NID_ANY && mr) - return lnet_add_peer_ni_to_prim_lpni(prim_nid, nid); + /* Primary NID must match */ + if (lp->lp_primary_nid != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nid2str(lp->lp_primary_nid)); + return -ENODEV; + } - /* Add a non-MR peer NI */ - if (((prim_nid != LNET_NID_ANY && - nid == LNET_NID_ANY) || - (prim_nid == LNET_NID_ANY && - nid != LNET_NID_ANY)) && !mr) - return lnet_peer_ni_add_non_mr(prim_nid != LNET_NID_ANY ? - prim_nid : nid); + /* Multi-Rail flag must match. */ + if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "multi-rail state mismatch for peer %s\n", + libcfs_nid2str(prim_nid)); + return -EPERM; + } - return 0; + return lnet_peer_add_nid(lp, nid, flags); } +/* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. + * + * This API handles the following combinations: + * Delete a NI from a peer if both prim_nid and nid are provided. + * Delete a peer if only prim_nid is provided. 
+ * Delete a peer if its primary nid is provided. + * + * The caller must hold ln_api_mutex. This prevents the peer from + * being modified/deleted by a different thread. + */ int -lnet_del_peer_ni_from_peer(lnet_nid_t prim_nid, lnet_nid_t nid) +lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) { - lnet_nid_t local_nid; - struct lnet_peer *peer; + struct lnet_peer *lp; struct lnet_peer_ni *lpni; - int rc; + unsigned flags; if (prim_nid == LNET_NID_ANY) return -EINVAL; - local_nid = (nid != LNET_NID_ANY) ? nid : prim_nid; - - lpni = lnet_find_peer_ni_locked(local_nid); + lpni = lnet_find_peer_ni_locked(prim_nid); if (!lpni) - return -EINVAL; + return -ENOENT; lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; - peer = lpni->lpni_peer_net->lpn_peer; - LASSERT(peer != NULL); - - if (peer->lp_primary_nid == lpni->lpni_nid) { - /* - * deleting the primary ni is equivalent to deleting the - * entire peer - */ - lnet_net_lock(LNET_LOCK_EX); - rc = lnet_peer_del_locked(peer); - lnet_net_unlock(LNET_LOCK_EX); - - return rc; + if (prim_nid != lp->lp_primary_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", + libcfs_nid2str(prim_nid), + libcfs_nid2str(lp->lp_primary_nid)); + return -ENODEV; } - lnet_net_lock(LNET_LOCK_EX); - rc = lnet_peer_ni_del_locked(lpni); - lnet_net_unlock(LNET_LOCK_EX); + if (nid == LNET_NID_ANY || nid == lp->lp_primary_nid) + return lnet_peer_del(lp); - return rc; + flags = LNET_PEER_CONFIGURED; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + return lnet_peer_del_nid(lp, nid, flags); } void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) { struct lnet_peer_table *ptable; + struct lnet_peer_net *lpn; + + CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); LASSERT(atomic_read(&lpni->lpni_refcount) == 0); LASSERT(lpni->lpni_rtr_refcount == 0); LASSERT(list_empty(&lpni->lpni_txq)); LASSERT(lpni->lpni_txqnob == 0); + LASSERT(list_empty(&lpni->lpni_peer_nis)); + LASSERT(list_empty(&lpni->lpni_on_remote_peer_ni_list)); + lpn = lpni->lpni_peer_net; + lpni->lpni_peer_net = NULL; lpni->lpni_net = NULL; /* remove the peer ni from the zombie list */ @@ -982,7 +1634,13 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies--; spin_unlock(&ptable->pt_zombie_lock); + if (lpni->lpni_pref_nnids > 1) { + LIBCFS_FREE(lpni->lpni_pref.nids, + sizeof(*lpni->lpni_pref.nids) * lpni->lpni_pref_nnids); + } LIBCFS_FREE(lpni, sizeof(*lpni)); + + lnet_peer_net_decref_locked(lpn); } struct lnet_peer_ni * @@ -1004,7 +1662,7 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) lnet_net_unlock(cpt); - rc = lnet_peer_ni_traffic_add(nid); + rc = lnet_peer_ni_traffic_add(nid, LNET_NID_ANY); if (rc) { lpni = ERR_PTR(rc); goto out_net_relock; @@ -1019,8 +1677,12 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) return lpni; } +/* + * Get a peer_ni for the given nid, create it if necessary. Takes a + * hold on the peer_ni. 
+ */ struct lnet_peer_ni * -lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) +lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) { struct lnet_peer_ni *lpni = NULL; int rc; @@ -1059,7 +1721,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) goto out_mutex_unlock; } - rc = lnet_peer_ni_traffic_add(nid); + rc = lnet_peer_ni_traffic_add(nid, pref); if (rc) { lpni = ERR_PTR(rc); goto out_mutex_unlock; @@ -1072,20 +1734,1615 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) mutex_unlock(&the_lnet.ln_api_mutex); lnet_net_lock(cpt); + /* Lock has been dropped, check again for shutdown. */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (!IS_ERR(lpni)) + lnet_peer_ni_decref_locked(lpni); + lpni = ERR_PTR(-ESHUTDOWN); + } + return lpni; } -void -lnet_debug_peer(lnet_nid_t nid) +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) { - char *aliveness = "NA"; - struct lnet_peer_ni *lp; - int cpt; - - cpt = lnet_cpt_of_nid(nid, NULL); - lnet_net_lock(cpt); + bool rc; - lp = lnet_nid2peerni_locked(nid, cpt); + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + +/* + * Is a peer uptodate from the point of view of discovery? + * + * If it is currently being processed, obviously not. + * A forced Ping or Push is also handled by the discovery thread. + * + * Otherwise look at whether the peer needs rediscovering. + */ +bool +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + bool rc; + + if (lp->lp_state & (LNET_PEER_DISCOVERING | + LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_REDISCOVER) { + rc = false; + } else if (lnet_peer_needs_push(lp)) { + rc = false; + } else if (lp->lp_state & LNET_PEER_DISCOVERED) { + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) + rc = true; + else + rc = false; + } else { + rc = false; + } + + return rc; +} + +/* + * Queue a peer for the attention of the discovery thread. Call with + * lnet_net_lock/EX held. Returns 0 if the peer was queued, and + * -EALREADY if the peer was already queued. + */ +static int lnet_peer_queue_for_discovery(struct lnet_peer *lp) +{ + int rc; + + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_state |= LNET_PEER_DISCOVERING; + spin_unlock(&lp->lp_lock); + if (list_empty(&lp->lp_dc_list)) { + lnet_peer_addref_locked(lp); + list_add_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + rc = 0; + } else { + rc = -EALREADY; + } + + CDEBUG(D_NET, "Queue peer %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + + return rc; +} + +/* + * Discovery of a peer is complete. Wake all waiters on the peer. + * Call with lnet_net_lock/EX held. + */ +static void lnet_peer_discovery_complete(struct lnet_peer *lp) +{ + struct lnet_msg *msg, *tmp; + int rc = 0; + struct list_head pending_msgs; + + INIT_LIST_HEAD(&pending_msgs); + + CDEBUG(D_NET, "Discovery complete. 
Dequeue peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + + list_del_init(&lp->lp_dc_list); + spin_lock(&lp->lp_lock); + list_splice_init(&lp->lp_dc_pendq, &pending_msgs); + spin_unlock(&lp->lp_lock); + wake_up_all(&lp->lp_dc_waitq); + + lnet_net_unlock(LNET_LOCK_EX); + + /* iterate through all pending messages and send them again */ + list_for_each_entry_safe(msg, tmp, &pending_msgs, msg_list) { + list_del_init(&msg->msg_list); + if (lp->lp_dc_error) { + lnet_finalize(msg, lp->lp_dc_error); + continue; + } + + CDEBUG(D_NET, "sending pending message %s to target %s\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target)); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); +} + +/* + * Handle inbound push. + * Like any event handler, called with lnet_res_lock/CPT held. + */ +void lnet_peer_push_event(struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + struct lnet_peer *lp; + + /* lnet_find_peer() adds a refcount */ + lp = lnet_find_peer(ev->source.nid); + if (!lp) { + CDEBUG(D_NET, "Push Put from unknown %s (source %s). Ignoring...\n", + libcfs_nid2str(ev->initiator.nid), + libcfs_nid2str(ev->source.nid)); + return; + } + + /* Ensure peer state remains consistent while we modify it. */ + spin_lock(&lp->lp_lock); + + /* + * If some kind of error happened the contents of the message + * cannot be used. Clear the NIDS_UPTODATE and set the + * FORCE_PING flag to trigger a ping. + */ + if (ev->status) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Push Put error %d from %s (source %s)\n", + ev->status, + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(ev->source.nid)); + goto out; + } + + /* + * A push with invalid or corrupted info. Clear the UPTODATE + * flag to trigger a ping. + */ + if (lnet_ping_info_validate(&pbuf->pb_info)) { + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Corrupted Push from %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + goto out; + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * A non-Multi-Rail peer is not supposed to be capable of + * sending a push. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)) { + CERROR("Push from non-Multi-Rail peer %s dropped\n", + libcfs_nid2str(lp->lp_primary_nid)); + goto out; + } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the push. If the peer + * was configured with DLC then the setting should match what + * DLC put in. 
+ * NB: We verified above that the MR feature bit is set in pi_features + */ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Push says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + + /* + * Check for truncation of the Put message. Clear the + * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, + * and tell discovery to allocate a bigger buffer. + */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Truncated Push from %s (%d nids)\n", + libcfs_nid2str(lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + + /* + * If there is data present that hasn't been processed yet, + * we'll replace it if the Put contained newer data and it + * fits. We're racing with a Ping or earlier Push in this + * case. + */ + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + if (LNET_PING_BUFFER_SEQNO(pbuf) > + LNET_PING_BUFFER_SEQNO(lp->lp_data) && + pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + LNET_PING_BUFFER_SEQNO(lp->lp_data)); + } + goto out; + } + + /* + * Allocate a buffer to copy the data. On a failure we drop + * the Push and set FORCE_PING to force the discovery + * thread to fix the problem by pinging the peer. + */ + lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); + if (!lp->lp_data) { + lp->lp_state |= LNET_PEER_FORCE_PING; + CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + goto out; + } + + /* Success */ + memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, + LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); + lp->lp_state |= LNET_PEER_DATA_PRESENT; + CDEBUG(D_NET, "Received Push %s %u\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf)); + +out: + /* + * Queue the peer for discovery if not done, force it on the request + * queue and wake the discovery thread if the peer was already queued, + * because its status changed. 
+ */ + spin_unlock(&lp->lp_lock); + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_peer_is_uptodate(lp) && lnet_peer_queue_for_discovery(lp)) { + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + /* Drop refcount from lookup */ + lnet_peer_decref_locked(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Clear the discovery error state, unless we're already discovering + * this peer, in which case the error is current. + */ +static void lnet_peer_clear_discovery_error(struct lnet_peer *lp) +{ + spin_lock(&lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lp->lp_dc_error = 0; + spin_unlock(&lp->lp_lock); +} + +/* + * Peer discovery slow path. The ln_api_mutex is held on entry, and + * dropped/retaken within this function. An lnet_peer_ni is passed in + * because discovery could tear down an lnet_peer. + */ +int +lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) +{ + DEFINE_WAIT(wait); + struct lnet_peer *lp; + int rc = 0; + int count = 0; + +again: + lnet_net_unlock(cpt); + lnet_net_lock(LNET_LOCK_EX); + lp = lpni->lpni_peer_net->lpn_peer; + lnet_peer_clear_discovery_error(lp); + + /* + * We're willing to be interrupted. The lpni can become a + * zombie if we race with DLC, so we must check for that. + */ + for (;;) { + /* Keep lp alive when the lnet_net_lock is unlocked */ + lnet_peer_addref_locked(lp); + prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + break; + /* + * Don't repeat discovery if discovery is disabled. This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; + if (lp->lp_dc_error) + break; + if (lnet_peer_is_uptodate(lp)) + break; + lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); + + /* + * If caller requested a non-blocking operation then + * return immediately. Once discovery is complete any + * pending messages that were stopped due to discovery + * will be transmitted. + */ + if (!block) + break; + + lnet_net_unlock(LNET_LOCK_EX); + schedule(); + finish_wait(&lp->lp_dc_waitq, &wait); + lnet_net_lock(LNET_LOCK_EX); + lnet_peer_decref_locked(lp); + /* Peer may have changed */ + lp = lpni->lpni_peer_net->lpn_peer; + } + finish_wait(&lp->lp_dc_waitq, &wait); + + lnet_net_unlock(LNET_LOCK_EX); + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + /* + * The peer may have changed, so re-check and rediscover if that turns + * out to have been the case. The reference count on lp ensured that + * even if it was unlinked from lpni the memory could not be recycled. + * Thus the check below is sufficient to determine whether the peer + * changed. If the peer changed, then lp must not be dereferenced. + */ + if (lp != lpni->lpni_peer_net->lpn_peer) + goto again; + + if (signal_pending(current)) + rc = -EINTR; + else if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + rc = -ESHUTDOWN; + else if (lp->lp_dc_error) + rc = lp->lp_dc_error; + else if (!block) + CDEBUG(D_NET, "non-blocking discovery\n"); + else if (!lnet_peer_is_uptodate(lp) && !lnet_is_discovery_disabled(lp)) + goto again; + + CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", + (lp ? libcfs_nid2str(lp->lp_primary_nid) : "(none)"), + libcfs_nid2str(lpni->lpni_nid), rc, + (!block) ? 
"pending discovery" : "discovery complete"); + + return rc; +} + +/* Handle an incoming ack for a push. */ +static void +lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_push_error = ev->status; + if (ev->status) + lp->lp_state |= LNET_PEER_PUSH_FAILED; + else + lp->lp_node_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + spin_unlock(&lp->lp_lock); + + CDEBUG(D_NET, "peer %s ev->status %d\n", + libcfs_nid2str(lp->lp_primary_nid), ev->status); +} + +/* Handle a Reply message. This is the reply to a Ping message. */ +static void +lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) +{ + struct lnet_ping_buffer *pbuf; + int rc; + + spin_lock(&lp->lp_lock); + + lp->lp_disc_src_nid = ev->target.nid; + + /* + * If some kind of error happened the contents of message + * cannot be used. Set PING_FAILED to trigger a retry. + */ + if (ev->status) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + CDEBUG(D_NET, "Ping Reply error %d from %s (source %s)\n", + ev->status, + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(ev->source.nid)); + goto out; + } + + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); + + /* + * A reply with invalid or corrupted info. Set PING_FAILED to + * trigger a retry. + */ + rc = lnet_ping_info_validate(&pbuf->pb_info); + if (rc) { + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Corrupted Ping Reply from %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + goto out; + } + + /* The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) && + !lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } + + /* + * Update the MULTI_RAIL flag based on the reply. If the peer + * was configured with DLC then the setting should match what + * DLC put in. 
+ */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Reply says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("DLC says %s is Multi-Rail, Reply says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else { + CERROR("Multi-Rail state vanished from %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_MULTI_RAIL; + } + } + + /* + * Make sure we'll allocate the correct size ping buffer when + * pinging the peer. + */ + if (lp->lp_data_nnis < pbuf->pb_info.pi_nnis) + lp->lp_data_nnis = pbuf->pb_info.pi_nnis; + + /* + * Check for truncation of the Reply. Clear PING_SENT and set + * PING_FAILED to trigger a retry. + */ + if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) + the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + CDEBUG(D_NET, "Truncated Reply from %s (%d nids)\n", + libcfs_nid2str(lp->lp_primary_nid), + pbuf->pb_info.pi_nnis); + goto out; + } + + /* + * Check the sequence numbers in the reply. These are only + * available if the reply came from a Multi-Rail peer. + */ + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && + pbuf->pb_info.pi_nnis > 1 && + lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid) { + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", + libcfs_nid2str(lp->lp_primary_nid), + LNET_PING_BUFFER_SEQNO(pbuf), + lp->lp_peer_seqno); + + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } + + /* We're happy with the state of the data in the buffer. */ + CDEBUG(D_NET, "peer %s data present %u\n", + libcfs_nid2str(lp->lp_primary_nid), lp->lp_peer_seqno); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + lnet_ping_buffer_decref(lp->lp_data); + else + lp->lp_state |= LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_addref(pbuf); + lp->lp_data = pbuf; +out: + lp->lp_state &= ~LNET_PEER_PING_SENT; + spin_unlock(&lp->lp_lock); +} + +/* + * Send event handling. Only matters for error cases, where we clean + * up state on the peer and peer_ni that would otherwise be updated in + * the REPLY event handler for a successful Ping, and the ACK event + * handler for a successful Push. 
+ */ +static int +lnet_discovery_event_send(struct lnet_peer *lp, struct lnet_event *ev) +{ + int rc = 0; + + if (!ev->status) + goto out; + + spin_lock(&lp->lp_lock); + if (ev->msg_type == LNET_MSG_GET) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = ev->status; + } else { /* ev->msg_type == LNET_MSG_PUT */ + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = ev->status; + } + spin_unlock(&lp->lp_lock); + rc = LNET_REDISCOVER_PEER; +out: + CDEBUG(D_NET, "%s Send to %s: %d\n", + (ev->msg_type == LNET_MSG_GET ? "Ping" : "Push"), + libcfs_nid2str(ev->target.nid), rc); + return rc; +} + +/* + * Unlink event handling. This event is only seen if a call to + * LNetMDUnlink() caused the event to be unlinked. If this call was + * made after the event was set up in LNetGet() or LNetPut() then we + * assume the Ping or Push timed out. + */ +static void +lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev) +{ + spin_lock(&lp->lp_lock); + /* We've passed through LNetGet() */ + if (lp->lp_state & LNET_PEER_PING_SENT) { + lp->lp_state &= ~LNET_PEER_PING_SENT; + lp->lp_state |= LNET_PEER_PING_FAILED; + lp->lp_ping_error = -ETIMEDOUT; + CDEBUG(D_NET, "Ping Unlink for message to peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + } + /* We've passed through LNetPut() */ + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + lp->lp_state &= ~LNET_PEER_PUSH_SENT; + lp->lp_state |= LNET_PEER_PUSH_FAILED; + lp->lp_push_error = -ETIMEDOUT; + CDEBUG(D_NET, "Push Unlink for message to peer %s\n", + libcfs_nid2str(lp->lp_primary_nid)); + } + spin_unlock(&lp->lp_lock); +} + +/* + * Event handler for the discovery EQ. + * + * Called with lnet_res_lock(cpt) held. The cpt is the + * lnet_cpt_of_cookie() of the md handle cookie. + */ +static void lnet_discovery_event_handler(struct lnet_event *event) +{ + struct lnet_peer *lp = event->md.user_ptr; + struct lnet_ping_buffer *pbuf; + int rc; + + /* discovery needs to take another look */ + rc = LNET_REDISCOVER_PEER; + + CDEBUG(D_NET, "Received event: %d\n", event->type); + + switch (event->type) { + case LNET_EVENT_ACK: + lnet_discovery_event_ack(lp, event); + break; + case LNET_EVENT_REPLY: + lnet_discovery_event_reply(lp, event); + break; + case LNET_EVENT_SEND: + /* Only send failure triggers a retry. */ + rc = lnet_discovery_event_send(lp, event); + break; + case LNET_EVENT_UNLINK: + /* LNetMDUnlink() was called */ + lnet_discovery_event_unlink(lp, event); + break; + default: + /* Invalid events. */ + LBUG(); + } + lnet_net_lock(LNET_LOCK_EX); + if (event->unlinked) { + pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + lnet_ping_buffer_decref(pbuf); + lnet_peer_decref_locked(lp); + } + + /* put peer back at end of request queue, if discovery not already + * done */ + if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) && + lnet_peer_queue_for_discovery(lp)) { + list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); + wake_up(&the_lnet.ln_dc_waitq); + } + lnet_net_unlock(LNET_LOCK_EX); +} + +/* + * Build a peer from incoming data. + * + * The NIDs in the incoming data are supposed to be structured as follows: + * - loopback + * - primary NID + * - other NIDs in same net + * - NIDs in second net + * - NIDs in third net + * - ... + * This due to the way the list of NIDs in the data is created. + * + * Note that this function will mark the peer uptodate unless an + * ENOMEM is encontered. 
All other errors are due to a conflict + * between the DLC configuration and what discovery sees. We treat DLC + * as binding, and therefore set the NIDS_UPTODATE flag to prevent the + * peer from becoming stuck in discovery. + */ +static int lnet_peer_merge_data(struct lnet_peer *lp, + struct lnet_ping_buffer *pbuf) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t *curnis = NULL; + lnet_nid_t *addnis = NULL; + lnet_nid_t *delnis = NULL; + unsigned flags; + int ncurnis; + int naddnis; + int ndelnis; + int nnis = 0; + int i; + int j; + int rc; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + nnis = MAX(lp->lp_nnis, pbuf->pb_info.pi_nnis); + LIBCFS_ALLOC(curnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_ALLOC(addnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_ALLOC(delnis, nnis * sizeof(lnet_nid_t)); + if (!curnis || !addnis || !delnis) { + rc = -ENOMEM; + goto out; + } + ncurnis = 0; + naddnis = 0; + ndelnis = 0; + + /* Construct the list of NIDs present in peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + curnis[ncurnis++] = lpni->lpni_nid; + + /* + * Check for NIDs in pbuf not present in curnis[]. + * The loop starts at 1 to skip the loopback NID. + */ + for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { + for (j = 0; j < ncurnis; j++) + if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) + break; + if (j == ncurnis) + addnis[naddnis++] = pbuf->pb_info.pi_ni[i].ns_nid; + } + /* + * Check for NIDs in curnis[] not present in pbuf. + * The nested loop starts at 1 to skip the loopback NID. + * + * But never add the loopback NID to delnis[]: if it is + * present in curnis[] then this peer is for this node. + */ + for (i = 0; i < ncurnis; i++) { + if (curnis[i] == LNET_NID_LO_0) + continue; + for (j = 1; j < pbuf->pb_info.pi_nnis; j++) + if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) + break; + if (j == pbuf->pb_info.pi_nnis) + delnis[ndelnis++] = curnis[i]; + } + + rc = 0; + if (lnet_is_discovery_disabled(lp)) + goto out; + + for (i = 0; i < naddnis; i++) { + rc = lnet_peer_add_nid(lp, addnis[i], flags); + if (rc) { + CERROR("Error adding NID %s to peer %s: %d\n", + libcfs_nid2str(addnis[i]), + libcfs_nid2str(lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + for (i = 0; i < ndelnis; i++) { + rc = lnet_peer_del_nid(lp, delnis[i], flags); + if (rc) { + CERROR("Error deleting NID %s from peer %s: %d\n", + libcfs_nid2str(delnis[i]), + libcfs_nid2str(lp->lp_primary_nid), rc); + if (rc == -ENOMEM) + goto out; + } + } + /* + * Errors other than -ENOMEM are due to peers having been + * configured with DLC. Ignore these because DLC overrides + * Discovery. + */ + rc = 0; +out: + LIBCFS_FREE(curnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_FREE(addnis, nnis * sizeof(lnet_nid_t)); + LIBCFS_FREE(delnis, nnis * sizeof(lnet_nid_t)); + lnet_ping_buffer_decref(pbuf); + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + + if (rc) { + spin_lock(&lp->lp_lock); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; + lp->lp_state |= LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + } + return rc; +} + +/* + * The data in pbuf says lp is its primary peer, but the data was + * received by a different peer. Try to update lp with the data. + */ +static int +lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) +{ + struct lnet_handle_md mdh; + + /* Queue lp for discovery, and force it on the request queue. 
*/ + lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_queue_for_discovery(lp)) + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_request); + lnet_net_unlock(LNET_LOCK_EX); + + LNetInvalidateMDHandle(&mdh); + + /* + * Decide whether we can move the peer to the DATA_PRESENT state. + * + * We replace stale data for a multi-rail peer, repair PING_FAILED + * status, and preempt FORCE_PING. + * + * If after that we have DATA_PRESENT, we merge it into this peer. + */ + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + if (lp->lp_peer_seqno < LNET_PING_BUFFER_SEQNO(pbuf)) { + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + } else if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lnet_ping_buffer_decref(pbuf); + pbuf = lp->lp_data; + lp->lp_data = NULL; + } + } + if (lp->lp_state & LNET_PEER_DATA_PRESENT) { + lnet_ping_buffer_decref(lp->lp_data); + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + } + if (lp->lp_state & LNET_PEER_PING_FAILED) { + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_ping_error = 0; + } + if (lp->lp_state & LNET_PEER_FORCE_PING) + lp->lp_state &= ~LNET_PEER_FORCE_PING; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + if (pbuf) + return lnet_peer_merge_data(lp, pbuf); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + return 0; +} + +static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) +{ + int i; + + for (i = 0; i < pinfo->pi_nnis; i++) { + if (pinfo->pi_ni[i].ns_nid == nid) + return true; + } + + return false; +} + +/* + * Update a peer using the data received. + */ +static int lnet_peer_data_present(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_peer_ni *lpni; + lnet_nid_t nid = LNET_NID_ANY; + unsigned flags; + int rc = 0; + + pbuf = lp->lp_data; + lp->lp_data = NULL; + lp->lp_state &= ~LNET_PEER_DATA_PRESENT; + lp->lp_state |= LNET_PEER_NIDS_UPTODATE; + spin_unlock(&lp->lp_lock); + + /* + * Modifications of peer structures are done while holding the + * ln_api_mutex. A global lock is required because we may be + * modifying multiple peer structures, and a mutex greatly + * simplifies memory management. + * + * The actual changes to the data structures must also protect + * against concurrent lookups, for which the lnet_net_lock in + * LNET_LOCK_EX mode is used. + */ + mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } + + /* + * If this peer is not on the peer list then it is being torn + * down, and our reference count may be all that is keeping it + * alive. Don't do any work on it. + */ + if (list_empty(&lp->lp_peer_list)) + goto out; + + flags = LNET_PEER_DISCOVERED; + if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) + flags |= LNET_PEER_MULTI_RAIL; + + /* + * Check whether the primary NID in the message matches the + * primary NID of the peer. If it does, update the peer; if + * it does not, check whether there is already a peer with + * that primary NID. If no such peer exists, try to update + * the primary NID of the current peer (allowed if it was + * created due to message traffic) and complete the update. + * If the peer did exist, hand off the data to it. 
+ * + * The peer for the loopback interface is a special case: this + * is the peer for the local node, and we want to set its + * primary NID to the correct value here. Moreover, this peer + * can show up with only the loopback NID in the ping buffer. + */ + if (pbuf->pb_info.pi_nnis <= 1) + goto out; + nid = pbuf->pb_info.pi_ni[1].ns_nid; + if (lp->lp_primary_nid == LNET_NID_LO_0) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (!rc) + rc = lnet_peer_merge_data(lp, pbuf); + } else if (lp->lp_primary_nid == nid || + (lnet_is_nid_in_ping_info(lp->lp_primary_nid, &pbuf->pb_info) && + lnet_is_discovery_disabled(lp))) { + rc = lnet_peer_merge_data(lp, pbuf); + } else { + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = lnet_peer_set_primary_nid(lp, nid, flags); + if (rc) { + CERROR("Primary NID error %s versus %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(nid), rc); + } else { + rc = lnet_peer_merge_data(lp, pbuf); + } + } else { + struct lnet_peer *new_lp; + + new_lp = lpni->lpni_peer_net->lpn_peer; + /* if lp has discovery/MR enabled that means new_lp + * should have discovery/MR enabled as well, since + * it's the same peer, which we're about to merge + */ + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + rc = lnet_peer_set_primary_data( + lpni->lpni_peer_net->lpn_peer, pbuf); + lnet_peer_ni_decref_locked(lpni); + } + } +out: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + /* Tell discovery to re-check the peer immediately. */ + if (!rc) + rc = LNET_REDISCOVER_PEER; + return rc; +} + +/* + * A ping failed. Clear the PING_FAILED state and set the + * FORCE_PING state, to ensure a retry even if discovery is + * disabled. This avoids being left with incorrect state. + */ +static int lnet_peer_ping_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + lp->lp_state &= ~LNET_PEER_PING_FAILED; + lp->lp_state |= LNET_PEER_FORCE_PING; + rc = lp->lp_ping_error; + lp->lp_ping_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s:%d\n", + libcfs_nid2str(lp->lp_primary_nid), rc); + + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Select NID to send a Ping or Push to. + */ +static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni; + + /* Look for a direct-connected NID for this peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + break; + } + if (lpni) + return lpni->lpni_nid; + + /* Look for a routed-connected NID for this peer. */ + lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + if (!lnet_find_rnet_locked(lpni->lpni_peer_net->lpn_net_id)) + continue; + break; + } + if (lpni) + return lpni->lpni_nid; + + return LNET_NID_ANY; +} + +/* Active side of ping. 
*/ +static int lnet_peer_send_ping(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lnet_nid_t pnid; + int nnis; + int rc; + int cpt; + + lp->lp_state |= LNET_PEER_PING_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PING; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + pnid = lnet_peer_select_nid(lp); + lnet_net_unlock(cpt); + + nnis = MAX(lp->lp_data_nnis, LNET_INTERFACES_MIN); + + rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, + the_lnet.ln_dc_eqh, false); + + /* + * if LNetMDBind in lnet_send_ping fails we need to decrement the + * refcount on the peer, otherwise LNetMDUnlink will be called + * which will eventually do that. + */ + if (rc > 0) { + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + lnet_net_unlock(cpt); + rc = -rc; /* change the rc to negative value */ + goto fail_error; + } else if (rc < 0) { + goto fail_error; + } + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PING_SENT, but do + * not set either PING_FAILED or FORCE_PING. In fact we need + * to clear PING_FAILED, because the unlink event handler will + * have set it if we called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PING_SENT | LNET_PEER_PING_FAILED); + return rc; +} + +/* + * This function exists because you cannot call LNetMDUnlink() from an + * event handler. + */ +static int lnet_peer_push_failed(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_handle_md mdh; + int rc; + + mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + lp->lp_state &= ~LNET_PEER_PUSH_FAILED; + rc = lp->lp_push_error; + lp->lp_push_error = 0; + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(mdh)) + LNetMDUnlink(mdh); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + spin_lock(&lp->lp_lock); + return rc ? rc : LNET_REDISCOVER_PEER; +} + +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + return 0; +} + +/* Active side of push. */ +static int lnet_peer_send_push(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct lnet_ping_buffer *pbuf; + struct lnet_process_id id; + struct lnet_md md; + int cpt; + int rc; + + /* Don't push to a non-multi-rail peer. 
*/ + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + + return 0; + } + + lp->lp_state |= LNET_PEER_PUSH_SENT; + lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + + cpt = lnet_net_lock_current(); + pbuf = the_lnet.ln_ping_target; + lnet_ping_buffer_addref(pbuf); + lnet_net_unlock(cpt); + + /* Push source MD */ + md.start = &pbuf->pb_info; + md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); + md.threshold = 2; /* Put/Ack */ + md.max_size = 0; + md.options = 0; + md.eq_handle = the_lnet.ln_dc_eqh; + md.user_ptr = lp; + + rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_push_mdh); + if (rc) { + lnet_ping_buffer_decref(pbuf); + CERROR("Can't bind push source MD: %d\n", rc); + goto fail_error; + } + cpt = lnet_net_lock_current(); + /* Refcount for MD. */ + lnet_peer_addref_locked(lp); + id.pid = LNET_PID_LUSTRE; + id.nid = lnet_peer_select_nid(lp); + lnet_net_unlock(cpt); + + if (id.nid == LNET_NID_ANY) { + rc = -EHOSTUNREACH; + goto fail_unlink; + } + + rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, + LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0, 0); + + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_NID_ANY; + + if (rc) + goto fail_unlink; + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + spin_lock(&lp->lp_lock); + return 0; + +fail_unlink: + LNetMDUnlink(lp->lp_push_mdh); + LNetInvalidateMDHandle(&lp->lp_push_mdh); +fail_error: + CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + /* + * The errors that get us here are considered hard errors and + * cause Discovery to terminate. So we clear PUSH_SENT, but do + * not set PUSH_FAILED. In fact we need to clear PUSH_FAILED, + * because the unlink event handler will have set it if we + * called LNetMDUnlink() above. + */ + spin_lock(&lp->lp_lock); + lp->lp_state &= ~(LNET_PEER_PUSH_SENT | LNET_PEER_PUSH_FAILED); + return rc; +} + +/* + * An unrecoverable error was encountered during discovery. + * Set error status in peer and abort discovery. + */ +static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) +{ + CDEBUG(D_NET, "Discovery error %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), error); + + spin_lock(&lp->lp_lock); + lp->lp_dc_error = error; + lp->lp_state &= ~LNET_PEER_DISCOVERING; + lp->lp_state |= LNET_PEER_REDISCOVER; + spin_unlock(&lp->lp_lock); +} + +/* + * Discovering this peer is taking too long. Cancel any Ping or Push + * that discovery is waiting on by unlinking the relevant MDs. The + * lnet_discovery_event_handler() will proceed from here and complete + * the cleanup. 
*/ +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) +{ + struct lnet_handle_md ping_mdh; + struct lnet_handle_md push_mdh; + + LNetInvalidateMDHandle(&ping_mdh); + LNetInvalidateMDHandle(&push_mdh); + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_PING_SENT) { + ping_mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + } + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + push_mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + } + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(ping_mdh)) + LNetMDUnlink(ping_mdh); + if (!LNetMDHandleIsInvalid(push_mdh)) + LNetMDUnlink(push_mdh); +} + +/* + * Wait for work to be queued or some other change that must be + * attended to. Returns non-zero if the discovery thread should shut + * down. + */ +static int lnet_peer_discovery_wait_for_work(void) +{ + int cpt; + int rc = 0; + + DEFINE_WAIT(wait); + + cpt = lnet_net_lock_current(); + for (;;) { + prepare_to_wait(&the_lnet.ln_dc_waitq, &wait, + TASK_INTERRUPTIBLE); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + if (lnet_push_target_resize_needed()) + break; + if (!list_empty(&the_lnet.ln_dc_request)) + break; + if (!list_empty(&the_lnet.ln_msg_resend)) + break; + lnet_net_unlock(cpt); + + /* + * wake up at most once per second to check for peers that + * have been stuck on the working queue for longer than + * the peer timeout. + */ + schedule_timeout(cfs_time_seconds(1)); + finish_wait(&the_lnet.ln_dc_waitq, &wait); + cpt = lnet_net_lock_current(); + } + finish_wait(&the_lnet.ln_dc_waitq, &wait); + + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + rc = -ESHUTDOWN; + + lnet_net_unlock(cpt); + + CDEBUG(D_NET, "woken: %d\n", rc); + + return rc; +} + +/* + * Messages that were pending on a destroyed peer will be put on a global + * resend list. The resend list is checked by + * the discovery thread when it wakes up, and any messages on it are resent. These + * messages can still be sendable if the lpni that initially caused the + * message re-queue was transferred to another peer. + * + * It is possible that LNet could be shut down while we're iterating + * through the list. lnet_shutdown_lndnets() will attempt to access the + * resend list, but will have to wait until the spinlock is released, by + * which time there shouldn't be any more messages on the resend list. + * During shutdown lnet_send() will fail and lnet_finalize() will be called + * for the messages so they can be released. The other case is that + * lnet_shutdown_lndnets() can finalize all the messages before this + * function can visit the resend list, in which case this function will be + * a no-op. + */ +static void lnet_resend_msgs(void) +{ + struct lnet_msg *msg, *tmp; + struct list_head resend; + int rc; + + INIT_LIST_HEAD(&resend); + + spin_lock(&the_lnet.ln_msg_resend_lock); + list_splice(&the_lnet.ln_msg_resend, &resend); + spin_unlock(&the_lnet.ln_msg_resend_lock); + + list_for_each_entry_safe(msg, tmp, &resend, msg_list) { + list_del_init(&msg->msg_list); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); + if (rc < 0) { + CNETERR("Error sending %s to %s: %d\n", + lnet_msgtyp2str(msg->msg_type), + libcfs_id2str(msg->msg_target), rc); + lnet_finalize(msg, rc); + } + } +} + +/* The discovery thread. 
*/ +static int lnet_peer_discovery(void *arg) +{ + struct lnet_peer *lp; + int rc; + + CDEBUG(D_NET, "started\n"); + cfs_block_allsigs(); + + for (;;) { + if (lnet_peer_discovery_wait_for_work()) + break; + + lnet_resend_msgs(); + + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + + lnet_net_lock(LNET_LOCK_EX); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) { + lnet_net_unlock(LNET_LOCK_EX); + break; + } + + /* + * Process all incoming discovery work requests. When + * discovery must wait on a peer to change state, it + * is added to the tail of the ln_dc_working queue. A + * timestamp keeps track of when the peer was added, + * so we can time out discovery requests that take too + * long. + */ + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_working); + /* + * set the time the peer was put on the dc_working + * queue. It shouldn't remain on the queue + * forever, in case the GET message (for ping) + * doesn't get a REPLY or the PUT message (for + * push) doesn't get an ACK. + */ + lp->lp_last_queued = ktime_get_real_seconds(); + lnet_net_unlock(LNET_LOCK_EX); + + /* + * Select an action depending on the state of + * the peer and whether discovery is disabled. + * The check whether discovery is disabled is + * done after the code that handles processing + * for arrived data, cleanup for failures, and + * forcing a Ping or Push. + */ + spin_lock(&lp->lp_lock); + CDEBUG(D_NET, "peer %s state %#x\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state); + if (lp->lp_state & LNET_PEER_DATA_PRESENT) + rc = lnet_peer_data_present(lp); + else if (lp->lp_state & LNET_PEER_PING_FAILED) + rc = lnet_peer_ping_failed(lp); + else if (lp->lp_state & LNET_PEER_PUSH_FAILED) + rc = lnet_peer_push_failed(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PING) + rc = lnet_peer_send_ping(lp); + else if (lp->lp_state & LNET_PEER_FORCE_PUSH) + rc = lnet_peer_send_push(lp); + else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) + rc = lnet_peer_send_ping(lp); + else if (lnet_peer_needs_push(lp)) + rc = lnet_peer_send_push(lp); + else + rc = lnet_peer_discovered(lp); + CDEBUG(D_NET, "peer %s state %#x rc %d\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state, rc); + spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + if (rc == LNET_REDISCOVER_PEER) { + list_move(&lp->lp_dc_list, + &the_lnet.ln_dc_request); + } else if (rc) { + lnet_peer_discovery_error(lp, rc); + } + if (!(lp->lp_state & LNET_PEER_DISCOVERING)) + lnet_peer_discovery_complete(lp); + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + break; + } + + lnet_net_unlock(LNET_LOCK_EX); + } + + CDEBUG(D_NET, "stopping\n"); + /* + * Clean up before telling lnet_peer_discovery_stop() that + * we're done. Use wake_up() below to somewhat reduce the + * size of the thundering herd if there are multiple threads + * waiting on discovery of a single peer. + */ + + /* Queue cleanup 1: stop all pending pings and pushes. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_working)) { + lp = list_first_entry(&the_lnet.ln_dc_working, + struct lnet_peer, lp_dc_list); + list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); + lnet_net_unlock(LNET_LOCK_EX); + lnet_peer_cancel_discovery(lp); + lnet_net_lock(LNET_LOCK_EX); + } + lnet_net_unlock(LNET_LOCK_EX); + + /* Queue cleanup 2: wait for the expired queue to clear. 
*/ + while (!list_empty(&the_lnet.ln_dc_expired)) + schedule_timeout(cfs_time_seconds(1)); + + /* Queue cleanup 3: clear the request queue. */ + lnet_net_lock(LNET_LOCK_EX); + while (!list_empty(&the_lnet.ln_dc_request)) { + lp = list_first_entry(&the_lnet.ln_dc_request, + struct lnet_peer, lp_dc_list); + lnet_peer_discovery_error(lp, -ESHUTDOWN); + lnet_peer_discovery_complete(lp); + } + lnet_net_unlock(LNET_LOCK_EX); + + LNetEQFree(the_lnet.ln_dc_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + wake_up(&the_lnet.ln_dc_waitq); + + CDEBUG(D_NET, "stopped\n"); + + return 0; +} + +/* ln_api_mutex is held on entry. */ +int lnet_peer_discovery_start(void) +{ + struct task_struct *task; + int rc; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) + return -EALREADY; + + rc = LNetEQAlloc(0, lnet_discovery_event_handler, &the_lnet.ln_dc_eqh); + if (rc != 0) { + CERROR("Can't allocate discovery EQ: %d\n", rc); + return rc; + } + + the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; + task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("Can't start peer discovery thread: %d\n", rc); + + LNetEQFree(the_lnet.ln_dc_eqh); + LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; + } + + CDEBUG(D_NET, "discovery start: %d\n", rc); + + return rc; +} + +/* ln_api_mutex is held on entry. */ +void lnet_peer_discovery_stop(void) +{ + if (the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN) + return; + + LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); + the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; + wake_up(&the_lnet.ln_dc_waitq); + + wait_event(the_lnet.ln_dc_waitq, + the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + + LASSERT(list_empty(&the_lnet.ln_dc_request)); + LASSERT(list_empty(&the_lnet.ln_dc_working)); + LASSERT(list_empty(&the_lnet.ln_dc_expired)); + + CDEBUG(D_NET, "discovery stopped\n"); +} + +/* Debugging */ + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + struct lnet_peer_ni *lp; + int cpt; + + cpt = lnet_cpt_of_nid(nid, NULL); + lnet_net_lock(cpt); + + lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); if (IS_ERR(lp)) { lnet_net_unlock(cpt); CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); @@ -1106,6 +3363,8 @@ lnet_debug_peer(lnet_nid_t nid) lnet_net_unlock(cpt); } +/* Gathering information for userspace. */ + int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, char aliveness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, @@ -1169,56 +3428,193 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, return found ? 
0 : -ENOENT; } -int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, - bool *mr, - struct lnet_peer_ni_credit_info __user *peer_ni_info, - struct lnet_ioctl_element_stats __user *peer_ni_stats) +/* ln_api_mutex is held, which keeps the peer list stable */ +int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) { - struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_net *lpn = NULL; - struct lnet_peer *lp = NULL; - struct lnet_peer_ni_credit_info ni_info; - struct lnet_ioctl_element_stats ni_stats; + struct lnet_ioctl_element_stats *lpni_stats; + struct lnet_ioctl_element_msg_stats *lpni_msg_stats; + struct lnet_ioctl_peer_ni_hstats *lpni_hstats; + struct lnet_peer_ni_credit_info *lpni_info; + struct lnet_peer_ni *lpni; + struct lnet_peer *lp; + lnet_nid_t nid; + __u32 size; int rc; - lpni = lnet_get_peer_ni_idx_locked(idx, &lpn, &lp); - - if (!lpni) - return -ENOENT; + lp = lnet_find_peer(cfg->prcfg_prim_nid); - *primary_nid = lp->lp_primary_nid; - *mr = lp->lp_multi_rail; - *nid = lpni->lpni_nid; - snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, "NA"); - if (lnet_isrouter(lpni) || - lnet_peer_aliveness_enabled(lpni)) - snprintf(ni_info.cr_aliveness, LNET_MAX_STR_LEN, - lpni->lpni_alive ? "up" : "down"); - - ni_info.cr_refcount = atomic_read(&lpni->lpni_refcount); - ni_info.cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? - lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; - ni_info.cr_peer_tx_credits = lpni->lpni_txcredits; - ni_info.cr_peer_rtr_credits = lpni->lpni_rtrcredits; - ni_info.cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; - ni_info.cr_peer_min_tx_credits = lpni->lpni_mintxcredits; - ni_info.cr_peer_tx_qnob = lpni->lpni_txqnob; - ni_info.cr_ncpt = lpni->lpni_cpt; - - ni_stats.iel_send_count = atomic_read(&lpni->lpni_stats.send_count); - ni_stats.iel_recv_count = atomic_read(&lpni->lpni_stats.recv_count); - ni_stats.iel_drop_count = atomic_read(&lpni->lpni_stats.drop_count); - - /* If copy_to_user fails */ - rc = -EFAULT; - if (copy_to_user(peer_ni_info, &ni_info, sizeof(ni_info))) - goto copy_failed; + if (!lp) { + rc = -ENOENT; + goto out; + } - if (copy_to_user(peer_ni_stats, &ni_stats, sizeof(ni_stats))) - goto copy_failed; + size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) + + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); + size *= lp->lp_nnis; + if (size > cfg->prcfg_size) { + cfg->prcfg_size = size; + rc = -E2BIG; + goto out_lp_decref; + } + cfg->prcfg_prim_nid = lp->lp_primary_nid; + cfg->prcfg_mr = lnet_peer_is_multi_rail(lp); + cfg->prcfg_cfg_nid = lp->lp_primary_nid; + cfg->prcfg_count = lp->lp_nnis; + cfg->prcfg_size = size; + cfg->prcfg_state = lp->lp_state; + + /* Allocate helper buffers. 
*/ + rc = -ENOMEM; + LIBCFS_ALLOC(lpni_info, sizeof(*lpni_info)); + if (!lpni_info) + goto out_lp_decref; + LIBCFS_ALLOC(lpni_stats, sizeof(*lpni_stats)); + if (!lpni_stats) + goto out_free_info; + LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); + if (!lpni_msg_stats) + goto out_free_stats; + LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); + if (!lpni_hstats) + goto out_free_msg_stats; + + + lpni = NULL; + rc = -EFAULT; + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { + nid = lpni->lpni_nid; + if (copy_to_user(bulk, &nid, sizeof(nid))) + goto out_free_hstats; + bulk += sizeof(nid); + + memset(lpni_info, 0, sizeof(*lpni_info)); + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA"); + if (lnet_isrouter(lpni) || + lnet_peer_aliveness_enabled(lpni)) + snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, + lpni->lpni_alive ? "up" : "down"); + + lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount); + lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? + lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; + lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; + lpni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits; + lpni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits; + lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; + lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; + if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) + goto out_free_hstats; + bulk += sizeof(*lpni_info); + + memset(lpni_stats, 0, sizeof(*lpni_stats)); + lpni_stats->iel_send_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_SEND); + lpni_stats->iel_recv_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_RECV); + lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, + LNET_STATS_TYPE_DROP); + if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_stats); + lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); + if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) + goto out_free_hstats; + bulk += sizeof(*lpni_msg_stats); + lpni_hstats->hlpni_network_timeout = + atomic_read(&lpni->lpni_hstats.hlt_network_timeout); + lpni_hstats->hlpni_remote_dropped = + atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); + lpni_hstats->hlpni_remote_timeout = + atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); + lpni_hstats->hlpni_remote_error = + atomic_read(&lpni->lpni_hstats.hlt_remote_error); + lpni_hstats->hlpni_health_value = + atomic_read(&lpni->lpni_healthv); + if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) + goto out_free_hstats; + bulk += sizeof(*lpni_hstats); + } rc = 0; -copy_failed: +out_free_hstats: + LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); +out_free_msg_stats: + LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); +out_free_stats: + LIBCFS_FREE(lpni_stats, sizeof(*lpni_stats)); +out_free_info: + LIBCFS_FREE(lpni_info, sizeof(*lpni_info)); +out_lp_decref: + lnet_peer_decref_locked(lp); +out: return rc; } + +void +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni) +{ + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return; + + if (list_empty(&lpni->lpni_recovery) && + atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) { + CDEBUG(D_NET, "lpni %s added to recovery queue. 
Health = %d\n", + libcfs_nid2str(lpni->lpni_nid), + atomic_read(&lpni->lpni_healthv)); + list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq); + lnet_peer_ni_addref_locked(lpni); + } +} + +/* Call with the ln_api_mutex held */ +void +lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int lncpt; + int cpt; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return; + + if (!all) { + lnet_net_lock(LNET_LOCK_EX); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + atomic_set(&lpni->lpni_healthv, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + return; + } + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Walk all the peers and reset the health value for each one to the + * maximum value. + */ + lnet_net_lock(LNET_LOCK_EX); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_peer_nis) { + atomic_set(&lpni->lpni_healthv, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni); + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} + diff --git a/drivers/staging/lustrefsx/lnet/lnet/router.c b/drivers/staging/lustrefsx/lnet/lnet/router.c index bd30963a960d1..e2966cf77c561 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
* * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -68,9 +68,6 @@ lnet_peer_buffer_credits(struct lnet_net *net) return net->net_tunables.lct_peer_tx_credits; } -/* forward ref's */ -static int lnet_router_checker(void *); - static int check_routers_before_use; module_param(check_routers_before_use, int, 0444); MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); @@ -99,9 +96,9 @@ lnet_peers_start_down(void) void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - cfs_time_t when) + time64_t when) { - if (cfs_time_before(when, lp->lpni_timestamp)) { /* out of date information */ + if (lp->lpni_timestamp > when) { /* out of date information */ CDEBUG(D_NET, "Out of date\n"); return; } @@ -114,7 +111,7 @@ lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, */ spin_lock(&lp->lpni_lock); - lp->lpni_timestamp = when; /* update timestamp */ + lp->lpni_timestamp = when; /* update timestamp */ lp->lpni_ping_deadline = 0; /* disable ping timeout */ if (lp->lpni_alive_count != 0 && /* got old news */ @@ -334,7 +331,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); if (gateway == LNET_NID_ANY || - LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + gateway == LNET_NID_LO_0 || net == LNET_NIDNET(LNET_NID_ANY) || LNET_NETTYP(net) == LOLND || LNET_NIDNET(gateway) == net || @@ -344,6 +341,13 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, if (lnet_islocalnet(net)) /* it's a local network */ return -EEXIST; + if (!lnet_islocalnet(LNET_NIDNET(gateway))) { + CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n", + libcfs_nid2str(gateway), + libcfs_net2str(LNET_NIDNET(gateway))); + return -EHOSTUNREACH; + } + /* Assume net, route, all new */ LIBCFS_ALLOC(route, sizeof(*route)); LIBCFS_ALLOC(rnet, sizeof(*rnet)); @@ -433,8 +437,8 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, if (rnet != rnet2) LIBCFS_FREE(rnet, sizeof(*rnet)); - /* indicate to startup the router checker if configured */ - wake_up(&the_lnet.ln_rc_waitq); + /* kick start the monitor thread to handle the added route */ + wake_up(&the_lnet.ln_mt_waitq); return rc; } @@ -577,29 +581,29 @@ lnet_destroy_routes (void) lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); } -int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg) +int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg) { + struct lnet_rtrbufpool *rbp; int i, rc = -ENOENT, j; if (the_lnet.ln_rtrpools == NULL) return rc; - for (i = 0; i < LNET_NRBPOOLS; i++) { - struct lnet_rtrbufpool *rbp; - lnet_net_lock(LNET_LOCK_EX); - cfs_percpt_for_each(rbp, j, the_lnet.ln_rtrpools) { - if (i++ != idx) - continue; + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + if (i != cpt) + continue; - pool_cfg->pl_pools[i].pl_npages = rbp[i].rbp_npages; - pool_cfg->pl_pools[i].pl_nbuffers = rbp[i].rbp_nbuffers; - pool_cfg->pl_pools[i].pl_credits = rbp[i].rbp_credits; - pool_cfg->pl_pools[i].pl_mincredits = rbp[i].rbp_mincredits; - rc = 0; - break; + lnet_net_lock(i); + for (j = 0; j < LNET_NRBPOOLS; j++) { + pool_cfg->pl_pools[j].pl_npages = rbp[j].rbp_npages; + pool_cfg->pl_pools[j].pl_nbuffers = rbp[j].rbp_nbuffers; + pool_cfg->pl_pools[j].pl_credits = rbp[j].rbp_credits; + pool_cfg->pl_pools[j].pl_mincredits = rbp[j].rbp_mincredits; } - lnet_net_unlock(LNET_LOCK_EX); + lnet_net_unlock(i); + rc = 0; + break; } 
lnet_net_lock(LNET_LOCK_EX); @@ -650,17 +654,21 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops, } void -lnet_swap_pinginfo(struct lnet_ping_info *info) +lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) { - int i; struct lnet_ni_status *stat; + int nnis; + int i; - __swab32s(&info->pi_magic); - __swab32s(&info->pi_features); - __swab32s(&info->pi_pid); - __swab32s(&info->pi_nnis); - for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { - stat = &info->pi_ni[i]; + __swab32s(&pbuf->pb_info.pi_magic); + __swab32s(&pbuf->pb_info.pi_features); + __swab32s(&pbuf->pb_info.pi_pid); + __swab32s(&pbuf->pb_info.pi_nnis); + nnis = pbuf->pb_info.pi_nnis; + if (nnis > pbuf->pb_nnis) + nnis = pbuf->pb_nnis; + for (i = 0; i < nnis; i++) { + stat = &pbuf->pb_info.pi_ni[i]; __swab64s(&stat->ns_nid); __swab32s(&stat->ns_status); } @@ -674,11 +682,12 @@ lnet_swap_pinginfo(struct lnet_ping_info *info) static void lnet_parse_rc_info(struct lnet_rc_data *rcd) { - struct lnet_ping_info *info = rcd->rcd_pinginfo; + struct lnet_ping_buffer *pbuf = rcd->rcd_pingbuffer; struct lnet_peer_ni *gw = rcd->rcd_gateway; struct lnet_route *rte; + int nnis; - if (!gw->lpni_alive) + if (!gw->lpni_alive || !pbuf) return; /* @@ -687,29 +696,29 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) */ spin_lock(&gw->lpni_lock); - if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) - lnet_swap_pinginfo(info); + if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(pbuf); /* NB always racing with network! */ - if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + if (pbuf->pb_info.pi_magic != LNET_PROTO_PING_MAGIC) { CDEBUG(D_NET, "%s: Unexpected magic %08x\n", - libcfs_nid2str(gw->lpni_nid), info->pi_magic); + libcfs_nid2str(gw->lpni_nid), pbuf->pb_info.pi_magic); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - spin_unlock(&gw->lpni_lock); - return; + goto out; } - gw->lpni_ping_feats = info->pi_features; - if ((gw->lpni_ping_feats & LNET_PING_FEAT_MASK) == 0) { - CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", - libcfs_nid2str(gw->lpni_nid), gw->lpni_ping_feats); - spin_unlock(&gw->lpni_lock); - return; /* nothing I can understand */ - } + gw->lpni_ping_feats = pbuf->pb_info.pi_features; + + /* Without NI status info there's nothing more to do. */ + if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + goto out; - if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) { - spin_unlock(&gw->lpni_lock); - return; /* can't carry NI status info */ + /* Determine the number of NIs for which there is data. */ + nnis = pbuf->pb_info.pi_nnis; + if (pbuf->pb_nnis < nnis) { + if (rcd->rcd_nnis < nnis) + rcd->rcd_nnis = nnis; + nnis = pbuf->pb_nnis; } list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) { @@ -717,24 +726,24 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) int up = 0; int i; + /* If routing disabled then the route is down. 
*/ if ((gw->lpni_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) { rte->lr_downis = 1; continue; } - for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { - struct lnet_ni_status *stat = &info->pi_ni[i]; + for (i = 0; i < nnis; i++) { + struct lnet_ni_status *stat = &pbuf->pb_info.pi_ni[i]; lnet_nid_t nid = stat->ns_nid; if (nid == LNET_NID_ANY) { CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", libcfs_nid2str(gw->lpni_nid)); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - spin_unlock(&gw->lpni_lock); - return; + goto out; } - if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + if (nid == LNET_NID_LO_0) continue; if (stat->ns_status == LNET_NI_STATUS_DOWN) { @@ -753,8 +762,7 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", libcfs_nid2str(gw->lpni_nid), stat->ns_status); gw->lpni_ping_feats = LNET_PING_FEAT_INVAL; - spin_unlock(&gw->lpni_lock); - return; + goto out; } if (up) { /* ignore downed NIs if NI for dest network is up */ @@ -768,7 +776,7 @@ lnet_parse_rc_info(struct lnet_rc_data *rcd) rte->lr_downis = down; } - +out: spin_unlock(&gw->lpni_lock); } @@ -812,7 +820,7 @@ lnet_router_checker_event(struct lnet_event *event) * we ping alive routers to try to detect router death before * apps get burned). */ - lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + lnet_notify_locked(lp, 1, !event->status, ktime_get_seconds()); /* The router checker will wake up very shortly and do the * actual notification. * XXX If 'lp' stops being a router before then, it will still @@ -832,8 +840,9 @@ lnet_wait_known_routerstate(void) struct list_head *entry; int all_known; - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + /* the_lnet.ln_api_mutex must be locked */ for (;;) { int cpt = lnet_net_lock_current(); @@ -857,8 +866,10 @@ lnet_wait_known_routerstate(void) if (all_known) return; + mutex_unlock(&the_lnet.ln_api_mutex); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); + mutex_lock(&the_lnet.ln_api_mutex); } } @@ -881,15 +892,15 @@ static void lnet_update_ni_status_locked(void) { struct lnet_ni *ni = NULL; - long now; - int timeout; + time64_t now; + time64_t timeout; LASSERT(the_lnet.ln_routing); timeout = router_ping_timeout + MAX(live_router_check_interval, dead_router_check_interval); - now = cfs_time_current_sec(); + now = ktime_get_real_seconds(); while ((ni = lnet_get_next_ni_locked(NULL, ni))) { if (ni->ni_net->net_lnd->lnd_type == LOLND) continue; @@ -907,7 +918,7 @@ lnet_update_ni_status_locked(void) LASSERT(ni->ni_status != NULL); if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { - CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + CDEBUG(D_NET, "NI(%s:%lld) status changed to down\n", libcfs_nid2str(ni->ni_nid), timeout); /* NB: so far, this is the only place to set * NI status to "down" */ @@ -932,43 +943,62 @@ lnet_destroy_rc_data(struct lnet_rc_data *rcd) lnet_net_unlock(cpt); } - if (rcd->rcd_pinginfo != NULL) - LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + if (rcd->rcd_pingbuffer != NULL) + lnet_ping_buffer_decref(rcd->rcd_pingbuffer); LIBCFS_FREE(rcd, sizeof(*rcd)); } static struct lnet_rc_data * -lnet_create_rc_data_locked(struct lnet_peer_ni *gateway) +lnet_update_rc_data_locked(struct lnet_peer_ni *gateway) { - struct lnet_rc_data *rcd = NULL; - struct lnet_ping_info *pi; - int rc; - int i; + struct lnet_handle_md mdh; + struct lnet_rc_data *rcd; + struct lnet_ping_buffer *pbuf = NULL; + int nnis = 
LNET_INTERFACES_MIN; + int rc; + int i; + + rcd = gateway->lpni_rcd; + if (rcd) { + nnis = rcd->rcd_nnis; + mdh = rcd->rcd_mdh; + LNetInvalidateMDHandle(&rcd->rcd_mdh); + pbuf = rcd->rcd_pingbuffer; + rcd->rcd_pingbuffer = NULL; + } else { + LNetInvalidateMDHandle(&mdh); + } lnet_net_unlock(gateway->lpni_cpt); - LIBCFS_ALLOC(rcd, sizeof(*rcd)); - if (rcd == NULL) - goto out; + if (rcd) { + LNetMDUnlink(mdh); + lnet_ping_buffer_decref(pbuf); + } else { + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; - LNetInvalidateMDHandle(&rcd->rcd_mdh); - INIT_LIST_HEAD(&rcd->rcd_list); + LNetInvalidateMDHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + rcd->rcd_nnis = nnis; + } - LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); - if (pi == NULL) + pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); + if (pbuf == NULL) goto out; - for (i = 0; i < LNET_MAX_RTR_NIS; i++) { - pi->pi_ni[i].ns_nid = LNET_NID_ANY; - pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + for (i = 0; i < nnis; i++) { + pbuf->pb_info.pi_ni[i].ns_nid = LNET_NID_ANY; + pbuf->pb_info.pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; } - rcd->rcd_pinginfo = pi; + rcd->rcd_pingbuffer = pbuf; LASSERT(!LNetEQHandleIsInvalid(the_lnet.ln_rc_eqh)); - rc = LNetMDBind((struct lnet_md){.start = pi, + rc = LNetMDBind((struct lnet_md){.start = &pbuf->pb_info, .user_ptr = rcd, - .length = LNET_PINGINFO_SIZE, + .length = LNET_PING_INFO_SIZE(nnis), .threshold = LNET_MD_THRESH_INF, .options = LNET_MD_TRUNCATE, .eq_handle = the_lnet.ln_rc_eqh}, @@ -976,33 +1006,37 @@ lnet_create_rc_data_locked(struct lnet_peer_ni *gateway) &rcd->rcd_mdh); if (rc < 0) { CERROR("Can't bind MD: %d\n", rc); - goto out; + goto out_ping_buffer_decref; } LASSERT(rc == 0); lnet_net_lock(gateway->lpni_cpt); - /* router table changed or someone has created rcd for this gateway */ - if (!lnet_isrouter(gateway) || gateway->lpni_rcd != NULL) { - lnet_net_unlock(gateway->lpni_cpt); - goto out; + /* Check if this is still a router. */ + if (!lnet_isrouter(gateway)) + goto out_unlock; + /* Check if someone else installed router data. */ + if (gateway->lpni_rcd && gateway->lpni_rcd != rcd) + goto out_unlock; + + /* Install and/or update the router data. */ + if (!gateway->lpni_rcd) { + lnet_peer_ni_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lpni_rcd = rcd; } - - lnet_peer_ni_addref_locked(gateway); - rcd->rcd_gateway = gateway; - gateway->lpni_rcd = rcd; gateway->lpni_ping_notsent = 0; return rcd; +out_unlock: + lnet_net_unlock(gateway->lpni_cpt); + rc = LNetMDUnlink(mdh); + LASSERT(rc == 0); +out_ping_buffer_decref: + lnet_ping_buffer_decref(pbuf); out: - if (rcd != NULL) { - if (!LNetMDHandleIsInvalid(rcd->rcd_mdh)) { - rc = LNetMDUnlink(rcd->rcd_mdh); - LASSERT(rc == 0); - } + if (rcd && rcd != gateway->lpni_rcd) lnet_destroy_rc_data(rcd); - } - lnet_net_lock(gateway->lpni_cpt); return gateway->lpni_rcd; } @@ -1024,14 +1058,14 @@ static void lnet_ping_router_locked(struct lnet_peer_ni *rtr) { struct lnet_rc_data *rcd = NULL; - cfs_time_t now = cfs_time_current(); - int secs; - struct lnet_ni *ni; + time64_t now = ktime_get_seconds(); + time64_t secs; + struct lnet_ni *ni; lnet_peer_ni_addref_locked(rtr); if (rtr->lpni_ping_deadline != 0 && /* ping timed out? 
*/ - cfs_time_after(now, rtr->lpni_ping_deadline)) + now > rtr->lpni_ping_deadline) lnet_notify_locked(rtr, 1, 0, now); /* Run any outstanding notifications */ @@ -1039,30 +1073,36 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) lnet_ni_notify_locked(ni, rtr); if (!lnet_isrouter(rtr) || - the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { /* router table changed or router checker is shutting down */ lnet_peer_ni_decref_locked(rtr); return; } - rcd = rtr->lpni_rcd != NULL ? - rtr->lpni_rcd : lnet_create_rc_data_locked(rtr); + rcd = rtr->lpni_rcd; + /* + * The response to the router checker ping could've timed out and + * the mdh might've been invalidated, so we need to update it + * again. + */ + if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis || + LNetMDHandleIsInvalid(rcd->rcd_mdh)) + rcd = lnet_update_rc_data_locked(rtr); if (rcd == NULL) return; secs = lnet_router_check_interval(rtr); CDEBUG(D_NET, - "rtr %s %d: deadline %lu ping_notsent %d alive %d " - "alive_count %d lpni_ping_timestamp %lu\n", + "rtr %s %lld: deadline %lld ping_notsent %d alive %d " + "alive_count %d lpni_ping_timestamp %lld\n", libcfs_nid2str(rtr->lpni_nid), secs, rtr->lpni_ping_deadline, rtr->lpni_ping_notsent, rtr->lpni_alive, rtr->lpni_alive_count, rtr->lpni_ping_timestamp); if (secs != 0 && !rtr->lpni_ping_notsent && - cfs_time_after(now, cfs_time_add(rtr->lpni_ping_timestamp, - cfs_time_seconds(secs)))) { + now > rtr->lpni_ping_timestamp + secs) { int rc; struct lnet_process_id id; struct lnet_handle_md mdh; @@ -1077,14 +1117,14 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) mdh = rcd->rcd_mdh; if (rtr->lpni_ping_deadline == 0) { - rtr->lpni_ping_deadline = - cfs_time_shift(router_ping_timeout); + rtr->lpni_ping_deadline = ktime_get_seconds() + + router_ping_timeout; } lnet_net_unlock(rtr->lpni_cpt); rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); + LNET_PROTO_PING_MATCHBITS, 0, false); lnet_net_lock(rtr->lpni_cpt); if (rc != 0) @@ -1095,14 +1135,9 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr) return; } -int -lnet_router_checker_start(void) +int lnet_router_pre_mt_start(void) { - int rc; - int eqsz = 0; - struct task_struct *task; - - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + int rc; if (check_routers_before_use && dead_router_check_interval <= 0) { @@ -1112,60 +1147,36 @@ lnet_router_checker_start(void) return -EINVAL; } - sema_init(&the_lnet.ln_rc_signal, 0); - rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); if (rc != 0) { - CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + CERROR("Can't allocate EQ(0): %d\n", rc); return -ENOMEM; } - the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; - task = kthread_run(lnet_router_checker, NULL, "router_checker"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("Can't start router checker thread: %d\n", rc); - /* block until event callback signals exit */ - down(&the_lnet.ln_rc_signal); - rc = LNetEQFree(the_lnet.ln_rc_eqh); - LASSERT(rc == 0); - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - return -ENOMEM; - } + return 0; +} +void lnet_router_post_mt_start(void) +{ if (check_routers_before_use) { /* Note that a helpful side-effect of pinging all known routers * at startup is that it makes them drop stale connections they * may have to a previous instance of me. 
*/ lnet_wait_known_routerstate(); } - - return 0; } void -lnet_router_checker_stop (void) +lnet_router_cleanup(void) { int rc; - if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) - return; - - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; - /* wakeup the RC thread if it's sleeping */ - wake_up(&the_lnet.ln_rc_waitq); - - /* block until event callback signals exit */ - down(&the_lnet.ln_rc_signal); - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); - rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT(rc == 0); return; } -static void +void lnet_prune_rc_data(int wait_unlink) { struct lnet_rc_data *rcd; @@ -1174,7 +1185,7 @@ lnet_prune_rc_data(int wait_unlink) struct list_head head; int i = 2; - if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + if (likely(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING && list_empty(&the_lnet.ln_rcd_deathrow) && list_empty(&the_lnet.ln_rcd_zombie))) return; @@ -1183,7 +1194,7 @@ lnet_prune_rc_data(int wait_unlink) lnet_net_lock(LNET_LOCK_EX); - if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { /* router checker is stopping, prune all */ list_for_each_entry(lp, &the_lnet.ln_routers, lpni_rtr_list) { @@ -1247,18 +1258,13 @@ lnet_prune_rc_data(int wait_unlink) } /* - * This function is called to check if the RC should block indefinitely. - * It's called from lnet_router_checker() as well as being passed to - * wait_event_interruptible() to avoid the lost wake_up problem. - * - * When it's called from wait_event_interruptible() it is necessary to - * also not sleep if the rc state is not running to avoid a deadlock - * when the system is shutting down + * This function is called from the monitor thread to check if there are + * any active routers that need to be checked. 
*/ -static inline bool +inline bool lnet_router_checker_active(void) { - if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) return true; /* Router Checker thread needs to run when routing is enabled in @@ -1266,79 +1272,58 @@ lnet_router_checker_active(void) if (the_lnet.ln_routing) return true; + /* if there are routers that need to be cleaned up then do so */ + if (!list_empty(&the_lnet.ln_rcd_deathrow) || + !list_empty(&the_lnet.ln_rcd_zombie)) + return true; + return !list_empty(&the_lnet.ln_routers) && (live_router_check_interval > 0 || dead_router_check_interval > 0); } -static int -lnet_router_checker(void *arg) +void +lnet_check_routers(void) { struct lnet_peer_ni *rtr; struct list_head *entry; + __u64 version; + int cpt; + int cpt2; - cfs_block_allsigs(); - - while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { - __u64 version; - int cpt; - int cpt2; - - cpt = lnet_net_lock_current(); + cpt = lnet_net_lock_current(); rescan: - version = the_lnet.ln_routers_version; - - list_for_each(entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, struct lnet_peer_ni, - lpni_rtr_list); - - cpt2 = rtr->lpni_cpt; - if (cpt != cpt2) { - lnet_net_unlock(cpt); - cpt = cpt2; - lnet_net_lock(cpt); - /* the routers list has changed */ - if (version != the_lnet.ln_routers_version) - goto rescan; - } + version = the_lnet.ln_routers_version; - lnet_ping_router_locked(rtr); + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, struct lnet_peer_ni, + lpni_rtr_list); - /* NB dropped lock */ - if (version != the_lnet.ln_routers_version) { - /* the routers list has changed */ + cpt2 = rtr->lpni_cpt; + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) goto rescan; - } } - if (the_lnet.ln_routing) - lnet_update_ni_status_locked(); - - lnet_net_unlock(cpt); - - lnet_prune_rc_data(0); /* don't wait for UNLINK */ + lnet_ping_router_locked(rtr); - /* Call schedule_timeout() here always adds 1 to load average - * because kernel counts # active tasks as nr_running - * + nr_uninterruptible. */ - /* if there are any routes then wakeup every second. 
If - * there are no routes then sleep indefinitely until woken - * up by a user adding a route */ - if (!lnet_router_checker_active()) - wait_event_interruptible(the_lnet.ln_rc_waitq, - lnet_router_checker_active()); - else - wait_event_interruptible_timeout(the_lnet.ln_rc_waitq, - false, - cfs_time_seconds(1)); + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } } - lnet_prune_rc_data(1); /* wait for UNLINK */ + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - up(&the_lnet.ln_rc_signal); - /* The unlink event callback will signal final completion */ - return 0; + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ } void @@ -1741,7 +1726,8 @@ lnet_rtrpools_enable(void) lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_routing = 1; - the_lnet.ln_ping_info->pi_features &= ~LNET_PING_FEAT_RTE_DISABLED; + the_lnet.ln_ping_target->pb_info.pi_features &= + ~LNET_PING_FEAT_RTE_DISABLED; lnet_net_unlock(LNET_LOCK_EX); return rc; @@ -1755,7 +1741,8 @@ lnet_rtrpools_disable(void) lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_routing = 0; - the_lnet.ln_ping_info->pi_features |= LNET_PING_FEAT_RTE_DISABLED; + the_lnet.ln_ping_target->pb_info.pi_features |= + LNET_PING_FEAT_RTE_DISABLED; tiny_router_buffers = 0; small_router_buffers = 0; @@ -1765,10 +1752,10 @@ lnet_rtrpools_disable(void) } int -lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) +lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when) { struct lnet_peer_ni *lp = NULL; - cfs_time_t now = cfs_time_current(); + time64_t now = ktime_get_seconds(); int cpt = lnet_cpt_of_nid(nid, ni); LASSERT (!in_interrupt ()); @@ -1787,12 +1774,11 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) } /* can't do predictions... */ - if (cfs_time_after(when, now)) { + if (when > now) { CWARN("Ignoring prediction from %s of %s %s " - "%ld seconds in the future\n", + "%lld seconds in the future\n", (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), alive ? "up" : "down", - cfs_duration_sec(cfs_time_sub(when, now))); + libcfs_nid2str(nid), alive ? "up" : "down", when - now); return -EINVAL; } diff --git a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c index b7d513521b433..2e60609ee229d 100644 --- a/drivers/staging/lustrefsx/lnet/lnet/router_proc.c +++ b/drivers/staging/lustrefsx/lnet/lnet/router_proc.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * This file is part of Lustre, https://wiki.whamcloud.com/ * @@ -21,14 +21,15 @@ */ #define DEBUG_SUBSYSTEM S_LNET + +#include + #include #include /* This is really lnet_proc.c. You might need to update sanity test 215 * if any file format is changed. 
*/ -static struct ctl_table_header *lnet_table_header = NULL; - #define LNET_LOFFT_BITS (sizeof(loff_t) * 8) /* * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system @@ -81,6 +82,7 @@ static int __proc_lnet_stats(void *data, int write, { int rc; struct lnet_counters *ctrs; + struct lnet_counters_common common; int len; char *tmpstr; const int tmpsiz = 256; /* 7 %u and 4 __u64 */ @@ -103,16 +105,17 @@ static int __proc_lnet_stats(void *data, int write, } lnet_counters_get(ctrs); + common = ctrs->lct_common; len = snprintf(tmpstr, tmpsiz, "%u %u %u %u %u %u %u %llu %llu " "%llu %llu", - ctrs->msgs_alloc, ctrs->msgs_max, - ctrs->errors, - ctrs->send_count, ctrs->recv_count, - ctrs->route_count, ctrs->drop_count, - ctrs->send_length, ctrs->recv_length, - ctrs->route_length, ctrs->drop_length); + common.lcc_msgs_alloc, common.lcc_msgs_max, + common.lcc_errors, + common.lcc_send_count, common.lcc_recv_count, + common.lcc_route_count, common.lcc_drop_count, + common.lcc_send_length, common.lcc_recv_length, + common.lcc_route_length, common.lcc_drop_length); if (pos >= min_t(int, len, strlen(tmpstr))) rc = 0; @@ -244,14 +247,9 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { -#else - memcpy(buffer, tmpstr, len); - { -#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -335,15 +333,14 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lpni_nid; - cfs_time_t now = cfs_time_current(); - cfs_time_t deadline = peer->lpni_ping_deadline; + time64_t now = ktime_get_seconds(); + time64_t deadline = peer->lpni_ping_deadline; int nrefs = atomic_read(&peer->lpni_refcount); int nrtrrefs = peer->lpni_rtr_refcount; int alive_cnt = peer->lpni_alive_count; int alive = peer->lpni_alive; int pingsent = !peer->lpni_ping_notsent; - int last_ping = cfs_duration_sec(cfs_time_sub(now, - peer->lpni_ping_timestamp)); + time64_t last_ping = now - peer->lpni_ping_timestamp; int down_ni = 0; struct lnet_route *rtr; @@ -362,18 +359,18 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (deadline == 0) s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + "%-4d %7d %9d %6s %12llu %9d %8s %7d %s\n", nrefs, nrtrrefs, alive_cnt, alive ? "up" : "down", last_ping, pingsent, "NA", down_ni, libcfs_nid2str(nid)); else s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + "%-4d %7d %9d %6s %12llu %9d %8llu %7d %s\n", nrefs, nrtrrefs, alive_cnt, alive ? 
"up" : "down", last_ping, pingsent, - cfs_duration_sec(cfs_time_sub(deadline, now)), + deadline - now, down_ni, libcfs_nid2str(nid)); LASSERT(tmpstr + tmpsiz - s > 0); } @@ -386,14 +383,9 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else { -#else - memcpy(buffer, tmpstr, len); - { -#endif off += 1; *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); } @@ -531,7 +523,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lpni_nid; int nrefs = atomic_read(&peer->lpni_refcount); - int lastalive = -1; + time64_t lastalive = -1; char *aliveness = "NA"; int maxcr = (peer->lpni_net) ? peer->lpni_net->net_tunables.lct_peer_tx_credits : 0; @@ -546,11 +538,9 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, aliveness = peer->lpni_alive ? "up" : "down"; if (lnet_peer_aliveness_enabled(peer)) { - cfs_time_t now = cfs_time_current(); - cfs_duration_t delta; + time64_t now = ktime_get_seconds(); - delta = cfs_time_sub(now, peer->lpni_last_alive); - lastalive = cfs_duration_sec(delta); + lastalive = now - peer->lpni_last_alive; /* No need to mess up peers contents with * arbitrarily long integers - it suffices to @@ -563,7 +553,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, lnet_net_unlock(cpt); s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + "%-24s %4d %5s %5lld %5d %5d %5d %5d %5d %d\n", libcfs_nid2str(nid), nrefs, aliveness, lastalive, maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); @@ -587,13 +577,9 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else -#else - memcpy(buffer, tmpstr, len); -#endif *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); } @@ -741,12 +727,12 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, ni = lnet_get_ni_idx_locked(skip); if (ni != NULL) { - struct lnet_tx_queue *tq; - char *stat; - long now = cfs_time_current_sec(); - int last_alive = -1; - int i; - int j; + struct lnet_tx_queue *tq; + char *stat; + time64_t now = ktime_get_real_seconds(); + time64_t last_alive = -1; + int i; + int j; if (the_lnet.ln_routing) last_alive = now - ni->ni_last_alive; @@ -777,7 +763,7 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, lnet_net_lock(i); s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + "%-24s %6s %5lld %4d %4d %4d %5d %5d %5d\n", libcfs_nid2str(ni->ni_nid), stat, last_alive, *ni->ni_refs[i], ni->ni_net->net_tunables.lct_peer_tx_credits, @@ -798,14 +784,9 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer, if (len > *lenp) { /* linux-supplied buffer is too small */ rc = -EINVAL; } else if (len > 0) { /* wrote something */ - -#ifdef PROC_HANDLER_USE_USER_ATTR if (copy_to_user(buffer, tmpstr, len)) rc = -EFAULT; else -#else - memcpy(buffer, tmpstr, len); -#endif *ppos += 1; } @@ -974,34 +955,12 @@ static struct ctl_table lnet_table[] = { { .procname = NULL } }; -static struct ctl_table top_table[] = { - { - INIT_CTL_NAME - .procname = "lnet", - .mode = 0555, - 
.data = NULL, - .maxlen = 0, - .child = lnet_table, - }, - { .procname = NULL } -}; - -void -lnet_proc_init(void) +void lnet_router_debugfs_init(void) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header == NULL) - lnet_table_header = register_sysctl_table(top_table); -#endif + lnet_insert_debugfs(lnet_table); } -void -lnet_proc_fini(void) +void lnet_router_debugfs_fini(void) { -#ifdef CONFIG_SYSCTL - if (lnet_table_header != NULL) - unregister_sysctl_table(lnet_table_header); - - lnet_table_header = NULL; -#endif + lnet_remove_debugfs(lnet_table); } diff --git a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c index 512dbb5b8a2f1..a03f6078c0589 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/brw_test.c +++ b/drivers/staging/lustrefsx/lnet/selftest/brw_test.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,10 +49,10 @@ MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by d #define BRW_MSIZE sizeof(__u64) static void -brw_client_fini (sfw_test_instance_t *tsi) +brw_client_fini(struct sfw_test_instance *tsi) { - srpc_bulk_t *bulk; - sfw_test_unit_t *tsu; + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; LASSERT(tsi->tsi_is_client); @@ -67,22 +67,22 @@ brw_client_fini (sfw_test_instance_t *tsi) } static int -brw_client_init (sfw_test_instance_t *tsi) +brw_client_init(struct sfw_test_instance *tsi) { - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct sfw_session *sn = tsi->tsi_batch->bat_session; int flags; int off; int npg; int len; int opc; - srpc_bulk_t *bulk; - sfw_test_unit_t *tsu; + struct srpc_bulk *bulk; + struct sfw_test_unit *tsu; LASSERT(sn != NULL); LASSERT(tsi->tsi_is_client); if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { - test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; opc = breq->blk_opc; flags = breq->blk_flags; @@ -93,7 +93,7 @@ brw_client_init (sfw_test_instance_t *tsi) off = 0; } else { - test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; /* I should never get this step if it's unknown feature * because make_session will reject unknown feature */ @@ -137,7 +137,7 @@ brw_client_init (sfw_test_instance_t *tsi) #define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL #define BRW_MSIZE sizeof(__u64) -int brw_inject_one_error(void) +static int brw_inject_one_error(void) { struct timespec64 ts; @@ -228,7 +228,7 @@ brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) } static void -brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) { int i; struct page *pg; @@ -245,7 +245,7 @@ brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) } static int -brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) +brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) { int i; struct page *pg; @@ -268,25 +268,25 @@ brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic) } static int -brw_client_prep_rpc(sfw_test_unit_t *tsu, - struct lnet_process_id dest, srpc_client_rpc_t **rpcpp) +brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpcpp) { - srpc_bulk_t *bulk = tsu->tsu_private; - sfw_test_instance_t *tsi = 
tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; - srpc_client_rpc_t *rpc; - srpc_brw_reqst_t *req; - int flags; - int npg; - int len; - int opc; - int rc; + struct srpc_bulk *bulk = tsu->tsu_private; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_client_rpc *rpc; + struct srpc_brw_reqst *req; + int flags; + int npg; + int len; + int opc; + int rc; LASSERT(sn != NULL); LASSERT(bulk != NULL); if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { - test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; opc = breq->blk_opc; flags = breq->blk_flags; @@ -294,8 +294,8 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu, len = npg * PAGE_SIZE; } else { - test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; - int off; + struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; + int off; /* I should never get this step if it's unknown feature * because make_session will reject unknown feature */ @@ -312,7 +312,7 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu, if (rc != 0) return rc; - memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); + memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); if (opc == LST_BRW_WRITE) brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); else @@ -328,14 +328,14 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu, } static void -brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) { - __u64 magic = BRW_MAGIC; - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; - srpc_msg_t *msg = &rpc->crpc_replymsg; - srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; - srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + __u64 magic = BRW_MAGIC; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_msg *msg = &rpc->crpc_replymsg; + struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; LASSERT(sn != NULL); @@ -376,9 +376,9 @@ brw_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) } static void -brw_server_rpc_done(srpc_server_rpc_t *rpc) +brw_server_rpc_done(struct srpc_server_rpc *rpc) { - srpc_bulk_t *blk = rpc->srpc_bulk; + struct srpc_bulk *blk = rpc->srpc_bulk; if (blk == NULL) return; @@ -396,12 +396,12 @@ brw_server_rpc_done(srpc_server_rpc_t *rpc) } static int -brw_bulk_ready(srpc_server_rpc_t *rpc, int status) +brw_bulk_ready(struct srpc_server_rpc *rpc, int status) { - __u64 magic = BRW_MAGIC; - srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; - srpc_brw_reqst_t *reqst; - srpc_msg_t *reqstmsg; + __u64 magic = BRW_MAGIC; + struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + struct srpc_brw_reqst *reqst; + struct srpc_msg *reqstmsg; LASSERT (rpc->srpc_bulk != NULL); LASSERT (rpc->srpc_reqstbuf != NULL); @@ -434,13 +434,13 @@ brw_bulk_ready(srpc_server_rpc_t *rpc, int status) static int brw_server_handle(struct srpc_server_rpc *rpc) { - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - srpc_msg_t *replymsg = &rpc->srpc_replymsg; - srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; - srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; - int npg; - int rc; + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *replymsg = 
&rpc->srpc_replymsg; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; + struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; LASSERT (sv->sv_id == SRPC_SERVICE_BRW); @@ -505,7 +505,8 @@ brw_server_handle(struct srpc_server_rpc *rpc) return 0; } -sfw_test_client_ops_t brw_test_client; +struct sfw_test_client_ops brw_test_client; + void brw_init_test_client(void) { brw_test_client.tso_init = brw_client_init; @@ -514,10 +515,10 @@ void brw_init_test_client(void) brw_test_client.tso_done_rpc = brw_client_done_rpc; }; -srpc_service_t brw_test_service; +struct srpc_service brw_test_service; + void brw_init_test_service(void) { - brw_test_service.sv_id = SRPC_SERVICE_BRW; brw_test_service.sv_name = "brw_test"; brw_test_service.sv_handler = brw_server_handle; diff --git a/drivers/staging/lustrefsx/lnet/selftest/conctl.c b/drivers/staging/lustrefsx/lnet/selftest/conctl.c index 9e60d0d671df2..189435b4375f9 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conctl.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conctl.c @@ -43,27 +43,27 @@ static int lst_session_new_ioctl(struct lstio_session_new_args *args) { - char *name; - int rc; - - if (args->lstio_ses_idp == NULL || /* address for output sid */ - args->lstio_ses_key == 0 || /* no key is specified */ - args->lstio_ses_namep == NULL || /* session name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_ses_namep, args->lstio_ses_nmlen)) { - LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } - name[args->lstio_ses_nmlen] = 0; + name[args->lstio_ses_nmlen] = 0; rc = lstcon_session_new(name, args->lstio_ses_key, @@ -79,272 +79,272 @@ lst_session_new_ioctl(struct lstio_session_new_args *args) static int lst_session_end_ioctl(struct lstio_session_end_args *args) { - if (args->lstio_ses_key != console_session.ses_key) - return -EACCES; + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; - return lstcon_session_end(); + return lstcon_session_end(); } static int lst_session_info_ioctl(struct lstio_session_info_args *args) { - /* no checking of key */ - - if (args->lstio_ses_idp == NULL || /* address for ouput sid */ - args->lstio_ses_keyp == NULL || /* address for ouput key */ - args->lstio_ses_featp == NULL || /* address for ouput features */ - args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ - args->lstio_ses_namep == NULL || /* address for ouput name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_session_info(args->lstio_ses_idp, - args->lstio_ses_keyp, + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for 
output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, args->lstio_ses_featp, - args->lstio_ses_ndinfo, - args->lstio_ses_namep, - args->lstio_ses_nmlen); + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); } static int lst_debug_ioctl(struct lstio_debug_args *args) { - char *name = NULL; - int client = 1; - int rc; + char *name = NULL; + int client = 1; + int rc; - if (args->lstio_dbg_key != console_session.ses_key) - return -EACCES; + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; - if (args->lstio_dbg_resultp == NULL) - return -EINVAL; + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; - if (args->lstio_dbg_namep != NULL && /* name of batch/group */ - (args->lstio_dbg_nmlen <= 0 || - args->lstio_dbg_nmlen > LST_NAME_SIZE)) - return -EINVAL; + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; - if (args->lstio_dbg_namep != NULL) { - LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); - if (name == NULL) - return -ENOMEM; + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_dbg_namep, - args->lstio_dbg_nmlen)) { - LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); - return -EFAULT; - } + return -EFAULT; + } - name[args->lstio_dbg_nmlen] = 0; - } + name[args->lstio_dbg_nmlen] = 0; + } - rc = -EINVAL; + rc = -EINVAL; - switch (args->lstio_dbg_type) { - case LST_OPC_SESSION: - rc = lstcon_session_debug(args->lstio_dbg_timeout, - args->lstio_dbg_resultp); - break; + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; - case LST_OPC_BATCHSRV: - client = 0; - /* Fall through */ - case LST_OPC_BATCHCLI: - if (name == NULL) - goto out; + case LST_OPC_BATCHSRV: + client = 0; + fallthrough; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; - rc = lstcon_batch_debug(args->lstio_dbg_timeout, - name, client, args->lstio_dbg_resultp); - break; + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; - case LST_OPC_GROUP: - if (name == NULL) - goto out; + case LST_OPC_GROUP: + if (name == NULL) + goto out; - rc = lstcon_group_debug(args->lstio_dbg_timeout, - name, args->lstio_dbg_resultp); - break; + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; - case LST_OPC_NODES: - if (args->lstio_dbg_count <= 0 || - args->lstio_dbg_idsp == NULL) - goto out; + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; - rc = lstcon_nodes_debug(args->lstio_dbg_timeout, - args->lstio_dbg_count, - args->lstio_dbg_idsp, - args->lstio_dbg_resultp); - break; + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; - default: - break; - } + default: + break; + } out: - if (name != NULL) - LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + if (name != NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); - return rc; + return rc; } static int lst_group_add_ioctl(struct lstio_group_add_args *args) { - 
char *name; - int rc; + char *name; + int rc; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL|| + if (args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_add(name); + rc = lstcon_group_add(name); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return rc; + return rc; } static int lst_group_del_ioctl(struct lstio_group_del_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_del(name); + rc = lstcon_group_del(name); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return rc; + return rc; } static int lst_group_update_ioctl(struct lstio_group_update_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_resultp == NULL || - args->lstio_grp_namep == NULL || + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } - - name[args->lstio_grp_nmlen] = 0; - - switch (args->lstio_grp_opc) { - case LST_GROUP_CLEAN: - rc = lstcon_group_clean(name, args->lstio_grp_args); - break; - - case LST_GROUP_REFRESH: - rc = lstcon_group_refresh(name, args->lstio_grp_resultp); - break; - - case LST_GROUP_RMND: - if (args->lstio_grp_count <= 0 || - args->lstio_grp_idsp == NULL) { - rc = -EINVAL; - break; - } - rc = lstcon_nodes_remove(name, args->lstio_grp_count, - 
args->lstio_grp_idsp, - args->lstio_grp_resultp); - break; - - default: - rc = -EINVAL; - break; - } - - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - - return rc; + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; } static int lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) { - unsigned feats; - int rc; - char *name; + unsigned int feats; + int rc; + char *name; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_idsp == NULL || /* array of ids */ - args->lstio_grp_count <= 0 || - args->lstio_grp_resultp == NULL || + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || args->lstio_grp_featp == NULL || args->lstio_grp_namep == NULL || args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_nodes_add(name, args->lstio_grp_count, + rc = lstcon_nodes_add(name, args->lstio_grp_count, args->lstio_grp_idsp, &feats, args->lstio_grp_resultp); @@ -354,50 +354,50 @@ lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) return -EINVAL; } - return rc; + return rc; } static int lst_group_list_ioctl(struct lstio_group_list_args *args) { if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + return -EACCES; - if (args->lstio_grp_idx < 0 || - args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - return lstcon_group_list(args->lstio_grp_idx, - args->lstio_grp_nmlen, - args->lstio_grp_namep); + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); } static int lst_group_info_ioctl(struct lstio_group_info_args *args) { - char *name; - int ndent; - int index; - int rc; + char *name; + int ndent; + int index; + int rc; - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; - if (args->lstio_grp_namep == NULL || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_grp_namep == NULL || + 
args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_grp_entp == NULL && /* output: group entry */ - args->lstio_grp_dentsp == NULL) /* output: node entry */ - return -EINVAL; + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; - if (args->lstio_grp_dentsp != NULL) { /* have node entry */ - if (args->lstio_grp_idxp == NULL || /* node index */ - args->lstio_grp_ndentp == NULL) /* # of node entry */ - return -EINVAL; + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; if (copy_from_user(&ndent, args->lstio_grp_ndentp, sizeof(ndent)) || @@ -415,19 +415,19 @@ lst_group_info_ioctl(struct lstio_group_info_args *args) if (copy_from_user(name, args->lstio_grp_namep, args->lstio_grp_nmlen)) { - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } - name[args->lstio_grp_nmlen] = 0; + name[args->lstio_grp_nmlen] = 0; - rc = lstcon_group_info(name, args->lstio_grp_entp, - &index, &ndent, args->lstio_grp_dentsp); + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); - LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); if (rc != 0) - return rc; + return rc; if (args->lstio_grp_dentsp != NULL && (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || @@ -440,20 +440,20 @@ lst_group_info_ioctl(struct lstio_group_info_args *args) static int lst_batch_add_ioctl(struct lstio_batch_add_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -461,32 +461,32 @@ lst_batch_add_ioctl(struct lstio_batch_add_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_add(name); + rc = lstcon_batch_add(name); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_run_ioctl(struct lstio_batch_run_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, 
args->lstio_bat_nmlen)) { @@ -494,34 +494,34 @@ lst_batch_run_ioctl(struct lstio_batch_run_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_run(name, args->lstio_bat_timeout, - args->lstio_bat_resultp); + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) { - int rc; - char *name; + int rc; + char *name; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_resultp == NULL || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -529,37 +529,37 @@ lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_stop(name, args->lstio_bat_force, - args->lstio_bat_resultp); + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_query_ioctl(struct lstio_batch_query_args *args) { - char *name; - int rc; + char *name; + int rc; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_resultp == NULL || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_bat_testidx < 0) - return -EINVAL; + if (args->lstio_bat_testidx < 0) + return -EINVAL; - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { @@ -567,92 +567,92 @@ lst_batch_query_ioctl(struct lstio_batch_query_args *args) return -EFAULT; } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_test_batch_query(name, - args->lstio_bat_testidx, - args->lstio_bat_client, - args->lstio_bat_timeout, - args->lstio_bat_resultp); + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return rc; + return rc; } static int lst_batch_list_ioctl(struct lstio_batch_list_args *args) { - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (args->lstio_bat_idx < 0 || - args->lstio_bat_namep == NULL || - args->lstio_bat_nmlen <= 
0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_batch_list(args->lstio_bat_idx, - args->lstio_bat_nmlen, - args->lstio_bat_namep); + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); } static int lst_batch_info_ioctl(struct lstio_batch_info_args *args) { - char *name; - int rc; - int index; - int ndent; + char *name; + int rc; + int index; + int ndent; - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; - if (args->lstio_bat_namep == NULL || /* batch name */ - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; - if (args->lstio_bat_entp == NULL && /* output: batch entry */ - args->lstio_bat_dentsp == NULL) /* output: node entry */ - return -EINVAL; + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; - if (args->lstio_bat_dentsp != NULL) { /* have node entry */ - if (args->lstio_bat_idxp == NULL || /* node index */ - args->lstio_bat_ndentp == NULL) /* # of node entry */ - return -EINVAL; + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; if (copy_from_user(&index, args->lstio_bat_idxp, - sizeof(index)) || + sizeof(index)) || copy_from_user(&ndent, args->lstio_bat_ndentp, - sizeof(ndent))) - return -EFAULT; + sizeof(ndent))) + return -EFAULT; - if (ndent <= 0 || index < 0) - return -EINVAL; - } + if (ndent <= 0 || index < 0) + return -EINVAL; + } - LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); - if (name == NULL) - return -ENOMEM; + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; if (copy_from_user(name, args->lstio_bat_namep, args->lstio_bat_nmlen)) { - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - return -EFAULT; - } + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } - name[args->lstio_bat_nmlen] = 0; + name[args->lstio_bat_nmlen] = 0; - rc = lstcon_batch_info(name, - args->lstio_bat_entp, args->lstio_bat_server, - args->lstio_bat_testidx, &index, &ndent, - args->lstio_bat_dentsp); + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); - LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); - if (rc != 0) - return rc; + if (rc != 0) + return rc; if (args->lstio_bat_dentsp != NULL && (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || @@ -665,12 +665,12 @@ lst_batch_info_ioctl(struct lstio_batch_info_args *args) static int lst_stat_query_ioctl(struct lstio_stat_args *args) { - int rc; - char *name = NULL; + int rc; + char *name = NULL; - /* TODO: not finished */ - if (args->lstio_sta_key != console_session.ses_key) - return -EACCES; + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; if (args->lstio_sta_resultp == NULL) return -EINVAL; @@ 
-680,9 +680,9 @@ lst_stat_query_ioctl(struct lstio_stat_args *args) return -EINVAL; rc = lstcon_nodes_stat(args->lstio_sta_count, - args->lstio_sta_idsp, - args->lstio_sta_timeout, - args->lstio_sta_resultp); + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); } else if (args->lstio_sta_namep != NULL) { if (args->lstio_sta_nmlen <= 0 || args->lstio_sta_nmlen > LST_NAME_SIZE) @@ -711,12 +711,12 @@ lst_stat_query_ioctl(struct lstio_stat_args *args) static int lst_test_add_ioctl(struct lstio_test_args *args) { - char *batch_name; - char *src_name = NULL; - char *dst_name = NULL; - void *param = NULL; - int ret = 0; - int rc = -ENOMEM; + char *batch_name; + char *src_name = NULL; + char *dst_name = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; if (args->lstio_tes_resultp == NULL || args->lstio_tes_retp == NULL || @@ -737,12 +737,12 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) args->lstio_tes_span <= 0) return -EINVAL; - /* have parameter, check if parameter length is valid */ - if (args->lstio_tes_param != NULL && - (args->lstio_tes_param_len <= 0 || + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || args->lstio_tes_param_len > - PAGE_SIZE - sizeof(lstcon_test_t))) - return -EINVAL; + PAGE_SIZE - sizeof(struct lstcon_test))) + return -EINVAL; LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1); if (batch_name == NULL) @@ -777,17 +777,17 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) goto out; rc = lstcon_test_add(batch_name, - args->lstio_tes_type, - args->lstio_tes_loop, - args->lstio_tes_concur, - args->lstio_tes_dist, args->lstio_tes_span, - src_name, dst_name, param, - args->lstio_tes_param_len, - &ret, args->lstio_tes_resultp); - - if (ret != 0) + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + src_name, dst_name, param, + args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) rc = (copy_to_user(args->lstio_tes_retp, &ret, - sizeof(ret))) ? -EFAULT : 0; + sizeof(ret))) ? 
-EFAULT : 0; out: if (batch_name != NULL) LIBCFS_FREE(batch_name, args->lstio_tes_bat_nmlen + 1); @@ -805,36 +805,40 @@ static int lst_test_add_ioctl(struct lstio_test_args *args) } int -lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) +lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata) { - char *buf; + struct libcfs_ioctl_hdr *hdr = vdata; struct libcfs_ioctl_data *data; - int opc; - int rc; + char *buf = NULL; + int rc = -EINVAL; + int opc; if (cmd != IOC_LIBCFS_LNETST) - return -EINVAL; + goto err; data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); opc = data->ioc_u32[0]; if (data->ioc_plen1 > PAGE_SIZE) - return -EINVAL; + goto err; LIBCFS_ALLOC(buf, data->ioc_plen1); - if (buf == NULL) - return -ENOMEM; + if (buf == NULL) { + rc = -ENOMEM; + goto err; + } /* copy in parameter */ if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { - LIBCFS_FREE(buf, data->ioc_plen1); - return -EFAULT; + rc = -EFAULT; + goto out_free_buf; } mutex_lock(&console_session.ses_mutex); - console_session.ses_laststamp = cfs_time_current_sec(); + console_session.ses_laststamp = ktime_get_real_seconds(); if (console_session.ses_shutdown) { rc = -ESHUTDOWN; @@ -851,7 +855,8 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) goto out; } - memset(&console_session.ses_trans_stat, 0, sizeof(struct lstcon_trans_stat)); + memset(&console_session.ses_trans_stat, 0, + sizeof(struct lstcon_trans_stat)); switch (opc) { case LSTIO_SESSION_NEW: @@ -910,6 +915,7 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) break; default: rc = -EINVAL; + goto out; } if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, @@ -917,8 +923,8 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr) rc = -EFAULT; out: mutex_unlock(&console_session.ses_mutex); - +out_free_buf: LIBCFS_FREE(buf, data->ioc_plen1); - - return rc; +err: + return notifier_from_ioctl_errno(rc); } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c index a1ef9ada96804..b39756f724a2a 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.c @@ -43,13 +43,13 @@ #include "conrpc.h" #include "console.h" -void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, - lstcon_node_t *, struct lstcon_trans_stat *); +void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *, + struct lstcon_node *, struct lstcon_trans_stat *); static void -lstcon_rpc_done(srpc_client_rpc_t *rpc) +lstcon_rpc_done(struct srpc_client_rpc *rpc) { - lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; + struct lstcon_rpc *crpc = rpc->crpc_priv; LASSERT(crpc != NULL && rpc == crpc->crp_rpc); LASSERT(crpc->crp_posted && !crpc->crp_finished); @@ -69,11 +69,11 @@ lstcon_rpc_done(srpc_client_rpc_t *rpc) /* not an orphan RPC */ crpc->crp_finished = 1; - if (crpc->crp_stamp == 0) { + if (crpc->crp_stamp_ns == 0) { /* not aborted */ - LASSERT (crpc->crp_status == 0); + LASSERT(crpc->crp_status == 0); - crpc->crp_stamp = cfs_time_current(); + crpc->crp_stamp_ns = ktime_get_ns(); crpc->crp_status = rpc->crpc_status; } @@ -85,22 +85,19 @@ lstcon_rpc_done(srpc_client_rpc_t *rpc) } static int -lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, - int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) +lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, int embedded, + struct 
lstcon_rpc *crpc) { + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, feats, bulk_npg, bulk_len, lstcon_rpc_done, (void *)crpc); if (crpc->crp_rpc == NULL) return -ENOMEM; - crpc->crp_trans = NULL; crpc->crp_node = nd; - crpc->crp_posted = 0; - crpc->crp_finished = 0; - crpc->crp_unpacked = 0; - crpc->crp_status = 0; - crpc->crp_stamp = 0; crpc->crp_embedded = embedded; INIT_LIST_HEAD(&crpc->crp_link); @@ -110,17 +107,17 @@ lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, } static int -lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, - int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) +lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, + int bulk_npg, int bulk_len, struct lstcon_rpc **crpcpp) { - lstcon_rpc_t *crpc = NULL; - int rc; + struct lstcon_rpc *crpc = NULL; + int rc; spin_lock(&console_session.ses_rpc_lock); if (!list_empty(&console_session.ses_rpc_freelist)) { crpc = list_entry(console_session.ses_rpc_freelist.next, - lstcon_rpc_t, crp_link); + struct lstcon_rpc, crp_link); list_del_init(&crpc->crp_link); } @@ -144,10 +141,10 @@ lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, } void -lstcon_rpc_put(lstcon_rpc_t *crpc) +lstcon_rpc_put(struct lstcon_rpc *crpc) { - srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; - int i; + struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk; + int i; LASSERT(list_empty(&crpc->crp_link)); @@ -179,9 +176,9 @@ lstcon_rpc_put(lstcon_rpc_t *crpc) } static void -lstcon_rpc_post(lstcon_rpc_t *crpc) +lstcon_rpc_post(struct lstcon_rpc *crpc) { - lstcon_rpc_trans_t *trans = crpc->crp_trans; + struct lstcon_rpc_trans *trans = crpc->crp_trans; LASSERT (trans != NULL); @@ -232,9 +229,9 @@ lstcon_rpc_trans_name(int transop) int lstcon_rpc_trans_prep(struct list_head *translist, int transop, - lstcon_rpc_trans_t **transpp) + struct lstcon_rpc_trans **transpp) { - lstcon_rpc_trans_t *trans; + struct lstcon_rpc_trans *trans; if (translist != NULL) { list_for_each_entry(trans, translist, tas_link) { @@ -272,18 +269,18 @@ lstcon_rpc_trans_prep(struct list_head *translist, int transop, } void -lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) +lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) { list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); crpc->crp_trans = trans; } void -lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) +lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) { - srpc_client_rpc_t *rpc; - lstcon_rpc_t *crpc; - lstcon_node_t *nd; + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_node *nd; list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { rpc = crpc->crp_rpc; @@ -291,16 +288,16 @@ lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) spin_lock(&rpc->crpc_lock); if (!crpc->crp_posted || /* not posted */ - crpc->crp_stamp != 0) { /* rpc done or aborted already */ - if (crpc->crp_stamp == 0) { - crpc->crp_stamp = cfs_time_current(); + crpc->crp_stamp_ns != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp_ns == 0) { + crpc->crp_stamp_ns = ktime_get_ns(); crpc->crp_status = -EINTR; } spin_unlock(&rpc->crpc_lock); continue; } - crpc->crp_stamp = cfs_time_current(); + crpc->crp_stamp_ns = ktime_get_ns(); crpc->crp_status = error; spin_unlock(&rpc->crpc_lock); @@ -311,16 +308,16 @@ lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) continue; nd = crpc->crp_node; - if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + 
if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) continue; - nd->nd_stamp = crpc->crp_stamp; + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); nd->nd_state = LST_NODE_DOWN; } } static int -lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) +lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) { if (console_session.ses_shutdown && !list_empty(&trans->tas_olink)) /* Not an end session RPC */ @@ -330,10 +327,10 @@ lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) } int -lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) +lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) { - lstcon_rpc_t *crpc; - int rc; + struct lstcon_rpc *crpc; + int rc; if (list_empty(&trans->tas_rpcs_list)) return 0; @@ -381,14 +378,14 @@ lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) } static int -lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) +lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) { - lstcon_node_t *nd = crpc->crp_node; - srpc_client_rpc_t *rpc = crpc->crp_rpc; - srpc_generic_reply_t *rep; + struct lstcon_node *nd = crpc->crp_node; + struct srpc_client_rpc *rpc = crpc->crp_rpc; + struct srpc_generic_reply *rep; - LASSERT (nd != NULL && rpc != NULL); - LASSERT (crpc->crp_stamp != 0); + LASSERT(nd != NULL && rpc != NULL); + LASSERT(crpc->crp_stamp_ns != 0); if (crpc->crp_status != 0) { *msgpp = NULL; @@ -401,11 +398,11 @@ lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) crpc->crp_unpacked = 1; } - if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) - return 0; + if (ktime_to_ns(nd->nd_stamp) > crpc->crp_stamp_ns) + return 0; - nd->nd_stamp = crpc->crp_stamp; - rep = &(*msgpp)->msg_body.reply; + nd->nd_stamp = ktime_set(0, crpc->crp_stamp_ns); + rep = &(*msgpp)->msg_body.reply; if (rep->sid.ses_nid == LNET_NID_ANY) nd->nd_state = LST_NODE_UNKNOWN; @@ -418,11 +415,12 @@ lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) } void -lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) +lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, + struct lstcon_trans_stat *stat) { - lstcon_rpc_t *crpc; - srpc_msg_t *rep; - int error; + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + int error; LASSERT(stat != NULL); @@ -431,7 +429,7 @@ lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { lstcon_rpc_stat_total(stat, 1); - LASSERT(crpc->crp_stamp != 0); + LASSERT(crpc->crp_stamp_ns != 0); error = lstcon_rpc_get_reply(crpc, &rep); if (error != 0) { @@ -464,20 +462,20 @@ lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, struct lstcon_trans_stat *stat) } int -lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, +lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, struct list_head __user *head_up, lstcon_rpc_readent_func_t readent) { - struct list_head tmp; - struct list_head __user *next; - struct lstcon_rpc_ent *ent; - srpc_generic_reply_t *rep; - lstcon_rpc_t *crpc; - srpc_msg_t *msg; - lstcon_node_t *nd; - cfs_duration_t dur; + struct list_head tmp; + struct list_head __user *next; + struct lstcon_rpc_ent *ent; + struct srpc_generic_reply *rep; + struct lstcon_rpc *crpc; + struct srpc_msg *msg; + struct lstcon_node *nd; struct timespec64 ts; - int error; + int error; + s64 dur; LASSERT(head_up != NULL); @@ -495,15 +493,15 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); - LASSERT(crpc->crp_stamp != 0); + 
LASSERT(crpc->crp_stamp_ns != 0); - error = lstcon_rpc_get_reply(crpc, &msg); + error = lstcon_rpc_get_reply(crpc, &msg); - nd = crpc->crp_node; + nd = crpc->crp_node; - dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, - (cfs_time_t)console_session.ses_id.ses_stamp); - jiffies_to_timespec64(dur, &ts); + dur = crpc->crp_stamp_ns - + console_session.ses_id.ses_stamp * NSEC_PER_MSEC; + ts = ns_to_timespec64(dur); if (copy_to_user(&ent->rpe_peer, &nd->nd_id, sizeof(struct lnet_process_id)) || @@ -518,7 +516,7 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, continue; /* RPC is done */ - rep = (srpc_generic_reply_t *)&msg->msg_body.reply; + rep = (struct srpc_generic_reply *)&msg->msg_body.reply; if (copy_to_user(&ent->rpe_sid, &rep->sid, sizeof(rep->sid)) || @@ -538,12 +536,12 @@ lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, } void -lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) +lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) { - srpc_client_rpc_t *rpc; - lstcon_rpc_t *crpc; - lstcon_rpc_t *tmp; - int count = 0; + struct srpc_client_rpc *rpc; + struct lstcon_rpc *crpc; + struct lstcon_rpc *tmp; + int count = 0; list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { rpc = crpc->crp_rpc; @@ -592,12 +590,12 @@ lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) } int -lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, - unsigned feats, lstcon_rpc_t **crpc) +lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned int feats, struct lstcon_rpc **crpc) { - srpc_mksn_reqst_t *msrq; - srpc_rmsn_reqst_t *rsrq; - int rc; + struct srpc_mksn_reqst *msrq; + struct srpc_rmsn_reqst *rsrq; + int rc; switch (transop) { case LST_TRANS_SESNEW: @@ -631,10 +629,11 @@ lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, } int -lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) { - srpc_debug_reqst_t *drq; - int rc; + struct srpc_debug_reqst *drq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); if (rc != 0) @@ -649,12 +648,12 @@ lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) } int -lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, - lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) +lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) { - lstcon_batch_t *batch; - srpc_batch_reqst_t *brq; - int rc; + struct lstcon_batch *batch; + struct srpc_batch_reqst *brq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); if (rc != 0) @@ -675,17 +674,18 @@ lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, LASSERT (tsb->tsb_index == 0); - batch = (lstcon_batch_t *)tsb; + batch = (struct lstcon_batch *)tsb; brq->bar_arg = batch->bat_arg; return 0; } int -lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, + struct lstcon_rpc **crpc) { - srpc_stat_reqst_t *srq; - int rc; + struct srpc_stat_reqst *srq; + int rc; rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); if (rc != 0) @@ -715,15 +715,15 @@ lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) } static int -lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, +lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, int dist, int span, int nkiov, lnet_kiov_t *kiov) { struct lnet_process_id_packed *pid; - 
lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - int start; - int end; - int i = 0; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int start; + int end; + int i = 0; LASSERT (dist >= 1); LASSERT (span >= 1); @@ -769,9 +769,10 @@ lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, } static int -lstcon_pingrpc_prep(struct lst_test_ping_param *param, srpc_test_reqst_t *req) +lstcon_pingrpc_prep(struct lst_test_ping_param *param, + struct srpc_test_reqst *req) { - test_ping_req_t *prq = &req->tsr_u.ping; + struct test_ping_req *prq = &req->tsr_u.ping; prq->png_size = param->png_size; prq->png_flags = param->png_flags; @@ -780,9 +781,10 @@ lstcon_pingrpc_prep(struct lst_test_ping_param *param, srpc_test_reqst_t *req) } static int -lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, srpc_test_reqst_t *req) +lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, + struct srpc_test_reqst *req) { - test_bulk_req_t *brq = &req->tsr_u.bulk_v0; + struct test_bulk_req *brq = &req->tsr_u.bulk_v0; brq->blk_opc = param->blk_opc; brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) / @@ -794,9 +796,9 @@ lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, srpc_test_reqst_t *req static int lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, - srpc_test_reqst_t *req) + struct srpc_test_reqst *req) { - test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; + struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; brq->blk_opc = param->blk_opc; brq->blk_flags = param->blk_flags; @@ -807,17 +809,17 @@ lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, } int -lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, - lstcon_test_t *test, lstcon_rpc_t **crpc) +lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, + struct lstcon_test *test, struct lstcon_rpc **crpc) { - lstcon_group_t *sgrp = test->tes_src_grp; - lstcon_group_t *dgrp = test->tes_dst_grp; - srpc_test_reqst_t *trq; - srpc_bulk_t *bulk; - int i; - int npg = 0; - int nob = 0; - int rc = 0; + struct lstcon_group *sgrp = test->tes_src_grp; + struct lstcon_group *dgrp = test->tes_dst_grp; + struct srpc_test_reqst *trq; + struct srpc_bulk *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; if (transop == LST_TRANS_TSBCLIADD) { npg = sfw_id_pages(test->tes_span); @@ -915,11 +917,11 @@ lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, } static int -lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, - lstcon_node_t *nd, srpc_msg_t *reply) +lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, + struct lstcon_node *nd, struct srpc_msg *reply) { - srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; - int status = mksn_rep->mksn_status; + struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; if (status == 0 && (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { @@ -962,15 +964,15 @@ lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, } void -lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, - lstcon_node_t *nd, struct lstcon_trans_stat *stat) +lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, + struct lstcon_node *nd, struct lstcon_trans_stat *stat) { - srpc_rmsn_reply_t *rmsn_rep; - srpc_debug_reply_t *dbg_rep; - srpc_batch_reply_t *bat_rep; - srpc_test_reply_t *test_rep; - srpc_stat_reply_t *stat_rep; - int rc = 0; + struct srpc_rmsn_reply *rmsn_rep; + struct srpc_debug_reply *dbg_rep; + struct srpc_batch_reply *bat_rep; + struct srpc_test_reply *test_rep; 
+ struct srpc_stat_reply *stat_rep; + int rc = 0; switch (trans->tas_opc) { case LST_TRANS_SESNEW: @@ -1085,14 +1087,14 @@ int lstcon_rpc_trans_ndlist(struct list_head *ndlist, struct list_head *translist, int transop, void *arg, lstcon_rpc_cond_func_t condition, - lstcon_rpc_trans_t **transpp) + struct lstcon_rpc_trans **transpp) { - lstcon_rpc_trans_t *trans; - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - lstcon_rpc_t *rpc; - unsigned feats; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + struct lstcon_rpc *rpc; + unsigned int feats; + int rc; /* Creating session RPG for list of nodes */ @@ -1130,14 +1132,16 @@ lstcon_rpc_trans_ndlist(struct list_head *ndlist, case LST_TRANS_TSBCLIADD: case LST_TRANS_TSBSRVADD: rc = lstcon_testrpc_prep(nd, transop, feats, - (lstcon_test_t *)arg, &rpc); + (struct lstcon_test *)arg, + &rpc); break; case LST_TRANS_TSBRUN: case LST_TRANS_TSBSTOP: case LST_TRANS_TSBCLIQRY: case LST_TRANS_TSBSRVQRY: rc = lstcon_batrpc_prep(nd, transop, feats, - (lstcon_tsb_hdr_t *)arg, &rpc); + (struct lstcon_tsb_hdr *)arg, + &rpc); break; case LST_TRANS_STATQRY: rc = lstcon_statrpc_prep(nd, feats, &rpc); @@ -1169,16 +1173,16 @@ lstcon_rpc_trans_ndlist(struct list_head *ndlist, static void lstcon_rpc_pinger(void *arg) { - stt_timer_t *ptimer = (stt_timer_t *)arg; - lstcon_rpc_trans_t *trans; - lstcon_rpc_t *crpc; - srpc_msg_t *rep; - srpc_debug_reqst_t *drq; - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; + struct stt_timer *ptimer = arg; + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct srpc_msg *rep; + struct srpc_debug_reqst *drq; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; int intv; - int count = 0; - int rc; + int count = 0; + int rc; /* RPC pinger is a special case of transaction, * it's called by timer at 8 seconds interval. 
@@ -1191,8 +1195,8 @@ lstcon_rpc_pinger(void *arg) } if (!console_session.ses_expired && - cfs_time_current_sec() - console_session.ses_laststamp > - (time_t)console_session.ses_timeout) + ktime_get_real_seconds() - console_session.ses_laststamp > + (time64_t)console_session.ses_timeout) console_session.ses_expired = 1; trans = console_session.ses_ping; @@ -1245,12 +1249,13 @@ lstcon_rpc_pinger(void *arg) lstcon_rpc_put(crpc); } - if (nd->nd_state != LST_NODE_ACTIVE) - continue; + if (nd->nd_state != LST_NODE_ACTIVE) + continue; - intv = cfs_duration_sec(jiffies - nd->nd_stamp); + intv = div_u64(ktime_ms_delta(ktime_get(), nd->nd_stamp), + MSEC_PER_SEC); if (intv < nd->nd_timeout / 2) - continue; + continue; rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, trans->tas_features, 0, 0, 1, crpc); @@ -1277,7 +1282,7 @@ lstcon_rpc_pinger(void *arg) CDEBUG(D_NET, "Ping %d nodes in session\n", count); - ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; stt_add_timer(ptimer); mutex_unlock(&console_session.ses_mutex); @@ -1286,8 +1291,8 @@ lstcon_rpc_pinger(void *arg) int lstcon_rpc_pinger_start(void) { - stt_timer_t *ptimer; - int rc; + struct stt_timer *ptimer; + int rc; LASSERT(list_empty(&console_session.ses_rpc_freelist)); LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0); @@ -1300,7 +1305,7 @@ lstcon_rpc_pinger_start(void) } ptimer = &console_session.ses_ping_timer; - ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; stt_add_timer(ptimer); @@ -1326,10 +1331,10 @@ lstcon_rpc_pinger_stop(void) void lstcon_rpc_cleanup_wait(void) { - lstcon_rpc_trans_t *trans; - lstcon_rpc_t *crpc; - struct list_head *pacer; - struct list_head zlist; + struct lstcon_rpc_trans *trans; + struct lstcon_rpc *crpc; + struct list_head *pacer; + struct list_head zlist; /* Called with hold of global mutex */ @@ -1337,7 +1342,7 @@ lstcon_rpc_cleanup_wait(void) while (!list_empty(&console_session.ses_trans_list)) { list_for_each(pacer, &console_session.ses_trans_list) { - trans = list_entry(pacer, lstcon_rpc_trans_t, + trans = list_entry(pacer, struct lstcon_rpc_trans, tas_link); CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", @@ -1370,10 +1375,10 @@ lstcon_rpc_cleanup_wait(void) spin_unlock(&console_session.ses_rpc_lock); while (!list_empty(&zlist)) { - crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); + crpc = list_entry(zlist.next, struct lstcon_rpc, crp_link); list_del(&crpc->crp_link); - LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); + LIBCFS_FREE(crpc, sizeof(*crpc)); } } diff --git a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h index fd56e648491ce..51d4ee90e07cc 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/conrpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/conrpc.h @@ -40,7 +40,6 @@ #define __LST_CONRPC_H__ #include -#include #include #include "rpc.h" #include "selftest.h" @@ -58,9 +57,9 @@ struct lstcon_tsb_hdr; struct lstcon_test; struct lstcon_node; -typedef struct lstcon_rpc { +struct lstcon_rpc { struct list_head crp_link; /* chain on rpc transaction */ - srpc_client_rpc_t *crp_rpc; /* client rpc */ + struct srpc_client_rpc *crp_rpc; /* client rpc */ struct lstcon_node *crp_node; /* destination node */ struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ @@ -70,10 +69,10 @@ typedef struct lstcon_rpc { /** RPC is embedded in 
other structure and can't free it */ unsigned int crp_embedded:1; int crp_status; /* console rpc errors */ - cfs_time_t crp_stamp; /* replied time stamp */ -} lstcon_rpc_t; + s64 crp_stamp_ns; /* replied time stamp */ +}; -typedef struct lstcon_rpc_trans { +struct lstcon_rpc_trans { /* link chain on owner list */ struct list_head tas_olink; /* link chain on global list */ @@ -87,7 +86,7 @@ typedef struct lstcon_rpc_trans { wait_queue_head_t tas_waitq; /* wait queue head */ atomic_t tas_remaining; /* # of un-scheduled rpcs */ struct list_head tas_rpcs_list; /* queued requests */ -} lstcon_rpc_trans_t; +}; #define LST_TRANS_PRIVATE 0x1000 @@ -105,36 +104,37 @@ typedef struct lstcon_rpc_trans { #define LST_TRANS_STATQRY 0x21 -typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); -typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, +typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, struct lstcon_rpc_ent __user *); int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, - unsigned version, lstcon_rpc_t **crpc); + unsigned int version, struct lstcon_rpc **crpc); int lstcon_dbgrpc_prep(struct lstcon_node *nd, - unsigned version, lstcon_rpc_t **crpc); + unsigned int version, struct lstcon_rpc **crpc); int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, - struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); + struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc); int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, - struct lstcon_test *test, lstcon_rpc_t **crpc); + struct lstcon_test *test, struct lstcon_rpc **crpc); int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, - lstcon_rpc_t **crpc); -void lstcon_rpc_put(lstcon_rpc_t *crpc); + struct lstcon_rpc **crpc); +void lstcon_rpc_put(struct lstcon_rpc *crpc); int lstcon_rpc_trans_prep(struct list_head *translist, - int transop, lstcon_rpc_trans_t **transpp); + int transop, struct lstcon_rpc_trans **transpp); int lstcon_rpc_trans_ndlist(struct list_head *ndlist, struct list_head *translist, int transop, void *arg, lstcon_rpc_cond_func_t condition, - lstcon_rpc_trans_t **transpp); -void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, + struct lstcon_rpc_trans **transpp); +void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, struct lstcon_trans_stat *stat); -int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, +int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, struct list_head __user *head_up, lstcon_rpc_readent_func_t readent); -void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); -void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); -void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); -int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); +void lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error); +void lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans); +void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, + struct lstcon_rpc *req); +int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout); int lstcon_rpc_pinger_start(void); void lstcon_rpc_pinger_stop(void); void lstcon_rpc_cleanup_wait(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.c b/drivers/staging/lustrefsx/lnet/selftest/console.c index a9fe8a85a2dd1..1e37454732cd1 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.c +++ 
b/drivers/staging/lustrefsx/lnet/selftest/console.c @@ -36,7 +36,6 @@ * Author: Liang Zhen */ - #include #include #include "console.h" @@ -55,10 +54,10 @@ do { \ (p)->nle_nnode ++; \ } while (0) -lstcon_session_t console_session; +struct lstcon_session console_session; static void -lstcon_node_get(lstcon_node_t *nd) +lstcon_node_get(struct lstcon_node *nd) { LASSERT (nd->nd_ref >= 1); @@ -66,10 +65,11 @@ lstcon_node_get(lstcon_node_t *nd) } static int -lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) +lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, + int create) { - lstcon_ndlink_t *ndl; - unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + struct lstcon_ndlink *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; LASSERT(id.nid != LNET_NID_ANY); @@ -87,20 +87,20 @@ lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) if (!create) return -ENOENT; - LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); - if (*ndpp == NULL) - return -ENOMEM; + LIBCFS_ALLOC(*ndpp, sizeof(**ndpp) + sizeof(*ndl)); + if (*ndpp == NULL) + return -ENOMEM; - ndl = (lstcon_ndlink_t *)(*ndpp + 1); + ndl = (struct lstcon_ndlink *)(*ndpp + 1); - ndl->ndl_node = *ndpp; + ndl->ndl_node = *ndpp; - ndl->ndl_node->nd_ref = 1; - ndl->ndl_node->nd_id = id; - ndl->ndl_node->nd_stamp = cfs_time_current(); - ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; - ndl->ndl_node->nd_timeout = 0; - memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = ktime_get(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(ndl->ndl_node->nd_ping)); /* queued in global hash & list, no refcount is taken by * global hash & list, if caller release his refcount, @@ -112,16 +112,16 @@ lstcon_node_find(struct lnet_process_id id, lstcon_node_t **ndpp, int create) } static void -lstcon_node_put(lstcon_node_t *nd) +lstcon_node_put(struct lstcon_node *nd) { - lstcon_ndlink_t *ndl; + struct lstcon_ndlink *ndl; LASSERT(nd->nd_ref > 0); if (--nd->nd_ref > 0) return; - ndl = (lstcon_ndlink_t *)(nd + 1); + ndl = (struct lstcon_ndlink *)(nd + 1); LASSERT(!list_empty(&ndl->ndl_link)); LASSERT(!list_empty(&ndl->ndl_hlink)); @@ -130,17 +130,17 @@ lstcon_node_put(lstcon_node_t *nd) list_del(&ndl->ndl_link); list_del(&ndl->ndl_hlink); - LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + LIBCFS_FREE(nd, sizeof(*nd) + sizeof(*ndl)); } static int lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, - lstcon_ndlink_t **ndlpp, int create) + struct lstcon_ndlink **ndlpp, int create) { - unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - int rc; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int rc; if (id.nid == LNET_NID_ANY) return -EINVAL; @@ -163,7 +163,7 @@ lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, if (rc != 0) return rc; - LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); + LIBCFS_ALLOC(ndl, sizeof(*ndl)); if (ndl == NULL) { lstcon_node_put(nd); return -ENOMEM; @@ -179,7 +179,7 @@ lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, } static void -lstcon_ndlink_release(lstcon_ndlink_t *ndl) +lstcon_ndlink_release(struct lstcon_ndlink *ndl) { LASSERT(list_empty(&ndl->ndl_link)); 
LASSERT(!list_empty(&ndl->ndl_hlink)); @@ -191,12 +191,12 @@ lstcon_ndlink_release(lstcon_ndlink_t *ndl) } static int -lstcon_group_alloc(char *name, lstcon_group_t **grpp) +lstcon_group_alloc(char *name, struct lstcon_group **grpp) { - lstcon_group_t *grp; - int i; + struct lstcon_group *grp; + int i; - LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, + LIBCFS_ALLOC(grp, offsetof(struct lstcon_group, grp_ndl_hash[LST_NODE_HASHSIZE])); if (grp == NULL) return -ENOMEM; @@ -204,7 +204,7 @@ lstcon_group_alloc(char *name, lstcon_group_t **grpp) grp->grp_ref = 1; if (name != NULL) { if (strlen(name) > sizeof(grp->grp_name)-1) { - LIBCFS_FREE(grp, offsetof(lstcon_group_t, + LIBCFS_FREE(grp, offsetof(struct lstcon_group, grp_ndl_hash[LST_NODE_HASHSIZE])); return -E2BIG; } @@ -224,18 +224,19 @@ lstcon_group_alloc(char *name, lstcon_group_t **grpp) } static void -lstcon_group_addref(lstcon_group_t *grp) +lstcon_group_addref(struct lstcon_group *grp) { grp->grp_ref++; } -static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); +static void lstcon_group_ndlink_release(struct lstcon_group *, + struct lstcon_ndlink *); static void -lstcon_group_drain(lstcon_group_t *grp, int keep) +lstcon_group_drain(struct lstcon_group *grp, int keep) { - lstcon_ndlink_t *ndl; - lstcon_ndlink_t *tmp; + struct lstcon_ndlink *ndl; + struct lstcon_ndlink *tmp; list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { if ((ndl->ndl_node->nd_state & keep) == 0) @@ -244,7 +245,7 @@ lstcon_group_drain(lstcon_group_t *grp, int keep) } static void -lstcon_group_decref(lstcon_group_t *grp) +lstcon_group_decref(struct lstcon_group *grp) { int i; @@ -259,14 +260,14 @@ lstcon_group_decref(lstcon_group_t *grp) for (i = 0; i < LST_NODE_HASHSIZE; i++) LASSERT(list_empty(&grp->grp_ndl_hash[i])); - LIBCFS_FREE(grp, offsetof(lstcon_group_t, + LIBCFS_FREE(grp, offsetof(struct lstcon_group, grp_ndl_hash[LST_NODE_HASHSIZE])); } static int -lstcon_group_find(const char *name, lstcon_group_t **grpp) +lstcon_group_find(const char *name, struct lstcon_group **grpp) { - lstcon_group_t *grp; + struct lstcon_group *grp; list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) @@ -281,8 +282,8 @@ lstcon_group_find(const char *name, lstcon_group_t **grpp) } static int -lstcon_group_ndlink_find(lstcon_group_t *grp, struct lnet_process_id id, - lstcon_ndlink_t **ndlpp, int create) +lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, + struct lstcon_ndlink **ndlpp, int create) { int rc; @@ -300,7 +301,7 @@ lstcon_group_ndlink_find(lstcon_group_t *grp, struct lnet_process_id id, } static void -lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) +lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) { list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -308,8 +309,8 @@ lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) } static void -lstcon_group_ndlink_move(lstcon_group_t *old, - lstcon_group_t *new, lstcon_ndlink_t *ndl) +lstcon_group_ndlink_move(struct lstcon_group *old, + struct lstcon_group *new, struct lstcon_ndlink *ndl) { unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % LST_NODE_HASHSIZE; @@ -326,21 +327,21 @@ lstcon_group_ndlink_move(lstcon_group_t *old, } static void -lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) +lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) { - lstcon_ndlink_t *ndl; + struct lstcon_ndlink 
*ndl; while (!list_empty(&old->grp_ndl_list)) { ndl = list_entry(old->grp_ndl_list.next, - lstcon_ndlink_t, ndl_link); + struct lstcon_ndlink, ndl_link); lstcon_group_ndlink_move(old, new, ndl); } } static int -lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) +lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) { - lstcon_group_t *grp = (lstcon_group_t *)arg; + struct lstcon_group *grp = arg; switch (transop) { case LST_TRANS_SESNEW: @@ -367,10 +368,10 @@ lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) } static int -lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, +lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, struct lstcon_rpc_ent __user *ent_up) { - srpc_debug_reply_t *rep; + struct srpc_debug_reply *rep; switch (transop) { case LST_TRANS_SESNEW: @@ -396,16 +397,17 @@ lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, } static int -lstcon_group_nodes_add(lstcon_group_t *grp, +lstcon_group_nodes_add(struct lstcon_group *grp, int count, struct lnet_process_id __user *ids_up, - unsigned *featp, struct list_head __user *result_up) + unsigned int *featp, + struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_ndlink_t *ndl; - lstcon_group_t *tmp; - struct lnet_process_id id; - int i; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; rc = lstcon_group_alloc(NULL, &tmp); if (rc != 0) { @@ -463,16 +465,16 @@ lstcon_group_nodes_add(lstcon_group_t *grp, } static int -lstcon_group_nodes_remove(lstcon_group_t *grp, +lstcon_group_nodes_remove(struct lstcon_group *grp, int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_ndlink_t *ndl; - lstcon_group_t *tmp; - struct lnet_process_id id; - int rc; - int i; + struct lstcon_rpc_trans *trans; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int rc; + int i; /* End session and remove node from the group */ @@ -520,8 +522,8 @@ lstcon_group_nodes_remove(lstcon_group_t *grp, int lstcon_group_add(char *name) { - lstcon_group_t *grp; - int rc; + struct lstcon_group *grp; + int rc; rc = (lstcon_group_find(name, &grp) == 0)? 
-EEXIST: 0; if (rc != 0) { @@ -545,7 +547,7 @@ int lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, unsigned *featp, struct list_head __user *result_up) { - lstcon_group_t *grp; + struct lstcon_group *grp; int rc; LASSERT (count > 0); @@ -575,9 +577,9 @@ lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, int lstcon_group_del(char *name) { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -616,8 +618,8 @@ lstcon_group_del(char *name) int lstcon_group_clean(char *name, int args) { - lstcon_group_t *grp = NULL; - int rc; + struct lstcon_group *grp = NULL; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -650,8 +652,8 @@ lstcon_nodes_remove(char *name, int count, struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - lstcon_group_t *grp = NULL; - int rc; + struct lstcon_group *grp = NULL; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -679,9 +681,9 @@ lstcon_nodes_remove(char *name, int count, int lstcon_group_refresh(char *name, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -721,7 +723,7 @@ lstcon_group_refresh(char *name, struct list_head __user *result_up) int lstcon_group_list(int index, int len, char __user *name_up) { - lstcon_group_t *grp; + struct lstcon_group *grp; LASSERT(index >= 0); LASSERT(name_up != NULL); @@ -740,10 +742,10 @@ static int lstcon_nodes_getent(struct list_head *head, int *index_p, int *count_p, struct lstcon_node_ent __user *dents_up) { - lstcon_ndlink_t *ndl; - lstcon_node_t *nd; - int count = 0; - int index = 0; + struct lstcon_ndlink *ndl; + struct lstcon_node *nd; + int count = 0; + int index = 0; LASSERT(index_p != NULL && count_p != NULL); LASSERT(dents_up != NULL); @@ -782,9 +784,9 @@ lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, struct lstcon_node_ent __user *dents_up) { struct lstcon_ndlist_ent *gentp; - lstcon_group_t *grp; - lstcon_ndlink_t *ndl; - int rc; + struct lstcon_group *grp; + struct lstcon_ndlink *ndl; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) { @@ -824,9 +826,9 @@ lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, } static int -lstcon_batch_find(const char *name, lstcon_batch_t **batpp) +lstcon_batch_find(const char *name, struct lstcon_batch **batpp) { - lstcon_batch_t *bat; + struct lstcon_batch *bat; list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { @@ -841,9 +843,9 @@ lstcon_batch_find(const char *name, lstcon_batch_t **batpp) int lstcon_batch_add(char *name) { - lstcon_batch_t *bat; - int i; - int rc; + struct lstcon_batch *bat; + int i; + int rc; rc = (lstcon_batch_find(name, &bat) == 0)? 
-EEXIST: 0; if (rc != 0) { @@ -851,17 +853,17 @@ lstcon_batch_add(char *name) return rc; } - LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); + LIBCFS_ALLOC(bat, sizeof(*bat)); if (bat == NULL) { CERROR("Can't allocate descriptor for batch %s\n", name); return -ENOMEM; } - LIBCFS_ALLOC(bat->bat_cli_hash, + LIBCFS_ALLOC(bat->bat_cli_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); - if (bat->bat_cli_hash == NULL) { - CERROR("Can't allocate hash for batch %s\n", name); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(*bat)); return -ENOMEM; } @@ -871,7 +873,7 @@ lstcon_batch_add(char *name) if (bat->bat_srv_hash == NULL) { CERROR("Can't allocate hash for batch %s\n", name); LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + LIBCFS_FREE(bat, sizeof(*bat)); return -ENOMEM; } @@ -879,7 +881,7 @@ lstcon_batch_add(char *name) if (strlen(name) > sizeof(bat->bat_name)-1) { LIBCFS_FREE(bat->bat_srv_hash, LST_NODE_HASHSIZE); LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + LIBCFS_FREE(bat, sizeof(*bat)); return -E2BIG; } strncpy(bat->bat_name, name, sizeof(bat->bat_name)); @@ -907,7 +909,7 @@ lstcon_batch_add(char *name) int lstcon_batch_list(int index, int len, char __user *name_up) { - lstcon_batch_t *bat; + struct lstcon_batch *bat; LASSERT(name_up != NULL); LASSERT(index >= 0); @@ -928,12 +930,12 @@ lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, struct lstcon_node_ent __user *dents_up) { struct lstcon_test_batch_ent *entp; - struct list_head *clilst; - struct list_head *srvlst; - lstcon_test_t *test = NULL; - lstcon_batch_t *bat; - lstcon_ndlink_t *ndl; - int rc; + struct list_head *clilst; + struct list_head *srvlst; + struct lstcon_test *test = NULL; + struct lstcon_batch *bat; + struct lstcon_ndlink *ndl; + int rc; rc = lstcon_batch_find(name, &bat); if (rc != 0) { @@ -996,7 +998,7 @@ lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, } static int -lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) +lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) { switch (transop) { case LST_TRANS_TSBRUN: @@ -1018,10 +1020,10 @@ lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) } static int -lstcon_batch_op(lstcon_batch_t *bat, int transop, +lstcon_batch_op(struct lstcon_batch *bat, int transop, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; + struct lstcon_rpc_trans *trans; int rc; rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, @@ -1044,8 +1046,8 @@ lstcon_batch_op(lstcon_batch_t *bat, int transop, int lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) { - lstcon_batch_t *bat; - int rc; + struct lstcon_batch *bat; + int rc; if (lstcon_batch_find(name, &bat) != 0) { CDEBUG(D_NET, "Can't find batch %s\n", name); @@ -1066,8 +1068,8 @@ lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) int lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) { - lstcon_batch_t *bat; - int rc; + struct lstcon_batch *bat; + int rc; if (lstcon_batch_find(name, &bat) != 0) { CDEBUG(D_NET, "Can't find batch %s\n", name); @@ -1086,17 +1088,17 @@ lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) } static void -lstcon_batch_destroy(lstcon_batch_t *bat) +lstcon_batch_destroy(struct lstcon_batch *bat) { - 
lstcon_ndlink_t *ndl; - lstcon_test_t *test; - int i; + struct lstcon_ndlink *ndl; + struct lstcon_test *test; + int i; list_del(&bat->bat_link); while (!list_empty(&bat->bat_test_list)) { test = list_entry(bat->bat_test_list.next, - lstcon_test_t, tes_link); + struct lstcon_test, tes_link); LASSERT(list_empty(&test->tes_trans_list)); list_del(&test->tes_link); @@ -1104,7 +1106,7 @@ lstcon_batch_destroy(lstcon_batch_t *bat) lstcon_group_decref(test->tes_src_grp); lstcon_group_decref(test->tes_dst_grp); - LIBCFS_FREE(test, offsetof(lstcon_test_t, + LIBCFS_FREE(test, offsetof(struct lstcon_test, tes_param[test->tes_paramlen])); } @@ -1112,7 +1114,7 @@ lstcon_batch_destroy(lstcon_batch_t *bat) while (!list_empty(&bat->bat_cli_list)) { ndl = list_entry(bat->bat_cli_list.next, - lstcon_ndlink_t, ndl_link); + struct lstcon_ndlink, ndl_link); list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -1120,7 +1122,7 @@ lstcon_batch_destroy(lstcon_batch_t *bat) while (!list_empty(&bat->bat_srv_list)) { ndl = list_entry(bat->bat_srv_list.next, - lstcon_ndlink_t, ndl_link); + struct lstcon_ndlink, ndl_link); list_del_init(&ndl->ndl_link); lstcon_ndlink_release(ndl); @@ -1135,19 +1137,18 @@ lstcon_batch_destroy(lstcon_batch_t *bat) sizeof(struct list_head) * LST_NODE_HASHSIZE); LIBCFS_FREE(bat->bat_srv_hash, sizeof(struct list_head) * LST_NODE_HASHSIZE); - LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + LIBCFS_FREE(bat, sizeof(*bat)); } static int -lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) +lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) { - lstcon_test_t *test; - lstcon_batch_t *batch; - lstcon_ndlink_t *ndl; + struct lstcon_test *test = arg; + struct lstcon_batch *batch; + struct lstcon_ndlink *ndl; struct list_head *hash; struct list_head *head; - test = (lstcon_test_t *)arg; LASSERT(test != NULL); batch = test->tes_batch; @@ -1183,12 +1184,13 @@ lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) } static int -lstcon_test_nodes_add(lstcon_test_t *test, struct list_head __user *result_up) +lstcon_test_nodes_add(struct lstcon_test *test, + struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - int transop; - int rc; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + int transop; + int rc; LASSERT (test->tes_src_grp != NULL); LASSERT (test->tes_dst_grp != NULL); @@ -1235,7 +1237,7 @@ lstcon_test_nodes_add(lstcon_test_t *test, struct list_head __user *result_up) } static int -lstcon_verify_batch(const char *name, lstcon_batch_t **batch) +lstcon_verify_batch(const char *name, struct lstcon_batch **batch) { int rc; @@ -1254,10 +1256,10 @@ lstcon_verify_batch(const char *name, lstcon_batch_t **batch) } static int -lstcon_verify_group(const char *name, lstcon_group_t **grp) +lstcon_verify_group(const char *name, struct lstcon_group **grp) { - int rc; - lstcon_ndlink_t *ndl; + int rc; + struct lstcon_ndlink *ndl; rc = lstcon_group_find(name, grp); if (rc != 0) { @@ -1283,11 +1285,11 @@ lstcon_test_add(char *batch_name, int type, int loop, void *param, int paramlen, int *retp, struct list_head __user *result_up) { - lstcon_test_t *test = NULL; - int rc; - lstcon_group_t *src_grp = NULL; - lstcon_group_t *dst_grp = NULL; - lstcon_batch_t *batch = NULL; + struct lstcon_test *test = NULL; + int rc; + struct lstcon_group *src_grp = NULL; + struct lstcon_group *dst_grp = NULL; + struct lstcon_batch *batch = NULL; /* * verify that a batch of the given name exists, and the groups @@ -1309,7 +1311,7 
@@ lstcon_test_add(char *batch_name, int type, int loop, if (dst_grp->grp_userland) *retp = 1; - LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); + LIBCFS_ALLOC(test, offsetof(struct lstcon_test, tes_param[paramlen])); if (!test) { CERROR("Can't allocate test descriptor\n"); rc = -ENOMEM; @@ -1356,7 +1358,8 @@ lstcon_test_add(char *batch_name, int type, int loop, return rc; out: if (test != NULL) - LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); + LIBCFS_FREE(test, offsetof(struct lstcon_test, + tes_param[paramlen])); if (dst_grp != NULL) lstcon_group_decref(dst_grp); @@ -1368,9 +1371,10 @@ lstcon_test_add(char *batch_name, int type, int loop, } static int -lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) +lstcon_test_find(struct lstcon_batch *batch, int idx, + struct lstcon_test **testpp) { - lstcon_test_t *test; + struct lstcon_test *test; list_for_each_entry(test, &batch->bat_test_list, tes_link) { if (idx == test->tes_hdr.tsb_index) { @@ -1383,10 +1387,10 @@ lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) } static int -lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, +lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, struct lstcon_rpc_ent __user *ent_up) { - srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; LASSERT (transop == LST_TRANS_TSBCLIQRY || transop == LST_TRANS_TSBSRVQRY); @@ -1403,14 +1407,14 @@ int lstcon_test_batch_query(char *name, int testidx, int client, int timeout, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - struct list_head *translist; - struct list_head *ndlist; - lstcon_tsb_hdr_t *hdr; - lstcon_batch_t *batch; - lstcon_test_t *test = NULL; - int transop; - int rc; + struct lstcon_rpc_trans *trans; + struct list_head *translist; + struct list_head *ndlist; + struct lstcon_tsb_hdr *hdr; + struct lstcon_batch *batch; + struct lstcon_test *test = NULL; + int transop; + int rc; rc = lstcon_batch_find(name, &batch); if (rc != 0) { @@ -1462,13 +1466,13 @@ lstcon_test_batch_query(char *name, int testidx, int client, } static int -lstcon_statrpc_readent(int transop, srpc_msg_t *msg, +lstcon_statrpc_readent(int transop, struct srpc_msg *msg, struct lstcon_rpc_ent __user *ent_up) { - srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; - struct sfw_counters __user *sfwk_stat; + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; + struct sfw_counters __user *sfwk_stat; struct srpc_counters __user *srpc_stat; - struct lnet_counters __user *lnet_stat; + struct lnet_counters_common __user *lnet_stat; if (rep->str_status != 0) return 0; @@ -1476,7 +1480,7 @@ lstcon_statrpc_readent(int transop, srpc_msg_t *msg, sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; srpc_stat = (struct srpc_counters __user *) ((char __user *)sfwk_stat + sizeof(*sfwk_stat)); - lnet_stat = (struct lnet_counters __user *) + lnet_stat = (struct lnet_counters_common __user *) ((char __user *)srpc_stat + sizeof(*srpc_stat)); if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || @@ -1492,7 +1496,7 @@ lstcon_ndlist_stat(struct list_head *ndlist, int timeout, struct list_head __user *result_up) { struct list_head head; - lstcon_rpc_trans_t *trans; + struct lstcon_rpc_trans *trans; int rc; INIT_LIST_HEAD(&head); @@ -1517,8 +1521,8 @@ int lstcon_group_stat(char *grp_name, int timeout, struct list_head __user *result_up) { - lstcon_group_t *grp; - int rc; + struct lstcon_group *grp; + int rc; rc = 
lstcon_group_find(grp_name, &grp); if (rc != 0) { @@ -1537,11 +1541,11 @@ int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, int timeout, struct list_head __user *result_up) { - lstcon_ndlink_t *ndl; - lstcon_group_t *tmp; - struct lnet_process_id id; - int i; - int rc; + struct lstcon_ndlink *ndl; + struct lstcon_group *tmp; + struct lnet_process_id id; + int i; + int rc; rc = lstcon_group_alloc(NULL, &tmp); if (rc != 0) { @@ -1582,8 +1586,8 @@ lstcon_debug_ndlist(struct list_head *ndlist, struct list_head *translist, int timeout, struct list_head __user *result_up) { - lstcon_rpc_trans_t *trans; - int rc; + struct lstcon_rpc_trans *trans; + int rc; rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, NULL, lstcon_sesrpc_condition, &trans); @@ -1612,8 +1616,8 @@ int lstcon_batch_debug(int timeout, char *name, int client, struct list_head __user *result_up) { - lstcon_batch_t *bat; - int rc; + struct lstcon_batch *bat; + int rc; rc = lstcon_batch_find(name, &bat); if (rc != 0) @@ -1630,8 +1634,8 @@ int lstcon_group_debug(int timeout, char *name, struct list_head __user *result_up) { - lstcon_group_t *grp; - int rc; + struct lstcon_group *grp; + int rc; rc = lstcon_group_find(name, &grp); if (rc != 0) @@ -1645,15 +1649,15 @@ lstcon_group_debug(int timeout, char *name, } int -lstcon_nodes_debug(int timeout, - int count, struct lnet_process_id __user *ids_up, +lstcon_nodes_debug(int timeout, int count, + struct lnet_process_id __user *ids_up, struct list_head __user *result_up) { - struct lnet_process_id id; - lstcon_ndlink_t *ndl; - lstcon_group_t *grp; - int i; - int rc; + struct lnet_process_id id; + struct lstcon_ndlink *ndl; + struct lstcon_group *grp; + int i; + int rc; rc = lstcon_group_alloc(NULL, &grp); if (rc != 0) { @@ -1700,11 +1704,11 @@ lstcon_new_session_id(struct lst_sid *sid) { struct lnet_process_id id; - LASSERT (console_session.ses_state == LST_SESSION_NONE); + LASSERT(console_session.ses_state == LST_SESSION_NONE); - LNetGetId(1, &id); - sid->ses_nid = id.nid; - sid->ses_stamp = cfs_time_current(); + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = div_u64(ktime_get_ns(), NSEC_PER_MSEC); } int @@ -1759,7 +1763,7 @@ lstcon_session_new(char *name, int key, unsigned feats, rc = lstcon_rpc_pinger_start(); if (rc != 0) { - lstcon_batch_t *bat = NULL; + struct lstcon_batch *bat = NULL; lstcon_batch_find(LST_DEFAULT_BATCH, &bat); lstcon_batch_destroy(bat); @@ -1783,8 +1787,8 @@ lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, char __user *name_up, int len) { struct lstcon_ndlist_ent *entp; - lstcon_ndlink_t *ndl; - int rc = 0; + struct lstcon_ndlink *ndl; + int rc = 0; if (console_session.ses_state != LST_SESSION_ACTIVE) return -ESRCH; @@ -1814,10 +1818,10 @@ lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, int lstcon_session_end() { - lstcon_rpc_trans_t *trans; - lstcon_group_t *grp; - lstcon_batch_t *bat; - int rc = 0; + struct lstcon_rpc_trans *trans; + struct lstcon_group *grp; + struct lstcon_batch *bat; + int rc = 0; LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); @@ -1850,7 +1854,7 @@ lstcon_session_end() /* destroy all batches */ while (!list_empty(&console_session.ses_bat_list)) { bat = list_entry(console_session.ses_bat_list.next, - lstcon_batch_t, bat_link); + struct lstcon_batch, bat_link); lstcon_batch_destroy(bat); } @@ -1858,7 +1862,7 @@ lstcon_session_end() /* destroy all groups */ while (!list_empty(&console_session.ses_grp_list)) { grp = 
list_entry(console_session.ses_grp_list.next, - lstcon_group_t, grp_link); + struct lstcon_group, grp_link); LASSERT(grp->grp_ref == 1); lstcon_group_decref(grp); @@ -1906,15 +1910,15 @@ lstcon_session_feats_check(unsigned feats) } static int -lstcon_acceptor_handle (srpc_server_rpc_t *rpc) +lstcon_acceptor_handle(struct srpc_server_rpc *rpc) { - srpc_msg_t *rep = &rpc->srpc_replymsg; - srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; - srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; - srpc_join_reply_t *jrep = &rep->msg_body.join_reply; - lstcon_group_t *grp = NULL; - lstcon_ndlink_t *ndl; - int rc = 0; + struct srpc_msg *rep = &rpc->srpc_replymsg; + struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; + struct srpc_join_reply *jrep = &rep->msg_body.join_reply; + struct lstcon_group *grp = NULL; + struct lstcon_ndlink *ndl; + int rc = 0; sfw_unpack_message(req); @@ -1989,7 +1993,8 @@ lstcon_acceptor_handle (srpc_server_rpc_t *rpc) return rc; } -static srpc_service_t lstcon_acceptor_service; +static struct srpc_service lstcon_acceptor_service; + static void lstcon_init_acceptor_service(void) { /* initialize selftest console acceptor service table */ @@ -1999,9 +2004,9 @@ static void lstcon_init_acceptor_service(void) lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; } -int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); - -DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); +static struct notifier_block lstcon_ioctl_handler = { + .notifier_call = lstcon_ioctl_entry, +}; /* initialize console */ int @@ -2010,8 +2015,6 @@ lstcon_console_init(void) int i; int rc; - memset(&console_session, 0, sizeof(lstcon_session_t)); - console_session.ses_id = LST_INVALID_SID; console_session.ses_state = LST_SESSION_NONE; console_session.ses_timeout = 0; @@ -2019,7 +2022,7 @@ lstcon_console_init(void) console_session.ses_expired = 0; console_session.ses_feats_updated = 0; console_session.ses_features = LST_FEATS_MASK; - console_session.ses_laststamp = cfs_time_current_sec(); + console_session.ses_laststamp = ktime_get_real_seconds(); mutex_init(&console_session.ses_mutex); @@ -2055,12 +2058,12 @@ lstcon_console_init(void) goto out; } - rc = libcfs_register_ioctl(&lstcon_ioctl_handler); - - if (rc == 0) { - lstcon_rpc_module_init(); - return 0; - } + rc = blocking_notifier_chain_register(&libcfs_ioctl_list, + &lstcon_ioctl_handler); + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } out: srpc_shutdown_service(&lstcon_acceptor_service); @@ -2077,9 +2080,10 @@ lstcon_console_init(void) int lstcon_console_fini(void) { - int i; + int i; - libcfs_deregister_ioctl(&lstcon_ioctl_handler); + blocking_notifier_chain_unregister(&libcfs_ioctl_list, + &lstcon_ioctl_handler); mutex_lock(&console_session.ses_mutex); diff --git a/drivers/staging/lustrefsx/lnet/selftest/console.h b/drivers/staging/lustrefsx/lnet/selftest/console.h index ae76a50b4d173..02c76a89627e6 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/console.h +++ b/drivers/staging/lustrefsx/lnet/selftest/console.h @@ -39,29 +39,32 @@ #ifndef __LST_CONSOLE_H__ #define __LST_CONSOLE_H__ +#include #include -#include #include #include "selftest.h" #include "conrpc.h" -typedef struct lstcon_node { - struct lnet_process_id nd_id; /* id of the node */ +/* node descriptor */ +struct lstcon_node { + struct lnet_process_id nd_id; /* id of the node */ int nd_ref; /* reference count */ int nd_state; /* state of the node */ int nd_timeout; /* session timeout 
*/ - cfs_time_t nd_stamp; /* timestamp of last replied RPC */ - struct lstcon_rpc nd_ping; /* ping rpc */ -} lstcon_node_t; /*** node descriptor */ + ktime_t nd_stamp; /* last RPC reply timestamp */ + struct lstcon_rpc nd_ping; /* ping rpc */ +}; -typedef struct { +/* node link descriptor */ +struct lstcon_ndlink { struct list_head ndl_link; /* chain on list */ struct list_head ndl_hlink; /* chain on hash */ - lstcon_node_t *ndl_node; /* pointer to node */ -} lstcon_ndlink_t; /*** node link descriptor */ + struct lstcon_node *ndl_node; /* pointer to node */ +}; -typedef struct { +/* (alias of nodes) group descriptor */ +struct lstcon_group { struct list_head grp_link; /* chain on global group list */ int grp_ref; /* reference count */ int grp_userland; /* has userland nodes */ @@ -71,19 +74,20 @@ typedef struct { struct list_head grp_trans_list; /* transaction list */ struct list_head grp_ndl_list; /* nodes list */ struct list_head grp_ndl_hash[0];/* hash table for nodes */ -} lstcon_group_t; /*** (alias of nodes) group descriptor */ +}; #define LST_BATCH_IDLE 0xB0 /* idle batch */ #define LST_BATCH_RUNNING 0xB1 /* running batch */ -typedef struct lstcon_tsb_hdr { +struct lstcon_tsb_hdr { struct lst_bid tsb_id; /* batch ID */ int tsb_index; /* test index */ -} lstcon_tsb_hdr_t; +}; -typedef struct { +/* (tests ) batch descriptor */ +struct lstcon_batch { /* test_batch header */ - lstcon_tsb_hdr_t bat_hdr; + struct lstcon_tsb_hdr bat_hdr; /* chain on session's batches list */ struct list_head bat_link; /* # of test */ @@ -99,7 +103,7 @@ typedef struct { struct list_head bat_test_list; /* list head of transaction */ struct list_head bat_trans_list; - /* list head of client nodes (lstcon_node_t) */ + /* list head of client nodes (struct lstcon_node) */ struct list_head bat_cli_list; /* hash table of client nodes */ struct list_head *bat_cli_hash; @@ -107,15 +111,16 @@ typedef struct { struct list_head bat_srv_list; /* hash table of server nodes */ struct list_head *bat_srv_hash; -} lstcon_batch_t; /*** (tests ) batch descritptor */ +}; -typedef struct lstcon_test { +/* a single test descriptor */ +struct lstcon_test { /* test batch header */ - lstcon_tsb_hdr_t tes_hdr; + struct lstcon_tsb_hdr tes_hdr; /* chain on batch's tests list */ struct list_head tes_link; /* pointer to batch */ - lstcon_batch_t *tes_batch; + struct lstcon_batch *tes_batch; int tes_type; /* type of the test, i.e: bulk, ping */ int tes_stop_onerr; /* stop on error */ @@ -127,12 +132,12 @@ typedef struct lstcon_test { int tes_cliidx; /* client index, used for RPC creating */ struct list_head tes_trans_list; /* transaction list */ - lstcon_group_t *tes_src_grp; /* group run the test */ - lstcon_group_t *tes_dst_grp; /* target group */ + struct lstcon_group *tes_src_grp; /* group run the test */ + struct lstcon_group *tes_dst_grp; /* target group */ int tes_paramlen; /* test parameter length */ char tes_param[0]; /* test parameter */ -} lstcon_test_t; /*** a single test descriptor */ +}; #define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ #define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ @@ -142,13 +147,13 @@ typedef struct lstcon_test { #define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ -typedef struct { +struct lstcon_session { struct mutex ses_mutex; /* only 1 thread in session */ - struct lst_sid ses_id; /* global session id */ + struct lst_sid ses_id; /* global session id */ int ses_key; /* local session key */ int ses_state; /* state of session */ int ses_timeout; /* 
timeout in seconds */ - time_t ses_laststamp; /* last operation stamp (seconds) */ + time64_t ses_laststamp; /* last operation stamp (seconds) */ /** tests features of the session */ unsigned ses_features; /** features are synced with remote test nodes */ @@ -161,9 +166,9 @@ typedef struct { unsigned ses_expired:1; __u64 ses_id_cookie; /* batch id cookie */ char ses_name[LST_NAME_SIZE]; /* session name */ - lstcon_rpc_trans_t *ses_ping; /* session pinger */ - stt_timer_t ses_ping_timer; /* timer for pinger */ - struct lstcon_trans_stat ses_trans_stat; /* transaction stats */ + struct lstcon_rpc_trans *ses_ping; /* session pinger */ + struct stt_timer ses_ping_timer; /* timer for pinger */ + struct lstcon_trans_stat ses_trans_stat;/* transaction stats */ struct list_head ses_trans_list; /* global list of transaction */ struct list_head ses_grp_list; /* global list of groups */ @@ -174,9 +179,9 @@ typedef struct { spinlock_t ses_rpc_lock; /* serialize */ atomic_t ses_rpc_counter;/* # of initialized RPCs */ struct list_head ses_rpc_freelist;/* idle console rpc */ -} lstcon_session_t; /*** session descriptor */ +}; /* session descriptor */ -extern lstcon_session_t console_session; +extern struct lstcon_session console_session; static inline struct lstcon_trans_stat * lstcon_trans_stat(void) @@ -250,6 +255,8 @@ extern int lstcon_test_add(char *batch_name, int type, int loop, void *param, int paramlen, int *retp, struct list_head __user *result_up); +int lstcon_ioctl_entry(struct notifier_block *nb, + unsigned long cmd, void *vdata); int lstcon_console_init(void); int lstcon_console_fini(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/framework.c b/drivers/staging/lustrefsx/lnet/selftest/framework.c index b5d430dde00d1..000fca9d34e33 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/framework.c +++ b/drivers/staging/lustrefsx/lnet/selftest/framework.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -51,49 +51,49 @@ MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never #define sfw_unpack_id(id) \ do { \ - __swab64s(&(id).nid); \ - __swab32s(&(id).pid); \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ } while (0) #define sfw_unpack_sid(sid) \ do { \ - __swab64s(&(sid).ses_nid); \ - __swab64s(&(sid).ses_stamp); \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ } while (0) #define sfw_unpack_fw_counters(fc) \ do { \ - __swab32s(&(fc).running_ms); \ - __swab32s(&(fc).active_batches); \ - __swab32s(&(fc).zombie_sessions); \ - __swab32s(&(fc).brw_errors); \ - __swab32s(&(fc).ping_errors); \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ } while (0) #define sfw_unpack_rpc_counters(rc) \ do { \ - __swab32s(&(rc).errors); \ - __swab32s(&(rc).rpcs_sent); \ - __swab32s(&(rc).rpcs_rcvd); \ - __swab32s(&(rc).rpcs_dropped); \ - __swab32s(&(rc).rpcs_expired); \ - __swab64s(&(rc).bulk_get); \ - __swab64s(&(rc).bulk_put); \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ } while (0) #define sfw_unpack_lnet_counters(lc) \ do { \ - __swab32s(&(lc).errors); \ - __swab32s(&(lc).msgs_max); \ - __swab32s(&(lc).msgs_alloc); \ - __swab32s(&(lc).send_count); \ - __swab32s(&(lc).recv_count); \ - __swab32s(&(lc).drop_count); \ - __swab32s(&(lc).route_count); \ - __swab64s(&(lc).send_length); \ - __swab64s(&(lc).recv_length); \ - __swab64s(&(lc).drop_length); \ - __swab64s(&(lc).route_length); \ + __swab32s(&(lc).lcc_errors); \ + __swab32s(&(lc).lcc_msgs_max); \ + __swab32s(&(lc).lcc_msgs_alloc); \ + __swab32s(&(lc).lcc_send_count); \ + __swab32s(&(lc).lcc_recv_count); \ + __swab32s(&(lc).lcc_drop_count); \ + __swab32s(&(lc).lcc_route_count); \ + __swab64s(&(lc).lcc_send_length); \ + __swab64s(&(lc).lcc_recv_length); \ + __swab64s(&(lc).lcc_drop_length); \ + __swab64s(&(lc).lcc_route_length); \ } while (0) #define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) @@ -111,21 +111,21 @@ static struct smoketest_framework { /* serialise */ spinlock_t fw_lock; /* _the_ session */ - sfw_session_t *fw_session; + struct sfw_session *fw_session; /* shutdown in progress */ int fw_shuttingdown; /* running RPC */ - srpc_server_rpc_t *fw_active_srpc; + struct srpc_server_rpc *fw_active_srpc; } sfw_data; /* forward ref's */ -int sfw_stop_batch (sfw_batch_t *tsb, int force); -void sfw_destroy_session (sfw_session_t *sn); +static int sfw_stop_batch(struct sfw_batch *tsb, int force); +static void sfw_destroy_session(struct sfw_session *sn); -static inline sfw_test_case_t * +static inline struct sfw_test_case * sfw_find_test_case(int id) { - sfw_test_case_t *tsc; + struct sfw_test_case *tsc; LASSERT(id <= SRPC_SERVICE_MAX_ID); LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); @@ -139,9 +139,10 @@ sfw_find_test_case(int id) } static int -sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) +sfw_register_test(struct srpc_service *service, + struct sfw_test_client_ops *cliops) { - sfw_test_case_t *tsc; + struct sfw_test_case *tsc; if (sfw_find_test_case(service->sv_id) != NULL) { CERROR ("Failed to register test %s (%d)\n", @@ -149,7 +150,7 @@ sfw_register_test (srpc_service_t *service, 
sfw_test_client_ops_t *cliops) return -EEXIST; } - LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); + LIBCFS_ALLOC(tsc, sizeof(*tsc)); if (tsc == NULL) return -ENOMEM; @@ -163,8 +164,8 @@ sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) static void sfw_add_session_timer (void) { - sfw_session_t *sn = sfw_data.fw_session; - stt_timer_t *timer = &sn->sn_timer; + struct sfw_session *sn = sfw_data.fw_session; + struct stt_timer *timer = &sn->sn_timer; LASSERT (!sfw_data.fw_shuttingdown); @@ -174,8 +175,7 @@ sfw_add_session_timer (void) LASSERT (!sn->sn_timer_active); sn->sn_timer_active = 1; - timer->stt_expires = cfs_time_add(sn->sn_timeout, - cfs_time_current_sec()); + timer->stt_expires = ktime_get_real_seconds()+ sn->sn_timeout; stt_add_timer(timer); return; } @@ -183,7 +183,7 @@ sfw_add_session_timer (void) static int sfw_del_session_timer (void) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; if (sn == NULL || !sn->sn_timer_active) return 0; @@ -203,10 +203,10 @@ static void sfw_deactivate_session (void) __must_hold(&sfw_data.fw_lock) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; int nactive = 0; - sfw_batch_t *tsb; - sfw_test_case_t *tsc; + struct sfw_batch *tsb; + struct sfw_test_case *tsc; if (sn == NULL) return; @@ -246,7 +246,7 @@ __must_hold(&sfw_data.fw_lock) static void sfw_session_expired (void *data) { - sfw_session_t *sn = data; + struct sfw_session *sn = data; spin_lock(&sfw_data.fw_lock); @@ -264,12 +264,12 @@ sfw_session_expired (void *data) } static inline void -sfw_init_session(sfw_session_t *sn, struct lst_sid sid, +sfw_init_session(struct sfw_session *sn, struct lst_sid sid, unsigned features, const char *name) { - stt_timer_t *timer = &sn->sn_timer; + struct stt_timer *timer = &sn->sn_timer; - memset(sn, 0, sizeof(sfw_session_t)); + memset(sn, 0, sizeof(struct sfw_session)); INIT_LIST_HEAD(&sn->sn_list); INIT_LIST_HEAD(&sn->sn_batches); atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ @@ -277,14 +277,14 @@ sfw_init_session(sfw_session_t *sn, struct lst_sid sid, atomic_set(&sn->sn_ping_errors, 0); strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); - sn->sn_timer_active = 0; - sn->sn_id = sid; - sn->sn_features = features; - sn->sn_timeout = session_timeout; - sn->sn_started = cfs_time_current(); + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = ktime_get(); - timer->stt_data = sn; - timer->stt_func = sfw_session_expired; + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; INIT_LIST_HEAD(&timer->stt_list); } @@ -308,7 +308,7 @@ sfw_server_rpc_done(struct srpc_server_rpc *rpc) } static void -sfw_client_rpc_fini (srpc_client_rpc_t *rpc) +sfw_client_rpc_fini(struct srpc_client_rpc *rpc) { LASSERT(rpc->crpc_bulk.bk_niov == 0); LASSERT(list_empty(&rpc->crpc_list)); @@ -329,11 +329,11 @@ sfw_client_rpc_fini (srpc_client_rpc_t *rpc) spin_unlock(&sfw_data.fw_lock); } -static sfw_batch_t * +static struct sfw_batch * sfw_find_batch(struct lst_bid bid) { - sfw_session_t *sn = sfw_data.fw_session; - sfw_batch_t *bat; + struct sfw_session *sn = sfw_data.fw_session; + struct sfw_batch *bat; LASSERT(sn != NULL); @@ -345,11 +345,11 @@ sfw_find_batch(struct lst_bid bid) return NULL; } -static sfw_batch_t * +static struct sfw_batch * sfw_bid2batch(struct lst_bid bid) { - sfw_session_t *sn = sfw_data.fw_session; - sfw_batch_t *bat; + struct sfw_session *sn = sfw_data.fw_session; + 
struct sfw_batch *bat; LASSERT (sn != NULL); @@ -357,7 +357,7 @@ sfw_bid2batch(struct lst_bid bid) if (bat != NULL) return bat; - LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); + LIBCFS_ALLOC(bat, sizeof(*bat)); if (bat == NULL) return NULL; @@ -372,11 +372,11 @@ sfw_bid2batch(struct lst_bid bid) } static int -sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) +sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; struct sfw_counters *cnt = &reply->str_fw; - sfw_batch_t *bat; + struct sfw_batch *bat; reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -390,14 +390,14 @@ sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) return 0; } - lnet_counters_get(&reply->str_lnet); + lnet_counters_get_common(&reply->str_lnet); srpc_get_counters(&reply->str_rpc); /* send over the msecs since the session was started - with 32 bits to send, this is ~49 days */ - cnt->running_ms = jiffies_to_msecs(jiffies - sn->sn_started); - cnt->brw_errors = atomic_read(&sn->sn_brw_errors); - cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->running_ms = ktime_ms_delta(ktime_get(), sn->sn_started); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); cnt->active_batches = 0; @@ -411,12 +411,12 @@ sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) } int -sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) +sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; - srpc_msg_t *msg = container_of(request, srpc_msg_t, - msg_body.mksn_reqst); - int cplen = 0; + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_msg *msg = container_of(request, struct srpc_msg, + msg_body.mksn_reqst); + int cplen = 0; if (request->mksn_sid.ses_nid == LNET_NID_ANY) { reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -446,7 +446,7 @@ sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) /* reject the request if it requires unknown features * NB: old version will always accept all features because it's not - * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also + * aware of struct srpc_msg::msg_ses_feats, it's a defect but it's also * harmless because it will return zero feature to console, and it's * console's responsibility to make sure all nodes in a session have * same feature mask. */ @@ -456,7 +456,7 @@ sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) } /* brand new or create by force */ - LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); + LIBCFS_ALLOC(sn, sizeof(*sn)); if (sn == NULL) { CERROR("dropping RPC mksn under memory pressure\n"); return -ENOMEM; @@ -480,9 +480,10 @@ sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) } static int -sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) +sfw_remove_session(struct srpc_rmsn_reqst *request, + struct srpc_rmsn_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; reply->rmsn_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; @@ -512,9 +513,10 @@ sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) } static int -sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) +sfw_debug_session(struct srpc_debug_reqst *request, + struct srpc_debug_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; if (sn == NULL) { reply->dbg_status = ESRCH; @@ -533,10 +535,10 @@ sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) } static void -sfw_test_rpc_fini (srpc_client_rpc_t *rpc) +sfw_test_rpc_fini(struct srpc_client_rpc *rpc) { - sfw_test_unit_t *tsu = rpc->crpc_priv; - sfw_test_instance_t *tsi = tsu->tsu_instance; + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; /* Called with hold of tsi->tsi_lock */ LASSERT(list_empty(&rpc->crpc_list)); @@ -544,7 +546,7 @@ sfw_test_rpc_fini (srpc_client_rpc_t *rpc) } static inline int -sfw_test_buffers(sfw_test_instance_t *tsi) +sfw_test_buffers(struct sfw_test_instance *tsi) { struct sfw_test_case *tsc; struct srpc_service *svc; @@ -618,10 +620,10 @@ sfw_unload_test(struct sfw_test_instance *tsi) } static void -sfw_destroy_test_instance (sfw_test_instance_t *tsi) +sfw_destroy_test_instance(struct sfw_test_instance *tsi) { - srpc_client_rpc_t *rpc; - sfw_test_unit_t *tsu; + struct srpc_client_rpc *rpc; + struct sfw_test_unit *tsu; if (!tsi->tsi_is_client) goto clean; @@ -633,14 +635,14 @@ sfw_destroy_test_instance (sfw_test_instance_t *tsi) while (!list_empty(&tsi->tsi_units)) { tsu = list_entry(tsi->tsi_units.next, - sfw_test_unit_t, tsu_list); + struct sfw_test_unit, tsu_list); list_del(&tsu->tsu_list); LIBCFS_FREE(tsu, sizeof(*tsu)); } while (!list_empty(&tsi->tsi_free_rpcs)) { rpc = list_entry(tsi->tsi_free_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); list_del(&rpc->crpc_list); LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); } @@ -652,35 +654,35 @@ sfw_destroy_test_instance (sfw_test_instance_t *tsi) } static void -sfw_destroy_batch (sfw_batch_t *tsb) +sfw_destroy_batch(struct sfw_batch *tsb) { - sfw_test_instance_t *tsi; + struct sfw_test_instance *tsi; LASSERT(!sfw_batch_active(tsb)); LASSERT(list_empty(&tsb->bat_list)); while (!list_empty(&tsb->bat_tests)) { tsi = list_entry(tsb->bat_tests.next, - sfw_test_instance_t, tsi_list); + struct sfw_test_instance, tsi_list); list_del_init(&tsi->tsi_list); sfw_destroy_test_instance(tsi); } - LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); + LIBCFS_FREE(tsb, sizeof(*tsb)); return; } -void -sfw_destroy_session (sfw_session_t *sn) +static void +sfw_destroy_session(struct sfw_session *sn) { - sfw_batch_t *batch; + struct sfw_batch *batch; LASSERT(list_empty(&sn->sn_list)); LASSERT(sn != sfw_data.fw_session); while (!list_empty(&sn->sn_batches)) { batch = list_entry(sn->sn_batches.next, - sfw_batch_t, bat_list); + struct sfw_batch, bat_list); list_del_init(&batch->bat_list); sfw_destroy_batch(batch); } @@ -691,9 +693,9 @@ sfw_destroy_session (sfw_session_t *sn) } static void -sfw_unpack_addtest_req(srpc_msg_t *msg) +sfw_unpack_addtest_req(struct srpc_msg *msg) { - srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); LASSERT (req->tsr_is_client); @@ -705,14 +707,14 @@ sfw_unpack_addtest_req(srpc_msg_t *msg) if (req->tsr_service == SRPC_SERVICE_BRW) { if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { - test_bulk_req_t *bulk = 
&req->tsr_u.bulk_v0; + struct test_bulk_req *bulk = &req->tsr_u.bulk_v0; __swab32s(&bulk->blk_opc); __swab32s(&bulk->blk_npg); __swab32s(&bulk->blk_flags); } else { - test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; + struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; __swab16s(&bulk->blk_opc); __swab16s(&bulk->blk_flags); @@ -724,7 +726,7 @@ sfw_unpack_addtest_req(srpc_msg_t *msg) } if (req->tsr_service == SRPC_SERVICE_PING) { - test_ping_req_t *ping = &req->tsr_u.ping; + struct test_ping_req *ping = &req->tsr_u.ping; __swab32s(&ping->png_size); __swab32s(&ping->png_flags); @@ -736,16 +738,16 @@ sfw_unpack_addtest_req(srpc_msg_t *msg) } static int -sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) +sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) { - srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; - srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; - srpc_bulk_t *bk = rpc->srpc_bulk; - int ndest = req->tsr_ndest; - sfw_test_unit_t *tsu; - sfw_test_instance_t *tsi; - int i; - int rc; + struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; + struct srpc_bulk *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; + int i; + int rc; LIBCFS_ALLOC(tsi, sizeof(*tsi)); if (tsi == NULL) { @@ -802,7 +804,7 @@ sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) sfw_unpack_id(id); for (j = 0; j < tsi->tsi_concur; j++) { - LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); + LIBCFS_ALLOC(tsu, sizeof(*tsu)); if (tsu == NULL) { rc = -ENOMEM; CERROR ("Can't allocate tsu for %d\n", @@ -831,11 +833,11 @@ sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) } static void -sfw_test_unit_done (sfw_test_unit_t *tsu) +sfw_test_unit_done(struct sfw_test_unit *tsu) { - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_batch_t *tsb = tsi->tsi_batch; - sfw_session_t *sn = tsb->bat_session; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_batch *tsb = tsi->tsi_batch; + struct sfw_session *sn = tsb->bat_session; LASSERT (sfw_test_active(tsi)); @@ -874,10 +876,10 @@ sfw_test_unit_done (sfw_test_unit_t *tsu) } static void -sfw_test_rpc_done (srpc_client_rpc_t *rpc) +sfw_test_rpc_done(struct srpc_client_rpc *rpc) { - sfw_test_unit_t *tsu = rpc->crpc_priv; - sfw_test_instance_t *tsi = tsu->tsu_instance; + struct sfw_test_unit *tsu = rpc->crpc_priv; + struct sfw_test_instance *tsi = tsu->tsu_instance; int done = 0; tsi->tsi_ops->tso_done_rpc(tsu, rpc); @@ -910,12 +912,12 @@ sfw_test_rpc_done (srpc_client_rpc_t *rpc) } int -sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, +sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, unsigned features, int nblk, int blklen, - srpc_client_rpc_t **rpcpp) + struct srpc_client_rpc **rpcpp) { - srpc_client_rpc_t *rpc = NULL; - sfw_test_instance_t *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; + struct sfw_test_instance *tsi = tsu->tsu_instance; spin_lock(&tsi->tsi_lock); @@ -924,7 +926,7 @@ sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, if (!list_empty(&tsi->tsi_free_rpcs)) { /* pick request from buffer */ rpc = list_entry(tsi->tsi_free_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); LASSERT(nblk == rpc->crpc_bulk.bk_niov); list_del_init(&rpc->crpc_list); } @@ -953,11 +955,11 @@ sfw_create_test_rpc(sfw_test_unit_t *tsu, struct lnet_process_id peer, } static int -sfw_run_test 
(swi_workitem_t *wi) +sfw_run_test(struct swi_workitem *wi) { - sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; - sfw_test_instance_t *tsi = tsu->tsu_instance; - srpc_client_rpc_t *rpc = NULL; + struct sfw_test_unit *tsu = wi->swi_workitem.wi_data; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct srpc_client_rpc *rpc = NULL; LASSERT (wi == &tsu->tsu_worker); @@ -1002,11 +1004,11 @@ sfw_run_test (swi_workitem_t *wi) } static int -sfw_run_batch (sfw_batch_t *tsb) +sfw_run_batch(struct sfw_batch *tsb) { - swi_workitem_t *wi; - sfw_test_unit_t *tsu; - sfw_test_instance_t *tsi; + struct swi_workitem *wi; + struct sfw_test_unit *tsu; + struct sfw_test_instance *tsi; if (sfw_batch_active(tsb)) { CDEBUG(D_NET, "Batch already active: %llu (%d)\n", @@ -1038,11 +1040,11 @@ sfw_run_batch (sfw_batch_t *tsb) return 0; } -int -sfw_stop_batch (sfw_batch_t *tsb, int force) +static int +sfw_stop_batch(struct sfw_batch *tsb, int force) { - sfw_test_instance_t *tsi; - srpc_client_rpc_t *rpc; + struct sfw_test_instance *tsi; + struct srpc_client_rpc *rpc; if (!sfw_batch_active(tsb)) { CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); @@ -1081,9 +1083,10 @@ sfw_stop_batch (sfw_batch_t *tsb, int force) } static int -sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) +sfw_query_batch(struct sfw_batch *tsb, int testidx, + struct srpc_batch_reply *reply) { - sfw_test_instance_t *tsi; + struct sfw_test_instance *tsi; if (testidx < 0) return -EINVAL; @@ -1105,7 +1108,7 @@ sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) } void -sfw_free_pages (srpc_server_rpc_t *rpc) +sfw_free_pages(struct srpc_server_rpc *rpc) { srpc_free_bulk(rpc->srpc_bulk); rpc->srpc_bulk = NULL; @@ -1126,13 +1129,13 @@ sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, } static int -sfw_add_test (srpc_server_rpc_t *rpc) +sfw_add_test(struct srpc_server_rpc *rpc) { - sfw_session_t *sn = sfw_data.fw_session; - srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; - srpc_test_reqst_t *request; - int rc; - sfw_batch_t *bat; + struct sfw_session *sn = sfw_data.fw_session; + struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + struct srpc_test_reqst *request; + int rc; + struct sfw_batch *bat; request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; @@ -1196,11 +1199,12 @@ sfw_add_test (srpc_server_rpc_t *rpc) } static int -sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) +sfw_control_batch(struct srpc_batch_reqst *request, + struct srpc_batch_reply *reply) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; int rc = 0; - sfw_batch_t *bat; + struct sfw_batch *bat; reply->bar_sid = (sn == NULL) ? 
LST_INVALID_SID : sn->sn_id; @@ -1240,8 +1244,8 @@ static int sfw_handle_server_rpc(struct srpc_server_rpc *rpc) { struct srpc_service *sv = rpc->srpc_scd->scd_svc; - srpc_msg_t *reply = &rpc->srpc_replymsg; - srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *reply = &rpc->srpc_replymsg; + struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; unsigned features = LST_FEATS_MASK; int rc = 0; @@ -1274,7 +1278,7 @@ sfw_handle_server_rpc(struct srpc_server_rpc *rpc) if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && sv->sv_id != SRPC_SERVICE_DEBUG) { - sfw_session_t *sn = sfw_data.fw_session; + struct sfw_session *sn = sfw_data.fw_session; if (sn != NULL && sn->sn_features != request->msg_ses_feats) { @@ -1390,12 +1394,12 @@ sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) return rc; } -srpc_client_rpc_t * +struct srpc_client_rpc * sfw_create_rpc(struct lnet_process_id peer, int service, unsigned features, int nbulkiov, int bulklen, - void (*done)(srpc_client_rpc_t *), void *priv) + void (*done)(struct srpc_client_rpc *), void *priv) { - srpc_client_rpc_t *rpc = NULL; + struct srpc_client_rpc *rpc = NULL; spin_lock(&sfw_data.fw_lock); @@ -1404,7 +1408,7 @@ sfw_create_rpc(struct lnet_process_id peer, int service, if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); list_del(&rpc->crpc_list); srpc_init_client_rpc(rpc, peer, service, 0, 0, @@ -1428,7 +1432,7 @@ sfw_create_rpc(struct lnet_process_id peer, int service, } void -sfw_unpack_message (srpc_msg_t *msg) +sfw_unpack_message(struct srpc_msg *msg) { if (msg->msg_magic == SRPC_MSG_MAGIC) return; /* no flipping needed */ @@ -1437,7 +1441,7 @@ sfw_unpack_message (srpc_msg_t *msg) LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); if (msg->msg_type == SRPC_MSG_STAT_REQST) { - srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; + struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; __swab32s(&req->str_type); __swab64s(&req->str_rpyid); @@ -1446,7 +1450,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_STAT_REPLY) { - srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; __swab32s(&rep->str_status); sfw_unpack_sid(rep->str_sid); @@ -1457,7 +1461,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_MKSN_REQST) { - srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; + struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; __swab64s(&req->mksn_rpyid); __swab32s(&req->mksn_force); @@ -1466,7 +1470,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { - srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; + struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; __swab32s(&rep->mksn_status); __swab32s(&rep->mksn_timeout); @@ -1475,7 +1479,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_RMSN_REQST) { - srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; + struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; __swab64s(&req->rmsn_rpyid); sfw_unpack_sid(req->rmsn_sid); @@ -1483,7 +1487,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { - srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; + struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; __swab32s(&rep->rmsn_status); sfw_unpack_sid(rep->rmsn_sid); @@ -1491,7 +1495,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == 
SRPC_MSG_DEBUG_REQST) { - srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; + struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; __swab64s(&req->dbg_rpyid); __swab32s(&req->dbg_flags); @@ -1500,7 +1504,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { - srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; + struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; __swab32s(&rep->dbg_nbatch); __swab32s(&rep->dbg_timeout); @@ -1509,7 +1513,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_BATCH_REQST) { - srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; + struct srpc_batch_reqst *req = &msg->msg_body.bat_reqst; __swab32s(&req->bar_opc); __swab64s(&req->bar_rpyid); @@ -1521,7 +1525,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { - srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; __swab32s(&rep->bar_status); sfw_unpack_sid(rep->bar_sid); @@ -1529,7 +1533,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_TEST_REQST) { - srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; __swab64s(&req->tsr_rpyid); __swab64s(&req->tsr_bulkid); @@ -1543,7 +1547,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_TEST_REPLY) { - srpc_test_reply_t *rep = &msg->msg_body.tes_reply; + struct srpc_test_reply *rep = &msg->msg_body.tes_reply; __swab32s(&rep->tsr_status); sfw_unpack_sid(rep->tsr_sid); @@ -1551,7 +1555,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_JOIN_REQST) { - srpc_join_reqst_t *req = &msg->msg_body.join_reqst; + struct srpc_join_reqst *req = &msg->msg_body.join_reqst; __swab64s(&req->join_rpyid); sfw_unpack_sid(req->join_sid); @@ -1559,7 +1563,7 @@ sfw_unpack_message (srpc_msg_t *msg) } if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { - srpc_join_reply_t *rep = &msg->msg_body.join_reply; + struct srpc_join_reply *rep = &msg->msg_body.join_reply; __swab32s(&rep->join_status); __swab32s(&rep->join_timeout); @@ -1572,7 +1576,7 @@ sfw_unpack_message (srpc_msg_t *msg) } void -sfw_abort_rpc (srpc_client_rpc_t *rpc) +sfw_abort_rpc(struct srpc_client_rpc *rpc) { LASSERT(atomic_read(&rpc->crpc_refcount) > 0); LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); @@ -1584,7 +1588,7 @@ sfw_abort_rpc (srpc_client_rpc_t *rpc) } void -sfw_post_rpc (srpc_client_rpc_t *rpc) +sfw_post_rpc(struct srpc_client_rpc *rpc) { spin_lock(&rpc->crpc_lock); @@ -1600,44 +1604,14 @@ sfw_post_rpc (srpc_client_rpc_t *rpc) return; } -static srpc_service_t sfw_services[] = -{ - { - /* sv_id */ SRPC_SERVICE_DEBUG, - /* sv_name */ "debug", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_QUERY_STAT, - /* sv_name */ "query stats", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_MAKE_SESSION, - /* sv_name */ "make session", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, - /* sv_name */ "remove session", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_BATCH, - /* sv_name */ "batch service", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_TEST, - /* sv_name */ "test service", - 0 - }, - { - /* sv_id */ 0, - /* sv_name */ NULL, - 0 - } -}; +static struct srpc_service sfw_services[] = { + { .sv_id = SRPC_SERVICE_DEBUG, .sv_name = "debug", }, + { .sv_id = SRPC_SERVICE_QUERY_STAT, .sv_name = "query stats", }, + { .sv_id = SRPC_SERVICE_MAKE_SESSION, .sv_name = "make session", }, + { .sv_id = SRPC_SERVICE_REMOVE_SESSION, .sv_name = "remove session", }, + { .sv_id = 
SRPC_SERVICE_BATCH, .sv_name = "batch service", }, + { .sv_id = SRPC_SERVICE_TEST, .sv_name = "test service", }, + { .sv_id = 0, } }; int sfw_startup (void) @@ -1645,8 +1619,8 @@ sfw_startup (void) int i; int rc; int error; - srpc_service_t *sv; - sfw_test_case_t *tsc; + struct srpc_service *sv; + struct sfw_test_case *tsc; if (session_timeout < 0) { @@ -1740,8 +1714,8 @@ sfw_startup (void) void sfw_shutdown (void) { - srpc_service_t *sv; - sfw_test_case_t *tsc; + struct srpc_service *sv; + struct sfw_test_case *tsc; int i; spin_lock(&sfw_data.fw_lock); @@ -1778,10 +1752,10 @@ sfw_shutdown (void) } while (!list_empty(&sfw_data.fw_zombie_rpcs)) { - srpc_client_rpc_t *rpc; + struct srpc_client_rpc *rpc; rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - srpc_client_rpc_t, crpc_list); + struct srpc_client_rpc, crpc_list); list_del(&rpc->crpc_list); LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); @@ -1797,7 +1771,7 @@ sfw_shutdown (void) while (!list_empty(&sfw_data.fw_tests)) { tsc = list_entry(sfw_data.fw_tests.next, - sfw_test_case_t, tsc_list); + struct sfw_test_case, tsc_list); srpc_wait_service_shutdown(tsc->tsc_srv_service); diff --git a/drivers/staging/lustrefsx/lnet/selftest/module.c b/drivers/staging/lustrefsx/lnet/selftest/module.c index 56212a840dcc4..e0baadb6b9202 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/module.c +++ b/drivers/staging/lustrefsx/lnet/selftest/module.c @@ -52,61 +52,58 @@ struct cfs_wi_sched **lst_sched_test; static void lnet_selftest_exit(void) { - int i; - - switch (lst_init_step) { - case LST_INIT_CONSOLE: - lstcon_console_fini(); - /* Fall through */ - case LST_INIT_FW: - sfw_shutdown(); - /* Fall through */ - case LST_INIT_RPC: - srpc_shutdown(); - /* Fall through */ - case LST_INIT_WI_TEST: - for (i = 0; - i < cfs_cpt_number(lnet_cpt_table()); i++) { - if (lst_sched_test[i] == NULL) - continue; - cfs_wi_sched_destroy(lst_sched_test[i]); - } - LIBCFS_FREE(lst_sched_test, - sizeof(lst_sched_test[0]) * - cfs_cpt_number(lnet_cpt_table())); - lst_sched_test = NULL; - /* Fall through */ - - case LST_INIT_WI_SERIAL: - cfs_wi_sched_destroy(lst_sched_serial); - lst_sched_serial = NULL; - /* Fall through */ - case LST_INIT_NONE: - break; - /* Fall through */ - default: - LBUG(); - } - return; + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + fallthrough; + case LST_INIT_FW: + sfw_shutdown(); + fallthrough; + case LST_INIT_RPC: + srpc_shutdown(); + fallthrough; + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + fallthrough; + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + fallthrough; + case LST_INIT_NONE: + break; + default: + LBUG(); + } } void lnet_selftest_structure_assertion(void) { - CLASSERT(sizeof(srpc_msg_t) == 160); - CLASSERT(sizeof(srpc_test_reqst_t) == 70); - CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72); - CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78); - CLASSERT(sizeof(srpc_stat_reply_t) == 136); - CLASSERT(sizeof(srpc_stat_reqst_t) == 28); + CLASSERT(sizeof(struct srpc_msg) == 160); + CLASSERT(sizeof(struct srpc_test_reqst) == 70); + CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) == 72); + CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) 
== 78); + CLASSERT(sizeof(struct srpc_stat_reply) == 136); + CLASSERT(sizeof(struct srpc_stat_reqst) == 28); } static int __init lnet_selftest_init(void) { - int nscheds; - int rc; - int i; + int nscheds; + int rc; + int i; rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, 1, &lst_sched_serial); @@ -130,31 +127,31 @@ lnet_selftest_init(void) rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, nthrs, &lst_sched_test[i]); if (rc != 0) { - CERROR("Failed to create CPU partition affinity WI " - "scheduler %d for LST\n", i); + CERROR("Failed to create CPU partition affinity WI scheduler %d for LST\n", + i); goto error; } } - rc = srpc_startup(); - if (rc != 0) { - CERROR("LST can't startup rpc\n"); - goto error; - } - lst_init_step = LST_INIT_RPC; - - rc = sfw_startup(); - if (rc != 0) { - CERROR("LST can't startup framework\n"); - goto error; - } - lst_init_step = LST_INIT_FW; - - rc = lstcon_console_init(); - if (rc != 0) { - CERROR("LST can't startup console\n"); - goto error; - } + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } lst_init_step = LST_INIT_CONSOLE; return 0; error: diff --git a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c index ea2076103c756..2d1403b34c7bc 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/ping_test.c +++ b/drivers/staging/lustrefsx/lnet/selftest/ping_test.c @@ -44,17 +44,17 @@ static int ping_srv_workitems = SFW_TEST_WI_MAX; module_param(ping_srv_workitems, int, 0644); MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); -typedef struct { +struct lst_ping_data { spinlock_t pnd_lock; /* serialize */ int pnd_counter; /* sequence counter */ -} lst_ping_data_t; +}; -static lst_ping_data_t lst_ping_data; +static struct lst_ping_data lst_ping_data; static int -ping_client_init(sfw_test_instance_t *tsi) +ping_client_init(struct sfw_test_instance *tsi) { - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct sfw_session *sn = tsi->tsi_batch->bat_session; LASSERT(tsi->tsi_is_client); LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); @@ -66,9 +66,9 @@ ping_client_init(sfw_test_instance_t *tsi) } static void -ping_client_fini (sfw_test_instance_t *tsi) +ping_client_fini(struct sfw_test_instance *tsi) { - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct sfw_session *sn = tsi->tsi_batch->bat_session; int errors; LASSERT (sn != NULL); @@ -82,14 +82,14 @@ ping_client_fini (sfw_test_instance_t *tsi) } static int -ping_client_prep_rpc(sfw_test_unit_t *tsu, - struct lnet_process_id dest, srpc_client_rpc_t **rpc) +ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, + struct srpc_client_rpc **rpc) { - srpc_ping_reqst_t *req; - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *req; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; struct timespec64 ts; - int rc; + int rc; LASSERT(sn != NULL); LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); @@ -114,12 +114,12 @@ ping_client_prep_rpc(sfw_test_unit_t *tsu, } static void -ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) 
+ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) { - sfw_test_instance_t *tsi = tsu->tsu_instance; - sfw_session_t *sn = tsi->tsi_batch->bat_session; - srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; - srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct sfw_test_instance *tsi = tsu->tsu_instance; + struct sfw_session *sn = tsi->tsi_batch->bat_session; + struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; struct timespec64 ts; LASSERT(sn != NULL); @@ -167,11 +167,11 @@ ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) static int ping_server_handle(struct srpc_server_rpc *rpc) { - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - srpc_msg_t *replymsg = &rpc->srpc_replymsg; - srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; - srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + struct srpc_msg *replymsg = &rpc->srpc_replymsg; + struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; + struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; LASSERT (sv->sv_id == SRPC_SERVICE_PING); @@ -207,7 +207,8 @@ ping_server_handle(struct srpc_server_rpc *rpc) return 0; } -sfw_test_client_ops_t ping_test_client; +struct sfw_test_client_ops ping_test_client; + void ping_init_test_client(void) { ping_test_client.tso_init = ping_client_init; @@ -216,7 +217,8 @@ void ping_init_test_client(void) ping_test_client.tso_done_rpc = ping_client_done_rpc; } -srpc_service_t ping_test_service; +struct srpc_service ping_test_service; + void ping_init_test_service(void) { ping_test_service.sv_id = SRPC_SERVICE_PING; diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.c b/drivers/staging/lustrefsx/lnet/selftest/rpc.c index ed88dfeac7085..bd7a2d5ec0757 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.c +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,21 +42,21 @@ #include "selftest.h" -typedef enum { - SRPC_STATE_NONE, - SRPC_STATE_NI_INIT, - SRPC_STATE_EQ_INIT, - SRPC_STATE_RUNNING, - SRPC_STATE_STOPPING, -} srpc_state_t; +enum srpc_state { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +}; static struct smoketest_rpc { spinlock_t rpc_glock; /* global lock */ - srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; - struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ - srpc_state_t rpc_state; + struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ + enum srpc_state rpc_state; struct srpc_counters rpc_counters; - __u64 rpc_matchbits; /* matchbits counter */ + __u64 rpc_matchbits; /* matchbits counter */ } srpc_data; static inline int @@ -67,7 +67,7 @@ srpc_serv_portal(int svc_id) } /* forward ref's */ -int srpc_handle_rpc(swi_workitem_t *wi); +static int srpc_handle_rpc(struct swi_workitem *wi); void srpc_get_counters(struct srpc_counters *cnt) { @@ -84,7 +84,8 @@ void srpc_set_counters(const struct srpc_counters *cnt) } static int -srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int off, int nob) +srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, + int nob) { LASSERT(off < PAGE_SIZE); LASSERT(nob > 0 && nob <= PAGE_SIZE); @@ -96,48 +97,49 @@ srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int off, int nob) } void -srpc_free_bulk (srpc_bulk_t *bk) +srpc_free_bulk(struct srpc_bulk *bk) { - int i; + int i; struct page *pg; - LASSERT (bk != NULL); + LASSERT(bk != NULL); - for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].kiov_page; - if (pg == NULL) break; + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) + break; __free_page(pg); - } + } - LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); - return; + LIBCFS_FREE(bk, offsetof(struct srpc_bulk, bk_iovs[bk->bk_niov])); + return; } -srpc_bulk_t * +struct srpc_bulk * srpc_alloc_bulk(int cpt, unsigned bulk_off, unsigned bulk_npg, unsigned bulk_len, int sink) { - srpc_bulk_t *bk; - int i; + struct srpc_bulk *bk; + int i; LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, - offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); if (bk == NULL) { CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); return NULL; } - memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); bk->bk_sink = sink; bk->bk_len = bulk_len; bk->bk_niov = bulk_npg; for (i = 0; i < bulk_npg; i++) { struct page *pg; - int nob; + int nob; pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL); if (pg == NULL) { @@ -190,11 +192,11 @@ srpc_init_server_rpc(struct srpc_server_rpc *rpc, static void srpc_service_fini(struct srpc_service *svc) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - struct srpc_buffer *buf; - struct list_head *q; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; if (svc->sv_cpt_data == NULL) return; @@ -210,8 +212,8 @@ srpc_service_fini(struct srpc_service *svc) while (!list_empty(q)) { buf = list_entry(q->next, - struct srpc_buffer, - buf_list); + struct srpc_buffer, + buf_list); list_del(&buf->buf_list); LIBCFS_FREE(buf, sizeof(*buf)); } @@ 
-221,8 +223,8 @@ srpc_service_fini(struct srpc_service *svc) while (!list_empty(&scd->scd_rpc_free)) { rpc = list_entry(scd->scd_rpc_free.next, - struct srpc_server_rpc, - srpc_list); + struct srpc_server_rpc, + srpc_list); list_del(&rpc->srpc_list); LIBCFS_FREE(rpc, sizeof(*rpc)); } @@ -246,11 +248,11 @@ int srpc_add_buffer(struct swi_workitem *wi); static int srpc_service_init(struct srpc_service *svc) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int nrpcs; - int i; - int j; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; svc->sv_shuttingdown = 0; @@ -327,13 +329,13 @@ srpc_add_service(struct srpc_service *sv) CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); return 0; - failed: +failed: srpc_service_fini(sv); return -EBUSY; } int -srpc_remove_service (srpc_service_t *sv) +srpc_remove_service(struct srpc_service *sv) { int id = sv->sv_id; @@ -352,98 +354,100 @@ srpc_remove_service (srpc_service_t *sv) static int srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, int len, int options, struct lnet_process_id peer, - struct lnet_handle_md *mdh, srpc_event_t *ev) + struct lnet_handle_md *mdh, struct srpc_event *ev) { - int rc; - struct lnet_md md; + int rc; + struct lnet_md md; struct lnet_handle_me meh; rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); - if (rc != 0) { - CERROR ("LNetMEAttach failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - return -ENOMEM; - } - - md.threshold = 1; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.options = options; - md.eq_handle = srpc_data.rpc_lnet_eq; - - rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); - if (rc != 0) { - CERROR ("LNetMDAttach failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - - rc = LNetMEUnlink(meh); - LASSERT (rc == 0); - return -ENOMEM; - } - - CDEBUG (D_NET, - "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - return 0; + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT(rc == 0); + return -ENOMEM; + } + + CDEBUG(D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + return 0; } static int srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, - int options, struct lnet_process_id peer, lnet_nid_t self, - struct lnet_handle_md *mdh, srpc_event_t *ev) + int options, struct lnet_process_id peer, + lnet_nid_t self, struct lnet_handle_md *mdh, + struct srpc_event *ev) { int rc; struct lnet_md md; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.eq_handle = srpc_data.rpc_lnet_eq; - md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; - md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); - - rc = LNetMDBind(md, LNET_UNLINK, mdh); - if (rc != 0) { - CERROR ("LNetMDBind failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - return -ENOMEM; - } - - /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. - * they're only meaningful for MDs attached to an ME (i.e. passive - * buffers... 
*/ - if ((options & LNET_MD_OP_PUT) != 0) { - rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, - portal, matchbits, 0, 0); - } else { - LASSERT ((options & LNET_MD_OP_GET) != 0); - - rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); - } - - if (rc != 0) { - CERROR ("LNet%s(%s, %d, %lld) failed: %d\n", - ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", - libcfs_id2str(peer), portal, matchbits, rc); - - /* The forthcoming unlink event will complete this operation - * with failure, so fall through and return success here. - */ - rc = LNetMDUnlink(*mdh); - LASSERT (rc == 0); - } else { - CDEBUG (D_NET, - "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - } - return 0; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR("LNetMDBind failed: %d\n", rc); + LASSERT(rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... + */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0, false); + } + + if (rc != 0) { + CERROR("LNet%s(%s, %d, %lld) failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. 
+ */ + rc = LNetMDUnlink(*mdh); + LASSERT(rc == 0); + } else { + CDEBUG(D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; } static int srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, - struct lnet_handle_md *mdh, srpc_event_t *ev) + struct lnet_handle_md *mdh, struct srpc_event *ev) { struct lnet_process_id any = {0}; @@ -459,9 +463,9 @@ static int srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) __must_hold(&scd->scd_lock) { - struct srpc_service *sv = scd->scd_svc; - struct srpc_msg *msg = &buf->buf_msg; - int rc; + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; LNetInvalidateMDHandle(&buf->buf_mdh); list_add(&buf->buf_list, &scd->scd_buf_posted); @@ -507,9 +511,10 @@ __must_hold(&scd->scd_lock) int srpc_add_buffer(struct swi_workitem *wi) { - struct srpc_service_cd *scd = wi->swi_workitem.wi_data; - struct srpc_buffer *buf; - int rc = 0; + struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, + scd_buf_wi); + struct srpc_buffer *buf; + int rc = 0; /* it's called by workitem scheduler threads, these threads * should have been set CPT affinity, so buffers will be posted @@ -553,7 +558,7 @@ srpc_add_buffer(struct swi_workitem *wi) } if (rc != 0) { - scd->scd_buf_err_stamp = cfs_time_current_sec(); + scd->scd_buf_err_stamp = ktime_get_real_seconds(); scd->scd_buf_err = rc; LASSERT(scd->scd_buf_posting > 0); @@ -567,9 +572,9 @@ srpc_add_buffer(struct swi_workitem *wi) int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) { - struct srpc_service_cd *scd; - int rc = 0; - int i; + struct srpc_service_cd *scd; + int rc = 0; + int i; LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); @@ -621,9 +626,9 @@ srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) { - struct srpc_service_cd *scd; - int num; - int i; + struct srpc_service_cd *scd; + int num; + int i; LASSERT(!sv->sv_shuttingdown); @@ -641,9 +646,9 @@ srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) int srpc_finish_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ @@ -655,8 +660,8 @@ srpc_finish_service(struct srpc_service *sv) } if (scd->scd_buf_nposted > 0) { - CDEBUG(D_NET, "waiting for %d posted buffers to " - "unlink\n", scd->scd_buf_nposted); + CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", + scd->scd_buf_nposted); spin_unlock(&scd->scd_lock); return 0; } @@ -667,10 +672,8 @@ srpc_finish_service(struct srpc_service *sv) } rpc = list_entry(scd->scd_rpc_active.next, - struct srpc_server_rpc, srpc_list); - CNETERR("Active RPC %p on shutdown: sv %s, peer %s, " - "wi %s scheduled %d running %d, " - "ev fired %d type %d status %d lnet %d\n", + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), swi_state2str(rpc->srpc_wi.swi_state), rpc->srpc_wi.swi_workitem.wi_scheduled, @@ -688,7 +691,8 @@ srpc_finish_service(struct srpc_service *sv) /* called with sv->sv_lock held */ static void -srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) 
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, + struct srpc_buffer *buf) __must_hold(&scd->scd_lock) { if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { @@ -721,9 +725,9 @@ __must_hold(&scd->scd_lock) void srpc_abort_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; CDEBUG(D_NET, "Aborting service: id %d, name %s\n", sv->sv_id, sv->sv_name); @@ -733,7 +737,8 @@ srpc_abort_service(struct srpc_service *sv) /* schedule in-flight RPCs to notice the abort, NB: * racing with incoming RPCs; complete fix should make test - * RPCs carry session ID in its headers */ + * RPCs carry session ID in its headers + */ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { rpc->srpc_aborted = 1; swi_schedule_workitem(&rpc->srpc_wi); @@ -744,12 +749,12 @@ srpc_abort_service(struct srpc_service *sv) } void -srpc_shutdown_service(srpc_service_t *sv) +srpc_shutdown_service(struct srpc_service *sv) { - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - srpc_buffer_t *buf; - int i; + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + int i; CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", sv->sv_id, sv->sv_name); @@ -772,135 +777,139 @@ srpc_shutdown_service(srpc_service_t *sv) spin_unlock(&scd->scd_lock); /* OK to traverse scd_buf_posted without lock, since no one - * touches scd_buf_posted now */ + * touches scd_buf_posted now + */ list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) LNetMDUnlink(buf->buf_mdh); } } static int -srpc_send_request (srpc_client_rpc_t *rpc) +srpc_send_request(struct srpc_client_rpc *rpc) { - srpc_event_t *ev = &rpc->crpc_reqstev; - int rc; + struct srpc_event *ev = &rpc->crpc_reqstev; + int rc; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REQUEST_SENT; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), rpc->crpc_service, &rpc->crpc_reqstmsg, - sizeof(srpc_msg_t), LNET_MD_OP_PUT, + sizeof(struct srpc_msg), LNET_MD_OP_PUT, rpc->crpc_dest, LNET_NID_ANY, &rpc->crpc_reqstmdh, ev); - if (rc != 0) { - LASSERT (rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_prepare_reply (srpc_client_rpc_t *rpc) +srpc_prepare_reply(struct srpc_client_rpc *rpc) { - srpc_event_t *ev = &rpc->crpc_replyev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; - int rc; + struct srpc_event *ev = &rpc->crpc_replyev; + u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_RCVD; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; - *id = srpc_next_id(); + *id = srpc_next_id(); rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &rpc->crpc_replymsg, sizeof(srpc_msg_t), - LNET_MD_OP_PUT, rpc->crpc_dest, - &rpc->crpc_replymdh, ev); - if (rc != 0) { - LASSERT (rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + &rpc->crpc_replymsg, + sizeof(struct srpc_msg), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_prepare_bulk (srpc_client_rpc_t *rpc) 
+srpc_prepare_bulk(struct srpc_client_rpc *rpc) { - srpc_bulk_t *bk = &rpc->crpc_bulk; - srpc_event_t *ev = &rpc->crpc_bulkev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; - int rc; - int opt; + struct srpc_bulk *bk = &rpc->crpc_bulk; + struct srpc_event *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; - LASSERT (bk->bk_niov <= LNET_MAX_IOV); + LASSERT(bk->bk_niov <= LNET_MAX_IOV); - if (bk->bk_niov == 0) return 0; /* nothing to do */ + /* nothing to do */ + if (bk->bk_niov == 0) + return 0; - opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; - opt |= LNET_MD_KIOV; + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_BULK_REQ_RCVD; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; - *id = srpc_next_id(); + *id = srpc_next_id(); rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->crpc_dest, &bk->bk_mdh, ev); - if (rc != 0) { - LASSERT (rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT(rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; } static int -srpc_do_bulk (srpc_server_rpc_t *rpc) +srpc_do_bulk(struct srpc_server_rpc *rpc) { - srpc_event_t *ev = &rpc->srpc_ev; - srpc_bulk_t *bk = rpc->srpc_bulk; - __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; - int rc; - int opt; - - LASSERT (bk != NULL); - - opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; - opt |= LNET_MD_KIOV; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->srpc_peer, rpc->srpc_self, - &bk->bk_mdh, ev); - if (rc != 0) - ev->ev_fired = 1; /* no more event expected */ - return rc; + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_bulk *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT(bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; } /* only called from srpc_handle_rpc */ static void -srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) +srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) { struct srpc_service_cd *scd = rpc->srpc_scd; struct srpc_service *sv = scd->scd_svc; - srpc_buffer_t *buffer; + struct srpc_buffer *buffer; - LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + LASSERT(status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); - rpc->srpc_status = status; + rpc->srpc_status = status; - CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR, - "Server RPC %p done: service %s, peer %s, status %s:%d\n", - rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), - swi_state2str(rpc->srpc_wi.swi_state), status); + CDEBUG_LIMIT(status == 0 ? 
D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); - if (status != 0) { + if (status != 0) { spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_dropped++; spin_unlock(&srpc_data.rpc_glock); @@ -914,7 +923,8 @@ srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) if (rpc->srpc_reqstbuf != NULL) { /* NB might drop sv_lock in srpc_service_recycle_buffer, but - * sv won't go away for scd_rpc_active must not be empty */ + * sv won't go away for scd_rpc_active must not be empty + */ srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); rpc->srpc_reqstbuf = NULL; } @@ -932,7 +942,7 @@ srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { buffer = list_entry(scd->scd_buf_blocked.next, - srpc_buffer_t, buf_list); + struct srpc_buffer, buf_list); list_del(&buffer->buf_list); srpc_init_server_rpc(rpc, scd, buffer); @@ -947,14 +957,14 @@ srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) } /* handles an incoming RPC */ -int -srpc_handle_rpc(swi_workitem_t *wi) +static int srpc_handle_rpc(struct swi_workitem *wi) { - struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - srpc_event_t *ev = &rpc->srpc_ev; - int rc = 0; + struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, + srpc_wi); + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + struct srpc_event *ev = &rpc->srpc_ev; + int rc = 0; LASSERT(wi == &rpc->srpc_wi); @@ -963,31 +973,32 @@ srpc_handle_rpc(swi_workitem_t *wi) if (sv->sv_shuttingdown || rpc->srpc_aborted) { spin_unlock(&scd->scd_lock); - if (rpc->srpc_bulk != NULL) - LNetMDUnlink(rpc->srpc_bulk->bk_mdh); - LNetMDUnlink(rpc->srpc_replymdh); + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); - if (ev->ev_fired) { /* no more event, OK to finish */ - srpc_server_rpc_done(rpc, -ESHUTDOWN); - return 1; - } - return 0; - } + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } spin_unlock(&scd->scd_lock); - switch (wi->swi_state) { - default: - LBUG (); - case SWI_STATE_NEWBORN: { - srpc_msg_t *msg; - srpc_generic_reply_t *reply; + switch (wi->swi_state) { + default: + LBUG(); + fallthrough; + case SWI_STATE_NEWBORN: { + struct srpc_msg *msg; + struct srpc_generic_reply *reply; - msg = &rpc->srpc_reqstbuf->buf_msg; - reply = &rpc->srpc_replymsg.msg_body.reply; + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; - if (msg->msg_magic == 0) { - /* moaned already in srpc_lnet_ev_handler */ + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ srpc_server_rpc_done(rpc, EBADMSG); return 1; } @@ -1007,67 +1018,67 @@ srpc_handle_rpc(swi_workitem_t *wi) srpc_server_rpc_done(rpc, rc); return 1; } - } - - wi->swi_state = SWI_STATE_BULK_STARTED; - - if (rpc->srpc_bulk != NULL) { - rc = srpc_do_bulk(rpc); - if (rc == 0) - return 0; /* wait for bulk */ - - LASSERT (ev->ev_fired); - ev->ev_status = rc; - } - } - /* Fall through */ - case SWI_STATE_BULK_STARTED: - LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); - - if (rpc->srpc_bulk != NULL) { - rc = ev->ev_status; - - if (sv->sv_bulk_ready != NULL) - rc = (*sv->sv_bulk_ready) (rpc, rc); - - if (rc != 0) { - 
srpc_server_rpc_done(rpc, rc); - return 1; - } - } - - wi->swi_state = SWI_STATE_REPLY_SUBMITTED; - rc = srpc_send_reply(rpc); - if (rc == 0) - return 0; /* wait for reply */ - srpc_server_rpc_done(rpc, rc); - return 1; - - case SWI_STATE_REPLY_SUBMITTED: - if (!ev->ev_fired) { - CERROR("RPC %p: bulk %p, service %d\n", + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT(ev->ev_fired); + ev->ev_status = rc; + } + } + fallthrough; + case SWI_STATE_BULK_STARTED: + LASSERT(rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", rpc, rpc->srpc_bulk, sv->sv_id); - CERROR("Event: status %d, type %d, lnet %d\n", - ev->ev_status, ev->ev_type, ev->ev_lnet); - LASSERT (ev->ev_fired); - } + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT(ev->ev_fired); + } - wi->swi_state = SWI_STATE_DONE; - srpc_server_rpc_done(rpc, ev->ev_status); - return 1; - } + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } - return 0; + return 0; } static void srpc_client_rpc_expired (void *data) { - srpc_client_rpc_t *rpc = data; + struct srpc_client_rpc *rpc = data; - CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - rpc->crpc_timeout); + CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); spin_lock(&rpc->crpc_lock); @@ -1082,9 +1093,9 @@ srpc_client_rpc_expired (void *data) } static void -srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) +srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) { - stt_timer_t *timer = &rpc->crpc_timer; + struct stt_timer *timer = &rpc->crpc_timer; if (rpc->crpc_timeout == 0) return; @@ -1092,8 +1103,7 @@ srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) INIT_LIST_HEAD(&timer->stt_list); timer->stt_data = rpc; timer->stt_func = srpc_client_rpc_expired; - timer->stt_expires = cfs_time_add(rpc->crpc_timeout, - cfs_time_current_sec()); + timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; stt_add_timer(timer); return; } @@ -1102,9 +1112,10 @@ srpc_add_client_rpc_timer(srpc_client_rpc_t *rpc) * Called with rpc->crpc_lock held. * * Upon exit the RPC expiry timer is not queued and the handler is not - * running on any CPU. */ + * running on any CPU. 
+ */ static void -srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) +srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) { /* timer not planted or already exploded */ if (rpc->crpc_timeout == 0) @@ -1125,34 +1136,34 @@ srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) } static void -srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) +srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) { - swi_workitem_t *wi = &rpc->crpc_wi; + struct swi_workitem *wi = &rpc->crpc_wi; LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); spin_lock(&rpc->crpc_lock); - rpc->crpc_closed = 1; - if (rpc->crpc_status == 0) - rpc->crpc_status = status; - - srpc_del_client_rpc_timer(rpc); - - CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR, - "Client RPC done: service %d, peer %s, status %s:%d:%d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(wi->swi_state), rpc->crpc_aborted, status); - - /* - * No one can schedule me now since: - * - RPC timer has been defused. - * - all LNet events have been fired. - * - crpc_closed has been set, preventing srpc_abort_rpc from - * scheduling me. - * Cancel pending schedules and prevent future schedule attempts: - */ - LASSERT (!srpc_event_pending(rpc)); + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(!srpc_event_pending(rpc)); swi_exit_workitem(wi); spin_unlock(&rpc->crpc_lock); @@ -1163,19 +1174,19 @@ srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) /* sends an outgoing RPC */ int -srpc_send_rpc (swi_workitem_t *wi) +srpc_send_rpc(struct swi_workitem *wi) { - int rc = 0; - srpc_client_rpc_t *rpc; - srpc_msg_t *reply; - int do_bulk; + int rc = 0; + struct srpc_client_rpc *rpc; + struct srpc_msg *reply; + int do_bulk; LASSERT(wi != NULL); rpc = wi->swi_workitem.wi_data; - LASSERT (rpc != NULL); - LASSERT (wi == &rpc->crpc_wi); + LASSERT(rpc != NULL); + LASSERT(wi == &rpc->crpc_wi); reply = &rpc->crpc_replymsg; do_bulk = rpc->crpc_bulk.bk_niov > 0; @@ -1189,86 +1200,93 @@ srpc_send_rpc (swi_workitem_t *wi) spin_unlock(&rpc->crpc_lock); - switch (wi->swi_state) { - default: - LBUG (); - case SWI_STATE_NEWBORN: - LASSERT (!srpc_event_pending(rpc)); + switch (wi->swi_state) { + default: + LBUG(); + case SWI_STATE_NEWBORN: + LASSERT(!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } - rc = srpc_prepare_reply(rpc); - if (rc != 0) { - srpc_client_rpc_done(rpc, rc); - return 1; - } + rc = srpc_prepare_bulk(rpc); + if (rc != 0) + break; - rc = srpc_prepare_bulk(rpc); - if (rc != 0) break; + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; - wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; - rc = srpc_send_request(rpc); - break; + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. 
+ */ + if (!rpc->crpc_reqstev.ev_fired) + break; - case SWI_STATE_REQUEST_SUBMITTED: - /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any - * order; however, they're processed in a strict order: - * rqt, rpy, and bulk. */ - if (!rpc->crpc_reqstev.ev_fired) break; + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) + break; - rc = rpc->crpc_reqstev.ev_status; - if (rc != 0) break; + wi->swi_state = SWI_STATE_REQUEST_SENT; + fallthrough; + case SWI_STATE_REQUEST_SENT: { + enum srpc_msg_type type; - wi->swi_state = SWI_STATE_REQUEST_SENT; - /* perhaps more events, fall thru */ - /* Fall through */ - case SWI_STATE_REQUEST_SENT: { - srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); + type = srpc_service2reply(rpc->crpc_service); - if (!rpc->crpc_replyev.ev_fired) break; + if (!rpc->crpc_replyev.ev_fired) + break; - rc = rpc->crpc_replyev.ev_status; - if (rc != 0) break; + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) + break; srpc_unpack_msg_hdr(reply); if (reply->msg_type != type || (reply->msg_magic != SRPC_MSG_MAGIC && reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CWARN ("Bad message from %s: type %u (%d expected)," - " magic %u (%d expected).\n", - libcfs_id2str(rpc->crpc_dest), - reply->msg_type, type, - reply->msg_magic, SRPC_MSG_MAGIC); - rc = -EBADMSG; - break; - } - - if (do_bulk && reply->msg_body.reply.status != 0) { - CWARN ("Remote error %d at %s, unlink bulk buffer in " - "case peer didn't initiate bulk transfer\n", - reply->msg_body.reply.status, - libcfs_id2str(rpc->crpc_dest)); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - } - - wi->swi_state = SWI_STATE_REPLY_RECEIVED; - } - /* Fall through */ - case SWI_STATE_REPLY_RECEIVED: - if (do_bulk && !rpc->crpc_bulkev.ev_fired) break; - - rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; - - /* Bulk buffer was unlinked due to remote error. Clear error - * since reply buffer still contains valid data. - * NB rpc->crpc_done shouldn't look into bulk data in case of - * remote error. */ - if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && - rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) - rc = 0; - - wi->swi_state = SWI_STATE_DONE; - srpc_client_rpc_done(rpc, rc); - return 1; - } + CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + fallthrough; + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) + break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. 
+ */ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } if (rc != 0) { spin_lock(&rpc->crpc_lock); @@ -1277,85 +1295,85 @@ srpc_send_rpc (swi_workitem_t *wi) } abort: - if (rpc->crpc_aborted) { - LNetMDUnlink(rpc->crpc_reqstmdh); - LNetMDUnlink(rpc->crpc_replymdh); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - - if (!srpc_event_pending(rpc)) { - srpc_client_rpc_done(rpc, -EINTR); - return 1; - } - } - return 0; + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; } -srpc_client_rpc_t * +struct srpc_client_rpc * srpc_create_client_rpc(struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(srpc_client_rpc_t *), - void (*rpc_fini)(srpc_client_rpc_t *), void *priv) + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) { - srpc_client_rpc_t *rpc; + struct srpc_client_rpc *rpc; - LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, - crpc_bulk.bk_iovs[nbulkiov])); - if (rpc == NULL) - return NULL; + LIBCFS_ALLOC(rpc, offsetof(struct srpc_client_rpc, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; - srpc_init_client_rpc(rpc, peer, service, nbulkiov, - bulklen, rpc_done, rpc_fini, priv); - return rpc; + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; } /* called with rpc->crpc_lock held */ void -srpc_abort_rpc (srpc_client_rpc_t *rpc, int why) +srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) { - LASSERT (why != 0); + LASSERT(why != 0); - if (rpc->crpc_aborted || /* already aborted */ - rpc->crpc_closed) /* callback imminent */ - return; + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; - CDEBUG (D_NET, - "Aborting RPC: service %d, peer %s, state %s, why %d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(rpc->crpc_wi.swi_state), why); + CDEBUG(D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); - rpc->crpc_aborted = 1; - rpc->crpc_status = why; - swi_schedule_workitem(&rpc->crpc_wi); - return; + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; } /* called with rpc->crpc_lock held */ void -srpc_post_rpc (srpc_client_rpc_t *rpc) +srpc_post_rpc(struct srpc_client_rpc *rpc) { - LASSERT (!rpc->crpc_aborted); - LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING); + LASSERT(!rpc->crpc_aborted); + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); - CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", - libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, - rpc->crpc_timeout); + CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); - srpc_add_client_rpc_timer(rpc); - swi_schedule_workitem(&rpc->crpc_wi); - return; + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; } int srpc_send_reply(struct srpc_server_rpc *rpc) { - srpc_event_t *ev = &rpc->srpc_ev; - struct srpc_msg *msg = &rpc->srpc_replymsg; - struct srpc_buffer *buffer = 
rpc->srpc_reqstbuf; - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - __u64 rpyid; - int rc; + struct srpc_event *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; LASSERT(buffer != NULL); rpyid = buffer->buf_msg.msg_body.reqst.rpyid; @@ -1364,7 +1382,8 @@ srpc_send_reply(struct srpc_server_rpc *rpc) if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { /* Repost buffer before replying since test client - * might send me another RPC once it gets the reply */ + * might send me another RPC once it gets the reply + */ if (srpc_service_post_buffer(scd, buffer) != 0) CWARN("Failed to repost %s buffer\n", sv->sv_name); rpc->srpc_reqstbuf = NULL; @@ -1372,37 +1391,37 @@ srpc_send_reply(struct srpc_server_rpc *rpc) spin_unlock(&scd->scd_lock); - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_SENT; - - msg->msg_magic = SRPC_MSG_MAGIC; - msg->msg_version = SRPC_MSG_VERSION; - msg->msg_type = srpc_service2reply(sv->sv_id); - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, - sizeof(*msg), LNET_MD_OP_PUT, - rpc->srpc_peer, rpc->srpc_self, - &rpc->srpc_replymdh, ev); - if (rc != 0) - ev->ev_fired = 1; /* no more event expected */ - return rc; + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; } /* when in kernel always called with LNET_LOCK() held, and in thread context */ static void srpc_lnet_ev_handler(struct lnet_event *ev) { - struct srpc_service_cd *scd; - srpc_event_t *rpcev = ev->md.user_ptr; - srpc_client_rpc_t *crpc; - srpc_server_rpc_t *srpc; - srpc_buffer_t *buffer; - srpc_service_t *sv; - srpc_msg_t *msg; - srpc_msg_type_t type; + struct srpc_service_cd *scd; + struct srpc_event *rpcev = ev->md.user_ptr; + struct srpc_client_rpc *crpc; + struct srpc_server_rpc *srpc; + struct srpc_buffer *buffer; + struct srpc_service *sv; + struct srpc_msg *msg; + enum srpc_msg_type type; - LASSERT (!in_interrupt()); + LASSERT(!in_interrupt()); if (ev->status != 0) { __u32 errors; @@ -1417,41 +1436,43 @@ srpc_lnet_ev_handler(struct lnet_event *ev) ev->status, ev->type, errors); } - rpcev->ev_lnet = ev->type; + rpcev->ev_lnet = ev->type; - switch (rpcev->ev_type) { - default: - CERROR("Unknown event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG (); - case SRPC_REQUEST_SENT: - if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG(); + fallthrough; + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_sent++; spin_unlock(&srpc_data.rpc_glock); - } - /* Fall through */ - case SRPC_REPLY_RCVD: - case SRPC_BULK_REQ_RCVD: - crpc = rpcev->ev_data; - - if (rpcev != &crpc->crpc_reqstev && - rpcev != &crpc->crpc_replyev && - rpcev != &crpc->crpc_bulkev) { - CERROR("rpcev %p, crpc %p, reqstev %p, 
replyev %p, bulkev %p\n", - rpcev, crpc, &crpc->crpc_reqstev, - &crpc->crpc_replyev, &crpc->crpc_bulkev); - CERROR("Bad event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG (); - } + } + fallthrough; + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, + rpcev->ev_lnet); + LBUG(); + } spin_lock(&crpc->crpc_lock); LASSERT(rpcev->ev_fired == 0); rpcev->ev_fired = 1; rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? - -EINTR : ev->status; + -EINTR : ev->status; swi_schedule_workitem(&crpc->crpc_wi); spin_unlock(&crpc->crpc_lock); @@ -1465,28 +1486,30 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_lock(&scd->scd_lock); - LASSERT (ev->unlinked); - LASSERT (ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT (ev->type != LNET_EVENT_UNLINK || - sv->sv_shuttingdown); + LASSERT(ev->unlinked); + LASSERT(ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); - buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); + buffer = container_of(ev->md.start, struct srpc_buffer, + buf_msg); buffer->buf_peer = ev->source; - buffer->buf_self = ev->target.nid; + buffer->buf_self = ev->target.nid; LASSERT(scd->scd_buf_nposted > 0); scd->scd_buf_nposted--; if (sv->sv_shuttingdown) { /* Leave buffer on scd->scd_buf_nposted since - * srpc_finish_service needs to traverse it. */ + * srpc_finish_service needs to traverse it. + */ spin_unlock(&scd->scd_lock); break; } if (scd->scd_buf_err_stamp != 0 && - scd->scd_buf_err_stamp < cfs_time_current_sec()) { + scd->scd_buf_err_stamp < ktime_get_real_seconds()) { /* re-enable adding buffer */ scd->scd_buf_err_stamp = 0; scd->scd_buf_err = 0; @@ -1504,22 +1527,22 @@ srpc_lnet_ev_handler(struct lnet_event *ev) msg = &buffer->buf_msg; type = srpc_service2request(sv->sv_id); - if (ev->status != 0 || ev->mlength != sizeof(*msg) || - (msg->msg_type != type && - msg->msg_type != __swab32(type)) || - (msg->msg_magic != SRPC_MSG_MAGIC && - msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CERROR ("Dropping RPC (%s) from %s: " - "status %d mlength %d type %u magic %u.\n", - sv->sv_name, libcfs_id2str(ev->initiator), - ev->status, ev->mlength, - msg->msg_type, msg->msg_magic); - - /* NB can't call srpc_service_recycle_buffer here since - * it may call LNetM[DE]Attach. The invalid magic tells - * srpc_handle_rpc to drop this RPC */ - msg->msg_magic = 0; - } + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. 
The invalid magic tells + * srpc_handle_rpc to drop this RPC + */ + msg->msg_magic = 0; + } if (!list_empty(&scd->scd_rpc_free)) { srpc = list_entry(scd->scd_rpc_free.next, @@ -1541,19 +1564,18 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_lock(&srpc_data.rpc_glock); srpc_data.rpc_counters.rpcs_rcvd++; spin_unlock(&srpc_data.rpc_glock); - break; - - case SRPC_BULK_GET_RPLD: - LASSERT (ev->type == LNET_EVENT_SEND || - ev->type == LNET_EVENT_REPLY || - ev->type == LNET_EVENT_UNLINK); + break; - if (!ev->unlinked) - break; /* wait for final event */ - /* Fall through */ + case SRPC_BULK_GET_RPLD: + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); - case SRPC_BULK_PUT_SENT: - if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + if (!ev->unlinked) + break; /* wait for final event */ + fallthrough; + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); if (rpcev->ev_type == SRPC_BULK_GET_RPLD) @@ -1563,7 +1585,7 @@ srpc_lnet_ev_handler(struct lnet_event *ev) spin_unlock(&srpc_data.rpc_glock); } - /* Fall through */ + fallthrough; case SRPC_REPLY_SENT: srpc = rpcev->ev_data; scd = srpc->srpc_scd; @@ -1594,84 +1616,84 @@ srpc_startup (void) /* 1 second pause to avoid timestamp reuse */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(1)); - srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48; + srpc_data.rpc_matchbits = ((__u64) ktime_get_real_seconds()) << 48; srpc_data.rpc_state = SRPC_STATE_NONE; rc = LNetNIInit(LNET_PID_LUSTRE); - if (rc < 0) { - CERROR ("LNetNIInit() has failed: %d\n", rc); + if (rc < 0) { + CERROR("LNetNIInit() has failed: %d\n", rc); return rc; - } + } - srpc_data.rpc_state = SRPC_STATE_NI_INIT; + srpc_data.rpc_state = SRPC_STATE_NI_INIT; LNetInvalidateEQHandle(&srpc_data.rpc_lnet_eq); rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); - if (rc != 0) { - CERROR("LNetEQAlloc() has failed: %d\n", rc); - goto bail; - } + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); LASSERT(rc == 0); rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); LASSERT(rc == 0); - srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; - rc = stt_startup(); + rc = stt_startup(); bail: - if (rc != 0) - srpc_shutdown(); - else - srpc_data.rpc_state = SRPC_STATE_RUNNING; + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; - return rc; + return rc; } void srpc_shutdown (void) { - int i; - int rc; - int state; + int i; + int rc; + int state; - state = srpc_data.rpc_state; - srpc_data.rpc_state = SRPC_STATE_STOPPING; + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; - switch (state) { - default: - LBUG (); - case SRPC_STATE_RUNNING: + switch (state) { + default: + LBUG(); + fallthrough; + case SRPC_STATE_RUNNING: spin_lock(&srpc_data.rpc_glock); - for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { - srpc_service_t *sv = srpc_data.rpc_services[i]; + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + struct srpc_service *sv = srpc_data.rpc_services[i]; - LASSERTF (sv == NULL, - "service not empty: id %d, name %s\n", - i, sv->sv_name); - } + LASSERTF(sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } spin_unlock(&srpc_data.rpc_glock); - stt_shutdown(); - /* Fall through */ + stt_shutdown(); + fallthrough; - case SRPC_STATE_EQ_INIT: - rc = 
LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); - LASSERT (rc == 0); - rc = LNetEQFree(srpc_data.rpc_lnet_eq); - LASSERT (rc == 0); /* the EQ should have no user by now */ - /* Fall through */ + LASSERT(rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT(rc == 0); /* the EQ should have no user by now */ + fallthrough; - case SRPC_STATE_NI_INIT: - LNetNIFini(); - /* Fall through */ - } + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } - return; + return; } diff --git a/drivers/staging/lustrefsx/lnet/selftest/rpc.h b/drivers/staging/lustrefsx/lnet/selftest/rpc.h index aab2629e7ba1d..8cc8c434645d5 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/rpc.h +++ b/drivers/staging/lustrefsx/lnet/selftest/rpc.h @@ -33,14 +33,14 @@ #ifndef __SELFTEST_RPC_H__ #define __SELFTEST_RPC_H__ -#include +#include /* * LST wired structures - * + * * XXX: *REPLY == *REQST + 1 */ -typedef enum { +enum srpc_msg_type { SRPC_MSG_MKSN_REQST = 0, SRPC_MSG_MKSN_REPLY = 1, SRPC_MSG_RMSN_REQST = 2, @@ -59,118 +59,118 @@ typedef enum { SRPC_MSG_PING_REPLY = 15, SRPC_MSG_JOIN_REQST = 16, SRPC_MSG_JOIN_REPLY = 17, -} srpc_msg_type_t; +}; /* CAVEAT EMPTOR: - * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, + * All struct srpc_*_reqst's 1st field must be matchbits of reply buffer, * and 2nd field matchbits of bulk buffer if any. * - * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field + * All struct srpc_*_reply's 1st field must be a __u32 status, and 2nd field * session id if needed. */ -typedef struct { +struct srpc_generic_reqst { __u64 rpyid; /* reply buffer matchbits */ __u64 bulkid; /* bulk buffer matchbits */ -} WIRE_ATTR srpc_generic_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_generic_reply { __u32 status; struct lst_sid sid; -} WIRE_ATTR srpc_generic_reply_t; +} WIRE_ATTR; /* FRAMEWORK RPCs */ -typedef struct { +struct srpc_mksn_reqst { __u64 mksn_rpyid; /* reply buffer matchbits */ struct lst_sid mksn_sid; /* session id */ __u32 mksn_force; /* use brute force */ char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ +} WIRE_ATTR; /* make session request */ -typedef struct { +struct srpc_mksn_reply { __u32 mksn_status; /* session status */ struct lst_sid mksn_sid; /* session id */ __u32 mksn_timeout; /* session timeout */ char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ +} WIRE_ATTR; /* make session reply */ -typedef struct { - __u64 rmsn_rpyid; /* reply buffer matchbits */ - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ +struct srpc_rmsn_reqst { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR; /* remove session request */ -typedef struct { +struct srpc_rmsn_reply { __u32 rmsn_status; - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ + struct lst_sid rmsn_sid; /* session id */ +} WIRE_ATTR; /* remove session reply */ -typedef struct { +struct srpc_join_reqst { __u64 join_rpyid; /* reply buffer matchbits */ struct lst_sid join_sid; /* session id to join */ char join_group[LST_NAME_SIZE]; /* group name */ -} WIRE_ATTR srpc_join_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_join_reply { __u32 join_status; /* returned status */ struct lst_sid join_sid; /* session id */ __u32 
join_timeout; /* # seconds' inactivity to expire */ char join_session[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR srpc_join_reply_t; +} WIRE_ATTR; -typedef struct { +struct srpc_debug_reqst { __u64 dbg_rpyid; /* reply buffer matchbits */ struct lst_sid dbg_sid; /* session id */ __u32 dbg_flags; /* bitmap of debug */ -} WIRE_ATTR srpc_debug_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_debug_reply { __u32 dbg_status; /* returned code */ struct lst_sid dbg_sid; /* session id */ __u32 dbg_timeout; /* session timeout */ __u32 dbg_nbatch; /* # of batches in the node */ char dbg_name[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR srpc_debug_reply_t; +} WIRE_ATTR; #define SRPC_BATCH_OPC_RUN 1 #define SRPC_BATCH_OPC_STOP 2 #define SRPC_BATCH_OPC_QUERY 3 -typedef struct { +struct srpc_batch_reqst { __u64 bar_rpyid; /* reply buffer matchbits */ struct lst_sid bar_sid; /* session id */ struct lst_bid bar_bid; /* batch id */ __u32 bar_opc; /* create/start/stop batch */ __u32 bar_testidx; /* index of test */ __u32 bar_arg; /* parameters */ -} WIRE_ATTR srpc_batch_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_batch_reply { __u32 bar_status; /* status of request */ struct lst_sid bar_sid; /* session id */ __u32 bar_active; /* # of active tests in batch/test */ __u32 bar_time; /* remained time */ -} WIRE_ATTR srpc_batch_reply_t; +} WIRE_ATTR; -typedef struct { +struct srpc_stat_reqst { __u64 str_rpyid; /* reply buffer matchbits */ struct lst_sid str_sid; /* session id */ __u32 str_type; /* type of stat */ -} WIRE_ATTR srpc_stat_reqst_t; +} WIRE_ATTR; -typedef struct { - __u32 str_status; - struct lst_sid str_sid; - struct sfw_counters str_fw; - struct srpc_counters str_rpc; - struct lnet_counters str_lnet; -} WIRE_ATTR srpc_stat_reply_t; +struct srpc_stat_reply { + __u32 str_status; + struct lst_sid str_sid; + struct sfw_counters str_fw; + struct srpc_counters str_rpc; + struct lnet_counters_common str_lnet; +} WIRE_ATTR; -typedef struct { +struct test_bulk_req { __u32 blk_opc; /* bulk operation code */ __u32 blk_npg; /* # of pages */ __u32 blk_flags; /* reserved flags */ -} WIRE_ATTR test_bulk_req_t; +} WIRE_ATTR; -typedef struct { +struct test_bulk_req_v1 { /** bulk operation code */ __u16 blk_opc; /** data check flags */ @@ -179,14 +179,14 @@ typedef struct { __u32 blk_len; /** bulk offset */ __u32 blk_offset; -} WIRE_ATTR test_bulk_req_v1_t; +} WIRE_ATTR; -typedef struct { +struct test_ping_req { __u32 png_size; /* size of ping message */ __u32 png_flags; /* reserved flags */ -} WIRE_ATTR test_ping_req_t; +} WIRE_ATTR; -typedef struct { +struct srpc_test_reqst { __u64 tsr_rpyid; /* reply buffer matchbits */ __u64 tsr_bulkid; /* bulk buffer matchbits */ struct lst_sid tsr_sid; /* session id */ @@ -200,86 +200,86 @@ typedef struct { __u32 tsr_ndest; /* # of dest nodes */ union { - test_ping_req_t ping; - test_bulk_req_t bulk_v0; - test_bulk_req_v1_t bulk_v1; - } tsr_u; -} WIRE_ATTR srpc_test_reqst_t; + struct test_ping_req ping; + struct test_bulk_req bulk_v0; + struct test_bulk_req_v1 bulk_v1; + } tsr_u; +} WIRE_ATTR; -typedef struct { +struct srpc_test_reply { __u32 tsr_status; /* returned code */ struct lst_sid tsr_sid; -} WIRE_ATTR srpc_test_reply_t; +} WIRE_ATTR; /* TEST RPCs */ -typedef struct { +struct srpc_ping_reqst { __u64 pnr_rpyid; __u32 pnr_magic; __u32 pnr_seq; __u64 pnr_time_sec; __u64 pnr_time_nsec; -} WIRE_ATTR srpc_ping_reqst_t; +} WIRE_ATTR; -typedef struct { +struct srpc_ping_reply { __u32 pnr_status; __u32 pnr_magic; __u32 pnr_seq; -} WIRE_ATTR 
srpc_ping_reply_t; +} WIRE_ATTR; -typedef struct { +struct srpc_brw_reqst { __u64 brw_rpyid; /* reply buffer matchbits */ __u64 brw_bulkid; /* bulk buffer matchbits */ __u32 brw_rw; /* read or write */ __u32 brw_len; /* bulk data len */ __u32 brw_flags; /* bulk data patterns */ -} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ +} WIRE_ATTR; /* bulk r/w request */ -typedef struct { +struct srpc_brw_reply { __u32 brw_status; -} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ +} WIRE_ATTR; /* bulk r/w reply */ #define SRPC_MSG_MAGIC 0xeeb0f00d #define SRPC_MSG_VERSION 1 -typedef struct srpc_msg { +struct srpc_msg { /** magic number */ __u32 msg_magic; /** message version number */ __u32 msg_version; - /** type of message body: srpc_msg_type_t */ + /** type of message body: enum srpc_msg_type */ __u32 msg_type; __u32 msg_reserved0; __u32 msg_reserved1; /** test session features */ __u32 msg_ses_feats; union { - srpc_generic_reqst_t reqst; - srpc_generic_reply_t reply; - - srpc_mksn_reqst_t mksn_reqst; - srpc_mksn_reply_t mksn_reply; - srpc_rmsn_reqst_t rmsn_reqst; - srpc_rmsn_reply_t rmsn_reply; - srpc_debug_reqst_t dbg_reqst; - srpc_debug_reply_t dbg_reply; - srpc_batch_reqst_t bat_reqst; - srpc_batch_reply_t bat_reply; - srpc_stat_reqst_t stat_reqst; - srpc_stat_reply_t stat_reply; - srpc_test_reqst_t tes_reqst; - srpc_test_reply_t tes_reply; - srpc_join_reqst_t join_reqst; - srpc_join_reply_t join_reply; - - srpc_ping_reqst_t ping_reqst; - srpc_ping_reply_t ping_reply; - srpc_brw_reqst_t brw_reqst; - srpc_brw_reply_t brw_reply; - } msg_body; -} WIRE_ATTR srpc_msg_t; + struct srpc_generic_reqst reqst; + struct srpc_generic_reply reply; + + struct srpc_mksn_reqst mksn_reqst; + struct srpc_mksn_reply mksn_reply; + struct srpc_rmsn_reqst rmsn_reqst; + struct srpc_rmsn_reply rmsn_reply; + struct srpc_debug_reqst dbg_reqst; + struct srpc_debug_reply dbg_reply; + struct srpc_batch_reqst bat_reqst; + struct srpc_batch_reply bat_reply; + struct srpc_stat_reqst stat_reqst; + struct srpc_stat_reply stat_reply; + struct srpc_test_reqst tes_reqst; + struct srpc_test_reply tes_reply; + struct srpc_join_reqst join_reqst; + struct srpc_join_reply join_reply; + + struct srpc_ping_reqst ping_reqst; + struct srpc_ping_reply ping_reply; + struct srpc_brw_reqst brw_reqst; + struct srpc_brw_reply brw_reply; + } msg_body; +} WIRE_ATTR; static inline void -srpc_unpack_msg_hdr(srpc_msg_t *msg) +srpc_unpack_msg_hdr(struct srpc_msg *msg) { if (msg->msg_magic == SRPC_MSG_MAGIC) return; /* no flipping needed */ diff --git a/drivers/staging/lustrefsx/lnet/selftest/selftest.h b/drivers/staging/lustrefsx/lnet/selftest/selftest.h index 2a29161cd4802..3f7c295e9a90c 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/selftest.h +++ b/drivers/staging/lustrefsx/lnet/selftest/selftest.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include "rpc.h" #include "timer.h" @@ -89,7 +89,7 @@ struct sfw_test_instance; /* all reply/bulk RDMAs go to this portal */ #define SRPC_RDMA_PORTAL 52 -static inline srpc_msg_type_t +static inline enum srpc_msg_type srpc_service2request (int service) { switch (service) { @@ -124,13 +124,13 @@ srpc_service2request (int service) } } -static inline srpc_msg_type_t +static inline enum srpc_msg_type srpc_service2reply (int service) { return srpc_service2request(service) + 1; } -typedef enum { +enum srpc_event_type { SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ @@ -138,73 +138,74 @@ typedef enum { SRPC_REPLY_SENT = 5, /* outgoing reply sent */ SRPC_REQUEST_RCVD = 6, /* incoming request received */ SRPC_REQUEST_SENT = 7, /* outgoing request sent */ -} srpc_event_type_t; +}; /* RPC event */ -typedef struct { - srpc_event_type_t ev_type; /* what's up */ - enum lnet_event_kind ev_lnet; /* LNet event type */ +struct srpc_event { + enum srpc_event_type ev_type; /* what's up */ + enum lnet_event_kind ev_lnet; /* LNet event type */ int ev_fired; /* LNet event fired? */ int ev_status; /* LNet event status */ void *ev_data; /* owning server/client RPC */ -} srpc_event_t; +}; -typedef struct { +/* bulk descriptor */ +struct srpc_bulk { int bk_len; /* len of bulk data */ struct lnet_handle_md bk_mdh; int bk_sink; /* sink/source */ int bk_niov; /* # iov in bk_iovs */ lnet_kiov_t bk_iovs[0]; -} srpc_bulk_t; /* bulk descriptor */ +}; /* message buffer descriptor */ -typedef struct srpc_buffer { +struct srpc_buffer { struct list_head buf_list; /* chain on srpc_service::*_msgq */ - srpc_msg_t buf_msg; + struct srpc_msg buf_msg; struct lnet_handle_md buf_mdh; lnet_nid_t buf_self; struct lnet_process_id buf_peer; -} srpc_buffer_t; +}; struct swi_workitem; -typedef int (*swi_action_t) (struct swi_workitem *); +typedef int (*swi_action_t)(struct swi_workitem *); -typedef struct swi_workitem { +struct swi_workitem { struct cfs_wi_sched *swi_sched; - struct cfs_workitem swi_workitem; + struct cfs_workitem swi_workitem; swi_action_t swi_action; int swi_state; -} swi_workitem_t; +}; /* server-side state of a RPC */ -typedef struct srpc_server_rpc { +struct srpc_server_rpc { /* chain on srpc_service::*_rpcq */ struct list_head srpc_list; struct srpc_service_cd *srpc_scd; - swi_workitem_t srpc_wi; - srpc_event_t srpc_ev; /* bulk/reply event */ + struct swi_workitem srpc_wi; + struct srpc_event srpc_ev; /* bulk/reply event */ lnet_nid_t srpc_self; struct lnet_process_id srpc_peer; - srpc_msg_t srpc_replymsg; + struct srpc_msg srpc_replymsg; struct lnet_handle_md srpc_replymdh; - srpc_buffer_t *srpc_reqstbuf; - srpc_bulk_t *srpc_bulk; + struct srpc_buffer *srpc_reqstbuf; + struct srpc_bulk *srpc_bulk; unsigned int srpc_aborted; /* being given up */ int srpc_status; void (*srpc_done)(struct srpc_server_rpc *); -} srpc_server_rpc_t; +}; /* client-side state of a RPC */ -typedef struct srpc_client_rpc { +struct srpc_client_rpc { struct list_head crpc_list; /* chain on user's lists */ spinlock_t crpc_lock; /* serialize */ int crpc_service; atomic_t crpc_refcount; /* # seconds to wait for reply */ int crpc_timeout; - stt_timer_t crpc_timer; - swi_workitem_t crpc_wi; + struct stt_timer crpc_timer; + struct swi_workitem crpc_wi; struct lnet_process_id 
crpc_dest; void (*crpc_done)(struct srpc_client_rpc *); @@ -216,21 +217,21 @@ typedef struct srpc_client_rpc { unsigned int crpc_aborted:1; /* being given up */ unsigned int crpc_closed:1; /* completed */ - /* RPC events */ - srpc_event_t crpc_bulkev; /* bulk event */ - srpc_event_t crpc_reqstev; /* request event */ - srpc_event_t crpc_replyev; /* reply event */ + /* RPC events */ + struct srpc_event crpc_bulkev; /* bulk event */ + struct srpc_event crpc_reqstev; /* request event */ + struct srpc_event crpc_replyev; /* reply event */ - /* bulk, request(reqst), and reply exchanged on wire */ - srpc_msg_t crpc_reqstmsg; - srpc_msg_t crpc_replymsg; + /* bulk, request(reqst), and reply exchanged on wire */ + struct srpc_msg crpc_reqstmsg; + struct srpc_msg crpc_replymsg; struct lnet_handle_md crpc_reqstmdh; struct lnet_handle_md crpc_replymdh; - srpc_bulk_t crpc_bulk; -} srpc_client_rpc_t; + struct srpc_bulk crpc_bulk; +}; #define srpc_client_rpc_size(rpc) \ -offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) +offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) #define srpc_client_rpc_addref(rpc) \ do { \ @@ -262,19 +263,19 @@ struct srpc_service_cd { /** backref to service */ struct srpc_service *scd_svc; /** event buffer */ - srpc_event_t scd_ev; + struct srpc_event scd_ev; /** free RPC descriptors */ struct list_head scd_rpc_free; /** in-flight RPCs */ struct list_head scd_rpc_active; /** workitem for posting buffer */ - swi_workitem_t scd_buf_wi; + struct swi_workitem scd_buf_wi; /** CPT id */ int scd_cpt; /** error code for scd_buf_wi */ int scd_buf_err; /** timestamp for scd_buf_err */ - unsigned long scd_buf_err_stamp; + time64_t scd_buf_err_stamp; /** total # request buffers */ int scd_buf_total; /** # posted request buffers */ @@ -302,7 +303,7 @@ struct srpc_service_cd { #define SFW_FRWK_WI_MIN 16 #define SFW_FRWK_WI_MAX 256 -typedef struct srpc_service { +struct srpc_service { int sv_id; /* service id */ const char *sv_name; /* human readable name */ int sv_wi_total; /* total server workitems */ @@ -314,11 +315,11 @@ typedef struct srpc_service { * - sv_handler: process incoming RPC request * - sv_bulk_ready: notify bulk data */ - int (*sv_handler) (srpc_server_rpc_t *); - int (*sv_bulk_ready) (srpc_server_rpc_t *, int); -} srpc_service_t; + int (*sv_handler)(struct srpc_server_rpc *); + int (*sv_bulk_ready)(struct srpc_server_rpc *, int); +}; -typedef struct { +struct sfw_session { /* chain on fw_zombie_sessions */ struct list_head sn_list; struct lst_sid sn_id; /* unique identifier */ @@ -326,42 +327,42 @@ typedef struct { unsigned int sn_timeout; int sn_timer_active; unsigned int sn_features; - stt_timer_t sn_timer; + struct stt_timer sn_timer; struct list_head sn_batches; /* list of batches */ char sn_name[LST_NAME_SIZE]; atomic_t sn_refcount; atomic_t sn_brw_errors; atomic_t sn_ping_errors; - cfs_time_t sn_started; -} sfw_session_t; + ktime_t sn_started; +}; #define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ (sid0).ses_stamp == (sid1).ses_stamp) -typedef struct { +struct sfw_batch { struct list_head bat_list; /* chain on sn_batches */ struct lst_bid bat_id; /* batch id */ int bat_error; /* error code of batch */ - sfw_session_t *bat_session; /* batch's session */ + struct sfw_session *bat_session; /* batch's session */ atomic_t bat_nactive; /* # of active tests */ struct list_head bat_tests; /* test instances */ -} sfw_batch_t; +}; -typedef struct { - int (*tso_init)(struct sfw_test_instance *tsi); /* 
intialize test client */ - void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ - int (*tso_prep_rpc)(struct sfw_test_unit *tsu, +struct sfw_test_client_ops { + int (*tso_init)(struct sfw_test_instance *tsi); /* intailize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, struct lnet_process_id dest, - srpc_client_rpc_t **rpc); /* prep a tests rpc */ + struct srpc_client_rpc **rpc); /* prep a tests rpc */ void (*tso_done_rpc)(struct sfw_test_unit *tsu, - srpc_client_rpc_t *rpc); /* done a test rpc */ -} sfw_test_client_ops_t; + struct srpc_client_rpc *rpc); /* done a test rpc */ +}; -typedef struct sfw_test_instance { +struct sfw_test_instance { struct list_head tsi_list; /* chain on batch */ int tsi_service; /* test type */ - sfw_batch_t *tsi_batch; /* batch */ - sfw_test_client_ops_t *tsi_ops; /* test client operations */ + struct sfw_batch *tsi_batch; /* batch */ + struct sfw_test_client_ops *tsi_ops; /* test client operations */ /* public parameter for all test units */ unsigned int tsi_is_client:1; /* is test client */ @@ -378,11 +379,11 @@ typedef struct sfw_test_instance { struct list_head tsi_active_rpcs;/* active rpcs */ union { - test_ping_req_t ping; /* ping parameter */ - test_bulk_req_t bulk_v0; /* bulk parameter */ - test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ + struct test_ping_req ping; /* ping parameter */ + struct test_bulk_req bulk_v0; /* bulk parameter */ + struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ } tsi_u; -} sfw_test_instance_t; +}; /* XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at * the end of pages are not used */ @@ -391,57 +392,59 @@ typedef struct sfw_test_instance { #define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) #define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) -typedef struct sfw_test_unit { +struct sfw_test_unit { struct list_head tsu_list; /* chain on lst_test_instance */ struct lnet_process_id tsu_dest; /* id of dest node */ int tsu_loop; /* loop count of the test */ - sfw_test_instance_t *tsu_instance; /* pointer to test instance */ + struct sfw_test_instance *tsu_instance; /* pointer to test instance */ void *tsu_private; /* private data */ - swi_workitem_t tsu_worker; /* workitem of the test unit */ -} sfw_test_unit_t; + struct swi_workitem tsu_worker; /* workitem of the test unit */ +}; -typedef struct sfw_test_case { - struct list_head tsc_list; /* chain on fw_tests */ - srpc_service_t *tsc_srv_service; /* test service */ - sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ -} sfw_test_case_t; +struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + struct srpc_service *tsc_srv_service; /* test service */ + struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ +}; -srpc_client_rpc_t * +struct srpc_client_rpc * sfw_create_rpc(struct lnet_process_id peer, int service, unsigned features, int nbulkiov, int bulklen, - void (*done) (srpc_client_rpc_t *), void *priv); -int sfw_create_test_rpc(sfw_test_unit_t *tsu, + void (*done)(struct srpc_client_rpc *), void *priv); +int sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, unsigned int features, - int nblk, int blklen, srpc_client_rpc_t **rpc); -void sfw_abort_rpc(srpc_client_rpc_t *rpc); -void sfw_post_rpc(srpc_client_rpc_t *rpc); -void sfw_client_rpc_done(srpc_client_rpc_t *rpc); -void sfw_unpack_message(srpc_msg_t *msg); -void 
sfw_free_pages(srpc_server_rpc_t *rpc); -void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); -int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, + int nblk, int blklen, struct srpc_client_rpc **rpc); +void sfw_abort_rpc(struct srpc_client_rpc *rpc); +void sfw_post_rpc(struct srpc_client_rpc *rpc); +void sfw_client_rpc_done(struct srpc_client_rpc *rpc); +void sfw_unpack_message(struct srpc_msg *msg); +void sfw_free_pages(struct srpc_server_rpc *rpc); +void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); +int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, int sink); -int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); +int sfw_make_session(struct srpc_mksn_reqst *request, + struct srpc_mksn_reply *reply); -srpc_client_rpc_t * +struct srpc_client_rpc * srpc_create_client_rpc(struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(srpc_client_rpc_t *), - void (*rpc_fini)(srpc_client_rpc_t *), void *priv); -void srpc_post_rpc(srpc_client_rpc_t *rpc); -void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); -void srpc_free_bulk(srpc_bulk_t *bk); -srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned off, unsigned bulk_npg, - unsigned bulk_len, int sink); -int srpc_send_rpc(swi_workitem_t *wi); -int srpc_send_reply(srpc_server_rpc_t *rpc); -int srpc_add_service(srpc_service_t *sv); -int srpc_remove_service(srpc_service_t *sv); -void srpc_shutdown_service(srpc_service_t *sv); -void srpc_abort_service(srpc_service_t *sv); -int srpc_finish_service(srpc_service_t *sv); -int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); -void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv); +void srpc_post_rpc(struct srpc_client_rpc *rpc); +void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); +void srpc_free_bulk(struct srpc_bulk *bk); +struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, + unsigned int bulk_npg, unsigned int bulk_len, + int sink); +int srpc_send_rpc(struct swi_workitem *wi); +int srpc_send_reply(struct srpc_server_rpc *rpc); +int srpc_add_service(struct srpc_service *sv); +int srpc_remove_service(struct srpc_service *sv); +void srpc_shutdown_service(struct srpc_service *sv); +void srpc_abort_service(struct srpc_service *sv); +int srpc_finish_service(struct srpc_service *sv); +int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); +void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); void srpc_get_counters(struct srpc_counters *cnt); void srpc_set_counters(const struct srpc_counters *cnt); @@ -457,13 +460,14 @@ srpc_serv_is_framework(struct srpc_service *svc) static inline int swi_wi_action(struct cfs_workitem *wi) { - swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); + struct swi_workitem *swi; - return swi->swi_action(swi); + swi = container_of(wi, struct swi_workitem, swi_workitem); + return swi->swi_action(swi); } static inline void -swi_init_workitem(swi_workitem_t *swi, void *data, +swi_init_workitem(struct swi_workitem *swi, void *data, swi_action_t action, struct cfs_wi_sched *sched) { swi->swi_sched = sched; @@ -473,19 +477,19 @@ swi_init_workitem(swi_workitem_t *swi, void *data, } static inline void -swi_schedule_workitem(swi_workitem_t *wi) +swi_schedule_workitem(struct swi_workitem *wi) { cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); } static inline void 
-swi_exit_workitem(swi_workitem_t *swi) +swi_exit_workitem(struct swi_workitem *swi) { cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); } static inline int -swi_deschedule_workitem(swi_workitem_t *swi) +swi_deschedule_workitem(struct swi_workitem *swi) { return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); } @@ -496,7 +500,7 @@ void sfw_shutdown(void); void srpc_shutdown(void); static inline void -srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) +srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) { LASSERT (rpc != NULL); LASSERT (!srpc_event_pending(rpc)); @@ -512,14 +516,14 @@ srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) } static inline void -srpc_init_client_rpc(srpc_client_rpc_t *rpc, struct lnet_process_id peer, +srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, int service, int nbulkiov, int bulklen, - void (*rpc_done)(srpc_client_rpc_t *), - void (*rpc_fini)(srpc_client_rpc_t *), void *priv) + void (*rpc_done)(struct srpc_client_rpc *), + void (*rpc_fini)(struct srpc_client_rpc *), void *priv) { LASSERT(nbulkiov <= LNET_MAX_IOV); - memset(rpc, 0, offsetof(srpc_client_rpc_t, + memset(rpc, 0, offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[nbulkiov])); INIT_LIST_HEAD(&rpc->crpc_list); @@ -585,7 +589,7 @@ do { \ } while (0) static inline void -srpc_wait_service_shutdown(srpc_service_t *sv) +srpc_wait_service_shutdown(struct srpc_service *sv) { int i = 2; @@ -601,13 +605,13 @@ srpc_wait_service_shutdown(srpc_service_t *sv) } } -extern sfw_test_client_ops_t ping_test_client; -extern srpc_service_t ping_test_service; +extern struct sfw_test_client_ops ping_test_client; +extern struct srpc_service ping_test_service; void ping_init_test_client(void); void ping_init_test_service(void); -extern sfw_test_client_ops_t brw_test_client; -extern srpc_service_t brw_test_service; +extern struct sfw_test_client_ops brw_test_client; +extern struct srpc_service brw_test_service; void brw_init_test_client(void); void brw_init_test_service(void); diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.c b/drivers/staging/lustrefsx/lnet/selftest/timer.c index 7e09e6672b3ef..3ceec81bf1b08 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/timer.c +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.c @@ -56,7 +56,7 @@ static struct st_timer_data { spinlock_t stt_lock; /* start time of the slot processed previously */ - cfs_time_t stt_prev_slot; + time64_t stt_prev_slot; struct list_head stt_hash[STTIMER_NSLOTS]; int stt_shuttingdown; wait_queue_head_t stt_waitq; @@ -64,7 +64,7 @@ static struct st_timer_data { } stt_data; void -stt_add_timer(stt_timer_t *timer) +stt_add_timer(struct stt_timer *timer) { struct list_head *pos; @@ -74,11 +74,12 @@ stt_add_timer(stt_timer_t *timer) LASSERT(!stt_data.stt_shuttingdown); LASSERT(timer->stt_func != NULL); LASSERT(list_empty(&timer->stt_list)); - LASSERT(cfs_time_after(timer->stt_expires, cfs_time_current_sec())); + LASSERT(timer->stt_expires > ktime_get_real_seconds()); /* a simple insertion sort */ list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { - stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); + struct stt_timer *old = list_entry(pos, struct stt_timer, + stt_list); if (timer->stt_expires >= old->stt_expires) break; @@ -98,7 +99,7 @@ stt_add_timer(stt_timer_t *timer) * another CPU. 
*/ int -stt_del_timer(stt_timer_t *timer) +stt_del_timer(struct stt_timer *timer) { int ret = 0; @@ -118,13 +119,13 @@ stt_del_timer(stt_timer_t *timer) /* called with stt_data.stt_lock held */ static int -stt_expire_list(struct list_head *slot, cfs_time_t now) +stt_expire_list(struct list_head *slot, time64_t now) { int expired = 0; - stt_timer_t *timer; + struct stt_timer *timer; while (!list_empty(slot)) { - timer = list_entry(slot->next, stt_timer_t, stt_list); + timer = list_entry(slot->next, struct stt_timer, stt_list); if (timer->stt_expires > now) break; @@ -142,20 +143,20 @@ stt_expire_list(struct list_head *slot, cfs_time_t now) } static int -stt_check_timers(cfs_time_t *last) +stt_check_timers(time64_t *last) { int expired = 0; - cfs_time_t now; - cfs_time_t this_slot; + time64_t now; + time64_t this_slot; - now = cfs_time_current_sec(); - this_slot = now & STTIMER_SLOTTIMEMASK; + now = ktime_get_real_seconds(); + this_slot = now & STTIMER_SLOTTIMEMASK; spin_lock(&stt_data.stt_lock); - while (cfs_time_aftereq(this_slot, *last)) { + while (this_slot >= *last) { expired += stt_expire_list(STTIMER_SLOT(this_slot), now); - this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); + this_slot = this_slot - STTIMER_SLOTTIME; } *last = now & STTIMER_SLOTTIMEMASK; @@ -210,7 +211,7 @@ stt_startup (void) int i; stt_data.stt_shuttingdown = 0; - stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK; + stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; spin_lock_init(&stt_data.stt_lock); for (i = 0; i < STTIMER_NSLOTS; i++) diff --git a/drivers/staging/lustrefsx/lnet/selftest/timer.h b/drivers/staging/lustrefsx/lnet/selftest/timer.h index 71c3de2736b15..e769c4cc9ebd7 100644 --- a/drivers/staging/lustrefsx/lnet/selftest/timer.h +++ b/drivers/staging/lustrefsx/lnet/selftest/timer.h @@ -34,15 +34,15 @@ #ifndef __SELFTEST_TIMER_H__ #define __SELFTEST_TIMER_H__ -typedef struct { +struct stt_timer { struct list_head stt_list; - cfs_time_t stt_expires; + time64_t stt_expires; void (*stt_func)(void *); void *stt_data; -} stt_timer_t; +}; -void stt_add_timer(stt_timer_t *timer); -int stt_del_timer(stt_timer_t *timer); +void stt_add_timer(struct stt_timer *timer); +int stt_del_timer(struct stt_timer *timer); int stt_startup(void); void stt_shutdown(void); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c index ef61772f0dcb2..8676ec223548d 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_handler.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_handler.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -48,8 +48,6 @@ #include #include "fid_internal.h" -static void seq_server_proc_fini(struct lu_server_seq *seq); - /* Assigns client to sequence controller node. 
*/ int seq_server_set_cli(const struct lu_env *env, struct lu_server_seq *seq, struct lu_client_seq *cli) @@ -458,35 +456,43 @@ LU_KEY_INIT_FINI(seq, struct seq_thread_info); /* context key: seq_thread_key */ LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); -extern const struct file_operations seq_fld_proc_seq_fops; +extern const struct file_operations seq_fld_debugfs_seq_fops; + +static void seq_server_debugfs_fini(struct lu_server_seq *seq) +{ + if (!IS_ERR_OR_NULL(seq->lss_debugfs_entry)) + ldebugfs_remove(&seq->lss_debugfs_entry); +} -static int seq_server_proc_init(struct lu_server_seq *seq) +static int seq_server_debugfs_init(struct lu_server_seq *seq) { -#ifdef CONFIG_PROC_FS int rc; ENTRY; - seq->lss_proc_dir = lprocfs_register(seq->lss_name, - seq_type_proc_dir, - NULL, NULL); - if (IS_ERR(seq->lss_proc_dir)) { - rc = PTR_ERR(seq->lss_proc_dir); + seq->lss_debugfs_entry = ldebugfs_register(seq->lss_name, + seq_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(seq->lss_debugfs_entry)) { + rc = seq->lss_debugfs_entry ? PTR_ERR(seq->lss_debugfs_entry) + : -ENOMEM; + seq->lss_debugfs_entry = NULL; RETURN(rc); } - rc = lprocfs_add_vars(seq->lss_proc_dir, seq_server_proc_list, seq); + rc = ldebugfs_add_vars(seq->lss_debugfs_entry, + seq_server_debugfs_list, seq); if (rc) { - CERROR("%s: Can't init sequence manager " - "proc, rc %d\n", seq->lss_name, rc); + CERROR("%s: Can't init sequence manager debugfs, rc %d\n", + seq->lss_name, rc); GOTO(out_cleanup, rc); } if (seq->lss_type == LUSTRE_SEQ_CONTROLLER) { - rc = lprocfs_seq_create(seq->lss_proc_dir, "fldb", 0644, - &seq_fld_proc_seq_fops, seq); + rc = ldebugfs_seq_create(seq->lss_debugfs_entry, "fldb", 0644, + &seq_fld_debugfs_seq_fops, seq); if (rc) { - CERROR("%s: Can't create fldb for sequence manager " - "proc: rc = %d\n", seq->lss_name, rc); + CERROR("%s: Can't create fldb for sequence manager debugfs: rc = %d\n", + seq->lss_name, rc); GOTO(out_cleanup, rc); } } @@ -494,24 +500,8 @@ static int seq_server_proc_init(struct lu_server_seq *seq) RETURN(0); out_cleanup: - seq_server_proc_fini(seq); + seq_server_debugfs_fini(seq); return rc; -#else /* !CONFIG_PROC_FS */ - return 0; -#endif /* CONFIG_PROC_FS */ -} - -static void seq_server_proc_fini(struct lu_server_seq *seq) -{ -#ifdef CONFIG_PROC_FS - ENTRY; - if (seq->lss_proc_dir != NULL) { - if (!IS_ERR(seq->lss_proc_dir)) - lprocfs_remove(&seq->lss_proc_dir); - seq->lss_proc_dir = NULL; - } - EXIT; -#endif /* CONFIG_PROC_FS */ } int seq_server_init(const struct lu_env *env, @@ -592,7 +582,7 @@ int seq_server_init(const struct lu_env *env, lu_seq_range_is_sane(&seq->lss_space)); } - rc = seq_server_proc_init(seq); + rc = seq_server_debugfs_init(seq); if (rc) GOTO(out, rc); @@ -609,7 +599,7 @@ void seq_server_fini(struct lu_server_seq *seq, { ENTRY; - seq_server_proc_fini(seq); + seq_server_debugfs_fini(seq); seq_store_fini(seq, env); EXIT; diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h index 9ad1420e1812e..1c6587d43b52b 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_internal.h +++ b/drivers/staging/lustrefsx/lustre/fid/fid_internal.h @@ -36,7 +36,6 @@ #ifndef __FID_INTERNAL_H #define __FID_INTERNAL_H -#include #include #ifdef HAVE_SERVER_SUPPORT @@ -56,9 +55,7 @@ enum { extern struct lu_context_key seq_thread_key; -# ifdef CONFIG_PROC_FS -extern struct lprocfs_vars seq_server_proc_list[]; -# endif +extern struct ldebugfs_vars seq_server_debugfs_list[]; /* Store API functions. 
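The fid_handler.c and fid_internal.h hunks above move the per-sequence entries from lprocfs_register()/lprocfs_remove() to Lustre's ldebugfs_* wrappers, keeping the directory dentry in lss_debugfs_entry and tearing it down through a small _fini helper. The fragment below sketches the same create-and-teardown shape with the stock kernel debugfs API rather than the ldebugfs wrappers; every "demo" name is invented for the illustration, and the error handling simply mirrors the IS_ERR_OR_NULL pattern used in the patch.

/*
 * Sketch of the debugfs register/cleanup shape, using the stock debugfs
 * API instead of the ldebugfs_* wrappers; not part of the patch.
 */
#include <linux/debugfs.h>
#include <linux/err.h>
#include <linux/seq_file.h>

static struct dentry *demo_dir;

static int demo_state_show(struct seq_file *m, void *unused)
{
	seq_puts(m, "ready\n");
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(demo_state);

static int demo_debugfs_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	if (IS_ERR_OR_NULL(demo_dir))
		return demo_dir ? PTR_ERR(demo_dir) : -ENOMEM;

	debugfs_create_file("state", 0444, demo_dir, NULL, &demo_state_fops);
	return 0;
}

static void demo_debugfs_fini(void)
{
	debugfs_remove_recursive(demo_dir);
	demo_dir = NULL;
}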
*/ struct dt_device; @@ -90,10 +87,8 @@ void fid_server_mod_exit(void); int seq_client_alloc_super(struct lu_client_seq *seq, const struct lu_env *env); -# ifdef CONFIG_PROC_FS -extern struct lprocfs_vars seq_client_proc_list[]; -# endif +extern struct dentry *seq_debugfs_dir; -extern struct proc_dir_entry *seq_type_proc_dir; +extern struct ldebugfs_vars seq_client_debugfs_list[]; #endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c index 7c5477c044351..ab3a59820abc7 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_lib.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_lib.c @@ -41,7 +41,6 @@ #include #include -#include #include /** diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_request.c b/drivers/staging/lustrefsx/lustre/fid/fid_request.c index ab1cca59bc916..93f6402a12232 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_request.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,8 +38,8 @@ #define DEBUG_SUBSYSTEM S_FID +#include #include -#include #include #include #include @@ -48,6 +48,8 @@ #include #include "fid_internal.h" +struct dentry *seq_debugfs_dir; + static int seq_client_rpc(struct lu_client_seq *seq, struct lu_seq_range *output, __u32 opc, const char *opcname) @@ -438,51 +440,57 @@ void seq_client_flush(struct lu_client_seq *seq) } EXPORT_SYMBOL(seq_client_flush); -static void seq_client_proc_fini(struct lu_client_seq *seq) +static void seq_client_debugfs_fini(struct lu_client_seq *seq) { -#ifdef CONFIG_PROC_FS - ENTRY; - if (seq->lcs_proc_dir) { - if (!IS_ERR(seq->lcs_proc_dir)) - lprocfs_remove(&seq->lcs_proc_dir); - seq->lcs_proc_dir = NULL; - } - EXIT; -#endif /* CONFIG_PROC_FS */ + if (!IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) + ldebugfs_remove(&seq->lcs_debugfs_entry); } -static int seq_client_proc_init(struct lu_client_seq *seq) +static int seq_client_debugfs_init(struct lu_client_seq *seq) { -#ifdef CONFIG_PROC_FS int rc; - ENTRY; - seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, seq_type_proc_dir, - NULL, NULL); - if (IS_ERR(seq->lcs_proc_dir)) { - CERROR("%s: LProcFS failed in seq-init\n", - seq->lcs_name); - rc = PTR_ERR(seq->lcs_proc_dir); - RETURN(rc); + seq->lcs_debugfs_entry = ldebugfs_register(seq->lcs_name, + seq_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(seq->lcs_debugfs_entry)) { + CERROR("%s: LdebugFS failed in seq-init\n", seq->lcs_name); + rc = seq->lcs_debugfs_entry ? 
PTR_ERR(seq->lcs_debugfs_entry) + : -ENOMEM; + seq->lcs_debugfs_entry = NULL; + RETURN(rc); } - rc = lprocfs_add_vars(seq->lcs_proc_dir, seq_client_proc_list, seq); - if (rc) { - CERROR("%s: Can't init sequence manager " - "proc, rc %d\n", seq->lcs_name, rc); + rc = ldebugfs_add_vars(seq->lcs_debugfs_entry, + seq_client_debugfs_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager debugfs, rc %d\n", + seq->lcs_name, rc); GOTO(out_cleanup, rc); } RETURN(0); out_cleanup: - seq_client_proc_fini(seq); + seq_client_debugfs_fini(seq); return rc; +} -#else /* !CONFIG_PROC_FS */ - return 0; -#endif /* CONFIG_PROC_FS */ +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_debugfs_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; } +EXPORT_SYMBOL(seq_client_fini); int seq_client_init(struct lu_client_seq *seq, struct obd_export *exp, @@ -515,29 +523,13 @@ int seq_client_init(struct lu_client_seq *seq, snprintf(seq->lcs_name, sizeof(seq->lcs_name), "cli-%s", prefix); - rc = seq_client_proc_init(seq); + rc = seq_client_debugfs_init(seq); if (rc) seq_client_fini(seq); RETURN(rc); } EXPORT_SYMBOL(seq_client_init); -void seq_client_fini(struct lu_client_seq *seq) -{ - ENTRY; - - seq_client_proc_fini(seq); - - if (seq->lcs_exp != NULL) { - class_export_put(seq->lcs_exp); - seq->lcs_exp = NULL; - } - - seq->lcs_srv = NULL; - EXIT; -} -EXPORT_SYMBOL(seq_client_fini); - int client_fid_init(struct obd_device *obd, struct obd_export *exp, enum lu_cli_type type) { @@ -591,21 +583,18 @@ int client_fid_fini(struct obd_device *obd) } EXPORT_SYMBOL(client_fid_fini); -struct proc_dir_entry *seq_type_proc_dir; - static int __init fid_init(void) { - seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, - proc_lustre_root, - NULL, NULL); - if (IS_ERR(seq_type_proc_dir)) - return PTR_ERR(seq_type_proc_dir); +#ifdef HAVE_SERVER_SUPPORT + int rc = fid_server_mod_init(); -# ifdef HAVE_SERVER_SUPPORT - fid_server_mod_init(); -# endif - - return 0; + if (rc) + return rc; +#endif + seq_debugfs_dir = ldebugfs_register(LUSTRE_SEQ_NAME, + debugfs_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(seq_debugfs_dir); } static void __exit fid_exit(void) @@ -613,11 +602,8 @@ static void __exit fid_exit(void) # ifdef HAVE_SERVER_SUPPORT fid_server_mod_exit(); # endif - - if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { - lprocfs_remove(&seq_type_proc_dir); - seq_type_proc_dir = NULL; - } + if (!IS_ERR_OR_NULL(seq_debugfs_dir)) + ldebugfs_remove(&seq_debugfs_dir); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/fid/fid_store.c b/drivers/staging/lustrefsx/lustre/fid/fid_store.c index 225ddfad6f634..1565d80811d29 100644 --- a/drivers/staging/lustrefsx/lustre/fid/fid_store.c +++ b/drivers/staging/lustrefsx/lustre/fid/fid_store.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c index d95888f15cfcb..5ac2b883d0861 100644 --- a/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustrefsx/lustre/fid/lproc_fid.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
* Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,8 +47,6 @@ #include #include "fid_internal.h" -#ifdef CONFIG_PROC_FS - /* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ #define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) /** @@ -60,34 +58,37 @@ * safe for production use. */ static int -lprocfs_fid_write_common(struct file *file, const char __user *buffer, - size_t count, struct lu_seq_range *range) +ldebugfs_fid_write_common(const char __user *buffer, size_t count, + struct lu_seq_range *range) { + char kernbuf[MAX_FID_RANGE_STRLEN]; struct lu_seq_range tmp = { .lsr_start = 0, }; - char kernbuf[MAX_FID_RANGE_STRLEN]; - ENTRY; + int rc; - LASSERT(range != NULL); + ENTRY; + LASSERT(range); if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; if (count == 5 && strcmp(kernbuf, "clear") == 0) { memset(range, 0, sizeof(*range)); - RETURN(0); + RETURN(count); } /* of the form "[0x0000000240000400 - 0x000000028000400]" */ - sscanf(kernbuf, "[%llx - %llx]\n", - (long long unsigned *)&tmp.lsr_start, - (long long unsigned *)&tmp.lsr_end); + rc = sscanf(kernbuf, "[%llx - %llx]\n", + (unsigned long long *)&tmp.lsr_start, + (unsigned long long *)&tmp.lsr_end); + if (rc != 2) + RETURN(-EINVAL); if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) RETURN(-EINVAL); @@ -97,23 +98,24 @@ lprocfs_fid_write_common(struct file *file, const char __user *buffer, #ifdef HAVE_SERVER_SUPPORT /* - * Server side procfs stuff. + * Server side debugfs stuff. 
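The lproc_fid.c hunk above reworks the range write helper: a bounded copy_from_user() replaces the lprocfs wrapper, the buffer is NUL-terminated, sscanf() is now checked so a malformed range is rejected with -EINVAL instead of being parsed as zeroes, and the "clear" case returns count rather than 0. A standalone sketch of that parsing pattern follows; demo_range and demo_range_write are invented names, and the extra lu_seq_range sanity checks are omitted.

/*
 * Sketch of the bounded-copy plus checked-sscanf() pattern used by the
 * converted write helper; not part of the patch.  The real helper
 * validates a struct lu_seq_range with additional range checks.
 */
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/uaccess.h>

struct demo_range {
	u64 start;
	u64 end;
};

static ssize_t demo_range_write(const char __user *buffer, size_t count,
				struct demo_range *range)
{
	char kernbuf[64];
	unsigned long long start, end;

	if (count >= sizeof(kernbuf))
		return -EINVAL;

	if (copy_from_user(kernbuf, buffer, count))
		return -EFAULT;
	kernbuf[count] = '\0';

	if (count == 5 && strcmp(kernbuf, "clear") == 0) {
		memset(range, 0, sizeof(*range));
		return count;	/* report bytes consumed, as in the patch */
	}

	/* expected form: "[0x<start> - 0x<end>]"; reject partial matches */
	if (sscanf(kernbuf, "[%llx - %llx]", &start, &end) != 2)
		return -EINVAL;
	if (start >= end)
		return -EINVAL;

	range->start = start;
	range->end = end;
	return count;
}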
*/ static ssize_t -lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ldebugfs_server_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; + struct lu_server_seq *seq; int rc; - ENTRY; - LASSERT(seq != NULL); + ENTRY; + seq = ((struct seq_file *)file->private_data)->private; mutex_lock(&seq->lss_mutex); - rc = lprocfs_fid_write_common(file, buffer, count, &seq->lss_space); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lss_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", - seq->lss_name, PRANGE(&seq->lss_space)); + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", + seq->lss_name, PRANGE(&seq->lss_space)); } mutex_unlock(&seq->lss_mutex); @@ -121,13 +123,11 @@ lprocfs_server_fid_space_seq_write(struct file *file, const char __user *buffer, } static int -lprocfs_server_fid_space_seq_show(struct seq_file *m, void *unused) +ldebugfs_server_fid_space_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; ENTRY; - LASSERT(seq != NULL); - mutex_lock(&seq->lss_mutex); seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lss_space)); mutex_unlock(&seq->lss_mutex); @@ -136,14 +136,12 @@ lprocfs_server_fid_space_seq_show(struct seq_file *m, void *unused) } static int -lprocfs_server_fid_server_seq_show(struct seq_file *m, void *unused) +ldebugfs_server_fid_server_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; struct client_obd *cli; ENTRY; - LASSERT(seq != NULL); - if (seq->lss_cli) { if (seq->lss_cli->lcs_exp != NULL) { cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; @@ -158,34 +156,24 @@ lprocfs_server_fid_server_seq_show(struct seq_file *m, void *unused) RETURN(0); } -static ssize_t -lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ldebugfs_server_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_server_seq *seq = ((struct seq_file *)file->private_data)->private; + struct seq_file *m = file->private_data; + struct lu_server_seq *seq = m->private; int rc; - __s64 val; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lss_mutex); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoull_from_user(buffer, count, 0, &seq->lss_width); if (rc) { CERROR("%s: invalid FID sequence width: rc = %d\n", seq->lss_name, rc); GOTO(out_unlock, count = rc); } - if (val < 0) { - CERROR("%s: invalid FID sequence width: rc = %d\n", - seq->lss_name, -ERANGE); - GOTO(out_unlock, count = -ERANGE); - } - - seq->lss_width = val; - CDEBUG(D_INFO, "%s: Width: %llu\n", seq->lss_name, seq->lss_width); out_unlock: @@ -195,13 +183,11 @@ lprocfs_server_fid_width_seq_write(struct file *file, const char __user *buffer, } static int -lprocfs_server_fid_width_seq_show(struct seq_file *m, void *unused) +ldebugfs_server_fid_width_seq_show(struct seq_file *m, void *unused) { struct lu_server_seq *seq = (struct lu_server_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lss_mutex); seq_printf(m, "%llu\n", seq->lss_width); mutex_unlock(&seq->lss_mutex); @@ -209,17 +195,17 @@ lprocfs_server_fid_width_seq_show(struct seq_file *m, void *unused) RETURN(0); } -LPROC_SEQ_FOPS(lprocfs_server_fid_space); -LPROC_SEQ_FOPS(lprocfs_server_fid_width); 
-LPROC_SEQ_FOPS_RO(lprocfs_server_fid_server); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_server_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_server_fid_server); -struct lprocfs_vars seq_server_proc_list[] = { +struct ldebugfs_vars seq_server_debugfs_list[] = { { .name = "space", - .fops = &lprocfs_server_fid_space_fops }, + .fops = &ldebugfs_server_fid_space_fops }, { .name = "width", - .fops = &lprocfs_server_fid_width_fops }, + .fops = &ldebugfs_server_fid_width_fops }, { .name = "server", - .fops = &lprocfs_server_fid_server_fops }, + .fops = &ldebugfs_server_fid_server_fops}, { NULL } }; @@ -350,7 +336,7 @@ struct seq_operations fldb_sops = { static int fldb_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; - struct lu_server_seq *ss = (struct lu_server_seq *) PDE_DATA(inode); + struct lu_server_seq *ss = inode->i_private; struct lu_server_fld *fld; struct dt_object *obj; const struct dt_it_ops *iops; @@ -361,10 +347,6 @@ static int fldb_seq_open(struct inode *inode, struct file *file) fld = ss->lss_site->ss_server_fld; LASSERT(fld != NULL); - rc = LPROCFS_ENTRY_CHECK(inode); - if (rc < 0) - return rc; - rc = seq_open(file, &fldb_sops); if (rc) return rc; @@ -416,7 +398,7 @@ static int fldb_seq_release(struct inode *inode, struct file *file) param = seq->private; if (param == NULL) { - lprocfs_seq_release(inode, file); + seq_release(inode, file); return 0; } @@ -430,7 +412,7 @@ static int fldb_seq_release(struct inode *inode, struct file *file) iops->fini(¶m->fsp_env, param->fsp_it); lu_env_fini(¶m->fsp_env); OBD_FREE_PTR(param); - lprocfs_seq_release(inode, file); + seq_release(inode, file); return 0; } @@ -496,7 +478,7 @@ static ssize_t fldb_seq_write(struct file *file, const char __user *buf, RETURN(rc < 0 ? 
rc : len); } -const struct file_operations seq_fld_proc_seq_fops = { +const struct file_operations seq_fld_debugfs_seq_fops = { .owner = THIS_MODULE, .open = fldb_seq_open, .read = seq_read, @@ -506,21 +488,22 @@ const struct file_operations seq_fld_proc_seq_fops = { #endif /* HAVE_SERVER_SUPPORT */ -/* Client side procfs stuff */ +/* Client side debugfs stuff */ static ssize_t -lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ldebugfs_client_fid_space_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; + struct lu_client_seq *seq; int rc; - ENTRY; - LASSERT(seq != NULL); + ENTRY; + seq = ((struct seq_file *)file->private_data)->private; mutex_lock(&seq->lcs_mutex); - rc = lprocfs_fid_write_common(file, buffer, count, &seq->lcs_space); + rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", seq->lcs_name, PRANGE(&seq->lcs_space)); } @@ -529,68 +512,58 @@ lprocfs_client_fid_space_seq_write(struct file *file, const char __user *buffer, RETURN(count); } -static int -lprocfs_client_fid_space_seq_show(struct seq_file *m, void *unused) +static int ldebugfs_client_fid_space_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lcs_mutex); - seq_printf(m, "[%#llx - %#llx]:%x:%s\n", - PRANGE(&seq->lcs_space)); + seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); mutex_unlock(&seq->lcs_mutex); RETURN(0); } -static ssize_t -lprocfs_client_fid_width_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private; - __u64 max; + struct seq_file *m = file->private_data; + struct lu_client_seq *seq = m->private; + u64 val; + u64 max; int rc; - __s64 val; - ENTRY; - LASSERT(seq != NULL); + ENTRY; + rc = kstrtoull_from_user(buffer, count, 0, &val); + if (rc) + return rc; mutex_lock(&seq->lcs_mutex); - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc) { - GOTO(out_unlock, count = rc); - } - if (seq->lcs_type == LUSTRE_SEQ_DATA) max = LUSTRE_DATA_SEQ_MAX_WIDTH; else max = LUSTRE_METADATA_SEQ_MAX_WIDTH; - if (val <= max && val > 0) { + if (val <= max) { seq->lcs_width = val; - CDEBUG(D_INFO, "%s: Sequence size: %llu\n", - seq->lcs_name, seq->lcs_width); + CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, + seq->lcs_width); } else { - GOTO(out_unlock, count = -ERANGE); + count = -ERANGE; } -out_unlock: mutex_unlock(&seq->lcs_mutex); RETURN(count); } static int -lprocfs_client_fid_width_seq_show(struct seq_file *m, void *unused) +ldebugfs_client_fid_width_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lcs_mutex); seq_printf(m, "%llu\n", seq->lcs_width); mutex_unlock(&seq->lcs_mutex); @@ -599,13 +572,11 @@ lprocfs_client_fid_width_seq_show(struct seq_file *m, void *unused) } static int -lprocfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) +ldebugfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) { struct 
lu_client_seq *seq = (struct lu_client_seq *)m->private; - ENTRY; - - LASSERT(seq != NULL); + ENTRY; mutex_lock(&seq->lcs_mutex); seq_printf(m, DFID"\n", PFID(&seq->lcs_fid)); mutex_unlock(&seq->lcs_mutex); @@ -614,38 +585,37 @@ lprocfs_client_fid_fid_seq_show(struct seq_file *m, void *unused) } static int -lprocfs_client_fid_server_seq_show(struct seq_file *m, void *unused) +ldebugfs_client_fid_server_seq_show(struct seq_file *m, void *unused) { struct lu_client_seq *seq = (struct lu_client_seq *)m->private; struct client_obd *cli; ENTRY; - LASSERT(seq != NULL); - - if (seq->lcs_exp != NULL) { + if (seq->lcs_exp) { cli = &seq->lcs_exp->exp_obd->u.cli; seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); +#ifdef HAVE_SERVER_SUPPORT } else { seq_printf(m, "%s\n", seq->lcs_srv->lss_name); +#endif /* HAVE_SERVER_SUPPORT */ } + RETURN(0); } -LPROC_SEQ_FOPS(lprocfs_client_fid_space); -LPROC_SEQ_FOPS(lprocfs_client_fid_width); -LPROC_SEQ_FOPS_RO(lprocfs_client_fid_server); -LPROC_SEQ_FOPS_RO(lprocfs_client_fid_fid); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_space); +LDEBUGFS_SEQ_FOPS(ldebugfs_client_fid_width); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_server); +LDEBUGFS_SEQ_FOPS_RO(ldebugfs_client_fid_fid); -struct lprocfs_vars seq_client_proc_list[] = { +struct ldebugfs_vars seq_client_debugfs_list[] = { { .name = "space", - .fops = &lprocfs_client_fid_space_fops }, + .fops = &ldebugfs_client_fid_space_fops }, { .name = "width", - .fops = &lprocfs_client_fid_width_fops }, + .fops = &ldebugfs_client_fid_width_fops }, { .name = "server", - .fops = &lprocfs_client_fid_server_fops }, + .fops = &ldebugfs_client_fid_server_fops}, { .name = "fid", - .fops = &lprocfs_client_fid_fid_fops }, + .fops = &ldebugfs_client_fid_fid_fops }, { NULL } }; - -#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c index 9b46feed04e72..f638e0dcd1ea4 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_cache.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_cache.c @@ -88,27 +88,14 @@ struct fld_cache *fld_cache_init(const char *name, */ void fld_cache_fini(struct fld_cache *cache) { - __u64 pct; - ENTRY; - - LASSERT(cache != NULL); - fld_cache_flush(cache); - - if (cache->fci_stat.fst_count > 0) { - pct = cache->fci_stat.fst_cache * 100; - do_div(pct, cache->fci_stat.fst_count); - } else { - pct = 0; - } + LASSERT(cache != NULL); + fld_cache_flush(cache); - CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); - CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); - CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); - - OBD_FREE_PTR(cache); + CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); - EXIT; + OBD_FREE_PTR(cache); } /** diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c index 375070464cd85..42f00da7f1363 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_handler.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_handler.c @@ -103,15 +103,16 @@ EXPORT_SYMBOL(fld_server_create); /** * Extract index information from fld name like srv-fsname-MDT0000 **/ -int fld_name_to_index(const char *name, __u32 *index) +int fld_name_to_index(const char *name, u32 *index) { char *dash; int rc; + ENTRY; CDEBUG(D_INFO, "get index from %s\n", name); dash = strrchr(name, '-'); - if (dash == NULL) + if (!dash) 
RETURN(-EINVAL); dash++; rc = target_name2index(dash, index, NULL); @@ -124,17 +125,20 @@ int fld_name_to_index(const char *name, __u32 *index) int fld_update_from_controller(const struct lu_env *env, struct lu_server_fld *fld) { - struct fld_thread_info *info; - struct lu_seq_range *range; + struct fld_thread_info *info; + struct lu_seq_range *range; struct lu_seq_range_array *lsra; - __u32 index; - struct ptlrpc_request *req; - int rc; - int i; + u32 index; + struct ptlrpc_request *req; + int rc; + int i; + ENTRY; - /* Update only happens during initalization, i.e. local FLDB - * does not exist yet */ + /* + * Update only happens during initalization, i.e. local FLDB + * does not exist yet + */ if (!fld->lsf_new) RETURN(0); @@ -162,7 +166,7 @@ int fld_update_from_controller(const struct lu_env *env, LASSERT(req != NULL); lsra = (struct lu_seq_range_array *)req_capsule_server_get( &req->rq_pill, &RMF_GENERIC_DATA); - if (lsra == NULL) + if (!lsra) GOTO(out, rc = -EPROTO); range_array_le_to_cpu(lsra, lsra); @@ -188,7 +192,7 @@ int fld_update_from_controller(const struct lu_env *env, fld->lsf_new = 1; out: - if (req != NULL) + if (req) ptlrpc_req_finished(req); RETURN(rc); @@ -204,6 +208,7 @@ int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *erange; struct fld_thread_info *info; int rc; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -215,9 +220,9 @@ int fld_local_lookup(const struct lu_env *env, struct lu_server_fld *fld, if (rc == 0) { if (unlikely(fld_range_type(erange) != fld_range_type(range) && !fld_range_is_any(range))) { - CERROR("%s: FLD cache range "DRANGE" does not match" - "requested flag %x: rc = %d\n", fld->lsf_name, - PRANGE(erange), range->lsr_flags, -EIO); + CERROR("%s: FLD cache range "DRANGE" does not match requested flag %x: rc = %d\n", + fld->lsf_name, PRANGE(erange), range->lsr_flags, + -EIO); RETURN(-EIO); } *range = *erange; @@ -237,8 +242,9 @@ EXPORT_SYMBOL(fld_local_lookup); int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range) { - __u32 index; + u32 index; int rc; + ENTRY; rc = fld_local_lookup(env, fld, seq, range); @@ -250,18 +256,21 @@ int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, RETURN(rc); if (index == 0 && rc == LDD_F_SV_TYPE_MDT) { - /* On server side, all entries should be in cache. - * If we can not find it in cache, just return error */ + /* + * On server side, all entries should be in cache. + * If we can not find it in cache, just return error + */ CERROR("%s: Cannot find sequence %#llx: rc = %d\n", fld->lsf_name, seq, -ENOENT); RETURN(-ENOENT); } else { - if (fld->lsf_control_exp == NULL) { - CERROR("%s: lookup %#llx, but not connects to MDT0" - "yet: rc = %d.\n", fld->lsf_name, seq, -EIO); + if (!fld->lsf_control_exp) { + CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", + fld->lsf_name, seq, -EIO); RETURN(-EIO); } - /* send request to mdt0 i.e. super seq. controller. + /* + * send request to mdt0 i.e. super seq. controller. * This is temporary solution, long term solution is fld * replication on all mdt servers. 
*/ @@ -281,17 +290,17 @@ EXPORT_SYMBOL(fld_server_lookup); */ static int fld_handle_lookup(struct tgt_session_info *tsi) { - struct obd_export *exp = tsi->tsi_exp; - struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; - struct lu_server_fld *fld; - struct lu_seq_range *in; - struct lu_seq_range *out; - int rc; + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_server_fld *fld; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; ENTRY; in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (in == NULL) + if (!in) RETURN(err_serious(-EPROTO)); rc = req_capsule_server_pack(tsi->tsi_pill); @@ -299,7 +308,7 @@ static int fld_handle_lookup(struct tgt_session_info *tsi) RETURN(err_serious(rc)); out = req_capsule_server_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (out == NULL) + if (!out) RETURN(err_serious(-EPROTO)); *out = *in; @@ -315,18 +324,18 @@ static int fld_handle_lookup(struct tgt_session_info *tsi) static int fld_handle_read(struct tgt_session_info *tsi) { - struct obd_export *exp = tsi->tsi_exp; - struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; - struct lu_seq_range *in; - void *data; - int rc; + struct obd_export *exp = tsi->tsi_exp; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + void *data; + int rc; ENTRY; req_capsule_set(tsi->tsi_pill, &RQF_FLD_READ); in = req_capsule_client_get(tsi->tsi_pill, &RMF_FLD_MDFLD); - if (in == NULL) + if (!in) RETURN(err_serious(-EPROTO)); req_capsule_set_size(tsi->tsi_pill, &RMF_GENERIC_DATA, RCL_SERVER, @@ -365,12 +374,13 @@ static int fld_handle_query(struct tgt_session_info *tsi) * fid_is_local() is supposed to be used in assertion checks only. */ int fid_is_local(const struct lu_env *env, - struct lu_site *site, const struct lu_fid *fid) + struct lu_site *site, const struct lu_fid *fid) { int result; struct seq_server_site *ss_site; struct lu_seq_range *range; struct fld_thread_info *info; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -378,7 +388,7 @@ int fid_is_local(const struct lu_env *env, result = 1; /* conservatively assume fid is local */ ss_site = lu_site2seq(site); - if (ss_site->ss_client_fld != NULL) { + if (ss_site->ss_client_fld) { int rc; rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, @@ -388,54 +398,37 @@ int fid_is_local(const struct lu_env *env, } return result; } +EXPORT_SYMBOL(fid_is_local); -static void fld_server_proc_fini(struct lu_server_fld *fld); +static void fld_server_debugfs_fini(struct lu_server_fld *fld) +{ + if (!IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) + ldebugfs_remove(&fld->lsf_debugfs_entry); +} -#ifdef CONFIG_PROC_FS -static int fld_server_proc_init(struct lu_server_fld *fld) +static int fld_server_debugfs_init(struct lu_server_fld *fld) { - int rc = 0; - ENTRY; + int rc = 0; - fld->lsf_proc_dir = lprocfs_register(fld->lsf_name, fld_type_proc_dir, - fld_server_proc_list, fld); - if (IS_ERR(fld->lsf_proc_dir)) { - rc = PTR_ERR(fld->lsf_proc_dir); + ENTRY; + fld->lsf_debugfs_entry = ldebugfs_register(fld->lsf_name, + fld_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(fld->lsf_debugfs_entry)) { + rc = fld->lsf_debugfs_entry ? 
PTR_ERR(fld->lsf_debugfs_entry) + : -ENOMEM; + fld->lsf_debugfs_entry = NULL; RETURN(rc); } - rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444, - &fld_proc_seq_fops, fld); - if (rc) { - lprocfs_remove(&fld->lsf_proc_dir); - fld->lsf_proc_dir = NULL; - } + rc = ldebugfs_seq_create(fld->lsf_debugfs_entry, "fldb", 0444, + &fld_debugfs_seq_fops, fld); + if (rc) + ldebugfs_remove(&fld->lsf_debugfs_entry); RETURN(rc); } -static void fld_server_proc_fini(struct lu_server_fld *fld) -{ - ENTRY; - if (fld->lsf_proc_dir != NULL) { - if (!IS_ERR(fld->lsf_proc_dir)) - lprocfs_remove(&fld->lsf_proc_dir); - fld->lsf_proc_dir = NULL; - } - EXIT; -} -#else -static int fld_server_proc_init(struct lu_server_fld *fld) -{ - return 0; -} - -static void fld_server_proc_fini(struct lu_server_fld *fld) -{ - return; -} -#endif - int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, struct dt_device *dt, const char *prefix, int type) { @@ -463,7 +456,7 @@ int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, if (rc) GOTO(out_cache, rc); - rc = fld_server_proc_init(fld); + rc = fld_server_debugfs_init(fld); if (rc) GOTO(out_index, rc); @@ -484,10 +477,10 @@ void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) { ENTRY; - fld_server_proc_fini(fld); + fld_server_debugfs_fini(fld); fld_index_fini(env, fld); - if (fld->lsf_cache != NULL) { + if (fld->lsf_cache) { if (!IS_ERR(fld->lsf_cache)) fld_cache_fini(fld->lsf_cache); fld->lsf_cache = NULL; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_index.c b/drivers/staging/lustrefsx/lustre/fld/fld_index.c index fa9ca9427f22f..f2079cb5b1f49 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_index.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_index.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -84,10 +84,10 @@ int fld_declare_index_create(const struct lu_env *env, const struct lu_seq_range *new_range, struct thandle *th) { - struct lu_seq_range *tmp; - struct lu_seq_range *range; - struct fld_thread_info *info; - int rc = 0; + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; ENTRY; @@ -109,8 +109,10 @@ int fld_declare_index_create(const struct lu_env *env, GOTO(out, rc); } - /* Check for merge case, since the fld entry can only be increamental, - * so we will only check whether it can be merged from the left. */ + /* + * Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. 
+ */ if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && lu_seq_range_compare_loc(new_range, range) == 0) { range_cpu_to_be(tmp, range); @@ -156,12 +158,13 @@ int fld_declare_index_create(const struct lu_env *env, int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, const struct lu_seq_range *new_range, struct thandle *th) { - struct lu_seq_range *range; - struct lu_seq_range *tmp; - struct fld_thread_info *info; - int rc = 0; - int deleted = 0; - struct fld_cache_entry *flde; + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -193,7 +196,7 @@ int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, range_cpu_to_be(tmp, tmp); rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, - (struct dt_key *)&tmp->lsr_start, th, 1); + (struct dt_key *)&tmp->lsr_start, th); if (rc != 0) { CERROR("%s: insert range "DRANGE" failed: rc = %d\n", fld->lsf_name, PRANGE(new_range), rc); @@ -229,11 +232,11 @@ int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range) { - struct lu_seq_range *fld_rec; - struct fld_thread_info *info; - int rc; + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; - ENTRY; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); fld_rec = &info->fti_rec; @@ -245,12 +248,12 @@ int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, rc = 0; else rc = -ENOENT; - } + } CDEBUG(D_INFO, "%s: lookup seq = %#llx range : "DRANGE" rc = %d\n", - fld->lsf_name, seq, PRANGE(range), rc); + fld->lsf_name, seq, PRANGE(range), rc); - RETURN(rc); + RETURN(rc); } /** @@ -273,6 +276,7 @@ int fld_insert_entry(const struct lu_env *env, struct thandle *th; struct dt_device *dt = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); int rc; + ENTRY; LASSERT(mutex_is_locked(&fld->lsf_lock)); @@ -325,16 +329,18 @@ static int fld_insert_special_entries(const struct lu_env *env, int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, struct dt_device *dt, int type) { - struct dt_object *dt_obj = NULL; - struct lu_fid fid; - struct lu_attr *attr = NULL; - struct lu_seq_range *range = NULL; - struct fld_thread_info *info; - struct dt_object_format dof; - struct dt_it *it; - const struct dt_it_ops *iops; - int rc; - __u32 index; + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + u32 index; + int range_count = 0; + ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); @@ -342,7 +348,7 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, lu_local_obj_fid(&fid, FLD_INDEX_OID); OBD_ALLOC_PTR(attr); - if (attr == NULL) + if (!attr) RETURN(-ENOMEM); memset(attr, 0, sizeof(*attr)); @@ -388,26 +394,41 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, GOTO(out, rc = PTR_ERR(it)); rc = iops->load(env, it, 0); + if (rc > 0) + rc = 0; + else if (rc == 0) + rc = iops->next(env, it); + if (rc < 0) GOTO(out_it_fini, rc); - if (rc > 0) { - /* Load FLD entry into server cache */ - do { - rc = iops->rec(env, it, (struct dt_rec *)range, 0); - if (rc != 0) - GOTO(out_it_put, 
rc); - LASSERT(range != NULL); - range_be_to_cpu(range, range); + while (rc == 0) { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + + range_be_to_cpu(range, range); + + /* + * Newly created ldiskfs IAM indexes may include a + * zeroed-out key and record. Ignore it here. + */ + if (range->lsr_start < range->lsr_end) { rc = fld_cache_insert(fld->lsf_cache, range); if (rc != 0) GOTO(out_it_put, rc); - rc = iops->next(env, it); - } while (rc == 0); - } else { - fld->lsf_new = 1; + + range_count++; + } + + rc = iops->next(env, it); + if (rc < 0) + GOTO(out_it_fini, rc); } + if (range_count == 0) + fld->lsf_new = 1; + rc = fld_name_to_index(fld->lsf_name, &index); if (rc < 0) GOTO(out_it_put, rc); @@ -415,8 +436,10 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, rc = 0; if (index == 0 && type == LU_SEQ_RANGE_MDT) { - /* Note: fld_insert_entry will detect whether these - * special entries already exist inside FLDB */ + /* + * Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB + */ mutex_lock(&fld->lsf_lock); rc = fld_insert_special_entries(env, fld); mutex_unlock(&fld->lsf_lock); @@ -431,11 +454,11 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, out_it_fini: iops->fini(env, it); out: - if (attr != NULL) + if (attr) OBD_FREE_PTR(attr); if (rc < 0) { - if (dt_obj != NULL) + if (dt_obj) dt_object_put(env, dt_obj); fld->lsf_obj = NULL; } @@ -445,7 +468,7 @@ int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) { ENTRY; - if (fld->lsf_obj != NULL) { + if (fld->lsf_obj) { if (!IS_ERR(fld->lsf_obj)) dt_object_put(env, fld->lsf_obj); fld->lsf_obj = NULL; @@ -457,12 +480,12 @@ int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len) { struct lu_seq_range_array *lsra = data; - struct fld_thread_info *info; - struct dt_object *dt_obj = fld->lsf_obj; - struct lu_seq_range *entry; - struct dt_it *it; - const struct dt_it_ops *iops; - int rc; + struct fld_thread_info *info; + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *entry; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h index dcb24a3c2f22a..48337e0b6839b 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_internal.h +++ b/drivers/staging/lustrefsx/lustre/fld/fld_internal.h @@ -56,7 +56,6 @@ #define __FLD_INTERNAL_H #include -#include #include #include @@ -139,12 +138,6 @@ enum { extern struct lu_fld_hash fld_hash[]; - -#ifdef CONFIG_PROC_FS -extern struct proc_dir_entry *fld_type_proc_dir; -extern struct lprocfs_vars fld_client_proc_list[]; -#endif - # ifdef HAVE_SERVER_SUPPORT struct fld_thread_info { struct lu_seq_range fti_rec; @@ -172,16 +165,15 @@ int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, u64 seq, struct lu_seq_range *range); int fld_name_to_index(const char *name, __u32 *index); -int fld_server_mod_init(void); +int fld_server_mod_init(void); void fld_server_mod_exit(void); int fld_server_read(const struct lu_env *env, struct lu_server_fld *fld, struct lu_seq_range *range, void *data, int data_len); -#ifdef CONFIG_PROC_FS -extern const struct file_operations fld_proc_seq_fops; -extern struct lprocfs_vars fld_server_proc_list[]; -#endif + +extern const struct 
file_operations fld_debugfs_seq_fops; +extern struct dentry *fld_debugfs_dir; # endif /* HAVE_SERVER_SUPPORT */ @@ -189,6 +181,8 @@ int fld_client_rpc(struct obd_export *exp, struct lu_seq_range *range, __u32 fld_op, struct ptlrpc_request **reqp); +extern struct ldebugfs_vars fld_client_debugfs_list[]; + struct fld_cache *fld_cache_init(const char *name, int cache_size, int cache_threshold); diff --git a/drivers/staging/lustrefsx/lustre/fld/fld_request.c b/drivers/staging/lustrefsx/lustre/fld/fld_request.c index 19b5789c19851..3dd616e0a6e94 100644 --- a/drivers/staging/lustrefsx/lustre/fld/fld_request.c +++ b/drivers/staging/lustrefsx/lustre/fld/fld_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -60,15 +60,18 @@ static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) static struct lu_fld_target * fld_rrb_scan(struct lu_client_fld *fld, u64 seq) { - struct lu_fld_target *target; - int hash; - ENTRY; + struct lu_fld_target *target; + int hash; + + ENTRY; - /* Because almost all of special sequence located in MDT0, + /* + * Because almost all of special sequence located in MDT0, * it should go to index 0 directly, instead of calculating * hash again, and also if other MDTs is not being connected, * the fld lookup requests(for seq on MDT0) should not be - * blocked because of other MDTs */ + * blocked because of other MDTs + */ if (fid_seq_is_norm(seq)) hash = fld_rrb_hash(fld, seq); else @@ -76,57 +79,59 @@ fld_rrb_scan(struct lu_client_fld *fld, u64 seq) again: list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - if (target->ft_idx == hash) - RETURN(target); - } + if (target->ft_idx == hash) + RETURN(target); + } if (hash != 0) { - /* It is possible the remote target(MDT) are not connected to + /* + * It is possible the remote target(MDT) are not connected to * with client yet, so we will refer this to MDT0, which should - * be connected during mount */ + * be connected during mount + */ hash = 0; goto again; } - CERROR("%s: Can't find target by hash %d (seq %#llx). " - "Targets (%d):\n", fld->lcf_name, hash, seq, - fld->lcf_count); + CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", + fld->lcf_name, hash, seq, fld->lcf_count); list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - const char *srv_name = target->ft_srv != NULL ? - target->ft_srv->lsf_name : ""; - const char *exp_name = target->ft_exp != NULL ? - (char *)target->ft_exp->exp_obd->obd_uuid.uuid : - ""; + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", - target->ft_exp, exp_name, target->ft_srv, - srv_name, target->ft_idx); - } - - /* - * If target is not found, there is logical error anyway, so here is - * LBUG() to catch this situation. - */ - LBUG(); - RETURN(NULL); + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. 
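The fld_request.c hunk above mostly re-indents fld_rrb_scan() and reflows its comments and error message, but the logic it preserves is worth restating: a normal sequence is hashed onto a target index with do_div(), the target list is scanned for that index, and if the hashed MDT is not attached yet the lookup falls back to index 0 (MDT0), which is connected at mount time; finding nothing even there is treated as a fatal logic error. The compact sketch below restates that fallback with invented demo_* names and the stock list API; it is illustrative only.

/*
 * Sketch of the round-robin target selection with fallback to index 0
 * that fld_rrb_scan() implements; not part of the patch.
 */
#include <linux/list.h>
#include <linux/types.h>
#include <asm/div64.h>

struct demo_target {
	struct list_head	chain;
	u32			idx;
};

static u32 demo_rrb_hash(u64 seq, u32 count)
{
	/* 64-bit modulo via do_div() so the sketch also builds on 32-bit */
	return count ? do_div(seq, count) : 0;
}

static struct demo_target *
demo_scan(struct list_head *targets, u64 seq, u32 count)
{
	struct demo_target *t;
	u32 hash = demo_rrb_hash(seq, count);

again:
	list_for_each_entry(t, targets, chain) {
		if (t->idx == hash)
			return t;
	}

	if (hash != 0) {
		/* hashed target not attached yet: retry against index 0 */
		hash = 0;
		goto again;
	}

	return NULL;	/* the real code hits LBUG() here */
}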
+ */ + LBUG(); + RETURN(NULL); } struct lu_fld_hash fld_hash[] = { - { - .fh_name = "RRB", - .fh_hash_func = fld_rrb_hash, - .fh_scan_func = fld_rrb_scan - }, - { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { NULL, - } + } }; static struct lu_fld_target * fld_client_get_target(struct lu_client_fld *fld, u64 seq) { struct lu_fld_target *target; + ENTRY; LASSERT(fld->lcf_hash != NULL); @@ -135,13 +140,12 @@ fld_client_get_target(struct lu_client_fld *fld, u64 seq) target = fld->lcf_hash->fh_scan_func(fld, seq); spin_unlock(&fld->lcf_lock); - if (target != NULL) { - CDEBUG(D_INFO, "%s: Found target (idx %llu" - ") by seq %#llx\n", fld->lcf_name, - target->ft_idx, seq); - } + if (target) { + CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", + fld->lcf_name, target->ft_idx, seq); + } - RETURN(target); + RETURN(target); } /* @@ -149,44 +153,45 @@ fld_client_get_target(struct lu_client_fld *fld, u64 seq) * of FLD module. */ int fld_client_add_target(struct lu_client_fld *fld, - struct lu_fld_target *tar) + struct lu_fld_target *tar) { const char *name; - struct lu_fld_target *target, *tmp; - ENTRY; + struct lu_fld_target *target, *tmp; - LASSERT(tar != NULL); + ENTRY; + + LASSERT(tar != NULL); name = fld_target_name(tar); - LASSERT(name != NULL); - LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", fld->lcf_name, name, tar->ft_idx); - OBD_ALLOC_PTR(target); - if (target == NULL) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(target); + if (!target) + RETURN(-ENOMEM); spin_lock(&fld->lcf_lock); list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { if (tmp->ft_idx == tar->ft_idx) { spin_unlock(&fld->lcf_lock); - OBD_FREE_PTR(target); + OBD_FREE_PTR(target); CERROR("Target %s exists in FLD and known as %s:#%llu\n", - name, fld_target_name(tmp), tmp->ft_idx); - RETURN(-EEXIST); - } - } + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } - target->ft_exp = tar->ft_exp; - if (target->ft_exp != NULL) - class_export_get(target->ft_exp); - target->ft_srv = tar->ft_srv; - target->ft_idx = tar->ft_idx; + target->ft_exp = tar->ft_exp; + if (target->ft_exp) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; list_add_tail(&target->ft_chain, &fld->lcf_targets); - fld->lcf_count++; + fld->lcf_count++; spin_unlock(&fld->lcf_lock); RETURN(0); @@ -194,9 +199,10 @@ int fld_client_add_target(struct lu_client_fld *fld, EXPORT_SYMBOL(fld_client_add_target); /* Remove export from FLD */ -int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) +int fld_client_del_target(struct lu_client_fld *fld, u64 idx) { struct lu_fld_target *target, *tmp; + ENTRY; spin_lock(&fld->lcf_lock); @@ -206,182 +212,161 @@ int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) list_del(&target->ft_chain); spin_unlock(&fld->lcf_lock); - if (target->ft_exp != NULL) - class_export_put(target->ft_exp); + if (target->ft_exp) + class_export_put(target->ft_exp); - OBD_FREE_PTR(target); - RETURN(0); - } - } + OBD_FREE_PTR(target); + RETURN(0); + } + } spin_unlock(&fld->lcf_lock); RETURN(-ENOENT); } -#ifdef CONFIG_PROC_FS -static int fld_client_proc_init(struct lu_client_fld *fld) +struct dentry *fld_debugfs_dir; + +static int fld_client_debugfs_init(struct lu_client_fld *fld) { int rc; - ENTRY; - fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, fld_type_proc_dir, - NULL, NULL); - if 
(IS_ERR(fld->lcf_proc_dir)) { - CERROR("%s: LProcFS failed in fld-init\n", - fld->lcf_name); - rc = PTR_ERR(fld->lcf_proc_dir); + ENTRY; + fld->lcf_debugfs_entry = ldebugfs_register(fld->lcf_name, + fld_debugfs_dir, + fld_client_debugfs_list, + fld); + if (IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) { + CERROR("%s: LdebugFS failed in fld-init\n", fld->lcf_name); + rc = fld->lcf_debugfs_entry ? PTR_ERR(fld->lcf_debugfs_entry) + : -ENOMEM; + fld->lcf_debugfs_entry = NULL; RETURN(rc); } - rc = lprocfs_add_vars(fld->lcf_proc_dir, fld_client_proc_list, fld); - if (rc) { - CERROR("%s: Can't init FLD proc, rc %d\n", - fld->lcf_name, rc); - GOTO(out_cleanup, rc); - } - - RETURN(0); - -out_cleanup: - fld_client_proc_fini(fld); - return rc; -} - -void fld_client_proc_fini(struct lu_client_fld *fld) -{ - ENTRY; - if (fld->lcf_proc_dir) { - if (!IS_ERR(fld->lcf_proc_dir)) - lprocfs_remove(&fld->lcf_proc_dir); - fld->lcf_proc_dir = NULL; - } - EXIT; -} -#else /* !CONFIG_PROC_FS */ -static int fld_client_proc_init(struct lu_client_fld *fld) -{ - return 0; + return 0; } -void fld_client_proc_fini(struct lu_client_fld *fld) +void fld_client_debugfs_fini(struct lu_client_fld *fld) { - return; + if (!IS_ERR_OR_NULL(fld->lcf_debugfs_entry)) + ldebugfs_remove(&fld->lcf_debugfs_entry); } -#endif /* CONFIG_PROC_FS */ - -EXPORT_SYMBOL(fld_client_proc_fini); +EXPORT_SYMBOL(fld_client_debugfs_fini); static inline int hash_is_sane(int hash) { - return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); } int fld_client_init(struct lu_client_fld *fld, - const char *prefix, int hash) + const char *prefix, int hash) { - int cache_size, cache_threshold; - int rc; - ENTRY; - - LASSERT(fld != NULL); + int cache_size, cache_threshold; + int rc; - snprintf(fld->lcf_name, sizeof(fld->lcf_name), - "cli-%s", prefix); + ENTRY; + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); - if (!hash_is_sane(hash)) { - CERROR("%s: Wrong hash function %#x\n", - fld->lcf_name, hash); - RETURN(-EINVAL); - } + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } fld->lcf_count = 0; spin_lock_init(&fld->lcf_lock); fld->lcf_hash = &fld_hash[hash]; INIT_LIST_HEAD(&fld->lcf_targets); - cache_size = FLD_CLIENT_CACHE_SIZE / - sizeof(struct fld_cache_entry); + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); - cache_threshold = cache_size * - FLD_CLIENT_CACHE_THRESHOLD / 100; + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; - fld->lcf_cache = fld_cache_init(fld->lcf_name, - cache_size, cache_threshold); - if (IS_ERR(fld->lcf_cache)) { - rc = PTR_ERR(fld->lcf_cache); - fld->lcf_cache = NULL; - GOTO(out, rc); - } + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } - rc = fld_client_proc_init(fld); - if (rc) - GOTO(out, rc); - EXIT; + rc = fld_client_debugfs_init(fld); + if (rc) + GOTO(out, rc); + EXIT; out: - if (rc) - fld_client_fini(fld); - else - CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", - fld->lcf_name, fld->lcf_hash->fh_name); - return rc; + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; } EXPORT_SYMBOL(fld_client_init); void fld_client_fini(struct lu_client_fld *fld) { struct lu_fld_target *target, *tmp; + ENTRY; spin_lock(&fld->lcf_lock); 
list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { - fld->lcf_count--; + fld->lcf_count--; list_del(&target->ft_chain); - if (target->ft_exp != NULL) - class_export_put(target->ft_exp); - OBD_FREE_PTR(target); - } + if (target->ft_exp) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } spin_unlock(&fld->lcf_lock); - if (fld->lcf_cache != NULL) { - if (!IS_ERR(fld->lcf_cache)) - fld_cache_fini(fld->lcf_cache); - fld->lcf_cache = NULL; - } + if (fld->lcf_cache) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } - EXIT; + EXIT; } EXPORT_SYMBOL(fld_client_fini); int fld_client_rpc(struct obd_export *exp, - struct lu_seq_range *range, __u32 fld_op, + struct lu_seq_range *range, u32 fld_op, struct ptlrpc_request **reqp) { struct ptlrpc_request *req = NULL; - struct lu_seq_range *prange; - __u32 *op; - int rc = 0; - struct obd_import *imp; + struct lu_seq_range *prange; + u32 *op; + int rc = 0; + struct obd_import *imp; + ENTRY; LASSERT(exp != NULL); -again: imp = class_exp2cliimp(exp); switch (fld_op) { case FLD_QUERY: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION, FLD_QUERY); - if (req == NULL) + if (!req) RETURN(-ENOMEM); - /* XXX: only needed when talking to old server(< 2.6), it should - * be removed when < 2.6 server is not supported */ + /* + * XXX: only needed when talking to old server(< 2.6), it should + * be removed when < 2.6 server is not supported + */ op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); *op = FLD_LOOKUP; - /* For MDS_MDS seq lookup, it will always use LWP connection, + /* + * For MDS_MDS seq lookup, it will always use LWP connection, * but LWP will be evicted after restart, so cause the error. * so we will set no_delay for seq lookup request, once the - * request fails because of the eviction. always retry here */ + * request fails because of the eviction. always retry here + */ if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { req->rq_allow_replay = 1; req->rq_no_delay = 1; @@ -390,7 +375,7 @@ int fld_client_rpc(struct obd_export *exp, case FLD_READ: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, LUSTRE_MDS_VERSION, FLD_READ); - if (req == NULL) + if (!req) RETURN(-ENOMEM); req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, @@ -407,13 +392,19 @@ int fld_client_rpc(struct obd_export *exp, prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); *prange = *range; ptlrpc_request_set_replen(req); - req->rq_request_portal = FLD_REQUEST_PORTAL; + req->rq_request_portal = FLD_REQUEST_PORTAL; req->rq_reply_portal = MDC_REPLY_PORTAL; - ptlrpc_at_set_req_timeout(req); - - obd_get_request_slot(&exp->exp_obd->u.cli); - rc = ptlrpc_queue_wait(req); - obd_put_request_slot(&exp->exp_obd->u.cli); + ptlrpc_at_set_req_timeout(req); + + if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) { + /* the same error returned by ptlrpc_import_delay_req */ + rc = -EWOULDBLOCK; + req->rq_status = rc; + } else { + obd_get_request_slot(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + obd_put_request_slot(&exp->exp_obd->u.cli); + } if (rc == -ENOENT) { /* Don't loop forever on non-existing FID sequences. 
*/ @@ -426,14 +417,11 @@ int fld_client_rpc(struct obd_export *exp, imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && rc != -ENOTSUPP) { - /* Since LWP is not replayable, so it will keep - * trying unless umount happens or the remote - * target does not support the operation, otherwise - * it would cause unecessary failure of the - * application. */ - ptlrpc_req_finished(req); - rc = 0; - goto again; + /* + * Since LWP is not replayable, so notify the caller + * to retry if needed after a while. + */ + rc = -EAGAIN; } GOTO(out_req, rc); } @@ -441,31 +429,32 @@ int fld_client_rpc(struct obd_export *exp, if (fld_op == FLD_QUERY) { prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); - if (prange == NULL) + if (!prange) GOTO(out_req, rc = -EFAULT); *range = *prange; } EXIT; out_req: - if (rc != 0 || reqp == NULL) { + if (rc != 0 || !reqp) { ptlrpc_req_finished(req); req = NULL; } - if (reqp != NULL) + if (reqp) *reqp = req; return rc; } int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, - __u32 flags, const struct lu_env *env) + u32 flags, const struct lu_env *env) { struct lu_seq_range res = { 0 }; struct lu_fld_target *target; struct lu_fld_target *origin; int rc; + ENTRY; rc = fld_cache_lookup(fld->lcf_cache, seq, &res); @@ -474,20 +463,19 @@ int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, RETURN(0); } - /* Can not find it in the cache */ - target = fld_client_get_target(fld, seq); - LASSERT(target != NULL); + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); origin = target; again: - CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on " - "target %s (idx %llu)\n", fld->lcf_name, seq, - fld_target_name(target), target->ft_idx); + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", + fld->lcf_name, seq, fld_target_name(target), target->ft_idx); res.lsr_start = seq; fld_range_set_type(&res, flags); #ifdef HAVE_SERVER_SUPPORT - if (target->ft_srv != NULL) { + if (target->ft_srv) { LASSERT(env != NULL); rc = fld_server_lookup(env, target->ft_srv, seq, &res); } else @@ -497,15 +485,17 @@ int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, } if (rc == -ESHUTDOWN) { - /* If fld lookup failed because the target has been shutdown, + /* + * If fld lookup failed because the target has been shutdown, * then try next target in the list, until trying all targets - * or fld lookup succeeds */ + * or fld lookup succeeds + */ spin_lock(&fld->lcf_lock); - - /* If the next entry in the list is the head of the list, + /* + * If the next entry in the list is the head of the list, * move to the next entry after the head and retrieve - * the target. Else retreive the next target entry. */ - + * the target. Else retreive the next target entry. 
+ */ if (target->ft_chain.next == &fld->lcf_targets) target = list_entry(target->ft_chain.next->next, struct lu_fld_target, ft_chain); @@ -528,25 +518,23 @@ EXPORT_SYMBOL(fld_client_lookup); void fld_client_flush(struct lu_client_fld *fld) { - fld_cache_flush(fld->lcf_cache); + fld_cache_flush(fld->lcf_cache); } - -struct proc_dir_entry *fld_type_proc_dir; - static int __init fld_init(void) { - fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, - proc_lustre_root, - NULL, NULL); - if (IS_ERR(fld_type_proc_dir)) - return PTR_ERR(fld_type_proc_dir); - #ifdef HAVE_SERVER_SUPPORT - fld_server_mod_init(); + int rc; + + rc = fld_server_mod_init(); + if (rc) + return rc; #endif /* HAVE_SERVER_SUPPORT */ - return 0; + fld_debugfs_dir = ldebugfs_register(LUSTRE_FLD_NAME, + debugfs_lustre_root, + NULL, NULL); + return PTR_ERR_OR_ZERO(fld_debugfs_dir); } static void __exit fld_exit(void) @@ -555,10 +543,8 @@ static void __exit fld_exit(void) fld_server_mod_exit(); #endif /* HAVE_SERVER_SUPPORT */ - if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { - lprocfs_remove(&fld_type_proc_dir); - fld_type_proc_dir = NULL; - } + if (!IS_ERR_OR_NULL(fld_debugfs_dir)) + ldebugfs_remove(&fld_debugfs_dir); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c index 926ed5598052b..a555889f57730 100644 --- a/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c +++ b/drivers/staging/lustrefsx/lustre/fld/lproc_fld.c @@ -41,37 +41,36 @@ #include #include + +#ifdef HAVE_SERVER_SUPPORT #include +#endif #include #include #include #include "fld_internal.h" -#ifdef CONFIG_PROC_FS static int -fld_proc_targets_seq_show(struct seq_file *m, void *unused) +fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) { struct lu_client_fld *fld = (struct lu_client_fld *)m->private; struct lu_fld_target *target; - ENTRY; - - LASSERT(fld != NULL); + ENTRY; spin_lock(&fld->lcf_lock); list_for_each_entry(target, &fld->lcf_targets, ft_chain) seq_printf(m, "%s\n", fld_target_name(target)); spin_unlock(&fld->lcf_lock); + RETURN(0); } static int -fld_proc_hash_seq_show(struct seq_file *m, void *unused) +fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) { struct lu_client_fld *fld = (struct lu_client_fld *)m->private; - ENTRY; - - LASSERT(fld != NULL); + ENTRY; spin_lock(&fld->lcf_lock); seq_printf(m, "%s\n", fld->lcf_hash->fh_name); spin_unlock(&fld->lcf_lock); @@ -80,7 +79,7 @@ fld_proc_hash_seq_show(struct seq_file *m, void *unused) } static ssize_t -fld_proc_hash_seq_write(struct file *file, const char __user *buffer, +fld_debugfs_hash_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { struct lu_client_fld *fld; @@ -91,13 +90,12 @@ fld_proc_hash_seq_write(struct file *file, const char __user *buffer, if (count > sizeof(fh_name)) return -ENAMETOOLONG; - if (lprocfs_copy_from_user(file, fh_name, buffer, count) != 0) + if (copy_from_user(fh_name, buffer, count) != 0) return -EFAULT; fld = ((struct seq_file *)file->private_data)->private; - LASSERT(fld != NULL); - for (i = 0; fld_hash[i].fh_name != NULL; i++) { + for (i = 0; fld_hash[i].fh_name; i++) { if (count != strlen(fld_hash[i].fh_name)) continue; @@ -107,7 +105,7 @@ fld_proc_hash_seq_write(struct file *file, const char __user *buffer, } } - if (hash != NULL) { + if (hash) { spin_lock(&fld->lcf_lock); fld->lcf_hash = hash; spin_unlock(&fld->lcf_lock); @@ -119,15 +117,14 @@ fld_proc_hash_seq_write(struct file *file, const char __user 
*buffer, return count; } -static ssize_t -lprocfs_cache_flush_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *pos) +static ssize_t ldebugfs_cache_flush_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *pos) { - struct lu_client_fld *fld = ((struct seq_file *)file->private_data)->private; - ENTRY; - - LASSERT(fld != NULL); + struct seq_file *m = file->private_data; + struct lu_client_fld *fld = m->private; + ENTRY; fld_cache_flush(fld->lcf_cache); CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); @@ -135,15 +132,15 @@ lprocfs_cache_flush_seq_write(struct file *file, const char __user *buffer, RETURN(count); } -LPROC_SEQ_FOPS_RO(fld_proc_targets); -LPROC_SEQ_FOPS(fld_proc_hash); -LPROC_SEQ_FOPS_WO_TYPE(fld, cache_flush); +LDEBUGFS_SEQ_FOPS_RO(fld_debugfs_targets); +LDEBUGFS_SEQ_FOPS(fld_debugfs_hash); +LDEBUGFS_FOPS_WR_ONLY(fld, cache_flush); -struct lprocfs_vars fld_client_proc_list[] = { +struct ldebugfs_vars fld_client_debugfs_list[] = { { .name = "targets", - .fops = &fld_proc_targets_fops }, + .fops = &fld_debugfs_targets_fops }, { .name = "hash", - .fops = &fld_proc_hash_fops }, + .fops = &fld_debugfs_hash_fops }, { .name = "cache_flush", .fops = &fld_cache_flush_fops }, { NULL } @@ -275,17 +272,13 @@ struct seq_operations fldb_sops = { static int fldb_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; - struct lu_server_fld *fld = (struct lu_server_fld *)PDE_DATA(inode); + struct lu_server_fld *fld = inode->i_private; struct dt_object *obj; const struct dt_it_ops *iops; struct fld_seq_param *param = NULL; int env_init = 0; int rc; - rc = LPROCFS_ENTRY_CHECK(inode); - if (rc < 0) - return rc; - rc = seq_open(file, &fldb_sops); if (rc) GOTO(out, rc); @@ -355,17 +348,11 @@ static int fldb_seq_release(struct inode *inode, struct file *file) return 0; } -const struct file_operations fld_proc_seq_fops = { +const struct file_operations fld_debugfs_seq_fops = { .owner = THIS_MODULE, .open = fldb_seq_open, .read = seq_read, .release = fldb_seq_release, }; -struct lprocfs_vars fld_server_proc_list[] = { - { NULL } -}; - # endif /* HAVE_SERVER_SUPPORT */ - -#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/include/cl_object.h b/drivers/staging/lustrefsx/lustre/include/cl_object.h index 78d09269a33c9..f0c8a5b4bfda0 100644 --- a/drivers/staging/lustrefsx/lustre/include/cl_object.h +++ b/drivers/staging/lustrefsx/lustre/include/cl_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -88,14 +88,17 @@ /* * super-class definitions. */ +#include +#include + #include -#include #include #include #include #include #include #include +#include #include struct obd_info; @@ -118,8 +121,6 @@ struct cl_io_slice; struct cl_req_attr; -extern struct cfs_ptask_engine *cl_io_engine; - /** * Device in the client stack. * @@ -415,6 +416,13 @@ struct cl_object_operations { void (*coo_req_attr_set)(const struct lu_env *env, struct cl_object *obj, struct cl_req_attr *attr); + /** + * Flush \a obj data corresponding to \a lock. Used for DoM + * locks in llite's cancelling blocking ast callback. 
+ */ + int (*coo_object_flush)(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock); }; /** @@ -703,7 +711,7 @@ enum cl_page_type { /** Transient page, the transient cl_page is used to bind a cl_page * to vmpage which is not belonging to the same object of cl_page. - * it is used in DirectIO, lockless IO and liblustre. */ + * it is used in DirectIO and lockless IO. */ CPT_TRANSIENT, }; @@ -864,6 +872,13 @@ struct cl_page_operations { */ int (*cpo_is_vmlocked)(const struct lu_env *env, const struct cl_page_slice *slice); + + /** + * Update file attributes when all we have is this page. Used for tiny + * writes to update attributes when we don't have a full cl_io. + */ + void (*cpo_page_touch)(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to); /** * Page destruction. */ @@ -888,7 +903,8 @@ struct cl_page_operations { const struct cl_page_slice *slice); /** Destructor. Frees resources and slice itself. */ void (*cpo_fini)(const struct lu_env *env, - struct cl_page_slice *slice); + struct cl_page_slice *slice, + struct pagevec *pvec); /** * Optional debugging helper. Prints given page slice. * @@ -1071,15 +1087,13 @@ static inline bool __page_in_use(const struct cl_page *page, int refc) * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. * - * Typical cl_lock consists of the two layers: + * Typical cl_lock consists of one layer: * - * - vvp_lock (vvp specific data), and * - lov_lock (lov specific data). * * lov_lock contains an array of sub-locks. Each of these sub-locks is a * normal cl_lock: it has a header (struct cl_lock) and a list of layers: * - * - lovsub_lock, and * - osc_lock * * Each sub-lock is associated with a cl_object (representing stripe @@ -1199,7 +1213,7 @@ struct cl_lock { /** * Per-layer part of cl_lock * - * \see vvp_lock, lov_lock, lovsub_lock, osc_lock + * \see lov_lock, osc_lock */ struct cl_lock_slice { struct cl_lock *cls_lock; @@ -1213,7 +1227,7 @@ struct cl_lock_slice { /** * - * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + * \see lov_lock_ops, osc_lock_ops */ struct cl_lock_operations { /** @{ */ @@ -1225,8 +1239,7 @@ struct cl_lock_operations { * @anchor for resources * \retval -ve failure * - * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), - * \see osc_lock_enqueue() + * \see lov_lock_enqueue(), osc_lock_enqueue() */ int (*clo_enqueue)(const struct lu_env *env, const struct cl_lock_slice *slice, @@ -1241,8 +1254,7 @@ struct cl_lock_operations { /** * Destructor. Frees resources and the slice. * - * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), - * \see osc_lock_fini() + * \see lov_lock_fini(), osc_lock_fini() */ void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); /** @@ -1297,7 +1309,7 @@ struct cl_page_list { struct task_struct *pl_owner; }; -/** +/** * A 2-queue of pages. A convenience data-type for common use case, 2-queue * contains an incoming page list and an outgoing page list. */ @@ -1378,6 +1390,10 @@ enum cl_io_type { * To write out a range of file */ CIT_FSYNC, + /** + * glimpse. An io context to acquire glimpse lock. + */ + CIT_GLIMPSE, /** * Miscellaneous io. This is used for occasional io activity that * doesn't fit into other types. Currently this is used for: @@ -1389,8 +1405,6 @@ enum cl_io_type { * - VM induced page write-out. An io context for writing page out * for memory cleansing; * - * - glimpse. An io context to acquire glimpse lock. 
- * * - grouplock. An io context to acquire group lock. * * CIT_MISC io is used simply as a context in which locks and pages @@ -1607,25 +1621,30 @@ enum cl_enq_flags { * -EWOULDBLOCK is returned immediately. */ CEF_NONBLOCK = 0x00000001, - /** - * take lock asynchronously (out of order), as it cannot - * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. - */ - CEF_ASYNC = 0x00000002, + /** + * Tell lower layers this is a glimpse request, translated to + * LDLM_FL_HAS_INTENT at LDLM layer. + * + * Also, because glimpse locks never block other locks, we count this + * as automatically compatible with other osc locks. + * (see osc_lock_compatible) + */ + CEF_GLIMPSE = 0x00000002, /** * tell the server to instruct (though a flag in the blocking ast) an * owner of the conflicting lock, that it can drop dirty pages * protected by this lock, without sending them to the server. */ CEF_DISCARD_DATA = 0x00000004, - /** - * tell the sub layers that it must be a `real' lock. This is used for - * mmapped-buffer locks and glimpse locks that must be never converted - * into lockless mode. - * - * \see vvp_mmap_locks(), cl_glimpse_lock(). - */ - CEF_MUST = 0x00000008, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks, glimpse locks, manually requested locks + * (LU_LADVISE_LOCKAHEAD) that must never be converted into lockless + * mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock, cl_request_lock(). + */ + CEF_MUST = 0x00000008, /** * tell the sub layers that never request a `real' lock. This flag is * not used currently. @@ -1638,9 +1657,16 @@ enum cl_enq_flags { */ CEF_NEVER = 0x00000010, /** - * for async glimpse lock. + * tell the dlm layer this is a speculative lock request + * speculative lock requests are locks which are not requested as part + * of an I/O operation. Instead, they are requested because we expect + * to use them in the future. They are requested asynchronously at the + * ptlrpc layer. + * + * Currently used for asynchronous glimpse locks and manually requested + * locks (LU_LADVISE_LOCKAHEAD). */ - CEF_AGL = 0x00000020, + CEF_SPECULATIVE = 0x00000020, /** * enqueue a lock to test DLM lock existence. */ @@ -1650,10 +1676,14 @@ enum cl_enq_flags { * is known to exist. */ CEF_LOCK_MATCH = 0x00000080, + /** + * tell the DLM layer to lock only the requested range + */ + CEF_LOCK_NO_EXPAND = 0x00000100, /** * mask of enq_flags. */ - CEF_MASK = 0x000000ff, + CEF_MASK = 0x000001ff, }; /** @@ -1731,21 +1761,10 @@ enum cl_fsync_mode { CL_FSYNC_ALL = 3 }; -struct cl_io_range { - loff_t cir_pos; - size_t cir_count; -}; - -struct cl_io_pt { - struct cl_io_pt *cip_next; - struct cfs_ptask cip_task; - struct kiocb cip_iocb; - struct iov_iter cip_iter; - struct file *cip_file; - enum cl_io_type cip_iot; - loff_t cip_pos; - size_t cip_count; - ssize_t cip_result; +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; }; /** @@ -1775,27 +1794,30 @@ struct cl_io { struct cl_lockset ci_lockset; /** lock requirements, this is just a help info for sublayers. 
*/ enum cl_io_lock_dmd ci_lockreq; - union { - struct cl_rw_io { - struct iov_iter rw_iter; - struct kiocb rw_iocb; - struct cl_io_range rw_range; - struct file *rw_file; - unsigned int rw_nonblock:1, - rw_append:1, - rw_sync:1; - int (*rw_ptask)(struct cfs_ptask *ptask); - } ci_rw; + /** layout version when this IO occurs */ + __u32 ci_layout_version; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; struct cl_setattr_io { struct ost_lvb sa_attr; unsigned int sa_attr_flags; - unsigned int sa_valid; + unsigned int sa_avalid; /* ATTR_* */ + unsigned int sa_xvalid; /* OP_XVALID */ int sa_stripe_index; struct ost_layout sa_layout; const struct lu_fid *sa_parent_fid; } ci_setattr; struct cl_data_version_io { u64 dv_data_version; + u32 dv_layout_version; int dv_flags; } ci_data_version; struct cl_fault_io { @@ -1850,8 +1872,10 @@ struct cl_io { */ ci_ignore_layout:1, /** - * Need MDS intervention to complete a write. This usually means the - * corresponding component is not initialized for the writing extent. + * Need MDS intervention to complete a write. + * Write intent is required for the following cases: + * 1. component being written is not initialized, or + * 2. the mirrored files are NOT in WRITE_PENDING state. */ ci_need_write_intent:1, /** @@ -1870,12 +1894,43 @@ struct cl_io { * O_NOATIME */ ci_noatime:1, - /** Set to 1 if parallel execution is allowed for current I/O? */ - ci_pio:1; + /* Tell sublayers not to expand LDLM locks requested for this IO */ + ci_lock_no_expand:1, + /** + * Set if non-delay RPC should be used for this IO. + * + * If this file has multiple mirrors, and if the OSTs of the current + * mirror is inaccessible, non-delay RPC would error out quickly so + * that the upper layer can try to access the next mirror. + */ + ci_ndelay:1, + /** + * Set if we've tried all mirrors for this read IO, if it's not set, + * the read IO will check to-be-read OSCs' status, and make fast-switch + * another mirror if some of the OSTs are not healthy. + */ + ci_tried_all_mirrors:1; + /** + * Bypass quota check + */ + unsigned ci_noquota:1; + /** + * How many times the read has retried before this one. + * Set by the top level and consumed by the LOV. + */ + unsigned ci_ndelay_tried; + /** + * Designated mirror index for this I/O. + */ + unsigned ci_designated_mirror; /** * Number of pages owned by this IO. For invariant checking. */ unsigned ci_owned_nr; + /** + * Range of write intent. Valid if ci_need_write_intent is set. + */ + struct lu_extent ci_write_intent; }; /** @} cl_io */ @@ -2058,6 +2113,9 @@ int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, struct cl_layout *cl); loff_t cl_object_maxbytes(struct cl_object *obj); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); + /** * Returns true, iff \a o0 and \a o1 are slices of the same object. 
@@ -2112,6 +2170,9 @@ struct cl_page *cl_page_alloc (const struct lu_env *env, void cl_page_get (struct cl_page *page); void cl_page_put (const struct lu_env *env, struct cl_page *page); +void cl_pagevec_put (const struct lu_env *env, + struct cl_page *page, + struct pagevec *pvec); void cl_page_print (const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_page *pg); @@ -2179,6 +2240,8 @@ void cl_page_discard(const struct lu_env *env, struct cl_io *io, void cl_page_delete(const struct lu_env *env, struct cl_page *pg); int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to); void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate); loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); @@ -2306,12 +2369,12 @@ int cl_io_cancel (const struct lu_env *env, struct cl_io *io, */ static inline int cl_io_is_append(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_append; + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; } static inline int cl_io_is_sync_write(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_sync; + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; } static inline int cl_io_is_mkwrite(const struct cl_io *io) @@ -2324,8 +2387,8 @@ static inline int cl_io_is_mkwrite(const struct cl_io *io) */ static inline int cl_io_is_trunc(const struct cl_io *io) { - return io->ci_type == CIT_SETATTR && - (io->u.ci_setattr.sa_valid & ATTR_SIZE); + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_avalid & ATTR_SIZE); } struct cl_io *cl_io_top(struct cl_io *io); @@ -2333,13 +2396,12 @@ struct cl_io *cl_io_top(struct cl_io *io); void cl_io_print(const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_io *io); -#define CL_IO_SLICE_CLEAN(foo_io, base) \ -do { \ - typeof(foo_io) __foo_io = (foo_io); \ - \ - CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ - memset(&__foo_io->base + 1, 0, \ - (sizeof *__foo_io) - sizeof __foo_io->base); \ +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + memset(&__foo_io->base, 0, \ + sizeof(*__foo_io) - offsetof(typeof(*__foo_io), base)); \ } while (0) /** @} cl_io */ diff --git a/drivers/staging/lustrefsx/lustre/include/dt_object.h b/drivers/staging/lustrefsx/lustre/include/dt_object.h index e872981b5284e..f16895ddafba6 100644 --- a/drivers/staging/lustrefsx/lustre/include/dt_object.h +++ b/drivers/staging/lustrefsx/lustre/include/dt_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -86,6 +86,8 @@ struct dt_device_param { * calculation */ unsigned int ddp_extent_tax; unsigned int ddp_brw_size; /* optimal RPC size */ + /* T10PI checksum type, zero if not supported */ + enum cksum_types ddp_t10_cksum_type; }; /** @@ -251,6 +253,13 @@ struct dt_device_operations { const struct dt_device *dev, struct dt_device_param *param); + /** + * Return device's super block. + * + * \param[in] dev dt device + */ + struct super_block *(*dt_mnt_sb_get)(const struct dt_device *dev); + /** * Sync the device. 
* @@ -369,6 +378,9 @@ struct dt_allocation_hint { const void *dah_eadata; int dah_eadata_len; __u32 dah_mode; + int dah_append_stripes; + bool dah_can_block; + char *dah_append_pool; }; /** @@ -416,6 +428,8 @@ typedef __u64 dt_obj_version_t; union ldlm_policy_data; +struct md_layout_change; + /** * A dt_object provides common operations to create and destroy * objects and to manage regular and extended attributes. @@ -1040,8 +1054,7 @@ struct dt_object_operations { */ int (*do_declare_layout_change)(const struct lu_env *env, struct dt_object *dt, - struct layout_intent *layout, - const struct lu_buf *buf, + struct md_layout_change *mlc, struct thandle *th); /** @@ -1057,8 +1070,8 @@ struct dt_object_operations { * \retval -ne error code */ int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt, - struct layout_intent *layout, - const struct lu_buf *buf, struct thandle *th); + struct md_layout_change *mlc, + struct thandle *th); }; enum dt_bufs_type { @@ -1136,7 +1149,6 @@ struct dt_body_operations { * \param[in] pos position in the object to start * \param[out] pos \a pos + bytes written * \param[in] th transaction handle - * \param[in] ignore unused (was used to request quota ignorance) * * \retval positive bytes written on success * \retval negative negated errno on error @@ -1145,8 +1157,7 @@ struct dt_body_operations { struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *th, - int ignore); + struct thandle *th); /** * Return buffers for data. @@ -1175,6 +1186,7 @@ struct dt_body_operations { * \param[in] pos position in the object to start * \param[in] len size of region in bytes * \param[out] lb array of descriptors to fill + * \param[in] maxlnb max slots in @lnb array * \param[in] rw 0 if used to read, 1 if used for write * * \retval positive number of descriptors on success @@ -1185,6 +1197,7 @@ struct dt_body_operations { loff_t pos, ssize_t len, struct niobuf_local *lb, + int maxlnb, enum dt_bufs_type rw); /** @@ -1479,7 +1492,6 @@ struct dt_index_operations { * \param[in] rec buffer storing value * \param[in] key key * \param[in] th transaction handle - * \param[in] ignore unused (was used to request quota ignorance) * * \retval 0 on success * \retval negative negated errno on error @@ -1488,8 +1500,7 @@ struct dt_index_operations { struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *th, - int ignore); + struct thandle *th); /** * Declare intention to delete a key/value from an index. 
@@ -1782,6 +1793,14 @@ struct dt_device { struct list_head dd_txn_callbacks; unsigned int dd_record_fid_accessed:1, dd_rdonly:1; + + /* sysfs and debugfs handling */ + struct dentry *dd_debugfs_entry; + + const struct attribute **dd_def_attrs; + struct kobject dd_kobj; + struct kobj_type dd_ktype; + struct completion dd_kobj_unregister; }; int dt_device_init(struct dt_device *dev, struct lu_device_type *t); @@ -1900,7 +1919,9 @@ struct thandle { th_wait_submit:1, /* complex transaction which will track updates on all targets, * including OSTs */ - th_complex:1; + th_complex:1, + /* whether ignore quota */ + th_ignore_quota:1; }; /** @@ -2380,13 +2401,14 @@ static inline int dt_ref_del(const struct lu_env *env, static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, struct niobuf_remote *rnb, - struct niobuf_local *lnb, enum dt_bufs_type rw) + struct niobuf_local *lnb, int maxlnb, + enum dt_bufs_type rw) { LASSERT(d); LASSERT(d->do_body_ops); LASSERT(d->do_body_ops->dbo_bufs_get); return d->do_body_ops->dbo_bufs_get(env, d, rnb->rnb_offset, - rnb->rnb_len, lnb, rw); + rnb->rnb_len, lnb, maxlnb, rw); } static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, @@ -2450,12 +2472,12 @@ static inline int dt_declare_write(const struct lu_env *env, static inline ssize_t dt_write(const struct lu_env *env, struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *th, int rq) + struct thandle *th) { LASSERT(dt); LASSERT(dt->do_body_ops); LASSERT(dt->do_body_ops->dbo_write); - return dt->do_body_ops->dbo_write(env, dt, buf, pos, th, rq); + return dt->do_body_ops->dbo_write(env, dt, buf, pos, th); } static inline int dt_declare_punch(const struct lu_env *env, @@ -2525,6 +2547,16 @@ static inline void dt_conf_get(const struct lu_env *env, return dev->dd_ops->dt_conf_get(env, dev, param); } +static inline struct super_block *dt_mnt_sb_get(const struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + if (dev->dd_ops->dt_mnt_sb_get) + return dev->dd_ops->dt_mnt_sb_get(dev); + + return ERR_PTR(-EOPNOTSUPP); +} + static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) { LASSERT(dev); @@ -2558,11 +2590,10 @@ static inline int dt_declare_insert(const struct lu_env *env, } static inline int dt_insert(const struct lu_env *env, - struct dt_object *dt, - const struct dt_rec *rec, - const struct dt_key *key, - struct thandle *th, - int noquota) + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) { LASSERT(dt); LASSERT(dt->do_index_ops); @@ -2571,7 +2602,7 @@ static inline int dt_insert(const struct lu_env *env, if (CFS_FAULT_CHECK(OBD_FAIL_DT_INSERT)) return cfs_fail_err; - return dt->do_index_ops->dio_insert(env, dt, rec, key, th, noquota); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th); } static inline int dt_declare_xattr_del(const struct lu_env *env, @@ -2747,26 +2778,24 @@ static inline int dt_lookup(const struct lu_env *env, static inline int dt_declare_layout_change(const struct lu_env *env, struct dt_object *o, - struct layout_intent *layout, - const struct lu_buf *buf, + struct md_layout_change *mlc, struct thandle *th) { LASSERT(o); LASSERT(o->do_ops); LASSERT(o->do_ops->do_declare_layout_change); - return o->do_ops->do_declare_layout_change(env, o, layout, buf, th); + return o->do_ops->do_declare_layout_change(env, o, mlc, th); } static inline int dt_layout_change(const struct lu_env *env, struct dt_object *o, - struct layout_intent *layout, - const 
struct lu_buf *buf, + struct md_layout_change *mlc, struct thandle *th) { LASSERT(o); LASSERT(o->do_ops); LASSERT(o->do_ops->do_layout_change); - return o->do_ops->do_layout_change(env, o, layout, buf, th); + return o->do_ops->do_layout_change(env, o, mlc, th); } struct dt_find_hint { @@ -2815,6 +2844,9 @@ static inline struct dt_thread_info *dt_info(const struct lu_env *env) int dt_global_init(void); void dt_global_fini(void); +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list); +int dt_tunables_fini(struct dt_device *dt); # ifdef CONFIG_PROC_FS int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v); diff --git a/drivers/staging/lustrefsx/lustre/include/llog_swab.h b/drivers/staging/lustrefsx/lustre/include/llog_swab.h index a0b8d022c1a5b..6fe62bce3bcb3 100644 --- a/drivers/staging/lustrefsx/lustre/include/llog_swab.h +++ b/drivers/staging/lustrefsx/lustre/include/llog_swab.h @@ -48,7 +48,7 @@ #ifndef _LLOG_SWAB_H_ #define _LLOG_SWAB_H_ -#include +#include struct lustre_cfg; void lustre_swab_lu_fid(struct lu_fid *fid); diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index a9d6342f1b6c3..85b66b3af7126 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ #include #include -#include +#include /* * Liuux 5.6 introduces proc_ops with v5.5-8862-gd56c0d45f0e2 @@ -58,18 +58,43 @@ struct lprocfs_vars { const char *name; const struct proc_ops *fops; void *data; - /* /proc file mode. */ + /** /proc file mode. */ mode_t proc_mode; }; +/** Provide a debugfs container */ struct ldebugfs_vars { const char *name; const struct file_operations *fops; void *data; - /* debugfs file mode. */ + /** debugfs file mode. */ mode_t proc_mode; }; +static inline unsigned int pct(unsigned long a, unsigned long b) +{ + return b ? a * 100 / b : 0; +} + +#define PAGES_TO_MiB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MiB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(port, flag) \ + do { \ + if ((port)->port##_##flag) { \ + seq_printf(m, "%s" #flag, first ? 
"" : ", "); \ + first = false; \ + } \ + } while (0) + +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep); +void obd_connect_data_seqprint(struct seq_file *m, + struct obd_connect_data *ocd); + /* if we find more consumers this could be generalized */ #define OBD_HIST_MAX 32 struct obd_histogram { @@ -349,28 +374,29 @@ enum { #define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR enum lprocfs_extra_opc { - LDLM_GLIMPSE_ENQUEUE = 0, - LDLM_PLAIN_ENQUEUE, - LDLM_EXTENT_ENQUEUE, - LDLM_FLOCK_ENQUEUE, - LDLM_IBITS_ENQUEUE, - MDS_REINT_SETATTR, - MDS_REINT_CREATE, - MDS_REINT_LINK, - MDS_REINT_UNLINK, - MDS_REINT_RENAME, - MDS_REINT_OPEN, - MDS_REINT_SETXATTR, - BRW_READ_BYTES, - BRW_WRITE_BYTES, - EXTRA_LAST_OPC + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + MDS_REINT_RESYNC, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC }; #define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE /* class_obd.c */ extern struct proc_dir_entry *proc_lustre_root; extern struct dentry *debugfs_lustre_root; -extern struct kobject *lustre_kobj; +extern struct kset *lustre_kset; struct obd_device; struct obd_histogram; @@ -387,7 +413,7 @@ struct obd_job_stats { struct list_head ojs_list; /* list of job_stat structs */ rwlock_t ojs_lock; /* protect ojs_list/js_list */ unsigned int ojs_cleanup_interval;/* seconds before expiry */ - time_t ojs_last_cleanup; /* previous cleanup time */ + time64_t ojs_last_cleanup; /* previous cleanup time */ cntr_init_callback ojs_cntr_init_fn;/* lprocfs_stats initializer */ unsigned short ojs_cntr_num; /* number of stats in struct */ bool ojs_cleaning; /* currently expiring stats */ @@ -463,13 +489,9 @@ extern struct lprocfs_stats * lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); extern void lprocfs_clear_stats(struct lprocfs_stats *stats); extern void lprocfs_free_stats(struct lprocfs_stats **stats); -extern void lprocfs_init_ops_stats(int num_private_stats, - struct lprocfs_stats *stats); -extern void lprocfs_init_mps_stats(int num_private_stats, - struct lprocfs_stats *stats); extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_private_stats); + unsigned int num_stats); extern int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats); extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, @@ -484,10 +506,14 @@ extern int lprocfs_add_clear_entry(struct obd_device * obd, #ifdef HAVE_SERVER_SUPPORT extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid); extern int lprocfs_exp_cleanup(struct obd_export *exp); +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...); #else static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } #endif +struct dentry *ldebugfs_add_simple(struct dentry *root, char *name, void *data, + const struct file_operations *fops); extern struct proc_dir_entry * lprocfs_add_simple(struct proc_dir_entry *root, char *name, void *data, const struct proc_ops *ops); @@ -504,11 +530,12 @@ extern int lprocfs_nid_stats_clear_seq_show(struct seq_file *file, void *data); extern int ldebugfs_register_stats(struct dentry *parent, const char *name, struct lprocfs_stats *stats); extern int 
lprocfs_register_stats(struct proc_dir_entry *root, const char *name, - struct lprocfs_stats *stats); + struct lprocfs_stats *stats); +extern const struct file_operations ldebugfs_stats_seq_fops; /* lprocfs_status.c */ extern int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *var, - void *data); + void *data); extern int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *var, void *data); @@ -546,44 +573,32 @@ static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) static inline int LPROCFS_ENTRY_CHECK(struct inode *inode) { return 0; } #endif -extern int lprocfs_obd_setup(struct obd_device *dev); + +extern int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only); extern int lprocfs_obd_cleanup(struct obd_device *obd); #ifdef HAVE_SERVER_SUPPORT extern const struct file_operations lprocfs_evict_client_fops; #endif -extern int ldebugfs_seq_create(struct dentry *parent, const char *name, - umode_t mode, - const struct file_operations *seq_fops, - void *data); +int ldebugfs_seq_create(struct dentry *parent, const char *name, umode_t mode, + const struct file_operations *seq_fops, void *data); extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name, mode_t mode, const struct proc_ops *seq_fops, void *data); -extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, +extern int lprocfs_obd_seq_create(struct obd_device *obd, const char *name, mode_t mode, const struct proc_ops *seq_fops, void *data); /* Generic callbacks */ -extern int lprocfs_u64_seq_show(struct seq_file *m, void *data); -extern int lprocfs_atomic_seq_show(struct seq_file *m, void *data); -extern ssize_t lprocfs_atomic_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -extern int lprocfs_uint_seq_show(struct seq_file *m, void *data); -extern ssize_t lprocfs_uint_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -extern int lprocfs_wr_uint(struct file *file, const char __user *buffer, - unsigned long count, void *data); extern int lprocfs_uuid_seq_show(struct seq_file *m, void *data); -extern int lprocfs_name_seq_show(struct seq_file *m, void *data); extern int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data); -extern int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data); +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf); extern int lprocfs_import_seq_show(struct seq_file *m, void *data); extern int lprocfs_state_seq_show(struct seq_file *m, void *data); extern int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data); #ifdef HAVE_SERVER_SUPPORT -extern int lprocfs_num_exports_seq_show(struct seq_file *m, void *data); +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf); #endif struct adaptive_timeout; extern int lprocfs_at_hist_helper(struct seq_file *m, @@ -597,32 +612,27 @@ extern ssize_t lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); #endif +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer); + extern ssize_t -lprocfs_ping_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -extern ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off); +static inline ssize_t lprocfs_import_seq_write(struct file *file, 
const char __user *buffer, - size_t count, loff_t *off); + size_t count, loff_t *off) +{ + return ldebugfs_import_seq_write(file, buffer, count, off); +} + extern int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *data); extern ssize_t lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); -/* Statfs helpers */ -extern int lprocfs_blksize_seq_show(struct seq_file *m, void *data); -extern int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data); -extern int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data); -extern int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data); -extern int lprocfs_filestotal_seq_show(struct seq_file *m, void *data); -extern int lprocfs_filesfree_seq_show(struct seq_file *m, void *data); - -extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); -extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, - long val, int mult); -extern int lprocfs_str_to_s64(struct file *, const char __user *buffer, - unsigned long count, __s64 *val); -extern int lprocfs_str_with_units_to_s64(struct file *, - const char __user *buffer, +extern int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit); @@ -645,10 +655,10 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data); int lprocfs_hash_seq_show(struct seq_file *m, void *data); /* lprocfs_status.c: IR factor */ -int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data); -ssize_t -lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); #endif /* lprocfs_status.c: dump pages on cksum error */ @@ -673,10 +683,75 @@ extern int lprocfs_seq_release(struct inode *, struct file *); #define LPROCFS_CLIMP_EXIT(obd) \ up_read(&(obd)->u.cli.cl_sem); +/* write the name##_seq_show function, call LDEBUGFS_SEQ_FOPS_RO for read-only + * debugfs entries; otherwise, you will define name##_seq_write function also + * for a read-write debugfs entry, and then call LDEBUGFS_SEQ_FOPS instead. 
+ * Finally, call ldebugfs_seq_create(obd, filename, 0444, &name#_fops, data); + */ +#define __LDEBUGFS_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, inode->i_private); \ +} \ +static const struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +#define LDEBUGFS_SEQ_FOPS_RO(name) __LDEBUGFS_SEQ_FOPS(name, NULL) +#define LDEBUGFS_SEQ_FOPS(name) __LDEBUGFS_SEQ_FOPS(name, \ + name##_seq_write) + +#define LDEBUGFS_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + LDEBUGFS_SEQ_FOPS_RO(name##_##type) + +#define LDEBUGFS_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_##type##_seq_show(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + seq->private); \ + } \ + LDEBUGFS_SEQ_FOPS(name##_##type); + +#define LDEBUGFS_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char __user *buffer, size_t count, \ + loff_t *off) \ + { \ + return ldebugfs_##type##_seq_write(file, buffer, count, \ + off); \ + } \ + static int name##_##type##_open(struct inode *inode, \ + struct file *file) \ + { \ + return single_open(file, NULL, inode->i_private); \ + } \ + static const struct file_operations name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = single_release, \ + }; + /* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only - proc entries; otherwise, you will define name##_seq_write function also for - a read-write proc entry, and then call LPROC_SEQ_SEQ instead. Finally, - call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ + * proc entries; otherwise, you will define name##_seq_write function also for + * a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally, + * call ldebugfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); + */ #define __LPROC_SEQ_FOPS(name, custom_seq_write) \ static int name##_single_open(struct inode *inode, struct file *file) \ { \ @@ -687,7 +762,8 @@ static int name##_single_open(struct inode *inode, struct file *file) \ return rc; \ \ return single_open(file, name##_seq_show, \ - inode->i_private ? : PDE_DATA(inode)); \ + inode->i_private ? 
inode->i_private : \ + PDE_DATA(inode)); \ } \ static const struct proc_ops name##_fops = { \ PROC_OWNER(THIS_MODULE) \ @@ -719,11 +795,11 @@ static const struct proc_ops name##_fops = { \ { \ struct seq_file *seq = file->private_data; \ return lprocfs_##type##_seq_write(file, buffer, \ - count, seq->private); \ + count, seq->private); \ } \ LPROC_SEQ_FOPS(name##_##type); -#define LPROC_SEQ_FOPS_WO_TYPE(name, type) \ +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ static ssize_t name##_##type##_write(struct file *file, \ const char __user *buffer, size_t count, \ loff_t *off) \ @@ -733,7 +809,8 @@ static const struct proc_ops name##_fops = { \ static int name##_##type##_open(struct inode *inode, struct file *file)\ { \ return single_open(file, NULL, \ - inode->i_private ? : PDE_DATA(inode));\ + inode->i_private ? inode->i_private : \ + PDE_DATA(inode)); \ } \ static const struct proc_ops name##_##type##_fops = { \ .proc_open = name##_##type##_open, \ @@ -749,22 +826,10 @@ struct lustre_attr { const char *buf, size_t len); }; -/* - * Hacks to get around set_fs removal. - */ -void lprocfs_file_set_kernel(struct file *file); -bool lprocfs_file_is_kernel(struct file *file); - -/* - * Version of copy_from_user() that uses the above hacks to determine - * whether it's dealing with user or kernel space. - */ -unsigned long lprocfs_copy_from_user(struct file *file, void *to, - const void __user *from, unsigned long n); - #define LUSTRE_ATTR(name, mode, show, store) \ static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) +#define LUSTRE_WO_ATTR(name) LUSTRE_ATTR(name, 0200, NULL, name##_store) #define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) #define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) @@ -786,33 +851,43 @@ int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, void lprocfs_job_stats_fini(struct obd_device *obd); int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, cntr_init_callback fn); -int lprocfs_job_interval_seq_show(struct seq_file *m, void *data); -ssize_t -lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -/* lproc_status.c */ -int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data); -ssize_t lprocfs_recovery_time_soft_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data); -ssize_t -lprocfs_recovery_time_hard_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off); -int lprocfs_target_instance_seq_show(struct seq_file *m, void *data); +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +/* lproc_status_server.c */ +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf); #endif +/* lproc_status.c */ int lprocfs_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *data); ssize_t 
lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); +int lprocfs_obd_short_io_bytes_seq_show(struct seq_file *m, void *data); +ssize_t lprocfs_obd_short_io_bytes_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off); +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count); struct root_squash_info; -int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, - unsigned long count, +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, struct root_squash_info *squash, char *name); -int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, - unsigned long count, +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, struct root_squash_info *squash, char *name); #else /* !CONFIG_PROC_FS */ @@ -852,16 +927,10 @@ static inline int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, struct lprocfs_stats *stats) { return 0; } -static inline void lprocfs_init_ops_stats(int num_private_stats, - struct lprocfs_stats *stats) -{ return; } -static inline void lprocfs_init_mps_stats(int num_private_stats, - struct lprocfs_stats *stats) -{ return; } static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { return; } static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_private_stats) + unsigned int num_stats) { return 0; } static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats) @@ -910,18 +979,14 @@ static inline void lprocfs_remove(struct proc_dir_entry **root) static inline void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) { return; } -static inline int lprocfs_obd_setup(struct obd_device *dev) +static inline int lprocfs_obd_setup(struct obd_device *dev, bool uuid_only) { return 0; } static inline int lprocfs_obd_cleanup(struct obd_device *dev) { return 0; } static inline int lprocfs_uuid_seq_show(struct seq_file *m, void *data) { return 0; } -static inline int lprocfs_name_seq_show(struct seq_file *m, void *data) -{ return 0; } static inline int lprocfs_server_seq_show(struct seq_file *m, void *data) { return 0; } -static inline int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) -{ return 0; } static inline int lprocfs_import_seq_show(struct seq_file *m, void *data) { return 0; } static inline int lprocfs_state_seq_show(struct seq_file *m, void *data) @@ -953,6 +1018,10 @@ lprocfs_ping_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { return 0; } static inline ssize_t +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline ssize_t lprocfs_import_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { return 0; } @@ -1008,7 +1077,7 @@ u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, #define LPROC_SEQ_FOPS(name) #define LPROC_SEQ_FOPS_RO_TYPE(name, type) #define LPROC_SEQ_FOPS_RW_TYPE(name, type) -#define LPROC_SEQ_FOPS_WO_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) /* lprocfs_jobstats.c */ static inline diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h index ae5bb3dde4c82..93218aa30e7ff 100644 --- 
a/drivers/staging/lustrefsx/lustre/include/lu_object.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,9 +33,13 @@ #ifndef __LUSTRE_LU_OBJECT_H #define __LUSTRE_LU_OBJECT_H +#ifdef HAVE_LINUX_STDARG_HEADER +#include +#else #include +#endif #include -#include +#include #include #include @@ -426,26 +430,8 @@ struct lu_attr { __u32 la_rdev; /** project id */ __u32 la_projid; -}; - -/** Bit-mask of valid attributes */ -enum la_valid { - LA_ATIME = 1 << 0, - LA_MTIME = 1 << 1, - LA_CTIME = 1 << 2, - LA_SIZE = 1 << 3, - LA_MODE = 1 << 4, - LA_UID = 1 << 5, - LA_GID = 1 << 6, - LA_BLOCKS = 1 << 7, - LA_TYPE = 1 << 8, - LA_FLAGS = 1 << 9, - LA_NLINK = 1 << 10, - LA_RDEV = 1 << 11, - LA_BLKSIZE = 1 << 12, - LA_KILL_SUID = 1 << 13, - LA_KILL_SGID = 1 << 14, - LA_PROJID = 1 << 15, + /** set layout version to OST objects. */ + __u32 la_layout_version; }; /** @@ -484,17 +470,23 @@ enum lu_object_header_flags { /** * Mark this object has already been taken out of cache. */ - LU_OBJECT_UNHASHED = 1, + LU_OBJECT_UNHASHED = 1, + /** + * Object is initialized, when object is found in cache, it may not be + * intialized yet, the object allocator will initialize it. + */ + LU_OBJECT_INITED = 2 }; enum lu_object_header_attr { - LOHA_EXISTS = 1 << 0, - LOHA_REMOTE = 1 << 1, - /** - * UNIX file type is stored in S_IFMT bits. - */ - LOHA_FT_START = 001 << 12, /**< S_IFIFO */ - LOHA_FT_END = 017 << 12, /**< S_IFMT */ + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + LOHA_HAS_AGENT_ENTRY = 1 << 2, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ }; /** @@ -548,31 +540,6 @@ struct lu_object_header { struct fld; -struct lu_site_bkt_data { - /** - * number of object in this bucket on the lsb_lru list. - */ - long lsb_lru_len; - /** - * LRU list, updated on each access to object. Protected by - * bucket lock of lu_site::ls_obj_hash. - * - * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are - * moved to the lu_site::ls_lru.prev (this is due to the non-existence - * of list_for_each_entry_safe_reverse()). - */ - struct list_head lsb_lru; - /** - * Wait-queue signaled when an object in this site is ultimately - * destroyed (lu_object_free()). It is used by lu_object_find() to - * wait before re-trying when object in the process of destruction is - * found in the hash table. - * - * \see htable_lookup(). - */ - wait_queue_head_t lsb_marche_funebre; -}; - enum { LU_SS_CREATED = 0, LU_SS_CACHE_HIT, @@ -643,14 +610,8 @@ struct lu_site { struct percpu_counter ls_lru_len_counter; }; -static inline struct lu_site_bkt_data * -lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) -{ - struct cfs_hash_bd bd; - - cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); - return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); -} +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) { @@ -715,6 +676,14 @@ static inline int lu_object_is_dying(const struct lu_object_header *h) return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); } +/** + * Return true if object is initialized. 
+ */ +static inline int lu_object_is_inited(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_INITED, &h->loh_flags); +} + void lu_object_put(const struct lu_env *env, struct lu_object *o); void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); void lu_object_unhash(const struct lu_env *env, struct lu_object *o); @@ -844,6 +813,22 @@ int lu_object_invariant(const struct lu_object *o); */ #define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) +/** + * Check whether the object as agent entry on current target + */ +#define lu_object_has_agent_entry(o) \ + unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY) + +static inline void lu_object_set_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY; +} + +static inline void lu_object_clear_agent_entry(struct lu_object *o) +{ + o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY; +} + static inline int lu_object_assert_exists(const struct lu_object *o) { return lu_object_exists(o); @@ -860,7 +845,8 @@ static inline int lu_object_assert_not_exists(const struct lu_object *o) static inline __u32 lu_object_attr(const struct lu_object *o) { LASSERT(lu_object_exists(o) != 0); - return o->lo_header->loh_attr; + + return o->lo_header->loh_attr & S_IFMT; } static inline void lu_object_ref_add(struct lu_object *o, @@ -907,7 +893,9 @@ struct lu_rdpg { enum lu_xattr_flags { LU_XATTR_REPLACE = (1 << 0), - LU_XATTR_CREATE = (1 << 1) + LU_XATTR_CREATE = (1 << 1), + LU_XATTR_MERGE = (1 << 2), + LU_XATTR_SPLIT = (1 << 3), }; /** @} helpers */ @@ -1129,20 +1117,20 @@ struct lu_context_key { }; #define LU_KEY_INIT(mod, type) \ - static void* mod##_key_init(const struct lu_context *ctx, \ - struct lu_context_key *key) \ - { \ - type *value; \ + static void *mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ \ CLASSERT(PAGE_SIZE >= sizeof(*value)); \ \ - OBD_ALLOC_PTR(value); \ - if (value == NULL) \ - value = ERR_PTR(-ENOMEM); \ - \ - return value; \ - } \ - struct __##mod##__dummy_init {;} /* semicolon catcher */ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init { ; } /* semicolon catcher */ #define LU_KEY_FINI(mod, type) \ static void mod##_key_fini(const struct lu_context *ctx, \ @@ -1278,6 +1266,37 @@ void lu_env_fini (struct lu_env *env); int lu_env_refill(struct lu_env *env); int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); +static inline void* lu_env_info(const struct lu_env *env, + const struct lu_context_key *key) +{ + void *info; + info = lu_context_key_get(&env->le_ctx, key); + if (!info) { + if (!lu_env_refill((struct lu_env *)env)) + info = lu_context_key_get(&env->le_ctx, key); + } + LASSERT(info); + return info; +} + +#ifdef HAVE_SERVER_SUPPORT +struct lu_env *lu_env_find(void); +int lu_env_add(struct lu_env *env); +void lu_env_remove(struct lu_env *env); +#else +static inline struct lu_env *lu_env_find(void) +{ + return NULL; +} +static inline int lu_env_add(struct lu_env *env) +{ + return 0; +} +static inline void lu_env_remove(struct lu_env *env) +{ +} +#endif /* HAVE_SERVER_SUPPORT */ + /** @} lu_context */ /** @@ -1294,6 +1313,26 @@ struct lu_name { int ln_namelen; }; +static inline bool name_is_dot_or_dotdot(const char *name, int namelen) +{ + return name[0] == '.' 
&& + (namelen == 1 || (namelen == 2 && name[1] == '.')); +} + +static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname) +{ + return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen); +} + +static inline bool lu_name_is_valid_len(const char *name, size_t name_len) +{ + return name != NULL && + name_len > 0 && + name_len < INT_MAX && + strlen(name) == name_len && + memchr(name, '/', name_len) == NULL; +} + /** * Validate names (path components) * @@ -1305,12 +1344,7 @@ struct lu_name { */ static inline bool lu_name_is_valid_2(const char *name, size_t name_len) { - return name != NULL && - name_len > 0 && - name_len < INT_MAX && - name[name_len] == '\0' && - strlen(name) == name_len && - memchr(name, '/', name_len) == NULL; + return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0'; } static inline bool lu_name_is_valid(const struct lu_name *ln) diff --git a/drivers/staging/lustrefsx/lustre/include/lu_target.h b/drivers/staging/lustrefsx/lustre/include/lu_target.h index 0d3ef968923ad..0810fbea8b55e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lu_target.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_target.h @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -126,14 +126,17 @@ struct tg_grants_data { u64 tgd_tot_granted; /* grant used by I/Os in progress (between prepare and commit) */ u64 tgd_tot_pending; + /* amount of available space in percentage that is never used for + * grants, used on MDT to always keep space for metadata. */ + u64 tgd_reserved_pcnt; /* number of clients using grants */ int tgd_tot_granted_clients; /* shall we grant space to clients not * supporting OBD_CONNECT_GRANT_PARAM? */ - int tgd_grant_compat_disable; + unsigned int tgd_grant_compat_disable:1; /* protect all statfs-related counters */ spinlock_t tgd_osfs_lock; - __u64 tgd_osfs_age; + time64_t tgd_osfs_age; int tgd_blockbits; /* counters used during statfs update, protected by ofd_osfs_lock. 
* record when some statfs refresh are in progress */ @@ -201,8 +204,18 @@ struct lu_target { /* target grants fields */ struct tg_grants_data lut_tgd; + + /* target tunables */ + const struct attribute **lut_attrs; + + /* FMD (file modification data) values */ + int lut_fmd_max_num; + time64_t lut_fmd_max_age; }; +#define LUT_FMD_MAX_NUM_DEFAULT 128 +#define LUT_FMD_MAX_AGE_DEFAULT (obd_timeout + 10) + /* number of slots in reply bitmap */ #define LUT_REPLY_SLOTS_PER_CHUNK (1<<20) #define LUT_REPLY_SLOTS_MAX_CHUNKS 16 @@ -356,7 +369,7 @@ struct tgt_handler { /* Flags in enum tgt_handler_flags */ __u32 th_flags; /* Request version for this opcode */ - int th_version; + enum lustre_msg_version th_version; /* Handler function */ int (*th_act)(struct tgt_session_info *tsi); /* Handler function for high priority requests */ @@ -409,8 +422,6 @@ int tgt_convert(struct tgt_session_info *tsi); int tgt_bl_callback(struct tgt_session_info *tsi); int tgt_cp_callback(struct tgt_session_info *tsi); int tgt_llog_open(struct tgt_session_info *tsi); -int tgt_llog_close(struct tgt_session_info *tsi); -int tgt_llog_destroy(struct tgt_session_info *tsi); int tgt_llog_read_header(struct tgt_session_info *tsi); int tgt_llog_next_block(struct tgt_session_info *tsi); int tgt_llog_prev_block(struct tgt_session_info *tsi); @@ -426,15 +437,13 @@ int tgt_sync(const struct lu_env *env, struct lu_target *tgt, int tgt_io_thread_init(struct ptlrpc_thread *thread); void tgt_io_thread_done(struct ptlrpc_thread *thread); -int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - __u64 start, __u64 end, struct lustre_handle *lh, - int mode, __u64 *flags); +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags); +void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode); +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags); void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode); -int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - struct obd_ioobj *obj, struct niobuf_remote *nb, - struct lustre_handle *lh, enum ldlm_mode mode); -void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, - struct lustre_handle *lh, enum ldlm_mode mode); int tgt_brw_read(struct tgt_session_info *tsi); int tgt_brw_write(struct tgt_session_info *tsi); int tgt_hpreq_handler(struct ptlrpc_request *req); @@ -494,6 +503,8 @@ int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, struct thandle *th, bool update_lrd_file); struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, __u64 xid); +int tgt_tunables_init(struct lu_target *lut); +void tgt_tunables_fini(struct lu_target *lut); /* target/tgt_grant.c */ static inline int exp_grant_param_supp(struct obd_export *exp) @@ -521,8 +532,36 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, long tgt_grant_create(const struct lu_env *env, struct obd_export *exp, s64 *nr); int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, - struct obd_statfs *osfs, __u64 max_age, + struct obd_statfs *osfs, time64_t max_age, int *from_cache); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 16, 53, 0) +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf); +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + 
const char *buffer, size_t count); +#endif +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf); +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count); + +/* FMD */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, + __u64 xid); +#ifdef DO_FMD_DROP +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid); +#else +#define tgt_fmd_drop(exp, fid) do {} while (0) +#endif /* target/update_trans.c */ int distribute_txn_init(const struct lu_env *env, diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h index 6f57a20a6a8ab..e5466c7886238 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/ll_fiemap.h @@ -38,36 +38,12 @@ * Author: Andreas Dilger */ -#ifndef _LUSTRE_FIEMAP_H -#define _LUSTRE_FIEMAP_H - -#include -#include - -/* XXX: We use fiemap_extent::fe_reserved[0] */ -#define fe_device fe_reserved[0] - -static inline size_t fiemap_count_to_size(size_t extent_count) -{ - return sizeof(struct fiemap) + extent_count * - sizeof(struct fiemap_extent); -} - -static inline unsigned fiemap_size_to_count(size_t array_size) -{ - return (array_size - sizeof(struct fiemap)) / - sizeof(struct fiemap_extent); -} - -#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ - -#ifdef FIEMAP_FLAGS_COMPAT -#undef FIEMAP_FLAGS_COMPAT -#endif +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_fiemap.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ -/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ -#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ -#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. - * Sets NO_DIRECT flag */ +#include -#endif /* _LUSTRE_FIEMAP_H */ +#warning "Including ll_fiemap.h is deprecated. Include linux/lustre/lustre_fiemap.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h index e69bdc2795e56..f8489d55a3b44 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_barrier_user.h @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. 
- + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * lustre/include/lustre/lustre_barrier_user.h * @@ -28,46 +28,13 @@ * * Author: Fan, Yong */ -#ifndef _LUSTRE_BARRIER_USER_H -# define _LUSTRE_BARRIER_USER_H - -#include -#define BARRIER_VERSION_V1 1 -#define BARRIER_TIMEOUT_DEFAULT 30 - -enum barrier_commands { - BC_FREEZE = 1, - BC_THAW = 2, - BC_STAT = 3, - BC_RESCAN = 4, -}; - -enum barrier_status { - BS_INIT = 0, - BS_FREEZING_P1 = 1, - BS_FREEZING_P2 = 2, - BS_FROZEN = 3, - BS_THAWING = 4, - BS_THAWED = 5, - BS_FAILED = 6, - BS_EXPIRED = 7, - BS_RESCAN = 8, -}; +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_barrier_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! + */ -struct barrier_ctl { - __u32 bc_version; - __u32 bc_cmd; - union { - __s32 bc_timeout; - __u32 bc_total; - }; - union { - __u32 bc_status; - __u32 bc_absence; - }; - char bc_name[12]; - __u32 bc_padding; -}; +#include -#endif /* _LUSTRE_BARRIER_USER_H */ +#warning "Including lustre_barrier_user.h is deprecated. Include linux/lustre/lustre_barrier_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h index a02f65fa08aef..7b84426fa2750 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_lfsck_user.h @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. - + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * lustre/include/lustre/lustre_lfsck_user.h @@ -30,207 +30,11 @@ * Author: Fan, Yong */ -#ifndef _LUSTRE_LFSCK_USER_H -# define _LUSTRE_LFSCK_USER_H -# include - -/** - * state machine: - * - * LS_INIT - * | - * (lfsck|start) - * | - * v - * LS_SCANNING_PHASE1 - * | ^ - * | : - * | (lfsck:restart) - * | : - * v : - * ----------------------------------------------------------------- - * | |^ |^ |^ |^ |^ - * | |: |: |: |: |: - * v v: v: v: v: v: - * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL - * (CO_) (CO_) (CO_) - * | ^ ^: ^: ^: ^: ^: - * | : |: |: |: |: |: - * | (lfsck:restart) |: |: |: |: |: - * v : |v |v |v |v |v - * ----------------------------------------------------------------- - * | - * v - * LS_COMPLETED +/* + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_lfsck_user.h + * directly instead of this file. 
This file will be removed from a + * future version of lustre! */ -enum lfsck_status { - /* The lfsck file is new created, for new MDT, upgrading from old disk, - * or re-creating the lfsck file manually. */ - LS_INIT = 0, - - /* The first-step system scanning. The checked items during the phase1 - * scanning depends on the LFSCK type. */ - LS_SCANNING_PHASE1 = 1, - - /* The second-step system scanning. The checked items during the phase2 - * scanning depends on the LFSCK type. */ - LS_SCANNING_PHASE2 = 2, - - /* The LFSCK processing has completed for all objects. */ - LS_COMPLETED = 3, - - /* The LFSCK exited automatically for failure, will not auto restart. */ - LS_FAILED = 4, - - /* The LFSCK is stopped manually, will not auto restart. */ - LS_STOPPED = 5, - - /* LFSCK is paused automatically when umount, - * will be restarted automatically when remount. */ - LS_PAUSED = 6, - - /* System crashed during the LFSCK, - * will be restarted automatically after recovery. */ - LS_CRASHED = 7, - - /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ - LS_PARTIAL = 8, - - /* The LFSCK is failed because its controller is failed. */ - LS_CO_FAILED = 9, - - /* The LFSCK is stopped because its controller is stopped. */ - LS_CO_STOPPED = 10, - - /* The LFSCK is paused because its controller is paused. */ - LS_CO_PAUSED = 11, - - LS_MAX -}; - -static inline const char *lfsck_status2name(int status) -{ - static const char * const lfsck_status_names[] = { - [LS_INIT] = "init", - [LS_SCANNING_PHASE1] = "scanning-phase1", - [LS_SCANNING_PHASE2] = "scanning-phase2", - [LS_COMPLETED] = "completed", - [LS_FAILED] = "failed", - [LS_STOPPED] = "stopped", - [LS_PAUSED] = "paused", - [LS_CRASHED] = "crashed", - [LS_PARTIAL] = "partial", - [LS_CO_FAILED] = "co-failed", - [LS_CO_STOPPED] = "co-stopped", - [LS_CO_PAUSED] = "co-paused" - }; - - if (status < 0 || status >= LS_MAX) - return "unknown"; - - return lfsck_status_names[status]; -} - -enum lfsck_param_flags { - /* Reset LFSCK iterator position to the device beginning. */ - LPF_RESET = 0x0001, - - /* Exit when fail. */ - LPF_FAILOUT = 0x0002, - - /* Dryrun mode, only check without modification */ - LPF_DRYRUN = 0x0004, - - /* LFSCK runs on all targets. */ - LPF_ALL_TGT = 0x0008, - - /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ - LPF_BROADCAST = 0x0010, - - /* Handle orphan OST-objects. */ - LPF_OST_ORPHAN = 0x0020, - - /* Create OST-object for dangling LOV EA. */ - LPF_CREATE_OSTOBJ = 0x0040, - - /* Create MDT-object for dangling name entry. */ - LPF_CREATE_MDTOBJ = 0x0080, - - /* Do not return until the LFSCK not running. */ - LPF_WAIT = 0x0100, - - /* Delay to create OST-object for dangling LOV EA. */ - LPF_DELAY_CREATE_OSTOBJ = 0x0200, -}; - -enum lfsck_type { - /* For MDT and OST internal OSD consistency check/repair. */ - LFSCK_TYPE_SCRUB = 0x0000, - - /* For MDT-OST (layout, object) consistency check/repair. */ - LFSCK_TYPE_LAYOUT = 0x0001, - - /* For MDT (FID-in-dirent, linkEA) consistency check/repair. 
*/ - LFSCK_TYPE_NAMESPACE = 0x0004, - LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | - LFSCK_TYPE_NAMESPACE), - LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, - LFSCK_TYPES_ALL = ((__u16)(~0)) -}; - -#define LFSCK_VERSION_V1 1 -#define LFSCK_VERSION_V2 2 - -#define LFSCK_SPEED_NO_LIMIT 0 -#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT -#define LFSCK_ASYNC_WIN_DEFAULT 1024 -#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) -#define LFSCK_TYPE_BITS 16 - -enum lfsck_start_valid { - LSV_SPEED_LIMIT = 0x00000001, - LSV_ERROR_HANDLE = 0x00000002, - LSV_DRYRUN = 0x00000004, - LSV_ASYNC_WINDOWS = 0x00000008, - LSV_CREATE_OSTOBJ = 0x00000010, - LSV_CREATE_MDTOBJ = 0x00000020, - LSV_DELAY_CREATE_OSTOBJ = 0x00000040, -}; - -/* Arguments for starting lfsck. */ -struct lfsck_start { - /* Which arguments are valid, see 'enum lfsck_start_valid'. */ - __u32 ls_valid; - - /* How many items can be scanned at most per second. */ - __u32 ls_speed_limit; - - /* For compatibility between user space tools and kernel service. */ - __u16 ls_version; - - /* Which LFSCK components to be (have been) started. */ - __u16 ls_active; - - /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ - __u16 ls_flags; - - /* The windows size for async requests pipeline. */ - __u16 ls_async_windows; -}; - -struct lfsck_stop { - __u32 ls_status; - __u16 ls_flags; - __u16 ls_padding_1; /* For 64-bits aligned. */ - __u64 ls_padding_2; -}; - -struct lfsck_query { - __u16 lu_types; - __u16 lu_flags; - __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; - __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; - __u64 lu_repaired[LFSCK_TYPE_BITS]; -}; -#endif /* _LUSTRE_LFSCK_USER_H */ +#include +#warning "Including lustre_lfsck_user.h is deprecated. Include linux/lustre/lustre_lfsck_user.h directly." diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h index 67ed9768fcb2f..9d8f5ebefa569 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustre_user.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,1638 +34,15 @@ * Lustre public user-space interface definitions. */ -#ifndef _LUSTRE_USER_H -#define _LUSTRE_USER_H - -/** \defgroup lustreuser lustreuser - * - * @{ - */ - -#include - -#ifdef __KERNEL__ -# include -# include -# include /* snprintf() */ -# include -#else /* !__KERNEL__ */ -# include -# include -# include /* snprintf() */ -# include -# define NEED_QUOTA_DEFS -/* # include - this causes complaints about caddr_t */ -# include -#endif /* __KERNEL__ */ -#include - /* - * This is a temporary solution of adding quota type. - * Should be removed as soon as system header is updated. 
- */ -#undef LL_MAXQUOTAS -#define LL_MAXQUOTAS 3 -#undef INITQFNAMES -#define INITQFNAMES { \ - "user", /* USRQUOTA */ \ - "group", /* GRPQUOTA */ \ - "project", /* PRJQUOTA */ \ - "undefined", \ -}; -#ifndef USRQUOTA -#define USRQUOTA 0 -#endif -#ifndef GRPQUOTA -#define GRPQUOTA 1 -#endif -#ifndef PRJQUOTA -#define PRJQUOTA 2 -#endif - -#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ - defined(__craynv) || defined(__mips64__) || defined(__powerpc64__) || \ - defined(__aarch64__) -typedef struct stat lstat_t; -# define lstat_f lstat -# define fstat_f fstat -# define fstatat_f fstatat -# define HAVE_LOV_USER_MDS_DATA -#elif defined(__USE_LARGEFILE64) || defined(__KERNEL__) -typedef struct stat64 lstat_t; -# define lstat_f lstat64 -# define fstat_f fstat64 -# define fstatat_f fstatat64 -# define HAVE_LOV_USER_MDS_DATA -#endif - -#define LUSTRE_EOF 0xffffffffffffffffULL - -/* for statfs() */ -#define LL_SUPER_MAGIC 0x0BD00BD0 - -#ifndef FSFILT_IOC_GETFLAGS -#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) -#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) -#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) -#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) -#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) -#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) -#endif - -/* FIEMAP flags supported by Lustre */ -#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) - -enum obd_statfs_state { - OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ - OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ - OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ - OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ - OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ -}; - -struct obd_statfs { - __u64 os_type; - __u64 os_blocks; - __u64 os_bfree; - __u64 os_bavail; - __u64 os_files; - __u64 os_ffree; - __u8 os_fsid[40]; - __u32 os_bsize; - __u32 os_namelen; - __u64 os_maxbytes; - __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ - __u32 os_fprecreated; /* objs available now to the caller */ - /* used in QoS code to find preferred - * OSTs */ - __u32 os_spare2; - __u32 os_spare3; - __u32 os_spare4; - __u32 os_spare5; - __u32 os_spare6; - __u32 os_spare7; - __u32 os_spare8; - __u32 os_spare9; -}; - -/** - * File IDentifier. - * - * FID is a cluster-wide unique identifier of a file or an object (stripe). - * FIDs are never reused. - **/ -struct lu_fid { - /** - * FID sequence. Sequence is a unit of migration: all files (objects) - * with FIDs from a given sequence are stored on the same server. - * Lustre should support 2^64 objects, so even if each sequence - * has only a single object we can still enumerate 2^64 objects. - **/ - __u64 f_seq; - /* FID number within sequence. */ - __u32 f_oid; - /** - * FID version, used to distinguish different versions (in the sense - * of snapshots, etc.) of the same file system object. Not currently - * used. - **/ - __u32 f_ver; -}; - -static inline bool fid_is_zero(const struct lu_fid *fid) -{ - return fid->f_seq == 0 && fid->f_oid == 0; -} - -/* Currently, the filter_fid::ff_parent::f_ver is not the real parent - * MDT-object's FID::f_ver, instead it is the OST-object index in its - * parent MDT-object's layout EA. 
*/ -#define f_stripe_idx f_ver - -struct ost_layout { - __u32 ol_stripe_size; - __u32 ol_stripe_count; - __u64 ol_comp_start; - __u64 ol_comp_end; - __u32 ol_comp_id; -} __attribute__((packed)); - -/* The filter_fid structure has changed several times over its lifetime. - * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and - * stripe_index and the "self FID" (objid/seq) to be able to recover the - * OST objects in case of corruption. With the move to 2.4 and OSD-API for - * the OST, the "trusted.lma" xattr was added to the OST objects to store - * the "self FID" to be consistent with the MDT on-disk format, and the - * filter_fid only stored the MDT inode parent FID and stripe index. - * - * In 2.10, the addition of PFL composite layouts required more information - * to be stored into the filter_fid in order to be able to identify which - * component the OST object belonged. As well, the stripe size may vary - * between components, so it was no longer safe to assume the stripe size - * or stripe_count of a file. This is also more robust for plain layouts. - * - * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not - * enough space to store both the filter_fid and LMA in the inode, so they - * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid - * an extra seek for every OST object access. - * - * In 2.11, FLR mirror layouts also need to store the layout version and - * range so that writes to old versions of the layout are not allowed. - * That ensures that mirrored objects are not modified by evicted clients, - * and ensures that the components are correctly marked stale on the MDT. - */ -struct filter_fid_18_23 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - __u64 ff_objid; - __u64 ff_seq; -}; - -struct filter_fid_24_29 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ -}; - -struct filter_fid_210 { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - struct ost_layout ff_layout; -}; - -struct filter_fid { - struct lu_fid ff_parent; /* stripe_idx in f_ver */ - struct ost_layout ff_layout; - __u32 ff_layout_version; - __u32 ff_range; /* range of layout version that - * write are allowed */ -} __attribute__((packed)); - -/* Userspace should treat lu_fid as opaque, and only use the following methods - * to print or parse them. Other functions (e.g. compare, swab) could be moved - * here from lustre_idl.h if needed. */ -typedef struct lu_fid lustre_fid; - -enum lma_compat { - LMAC_HSM = 0x00000001, -/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ - LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ - LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is - * under /O//d. */ - LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ - LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ -}; - -/** - * Masks for all features that should be supported by a Lustre version to - * access a specific file. - * This information is stored in lustre_mdt_attrs::lma_incompat. - */ -enum lma_incompat { - LMAI_RELEASED = 0x00000001, /* file is released */ - LMAI_AGENT = 0x00000002, /* agent inode */ - LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object - is on the remote MDT */ - LMAI_STRIPED = 0x00000008, /* striped directory inode */ - LMAI_ORPHAN = 0x00000010, /* inode is orphan */ - LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ - LMAI_STRIPED | LMAI_ORPHAN) -}; - - -/** - * Following struct for object attributes, that will be kept inode's EA. 
- * Introduced in 2.0 release (please see b15993, for details) - * Added to all objects since Lustre 2.4 as contains self FID - */ -struct lustre_mdt_attrs { - /** - * Bitfield for supported data in this structure. From enum lma_compat. - * lma_self_fid and lma_flags are always available. - */ - __u32 lma_compat; - /** - * Per-file incompat feature list. Lustre version should support all - * flags set in this field. The supported feature mask is available in - * LMA_INCOMPAT_SUPP. - */ - __u32 lma_incompat; - /** FID of this inode */ - struct lu_fid lma_self_fid; -}; - -struct lustre_ost_attrs { - /* Use lustre_mdt_attrs directly for now, need a common header - * structure if want to change lustre_mdt_attrs in future. */ - struct lustre_mdt_attrs loa_lma; - - /* Below five elements are for OST-object's PFID EA, the - * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) - * and the stripe_index (low 16 bits), the size should not exceed - * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag - * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size - * are valid; if the flag LMAC_COMP_INFO is set, then the next three - * loa_comp_* elements are valid. */ - struct lu_fid loa_parent_fid; - __u32 loa_stripe_size; - __u32 loa_comp_id; - __u64 loa_comp_start; - __u64 loa_comp_end; -}; - -/** - * Prior to 2.4, the LMA structure also included SOM attributes which has since - * been moved to a dedicated xattr - * lma_flags was also removed because of lma_compat/incompat fields. - */ -#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) - -/** - * OST object IDentifier. + * NOTE: This file is DEPRECATED! Please include linux/lustre/lustre_user.h + * directly instead of this file. This file will be removed from a + * future version of lustre! 
*/ -struct ost_id { - union { - struct { - __u64 oi_id; - __u64 oi_seq; - } oi; - struct lu_fid oi_fid; - }; -}; - -#define DOSTID "%#llx:%llu" -#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ - ((unsigned long long)ostid_id(oi)) -struct ll_futimes_3 { - __u64 lfu_atime_sec; - __u64 lfu_atime_nsec; - __u64 lfu_mtime_sec; - __u64 lfu_mtime_nsec; - __u64 lfu_ctime_sec; - __u64 lfu_ctime_nsec; -}; +#include -/* - * The ioctl naming rules: - * LL_* - works on the currently opened filehandle instead of parent dir - * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) - * *_MDC_* - gets/sets data related to MDC - * *_LOV_* - gets/sets data related to OSC/LOV - * *FILE* - called on parent dir and passes in a filename - * *STRIPE* - set/get lov_user_md - * *INFO - set/get lov_user_mds_data - */ -/* lustre_ioctl.h 101-150 */ -#define LL_IOC_GETFLAGS _IOR ('f', 151, long) -#define LL_IOC_SETFLAGS _IOW ('f', 152, long) -#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) -#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) -#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) -#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) -#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) -#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) -/* LL_IOC_RECREATE_OBJ 157 obsolete */ -/* LL_IOC_RECREATE_FID 157 obsolete */ -#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) -#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) -/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ -/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ -/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ -#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) -/* IOC_LOV_GETINFO 165 obsolete */ -#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) -/* LL_IOC_RMTACL 167 obsolete */ -#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) -#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) -#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) -#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) -#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) -#define LL_IOC_PATH2FID _IOR ('f', 173, long) -#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) -#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) -#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) -/* lustre_ioctl.h 177-210 */ -#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) -#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) -#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) -#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) -#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) -#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) -#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) -#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) -#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ - struct lustre_swap_layouts) -#define LL_IOC_HSM_ACTION _IOR('f', 220, \ - struct hsm_current_action) -/* lustre_ioctl.h 221-232 */ -#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) -#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) -#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) -#define LL_IOC_SET_LEASE _IOWR('f', 243, long) -#define LL_IOC_GET_LEASE _IO('f', 244) -#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) -#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) -#define LL_IOC_MIGRATE _IOR('f', 247, int) -#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) -#define 
LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) -#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) - -#ifndef FS_IOC_FSGETXATTR -/* - * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +/* Disable warning until 2.16 or 3.0, until new header is widely available. + * This gives apps time to move to the new header without spurious warnings. +#warning "Including lustre/lustre_user.h is deprecated. Include linux/lustre/lustre_user.h instead." */ -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) -#endif -#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define LL_PROJINHERIT_FL 0x20000000 - - -/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ -enum ll_lease_type { - LL_LEASE_RDLCK = 0x1, - LL_LEASE_WRLCK = 0x2, - LL_LEASE_UNLCK = 0x4, -}; - -#define LL_STATFS_LMV 1 -#define LL_STATFS_LOV 2 -#define LL_STATFS_NODELAY 4 - -#define IOC_MDC_TYPE 'i' -#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) -#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) -#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) -#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) - -#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ - -/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular - * files, but are unlikely to be used in practice and are not harmful if - * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character - * devices and are safe for use on new files. See LU-4209. */ -/* To be compatible with old statically linked binary we keep the check for - * the older 0100000000 flag. This is already removed upstream. LU-812. 
*/ -#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ -#ifndef FASYNC -#define FASYNC 00020000 /* fcntl, for BSD compatibility */ -#endif -#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) -#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ - O_LOV_DELAY_CREATE_MASK) - -#define LL_FILE_IGNORE_LOCK 0x00000001 -#define LL_FILE_GROUP_LOCKED 0x00000002 -#define LL_FILE_READAHEA 0x00000004 -#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ -#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ - -#define LOV_USER_MAGIC_V1 0x0BD10BD0 -#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 -#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 -#define LOV_USER_MAGIC_V3 0x0BD30BD0 -/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ -#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ -#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 - -#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ -#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic */ - -#define LOV_PATTERN_NONE 0x000 -#define LOV_PATTERN_RAID0 0x001 -#define LOV_PATTERN_RAID1 0x002 -#define LOV_PATTERN_FIRST 0x100 -#define LOV_PATTERN_CMOBD 0x200 - -#define LOV_PATTERN_F_MASK 0xffff0000 -#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ -#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ -#define LOV_PATTERN_DEFAULT 0xffffffff - -static inline bool lov_pattern_supported(__u32 pattern) -{ - return pattern == LOV_PATTERN_RAID0 || - pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED); -} - -#define LOV_MAXPOOLNAME 15 -#define LOV_POOLNAMEF "%.15s" - -#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ -#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) -#define LOV_MAX_STRIPE_COUNT_OLD 160 -/* This calculation is crafted so that input of 4096 will result in 160 - * which in turn is equal to old maximal stripe count. - * XXX: In fact this is too simpified for now, what it also need is to get - * ea_type argument to clearly know how much space each stripe consumes. - * - * The limit of 12 pages is somewhat arbitrary, but is a reasonably large - * allocation that is sufficient for the current generation of systems. - * - * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ -#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ -#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ - -#define XATTR_LUSTRE_PREFIX "lustre." 
-#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" - -#define lov_user_ost_data lov_user_ost_data_v1 -struct lov_user_ost_data_v1 { /* per-stripe data structure */ - struct ost_id l_ost_oi; /* OST object ID */ - __u32 l_ost_gen; /* generation of this OST index */ - __u32 l_ost_idx; /* OST index in LOV */ -} __attribute__((packed)); - -#define lov_user_md lov_user_md_v1 -struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading */ - }; - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed, __may_alias__)); - -struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading */ - }; - char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed)); - -struct lu_extent { - __u64 e_start; - __u64 e_end; -}; - -#define DEXT "[ %#llx , %#llx )" -#define PEXT(ext) (ext)->e_start, (ext)->e_end - -static inline bool lu_extent_is_overlapped(struct lu_extent *e1, - struct lu_extent *e2) -{ - return e1->e_start < e2->e_end && e2->e_start < e1->e_end; -} - -enum lov_comp_md_entry_flags { - LCME_FL_PRIMARY = 0x00000001, /* Not used */ - LCME_FL_STALE = 0x00000002, /* Not used */ - LCME_FL_OFFLINE = 0x00000004, /* Not used */ - LCME_FL_PREFERRED = 0x00000008, /* Not used */ - LCME_FL_INIT = 0x00000010, /* instantiated */ - LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, - won't be stored on disk */ -}; - -#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT) - -/* lcme_id can be specified as certain flags, and the the first - * bit of lcme_id is used to indicate that the ID is representing - * certain LCME_FL_* but not a real ID. Which implies we can have - * at most 31 flags (see LCME_FL_XXX). 
*/ -enum lcme_id { - LCME_ID_INVAL = 0x0, - LCME_ID_MAX = 0x7FFFFFFF, - LCME_ID_ALL = 0xFFFFFFFF, - LCME_ID_NOT_ID = LCME_FL_NEG -}; - -#define LCME_ID_MASK LCME_ID_MAX - -struct lov_comp_md_entry_v1 { - __u32 lcme_id; /* unique id of component */ - __u32 lcme_flags; /* LCME_FL_XXX */ - struct lu_extent lcme_extent; /* file extent for component */ - __u32 lcme_offset; /* offset of component blob, - start from lov_comp_md_v1 */ - __u32 lcme_size; /* size of component blob */ - __u64 lcme_padding[2]; -} __attribute__((packed)); - -enum lov_comp_md_flags; - -struct lov_comp_md_v1 { - __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ - __u32 lcm_size; /* overall size including this struct */ - __u32 lcm_layout_gen; - __u16 lcm_flags; - __u16 lcm_entry_count; - __u64 lcm_padding1; - __u64 lcm_padding2; - struct lov_comp_md_entry_v1 lcm_entries[0]; -} __attribute__((packed)); - -static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) -{ - if (stripes == (__u16)-1) - stripes = 0; - - if (lmm_magic == LOV_USER_MAGIC_V1) - return sizeof(struct lov_user_md_v1) + - stripes * sizeof(struct lov_user_ost_data_v1); - return sizeof(struct lov_user_md_v3) + - stripes * sizeof(struct lov_user_ost_data_v1); -} - -/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to - * use this. It is unsafe to #define those values in this header as it - * is possible the application has already #included . */ -#ifdef HAVE_LOV_USER_MDS_DATA -#define lov_user_mds_data lov_user_mds_data_v1 -struct lov_user_mds_data_v1 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ -} __attribute__((packed)); - -struct lov_user_mds_data_v3 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ -} __attribute__((packed)); -#endif - -struct lmv_user_mds_data { - struct lu_fid lum_fid; - __u32 lum_padding; - __u32 lum_mds; -}; - -enum lmv_hash_type { - LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ - LMV_HASH_TYPE_ALL_CHARS = 1, - LMV_HASH_TYPE_FNV_1A_64 = 2, - LMV_HASH_TYPE_MAX, -}; - -#define LMV_HASH_NAME_ALL_CHARS "all_char" -#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" - -extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; - -/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, - * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ -#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define lmv_user_md lmv_user_md_v1 -struct lmv_user_md_v1 { - __u32 lum_magic; /* must be the first field */ - __u32 lum_stripe_count; /* dirstripe count */ - __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ - __u32 lum_hash_type; /* Dir stripe policy */ - __u32 lum_type; /* LMV type: default or normal */ - __u32 lum_padding1; - __u32 lum_padding2; - __u32 lum_padding3; - char lum_pool_name[LOV_MAXPOOLNAME + 1]; - struct lmv_user_mds_data lum_objects[0]; -} __attribute__((packed)); - -static inline int lmv_user_md_size(int stripes, int lmm_magic) -{ - return sizeof(struct lmv_user_md) + - stripes * sizeof(struct lmv_user_mds_data); -} - -struct ll_recreate_obj { - __u64 lrc_id; - __u32 lrc_ost_idx; -}; - -struct ll_fid { - __u64 id; /* holds object id */ - __u32 generation; /* holds object generation */ - __u32 f_type; /* holds object type or stripe idx when passing it to - * OST for saving into EA. 
*/ -}; - -#define UUID_MAX 40 -struct obd_uuid { - char uuid[UUID_MAX]; -}; - -static inline bool obd_uuid_equals(const struct obd_uuid *u1, - const struct obd_uuid *u2) -{ - return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; -} - -static inline int obd_uuid_empty(struct obd_uuid *uuid) -{ - return uuid->uuid[0] == '\0'; -} - -static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) -{ - strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); - uuid->uuid[sizeof(*uuid) - 1] = '\0'; -} - -/* For printf's only, make sure uuid is terminated */ -static inline char *obd_uuid2str(const struct obd_uuid *uuid) -{ - if (uuid == NULL) - return NULL; - - if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { - /* Obviously not safe, but for printfs, no real harm done... - we're always null-terminated, even in a race. */ - static char temp[sizeof(*uuid)]; - memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); - temp[sizeof(*uuid) - 1] = '\0'; - return temp; - } - return (char *)(uuid->uuid); -} - -#define LUSTRE_MAXFSNAME 8 - -/* Extract fsname from uuid (or target name) of a target - e.g. (myfs-OST0007_UUID -> myfs) - see also deuuidify. */ -static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) -{ - char *p; - - strncpy(buf, uuid, buflen - 1); - buf[buflen - 1] = '\0'; - p = strrchr(buf, '-'); - if (p != NULL) - *p = '\0'; -} - -/* printf display format for Lustre FIDs - * usage: printf("file FID is "DFID"\n", PFID(fid)); */ -#define FID_NOBRACE_LEN 40 -#define FID_LEN (FID_NOBRACE_LEN + 2) -#define DFID_NOBRACE "%#llx:0x%x:0x%x" -#define DFID "["DFID_NOBRACE"]" -#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver - -/* scanf input parse format for fids in DFID_NOBRACE format - * Need to strip '[' from DFID format first or use "["SFID"]" at caller. 
- * usage: sscanf(fidstr, SFID, RFID(&fid)); */ -#define SFID "0x%llx:0x%x:0x%x" -#define RFID(fid) &((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) - -/********* Quotas **********/ - -#define LUSTRE_QUOTABLOCK_BITS 10 -#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) - -static inline __u64 lustre_stoqb(size_t space) -{ - return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; -} - -#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ -#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ -#define Q_GETOINFO 0x800102 /* get obd quota info */ -#define Q_GETOQUOTA 0x800103 /* get obd quotas */ -#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ - -/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ -#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ -#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ -#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ -#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ -#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ -#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ -/* lustre-specific control commands */ -#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ -#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ - -#define ALLQUOTA 255 /* set all quota */ -static inline char *qtype_name(int qtype) -{ - switch (qtype) { - case USRQUOTA: - return "usr"; - case GRPQUOTA: - return "grp"; - case PRJQUOTA: - return "prj"; - } - return "unknown"; -} - -#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 - -/* permission */ -#define N_PERMS_MAX 64 - -struct perm_downcall_data { - __u64 pdd_nid; - __u32 pdd_perm; - __u32 pdd_padding; -}; - -struct identity_downcall_data { - __u32 idd_magic; - __u32 idd_err; - __u32 idd_uid; - __u32 idd_gid; - __u32 idd_nperms; - __u32 idd_ngroups; - struct perm_downcall_data idd_perms[N_PERMS_MAX]; - __u32 idd_groups[0]; -}; - -#ifdef NEED_QUOTA_DEFS -#ifndef QIF_BLIMITS -#define QIF_BLIMITS 1 -#define QIF_SPACE 2 -#define QIF_ILIMITS 4 -#define QIF_INODES 8 -#define QIF_BTIME 16 -#define QIF_ITIME 32 -#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) -#define QIF_USAGE (QIF_SPACE | QIF_INODES) -#define QIF_TIMES (QIF_BTIME | QIF_ITIME) -#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) -#endif - -#endif /* !__KERNEL__ */ - -/* lustre volatile file support - * file name header: .^L^S^T^R:volatile" - */ -#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" -#define LUSTRE_VOLATILE_HDR_LEN 14 - -typedef enum lustre_quota_version { - LUSTRE_QUOTA_V2 = 1 -} lustre_quota_version_t; - -/* XXX: same as if_dqinfo struct in kernel */ -struct obd_dqinfo { - __u64 dqi_bgrace; - __u64 dqi_igrace; - __u32 dqi_flags; - __u32 dqi_valid; -}; - -/* XXX: same as if_dqblk struct in kernel, plus one padding */ -struct obd_dqblk { - __u64 dqb_bhardlimit; - __u64 dqb_bsoftlimit; - __u64 dqb_curspace; - __u64 dqb_ihardlimit; - __u64 dqb_isoftlimit; - __u64 dqb_curinodes; - __u64 dqb_btime; - __u64 dqb_itime; - __u32 dqb_valid; - __u32 dqb_padding; -}; - -enum { - QC_GENERAL = 0, - QC_MDTIDX = 1, - QC_OSTIDX = 2, - QC_UUID = 3 -}; - -struct if_quotactl { - __u32 qc_cmd; - __u32 qc_type; - __u32 qc_id; - __u32 qc_stat; - __u32 qc_valid; - __u32 qc_idx; - struct obd_dqinfo qc_dqinfo; - struct obd_dqblk qc_dqblk; - char obd_type[16]; - struct obd_uuid obd_uuid; -}; - -/* swap layout flags */ -#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) -#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) -#define 
SWAP_LAYOUTS_KEEP_MTIME (1 << 2) -#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) -#define SWAP_LAYOUTS_CLOSE (1 << 4) - -/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ -#define SWAP_LAYOUTS_MDS_HSM (1 << 31) -struct lustre_swap_layouts { - __u64 sl_flags; - __u32 sl_fd; - __u32 sl_gid; - __u64 sl_dv1; - __u64 sl_dv2; -}; - - -/********* Changelogs **********/ -/** Changelog record types */ -enum changelog_rec_type { - CL_MARK = 0, - CL_CREATE = 1, /* namespace */ - CL_MKDIR = 2, /* namespace */ - CL_HARDLINK = 3, /* namespace */ - CL_SOFTLINK = 4, /* namespace */ - CL_MKNOD = 5, /* namespace */ - CL_UNLINK = 6, /* namespace */ - CL_RMDIR = 7, /* namespace */ - CL_RENAME = 8, /* namespace */ - CL_EXT = 9, /* namespace extended record (2nd half of rename) */ - CL_OPEN = 10, /* not currently used */ - CL_CLOSE = 11, /* may be written to log only with mtime change */ - CL_LAYOUT = 12, /* file layout/striping modified */ - CL_TRUNC = 13, - CL_SETATTR = 14, - CL_XATTR = 15, - CL_HSM = 16, /* HSM specific events, see flags */ - CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ - CL_CTIME = 18, - CL_ATIME = 19, - CL_MIGRATE = 20, - CL_LAST -}; - -static inline const char *changelog_type2str(int type) { - static const char *changelog_str[] = { - "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", - "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", - "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT" - }; - - if (type >= 0 && type < CL_LAST) - return changelog_str[type]; - return NULL; -} - -/* per-record flags */ -#define CLF_FLAGSHIFT 12 -#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) -#define CLF_VERMASK (~CLF_FLAGMASK) -enum changelog_rec_flags { - CLF_VERSION = 0x1000, - CLF_RENAME = 0x2000, - CLF_JOBID = 0x4000, - CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID -}; - - -/* Anything under the flagmask may be per-type (if desired) */ -/* Flags for unlink */ -#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ -#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ - /* HSM cleaning needed */ -/* Flags for rename */ -#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink - * of target */ -#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target - * has an archive in backend */ - -/* Flags for HSM */ -/* 12b used (from high weight to low weight): - * 2b for flags - * 3b for event - * 7b for error code - */ -#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ -#define CLF_HSM_ERR_H 6 -#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ -#define CLF_HSM_EVENT_H 9 -#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ -#define CLF_HSM_FLAG_H 11 -#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ -#define CLF_HSM_SPARE_H 15 -#define CLF_HSM_LAST 15 - -/* Remove bits higher than _h, then extract the value - * between _h and _l by shifting lower weigth to bit 0. 
*/ -#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ - >> (CLF_HSM_LAST - _h + _l)) - -#define CLF_HSM_SUCCESS 0x00 -#define CLF_HSM_MAXERROR 0x7E -#define CLF_HSM_ERROVERFLOW 0x7F - -#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ - -/* 3 bits field => 8 values allowed */ -enum hsm_event { - HE_ARCHIVE = 0, - HE_RESTORE = 1, - HE_CANCEL = 2, - HE_RELEASE = 3, - HE_REMOVE = 4, - HE_STATE = 5, - HE_SPARE1 = 6, - HE_SPARE2 = 7, -}; - -static inline enum hsm_event hsm_get_cl_event(__u16 flags) -{ - return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, - CLF_HSM_EVENT_L); -} - -static inline void hsm_set_cl_event(int *flags, enum hsm_event he) -{ - *flags |= (he << CLF_HSM_EVENT_L); -} - -static inline __u16 hsm_get_cl_flags(int flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); -} - -static inline void hsm_set_cl_flags(int *flags, int bits) -{ - *flags |= (bits << CLF_HSM_FLAG_L); -} - -static inline int hsm_get_cl_error(int flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); -} - -static inline void hsm_set_cl_error(int *flags, int error) -{ - *flags |= (error << CLF_HSM_ERR_L); -} - -enum changelog_send_flag { - /* Not yet implemented */ - CHANGELOG_FLAG_FOLLOW = 0x01, - /* Blocking IO makes sense in case of slow user parsing of the records, - * but it also prevents us from cleaning up if the records are not - * consumed. */ - CHANGELOG_FLAG_BLOCK = 0x02, - /* Pack jobid into the changelog records if available. */ - CHANGELOG_FLAG_JOBID = 0x04, -}; - -#define CR_MAXSIZE cfs_size_round(2 * NAME_MAX + 2 + \ - changelog_rec_offset(CLF_SUPPORTED)) - -/* 31 usable bytes string + null terminator. */ -#define LUSTRE_JOBID_SIZE 32 - -/* This is the minimal changelog record. It can contain extensions - * such as rename fields or process jobid. Its exact content is described - * by the cr_flags. - * - * Extensions are packed in the same order as their corresponding flags. - */ -struct changelog_rec { - __u16 cr_namelen; - __u16 cr_flags; /**< \a changelog_rec_flags */ - __u32 cr_type; /**< \a changelog_rec_type */ - __u64 cr_index; /**< changelog record number */ - __u64 cr_prev; /**< last index for this target fid */ - __u64 cr_time; - union { - lustre_fid cr_tfid; /**< target fid */ - __u32 cr_markerflags; /**< CL_MARK flags */ - }; - lustre_fid cr_pfid; /**< parent fid */ -}; - -/* Changelog extension for RENAME. */ -struct changelog_ext_rename { - lustre_fid cr_sfid; /**< source fid, or zero */ - lustre_fid cr_spfid; /**< source parent fid, or zero */ -}; - -/* Changelog extension to include JOBID. */ -struct changelog_ext_jobid { - char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. 
*/ -}; - - -static inline size_t changelog_rec_offset(enum changelog_rec_flags crf) -{ - size_t size = sizeof(struct changelog_rec); - - if (crf & CLF_RENAME) - size += sizeof(struct changelog_ext_rename); - - if (crf & CLF_JOBID) - size += sizeof(struct changelog_ext_jobid); - - return size; -} - -static inline size_t changelog_rec_size(const struct changelog_rec *rec) -{ - return changelog_rec_offset(rec->cr_flags); -} - -static inline size_t changelog_rec_varsize(const struct changelog_rec *rec) -{ - return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; -} - -static inline -struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; - - return (struct changelog_ext_rename *)((char *)rec + - changelog_rec_offset(crf)); -} - -/* The jobid follows the rename extension, if present */ -static inline -struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME); - - return (struct changelog_ext_jobid *)((char *)rec + - changelog_rec_offset(crf)); -} - -/* The name follows the rename and jobid extensions, if present */ -static inline char *changelog_rec_name(const struct changelog_rec *rec) -{ - return (char *)rec + changelog_rec_offset(rec->cr_flags & - CLF_SUPPORTED); -} - -static inline size_t changelog_rec_snamelen(const struct changelog_rec *rec) -{ - return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; -} - -static inline char *changelog_rec_sname(const struct changelog_rec *rec) -{ - char *cr_name = changelog_rec_name(rec); - - return cr_name + strlen(cr_name) + 1; -} - -/** - * Remap a record to the desired format as specified by the crf flags. - * The record must be big enough to contain the final remapped version. - * Superfluous extension fields are removed and missing ones are added - * and zeroed. The flags of the record are updated accordingly. - * - * The jobid and rename extensions can be added to a record, to match the - * format an application expects, typically. In this case, the newly added - * fields will be zeroed. - * The Jobid field can be removed, to guarantee compatibility with older - * clients that don't expect this field in the records they process. - * - * The following assumptions are being made: - * - CLF_RENAME will not be removed - * - CLF_JOBID will not be added without CLF_RENAME being added too - * - * @param[in,out] rec The record to remap. - * @param[in] crf_wanted Flags describing the desired extensions. 
- */ -static inline void changelog_remap_rec(struct changelog_rec *rec, - enum changelog_rec_flags crf_wanted) -{ - char *jid_mov; - char *rnm_mov; - - crf_wanted &= CLF_SUPPORTED; - - if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) - return; - - /* First move the variable-length name field */ - memmove((char *)rec + changelog_rec_offset(crf_wanted), - changelog_rec_name(rec), rec->cr_namelen); - - /* Locations of jobid and rename extensions in the remapped record */ - jid_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~CLF_JOBID); - rnm_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~(CLF_JOBID | CLF_RENAME)); - - /* Move the extension fields to the desired positions */ - if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) - memmove(jid_mov, changelog_rec_jobid(rec), - sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) - memmove(rnm_mov, changelog_rec_rename(rec), - sizeof(struct changelog_ext_rename)); - - /* Clear newly added fields */ - if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) - memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) - memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); - - /* Update the record's flags accordingly */ - rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; -} - -enum changelog_message_type { - CL_RECORD = 10, /* message is a changelog_rec */ - CL_EOF = 11, /* at end of current changelog */ -}; - -/********* Misc **********/ - -struct ioc_data_version { - __u64 idv_version; - __u64 idv_flags; /* See LL_DV_xxx */ -}; -#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */ -#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */ - -#ifndef offsetof -#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define dot_lustre_name ".lustre" - - -/********* HSM **********/ - -/** HSM per-file state - * See HSM_FLAGS below. - */ -enum hsm_states { - HS_NONE = 0x00000000, - HS_EXISTS = 0x00000001, - HS_DIRTY = 0x00000002, - HS_RELEASED = 0x00000004, - HS_ARCHIVED = 0x00000008, - HS_NORELEASE = 0x00000010, - HS_NOARCHIVE = 0x00000020, - HS_LOST = 0x00000040, -}; - -/* HSM user-setable flags. */ -#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) - -/* Other HSM flags. */ -#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) - -/* - * All HSM-related possible flags that could be applied to a file. - * This should be kept in sync with hsm_states. - */ -#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) - -/** - * HSM request progress state - */ -enum hsm_progress_states { - HPS_WAITING = 1, - HPS_RUNNING = 2, - HPS_DONE = 3, -}; -#define HPS_NONE 0 - -static inline const char *hsm_progress_state2name(enum hsm_progress_states s) -{ - switch (s) { - case HPS_WAITING: return "waiting"; - case HPS_RUNNING: return "running"; - case HPS_DONE: return "done"; - default: return "unknown"; - } -} - -struct hsm_extent { - __u64 offset; - __u64 length; -} __attribute__((packed)); - -/** - * Current HSM states of a Lustre file. - * - * This structure purpose is to be sent to user-space mainly. It describes the - * current HSM flags and in-progress action. - */ -struct hsm_user_state { - /** Current HSM states, from enum hsm_states. 
*/ - __u32 hus_states; - __u32 hus_archive_id; - /** The current undergoing action, if there is one */ - __u32 hus_in_progress_state; - __u32 hus_in_progress_action; - struct hsm_extent hus_in_progress_location; - char hus_extended_info[]; -}; - -struct hsm_state_set_ioc { - struct lu_fid hssi_fid; - __u64 hssi_setmask; - __u64 hssi_clearmask; -}; - -/* - * This structure describes the current in-progress action for a file. - * it is retuned to user space and send over the wire - */ -struct hsm_current_action { - /** The current undergoing action, if there is one */ - /* state is one of hsm_progress_states */ - __u32 hca_state; - /* action is one of hsm_user_action */ - __u32 hca_action; - struct hsm_extent hca_location; -}; - -/***** HSM user requests ******/ -/* User-generated (lfs/ioctl) request types */ -enum hsm_user_action { - HUA_NONE = 1, /* no action (noop) */ - HUA_ARCHIVE = 10, /* copy to hsm */ - HUA_RESTORE = 11, /* prestage */ - HUA_RELEASE = 12, /* drop ost objects */ - HUA_REMOVE = 13, /* remove from archive */ - HUA_CANCEL = 14 /* cancel a request */ -}; - -static inline const char *hsm_user_action2name(enum hsm_user_action a) -{ - switch (a) { - case HUA_NONE: return "NOOP"; - case HUA_ARCHIVE: return "ARCHIVE"; - case HUA_RESTORE: return "RESTORE"; - case HUA_RELEASE: return "RELEASE"; - case HUA_REMOVE: return "REMOVE"; - case HUA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* - * List of hr_flags (bit field) - */ -#define HSM_FORCE_ACTION 0x0001 -/* used by CT, cannot be set by user */ -#define HSM_GHOST_COPY 0x0002 - -/** - * Contains all the fixed part of struct hsm_user_request. - * - */ -struct hsm_request { - __u32 hr_action; /* enum hsm_user_action */ - __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ - __u64 hr_flags; /* request flags */ - __u32 hr_itemcount; /* item count in hur_user_item vector */ - __u32 hr_data_len; -}; - -struct hsm_user_item { - lustre_fid hui_fid; - struct hsm_extent hui_extent; -} __attribute__((packed)); - -struct hsm_user_request { - struct hsm_request hur_request; - struct hsm_user_item hur_user_item[0]; - /* extra data blob at end of struct (after all - * hur_user_items), only use helpers to access it - */ -} __attribute__((packed)); - -/** Return pointer to data field in a hsm user request */ -static inline void *hur_data(struct hsm_user_request *hur) -{ - return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); -} - -/** - * Compute the current length of the provided hsm_user_request. This returns -1 - * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] - * - * return -1 on bounds check error. 
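As a rough illustration of the variable-length layout just described (fixed hsm_request header, then hr_itemcount hsm_user_items, then hr_data_len bytes reachable through hur_data()), the sketch below hand-builds a one-item archive request. The function name, the use of calloc() from <stdlib.h>, and the (__u64)-1 "to end of file" extent length are assumptions rather than requirements of the interface; in practice the llapi_hsm_user_request_alloc() helper declared later in lustreapi.h does the size computation and allocation.

/*
 * Illustrative only: allocate and fill a single-item HUA_ARCHIVE request
 * with no trailing data blob.
 */
static struct hsm_user_request *hur_build_archive(const lustre_fid *fid,
						  __u32 archive_id)
{
	size_t len = sizeof(struct hsm_user_request) +
		     sizeof(struct hsm_user_item);
	struct hsm_user_request *hur = calloc(1, len);

	if (hur == NULL)
		return NULL;

	hur->hur_request.hr_action = HUA_ARCHIVE;
	hur->hur_request.hr_archive_id = archive_id;
	hur->hur_request.hr_itemcount = 1;
	hur->hur_request.hr_data_len = 0;
	hur->hur_user_item[0].hui_fid = *fid;
	hur->hur_user_item[0].hui_extent.offset = 0;
	hur->hur_user_item[0].hui_extent.length = (__u64)-1;
	return hur;
}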
- */ -static inline ssize_t hur_len(struct hsm_user_request *hur) -{ - __u64 size; - - /* can't overflow a __u64 since hr_itemcount is only __u32 */ - size = offsetof(struct hsm_user_request, hur_user_item[0]) + - (__u64)hur->hur_request.hr_itemcount * - sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; - - if (size != (ssize_t)size) - return -1; - - return size; -} - -/****** HSM RPCs to copytool *****/ -/* Message types the copytool may receive */ -enum hsm_message_type { - HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ -}; - -/* Actions the copytool may be instructed to take for a given action_item */ -enum hsm_copytool_action { - HSMA_NONE = 10, /* no action */ - HSMA_ARCHIVE = 20, /* arbitrary offset */ - HSMA_RESTORE = 21, - HSMA_REMOVE = 22, - HSMA_CANCEL = 23 -}; - -static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) -{ - switch (a) { - case HSMA_NONE: return "NOOP"; - case HSMA_ARCHIVE: return "ARCHIVE"; - case HSMA_RESTORE: return "RESTORE"; - case HSMA_REMOVE: return "REMOVE"; - case HSMA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* Copytool item action description */ -struct hsm_action_item { - __u32 hai_len; /* valid size of this struct */ - __u32 hai_action; /* hsm_copytool_action, but use known size */ - lustre_fid hai_fid; /* Lustre FID to operate on */ - lustre_fid hai_dfid; /* fid used for data access */ - struct hsm_extent hai_extent; /* byte range to operate on */ - __u64 hai_cookie; /* action cookie from coordinator */ - __u64 hai_gid; /* grouplock id */ - char hai_data[0]; /* variable length */ -} __attribute__((packed)); - -/** - * helper function which print in hexa the first bytes of - * hai opaque field - * - * \param hai [IN] record to print - * \param buffer [IN,OUT] buffer to write the hex string to - * \param len [IN] max buffer length - * - * \retval buffer - */ -static inline char *hai_dump_data_field(const struct hsm_action_item *hai, - char *buffer, size_t len) -{ - int i; - int data_len; - char *ptr; - - ptr = buffer; - data_len = hai->hai_len - sizeof(*hai); - for (i = 0; (i < data_len) && (len > 2); i++) { - snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); - ptr += 2; - len -= 2; - } - - *ptr = '\0'; - - return buffer; -} - -/* Copytool action list */ -#define HAL_VERSION 1 -#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ -struct hsm_action_list { - __u32 hal_version; - __u32 hal_count; /* number of hai's to follow */ - __u64 hal_compound_id; /* returned by coordinator */ - __u64 hal_flags; - __u32 hal_archive_id; /* which archive backend */ - __u32 padding1; - char hal_fsname[0]; /* null-terminated */ - /* struct hsm_action_item[hal_count] follows, aligned on 8-byte - boundaries. 
See hai_zero */ -} __attribute__((packed)); - -#ifndef HAVE_CFS_SIZE_ROUND -static inline int cfs_size_round (int val) -{ - return (val + 7) & (~0x7); -} -#define HAVE_CFS_SIZE_ROUND -#endif - -/* Return pointer to first hai in action list */ -static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) -{ - return (struct hsm_action_item *)(hal->hal_fsname + - cfs_size_round(strlen(hal-> \ - hal_fsname) - + 1)); -} -/* Return pointer to next hai */ -static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) -{ - return (struct hsm_action_item *)((char *)hai + - cfs_size_round(hai->hai_len)); -} - -/* Return size of an hsm_action_list */ -static inline size_t hal_size(struct hsm_action_list *hal) -{ - __u32 i; - size_t sz; - struct hsm_action_item *hai; - - sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); - hai = hai_first(hal); - for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) - sz += cfs_size_round(hai->hai_len); - - return sz; -} - -/* HSM file import - * describe the attributes to be set on imported file - */ -struct hsm_user_import { - __u64 hui_size; - __u64 hui_atime; - __u64 hui_mtime; - __u32 hui_atime_ns; - __u32 hui_mtime_ns; - __u32 hui_uid; - __u32 hui_gid; - __u32 hui_mode; - __u32 hui_archive_id; -}; - -/* Copytool progress reporting */ -#define HP_FLAG_COMPLETED 0x01 -#define HP_FLAG_RETRY 0x02 - -struct hsm_progress { - lustre_fid hp_fid; - __u64 hp_cookie; - struct hsm_extent hp_extent; - __u16 hp_flags; - __u16 hp_errval; /* positive val */ - __u32 padding; -}; - -struct hsm_copy { - __u64 hc_data_version; - __u16 hc_flags; - __u16 hc_errval; /* positive val */ - __u32 padding; - struct hsm_action_item hc_hai; -}; - -/* JSON objects */ -enum llapi_json_types { - LLAPI_JSON_INTEGER = 1, - LLAPI_JSON_BIGNUM, - LLAPI_JSON_REAL, - LLAPI_JSON_STRING -}; - -struct llapi_json_item { - char *lji_key; - __u32 lji_type; - union { - int lji_integer; - __u64 lji_u64; - double lji_real; - char *lji_string; - }; - struct llapi_json_item *lji_next; -}; - -struct llapi_json_item_list { - int ljil_item_count; - struct llapi_json_item *ljil_items; -}; - -enum lu_ladvise_type { - LU_LADVISE_INVALID = 0, - LU_LADVISE_WILLREAD = 1, - LU_LADVISE_DONTNEED = 2, -}; - -#define LU_LADVISE_NAMES { \ - [LU_LADVISE_WILLREAD] = "willread", \ - [LU_LADVISE_DONTNEED] = "dontneed", \ -} - -/* This is the userspace argument for ladvise. It is currently the same as - * what goes on the wire (struct lu_ladvise), but is defined separately as we - * may need info which is only used locally. */ -struct llapi_lu_ladvise { - __u16 lla_advice; /* advice type */ - __u16 lla_value1; /* values for different advice types */ - __u32 lla_value2; - __u64 lla_start; /* first byte of extent for advice */ - __u64 lla_end; /* last byte of extent for advice */ - __u32 lla_value3; - __u32 lla_value4; -}; - -enum ladvise_flag { - LF_ASYNC = 0x00000001, -}; - -#define LADVISE_MAGIC 0x1ADF1CE0 -#define LF_MASK LF_ASYNC - -/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which - * is used on the wire. It is defined separately as we may need info which is - * only used locally. 
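A minimal sketch of the iteration pattern the hai_first()/hai_next() helpers above support (illustrative only: the function name is hypothetical, <stdio.h> is assumed, and the action list would normally come from llapi_hsm_copytool_recv(), declared further down in lustreapi.h):

/*
 * Illustrative only: walk every action item in a received list using the
 * helpers defined above.
 */
static void hal_walk(struct hsm_action_list *hal)
{
	struct hsm_action_item *hai = hai_first(hal);
	__u32 i;

	for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai))
		printf("item %u: action %s, %u bytes\n", i,
		       hsm_copytool_action2name(hai->hai_action),
		       hai->hai_len);
}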
*/ -struct llapi_ladvise_hdr { - __u32 lah_magic; /* LADVISE_MAGIC */ - __u32 lah_count; /* number of advices */ - __u64 lah_flags; /* from enum ladvise_flag */ - __u32 lah_value1; /* unused */ - __u32 lah_value2; /* unused */ - __u64 lah_value3; /* unused */ - struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ -}; - -#define LAH_COUNT_MAX (1024) - -/* Shared key */ -enum sk_crypt_alg { - SK_CRYPT_INVALID = -1, - SK_CRYPT_EMPTY = 0, - SK_CRYPT_AES256_CTR = 1, - SK_CRYPT_MAX = 2, -}; - -enum sk_hmac_alg { - SK_HMAC_INVALID = -1, - SK_HMAC_EMPTY = 0, - SK_HMAC_SHA256 = 1, - SK_HMAC_SHA512 = 2, - SK_HMAC_MAX = 3, -}; - -struct sk_crypt_type { - char *sct_name; - size_t sct_bytes; -}; - -struct sk_hmac_type { - char *sht_name; - size_t sht_bytes; -}; - -/** @} lustreuser */ -#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h index 67df286a5c358..518a00d089e36 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,9 +38,17 @@ * @{ */ +#include #include #include -#include +#include +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif #ifndef LL_MAXQUOTAS #define LL_MAXQUOTAS 3 @@ -50,8 +58,17 @@ #define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) #endif -extern bool liblustreapi_initialized; +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif +#define lustre_fid struct lu_fid + +/* Currently external applications can access this but in the + * future this will no longer be exposed for the user. Instead + * if you want to know if the library is initialized just call + * llapi_liblustreapi_initialized() which is now available. */ +extern bool liblustreapi_initialized; typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args); @@ -71,6 +88,10 @@ enum llapi_message_level { typedef void (*llapi_log_callback_t)(enum llapi_message_level level, int err, const char *fmt, va_list ap); +static inline bool llapi_liblustreapi_initialized(void) +{ + return liblustreapi_initialized; +} /* the bottom three bits reserved for llapi_message_level */ #define LLAPI_MSG_MASK 0x00000007 @@ -87,10 +108,11 @@ static inline const char *llapi_msg_level2str(enum llapi_message_level level) return levels[level]; } -extern void llapi_msg_set_level(int level); + +void llapi_msg_set_level(int level); int llapi_msg_get_level(void); -extern llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); -extern llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); +llapi_log_callback_t llapi_error_callback_set(llapi_log_callback_t cb); +llapi_log_callback_t llapi_info_callback_set(llapi_log_callback_t cb); void llapi_error(enum llapi_message_level level, int err, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); @@ -110,53 +132,64 @@ struct llapi_stripe_param { __u32 lsp_osts[0]; }; -extern int llapi_file_open_param(const char *name, int flags, mode_t mode, - const struct llapi_stripe_param *param); -extern int llapi_file_create(const char *name, unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern); -extern int llapi_file_open(const char *name, int flags, int mode, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern); -extern int llapi_file_create_pool(const char *name, - unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern, char *pool_name); -extern int llapi_file_open_pool(const char *name, int flags, int mode, - unsigned long long stripe_size, - int stripe_offset, int stripe_count, - int stripe_pattern, char *pool_name); -extern int llapi_poollist(const char *name); -extern int llapi_get_poollist(const char *name, char **poollist, int list_size, - char *buffer, int buffer_size); -extern int llapi_get_poolmembers(const char *poolname, char **members, - int list_size, char *buffer, int buffer_size); -extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); -extern int llapi_file_lookup(int dirfd, const char *name); - -#define VERBOSE_COUNT 0x1 -#define VERBOSE_SIZE 0x2 -#define VERBOSE_OFFSET 0x4 -#define VERBOSE_POOL 0x8 -#define VERBOSE_DETAIL 0x10 -#define VERBOSE_OBJID 0x20 -#define VERBOSE_GENERATION 0x40 -#define VERBOSE_MDTINDEX 0x80 -#define VERBOSE_LAYOUT 0x100 -#define VERBOSE_COMP_COUNT 0x200 -#define VERBOSE_COMP_FLAGS 0x400 -#define VERBOSE_COMP_START 0x800 -#define VERBOSE_COMP_END 0x1000 -#define VERBOSE_COMP_ID 0x2000 -#define VERBOSE_DFID 0x4000 -#define VERBOSE_HASH_TYPE 0x8000 -#define VERBOSE_DEFAULT (VERBOSE_COUNT | VERBOSE_SIZE | \ - VERBOSE_OFFSET | VERBOSE_POOL | \ - VERBOSE_OBJID | VERBOSE_GENERATION | \ - VERBOSE_LAYOUT | VERBOSE_HASH_TYPE | \ - VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | \ - VERBOSE_COMP_START | VERBOSE_COMP_END | \ - VERBOSE_COMP_ID) +#define lsp_tgts lsp_osts + +int llapi_file_open_param(const char *name, int flags, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, int stripe_pattern); +int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +int llapi_file_create_pool(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); +int llapi_poollist(const char *name); +int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +int llapi_get_poolmembers(const char *poolname, char **members, int list_size, + char *buffer, int buffer_size); +int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +int llapi_file_lookup(int dirfd, const char *name); +void llapi_set_command_name(const char *cmd); +void llapi_clear_command_name(void); + +enum llapi_layout_verbose { + VERBOSE_STRIPE_COUNT = 0x1, + VERBOSE_STRIPE_SIZE = 0x2, + VERBOSE_STRIPE_OFFSET = 0x4, + VERBOSE_POOL = 0x8, + VERBOSE_DETAIL = 0x10, + VERBOSE_OBJID = 0x20, + VERBOSE_GENERATION = 0x40, + 
VERBOSE_MDTINDEX = 0x80, + VERBOSE_PATTERN = 0x100, + VERBOSE_COMP_COUNT = 0x200, + VERBOSE_COMP_FLAGS = 0x400, + VERBOSE_COMP_START = 0x800, + VERBOSE_COMP_END = 0x1000, + VERBOSE_COMP_ID = 0x2000, + VERBOSE_DFID = 0x4000, + VERBOSE_HASH_TYPE = 0x8000, + VERBOSE_MIRROR_COUNT = 0x10000, + VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | + VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | + VERBOSE_OBJID | VERBOSE_GENERATION | + VERBOSE_PATTERN | VERBOSE_HASH_TYPE | + VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | + VERBOSE_COMP_START | VERBOSE_COMP_END | + VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | + VERBOSE_MIRROR_ID +}; +/* Compatibility with original names */ +#define VERBOSE_SIZE VERBOSE_STRIPE_SIZE +#define VERBOSE_COUNT VERBOSE_STRIPE_COUNT +#define VERBOSE_OFFSET VERBOSE_STRIPE_OFFSET +#define VERBOSE_LAYOUT VERBOSE_PATTERN struct find_param { unsigned int fp_max_depth; @@ -179,7 +212,11 @@ struct find_param { fp_comp_start_sign:2, fp_comp_end_sign:2, fp_comp_count_sign:2, - fp_mdt_count_sign:2; + fp_mirror_count_sign:2, + fp_mirror_index_sign:2, + fp_mirror_id_sign:2, + fp_mdt_count_sign:2, + fp_blocks_sign:2; unsigned long long fp_size; unsigned long long fp_size_units; @@ -214,21 +251,30 @@ struct find_param { fp_exclude_projid:1, fp_check_comp_count:1, fp_exclude_comp_count:1, + fp_check_mirror_count:1, + fp_exclude_mirror_count:1, fp_check_comp_flags:1, - fp_exclude_comp_flags:1, + fp_check_mirror_state:1, fp_check_comp_start:1, fp_exclude_comp_start:1, fp_check_comp_end:1, fp_exclude_comp_end:1, fp_check_comp_id:1, fp_exclude_comp_id:1, + fp_check_mirror_id:1, + fp_exclude_mirror_id:1, + fp_check_mirror_index:1, + fp_exclude_mirror_index:1, fp_check_mdt_count:1, fp_exclude_mdt_count:1, fp_check_hash_type:1, fp_exclude_hash_type:1, - fp_yaml:1; /* output layout in YAML */ + fp_yaml:1, /* output layout in YAML */ + fp_check_blocks:1, + fp_exclude_blocks:1, + fp_lazy:1; - int fp_verbose; + enum llapi_layout_verbose fp_verbose; int fp_quiet; /* regular expression */ @@ -261,14 +307,22 @@ struct find_param { __u32 fp_layout; __u32 fp_comp_count; + __u32 fp_mirror_count; __u32 fp_comp_flags; + __u32 fp_comp_neg_flags; + __u16 fp_mirror_state; + __u16 fp_mirror_neg_state; __u32 fp_comp_id; + __u16 fp_mirror_id; + __u16 fp_mirror_index; unsigned long long fp_comp_start; unsigned long long fp_comp_start_units; unsigned long long fp_comp_end; unsigned long long fp_comp_end_units; unsigned long long fp_mdt_count; unsigned fp_projid; + unsigned long long fp_blocks; + unsigned long long fp_blocks_units; /* In-process parameters. 
*/ unsigned long fp_got_uuids:1, @@ -277,104 +331,123 @@ struct find_param { unsigned int fp_hash_type; }; -extern int llapi_ostlist(char *path, struct find_param *param); -extern int llapi_uuid_match(char *real_uuid, char *search_uuid); -extern int llapi_getstripe(char *path, struct find_param *param); -extern int llapi_find(char *path, struct find_param *param); - -extern int llapi_file_fget_mdtidx(int fd, int *mdtidx); -extern int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, - int stripe_count, int stripe_pattern, - const char *pool_name); -extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, - int stripe_count, int stripe_pattern, - const char *poolname); +int llapi_ostlist(char *path, struct find_param *param); +int llapi_uuid_match(char *real_uuid, char *search_uuid); +int llapi_getstripe(char *path, struct find_param *param); +int llapi_find(char *path, struct find_param *param); + +int llapi_file_fget_mdtidx(int fd, int *mdtidx); +int llapi_dir_set_default_lmv(const char *name, + const struct llapi_stripe_param *param); +int llapi_dir_set_default_lmv_stripe(const char *name, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *pool_name); +int llapi_dir_create(const char *name, mode_t mode, + const struct llapi_stripe_param *param); +int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + const char *poolname); int llapi_direntry_remove(char *dname); int llapi_obd_fstatfs(int fd, __u32 type, __u32 index, struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); -extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, - struct obd_statfs *stat_buf, - struct obd_uuid *uuid_buf); -extern int llapi_ping(char *obd_type, char *obd_name); -extern int llapi_target_check(int num_types, char **obd_types, char *dir); -extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); -extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); -extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); -extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); -extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); -extern int llapi_is_lustre_mnttype(const char *type); -extern int llapi_search_ost(char *fsname, char *poolname, char *ostname); -extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); -extern int llapi_parse_size(const char *optarg, unsigned long long *size, - unsigned long long *size_units, int bytes_spec); -extern int llapi_search_mounts(const char *pathname, int index, - char *mntdir, char *fsname); -extern int llapi_search_fsname(const char *pathname, char *fsname); -extern int llapi_getname(const char *path, char *buf, size_t size); -extern int llapi_search_fileset(const char *pathname, char *fileset); - -extern int llapi_search_rootpath(char *pathname, const char *fsname); -extern int llapi_nodemap_exists(const char *name); -extern int llapi_migrate_mdt(char *path, struct find_param *param); -extern int llapi_mv(char *path, struct find_param *param); +int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, struct obd_uuid *uuid_buf); +int llapi_ping(char *obd_type, char *obd_name); +int llapi_target_check(int num_types, char **obd_types, char *dir); +int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); 
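A usage sketch for the file-creation declarations above (illustrative only: the helper name and striping values are made up, <stdio.h>/<string.h> are assumed, and the negative-errno return convention is assumed from common liblustreapi practice):

/*
 * Illustrative only: create an empty file striped over 4 OSTs with a
 * 1 MiB stripe size.  -1 lets the MDS choose the starting OST and 0
 * selects the default (RAID0) pattern.
 */
static int create_striped_file(const char *path)
{
	int rc = llapi_file_create(path, 1048576ULL, -1, 4, 0);

	if (rc < 0)
		fprintf(stderr, "llapi_file_create: %s\n", strerror(-rc));
	return rc;
}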
+int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +int llapi_is_lustre_mnttype(const char *type); +int llapi_search_tgt(char *fsname, char *poolname, char *tgtname, bool is_mdt); +int llapi_search_ost(char *fsname, char *poolname, char *ostname); +int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +int llapi_parse_size(const char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +int llapi_search_mounts(const char *pathname, int index, char *mntdir, + char *fsname); +int llapi_search_fsname(const char *pathname, char *fsname); +int llapi_getname(const char *path, char *buf, size_t size); +int llapi_search_fileset(const char *pathname, char *fileset); + +int llapi_search_rootpath(char *pathname, const char *fsname); +int llapi_nodemap_exists(const char *name); +int llapi_migrate_mdt(char *path, struct find_param *param); +int llapi_mv(char *path, struct find_param *param); struct mntent; + #define HAVE_LLAPI_IS_LUSTRE_MNT -extern int llapi_is_lustre_mnt(struct mntent *mnt); -extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl); -extern int llapi_target_iterate(int type_num, char **obd_type, void *args, - llapi_cb_t cb); -extern int llapi_get_connect_flags(const char *mnt, __u64 *flags); -extern int llapi_cp(int argc, char *argv[]); -extern int llapi_ls(int argc, char *argv[]); -extern int llapi_fid2path(const char *device, const char *fidstr, char *path, - int pathlen, long long *recno, int *linkno); -extern int llapi_path2fid(const char *path, lustre_fid *fid); -extern int llapi_get_mdt_index_by_fid(int fd, const lustre_fid *fid, - int *mdt_index); -extern int llapi_fd2fid(const int fd, lustre_fid *fid); +int llapi_is_lustre_mnt(struct mntent *mnt); +int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +int llapi_get_connect_flags(const char *mnt, __u64 *flags); +int llapi_cp(int argc, char *argv[]); +int llapi_ls(int argc, char *argv[]); +int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +int llapi_path2fid(const char *path, struct lu_fid *fid); +int llapi_get_mdt_index_by_fid(int fd, const struct lu_fid *fid, + int *mdt_index); +int llapi_get_lum_file(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_dir(const char *path, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); +int llapi_get_lum_file_fd(int dir_fd, const char *fname, __u64 *valid, + lstatx_t *statx, struct lov_user_md *lum, + size_t lumsize); +int llapi_get_lum_dir_fd(int dir_fd, __u64 *valid, lstatx_t *statx, + struct lov_user_md *lum, size_t lumsize); + +int llapi_fd2fid(int fd, struct lu_fid *fid); /* get FID of parent dir + the related name of entry in this parent dir */ -extern int llapi_path2parent(const char *path, unsigned int linkno, - lustre_fid *parent_fid, char *name, - size_t name_size); -extern int llapi_fd2parent(int fd, unsigned int linkno, - lustre_fid *parent_fid, char *name, - size_t name_size); -extern int llapi_chomp_string(char *buf); -extern int llapi_open_by_fid(const char *dir, const lustre_fid *fid, - int open_flags); - -extern int llapi_get_version_string(char *version, unsigned int version_size); +int llapi_path2parent(const char *path, 
unsigned int linkno, + struct lu_fid *parent_fid, char *name, size_t name_size); +int llapi_fd2parent(int fd, unsigned int linkno, struct lu_fid *parent_fid, + char *name, size_t name_size); +int llapi_rmfid(const char *path, struct fid_array *fa); +int llapi_chomp_string(char *buf); +int llapi_open_by_fid(const char *dir, const struct lu_fid *fid, + int open_flags); +int llapi_get_version_string(char *version, unsigned int version_size); /* llapi_get_version() is deprecated, use llapi_get_version_string() instead */ -extern int llapi_get_version(char *buffer, int buffer_size, char **version) +int llapi_get_version(char *buffer, int buffer_size, char **version) __attribute__((deprecated)); -extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); -extern int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); -extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); -extern int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, - __u32 archive_id); -extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, - __u32 archive_id); -extern int llapi_hsm_register_event_fifo(const char *path); -extern int llapi_hsm_unregister_event_fifo(const char *path); -extern void llapi_hsm_log_error(enum llapi_message_level level, int _rc, - const char *fmt, va_list args); - -extern int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); -extern int llapi_create_volatile_idx(char *directory, int idx, int mode); -static inline int llapi_create_volatile(char *directory, int mode) +int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +int llapi_file_flush(int fd); +extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version); +int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus); +int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); +int llapi_hsm_register_event_fifo(const char *path); +int llapi_hsm_unregister_event_fifo(const char *path); +void llapi_hsm_log_error(enum llapi_message_level level, int _rc, + const char *fmt, va_list args); + +int llapi_get_agent_uuid(char *path, char *buf, size_t bufsize); +int llapi_create_volatile_idx(const char *directory, int mdt_idx, + int open_flags); +int llapi_create_volatile_param(const char *directory, int mdt_idx, + int open_flags, mode_t mode, + const struct llapi_stripe_param *stripe_param); + +static inline int llapi_create_volatile(char *directory, int open_flags) { - return llapi_create_volatile_idx(directory, -1, mode); + return llapi_create_volatile_idx(directory, -1, open_flags); } -extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, - int gid, __u64 flags); -extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, - __u64 flags); -extern int llapi_swap_layouts(const char *path1, const char *path2, - __u64 dv1, __u64 dv2, __u64 flags); +int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2, + int gid, __u64 flags); +int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags); +int llapi_swap_layouts(const char *path1, const char *path2, __u64 dv1, + __u64 dv2, __u64 flags); /* Changelog interface. 
priv is private state, managed internally by these * functions */ @@ -384,15 +457,18 @@ extern int llapi_swap_layouts(const char *path1, const char *path2, * converted to extended format in the lustre api to ease changelog analysis. */ #define HAVE_CHANGELOG_EXTEND_REC 1 -extern int llapi_changelog_start(void **priv, enum changelog_send_flag flags, - const char *mdtname, long long startrec); -extern int llapi_changelog_fini(void **priv); -extern int llapi_changelog_recv(void *priv, struct changelog_rec **rech); -extern int llapi_changelog_free(struct changelog_rec **rech); -extern int llapi_changelog_get_fd(void *priv); +int llapi_changelog_start(void **priv, enum changelog_send_flag flags, + const char *mdtname, long long startrec); +int llapi_changelog_fini(void **priv); +int llapi_changelog_recv(void *priv, struct changelog_rec **rech); +int llapi_changelog_in_buf(void *priv); +int llapi_changelog_free(struct changelog_rec **rech); +int llapi_changelog_get_fd(void *priv); /* Allow records up to endrec to be destroyed; requires registered id. */ -extern int llapi_changelog_clear(const char *mdtname, const char *idstr, - long long endrec); +int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); +extern int llapi_changelog_set_xflags(void *priv, + enum changelog_send_extra_flag extra_flags); /* HSM copytool interface. * priv is private state, managed internally by these functions @@ -400,52 +476,51 @@ extern int llapi_changelog_clear(const char *mdtname, const char *idstr, struct hsm_copytool_private; struct hsm_copyaction_private; -extern int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, - const char *mnt, int archive_count, - int *archives, int rfd_flags); -extern int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); -extern int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); -extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, - struct hsm_action_list **hal, int *msgsize); -extern int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, - const struct hsm_copytool_private *ct, - const struct hsm_action_item *hai, - int restore_mdt_index, int restore_open_flags, - bool is_error); -extern int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, - const struct hsm_extent *he, - int hp_flags, int errval); -extern int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, - const struct hsm_extent *he, __u64 total, - int hp_flags); -extern int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, - lustre_fid *fid); -extern int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); -extern int llapi_hsm_import(const char *dst, int archive, const struct stat *st, - unsigned long long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern, - char *pool_name, lustre_fid *newfid); +int llapi_hsm_copytool_register(struct hsm_copytool_private **priv, + const char *mnt, int archive_count, + int *archives, int rfd_flags); +int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv); +int llapi_hsm_copytool_get_fd(struct hsm_copytool_private *ct); +int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp, + const struct hsm_copytool_private *ct, + const struct hsm_action_item *hai, + int restore_mdt_index, int restore_open_flags, + bool is_error); +int llapi_hsm_action_end(struct hsm_copyaction_private **phcp, + const 
struct hsm_extent *he, int hp_flags, int errval); +int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp, + const struct hsm_extent *he, __u64 total, + int hp_flags); +int llapi_hsm_action_get_dfid(const struct hsm_copyaction_private *hcp, + struct lu_fid *fid); +int llapi_hsm_action_get_fd(const struct hsm_copyaction_private *hcp); +int llapi_hsm_import(const char *dst, int archive, const struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name, + struct lu_fid *newfid); /* HSM user interface */ -extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, - int data_len); -extern int llapi_hsm_request(const char *path, - const struct hsm_user_request *request); -extern int llapi_hsm_current_action(const char *path, - struct hsm_current_action *hca); +struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +int llapi_hsm_request(const char *path, const struct hsm_user_request *request); +int llapi_hsm_current_action(const char *path, struct hsm_current_action *hca); /* JSON handling */ -extern int llapi_json_init_list(struct llapi_json_item_list **item_list); -extern int llapi_json_destroy_list(struct llapi_json_item_list **item_list); -extern int llapi_json_add_item(struct llapi_json_item_list **item_list, - char *key, __u32 type, void *val); -extern int llapi_json_write_list(struct llapi_json_item_list **item_list, - FILE *fp); +int llapi_json_init_list(struct llapi_json_item_list **item_list); +int llapi_json_destroy_list(struct llapi_json_item_list **item_list); +int llapi_json_add_item(struct llapi_json_item_list **item_list, char *key, + __u32 type, void *val); +int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp); /* File lease */ -extern int llapi_lease_get(int fd, int mode); -extern int llapi_lease_check(int fd); -extern int llapi_lease_put(int fd); +int llapi_lease_acquire(int fd, enum ll_lease_mode mode); +int llapi_lease_release(int fd); +int llapi_lease_set(int fd, const struct ll_ioc_lease *data); +int llapi_lease_check(int fd); +int llapi_lease_get(int fd, int mode); /* obsoleted */ +int llapi_lease_put(int fd); /* obsoleted */ /* Group lock */ int llapi_group_lock(int fd, int gid); @@ -458,9 +533,33 @@ int llapi_ladvise(int fd, unsigned long long flags, int num_advise, /* llapi_layout user interface */ +/** + * An array element storing component info to be resynced during mirror + * resynchronization. + */ +struct llapi_resync_comp { + uint64_t lrc_start; + uint64_t lrc_end; + uint32_t lrc_mirror_id; + uint32_t lrc_id; /* component id */ + bool lrc_synced; +}; + /** Opaque data type abstracting the layout of a Lustre file. */ struct llapi_layout; +int llapi_mirror_truncate(int fd, unsigned int id, off_t length); +ssize_t llapi_mirror_write(int fd, unsigned int id, const void *buf, + size_t count, off_t pos); +uint32_t llapi_mirror_find(struct llapi_layout *layout, + uint64_t file_start, uint64_t file_end, + uint64_t *endp); +int llapi_mirror_find_stale(struct llapi_layout *layout, + struct llapi_resync_comp *comp, size_t comp_size, + __u16 *mirror_ids, int ids_nr); +int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, + struct llapi_resync_comp *comp_array, + int comp_size, uint64_t start, uint64_t end); /* * Flags to control how layouts are retrieved. 
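The copytool registration and receive calls declared above compose into the skeleton below (illustrative only: the function name is hypothetical, an archive_count of 0 with a NULL archive list is assumed to mean "serve all archives", an rfd_flags of 0 is assumed, and the per-action llapi_hsm_action_begin()/llapi_hsm_action_end() handling is omitted):

/*
 * Illustrative only: minimal copytool main loop.
 */
static int copytool_loop(const char *mnt)
{
	struct hsm_copytool_private *ct;
	struct hsm_action_list *hal;
	int msgsize;
	int rc;

	rc = llapi_hsm_copytool_register(&ct, mnt, 0, NULL, 0);
	if (rc < 0)
		return rc;

	while ((rc = llapi_hsm_copytool_recv(ct, &hal, &msgsize)) == 0) {
		/* hand the hal->hal_count action items to workers here */
	}

	llapi_hsm_copytool_unregister(&ct);
	return rc;
}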
*/ @@ -487,8 +586,8 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); /** * Return a pointer to a newly-allocated opaque data type containing the - * layout for the file associated with Lustre file identifier string - * \a fidstr. The string \a path must name a path within the + * layout for the file associated with Lustre file identifier + * \a fid. The string \a path must name a path within the * filesystem that contains the file being looked up, such as the * filesystem root. The returned pointer should be freed with * llapi_layout_free() when it is no longer needed. Failure is @@ -496,9 +595,35 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags); * stored in errno. */ struct llapi_layout *llapi_layout_get_by_fid(const char *path, - const lustre_fid *fid, + const struct lu_fid *fid, uint32_t flags); +enum llapi_layout_xattr_flags { + LLAPI_LXF_CHECK = 0x0001, + LLAPI_LXF_COPY = 0x0002, +}; + +/** + * Return a pointer to a newly-allocated opaque data type containing the + * layout for the file associated with extended attribute \a lov_xattr. The + * length of the extended attribute is \a lov_xattr_size. The \a lov_xattr + * should be raw xattr without being swapped, since this function will swap it + * properly. Thus, \a lov_xattr will be modified during the process. If the + * \a LLAPI_LXF_CHECK flag of \a flags is set, this function will check whether + * the objects count in lum is consistent with the stripe count in lum. This + * check only apply to regular file, so \a LLAPI_LXF_CHECK flag should be + * cleared if the xattr belongs to a directory. If the \a LLAPI_LXF_COPY flag + * of \a flags is set, this function will use a temporary buffer for byte + * swapping when necessary, leaving \a lov_xattr untouched. Otherwise, the byte + * swapping will be done to the \a lov_xattr buffer directly. The returned + * pointer should be freed with llapi_layout_free() when it is no longer + * needed. Failure is * indicated with a NULL return value and an appropriate + * error code stored in errno. + */ +struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr, + ssize_t lov_xattr_size, + uint32_t flags); + /** * Allocate a new layout. Use this when creating a new file with * llapi_layout_file_create(). @@ -510,6 +635,19 @@ struct llapi_layout *llapi_layout_alloc(void); */ void llapi_layout_free(struct llapi_layout *layout); +/** + * llapi_layout_merge() - Merge a composite layout into another one. + * @dst_layout: Destination composite layout. + * @src_layout: Source composite layout. + * + * This function copies all of the components from @src_layout and + * appends them to @dst_layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_merge(struct llapi_layout **dst_layout, + const struct llapi_layout *src_layout); + /** Not a valid stripe size, offset, or RAID pattern. */ #define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL @@ -531,7 +669,8 @@ void llapi_layout_free(struct llapi_layout *layout); * stored using RAID0. That is, data will be split evenly and without * redundancy across all OSTs in the layout. */ -#define LLAPI_LAYOUT_RAID0 0 +#define LLAPI_LAYOUT_RAID0 0ULL +#define LLAPI_LAYOUT_MDT 2ULL /** * The layout includes a specific set of OSTs on which to allocate. @@ -731,6 +870,39 @@ int llapi_layout_file_open(const char *path, int open_flags, mode_t mode, int llapi_layout_file_create(const char *path, int open_flags, int mode, const struct llapi_layout *layout); +/** + * Set flags to the header of component layout. 
+ */ +int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags); +int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags); +const char *llapi_layout_flags_string(uint32_t flags); +const __u16 llapi_layout_string_flags(char *string); + +/** + * llapi_layout_mirror_count_get() - Get mirror count from the header of + * a layout. + * @layout: Layout to get mirror count from. + * @count: Returned mirror count value. + * + * This function gets mirror count from the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_get(struct llapi_layout *layout, + uint16_t *count); + +/** + * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout. + * @layout: Layout to set mirror count in. + * @count: Mirror count value to be set. + * + * This function sets mirror count to the header of a layout. + * + * Return: 0 on success or -1 on failure. + */ +int llapi_layout_mirror_count_set(struct llapi_layout *layout, + uint16_t count); + /** * Fetch the start and end offset of the current layout component. */ @@ -748,12 +920,10 @@ static const struct comp_flag_name { const char *cfn_name; } comp_flags_table[] = { { LCME_FL_INIT, "init" }, - /* For now, only "init" is supported - { LCME_FL_PRIMARY, "primary" }, { LCME_FL_STALE, "stale" }, + { LCME_FL_PREF_RW, "prefer" }, { LCME_FL_OFFLINE, "offline" }, - { LCME_FL_PREFERRED, "preferred" } - */ + { LCME_FL_NOSYNC, "nosync" }, }; /** @@ -773,10 +943,18 @@ int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); * Fetches the file-unique component ID of the current layout component. */ int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); +/** + * Fetches the mirror ID of the current layout component. + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); /** * Adds one component to the existing composite or plain layout. */ int llapi_layout_comp_add(struct llapi_layout *layout); +/** + * Adds a first component of a mirror to the existing composite layout. + */ +int llapi_layout_add_first_comp(struct llapi_layout *layout); /** * Deletes the current layout component from the composite layout. */ @@ -813,10 +991,59 @@ int llapi_layout_file_comp_del(const char *path, uint32_t id, uint32_t flags); * attributes are passed in by @comp and @valid is used to specify which * attributes in the component are going to be changed. */ -int llapi_layout_file_comp_set(const char *path, - const struct llapi_layout *comp, - uint32_t valid); +int llapi_layout_file_comp_set(const char *path, uint32_t *ids, uint32_t *flags, + size_t count); +/** + * Check if the file layout is composite. + */ +bool llapi_layout_is_composite(struct llapi_layout *layout); + +enum { + LLAPI_LAYOUT_ITER_CONT = 0, + LLAPI_LAYOUT_ITER_STOP = 1, +}; + +/** + * Iteration callback function. 
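A short usage sketch for the layout getters above (illustrative only: the function name is hypothetical, a flags value of 0 is assumed to request default behaviour, NULL-plus-errno on failure is assumed to apply to llapi_layout_get_by_fd() as documented for the other getters, and <stdio.h>/<errno.h> are assumed):

/*
 * Illustrative only: fetch the layout of an open file, query one
 * attribute through the opaque handle, and release it.
 */
static int print_mirror_count(int fd)
{
	struct llapi_layout *layout = llapi_layout_get_by_fd(fd, 0);
	uint16_t count;
	int rc;

	if (layout == NULL)
		return -errno;

	rc = llapi_layout_mirror_count_get(layout, &count);
	if (rc == 0)
		printf("mirror count: %hu\n", count);

	llapi_layout_free(layout);
	return rc;
}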
+ * + * \retval LLAPI_LAYOUT_ITER_CONT Iteration proceeds + * \retval LLAPI_LAYOUT_ITER_STOP Stop iteration + * \retval < 0 error code + */ +typedef int (*llapi_layout_iter_cb)(struct llapi_layout *layout, void *cbdata); + +/** + * Iterate all components in the corresponding layout + */ +int llapi_layout_comp_iterate(struct llapi_layout *layout, + llapi_layout_iter_cb cb, void *cbdata); + +/** + * FLR: mirror operation APIs + */ +int llapi_mirror_set(int fd, unsigned int id); +int llapi_mirror_clear(int fd); +ssize_t llapi_mirror_read(int fd, unsigned int id, + void *buf, size_t count, off_t pos); +ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, + off_t pos, size_t count); + +int llapi_param_get_paths(const char *pattern, glob_t *paths); +int llapi_param_get_value(const char *path, char **buf, size_t *buflen); +void llapi_param_paths_free(glob_t *paths); + +/* MDLL */ +int llapi_dir_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name); + +void llapi_hsm_action_begin_restore_dir(struct hsm_copytool_private *ct); /** @} llapi */ +#if defined(__cplusplus) +} +#endif + #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h index beab4a225119f..933d09ab4ef1f 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_acl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_acl.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h index 231eae97972ee..df6f78bb4b29b 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_barrier.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * lustre/include/lustre_barrier.h * diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 2b14937780e6a..71af35a8f839e 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,15 +33,19 @@ #ifndef _LUSTRE_COMPAT_H #define _LUSTRE_COMPAT_H +#include +#include #include #include #include #include #include +#include +#include #include #include -#include +#include #include #include @@ -80,22 +84,6 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, path_put(&old_pwd); } -/* - * set ATTR_BLOCKS to a high value to avoid any risk of collision with other - * ATTR_* attributes (see bug 13828) - */ -#define ATTR_BLOCKS (1 << 27) - -/* - * In more recent kernels, this flag was removed because nobody was using it. - * But Lustre does. So define it if needed. 
It is safe to do so, since it's - * not been replaced with a different flag with the same value, and Lustre - * only uses it internally. - */ -#ifndef ATTR_ATTR_FLAG -#define ATTR_ATTR_FLAG (1 << 10) -#endif - #define current_ngroups current_cred()->group_info->ngroups #define current_groups current_cred()->group_info->small_block @@ -114,13 +102,16 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #endif #ifdef HAVE_GENERIC_PERMISSION_2ARGS -# define ll_generic_permission(inode, mask, flags, check_acl) \ +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ generic_permission(inode, mask) #elif defined HAVE_GENERIC_PERMISSION_4ARGS -# define ll_generic_permission(inode, mask, flags, check_acl) \ +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ generic_permission(inode, mask, flags, check_acl) +#elif defined HAVE_USER_NAMESPACE_ARG +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ + generic_permission(mnt_userns, inode, mask) #else -# define ll_generic_permission(inode, mask, flags, check_acl) \ +# define ll_generic_permission(mnt_userns, inode, mask, flags, check_acl) \ generic_permission(inode, mask, check_acl) #endif @@ -156,8 +147,12 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define simple_setattr(dentry, ops) inode_setattr((dentry)->d_inode, ops) #endif -#ifndef SLAB_DESTROY_BY_RCU -#define SLAB_DESTROY_BY_RCU 0 +#ifndef HAVE_INIT_LIST_HEAD_RCU +static inline void INIT_LIST_HEAD_RCU(struct list_head *list) +{ + WRITE_ONCE(list->next, list); + WRITE_ONCE(list->prev, list); +} #endif #ifndef HAVE_DQUOT_SUSPEND @@ -190,6 +185,12 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define bvl_to_page(bvl) (bvl->bv_page) #endif +#ifdef HAVE_BVEC_ITER +#define bio_start_sector(bio) (bio->bi_iter.bi_sector) +#else +#define bio_start_sector(bio) (bio->bi_sector) +#endif + #ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS #define blk_queue_max_segments(rq, seg) \ do { blk_queue_max_phys_segments(rq, seg); \ @@ -317,7 +318,7 @@ static inline void set_nlink(struct inode *inode, unsigned int nlink) } #endif -#ifdef HAVE_INODEOPS_USE_UMODE_T +#if defined(HAVE_INODEOPS_USE_UMODE_T) || defined(HAVE_USER_NAMESPACE_ARG) # define ll_umode_t umode_t #else # define ll_umode_t int @@ -408,6 +409,16 @@ static inline void truncate_inode_pages_final(struct address_space *map) } #endif +#ifndef HAVE_PTR_ERR_OR_ZERO +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} +#endif + #ifndef SIZE_MAX #define SIZE_MAX (~(size_t)0) #endif @@ -438,9 +449,11 @@ static inline void truncate_inode_pages_final(struct address_space *map) #endif #ifdef HAVE_PID_NS_FOR_CHILDREN -# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns_for_children) +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns_for_children) : NULL) #else -# define ll_task_pid_ns(task) ((task)->nsproxy->pid_ns) +# define ll_task_pid_ns(task) \ + ((task)->nsproxy ? ((task)->nsproxy->pid_ns) : NULL) #endif #ifdef HAVE_FULL_NAME_HASH_3ARGS @@ -655,10 +668,122 @@ static inline struct timespec current_time(struct inode *inode) } #endif +#ifndef time_after32 +/** + * time_after32 - compare two 32-bit relative times + * @a: the time which may be after @b + * @b: the time which may be before @a + * + * time_after32(a, b) returns true if the time @a is after time @b. 
+ * time_before32(b, a) returns true if the time @b is before time @a. + * + * Similar to time_after(), compare two 32-bit timestamps for relative + * times. This is useful for comparing 32-bit seconds values that can't + * be converted to 64-bit values (e.g. due to disk format or wire protocol + * issues) when it is known that the times are less than 68 years apart. + */ +#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) +#define time_before32(b, a) time_after32(a, b) + +#endif + #ifndef __GFP_COLD #define __GFP_COLD 0 #endif +#ifndef alloc_workqueue +#define alloc_workqueue(name, flags, max_active) create_workqueue(name) +#endif + +#ifndef READ_ONCE +#define READ_ONCE ACCESS_ONCE +#endif + +#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) +static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) +{ +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + return bi->interval_exp ? 1 << bi->interval_exp : 0; +#elif defined(HAVE_INTERVAL_BLK_INTEGRITY) + return bi->interval; +#else + return bi->sector_size; +#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ +} + +static inline const char *blk_integrity_name(struct blk_integrity *bi) +{ +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + return bi->profile->name; +#else + return bi->name; +#endif +} + +static inline unsigned int bip_size(struct bio_integrity_payload *bip) +{ +#ifdef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + return bip->bip_iter.bi_size; +#else + return bip->bip_size; +#endif +} +#else /* !CONFIG_BLK_DEV_INTEGRITY */ +static inline unsigned short blk_integrity_interval(struct blk_integrity *bi) +{ + return 0; +} +static inline const char *blk_integrity_name(struct blk_integrity *bi) +{ + /* gcc8 dislikes when strcmp() is called against NULL */ + return ""; +} +#endif /* !CONFIG_BLK_DEV_INTEGRITY */ + +#ifndef INTEGRITY_FLAG_READ +#define INTEGRITY_FLAG_READ BLK_INTEGRITY_VERIFY +#endif + +#ifndef INTEGRITY_FLAG_WRITE +#define INTEGRITY_FLAG_WRITE BLK_INTEGRITY_GENERATE +#endif + +static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) +{ +#if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return false; + +#ifdef HAVE_INTERVAL_EXP_BLK_INTEGRITY + if (rw == 0 && bi->profile->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (rw == 1 && bi->profile->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; +#else + if (rw == 0 && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return true; + + if (rw == 1 && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return true; +#endif /* !HAVE_INTERVAL_EXP_BLK_INTEGRITY */ +#endif /* !CONFIG_BLK_DEV_INTEGRITY */ + + return false; +} + +#ifdef HAVE_PAGEVEC_INIT_ONE_PARAM +#define ll_pagevec_init(pvec, n) pagevec_init(pvec) +#else +#define ll_pagevec_init(pvec, n) pagevec_init(pvec, n) +#endif + #ifdef HAVE_I_PAGES #define page_tree i_pages #else @@ -667,16 +792,16 @@ static inline struct timespec current_time(struct inode *inode) #define xa_unlock_irq(lockp) spin_unlock_irq(lockp) #endif -#ifndef HAVE_LINUX_SELINUX_IS_ENABLED -#define selinux_is_enabled() 1 -#endif - #ifndef KMEM_CACHE_USERCOPY #define kmem_cache_create_usercopy(name, size, align, flags, useroffset, \ usersize, ctor) \ kmem_cache_create(name, size, align, flags, ctor) #endif +#ifndef HAVE_LINUX_SELINUX_IS_ENABLED +#define selinux_is_enabled() 1 +#endif + static inline void ll_security_release_secctx(char *secdata, u32 seclen) { #ifdef HAVE_SEC_RELEASE_SECCTX_1ARG 
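The wraparound behaviour documented for time_after32() above can be checked in a few lines of userspace C; the macro is restated here with <stdint.h> types (an assumed equivalent of the kernel's u32/s32) so the snippet compiles outside the kernel:

#include <stdint.h>
#include <stdio.h>

/* Same comparison as the compat macro above, restated with stdint types. */
#define time_after32(a, b)  ((int32_t)((uint32_t)(b) - (uint32_t)(a)) < 0)

int main(void)
{
	uint32_t before_wrap = 0xfffffff0u;	/* just before the 32-bit counter wraps */
	uint32_t after_wrap  = 0x00000010u;	/* shortly after it wraps */

	/* Prints 1: after_wrap is "after" before_wrap even though it is
	 * numerically smaller, because the difference is evaluated as a
	 * signed 32-bit quantity. */
	printf("%d\n", time_after32(after_wrap, before_wrap));
	return 0;
}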
@@ -727,7 +852,7 @@ static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, #ifdef HAVE_USER_NAMESPACE_ARG return vfs_removexattr(&init_user_ns, dentry, name); #elif defined(HAVE_VFS_SETXATTR) - return __vfs_removexattr(dentry, name); + return __vfs_removexattr(dentry, name); #else if (unlikely(!inode->i_op->setxattr)) return -EOPNOTSUPP; @@ -736,6 +861,10 @@ static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, #endif } +#ifndef fallthrough +#define fallthrough do {} while (0) /* fallthrough */ +#endif + #ifndef HAVE_USER_NAMESPACE_ARG #define posix_acl_update_mode(ns, inode, mode, acl) \ posix_acl_update_mode(inode, mode, acl) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h index 763e682f2d2b2..1529b1ad75d07 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_disk.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -48,8 +48,8 @@ #include #include #include -#include -#include +#include +#include #define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) #define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) @@ -111,6 +111,7 @@ struct lustre_mount_data { /****************** superblock additional info *********************/ struct ll_sb_info; +struct kobject; struct lustre_sb_info { int lsi_flags; @@ -119,6 +120,7 @@ struct lustre_sb_info { struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ atomic_t lsi_mounts; /* references to the srv_mnt */ + struct kobject *lsi_kobj; char lsi_svname[MTI_NAME_MAXLEN]; /* lsi_osd_obdname format = 'lsi->ls_svname'-osd */ char lsi_osd_obdname[MTI_NAME_MAXLEN + 4]; @@ -129,8 +131,9 @@ struct lustre_sb_info { char lsi_fstype[16]; struct backing_dev_info lsi_bdi; /* each client mountpoint needs own backing_dev_info */ + /* protect lsi_lwp_list */ + struct mutex lsi_lwp_mutex; struct list_head lsi_lwp_list; - spinlock_t lsi_lwp_lock; unsigned long lsi_lwp_started:1; }; @@ -353,7 +356,6 @@ int server_mti_print(const char *title, struct mgs_target_info *mti); void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd); # endif -/* mgc_request.c */ int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, enum mgs_cfg_type type); int mgc_logname2resid(char *fsname, struct ldlm_res_id *res_id, diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h index 3eed4226f85a7..c6291b62f4259 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -64,6 +64,9 @@ extern struct kset *ldlm_svc_kset; #define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) #define LDLM_DEFAULT_MAX_ALIVE 3900 /* 3900 seconds ~65 min */ #define LDLM_CTIME_AGE_LIMIT (10) +/* if client lock is unused for that time it can be cancelled if any other + * client shows interest in that lock, e.g. glimpse is occured. */ +#define LDLM_DIRTY_AGE_LIMIT (10) #define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 /** @@ -232,8 +235,8 @@ struct ldlm_pool_ops { * This feature is commonly referred to as lru_resize. */ struct ldlm_pool { - /** Pool proc directory. */ - struct proc_dir_entry *pl_proc_dir; + /** Pool debugfs directory. */ + struct dentry *pl_debugfs_entry; /** Pool name, must be long enough to hold compound proc entry name. */ char pl_name[100]; /** Lock for protecting SLV/CLV updates. */ @@ -269,9 +272,10 @@ struct ldlm_pool { struct completion pl_kobj_unregister; }; -typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, - void *req_cookie, enum ldlm_mode mode, - __u64 flags, void *data); +typedef int (*ldlm_res_policy)(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *req_cookie, + enum ldlm_mode mode, __u64 flags, void *data); typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); @@ -289,11 +293,10 @@ typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); * of ldlm_[res_]lvbo_[init,update,fill]() functions. */ struct ldlm_valblock_ops { - int (*lvbo_init)(struct ldlm_resource *res); - int (*lvbo_update)(struct ldlm_resource *res, - struct ptlrpc_request *r, - int increase); - int (*lvbo_free)(struct ldlm_resource *res); + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock, + struct ptlrpc_request *r, int increase); + int (*lvbo_free)(struct ldlm_resource *res); /* Return size of lvb data appropriate RPC size can be reserved */ int (*lvbo_size)(struct ldlm_lock *lock); /* Called to fill in lvb data to RPC buffer @buf */ @@ -348,6 +351,14 @@ enum ldlm_ns_type { LDLM_NS_TYPE_MGT, /**< MGT namespace */ }; +enum ldlm_namespace_flags { + /** + * Flag to indicate the LRU cancel is in progress. + * Used to limit the process by 1 thread only. + */ + LDLM_LRU_CANCEL = 0 +}; + /** * LDLM Namespace. * @@ -376,6 +387,9 @@ struct ldlm_namespace { /** Flag indicating if namespace is on client instead of server */ enum ldlm_side ns_client; + /** name of this namespace */ + char *ns_name; + /** Resource hash table for namespace. */ struct cfs_hash *ns_rs_hash; @@ -394,8 +408,8 @@ struct ldlm_namespace { /** Client side original connect flags supported by server. */ __u64 ns_orig_connect_flags; - /* namespace proc dir entry */ - struct proc_dir_entry *ns_proc_dir_entry; + /* namespace debugfs dir entry */ + struct dentry *ns_debugfs_entry; /** * Position in global namespace list linking all namespaces on @@ -439,14 +453,20 @@ struct ldlm_namespace { * This allows the client to start caching negative dentries * for a directory and may save an RPC for a later stat. */ - unsigned int ns_ctime_age_limit; - + time64_t ns_ctime_age_limit; + /** + * Number of seconds since the lock was last used. The client may + * cancel the lock limited by this age and flush related data if + * any other client shows interest in it doing glimpse request. + * This allows to cache stat data locally for such files early. + */ + time64_t ns_dirty_age_limit; /** * Used to rate-limit ldlm_namespace_dump calls. 
* \see ldlm_namespace_dump. Increased by 10 seconds every time * it is called. */ - cfs_time_t ns_next_dump; + time64_t ns_next_dump; /** "policy" function that does actual lock conflict determination */ ldlm_res_policy ns_policy; @@ -484,7 +504,7 @@ struct ldlm_namespace { * The resources in this namespace remember contended state during * \a ns_contention_time, in seconds. */ - unsigned ns_contention_time; + time64_t ns_contention_time; /** * Limit size of contended extent locks, in bytes. @@ -519,6 +539,11 @@ struct ldlm_namespace { struct kobject ns_kobj; /* sysfs object */ struct completion ns_kobj_unregister; + + /** + * To avoid another ns_lock usage, a separate bitops field. + */ + unsigned long ns_flags; }; /** @@ -527,8 +552,6 @@ struct ldlm_namespace { static inline int ns_is_client(struct ldlm_namespace *ns) { LASSERT(ns != NULL); - LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | - LDLM_NAMESPACE_SERVER))); LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || ns->ns_client == LDLM_NAMESPACE_SERVER); return ns->ns_client == LDLM_NAMESPACE_CLIENT; @@ -540,8 +563,6 @@ static inline int ns_is_client(struct ldlm_namespace *ns) static inline int ns_is_server(struct ldlm_namespace *ns) { LASSERT(ns != NULL); - LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | - LDLM_NAMESPACE_SERVER))); LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || ns->ns_client == LDLM_NAMESPACE_SERVER); return ns->ns_client == LDLM_NAMESPACE_SERVER; @@ -584,6 +605,9 @@ typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, /** Type for glimpse callback function of a lock. */ typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); +/** Type for created callback function of a lock. */ +typedef void (*ldlm_created_callback)(struct ldlm_lock *lock); + /** Work list for sending GL ASTs to multiple locks. */ struct ldlm_glimpse_work { struct ldlm_lock *gl_lock; /* lock to glimpse */ @@ -595,6 +619,11 @@ struct ldlm_glimpse_work { void *gl_interpret_data; }; +struct ldlm_bl_desc { + unsigned int bl_same_client:1, + bl_cos_incompat:1; +}; + struct ldlm_cb_set_arg { struct ptlrpc_request_set *set; int type; /* LDLM_{CP,BL,GL}_CALLBACK */ @@ -603,6 +632,7 @@ struct ldlm_cb_set_arg { union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ ptlrpc_interpterer_t gl_interpret_reply; void *gl_interpret_data; + struct ldlm_bl_desc *bl_desc; }; struct ldlm_cb_async_args { @@ -610,8 +640,8 @@ struct ldlm_cb_async_args { struct ldlm_lock *ca_lock; }; -/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ -#define LDLM_GL_WORK_NOFREE 0x1 +/** The ldlm_glimpse_work was slab allocated & must be freed accordingly.*/ +#define LDLM_GL_WORK_SLAB_ALLOCATED 0x1 /** Interval node data for each LDLM_EXTENT lock. */ struct ldlm_interval { @@ -634,6 +664,19 @@ struct ldlm_interval_tree { struct interval_node *lit_root; /* actual ldlm_interval */ }; +/** + * Lists of waiting locks for each inodebit type. + * A lock can be in several liq_waiting lists and it remains in lr_waiting. + */ +struct ldlm_ibits_queues { + struct list_head liq_waiting[MDS_INODELOCK_NUMBITS]; +}; + +struct ldlm_ibits_node { + struct list_head lin_link[MDS_INODELOCK_NUMBITS]; + struct ldlm_lock *lock; +}; + /** Whether to track references to exports by LDLM locks. */ #define LUSTRE_TRACKS_LOCK_EXP_REFS (0) @@ -724,14 +767,17 @@ struct ldlm_lock { struct list_head l_lru; /** * Linkage to resource's lock queues according to current lock state. 
- * (could be granted, waiting or converting) + * (could be granted or waiting) * Protected by lr_lock in struct ldlm_resource. */ struct list_head l_res_link; /** - * Tree node for ldlm_extent. + * Internal structures per lock type.. */ - struct ldlm_interval *l_tree_node; + union { + struct ldlm_interval *l_tree_node; + struct ldlm_ibits_node *l_ibits_node; + }; /** * Per export hash of locks. * Protected by per-bucket exp->exp_lock_hash locks. @@ -845,10 +891,13 @@ struct ldlm_lock { * the lock at client, e.g. enqueue the lock. For server it is the * time when blocking ast was sent. */ - time64_t l_activity; - time64_t l_blast_sent; + time64_t l_activity; + time64_t l_blast_sent; }; + /* separate ost_lvb used mostly by Data-on-MDT for now. + * It is introduced to don't mix with layout lock data. */ + struct ost_lvb l_ost_lvb; /* * Server-side-only members. */ @@ -876,7 +925,7 @@ struct ldlm_lock { * under this lock. * \see ost_rw_prolong_locks */ - cfs_time_t l_callback_timeout; + time64_t l_callback_timeout; /** Local PID of process which created this lock. */ __u32 l_pid; @@ -928,6 +977,20 @@ struct ldlm_lock { struct list_head l_exp_list; }; +/** + * Describe the overlap between two locks. itree_overlap_cb data. + */ +struct ldlm_match_data { + struct ldlm_lock *lmd_old; + struct ldlm_lock *lmd_lock; + enum ldlm_mode *lmd_mode; + union ldlm_policy_data *lmd_policy; + __u64 lmd_flags; + __u64 lmd_skip_flags; + int lmd_unref; + bool lmd_has_ast_data; +}; + /** For uncommitted cross-MDT lock, store transno this lock belongs to */ #define l_transno l_client_cookie @@ -935,6 +998,15 @@ struct ldlm_lock { * which is for server. */ #define l_slc_link l_rk_ast +#define HANDLE_MAP_SIZE ((LMV_MAX_STRIPE_COUNT + 7) >> 3) + +struct lustre_handle_array { + unsigned int ha_count; + /* ha_map is used as bit flag to indicate handle is remote or local */ + char ha_map[HANDLE_MAP_SIZE]; + struct lustre_handle ha_handles[0]; +}; + /** * LDLM resource description. * Basically, resource is a representation for a single object. @@ -966,8 +1038,6 @@ struct ldlm_resource { * @{ */ /** List of locks in granted state */ struct list_head lr_granted; - /** List of locks waiting to change their granted mode (converted) */ - struct list_head lr_converting; /** * List of locks that could not be granted due to conflicts and * that are waiting for conflicts to go away */ @@ -977,16 +1047,21 @@ struct ldlm_resource { /** Resource name */ struct ldlm_res_id lr_name; - /** - * Interval trees (only for extent locks) for all modes of this resource - */ - struct ldlm_interval_tree *lr_itree; + union { + /** + * Interval trees (only for extent locks) for all modes of + * this resource + */ + struct ldlm_interval_tree *lr_itree; + struct ldlm_ibits_queues *lr_ibits_queues; + }; union { /** * When the resource was considered as contended, - * used only on server side. */ - cfs_time_t lr_contention_time; + * used only on server side. + */ + time64_t lr_contention_time; /** * Associated inode, used only on client side. 
*/ @@ -1011,16 +1086,27 @@ struct ldlm_resource { struct lu_ref lr_reference; }; +static inline int ldlm_is_granted(struct ldlm_lock *lock) +{ + return lock->l_req_mode == lock->l_granted_mode; +} + static inline bool ldlm_has_layout(struct ldlm_lock *lock) { return lock->l_resource->lr_type == LDLM_IBITS && lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; } +static inline bool ldlm_has_dom(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM; +} + static inline char * ldlm_ns_name(struct ldlm_namespace *ns) { - return ns->ns_rs_hash->hs_name; + return ns->ns_name; } static inline struct ldlm_namespace * @@ -1127,10 +1213,11 @@ struct ldlm_enqueue_info { void *ei_cb_local_bl; /** blocking local lock callback */ void *ei_cb_cp; /** lock completion callback */ void *ei_cb_gl; /** lock glimpse callback */ + ldlm_created_callback ei_cb_created; /** lock created callback */ void *ei_cbdata; /** Data to be passed into callbacks. */ void *ei_namespace; /** lock namespace **/ - unsigned int ei_enq_slave:1, /** whether enqueue slave stripes */ - ei_nonblock:1; /** non block enqueue */ + u64 ei_inodebits; /** lock inode bits **/ + unsigned int ei_enq_slave:1; /** whether enqueue slave stripes */ }; #define ei_res_id ei_cb_gl @@ -1203,21 +1290,21 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, * LDLM_PROCESS_RESCAN: * * It's used when policy functions are called from ldlm_reprocess_queue() to - * reprocess the wait & convert list and try to grant locks, blocking ASTs + * reprocess the wait list and try to grant locks, blocking ASTs * have already been sent in this situation, completion ASTs need be sent for * the locks being granted. * * LDLM_PROCESS_ENQUEUE: * * It's used when policy functions are called from ldlm_lock_enqueue() to - * process the wait & convert list for handling an enqueue request, blocking + * process the wait list for handling an enqueue request, blocking * ASTs have not been sent yet, so list of conflicting locks would be * collected and ASTs sent. * * LDLM_PROCESS_RECOVERY: * * It's used when policy functions are called from ldlm_reprocess_queue() to - * reprocess the wait & convert list when recovery done. In case of blocking + * reprocess the wait list when recovery done. In case of blocking * ASTs are lost before recovery, it needs not only to grant locks if * available, but also send blocking ASTs to the locks doesn't have AST sent * flag. Completion ASTs need be sent for the locks being granted. @@ -1233,6 +1320,12 @@ typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list); +typedef int (*ldlm_reprocessing_policy)(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); + /** * Return values for lock iterators. * Also used during deciding of lock grants and cancellations. 
@@ -1269,7 +1362,7 @@ struct ldlm_prolong_args { struct ldlm_res_id lpa_resid; struct ldlm_extent lpa_extent; enum ldlm_mode lpa_mode; - int lpa_timeout; + time64_t lpa_timeout; int lpa_locks_cnt; int lpa_blocks_cnt; }; @@ -1303,14 +1396,11 @@ int ldlm_glimpse_locks(struct ldlm_resource *res, * MDT or OST to pass through LDLM requests to LDLM for handling * @{ */ -int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback, - ldlm_blocking_callback, ldlm_glimpse_callback); int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, - const struct ldlm_request *dlm_req, - const struct ldlm_callback_suite *cbs); -int ldlm_handle_convert(struct ptlrpc_request *req); + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); int ldlm_handle_convert0(struct ptlrpc_request *req, - const struct ldlm_request *dlm_req); + const struct ldlm_request *dlm_req); int ldlm_handle_cancel(struct ptlrpc_request *req); int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, @@ -1318,10 +1408,10 @@ int ldlm_request_cancel(struct ptlrpc_request *req, /** @} ldlm_handlers */ void ldlm_revoke_export_locks(struct obd_export *exp); -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock); +time64_t ldlm_bl_timeout(struct ldlm_lock *lock); #endif int ldlm_del_waiting_lock(struct ldlm_lock *lock); -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout); int ldlm_get_ref(void); void ldlm_put_ref(void); int ldlm_init_export(struct obd_export *exp); @@ -1331,6 +1421,8 @@ struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); /* ldlm_lock.c */ #ifdef HAVE_SERVER_SUPPORT ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); +ldlm_reprocessing_policy +ldlm_get_reprocessing_policy(struct ldlm_resource *res); #endif void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); void ldlm_lock2handle(const struct ldlm_lock *lock, @@ -1366,9 +1458,11 @@ ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) * Update Lock Value Block Operations (LVBO) on a resource taking into account * data from request \a r */ -static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, - struct ptlrpc_request *req, int increase) +static inline int ldlm_lvbo_update(struct ldlm_resource *res, + struct ldlm_lock *lock, + struct ptlrpc_request *req, int increase) { + struct ldlm_namespace *ns = ldlm_res_to_ns(res); int rc; /* delayed lvb init may be required */ @@ -1378,14 +1472,21 @@ static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, return rc; } - if (ldlm_res_to_ns(res)->ns_lvbo && - ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { - return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req, - increase); - } + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update) + return ns->ns_lvbo->lvbo_update(res, lock, req, increase); + return 0; } +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *req, + int increase) +{ + return ldlm_lvbo_update(res, NULL, req, increase); +} + +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock); + int ldlm_error2errno(enum ldlm_error error); enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this * confuses user-space. 
*/ @@ -1448,17 +1549,33 @@ void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); void ldlm_lock_fail_match(struct ldlm_lock *lock); void ldlm_lock_allow_match(struct ldlm_lock *lock); void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); -enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, - const struct ldlm_res_id *, enum ldlm_type type, - union ldlm_policy_data *, enum ldlm_mode mode, - struct lustre_handle *, int unref); +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + int unref); +static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, + __u64 flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lh, + int unref) +{ + return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy, + mode, lh, unref); +} +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data); enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, __u64 *bits); -struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, - enum ldlm_mode new_mode, __u32 *flags); -void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode); void ldlm_lock_cancel(struct ldlm_lock *lock); -void ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint); void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns); void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh); void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); @@ -1480,12 +1597,40 @@ void ldlm_namespace_unregister(struct ldlm_namespace *ns, enum ldlm_side client); void ldlm_namespace_get(struct ldlm_namespace *ns); void ldlm_namespace_put(struct ldlm_namespace *ns); -int ldlm_proc_setup(void); -#ifdef CONFIG_PROC_FS -void ldlm_proc_cleanup(void); -#else -static inline void ldlm_proc_cleanup(void) {} -#endif + +int ldlm_debugfs_setup(void); +void ldlm_debugfs_cleanup(void); + +static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, + struct lprocfs_stats *srv_stats) +{ + int lock_type = 0, op = 0; + + lock_type = dlm_req->lock_desc.l_resource.lr_type; + + switch (lock_type) { + case LDLM_PLAIN: + op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; + break; + case LDLM_EXTENT: + op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; + break; + case LDLM_FLOCK: + op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; + break; + case LDLM_IBITS: + op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; + break; + default: + op = 0; + break; + } + + if (op != 0) + lprocfs_counter_incr(srv_stats, op); + + return; +} /* resource.c - internal */ struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, @@ -1555,7 +1700,8 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, enum ldlm_mode mode, __u64 *flags, void *lvb, __u32 lvb_len, const struct lustre_handle *lockh, int rc); -int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_type type, union ldlm_policy_data *policy, enum ldlm_mode mode, __u64 *flags, @@ -1565,8 +1711,9 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace 
*ns, void *data, __u32 lvb_len, enum lvb_type lvb_type, const __u64 *client_cookie, struct lustre_handle *lockh); -int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, - __u32 *flags); +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits); +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); int ldlm_cli_update_pool(struct ptlrpc_request *req); int ldlm_cli_cancel(const struct lustre_handle *lockh, enum ldlm_cancel_flags cancel_flags); @@ -1590,8 +1737,15 @@ int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, int ldlm_cli_cancel_list(struct list_head *head, int count, struct ptlrpc_request *req, enum ldlm_cancel_flags flags); + +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags); + /** @} ldlm_cli_api */ +extern unsigned int ldlm_enqueue_min; + /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. */ int intent_disposition(struct ldlm_reply *rep, int flag); @@ -1639,7 +1793,6 @@ void unlock_res_and_lock(struct ldlm_lock *lock); * There are not used outside of ldlm. * @{ */ -int ldlm_pools_recalc(enum ldlm_side client); int ldlm_pools_init(void); void ldlm_pools_fini(void); @@ -1648,7 +1801,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask); void ldlm_pool_fini(struct ldlm_pool *pl); int ldlm_pool_setup(struct ldlm_pool *pl, int limit); -int ldlm_pool_recalc(struct ldlm_pool *pl); +time64_t ldlm_pool_recalc(struct ldlm_pool *pl); __u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); __u64 ldlm_pool_get_slv(struct ldlm_pool *pl); __u64 ldlm_pool_get_clv(struct ldlm_pool *pl); @@ -1673,5 +1826,7 @@ static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, return ex1->start <= ex2->start && ex1->end >= ex2->end; } +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop); + #endif /** @} LDLM */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h index cab4e5f2f702a..9fdebcefe66a5 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_dlm_flags.h @@ -26,7 +26,7 @@ #ifndef LDLM_ALL_FLAGS_MASK /** l_flags bits marked as "all_flags" bits */ -#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL +#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC28F932FULL /** extent, mode, or resource changed */ #define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0 @@ -44,7 +44,7 @@ /** * Server placed lock on conv list, or a recovering client wants the lock - * added to the conv list, no questions asked. */ + * added to the conv list, no questions asked. (obsoleted) */ #define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2 #define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2) #define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2) @@ -58,6 +58,15 @@ #define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3) #define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) +/** + * Lock request is speculative/asynchronous, and cannot wait for any reason. + * Fail the lock request if any blocking locks are encountered. 
+ * */ +#define LDLM_FL_SPECULATIVE 0x0000000000000010ULL /* bit 4 */ +#define ldlm_is_speculative(_l) LDLM_TEST_FLAG((_l), 1ULL << 4) +#define ldlm_set_speculative(_l) LDLM_SET_FLAG((_l), 1ULL << 4) +#define ldlm_clear_specualtive_(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 4) + /** blocking or cancel packet was queued for sending. */ #define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5 #define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5) @@ -138,6 +147,35 @@ #define ldlm_set_cos_incompat(_l) LDLM_SET_FLAG((_l), 1ULL << 24) #define ldlm_clear_cos_incompat(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 24) +/* + * Flag indicates that lock is being converted (downgraded) during the blocking + * AST instead of cancelling. Used for IBITS locks now and drops conflicting + * bits only keepeing other. + */ +#define LDLM_FL_CONVERTING 0x0000000002000000ULL /* bit 25 */ +#define ldlm_is_converting(_l) LDLM_TEST_FLAG((_l), 1ULL << 25) +#define ldlm_set_converting(_l) LDLM_SET_FLAG((_l), 1ULL << 25) +#define ldlm_clear_converting(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 25) + +/** + * Part of original lockahead implementation, OBD_CONNECT_LOCKAHEAD_OLD. + * Reserved temporarily to allow those implementations to keep working. + * Will be removed after 2.12 release. + * */ +#define LDLM_FL_LOCKAHEAD_OLD_RESERVED 0x0000000010000000ULL /* bit 28 */ +#define ldlm_is_do_not_expand_io(_l) LDLM_TEST_FLAG((_l), 1ULL << 28) +#define ldlm_set_do_not_expand_io(_l) LDLM_SET_FLAG((_l), 1ULL << 28) +#define ldlm_clear_do_not_expand_io(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 28) + +/** + * Do not expand this lock. Grant it only on the extent requested. + * Used for manually requested locks from the client (LU_LADVISE_LOCKAHEAD). + * */ +#define LDLM_FL_NO_EXPANSION 0x0000000020000000ULL /* bit 29 */ +#define ldlm_is_do_not_expand(_l) LDLM_TEST_FLAG((_l), 1ULL << 29) +#define ldlm_set_do_not_expand(_l) LDLM_SET_FLAG((_l), 1ULL << 29) +#define ldlm_clear_do_not_expand(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 29) + /** * measure lock contention and return -EUSERS if locking contention is high */ #define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30 @@ -354,26 +392,43 @@ #define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57) #define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57) +/** + * This flags means to use non-delay RPC to send dlm request RPC. + */ +#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */ +#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58) +#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58) + +/** + * LVB from this lock is cached in osc object + */ +#define LDLM_FL_LVB_CACHED 0x0800000000000000ULL /* bit 59 */ +#define ldlm_is_lvb_cached(_l) LDLM_TEST_FLAG((_l), 1ULL << 59) +#define ldlm_set_lvb_cached(_l) LDLM_SET_FLAG((_l), 1ULL << 59) +#define ldlm_clear_lvb_cached(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 59) + /** l_flags bits marked as "ast" bits */ #define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ - LDLM_FL_AST_DISCARD_DATA) + LDLM_FL_DISCARD_DATA) /** l_flags bits marked as "blocked" bits */ #define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ - LDLM_FL_BLOCK_CONV |\ LDLM_FL_BLOCK_WAIT) /** l_flags bits marked as "gone" bits */ #define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ LDLM_FL_FAILED) -/** l_flags bits marked as "inherit" bits */ -/* Flags inherited from wire on enqueue/reply between client/server. */ -/* NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. */ -/* TEST_LOCK flag to not let TEST lock to be granted. 
*/ +/** l_flags bits marked as "inherit" bits + * Flags inherited from wire on enqueue/reply between client/server. + * CANCEL_ON_BLOCK so server will not grant if a blocking lock is found + * NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. + * TEST_LOCK flag to not let TEST lock to be granted. + * NO_EXPANSION to tell server not to expand extent of lock request */ #define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ LDLM_FL_NO_TIMEOUT |\ - LDLM_FL_TEST_LOCK) + LDLM_FL_TEST_LOCK |\ + LDLM_FL_NO_EXPANSION) /** flags returned in @flags parameter on ldlm_lock_enqueue, * to be re-constructed on re-send */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h index 3061be1bc6124..03b9adc84897c 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_eacl.h @@ -24,7 +24,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h b/drivers/staging/lustrefsx/lustre/include/lustre_errno.h similarity index 100% rename from drivers/staging/lustrefsx/lustre/include/lustre/lustre_errno.h rename to drivers/staging/lustrefsx/lustre/include/lustre_errno.h diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h index 8552d3d1c00a7..4fb3a9c4b2d18 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_export.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,10 @@ * @{ */ +#include + #include -#include +#include #include struct mds_client_data; @@ -99,6 +101,13 @@ struct tg_export_data { long ted_grant; /* in bytes */ long ted_pending; /* bytes just being written */ __u8 ted_pagebits; /* log2 of client page size */ + + /** + * File Modification Data (FMD) tracking + */ + spinlock_t ted_fmd_lock; /* protects ted_fmd_list */ + struct list_head ted_fmd_list; /* FIDs being modified */ + int ted_fmd_count;/* items in ted_fmd_list */ }; /** @@ -119,13 +128,10 @@ struct ec_export_data { /* echo client */ /** Filter (oss-side) specific import data */ struct filter_export_data { struct tg_export_data fed_ted; - spinlock_t fed_lock; /**< protects fed_mod_list */ __u64 fed_lastid_gen; - struct list_head fed_mod_list; /* files being modified */ /* count of SOFT_SYNC RPCs, which will be reset after * ofd_soft_sync_limit number of RPCs, and trigger a sync. 
*/ atomic_t fed_soft_sync_count; - int fed_mod_count;/* items in fed_writing list */ __u32 fed_group; }; @@ -202,6 +208,8 @@ struct obd_export { struct obd_uuid exp_client_uuid; /** To link all exports on an obd device */ struct list_head exp_obd_chain; + /** work_struct for destruction of export */ + struct work_struct exp_zombie_work; /* Unlinked export list */ struct list_head exp_stale_list; struct hlist_node exp_uuid_hash; /** uuid-export hash*/ @@ -239,45 +247,44 @@ struct obd_export { /** Last committed transno for this export */ __u64 exp_last_committed; /** When was last request received */ - cfs_time_t exp_last_request_time; + time64_t exp_last_request_time; /** On replay all requests waiting for replay are linked here */ struct list_head exp_req_replay_queue; /** * protects exp_flags, exp_outstanding_replies and the change * of exp_imp_reverse */ - spinlock_t exp_lock; + spinlock_t exp_lock; /** Compatibility flags for this export are embedded into * exp_connect_data */ - struct obd_connect_data exp_connect_data; - enum obd_option exp_flags; - unsigned long exp_failed:1, - exp_in_recovery:1, - exp_disconnected:1, - exp_connecting:1, - /** VBR: export missed recovery */ - exp_delayed:1, - /** VBR: failed version checking */ - exp_vbr_failed:1, - exp_req_replay_needed:1, - exp_lock_replay_needed:1, - exp_need_sync:1, - exp_flvr_changed:1, - exp_flvr_adapt:1, - exp_libclient:1, /* liblustre client? */ - /* if to swap nidtbl entries for 2.2 clients. - * Only used by the MGS to fix LU-1644. */ - exp_need_mne_swab:1, - /* The export already got final replay ping - * request. */ - exp_replay_done:1; - /* also protected by exp_lock */ - enum lustre_sec_part exp_sp_peer; - struct sptlrpc_flavor exp_flvr; /* current */ - struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ - time64_t exp_flvr_expire[2]; /* seconds */ - - /** protects exp_hp_rpcs */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1, + /* The export already got final replay ping + * request. 
*/ + exp_replay_done:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + time64_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ spinlock_t exp_rpc_lock; struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ struct list_head exp_reg_rpcs; /* RPC being handled */ @@ -318,6 +325,18 @@ static inline __u64 exp_connect_flags(struct obd_export *exp) return *exp_connect_flags_ptr(exp); } +static inline __u64 *exp_connect_flags2_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags2; +} + +static inline __u64 exp_connect_flags2(struct obd_export *exp) +{ + if (exp_connect_flags(exp) & OBD_CONNECT_FLAGS2) + return *exp_connect_flags2_ptr(exp); + return 0; +} + static inline int exp_max_brw_size(struct obd_export *exp) { LASSERT(exp != NULL); @@ -332,13 +351,6 @@ static inline int exp_connect_multibulk(struct obd_export *exp) return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; } -static inline int exp_expired(struct obd_export *exp, cfs_duration_t age) -{ - LASSERT(exp->exp_delayed); - return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age), - cfs_time_current_sec()); -} - static inline int exp_connect_cancelset(struct obd_export *exp) { LASSERT(exp != NULL); @@ -407,6 +419,13 @@ static inline bool imp_connect_disp_stripe(struct obd_import *imp) return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; } +static inline bool imp_connect_shortio(struct obd_import *imp) +{ + struct obd_connect_data *ocd = &imp->imp_connect_data; + + return ocd->ocd_connect_flags & OBD_CONNECT_SHORTIO; +} + static inline __u64 exp_connect_ibits(struct obd_export *exp) { struct obd_connect_data *ocd; @@ -420,13 +439,55 @@ static inline int exp_connect_large_acl(struct obd_export *exp) return !!(exp_connect_flags(exp) & OBD_CONNECT_LARGE_ACL); } +static inline int exp_connect_lockahead_old(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LOCKAHEAD_OLD); +} + +static inline int exp_connect_lockahead(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCKAHEAD); +} + +static inline int exp_connect_flr(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); +} + +static inline int exp_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL); +} + +static inline int exp_connect_lock_convert(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_LOCK_CONVERT); +} + extern struct obd_export *class_conn2export(struct lustre_handle *conn); -extern struct obd_device *class_conn2obd(struct lustre_handle *conn); -#define KKUC_CT_DATA_MAGIC 0x092013cea +static inline int exp_connect_archive_id_array(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_ARCHIVE_ID_ARRAY); +} + +static inline int exp_connect_sepol(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_SELINUX_POLICY); +} + +enum { + /* archive_ids in array format */ + KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, + /* archive_ids in bitmap format */ + KKUC_CT_DATA_BITMAP_MAGIC = 0x082018cea, +}; + + struct kkuc_ct_data { __u32 kcd_magic; - __u32 kcd_archive; + __u32 kcd_nr_archives; + __u32 kcd_archives[0]; }; /** @} export */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h index 43d0c3419417d..ea6d743b1aaae 100644 --- 
a/drivers/staging/lustrefsx/lustre/include/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -149,9 +149,9 @@ */ #include -#include -#include -#include +#include +#include +#include struct lu_env; struct lu_site; @@ -196,13 +196,6 @@ enum { LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) }; -enum { - /** 2^6 FIDs for OI containers */ - OSD_OI_FID_OID_BITS = 6, - /** reserve enough FIDs in case we want more in the future */ - OSD_OI_FID_OID_BITS_MAX = 10, -}; - /** special OID for local objects */ enum local_oid { /** \see fld_mod_init */ @@ -225,6 +218,7 @@ enum local_oid { OSD_LPF_OID = 19UL, REPLY_DATA_OID = 21UL, ACCT_PROJECT_OID = 22UL, + INDEX_BACKUP_OID = 4116UL, OFD_LAST_GROUP_OID = 4117UL, LLOG_CATALOGS_OID = 4118UL, MGS_CONFIGS_OID = 4119UL, @@ -350,10 +344,13 @@ static inline void filter_fid_cpu_to_le(struct filter_fid *dst, { fid_cpu_to_le(&dst->ff_parent, &src->ff_parent); - if (size < sizeof(struct filter_fid)) + if (size < sizeof(struct filter_fid)) { memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); - else + } else { ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = cpu_to_le32(src->ff_layout_version); + dst->ff_range = cpu_to_le32(src->ff_range); + } /* XXX: Add more if filter_fid is enlarged in the future. */ } @@ -363,10 +360,13 @@ static inline void filter_fid_le_to_cpu(struct filter_fid *dst, { fid_le_to_cpu(&dst->ff_parent, &src->ff_parent); - if (size < sizeof(struct filter_fid)) + if (size < sizeof(struct filter_fid)) { memset(&dst->ff_layout, 0, sizeof(dst->ff_layout)); - else + } else { ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout); + dst->ff_layout_version = le32_to_cpu(src->ff_layout_version); + dst->ff_range = le32_to_cpu(src->ff_range); + } /* XXX: Add more if filter_fid is enlarged in the future. */ } @@ -416,8 +416,8 @@ struct lu_client_seq { */ struct lu_seq_range lcs_space; - /* Seq related proc */ - struct proc_dir_entry *lcs_proc_dir; + /* Seq related debugfs */ + struct dentry *lcs_debugfs_entry; /* This holds last allocated fid in last obtained seq */ struct lu_fid lcs_fid; @@ -427,7 +427,7 @@ struct lu_client_seq { /* * Service uuid, passed from MDT + seq name to form unique seq name to - * use it with procfs. + * use it with debugfs. */ char lcs_name[80]; @@ -463,8 +463,8 @@ struct lu_server_seq { /* /seq file object device */ struct dt_object *lss_obj; - /* Seq related proc */ - struct proc_dir_entry *lss_proc_dir; + /* Seq related debugfs */ + struct dentry *lss_debugfs_entry; /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ enum lu_mgr_type lss_type; @@ -477,7 +477,7 @@ struct lu_server_seq { /* * Service uuid, passed from MDT + seq name to form unique seq name to - * use it with procfs. + * use it with debugfs. */ char lss_name[80]; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h index 2f39962f8fb5e..102dcfac77480 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_fld.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_fld.h @@ -38,7 +38,7 @@ * @{ */ -#include +#include #include #include @@ -67,9 +67,10 @@ struct lu_fld_target { }; struct lu_server_fld { - /** - * Fld dir proc entry. 
*/ - struct proc_dir_entry *lsf_proc_dir; + /** + * Fld dir debugfs entry. + */ + struct dentry *lsf_debugfs_entry; /** * /fld file object device */ @@ -108,8 +109,9 @@ struct lu_server_fld { struct lu_client_fld { /** - * Client side proc entry. */ - struct proc_dir_entry *lcf_proc_dir; + * Client side debugfs entry. + */ + struct dentry *lcf_debugfs_entry; /** * List of exports client FLD knows about. */ @@ -132,7 +134,8 @@ struct lu_client_fld { struct fld_cache *lcf_cache; /** - * Client fld proc entry name. */ + * Client fld debugfs entry name. + */ char lcf_name[80]; }; @@ -189,7 +192,7 @@ int fld_client_add_target(struct lu_client_fld *fld, int fld_client_del_target(struct lu_client_fld *fld, __u64 idx); -void fld_client_proc_fini(struct lu_client_fld *fld); +void fld_client_debugfs_fini(struct lu_client_fld *fld); /** @} fld */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h index 7c22d985af5a4..2cb4969b615bf 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_ha.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_ha.h @@ -50,7 +50,7 @@ void ptlrpc_free_committed(struct obd_import *imp); void ptlrpc_wake_delayed(struct obd_import *imp); int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); int ptlrpc_set_import_active(struct obd_import *imp, int active); -void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full); void ptlrpc_deactivate_import(struct obd_import *imp); void ptlrpc_invalidate_import(struct obd_import *imp); void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h index 57a192359d118..a8c5a218b6c7d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_idmap.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_import.h b/drivers/staging/lustrefsx/lustre/include/lustre_import.h index 1b44d32393139..430fde2e92738 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_import.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_import.h @@ -42,10 +42,15 @@ * * @{ */ - -#include -#include - +#include +#include +#include +#include +#include +#include +#include +#include +#include /** * Adaptive Timeout stuff @@ -101,19 +106,21 @@ enum lustre_imp_state { LUSTRE_IMP_RECOVER = 8, LUSTRE_IMP_FULL = 9, LUSTRE_IMP_EVICTED = 10, + LUSTRE_IMP_IDLE = 11, + LUSTRE_IMP_LAST }; /** Returns test string representation of numeric import state \a state */ static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) { - static char* import_state_names[] = { - "", "CLOSED", "NEW", "DISCONN", - "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", - "RECOVER", "FULL", "EVICTED", - }; - - LASSERT (state <= LUSTRE_IMP_EVICTED); - return import_state_names[state]; + static char *import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", "IDLE", + }; + + LASSERT(state < LUSTRE_IMP_LAST); + return import_state_names[state]; } /** @@ -140,9 +147,9 @@ struct obd_import_conn { /** uuid of remote side */ struct obd_uuid oic_uuid; /** - * Time (64 bit jiffies) of last connection attempt on this connection + * Time (64 bit seconds) of last connection attempt on this connection */ - __u64 oic_last_attempt; + time64_t oic_last_attempt; }; /* state history */ @@ -157,8 +164,6 @@ struct import_state_hist { * Imports are representing client-side view to remote target. */ struct obd_import { - /** Local handle (== id) for this import. */ - struct portals_handle imp_handle; /** Reference counter */ atomic_t imp_refcount; struct lustre_handle imp_dlm_handle; /* client's ldlm export */ @@ -168,8 +173,8 @@ struct obd_import { struct ptlrpc_client *imp_client; /** List element for linking into pinger chain */ struct list_head imp_pinger_chain; - /** List element for linking into chain for destruction */ - struct list_head imp_zombie_chain; + /** work struct for destruction of import */ + struct work_struct imp_zombie_work; /** * Lists of requests that are retained for replay, waiting for a reply, @@ -213,12 +218,17 @@ struct obd_import { /** Wait queue for those who need to wait for recovery completion */ wait_queue_head_t imp_recovery_waitq; + /** Number of requests allocated */ + atomic_t imp_reqs; /** Number of requests currently in-flight */ atomic_t imp_inflight; /** Number of requests currently unregistering */ atomic_t imp_unregistering; /** Number of replay requests inflight */ atomic_t imp_replay_inflight; + /** In-flight replays rate control */ + wait_queue_head_t imp_replay_waitq; + /** Number of currently happening import invalidations */ atomic_t imp_inval_count; /** Numbner of request timeouts */ @@ -232,6 +242,8 @@ struct obd_import { int imp_state_hist_idx; /** Current import generation. Incremented on every reconnect */ int imp_generation; + /** Idle connection initiated at this generation */ + int imp_initiated_at; /** Incremented every time we send reconnection request */ __u32 imp_conn_cnt; /** @@ -256,9 +268,9 @@ struct obd_import { */ struct lustre_handle imp_remote_handle; /** When to perform next ping. time in jiffies. 
*/ - cfs_time_t imp_next_ping; + time64_t imp_next_ping; /** When we last successfully connected. time in 64bit jiffies */ - __u64 imp_last_success_conn; + time64_t imp_last_success_conn; /** List of all possible connection for import. */ struct list_head imp_conn_list; @@ -283,9 +295,6 @@ struct obd_import { imp_server_timeout:1, /* VBR: imp in delayed recovery */ imp_delayed_recovery:1, - /* VBR: if gap was found then no lock replays - */ - imp_no_lock_replay:1, /* recovery by versions was failed */ imp_vbr_failed:1, /* force an immidiate ping */ @@ -298,30 +307,32 @@ struct obd_import { imp_resend_replay:1, /* disable normal recovery, for test only. */ imp_no_pinger_recover:1, -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* need IR MNE swab */ - imp_need_mne_swab:1, -#endif /* import must be reconnected instead of * chouse new connection */ imp_force_reconnect:1, /* import has tried to connect with server */ imp_connect_tried:1, /* connected but not FULL yet */ - imp_connected:1; - __u32 imp_connect_op; - struct obd_connect_data imp_connect_data; - __u64 imp_connect_flags_orig; - __u64 imp_connect_flags2_orig; - int imp_connect_error; - - __u32 imp_msg_magic; - /* adjusted based on server capability */ - __u32 imp_msghdr_flags; - - /* adaptive timeout data */ - struct imp_at imp_at; - time64_t imp_last_reply_time; /* for health check */ + imp_connected:1, + /* grant shrink disabled */ + imp_grant_shrink_disabled:1, + /* to supress LCONSOLE() at conn.restore */ + imp_was_idle:1; + u32 imp_connect_op; + u32 imp_idle_timeout; + u32 imp_idle_debug; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + __u64 imp_connect_flags2_orig; + int imp_connect_error; + + enum lustre_msg_magic imp_msg_magic; + /* adjusted based on server capability */ + enum lustre_msghdr imp_msghdr_flags; + + /* adaptive timeout data */ + struct imp_at imp_at; + time64_t imp_last_reply_time; /* for health check */ }; /* import.c */ @@ -331,11 +342,11 @@ static inline unsigned int at_est2timeout(unsigned int val) return (val + (val >> 2) + 5); } -static inline unsigned int at_timeout2est(unsigned int val) +static inline timeout_t at_timeout2est(timeout_t timeout) { - /* restore estimate value from timeout: e=4/5(t-5) */ - LASSERT(val); - return (max((val << 2) / 5, 5U) - 4); + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(timeout > 0); + return max((timeout << 2) / 5, 5) - 4; } static inline void at_reset_nolock(struct adaptive_timeout *at, int val) @@ -381,7 +392,6 @@ extern unsigned int at_max; /* genops.c */ struct obd_export; extern struct obd_import *class_exp2cliimp(struct obd_export *); -extern struct obd_import *class_conn2cliimp(struct lustre_handle *); /** @} import */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h index 4fc76566501ba..4af88af0edf87 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_kernelcomm.h @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2015, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,7 @@ #define __LUSTRE_KERNELCOMM_H__ /* For declarations shared with userspace */ -#include +#include /* prototype for callback function on kuc groups */ typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h index 37f6ee1de49eb..11409b97e66c8 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lfsck.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * lustre/include/lustre_lfsck.h @@ -33,7 +33,7 @@ #ifndef _LUSTRE_LFSCK_H # define _LUSTRE_LFSCK_H -#include +#include #include #include #include @@ -101,10 +101,10 @@ int lfsck_query(const struct lu_env *env, struct dt_device *key, struct lfsck_request *req, struct lfsck_reply *rep, struct lfsck_query *que); -int lfsck_get_speed(struct seq_file *m, struct dt_device *key); +int lfsck_get_speed(struct seq_file *m, char *buf, struct dt_device *key); int lfsck_set_speed(struct dt_device *key, __u32 val); -int lfsck_get_windows(struct seq_file *m, struct dt_device *key); -int lfsck_set_windows(struct dt_device *key, int val); +int lfsck_get_windows(char *buf, struct dt_device *key); +int lfsck_set_windows(struct dt_device *key, unsigned int val); int lfsck_dump(struct seq_file *m, struct dt_device *key, enum lfsck_type type); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h index df1ca627aa4d0..f67791252056d 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lib.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lib.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,11 +42,15 @@ * @{ */ -#include +#ifdef HAVE_SCHED_HEADERS +#include +#include +#endif + #include -#include -#include -#include +#include +#include +#include /* target.c */ struct ptlrpc_request; @@ -69,7 +73,6 @@ int rev_import_init(struct obd_export *exp); int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); void target_destroy_export(struct obd_export *exp); -int target_handle_ping(struct ptlrpc_request *req); void target_committed_to_req(struct ptlrpc_request *req); void target_cancel_recovery_timer(struct obd_device *obd); void target_stop_recovery_thread(struct obd_device *obd); @@ -161,9 +164,9 @@ static inline int back_to_sleep(void *arg) #define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) struct l_wait_info { - cfs_duration_t lwi_timeout; - cfs_duration_t lwi_interval; - int lwi_allow_intr; + long lwi_timeout; + long lwi_interval; + int lwi_allow_intr; int (*lwi_on_timeout)(void *); void (*lwi_on_signal)(void *); void *lwi_cb_data; @@ -255,8 +258,8 @@ static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, #define __l_wait_event(wq, condition, info, ret, l_add_wait) \ do { \ wait_queue_entry_t __wait; \ - cfs_duration_t __timeout = info->lwi_timeout; \ - sigset_t __blocked; \ + long __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ int __allow_intr = info->lwi_allow_intr; \ \ ret = 0; \ @@ -305,13 +308,12 @@ do { \ if (__timeout == 0) { \ schedule(); \ } else { \ - cfs_duration_t interval = info->lwi_interval? \ - min_t(cfs_duration_t, \ - info->lwi_interval,__timeout):\ - __timeout; \ - cfs_duration_t remaining = schedule_timeout(interval); \ - __timeout = cfs_time_sub(__timeout, \ - cfs_time_sub(interval, remaining));\ + long interval = info->lwi_interval ? \ + min_t(long, info->lwi_interval,\ + __timeout) : __timeout; \ + long remaining = schedule_timeout(interval); \ + \ + __timeout -= interval - remaining; \ if (__timeout == 0) { \ if (info->lwi_on_timeout == NULL || \ info->lwi_on_timeout(info->lwi_cb_data)) { \ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h index 89a040f735d5d..3bf6e2b54fd9b 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_linkea.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2014, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * Use is subject to license terms. 
* * Author: di wang diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h index f936973801012..d5fb751524b0b 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -32,7 +32,7 @@ #ifndef _LUSTRE_LMV_H #define _LUSTRE_LMV_H -#include +#include struct lmv_oinfo { struct lu_fid lmo_fid; @@ -46,6 +46,8 @@ struct lmv_stripe_md { __u32 lsm_md_master_mdt_index; __u32 lsm_md_hash_type; __u32 lsm_md_layout_version; + __u32 lsm_md_migrate_offset; + __u32 lsm_md_migrate_hash; __u32 lsm_md_default_count; __u32 lsm_md_default_index; char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; @@ -64,6 +66,10 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || lsm1->lsm_md_layout_version != lsm2->lsm_md_layout_version || + lsm1->lsm_md_migrate_offset != + lsm2->lsm_md_migrate_offset || + lsm1->lsm_md_migrate_hash != + lsm2->lsm_md_migrate_hash || strcmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name) != 0) return false; @@ -76,14 +82,27 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) return true; } + +static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) +{ + int i; + + CDEBUG(mask, "magic %#x stripe count %d master mdt %d hash type %#x " + "version %d migrate offset %d migrate hash %#x pool %s\n", + lsm->lsm_md_magic, lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset, + lsm->lsm_md_migrate_hash, lsm->lsm_md_pool_name); + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + CDEBUG(mask, "stripe[%d] "DFID"\n", + i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); +} + union lmv_mds_md; void lmv_free_memmd(struct lmv_stripe_md *lsm); -int lmvea_load_shards(const struct lu_env *env, struct dt_object *obj, - struct lu_dirent *ent, struct lu_buf *buf, - bool resize); - static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, const struct lmv_mds_md_v1 *lmv_src) { @@ -141,18 +160,14 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, unsigned int stripe_count, const char *name, int namelen) { - int idx; - __u32 hash_type = lmv_hash_type & LMV_HASH_TYPE_MASK; + int idx; LASSERT(namelen > 0); - if (stripe_count <= 1) - return 0; - /* for migrating object, always start from 0 stripe */ - if (lmv_hash_type & LMV_HASH_FLAG_MIGRATION) + if (stripe_count <= 1) return 0; - switch (hash_type) { + switch (lmv_hash_type & LMV_HASH_TYPE_MASK) { case LMV_HASH_TYPE_ALL_CHARS: idx = lmv_hash_all_chars(stripe_count, name, namelen); break; @@ -164,8 +179,8 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, break; } - CDEBUG(D_INFO, "name %.*s hash_type %d idx %d\n", namelen, name, - hash_type, idx); + CDEBUG(D_INFO, "name %.*s hash_type %#x idx %d/%u\n", namelen, name, + lmv_hash_type, idx, stripe_count); return idx; } diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log.h b/drivers/staging/lustrefsx/lustre/include/lustre_log.h index 237da21bf4210..f2522050f7337 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_log.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_log.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,9 +52,9 @@ */ #include -#include #include -#include +#include +#include #define LOG_NAME_LIMIT(logname, name) \ snprintf(logname, sizeof(logname), "LOGS/%s", name) @@ -160,6 +160,7 @@ int llog_cat_process_or_fork(const struct lu_env *env, int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data, int startcat, int startidx); __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh); +__u32 llog_cat_free_space(struct llog_handle *cat_llh); int llog_cat_reverse_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data); @@ -170,8 +171,6 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); -int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags); /* llog_ioctl.c */ struct obd_ioctl_data; @@ -202,8 +201,6 @@ struct llog_operations { int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); - int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags); int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, struct llog_gen *gen, struct obd_uuid *uuid); /** @@ -271,8 +268,8 @@ struct llog_handle { * case, after it will have reached LLOG_HDR_BITMAP_SIZE, llh_cat_idx * will become its upper limit */ int lgh_last_idx; - int lgh_cur_idx; /* used during llog_process */ - __u64 lgh_cur_offset; /* used during llog_process */ + struct rw_semaphore lgh_last_sem; + __u64 lgh_cur_offset; /* used for test only */ struct llog_ctxt *lgh_ctxt; union { struct plain_handle_data phd; @@ -284,7 +281,7 @@ struct llog_handle { atomic_t lgh_refcount; int lgh_max_size; - __u32 lgh_stale:1; + bool lgh_destroyed; }; /* llog_osd.c */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h index be0eb7742e644..826eef7bc646f 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mdc.h @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include #include @@ -63,104 +63,6 @@ struct obd_export; struct ptlrpc_request; struct obd_device; -/** - * Serializes in-flight MDT-modifying RPC requests to preserve idempotency. - * - * This mutex is used to implement execute-once semantics on the MDT. - * The MDT stores the last transaction ID and result for every client in - * its last_rcvd file. If the client doesn't get a reply, it can safely - * resend the request and the MDT will reconstruct the reply being aware - * that the request has already been executed. Without this lock, - * execution status of concurrent in-flight requests would be - * overwritten. - * - * This design limits the extent to which we can keep a full pipeline of - * in-flight requests from a single client. This limitation could be - * overcome by allowing multiple slots per client in the last_rcvd file. - */ -struct mdc_rpc_lock { - /** Lock protecting in-flight RPC concurrency. */ - struct mutex rpcl_mutex; - /** Intent associated with currently executing request. 
*/ - struct lookup_intent *rpcl_it; - /** Used for MDS/RPC load testing purposes. */ - int rpcl_fakes; -}; - -#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) - -static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) -{ - mutex_init(&lck->rpcl_mutex); - lck->rpcl_it = NULL; -} - -static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) -{ - ENTRY; - - if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - return; - - /* This would normally block until the existing request finishes. - * If fail_loc is set it will block until the regular request is - * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set - * it will only be cleared when all fake requests are finished. - * Only when all fake requests are finished can normal requests - * be sent, to ensure they are recoverable again. */ - again: - mutex_lock(&lck->rpcl_mutex); - - if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { - lck->rpcl_it = MDC_FAKE_RPCL_IT; - lck->rpcl_fakes++; - mutex_unlock(&lck->rpcl_mutex); - return; - } - - /* This will only happen when the CFS_FAIL_CHECK() was - * just turned off but there are still requests in progress. - * Wait until they finish. It doesn't need to be efficient - * in this extremely rare case, just have low overhead in - * the common case when it isn't true. */ - while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { - mutex_unlock(&lck->rpcl_mutex); - schedule_timeout(cfs_time_seconds(1) / 4); - goto again; - } - - LASSERT(lck->rpcl_it == NULL); - lck->rpcl_it = it; -} - -static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) -{ - if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - goto out; - - if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ - mutex_lock(&lck->rpcl_mutex); - - LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); - lck->rpcl_fakes--; - - if (lck->rpcl_fakes == 0) - lck->rpcl_it = NULL; - - } else { - LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); - lck->rpcl_it = NULL; - } - - mutex_unlock(&lck->rpcl_mutex); - out: - EXIT; -} - static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req, struct lookup_intent *it) { @@ -234,6 +136,17 @@ static inline void cl_lov_delay_create_clear(unsigned int *flags) *flags &= ~O_LOV_DELAY_CREATE_MASK; } +static inline bool cl_is_lu_noimport(unsigned int flags) +{ + return (flags & O_LU_NOIMPORT_MASK) == O_LU_NOIMPORT_MASK; +} + +static inline void cl_lu_noimport_clear(unsigned int *flags) +{ + if (cl_is_lu_noimport(*flags)) + *flags &= ~O_LU_NOIMPORT_MASK; +} + /** @} mdc */ #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h index c254c7f730f10..cb43281574890 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_mds.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_mds.h @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include @@ -60,13 +60,34 @@ struct mds_capa_info { struct lustre_capa_key *capa; }; +struct md_rejig_data { + struct md_object *mrd_obj; + __u16 mrd_mirror_id; +}; + #define MDD_OBD_NAME "mdd_obd" #define MDD_OBD_UUID "mdd_obd_uuid" -static inline int md_should_create(__u64 flags) +static inline int md_should_create(u64 open_flags) { - return !(flags & MDS_OPEN_DELAY_CREATE) && (flags & FMODE_WRITE) && - !(flags & MDS_OPEN_LEASE); + return 
!(open_flags & MDS_OPEN_DELAY_CREATE) && + (open_flags & MDS_FMODE_WRITE) && + !(open_flags & MDS_OPEN_LEASE); +} + +/* do NOT or the MAY_*'s, you'll get the weakest */ +static inline int mds_accmode(u64 open_flags) +{ + int res = 0; + + if (open_flags & MDS_FMODE_READ) + res |= MAY_READ; + if (open_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC | MDS_OPEN_APPEND)) + res |= MAY_WRITE; + if (open_flags & MDS_FMODE_EXEC) + res = MAY_EXEC; + + return res; } /** @} mds */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_net.h b/drivers/staging/lustrefsx/lustre/include/lustre_net.h index f6d67c832ed64..3a94a921e11de 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_net.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_net.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -50,12 +50,13 @@ * * @{ */ - +#include #include #include -#include #include -#include +#include +#include +#include #include #include #include @@ -63,7 +64,7 @@ #include #include #include -#include +#include /* MD flags we _always_ use */ #define PTLRPC_MD_OPTIONS 0 @@ -75,7 +76,7 @@ * value. The client is free to limit the actual RPC size for any bulk * transfer via cl_max_pages_per_rpc to some non-power-of-two value. * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. */ -#define PTLRPC_BULK_OPS_BITS 4 +#define PTLRPC_BULK_OPS_BITS 6 #if PTLRPC_BULK_OPS_BITS > 16 #error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." #endif @@ -472,19 +473,31 @@ * - single object with 16 pages is 512 bytes * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover * - Must be a multiple of 1024 - * - actual size is about 18K */ -#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \ - sizeof(struct ptlrpc_body) + \ - sizeof(struct obdo) + \ - sizeof(struct obd_ioobj) + \ - sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES) +#define _OST_MAXREQSIZE_BASE ((unsigned long)(sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote))) +#define _OST_MAXREQSIZE_SUM ((unsigned long)(_OST_MAXREQSIZE_BASE + \ + sizeof(struct niobuf_remote) * \ + (DT_MAX_BRW_PAGES - 1))) /** * FIEMAP request can be 4K+ for now */ -#define OST_MAXREQSIZE (16 * 1024) -#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \ - (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1)) +#define OST_MAXREQSIZE (16UL * 1024UL) +#define OST_IO_MAXREQSIZE max(OST_MAXREQSIZE, \ + ((_OST_MAXREQSIZE_SUM - 1) | \ + (1024UL - 1)) + 1) +/* Safe estimate of free space in standard RPC, provides upper limit for # of + * bytes of i/o to pack in RPC (skipping bulk transfer). */ +#define OST_SHORT_IO_SPACE (OST_IO_MAXREQSIZE - _OST_MAXREQSIZE_BASE) + +/* Actual size used for short i/o buffer. Calculation means this: + * At least one page (for large PAGE_SIZE), or 16 KiB, but not more + * than the available space aligned to a page boundary. */ +#define OBD_MAX_SHORT_IO_BYTES min(max(PAGE_SIZE, 16UL * 1024UL), \ + OST_SHORT_IO_SPACE & PAGE_MASK) #define OST_MAXREPSIZE (9 * 1024) #define OST_IO_MAXREPSIZE OST_MAXREPSIZE @@ -498,6 +511,7 @@ */ #define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + /* Macro to hide a typecast. 
*/ #define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) @@ -552,7 +566,6 @@ union ptlrpc_async_args { }; struct ptlrpc_request_set; -typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); /** @@ -574,19 +587,8 @@ struct ptlrpc_request_set { atomic_t set_remaining; /** wait queue to wait on for request events */ wait_queue_head_t set_waitq; - wait_queue_head_t *set_wakeup_ptr; /** List of requests in the set */ struct list_head set_requests; - /** - * List of completion callbacks to be called when the set is completed - * This is only used if \a set_interpret is NULL. - * Links struct ptlrpc_set_cbdata. - */ - struct list_head set_cblist; - /** Completion callback, if only one. */ - set_interpreter_func set_interpret; - /** opaq argument passed to completion \a set_interpret callback. */ - void *set_arg; /** * Lock for \a set_new_requests manipulations * locked so that any old caller can communicate requests to @@ -608,18 +610,6 @@ struct ptlrpc_request_set { unsigned int set_allow_intr:1; }; -/** - * Description of a single ptrlrpc_set callback - */ -struct ptlrpc_set_cbdata { - /** List linkage item */ - struct list_head psc_item; - /** Pointer to interpreting function */ - set_interpreter_func psc_interpret; - /** Opaq argument to pass to the callback */ - void *psc_data; -}; - struct ptlrpc_bulk_desc; struct ptlrpc_service_part; struct ptlrpc_service; @@ -784,9 +774,9 @@ struct ptlrpc_cli_req { /** For bulk requests on client only: bulk descriptor */ struct ptlrpc_bulk_desc *cr_bulk; /** optional time limit for send attempts */ - cfs_duration_t cr_delay_limit; + time64_t cr_delay_limit; /** time request was first queued */ - cfs_time_t cr_queued_time; + time64_t cr_queued_time; /** request sent in nanoseconds */ ktime_t cr_sent_ns; /** time for request really sent out */ @@ -1059,6 +1049,13 @@ struct ptlrpc_request { /** description of flavors for client & server */ struct sptlrpc_flavor rq_flvr; + /** + * SELinux policy info at the time of the request + * sepol string format is: + * ::: + */ + char rq_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; + /* client/server security flags */ unsigned int rq_ctx_init:1, /* context initiation */ @@ -1115,8 +1112,17 @@ struct ptlrpc_request { /** * service time estimate (secs) * If the request is not served by this time, it is marked as timed out. + * Do not change to time64_t since this is transmitted over the wire. + * + * The linux kernel handles timestamps with time64_t and timeouts + * are normally done with jiffies. Lustre shares the rq_timeout between + * nodes. Since jiffies can vary from node to node Lustre instead + * will express the timeout value in seconds. To avoid confusion with + * timestamps (time64_t) and jiffy timeouts (long) Lustre timeouts + * are expressed in s32 (timeout_t). Also what is transmitted over + * the wire is 32 bits. 
*/ - int rq_timeout; + timeout_t rq_timeout; /** * when request/reply sent (secs), or time when request should be sent */ @@ -1173,37 +1179,37 @@ static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) /** @} nrs */ /** - * Returns 1 if request buffer at offset \a index was already swabbed + * Returns true if request buffer at offset \a index was already swabbed */ -static inline int lustre_req_swabbed(struct ptlrpc_request *req, size_t index) +static inline bool lustre_req_swabbed(struct ptlrpc_request *req, size_t index) { - LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); - return req->rq_req_swab_mask & (1 << index); + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); } /** - * Returns 1 if request reply buffer at offset \a index was already swabbed + * Returns true if request reply buffer at offset \a index was already swabbed */ -static inline int lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) +static inline bool lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) { - LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); - return req->rq_rep_swab_mask & (1 << index); + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); } /** - * Returns 1 if request needs to be swabbed into local cpu byteorder + * Returns true if request needs to be swabbed into local cpu byteorder */ -static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +static inline bool ptlrpc_req_need_swab(struct ptlrpc_request *req) { - return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); } /** - * Returns 1 if request reply needs to be swabbed into local cpu byteorder + * Returns true if request reply needs to be swabbed into local cpu byteorder */ -static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +static inline bool ptlrpc_rep_need_swab(struct ptlrpc_request *req) { - return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); } /** @@ -1438,6 +1444,8 @@ extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kvec_ops; * Another user is readpage for MDT. 
*/ struct ptlrpc_bulk_desc { + /** number MD's assigned including zero-sends */ + unsigned int bd_refs; /** completed with failure */ unsigned long bd_failure:1; /** client side */ @@ -1462,6 +1470,7 @@ struct ptlrpc_bulk_desc { int bd_max_iov; /* allocated size of bd_iov */ int bd_nob; /* # bytes covered */ int bd_nob_transferred; /* # bytes GOT/PUT */ + unsigned int bd_nob_last; /* # bytes in last MD */ __u64 bd_last_mbits; @@ -1469,6 +1478,8 @@ struct ptlrpc_bulk_desc { lnet_nid_t bd_sender; /* stash event::sender */ int bd_md_count; /* # valid entries in bd_mds */ int bd_md_max_brw; /* max entries in bd_mds */ + /** array of offsets for each MD */ + unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; /** array of associated MDs */ struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; @@ -1693,8 +1704,8 @@ struct ptlrpc_service { int srv_nthrs_cpt_init; /** limit of threads number for each partition */ int srv_nthrs_cpt_limit; - /** Root of /proc dir tree for this service */ - struct proc_dir_entry *srv_procroot; + /** Root of debugfs dir tree for this service */ + struct dentry *srv_debugfs_entry; /** Pointer to statistic data for this service */ struct lprocfs_stats *srv_stats; /** # hp per lp reqs to handle */ @@ -1720,17 +1731,25 @@ struct ptlrpc_service { int srv_watchdog_factor; /** under unregister_service */ unsigned srv_is_stopping:1; + /** Whether or not to restrict service threads to CPUs in this CPT */ + unsigned srv_cpt_bind:1; + /** max # request buffers */ + int srv_nrqbds_max; /** max # request buffers in history per partition */ int srv_hist_nrqbds_cpt_max; - /** number of CPTs this service bound on */ + /** number of CPTs this service associated with */ int srv_ncpts; - /** CPTs array this service bound on */ + /** CPTs array this service associated with */ __u32 *srv_cpts; /** 2^srv_cptab_bits >= cfs_cpt_numbert(srv_cptable) */ int srv_cpt_bits; /** CPT table this service is running over */ struct cfs_cpt_table *srv_cptable; + + /* sysfs object */ + struct kobject srv_kobj; + struct completion srv_kobj_unregister; /** * partition data for ptlrpc service */ @@ -1777,6 +1796,8 @@ struct ptlrpc_service_part { * threads starting & stopping are also protected by this lock. */ spinlock_t scp_lock __cfs_cacheline_aligned; + /** userland serialization */ + struct mutex scp_mutex; /** total # req buffer descs allocated */ int scp_nrqbds_total; /** # posted request buffers for receiving */ @@ -1791,8 +1812,8 @@ struct ptlrpc_service_part { struct list_head scp_rqbd_posted; /** incoming reqs */ struct list_head scp_req_incoming; - /** timeout before re-posting reqs, in tick */ - cfs_duration_t scp_rqbd_timeout; + /** timeout before re-posting reqs, in jiffies */ + long scp_rqbd_timeout; /** * all threads sleep on this. This wait-queue is signalled when new * incoming request arrives and when difficult reply has to be handled. 
@@ -1843,7 +1864,7 @@ struct ptlrpc_service_part { /** early reply timer */ struct timer_list scp_at_timer; /** debug */ - cfs_time_t scp_at_checktime; + ktime_t scp_at_checktime; /** check early replies */ unsigned scp_at_check; /** @} */ @@ -2061,7 +2082,7 @@ static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); spin_lock(&desc->bd_lock); - rc = desc->bd_md_count; + rc = desc->bd_refs; spin_unlock(&desc->bd_lock); return rc; } @@ -2078,14 +2099,15 @@ static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) LASSERT(req != NULL); desc = req->rq_bulk; + if (!desc) + return 0; + if (req->rq_bulk_deadline > ktime_get_real_seconds()) return 1; - if (!desc) - return 0; spin_lock(&desc->bd_lock); - rc = desc->bd_md_count; + rc = desc->bd_refs; spin_unlock(&desc->bd_lock); return rc; } @@ -2125,10 +2147,8 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set); struct ptlrpc_request_set *ptlrpc_prep_set(void); struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, void *arg); -int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, - set_interpreter_func fn, void *data); int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); -int ptlrpc_set_wait(struct ptlrpc_request_set *); +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *); void ptlrpc_mark_interrupted(struct ptlrpc_request *req); void ptlrpc_set_destroy(struct ptlrpc_request_set *); void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); @@ -2245,8 +2265,8 @@ struct ptlrpc_service_thr_conf { /* user specified threads number, it will be validated due to * other members of this structure. */ unsigned int tc_nthrs_user; - /* set NUMA node affinity for service threads */ - unsigned int tc_cpu_affinity; + /* bind service threads to only CPUs in their associated CPT */ + unsigned int tc_cpu_bind; /* Tags for lu_context associated with service thread */ __u32 tc_ctx_tags; }; @@ -2255,6 +2275,8 @@ struct ptlrpc_service_cpt_conf { struct cfs_cpt_table *cc_cptable; /* string pattern to describe CPTs for a service */ char *cc_pattern; + /* whether or not to have per-CPT service partitions */ + bool cc_affinity; }; struct ptlrpc_service_conf { @@ -2287,18 +2309,18 @@ void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); int ptlrpc_hpreq_handler(struct ptlrpc_request *req); struct ptlrpc_service *ptlrpc_register_service( struct ptlrpc_service_conf *conf, - struct proc_dir_entry *proc_entry); + struct kset *parent, + struct dentry *debugfs_entry); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_threads(struct ptlrpc_service *svc); int ptlrpc_unregister_service(struct ptlrpc_service *service); -int liblustre_check_services(void *arg); -void ptlrpc_daemonize(char *name); int ptlrpc_service_health_check(struct ptlrpc_service *); void ptlrpc_server_drop_request(struct ptlrpc_request *req); void ptlrpc_request_change_export(struct ptlrpc_request *req, struct obd_export *export); -void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay); +void ptlrpc_update_export_timer(struct obd_export *exp, + time64_t extra_delay); int ptlrpc_hr_init(void); void ptlrpc_hr_fini(void); @@ -2311,8 +2333,10 @@ void ptlrpc_hr_fini(void); * @{ */ int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_connect_import_locked(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp); int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int 
ptlrpc_disconnect_and_idle_import(struct obd_import *imp); int ptlrpc_import_recovery_state_machine(struct obd_import *imp); void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len); @@ -2326,8 +2350,14 @@ int ptlrpc_reconnect_import(struct obd_import *imp); * * @{ */ -int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - __u32 index); +#define PTLRPC_MAX_BUFCOUNT \ + (sizeof(((struct ptlrpc_request *)0)->rq_req_swab_mask) * 8) +#define MD_MAX_BUFLEN (MDS_REG_MAXREQSIZE > OUT_MAXREQSIZE ? \ + MDS_REG_MAXREQSIZE : OUT_MAXREQSIZE) +#define PTLRPC_MAX_BUFLEN (OST_IO_MAXREQSIZE > MD_MAX_BUFLEN ? \ + OST_IO_MAXREQSIZE : MD_MAX_BUFLEN) +bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index); void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, __u32 index); int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); @@ -2370,7 +2400,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg); void lustre_msg_add_op_flags(struct lustre_msg *msg, __u32 flags); struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); __u32 lustre_msg_get_type(struct lustre_msg *msg); -__u32 lustre_msg_get_version(struct lustre_msg *msg); +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg); void lustre_msg_add_version(struct lustre_msg *msg, __u32 version); __u32 lustre_msg_get_opc(struct lustre_msg *msg); __u64 lustre_msg_get_last_xid(struct lustre_msg *msg); @@ -2385,8 +2415,8 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); int lustre_msg_get_status(struct lustre_msg *msg); __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); __u32 lustre_msg_get_magic(struct lustre_msg *msg); -__u32 lustre_msg_get_timeout(struct lustre_msg *msg); -__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg); +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg); char *lustre_msg_get_jobid(struct lustre_msg *msg); __u32 lustre_msg_get_cksum(struct lustre_msg *msg); __u64 lustre_msg_get_mbits(struct lustre_msg *msg); @@ -2403,8 +2433,9 @@ void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); void ptlrpc_request_set_replen(struct ptlrpc_request *req); -void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); -void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout); +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout); void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits); @@ -2588,11 +2619,8 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) { if (req->rq_delay_limit != 0 && - cfs_time_before(cfs_time_add(req->rq_queued_time, - cfs_time_seconds(req->rq_delay_limit)), - cfs_time_current())) { + req->rq_queued_time + req->rq_delay_limit < ktime_get_seconds()) return 1; - } return 0; } @@ -2659,11 +2687,6 @@ struct timeout_item; typedef int (*timeout_cb_t)(struct timeout_item *, void *); int ptlrpc_pinger_add_import(struct obd_import *imp); int ptlrpc_pinger_del_import(struct obd_import *imp); 
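The hunk above also drops the cfs_time_*() jiffies arithmetic from ptlrpc_send_limit_expired() in favour of plain second counters. A minimal stand-alone sketch of the same deadline test follows; the function and parameter names here are illustrative, not the Lustre ones, and only mirror the converted logic.

#include <linux/types.h>
#include <linux/time64.h>
#include <linux/timekeeping.h>

/* Both values are plain seconds (time64_t), so the deadline check is
 * simple integer arithmetic with no jiffies or HZ involved.
 * A delay limit of 0 means "no limit". */
static inline bool send_limit_expired(time64_t queued_time,
				      time64_t delay_limit)
{
	if (delay_limit == 0)
		return false;

	return queued_time + delay_limit < ktime_get_seconds();
}
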
-int ptlrpc_add_timeout_client(int time, enum timeout_event event, - timeout_cb_t cb, void *data, - struct list_head *obd_list); -int ptlrpc_del_timeout_client(struct list_head *obd_list, - enum timeout_event event); struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); int ptlrpc_obd_ping(struct obd_device *obd); void ping_evictor_start(void); @@ -2702,11 +2725,9 @@ static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} /* ptlrpc/llog_server.c */ int llog_origin_handle_open(struct ptlrpc_request *req); -int llog_origin_handle_destroy(struct ptlrpc_request *req); int llog_origin_handle_prev_block(struct ptlrpc_request *req); int llog_origin_handle_next_block(struct ptlrpc_request *req); int llog_origin_handle_read_header(struct ptlrpc_request *req); -int llog_origin_handle_close(struct ptlrpc_request *req); /* ptlrpc/llog_client.c */ extern struct llog_operations llog_client_ops; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h index 7cabc6f2424d7..9d200bf651b64 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nodemap.h @@ -21,13 +21,16 @@ */ /* * Copyright (C) 2013, Trustees of Indiana University + * + * Copyright (c) 2017, Intel Corporation. + * * Author: Joshua Walgenbach */ #ifndef _LUSTRE_NODEMAP_H #define _LUSTRE_NODEMAP_H -#include +#include #define LUSTRE_NODEMAP_NAME "nodemap" @@ -73,7 +76,8 @@ struct lu_nodemap { nmf_deny_unknown:1, nmf_allow_root_access:1, nmf_map_uid_only:1, - nmf_map_gid_only:1; + nmf_map_gid_only:1, + nmf_enable_audit:1; /* unique ID set by MGS */ unsigned int nm_id; /* nodemap ref counter */ @@ -102,6 +106,8 @@ struct lu_nodemap { struct nodemap_pde *nm_pde_data; /* fileset the nodes of this nodemap are restricted to */ char nm_fileset[PATH_MAX+1]; + /* information about the expected SELinux policy on the nodes */ + char nm_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + 1]; /* used when loading/unloading nodemaps */ struct list_head nm_list; @@ -132,6 +138,7 @@ int nodemap_set_deny_unknown(const char *name, bool deny_unknown); int nodemap_set_mapping_mode(const char *name, enum nodemap_mapping_modes mode); int nodemap_set_squash_uid(const char *name, uid_t uid); int nodemap_set_squash_gid(const char *name, gid_t gid); +int nodemap_set_audit_mode(const char *name, bool enable_audit); bool nodemap_can_setquota(const struct lu_nodemap *nodemap); int nodemap_add_idmap(const char *name, enum nodemap_id_type id_type, const __u32 map[2]); @@ -139,6 +146,8 @@ int nodemap_del_idmap(const char *name, enum nodemap_id_type id_type, const __u32 map[2]); int nodemap_set_fileset(const char *name, const char *fileset); char *nodemap_get_fileset(const struct lu_nodemap *nodemap); +int nodemap_set_sepol(const char *name, const char *sepol); +const char *nodemap_get_sepol(const struct lu_nodemap *nodemap); __u32 nodemap_map_id(struct lu_nodemap *nodemap, enum nodemap_id_type id_type, enum nodemap_tree_type tree_type, __u32 id); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h index 6e0c736ab8d87..0a407197c36f6 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_nrs_tbf.h @@ -51,7 +51,31 @@ struct nrs_tbf_jobid { struct list_head tj_linkage; }; -#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + 3 + 2) +#define MAX_U32_STR_LEN 10 
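MAX_U32_STR_LEN above, together with the NRS_TBF_KEY_LEN budget that follows, sizes the per-client TBF hash key so it can also carry a UID and a GID rendered as decimal u32 strings next to the NID and jobid. A rough illustration of how such a key could be composed is below; the exact separator layout used by the TBF policy may differ, and the names are hypothetical.

#include <linux/kernel.h>
#include <linux/types.h>

#define DEMO_U32_STR_LEN 10	/* strlen("4294967295"), NUL not counted */

/* Build a "nid_jobid_uid_gid" style key into buf; returns the number
 * of characters that would have been written, as snprintf() does. */
static int demo_tbf_key(char *buf, size_t len, const char *nidstr,
			const char *jobid, u32 uid, u32 gid)
{
	return snprintf(buf, len, "%s_%s_%u_%u", nidstr, jobid, uid, gid);
}
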
+#define NRS_TBF_KEY_LEN (LNET_NIDSTR_SIZE + LUSTRE_JOBID_SIZE + \ + MAX_U32_STR_LEN + MAX_U32_STR_LEN + 3 + 2) + +enum nrs_tbf_flag { + NRS_TBF_FLAG_INVALID = 0x0000000, + NRS_TBF_FLAG_JOBID = 0x0000001, + NRS_TBF_FLAG_NID = 0x0000002, + NRS_TBF_FLAG_OPCODE = 0x0000004, + NRS_TBF_FLAG_GENERIC = 0x0000008, + NRS_TBF_FLAG_UID = 0x0000010, + NRS_TBF_FLAG_GID = 0x0000020, +}; + +struct tbf_id { + enum nrs_tbf_flag ti_type; + u32 ti_uid; + u32 ti_gid; +}; + +struct nrs_tbf_id { + struct tbf_id nti_id; + struct list_head nti_linkage; +}; + struct nrs_tbf_client { /** Resource object for policy instance. */ struct ptlrpc_nrs_resource tc_res; @@ -63,6 +87,8 @@ struct nrs_tbf_client { char tc_jobid[LUSTRE_JOBID_SIZE]; /** opcode of the client. */ __u32 tc_opcode; + /** gid or uid of the client. */ + struct tbf_id tc_id; /** Hash key of the client. */ char tc_key[NRS_TBF_KEY_LEN]; /** Reference number of the client. */ @@ -85,6 +111,13 @@ struct nrs_tbf_client { __u64 tc_depth; /** Time check-point. */ __u64 tc_check_time; + /** Deadline of a class */ + __u64 tc_deadline; + /** + * Time residue: the remainder of elapsed time + * divided by nsecs when dequeue a request. + */ + __u64 tc_nsecs_resid; /** List of queued requests. */ struct list_head tc_list; /** Node in binary heap. */ @@ -102,8 +135,11 @@ struct nrs_tbf_client { #define MAX_TBF_NAME (16) -#define NTRS_STOPPING 0x0000001 -#define NTRS_DEFAULT 0x0000002 +enum nrs_rule_flags { + NTRS_STOPPING = 0x00000001, + NTRS_DEFAULT = 0x00000002, + NTRS_REALTIME = 0x00000004, +}; struct nrs_tbf_rule { /** Name of the rule. */ @@ -120,6 +156,10 @@ struct nrs_tbf_rule { struct list_head tr_jobids; /** Jobid list string of the rule.*/ char *tr_jobids_str; + /** uid/gid list of the rule. */ + struct list_head tr_ids; + /** uid/gid list string of the rule. */ + char *tr_ids_str; /** Opcode bitmap of the rule. */ struct cfs_bitmap *tr_opcodes; /** Opcode list string of the rule.*/ @@ -139,7 +179,7 @@ struct nrs_tbf_rule { /** List of client. */ struct list_head tr_cli_list; /** Flags of the rule. */ - __u32 tr_flags; + enum nrs_rule_flags tr_flags; /** Usage Reference count taken on the rule. */ atomic_t tr_ref; /** Generation of the rule. 
*/ @@ -168,16 +208,10 @@ struct nrs_tbf_ops { #define NRS_TBF_TYPE_NID "nid" #define NRS_TBF_TYPE_OPCODE "opcode" #define NRS_TBF_TYPE_GENERIC "generic" +#define NRS_TBF_TYPE_UID "uid" +#define NRS_TBF_TYPE_GID "gid" #define NRS_TBF_TYPE_MAX_LEN 20 -enum nrs_tbf_flag { - NRS_TBF_FLAG_INVALID = 0x0000000, - NRS_TBF_FLAG_JOBID = 0x0000001, - NRS_TBF_FLAG_NID = 0x0000002, - NRS_TBF_FLAG_OPCODE = 0x0000004, - NRS_TBF_FLAG_GENERIC = 0x0000008, -}; - struct nrs_tbf_type { const char *ntt_name; enum nrs_tbf_flag ntt_flag; @@ -270,12 +304,14 @@ struct nrs_tbf_cmd { char *ts_nids_str; struct list_head ts_jobids; char *ts_jobids_str; + struct list_head ts_ids; + char *ts_ids_str; struct cfs_bitmap *ts_opcodes; char *ts_opcodes_str; struct list_head ts_conds; char *ts_conds_str; __u32 ts_valid_type; - __u32 ts_rule_flags; + enum nrs_rule_flags ts_rule_flags; char *ts_next_name; } tc_start; struct nrs_tbf_cmd_change { @@ -289,6 +325,8 @@ enum nrs_tbf_field { NRS_TBF_FIELD_NID, NRS_TBF_FIELD_JOBID, NRS_TBF_FIELD_OPCODE, + NRS_TBF_FIELD_UID, + NRS_TBF_FIELD_GID, NRS_TBF_FIELD_MAX }; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h index d3afac961b043..dd99eee5af714 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_obdo.h @@ -35,7 +35,7 @@ #ifndef _LUSTRE_OBDO_H_ #define _LUSTRE_OBDO_H_ -#include +#include /** * Create an obdo to send over the wire diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h similarity index 52% rename from drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h rename to drivers/staging/lustrefsx/lustre/include/lustre_osc.h index 7e6cbc017dfde..f865036f897cf 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_osc.h @@ -23,35 +23,99 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. */ /* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. + * lustre/include/lustre_osc.h * - * Internal interfaces of OSC layer. + * OSC layer structures and methods common for both OSC and MDC. + * + * This file contains OSC interfaces used by OSC and MDC. Most of them + * were just moved from lustre/osc/osc_cl_internal.h for Data-on-MDT + * purposes. 
* * Author: Nikita Danilov * Author: Jinshan Xiong + * Author: Mikhail Pershin */ -#ifndef OSC_CL_INTERNAL_H -#define OSC_CL_INTERNAL_H +#ifndef LUSTRE_OSC_H +#define LUSTRE_OSC_H #include #include -/* osc_build_res_name() */ #include -#include "osc_internal.h" /** \defgroup osc osc * @{ */ +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + __u32 oqi_id; +}; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + loff_t oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) +{ + return container_of(pga, struct osc_async_page, oap_brw_page); +} + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + time64_t od_contention_time; + int od_lockless_truncate; +}; + struct osc_extent; /** @@ -63,7 +127,9 @@ struct osc_io { /** true if this io is lockless. */ unsigned int oi_lockless:1, /** true if this io is counted as active IO */ - oi_is_active:1; + oi_is_active:1, + /** true if this io has CAP_SYS_RESOURCE */ + oi_cap_sys_resource:1; /** how many LRU pages are reserved for this IO */ unsigned long oi_lru_reserved; @@ -78,8 +144,8 @@ struct osc_io { struct obdo oi_oa; struct osc_async_cbargs { bool opc_rpc_sent; - int opc_rc; - struct completion opc_sync; + int opc_rc; + struct completion opc_sync; } oi_cbarg; }; @@ -87,7 +153,7 @@ struct osc_io { * State maintained by osc layer for the duration of a system call. */ struct osc_session { - struct osc_io os_io; + struct osc_io os_io; }; #define OTI_PVEC_SIZE 256 @@ -99,6 +165,7 @@ struct osc_thread_info { struct lustre_handle oti_handle; struct cl_page_list oti_plist; struct cl_io oti_io; + struct pagevec oti_pagevec; void *oti_pvec[OTI_PVEC_SIZE]; /** * Fields used by cl_lock_discard_pages(). 
@@ -110,21 +177,88 @@ struct osc_thread_info { struct lu_buf oti_ladvise_buf; }; +static inline __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags); + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_GLIMPSE) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_FL_AST_DISCARD_DATA; + if (enqflags & CEF_PEEK) + result |= LDLM_FL_TEST_LOCK; + if (enqflags & CEF_LOCK_MATCH) + result |= LDLM_FL_MATCH_LOCK; + if (enqflags & CEF_LOCK_NO_EXPAND) + result |= LDLM_FL_NO_EXPANSION; + if (enqflags & CEF_SPECULATIVE) + result |= LDLM_FL_SPECULATIVE; + return result; +} + +typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, + int rc); + +struct osc_enqueue_args { + struct obd_export *oa_exp; + enum ldlm_type oa_type; + enum ldlm_mode oa_mode; + __u64 *oa_flags; + osc_enqueue_upcall_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle oa_lockh; + bool oa_speculative; +}; + +/** + * Bit flags for osc_dlm_lock_at_pageoff(). + */ +enum osc_dap_flags { + /** + * Just check if the desired lock exists, it won't hold reference + * count on lock. + */ + OSC_DAP_FL_TEST_LOCK = 1 << 0, + /** + * Return the lock even if it is being canceled. + */ + OSC_DAP_FL_CANCELING = 1 << 1 +}; + +/* + * The set of operations which are different for MDC and OSC objects + */ +struct osc_object_operations { + void (*oto_build_res_name)(struct osc_object *osc, + struct ldlm_res_id *resname); + struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags); +}; + struct osc_object { - struct cl_object oo_cl; - struct lov_oinfo *oo_oinfo; - /** - * True if locking against this stripe got -EUSERS. - */ - int oo_contended; - cfs_time_t oo_contention_time; + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + ktime_t oo_contention_time; #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK - /** - * IO context used for invariant checks in osc_lock_has_pages(). - */ - struct cl_io oo_debug_io; - /** Serialization object for osc_object::oo_debug_io. */ - struct mutex oo_debug_mutex; + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; #endif /** * used by the osc to keep track of what objects to build into rpcs. @@ -138,7 +272,7 @@ struct osc_object { /** * extent is a red black tree to manage (async) dirty pages. */ - struct rb_root oo_root; + struct rb_root oo_root; /** * Manage write(dirty) extents. */ @@ -148,12 +282,12 @@ struct osc_object { struct list_head oo_reading_exts; - atomic_t oo_nr_reads; - atomic_t oo_nr_writes; + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; /** Protect extent tree. Will be used to protect * oo_{read|write}_pages soon. 
*/ - spinlock_t oo_lock; + spinlock_t oo_lock; /** * Radix tree for caching pages @@ -169,8 +303,25 @@ struct osc_object { /** number of active IOs of this object */ atomic_t oo_nr_ios; wait_queue_head_t oo_io_waitq; + + const struct osc_object_operations *oo_obj_ops; + bool oo_initialized; }; +static inline void osc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + return osc->oo_obj_ops->oto_build_res_name(osc, resname); +} + +static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags) +{ + return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags); +} + static inline void osc_object_lock(struct osc_object *obj) { spin_lock(&obj->oo_lock); @@ -200,15 +351,27 @@ static inline int osc_object_is_locked(struct osc_object *obj) #endif } +static inline void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = ktime_get(); + /* mb(); */ + obj->oo_contended = 1; +} + +static inline void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + /* * Lock "micro-states" for osc layer. */ enum osc_lock_state { - OLS_NEW, - OLS_ENQUEUED, - OLS_UPCALL_RECEIVED, - OLS_GRANTED, - OLS_CANCELLED + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_CANCELLED }; /** @@ -271,55 +434,68 @@ struct osc_lock { /** DLM flags with which osc_lock::ols_lock was enqueued */ __u64 ols_flags; /** osc_lock::ols_lock handle */ - struct lustre_handle ols_handle; + struct lustre_handle ols_handle; struct ldlm_enqueue_info ols_einfo; - enum osc_lock_state ols_state; + enum osc_lock_state ols_state; /** lock value block */ struct ost_lvb ols_lvb; - - /** - * true, if ldlm_lock_addref() was called against - * osc_lock::ols_lock. This is used for sanity checking. - * - * \see osc_lock::ols_has_ref - */ - unsigned ols_hold :1, - /** - * this is much like osc_lock::ols_hold, except that this bit is - * cleared _after_ reference in released in osc_lock_unuse(). This - * fine distinction is needed because: - * - * - if ldlm lock still has a reference, osc_ast_data_get() needs - * to return associated cl_lock (so that a flag is needed that is - * cleared after ldlm_lock_decref() returned), and - * - * - ldlm_lock_decref() can invoke blocking ast (for a - * LDLM_FL_CBPENDING lock), and osc_lock functions like - * osc_lock_cancel() called from there need to know whether to - * release lock reference (so that a flag is needed that is - * cleared before ldlm_lock_decref() is called). - */ - ols_has_ref:1, - /** - * inherit the lockless attribute from top level cl_io. - * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. - */ - ols_locklessable:1, - /** - * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat - * the EVAVAIL error as torerable, this will make upper logic happy - * to wait all glimpse locks to each OSTs to be completed. - * Glimpse lock converts to normal lock if the server lock is - * granted. - * Glimpse lock should be destroyed immediately after use. - */ - ols_glimpse:1, - /** - * For async glimpse lock. - */ - ols_agl:1; + /** Lockless operations to be used by lockless lock */ + const struct cl_lock_operations *ols_lockless_ops; + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. 
+ * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as torerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1, + /** + * for speculative locks - asynchronous glimpse locks and ladvise + * lockahead manual lock requests + * + * Used to tell osc layer to not wait for the ldlm reply from the + * server, so the osc lock will be short lived - It only exists to + * create the ldlm request and is not updated on request completion. + */ + ols_speculative:1; }; +static inline int osc_lock_is_lockless(const struct osc_lock *ols) +{ + return (ols->ols_cl.cls_ops == ols->ols_lockless_ops); +} /** * Page state private for osc layer. @@ -348,7 +524,7 @@ struct osc_page { /** * in LRU? */ - ops_in_lru:1, + ops_in_lru:1, /** * Set if the page must be transferred with OBD_BRW_SRVLOCK. */ @@ -364,7 +540,19 @@ struct osc_page { /** * Submit time - the time when the page is starting RPC. For debugging. 
*/ - cfs_time_t ops_submit_time; + ktime_t ops_submit_time; +}; + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + u32 aa_page_count; + s32 aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; }; extern struct kmem_cache *osc_lock_kmem; @@ -372,32 +560,27 @@ extern struct kmem_cache *osc_object_kmem; extern struct kmem_cache *osc_thread_kmem; extern struct kmem_cache *osc_session_kmem; extern struct kmem_cache *osc_extent_kmem; +extern struct kmem_cache *osc_quota_kmem; +extern struct kmem_cache *osc_obdo_kmem; -extern struct lu_device_type osc_device_type; extern struct lu_context_key osc_key; extern struct lu_context_key osc_session_key; #define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) -int osc_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io); -int osc_io_init (const struct lu_env *env, - struct cl_object *obj, struct cl_io *io); -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); +/* osc_page.c */ int osc_page_init(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t ind); - -void osc_index2policy(union ldlm_policy_data *policy, - const struct cl_object *obj, pgoff_t start, pgoff_t end); -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb); - +void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); void osc_page_submit(const struct lu_env *env, struct osc_page *opg, enum cl_req_type crt, int brw_flags); +int lru_queue_work(const struct lu_env *env, void *data); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); + +/* osc_cache.c */ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, u32 async_flags); @@ -411,8 +594,9 @@ int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, struct osc_page *ops); int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, struct osc_page *ops); -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags); +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags); int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, __u64 size, struct osc_extent **extp); void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); @@ -420,59 +604,161 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end, int hp, int discard); int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end); -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc); -int lru_queue_work(const struct lu_env *env, void *data); +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} -void osc_object_set_contended (struct osc_object *obj); -void osc_object_clear_contended(struct osc_object *obj); 
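The contention helpers whose prototypes are removed here become static inlines in the new lustre_osc.h, with the timestamp switched from cfs_time_t to ktime_t. A simplified, self-contained sketch of that pattern follows; the struct and helper names are illustrative, while the real code is osc_object_set_contended() and friends together with osc_device::od_contention_time.

#include <linux/ktime.h>
#include <linux/types.h>

struct demo_obj {
	int	contended;		/* last lock attempt hit -EUSERS */
	ktime_t	contention_time;	/* when contention was observed */
};

static inline void demo_set_contended(struct demo_obj *obj)
{
	obj->contention_time = ktime_get();
	obj->contended = 1;
}

static inline void demo_clear_contended(struct demo_obj *obj)
{
	obj->contended = 0;
}

/* Contention is considered over once window_sec seconds have passed
 * since the last stamp. */
static inline bool demo_is_contended(const struct demo_obj *obj,
				     time64_t window_sec)
{
	if (!obj->contended)
		return false;

	return ktime_before(ktime_get(),
			    ktime_add(obj->contention_time,
				      ktime_set(window_sec, 0)));
}
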
-int osc_object_is_contended (struct osc_object *obj); +static inline int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + return osc_io_unplug0(env, cli, osc, 1); +} -int osc_lock_is_lockless (const struct osc_lock *olck); +static inline void osc_io_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_object *osc) +{ + (void)osc_io_unplug0(env, cli, osc, 0); +} + +typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, + struct osc_page *, void *); +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata); +int osc_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata); + +/* osc_dev.c */ +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d); +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d); + +/* osc_object.c */ +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void osc_object_free(const struct lu_env *env, struct lu_object *obj); +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj); +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb); +int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); +int osc_object_is_contended(struct osc_object *obj); +int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj, + ldlm_iterator_t iter, void *data); +int osc_object_prune(const struct lu_env *env, struct cl_object *obj); + +/* osc_request.c */ +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd); +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg); +int osc_precleanup_common(struct obd_device *obd); +int osc_cleanup_common(struct obd_device *obd); +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set); +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg); +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata); +int osc_disconnect(struct obd_export *exp); +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie); + +/* osc_io.c */ +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue); +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb); +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_write_iter_init(const struct lu_env *env, + const 
struct cl_io_slice *ios); +void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios); +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios); +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice); +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice); +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio); +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice); +void osc_read_ahead_release(const struct lu_env *env, void *cbdata); + +/* osc_lock.c */ +void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols, + int force); +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl); +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl); +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl); +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice); +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice); +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice); +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); /***************************************************************************** * - * Accessors. + * Accessors and type conversions. * */ - static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) { - struct osc_thread_info *info; + struct osc_thread_info *info; - info = lu_context_key_get(&env->le_ctx, &osc_key); - LASSERT(info != NULL); - return info; + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; } static inline struct osc_session *osc_env_session(const struct lu_env *env) { - struct osc_session *ses; + struct osc_session *ses; - ses = lu_context_key_get(env->le_ses, &osc_session_key); - LASSERT(ses != NULL); - return ses; + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; } static inline struct osc_io *osc_env_io(const struct lu_env *env) { - return &osc_env_session(env)->os_io; -} - -static inline int osc_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &osc_device_type; + return &osc_env_session(env)->os_io; } static inline struct osc_device *lu2osc_dev(const struct lu_device *d) { - LINVRNT(d->ld_type == &osc_device_type); - return container_of0(d, struct osc_device, od_cl.cd_lu_dev); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); } static inline struct obd_export *osc_export(const struct osc_object *obj) { - return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; } static inline struct client_obd *osc_cli(const struct osc_object *obj) @@ -482,8 +768,7 @@ static inline struct client_obd *osc_cli(const struct osc_object *obj) static inline struct osc_object *cl2osc(const struct cl_object *obj) { - LINVRNT(osc_is_object(&obj->co_lu)); - return container_of0(obj, struct osc_object, oo_cl); + return container_of0(obj, struct osc_object, oo_cl); } static inline struct cl_object *osc2cl(const 
struct osc_object *obj) @@ -491,6 +776,36 @@ static inline struct cl_object *osc2cl(const struct osc_object *obj) return (struct cl_object *)&obj->oo_cl; } +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, + od_cl.cd_lu_dev); +} + +static inline struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +static inline struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static inline struct osc_object *lu2osc(const struct lu_object *obj) +{ + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +static inline struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + + LINVRNT(oio == osc_env_io(env)); + return oio; +} + static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) { LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); @@ -513,8 +828,7 @@ static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) { - LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct osc_page, ops_cl); + return container_of0(slice, struct osc_page, ops_cl); } static inline struct osc_page *oap2osc(struct osc_async_page *oap) @@ -549,18 +863,12 @@ osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) { - LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); - return container_of0(slice, struct osc_lock, ols_cl); -} - -static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) -{ - return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); + return container_of0(slice, struct osc_lock, ols_cl); } static inline int osc_io_srvlock(struct osc_io *oio) { - return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); } enum osc_extent_state { @@ -626,7 +934,9 @@ struct osc_extent { oe_hp:1, /** this extent should be written back asap. set if one of pages is * called by page WB daemon, or sync write or reading requests. */ - oe_urgent:1; + oe_urgent:1, + /** Non-delay RPC should be used for this extent. */ + oe_ndelay:1; /** how many grants allocated for this extent. * Grant allocated for this extent. There is no grant allocated * for reading extents and sync write extents. 
*/ @@ -660,20 +970,10 @@ struct osc_extent { int oe_rc; /** max pages per rpc when this extent was created */ unsigned int oe_mppr; + /** FLR: layout version when this osc_extent is publised */ + __u32 oe_layout_version; }; -int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, - int sent, int rc); -int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); - -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, bool discard_pages); - -typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, - struct osc_page *, void *); -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata); /** @} osc */ -#endif /* OSC_CL_INTERNAL_H */ +#endif /* LUSTRE_OSC_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h index 2ad8bce19ac53..b6070871e555c 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_patchless_compat.h @@ -111,26 +111,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) # define ll_d_count(d) ((d)->d_count) #endif /* HAVE_DCACHE_LOCK */ -#ifdef ATTR_OPEN -# define ATTR_FROM_OPEN ATTR_OPEN -#else -# ifndef ATTR_FROM_OPEN -# define ATTR_FROM_OPEN 0 -# endif -#endif /* ATTR_OPEN */ - -#ifndef ATTR_RAW -#define ATTR_RAW 0 -#endif - -#ifndef ATTR_CTIME_SET -/* - * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other - * ATTR_* attributes (see bug 13828) - */ -#define ATTR_CTIME_SET (1 << 28) -#endif - #ifndef HAVE_IN_COMPAT_SYSCALL #define in_compat_syscall is_compat_task #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h index 8cb25d2374322..17ff2da6240ca 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_quota.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_quota.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * Use is subject to license terms. */ @@ -175,13 +175,22 @@ struct qsd_instance; * Below are the function prototypes to be used by OSD layer to manage quota * enforcement. Arguments are documented where each function is defined. 
*/ +/* flags for quota local enforcement */ +enum osd_quota_local_flags { + QUOTA_FL_OVER_USRQUOTA = 1 << 0, + QUOTA_FL_OVER_GRPQUOTA = 1 << 1, + QUOTA_FL_SYNC = 1 << 2, + QUOTA_FL_OVER_PRJQUOTA = 1 << 3, +}; + struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, - struct proc_dir_entry *); + struct proc_dir_entry *, bool is_md); int qsd_prepare(const struct lu_env *, struct qsd_instance *); int qsd_start(const struct lu_env *, struct qsd_instance *); void qsd_fini(const struct lu_env *, struct qsd_instance *); int qsd_op_begin(const struct lu_env *, struct qsd_instance *, - struct lquota_trans *, struct lquota_id_info *, int *); + struct lquota_trans *, struct lquota_id_info *, + enum osd_quota_local_flags *); void qsd_op_end(const struct lu_env *, struct qsd_instance *, struct lquota_trans *); void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, @@ -212,13 +221,13 @@ struct lquota_id_info { bool lqi_is_blk; }; -/* Since we enforce only inode quota in meta pool (MDTs), and block quota in - * data pool (OSTs), there are at most 4 quota ids being enforced in a single - * transaction, which is chown transaction: +/* With the DoM, both inode quota in meta pool and block quota in data pool + * will be enforced at MDT, there are at most 4 quota ids being enforced in + * a single transaction for inode and block quota, which is chown transaction: * original uid and gid, new uid and gid. * * This value might need to be revised when directory quota is added. */ -#define QUOTA_MAX_TRANSIDS 4 +#define QUOTA_MAX_TRANSIDS 8 /* all qids involved in a single transaction */ struct lquota_trans { @@ -226,12 +235,6 @@ struct lquota_trans { struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; }; -/* flags for quota local enforcement */ -#define QUOTA_FL_OVER_USRQUOTA 0x01 -#define QUOTA_FL_OVER_GRPQUOTA 0x02 -#define QUOTA_FL_SYNC 0x04 -#define QUOTA_FL_OVER_PRJQUOTA 0x08 - #define IS_LQUOTA_RES(res) \ (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h index 46e6fa862f48e..7b6c03b195624 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -59,7 +59,7 @@ enum req_location { }; /* Maximal number of fields (buffers) in a request message. 
*/ -#define REQ_MAX_FIELD_NR 10 +#define REQ_MAX_FIELD_NR 11 struct req_capsule { struct ptlrpc_request *rc_req; @@ -128,6 +128,7 @@ int req_capsule_server_grow(struct req_capsule *pill, __u32 newlen); int req_layout_init(void); void req_layout_fini(void); +int req_check_sepol(struct req_capsule *pill); extern struct req_format RQF_OBD_PING; extern struct req_format RQF_OBD_SET_INFO; @@ -145,6 +146,7 @@ extern struct req_format RQF_FLD_READ; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_STATFS_NEW; extern struct req_format RQF_MDS_GET_ROOT; extern struct req_format RQF_MDS_SYNC; extern struct req_format RQF_MDS_GETXATTR; @@ -156,7 +158,7 @@ extern struct req_format RQF_OUT_UPDATE; */ extern struct req_format RQF_MDS_GETATTR_NAME; extern struct req_format RQF_MDS_CLOSE; -extern struct req_format RQF_MDS_INTENT_CLOSE; +extern struct req_format RQF_MDS_CLOSE_INTENT; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_GET_INFO; @@ -176,6 +178,8 @@ extern struct req_format RQF_MDS_QUOTACTL; extern struct req_format RQF_QUOTA_DQACQ; extern struct req_format RQF_MDS_SWAP_LAYOUTS; extern struct req_format RQF_MDS_REINT_MIGRATE; +extern struct req_format RQF_MDS_REINT_RESYNC; +extern struct req_format RQF_MDS_RMFID; /* MDS hsm formats */ extern struct req_format RQF_MDS_HSM_STATE_GET; extern struct req_format RQF_MDS_HSM_STATE_SET; @@ -215,7 +219,6 @@ extern struct req_format RQF_LDLM_INTENT_LAYOUT; extern struct req_format RQF_LDLM_INTENT_GETATTR; extern struct req_format RQF_LDLM_INTENT_OPEN; extern struct req_format RQF_LDLM_INTENT_CREATE; -extern struct req_format RQF_LDLM_INTENT_UNLINK; extern struct req_format RQF_LDLM_INTENT_GETXATTR; extern struct req_format RQF_LDLM_INTENT_QUOTA; extern struct req_format RQF_LDLM_CANCEL; @@ -223,15 +226,12 @@ extern struct req_format RQF_LDLM_CALLBACK; extern struct req_format RQF_LDLM_CP_CALLBACK; extern struct req_format RQF_LDLM_BL_CALLBACK; extern struct req_format RQF_LDLM_GL_CALLBACK; -extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK_DESC; /* LOG req_format */ -extern struct req_format RQF_LOG_CANCEL; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; -extern struct req_format RQF_LLOG_ORIGIN_CONNECT; extern struct req_format RQF_CONNECT; @@ -257,6 +257,7 @@ extern struct req_msg_field RMF_IDX_INFO; extern struct req_msg_field RMF_CLOSE_DATA; extern struct req_msg_field RMF_FILE_SECCTX_NAME; extern struct req_msg_field RMF_FILE_SECCTX; +extern struct req_msg_field RMF_FID_ARRAY; /* * connection handle received in MDS_CONNECT request. 
@@ -291,6 +292,7 @@ extern struct req_msg_field RMF_HSM_USER_STATE; extern struct req_msg_field RMF_HSM_STATE_SET; extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_SELINUX_POL; /* seq-mgr fields */ extern struct req_msg_field RMF_SEQ_OPC; @@ -313,10 +315,12 @@ extern struct req_msg_field RMF_OBD_IOOBJ; extern struct req_msg_field RMF_OBD_ID; extern struct req_msg_field RMF_FID; extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_NIOBUF_INLINE; extern struct req_msg_field RMF_RCS; extern struct req_msg_field RMF_FIEMAP_KEY; extern struct req_msg_field RMF_FIEMAP_VAL; extern struct req_msg_field RMF_OST_ID; +extern struct req_msg_field RMF_SHORT_IO; /* MGS config read message format */ extern struct req_msg_field RMF_MGS_CONFIG_BODY; diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h new file mode 100644 index 0000000000000..3eba040fac690 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/lustre_scrub.h @@ -0,0 +1,375 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/include/lustre_scrub.h + * + * Shared definitions and declarations for Lustre OI scrub. 
+ * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_SCRUB_H +# define _LUSTRE_SCRUB_H + +#include +#include + +#define OSD_OI_FID_OID_BITS_MAX 10 +#define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX) +#define SCRUB_OI_BITMAP_SIZE (OSD_OI_FID_NR_MAX >> 3) +#define PFID_STRIPE_IDX_BITS 16 +#define PFID_STRIPE_COUNT_MASK ((1 << PFID_STRIPE_IDX_BITS) - 1) + +#define SCRUB_MAGIC_V1 0x4C5FD252 +#define SCRUB_CHECKPOINT_INTERVAL 60 +#define SCRUB_WINDOW_SIZE 1024 + +enum scrub_next_status { + /* exit current loop and process next group */ + SCRUB_NEXT_BREAK = 1, + + /* skip current object and process next bit */ + SCRUB_NEXT_CONTINUE = 2, + + /* exit all the loops */ + SCRUB_NEXT_EXIT = 3, + + /* wait for free cache slot */ + SCRUB_NEXT_WAIT = 4, + + /* simulate system crash during OI scrub */ + SCRUB_NEXT_CRASH = 5, + + /* simulate failure during OI scrub */ + SCRUB_NEXT_FATAL = 6, + + /* new created object, no scrub on it */ + SCRUB_NEXT_NOSCRUB = 7, + + /* the object has no FID-in-LMA */ + SCRUB_NEXT_NOLMA = 8, + + /* for OST-object */ + SCRUB_NEXT_OSTOBJ = 9, + + /* old OST-object, no LMA or no FID-on-OST flags in LMA */ + SCRUB_NEXT_OSTOBJ_OLD = 10, +}; + +enum scrub_local_file_flags { + SLFF_SCAN_SUBITEMS = 0x0001, + SLFF_HIDE_FID = 0x0002, + SLFF_SHOW_NAME = 0x0004, + SLFF_NO_OI = 0x0008, + SLFF_IDX_IN_FID = 0x0010, +}; + +enum scrub_status { + /* The scrub file is new created, for new MDT, upgrading from old disk, + * or re-creating the scrub file manually. */ + SS_INIT = 0, + + /* The scrub is checking/repairing the OI files. */ + SS_SCANNING = 1, + + /* The scrub checked/repaired the OI files successfully. */ + SS_COMPLETED = 2, + + /* The scrub failed to check/repair the OI files. */ + SS_FAILED = 3, + + /* The scrub is stopped manually, the OI files may be inconsistent. */ + SS_STOPPED = 4, + + /* The scrub is paused automatically when umount. */ + SS_PAUSED = 5, + + /* The scrub crashed during the scanning, should be restarted. */ + SS_CRASHED = 6, +}; + +enum scrub_flags { + /* OI files have been recreated, OI mappings should be re-inserted. */ + SF_RECREATED = 0x0000000000000001ULL, + + /* OI files are invalid, should be rebuild ASAP */ + SF_INCONSISTENT = 0x0000000000000002ULL, + + /* OI scrub is triggered automatically. */ + SF_AUTO = 0x0000000000000004ULL, + + /* The device is upgraded from 1.8 format. */ + SF_UPGRADE = 0x0000000000000008ULL, +}; + +enum scrub_param { + /* Exit when fail. */ + SP_FAILOUT = 0x0001, + + /* Check only without repairing. */ + SP_DRYRUN = 0x0002, +}; + +enum scrub_start { + /* Set failout flag. */ + SS_SET_FAILOUT = 0x00000001, + + /* Clear failout flag. */ + SS_CLEAR_FAILOUT = 0x00000002, + + /* Reset scrub start position. */ + SS_RESET = 0x00000004, + + /* Trigger full scrub automatically. */ + SS_AUTO_FULL = 0x00000008, + + /* Trigger partial scrub automatically. */ + SS_AUTO_PARTIAL = 0x00000010, + + /* Set dryrun flag. */ + SS_SET_DRYRUN = 0x00000020, + + /* Clear dryrun flag. */ + SS_CLEAR_DRYRUN = 0x00000040, +}; + +enum osd_lf_flags { + OLF_SCAN_SUBITEMS = 0x0001, + OLF_HIDE_FID = 0x0002, + OLF_SHOW_NAME = 0x0004, + OLF_NO_OI = 0x0008, + OLF_IDX_IN_FID = 0x0010, + OLF_NOT_BACKUP = 0x0020, +}; + +/* There are some overhead to detect OI inconsistency automatically + * during normal RPC handling. We do not want to always auto detect + * OI inconsistency especailly when OI scrub just done recently. + * + * The 'auto_scrub' defines the time (united as second) interval to + * enable auto detect OI inconsistency since last OI scurb done. 
*/ +enum auto_scrub { + /* Disable auto scrub. */ + AS_NEVER = 0, + + /* 1 second is too short interval, it is almost equal to always auto + * detect inconsistent OI, usually used for test. */ + AS_ALWAYS = 1, + + /* Enable auto detect OI inconsistency one month (60 * 60 * 24 * 30) + * after last OI scrub. */ + AS_DEFAULT = 2592000LL, +}; + +struct scrub_file { + /* 128-bit uuid for volume. */ + __u8 sf_uuid[16]; + + /* See 'enum scrub_flags'. */ + __u64 sf_flags; + + /* The scrub magic. */ + __u32 sf_magic; + + /* See 'enum scrub_status'. */ + __u16 sf_status; + + /* See 'enum scrub_param'. */ + __u16 sf_param; + + /* The time for the last OI scrub completed. */ + time64_t sf_time_last_complete; + + /* The ttime for the latest OI scrub ran. */ + time64_t sf_time_latest_start; + + /* The time for the last OI scrub checkpoint. */ + time64_t sf_time_last_checkpoint; + + /* The position for the latest OI scrub started from. */ + __u64 sf_pos_latest_start; + + /* The position for the last OI scrub checkpoint. */ + __u64 sf_pos_last_checkpoint; + + /* The position for the first should be updated object. */ + __u64 sf_pos_first_inconsistent; + + /* How many objects have been checked. */ + __u64 sf_items_checked; + + /* How many objects have been updated. */ + __u64 sf_items_updated; + + /* How many objects failed to be processed. */ + __u64 sf_items_failed; + + /* How many prior objects have been updated during scanning. */ + __u64 sf_items_updated_prior; + + /* How many objects marked as LDISKFS_STATE_LUSTRE_NOSCRUB. */ + __u64 sf_items_noscrub; + + /* How many IGIF objects. */ + __u64 sf_items_igif; + + /* How long the OI scrub has run in seconds. Do NOT change + * to time64_t since this breaks backwards compatibility. + * It shouldn't take more than 136 years to complete :-) + */ + time_t sf_run_time; + + /* How many completed OI scrub ran on the device. */ + __u32 sf_success_count; + + /* How many OI files. */ + __u16 sf_oi_count; + + /* Keep the flags after scrub reset. See 'enum scrub_internal_flags' */ + __u16 sf_internal_flags; + + __u32 sf_reserved_1; + __u64 sf_reserved_2[16]; + + /* Bitmap for OI files recreated case. */ + __u8 sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE]; +}; + +struct lustre_scrub { + /* Object for the scrub file. */ + struct dt_object *os_obj; + + struct ptlrpc_thread os_thread; + struct list_head os_inconsistent_items; + + /* write lock for scrub prep/update/post/checkpoint, + * read lock for scrub dump. */ + struct rw_semaphore os_rwsem; + spinlock_t os_lock; + + /* Scrub file in memory. */ + struct scrub_file os_file; + + /* Buffer for scrub file load/store. */ + struct scrub_file os_file_disk; + + const char *os_name; + + /* The time for last checkpoint, seconds */ + time64_t os_time_last_checkpoint; + + /* The time for next checkpoint, seconds */ + time64_t os_time_next_checkpoint; + + /* How many objects have been checked since last checkpoint. */ + __u64 os_new_checked; + __u64 os_pos_current; + __u32 os_start_flags; + unsigned int os_in_prior:1, /* process inconsistent item + * found by RPC prior */ + os_waiting:1, /* Waiting for scan window. */ + os_full_speed:1, /* run w/o speed limit */ + os_paused:1, /* The scrub is paused. 
*/ + os_convert_igif:1, + os_partial_scan:1, + os_in_join:1, + os_full_scrub:1; +}; + +#define INDEX_BACKUP_MAGIC_V1 0x1E41F208 +#define INDEX_BACKUP_BUFSIZE (4096 * 4) + +enum lustre_index_backup_policy { + /* By default, do not backup the index */ + LIBP_NONE = 0, + + /* Backup the dirty index objects when umount */ + LIBP_AUTO = 1, +}; + +struct lustre_index_backup_header { + __u32 libh_magic; + __u32 libh_count; + __u32 libh_keysize; + __u32 libh_recsize; + struct lu_fid libh_owner; + __u64 libh_pad[60]; /* keep header 512 bytes aligned */ +}; + +struct lustre_index_backup_unit { + struct list_head libu_link; + struct lu_fid libu_fid; + __u32 libu_keysize; + __u32 libu_recsize; +}; + +struct lustre_index_restore_unit { + struct list_head liru_link; + struct lu_fid liru_pfid; + struct lu_fid liru_cfid; + __u64 liru_clid; + int liru_len; + char liru_name[0]; +}; + +void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid); +void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags); +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub); +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags); +void scrub_stop(struct lustre_scrub *scrub); +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize); + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup); +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize); + +static inline void lustre_fid2lbx(char *buf, const struct lu_fid *fid, int len) +{ + snprintf(buf, len, DFID_NOBRACE".lbx", PFID(fid)); +} + +static inline const char *osd_scrub2name(struct lustre_scrub *scrub) +{ + return scrub->os_name; +} +#endif /* _LUSTRE_SCRUB_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h index 7e6f490854911..6a69d01150aa1 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_sec.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_sec.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -549,7 +549,7 @@ struct ptlrpc_cli_ctx { atomic_t cc_refcount; struct ptlrpc_sec *cc_sec; struct ptlrpc_ctx_ops *cc_ops; - cfs_time_t cc_expire; /* in seconds */ + time64_t cc_expire; /* in seconds */ unsigned int cc_early_expire:1; unsigned long cc_flags; struct vfs_cred cc_vcred; @@ -869,6 +869,17 @@ struct ptlrpc_sec { /** owning import */ struct obd_import *ps_import; spinlock_t ps_lock; + /** mtime of SELinux policy file */ + ktime_t ps_sepol_mtime; + /** next check time of SELinux policy file */ + ktime_t ps_sepol_checknext; + /** + * SELinux policy info + * sepol string format is: + * ::: + */ + char ps_sepol[LUSTRE_NODEMAP_SEPOL_LENGTH + + 1]; /* * garbage collection @@ -1092,6 +1103,7 @@ int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); void sptlrpc_request_out_callback(struct ptlrpc_request *req); +int sptlrpc_get_sepol(struct ptlrpc_request *req); /* * exported higher interface of import & request @@ -1109,6 +1121,7 @@ void sptlrpc_import_flush_all_ctx(struct obd_import *imp); int sptlrpc_req_get_ctx(struct ptlrpc_request *req); void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_export_update_ctx(struct obd_export *exp); int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); @@ -1193,10 +1206,6 @@ int sptlrpc_current_user_desc_size(void); int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); - -#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) -#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) - /** @} sptlrpc */ #endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h index 8f8b375e64c25..96dcd493f5f33 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_swab.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_swab.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * Copyright 2015 Cray Inc, all rights reserved. * Author: Ben Evans. 
@@ -48,10 +48,11 @@ #ifndef _LUSTRE_SWAB_H_ #define _LUSTRE_SWAB_H_ -#include +#include void lustre_swab_orphan_ent(struct lu_orphan_ent *ent); void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent); void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); void lustre_swab_connect(struct obd_connect_data *ocd); void lustre_swab_hsm_user_state(struct hsm_user_state *hus); @@ -92,11 +93,13 @@ void lustre_swab_obdo(struct obdo *o); void lustre_swab_ost_body(struct ost_body *b); void lustre_swab_ost_last_id(__u64 *id); void lustre_swab_fiemap(struct fiemap *fiemap); +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info); void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum); void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, int stripe_count); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size); void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); void lustre_swab_idx_info(struct idx_info *ii); void lustre_swab_lip_header(struct lu_idxpage *lip); @@ -118,6 +121,7 @@ void lustre_swab_object_update_result(struct object_update_result *our); void lustre_swab_object_update_reply(struct object_update_reply *our); void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); void lustre_swab_close_data(struct close_data *data); +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); void lustre_swab_lmv_user_md(struct lmv_user_md *lum); void lustre_swab_ladvise(struct lu_ladvise *ladvise); void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_update.h b/drivers/staging/lustrefsx/lustre/include/lustre_update.h index 968cc51028d86..78cd3d4bfdd51 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_update.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_update.h @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * lustre/include/lustre_update.h @@ -454,6 +454,9 @@ int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_size, const struct lu_fid *fid, const char *name, const int bufsize); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize); int out_read_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_length, const struct lu_fid *fid, size_t size, loff_t pos); diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h index d64d243ff8988..a5f994e36d50b 100644 --- a/drivers/staging/lustrefsx/lustre/include/md_object.h +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,6 +74,7 @@ enum ma_valid { MA_HSM = 1 << 6, MA_PFID = 1 << 7, MA_LMV_DEF = 1 << 8, + MA_SOM = 1 << 9, }; typedef enum { @@ -108,34 +109,47 @@ struct md_hsm { __u64 mh_arch_ver; }; + +/* memory structure for SOM attributes + * for fields description see the on disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som { + __u16 ms_valid; + __u64 ms_size; + __u64 ms_blocks; +}; + struct md_attr { - __u64 ma_valid; - __u64 ma_need; - __u64 ma_attr_flags; - struct lu_attr ma_attr; - struct lu_fid ma_pfid; - struct md_hsm ma_hsm; - struct lov_mds_md *ma_lmm; - union lmv_mds_md *ma_lmv; - void *ma_acl; - int ma_lmm_size; - int ma_lmv_size; - int ma_acl_size; + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct md_som ma_som; + struct lov_mds_md *ma_lmm; + union lmv_mds_md *ma_lmv; + void *ma_acl; + int ma_lmm_size; + int ma_lmv_size; + int ma_acl_size; + int ma_enable_chprojid_gid; }; /** Additional parameters for create */ struct md_op_spec { - union { - /** symlink target */ - const char *sp_symname; - /** eadata for regular files */ - struct md_spec_reg { - const void *eadata; - int eadatalen; - } sp_ea; - } u; - - /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ + union { + /** symlink target */ + const char *sp_symname; + /** eadata for regular files */ + struct md_spec_reg { + void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Open flags from client: such as MDS_OPEN_CREAT, and others. */ __u64 sp_cr_flags; /* File security context for creates. */ @@ -150,10 +164,30 @@ struct md_op_spec { sp_permitted:1, /* do not check permission */ sp_migrate_close:1; /* close the file during migrate */ /** Current lock mode for parent dir where create is performing. */ - mdl_mode_t sp_cr_mode; + mdl_mode_t sp_cr_mode; + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +enum md_layout_opc { + MD_LAYOUT_NOP = 0, + MD_LAYOUT_WRITE, /* FLR: write the file */ + MD_LAYOUT_RESYNC, /* FLR: resync starts */ + MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */ +}; - /** to create directory */ - const struct dt_index_features *sp_feat; +/** + * Parameters for layout change API. + */ +struct md_layout_change { + enum md_layout_opc mlc_opc; + __u16 mlc_mirror_id; + struct layout_intent *mlc_intent; + struct lu_buf mlc_buf; + struct lustre_som_attrs mlc_som; + size_t mlc_resync_count; + __u32 *mlc_resync_ids; }; union ldlm_policy_data; @@ -161,51 +195,53 @@ union ldlm_policy_data; * Operations implemented for each md object (both directory and leaf). 
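To make the intent of the new enum md_layout_opc above a bit more concrete, the following stand-alone sketch walks the typical FLR sequence those opcodes describe (write, then resync, then resync-done). The enum values are copied from the md_object.h hunk above; the name table and main() are illustrative only and are not part of the patch.

#include <stdio.h>

/* Copied from the md_object.h hunk above. */
enum md_layout_opc {
	MD_LAYOUT_NOP	= 0,
	MD_LAYOUT_WRITE,	/* FLR: write the file */
	MD_LAYOUT_RESYNC,	/* FLR: resync starts */
	MD_LAYOUT_RESYNC_DONE,	/* FLR: resync done */
};

static const char *opc_name(enum md_layout_opc opc)
{
	switch (opc) {
	case MD_LAYOUT_NOP:		return "nop";
	case MD_LAYOUT_WRITE:		return "write";
	case MD_LAYOUT_RESYNC:		return "resync";
	case MD_LAYOUT_RESYNC_DONE:	return "resync-done";
	}
	return "unknown";
}

int main(void)
{
	/* Illustrative sequence a caller could drive via
	 * md_layout_change.mlc_opc */
	enum md_layout_opc seq[] = {
		MD_LAYOUT_WRITE, MD_LAYOUT_RESYNC, MD_LAYOUT_RESYNC_DONE,
	};
	unsigned int i;

	for (i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
		printf("step %u: %s\n", i, opc_name(seq[i]));
	return 0;
}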
*/ struct md_object_operations { - int (*moo_permission)(const struct lu_env *env, - struct md_object *pobj, struct md_object *cobj, - struct md_attr *attr, int mask); + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); - int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, - struct md_attr *attr); + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); - int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, - const struct md_attr *attr); + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); - int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf, const char *name); + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); - int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf); + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); - int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, - const struct lu_buf *buf, const char *name, - int fl); + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); - int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, - const char *name); + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); /** This method is used to swap the layouts between 2 objects */ int (*moo_swap_layouts)(const struct lu_env *env, struct md_object *obj1, struct md_object *obj2, __u64 flags); - /** \retval number of bytes actually read upon success */ - int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, - const struct lu_rdpg *rdpg); + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); - int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, - struct lu_buf *buf); int (*moo_changelog)(const struct lu_env *env, - enum changelog_rec_type type, int flags, + enum changelog_rec_type type, + enum changelog_rec_flags clf_flags, struct md_device *m, const struct lu_fid *fid); - int (*moo_open)(const struct lu_env *env, - struct md_object *obj, int flag); + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, u64 open_flags); - int (*moo_close)(const struct lu_env *env, struct md_object *obj, - struct md_attr *ma, int mode); + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, u64 open_flags); - int (*moo_object_sync)(const struct lu_env *, struct md_object *); + int (*moo_object_sync)(const struct lu_env *, struct md_object *); int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, struct lustre_handle *lh, @@ -222,55 +258,62 @@ struct md_object_operations { * * The caller should have held layout lock. * + * This API can be extended to support every other layout changing + * operations, such as component {add,del,change}, layout swap, + * layout merge, etc. One of the benefits by doing this is that the MDT + * no longer needs to understand layout. 
+ * + * However, layout creation, removal, and fetch should still use + * xattr_{get,set}() because they don't interpret layout on the + * MDT layer. + * * \param[in] env execution environment * \param[in] obj MD object * \param[in] layout data structure to describe the changes to * the MD object's layout - * \param[in] buf buffer containing the client's lovea * * \retval 0 success * \retval -ne error code */ int (*moo_layout_change)(const struct lu_env *env, struct md_object *obj, - struct layout_intent *layout, - const struct lu_buf *buf); + struct md_layout_change *layout); }; /** * Operations implemented for each directory object. */ struct md_dir_operations { - int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj, - const struct lu_fid *fid, struct lu_fid *sfid); + int (*mdo_is_subdir)(const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid); - int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, - const struct lu_name *lname, struct lu_fid *fid, - struct md_op_spec *spec); + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); - mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, - struct md_object *obj, - mdl_mode_t mode); + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); - int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, - const struct lu_name *lname, struct md_object *child, - struct md_op_spec *spec, - struct md_attr *ma); + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); - /** This method is used for creating data object for this meta object*/ - int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, - struct md_object *o, - const struct md_op_spec *spec, - struct md_attr *ma); + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); - int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, - struct md_object *tpobj, const struct lu_fid *lf, - const struct lu_name *lsname, struct md_object *tobj, - const struct lu_name *ltname, struct md_attr *ma); + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); - int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, - struct md_object *src_obj, const struct lu_name *lname, - struct md_attr *ma); + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, struct md_object *cobj, const struct lu_name *lname, @@ -278,7 +321,8 @@ struct md_dir_operations { int (*mdo_migrate)(const struct lu_env *env, struct md_object *pobj, struct md_object *sobj, const struct lu_name *lname, - struct md_object *tobj, struct md_attr *ma); + struct md_object *tobj, struct md_op_spec *spec, + struct md_attr *ma); }; struct md_device_operations { @@ -286,8 +330,8 @@ struct md_device_operations { int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, struct lu_fid *f); - int 
(*mdo_maxeasize_get)(const struct lu_env *env, struct md_device *m, - int *easize); + const struct dt_device_param *(*mdo_dtconf_get)(const struct lu_env *e, + struct md_device *m); int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, struct obd_statfs *sfs); @@ -346,22 +390,19 @@ static inline struct md_object *md_object_find_slice(const struct lu_env *env, /** md operations */ -static inline int mo_permission(const struct lu_env *env, - struct md_object *p, - struct md_object *c, - struct md_attr *at, - int mask) +static inline int mo_permission(const struct lu_env *env, struct md_object *p, + struct md_object *c, struct md_attr *at, + int mask) { - LASSERT(c->mo_ops->moo_permission); - return c->mo_ops->moo_permission(env, p, c, at, mask); + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, mask); } -static inline int mo_attr_get(const struct lu_env *env, - struct md_object *m, - struct md_attr *at) +static inline int mo_attr_get(const struct lu_env *env, struct md_object *m, + struct md_attr *at) { - LASSERT(m->mo_ops->moo_attr_get); - return m->mo_ops->moo_attr_get(env, m, at); + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); } static inline int mo_readlink(const struct lu_env *env, @@ -374,8 +415,8 @@ static inline int mo_readlink(const struct lu_env *env, static inline int mo_changelog(const struct lu_env *env, enum changelog_rec_type type, - int flags, struct md_device *m, - const struct lu_fid *fid) + enum changelog_rec_flags clf_flags, + struct md_device *m, const struct lu_fid *fid) { struct lu_fid rootfid; struct md_object *root; @@ -390,7 +431,7 @@ static inline int mo_changelog(const struct lu_env *env, RETURN(PTR_ERR(root)); LASSERT(root->mo_ops->moo_changelog); - rc = root->mo_ops->moo_changelog(env, type, flags, m, fid); + rc = root->mo_ops->moo_changelog(env, type, clf_flags, m, fid); lu_object_put(env, &root->mo_lu); @@ -448,12 +489,11 @@ static inline int mo_invalidate(const struct lu_env *env, struct md_object *m) static inline int mo_layout_change(const struct lu_env *env, struct md_object *m, - struct layout_intent *layout, - const struct lu_buf *buf) + struct md_layout_change *layout) { /* need instantiate objects which in the access range */ LASSERT(m->mo_ops->moo_layout_change); - return m->mo_ops->moo_layout_change(env, m, layout, buf); + return m->mo_ops->moo_layout_change(env, m, layout); } static inline int mo_swap_layouts(const struct lu_env *env, @@ -467,21 +507,18 @@ static inline int mo_swap_layouts(const struct lu_env *env, return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); } -static inline int mo_open(const struct lu_env *env, - struct md_object *m, - int flags) +static inline int mo_open(const struct lu_env *env, struct md_object *m, + u64 open_flags) { - LASSERT(m->mo_ops->moo_open); - return m->mo_ops->moo_open(env, m, flags); + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, open_flags); } -static inline int mo_close(const struct lu_env *env, - struct md_object *m, - struct md_attr *ma, - int mode) +static inline int mo_close(const struct lu_env *env, struct md_object *m, + struct md_attr *ma, u64 open_flags) { - LASSERT(m->mo_ops->moo_close); - return m->mo_ops->moo_close(env, m, ma, mode); + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, open_flags); } static inline int mo_readpage(const struct lu_env *env, @@ -576,19 +613,20 @@ static inline int mdo_migrate(const struct lu_env *env, struct md_object *sobj, const 
struct lu_name *lname, struct md_object *tobj, + struct md_op_spec *spec, struct md_attr *ma) { LASSERT(pobj->mo_dir_ops->mdo_migrate); - return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, ma); + return pobj->mo_dir_ops->mdo_migrate(env, pobj, sobj, lname, tobj, spec, + ma); } static inline int mdo_is_subdir(const struct lu_env *env, - struct md_object *mo, - const struct lu_fid *fid, - struct lu_fid *sfid) + struct md_object *mo, + const struct lu_fid *fid) { - LASSERT(mo->mo_dir_ops->mdo_is_subdir); - return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid); + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid); } static inline int mdo_link(const struct lu_env *env, @@ -611,6 +649,14 @@ static inline int mdo_unlink(const struct lu_env *env, return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); } +static inline int mdo_statfs(const struct lu_env *env, + struct md_device *m, + struct obd_statfs *sfs) +{ + LASSERT(m->md_ops->mdo_statfs); + return m->md_ops->mdo_statfs(env, m, sfs); +} + /** * Used in MDD/OUT layer for object lock rule **/ @@ -624,6 +670,7 @@ enum mdd_object_role { struct dt_device; +void lustre_som_swab(struct lustre_som_attrs *attrs); int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); void lustre_hsm2buf(void *buf, const struct md_hsm *mh); @@ -650,6 +697,8 @@ struct lu_ucred { struct group_info *uc_ginfo; struct md_identity *uc_identity; char uc_jobid[LUSTRE_JOBID_SIZE]; + lnet_nid_t uc_nid; + bool uc_enable_audit; }; struct lu_ucred *lu_ucred(const struct lu_env *env); diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h index 9d49ce5a2a17a..b80a98332d6d5 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd.h +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,9 +33,11 @@ #ifndef __OBD_H #define __OBD_H +#include #include +#include -#include +#include #include #include #ifdef HAVE_SERVER_SUPPORT @@ -100,11 +102,15 @@ struct obd_type { struct md_ops *typ_md_ops; struct proc_dir_entry *typ_procroot; struct proc_dir_entry *typ_procsym; - __u32 typ_sym_filter; + struct dentry *typ_debugfs_entry; +#ifdef HAVE_SERVER_SUPPORT + bool typ_sym_filter; +#endif char *typ_name; int typ_refcnt; struct lu_device_type *typ_lu; spinlock_t obd_type_lock; + struct kobject *typ_kobj; }; struct brw_page { @@ -116,7 +122,7 @@ struct brw_page { struct timeout_item { enum timeout_event ti_event; - cfs_time_t ti_timeout; + time64_t ti_timeout; timeout_cb_t ti_cb; void *ti_cb_data; struct list_head ti_obd_list; @@ -126,16 +132,15 @@ struct timeout_item { #define OBD_MAX_RIF_DEFAULT 8 #define OBD_MAX_RIF_MAX 512 #define OSC_MAX_RIF_MAX 256 -#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_DEFAULT 2000 /* Arbitrary large value */ #define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ #define OSC_DEFAULT_RESENDS 10 -/* possible values for fo_sync_lock_cancel */ -enum { - NEVER_SYNC_ON_CANCEL = 0, - BLOCKING_SYNC_ON_CANCEL = 1, - ALWAYS_SYNC_ON_CANCEL = 2, - NUM_SYNC_ON_CANCEL_STATES +/* possible values for lut_sync_lock_cancel */ +enum tgt_sync_lock_cancel { + SYNC_LOCK_CANCEL_NEVER = 0, + SYNC_LOCK_CANCEL_BLOCKING = 1, + SYNC_LOCK_CANCEL_ALWAYS = 2, }; /* @@ -181,6 +186,17 @@ struct client_obd { * run-time if a larger observed size is advertised by the MDT. */ __u32 cl_max_mds_easize; + /* Data-on-MDT specific value to set larger reply buffer for possible + * data read along with open/stat requests. By default it tries to use + * unused space in reply buffer. + * This value is used to ensure that reply buffer has at least as + * much free space as value indicates. That free space is gained from + * LOV EA buffer which is small for DoM files and on big systems can + * provide up to 32KB of extra space in reply buffer. + * Default value is 8K now. + */ + __u32 cl_dom_min_inline_repsize; + enum lustre_sec_part cl_sp_me; enum lustre_sec_part cl_sp_to; struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ @@ -188,7 +204,6 @@ struct client_obd { /* the grant values are protected by loi_list_lock below */ unsigned long cl_dirty_pages; /* all _dirty_ in pages */ unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ - unsigned long cl_dirty_transit; /* dirty synchronous */ unsigned long cl_avail_grant; /* bytes of credit for ost */ unsigned long cl_lost_grant; /* lost credits (trunc) */ /* grant consumed for dirty pages */ @@ -199,10 +214,10 @@ struct client_obd { * grant before trying to dirty a page and unreserve the rest. * See osc_{reserve|unreserve}_grant for details. */ long cl_reserved_grant; - struct list_head cl_cache_waiters; /* waiting for cache/grant */ - cfs_time_t cl_next_shrink_grant; /* jiffies */ - struct list_head cl_grant_shrink_list; /* Timeout event list */ - int cl_grant_shrink_interval; /* seconds */ + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ + time64_t cl_next_shrink_grant; /* seconds */ + struct list_head cl_grant_chain; + time64_t cl_grant_shrink_interval; /* seconds */ /* A chunk is an optimal size used by osc_extent to determine * the extent size. 
A chunk is max(PAGE_SIZE, OST block size) */ @@ -240,8 +255,9 @@ struct client_obd { /* just a sum of the loi/lop pending numbers to be exported by /proc */ atomic_t cl_pending_w_pages; atomic_t cl_pending_r_pages; - __u32 cl_max_pages_per_rpc; - __u32 cl_max_rpcs_in_flight; + u32 cl_max_pages_per_rpc; + u32 cl_max_rpcs_in_flight; + u32 cl_max_short_io_bytes; struct obd_histogram cl_read_rpc_hist; struct obd_histogram cl_write_rpc_hist; struct obd_histogram cl_read_page_hist; @@ -288,8 +304,6 @@ struct client_obd { atomic_t cl_destroy_in_flight; wait_queue_head_t cl_destroy_waitq; - struct mdc_rpc_lock *cl_rpc_lock; - /* modify rpcs in flight * currently used for metadata only */ spinlock_t cl_mod_rpcs_lock; @@ -304,8 +318,11 @@ struct client_obd { struct mutex cl_mgc_mutex; struct local_oid_storage *cl_mgc_los; struct dt_object *cl_mgc_configs_dir; - atomic_t cl_mgc_refcount; struct obd_export *cl_mgc_mgsexp; + atomic_t cl_mgc_refcount; + /* in-flight control list and total RPCs counter */ + struct list_head cl_flight_waiters; + __u32 cl_rpcs_in_flight; /* checksumming for data sent over the network */ unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */ @@ -313,7 +330,7 @@ struct client_obd { /* supported checksum types that are worked out at connect time */ __u32 cl_supp_cksum_types; /* checksum algorithm to be used */ - cksum_type_t cl_cksum_type; + enum cksum_types cl_cksum_type; /* also protected by the poorly named _loi_list_lock lock above */ struct osc_async_rc cl_ar; @@ -327,8 +344,11 @@ struct client_obd { /* ptlrpc work for writeback in ptlrpcd context */ void *cl_writeback_work; void *cl_lru_work; + struct mutex cl_quota_mutex; /* hash tables for osc_quota_info */ struct cfs_hash *cl_quota_hash[LL_MAXQUOTAS]; + /* the xid of the request updating the hash tables */ + __u64 cl_quota_last_xid; /* Links to the global list of registered changelog devices */ struct list_head cl_chg_dev_linkage; }; @@ -358,6 +378,10 @@ struct ost_pool { /* allow statfs data caching for 1 second */ #define OBD_STATFS_CACHE_SECONDS 1 +/* arbitrary maximum. 
larger would be useless, allows catching bogus input */ +#define OBD_STATFS_CACHE_MAX_AGE 3600 /* seconds */ +/* By default, don't do time based negative cache invalidation */ +#define OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS (-1) /* seconds */ struct lov_tgt_desc { struct list_head ltd_kill; @@ -371,6 +395,11 @@ struct lov_tgt_desc { ltd_reap:1; /* should this target be deleted */ }; +struct lov_md_tgt_desc { + struct obd_device *lmtd_mdc; + __u32 lmtd_index; +}; + struct lov_obd { struct lov_desc desc; struct lov_tgt_desc **lov_tgts; /* sparse array */ @@ -393,10 +422,15 @@ struct lov_obd { struct cl_client_cache *lov_cache; struct rw_semaphore lov_notify_lock; + /* Data-on-MDT: MDC array */ + struct lov_md_tgt_desc *lov_mdc_tgts; + + struct kobject *lov_tgts_kobj; }; struct lmv_tgt_desc { struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; struct obd_export *ltd_exp; __u32 ltd_idx; struct mutex ltd_fid_mutex; @@ -407,19 +441,23 @@ struct lmv_obd { struct lu_client_fld lmv_fld; spinlock_t lmv_lock; struct lmv_desc desc; - struct proc_dir_entry *targets_proc_entry; struct mutex lmv_init_mutex; int connected; int max_easize; int max_def_easize; + u32 lmv_statfs_start; - __u32 tgts_size; /* size of tgts array */ + u32 tgts_size; /* size of tgts array */ struct lmv_tgt_desc **tgts; struct obd_connect_data conn_data; + struct kobject *lmv_tgts_kobj; }; +/* Minimum sector size is 512 */ +#define MAX_GUARD_NUMBER (PAGE_SIZE / 512) + struct niobuf_local { __u64 lnb_file_offset; __u32 lnb_page_offset; @@ -428,6 +466,11 @@ struct niobuf_local { int lnb_rc; struct page *lnb_page; void *lnb_data; + __u16 lnb_guards[MAX_GUARD_NUMBER]; + __u16 lnb_guard_rpc:1; + __u16 lnb_guard_disk:1; + /* separate unlock for read path to allow shared access */ + __u16 lnb_locked:1; }; struct tgt_thread_big_cache { @@ -540,7 +583,7 @@ enum obd_notify_event { /* * Data structure used to pass obd_notify()-event to non-obd listeners (llite - * and liblustre being main examples). + * being main example). */ struct obd_notify_upcall { int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, @@ -595,7 +638,6 @@ struct obd_device { * (for /proc/status only!!) */ obd_no_ir:1, /* no imperative recovery. 
*/ obd_process_conf:1, /* device is processing mgs config */ - obd_uses_nid_stats:1, /* maintain per-client OBD stats */ obd_checksum_dump:1; /* dump pages upon cksum error */ /* use separate field as it is set in interrupt to don't mess with @@ -623,7 +665,7 @@ struct obd_device { spinlock_t obd_dev_lock; /* protect OBD bitfield above */ spinlock_t obd_osfs_lock; struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ - __u64 obd_osfs_age; + time64_t obd_osfs_age; __u64 obd_last_committed; struct mutex obd_dev_mutex; struct lvfs_run_ctxt obd_lvfs_ctxt; @@ -635,9 +677,9 @@ struct obd_device { struct obd_export *obd_lwp_export; /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ struct list_head obd_exports_timed; - time_t obd_eviction_timer; /* for ping evictor */ + time64_t obd_eviction_timer; /* for ping evictor */ - int obd_max_recoverable_clients; + atomic_t obd_max_recoverable_clients; atomic_t obd_connected_clients; int obd_stale_clients; /* this lock protects all recovery list_heads, timer and @@ -648,7 +690,7 @@ struct obd_device { int obd_requests_queued_for_recovery; wait_queue_head_t obd_next_transno_waitq; /* protected by obd_recovery_task_lock */ - struct timer_list obd_recovery_timer; + struct hrtimer obd_recovery_timer; /* seconds */ time64_t obd_recovery_start; /* seconds, for lprocfs_status */ @@ -683,16 +725,17 @@ struct obd_device { /* Fields used by LProcFS */ struct lprocfs_stats *obd_stats; - unsigned int obd_cntr_base; - unsigned int obd_md_cntr_base; struct lprocfs_stats *obd_md_stats; + struct dentry *obd_debugfs_entry; struct proc_dir_entry *obd_proc_entry; struct proc_dir_entry *obd_proc_exports_entry; - struct proc_dir_entry *obd_svc_procroot; + struct dentry *obd_svc_debugfs_entry; struct lprocfs_stats *obd_svc_stats; + const struct attribute **obd_attrs; struct lprocfs_vars *obd_vars; + struct ldebugfs_vars *obd_debugfs_vars; atomic_t obd_evict_inprogress; wait_queue_head_t obd_evict_inprogress_waitq; struct list_head obd_evict_list; /* protected with pet_lock */ @@ -709,6 +752,10 @@ struct obd_device { /** * List of outstanding class_incref()'s fo this OBD. For debugging. */ struct lu_ref obd_reference; + + struct kset obd_kset; /* sysfs object collection */ + struct kobj_type obd_ktype; + struct completion obd_kobj_unregister; }; /* get/set_info keys */ @@ -741,6 +788,17 @@ struct obd_device { #define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" #define KEY_OSP_CONNECTED "osp_connected" +/* Flags for op_xvalid */ +enum op_xvalid { + OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */ + OP_XVALID_BLOCKS = BIT(1), /* 0x0002 */ + OP_XVALID_OWNEROVERRIDE = BIT(2), /* 0x0004 */ + OP_XVALID_FLAGS = BIT(3), /* 0x0008 */ + OP_XVALID_PROJID = BIT(4), /* 0x0010 */ + OP_XVALID_LAZYSIZE = BIT(5), /* 0x0020 */ + OP_XVALID_LAZYBLOCKS = BIT(6), /* 0x0040 */ +}; + struct lu_context; static inline int it_to_lock_mode(struct lookup_intent *it) @@ -748,15 +806,14 @@ static inline int it_to_lock_mode(struct lookup_intent *it) /* CREAT needs to be tested before open (both could be set) */ if (it->it_op & IT_CREAT) return LCK_CW; - else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | - IT_LAYOUT)) + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP)) return LCK_CR; + else if (it->it_op & IT_LAYOUT) + return (it->it_flags & FMODE_WRITE) ? 
LCK_EX : LCK_CR; else if (it->it_op & IT_READDIR) return LCK_PR; else if (it->it_op & IT_GETXATTR) return LCK_PR; - else if (it->it_op & IT_SETXATTR) - return LCK_PW; LASSERTF(0, "Invalid it_op: %d\n", it->it_op); return -EINVAL; @@ -768,6 +825,7 @@ enum md_op_flags { MF_MDC_CANCEL_FID3 = 1 << 2, MF_MDC_CANCEL_FID4 = 1 << 3, MF_GET_MDT_IDX = 1 << 4, + MF_GETATTR_BY_FID = 1 << 5, }; enum md_cli_flags { @@ -785,7 +843,7 @@ enum md_cli_flags { */ static inline bool it_has_reply_body(const struct lookup_intent *it) { - return it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR); + return it->it_op & (IT_OPEN | IT_LOOKUP | IT_GETATTR); } struct md_op_data { @@ -795,10 +853,12 @@ struct md_op_data { struct lu_fid op_fid4; /* to the operation locks. */ u32 op_mds; /* what mds server open will go to */ __u32 op_mode; - struct lustre_handle op_handle; + struct lustre_handle op_open_handle; s64 op_mod_time; const char *op_name; size_t op_namelen; + struct rw_semaphore *op_mea1_sem; + struct rw_semaphore *op_mea2_sem; struct lmv_stripe_md *op_mea1; struct lmv_stripe_md *op_mea2; __u32 op_suppgids[2]; @@ -810,9 +870,10 @@ struct md_op_data { /* iattr fields and blocks. */ struct iattr op_attr; + enum op_xvalid op_xvalid; /* eXtra validity flags */ loff_t op_attr_blocks; - __u64 op_valid; /* OBD_MD_* */ - unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ + u64 op_valid; /* OBD_MD_* */ + unsigned int op_attr_flags; /* LUSTRE_{SYNC,..}_FL */ enum md_op_flags op_flags; @@ -827,8 +888,9 @@ struct md_op_data { __u64 op_data_version; struct lustre_handle op_lease_handle; - /* File security context, for creates. */ + /* File security context, for creates/metadata ops */ const char *op_file_secctx_name; + __u32 op_file_secctx_name_size; void *op_file_secctx; __u32 op_file_secctx_size; @@ -840,6 +902,19 @@ struct md_op_data { /* Used by readdir */ unsigned int op_max_pages; + __u16 op_mirror_id; + + /* + * used to access migrating dir: if it's set, assume migration is + * finished, use the new layout to access dir, otherwise use old layout. + * By default it's not set, because new files are created under new + * layout, if we can't find file with name under both old and new + * layout, we are sure file with name doesn't exist, but in reverse + * order there may be a race with creation by others. + */ + bool op_post_migrate; + /* used to access dir with bash hash */ + __u32 op_stripe_index; }; struct md_callback { @@ -911,9 +986,9 @@ struct obd_ops { * about this. 
*/ int (*o_statfs)(const struct lu_env *, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags); + struct obd_statfs *osfs, time64_t max_age, __u32 flags); int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *set); + time64_t max_age, struct ptlrpc_request_set *set); int (*o_create)(const struct lu_env *env, struct obd_export *exp, struct obdo *oa); int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, @@ -947,8 +1022,6 @@ struct obd_ops { int (*o_quotactl)(struct obd_device *, struct obd_export *, struct obd_quotactl *); - int (*o_ping)(const struct lu_env *, struct obd_export *exp); - /* pools methods */ int (*o_pool_new)(struct obd_device *obd, char *poolname); int (*o_pool_del)(struct obd_device *obd, char *poolname); @@ -956,12 +1029,6 @@ struct obd_ops { char *ostname); int (*o_pool_rem)(struct obd_device *obd, char *poolname, char *ostname); - void (*o_getref)(struct obd_device *obd); - void (*o_putref)(struct obd_device *obd); - /* - * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line - * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. - * Also, add a wrapper function in include/linux/obd_class.h. */ }; /* lmv structures */ @@ -983,7 +1050,7 @@ struct md_open_data { }; struct obd_client_handle { - struct lustre_handle och_fh; + struct lustre_handle och_open_handle; struct lu_fid och_fid; struct md_open_data *och_mod; struct lustre_handle och_lease_handle; /* open lock for lease */ @@ -997,18 +1064,6 @@ struct lookup_intent; struct cl_attr; struct md_ops { - /* Every operation from MD_STATS_FIRST_OP up to and including - * MD_STATS_LAST_OP will be counted by EXP_MD_OP_INCREMENT() - * and will appear in /proc/fs/lustre/{lmv,mdc}/.../md_stats. - * Operations after MD_STATS_LAST_OP are excluded from stats. - * There are a few reasons for doing this: we prune the 17 - * counters which will be of minimal use in understanding - * metadata utilization, we save memory by allocating 15 - * instead of 32 counters, we save cycles by not counting. - * - * MD_STATS_FIRST_OP must be the first member of md_ops. 
- */ -#define MD_STATS_FIRST_OP m_close int (*m_close)(struct obd_export *, struct md_op_data *, struct md_open_data *, struct ptlrpc_request **); @@ -1049,12 +1104,11 @@ struct md_ops { struct ptlrpc_request **); int (*m_setxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, const char *, int, int, int, u32, - struct ptlrpc_request **); + u64, const char *, const void *, size_t, unsigned int, + u32, struct ptlrpc_request **); int (*m_getxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, const char *, int, int, int, - struct ptlrpc_request **); + u64, const char *, size_t, struct ptlrpc_request **); int (*m_intent_getattr_async)(struct obd_export *, struct md_enqueue_info *); @@ -1062,7 +1116,7 @@ struct md_ops { int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, struct lu_fid *, __u64 *bits); -#define MD_STATS_LAST_OP m_revalidate_lock + int (*m_file_resync)(struct obd_export *, struct md_op_data *); int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *); int (*m_null_inode)(struct obd_export *, const struct lu_fid *); @@ -1107,6 +1161,8 @@ struct md_ops { struct lu_fid *fid); int (*m_unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, const union lmv_mds_md *lmv, size_t lmv_size); + int (*m_rmfid)(struct obd_export *exp, struct fid_array *fa, int *rcs, + struct ptlrpc_request_set *set); }; static inline struct md_open_data *obd_mod_alloc(void) @@ -1201,7 +1257,8 @@ static inline int cli_brw_size(struct obd_device *obd) return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; } -/* when RPC size or the max RPCs in flight is increased, the max dirty pages +/* + * When RPC size or the max RPCs in flight is increased, the max dirty pages * of the client should be increased accordingly to avoid sending fragmented * RPCs over the network when the client runs out of the maximum dirty space * when so many RPCs are being generated. @@ -1209,10 +1266,10 @@ static inline int cli_brw_size(struct obd_device *obd) static inline void client_adjust_max_dirty(struct client_obd *cli) { /* initializing */ - if (cli->cl_dirty_max_pages <= 0) - cli->cl_dirty_max_pages = (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) - >> PAGE_SHIFT; - else { + if (cli->cl_dirty_max_pages <= 0) { + cli->cl_dirty_max_pages = + (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; + } else { unsigned long dirty_max = cli->cl_max_rpcs_in_flight * cli->cl_max_pages_per_rpc; @@ -1222,6 +1279,12 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) if (cli->cl_dirty_max_pages > cfs_totalram_pages() / 8) cli->cl_dirty_max_pages = cfs_totalram_pages() / 8; + + /* This value is exported to userspace through the max_dirty_mb + * parameter. So we round up the number of pages to make it a round + * number of MBs. 
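The rounding applied in the code that follows can be sanity-checked in isolation. The sketch below assumes 4 KiB pages (PAGE_SHIFT = 12) and uses a simplified power-of-two round_up() in place of the kernel macro; only the 1 << (20 - PAGE_SHIFT) expression mirrors the patch, the rest is illustrative.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */
/* Simplified stand-in for the kernel's round_up(); 'y' must be a power of 2 */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long pages_per_mb = 1UL << (20 - PAGE_SHIFT);	/* 256 */
	unsigned long dirty_max = 1000;				/* pages */
	unsigned long rounded = round_up(dirty_max, pages_per_mb);

	/* 1000 pages round up to 1024 pages, i.e. an even 4 MB */
	printf("%lu pages -> %lu pages (%lu MB)\n", dirty_max, rounded,
	       rounded >> (20 - PAGE_SHIFT));
	return 0;
}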
*/ + cli->cl_dirty_max_pages = round_up(cli->cl_dirty_max_pages, + 1 << (20 - PAGE_SHIFT)); } #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h index 6a0cfe8d72fc0..6e807d762c354 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_cksum.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_cksum.h @@ -34,9 +34,12 @@ #define __OBD_CKSUM #include #include -#include +#include -static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type); + +static inline unsigned char cksum_obd2cfs(enum cksum_types cksum_type) { switch (cksum_type) { case OBD_CKSUM_CRC32: @@ -52,58 +55,23 @@ static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) return 0; } -/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can - * only be a single checksum type per RPC. - * - * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask - * since they need to represent the full range of checksum algorithms that - * both the client and server can understand. - * - * In case of an unsupported types/flags we fall back to ADLER - * because that is supported by all clients since 1.8 - * - * In case multiple algorithms are supported the best one is used. */ -static inline u32 cksum_type_pack(cksum_type_t cksum_type) -{ - unsigned int performance = 0, tmp; - u32 flag = OBD_FL_CKSUM_ADLER; - - if (cksum_type & OBD_CKSUM_CRC32) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32; - } - } - if (cksum_type & OBD_CKSUM_CRC32C) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32C; - } - } - if (cksum_type & OBD_CKSUM_ADLER) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_ADLER; - } - } - if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | - OBD_CKSUM_CRC32 | - OBD_CKSUM_ADLER)))) - CWARN("unknown cksum type %x\n", cksum_type); - - return flag; -} +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type); -static inline cksum_type_t cksum_type_unpack(u32 o_flags) +static inline enum cksum_types obd_cksum_type_unpack(u32 o_flags) { switch (o_flags & OBD_FL_CKSUM_ALL) { case OBD_FL_CKSUM_CRC32C: return OBD_CKSUM_CRC32C; case OBD_FL_CKSUM_CRC32: return OBD_CKSUM_CRC32; + case OBD_FL_CKSUM_T10IP512: + return OBD_CKSUM_T10IP512; + case OBD_FL_CKSUM_T10IP4K: + return OBD_CKSUM_T10IP4K; + case OBD_FL_CKSUM_T10CRC512: + return OBD_CKSUM_T10CRC512; + case OBD_FL_CKSUM_T10CRC4K: + return OBD_CKSUM_T10CRC4K; default: break; } @@ -115,9 +83,9 @@ static inline cksum_type_t cksum_type_unpack(u32 o_flags) * 1.8 supported ADLER it is base and not depend on hw * Client uses all available local algos */ -static inline cksum_type_t cksum_types_supported_client(void) +static inline enum cksum_types obd_cksum_types_supported_client(void) { - cksum_type_t ret = OBD_CKSUM_ADLER; + enum cksum_types ret = OBD_CKSUM_ADLER; CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), @@ -129,32 +97,13 @@ static inline cksum_type_t cksum_types_supported_client(void) if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) ret |= OBD_CKSUM_CRC32; - return ret; -} - -/* Server uses 
algos that perform at 50% or better of the Adler */ -static inline enum cksum_types cksum_types_supported_server(void) -{ - enum cksum_types ret = OBD_CKSUM_ADLER; - int base_speed; - - CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); - - base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; - - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= - base_speed) - ret |= OBD_CKSUM_CRC32C; - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= - base_speed) - ret |= OBD_CKSUM_CRC32; + /* Client support all kinds of T10 checksum */ + ret |= OBD_CKSUM_T10_ALL; return ret; } +enum cksum_types obd_cksum_types_supported_server(const char *obd_name); /* Select the best checksum algorithm among those supplied in the cksum_types * input. @@ -163,13 +112,69 @@ static inline enum cksum_types cksum_types_supported_server(void) * checksum type due to its benchmarking at libcfs module load. * Caution is advised, however, since what is fastest on a single client may * not be the fastest or most efficient algorithm on the server. */ -static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +static inline enum cksum_types +obd_cksum_type_select(const char *obd_name, enum cksum_types cksum_types) { - return cksum_type_unpack(cksum_type_pack(cksum_types)); + u32 flag = obd_cksum_type_pack(obd_name, cksum_types); + + return obd_cksum_type_unpack(flag); } /* Checksum algorithm names. Must be defined in the same order as the * OBD_CKSUM_* flags. */ -#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} +#define DECLARE_CKSUM_NAME const char *cksum_name[] = {"crc32", "adler", \ + "crc32c", "reserved", "t10ip512", "t10ip4K", "t10crc512", "t10crc4K"} + +typedef __u16 (obd_dif_csum_fn) (void *, unsigned int); + +__u16 obd_dif_crc_fn(void *data, unsigned int len); +__u16 obd_dif_ip_fn(void *data, unsigned int len); +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __u16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn); +/* + * If checksum type is one T10 checksum types, init the csum_fn and sector + * size. Otherwise, init them to NULL/zero. 
+ */ +static inline void obd_t10_cksum2dif(enum cksum_types cksum_type, + obd_dif_csum_fn **fn, int *sector_size) +{ + *fn = NULL; + *sector_size = 0; + +#if IS_ENABLED(CONFIG_CRC_T10DIF) + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + *fn = obd_dif_ip_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + *fn = obd_dif_ip_fn; + *sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + *fn = obd_dif_crc_fn; + *sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + *fn = obd_dif_crc_fn; + *sector_size = 4096; + break; + default: + break; + } +#endif /* CONFIG_CRC_T10DIF */ +} + +enum obd_t10_cksum_type { + OBD_T10_CKSUM_UNKNOWN = 0, + OBD_T10_CKSUM_IP512, + OBD_T10_CKSUM_IP4K, + OBD_T10_CKSUM_CRC512, + OBD_T10_CKSUM_CRC4K, + OBD_T10_CKSUM_MAX +}; #endif /* __OBD_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index 437a700807142..f44c4dc42a4ea 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,13 +32,13 @@ #ifndef __CLASS_OBD_H #define __CLASS_OBD_H - +#include #include #include #include #include #include -#include +#include #include #define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay @@ -47,27 +47,36 @@ * obd_osfs_age */ #define OBD_STATFS_FOR_MDT0 0x0004 /* The statfs is only for retrieving * information from MDT0. */ +#define OBD_STATFS_SUM 0x0008 /* get aggregated statfs from MDT */ extern rwlock_t obd_dev_lock; /* OBD Operations Declarations */ -extern struct obd_device *class_conn2obd(struct lustre_handle *); extern struct obd_device *class_exp2obd(struct obd_export *); extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); -extern int lustre_get_jobid(char *jobid); +int lustre_get_jobid(char *jobid, size_t len); +void lustre_jobid_clear(const char *jobid); +void jobid_cache_fini(void); +int jobid_cache_init(void); struct lu_device_type; /* genops.c */ struct obd_export *class_conn2export(struct lustre_handle *); -int class_register_type(struct obd_ops *, struct md_ops *, bool enable_proc, - struct lprocfs_vars *module_vars, +struct kobject *class_setup_tunables(const char *name); +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, bool enable_proc, + struct ldebugfs_vars *module_vars, const char *nm, struct lu_device_type *ldt); int class_unregister_type(const char *nm); -struct obd_device *class_newdev(const char *type_name, const char *name); -void class_release_dev(struct obd_device *obd); +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid); +int class_register_device(struct obd_device *obd); +void class_unregister_device(struct obd_device *obd); +void class_free_dev(struct obd_device *obd); +struct obd_device *class_dev_by_str(const char *str); int class_name2dev(const char *name); struct obd_device *class_name2obd(const char *name); int class_uuid2dev(struct obd_uuid *uuid); @@ -83,7 +92,17 @@ int get_devices_count(void); int class_notify_sptlrpc_conf(const char *fsname, int namelen); -char *obd_export_nid2str(struct obd_export *exp); +static inline char *obd_export_nid2str(struct obd_export *exp) +{ + return 
exp->exp_connection == NULL ? + "" : libcfs_nid2str(exp->exp_connection->c_peer.nid); +} + +static inline char *obd_import_nid2str(struct obd_import *imp) +{ + return imp->imp_connection == NULL ? + "" : libcfs_nid2str(imp->imp_connection->c_peer.nid); +} int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); @@ -133,8 +152,9 @@ struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, const char *new_name); void print_lustre_cfg(struct lustre_cfg *lcfg); int class_process_config(struct lustre_cfg *lcfg); -int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, - struct lustre_cfg *lcfg, void *data); +ssize_t class_set_global(const char *param); +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj); int class_attach(struct lustre_cfg *lcfg); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); @@ -164,7 +184,6 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); #define CFG_F_START 0x01 /* Set when we start updating from a log */ #define CFG_F_MARKER 0x02 /* We are within a maker */ #define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ -#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ #define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ /* Passed as data param to class_config_parse_llog */ @@ -228,7 +247,7 @@ static inline bool logname_is_barrier(const char *logname) struct config_llog_data { struct ldlm_res_id cld_resid; struct config_llog_instance cld_cfg; - struct list_head cld_list_chain; + struct list_head cld_list_chain;/* on config_llog_list */ atomic_t cld_refcount; struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ struct config_llog_data *cld_params; /* common parameters log */ @@ -328,6 +347,8 @@ struct obd_export *class_export_get(struct obd_export *exp); void class_export_put(struct obd_export *exp); struct obd_export *class_new_export(struct obd_device *obddev, struct obd_uuid *cluuid); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid); void class_unlink_export(struct obd_export *exp); struct obd_import *class_import_get(struct obd_import *); @@ -347,6 +368,7 @@ void class_disconnect_exports(struct obd_device *obddev); int class_manual_cleanup(struct obd_device *obd); void class_disconnect_stale_exports(struct obd_device *, int (*test_export)(struct obd_export *)); + static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) { return ((obd->obd_fail ? 
OBD_OPT_FAILOVER : 0) | @@ -381,25 +403,25 @@ void la_from_obdo(struct lu_attr *la, const struct obdo *dst, u64 valid); void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid); void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); -#define OBT(dev) (dev)->obd_type #define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op #define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op -#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op -/* Ensure obd_setup: used for cleanup which must be called - while obd is stopping */ -#define OBD_CHECK_DEV(obd) \ -do { \ - if (!(obd)) { \ - CERROR("NULL device\n"); \ - RETURN(-ENODEV); \ - } \ -} while (0) +static inline int obd_check_dev(struct obd_device *obd) +{ + if (!obd) { + CERROR("NULL device\n"); + return -ENODEV; + } + return 0; +} /* ensure obd_setup and !obd_stopping */ #define OBD_CHECK_DEV_ACTIVE(obd) \ do { \ - OBD_CHECK_DEV(obd); \ + rc = obd_check_dev(obd); \ + if (rc) \ + return rc; \ + \ if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ CERROR("Device %d not setup\n", \ (obd)->obd_minor); \ @@ -408,61 +430,6 @@ do { \ } while (0) -#ifdef CONFIG_PROC_FS -#define OBD_COUNTER_OFFSET(op) \ - ((offsetof(struct obd_ops, o_ ## op) - \ - offsetof(struct obd_ops, o_iocontrol)) \ - / sizeof(((struct obd_ops *)NULL)->o_iocontrol)) - -/* The '- 1' below is for o_owner. */ -#define NUM_OBD_STATS \ - (sizeof(struct obd_ops) / \ - sizeof(((struct obd_ops *)NULL)->o_iocontrol) - 1) - -#define OBD_COUNTER_INCREMENT(obd, op) \ - lprocfs_counter_incr((obd)->obd_stats, \ - (obd)->obd_cntr_base + OBD_COUNTER_OFFSET(op)) - -#define EXP_COUNTER_INCREMENT(exp, op) \ - do { \ - unsigned int _off; \ - _off = (exp)->exp_obd->obd_cntr_base + OBD_COUNTER_OFFSET(op); \ - lprocfs_counter_incr((exp)->exp_obd->obd_stats, _off); \ - if ((exp)->exp_obd->obd_uses_nid_stats && \ - (exp)->exp_nid_stats != NULL) \ - lprocfs_counter_incr((exp)->exp_nid_stats->nid_stats, \ - _off); \ - } while (0) - -#define _MD_COUNTER_OFFSET(m_op) \ - ((offsetof(struct md_ops, m_op) - \ - offsetof(struct md_ops, MD_STATS_FIRST_OP)) / \ - sizeof(((struct md_ops *)NULL)->MD_STATS_FIRST_OP)) - -#define MD_COUNTER_OFFSET(op) _MD_COUNTER_OFFSET(m_ ## op) - -#define NUM_MD_STATS \ - (_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) - \ - _MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) + 1) - -/* Note that we only increment md counters for ops whose offset is less - * than NUM_MD_STATS. This is explained in a comment in the definition - * of struct md_ops. 
*/ -#define EXP_MD_COUNTER_INCREMENT(exp, op) \ - do { \ - if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) \ - lprocfs_counter_incr((exp)->exp_obd->obd_md_stats, \ - (exp)->exp_obd->obd_md_cntr_base + \ - MD_COUNTER_OFFSET(op)); \ - } while (0) - -#else -#define OBD_COUNTER_OFFSET(op) -#define OBD_COUNTER_INCREMENT(obd, op) -#define EXP_COUNTER_INCREMENT(exp, op) -#define EXP_MD_COUNTER_INCREMENT(exp, op) -#endif - static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) { /* Always add in ldlm_stats */ @@ -478,57 +445,16 @@ static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) tmp->nid_ldlm_stats); } -#define EXP_CHECK_MD_OP(exp, op) \ -do { \ - if ((exp) == NULL) { \ - CERROR("obd_" #op ": NULL export\n"); \ - RETURN(-ENODEV); \ - } \ - if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ - CERROR("obd_" #op ": cleaned up obd\n"); \ - RETURN(-EOPNOTSUPP); \ - } \ - if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ - CERROR("%s: obd_" #op ": dev %d no operation\n", \ - (exp)->exp_obd->obd_name, \ - (exp)->exp_obd->obd_minor); \ - RETURN(-EOPNOTSUPP); \ - } \ -} while (0) - - -#define OBD_CHECK_DT_OP(obd, op, err) \ -do { \ - if (!OBT(obd) || !OBP((obd), op)) { \ - if (err) \ - CERROR("%s: no obd_" #op " operation\n", \ - obd->obd_name); \ - RETURN(err); \ - } \ -} while (0) - -#define EXP_CHECK_DT_OP(exp, op) \ -do { \ - if ((exp) == NULL) { \ - CERROR("obd_" #op ": NULL export\n"); \ - RETURN(-ENODEV); \ - } \ - if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ - CERROR("obd_" #op ": cleaned up obd\n"); \ - RETURN(-EOPNOTSUPP); \ - } \ - OBD_CHECK_DT_OP((exp)->exp_obd, op, -EOPNOTSUPP); \ -} while (0) - -#define CTXT_CHECK_OP(ctxt, op, err) \ -do { \ - if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ - if (err) \ - CERROR("%s: no lop_" #op "operation\n", \ - ctxt->loc_obd->obd_name); \ - RETURN(err); \ - } \ -} while (0) +static inline int exp_check_ops(struct obd_export *exp) +{ + if (exp == NULL) { + RETURN(-ENODEV); + } + if (exp->exp_obd == NULL || !exp->exp_obd->obd_type) { + RETURN(-EOPNOTSUPP); + } + RETURN(0); +} static inline int class_devno_max(void) { @@ -542,8 +468,15 @@ static inline int obd_get_info(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, get_info); - EXP_COUNTER_INCREMENT(exp, get_info); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_get_info) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); RETURN(rc); @@ -558,8 +491,15 @@ static inline int obd_set_info_async(const struct lu_env *env, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, set_info_async); - EXP_COUNTER_INCREMENT(exp, set_info_async); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_set_info_async) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, val, set); @@ -583,18 +523,14 @@ static inline int obd_set_info_async(const struct lu_env *env, * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
*/ - -#define DECLARE_LU_VARS(ldt, d) \ - struct lu_device_type *ldt; \ - struct lu_device *d - static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) { int rc; - DECLARE_LU_VARS(ldt, d); + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d; + ENTRY; - ldt = obd->obd_type->typ_lu; if (ldt != NULL) { struct lu_context session_ctx; struct lu_env env; @@ -618,8 +554,11 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) lu_context_fini(&session_ctx); } else { - OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, setup); + if (!obd->obd_type->typ_dt_ops->o_setup) { + CERROR("%s: no %s operation\n", obd->obd_name, + __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, setup)(obd, cfg); } RETURN(rc); @@ -628,23 +567,30 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) static inline int obd_precleanup(struct obd_device *obd) { int rc; - DECLARE_LU_VARS(ldt, d); + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; + ENTRY; - OBD_CHECK_DEV(obd); - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { - struct lu_env env; - - rc = lu_env_init(&env, ldt->ldt_ctx_tags); - if (rc == 0) { - ldt->ldt_ops->ldto_device_fini(&env, d); - lu_env_fini(&env); + struct lu_env *env = lu_env_find(); + struct lu_env _env; + + if (!env) { + env = &_env; + rc = lu_env_init(env, ldt->ldt_ctx_tags); + LASSERT(rc == 0); + lu_env_add(env); + } + ldt->ldt_ops->ldto_device_fini(env, d); + if (env == &_env) { + lu_env_remove(env); + lu_env_fini(env); } } - OBD_CHECK_DT_OP(obd, precleanup, 0); - OBD_COUNTER_INCREMENT(obd, precleanup); + + if (!obd->obd_type->typ_dt_ops->o_precleanup) + RETURN(0); rc = OBP(obd, precleanup)(obd); RETURN(rc); @@ -653,13 +599,10 @@ static inline int obd_precleanup(struct obd_device *obd) static inline int obd_cleanup(struct obd_device *obd) { int rc; - DECLARE_LU_VARS(ldt, d); - ENTRY; - - OBD_CHECK_DEV(obd); + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; + ENTRY; if (ldt != NULL && d != NULL) { struct lu_env env; @@ -670,8 +613,8 @@ static inline int obd_cleanup(struct obd_device *obd) obd->obd_lu_dev = NULL; } } - OBD_CHECK_DT_OP(obd, cleanup, 0); - OBD_COUNTER_INCREMENT(obd, cleanup); + if (!obd->obd_type->typ_dt_ops->o_cleanup) + RETURN(0); rc = OBP(obd, cleanup)(obd); RETURN(rc); @@ -698,18 +641,16 @@ static inline void obd_cleanup_client_import(struct obd_device *obd) EXIT; } -static inline int -obd_process_config(struct obd_device *obd, int datalen, void *data) +static inline int obd_process_config(struct obd_device *obd, int datalen, + void *data) { int rc; - DECLARE_LU_VARS(ldt, d); - ENTRY; + struct lu_device_type *ldt = obd->obd_type->typ_lu; + struct lu_device *d = obd->obd_lu_dev; - OBD_CHECK_DEV(obd); + ENTRY; obd->obd_process_conf = 1; - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { struct lu_env env; @@ -719,10 +660,14 @@ obd_process_config(struct obd_device *obd, int datalen, void *data) lu_env_fini(&env); } } else { - OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + if (!obd->obd_type->typ_dt_ops->o_process_config) { + CERROR("%s: no %s operation\n", + obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, process_config)(obd, datalen, data); } - OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; RETURN(rc); @@ 
-734,8 +679,15 @@ static inline int obd_create(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, create); - EXP_COUNTER_INCREMENT(exp, create); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_create) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, create)(env, exp, obdo); RETURN(rc); @@ -747,8 +699,15 @@ static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, destroy); - EXP_COUNTER_INCREMENT(exp, destroy); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_destroy) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); RETURN(rc); @@ -760,8 +719,16 @@ static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, getattr); - EXP_COUNTER_INCREMENT(exp, getattr); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_getattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, getattr)(env, exp, oa); RETURN(rc); @@ -773,8 +740,16 @@ static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, setattr); - EXP_COUNTER_INCREMENT(exp, setattr); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_setattr) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, setattr)(env, exp, oa); RETURN(rc); @@ -788,8 +763,10 @@ static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, add_conn); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_add_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, add_conn)(imp, uuid, priority); RETURN(rc); @@ -802,8 +779,10 @@ static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, del_conn); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_del_conn) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, del_conn)(imp, uuid); RETURN(rc); @@ -814,8 +793,9 @@ static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) struct obd_uuid *uuid; ENTRY; - OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); - EXP_COUNTER_INCREMENT(exp, get_uuid); + if (!exp->exp_obd->obd_type || + !exp->exp_obd->obd_type->typ_dt_ops->o_get_uuid) + RETURN(NULL); uuid = OBP(exp->exp_obd, get_uuid)(exp); RETURN(uuid); @@ -838,8 +818,10 @@ static inline int obd_connect(const struct lu_env *env, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, connect); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_connect) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); /* check that only subset is granted */ @@ -862,8 +844,8 @@ static inline int obd_reconnect(const struct lu_env 
*env, ENTRY; OBD_CHECK_DEV_ACTIVE(obd); - OBD_CHECK_DT_OP(obd, reconnect, 0); - OBD_COUNTER_INCREMENT(obd, reconnect); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_reconnect) + RETURN(0); rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); /* check that only subset is granted */ @@ -876,9 +858,15 @@ static inline int obd_disconnect(struct obd_export *exp) { int rc; ENTRY; - - EXP_CHECK_DT_OP(exp, disconnect); - EXP_COUNTER_INCREMENT(exp, disconnect); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_disconnect) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, disconnect)(exp); RETURN(rc); @@ -890,8 +878,8 @@ static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, int rc; ENTRY; - OBD_CHECK_DT_OP(obd, fid_init, 0); - OBD_COUNTER_INCREMENT(obd, fid_init); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_init) + RETURN(0); rc = OBP(obd, fid_init)(obd, exp, type); RETURN(rc); @@ -901,9 +889,8 @@ static inline int obd_fid_fini(struct obd_device *obd) { int rc; ENTRY; - - OBD_CHECK_DT_OP(obd, fid_fini, 0); - OBD_COUNTER_INCREMENT(obd, fid_fini); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_fid_fini) + RETURN(0); rc = OBP(obd, fid_fini)(obd); RETURN(rc); @@ -916,33 +903,29 @@ static inline int obd_fid_alloc(const struct lu_env *env, { int rc; ENTRY; - - EXP_CHECK_DT_OP(exp, fid_alloc); - EXP_COUNTER_INCREMENT(exp, fid_alloc); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_fid_alloc) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); RETURN(rc); } -static inline int obd_ping(const struct lu_env *env, struct obd_export *exp) -{ - int rc; - ENTRY; - - OBD_CHECK_DT_OP(exp->exp_obd, ping, 0); - EXP_COUNTER_INCREMENT(exp, ping); - - rc = OBP(exp->exp_obd, ping)(env, exp); - RETURN(rc); -} - static inline int obd_pool_new(struct obd_device *obd, char *poolname) { int rc; ENTRY; - OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_new); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_new) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, pool_new)(obd, poolname); RETURN(rc); @@ -952,173 +935,166 @@ static inline int obd_pool_del(struct obd_device *obd, char *poolname) { int rc; ENTRY; - - OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_del); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_del) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, pool_del)(obd, poolname); RETURN(rc); } -static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +static inline int obd_pool_add(struct obd_device *obd, char *poolname, + char *ostname) { int rc; ENTRY; - OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_add); + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_add) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } rc = OBP(obd, pool_add)(obd, poolname, ostname); RETURN(rc); } -static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, + char *ostname) { - int rc; - ENTRY; - - OBD_CHECK_DT_OP(obd, 
pool_rem, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_rem); - - rc = OBP(obd, pool_rem)(obd, poolname, ostname); - RETURN(rc); -} + int rc; -static inline void obd_getref(struct obd_device *obd) -{ - ENTRY; - if (OBT(obd) && OBP(obd, getref)) { - OBD_COUNTER_INCREMENT(obd, getref); - OBP(obd, getref)(obd); - } - EXIT; -} + ENTRY; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_pool_rem) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } -static inline void obd_putref(struct obd_device *obd) -{ - ENTRY; - if (OBT(obd) && OBP(obd, putref)) { - OBD_COUNTER_INCREMENT(obd, putref); - OBP(obd, putref)(obd); - } - EXIT; + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); } static inline int obd_init_export(struct obd_export *exp) { - int rc = 0; + int rc = 0; - ENTRY; - if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && - OBP((exp)->exp_obd, init_export)) - rc = OBP(exp->exp_obd, init_export)(exp); - RETURN(rc); + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); } static inline int obd_destroy_export(struct obd_export *exp) { - ENTRY; - if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && - OBP((exp)->exp_obd, destroy_export)) - OBP(exp->exp_obd, destroy_export)(exp); - RETURN(0); + ENTRY; + if (exp->exp_obd != NULL && exp->exp_obd->obd_type && + OBP(exp->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); } -/* @max_age is the oldest time in jiffies that we accept using a cached data. +/* @max_age is the oldest time in seconds that we accept using a cached data. * If the cache is older than @max_age we will get a new value from the - * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. 
+ */ static inline int obd_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, - __u64 max_age, - struct ptlrpc_request_set *rqset) + struct obd_info *oinfo, + time64_t max_age, + struct ptlrpc_request_set *rqset) { - int rc = 0; - struct obd_device *obd; - ENTRY; - - if (exp == NULL || exp->exp_obd == NULL) - RETURN(-EINVAL); - - obd = exp->exp_obd; - OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, statfs); - - CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", - obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); - if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { - rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); - } else { - CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, - obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); - spin_lock(&obd->obd_osfs_lock); - memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); - spin_unlock(&obd->obd_osfs_lock); - oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; - if (oinfo->oi_cb_up) - oinfo->oi_cb_up(oinfo, 0); - } - RETURN(rc); -} - -static inline int obd_statfs_rqset(struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, - __u32 flags) -{ - struct ptlrpc_request_set *set = NULL; - struct obd_info oinfo = { - .oi_osfs = osfs, - .oi_flags = flags, - }; + struct obd_device *obd; int rc = 0; ENTRY; - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - - rc = obd_statfs_async(exp, &oinfo, max_age, set); - if (rc == 0) - rc = ptlrpc_set_wait(set); + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); - ptlrpc_set_destroy(set); + obd = exp->exp_obd; + if (!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs) { + rc = -EOPNOTSUPP; + CERROR("%s: no statfs operation: rc = %d\n", obd->obd_name, rc); + RETURN(rc); + } + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + if (obd->obd_osfs_age < max_age) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } RETURN(rc); } -/* @max_age is the oldest time in jiffies that we accept using a cached data. +/* @max_age is the oldest time in seconds that we accept using a cached data. * If the cache is older than @max_age we will get a new value from the - * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ + * target. Use a value of 'ktime_get_seconds() + X' to guarantee freshness. 
+ */ static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, - __u32 flags) + struct obd_statfs *osfs, time64_t max_age, + __u32 flags) { - int rc = 0; - struct obd_device *obd = exp->exp_obd; - ENTRY; + struct obd_device *obd = exp->exp_obd; + int rc = 0; - if (obd == NULL) - RETURN(-EINVAL); + ENTRY; + if (unlikely(obd == NULL)) + RETURN(-EINVAL); - OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); OBD_CHECK_DEV_ACTIVE(obd); - OBD_COUNTER_INCREMENT(obd, statfs); - CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", - obd->obd_osfs_age, max_age); - if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { - rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); - if (rc == 0) { + if (unlikely(!obd->obd_type || !obd->obd_type->typ_dt_ops->o_statfs)) { + CERROR("%s: no %s operation\n", obd->obd_name, __func__); + RETURN(-EOPNOTSUPP); + } + + CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", + obd->obd_name, obd->obd_osfs_age, max_age); + /* ignore cache if aggregated isn't expected */ + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATE_SUM) && + !(flags & OBD_STATFS_SUM))) { + /* the RPC will block anyway, so avoid sending many at once */ + rc = mutex_lock_interruptible(&obd->obd_dev_mutex); + if (rc) + RETURN(rc); + if (obd->obd_osfs_age < max_age || + ((obd->obd_osfs.os_state & OS_STATE_SUM) && + !(flags & OBD_STATFS_SUM))) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + } else { + mutex_unlock(&obd->obd_dev_mutex); + GOTO(cached, rc = 0); + } + if (rc == 0) { + CDEBUG(D_SUPER, + "%s: update %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + osfs->os_bavail, osfs->os_blocks, + osfs->os_ffree, osfs->os_files); + spin_lock(&obd->obd_osfs_lock); memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); - obd->obd_osfs_age = cfs_time_current_64(); + obd->obd_osfs_age = ktime_get_seconds(); spin_unlock(&obd->obd_osfs_lock); } + mutex_unlock(&obd->obd_dev_mutex); } else { - CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu" - " objects %llu/%llu\n", +cached: + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", obd->obd_name, &obd->obd_osfs, obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); @@ -1138,8 +1114,17 @@ static inline int obd_preprw(const struct lu_env *env, int cmd, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, preprw); - EXP_COUNTER_INCREMENT(exp, preprw); + + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_preprw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, pages, local); @@ -1150,14 +1135,23 @@ static inline int obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, struct niobuf_remote *rnb, int pages, - struct niobuf_local *local, int rc) + struct niobuf_local *local, const int orig_rc) { + int rc; ENTRY; - EXP_CHECK_DT_OP(exp, commitrw); - EXP_COUNTER_INCREMENT(exp, commitrw); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_commitrw) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, - rnb, pages, local, rc); + rnb, pages, local, orig_rc); RETURN(rc); } @@ -1168,8 +1162,15 @@ static inline int 
obd_iocontrol(unsigned int cmd, struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, iocontrol); - EXP_COUNTER_INCREMENT(exp, iocontrol); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_iocontrol) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); RETURN(rc); @@ -1185,10 +1186,10 @@ static inline void obd_import_event(struct obd_device *obd, EXIT; return; } - if (obd->obd_set_up && OBP(obd, import_event)) { - OBD_COUNTER_INCREMENT(obd, import_event); + + if (obd->obd_set_up && OBP(obd, import_event)) OBP(obd, import_event)(obd, imp, event); - } + EXIT; } @@ -1198,7 +1199,10 @@ static inline int obd_notify(struct obd_device *obd, { int rc; ENTRY; - OBD_CHECK_DEV(obd); + + rc = obd_check_dev(obd); + if (rc) + return rc; if (!obd->obd_set_up) { CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); @@ -1210,7 +1214,6 @@ static inline int obd_notify(struct obd_device *obd, RETURN(-ENOSYS); } - OBD_COUNTER_INCREMENT(obd, notify); rc = OBP(obd, notify)(obd, watched, ev); RETURN(rc); @@ -1243,45 +1246,58 @@ static inline int obd_quotactl(struct obd_export *exp, int rc; ENTRY; - EXP_CHECK_DT_OP(exp, quotactl); - EXP_COUNTER_INCREMENT(exp, quotactl); + rc = exp_check_ops(exp); + if (rc) + RETURN(rc); + + if (!exp->exp_obd->obd_type->typ_dt_ops->o_quotactl) { + CERROR("%s: no %s operation\n", + (exp)->exp_obd->obd_name, __func__); + RETURN(-ENOTSUPP); + } rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); RETURN(rc); } static inline int obd_health_check(const struct lu_env *env, - struct obd_device *obd) -{ - /* returns: 0 on healthy - * >0 on unhealthy + reason code/flag - * however the only suppored reason == 1 right now - * We'll need to define some better reasons - * or flags in the future. - * <0 on error - */ - int rc; - ENTRY; + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. 
+ * <0 on error + */ + int rc; - /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ - if (obd == NULL || !OBT(obd)) { - CERROR("cleaned up obd\n"); - RETURN(-EOPNOTSUPP); - } - if (!obd->obd_set_up || obd->obd_stopping) - RETURN(0); - if (!OBP(obd, health_check)) - RETURN(0); + ENTRY; - rc = OBP(obd, health_check)(env, obd); - RETURN(rc); + /* NULL method is normal here */ + if (obd == NULL || !obd->obd_type) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); } static inline int obd_register_observer(struct obd_device *obd, struct obd_device *observer) { + int rc; ENTRY; - OBD_CHECK_DEV(obd); + + rc = obd_check_dev(obd); + if (rc) + return rc; + down_write(&obd->obd_observer_link_sem); if (obd->obd_observer && observer) { up_write(&obd->obd_observer_link_sem); @@ -1293,51 +1309,79 @@ static inline int obd_register_observer(struct obd_device *obd, } /* metadata helpers */ +enum mps_stat_idx { + LPROC_MD_CLOSE, + LPROC_MD_CREATE, + LPROC_MD_ENQUEUE, + LPROC_MD_GETATTR, + LPROC_MD_INTENT_LOCK, + LPROC_MD_LINK, + LPROC_MD_RENAME, + LPROC_MD_SETATTR, + LPROC_MD_FSYNC, + LPROC_MD_READ_PAGE, + LPROC_MD_UNLINK, + LPROC_MD_SETXATTR, + LPROC_MD_GETXATTR, + LPROC_MD_INTENT_GETATTR_ASYNC, + LPROC_MD_REVALIDATE_LOCK, + LPROC_MD_LAST_OPC, +}; + static inline int md_get_root(struct obd_export *exp, const char *fileset, struct lu_fid *fid) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, get_root); - EXP_MD_COUNTER_INCREMENT(exp, get_root); - rc = MDP(exp->exp_obd, get_root)(exp, fileset, fid); + rc = exp_check_ops(exp); + if (rc) + return rc; - RETURN(rc); + return MDP(exp->exp_obd, get_root)(exp, fileset, fid); } -static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) +static inline int md_getattr(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, getattr); - EXP_MD_COUNTER_INCREMENT(exp, getattr); - rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETATTR); + + return MDP(exp->exp_obd, getattr)(exp, op_data, request); } static inline int md_null_inode(struct obd_export *exp, const struct lu_fid *fid) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, null_inode); - EXP_MD_COUNTER_INCREMENT(exp, null_inode); - rc = MDP(exp->exp_obd, null_inode)(exp, fid); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, null_inode)(exp, fid); } static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, struct md_open_data *mod, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, close); - EXP_MD_COUNTER_INCREMENT(exp, close); - rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CLOSE); + + return MDP(exp->exp_obd, close)(exp, op_data, mod, request); } static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1345,13 +1389,18 @@ static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, struct ptlrpc_request **request) { 
- int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, create); - EXP_MD_COUNTER_INCREMENT(exp, create); - rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, - uid, gid, cap_effective, rdev, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + + return MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, + request); } static inline int md_enqueue(struct obd_export *exp, @@ -1362,24 +1411,29 @@ static inline int md_enqueue(struct obd_export *exp, __u64 extra_lock_flags) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, enqueue); - EXP_MD_COUNTER_INCREMENT(exp, enqueue); - rc = MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, - extra_lock_flags); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_ENQUEUE); + + return MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, + extra_lock_flags); } static inline int md_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, getattr_name); - EXP_MD_COUNTER_INCREMENT(exp, getattr_name); - rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, getattr_name)(exp, op_data, request); } static inline int md_intent_lock(struct obd_export *exp, @@ -1390,36 +1444,49 @@ static inline int md_intent_lock(struct obd_export *exp, __u64 extra_lock_flags) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, intent_lock); - EXP_MD_COUNTER_INCREMENT(exp, intent_lock); - rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, cb_blocking, - extra_lock_flags); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_LOCK); + + return MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); } static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, link); - EXP_MD_COUNTER_INCREMENT(exp, link); - rc = MDP(exp->exp_obd, link)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_LINK); + + return MDP(exp->exp_obd, link)(exp, op_data, request); } static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, const char *new, - size_t newlen, struct ptlrpc_request **request) + const char *old_name, size_t oldlen, + const char *new_name, size_t newlen, + struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, rename); - EXP_MD_COUNTER_INCREMENT(exp, rename); - rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, - newlen, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_RENAME); + + return MDP(exp->exp_obd, rename)(exp, op_data, old_name, oldlen, + new_name, newlen, request); } static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, @@ -1427,11 +1494,15 @@ static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, setattr); - 
EXP_MD_COUNTER_INCREMENT(exp, setattr); - rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETATTR); + + return MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); } static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, @@ -1439,12 +1510,27 @@ static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid, { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, fsync); - EXP_MD_COUNTER_INCREMENT(exp, fsync); - rc = MDP(exp->exp_obd, fsync)(exp, fid, request); + rc = exp_check_ops(exp); + if (rc) + return rc; - RETURN(rc); + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_FSYNC); + + return MDP(exp->exp_obd, fsync)(exp, fid, request); +} + +/* FLR: resync mirrored files. */ +static inline int md_file_resync(struct obd_export *exp, + struct md_op_data *data) +{ + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, file_resync)(exp, data); } static inline int md_read_page(struct obd_export *exp, @@ -1454,23 +1540,31 @@ static inline int md_read_page(struct obd_export *exp, struct page **ppage) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, read_page); - EXP_MD_COUNTER_INCREMENT(exp, read_page); - rc = MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, - ppage); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_READ_PAGE); + + return MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, + ppage); } static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, unlink); - EXP_MD_COUNTER_INCREMENT(exp, unlink); - rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_UNLINK); + + return MDP(exp->exp_obd, unlink)(exp, op_data, request); } static inline int md_get_lustre_md(struct obd_export *exp, @@ -1479,19 +1573,25 @@ static inline int md_get_lustre_md(struct obd_export *exp, struct obd_export *md_exp, struct lustre_md *md) { - ENTRY; - EXP_CHECK_MD_OP(exp, get_lustre_md); - EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); - RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); } static inline int md_free_lustre_md(struct obd_export *exp, struct lustre_md *md) { - ENTRY; - EXP_CHECK_MD_OP(exp, free_lustre_md); - EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); - RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, free_lustre_md)(exp, md); } static inline int md_merge_attr(struct obd_export *exp, @@ -1499,67 +1599,88 @@ static inline int md_merge_attr(struct obd_export *exp, struct cl_attr *attr, ldlm_blocking_callback cb) { - ENTRY; - EXP_CHECK_MD_OP(exp, merge_attr); - EXP_MD_COUNTER_INCREMENT(exp, merge_attr); - RETURN(MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); } static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - 
const char *input, int input_size, - int output_size, int flags, __u32 suppgid, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) { - ENTRY; - EXP_CHECK_MD_OP(exp, setxattr); - EXP_MD_COUNTER_INCREMENT(exp, setxattr); - RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, valid, name, input, - input_size, output_size, flags, - suppgid, request)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_SETXATTR); + + return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, + suppgid, req); } static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - const char *input, int input_size, - int output_size, int flags, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + size_t buf_size, struct ptlrpc_request **req) { - ENTRY; - EXP_CHECK_MD_OP(exp, getxattr); - EXP_MD_COUNTER_INCREMENT(exp, getxattr); - RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, valid, name, input, - input_size, output_size, flags, - request)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_GETXATTR); + + return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, + buf_size, req); } static inline int md_set_open_replay_data(struct obd_export *exp, struct obd_client_handle *och, struct lookup_intent *it) { - ENTRY; - EXP_CHECK_MD_OP(exp, set_open_replay_data); - EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); - RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, it)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); } static inline int md_clear_open_replay_data(struct obd_export *exp, struct obd_client_handle *och) { - ENTRY; - EXP_CHECK_MD_OP(exp, clear_open_replay_data); - EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); - RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); } static inline int md_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, void *data, __u64 *bits) { - ENTRY; - EXP_CHECK_MD_OP(exp, set_lock_data); - EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); - RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); } static inline @@ -1568,14 +1689,13 @@ int md_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, enum ldlm_cancel_flags cancel_flags, void *opaque) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, cancel_unused); - EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + rc = exp_check_ops(exp); + if (rc) + return rc; - rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, - cancel_flags, opaque); - RETURN(rc); + return MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + cancel_flags, opaque); } static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, @@ -1585,43 +1705,57 @@ static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, enum ldlm_mode mode, struct lustre_handle *lockh) { - ENTRY; - EXP_CHECK_MD_OP(exp, lock_match); - EXP_MD_COUNTER_INCREMENT(exp, lock_match); 
- RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, - policy, mode, lockh)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh); } static inline int md_init_ea_size(struct obd_export *exp, __u32 ea_size, __u32 def_ea_size) { - ENTRY; - EXP_CHECK_MD_OP(exp, init_ea_size); - EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); - RETURN(MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size)); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, init_ea_size)(exp, ea_size, def_ea_size); } static inline int md_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, intent_getattr_async); - EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); - rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); - RETURN(rc); + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_INTENT_GETATTR_ASYNC); + + return MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); } static inline int md_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, struct lu_fid *fid, __u64 *bits) { - int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, revalidate_lock); - EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); - rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); - RETURN(rc); + int rc; + + rc = exp_check_ops(exp); + if (rc) + return rc; + + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_REVALIDATE_LOCK); + + return MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); } static inline int md_get_fid_from_lsm(struct obd_export *exp, @@ -1630,13 +1764,14 @@ static inline int md_get_fid_from_lsm(struct obd_export *exp, struct lu_fid *fid) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, get_fid_from_lsm); - EXP_MD_COUNTER_INCREMENT(exp, get_fid_from_lsm); - rc = MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, fid); - RETURN(rc); -} + rc = exp_check_ops(exp); + if (rc) + return rc; + + return MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, + fid); +} /* Unpack an MD struct from disk to in-memory format. * Returns +ve size of unpacked MD (0 for free), or -ve error. 
@@ -1649,31 +1784,30 @@ static inline int md_unpackmd(struct obd_export *exp, const union lmv_mds_md *lmm, size_t lmm_size) { int rc; - ENTRY; - EXP_CHECK_MD_OP(exp, unpackmd); - EXP_MD_COUNTER_INCREMENT(exp, unpackmd); - rc = MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); - RETURN(rc); -} -/* OBD Metadata Support */ + rc = exp_check_ops(exp); + if (rc) + return rc; -extern int obd_init_caches(void); -extern void obd_cleanup_caches(void); + return MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); +} -/* support routines */ -extern struct kmem_cache *obdo_cachep; +static inline int md_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + int rc; -#define OBDO_ALLOC(ptr) \ -do { \ - OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \ -} while(0) + rc = exp_check_ops(exp); + if (rc) + return rc; -#define OBDO_FREE(ptr) \ -do { \ - OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ -} while(0) + return MDP(exp->exp_obd, rmfid)(exp, fa, rcs, set); +} +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); typedef int (*register_lwp_cb)(void *data); @@ -1705,13 +1839,14 @@ int lustre_register_fs(void); int lustre_unregister_fs(void); int lustre_check_exclusion(struct super_block *sb, char *svname); -/* sysctl.c */ -extern int obd_sysctl_init(void); -extern void obd_sysctl_clean(void); - -/* uuid.c */ typedef __u8 class_uuid_t[16]; -void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); +static inline void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + snprintf(out->uuid, sizeof(out->uuid), "%02x%02x%02x%02x-%02x%02x-" + "%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uu[14], uu[15], uu[12], uu[13], uu[10], uu[11], uu[8], uu[9], + uu[6], uu[7], uu[4], uu[5], uu[2], uu[3], uu[0], uu[1]); +} /* lustre_peer.c */ int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); @@ -1720,7 +1855,7 @@ int class_del_uuid (const char *uuid); int class_check_uuid(struct obd_uuid *uuid, __u64 nid); /* class_obd.c */ -extern char obd_jobid_node[]; +extern char obd_jobid_name[]; /* prng.c */ #define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) @@ -1746,5 +1881,4 @@ extern struct miscdevice obd_psdev; int obd_ioctl_getdata(char **buf, int *len, void __user *arg); int class_procfs_init(void); int class_procfs_clean(void); - #endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/obd_support.h b/drivers/staging/lustrefsx/lustre/include/obd_support.h index c22e08fe8cdb2..356585d91932b 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_support.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_support.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,11 +33,8 @@ #ifndef _OBD_SUPPORT #define _OBD_SUPPORT -#ifndef __KERNEL__ -# error Userspace should not include obd_support.h. 
-#endif /* !__KERNEL__ */ - #include +#include #include #include #include @@ -56,6 +53,7 @@ enum { extern unsigned int obd_debug_peer_on_timeout; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_dump_on_eviction; +extern unsigned int obd_lbug_on_eviction; /* obd_timeout should only be used for recovery, not for networking / disk / timings affected by load (use Adaptive Timeouts) */ extern unsigned int obd_timeout; /* seconds */ @@ -70,7 +68,6 @@ extern int at_early_margin; extern int at_extra; extern unsigned long obd_max_dirty_pages; extern atomic_long_t obd_dirty_pages; -extern atomic_long_t obd_dirty_transit_pages; extern char obd_jobid_var[]; /* Some hash init argument constants */ @@ -182,7 +179,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_GET_ROOT_NET 0x11b #define OBD_FAIL_MDS_GET_ROOT_PACK 0x11c #define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_SUM_PACK 0x11d #define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_STATFS_SUM_NET 0x11e #define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f #define OBD_FAIL_MDS_PIN_NET 0x120 #define OBD_FAIL_MDS_UNPIN_NET 0x121 @@ -245,11 +244,16 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b #define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d #define OBD_FAIL_MDS_INTENT_DELAY 0x160 #define OBD_FAIL_MDS_XATTR_REP 0x161 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 #define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 #define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 +#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 +#define OBD_FAIL_MDS_RMFID_NET 0x166 +#define OBD_FAIL_MDS_REINT_OPEN 0x169 +#define OBD_FAIL_MDS_REINT_OPEN2 0x16a /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 @@ -265,6 +269,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 #define OBD_FAIL_MDS_GET_INFO_NET 0x186 #define OBD_FAIL_MDS_DQACQ_NET 0x187 +#define OBD_FAIL_MDS_STRIPE_CREATE 0x188 +#define OBD_FAIL_MDS_STRIPE_FID 0x189 /* OI scrub */ #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 @@ -275,6 +281,12 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 #define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 #define OBD_FAIL_OSD_OST_EA_FID_SET 0x197 +#define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 +#define OBD_FAIL_OSD_INDEX_CRASH 0x199 + +#define OBD_FAIL_OSD_TXN_START 0x19a + +#define OBD_FAIL_OSD_DUPLICATE_MAP 0x19b #define OBD_FAIL_OFD_SET_OID 0x1e0 @@ -329,6 +341,14 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_PAUSE_PUNCH 0x236 #define OBD_FAIL_OST_LADVISE_PAUSE 0x237 #define OBD_FAIL_OST_FAKE_RW 0x238 +#define OBD_FAIL_OST_LIST_ASSERT 0x239 +#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 +#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 +#define OBD_FAIL_OST_STATFS_DELAY 0x242 +#define OBD_FAIL_OST_INTEGRITY_FAULT 0x243 +#define OBD_FAIL_OST_INTEGRITY_CMP 0x244 +#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 +#define OBD_FAIL_OST_2BIG_NIOBUF 0x248 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 @@ -371,9 +391,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a #define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b +#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 @@ -399,6 +421,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSC_DELAY_SETTIME 0x412 #define 
OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 #define OBD_FAIL_OSC_DELAY_IO 0x414 +#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415 #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 @@ -427,19 +450,21 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 #define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 +#define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 #define OBD_FAIL_OBD_PING_NET 0x600 -#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +/* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ #define OBD_FAIL_OBD_LOGD_NET 0x602 /* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ #define OBD_FAIL_OBD_DQACQ 0x604 #define OBD_FAIL_OBD_LLOG_SETUP 0x605 -#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +/* OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 obsolete since 1.5 */ #define OBD_FAIL_OBD_IDX_READ_NET 0x607 #define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 #define OBD_FAIL_OBD_NO_LRU 0x609 #define OBD_FAIL_OBDCLASS_MODULE_LOAD 0x60a +#define OBD_FAIL_OBD_ZERO_NLINK_RACE 0x60b #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 @@ -462,14 +487,19 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_CLIENT_DEL 0x718 #define OBD_FAIL_TGT_SLUGGISH_NET 0x719 #define OBD_FAIL_TGT_RCVD_EIO 0x720 +#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 +#define OBD_FAIL_TGT_NO_GRANT 0x725 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 #define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 -#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 /* deprecated */ #define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 #define OBD_FAIL_MDC_CLOSE 0x806 +#define OBD_FAIL_MDC_MERGE 0x807 +#define OBD_FAIL_MDC_GLIMPSE_DDOS 0x808 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 @@ -501,6 +531,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_FLD 0x1100 #define OBD_FAIL_FLD_QUERY_NET 0x1101 #define OBD_FAIL_FLD_READ_NET 0x1102 +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 #define OBD_FAIL_SEC_CTX 0x1200 #define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 @@ -509,18 +540,25 @@ extern char obd_jobid_var[]; #define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 #define OBD_FAIL_LLOG 0x1300 -#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +/* was OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 until 2.4 */ #define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 until 2.11 */ #define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 #define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 -#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ +/* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ +/* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ #define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 #define OBD_FAIL_SEQ_ALLOC 0x1311 #define OBD_FAIL_CAT_RECORDS 0x1312 +#define OBD_FAIL_CAT_FREE_RECORDS 0x1313 +#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 +#define CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315 +#define OBD_FAIL_FORCE_GC_THREAD 0x1316 +#define OBD_FAIL_LLOG_PROCESS_TIMEOUT 0x1317 +#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 +#define OBD_FAIL_CATLIST 0x131b #define OBD_FAIL_LLITE 0x1400 #define 
OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 @@ -536,9 +574,10 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a #define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b #define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c -#define OBD_FAIL_LLITE_PTASK_IO_FAIL 0x140d #define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e #define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f +#define OBD_FAIL_LLITE_OPEN_BY_NAME 0x1410 +#define OBD_FAIL_LLITE_SHORT_COMMIT 0x1415 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 @@ -587,9 +626,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c -#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d +#define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f +#define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630 +#define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631 #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 @@ -603,14 +644,17 @@ extern char obd_jobid_var[]; #define OBD_FAIL_INVALIDATE_UPDATE 0x1705 /* MIGRATE */ -#define OBD_FAIL_MIGRATE_NET_REP 0x1800 #define OBD_FAIL_MIGRATE_ENTRIES 0x1801 -#define OBD_FAIL_MIGRATE_LINKEA 0x1802 -#define OBD_FAIL_MIGRATE_DELAY 0x1803 /* LMV */ #define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 +/* FLR */ +#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00 +#define OBD_FAIL_FLR_LV_DELAY 0x1A01 +#define OBD_FAIL_FLR_LV_INC 0x1A02 +#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03 + /* DT */ #define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000 #define OBD_FAIL_DT_ATTR_GET 0x2001 @@ -642,14 +686,19 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CHECK_INVALID_REC 0x2100 #define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 #define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 +#define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 +#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 - /* barrier */ +/* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 #define OBD_FAIL_MGS_BARRIER_NOTIFY_NET 0x2201 #define OBD_FAIL_BARRIER_DELAY 0x2202 #define OBD_FAIL_BARRIER_FAILURE 0x2203 +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + /* Assign references to moved code to reduce code changes */ #define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) #define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) @@ -731,11 +780,13 @@ static inline void obd_memory_sub(long size) #define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ do { \ - (ptr) = (cptab) == NULL ? \ - kmalloc(size, (flags) | __GFP_ZERO) : \ - cfs_cpt_malloc(cptab, cpt, size, (flags) | __GFP_ZERO); \ - if (likely((ptr) != NULL)) \ - OBD_ALLOC_POST(ptr, size, "kmalloced"); \ + if (cptab) \ + ptr = cfs_cpt_malloc((cptab), (cpt), (size), \ + (flags) | __GFP_ZERO | __GFP_NOWARN); \ + if (!(cptab) || unlikely(!(ptr))) /* retry without CPT if failure */ \ + ptr = kmalloc(size, (flags) | __GFP_ZERO); \ + if (likely((ptr) != NULL)) \ + OBD_ALLOC_POST((ptr), (size), "kmalloced"); \ } while (0) #define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ @@ -762,7 +813,7 @@ do { \ #define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \ do { \ (ptr) = cptab == NULL ? 
\ - __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO): \ + __ll_vmalloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO) : \ cfs_cpt_vzalloc(cptab, cpt, size); \ if (unlikely((ptr) == NULL)) { \ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ @@ -823,7 +874,7 @@ do { \ do { \ if (is_vmalloc_addr(ptr)) { \ OBD_FREE_PRE(ptr, size, "vfreed"); \ - libcfs_vfree_atomic(ptr); \ + libcfs_vfree_atomic(ptr); \ POISON_PTR(ptr); \ } else { \ OBD_FREE(ptr, size); \ @@ -911,4 +962,29 @@ static inline int lustre_to_lma_flags(__u32 la_flags) return (la_flags & LUSTRE_ORPHAN_FL) ? LMAI_ORPHAN : 0; } +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. + */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); +} + #endif diff --git a/drivers/staging/lustrefsx/lustre/include/obj_update.h b/drivers/staging/lustrefsx/lustre/include/obj_update.h index c381f77f0045e..8c88de86005ea 100644 --- a/drivers/staging/lustrefsx/lustre/include/obj_update.h +++ b/drivers/staging/lustrefsx/lustre/include/obj_update.h @@ -31,7 +31,7 @@ #ifndef _OBJ_UPDATE_H_ #define _OBJ_UPDATE_H_ -#include +#include static inline size_t object_update_param_size(const struct object_update_param *param) diff --git a/drivers/staging/lustrefsx/lustre/include/seq_range.h b/drivers/staging/lustrefsx/lustre/include/seq_range.h index 616ee3a78e68b..374d1932f0bdf 100644 --- a/drivers/staging/lustrefsx/lustre/include/seq_range.h +++ b/drivers/staging/lustrefsx/lustre/include/seq_range.h @@ -34,7 +34,7 @@ #ifndef _SEQ_RANGE_H_ #define _SEQ_RANGE_H_ -#include +#include /** * computes the sequence range type \a range diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h new file mode 100644 index 0000000000000..38084241d8998 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_barrier_user.h @@ -0,0 +1,74 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
+ + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + * + * lustre/include/lustre/lustre_barrier_user.h + * + * Lustre write barrier (on MDT) userspace interfaces. + * + * Author: Fan, Yong + */ +#ifndef _LUSTRE_BARRIER_USER_H +# define _LUSTRE_BARRIER_USER_H + +#include +#include + +#define BARRIER_VERSION_V1 1 +#define BARRIER_TIMEOUT_DEFAULT 30 + +enum barrier_commands { + BC_FREEZE = 1, + BC_THAW = 2, + BC_STAT = 3, + BC_RESCAN = 4, +}; + +enum barrier_status { + BS_INIT = 0, + BS_FREEZING_P1 = 1, + BS_FREEZING_P2 = 2, + BS_FROZEN = 3, + BS_THAWING = 4, + BS_THAWED = 5, + BS_FAILED = 6, + BS_EXPIRED = 7, + BS_RESCAN = 8, +}; + +struct barrier_ctl { + __u32 bc_version; + __u32 bc_cmd; + union { + __s32 bc_timeout; + __u32 bc_total; + }; + union { + __u32 bc_status; + __u32 bc_absence; + }; + char bc_name[12]; + __u32 bc_padding; +}; + +#endif /* _LUSTRE_BARRIER_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h similarity index 77% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h index b1f68d50b0242..30d5c7d614892 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_cfg.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_cfg.h @@ -33,14 +33,10 @@ #ifndef _UAPI_LUSTRE_CFG_H #define _UAPI_LUSTRE_CFG_H +#include #include -#include - -/* Handle older distros */ -#ifndef __ALIGN_KERNEL -# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) -#endif +#include +#include /** \defgroup cfg cfg * @@ -139,6 +135,8 @@ enum lcfg_command_type { * users */ LCFG_NODEMAP_MAP_MODE = 0x00ce059, /**< set the mapping mode */ + LCFG_NODEMAP_AUDIT_MODE = 0x00ce05a, /**< set the audit mode */ + LCFG_NODEMAP_SET_SEPOL = 0x00ce05b, /**< set SELinux policy */ }; struct lustre_cfg_bufs { @@ -160,6 +158,57 @@ struct lustre_cfg { __u32 lcfg_buflens[0]; }; +struct lcfg_type_data { + __u32 ltd_type; + char *ltd_name; + char *ltd_bufs[4]; +}; + +static struct lcfg_type_data lcfg_data_table[] = { + { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, + { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, + { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, + { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, + { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, + { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, + { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, + { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } }, + { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, + { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, + { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, + { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, + { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, + { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, + { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, + { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, + { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, + { LCFG_LOG_END, "log_end", { 
"1", "2", "3", "4" } }, + { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, + { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, + { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, + { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, + { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, + { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, + { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", + { "parameter", "2", "3", "4" } }, + { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, + { 0, NULL, { NULL, NULL, NULL, NULL } } +}; + +static inline struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) +{ + int i = 0; + + while (lcfg_data_table[i].ltd_type != 0) { + if (lcfg_data_table[i].ltd_type == cmd) + return &lcfg_data_table[i]; + i++; + } + return NULL; +} + enum cfg_record_type { PORTALS_CFG_TYPE = 1, LUSTRE_CFG_TYPE = 123, @@ -201,7 +250,7 @@ static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) { __u32 i; - size_t offset; + __kernel_size_t offset; __u32 bufcount; if (!lcfg) @@ -261,7 +310,7 @@ static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, } } -static inline int lustre_cfg_sanity_check(void *buf, size_t len) +static inline int lustre_cfg_sanity_check(void *buf, __kernel_size_t len) { struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h similarity index 85% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h index 8887c82d3b8b9..e9cbf3066738a 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_disk.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_disk.h @@ -29,8 +29,6 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * uapi/linux/lustre_disk.h - * * Lustre disk format definitions. * * Author: Nathan Rutman @@ -62,11 +60,16 @@ #define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" #define QMT_DIR "quota_master" #define QSD_DIR "quota_slave" +#define QSD_DIR_DT "quota_slave_dt" +#define QSD_DIR_MD "quota_slave_md" #define HSM_ACTIONS "hsm_actions" #define LFSCK_DIR "LFSCK" #define LFSCK_BOOKMARK "lfsck_bookmark" #define LFSCK_LAYOUT "lfsck_layout" #define LFSCK_NAMESPACE "lfsck_namespace" +#define REMOTE_PARENT_DIR "REMOTE_PARENT_DIR" +#define INDEX_BACKUP_DIR "index_backup" +#define MDT_ORPHAN_DIR "PENDING" /****************** persistent mount data *********************/ @@ -88,7 +91,7 @@ /** regenerate config logs for this fs or server */ #define LDD_F_WRITECONF 0x0100 /** COMPAT_14 */ -#define LDD_F_UPGRADE14 0x0200 +/*#define LDD_F_UPGRADE14 0x0200 deprecated since 1.8 */ /** process as lctl conf_param */ #define LDD_F_PARAM 0x0400 /** all nodes are specified as service nodes */ @@ -114,36 +117,9 @@ enum ldd_mount_type { LDD_MT_LAST }; -/* On-disk configuration file. In host-endian order. 
*/ -struct lustre_disk_data { - __u32 ldd_magic; - __u32 ldd_feature_compat; /* compatible feature flags */ - __u32 ldd_feature_rocompat; /* read-only compatible feature flags */ - __u32 ldd_feature_incompat; /* incompatible feature flags */ - - __u32 ldd_config_ver; /* config rewrite count - not used */ - __u32 ldd_flags; /* LDD_SV_TYPE */ - __u32 ldd_svindex; /* server index (0001), must match - * svname - */ - __u32 ldd_mount_type; /* target fs type LDD_MT_* */ - char ldd_fsname[64]; /* filesystem this server is part of, - * MTI_NAME_MAXLEN - */ - char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ - __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ - - char ldd_userdata[1024 - 200]; /* arbitrary user string '200' */ - __u8 ldd_padding[4096 - 1024]; /* 1024 */ - char ldd_mount_opts[4096]; /* target fs mount opts '4096' */ - char ldd_params[4096]; /* key=value pairs '8192' */ -}; - /****************** last_rcvd file *********************/ #define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ -#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ - #define LR_SERVER_SIZE 512 #define LR_CLIENT_START 8192 #define LR_CLIENT_SIZE 128 diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h similarity index 97% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h index 3e58dd5329c3f..f11ad3b3b2115 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_fid.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Copyright 2016 Cray Inc, all rights reserved. * Author: Ben Evans. @@ -37,7 +37,8 @@ #ifndef _UAPI_LUSTRE_FID_H_ #define _UAPI_LUSTRE_FID_H_ -#include +#include +#include /** returns fid object sequence */ static inline __u64 fid_seq(const struct lu_fid *fid) @@ -277,7 +278,7 @@ static inline bool fid_is_last_id(const struct lu_fid *fid) * \param fid an igif to get inode number from. * \return inode number for the igif. */ -static inline ino_t lu_igif_ino(const struct lu_fid *fid) +static inline __kernel_ino_t lu_igif_ino(const struct lu_fid *fid) { return fid_seq(fid); } diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h new file mode 100644 index 0000000000000..8cdb05dedbd8c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_fiemap.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2014, 2015, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + +#include +#include +#include + +/* XXX: We use fiemap_extent::fe_reserved[0] */ +#define fe_device fe_reserved[0] + +static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count) +{ + return sizeof(struct fiemap) + extent_count * + sizeof(struct fiemap_extent); +} + +static inline unsigned int fiemap_size_to_count(__kernel_size_t array_size) +{ + return (array_size - sizeof(struct fiemap)) / + sizeof(struct fiemap_extent); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h similarity index 81% rename from drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h index 9840237e4e046..33462dd2d5e01 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustre_idl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -23,14 +23,12 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * lustre/include/lustre/lustre_idl.h - * * Lustre wire protocol definitions. */ @@ -42,7 +40,7 @@ * that are used in interfaces with userspace should go in lustre_user.h. * * All structs being declared here should be built from simple fixed-size - * types (__u8, __u16, __u32, __u64) or be built from other types or + * types defined in linux/types.h or be built from other types or * structs also declared in this file. Similarly, all flags and magic * values in those structs should also be declared here. This ensures * that the Lustre wire protocol is not influenced by external dependencies. @@ -70,11 +68,24 @@ #define _LUSTRE_IDL_H_ #include +#include +#include #include +/* + * This is due to us being out of kernel and the way the OpenSFS branch + * handles CFLAGS. + */ +#ifdef __KERNEL__ +# include +#else +# include +#endif +#include +#include -#include -#include /* Defn's shared with user-space. 
*/ -#include +#if defined(__cplusplus) +extern "C" { +#endif /* * GENERAL STUFF @@ -86,25 +97,25 @@ #define CONNMGR_REQUEST_PORTAL 1 #define CONNMGR_REPLY_PORTAL 2 -//#define OSC_REQUEST_PORTAL 3 +/* #define OSC_REQUEST_PORTAL 3*/ #define OSC_REPLY_PORTAL 4 -//#define OSC_BULK_PORTAL 5 +/*#define OSC_BULK_PORTAL 5*/ #define OST_IO_PORTAL 6 #define OST_CREATE_PORTAL 7 #define OST_BULK_PORTAL 8 -//#define MDC_REQUEST_PORTAL 9 +/*#define MDC_REQUEST_PORTAL 9*/ #define MDC_REPLY_PORTAL 10 -//#define MDC_BULK_PORTAL 11 +/*#define MDC_BULK_PORTAL 11*/ #define MDS_REQUEST_PORTAL 12 -//#define MDS_REPLY_PORTAL 13 +#define MDS_IO_PORTAL 13 #define MDS_BULK_PORTAL 14 #define LDLM_CB_REQUEST_PORTAL 15 #define LDLM_CB_REPLY_PORTAL 16 #define LDLM_CANCEL_REQUEST_PORTAL 17 #define LDLM_CANCEL_REPLY_PORTAL 18 -//#define PTLBD_REQUEST_PORTAL 19 -//#define PTLBD_REPLY_PORTAL 20 -//#define PTLBD_BULK_PORTAL 21 +/*#define PTLBD_REQUEST_PORTAL 19*/ +/*#define PTLBD_REPLY_PORTAL 20*/ +/*#define PTLBD_BULK_PORTAL 21*/ #define MDS_SETATTR_PORTAL 22 #define MDS_READPAGE_PORTAL 23 #define OUT_PORTAL 24 @@ -117,28 +128,8 @@ #define SEQ_DATA_PORTAL 31 #define SEQ_CONTROLLER_PORTAL 32 #define MGS_BULK_PORTAL 33 - -/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ - -/* packet types */ -#define PTL_RPC_MSG_REQUEST 4711 -#define PTL_RPC_MSG_ERR 4712 -#define PTL_RPC_MSG_REPLY 4713 - -/* DON'T use swabbed values of MAGIC as magic! */ -#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 -#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B - -#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 - -#define PTLRPC_MSG_VERSION 0x00000003 -#define LUSTRE_VERSION_MASK 0xffff0000 -#define LUSTRE_OBD_VERSION 0x00010000 -#define LUSTRE_MDS_VERSION 0x00020000 -#define LUSTRE_OST_VERSION 0x00030000 -#define LUSTRE_DLM_VERSION 0x00040000 -#define LUSTRE_LOG_VERSION 0x00050000 -#define LUSTRE_MGS_VERSION 0x00060000 +/* #define DVS_PORTAL 63 */ +/* reserved for Cray DVS - spitzcor@cray.com, roe@cray.com, n8851@cray.com */ /** * Describes a range of sequence, lsr_start is included but lsr_end is @@ -178,12 +169,14 @@ extern void lustre_loa_init(struct lustre_ost_attrs *loa, const struct lu_fid *fid, __u32 compat, __u32 incompat); -/* copytool uses a 32b bitmask field to encode archive-Ids during register - * with MDT thru kuc. +/* copytool can use any nonnegative integer to represent archive-Ids during + * register with MDT thru kuc. * archive num = 0 => all - * archive num from 1 to 32 + * archive num from 1 to MAX_U32 */ -#define LL_HSM_MAX_ARCHIVE (sizeof(__u32) * 8) +#define LL_HSM_ORIGIN_MAX_ARCHIVE (sizeof(__u32) * 8) +/* the max count of archive ids that one agent can support */ +#define LL_HSM_MAX_ARCHIVES_PER_AGENT 1024 /** * HSM on-disk attributes stored in a separate xattr. 
@@ -389,6 +382,23 @@ struct lu_orphan_ent_v2 { struct lu_orphan_rec_v2 loe_rec; }; +struct lu_orphan_rec_v3 { + struct lu_orphan_rec lor_rec; + struct ost_layout lor_layout; + /* The OST-object declared layout version in PFID EA.*/ + __u32 lor_layout_version; + /* The OST-object declared layout range (of version) in PFID EA.*/ + __u32 lor_range; + __u32 lor_padding_1; + __u64 lor_padding_2; +}; + +struct lu_orphan_ent_v3 { + /* The orphan OST-object's FID */ + struct lu_fid loe_key; + struct lu_orphan_rec_v3 loe_rec; +}; + /** @} lu_fid */ /** \defgroup lu_dir lu_dir @@ -514,18 +524,21 @@ static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) return next; } -static inline size_t lu_dirent_calc_size(size_t namelen, __u16 attr) +static inline __kernel_size_t lu_dirent_calc_size(size_t namelen, __u16 attr) { - size_t size; + __kernel_size_t size; if (attr & LUDA_TYPE) { - const size_t align = sizeof(struct luda_type) - 1; - size = (sizeof(struct lu_dirent) + namelen + align) & ~align; - size += sizeof(struct luda_type); - } else - size = sizeof(struct lu_dirent) + namelen; + const __kernel_size_t align = sizeof(struct luda_type) - 1; + + size = (sizeof(struct lu_dirent) + namelen + 1 + align) & + ~align; + size += sizeof(struct luda_type); + } else { + size = sizeof(struct lu_dirent) + namelen + 1; + } - return (size + 7) & ~7; + return (size + 7) & ~7; } #define MDS_DIR_END_OFF 0xfffffffffffffffeULL @@ -569,59 +582,109 @@ static inline void lustre_handle_copy(struct lustre_handle *tgt, tgt->cookie = src->cookie; } -struct lustre_handle_array { - unsigned int count; - struct lustre_handle handles[0]; +/* lustre_msg struct magic. DON'T use swabbed values of MAGIC as magic! */ +enum lustre_msg_magic { + LUSTRE_MSG_MAGIC_V2 = 0x0BD00BD3, + LUSTRE_MSG_MAGIC_V2_SWABBED = 0xD30BD00B, + LUSTRE_MSG_MAGIC = LUSTRE_MSG_MAGIC_V2 }; /* flags for lm_flags */ -#define MSGHDR_AT_SUPPORT 0x1 -#define MSGHDR_CKSUM_INCOMPAT18 0x2 +enum lustre_msghdr { + MSGHDR_AT_SUPPORT = 0x1, /* adaptive timeouts, lm_cksum valid + * in early reply messages */ + MSGHDR_CKSUM_INCOMPAT18 = 0x2, /* compat for 1.8, needs to be set well + * beyond 2.8.0 for compatibility */ +}; #define lustre_msg lustre_msg_v2 /* we depend on this structure to be 8-byte aligned */ /* this type is only endian-adjusted in lustre_unpack_msg() */ struct lustre_msg_v2 { - __u32 lm_bufcount; - __u32 lm_secflvr; - __u32 lm_magic; - __u32 lm_repsize; - __u32 lm_cksum; - __u32 lm_flags; - __u32 lm_padding_2; - __u32 lm_padding_3; - __u32 lm_buflens[0]; -}; - -/* without gss, ptlrpc_body is put at the first buffer. */ + __u32 lm_bufcount; /* number of buffers in lm_buflens[] */ + __u32 lm_secflvr; /* 0 = no crypto, or sptlrpc security flavour */ + __u32 lm_magic; /* RPC version magic = LUSTRE_MSG_MAGIC_V2 */ + __u32 lm_repsize; /* size of preallocated reply buffer */ + __u32 lm_cksum; /* CRC32 of ptlrpc_body early reply messages */ + __u32 lm_flags; /* enum lustre_msghdr MSGHDR_* flags */ + __u32 lm_padding_2; /* unused */ + __u32 lm_padding_3; /* unused */ + __u32 lm_buflens[0]; /* length of additional buffers in bytes, + * padded to a multiple of 8 bytes. */ + /* + * message buffers are packed after padded lm_buflens[] array, + * padded to a multiple of 8 bytes each to align contents. 
+ */ +}; + +/* ptlrpc_body packet pb_types */ +#define PTL_RPC_MSG_REQUEST 4711 /* normal RPC request message */ +#define PTL_RPC_MSG_ERR 4712 /* error reply if request unprocessed */ +#define PTL_RPC_MSG_REPLY 4713 /* normal RPC reply message */ + +/* ptlrpc_body pb_version ((target_version << 16) | rpc_version) */ +enum lustre_msg_version { + PTLRPC_MSG_VERSION = 0x00000003, + LUSTRE_VERSION_MASK = 0xffff0000, + LUSTRE_OBD_VERSION = 0x00010000, + LUSTRE_MDS_VERSION = 0x00020000, + LUSTRE_OST_VERSION = 0x00030000, + LUSTRE_DLM_VERSION = 0x00040000, + LUSTRE_LOG_VERSION = 0x00050000, + LUSTRE_MGS_VERSION = 0x00060000, +}; + +/* pb_flags that apply to all request messages */ +/* #define MSG_LAST_REPLAY 0x0001 obsolete 2.0 => {REQ,LOCK}_REPLAY_DONE */ +#define MSG_RESENT 0x0002 /* was previously sent, no reply seen */ +#define MSG_REPLAY 0x0004 /* was processed, got reply, recovery */ +/* #define MSG_AT_SUPPORT 0x0008 obsolete since 1.5, AT always enabled */ +/* #define MSG_DELAY_REPLAY 0x0010 obsolete since 2.0 */ +/* #define MSG_VERSION_REPLAY 0x0020 obsolete since 1.8.2, VBR always on */ +#define MSG_REQ_REPLAY_DONE 0x0040 /* request replay over, locks next */ +#define MSG_LOCK_REPLAY_DONE 0x0080 /* lock replay over, client done */ + +/* pb_op_flags for connect opcodes: MDS_CONNECT, OST_CONNECT, MGS_CONNECT */ +#define MSG_CONNECT_RECOVERING 0x00000001 /* target is in recovery */ +#define MSG_CONNECT_RECONNECT 0x00000002 /* tgt already has client import */ +#define MSG_CONNECT_REPLAYABLE 0x00000004 /* target supports RPC replay */ +/* #define MSG_CONNECT_PEER 0x00000008 obsolete since 1.2, removed in 1.5 */ +#define MSG_CONNECT_LIBCLIENT 0x00000010 /* obsolete since 2.3, removed 2.6 */ +#define MSG_CONNECT_INITIAL 0x00000020 /* first client connection attempt */ +/* #define MSG_CONNECT_ASYNC 0x00000040 obsolete since 1.5 */ +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* client sent transno in replay */ + +/* number of previous object versions in pb_pre_versions[] */ #define PTLRPC_NUM_VERSIONS 4 +/* without gss, ptlrpc_body is put at the first buffer. */ struct ptlrpc_body_v3 { struct lustre_handle pb_handle; - __u32 pb_type; - __u32 pb_version; - __u32 pb_opc; - __u32 pb_status; - __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ - __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ + __u32 pb_type; /* request/reply/err type: PTL_RPC_MSG_* */ + __u32 pb_version; /* LUSTRE_*_VERSION | PTLRPC_MSG_VERSION */ + __u32 pb_opc; /* RPC opcodes: MDS_*, OST_*, LDLM_, ... 
*/ + __u32 pb_status; /* negative Linux x86 error number */ + __u64 pb_last_xid; /* highest replied XID w/o lower unreplied XID*/ + __u16 pb_tag; /* multiple modifying RPCs virtual slot index */ __u16 pb_padding0; __u32 pb_padding1; - __u64 pb_last_committed; - __u64 pb_transno; - __u32 pb_flags; - __u32 pb_op_flags; - __u32 pb_conn_cnt; - __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ - __u32 pb_service_time; /* for rep, actual service time */ - __u32 pb_limit; - __u64 pb_slv; - /* VBR: pre-versions */ + __u64 pb_last_committed;/* rep: highest pb_transno committed to disk */ + __u64 pb_transno; /* server-assigned transno for modifying RPCs */ + __u32 pb_flags; /* req: MSG_* flags */ + __u32 pb_op_flags; /* req: MSG_CONNECT_* flags */ + __u32 pb_conn_cnt; /* connect instance of this client on server */ + __u32 pb_timeout; /* req: max wait time; rep: service estimate */ + __u32 pb_service_time; /* rep: server arrival to reply in seconds */ + __u32 pb_limit; /* rep: dynamic DLM LRU lock count limit */ + __u64 pb_slv; /* rep: dynamic DLM LRU server lock volume */ + /* VBR: rep: previous pb_version(s) of objects modified by this RPC */ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; __u64 pb_mbits; /**< match bits for bulk request */ - /* padding for future needs */ + /* padding for future needs - fix lustre_swab_ptlrpc_body() also */ __u64 pb_padding64_0; __u64 pb_padding64_1; __u64 pb_padding64_2; - char pb_jobid[LUSTRE_JOBID_SIZE]; + char pb_jobid[LUSTRE_JOBID_SIZE]; /* req: ASCII jobid from env + NUL */ }; #define ptlrpc_body ptlrpc_body_v3 @@ -677,38 +740,6 @@ struct ptlrpc_body_v2 { /** only use in req->rq_{req,rep}_swab_mask */ #define MSG_PTLRPC_HEADER_OFF 31 -/* Flags that are operation-specific go in the top 16 bits. */ -#define MSG_OP_FLAG_MASK 0xffff0000 -#define MSG_OP_FLAG_SHIFT 16 - -/* Flags that apply to all requests are in the bottom 16 bits */ -#define MSG_GEN_FLAG_MASK 0x0000ffff -#define MSG_LAST_REPLAY 0x0001 -#define MSG_RESENT 0x0002 -#define MSG_REPLAY 0x0004 -/* #define MSG_AT_SUPPORT 0x0008 - * This was used in early prototypes of adaptive timeouts, and while there - * shouldn't be any users of that code there also isn't a need for using this - * bits. Defer usage until at least 1.10 to avoid potential conflict. 
*/ -#define MSG_DELAY_REPLAY 0x0010 -#define MSG_VERSION_REPLAY 0x0020 -#define MSG_REQ_REPLAY_DONE 0x0040 -#define MSG_LOCK_REPLAY_DONE 0x0080 - -/* - * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) - */ - -#define MSG_CONNECT_RECOVERING 0x00000001 -#define MSG_CONNECT_RECONNECT 0x00000002 -#define MSG_CONNECT_REPLAYABLE 0x00000004 -//#define MSG_CONNECT_PEER 0x8 -#define MSG_CONNECT_LIBCLIENT 0x00000010 -#define MSG_CONNECT_INITIAL 0x00000020 -#define MSG_CONNECT_ASYNC 0x00000040 -#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ -#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ - /* Connect flags */ #define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ #define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ @@ -783,14 +814,30 @@ struct ptlrpc_body_v2 { RPCs in parallel */ #define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL /* striped DNE dir */ #define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ -#define OBD_CONNECT_LOCK_AHEAD 0x1000000000000000ULL /* lock ahead */ +#define OBD_CONNECT_LOCKAHEAD_OLD 0x1000000000000000ULL /* Old Cray lockahead */ + /** bulk matchbits is sent within ptlrpc_body */ #define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL #define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ #define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ /* ocd_connect_flags2 flags */ -#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ - +#define OBD_CONNECT2_FILE_SECCTX 0x1ULL /* set file security context at create */ +#define OBD_CONNECT2_LOCKAHEAD 0x2ULL /* ladvise lockahead v2 */ +#define OBD_CONNECT2_DIR_MIGRATE 0x4ULL /* migrate striped dir */ +#define OBD_CONNECT2_SUM_STATFS 0x8ULL /* MDT return aggregated stats */ +#define OBD_CONNECT2_FLR 0x20ULL /* FLR support */ +#define OBD_CONNECT2_WBC_INTENTS 0x40ULL /* create/unlink/... intents for wbc, also operations under client-held parent locks */ +#define OBD_CONNECT2_LOCK_CONVERT 0x80ULL /* IBITS lock convert support */ +#define OBD_CONNECT2_ARCHIVE_ID_ARRAY 0x100ULL /* store HSM archive_id in array */ +#define OBD_CONNECT2_SELINUX_POLICY 0x400ULL /* has client SELinux policy */ +#define OBD_CONNECT2_LSOM 0x800ULL /* LSOM support */ +#define OBD_CONNECT2_ASYNC_DISCARD 0x4000ULL /* support async DoM data discard */ +#define OBD_CONNECT2_ENCRYPT 0x8000ULL /* client-to-disk encrypt */ +#define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ +#define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ +/* risk of forwards incompatibility with upstream - use high order bits to mitigate */ +#define OBD_CONNECT2_MDLL 0x1000000000000000ULL /* enable metadata lazy load */ +#define OBD_CONNECT2_MDLL_AUTO_REFRESH 0x2000000000000000ULL /* enable metadata lazy load auto-refresh */ /* XXX README XXX: * Please DO NOT add flag values here before first ensuring that this same * flag value is not in use on some other branch. 
Please clear any such @@ -832,13 +879,25 @@ struct ptlrpc_body_v2 { OBD_CONNECT_FLOCK_DEAD | \ OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \ OBD_CONNECT_OPEN_BY_FID | \ - OBD_CONNECT_DIR_STRIPE | \ - OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \ + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \ OBD_CONNECT_MULTIMODRPCS | \ OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \ - OBD_CONNECT_FLAGS2) - -#define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) + +#define MDT_CONNECT_SUPPORTED2 (OBD_CONNECT2_FILE_SECCTX | OBD_CONNECT2_FLR | \ + OBD_CONNECT2_SUM_STATFS | \ + OBD_CONNECT2_LOCK_CONVERT | \ + OBD_CONNECT2_DIR_MIGRATE | \ + OBD_CONNECT2_ARCHIVE_ID_ARRAY | \ + OBD_CONNECT2_SELINUX_POLICY | \ + OBD_CONNECT2_LSOM | \ + OBD_CONNECT2_ASYNC_DISCARD | \ + OBD_CONNECT2_GETATTR_PFID | \ + OBD_CONNECT2_MDLL | \ + OBD_CONNECT2_MDLL_AUTO_REFRESH) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ @@ -856,10 +915,12 @@ struct ptlrpc_body_v2 { OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ OBD_CONNECT_BULK_MBITS | \ - OBD_CONNECT_GRANT_PARAM) -#define OST_CONNECT_SUPPORTED2 0 + OBD_CONNECT_GRANT_PARAM | \ + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2) -#define ECHO_CONNECT_SUPPORTED 0 +#define OST_CONNECT_SUPPORTED2 OBD_CONNECT2_LOCKAHEAD + +#define ECHO_CONNECT_SUPPORTED (OBD_CONNECT_FID) #define ECHO_CONNECT_SUPPORTED2 0 #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ @@ -871,6 +932,7 @@ struct ptlrpc_body_v2 { /* Features required for this version of the client to work with server */ #define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_FID | \ + OBD_CONNECT_ATTRFID | \ OBD_CONNECT_FULL20) /* This structure is used for both request and reply. @@ -927,21 +989,43 @@ struct obd_connect_data { /* * Supported checksum algorithms. Up to 32 checksum types are supported. * (32-bit mask stored in obd_connect_data::ocd_cksum_types) - * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new - * algorithm and also the OBD_FL_CKSUM* flags. + * Please update DECLARE_CKSUM_NAME in obd_cksum.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags, OBD_CKSUM_ALL flag, + * OBD_FL_CKSUM_ALL flag and potentially OBD_CKSUM_T10_ALL flag. + */ +enum cksum_types { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C = 0x00000004, + OBD_CKSUM_RESERVED = 0x00000008, + OBD_CKSUM_T10IP512 = 0x00000010, + OBD_CKSUM_T10IP4K = 0x00000020, + OBD_CKSUM_T10CRC512 = 0x00000040, + OBD_CKSUM_T10CRC4K = 0x00000080, +}; + +#define OBD_CKSUM_T10_ALL (OBD_CKSUM_T10IP512 | OBD_CKSUM_T10IP4K | \ + OBD_CKSUM_T10CRC512 | OBD_CKSUM_T10CRC4K) + +#define OBD_CKSUM_ALL (OBD_CKSUM_CRC32 | OBD_CKSUM_ADLER | OBD_CKSUM_CRC32C | \ + OBD_CKSUM_T10_ALL) + +/* + * The default checksum algorithm used on top of T10PI GRD tags for RPC. + * Considering that the checksum-of-checksums is only computing CRC32 on a + * 4KB chunk of GRD tags for a 1MB RPC for 512B sectors, or 16KB of GRD + * tags for 16MB of 4KB sectors, this is only 1/256 or 1/1024 of the + * total data being checksummed, so the checksum type used here should not + * affect overall system performance noticeably. 
*/ -typedef enum cksum_types { - OBD_CKSUM_CRC32 = 0x00000001, - OBD_CKSUM_ADLER = 0x00000002, - OBD_CKSUM_CRC32C= 0x00000004, -} cksum_type_t; +#define OBD_CKSUM_T10_TOP OBD_CKSUM_ADLER /* * OST requests: OBDO & OBD request records */ /* opcodes */ -typedef enum { +enum ost_cmd { OST_REPLY = 0, /* reply ? */ OST_GETATTR = 1, OST_SETATTR = 2, @@ -962,8 +1046,10 @@ typedef enum { OST_QUOTACTL = 19, OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ OST_LADVISE = 21, - OST_LAST_OPC /* must be < 33 to avoid MDS_GETATTR */ -} ost_cmd_t; + OST_LAST_OPC, /* must be < 33 to avoid MDS_GETATTR */ + OST_FALLOCATE = 22, + OST_SEEK = 23, +}; #define OST_FIRST_OPC OST_REPLY enum obdo_flags { @@ -980,13 +1066,16 @@ enum obdo_flags { OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ - OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ - OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ - OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ - OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ - OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ - OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ - OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_T10IP512 = 0x00005000, /* T10PI IP cksum, 512B sector */ + OBD_FL_CKSUM_T10IP4K = 0x00006000, /* T10PI IP cksum, 4KB sector */ + OBD_FL_CKSUM_T10CRC512 = 0x00007000, /* T10PI CRC cksum, 512B sector */ + OBD_FL_CKSUM_T10CRC4K = 0x00008000, /* T10PI CRC cksum, 4KB sector */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. * XXX: obsoleted - reserved for old * clients prior than 2.2 */ OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ @@ -995,10 +1084,18 @@ enum obdo_flags { OBD_FL_SHORT_IO = 0x00400000, /* short io request */ /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ - /* Note that while these checksum values are currently separate bits, - * in 2.x we can actually allow all values from 1-31 if we wanted. */ + /* + * Note that while the original checksum values were separate bits, + * in 2.x we can actually allow all values from 1-31. T10-PI checksum + * types already use values which are not separate bits. + */ OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | - OBD_FL_CKSUM_CRC32C, + OBD_FL_CKSUM_CRC32C | OBD_FL_CKSUM_T10IP512 | + OBD_FL_CKSUM_T10IP4K | OBD_FL_CKSUM_T10CRC512 | + OBD_FL_CKSUM_T10CRC4K, + + OBD_FL_NO_QUOTA_ALL = OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA | + OBD_FL_NO_PRJQUOTA, }; /* @@ -1035,10 +1132,10 @@ enum obdo_flags { * those *_DEF magics are only used on server side internally, they * won't be put on wire or disk. 
*/ -#define LOV_MAGIC_DEF 0x10000000 -#define LOV_MAGIC_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V1) -#define LOV_MAGIC_V3_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V3) -#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) +#define LOV_MAGIC_DEFINED 0x10000000 +#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1) +#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3) +#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) #define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) #define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) @@ -1081,6 +1178,7 @@ struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ #define XATTR_TRUSTED_PREFIX "trusted." #define XATTR_SECURITY_PREFIX "security." +#define XATTR_NAME_SOM "trusted.som" #define XATTR_NAME_LOV "trusted.lov" #define XATTR_NAME_LMA "trusted.lma" #define XATTR_NAME_LMV "trusted.lmv" @@ -1122,7 +1220,7 @@ static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) } static inline __u32 -lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) +lov_mds_md_max_stripe_count(__kernel_size_t buf_size, __u32 lmm_magic) { switch (lmm_magic) { case LOV_MAGIC_V1: { @@ -1158,20 +1256,21 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLUID (0x00000200ULL) /* user ID */ #define OBD_MD_FLGID (0x00000400ULL) /* group ID */ #define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */ #define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ -#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ -/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */ +#define OBD_MD_FLPARENT (0x00004000ULL) /* parent FID */ +#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* OST object layout version */ #define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ #define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ #define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ #define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ #define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ -#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ -/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ +/* OBD_MD_FLQOS (0x00200000ULL) has never been used */ +/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ #define OBD_MD_FLPRJQUOTA (0x00400000ULL) /* over quota flags sent from ost */ #define OBD_MD_FLGROUP (0x01000000ULL) /* group */ #define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ -#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ +/* OBD_MD_FLEPOCH (0x04000000ULL) obsolete 2.7.50 */ /* ->mds if epoch opens or closes */ #define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ #define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ @@ -1180,7 +1279,7 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ #define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ -#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +/* OBD_MD_REINT (0x0000000200000000ULL) obsolete 1.8 */ #define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ #define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ @@ -1188,10 +1287,10 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list 
*/ #define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ #define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ -/* OBD_MD_FLRMTPERM (0x0000010000000000ULL) remote perm, obsolete */ -#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ -#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ -#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLAGSTATFS (0x0000010000000000ULL) /* aggregated statfs */ +/* OBD_MD_FLMDSCAPA (0x0000020000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLOSSCAPA (0x0000040000000000ULL) obsolete 2.7.54 */ +/* OBD_MD_FLCKSPLIT (0x0000080000000000ULL) obsolete 2.3.58*/ #define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ #define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes * under lock; for xattr @@ -1206,6 +1305,10 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ #define OBD_MD_FLOSTLAYOUT (0x0080000000000000ULL) /* contain ost_layout */ #define OBD_MD_FLPROJID (0x0100000000000000ULL) /* project ID */ +#define OBD_MD_SECCTX (0x0200000000000000ULL) /* embed security xattr */ + +#define OBD_MD_FLLAZYSIZE (0x0400000000000000ULL) /* Lazy size */ +#define OBD_MD_FLLAZYBLOCKS (0x0800000000000000ULL) /* Lazy blocks */ #define OBD_MD_FLALLQUOTA (OBD_MD_FLUSRQUOTA | \ OBD_MD_FLGRPQUOTA | \ @@ -1215,7 +1318,7 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ - OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ + OBD_MD_FLPARENT | OBD_MD_FLRDEV | OBD_MD_FLGROUP | \ OBD_MD_FLPROJID) #define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) @@ -1241,6 +1344,9 @@ struct hsm_state_set { #define OBD_BRW_READ 0x01 #define OBD_BRW_WRITE 0x02 #define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for + * this page. Non-delay RPCs have bit + * rq_no_delay set. */ #define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous * transfer and is not accounted in * the grant. 
*/ @@ -1495,11 +1601,11 @@ struct lquota_lvb { #define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ /* op codes */ -typedef enum { +enum quota_cmd { QUOTA_DQACQ = 601, QUOTA_DQREL = 602, QUOTA_LAST_OPC -} quota_cmd_t; +}; #define QUOTA_FIRST_OPC QUOTA_DQACQ /* @@ -1507,7 +1613,7 @@ typedef enum { */ /* opcodes */ -typedef enum { +enum mds_cmd { MDS_GETATTR = 33, MDS_GETATTR_NAME = 34, MDS_CLOSE = 35, @@ -1537,17 +1643,18 @@ typedef enum { MDS_HSM_CT_REGISTER = 59, MDS_HSM_CT_UNREGISTER = 60, MDS_SWAP_LAYOUTS = 61, + MDS_RMFID = 62, MDS_LAST_OPC -} mds_cmd_t; +}; #define MDS_FIRST_OPC MDS_GETATTR /* opcodes for object update */ -typedef enum { +enum update_cmd { OUT_UPDATE = 1000, OUT_UPDATE_LAST_OPC -} update_cmd_t; +}; #define OUT_UPDATE_FIRST_OPC OUT_UPDATE @@ -1555,7 +1662,7 @@ typedef enum { * Do not exceed 63 */ -typedef enum { +enum mds_reint_op { REINT_SETATTR = 1, REINT_CREATE = 2, REINT_LINK = 3, @@ -1565,8 +1672,9 @@ typedef enum { REINT_SETXATTR = 7, REINT_RMENTRY = 8, REINT_MIGRATE = 9, - REINT_MAX -} mds_reint_t, mdt_reint_t; + REINT_RESYNC = 10, + REINT_MAX +}; /* the disposition of the intent outlines what was executed */ #define DISP_IT_EXECD 0x00000001 @@ -1584,28 +1692,33 @@ typedef enum { #define DISP_OPEN_DENY 0x10000000 /* INODE LOCK PARTS */ -#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also - * was used to protect permission (mode, - * owner, group etc) before 2.4. */ -#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ -#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ -#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ - -/* The PERM bit is added int 2.4, and it is used to protect permission(mode, - * owner, group, acl etc), so to separate the permission from LOOKUP lock. - * Because for remote directories(in DNE), these locks will be granted by - * different MDTs(different ldlm namespace). - * - * For local directory, MDT will always grant UPDATE_LOCK|PERM_LOCK together. - * For Remote directory, the master MDT, where the remote directory is, will - * grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where the name entry is, - * will grant LOOKUP_LOCK. */ -#define MDS_INODELOCK_PERM 0x000010 -#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */ - -#define MDS_INODELOCK_MAXSHIFT 5 +enum mds_ibits_locks { + MDS_INODELOCK_LOOKUP = 0x000001, /* For namespace, dentry etc. Was + * used to protect permission (mode, + * owner, group, etc) before 2.4. */ + MDS_INODELOCK_UPDATE = 0x000002, /* size, links, timestamps */ + MDS_INODELOCK_OPEN = 0x000004, /* For opened files */ + MDS_INODELOCK_LAYOUT = 0x000008, /* for layout */ + + /* The PERM bit is added in 2.4, and is used to protect permission + * (mode, owner, group, ACL, etc.) separate from LOOKUP lock. + * For remote directories (in DNE) these locks will be granted by + * different MDTs (different LDLM namespace). + * + * For local directory, the MDT always grants UPDATE|PERM together. + * For remote directory, master MDT (where remote directory is) grants + * UPDATE|PERM, and remote MDT (where name entry is) grants LOOKUP_LOCK. 
+ */ + MDS_INODELOCK_PERM = 0x000010, + MDS_INODELOCK_XATTR = 0x000020, /* non-permission extended attrs */ + MDS_INODELOCK_DOM = 0x000040, /* Data for Data-on-MDT files */ + /* Do not forget to increase MDS_INODELOCK_NUMBITS when adding bits */ +}; +#define MDS_INODELOCK_NUMBITS 7 /* This FULL lock is useful to take on unlink sort of operations */ -#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) +#define MDS_INODELOCK_FULL ((1 << MDS_INODELOCK_NUMBITS) - 1) +/* DOM lock shouldn't be canceled early, use this macro for ELC */ +#define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM) /* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], * but was moved into name[1] along with the OID to avoid consuming the @@ -1625,17 +1738,17 @@ enum { enum { /* these should be identical to their EXT4_*_FL counterparts, they are * redefined here only to avoid dragging in fs/ext4/ext4.h */ - LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ - LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ - LUSTRE_APPEND_FL = 0x00000020, /* writes to file may only append */ - LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ - LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ - LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ - LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ - LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ - LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ - LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ - LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ + LUSTRE_SYNC_FL = 0x00000008, /* Synchronous updates */ + LUSTRE_IMMUTABLE_FL = 0x00000010, /* Immutable file */ + LUSTRE_APPEND_FL = 0x00000020, /* file writes may only append */ + LUSTRE_NODUMP_FL = 0x00000040, /* do not dump file */ + LUSTRE_NOATIME_FL = 0x00000080, /* do not update atime */ + LUSTRE_INDEX_FL = 0x00001000, /* hash-indexed directory */ + LUSTRE_DIRSYNC_FL = 0x00010000, /* dirsync behaviour (dir only) */ + LUSTRE_TOPDIR_FL = 0x00020000, /* Top of directory hierarchies*/ + LUSTRE_DIRECTIO_FL = 0x00100000, /* Use direct i/o */ + LUSTRE_INLINE_DATA_FL = 0x10000000, /* Inode has inline data. */ + LUSTRE_PROJINHERIT_FL = 0x20000000, /* Create with parents projid */ /* These flags will not be identical to any EXT4_*_FL counterparts, * and only reserved for lustre purpose. Note: these flags might @@ -1644,45 +1757,26 @@ enum { * wired by la_flags see osd_attr_get(). * 2. If these flags needs to be stored into inode, they will be * stored in LMA. see LMAI_XXXX */ - LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_ORPHAN_FL = 0x00002000, + LUSTRE_SET_SYNC_FL = 0x00040000, /* Synchronous setattr on OSTs */ - LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, + LUSTRE_LMA_FL_MASKS = LUSTRE_ORPHAN_FL, }; -#ifndef FS_XFLAG_PROJINHERIT -#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#ifndef FS_XFLAG_SYNC +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ #endif - -#ifdef __KERNEL__ -/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values - * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire - * protocol equivalents of LDISKFS_*_FL values stored on disk, while - * the S_* flags are kernel-internal values that change between kernel - * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. - * See b=16526 for a full history. */ -static inline int ll_ext_to_inode_flags(int flags) -{ - return (((flags & LUSTRE_SYNC_FL) ? 
S_SYNC : 0) | - ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | - ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | -#if defined(S_DIRSYNC) - ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#ifndef FS_XFLAG_NOATIME +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ #endif - ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0) | - ((flags & LUSTRE_PROJINHERIT_FL) ? FS_XFLAG_PROJINHERIT : 0)); -} - -static inline int ll_inode_to_ext_flags(int iflags) -{ - return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | - ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | - ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | -#if defined(S_DIRSYNC) - ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#ifndef FS_XFLAG_IMMUTABLE +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ #endif - ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0) | - ((iflags & FS_XFLAG_PROJINHERIT) ? LUSTRE_PROJINHERIT_FL : 0)); -} +#ifndef FS_XFLAG_APPEND +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#endif +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ #endif /* 64 possible states */ @@ -1693,14 +1787,14 @@ enum md_transient_state { struct mdt_body { struct lu_fid mbo_fid1; struct lu_fid mbo_fid2; - struct lustre_handle mbo_handle; + struct lustre_handle mbo_open_handle; __u64 mbo_valid; __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ __s64 mbo_mtime; __s64 mbo_atime; __s64 mbo_ctime; __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ - __u64 mbo_ioepoch; + __u64 mbo_version; /* was mbo_ioepoch before 2.11 */ __u64 mbo_t_state; /* transient file state defined in * enum md_transient_state * was "ino" until 2.4.0 */ @@ -1713,7 +1807,7 @@ struct mdt_body { __u32 mbo_flags; /* LUSTRE_*_FL file attributes */ __u32 mbo_rdev; __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 mbo_unused2; /* was "generation" until 2.4.0 */ + __u32 mbo_layout_gen; /* was "generation" until 2.4.0 */ __u32 mbo_suppgid; __u32 mbo_eadatasize; __u32 mbo_aclsize; @@ -1722,15 +1816,15 @@ struct mdt_body { __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ __u32 mbo_projid; - __u64 mbo_padding_6; /* also fix lustre_swab_mdt_body */ - __u64 mbo_padding_7; - __u64 mbo_padding_8; + __u64 mbo_dom_size; /* size of DOM component */ + __u64 mbo_dom_blocks; /* blocks consumed by DOM component */ + __u64 mbo_padding_8; /* also fix lustre_swab_mdt_body */ __u64 mbo_padding_9; __u64 mbo_padding_10; }; /* 216 */ struct mdt_ioepoch { - struct lustre_handle mio_handle; + struct lustre_handle mio_open_handle; __u64 mio_unused1; /* was ioepoch */ __u32 mio_unused2; /* was flags */ __u32 mio_padding; @@ -1794,103 +1888,72 @@ struct mdt_rec_setattr { #define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ #define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ #define MDS_ATTR_PROJID 0x10000ULL /* = 65536 */ - -#ifndef FMODE_READ -#define FMODE_READ 00000001 -#define FMODE_WRITE 00000002 -#endif - -#define MDS_FMODE_CLOSED 00000000 -#define MDS_FMODE_EXEC 00000004 -/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ -/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ -/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ - -#define MDS_OPEN_CREATED 00000010 -#define MDS_OPEN_CROSS 00000020 - -#define MDS_OPEN_CREAT 00000100 -#define MDS_OPEN_EXCL 00000200 -#define MDS_OPEN_TRUNC 00001000 -#define MDS_OPEN_APPEND 00002000 -#define MDS_OPEN_SYNC 00010000 -#define 
MDS_OPEN_DIRECTORY 00200000 - -#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ -#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ -#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ -#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. - * We do not support JOIN FILE - * anymore, reserve this flags - * just for preventing such bit - * to be reused. */ - -#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ -#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ -#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ -#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ -#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or - * hsm restore) */ -#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created - unlinked */ -#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease - * delegation, succeed if it's not - * being opened with conflict mode. - */ -#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ - -/* lustre internal open flags, which should not be set from user space */ -#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ - MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ - MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ - MDS_OPEN_RELEASE) +#define MDS_ATTR_LSIZE 0x20000ULL /* = 131072 */ +#define MDS_ATTR_LBLOCKS 0x40000ULL /* = 262144 */ +#define MDS_ATTR_OVERRIDE 0x2000000ULL /* = 33554432 */ enum mds_op_bias { - MDS_CHECK_SPLIT = 1 << 0, +/* MDS_CHECK_SPLIT = 1 << 0, obsolete before 2.3.58 */ + /* used for remote object getattr/open by name: in the original + * getattr/open request, MDT found the object against name is on another + * MDT, then packed FID and LOOKUP lock in reply and returned -EREMOTE, + * and client knew it's a remote object, then set this flag in + * getattr/open request and sent to the corresponding MDT to finish + * getattr/open, which fetched attributes and UPDATE lock/opened file. 
+ */ MDS_CROSS_REF = 1 << 1, - MDS_VTX_BYPASS = 1 << 2, +/* MDS_VTX_BYPASS = 1 << 2, obsolete since 2.3.54 */ MDS_PERM_BYPASS = 1 << 3, /* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ MDS_QUOTA_IGNORE = 1 << 5, - /* Was MDS_CLOSE_CLEANUP (1 << 6), No more used */ +/* MDS_CLOSE_CLEANUP = 1 << 6, obsolete since 2.3.51 */ MDS_KEEP_ORPHAN = 1 << 7, MDS_RECOV_OPEN = 1 << 8, MDS_DATA_MODIFIED = 1 << 9, MDS_CREATE_VOLATILE = 1 << 10, MDS_OWNEROVERRIDE = 1 << 11, MDS_HSM_RELEASE = 1 << 12, - MDS_RENAME_MIGRATE = 1 << 13, + MDS_CLOSE_MIGRATE = 1 << 13, MDS_CLOSE_LAYOUT_SWAP = 1 << 14, + MDS_CLOSE_LAYOUT_MERGE = 1 << 15, + MDS_CLOSE_RESYNC_DONE = 1 << 16, + MDS_CLOSE_LAYOUT_SPLIT = 1 << 17, + MDS_TRUNC_KEEP_LEASE = 1 << 18, + MDS_CLOSE_UPDATE_TIMES = 1 << 20, }; +#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ + MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_LAYOUT_SPLIT | \ + MDS_CLOSE_RESYNC_DONE) + /* instance of mdt_reint_rec */ struct mdt_rec_create { - __u32 cr_opcode; - __u32 cr_cap; - __u32 cr_fsuid; - __u32 cr_fsuid_h; - __u32 cr_fsgid; - __u32 cr_fsgid_h; - __u32 cr_suppgid1; - __u32 cr_suppgid1_h; - __u32 cr_suppgid2; - __u32 cr_suppgid2_h; - struct lu_fid cr_fid1; - struct lu_fid cr_fid2; - struct lustre_handle cr_old_handle; /* handle in case of open replay */ + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_open_handle_old; /* in case of open replay */ __s64 cr_time; - __u64 cr_rdev; - __u64 cr_ioepoch; - __u64 cr_padding_1; /* rr_blocks */ - __u32 cr_mode; - __u32 cr_bias; - /* use of helpers set/get_mrc_cr_flags() is needed to access - * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to - * extend cr_flags size without breaking 1.8 compat */ - __u32 cr_flags_l; /* for use with open, low 32 bits */ - __u32 cr_flags_h; /* for use with open, high 32 bits */ - __u32 cr_umask; /* umask for create */ - __u32 cr_padding_4; /* rr_padding_4 */ + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ }; /* instance of mdt_reint_rec */ @@ -2003,6 +2066,35 @@ struct mdt_rec_setxattr { __u32 sx_padding_11; /* rr_padding_4 */ }; +/* instance of mdt_reint_rec + * FLR: for file resync MDS_REINT_RESYNC RPC. 
*/ +struct mdt_rec_resync { + __u32 rs_opcode; + __u32 rs_cap; + __u32 rs_fsuid; + __u32 rs_fsuid_h; + __u32 rs_fsgid; + __u32 rs_fsgid_h; + __u32 rs_suppgid1; + __u32 rs_suppgid1_h; + __u32 rs_suppgid2; + __u32 rs_suppgid2_h; + struct lu_fid rs_fid; + __u8 rs_padding0[sizeof(struct lu_fid)]; + struct lustre_handle rs_lease_handle; /* rr_mtime */ + __s64 rs_padding1; /* rr_atime */ + __s64 rs_padding2; /* rr_ctime */ + __u64 rs_padding3; /* rr_size */ + __u64 rs_padding4; /* rr_blocks */ + __u32 rs_bias; + __u32 rs_padding5; /* rr_mode */ + __u32 rs_padding6; /* rr_flags */ + __u32 rs_padding7; /* rr_flags_h */ + __u32 rs_padding8; /* rr_umask */ + __u16 rs_mirror_id; + __u16 rs_padding9; /* rr_padding_4 */ +}; + /* * mdt_rec_reint is the template for all mdt_reint_xxx structures. * Do NOT change the size of various members, otherwise the value @@ -2034,7 +2126,8 @@ struct mdt_rec_reint { __u32 rr_flags; __u32 rr_flags_h; __u32 rr_umask; - __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ + __u16 rr_mirror_id; + __u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ }; /* lmv structures */ @@ -2065,9 +2158,16 @@ struct lmv_mds_md_v1 { * used for now. Higher 16 bits will * be used to mark the object status, * for example migrating or dead. */ - __u32 lmv_layout_version; /* Used for directory restriping */ - __u32 lmv_padding1; - __u64 lmv_padding2; + __u32 lmv_layout_version; /* increased each time layout changed, + * by directory migration, restripe + * and LFSCK. */ + __u32 lmv_migrate_offset; /* once this is set, it means this + * directory is been migrated, stripes + * before this offset belong to target, + * from this to source. */ + __u32 lmv_migrate_hash; /* hash type of source stripes of + * migrating directory */ + __u32 lmv_padding2; __u64 lmv_padding3; char lmv_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ @@ -2087,7 +2187,7 @@ struct lmv_mds_md_v1 { #define LMV_HASH_FLAG_MIGRATION 0x80000000 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 53, 0) +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 53, 0) /* Since lustre 2.8, this flag will not be needed, instead this DEAD * and orphan flags will be stored in LMA (see LMAI_ORPHAN) * Keep this flag just for LFSCK, because it still might meet such @@ -2115,11 +2215,11 @@ struct lmv_mds_md_v1 { **/ #define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL #define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL -static inline __u64 lustre_hash_fnv_1a_64(const void *buf, size_t size) +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, __kernel_size_t size) { __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; const unsigned char *p = buf; - size_t i; + __kernel_size_t i; for (i = 0; i < size; i++) { hash ^= p[i]; @@ -2135,18 +2235,22 @@ union lmv_mds_md { struct lmv_user_md lmv_user_md; }; -static inline int lmv_mds_md_size(int stripe_count, unsigned int lmm_magic) +static inline __kernel_ssize_t lmv_mds_md_size(int stripe_count, + unsigned int lmm_magic) { + __kernel_ssize_t len = -EINVAL; + switch (lmm_magic) { - case LMV_MAGIC_V1:{ + case LMV_MAGIC_V1: { struct lmv_mds_md_v1 *lmm1; - return sizeof(*lmm1) + stripe_count * - sizeof(lmm1->lmv_stripe_fids[0]); - } + len = sizeof(*lmm1); + len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); + break; } default: - return -EINVAL; + break; } + return len; } static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) @@ -2198,12 +2302,12 @@ enum fld_op { }; /* LFSCK opcodes */ -typedef enum { +enum 
lfsck_cmd { LFSCK_NOTIFY = 1101, LFSCK_QUERY = 1102, LFSCK_LAST_OPC, - LFSCK_FIRST_OPC = LFSCK_NOTIFY -} lfsck_cmd_t; + LFSCK_FIRST_OPC = LFSCK_NOTIFY +}; /* * LOV data structures @@ -2239,7 +2343,7 @@ struct lov_desc { * LDLM requests: */ /* opcodes -- MUST be distinct from OST/MDS opcodes */ -typedef enum { +enum ldlm_cmd { LDLM_ENQUEUE = 101, LDLM_CONVERT = 102, LDLM_CANCEL = 103, @@ -2248,7 +2352,7 @@ typedef enum { LDLM_GL_CALLBACK = 106, LDLM_SET_INFO = 107, LDLM_LAST_OPC -} ldlm_cmd_t; +}; #define LDLM_FIRST_OPC LDLM_ENQUEUE #define RES_NAME_SIZE 4 @@ -2263,7 +2367,7 @@ struct ldlm_res_id { (unsigned long long)(res)->lr_name.name[3] /* lock types */ -typedef enum ldlm_mode { +enum ldlm_mode { LCK_MINMODE = 0, LCK_EX = 1, LCK_PW = 2, @@ -2274,17 +2378,17 @@ typedef enum ldlm_mode { LCK_GROUP = 64, LCK_COS = 128, LCK_MAXMODE -} ldlm_mode_t; +}; #define LCK_MODE_NUM 8 -typedef enum ldlm_type { +enum ldlm_type { LDLM_PLAIN = 10, LDLM_EXTENT = 11, LDLM_FLOCK = 12, LDLM_IBITS = 13, LDLM_MAX_TYPE -} ldlm_type_t; +}; #define LDLM_MIN_TYPE LDLM_PLAIN @@ -2294,8 +2398,18 @@ struct ldlm_extent { __u64 gid; }; +static inline bool ldlm_extent_equal(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) +{ + return ex1->start == ex2->start && ex1->end == ex2->end; +} + struct ldlm_inodebits { - __u64 bits; + __u64 bits; + union { + __u64 try_bits; /* optional bits to try */ + __u64 cancel_bits; /* for lock convert */ + }; }; struct ldlm_flock_wire { @@ -2312,11 +2426,11 @@ struct ldlm_flock_wire { * this ever changes we will need to swab the union differently based * on the resource type. */ -typedef union ldlm_wire_policy_data { +union ldlm_wire_policy_data { struct ldlm_extent l_extent; struct ldlm_flock_wire l_flock; struct ldlm_inodebits l_inodebits; -} ldlm_wire_policy_data_t; +}; struct barrier_lvb { __u32 lvb_status; @@ -2338,19 +2452,21 @@ union ldlm_gl_desc { enum ldlm_intent_flags { IT_OPEN = 0x00000001, IT_CREAT = 0x00000002, - IT_OPEN_CREAT = 0x00000003, - IT_READDIR = 0x00000004, + IT_OPEN_CREAT = IT_OPEN | IT_CREAT, /* To allow case label. */ + IT_READDIR = 0x00000004, /* Used by mdc, not put on the wire. */ IT_GETATTR = 0x00000008, IT_LOOKUP = 0x00000010, - IT_UNLINK = 0x00000020, - IT_TRUNC = 0x00000040, +/* IT_UNLINK = 0x00000020, Obsolete. */ +/* IT_TRUNC = 0x00000040, Obsolete. */ IT_GETXATTR = 0x00000080, - IT_EXEC = 0x00000100, - IT_PIN = 0x00000200, +/* IT_EXEC = 0x00000100, Obsolete. */ +/* IT_PIN = 0x00000200, Obsolete. */ IT_LAYOUT = 0x00000400, IT_QUOTA_DQACQ = 0x00000800, IT_QUOTA_CONN = 0x00001000, - IT_SETXATTR = 0x00002000, +/* IT_SETXATTR = 0x00002000, Obsolete. */ + IT_GLIMPSE = 0x00004000, + IT_BRW = 0x00008000, }; struct ldlm_intent { @@ -2374,10 +2490,10 @@ struct ldlm_lock_desc { #define LDLM_ENQUEUE_CANCEL_OFF 1 struct ldlm_request { - __u32 lock_flags; - __u32 lock_count; - struct ldlm_lock_desc lock_desc; - struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; + __u32 lock_flags; /* LDLM_FL_*, see lustre_dlm_flags.h */ + __u32 lock_count; /* number of locks in lock_handle[] */ + struct ldlm_lock_desc lock_desc;/* lock descriptor */ + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; }; struct ldlm_reply { @@ -2395,17 +2511,17 @@ struct ldlm_reply { /* * Opcodes for mountconf (mgs and mgc) */ -typedef enum { - MGS_CONNECT = 250, - MGS_DISCONNECT, - MGS_EXCEPTION, /* node died, etc. 
*/ - MGS_TARGET_REG, /* whenever target starts up */ - MGS_TARGET_DEL, - MGS_SET_INFO, - MGS_CONFIG_READ, - MGS_LAST_OPC -} mgs_cmd_t; -#define MGS_FIRST_OPC MGS_CONNECT +enum mgs_cmd { + MGS_CONNECT = 250, + MGS_DISCONNECT = 251, + MGS_EXCEPTION = 252, /* node died, etc. */ + MGS_TARGET_REG = 253, /* whenever target starts up */ + MGS_TARGET_DEL = 254, + MGS_SET_INFO = 255, + MGS_CONFIG_READ = 256, + MGS_LAST_OPC, + MGS_FIRST_OPC = MGS_CONNECT +}; #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) #define MGS_PARAM_MAXLEN 1024 @@ -2421,17 +2537,17 @@ struct mgs_send_param { #define MTI_PARAM_MAXLEN 4096 #define MTI_NIDS_MAX 32 struct mgs_target_info { - __u32 mti_lustre_ver; - __u32 mti_stripe_index; - __u32 mti_config_ver; - __u32 mti_flags; - __u32 mti_nid_count; - __u32 mti_instance; /* Running instance of target */ - char mti_fsname[MTI_NAME_MAXLEN]; - char mti_svname[MTI_NAME_MAXLEN]; - char mti_uuid[sizeof(struct obd_uuid)]; - __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ - char mti_params[MTI_PARAM_MAXLEN]; + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; /* LDD_F_* */ + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t) */ + char mti_params[MTI_PARAM_MAXLEN]; }; struct mgs_nidtbl_entry { @@ -2497,15 +2613,14 @@ struct cfg_marker { /* * Opcodes for multiple servers. */ - -typedef enum { - OBD_PING = 400, - OBD_LOG_CANCEL, - OBD_QC_CALLBACK, /* not used since 2.4 */ - OBD_IDX_READ, - OBD_LAST_OPC -} obd_cmd_t; -#define OBD_FIRST_OPC OBD_PING +enum obd_cmd { + OBD_PING = 400, +/* OBD_LOG_CANCEL = 401, obsolete since 1.5 */ +/* OBD_QC_CALLBACK = 402, obsolete since 2.4 */ + OBD_IDX_READ = 403, + OBD_LAST_OPC, + OBD_FIRST_OPC = OBD_PING +}; /** * llog contexts indices. @@ -2554,7 +2669,7 @@ struct llog_catid { #define LLOG_OP_MAGIC 0x10600000 #define LLOG_OP_MASK 0xfff00000 -typedef enum { +enum llog_op_type { LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ @@ -2571,11 +2686,12 @@ typedef enum { /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + CHANGELOG_USER_REC2 = LLOG_OP_MAGIC | 0x70002, HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, UPDATE_REC = LLOG_OP_MAGIC | 0xa0000, LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, -} llog_op_type; +}; #define LLOG_REC_HDR_NEEDS_SWABBING(r) \ (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) @@ -2589,12 +2705,12 @@ struct llog_rec_hdr { __u32 lrh_index; __u32 lrh_type; __u32 lrh_id; -}; +} __attribute__((packed)); struct llog_rec_tail { __u32 lrt_len; __u32 lrt_index; -}; +} __attribute__((packed)); /* Where data follow just after header */ #define REC_DATA(ptr) \ @@ -2652,7 +2768,7 @@ struct llog_setattr64_rec_v2 { __u32 lsr_gid_h; __u64 lsr_valid; __u32 lsr_projid; - __u32 lsr_padding1; + __u32 lsr_layout_version; __u64 lsr_padding2; __u64 lsr_padding3; struct llog_rec_tail lsr_tail; @@ -2676,8 +2792,13 @@ struct llog_size_change_rec { #define CHANGELOG_ALLMASK 0XFFFFFFFF /** default \a changelog_rec_type mask. Allow all of them, except * CL_ATIME since it can really be time consuming, and not necessary - * under normal use. 
*/ -#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME)) + * under normal use. + * Remove also CL_OPEN, CL_GETXATTR and CL_DN_OPEN from default list as it can + * be costly and only necessary for audit purpose. + */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & \ + ~(1 << CL_ATIME | 1 << CL_OPEN | 1 << CL_GETXATTR | \ + 1 << CL_DN_OPEN)) /* changelog llog name, needed by client replicators */ #define CHANGELOG_CATALOG "changelog_catalog" @@ -2697,11 +2818,13 @@ struct llog_changelog_rec { #define CHANGELOG_USER_PREFIX "cl" struct llog_changelog_user_rec { - struct llog_rec_hdr cur_hdr; - __u32 cur_id; - __u32 cur_padding; - __u64 cur_endrec; - struct llog_rec_tail cur_tail; + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + /* only intended to be used in relative time comparisons to + * detect idle users */ + __u32 cur_time; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; } __attribute__((packed)); enum agent_req_status { @@ -2737,7 +2860,7 @@ struct llog_agent_req_rec { * agent_req_status */ __u32 arr_archive_id; /**< backend archive number */ __u64 arr_flags; /**< req flags */ - __u64 arr_compound_id; /**< compound cookie */ + __u64 arr_compound_id; /**< compound cookie, ignored */ __u64 arr_req_create; /**< req. creation time */ __u64 arr_req_change; /**< req. status change time */ struct hsm_action_item arr_hai; /**< req. to the agent */ @@ -2766,12 +2889,25 @@ enum llog_flag { LLOG_F_IS_PLAIN = 0x4, LLOG_F_EXT_JOBID = 0x8, LLOG_F_IS_FIXSIZE = 0x10, + LLOG_F_EXT_EXTRA_FLAGS = 0x20, + LLOG_F_EXT_X_UIDGID = 0x40, + LLOG_F_EXT_X_NID = 0x80, + LLOG_F_EXT_X_OMODE = 0x100, + LLOG_F_EXT_X_XATTR = 0x200, + LLOG_F_RM_ON_ERR = 0x400, /* Note: Flags covered by LLOG_F_EXT_MASK will be inherited from * catlog to plain log, so do not add LLOG_F_IS_FIXSIZE here, * because the catlog record is usually fixed size, but its plain * log record can be variable */ - LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID, + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID | LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | LLOG_F_EXT_X_XATTR, +}; + +/* means first record of catalog */ +enum { + LLOG_CAT_FIRST = -1, }; /* On-disk header structure of each log object, stored in little endian order */ @@ -2817,9 +2953,13 @@ struct llog_log_hdr { llh->llh_hdr.lrh_len - \ sizeof(llh->llh_tail))) -/** log cookies are used to reference a specific log file and a record therein */ +/** log cookies are used to reference a specific log file and a record therein, + and pass record offset from llog_process_thread to llog_write */ struct llog_cookie { - struct llog_logid lgc_lgl; + union { + struct llog_logid lgc_lgl; + __u64 lgc_offset; + }; __u32 lgc_subsys; __u32 lgc_index; __u32 lgc_padding; @@ -2827,17 +2967,17 @@ struct llog_cookie { /** llog protocol */ enum llogd_rpc_ops { - LLOG_ORIGIN_HANDLE_CREATE = 501, - LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, - LLOG_ORIGIN_HANDLE_READ_HEADER = 503, - LLOG_ORIGIN_HANDLE_WRITE_REC = 504, - LLOG_ORIGIN_HANDLE_CLOSE = 505, - LLOG_ORIGIN_CONNECT = 506, - LLOG_CATINFO = 507, /* deprecated */ - LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, - LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ - LLOG_LAST_OPC, - LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, +/* LLOG_ORIGIN_HANDLE_WRITE_REC = 504, Obsolete by 2.1. */ +/* LLOG_ORIGIN_HANDLE_CLOSE = 505, Obsolete by 1.8. */ +/* LLOG_ORIGIN_CONNECT = 506, Obsolete by 2.4. 
*/ +/* LLOG_CATINFO = 507, Obsolete by 2.3. */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* Obsolete by 2.11. */ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE }; struct llogd_body { @@ -2891,7 +3031,7 @@ struct obdo { * * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */ struct ost_layout o_layout; - __u32 o_padding_3; + __u32 o_layout_version; __u32 o_uid_h; __u32 o_gid_h; @@ -3064,13 +3204,13 @@ union lu_page { }; /* security opcodes */ -typedef enum { +enum sec_cmd { SEC_CTX_INIT = 801, SEC_CTX_INIT_CONT = 802, SEC_CTX_FINI = 803, SEC_LAST_OPC, SEC_FIRST_OPC = SEC_CTX_INIT -} sec_cmd_t; +}; /* * capa related definitions @@ -3151,7 +3291,7 @@ struct link_ea_entry { unsigned char lee_reclen[2]; unsigned char lee_parent_fid[sizeof(struct lu_fid)]; char lee_name[0]; -}__attribute__((packed)); +} __attribute__((packed)); /** fid2path request/reply structure */ struct getinfo_fid2path { @@ -3173,7 +3313,7 @@ struct getparent { char gp_name[0]; /**< zero-terminated link name */ } __attribute__((packed)); -enum { +enum layout_intent_opc { LAYOUT_INTENT_ACCESS = 0, /** generic access */ LAYOUT_INTENT_READ = 1, /** not used */ LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */ @@ -3187,8 +3327,7 @@ enum { struct layout_intent { __u32 li_opc; /* intent operation for enqueue, read, write etc */ __u32 li_flags; - __u64 li_start; - __u64 li_end; + struct lu_extent li_extent; } __attribute__((packed)); /** @@ -3198,7 +3337,7 @@ struct layout_intent { */ struct hsm_progress_kernel { /* Field taken from struct hsm_progress */ - lustre_fid hpk_fid; + struct lu_fid hpk_fid; __u64 hpk_cookie; struct hsm_extent hpk_extent; __u16 hpk_flags; @@ -3263,6 +3402,7 @@ enum update_type { OUT_PUNCH = 14, OUT_READ = 15, OUT_NOOP = 16, + OUT_XATTR_LIST = 17, OUT_LAST }; @@ -3353,11 +3493,22 @@ struct mdc_swap_layouts { __u64 msl_flags; } __attribute__((packed)); +#define INLINE_RESYNC_ARRAY_SIZE 15 +struct close_data_resync_done { + __u32 resync_count; + __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE]; +}; + struct close_data { struct lustre_handle cd_handle; struct lu_fid cd_fid; __u64 cd_data_version; - __u64 cd_reserved[8]; + union { + __u64 cd_reserved[8]; + struct close_data_resync_done cd_resync; + /* split close */ + __u16 cd_mirror_id; + }; }; /* Update llog format */ @@ -3366,7 +3517,7 @@ struct update_op { __u16 uop_type; __u16 uop_param_count; __u16 uop_params_off[0]; -}; +} __attribute__((packed)); struct update_ops { struct update_op uops_op[0]; @@ -3417,6 +3568,19 @@ struct llog_update_record { */ }; +/* sepol string format is: + * <1-digit for SELinux status>::: + */ +/* Max length of the sepol string + * Should be large enough to contain a sha512sum of the policy + */ +#define SELINUX_MODE_LEN 1 +#define SELINUX_POLICY_VER_LEN 3 /* 3 chars to leave room for the future */ +#define SELINUX_POLICY_HASH_LEN 64 +#define LUSTRE_NODEMAP_SEPOL_LENGTH (SELINUX_MODE_LEN + NAME_MAX + \ + SELINUX_POLICY_VER_LEN + \ + SELINUX_POLICY_HASH_LEN + 3) + /* nodemap records, uses 32 byte record length */ #define LUSTRE_NODEMAP_NAME_LENGTH 16 struct nodemap_cluster_rec { @@ -3487,5 +3651,9 @@ struct ladvise_hdr { struct lu_ladvise lah_advise[0]; /* advices in this header */ }; +#if defined(__cplusplus) +} +#endif + #endif /** @} lustreidl */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h similarity index 93% rename from 
drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h index 9fddf2b1b9bd3..d0dc08bda5433 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ioctl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ #ifndef _UAPI_LUSTRE_IOCTL_H #define _UAPI_LUSTRE_IOCTL_H @@ -31,20 +31,13 @@ #include #include #include -#include +#include -#ifndef __KERNEL__ -# define __user -#endif - -#if !defined(__KERNEL__) && !defined(LUSTRE_UTILS) -# error This file is for Lustre internal use only. -#endif - -/* Handle older distros */ -#ifndef __ALIGN_KERNEL -# define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -# define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +/* + * sparse kernel source annotations + */ +#ifndef __user +#define __user #endif enum md_echo_cmd { @@ -64,7 +57,6 @@ enum md_echo_cmd { #define OBD_IOCTL_VERSION 0x00010004 #define OBD_DEV_BY_DEVNAME 0xffffd0de -#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER struct obd_ioctl_data { __u32 ioc_len; @@ -228,13 +220,14 @@ static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) #define OBD_IOC_LCFG_ERASE _IOWR('f', 209, OBD_IOC_DATA_TYPE) #define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) -/* lustre/lustre_user.h 212-217 */ -#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) +/* lustre/lustre_user.h 211-220 */ +/* was #define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) until 2.11 */ #define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) #define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) #define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) #define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) #define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +#define OBD_IOC_CHLG_POLL _IOR('f', 233, long) /* lustre/lustre_user.h 240-249 */ /* LIBCFS_IOC_DEBUG_MASK 250 */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h similarity index 88% rename from drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h index e8119f5278c23..26819ff7995cf 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi_kernelcomm.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h @@ -73,17 +73,26 @@ enum kuc_generic_message_type { #define KUC_GRP_HSM 0x02 #define KUC_GRP_MAX KUC_GRP_HSM -#define LK_FLG_STOP 0x01 +enum lk_flags { + LK_FLG_STOP = 0x0001, + LK_FLG_DATANR = 0x0002, +}; #define LK_NOFD -1U -/* kernelcomm control structure, passed from userspace to kernel */ +/* kernelcomm control structure, passed from userspace to kernel. + * For compatibility with old copytools, users who pass ARCHIVE_IDs + * to kernel using lk_data_count and lk_data should fill lk_flags with + * LK_FLG_DATANR. Otherwise kernel will take lk_data_count as bitmap of + * ARCHIVE IDs. 
+ */ struct lustre_kernelcomm { __u32 lk_wfd; __u32 lk_rfd; __u32 lk_uid; __u32 lk_group; - __u32 lk_data; + __u32 lk_data_count; __u32 lk_flags; + __u32 lk_data[0]; } __attribute__((packed)); #endif /* __UAPI_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h new file mode 100644 index 0000000000000..68c8d3a1009c4 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_lfsck_user.h @@ -0,0 +1,238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan, Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +#include +#include + +/** + * state machine: + * + * LS_INIT + * | + * (lfsck|start) + * | + * v + * LS_SCANNING_PHASE1 + * | ^ + * | : + * | (lfsck:restart) + * | : + * v : + * ----------------------------------------------------------------- + * | |^ |^ |^ |^ |^ + * | |: |: |: |: |: + * v v: v: v: v: v: + * LS_SCANNING_PHASE2 LS_FAILED LS_STOPPED LS_PAUSED LS_CRASHED LS_PARTIAL + * (CO_) (CO_) (CO_) + * | ^ ^: ^: ^: ^: ^: + * | : |: |: |: |: |: + * | (lfsck:restart) |: |: |: |: |: + * v : |v |v |v |v |v + * ----------------------------------------------------------------- + * | + * v + * LS_COMPLETED + */ +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. The checked items during the phase1 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. The checked items during the phase2 + * scanning depends on the LFSCK type. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. */ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, + + /* Some OST/MDT failed during the LFSCK, or not join the LFSCK. */ + LS_PARTIAL = 8, + + /* The LFSCK is failed because its controller is failed. */ + LS_CO_FAILED = 9, + + /* The LFSCK is stopped because its controller is stopped. 
*/ + LS_CO_STOPPED = 10, + + /* The LFSCK is paused because its controller is paused. */ + LS_CO_PAUSED = 11, + + LS_MAX +}; + +static inline const char *lfsck_status2name(int status) +{ + static const char * const lfsck_status_names[] = { + [LS_INIT] = "init", + [LS_SCANNING_PHASE1] = "scanning-phase1", + [LS_SCANNING_PHASE2] = "scanning-phase2", + [LS_COMPLETED] = "completed", + [LS_FAILED] = "failed", + [LS_STOPPED] = "stopped", + [LS_PAUSED] = "paused", + [LS_CRASHED] = "crashed", + [LS_PARTIAL] = "partial", + [LS_CO_FAILED] = "co-failed", + [LS_CO_STOPPED] = "co-stopped", + [LS_CO_PAUSED] = "co-paused" + }; + + if (status < 0 || status >= LS_MAX) + return "unknown"; + + return lfsck_status_names[status]; +} + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, + + /* LFSCK runs on all targets. */ + LPF_ALL_TGT = 0x0008, + + /* Broadcast the command to other MDTs. Only valid on the sponsor MDT */ + LPF_BROADCAST = 0x0010, + + /* Handle orphan OST-objects. */ + LPF_OST_ORPHAN = 0x0020, + + /* Create OST-object for dangling LOV EA. */ + LPF_CREATE_OSTOBJ = 0x0040, + + /* Create MDT-object for dangling name entry. */ + LPF_CREATE_MDTOBJ = 0x0080, + + /* Do not return until the LFSCK not running. */ + LPF_WAIT = 0x0100, + + /* Delay to create OST-object for dangling LOV EA. */ + LPF_DELAY_CREATE_OSTOBJ = 0x0200, +}; + +enum lfsck_type { + /* For MDT and OST internal OSD consistency check/repair. */ + LFSCK_TYPE_SCRUB = 0x0000, + + /* For MDT-OST (layout, object) consistency check/repair. */ + LFSCK_TYPE_LAYOUT = 0x0001, + + /* For MDT (FID-in-dirent, linkEA) consistency check/repair. */ + LFSCK_TYPE_NAMESPACE = 0x0004, + LFSCK_TYPES_SUPPORTED = (LFSCK_TYPE_SCRUB | LFSCK_TYPE_LAYOUT | + LFSCK_TYPE_NAMESPACE), + LFSCK_TYPES_DEF = LFSCK_TYPES_SUPPORTED, + LFSCK_TYPES_ALL = ((__u16)(~0)) +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT +#define LFSCK_ASYNC_WIN_DEFAULT 1024 +#define LFSCK_ASYNC_WIN_MAX ((__u16)(~0)) +#define LFSCK_TYPE_BITS 16 + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, + LSV_ASYNC_WINDOWS = 0x00000008, + LSV_CREATE_OSTOBJ = 0x00000010, + LSV_CREATE_MDTOBJ = 0x00000020, + LSV_DELAY_CREATE_OSTOBJ = 0x00000040, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* The windows size for async requests pipeline. */ + __u16 ls_async_windows; +}; + +struct lfsck_stop { + __u32 ls_status; + __u16 ls_flags; + __u16 ls_padding_1; /* For 64-bits aligned. 
*/ + __u64 ls_padding_2; +}; + +struct lfsck_query { + __u16 lu_types; + __u16 lu_flags; + __u32 lu_mdts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u32 lu_osts_count[LFSCK_TYPE_BITS][LS_MAX + 1]; + __u64 lu_repaired[LFSCK_TYPE_BITS]; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h similarity index 97% rename from drivers/staging/lustrefsx/lustre/include/lustre_log_user.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h index ee5f0f7385fa0..bcf46eb21e6c2 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_log_user.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_log_user.h @@ -38,7 +38,8 @@ #ifndef _LUSTRE_LOG_USER_H #define _LUSTRE_LOG_USER_H -#include +#include +#include /* Lustre logs use FIDs constructed from oi_id and oi_seq directly, * without attempting to use the IGIF and IDIF ranges as is done diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h similarity index 95% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h index c0e662ae7b84f..90fa213f83e90 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_ostid.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ostid.h @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. * * Copyright 2015 Cray Inc, all rights reserved. * Author: Ben Evans. @@ -34,15 +34,9 @@ #ifndef _UAPI_LUSTRE_OSTID_H_ #define _UAPI_LUSTRE_OSTID_H_ -/* - * This is due to us being out of kernel and the way the OpenSFS branch - * handles CFLAGS. Upstream will just have linux/lustre_fid.h - */ -#ifdef __KERNEL__ -#include -#else -#include -#endif +#include +#include +#include static inline __u64 lmm_oi_id(const struct ost_id *oi) { diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h similarity index 100% rename from drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre_param.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_param.h diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h new file mode 100644 index 0000000000000..470c97e577cc0 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h @@ -0,0 +1,2378 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#ifdef __KERNEL__ +# include +# include +# include +#else /* ! __KERNEL__ */ +# define __USE_ISOC99 1 +# include +# include /* snprintf() */ +# define NEED_QUOTA_DEFS +/* # include - this causes complaints about caddr_t */ +# include + +# define __USE_GNU 1 +# define __USE_XOPEN2K8 1 +# define FILEID_LUSTRE 0x97 /* for name_to_handle_at() (and llapi_fd2fid()) */ +#endif /* !__KERNEL__ */ + +#include +#include +#include +#include + +/* Handle older distros */ +#ifndef __ALIGN_KERNEL +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#endif + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifdef __STRICT_ANSI__ +#define typeof __typeof__ +#endif + +/* + * This is a temporary solution of adding quota type. + * Should be removed as soon as system header is updated. + */ +#undef LL_MAXQUOTAS +#define LL_MAXQUOTAS 3 +#undef INITQFNAMES +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "project", /* PRJQUOTA */ \ + "undefined", \ +}; +#ifndef USRQUOTA +#define USRQUOTA 0 +#endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif +#ifndef PRJQUOTA +#define PRJQUOTA 2 +#endif + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. + */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define fstat_f fstat64 +#define fstatat_f fstatat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#define fstat_f fstat +#define fstatat_f fstatat +#endif + +#ifndef STATX_BASIC_STATS +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. 
CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 __spare2[14]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. 
+ * + * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS + * semantically. Where possible, the numerical value is picked to correspond + * also. + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ + +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ + +#endif + +typedef struct statx lstatx_t; + +#define HAVE_LOV_USER_MDS_DATA + +#define LUSTRE_EOF 0xffffffffffffffffULL + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_NOPRECREATE = 0x00000004, /**< no object precreation */ + OS_STATE_UNUSED1 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_UNUSED2 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ + OS_STATE_SUM = 0x00000100, /**< aggregated for all tagrets */ + OS_STATE_NONROT = 0x00000200, /**< non-rotational device */ +}; + +/** filesystem statistics/attributes for target device */ +struct obd_statfs { + __u64 os_type; /* EXT4_SUPER_MAGIC, UBERBLOCK_MAGIC */ + __u64 os_blocks; /* total size in #os_bsize blocks */ + __u64 os_bfree; /* number of unused blocks */ + __u64 os_bavail; /* blocks available for allocation */ + __u64 os_files; /* total number of objects */ + __u64 os_ffree; /* # objects that could be created */ + __u8 os_fsid[40]; /* identifier for filesystem */ + __u32 os_bsize; /* block size in bytes for os_blocks */ + __u32 os_namelen; /* maximum length of filename in bytes*/ + __u64 os_maxbytes; /* maximum object size in bytes */ + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_granted; /* space granted for MDS */ + __u32 os_spare3; /* Unused padding fields. Remember */ + __u32 os_spare4; /* to fix lustre_swab_obd_statfs() */ + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. 
+ **/ + __u32 f_ver; +} __attribute__((packed)); + +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return fid->f_seq == 0 && fid->f_oid == 0; +} + +/* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ +#define f_stripe_idx f_ver + +struct ost_layout { + __u32 ol_stripe_size; + __u32 ol_stripe_count; + __u64 ol_comp_start; + __u64 ol_comp_end; + __u32 ol_comp_id; +} __attribute__((packed)); + +/* The filter_fid structure has changed several times over its lifetime. + * For a long time "trusted.fid" held the MDT inode parent FID/IGIF and + * stripe_index and the "self FID" (objid/seq) to be able to recover the + * OST objects in case of corruption. With the move to 2.4 and OSD-API for + * the OST, the "trusted.lma" xattr was added to the OST objects to store + * the "self FID" to be consistent with the MDT on-disk format, and the + * filter_fid only stored the MDT inode parent FID and stripe index. + * + * In 2.10, the addition of PFL composite layouts required more information + * to be stored into the filter_fid in order to be able to identify which + * component the OST object belonged. As well, the stripe size may vary + * between components, so it was no longer safe to assume the stripe size + * or stripe_count of a file. This is also more robust for plain layouts. + * + * For ldiskfs OSTs that were formatted with 256-byte inodes, there is not + * enough space to store both the filter_fid and LMA in the inode, so they + * are packed into struct lustre_ost_attrs on disk in trusted.lma to avoid + * an extra seek for every OST object access. + * + * In 2.11, FLR mirror layouts also need to store the layout version and + * range so that writes to old versions of the layout are not allowed. + * That ensures that mirrored objects are not modified by evicted clients, + * and ensures that the components are correctly marked stale on the MDT. + */ +struct filter_fid_18_23 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + __u64 ff_objid; + __u64 ff_seq; +}; + +struct filter_fid_24_29 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ +}; + +struct filter_fid_210 { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* stripe_idx in f_ver */ + struct ost_layout ff_layout; + __u32 ff_layout_version; + __u32 ff_range; /* range of layout version that + * write are allowed */ +} __attribute__((packed)); + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +struct lu_fid; + +enum lma_compat { + LMAC_HSM = 0x00000001, +/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ + LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ + LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is + * under /O//d. */ + LMAC_STRIPE_INFO = 0x00000010, /* stripe info in the LMA EA. */ + LMAC_COMP_INFO = 0x00000020, /* Component info in the LMA EA. */ + LMAC_IDX_BACKUP = 0x00000040, /* Has index backup. */ +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. 
+ */ +enum lma_incompat { + LMAI_RELEASED = 0x00000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ + LMAI_STRIPED = 0x00000008, /* striped directory inode */ + LMAI_ORPHAN = 0x00000010, /* inode is orphan */ + LMA_INCOMPAT_SUPP = (LMAI_AGENT | LMAI_REMOTE_PARENT | \ + LMAI_STRIPED | LMAI_ORPHAN) +}; + + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +struct lustre_ost_attrs { + /* Use lustre_mdt_attrs directly for now, need a common header + * structure if want to change lustre_mdt_attrs in future. */ + struct lustre_mdt_attrs loa_lma; + + /* Below five elements are for OST-object's PFID EA, the + * lma_parent_fid::f_ver is composed of the stripe_count (high 16 bits) + * and the stripe_index (low 16 bits), the size should not exceed + * 5 * sizeof(__u64)) to be accessable by old Lustre. If the flag + * LMAC_STRIPE_INFO is set, then loa_parent_fid and loa_stripe_size + * are valid; if the flag LMAC_COMP_INFO is set, then the next three + * loa_comp_* elements are valid. */ + struct lu_fid loa_parent_fid; + __u32 loa_stripe_size; + __u32 loa_comp_id; + __u64 loa_comp_start; + __u64 loa_comp_end; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +enum lustre_som_flags { + /* Unknow or no SoM data, must get size from OSTs. */ + SOM_FL_UNKNOWN = 0x0000, + /* Known strictly correct, FLR or DoM file (SoM guaranteed). */ + SOM_FL_STRICT = 0x0001, + /* Known stale - was right at some point in the past, but it is + * known (or likely) to be incorrect now (e.g. opened for write). */ + SOM_FL_STALE = 0x0002, + /* Approximate, may never have been strictly correct, + * need to sync SOM data to achieve eventual consistency. */ + SOM_FL_LAZY = 0x0004, +}; + +struct lustre_som_attrs { + __u16 lsa_valid; + __u16 lsa_reserved[3]; + __u64 lsa_size; + __u64 lsa_blocks; +}; + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +} __attribute__((packed)); + +#define DOSTID "%#llx:%llu" +#define POSTID(oi) ((unsigned long long)ostid_seq(oi)), \ + ((unsigned long long)ostid_id(oi)) + +struct ll_futimes_3 { + __u64 lfu_atime_sec; + __u64 lfu_atime_nsec; + __u64 lfu_mtime_sec; + __u64 lfu_mtime_nsec; + __u64 lfu_ctime_sec; + __u64 lfu_ctime_nsec; +}; + +/* + * Maximum number of mirrors currently implemented. + */ +#define LUSTRE_MIRROR_COUNT_MAX 16 + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. 
*/ +enum ll_lease_mode { + LL_LEASE_RDLCK = 0x01, + LL_LEASE_WRLCK = 0x02, + LL_LEASE_UNLCK = 0x04, +}; + +enum ll_lease_flags { + LL_LEASE_RESYNC = 0x1, + LL_LEASE_RESYNC_DONE = 0x2, + LL_LEASE_LAYOUT_MERGE = 0x4, + LL_LEASE_LAYOUT_SPLIT = 0x8, +}; + +#define IOC_IDS_MAX 4096 +struct ll_ioc_lease { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u32 lil_ids[0]; +}; + +struct ll_ioc_lease_id { + __u32 lil_mode; + __u32 lil_flags; + __u32 lil_count; + __u16 lil_mirror_id; + __u16 lil_padding1; + __u64 lil_padding2; + __u32 lil_ids[0]; +}; + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* lustre_ioctl.h 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +#define LL_IOC_LOV_SETSTRIPE_NEW _IOWR('f', 154, struct lov_user_md) +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +#define LL_IOC_LOV_GETSTRIPE_NEW _IOR('f', 155, struct lov_user_md) +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 157 obsolete */ +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +/* IOC_LOV_GETINFO 165 obsolete */ +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +/* LL_IOC_RMTACL 167 obsolete */ +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) +#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) +/* lustre_ioctl.h 177-210 */ +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* lustre_ioctl.h 221-232 */ +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) +#define LL_IOC_RMFID _IOR('f', 242, struct fid_array) +#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease) +#define LL_IOC_SET_LEASE_OLD 
_IOWR('f', 243, long) +#define LL_IOC_GET_LEASE _IO('f', 244) +#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) +#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise) + +#ifndef FS_IOC_FSGETXATTR +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. +*/ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#endif +#define LL_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR +#ifndef FS_XFLAG_PROJINHERIT +#define FS_XFLAG_PROJINHERIT 0x00000200 +#endif + + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#ifdef HAVE_LOV_USER_MDS_DATA +#define IOC_MDC_GETFILEINFO_OLD _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data_v1 *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data) +#define LL_IOC_MDC_GETINFO_OLD _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data_v1 *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data) +#endif + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular + * files, but are unlikely to be used in practice and are not harmful if + * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character + * devices and are safe for use on new files. See LU-4209. */ +/* To be compatible with old statically linked binary we keep the check for + * the older 0100000000 flag. This is already removed upstream. LU-812. 
*/ +#define O_LOV_DELAY_CREATE_1_8 0100000000 /* FMODE_NONOTIFY masked in 2.6.36 */ +#ifndef FASYNC +#define FASYNC 00020000 /* fcntl, for BSD compatibility */ +#endif +#define O_LOV_DELAY_CREATE_MASK (O_NOCTTY | FASYNC) +#define O_LOV_DELAY_CREATE (O_LOV_DELAY_CREATE_1_8 | \ + O_LOV_DELAY_CREATE_MASK) + +#define O_LU_NOIMPORT_MASK (O_NOCTTY | O_DSYNC | O_DIRECT) +#define O_LU_NOIMPORT O_LU_NOIMPORT_MASK + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 +/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ +#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ +#define LOV_USER_MAGIC_COMP_V1 0x0BD60BD0 + +#define LMV_USER_MAGIC 0x0CD30CD0 /* default lmv magic */ +#define LMV_USER_MAGIC_V0 0x0CD20CD0 /* old default lmv magic*/ +#define LMV_USER_MAGIC_SPECIFIC 0x0CD40CD0 + +#define LOV_PATTERN_NONE 0x000 +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_MDT 0x100 +#define LOV_PATTERN_CMOBD 0x200 + +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ +#define LOV_PATTERN_DEFAULT 0xffffffff + +static inline bool lov_pattern_supported(__u32 pattern) +{ + return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 || + (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_MDT; +} + +#define LOV_MAXPOOLNAME 15 +#define LOV_POOLNAMEF "%.15s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. + * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define XATTR_LUSTRE_PREFIX "lustre." 
+#define XATTR_LUSTRE_LOV XATTR_LUSTRE_PREFIX"lov" + +/* Please update if XATTR_LUSTRE_LOV".set" groks more flags in the future */ +#define allowed_lustre_lov(att) (strcmp((att), XATTR_LUSTRE_LOV".add") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".set.flags") == 0 || \ + strcmp((att), XATTR_LUSTRE_LOV".del") == 0) + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* MDT parent inode id/seq (id/0 for 1.x) */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lu_extent { + __u64 e_start; + __u64 e_end; +} __attribute__((packed)); + +#define DEXT "[%#llx, %#llx)" +#define PEXT(ext) (unsigned long long)(ext)->e_start, (unsigned long long)(ext)->e_end + +static inline bool lu_extent_is_overlapped(struct lu_extent *e1, + struct lu_extent *e2) +{ + return e1->e_start < e2->e_end && e2->e_start < e1->e_end; +} + +static inline bool lu_extent_is_whole(struct lu_extent *e) +{ + return e->e_start == 0 && e->e_end == LUSTRE_EOF; +} + +enum lov_comp_md_entry_flags { + LCME_FL_STALE = 0x00000001, /* FLR: stale data */ + LCME_FL_PREF_RD = 0x00000002, /* FLR: preferred for reading */ + LCME_FL_PREF_WR = 0x00000004, /* FLR: preferred for writing */ + LCME_FL_PREF_RW = LCME_FL_PREF_RD | LCME_FL_PREF_WR, + LCME_FL_OFFLINE = 0x00000008, /* Not used */ + LCME_FL_INIT = 0x00000010, /* instantiated */ + LCME_FL_NOSYNC = 0x00000020, /* FLR: no sync for the mirror */ + LCME_FL_NEG = 0x80000000 /* used to indicate a negative flag, + won't be stored on disk */ +}; + +#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT | LCME_FL_STALE | \ + LCME_FL_PREF_RW | LCME_FL_NOSYNC) +/* The flags can be set by users at mirror creation time. */ +#define LCME_USER_FLAGS (LCME_FL_PREF_RW) + +/* The flags are for mirrors */ +#define LCME_MIRROR_FLAGS (LCME_FL_NOSYNC) + +/* These flags have meaning when set in a default layout and will be inherited + * from the default/template layout set on a directory. 
+ */ +#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC) + +/* the highest bit in obdo::o_layout_version is used to mark if the file is + * being resynced. */ +#define LU_LAYOUT_RESYNC LCME_FL_NEG + +/* lcme_id can be specified as certain flags, and the the first + * bit of lcme_id is used to indicate that the ID is representing + * certain LCME_FL_* but not a real ID. Which implies we can have + * at most 31 flags (see LCME_FL_XXX). */ +enum lcme_id { + LCME_ID_INVAL = 0x0, + LCME_ID_MAX = 0x7FFFFFFF, + LCME_ID_ALL = 0xFFFFFFFF, + LCME_ID_NOT_ID = LCME_FL_NEG +}; + +#define LCME_ID_MASK LCME_ID_MAX + +struct lov_comp_md_entry_v1 { + __u32 lcme_id; /* unique id of component */ + __u32 lcme_flags; /* LCME_FL_XXX */ + struct lu_extent lcme_extent; /* file extent for component */ + __u32 lcme_offset; /* offset of component blob, + start from lov_comp_md_v1 */ + __u32 lcme_size; /* size of component blob */ + __u32 lcme_layout_gen; + __u64 lcme_timestamp; /* snapshot time if applicable*/ + __u32 lcme_padding_1; +} __attribute__((packed)); + +#define SEQ_ID_MAX 0x0000FFFF +#define SEQ_ID_MASK SEQ_ID_MAX +/* bit 30:16 of lcme_id is used to store mirror id */ +#define MIRROR_ID_MASK 0x7FFF0000 +#define MIRROR_ID_NEG 0x8000 +#define MIRROR_ID_SHIFT 16 + +static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid) +{ + return ((mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) | seqid; +} + +static inline __u16 mirror_id_of(__u32 id) +{ + return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT; +} + +/** + * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1. + */ +enum lov_comp_md_flags { + /* the least 2 bits are used by FLR to record file state */ + LCM_FL_NONE = 0, + LCM_FL_RDONLY = 1, + LCM_FL_WRITE_PENDING = 2, + LCM_FL_SYNC_PENDING = 3, + LCM_FL_FLR_MASK = 0x3, +}; + +struct lov_comp_md_v1 { + __u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */ + __u32 lcm_size; /* overall size including this struct */ + __u32 lcm_layout_gen; + __u16 lcm_flags; + __u16 lcm_entry_count; + /* lcm_mirror_count stores the number of actual mirrors minus 1, + * so that non-flr files will have value 0 meaning 1 mirror. */ + __u16 lcm_mirror_count; + __u16 lcm_padding1[3]; + __u64 lcm_padding2; + struct lov_comp_md_entry_v1 lcm_entries[0]; +} __attribute__((packed)); + +static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (stripes == (__u16)-1) + stripes = 0; + + if (lmm_magic == LOV_USER_MAGIC_V1) + return sizeof(struct lov_user_md_v1) + + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); +} + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . 
*/ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v2 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v2 { + struct lu_fid lmd_fid; /* Lustre FID */ + lstatx_t lmd_stx; /* MDS statx struct */ + __u64 lmd_flags; /* MDS stat flags */ + __u32 lmd_lmmsize; /* LOV EA size */ + __u32 lmd_padding; /* unused */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */ +} __attribute__((packed)); +#endif + +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +} __attribute__((packed, __may_alias__)); + +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, + LMV_HASH_TYPE_MAX, +}; + +#define LMV_HASH_NAME_ALL_CHARS "all_char" +#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" + +extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; + +/* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_user_mds_data lum_objects[0]; +} __attribute__((packed)); + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + int size = sizeof(struct lmv_user_md); + + if (lmm_magic == LMV_USER_MAGIC_SPECIFIC) + size += stripes * sizeof(struct lmv_user_mds_data); + + return size; +} + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline bool obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(const struct obd_uuid *uuid) +{ + if (uuid == NULL) + return NULL; + + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid->uuid)]; + + memcpy(temp, uuid->uuid, sizeof(*uuid->uuid) - 1); + temp[sizeof(*uuid->uuid) - 1] = '\0'; + + return temp; + } + return (char *)(uuid->uuid); +} + +#define LUSTRE_MAXFSNAME 8 + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. 
*/ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p != NULL) + *p = '\0'; +} + +/* printf display format for Lustre FIDs + * usage: printf("file FID is "DFID"\n", PFID(fid)); */ +#define FID_NOBRACE_LEN 40 +#define FID_LEN (FID_NOBRACE_LEN + 2) +#define DFID_NOBRACE "%#llx:0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver + +/* scanf input parse format for fids in DFID_NOBRACE format + * Need to strip '[' from DFID format first or use "["SFID"]" at caller. + * usage: sscanf(fidstr, SFID, RFID(&fid)); */ +#define SFID "0x%llx:0x%x:0x%x" +#define RFID(fid) (unsigned long long *)&((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) + +/********* Quotas **********/ + +#define LUSTRE_QUOTABLOCK_BITS 10 +#define LUSTRE_QUOTABLOCK_SIZE (1 << LUSTRE_QUOTABLOCK_BITS) + +static inline __u64 lustre_stoqb(__kernel_size_t space) +{ + return (space + LUSTRE_QUOTABLOCK_SIZE - 1) >> LUSTRE_QUOTABLOCK_BITS; +} + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ +#define LUSTRE_Q_GETDEFAULT 0x80000d /* get default quota */ +#define LUSTRE_Q_SETDEFAULT 0x80000e /* set default quota */ + +/* In the current Lustre implementation, the grace time is either the time + * or the timestamp to be used after some quota ID exceeds the soft limt, + * 48 bits should be enough, its high 16 bits can be used as quota flags. + * */ +#define LQUOTA_GRACE_BITS 48 +#define LQUOTA_GRACE_MASK ((1ULL << LQUOTA_GRACE_BITS) - 1) +#define LQUOTA_GRACE_MAX LQUOTA_GRACE_MASK +#define LQUOTA_GRACE(t) (t & LQUOTA_GRACE_MASK) +#define LQUOTA_FLAG(t) (t >> LQUOTA_GRACE_BITS) +#define LQUOTA_GRACE_FLAG(t, f) ((__u64)t | (__u64)f << LQUOTA_GRACE_BITS) + +/* different quota flags */ + +/* the default quota flag, the corresponding quota ID will use the default + * quota setting, the hardlimit and softlimit of its quota record in the global + * quota file will be set to 0, the low 48 bits of the grace will be set to 0 + * and high 16 bits will contain this flag (see above comment). 
+ * */ +#define LQUOTA_FLAG_DEFAULT 0x0001 + +#define ALLQUOTA 255 /* set all quota */ +static inline char *qtype_name(int qtype) +{ + switch (qtype) { + case USRQUOTA: + return "usr"; + case GRPQUOTA: + return "grp"; + case PRJQUOTA: + return "prj"; + } + return "unknown"; +} + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 +#define SEPOL_DOWNCALL_MAGIC 0x8b8bb842 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +struct sepol_downcall_data { + __u32 sdd_magic; + __s64 sdd_sepol_mtime; + __u16 sdd_sepol_len; + char sdd_sepol[0]; +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: ".^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 + +enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +}; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +#define SWAP_LAYOUTS_CLOSE (1 << 4) + +/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ +#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + +/** Bit-mask of valid attributes */ +/* The LA_* flags are written to disk as part of the ChangeLog records + * so they are part of the on-disk and network protocol, and cannot be changed. + * Only the first 12 bits are currently saved. 
+ */ +enum la_valid { + LA_ATIME = 1 << 0, /* 0x00001 */ + LA_MTIME = 1 << 1, /* 0x00002 */ + LA_CTIME = 1 << 2, /* 0x00004 */ + LA_SIZE = 1 << 3, /* 0x00008 */ + LA_MODE = 1 << 4, /* 0x00010 */ + LA_UID = 1 << 5, /* 0x00020 */ + LA_GID = 1 << 6, /* 0x00040 */ + LA_BLOCKS = 1 << 7, /* 0x00080 */ + LA_TYPE = 1 << 8, /* 0x00100 */ + LA_FLAGS = 1 << 9, /* 0x00200 */ + LA_NLINK = 1 << 10, /* 0x00400 */ + LA_RDEV = 1 << 11, /* 0x00800 */ + LA_BLKSIZE = 1 << 12, /* 0x01000 */ + LA_KILL_SUID = 1 << 13, /* 0x02000 */ + LA_KILL_SGID = 1 << 14, /* 0x04000 */ + LA_PROJID = 1 << 15, /* 0x08000 */ + LA_LAYOUT_VERSION = 1 << 16, /* 0x10000 */ + LA_LSIZE = 1 << 17, /* 0x20000 */ + LA_LBLOCKS = 1 << 18, /* 0x40000 */ + /** + * Attributes must be transmitted to OST objects + */ + LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION) +}; + +#define MDS_FMODE_READ 00000001 +#define MDS_FMODE_WRITE 00000002 + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */ +/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */ +/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */ + +#define MDS_OPEN_CREATED 00000010 +/* MDS_OPEN_CROSS 00000020 obsolete in 2.12, internal use only */ + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_NOIMPORT 020000000 /* nocache object create */ +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ +#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease + * delegation, succeed if it's not + * being opened with conflict mode. 
+ */ +#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ + +#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */ + +/* lustre internal open flags, which should not be set from user space */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE | MDS_OPEN_RESYNC) + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_NONE = -1, + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_SETXATTR = 15, + CL_XATTR = CL_SETXATTR, /* Deprecated name */ + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_MIGRATE = 20, + CL_FLRW = 21, /* FLR: file was firstly written */ + CL_RESYNC = 22, /* FLR: file was resync-ed */ + CL_GETXATTR = 23, + CL_DN_OPEN = 24, /* denied open */ + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT", + "FLRW", "RESYNC","GXATR", "NOPEN", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* 12 bits of per-record data can be stored in the bottom of the flags */ +#define CLF_FLAGSHIFT 12 +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_EXTRA_FLAGS = 0x8000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID | + CLF_EXTRA_FLAGS, + CLF_FLAGMASK = (1U << CLF_FLAGSHIFT) - 1, + CLF_VERMASK = ~CLF_FLAGMASK, +}; + + +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink + * of target */ +#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target + * has an archive in backend */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. 
*/ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, + /* Leaving HE_SPARE2 as is. Its referred in the Lemur code */ + HE_IMPORT = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return (enum hsm_event)CLF_GET_BITS(flags, CLF_HSM_EVENT_H, + CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(enum changelog_rec_flags *clf_flags, + enum hsm_event he) +{ + *clf_flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(enum changelog_rec_flags *clf_flags, + unsigned int bits) +{ + *clf_flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(enum changelog_rec_flags clf_flags) +{ + return CLF_GET_BITS(clf_flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(enum changelog_rec_flags *clf_flags, + unsigned int error) +{ + *clf_flags |= (error << CLF_HSM_ERR_L); +} + +enum changelog_rec_extra_flags { + CLFE_INVALID = 0, + CLFE_UIDGID = 0x0001, + CLFE_NID = 0x0002, + CLFE_OPEN = 0x0004, + CLFE_XATTR = 0x0008, + CLFE_SUPPORTED = CLFE_UIDGID | CLFE_NID | CLFE_OPEN | CLFE_XATTR +}; + +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = 0x01, + /* Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. */ + CHANGELOG_FLAG_BLOCK = 0x02, + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = 0x04, + /* Pack additional flag bits into the changelog record */ + CHANGELOG_FLAG_EXTRA_FLAGS = 0x08, +}; + +enum changelog_send_extra_flag { + /* Pack uid/gid into the changelog record */ + CHANGELOG_EXTRA_FLAG_UIDGID = 0x01, + /* Pack nid into the changelog record */ + CHANGELOG_EXTRA_FLAG_NID = 0x02, + /* Pack open mode into the changelog record */ + CHANGELOG_EXTRA_FLAG_OMODE = 0x04, + /* Pack xattr name into the changelog record */ + CHANGELOG_EXTRA_FLAG_XATTR = 0x08, +}; + +#define CR_MAXSIZE __ALIGN_KERNEL(2 * NAME_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED, \ + CLFE_SUPPORTED), 8) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 + +/* This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags and cr_extra_flags. + * + * Extensions are packed in the same order as their corresponding flags, + * then in the same order as their corresponding extra flags. + */ +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< \a changelog_rec_flags */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + struct lu_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + struct lu_fid cr_pfid; /**< parent fid */ +} __attribute__ ((packed)); + +/* Changelog extension for RENAME. 
*/ +struct changelog_ext_rename { + struct lu_fid cr_sfid; /**< source fid, or zero */ + struct lu_fid cr_spfid; /**< source parent fid, or zero */ +}; + +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ +}; + +/* Changelog extension to include additional flags. */ +struct changelog_ext_extra_flags { + __u64 cr_extra_flags; /* Additional CLFE_* flags */ +}; + +/* Changelog extra extension to include UID/GID. */ +struct changelog_ext_uidgid { + __u64 cr_uid; + __u64 cr_gid; +}; + +/* Changelog extra extension to include NID. */ +struct changelog_ext_nid { + /* have __u64 instead of lnet_nid_t type for use by client api */ + __u64 cr_nid; + /* for use when IPv6 support is added */ + __u64 extra; + __u32 padding; +}; + +/* Changelog extra extension to include low 32 bits of MDS_OPEN_* flags. */ +struct changelog_ext_openmode { + __u32 cr_openflags; +}; + +/* Changelog extra extension to include xattr */ +struct changelog_ext_xattr { + char cr_xattr[XATTR_NAME_MAX + 1]; /**< zero-terminated string. */ +}; + +static inline struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec); + +static inline __kernel_size_t changelog_rec_offset(enum changelog_rec_flags crf, + enum changelog_rec_extra_flags cref) +{ + __kernel_size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); + + if (crf & CLF_EXTRA_FLAGS) { + size += sizeof(struct changelog_ext_extra_flags); + if (cref & CLFE_UIDGID) + size += sizeof(struct changelog_ext_uidgid); + if (cref & CLFE_NID) + size += sizeof(struct changelog_ext_nid); + if (cref & CLFE_OPEN) + size += sizeof(struct changelog_ext_openmode); + if (cref & CLFE_XATTR) + size += sizeof(struct changelog_ext_xattr); + } + + return size; +} + +static inline __kernel_size_t changelog_rec_size(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + + return changelog_rec_offset(rec->cr_flags, cref); +} + +static inline __kernel_size_t changelog_rec_varsize(const struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME); + + return (struct changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The additional flags follow the rename and jobid extensions, if present */ +static inline +struct changelog_ext_extra_flags *changelog_rec_extra_flags( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID); + + return (struct changelog_ext_extra_flags *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The uid/gid is the first extra extension */ +static inline +struct changelog_ext_uidgid *changelog_rec_uidgid( + const struct 
changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + + return (struct changelog_ext_uidgid *)((char *)rec + + changelog_rec_offset(crf, + CLFE_INVALID)); +} + +/* The nid is the second extra extension */ +static inline +struct changelog_ext_nid *changelog_rec_nid(const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_UIDGID; + + return (struct changelog_ext_nid *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The OPEN mode is the third extra extension */ +static inline +struct changelog_ext_openmode *changelog_rec_openmode( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID); + + return (struct changelog_ext_openmode *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The xattr name is the fourth extra extension */ +static inline +struct changelog_ext_xattr *changelog_rec_xattr( + const struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME | CLF_JOBID | CLF_EXTRA_FLAGS); + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags & + (CLFE_UIDGID | CLFE_NID | CLFE_OPEN); + + return (struct changelog_ext_xattr *)((char *)rec + + changelog_rec_offset(crf, cref)); +} + +/* The name follows the rename, jobid and extra flags extns, if present */ +static inline char *changelog_rec_name(const struct changelog_rec *rec) +{ + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + if (rec->cr_flags & CLF_EXTRA_FLAGS) + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + + return (char *)rec + changelog_rec_offset(rec->cr_flags & CLF_SUPPORTED, + cref & CLFE_SUPPORTED); +} + +static inline char *changelog_rec_sname(const struct changelog_rec *rec) +{ + return strchrnul(changelog_rec_name(rec), '\0') + 1; +} + +static inline __kernel_size_t changelog_rec_snamelen(const struct changelog_rec *rec) +{ + return strlen(changelog_rec_sname(rec)); +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * - CLF_EXTRA_FLAGS will not be added without CLF_JOBID being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. 
+ * @param[in] cref_want Flags describing the desired extra extensions. + */ +static inline void changelog_remap_rec(struct changelog_rec *rec, + enum changelog_rec_flags crf_wanted, + enum changelog_rec_extra_flags cref_want) +{ + char *xattr_mov = NULL; + char *omd_mov = NULL; + char *nid_mov = NULL; + char *uidgid_mov = NULL; + char *ef_mov; + char *jid_mov; + char *rnm_mov; + enum changelog_rec_extra_flags cref = CLFE_INVALID; + + crf_wanted &= CLF_SUPPORTED; + cref_want &= CLFE_SUPPORTED; + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) { + if (!(rec->cr_flags & CLF_EXTRA_FLAGS) || + (rec->cr_flags & CLF_EXTRA_FLAGS && + (changelog_rec_extra_flags(rec)->cr_extra_flags & + CLFE_SUPPORTED) == + cref_want)) + return; + } + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted, cref_want), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of extensions in the remapped record */ + if (rec->cr_flags & CLF_EXTRA_FLAGS) { + xattr_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~CLFE_XATTR); + omd_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_OPEN | + CLFE_XATTR)); + nid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_NID | + CLFE_OPEN | + CLFE_XATTR)); + uidgid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & CLF_SUPPORTED, + cref_want & ~(CLFE_UIDGID | + CLFE_NID | + CLFE_OPEN | + CLFE_XATTR)); + cref = changelog_rec_extra_flags(rec)->cr_extra_flags; + } + + ef_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~CLF_EXTRA_FLAGS, + CLFE_INVALID); + jid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & + ~(CLF_EXTRA_FLAGS | CLF_JOBID), + CLFE_INVALID); + rnm_mov = (char *)rec + + changelog_rec_offset(crf_wanted & + ~(CLF_EXTRA_FLAGS | + CLF_JOBID | + CLF_RENAME), + CLFE_INVALID); + + /* Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_EXTRA_FLAGS) && + (rec->cr_flags & CLF_EXTRA_FLAGS)) { + if ((cref_want & CLFE_XATTR) && (cref & CLFE_XATTR)) + memmove(xattr_mov, changelog_rec_xattr(rec), + sizeof(struct changelog_ext_xattr)); + + if ((cref_want & CLFE_OPEN) && (cref & CLFE_OPEN)) + memmove(omd_mov, changelog_rec_openmode(rec), + sizeof(struct changelog_ext_openmode)); + + if ((cref_want & CLFE_NID) && (cref & CLFE_NID)) + memmove(nid_mov, changelog_rec_nid(rec), + sizeof(struct changelog_ext_nid)); + + if ((cref_want & CLFE_UIDGID) && (cref & CLFE_UIDGID)) + memmove(uidgid_mov, changelog_rec_uidgid(rec), + sizeof(struct changelog_ext_uidgid)); + + memmove(ef_mov, changelog_rec_extra_flags(rec), + sizeof(struct changelog_ext_extra_flags)); + } + + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if (xattr_mov && (cref_want & CLFE_XATTR) && + !(cref & CLFE_XATTR)) + memset(xattr_mov, 0, sizeof(struct changelog_ext_xattr)); + + if (omd_mov && (cref_want & CLFE_OPEN) && + !(cref & CLFE_OPEN)) + memset(omd_mov, 0, sizeof(struct changelog_ext_openmode)); + + if (nid_mov && (cref_want & CLFE_NID) && + !(cref & CLFE_NID)) + memset(nid_mov, 0, sizeof(struct changelog_ext_nid)); + + if (uidgid_mov && (cref_want & CLFE_UIDGID) && + !(cref & CLFE_UIDGID)) + memset(uidgid_mov, 0, 
sizeof(struct changelog_ext_uidgid)); + + if ((crf_wanted & CLF_EXTRA_FLAGS) && + !(rec->cr_flags & CLF_EXTRA_FLAGS)) + memset(ef_mov, 0, sizeof(struct changelog_ext_extra_flags)); + + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; + if (rec->cr_flags & CLF_EXTRA_FLAGS) + changelog_rec_extra_flags(rec)->cr_extra_flags = + changelog_rec_extra_flags(rec)->cr_extra_flags | + cref_want; +} + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u32 idv_layout_version; /* FLR: layout version for OST objects */ + __u32 idv_flags; /* enum ioc_data_version_flags */ +}; + +enum ioc_data_version_flags { + LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */ + LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */ +}; + +#ifndef offsetof +#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_NONE = 0x00000000, + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_NONE = 0, + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; + +static inline const char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. 
+ * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14, /* cancel a request */ + HUA_IMPORT = 15, /* add a new file */ +}; + +static inline const char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + case HUA_IMPORT: return "IMPORT"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + struct lu_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** + * Compute the current length of the provided hsm_user_request. This returns -1 + * instead of an errno because __kernel_ssize_t is defined to be only + * [ -1, SSIZE_MAX ] + * + * return -1 on bounds check error. 
+ */ +static inline __kernel_size_t hur_len(struct hsm_user_request *hur) +{ + __u64 size; + + /* can't overflow a __u64 since hr_itemcount is only __u32 */ + size = offsetof(struct hsm_user_request, hur_user_item[0]) + + (__u64)hur->hur_request.hr_itemcount * + sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; + + if ((__kernel_ssize_t)size < 0) + return -1; + + return size; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23, + HSMA_IMPORT = 24 +}; + +static inline const char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + case HSMA_IMPORT: return "IMPORT"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + struct lu_fid hai_fid; /* Lustre FID to operate on */ + struct lu_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/** + * helper function which print in hexa the first bytes of + * hai opaque field + * + * \param hai [IN] record to print + * \param buffer [IN,OUT] buffer to write the hex string to + * \param len [IN] max buffer length + * + * \retval buffer + */ +static inline char *hai_dump_data_field(const struct hsm_action_item *hai, + char *buffer, __kernel_size_t len) +{ + int i; + int data_len; + char *ptr; + + ptr = buffer; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0; (i < data_len) && (len > 2); i++) { + snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); + ptr += 2; + len -= 2; + } + + *ptr = '\0'; + + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator, ignored */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
See hai_zero */ +} __attribute__((packed)); + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) +{ + __kernel_size_t offset = __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + + return (struct hsm_action_item *)(hal->hal_fsname + offset); +} + +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + __kernel_size_t offset = __ALIGN_KERNEL(hai->hai_len, 8); + + return (struct hsm_action_item *)((char *)hai + offset); +} + +/* Return size of an hsm_action_list */ +static inline __kernel_size_t hal_size(struct hsm_action_list *hal) +{ + __u32 i; + __kernel_size_t sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + __ALIGN_KERNEL(strlen(hal->hal_fsname) + 1, 8); + hai = hai_first(hal); + for (i = 0; i < hal->hal_count ; i++, hai = hai_next(hai)) + sz += __ALIGN_KERNEL(hai->hai_len, 8); + + return sz; +} + +/* HSM file import + * describe the attributes to be set on imported file + */ +struct hsm_user_import { + __u64 hui_size; + __u64 hui_atime; + __u64 hui_mtime; + __u32 hui_atime_ns; + __u32 hui_mtime_ns; + __u32 hui_uid; + __u32 hui_gid; + __u32 hui_mode; + __u32 hui_archive_id; +}; + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + struct lu_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/* JSON objects */ +enum llapi_json_types { + LLAPI_JSON_INTEGER = 1, + LLAPI_JSON_BIGNUM, + LLAPI_JSON_REAL, + LLAPI_JSON_STRING +}; + +struct llapi_json_item { + char *lji_key; + __u32 lji_type; + union { + int lji_integer; + __u64 lji_u64; + double lji_real; + char *lji_string; + }; + struct llapi_json_item *lji_next; +}; + +struct llapi_json_item_list { + int ljil_item_count; + struct llapi_json_item *ljil_items; +}; + +enum lu_ladvise_type { + LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, + LU_LADVISE_DONTNEED = 2, + LU_LADVISE_LOCKNOEXPAND = 3, + LU_LADVISE_LOCKAHEAD = 4, + LU_LADVISE_MAX +}; + +#define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ + [LU_LADVISE_DONTNEED] = "dontneed", \ + [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \ + [LU_LADVISE_LOCKAHEAD] = "lockahead", \ +} + +/* This is the userspace argument for ladvise. It is currently the same as + * what goes on the wire (struct lu_ladvise), but is defined separately as we + * may need info which is only used locally. 
*/ +struct llapi_lu_ladvise { + __u16 lla_advice; /* advice type */ + __u16 lla_value1; /* values for different advice types */ + __u32 lla_value2; + __u64 lla_start; /* first byte of extent for advice */ + __u64 lla_end; /* last byte of extent for advice */ + __u32 lla_value3; + __u32 lla_value4; +}; + +enum ladvise_flag { + LF_ASYNC = 0x00000001, + LF_UNSET = 0x00000002, +}; + +#define LADVISE_MAGIC 0x1ADF1CE0 +/* Masks of valid flags for each advice */ +#define LF_LOCKNOEXPAND_MASK LF_UNSET +/* Flags valid for all advices not explicitly specified */ +#define LF_DEFAULT_MASK LF_ASYNC +/* All flags */ +#define LF_MASK (LF_ASYNC | LF_UNSET) + +#define lla_lockahead_mode lla_value1 +#define lla_peradvice_flags lla_value2 +#define lla_lockahead_result lla_value3 + +/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which + * is used on the wire. It is defined separately as we may need info which is + * only used locally. */ +struct llapi_ladvise_hdr { + __u32 lah_magic; /* LADVISE_MAGIC */ + __u32 lah_count; /* number of advices */ + __u64 lah_flags; /* from enum ladvise_flag */ + __u32 lah_value1; /* unused */ + __u32 lah_value2; /* unused */ + __u64 lah_value3; /* unused */ + struct llapi_lu_ladvise lah_advise[0]; /* advices in this header */ +}; + +#define LAH_COUNT_MAX (1024) + +/* Shared key */ +enum sk_crypt_alg { + SK_CRYPT_INVALID = -1, + SK_CRYPT_EMPTY = 0, + SK_CRYPT_AES256_CTR = 1, +}; + +enum sk_hmac_alg { + SK_HMAC_INVALID = -1, + SK_HMAC_EMPTY = 0, + SK_HMAC_SHA256 = 1, + SK_HMAC_SHA512 = 2, +}; + +struct sk_crypt_type { + const char *sct_name; + int sct_type; +}; + +struct sk_hmac_type { + const char *sht_name; + int sht_type; +}; + +enum lock_mode_user { + MODE_READ_USER = 1, + MODE_WRITE_USER, + MODE_MAX_USER, +}; + +#define LOCK_MODE_NAMES { \ + [MODE_READ_USER] = "READ",\ + [MODE_WRITE_USER] = "WRITE"\ +} + +enum lockahead_results { + LLA_RESULT_SENT = 0, + LLA_RESULT_DIFFERENT, + LLA_RESULT_SAME, +}; + +struct fid_array { + __u32 fa_nr; + /* make header's size equal lu_fid */ + __u32 fa_padding0; + __u64 fa_padding1; + struct lu_fid fa_fids[0]; +}; +#define OBD_MAX_FIDS_IN_ARRAY 4096 + +#if defined(__cplusplus) +} +#endif + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_ver.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h similarity index 83% rename from drivers/staging/lustrefsx/lustre/include/lustre_ver.h rename to drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h index 0557c2dd554e5..90aa25d8aab8a 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_ver.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_ver.h @@ -23,15 +23,9 @@ #define LUSTRE_VERSION_CODE \ OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) -/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches - * by this amount (set in lustre/autoconf/lustre-version.ac). */ -#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) - -#ifdef __KERNEL__ /* If lustre version of client and servers it connects to differs by more * than this amount, client would issue a warning. 
* (set in lustre/autoconf/lustre-version.ac) */ #define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 50, 0) -#endif #endif diff --git a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h index accc4495d156e..1f02294b9660d 100644 --- a/drivers/staging/lustrefsx/lustre/include/upcall_cache.h +++ b/drivers/staging/lustrefsx/lustre/include/upcall_cache.h @@ -34,7 +34,7 @@ #define _UPCALL_CACHE_H #include -#include +#include /** \defgroup ucache ucache * @@ -85,8 +85,8 @@ struct upcall_cache_entry { atomic_t ue_refcount; int ue_flags; wait_queue_head_t ue_waitq; - cfs_time_t ue_acquire_expire; - cfs_time_t ue_expire; + time64_t ue_acquire_expire; + time64_t ue_expire; union { struct md_identity identity; } u; @@ -121,8 +121,8 @@ struct upcall_cache { char uc_name[40]; /* for upcall */ char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; - int uc_acquire_expire; /* seconds */ - int uc_entry_expire; /* seconds */ + time64_t uc_acquire_expire; /* seconds */ + time64_t uc_entry_expire; /* seconds */ struct upcall_cache_ops *uc_ops; }; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c index 7dd0c65332649..b39b105a894e6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/interval_tree.c @@ -36,11 +36,8 @@ * Author: Huang Wei * Author: Jay Xiong */ -#ifdef __KERNEL__ -# include -#else -# include -#endif + +#include #include enum { diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c index 1088d583145e7..59d1302a36516 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_extent.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2013, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -269,38 +269,49 @@ ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, static void ldlm_extent_policy(struct ldlm_resource *res, struct ldlm_lock *lock, __u64 *flags) { - struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; - - if (lock->l_export == NULL) - /* - * this is local lock taken by server (e.g., as a part of - * OST-side locking, or unlink handling). Expansion doesn't - * make a lot of sense for local locks, because they are - * dropped immediately on operation completion and would only - * conflict with other threads. - */ - return; + struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF }; + + if (lock->l_export == NULL) + /* + * this is a local lock taken by server (e.g., as a part of + * OST-side locking, or unlink handling). Expansion doesn't + * make a lot of sense for local locks, because they are + * dropped immediately on operation completion and would only + * conflict with other threads. 
+ */ + return; - if (lock->l_policy_data.l_extent.start == 0 && - lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) - /* fast-path whole file locks */ - return; + if (lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + /* fast-path whole file locks */ + return; - ldlm_extent_internal_policy_granted(lock, &new_ex); - ldlm_extent_internal_policy_waiting(lock, &new_ex); + /* Because reprocess_queue zeroes flags and uses it to return + * LDLM_FL_LOCK_CHANGED, we must check for the NO_EXPANSION flag + * in the lock flags rather than the 'flags' argument */ + if (likely(!(lock->l_flags & LDLM_FL_NO_EXPANSION))) { + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); + } else { + LDLM_DEBUG(lock, "Not expanding manually requested lock.\n"); + new_ex.start = lock->l_policy_data.l_extent.start; + new_ex.end = lock->l_policy_data.l_extent.end; + /* In case the request is not on correct boundaries, we call + * fixup. (normally called in ldlm_extent_internal_policy_*) */ + ldlm_extent_internal_policy_fixup(lock, &new_ex, 0); + } - if (new_ex.start != lock->l_policy_data.l_extent.start || - new_ex.end != lock->l_policy_data.l_extent.end) { - *flags |= LDLM_FL_LOCK_CHANGED; - lock->l_policy_data.l_extent.start = new_ex.start; - lock->l_policy_data.l_extent.end = new_ex.end; - } + if (!ldlm_extent_equal(&new_ex, &lock->l_policy_data.l_extent)) { + *flags |= LDLM_FL_LOCK_CHANGED; + lock->l_policy_data.l_extent.start = new_ex.start; + lock->l_policy_data.l_extent.end = new_ex.end; + } } static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) { struct ldlm_resource *res = lock->l_resource; - cfs_time_t now = cfs_time_current(); + time64_t now = ktime_get_seconds(); if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_SET_CONTENTION)) return 1; @@ -308,8 +319,9 @@ static int ldlm_check_contention(struct ldlm_lock *lock, int contended_locks) CDEBUG(D_DLMTRACE, "contended locks = %d\n", contended_locks); if (contended_locks > ldlm_res_to_ns(res)->ns_contended_locks) res->lr_contention_time = now; - return cfs_time_before(now, cfs_time_add(res->lr_contention_time, - cfs_time_seconds(ldlm_res_to_ns(res)->ns_contention_time))); + + return now < res->lr_contention_time + + ldlm_res_to_ns(res)->ns_contention_time; } struct ldlm_extent_compat_args { @@ -421,7 +433,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } if (tree->lit_mode == LCK_GROUP) { - if (*flags & LDLM_FL_BLOCK_NOWAIT) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT | + LDLM_FL_SPECULATIVE)) { compat = -EWOULDBLOCK; goto destroylock; } @@ -438,10 +451,24 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, continue; } - if (!work_list) { - rc = interval_is_overlapped(tree->lit_root,&ex); - if (rc) - RETURN(0); + /* We've found a potentially blocking lock, check + * compatibility. This handles locks other than GROUP + * locks, which are handled separately above. + * + * Locks with FL_SPECULATIVE are asynchronous requests + * which must never wait behind another lock, so they + * fail if any conflicting lock is found. 
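/*
 * Illustrative sketch, not part of the patch: the effect of the two
 * "do not wait" flags as used in ldlm_extent_compat_queue() above.  Both a
 * plain nonblocking request (LDLM_FL_BLOCK_NOWAIT) and a speculative one
 * (LDLM_FL_SPECULATIVE) turn any conflict into -EWOULDBLOCK instead of
 * queueing behind it.  The flag values are placeholders, not the real bit
 * definitions.
 */
#include <errno.h>

#define SKETCH_FL_BLOCK_NOWAIT	0x1ULL
#define SKETCH_FL_SPECULATIVE	0x2ULL

static int sketch_conflict_result(unsigned long long flags, int has_conflict)
{
	if (!has_conflict)
		return 0;		/* compatible: may be granted */
	if (flags & (SKETCH_FL_BLOCK_NOWAIT | SKETCH_FL_SPECULATIVE))
		return -EWOULDBLOCK;	/* caller must not be made to wait */
	return 1;			/* queue it and send blocking ASTs */
}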
*/ + if (!work_list || (*flags & LDLM_FL_SPECULATIVE)) { + rc = interval_is_overlapped(tree->lit_root, + &ex); + if (rc) { + if (!work_list) { + RETURN(0); + } else { + compat = -EWOULDBLOCK; + goto destroylock; + } + } } else { interval_search(tree->lit_root, &ex, ldlm_extent_compat_cb, &data); @@ -528,8 +555,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, lock->l_policy_data.l_extent.gid) { /* If existing lock with matched gid is granted, we grant new one too. */ - if (lock->l_req_mode == lock->l_granted_mode) - RETURN(2); + if (ldlm_is_granted(lock)) + RETURN(2); /* Otherwise we are scanning queue of waiting * locks and it means current request would @@ -537,7 +564,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, * already blocked. * If we are in nonblocking mode - return * immediately */ - if (*flags & LDLM_FL_BLOCK_NOWAIT) { + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { compat = -EWOULDBLOCK; goto destroylock; } @@ -556,8 +584,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } } - if (unlikely(req_mode == LCK_GROUP && - (lock->l_req_mode != lock->l_granted_mode))) { + if (unlikely(req_mode == LCK_GROUP && + !ldlm_is_granted(lock))) { scan = 1; compat = 0; if (lock->l_req_mode != LCK_GROUP) { @@ -580,10 +608,11 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, } if (unlikely(lock->l_req_mode == LCK_GROUP)) { - /* If compared lock is GROUP, then requested is PR/PW/ - * so this is not compatible; extent range does not - * matter */ - if (*flags & LDLM_FL_BLOCK_NOWAIT) { + /* If compared lock is GROUP, then requested is + * PR/PW so this is not compatible; extent + * range does not matter */ + if (*flags & (LDLM_FL_BLOCK_NOWAIT + | LDLM_FL_SPECULATIVE)) { compat = -EWOULDBLOCK; goto destroylock; } else { @@ -602,6 +631,11 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, if (!work_list) RETURN(0); + if (*flags & LDLM_FL_SPECULATIVE) { + compat = -EWOULDBLOCK; + goto destroylock; + } + /* don't count conflicting glimpse locks */ if (lock->l_req_mode == LCK_PR && lock->l_policy_data.l_extent.start == 0 && @@ -642,7 +676,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, void ldlm_lock_prolong_one(struct ldlm_lock *lock, struct ldlm_prolong_args *arg) { - int timeout; + time64_t timeout; OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PROLONG_PAUSE, 3); @@ -662,7 +696,7 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock, */ timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); - LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout); + LDLM_DEBUG(lock, "refreshed to %llds.\n", timeout); arg->lpa_blocks_cnt++; @@ -752,25 +786,24 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head rpc_list; int rc, rc2; int contended_locks = 0; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? 
+ NULL : work_list; ENTRY; - LASSERT(lock->l_granted_mode != lock->l_req_mode); - LASSERT(list_empty(&res->lr_converting)); + LASSERT(!ldlm_is_granted(lock)); LASSERT(!(*flags & LDLM_FL_DENY_ON_CONTENTION) || !ldlm_is_ast_discard_data(lock)); - INIT_LIST_HEAD(&rpc_list); check_res_locked(res); *err = ELDLM_OK; if (intention == LDLM_PROCESS_RESCAN) { - /* Careful observers will note that we don't handle -EWOULDBLOCK - * here, but it's ok for a non-obvious reason -- compat_queue - * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT). - * flags should always be zero here, and if that ever stops - * being true, we want to find out. */ + /* Careful observers will note that we don't handle -EWOULDBLOCK + * here, but it's ok for a non-obvious reason -- compat_queue + * can only return -EWOULDBLOCK if (flags & BLOCK_NOWAIT | + * SPECULATIVE). flags should always be zero here, and if that + * ever stops being true, we want to find out. */ LASSERT(*flags == 0); rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, NULL, &contended_locks); @@ -786,49 +819,38 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE)) ldlm_extent_policy(res, lock, flags); - ldlm_grant_lock(lock, work_list); + ldlm_grant_lock(lock, grant_work); RETURN(LDLM_ITER_CONTINUE); } - LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || - (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); - restart: contended_locks = 0; rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, - &rpc_list, &contended_locks); + work_list, &contended_locks); if (rc < 0) GOTO(out_rpc_list, rc); rc2 = 0; if (rc != 2) { rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, - flags, err, &rpc_list, + flags, err, work_list, &contended_locks); if (rc2 < 0) GOTO(out_rpc_list, rc = rc2); } - if (rc + rc2 != 2) { - /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to force - * client to wait for the lock endlessly once the lock is - * enqueued -bzzz */ - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, - LDLM_FL_NO_TIMEOUT); - if (rc == -ERESTART) - GOTO(restart, rc); - *err = rc; - } else { + if (rc + rc2 == 2) { ldlm_extent_policy(res, lock, flags); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); - rc = 0; + ldlm_grant_lock(lock, grant_work); + } else { + /* Adding LDLM_FL_NO_TIMEOUT flag to granted lock to + * force client to wait for the lock endlessly once + * the lock is enqueued -bzzz */ + *flags |= LDLM_FL_NO_TIMEOUT; } + rc = LDLM_ITER_CONTINUE; out_rpc_list: - if (!list_empty(&rpc_list)) { - LASSERT(!ldlm_is_ast_discard_data(lock)); - ldlm_discard_bl_list(&rpc_list); - } RETURN(rc); } #endif /* HAVE_SERVER_SUPPORT */ @@ -943,7 +965,7 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) EXPORT_SYMBOL(ldlm_extent_shift_kms); struct kmem_cache *ldlm_interval_slab; -struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +static struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) { struct ldlm_interval *node; ENTRY; @@ -1004,6 +1026,14 @@ static inline int ldlm_mode_to_index(enum ldlm_mode mode) return index; } +int ldlm_extent_alloc_lock(struct ldlm_lock *lock) +{ + lock->l_tree_node = NULL; + if (ldlm_interval_alloc(lock) == NULL) + return -ENOMEM; + return 0; +} + /** Add newly granted lock into interval tree for the resource. 
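/*
 * Illustrative sketch, not part of the patch: the grant_work/work_list split
 * introduced in ldlm_process_extent_lock() above.  Blocking ASTs for
 * conflicting locks always go to work_list; the grant of the new lock is
 * only queued when the lock is being reprocessed rather than freshly
 * enqueued, and an unresolved conflict now just tags the lock with a
 * "no timeout" flag (placeholder value below) instead of building a private
 * rpc_list.
 */
#include <linux/list.h>

#define SKETCH_FL_NO_TIMEOUT	0x1ULL

enum sketch_intention { SKETCH_ENQUEUE, SKETCH_RESCAN, SKETCH_RECOVERY };

static void sketch_process_extent(int compat_granted, int compat_waiting,
				  unsigned long long *flags,
				  struct list_head *work_list,
				  enum sketch_intention intention,
				  void (*grant)(struct list_head *))
{
	struct list_head *grant_work =
		intention == SKETCH_ENQUEUE ? NULL : work_list;

	if (compat_granted + compat_waiting == 2)
		grant(grant_work);		/* no conflicts anywhere */
	else
		*flags |= SKETCH_FL_NO_TIMEOUT;	/* wait without a timeout */
}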
*/ void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock) @@ -1013,7 +1043,7 @@ void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_extent *extent; int idx, rc; - LASSERT(lock->l_granted_mode == lock->l_req_mode); + LASSERT(ldlm_is_granted(lock)); node = lock->l_tree_node; LASSERT(node != NULL); diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c index b3d669799ceba..be849938cc6c6 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_flock.c @@ -27,7 +27,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2014, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -289,6 +289,8 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int overlaps = 0; int splitted = 0; const struct ldlm_callback_suite null_cbs = { NULL }; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; ENTRY; CDEBUG(D_DLMTRACE, "flags %#llx owner %llu pid %u mode %u start " @@ -348,7 +350,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, reprocess_failed = 1; if (ldlm_flock_deadlock(req, lock)) { ldlm_flock_cancel_on_deadlock(req, - work_list); + grant_work); RETURN(LDLM_ITER_CONTINUE); } continue; @@ -579,7 +581,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, restart: ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, - LDLM_PROCESS_RESCAN); + LDLM_PROCESS_RESCAN, NULL); unlock_res_and_lock(req); rc = ldlm_run_ast_work(ns, &rpc_list, @@ -590,7 +592,7 @@ ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, } } else { LASSERT(req->l_completion_ast); - ldlm_add_ast_work_item(req, NULL, work_list); + ldlm_add_ast_work_item(req, NULL, grant_work); } #else /* !HAVE_SERVER_SUPPORT */ /* The only one possible case for client-side calls flock @@ -742,7 +744,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) RETURN(-EIO); } - /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ ldlm_resource_unlink_lock(lock); /* Import invalidation. We need to actually release the lock @@ -757,7 +759,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) LASSERT(ldlm_is_test_lock(lock)); if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) - mode = flock_type(getlk); + mode = getlk->fl_type; else mode = lock->l_granted_mode; @@ -780,27 +782,26 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) LDLM_DEBUG(lock, "client-side enqueue granted"); if (flags & LDLM_FL_TEST_LOCK) { - /* fcntl(F_GETLK) request */ - /* The old mode was saved in getlk->fl_type so that if the mode - * in the lock changes we can decref the appropriate refcount.*/ + /* + * fcntl(F_GETLK) request + * The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount. 
+ */ LASSERT(ldlm_is_test_lock(lock)); - ldlm_flock_destroy(lock, flock_type(getlk), - LDLM_FL_WAIT_NOREPROC); + ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); switch (lock->l_granted_mode) { case LCK_PR: - flock_set_type(getlk, F_RDLCK); + getlk->fl_type = F_RDLCK; break; case LCK_PW: - flock_set_type(getlk, F_WRLCK); + getlk->fl_type = F_WRLCK; break; default: - flock_set_type(getlk, F_UNLCK); + getlk->fl_type = F_UNLCK; } - flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid); - flock_set_start(getlk, - (loff_t)lock->l_policy_data.l_flock.start); - flock_set_end(getlk, - (loff_t)lock->l_policy_data.l_flock.end); + getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; + getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; } else { __u64 noreproc = LDLM_FL_WAIT_NOREPROC; diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c index 90e34a612d7c8..c407cf676fba8 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_inodebits.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,6 +57,89 @@ #include "ldlm_internal.h" #ifdef HAVE_SERVER_SUPPORT + +/** + * It should iterate through all waiting locks on a given resource queue and + * attempt to grant them. An optimization is to check only heads waitintg + * locks for each inodebit type. + * + * Must be called with resource lock held. + */ +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint) +{ + __u64 flags; + int rc = LDLM_ITER_CONTINUE; + enum ldlm_error err; + struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); + struct ldlm_ibits_queues *queues = res->lr_ibits_queues; + int i; + + ENTRY; + + check_res_locked(res); + + LASSERT(res->lr_type == LDLM_IBITS); + LASSERT(intention == LDLM_PROCESS_RESCAN || + intention == LDLM_PROCESS_RECOVERY); + + if (intention == LDLM_PROCESS_RECOVERY) + return ldlm_reprocess_queue(res, queue, work_list, intention, + NULL); + +restart: + CDEBUG(D_DLMTRACE, "--- Reprocess resource "DLDLMRES" (%p)\n", + PLDLMRES(res), res); + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + struct list_head *head = &queues->liq_waiting[i]; + struct ldlm_lock *pending; + struct ldlm_ibits_node *node; + + if (list_empty(head)) + continue; + if (hint && !(hint->l_policy_data.l_inodebits.bits & (1 << i))) + continue; + + node = list_entry(head->next, struct ldlm_ibits_node, + lin_link[i]); + + pending = node->lock; + LDLM_DEBUG(pending, "Reprocessing lock from queue %d", i); + + flags = 0; + rc = ldlm_process_inodebits_lock(pending, &flags, intention, + &err, &rpc_list); + if (ldlm_is_granted(pending)) { + list_splice(&rpc_list, work_list); + /* Try to grant more locks from current queue */ + i--; + } else { + list_splice(&rpc_list, &bl_ast_list); + } + } + + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if 
(!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + + RETURN(rc); +} + /** * Determine if the lock is compatible with all locks on the queue. * @@ -79,12 +162,18 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, struct list_head *tmp; struct ldlm_lock *lock; __u64 req_bits = req->l_policy_data.l_inodebits.bits; + __u64 *try_bits = &req->l_policy_data.l_inodebits.try_bits; int compat = 1; + ENTRY; - /* There is no sense in lock with no bits set, I think. - * Also, such a lock would be compatible with any other bit lock */ - LASSERT(req_bits != 0); + /* There is no sense in lock with no bits set. Also such a lock + * would be compatible with any other bit lock. + * Meanwhile that can be true if there were just try_bits and all + * are failed, so just exit gracefully and let the caller to care. + */ + if ((req_bits | *try_bits) == 0) + RETURN(0); list_for_each(tmp, queue) { struct list_head *mode_tail; @@ -99,11 +188,10 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* last lock in mode group */ LASSERT(lock->l_sl_mode.prev != NULL); - mode_tail = &list_entry(lock->l_sl_mode.prev, - struct ldlm_lock, + mode_tail = &list_entry(lock->l_sl_mode.prev, struct ldlm_lock, l_sl_mode)->l_res_link; - /* if reqest lock is not COS_INCOMPAT and COS is disabled, + /* if request lock is not COS_INCOMPAT and COS is disabled, * they are compatible, IOW this request is from a local * transaction on a DNE system. */ if (lock->l_req_mode == LCK_COS && !ldlm_is_cos_incompat(req) && @@ -125,8 +213,24 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* Advance loop cursor to last lock in policy group. */ tmp = &list_entry(lock->l_sl_policy.prev, - struct ldlm_lock, - l_sl_policy)->l_res_link; + struct ldlm_lock, + l_sl_policy)->l_res_link; + + /* New lock's try_bits are filtered out by ibits + * of all locks in both granted and waiting queues. + */ + *try_bits &= ~(lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits); + + if ((req_bits | *try_bits) == 0) + RETURN(0); + + /* The new lock ibits is more preferable than try_bits + * of waiting locks so drop conflicting try_bits in + * the waiting queue. + * Notice that try_bits of granted locks must be zero. + */ + lock->l_policy_data.l_inodebits.try_bits &= ~req_bits; /* Locks with overlapping bits conflict. */ if (lock->l_policy_data.l_inodebits.bits & req_bits) { @@ -138,6 +242,7 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, ldlm_is_cos_enabled(req) && lock->l_client_cookie == req->l_client_cookie) goto not_conflicting; + /* Found a conflicting policy group. 
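/*
 * Illustrative sketch, not part of the patch: how the optional try_bits are
 * whittled down while ldlm_inodebits_compat_queue() above walks the granted
 * and waiting queues.  Optional bits quietly give way to anything an
 * existing lock holds or is also trying for; only the mandatory req_bits can
 * produce a real conflict.  Plain integers stand in for the lock structures.
 */
static int sketch_ibits_filter(unsigned long long req_bits,
			       unsigned long long *try_bits,
			       unsigned long long held_bits,
			       unsigned long long held_try_bits)
{
	/* optional bits are filtered out by every lock already in the queues */
	*try_bits &= ~(held_bits | held_try_bits);

	/* nothing left to ask for at all: give up gracefully */
	if ((req_bits | *try_bits) == 0)
		return 0;

	/* overlapping mandatory bits are a genuine conflict */
	return (held_bits & req_bits) ? -1 : 1;
}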
*/ if (!work_list) RETURN(0); @@ -146,22 +251,21 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, /* Add locks of the policy group to @work_list * as blocking locks for @req */ - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, - work_list); - head = &lock->l_sl_policy; + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + head = &lock->l_sl_policy; list_for_each_entry(lock, head, l_sl_policy) - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, - work_list); - } - not_conflicting: - if (tmp == mode_tail) - break; - - tmp = tmp->next; - lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, + req, work_list); + } +not_conflicting: + if (tmp == mode_tail) + break; + + tmp = tmp->next; + lock = list_entry(tmp, struct ldlm_lock, l_res_link); } /* Loop over policy groups within one mode group. */ } /* Loop over mode groups within @queue. */ @@ -182,57 +286,95 @@ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head rpc_list; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; int rc; + ENTRY; - LASSERT(lock->l_granted_mode != lock->l_req_mode); - LASSERT(list_empty(&res->lr_converting)); - INIT_LIST_HEAD(&rpc_list); + LASSERT(!ldlm_is_granted(lock)); check_res_locked(res); - /* (*flags & LDLM_FL_BLOCK_NOWAIT) is for layout lock right now. */ - if (intention == LDLM_PROCESS_RESCAN || - (*flags & LDLM_FL_BLOCK_NOWAIT)) { - *err = ELDLM_LOCK_ABORTED; - if (*flags & LDLM_FL_BLOCK_NOWAIT) + if (intention == LDLM_PROCESS_RESCAN) { + struct list_head *bl_list; + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + bl_list = NULL; *err = ELDLM_LOCK_WOULDBLOCK; + } else { + bl_list = work_list; + *err = ELDLM_LOCK_ABORTED; + } - rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, NULL); - if (!rc) - RETURN(LDLM_ITER_STOP); - rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); - if (!rc) - RETURN(LDLM_ITER_STOP); + LASSERT(lock->l_policy_data.l_inodebits.bits != 0); - ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); + /* It is possible that some of granted locks was not canceled + * but converted and is kept in granted queue. So there is + * a window where lock with 'ast_sent' might become granted + * again. Meanwhile a new lock may appear in that window and + * conflicts with the converted lock so the following scenario + * is possible: + * + * 1) lock1 conflicts with lock2 + * 2) bl_ast was sent for lock2 + * 3) lock3 comes and conflicts with lock2 too + * 4) no bl_ast sent because lock2->l_bl_ast_sent is 1 + * 5) lock2 was converted for lock1 but not for lock3 + * 6) lock1 granted, lock3 still is waiting for lock2, but + * there will never be another bl_ast for that + * + * To avoid this scenario the work_list is used below to collect + * any blocked locks from granted queue during every reprocess + * and bl_ast will be sent if needed. 
+ */ + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, + bl_list); + if (!rc) + RETURN(LDLM_ITER_STOP); + rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL); + if (!rc) + RETURN(LDLM_ITER_STOP); + + /* grant also try_bits if any */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *flags |= LDLM_FL_LOCK_CHANGED; + } + ldlm_resource_unlink_lock(lock); + ldlm_grant_lock(lock, grant_work); *err = ELDLM_OK; RETURN(LDLM_ITER_CONTINUE); } - LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || - (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); - restart: - rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, &rpc_list); - rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, &rpc_list); + rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, work_list); - if (rc != 2) { - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); - if (rc == -ERESTART) - GOTO(restart, rc); - *err = rc; + if (rc != 2) { + /* if there were only bits to try and all are conflicting */ + if ((lock->l_policy_data.l_inodebits.bits | + lock->l_policy_data.l_inodebits.try_bits) == 0) { + *err = ELDLM_LOCK_WOULDBLOCK; + } else { + *err = ELDLM_OK; + } } else { + /* grant also all remaining try_bits */ + if (lock->l_policy_data.l_inodebits.try_bits != 0) { + lock->l_policy_data.l_inodebits.bits |= + lock->l_policy_data.l_inodebits.try_bits; + lock->l_policy_data.l_inodebits.try_bits = 0; + *flags |= LDLM_FL_LOCK_CHANGED; + } + LASSERT(lock->l_policy_data.l_inodebits.bits); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); - rc = 0; + ldlm_grant_lock(lock, grant_work); + *err = ELDLM_OK; } - if (!list_empty(&rpc_list)) - ldlm_discard_bl_list(&rpc_list); - - RETURN(rc); + RETURN(LDLM_ITER_CONTINUE); } #endif /* HAVE_SERVER_SUPPORT */ @@ -240,6 +382,10 @@ void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, union ldlm_policy_data *lpolicy) { lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; + /** + * try_bits are to be handled outside of generic write_to_local due + * to different behavior on a server and client. + */ } void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, @@ -247,4 +393,185 @@ void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, { memset(wpolicy, 0, sizeof(*wpolicy)); wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; + wpolicy->l_inodebits.try_bits = lpolicy->l_inodebits.try_bits; +} + +/** + * Attempt to convert already granted IBITS lock with several bits set to + * a lock with less bits (downgrade). + * + * Such lock conversion is used to keep lock with non-blocking bits instead of + * cancelling it, introduced for better support of DoM files. 
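/*
 * Illustrative sketch, not part of the patch: when the lock is finally
 * granted, any try_bits that survived the queue scan are promoted into the
 * real inodebits and the client is told the lock changed, as in
 * ldlm_process_inodebits_lock() above.  The flag value is a placeholder.
 */
#define SKETCH_FL_LOCK_CHANGED	0x1ULL

static void sketch_ibits_promote_try_bits(unsigned long long *bits,
					  unsigned long long *try_bits,
					  unsigned long long *flags)
{
	if (*try_bits != 0) {
		*bits |= *try_bits;
		*try_bits = 0;
		*flags |= SKETCH_FL_LOCK_CHANGED;
	}
}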
+ */ +int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop) +{ + ENTRY; + + check_res_locked(lock->l_resource); + + /* Just return if there are no conflicting bits */ + if ((lock->l_policy_data.l_inodebits.bits & to_drop) == 0) { + LDLM_WARN(lock, "try to drop unset bits %#llx/%#llx", + lock->l_policy_data.l_inodebits.bits, to_drop); + /* nothing to do */ + RETURN(0); + } + + /* remove lock from a skiplist and put in the new place + * according with new inodebits */ + ldlm_resource_unlink_lock(lock); + lock->l_policy_data.l_inodebits.bits &= ~to_drop; + ldlm_grant_lock_with_skiplist(lock); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_inodebits_drop); + +/* convert single lock */ +int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct ldlm_lock_desc ld = { { 0 } }; + __u64 drop_bits, new_bits; + __u32 flags = 0; + int rc; + + ENTRY; + + check_res_locked(lock->l_resource); + + /* Lock is being converted already */ + if (ldlm_is_converting(lock)) { + if (!(cancel_flags & LCF_ASYNC)) { + struct l_wait_info lwi = { 0 }; + + unlock_res_and_lock(lock); + l_wait_event(lock->l_waitq, + is_lock_converted(lock), &lwi); + lock_res_and_lock(lock); + } + RETURN(0); + } + + /* lru_cancel may happen in parallel and call ldlm_cli_cancel_list() + * independently. + */ + if (ldlm_is_canceling(lock)) + RETURN(-EINVAL); + + /* no need in only local convert */ + if (lock->l_flags & (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)) + RETURN(-EINVAL); + + drop_bits = lock->l_policy_data.l_inodebits.cancel_bits; + /* no cancel bits - means that caller needs full cancel */ + if (drop_bits == 0) + RETURN(-EINVAL); + + new_bits = lock->l_policy_data.l_inodebits.bits & ~drop_bits; + /* check if all lock bits are dropped, proceed with cancel */ + if (!new_bits) + RETURN(-EINVAL); + + /* check if no dropped bits, consider this as successful convert */ + if (lock->l_policy_data.l_inodebits.bits == new_bits) + RETURN(0); + + ldlm_set_converting(lock); + /* Finally call cancel callback for remaining bits only. + * It is important to have converting flag during that + * so blocking_ast callback can distinguish convert from + * cancels. + */ + ld.l_policy_data.l_inodebits.cancel_bits = drop_bits; + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, &ld, lock->l_ast_data, LDLM_CB_CANCELING); + /* now notify server about convert */ + rc = ldlm_cli_convert_req(lock, &flags, new_bits); + lock_res_and_lock(lock); + if (rc) + GOTO(full_cancel, rc); + + /* Finally clear these bits in lock ibits */ + ldlm_inodebits_drop(lock, drop_bits); + + /* Being locked again check if lock was canceled, it is important + * to do and don't drop cbpending below + */ + if (ldlm_is_canceling(lock)) + GOTO(full_cancel, rc = -EINVAL); + + /* also check again if more bits to be cancelled appeared */ + if (drop_bits != lock->l_policy_data.l_inodebits.cancel_bits) + GOTO(clear_converting, rc = -EAGAIN); + + /* clear cbpending flag early, it is safe to match lock right after + * client convert because it is downgrade always. + */ + ldlm_clear_cbpending(lock); + ldlm_clear_bl_ast(lock); + spin_lock(&ns->ns_lock); + if (list_empty(&lock->l_lru)) + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + + /* the job is done, zero the cancel_bits. If more conflicts appear, + * it will result in another cycle of ldlm_cli_inodebits_convert(). 
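/*
 * Illustrative sketch, not part of the patch: the decision made at the top
 * of ldlm_cli_inodebits_convert() above.  -EINVAL is where the real code
 * falls back to a full cancel; 0 means a downgrade to "bits & ~cancel_bits"
 * can proceed (possibly a no-op when nothing is actually dropped).
 */
#include <errno.h>

static int sketch_convert_decision(unsigned long long bits,
				   unsigned long long cancel_bits)
{
	unsigned long long new_bits = bits & ~cancel_bits;

	if (cancel_bits == 0)	/* no specific bits requested: full cancel */
		return -EINVAL;
	if (new_bits == 0)	/* nothing would remain: full cancel */
		return -EINVAL;
	return 0;		/* keep new_bits, drop cancel_bits */
}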
+ */ +full_cancel: + lock->l_policy_data.l_inodebits.cancel_bits = 0; +clear_converting: + ldlm_clear_converting(lock); + RETURN(rc); +} + +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock) +{ + if (ldlm_is_ns_srv(lock)) { + int i; + + OBD_SLAB_ALLOC_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + if (lock->l_ibits_node == NULL) + return -ENOMEM; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&lock->l_ibits_node->lin_link[i]); + lock->l_ibits_node->lock = lock; + } else { + lock->l_ibits_node = NULL; + } + return 0; +} + +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + int i; + + if (!ldlm_is_ns_srv(lock)) + return; + + if (head == &res->lr_waiting) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) { + if (lock->l_policy_data.l_inodebits.bits & (1 << i)) + list_add_tail(&lock->l_ibits_node->lin_link[i], + &res->lr_ibits_queues->liq_waiting[i]); + } + } else if (head == &res->lr_granted && lock->l_ibits_node != NULL) { + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + LASSERT(list_empty(&lock->l_ibits_node->lin_link[i])); + OBD_SLAB_FREE_PTR(lock->l_ibits_node, ldlm_inodebits_slab); + lock->l_ibits_node = NULL; + } +} + +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock) +{ + int i; + + ldlm_unlink_lock_skiplist(lock); + if (!ldlm_is_ns_srv(lock)) + return; + + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + list_del_init(&lock->l_ibits_node->lin_link[i]); } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h index 779dec55882e5..733773c50ed0c 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,6 +40,7 @@ extern struct mutex ldlm_cli_namespace_lock; extern struct list_head ldlm_cli_active_namespace_list; extern struct list_head ldlm_cli_inactive_namespace_list; extern unsigned int ldlm_cancel_unused_locks_before_replay; +extern struct kmem_cache *ldlm_glimpse_work_kmem; static inline int ldlm_namespace_nr_read(enum ldlm_side client) { @@ -97,30 +98,27 @@ struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side); /* ldlm_request.c */ /* Cancel lru flag, it indicates we cancel aged locks. 
*/ enum ldlm_lru_flags { - LDLM_LRU_FLAG_AGED = 0x01, /* Cancel aged locks (non LRU resize) */ - LDLM_LRU_FLAG_PASSED = 0x02, /* Cancel passed number of locks */ - LDLM_LRU_FLAG_SHRINK = 0x04, /* Cancel locks from shrinker */ - LDLM_LRU_FLAG_LRUR = 0x08, /* Cancel locks from lru resize */ - LDLM_LRU_FLAG_NO_WAIT = 0x10, /* Cancel locks w/o blocking (neither - * sending nor waiting for any RPCs) */ - LDLM_LRU_FLAG_CLEANUP = 0x20, /* Used when clearing lru, tells - * prepare_lru_list to set discard flag - * on PR extent locks so we don't waste - * time saving pages that will be - * discarded momentarily */ + LDLM_LRU_FLAG_NO_WAIT = 0x1, /* Cancel locks w/o blocking (neither + * sending nor waiting for any RPCs) */ + LDLM_LRU_FLAG_CLEANUP = 0x2, /* Used when clearing lru, tells + * prepare_lru_list to set discard flag + * on PR extent locks so we don't waste + * time saving pages that will be + * discarded momentarily */ }; -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, + struct list_head *cancels, int min, int max, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags); extern unsigned int ldlm_enqueue_min; /* ldlm_resource.c */ extern struct kmem_cache *ldlm_resource_slab; extern struct kmem_cache *ldlm_lock_slab; +extern struct kmem_cache *ldlm_inodebits_slab; extern struct kmem_cache *ldlm_interval_tree_slab; void ldlm_resource_insert_lock_after(struct ldlm_lock *original, @@ -135,6 +133,7 @@ typedef enum { LDLM_WORK_GL_AST } ldlm_desc_ast_t; +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, enum req_location loc, void *data, int size); @@ -143,7 +142,9 @@ ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, enum ldlm_type type, enum ldlm_mode mode, const struct ldlm_callback_suite *cbs, void *data, __u32 lvb_len, enum lvb_type lvb_type); -enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *, + struct ldlm_lock **, void *cookie, __u64 *flags); void ldlm_lock_addref_internal(struct ldlm_lock *, enum ldlm_mode mode); void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, enum ldlm_mode mode); @@ -154,13 +155,16 @@ void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, #ifdef HAVE_SERVER_SUPPORT int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, struct list_head *work_list, - enum ldlm_process_intention intention); + enum ldlm_process_intention intention, + struct ldlm_lock *hint); int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, - struct list_head *rpc_list, __u64 grant_flags); + struct list_head *rpc_list); void ldlm_discard_bl_list(struct list_head *bl_list); +void ldlm_clear_blocking_lock(struct ldlm_lock *lock); +void ldlm_clear_blocking_data(struct ldlm_lock *lock); #endif int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - ldlm_desc_ast_t ast_type); + ldlm_desc_ast_t ast_type); int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, ktime_t last_use); #define 
ldlm_lock_remove_from_lru(lock) \ @@ -173,6 +177,7 @@ void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); int ldlm_export_cancel_blocked_locks(struct obd_export *exp); int ldlm_export_cancel_locks(struct obd_export *exp); +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock); /* ldlm_lockd.c */ int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, @@ -185,6 +190,7 @@ int ldlm_bl_thread_wakeup(void); void ldlm_handle_bl_callback(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, struct ldlm_lock *lock); +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock); #ifdef HAVE_SERVER_SUPPORT /* ldlm_plain.c */ @@ -197,14 +203,25 @@ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_process_intention intention, enum ldlm_error *err, struct list_head *work_list); +int ldlm_reprocess_inodebits_queue(struct ldlm_resource *res, + struct list_head *queue, + struct list_head *work_list, + enum ldlm_process_intention intention, + struct ldlm_lock *hint); /* ldlm_extent.c */ int ldlm_process_extent_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_process_intention intention, enum ldlm_error *err, struct list_head *work_list); #endif +int ldlm_extent_alloc_lock(struct ldlm_lock *lock); void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); void ldlm_extent_unlink_lock(struct ldlm_lock *lock); +int ldlm_inodebits_alloc_lock(struct ldlm_lock *lock); +void ldlm_inodebits_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock); +void ldlm_inodebits_unlink_lock(struct ldlm_lock *lock); + /* ldlm_flock.c */ int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, enum ldlm_process_intention intention, @@ -216,7 +233,7 @@ void ldlm_destroy_flock_export(struct obd_export *exp); void l_check_ns_lock(struct ldlm_namespace *ns); void l_check_no_ns_lock(struct ldlm_namespace *ns); -extern struct proc_dir_entry *ldlm_svc_proc_dir; +extern struct dentry *ldlm_svc_debugfs_dir; struct ldlm_state { struct ptlrpc_service *ldlm_cb_service; @@ -230,7 +247,6 @@ struct ldlm_state { extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); -extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); extern void ldlm_interval_free(struct ldlm_interval *node); /* this function must be called with res lock held */ static inline struct ldlm_extent * @@ -281,7 +297,7 @@ enum ldlm_policy_res { static ssize_t var##_store(struct kobject *kobj, \ struct attribute *attr, \ const char *buffer, \ - unsigned long count) \ + size_t count) \ { \ struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ pl_kobj); \ @@ -317,7 +333,7 @@ enum ldlm_policy_res { static ssize_t var##_store(struct kobject *kobj, \ struct attribute *attr, \ const char *buffer, \ - unsigned long count) \ + size_t count) \ { \ struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool,\ pl_kobj); \ @@ -336,28 +352,24 @@ enum ldlm_policy_res { struct __##var##__dummy_write {; } /* semicolon catcher */ static inline void -ldlm_add_var(struct lprocfs_vars *vars, struct proc_dir_entry *proc_dir, - const char *name, void *data, const struct proc_ops *ops) +ldlm_add_var(struct ldebugfs_vars *vars, struct dentry *debugfs_entry, + const char *name, void *data, const struct file_operations *ops) { snprintf((char *)vars->name, 
MAX_STRING_SIZE, "%s", name); vars->data = data; vars->fops = ops; - lprocfs_add_vars(proc_dir, vars, NULL); + ldebugfs_add_vars(debugfs_entry, vars, NULL); } static inline int is_granted_or_cancelled(struct ldlm_lock *lock) { - int ret = 0; + int ret = 0; - lock_res_and_lock(lock); - if ((lock->l_req_mode == lock->l_granted_mode) && - !ldlm_is_cp_reqd(lock)) - ret = 1; - else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) - ret = 1; - unlock_res_and_lock(lock); + lock_res_and_lock(lock); + ret = is_granted_or_cancelled_nolock(lock); + unlock_res_and_lock(lock); - return ret; + return ret; } static inline bool is_bl_done(struct ldlm_lock *lock) @@ -373,6 +385,17 @@ static inline bool is_bl_done(struct ldlm_lock *lock) return bl_done; } +static inline bool is_lock_converted(struct ldlm_lock *lock) +{ + bool ret = 0; + + lock_res_and_lock(lock); + ret = (lock->l_policy_data.l_inodebits.cancel_bits == 0); + unlock_res_and_lock(lock); + + return ret; +} + typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, union ldlm_policy_data *); typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c index 33d871da4bdf6..41e655b6fc353 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,6 +39,8 @@ #define DEBUG_SUBSYSTEM S_LDLM +#include +#include #include #include #include @@ -358,12 +360,13 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) sizeof(server_uuid))); cli->cl_dirty_pages = 0; + cli->cl_dirty_max_pages = 0; cli->cl_avail_grant = 0; /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ /* cl_dirty_max_pages may be changed at connect time in * ptlrpc_connect_interpret(). */ client_adjust_max_dirty(cli); - INIT_LIST_HEAD(&cli->cl_cache_waiters); + init_waitqueue_head(&cli->cl_cache_waiters); INIT_LIST_HEAD(&cli->cl_loi_ready_list); INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); INIT_LIST_HEAD(&cli->cl_loi_write_list); @@ -390,9 +393,15 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) spin_lock_init(&cli->cl_lru_list_lock); atomic_long_set(&cli->cl_unstable_count, 0); INIT_LIST_HEAD(&cli->cl_shrink_list); + INIT_LIST_HEAD(&cli->cl_grant_chain); + + INIT_LIST_HEAD(&cli->cl_flight_waiters); + cli->cl_rpcs_in_flight = 0; init_waitqueue_head(&cli->cl_destroy_waitq); atomic_set(&cli->cl_destroy_in_flight, 0); + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; #ifdef ENABLE_CHECKSUM /* Turn on checksumming by default. */ cli->cl_checksum = 1; @@ -401,7 +410,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * Set cl_chksum* to CRC32 for now to avoid returning screwed info * through procfs. */ - cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + cli->cl_cksum_type = cli->cl_supp_cksum_types; #endif atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); @@ -409,6 +418,8 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * from OFD after connecting. 
*/ cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + cli->cl_max_short_io_bytes = OBD_MAX_SHORT_IO_BYTES; + /* set cl_chunkbits default value to PAGE_SHIFT, * it will be updated at OSC connection time. */ cli->cl_chunkbits = PAGE_SHIFT; @@ -426,7 +437,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; else cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } + } spin_lock_init(&cli->cl_mod_rpcs_lock); spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); @@ -599,6 +610,7 @@ int client_connect_import(const struct lu_env *env, ocd->ocd_connect_flags, "old %#llx, new %#llx\n", data->ocd_connect_flags, ocd->ocd_connect_flags); data->ocd_connect_flags = ocd->ocd_connect_flags; + data->ocd_connect_flags2 = ocd->ocd_connect_flags2; } ptlrpc_pinger_add_import(imp); @@ -731,6 +743,32 @@ int server_disconnect_export(struct obd_export *exp) } EXPORT_SYMBOL(server_disconnect_export); +static inline int target_check_recovery_timer(struct obd_device *target) +{ + ktime_t remaining; + s64 timeout; + + if (!target->obd_recovering || target->obd_recovery_start == 0) + return 0; + + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > -30) + return 0; + + /* the recovery timer should expire, but it isn't triggered, + * it's better to abort the recovery of this target to speed up + * the recovery of the whole cluster. */ + spin_lock(&target->obd_dev_lock); + if (target->obd_recovering) { + CERROR("%s: Aborting recovery\n", target->obd_name); + target->obd_abort_recovery = 1; + wake_up(&target->obd_next_transno_waitq); + } + spin_unlock(&target->obd_dev_lock); + return 0; +} + /* -------------------------------------------------------------------------- * from old lib/target.c * -------------------------------------------------------------------------- */ @@ -741,12 +779,11 @@ static int target_handle_reconnect(struct lustre_handle *conn, { struct obd_device *target; struct lustre_handle *hdl; - cfs_time_t now; - cfs_time_t deadline; - int timeout; + ktime_t remaining; + s64 timeout; int rc = 0; - ENTRY; + ENTRY; hdl = &exp->exp_imp_reverse->imp_remote_handle; if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { conn->cookie = exp->exp_handle.h_cookie; @@ -780,46 +817,45 @@ static int target_handle_reconnect(struct lustre_handle *conn, GOTO(out_already, rc); } - now = cfs_time_current(); - deadline = target->obd_recovery_timer.expires; - if (cfs_time_before(now, deadline)) { - struct target_distribute_txn_data *tdtd = - class_exp2tgt(exp)->lut_tdtd; + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + atomic_read(&target->obd_max_recoverable_clients), + timeout / 60, timeout % 60); + } else { + struct target_distribute_txn_data *tdtd; int size = 0; int count = 0; char *buf = NULL; - timeout = cfs_duration_sec(cfs_time_sub(deadline, now)); + target_check_recovery_timer(target); + + tdtd = class_exp2tgt(exp)->lut_tdtd; if (tdtd && tdtd->tdtd_show_update_logs_retrievers) buf = tdtd->tdtd_show_update_logs_retrievers( tdtd->tdtd_show_retrievers_cbdata, &size, &count); if (count > 0) - LCONSOLE_WARN("%s: Recovery already passed deadline " - "%d:%.02d. 
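/*
 * Illustrative sketch, not part of the patch: with the recovery window kept
 * in an hrtimer, "time left" is read back as a ktime_t and reduced to whole
 * seconds before being printed as minutes:seconds, as
 * target_handle_reconnect() does above.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static s64 sketch_recovery_seconds_left(struct hrtimer *timer)
{
	ktime_t remaining = hrtimer_expires_remaining(timer);

	/* goes negative once the deadline has already passed */
	return ktime_divns(remaining, NSEC_PER_SEC);
}
/* typically printed as "%lld:%.02lld", left / 60, left % 60 */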
It is due to DNE recovery " - "failed/stuck on the %d MDT(s):%s. " - "Please wait until all MDTs recovered " - "or abort the recovery by force.\n", - target->obd_name, timeout / 60, - timeout % 60, count, - buf ? buf : "unknown (not enough RAM)"); + LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs recovered or you may force MDT evicition via 'lctl --device %s abort_recovery.\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), count, + buf ? buf : "unknown (not enough RAM)", + (abs(timeout) + target->obd_recovery_timeout) / 60, + (abs(timeout) + target->obd_recovery_timeout) % 60, + target->obd_name); else - LCONSOLE_WARN("%s: Recovery already passed deadline " - "%d:%.02d. If you do not want to wait " - "more, please abort the recovery by " - "force.\n", target->obd_name, - timeout / 60, timeout % 60); + LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force taget eviction via 'lctl --device %s abort_recovery.\n", + target->obd_name, abs(timeout) / 60, + abs(timeout) % 60, target->obd_name); if (buf != NULL) OBD_FREE(buf, size); - } else { - timeout = cfs_duration_sec(cfs_time_sub(now, deadline)); - LCONSOLE_WARN("%s: Recovery already passed deadline" - " %d:%.02d, It is most likely due to DNE" - " recovery is failed or stuck, please wait a" - " few more minutes or abort the recovery.\n", - target->obd_name, timeout / 60, timeout % 60); } out_already: @@ -950,7 +986,6 @@ int target_handle_connect(struct ptlrpc_request *req) * reconnect case */ struct lustre_handle conn; struct lustre_handle *tmp; - struct obd_uuid tgtuuid; struct obd_uuid cluuid; char *str; int rc = 0; @@ -959,7 +994,6 @@ int target_handle_connect(struct ptlrpc_request *req) bool mds_conn = false, lw_client = false, initial_conn = false; bool mds_mds_conn = false; bool new_mds_mds_conn = false; - bool target_referenced = false; struct obd_connect_data *data, *tmpdata; int size, tmpsize; lnet_nid_t *client_nid = NULL; @@ -973,11 +1007,7 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } - obd_str2uuid(&tgtuuid, str); - target = class_uuid2obd(&tgtuuid); - if (!target) - target = class_name2obd(str); - + target = class_dev_by_str(str); if (!target) { deuuidify(str, NULL, &target_start, &target_len); LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " @@ -989,6 +1019,9 @@ int target_handle_connect(struct ptlrpc_request *req) } spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + if (target->obd_stopping || !target->obd_set_up) { spin_unlock(&target->obd_dev_lock); @@ -1010,13 +1043,6 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EAGAIN); } - /* Make sure the target isn't cleaned up while we're here. Yes, - * there's still a race between the above check and our incref here. - * Really, class_uuid2obd should take the ref. 
*/ - class_incref(target, __func__, current); - target_referenced = true; - - target->obd_conn_inprogress++; spin_unlock(&target->obd_dev_lock); str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); @@ -1033,11 +1059,13 @@ int target_handle_connect(struct ptlrpc_request *req) conn = *tmp; - size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, - RCL_CLIENT); - data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); - if (!data) - GOTO(out, rc = -EPROTO); + size = req_capsule_get_size(&req->rq_pill, &RMF_CONNECT_DATA, + RCL_CLIENT); + if (size < 0 || size > 8 * sizeof(struct obd_connect_data)) + GOTO(out, rc = -EPROTO); + data = req_capsule_client_get(&req->rq_pill, &RMF_CONNECT_DATA); + if (!data) + GOTO(out, rc = -EPROTO); rc = req_capsule_server_pack(&req->rq_pill); if (rc) @@ -1055,50 +1083,36 @@ int target_handle_connect(struct ptlrpc_request *req) */ if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) GOTO(out, rc = -EPROTO); -#endif + /* Don't allow liblustre clients to connect. + * - testing was disabled in v2_2_50_0-61-g6a75d65 + * - building was disabled in v2_5_58_0-28-g7277179 + * - client code was deleted in v2_6_50_0-101-gcdfbc72, + * - clients were refused connect for version difference > 0.0.1.32 */ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - if (data->ocd_version < LUSTRE_VERSION_CODE - - LUSTRE_VERSION_ALLOWED_OFFSET || - data->ocd_version > LUSTRE_VERSION_CODE + - LUSTRE_VERSION_ALLOWED_OFFSET) { - DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) " - "libclient connection attempt", - data->ocd_version < LUSTRE_VERSION_CODE ? - "old" : "new", - OBD_OCD_VERSION_MAJOR(data->ocd_version), - OBD_OCD_VERSION_MINOR(data->ocd_version), - OBD_OCD_VERSION_PATCH(data->ocd_version), - OBD_OCD_VERSION_FIX(data->ocd_version)); - data = req_capsule_server_sized_get(&req->rq_pill, - &RMF_CONNECT_DATA, - offsetof(typeof(*data), ocd_version) + - sizeof(data->ocd_version)); - if (data) { - data->ocd_connect_flags = OBD_CONNECT_VERSION; - data->ocd_version = LUSTRE_VERSION_CODE; - } - GOTO(out, rc = -EPROTO); - } + DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); + GOTO(out, rc = -EPROTO); } +#endif /* Note: lw_client is needed in MDS-MDS failover during update log * processing, so we needs to allow lw_client to be connected at - * anytime, instead of only the initial connection */ - lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0; + * anytime, instead of only the initial connection + */ + lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT); if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { initial_conn = true; - mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0; - mds_mds_conn = (data->ocd_connect_flags & - OBD_CONNECT_MDS_MDS) != 0; + mds_conn = OCD_HAS_FLAG(data, MDS); + mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS); /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS * for Imperative Recovery connection from MGC to MGS. * * Via check OBD_CONNECT_FID, we can distinguish whether * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from - * MGC or MDT. */ + * MGC or MDT, since MGC does not use OBD_CONNECT_FID. 
+ */ if (!lw_client && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && (data->ocd_connect_flags & OBD_CONNECT_FID) && @@ -1147,27 +1161,29 @@ int target_handle_connect(struct ptlrpc_request *req) export = NULL; rc = -EALREADY; } else if ((mds_conn || (lw_client && initial_conn) || - data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && - export->exp_connection != NULL) { + OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) { spin_unlock(&export->exp_lock); if (req->rq_peer.nid != export->exp_connection->c_peer.nid) { /* MDS or LWP reconnected after failover. */ - LCONSOLE_WARN("%s: Received %s connection from " - "%s, removing former export from %s\n", - target->obd_name, mds_conn ? "MDS" : "LWP", - libcfs_nid2str(req->rq_peer.nid), - libcfs_nid2str(export->exp_connection->c_peer.nid)); + LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); } else { - /* New MDS connection from the same NID. */ - LCONSOLE_WARN("%s: Received new %s connection from " - "%s, removing former export from same NID\n", - target->obd_name, mds_conn ? "MDS" : "LWP", - libcfs_nid2str(req->rq_peer.nid)); + /* New connection from the same NID. */ + LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n", + target->obd_name, + lw_client ? "LWP" : "MDS", + libcfs_nid2str(req->rq_peer.nid), + OCD_HAS_FLAG(data, MDS_MDS) ? + "keep" : "remove"); } if (req->rq_peer.nid == export->exp_connection->c_peer.nid && - data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) { - /* Because exports between MDTs will always be + OCD_HAS_FLAG(data, MDS_MDS)) { + /* + * Because exports between MDTs will always be * kept, let's do not fail such export if they * come from the same NID, otherwise it might * cause eviction between MDTs, which might @@ -1234,11 +1250,11 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc); } - CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n", - target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - target->obd_recovering ? "recovering/" : "", data->ocd_transno, - export, (long)cfs_time_current_sec(), - export ? (long)export->exp_last_request_time : 0); + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, ktime_get_seconds(), + export ? export->exp_last_request_time : 0); /* If this is the first time a client connects, reset the recovery * timer. Discard lightweight connections which might be local. 
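/*
 * Illustrative sketch, not part of the patch: the repeated
 * "(data->ocd_connect_flags & OBD_CONNECT_X) != 0" tests above are collapsed
 * into OCD_HAS_FLAG().  Everything below -- the flag values, the struct and
 * the macro body -- is an assumption for illustration only; the real
 * definitions live in headers outside this hunk.
 */
#define SKETCH_CONNECT_MDS		0x1ULL
#define SKETCH_CONNECT_MDS_MDS		0x2ULL
#define SKETCH_CONNECT_LIGHTWEIGHT	0x4ULL

struct sketch_connect_data {
	unsigned long long ocd_connect_flags;
};

#define SKETCH_HAS_FLAG(data, flag) \
	(!!((data)->ocd_connect_flags & SKETCH_CONNECT_##flag))

static void sketch_classify_connect(const struct sketch_connect_data *data,
				    int initial_conn,
				    int *mds_conn, int *mds_mds_conn,
				    int *lw_client)
{
	*lw_client = SKETCH_HAS_FLAG(data, LIGHTWEIGHT);
	/* only an initial connect is classified as MDS or MDS-MDS */
	*mds_conn = initial_conn && SKETCH_HAS_FLAG(data, MDS);
	*mds_mds_conn = initial_conn && SKETCH_HAS_FLAG(data, MDS_MDS);
}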
*/ @@ -1264,27 +1280,37 @@ int target_handle_connect(struct ptlrpc_request *req) /* allow "new" MDT to be connected during recovery, since we * need retrieve recovery update records from it */ if (target->obd_recovering && !lw_client && !mds_mds_conn) { - cfs_time_t t; - int c; /* connected */ - int i; /* in progress */ - int k; /* known */ - int s; /* stale/evicted */ - - c = atomic_read(&target->obd_connected_clients); - i = atomic_read(&target->obd_lock_replay_clients); - k = target->obd_max_recoverable_clients; - s = target->obd_stale_clients; - t = target->obd_recovery_timer.expires; - t = cfs_time_sub(t, cfs_time_current()); - t = cfs_duration_sec(t); - LCONSOLE_WARN("%s: Denying connection for new client %s" - "(at %s), waiting for %d known clients " - "(%d recovered, %d in progress, and %d " - "evicted) to recover in %d:%.02d\n", + struct hrtimer *timer = &target->obd_recovery_timer; + ktime_t remaining; + s64 timeout, left; + int in_progress; + int connected; + int known; + int stale; + char *msg; + + connected = atomic_read(&target->obd_connected_clients); + in_progress = atomic_read(&target->obd_lock_replay_clients); + known = + atomic_read(&target->obd_max_recoverable_clients); + stale = target->obd_stale_clients; + remaining = hrtimer_expires_remaining(timer); + left = ktime_divns(remaining, NSEC_PER_SEC); + if (ktime_to_ns(remaining) > 0) { + msg = "to recover in"; + timeout = left; + } else { + msg = "already passed deadline"; + timeout = -left; + + target_check_recovery_timer(target); + } + + LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), k, - c - i, i, s, (int)t / 60, - (int)t % 60); + libcfs_nid2str(req->rq_peer.nid), known, + connected - in_progress, in_progress, + stale, msg, timeout / 60, timeout % 60); rc = -EBUSY; } else { dont_check_exports: @@ -1339,37 +1365,26 @@ int target_handle_connect(struct ptlrpc_request *req) spin_unlock(&export->exp_lock); CDEBUG(D_RPCTRACE, "%s: %s already connected at greater " "or equal conn_cnt: %d >= %d\n", - cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - export->exp_conn_cnt, - lustre_msg_get_conn_cnt(req->rq_reqmsg)); - - GOTO(out, rc = -EALREADY); - } - LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); - export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); - /* Don't evict liblustre clients for not pinging. */ - if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - export->exp_libclient = 1; - spin_unlock(&export->exp_lock); - - spin_lock(&target->obd_dev_lock); - list_del_init(&export->exp_obd_chain_timed); - spin_unlock(&target->obd_dev_lock); - } else { - spin_unlock(&export->exp_lock); + GOTO(out, rc = -EALREADY); } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + spin_unlock(&export->exp_lock); - if (export->exp_connection != NULL) { + if (export->exp_connection != NULL) { /* Check to see if connection came from another NID. 
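/*
 * Illustrative sketch, not part of the patch: how the connection-denial
 * message above chooses its wording and countdown now that the recovery
 * window is an hrtimer.  The real code tests the remaining ktime_t in
 * nanoseconds; this is simplified to whole seconds.
 */
#include <linux/types.h>

static const char *sketch_recovery_msg(s64 left, s64 *timeout)
{
	if (left > 0) {
		*timeout = left;
		return "to recover in";
	}
	/* deadline already passed: report the overrun instead */
	*timeout = -left;
	return "already passed deadline";
}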
*/ - if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && + if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && !hlist_unhashed(&export->exp_nid_hash)) - cfs_hash_del(export->exp_obd->obd_nid_hash, - &export->exp_connection->c_peer.nid, - &export->exp_nid_hash); + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); - ptlrpc_connection_put(export->exp_connection); - } + ptlrpc_connection_put(export->exp_connection); + } export->exp_connection = ptlrpc_connection_get(req->rq_peer, req->rq_self, @@ -1425,9 +1440,10 @@ int target_handle_connect(struct ptlrpc_request *req) * also needs to be increased to match other recovery checking * condition. */ if (new_mds_mds_conn) - target->obd_max_recoverable_clients++; + atomic_inc(&target->obd_max_recoverable_clients); + if (atomic_inc_return(&target->obd_connected_clients) == - target->obd_max_recoverable_clients) + atomic_read(&target->obd_max_recoverable_clients)) wake_up(&target->obd_next_transno_waitq); } @@ -1443,12 +1459,11 @@ int target_handle_connect(struct ptlrpc_request *req) class_export_put(export); } - if (target_referenced == true && target != NULL) { + if (target != NULL) { spin_lock(&target->obd_dev_lock); target->obd_conn_inprogress--; spin_unlock(&target->obd_dev_lock); - - class_decref(target, __func__, current); + class_decref(target, "find", current); } req->rq_status = rc; RETURN(rc); @@ -1460,11 +1475,23 @@ int target_handle_disconnect(struct ptlrpc_request *req) ENTRY; rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + if (rc) + RETURN(rc); + + /* In case of target disconnect, updating sec ctx immediately is + * required in order to record latest sequence number used. + * Sequence is normally updated on export destroy, but this event + * can occur too late, ie after a new target connect request has + * been processed. + * Maintaining correct sequence when client connection becomes idle + * ensures that GSS does not erroneously consider requests as replays. + */ + rc = sptlrpc_export_update_ctx(req->rq_export); + if (rc) + RETURN(rc); /* Keep the rq_export around so we can send the reply. */ - req->rq_status = obd_disconnect(class_export_get(req->rq_export)); + req->rq_status = obd_disconnect(class_export_get(req->rq_export)); RETURN(0); } @@ -1577,14 +1604,14 @@ static void target_finish_recovery(struct lu_target *lut) /* Only log a recovery message when recovery has occurred. */ if (obd->obd_recovery_start) { - time64_t now = ktime_get_real_seconds(); + time64_t now = ktime_get_seconds(); time64_t elapsed_time; elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, 1); LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients " "%d recovered and %d %s evicted.\n", obd->obd_name, (s64)elapsed_time / 60, (s64)elapsed_time % 60, - obd->obd_max_recoverable_clients, + atomic_read(&obd->obd_max_recoverable_clients), atomic_read(&obd->obd_connected_clients), obd->obd_stale_clients, obd->obd_stale_clients == 1 ? "was" : "were"); @@ -1607,15 +1634,16 @@ static void target_finish_recovery(struct lu_target *lut) } spin_unlock(&obd->obd_recovery_task_lock); - obd->obd_recovery_end = ktime_get_real_seconds(); + obd->obd_recovery_end = ktime_get_seconds(); /* When recovery finished, cleanup orphans on MDS and OST. 
*/ - if (OBT(obd) && OBP(obd, postrecov)) { - int rc = OBP(obd, postrecov)(obd); - if (rc < 0) - LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", - obd->obd_name, rc); - } + if (obd->obd_type && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } EXIT; } @@ -1712,12 +1740,14 @@ EXPORT_SYMBOL(target_cleanup_recovery); /* obd_recovery_task_lock should be held */ void target_cancel_recovery_timer(struct obd_device *obd) { - CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); - del_timer(&obd->obd_recovery_timer); + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + hrtimer_cancel(&obd->obd_recovery_timer); } static void target_start_recovery_timer(struct obd_device *obd) { + ktime_t delay; + if (obd->obd_recovery_start != 0) return; @@ -1734,33 +1764,36 @@ static void target_start_recovery_timer(struct obd_device *obd) return; } - mod_timer(&obd->obd_recovery_timer, - cfs_time_shift(obd->obd_recovery_timeout)); - obd->obd_recovery_start = ktime_get_real_seconds(); + obd->obd_recovery_start = ktime_get_seconds(); + delay = ktime_set(obd->obd_recovery_start + + obd->obd_recovery_timeout, 0); + hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_ABS); spin_unlock(&obd->obd_dev_lock); - LCONSOLE_WARN("%s: Will be in recovery for at least %llu:%02llu, or until %d client%s reconnect%s\n", + LCONSOLE_WARN("%s: Will be in recovery for at least %lu:%02lu, or until %d client%s reconnect%s\n", obd->obd_name, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, - obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) ? "" : "s", - (obd->obd_max_recoverable_clients == 1) ? "s": ""); + atomic_read(&obd->obd_max_recoverable_clients), + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "" : "s", + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "s" : ""); } /** * extend recovery window. * - * if @extend is true, extend recovery window to have @drt remaining at least; - * otherwise, make sure the recovery timeout value is not less than @drt. + * if @extend is true, extend recovery window to have @dr_timeout remaining + * at least; otherwise, make sure the recovery timeout value is not less + * than @dr_timeout. */ -static void extend_recovery_timer(struct obd_device *obd, int drt, +static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, bool extend) { - time64_t now; - time64_t end; - time64_t left; - time64_t to; + ktime_t left_ns; + time_t timeout; + time_t left; spin_lock(&obd->obd_dev_lock); if (!obd->obd_recovering || obd->obd_abort_recovery) { @@ -1769,33 +1802,43 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, } LASSERT(obd->obd_recovery_start != 0); - now = ktime_get_real_seconds(); - to = obd->obd_recovery_timeout; - end = obd->obd_recovery_start + to; - left = end - now; + left_ns = hrtimer_expires_remaining(&obd->obd_recovery_timer); + left = ktime_divns(left_ns, NSEC_PER_SEC); - if (extend && (drt > left)) { - to += drt - left; - } else if (!extend && (drt > to)) { - to = drt; - } - - if (to > obd->obd_recovery_time_hard) { - to = obd->obd_recovery_time_hard; - CWARN("%s: extended recovery timer reaching hard limit: %lld, extend: %d\n", - obd->obd_name, to, extend); + if (extend) { + timeout = obd->obd_recovery_timeout; + /* dr_timeout will happen after the hrtimer has expired. 
+ * Add the excess time to the soft recovery timeout without + * exceeding the hard recovery timeout. + */ + if (dr_timeout > left) { + timeout += dr_timeout - left; + timeout = min_t(time_t, obd->obd_recovery_time_hard, + timeout); + } + } else { + timeout = clamp_t(time_t, dr_timeout, obd->obd_recovery_timeout, + obd->obd_recovery_time_hard); } - if (obd->obd_recovery_timeout < to) { - obd->obd_recovery_timeout = to; - end = obd->obd_recovery_start + to; - mod_timer(&obd->obd_recovery_timer, - cfs_time_shift(end - now)); - } + if (timeout == obd->obd_recovery_time_hard) + CWARN("%s: extended recovery timer reached hard limit: %ld, extend: %d\n", + obd->obd_name, timeout, extend); + + if (obd->obd_recovery_timeout < timeout) { + ktime_t end, now; + + obd->obd_recovery_timeout = timeout; + end = ktime_set(obd->obd_recovery_start + timeout, 0); + now = ktime_set(ktime_get_seconds(), 0); + left_ns = ktime_sub(end, now); + hrtimer_start(&obd->obd_recovery_timer, end, HRTIMER_MODE_ABS); + left = ktime_divns(left_ns, NSEC_PER_SEC); + } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_HA, "%s: recovery timer will expire in %lld seconds\n", - obd->obd_name, (s64)(end - now)); + CDEBUG(D_HA, "%s: recovery timer will expire in %ld seconds\n", + obd->obd_name, left); } /* Reset the timer with each new client connection */ @@ -1808,40 +1851,45 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, * be extended to make sure the client could be reconnected, in the * process, the timeout from the new client should be ignored. */ - static void check_and_start_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req, - int new_client) + struct ptlrpc_request *req, + int new_client) { - int service_time = lustre_msg_get_service_time(req->rq_reqmsg); - struct obd_device_target *obt = &obd->u.obt; + timeout_t service_timeout = lustre_msg_get_service_timeout(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; - if (!new_client && service_time) - /* Teach server about old server's estimates, as first guess - * at how long new requests will take. */ + if (!new_client && service_timeout) + /* + * Teach server about old server's estimates, as first guess + * at how long new requests will take. + */ at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, - service_time); + service_timeout); - target_start_recovery_timer(obd); + target_start_recovery_timer(obd); - /* Convert the service time to RPC timeout, - * and reuse service_time to limit stack usage. */ - service_time = at_est2timeout(service_time); + /* + * Convert the service time to RPC timeout, + * and reuse service_timeout to limit stack usage. + */ + service_timeout = at_est2timeout(service_timeout); if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && - service_time < at_extra) - service_time = at_extra; + service_timeout < at_extra) + service_timeout = at_extra; - /* We expect other clients to timeout within service_time, then try + /* + * We expect other clients to timeout within service_timeout, then try * to reconnect, then try the failover server. The max delay between - * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. */ - service_time += 2 * INITIAL_CONNECT_TIMEOUT; + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. 
+ */ + service_timeout += 2 * INITIAL_CONNECT_TIMEOUT; - LASSERT(obt->obt_magic == OBT_MAGIC); - service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); - if (service_time > obd->obd_recovery_timeout && !new_client) - extend_recovery_timer(obd, service_time, false); + LASSERT(obt->obt_magic == OBT_MAGIC); + service_timeout += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + if (service_timeout > obd->obd_recovery_timeout && !new_client) + extend_recovery_timer(obd, service_timeout, false); } /** Health checking routines */ @@ -1913,9 +1961,10 @@ static int check_for_next_transno(struct lu_target *lut) queue_len = obd->obd_requests_queued_for_recovery; next_transno = obd->obd_next_recovery_transno; - CDEBUG(D_HA, "max: %d, connected: %d, completed: %d, queue_len: %d, " - "req_transno: %llu, next_transno: %llu\n", - obd->obd_max_recoverable_clients, connected, completed, + CDEBUG(D_HA, + "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", + atomic_read(&obd->obd_max_recoverable_clients), + connected, completed, queue_len, req_transno, next_transno); if (obd->obd_abort_recovery) { @@ -1987,6 +2036,24 @@ static int check_for_next_lock(struct lu_target *lut) return wake_up; } +static int check_update_llog(struct lu_target *lut) +{ + struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; + + if (obd->obd_abort_recovery) { + CDEBUG(D_HA, "waking for aborted recovery\n"); + return 1; + } + + if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { + CDEBUG(D_HA, "waking for completion of reading update log\n"); + return 1; + } + + return 0; +} + /** * wait for recovery events, * check its status with help of check_routine @@ -2010,7 +2077,7 @@ static int target_recovery_overseer(struct lu_target *lut, last = now; } } - if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= + if (obd->obd_recovery_start != 0 && ktime_get_seconds() >= (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { __u64 next_update_transno = 0; @@ -2026,16 +2093,16 @@ static int target_recovery_overseer(struct lu_target *lut, * updatelog retrieve threads did not get any records * yet, let's wait those threads stopped */ if (next_update_transno == 0) { - struct l_wait_info lwi = { 0 }; + spin_unlock(&obd->obd_recovery_task_lock); - l_wait_event(tdtd->tdtd_recovery_threads_waitq, - atomic_read( - &tdtd->tdtd_recovery_threads_count) == 0, - &lwi); + while (wait_event_timeout( + tdtd->tdtd_recovery_threads_waitq, + check_update_llog(lut), + cfs_time_seconds(60)) == 0); + spin_lock(&obd->obd_recovery_task_lock); next_update_transno = - distribute_txn_get_next_transno( - lut->lut_tdtd); + distribute_txn_get_next_transno(tdtd); } } @@ -2088,6 +2155,7 @@ static int target_recovery_overseer(struct lu_target *lut, return 1; } else if (obd->obd_recovery_expired) { obd->obd_recovery_expired = 0; + /** If some clients died being recovered, evict them */ LCONSOLE_WARN("%s: recovery is timed out, " "evict stale exports\n", obd->obd_name); @@ -2178,34 +2246,41 @@ static void handle_recovery_req(struct ptlrpc_thread *thread, (void)handler(req); lu_context_exit(&thread->t_env->le_ctx); - /* don't reset timer for final stage */ - if (!exp_finished(req->rq_export)) { - int to = obd_timeout; + req->rq_svc_thread->t_env->le_ses = NULL; + + /* don't reset timer for final stage */ + if (!exp_finished(req->rq_export)) { + timeout_t timeout = obd_timeout; - /** - * Add request timeout to the recovery time so next request from - * 
this client may come in recovery time - */ - if (!AT_OFF) { + /** + * Add request @timeout to the recovery time so next request from + * this client may come in recovery time + */ + if (!AT_OFF) { struct ptlrpc_service_part *svcpt; + timeout_t est_timeout; svcpt = req->rq_rqbd->rqbd_svcpt; /* If the server sent early reply for this request, * the client will recalculate the timeout according to * current server estimate service time, so we will * use the maxium timeout here for waiting the client - * sending the next req */ - to = max((int)at_est2timeout( - at_get(&svcpt->scp_at_estimate)), - (int)lustre_msg_get_timeout(req->rq_reqmsg)); - /* Add 2 net_latency, one for balance rq_deadline + * sending the next req + */ + est_timeout = at_get(&svcpt->scp_at_estimate); + timeout = max_t(timeout_t, at_est2timeout(est_timeout), + lustre_msg_get_timeout(req->rq_reqmsg)); + /* + * Add 2 net_latency, one for balance rq_deadline * (see ptl_send_rpc), one for resend the req to server, * Note: client will pack net_latency in replay req - * (see ptlrpc_replay_req) */ - to += 2 * lustre_msg_get_service_time(req->rq_reqmsg); - } - extend_recovery_timer(class_exp2obd(req->rq_export), to, true); - } + * (see ptlrpc_replay_req) + */ + timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg); + } + extend_recovery_timer(class_exp2obd(req->rq_export), timeout, + true); + } EXIT; } @@ -2215,15 +2290,17 @@ static int check_for_recovery_ready(struct lu_target *lut) struct obd_device *obd = lut->lut_obd; unsigned int clnts = atomic_read(&obd->obd_connected_clients); - CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d" - " abort %d expired %d\n", clnts, obd->obd_stale_clients, - obd->obd_max_recoverable_clients, obd->obd_abort_recovery, - obd->obd_recovery_expired); + CDEBUG(D_HA, + "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n", + clnts, obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_abort_recovery, obd->obd_recovery_expired); if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { - LASSERT(clnts <= obd->obd_max_recoverable_clients); + LASSERT(clnts <= + atomic_read(&obd->obd_max_recoverable_clients)); if (clnts + obd->obd_stale_clients < - obd->obd_max_recoverable_clients) + atomic_read(&obd->obd_max_recoverable_clients)) return 0; } @@ -2234,7 +2311,8 @@ static int check_for_recovery_ready(struct lu_target *lut) * timer expired, and some clients got evicted */ extend_recovery_timer(obd, obd->obd_recovery_timeout, true); - CDEBUG(D_HA, "%s update recovery is not ready, extend recovery %llu\n", + CDEBUG(D_HA, + "%s update recovery is not ready, extend recovery %lu\n", obd->obd_name, obd->obd_recovery_timeout); return 0; } @@ -2327,6 +2405,8 @@ static void drop_duplicate_replay_req(struct lu_env *env, obd->obd_replayed_requests++; } +#define WATCHDOG_TIMEOUT (obd_timeout * 10) + static void replay_request_or_update(struct lu_env *env, struct lu_target *lut, struct target_recovery_data *trd, @@ -2397,8 +2477,13 @@ static void replay_request_or_update(struct lu_env *env, lustre_msg_get_transno(req->rq_reqmsg), libcfs_nid2str(req->rq_peer.nid)); + thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, + NULL, NULL); handle_recovery_req(thread, req, trd->trd_recovery_handler); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + /** * bz18031: increase next_recovery_transno before * target_request_copy_put() will drop exp_rpc reference @@ -2418,7 +2503,11 @@ static void replay_request_or_update(struct lu_env 
*env, LASSERT(tdtd != NULL); dtrq = distribute_txn_get_next_req(tdtd); lu_context_enter(&thread->t_env->le_ctx); + thread->t_watchdog = lc_watchdog_add(WATCHDOG_TIMEOUT, + NULL, NULL); rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; lu_context_exit(&thread->t_env->le_ctx); extend_recovery_timer(obd, obd_timeout, true); @@ -2473,18 +2562,16 @@ static int target_recovery_thread(void *arg) if (thread == NULL) RETURN(-ENOMEM); - OBD_ALLOC_PTR(env); - if (env == NULL) { - OBD_FREE_PTR(thread); - RETURN(-ENOMEM); - } + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_thread, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD); - if (rc) { - OBD_FREE_PTR(thread); - OBD_FREE_PTR(env); - RETURN(rc); - } + if (rc) + GOTO(out_env_remove, rc); thread->t_env = env; thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ @@ -2526,6 +2613,11 @@ static int target_recovery_thread(void *arg) LASSERT(trd->trd_processing_task == current_pid()); DEBUG_REQ(D_HA, req, "processing lock from %s: ", libcfs_nid2str(req->rq_peer.nid)); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) { + req->rq_status = -ENODEV; + target_request_copy_put(req); + continue; + } handle_recovery_req(thread, req, trd->trd_recovery_handler); target_request_copy_put(req); @@ -2576,8 +2668,12 @@ static int target_recovery_thread(void *arg) complete(&trd->trd_finishing); tgt_io_thread_done(thread); - OBD_FREE_PTR(thread); +out_env_remove: + lu_env_remove(env); +out_env: OBD_FREE_PTR(env); +out_thread: + OBD_FREE_PTR(thread); RETURN(rc); } @@ -2634,17 +2730,20 @@ void target_recovery_fini(struct obd_device *obd) } EXPORT_SYMBOL(target_recovery_fini); -static void target_recovery_expired(cfs_timer_cb_arg_t data) +static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) { - struct obd_device *obd = cfs_from_timer(obd, data, obd_recovery_timer); - CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery" - " after %llus (%d clients connected)\n", + struct obd_device *obd = container_of(timer, struct obd_device, + obd_recovery_timer); + + CDEBUG(D_HA, + "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), - (s64)(ktime_get_real_seconds() - obd->obd_recovery_start), + ktime_get_real_seconds() - obd->obd_recovery_start, atomic_read(&obd->obd_connected_clients)); obd->obd_recovery_expired = 1; wake_up(&obd->obd_next_transno_waitq); + return HRTIMER_NORESTART; } void target_recovery_init(struct lu_target *lut, svc_handler_t handler) @@ -2654,7 +2753,7 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) if (lut->lut_bottom->dd_rdonly) return; - if (obd->obd_max_recoverable_clients == 0) { + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { /** Update server last boot epoch */ tgt_boot_epoch_update(lut); return; @@ -2662,14 +2761,16 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " "last_transno %llu\n", obd->obd_name, - obd->obd_max_recoverable_clients, obd->obd_last_committed); - LASSERT(obd->obd_stopping == 0); - obd->obd_next_recovery_transno = obd->obd_last_committed + 1; - obd->obd_recovery_start = 0; - obd->obd_recovery_end = 0; - - cfs_timer_setup(&obd->obd_recovery_timer, target_recovery_expired, - (unsigned 
long)obd, 0); + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_last_committed); + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + + hrtimer_init(&obd->obd_recovery_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + obd->obd_recovery_timer.function = &target_recovery_expired; target_start_recovery_thread(lut, handler); } EXPORT_SYMBOL(target_recovery_init); @@ -2725,6 +2826,17 @@ int target_queue_recovery_request(struct ptlrpc_request *req, target_process_req_flags(obd, req); if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 1) { + cfs_race_state = 1; + cfs_fail_val = 0; + wake_up(&cfs_race_waitq); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + } + /* client declares he's ready to complete recovery * so, we put the request on th final queue */ target_request_copy_get(req); @@ -2875,12 +2987,6 @@ int target_queue_recovery_request(struct ptlrpc_request *req, RETURN(0); } -int target_handle_ping(struct ptlrpc_request *req) -{ - obd_ping(req->rq_svc_thread->t_env, req->rq_export); - return req_capsule_server_pack(&req->rq_pill); -} - void target_committed_to_req(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -3172,10 +3278,10 @@ static inline const char *bulk2type(struct ptlrpc_request *req) int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, struct l_wait_info *lwi) { - struct ptlrpc_request *req = desc->bd_req; - time_t start = cfs_time_current_sec(); - time_t deadline; - int rc = 0; + struct ptlrpc_request *req = desc->bd_req; + time64_t start = ktime_get_seconds(); + time64_t deadline; + int rc = 0; ENTRY; @@ -3222,12 +3328,13 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, deadline = req->rq_deadline; do { - long timeoutl = deadline - cfs_time_current_sec(); - cfs_duration_t timeout = timeoutl <= 0 ? - CFS_TICK : cfs_time_seconds(timeoutl); - time_t rq_deadline; + time64_t timeoutl = deadline - ktime_get_seconds(); + long timeout_jiffies = timeoutl <= 0 ? + 1 : cfs_time_seconds(timeoutl); + time64_t rq_deadline; - *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), + *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + cfs_time_seconds(1), target_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc) || @@ -3237,17 +3344,17 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed rq_deadline. 
*/ - rq_deadline = ACCESS_ONCE(req->rq_deadline); + rq_deadline = READ_ONCE(req->rq_deadline); deadline = start + bulk_timeout; if (deadline > rq_deadline) deadline = rq_deadline; - } while ((rc == -ETIMEDOUT) && - (deadline > cfs_time_current_sec())); + } while (rc == -ETIMEDOUT && + deadline > ktime_get_seconds()); if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds", + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", bulk2type(req), deadline - start, - cfs_time_current_sec() - deadline); + ktime_get_real_seconds() - deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c index df28b2d7b5131..42eccaf9cf861 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,9 @@ #include "ldlm_internal.h" +struct kmem_cache *ldlm_glimpse_work_kmem; +EXPORT_SYMBOL(ldlm_glimpse_work_kmem); + /* lock types */ char *ldlm_lockname[] = { [0] = "--", @@ -122,8 +125,6 @@ const char *ldlm_it2str(enum ldlm_intent_flags it) return "getattr"; case IT_LOOKUP: return "lookup"; - case IT_UNLINK: - return "unlink"; case IT_GETXATTR: return "getxattr"; case IT_LAYOUT: @@ -150,6 +151,19 @@ ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res) return ldlm_processing_policy_table[res->lr_type]; } EXPORT_SYMBOL(ldlm_get_processing_policy); + +static ldlm_reprocessing_policy ldlm_reprocessing_policy_table[] = { + [LDLM_PLAIN] = ldlm_reprocess_queue, + [LDLM_EXTENT] = ldlm_reprocess_queue, + [LDLM_FLOCK] = ldlm_reprocess_queue, + [LDLM_IBITS] = ldlm_reprocess_inodebits_queue, +}; + +ldlm_reprocessing_policy ldlm_get_reprocessing_policy(struct ldlm_resource *res) +{ + return ldlm_reprocessing_policy_table[res->lr_type]; +} + #endif /* HAVE_SERVER_SUPPORT */ void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) @@ -204,8 +218,6 @@ void ldlm_lock_put(struct ldlm_lock *lock) lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, LDLM_NSS_LOCKS); lu_ref_del(&res->lr_reference, "lock", lock); - ldlm_resource_putref(res); - lock->l_resource = NULL; if (lock->l_export) { class_export_lock_put(lock->l_export, lock); lock->l_export = NULL; @@ -214,7 +226,15 @@ void ldlm_lock_put(struct ldlm_lock *lock) if (lock->l_lvb_data != NULL) OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); - ldlm_interval_free(ldlm_interval_detach(lock)); + if (res->lr_type == LDLM_EXTENT) { + ldlm_interval_free(ldlm_interval_detach(lock)); + } else if (res->lr_type == LDLM_IBITS) { + if (lock->l_ibits_node != NULL) + OBD_SLAB_FREE_PTR(lock->l_ibits_node, + ldlm_inodebits_slab); + } + ldlm_resource_putref(res); + lock->l_resource = NULL; lu_ref_fini(&lock->l_reference); OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); } @@ -477,7 +497,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, LDLM_NSS_LOCKS); - INIT_LIST_HEAD(&lock->l_handle.h_link); + INIT_LIST_HEAD_RCU(&lock->l_handle.h_link); class_handle_hash(&lock->l_handle, &lock_handle_ops); 
lu_ref_init(&lock->l_reference); @@ -664,12 +684,19 @@ static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, * discard dirty data, rather than writing back. */ if (ldlm_is_ast_discard_data(new)) ldlm_set_discard_data(lock); - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, work_list); - LDLM_LOCK_GET(lock); - LASSERT(lock->l_blocking_lock == NULL); - lock->l_blocking_lock = LDLM_LOCK_GET(new); - } + + /* Lock can be converted from a blocking state back to granted + * after lock convert or COS downgrade but still be in an + * older bl_list because it is controlled only by + * ldlm_work_bl_ast_lock(), let it be processed there. + */ + if (list_empty(&lock->l_bl_ast)) { + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + } + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } } /** @@ -867,7 +894,8 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) } else if (ns_is_client(ns) && !lock->l_readers && !lock->l_writers && !ldlm_is_no_lru(lock) && - !ldlm_is_bl_ast(lock)) { + !ldlm_is_bl_ast(lock) && + !ldlm_is_converting(lock)) { LDLM_DEBUG(lock, "add lock into lru list"); @@ -1071,16 +1099,14 @@ static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, * Add a lock to granted list on a resource maintaining skiplist * correctness. */ -static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) { - struct sl_insert_point prev; - ENTRY; + struct sl_insert_point prev; - LASSERT(lock->l_req_mode == lock->l_granted_mode); + LASSERT(ldlm_is_granted(lock)); - search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); - ldlm_granted_list_add_lock(lock, &prev); - EXIT; + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); } /** @@ -1090,7 +1116,6 @@ static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) * NOTE: called by * - ldlm_lock_enqueue * - ldlm_reprocess_queue - * - ldlm_lock_convert * * must be called with lr_lock held */ @@ -1131,18 +1156,6 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) EXIT; } -/** - * Describe the overlap between two locks. itree_overlap_cb data. - */ -struct lock_match_data { - struct ldlm_lock *lmd_old; - struct ldlm_lock *lmd_lock; - enum ldlm_mode *lmd_mode; - union ldlm_policy_data *lmd_policy; - __u64 lmd_flags; - int lmd_unref; -}; - /** * Check if the given @lock meets the criteria for a match. * A reference on the lock is taken if matched. @@ -1150,10 +1163,10 @@ struct lock_match_data { * \param lock test-against this lock * \param data parameters */ -static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) +static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data) { union ldlm_policy_data *lpol = &lock->l_policy_data; - enum ldlm_mode match; + enum ldlm_mode match = LCK_MINMODE; if (lock == data->lmd_old) return INTERVAL_ITER_STOP; @@ -1178,6 +1191,17 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) if (!(lock->l_req_mode & *data->lmd_mode)) return INTERVAL_ITER_CONT; + + /* When we search for ast_data, we are not doing a traditional match, + * so we don't worry about IBITS or extent matching. 
+ */ + if (data->lmd_has_ast_data) { + if (!lock->l_ast_data) + return INTERVAL_ITER_CONT; + + goto matched; + } + match = lock->l_req_mode; switch (lock->l_resource->lr_type) { @@ -1211,6 +1235,11 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) return INTERVAL_ITER_CONT; + /* Filter locks by skipping flags */ + if (data->lmd_skip_flags & lock->l_flags) + return INTERVAL_ITER_CONT; + +matched: if (data->lmd_flags & LDLM_FL_TEST_LOCK) { LDLM_LOCK_GET(lock); ldlm_lock_touch_in_lru(lock); @@ -1227,7 +1256,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) static unsigned int itree_overlap_cb(struct interval_node *in, void *args) { struct ldlm_interval *node = to_ldlm_interval(in); - struct lock_match_data *data = args; + struct ldlm_match_data *data = args; struct ldlm_lock *lock; int rc; @@ -1247,8 +1276,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args) * * \retval a referenced lock or NULL. */ -static struct ldlm_lock *search_itree(struct ldlm_resource *res, - struct lock_match_data *data) +struct ldlm_lock *search_itree(struct ldlm_resource *res, + struct ldlm_match_data *data) { struct interval_node_extent ext = { .start = data->lmd_policy->l_extent.start, @@ -1256,6 +1285,8 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res, }; int idx; + data->lmd_lock = NULL; + for (idx = 0; idx < LCK_MODE_NUM; idx++) { struct ldlm_interval_tree *tree = &res->lr_itree[idx]; @@ -1267,9 +1298,13 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res, interval_search(tree->lit_root, &ext, itree_overlap_cb, data); + if (data->lmd_lock) + return data->lmd_lock; } - return data->lmd_lock; + + return NULL; } +EXPORT_SYMBOL(search_itree); /** @@ -1281,16 +1316,19 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res, * \retval a referenced lock or NULL. */ static struct ldlm_lock *search_queue(struct list_head *queue, - struct lock_match_data *data) + struct ldlm_match_data *data) { struct ldlm_lock *lock; int rc; + data->lmd_lock = NULL; + list_for_each_entry(lock, queue, l_res_link) { rc = lock_matches(lock, data); if (rc == INTERVAL_ITER_STOP) return data->lmd_lock; } + return NULL; } @@ -1366,24 +1404,28 @@ EXPORT_SYMBOL(ldlm_lock_allow_match); * keep caller code unchanged), the context failure will be discovered by * caller sometime later. 
*/ -enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh, int unref) -{ - struct lock_match_data data = { - .lmd_old = NULL, - .lmd_lock = NULL, - .lmd_mode = &mode, - .lmd_policy = policy, - .lmd_flags = flags, - .lmd_unref = unref, +enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns, + __u64 flags, __u64 skip_flags, + const struct ldlm_res_id *res_id, + enum ldlm_type type, + union ldlm_policy_data *policy, + enum ldlm_mode mode, + struct lustre_handle *lockh, int unref) +{ + struct ldlm_match_data data = { + .lmd_old = NULL, + .lmd_lock = NULL, + .lmd_mode = &mode, + .lmd_policy = policy, + .lmd_flags = flags, + .lmd_skip_flags = skip_flags, + .lmd_unref = unref, + .lmd_has_ast_data = false, }; struct ldlm_resource *res; struct ldlm_lock *lock; - int rc = 0; + int matched; + ENTRY; if (ns == NULL) { @@ -1404,101 +1446,78 @@ enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, LDLM_RESOURCE_ADDREF(res); lock_res(res); - if (res->lr_type == LDLM_EXTENT) lock = search_itree(res, &data); else lock = search_queue(&res->lr_granted, &data); - if (lock != NULL) - GOTO(out, rc = 1); - if (flags & LDLM_FL_BLOCK_GRANTED) - GOTO(out, rc = 0); - lock = search_queue(&res->lr_converting, &data); - if (lock != NULL) - GOTO(out, rc = 1); - lock = search_queue(&res->lr_waiting, &data); - if (lock != NULL) - GOTO(out, rc = 1); - - EXIT; - out: - unlock_res(res); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); + if (!lock && !(flags & LDLM_FL_BLOCK_GRANTED)) + lock = search_queue(&res->lr_waiting, &data); + matched = lock ? mode : 0; + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); - if (lock) { - ldlm_lock2handle(lock, lockh); - if ((flags & LDLM_FL_LVB_READY) && + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && (!ldlm_is_lvb_ready(lock))) { __u64 wait_flags = LDLM_FL_LVB_READY | LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; - struct l_wait_info lwi; - if (lock->l_completion_ast) { - int err = lock->l_completion_ast(lock, - LDLM_FL_WAIT_NOREPROC, - NULL); - if (err) { - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else - ldlm_lock_decref_internal(lock, - mode); - rc = 0; - goto out2; - } - } + struct l_wait_info lwi; + + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) + GOTO(out_fail_match, matched = 0); + } - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), - NULL, LWI_ON_SIGNAL_NOOP, NULL); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ - l_wait_event(lock->l_waitq, - lock->l_flags & wait_flags, + l_wait_event(lock->l_waitq, lock->l_flags & wait_flags, &lwi); - if (!ldlm_is_lvb_ready(lock)) { - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else - ldlm_lock_decref_internal(lock, mode); - rc = 0; - } - } - } - out2: - if (rc) { - LDLM_DEBUG(lock, "matched (%llu %llu)", - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] : policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
- res_id->name[3] : policy->l_extent.end); - - /* check user's security context */ - if (lock->l_conn_export && - sptlrpc_import_check_ctx( - class_exp2cliimp(lock->l_conn_export))) { - if (!(flags & LDLM_FL_TEST_LOCK)) - ldlm_lock_decref_internal(lock, mode); - rc = 0; - } + if (!ldlm_is_lvb_ready(lock)) + GOTO(out_fail_match, matched = 0); + } - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) + GOTO(out_fail_match, matched = 0); - } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ - LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + LDLM_DEBUG(lock, "matched (%llu %llu)", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + +out_fail_match: + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else if (!matched) + ldlm_lock_decref_internal(lock, mode); + } + + /* less verbose for test-only */ + if (!matched && !(flags & LDLM_FL_TEST_LOCK)) { + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " "%llu/%llu (%llu %llu)", ns, - type, mode, res_id->name[0], res_id->name[1], - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] :policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[3] : policy->l_extent.end); - } + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } if (data.lmd_old != NULL) LDLM_LOCK_PUT(data.lmd_old); - return rc ? mode : 0; + return matched; } -EXPORT_SYMBOL(ldlm_lock_match); +EXPORT_SYMBOL(ldlm_lock_match_with_skip); enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, __u64 *bits) @@ -1669,11 +1688,18 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_glimpse_ast = cbs->lcs_glimpse; } - lock->l_tree_node = NULL; - /* if this is the extent lock, allocate the interval tree node */ - if (type == LDLM_EXTENT) - if (ldlm_interval_alloc(lock) == NULL) - GOTO(out, rc = -ENOMEM); + switch (type) { + case LDLM_EXTENT: + rc = ldlm_extent_alloc_lock(lock); + break; + case LDLM_IBITS: + rc = ldlm_inodebits_alloc_lock(lock); + break; + default: + rc = 0; + } + if (rc) + GOTO(out, rc); if (lvb_len) { lock->l_lvb_len = lvb_len; @@ -1694,6 +1720,30 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, RETURN(ERR_PTR(rc)); } +#ifdef HAVE_SERVER_SUPPORT +static enum ldlm_error ldlm_lock_enqueue_helper(struct ldlm_lock *lock, + __u64 *flags) +{ + struct ldlm_resource *res = lock->l_resource; + enum ldlm_error rc = ELDLM_OK; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + ldlm_processing_policy policy; + + ENTRY; + + policy = ldlm_get_processing_policy(res); + policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, &rpc_list); + if (rc == ELDLM_OK && lock->l_granted_mode != lock->l_req_mode && + res->lr_type != LDLM_FLOCK) + rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list); + + if (!list_empty(&rpc_list)) + ldlm_discard_bl_list(&rpc_list); + + RETURN(rc); +} +#endif + /** * Enqueue (request) a lock. * @@ -1704,16 +1754,14 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, * set, skip all the enqueueing and delegate lock processing to intent policy * function. 
*/ -enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, +enum ldlm_error ldlm_lock_enqueue(const struct lu_env *env, + struct ldlm_namespace *ns, struct ldlm_lock **lockp, void *cookie, __u64 *flags) { struct ldlm_lock *lock = *lockp; struct ldlm_resource *res = lock->l_resource; int local = ns_is_client(ldlm_res_to_ns(res)); -#ifdef HAVE_SERVER_SUPPORT - ldlm_processing_policy policy; -#endif enum ldlm_error rc = ELDLM_OK; struct ldlm_interval *node = NULL; ENTRY; @@ -1721,8 +1769,8 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { - rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, - NULL); + rc = ns->ns_policy(env, ns, lockp, cookie, lock->l_req_mode, + *flags, NULL); if (rc == ELDLM_LOCK_REPLACED) { /* The lock that was returned has already been granted, * and placed into lockp. If it's not the same as the @@ -1735,7 +1783,7 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, *flags |= LDLM_FL_LOCK_CHANGED; RETURN(0); } else if (rc != ELDLM_OK && - lock->l_req_mode == lock->l_granted_mode) { + ldlm_is_granted(lock)) { LASSERT(*flags & LDLM_FL_RESENT); /* It may happen that ns_policy returns an error in * resend case, object may be unlinked or just some @@ -1758,7 +1806,7 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, * Take NO_TIMEOUT from the lock as it is inherited through * LDLM_FL_INHERIT_MASK */ *flags |= LDLM_FL_LOCK_CHANGED; - if (lock->l_req_mode != lock->l_granted_mode) + if (!ldlm_is_granted(lock)) *flags |= LDLM_FL_BLOCK_GRANTED; *flags |= lock->l_flags & LDLM_FL_NO_TIMEOUT; RETURN(ELDLM_OK); @@ -1771,8 +1819,8 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); - lock_res_and_lock(lock); - if (local && lock->l_req_mode == lock->l_granted_mode) { + lock_res_and_lock(lock); + if (local && ldlm_is_granted(lock)) { /* The server returned a blocked lock, but it was granted * before we got a chance to actually enqueue it. We don't * need to do anything else. */ @@ -1813,33 +1861,27 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, * more or less trusting the clients not to lie. * * FIXME (bug 268): Detect obvious lies by checking compatibility in - * granted/converting queues. */ + * granted queue. 
*/ if (local) { - if (*flags & LDLM_FL_BLOCK_CONV) - ldlm_resource_add_lock(res, &res->lr_converting, lock); - else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) - ldlm_resource_add_lock(res, &res->lr_waiting, lock); - else - ldlm_grant_lock(lock, NULL); + if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); GOTO(out, rc = ELDLM_OK); #ifdef HAVE_SERVER_SUPPORT - } else if (*flags & LDLM_FL_REPLAY) { - if (*flags & LDLM_FL_BLOCK_CONV) { - ldlm_resource_add_lock(res, &res->lr_converting, lock); - GOTO(out, rc = ELDLM_OK); - } else if (*flags & LDLM_FL_BLOCK_WAIT) { - ldlm_resource_add_lock(res, &res->lr_waiting, lock); + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, &res->lr_waiting, lock); GOTO(out, rc = ELDLM_OK); - } else if (*flags & LDLM_FL_BLOCK_GRANTED) { - ldlm_grant_lock(lock, NULL); + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock, NULL); GOTO(out, rc = ELDLM_OK); - } - /* If no flags, fall through to normal enqueue path. */ - } + } + /* If no flags, fall through to normal enqueue path. */ + } - policy = ldlm_processing_policy_table[res->lr_type]; - policy(lock, flags, LDLM_PROCESS_ENQUEUE, &rc, NULL); - GOTO(out, rc); + rc = ldlm_lock_enqueue_helper(lock, flags); + GOTO(out, rc); #else } else { CERROR("This is client-side-only module, cannot handle " @@ -1864,31 +1906,42 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, */ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, struct list_head *work_list, - enum ldlm_process_intention intention) + enum ldlm_process_intention intention, + struct ldlm_lock *hint) { struct list_head *tmp, *pos; ldlm_processing_policy policy; __u64 flags; int rc = LDLM_ITER_CONTINUE; enum ldlm_error err; + struct list_head bl_ast_list = LIST_HEAD_INIT(bl_ast_list); + ENTRY; check_res_locked(res); - policy = ldlm_processing_policy_table[res->lr_type]; + policy = ldlm_get_processing_policy(res); LASSERT(policy); LASSERT(intention == LDLM_PROCESS_RESCAN || intention == LDLM_PROCESS_RECOVERY); +restart: list_for_each_safe(tmp, pos, queue) { struct ldlm_lock *pending; + struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); pending = list_entry(tmp, struct ldlm_lock, l_res_link); CDEBUG(D_INFO, "Reprocessing lock %p\n", pending); flags = 0; - rc = policy(pending, &flags, intention, &err, work_list); + rc = policy(pending, &flags, intention, &err, &rpc_list); + if (pending->l_granted_mode == pending->l_req_mode || + res->lr_type == LDLM_FLOCK) { + list_splice(&rpc_list, work_list); + } else { + list_splice(&rpc_list, &bl_ast_list); + } /* * When this is called from recovery done, we always want * to scan the whole list no matter what 'rc' is returned. @@ -1898,6 +1951,20 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, break; } + if (!list_empty(&bl_ast_list)) { + unlock_res(res); + + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &bl_ast_list, + LDLM_WORK_BL_AST); + + lock_res(res); + if (rc == -ERESTART) + GOTO(restart, rc); + } + + if (!list_empty(&bl_ast_list)) + ldlm_discard_bl_list(&bl_ast_list); + RETURN(intention == LDLM_PROCESS_RESCAN ? rc : LDLM_ITER_CONTINUE); } @@ -1908,7 +1975,6 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, * \param[in] lock The lock to be enqueued. * \param[out] flags Lock flags for the lock to be enqueued. 
* \param[in] rpc_list Conflicting locks list. - * \param[in] grant_flags extra flags when granting a lock. * * \retval -ERESTART: Some lock was instantly canceled while sending * blocking ASTs, caller needs to re-check conflicting @@ -1917,7 +1983,7 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue, * \reval 0: Lock is successfully added in waiting list. */ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, - struct list_head *rpc_list, __u64 grant_flags) + struct list_head *rpc_list) { struct ldlm_resource *res = lock->l_resource; int rc; @@ -1942,6 +2008,9 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, !ns_is_client(ldlm_res_to_ns(res))) class_fail_export(lock->l_export); + if (rc == -ERESTART) + ldlm_reprocess_all(res, NULL); + lock_res(res); if (rc == -ERESTART) { /* 15715: The lock was granted and destroyed after @@ -1953,7 +2022,7 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, RETURN(-EAGAIN); /* lock was granted while resource was unlocked. */ - if (lock->l_granted_mode == lock->l_req_mode) { + if (ldlm_is_granted(lock)) { /* bug 11300: if the lock has been granted, * break earlier because otherwise, we will go * to restart and ldlm_resource_unlink will be @@ -1961,12 +2030,10 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, * freed. Then we will fail at * ldlm_extent_add_lock() */ *flags &= ~LDLM_FL_BLOCKED_MASK; - RETURN(0); } - RETURN(rc); } - *flags |= (LDLM_FL_BLOCK_GRANTED | grant_flags); + *flags |= LDLM_FL_BLOCK_GRANTED; RETURN(0); } @@ -1979,27 +2046,21 @@ int ldlm_handle_conflict_lock(struct ldlm_lock *lock, __u64 *flags, */ void ldlm_discard_bl_list(struct list_head *bl_list) { - struct list_head *tmp, *pos; - ENTRY; + struct ldlm_lock *lock, *tmp; - list_for_each_safe(pos, tmp, bl_list) { - struct ldlm_lock *lock = - list_entry(pos, struct ldlm_lock, l_bl_ast); + ENTRY; + list_for_each_entry_safe(lock, tmp, bl_list, l_bl_ast) { + LASSERT(!list_empty(&lock->l_bl_ast)); list_del_init(&lock->l_bl_ast); - LASSERT(ldlm_is_ast_sent(lock)); ldlm_clear_ast_sent(lock); LASSERT(lock->l_bl_ast_run == 0); - LASSERT(lock->l_blocking_lock); - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; + ldlm_clear_blocking_lock(lock); LDLM_LOCK_RELEASE(lock); } EXIT; } -#endif - /** * Process a call to blocking AST callback for a lock in ast_work list */ @@ -2007,9 +2068,11 @@ static int ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) { struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_lock_desc d; - int rc; - struct ldlm_lock *lock; + struct ldlm_lock *lock; + struct ldlm_lock_desc d; + struct ldlm_bl_desc bld; + int rc; + ENTRY; if (list_empty(arg->list)) @@ -2017,66 +2080,49 @@ ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); - /* nobody should touch l_bl_ast */ + /* nobody should touch l_bl_ast but some locks in the list may become + * granted after lock convert or COS downgrade, these locks should be + * just skipped here and removed from the list. + */ lock_res_and_lock(lock); list_del_init(&lock->l_bl_ast); + /* lock is not blocking lock anymore, but was kept in the list because + * it can managed only here. 
+ */ + if (!ldlm_is_ast_sent(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + + LASSERT(lock->l_blocking_lock); + ldlm_lock2desc(lock->l_blocking_lock, &d); + /* copy blocking lock ibits in cancel_bits as well, + * new client may use them for lock convert and it is + * important to use new field to convert locks from + * new servers only + */ + d.l_policy_data.l_inodebits.cancel_bits = + lock->l_blocking_lock->l_policy_data.l_inodebits.bits; + + /* Blocking lock is being destroyed here but some information about it + * may be needed inside l_blocking_ast() function below, + * e.g. in mdt_blocking_ast(). So save needed data in bl_desc. + */ + bld.bl_same_client = lock->l_client_cookie == + lock->l_blocking_lock->l_client_cookie; + bld.bl_cos_incompat = ldlm_is_cos_incompat(lock->l_blocking_lock); + arg->bl_desc = &bld; + LASSERT(ldlm_is_ast_sent(lock)); LASSERT(lock->l_bl_ast_run == 0); - LASSERT(lock->l_blocking_lock); lock->l_bl_ast_run++; + ldlm_clear_blocking_lock(lock); unlock_res_and_lock(lock); - ldlm_lock2desc(lock->l_blocking_lock, &d); - rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; - LDLM_LOCK_RELEASE(lock); - RETURN(rc); -} - -/** - * Process a call to completion AST callback for a lock in ast_work list - */ -static int -ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - int rc = 0; - struct ldlm_lock *lock; - ldlm_completion_callback completion_callback; - ENTRY; - - if (list_empty(arg->list)) - RETURN(-ENOENT); - - lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); - - /* It's possible to receive a completion AST before we've set - * the l_completion_ast pointer: either because the AST arrived - * before the reply, or simply because there's a small race - * window between receiving the reply and finishing the local - * enqueue. (bug 842) - * - * This can't happen with the blocking_ast, however, because we - * will never call the local blocking_ast until we drop our - * reader/writer reference, which we won't do until we get the - * reply and finish enqueueing. 
*/ - - /* nobody should touch l_cp_ast */ - lock_res_and_lock(lock); - list_del_init(&lock->l_cp_ast); - LASSERT(ldlm_is_cp_reqd(lock)); - /* save l_completion_ast since it can be changed by - * mds_intent_policy(), see bug 14225 */ - completion_callback = lock->l_completion_ast; - ldlm_clear_cp_reqd(lock); - unlock_res_and_lock(lock); - - if (completion_callback != NULL) - rc = completion_callback(lock, 0, (void *)arg); LDLM_LOCK_RELEASE(lock); RETURN(rc); @@ -2141,12 +2187,60 @@ int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) rc = 1; LDLM_LOCK_RELEASE(lock); - - if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + if (gl_work->gl_flags & LDLM_GL_WORK_SLAB_ALLOCATED) + OBD_SLAB_FREE_PTR(gl_work, ldlm_glimpse_work_kmem); + else OBD_FREE_PTR(gl_work); RETURN(rc); } +#endif + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + int rc = 0; + + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. */ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(ldlm_is_cp_reqd(lock)); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + ldlm_clear_cp_reqd(lock); + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} /** * Process list of locks in need of ASTs being sent. @@ -2155,11 +2249,11 @@ int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) * one. 
*/ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - ldlm_desc_ast_t ast_type) + ldlm_desc_ast_t ast_type) { struct ldlm_cb_set_arg *arg; - set_producer_func work_ast_lock; - int rc; + set_producer_func work_ast_lock; + int rc; if (list_empty(rpc_list)) RETURN(0); @@ -2172,24 +2266,26 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, arg->list = rpc_list; switch (ast_type) { - case LDLM_WORK_BL_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_bl_ast_lock; - break; - case LDLM_WORK_CP_AST: - arg->type = LDLM_CP_CALLBACK; - work_ast_lock = ldlm_work_cp_ast_lock; - break; - case LDLM_WORK_REVOKE_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_revoke_ast_lock; - break; - case LDLM_WORK_GL_AST: - arg->type = LDLM_GL_CALLBACK; - work_ast_lock = ldlm_work_gl_ast_lock; - break; - default: - LBUG(); + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; +#ifdef HAVE_SERVER_SUPPORT + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; +#endif + default: + LBUG(); } /* We create a ptlrpc request set with flow control extension. @@ -2201,7 +2297,7 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, if (arg->set == NULL) GOTO(out, rc = -ENOMEM); - ptlrpc_set_wait(arg->set); + ptlrpc_set_wait(NULL, arg->set); ptlrpc_set_destroy(arg->set); rc = atomic_read(&arg->restart) ? -ERESTART : 0; @@ -2214,26 +2310,29 @@ int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, /** * Try to grant all waiting locks on a resource. * - * Calls ldlm_reprocess_queue on converting and waiting queues. + * Calls ldlm_reprocess_queue on waiting queue. * * Typically called after some resource locks are cancelled to see * if anything could be granted as a result of the cancellation. */ static void __ldlm_reprocess_all(struct ldlm_resource *res, - enum ldlm_process_intention intention) + enum ldlm_process_intention intention, + struct ldlm_lock *hint) { struct list_head rpc_list; #ifdef HAVE_SERVER_SUPPORT + ldlm_reprocessing_policy reprocess; struct obd_device *obd; - int rc; - ENTRY; + int rc; + + ENTRY; INIT_LIST_HEAD(&rpc_list); - /* Local lock trees don't get reprocessed. */ - if (ns_is_client(ldlm_res_to_ns(res))) { - EXIT; - return; - } + /* Local lock trees don't get reprocessed. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + EXIT; + return; + } /* Disable reprocess during lock replay stage but allow during * request replay stage. 
@@ -2244,35 +2343,32 @@ static void __ldlm_reprocess_all(struct ldlm_resource *res, RETURN_EXIT; restart: lock_res(res); - rc = ldlm_reprocess_queue(res, &res->lr_converting, &rpc_list, - intention); - if (rc == LDLM_ITER_CONTINUE) - ldlm_reprocess_queue(res, &res->lr_waiting, &rpc_list, - intention); + reprocess = ldlm_get_reprocessing_policy(res); + reprocess(res, &res->lr_waiting, &rpc_list, intention, hint); unlock_res(res); - rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, - LDLM_WORK_CP_AST); - if (rc == -ERESTART) { + rc = ldlm_run_ast_work(ldlm_res_to_ns(res), &rpc_list, + LDLM_WORK_CP_AST); + if (rc == -ERESTART) { LASSERT(list_empty(&rpc_list)); - goto restart; - } + goto restart; + } #else - ENTRY; + ENTRY; INIT_LIST_HEAD(&rpc_list); - if (!ns_is_client(ldlm_res_to_ns(res))) { - CERROR("This is client-side-only module, cannot handle " - "LDLM_NAMESPACE_SERVER resource type lock.\n"); - LBUG(); - } + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } #endif - EXIT; + EXIT; } -void ldlm_reprocess_all(struct ldlm_resource *res) +void ldlm_reprocess_all(struct ldlm_resource *res, struct ldlm_lock *hint) { - __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN); + __ldlm_reprocess_all(res, LDLM_PROCESS_RESCAN, hint); } EXPORT_SYMBOL(ldlm_reprocess_all); @@ -2282,7 +2378,7 @@ static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct ldlm_resource *res = cfs_hash_object(hs, hnode); /* This is only called once after recovery done. LU-8306. */ - __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY); + __ldlm_reprocess_all(res, LDLM_PROCESS_RECOVERY, NULL); return 0; } @@ -2364,6 +2460,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) * talking to me first. 
-phik */ if (lock->l_readers || lock->l_writers) { LDLM_ERROR(lock, "lock still has references"); + unlock_res_and_lock(lock); LBUG(); } @@ -2381,8 +2478,8 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) ldlm_resource_unlink_lock(lock); ldlm_lock_destroy_nolock(lock); - if (lock->l_granted_mode == lock->l_req_mode) - ldlm_pool_del(&ns->ns_pool, lock); + if (ldlm_is_granted(lock)) + ldlm_pool_del(&ns->ns_pool, lock); /* Make sure we will not be called again for same lock what is possible * if not to zero out lock->l_granted_mode */ @@ -2414,6 +2511,7 @@ int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) EXPORT_SYMBOL(ldlm_lock_set_data); struct export_cl_data { + const struct lu_env *ecl_env; struct obd_export *ecl_exp; int ecl_loop; }; @@ -2426,10 +2524,10 @@ static void ldlm_cancel_lock_for_export(struct obd_export *exp, res = ldlm_resource_getref(lock->l_resource); - ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lvbo_update(res, lock, NULL, 1); ldlm_lock_cancel(lock); if (!exp->exp_obd->obd_stopping) - ldlm_reprocess_all(res); + ldlm_reprocess_all(res, lock); ldlm_resource_putref(res); ecl->ecl_loop++; @@ -2466,10 +2564,17 @@ ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, */ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) { + struct lu_env env; struct export_cl_data ecl = { .ecl_exp = exp, .ecl_loop = 0, }; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; while (!list_empty(&exp->exp_bl_list)) { struct ldlm_lock *lock; @@ -2492,6 +2597,8 @@ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) LDLM_LOCK_RELEASE(lock); } + lu_env_fini(&env); + CDEBUG(D_DLMTRACE, "Export %p, canceled %d locks, " "left on hash table %d.\n", exp, ecl.ecl_loop, atomic_read(&exp->exp_lock_hash->hs_count)); @@ -2506,10 +2613,16 @@ int ldlm_export_cancel_blocked_locks(struct obd_export *exp) */ int ldlm_export_cancel_locks(struct obd_export *exp) { - struct export_cl_data ecl = { - .ecl_exp = exp, - .ecl_loop = 0, - }; + struct export_cl_data ecl; + struct lu_env env; + int rc; + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) + RETURN(rc); + ecl.ecl_env = &env; + ecl.ecl_exp = exp; + ecl.ecl_loop = 0; cfs_hash_for_each_empty(exp->exp_lock_hash, ldlm_cancel_locks_for_export_cb, &ecl); @@ -2523,26 +2636,35 @@ int ldlm_export_cancel_locks(struct obd_export *exp) exp->exp_obd->obd_stopping) ldlm_reprocess_recovery_done(exp->exp_obd->obd_namespace); + lu_env_fini(&env); + return ecl.ecl_loop; } /** - * Downgrade an exclusive lock. + * Downgrade an PW/EX lock to COS | CR mode. * - * A fast variant of ldlm_lock_convert for convertion of exclusive locks. The + * A lock mode convertion from PW/EX mode to less conflict mode. The * convertion may fail if lock was canceled before downgrade, but it doesn't * indicate any problem, because such lock has no reader or writer, and will * be released soon. - * Used by Commit on Sharing (COS) code. + * + * Used by Commit on Sharing (COS) code to force object changes commit in case + * of conflict. Converted lock is considered as new lock and all blocking AST + * things are cleared, so any pending or new blocked lock on that lock will + * cause new call to blocking_ast and force resource object commit. + * + * Also used by layout_change to replace EX lock to CR lock. 
* * \param lock A lock to convert * \param new_mode new lock mode */ -void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) +void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) { +#ifdef HAVE_SERVER_SUPPORT ENTRY; - LASSERT(new_mode == LCK_COS); + LASSERT(new_mode == LCK_COS || new_mode == LCK_CR); lock_res_and_lock(lock); @@ -2560,146 +2682,22 @@ void ldlm_lock_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode) * ldlm_grant_lock() called below. */ ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + /* Consider downgraded lock as a new lock and clear all states + * related to a previous blocking AST processing. + */ + ldlm_clear_blocking_data(lock); + lock->l_req_mode = new_mode; ldlm_grant_lock(lock, NULL); - unlock_res_and_lock(lock); - ldlm_reprocess_all(lock->l_resource); + ldlm_reprocess_all(lock->l_resource, lock); EXIT; -} -EXPORT_SYMBOL(ldlm_lock_downgrade); - -/** - * Attempt to convert already granted lock to a different mode. - * - * While lock conversion is not currently used, future client-side - * optimizations could take advantage of it to avoid discarding cached - * pages on a file. - */ -struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, - enum ldlm_mode new_mode, __u32 *flags) -{ - struct list_head rpc_list; - struct ldlm_resource *res; - struct ldlm_namespace *ns; - int granted = 0; -#ifdef HAVE_SERVER_SUPPORT - int old_mode; - struct sl_insert_point prev; -#endif - struct ldlm_interval *node; - ENTRY; - - INIT_LIST_HEAD(&rpc_list); - /* Just return if mode is unchanged. */ - if (new_mode == lock->l_granted_mode) { - *flags |= LDLM_FL_BLOCK_GRANTED; - RETURN(lock->l_resource); - } - - /* I can't check the type of lock here because the bitlock of lock - * is not held here, so do the allocation blindly. -jay */ - OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS); - if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ - RETURN(NULL); - - LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), - "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); - - lock_res_and_lock(lock); - - res = lock->l_resource; - ns = ldlm_res_to_ns(res); - -#ifdef HAVE_SERVER_SUPPORT - old_mode = lock->l_req_mode; #endif - lock->l_req_mode = new_mode; - if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { -#ifdef HAVE_SERVER_SUPPORT - /* remember the lock position where the lock might be - * added back to the granted list later and also - * remember the join mode for skiplist fixing. */ - prev.res_link = lock->l_res_link.prev; - prev.mode_link = lock->l_sl_mode.prev; - prev.policy_link = lock->l_sl_policy.prev; -#endif - ldlm_resource_unlink_lock(lock); - } else { - ldlm_resource_unlink_lock(lock); - if (res->lr_type == LDLM_EXTENT) { - /* FIXME: ugly code, I have to attach the lock to a - * interval node again since perhaps it will be granted - * soon */ - INIT_LIST_HEAD(&node->li_group); - ldlm_interval_attach(node, lock); - node = NULL; - } - } - - /* - * Remove old lock from the pool before adding the lock with new - * mode below in ->policy() - */ - ldlm_pool_del(&ns->ns_pool, lock); - - /* If this is a local resource, put it on the appropriate list. */ - if (ns_is_client(ldlm_res_to_ns(res))) { - if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { - ldlm_resource_add_lock(res, &res->lr_converting, lock); - } else { - /* This should never happen, because of the way the - * server handles conversions. 
*/ - LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", - *flags); - LBUG(); - - ldlm_grant_lock(lock, &rpc_list); - granted = 1; - /* FIXME: completion handling not with lr_lock held ! */ - if (lock->l_completion_ast) - lock->l_completion_ast(lock, 0, NULL); - } -#ifdef HAVE_SERVER_SUPPORT - } else { - int rc; - enum ldlm_error err; - __u64 pflags = 0; - ldlm_processing_policy policy; - - policy = ldlm_processing_policy_table[res->lr_type]; - rc = policy(lock, &pflags, LDLM_PROCESS_RESCAN, &err, - &rpc_list); - if (rc == LDLM_ITER_STOP) { - lock->l_req_mode = old_mode; - if (res->lr_type == LDLM_EXTENT) - ldlm_extent_add_lock(res, lock); - else - ldlm_granted_list_add_lock(lock, &prev); - - res = NULL; - } else { - *flags |= LDLM_FL_BLOCK_GRANTED; - granted = 1; - } - } -#else - } else { - CERROR("This is client-side-only module, cannot handle " - "LDLM_NAMESPACE_SERVER resource type lock.\n"); - LBUG(); - } -#endif - unlock_res_and_lock(lock); - - if (granted) - ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); - if (node) - OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); - RETURN(res); } +EXPORT_SYMBOL(ldlm_lock_mode_downgrade); /** * Print lock with lock handle \a lockh description into debug log. @@ -2749,17 +2747,17 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, va_start(args, fmt); if (exp && exp->exp_connection) { - nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); + nid = obd_export_nid2str(exp); } else if (exp && exp->exp_obd != NULL) { struct obd_import *imp = exp->exp_obd->u.cli.cl_import; - nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); + nid = obd_import_nid2str(imp); } if (resource == NULL) { libcfs_debug_vmsg2(msgdata, fmt, args, " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: \?\? rrc=\?\? type: \?\?\? 
flags: %#llx nid: %s " - "remote: %#llx expref: %d pid: %u timeout: %lu " + "remote: %#llx expref: %d pid: %u timeout: %lld " "lvb_type: %d\n", lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), @@ -2779,7 +2777,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] " "(req %llu->%llu) flags: %#llx nid: %s remote: " - "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", + "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -2802,7 +2800,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s pid: %d " "[%llu->%llu] flags: %#llx nid: %s " - "remote: %#llx expref: %d pid: %u timeout: %lu\n", + "remote: %#llx expref: %d pid: %u timeout: %lld\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -2822,9 +2820,9 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, case LDLM_IBITS: libcfs_debug_vmsg2(msgdata, fmt, args, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " - "res: "DLDLMRES" bits %#llx rrc: %d type: %s " + "res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s " "flags: %#llx nid: %s remote: %#llx expref: %d " - "pid: %u timeout: %lu lvb_type: %d\n", + "pid: %u timeout: %lld lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), @@ -2833,6 +2831,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, ldlm_lockname[lock->l_req_mode], PLDLMRES(resource), lock->l_policy_data.l_inodebits.bits, + lock->l_policy_data.l_inodebits.try_bits, atomic_read(&resource->lr_refcount), ldlm_typename[resource->lr_type], lock->l_flags, nid, lock->l_remote_handle.cookie, @@ -2846,7 +2845,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s " "res: "DLDLMRES" rrc: %d type: %s flags: %#llx " "nid: %s remote: %#llx expref: %d pid: %u " - "timeout: %lu lvb_type: %d\n", + "timeout: %lld lvb_type: %d\n", ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c index 465ffda035dbe..ac7a9910e4d45 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_lockd.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include "ldlm_internal.h" @@ -49,6 +49,11 @@ static int ldlm_num_threads; module_param(ldlm_num_threads, int, 0444); MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); +static unsigned int ldlm_cpu_bind = 1; +module_param(ldlm_cpu_bind, uint, 0444); +MODULE_PARM_DESC(ldlm_cpu_bind, + "bind DLM service threads to particular CPU partitions"); + static char *ldlm_cpts; module_param(ldlm_cpts, charp, 0444); MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); @@ -64,18 +69,16 @@ struct kset *ldlm_svc_kset; static struct ldlm_state *ldlm_state; -static inline cfs_time_t round_timeout(cfs_time_t timeout) -{ - return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); -} - -/* timeout for initial callback (AST) reply (bz10399) */ -static inline unsigned int ldlm_get_rq_timeout(void) +/* timeout for initial callback (AST) reply (bz10399) + * Due to having to send a 32 bit time value over the + * wire return it as timeout_t instead of time64_t + */ +static inline timeout_t ldlm_get_rq_timeout(void) { - /* Non-AT value */ - unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); + /* Non-AT value */ + timeout_t timeout = min(ldlm_timeout, obd_timeout / 3); - return timeout < 1 ? 1 : timeout; + return timeout < 1 ? 1 : timeout; } struct ldlm_bl_pool { @@ -133,7 +136,7 @@ static DEFINE_SPINLOCK(waiting_locks_spinlock); /* BH lock (timer) */ * All access to it should be under waiting_locks_spinlock. */ static LIST_HEAD(waiting_locks_list); -static void waiting_locks_callback(cfs_timer_cb_arg_t unused); +static void waiting_locks_callback(TIMER_DATA_TYPE unused); static CFS_DEFINE_TIMER(waiting_locks_timer, waiting_locks_callback, 0, 0); enum elt_state { @@ -147,6 +150,10 @@ static enum elt_state expired_lock_thread_state = ELT_STOPPED; static int expired_lock_dump; static LIST_HEAD(expired_lock_list); +static int ldlm_lock_busy(struct ldlm_lock *lock); +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout); +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds); + static inline int have_expired_locks(void) { int need_to_run; @@ -228,14 +235,30 @@ static int expired_lock_main(void *arg) export = class_export_lock_get(lock->l_export, lock); spin_unlock_bh(&waiting_locks_spinlock); - spin_lock_bh(&export->exp_bl_list_lock); - list_del_init(&lock->l_exp_list); - spin_unlock_bh(&export->exp_bl_list_lock); - - do_dump++; - class_fail_export(export); + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + lock->l_callback_timeout != 0 && /* not AST error */ + ldlm_lock_busy(lock)) { + LDLM_DEBUG(lock, "prolong the busy lock"); + lock_res_and_lock(lock); + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock) >> 1); + unlock_res_and_lock(lock); + } else { + spin_lock_bh(&export->exp_bl_list_lock); + list_del_init(&lock->l_exp_list); + spin_unlock_bh(&export->exp_bl_list_lock); + + LDLM_ERROR(lock, + "lock callback timer expired after %llds: evicting client at %s ", + ktime_get_real_seconds() - + lock->l_blast_sent, + obd_export_nid2str(export)); + ldlm_lock_to_ns(lock)->ns_timeouts++; + do_dump++; + class_fail_export(export); + } class_export_lock_put(export, lock); - /* release extra ref grabbed by ldlm_add_waiting_lock() * or ldlm_failed_ast() */ LDLM_LOCK_RELEASE(lock); @@ -258,9 +281,6 @@ static int 
expired_lock_main(void *arg) RETURN(0); } -static int ldlm_add_waiting_lock(struct ldlm_lock *lock); -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds); - /** * Check if there is a request in the export request list * which prevents the lock canceling. @@ -274,7 +294,7 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) if (lock->l_export == NULL) return 0; - spin_lock_bh(&lock->l_export->exp_rpc_lock); + spin_lock(&lock->l_export->exp_rpc_lock); list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, rq_exp_list) { if (req->rq_ops->hpreq_lock_match) { @@ -283,12 +303,12 @@ static int ldlm_lock_busy(struct ldlm_lock *lock) break; } } - spin_unlock_bh(&lock->l_export->exp_rpc_lock); + spin_unlock(&lock->l_export->exp_rpc_lock); RETURN(match); } /* This is called from within a timer interrupt and cannot schedule */ -static void waiting_locks_callback(cfs_timer_cb_arg_t unused) +static void waiting_locks_callback(TIMER_DATA_TYPE unused) { struct ldlm_lock *lock; int need_dump = 0; @@ -296,42 +316,10 @@ static void waiting_locks_callback(cfs_timer_cb_arg_t unused) spin_lock_bh(&waiting_locks_spinlock); while (!list_empty(&waiting_locks_list)) { lock = list_entry(waiting_locks_list.next, struct ldlm_lock, - l_pending_chain); - if (cfs_time_after(lock->l_callback_timeout, - cfs_time_current()) || - (lock->l_req_mode == LCK_GROUP)) - break; - - /* Check if we need to prolong timeout */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && - ldlm_lock_busy(lock)) { - int cont = 1; - - if (lock->l_pending_chain.next == &waiting_locks_list) - cont = 0; - - LDLM_LOCK_GET(lock); - - spin_unlock_bh(&waiting_locks_spinlock); - LDLM_DEBUG(lock, "prolong the busy lock"); - ldlm_refresh_waiting_lock(lock, - ldlm_bl_timeout(lock) >> 1); - spin_lock_bh(&waiting_locks_spinlock); - - if (!cont) { - LDLM_LOCK_RELEASE(lock); - break; - } - - LDLM_LOCK_RELEASE(lock); - continue; - } - ldlm_lock_to_ns(lock)->ns_timeouts++; - LDLM_ERROR(lock, "lock callback timer expired after %llds: " - "evicting client at %s ", - ktime_get_real_seconds() - lock->l_blast_sent, - libcfs_nid2str( - lock->l_export->exp_connection->c_peer.nid)); + l_pending_chain); + if (lock->l_callback_timeout > ktime_get_seconds() || + lock->l_req_mode == LCK_GROUP) + break; /* no needs to take an extra ref on the lock since it was in * the waiting_locks_list and ldlm_add_waiting_lock() @@ -348,17 +336,18 @@ static void waiting_locks_callback(cfs_timer_cb_arg_t unused) wake_up(&expired_lock_wait_queue); } - /* - * Make sure the timer will fire again if we have any locks - * left. - */ + /* + * Make sure the timer will fire again if we have any locks + * left. + */ if (!list_empty(&waiting_locks_list)) { - cfs_time_t timeout_rounded; + unsigned long timeout_jiffies; + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, - l_pending_chain); - timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout); - mod_timer(&waiting_locks_timer, timeout_rounded); - } + l_pending_chain); + timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); + mod_timer(&waiting_locks_timer, timeout_jiffies); + } spin_unlock_bh(&waiting_locks_spinlock); } @@ -374,10 +363,10 @@ static void waiting_locks_callback(cfs_timer_cb_arg_t unused) * * Called with the namespace lock held. 
*/ -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds) { - cfs_time_t timeout; - cfs_time_t timeout_rounded; + unsigned long timeout_jiffies; + time64_t timeout; if (!list_empty(&lock->l_pending_chain)) return 0; @@ -386,28 +375,29 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) seconds = 1; - timeout = cfs_time_shift(seconds); - if (likely(cfs_time_after(timeout, lock->l_callback_timeout))) + timeout = ktime_get_seconds() + seconds; + if (likely(timeout > lock->l_callback_timeout)) lock->l_callback_timeout = timeout; - timeout_rounded = round_timeout(lock->l_callback_timeout); + timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout); - if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) || - !timer_pending(&waiting_locks_timer)) { - mod_timer(&waiting_locks_timer, timeout_rounded); - } - /* if the new lock has a shorter timeout than something earlier on - the list, we'll wait the longer amount of time; no big deal. */ - /* FIFO */ + if (time_before(timeout_jiffies, waiting_locks_timer.expires) || + !timer_pending(&waiting_locks_timer)) + mod_timer(&waiting_locks_timer, timeout_jiffies); + + /* if the new lock has a shorter timeout than something earlier on + * the list, we'll wait the longer amount of time; no big deal. + */ + /* FIFO */ list_add_tail(&lock->l_pending_chain, &waiting_locks_list); - return 1; + return 1; } static void ldlm_add_blocked_lock(struct ldlm_lock *lock) { spin_lock_bh(&lock->l_export->exp_bl_list_lock); if (list_empty(&lock->l_exp_list)) { - if (lock->l_granted_mode != lock->l_req_mode) + if (!ldlm_is_granted(lock)) list_add_tail(&lock->l_exp_list, &lock->l_export->exp_bl_list); else @@ -425,10 +415,9 @@ static void ldlm_add_blocked_lock(struct ldlm_lock *lock) obd_stale_export_adjust(lock->l_export); } -static int ldlm_add_waiting_lock(struct ldlm_lock *lock) +static int ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t timeout) { int ret; - int timeout = ldlm_bl_timeout(lock); /* NB: must be called with hold of lock_res_and_lock() */ LASSERT(ldlm_is_res_locked(lock)); @@ -447,12 +436,12 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) } if (ldlm_is_destroyed(lock)) { - static cfs_time_t next; + static time64_t next; spin_unlock_bh(&waiting_locks_spinlock); LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)"); - if (cfs_time_after(cfs_time_current(), next)) { - next = cfs_time_shift(14400); + if (ktime_get_seconds() > next) { + next = ktime_get_seconds() + 14400; libcfs_debug_dumpstack(NULL); } return 0; @@ -471,7 +460,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) if (ret) ldlm_add_blocked_lock(lock); - LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)", + LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)", ret == 0 ? "not re-" : "", timeout, AT_OFF ? "off" : "on"); return ret; @@ -501,10 +490,11 @@ static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) del_timer(&waiting_locks_timer); } else { struct ldlm_lock *next; + next = list_entry(list_next, struct ldlm_lock, - l_pending_chain); + l_pending_chain); mod_timer(&waiting_locks_timer, - round_timeout(next->l_callback_timeout)); + cfs_time_seconds(next->l_callback_timeout)); } } list_del_init(&lock->l_pending_chain); @@ -547,7 +537,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) * * Called with namespace lock held. 
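
These hunks replace jiffies-based cfs_time_t arithmetic with absolute deadlines in seconds and keep one timer armed for the head of a FIFO list. Below is a simplified user-space model of that bookkeeping, using time() in place of ktime_get_seconds() and invented names; it is an illustration, not code from the patch:

#include <stdio.h>
#include <time.h>

/* One pending entry per lock: absolute deadline in seconds, FIFO order. */
struct pending {
        struct pending *next;
        time_t deadline;
};

static struct pending *pending_head;

/* Queue an entry with a relative timeout.  The deadline is only ever
 * extended, and the entry goes to the tail, so an entry with a shorter
 * timeout than an earlier one simply waits a little longer. */
static int add_waiting(struct pending *p, time_t seconds)
{
        struct pending **pp = &pending_head;
        time_t deadline = time(NULL) + seconds;

        if (deadline > p->deadline)
                p->deadline = deadline;
        while (*pp)
                pp = &(*pp)->next;
        p->next = NULL;
        *pp = p;
        return pending_head == p;       /* caller must (re)arm the timer */
}

/* Timer body: expire every entry whose absolute deadline has passed;
 * the caller then re-arms the timer for the new head, if any. */
static void timer_fired(void)
{
        time_t now = time(NULL);

        while (pending_head && pending_head->deadline <= now) {
                printf("expired %p\n", (void *)pending_head);
                pending_head = pending_head->next;
        }
}

int main(void)
{
        struct pending p = { NULL, 0 };

        if (add_waiting(&p, 0))
                timer_fired();
        return 0;
}

The sketch leaves out the busy-lock case, where the patch re-queues the entry with half of ldlm_bl_timeout() instead of expiring it.
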
*/ -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) { if (lock->l_export == NULL) { /* We don't have a "waiting locks list" on clients. */ @@ -587,7 +577,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) RETURN(0); } -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout) { RETURN(0); } @@ -605,9 +595,9 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) * * \retval timeout in seconds to wait for the client reply */ -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock) +time64_t ldlm_bl_timeout(struct ldlm_lock *lock) { - unsigned int timeout; + time64_t timeout; if (AT_OFF) return obd_timeout / 2; @@ -617,7 +607,7 @@ unsigned int ldlm_bl_timeout(struct ldlm_lock *lock) * It would be nice to have some kind of "early reply" mechanism for * lock callbacks too... */ timeout = at_get(&lock->l_export->exp_bl_lock_at); - return max(timeout + (timeout >> 1), ldlm_enqueue_min); + return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min); } EXPORT_SYMBOL(ldlm_bl_timeout); @@ -639,6 +629,7 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, /* the lock was not in any list, grab an extra ref before adding * the lock to the expired list */ LDLM_LOCK_GET(lock); + lock->l_callback_timeout = 0; /* differentiate it from expired locks */ list_add(&lock->l_pending_chain, &expired_lock_list); wake_up(&expired_lock_wait_queue); spin_unlock_bh(&waiting_locks_spinlock); @@ -654,14 +645,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, struct lnet_process_id peer = req->rq_import->imp_connection->c_peer; if (!req->rq_replied || (rc && rc != -EINVAL)) { - if (lock->l_export && lock->l_export->exp_libclient) { - LDLM_DEBUG(lock, - "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock", - ast_type, req, req->rq_xid, - libcfs_nid2str(peer.nid)); - ldlm_lock_cancel(lock); - rc = -ERESTART; - } else if (ldlm_is_cancel(lock)) { + if (ldlm_is_cancel(lock)) { LDLM_DEBUG(lock, "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)", ast_type, req, req->rq_xid, @@ -713,7 +697,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, /* update lvbo to return proper attributes. * see bug 23174 */ ldlm_resource_getref(res); - ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lvbo_update(res, lock, NULL, 1); ldlm_resource_putref(res); } ldlm_lock_cancel(lock); @@ -724,9 +708,9 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, } static int ldlm_cb_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc) + struct ptlrpc_request *req, void *args, int rc) { - struct ldlm_cb_async_args *ca = data; + struct ldlm_cb_async_args *ca = args; struct ldlm_lock *lock = ca->ca_lock; struct ldlm_cb_set_arg *arg = ca->ca_set_arg; ENTRY; @@ -744,15 +728,16 @@ static int ldlm_cb_interpret(const struct lu_env *env, * -ELDLM_NO_LOCK_DATA when inode is cleared. 
LU-274 */ if (unlikely(arg->gl_interpret_reply)) { - rc = arg->gl_interpret_reply(env, req, data, rc); + rc = arg->gl_interpret_reply(NULL, req, args, rc); } else if (rc == -ELDLM_NO_LOCK_DATA) { - LDLM_DEBUG(lock, "lost race - client has a lock but no " - "inode"); - ldlm_res_lvbo_update(lock->l_resource, NULL, 1); + LDLM_DEBUG(lock, + "lost race - client has a lock but no inode"); + ldlm_lvbo_update(lock->l_resource, lock, NULL, 1); } else if (rc != 0) { rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); } else { - rc = ldlm_res_lvbo_update(lock->l_resource, req, 1); + rc = ldlm_lvbo_update(lock->l_resource, + lock, req, 1); } break; case LDLM_BL_CALLBACK: @@ -780,8 +765,8 @@ static int ldlm_cb_interpret(const struct lu_env *env, static void ldlm_update_resend(struct ptlrpc_request *req, void *data) { - struct ldlm_cb_async_args *ca = data; - struct ldlm_lock *lock = ca->ca_lock; + struct ldlm_cb_async_args *ca = data; + struct ldlm_lock *lock = ca->ca_lock; ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); } @@ -821,7 +806,7 @@ static void ldlm_lock_reorder_req(struct ldlm_lock *lock) RETURN_EXIT; } - spin_lock_bh(&lock->l_export->exp_rpc_lock); + spin_lock(&lock->l_export->exp_rpc_lock); list_for_each_entry(req, &lock->l_export->exp_hp_rpcs, rq_exp_list) { /* Do not process requests that were not yet added to there @@ -835,7 +820,7 @@ static void ldlm_lock_reorder_req(struct ldlm_lock *lock) req->rq_ops->hpreq_lock_match(req, lock)) ptlrpc_nrs_req_hp_move(req); } - spin_unlock_bh(&lock->l_export->exp_rpc_lock); + spin_unlock(&lock->l_export->exp_rpc_lock); EXIT; } @@ -874,18 +859,18 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, ldlm_lock_reorder_req(lock); - req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, - &RQF_LDLM_BL_CALLBACK, - LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); - if (req == NULL) - RETURN(-ENOMEM); + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, + &RQF_LDLM_BL_CALLBACK, + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); + if (req == NULL) + RETURN(-ENOMEM); - CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); - ca = ptlrpc_req_async_args(req); - ca->ca_set_arg = arg; - ca->ca_lock = lock; + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; - req->rq_interpret_reply = ldlm_cb_interpret; + req->rq_interpret_reply = ldlm_cb_interpret; lock_res_and_lock(lock); if (ldlm_is_destroyed(lock)) { @@ -895,7 +880,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, RETURN(0); } - if (lock->l_granted_mode != lock->l_req_mode) { + if (!ldlm_is_granted(lock)) { /* this blocking AST will be communicated as part of the * completion AST instead */ ldlm_add_blocked_lock(lock); @@ -925,8 +910,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, req->rq_no_resend = 1; } else { - LASSERT(lock->l_granted_mode == lock->l_req_mode); - ldlm_add_waiting_lock(lock); + LASSERT(ldlm_is_granted(lock)); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); unlock_res_and_lock(lock); /* Do not resend after lock callback timeout */ @@ -990,26 +975,25 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) lvb_len = 0; req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); - rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } - 
CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); - ca = ptlrpc_req_async_args(req); - ca->ca_set_arg = arg; - ca->ca_lock = lock; + CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); + ca = ptlrpc_req_async_args(req); + ca->ca_set_arg = arg; + ca->ca_lock = lock; - req->rq_interpret_reply = ldlm_cb_interpret; - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + req->rq_interpret_reply = ldlm_cb_interpret; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; + body->lock_handle[0] = lock->l_remote_handle; body->lock_flags = ldlm_flags_to_wire(flags); ldlm_lock2desc(lock, &body->lock_desc); if (lvb_len > 0) { void *lvb = req_capsule_client_get(&req->rq_pill, &RMF_DLM_LVB); - lvb_len = ldlm_lvbo_fill(lock, lvb, &lvb_len); if (lvb_len < 0) { /* We still need to send the RPC to wake up the blocked @@ -1060,7 +1044,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) lock_res_and_lock(lock); } else { /* start the lock-timeout clock */ - ldlm_add_waiting_lock(lock); + ldlm_add_waiting_lock(lock, ldlm_bl_timeout(lock)); /* Do not resend after lock callback timeout */ req->rq_delay_limit = ldlm_bl_timeout(lock); req->rq_resend_cb = ldlm_update_resend; @@ -1098,7 +1082,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) if (arg->gl_desc != NULL) /* There is a glimpse descriptor to pack */ - req_fmt = &RQF_LDLM_GL_DESC_CALLBACK; + req_fmt = &RQF_LDLM_GL_CALLBACK_DESC; else req_fmt = &RQF_LDLM_GL_CALLBACK; @@ -1116,9 +1100,9 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) *desc = *arg->gl_desc; } - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; - ldlm_lock2desc(lock, &body->lock_desc); + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + ldlm_lock2desc(lock, &body->lock_desc); CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args)); ca = ptlrpc_req_async_args(req); @@ -1146,6 +1130,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) RETURN(rc); } +EXPORT_SYMBOL(ldlm_server_glimpse_ast); int ldlm_glimpse_locks(struct ldlm_resource *res, struct list_head *gl_work_list) @@ -1156,7 +1141,7 @@ int ldlm_glimpse_locks(struct ldlm_resource *res, rc = ldlm_run_ast_work(ldlm_res_to_ns(res), gl_work_list, LDLM_WORK_GL_AST); if (rc == -ERESTART) - ldlm_reprocess_all(res); + ldlm_reprocess_all(res, NULL); RETURN(rc); } @@ -1178,40 +1163,6 @@ struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req) } EXPORT_SYMBOL(ldlm_request_lock); -static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req, - struct lprocfs_stats *srv_stats) -{ - int lock_type = 0, op = 0; - - lock_type = dlm_req->lock_desc.l_resource.lr_type; - - switch (lock_type) { - case LDLM_PLAIN: - op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE; - break; - case LDLM_EXTENT: - if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) - op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE; - else - op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE; - break; - case LDLM_FLOCK: - op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE; - break; - case LDLM_IBITS: - op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE; - break; - default: - op = 0; - break; - } - - if (op) - lprocfs_counter_incr(srv_stats, op); - - return; -} - /** * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc * service threads to carry out client lock enqueueing requests. 
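
Several of these callbacks repeat the same pattern: assert at compile time that a small context fits into the request's async-args area, copy it there, and point the interpret hook at a handler that runs when the reply (or an error) comes back. Here is a self-contained sketch of that pattern with made-up types, not the ptlrpc API itself:

#include <stdio.h>
#include <string.h>

/* Invented request type with room for caller context and a completion hook. */
struct fake_request {
        unsigned char async_args[32];
        int (*interpret_reply)(struct fake_request *req, int rc);
};

struct cb_args {
        void *lock;
        void *set_arg;
};

static int cb_interpret(struct fake_request *req, int rc)
{
        struct cb_args *ca = (struct cb_args *)req->async_args;

        printf("reply for lock %p, rc %d\n", ca->lock, rc);
        return rc;
}

static void prepare_ast(struct fake_request *req, void *lock, void *set_arg)
{
        struct cb_args ca = { .lock = lock, .set_arg = set_arg };

        /* compile-time size check, playing the role of CLASSERT() */
        _Static_assert(sizeof(struct cb_args) <= sizeof(req->async_args),
                       "cb_args does not fit in async_args");
        memcpy(req->async_args, &ca, sizeof(ca));
        req->interpret_reply = cb_interpret;
}

int main(void)
{
        struct fake_request req;
        int lock;

        prepare_ast(&req, &lock, NULL);
        return req.interpret_reply(&req, 0);
}
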
@@ -1228,6 +1179,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, void *cookie = NULL; int rc = 0; struct ldlm_resource *res = NULL; + const struct lu_env *env = req->rq_svc_thread->t_env; ENTRY; LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); @@ -1237,7 +1189,9 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LASSERT(req->rq_export); - if (ptlrpc_req2svc(req)->srv_stats != NULL) + /* for intent enqueue the stat will be updated inside intent policy */ + if (ptlrpc_req2svc(req)->srv_stats != NULL && + !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT)) ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats); if (req->rq_export && req->rq_export->exp_nid_stats && @@ -1341,9 +1295,11 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, &lock->l_policy_data); if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) lock->l_req_extent = lock->l_policy_data.l_extent; + else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) + lock->l_policy_data.l_inodebits.try_bits = + dlm_req->lock_desc.l_policy_data.l_inodebits.try_bits; existing_lock: - if (flags & LDLM_FL_HAS_INTENT) { /* In this case, the reply buffer is allocated deep in * local_lock_enqueue by the policy function. */ @@ -1355,25 +1311,25 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, ldlm_lvbo_size(lock)); - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) - GOTO(out, rc = -ENOMEM); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - GOTO(out, rc); - } + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + } - err = ldlm_lock_enqueue(ns, &lock, cookie, &flags); + err = ldlm_lock_enqueue(env, ns, &lock, cookie, &flags); if (err) { if ((int)err < 0) rc = (int)err; GOTO(out, err); } - dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - ldlm_lock2desc(lock, &dlm_rep->lock_desc); - ldlm_lock2handle(lock, &dlm_rep->lock_handle); + ldlm_lock2desc(lock, &dlm_rep->lock_desc); + ldlm_lock2handle(lock, &dlm_rep->lock_handle); if (lock && lock->l_resource->lr_type == LDLM_EXTENT) OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_BL_EVICT, 6); @@ -1395,8 +1351,24 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export); rc = -ENOTCONN; } else if (ldlm_is_ast_sent(lock)) { + /* fill lock desc for possible lock convert */ + if (lock->l_blocking_lock && + lock->l_resource->lr_type == LDLM_IBITS) { + struct ldlm_lock *bl_lock = lock->l_blocking_lock; + struct ldlm_lock_desc *rep_desc = &dlm_rep->lock_desc; + + LDLM_DEBUG(lock, + "save blocking bits %llx in granted lock", + bl_lock->l_policy_data.l_inodebits.bits); + /* If lock is blocked then save blocking ibits + * in returned lock policy for the possible lock + * convert on a client. 
+ */ + rep_desc->l_policy_data.l_inodebits.cancel_bits = + bl_lock->l_policy_data.l_inodebits.bits; + } dlm_rep->lock_flags |= ldlm_flags_to_wire(LDLM_FL_AST_SENT); - if (lock->l_granted_mode == lock->l_req_mode) { + if (ldlm_is_granted(lock)) { /* * Only cancel lock if it was granted, because it would * be destroyed immediately and would never be granted @@ -1408,38 +1380,15 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, unlock_res_and_lock(lock); ldlm_lock_cancel(lock); lock_res_and_lock(lock); - } else - ldlm_add_waiting_lock(lock); - } - } - /* Make sure we never ever grant usual metadata locks to liblustre - clients */ - if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN || - dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) && - req->rq_export->exp_libclient) { - if (unlikely(!ldlm_is_cancel_on_block(lock) || - !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){ - CERROR("Granting sync lock to libclient. " - "req fl %d, rep fl %d, lock fl %#llx\n", - dlm_req->lock_flags, dlm_rep->lock_flags, - lock->l_flags); - LDLM_ERROR(lock, "sync lock"); - if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) { - struct ldlm_intent *it; - - it = req_capsule_client_get(&req->rq_pill, - &RMF_LDLM_INTENT); - if (it != NULL) { - CERROR("This is intent %s (%llu)\n", - ldlm_it2str(it->opc), it->opc); - } + } else { + ldlm_add_waiting_lock(lock, + ldlm_bl_timeout(lock)); } } } + unlock_res_and_lock(lock); - unlock_res_and_lock(lock); - - EXIT; + EXIT; out: req->rq_status = rc ?: err; /* return either error - bug 11190 */ if (!req->rq_packed_final) { @@ -1522,114 +1471,126 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, } } - if (!err && dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) - ldlm_reprocess_all(lock->l_resource); + if (!err && !ldlm_is_cbpending(lock) && + dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) + ldlm_reprocess_all(lock->l_resource, lock); - LDLM_LOCK_RELEASE(lock); - } + LDLM_LOCK_RELEASE(lock); + } - LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", - lock, rc); + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", + lock, rc); return rc; } -/** - * Old-style LDLM main entry point for server code enqueue. +/* Clear the blocking lock, the race is possible between ldlm_handle_convert0() + * and ldlm_work_bl_ast_lock(), so this is done under lock with check for NULL. */ -int ldlm_handle_enqueue(struct ptlrpc_request *req, - ldlm_completion_callback completion_callback, - ldlm_blocking_callback blocking_callback, - ldlm_glimpse_callback glimpse_callback) +void ldlm_clear_blocking_lock(struct ldlm_lock *lock) { - struct ldlm_request *dlm_req; - struct ldlm_callback_suite cbs = { - .lcs_completion = completion_callback, - .lcs_blocking = blocking_callback, - .lcs_glimpse = glimpse_callback - }; - int rc; + if (lock->l_blocking_lock) { + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + } +} - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req != NULL) { - rc = ldlm_handle_enqueue0(req->rq_export->exp_obd->obd_namespace, - req, dlm_req, &cbs); - } else { - rc = -EFAULT; - } - return rc; +/* A lock can be converted to new ibits or mode and should be considered + * as new lock. Clear all states related to a previous blocking AST + * processing so new conflicts will cause new blocking ASTs. + * + * This is used during lock convert below and lock downgrade to COS mode in + * ldlm_lock_mode_downgrade(). 
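
The convert path added in the hunks below keeps the lock and gives back only the conflicting inodebits instead of cancelling the whole lock, then clears the blocking-AST state so the converted lock is treated as new. A toy model in plain C, with invented field names and no locking, just to show the bit arithmetic:

#include <stdint.h>
#include <stdio.h>

struct ibits_lock {
        uint64_t bits;          /* currently granted inodebits */
        uint64_t cancel_bits;   /* bits the server asked to get back */
        int ast_sent;           /* blocking AST bookkeeping */
        int bl_ast_run;
};

/* Keep the lock with a reduced bit set (drop bits & ~new_bits) and clear
 * the blocking state so later conflicts raise a fresh blocking AST. */
static void convert_ibits(struct ibits_lock *lk, uint64_t new_bits)
{
        if (lk->bits == new_bits)
                return;         /* re-ordered convert: already done */

        lk->bits &= new_bits;
        lk->cancel_bits = 0;
        lk->ast_sent = 0;
        lk->bl_ast_run = 0;
}

int main(void)
{
        struct ibits_lock lk = { .bits = 0x1b, .cancel_bits = 0x18 };

        /* give back only what was asked for, keep the rest */
        convert_ibits(&lk, lk.bits & ~lk.cancel_bits);
        printf("granted bits now %#llx\n", (unsigned long long)lk.bits);
        return 0;
}
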
+ */ +void ldlm_clear_blocking_data(struct ldlm_lock *lock) +{ + ldlm_clear_ast_sent(lock); + lock->l_bl_ast_run = 0; + ldlm_clear_blocking_lock(lock); } /** * Main LDLM entry point for server code to process lock conversion requests. */ int ldlm_handle_convert0(struct ptlrpc_request *req, - const struct ldlm_request *dlm_req) + const struct ldlm_request *dlm_req) { - struct ldlm_reply *dlm_rep; - struct ldlm_lock *lock; - int rc; - ENTRY; + struct obd_export *exp = req->rq_export; + struct ldlm_reply *dlm_rep; + struct ldlm_lock *lock; + __u64 bits; + __u64 new_bits; + int rc; - if (req->rq_export && req->rq_export->exp_nid_stats && - req->rq_export->exp_nid_stats->nid_ldlm_stats) - lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC); + ENTRY; - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + if (exp && exp->exp_nid_stats && exp->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(exp->exp_nid_stats->nid_ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); - dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - dlm_rep->lock_flags = dlm_req->lock_flags; + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); - lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); - if (!lock) { - req->rq_status = LUSTRE_EINVAL; - } else { - void *res = NULL; + dlm_rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + dlm_rep->lock_flags = dlm_req->lock_flags; - LDLM_DEBUG(lock, "server-side convert handler START"); + lock = ldlm_handle2lock(&dlm_req->lock_handle[0]); + if (!lock) { + LDLM_DEBUG_NOLOCK("server lock is canceled already"); + req->rq_status = ELDLM_NO_LOCK_DATA; + RETURN(0); + } - res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, - &dlm_rep->lock_flags); - if (res) { - if (ldlm_del_waiting_lock(lock)) - LDLM_DEBUG(lock, "converted waiting lock"); - req->rq_status = 0; - } else { - req->rq_status = LUSTRE_EDEADLK; - } - } + LDLM_DEBUG(lock, "server-side convert handler START"); - if (lock) { - if (!req->rq_status) - ldlm_reprocess_all(lock->l_resource); - LDLM_DEBUG(lock, "server-side convert handler END"); - LDLM_LOCK_PUT(lock); - } else - LDLM_DEBUG_NOLOCK("server-side convert handler END"); + lock_res_and_lock(lock); + bits = lock->l_policy_data.l_inodebits.bits; + new_bits = dlm_req->lock_desc.l_policy_data.l_inodebits.bits; - RETURN(0); -} + if (ldlm_is_cancel(lock)) { + LDLM_DEBUG(lock, "convert on canceled lock!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = ELDLM_NO_LOCK_DATA); + } -/** - * Old-style main LDLM entry point for server code to process lock conversion - * requests. - */ -int ldlm_handle_convert(struct ptlrpc_request *req) -{ - int rc; - struct ldlm_request *dlm_req; + if (dlm_req->lock_desc.l_req_mode != lock->l_granted_mode) { + LDLM_ERROR(lock, "lock mode differs!"); + unlock_res_and_lock(lock); + GOTO(out_put, rc = -EPROTO); + } - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req != NULL) { - rc = ldlm_handle_convert0(req, dlm_req); - } else { - CERROR ("Can't unpack dlm_req\n"); - rc = -EFAULT; - } - return rc; + if (bits == new_bits) { + /* + * This can be valid situation if CONVERT RPCs are + * re-ordered. 
Just finish silently + */ + LDLM_DEBUG(lock, "lock is converted already!"); + unlock_res_and_lock(lock); + } else { + if (ldlm_is_waited(lock)) + ldlm_del_waiting_lock(lock); + + ldlm_clear_cbpending(lock); + lock->l_policy_data.l_inodebits.cancel_bits = 0; + ldlm_inodebits_drop(lock, bits & ~new_bits); + + ldlm_clear_blocking_data(lock); + unlock_res_and_lock(lock); + + ldlm_reprocess_all(lock->l_resource, NULL); + } + + dlm_rep->lock_handle = lock->l_remote_handle; + ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, + &dlm_rep->lock_desc.l_policy_data); + rc = ELDLM_OK; + EXIT; +out_put: + LDLM_DEBUG(lock, "server-side convert handler END, rc = %d", rc); + LDLM_LOCK_PUT(lock); + req->rq_status = rc; + return 0; } /** @@ -1642,14 +1603,22 @@ int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, int first, enum lustre_at_flags flags) { - struct ldlm_resource *res, *pres = NULL; - struct ldlm_lock *lock; - int i, count, done = 0; - ENTRY; + struct ldlm_resource *res, *pres = NULL; + struct ldlm_lock *lock; + int i, count, done = 0; + unsigned int size; - count = dlm_req->lock_count ? dlm_req->lock_count : 1; - if (first >= count) - RETURN(0); + ENTRY; + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(0); + + count = dlm_req->lock_count ? dlm_req->lock_count : 1; + if (first >= count) + RETURN(0); if (count == 1 && dlm_req->lock_handle[0].cookie == 0) RETURN(0); @@ -1676,20 +1645,24 @@ int ldlm_request_cancel(struct ptlrpc_request *req, /* This code is an optimization to only attempt lock * granting on the resource (that could be CPU-expensive) - * after we are done cancelling lock in that resource. */ - if (res != pres) { - if (pres != NULL) { - ldlm_reprocess_all(pres); - LDLM_RESOURCE_DELREF(pres); - ldlm_resource_putref(pres); - } - if (res != NULL) { - ldlm_resource_getref(res); - LDLM_RESOURCE_ADDREF(res); - ldlm_res_lvbo_update(res, NULL, 1); - } - pres = res; - } + * after we are done cancelling lock in that resource. 
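
The added size check above refuses to trust lock_count unless the received buffer is actually large enough to hold that many handles behind the fixed header. The same idea in a self-contained form, with an invented wire structure rather than struct ldlm_request:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Invented wire format: fixed header followed by handle_count handles. */
struct wire_req {
        uint32_t flags;
        uint32_t handle_count;
        uint64_t handles[1];    /* variable length on the wire */
};

/* Trust handle_count only if the buffer can really hold that many handles. */
static int handle_count_ok(const struct wire_req *req, size_t buf_size)
{
        size_t hdr = offsetof(struct wire_req, handles);

        if (buf_size <= hdr)
                return 0;
        return (buf_size - hdr) / sizeof(req->handles[0]) >= req->handle_count;
}

int main(void)
{
        struct wire_req req = { .flags = 0, .handle_count = 2, .handles = { 0 } };
        size_t one_handle = offsetof(struct wire_req, handles) + sizeof(uint64_t);

        /* pretend only one handle arrived although two were advertised */
        printf("count trusted: %d\n", handle_count_ok(&req, one_handle));
        return 0;
}

Doing the check before the loop means a malformed cancel request cannot steer the handler past the end of the received buffer.
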
+ */ + if (res != pres) { + if (pres != NULL) { + ldlm_reprocess_all(pres, NULL); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + if (res != NULL) { + ldlm_resource_getref(res); + LDLM_RESOURCE_ADDREF(res); + + if (!ldlm_is_discard_data(lock)) + ldlm_lvbo_update(res, lock, + NULL, 1); + } + pres = res; + } if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock) && lock->l_blast_sent != 0) { @@ -1699,16 +1672,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req, (s64)delay); at_measured(&lock->l_export->exp_bl_lock_at, delay); } - ldlm_lock_cancel(lock); - LDLM_LOCK_PUT(lock); - } - if (pres != NULL) { - ldlm_reprocess_all(pres); - LDLM_RESOURCE_DELREF(pres); - ldlm_resource_putref(pres); - } - LDLM_DEBUG_NOLOCK("server-side cancel handler END"); - RETURN(done); + ldlm_lock_cancel(lock); + LDLM_LOCK_PUT(lock); + } + if (pres != NULL) { + ldlm_reprocess_all(pres, NULL); + LDLM_RESOURCE_DELREF(pres); + ldlm_resource_putref(pres); + } + LDLM_DEBUG_NOLOCK("server-side cancel handler END"); + RETURN(done); } EXPORT_SYMBOL(ldlm_request_cancel); @@ -1729,14 +1702,18 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) RETURN(-EFAULT); } - if (req->rq_export && req->rq_export->exp_nid_stats && - req->rq_export->exp_nid_stats->nid_ldlm_stats) - lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC); + if (req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) < + offsetof(struct ldlm_request, lock_handle[1])) + RETURN(-EPROTO); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(rc); + if (req->rq_export && req->rq_export->exp_nid_stats && + req->rq_export->exp_nid_stats->nid_ldlm_stats) + lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + RETURN(rc); if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS)) req->rq_status = LUSTRE_ESTALE; @@ -1745,20 +1722,62 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) } #endif /* HAVE_SERVER_SUPPORT */ +/** + * Server may pass additional information about blocking lock. + * For IBITS locks it is conflicting bits which can be used for + * lock convert instead of cancel. + */ +void ldlm_bl_desc2lock(const struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + check_res_locked(lock->l_resource); + if (ns_is_client(ns) && ld && + (lock->l_resource->lr_type == LDLM_IBITS)) { + /* + * Lock description contains policy of blocking lock, + * and its cancel_bits is used to pass conflicting bits. + * NOTE: ld can be NULL or can be not NULL but zeroed if + * passed from ldlm_bl_thread_blwi(), check below used bits + * in ld to make sure it is valid description. + */ + if (ld->l_policy_data.l_inodebits.cancel_bits && + ldlm_res_eq(&ld->l_resource.lr_name, + &lock->l_resource->lr_name) && + !(ldlm_is_cbpending(lock) && + lock->l_policy_data.l_inodebits.cancel_bits == 0)) { + /* always combine conflicting ibits */ + lock->l_policy_data.l_inodebits.cancel_bits |= + ld->l_policy_data.l_inodebits.cancel_bits; + } else { + /* If cancel_bits are not obtained or + * if the lock is already CBPENDING and + * has no cancel_bits set + * - the full lock is to be cancelled + */ + lock->l_policy_data.l_inodebits.cancel_bits = 0; + } + } +} + /** * Callback handler for receiving incoming blocking ASTs. * * This can only happen on client side. 
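
ldlm_bl_desc2lock() above combines the conflicting bits reported by successive blocking ASTs and falls back to a full cancel when no bits were supplied or a full cancel is already pending. A small stand-alone illustration with invented structures; the resource-name comparison and locking are omitted:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical view of what a blocking AST tells the client about an
 * inodebits lock: which bits actually conflict on the server. */
struct bl_desc {
        uint64_t conflict_bits; /* 0 means "no detail, cancel the whole lock" */
};

struct client_lock {
        uint64_t cancel_bits;   /* accumulated bits to give back */
        int cbpending;
};

static void note_blocking(struct client_lock *lk, const struct bl_desc *ld)
{
        if (ld && ld->conflict_bits &&
            !(lk->cbpending && lk->cancel_bits == 0)) {
                /* combine: several ASTs may each name different bits */
                lk->cancel_bits |= ld->conflict_bits;
        } else {
                /* no detail, or a full cancel is already pending */
                lk->cancel_bits = 0;
        }
        lk->cbpending = 1;      /* lock is now pending cancel or convert */
}

int main(void)
{
        struct client_lock lk = { 0, 0 };
        struct bl_desc a = { .conflict_bits = 0x2 };
        struct bl_desc b = { .conflict_bits = 0x8 };

        note_blocking(&lk, &a);
        note_blocking(&lk, &b);
        printf("bits to give back: %#llx\n",
               (unsigned long long)lk.cancel_bits);
        return 0;
}
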
*/ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, struct ldlm_lock *lock) + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) { - int do_ast; - ENTRY; + int do_ast; - LDLM_DEBUG(lock, "client blocking AST callback handler"); + ENTRY; - lock_res_and_lock(lock); + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + + /* get extra information from desc if any */ + ldlm_bl_desc2lock(ld, lock); ldlm_set_cbpending(lock); if (ldlm_is_cancel_on_block(lock)) @@ -1783,12 +1802,26 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, EXIT; } +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + /** * Callback handler for receiving incoming completion ASTs. * * This only can happen on client side. */ -static void ldlm_handle_cp_callback(struct ptlrpc_request *req, +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, struct ldlm_namespace *ns, struct ldlm_request *dlm_req, struct ldlm_lock *lock) @@ -1802,11 +1835,14 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, INIT_LIST_HEAD(&ast_list); if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { - int to = cfs_time_seconds(1); + long to = cfs_time_seconds(1); + + ldlm_callback_reply(req, 0); + while (to > 0) { set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(to); - if (lock->l_granted_mode == lock->l_req_mode || + to = schedule_timeout(to); + if (ldlm_is_granted(lock) || ldlm_is_destroyed(lock)) break; } @@ -1832,8 +1868,29 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, } lock_res_and_lock(lock); + + if (!ldlm_res_eq(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name)) { + ldlm_resource_unlink_lock(lock); + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + lock_res_and_lock(lock); + } + + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + if (ldlm_is_destroyed(lock) || - lock->l_granted_mode == lock->l_req_mode) { + ldlm_is_granted(lock)) { /* bug 11300: the lock has already been granted */ unlock_res_and_lock(lock); LDLM_DEBUG(lock, "Double grant race happened"); @@ -1855,26 +1912,15 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, LDLM_DEBUG(lock, "completion AST, new policy data"); } - ldlm_resource_unlink_lock(lock); - if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, - &lock->l_resource->lr_name, - sizeof(lock->l_resource->lr_name)) != 0) { - unlock_res_and_lock(lock); - rc = ldlm_lock_change_resource(ns, lock, - &dlm_req->lock_desc.l_resource.lr_name); - if (rc < 0) { - LDLM_ERROR(lock, "Failed to allocate resource"); - GOTO(out, rc); - } - LDLM_DEBUG(lock, "completion AST, new resource"); - CERROR("change resource!\n"); - lock_res_and_lock(lock); - } + ldlm_resource_unlink_lock(lock); - if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { - /* BL_AST locks are not needed in LRU. - * Let ldlm_cancel_lru() be fast. */ - ldlm_lock_remove_from_lru(lock); + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* + * BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. 
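
The completion-AST hunk above now re-keys the lock to the resource name reported by the server before the grant state and policy data are updated, and bails out early on double grants. A compact model of that ordering, with invented types and no concurrency:

#include <stdio.h>
#include <string.h>

struct my_lock {
        char res_name[32];      /* resource the lock is currently filed under */
        int granted;
};

/* Completion handling: adopt the server's resource name first, then grant. */
static int complete_lock(struct my_lock *lk, const char *granted_res)
{
        if (lk->granted)
                return 0;       /* double grant race: already done */

        if (strcmp(lk->res_name, granted_res) != 0) {
                printf("moving lock from %s to %s\n", lk->res_name, granted_res);
                snprintf(lk->res_name, sizeof(lk->res_name), "%s", granted_res);
        }
        lk->granted = 1;
        return 1;
}

int main(void)
{
        struct my_lock lk = { .res_name = "parent-dir", .granted = 0 };

        return complete_lock(&lk, "child-file") ? 0 : 1;
}
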
+ */ + ldlm_lock_remove_from_lru(lock); + ldlm_bl_desc2lock(&dlm_req->lock_desc, lock); lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; LDLM_DEBUG(lock, "completion AST includes blocking AST"); } @@ -1911,6 +1957,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, wake_up(&lock->l_waitq); } LDLM_LOCK_RELEASE(lock); + + return 0; } /** @@ -1925,10 +1973,12 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, struct ldlm_request *dlm_req, struct ldlm_lock *lock) { - int rc = -ENOSYS; - ENTRY; + struct ldlm_lock_desc *ld = &dlm_req->lock_desc; + int rc = -ENOSYS; - LDLM_DEBUG(lock, "client glimpse AST callback handler"); + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); if (lock->l_glimpse_ast != NULL) rc = lock->l_glimpse_ast(lock, req); @@ -1945,10 +1995,17 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, !lock->l_readers && !lock->l_writers && ktime_after(ktime_get(), ktime_add(lock->l_last_used, - ktime_set(10, 0)))) { - unlock_res_and_lock(lock); - if (ldlm_bl_to_thread_lock(ns, NULL, lock)) - ldlm_handle_bl_callback(ns, NULL, lock); + ktime_set(ns->ns_dirty_age_limit, 0)))) { + unlock_res_and_lock(lock); + + /* For MDS glimpse it is always DOM lock, set corresponding + * cancel_bits to perform lock convert if needed + */ + if (lock->l_resource->lr_type == LDLM_IBITS) + ld->l_policy_data.l_inodebits.cancel_bits = + MDS_INODELOCK_DOM; + if (ldlm_bl_to_thread_lock(ns, ld, lock)) + ldlm_handle_bl_callback(ns, ld, lock); EXIT; return; @@ -1958,20 +2015,6 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, EXIT; } -static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) -{ - if (req->rq_no_reply) - return 0; - - req->rq_status = rc; - if (!req->rq_packed_final) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - return ptlrpc_reply(req); -} - static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, enum ldlm_cancel_flags cancel_flags) { @@ -2194,35 +2237,6 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) rc = ldlm_handle_setinfo(req); ldlm_callback_reply(req, rc); RETURN(0); - case LLOG_ORIGIN_HANDLE_CREATE: - req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE); - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_open(req); - ldlm_callback_reply(req, rc); - RETURN(0); - case LLOG_ORIGIN_HANDLE_NEXT_BLOCK: - req_capsule_set(&req->rq_pill, - &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_next_block(req); - ldlm_callback_reply(req, rc); - RETURN(0); - case LLOG_ORIGIN_HANDLE_READ_HEADER: - req_capsule_set(&req->rq_pill, - &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_read_header(req); - ldlm_callback_reply(req, rc); - RETURN(0); - case LLOG_ORIGIN_HANDLE_CLOSE: - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) - RETURN(0); - rc = llog_origin_handle_close(req); - ldlm_callback_reply(req, rc); - RETURN(0); default: CERROR("unknown opcode %u\n", lustre_msg_get_opc(req->rq_reqmsg)); @@ -2307,30 +2321,31 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) CDEBUG(D_INODE, "blocking ast\n"); req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); if (!ldlm_is_cancel_on_block(lock)) { - rc = ldlm_callback_reply(req, 0); - if (req->rq_no_reply || rc) - ldlm_callback_errmsg(req, "Normal process", rc, - &dlm_req->lock_handle[0]); - } - if 
(ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) - ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); - break; - case LDLM_CP_CALLBACK: - CDEBUG(D_INODE, "completion ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - ldlm_callback_reply(req, 0); - ldlm_handle_cp_callback(req, ns, dlm_req, lock); - break; - case LDLM_GL_CALLBACK: - CDEBUG(D_INODE, "glimpse ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); - ldlm_handle_gl_callback(req, ns, dlm_req, lock); - break; - default: - LBUG(); /* checked above */ - } + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } - RETURN(0); + RETURN(0); } #ifdef HAVE_SERVER_SUPPORT @@ -2341,145 +2356,169 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) */ static int ldlm_cancel_handler(struct ptlrpc_request *req) { - int rc; - ENTRY; - - /* Requests arrive in sender's byte order. The ptlrpc service - * handler has already checked and, if necessary, byte-swapped the - * incoming request message body, but I am responsible for the - * message buffers. */ - - req_capsule_init(&req->rq_pill, req, RCL_SERVER); + int rc; - if (req->rq_export == NULL) { - struct ldlm_request *dlm_req; - - CERROR("%s from %s arrived at %lu with bad export cookie " - "%llu\n", - ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), - libcfs_nid2str(req->rq_peer.nid), - req->rq_arrival_time.tv_sec, - lustre_msg_get_handle(req->rq_reqmsg)->cookie); - - if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - dlm_req = req_capsule_client_get(&req->rq_pill, - &RMF_DLM_REQ); - if (dlm_req != NULL) - ldlm_lock_dump_handle(D_ERROR, - &dlm_req->lock_handle[0]); - } - ldlm_callback_reply(req, -ENOTCONN); - RETURN(0); - } + ENTRY; - switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
*/ + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; + + CERROR("%s from %s arrived at %llu with bad export cookie %llu\n", + ll_opcode2str(lustre_msg_get_opc(req->rq_reqmsg)), + libcfs_nid2str(req->rq_peer.nid), + (unsigned long long)req->rq_arrival_time.tv_sec, + lustre_msg_get_handle(req->rq_reqmsg)->cookie); + + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_CANCEL) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + dlm_req = req_capsule_client_get(&req->rq_pill, + &RMF_DLM_REQ); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, + &dlm_req->lock_handle[0]); + } + ldlm_callback_reply(req, -ENOTCONN); + RETURN(0); + } - /* XXX FIXME move this back to mds/handler.c, bug 249 */ - case LDLM_CANCEL: - req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); - CDEBUG(D_INODE, "cancel\n"); + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + /* XXX FIXME move this back to mds/handler.c, bug 249 */ + case LDLM_CANCEL: + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + CDEBUG(D_INODE, "cancel\n"); if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) || CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND) || CFS_FAIL_CHECK(OBD_FAIL_LDLM_BL_EVICT)) RETURN(0); - rc = ldlm_handle_cancel(req); - if (rc) - break; - RETURN(0); - default: - CERROR("invalid opcode %d\n", - lustre_msg_get_opc(req->rq_reqmsg)); - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - ldlm_callback_reply(req, -EINVAL); - } + rc = ldlm_handle_cancel(req); + break; + case LDLM_CONVERT: + { + struct ldlm_request *dlm_req; - RETURN(0); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + CDEBUG(D_INODE, "convert\n"); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + CDEBUG(D_INFO, "bad request buffer for cancel\n"); + rc = ldlm_callback_reply(req, -EPROTO); + } else { + req->rq_status = ldlm_handle_convert0(req, dlm_req); + rc = ptlrpc_reply(req); + } + break; + } + default: + CERROR("invalid opcode %d\n", + lustre_msg_get_opc(req->rq_reqmsg)); + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + rc = ldlm_callback_reply(req, -EINVAL); + } + + RETURN(rc); } static int ldlm_cancel_hpreq_lock_match(struct ptlrpc_request *req, - struct ldlm_lock *lock) + struct ldlm_lock *lock) { - struct ldlm_request *dlm_req; - struct lustre_handle lockh; - int rc = 0; - int i; - ENTRY; - - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) - RETURN(0); + struct ldlm_request *dlm_req; + struct lustre_handle lockh; + int rc = 0; + int i; - ldlm_lock2handle(lock, &lockh); - for (i = 0; i < dlm_req->lock_count; i++) { - if (lustre_handle_equal(&dlm_req->lock_handle[i], - &lockh)) { - DEBUG_REQ(D_RPCTRACE, req, - "Prio raised by lock %#llx.", lockh.cookie); + ENTRY; - rc = 1; - break; - } - } + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(0); - RETURN(rc); + ldlm_lock2handle(lock, &lockh); + for (i = 0; i < dlm_req->lock_count; i++) { + if (lustre_handle_equal(&dlm_req->lock_handle[i], + &lockh)) { + DEBUG_REQ(D_RPCTRACE, req, + "Prio raised by lock %#llx.", lockh.cookie); + rc = 1; + break; + } + } + RETURN(rc); } static int ldlm_cancel_hpreq_check(struct ptlrpc_request *req) { - struct ldlm_request *dlm_req; - int rc = 0; - int i; - ENTRY; + struct ldlm_request *dlm_req; + int rc = 0; + int i; + unsigned int size; - /* no prolong in recovery */ - if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) - RETURN(0); + ENTRY; - dlm_req = 
req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (dlm_req == NULL) - RETURN(-EFAULT); + /* no prolong in recovery */ + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + RETURN(0); - for (i = 0; i < dlm_req->lock_count; i++) { - struct ldlm_lock *lock; + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) + RETURN(-EFAULT); - lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); - if (lock == NULL) - continue; + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT); + if (size <= offsetof(struct ldlm_request, lock_handle) || + (size - offsetof(struct ldlm_request, lock_handle)) / + sizeof(struct lustre_handle) < dlm_req->lock_count) + RETURN(-EPROTO); + + for (i = 0; i < dlm_req->lock_count; i++) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); + if (lock == NULL) + continue; rc = ldlm_is_ast_sent(lock) ? 1 : 0; - if (rc) - LDLM_DEBUG(lock, "hpreq cancel lock"); - LDLM_LOCK_PUT(lock); + if (rc) + LDLM_DEBUG(lock, "hpreq cancel/convert lock"); + LDLM_LOCK_PUT(lock); - if (rc) - break; - } + if (rc) + break; + } - RETURN(rc); + RETURN(rc); } static struct ptlrpc_hpreq_ops ldlm_cancel_hpreq_ops = { - .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, + .hpreq_lock_match = ldlm_cancel_hpreq_lock_match, .hpreq_check = ldlm_cancel_hpreq_check, .hpreq_fini = NULL, }; static int ldlm_hpreq_handler(struct ptlrpc_request *req) { - ENTRY; + ENTRY; - req_capsule_init(&req->rq_pill, req, RCL_SERVER); + req_capsule_init(&req->rq_pill, req, RCL_SERVER); - if (req->rq_export == NULL) - RETURN(0); + if (req->rq_export == NULL) + RETURN(0); - if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { - req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); - req->rq_ops = &ldlm_cancel_hpreq_ops; - } - RETURN(0); + if (LDLM_CANCEL == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } else if (LDLM_CONVERT == lustre_msg_get_opc(req->rq_reqmsg)) { + req_capsule_set(&req->rq_pill, &RQF_LDLM_CONVERT); + req->rq_ops = &ldlm_cancel_hpreq_ops; + } + RETURN(0); } static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, @@ -2491,10 +2530,10 @@ static int ldlm_revoke_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, lock_res_and_lock(lock); - if (lock->l_req_mode != lock->l_granted_mode) { - unlock_res_and_lock(lock); - return 0; - } + if (!ldlm_is_granted(lock)) { + unlock_res_and_lock(lock); + return 0; + } LASSERT(lock->l_resource); if (lock->l_resource->lr_type != LDLM_IBITS && @@ -2726,9 +2765,22 @@ static int ldlm_bl_thread_exports(struct ldlm_bl_pool *blp, */ static int ldlm_bl_thread_main(void *arg) { - struct ldlm_bl_pool *blp; + struct lu_env *env; + struct ldlm_bl_pool *blp; struct ldlm_bl_thread_data *bltd = arg; - ENTRY; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(env); + if (!env) + RETURN(-ENOMEM); + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out_env, rc); + rc = lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc); blp = bltd->bltd_blp; @@ -2772,7 +2824,13 @@ static int ldlm_bl_thread_main(void *arg) atomic_dec(&blp->blp_num_threads); complete(&blp->blp_comp); - RETURN(0); + + lu_env_remove(env); +out_env_fini: + lu_env_fini(env); +out_env: + OBD_FREE_PTR(env); + RETURN(rc); } @@ -2973,7 +3031,7 @@ static int ldlm_setup(void) if (ldlm_state == NULL) RETURN(-ENOMEM); - ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj); + ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj); if (!ldlm_kobj) 
GOTO(out, -ENOMEM); @@ -2989,11 +3047,9 @@ static int ldlm_setup(void) if (!ldlm_svc_kset) GOTO(out, -ENOMEM); -#ifdef CONFIG_PROC_FS - rc = ldlm_proc_setup(); + rc = ldlm_debugfs_setup(); if (rc != 0) GOTO(out, rc); -#endif /* CONFIG_PROC_FS */ memset(&conf, 0, sizeof(conf)); conf = (typeof(conf)) { @@ -3014,18 +3070,20 @@ static int ldlm_setup(void) .tc_nthrs_base = LDLM_NTHRS_BASE, .tc_nthrs_max = LDLM_NTHRS_MAX, .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_affinity = 1, + .tc_cpu_bind = ldlm_cpu_bind, .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, }, .psc_cpt = { .cc_pattern = ldlm_cpts, + .cc_affinity = true, }, .psc_ops = { .so_req_handler = ldlm_callback_handler, }, }; ldlm_state->ldlm_cb_service = \ - ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); if (IS_ERR(ldlm_state->ldlm_cb_service)) { CERROR("failed to start service\n"); rc = PTR_ERR(ldlm_state->ldlm_cb_service); @@ -3054,13 +3112,14 @@ static int ldlm_setup(void) .tc_nthrs_base = LDLM_NTHRS_BASE, .tc_nthrs_max = LDLM_NTHRS_MAX, .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_affinity = 1, + .tc_cpu_bind = ldlm_cpu_bind, .tc_ctx_tags = LCT_MD_THREAD | \ LCT_DT_THREAD | \ LCT_CL_THREAD, }, .psc_cpt = { .cc_pattern = ldlm_cpts, + .cc_affinity = true, }, .psc_ops = { .so_req_handler = ldlm_cancel_handler, @@ -3068,7 +3127,8 @@ static int ldlm_setup(void) }, }; ldlm_state->ldlm_cancel_service = \ - ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + ptlrpc_register_service(&conf, ldlm_svc_kset, + ldlm_svc_debugfs_dir); if (IS_ERR(ldlm_state->ldlm_cancel_service)) { CERROR("failed to start service\n"); rc = PTR_ERR(ldlm_state->ldlm_cancel_service); @@ -3179,10 +3239,12 @@ static int ldlm_cleanup(void) kset_unregister(ldlm_ns_kset); if (ldlm_svc_kset) kset_unregister(ldlm_svc_kset); - if (ldlm_kobj) + if (ldlm_kobj) { + sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); kobject_put(ldlm_kobj); + } - ldlm_proc_cleanup(); + ldlm_debugfs_cleanup(); #ifdef HAVE_SERVER_SUPPORT if (expired_lock_thread_state != ELT_STOPPED) { @@ -3209,7 +3271,7 @@ int ldlm_init(void) ldlm_lock_slab = kmem_cache_create("ldlm_locks", sizeof(struct ldlm_lock), 0, - SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (ldlm_lock_slab == NULL) goto out_resource; @@ -3225,11 +3287,30 @@ int ldlm_init(void) if (ldlm_interval_tree_slab == NULL) goto out_interval; +#ifdef HAVE_SERVER_SUPPORT + ldlm_inodebits_slab = kmem_cache_create("ldlm_ibits_node", + sizeof(struct ldlm_ibits_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_inodebits_slab == NULL) + goto out_interval_tree; + + ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem", + sizeof(struct ldlm_glimpse_work), + 0, 0, NULL); + if (ldlm_glimpse_work_kmem == NULL) + goto out_inodebits; +#endif + #if LUSTRE_TRACKS_LOCK_EXP_REFS class_export_dump_hook = ldlm_dump_export_locks; #endif return 0; - +#ifdef HAVE_SERVER_SUPPORT +out_inodebits: + kmem_cache_destroy(ldlm_inodebits_slab); +out_interval_tree: + kmem_cache_destroy(ldlm_interval_tree_slab); +#endif out_interval: kmem_cache_destroy(ldlm_interval_slab); out_lock: @@ -3245,11 +3326,17 @@ void ldlm_exit(void) if (ldlm_refcount) CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); kmem_cache_destroy(ldlm_resource_slab); - /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call - * synchronize_rcu() to wait a grace period elapsed, so that - * ldlm_lock_free() get a chance to be called. 
*/ - synchronize_rcu(); + /* + * ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * rcu_barrier() to wait all outstanding RCU callbacks to complete, + * so that ldlm_lock_free() get a chance to be called. + */ + rcu_barrier(); kmem_cache_destroy(ldlm_lock_slab); kmem_cache_destroy(ldlm_interval_slab); kmem_cache_destroy(ldlm_interval_tree_slab); +#ifdef HAVE_SERVER_SUPPORT + kmem_cache_destroy(ldlm_inodebits_slab); + kmem_cache_destroy(ldlm_glimpse_work_kmem); +#endif } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c index 6453cabf1921f..6407fd20884f8 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_plain.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -129,14 +129,14 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, enum ldlm_error *err, struct list_head *work_list) { struct ldlm_resource *res = lock->l_resource; - struct list_head rpc_list; + struct list_head *grant_work = intention == LDLM_PROCESS_ENQUEUE ? + NULL : work_list; int rc; ENTRY; - LASSERT(lock->l_granted_mode != lock->l_req_mode); + LASSERT(!ldlm_is_granted(lock)); check_res_locked(res); - LASSERT(list_empty(&res->lr_converting)); - INIT_LIST_HEAD(&rpc_list); + *err = ELDLM_OK; if (intention == LDLM_PROCESS_RESCAN) { LASSERT(work_list != NULL); @@ -148,31 +148,19 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, __u64 *flags, RETURN(LDLM_ITER_STOP); ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); + ldlm_grant_lock(lock, grant_work); RETURN(LDLM_ITER_CONTINUE); } - LASSERT((intention == LDLM_PROCESS_ENQUEUE && work_list == NULL) || - (intention == LDLM_PROCESS_RECOVERY && work_list != NULL)); - restart: - rc = ldlm_plain_compat_queue(&res->lr_granted, lock, &rpc_list); - rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, &rpc_list); - - if (rc != 2) { - rc = ldlm_handle_conflict_lock(lock, flags, &rpc_list, 0); - if (rc == -ERESTART) - GOTO(restart, rc); - *err = rc; - } else { + rc = ldlm_plain_compat_queue(&res->lr_granted, lock, work_list); + rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, work_list); + + if (rc == 2) { ldlm_resource_unlink_lock(lock); - ldlm_grant_lock(lock, work_list); - rc = 0; + ldlm_grant_lock(lock, grant_work); } - if (!list_empty(&rpc_list)) - ldlm_discard_bl_list(&rpc_list); - - RETURN(rc); + RETURN(LDLM_ITER_CONTINUE); } #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c index 2afed77ea5f70..0a423d5615b5b 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_pool.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -93,7 +93,8 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include +#include +#include #include #include #include @@ -497,22 +498,14 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) ldlm_cli_pool_pop_slv(pl); spin_unlock(&pl->pl_lock); - /* - * Do not cancel locks in case lru resize is disabled for this ns. - */ - if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) - GOTO(out, ret = 0); - /* * In the time of canceling locks on client we do not need to maintain * sharp timing, we only want to cancel locks asap according to new SLV. * It may be called when SLV has changed much, this is why we do not * take into account pl->pl_recalc_time here. */ - ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, - LDLM_LRU_FLAG_LRUR); + ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, 0); -out: spin_lock(&pl->pl_lock); /* * Time of LRU resizing might be longer than period, @@ -556,7 +549,7 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, if (nr == 0) return (unused / 100) * sysctl_vfs_cache_pressure; else - return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_LRU_FLAG_SHRINK); + return ldlm_cancel_lru(ns, nr, LCF_ASYNC, 0); } static struct ldlm_pool_ops ldlm_srv_pool_ops = { @@ -574,7 +567,7 @@ static struct ldlm_pool_ops ldlm_cli_pool_ops = { * Pool recalc wrapper. Will call either client or server pool recalc callback * depending what pool \a pl is used. */ -int ldlm_pool_recalc(struct ldlm_pool *pl) +time64_t ldlm_pool_recalc(struct ldlm_pool *pl) { time64_t recalc_interval_sec; int count; @@ -694,7 +687,8 @@ static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) granted, limit); return 0; } -LPROC_SEQ_FOPS_RO(lprocfs_pool_state); + +LDEBUGFS_SEQ_FOPS_RO(lprocfs_pool_state); static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -778,11 +772,11 @@ static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) return err; } -static int ldlm_pool_proc_init(struct ldlm_pool *pl) +static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) { struct ldlm_namespace *ns = ldlm_pl2ns(pl); - struct proc_dir_entry *parent_ns_proc; - struct lprocfs_vars pool_vars[2]; + struct dentry *debugfs_ns_parent; + struct ldebugfs_vars pool_vars[2]; char *var_name = NULL; int rc = 0; ENTRY; @@ -791,18 +785,18 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) if (!var_name) RETURN(-ENOMEM); - parent_ns_proc = ns->ns_proc_dir_entry; - if (parent_ns_proc == NULL) { - CERROR("%s: proc entry is not initialized\n", + debugfs_ns_parent = ns->ns_debugfs_entry; + if (IS_ERR_OR_NULL(debugfs_ns_parent)) { + CERROR("%s: debugfs entry is not initialized\n", ldlm_ns_name(ns)); GOTO(out_free_name, rc = -EINVAL); } - pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, - NULL, NULL); - if (IS_ERR(pl->pl_proc_dir)) { - rc = PTR_ERR(pl->pl_proc_dir); - pl->pl_proc_dir = NULL; - CERROR("%s: cannot create 'pool' proc entry: rc = %d\n", + pl->pl_debugfs_entry = ldebugfs_register("pool", debugfs_ns_parent, + NULL, NULL); + if (IS_ERR(pl->pl_debugfs_entry)) { + rc = PTR_ERR(pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; + CERROR("%s: cannot create 'pool' debugfs entry: rc = %d\n", ldlm_ns_name(ns), rc); GOTO(out_free_name, rc); } @@ -811,7 +805,7 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) memset(pool_vars, 0, sizeof(pool_vars)); pool_vars[0].name = var_name; - ldlm_add_var(&pool_vars[0], pl->pl_proc_dir, "state", pl, + ldlm_add_var(&pool_vars[0], pl->pl_debugfs_entry, "state", pl, &lprocfs_pool_state_fops); 
pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - @@ -852,7 +846,8 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "recalc_timing", "sec"); - rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); + rc = ldebugfs_register_stats(pl->pl_debugfs_entry, "stats", + pl->pl_stats); EXIT; out_free_name: @@ -866,15 +861,15 @@ static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) wait_for_completion(&pl->pl_kobj_unregister); } -static void ldlm_pool_proc_fini(struct ldlm_pool *pl) +static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) { if (pl->pl_stats != NULL) { lprocfs_free_stats(&pl->pl_stats); pl->pl_stats = NULL; } - if (pl->pl_proc_dir != NULL) { - lprocfs_remove(&pl->pl_proc_dir); - pl->pl_proc_dir = NULL; + if (pl->pl_debugfs_entry != NULL) { + ldebugfs_remove(&pl->pl_debugfs_entry); + pl->pl_debugfs_entry = NULL; } } @@ -908,7 +903,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; } pl->pl_client_lock_volume = 0; - rc = ldlm_pool_proc_init(pl); + rc = ldlm_pool_debugfs_init(pl); if (rc) RETURN(rc); @@ -925,7 +920,7 @@ void ldlm_pool_fini(struct ldlm_pool *pl) { ENTRY; ldlm_pool_sysfs_fini(pl); - ldlm_pool_proc_fini(pl); + ldlm_pool_debugfs_fini(pl); /* * Pool should not be used after this point. We can't free it here as @@ -1070,10 +1065,8 @@ __u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) return atomic_read(&pl->pl_lock_volume_factor); } -static struct ptlrpc_thread *ldlm_pools_thread; static struct shrinker *ldlm_pools_srv_shrinker; static struct shrinker *ldlm_pools_cli_shrinker; -static struct completion ldlm_pools_comp; /* * count locks from all namespaces (if possible). Returns number of @@ -1241,108 +1234,35 @@ static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) #endif /* HAVE_SHRINKER_COUNT */ -int ldlm_pools_recalc(enum ldlm_side client) +static time64_t ldlm_pools_recalc_delay(enum ldlm_side side) { - unsigned long nr_l = 0, nr_p = 0, l; struct ldlm_namespace *ns; struct ldlm_namespace *ns_old = NULL; - int nr, equal = 0; /* seconds of sleep if no active namespaces */ - int time = client ? LDLM_POOL_CLI_DEF_RECALC_PERIOD : - LDLM_POOL_SRV_DEF_RECALC_PERIOD; - - /* - * No need to setup pool limit for client pools. - */ - if (client == LDLM_NAMESPACE_SERVER) { + time64_t delay = side == LDLM_NAMESPACE_SERVER ? + LDLM_POOL_SRV_DEF_RECALC_PERIOD : + LDLM_POOL_CLI_DEF_RECALC_PERIOD; + int nr; + + /* Recalc at least ldlm_namespace_nr(side) namespaces. */ + for (nr = ldlm_namespace_nr_read(side); nr > 0; nr--) { + int skip; /* - * Check all modest namespaces first. + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock, which is really good as we + * get rid of potential deadlock on side nodes when canceling + * locks synchronously. */ - mutex_lock(ldlm_namespace_lock(client)); - list_for_each_entry(ns, ldlm_namespace_list(client), - ns_list_chain) - { - if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) - continue; - - l = ldlm_pool_granted(&ns->ns_pool); - if (l == 0) - l = 1; - - /* - * Set the modest pools limit equal to their avg granted - * locks + ~6%. - */ - l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); - ldlm_pool_setup(&ns->ns_pool, l); - nr_l += l; - nr_p++; - } - - /* - * Make sure that modest namespaces did not eat more that 2/3 - * of limit. 
- */ - if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { - CWARN("\"Modest\" pools eat out 2/3 of server locks " - "limit (%lu of %lu). This means that you have too " - "many clients for this amount of server RAM. " - "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); - equal = 1; - } - - /* - * The rest is given to greedy namespaces. - */ - list_for_each_entry(ns, ldlm_namespace_list(client), - ns_list_chain) - { - if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) - continue; - - if (equal) { - /* - * In the case 2/3 locks are eaten out by - * modest pools, we re-setup equal limit - * for _all_ pools. - */ - l = LDLM_POOL_HOST_L / - ldlm_namespace_nr_read(client); - } else { - /* - * All the rest of greedy pools will have - * all locks in equal parts. - */ - l = (LDLM_POOL_HOST_L - nr_l) / - (ldlm_namespace_nr_read(client) - - nr_p); - } - ldlm_pool_setup(&ns->ns_pool, l); - } - mutex_unlock(ldlm_namespace_lock(client)); - } - - /* - * Recalc at least ldlm_namespace_nr(client) namespaces. - */ - for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) { - int skip; - /* - * Lock the list, get first @ns in the list, getref, move it - * to the tail, unlock and call pool recalc. This way we avoid - * calling recalc under @ns lock what is really good as we get - * rid of potential deadlock on client nodes when canceling - * locks synchronously. - */ - mutex_lock(ldlm_namespace_lock(client)); - if (list_empty(ldlm_namespace_list(client))) { - mutex_unlock(ldlm_namespace_lock(client)); + mutex_lock(ldlm_namespace_lock(side)); + if (list_empty(ldlm_namespace_list(side))) { + mutex_unlock(ldlm_namespace_lock(side)); break; } - ns = ldlm_namespace_first_locked(client); + ns = ldlm_namespace_first_locked(side); if (ns_old == ns) { /* Full pass complete */ - mutex_unlock(ldlm_namespace_lock(client)); + mutex_unlock(ldlm_namespace_lock(side)); break; } @@ -1357,8 +1277,8 @@ int ldlm_pools_recalc(enum ldlm_side client) * there). */ if (ldlm_ns_empty(ns)) { - ldlm_namespace_move_to_inactive_locked(ns, client); - mutex_unlock(ldlm_namespace_lock(client)); + ldlm_namespace_move_to_inactive_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); continue; } @@ -1378,144 +1298,118 @@ int ldlm_pools_recalc(enum ldlm_side client) } spin_unlock(&ns->ns_lock); - ldlm_namespace_move_to_active_locked(ns, client); - mutex_unlock(ldlm_namespace_lock(client)); + ldlm_namespace_move_to_active_locked(ns, side); + mutex_unlock(ldlm_namespace_lock(side)); /* * After setup is done - recalc the pool. */ if (!skip) { - int ttime = ldlm_pool_recalc(&ns->ns_pool); - - if (ttime < time) - time = ttime; - + delay = min(delay, ldlm_pool_recalc(&ns->ns_pool)); ldlm_namespace_put(ns); } - } - - /* Wake up the blocking threads from time to time. 
*/ - ldlm_bl_thread_wakeup(); + } - return time; + return delay; } -static int ldlm_pools_thread_main(void *arg) -{ - struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; - int s_time, c_time; - ENTRY; +static void ldlm_pools_recalc_task(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ldlm_pools_recalc_work, ldlm_pools_recalc_task); - thread_set_flags(thread, SVC_RUNNING); - wake_up(&thread->t_ctl_waitq); - - CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", - "ldlm_poold", current_pid()); +static void ldlm_pools_recalc_task(struct work_struct *ws) +{ + /* seconds of sleep if no active namespaces */ + time64_t delay; +#ifdef HAVE_SERVER_SUPPORT + struct ldlm_namespace *ns; + unsigned long nr_l = 0, nr_p = 0, l; + int equal = 0; - while (1) { - struct l_wait_info lwi; + /* Check all modest namespaces first. */ + mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; - /* - * Recal all pools on this tick. - */ - s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); - c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; /* - * Wait until the next check time, or until we're - * stopped. + * Set the modest pools limit equal to their avg granted + * locks + ~6%. */ - lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopping(thread) || - thread_is_event(thread), - &lwi); - - if (thread_test_and_clear_flags(thread, SVC_STOPPING)) - break; - else - thread_test_and_clear_flags(thread, SVC_EVENT); - } - - thread_set_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - - CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", - "ldlm_poold", current_pid()); - - complete_and_exit(&ldlm_pools_comp, 0); -} - -static int ldlm_pools_thread_start(void) -{ - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - ENTRY; - - if (ldlm_pools_thread != NULL) - RETURN(-EALREADY); - - OBD_ALLOC_PTR(ldlm_pools_thread); - if (ldlm_pools_thread == NULL) - RETURN(-ENOMEM); - - init_completion(&ldlm_pools_comp); - init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } - task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, - "ldlm_poold"); - if (IS_ERR(task)) { - CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); - OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); - ldlm_pools_thread = NULL; - RETURN(PTR_ERR(task)); + /* + * Make sure than modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("'Modest' pools eat out 2/3 of server locks " + "limit (%lu of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; } - l_wait_event(ldlm_pools_thread->t_ctl_waitq, - thread_is_running(ldlm_pools_thread), &lwi); - RETURN(0); -} -static void ldlm_pools_thread_stop(void) -{ - ENTRY; + /* The rest is given to greedy namespaces. 
*/ + list_for_each_entry(ns, ldlm_namespace_list(LDLM_NAMESPACE_SERVER), + ns_list_chain) { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; - if (ldlm_pools_thread == NULL) { - EXIT; - return; + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (ldlm_namespace_nr_read(LDLM_NAMESPACE_SERVER) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); } + mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); - thread_set_flags(ldlm_pools_thread, SVC_STOPPING); - wake_up(&ldlm_pools_thread->t_ctl_waitq); + delay = min(ldlm_pools_recalc_delay(LDLM_NAMESPACE_SERVER), + ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT)); +#else /* !HAVE_SERVER_SUPPORT */ + delay = ldlm_pools_recalc_delay(LDLM_NAMESPACE_CLIENT); +#endif /* HAVE_SERVER_SUPPORT */ - /* - * Make sure that pools thread is finished before freeing @thread. - * This fixes possible race and oops due to accessing freed memory - * in pools thread. - */ - wait_for_completion(&ldlm_pools_comp); - OBD_FREE_PTR(ldlm_pools_thread); - ldlm_pools_thread = NULL; - EXIT; + /* Wake up the blocking threads from time to time. */ + ldlm_bl_thread_wakeup(); + + schedule_delayed_work(&ldlm_pools_recalc_work, cfs_time_seconds(delay)); } int ldlm_pools_init(void) { - int rc; DEF_SHRINKER_VAR(shsvar, ldlm_pools_srv_shrink, ldlm_pools_srv_count, ldlm_pools_srv_scan); DEF_SHRINKER_VAR(shcvar, ldlm_pools_cli_shrink, ldlm_pools_cli_count, ldlm_pools_cli_scan); - ENTRY; - rc = ldlm_pools_thread_start(); - if (rc == 0) { - ldlm_pools_srv_shrinker = - set_shrinker(DEFAULT_SEEKS, &shsvar); - ldlm_pools_cli_shrinker = - set_shrinker(DEFAULT_SEEKS, &shcvar); - } - RETURN(rc); + schedule_delayed_work(&ldlm_pools_recalc_work, + LDLM_POOL_CLI_DEF_RECALC_PERIOD); + ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS, &shsvar); + ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS, &shcvar); + + return 0; } void ldlm_pools_fini(void) @@ -1528,7 +1422,7 @@ void ldlm_pools_fini(void) remove_shrinker(ldlm_pools_cli_shrinker); ldlm_pools_cli_shrinker = NULL; } - ldlm_pools_thread_stop(); + cancel_delayed_work_sync(&ldlm_pools_recalc_work); } #else /* !HAVE_LRU_RESIZE_SUPPORT */ @@ -1537,7 +1431,7 @@ int ldlm_pool_setup(struct ldlm_pool *pl, int limit) return 0; } -int ldlm_pool_recalc(struct ldlm_pool *pl) +time64_t ldlm_pool_recalc(struct ldlm_pool *pl) { return 0; } @@ -1614,8 +1508,4 @@ void ldlm_pools_fini(void) return; } -int ldlm_pools_recalc(enum ldlm_side client) -{ - return 0; -} #endif /* HAVE_LRU_RESIZE_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c index d15cff5fb27b6..f16aaa954a54c 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,8 +57,7 @@ #define DEBUG_SUBSYSTEM S_LDLM -#include - +#include #include #include #include @@ -68,6 +67,7 @@ unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; module_param(ldlm_enqueue_min, uint, 0644); MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); /* in client side, whether the cached locks will be canceled before replay */ unsigned int ldlm_cancel_unused_locks_before_replay = 1; @@ -121,16 +121,16 @@ int ldlm_expired_completion_wait(void *data) ENTRY; if (lock->l_conn_export == NULL) { - static cfs_time_t next_dump = 0, last_dump = 0; + static time64_t next_dump, last_dump; LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); " "not entering recovery in server code, just going back to sleep", (s64)lock->l_activity, (s64)(ktime_get_real_seconds() - lock->l_activity)); - if (cfs_time_after(cfs_time_current(), next_dump)) { + if (ktime_get_seconds() > next_dump) { last_dump = next_dump; - next_dump = cfs_time_shift(300); + next_dump = ktime_get_seconds() + 300; ldlm_namespace_dump(D_DLMTRACE, ldlm_lock_to_ns(lock)); if (last_dump == 0) @@ -150,6 +150,19 @@ int ldlm_expired_completion_wait(void *data) RETURN(0); } +int is_granted_or_cancelled_nolock(struct ldlm_lock *lock) +{ + int ret = 0; + + check_res_locked(lock->l_resource); + if (ldlm_is_granted(lock) && !ldlm_is_cp_reqd(lock)) + ret = 1; + else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) + ret = 1; + return ret; +} +EXPORT_SYMBOL(is_granted_or_cancelled_nolock); + /** * Calculate the Completion timeout (covering enqueue, BL AST, data flush, * lock cancel, and their replies). Used for lock completion timeout on the @@ -162,9 +175,9 @@ int ldlm_expired_completion_wait(void *data) /* We use the same basis for both server side and client side functions from a single node. */ -static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) +static time64_t ldlm_cp_timeout(struct ldlm_lock *lock) { - unsigned int timeout; + time64_t timeout; if (AT_OFF) return obd_timeout; @@ -173,7 +186,7 @@ static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) * lock from another client. Server will evict the other client if it * doesn't respond reasonably, and then give us the lock. */ timeout = at_get(ldlm_lock_to_ns_at(lock)); - return max(3 * timeout, ldlm_enqueue_min); + return max(3 * timeout, (time64_t) ldlm_enqueue_min); } /** @@ -221,9 +234,9 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) RETURN(ldlm_completion_tail(lock, data)); } - LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " - "going forward"); - ldlm_reprocess_all(lock->l_resource); + LDLM_DEBUG(lock, + "client-side enqueue returned a blocked lock, going forward"); + ldlm_reprocess_all(lock->l_resource, NULL); RETURN(0); } EXPORT_SYMBOL(ldlm_completion_ast_async); @@ -243,8 +256,6 @@ EXPORT_SYMBOL(ldlm_completion_ast_async); * * - to force all locks when resource is destroyed (cleanup_resource()); * - * - during lock conversion (not used currently). - * * If lock is not granted in the first case, this function waits until second * or penultimate cases happen in some other thread. 
* @@ -256,7 +267,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) struct obd_device *obd; struct obd_import *imp = NULL; struct l_wait_info lwi; - __u32 timeout; + time64_t timeout; int rc = 0; ENTRY; @@ -285,7 +296,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) timeout = ldlm_cp_timeout(lock); lwd.lwd_lock = lock; - lock->l_activity = cfs_time_current_sec(); + lock->l_activity = ktime_get_real_seconds(); if (ldlm_is_no_timeout(lock)) { LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); @@ -434,7 +445,8 @@ int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) /** * Enqueue a local lock (typically on a server). */ -int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, +int ldlm_cli_enqueue_local(const struct lu_env *env, + struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_type type, union ldlm_policy_data *policy, enum ldlm_mode mode, __u64 *flags, @@ -467,6 +479,7 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, err = ldlm_lvbo_init(lock->l_resource); if (err < 0) { LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err); + ldlm_lock_destroy_nolock(lock); GOTO(out, err); } @@ -491,15 +504,15 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, lock->l_req_extent = policy->l_extent; } - err = ldlm_lock_enqueue(ns, &lock, policy, flags); - if (unlikely(err != ELDLM_OK)) - GOTO(out, err); + err = ldlm_lock_enqueue(env, ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); - if (policy != NULL) - *policy = lock->l_policy_data; + if (policy != NULL) + *policy = lock->l_policy_data; - if (lock->l_completion_ast) - lock->l_completion_ast(lock, *flags, NULL); + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); EXIT; @@ -517,9 +530,8 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns, /* Set a flag to prevent us from sending a CANCEL (bug 407) */ lock_res_and_lock(lock); - /* Check that lock is not granted or failed, we might race. */ - if ((lock->l_req_mode != lock->l_granted_mode) && - !ldlm_is_failed(lock)) { + /* Check that lock is not granted or failed, we might race. */ + if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) { /* Make sure that this lock will not be found by raced * bl_ast and -EINVAL reply is sent to server anyways. * b=17645*/ @@ -566,12 +578,16 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, __u32 lvb_len, const struct lustre_handle *lockh, int rc) { - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - int is_replay = *flags & LDLM_FL_REPLAY; - struct ldlm_lock *lock; - struct ldlm_reply *reply; - int cleanup_phase = 1; - ENTRY; + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + const struct lu_env *env = NULL; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + ENTRY; + + if (req && req->rq_svc_thread) + env = req->rq_svc_thread->t_env; lock = ldlm_handle2lock(lockh); /* ldlm_cli_enqueue is holding a reference on this lock. 
*/ @@ -680,26 +696,27 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, &lock->l_policy_data); } - if (type != LDLM_PLAIN) - LDLM_DEBUG(lock,"client-side enqueue, new policy data"); - } + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); + } if ((*flags) & LDLM_FL_AST_SENT) { - lock_res_and_lock(lock); + lock_res_and_lock(lock); + ldlm_bl_desc2lock(&reply->lock_desc, lock); lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; - unlock_res_and_lock(lock); - LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); - } + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } - /* If the lock has already been granted by a completion AST, don't - * clobber the LVB with an older one. */ + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ if (lvb_len > 0) { /* We must lock or a racing completion might update lvb without * letting us know and we'll clobber the correct value. * Cannot unlock after the check either, a that still leaves * a tiny window for completion to get in */ lock_res_and_lock(lock); - if (lock->l_req_mode != lock->l_granted_mode) + if (!ldlm_is_granted(lock)) rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, lock->l_lvb_data, lvb_len); unlock_res_and_lock(lock); @@ -709,16 +726,16 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, } } - if (!is_replay) { - rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); - if (lock->l_completion_ast != NULL) { - int err = lock->l_completion_ast(lock, *flags, NULL); - if (!rc) - rc = err; - if (rc) - cleanup_phase = 1; - } - } + if (!is_replay) { + rc = ldlm_lock_enqueue(env, ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } if (lvb_len > 0 && lvb != NULL) { /* Copy the LVB here, and not earlier, because the completion @@ -790,8 +807,7 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, struct req_capsule *pill = &req->rq_pill; struct ldlm_request *dlm = NULL; struct list_head head = LIST_HEAD_INIT(head); - enum ldlm_lru_flags lru_flags; - int avail, to_free, pack = 0; + int avail, to_free = 0, pack = 0; int rc; ENTRY; @@ -802,10 +818,10 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, req_capsule_filled_sizes(pill, RCL_CLIENT); avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); - lru_flags = LDLM_LRU_FLAG_NO_WAIT | (ns_connect_lru_resize(ns) ? - LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED); - to_free = !ns_connect_lru_resize(ns) && - opc == LDLM_ENQUEUE ? 1 : 0; + /* If we have reached the limit, free +1 slot for the new one */ + if (!ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE && + ns->ns_nr_unused >= ns->ns_max_unused) + to_free = 1; /* Cancel LRU locks here _only_ if the server supports * EARLY_CANCEL. 
Otherwise we have to send extra CANCEL @@ -813,7 +829,7 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, if (avail > count) count += ldlm_cancel_lru_local(ns, cancels, to_free, avail - count, 0, - lru_flags); + LDLM_LRU_FLAG_NO_WAIT); if (avail > count) pack = count; else @@ -927,6 +943,10 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lvb_len, lvb_type); if (IS_ERR(lock)) RETURN(PTR_ERR(lock)); + + if (einfo->ei_cb_created) + einfo->ei_cb_created(lock); + /* for the local lock, add the reference */ ldlm_lock_addref_internal(lock, einfo->ei_mode); ldlm_lock2handle(lock, lockh); @@ -948,7 +968,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lock->l_export = NULL; lock->l_blocking_ast = einfo->ei_cb_bl; lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); - lock->l_activity = cfs_time_current_sec(); + lock->l_activity = ktime_get_real_seconds(); /* lock not sent to server yet */ if (reqp == NULL || *reqp == NULL) { @@ -972,12 +992,42 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); } + if (*flags & LDLM_FL_NDELAY) { + DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n"); + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value and handle ETIMEDOUT + * in osc_lock_upcall() correctly */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } + /* Dump lock data into the request buffer */ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); ldlm_lock2desc(lock, &body->lock_desc); body->lock_flags = ldlm_flags_to_wire(*flags); body->lock_handle[0] = *lockh; + /* extended LDLM opcodes in client stats */ + if (exp->exp_obd->obd_svc_stats != NULL) { + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + + /* OST glimpse has no intent buffer */ + if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT, + RCL_CLIENT)) { + struct ldlm_intent *it; + + it = req_capsule_client_get(&req->rq_pill, + &RMF_LDLM_INTENT); + glimpse = (it && (it->opc == IT_GLIMPSE)); + } + + if (!glimpse) + ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats); + else + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + PTLRPC_LAST_CNTR + + LDLM_GLIMPSE_ENQUEUE); + } + if (async) { LASSERT(reqp != NULL); RETURN(0); @@ -1008,103 +1058,78 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, } EXPORT_SYMBOL(ldlm_cli_enqueue); -static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, - __u32 *flags) +/** + * Client-side IBITS lock convert. + * + * Inform server that lock has been converted instead of canceling. + * Server finishes convert on own side and does reprocess to grant + * all related waiting locks. + * + * Since convert means only ibits downgrading, client doesn't need to + * wait for server reply to finish local converting process so this request + * is made asynchronous. 
+ * + */ +int ldlm_cli_convert_req(struct ldlm_lock *lock, __u32 *flags, __u64 new_bits) { - struct ldlm_resource *res; - int rc; - ENTRY; - if (ns_is_client(ldlm_lock_to_ns(lock))) { - CERROR("Trying to cancel local lock\n"); - LBUG(); - } - LDLM_DEBUG(lock, "client-side local convert"); + struct ldlm_request *body; + struct ptlrpc_request *req; + struct obd_export *exp = lock->l_conn_export; - res = ldlm_lock_convert(lock, new_mode, flags); - if (res) { - ldlm_reprocess_all(res); - rc = 0; - } else { - rc = LUSTRE_EDEADLK; - } - LDLM_DEBUG(lock, "client-side local convert handler END"); - LDLM_LOCK_PUT(lock); - RETURN(rc); -} + ENTRY; -/* FIXME: one of ldlm_cli_convert or the server side should reject attempted - * conversion of locks which are on the waiting or converting queue */ -/* Caller of this code is supposed to take care of lock readers/writers - accounting */ -int ldlm_cli_convert(const struct lustre_handle *lockh, int new_mode, - __u32 *flags) -{ - struct ldlm_request *body; - struct ldlm_reply *reply; - struct ldlm_lock *lock; - struct ldlm_resource *res; - struct ptlrpc_request *req; - int rc; - ENTRY; + LASSERT(exp != NULL); - lock = ldlm_handle2lock(lockh); - if (!lock) { - LBUG(); - RETURN(-EINVAL); - } - *flags = 0; + /* this is better to check earlier and it is done so already, + * but this check is kept too as final one to issue an error + * if any new code will miss such check. + */ + if (!exp_connect_lock_convert(exp)) { + LDLM_ERROR(lock, "server doesn't support lock convert\n"); + RETURN(-EPROTO); + } - if (lock->l_conn_export == NULL) - RETURN(ldlm_cli_convert_local(lock, new_mode, flags)); + if (lock->l_resource->lr_type != LDLM_IBITS) { + LDLM_ERROR(lock, "convert works with IBITS locks only."); + RETURN(-EINVAL); + } - LDLM_DEBUG(lock, "client-side convert"); + LDLM_DEBUG(lock, "client-side convert"); - req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), - &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, - LDLM_CONVERT); - if (req == NULL) { - LDLM_LOCK_PUT(lock); - RETURN(-ENOMEM); - } + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) + RETURN(-ENOMEM); - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - body->lock_handle[0] = lock->l_remote_handle; + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = lock->l_req_mode; + body->lock_desc.l_granted_mode = lock->l_granted_mode; + + body->lock_desc.l_policy_data.l_inodebits.bits = new_bits; + body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0; - body->lock_desc.l_req_mode = new_mode; body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_count = 1; + ptlrpc_request_set_replen(req); - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc != ELDLM_OK) - GOTO(out, rc); + /* + * Use cancel portals for convert as well as high-priority handling. + */ + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; - reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - if (reply == NULL) - GOTO(out, rc = -EPROTO); + ptlrpc_at_set_req_timeout(req); - if (req->rq_status) - GOTO(out, rc = req->rq_status); - - res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); - if (res != NULL) { - ldlm_reprocess_all(res); - /* Go to sleep until the lock is granted. */ - /* FIXME: or cancelled. 
*/ - if (lock->l_completion_ast) { - rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, - NULL); - if (rc) - GOTO(out, rc); - } - } else { - rc = LUSTRE_EDEADLK; - } - EXIT; - out: - LDLM_LOCK_PUT(lock); - ptlrpc_req_finished(req); - return rc; + if (exp->exp_obd->obd_svc_stats != NULL) + lprocfs_counter_incr(exp->exp_obd->obd_svc_stats, + LDLM_CONVERT - LDLM_FIRST_OPC); + + ptlrpcd_add_req(req); + RETURN(0); } /** @@ -1122,9 +1147,12 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) if (lock->l_conn_export) { bool local_only; - LDLM_DEBUG(lock, "client-side cancel"); - /* Set this flag to prevent others from getting new references*/ - lock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); ldlm_set_cbpending(lock); local_only = !!(lock->l_flags & (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); @@ -1133,23 +1161,23 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) LDLM_FL_BL_AST : LDLM_FL_CANCELING; unlock_res_and_lock(lock); - if (local_only) { - CDEBUG(D_DLMTRACE, "not sending request (at caller's " - "instruction)\n"); - rc = LDLM_FL_LOCAL_ONLY; - } - ldlm_lock_cancel(lock); - } else { - if (ns_is_client(ldlm_lock_to_ns(lock))) { - LDLM_ERROR(lock, "Trying to cancel local lock"); - LBUG(); - } - LDLM_DEBUG(lock, "server-side local cancel"); - ldlm_lock_cancel(lock); - ldlm_reprocess_all(lock->l_resource); - } + if (local_only) { + CDEBUG(D_DLMTRACE, + "not sending request (at caller's instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource, lock); + } - RETURN(rc); + RETURN(rc); } /** @@ -1347,6 +1375,27 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) RETURN(0); } +int ldlm_cli_convert(struct ldlm_lock *lock, + enum ldlm_cancel_flags cancel_flags) +{ + int rc = -EINVAL; + + LASSERT(!lock->l_readers && !lock->l_writers); + LDLM_DEBUG(lock, "client lock convert START"); + + if (lock->l_resource->lr_type == LDLM_IBITS) { + lock_res_and_lock(lock); + do { + rc = ldlm_cli_inodebits_convert(lock, cancel_flags); + } while (rc == -EAGAIN); + unlock_res_and_lock(lock); + } + + LDLM_DEBUG(lock, "client lock convert END"); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_convert); + /** * Client side lock cancel. * @@ -1356,12 +1405,12 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, enum ldlm_cancel_flags cancel_flags) { struct obd_export *exp; - enum ldlm_lru_flags lru_flags; int avail, count = 1; __u64 rc = 0; struct ldlm_namespace *ns; struct ldlm_lock *lock; struct list_head cancels = LIST_HEAD_INIT(cancels); + ENTRY; lock = ldlm_handle2lock_long(lockh, 0); @@ -1371,6 +1420,8 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, } lock_res_and_lock(lock); + LASSERT(!ldlm_is_converting(lock)); + /* Lock is being canceled and the caller doesn't want to wait */ if (ldlm_is_canceling(lock)) { if (cancel_flags & LCF_ASYNC) { @@ -1407,10 +1458,8 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh, LASSERT(avail > 0); ns = ldlm_lock_to_ns(lock); - lru_flags = ns_connect_lru_resize(ns) ? 
- LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED; count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, - LCF_BL_AST, lru_flags); + LCF_BL_AST, 0); } ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); RETURN(0); @@ -1473,11 +1522,11 @@ int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, */ static enum ldlm_policy_res ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count) + int added, int min) { enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK; - /* don't check added & count since we want to process all locks + /* don't check @added & @min since we want to process all locks * from unused list. * It's fine to not take lock to access lock->l_resource since * the lock has already been granted so it won't change. */ @@ -1486,7 +1535,7 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, case LDLM_IBITS: if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0) break; - /* Fall through */ + fallthrough; default: result = LDLM_POLICY_SKIP_LOCK; break; @@ -1497,8 +1546,8 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, /** * Callback function for LRU-resize policy. Decides whether to keep - * \a lock in LRU for current \a LRU size \a unused, added in current - * scan \a added and number of locks to be preferably canceled \a count. + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. * * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning * @@ -1506,32 +1555,29 @@ ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, */ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, - int count) + int added, int min) { ktime_t cur = ktime_get(); struct ldlm_pool *pl = &ns->ns_pool; u64 slv, lvf, lv; s64 la; - /* Stop LRU processing when we reach past @count or have checked all - * locks in LRU. */ - if (count && added >= count) - return LDLM_POLICY_KEEP_LOCK; + if (added < min) + return LDLM_POLICY_CANCEL_LOCK; /* Despite of the LV, It doesn't make sense to keep the lock which * is unused for ns_max_age time. */ - if (ktime_after(ktime_get(), - ktime_add(lock->l_last_used, ns->ns_max_age))) + if (ktime_after(cur, ktime_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_CANCEL_LOCK; slv = ldlm_pool_get_slv(pl); lvf = ldlm_pool_get_lvf(pl); - la = ktime_to_ns(ktime_sub(cur, lock->l_last_used)) / NSEC_PER_SEC; - lv = lvf * la * unused; + la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)), + NSEC_PER_SEC); + lv = lvf * la * ns->ns_nr_unused; - /* Inform pool about current CLV to see it via proc. */ + /* Inform pool about current CLV to see it via debugfs. */ ldlm_pool_set_clv(pl, lv); /* Stop when SLV is not yet come from server or lv is smaller than @@ -1545,42 +1591,21 @@ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, static enum ldlm_policy_res ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, - int count) + int added, int min) { enum ldlm_policy_res result; - result = ldlm_cancel_lrur_policy(ns, lock, unused, added, count); + result = ldlm_cancel_lrur_policy(ns, lock, added, min); if (result == LDLM_POLICY_KEEP_LOCK) return result; - return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); -} - -/** - * Callback function for proc used policy. 
Makes decision whether to keep - * \a lock in LRU for current \a LRU size \a unused, added in current scan \a - * added and number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - /* Stop LRU processing when we reach past @count or have checked all - * locks in LRU. */ - return (added >= count) ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; + return ldlm_cancel_no_wait_policy(ns, lock, added, min); } /** - * Callback function for aged policy. Makes decision whether to keep \a lock in - * LRU for current LRU size \a unused, added in current scan \a added and - * number of locks to be preferably canceled \a count. + * Callback function for aged policy. Decides whether to keep + * \a lock in LRU for \a added in current scan and \a min number of locks + * to be preferably canceled. * * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning * @@ -1588,10 +1613,9 @@ static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, */ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, - int count) + int added, int min) { - if ((added >= count) && + if ((added >= min) && ktime_before(ktime_get(), ktime_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_KEEP_LOCK; @@ -1602,76 +1626,43 @@ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, static enum ldlm_policy_res ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count) + int added, int min) { enum ldlm_policy_res result; - result = ldlm_cancel_aged_policy(ns, lock, unused, added, count); + result = ldlm_cancel_aged_policy(ns, lock, added, min); if (result == LDLM_POLICY_KEEP_LOCK) return result; - return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); -} - -/** - * Callback function for default policy. Makes decision whether to keep \a lock - * in LRU for current LRU size \a unused, added in current scan \a added and - * number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static -enum ldlm_policy_res ldlm_cancel_default_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - /* Stop LRU processing when we reach past count or have checked all - * locks in LRU. */ - return (added >= count) ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; + return ldlm_cancel_no_wait_policy(ns, lock, added, min); } typedef enum ldlm_policy_res (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count); + int added, int min); static ldlm_cancel_lru_policy_t ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) { if (ns_connect_lru_resize(ns)) { - if (lru_flags & LDLM_LRU_FLAG_SHRINK) - /* We kill passed number of old locks. 
*/ - return ldlm_cancel_passed_policy; - if (lru_flags & LDLM_LRU_FLAG_LRUR) { - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_lrur_no_wait_policy; - else - return ldlm_cancel_lrur_policy; - } - if (lru_flags & LDLM_LRU_FLAG_PASSED) - return ldlm_cancel_passed_policy; + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_lrur_no_wait_policy; + else + return ldlm_cancel_lrur_policy; } else { - if (lru_flags & LDLM_LRU_FLAG_AGED) { - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_aged_no_wait_policy; - else - return ldlm_cancel_aged_policy; - } + if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) + return ldlm_cancel_aged_no_wait_policy; + else + return ldlm_cancel_aged_policy; } - if (lru_flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_no_wait_policy; - - return ldlm_cancel_default_policy; } /** - * - Free space in LRU for \a count new locks, + * - Free space in LRU for \a min new locks, * redundant unused locks are canceled locally; * - also cancel locally unused aged locks; * - do not cancel more than \a max locks; + * - if some locks are cancelled, try to cancel at least \a batch locks * - GET the found locks and add them into the \a cancels list. * * A client lock can be added to the l_bl_ast list only when it is @@ -1682,30 +1673,22 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags) * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed * later without any special locking. * - * Calling policies for enabled LRU resize: - * ---------------------------------------- - * flags & LDLM_LRU_FLAG_LRUR - use LRU resize policy (SLV from server) to - * cancel not more than \a count locks; - * - * flags & LDLM_LRU_FLAG_PASSED - cancel \a count number of old locks (located - * at the beginning of LRU list); + * Locks are cancelled according to the LRU resize policy (SLV from server) + * if LRU resize is enabled; otherwise, the "aged policy" is used; * - * flags & LDLM_LRU_FLAG_SHRINK - cancel not more than \a count locks according - * to memory pressre policy function; - * - * flags & LDLM_LRU_FLAG_AGED - cancel \a count locks according to "aged policy" + * LRU flags: + * ---------------------------------------- * - * flags & LDLM_LRU_FLAG_NO_WAIT - cancel as many unused locks as possible - * (typically before replaying locks) w/o - * sending any RPCs or waiting for any - * outstanding RPC to complete. + * flags & LDLM_LRU_FLAG_NO_WAIT - cancel locks w/o sending any RPCs or waiting + * for any outstanding RPC to complete. * * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for - * other read locks covering the same pages, just - * discard those pages. + * other read locks covering the same pages, just + * discard those pages. */ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, + struct list_head *cancels, + int min, int max, int batch, enum ldlm_lru_flags lru_flags) { ldlm_cancel_lru_policy_t pf; @@ -1714,8 +1697,26 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, ENTRY; + /* + * Let only 1 thread to proceed. However, not for those which have the + * @max limit given (ELC), as LRU may be left not cleaned up in full. 
+ */ + if (max == 0) { + if (test_and_set_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + } else if (test_bit(LDLM_LRU_CANCEL, &ns->ns_flags)) + RETURN(0); + + LASSERT(ergo(max, min <= max)); + /* No sense to give @batch for ELC */ + LASSERT(ergo(max, batch == 0)); + if (!ns_connect_lru_resize(ns)) - count += ns->ns_nr_unused - ns->ns_max_unused; + min = max_t(int, min, ns->ns_nr_unused - ns->ns_max_unused); + + /* If at least 1 lock is to be cancelled, cancel at least @batch locks */ + if (min && min < batch) + min = batch; pf = ldlm_cancel_lru_policy(ns, lru_flags); LASSERT(pf != NULL); @@ -1768,7 +1769,7 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, * old locks, but additionally choose them by * their weight. Big extent locks will stay in * the cache. */ - result = pf(ns, lock, ns->ns_nr_unused, added, count); + result = pf(ns, lock, added, min); if (result == LDLM_POLICY_KEEP_LOCK) { lu_ref_del(&lock->l_reference, __func__, current); LDLM_LOCK_RELEASE(lock); @@ -1777,7 +1778,6 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, if (result == LDLM_POLICY_SKIP_LOCK) { lu_ref_del(&lock->l_reference, __func__, current); - LDLM_LOCK_RELEASE(lock); if (no_wait) { spin_lock(&ns->ns_lock); if (!list_empty(&lock->l_lru) && @@ -1785,6 +1785,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, ns->ns_last_pos = &lock->l_lru; spin_unlock(&ns->ns_lock); } + + LDLM_LOCK_RELEASE(lock); continue; } @@ -1821,8 +1823,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) && - lock->l_resource->lr_type == LDLM_EXTENT && - lock->l_granted_mode == LCK_PR) + (lock->l_resource->lr_type == LDLM_EXTENT || + ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR) ldlm_set_discard_data(lock); /* We can't re-add to l_lru as it confuses the @@ -1836,18 +1838,25 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, unlock_res_and_lock(lock); lu_ref_del(&lock->l_reference, __FUNCTION__, current); added++; + /* Once a lock added, batch the requested amount */ + if (min == 0) + min = batch; } + + if (max == 0) + clear_bit(LDLM_LRU_CANCEL, &ns->ns_flags); + RETURN(added); } int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, - int count, int max, + int min, int max, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags) { int added; - added = ldlm_prepare_lru_list(ns, cancels, count, max, lru_flags); + added = ldlm_prepare_lru_list(ns, cancels, min, max, 0, lru_flags); if (added <= 0) return added; @@ -1855,14 +1864,14 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, } /** - * Cancel at least \a nr locks from given namespace LRU. + * Cancel at least \a min locks from given namespace LRU. * * When called with LCF_ASYNC the blocking callback will be handled * in a thread and this function will return after the thread has been * asked to call the callback. When called with LCF_ASYNC the blocking * callback will be performed in this function. */ -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int min, enum ldlm_cancel_flags cancel_flags, enum ldlm_lru_flags lru_flags) { @@ -1872,7 +1881,7 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, /* Just prepare the list of locks, do not actually cancel them yet. * Locks are cancelled later in a separate thread. 
*/ - count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, lru_flags); + count = ldlm_prepare_lru_list(ns, &cancels, min, 0, 0, lru_flags); rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); if (rc == 0) RETURN(count); @@ -1894,47 +1903,50 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res, { struct ldlm_lock *lock; int count = 0; + ENTRY; lock_res(res); list_for_each_entry(lock, &res->lr_granted, l_res_link) { - if (opaque != NULL && lock->l_ast_data != opaque) { - LDLM_ERROR(lock, "data %p doesn't match opaque %p", - lock->l_ast_data, opaque); - //LBUG(); - continue; - } + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + continue; + } - if (lock->l_readers || lock->l_writers) - continue; + if (lock->l_readers || lock->l_writers) + continue; - /* If somebody is already doing CANCEL, or blocking AST came, - * skip this lock. */ + /* + * If somebody is already doing CANCEL, or blocking AST came + * then skip this lock. + */ if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) continue; - if (lockmode_compat(lock->l_granted_mode, mode)) - continue; + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; - /* If policy is given and this is IBITS lock, add to list only - * those locks that match by policy. */ - if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && - !(lock->l_policy_data.l_inodebits.bits & - policy->l_inodebits.bits)) - continue; + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. + * Skip locks with DoM bit always to don't flush data. + */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + (!(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits) || ldlm_has_dom(lock))) + continue; /* See CBPENDING comment in ldlm_cancel_lru */ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | lock_flags; - LASSERT(list_empty(&lock->l_bl_ast)); list_add(&lock->l_bl_ast, cancels); - LDLM_LOCK_GET(lock); - count++; - } - unlock_res(res); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); - RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); } EXPORT_SYMBOL(ldlm_cancel_resource_local); @@ -2088,41 +2100,34 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, /* Lock iterators. 
*/ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, - void *closure) + void *closure) { struct list_head *tmp, *next; - struct ldlm_lock *lock; - int rc = LDLM_ITER_CONTINUE; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; - ENTRY; + ENTRY; - if (!res) - RETURN(LDLM_ITER_CONTINUE); + if (!res) + RETURN(LDLM_ITER_CONTINUE); - lock_res(res); + lock_res(res); list_for_each_safe(tmp, next, &res->lr_granted) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } - - list_for_each_safe(tmp, next, &res->lr_converting) { - lock = list_entry(tmp, struct ldlm_lock, l_res_link); - - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } list_for_each_safe(tmp, next, &res->lr_waiting) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (iter(lock, closure) == LDLM_ITER_STOP) - GOTO(out, rc = LDLM_ITER_STOP); - } - out: - unlock_res(res); - RETURN(rc); + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } +out: + unlock_res(res); + RETURN(rc); } struct iter_helper_data { @@ -2216,6 +2221,8 @@ static int replay_lock_interpret(const struct lu_env *env, ENTRY; atomic_dec(&req->rq_import->imp_replay_inflight); + wake_up(&req->rq_import->imp_replay_waitq); + if (rc != ELDLM_OK) GOTO(out, rc); @@ -2281,28 +2288,23 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) RETURN(0); } - /* - * If granted mode matches the requested mode, this lock is granted. - * - * If they differ, but we have a granted mode, then we were granted - * one mode and now want another: ergo, converting. - * - * If we haven't been granted anything and are on a resource list, - * then we're blocked/waiting. - * - * If we haven't been granted anything and we're NOT on a resource list, - * then we haven't got a reply yet and don't have a known disposition. - * This happens whenever a lock enqueue is the request that triggers - * recovery. - */ - if (lock->l_granted_mode == lock->l_req_mode) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; - else if (lock->l_granted_mode) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + /* + * If granted mode matches the requested mode, this lock is granted. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (ldlm_is_granted(lock)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; else if (!list_empty(&lock->l_res_link)) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; - else - flags = LDLM_FL_REPLAY; + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, LDLM_ENQUEUE); @@ -2311,6 +2313,8 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) /* We're part of recovery, so don't wait for it. 
*/ req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + /* If the state changed while we were prepared, don't wait */ + req->rq_no_delay = 1; body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); ldlm_lock2desc(lock, &body->lock_desc); @@ -2369,7 +2373,20 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) canceled, ldlm_ns_name(ns)); } -int ldlm_replay_locks(struct obd_import *imp) +static int lock_can_replay(struct obd_import *imp) +{ + struct client_obd *cli = &imp->imp_obd->u.cli; + + CDEBUG(D_HA, "check lock replay limit, inflights = %u(%u)\n", + atomic_read(&imp->imp_replay_inflight) - 1, + cli->cl_max_rpcs_in_flight); + + /* +1 due to ldlm_lock_replay() increment */ + return atomic_read(&imp->imp_replay_inflight) < + 1 + min_t(u32, cli->cl_max_rpcs_in_flight, 8); +} + +int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) { struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; struct list_head list = LIST_HEAD_INIT(list); @@ -2378,15 +2395,12 @@ int ldlm_replay_locks(struct obd_import *imp) ENTRY; - LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + LASSERT(atomic_read(&imp->imp_replay_inflight) == 1); /* don't replay locks if import failed recovery */ if (imp->imp_vbr_failed) RETURN(0); - /* ensure this doesn't fall to 0 before all have been queued */ - atomic_inc(&imp->imp_replay_inflight); - if (ldlm_cancel_unused_locks_before_replay) ldlm_cancel_unused_locks_for_replay(ns); @@ -2394,15 +2408,64 @@ int ldlm_replay_locks(struct obd_import *imp) list_for_each_entry_safe(lock, next, &list, l_pending_chain) { list_del_init(&lock->l_pending_chain); - if (rc) { + /* If we disconnected in the middle - cleanup and let + * reconnection to happen again. LU-14027 */ + if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) { LDLM_LOCK_RELEASE(lock); - continue; /* or try to do the rest? */ + continue; } rc = replay_one_lock(imp, lock); LDLM_LOCK_RELEASE(lock); + + if (rate_limit) + wait_event_idle_exclusive(imp->imp_replay_waitq, + lock_can_replay(imp)); } + RETURN(rc); +} + +/** + * Lock replay uses rate control and can sleep waiting so + * must be in separate thread from ptlrpcd itself + */ +static int ldlm_lock_replay_thread(void *data) +{ + struct obd_import *imp = data; + + unshare_fs_struct(); + + CDEBUG(D_HA, "lock replay thread %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + __ldlm_replay_locks(imp, true); atomic_dec(&imp->imp_replay_inflight); + ptlrpc_import_recovery_state_machine(imp); + class_import_put(imp); - RETURN(rc); + return 0; +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct task_struct *task; + int rc = 0; + + class_import_get(imp); + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + + task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CDEBUG(D_HA, "can't start lock replay thread: rc = %d\n", rc); + + /* run lock replay without rate control */ + rc = __ldlm_replay_locks(imp, false); + atomic_dec(&imp->imp_replay_inflight); + class_import_put(imp); + } + + return rc; } diff --git a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c index 042633867837b..8b36f70af7f56 100644 --- a/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustrefsx/lustre/ldlm/ldlm_resource.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +43,7 @@ struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; struct kmem_cache *ldlm_interval_tree_slab; +struct kmem_cache *ldlm_inodebits_slab; int ldlm_srv_namespace_nr = 0; int ldlm_cli_namespace_nr = 0; @@ -58,26 +59,45 @@ LIST_HEAD(ldlm_cli_active_namespace_list); /* Client namespaces that don't have any locks in them */ LIST_HEAD(ldlm_cli_inactive_namespace_list); -static struct proc_dir_entry *ldlm_type_proc_dir; -static struct proc_dir_entry *ldlm_ns_proc_dir; -struct proc_dir_entry *ldlm_svc_proc_dir; +static struct dentry *ldlm_debugfs_dir; +static struct dentry *ldlm_ns_debugfs_dir; +struct dentry *ldlm_svc_debugfs_dir; /* during debug dump certain amount of granted locks for one resource to avoid * DDOS. */ static unsigned int ldlm_dump_granted_max = 256; -#ifdef CONFIG_PROC_FS -static ssize_t -lprocfs_dump_ns_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ldebugfs_dump_ns_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); RETURN(count); } -LPROC_SEQ_FOPS_WO_TYPE(ldlm, dump_ns); -LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); +LDEBUGFS_FOPS_WR_ONLY(ldlm, dump_ns); + +static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%u\n", *(unsigned int *)m->private); + return 0; +} + +static ssize_t +ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + + if (!count) + return 0; + + return kstrtouint_from_user(buffer, count, 0, + (unsigned int *)seq->private); +} + +LDEBUGFS_SEQ_FOPS(ldlm_rw_uint); #ifdef HAVE_SERVER_SUPPORT @@ -97,7 +117,7 @@ static ssize_t seq_watermark_write(struct file *file, bool wm_low = (data == &ldlm_reclaim_threshold_mb) ? true : false; int rc; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, &value, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &value, 'M'); if (rc) { CERROR("Failed to set %s, rc = %d.\n", wm_low ? 
"lock_reclaim_threshold_mb" : "lock_limit_mb", @@ -144,7 +164,7 @@ static ssize_t seq_watermark_write(struct file *file, static int seq_watermark_open(struct inode *inode, struct file *file) { - return single_open(file, seq_watermark_show, PDE_DATA(inode)); + return single_open(file, seq_watermark_show, inode->i_private); } static const struct file_operations ldlm_watermark_fops = { @@ -165,7 +185,7 @@ static int seq_granted_show(struct seq_file *m, void *data) static int seq_granted_open(struct inode *inode, struct file *file) { - return single_open(file, seq_granted_show, PDE_DATA(inode)); + return single_open(file, seq_granted_show, inode->i_private); } static const struct file_operations ldlm_granted_fops = { @@ -178,59 +198,62 @@ static const struct file_operations ldlm_granted_fops = { #endif /* HAVE_SERVER_SUPPORT */ -int ldlm_proc_setup(void) -{ - int rc; - struct lprocfs_vars list[] = { - { .name = "dump_namespaces", - .fops = &ldlm_dump_ns_fops, - .proc_mode = 0222 }, - { .name = "dump_granted_max", - .fops = &ldlm_rw_uint_fops, - .data = &ldlm_dump_granted_max }, +static struct ldebugfs_vars ldlm_debugfs_list[] = { + { .name = "dump_namespaces", + .fops = &ldlm_dump_ns_fops, + .proc_mode = 0222 }, + { .name = "dump_granted_max", + .fops = &ldlm_rw_uint_fops, + .data = &ldlm_dump_granted_max }, #ifdef HAVE_SERVER_SUPPORT - { .name = "lock_reclaim_threshold_mb", - .fops = &ldlm_watermark_fops, - .data = &ldlm_reclaim_threshold_mb }, - { .name = "lock_limit_mb", - .fops = &ldlm_watermark_fops, - .data = &ldlm_lock_limit_mb }, - { .name = "lock_granted_count", - .fops = &ldlm_granted_fops, - .data = &ldlm_granted_total }, + { .name = "lock_reclaim_threshold_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_reclaim_threshold_mb }, + { .name = "lock_limit_mb", + .fops = &ldlm_watermark_fops, + .data = &ldlm_lock_limit_mb }, + { .name = "lock_granted_count", + .fops = &ldlm_granted_fops, + .data = &ldlm_granted_total }, #endif - { NULL }}; - ENTRY; - LASSERT(ldlm_ns_proc_dir == NULL); + { NULL } +}; - ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, - proc_lustre_root, - NULL, NULL); - if (IS_ERR(ldlm_type_proc_dir)) { - CERROR("LProcFS failed in ldlm-init\n"); - rc = PTR_ERR(ldlm_type_proc_dir); +int ldlm_debugfs_setup(void) +{ + int rc; + + ENTRY; + ldlm_debugfs_dir = ldebugfs_register(OBD_LDLM_DEVICENAME, + debugfs_lustre_root, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_debugfs_dir)) { + CERROR("LDebugFS failed in ldlm-init\n"); + rc = ldlm_debugfs_dir ? PTR_ERR(ldlm_debugfs_dir) : -ENOMEM; GOTO(err, rc); } - ldlm_ns_proc_dir = lprocfs_register("namespaces", - ldlm_type_proc_dir, - NULL, NULL); - if (IS_ERR(ldlm_ns_proc_dir)) { + ldlm_ns_debugfs_dir = ldebugfs_register("namespaces", + ldlm_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) { CERROR("LProcFS failed in ldlm-init\n"); - rc = PTR_ERR(ldlm_ns_proc_dir); + rc = ldlm_ns_debugfs_dir ? PTR_ERR(ldlm_ns_debugfs_dir) + : -ENOMEM; GOTO(err_type, rc); } - ldlm_svc_proc_dir = lprocfs_register("services", - ldlm_type_proc_dir, - NULL, NULL); - if (IS_ERR(ldlm_svc_proc_dir)) { + ldlm_svc_debugfs_dir = ldebugfs_register("services", + ldlm_debugfs_dir, + NULL, NULL); + if (IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) { CERROR("LProcFS failed in ldlm-init\n"); - rc = PTR_ERR(ldlm_svc_proc_dir); + rc = ldlm_svc_debugfs_dir ? 
PTR_ERR(ldlm_svc_debugfs_dir) + : -ENOMEM; GOTO(err_ns, rc); } - rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); + rc = ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); if (rc != 0) { CERROR("LProcFS failed in ldlm-init\n"); GOTO(err_svc, rc); @@ -239,26 +262,32 @@ int ldlm_proc_setup(void) RETURN(0); err_svc: - lprocfs_remove(&ldlm_svc_proc_dir); + ldebugfs_remove(&ldlm_svc_debugfs_dir); err_ns: - lprocfs_remove(&ldlm_ns_proc_dir); + ldebugfs_remove(&ldlm_ns_debugfs_dir); err_type: - lprocfs_remove(&ldlm_type_proc_dir); + ldebugfs_remove(&ldlm_debugfs_dir); err: - ldlm_svc_proc_dir = NULL; - RETURN(rc); + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; + RETURN(rc); } -void ldlm_proc_cleanup(void) +void ldlm_debugfs_cleanup(void) { - if (ldlm_svc_proc_dir) - lprocfs_remove(&ldlm_svc_proc_dir); + if (!IS_ERR_OR_NULL(ldlm_svc_debugfs_dir)) + ldebugfs_remove(&ldlm_svc_debugfs_dir); - if (ldlm_ns_proc_dir) - lprocfs_remove(&ldlm_ns_proc_dir); + if (!IS_ERR_OR_NULL(ldlm_ns_debugfs_dir)) + ldebugfs_remove(&ldlm_ns_debugfs_dir); - if (ldlm_type_proc_dir) - lprocfs_remove(&ldlm_type_proc_dir); + if (!IS_ERR_OR_NULL(ldlm_debugfs_dir)) + ldebugfs_remove(&ldlm_debugfs_dir); + + ldlm_svc_debugfs_dir = NULL; + ldlm_ns_debugfs_dir = NULL; + ldlm_debugfs_dir = NULL; } static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, @@ -326,18 +355,8 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, CDEBUG(D_DLMTRACE, "dropping all unused locks from namespace %s\n", ldlm_ns_name(ns)); - if (ns_connect_lru_resize(ns)) { - /* Try to cancel all @ns_nr_unused locks. */ - ldlm_cancel_lru(ns, ns->ns_nr_unused, 0, - LDLM_LRU_FLAG_PASSED | - LDLM_LRU_FLAG_CLEANUP); - } else { - tmp = ns->ns_max_unused; - ns->ns_max_unused = 0; - ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED | - LDLM_LRU_FLAG_CLEANUP); - ns->ns_max_unused = tmp; - } + /* Try to cancel all @ns_nr_unused locks. */ + ldlm_cancel_lru(ns, INT_MAX, 0, LDLM_LRU_FLAG_CLEANUP); return count; } @@ -360,7 +379,6 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, "changing namespace %s unused locks from %u to %u\n", ldlm_ns_name(ns), ns->ns_nr_unused, (unsigned int)tmp); - ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); if (!lru_resize) { CDEBUG(D_DLMTRACE, @@ -368,13 +386,12 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, ldlm_ns_name(ns)); ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; } + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, 0); } else { CDEBUG(D_DLMTRACE, "changing namespace %s max_unused from %u to %u\n", ldlm_ns_name(ns), ns->ns_max_unused, (unsigned int)tmp); - ns->ns_max_unused = (unsigned int)tmp; - ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); /* Make sure that LRU resize was originally supported before * turning it on here. @@ -386,6 +403,8 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, ldlm_ns_name(ns)); ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; } + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); } return count; @@ -409,7 +428,6 @@ static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, int scale = NSEC_PER_MSEC; unsigned long long tmp; char *buf; - int err; /* Did the user ask in seconds or milliseconds. 
Default is in ms */ buf = strstr(buffer, "ms"); @@ -422,8 +440,7 @@ static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, if (buf) *buf = '\0'; - err = kstrtoull(buffer, 10, &tmp); - if (err != 0) + if (kstrtoull(buffer, 10, &tmp)) return -EINVAL; ns->ns_max_age = ktime_set(0, tmp * scale); @@ -464,6 +481,32 @@ static ssize_t early_lock_cancel_store(struct kobject *kobj, } LUSTRE_RW_ATTR(early_lock_cancel); +static ssize_t dirty_age_limit_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return sprintf(buf, "%llu\n", ns->ns_dirty_age_limit); +} + +static ssize_t dirty_age_limit_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + unsigned long long tmp; + + if (kstrtoull(buffer, 10, &tmp)) + return -EINVAL; + + ns->ns_dirty_age_limit = tmp; + + return count; +} +LUSTRE_RW_ATTR(dirty_age_limit); + #ifdef HAVE_SERVER_SUPPORT static ssize_t ctime_age_limit_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -471,7 +514,7 @@ static ssize_t ctime_age_limit_show(struct kobject *kobj, struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - return sprintf(buf, "%u\n", ns->ns_ctime_age_limit); + return sprintf(buf, "%llu\n", ns->ns_ctime_age_limit); } static ssize_t ctime_age_limit_store(struct kobject *kobj, @@ -480,11 +523,9 @@ static ssize_t ctime_age_limit_store(struct kobject *kobj, { struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - unsigned long tmp; - int err; + unsigned long long tmp; - err = kstrtoul(buffer, 10, &tmp); - if (err != 0) + if (kstrtoull(buffer, 10, &tmp)) return -EINVAL; ns->ns_ctime_age_limit = tmp; @@ -537,7 +578,7 @@ static ssize_t contention_seconds_show(struct kobject *kobj, struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - return sprintf(buf, "%u\n", ns->ns_contention_time); + return sprintf(buf, "%llu\n", ns->ns_contention_time); } static ssize_t contention_seconds_store(struct kobject *kobj, @@ -546,11 +587,9 @@ static ssize_t contention_seconds_store(struct kobject *kobj, { struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, ns_kobj); - unsigned long tmp; - int err; + unsigned long long tmp; - err = kstrtoul(buffer, 10, &tmp); - if (err != 0) + if (kstrtoull(buffer, 10, &tmp)) return -EINVAL; ns->ns_contention_time = tmp; @@ -625,6 +664,7 @@ static struct attribute *ldlm_ns_attrs[] = { &lustre_attr_lru_size.attr, &lustre_attr_lru_max_age.attr, &lustre_attr_early_lock_cancel.attr, + &lustre_attr_dirty_age_limit.attr, #ifdef HAVE_SERVER_SUPPORT &lustre_attr_ctime_age_limit.attr, &lustre_attr_lock_timeouts.attr, @@ -649,13 +689,13 @@ static struct kobj_type ldlm_ns_ktype = { .release = ldlm_ns_release, }; -static void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) +static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) { - if (ns->ns_proc_dir_entry == NULL) + if (IS_ERR_OR_NULL(ns->ns_debugfs_entry)) CERROR("dlm namespace %s has no procfs dir?\n", ldlm_ns_name(ns)); else - lprocfs_remove(&ns->ns_proc_dir_entry); + ldebugfs_remove(&ns->ns_debugfs_entry); if (ns->ns_stats != NULL) lprocfs_free_stats(&ns->ns_stats); @@ -688,31 +728,23 @@ int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) return err; } -static int ldlm_namespace_proc_register(struct ldlm_namespace 
*ns) +static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) { - struct proc_dir_entry *ns_pde; + struct dentry *ns_entry; - LASSERT(ns != NULL); - LASSERT(ns->ns_rs_hash != NULL); - - if (ns->ns_proc_dir_entry != NULL) { - ns_pde = ns->ns_proc_dir_entry; + if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { + ns_entry = ns->ns_debugfs_entry; } else { - ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir); - if (ns_pde == NULL) + ns_entry = debugfs_create_dir(ldlm_ns_name(ns), + ldlm_ns_debugfs_dir); + if (!ns_entry) return -ENOMEM; - ns->ns_proc_dir_entry = ns_pde; + ns->ns_debugfs_entry = ns_entry; } return 0; } #undef MAX_STRING_SIZE -#else /* CONFIG_PROC_FS */ - -#define ldlm_namespace_proc_unregister(ns) ({;}) -#define ldlm_namespace_proc_register(ns) ({0;}) - -#endif /* CONFIG_PROC_FS */ static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) @@ -927,9 +959,12 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, nsb->nsb_reclaim_start = 0; } - ns->ns_obd = obd; - ns->ns_appetite = apt; - ns->ns_client = client; + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + ns->ns_name = kstrdup(name, GFP_KERNEL); + if (!ns->ns_name) + goto out_hash; INIT_LIST_HEAD(&ns->ns_list_chain); INIT_LIST_HEAD(&ns->ns_unused_list); @@ -946,12 +981,14 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_dirty_age_limit = LDLM_DIRTY_AGE_LIMIT; ns->ns_timeouts = 0; ns->ns_orig_connect_flags = 0; ns->ns_connect_flags = 0; ns->ns_stopping = 0; ns->ns_reclaim_start = 0; ns->ns_last_pos = &ns->ns_unused_list; + ns->ns_flags = 0; rc = ldlm_namespace_sysfs_register(ns); if (rc) { @@ -959,7 +996,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, GOTO(out_hash, rc); } - rc = ldlm_namespace_proc_register(ns); + rc = ldlm_namespace_debugfs_register(ns); if (rc) { CERROR("Can't initialize ns proc, rc %d\n", rc); GOTO(out_sysfs, rc); @@ -975,12 +1012,13 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ldlm_namespace_register(ns, client); RETURN(ns); out_proc: - ldlm_namespace_proc_unregister(ns); + ldlm_namespace_debugfs_unregister(ns); out_sysfs: ldlm_namespace_sysfs_unregister(ns); ldlm_namespace_cleanup(ns, 0); out_hash: - cfs_hash_putref(ns->ns_rs_hash); + kfree(ns->ns_name); + cfs_hash_putref(ns->ns_rs_hash); out_ns: OBD_FREE_PTR(ns); out_ref: @@ -1079,14 +1117,13 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) { - struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_resource *res = cfs_hash_object(hs, hnode); __u64 flags = *(__u64 *)arg; - cleanup_resource(res, &res->lr_granted, flags); - cleanup_resource(res, &res->lr_converting, flags); - cleanup_resource(res, &res->lr_waiting, flags); + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_waiting, flags); - return 0; + return 0; } static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, @@ -1100,7 +1137,8 @@ static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, atomic_read(&res->lr_refcount) - 1); - ldlm_resource_dump(D_ERROR, res); + /* Use 
D_NETERROR since it is in the default mask */ + ldlm_resource_dump(D_NETERROR, res); unlock_res(res); return 0; } @@ -1242,12 +1280,14 @@ void ldlm_namespace_free_post(struct ldlm_namespace *ns) * Removing it after @dir may cause oops. */ ldlm_pool_fini(&ns->ns_pool); - ldlm_namespace_proc_unregister(ns); + ldlm_namespace_debugfs_unregister(ns); ldlm_namespace_sysfs_unregister(ns); cfs_hash_putref(ns->ns_rs_hash); + kfree(ns->ns_name); /* Namespace \a ns should be not on list at this time, otherwise * this will cause issues related to using freed \a ns in poold - * thread. */ + * thread. + */ LASSERT(list_empty(&ns->ns_list_chain)); OBD_FREE_PTR(ns); ldlm_put_ref(); @@ -1352,33 +1392,62 @@ struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) struct ldlm_namespace, ns_list_chain); } +static bool ldlm_resource_extent_new(struct ldlm_resource *res) +{ + int idx; + + OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + if (res->lr_itree == NULL) + return false; + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + return true; +} + +static bool ldlm_resource_inodebits_new(struct ldlm_resource *res) +{ + int i; + + OBD_ALLOC_PTR(res->lr_ibits_queues); + if (res->lr_ibits_queues == NULL) + return false; + for (i = 0; i < MDS_INODELOCK_NUMBITS; i++) + INIT_LIST_HEAD(&res->lr_ibits_queues->liq_waiting[i]); + return true; +} + /** Create and initialize new resource. */ static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) { struct ldlm_resource *res; - int idx; + bool rc; OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS); if (res == NULL) return NULL; - if (ldlm_type == LDLM_EXTENT) { - OBD_SLAB_ALLOC(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - if (res->lr_itree == NULL) { - OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); - return NULL; - } - /* Initialize interval trees for each lock mode. */ - for (idx = 0; idx < LCK_MODE_NUM; idx++) { - res->lr_itree[idx].lit_size = 0; - res->lr_itree[idx].lit_mode = 1 << idx; - res->lr_itree[idx].lit_root = NULL; - } + switch (ldlm_type) { + case LDLM_EXTENT: + rc = ldlm_resource_extent_new(res); + break; + case LDLM_IBITS: + rc = ldlm_resource_inodebits_new(res); + break; + default: + rc = true; + break; + } + if (!rc) { + OBD_SLAB_FREE_PTR(res, ldlm_resource_slab); + return NULL; } INIT_LIST_HEAD(&res->lr_granted); - INIT_LIST_HEAD(&res->lr_converting); INIT_LIST_HEAD(&res->lr_waiting); atomic_set(&res->lr_refcount, 1); @@ -1393,6 +1462,20 @@ static struct ldlm_resource *ldlm_resource_new(enum ldlm_type ldlm_type) return res; } +static void ldlm_resource_free(struct ldlm_resource *res) +{ + if (res->lr_type == LDLM_EXTENT) { + if (res->lr_itree != NULL) + OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, + sizeof(*res->lr_itree) * LCK_MODE_NUM); + } else if (res->lr_type == LDLM_IBITS) { + if (res->lr_ibits_queues != NULL) + OBD_FREE_PTR(res->lr_ibits_queues); + } + + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); +} + /** * Return a reference to resource with given name, creating it if necessary. * Args: namespace with ns_lock unlocked @@ -1447,10 +1530,7 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); /* Clean lu_ref for failed resource. 
*/ lu_ref_fini(&res->lr_reference); - if (res->lr_itree != NULL) - OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + ldlm_resource_free(res); found: res = hlist_entry(hnode, struct ldlm_resource, lr_hash); return res; @@ -1491,28 +1571,23 @@ struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, struct ldlm_resource *res) { - struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; if (!list_empty(&res->lr_granted)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } - - if (!list_empty(&res->lr_converting)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } if (!list_empty(&res->lr_waiting)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } - cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, - bd, &res->lr_hash); - lu_ref_fini(&res->lr_reference); - if (cfs_hash_bd_count_get(bd) == 0) - ldlm_namespace_put(nsb->nsb_namespace); + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); } /* Returns 1 if the resource was freed, 0 if it remains. */ @@ -1531,10 +1606,7 @@ int ldlm_resource_putref(struct ldlm_resource *res) cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) ns->ns_lvbo->lvbo_free(res); - if (res->lr_itree != NULL) - OBD_SLAB_FREE(res->lr_itree, ldlm_interval_tree_slab, - sizeof(*res->lr_itree) * LCK_MODE_NUM); - OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + ldlm_resource_free(res); return 1; } return 0; @@ -1559,6 +1631,9 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, LASSERT(list_empty(&lock->l_res_link)); list_add_tail(&lock->l_res_link, head); + + if (res->lr_type == LDLM_IBITS) + ldlm_inodebits_add_lock(res, head, lock); } /** @@ -1591,11 +1666,18 @@ void ldlm_resource_unlink_lock(struct ldlm_lock *lock) { int type = lock->l_resource->lr_type; - check_res_locked(lock->l_resource); - if (type == LDLM_IBITS || type == LDLM_PLAIN) - ldlm_unlink_lock_skiplist(lock); - else if (type == LDLM_EXTENT) - ldlm_extent_unlink_lock(lock); + check_res_locked(lock->l_resource); + switch (type) { + case LDLM_PLAIN: + ldlm_unlink_lock_skiplist(lock); + break; + case LDLM_EXTENT: + ldlm_extent_unlink_lock(lock); + break; + case LDLM_IBITS: + ldlm_inodebits_unlink_lock(lock); + break; + } list_del_init(&lock->l_res_link); } EXPORT_SYMBOL(ldlm_resource_unlink_lock); @@ -1655,14 +1737,14 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) ldlm_ns_name(ns), atomic_read(&ns->ns_bref), ns_is_client(ns) ? 
"client" : "server"); - if (cfs_time_before(cfs_time_current(), ns->ns_next_dump)) + if (ktime_get_seconds() < ns->ns_next_dump) return; cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_res_hash_dump, (void *)(unsigned long)level, 0); spin_lock(&ns->ns_lock); - ns->ns_next_dump = cfs_time_shift(10); + ns->ns_next_dump = ktime_get_seconds() + 10; spin_unlock(&ns->ns_lock); } @@ -1695,15 +1777,11 @@ void ldlm_resource_dump(int level, struct ldlm_resource *res) } } } - if (!list_empty(&res->lr_converting)) { - CDEBUG(level, "Converting locks:\n"); - list_for_each_entry(lock, &res->lr_converting, l_res_link) - LDLM_DEBUG_LIMIT(level, lock, "###"); - } + if (!list_empty(&res->lr_waiting)) { - CDEBUG(level, "Waiting locks:\n"); + CDEBUG(level, "Waiting locks:\n"); list_for_each_entry(lock, &res->lr_waiting, l_res_link) - LDLM_DEBUG_LIMIT(level, lock, "###"); - } + LDLM_DEBUG_LIMIT(level, lock, "###"); + } } EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustrefsx/lustre/llite/Makefile b/drivers/staging/lustrefsx/lustre/llite/Makefile index 96430e764665b..19f415face716 100644 --- a/drivers/staging/lustrefsx/lustre/llite/Makefile +++ b/drivers/staging/lustrefsx/lustre/llite/Makefile @@ -7,7 +7,7 @@ lustre-y += rw26.o super25.o statahead.o xattr_security.o lustre-y += glimpse.o lustre-y += lcommon_cl.o lustre-y += lcommon_misc.o -lustre-y += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o +lustre-y += vvp_dev.o vvp_page.o vvp_io.o vvp_object.o lustre-y += range_lock.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c index 6da6b5956ab4e..e0cd72b79e265 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dcache.c +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include -#include #include #include "llite_internal.h" @@ -304,6 +303,8 @@ static int ll_revalidate_dentry(struct dentry *dentry, unsigned int lookup_flags) { struct inode *dir = dentry->d_parent->d_inode; + struct ll_dentry_data *lld = dentry->d_fsdata; + struct ll_sb_info *sbi; /* If this is intermediate component path lookup and we were able to get * to this dentry, then its lock has not been revoked and the @@ -333,6 +334,28 @@ static int ll_revalidate_dentry(struct dentry *dentry, return -ECHILD; #endif + /* + * To support metadata lazy load, we want to bypass negative lookup cache + * on the client. A negative dentry cache is a dentry node that does not + * have an inode associated with it. In these cases, return 0 here + * to force a lookup call to the server. 
+ */ + sbi = ll_s2sbi(dentry->d_sb); + if (d_is_negative(dentry) && + sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) { + LASSERT(lld != NULL); + if (!lld->lld_neg_cache_timeout) + lld->lld_neg_cache_timeout = jiffies + sbi->ll_neg_dentry_timeout * HZ; + + if (time_after(jiffies, lld->lld_neg_cache_timeout)) { + CDEBUG(D_VFSTRACE, + "negative dentry past timeout - flags: %u\n", lookup_flags); + return 0; + } + CDEBUG(D_VFSTRACE, + "negative dentry within timeout - flags: %u\n", lookup_flags); + } + if (dentry_may_statahead(dir, dentry)) ll_statahead(dir, &dentry, dentry->d_inode == NULL); diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c index 6e987fe2f7387..a6200132a22db 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dir.c +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,21 +38,20 @@ #include #include #include +#include #include #ifdef HAVE_UIDGID_HEADER # include #endif -#include +#include #include // for wait_on_buffer #include #define DEBUG_SUBSYSTEM S_LLITE -#include - #include #include -#include +#include #include #include #include @@ -61,6 +60,39 @@ #include "llite_internal.h" +static void ll_check_and_trigger_restore(struct inode *dir) +{ + struct ll_sb_info *sbi = ll_i2sbi(dir); + u32 hus_states; + __u32 gen = 0; + int rc; + + if (!(sbi && (sbi->ll_flags & LL_SBI_MDLL))) + return; + /* + * TODO-MDLL: + * use API that does a cached read instead of + * going to the mdt for getting the hsm state. + * Tracked with Simba-21644 + */ + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, "MDLL Calling ll_layout_restore for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + rc = ll_layout_restore(dir, 0, OBD_OBJECT_EOF); + if (rc) { + CERROR("MDLL ll_layout_restore ("DFID") error rc: %d\n", + PFID(ll_inode2fid(dir)), rc); + } else { + CDEBUG(D_HSM, "MDLL Restore triggered for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + ll_layout_refresh(dir, &gen); + CDEBUG(D_HSM, "MDLL Restore done for dir "DFID"\n", + PFID(ll_inode2fid(dir))); + } + } +} + /* * (new) readdir implementation overview. 
* @@ -149,6 +181,8 @@ struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, struct page *page; int rc; + ll_check_and_trigger_restore(dir); + cb_op.md_blocking_ast = ll_md_blocking_ast; rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); if (rc != 0) @@ -322,6 +356,7 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; int api32 = ll_need_32bit_api(sbi); struct md_op_data *op_data; + struct lu_fid pfid = { 0 }; __u64 pos; int rc; ENTRY; @@ -341,34 +376,36 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) */ GOTO(out, rc = 0); - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, - LUSTRE_OPC_ANY, inode); - if (IS_ERR(op_data)) - GOTO(out, rc = PTR_ERR(op_data)); - - if (unlikely(op_data->op_mea1 != NULL)) { - /* This is only needed for striped dir to fill .., - * see lmv_read_entry */ + if (unlikely(ll_i2info(inode)->lli_lsm_md != NULL)) { + /* + * This is only needed for striped dir to fill .., + * see lmv_read_page() + */ if (file_dentry(filp)->d_parent != NULL && file_dentry(filp)->d_parent->d_inode != NULL) { - __u64 ibits = MDS_INODELOCK_UPDATE; + __u64 ibits = MDS_INODELOCK_LOOKUP; struct inode *parent = file_dentry(filp)->d_parent->d_inode; if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) - op_data->op_fid3 = *ll_inode2fid(parent); + pfid = *ll_inode2fid(parent); } /* If it can not find in cache, do lookup .. on the master * object */ - if (fid_is_zero(&op_data->op_fid3)) { - rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); - if (rc != 0) { - ll_finish_md_op_data(op_data); + if (fid_is_zero(&pfid)) { + rc = ll_dir_get_parent_fid(inode, &pfid); + if (rc != 0) RETURN(rc); - } } } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) + GOTO(out, rc = PTR_ERR(op_data)); + op_data->op_fid3 = pfid; + #ifdef HAVE_DIR_CONTEXT ctx->pos = pos; rc = ll_dir_read(inode, &pos, op_data, ctx); @@ -435,7 +472,7 @@ static int ll_send_mgc_param(struct obd_export *mgc, char *string) * <0 if the creation is failed. 
*/ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, - const char *dirname, umode_t mode) + size_t len, const char *dirname, umode_t mode) { struct inode *parent = dparent->d_inode; struct ptlrpc_request *request = NULL; @@ -454,7 +491,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, int err; ENTRY; - if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) + if (unlikely(lump->lum_magic != LMV_USER_MAGIC && + lump->lum_magic != LMV_USER_MAGIC_SPECIFIC)) RETURN(-EINVAL); CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s " @@ -470,7 +508,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, !OBD_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD)) RETURN(-ENOENT); - if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) + if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) lustre_swab_lmv_user_md(lump); if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) @@ -495,7 +534,7 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, } op_data->op_cli_flags |= CLI_SET_MEA; - err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, + err = md_create(sbi->ll_md_exp, op_data, lump, len, mode, from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid()), cfs_curproc_cap_pack(), 0, &request); @@ -511,11 +550,13 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, dentry.d_inode = inode; if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { - inode_lock(inode); + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ err = security_inode_notifysecctx(inode, op_data->op_file_secctx, op_data->op_file_secctx_size); - inode_unlock(inode); } else { err = ll_inode_init_security(&dentry, inode, parent); } @@ -536,69 +577,67 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) - struct lustre_sb_info *lsi = s2lsi(inode->i_sb); - struct obd_device *mgc = lsi->lsi_mgc; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; #endif - int lum_size; + int lum_size; ENTRY; - if (lump != NULL) { - /* - * This is coming from userspace, so should be in - * local endian. But the MDS would like it in little - * endian, so we swab it before we send it. 
- */ - switch (lump->lmm_magic) { - case LOV_USER_MAGIC_V1: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) - lustre_swab_lov_user_md_v1(lump); - lum_size = sizeof(struct lov_user_md_v1); - break; - } - case LOV_USER_MAGIC_V3: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) - lustre_swab_lov_user_md_v3( - (struct lov_user_md_v3 *)lump); - lum_size = sizeof(struct lov_user_md_v3); - break; - } - case LOV_USER_MAGIC_COMP_V1: { - if (lump->lmm_magic != - cpu_to_le32(LOV_USER_MAGIC_COMP_V1)) - lustre_swab_lov_comp_md_v1( - (struct lov_comp_md_v1 *)lump); - lum_size = le32_to_cpu( - ((struct lov_comp_md_v1 *)lump)->lcm_size); + if (lump != NULL) { + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: + lum_size = sizeof(struct lov_user_md_v1); break; - } - case LMV_USER_MAGIC: { + case LOV_USER_MAGIC_V3: + lum_size = sizeof(struct lov_user_md_v3); + break; + case LOV_USER_MAGIC_COMP_V1: + lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size; + break; + case LMV_USER_MAGIC: if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) lustre_swab_lmv_user_md( (struct lmv_user_md *)lump); lum_size = sizeof(struct lmv_user_md); break; + case LOV_USER_MAGIC_SPECIFIC: { + struct lov_user_md_v3 *v3 = + (struct lov_user_md_v3 *)lump; + if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + RETURN(-EINVAL); + lum_size = lov_user_md_size(v3->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + break; + } + default: + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); } - default: { - CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" - " %#08x != %#08x nor %#08x\n", - lump->lmm_magic, LOV_USER_MAGIC_V1, - LOV_USER_MAGIC_V3); - RETURN(-EINVAL); - } - } - } else { - lum_size = sizeof(struct lov_user_md_v1); - } - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md(lump, 0); + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); /* swabbing is done in lov_setstripe() on server side */ rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); @@ -661,16 +700,10 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, RETURN(rc); } -/** - * This function will be used to get default LOV/LMV/Default LMV - * @valid will be used to indicate which stripe it will retrieve - * OBD_MD_MEA LMV stripe EA - * OBD_MD_DEFAULT_MEA Default LMV stripe EA - * otherwise Default LOV EA. 
- * Each time, it can only retrieve 1 stripe EA - **/ -int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, u64 valid) +static int ll_dir_get_default_layout(struct inode *inode, void **plmm, + int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct mdt_body *body; @@ -678,6 +711,7 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, struct ptlrpc_request *req = NULL; int rc, lmm_size; struct md_op_data *op_data; + struct lu_fid fid; ENTRY; rc = ll_get_default_mdsize(sbi, &lmm_size); @@ -691,11 +725,19 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, RETURN(PTR_ERR(op_data)); op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + + if (type == GET_DEFAULT_LAYOUT_ROOT) { + lu_root_fid(&op_data->op_fid1); + fid = op_data->op_fid1; + } else { + fid = *ll_inode2fid(inode); + } + rc = md_getattr(sbi->ll_md_exp, op_data, &req); ll_finish_md_op_data(op_data); if (rc < 0) { - CDEBUG(D_INFO, "md_getattr failed on inode " - DFID": rc %d\n", PFID(ll_inode2fid(inode)), rc); + CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n", + PFID(&fid), rc); GOTO(out, rc); } @@ -721,17 +763,11 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, /* We don't swab objects for directories */ switch (le32_to_cpu(lmm->lmm_magic)) { case LOV_MAGIC_V1: - if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); - break; case LOV_MAGIC_V3: - if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); - break; case LOV_MAGIC_COMP_V1: + case LOV_USER_MAGIC_SPECIFIC: if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) - lustre_swab_lov_comp_md_v1( - (struct lov_comp_md_v1 *)lmm); + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); break; case LMV_MAGIC_V1: if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC)) @@ -752,6 +788,75 @@ int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, return rc; } +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve. + * If the directory does not have its own default layout, then the + * function will request the default layout from root FID. + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. 
+ * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, + struct ptlrpc_request **root_request, + u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && + !(valid & (OBD_MD_MEA|OBD_MD_DEFAULT_MEA)) && root_request != NULL){ + int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, + &lmm_size, &root_req, valid, + GET_DEFAULT_LAYOUT_ROOT); + if (rc2 == 0) + rc = 0; + } + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + if (root_request != NULL) + *root_request = root_req; + + RETURN(rc); +} + +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) +{ + struct ptlrpc_request *req = NULL; + struct lov_mds_md *lmm = NULL; + int lmm_size = 0; + int rc = 0; + ENTRY; + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, + &req, valid, 0); + + *plmm = lmm; + *plmm_size = lmm_size; + *request = req; + + RETURN(rc); +} + int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) { struct md_op_data *op_data; @@ -960,25 +1065,110 @@ static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) } -static int copy_and_ioctl(int cmd, struct obd_export *exp, - const void __user *data, size_t size) +static int copy_and_ct_start(int cmd, struct obd_export *exp, + const struct lustre_kernelcomm __user *data) { - void *copy; + struct lustre_kernelcomm *lk; + struct lustre_kernelcomm *tmp; + size_t size = sizeof(*lk); + size_t new_size; + int i; int rc; - OBD_ALLOC(copy, size); - if (copy == NULL) + /* copy data from userspace to get numbers of archive_id */ + OBD_ALLOC(lk, size); + if (lk == NULL) return -ENOMEM; - if (copy_from_user(copy, data, size)) { - rc = -EFAULT; - goto out; + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + + if (lk->lk_flags & LK_FLG_STOP) + goto do_ioctl; + + if (!(lk->lk_flags & LK_FLG_DATANR)) { + __u32 archive_mask = lk->lk_data_count; + int count; + + /* old hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) + goto do_ioctl; + + /* old hsm agent to new MDS */ + lk->lk_flags |= LK_FLG_DATANR; + + if (archive_mask == 0) + goto do_ioctl; + + count = hweight32(archive_mask); + new_size = offsetof(struct lustre_kernelcomm, lk_data[count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = -ENOMEM); + + memcpy(tmp, lk, size); + tmp->lk_data_count = count; + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + count = 0; + for (i = 0; i < sizeof(archive_mask) * 8; i++) { + if ((1 << i) & archive_mask) { + lk->lk_data[count] = i + 1; + count++; + } + } + goto do_ioctl; } - rc = obd_iocontrol(cmd, exp, size, copy, NULL); -out: - OBD_FREE(copy, size); + /* new hsm agent to new mds */ + if (lk->lk_data_count > 0) { + new_size = offsetof(struct lustre_kernelcomm, + lk_data[lk->lk_data_count]); + OBD_ALLOC(tmp, new_size); + if (tmp == NULL) + GOTO(out_lk, rc = 
-ENOMEM); + + OBD_FREE(lk, size); + lk = tmp; + size = new_size; + + if (copy_from_user(lk, data, size)) + GOTO(out_lk, rc = -EFAULT); + } + /* new hsm agent to old MDS */ + if (!exp_connect_archive_id_array(exp)) { + __u32 archives = 0; + + if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE) + GOTO(out_lk, rc = -EINVAL); + + for (i = 0; i < lk->lk_data_count; i++) { + if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) { + rc = -EINVAL; + CERROR("%s: archive id %d requested but only " + "[0 - %zu] supported: rc = %d\n", + exp->exp_obd->obd_name, lk->lk_data[i], + LL_HSM_ORIGIN_MAX_ARCHIVE, rc); + GOTO(out_lk, rc); + } + + if (lk->lk_data[i] == 0) { + archives = 0; + break; + } + + archives |= (1 << (lk->lk_data[i] - 1)); + } + lk->lk_flags &= ~LK_FLG_DATANR; + lk->lk_data_count = archives; + } +do_ioctl: + rc = obd_iocontrol(cmd, exp, size, lk, NULL); +out_lk: + OBD_FREE(lk, size); return rc; } @@ -999,32 +1189,38 @@ static int check_owner(int type, int id) return 0; } -static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) +static int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl) { - int cmd = qctl->qc_cmd; - int type = qctl->qc_type; - int id = qctl->qc_id; - int valid = qctl->qc_valid; - int rc = 0; - ENTRY; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + ENTRY; - switch (cmd) { - case Q_SETQUOTA: - case Q_SETINFO: + switch (cmd) { + case Q_SETQUOTA: + case Q_SETINFO: + case LUSTRE_Q_SETDEFAULT: if (!cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); + + if (sb->s_flags & SB_RDONLY) + RETURN(-EROFS); break; case Q_GETQUOTA: + case LUSTRE_Q_GETDEFAULT: if (check_owner(type, id) && (!cfs_capable(CFS_CAP_SYS_ADMIN))) RETURN(-EPERM); - break; - case Q_GETINFO: - break; - default: - CERROR("unsupported quotactl op: %#x\n", cmd); - RETURN(-ENOTTY); - } + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + RETURN(-ENOTSUPP); + } if (valid != QC_GENERAL) { if (cmd == Q_GETINFO) @@ -1121,6 +1317,54 @@ static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) RETURN(rc); } +int ll_rmfid(struct file *file, void __user *arg) +{ + const struct fid_array __user *ufa = arg; + struct fid_array *lfa = NULL; + size_t size; + unsigned nr; + int i, rc, *rcs = NULL; + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(file_inode(file))->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + /* Only need to get the buflen */ + if (get_user(nr, &ufa->fa_nr)) + RETURN(-EFAULT); + /* DoS protection */ + if (nr > OBD_MAX_FIDS_IN_ARRAY) + RETURN(-E2BIG); + + size = offsetof(struct fid_array, fa_fids[nr]); + OBD_ALLOC(lfa, size); + if (!lfa) + RETURN(-ENOMEM); + OBD_ALLOC(rcs, sizeof(int) * nr); + if (!rcs) + GOTO(free_lfa, rc = -ENOMEM); + + if (copy_from_user(lfa, arg, size)) + GOTO(free_rcs, rc = -EFAULT); + + /* Call mdc_iocontrol */ + rc = md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL); + if (!rc) { + for (i = 0; i < nr; i++) + if (rcs[i]) + lfa->fa_fids[i].f_ver = rcs[i]; + if (copy_to_user(arg, lfa, size)) + rc = -EFAULT; + } + +free_rcs: + OBD_FREE(rcs, sizeof(int) * nr); +free_lfa: + OBD_FREE(lfa, size); + + RETURN(rc); +} + /* This function tries to get a single name component, * to send to the server. 
No actual path traversal involved, * so we limit to NAME_MAX */ @@ -1153,46 +1397,46 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_ioctl_data *data; - int rc = 0; - ENTRY; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n", PFID(ll_inode2fid(inode)), inode, cmd); - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - switch(cmd) { - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - RETURN(ll_iocontrol(inode, file, cmd, arg)); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: RETURN(put_user(inode->i_generation, (int __user *)arg)); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. - case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - RETURN(mdtidx); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); if (put_user((int)mdtidx, (int __user *)arg)) - RETURN(-EFAULT); + RETURN(-EFAULT); - return 0; - } - case IOC_MDC_LOOKUP: { - int namelen, len = 0; + return 0; + } + case IOC_MDC_LOOKUP: { + int namelen, len = 0; char *buf = NULL; char *filename; @@ -1248,8 +1492,9 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) lum = (struct lmv_user_md *)data->ioc_inlbuf2; lumlen = data->ioc_inllen2; - if (lum->lum_magic != LMV_USER_MAGIC || - lumlen != sizeof(*lum)) { + if ((lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) || + lumlen < sizeof(*lum)) { CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", filename, lum->lum_magic, lumlen, -EFAULT); GOTO(lmv_out_free, rc = -EINVAL); @@ -1260,7 +1505,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #else mode = data->ioc_type; #endif - rc = ll_dir_setdirstripe(dentry, lum, filename, mode); + rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode); lmv_out_free: OBD_FREE_LARGE(buf, len); RETURN(rc); @@ -1284,34 +1529,51 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case LL_IOC_LOV_SETSTRIPE_NEW: case LL_IOC_LOV_SETSTRIPE: { - struct lov_user_md_v3 lumv3; - struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v3 *lumv3 = NULL; + struct lov_user_md_v1 lumv1; + struct lov_user_md_v1 *lumv1_ptr = &lumv1; struct lov_user_md_v1 __user *lumv1p = (struct lov_user_md_v1 __user *)arg; struct lov_user_md_v3 __user *lumv3p = (struct lov_user_md_v3 __user *)arg; + int lum_size = 0; int set_default = 0; CLASSERT(sizeof(struct lov_user_md_v3) > sizeof(struct lov_comp_md_v1)); - LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); - LASSERT(sizeof(lumv3.lmm_objects[0]) == - sizeof(lumv3p->lmm_objects[0])); + CLASSERT(sizeof(*lumv3) == sizeof(*lumv3p)); /* first try with v1 which is smaller than v3 */ - if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) - RETURN(-EFAULT); - - if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) - if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) - RETURN(-EFAULT); + if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) + RETURN(-EFAULT); if (inode->i_sb->s_root == file_dentry(file)) set_default = 1; - /* in v1 and v3 cases lumv1 points to data */ - rc = ll_dir_setstripe(inode, lumv1, set_default); + switch (lumv1.lmm_magic) { + case LOV_USER_MAGIC_V3: + case LOV_USER_MAGIC_SPECIFIC: + lum_size = ll_lov_user_md_size(&lumv1); + if (lum_size < 0) + RETURN(lum_size); + OBD_ALLOC(lumv3, lum_size); + if (!lumv3) + RETURN(-ENOMEM); + if (copy_from_user(lumv3, lumv3p, lum_size)) + GOTO(out, rc = -EFAULT); + lumv1_ptr = (struct lov_user_md_v1 *)lumv3; + break; + case LOV_USER_MAGIC_V1: + break; + default: + GOTO(out, rc = -ENOTSUPP); + } + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1_ptr, set_default); +out: + if (lumv3) + OBD_FREE(lumv3, lum_size); RETURN(rc); } case LL_IOC_LMV_GETSTRIPE: { @@ -1319,6 +1581,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) (struct lmv_user_md __user *)arg; struct lmv_user_md lum; struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; union lmv_mds_md *lmm = NULL; int lmmsize; u64 valid = 0; @@ -1344,8 +1607,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long 
arg) else RETURN(-EINVAL); - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, - valid); + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize, + &request, &root_request, valid); if (rc != 0) GOTO(finish_req, rc); @@ -1368,7 +1631,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) GOTO(finish_req, rc = -E2BIG); } - lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); + lum_size = lmv_user_md_size(stripe_count, + LMV_USER_MAGIC_SPECIFIC); OBD_ALLOC(tmp, lum_size); if (tmp == NULL) GOTO(finish_req, rc = -ENOMEM); @@ -1385,12 +1649,15 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct lu_fid fid; fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); - mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); - if (mdt_index < 0) - GOTO(out_tmp, rc = mdt_index); + if (fid_is_sane(&fid)) { + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) + GOTO(out_tmp, rc = mdt_index); + + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + } - tmp->lum_objects[i].lum_mds = mdt_index; - tmp->lum_objects[i].lum_fid = fid; tmp->lum_stripe_count++; } @@ -1400,6 +1667,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE(tmp, lum_size); finish_req: ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); return rc; } @@ -1430,6 +1698,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ll_putname(filename); RETURN(rc); } + case LL_IOC_RMFID: + RETURN(ll_rmfid(file, (void __user *)arg)); case LL_IOC_LOV_SWAP_LAYOUTS: RETURN(-EPERM); case IOC_OBD_STATFS: @@ -1437,62 +1707,93 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_LOV_GETSTRIPE: case LL_IOC_LOV_GETSTRIPE_NEW: case LL_IOC_MDC_GETINFO: + case LL_IOC_MDC_GETINFO_OLD: case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILEINFO_OLD: case IOC_MDC_GETFILESTRIPE: { struct ptlrpc_request *request = NULL; + struct ptlrpc_request *root_request = NULL; struct lov_user_md __user *lump; - struct lov_mds_md *lmm = NULL; - struct mdt_body *body; - char *filename = NULL; - int lmmsize; - - if (cmd == IOC_MDC_GETFILEINFO || - cmd == IOC_MDC_GETFILESTRIPE) { + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + lstat_t __user *statp = NULL; + lstatx_t __user *stxp = NULL; + __u64 __user *flagsp = NULL; + __u32 __user *lmmsizep = NULL; + struct lu_fid __user *fidp = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { filename = ll_getname((const char __user *)arg); - if (IS_ERR(filename)) - RETURN(PTR_ERR(filename)); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); - rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, - &lmmsize, &request); + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); } else { - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, - &request, 0); + rc = ll_dir_getstripe_default(inode, (void **)&lmm, + &lmmsize, &request, + &root_request, 0); } - if (request) { - body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - LASSERT(body != NULL); - } else { - GOTO(out_req, rc); - } + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } - if (rc < 0) { - if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || - cmd == LL_IOC_MDC_GETINFO)) - GOTO(skip_lmm, rc = 0); 
- else - GOTO(out_req, rc); - } + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO || + cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD)) { + lmmsize = 0; + rc = 0; + } + + if (rc < 0) + GOTO(out_req, rc); if (cmd == IOC_MDC_GETFILESTRIPE || cmd == LL_IOC_LOV_GETSTRIPE || cmd == LL_IOC_LOV_GETSTRIPE_NEW) { lump = (struct lov_user_md __user *)arg; - } else { + } else if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD){ + struct lov_user_mds_data_v1 __user *lmdp; + + lmdp = (struct lov_user_mds_data_v1 __user *)arg; + statp = &lmdp->lmd_st; + lump = &lmdp->lmd_lmm; + } else { struct lov_user_mds_data __user *lmdp; + lmdp = (struct lov_user_mds_data __user *)arg; - lump = &lmdp->lmd_lmm; - } - if (copy_to_user(lump, lmm, lmmsize)) { + fidp = &lmdp->lmd_fid; + stxp = &lmdp->lmd_stx; + flagsp = &lmdp->lmd_flags; + lmmsizep = &lmdp->lmd_lmmsize; + lump = &lmdp->lmd_lmm; + } + + if (lmmsize == 0) { + /* If the file has no striping then zero out *lump so + * that the caller isn't confused by garbage. */ + if (clear_user(lump, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + } else if (copy_to_user(lump, lmm, lmmsize)) { if (copy_to_user(lump, lmm, sizeof(*lump))) - GOTO(out_req, rc = -EFAULT); - rc = -EOVERFLOW; - } - skip_lmm: - if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { - struct lov_user_mds_data __user *lmdp; - lstat_t st = { 0 }; + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + + if (cmd == IOC_MDC_GETFILEINFO_OLD || + cmd == LL_IOC_MDC_GETINFO_OLD) { + lstat_t st = { 0 }; st.st_dev = inode->i_sb->s_dev; st.st_mode = body->mbo_mode; @@ -1510,29 +1811,86 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) sbi->ll_flags & LL_SBI_32BIT_API); - lmdp = (struct lov_user_mds_data __user *)arg; - if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) - GOTO(out_req, rc = -EFAULT); - } + if (copy_to_user(statp, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } else if (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO) { + lstatx_t stx = { 0 }; + __u64 valid = body->mbo_valid; + + stx.stx_blksize = PAGE_SIZE; + stx.stx_nlink = body->mbo_nlink; + stx.stx_uid = body->mbo_uid; + stx.stx_gid = body->mbo_gid; + stx.stx_mode = body->mbo_mode; + stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1, + sbi->ll_flags & + LL_SBI_32BIT_API); + stx.stx_size = body->mbo_size; + stx.stx_blocks = body->mbo_blocks; + stx.stx_atime.tv_sec = body->mbo_atime; + stx.stx_ctime.tv_sec = body->mbo_ctime; + stx.stx_mtime.tv_sec = body->mbo_mtime; + stx.stx_rdev_major = MAJOR(body->mbo_rdev); + stx.stx_rdev_minor = MINOR(body->mbo_rdev); + stx.stx_dev_major = MAJOR(inode->i_sb->s_dev); + stx.stx_dev_minor = MINOR(inode->i_sb->s_dev); + stx.stx_mask |= STATX_BASIC_STATS; - EXIT; - out_req: - ptlrpc_req_finished(request); - if (filename) - ll_putname(filename); - return rc; - } + /* + * For a striped directory, the size and blocks returned + * from MDT is not correct. + * The size and blocks are aggregated by client across + * all stripes. + * Thus for a striped directory, do not return the valid + * FLSIZE and FLBLOCKS flags to the caller. + * However, this whould be better decided by the MDS + * instead of the client. 
+ */ + if (cmd == LL_IOC_MDC_GETINFO && + ll_i2info(inode)->lli_lsm_md != NULL) + valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + if (flagsp && copy_to_user(flagsp, &valid, + sizeof(*flagsp))) + GOTO(out_req, rc = -EFAULT); + + if (fidp && copy_to_user(fidp, &body->mbo_fid1, + sizeof(*fidp))) + GOTO(out_req, rc = -EFAULT); + + if (!(valid & OBD_MD_FLSIZE)) + stx.stx_mask &= ~STATX_SIZE; + if (!(valid & OBD_MD_FLBLOCKS)) + stx.stx_mask &= ~STATX_BLOCKS; + + if (stxp && copy_to_user(stxp, &stx, sizeof(stx))) + GOTO(out_req, rc = -EFAULT); + + if (lmmsizep && copy_to_user(lmmsizep, &lmmsize, + sizeof(*lmmsizep))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; +out_req: + ptlrpc_req_finished(request); + ptlrpc_req_finished(root_request); + if (filename) + ll_putname(filename); + return rc; + } case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl; + struct if_quotactl *qctl; - OBD_ALLOC_PTR(qctl); - if (!qctl) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(qctl); + if (!qctl) + RETURN(-ENOMEM); if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) - GOTO(out_quotactl, rc = -EFAULT); + GOTO(out_quotactl, rc = -EFAULT); - rc = quotactl_ioctl(sbi, qctl); + rc = quotactl_ioctl(inode->i_sb, qctl); if (rc == 0 && copy_to_user((void __user *)arg, qctl, sizeof(*qctl))) @@ -1545,6 +1903,76 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case OBD_IOC_GETDTNAME: case OBD_IOC_GETMDNAME: RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hus); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + rc = ll_hsm_state_set(inode, hss); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(hca); + RETURN(PTR_ERR(op_data)); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } case LL_IOC_FLUSHCTX: RETURN(ll_flush_ctx(inode)); case LL_IOC_GETOBDCOUNT: { @@ -1683,8 +2111,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (!cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, - sizeof(struct lustre_kernelcomm)); + rc = copy_and_ct_start(cmd, sbi->ll_md_exp, + (struct lustre_kernelcomm __user *)arg); RETURN(rc); case LL_IOC_HSM_COPY_START: { @@ -1706,6 +2134,24 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE_PTR(copy); RETURN(rc); } + case LL_IOC_HSM_IMPORT: { + struct 
hsm_user_import *hui; + + OBD_ALLOC_PTR(hui); + if (hui == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) { + OBD_FREE_PTR(hui); + RETURN(-EFAULT); + } + + rc = ll_hsm_import(inode, file, hui); + + CDEBUG(D_HSM, "MDLL hsm_state import: %d\n", rc); + OBD_FREE_PTR(hui); + RETURN(rc); + } case LL_IOC_HSM_COPY_END: { struct hsm_copy *copy; int rc; @@ -1719,6 +2165,8 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } rc = ll_ioc_copy_end(inode->i_sb, copy); + CDEBUG(D_HSM, "MDLL hsm_copy_end: %d\n", rc); + if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) rc = -EFAULT; @@ -1726,15 +2174,15 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(rc); } case LL_IOC_MIGRATE: { - char *buf = NULL; - const char *filename; - int namelen = 0; - int len; - int rc; - int mdtidx; + struct lmv_user_md *lum; + char *buf = NULL; + int len; + char *filename; + int namelen = 0; + int rc; rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc < 0) + if (rc) RETURN(rc); data = (struct obd_ioctl_data *)buf; @@ -1744,15 +2192,22 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) filename = data->ioc_inlbuf1; namelen = data->ioc_inllen1; - /* \0 is packed at the end of filename */ - if (namelen < 1 || namelen != strlen(filename) + 1) - GOTO(migrate_free, rc = -EINVAL); - if (data->ioc_inllen2 != sizeof(mdtidx)) + if (namelen < 1 || namelen != strlen(filename) + 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); GOTO(migrate_free, rc = -EINVAL); - mdtidx = *(int *)data->ioc_inlbuf2; + } + + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + if (lum->lum_magic != LMV_USER_MAGIC && + lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) { + rc = -EINVAL; + CERROR("%s: wrong lum magic %x: rc = %d\n", + filename, lum->lum_magic, rc); + GOTO(migrate_free, rc); + } - rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); + rc = ll_migrate(inode, file, lum, filename); migrate_free: OBD_FREE_LARGE(buf, len); diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index 65d57dbe70b42..a37308caf619d 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -45,15 +45,19 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include -#include +#include #include #include "cl_object.h" #include "llite_internal.h" #include "vvp_internal.h" +struct split_param { + struct inode *sp_inode; + __u16 sp_mirror_id; +}; + static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); @@ -95,12 +99,15 @@ static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, op_data->op_attr.ia_mtime = inode->i_mtime; op_data->op_attr.ia_ctime = inode->i_ctime; op_data->op_attr.ia_size = i_size_read(inode); - op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET; + op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | + ATTR_CTIME); + op_data->op_xvalid |= OP_XVALID_CTIME_SET; op_data->op_attr_blocks = inode->i_blocks; op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); - op_data->op_handle = och->och_fh; + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_open_handle = och->och_open_handle; if (och->och_flags & FMODE_WRITE && ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED)) @@ -145,20 +152,53 @@ static int ll_close_inode_openhandle(struct inode *inode, ll_prepare_close(inode, op_data, och); switch (bias) { - case MDS_CLOSE_LAYOUT_SWAP: + case MDS_CLOSE_LAYOUT_MERGE: + /* merge blocks from the victim inode */ + op_data->op_attr_blocks += ((struct inode *)data)->i_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + fallthrough; + case MDS_CLOSE_LAYOUT_SPLIT: + case MDS_CLOSE_LAYOUT_SWAP: { + struct split_param *sp = data; + LASSERT(data != NULL); - op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP; + op_data->op_bias |= bias; op_data->op_data_version = 0; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_fid2 = *ll_inode2fid(data); + if (bias == MDS_CLOSE_LAYOUT_SPLIT) { + op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); + op_data->op_mirror_id = sp->sp_mirror_id; + } else { + op_data->op_fid2 = *ll_inode2fid(data); + } + break; + } + + case MDS_CLOSE_RESYNC_DONE: { + struct ll_ioc_lease *ioc = data; + + LASSERT(data != NULL); + op_data->op_attr_blocks += + ioc->lil_count * op_data->op_attr_blocks; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; + op_data->op_bias |= MDS_CLOSE_RESYNC_DONE; + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_data = &ioc->lil_ids[0]; + op_data->op_data_size = + ioc->lil_count * sizeof(ioc->lil_ids[0]); break; + } case MDS_HSM_RELEASE: LASSERT(data != NULL); op_data->op_bias |= MDS_HSM_RELEASE; op_data->op_data_version = *(__u64 *)data; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + op_data->op_attr.ia_valid |= ATTR_SIZE; + op_data->op_xvalid |= OP_XVALID_BLOCKS; break; default: @@ -166,13 +206,17 @@ static int ll_close_inode_openhandle(struct inode *inode, break; } + if (!(op_data->op_attr.ia_valid & ATTR_SIZE)) + op_data->op_xvalid |= OP_XVALID_LAZYSIZE; + if (!(op_data->op_xvalid & OP_XVALID_BLOCKS)) + op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS; + rc = md_close(md_exp, op_data, och->och_mod, &req); if (rc != 0 && rc != -EINTR) CERROR("%s: inode "DFID" mdc close failed: rc = %d\n", md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); - if (rc == 0 
&& - op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) { + if (rc == 0 && op_data->op_bias & bias) { struct mdt_body *body; body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); @@ -185,7 +229,7 @@ static int ll_close_inode_openhandle(struct inode *inode, out: md_clear_open_replay_data(md_exp, och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; OBD_FREE_PTR(och); ptlrpc_req_finished(req); /* This is close request */ @@ -287,7 +331,9 @@ static int ll_md_close(struct inode *inode, struct file *file) } mutex_unlock(&lli->lli_och_mutex); - if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), + /* LU-4398: do not cache write open lock if the file has exec bit */ + if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) || + !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), LDLM_IBITS, &policy, lockmode, &lockh)) rc = ll_md_real_close(inode, fd->fd_omode); @@ -344,12 +390,146 @@ int ll_file_release(struct inode *inode, struct file *file) RETURN(rc); } +static inline int ll_dom_readpage(void *data, struct page *page) +{ + struct niobuf_local *lnb = data; + void *kaddr; + + kaddr = ll_kmap_atomic(page, KM_USER0); + memcpy(kaddr, lnb->lnb_data, lnb->lnb_len); + if (lnb->lnb_len < PAGE_SIZE) + memset(kaddr + lnb->lnb_len, 0, + PAGE_SIZE - lnb->lnb_len); + flush_dcache_page(page); + SetPageUptodate(page); + ll_kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + + return 0; +} + +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req) +{ + struct lu_env *env; + struct cl_io *io; + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct address_space *mapping = inode->i_mapping; + struct page *vmpage; + struct niobuf_remote *rnb; + struct mdt_body *body; + char *data; + unsigned long index, start; + struct niobuf_local lnb; + __u16 refcheck; + int rc; + + ENTRY; + + if (obj == NULL) + RETURN_EXIT; + + if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER)) + RETURN_EXIT; + + rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE); + if (rnb == NULL || rnb->rnb_len == 0) + RETURN_EXIT; + + /* LU-11595: Server may return whole file and that is OK always or + * it may return just file tail and its offset must be aligned with + * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is + * smaller then offset may be not aligned and that data is just ignored. + */ + if (rnb->rnb_offset % PAGE_SIZE) + RETURN_EXIT; + + /* Server returns whole file or just file tail if it fills in reply + * buffer, in both cases total size should be equal to the file size. 
+ */ + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) { + CERROR("%s: server returns off/len %llu/%u but size %llu\n", + ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset, + rnb->rnb_len, body->mbo_dom_size); + RETURN_EXIT; + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN_EXIT; + io = vvp_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc) + GOTO(out_io, rc); + + CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n", + rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size); + + data = (char *)rnb + sizeof(*rnb); + + lnb.lnb_file_offset = rnb->rnb_offset; + start = lnb.lnb_file_offset / PAGE_SIZE; + index = 0; + LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0); + lnb.lnb_page_offset = 0; + do { + struct cl_page *page; + + lnb.lnb_data = data + (index << PAGE_SHIFT); + lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT); + if (lnb.lnb_len > PAGE_SIZE) + lnb.lnb_len = PAGE_SIZE; + + vmpage = read_cache_page(mapping, index + start, + ll_dom_readpage, &lnb); + if (IS_ERR(vmpage)) { + CWARN("%s: cannot fill page %lu for "DFID + " with data: rc = %li\n", + ll_get_fsname(inode->i_sb, NULL, 0), + index + start, PFID(lu_object_fid(&obj->co_lu)), + PTR_ERR(vmpage)); + break; + } + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + put_page(vmpage); + /* page was truncated */ + break; + } + /* attach VM page to CL page cache */ + page = cl_page_find(env, obj, vmpage->index, vmpage, + CPT_CACHEABLE); + if (IS_ERR(page)) { + ClearPageUptodate(vmpage); + unlock_page(vmpage); + put_page(vmpage); + break; + } + cl_page_export(env, page, 1); + cl_page_put(env, page); + unlock_page(vmpage); + put_page(vmpage); + index++; + } while (rnb->rnb_len > (index << PAGE_SHIFT)); + +out_io: + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + EXIT; +} + static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, struct lookup_intent *itp) { struct ll_sb_info *sbi = ll_i2sbi(de->d_inode); struct dentry *parent = de->d_parent; - const char *name = NULL; + char *name = NULL; int len = 0; struct md_op_data *op_data; struct ptlrpc_request *req = NULL; @@ -361,21 +541,43 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, /* if server supports open-by-fid, or file name is invalid, don't pack * name in open request */ - if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && - lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { - name = de->d_name.name; + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) || + !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) { +retry: len = de->d_name.len; + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + RETURN(-ENOMEM); + + /* race here */ + spin_lock(&de->d_lock); + if (len != de->d_name.len) { + spin_unlock(&de->d_lock); + kfree(name); + goto retry; + } + memcpy(name, de->d_name.name, len); + name[len] = '\0'; + spin_unlock(&de->d_lock); + + if (!lu_name_is_valid_2(name, len)) { + kfree(name); + RETURN(-ESTALE); + } } op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode, name, len, 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) + if (IS_ERR(op_data)) { + kfree(name); RETURN(PTR_ERR(op_data)); + } op_data->op_data = lmm; op_data->op_data_size = lmmsize; rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, &ll_md_blocking_ast, 0); + kfree(name); ll_finish_md_op_data(op_data); if (rc == -ESTALE) { /* reason 
for keep own exit path - don`t flood log @@ -398,8 +600,25 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, } rc = ll_prep_inode(&de->d_inode, req, NULL, itp); - if (!rc && itp->it_lock_mode) - ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL); + + if (!rc && itp->it_lock_mode) { + __u64 bits = 0; + + /* If we got a lock back and it has a LOOKUP bit set, + * make sure the dentry is marked as valid so we can find it. + * We don't need to care about actual hashing since other bits + * of kernel will deal with that later. + */ + ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(de); + + /* if DoM bit returned along with LAYOUT bit then there + * can be read-on-open data returned. + */ + if (bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(de->d_inode, req); + } out: ptlrpc_req_finished(req); @@ -424,7 +643,7 @@ static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, struct mdt_body *body; body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - och->och_fh = body->mbo_handle; + och->och_open_handle = body->mbo_open_handle; och->och_fid = body->mbo_fid1; och->och_lease_handle.cookie = it->it_lock_handle; och->och_magic = OBD_CLIENT_HANDLE_MAGIC; @@ -494,7 +713,7 @@ int ll_file_open(struct inode *inode, struct file *file) fd = ll_file_data_get(); if (fd == NULL) - GOTO(out_openerr, rc = -ENOMEM); + GOTO(out_nofiledata, rc = -ENOMEM); fd->fd_file = file; if (S_ISDIR(inode->i_mode)) @@ -506,6 +725,8 @@ int ll_file_open(struct inode *inode, struct file *file) } if (!it || !it->it_disposition) { + CDEBUG(D_HSM, "MDLL file->f_flags=0x%x/0%o\n", + file->f_flags, file->f_flags); /* Convert f_flags into access mode. We cannot use file->f_mode, * because everything but O_ACCMODE mask was stripped from * there */ @@ -514,12 +735,13 @@ int ll_file_open(struct inode *inode, struct file *file) if (file->f_flags & O_TRUNC) oit.it_flags |= FMODE_WRITE; - /* kernel only call f_op->open in dentry_open. filp_open calls - * dentry_open after call to open_namei that checks permissions. - * Only nfsd_open call dentry_open directly without checking - * permissions and because of that this code below is safe. */ - if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) - oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. + */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; /* We do not want O_EXCL here, presumably we opened the file * already? XXX - NFS implications? */ @@ -643,6 +865,7 @@ int ll_file_open(struct inode *inode, struct file *file) GOTO(out_och_free, rc); cl_lov_delay_create_clear(&file->f_flags); + cl_lu_noimport_clear(&file->f_flags); GOTO(out_och_free, rc); out_och_free: @@ -663,6 +886,7 @@ int ll_file_open(struct inode *inode, struct file *file) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); } +out_nofiledata: if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { ptlrpc_req_finished(it->it_request); it_clear_disposition(it, DISP_ENQ_OPEN_REF); @@ -700,7 +924,7 @@ static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, * if it has an open lock in cache already. 
*/ static int ll_lease_och_acquire(struct inode *inode, struct file *file, - struct lustre_handle *old_handle) + struct lustre_handle *old_open_handle) { struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); @@ -733,7 +957,7 @@ static int ll_lease_och_acquire(struct inode *inode, struct file *file, *och_p = NULL; } - *old_handle = fd->fd_och->och_fh; + *old_open_handle = fd->fd_och->och_open_handle; EXIT; out_unlock: @@ -794,7 +1018,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, struct ll_sb_info *sbi = ll_i2sbi(inode); struct md_op_data *op_data; struct ptlrpc_request *req = NULL; - struct lustre_handle old_handle = { 0 }; + struct lustre_handle old_open_handle = { 0 }; struct obd_client_handle *och = NULL; int rc; int rc2; @@ -807,7 +1031,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) RETURN(ERR_PTR(-EPERM)); - rc = ll_lease_och_acquire(inode, file, &old_handle); + rc = ll_lease_och_acquire(inode, file, &old_open_handle); if (rc) RETURN(ERR_PTR(rc)); } @@ -822,7 +1046,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, GOTO(out, rc = PTR_ERR(op_data)); /* To tell the MDT this openhandle is from the same owner */ - op_data->op_handle = old_handle; + op_data->op_open_handle = old_open_handle; it.it_flags = fmode | open_flags; it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; @@ -848,7 +1072,9 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, GOTO(out_release_it, rc); LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); - ll_och_fill(sbi->ll_md_exp, &it, och); + rc = ll_och_fill(sbi->ll_md_exp, &it, och); + if (rc) + GOTO(out_release_it, rc); if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ GOTO(out_close, rc = -EOPNOTSUPP); @@ -936,7 +1162,7 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, if (rc == 0) GOTO(out_free_och, rc = -EINVAL); - /* Close the file and swap layouts between inode & inode2. + /* Close the file and {swap,merge} layouts between inode & inode2. * NB: lease lock handle is released in mdc_close_layout_swap_pack() * because we still need it to pack l_remote_handle to MDT. */ rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, @@ -955,8 +1181,10 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, * Release lease and close the file. * It will check if the lease has ever broken. */ -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken) +static int ll_lease_close_intent(struct obd_client_handle *och, + struct inode *inode, + bool *lease_broken, enum mds_op_bias bias, + void *data) { struct ldlm_lock *lock; bool cancelled = true; @@ -971,19 +1199,71 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, LDLM_LOCK_PUT(lock); } - CDEBUG(D_INODE, "lease for "DFID" broken? %d\n", - PFID(&ll_i2info(inode)->lli_fid), cancelled); - - if (!cancelled) - ldlm_cli_cancel(&och->och_lease_handle, 0); + CDEBUG(D_INODE, "lease for "DFID" broken? 
%d, bias: %x\n", + PFID(&ll_i2info(inode)->lli_fid), cancelled, bias); if (lease_broken != NULL) *lease_broken = cancelled; - rc = ll_close_inode_openhandle(inode, och, 0, NULL); + if (!cancelled && !bias) + ldlm_cli_cancel(&och->och_lease_handle, 0); + + if (cancelled) { /* no need to excute intent */ + bias = 0; + data = NULL; + } + + rc = ll_close_inode_openhandle(inode, och, bias, data); RETURN(rc); } +static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, + bool *lease_broken) +{ + return ll_lease_close_intent(och, inode, lease_broken, 0, NULL); +} + +/** + * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT + */ +static int ll_lease_file_resync(struct obd_client_handle *och, + struct inode *inode, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ll_ioc_lease_id ioc; + __u64 data_version_unused; + int rc; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg, + sizeof(ioc))) + RETURN(-EFAULT); + + /* before starting file resync, it's necessary to clean up page cache + * in client memory, otherwise once the layout version is increased, + * writing back cached data will be denied the OSTs. */ + rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH); + if (rc) + GOTO(out, rc); + + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_mirror_id = ioc.lil_mirror_id; + rc = md_file_resync(sbi->ll_md_exp, op_data); + if (rc) + GOTO(out, rc); + + EXIT; +out: + ll_finish_md_op_data(op_data); + return rc; +} + int ll_merge_attr(const struct lu_env *env, struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1023,11 +1303,14 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) ctime = inode->i_ctime.tv_sec; cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE)) + rc = -EINVAL; + else + rc = cl_object_attr_get(env, obj, attr); cl_object_attr_unlock(obj); if (rc != 0) - GOTO(out_size_unlock, rc); + GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc)); if (atime < attr->cat_atime) atime = attr->cat_atime; @@ -1054,12 +1337,73 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode) RETURN(rc); } +/** + * Set designated mirror for I/O. + * + * So far only read, write, and truncated can support to issue I/O to + * designated mirror. + */ +void ll_io_set_mirror(struct cl_io *io, const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + /* clear layout version for generic(non-resync) I/O in case it carries + * stale layout version due to I/O restart */ + io->ci_layout_version = 0; + + /* FLR: disable non-delay for designated mirror I/O because obviously + * only one mirror is available */ + if (fd->fd_designated_mirror > 0) { + io->ci_ndelay = 0; + io->ci_designated_mirror = fd->fd_designated_mirror; + io->ci_layout_version = fd->fd_layout_version; + } + + CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n", + file->f_path.dentry->d_name.name, io->ci_designated_mirror); +} + +/* + * This is relatime_need_update() from Linux 5.17, which is not exported. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec64 now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? 
If yes, update atime: + */ + if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +/* + * Very similar to kernel function: !__atime_needs_update() + */ static bool file_is_noatime(const struct file *file) { - const struct vfsmount *mnt = file->f_path.mnt; - const struct inode *inode = file_inode((struct file *)file); + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = file_inode((struct file *)file); + struct timespec64 now; - /* Adapted from file_accessed() and touch_atime().*/ if (file->f_flags & O_NOATIME) return true; @@ -1078,23 +1422,25 @@ static bool file_is_noatime(const struct file *file) if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return true; + now = current_time(inode); + + if (!relatime_need_update(mnt, inode, now)) + return true; + return false; } -static int ll_file_io_ptask(struct cfs_ptask *ptask); - static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) { struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + io->ci_lock_no_expand = fd->ll_lock_no_expand; - memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter)); - init_sync_kiocb(&io->u.ci_rw.rw_iocb, file); - io->u.ci_rw.rw_file = file; - io->u.ci_rw.rw_ptask = ll_file_io_ptask; - io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK); if (iot == CIT_WRITE) { - io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND); - io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC || + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || file->f_flags & O_DIRECT || IS_SYNC(inode)); } @@ -1107,94 +1453,12 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) io->ci_lockreq = CILR_MANDATORY; } io->ci_noatime = file_is_noatime(file); - if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO) - io->ci_pio = !io->u.ci_rw.rw_append; - else - io->ci_pio = 0; -} - -static int ll_file_io_ptask(struct cfs_ptask *ptask) -{ - struct cl_io_pt *pt = ptask->pt_cbdata; - struct file *file = pt->cip_file; - struct lu_env *env; - struct cl_io *io; - loff_t pos = pt->cip_pos; - int rc; - __u16 refcheck; - ENTRY; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); - - CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? 
"read" : "write", - pos, pos + pt->cip_count); - -restart: - io = vvp_env_thread_io(env); - ll_io_init(io, file, pt->cip_iot); - io->u.ci_rw.rw_iter = pt->cip_iter; - io->u.ci_rw.rw_iocb = pt->cip_iocb; - io->ci_pio = 0; /* It's already in parallel task */ - - rc = cl_io_rw_init(env, io, pt->cip_iot, pos, - pt->cip_count - pt->cip_result); - if (!rc) { - struct vvp_io *vio = vvp_env_io(env); - - vio->vui_io_subtype = IO_NORMAL; - vio->vui_fd = LUSTRE_FPRIVATE(file); - - ll_cl_add(file, env, io, LCC_RW); - rc = cl_io_loop(env, io); - ll_cl_remove(file, env); - } else { - /* cl_io_rw_init() handled IO */ - rc = io->ci_result; - } - - if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) { - if (io->ci_nob > 0) - io->ci_nob /= 2; - rc = -EIO; - } - - if (io->ci_nob > 0) { - pt->cip_result += io->ci_nob; - iov_iter_advance(&pt->cip_iter, io->ci_nob); - pos += io->ci_nob; - pt->cip_iocb.ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result; -#elif defined(HAVE_KI_NBYTES) - pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result; -#endif - } - - cl_io_fini(env, io); - - if ((rc == 0 || rc == -ENODATA) && - pt->cip_result < pt->cip_count && - io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? "read" : "write", - pos, pos + pt->cip_count - pt->cip_result, - pt->cip_result, rc); - goto restart; - } - CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? "read" : "write", - pt->cip_result, rc); + /* FLR: only use non-delay I/O for read as there is only one + * avaliable mirror for write. */ + io->ci_ndelay = !(iot == CIT_WRITE); - cl_env_put(env, &refcheck); - RETURN(pt->cip_result > 0 ? 0 : rc); + ll_io_set_mirror(io, file); } static ssize_t @@ -1202,45 +1466,43 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, struct file *file, enum cl_io_type iot, loff_t *ppos, size_t count) { - struct range_lock range; struct vvp_io *vio = vvp_env_io(env); struct inode *inode = file_inode(file); struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct range_lock range; struct cl_io *io; - loff_t pos = *ppos; ssize_t result = 0; int rc = 0; + unsigned retried = 0; + bool restarted = false; ENTRY; - CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", + CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", pos, pos + count); + iot == CIT_READ ? 
"read" : "write", *ppos, count); restart: io = vvp_env_thread_io(env); ll_io_init(io, file, iot); - if (args->via_io_subtype == IO_NORMAL) { - io->u.ci_rw.rw_iter = *args->u.normal.via_iter; - io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb; - } else { - io->ci_pio = 0; - } + io->ci_ndelay_tried = retried; - if (cl_io_rw_init(env, io, iot, pos, count) == 0) { + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { bool range_locked = false; if (file->f_flags & O_APPEND) range_lock_init(&range, 0, LUSTRE_EOF); else - range_lock_init(&range, pos, pos + count - 1); + range_lock_init(&range, *ppos, *ppos + count - 1); vio->vui_fd = LUSTRE_FPRIVATE(file); vio->vui_io_subtype = args->via_io_subtype; switch (vio->vui_io_subtype) { case IO_NORMAL: + vio->vui_iter = args->u.normal.via_iter; + vio->vui_iocb = args->u.normal.via_iocb; /* Direct IO reads must also take range lock, * or multiple reads will try to work on the same pages * See LU-6227 for details. */ @@ -1266,16 +1528,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, } ll_cl_add(file, env, io, LCC_RW); - if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) && - !lli->lli_inode_locked) { - inode_lock(inode); - lli->lli_inode_locked = 1; - } rc = cl_io_loop(env, io); - if (lli->lli_inode_locked) { - lli->lli_inode_locked = 0; - inode_unlock(inode); - } ll_cl_remove(file, env); if (range_locked) { @@ -1291,38 +1544,29 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, if (io->ci_nob > 0) { result += io->ci_nob; count -= io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */ - if (args->via_io_subtype == IO_NORMAL) { - iov_iter_advance(args->u.normal.via_iter, io->ci_nob); - - /* CLIO is too complicated. See LU-11069. */ - if (cl_io_is_append(io)) - pos = io->u.ci_rw.rw_iocb.ki_pos; - else - pos += io->ci_nob; - - args->u.normal.via_iocb->ki_pos = pos; - if (io->ci_pio) { -#ifdef HAVE_KIOCB_KI_LEFT - args->u.normal.via_iocb->ki_left = count; -#elif defined(HAVE_KI_NBYTES) - args->u.normal.via_iocb->ki_nbytes = count; -#endif - } - } else { - /* for splice */ - pos = io->u.ci_rw.rw_range.cir_pos; - } + /* prepare IO restart */ + if (count > 0 && args->via_io_subtype == IO_NORMAL) + args->u.normal.via_iter = vio->vui_iter; } out: cl_io_fini(env, io); + CDEBUG(D_VFSTRACE, + "%s: %d io complete with rc: %d, result: %zd, restart: %d\n", + file->f_path.dentry->d_name.name, + iot, rc, result, io->ci_need_restart); + if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) { CDEBUG(D_VFSTRACE, - "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", - pos, pos + count, result, rc); + "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + *ppos, count, result, rc); + /* preserve the tried count for FLR */ + retried = io->ci_ndelay_tried; + restarted = true; goto restart; } @@ -1346,11 +1590,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, } } - CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc); - - *ppos = pos; + CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); RETURN(result > 0 ? result : rc); } @@ -1391,8 +1631,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, * \retval - number of bytes have been read, or error code if error occurred. 
*/ static ssize_t -ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb, - struct iov_iter *iter) +ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result; @@ -1404,9 +1643,7 @@ ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb, if (iocb->ki_filp->f_flags & O_DIRECT) return 0; - ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW); result = generic_file_read_iter(iocb, iter); - ll_cl_remove(iocb->ki_filp, env); /* If the first page is not in cache, generic_file_aio_read() will be * returned with -ENODATA. @@ -1428,34 +1665,101 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct lu_env *env; struct vvp_io_args *args; + struct file *file = iocb->ki_filp; ssize_t result; ssize_t rc2; __u16 refcheck; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); + if (!iov_iter_count(to)) + return 0; - result = ll_do_fast_read(env, iocb, to); + result = ll_do_fast_read(iocb, to); if (result < 0 || iov_iter_count(to) == 0) GOTO(out, result); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + args = ll_env_args(env, IO_NORMAL); args->u.normal.via_iter = to; args->u.normal.via_iocb = iocb; - rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + rc2 = ll_file_io_generic(env, args, file, CIT_READ, &iocb->ki_pos, iov_iter_count(to)); if (rc2 > 0) result += rc2; else if (result == 0) result = rc2; -out: cl_env_put(env, &refcheck); +out: + if (result > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + LUSTRE_FPRIVATE(file), iocb->ki_pos, result, + READ); + return result; } +/** + * Similar trick to ll_do_fast_read, this improves write speed for tiny writes. + * If a page is already in the page cache and dirty (and some other things - + * See ll_tiny_write_begin for the instantiation of these rules), then we can + * write to it without doing a full I/O, because Lustre already knows about it + * and will write it out. This saves a lot of processing time. + * + * All writes here are within one page, so exclusion is handled by the page + * lock on the vm page. We do not do tiny writes for writes which touch + * multiple pages because it's very unlikely multiple sequential pages + * are already dirty. + * + * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common + * and are unlikely to be to already dirty pages. + * + * Attribute updates are important here, we do them in ll_tiny_write_end. + */ +static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t count = iov_iter_count(iter); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + bool lock_inode = !IS_NOSEC(inode); + ssize_t result = 0; + + ENTRY; + + /* Restrict writes to single page and < PAGE_SIZE. See comment at top + * of function for why. + */ + if (count >= PAGE_SIZE || + (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) + RETURN(0); + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(iocb, iter); + + if (unlikely(lock_inode)) + inode_unlock(inode); + + /* If the page is not already dirty, ll_tiny_write_begin returns + * -ENODATA. We continue on to normal write. + */ + if (result == -ENODATA) + result = 0; + + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES, + result); + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + } + + CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); + + RETURN(result); +} + /* * Write to a file (through the page cache). 
*/ @@ -1463,21 +1767,57 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct vvp_io_args *args; struct lu_env *env; - ssize_t result; + ssize_t rc_tiny = 0, rc_normal; + struct file *file = iocb->ki_filp; __u16 refcheck; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); + ENTRY; + + if (!iov_iter_count(from)) + GOTO(out, rc_normal = 0); + + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, we can't do sync writes because tiny writes can't flush + * pages, and we can't do append writes because we can't guarantee the + * required DLM locks are held to protect file size. + */ + if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) && + !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) + rc_tiny = ll_do_tiny_write(iocb, from); + + /* In case of error, go on and try normal write - Only stop if tiny + * write completed I/O. + */ + if (iov_iter_count(from) == 0) + GOTO(out, rc_normal = rc_tiny); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); args = ll_env_args(env, IO_NORMAL); args->u.normal.via_iter = from; args->u.normal.via_iocb = iocb; - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, - &iocb->ki_pos, iov_iter_count(from)); + rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE, + &iocb->ki_pos, iov_iter_count(from)); + + /* On success, combine bytes written. */ + if (rc_tiny >= 0 && rc_normal > 0) + rc_normal += rc_tiny; + /* On error, only return error from normal write if tiny write did not + * write any bytes. Otherwise return bytes written by tiny write. + */ + else if (rc_tiny > 0) + rc_normal = rc_tiny; + cl_env_put(env, &refcheck); - return result; +out: + if (rc_normal > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid, + LUSTRE_FPRIVATE(file), iocb->ki_pos, + rc_normal, WRITE); + RETURN(rc_normal); } #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER @@ -1524,6 +1864,9 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (result) RETURN(result); + if (!iov_count) + RETURN(0); + # ifdef HAVE_IOV_ITER_INIT_DIRECTION iov_iter_init(&to, READ, iov, nr_segs, iov_count); # else /* !HAVE_IOV_ITER_INIT_DIRECTION */ @@ -1538,30 +1881,26 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct lu_env *env; struct iovec iov = { .iov_base = buf, .iov_len = count }; - struct kiocb *kiocb; + struct kiocb kiocb; ssize_t result; - __u16 refcheck; + ENTRY; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); + if (!count) + RETURN(0); - kiocb = &ll_env_info(env)->lti_kiocb; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb->ki_left = count; + kiocb.ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb->ki_nbytes = count; + kiocb.ki_nbytes = count; #endif - result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; + result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; - cl_env_put(env, &refcheck); RETURN(result); } @@ -1581,6 +1920,9 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (result) RETURN(result); + if (!iov_count) + RETURN(0); + # ifdef HAVE_IOV_ITER_INIT_DIRECTION iov_iter_init(&from, WRITE, iov, nr_segs, iov_count); # else /* !HAVE_IOV_ITER_INIT_DIRECTION */ @@ -1595,31 +1937,27 @@ 
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct lu_env *env; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - struct kiocb *kiocb; - ssize_t result; - __u16 refcheck; - ENTRY; + struct kiocb kiocb; + ssize_t result; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); + ENTRY; - kiocb = &ll_env_info(env)->lti_kiocb; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; + if (!count) + RETURN(0); + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb->ki_left = count; + kiocb.ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb->ki_nbytes = count; + kiocb.ki_nbytes = count; #endif - result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; + result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; - cl_env_put(env, &refcheck); RETURN(result); } #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ @@ -1647,6 +1985,11 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); cl_env_put(env, &refcheck); + + if (result > 0) + ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid, + LUSTRE_FPRIVATE(in_file), *ppos, result, + READ); RETURN(result); } @@ -1660,6 +2003,12 @@ int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, int rc; ENTRY; + if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) == + le32_to_cpu(LOV_MAGIC_MAGIC)) { + /* this code will only exist for big-endian systems */ + lustre_swab_lov_user_md(lum, 0); + } + ll_inode_size_lock(inode); rc = ll_intent_file_open(dentry, lum, lum_size, &oit); if (rc < 0) @@ -1722,13 +2071,14 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1)) GOTO(out, rc = -EPROTO); - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. - */ - if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { - int stripe_count; + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. 
+ */ + if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) == + __swab32(LOV_MAGIC_MAGIC)) { + int stripe_count = 0; if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { @@ -1738,27 +2088,19 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, stripe_count = 0; } - /* if function called for directory - we should - * avoid swab not existent lsm objects */ - if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { - lustre_swab_lov_user_md_v1( - (struct lov_user_md_v1 *)lmm); - if (S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v1 *)lmm)->lmm_objects, - stripe_count); - } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { - lustre_swab_lov_user_md_v3( - (struct lov_user_md_v3 *)lmm); - if (S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v3 *)lmm)->lmm_objects, - stripe_count); - } else if (lmm->lmm_magic == - cpu_to_le32(LOV_MAGIC_COMP_V1)) { - lustre_swab_lov_comp_md_v1( - (struct lov_comp_md_v1 *)lmm); - } + lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0); + + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + stripe_count); + else if (lmm->lmm_magic == LOV_MAGIC_V3 && + S_ISREG(body->mbo_mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + stripe_count); } out: @@ -1845,7 +2187,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, cl_lov_delay_create_clear(&file->f_flags); out: - OBD_FREE(klum, lum_size); + OBD_FREE_LARGE(klum, lum_size); RETURN(rc); } @@ -1888,6 +2230,10 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) struct cl_layout cl = { .cl_is_composite = false, }; + struct lu_extent ext = { + .e_start = 0, + .e_end = OBD_OBJECT_EOF, + }; env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -1895,7 +2241,8 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) rc = cl_object_layout_get(env, obj, &cl); if (!rc && cl.cl_is_composite) - rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF); + rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, + &ext); cl_env_put(env, &refcheck); if (rc) @@ -1989,7 +2336,9 @@ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) if (!och) GOTO(out, rc = -ENOMEM); - ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); + if (rc) + GOTO(out, rc); rc = ll_close_inode_openhandle(inode, och, 0, NULL); out: @@ -2105,18 +2454,8 @@ int ll_fid2path(struct inode *inode, void __user *arg) RETURN(rc); } -/* - * Read the data_version for inode. - * - * This value is computed using stripe object version on OST. - * Version is computed using server side locking. 
- * - * @param flags if do sync on the OST side; - * 0: no sync - * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs - * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs - */ -int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +static int +ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc) { struct cl_object *obj = ll_i2info(inode)->lli_clob; struct lu_env *env; @@ -2126,11 +2465,12 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) ENTRY; + ioc->idv_version = 0; + ioc->idv_layout_version = UINT_MAX; + /* If no file object initialized, we consider its version is 0. */ - if (obj == NULL) { - *data_version = 0; + if (obj == NULL) RETURN(0); - } env = cl_env_get(&refcheck); if (IS_ERR(env)) @@ -2139,7 +2479,8 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) io = vvp_env_thread_io(env); io->ci_obj = obj; io->u.ci_data_version.dv_data_version = 0; - io->u.ci_data_version.dv_flags = flags; + io->u.ci_data_version.dv_layout_version = UINT_MAX; + io->u.ci_data_version.dv_flags = ioc->idv_flags; restart: if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0) @@ -2147,7 +2488,8 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) else result = io->ci_result; - *data_version = io->u.ci_data_version.dv_data_version; + ioc->idv_version = io->u.ci_data_version.dv_data_version; + ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version; cl_io_fini(env, io); @@ -2159,6 +2501,29 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags) RETURN(result); } +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param flags if do sync on the OST side; + * 0: no sync + * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs + * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs + */ +int ll_data_version(struct inode *inode, __u64 *data_version, int flags) +{ + struct ioc_data_version ioc = { .idv_flags = flags }; + int rc; + + rc = ll_ioc_data_version(inode, &ioc); + if (!rc) + *data_version = ioc.idv_version; + + return rc; +} + /* * Trigger a HSM release request for the provided inode. */ @@ -2175,7 +2540,20 @@ int ll_hsm_release(struct inode *inode) ll_get_fsname(inode->i_sb, NULL, 0), PFID(&ll_i2info(inode)->lli_fid)); - och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); + /* + * For directory, this is not the right + * way to do the release. Ideally this should clean + * up the directory without triggering update to the backend. + * Right now, this just sets the RELEASED bit for the + * directory. This is left as is so as to have a way to set + * the RELEASED bit as a deug/recovery method + * instead of doing a rm on the directory. 
+ * TODO-MDLL: Tracking SIM - Simba-21969 + */ + if (S_ISDIR(inode->i_mode)) + och = ll_lease_open(inode, NULL, FMODE_READ, MDS_OPEN_RELEASE); + else + och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); if (IS_ERR(och)) GOTO(out, rc = PTR_ERR(och)); @@ -2184,12 +2562,21 @@ int ll_hsm_release(struct inode *inode) if (rc != 0) GOTO(out, rc); - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - GOTO(out, rc = PTR_ERR(env)); + /* Don't need to merge these attrs for directories */ + if (!S_ISDIR(inode->i_mode)) { + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, rc = PTR_ERR(env)); - ll_merge_attr(env, inode); - cl_env_put(env, &refcheck); + rc = ll_merge_attr(env, inode); + cl_env_put(env, &refcheck); + + /* If error happen, we have the wrong size for a file. + * Don't release it. + */ + if (rc != 0) + GOTO(out, rc); + } /* Release the file. * NB: lease lock handle is released in mdc_hsm_release_pack() because @@ -2323,8 +2710,9 @@ static int ll_swap_layouts(struct file *file1, struct file *file2, int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) { - struct md_op_data *op_data; - int rc; + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + int rc; ENTRY; /* Detect out-of range masks */ @@ -2337,33 +2725,35 @@ int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) !cfs_capable(CFS_CAP_SYS_ADMIN)) RETURN(-EPERM); - /* Detect out-of range archive id */ - if ((hss->hss_valid & HSS_ARCHIVE_ID) && - (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE)) - RETURN(-EINVAL); + if (!exp_connect_archive_id_array(exp)) { + /* Detect out-of range archive id */ + if ((hss->hss_valid & HSS_ARCHIVE_ID) && + (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE)) + RETURN(-EINVAL); + } op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, LUSTRE_OPC_ANY, hss); if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); - rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); + rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data), + op_data, NULL); ll_finish_md_op_data(op_data); RETURN(rc); } -static int ll_hsm_import(struct inode *inode, struct file *file, - struct hsm_user_import *hui) +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui) { struct hsm_state_set *hss = NULL; struct iattr *attr = NULL; int rc; ENTRY; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) RETURN(-EINVAL); /* set HSM flags */ @@ -2383,7 +2773,12 @@ static int ll_hsm_import(struct inode *inode, struct file *file, GOTO(out, rc = -ENOMEM); attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO); - attr->ia_mode |= S_IFREG; + + if (S_ISDIR(inode->i_mode)) + attr->ia_mode |= S_IFDIR; + else + attr->ia_mode |= S_IFREG; + attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); attr->ia_size = hui->hui_size; @@ -2397,13 +2792,23 @@ static int ll_hsm_import(struct inode *inode, struct file *file, ATTR_MTIME | ATTR_MTIME_SET | ATTR_ATIME | ATTR_ATIME_SET; - inode_lock(inode); + /* + * TODO-MDLL check if this needs to be done here + * or in ll_setattr_raw(). The ll_setattr_raw does a + * unlock() before it calls the ll_md_setattr() for + * regular files using S_ISREG(). Calling this for + * inodes other than files might result in a deadlock. + * Tracked with Simba-20393. 
+ */ + if (S_ISREG(inode->i_mode)) + inode_lock(inode); - rc = ll_setattr_raw(file_dentry(file), attr, true); + rc = ll_setattr_raw(file_dentry(file), attr, 0, true); if (rc == -ENODATA) rc = 0; - inode_unlock(inode); + if (S_ISREG(inode->i_mode)) + inode_unlock(inode); out: if (hss != NULL) @@ -2427,7 +2832,7 @@ static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET, + ATTR_CTIME, .ia_atime = { .tv_sec = lfu->lfu_atime_sec, .tv_nsec = lfu->lfu_atime_nsec, @@ -2451,12 +2856,197 @@ static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu) RETURN(-EINVAL); inode_lock(inode); - rc = ll_setattr_raw(file_dentry(file), &ia, false); + rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET, + false); inode_unlock(inode); RETURN(rc); } +static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode) +{ + switch (mode) { + case MODE_READ_USER: + return CLM_READ; + case MODE_WRITE_USER: + return CLM_WRITE; + default: + return -EINVAL; + } +} + +static const char *const user_lockname[] = LOCK_MODE_NAMES; + +/* Used to allow the upper layers of the client to request an LDLM lock + * without doing an actual read or write. + * + * Used for ladvise lockahead to manually request specific locks. + * + * \param[in] file file this ladvise lock request is on + * \param[in] ladvise ladvise struct describing this lock request + * + * \retval 0 success, no detailed result available (sync requests + * and requests sent to the server [not handled locally] + * cannot return detailed results) + * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request, + * see definitions for details. + * \retval negative negative errno on error + */ +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct cl_lock *lock = NULL; + struct cl_lock_descr *descr = NULL; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + enum cl_lock_mode cl_mode; + off_t start = ladvise->lla_start; + off_t end = ladvise->lla_end; + int result; + __u16 refcheck; + + ENTRY; + + CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s " + "start=%llu, end=%llu\n", dentry->d_name.len, + dentry->d_name.name, dentry->d_inode, + user_lockname[ladvise->lla_lockahead_mode], (__u64) start, + (__u64) end); + + cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode); + if (cl_mode < 0) + GOTO(out, result = cl_mode); + + /* Get IO environment */ + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + GOTO(out, result); + + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens when + * stripe sub-object's are not yet created. 
+ */ + result = io->ci_result; + } else if (result == 0) { + lock = vvp_env_lock(env); + descr = &lock->cll_descr; + + descr->cld_obj = io->ci_obj; + /* Convert byte offsets to pages */ + descr->cld_start = cl_index(io->ci_obj, start); + descr->cld_end = cl_index(io->ci_obj, end); + descr->cld_mode = cl_mode; + /* CEF_MUST is used because we do not want to convert a + * lockahead request to a lockless lock */ + descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND | + CEF_NONBLOCK; + + if (ladvise->lla_peradvice_flags & LF_ASYNC) + descr->cld_enq_flags |= CEF_SPECULATIVE; + + result = cl_lock_request(env, io, lock); + + /* On success, we need to release the lock */ + if (result >= 0) + cl_lock_release(env, lock); + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + + /* -ECANCELED indicates a matching lock with a different extent + * was already present, and -EEXIST indicates a matching lock + * on exactly the same extent was already present. + * We convert them to positive values for userspace to make + * recognizing true errors easier. + * Note we can only return these detailed results on async requests, + * as sync requests look the same as i/o requests for locking. */ + if (result == -ECANCELED) + result = LLA_RESULT_DIFFERENT; + else if (result == -EEXIST) + result = LLA_RESULT_SAME; + +out: + RETURN(result); +} +static const char *const ladvise_names[] = LU_LADVISE_NAMES; + +static int ll_ladvise_sanity(struct inode *inode, + struct llapi_lu_ladvise *ladvise) +{ + enum lu_ladvise_type advice = ladvise->lla_advice; + /* Note the peradvice flags is a 32 bit field, so per advice flags must + * be in the first 32 bits of enum ladvise_flags */ + __u32 flags = ladvise->lla_peradvice_flags; + /* 3 lines at 80 characters per line, should be plenty */ + int rc = 0; + + if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized," + "last supported advice is %s (value '%d'): rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), advice, + ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc); + GOTO(out, rc); + } + + /* Per-advice checks */ + switch (advice) { + case LU_LADVISE_LOCKNOEXPAND: + if (flags & ~LF_LOCKNOEXPAND_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + case LU_LADVISE_LOCKAHEAD: + /* Currently only READ and WRITE modes can be requested */ + if (ladvise->lla_lockahead_mode >= MODE_MAX_USER || + ladvise->lla_lockahead_mode == 0) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + ladvise->lla_lockahead_mode, + ladvise_names[advice], rc); + GOTO(out, rc); + } + fallthrough; + case LU_LADVISE_WILLREAD: + case LU_LADVISE_DONTNEED: + default: + /* Note fall through above - These checks apply to all advices + * except LOCKNOEXPAND */ + if (flags & ~LF_DEFAULT_MASK) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: " + "rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), flags, + ladvise_names[advice], rc); + GOTO(out, rc); + } + if (ladvise->lla_start >= ladvise->lla_end) { + rc = -EINVAL; + CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) " + "for %s: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + ladvise->lla_start, ladvise->lla_end, + ladvise_names[advice], rc); + GOTO(out, rc); + } + break; + } + +out: + return rc; +} +#undef ERRSIZE + /* * 
Give file access advices * @@ -2506,6 +3096,15 @@ static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags, RETURN(rc); } +static int ll_lock_noexpand(struct file *file, int flags) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + fd->ll_lock_no_expand = !(flags & LF_UNSET); + + return 0; +} + int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg) { @@ -2516,7 +3115,9 @@ int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, sizeof(fsxattr))) RETURN(-EFAULT); - fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags); + fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags); + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) + fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT; fsxattr.fsx_projid = ll_i2info(inode)->lli_projid; if (copy_to_user((struct fsxattr __user *)arg, &fsxattr, sizeof(fsxattr))) @@ -2525,55 +3126,276 @@ int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, RETURN(0); } -int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, - unsigned long arg) +int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (ll_i2info(inode)->lli_projid != fa->fsx_projid) + return -EINVAL; + + if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + +int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, + unsigned long arg) +{ + + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct fsxattr fsxattr; + struct cl_object *obj; + struct iattr *attr; + int flags; + + if (copy_from_user(&fsxattr, + (const struct fsxattr __user *)arg, + sizeof(fsxattr))) + RETURN(-EFAULT); + + rc = ll_ioctl_check_project(inode, &fsxattr); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags); + op_data->op_attr_flags = ll_inode_to_ext_flags(flags); + if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; + op_data->op_projid = fsxattr.fsx_projid; + op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, + 0, &req); + ptlrpc_req_finished(req); + if (rc) + GOTO(out_fsxattr, rc); + ll_update_inode_flags(inode, op_data->op_attr_flags); + obj = ll_i2info(inode)->lli_clob; + if (obj == NULL) + GOTO(out_fsxattr, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(out_fsxattr, rc = -ENOMEM); + + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, + fsxattr.fsx_xflags); + OBD_FREE_PTR(attr); +out_fsxattr: + ll_finish_md_op_data(op_data); + RETURN(rc); +} + +static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle *och = NULL; + struct split_param sp; + bool lease_broken; + fmode_t fmode = 0; + enum mds_op_bias bias = 0; + struct file *layout_file 
= NULL; + void *data = NULL; + size_t data_size = 0; + long rc; + ENTRY; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och != NULL) { + och = fd->fd_lease_och; + fd->fd_lease_och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + + if (och == NULL) + GOTO(out, rc = -ENOLCK); + + fmode = och->och_flags; + + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (ioc->lil_count > IOC_IDS_MAX) + GOTO(out, rc = -EINVAL); + + data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]); + OBD_ALLOC(data, data_size); + if (!data) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(data, (void __user *)arg, data_size)) + GOTO(out, rc = -EFAULT); + + bias = MDS_CLOSE_RESYNC_DONE; + break; + case LL_LEASE_LAYOUT_MERGE: { + int fd; + + if (ioc->lil_count != 1) + GOTO(out, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + layout_file = fget(fd); + if (!layout_file) + GOTO(out, rc = -EBADF); + + if ((file->f_flags & O_ACCMODE) == O_RDONLY || + (layout_file->f_flags & O_ACCMODE) == O_RDONLY) + GOTO(out, rc = -EPERM); + + data = file_inode(layout_file); + bias = MDS_CLOSE_LAYOUT_MERGE; + break; + } + case LL_LEASE_LAYOUT_SPLIT: { + int fdv; + int mirror_id; + + if (ioc->lil_count != 2) + GOTO(out, rc = -EINVAL); + + arg += sizeof(*ioc); + if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + arg += sizeof(__u32); + if (copy_from_user(&mirror_id, (void __user *)arg, + sizeof(__u32))) + GOTO(out, rc = -EFAULT); + + layout_file = fget(fdv); + if (!layout_file) + GOTO(out, rc = -EBADF); + + sp.sp_inode = file_inode(layout_file); + sp.sp_mirror_id = (__u16)mirror_id; + data = &sp; + bias = MDS_CLOSE_LAYOUT_SPLIT; + break; + } + default: + /* without close intent */ + break; + } + + rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data); + if (rc < 0) + GOTO(out, rc); + + rc = ll_lease_och_release(inode, file); + if (rc < 0) + GOTO(out, rc); + + if (lease_broken) + fmode = 0; + EXIT; + +out: + switch (ioc->lil_flags) { + case LL_LEASE_RESYNC_DONE: + if (data) + OBD_FREE(data, data_size); + break; + case LL_LEASE_LAYOUT_MERGE: + case LL_LEASE_LAYOUT_SPLIT: + if (layout_file) + fput(layout_file); + break; + } + + if (!rc) + rc = ll_lease_type_from_fmode(fmode); + RETURN(rc); +} + +static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc, + unsigned long arg) { + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct obd_client_handle *och = NULL; + __u64 open_flags = 0; + bool lease_broken; + fmode_t fmode; + long rc; + ENTRY; - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; - struct fsxattr fsxattr; - struct cl_object *obj; - - /* only root could change project ID */ - if (!cfs_capable(CFS_CAP_SYS_ADMIN)) - RETURN(-EPERM); - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); - - if (copy_from_user(&fsxattr, - (const struct fsxattr __user *)arg, - sizeof(fsxattr))) - GOTO(out_fsxattr1, rc = -EFAULT); + switch (ioc->lil_mode) { + case LL_LEASE_WRLCK: + if (!(file->f_mode & FMODE_WRITE)) + RETURN(-EPERM); + fmode = FMODE_WRITE; + break; + case LL_LEASE_RDLCK: + if (!(file->f_mode & FMODE_READ)) + RETURN(-EPERM); + fmode = FMODE_READ; + break; + case LL_LEASE_UNLCK: + RETURN(ll_file_unlock_lease(file, ioc, arg)); + default: + RETURN(-EINVAL); + } - 
op_data->op_attr_flags = fsxattr.fsx_xflags; - op_data->op_projid = fsxattr.fsx_projid; - op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG); - rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, - 0, &req); - ptlrpc_req_finished(req); + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); - obj = ll_i2info(inode)->lli_clob; - if (obj) { - struct iattr *attr; + /* apply for lease */ + if (ioc->lil_flags & LL_LEASE_RESYNC) + open_flags = MDS_OPEN_RESYNC; + och = ll_lease_open(inode, file, fmode, open_flags); + if (IS_ERR(och)) + RETURN(PTR_ERR(och)); - inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags); - OBD_ALLOC_PTR(attr); - if (attr == NULL) - GOTO(out_fsxattr1, rc = -ENOMEM); - attr->ia_valid = ATTR_ATTR_FLAG; - rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags); + if (ioc->lil_flags & LL_LEASE_RESYNC) { + rc = ll_lease_file_resync(och, inode, arg); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + rc = ll_layout_refresh(inode, &fd->fd_layout_version); + if (rc) { + ll_lease_close(och, inode, NULL); + RETURN(rc); + } + } - OBD_FREE_PTR(attr); + rc = 0; + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_lease_och == NULL) { + fd->fd_lease_och = och; + och = NULL; + } + mutex_unlock(&lli->lli_och_mutex); + if (och != NULL) { + /* impossible now that only excl is supported for now */ + ll_lease_close(och, inode, &lease_broken); + rc = -EBUSY; } -out_fsxattr1: - ll_finish_md_op_data(op_data); RETURN(rc); - - } static long @@ -2586,15 +3408,15 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n", PFID(ll_inode2fid(inode)), inode, cmd); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - RETURN(-ENOTTY); + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); - switch(cmd) { - case LL_IOC_GETFLAGS: - /* Get the current value of the file flags */ + switch (cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ return put_user(fd->fd_flags, (int __user *)arg); case LL_IOC_SETFLAGS: case LL_IOC_CLRFLAGS: @@ -2647,9 +3469,6 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ll_inode_info *lli; struct obd_client_handle *och = NULL; - if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) - GOTO(out, rc = -EINVAL); - lli = ll_i2info(inode); mutex_lock(&lli->lli_och_mutex); if (fd->fd_lease_och != NULL) { @@ -2671,12 +3490,18 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_LOV_GETSTRIPE: case LL_IOC_LOV_GETSTRIPE_NEW: RETURN(ll_file_getstripe(inode, (void __user *)arg, 0)); - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - RETURN(ll_iocontrol(inode, file, cmd, arg)); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION: + case FS_IOC_GETVERSION: RETURN(put_user(inode->i_generation, (int __user *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. 
*/ + case FS_IOC_SETVERSION: + RETURN(-ENOTSUPP); + case LL_IOC_GROUP_LOCK: RETURN(ll_get_grouplock(inode, file, arg)); case LL_IOC_GROUP_UNLOCK: @@ -2684,12 +3509,6 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case IOC_OBD_STATFS: RETURN(ll_obd_statfs(inode, (void __user *)arg)); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. - case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ case LL_IOC_FLUSHCTX: RETURN(ll_flush_ctx(inode)); case LL_IOC_PATH2FID: { @@ -2712,7 +3531,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(-EFAULT); idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; - rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags); + rc = ll_ioc_data_version(inode, &idv); if (rc == 0 && copy_to_user((char __user *)arg, &idv, sizeof(idv))) @@ -2806,71 +3625,18 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) OBD_FREE_PTR(hca); RETURN(rc); } - case LL_IOC_SET_LEASE: { - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle *och = NULL; - bool lease_broken; - fmode_t fmode; - - switch (arg) { - case LL_LEASE_WRLCK: - if (!(file->f_mode & FMODE_WRITE)) - RETURN(-EPERM); - fmode = FMODE_WRITE; - break; - case LL_LEASE_RDLCK: - if (!(file->f_mode & FMODE_READ)) - RETURN(-EPERM); - fmode = FMODE_READ; - break; - case LL_LEASE_UNLCK: - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och != NULL) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - - if (och == NULL) - RETURN(-ENOLCK); - - fmode = och->och_flags; - rc = ll_lease_close(och, inode, &lease_broken); - if (rc < 0) - RETURN(rc); - - rc = ll_lease_och_release(inode, file); - if (rc < 0) - RETURN(rc); - - if (lease_broken) - fmode = 0; - - RETURN(ll_lease_type_from_fmode(fmode)); - default: - RETURN(-EINVAL); - } + case LL_IOC_SET_LEASE_OLD: { + struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg }; - CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); + RETURN(ll_file_set_lease(file, &ioc, 0)); + } + case LL_IOC_SET_LEASE: { + struct ll_ioc_lease ioc; - /* apply for lease */ - och = ll_lease_open(inode, file, fmode, 0); - if (IS_ERR(och)) - RETURN(PTR_ERR(och)); + if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc))) + RETURN(-EFAULT); - rc = 0; - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och == NULL) { - fd->fd_lease_och = och; - och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (och != NULL) { - /* impossible now that only excl is supported for now */ - ll_lease_close(och, inode, &lease_broken); - rc = -EBUSY; - } - RETURN(rc); + RETURN(ll_file_set_lease(file, &ioc, arg)); } case LL_IOC_GET_LEASE: { struct ll_inode_info *lli = ll_i2info(inode); @@ -2923,55 +3689,92 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) RETURN(ll_file_futimes_3(file, &lfu)); } case LL_IOC_LADVISE: { - struct llapi_ladvise_hdr *ladvise_hdr; + struct llapi_ladvise_hdr *k_ladvise_hdr; + struct llapi_ladvise_hdr __user *u_ladvise_hdr; int i; int num_advise; - int alloc_size = sizeof(*ladvise_hdr); + int alloc_size = sizeof(*k_ladvise_hdr); rc = 0; - OBD_ALLOC_PTR(ladvise_hdr); - if (ladvise_hdr == NULL) + u_ladvise_hdr = (void __user *)arg; + OBD_ALLOC_PTR(k_ladvise_hdr); + if (k_ladvise_hdr == NULL) RETURN(-ENOMEM); - if (copy_from_user(ladvise_hdr, - (const struct llapi_ladvise_hdr __user *)arg, - alloc_size)) + if 
(copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) GOTO(out_ladvise, rc = -EFAULT); - if (ladvise_hdr->lah_magic != LADVISE_MAGIC || - ladvise_hdr->lah_count < 1) + if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC || + k_ladvise_hdr->lah_count < 1) GOTO(out_ladvise, rc = -EINVAL); - num_advise = ladvise_hdr->lah_count; + num_advise = k_ladvise_hdr->lah_count; if (num_advise >= LAH_COUNT_MAX) GOTO(out_ladvise, rc = -EFBIG); - OBD_FREE_PTR(ladvise_hdr); - alloc_size = offsetof(typeof(*ladvise_hdr), + OBD_FREE_PTR(k_ladvise_hdr); + alloc_size = offsetof(typeof(*k_ladvise_hdr), lah_advise[num_advise]); - OBD_ALLOC(ladvise_hdr, alloc_size); - if (ladvise_hdr == NULL) + OBD_ALLOC(k_ladvise_hdr, alloc_size); + if (k_ladvise_hdr == NULL) RETURN(-ENOMEM); /* * TODO: submit multiple advices to one server in a single RPC */ - if (copy_from_user(ladvise_hdr, - (const struct llapi_ladvise_hdr __user *)arg, - alloc_size)) + if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size)) GOTO(out_ladvise, rc = -EFAULT); for (i = 0; i < num_advise; i++) { - rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags, - &ladvise_hdr->lah_advise[i]); + struct llapi_lu_ladvise *k_ladvise = + &k_ladvise_hdr->lah_advise[i]; + struct llapi_lu_ladvise __user *u_ladvise = + &u_ladvise_hdr->lah_advise[i]; + + rc = ll_ladvise_sanity(inode, k_ladvise); if (rc) + GOTO(out_ladvise, rc); + + switch (k_ladvise->lla_advice) { + case LU_LADVISE_LOCKNOEXPAND: + rc = ll_lock_noexpand(file, + k_ladvise->lla_peradvice_flags); + GOTO(out_ladvise, rc); + case LU_LADVISE_LOCKAHEAD: + + rc = ll_file_lock_ahead(file, k_ladvise); + + if (rc < 0) + GOTO(out_ladvise, rc); + + if (put_user(rc, + &u_ladvise->lla_lockahead_result)) + GOTO(out_ladvise, rc = -EFAULT); + break; + default: + rc = ll_ladvise(inode, file, + k_ladvise_hdr->lah_flags, + k_ladvise); + if (rc) + GOTO(out_ladvise, rc); break; + } + } out_ladvise: - OBD_FREE(ladvise_hdr, alloc_size); + OBD_FREE(k_ladvise_hdr, alloc_size); RETURN(rc); } + case LL_IOC_FLR_SET_MIRROR: { + /* mirror I/O must be direct to avoid polluting page cache + * by stale data. 
*/ + if (!(file->f_flags & O_DIRECT)) + RETURN(-EINVAL); + + fd->fd_designated_mirror = (__u32)arg; + RETURN(0); + } case LL_IOC_FSGETXATTR: RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); case LL_IOC_FSSETXATTR: @@ -3160,7 +3963,6 @@ int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); - bool lock_inode; #elif defined(HAVE_FILE_FSYNC_2ARGS) int ll_fsync(struct file *file, int datasync) { @@ -3185,9 +3987,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) #ifdef HAVE_FILE_FSYNC_4ARGS rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - lock_inode = !lli->lli_inode_locked; - if (lock_inode) - inode_lock(inode); + inode_lock(inode); #else /* fsync's caller has already called _fdata{sync,write}, we want * that IO to finish before calling the osc and mdc sync methods */ @@ -3227,8 +4027,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) } #ifdef HAVE_FILE_FSYNC_4ARGS - if (lock_inode) - inode_unlock(inode); + inode_unlock(inode); #endif RETURN(rc); } @@ -3412,48 +4211,61 @@ int ll_get_fid_by_name(struct inode *parent, const char *name, RETURN(rc); } -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen) +int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, + const char *name) { - struct dentry *dchild = NULL; - struct inode *child_inode = NULL; - struct md_op_data *op_data; + struct dentry *dchild = NULL; + struct inode *child_inode = NULL; + struct md_op_data *op_data; struct ptlrpc_request *request = NULL; struct obd_client_handle *och = NULL; - struct qstr qstr; - struct mdt_body *body; - int rc; - __u64 data_version = 0; + struct qstr qstr; + struct mdt_body *body; + __u64 data_version = 0; + size_t namelen = strlen(name); + int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic); + int rc; ENTRY; - CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n", - name, PFID(ll_inode2fid(parent)), mdtidx); + CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n", + PFID(ll_inode2fid(parent)), name, + lum->lum_stripe_offset, lum->lum_stripe_count); - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) && + lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC)) + lustre_swab_lmv_user_md(lum); /* Get child FID first */ qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen); qstr.name = name; qstr.len = namelen; dchild = d_lookup(file_dentry(file), &qstr); - if (dchild != NULL) { - if (dchild->d_inode != NULL) + if (dchild) { + if (dchild->d_inode) child_inode = igrab(dchild->d_inode); dput(dchild); } - if (child_inode == NULL) { - rc = ll_get_fid_by_name(parent, name, namelen, - &op_data->op_fid3, &child_inode); - if (rc != 0) - GOTO(out_free, rc); + if (!child_inode) { + rc = ll_get_fid_by_name(parent, name, namelen, NULL, + &child_inode); + if (rc) + RETURN(rc); } - if (child_inode == NULL) - GOTO(out_free, rc = -EINVAL); + if (!child_inode) + RETURN(-ENOENT); + + if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & + OBD_CONNECT2_DIR_MIGRATE)) { + if (le32_to_cpu(lum->lum_stripe_count) > 1 || + ll_i2info(child_inode)->lli_lsm_md) { + CERROR("%s: MDT doesn't support stripe directory " + "migration!\n", + ll_get_fsname(parent->i_sb, NULL, 0)); + 
GOTO(out_iput, rc = -EOPNOTSUPP); + } + } /* * lfs migrate command needs to be blocked on the client @@ -3463,6 +4275,11 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, if (child_inode == parent->i_sb->s_root->d_inode) GOTO(out_iput, rc = -EINVAL); + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + child_inode->i_mode, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out_iput, rc = PTR_ERR(op_data)); + inode_lock(child_inode); op_data->op_fid3 = *ll_inode2fid(child_inode); if (!fid_is_sane(&op_data->op_fid3)) { @@ -3472,15 +4289,10 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, GOTO(out_unlock, rc = -EINVAL); } - rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); - if (rc < 0) - GOTO(out_unlock, rc); + op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA; + op_data->op_data = lum; + op_data->op_data_size = lumlen; - if (rc == mdtidx) { - CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name, - PFID(&op_data->op_fid3), mdtidx); - GOTO(out_unlock, rc = 0); - } again: if (S_ISREG(child_inode->i_mode)) { och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); @@ -3495,17 +4307,18 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, if (rc != 0) GOTO(out_close, rc); - op_data->op_handle = och->och_fh; - op_data->op_data = och->och_mod; + op_data->op_open_handle = och->och_open_handle; op_data->op_data_version = data_version; op_data->op_lease_handle = och->och_lease_handle; - op_data->op_bias |= MDS_RENAME_MIGRATE; + op_data->op_bias |= MDS_CLOSE_MIGRATE; + + spin_lock(&och->och_mod->mod_open_req->rq_lock); + och->och_mod->mod_open_req->rq_replay = 0; + spin_unlock(&och->och_mod->mod_open_req->rq_lock); } - op_data->op_mds = mdtidx; - op_data->op_cli_flags = CLI_MIGRATE; - rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, - namelen, name, namelen, &request); + rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen, + name, namelen, &request); if (rc == 0) { LASSERT(request != NULL); ll_update_times(request, parent); @@ -3515,12 +4328,11 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, /* If the server does release layout lock, then we cleanup * the client och here, otherwise release it in out_close: */ - if (och != NULL && - body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { + if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { obd_mod_put(och->och_mod); md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; + och->och_open_handle.cookie = DEAD_HANDLE_MAGIC; OBD_FREE_PTR(och); och = NULL; } @@ -3536,16 +4348,15 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx, goto again; out_close: - if (och != NULL) /* close the file */ + if (och) ll_lease_close(och, child_inode, NULL); - if (rc == 0) + if (!rc) clear_nlink(child_inode); out_unlock: inode_unlock(child_inode); + ll_finish_md_op_data(op_data); out_iput: iput(child_inode); -out_free: - ll_finish_md_op_data(op_data); RETURN(rc); } @@ -3586,7 +4397,7 @@ int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode) ldlm_lockname[mode]); flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; - for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { + for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) { policy.l_inodebits.bits = *bits & (1 << i); if (policy.l_inodebits.bits == 0) continue; @@ -3653,105 +4464,81 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) return rc; 
} -static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) +static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op) { - struct inode *inode = dentry->d_inode; - struct ptlrpc_request *req = NULL; - struct obd_export *exp; - int rc = 0; - ENTRY; - - LASSERT(inode != NULL); + struct inode *parent; + struct inode *inode = dentry->d_inode; + struct obd_export *exp = ll_i2mdexp(inode); + struct lookup_intent oit = { + .it_op = op, + }; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + const char *name = NULL; + size_t namelen = 0; + int rc = 0; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n", PFID(ll_inode2fid(inode)), inode, dentry->d_name.name); - exp = ll_i2mdexp(inode); - - /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. - * But under CMD case, it caused some lock issues, should be fixed - * with new CMD ibits lock. See bug 12718 */ - if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { - struct lookup_intent oit = { .it_op = IT_GETATTR }; - struct md_op_data *op_data; - - if (ibits == MDS_INODELOCK_LOOKUP) - oit.it_op = IT_LOOKUP; - - /* Call getattr by fid, so do not provide name at all. */ - op_data = ll_prep_md_op_data(NULL, dentry->d_inode, - dentry->d_inode, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); - - rc = md_intent_lock(exp, op_data, &oit, &req, - &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc < 0) { - rc = ll_inode_revalidate_fini(inode, rc); - GOTO (out, rc); - } - - rc = ll_revalidate_it_finish(req, &oit, dentry); - if (rc != 0) { - ll_intent_release(&oit); - GOTO(out, rc); - } - - /* Unlinked? Unhash dentry, so it is not picked up later by - do_lookup() -> ll_revalidate_it(). We cannot use d_drop - here to preserve get_cwd functionality on 2.6. - Bug 10503 */ - if (!dentry->d_inode->i_nlink) { - ll_lock_dcache(inode); - d_lustre_invalidate(dentry, 0); - ll_unlock_dcache(inode); - } + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) { + parent = dentry->d_parent->d_inode; + name = dentry->d_name.name; + namelen = dentry->d_name.len; + } else { + parent = inode; + } - ll_lookup_finish_locks(&oit, dentry); - } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) { - struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); - u64 valid = OBD_MD_FLGETATTR; - struct md_op_data *op_data; - int ealen = 0; + op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); - if (S_ISREG(inode->i_mode)) { - rc = ll_get_default_mdsize(sbi, &ealen); - if (rc) - RETURN(rc); - valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; - } + /* Call getattr by fid */ + if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) + op_data->op_flags = MF_GETATTR_BY_FID; + rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO(out, rc); + } - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, ealen, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } - op_data->op_valid = valid; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) { - rc = ll_inode_revalidate_fini(inode, rc); - RETURN(rc); - } + /* Unlinked? 
Unhash dentry, so it is not picked up later by + * do_lookup() -> ll_revalidate_it(). We cannot use d_drop + * here to preserve get_cwd functionality on 2.6. + * Bug 10503 */ + if (!dentry->d_inode->i_nlink) { + ll_lock_dcache(inode); + d_lustre_invalidate(dentry, 0); + ll_unlock_dcache(inode); + } - rc = ll_prep_inode(&inode, req, NULL, NULL); - } + ll_lookup_finish_locks(&oit, dentry); out: - ptlrpc_req_finished(req); - return rc; + ptlrpc_req_finished(req); + + return rc; } static int ll_merge_md_attr(struct inode *inode) { + struct ll_inode_info *lli = ll_i2info(inode); struct cl_attr attr = { 0 }; int rc; - LASSERT(ll_i2info(inode)->lli_lsm_md != NULL); + LASSERT(lli->lli_lsm_md != NULL); + down_read(&lli->lli_lsm_sem); rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, &attr, ll_md_blocking_ast); + up_read(&lli->lli_lsm_sem); if (rc != 0) RETURN(rc); @@ -3766,43 +4553,6 @@ static int ll_merge_md_attr(struct inode *inode) RETURN(0); } -static int -ll_inode_revalidate(struct dentry *dentry, __u64 ibits) -{ - struct inode *inode = dentry->d_inode; - int rc; - ENTRY; - - rc = __ll_inode_revalidate(dentry, ibits); - if (rc != 0) - RETURN(rc); - - /* if object isn't regular file, don't validate size */ - if (!S_ISREG(inode->i_mode)) { - if (S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md != NULL) { - rc = ll_merge_md_attr(inode); - if (rc != 0) - RETURN(rc); - } - - inode->i_atime.tv_sec = ll_i2info(inode)->lli_atime; - inode->i_mtime.tv_sec = ll_i2info(inode)->lli_mtime; - inode->i_ctime.tv_sec = ll_i2info(inode)->lli_ctime; - } else { - /* In case of restore, the MDT has the right size and has - * already send it back without granting the layout lock, - * inode is up-to-date so glimpse is useless. - * Also to glimpse we need the layout, in case of a running - * restore the MDT holds the layout lock so the glimpse will - * block up to the end of restore (getattr will block) - */ - if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING)) - rc = ll_glimpse_size(inode); - } - RETURN(rc); -} - static inline dev_t ll_compat_encode_dev(dev_t dev) { /* The compat_sys_*stat*() syscalls will fail unless the @@ -3816,26 +4566,51 @@ static inline dev_t ll_compat_encode_dev(dev_t dev) } #if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODEOPS_ENHANCED_GETATTR) -int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) - +int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *de = path->dentry; #else int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) { #endif - struct inode *inode = de->d_inode; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - int res = 0; + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + + rc = ll_inode_revalidate(de, IT_GETATTR); + if (rc < 0) + RETURN(rc); - res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LOOKUP); - ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + if (S_ISREG(inode->i_mode)) { + /* In case of restore, the MDT has the right size and has + * already send it back without granting the layout lock, + * inode is up-to-date so glimpse is useless. 
+ * Also to glimpse we need the layout, in case of a running + * restore the MDT holds the layout lock so the glimpse will + * block up to the end of restore (getattr will block) + */ + if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) { + rc = ll_glimpse_size(inode); + if (rc < 0) + RETURN(rc); + } + } else { + /* If object isn't regular a file then don't validate size. */ + if (S_ISDIR(inode->i_mode) && + lli->lli_lsm_md != NULL) { + rc = ll_merge_md_attr(inode); + if (rc < 0) + RETURN(rc); + } - if (res) - return res; + inode->i_atime.tv_sec = lli->lli_atime; + inode->i_mtime.tv_sec = lli->lli_mtime; + inode->i_ctime.tv_sec = lli->lli_ctime; + } OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); @@ -3902,12 +4677,21 @@ static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return rc; } -struct posix_acl *ll_get_acl(struct inode *inode, int type) +struct posix_acl *ll_get_acl(struct inode *inode, int type +#ifdef HAVE_GET_ACL_RCU_ARG + , bool rcu +#endif /* HAVE_GET_ACL_RCU_ARG */ + ) { struct ll_inode_info *lli = ll_i2info(inode); struct posix_acl *acl = NULL; ENTRY; +#ifdef HAVE_GET_ACL_RCU_ARG + if (rcu) + return ERR_PTR(-ECHILD); +#endif + spin_lock(&lli->lli_lock); /* VFS' acl_permission_check->check_acl will release the refcount */ acl = posix_acl_dup(lli->lli_posix_acl); @@ -3918,35 +4702,37 @@ struct posix_acl *ll_get_acl(struct inode *inode, int type) #ifdef HAVE_IOP_SET_ACL #ifdef CONFIG_FS_POSIX_ACL -int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) +int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; const char *name = NULL; char *value = NULL; size_t value_size = 0; - int rc; + int rc = 0; ENTRY; switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - rc = posix_acl_update_mode(mnt_userns, inode, &inode->i_mode, &acl); - if (rc) - GOTO(out, rc); - } + if (acl) + rc = posix_acl_update_mode(mnt_userns, inode, + &inode->i_mode, &acl); + break; - break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) - GOTO(out, rc = acl ? -EACCES : 0); - + rc = acl ? -EACCES : 0; break; + default: - GOTO(out, rc = -EINVAL); + rc = -EINVAL; + break; } + if (rc) + return rc; if (acl) { value_size = posix_acl_xattr_size(acl->a_count); @@ -3961,16 +4747,16 @@ int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct po rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), value ? 
OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, - name, value, value_size, 0, 0, 0, &req); + name, value, value_size, 0, 0, &req); ptlrpc_req_finished(req); out_value: kfree(value); out: - if (!rc) - set_cached_acl(inode, type, acl); - else + if (rc) forget_cached_acl(inode, type); + else + set_cached_acl(inode, type, acl); RETURN(rc); } #endif /* CONFIG_FS_POSIX_ACL */ @@ -4010,7 +4796,20 @@ ll_check_acl(struct inode *inode, int mask) #endif /* HAVE_GENERIC_PERMISSION_2ARGS */ #endif /* HAVE_USER_NAMESPACE_ARG */ -int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) +#ifdef HAVE_GENERIC_PERMISSION_4ARGS +int ll_inode_permission(struct inode *inode, int mask, unsigned int flags) +#else +# ifdef HAVE_INODE_PERMISION_2ARGS +int ll_inode_permission(struct inode *inode, int mask) +# else +# ifdef HAVE_USER_NAMESPACE_ARG +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) +# else +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) +# endif +# endif +#endif { int rc = 0; struct ll_sb_info *sbi; @@ -4033,8 +4832,7 @@ int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, * need to do it before permission check. */ if (inode == inode->i_sb->s_root->d_inode) { - rc = __ll_inode_revalidate(inode->i_sb->s_root, - MDS_INODELOCK_LOOKUP); + rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); if (rc) RETURN(rc); } @@ -4071,7 +4869,7 @@ int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, } ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1); - rc = generic_permission(mnt_userns, inode, mask); + rc = ll_generic_permission(mnt_userns, inode, mask, flags, ll_check_acl); /* restore current process's credentials and FS capability */ if (squash_id) { revert_creds(old_cred); @@ -4236,7 +5034,6 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req; - struct mdt_body *body; void *lvbdata; void *lmm; int lmmsize; @@ -4256,18 +5053,20 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) * layout here. Please note that we can't use the LVB buffer in * completion AST because it doesn't have a large enough buffer */ rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc == 0) - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, - lmmsize, 0, &req); if (rc < 0) RETURN(rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EPROTO); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR, + XATTR_NAME_LOV, lmmsize, &req); + if (rc < 0) { + if (rc == -ENODATA) + GOTO(out, rc = 0); /* empty layout */ + else + RETURN(rc); + } - lmmsize = body->mbo_eadatasize; + lmmsize = rc; + rc = 0; if (lmmsize == 0) /* empty layout */ GOTO(out, rc = 0); @@ -4465,7 +5264,6 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen) /* sanity checks */ LASSERT(fid_is_sane(ll_inode2fid(inode))); - LASSERT(S_ISREG(inode->i_mode)); /* take layout lock mutex to enqueue layout lock exclusively. */ mutex_lock(&lli->lli_layout_mutex); @@ -4498,19 +5296,20 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen) * Issue layout intent RPC indicating where in a file an IO is about to write. * * \param[in] inode file inode. - * \param[in] start start offset of fille in bytes where an IO is about to - * write. - * \param[in] end exclusive end offset in bytes of the write range. 
+ * \param[in] ext write range with start offset of fille in bytes where + * an IO is about to write, and exclusive end offset in + * bytes. * * \retval 0 on success * \retval < 0 error code */ -int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end) +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext) { struct layout_intent intent = { - .li_opc = LAYOUT_INTENT_WRITE, - .li_start = start, - .li_end = end, + .li_opc = opc, + .li_extent.e_start = ext->e_start, + .li_extent.e_end = ext->e_end, }; int rc; ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/llite/glimpse.c b/drivers/staging/lustrefsx/lustre/llite/glimpse.c index d34be28747bdd..ddbaa142514de 100644 --- a/drivers/staging/lustrefsx/lustre/llite/glimpse.c +++ b/drivers/staging/lustrefsx/lustre/llite/glimpse.c @@ -23,14 +23,13 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * glimpse code shared between vvp and liblustre (and other Lustre clients in - * the future). + * glimpse code used by vvp (and other Lustre clients in the future). * * Author: Nikita Danilov * Author: Oleg Drokin @@ -92,7 +91,7 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); /* NOTE: this looks like DLM lock request, but it may - * not be one. Due to CEF_ASYNC flag (translated + * not be one. Due to CEF_GLIMPSE flag (translated * to LDLM_FL_HAS_INTENT by osc), this is * glimpse request, that won't revoke any * conflicting DLM locks held. Instead, @@ -107,14 +106,10 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, *descr = whole_file; descr->cld_obj = clob; descr->cld_mode = CLM_READ; - descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; + descr->cld_enq_flags = CEF_GLIMPSE | CEF_MUST; if (agl) - descr->cld_enq_flags |= CEF_AGL; + descr->cld_enq_flags |= CEF_SPECULATIVE | CEF_NONBLOCK; /* - * CEF_ASYNC is used because glimpse sub-locks cannot - * deadlock (because they never conflict with other - * locks) and, hence, can be enqueued out-of-order. - * * CEF_MUST protects glimpse lock from conversion into * a lockless mode. 
*/ @@ -140,7 +135,20 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, RETURN(result); } -static int cl_io_get(struct inode *inode, struct lu_env **envout, +/** + * Get an IO environment for special operations such as glimpse locks and + * manually requested locks (ladvise lockahead) + * + * \param[in] inode inode the operation is being performed on + * \param[out] envout thread specific execution environment + * \param[out] ioout client io description + * \param[out] refcheck reference check + * + * \retval 1 on success + * \retval 0 not a regular file, cannot get environment + * \retval negative negative errno on error + */ +int cl_io_get(struct inode *inode, struct lu_env **envout, struct cl_io **ioout, __u16 *refcheck) { struct lu_env *env; @@ -178,31 +186,37 @@ int cl_glimpse_size0(struct inode *inode, int agl) */ struct lu_env *env = NULL; struct cl_io *io = NULL; - __u16 refcheck; - int result; - - ENTRY; - - result = cl_io_get(inode, &env, &io, &refcheck); - if (result > 0) { - again: - io->ci_verify_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result > 0) - /* - * nothing to do for this io. This currently happens - * when stripe sub-object's are not yet created. - */ - result = io->ci_result; - else if (result == 0) - result = cl_glimpse_lock(env, io, inode, io->ci_obj, - agl); + __u16 refcheck; + int retried = 0; + int result; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + do { + io->ci_ndelay_tried = retried++; + io->ci_ndelay = io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj); + if (result > 0) { + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + } else if (result == 0) { + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + if (!agl && result == -EWOULDBLOCK) + io->ci_need_restart = 1; + } OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); - cl_io_fini(env, io); - if (unlikely(io->ci_need_restart)) - goto again; - cl_env_put(env, &refcheck); - } + cl_io_fini(env, io); + } while (unlikely(io->ci_need_restart)); + + cl_env_put(env, &refcheck); RETURN(result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c index a5fe1978c66a2..21a10ec551e44 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_cl.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -69,7 +68,7 @@ __u16 cl_inode_fini_refcheck; static DEFINE_MUTEX(cl_inode_fini_guard); int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags) + enum op_xvalid xvalid, unsigned int attr_flags) { struct lu_env *env; struct cl_io *io; @@ -91,10 +90,14 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, io->u.ci_setattr.sa_attr.lvb_ctime = attr->ia_ctime.tv_sec; io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; io->u.ci_setattr.sa_attr_flags = attr_flags; - io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_avalid = attr->ia_valid; + io->u.ci_setattr.sa_xvalid = xvalid; io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); again: + if (attr->ia_valid & ATTR_FILE) + ll_io_set_mirror(io, attr->ia_file); + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { struct vvp_io *vio = vvp_env_io(env); @@ -213,12 +216,12 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) if (unlikely(atomic_read(&header->loh_ref) != 1)) { struct lu_site *site = obj->co_lu.lo_dev->ld_site; - struct lu_site_bkt_data *bkt; + wait_queue_head_t *wq; - bkt = lu_site_bkt_from_fid(site, &header->loh_fid); + wq = lu_site_wq_from_fid(site, &header->loh_fid); init_waitqueue_entry(&waiter, current); - add_wait_queue(&bkt->lsb_marche_funebre, &waiter); + add_wait_queue(wq, &waiter); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -228,7 +231,7 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) } set_current_state(TASK_RUNNING); - remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); + remove_wait_queue(wq, &waiter); } cl_object_put(env, obj); diff --git a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c index ced348a36b42a..5869d949ff97b 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c +++ b/drivers/staging/lustrefsx/lustre/llite/lcommon_misc.c @@ -23,14 +23,13 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. * - * cl code shared between vvp and liblustre (and other Lustre clients in the - * future). + * cl code used by vvp (and other Lustre clients in the future). * */ #define DEBUG_SUBSYSTEM S_LLITE diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h index bfc4f8bfd7bea..83f0e616d83c5 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,7 +33,6 @@ #ifndef LLITE_INTERNAL_H #define LLITE_INTERNAL_H #include -#include #include /* for s2sbi */ #include #include @@ -45,8 +44,8 @@ #include #include #include - #include + #include "vvp_internal.h" #include "range_lock.h" @@ -80,6 +79,7 @@ struct ll_dentry_data { unsigned int lld_invalid:1; unsigned int lld_nfs_dentry:1; struct rcu_head lld_rcu_head; + unsigned long lld_neg_cache_timeout; }; #define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) @@ -136,8 +136,7 @@ struct ll_inode_info { /* update atime from MDS no matter if it's older than * local inode atime. */ - unsigned int lli_update_atime:1, - lli_inode_locked:1; + unsigned int lli_update_atime:1; /* Try to make the d::member and f::member are aligned. Before using * these members, make clear whether it is directory or not. */ @@ -167,6 +166,8 @@ struct ll_inode_info { unsigned int lli_sa_enabled:1; /* generation for statahead */ unsigned int lli_sa_generation; + /* rw lock protects lli_lsm_md */ + struct rw_semaphore lli_lsm_sem; /* directory stripe information */ struct lmv_stripe_md *lli_lsm_md; /* default directory stripe offset. This is extracted @@ -179,8 +180,8 @@ struct ll_inode_info { /* for non-directory */ struct { - struct mutex lli_size_mutex; - char *lli_symlink_name; + struct mutex lli_size_mutex; + char *lli_symlink_name; /* * struct rw_semaphore { * signed long count; // align d.d_def_acl @@ -188,23 +189,23 @@ struct ll_inode_info { * struct list_head wait_list; * } */ - struct rw_semaphore lli_trunc_sem; - struct range_lock_tree lli_write_tree; + struct rw_semaphore lli_trunc_sem; + struct range_lock_tree lli_write_tree; - struct rw_semaphore lli_glimpse_sem; - cfs_time_t lli_glimpse_time; - struct list_head lli_agl_list; - __u64 lli_agl_index; + struct rw_semaphore lli_glimpse_sem; + ktime_t lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; /* for writepage() only to communicate to fsync */ - int lli_async_rc; + int lli_async_rc; /* - * whenever a process try to read/write the file, the + * Whenever a process try to read/write the file, the * jobid of the process will be saved here, and it'll * be packed into the write PRC when flush later. * - * so the read/write statistics for jobid will not be + * So the read/write statistics for jobid will not be * accurate if the file is shared by different jobs. 
*/ char lli_jobid[LUSTRE_JOBID_SIZE]; @@ -238,9 +239,7 @@ struct ll_inode_info { #ifndef HAVE_USER_NAMESPACE_ARG #define inode_permission(ns, inode, mask) inode_permission(inode, mask) -#define generic_permission(ns, inode, mask) generic_permission(inode, mask) #define simple_setattr(ns, de, iattr) simple_setattr(de, iattr) -#define ll_inode_permission(ns, inode, mask) ll_inode_permission(inode, mask) #ifdef HAVE_INODEOPS_ENHANCED_GETATTR #define ll_getattr(ns, path, stat, mask, fl) ll_getattr(path, stat, mask, fl) #endif /* HAVE_INODEOPS_ENHANCED_GETATTR */ @@ -272,6 +271,8 @@ enum ll_file_flags { LLIF_FILE_RESTORING = 1, /* Xattr cache is attached to the file */ LLIF_XATTR_CACHE = 2, + /* Project inherit */ + LLIF_PROJECT_INHERIT = 3, }; static inline void ll_file_set_flag(struct ll_inode_info *lli, @@ -306,12 +307,32 @@ int ll_xattr_cache_get(struct inode *inode, size_t size, __u64 valid); +static inline bool obd_connect_has_secctx(struct obd_connect_data *data) +{ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && + data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; +#else + return false; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ +} + +static inline void obd_connect_set_secctx(struct obd_connect_data *data) +{ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; +#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ +} + int ll_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, const char **secctx_name, void **secctx, __u32 *secctx_size); int ll_inode_init_security(struct dentry *dentry, struct inode *inode, struct inode *dir); +int ll_listsecurity(struct inode *inode, char *secctx_name, + size_t secctx_name_size); + /* * Locking to guarantee consistency of non-atomic updates to long long i_size, * consistency between file size and KMS. @@ -322,18 +343,19 @@ int ll_inode_init_security(struct dentry *dentry, struct inode *inode, void ll_inode_size_lock(struct inode *inode); void ll_inode_size_unlock(struct inode *inode); -// FIXME: replace the name of this with LL_I to conform to kernel stuff -// static inline struct ll_inode_info *LL_I(struct inode *inode) static inline struct ll_inode_info *ll_i2info(struct inode *inode) { - return container_of(inode, struct ll_inode_info, lli_vfs_inode); + return container_of(inode, struct ll_inode_info, lli_vfs_inode); } +/* default to use at least 16M for fast read if possible */ +#define RA_REMAIN_WINDOW_MIN MiB_TO_PAGES(16UL) + /* default to about 64M of readahead on a given system. 
*/ -#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT)) +#define SBI_DEFAULT_READAHEAD_MAX MiB_TO_PAGES(64UL) /* default to read-ahead full files smaller than 2MB on the second read */ -#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT)) +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX MiB_TO_PAGES(2UL) enum ra_stat { RA_STAT_HIT = 0, @@ -442,7 +464,11 @@ enum stats_track_type { * suppress_pings */ #define LL_SBI_FAST_READ 0x400000 /* fast read support */ #define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ -#define LL_SBI_PIO 0x1000000 /* parallel IO support */ +/* LL_SBI_PIO 0x1000000 parallel IO support, introduced in + 2.10, abandoned */ +#define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ +#define LL_SBI_MDLL_AUTO_REFRESH 0x10000000 /* enable metadata lazy load */ +#define LL_SBI_MDLL 0x20000000 /* enable metadata lazy load auto-refresh */ #define LL_SBI_FLAGS { \ "nolck", \ @@ -470,6 +496,7 @@ enum stats_track_type { "fast_read", \ "file_secctx", \ "pio", \ + "tiny_write", \ } /* This is embedded into llite super-blocks to keep track of connect @@ -488,20 +515,24 @@ struct lustre_client_ocd { struct ll_sb_info { /* this protects pglist and ra_info. It isn't safe to * grab from interrupt contexts */ - spinlock_t ll_lock; - spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ - spinlock_t ll_process_lock; /* ll_rw_process_info */ - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_md_exp; - struct obd_export *ll_dt_exp; - struct proc_dir_entry* ll_proc_root; - struct lu_fid ll_root_fid; /* root object fid */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct obd_device *ll_md_obd; + struct obd_device *ll_dt_obd; + struct dentry *ll_debugfs_entry; + struct lu_fid ll_root_fid; /* root object fid */ int ll_flags; unsigned int ll_umounting:1, ll_xattr_cache_enabled:1, ll_xattr_cache_set:1, /* already set to 0/1 */ - ll_client_common_fill_super_succeeded:1; + ll_client_common_fill_super_succeeded:1, + ll_checksum_set:1, + ll_inode_cache_enabled:1; struct lustre_client_ocd ll_lco; @@ -552,6 +583,19 @@ struct ll_sb_info { /* st_blksize returned by stat(2), when non-zero */ unsigned int ll_stat_blksize; + + /* maximum relative age of cached statfs results */ + unsigned int ll_statfs_max_age; + + /* + * seconds after which negative dentries should be invalidated. + * -1 disables invalidation of negative entries based on timeout + * 0 always triggers serverside validation + */ + int ll_neg_dentry_timeout; + + struct kset ll_kset; /* sysfs object */ + struct completion ll_kobj_unregister; }; /* @@ -656,11 +700,19 @@ struct ll_file_data { * true: failure is known, not report again. * false: unknown failure, should report. */ bool fd_write_failed; + bool ll_lock_no_expand; rwlock_t fd_lock; /* protect lcc list */ struct list_head fd_lccs; /* list of ll_cl_context */ + /* Used by mirrored file to lead IOs to a specific mirror, usually + * for mirror resync. 0 means default. */ + __u32 fd_designated_mirror; + /* The layout version when resync starts. 
Resync I/O should carry this + * layout version for verification to OST objects */ + __u32 fd_layout_version; }; -extern struct proc_dir_entry *proc_lustre_fs_root; +void llite_tunables_unregister(void); +int llite_tunables_register(void); static inline struct inode *ll_info2i(struct ll_inode_info *lli) { @@ -698,6 +750,11 @@ static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) return !!(sbi->ll_flags & LL_SBI_FAST_READ); } +static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) +{ + return !!(sbi->ll_flags & LL_SBI_TINY_WRITE); +} + void ll_ras_enter(struct file *f); /* llite/lcommon_misc.c */ @@ -708,21 +765,9 @@ int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, void cl_put_grouplock(struct ll_grouplock *lg); /* llite/lproc_llite.c */ -#ifdef CONFIG_PROC_FS -int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, - struct super_block *sb); -int lprocfs_ll_register_obd(struct super_block *sb, const char *obdname); -void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi); +int ll_debugfs_register_super(struct super_block *sb, const char *name); +void ll_debugfs_unregister_super(struct super_block *sb); void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); -extern struct lprocfs_vars lprocfs_llite_obd_vars[]; -#else -static inline int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, - struct super_block *sb) {return 0; } -static inline int lprocfs_ll_register_obd(struct super_block *sb, - const char *obdname) {return 0; } -static inline void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) {} -static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} -#endif enum { LPROC_LL_DIRTY_HITS, @@ -764,6 +809,10 @@ enum { }; /* llite/dir.c */ +enum get_default_layout_type { + GET_DEFAULT_LAYOUT_ROOT = 1, +}; + struct ll_dir_chain { }; @@ -806,6 +855,8 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode); int ll_writepage(struct page *page, struct writeback_control *wbc); int ll_writepages(struct address_space *, struct writeback_control *wbc); int ll_readpage(struct file *file, struct page *page); +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct file *file); void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); @@ -841,7 +892,11 @@ int ll_getattr(struct user_namespace *mnt_userns, const struct path *path, #else int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); #endif /* HAVE_USER_NAMESPACE_ARG */ -struct posix_acl *ll_get_acl(struct inode *inode, int type); +struct posix_acl *ll_get_acl(struct inode *inode, int type +#ifdef HAVE_GET_ACL_RCU_ARG + , bool rcu +#endif /* HAVE_GET_ACL_RCU_ARG */ + ); #ifdef HAVE_IOP_SET_ACL #ifdef CONFIG_FS_POSIX_ACL int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, @@ -851,11 +906,42 @@ int ll_set_acl(struct user_namespace *mnt_userns, struct inode *inode, #endif /* CONFIG_FS_POSIX_ACL */ #endif -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen); + +static inline int ll_xflags_to_inode_flags(int xflags) +{ + return ((xflags & FS_XFLAG_SYNC) ? S_SYNC : 0) | + ((xflags & FS_XFLAG_NOATIME) ? S_NOATIME : 0) | + ((xflags & FS_XFLAG_APPEND) ? S_APPEND : 0) | + ((xflags & FS_XFLAG_IMMUTABLE) ? S_IMMUTABLE : 0); +} + +static inline int ll_inode_flags_to_xflags(int flags) +{ + return ((flags & S_SYNC) ? 
FS_XFLAG_SYNC : 0) | + ((flags & S_NOATIME) ? FS_XFLAG_NOATIME : 0) | + ((flags & S_APPEND) ? FS_XFLAG_APPEND : 0) | + ((flags & S_IMMUTABLE) ? FS_XFLAG_IMMUTABLE : 0); +} + +int ll_migrate(struct inode *parent, struct file *file, + struct lmv_user_md *lum, const char *name); int ll_get_fid_by_name(struct inode *parent, const char *name, int namelen, struct lu_fid *fid, struct inode **inode); -int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); +#ifdef HAVE_GENERIC_PERMISSION_4ARGS +int ll_inode_permission(struct inode *inode, int mask, unsigned int flags); +#else +# ifndef HAVE_INODE_PERMISION_2ARGS +# ifdef HAVE_USER_NAMESPACE_ARG +int ll_inode_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); +# else +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd); +# endif +# else +int ll_inode_permission(struct inode *inode, int mask); +# endif +#endif +int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa); int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd, unsigned long arg); int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd, @@ -869,9 +955,11 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe(struct inode *inode, void **lmmp, - int *lmm_size, struct ptlrpc_request **request, - u64 valid); +int ll_dir_getstripe_default(struct inode *inode, void **lmmp, + int *lmm_size, struct ptlrpc_request **request, + struct ptlrpc_request **root_request, u64 valid); +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid); #ifdef HAVE_FILE_FSYNC_4ARGS int ll_fsync(struct file *file, loff_t start, loff_t end, int data); #elif defined(HAVE_FILE_FSYNC_2ARGS) @@ -884,6 +972,9 @@ int ll_fid2path(struct inode *inode, void __user *arg); int ll_data_version(struct inode *inode, __u64 *data_version, int flags); int ll_hsm_release(struct inode *inode); int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); +void ll_io_set_mirror(struct cl_io *io, const struct file *file); +int ll_hsm_import(struct inode *inode, struct file *file, + struct hsm_user_import *hui); /* llite/dcache.c */ @@ -906,13 +997,15 @@ void ll_kill_super(struct super_block *sb); struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); void ll_dir_clear_lsm_md(struct inode *inode); void ll_clear_inode(struct inode *inode); -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import); int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, struct iattr *attr); int ll_statfs(struct dentry *de, struct kstatfs *sfs); -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags); +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags); int ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_update_inode_flags(struct inode *inode, int ext_flags); int ll_read_inode2(struct inode *inode, void *opaque); void ll_delete_inode(struct inode *inode); int ll_iocontrol(struct inode *inode, struct file *file, @@ -932,7 +1025,6 @@ int ll_obd_statfs(struct inode *inode, void __user *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); int 
ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); -int ll_process_config(struct lustre_cfg *lcfg); enum { LUSTRE_OPC_MKDIR = 0, @@ -942,6 +1034,7 @@ enum { LUSTRE_OPC_ANY = 5, }; +void ll_unlock_md_op_lsm(struct md_op_data *op_data); struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, const char *name, size_t namelen, @@ -954,6 +1047,8 @@ ssize_t ll_copy_user_md(const struct lov_user_md __user *md, struct lov_user_md **kbuf); void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); +void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req); + /* Compute expected user md size when passing in a md from user space */ static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) { @@ -1022,7 +1117,6 @@ struct ll_thread_info { struct iov_iter lti_iter; struct vvp_io_args lti_args; struct ra_io_arg lti_ria; - struct kiocb lti_kiocb; struct ll_cl_context lti_io_ctx; }; @@ -1237,11 +1331,18 @@ static inline int cl_glimpse_size(struct inode *inode) return cl_glimpse_size0(inode, 0); } +/* AGL is 'asychronous glimpse lock', which is a speculative lock taken as + * part of statahead */ static inline int cl_agl(struct inode *inode) { return cl_glimpse_size0(inode, 1); } +int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise); + +int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, __u16 *refcheck); + static inline int ll_glimpse_size(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1249,7 +1350,7 @@ static inline int ll_glimpse_size(struct inode *inode) down_read(&lli->lli_glimpse_sem); rc = cl_glimpse_size(inode); - lli->lli_glimpse_time = cfs_time_current(); + lli->lli_glimpse_time = ktime_get(); up_read(&lli->lli_glimpse_sem); return rc; } @@ -1376,9 +1477,15 @@ static inline int d_lustre_invalid(const struct dentry *dentry) static inline void __d_lustre_invalidate(struct dentry *dentry) { struct ll_dentry_data *lld = ll_d2d(dentry); + struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); + + if (lld == NULL) + return; - if (lld != NULL) - lld->lld_invalid = 1; + if (sbi->ll_neg_dentry_timeout != OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + lld->lld_neg_cache_timeout = + jiffies + sbi->ll_neg_dentry_timeout * HZ; + lld->lld_invalid = 1; } /* @@ -1419,7 +1526,8 @@ static inline void d_lustre_revalidate(struct dentry *dentry) int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); int ll_layout_refresh(struct inode *inode, __u32 *gen); int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); -int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end); +int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc, + struct lu_extent *ext); int ll_xattr_init(void); void ll_xattr_fini(void); @@ -1431,7 +1539,7 @@ int ll_getparent(struct file *file, struct getparent __user *arg); /* lcommon_cl.c */ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags); + enum op_xvalid xvalid, unsigned int attr_flags); extern struct lu_env *cl_inode_fini_env; extern __u16 cl_inode_fini_refcheck; @@ -1442,4 +1550,6 @@ void cl_inode_fini(struct inode *inode); u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); u32 cl_fid_build_gen(const struct lu_fid *fid); +int ll_get_hsm_state(struct inode *inode, u32 *hus_states); + #endif /* LLITE_INTERNAL_H */ diff --git 
a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index 04256c2600083..c10a3ebfabafe 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,16 +46,18 @@ #ifdef HAVE_UIDGID_HEADER # include #endif +#include -#include +#include #ifdef HAVE_UAPI_LINUX_MOUNT_H #include #endif + #include #include #include #include -#include +#include #include #include #include @@ -86,6 +88,8 @@ static struct ll_sb_info *ll_init_sbi(void) spin_lock_init(&sbi->ll_pp_extent_lock); spin_lock_init(&sbi->ll_process_lock); sbi->ll_rw_stats_on = 0; + sbi->ll_statfs_max_age = OBD_STATFS_CACHE_SECONDS; + sbi->ll_neg_dentry_timeout = OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS; si_meminfo(&si); pages = si.totalram - si.totalhigh; @@ -111,6 +115,9 @@ static struct ll_sb_info *ll_init_sbi(void) #ifdef ENABLE_CHECKSUM sbi->ll_flags |= LL_SBI_CHECKSUM; #endif +#ifdef ENABLE_FLOCK + sbi->ll_flags |= LL_SBI_FLOCK; +#endif #ifdef HAVE_LRU_RESIZE_SUPPORT sbi->ll_flags |= LL_SBI_LRU_RESIZE; @@ -133,6 +140,7 @@ static struct ll_sb_info *ll_init_sbi(void) atomic_set(&sbi->ll_agl_total, 0); sbi->ll_flags |= LL_SBI_AGL_ENABLED; sbi->ll_flags |= LL_SBI_FAST_READ; + sbi->ll_flags |= LL_SBI_TINY_WRITE; /* root squash */ sbi->ll_squash.rsi_uid = 0; @@ -160,30 +168,23 @@ static void ll_free_sbi(struct super_block *sb) EXIT; } -static inline int obd_connect_has_secctx(struct obd_connect_data *data) -{ - return data->ocd_connect_flags & OBD_CONNECT_FLAGS2 && - data->ocd_connect_flags2 & OBD_CONNECT2_FILE_SECCTX; -} - static int client_common_fill_super(struct super_block *sb, char *md, char *dt, struct vfsmount *mnt) { struct inode *root = NULL; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct obd_statfs *osfs = NULL; - struct ptlrpc_request *request = NULL; - struct obd_connect_data *data = NULL; - struct obd_uuid *uuid; - struct md_op_data *op_data; - struct lustre_md lmd; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; u64 valid; - int size, err, checksum; - ENTRY; + int size, err, checksum; - obd = class_name2obd(md); - if (!obd) { + ENTRY; + sbi->ll_md_obd = class_name2obd(md); + if (!sbi->ll_md_obd) { CERROR("MD %s: not setup or attached\n", md); RETURN(-EINVAL); } @@ -198,13 +199,18 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, RETURN(-ENOMEM); } - /* indicate the features supported by this client */ - data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | - OBD_CONNECT_ATTRFID | - OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + + /* indicate MDT features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + 
OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | OBD_CONNECT_EINPROGRESS | @@ -215,11 +221,31 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | OBD_CONNECT_OPEN_BY_FID | OBD_CONNECT_DIR_STRIPE | - OBD_CONNECT_BULK_MBITS | + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | OBD_CONNECT_SUBTREE | - OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS; - - data->ocd_connect_flags2 = 0; + OBD_CONNECT_MULTIMODRPCS | + OBD_CONNECT_GRANT_PARAM | + OBD_CONNECT_SHORTIO | OBD_CONNECT_FLAGS2; + + data->ocd_connect_flags2 = OBD_CONNECT2_FLR | + OBD_CONNECT2_LOCK_CONVERT | + OBD_CONNECT2_DIR_MIGRATE | + OBD_CONNECT2_SUM_STATFS | + OBD_CONNECT2_ARCHIVE_ID_ARRAY | + OBD_CONNECT2_LSOM | + OBD_CONNECT2_ASYNC_DISCARD | + OBD_CONNECT2_GETATTR_PFID; + if (sbi->ll_flags & LL_SBI_MDLL) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; + + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; + + if (sbi->ll_flags & LL_SBI_MDLL) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; + + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) @@ -230,6 +256,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_LARGE_ACL; #endif + data->ocd_cksum_types = obd_cksum_types_supported_client(); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) /* flag mdc connection as lightweight, only used for test * purpose, use with care */ @@ -261,13 +289,16 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (sbi->ll_flags & LL_SBI_ALWAYS_PING) data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; -#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY) - data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX; -#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */ + obd_connect_set_secctx(data); + +#if defined(CONFIG_SECURITY) + data->ocd_connect_flags2 |= OBD_CONNECT2_SELINUX_POLICY; +#endif data->ocd_brw_size = MD_MAX_BRW_SIZE; - err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); + err = obd_connect(NULL, &sbi->ll_md_exp, sbi->ll_md_obd, + &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " "recovery, of which this client is not a " @@ -293,7 +324,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, * can make sure the client can be mounted as long as MDT0 is * avaible */ err = obd_statfs(NULL, sbi->ll_md_exp, osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + ktime_get_seconds() - sbi->ll_statfs_max_age, OBD_STATFS_FOR_MDT0); if (err) GOTO(out_md_fid, err); @@ -336,6 +367,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sb->s_blocksize_bits = log2(osfs->os_bsize); sb->s_magic = LL_SUPER_MAGIC; sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_inode_cache_enabled = 1; sbi->ll_namelen = osfs->os_namelen; sbi->ll_mnt.mnt = current->fs->root.mnt; @@ -380,8 +412,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } } - obd = class_name2obd(dt); - if (!obd) { + 
sbi->ll_dt_obd = class_name2obd(dt); + if (!sbi->ll_dt_obd) { CERROR("DT %s: not setup or attached\n", dt); GOTO(out_md_fid, err = -ENODEV); } @@ -390,6 +422,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, * back its backend blocksize for grant calculation purpose */ data->ocd_grant_blkbits = PAGE_SHIFT; + /* indicate OST features supported by this client */ data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | @@ -401,23 +434,41 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | - OBD_CONNECT_BULK_MBITS; + OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO | + OBD_CONNECT_FLAGS2 | OBD_CONNECT_GRANT_SHRINK; + +/* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it + * can interoperate with an older version of lockahead which was released prior + * to landing in master. This support will be dropped when 2.13 development + * starts. At the point, we should not just drop the connect flag (below), we + * should also remove the support in the code. + * + * Removing it means a few things: + * 1. Remove this section here + * 2. Remove CEF_NONBLOCK in ll_file_lockahead() + * 3. Remove function exp_connect_lockahead_old + * 4. Remove LDLM_FL_LOCKAHEAD_OLD_RESERVED in lustre_dlm_flags.h + * */ +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_LOCKAHEAD_OLD; +#endif - data->ocd_connect_flags2 = 0; + data->ocd_connect_flags2 = OBD_CONNECT2_LOCKAHEAD; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; /* OBD_CONNECT_CKSUM should always be set, even if checksums are * disabled by default, because it can still be enabled on the - * fly via /proc. As a consequence, we still need to come to an - * agreement on the supported algorithms at connect time */ + * fly via /sys. 
As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time + */ data->ocd_connect_flags |= OBD_CONNECT_CKSUM; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) data->ocd_cksum_types = OBD_CKSUM_ADLER; else - data->ocd_cksum_types = cksum_types_supported_client(); + data->ocd_cksum_types = obd_cksum_types_supported_client(); #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; @@ -430,13 +481,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, "ocd_grant: %d\n", data->ocd_connect_flags, data->ocd_version, data->ocd_grant); - obd->obd_upcall.onu_owner = &sbi->ll_lco; - obd->obd_upcall.onu_upcall = cl_ocd_update; + sbi->ll_dt_obd->obd_upcall.onu_owner = &sbi->ll_lco; + sbi->ll_dt_obd->obd_upcall.onu_upcall = cl_ocd_update; data->ocd_brw_size = DT_MAX_BRW_SIZE; - err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, - NULL); + err = obd_connect(NULL, &sbi->ll_dt_exp, sbi->ll_dt_obd, + &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " "recovery, of which this client is not a " @@ -452,10 +503,15 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sbi->ll_dt_exp->exp_connect_data = *data; /* Don't change value if it was specified in the config log */ - if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1) { sbi->ll_ra_info.ra_max_read_ahead_whole_pages = max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX, (data->ocd_brw_size >> PAGE_SHIFT)); + if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages > + sbi->ll_ra_info.ra_max_pages_per_file) + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + sbi->ll_ra_info.ra_max_pages_per_file; + } err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, LUSTRE_SEQ_METADATA); @@ -546,13 +602,15 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } checksum = sbi->ll_flags & LL_SBI_CHECKSUM; - err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(checksum), &checksum, - NULL); - if (err) { - CERROR("%s: Set checksum failed: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - GOTO(out_root, err); + if (sbi->ll_checksum_set) { + err = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CHECKSUM), KEY_CHECKSUM, + sizeof(checksum), &checksum, NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_root, err); + } } cl_sb_init(sb); @@ -591,14 +649,21 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (osfs != NULL) OBD_FREE_PTR(osfs); - if (sbi->ll_proc_root != NULL) { - err = lprocfs_ll_register_obd(sb, dt); + if (sbi->ll_dt_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_dt_obd->obd_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); if (err < 0) { CERROR("%s: could not register %s in llite: rc = %d\n", dt, ll_get_fsname(sb, NULL, 0), err); err = 0; } - err = lprocfs_ll_register_obd(sb, md); + } + + if (sbi->ll_md_obd) { + err = sysfs_create_link(&sbi->ll_kset.kobj, + &sbi->ll_md_obd->obd_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); if (err < 0) { CERROR("%s: could not register %s in llite: rc = %d\n", md, ll_get_fsname(sb, NULL, 0), err); @@ -615,11 +680,13 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, out_dt: obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; 
+ sbi->ll_dt_obd = NULL; out_md_fid: obd_fid_fini(sbi->ll_md_exp->exp_obd); out_md: obd_disconnect(sbi->ll_md_exp); sbi->ll_md_exp = NULL; + sbi->ll_md_obd = NULL; out: if (data != NULL) OBD_FREE_PTR(data); @@ -711,7 +778,7 @@ static void client_common_put_super(struct super_block *sb) obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; - lprocfs_ll_unregister_mountpoint(sbi); + ll_debugfs_unregister_super(sb); obd_fid_fini(sbi->ll_md_exp->exp_obd); obd_disconnect(sbi->ll_md_exp); @@ -749,56 +816,67 @@ void ll_kill_super(struct super_block *sb) static inline int ll_set_opt(const char *opt, char *data, int fl) { - if (strncmp(opt, data, strlen(opt)) != 0) - return(0); - else - return(fl); + if (strncmp(opt, data, strlen(opt)) != 0) + return 0; + else + return fl; } /* non-client-specific mount options are parsed in lmd_parse */ -static int ll_options(char *options, int *flags) +static int ll_options(char *options, struct ll_sb_info *sbi) { - int tmp; - char *s1 = options, *s2; - ENTRY; + int tmp; + char *s1 = options, *s2; + int *flags = &sbi->ll_flags; + ENTRY; - if (!options) - RETURN(0); + if (!options) + RETURN(0); - CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); - while (*s1) { - CDEBUG(D_SUPER, "next opt=%s\n", s1); - tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags &= ~tmp; - goto next; - } + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags = (*flags & ~LL_SBI_LOCALFLOCK) | tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags = (*flags & ~LL_SBI_FLOCK) | tmp; + goto next; + } + tmp = ll_set_opt("mdll_auto_refresh", s1, LL_SBI_MDLL_AUTO_REFRESH); + if (tmp) { + *flags = (*flags & ~LL_SBI_MDLL_AUTO_REFRESH) | tmp; + goto next; + } + tmp = ll_set_opt("mdll", s1, LL_SBI_MDLL); + if (tmp) { + *flags = (*flags & ~LL_SBI_MDLL) | tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } tmp = ll_set_opt("context", s1, 1); if (tmp) goto next; @@ -822,16 +900,18 @@ static int ll_options(char *options, int *flags) goto next; } - tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags &= ~tmp; - goto next; - } + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + sbi->ll_checksum_set = 1; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + sbi->ll_checksum_set = 1; + goto next; + } tmp = ll_set_opt("lruresize", s1, 
LL_SBI_LRU_RESIZE); if (tmp) { *flags |= tmp; @@ -918,21 +998,24 @@ void ll_lli_init(struct ll_inode_info *lli) lli->lli_opendir_pid = 0; lli->lli_sa_enabled = 0; lli->lli_def_stripe_offset = -1; + init_rwsem(&lli->lli_lsm_sem); } else { mutex_init(&lli->lli_size_mutex); lli->lli_symlink_name = NULL; init_rwsem(&lli->lli_trunc_sem); range_lock_tree_init(&lli->lli_write_tree); init_rwsem(&lli->lli_glimpse_sem); - lli->lli_glimpse_time = 0; + lli->lli_glimpse_time = ktime_set(0, 0); INIT_LIST_HEAD(&lli->lli_agl_list); lli->lli_agl_index = 0; lli->lli_async_rc = 0; } mutex_init(&lli->lli_layout_mutex); - memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE); + memset(lli->lli_jobid, 0, sizeof(lli->lli_jobid)); } +#define MAX_STRING_SIZE 128 + #ifndef HAVE_SUPER_SETUP_BDI_NAME #define LSI_BDI_INITIALIZED 0x00400000 @@ -941,8 +1024,6 @@ void ll_lli_init(struct ll_inode_info *lli) # define BDI_CAP_MAP_COPY 0 #endif -#define MAX_STRING_SIZE 128 - static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct lustre_sb_info *lsi = s2lsi(sb); @@ -973,58 +1054,66 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) { struct lustre_profile *lprof = NULL; struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi; + struct ll_sb_info *sbi = NULL; char *dt = NULL, *md = NULL; char *profilenm = get_profile_name(sb); struct config_llog_instance *cfg; /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ const int instlen = 16 + 2; unsigned long cfg_instance = ll_get_cfg_instance(sb); - int md_len = 0; - int dt_len = 0; - int err; - ENTRY; + char name[MAX_STRING_SIZE]; + int md_len = 0; + int dt_len = 0; + char *ptr; + int len; + int err; + ENTRY; /* for ASLR, to map between cfg_instance and hashed ptr */ CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", profilenm, cfg_instance, sb); + try_module_get(THIS_MODULE); + OBD_ALLOC_PTR(cfg); if (cfg == NULL) - RETURN(-ENOMEM); - - try_module_get(THIS_MODULE); + GOTO(out_free_cfg, err = -ENOMEM); /* client additional sb info */ lsi->lsi_llsbi = sbi = ll_init_sbi(); - if (!sbi) { - module_put(THIS_MODULE); - OBD_FREE_PTR(cfg); - RETURN(-ENOMEM); - } + if (!sbi) + GOTO(out_free_cfg, err = -ENOMEM); - err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); + err = ll_options(lsi->lsi_lmd->lmd_opts, sbi); if (err) - GOTO(out_free, err); + GOTO(out_free_cfg, err); err = super_setup_bdi_name(sb, "lustre-%016lx", cfg_instance); if (err) - GOTO(out_free, err); + GOTO(out_free_cfg, err); #ifndef HAVE_DCACHE_LOCK /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ sb->s_d_op = &ll_d_ops; #endif + /* Get fsname */ + len = strlen(profilenm); + ptr = strrchr(profilenm, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; - /* Call lprocfs_ll_register_mountpoint() before lustre_process_log() - * so that "llite.*.*" params can be processed correctly. */ - if (proc_lustre_fs_root != NULL) { - err = lprocfs_ll_register_mountpoint(proc_lustre_fs_root, sb); - if (err < 0) { - CERROR("%s: could not register mountpoint in llite: " - "rc = %d\n", ll_get_fsname(sb, NULL, 0), err); - err = 0; - } + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%016lx", len, + profilenm, cfg_instance); + + /* Call ll_debugfs_register_super() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. 
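
For reference, the fsname handling added above — strip a trailing "-client" from the profile name, then tag it with the configuration instance — can be shown with a short self-contained program. The profile name and instance value below are invented for the example; only the string handling mirrors the patch.

/*
 * Standalone sketch of the fsname derivation above. The profile string
 * and cfg_instance value are made-up examples, not real mount state.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *profilenm = "lustre-client";	/* hypothetical profile */
	unsigned long cfg_instance = 0xa1b2c3d4UL;	/* hypothetical instance */
	char name[128];
	const char *ptr = strrchr(profilenm, '-');
	int len = (int)strlen(profilenm);

	if (ptr && strcmp(ptr, "-client") == 0)
		len -= 7;			/* drop the "-client" suffix */

	snprintf(name, sizeof(name), "%.*s-%016lx", len, profilenm,
		 cfg_instance);
	printf("%s\n", name);			/* prints lustre-00000000a1b2c3d4 */
	return 0;
}
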
+ */ + err = ll_debugfs_register_super(sb, name); + if (err < 0) { + CERROR("%s: could not register mountpoint in llite: rc = %d\n", + ll_get_fsname(sb, NULL, 0), err); + err = 0; } /* The cfg_instance is a value unique to this super, in case some @@ -1037,7 +1126,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) /* set up client obds */ err = lustre_process_log(sb, profilenm, cfg); if (err < 0) - GOTO(out_proc, err); + GOTO(out_debugfs, err); /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ lprof = class_get_profile(profilenm); @@ -1045,7 +1134,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" " read from the MGS. Does that filesystem " "exist?\n", profilenm); - GOTO(out_proc, err = -EINVAL); + GOTO(out_debugfs, err = -EINVAL); } CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, lprof->lp_md, lprof->lp_dt); @@ -1053,38 +1142,42 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) dt_len = strlen(lprof->lp_dt) + instlen + 2; OBD_ALLOC(dt, dt_len); if (!dt) - GOTO(out_proc, err = -ENOMEM); + GOTO(out_profile, err = -ENOMEM); snprintf(dt, dt_len - 1, "%s-%016lx", lprof->lp_dt, cfg_instance); md_len = strlen(lprof->lp_md) + instlen + 2; OBD_ALLOC(md, md_len); if (!md) - GOTO(out_proc, err = -ENOMEM); + GOTO(out_free_dt, err = -ENOMEM); snprintf(md, md_len - 1, "%s-%016lx", lprof->lp_md, cfg_instance); /* connections, registrations, sb setup */ err = client_common_fill_super(sb, md, dt, mnt); if (err < 0) - GOTO(out_proc, err); + GOTO(out_free_md, err); sbi->ll_client_common_fill_super_succeeded = 1; -out_proc: - if (err < 0) - lprocfs_ll_unregister_mountpoint(sbi); -out_free: +out_free_md: if (md) OBD_FREE(md, md_len); +out_free_dt: if (dt) OBD_FREE(dt, dt_len); - if (lprof != NULL) +out_profile: + if (lprof) class_put_profile(lprof); +out_debugfs: + if (err < 0) + ll_debugfs_unregister_super(sb); +out_free_cfg: + if (cfg) + OBD_FREE_PTR(cfg); + if (err) ll_put_super(sb); else if (sbi->ll_flags & LL_SBI_VERBOSE) LCONSOLE_WARN("Mounted %s\n", profilenm); - - OBD_FREE_PTR(cfg); RETURN(err); } /* ll_fill_super */ @@ -1100,6 +1193,8 @@ void ll_put_super(struct super_block *sb) int next, force = 1, rc = 0; ENTRY; + if (!sbi) + GOTO(out_no_sbi, 0); /* Should replace instance_id with something better for ASLR */ CDEBUG(D_VFSTRACE, "VFS Op: cfg_instance %s-%016lx (sb %p)\n", @@ -1164,7 +1259,7 @@ void ll_put_super(struct super_block *sb) ll_free_sbi(sb); lsi->lsi_llsbi = NULL; - +out_no_sbi: lustre_common_put_super(sb); cl_env_cache_purge(~0); @@ -1268,108 +1363,124 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) { struct lu_fid *fid; struct lmv_stripe_md *lsm = md->lmv; + struct ll_inode_info *lli = ll_i2info(inode); int i; LASSERT(lsm != NULL); + + CDEBUG(D_INODE, "%s: "DFID" set dir layout:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_INODE, lsm); + /* XXX sigh, this lsm_root initialization should be in * LMV layer, but it needs ll_iget right now, so we * put this here right now. */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { fid = &lsm->lsm_md_oinfo[i].lmo_fid; LASSERT(lsm->lsm_md_oinfo[i].lmo_root == NULL); + + if (!fid_is_sane(fid)) + continue; + /* Unfortunately ll_iget will call ll_update_inode, * where the initialization of slave inode is slightly * different, so it reset lsm_md to NULL to avoid * initializing lsm for slave inode. 
*/ - /* For migrating inode, master stripe and master object will - * be same, so we only need assign this inode */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && i == 0) - lsm->lsm_md_oinfo[i].lmo_root = inode; - else - lsm->lsm_md_oinfo[i].lmo_root = + lsm->lsm_md_oinfo[i].lmo_root = ll_iget_anon_dir(inode->i_sb, fid, md); - if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); lsm->lsm_md_oinfo[i].lmo_root = NULL; + while (i-- > 0) { + iput(lsm->lsm_md_oinfo[i].lmo_root); + lsm->lsm_md_oinfo[i].lmo_root = NULL; + } return rc; } } - return 0; -} + lli->lli_lsm_md = lsm; -static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, - const struct lmv_stripe_md *lsm_md2) -{ - return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && - lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && - lsm_md1->lsm_md_master_mdt_index == - lsm_md2->lsm_md_master_mdt_index && - lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && - lsm_md1->lsm_md_layout_version == - lsm_md2->lsm_md_layout_version && - strcmp(lsm_md1->lsm_md_pool_name, - lsm_md2->lsm_md_pool_name) == 0; + return 0; } static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); struct lmv_stripe_md *lsm = md->lmv; - int rc; + struct cl_attr *attr; + int rc = 0; + ENTRY; LASSERT(S_ISDIR(inode->i_mode)); CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, PFID(ll_inode2fid(inode))); - /* no striped information from request. */ - if (lsm == NULL) { - if (lli->lli_lsm_md == NULL) { - RETURN(0); - } else if (lli->lli_lsm_md->lsm_md_hash_type & - LMV_HASH_FLAG_MIGRATION) { - /* migration is done, the temporay MIGRATE layout has - * been removed */ - CDEBUG(D_INODE, DFID" finish migration.\n", - PFID(ll_inode2fid(inode))); - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - RETURN(0); - } else { - /* The lustre_md from req does not include stripeEA, - * see ll_md_setattr */ - RETURN(0); - } - } - - /* set the directory layout */ - if (lli->lli_lsm_md == NULL) { - struct cl_attr *attr; + /* + * no striped information from request, lustre_md from req does not + * include stripeEA, see ll_md_setattr() + */ + if (!lsm) + RETURN(0); - rc = ll_init_lsm_md(inode, md); - if (rc != 0) - RETURN(rc); + /* + * normally dir layout doesn't change, only take read lock to check + * that to avoid blocking other MD operations. + */ + down_read(&lli->lli_lsm_sem); - /* set md->lmv to NULL, so the following free lustre_md - * will not free this lsm */ - md->lmv = NULL; - lli->lli_lsm_md = lsm; + /* some concurrent lookup initialized lsm, and unchanged */ + if (lli->lli_lsm_md && lsm_md_eq(lli->lli_lsm_md, lsm)) + GOTO(unlock, rc = 0); - OBD_ALLOC_PTR(attr); - if (attr == NULL) - RETURN(-ENOMEM); + /* if dir layout doesn't match, check whether version is increased, + * which means layout is changed, this happens in dir split/merge and + * lfsck. 
+ */ + if (lli->lli_lsm_md && + lsm->lsm_md_layout_version <= + lli->lli_lsm_md->lsm_md_layout_version) { + CERROR("%s: "DFID" dir layout mismatch:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_ERROR, lli->lli_lsm_md); + lsm_md_dump(D_ERROR, lsm); + GOTO(unlock, rc = -EINVAL); + } + + up_read(&lli->lli_lsm_sem); + down_write(&lli->lli_lsm_sem); + /* clear existing lsm */ + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } - /* validate the lsm */ - rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, - ll_md_blocking_ast); - if (rc != 0) { - OBD_FREE_PTR(attr); - RETURN(rc); - } + rc = ll_init_lsm_md(inode, md); + up_write(&lli->lli_lsm_sem); + if (rc) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md will not free + * this lsm. + */ + md->lmv = NULL; + /* md_merge_attr() may take long, since lsm is already set, switch to + * read lock. + */ + down_read(&lli->lli_lsm_sem); + OBD_ALLOC_PTR(attr); + if (!attr) + GOTO(unlock, rc = -ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lli->lli_lsm_md, attr, + ll_md_blocking_ast); + if (!rc) { if (md->body->mbo_valid & OBD_MD_FLNLINK) md->body->mbo_nlink = attr->cat_nlink; if (md->body->mbo_valid & OBD_MD_FLSIZE) @@ -1380,51 +1491,14 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) md->body->mbo_ctime = attr->cat_ctime; if (md->body->mbo_valid & OBD_MD_FLMTIME) md->body->mbo_mtime = attr->cat_mtime; - - OBD_FREE_PTR(attr); - - CDEBUG(D_INODE, "Set lsm %p magic %x to "DFID"\n", lsm, - lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); - RETURN(0); } - /* Compare the old and new stripe information */ - if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { - struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; - int idx; - - CERROR("%s: inode "DFID"(%p)'s lmv layout mismatch (%p)/(%p)" - "magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d" - "hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), - inode, lsm, old_lsm, - lsm->lsm_md_magic, old_lsm->lsm_md_magic, - lsm->lsm_md_stripe_count, - old_lsm->lsm_md_stripe_count, - lsm->lsm_md_master_mdt_index, - old_lsm->lsm_md_master_mdt_index, - lsm->lsm_md_hash_type, old_lsm->lsm_md_hash_type, - lsm->lsm_md_layout_version, - old_lsm->lsm_md_layout_version, - lsm->lsm_md_pool_name, - old_lsm->lsm_md_pool_name); - - for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in old lsm idx %d, old: "DFID"\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in new lsm idx %d, new: "DFID"\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - RETURN(-EIO); - } + OBD_FREE_PTR(attr); + GOTO(unlock, rc); +unlock: + up_read(&lli->lli_lsm_sem); - RETURN(0); + return rc; } void ll_clear_inode(struct inode *inode) @@ -1561,7 +1635,8 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) * * In case of HSMimport, we only set attr on MDS. 
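
The lli_lsm_sem handling in ll_update_lsm_md() above follows a read-mostly pattern: the layout is compared under the shared lock, and the exclusive lock is taken only when the cached stripe data really has to be replaced. A generic userspace analog of that pattern is sketched below; it is not the kernel code, the data type and update rule are placeholders, and it adds a re-check after the lock upgrade because, in the general pattern, another thread may update the value in the window between dropping the read lock and taking the write lock.

/* Generic read-check / write-update sketch, assuming placeholder types. */
#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int *cached;			/* stands in for the cached layout */

static int update_cached(int new_val)
{
	pthread_rwlock_rdlock(&lock);
	if (cached && *cached == new_val) {	/* common case: unchanged */
		pthread_rwlock_unlock(&lock);
		return 0;
	}
	pthread_rwlock_unlock(&lock);

	pthread_rwlock_wrlock(&lock);
	/* re-check: another thread may have updated it in the window */
	if (!cached || *cached != new_val) {
		free(cached);
		cached = malloc(sizeof(*cached));
		if (!cached) {
			pthread_rwlock_unlock(&lock);
			return -1;
		}
		*cached = new_val;
	}
	pthread_rwlock_unlock(&lock);
	return 0;
}

int main(void)
{
	update_cached(42);
	update_cached(42);		/* second call hits the read-lock fast path */
	free(cached);
	return 0;
}
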
*/ -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, + enum op_xvalid xvalid, bool hsm_import) { struct inode *inode = dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); @@ -1601,12 +1676,12 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) RETURN(-EPERM); } - /* We mark all of the fields "set" so MDS/OST does not re-set them */ - if (!(attr->ia_valid & ATTR_CTIME_SET) && - (attr->ia_valid & ATTR_CTIME)) { + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (!(xvalid & OP_XVALID_CTIME_SET) && + (attr->ia_valid & ATTR_CTIME)) { attr->ia_ctime = current_time(inode); - attr->ia_valid |= ATTR_CTIME_SET; - } + xvalid |= OP_XVALID_CTIME_SET; + } if (!(attr->ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) { attr->ia_atime = current_time(inode); @@ -1638,13 +1713,22 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) if (!hsm_import && attr->ia_valid & ATTR_SIZE) { /* If we are changing file size, file content is - * modified, flag it. */ - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + * modified, flag it. + */ + xvalid |= OP_XVALID_OWNEROVERRIDE; op_data->op_bias |= MDS_DATA_MODIFIED; ll_file_clear_flag(lli, LLIF_DATA_MODIFIED); } + if (attr->ia_valid & ATTR_FILE) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(attr->ia_file); + + if (fd->fd_lease_och) + op_data->op_bias |= MDS_TRUNC_KEEP_LEASE; + } + op_data->op_attr = *attr; + op_data->op_xvalid = xvalid; rc = ll_md_setattr(dentry, op_data); if (rc) @@ -1653,17 +1737,17 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) if (!S_ISREG(inode->i_mode) || hsm_import) GOTO(out, rc = 0); - if (attr->ia_valid & (ATTR_SIZE | - ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET)) { + if (attr->ia_valid & (ATTR_SIZE | ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME) || + xvalid & OP_XVALID_CTIME_SET) { /* For truncate and utimes sending attributes to OSTs, setting * mtime/atime to the past will be performed under PW [0:EOF] * extent lock (new_size:EOF for truncate). It may seem * excessive to send mtime/atime updates to OSTs when not * setting times to past, but it is necessary due to possible - * time de-synchronization between MDT inode and OST objects */ - rc = cl_setattr_ost(lli->lli_clob, attr, 0); + * time de-synchronization between MDT inode and OST objects + */ + rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0); } /* If the file was restored, it needs to set dirty flag. 
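
The ll_setattr_raw() changes above move Lustre-private validity bits out of attr->ia_valid into a separate op_xvalid mask that travels with the iattr down to cl_setattr_ost(), so private flags can no longer collide with the VFS ATTR_* namespace. A minimal, self-contained illustration of the two-mask idea follows; the bit names and values are invented for the example and are not the kernel's definitions.

/* Two-mask sketch: VFS-style bits in one mask, private bits in another. */
#include <stdio.h>

enum demo_attr_valid {			/* stands in for ATTR_* */
	DEMO_ATTR_SIZE	= 1 << 0,
	DEMO_ATTR_CTIME	= 1 << 1,
};

enum demo_xvalid {			/* stands in for OP_XVALID_* */
	DEMO_XVALID_CTIME_SET	  = 1 << 0,
	DEMO_XVALID_OWNEROVERRIDE = 1 << 1,
};

static void setattr_demo(unsigned int valid, unsigned int xvalid)
{
	if ((valid & DEMO_ATTR_CTIME) && !(xvalid & DEMO_XVALID_CTIME_SET))
		printf("stamping ctime with the current time\n");
	if (valid & DEMO_ATTR_SIZE) {
		/* private bit stays in the private mask */
		xvalid |= DEMO_XVALID_OWNEROVERRIDE;
		printf("truncate: xvalid=0x%x\n", xvalid);
	}
}

int main(void)
{
	setattr_demo(DEMO_ATTR_SIZE | DEMO_ATTR_CTIME, 0);
	return 0;
}
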
@@ -1724,10 +1808,11 @@ int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, struct iattr *attr) { int mode = de->d_inode->i_mode; + enum op_xvalid xvalid = 0; if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + xvalid |= OP_XVALID_OWNEROVERRIDE; if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == (ATTR_SIZE|ATTR_MODE)) && @@ -1748,61 +1833,60 @@ int ll_setattr(struct user_namespace *mnt_userns, struct dentry *de, !(attr->ia_valid & ATTR_KILL_SGID)) attr->ia_valid |= ATTR_KILL_SGID; - /* avoid polluted from ATTR_TIMES_SET, - * projid is not expected to be set here */ - attr->ia_valid &= ~MDS_ATTR_PROJID; - - return ll_setattr_raw(de, attr, false); + return ll_setattr_raw(de, attr, xvalid, false); } -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) +int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, + u32 flags) { - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_statfs obd_osfs; - int rc; - ENTRY; + struct obd_statfs obd_osfs = { 0 }; + time64_t max_age; + int rc; - rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); - if (rc) { - CERROR("md_statfs fails: rc = %d\n", rc); - RETURN(rc); - } + ENTRY; + max_age = ktime_get_seconds() - sbi->ll_statfs_max_age; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) + RETURN(rc); - osfs->os_type = sb->s_magic; + osfs->os_type = LL_SUPER_MAGIC; CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", - osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files); - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - flags |= OBD_STATFS_NODELAY; + if (osfs->os_state & OS_STATE_SUM) + GOTO(out, rc); - rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); - if (rc) { - CERROR("obd_statfs fails: rc = %d\n", rc); - RETURN(rc); - } + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs(NULL, sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) /* Possibly a filesystem with no OSTs. Report MDT totals. */ + GOTO(out, rc = 0); CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", - obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, - obd_osfs.os_files); - - osfs->os_bsize = obd_osfs.os_bsize; - osfs->os_blocks = obd_osfs.os_blocks; - osfs->os_bfree = obd_osfs.os_bfree; - osfs->os_bavail = obd_osfs.os_bavail; - - /* If we don't have as many objects free on the OST as inodes - * on the MDS, we reduce the total number of inodes to - * compensate, so that the "inodes in use" number is correct. - */ - if (obd_osfs.os_ffree < osfs->os_ffree) { - osfs->os_files = (osfs->os_files - osfs->os_ffree) + - obd_osfs.os_ffree; - osfs->os_ffree = obd_osfs.os_ffree; - } + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we have _some_ OSTs, but don't have as many free objects on the + * OSTs as inodes on the MDTs, reduce the reported number of inodes + * to compensate, so that the "inodes in use" number is correct. + * This should be kept in sync with lod_statfs() behaviour. 
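
The comment above describes the inode-count compensation done when merging MDT and OST statfs results; the arithmetic can be checked with the self-contained sketch below, which uses a stand-in struct and made-up numbers rather than the real obd_statfs fields.

/* Sketch of the free-inode clamp, assuming placeholder field names. */
#include <stdio.h>

struct statfs_demo {
	unsigned long long files;	/* total inodes/objects */
	unsigned long long ffree;	/* free inodes/objects */
};

static void clamp_inodes(struct statfs_demo *md, const struct statfs_demo *ost)
{
	/* only clamp when there really are OSTs reporting objects */
	if (ost->files && ost->ffree < md->ffree) {
		md->files = (md->files - md->ffree) + ost->ffree;
		md->ffree = ost->ffree;
	}
}

int main(void)
{
	struct statfs_demo md  = { .files = 1000, .ffree = 900 };
	struct statfs_demo ost = { .files = 500,  .ffree = 100 };

	clamp_inodes(&md, &ost);
	/* "inodes in use" stays 100: files=200, ffree=100 */
	printf("files=%llu ffree=%llu\n", md.files, md.ffree);
	return 0;
}
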
+ */ + if (obd_osfs.os_files && obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } - RETURN(rc); +out: + RETURN(rc); } int ll_statfs(struct dentry *de, struct kstatfs *sfs) { @@ -1814,12 +1898,10 @@ int ll_statfs(struct dentry *de, struct kstatfs *sfs) CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); - /* Some amount of caching on the client is allowed */ - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - 0); - if (rc) - return rc; + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(ll_s2sbi(sb), &osfs, OBD_STATFS_SUM); + if (rc) + return rc; statfs_unpack(sfs, &osfs); @@ -1863,6 +1945,15 @@ void ll_inode_size_unlock(struct inode *inode) mutex_unlock(&lli->lli_size_mutex); } +void ll_update_inode_flags(struct inode *inode, int ext_flags) +{ + inode->i_flags = ll_ext_to_inode_flags(ext_flags); + if (ext_flags & LUSTRE_PROJINHERIT_FL) + ll_file_set_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); + else + ll_file_clear_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT); +} + int ll_update_inode(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1920,7 +2011,7 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) /* Clear i_flags to remove S_NOSEC before permissions are updated */ if (body->mbo_valid & OBD_MD_FLFLAGS) - inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); + ll_update_inode_flags(inode, body->mbo_flags); if (body->mbo_valid & OBD_MD_FLMODE) inode->i_mode = (inode->i_mode & S_IFMT) | (body->mbo_mode & ~S_IFMT); @@ -2048,11 +2139,17 @@ void ll_delete_inode(struct inode *inode) unsigned long nrpages; ENTRY; - if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { /* It is last chance to write out dirty pages, - * otherwise we may lose data while umount */ - cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); - + * otherwise we may lose data while umount. + * + * If i_nlink is 0 then just discard data. This is safe because + * local inode gets i_nlink 0 from server only for the last + * unlink, so that file is not opened somewhere else + */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, inode->i_nlink ? 
+ CL_FSYNC_LOCAL : CL_FSYNC_DISCARD, 1); + } truncate_inode_pages_final(mapping); /* Workaround for LU-118: Note nrpages may not be totally updated when @@ -2085,13 +2182,13 @@ void ll_delete_inode(struct inode *inode) int ll_iocontrol(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - int rc, flags = 0; - ENTRY; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; - switch(cmd) { - case FSFILT_IOC_GETFLAGS: { + switch (cmd) { + case FS_IOC_GETFLAGS: { struct mdt_body *body; struct md_op_data *op_data; @@ -2115,32 +2212,41 @@ int ll_iocontrol(struct inode *inode, struct file *file, flags = body->mbo_flags; - ptlrpc_req_finished(req); + ptlrpc_req_finished(req); RETURN(put_user(flags, (int __user *)arg)); - } - case FSFILT_IOC_SETFLAGS: { + } + case FS_IOC_SETFLAGS: { struct iattr *attr; struct md_op_data *op_data; struct cl_object *obj; + struct fsxattr fa = { 0 }; if (get_user(flags, (int __user *)arg)) RETURN(-EFAULT); - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - RETURN(PTR_ERR(op_data)); + fa.fsx_projid = ll_i2info(inode)->lli_projid; + if (flags & LUSTRE_PROJINHERIT_FL) + fa.fsx_xflags = FS_XFLAG_PROJINHERIT; + + rc = ll_ioctl_check_project(inode, &fa); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); op_data->op_attr_flags = flags; - op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + op_data->op_xvalid |= OP_XVALID_FLAGS; rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); ll_finish_md_op_data(op_data); ptlrpc_req_finished(req); if (rc) RETURN(rc); - inode->i_flags = ll_ext_to_inode_flags(flags); + ll_update_inode_flags(inode, flags); obj = ll_i2info(inode)->lli_clob; if (obj == NULL) @@ -2150,8 +2256,7 @@ int ll_iocontrol(struct inode *inode, struct file *file, if (attr == NULL) RETURN(-ENOMEM); - attr->ia_valid = ATTR_ATTR_FLAG; - rc = cl_setattr_ost(obj, attr, flags); + rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS, flags); OBD_FREE_PTR(attr); RETURN(rc); @@ -2299,7 +2404,7 @@ void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) } op_data->op_fid1 = body->mbo_fid1; - op_data->op_handle = body->mbo_handle; + op_data->op_open_handle = body->mbo_open_handle; op_data->op_mod_time = ktime_get_real_seconds(); md_close(exp, op_data, NULL, &close_req); ptlrpc_req_finished(close_req); @@ -2392,8 +2497,10 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, md_free_lustre_md(sbi->ll_md_exp, &md); cleanup: - if (rc != 0 && it != NULL && it->it_op & IT_OPEN) + if (rc != 0 && it != NULL && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); ll_open_cleanup(sb != NULL ? sb : (*inode)->i_sb, req); + } return rc; } @@ -2441,30 +2548,21 @@ int ll_obd_statfs(struct inode *inode, void __user *arg) return rc; } -int ll_process_config(struct lustre_cfg *lcfg) +/* + * this is normally called in ll_fini_md_op_data(), but sometimes it needs to + * be called early to avoid deadlock. 
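
The early-release helper introduced just below clears the stored semaphore pointers as it drops them, so the normal cleanup path can call it again without double-unlocking. A userspace sketch of that "release early, make the release idempotent" idea follows; the types are stand-ins, not the Lustre md_op_data fields.

/* Idempotent early-unlock sketch, assuming placeholder lock holders. */
#include <pthread.h>
#include <stddef.h>

struct op_demo {
	pthread_rwlock_t *held1;	/* stands in for one held read lock */
	pthread_rwlock_t *held2;	/* stands in for a second held read lock */
};

static void op_demo_unlock(struct op_demo *op)
{
	if (op->held2) {
		pthread_rwlock_unlock(op->held2);
		op->held2 = NULL;	/* a second call becomes a no-op */
	}
	if (op->held1) {
		pthread_rwlock_unlock(op->held1);
		op->held1 = NULL;
	}
}

int main(void)
{
	static pthread_rwlock_t a = PTHREAD_RWLOCK_INITIALIZER;
	struct op_demo op = { .held1 = &a, .held2 = NULL };

	pthread_rwlock_rdlock(&a);
	op_demo_unlock(&op);		/* early release to avoid deadlock */
	op_demo_unlock(&op);		/* later cleanup: does nothing */
	return 0;
}
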
+ */ +void ll_unlock_md_op_lsm(struct md_op_data *op_data) { - struct super_block *sb; - unsigned long x; - int rc = 0; - char *ptr; + if (op_data->op_mea2_sem) { + up_read(op_data->op_mea2_sem); + op_data->op_mea2_sem = NULL; + } - /* The instance name contains the sb: lustre-client-aacfe000 */ - ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); - if (!ptr || !*(++ptr)) - return -EINVAL; - if (sscanf(ptr, "%lx", &x) != 1) - return -EINVAL; - sb = (struct super_block *)x; - /* This better be a real Lustre superblock! */ - LASSERT(s2lsi(sb)->lsi_lmd->lmd_magic == LMD_MAGIC); - - /* Note we have not called client_common_fill_super yet, so - proc fns must be able to handle that! */ - rc = class_process_proc_param(PARAM_LLITE, lprocfs_llite_obd_vars, - lcfg, sb); - if (rc > 0) - rc = 0; - return rc; + if (op_data->op_mea1_sem) { + up_read(op_data->op_mea1_sem); + op_data->op_mea1_sem = NULL; + } } /* this function prepares md_op_data hint for passing it down to MD stack. */ @@ -2483,7 +2581,9 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, if (namelen > ll_i2sbi(i1)->ll_namelen) return ERR_PTR(-ENAMETOOLONG); - if (!lu_name_is_valid_2(name, namelen)) + /* "/" is not valid name, but it's allowed */ + if (!lu_name_is_valid_2(name, namelen) && + strncmp("/", name, namelen) != 0) return ERR_PTR(-EINVAL); } @@ -2496,7 +2596,10 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, ll_i2gids(op_data->op_suppgids, i1, i2); op_data->op_fid1 = *ll_inode2fid(i1); op_data->op_default_stripe_offset = -1; + if (S_ISDIR(i1->i_mode)) { + down_read(&ll_i2info(i1)->lli_lsm_sem); + op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; if (opc == LUSTRE_OPC_MKDIR) op_data->op_default_stripe_offset = @@ -2505,8 +2608,14 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, if (i2) { op_data->op_fid2 = *ll_inode2fid(i2); - if (S_ISDIR(i2->i_mode)) + if (S_ISDIR(i2->i_mode)) { + if (i2 != i1) { + down_read(&ll_i2info(i2)->lli_lsm_sem); + op_data->op_mea2_sem = + &ll_i2info(i2)->lli_lsm_sem; + } op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } } else { fid_zero(&op_data->op_fid2); } @@ -2520,15 +2629,14 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, op_data->op_name = name; op_data->op_namelen = namelen; op_data->op_mode = mode; - op_data->op_mod_time = cfs_time_current_sec(); + op_data->op_mod_time = ktime_get_real_seconds(); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_mds = 0; if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && filename_is_volatile(name, namelen, &op_data->op_mds)) { op_data->op_bias |= MDS_CREATE_VOLATILE; - } else { - op_data->op_mds = 0; } op_data->op_data = data; @@ -2537,9 +2645,10 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, void ll_finish_md_op_data(struct md_op_data *op_data) { + ll_unlock_md_op_lsm(op_data); ll_security_release_secctx(op_data->op_file_secctx, op_data->op_file_secctx_size); - OBD_FREE_PTR(op_data); + OBD_FREE_PTR(op_data); } #ifdef HAVE_SUPEROPS_USE_DENTRY @@ -2548,7 +2657,7 @@ int ll_show_options(struct seq_file *seq, struct dentry *dentry) int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif { - struct ll_sb_info *sbi; + struct ll_sb_info *sbi; #ifdef HAVE_SUPEROPS_USE_DENTRY LASSERT((seq != NULL) && (dentry != NULL)); @@ -2558,20 +2667,25 @@ int ll_show_options(struct seq_file 
*seq, struct vfsmount *vfs) sbi = ll_s2sbi(vfs->mnt_sb); #endif - if (sbi->ll_flags & LL_SBI_NOLCK) - seq_puts(seq, ",nolock"); + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); - if (sbi->ll_flags & LL_SBI_FLOCK) - seq_puts(seq, ",flock"); - - if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - seq_puts(seq, ",localflock"); + /* "flock" is the default since 2.13, but it wasn't for many years, + * so it is still useful to print this to show it is enabled. + * Start to print "noflock" so it is now clear when flock is disabled. + */ + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + else + seq_puts(seq, ",noflock"); - if (sbi->ll_flags & LL_SBI_USER_XATTR) - seq_puts(seq, ",user_xattr"); + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - seq_puts(seq, ",lazystatfs"); + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); if (sbi->ll_flags & LL_SBI_USER_FID2PATH) seq_puts(seq, ",user_fid2path"); @@ -2579,7 +2693,13 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbi->ll_flags & LL_SBI_ALWAYS_PING) seq_puts(seq, ",always_ping"); - RETURN(0); + if (sbi->ll_flags & LL_SBI_MDLL) + seq_puts(seq, ",mdll"); + + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) + seq_puts(seq, ",mdll_auto_refresh"); + + RETURN(0); } /** @@ -2697,12 +2817,12 @@ ssize_t ll_copy_user_md(const struct lov_user_md __user *md, if (lum_size < 0) RETURN(lum_size); - OBD_ALLOC(*kbuf, lum_size); + OBD_ALLOC_LARGE(*kbuf, lum_size); if (*kbuf == NULL) RETURN(-ENOMEM); if (copy_from_user(*kbuf, md, lum_size) != 0) { - OBD_FREE(*kbuf, lum_size); + OBD_FREE_LARGE(*kbuf, lum_size); RETURN(-EFAULT); } @@ -2730,7 +2850,7 @@ void ll_compute_rootsquash_state(struct ll_sb_info *sbi) matched = false; i = 0; while (LNetGetId(i++, &id) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + if (id.nid == LNET_NID_LO_0) continue; if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { matched = true; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c index e286c559c1f67..9be9bd690ee6d 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_mmap.c @@ -150,7 +150,7 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, int result; __u16 refcheck; sigset_t set; - struct inode *inode; + struct inode *inode = NULL; struct ll_inode_info *lli; ENTRY; @@ -222,6 +222,16 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); LASSERT(ergo(result == 0, PageLocked(vmpage))); + /* if page has been unmapped, presumably due to lock reclaim for + * concurrent usage, add some delay before retrying to prevent + * entering live-lock situation with competitors + */ + if (result == -ENODATA && inode != NULL) { + CDEBUG(D_MMAP, "delaying new page-fault for inode %p to " + "prevent live-lock\n", inode); + msleep(10); + } + return result; } @@ -383,6 +393,12 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) result |= VM_FAULT_LOCKED; } cfs_restore_sigs(set); + + if (vmf->page && result == VM_FAULT_LOCKED) + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, LUSTRE_FPRIVATE(vma->vm_file), + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + READ); return result; } @@ -439,6 +455,11 
@@ static vm_fault_t ll_page_mkwrite(struct vm_area_struct *vma, break; } + if (result == VM_FAULT_LOCKED) + ll_rw_stats_tally(ll_i2sbi(file_inode(vma->vm_file)), + current->pid, LUSTRE_FPRIVATE(vma->vm_file), + cl_offset(NULL, vmf->page->index), PAGE_SIZE, + WRITE); return result; } diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c index c24f7f6498ba0..2e207361dd908 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_nfs.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c old mode 100755 new mode 100644 index 6ed67697eb455..10f058b8256dd --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,58 +36,124 @@ #ifdef HAVE_UIDGID_HEADER # include #endif -#include +#include #include #include #include "llite_internal.h" #include "vvp_internal.h" -struct proc_dir_entry *proc_lustre_fs_root; +static struct kobject *llite_kobj; +static struct dentry *llite_root; -#ifdef CONFIG_PROC_FS -/* /proc/lustre/llite mount point registration */ -static const struct proc_ops ll_rw_extents_stats_fops; -static const struct proc_ops ll_rw_extents_stats_pp_fops; -static const struct proc_ops ll_rw_offset_stats_fops; -static __s64 ll_stats_pid_write(struct file *file, - const char __user *buf, size_t len); +int llite_tunables_register(void) +{ + int rc = 0; + + llite_kobj = class_setup_tunables("llite"); + if (IS_ERR(llite_kobj)) + return PTR_ERR(llite_kobj); + + llite_root = debugfs_create_dir("llite", debugfs_lustre_root); + if (IS_ERR_OR_NULL(llite_root)) { + rc = llite_root ? PTR_ERR(llite_root) : -ENOMEM; + llite_root = NULL; + kobject_put(llite_kobj); + llite_kobj = NULL; + } + + return rc; +} -static int ll_blksize_seq_show(struct seq_file *m, void *v) +void llite_tunables_unregister(void) { - struct super_block *sb = m->private; - struct obd_statfs osfs; + if (llite_kobj) { + kobject_put(llite_kobj); + llite_kobj = NULL; + } + + if (!IS_ERR_OR_NULL(llite_root)) { + debugfs_remove(llite_root); + llite_root = NULL; + } +} + +/* /lustre/llite mount point registration */ +static const struct file_operations ll_rw_extents_stats_fops; +static const struct file_operations ll_rw_extents_stats_pp_fops; +static const struct file_operations ll_rw_offset_stats_fops; + +/** + * ll_stats_pid_write() - Determine if stats collection should be enabled + * @buf: Buffer containing the data written + * @len: Number of bytes in the buffer + * + * Several proc files begin collecting stats when a value is written, and stop + * collecting when either '0' or 'disable' is written. This function checks the + * written value to see if collection should be enabled or disabled. + * + * Return: If '0' or 'disable' is provided, 0 is returned. 
If the text + * equivalent of a number is written, that number is returned. Otherwise, + * 1 is returned. Non-zero return values indicate collection should be enabled. + */ +static s64 ll_stats_pid_write(const char __user *buf, size_t len) +{ + unsigned long long value = 1; + char kernbuf[16]; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - seq_printf(m, "%u\n", osfs.os_bsize); - return rc; + rc = kstrtoull_from_user(buf, len, 0, &value); + if (rc < 0 && len < sizeof(kernbuf)) { + if (copy_from_user(kernbuf, buf, len)) + return -EFAULT; + kernbuf[len] = 0; + + if (kernbuf[len - 1] == '\n') + kernbuf[len - 1] = 0; + + if (strncasecmp(kernbuf, "disable", 7) == 0) + value = 0; + } + + return value; } -LPROC_SEQ_FOPS_RO(ll_blksize); -static int ll_stat_blksize_seq_show(struct seq_file *m, void *v) +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + struct obd_statfs osfs; + int rc; - seq_printf(m, "%u\n", sbi->ll_stat_blksize); + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - return 0; + return sprintf(buf, "%u\n", osfs.os_bsize); } +LUSTRE_RO_ATTR(blocksize); -static ssize_t ll_stat_blksize_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stat_blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - __s64 val; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", sbi->ll_stat_blksize); +} + +static ssize_t stat_blocksize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; @@ -98,173 +164,135 @@ static ssize_t ll_stat_blksize_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_stat_blksize); +LUSTRE_RW_ATTR(stat_blocksize); -static int ll_kbytestotal_seq_show(struct seq_file *m, void *v) +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; + u32 blk_size; + u64 result; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - while (blk_size >>= 1) - result <<= 1; + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; - seq_printf(m, "%llu\n", result); - } - return rc; + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); } -LPROC_SEQ_FOPS_RO(ll_kbytestotal); +LUSTRE_RO_ATTR(kbytestotal); -static int ll_kbytesfree_seq_show(struct seq_file *m, void *v) +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + 
struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; + u32 blk_size; + u64 result; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - while (blk_size >>= 1) - result <<= 1; + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; - seq_printf(m, "%llu\n", result); - } - return rc; + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); } -LPROC_SEQ_FOPS_RO(ll_kbytesfree); +LUSTRE_RO_ATTR(kbytesfree); -static int ll_kbytesavail_seq_show(struct seq_file *m, void *v) +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; + u32 blk_size; + u64 result; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - while (blk_size >>= 1) - result <<= 1; + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; - seq_printf(m, "%llu\n", result); - } - return rc; + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); } -LPROC_SEQ_FOPS_RO(ll_kbytesavail); +LUSTRE_RO_ATTR(kbytesavail); -static int ll_filestotal_seq_show(struct seq_file *m, void *v) +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - seq_printf(m, "%llu\n", osfs.os_files); - return rc; + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); } -LPROC_SEQ_FOPS_RO(ll_filestotal); +LUSTRE_RO_ATTR(filestotal); -static int ll_filesfree_seq_show(struct seq_file *m, void *v) +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); struct obd_statfs osfs; int rc; - LASSERT(sb != NULL); - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - seq_printf(m, "%llu\n", osfs.os_ffree); - return rc; -} -LPROC_SEQ_FOPS_RO(ll_filesfree); - -static int ll_client_type_seq_show(struct seq_file *m, void *v) -{ - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - - LASSERT(sbi != NULL); + rc = ll_statfs_internal(sbi, &osfs, OBD_STATFS_NODELAY); + if (rc) + return rc; - seq_puts(m, "local client\n"); - return 0; + return sprintf(buf, "%llu\n", osfs.os_ffree); } -LPROC_SEQ_FOPS_RO(ll_client_type); +LUSTRE_RO_ATTR(filesfree); -static int ll_fstype_seq_show(struct seq_file *m, void *v) +static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - - LASSERT(sb 
!= NULL); - seq_printf(m, "%s\n", sb->s_type->name); - return 0; + return sprintf(buf, "local client\n"); } -LPROC_SEQ_FOPS_RO(ll_fstype); +LUSTRE_RO_ATTR(client_type); -static int ll_sb_uuid_seq_show(struct seq_file *m, void *v) +static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - - LASSERT(sb != NULL); - seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); - return 0; + return sprintf(buf, "lustre\n"); } -LPROC_SEQ_FOPS_RO(ll_sb_uuid); +LUSTRE_RO_ATTR(fstype); -static int ll_xattr_cache_seq_show(struct seq_file *m, void *v) +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled); - return 0; + return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); } - -static ssize_t ll_xattr_cache_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); - __s64 val; - int rc; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc) - return rc; - - if (val != 0 && val != 1) - return -ERANGE; - - if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) - return -ENOTSUPP; - - sbi->ll_xattr_cache_enabled = val; - sbi->ll_xattr_cache_set = 1; - - return count; -} -LPROC_SEQ_FOPS(ll_xattr_cache); +LUSTRE_RO_ATTR(uuid); static int ll_site_stats_seq_show(struct seq_file *m, void *v) { @@ -276,21 +304,21 @@ static int ll_site_stats_seq_show(struct seq_file *m, void *v) */ return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); } -LPROC_SEQ_FOPS_RO(ll_site_stats); + +LDEBUGFS_SEQ_FOPS_RO(ll_site_stats); static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - long pages_number; - int mult; + unsigned long ra_max_mb; spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages; + ra_max_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages); spin_unlock(&sbi->ll_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, pages_number, mult); + seq_printf(m, "%lu\n", ra_max_mb); + return 0; } static ssize_t @@ -300,45 +328,43 @@ ll_max_readahead_mb_seq_write(struct file *file, const char __user *buffer, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - __s64 pages_number; + s64 ra_max_mb, pages_number; int rc; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_mb, 'M'); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - + pages_number = round_up(ra_max_mb, 1024 * 1024) >> PAGE_SHIFT; if (pages_number < 0 || pages_number > cfs_totalram_pages() / 2) { /* 1/2 of RAM */ - CERROR("%s: can't set max_readahead_mb=%lu > %luMB\n", - ll_get_fsname(sb, NULL, 0), - (unsigned long)pages_number >> (20 - PAGE_SHIFT), - cfs_totalram_pages() >> (20 - PAGE_SHIFT + 1)); + CERROR("%s: can't set max_readahead_mb=%llu > %luMB\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(cfs_totalram_pages())); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_pages = pages_number; spin_unlock(&sbi->ll_lock); + return count; } -LPROC_SEQ_FOPS(ll_max_readahead_mb); + 
+LDEBUGFS_SEQ_FOPS(ll_max_readahead_mb); static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - long pages_number; - int mult; + unsigned long ra_max_file_mb; spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + ra_max_file_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file); spin_unlock(&sbi->ll_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, pages_number, mult); + seq_printf(m, "%lu\n", ra_max_file_mb); + return 0; } static ssize_t @@ -349,44 +375,43 @@ ll_max_readahead_per_file_mb_seq_write(struct file *file, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_file_mb, pages_number; int rc; - __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_file_mb, + 'M'); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - + pages_number = round_up(ra_max_file_mb, 1024 * 1024) >> PAGE_SHIFT; if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { - CERROR("%s: can't set max_readahead_per_file_mb=%lu > " - "max_read_ahead_mb=%lu\n", ll_get_fsname(sb, NULL, 0), - (unsigned long)pages_number >> (20 - PAGE_SHIFT), - sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_SHIFT)); + CERROR("%s: can't set max_readahead_per_file_mb=%llu > max_read_ahead_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages)); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_pages_per_file = pages_number; spin_unlock(&sbi->ll_lock); + return count; } -LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb); + +LDEBUGFS_SEQ_FOPS(ll_max_readahead_per_file_mb); static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); - long pages_number; - int mult; + unsigned long ra_max_whole_mb; spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; + ra_max_whole_mb = PAGES_TO_MiB(sbi->ll_ra_info.ra_max_read_ahead_whole_pages); spin_unlock(&sbi->ll_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, pages_number, mult); + seq_printf(m, "%lu\n", ra_max_whole_mb); + return 0; } static ssize_t @@ -397,52 +422,50 @@ ll_max_read_ahead_whole_mb_seq_write(struct file *file, struct seq_file *m = file->private_data; struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); + s64 ra_max_whole_mb, pages_number; int rc; - __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &ra_max_whole_mb, + 'M'); if (rc) return rc; - pages_number >>= PAGE_SHIFT; - + pages_number = round_up(ra_max_whole_mb, 1024 * 1024) >> PAGE_SHIFT; /* Cap this at the current max readahead window size, the readahead - * algorithm does this anyway so it's pointless to set it larger. */ + * algorithm does this anyway so it's pointless to set it larger. 
+ */ if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { - int pages_shift = 20 - PAGE_SHIFT; - CERROR("%s: can't set max_read_ahead_whole_mb=%lu > " - "max_read_ahead_per_file_mb=%lu\n", - ll_get_fsname(sb, NULL, 0), - (unsigned long)pages_number >> pages_shift, - sbi->ll_ra_info.ra_max_pages_per_file >> pages_shift); + CERROR("%s: can't set max_read_ahead_whole_mb=%llu > max_read_ahead_per_file_mb=%lu\n", + ll_get_fsname(sb, NULL, 0), PAGES_TO_MiB(pages_number), + PAGES_TO_MiB(sbi->ll_ra_info.ra_max_pages_per_file)); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; spin_unlock(&sbi->ll_lock); + return count; } -LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb); + +LDEBUGFS_SEQ_FOPS(ll_max_read_ahead_whole_mb); static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; struct ll_sb_info *sbi = ll_s2sbi(sb); struct cl_client_cache *cache = sbi->ll_cache; - int shift = 20 - PAGE_SHIFT; long max_cached_mb; long unused_mb; - max_cached_mb = cache->ccc_lru_max >> shift; - unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; + max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max); + unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left)); seq_printf(m, "users: %d\n" - "max_cached_mb: %ld\n" - "used_mb: %ld\n" - "unused_mb: %ld\n" - "reclaim_count: %u\n", + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" + "reclaim_count: %u\n", atomic_read(&cache->ccc_users), max_cached_mb, max_cached_mb - unused_mb, @@ -451,9 +474,9 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) return 0; } -static ssize_t -ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t ll_max_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct super_block *sb = m->private; @@ -464,21 +487,20 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, long nrpages = 0; __u16 refcheck; __s64 pages_number; - long rc; + int rc; char kernbuf[128]; - ENTRY; + ENTRY; if (count >= sizeof(kernbuf)) RETURN(-EINVAL); - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) RETURN(-EFAULT); kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); if (rc) RETURN(rc); @@ -487,7 +509,7 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, if (pages_number < 0 || pages_number > cfs_totalram_pages()) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - cfs_totalram_pages() >> (20 - PAGE_SHIFT)); + PAGES_TO_MiB(cfs_totalram_pages())); RETURN(-ERANGE); } /* Allow enough cache so clients can make well-formed RPCs */ @@ -505,7 +527,7 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, env = cl_env_get(&refcheck); if (IS_ERR(env)) - RETURN(rc); + RETURN(PTR_ERR(env)); diff = -diff; while (diff > 0) { @@ -558,218 +580,225 @@ ll_max_cached_mb_seq_write(struct file *file, const char __user *buffer, } return rc; } -LPROC_SEQ_FOPS(ll_max_cached_mb); -static int ll_checksum_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS(ll_max_cached_mb); + +static ssize_t checksums_show(struct kobject *kobj, 
struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); - return 0; + return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); } -static ssize_t ll_checksum_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t checksums_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int tmp; int rc; - __s64 val; if (!sbi->ll_dt_exp) /* Not set up yet */ return -EAGAIN; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; if (val) sbi->ll_flags |= LL_SBI_CHECKSUM; else sbi->ll_flags &= ~LL_SBI_CHECKSUM; + tmp = val; rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(val), &val, NULL); + KEY_CHECKSUM, sizeof(tmp), &tmp, NULL); if (rc) CWARN("Failed to set OSC checksum flags: %d\n", rc); return count; } -LPROC_SEQ_FOPS(ll_checksum); +LUSTRE_RW_ATTR(checksums); + +LUSTRE_ATTR(checksum_pages, 0644, checksums_show, checksums_store); -static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type) +static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, + enum stats_track_type type) { - struct super_block *sb = m->private; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - if (ll_s2sbi(sb)->ll_stats_track_type == type) { - seq_printf(m, "%d\n", - ll_s2sbi(sb)->ll_stats_track_id); - } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) { - seq_puts(m, "0 (all)\n"); - } else { - seq_puts(m, "untracked\n"); - } - return 0; + if (sbi->ll_stats_track_type == type) + return sprintf(buf, "%d\n", sbi->ll_stats_track_id); + else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + return sprintf(buf, "0 (all)\n"); + + return sprintf(buf, "untracked\n"); } -static int ll_wr_track_id(struct file *file, - const char __user *buffer, unsigned long count, - void *data, enum stats_track_type type) +static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, + size_t count, enum stats_track_type type) { - struct super_block *sb = data; + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long pid; int rc; - __s64 pid; - rc = lprocfs_str_to_s64(file, buffer, count, &pid); + rc = kstrtoul(buffer, 10, &pid); if (rc) return rc; - if (pid > INT_MAX || pid < 0) - return -ERANGE; - ll_s2sbi(sb)->ll_stats_track_id = pid; + sbi->ll_stats_track_id = pid; if (pid == 0) - ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; + sbi->ll_stats_track_type = STATS_TRACK_ALL; else - ll_s2sbi(sb)->ll_stats_track_type = type; - lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); + sbi->ll_stats_track_type = type; + lprocfs_clear_stats(sbi->ll_stats); return count; } -static int ll_track_pid_seq_show(struct seq_file *m, void *v) +static ssize_t stats_track_pid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - return ll_rd_track_id(m, STATS_TRACK_PID); + return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); } -static ssize_t ll_track_pid_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t 
*off) +static ssize_t stats_track_pid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - return ll_wr_track_id(file, buffer, count, seq->private, - STATS_TRACK_PID); + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); } -LPROC_SEQ_FOPS(ll_track_pid); +LUSTRE_RW_ATTR(stats_track_pid); -static int ll_track_ppid_seq_show(struct seq_file *m, void *v) +static ssize_t stats_track_ppid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - return ll_rd_track_id(m, STATS_TRACK_PPID); + return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); } -static ssize_t ll_track_ppid_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stats_track_ppid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - return ll_wr_track_id(file, buffer, count, seq->private, - STATS_TRACK_PPID); + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); } -LPROC_SEQ_FOPS(ll_track_ppid); +LUSTRE_RW_ATTR(stats_track_ppid); -static int ll_track_gid_seq_show(struct seq_file *m, void *v) +static ssize_t stats_track_gid_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - return ll_rd_track_id(m, STATS_TRACK_GID); + return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); } -static ssize_t ll_track_gid_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stats_track_gid_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - return ll_wr_track_id(file, buffer, count, seq->private, - STATS_TRACK_GID); + return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); } -LPROC_SEQ_FOPS(ll_track_gid); +LUSTRE_RW_ATTR(stats_track_gid); -static int ll_statahead_running_max_seq_show(struct seq_file *m, void *v) +static ssize_t statahead_running_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", sbi->ll_sa_running_max); - return 0; + return snprintf(buf, 16, "%u\n", sbi->ll_sa_running_max); } -static ssize_t ll_statahead_running_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t statahead_running_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoul(buffer, 0, &val); if (rc) return rc; - if (val >= 0 || val <= LL_SA_RUNNING_MAX) + if (val <= LL_SA_RUNNING_MAX) { sbi->ll_sa_running_max = val; - else - CERROR("%s: bad statahead_running_max value %lld. Valid values " - "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), - val, LL_SA_RUNNING_MAX); + return count; + } - return count; + CERROR("Bad statahead_running_max value %lu. 
Valid values " + "are in the range [0, %d]\n", val, LL_SA_RUNNING_MAX); + + return -ERANGE; } -LPROC_SEQ_FOPS(ll_statahead_running_max); +LUSTRE_RW_ATTR(statahead_running_max); -static int ll_statahead_max_seq_show(struct seq_file *m, void *v) +static ssize_t statahead_max_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", sbi->ll_sa_max); - return 0; + return sprintf(buf, "%u\n", sbi->ll_sa_max); } -static ssize_t ll_statahead_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t statahead_max_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned long val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoul(buffer, 0, &val); if (rc) return rc; - if (val >= 0 && val <= LL_SA_RPC_MAX) + if (val <= LL_SA_RPC_MAX) sbi->ll_sa_max = val; else - CERROR("%s: bad statahead_max value %lld. Valid values are in " - "are in the range [0, %u]\n", ll_get_fsname(sb, NULL, 0), + CERROR("Bad statahead_max value %lu. Valid values are in the range [0, %d]\n", val, LL_SA_RPC_MAX); return count; } -LPROC_SEQ_FOPS(ll_statahead_max); +LUSTRE_RW_ATTR(statahead_max); -static int ll_statahead_agl_seq_show(struct seq_file *m, void *v) +static ssize_t statahead_agl_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", - sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); - return 0; + return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 
1 : 0); } -static ssize_t ll_statahead_agl_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t statahead_agl_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; @@ -780,7 +809,7 @@ static ssize_t ll_statahead_agl_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_statahead_agl); +LUSTRE_RW_ATTR(statahead_agl); static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) { @@ -788,35 +817,37 @@ static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) struct ll_sb_info *sbi = ll_s2sbi(sb); seq_printf(m, "statahead total: %u\n" - "statahead wrong: %u\n" - "agl total: %u\n", - atomic_read(&sbi->ll_sa_total), - atomic_read(&sbi->ll_sa_wrong), - atomic_read(&sbi->ll_agl_total)); + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); return 0; } -LPROC_SEQ_FOPS_RO(ll_statahead_stats); -static int ll_lazystatfs_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS_RO(ll_statahead_stats); + +static ssize_t lazystatfs_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", - (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0); - return 0; + return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 
1 : 0); } -static ssize_t ll_lazystatfs_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t lazystatfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; @@ -827,12 +858,74 @@ static ssize_t ll_lazystatfs_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_lazystatfs); +LUSTRE_RW_ATTR(lazystatfs); -static int ll_max_easize_seq_show(struct seq_file *m, void *v) +static ssize_t statfs_max_age_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_statfs_max_age); +} + +static ssize_t statfs_max_age_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + if (val > OBD_STATFS_CACHE_MAX_AGE) + return -EINVAL; + + sbi->ll_statfs_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(statfs_max_age); + +static ssize_t neg_dentry_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", sbi->ll_neg_dentry_timeout); +} + +static ssize_t neg_dentry_timeout_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + int val; + int rc; + + rc = kstrtoint(buffer, 10, &val); + if (rc) + return rc; + if (val < OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS) + return -EINVAL; + + sbi->ll_neg_dentry_timeout = val; + + return count; +} +LUSTRE_RW_ATTR(neg_dentry_timeout); + +static ssize_t max_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); unsigned int ealen; int rc; @@ -840,10 +933,9 @@ static int ll_max_easize_seq_show(struct seq_file *m, void *v) if (rc) return rc; - seq_printf(m, "%u\n", ealen); - return 0; + return sprintf(buf, "%u\n", ealen); } -LPROC_SEQ_FOPS_RO(ll_max_easize); +LUSTRE_RO_ATTR(max_easize); /** * Get default_easize. 
@@ -856,10 +948,12 @@ LPROC_SEQ_FOPS_RO(ll_max_easize); * \retval 0 on success * \retval negative negated errno on failure */ -static int ll_default_easize_seq_show(struct seq_file *m, void *v) +static ssize_t default_easize_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); unsigned int ealen; int rc; @@ -867,8 +961,7 @@ static int ll_default_easize_seq_show(struct seq_file *m, void *v) if (rc) return rc; - seq_printf(m, "%u\n", ealen); - return 0; + return sprintf(buf, "%u\n", ealen); } /** @@ -887,24 +980,22 @@ static int ll_default_easize_seq_show(struct seq_file *m, void *v) * \retval positive \a count on success * \retval negative negated errno on failure */ -static ssize_t ll_default_easize_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *unused) -{ - struct seq_file *seq = file->private_data; - struct super_block *sb = (struct super_block *)seq->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - __s64 val; +static ssize_t default_easize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; int rc; if (count == 0) return 0; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; rc = ll_set_default_mdsize(sbi, val); if (rc) @@ -912,7 +1003,7 @@ static ssize_t ll_default_easize_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ll_default_easize); +LUSTRE_RW_ATTR(default_easize); static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) { @@ -936,74 +1027,142 @@ static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) seq_printf(m, "\b\n"); return 0; } -LPROC_SEQ_FOPS_RO(ll_sbi_flags); -static int ll_fast_read_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS_RO(ll_sbi_flags); + +static ssize_t xattr_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); - return 0; + return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); } -static ssize_t -ll_fast_read_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t xattr_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + if (val && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) + return -ENOTSUPP; + + sbi->ll_xattr_cache_enabled = val; + sbi->ll_xattr_cache_set = 1; + + return count; +} +LUSTRE_RW_ATTR(xattr_cache); + +static ssize_t tiny_write_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_TINY_WRITE)); +} + +static ssize_t 
tiny_write_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); if (rc) return rc; spin_lock(&sbi->ll_lock); - if (val == 1) - sbi->ll_flags |= LL_SBI_FAST_READ; + if (val) + sbi->ll_flags |= LL_SBI_TINY_WRITE; else - sbi->ll_flags &= ~LL_SBI_FAST_READ; + sbi->ll_flags &= ~LL_SBI_TINY_WRITE; spin_unlock(&sbi->ll_lock); return count; } -LPROC_SEQ_FOPS(ll_fast_read); +LUSTRE_RW_ATTR(tiny_write); -static int ll_pio_seq_show(struct seq_file *m, void *v) +static ssize_t fast_read_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); - seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_PIO)); - return 0; + return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ)); } -static ssize_t ll_pio_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t fast_read_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; spin_lock(&sbi->ll_lock); - if (val == 1) - sbi->ll_flags |= LL_SBI_PIO; + if (val) + sbi->ll_flags |= LL_SBI_FAST_READ; else - sbi->ll_flags &= ~LL_SBI_PIO; + sbi->ll_flags &= ~LL_SBI_FAST_READ; spin_unlock(&sbi->ll_lock); return count; } -LPROC_SEQ_FOPS(ll_pio); +LUSTRE_RW_ATTR(fast_read); + +static ssize_t inode_cache_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_inode_cache_enabled); +} + +static ssize_t inode_cache_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + sbi->ll_inode_cache_enabled = val; + + return count; +} +LUSTRE_RW_ATTR(inode_cache); static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) { @@ -1017,8 +1176,8 @@ static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) mb = (pages * PAGE_SIZE) >> 20; seq_printf(m, "unstable_check: %8d\n" - "unstable_pages: %12ld\n" - "unstable_mb: %8d\n", + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", cache->ccc_unstable_check, pages, mb); return 0; } @@ -1030,32 +1189,33 @@ static ssize_t ll_unstable_stats_seq_write(struct file *file, struct seq_file *seq = file->private_data; struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)seq->private); char kernbuf[128]; + bool val; int rc; - __s64 val; if (count == 0) return 0; if (count >= sizeof(kernbuf)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - kernbuf; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool_from_user(buffer, 
count, &val); if (rc < 0) return rc; /* borrow lru lock to set the value */ spin_lock(&sbi->ll_cache->ccc_lru_lock); - sbi->ll_cache->ccc_unstable_check = !!val; + sbi->ll_cache->ccc_unstable_check = val; spin_unlock(&sbi->ll_cache->ccc_lru_lock); return count; } -LPROC_SEQ_FOPS(ll_unstable_stats); + +LDEBUGFS_SEQ_FOPS(ll_unstable_stats); static int ll_root_squash_seq_show(struct seq_file *m, void *v) { @@ -1076,10 +1236,11 @@ static ssize_t ll_root_squash_seq_write(struct file *file, struct ll_sb_info *sbi = ll_s2sbi(sb); struct root_squash_info *squash = &sbi->ll_squash; - return lprocfs_wr_root_squash(file, buffer, count, squash, + return lprocfs_wr_root_squash(buffer, count, squash, ll_get_fsname(sb, NULL, 0)); } -LPROC_SEQ_FOPS(ll_root_squash); + +LDEBUGFS_SEQ_FOPS(ll_root_squash); static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) { @@ -1112,7 +1273,7 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, struct root_squash_info *squash = &sbi->ll_squash; int rc; - rc = lprocfs_wr_nosquash_nids(file, buffer, count, squash, + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, ll_get_fsname(sb, NULL, 0)); if (rc < 0) return rc; @@ -1121,80 +1282,79 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, return rc; } -LPROC_SEQ_FOPS(ll_nosquash_nids); -struct lprocfs_vars lprocfs_llite_obd_vars[] = { - { .name = "uuid", - .fops = &ll_sb_uuid_fops }, - { .name = "fstype", - .fops = &ll_fstype_fops }, +LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); + +struct ldebugfs_vars lprocfs_llite_obd_vars[] = { { .name = "site", .fops = &ll_site_stats_fops }, - { .name = "blocksize", - .fops = &ll_blksize_fops }, - { .name = "stat_blocksize", - .fops = &ll_stat_blksize_fops }, - { .name = "kbytestotal", - .fops = &ll_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &ll_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &ll_kbytesavail_fops }, - { .name = "filestotal", - .fops = &ll_filestotal_fops }, - { .name = "filesfree", - .fops = &ll_filesfree_fops }, - { .name = "client_type", - .fops = &ll_client_type_fops }, - { .name = "max_read_ahead_mb", - .fops = &ll_max_readahead_mb_fops }, - { .name = "max_read_ahead_per_file_mb", - .fops = &ll_max_readahead_per_file_mb_fops }, - { .name = "max_read_ahead_whole_mb", - .fops = &ll_max_read_ahead_whole_mb_fops }, + { .name = "max_read_ahead_mb", + .fops = &ll_max_readahead_mb_fops }, + { .name = "max_read_ahead_per_file_mb", + .fops = &ll_max_readahead_per_file_mb_fops }, + { .name = "max_read_ahead_whole_mb", + .fops = &ll_max_read_ahead_whole_mb_fops }, { .name = "max_cached_mb", .fops = &ll_max_cached_mb_fops }, - { .name = "checksum_pages", - .fops = &ll_checksum_fops }, - { .name = "stats_track_pid", - .fops = &ll_track_pid_fops }, - { .name = "stats_track_ppid", - .fops = &ll_track_ppid_fops }, - { .name = "stats_track_gid", - .fops = &ll_track_gid_fops }, - { .name = "statahead_max", - .fops = &ll_statahead_max_fops }, - { .name = "statahead_running_max", - .fops = &ll_statahead_running_max_fops }, - { .name = "statahead_agl", - .fops = &ll_statahead_agl_fops }, { .name = "statahead_stats", .fops = &ll_statahead_stats_fops }, - { .name = "lazystatfs", - .fops = &ll_lazystatfs_fops }, - { .name = "max_easize", - .fops = &ll_max_easize_fops }, - { .name = "default_easize", - .fops = &ll_default_easize_fops }, - { .name = "sbi_flags", - .fops = &ll_sbi_flags_fops }, - { .name = "xattr_cache", - .fops = &ll_xattr_cache_fops }, { .name = "unstable_stats", .fops = &ll_unstable_stats_fops }, + { .name = "sbi_flags", + 
.fops = &ll_sbi_flags_fops }, { .name = "root_squash", .fops = &ll_root_squash_fops }, { .name = "nosquash_nids", .fops = &ll_nosquash_nids_fops }, - { .name = "fast_read", - .fops = &ll_fast_read_fops, }, - { .name = "pio", - .fops = &ll_pio_fops, }, { NULL } }; #define MAX_STRING_SIZE 128 +static struct attribute *llite_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_stat_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_client_type.attr, + &lustre_attr_fstype.attr, + &lustre_attr_uuid.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_pages.attr, + &lustre_attr_stats_track_pid.attr, + &lustre_attr_stats_track_ppid.attr, + &lustre_attr_stats_track_gid.attr, + &lustre_attr_statahead_running_max.attr, + &lustre_attr_statahead_max.attr, + &lustre_attr_statahead_agl.attr, + &lustre_attr_lazystatfs.attr, + &lustre_attr_statfs_max_age.attr, + &lustre_attr_max_easize.attr, + &lustre_attr_default_easize.attr, + &lustre_attr_xattr_cache.attr, + &lustre_attr_fast_read.attr, + &lustre_attr_tiny_write.attr, + &lustre_attr_neg_dentry_timeout.attr, + &lustre_attr_inode_cache.attr, + NULL, +}; + +static void llite_kobj_release(struct kobject *kobj) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + complete(&sbi->ll_kobj_unregister); +} + +static struct kobj_type llite_ktype = { + .default_attrs = llite_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = llite_kobj_release, +}; + static const struct llite_file_opcode { __u32 opcode; __u32 type; @@ -1280,61 +1440,45 @@ static const char *ra_stat_string[] = { [RA_STAT_FAILED_REACH_END] = "failed to reach end" }; -LPROC_SEQ_FOPS_RO_TYPE(llite, name); -LPROC_SEQ_FOPS_RO_TYPE(llite, uuid); - -int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, - struct super_block *sb) +int ll_debugfs_register_super(struct super_block *sb, const char *name) { - struct lprocfs_vars lvars[2]; struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned long cfg_instance = ll_get_cfg_instance(sb); - char name[MAX_STRING_SIZE + 1], *ptr; - int err, id, len, rc; - ENTRY; - - memset(lvars, 0, sizeof(lvars)); - - name[MAX_STRING_SIZE] = '\0'; - lvars[0].name = name; + int err, id, rc; - LASSERT(sbi != NULL); - - /* Get fsname */ - len = strlen(lsi->lsi_lmd->lmd_profile); - ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; + ENTRY; + LASSERT(sbi); - /* Mount info */ - snprintf(name, MAX_STRING_SIZE, "%.*s-%016lx", len, - lsi->lsi_lmd->lmd_profile, cfg_instance); + if (IS_ERR_OR_NULL(llite_root)) + goto out_ll_kset; - sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); - if (IS_ERR(sbi->ll_proc_root)) { - err = PTR_ERR(sbi->ll_proc_root); - sbi->ll_proc_root = NULL; + sbi->ll_debugfs_entry = ldebugfs_register(name, llite_root, + lprocfs_llite_obd_vars, sb); + if (IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) { + err = sbi->ll_debugfs_entry ? 
PTR_ERR(sbi->ll_debugfs_entry) : + -ENOMEM; + sbi->ll_debugfs_entry = NULL; RETURN(err); } - rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, - &vvp_dump_pgcache_file_ops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "dump_page_cache",0444, + &vvp_dump_pgcache_file_ops, sbi); if (rc) CWARN("Error adding the dump_page_cache file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, - &ll_rw_extents_stats_fops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); if (rc) CWARN("Error adding the extent_stats file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", - 0644, &ll_rw_extents_stats_pp_fops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, + "extents_stats_per_process", 0644, + &ll_rw_extents_stats_pp_fops, sbi); if (rc) CWARN("Error adding the extents_stats_per_process file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, - &ll_rw_offset_stats_fops, sbi); + rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); if (rc) CWARN("Error adding the offset_stats file\n"); @@ -1342,11 +1486,13 @@ int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, LPROCFS_STATS_FLAG_NONE); if (sbi->ll_stats == NULL) - GOTO(out, err = -ENOMEM); + GOTO(out_debugfs, err = -ENOMEM); + /* do counter init */ for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { - __u32 type = llite_opcode_table[id].type; + u32 type = llite_opcode_table[id].type; void *ptr = NULL; + if (type & LPROCFS_TYPE_REGS) ptr = "regs"; else if (type & LPROCFS_TYPE_BYTES) @@ -1358,98 +1504,78 @@ int lprocfs_ll_register_mountpoint(struct proc_dir_entry *parent, (type & LPROCFS_CNTR_AVGMINMAX), llite_opcode_table[id].opname, ptr); } - err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); + + err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "stats", + sbi->ll_stats); if (err) - GOTO(out, err); + GOTO(out_stats, err); sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), LPROCFS_STATS_FLAG_NONE); if (sbi->ll_ra_stats == NULL) - GOTO(out, err = -ENOMEM); + GOTO(out_stats, err = -ENOMEM); for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) lprocfs_counter_init(sbi->ll_ra_stats, id, 0, ra_stat_string[id], "pages"); - err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", - sbi->ll_ra_stats); - if (err) - GOTO(out, err); + err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + GOTO(out_ra_stats, err); + +out_ll_kset: + /* Yes we also register sysfs mount kset here as well */ + sbi->ll_kset.kobj.parent = llite_kobj; + sbi->ll_kset.kobj.ktype = &llite_ktype; + init_completion(&sbi->ll_kobj_unregister); + err = kobject_set_name(&sbi->ll_kset.kobj, "%s", name); + if (err) + GOTO(out_ra_stats, err); - err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + err = kset_register(&sbi->ll_kset); if (err) - GOTO(out, err); + GOTO(out_ra_stats, err); + + lsi->lsi_kobj = kobject_get(&sbi->ll_kset.kobj); + + RETURN(0); +out_ra_stats: + lprocfs_free_stats(&sbi->ll_ra_stats); +out_stats: + lprocfs_free_stats(&sbi->ll_stats); +out_debugfs: + ldebugfs_remove(&sbi->ll_debugfs_entry); -out: - if (err) { - lprocfs_remove(&sbi->ll_proc_root); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } RETURN(err); } -int lprocfs_ll_register_obd(struct 
super_block *sb, const char *obdname) +void ll_debugfs_unregister_super(struct super_block *sb) { - struct lprocfs_vars lvars[2]; + struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct proc_dir_entry *dir; - char name[MAX_STRING_SIZE + 1]; - int err; - ENTRY; - memset(lvars, 0, sizeof(lvars)); + if (!IS_ERR_OR_NULL(sbi->ll_debugfs_entry)) + ldebugfs_remove(&sbi->ll_debugfs_entry); - name[MAX_STRING_SIZE] = '\0'; - lvars[0].name = name; + if (sbi->ll_dt_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_dt_obd->obd_type->typ_name); - LASSERT(sbi != NULL); - LASSERT(obdname != NULL); - - obd = class_name2obd(obdname); - - LASSERT(obd != NULL); - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - LASSERT(obd->obd_type->typ_name != NULL); - - dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root); - if (dir == NULL) - GOTO(out, err = -ENOMEM); - - snprintf(name, MAX_STRING_SIZE, "common_name"); - lvars[0].fops = &llite_name_fops; - err = lprocfs_add_vars(dir, lvars, obd); - if (err) - GOTO(out, err); + if (sbi->ll_md_obd) + sysfs_remove_link(&sbi->ll_kset.kobj, + sbi->ll_md_obd->obd_type->typ_name); - snprintf(name, MAX_STRING_SIZE, "uuid"); - lvars[0].fops = &llite_uuid_fops; - err = lprocfs_add_vars(dir, lvars, obd); - if (err) - GOTO(out, err); + kobject_put(lsi->lsi_kobj); -out: - if (err) { - lprocfs_remove(&sbi->ll_proc_root); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } - RETURN(err); -} + kset_unregister(&sbi->ll_kset); + wait_for_completion(&sbi->ll_kobj_unregister); -void lprocfs_ll_unregister_mountpoint(struct ll_sb_info *sbi) -{ - if (sbi->ll_proc_root) { - lprocfs_remove(&sbi->ll_proc_root); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); } #undef MAX_STRING_SIZE -#define pct(a,b) (b ? a * 100 / b : 0) - static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, struct seq_file *seq, int which) { @@ -1473,14 +1599,14 @@ static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, w = pp_info->pp_w_hist.oh_buckets[i]; read_cum += r; write_cum += w; - end = 1 << (i + LL_HIST_START - units); - seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | " - "%14lu %4lu %4lu\n", start, *unitp, end, *unitp, + end = BIT(i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4u %4u | " + "%14lu %4u %4u\n", start, *unitp, end, *unitp, (i == LL_HIST_MAX - 1) ? 
'+' : ' ', r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), pct(write_cum, write_tot)); start = end; - if (start == 1<<10) { + if (start == BIT(10)) { start = 1; units += 10; unitp++; @@ -1535,7 +1661,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(file, buf, len); + value = ll_stats_pid_write(buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1552,7 +1678,7 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, return len; } -LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats_pp); static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) { @@ -1593,7 +1719,7 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(file, buf, len); + value = ll_stats_pid_write(buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1610,7 +1736,8 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, return len; } -LPROC_SEQ_FOPS(ll_rw_extents_stats); + +LDEBUGFS_SEQ_FOPS(ll_rw_extents_stats); void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct ll_file_data *file, loff_t pos, @@ -1647,15 +1774,15 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); } - for(i = 0; (count >= (1 << LL_HIST_START << i)) && - (i < (LL_HIST_MAX - 1)); i++); - if (rw == 0) { - io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; - } else { - io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; - } + for (i = 0; (count >= BIT(LL_HIST_START + i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } spin_unlock(&sbi->ll_pp_extent_lock); spin_lock(&sbi->ll_process_lock); @@ -1741,7 +1868,7 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { if (offset[i].rw_pid != 0) seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", offset[i].rw_op == READ ? 'R' : 'W', offset[i].rw_pid, offset[i].rw_range_start, @@ -1755,7 +1882,7 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { if (process[i].rw_pid != 0) seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + "%3c %10d %14llu %14llu %17lu %17lu %14lld\n", process[i].rw_op == READ ? 'R' : 'W', process[i].rw_pid, process[i].rw_range_start, @@ -1782,7 +1909,7 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, if (len == 0) return -EINVAL; - value = ll_stats_pid_write(file, buf, len); + value = ll_stats_pid_write(buf, len); if (value == 0) sbi->ll_rw_stats_on = 0; @@ -1801,43 +1928,4 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, return len; } -/** - * ll_stats_pid_write() - Determine if stats collection should be enabled - * @buf: Buffer containing the data written - * @len: Number of bytes in the buffer - * - * Several proc files begin collecting stats when a value is written, and stop - * collecting when either '0' or 'disable' is written. 
This function checks the - * written value to see if collection should be enabled or disabled. - * - * Return: If '0' or 'disable' is provided, 0 is returned. If the text - * equivalent of a number is written, that number is returned. Otherwise, - * 1 is returned. Non-zero return values indicate collection should be enabled. - */ -static __s64 ll_stats_pid_write(struct file *file, const char __user *buf, - size_t len) -{ - __s64 value = 1; - int rc; - char kernbuf[16]; - - rc = lprocfs_str_to_s64(file, buf, len, &value); - - if (rc < 0 && len < sizeof(kernbuf)) { - - if (lprocfs_copy_from_user(file, kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strncasecmp(kernbuf, "disable", 7) == 0) - value = 0; - } - - return value; -} - -LPROC_SEQ_FOPS(ll_rw_offset_stats); -#endif /* CONFIG_PROC_FS */ +LDEBUGFS_SEQ_FOPS(ll_rw_offset_stats); diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index 3ff57049caa9c..61349f5cb65e5 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,7 +46,6 @@ #include #include #include -#include #include "llite_internal.h" #ifndef HAVE_USER_NAMESPACE_ARG @@ -150,6 +149,9 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, inode_has_no_xattr(inode); unlock_new_inode(inode); } + } else if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { rc = ll_update_inode(inode, md); CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", @@ -193,168 +195,314 @@ int ll_test_inode_by_fid(struct inode *inode, void *opaque) return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); } -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) +static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) { - struct lustre_handle lockh; + struct lu_env *env; + struct ll_inode_info *lli = ll_i2info(inode); + __u16 refcheck; int rc; ENTRY; - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); - RETURN(rc); - } - break; - case LDLM_CB_CANCELING: { - struct inode *inode = ll_inode_from_resource_lock(lock); - __u64 bits = lock->l_policy_data.l_inodebits.bits; + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* reach MDC layer to flush data under the DoM ldlm lock */ + rc = cl_object_flush(env, lli->lli_clob, lock); + if (rc == -ENODATA) { + CDEBUG(D_INODE, "inode "DFID" layout has no DoM stripe\n", + PFID(ll_inode2fid(inode))); + /* most likely result of layout change, do nothing */ + rc = 0; + } - /* Inode is set to lock->l_resource->lr_lvb_inode - * for mdc - bug 24555 */ - LASSERT(lock->l_ast_data == NULL); + cl_env_put(env, &refcheck); + RETURN(rc); +} - if (inode == NULL) - break; +static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) +{ + struct inode *inode = ll_inode_from_resource_lock(lock); + __u64 bits = to_cancel; + int rc; - /* Invalidate all dentries associated with this inode */ - 
LASSERT(ldlm_is_canceling(lock)); + ENTRY; + + if (!inode) { + /* That means the inode is evicted most likely and may cause + * the skipping of lock cleanups below, so print the message + * about that in log. + */ + if (lock->l_resource->lr_lvb_inode) + LDLM_DEBUG(lock, + "can't take inode for the lock (%sevicted)\n", + lock->l_resource->lr_lvb_inode->i_state & + I_FREEING ? "" : "not "); + RETURN_EXIT; + } + + if (!fid_res_name_eq(ll_inode2fid(inode), + &lock->l_resource->lr_name)) { + LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", + PFID(ll_inode2fid(inode)), inode); + LBUG(); + } + + if (bits & MDS_INODELOCK_XATTR) { + if (S_ISDIR(inode->i_mode)) + ll_i2info(inode)->lli_def_stripe_offset = -1; + ll_xattr_cache_destroy(inode); + bits &= ~MDS_INODELOCK_XATTR; + } - if (!fid_res_name_eq(ll_inode2fid(inode), - &lock->l_resource->lr_name)) { - LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)", - PFID(ll_inode2fid(inode)), inode); + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, lock->l_req_mode); + + if (bits & MDS_INODELOCK_OPEN) { + fmode_t fmode; + + switch (lock->l_req_mode) { + case LCK_CW: + fmode = FMODE_WRITE; + break; + case LCK_PR: + fmode = FMODE_EXEC; + break; + case LCK_CR: + fmode = FMODE_READ; + break; + default: + LDLM_ERROR(lock, "bad lock mode for OPEN lock"); LBUG(); } - if (bits & MDS_INODELOCK_XATTR) { - if (S_ISDIR(inode->i_mode)) - ll_i2info(inode)->lli_def_stripe_offset = -1; - ll_xattr_cache_destroy(inode); - bits &= ~MDS_INODELOCK_XATTR; - } + ll_md_real_close(inode, fmode); - /* For OPEN locks we differentiate between lock modes - * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ - if (bits & MDS_INODELOCK_OPEN) - ll_have_md_lock(inode, &bits, lock->l_req_mode); - - if (bits & MDS_INODELOCK_OPEN) { - fmode_t fmode; - - switch (lock->l_req_mode) { - case LCK_CW: - fmode = FMODE_WRITE; - break; - case LCK_PR: - fmode = FMODE_EXEC; - break; - case LCK_CR: - fmode = FMODE_READ; - break; - default: - LDLM_ERROR(lock, "bad lock mode for OPEN lock"); - LBUG(); - } + bits &= ~MDS_INODELOCK_OPEN; + } - ll_md_real_close(inode, fmode); + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM | + MDS_INODELOCK_DOM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); - bits &= ~MDS_INODELOCK_OPEN; - } + if (bits & MDS_INODELOCK_DOM) { + rc = ll_dom_lock_cancel(inode, lock); + if (rc < 0) + CDEBUG(D_INODE, "cannot flush DoM data " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } - if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) - ll_have_md_lock(inode, &bits, LCK_MINMODE); - - if (bits & MDS_INODELOCK_LAYOUT) { - struct cl_object_conf conf = { - .coc_opc = OBJECT_CONF_INVALIDATE, - .coc_inode = inode, - }; - - rc = ll_layout_conf(inode, &conf); - if (rc < 0) - CDEBUG(D_INODE, "cannot invalidate layout of " - DFID": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { + .coc_opc = OBJECT_CONF_INVALIDATE, + .coc_inode = inode, + }; - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); - lli->lli_update_atime = 1; - } + rc = ll_layout_conf(inode, &conf); + if (rc < 0) + CDEBUG(D_INODE, "cannot invalidate layout of " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } + + if (bits & MDS_INODELOCK_UPDATE) { + struct ll_inode_info *lli = ll_i2info(inode); + 
+ lli->lli_update_atime = 1; + } - if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); + if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { + struct ll_inode_info *lli = ll_i2info(inode); - CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " - "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), - lli, PFID(&lli->lli_pfid)); - truncate_inode_pages(inode->i_mapping, 0); + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " + "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), + lli, PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); - if (unlikely(!fid_is_zero(&lli->lli_pfid))) { - struct inode *master_inode = NULL; - unsigned long hash; + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; - /* This is slave inode, since all of the child - * dentry is connected on the master inode, so - * we have to invalidate the negative children - * on master inode */ - CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", - PFID(ll_inode2fid(inode)), - PFID(&lli->lli_pfid)); + /* This is slave inode, since all of the child dentry + * is connected on the master inode, so we have to + * invalidate the negative children on master inode */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid)); - hash = cl_fid_build_ino(&lli->lli_pfid, + hash = cl_fid_build_ino(&lli->lli_pfid, ll_need_32bit_api(ll_i2sbi(inode))); - /* Do not lookup the inode with ilookup5, - * otherwise it will cause dead lock, - * - * 1. Client1 send chmod req to the MDT0, then - * on MDT0, it enqueues master and all of its - * slaves lock, (mdt_attr_set() -> - * mdt_lock_slaves()), after gets master and - * stripe0 lock, it will send the enqueue req - * (for stripe1) to MDT1, then MDT1 finds the - * lock has been granted to client2. Then MDT1 - * sends blocking ast to client2. - * - * 2. At the same time, client2 tries to unlink - * the striped dir (rm -rf striped_dir), and - * during lookup, it will hold the master inode - * of the striped directory, whose inode state - * is NEW, then tries to revalidate all of its - * slaves, (ll_prep_inode()->ll_iget()-> - * ll_read_inode2()-> ll_update_inode().). And - * it will be blocked on the server side because - * of 1. - * - * 3. Then the client get the blocking_ast req, - * cancel the lock, but being blocked if using - * ->ilookup5()), because master inode state is - * NEW. */ - master_inode = ilookup5_nowait(inode->i_sb, - hash, ll_test_inode_by_fid, + /* Do not lookup the inode with ilookup5, otherwise + * it will cause dead lock, + * 1. Client1 send chmod req to the MDT0, then on MDT0, + * it enqueues master and all of its slaves lock, + * (mdt_attr_set() -> mdt_lock_slaves()), after gets + * master and stripe0 lock, it will send the enqueue + * req (for stripe1) to MDT1, then MDT1 finds the lock + * has been granted to client2. Then MDT1 sends blocking + * ast to client2. + * 2. At the same time, client2 tries to unlink + * the striped dir (rm -rf striped_dir), and during + * lookup, it will hold the master inode of the striped + * directory, whose inode state is NEW, then tries to + * revalidate all of its slaves, (ll_prep_inode()-> + * ll_iget()->ll_read_inode2()-> ll_update_inode().). + * And it will be blocked on the server side because + * of 1. + * 3. Then the client get the blocking_ast req, cancel + * the lock, but being blocked if using ->ilookup5()), + * because master inode state is NEW. 
*/ + master_inode = ilookup5_nowait(inode->i_sb, hash, + ll_test_inode_by_fid, (void *)&lli->lli_pfid); - if (master_inode) { - ll_invalidate_negative_children( - master_inode); - iput(master_inode); - } - } else { - ll_invalidate_negative_children(inode); + if (master_inode) { + ll_invalidate_negative_children(master_inode); + iput(master_inode); } + } else { + ll_invalidate_negative_children(inode); } + } - if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && - inode->i_sb->s_root != NULL && - inode != inode->i_sb->s_root->d_inode) - ll_invalidate_aliases(inode); + if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && + inode->i_sb->s_root != NULL && + inode != inode->i_sb->s_root->d_inode) + ll_invalidate_aliases(inode); - iput(inode); + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) + forget_all_cached_acls(inode); + + iput(inode); + RETURN_EXIT; +} + +/* Check if the given lock may be downgraded instead of canceling and + * that convert is really needed. */ +int ll_md_need_convert(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + struct inode *inode; + __u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits; + __u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted; + enum ldlm_mode mode = LCK_MINMODE; + + if (!lock->l_conn_export || + !exp_connect_lock_convert(lock->l_conn_export)) + return 0; + + if (!wanted || !bits || ldlm_is_cancel(lock)) + return 0; + + /* do not convert locks other than DOM for now */ + if (!((bits | wanted) & MDS_INODELOCK_DOM)) + return 0; + + /* We may have already remaining bits in some other lock so + * lock convert will leave us just extra lock for the same bit. + * Check if client has other lock with the same bits and the same + * or lower mode and don't convert if any. + */ + switch (lock->l_req_mode) { + case LCK_PR: + mode = LCK_PR; + fallthrough; + case LCK_PW: + mode |= LCK_CR; + break; + case LCK_CW: + mode = LCK_CW; + fallthrough; + case LCK_CR: + mode |= LCK_CR; + break; + default: + /* do not convert other modes */ + return 0; + } + + /* is lock is too old to be converted? */ + lock_res_and_lock(lock); + if (ktime_after(ktime_get(), + ktime_add(lock->l_last_used, + ktime_set(ns->ns_dirty_age_limit, 0)))) { + unlock_res_and_lock(lock); + return 0; + } + unlock_res_and_lock(lock); + + inode = ll_inode_from_resource_lock(lock); + ll_have_md_lock(inode, &bits, mode); + iput(inode); + return !!(bits); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *ld, + void *data, int flag) +{ + struct lustre_handle lockh; + int rc; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + { + __u64 cancel_flags = LCF_ASYNC; + + /* if lock convert is not needed then still have to + * pass lock via ldlm_cli_convert() to keep all states + * correct, set cancel_bits to full lock bits to cause + * full cancel to happen. 
+ */ + if (!ll_md_need_convert(lock)) { + lock_res_and_lock(lock); + lock->l_policy_data.l_inodebits.cancel_bits = + lock->l_policy_data.l_inodebits.bits; + unlock_res_and_lock(lock); + } + rc = ldlm_cli_convert(lock, cancel_flags); + if (!rc) + RETURN(0); + /* continue with cancel otherwise */ + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, cancel_flags); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); + RETURN(rc); + } + break; + } + case LDLM_CB_CANCELING: + { + __u64 to_cancel = lock->l_policy_data.l_inodebits.bits; + + /* Nothing to do for non-granted locks */ + if (!ldlm_is_granted(lock)) + break; + + /* If 'ld' is supplied then bits to be cancelled are passed + * implicitly by lock converting and cancel_bits from 'ld' + * should be used. Otherwise full cancel is being performed + * and lock inodebits are used. + * + * Note: we cannot rely on cancel_bits in lock itself at this + * moment because they can be changed by concurrent thread, + * so ldlm_cli_inodebits_convert() pass cancel bits implicitly + * in 'ld' parameter. + */ + if (ld) { + /* partial bits cancel allowed only during convert */ + LASSERT(ldlm_is_converting(lock)); + /* mask cancel bits by lock bits so only no any unused + * bits are passed to ll_lock_cancel_bits() + */ + to_cancel &= ld->l_policy_data.l_inodebits.cancel_bits; + } + ll_lock_cancel_bits(lock, to_cancel); break; } default: @@ -474,7 +622,8 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) static int ll_lookup_it_finish(struct ptlrpc_request *request, struct lookup_intent *it, - struct inode *parent, struct dentry **de) + struct inode *parent, struct dentry **de, + void *secctx, __u32 secctxlen, ktime_t kstart) { struct inode *inode = NULL; __u64 bits = 0; @@ -487,20 +636,58 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, it->it_disposition); if (!it_disposition(it, DISP_LOOKUP_NEG)) { - rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); - if (rc) - RETURN(rc); - - ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); - - /* We used to query real size from OSTs here, but actually - this is not needed. For stat() calls size would be updated - from subsequent do_revalidate()->ll_inode_revalidate_it() in - 2.4 and - vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 - Everybody else who needs correct file size would call - ll_glimpse_size or some equivalent themselves anyway. - Also see bug 7198. */ + struct req_capsule *pill = &request->rq_pill; + struct mdt_body *body = req_capsule_server_get(pill, + &RMF_MDT_BODY); + + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + /* OPEN can return data if lock has DoM+LAYOUT bits set */ + if (it->it_op & IT_OPEN && + bits & MDS_INODELOCK_DOM && bits & MDS_INODELOCK_LAYOUT) + ll_dom_finish_open(inode, request); + + /* We used to query real size from OSTs here, but actually + * this is not needed. For stat() calls size would be updated + * from subsequent do_revalidate()->ll_inode_revalidate_it() in + * 2.4 and + * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + * Everybody else who needs correct file size would call + * ll_glimpse_size or some equivalent themselves anyway. + * Also see bug 7198. + */ + + /* If security context was returned by MDT, put it in + * inode now to save an extra getxattr from security hooks, + * and avoid deadlock. 
+ */ + if (body->mbo_valid & OBD_MD_SECCTX) { + secctx = req_capsule_server_get(pill, &RMF_FILE_SECCTX); + secctxlen = req_capsule_get_size(pill, + &RMF_FILE_SECCTX, + RCL_SERVER); + + if (secctxlen) + CDEBUG(D_SEC, "server returned security context" + " for "DFID"\n", + PFID(ll_inode2fid(inode))); + } + + if (secctx && secctxlen) { + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ + rc = security_inode_notifysecctx(inode, secctx, + secctxlen); + if (rc) + CWARN("cannot set security context for " + DFID": rc = %d\n", + PFID(ll_inode2fid(inode)), rc); + } } /* Only hash *de if it is unhashed (new dentry). @@ -517,9 +704,9 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, if (bits & MDS_INODELOCK_LOOKUP) d_lustre_revalidate(*de); } else if (!it_disposition(it, DISP_OPEN_CREATE)) { - /* If file created on server, don't depend on parent UPDATE - * lock to unhide it. It is left hidden and next lookup can - * find it in ll_splice_alias. + /* + * If file was created on the server, the dentry is revalidated + * in ll_create_it if the lock allows for it. */ /* Check that parent has UPDATE lock. */ struct lookup_intent parent_it = { @@ -544,11 +731,18 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, } } + if (it_disposition(it, DISP_OPEN_CREATE)) { + ll_stats_ops_tally(ll_i2sbi(parent), LPROC_LL_MKNOD, + ktime_us_delta(ktime_get(), kstart)); + } + GOTO(out, rc = 0); out: - if (rc != 0 && it->it_op & IT_OPEN) + if (rc != 0 && it->it_op & IT_OPEN) { + ll_intent_drop_lock(it); ll_open_cleanup((*de)->d_sb, request); + } return rc; } @@ -557,13 +751,16 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, struct lookup_intent *it, void **secctx, __u32 *secctxlen) { + ktime_t kstart = ktime_get(); struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct dentry *save = dentry, *retval; struct ptlrpc_request *req = NULL; struct md_op_data *op_data = NULL; - __u32 opc; - int rc; - ENTRY; + __u32 opc; + int rc; + char secctx_name[XATTR_NAME_MAX + 1]; + + ENTRY; if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) RETURN(ERR_PTR(-ENAMETOOLONG)); @@ -611,10 +808,32 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, &op_data->op_file_secctx_size); if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); - if (secctx != NULL) + if (secctx) *secctx = op_data->op_file_secctx; - if (secctxlen != NULL) + if (secctxlen) *secctxlen = op_data->op_file_secctx_size; + } else { + if (secctx) + *secctx = NULL; + if (secctxlen) + *secctxlen = 0; + } + + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN)) { + /* get name of security xattr to request to server */ + rc = ll_listsecurity(parent, secctx_name, + sizeof(secctx_name)); + if (rc < 0) { + CDEBUG(D_SEC, "cannot get security xattr name for " + DFID": rc = %d\n", + PFID(ll_inode2fid(parent)), rc); + } else if (rc > 0) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = rc; + CDEBUG(D_SEC, "'%.*s' is security xattr for "DFID"\n", + rc, secctx_name, PFID(ll_inode2fid(parent))); + } } rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, @@ -648,11 +867,15 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); - rc = ll_lookup_it_finish(req, it, parent, &dentry); - if (rc != 0) { - ll_intent_release(it); - GOTO(out, retval = ERR_PTR(rc)); - } + /* dir 
layout may change */ + ll_unlock_md_op_lsm(op_data); + rc = ll_lookup_it_finish(req, it, parent, &dentry, + secctx ? *secctx : NULL, + secctxlen ? *secctxlen : 0, kstart); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } if ((it->it_op & IT_OPEN) && dentry->d_inode && !S_ISREG(dentry->d_inode->i_mode) && @@ -665,7 +888,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, out: if (op_data != NULL && !IS_ERR(op_data)) { - if (secctx != NULL && secctxlen != NULL) { + if (secctx && secctxlen) { /* caller needs sec ctx info, so reset it in op_data to * prevent it from being freed */ op_data->op_file_secctx = NULL; @@ -991,6 +1214,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, void *secctx, __u32 secctxlen) { struct inode *inode; + __u64 bits = 0; int rc = 0; ENTRY; @@ -1006,14 +1230,15 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) RETURN(PTR_ERR(inode)); - if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && - secctx != NULL) { - inode_lock(inode); + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX) && secctx) { /* must be done before d_instantiate, because it calls * security_d_instantiate, which means a getxattr if security * context is not set yet */ + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. Taking it would lead to a client deadlock + * LU-13617 + */ rc = security_inode_notifysecctx(inode, secctx, secctxlen); - inode_unlock(inode); if (rc) RETURN(rc); } @@ -1026,6 +1251,10 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, RETURN(rc); } + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(dentry); + RETURN(0); } @@ -1129,14 +1358,16 @@ static int ll_new_node(struct inode *dir, struct dentry *dchild, GOTO(err_exit, err); if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { - inode_lock(inode); /* must be done before d_instantiate, because it calls * security_d_instantiate, which means a getxattr if security * context is not set yet */ + /* no need to protect selinux_inode_setsecurity() by + * inode_lock. 
Taking it would lead to a client deadlock + * LU-13617 + */ err = security_inode_notifysecctx(inode, op_data->op_file_secctx, op_data->op_file_secctx_size); - inode_unlock(inode); if (err) GOTO(err_exit, err); } @@ -1161,42 +1392,42 @@ static int ll_new_node(struct inode *dir, struct dentry *dchild, } static int ll_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dchild, umode_t mode, dev_t rdev) + struct dentry *dchild, ll_umode_t mode, dev_t rdev) { struct qstr *name = &dchild->d_name; int err; - ENTRY; + ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p) mode %o dev %x\n", name->len, name->name, PFID(ll_inode2fid(dir)), dir, - mode, rdev); + mode, rdev); if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) mode &= ~current_umask(); - switch (mode & S_IFMT) { - case 0: - mode |= S_IFREG; /* for mode = 0 case, fallthrough */ - /* Fall through */ - case S_IFREG: - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; + fallthrough; + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: err = ll_new_node(dir, dchild, NULL, mode, old_encode_dev(rdev), LUSTRE_OPC_MKNOD); - break; - case S_IFDIR: - err = -EPERM; - break; - default: - err = -EINVAL; - } + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); - RETURN(err); + RETURN(err); } #ifdef HAVE_IOP_ATOMIC_OPEN @@ -1333,7 +1564,7 @@ static int ll_link(struct dentry *old_dentry, struct inode *dir, } static int ll_mkdir(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dchild, umode_t mode) + struct dentry *dchild, ll_umode_t mode) { struct qstr *name = &dchild->d_name; int err; diff --git a/drivers/staging/lustrefsx/lustre/llite/range_lock.c b/drivers/staging/lustrefsx/lustre/llite/range_lock.c index 56e129165c4be..7a4c9c4cb766a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/range_lock.c +++ b/drivers/staging/lustrefsx/lustre/llite/range_lock.c @@ -33,8 +33,11 @@ * Author: Prakash Surya * Author: Bobi Jam */ +#ifdef HAVE_SCHED_HEADERS +#include +#endif #include "range_lock.h" -#include +#include /** * Initialize a range lock tree diff --git a/drivers/staging/lustrefsx/lustre/llite/rw.c b/drivers/staging/lustrefsx/lustre/llite/rw.c index a00ccef398702..a5f3f9c187d57 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -369,7 +369,7 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, io->ci_obj, ra.cra_end, page_idx); /* update read ahead RPC size. 
* NB: it's racy but doesn't matter */ - if (ras->ras_rpc_size > ra.cra_rpc_size && + if (ras->ras_rpc_size != ra.cra_rpc_size && ra.cra_rpc_size > 0) ras->ras_rpc_size = ra.cra_rpc_size; /* trim it to align with optimal RPC size */ @@ -714,7 +714,10 @@ static void ras_increase_window(struct inode *inode, wlen = min(ras->ras_window_len + ras->ras_rpc_size, ra->ra_max_pages_per_file); - ras->ras_window_len = ras_align(ras, wlen, NULL); + if (wlen < ras->ras_rpc_size) + ras->ras_window_len = wlen; + else + ras->ras_window_len = ras_align(ras, wlen, NULL); } } @@ -1074,7 +1077,7 @@ void ll_cl_remove(struct file *file, const struct lu_env *env) write_unlock(&fd->fd_lock); } -static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, +int ll_io_read_page(const struct lu_env *env, struct cl_io *io, struct cl_page *page, struct file *file) { struct inode *inode = vvp_object_inode(page->cp_obj); @@ -1082,6 +1085,7 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_readahead_state *ras = &fd->fd_ras; struct cl_2queue *queue = &io->ci_queue; + struct cl_sync_io *anchor = NULL; struct vvp_page *vpg; int rc = 0; bool uptodate; @@ -1109,6 +1113,10 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, cl_page_export(env, page, 1); cl_page_disown(env, io, page); } else { + anchor = &vvp_env_info(env)->vti_anchor; + cl_sync_io_init(anchor, 1, &cl_sync_io_end); + page->cp_sync_io = anchor; + cl_2queue_add(queue, page); } @@ -1129,10 +1137,30 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, task_io_account_read(PAGE_SIZE * count); } - /* - * Unlock unsent pages in case of error. - */ + + if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */ + rc = cl_sync_io_wait(env, anchor, 0); + + cl_page_assume(env, io, page); + cl_page_list_del(env, &queue->c2_qout, page); + + if (!PageUptodate(cl_page_vmpage(page))) { + /* Failed to read a mirror, discard this page so that + * new page can be created with new mirror. + * + * TODO: this is not needed after page reinit + * route is implemented */ + cl_page_discard(env, io, page); + } + cl_page_disown(env, io, page); + } + + /* TODO: discard all pages until page reinit route is implemented */ + cl_page_list_discard(env, io, &queue->c2_qin); + + /* Unlock unsent read pages in case of error. 
*/ cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); RETURN(rc); @@ -1143,24 +1171,25 @@ int ll_readpage(struct file *file, struct page *vmpage) struct inode *inode = file_inode(file); struct cl_object *clob = ll_i2info(inode)->lli_clob; struct ll_cl_context *lcc; - const struct lu_env *env; - struct cl_io *io; + const struct lu_env *env = NULL; + struct cl_io *io = NULL; struct cl_page *page; int result; ENTRY; lcc = ll_cl_find(file); - if (lcc == NULL) { - unlock_page(vmpage); - RETURN(-EIO); + if (lcc != NULL) { + env = lcc->lcc_env; + io = lcc->lcc_io; } - env = lcc->lcc_env; - io = lcc->lcc_io; if (io == NULL) { /* fast read */ struct inode *inode = file_inode(file); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_readahead_state *ras = &fd->fd_ras; + struct lu_env *local_env = NULL; + unsigned long fast_read_pages = + max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_size); struct vvp_page *vpg; result = -ENODATA; @@ -1173,11 +1202,16 @@ int ll_readpage(struct file *file, struct page *vmpage) RETURN(result); } + if (!env) { + local_env = cl_env_percpu_get(); + env = local_env; + } + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); if (vpg->vpg_defer_uptodate) { enum ras_update_flags flags = LL_RAS_HIT; - if (lcc->lcc_type == LCC_MMAP) + if (lcc && lcc->lcc_type == LCC_MMAP) flags |= LL_RAS_MMAP; /* For fast read, it updates read ahead state only @@ -1192,7 +1226,7 @@ int ll_readpage(struct file *file, struct page *vmpage) * the case, we can't do fast IO because we will need * a cl_io to issue the RPC. */ if (ras->ras_window_start + ras->ras_window_len < - ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES) { + ras->ras_next_readahead + fast_read_pages) { /* export the page and skip io stack */ vpg->vpg_ra_used = 1; cl_page_export(env, page, 1); @@ -1200,8 +1234,14 @@ int ll_readpage(struct file *file, struct page *vmpage) } } - unlock_page(vmpage); + /* release page refcount before unlocking the page to ensure + * the object won't be destroyed in the calling path of + * cl_page_put(). Please see comment in ll_releasepage(). */ cl_page_put(env, page); + unlock_page(vmpage); + if (local_env) + cl_env_percpu_put(local_env); + RETURN(result); } @@ -1211,6 +1251,7 @@ int ll_readpage(struct file *file, struct page *vmpage) LASSERT(page->cp_type == CPT_CACHEABLE); if (likely(!PageUptodate(vmpage))) { cl_page_assume(env, io, page); + result = ll_io_read_page(env, io, page, file); } else { /* Page from a non-object file. */ @@ -1224,28 +1265,3 @@ int ll_readpage(struct file *file, struct page *vmpage) } RETURN(result); } - -int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, enum cl_req_type crt) -{ - struct cl_2queue *queue; - int result; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - queue = &io->ci_queue; - cl_2queue_init_page(queue, page); - - result = cl_io_submit_sync(env, io, crt, queue, 0); - LASSERT(cl_page_is_owned(page, io)); - - if (crt == CRT_READ) - /* - * in CRT_WRITE case page is left locked even in case of - * error. - */ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); - - return result; -} diff --git a/drivers/staging/lustrefsx/lustre/llite/rw26.c b/drivers/staging/lustrefsx/lustre/llite/rw26.c index 9cba2d0b5e8e3..9a1f0b6021baf 100644 --- a/drivers/staging/lustrefsx/lustre/llite/rw26.c +++ b/drivers/staging/lustrefsx/lustre/llite/rw26.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
* Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -577,45 +577,83 @@ ll_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, /** * Prepare partially written-to page for a write. + * @pg is owned when passed in and disowned when it returns non-zero result to + * the caller. */ static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg) + struct cl_page *pg, struct file *file) { struct cl_attr *attr = vvp_env_thread_attr(env); struct cl_object *obj = io->ci_obj; struct vvp_page *vpg = cl_object_page_slice(obj, pg); loff_t offset = cl_offset(obj, vvp_index(vpg)); int result; + ENTRY; cl_object_attr_lock(obj); result = cl_object_attr_get(env, obj, attr); cl_object_attr_unlock(obj); - if (result == 0) { - /* - * If are writing to a new page, no need to read old data. - * The extent locking will have updated the KMS, and for our - * purposes here we can treat it like i_size. - */ - if (attr->cat_kms <= offset) { - char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); + if (result) { + cl_page_disown(env, io, pg); + GOTO(out, result); + } - memset(kaddr, 0, cl_page_size(obj)); - ll_kunmap_atomic(kaddr, KM_USER0); - } else if (vpg->vpg_defer_uptodate) - vpg->vpg_ra_used = 1; - else - result = ll_page_sync_io(env, io, pg, CRT_READ); + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0); + + memset(kaddr, 0, cl_page_size(obj)); + ll_kunmap_atomic(kaddr, KM_USER0); + GOTO(out, result = 0); + } + + if (vpg->vpg_defer_uptodate) { + vpg->vpg_ra_used = 1; + GOTO(out, result = 0); } + + result = ll_io_read_page(env, io, pg, file); + if (result) + GOTO(out, result); + + /* ll_io_read_page() disowns the page */ + result = cl_page_own(env, io, pg); + if (!result) { + if (!PageUptodate(cl_page_vmpage(pg))) { + cl_page_disown(env, io, pg); + result = -EIO; + } + } else if (result == -ENOENT) { + /* page was truncated */ + result = -EAGAIN; + } + EXIT; + +out: return result; } +static int ll_tiny_write_begin(struct page *vmpage) +{ + /* Page must be present, up to date, dirty, and not in writeback. 
*/ + if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || + PageWriteback(vmpage)) + return -ENODATA; + + return 0; +} + static int ll_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct ll_cl_context *lcc; + struct ll_cl_context *lcc = NULL; const struct lu_env *env = NULL; - struct cl_io *io; + struct cl_io *io = NULL; struct cl_page *page = NULL; struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; @@ -626,17 +664,27 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, int result = 0; ENTRY; - CDEBUG(D_PAGE, "Writing %lu of %d to %d bytes\n", index, from, len); + CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); lcc = ll_cl_find(file); if (lcc == NULL) { - io = NULL; - GOTO(out, result = -EIO); + vmpage = grab_cache_page_nowait(mapping, index); + result = ll_tiny_write_begin(vmpage); + GOTO(out, result); } env = lcc->lcc_env; io = lcc->lcc_io; + if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) { + /* direct IO failed because it couldn't clean up cached pages, + * this causes a problem for mirror write because the cached + * page may belong to another mirror, which will result in + * problem submitting the I/O. */ + GOTO(out, result = -EBUSY); + } + +again: /* To avoid deadlock, try to lock page first. */ vmpage = grab_cache_page_nowait(mapping, index); @@ -689,13 +737,18 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, /* TODO: can be optimized at OSC layer to check if it * is a lockless IO. In that case, it's not necessary * to read the data. */ - result = ll_prepare_partial_page(env, io, page); - if (result == 0) - SetPageUptodate(vmpage); + result = ll_prepare_partial_page(env, io, page, file); + if (result) { + /* vmpage should have been unlocked */ + put_page(vmpage); + vmpage = NULL; + + if (result == -EAGAIN) + goto again; + GOTO(out, result); + } } } - if (result < 0) - cl_page_unassume(env, io, page); EXIT; out: if (result < 0) { @@ -703,6 +756,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, unlock_page(vmpage); put_page(vmpage); } + /* On tiny_write failure, page and io are always null. */ if (!IS_ERR_OR_NULL(page)) { lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); @@ -716,6 +770,47 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, RETURN(result); } +static int ll_tiny_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *vmpage) +{ + struct cl_page *clpage = (struct cl_page *) vmpage->private; + loff_t kms = pos+copied; + loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; + __u16 refcheck; + struct lu_env *env = cl_env_get(&refcheck); + int rc = 0; + + ENTRY; + + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + /* This page is dirty in cache, so it should have a cl_page pointer + * set in vmpage->private. + */ + LASSERT(clpage != NULL); + + if (copied == 0) + goto out_env; + + /* Update the underlying size information in the OSC/LOV objects this + * page is part of. + */ + cl_page_touch(env, clpage, to); + +out_env: + cl_env_put(env, &refcheck); + +out: + /* Must return page unlocked. 
*/ + unlock_page(vmpage); + + RETURN(rc); +} + static int ll_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *vmpage, void *fsdata) @@ -732,6 +827,14 @@ static int ll_write_end(struct file *file, struct address_space *mapping, put_page(vmpage); + CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); + + if (lcc == NULL) { + result = ll_tiny_write_end(file, mapping, pos, len, copied, + vmpage); + GOTO(out, result); + } + LASSERT(lcc != NULL); env = lcc->lcc_env; page = lcc->lcc_page; @@ -761,7 +864,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping, if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) unplug = true; - CL_PAGE_DEBUG(D_PAGE, env, page, + CL_PAGE_DEBUG(D_VFSTRACE, env, page, "queued page: %d.\n", plist->pl_nr); } else { cl_page_disown(env, io, page); @@ -773,11 +876,14 @@ static int ll_write_end(struct file *file, struct address_space *mapping, /* page list is not contiguous now, commit it now */ unplug = true; } - if (unplug || io->u.ci_rw.rw_sync) + if (unplug || io->u.ci_wr.wr_sync) result = vvp_io_write_commit(env, io); if (result < 0) io->ci_result = result; + + +out: RETURN(result >= 0 ? copied : result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c index 5b2af025d28f9..397712909b3f4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/statahead.c +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -330,6 +330,58 @@ __sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) return (index == sai->sai_index_wait); } +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo) +{ + ll_unlock_md_op_lsm(&minfo->mi_data); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc); + +/* + * prepare arguments for async stat RPC. + */ +static struct md_enqueue_info * +sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(minfo); + if (minfo == NULL) + return ERR_PTR(-ENOMEM); + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, + entry->se_qstr.name, entry->se_qstr.len, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(minfo); + return (struct md_enqueue_info *)op_data; + } + + if (child == NULL) + op_data->op_fid2 = entry->se_fid; + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_cbdata = entry; + + einfo = &minfo->mi_einfo; + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + return minfo; +} + /* * release resources used in async stat RPC, update entry state and wakeup if * scanner process it waiting on this entry. 
@@ -346,8 +398,7 @@ sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) if (minfo) { entry->se_minfo = NULL; ll_intent_release(&minfo->mi_it); - iput(minfo->mi_dir); - OBD_FREE_PTR(minfo); + sa_fini_data(minfo); } if (req) { @@ -493,10 +544,11 @@ static void ll_sai_put(struct ll_statahead_info *sai) static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) { struct ll_inode_info *lli = ll_i2info(inode); - __u64 index = lli->lli_agl_index; + u64 index = lli->lli_agl_index; + ktime_t expire; int rc; - ENTRY; + ENTRY; LASSERT(list_empty(&lli->lli_agl_list)); /* AGL maybe fall behind statahead with one entry */ @@ -539,8 +591,9 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) * relative rare. AGL can ignore such case, and it will not muchly * affect the performance. */ - if (lli->lli_glimpse_time != 0 && - cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { + expire = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); + if (ktime_to_ns(lli->lli_glimpse_time) && + ktime_before(expire, lli->lli_glimpse_time)) { up_write(&lli->lli_glimpse_sem); lli->lli_agl_index = 0; iput(inode); @@ -552,7 +605,7 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) cl_agl(inode); lli->lli_agl_index = 0; - lli->lli_glimpse_time = cfs_time_current(); + lli->lli_glimpse_time = ktime_get(); up_write(&lli->lli_glimpse_sem); CDEBUG(D_READA, "Handled (init) async glimpse: inode= " @@ -580,14 +633,14 @@ static void sa_instantiate(struct ll_statahead_info *sai, int rc = 0; ENTRY; - LASSERT(entry->se_handle != 0); + LASSERT(entry->se_handle != 0); - minfo = entry->se_minfo; - it = &minfo->mi_it; - req = entry->se_req; - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (body == NULL) - GOTO(out, rc = -EFAULT); + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); child = entry->se_inode; if (child != NULL) { @@ -602,25 +655,25 @@ static void sa_instantiate(struct ll_statahead_info *sai, it->it_lock_handle = entry->se_handle; rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); - if (rc != 1) - GOTO(out, rc = -EAGAIN); + if (rc != 1) + GOTO(out, rc = -EAGAIN); - rc = ll_prep_inode(&child, req, dir->i_sb, it); - if (rc) - GOTO(out, rc); + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + GOTO(out, rc); CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", ll_get_fsname(child->i_sb, NULL, 0), entry->se_qstr.len, entry->se_qstr.name, PFID(ll_inode2fid(child)), child); - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); - entry->se_inode = child; + entry->se_inode = child; - if (agl_should_run(sai, child)) - ll_agl_add(sai, child, entry->se_index); + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); - EXIT; + EXIT; out: /* sa_make_ready() will drop ldlm ibits lock refcount by calling @@ -684,8 +737,7 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, if (rc != 0) { ll_intent_release(it); - iput(dir); - OBD_FREE_PTR(minfo); + sa_fini_data(minfo); } else { /* release ibits lock ASAP to avoid deadlock when statahead * thread enqueues lock on parent in readdir and another @@ -693,6 +745,7 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, * unlink. 
*/ handle = it->it_lock_handle; ll_intent_drop_lock(it); + ll_unlock_md_op_lsm(&minfo->mi_data); } spin_lock(&lli->lli_sa_lock); @@ -722,53 +775,6 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, RETURN(rc); } -/* finish async stat RPC arguments */ -static void sa_fini_data(struct md_enqueue_info *minfo) -{ - iput(minfo->mi_dir); - OBD_FREE_PTR(minfo); -} - -/* - * prepare arguments for async stat RPC. - */ -static struct md_enqueue_info * -sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - struct ldlm_enqueue_info *einfo; - struct md_op_data *op_data; - - OBD_ALLOC_PTR(minfo); - if (minfo == NULL) - return ERR_PTR(-ENOMEM); - - op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - OBD_FREE_PTR(minfo); - return (struct md_enqueue_info *)op_data; - } - - if (child == NULL) - op_data->op_fid2 = entry->se_fid; - - minfo->mi_it.it_op = IT_GETATTR; - minfo->mi_dir = igrab(dir); - minfo->mi_cb = ll_statahead_interpret; - minfo->mi_cbdata = entry; - - einfo = &minfo->mi_einfo; - einfo->ei_type = LDLM_IBITS; - einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); - einfo->ei_cb_bl = ll_md_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = NULL; - einfo->ei_cbdata = NULL; - - return minfo; -} - /* async stat for file not found in dcache */ static int sa_lookup(struct inode *dir, struct sa_entry *entry) { @@ -810,22 +816,20 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry, if (d_mountpoint(dentry)) RETURN(1); + minfo = sa_prep_data(dir, inode, entry); + if (IS_ERR(minfo)) + RETURN(PTR_ERR(minfo)); + entry->se_inode = igrab(inode); rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), NULL); if (rc == 1) { entry->se_handle = it.it_lock_handle; ll_intent_release(&it); + sa_fini_data(minfo); RETURN(1); } - minfo = sa_prep_data(dir, inode, entry); - if (IS_ERR(minfo)) { - entry->se_inode = NULL; - iput(inode); - RETURN(PTR_ERR(minfo)); - } - rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); if (rc < 0) { entry->se_inode = NULL; @@ -922,6 +926,7 @@ static int ll_agl_thread(void *arg) list_del_init(&clli->lli_agl_list); spin_unlock(&plli->lli_agl_lock); ll_agl_trigger(&clli->lli_vfs_inode, sai); + cond_resched(); } else { spin_unlock(&plli->lli_agl_lock); } @@ -999,8 +1004,7 @@ static int ll_statahead_thread(void *arg) CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", sai, parent->d_name.len, parent->d_name.name); - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); + OBD_ALLOC_PTR(op_data); if (IS_ERR(op_data)) GOTO(out, rc = PTR_ERR(op_data)); @@ -1022,8 +1026,16 @@ static int ll_statahead_thread(void *arg) struct lu_dirpage *dp; struct lu_dirent *ent; + op_data = ll_prep_md_op_data(op_data, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + break; + } + sai->sai_in_readpage = 1; page = ll_get_dir_page(dir, op_data, pos, &chain); + ll_unlock_md_op_lsm(op_data); sai->sai_in_readpage = 0; if (IS_ERR(page)) { rc = PTR_ERR(page); @@ -1109,7 +1121,7 @@ static int ll_statahead_thread(void *arg) ll_agl_trigger(&clli->lli_vfs_inode, sai); - + cond_resched(); spin_lock(&lli->lli_agl_lock); } spin_unlock(&lli->lli_agl_lock); @@ -1598,7 +1610,6 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry) spin_lock(&lli->lli_sa_lock); lli->lli_sai = NULL; spin_unlock(&lli->lli_sa_lock); - 
atomic_dec(&ll_i2sbi(parent->d_inode)->ll_sa_running); rc = PTR_ERR(task); CERROR("can't start ll_sa thread, rc: %d\n", rc); GOTO(out, rc); diff --git a/drivers/staging/lustrefsx/lustre/llite/super25.c b/drivers/staging/lustrefsx/lustre/llite/super25.c index 7118cce98561b..8fbbea24c9ce2 100644 --- a/drivers/staging/lustrefsx/lustre/llite/super25.c +++ b/drivers/staging/lustrefsx/lustre/llite/super25.c @@ -77,11 +77,22 @@ static void ll_destroy_inode(struct inode *inode) } #endif +static int ll_drop_inode(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (!sbi->ll_inode_cache_enabled) + return 1; + + return generic_drop_inode(inode); +} + /* exported operations */ struct super_operations lustre_super_operations = { .alloc_inode = ll_alloc_inode, .destroy_inode = ll_destroy_inode, + .drop_inode = ll_drop_inode, #ifdef HAVE_SBOPS_EVICT_INODE .evict_inode = ll_delete_inode, #else @@ -95,12 +106,8 @@ struct super_operations lustre_super_operations = .show_options = ll_show_options, }; - -void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); - static int __init lustre_init(void) { - struct proc_dir_entry *entry; struct lnet_process_id lnet_id; struct timespec64 ts; int i, rc, seed[2]; @@ -132,15 +139,9 @@ static int __init lustre_init(void) if (ll_file_data_slab == NULL) GOTO(out_cache, rc = -ENOMEM); - entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n", - rc); + rc = llite_tunables_register(); + if (rc) GOTO(out_cache, rc); - } - - proc_lustre_fs_root = entry; cfs_get_random_bytes(seed, sizeof(seed)); @@ -150,7 +151,7 @@ static int __init lustre_init(void) if (LNetGetId(i, &lnet_id) == -ENOENT) break; - if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) + if (lnet_id.nid != LNET_NID_LO_0) seed[0] ^= LNET_NIDADDR(lnet_id.nid); } @@ -159,7 +160,7 @@ static int __init lustre_init(void) rc = vvp_global_init(); if (rc != 0) - GOTO(out_proc, rc); + GOTO(out_tunables, rc); cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, LCT_REMEMBER | LCT_NOREF); @@ -174,7 +175,6 @@ static int __init lustre_init(void) lustre_register_client_fill_super(ll_fill_super); lustre_register_kill_super_cb(ll_kill_super); - lustre_register_client_process_config(ll_process_config); RETURN(0); @@ -182,15 +182,11 @@ static int __init lustre_init(void) cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); out_vvp: vvp_global_fini(); -out_proc: - lprocfs_remove(&proc_lustre_fs_root); +out_tunables: + llite_tunables_unregister(); out_cache: - if (ll_inode_cachep != NULL) - kmem_cache_destroy(ll_inode_cachep); - - if (ll_file_data_slab != NULL) - kmem_cache_destroy(ll_file_data_slab); - + kmem_cache_destroy(ll_inode_cachep); + kmem_cache_destroy(ll_file_data_slab); return rc; } @@ -198,14 +194,20 @@ static void __exit lustre_exit(void) { lustre_register_client_fill_super(NULL); lustre_register_kill_super_cb(NULL); - lustre_register_client_process_config(NULL); - lprocfs_remove(&proc_lustre_fs_root); + llite_tunables_unregister(); ll_xattr_fini(); cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); vvp_global_fini(); +#ifdef HAVE_INODE_I_RCU + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. 
+ */ + rcu_barrier(); +#endif kmem_cache_destroy(ll_inode_cachep); kmem_cache_destroy(ll_file_data_slab); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c index 2f640635afea2..d36aed3919268 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_dev.c @@ -53,7 +53,6 @@ */ static struct kmem_cache *ll_thread_kmem; -struct kmem_cache *vvp_lock_kmem; struct kmem_cache *vvp_object_kmem; static struct kmem_cache *vvp_session_kmem; static struct kmem_cache *vvp_thread_kmem; @@ -64,11 +63,6 @@ static struct lu_kmem_descr vvp_caches[] = { .ckd_name = "ll_thread_kmem", .ckd_size = sizeof(struct ll_thread_info), }, - { - .ckd_cache = &vvp_lock_kmem, - .ckd_name = "vvp_lock_kmem", - .ckd_size = sizeof(struct vvp_lock), - }, { .ckd_cache = &vvp_object_kmem, .ckd_name = "vvp_object_kmem", @@ -361,26 +355,10 @@ int cl_sb_fini(struct super_block *sb) /**************************************************************************** * - * /proc/fs/lustre/llite/$MNT/dump_page_cache + * debugfs/lustre/llite/$MNT/dump_page_cache * ****************************************************************************/ -/* - * To represent contents of a page cache as a byte stream, following - * information if encoded in 64bit offset: - * - * - file hash bucket in lu_site::ls_hash[] 28bits - * - * - how far file is from bucket head 4bits - * - * - page index 32bits - * - * First two data identify a file in the cache uniquely. - */ - -#define PGC_OBJ_SHIFT (32 + 4) -#define PGC_DEPTH_SHIFT (32) - struct vvp_pgcache_id { unsigned vpi_bucket; unsigned vpi_depth; @@ -390,22 +368,18 @@ struct vvp_pgcache_id { struct lu_object_header *vpi_obj; }; -static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) -{ - CLASSERT(sizeof(pos) == sizeof(__u64)); - - id->vpi_index = pos & 0xffffffff; - id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; - id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT); -} - -static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) -{ - return - ((__u64)id->vpi_index) | - ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | - ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); -} +struct vvp_seq_private { + struct ll_sb_info *vsp_sbi; + struct lu_env *vsp_env; + u16 vsp_refcheck; + struct cl_object *vsp_clob; + struct vvp_pgcache_id vvp_id; + /* + * prev_pos is the 'pos' of the last object returned + * by ->start of ->next. 
+ */ + loff_t vvp_prev_pos; +}; static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *data) @@ -413,12 +387,12 @@ static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct vvp_pgcache_id *id = data; struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + if (lu_object_is_dying(hdr)) + return 0; + if (id->vpi_curdep-- > 0) return 0; /* continue */ - if (lu_object_is_dying(hdr)) - return 1; - cfs_hash_get(hs, hnode); id->vpi_obj = hdr; return 1; @@ -430,8 +404,7 @@ static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, { LASSERT(lu_device_is_cl(dev)); - id->vpi_depth &= 0xf; - id->vpi_obj = NULL; + id->vpi_obj = NULL; id->vpi_curdep = id->vpi_depth; cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, @@ -445,52 +418,42 @@ static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, return lu2cl(lu_obj); } lu_object_put(env, lu_object_top(id->vpi_obj)); - - } else if (id->vpi_curdep > 0) { - id->vpi_depth = 0xf; } return NULL; } -static loff_t vvp_pgcache_find(const struct lu_env *env, - struct lu_device *dev, loff_t pos) +static struct page *vvp_pgcache_current(struct vvp_seq_private *priv) { - struct cl_object *clob; - struct lu_site *site; - struct vvp_pgcache_id id; - - site = dev->ld_site; - vvp_pgcache_id_unpack(pos, &id); + struct lu_device *dev = &priv->vsp_sbi->ll_cl->cd_lu_dev; while (1) { - if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) - return ~0ULL; - clob = vvp_pgcache_obj(env, dev, &id); - if (clob != NULL) { - struct inode *inode = vvp_object_inode(clob); - struct page *vmpage; - int nr; - - nr = find_get_pages_contig(inode->i_mapping, - id.vpi_index, 1, &vmpage); - if (nr > 0) { - id.vpi_index = vmpage->index; - /* Cant support over 16T file */ - nr = !(vmpage->index > 0xffffffff); - put_page(vmpage); - } - - lu_object_ref_del(&clob->co_lu, "dump", current); - cl_object_put(env, clob); - if (nr > 0) - return vvp_pgcache_id_pack(&id); + struct inode *inode; + struct page *vmpage; + int nr; + + if (!priv->vsp_clob) { + struct cl_object *clob; + + while ((clob = vvp_pgcache_obj(priv->vsp_env, dev, &priv->vvp_id)) == NULL && + ++(priv->vvp_id.vpi_bucket) < CFS_HASH_NHLIST(dev->ld_site->ls_obj_hash)) + priv->vvp_id.vpi_depth = 0; + if (!clob) + return NULL; + priv->vsp_clob = clob; + priv->vvp_id.vpi_index = 0; + } + + inode = vvp_object_inode(priv->vsp_clob); + nr = find_get_pages_contig(inode->i_mapping, priv->vvp_id.vpi_index, 1, &vmpage); + if (nr > 0) { + priv->vvp_id.vpi_index = vmpage->index; + return vmpage; } - /* to the next object. 
*/ - ++id.vpi_depth; - id.vpi_depth &= 0xf; - if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) - return ~0ULL; - id.vpi_index = 0; + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + priv->vsp_clob = NULL; + priv->vvp_id.vpi_index = 0; + priv->vvp_id.vpi_depth++; } } @@ -532,92 +495,72 @@ static void vvp_pgcache_page_show(const struct lu_env *env, static int vvp_pgcache_show(struct seq_file *f, void *v) { - loff_t pos; - struct ll_sb_info *sbi; - struct cl_object *clob; - struct lu_env *env; - struct vvp_pgcache_id id; - __u16 refcheck; - int result; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - pos = *(loff_t *) v; - vvp_pgcache_id_unpack(pos, &id); - sbi = f->private; - clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); - if (clob != NULL) { - struct inode *inode = vvp_object_inode(clob); - struct cl_page *page = NULL; - struct page *vmpage; - - result = find_get_pages_contig(inode->i_mapping, - id.vpi_index, 1, &vmpage); - if (result > 0) { - lock_page(vmpage); - page = cl_vmpage_page(vmpage, clob); - unlock_page(vmpage); - - put_page(vmpage); - } - - seq_printf(f, "%8x@"DFID": ", id.vpi_index, - PFID(lu_object_fid(&clob->co_lu))); - if (page != NULL) { - vvp_pgcache_page_show(env, f, page); - cl_page_put(env, page); - } else - seq_puts(f, "missing\n"); - lu_object_ref_del(&clob->co_lu, "dump", current); - cl_object_put(env, clob); - } else - seq_printf(f, "%llx missing\n", pos); - cl_env_put(env, &refcheck); - result = 0; - } else - result = PTR_ERR(env); - return result; + struct vvp_seq_private *priv = f->private; + struct page *vmpage = v; + struct cl_page *page; + + seq_printf(f, "%8lx@" DFID ": ", vmpage->index, + PFID(lu_object_fid(&priv->vsp_clob->co_lu))); + lock_page(vmpage); + page = cl_vmpage_page(vmpage, priv->vsp_clob); + unlock_page(vmpage); + put_page(vmpage); + + if (page) { + vvp_pgcache_page_show(priv->vsp_env, f, page); + cl_page_put(priv->vsp_env, page); + } else { + seq_puts(f, "missing\n"); + } + + return 0; } -static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +static void vvp_pgcache_rewind(struct vvp_seq_private *priv) { - struct ll_sb_info *sbi; - struct lu_env *env; - __u16 refcheck; + if (priv->vvp_prev_pos) { + memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); + priv->vvp_prev_pos = 0; + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", + current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + priv->vsp_clob = NULL; + } +} - sbi = f->private; +static struct page *vvp_pgcache_next_page(struct vvp_seq_private *priv) +{ + priv->vvp_id.vpi_index += 1; + return vvp_pgcache_current(priv); +} - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - sbi = f->private; - if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) - pos = ERR_PTR(-EFBIG); - else { - *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, - *pos); - if (*pos == ~0ULL) - pos = NULL; - } - cl_env_put(env, &refcheck); - } - return pos; +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct vvp_seq_private *priv = f->private; + + if (*pos == 0) { + vvp_pgcache_rewind(priv); + } else if (*pos == priv->vvp_prev_pos) { + /* Return the current item */; + } else { + WARN_ON(*pos != priv->vvp_prev_pos + 1); + priv->vvp_id.vpi_index += 1; + } + + priv->vvp_prev_pos = *pos; + return vvp_pgcache_current(priv); } static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) { - struct ll_sb_info *sbi; - struct lu_env *env; - __u16 refcheck; + struct 
vvp_seq_private *priv = f->private; - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - sbi = f->private; - *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); - if (*pos == ~0ULL) - pos = NULL; - cl_env_put(env, &refcheck); - } - return pos; + WARN_ON(*pos != priv->vvp_prev_pos); + *pos += 1; + priv->vvp_prev_pos = *pos; + return vvp_pgcache_next_page(priv); } static void vvp_pgcache_stop(struct seq_file *f, void *v) @@ -634,22 +577,44 @@ static struct seq_operations vvp_pgcache_ops = { static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) { - struct ll_sb_info *sbi = PDE_DATA(inode); - struct seq_file *seq; - int result; - - result = seq_open(filp, &vvp_pgcache_ops); - if (result == 0) { - seq = filp->private_data; - seq->private = sbi; + struct vvp_seq_private *priv; + + priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->vsp_sbi = inode->i_private; + priv->vsp_env = cl_env_get(&priv->vsp_refcheck); + priv->vsp_clob = NULL; + memset(&priv->vvp_id, 0, sizeof(priv->vvp_id)); + if (IS_ERR(priv->vsp_env)) { + int err = PTR_ERR(priv->vsp_env); + + seq_release_private(inode, filp); + return err; } - return result; + + return 0; +} + +static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct vvp_seq_private *priv = seq->private; + + if (priv->vsp_clob) { + lu_object_ref_del(&priv->vsp_clob->co_lu, "dump", current); + cl_object_put(priv->vsp_env, priv->vsp_clob); + } + + cl_env_put(priv->vsp_env, &priv->vsp_refcheck); + return seq_release_private(inode, file); } -const struct proc_ops vvp_dump_pgcache_file_ops = { - PROC_OWNER(THIS_MODULE) - .proc_open = vvp_dump_pgcache_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, +const struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vvp_dump_pgcache_seq_release, }; diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h index 9973d646ae703..0fb9b51a8f618 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,7 +37,6 @@ #ifndef VVP_INTERNAL_H #define VVP_INTERNAL_H -#include #include enum obd_notify_event; @@ -61,7 +60,13 @@ struct vvp_io { /** super class */ struct cl_io_slice vui_cl; struct cl_io_lock_link vui_link; - /** Total size for the left IO. */ + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *vui_iter; + /** + * Total size for the left IO. + */ size_t vui_tot_count; union { @@ -88,6 +93,7 @@ struct vvp_io { * check that flags are from filemap_fault */ bool ft_flags_valid; + struct cl_page_list ft_queue; } fault; struct { struct pipe_inode_info *vui_pipe; @@ -111,6 +117,7 @@ struct vvp_io { * File descriptor against which IO is done. */ struct ll_file_data *vui_fd; + struct kiocb *vui_iocb; /* Readahead state. 
*/ pgoff_t vui_ra_start; @@ -124,7 +131,6 @@ extern struct lu_device_type vvp_device_type; extern struct lu_context_key vvp_session_key; extern struct lu_context_key vvp_thread_key; -extern struct kmem_cache *vvp_lock_kmem; extern struct kmem_cache *vvp_object_kmem; struct vvp_thread_info { @@ -132,6 +138,7 @@ struct vvp_thread_info { struct cl_lock_descr vti_descr; struct cl_io vti_io; struct cl_attr vti_attr; + struct cl_sync_io vti_anchor; }; static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) @@ -251,10 +258,6 @@ struct vvp_device { struct cl_device *vdv_next; }; -struct vvp_lock { - struct cl_lock_slice vlk_cl; -}; - static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) { return &vdv->vdv_cl.cd_lu_dev; @@ -293,11 +296,6 @@ static inline struct page *cl2vm_page(const struct cl_page_slice *slice) return cl2vvp_page(slice)->vpg_page; } -static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice) -{ - return container_of(slice, struct vvp_lock, vlk_cl); -} - #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK # define CLOBINVRNT(env, clob, expr) \ do { \ @@ -317,8 +315,6 @@ int lov_read_and_clear_async_rc(struct cl_object *clob); int vvp_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io); int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); int vvp_page_init(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index); struct lu_object *vvp_object_alloc(const struct lu_env *env, @@ -328,6 +324,6 @@ struct lu_object *vvp_object_alloc(const struct lu_env *env, int vvp_global_init(void); void vvp_global_fini(void); -extern const struct proc_ops vvp_dump_pgcache_file_ops; +extern const struct file_operations vvp_dump_pgcache_file_ops; #endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c index 1bcadeb7cf0da..6d8070c5b8bfd 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_io.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -299,12 +299,14 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) struct cl_object *obj = io->ci_obj; struct vvp_io *vio = cl2vvp_io(env, ios); struct inode *inode = vvp_object_inode(obj); + __u32 gen = 0; int rc; + ENTRY; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " - "need write layout %d, restore needed %d\n", + "need write layout %d, restore needed %d\n", PFID(lu_object_fid(&obj->co_lu)), io->ci_ignore_layout, io->ci_verify_layout, vio->vui_layout_gen, io->ci_need_write_intent, @@ -321,18 +323,40 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) * block on layout lock held by the MDT * as MDT will not send new layout in lvb (see LU-3124) * we have to explicitly fetch it, all this will be done - * by ll_layout_refresh() + * by ll_layout_refresh(). + * Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. 
Therefore it sets + * ci_verify_layout so that it will check layout at the end + * of this function. */ - if (rc == 0) { - io->ci_restore_needed = 0; - io->ci_need_restart = 1; - io->ci_verify_layout = 1; - } else { + if (rc) { io->ci_restore_needed = 1; io->ci_need_restart = 0; io->ci_verify_layout = 0; io->ci_result = rc; + GOTO(out, rc); + } + + io->ci_restore_needed = 0; + + /* Even if ll_layout_restore() returns zero, it doesn't mean + * that restore has been successful. Therefore it should verify + * if there was layout change and restart I/O correspondingly. + */ + ll_layout_refresh(inode, &gen); + io->ci_need_restart = vio->vui_layout_gen != gen; + if (io->ci_need_restart) { + CDEBUG(D_VFSTRACE, + DFID" layout changed from %d to %d.\n", + PFID(lu_object_fid(&obj->co_lu)), + vio->vui_layout_gen, gen); + /* today successful restore is the only possible + * case */ + /* restore was done, clear restoring state */ + ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), + LLIF_FILE_RESTORING); } + GOTO(out, 0); } /** @@ -340,47 +364,29 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) * RPC. */ if (io->ci_need_write_intent) { - loff_t start = 0; - loff_t end = OBD_OBJECT_EOF; + enum layout_intent_opc opc = LAYOUT_INTENT_WRITE; io->ci_need_write_intent = 0; LASSERT(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || cl_io_is_mkwrite(io)); - if (io->ci_type == CIT_WRITE) { - if (!cl_io_is_append(io)) { - start = io->u.ci_rw.rw_range.cir_pos; - end = start + io->u.ci_rw.rw_range.cir_count; - } - } else if (cl_io_is_trunc(io)) { - /* for writes, e_end is endpos, the location of the file - * pointer after the write is completed, so it is not accessed. - * For truncate, 'end' is the size, and *is* acccessed. - * In other words, writes are [start, end), but truncate is - * [start, size], where both are included. So add 1 to the - * size when creating the write intent to account for this. 
- */ - end = io->u.ci_setattr.sa_attr.lvb_size + 1; - } else { /* mkwrite */ - pgoff_t index = io->u.ci_fault.ft_index; + CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n", + PFID(lu_object_fid(&obj->co_lu)), io->ci_type, + PEXT(&io->ci_write_intent)); - start = cl_offset(io->ci_obj, index); - end = cl_offset(io->ci_obj, index + 1); - } + if (cl_io_is_trunc(io)) + opc = LAYOUT_INTENT_TRUNC; - CDEBUG(D_VFSTRACE, DFID" write layout, type %u [%llu, %llu)\n", - PFID(lu_object_fid(&obj->co_lu)), io->ci_type, - start, end); - rc = ll_layout_write_intent(inode, start, end); + rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent); io->ci_result = rc; if (!rc) io->ci_need_restart = 1; + GOTO(out, rc); } - if (!io->ci_ignore_layout && io->ci_verify_layout) { - __u32 gen = 0; - + if (!io->ci_need_restart && + !io->ci_ignore_layout && io->ci_verify_layout) { /* check layout version */ ll_layout_refresh(inode, &gen); io->ci_need_restart = vio->vui_layout_gen != gen; @@ -389,13 +395,11 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) DFID" layout changed from %d to %d.\n", PFID(lu_object_fid(&obj->co_lu)), vio->vui_layout_gen, gen); - /* today successful restore is the only possible - * case */ - /* restore was done, clear restoring state */ - ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)), - LLIF_FILE_RESTORING); } + GOTO(out, 0); } +out: + EXIT; } static void vvp_io_fault_fini(const struct lu_env *env, @@ -426,7 +430,8 @@ static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) return CLM_READ; } -static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) +static int vvp_mmap_locks(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) { struct vvp_thread_info *vti = vvp_env_info(env); struct mm_struct *mm = current->mm; @@ -445,14 +450,18 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) if (!cl_is_normalio(env, io)) RETURN(0); + /* nfs or loop back device write */ + if (vio->vui_iter == NULL) + RETURN(0); + /* No MM (e.g. NFS)? No vmas too. 
*/ if (mm == NULL) RETURN(0); - if (!iter_is_iovec(&io->u.ci_rw.rw_iter) && !iov_iter_is_kvec(&io->u.ci_rw.rw_iter)) + if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter)) RETURN(0); - for (i = io->u.ci_rw.rw_iter; + for (i = *vio->vui_iter; iov_iter_count(&i); iov_iter_advance(&i, iov.iov_len)) { iov = iov_iter_iovec(&i); @@ -528,37 +537,38 @@ static void vvp_io_advance(const struct lu_env *env, return; vio->vui_tot_count -= nob; - if (io->ci_pio) { - iov_iter_advance(&io->u.ci_rw.rw_iter, nob); - io->u.ci_rw.rw_iocb.ki_pos = io->u.ci_rw.rw_range.cir_pos; -#ifdef HAVE_KIOCB_KI_LEFT - io->u.ci_rw.rw_iocb.ki_left = vio->vui_tot_count; -#elif defined(HAVE_KI_NBYTES) - io->u.ci_rw.rw_iocb.ki_nbytes = vio->vui_tot_count; -#endif - } else { - /* It was truncated to stripe size in vvp_io_rw_lock() */ - iov_iter_reexpand(&io->u.ci_rw.rw_iter, vio->vui_tot_count); - } + iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!cl_is_normalio(env, io) || vio->vui_iter == NULL) + return; + + iov_iter_truncate(vio->vui_iter, size); } static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, enum cl_lock_mode mode, loff_t start, loff_t end) { + struct vvp_io *vio = vvp_env_io(env); int result; int ast_flags = 0; LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); ENTRY; - if (cl_is_normalio(env, io)) - iov_iter_truncate(&io->u.ci_rw.rw_iter, - io->u.ci_rw.rw_range.cir_count); + vvp_io_update_iov(env, vio, io); - if (io->u.ci_rw.rw_nonblock) + if (io->u.ci_rw.crw_nonblock) ast_flags |= CEF_NONBLOCK; + if (io->ci_lock_no_expand) + ast_flags |= CEF_LOCK_NO_EXPAND; - result = vvp_mmap_locks(env, io); + result = vvp_mmap_locks(env, vio, io); if (result == 0) result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); @@ -569,13 +579,13 @@ static int vvp_io_read_lock(const struct lu_env *env, const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - int rc; + struct cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; ENTRY; - rc = vvp_io_rw_lock(env, io, CLM_READ, range->cir_pos, - range->cir_pos + range->cir_count - 1); - RETURN(rc); + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + RETURN(result); } static int vvp_io_fault_lock(const struct lu_env *env, @@ -594,27 +604,26 @@ static int vvp_io_fault_lock(const struct lu_env *env, } static int vvp_io_write_lock(const struct lu_env *env, - const struct cl_io_slice *ios) + const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; loff_t start; loff_t end; - int rc; - ENTRY; - if (io->u.ci_rw.rw_append) { + if (io->u.ci_wr.wr_append) { start = 0; end = OBD_OBJECT_EOF; } else { - start = io->u.ci_rw.rw_range.cir_pos; - end = start + io->u.ci_rw.rw_range.cir_count - 1; + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; } - rc = vvp_io_rw_lock(env, io, CLM_WRITE, start, end); - RETURN(rc); + + RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); } static int vvp_io_setattr_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) + { return 0; } @@ -631,12 +640,12 @@ static int vvp_io_setattr_lock(const struct lu_env *env, __u64 new_size; __u32 enqflags = 0; - if (cl_io_is_trunc(io)) { - new_size = io->u.ci_setattr.sa_attr.lvb_size; - if (new_size == 0) - enqflags = CEF_DISCARD_DATA; - } else { - unsigned int valid = 
io->u.ci_setattr.sa_valid; + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + unsigned int valid = io->u.ci_setattr.sa_avalid; if (!(valid & TIMES_SET_FLAGS)) return 0; @@ -685,16 +694,16 @@ static int vvp_io_setattr_time(const struct lu_env *env, int result; unsigned valid = CAT_CTIME; - cl_object_attr_lock(obj); - attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; - if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { - attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; - valid |= CAT_ATIME; - } - if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { - attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; - valid |= CAT_MTIME; - } + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } result = cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); @@ -716,7 +725,7 @@ static int vvp_io_setattr_start(const struct lu_env *env, inode_lock(inode); } - if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS) + if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS) return vvp_io_setattr_time(env, ios); return 0; @@ -764,34 +773,36 @@ static int vvp_io_read_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t pos = range->cir_pos; /* for generic_file_splice_read() only */ - size_t tot = vio->vui_tot_count; - int exceed = 0; - int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = vio->vui_tot_count; + int exceed = 0; + int result; + ENTRY; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", file_dentry(file)->d_name.name, - range->cir_pos, range->cir_pos + range->cir_count); + pos, pos + cnt); if (vio->vui_io_subtype == IO_NORMAL) down_read(&lli->lli_trunc_sem); if (!can_populate_pages(env, io, inode)) - return 0; + RETURN(0); - result = vvp_prep_size(env, obj, io, range->cir_pos, tot, &exceed); + /* Unless this is reading a sparse file, otherwise the lock has already + * been acquired so vvp_prep_size() is an empty op. 
*/ + result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); if (result != 0) - return result; + RETURN(result); else if (exceed != 0) - goto out; + GOTO(out, result); LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "Read ino %lu, %lu bytes, offset %lld, size %llu\n", - inode->i_ino, range->cir_count, range->cir_pos, - i_size_read(inode)); + inode->i_ino, cnt, pos, i_size_read(inode)); /* turn off the kernel's read-ahead */ vio->vui_fd->fd_file->f_ra.ra_pages = 0; @@ -799,7 +810,7 @@ static int vvp_io_read_start(const struct lu_env *env, /* initialize read-ahead window once per syscall */ if (!vio->vui_ra_valid) { vio->vui_ra_valid = true; - vio->vui_ra_start = cl_index(obj, range->cir_pos); + vio->vui_ra_start = cl_index(obj, pos); vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); ll_ras_enter(file); } @@ -808,17 +819,12 @@ static int vvp_io_read_start(const struct lu_env *env, file_accessed(file); switch (vio->vui_io_subtype) { case IO_NORMAL: - LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, - "ki_pos %lld [%lld, %lld)\n", - io->u.ci_rw.rw_iocb.ki_pos, - range->cir_pos, range->cir_pos + range->cir_count); - result = generic_file_read_iter(&io->u.ci_rw.rw_iocb, - &io->u.ci_rw.rw_iter); + LASSERT(vio->vui_iocb->ki_pos == pos); + result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); break; case IO_SPLICE: result = generic_file_splice_read(file, &pos, - vio->u.splice.vui_pipe, - range->cir_count, + vio->u.splice.vui_pipe, cnt, vio->u.splice.vui_flags); /* LU-1109: do splice read stripe by stripe otherwise if it * may make nfsd stuck if this read occupied all internal pipe @@ -829,14 +835,13 @@ static int vvp_io_read_start(const struct lu_env *env, CERROR("Wrong IO type %u\n", vio->vui_io_subtype); LBUG(); } + GOTO(out, result); out: if (result >= 0) { - if (result < range->cir_count) + if (result < cnt) io->ci_continue = 0; io->ci_nob += result; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, vio->vui_fd, - range->cir_pos, result, READ); result = 0; } @@ -892,6 +897,7 @@ static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, SetPageUptodate(cl_page_vmpage(page)); cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -910,6 +916,7 @@ static void write_commit_callback(const struct lu_env *env, struct cl_io *io, cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); cl_page_put(env, page); } @@ -1010,6 +1017,7 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -1027,10 +1035,14 @@ static int vvp_io_write_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - bool lock_inode = !lli->lli_inode_locked && - !IS_NOSEC(inode); ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + bool lock_inode = !IS_NOSEC(inode); + size_t nob = io->ci_nob; + struct iov_iter iter; + size_t written = 0; + ENTRY; if (vio->vui_io_subtype == IO_NORMAL) @@ -1045,29 +1057,28 @@ static int vvp_io_write_start(const struct lu_env *env, * out-of-order writes. 
*/ ll_merge_attr(env, inode); - range->cir_pos = i_size_read(inode); - io->u.ci_rw.rw_iocb.ki_pos = range->cir_pos; + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + vio->vui_iocb->ki_pos = pos; } else { - LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, + LASSERTF(vio->vui_iocb->ki_pos == pos, "ki_pos %lld [%lld, %lld)\n", - io->u.ci_rw.rw_iocb.ki_pos, - range->cir_pos, range->cir_pos + range->cir_count); + vio->vui_iocb->ki_pos, + pos, pos + cnt); } CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", file_dentry(file)->d_name.name, - range->cir_pos, range->cir_pos + range->cir_count); + pos, pos + cnt); /* The maximum Lustre file size is variable, based on the OST maximum * object size and number of stripes. This needs another check in * addition to the VFS checks earlier. */ - if (range->cir_pos + range->cir_count > ll_file_maxbytes(inode)) { + if (pos + cnt > ll_file_maxbytes(inode)) { CDEBUG(D_INODE, "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", ll_get_fsname(inode->i_sb, NULL, 0), file_dentry(file)->d_name.name, - PFID(ll_inode2fid(inode)), - range->cir_pos + range->cir_count, + PFID(ll_inode2fid(inode)), pos + cnt, ll_file_maxbytes(inode)); RETURN(-EFBIG); } @@ -1079,52 +1090,85 @@ static int vvp_io_write_start(const struct lu_env *env, if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) RETURN(-EINVAL); - /* - * When using the locked AIO function (generic_file_aio_write()) - * testing has shown the inode mutex to be a limiting factor - * with multi-threaded single shared file performance. To get - * around this, we now use the lockless version. To maintain - * consistency, proper locking to protect against writes, - * trucates, etc. is handled in the higher layers of lustre. - */ - if (lock_inode) - inode_lock(inode); - result = __generic_file_write_iter(&io->u.ci_rw.rw_iocb, - &io->u.ci_rw.rw_iter); - if (lock_inode) - inode_unlock(inode); - - if (result > 0 || result == -EIOCBQUEUED) + if (vio->vui_iter == NULL) { + /* from a temp io in ll_cl_init(). */ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. 
+ */ + lock_inode = !IS_NOSEC(inode); + iter = *vio->vui_iter; + + if (unlikely(lock_inode)) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, + vio->vui_iter); + if (unlikely(lock_inode)) + inode_unlock(inode); + + written = result; + if (result > 0 || result == -EIOCBQUEUED) #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS - result = generic_write_sync(&io->u.ci_rw.rw_iocb, result); + result = generic_write_sync(vio->vui_iocb, result); #else - { - ssize_t err; + { + ssize_t err; - err = generic_write_sync(io->u.ci_rw.rw_iocb.ki_filp, - range->cir_pos, result); - if (err < 0 && result > 0) - result = err; - } + err = generic_write_sync(vio->vui_iocb->ki_filp, pos, + result); + if (err < 0 && result > 0) + result = err; + } #endif + } if (result > 0) { result = vvp_io_write_commit(env, io); + /* Simulate short commit */ + if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) { + vio->u.write.vui_written >>= 1; + if (vio->u.write.vui_written > 0) + io->ci_need_restart = 1; + } if (vio->u.write.vui_written > 0) { result = vio->u.write.vui_written; CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n", file_dentry(file)->d_name.name, io->ci_nob, result); io->ci_nob += result; + } else { + io->ci_continue = 0; } } + if (vio->vui_iocb->ki_pos != (pos + io->ci_nob - nob)) { + CDEBUG(D_VFSTRACE, "%s: write position mismatch: " + "ki_pos %lld vs. pos %lld, written %ld, commit %ld " + "rc %ld\n", + file_dentry(file)->d_name.name, + vio->vui_iocb->ki_pos, pos + io->ci_nob - nob, + written, io->ci_nob - nob, result); + /* + * Rewind ki_pos and vui_iter to where it has + * successfully committed. + */ + vio->vui_iocb->ki_pos = pos + io->ci_nob - nob; + iov_iter_advance(&iter, io->ci_nob - nob); + vio->vui_iter->iov = iter.iov; + vio->vui_iter->nr_segs = iter.nr_segs; + vio->vui_iter->iov_offset = iter.iov_offset; + vio->vui_iter->count = iter.count; + } if (result > 0) { ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); - if (result < range->cir_count) + if (result < cnt) io->ci_continue = 0; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, range->cir_pos, result, WRITE); result = 0; } @@ -1279,7 +1323,7 @@ static int vvp_io_fault_start(const struct lu_env *env, if (fio->ft_mkwrite) { wait_on_page_writeback(vmpage); if (!PageDirty(vmpage)) { - struct cl_page_list *plist = &io->ci_queue.c2_qin; + struct cl_page_list *plist = &vio->u.fault.ft_queue; struct vvp_page *vpg = cl_object_page_slice(obj, page); int to = PAGE_SIZE; @@ -1291,13 +1335,34 @@ static int vvp_io_fault_start(const struct lu_env *env, /* size fixup */ if (last_index == vvp_index(vpg)) - to = size & ~PAGE_MASK; + to = ((size - 1) & ~PAGE_MASK) + 1; /* Do not set Dirty bit here so that in case IO is * started before the page is really made dirty, we * still have chance to detect it. 
*/ result = cl_io_commit_async(env, io, plist, 0, to, mkwrite_commit_callback); + /* Have overquota flag, trying sync write to check + * whether indeed out of quota */ + if (result == -EDQUOT) { + cl_page_get(page); + result = vvp_io_commit_sync(env, io, + plist, 0, to); + if (result >= 0) { + io->ci_noquota = 1; + cl_page_own(env, io, page); + cl_page_list_add(plist, page); + lu_ref_add(&page->cp_reference, + "cl_io", io); + result = cl_io_commit_async(env, io, + plist, 0, to, + mkwrite_commit_callback); + io->ci_noquota = 0; + } else { + cl_page_put(env, page); + } + } + LASSERT(cl_page_is_owned(page, io)); cl_page_list_fini(env, plist); @@ -1312,8 +1377,9 @@ static int vvp_io_fault_start(const struct lu_env *env, if (result == -EDQUOT) result = -ENOSPC; GOTO(out, result); - } else + } else { cl_page_disown(env, io, page); + } } } @@ -1422,6 +1488,9 @@ static const struct cl_io_operations vvp_io_ops = { .cio_start = vvp_io_fsync_start, .cio_fini = vvp_io_fini }, + [CIT_GLIMPSE] = { + .cio_fini = vvp_io_fini + }, [CIT_MISC] = { .cio_fini = vvp_io_fini }, @@ -1453,13 +1522,16 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, vio->vui_ra_valid = false; result = 0; if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; struct ll_inode_info *lli = ll_i2info(inode); - vio->vui_tot_count = io->u.ci_rw.rw_range.cir_count; + count = io->u.ci_rw.crw_count; /* "If nbyte is 0, read() will return 0 and have no other * results." -- Single Unix Spec */ - if (vio->vui_tot_count == 0) + if (count == 0) result = 1; + else + vio->vui_tot_count = count; /* for read/write, we store the jobid in the inode, and * it'll be fetched by osc when building RPC. @@ -1467,7 +1539,7 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, * it's not accurate if the file is shared by different * jobs. */ - lustre_get_jobid(lli->lli_jobid); + lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid)); } else if (io->ci_type == CIT_SETATTR) { if (!cl_io_is_trunc(io)) io->ci_lockreq = CILR_MANDATORY; @@ -1490,5 +1562,6 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, PFID(lu_object_fid(&obj->co_lu)), result); } + io->ci_result = result < 0 ? result : 0; RETURN(result); } diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c b/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c deleted file mode 100644 index 651b8e128239d..0000000000000 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_lock.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for VVP layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include "vvp_internal.h" - -/***************************************************************************** - * - * Vvp lock functions. - * - */ - -static void vvp_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) -{ - struct vvp_lock *vlk = cl2vvp_lock(slice); - - OBD_SLAB_FREE_PTR(vlk, vvp_lock_kmem); -} - -static int vvp_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - CLOBINVRNT(env, slice->cls_obj, vvp_object_invariant(slice->cls_obj)); - - return 0; -} - -static const struct cl_lock_operations vvp_lock_ops = { - .clo_fini = vvp_lock_fini, - .clo_enqueue = vvp_lock_enqueue, -}; - -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *unused) -{ - struct vvp_lock *vlk; - int result; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - OBD_SLAB_ALLOC_PTR_GFP(vlk, vvp_lock_kmem, GFP_NOFS); - if (vlk != NULL) { - cl_lock_slice_add(lock, &vlk->vlk_cl, obj, &vvp_lock_ops); - result = 0; - } else { - result = -ENOMEM; - } - - return result; -} diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c index fd7211f60c61f..c3bf715667577 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_object.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -169,6 +169,13 @@ static int vvp_prune(const struct lu_env *env, struct cl_object *obj) } truncate_inode_pages(inode->i_mapping, 0); + if (inode->i_mapping->nrpages) { + CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n", + PFID(lu_object_fid(&obj->co_lu)), + inode->i_mapping->nrpages); + RETURN(-EIO); + } + RETURN(0); } @@ -198,26 +205,25 @@ static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, { struct inode *inode; struct obdo *oa; - u64 valid_flags = OBD_MD_FLTYPE; + u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; oa = attr->cra_oa; inode = vvp_object_inode(obj); if (attr->cra_type == CRT_WRITE) { - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLUID | OBD_MD_FLGID; + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; obdo_set_o_projid(oa, ll_i2info(inode)->lli_projid); } obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) oa->o_parent_oid++; - memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, LUSTRE_JOBID_SIZE); + memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, + sizeof(attr->cra_jobid)); } static const struct cl_object_operations vvp_ops = { .coo_page_init = vvp_page_init, - .coo_lock_init = vvp_lock_init, .coo_io_init = vvp_io_init, .coo_attr_get = vvp_attr_get, .coo_attr_update = vvp_attr_update, diff --git a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c index 47d48639ad43c..0f4e2a9e83dac 100644 --- a/drivers/staging/lustrefsx/lustre/llite/vvp_page.c +++ b/drivers/staging/lustrefsx/lustre/llite/vvp_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -54,16 +54,22 @@ * */ -static void vvp_page_fini_common(struct vvp_page *vpg) +static void vvp_page_fini_common(struct vvp_page *vpg, struct pagevec *pvec) { struct page *vmpage = vpg->vpg_page; LASSERT(vmpage != NULL); - put_page(vmpage); + if (pvec) { + if (!pagevec_add(pvec, vmpage)) + pagevec_release(pvec); + } else { + put_page(vmpage); + } } static void vvp_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) + struct cl_page_slice *slice, + struct pagevec *pvec) { struct vvp_page *vpg = cl2vvp_page(slice); struct page *vmpage = vpg->vpg_page; @@ -73,7 +79,7 @@ static void vvp_page_fini(const struct lu_env *env, * VPG_FREEING state. 
*/ LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); - vvp_page_fini_common(vpg); + vvp_page_fini_common(vpg, pvec); } static int vvp_page_own(const struct lu_env *env, @@ -144,7 +150,7 @@ static void vvp_page_discard(const struct lu_env *env, LASSERT(vmpage != NULL); LASSERT(PageLocked(vmpage)); - if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used) + if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used && vmpage->mapping) ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); ll_invalidate_page(vmpage); @@ -154,14 +160,12 @@ static void vvp_page_delete(const struct lu_env *env, const struct cl_page_slice *slice) { struct page *vmpage = cl2vm_page(slice); - struct inode *inode = vmpage->mapping->host; - struct cl_object *obj = slice->cpl_obj; struct cl_page *page = slice->cpl_page; int refc; LASSERT(PageLocked(vmpage)); LASSERT((struct cl_page *)vmpage->private == page); - LASSERT(inode == vvp_object_inode(obj)); + /* Drop the reference count held in vvp_page_init */ refc = atomic_dec_return(&page->cp_ref); @@ -242,8 +246,8 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret else set_bit(AS_EIO, &inode->i_mapping->flags); - if ((ioret == -ESHUTDOWN || ioret == -EINTR) && - obj->vob_discard_page_warned == 0) { + if ((ioret == -ESHUTDOWN || ioret == -EINTR || + ioret == -EIO) && obj->vob_discard_page_warned == 0) { obj->vob_discard_page_warned = 1; ll_dirty_page_discard_warn(vmpage, ioret); } @@ -269,8 +273,14 @@ static void vvp_page_completion_read(const struct lu_env *env, if (ioret == 0) { if (!vpg->vpg_defer_uptodate) cl_page_export(env, page, 1); - } else { + } else if (vpg->vpg_defer_uptodate) { vpg->vpg_defer_uptodate = 0; + if (ioret == -EWOULDBLOCK) { + /* mirror read failed, it needs to destroy the page + * because subpage would be from wrong osc when trying + * to read from a new mirror */ + ll_invalidate_page(vmpage); + } } if (page->cp_sync_io == NULL) @@ -484,13 +494,14 @@ vvp_transient_page_completion(const struct lu_env *env, } static void vvp_transient_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) + struct cl_page_slice *slice, + struct pagevec *pvec) { struct vvp_page *vpg = cl2vvp_page(slice); struct cl_page *clp = slice->cpl_page; struct vvp_object *clobj = cl2vvp(clp->cp_obj); - vvp_page_fini_common(vpg); + vvp_page_fini_common(vpg, pvec); atomic_dec(&clobj->vob_transient_pages); } diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr.c b/drivers/staging/lustrefsx/lustre/llite/xattr.c index ae0ee171ac4e7..35da3f779e02a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,8 @@ #include #include -#include #include +#include #include "llite_internal.h" @@ -111,7 +111,10 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, int rc; ENTRY; - if (flags == XATTR_REPLACE) { + /* When setxattr() is called with a size of 0 the value is + * unconditionally replaced by "". When removexattr() is + * called we get a NULL value and XATTR_REPLACE for flags. 
*/ + if (!value && flags == XATTR_REPLACE) { ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); valid = OBD_MD_FLXATTRRM; } else { @@ -131,7 +134,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, if ((handler->flags == XATTR_ACL_ACCESS_T || handler->flags == XATTR_ACL_DEFAULT_T) && /* Test for older kernels that was cleaned up in LU-12477 and LU-10092 */ -#if defined(HAVE_USER_NAMESPACE_ARG) || defined(HAVE_INODE_OWNER_OR_CAPABLE) +#if defined(HAVE_INODE_OWNER_OR_CAPABLE) || defined(HAVE_USER_NAMESPACE_ARG) !inode_owner_or_capable(mnt_userns, inode)) #else !is_owner_or_cap(inode)) @@ -163,7 +166,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(-ENOMEM); rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, - pv, size, 0, flags, ll_i2suppgid(inode), &req); + pv, size, flags, ll_i2suppgid(inode), &req); kfree(fullname); if (rc) { if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { @@ -177,7 +180,7 @@ static int ll_xattr_set_common(const struct xattr_handler *handler, RETURN(0); } -static int get_hsm_state(struct inode *inode, u32 *hus_states) +int ll_get_hsm_state(struct inode *inode, u32 *hus_states) { struct md_op_data *op_data; struct hsm_user_state *hus; @@ -208,7 +211,7 @@ static int get_hsm_state(struct inode *inode, u32 *hus_states) return rc; } -static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) +static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump, size_t size) { struct lov_comp_md_v1 *comp_v1 = (struct lov_comp_md_v1 *)lump; struct lov_user_md *v1 = lump; @@ -223,7 +226,12 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) return 0; if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { + if (size < sizeof(*comp_v1)) + return -ERANGE; + entry_count = comp_v1->lcm_entry_count; + if (size < offsetof(typeof(*comp_v1), lcm_entries[entry_count])) + return -ERANGE; is_composite = true; } @@ -231,6 +239,10 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) if (lump->lmm_magic == LOV_USER_MAGIC_COMP_V1) { void *ptr = comp_v1; + if (comp_v1->lcm_entries[i].lcme_offset + sizeof(*v1) > + size) + return -ERANGE; + ptr += comp_v1->lcm_entries[i].lcme_offset; v1 = (struct lov_user_md *)ptr; } @@ -249,7 +261,7 @@ static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) if (!release_checked) { u32 state = HS_NONE; - rc = get_hsm_state(inode, &state); + rc = ll_get_hsm_state(inode, &state); if (rc) return rc; @@ -278,7 +290,13 @@ static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, if (!size && lump) lump = NULL; - rc = ll_adjust_lum(inode, lump); + if (size && size < sizeof(*lump)) { + /* ll_adjust_lum() or ll_lov_user_md_size() might access + * before size - just give up now. 
+ */ + return -ERANGE; + } + rc = ll_adjust_lum(inode, lump, size); if (rc) return rc; @@ -346,8 +364,13 @@ static int ll_xattr_set(const struct xattr_handler *handler, return 0; } - return ll_xattr_set_common(handler, mnt_userns, dentry, inode, name, - value, size, flags); + if (strncmp(name, "lov.", 4) == 0 && + (__swab32(((struct lov_user_md *)value)->lmm_magic) & + le32_to_cpu(LOV_MAGIC_MASK)) == le32_to_cpu(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)value, 0); + + return ll_xattr_set_common(handler, mnt_userns, dentry, inode, name, + value, size, flags); } int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, @@ -356,7 +379,6 @@ int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; - struct mdt_body *body; void *xdata; int rc; ENTRY; @@ -383,35 +405,25 @@ int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, } } else { getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, name, NULL, 0, size, 0, &req); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); if (rc < 0) GOTO(out_xattr, rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - /* only detect the xattr size */ if (size == 0) - GOTO(out, rc = body->mbo_eadatasize); + GOTO(out, rc); - if (size < body->mbo_eadatasize) { - CERROR("server bug: replied size %u > %u\n", - body->mbo_eadatasize, (int)size); + if (size < rc) GOTO(out, rc = -ERANGE); - } - - if (body->mbo_eadatasize == 0) - GOTO(out, rc = -ENODATA); /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); + rc); if (!xdata) - GOTO(out, rc = -EFAULT); + GOTO(out, rc = -EPROTO); - memcpy(buffer, xdata, body->mbo_eadatasize); - rc = body->mbo_eadatasize; + memcpy(buffer, xdata, rc); } EXIT; @@ -524,21 +536,37 @@ static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) * recognizing layout gen as stripe offset when the * file is restored. See LU-2809. 
*/ - if (((struct lov_mds_md *)buf)->lmm_magic == LOV_MAGIC_COMP_V1) + if ((((struct lov_mds_md *)buf)->lmm_magic & + __swab32(LOV_MAGIC_MAGIC)) == __swab32(LOV_MAGIC_MAGIC)) + lustre_swab_lov_user_md((struct lov_user_md *)buf, + cl.cl_size); + + switch (((struct lov_mds_md *)buf)->lmm_magic) { + case LOV_MAGIC_V1: + case LOV_MAGIC_V3: + case LOV_MAGIC_SPECIFIC: + ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; + break; + case LOV_MAGIC_COMP_V1: goto out_env; + default: + CERROR("Invalid LOV magic %08x\n", + ((struct lov_mds_md *)buf)->lmm_magic); + GOTO(out_env, rc = -EINVAL); + } - ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; out_env: cl_env_put(env, &refcheck); RETURN(rc); } else if (S_ISDIR(inode->i_mode)) { struct ptlrpc_request *req = NULL; + struct ptlrpc_request *root_req = NULL; struct lov_mds_md *lmm = NULL; int lmm_size = 0; - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, - &req, 0); + rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmm_size, + &req, &root_req, 0); if (rc < 0) GOTO(out_req, rc); @@ -553,6 +581,8 @@ static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) out_req: if (req) ptlrpc_req_finished(req); + if (root_req) + ptlrpc_req_finished(root_req); RETURN(rc); } else { diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr26.c b/drivers/staging/lustrefsx/lustre/llite/xattr26.c index 84e9b8bcbe915..28772dd5a74a1 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr26.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr26.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include "llite_internal.h" @@ -152,7 +152,7 @@ int ll_setxattr_common(struct inode *inode, const char *name, } rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, name, pv, - size, 0, flags, ll_i2suppgid(inode), &req); + size, flags, ll_i2suppgid(inode), &req); if (rc) { if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { LCONSOLE_INFO("Disabling user_xattr feature because " @@ -329,7 +329,6 @@ int ll_getxattr_common(struct inode *inode, const char *name, { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; - struct mdt_body *body; int xattr_type, rc; void *xdata; struct ll_inode_info *lli = ll_i2info(inode); @@ -405,36 +404,25 @@ int ll_getxattr_common(struct inode *inode, const char *name, } } else { getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, name, NULL, 0, size, 0, &req); - + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, + name, size, &req); if (rc < 0) GOTO(out_xattr, rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - /* only detect the xattr size */ if (size == 0) - GOTO(out, rc = body->mbo_eadatasize); + GOTO(out, rc); - if (size < body->mbo_eadatasize) { - CERROR("server bug: replied size %u > %u\n", - body->mbo_eadatasize, (int)size); + if (size < rc) GOTO(out, rc = -ERANGE); - } - - if (body->mbo_eadatasize == 0) - GOTO(out, rc = -ENODATA); /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); + rc); if (!xdata) - GOTO(out, rc = -EFAULT); + GOTO(out, rc = -EPROTO); - memcpy(buffer, xdata, body->mbo_eadatasize); - rc = body->mbo_eadatasize; + 
memcpy(buffer, xdata, rc); } EXIT; diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c index a001e5c2d8a7b..f1022b0296f47 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_cache.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Author: Andrew Perepechko * @@ -37,7 +37,6 @@ #include #include #include -#include #include "llite_internal.h" /* If we ever have hundreds of extended attributes, we might want to consider diff --git a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c index d7e9ec8473ae4..83dccf8a52e3e 100644 --- a/drivers/staging/lustrefsx/lustre/llite/xattr_security.c +++ b/drivers/staging/lustrefsx/lustre/llite/xattr_security.c @@ -205,3 +205,36 @@ ll_inode_init_security(struct dentry *dentry, struct inode *inode, return err; } #endif /* HAVE_SECURITY_IINITSEC_CALLBACK */ + +/** + * Get security context xattr name used by policy. + * + * \retval >= 0 length of xattr name + * \retval < 0 failure to get security context xattr name + */ +int +ll_listsecurity(struct inode *inode, char *secctx_name, size_t secctx_name_size) +{ + int rc; + + if (!selinux_is_enabled()) + return 0; + +#ifdef HAVE_SECURITY_INODE_LISTSECURITY + rc = security_inode_listsecurity(inode, secctx_name, secctx_name_size); + if (rc >= secctx_name_size) + rc = -ERANGE; + else if (rc >= 0) + secctx_name[rc] = '\0'; + return rc; +#else /* !HAVE_SECURITY_INODE_LISTSECURITY */ + rc = sizeof(XATTR_NAME_SELINUX); + if (secctx_name && rc < secctx_name_size) { + memcpy(secctx_name, XATTR_NAME_SELINUX, rc); + secctx_name[rc] = '\0'; + } else { + rc = -ERANGE; + } + return rc; +#endif /* HAVE_SECURITY_INODE_LISTSECURITY */ +} diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c index b5ec306dcc224..b439d87ae9348 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c index bb792e751e94f..24c616b4b6cd9 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -42,7 +42,6 @@ #include #include -#include #include #include #include @@ -55,7 +54,8 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, const struct lu_fid *parent_fid, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags) + __u64 extra_lock_flags, + const char *secctx_name, __u32 secctx_name_size) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -74,13 +74,6 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, LASSERT((body->mbo_valid & OBD_MD_MDS)); - /* - * Unfortunately, we have to lie to MDC/MDS to retrieve - * attributes llite needs and provideproper locking. - */ - if (it->it_op & IT_LOOKUP) - it->it_op = IT_GETATTR; - /* * We got LOOKUP lock, but we really need attrs. 
*/ @@ -115,6 +108,16 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", PFID(&body->mbo_fid1), tgt->ltd_idx); + /* ask for security context upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && + secctx_name_size != 0 && secctx_name != NULL) { + op_data->op_file_secctx_name = secctx_name; + op_data->op_file_secctx_name_size = secctx_name_size; + CDEBUG(D_SEC, "'%.*s' is security xattr to fetch for " + DFID"\n", + secctx_name_size, secctx_name, PFID(&body->mbo_fid1)); + } + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, extra_lock_flags); if (rc) @@ -153,13 +156,14 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_blocking_callback cb_blocking, int extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct ptlrpc_request *req = NULL; - struct mdt_body *body; - struct md_op_data *op_data; - int i; - int rc = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int i; + int valid_stripe_count = 0; + int rc = 0; ENTRY; @@ -185,6 +189,9 @@ int lmv_revalidate_slaves(struct obd_export *exp, fid = lsm->lsm_md_oinfo[i].lmo_fid; inode = lsm->lsm_md_oinfo[i].lmo_root; + if (!inode) + continue; + /* * Prepare op_data for revalidating. Note that @fid2 shluld be * defined otherwise it will go to server and take new lock @@ -193,8 +200,14 @@ int lmv_revalidate_slaves(struct obd_export *exp, memset(op_data, 0, sizeof(*op_data)); op_data->op_fid1 = fid; op_data->op_fid2 = fid; + /* shard revalidate only needs to fetch attributes and UPDATE + * lock, which is similar to the bottom half of remote object + * getattr, set this flag so that MDT skips checking whether + * it's remote object. + */ + op_data->op_bias = MDS_CROSS_REF; - tgt = lmv_locate_mds(lmv, op_data, &fid); + tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); if (IS_ERR(tgt)) GOTO(cleanup, rc = PTR_ERR(tgt)); @@ -208,6 +221,12 @@ int lmv_revalidate_slaves(struct obd_export *exp, rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, cb_blocking, extra_lock_flags); + if (rc == -ENOENT) { + /* skip stripe is not exists */ + rc = 0; + continue; + } + if (rc < 0) GOTO(cleanup, rc); @@ -243,17 +262,22 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_lock_decref(lockh, it.it_lock_mode); it.it_lock_mode = 0; } + + valid_stripe_count++; } cleanup: if (req != NULL) ptlrpc_req_finished(req); + /* if all stripes are invalid, return -ENOENT to notify user */ + if (!rc && !valid_stripe_count) + rc = -ENOENT; + OBD_FREE_PTR(op_data); RETURN(rc); } - /* * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) * may be split dir. 
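The revalidate-slaves hunk above changes the error handling so that a stripe with no instantiated inode, or one that has vanished on its MDT (per-stripe -ENOENT), no longer aborts the whole call; only when every stripe turns out to be invalid does the caller get -ENOENT. Below is a condensed, stand-alone sketch of that pattern for illustration only; it is not part of the patch, and the demo_ struct and function names are invented for the example.

/*
 * Illustrative sketch of the per-stripe revalidation pattern:
 * skip uninstantiated stripes, tolerate a stripe that has gone
 * missing, and report -ENOENT only if no stripe at all is valid.
 */
#include <errno.h>
#include <stddef.h>

struct demo_stripe {
	void *root;		/* NULL if the stripe inode is not instantiated */
	int   revalidate_rc;	/* result of revalidating this stripe */
};

static int demo_revalidate_slaves(struct demo_stripe *stripes, int count)
{
	int valid_stripe_count = 0;
	int rc = 0;
	int i;

	for (i = 0; i < count; i++) {
		if (!stripes[i].root)
			continue;		/* stripe not set up, skip it */

		rc = stripes[i].revalidate_rc;
		if (rc == -ENOENT) {
			rc = 0;			/* stripe no longer exists, tolerate it */
			continue;
		}
		if (rc < 0)
			return rc;		/* hard error, give up */

		valid_stripe_count++;
	}

	/* if every stripe was missing or gone, tell the caller -ENOENT */
	if (!rc && !valid_stripe_count)
		rc = -ENOENT;

	return rc;
}

Treating a per-stripe -ENOENT as "skip" keeps a partially damaged striped directory usable, while the final check still reports the directory as gone once nothing valid remains.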
@@ -264,13 +288,58 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + __u64 flags = it->it_flags; + int rc; + ENTRY; + if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { + /* don't allow create under dir with bad hash */ + if (lmv_is_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_is_dir_migrating(op_data->op_mea1)) { + if (flags & O_EXCL) { + /* + * open(O_CREAT | O_EXCL) needs to check + * existing name, which should be done on both + * old and new layout, to avoid creating new + * file under old layout, check old layout on + * client side. + */ + tgt = lmv_locate_tgt(lmv, op_data, + &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getattr_name(tgt->ltd_exp, op_data, + reqp); + if (!rc) { + ptlrpc_req_finished(*reqp); + *reqp = NULL; + RETURN(-EEXIST); + } + + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_post_migrate = true; + } else { + /* + * open(O_CREAT) will be sent to MDT in old + * layout first, to avoid creating new file + * under old layout, clear O_CREAT. + */ + it->it_flags &= ~O_CREAT; + } + } + } + +retry: if (it->it_flags & MDS_OPEN_BY_FID) { LASSERT(fid_is_sane(&op_data->op_fid2)); @@ -290,7 +359,7 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, LASSERT(fid_is_zero(&op_data->op_fid2)); LASSERT(op_data->op_name != NULL); - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } @@ -321,8 +390,21 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, */ if ((it->it_disposition & DISP_LOOKUP_NEG) && !(it->it_disposition & DISP_OPEN_CREATE) && - !(it->it_disposition & DISP_OPEN_OPEN)) + !(it->it_disposition & DISP_OPEN_OPEN)) { + if (!(it->it_flags & MDS_OPEN_BY_FID) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; + + it->it_flags = flags; + fid_zero(&op_data->op_fid2); + goto retry; + } + RETURN(rc); + } body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); if (body == NULL) @@ -331,7 +413,9 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, /* Not cross-ref case, just get out of here. 
*/ if (unlikely((body->mbo_valid & OBD_MD_MDS))) { rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, - cb_blocking, extra_lock_flags); + cb_blocking, extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); if (rc != 0) RETURN(rc); @@ -352,42 +436,56 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - struct mdt_body *body; - struct lmv_stripe_md *lsm = op_data->op_mea1; - int rc = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; ENTRY; - /* If it returns ERR_PTR(-EBADFD) then it is an unknown hash type - * it will try all stripes to locate the object */ - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt) && (PTR_ERR(tgt) != -EBADFD)) - RETURN(PTR_ERR(tgt)); - - /* Both migrating dir and unknown hash dir need to try - * all of sub-stripes */ - if (lsm != NULL && !lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { - struct lmv_oinfo *oinfo; +retry: + if (op_data->op_flags & MF_GETATTR_BY_FID) { + /* getattr by FID, replace fid1 with stripe FID, + * NB, don't replace if name is "/", because it may be a subtree + * mount, and if it's a striped directory, fid1 will be replaced + * to stripe FID by hash, while fid2 is master object FID, which + * will be treated as a remote object if the two FIDs are + * located on different MDTs, and LOOKUP lock can't be fetched. + */ + LASSERT(op_data->op_name); + if (op_data->op_namelen != 1 || + strncmp(op_data->op_name, "/", 1) != 0) { + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } - oinfo = &lsm->lsm_md_oinfo[0]; + /* name is used to locate stripe target, clear it here + * to avoid packing name in request, so that MDS knows + * it's getattr by FID. + */ + op_data->op_name = NULL; + op_data->op_namelen = 0; - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + /* getattr request is sent to MDT where fid2 inode is */ + tgt = lmv_find_target(lmv, &op_data->op_fid2); + } else if (op_data->op_name) { + /* getattr by name */ + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + } else { + /* old way to getattr by FID, parent FID not packed */ + tgt = lmv_find_target(lmv, &op_data->op_fid1); } - - if (!fid_is_sane(&op_data->op_fid2)) - fid_zero(&op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID - ", name='%s' -> mds #%u lsm=%p lsm_magic=%x\n", + ", name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), op_data->op_name ? op_data->op_name : "", - tgt->ltd_idx, lsm, lsm == NULL ? 
-1 : lsm->lsm_md_magic); + tgt->ltd_idx); op_data->op_bias &= ~MDS_CROSS_REF; @@ -407,37 +505,14 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } RETURN(rc); - } else if (it_disposition(it, DISP_LOOKUP_NEG) && lsm != NULL && - lmv_need_try_all_stripes(lsm)) { - /* For migrating and unknown hash type directory, it will - * try to target the entry on other stripes */ - int stripe_index; - - for (stripe_index = 1; - stripe_index < lsm->lsm_md_stripe_count && - it_disposition(it, DISP_LOOKUP_NEG); stripe_index++) { - struct lmv_oinfo *oinfo; - - /* release the previous request */ - ptlrpc_req_finished(*reqp); - it->it_request = NULL; - *reqp = NULL; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - tgt = lmv_find_target(lmv, &oinfo->lmo_fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - CDEBUG(D_INODE, "Try other stripes " DFID"\n", - PFID(&oinfo->lmo_fid)); + } else if (it_disposition(it, DISP_LOOKUP_NEG) && + lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + it->it_disposition = 0; + *reqp = NULL; - op_data->op_fid1 = oinfo->lmo_fid; - it->it_disposition &= ~DISP_ENQ_COMPLETE; - rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, - cb_blocking, extra_lock_flags); - if (rc != 0) - RETURN(rc); - } + goto retry; } if (!it_has_reply_body(it)) @@ -454,7 +529,9 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, /* Not cross-ref case, just get out of here. */ if (unlikely((body->mbo_valid & OBD_MD_MDS))) { rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, - extra_lock_flags); + extra_lock_flags, + op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); if (rc != 0) RETURN(rc); body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h index 8ef0631f3301a..0ad743244e93e 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -33,7 +33,6 @@ #ifndef _LMV_INTERNAL_H_ #define _LMV_INTERNAL_H_ -#include #include #include @@ -59,6 +58,9 @@ int lmv_revalidate_slaves(struct obd_export *exp, ldlm_blocking_callback cb_blocking, int extra_lock_flags); +int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **preq); + static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) { return container_of0(lmv, struct obd_device, u.lmv); @@ -123,39 +125,90 @@ static inline int lmv_stripe_md_size(int stripe_count) return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); } +/* for file under migrating directory, return the target stripe info */ static inline const struct lmv_oinfo * lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, - int namelen) + int namelen, bool post_migrate) { + __u32 hash_type = lsm->lsm_md_hash_type; + __u32 stripe_count = lsm->lsm_md_stripe_count; int stripe_index; - stripe_index = lmv_name_to_stripe_index(lsm->lsm_md_hash_type, - lsm->lsm_md_stripe_count, + if (hash_type & LMV_HASH_FLAG_MIGRATION) { + if (post_migrate) { + hash_type &= ~LMV_HASH_FLAG_MIGRATION; + stripe_count = lsm->lsm_md_migrate_offset; + } else { + hash_type = lsm->lsm_md_migrate_hash; + stripe_count -= lsm->lsm_md_migrate_offset; + } + } + + stripe_index = lmv_name_to_stripe_index(hash_type, stripe_count, name, namelen); if (stripe_index < 0) return ERR_PTR(stripe_index); - LASSERTF(stripe_index < lsm->lsm_md_stripe_count, - 
"stripe_index = %d, stripe_count = %d hash_type = %x" - "name = %.*s\n", stripe_index, lsm->lsm_md_stripe_count, - lsm->lsm_md_hash_type, namelen, name); + if ((lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) && !post_migrate) + stripe_index += lsm->lsm_md_migrate_offset; + + if (stripe_index >= lsm->lsm_md_stripe_count) { + CERROR("stripe_index %d stripe_count %d hash_type %#x " + "migrate_offset %d migrate_hash %#x name %.*s\n", + stripe_index, lsm->lsm_md_stripe_count, + lsm->lsm_md_hash_type, lsm->lsm_md_migrate_offset, + lsm->lsm_md_migrate_hash, namelen, name); + return ERR_PTR(-EBADF); + } return &lsm->lsm_md_oinfo[stripe_index]; } -static inline bool lmv_need_try_all_stripes(const struct lmv_stripe_md *lsm) +static inline bool lmv_is_dir_migrating(const struct lmv_stripe_md *lsm) { - return !lmv_is_known_hash_type(lsm->lsm_md_hash_type) || - lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; + return lsm ? lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION : false; +} + +static inline bool lmv_is_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lsm) + return false; + + if (lmv_is_dir_migrating(lsm)) { + if (lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset > 1) + return !lmv_is_known_hash_type( + lsm->lsm_md_migrate_hash); + return false; + } + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); } -struct lmv_tgt_desc -*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid); +static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_mea1; + + if (!lsm) + return false; + + if (lmv_is_dir_migrating(lsm) && !op_data->op_post_migrate) { + op_data->op_post_migrate = true; + return true; + } + + if (lmv_is_dir_bad_hash(lsm) && + op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { + op_data->op_stripe_index++; + return true; + } + + return false; +} + +struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, + struct md_op_data *op_data, + struct lu_fid *fid); /* lproc_lmv.c */ -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_lmv_obd_vars[]; -#endif -extern const struct proc_ops lmv_proc_target_fops; +int lmv_tunables_init(struct obd_device *obd); #endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c index 8b073a6d9846f..078f6e2a59aad 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,7 +31,8 @@ */ #define DEBUG_SUBSYSTEM S_LMV -#include + +#include #include #include #include @@ -45,7 +46,6 @@ #include #include -#include #include #include #include @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include "lmv_internal.h" @@ -213,30 +213,24 @@ static int lmv_connect(const struct lu_env *env, lmv->connected = 0; lmv->conn_data = *data; - if (lmv->targets_proc_entry == NULL) { - lmv->targets_proc_entry = lprocfs_register("target_obds", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lmv->targets_proc_entry)) { - CERROR("%s: cannot register " - "/proc/fs/lustre/%s/%s/target_obds\n", - obd->obd_name, obd->obd_type->typ_name, - obd->obd_name); - lmv->targets_proc_entry = NULL; - } + lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); + if (!lmv->lmv_tgts_kobj) { + CERROR("%s: cannot create /sys/fs/lustre/%s/%s/target_obds\n", + obd->obd_name, obd->obd_type->typ_name, obd->obd_name); } rc = lmv_check_connect(obd); if (rc != 0) - GOTO(out_proc, rc); + GOTO(out_sysfs, rc); *pexp = exp; RETURN(rc); -out_proc: - if (lmv->targets_proc_entry != NULL) - lprocfs_remove(&lmv->targets_proc_entry); +out_sysfs: + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); class_disconnect(exp); @@ -271,10 +265,12 @@ static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, for (i = 0; i < lmv->desc.ld_tgt_count; i++) { struct lmv_tgt_desc *tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { + if (tgt == NULL || tgt->ltd_exp == NULL) { CWARN("%s: NULL export for %d\n", obd->obd_name, i); continue; } + if (!tgt->ltd_active) + continue; rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); if (rc) { @@ -363,23 +359,11 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, atomic_read(&obd->obd_refcount)); - if (lmv->targets_proc_entry != NULL) { - struct proc_dir_entry *mdc_symlink; - - LASSERT(mdc_obd->obd_type != NULL); - LASSERT(mdc_obd->obd_type->typ_name != NULL); - mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, - lmv->targets_proc_entry, - "../../../%s/%s", - mdc_obd->obd_type->typ_name, - mdc_obd->obd_name); - if (mdc_symlink == NULL) { - CERROR("cannot register LMV target " - "/proc/fs/lustre/%s/%s/target_obds/%s\n", - obd->obd_type->typ_name, obd->obd_name, - mdc_obd->obd_name); - } - } + if (lmv->lmv_tgts_kobj) + /* Even if we failed to create the link, that's fine */ + rc = sysfs_create_link(lmv->lmv_tgts_kobj, + &mdc_obd->obd_kset.kobj, + mdc_obd->obd_name); RETURN(0); } @@ -415,7 +399,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, mutex_lock(&lmv->lmv_init_mutex); if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { tgt = lmv->tgts[index]; - CERROR("%s: UUID %s already assigned at LOV target index %d:" + CERROR("%s: UUID %s already assigned at LMV target index %d:" " rc = %d\n", obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); mutex_unlock(&lmv->lmv_init_mutex); @@ -584,9 +568,9 @@ static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_obd->obd_fail = obd->obd_fail; mdc_obd->obd_no_recov = obd->obd_no_recov; - if (lmv->targets_proc_entry != NULL) - lprocfs_remove_proc_entry(mdc_obd->obd_name, - lmv->targets_proc_entry); + if (lmv->lmv_tgts_kobj) + sysfs_remove_link(lmv->lmv_tgts_kobj, + mdc_obd->obd_name); } rc = obd_fid_fini(tgt->ltd_exp->exp_obd); @@ -629,11 +613,8 
@@ static int lmv_disconnect(struct obd_export *exp) lmv_disconnect_mdc(obd, lmv->tgts[i]); } - if (lmv->targets_proc_entry != NULL) - lprocfs_remove(&lmv->targets_proc_entry); - else - CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", - obd->obd_type->typ_name, obd->obd_name); + if (lmv->lmv_tgts_kobj) + kobject_put(lmv->lmv_tgts_kobj); out_local: /* @@ -681,8 +662,8 @@ static int lmv_fid2path(struct obd_export *exp, int len, void *karg, int len; ori_gf = (struct getinfo_fid2path *)karg; - if (strlen(ori_gf->gf_u.gf_path) + - strlen(gf->gf_u.gf_path) > ori_gf->gf_pathlen) + if (strlen(ori_gf->gf_u.gf_path) + 1 + + strlen(gf->gf_u.gf_path) + 1 > ori_gf->gf_pathlen) GOTO(out_fid2path, rc = -EOVERFLOW); ptr = ori_gf->gf_u.gf_path; @@ -819,23 +800,42 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, void __user *uarg) { struct lmv_obd *lmv = &obd->u.lmv; - struct file *filp; - __u32 i, j; - int err; - bool any_set = false; - struct kkuc_ct_data kcd = { - .kcd_magic = KKUC_CT_DATA_MAGIC, - .kcd_archive = lk->lk_data, - }; - int rc = 0; + struct file *filp; + __u32 i, j; + int err; + bool any_set = false; + struct kkuc_ct_data *kcd; + size_t kcd_size; + int rc = 0; ENTRY; filp = fget(lk->lk_wfd); if (!filp) RETURN(-EBADF); + if (lk->lk_flags & LK_FLG_DATANR) + kcd_size = offsetof(struct kkuc_ct_data, + kcd_archives[lk->lk_data_count]); + else + kcd_size = sizeof(*kcd); + + OBD_ALLOC(kcd, kcd_size); + if (kcd == NULL) + GOTO(err_fput, rc = -ENOMEM); + + kcd->kcd_nr_archives = lk->lk_data_count; + if (lk->lk_flags & LK_FLG_DATANR) { + kcd->kcd_magic = KKUC_CT_DATA_ARRAY_MAGIC; + if (lk->lk_data_count > 0) + memcpy(kcd->kcd_archives, lk->lk_data, + sizeof(*kcd->kcd_archives) * lk->lk_data_count); + } else { + kcd->kcd_magic = KKUC_CT_DATA_BITMAP_MAGIC; + } + rc = libcfs_kkuc_group_add(filp, &obd->obd_uuid, lk->lk_uid, - lk->lk_group, &kcd, sizeof(kcd)); + lk->lk_group, kcd, kcd_size); + OBD_FREE(kcd, kcd_size); if (rc) GOTO(err_fput, rc); @@ -934,7 +934,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(-EFAULT); rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, 0); if (rc) RETURN(rc); @@ -1175,7 +1175,7 @@ static int lmv_placement_policy(struct obd_device *obd, * 1. See if the stripe offset is specified by lum. * 2. Then check if there is default stripe offset. * 3. Finally choose MDS by name hash if the parent - * is striped directory. (see lmv_locate_mds()). */ + * is striped directory. (see lmv_locate_tgt()). 
*/ if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { *mds = le32_to_cpu(lum->lum_stripe_offset); @@ -1287,16 +1287,11 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) spin_lock_init(&lmv->lmv_lock); mutex_init(&lmv->lmv_init_mutex); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_lmv_obd_vars; - lprocfs_obd_setup(obd); - lprocfs_alloc_md_stats(obd, 0); - rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", - 0444, &lmv_proc_target_fops, obd); + rc = lmv_tunables_init(obd); if (rc) - CWARN("%s: error adding LMV target_obd file: rc = %d\n", + CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", obd->obd_name, rc); -#endif + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, LUSTRE_CLI_FLD_HASH_DHT); if (rc) { @@ -1361,49 +1356,88 @@ static int lmv_process_config(struct obd_device *obd, size_t len, void *buf) RETURN(rc); } +static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) +{ + int i; + + if (flags & OBD_STATFS_FOR_MDT0) + return 0; + + if (lmv->lmv_statfs_start || lmv->desc.ld_tgt_count == 1) + return lmv->lmv_statfs_start; + + /* choose initial MDT for this client */ + for (i = 0;; i++) { + struct lnet_process_id lnet_id; + if (LNetGetId(i, &lnet_id) == -ENOENT) + break; + + if (lnet_id.nid != LNET_NID_LO_0) { + /* We dont need a full 64-bit modulus, just enough + * to distribute the requests across MDTs evenly. + */ + lmv->lmv_statfs_start = + (u32)lnet_id.nid % lmv->desc.ld_tgt_count; + break; + } + } + + return lmv->lmv_statfs_start; +} + static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) + struct obd_statfs *osfs, time64_t max_age, __u32 flags) { struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; struct obd_statfs *temp; int rc = 0; - __u32 i; + __u32 i, idx; ENTRY; OBD_ALLOC(temp, sizeof(*temp)); if (temp == NULL) RETURN(-ENOMEM); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + /* distribute statfs among MDTs */ + idx = lmv_select_statfs_mdt(lmv, flags); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++, idx++) { + idx = idx % lmv->desc.ld_tgt_count; + if (lmv->tgts[idx] == NULL || lmv->tgts[idx]->ltd_exp == NULL) continue; - rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, + rc = obd_statfs(env, lmv->tgts[idx]->ltd_exp, temp, max_age, flags); if (rc) { - CERROR("can't stat MDS #%d (%s), error %d\n", i, - lmv->tgts[i]->ltd_exp->exp_obd->obd_name, + CERROR("%s: can't stat MDS #%d: rc = %d\n", + lmv->tgts[idx]->ltd_exp->exp_obd->obd_name, i, rc); GOTO(out_free_temp, rc); } + if (temp->os_state & OS_STATE_SUM || + flags == OBD_STATFS_FOR_MDT0) { + /* reset to the last aggregated values + * and don't sum with non-aggrated data */ + /* If the statfs is from mount, it needs to retrieve + * necessary information from MDT0. i.e. mount does + * not need the merged osfs from all of MDT. Also + * clients can be mounted as long as MDT0 is in + * service */ + *osfs = *temp; + break; + } + if (i == 0) { *osfs = *temp; - /* If the statfs is from mount, it will needs - * retrieve necessary information from MDT0. - * i.e. mount does not need the merged osfs - * from all of MDT. 
- * And also clients can be mounted as long as - * MDT0 is in service*/ - if (flags & OBD_STATFS_FOR_MDT0) - GOTO(out_free_temp, rc); - } else { - osfs->os_bavail += temp->os_bavail; - osfs->os_blocks += temp->os_blocks; - osfs->os_ffree += temp->os_ffree; - osfs->os_files += temp->os_files; - } + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + osfs->os_granted += temp->os_granted; + } } EXIT; @@ -1425,9 +1459,8 @@ static int lmv_get_root(struct obd_export *exp, const char *fileset, } static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - const char *input, int input_size, int output_size, - int flags, struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1439,17 +1472,16 @@ static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_getxattr(tgt->ltd_exp, fid, valid, name, input, - input_size, output_size, flags, request); + rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req); RETURN(rc); } static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *name, - const char *input, int input_size, int output_size, - int flags, __u32 suppgid, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1461,9 +1493,8 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_setxattr(tgt->ltd_exp, fid, valid, name, input, - input_size, output_size, flags, suppgid, - request); + rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, + value, value_size, xattr_flags, suppgid, req); RETURN(rc); } @@ -1532,81 +1563,93 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -/** - * Choosing the MDT by name or FID in @op_data. - * For non-striped directory, it will locate MDT by fid. - * For striped-directory, it will locate MDT by name. And also - * it will reset op_fid1 with the FID of the choosen stripe. 
- **/ -struct lmv_tgt_desc * -lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, - u32 *mds) +struct lmv_tgt_desc* +__lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, u32 *mds, + bool post_migrate) { - struct lmv_tgt_desc *tgt; - const struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; + const struct lmv_oinfo *oinfo; + + if (lsm == NULL || namelen == 0) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + LASSERT(mds); + *mds = tgt->ltd_idx; + return tgt; + } if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { if (cfs_fail_val >= lsm->lsm_md_stripe_count) - RETURN(ERR_PTR(-EBADF)); + return ERR_PTR(-EBADF); oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; } else { - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, + post_migrate); if (IS_ERR(oinfo)) - RETURN(ERR_CAST(oinfo)); + return ERR_CAST(oinfo); } if (fid != NULL) *fid = oinfo->lmo_fid; if (mds != NULL) *mds = oinfo->lmo_mds; + /* check stripe FID is sane */ + if (!fid_is_sane(&oinfo->lmo_fid)) + return ERR_PTR(-ENODEV); tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, PFID(&oinfo->lmo_fid)); + return tgt; } + /** - * Locate mds by fid or name + * Locate mdt by fid or name * - * For striped directory (lsm != NULL), it will locate the stripe - * by name hash (see lsm_name_to_stripe_info()). Note: if the hash_type - * is unknown, it will return -EBADFD, and lmv_intent_lookup might need - * walk through all of stripes to locate the entry. + * For striped directory, it will locate the stripe by name hash, if hash_type + * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' + * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' + * indicates whether old or new layout is used to locate. * * For normal direcotry, it will locate MDS by FID directly. - * \param[in] lmv LMV device - * \param[in] op_data client MD stack parameters, name, namelen - * mds_num etc. - * \param[in] fid object FID used to locate MDS. + * + * \param[in] lmv LMV device + * \param[in/out] op_data client MD stack parameters, name, namelen etc, + * op_mds and op_fid1 will be updated if op_mea1 + * indicates fid1 represents a striped directory. + * \param[out] fid object FID used to locate MDS. * * retval pointer to the lmv_tgt_desc if succeed. * ERR_PTR(errno) if failed. */ struct lmv_tgt_desc* -lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, struct lu_fid *fid) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; /* During creating VOLATILE file, it should honor the mdt * index if the file under striped dir is being restored, see * ct_restore(). 
*/ if (op_data->op_bias & MDS_CREATE_VOLATILE && (int)op_data->op_mds != -1) { - int i; tgt = lmv_get_target(lmv, op_data->op_mds, NULL); if (IS_ERR(tgt)) return tgt; - if (lsm != NULL) { + if (lsm) { + int i; + /* refill the right parent fid */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - struct lmv_oinfo *oinfo; - oinfo = &lsm->lsm_md_oinfo[i]; if (oinfo->lmo_mds == op_data->op_mds) { *fid = oinfo->lmo_fid; @@ -1617,22 +1660,21 @@ lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, if (i == lsm->lsm_md_stripe_count) *fid = lsm->lsm_md_oinfo[0].lmo_fid; } + } else if (lmv_is_dir_bad_hash(lsm)) { + LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); + oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; - return tgt; - } - - if (lsm == NULL || op_data->op_namelen == 0) { - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return tgt; - - op_data->op_mds = tgt->ltd_idx; - return tgt; + *fid = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + } else { + tgt = __lmv_locate_tgt(lmv, lsm, op_data->op_name, + op_data->op_namelen, fid, + &op_data->op_mds, + op_data->op_post_migrate); } - return lmv_locate_target_for_name(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds); + return tgt; } int lmv_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1649,7 +1691,33 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (!lmv->desc.ld_active_tgt_count) RETURN(-EIO); - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (lmv_is_dir_bad_hash(op_data->op_mea1)) + RETURN(-EBADF); + + if (lmv_is_dir_migrating(op_data->op_mea1)) { + /* + * if parent is migrating, create() needs to lookup existing + * name, to avoid creating new file under old layout of + * migrating directory, check old layout here. 
+ */ + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + if (!rc) { + ptlrpc_req_finished(*request); + *request = NULL; + RETURN(-EEXIST); + } + + if (rc != -ENOENT) + RETURN(rc); + + op_data->op_post_migrate = true; + } + + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1660,6 +1728,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc) RETURN(rc); + if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { /* Send the create request to the MDT where the object * will be located */ @@ -1699,7 +1768,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + tgt = lmv_find_target(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1712,19 +1781,20 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, RETURN(rc); } -static int +int lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, struct ptlrpc_request **preq) { - struct ptlrpc_request *req = NULL; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + ENTRY; - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); +retry: + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1733,31 +1803,28 @@ lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, PFID(&op_data->op_fid1), tgt->ltd_idx); rc = md_getattr_name(tgt->ltd_exp, op_data, preq); - if (rc != 0) + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*preq); + *preq = NULL; + goto retry; + } + + if (rc) RETURN(rc); body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); LASSERT(body != NULL); if (body->mbo_valid & OBD_MD_MDS) { - struct lu_fid rid = body->mbo_fid1; - CDEBUG(D_INODE, "Request attrs for "DFID"\n", - PFID(&rid)); - - tgt = lmv_find_target(lmv, &rid); - if (IS_ERR(tgt)) { - ptlrpc_req_finished(*preq); - preq = NULL; - RETURN(PTR_ERR(tgt)); - } - - op_data->op_fid1 = rid; + op_data->op_fid1 = body->mbo_fid1; op_data->op_valid |= OBD_MD_FLCROSSREF; op_data->op_namelen = 0; op_data->op_name = NULL; - rc = md_getattr_name(tgt->ltd_exp, op_data, &req); + ptlrpc_req_finished(*preq); - *preq = req; + *preq = NULL; + + goto retry; } RETURN(rc); @@ -1827,19 +1894,40 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (op_data->op_mea2 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea2; - const struct lmv_oinfo *oinfo; - oinfo = lsm_name_to_stripe_info(lsm, op_data->op_name, - op_data->op_namelen); - if (IS_ERR(oinfo)) - RETURN(PTR_ERR(oinfo)); + if (lmv_is_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; - op_data->op_fid2 = oinfo->lmo_fid; + /* + * avoid creating new file under old layout of migrating + * directory, check it here. 
+ */ + tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, false); + tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(*request); + *request = NULL; + RETURN(-EEXIST); + } + + if (rc != -ENOENT) + RETURN(rc); } - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1857,158 +1945,323 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen, - struct ptlrpc_request **request) +static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, + const char *name, size_t namelen, + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *src_tgt; - struct lmv_tgt_desc *tgt_tgt; - struct obd_export *target_exp; - struct mdt_body *body; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *parent_tgt; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *child_tgt; + struct lmv_tgt_desc *tgt; + struct lu_fid target_fid; + int rc; + ENTRY; - LASSERT(oldlen != 0); + LASSERT(op_data->op_cli_flags & CLI_MIGRATE); - CDEBUG(D_INODE, "RENAME %.*s in "DFID":%d to %.*s in "DFID":%d\n", - (int)oldlen, old, PFID(&op_data->op_fid1), - op_data->op_mea1 ? op_data->op_mea1->lsm_md_stripe_count : 0, - (int)newlen, new, PFID(&op_data->op_fid2), - op_data->op_mea2 ? 
op_data->op_mea2->lsm_md_stripe_count : 0); + CDEBUG(D_INODE, "MIGRATE "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)namelen, name); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (op_data->op_cli_flags & CLI_MIGRATE) { - LASSERTF(fid_is_sane(&op_data->op_fid3), "invalid FID "DFID"\n", - PFID(&op_data->op_fid3)); - - if (op_data->op_mea1 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tmp; - - /* Fix the parent fid for striped dir */ - tmp = lmv_locate_target_for_name(lmv, lsm, old, - oldlen, - &op_data->op_fid1, - NULL); - if (IS_ERR(tmp)) - RETURN(PTR_ERR(tmp)); - } - - rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc != 0) - RETURN(rc); - src_tgt = lmv_find_target(lmv, &op_data->op_fid3); - if (IS_ERR(src_tgt)) - RETURN(PTR_ERR(src_tgt)); + parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); - target_exp = src_tgt->ltd_exp; - } else { - if (op_data->op_mea1 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea1; + if (lsm) { + __u32 hash_type = lsm->lsm_md_hash_type; + __u32 stripe_count = lsm->lsm_md_stripe_count; - src_tgt = lmv_locate_target_for_name(lmv, lsm, old, - oldlen, - &op_data->op_fid1, - &op_data->op_mds); - } else { - src_tgt = lmv_find_target(lmv, &op_data->op_fid1); + /* + * old stripes are appended after new stripes for migrating + * directory. + */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + hash_type = lsm->lsm_md_migrate_hash; + stripe_count -= lsm->lsm_md_migrate_offset; } - if (IS_ERR(src_tgt)) - RETURN(PTR_ERR(src_tgt)); + rc = lmv_name_to_stripe_index(hash_type, stripe_count, name, + namelen); + if (rc < 0) + RETURN(rc); - if (op_data->op_mea2 != NULL) { - struct lmv_stripe_md *lsm = op_data->op_mea2; + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) + rc += lsm->lsm_md_migrate_offset; - tgt_tgt = lmv_locate_target_for_name(lmv, lsm, new, - newlen, - &op_data->op_fid2, - &op_data->op_mds); - } else { - tgt_tgt = lmv_find_target(lmv, &op_data->op_fid2); + /* save it in fid4 temporarily for early cancel */ + op_data->op_fid4 = lsm->lsm_md_oinfo[rc].lmo_fid; + sp_tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[rc].lmo_mds, + NULL); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + /* + * if parent is being migrated too, fill op_fid2 with target + * stripe fid, otherwise the target stripe is not created yet. + */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + hash_type = lsm->lsm_md_hash_type & + ~LMV_HASH_FLAG_MIGRATION; + stripe_count = lsm->lsm_md_migrate_offset; + + rc = lmv_name_to_stripe_index(hash_type, stripe_count, + name, namelen); + if (rc < 0) + RETURN(rc); + + op_data->op_fid2 = lsm->lsm_md_oinfo[rc].lmo_fid; + tp_tgt = lmv_get_target(lmv, + lsm->lsm_md_oinfo[rc].lmo_mds, + NULL); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); } - if (IS_ERR(tgt_tgt)) - RETURN(PTR_ERR(tgt_tgt)); - - target_exp = tgt_tgt->ltd_exp; + } else { + sp_tgt = parent_tgt; } - /* - * LOOKUP lock on src child (fid3) should also be cancelled for - * src_tgt in mdc_rename. - */ - op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - - /* - * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its - * own target. 
- */ - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_UPDATE, - MF_MDC_CANCEL_FID2); + child_tgt = lmv_find_target(lmv, &op_data->op_fid3); + if (IS_ERR(child_tgt)) + RETURN(PTR_ERR(child_tgt)); - if (rc != 0) + if (!S_ISDIR(op_data->op_mode) && tp_tgt) + rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx); + else + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); + if (rc) RETURN(rc); + /* - * Cancel LOOKUP locks on source child (fid3) for parent tgt_tgt. + * for directory, send migrate request to the MDT where the object will + * be migrated to, because we can't create a striped directory remotely. + * + * otherwise, send to the MDT where source is located because regular + * file may open lease. + * + * NB. if MDT doesn't support DIR_MIGRATE, send to source MDT too for + * backward compatibility. */ - if (fid_is_sane(&op_data->op_fid3)) { - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (S_ISDIR(op_data->op_mode) && + (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) { + tgt = lmv_find_target(lmv, &target_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); + } else { + tgt = child_tgt; + } - /* Cancel LOOKUP lock on its parent */ - rc = lmv_early_cancel(exp, tgt, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID3); - if (rc != 0) + /* cancel UPDATE lock of parent master object */ + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc) + RETURN(rc); + + /* cancel UPDATE lock of source parent */ + if (sp_tgt != parent_tgt) { + /* + * migrate RPC packs master object FID, because we can only pack + * two FIDs in reint RPC, but MDS needs to know both source + * parent and target parent, and it will obtain them from master + * FID and LMV, the other FID in RPC is kept for target. + * + * since this FID is not passed to MDC, cancel it anyway. + */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, -1, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID4); + if (rc) RETURN(rc); - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_FULL, + op_data->op_flags &= ~MF_MDC_CANCEL_FID4; + } + op_data->op_fid4 = target_fid; + + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc) + RETURN(rc); + + /* cancel LOOKUP lock of source if source is remote object */ + if (child_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); - if (rc != 0) + if (rc) RETURN(rc); } -retry_rename: - /* - * Cancel all the locks on tgt child (fid4). 
- */ - if (fid_is_sane(&op_data->op_fid4)) { - struct lmv_tgt_desc *tgt; + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) + RETURN(rc); - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_FULL, - MF_MDC_CANCEL_FID4); - if (rc != 0) + rc = md_rename(tgt->ltd_exp, op_data, name, namelen, NULL, 0, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, size_t oldlen, + const char *new, size_t newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *sp_tgt; + struct lmv_tgt_desc *tp_tgt = NULL; + struct lmv_tgt_desc *src_tgt = NULL; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + + ENTRY; + + LASSERT(oldlen != 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) { + rc = lmv_migrate(exp, op_data, old, oldlen, request); + RETURN(rc); + } + + op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); + op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); + op_data->op_cap = cfs_curproc_cap_pack(); + + if (lmv_is_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + + /* + * we avoid creating new file under old layout of migrating + * directory, if there is an existing file with new name under + * old layout, we can't unlink file in old layout and rename to + * new layout in one transaction, so return -EBUSY here.` + */ + tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, + &op_data->op_fid2, &op_data->op_mds, + false); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + op_data->op_name = new; + op_data->op_namelen = newlen; + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + op_data->op_name = NULL; + op_data->op_namelen = 0; + if (!rc) { + ptlrpc_req_finished(*request); + *request = NULL; + RETURN(-EBUSY); + } + + if (rc != -ENOENT) RETURN(rc); + } + + /* rename to new layout for migrating directory */ + tp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, + &op_data->op_fid2, &op_data->op_mds, true); + if (IS_ERR(tp_tgt)) + RETURN(PTR_ERR(tp_tgt)); + /* Since the target child might be destroyed, and it might become + * orphan, and we can only check orphan on the local MDT right now, so + * we send rename request to the MDT where target child is located. If + * target child does not exist, then it will send the request to the + * target parent */ + if (fid_is_sane(&op_data->op_fid4)) { tgt = lmv_find_target(lmv, &op_data->op_fid4); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); + } else { + tgt = tp_tgt; + } + + op_data->op_flags |= MF_MDC_CANCEL_FID4; - /* Since the target child might be destroyed, and it might - * become orphan, and we can only check orphan on the local - * MDT right now, so we send rename request to the MDT where - * target child is located. 
If target child does not exist, - * then it will send the request to the target parent */ - target_exp = tgt->ltd_exp; + /* cancel UPDATE locks of target parent */ + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } } - rc = md_rename(target_exp, op_data, old, oldlen, new, newlen, - request); + if (fid_is_sane(&op_data->op_fid3)) { + src_tgt = lmv_find_target(lmv, &op_data->op_fid3); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); - if (rc != 0 && rc != -EXDEV) + /* cancel ELC locks of source */ + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_ELC, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + +retry: + sp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea1, old, oldlen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_post_migrate); + if (IS_ERR(sp_tgt)) + RETURN(PTR_ERR(sp_tgt)); + + /* cancel UPDATE locks of source parent */ + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + if (fid_is_sane(&op_data->op_fid3)) { + /* cancel LOOKUP lock of source on source parent */ + if (src_tgt != sp_tgt) { + rc = lmv_early_cancel(exp, sp_tgt, op_data, + tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + if (rc != 0) + RETURN(rc); + } + } + +rename: + CDEBUG(D_INODE, "RENAME "DFID"/%.*s to "DFID"/%.*s\n", + PFID(&op_data->op_fid1), (int)oldlen, old, + PFID(&op_data->op_fid2), (int)newlen, new); + + rc = md_rename(tgt->ltd_exp, op_data, old, oldlen, new, newlen, + request); + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { + ptlrpc_req_finished(*request); + *request = NULL; + goto retry; + } + + if (rc && rc != -EXDEV) RETURN(rc); body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); @@ -2019,13 +2272,28 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, if (likely(!(body->mbo_valid & OBD_MD_MDS))) RETURN(rc); - CDEBUG(D_INODE, "%s: try rename to another MDT for "DFID"\n", - exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); - op_data->op_fid4 = body->mbo_fid1; + ptlrpc_req_finished(*request); *request = NULL; - goto retry_rename; + + tgt = lmv_find_target(lmv, &op_data->op_fid4); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (fid_is_sane(&op_data->op_fid4)) { + /* cancel LOOKUP lock of target on target parent */ + if (tgt != tp_tgt) { + rc = lmv_early_cancel(exp, tp_tgt, op_data, + tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + if (rc != 0) + RETURN(rc); + } + } + + goto rename; } static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, @@ -2037,8 +2305,9 @@ static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, int rc = 0; ENTRY; - CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n", - PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid, + op_data->op_xvalid); op_data->op_flags |= MF_MDC_CANCEL_FID1; tgt = lmv_find_target(lmv, &op_data->op_fid1); @@ -2067,146 +2336,228 @@ static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } -/** - * 
Get current minimum entry from striped directory - * - * This function will search the dir entry, whose hash value is the - * closest(>=) to @hash_offset, from all of sub-stripes, and it is - * only being called for striped directory. - * - * \param[in] exp export of LMV - * \param[in] op_data parameters transferred beween client MD stack - * stripe_information will be included in this - * parameter - * \param[in] cb_op ldlm callback being used in enqueue in - * mdc_read_page - * \param[in] hash_offset the hash value, which is used to locate - * minum(closet) dir entry - * \param[in|out] stripe_offset the caller use this to indicate the stripe - * index of last entry, so to avoid hash conflict - * between stripes. It will also be used to - * return the stripe index of current dir entry. - * \param[in|out] entp the minum entry and it also is being used - * to input the last dir entry to resolve the - * hash conflict - * - * \param[out] ppage the page which holds the minum entry - * - * \retval = 0 get the entry successfully - * negative errno (< 0) does not get the entry - */ -static int lmv_get_min_striped_entry(struct obd_export *exp, - struct md_op_data *op_data, - struct md_callback *cb_op, - __u64 hash_offset, int *stripe_offset, - struct lu_dirent **entp, - struct page **ppage) +struct stripe_dirent { + struct page *sd_page; + struct lu_dirpage *sd_dp; + struct lu_dirent *sd_ent; + bool sd_eof; +}; + +struct lmv_dir_ctxt { + struct lmv_obd *ldc_lmv; + struct md_op_data *ldc_op_data; + struct md_callback *ldc_cb_op; + __u64 ldc_hash; + int ldc_count; + struct stripe_dirent ldc_stripes[0]; +}; + +static inline void stripe_dirent_unload(struct stripe_dirent *stripe) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; - int stripe_count; - struct lu_dirent *min_ent = NULL; - struct page *min_page = NULL; - int min_idx = 0; - int i; - int rc = 0; - ENTRY; + if (stripe->sd_page) { + kunmap(stripe->sd_page); + put_page(stripe->sd_page); + stripe->sd_page = NULL; + stripe->sd_ent = NULL; + } +} - stripe_count = lsm->lsm_md_stripe_count; - for (i = 0; i < stripe_count; i++) { - struct lu_dirent *ent = NULL; - struct page *page = NULL; - struct lu_dirpage *dp; - __u64 stripe_hash = hash_offset; +static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt) +{ + int i; - tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); - if (IS_ERR(tgt)) - GOTO(out, rc = PTR_ERR(tgt)); - - /* op_data will be shared by each stripe, so we need - * reset these value for each stripe */ - op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid; - op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid; - op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root; -next: - rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash, - &page); - if (rc != 0) - GOTO(out, rc); + for (i = 0; i < ctxt->ldc_count; i++) + stripe_dirent_unload(&ctxt->ldc_stripes[i]); +} - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent != NULL; - ent = lu_dirent_next(ent)) { - /* Skip dummy entry */ - if (le16_to_cpu(ent->lde_namelen) == 0) - continue; +/* if @ent is dummy, or . .., get next */ +static struct lu_dirent *stripe_dirent_get(struct lmv_dir_ctxt *ctxt, + struct lu_dirent *ent, + int stripe_index) +{ + for (; ent; ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (le16_to_cpu(ent->lde_namelen) == 0) + continue; - if (le64_to_cpu(ent->lde_hash) < hash_offset) - continue; + /* skip . and .. 
for other stripes */ + if (stripe_index && + (strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) == 0 || + strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) == 0)) + continue; - if (le64_to_cpu(ent->lde_hash) == hash_offset && - (*entp == ent || i < *stripe_offset)) - continue; + if (le64_to_cpu(ent->lde_hash) >= ctxt->ldc_hash) + break; + } - /* skip . and .. for other stripes */ - if (i != 0 && - (strncmp(ent->lde_name, ".", - le16_to_cpu(ent->lde_namelen)) == 0 || - strncmp(ent->lde_name, "..", - le16_to_cpu(ent->lde_namelen)) == 0)) - continue; + return ent; +} + +static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, + struct stripe_dirent *stripe, + int stripe_index) +{ + struct md_op_data *op_data = ctxt->ldc_op_data; + struct lmv_oinfo *oinfo; + struct lu_fid fid = op_data->op_fid1; + struct inode *inode = op_data->op_data; + struct lmv_tgt_desc *tgt; + struct lu_dirent *ent = stripe->sd_ent; + __u64 hash = ctxt->ldc_hash; + int rc = 0; + + ENTRY; + + LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]); + LASSERT(!ent); + + do { + if (stripe->sd_page) { + __u64 end = le64_to_cpu(stripe->sd_dp->ldp_hash_end); + + /* @hash should be the last dirent hash */ + LASSERTF(hash <= end, + "ctxt@%p stripe@%p hash %llx end %llx\n", + ctxt, stripe, hash, end); + /* unload last page */ + stripe_dirent_unload(stripe); + /* eof */ + if (end == MDS_DIR_END_OFF) { + stripe->sd_eof = true; + break; + } + hash = end; + } + + oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; + if (!oinfo->lmo_root) { + rc = -ENOENT; break; } - if (ent == NULL) { - stripe_hash = le64_to_cpu(dp->ldp_hash_end); + tgt = lmv_get_target(ctxt->ldc_lmv, oinfo->lmo_mds, NULL); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + break; + } + + /* op_data is shared by stripes, reset after use */ + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_fid2 = oinfo->lmo_fid; + op_data->op_data = oinfo->lmo_root; - kunmap(page); - put_page(page); - page = NULL; + rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_cb_op, hash, + &stripe->sd_page); + + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + op_data->op_data = inode; + + if (rc) + break; - /* reach the end of current stripe, go to next stripe */ - if (stripe_hash == MDS_DIR_END_OFF) + stripe->sd_dp = page_address(stripe->sd_page); + ent = stripe_dirent_get(ctxt, lu_dirent_start(stripe->sd_dp), + stripe_index); + /* in case a page filled with ., .. and dummy, read next */ + } while (!ent); + + stripe->sd_ent = ent; + if (rc) { + LASSERT(!ent); + /* treat error as eof, so dir can be partially accessed */ + stripe->sd_eof = true; + LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, " + "directory is partially accessed!\n", + PFID(&ctxt->ldc_op_data->op_fid1), stripe_index, + rc); + } + + RETURN(ent); +} + +static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc != 0) + RETURN(rc); + + tgt = lmv_find_target(lmv, &data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_file_resync(tgt->ltd_exp, data); + RETURN(rc); +} + +/** + * Get dirent with the closest hash for striped directory + * + * This function will search the dir entry, whose hash value is the + * closest(>=) to hash from all of sub-stripes, and it is only being called + * for striped directory. 
+ * + * \param[in] ctxt dir read context + * + * \retval dirent get the entry successfully + * NULL does not get the entry, normally it means + * it reaches the end of the directory, while read + * stripe dirent error is ignored to allow partial + * access. + */ +static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt) +{ + struct stripe_dirent *stripe; + struct lu_dirent *ent = NULL; + int i; + int min = -1; + + /* TODO: optimize with k-way merge sort */ + for (i = 0; i < ctxt->ldc_count; i++) { + stripe = &ctxt->ldc_stripes[i]; + if (stripe->sd_eof) + continue; + + if (!stripe->sd_ent) { + stripe_dirent_load(ctxt, stripe, i); + if (!stripe->sd_ent) { + LASSERT(stripe->sd_eof); continue; - else - goto next; + } } - if (min_ent != NULL) { - if (le64_to_cpu(min_ent->lde_hash) > - le64_to_cpu(ent->lde_hash)) { - min_ent = ent; - kunmap(min_page); - put_page(min_page); - min_idx = i; - min_page = page; - } else { - kunmap(page); - put_page(page); - page = NULL; - } - } else { - min_ent = ent; - min_page = page; - min_idx = i; + if (min == -1 || + le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) > + le64_to_cpu(stripe->sd_ent->lde_hash)) { + min = i; + if (le64_to_cpu(stripe->sd_ent->lde_hash) == + ctxt->ldc_hash) + break; } } -out: - if (*ppage != NULL) { - kunmap(*ppage); - put_page(*ppage); + if (min != -1) { + stripe = &ctxt->ldc_stripes[min]; + ent = stripe->sd_ent; + /* pop found dirent */ + stripe->sd_ent = stripe_dirent_get(ctxt, lu_dirent_next(ent), + min); } - *stripe_offset = min_idx; - *entp = min_ent; - *ppage = min_page; - RETURN(rc); + + return ent; } /** - * Build dir entry page from a striped directory + * Build dir entry page for striped directory * * This function gets one entry by @offset from a striped directory. It will * read entries from all of stripes, and choose one closest to the required @@ -2215,12 +2566,11 @@ static int lmv_get_min_striped_entry(struct obd_export *exp, * and .. in a directory. * 2. op_data will be shared by all of stripes, instead of allocating new * one, so need to restore before reusing. - * 3. release the entry page if that is not being chosen. * * \param[in] exp obd export refer to LMV * \param[in] op_data hold those MD parameters of read_entry * \param[in] cb_op ldlm callback being used in enqueue in mdc_read_entry - * \param[out] ldp the entry being read + * \param[in] offset starting hash offset * \param[out] ppage the page holding the entry. 
Note: because the entry * will be accessed in upper layer, so we need hold the * page until the usages of entry is finished, see @@ -2229,124 +2579,117 @@ static int lmv_get_min_striped_entry(struct obd_export *exp, * retval =0 if get entry successfully * <0 cannot get entry */ -static int lmv_read_striped_page(struct obd_export *exp, +static int lmv_striped_read_page(struct obd_export *exp, struct md_op_data *op_data, struct md_callback *cb_op, __u64 offset, struct page **ppage) { - struct lu_fid master_fid = op_data->op_fid1; - struct inode *master_inode = op_data->op_data; - __u64 hash_offset = offset; - struct lu_dirpage *dp; - struct page *min_ent_page = NULL; - struct page *ent_page = NULL; - struct lu_dirent *ent; - void *area; - int ent_idx = 0; - struct lu_dirent *min_ent = NULL; - struct lu_dirent *last_ent; - size_t left_bytes; - int rc; + struct page *page = NULL; + struct lu_dirpage *dp; + void *start; + struct lu_dirent *ent; + struct lu_dirent *last_ent; + int stripe_count; + struct lmv_dir_ctxt *ctxt; + struct lu_dirent *next = NULL; + __u16 ent_size; + size_t left_bytes; + int rc = 0; ENTRY; /* Allocate a page and read entries from all of stripes and fill * the page by hash order */ - ent_page = alloc_page(GFP_KERNEL); - if (ent_page == NULL) + page = alloc_page(GFP_KERNEL); + if (!page) RETURN(-ENOMEM); /* Initialize the entry page */ - dp = kmap(ent_page); + dp = kmap(page); memset(dp, 0, sizeof(*dp)); dp->ldp_hash_start = cpu_to_le64(offset); - dp->ldp_flags |= LDF_COLLIDE; - area = dp + 1; + start = dp + 1; left_bytes = PAGE_SIZE - sizeof(*dp); - ent = area; + ent = start; last_ent = ent; - do { - __u16 ent_size; - /* Find the minum entry from all sub-stripes */ - rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset, - &ent_idx, &min_ent, - &min_ent_page); - if (rc != 0) - GOTO(out, rc); - - /* If it can not get minum entry, it means it already reaches - * the end of this directory */ - if (min_ent == NULL) { - last_ent->lde_reclen = 0; - hash_offset = MDS_DIR_END_OFF; - GOTO(out, rc); + /* initalize dir read context */ + stripe_count = op_data->op_mea1->lsm_md_stripe_count; + OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); + if (!ctxt) + GOTO(free_page, rc = -ENOMEM); + ctxt->ldc_lmv = &exp->exp_obd->u.lmv; + ctxt->ldc_op_data = op_data; + ctxt->ldc_cb_op = cb_op; + ctxt->ldc_hash = offset; + ctxt->ldc_count = stripe_count; + + while (1) { + next = lmv_dirent_next(ctxt); + + /* end of directory */ + if (!next) { + ctxt->ldc_hash = MDS_DIR_END_OFF; + break; } + ctxt->ldc_hash = le64_to_cpu(next->lde_hash); - ent_size = le16_to_cpu(min_ent->lde_reclen); + ent_size = le16_to_cpu(next->lde_reclen); - /* the last entry lde_reclen is 0, but it might not - * the end of this entry of this temporay entry */ - if (ent_size == 0) + /* the last entry lde_reclen is 0, but it might not be the last + * one of this temporay dir page */ + if (!ent_size) ent_size = lu_dirent_calc_size( - le16_to_cpu(min_ent->lde_namelen), - le32_to_cpu(min_ent->lde_attrs)); - if (ent_size > left_bytes) { - last_ent->lde_reclen = cpu_to_le16(0); - hash_offset = le64_to_cpu(min_ent->lde_hash); - GOTO(out, rc); - } + le16_to_cpu(next->lde_namelen), + le32_to_cpu(next->lde_attrs)); + /* page full */ + if (ent_size > left_bytes) + break; - memcpy(ent, min_ent, ent_size); + memcpy(ent, next, ent_size); /* Replace . with master FID and Replace .. 
with the parent FID * of master object */ if (strncmp(ent->lde_name, ".", le16_to_cpu(ent->lde_namelen)) == 0 && le16_to_cpu(ent->lde_namelen) == 1) - fid_cpu_to_le(&ent->lde_fid, &master_fid); + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1); else if (strncmp(ent->lde_name, "..", le16_to_cpu(ent->lde_namelen)) == 0 && le16_to_cpu(ent->lde_namelen) == 2) fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); + CDEBUG(D_INODE, "entry %.*s hash %#llx\n", + le16_to_cpu(ent->lde_namelen), ent->lde_name, + le64_to_cpu(ent->lde_hash)); + left_bytes -= ent_size; ent->lde_reclen = cpu_to_le16(ent_size); last_ent = ent; ent = (void *)ent + ent_size; - hash_offset = le64_to_cpu(min_ent->lde_hash); - if (hash_offset == MDS_DIR_END_OFF) { - last_ent->lde_reclen = 0; - break; - } - } while (1); -out: - if (min_ent_page != NULL) { - kunmap(min_ent_page); - put_page(min_ent_page); - } + }; - if (unlikely(rc != 0)) { - __free_page(ent_page); - ent_page = NULL; - } else { - if (ent == area) - dp->ldp_flags |= LDF_EMPTY; - dp->ldp_flags = cpu_to_le32(dp->ldp_flags); - dp->ldp_hash_end = cpu_to_le64(hash_offset); - } + last_ent->lde_reclen = 0; - /* We do not want to allocate md_op_data during each - * dir entry reading, so op_data will be shared by every stripe, - * then we need to restore it back to original value before - * return to the upper layer */ - op_data->op_fid1 = master_fid; - op_data->op_fid2 = master_fid; - op_data->op_data = master_inode; + if (ent == start) + dp->ldp_flags |= LDF_EMPTY; + else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash)) + dp->ldp_flags |= LDF_COLLIDE; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash); - *ppage = ent_page; + put_lmv_dir_ctxt(ctxt); + OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count])); - RETURN(rc); + *ppage = page; + + RETURN(0); + +free_page: + kunmap(page); + __free_page(page); + + return rc; } int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, @@ -2361,7 +2704,7 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, ENTRY; if (unlikely(lsm != NULL)) { - rc = lmv_read_striped_page(exp, op_data, cb_op, offset, ppage); + rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); RETURN(rc); } @@ -2399,68 +2742,34 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, * negative errno if failed. 
*/ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - struct lmv_tgt_desc *parent_tgt = NULL; - struct mdt_body *body; - int rc; - int stripe_index = 0; - struct lmv_stripe_md *lsm = op_data->op_mea1; - ENTRY; - -retry_unlink: - /* For striped dir, we need to locate the parent as well */ - if (lsm != NULL) { - struct lmv_tgt_desc *tmp; - - LASSERT(op_data->op_name != NULL && - op_data->op_namelen != 0); - - tmp = lmv_locate_target_for_name(lmv, lsm, - op_data->op_name, - op_data->op_namelen, - &op_data->op_fid1, - &op_data->op_mds); - - /* return -EBADFD means unknown hash type, might - * need try all sub-stripe here */ - if (IS_ERR(tmp) && PTR_ERR(tmp) != -EBADFD) - RETURN(PTR_ERR(tmp)); - - /* Note: both migrating dir and unknown hash dir need to - * try all of sub-stripes, so we need start search the - * name from stripe 0, but migrating dir is already handled - * inside lmv_locate_target_for_name(), so we only check - * unknown hash type directory here */ - if (!lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { - struct lmv_oinfo *oinfo; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - } - } - -try_next_stripe: - /* Send unlink requests to the MDT where the child is located */ - if (likely(!fid_is_zero(&op_data->op_fid2))) - tgt = lmv_find_target(lmv, &op_data->op_fid2); - else if (lsm != NULL) - tgt = lmv_get_target(lmv, op_data->op_mds, NULL); - else - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *parent_tgt; + struct mdt_body *body; + int rc; - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + ENTRY; op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); +retry: + parent_tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + RETURN(PTR_ERR(parent_tgt)); + + if (likely(!fid_is_zero(&op_data->op_fid2))) { + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + } else { + tgt = parent_tgt; + } + /* * If child's fid is given, cancel unused locks for it if it is from * another export than parent. @@ -2470,50 +2779,29 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, */ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - /* - * Cancel FULL locks on child (fid3). - */ - parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(parent_tgt)) - RETURN(PTR_ERR(parent_tgt)); - - if (parent_tgt != tgt) { + if (parent_tgt != tgt) rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); - } rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); - if (rc != 0) + MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); + if (rc) RETURN(rc); CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); rc = md_unlink(tgt->ltd_exp, op_data, request); - if (rc != 0 && rc != -EREMOTE && rc != -ENOENT) - RETURN(rc); - - /* Try next stripe if it is needed. 
*/ - if (rc == -ENOENT && lsm != NULL && lmv_need_try_all_stripes(lsm)) { - struct lmv_oinfo *oinfo; - - stripe_index++; - if (stripe_index >= lsm->lsm_md_stripe_count) - RETURN(rc); - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - + if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { ptlrpc_req_finished(*request); *request = NULL; - - goto try_next_stripe; + goto retry; } + if (rc != -EREMOTE) + RETURN(rc); + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); if (body == NULL) RETURN(-EPROTO); @@ -2522,40 +2810,23 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, if (likely(!(body->mbo_valid & OBD_MD_MDS))) RETURN(rc); - CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", - exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); - - /* This is a remote object, try remote MDT, Note: it may - * try more than 1 time here, Considering following case - * /mnt/lustre is root on MDT0, remote1 is on MDT1 - * 1. Initially A does not know where remote1 is, it send - * unlink RPC to MDT0, MDT0 return -EREMOTE, it will - * resend unlink RPC to MDT1 (retry 1st time). - * - * 2. During the unlink RPC in flight, - * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 - * and create new remote1, but on MDT0 - * - * 3. MDT1 get unlink RPC(from A), then do remote lock on - * /mnt/lustre, then lookup get fid of remote1, and find - * it is remote dir again, and replay -EREMOTE again. - * - * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). - * - * In theory, it might try unlimited time here, but it should - * be very rare case. */ + /* This is a remote object, try remote MDT. */ op_data->op_fid2 = body->mbo_fid1; ptlrpc_req_finished(*request); *request = NULL; - goto retry_unlink; + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + goto retry; } static int lmv_precleanup(struct obd_device *obd) { ENTRY; libcfs_kkuc_group_rem(&obd->obd_uuid, 0, KUC_GRP_HSM); - fld_client_proc_fini(&obd->u.lmv.lmv_fld); + fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); lprocfs_obd_cleanup(obd); lprocfs_free_md_stats(obd); RETURN(0); @@ -2631,6 +2902,96 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, RETURN(-EINVAL); } +static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, + int *__rcs, struct ptlrpc_request_set *_set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request_set *set = _set; + struct lmv_obd *lmv = &obddev->u.lmv; + int tgt_count = lmv->desc.ld_tgt_count; + struct fid_array *fat, **fas = NULL; + int i, rc, **rcs = NULL; + + if (!set) { + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + /* split FIDs by targets */ + OBD_ALLOC(fas, sizeof(fas) * tgt_count); + if (fas == NULL) + GOTO(out, rc = -ENOMEM); + OBD_ALLOC(rcs, sizeof(int *) * tgt_count); + if (rcs == NULL) + GOTO(out_fas, rc = -ENOMEM); + + for (i = 0; i < fa->fa_nr; i++) { + unsigned int idx; + + rc = lmv_fld_lookup(lmv, &fa->fa_fids[i], &idx); + if (rc) { + CDEBUG(D_OTHER, "can't lookup "DFID": rc = %d\n", + PFID(&fa->fa_fids[i]), rc); + continue; + } + LASSERT(idx < tgt_count); + if (!fas[idx]) + OBD_ALLOC(fas[idx], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (!fas[idx]) + GOTO(out, rc = -ENOMEM); + if (!rcs[idx]) + OBD_ALLOC(rcs[idx], sizeof(int) * fa->fa_nr); + if (!rcs[idx]) + GOTO(out, rc = -ENOMEM); + + fat = fas[idx]; + fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; + } + + for (i 
= 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + rc = md_rmfid(lmv->tgts[i]->ltd_exp, fat, rcs[i], set); + } + + rc = ptlrpc_set_wait(NULL, set); + if (rc == 0) { + int j = 0; + for (i = 0; i < tgt_count; i++) { + fat = fas[i]; + if (!fat || fat->fa_nr == 0) + continue; + /* copy FIDs back */ + memcpy(fa->fa_fids + j, fat->fa_fids, + fat->fa_nr * sizeof(struct lu_fid)); + /* copy rcs back */ + memcpy(__rcs + j, rcs[i], fat->fa_nr * sizeof(**rcs)); + j += fat->fa_nr; + } + } + if (set != _set) + ptlrpc_set_destroy(set); + +out: + for (i = 0; i < tgt_count; i++) { + if (fas && fas[i]) + OBD_FREE(fas[i], offsetof(struct fid_array, + fa_fids[fa->fa_nr])); + if (rcs && rcs[i]) + OBD_FREE(rcs[i], sizeof(int) * fa->fa_nr); + } + if (rcs) + OBD_FREE(rcs, sizeof(int *) * tgt_count); +out_fas: + if (fas) + OBD_FREE(fas, sizeof(fas) * tgt_count); + + RETURN(rc); +} + /** * Asynchronously set by key a value associated with a LMV device. * @@ -2705,13 +3066,15 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, else lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + lsm->lsm_md_migrate_offset = le32_to_cpu(lmm1->lmv_migrate_offset); + lsm->lsm_md_migrate_hash = le32_to_cpu(lmm1->lmv_migrate_hash); cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, sizeof(lsm->lsm_md_pool_name)); if (cplen >= sizeof(lsm->lsm_md_pool_name)) RETURN(-E2BIG); - CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %d" + CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %#x " "layout_version %d\n", lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, lsm->lsm_md_layout_version); @@ -2720,10 +3083,22 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, for (i = 0; i < stripe_count; i++) { fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, &lmm1->lmv_stripe_fids[i]); + /* + * set default value -1, so lmv_locate_tgt() knows this stripe + * target is not initialized. 
+ */ + lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1; + if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) + continue; + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, &lsm->lsm_md_oinfo[i].lmo_mds); - if (rc != 0) + if (rc == -ENOENT) + continue; + + if (rc) RETURN(rc); + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); } @@ -2746,12 +3121,9 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, /* Free memmd */ if (lsm != NULL && lmm == NULL) { int i; + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - /* For migrating inode, the master stripe and master - * object will be the same, so do not need iput, see - * ll_update_lsm_md */ - if (!(lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && - i == 0) && lsm->lsm_md_oinfo[i].lmo_root != NULL) + if (lsm->lsm_md_oinfo[i].lmo_root) iput(lsm->lsm_md_oinfo[i].lmo_root); } lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); @@ -2963,35 +3335,34 @@ int lmv_clear_open_replay_data(struct obd_export *exp, int lmv_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo) { - struct md_op_data *op_data = &minfo->mi_data; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *ptgt = NULL; - struct lmv_tgt_desc *ctgt = NULL; - int rc; + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *ptgt; + struct lmv_tgt_desc *ctgt; + int rc; + ENTRY; if (!fid_is_sane(&op_data->op_fid2)) RETURN(-EINVAL); - ptgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + ptgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); if (IS_ERR(ptgt)) RETURN(PTR_ERR(ptgt)); - ctgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + ctgt = lmv_find_target(lmv, &op_data->op_fid2); if (IS_ERR(ctgt)) RETURN(PTR_ERR(ctgt)); - /* - * if child is on remote MDT, we need 2 async RPCs to fetch both LOOKUP - * lock on parent, and UPDATE lock on child MDT, which makes all - * complicated. Considering remote dir is rare case, and not supporting - * it in statahead won't cause any issue, drop its support for now. + /* remote object needs two RPCs to lookup and getattr, considering the + * complexity, don't support statahead for now. 
*/ if (ptgt != ctgt) - RETURN(-ENOTSUPP); + RETURN(-EREMOTE); rc = md_intent_getattr_async(ptgt->ltd_exp, minfo); + RETURN(rc); } @@ -3019,7 +3390,7 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, const struct lmv_oinfo *oinfo; LASSERT(lsm != NULL); - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); if (IS_ERR(oinfo)) return PTR_ERR(oinfo); @@ -3096,6 +3467,9 @@ static int lmv_merge_attr(struct obd_export *exp, for (i = 0; i < lsm->lsm_md_stripe_count; i++) { struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + if (!inode) + continue; + CDEBUG(D_INFO, "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", PFID(&lsm->lsm_md_oinfo[i].lmo_fid), @@ -3156,6 +3530,7 @@ struct md_ops lmv_md_ops = { .m_setattr = lmv_setattr, .m_setxattr = lmv_setxattr, .m_fsync = lmv_fsync, + .m_file_resync = lmv_file_resync, .m_read_page = lmv_read_page, .m_unlink = lmv_unlink, .m_init_ea_size = lmv_init_ea_size, @@ -3171,6 +3546,7 @@ struct md_ops lmv_md_ops = { .m_revalidate_lock = lmv_revalidate_lock, .m_get_fid_from_lsm = lmv_get_fid_from_lsm, .m_unpackmd = lmv_unpackmd, + .m_rmfid = lmv_rmfid, }; static int __init lmv_init(void) diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c index 37c22a92de716..dc35e7d9d9e66 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -23,7 +23,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,63 +32,58 @@ #define DEBUG_SUBSYSTEM S_CLASS -#include #include -#include +#include #include #include #include "lmv_internal.h" -#ifndef CONFIG_PROC_FS -static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; -static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; -#else -static int lmv_numobd_seq_show(struct seq_file *m, void *v) +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lmv_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_desc *desc; - LASSERT(dev != NULL); desc = &dev->u.lmv.desc; - seq_printf(m, "%u\n", desc->ld_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_tgt_count); } -LPROC_SEQ_FOPS_RO(lmv_numobd); +LUSTRE_RO_ATTR(numobd); -static int lmv_activeobd_seq_show(struct seq_file *m, void *v) +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lmv_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_desc *desc; - LASSERT(dev != NULL); desc = &dev->u.lmv.desc; - seq_printf(m, "%u\n", desc->ld_active_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_active_tgt_count); } -LPROC_SEQ_FOPS_RO(lmv_activeobd); +LUSTRE_RO_ATTR(activeobd); -static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device*)m->private; - struct lmv_obd *lmv; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_desc 
*desc; - LASSERT(dev != NULL); - lmv = &dev->u.lmv; - seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); - return 0; + desc = &dev->u.lmv.desc; + return sprintf(buf, "%s\n", desc->ld_uuid.uuid); } -LPROC_SEQ_FOPS_RO(lmv_desc_uuid); +LUSTRE_RO_ATTR(desc_uuid); +#ifdef CONFIG_PROC_FS static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; struct lmv_obd *lmv = &dev->u.lmv; while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos] != NULL) + if (lmv->tgts[*pos]) return lmv->tgts[*pos]; - ++*pos; } @@ -97,7 +92,6 @@ static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) static void lmv_tgt_seq_stop(struct seq_file *p, void *v) { - return; } static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) @@ -107,9 +101,8 @@ static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) ++*pos; while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos] != NULL) + if (lmv->tgts[*pos]) return lmv->tgts[*pos]; - ++*pos; } @@ -120,10 +113,12 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v) { struct lmv_tgt_desc *tgt = v; - if (tgt == NULL) + if (!tgt) return 0; - seq_printf(p, "%u: %s %sACTIVE\n", tgt->ltd_idx, - tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN"); + + seq_printf(p, "%u: %s %sACTIVE\n", + tgt->ltd_idx, tgt->ltd_uuid.uuid, + tgt->ltd_active ? "" : "IN"); return 0; } @@ -148,21 +143,7 @@ static int lmv_target_seq_open(struct inode *inode, struct file *file) return 0; } -LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid); - -struct lprocfs_vars lprocfs_lmv_obd_vars[] = { - { .name = "numobd", - .fops = &lmv_numobd_fops }, - { .name = "activeobd", - .fops = &lmv_activeobd_fops }, - { .name = "uuid", - .fops = &lmv_uuid_fops }, - { .name = "desc_uuid", - .fops = &lmv_desc_uuid_fops }, - { NULL } -}; - -const struct proc_ops lmv_proc_target_fops = { +static const struct proc_ops lmv_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lmv_target_seq_open, .proc_read = seq_read, @@ -170,3 +151,39 @@ const struct proc_ops lmv_proc_target_fops = { .proc_release = seq_release, }; #endif /* CONFIG_PROC_FS */ + +static struct attribute *lmv_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_numobd.attr, + NULL, +}; + +int lmv_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = lmv_attrs; + rc = lprocfs_obd_setup(obd, true); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } + + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) { + lprocfs_free_md_stats(obd); + lprocfs_obd_cleanup(obd); + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + rc = 0; + } +#endif /* CONFIG_PROC_FS */ +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/lov/Makefile b/drivers/staging/lustrefsx/lustre/lov/Makefile index e74389ed4c3e3..dae11b1647cbe 100644 --- a/drivers/staging/lustrefsx/lustre/lov/Makefile +++ b/drivers/staging/lustrefsx/lustre/lov/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_LUSTREFSX_FS) += lov.o lov-y := lov_dev.o lov_ea.o lov_io.o lov_lock.o lov_merge.o lov_obd.o lov-y += lov_object.o lov_offset.o lov_pack.o lov_page.o lov_pool.o -lov-y += lov_request.o lovsub_dev.o lovsub_lock.o lovsub_object.o -lov-y += lovsub_page.o lproc_lov.o +lov-y += lov_request.o lovsub_dev.o lovsub_object.o +lov-y += lproc_lov.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff 
--git a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h index 0e84ab38e189a..62ee46daed68f 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_cl_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -81,7 +81,6 @@ struct lovsub_device; struct lovsub_object; -struct lovsub_lock; enum lov_device_flags { LOV_DEV_INITIALIZED = 1 << 0 @@ -91,6 +90,12 @@ enum lov_device_flags { * Upper half. */ +/* Data-on-MDT array item in lov_device::ld_md_tgts[] */ +struct lovdom_device { + struct cl_device *ldm_mdc; + int ldm_idx; +}; + struct lov_device { /* * XXX Locking of lov-private data is missing. @@ -101,6 +106,13 @@ struct lov_device { __u32 ld_target_nr; struct lovsub_device **ld_target; __u32 ld_flags; + + /* Data-on-MDT devices */ + __u32 ld_md_tgts_nr; + struct lovdom_device *ld_md_tgts; + struct obd_device *ld_lmv; + /* LU site for subdevices */ + struct lu_site ld_site; }; /** @@ -129,15 +141,48 @@ static inline char *llt2str(enum lov_layout_type llt) return ""; } +/** + * Return lov_layout_entry_type associated with a given composite layout + * entry. + */ +static inline __u32 lov_entry_type(struct lov_stripe_md_entry *lsme) +{ + if ((lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_RAID0) || + (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT)) + return lov_pattern(lsme->lsme_pattern); + return 0; +} + +struct lov_layout_entry; +struct lov_object; +struct lov_lock_sub; + +struct lov_comp_layout_entry_ops { + int (*lco_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle); + void (*lco_fini)(const struct lu_env *env, + struct lov_layout_entry *lle); + int (*lco_getattr)(const struct lu_env *env, struct lov_object *obj, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **attr); +}; + struct lov_layout_raid0 { unsigned lo_nr; + /** + * record the stripe no before the truncate size, used for setting OST + * object size for truncate. LU-14128. + */ + int lo_trunc_stripeno; /** * When this is true, lov_object::lo_attr contains * valid up to date attributes for a top-level * object. This field is reset to 0 when attributes of * any sub-object change. */ - int lo_attr_valid; + bool lo_attr_valid; /** * Array of sub-objects. Allocated when top-object is * created (lov_init_raid0()). @@ -165,6 +210,38 @@ struct lov_layout_raid0 { struct cl_attr lo_attr; }; +struct lov_layout_dom { + /* keep this always at first place so DOM layout entry + * can be addressed also as RAID0 after initialization. 
+ */ + struct lov_layout_raid0 lo_dom_r0; + struct lovsub_object *lo_dom; + struct lov_oinfo *lo_loi; +}; + +struct lov_layout_entry { + __u32 lle_type; + unsigned int lle_valid:1; + struct lu_extent *lle_extent; + struct lov_stripe_md_entry *lle_lsme; + struct lov_comp_layout_entry_ops *lle_comp_ops; + union { + struct lov_layout_raid0 lle_raid0; + struct lov_layout_dom lle_dom; + }; +}; + +struct lov_mirror_entry { + unsigned short lre_mirror_id; + unsigned short lre_preferred:1, + lre_stale:1, /* set if any components is stale */ + lre_valid:1; /* set if at least one of components + * in this mirror is valid */ + unsigned short lre_start; /* index to lo_entries, start index of + * this mirror */ + unsigned short lre_end; /* end index of this mirror */ +}; + /** * lov-specific file state. * @@ -180,7 +257,7 @@ struct lov_layout_raid0 { * function corresponding to the current layout type. */ struct lov_object { - struct cl_object lo_cl; + struct cl_object lo_cl; /** * Serializes object operations with transitions between layout types. * @@ -220,13 +297,37 @@ struct lov_object { } released; struct lov_layout_composite { /** - * Current valid entry count of lo_entries. + * flags of lov_comp_md_v1::lcm_flags. Mainly used + * by FLR. + */ + uint32_t lo_flags; + /** + * For FLR: index of preferred mirror to read. + * Preferred mirror is initialized by the preferred + * bit of lsme. It can be changed when the preferred + * is inaccessible. + * In order to make lov_lsm_entry() return the same + * mirror in the same IO context, it's only possible + * to change the preferred mirror when the + * lo_active_ios reaches zero. + */ + int lo_preferred_mirror; + /** + * For FLR: the lock to protect access to + * lo_preferred_mirror. */ - unsigned int lo_entry_count; - struct lov_layout_entry { - struct lu_extent lle_extent; - struct lov_layout_raid0 lle_raid0; - } *lo_entries; + spinlock_t lo_write_lock; + /** + * For FLR: Number of (valid) mirrors. + */ + unsigned lo_mirror_count; + struct lov_mirror_entry *lo_mirrors; + /** + * Current entry count of lo_entries, include + * invalid entries. 
+ */ + unsigned int lo_entry_count; + struct lov_layout_entry *lo_entries; } composite; } u; /** @@ -236,11 +337,80 @@ struct lov_object { struct task_struct *lo_owner; }; -#define lov_foreach_layout_entry(lov, entry) \ - for (entry = &lov->u.composite.lo_entries[0]; \ - entry < &lov->u.composite.lo_entries \ - [lov->u.composite.lo_entry_count]; \ - entry++) +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i].lle_raid0; +} + +static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_lsm != NULL); + LASSERT(i < lov->lo_lsm->lsm_entry_count); + + return lov->lo_lsm->lsm_entries[i]; +} + +static inline unsigned lov_flr_state(const struct lov_object *lov) +{ + if (lov->lo_type != LLT_COMP) + return LCM_FL_NONE; + + return lov->u.composite.lo_flags & LCM_FL_FLR_MASK; +} + +static inline bool lov_is_flr(const struct lov_object *lov) +{ + return lov_flr_state(lov) != LCM_FL_NONE; +} + +static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i) +{ + LASSERT(lov->lo_type == LLT_COMP); + LASSERTF(i < lov->u.composite.lo_entry_count, + "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); + + return &lov->u.composite.lo_entries[i]; +} + +#define lov_for_layout_entry(lov, entry, start, end) \ + for (entry = lov_entry(lov, start); \ + entry <= lov_entry(lov, end); entry++) + +#define lov_foreach_layout_entry(lov, entry) \ + lov_for_layout_entry(lov, entry, 0, \ + (lov)->u.composite.lo_entry_count - 1) + +#define lov_foreach_mirror_layout_entry(lov, entry, lre) \ + lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end) + +static inline struct lov_mirror_entry * +lov_mirror_entry(struct lov_object *lov, int i) +{ + LASSERT(i < lov->u.composite.lo_mirror_count); + return &lov->u.composite.lo_mirrors[i]; +} + +#define lov_foreach_mirror_entry(lov, lre) \ + for (lre = lov_mirror_entry(lov, 0); \ + lre <= lov_mirror_entry(lov, \ + lov->u.composite.lo_mirror_count - 1); \ + lre++) + +static inline unsigned +lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry) +{ + struct lov_layout_entry *first = &lov->u.composite.lo_entries[0]; + unsigned index = (unsigned)(entry - first); + + LASSERT(entry >= first); + LASSERT(index < lov->u.composite.lo_entry_count); + + return index; +} /** * State lov_lock keeps for each sub-lock. @@ -270,6 +440,8 @@ struct lov_page { struct cl_page_slice lps_cl; /** layout_entry + stripe index, composed using lov_comp_index() */ unsigned int lps_index; + /* the layout gen when this page was created */ + __u32 lps_layout_gen; }; /* @@ -288,13 +460,6 @@ struct lovsub_object { int lso_index; }; -/** - * Lock state at lovsub layer. - */ -struct lovsub_lock { - struct cl_lock_slice lss_cl; -}; - /** * Describe the environment settings for sublocks. */ @@ -303,11 +468,6 @@ struct lov_sublock_env { struct cl_io *lse_io; }; -struct lovsub_page { - struct cl_page_slice lsb_cl; -}; - - struct lov_thread_info { struct cl_object_conf lti_stripe_conf; struct lu_fid lti_fid; @@ -356,6 +516,26 @@ struct lov_io_sub { struct lov_io { /** super-class */ struct cl_io_slice lis_cl; + + /** + * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true. + * + * The mirror index of this io. Preserved over cl_io_init() + * if io->ci_ndelay_tried is greater than zero. 
+ */ + int lis_mirror_index; + /** + * FLR: the layout gen when lis_mirror_index was cached. The + * mirror index makes sense only when the layout gen doesn't + * change. + */ + int lis_mirror_layout_gen; + + /** + * fields below this will be initialized in lov_io_init(). + */ + unsigned lis_preserved; + /** * Pointer to the object slice. This is a duplicate of * lov_io::lis_cl::cis_object. @@ -398,6 +578,7 @@ struct lov_io { * All sub-io's created in this lov_io. */ struct list_head lis_subios; + }; struct lov_session { @@ -416,7 +597,6 @@ extern struct kmem_cache *lov_object_kmem; extern struct kmem_cache *lov_thread_kmem; extern struct kmem_cache *lov_session_kmem; -extern struct kmem_cache *lovsub_lock_kmem; extern struct kmem_cache *lovsub_object_kmem; int lov_object_init (const struct lu_env *env, struct lu_object *obj, @@ -427,8 +607,6 @@ int lov_lock_init (const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, const struct cl_io *io); int lov_io_init (const struct lu_env *env, struct cl_object *obj, struct cl_io *io); -int lovsub_lock_init (const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, const struct cl_io *io); @@ -446,8 +624,6 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, int lov_page_init (const struct lu_env *env, struct cl_object *ob, struct cl_page *page, pgoff_t index); -int lovsub_page_init (const struct lu_env *env, struct cl_object *ob, - struct cl_page *page, pgoff_t index); int lov_page_init_empty (const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index); int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, @@ -461,11 +637,27 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env, struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); int lov_page_stripe(const struct cl_page *page); +bool lov_page_is_empty(const struct cl_page *page); int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset); +int lov_io_layout_at(struct lov_io *lio, __u64 offset); #define lov_foreach_target(lov, var) \ for (var = 0; var < lov_targets_nr(lov); ++var) +static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i) +{ + return &lov_lse(io->lis_object, i)->lsme_extent; +} + +/** + * For layout entries within @ext. + */ +#define lov_foreach_io_layout(ind, lio, ext) \ + for (ind = lov_io_layout_at(lio, (ext)->e_start); \ + ind >= 0 && \ + lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \ + ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end)) + /***************************************************************************** * * Type conversions. 
@@ -575,22 +767,6 @@ static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) return container_of0(obj, struct lovsub_object, lso_cl.co_lu); } -static inline struct lovsub_lock * -cl2lovsub_lock(const struct cl_lock_slice *slice) -{ - LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); - return container_of(slice, struct lovsub_lock, lss_cl); -} - -static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) -{ - const struct cl_lock_slice *slice; - - slice = cl_lock_at(lock, &lovsub_device_type); - LASSERT(slice != NULL); - return cl2lovsub_lock(slice); -} - static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) { LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); @@ -603,13 +779,6 @@ static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) return container_of0(slice, struct lov_page, lps_cl); } -static inline struct lovsub_page * -cl2lovsub_page(const struct cl_page_slice *slice) -{ - LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct lovsub_page, lsb_cl); -} - static inline struct lov_io *cl2lov_io(const struct lu_env *env, const struct cl_io_slice *ios) { @@ -634,23 +803,6 @@ static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) return info; } -static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_type == LLT_COMP); - LASSERTF(i < lov->u.composite.lo_entry_count, - "entry %d entry_count %d", i, lov->u.composite.lo_entry_count); - - return &lov->u.composite.lo_entries[i].lle_raid0; -} - -static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i) -{ - LASSERT(lov->lo_lsm != NULL); - LASSERT(i < lov->lo_lsm->lsm_entry_count); - - return lov->lo_lsm->lsm_entries[i]; -} - /* lov_pack.c */ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, struct lov_stripe_md *lsm, struct lov_user_md __user *lump, diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c index 2506c39ec7296..1faef7ad76afa 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_dev.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,43 +46,37 @@ struct kmem_cache *lov_object_kmem; struct kmem_cache *lov_thread_kmem; struct kmem_cache *lov_session_kmem; -struct kmem_cache *lovsub_lock_kmem; struct kmem_cache *lovsub_object_kmem; struct lu_kmem_descr lov_caches[] = { - { - .ckd_cache = &lov_lock_kmem, - .ckd_name = "lov_lock_kmem", - .ckd_size = sizeof (struct lov_lock) - }, - { - .ckd_cache = &lov_object_kmem, - .ckd_name = "lov_object_kmem", - .ckd_size = sizeof (struct lov_object) - }, - { - .ckd_cache = &lov_thread_kmem, - .ckd_name = "lov_thread_kmem", - .ckd_size = sizeof (struct lov_thread_info) - }, - { - .ckd_cache = &lov_session_kmem, - .ckd_name = "lov_session_kmem", - .ckd_size = sizeof (struct lov_session) - }, - { - .ckd_cache = &lovsub_lock_kmem, - .ckd_name = "lovsub_lock_kmem", - .ckd_size = sizeof (struct lovsub_lock) - }, - { - .ckd_cache = &lovsub_object_kmem, - .ckd_name = "lovsub_object_kmem", - .ckd_size = sizeof (struct lovsub_object) - }, - { - .ckd_cache = NULL - } + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof(struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof(struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof(struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof(struct lov_session) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof(struct lovsub_object) + }, + { + .ckd_cache = NULL + } }; /***************************************************************************** @@ -97,7 +91,7 @@ static void *lov_key_init(const struct lu_context *ctx, struct lov_thread_info *info; OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS); - if (info == NULL) + if (!info) info = ERR_PTR(-ENOMEM); return info; } @@ -110,9 +104,9 @@ static void lov_key_fini(const struct lu_context *ctx, } struct lu_context_key lov_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = lov_key_init, - .lct_fini = lov_key_fini + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini }; static void *lov_session_key_init(const struct lu_context *ctx, @@ -121,113 +115,180 @@ static void *lov_session_key_init(const struct lu_context *ctx, struct lov_session *info; OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS); - if (info == NULL) + if (!info) info = ERR_PTR(-ENOMEM); return info; } static void lov_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) + struct lu_context_key *key, void *data) { - struct lov_session *info = data; - OBD_SLAB_FREE_PTR(info, lov_session_kmem); + struct lov_session *info = data; + + OBD_SLAB_FREE_PTR(info, lov_session_kmem); } struct lu_context_key lov_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = lov_session_key_init, - .lct_fini = lov_session_key_fini + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini }; /* type constructor/destructor: lov_type_{init,fini,start,stop}() */ LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static int lov_mdc_dev_init(const struct lu_env *env, struct lov_device *ld, + struct lu_device *mdc_dev, __u32 idx, __u32 nr) +{ + struct cl_device *cl; + + ENTRY; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + mdc_dev); + if (IS_ERR(cl)) + RETURN(PTR_ERR(cl)); + + ld->ld_md_tgts[nr].ldm_mdc = 
cl; + ld->ld_md_tgts[nr].ldm_idx = idx; + RETURN(0); +} + static struct lu_device *lov_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - int i; - struct lov_device *ld = lu2lov_dev(d); + struct lov_device *ld = lu2lov_dev(d); + int i; - LASSERT(ld->ld_lov != NULL); - if (ld->ld_target == NULL) - RETURN(NULL); + LASSERT(ld->ld_lov != NULL); - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; + if (ld->ld_lmv) { + class_decref(ld->ld_lmv, "lov", d); + ld->ld_lmv = NULL; + } - lsd = ld->ld_target[i]; - if (lsd != NULL) { - cl_stack_fini(env, lovsub2cl_dev(lsd)); - ld->ld_target[i] = NULL; - } - } - RETURN(NULL); + if (ld->ld_md_tgts) { + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (!ld->ld_md_tgts[i].ldm_mdc) + continue; + + cl_stack_fini(env, ld->ld_md_tgts[i].ldm_mdc); + ld->ld_md_tgts[i].ldm_mdc = NULL; + ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc = NULL; + } + } + + if (ld->ld_target) { + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + } + RETURN(NULL); } static int lov_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) + const char *name, struct lu_device *next) { - struct lov_device *ld = lu2lov_dev(d); - int i; - int rc = 0; - - LASSERT(d->ld_site != NULL); - if (ld->ld_target == NULL) - RETURN(rc); - - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; - struct cl_device *cl; - struct lov_tgt_desc *desc; - - desc = ld->ld_lov->lov_tgts[i]; - if (desc == NULL) - continue; - - cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, - desc->ltd_obd->obd_lu_dev); - if (IS_ERR(cl)) { - rc = PTR_ERR(cl); - break; - } - lsd = cl2lovsub_dev(cl); - ld->ld_target[i] = lsd; - } + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + /* check all added already MDC subdevices and initialize them */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + struct obd_device *mdc; + __u32 idx; + + mdc = ld->ld_lov->lov_mdc_tgts[i].lmtd_mdc; + idx = ld->ld_lov->lov_mdc_tgts[i].lmtd_index; + + if (!mdc) + continue; + + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, i); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + d->ld_obd->obd_name, + obd_uuid2str(&mdc->obd_uuid), rc); + GOTO(out_err, rc); + } + } + + if (!ld->ld_target) + RETURN(0); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (!desc) + continue; - if (rc) - lov_device_fini(env, d); - else - ld->ld_flags |= LOV_DEV_INITIALIZED; + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) + GOTO(out_err, rc = PTR_ERR(cl)); - RETURN(rc); + lsd = cl2lovsub_dev(cl); + ld->ld_target[i] = lsd; + } + ld->ld_flags |= LOV_DEV_INITIALIZED; + RETURN(0); + +out_err: + lu_device_fini(d); + RETURN(rc); } /* Free the lov specific data created for the back end lu_device. 
*/ static struct lu_device *lov_device_free(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { struct lov_device *ld = lu2lov_dev(d); const int nr = ld->ld_target_nr; + lu_site_fini(&ld->ld_site); + cl_device_fini(lu2cl_dev(d)); - if (ld->ld_target != NULL) + if (ld->ld_target) { OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); + ld->ld_target = NULL; + } + if (ld->ld_md_tgts) { + OBD_FREE(ld->ld_md_tgts, + sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; + } + /* free array of MDCs */ + if (ld->ld_lov->lov_mdc_tgts) { + OBD_FREE(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; + } OBD_FREE_PTR(ld); return NULL; } static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) + __u32 index) { - struct lov_device *ld = lu2lov_dev(dev); - ENTRY; + struct lov_device *ld = lu2lov_dev(dev); - if (ld->ld_target[index] != NULL) { - cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); - ld->ld_target[index] = NULL; - } - EXIT; + ENTRY; + + if (ld->ld_target[index]) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; } static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) @@ -245,7 +306,7 @@ static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) const size_t sz = sizeof(newd[0]); OBD_ALLOC(newd, tgt_size * sz); - if (newd != NULL) { + if (newd) { if (sub_size > 0) { memcpy(newd, dev->ld_target, sub_size * sz); OBD_FREE(dev->ld_target, sub_size * sz); @@ -262,32 +323,31 @@ static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) } static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) + __u32 index) { - struct obd_device *obd = dev->ld_obd; - struct lov_device *ld = lu2lov_dev(dev); - struct lov_tgt_desc *tgt; - struct lovsub_device *lsd; - struct cl_device *cl; - int rc; - ENTRY; - - obd_getref(obd); - - tgt = obd->u.lov.lov_tgts[index]; - LASSERT(tgt != NULL); - LASSERT(tgt->ltd_obd != NULL); - - if (!tgt->ltd_obd->obd_set_up) { - CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); - RETURN(-EINVAL); - } + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + + ENTRY; + + lov_tgts_getref(obd); - rc = lov_expand_targets(env, ld); - if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { - LASSERT(dev->ld_site != NULL); + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); - cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + cl = cl_type_setup(env, &ld->ld_site, &lovsub_device_type, tgt->ltd_obd->obd_lu_dev); if (!IS_ERR(cl)) { lsd = cl2lovsub_dev(cl); @@ -299,94 +359,239 @@ static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, rc = PTR_ERR(cl); } } - obd_putref(obd); - RETURN(rc); + + lov_tgts_putref(obd); + + RETURN(rc); +} + +/** + * Add new MDC target device in LOV. + * + * This function is part of the configuration log processing. It adds new MDC + * device to the MDC device array indexed by their indexes. 
+ * + * \param[in] env execution environment + * \param[in] d LU device of LOV device + * \param[in] mdc MDC device to add + * \param[in] idx MDC device index + * + * \retval 0 if successful + * \retval negative value on error + */ +static int lov_add_mdc_target(const struct lu_env *env, struct lu_device *d, + struct obd_device *mdc, __u32 idx) +{ + struct lov_device *ld = lu2lov_dev(d); + struct obd_device *lov_obd = d->ld_obd; + struct obd_device *lmv_obd; + int next; + int rc = 0; + + ENTRY; + + LASSERT(mdc != NULL); + if (ld->ld_md_tgts_nr == LOV_MDC_TGT_MAX) { + /* + * If the maximum value of LOV_MDC_TGT_MAX will become too + * small then all MD target handling must be rewritten in LOD + * manner, check lod_add_device() and related functionality. + */ + CERROR("%s: cannot serve more than %d MDC devices\n", + lov_obd->obd_name, LOV_MDC_TGT_MAX); + RETURN(-ERANGE); + } + + /* + * grab FLD from lmv, do that here, when first MDC is added + * to be sure LMV is set up and can be found + */ + if (!ld->ld_lmv) { + next = 0; + while ((lmv_obd = class_devices_in_group(&lov_obd->obd_uuid, + &next)) != NULL) { + if ((strncmp(lmv_obd->obd_type->typ_name, + LUSTRE_LMV_NAME, + strlen(LUSTRE_LMV_NAME)) == 0)) + break; + } + if (!lmv_obd) { + CERROR("%s: cannot find LMV OBD by UUID (%s)\n", + lov_obd->obd_name, + obd_uuid2str(&lmv_obd->obd_uuid)); + RETURN(-ENODEV); + } + spin_lock(&lmv_obd->obd_dev_lock); + class_incref(lmv_obd, "lov", ld); + spin_unlock(&lmv_obd->obd_dev_lock); + ld->ld_lmv = lmv_obd; + } + + LASSERT(lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc == + NULL); + + if (ld->ld_flags & LOV_DEV_INITIALIZED) { + rc = lov_mdc_dev_init(env, ld, mdc->obd_lu_dev, idx, + ld->ld_md_tgts_nr); + if (rc) { + CERROR("%s: failed to add MDC %s as target: rc = %d\n", + lov_obd->obd_name, obd_uuid2str(&mdc->obd_uuid), + rc); + RETURN(rc); + } + } + + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_mdc = mdc; + lov_obd->u.lov.lov_mdc_tgts[ld->ld_md_tgts_nr].lmtd_index = idx; + ld->ld_md_tgts_nr++; + + RETURN(rc); } static int lov_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) + struct lu_device *d, struct lustre_cfg *cfg) { - struct obd_device *obd = d->ld_obd; - int cmd; - int rc; - int gen; - __u32 index; - - obd_getref(obd); - - cmd = cfg->lcfg_command; - rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); - if (rc == 0) { - switch(cmd) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - rc = lov_cl_add_target(env, d, index); - if (rc != 0) - lov_del_target(d->ld_obd, index, NULL, 0); - break; - case LCFG_LOV_DEL_OBD: - lov_cl_del_target(env, d, index); - break; - } - } - obd_putref(obd); - RETURN(rc); + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + u32 index; + + lov_tgts_getref(obd); + + cmd = cfg->lcfg_command; + + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc < 0) + GOTO(out, rc); + + switch (cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, NULL, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + case LCFG_ADD_MDC: + { + struct obd_device *mdc; + struct obd_uuid tgt_uuid; + + /* + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID + */ + if (LUSTRE_CFG_BUFLEN(cfg, 1) > sizeof(tgt_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&tgt_uuid, lustre_cfg_buf(cfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(cfg, 2), 
10, &index); + if (rc) + GOTO(out, rc); + + mdc = class_find_client_obd(&tgt_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc) + GOTO(out, rc = -ENODEV); + rc = lov_add_mdc_target(env, d, mdc, index); + break; + } + } +out: + lov_tgts_putref(obd); + RETURN(rc); } static const struct lu_device_operations lov_lu_ops = { - .ldo_object_alloc = lov_object_alloc, - .ldo_process_config = lov_process_config, + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, }; static struct lu_device *lov_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) + struct lu_device_type *t, + struct lustre_cfg *cfg) { - struct lu_device *d; - struct lov_device *ld; - struct obd_device *obd; - int rc; + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; - OBD_ALLOC_PTR(ld); - if (ld == NULL) - RETURN(ERR_PTR(-ENOMEM)); + OBD_ALLOC_PTR(ld); + if (!ld) + RETURN(ERR_PTR(-ENOMEM)); cl_device_init(&ld->ld_cl, t); d = lov2lu_dev(ld); d->ld_ops = &lov_lu_ops; - /* setup the LOV OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd != NULL); - rc = lov_setup(obd, cfg); - if (rc) { - lov_device_free(env, d); - RETURN(ERR_PTR(rc)); - } + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) + GOTO(out, rc); + + /* Alloc MDC devices array */ + /* XXX: need dynamic allocation at some moment */ + OBD_ALLOC(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + if (!ld->ld_md_tgts) + GOTO(out, rc = -ENOMEM); + + ld->ld_md_tgts_nr = 0; + + ld->ld_lov = &obd->u.lov; + OBD_ALLOC(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + if (!ld->ld_lov->lov_mdc_tgts) + GOTO(out_md_tgts, rc = -ENOMEM); + + rc = lu_site_init(&ld->ld_site, d); + if (rc != 0) + GOTO(out_mdc_tgts, rc); + + rc = lu_site_init_finish(&ld->ld_site); + if (rc != 0) + GOTO(out_site, rc); + + RETURN(d); +out_site: + lu_site_fini(&ld->ld_site); +out_mdc_tgts: + OBD_FREE(ld->ld_lov->lov_mdc_tgts, + sizeof(*ld->ld_lov->lov_mdc_tgts) * LOV_MDC_TGT_MAX); + ld->ld_lov->lov_mdc_tgts = NULL; +out_md_tgts: + OBD_FREE(ld->ld_md_tgts, sizeof(*ld->ld_md_tgts) * LOV_MDC_TGT_MAX); + ld->ld_md_tgts = NULL; +out: + OBD_FREE_PTR(ld); - ld->ld_lov = &obd->u.lov; - RETURN(d); + return ERR_PTR(rc); } static const struct lu_device_type_operations lov_device_type_ops = { - .ldto_init = lov_type_init, - .ldto_fini = lov_type_fini, + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, - .ldto_start = lov_type_start, - .ldto_stop = lov_type_stop, + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, - .ldto_device_alloc = lov_device_alloc, - .ldto_device_free = lov_device_free, + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, - .ldto_device_init = lov_device_init, - .ldto_device_fini = lov_device_fini + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini }; struct lu_device_type lov_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOV_NAME, - .ldt_ops = &lov_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD }; /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c index 5b50b0a9294dc..1d388637d0235 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_ea.c +++ 
b/drivers/staging/lustrefsx/lustre/lov/lov_ea.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,9 +41,6 @@ #include #include -#include -#include - #include "lov_internal.h" static inline void @@ -53,8 +50,10 @@ lu_extent_le_to_cpu(struct lu_extent *dst, const struct lu_extent *src) dst->e_end = le64_to_cpu(src->e_end); } -/* Find minimum stripe maxbytes value. For inactive or - * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. */ +/* + * Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. + */ static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) { struct obd_import *imp; @@ -64,11 +63,12 @@ static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) return maxbytes; imp = tgt->ltd_obd->u.cli.cl_import; - if (imp == NULL) + if (!imp) return maxbytes; spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL && + if ((imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE) && (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && imp->imp_connect_data.ocd_maxbytes > 0) maxbytes = imp->imp_connect_data.ocd_maxbytes; @@ -93,7 +93,8 @@ static int lsm_lmm_verify_v1v3(struct lov_mds_md *lmm, size_t lmm_size, return -EINVAL; } - if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { + if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { CERROR("bad striping pattern\n"); lov_dump_lmm_common(D_WARNING, lmm); return -EINVAL; @@ -184,7 +185,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, lsme_size = offsetof(typeof(*lsme), lsme_oinfo[stripe_count]); OBD_ALLOC_LARGE(lsme, lsme_size); - if (lsme == NULL) + if (!lsme) RETURN(ERR_PTR(-ENOMEM)); lsme->lsme_magic = magic; @@ -195,7 +196,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, lsme->lsme_stripe_count = le16_to_cpu(lmm->lmm_stripe_count); lsme->lsme_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); - if (pool_name != NULL) { + if (pool_name) { size_t pool_name_len; pool_name_len = strlcpy(lsme->lsme_pool_name, pool_name, @@ -204,12 +205,22 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, GOTO(out_lsme, rc = -E2BIG); } + /* with Data-on-MDT set maxbytes to stripe size */ + if (lsme_is_dom(lsme)) { + if (maxbytes) { + lov_bytes = lsme->lsme_stripe_size; + goto out_dom1; + } else { + goto out_dom2; + } + } + for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi; struct lov_tgt_desc *ltd; OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); - if (loi == NULL) + if (!loi) GOTO(out_lsme, rc = -ENOMEM); lsme->lsme_oinfo[i] = loi; @@ -230,7 +241,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, } ltd = lov->lov_tgts[loi->loi_ost_idx]; - if (ltd == NULL) { + if (!ltd) { CERROR("%s: OST index %d missing\n", (char*)lov->desc.ld_uuid.uuid, loi->loi_ost_idx); lov_dump_lmm_v1(D_WARNING, lmm); @@ -242,17 +253,21 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, min_stripe_maxbytes = lov_bytes; } - if (min_stripe_maxbytes == 0) - min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; + if (maxbytes) { + if (min_stripe_maxbytes == 0) + min_stripe_maxbytes = 
LUSTRE_EXT3_STRIPE_MAXBYTES; - lov_bytes = min_stripe_maxbytes * stripe_count; + if (stripe_count == 0) + stripe_count = lov->desc.ld_tgt_count; - if (maxbytes != NULL) { - if (lov_bytes < min_stripe_maxbytes) /* handle overflow */ - *maxbytes = MAX_LFS_FILESIZE; + if (min_stripe_maxbytes <= LLONG_MAX / stripe_count) + lov_bytes = min_stripe_maxbytes * stripe_count; else - *maxbytes = lov_bytes; + lov_bytes = MAX_LFS_FILESIZE; +out_dom1: + *maxbytes = min_t(loff_t, lov_bytes, MAX_LFS_FILESIZE); } +out_dom2: return lsme; @@ -260,7 +275,7 @@ lsme_unpack(struct lov_obd *lov, struct lov_mds_md *lmm, size_t buf_size, for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi = lsme->lsme_oinfo[i]; - if (loi != NULL) + if (loi) OBD_SLAB_FREE_PTR(lsme->lsme_oinfo[i], lov_oinfo_slab); } OBD_FREE_LARGE(lsme, lsme_size); @@ -293,7 +308,7 @@ lov_stripe_md *lsm_unpackmd_v1v3(struct lov_obd *lov, struct lov_mds_md *lmm, lsm_size = offsetof(typeof(*lsm), lsm_entries[1]); OBD_ALLOC(lsm, lsm_size); - if (lsm == NULL) + if (!lsm) GOTO(out_lsme, rc = -ENOMEM); atomic_set(&lsm->lsm_refc, 1); @@ -384,7 +399,8 @@ lsme_unpack_comp(struct lov_obd *lov, struct lov_mds_md *lmm, unsigned int stripe_count; stripe_count = le16_to_cpu(lmm->lmm_stripe_count); - if (stripe_count == 0) + if (stripe_count == 0 && + lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_MDT) RETURN(ERR_PTR(-EINVAL)); /* un-instantiated lmm contains no ost id info, i.e. lov_ost_data_v1 */ if (!inited) @@ -427,7 +443,7 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm_size = offsetof(typeof(*lsm), lsm_entries[entry_count]); OBD_ALLOC(lsm, lsm_size); - if (lsm == NULL) + if (!lsm) return ERR_PTR(-ENOMEM); atomic_set(&lsm->lsm_refc, 1); @@ -435,6 +451,8 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic); lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen); lsm->lsm_entry_count = entry_count; + lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count); + lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags); lsm->lsm_is_released = true; lsm->lsm_maxbytes = LLONG_MIN; @@ -463,16 +481,22 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) lsm->lsm_entries[i] = lsme; lsme->lsme_id = le32_to_cpu(lcme->lcme_id); lsme->lsme_flags = le32_to_cpu(lcme->lcme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lsme->lsme_timestamp = + le64_to_cpu(lcme->lcme_timestamp); lu_extent_le_to_cpu(&lsme->lsme_extent, &lcme->lcme_extent); if (i == entry_count - 1) { lsm->lsm_maxbytes = (loff_t)lsme->lsme_extent.e_start + maxbytes; - /* the last component hasn't been defined, or - * lsm_maxbytes overflowed. */ - if (lsme->lsme_extent.e_end != LUSTRE_EOF || - lsm->lsm_maxbytes < - (loff_t)lsme->lsme_extent.e_start) + /* + * the last component hasn't been defined, or + * lsm_maxbytes overflowed. 
+ */ + if (!lsme_is_dom(lsme) && + (lsme->lsme_extent.e_end != LUSTRE_EOF || + lsm->lsm_maxbytes < + (loff_t)lsme->lsme_extent.e_start)) lsm->lsm_maxbytes = MAX_LFS_FILESIZE; } } @@ -481,7 +505,7 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size) out_lsm: for (i = 0; i < entry_count; i++) - if (lsm->lsm_entries[i] != NULL) + if (lsm->lsm_entries[i]) lsme_free(lsm->lsm_entries[i]); OBD_FREE(lsm, lsm_size); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h index 524b0a4eac681..a1cbea9a5c4d4 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,7 +34,7 @@ #define LOV_INTERNAL_H #include -#include +#include /* If we are unable to get the maximum object size from the OST in * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using @@ -47,6 +47,7 @@ struct lov_stripe_md_entry { u32 lsme_magic; u32 lsme_flags; u32 lsme_pattern; + u64 lsme_timestamp; u32 lsme_stripe_size; u16 lsme_stripe_count; u16 lsme_layout_gen; @@ -54,6 +55,11 @@ struct lov_stripe_md_entry { struct lov_oinfo *lsme_oinfo[]; }; +static inline bool lsme_is_dom(struct lov_stripe_md_entry *lsme) +{ + return (lov_pattern(lsme->lsme_pattern) == LOV_PATTERN_MDT); +} + static inline void copy_lsm_entry(struct lov_stripe_md_entry *dst, struct lov_stripe_md_entry *src) { @@ -75,8 +81,10 @@ struct lov_stripe_md { struct ost_id lsm_oi; u32 lsm_magic; u32 lsm_layout_gen; - u32 lsm_entry_count; + u16 lsm_flags; bool lsm_is_released; + u16 lsm_mirror_count; + u16 lsm_entry_count; struct lov_stripe_md_entry *lsm_entries[]; }; @@ -119,7 +127,7 @@ static inline size_t lov_comp_md_size(const struct lov_stripe_md *lsm) stripe_count = 0; size += sizeof(*lsme); - size += lov_mds_md_size(lsme->lsme_stripe_count, + size += lov_mds_md_size(stripe_count, lsme->lsme_magic); } @@ -187,19 +195,22 @@ void lsm_free(struct lov_stripe_md *lsm); }) #elif BITS_PER_LONG == 32 # define lov_do_div64(n, base) ({ \ + uint64_t __num = (n); \ uint64_t __rem; \ if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ - int __remainder; \ - LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ - "division %llu / %llu\n", (n), (uint64_t)(base)); \ - __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ - (n) >>= LOV_MIN_STRIPE_BITS; \ - __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), \ + "64 bit lov division %llu / %llu\n", \ + __num, (uint64_t)(base)); \ + __remainder = __num & (LOV_MIN_STRIPE_SIZE - 1); \ + __num >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(__num, (base) >> LOV_MIN_STRIPE_BITS); \ __rem <<= LOV_MIN_STRIPE_BITS; \ __rem += __remainder; \ } else { \ - __rem = do_div(n, base); \ + __rem = do_div(__num, base); \ } \ + (n) = __num; \ __rem; \ }) #endif @@ -246,6 +257,7 @@ int lov_merge_lvb_kms(struct lov_stripe_md *lsm, int index, struct ost_lvb *lvb, __u64 *kms_place); /* lov_offset.c */ +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index); u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, int stripeno); int lov_stripe_offset(struct lov_stripe_md *lsm, int index, 
loff_t lov_off, @@ -264,6 +276,8 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, int lov_fini_statfs_set(struct lov_request_set *set); /* lov_obd.c */ +void lov_tgts_getref(struct obd_device *obd); +void lov_tgts_putref(struct obd_device *obd); void lov_stripe_lock(struct lov_stripe_md *md); void lov_stripe_unlock(struct lov_stripe_md *md); void lov_fix_desc(struct lov_desc *desc); @@ -273,13 +287,13 @@ void lov_fix_desc_pattern(__u32 *val); void lov_fix_desc_qos_maxage(__u32 *val); __u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count); -int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data); +int lov_connect_obd(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data); int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - __u32 *indexp, int *genp); -int lov_del_target(struct obd_device *obd, __u32 index, - struct obd_uuid *uuidp, int gen); + u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, u32 index, + struct obd_uuid *uuidp, int gen); /* lov_pack.c */ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, @@ -298,14 +312,13 @@ void lsm_free_plain(struct lov_stripe_md *lsm); void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); /* lproc_lov.c */ -extern const struct proc_ops lov_proc_target_fops; -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_lov_obd_vars[]; -#endif +int lov_tunables_init(struct obd_device *obd); /* lov_cl.c */ extern struct lu_device_type lov_device_type; +#define LOV_MDC_TGT_MAX 256 + /* pools */ extern struct cfs_hash_ops pool_hash_operations; /* ost_pool methods */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_io.c b/drivers/staging/lustrefsx/lustre/lov/lov_io.c index 5544a9744b73e..c6eb7121b5db9 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_io.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,7 +56,7 @@ static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) OBD_ALLOC_PTR(sub); } - if (sub != NULL) { + if (sub) { INIT_LIST_HEAD(&sub->sub_list); INIT_LIST_HEAD(&sub->sub_linkage); sub->sub_subio_index = index; @@ -82,13 +82,22 @@ static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, cl_io_fini(sub->sub_env, &sub->sub_io); - if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (sub->sub_env && !IS_ERR(sub->sub_env)) { cl_env_put(sub->sub_env, &sub->sub_refcheck); sub->sub_env = NULL; } EXIT; } +static inline bool +is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) +{ + struct lov_layout_composite *comp = &lov->u.composite; + struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; + + return (index >= lre->lre_start && index <= lre->lre_end); +} + static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, struct lov_io_sub *sub) { @@ -106,10 +115,17 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, !lov_r0(lov, index)->lo_sub[stripe])) RETURN(-EIO); + LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index), + DFID "iot = %d, index = %d, mirror = %d\n", + PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, + lio->lis_mirror_index); + /* obtain new environment */ sub->sub_env = cl_env_get(&sub->sub_refcheck); - if (IS_ERR(sub->sub_env)) + if (IS_ERR(sub->sub_env)) { result = PTR_ERR(sub->sub_env); + RETURN(result); + } sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); sub_io = &sub->sub_io; @@ -122,7 +138,10 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_type = io->ci_type; sub_io->ci_no_srvlock = io->ci_no_srvlock; sub_io->ci_noatime = io->ci_noatime; - sub_io->ci_pio = io->ci_pio; + sub_io->ci_lock_no_expand = io->ci_lock_no_expand; + sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; + sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); @@ -149,7 +168,7 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, if (rc == 0) { sub = lov_sub_alloc(lio, index); - if (sub == NULL) + if (!sub) GOTO(out, rc = -ENOMEM); rc = lov_io_sub_init(env, lio, sub); @@ -164,6 +183,8 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, out: if (rc < 0) sub = ERR_PTR(rc); + else + sub->sub_io.ci_noquota = lio->lis_cl.cis_io->ci_noquota; RETURN(sub); } @@ -199,9 +220,270 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, RETURN(0); } +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || + cl_io_is_mkwrite(io))) + RETURN(0); + + /* + * FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. 
Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. + */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* + * need to iterate all components to see if there are + * multiple components covering the writing component + */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start); + ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + +static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + int index; + int i; + int result; + ENTRY; + + if (!lov_is_flr(obj)) { + /* only locks/pages are manipulated for CIT_MISC op, no + * cl_io_loop() will be called, don't check/set mirror info. + */ + if (io->ci_type != CIT_MISC) { + LASSERT(comp->lo_preferred_mirror == 0); + lio->lis_mirror_index = comp->lo_preferred_mirror; + } + io->ci_ndelay = 0; + RETURN(0); + } + + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* + * For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed + */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? 
-EINVAL : 0); + } + + result = lov_io_mirror_write_intent(lio, obj, io); + if (result) + RETURN(result); + + if (io->ci_need_write_intent) { + CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", + PFID(lu_object_fid(lov2lu(obj))), + lio->lis_pos, lio->lis_endpos); + + if (cl_io_is_trunc(io)) { + /** + * for truncate, we uses [size, EOF) to judge whether + * a write intent needs to be send, but we need to + * restore the write extent to [0, size], in truncate, + * the byte in the size position is accessed. + */ + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } + /* stop cl_io_init() loop */ + RETURN(1); + } + + if (io->ci_ndelay_tried == 0 || /* first time to try */ + /* reset the mirror index if layout has changed */ + lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { + lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen; + index = lio->lis_mirror_index = comp->lo_preferred_mirror; + } else { + index = lio->lis_mirror_index; + LASSERT(index >= 0); + + /* move mirror index to the next one */ + index = (index + 1) % comp->lo_mirror_count; + } + + for (i = 0; i < comp->lo_mirror_count; i++) { + struct lu_extent ext = { .e_start = lio->lis_pos, + .e_end = lio->lis_pos + 1 }; + struct lov_mirror_entry *lre; + struct lov_layout_entry *lle; + bool found = false; + + lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count]; + if (!lre->lre_valid) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(&ext, lle->lle_extent)) { + found = true; + break; + } + } /* each component of the mirror */ + if (found) { + index = (index + i) % comp->lo_mirror_count; + break; + } + } /* each mirror */ + + if (i == comp->lo_mirror_count) { + CERROR(DFID": failed to find a component covering " + "I/O region at %llu\n", + PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos); + + dump_lsm(D_ERROR, obj->lo_lsm); + + RETURN(-EIO); + } + + CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, " + "have retried: %d, mirror count: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj), + lio->lis_mirror_index, index, io->ci_ndelay_tried, + comp->lo_mirror_count); + + lio->lis_mirror_index = index; + + /* + * FLR: if all mirrors have been tried once, most likely the network + * of this client has been partitioned. We should relinquish CPU for + * a while before trying again. + */ + if (io->ci_ndelay && io->ci_ndelay_tried > 0 && + (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */ + if (signal_pending(current)) + RETURN(-EINTR); + + /** + * we'd set ci_tried_all_mirrors to turn off fast mirror + * switching for read after we've tried all mirrors several + * rounds. + */ + io->ci_tried_all_mirrors = io->ci_ndelay_tried % + (comp->lo_mirror_count * 4) == 0; + } + ++io->ci_ndelay_tried; + + CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", + io->ci_ndelay ? 
"non-" : ""); + + RETURN(0); +} + static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, struct cl_io *io) { + int index; + int result = 0; ENTRY; io->ci_result = 0; @@ -212,42 +494,45 @@ static int lov_io_slice_init(struct lov_io *lio, switch (io->ci_type) { case CIT_READ: case CIT_WRITE: - lio->lis_pos = io->u.ci_rw.rw_range.cir_pos; - lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; lio->lis_io_endpos = lio->lis_endpos; if (cl_io_is_append(io)) { LASSERT(io->ci_type == CIT_WRITE); - /* If there is LOV EA hole, then we may cannot locate - * the current file-tail exactly. */ + /* + * If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. + */ if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & LOV_PATTERN_F_HOLE)) - RETURN(-EIO); + GOTO(out, result = -EIO); lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; } break; - case CIT_SETATTR: - if (cl_io_is_trunc(io)) - lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; - else - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; case CIT_DATA_VERSION: lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; break; - case CIT_FAULT: { - pgoff_t index = io->u.ci_fault.ft_index; - lio->lis_pos = cl_offset(io->ci_obj, index); - lio->lis_endpos = cl_offset(io->ci_obj, index + 1); - break; - } + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } case CIT_FSYNC: { lio->lis_pos = io->u.ci_fsync.fi_start; @@ -261,16 +546,84 @@ static int lov_io_slice_init(struct lov_io *lio, break; } - case CIT_MISC: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_GLIMPSE: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; - default: - LBUG(); - } + if (lov_flr_state(obj) == LCM_FL_RDONLY && + !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) + /* SoM is accurate, no need glimpse */ + GOTO(out, result = 1); + break; - RETURN(0); + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + result = lov_io_mirror_init(lio, obj, io); + if (result) + GOTO(out, result); + + /* check if it needs to instantiate layout */ + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) + GOTO(out, result = 0); + + /* + * for truncate, it only needs to instantiate the components + * before the truncated size. + */ + if (cl_io_is_trunc(io)) { + io->ci_write_intent.e_start = 0; + /* for writes, e_end is endpos, the location of the file + * pointer after the write is completed, so it is not accessed. + * For truncate, 'end' is the size, and *is* acccessed. + * In other words, writes are [start, end), but truncate is + * [start, size], where both are included. So add 1 to the + * size when creating the write intent to account for this. 
+ */ + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size + 1; + } else { + io->ci_write_intent.e_start = lio->lis_pos; + io->ci_write_intent.e_end = lio->lis_endpos; + } + + index = 0; + lov_foreach_io_layout(index, lio, &io->ci_write_intent) { + if (!lsm_entry_inited(obj->lo_lsm, index)) { + io->ci_need_write_intent = 1; + break; + } + } + + if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { + /* + * REINT_SYNC RPC has already tried to instantiate all of the + * components involved, obviously it didn't succeed. Skip this + * mirror for now. The server won't be able to figure out + * which mirror it should instantiate components + */ + CERROR(DFID": trying to instantiate components for designated " + "I/O, file state: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + + io->ci_need_write_intent = 0; + GOTO(out, result = -EIO); + } + + if (io->ci_need_write_intent) + GOTO(out, result = 1); + + EXIT; + +out: + return result; } static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) @@ -310,13 +663,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, int index = lov_comp_entry(sub->sub_subio_index); int stripe = lov_comp_stripe(sub->sub_subio_index); - io->ci_pio = parent->ci_pio; switch (io->ci_type) { case CIT_SETATTR: { io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; io->u.ci_setattr.sa_attr_flags = parent->u.ci_setattr.sa_attr_flags; - io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid; + io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid; io->u.ci_setattr.sa_stripe_index = stripe; io->u.ci_setattr.sa_parent_fid = parent->u.ci_setattr.sa_parent_fid; @@ -355,16 +708,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, } case CIT_READ: case CIT_WRITE: { - io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask; - io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter; - io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb; - io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file; - io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync; + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; if (cl_io_is_append(parent)) { - io->u.ci_rw.rw_append = 1; + io->u.ci_wr.wr_append = 1; } else { - io->u.ci_rw.rw_range.cir_pos = start; - io->u.ci_rw.rw_range.cir_count = end - start; + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; } break; } @@ -376,6 +726,8 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; break; } + case CIT_GLIMPSE: + case CIT_MISC: default: break; } @@ -383,63 +735,75 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, static loff_t lov_offset_mod(loff_t val, int delta) { - if (val != OBD_OBJECT_EOF) - val += delta; - return val; + if (val != OBD_OBJECT_EOF) + val += delta; + return val; } +static int lov_io_add_sub(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub, u64 start, u64 end) +{ + int rc; + + end = lov_offset_mod(end, 1); + lov_io_sub_inherit(sub, lio, start, end); + rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); + if (rc != 0) { + cl_io_iter_fini(sub->sub_env, &sub->sub_io); + return rc; + } + + list_add_tail(&sub->sub_linkage, &lio->lis_active); + + return rc; +} static int lov_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; - struct 
lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - struct lov_io_sub *sub; - struct lov_layout_entry *le; + struct lov_io_sub *sub; struct lu_extent ext; int index; int rc = 0; - ENTRY; + ENTRY; ext.e_start = lio->lis_pos; ext.e_end = lio->lis_endpos; - index = 0; - lov_foreach_layout_entry(lio->lis_object, le) { + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); struct lov_layout_raid0 *r0 = &le->lle_raid0; u64 start; u64 end; int stripe; + bool tested_trunc_stripe = false; - index++; - if (!lu_extent_is_overlapped(&ext, &le->lle_extent)) - continue; + r0->lo_trunc_stripeno = -1; CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", - index - 1, lsm->lsm_entries[index - 1]->lsme_flags); - if (!lsm_entry_inited(lsm, index - 1)) { - /* truncate IO will trigger write intent as well, and - * it's handled in lov_io_setattr_iter_init() */ - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) { - io->ci_need_write_intent = 1; - /* execute it in main thread */ - io->ci_pio = 0; - rc = -ENODATA; - break; - } - - /* Read from uninitialized components should return - * zero filled pages. */ + index, lsm->lsm_entries[index]->lsme_flags); + if (!lsm_entry_inited(lsm, index)) { + /* + * Read from uninitialized components should return + * zero filled pages. + */ continue; } + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + for (stripe = 0; stripe < r0->lo_nr; stripe++) { - if (!lov_stripe_intersects(lsm, index - 1, stripe, + if (!lov_stripe_intersects(lsm, index, stripe, &ext, &start, &end)) continue; - if (unlikely(r0->lo_sub[stripe] == NULL)) { + if (unlikely(!r0->lo_sub[stripe])) { if (ios->cis_io->ci_type == CIT_READ || ios->cis_io->ci_type == CIT_WRITE || ios->cis_io->ci_type == CIT_FAULT) @@ -448,29 +812,79 @@ static int lov_io_iter_init(const struct lu_env *env, continue; } - end = lov_offset_mod(end, 1); + if (cl_io_is_trunc(ios->cis_io) && + !tested_trunc_stripe) { + int prev; + u64 tr_start; + + prev = (stripe == 0) ? r0->lo_nr - 1 : + stripe - 1; + /** + * Only involving previous stripe if the + * truncate in this component is at the + * beginning of this stripe. + */ + tested_trunc_stripe = true; + if (ext.e_start < lsm->lsm_entries[index]-> + lsme_extent.e_start) { + /* need previous stripe involvement */ + r0->lo_trunc_stripeno = prev; + } else { + tr_start = ext.e_start; + tr_start = lov_do_div64(tr_start, + stripe_width(lsm, index)); + /* tr_start %= stripe_swidth */ + if (tr_start == stripe * lsm-> + lsm_entries[index]-> + lsme_stripe_size) + r0->lo_trunc_stripeno = prev; + } + } + + /* if the last stripe is the trunc stripeno */ + if (r0->lo_trunc_stripeno == stripe) + r0->lo_trunc_stripeno = -1; + sub = lov_sub_get(env, lio, - lov_comp_index(index - 1, stripe)); - if (IS_ERR(sub)) { - rc = PTR_ERR(sub); + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + rc = lov_io_add_sub(env, lio, sub, start, end); + if (rc != 0) break; + } + if (rc != 0) + break; + + if (r0->lo_trunc_stripeno != -1) { + stripe = r0->lo_trunc_stripeno; + if (unlikely(!r0->lo_sub[stripe])) { + r0->lo_trunc_stripeno = -1; + continue; + } + sub = lov_sub_get(env, lio, + lov_comp_index(index, stripe)); + if (IS_ERR(sub)) + return PTR_ERR(sub); + + /** + * the prev sub could be used by another truncate, we'd + * skip it. 
LU-14128 happends when expand truncate + + * read get wrong kms. + */ + if (!list_empty(&sub->sub_linkage)) { + r0->lo_trunc_stripeno = -1; + continue; } - lov_io_sub_inherit(sub, lio, start, end); - rc = cl_io_iter_init(sub->sub_env, &sub->sub_io); - if (rc != 0) - cl_io_iter_fini(sub->sub_env, &sub->sub_io); + (void)lov_stripe_intersects(lsm, index, stripe, &ext, + &start, &end); + rc = lov_io_add_sub(env, lio, sub, start, end); if (rc != 0) break; - CDEBUG(D_VFSTRACE, - "shrink stripe: {%d, %d} range: [%llu, %llu)\n", - index, stripe, start, end); - - list_add_tail(&sub->sub_linkage, &lio->lis_active); } - if (rc != 0) - break; } RETURN(rc); } @@ -478,12 +892,10 @@ static int lov_io_iter_init(const struct lu_env *env, static int lov_io_rw_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; struct lov_io *lio = cl2lov_io(env, ios); - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *io = ios->cis_io; struct lov_stripe_md_entry *lse; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t start = range->cir_pos; + loff_t start = io->u.ci_rw.crw_pos; loff_t next; int index; @@ -493,14 +905,14 @@ static int lov_io_rw_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(lov_io_iter_init(env, ios)); - index = lov_lsm_entry(lsm, range->cir_pos); + index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos); if (index < 0) { /* non-existing layout component */ if (io->ci_type == CIT_READ) { - /* TODO: it needs to detect the next component and - * then set the next pos */ + /* + * TODO: it needs to detect the next component and + * then set the next pos + */ io->ci_continue = 0; - /* execute it in main thread */ - io->ci_pio = 0; RETURN(lov_io_iter_init(env, ios)); } @@ -508,6 +920,10 @@ static int lov_io_rw_iter_init(const struct lu_env *env, RETURN(-ENODATA); } + if (!lov_entry(lio->lis_object, index)->lle_valid && + !io->ci_designated_mirror) + RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO); + lse = lov_lse(lio->lis_object, index); next = MAX_LFS_FILESIZE; @@ -520,37 +936,20 @@ static int lov_io_rw_iter_init(const struct lu_env *env, next = MAX_LFS_FILESIZE; } - LASSERTF(range->cir_pos >= lse->lsme_extent.e_start, - "pos %lld, [%lld, %lld)\n", range->cir_pos, + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, lse->lsme_extent.e_start, lse->lsme_extent.e_end); next = min_t(__u64, next, lse->lsme_extent.e_end); next = min_t(loff_t, next, lio->lis_io_endpos); - io->ci_continue = next < lio->lis_io_endpos; - range->cir_count = next - range->cir_pos; - lio->lis_pos = range->cir_pos; - lio->lis_endpos = range->cir_pos + range->cir_count; + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; CDEBUG(D_VFSTRACE, - "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n", - index, start, lio->lis_pos, lio->lis_endpos, - lio->lis_io_endpos, range->cir_count); - - if (!io->ci_continue) { - /* the last piece of IO, execute it in main thread */ - io->ci_pio = 0; - } - - if (io->ci_pio) { - /* it only splits IO here for parallel IO, - * there will be no actual IO going to occur, - * so it doesn't need to invoke lov_io_iter_init() - * to initialize sub IOs. 
*/ - if (!lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; - RETURN(-ENODATA); - } - RETURN(0); - } + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); /* * XXX The following call should be optimized: we know, that @@ -564,18 +963,14 @@ static int lov_io_setattr_iter_init(const struct lu_env *env, { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *io = ios->cis_io; - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; int index; ENTRY; if (cl_io_is_trunc(io) && lio->lis_pos > 0) { - index = lov_lsm_entry(lsm, lio->lis_pos); - CDEBUG(D_VFSTRACE, "component[%d] flags %#x pos %llu\n", - index, lsm->lsm_entries[index]->lsme_flags, lio->lis_pos); - if (index > 0 && !lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; + index = lov_io_layout_at(lio, lio->lis_pos - 1); + /* no entry found for such offset */ + if (index < 0) RETURN(io->ci_result = -ENODATA); - } } RETURN(lov_io_iter_init(env, ios)); @@ -602,49 +997,49 @@ static int lov_io_call(const struct lu_env *env, struct lov_io *lio, static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); } static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); } static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) { - ENTRY; - /* - * It's possible that lov_io_start() wasn't called against this - * sub-io, either because previous sub-io failed, or upper layer - * completed IO. - */ - if (io->ci_state == CIS_IO_GOING) - cl_io_end(env, io); - else - io->ci_state = CIS_IO_FINISHED; - RETURN(0); + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. 
+ */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); } static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_iter_fini(env, io); - RETURN(0); + cl_io_iter_fini(env, io); + RETURN(0); } static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_unlock(env, io); - RETURN(0); + cl_io_unlock(env, io); + RETURN(0); } static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); - LASSERT(rc == 0); + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); } static void @@ -652,14 +1047,18 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; struct lov_io_sub *sub; ENTRY; list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - lov_io_end_wrapper(env, &sub->sub_io); + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; - parent->u.ci_data_version.dv_data_version += - sub->sub_io.u.ci_data_version.dv_data_version; + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); + + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; if (parent->ci_result == 0) parent->ci_result = sub->sub_io.ci_result; @@ -671,26 +1070,26 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) static void lov_io_iter_fini(const struct lu_env *env, const struct cl_io_slice *ios) { - struct lov_io *lio = cl2lov_io(env, ios); - int rc; + struct lov_io *lio = cl2lov_io(env, ios); + int rc; - ENTRY; - rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); - LASSERT(rc == 0); + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); while (!list_empty(&lio->lis_active)) list_del_init(lio->lis_active.next); - EXIT; + EXIT; } static void lov_io_unlock(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - ENTRY; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); - LASSERT(rc == 0); - EXIT; + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; } static int lov_io_read_ahead(const struct lu_env *env, @@ -712,14 +1111,18 @@ static int lov_io_read_ahead(const struct lu_env *env, ENTRY; offset = cl_offset(obj, start); - index = lov_lsm_entry(loo->lo_lsm, offset); + index = lov_io_layout_at(lio, offset); if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) RETURN(-ENODATA); + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + stripe = lov_stripe_number(loo->lo_lsm, index, offset); r0 = lov_r0(loo, index); - if (unlikely(r0->lo_sub[stripe] == NULL)) + if (unlikely(!r0->lo_sub[stripe])) RETURN(-EIO); sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); @@ -750,7 +1153,7 @@ static int lov_io_read_ahead(const struct lu_env *env, ra_end, stripe); /* boundary of current component */ - ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end); + ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end) ra->cra_end = ra_end - 1; @@ -794,35 +1197,37 @@ static int lov_io_submit(const struct lu_env *env, struct 
lov_io_sub *sub; struct cl_page_list *plist = &lov_env_info(env)->lti_plist; struct cl_page *page; + struct cl_page *tmp; int index; int rc = 0; ENTRY; - if (lio->lis_nr_subios == 1) { - int idx = lio->lis_single_subio_index; - - sub = lov_sub_get(env, lio, idx); - LASSERT(!IS_ERR(sub)); - LASSERT(sub == &lio->lis_single_subio); - rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, - crt, queue); - RETURN(rc); - } - cl_page_list_init(plist); while (qin->pl_nr > 0) { struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; - cl_2queue_init(cl2q); - page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* + * it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. + */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); cl_page_list_move(&cl2q->c2_qin, qin, page); index = lov_page_index(page); - while (qin->pl_nr > 0) { - page = cl_page_list_first(qin); + cl_page_list_for_each_safe(page, tmp, qin) { + /* this page is not on this stripe */ if (index != lov_page_index(page)) - break; + continue; cl_page_list_move(&cl2q->c2_qin, qin, page); } @@ -855,7 +1260,7 @@ static int lov_io_commit_async(const struct lu_env *env, cl_commit_cbt cb) { struct cl_page_list *plist = &lov_env_info(env)->lti_plist; - struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_io_sub *sub; struct cl_page *page; int rc = 0; @@ -864,6 +1269,8 @@ static int lov_io_commit_async(const struct lu_env *env, if (lio->lis_nr_subios == 1) { int idx = lio->lis_single_subio_index; + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + sub = lov_sub_get(env, lio, idx); LASSERT(!IS_ERR(sub)); LASSERT(sub == &lio->lis_single_subio); @@ -879,6 +1286,8 @@ static int lov_io_commit_async(const struct lu_env *env, LASSERT(plist->pl_nr == 0); page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + cl_page_list_move(plist, queue, page); index = lov_page_index(page); @@ -957,25 +1366,25 @@ static void lov_io_fsync_end(const struct lu_env *env, } static const struct cl_io_operations lov_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_WRITE] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, [CIT_SETATTR] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_setattr_iter_init, @@ -986,23 +1395,23 @@ static const struct cl_io_operations lov_io_ops = { .cio_end = lov_io_end }, [CIT_DATA_VERSION] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = 
lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_data_version_end, + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end }, - [CIT_FAULT] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_fault_start, - .cio_end = lov_io_end - }, [CIT_FSYNC] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_iter_init, @@ -1021,11 +1430,14 @@ static const struct cl_io_operations lov_io_ops = { .cio_start = lov_io_start, .cio_end = lov_io_end }, + [CIT_GLIMPSE] = { + .cio_fini = lov_io_fini, + }, [CIT_MISC] = { .cio_fini = lov_io_fini } }, - .cio_read_ahead = lov_io_read_ahead, + .cio_read_ahead = lov_io_read_ahead, .cio_submit = lov_io_submit, .cio_commit_async = lov_io_commit_async, }; @@ -1057,7 +1469,7 @@ static int lov_empty_io_submit(const struct lu_env *env, static void lov_empty_impossible(const struct lu_env *env, struct cl_io_slice *ios) { - LBUG(); + LBUG(); } #define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) @@ -1066,43 +1478,46 @@ static void lov_empty_impossible(const struct lu_env *env, * An io operation vector for files without stripes. */ static const struct cl_io_operations lov_empty_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_empty_io_fini, + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, #if 0 - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE #endif - }, - [CIT_WRITE] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_SETATTR] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_FAULT] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, [CIT_FSYNC] = { .cio_fini = lov_empty_io_fini }, [CIT_LADVISE] = { .cio_fini = lov_empty_io_fini }, + [CIT_GLIMPSE] = { + .cio_fini = 
lov_empty_io_fini + }, [CIT_MISC] = { .cio_fini = lov_empty_io_fini } @@ -1114,23 +1529,26 @@ static const struct cl_io_operations lov_empty_io_ops = { int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - struct lov_io *lio = lov_env_io(env); - struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + int result; ENTRY; + INIT_LIST_HEAD(&lio->lis_active); - io->ci_result = lov_io_slice_init(lio, lov, io); - if (io->ci_result != 0) - RETURN(io->ci_result); - - if (io->ci_result == 0) { - io->ci_result = lov_io_subio_init(env, lio, io); - if (io->ci_result == 0) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); - atomic_inc(&lov->lo_active_ios); - } + result = lov_io_slice_init(lio, lov, io); + if (result) + GOTO(out, result); + + result = lov_io_subio_init(env, lio, io); + if (!result) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); } - RETURN(io->ci_result); + EXIT; +out: + io->ci_result = result < 0 ? result : 0; + return result; } int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, @@ -1146,6 +1564,7 @@ int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, default: LBUG(); case CIT_MISC: + case CIT_GLIMPSE: case CIT_READ: result = 0; break; @@ -1189,6 +1608,7 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, LASSERTF(0, "invalid type %d\n", io->ci_type); result = -EOPNOTSUPP; break; + case CIT_GLIMPSE: case CIT_MISC: case CIT_FSYNC: case CIT_LADVISE: @@ -1196,7 +1616,8 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, result = 1; break; case CIT_SETATTR: - /* the truncate to 0 is managed by MDT: + /* + * the truncate to 0 is managed by MDT: * - in open, for open O_TRUNC * - in setattr, for truncate */ @@ -1223,4 +1644,45 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, io->ci_result = result < 0 ? result : 0; RETURN(result); } + +/** + * Return the index in composite:lo_entries by the file offset + */ +int lov_io_layout_at(struct lov_io *lio, __u64 offset) +{ + struct lov_object *lov = lio->lis_object; + struct lov_layout_composite *comp = &lov->u.composite; + int start_index = 0; + int end_index = comp->lo_entry_count - 1; + int i; + + LASSERT(lov->lo_type == LLT_COMP); + + /* This is actual file offset so nothing can cover eof. */ + if (offset == LUSTRE_EOF) + return -1; + + if (lov_is_flr(lov)) { + struct lov_mirror_entry *lre; + + LASSERT(lio->lis_mirror_index >= 0); + + lre = &comp->lo_mirrors[lio->lis_mirror_index]; + start_index = lre->lre_start; + end_index = lre->lre_end; + } + + for (i = start_index; i <= end_index; i++) { + struct lov_layout_entry *lle = lov_entry(lov, i); + + if ((offset >= lle->lle_extent->e_start && + offset < lle->lle_extent->e_end) || + (offset == OBD_OBJECT_EOF && + lle->lle_extent->e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c index efa4cc11ea94e..1b4a95876cc75 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_lock.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,22 +52,22 @@ static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, const struct cl_lock *parent, struct lov_lock_sub *lls) { - struct lov_sublock_env *subenv; - struct lov_io *lio = lov_env_io(env); - struct cl_io *io = lio->lis_cl.cis_io; - struct lov_io_sub *sub; - - subenv = &lov_env_session(env)->ls_subenv; - - /* - * FIXME: We tend to use the subio's env & io to call the sublock - * lock operations because osc lock sometimes stores some control - * variables in thread's IO infomation(Now only lockless information). - * However, if the lock's host(object) is different from the object - * for current IO, we have no way to get the subenv and subio because - * they are not initialized at all. As a temp fix, in this case, - * we still borrow the parent's env to call sublock operations. - */ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO infomation(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { subenv->lse_env = env; subenv->lse_io = io; @@ -89,6 +89,7 @@ static int lov_sublock_init(const struct lu_env *env, { struct lov_sublock_env *subenv; int result; + ENTRY; subenv = lov_sublock_env_get(env, parent, lls); @@ -111,6 +112,7 @@ static int lov_sublock_init(const struct lu_env *env, * through already created sub-locks (possibly shared with other top-locks). */ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, + const struct cl_io *io, const struct cl_object *obj, struct cl_lock *lock) { @@ -133,20 +135,18 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1); nr = 0; - for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); - index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { + lov_foreach_io_layout(index, lov_env_io(env), &ext) { struct lov_layout_raid0 *r0 = lov_r0(lov, index); - /* assume lsm entries are sorted. 
*/ - if (!lu_extent_is_overlapped(&ext, - &lov_lse(lov, index)->lsme_extent)) - break; - for (i = 0; i < r0->lo_nr; i++) { - if (likely(r0->lo_sub[i] != NULL) && /* spare layout */ - lov_stripe_intersects(lov->lo_lsm, index, i, - &ext, &start, &end)) - nr++; + if (likely(r0->lo_sub[i])) {/* spare layout */ + if (lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end)) + nr++; + else if (cl_io_is_trunc(io) && + r0->lo_trunc_stripeno == i) + nr++; + } } } /** @@ -156,28 +156,33 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, */ OBD_ALLOC_LARGE(lovlck, offsetof(struct lov_lock, lls_sub[nr])); - if (lovlck == NULL) + if (!lovlck) RETURN(ERR_PTR(-ENOMEM)); lovlck->lls_nr = nr; nr = 0; - for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start); - index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) { + lov_foreach_io_layout(index, lov_env_io(env), &ext) { struct lov_layout_raid0 *r0 = lov_r0(lov, index); - /* assume lsm entries are sorted. */ - if (!lu_extent_is_overlapped(&ext, - &lov_lse(lov, index)->lsme_extent)) - break; for (i = 0; i < r0->lo_nr; ++i) { struct lov_lock_sub *lls = &lovlck->lls_sub[nr]; struct cl_lock_descr *descr = &lls->sub_lock.cll_descr; + bool intersect = false; - if (unlikely(r0->lo_sub[i] == NULL) || - !lov_stripe_intersects(lov->lo_lsm, index, i, - &ext, &start, &end)) + if (unlikely(!r0->lo_sub[i])) continue; + intersect = lov_stripe_intersects(lov->lo_lsm, index, i, + &ext, &start, &end); + if (intersect) + goto init_sublock; + + if (cl_io_is_trunc(io) && i == r0->lo_trunc_stripeno) + goto init_sublock; + + continue; + +init_sublock: LASSERT(descr->cld_obj == NULL); descr->cld_obj = lovsub2cl(r0->lo_sub[i]); descr->cld_start = cl_index(descr->cld_obj, start); @@ -244,10 +249,10 @@ static int lov_lock_enqueue(const struct lu_env *env, const struct cl_lock_slice *slice, struct cl_io *io, struct cl_sync_io *anchor) { - struct cl_lock *lock = slice->cls_lock; - struct lov_lock *lovlck = cl2lov_lock(slice); - int i; - int rc = 0; + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lovlck = cl2lov_lock(slice); + int i; + int rc = 0; ENTRY; @@ -274,16 +279,16 @@ static int lov_lock_enqueue(const struct lu_env *env, static void lov_lock_cancel(const struct lu_env *env, const struct cl_lock_slice *slice) { - struct cl_lock *lock = slice->cls_lock; + struct cl_lock *lock = slice->cls_lock; struct lov_lock *lovlck = cl2lov_lock(slice); int i; ENTRY; for (i = 0; i < lovlck->lls_nr; ++i) { - struct lov_lock_sub *lls = &lovlck->lls_sub[i]; - struct cl_lock *sublock = &lls->sub_lock; - struct lov_sublock_env *subenv; + struct lov_lock_sub *lls = &lovlck->lls_sub[i]; + struct cl_lock *sublock = &lls->sub_lock; + struct lov_sublock_env *subenv; if (!lls->sub_is_enqueued) continue; @@ -301,27 +306,27 @@ static void lov_lock_cancel(const struct lu_env *env, } static int lov_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) + lu_printer_t p, const struct cl_lock_slice *slice) { - struct lov_lock *lck = cl2lov_lock(slice); - int i; + struct lov_lock *lck = cl2lov_lock(slice); + int i; - (*p)(env, cookie, "%d\n", lck->lls_nr); - for (i = 0; i < lck->lls_nr; ++i) { - struct lov_lock_sub *sub; + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; - sub = &lck->lls_sub[i]; + sub = &lck->lls_sub[i]; (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); cl_lock_print(env, cookie, p, &sub->sub_lock); - } - return 0; + } + return 0; } 
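/*
 * The reworked lov_lock_sub_init() above follows a two-pass
 * "count, then allocate, then fill" shape over a flexible-array
 * struct: the first walk over the overlapping layout entries only
 * counts matching stripes, lls_sub[nr] is sized from that count, and
 * a second identical walk initializes each sub-lock descriptor.
 * Below is a minimal standalone sketch of that pattern only; the
 * demo_* names are illustrative stand-ins (they do not exist in
 * Lustre), and the truncate-stripe special case is omitted for
 * brevity.
 */
#include <stdlib.h>

struct demo_sub {
	unsigned long long start;	/* sub-range start */
	unsigned long long end;		/* sub-range end (exclusive) */
};

struct demo_lock {
	int nr;				/* number of sub-entries */
	struct demo_sub subs[];		/* flexible array, sized at alloc */
};

/* stand-in predicate playing the role of lov_stripe_intersects() */
static int demo_intersects(unsigned long long a_start, unsigned long long a_end,
			   unsigned long long b_start, unsigned long long b_end)
{
	return a_start < b_end && b_start < a_end;
}

static struct demo_lock *demo_lock_init(const struct demo_sub *stripes, int n,
					unsigned long long ext_start,
					unsigned long long ext_end)
{
	struct demo_lock *lck;
	int nr = 0;
	int i;

	/* pass 1: count how many stripes the lock extent touches */
	for (i = 0; i < n; i++)
		if (demo_intersects(ext_start, ext_end,
				    stripes[i].start, stripes[i].end))
			nr++;

	/* allocate the header plus exactly nr trailing entries */
	lck = calloc(1, sizeof(*lck) + (size_t)nr * sizeof(lck->subs[0]));
	if (!lck)
		return NULL;
	lck->nr = nr;

	/* pass 2: same walk, same predicate, now filling the entries */
	nr = 0;
	for (i = 0; i < n; i++) {
		if (!demo_intersects(ext_start, ext_end,
				     stripes[i].start, stripes[i].end))
			continue;
		lck->subs[nr].start = stripes[i].start > ext_start ?
				      stripes[i].start : ext_start;
		lck->subs[nr].end = stripes[i].end < ext_end ?
				    stripes[i].end : ext_end;
		nr++;
	}
	return lck;
}

/*
 * In the patch itself the same shape appears with Lustre primitives:
 * the first lov_foreach_io_layout() loop computes nr,
 * OBD_ALLOC_LARGE() sizes offsetof(struct lov_lock, lls_sub[nr]),
 * and the second loop fills each cl_lock_descr via lov_sublock_init().
 */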
static const struct cl_lock_operations lov_lock_ops = { - .clo_fini = lov_lock_fini, - .clo_enqueue = lov_lock_enqueue, - .clo_cancel = lov_lock_cancel, - .clo_print = lov_lock_print + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_cancel = lov_lock_cancel, + .clo_print = lov_lock_print }; int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, @@ -331,7 +336,7 @@ int lov_lock_init_composite(const struct lu_env *env, struct cl_object *obj, int result = 0; ENTRY; - lck = lov_lock_sub_init(env, obj, lock); + lck = lov_lock_sub_init(env, io, obj, lock); if (!IS_ERR(lck)) cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); else @@ -343,6 +348,7 @@ static void lov_empty_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) { struct lov_lock *lck = cl2lov_lock(slice); + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); } @@ -367,7 +373,7 @@ int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, ENTRY; OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS); - if (lck != NULL) { + if (lck) { cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); result = 0; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c index de9e4298dd884..8a6ced24ff522 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_merge.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_merge.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c index 8cdd60fc90171..b9c42313fe3ae 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_obd.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,16 +40,14 @@ #define DEBUG_SUBSYSTEM S_LOV #include -#include - #include #include #include -#include +#include #include #include #include -#include +#include #include #include #include @@ -59,7 +57,7 @@ /* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. Any function that expects lov_tgts to remain stationary must take a ref. 
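The comment above describes the reference counting that the renamed lov_tgts_getref()/lov_tgts_putref() pair provides: targets are only flagged for deletion, and flagged entries are reaped once the last reference is dropped. A simplified, hypothetical sketch of that deferred-reap idea (locking omitted for brevity; the real code holds lov_lock and collects victims on a kill list):

#include <stdbool.h>

struct tgt {
        bool present;
        bool reap;              /* scheduled for deletion */
};

struct tgt_table {
        int        refcount;
        int        death_row;   /* number of targets awaiting reap */
        struct tgt tgts[16];
};

static void tgts_getref(struct tgt_table *t)
{
        t->refcount++;
}

static void tgts_putref(struct tgt_table *t)
{
        int i;

        if (--t->refcount != 0 || t->death_row == 0)
                return;

        /* last user gone: now it is safe to drop the flagged targets */
        for (i = 0; i < 16; i++) {
                if (t->tgts[i].present && t->tgts[i].reap) {
                        t->tgts[i].present = false;
                        t->tgts[i].reap = false;
                        t->death_row--;
                }
        }
}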
*/ -static void lov_getref(struct obd_device *obd) +void lov_tgts_getref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; @@ -72,7 +70,7 @@ static void lov_getref(struct obd_device *obd) static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); -static void lov_putref(struct obd_device *obd) +void lov_tgts_putref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; @@ -102,21 +100,21 @@ static void lov_putref(struct obd_device *obd) list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { list_del(&tgt->ltd_kill); - /* Disconnect */ - __lov_del_obd(obd, tgt); - } - } else { + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { mutex_unlock(&lov->lov_lock); - } + } } static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, - enum obd_notify_event ev); + enum obd_notify_event ev); static int lov_notify(struct obd_device *obd, struct obd_device *watched, enum obd_notify_event ev); -int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data) +int lov_connect_obd(struct obd_device *obd, u32 index, int activate, + struct obd_connect_data *data) { struct lov_obd *lov = &obd->u.lov; struct obd_uuid *tgt_uuid; @@ -148,12 +146,12 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, */ imp = tgt_obd->u.cli.cl_import; - if (activate) { - tgt_obd->obd_no_recov = 0; - /* FIXME this is probably supposed to be - ptlrpc_set_import_active. Horrible naming. */ - ptlrpc_activate_import(imp); - } + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. */ + ptlrpc_activate_import(imp, false); + } rc = obd_register_observer(tgt_obd, obd); if (rc) { @@ -182,26 +180,17 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? 
"":"in"); - if (lov->targets_proc_entry != NULL) { - struct proc_dir_entry *osc_symlink; - struct obd_device *osc_obd; - - osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; - - LASSERT(osc_obd != NULL); - LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); - LASSERT(osc_obd->obd_type->typ_name != NULL); - - osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, - lov->targets_proc_entry, - "../../../%s/%s", - osc_obd->obd_type->typ_name, - osc_obd->obd_name); - if (osc_symlink == NULL) { - CERROR("cannot register LOV target " - "/proc/fs/lustre/%s/%s/target_obds/%s\n", - obd->obd_type->typ_name, obd->obd_name, - osc_obd->obd_name); + if (lov->lov_tgts_kobj) { + /* Even if we failed, that's ok */ + rc = sysfs_create_link(lov->lov_tgts_kobj, + &tgt_obd->obd_kset.kobj, + tgt_obd->obd_name); + if (rc) { + CERROR("%s: can't register LOV target /sys/fs/lustre/%s/%s/target_obds/%s : rc = %d\n", + obd->obd_name, obd->obd_type->typ_name, + obd->obd_name, + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_name, + rc); } } RETURN(0); @@ -234,17 +223,8 @@ static int lov_connect(const struct lu_env *env, if (data) lov->lov_ocd = *data; - lov->targets_proc_entry = lprocfs_register("target_obds", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lov->targets_proc_entry)) { - CERROR("%s: cannot register " - "/proc/fs/lustre/%s/%s/target_obds\n", - obd->obd_name, obd->obd_type->typ_name, obd->obd_name); - lov->targets_proc_entry = NULL; - } + lov_tgts_getref(obd); - obd_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { tgt = lov->lov_tgts[i]; if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) @@ -267,9 +247,10 @@ static int lov_connect(const struct lu_env *env, obd->obd_name, rc); } } - obd_putref(obd); - RETURN(0); + lov_tgts_putref(obd); + + RETURN(0); } static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) @@ -290,6 +271,10 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) } if (osc_obd) { + if (lov->lov_tgts_kobj) + sysfs_remove_link(lov->lov_tgts_kobj, + osc_obd->obd_name); + /* Pass it on to our clients. * XXX This should be an argument to disconnect, * XXX not a back-door flag on the OBD. Ah well. @@ -318,40 +303,39 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) static int lov_disconnect(struct obd_export *exp) { - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; - int i, rc; - ENTRY; - - if (!lov->lov_tgts) - goto out; - - /* Only disconnect the underlying layers on the final disconnect. */ - lov->lov_connects--; - if (lov->lov_connects != 0) { - /* why should there be more than 1 connect? */ - CERROR("disconnect #%d\n", lov->lov_connects); - goto out; - } + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + u32 index; + int rc; - /* Let's hold another reference so lov_del_obd doesn't spin through - putref every time */ - obd_getref(obd); + ENTRY; + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? 
*/ + CWARN("%s: unexpected disconnect #%d\n", + obd->obd_name, lov->lov_connects); + goto out; + } - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { - /* Disconnection is the last we know about an obd */ - lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); - } - } - obd_putref(obd); + /* hold another ref so lov_del_obd() doesn't spin in putref each time */ + lov_tgts_getref(obd); - if (lov->targets_proc_entry != NULL) - lprocfs_remove(&lov->targets_proc_entry); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + if (lov->lov_tgts[index] && lov->lov_tgts[index]->ltd_exp) { + /* Disconnection is the last we know about an OBD */ + lov_del_target(obd, index, NULL, + lov->lov_tgts[index]->ltd_gen); + } + } + lov_tgts_putref(obd); out: - rc = class_disconnect(exp); /* bz 9811 */ - RETURN(rc); + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); } /* Error codes: @@ -372,7 +356,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", lov, uuid->uuid, ev); - obd_getref(obd); + lov_tgts_getref(obd); for (index = 0; index < lov->desc.ld_tgt_count; index++) { tgt = lov->lov_tgts[index]; if (!tgt) @@ -447,7 +431,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, index, tgt->ltd_exp->exp_handle.h_cookie); out: - obd_putref(obd); + lov_tgts_putref(obd); RETURN(index); } @@ -497,37 +481,37 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, } static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, - __u32 index, int gen, int active) + u32 index, int gen, int active) { - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - struct obd_device *tgt_obd; - int rc; - ENTRY; + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; - CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", - uuidp->uuid, index, gen, active); + ENTRY; + CDEBUG(D_CONFIG, "uuid:%s idx:%u gen:%d active:%d\n", + uuidp->uuid, index, gen, active); - if (gen <= 0) { - CERROR("request to add OBD %s with invalid generation: %d\n", - uuidp->uuid, gen); - RETURN(-EINVAL); - } + if (gen <= 0) { + CERROR("%s: request to add '%s' with invalid generation: %d\n", + obd->obd_name, uuidp->uuid, gen); + RETURN(-EINVAL); + } - tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, - &obd->obd_uuid); - if (tgt_obd == NULL) - RETURN(-EINVAL); + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); mutex_lock(&lov->lov_lock); - if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { - tgt = lov->lov_tgts[index]; - CERROR("UUID %s already assigned at LOV target index %d\n", - obd_uuid2str(&tgt->ltd_uuid), index); + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + rc = -EEXIST; + CERROR("%s: UUID %s already assigned at index %d: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), index, rc); mutex_unlock(&lov->lov_lock); - RETURN(-EEXIST); - } + RETURN(rc); + } if (index >= lov->lov_tgt_size) { /* We need to reallocate the lov target array. 
*/ @@ -595,7 +579,7 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, RETURN(0); } - obd_getref(obd); + lov_tgts_getref(obd); rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); if (rc) @@ -618,17 +602,17 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE); out: - if (rc) { - CERROR("add failed (%d), deleting %s\n", rc, - obd_uuid2str(&tgt->ltd_uuid)); + if (rc) { + CERROR("%s: add failed, deleting %s: rc = %d\n", + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), rc); lov_del_target(obd, index, NULL, 0); - } - obd_putref(obd); - RETURN(rc); + } + lov_tgts_putref(obd); + RETURN(rc); } /* Schedule a target for deletion */ -int lov_del_target(struct obd_device *obd, __u32 index, +int lov_del_target(struct obd_device *obd, u32 index, struct obd_uuid *uuidp, int gen) { struct lov_obd *lov = &obd->u.lov; @@ -644,7 +628,7 @@ int lov_del_target(struct obd_device *obd, __u32 index, /* to make sure there's no ongoing lov_notify() now */ down_write(&lov->lov_notify_lock); - obd_getref(obd); + lov_tgts_getref(obd); if (!lov->lov_tgts[index]) { CERROR("LOV target at index %d is not setup.\n", index); @@ -665,12 +649,12 @@ int lov_del_target(struct obd_device *obd, __u32 index, lov->lov_tgts[index]->ltd_reap = 1; lov->lov_death_row++; - /* we really delete it from obd_putref */ + /* we really delete it from lov_tgts_putref() */ out: - obd_putref(obd); + lov_tgts_putref(obd); up_write(&lov->lov_notify_lock); - RETURN(rc); + RETURN(rc); } static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) @@ -747,9 +731,6 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { struct lov_desc *desc; struct lov_obd *lov = &obd->u.lov; -#ifdef CONFIG_PROC_FS - struct obd_type *type; -#endif int rc; ENTRY; @@ -803,45 +784,12 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(out, rc); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_lov_obd_vars; - /* If this is true then both client (lov) and server - * (lod) are on the same node. The lod layer if loaded - * first will register the lov proc directory. In that - * case obd->obd_type->typ_procroot will be not set. - * Instead we use type->typ_procsym as the parent. 
*/ - type = class_search_type(LUSTRE_LOD_NAME); - if (type != NULL && type->typ_procsym != NULL) { - obd->obd_proc_entry = lprocfs_register(obd->obd_name, - type->typ_procsym, - obd->obd_vars, obd); - if (IS_ERR(obd->obd_proc_entry)) { - rc = PTR_ERR(obd->obd_proc_entry); - CERROR("error %d setting up lprocfs for %s\n", rc, - obd->obd_name); - obd->obd_proc_entry = NULL; - } - } else { - rc = lprocfs_obd_setup(obd); - } + rc = lov_tunables_init(obd); + if (rc) + GOTO(out, rc); - if (rc == 0) { - rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", - 0444, &lov_proc_target_fops, obd); - if (rc) - CWARN("Error adding the target_obd file\n"); - - lov->lov_pool_proc_entry = lprocfs_register("pools", - obd->obd_proc_entry, - NULL, NULL); - if (IS_ERR(lov->lov_pool_proc_entry)) { - rc = PTR_ERR(lov->lov_pool_proc_entry); - CERROR("error %d setting up lprocfs for pools\n", rc); - lov->lov_pool_proc_entry = NULL; - } - } -#endif - RETURN(0); + lov->lov_tgts_kobj = kobject_create_and_add("target_obds", + &obd->obd_kset.kobj); out: return rc; @@ -854,6 +802,11 @@ static int lov_cleanup(struct obd_device *obd) struct pool_desc *pool; ENTRY; + if (lov->lov_tgts_kobj) { + kobject_put(lov->lov_tgts_kobj); + lov->lov_tgts_kobj = NULL; + } + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { pool = list_entry(pos, struct pool_desc, pool_list); /* free pool structs */ @@ -869,14 +822,13 @@ static int lov_cleanup(struct obd_device *obd) lprocfs_obd_cleanup(obd); if (lov->lov_tgts) { int i; - obd_getref(obd); + lov_tgts_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { if (!lov->lov_tgts[i]) continue; /* Inactive targets may never have connected */ - if (lov->lov_tgts[i]->ltd_active || - atomic_read(&lov->lov_refcount)) + if (lov->lov_tgts[i]->ltd_active) /* We should never get here - these * should have been removed in the * disconnect. 
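The lov_connect_obd()/lov_setup() hunks replace the old /proc registration and symlinks with a sysfs kobject named target_obds plus one symlink per connected target. A rough sketch of that pattern using the standard kobject/sysfs API; the helper name and the trimmed error handling below are illustrative, not the patch's code:

#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/printk.h>

static struct kobject *register_target_link(struct kobject *parent,
                                            struct kobject *tgt_kobj,
                                            const char *tgt_name)
{
        struct kobject *dir;

        dir = kobject_create_and_add("target_obds", parent);
        if (!dir)
                return NULL;

        /* as in the patch, a failed symlink is logged but tolerated */
        if (sysfs_create_link(dir, tgt_kobj, tgt_name))
                pr_warn("cannot link %s under target_obds\n", tgt_name);

        return dir;
}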
*/ @@ -886,7 +838,7 @@ static int lov_cleanup(struct obd_device *obd) atomic_read(&lov->lov_refcount)); lov_del_target(obd, i, NULL, 0); } - obd_putref(obd); + lov_tgts_putref(obd); OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * lov->lov_tgt_size); lov->lov_tgt_size = 0; @@ -901,50 +853,56 @@ static int lov_cleanup(struct obd_device *obd) } int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - __u32 *indexp, int *genp) + u32 *indexp, int *genp) { - struct obd_uuid obd_uuid; - int cmd; - int rc = 0; - ENTRY; + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; - switch(cmd = lcfg->lcfg_command) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - case LCFG_LOV_DEL_OBD: { - __u32 index; - int gen; - /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ - if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) - GOTO(out, rc = -EINVAL); - - obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); - - if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", indexp) != 1) - GOTO(out, rc = -EINVAL); - if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) - GOTO(out, rc = -EINVAL); - index = *indexp; - gen = *genp; - if (cmd == LCFG_LOV_ADD_OBD) - rc = lov_add_target(obd, &obd_uuid, index, gen, 1); - else if (cmd == LCFG_LOV_ADD_INA) - rc = lov_add_target(obd, &obd_uuid, index, gen, 0); - else - rc = lov_del_target(obd, index, &obd_uuid, gen); - GOTO(out, rc); - } - case LCFG_PARAM: { + ENTRY; + switch (cmd = lcfg->lcfg_command) { + case LCFG_ADD_MDC: + case LCFG_DEL_MDC: + break; + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + u32 index; + int gen; + + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + rc = kstrtou32(lustre_cfg_buf(lcfg, 2), 10, indexp); + if (rc) + GOTO(out, rc); + rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); + if (rc) + GOTO(out, rc); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + + GOTO(out, rc); + } + case LCFG_PARAM: { struct lov_desc *desc = &(obd->u.lov.desc); + ssize_t count; if (!desc) GOTO(out, rc = -EINVAL); - rc = class_process_proc_param(PARAM_LOV, obd->obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; - GOTO(out, rc); + count = class_modify_config(lcfg, PARAM_LOV, + &obd->obd_kset.kobj); + GOTO(out, rc = count < 0 ? count : 0); } case LCFG_POOL_NEW: case LCFG_POOL_ADD: @@ -962,84 +920,50 @@ int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, RETURN(rc); } -static int -lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) -{ - struct lov_request_set *lovset = (struct lov_request_set *)data; - int err; - ENTRY; - - if (rc) - atomic_set(&lovset->set_completes, 0); - - err = lov_fini_statfs_set(lovset); - RETURN(rc ? 
rc : err); -} - -static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *rqset) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lov_request_set *set; - struct lov_request *req; - struct list_head *pos; - struct lov_obd *lov; - int rc = 0; - ENTRY; - - LASSERT(oinfo != NULL); - LASSERT(oinfo->oi_osfs != NULL); - - lov = &obd->u.lov; - rc = lov_prep_statfs_set(obd, oinfo, &set); - if (rc) - RETURN(rc); - - list_for_each(pos, &set->set_list) { - req = list_entry(pos, struct lov_request, rq_link); - rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, - &req->rq_oi, max_age, rqset); - if (rc) - break; - } - - if (rc || list_empty(&rqset->set_requests)) { - int err; - if (rc) - atomic_set(&set->set_completes, 0); - err = lov_fini_statfs_set(set); - RETURN(rc ? rc : err); - } - - LASSERT(rqset->set_interpret == NULL); - rqset->set_interpret = lov_statfs_interpret; - rqset->set_arg = (void *)set; - RETURN(0); -} - static int lov_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) + struct obd_statfs *osfs, time64_t max_age, __u32 flags) { - struct ptlrpc_request_set *set = NULL; + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; struct obd_info oinfo = { .oi_osfs = osfs, .oi_flags = flags, }; + struct ptlrpc_request_set *rqset; + struct lov_request_set *set = NULL; + struct lov_request *req; int rc = 0; + int rc2; ENTRY; - /* for obdclass we forbid using obd_statfs_rqset, but prefer using async - * statfs requests */ - set = ptlrpc_prep_set(); - if (set == NULL) + rqset = ptlrpc_prep_set(); + if (rqset == NULL) RETURN(-ENOMEM); - rc = lov_statfs_async(exp, &oinfo, max_age, set); + rc = lov_prep_statfs_set(obd, &oinfo, &set); + if (rc < 0) + GOTO(out_rqset, rc); + + list_for_each_entry(req, &set->set_list, rq_link) { + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc < 0) + GOTO(out_set, rc); + } + + rc = ptlrpc_set_wait(env, rqset); + +out_set: + if (rc < 0) + atomic_set(&set->set_completes, 0); + + rc2 = lov_fini_statfs_set(set); if (rc == 0) - rc = ptlrpc_set_wait(set); + rc = rc2; - ptlrpc_set_destroy(set); +out_rqset: + ptlrpc_set_destroy(rqset); RETURN(rc); } @@ -1047,35 +971,39 @@ static int lov_statfs(const struct lu_env *env, struct obd_export *exp, static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void __user *uarg) { - struct obd_device *obddev = class_exp2obd(exp); - struct lov_obd *lov = &obddev->u.lov; - int i = 0, rc = 0, count = lov->desc.ld_tgt_count; - struct obd_uuid *uuidp; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; - switch (cmd) { - case IOC_OBD_STATFS: { - struct obd_ioctl_data *data = karg; - struct obd_device *osc_obd; - struct obd_statfs stat_buf = {0}; - __u32 index; + ENTRY; + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + struct obd_import *imp; + __u32 index; __u32 flags; - memcpy(&index, data->ioc_inlbuf2, sizeof(index)); - if ((index >= count)) - RETURN(-ENODEV); + memcpy(&index, data->ioc_inlbuf2, sizeof(index)); + if (index >= count) + RETURN(-ENODEV); - if (!lov->lov_tgts[index]) - /* Try again with the next index */ - RETURN(-EAGAIN); - if (!lov->lov_tgts[index]->ltd_active) - 
RETURN(-ENODATA); + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); - osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); - if (!osc_obd) - RETURN(-EINVAL); + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); - /* copy UUID */ + imp = osc_obd->u.cli.cl_import; + if (!lov->lov_tgts[index]->ltd_active && + imp->imp_state != LUSTRE_IMP_IDLE) + RETURN(-ENODATA); + + /* copy UUID */ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), min_t(unsigned long, data->ioc_plen2, sizeof(struct obd_uuid)))) @@ -1084,12 +1012,12 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, memcpy(&flags, data->ioc_inlbuf1, sizeof(flags)); flags = flags & LL_STATFS_NODELAY ? OBD_STATFS_NODELAY : 0; - /* got statfs data */ - rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - flags); - if (rc) - RETURN(rc); + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + flags); + if (rc) + RETURN(rc); if (copy_to_user(data->ioc_pbuf1, &stat_buf, min_t(unsigned long, data->ioc_plen1, sizeof(struct obd_statfs)))) @@ -1202,12 +1130,11 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) continue; - /* ll_umount_begin() sets force flag but for lov, not - * osc. Let's pass it through */ - osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); - osc_obd->obd_force = obddev->obd_force; - err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, - len, karg, uarg); + /* ll_umount_begin() sets force on lov, pass to osc */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obd->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); if (err) { if (lov->lov_tgts[i]->ltd_active) { CDEBUG(err == -ENOTTY ? 
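The rewritten lov_statfs() above drops the separate o_statfs_async entry point: it queues one asynchronous statfs per target into a request set, waits once with ptlrpc_set_wait(), and then merges the per-target answers. The shape of that scatter/gather loop, with hypothetical issue_async()/wait_all()/merge() callbacks standing in for the Lustre calls:

/* Illustrative only: synchronous fan-out over per-target async requests. */
struct statfs_result { unsigned long long blocks, bfree; };

static int statfs_all(int nr_targets,
                      int (*issue_async)(int tgt),      /* queue one request */
                      int (*wait_all)(void),            /* wait for the set */
                      void (*merge)(int tgt, struct statfs_result *out),
                      struct statfs_result *out)
{
        int rc = 0;
        int i;

        for (i = 0; i < nr_targets; i++) {
                rc = issue_async(i);
                if (rc < 0)
                        break;                  /* stop queuing on error */
        }

        if (rc == 0)
                rc = wait_all();                /* one wait for the whole set */

        if (rc == 0)
                for (i = 0; i < nr_targets; i++)
                        merge(i, out);          /* aggregate the answers */

        return rc;
}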
@@ -1243,7 +1170,7 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, if (vallen == NULL || val == NULL) RETURN(-EFAULT); - obd_getref(obddev); + lov_tgts_getref(obddev); if (KEY_IS(KEY_MAX_EASIZE)) { u32 max_stripe_count = min_t(u32, ld->ld_active_tgt_count, @@ -1261,7 +1188,7 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, rc = -EINVAL; } - obd_putref(obddev); + lov_tgts_putref(obddev); RETURN(rc); } @@ -1274,58 +1201,71 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; struct lov_tgt_desc *tgt; - int do_inactive = 0; - int no_set = 0; - u32 count; + bool do_inactive = false, no_set = false; u32 i; int rc = 0; int err; - ENTRY; - if (set == NULL) { - no_set = 1; - set = ptlrpc_prep_set(); - if (!set) - RETURN(-ENOMEM); - } + ENTRY; - obd_getref(obddev); - count = lov->desc.ld_tgt_count; + if (set == NULL) { + no_set = true; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + lov_tgts_getref(obddev); if (KEY_IS(KEY_CHECKSUM)) { - do_inactive = 1; + do_inactive = true; } else if (KEY_IS(KEY_CACHE_SET)) { LASSERT(lov->lov_cache == NULL); lov->lov_cache = val; - do_inactive = 1; + do_inactive = true; cl_cache_incref(lov->lov_cache); } - for (i = 0; i < count; i++) { + for (i = 0; i < lov->desc.ld_tgt_count; i++) { tgt = lov->lov_tgts[i]; - /* OST was disconnected */ - if (!tgt || !tgt->ltd_exp) - continue; + /* OST was disconnected */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; - /* OST is inactive and we don't want inactive OSCs */ - if (!tgt->ltd_active && !do_inactive) - continue; + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, vallen, val, set); - if (!rc) - rc = err; - } - obd_putref(obddev); - if (no_set) { - err = ptlrpc_set_wait(set); - if (!rc) - rc = err; - ptlrpc_set_destroy(set); - } - RETURN(rc); + if (rc == 0) + rc = err; + } + + /* cycle through MDC target for Data-on-MDT */ + for (i = 0; i < LOV_MDC_TGT_MAX; i++) { + struct obd_device *mdc; + + mdc = lov->lov_mdc_tgts[i].lmtd_mdc; + if (mdc == NULL) + continue; + + err = obd_set_info_async(env, mdc->obd_self_export, + keylen, key, vallen, val, set); + if (rc == 0) + rc = err; + } + + lov_tgts_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(env, set); + if (rc == 0) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); } void lov_stripe_lock(struct lov_stripe_md *md) @@ -1363,7 +1303,7 @@ static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, } /* for lov tgt */ - obd_getref(obd); + lov_tgts_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; @@ -1395,7 +1335,7 @@ static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; } } - obd_putref(obd); + lov_tgts_putref(obd); if (oqctl->qc_cmd == Q_GETOQUOTA) { oqctl->qc_dqblk.dqb_curspace = curspace; @@ -1411,7 +1351,6 @@ static struct obd_ops lov_obd_ops = { .o_connect = lov_connect, .o_disconnect = lov_disconnect, .o_statfs = lov_statfs, - .o_statfs_async = lov_statfs_async, .o_iocontrol = lov_iocontrol, .o_get_info = lov_get_info, .o_set_info_async = lov_set_info_async, @@ -1420,8 +1359,6 @@ static struct obd_ops lov_obd_ops = { .o_pool_rem = lov_pool_remove, .o_pool_add = lov_pool_add, .o_pool_del = lov_pool_del, - .o_getref = lov_getref, - .o_putref = lov_putref, 
.o_quotactl = lov_quotactl, }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_object.c b/drivers/staging/lustrefsx/lustre/lov/lov_object.c index c1cf76367697e..590a2009a87ef 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,6 +37,8 @@ #define DEBUG_SUBSYSTEM S_LOV +#include + #include "lov_cl_internal.h" static inline struct lov_device *lov_object_dev(struct lov_object *obj) @@ -74,6 +76,8 @@ struct lov_layout_operations { struct cl_object *obj, struct cl_io *io); int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr); + int (*llo_flush)(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock); }; static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); @@ -89,30 +93,40 @@ static void lov_lsm_put(struct lov_stripe_md *lsm) * Lov object layout operations. * */ -static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, struct lov_stripe_md *lsm, - const struct cl_object_conf *conf, - union lov_layout_state *state) + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) { - return 0; + struct lu_object *o; + + ENTRY; + + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); } -static struct cl_object *lov_sub_find(const struct lu_env *env, - struct cl_device *dev, - const struct lu_fid *fid, - const struct cl_object_conf *conf) +static int lov_page_slice_fixup(struct lov_object *lov, + struct cl_object *stripe) { - struct lu_object *o; + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + struct cl_object *o; - ENTRY; - o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); - LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); - RETURN(lu2cl(o)); + if (stripe == NULL) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + + cl_object_for_each(o, stripe) + o->co_slice_off += hdr->coh_page_bufsize; + + return cl_object_header(stripe)->coh_page_bufsize; } static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, - struct cl_object *subobj, struct lov_layout_raid0 *r0, - struct lov_oinfo *oinfo, int idx) + struct cl_object *subobj, struct lov_oinfo *oinfo, + int idx) { struct cl_object_header *hdr; struct cl_object_header *subhdr; @@ -132,7 +146,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, return -EIO; } - hdr = cl_object_header(lov2cl(lov)); + hdr = cl_object_header(lov2cl(lov)); subhdr = cl_object_header(subobj); CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID @@ -145,13 +159,14 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, spin_lock(&subhdr->coh_attr_guard); parent = subhdr->coh_parent; if (parent == NULL) { + struct lovsub_object *lso = cl2lovsub(subobj); + subhdr->coh_parent = hdr; spin_unlock(&subhdr->coh_attr_guard); subhdr->coh_nesting = hdr->coh_nesting + 1; lu_object_ref_add(&subobj->co_lu, "lov-parent", lov); - 
r0->lo_sub[stripe] = cl2lovsub(subobj); - r0->lo_sub[stripe]->lso_super = lov; - r0->lo_sub[stripe]->lso_index = idx; + lso->lso_super = lov; + lso->lso_index = idx; result = 0; } else { struct lu_object *old_obj; @@ -181,42 +196,28 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, return result; } -static int lov_page_slice_fixup(struct lov_object *lov, - struct cl_object *stripe) -{ - struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); - struct cl_object *o; - - if (stripe == NULL) - return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - - cfs_size_round(sizeof(struct lov_page)); - - cl_object_for_each(o, stripe) - o->co_slice_off += hdr->coh_page_bufsize; - - return cl_object_header(stripe)->coh_page_bufsize; -} - static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, int index, - struct lov_layout_raid0 *r0) + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) { - struct lov_thread_info *lti = lov_env_info(env); - struct cl_object_conf *subconf = <i->lti_stripe_conf; - struct lu_fid *ofid = <i->lti_fid; - struct cl_object *stripe; + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lu_fid *ofid = <i->lti_fid; + struct cl_object *stripe; struct lov_stripe_md_entry *lse = lov_lse(lov, index); int result; - int psz; + int psz, sz; int i; ENTRY; spin_lock_init(&r0->lo_sub_lock); r0->lo_nr = lse->lsme_stripe_count; - LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + r0->lo_trunc_stripeno = -1; - OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0])); if (r0->lo_sub == NULL) GOTO(out, result = -ENOMEM); @@ -255,7 +256,7 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, if (IS_ERR(stripe)) GOTO(out, result = PTR_ERR(stripe)); - result = lov_init_sub(env, lov, stripe, r0, oinfo, + result = lov_init_sub(env, lov, stripe, oinfo, lov_comp_index(index, i)); if (result == -EAGAIN) { /* try again */ --i; @@ -264,7 +265,9 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, } if (result == 0) { - int sz = lov_page_slice_fixup(lov, stripe); + r0->lo_sub[i] = cl2lovsub(stripe); + + sz = lov_page_slice_fixup(lov, stripe); LASSERT(ergo(psz > 0, psz == sz)); psz = sz; } @@ -275,16 +278,369 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, RETURN(result); } +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_raid0 *r0, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lu_site *site; + wait_queue_head_t *wq; + wait_queue_entry_t *waiter; + + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... 
wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_free() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry(waiter, current); + add_wait_queue(wq, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). */ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + schedule(); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(wq, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + ENTRY; + + if (r0->lo_sub != NULL) { + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_object_prune(env, &los->lso_cl); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. + */ + lov_subobject_kill(env, lov, r0, los, i); + } + } + } + + EXIT; +} + +static void lov_fini_raid0(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + r0->lo_sub = NULL; + } +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lov_layout_entry *lle) +{ + const struct lov_layout_raid0 *r0 = &lle->lle_raid0; + int i; + + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else { + (*p)(env, cookie, "sub %d absent\n", i); + } + } + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_raid0 *r0 = &lle->lle_raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + struct cl_attr *attr = &r0->lo_attr; + __u64 kms = 0; + int result = 0; + + if (r0->lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + memset(lvb, 0, sizeof(*lvb)); + + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. 
+ */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, index, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + *lov_attr = attr; + } + + return result; +} + +static struct lov_comp_layout_entry_ops raid0_ops = { + .lco_init = lov_init_raid0, + .lco_fini = lov_fini_raid0, + .lco_getattr = lov_attr_get_raid0, +}; + +static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov, + unsigned int index, struct lov_layout_entry *lle, + struct cl_attr **lov_attr) +{ + struct lov_layout_dom *dom = &lle->lle_dom; + struct lov_oinfo *loi = dom->lo_loi; + struct cl_attr *attr = &dom->lo_dom_r0.lo_attr; + + if (dom->lo_dom_r0.lo_attr_valid) { + *lov_attr = attr; + return 0; + } + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + + cl_lvb2attr(attr, &loi->loi_lvb); + + /* DoM component size can be bigger than stripe size after + * client's setattr RPC, so do not count anything beyond + * component end. Alternatively, check that limit on server + * and do not allow size overflow there. */ + if (attr->cat_size > lle->lle_extent->e_end) + attr->cat_size = lle->lle_extent->e_end; + + attr->cat_kms = attr->cat_size; + + dom->lo_dom_r0.lo_attr_valid = 1; + *lov_attr = attr; + + return 0; +} + +/** + * Lookup FLD to get MDS index of the given DOM object FID. + * + * \param[in] ld LOV device + * \param[in] fid FID to lookup + * \param[out] nr index in MDC array to return back + * + * \retval 0 and \a mds filled with MDS index if successful + * \retval negative value on error + */ +static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid, + __u32 *nr) +{ + __u32 mds_idx; + int i, rc; + + ENTRY; + + rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid), + &mds_idx, LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("%s: error while looking for mds number. Seq %#llx" + ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + mds_idx, PFID(fid)); + + /* find proper MDC device in the array */ + for (i = 0; i < ld->ld_md_tgts_nr; i++) { + if (ld->ld_md_tgts[i].ldm_mdc != NULL && + ld->ld_md_tgts[i].ldm_idx == mds_idx) + break; + } + + if (i == ld->ld_md_tgts_nr) { + CERROR("%s: cannot find corresponding MDC device for mds #%x " + "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)), + mds_idx, PFID(fid)); + rc = -EINVAL; + } else { + *nr = i; + } + RETURN(rc); +} + +/** + * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object. + * + * Init the DOM object for the first time. It prepares also RAID0 entry + * for it to use in common methods with ordinary RAID0 layout entries. 
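lov_fld_lookup() above asks the FLD client which MDT serves the DOM object's FID and then maps that MDT index onto a slot in the device's local MDC array. A simplified sketch of the second step; the structure below is an assumption for illustration, not the real lov_device layout:

#include <stddef.h>

struct md_tgt {
        void         *mdc;      /* NULL if the slot is unused */
        unsigned int  idx;      /* MDT index this MDC talks to */
};

/* Return the local slot serving mds_idx, or -1 if no MDC matches. */
static int md_tgt_find(const struct md_tgt *tgts, int nr, unsigned int mds_idx)
{
        int i;

        for (i = 0; i < nr; i++)
                if (tgts[i].mdc != NULL && tgts[i].idx == mds_idx)
                        return i;
        return -1;
}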
+ * + * \param[in] env execution environment + * \param[in] dev LOV device + * \param[in] lov LOV object + * \param[in] index Composite layout entry index in LSM + * \param[in] lle Composite LOV layout entry + */ +static int lov_init_dom(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, unsigned int index, + const struct cl_object_conf *conf, + struct lov_layout_entry *lle) +{ + struct lov_thread_info *lti = lov_env_info(env); + struct lov_stripe_md_entry *lsme = lov_lse(lov, index); + struct cl_object *clo; + struct lu_object *o = lov2lu(lov); + const struct lu_fid *fid = lu_object_fid(o); + struct cl_device *mdcdev; + struct lov_oinfo *loi = NULL; + struct cl_object_conf *sconf = <i->lti_stripe_conf; + + int rc; + __u32 idx = 0; + + ENTRY; + + LASSERT(index == 0); + + /* find proper MDS device */ + rc = lov_fld_lookup(dev, fid, &idx); + if (rc) + RETURN(rc); + + LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL, + "LOV md target[%u] is NULL\n", idx); + + /* check lsm is DOM, more checks are needed */ + LASSERT(lsme->lsme_stripe_count == 0); + + /* + * Create lower cl_objects. + */ + mdcdev = dev->ld_md_tgts[idx].ldm_mdc; + + LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n"); + + /* DoM object has no oinfo in LSM entry, create it exclusively */ + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS); + if (loi == NULL) + RETURN(-ENOMEM); + + fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi); + + sconf->u.coc_oinfo = loi; +again: + clo = lov_sub_find(env, mdcdev, fid, sconf); + if (IS_ERR(clo)) + GOTO(out, rc = PTR_ERR(clo)); + + rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0)); + if (rc == -EAGAIN) /* try again */ + goto again; + else if (rc != 0) + GOTO(out, rc); + + lle->lle_dom.lo_dom = cl2lovsub(clo); + spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock); + lle->lle_dom.lo_dom_r0.lo_nr = 1; + lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom; + lle->lle_dom.lo_loi = loi; + + rc = lov_page_slice_fixup(lov, clo); + RETURN(rc); + +out: + if (loi != NULL) + OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab); + return rc; +} + +/** + * Implementation of lov_layout_operations::llo_fini for DOM object. + * + * Finish the DOM object and free related memory. 
+ * + * \param[in] env execution environment + * \param[in] lov LOV object + * \param[in] state LOV layout state + */ +static void lov_fini_dom(const struct lu_env *env, + struct lov_layout_entry *lle) +{ + if (lle->lle_dom.lo_dom != NULL) + lle->lle_dom.lo_dom = NULL; + if (lle->lle_dom.lo_loi != NULL) + OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab); +} + +static struct lov_comp_layout_entry_ops dom_ops = { + .lco_init = lov_init_dom, + .lco_fini = lov_fini_dom, + .lco_getattr = lov_attr_get_dom, +}; + static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, struct lov_object *lov, struct lov_stripe_md *lsm, const struct cl_object_conf *conf, union lov_layout_state *state) { struct lov_layout_composite *comp = &state->composite; + struct lov_layout_entry *lle; + struct lov_mirror_entry *lre; unsigned int entry_count; unsigned int psz = 0; + unsigned int mirror_count; + int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK; int result = 0; - int i; + unsigned int seq; + int i, j; ENTRY; @@ -293,38 +649,157 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, lov->lo_lsm = lsm_addref(lsm); lov->lo_layout_invalid = true; + dump_lsm(D_INODE, lsm); + entry_count = lsm->lsm_entry_count; - comp->lo_entry_count = entry_count; + + spin_lock_init(&comp->lo_write_lock); + comp->lo_flags = lsm->lsm_flags; + comp->lo_mirror_count = lsm->lsm_mirror_count + 1; + comp->lo_entry_count = lsm->lsm_entry_count; + comp->lo_preferred_mirror = -1; + + if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1)) + RETURN(-EINVAL); + + OBD_ALLOC(comp->lo_mirrors, + comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); + if (comp->lo_mirrors == NULL) + RETURN(-ENOMEM); OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries)); if (comp->lo_entries == NULL) RETURN(-ENOMEM); - for (i = 0; i < entry_count; i++) { - struct lov_layout_entry *le = &comp->lo_entries[i]; + /* Initiate all entry types and extents data at first */ + for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) { + int mirror_id = 0; + + lle = &comp->lo_entries[i]; + + lle->lle_lsme = lsm->lsm_entries[i]; + lle->lle_type = lov_entry_type(lle->lle_lsme); + switch (lle->lle_type) { + case LOV_PATTERN_RAID0: + lle->lle_comp_ops = &raid0_ops; + break; + case LOV_PATTERN_MDT: + lle->lle_comp_ops = &dom_ops; + break; + default: + CERROR("%s: unknown composite layout entry type %i\n", + lov2obd(dev->ld_lov)->obd_name, + lsm->lsm_entries[i]->lsme_pattern); + dump_lsm(D_ERROR, lsm); + RETURN(-EIO); + } + + lle->lle_extent = &lle->lle_lsme->lsme_extent; + lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE); + + if (flr_state != LCM_FL_NONE) + mirror_id = mirror_id_of(lle->lle_lsme->lsme_id); + + lre = &comp->lo_mirrors[j]; + if (i > 0) { + if (mirror_id == lre->lre_mirror_id) { + lre->lre_valid |= lle->lle_valid; + lre->lre_stale |= !lle->lle_valid; + lre->lre_end = i; + continue; + } + + /* new mirror detected, assume that the mirrors + * are shorted in layout */ + ++mirror_count; + ++j; + if (j >= comp->lo_mirror_count) + break; + + lre = &comp->lo_mirrors[j]; + } + + /* entries must be sorted by mirrors */ + lre->lre_mirror_id = mirror_id; + lre->lre_start = lre->lre_end = i; + lre->lre_preferred = !!(lle->lle_lsme->lsme_flags & + LCME_FL_PREF_RD); + lre->lre_valid = lle->lle_valid; + lre->lre_stale = !lle->lle_valid; + } + + /* sanity check for FLR */ + if (mirror_count != comp->lo_mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have the # of mirrors it claims, %u/%u\n", 
+ PFID(lu_object_fid(lov2lu(lov))), mirror_count, + comp->lo_mirror_count + 1); + + GOTO(out, result = -EINVAL); + } + + lov_foreach_layout_entry(lov, lle) { + int index = lov_layout_entry_index(lov, lle); - le->lle_extent = lsm->lsm_entries[i]->lsme_extent; /** * If the component has not been init-ed on MDS side, for * PFL layout, we'd know that the components beyond this one * will be dynamically init-ed later on file write/trunc ops. */ - if (!lsm_entry_inited(lsm, i)) + if (!lsme_inited(lle->lle_lsme)) continue; - result = lov_init_raid0(env, dev, lov, i, &le->lle_raid0); + result = lle->lle_comp_ops->lco_init(env, dev, lov, index, + conf, lle); if (result < 0) break; LASSERT(ergo(psz > 0, psz == result)); psz = result; } + if (psz > 0) cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz; + /* decide the preferred mirror. It uses the hash value of lov_object + * so that different clients would use different mirrors for read. */ + mirror_count = 0; + seq = hash_long((unsigned long)lov, 8); + for (i = 0; i < comp->lo_mirror_count; i++) { + unsigned int idx = (i + seq) % comp->lo_mirror_count; + + lre = lov_mirror_entry(lov, idx); + if (lre->lre_stale) + continue; + + mirror_count++; /* valid mirror */ + + if (lre->lre_preferred || comp->lo_preferred_mirror < 0) + comp->lo_preferred_mirror = idx; + } + if (!mirror_count) { + CDEBUG(D_INODE, DFID + " doesn't have any valid mirrors\n", + PFID(lu_object_fid(lov2lu(lov)))); + + comp->lo_preferred_mirror = 0; + } + + LASSERT(comp->lo_preferred_mirror >= 0); + + EXIT; +out: return result > 0 ? 0 : result; } +static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, struct lov_stripe_md *lsm, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + static int lov_init_released(const struct lu_env *env, struct lov_device *dev, struct lov_object *lov, struct lov_stripe_md *lsm, @@ -339,43 +814,6 @@ static int lov_init_released(const struct lu_env *env, return 0; } -static struct cl_object *lov_find_subobj(const struct lu_env *env, - struct lov_object *lov, - struct lov_stripe_md *lsm, - int index) -{ - struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); - struct lov_thread_info *lti = lov_env_info(env); - struct lu_fid *ofid = <i->lti_fid; - struct lov_oinfo *oinfo; - struct cl_device *subdev; - int entry = lov_comp_entry(index); - int stripe = lov_comp_stripe(index); - int ost_idx; - int rc; - struct cl_object *result; - - if (lov->lo_type != LLT_COMP) - GOTO(out, result = NULL); - - if (entry >= lsm->lsm_entry_count || - stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) - GOTO(out, result = NULL); - - oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; - ost_idx = oinfo->loi_ost_idx; - rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); - if (rc != 0) - GOTO(out, result = NULL); - - subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); - result = lov_sub_find(env, subdev, ofid, NULL); -out: - if (result == NULL) - result = ERR_PTR(-EINVAL); - return result; -} - static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) { @@ -385,77 +823,6 @@ static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, return 0; } -static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, - struct lov_layout_raid0 *r0, - struct lovsub_object *los, int idx) -{ - struct cl_object *sub; - struct lu_site *site; - struct lu_site_bkt_data *bkt; - wait_queue_entry_t *waiter; - - 
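The mirror bookkeeping added in lov_init_composite() ends with the preferred-mirror choice: the scan starts at a per-object pseudo-random offset (hash_long of the object pointer) so that different clients spread reads across mirrors, stale mirrors are skipped, and a mirror flagged preferred-for-read wins over the first merely valid one. A small sketch of that selection with simplified, hypothetical types:

#include <stdbool.h>

struct mirror {
        bool stale;
        bool preferred;         /* e.g. flagged preferred for read */
};

static int pick_preferred_mirror(const struct mirror *m, int nr,
                                 unsigned long seed)
{
        int choice = -1;
        int i;

        for (i = 0; i < nr; i++) {
                int idx = (int)((i + seed) % nr);

                if (m[idx].stale)
                        continue;       /* never read from a stale mirror */
                if (m[idx].preferred || choice < 0)
                        choice = idx;   /* preferred wins, else first valid */
        }
        return choice < 0 ? 0 : choice; /* fall back to mirror 0 */
}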
LASSERT(r0->lo_sub[idx] == los); - - sub = lovsub2cl(los); - site = sub->co_lu.lo_dev->ld_site; - bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); - - cl_object_kill(env, sub); - /* release a reference to the sub-object and ... */ - lu_object_ref_del(&sub->co_lu, "lov-parent", lov); - cl_object_put(env, sub); - - /* ... wait until it is actually destroyed---sub-object clears its - * ->lo_sub[] slot in lovsub_object_fini() */ - if (r0->lo_sub[idx] == los) { - waiter = &lov_env_info(env)->lti_waiter; - init_waitqueue_entry(waiter, current); - add_wait_queue(&bkt->lsb_marche_funebre, waiter); - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) { - /* this wait-queue is signaled at the end of - * lu_object_free(). */ - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock(&r0->lo_sub_lock); - if (r0->lo_sub[idx] == los) { - spin_unlock(&r0->lo_sub_lock); - schedule(); - } else { - spin_unlock(&r0->lo_sub_lock); - set_current_state(TASK_RUNNING); - break; - } - } - remove_wait_queue(&bkt->lsb_marche_funebre, waiter); - } - LASSERT(r0->lo_sub[idx] == NULL); -} - -static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, - struct lov_layout_raid0 *r0) -{ - ENTRY; - - if (r0->lo_sub != NULL) { - int i; - - for (i = 0; i < r0->lo_nr; ++i) { - struct lovsub_object *los = r0->lo_sub[i]; - - if (los != NULL) { - cl_object_prune(env, &los->lso_cl); - /* - * If top-level object is to be evicted from - * the cache, so are its sub-objects. - */ - lov_subobject_kill(env, lov, r0, los, i); - } - } - } - - EXIT; -} - static int lov_delete_composite(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) @@ -470,7 +837,7 @@ static int lov_delete_composite(const struct lu_env *env, lov_layout_wait(env, lov); if (comp->lo_entries) lov_foreach_layout_entry(lov, entry) - lov_delete_raid0(env, lov, &entry->lle_raid0); + lov_delete_raid0(env, lov, entry); RETURN(0); } @@ -481,15 +848,6 @@ static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); } -static void lov_fini_raid0(const struct lu_env *env, - struct lov_layout_raid0 *r0) -{ - if (r0->lo_sub != NULL) { - OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); - r0->lo_sub = NULL; - } -} - static void lov_fini_composite(const struct lu_env *env, struct lov_object *lov, union lov_layout_state *state) @@ -501,13 +859,21 @@ static void lov_fini_composite(const struct lu_env *env, struct lov_layout_entry *entry; lov_foreach_layout_entry(lov, entry) - lov_fini_raid0(env, &entry->lle_raid0); + entry->lle_comp_ops->lco_fini(env, entry); OBD_FREE(comp->lo_entries, comp->lo_entry_count * sizeof(*comp->lo_entries)); comp->lo_entries = NULL; } + if (comp->lo_mirrors != NULL) { + OBD_FREE(comp->lo_mirrors, + comp->lo_mirror_count * sizeof(*comp->lo_mirrors)); + comp->lo_mirrors = NULL; + } + + memset(comp, 0, sizeof(*comp)); + dump_lsm(D_INODE, lov->lo_lsm); lov_free_memmd(&lov->lo_lsm); @@ -530,24 +896,6 @@ static int lov_print_empty(const struct lu_env *env, void *cookie, return 0; } -static int lov_print_raid0(const struct lu_env *env, void *cookie, - lu_printer_t p, struct lov_layout_raid0 *r0) -{ - int i; - - for (i = 0; i < r0->lo_nr; ++i) { - struct lu_object *sub; - - if (r0->lo_sub[i] != NULL) { - sub = lovsub2lu(r0->lo_sub[i]); - lu_object_print(env, cookie, p, sub); - } else { - (*p)(env, cookie, "sub %d absent\n", i); - } - } - return 0; -} - static int lov_print_composite(const struct lu_env *env, 
void *cookie, lu_printer_t p, const struct lu_object *o) { @@ -563,12 +911,15 @@ static int lov_print_composite(const struct lu_env *env, void *cookie, for (i = 0; i < lsm->lsm_entry_count; i++) { struct lov_stripe_md_entry *lse = lsm->lsm_entries[i]; + struct lov_layout_entry *lle = lov_entry(lov, i); - (*p)(env, cookie, DEXT ": { 0x%08X, %u, %u, %#x, %u, %u }\n", + (*p)(env, cookie, + DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n", PEXT(&lse->lsme_extent), lse->lsme_magic, - lse->lsme_id, lse->lsme_layout_gen, lse->lsme_flags, - lse->lsme_stripe_count, lse->lsme_stripe_size); - lov_print_raid0(env, cookie, p, lov_r0(lov, i)); + lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen, + lse->lsme_flags, lse->lsme_stripe_count, + lse->lsme_stripe_size); + lov_print_raid0(env, cookie, p, lle); } return 0; @@ -602,51 +953,6 @@ static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, return 0; } -static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov, - unsigned int index, struct lov_layout_raid0 *r0) - -{ - struct lov_stripe_md *lsm = lov->lo_lsm; - struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; - struct cl_attr *attr = &r0->lo_attr; - __u64 kms = 0; - int result = 0; - - if (r0->lo_attr_valid) - return 0; - - memset(lvb, 0, sizeof(*lvb)); - - /* XXX: timestamps can be negative by sanity:test_39m, - * how can it be? */ - lvb->lvb_atime = LLONG_MIN; - lvb->lvb_ctime = LLONG_MIN; - lvb->lvb_mtime = LLONG_MIN; - - /* - * XXX that should be replaced with a loop over sub-objects, - * doing cl_object_attr_get() on them. But for now, let's - * reuse old lov code. - */ - - /* - * XXX take lsm spin-lock to keep lov_merge_lvb_kms() - * happy. It's not needed, because new code uses - * ->coh_attr_guard spin-lock to protect consistency of - * sub-object attributes. - */ - lov_stripe_lock(lsm); - result = lov_merge_lvb_kms(lsm, index, lvb, &kms); - lov_stripe_unlock(lsm); - if (result == 0) { - cl_lvb2attr(attr, lvb); - attr->cat_kms = kms; - r0->lo_attr_valid = 1; - } - - return result; -} - static int lov_attr_get_composite(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr) @@ -654,25 +960,34 @@ static int lov_attr_get_composite(const struct lu_env *env, struct lov_object *lov = cl2lov(obj); struct lov_layout_entry *entry; int result = 0; - int index = 0; ENTRY; attr->cat_size = 0; attr->cat_blocks = 0; lov_foreach_layout_entry(lov, entry) { - struct lov_layout_raid0 *r0 = &entry->lle_raid0; - struct cl_attr *lov_attr = &r0->lo_attr; + struct cl_attr *lov_attr = NULL; + int index = lov_layout_entry_index(lov, entry); + + if (!entry->lle_valid) + continue; /* PFL: This component has not been init-ed. 
*/ if (!lsm_entry_inited(lov->lo_lsm, index)) - break; + continue; - result = lov_attr_get_raid0(env, lov, index, r0); - if (result != 0) - break; + result = entry->lle_comp_ops->lco_getattr(env, lov, index, + entry, &lov_attr); + if (result < 0) + RETURN(result); - index++; + if (lov_attr == NULL) + continue; + + CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu " + "b=%llu\n", index - 1, lov_attr->cat_size, + lov_attr->cat_mtime, lov_attr->cat_atime, + lov_attr->cat_ctime, lov_attr->cat_blocks); /* merge results */ attr->cat_blocks += lov_attr->cat_blocks; @@ -687,29 +1002,58 @@ static int lov_attr_get_composite(const struct lu_env *env, if (attr->cat_mtime < lov_attr->cat_mtime) attr->cat_mtime = lov_attr->cat_mtime; } - RETURN(result); + + RETURN(0); +} + +static int lov_flush_composite(const struct lu_env *env, + struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_entry *lle; + int rc = -ENODATA; + + ENTRY; + + lov_foreach_layout_entry(lov, lle) { + if (!lsme_is_dom(lle->lle_lsme)) + continue; + rc = cl_object_flush(env, lovsub2cl(lle->lle_dom.lo_dom), lock); + break; + } + + RETURN(rc); +} + +static int lov_flush_empty(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return 0; } const static struct lov_layout_operations lov_dispatch[] = { - [LLT_EMPTY] = { - .llo_init = lov_init_empty, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_empty, - .llo_print = lov_print_empty, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_empty, + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, .llo_getattr = lov_attr_get_empty, - }, - [LLT_RELEASED] = { - .llo_init = lov_init_released, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_released, - .llo_print = lov_print_released, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_released, + .llo_flush = lov_flush_empty, + }, + [LLT_RELEASED] = { + .llo_init = lov_init_released, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_released, + .llo_print = lov_print_released, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_released, .llo_getattr = lov_attr_get_empty, + .llo_flush = lov_flush_empty, }, [LLT_COMP] = { .llo_init = lov_init_composite, @@ -720,6 +1064,7 @@ const static struct lov_layout_operations lov_dispatch[] = { .llo_lock_init = lov_lock_init_composite, .llo_io_init = lov_io_init_composite, .llo_getattr = lov_attr_get_composite, + .llo_flush = lov_flush_composite, }, }; @@ -881,12 +1226,11 @@ static int lov_layout_change(const struct lu_env *unused, CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", PFID(lu_object_fid(lov2lu(lov))), lov, llt); - lov->lo_type = LLT_EMPTY; - /* page bufsize fixup */ cl_object_header(&lov->lo_cl)->coh_page_bufsize -= lov_page_slice_fixup(lov, NULL); + lov->lo_type = llt; rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); if (rc != 0) { struct obd_device *obd = lov2obd(lov_dev->ld_lov); @@ -896,11 +1240,10 @@ static int lov_layout_change(const struct lu_env *unused, new_ops->llo_delete(env, lov, state); new_ops->llo_fini(env, lov, state); /* this file becomes an EMPTY file. 
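lov_attr_get_composite() above now asks each valid, initialised component for its attributes through lco_getattr() and folds them together: block counts accumulate while timestamps keep the newest value (the size handling is trimmed in the hunk, so the sketch below only merges what is shown). A reduced attribute struct is assumed for illustration:

#include <stdint.h>

struct attrs {
        uint64_t blocks;
        int64_t  atime, mtime, ctime;
};

/* Fold one component's attributes into the running total. */
static void attr_merge(struct attrs *sum, const struct attrs *comp)
{
        sum->blocks += comp->blocks;            /* blocks accumulate */

        if (sum->atime < comp->atime)           /* timestamps: keep newest */
                sum->atime = comp->atime;
        if (sum->mtime < comp->mtime)
                sum->mtime = comp->mtime;
        if (sum->ctime < comp->ctime)
                sum->ctime = comp->ctime;
}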
*/ + lov->lo_type = LLT_EMPTY; GOTO(out, rc); } - lov->lo_type = llt; - out: cl_env_put(env, &refcheck); RETURN(rc); @@ -1056,7 +1399,7 @@ int lov_page_init(const struct lu_env *env, struct cl_object *obj, int lov_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved); CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n", PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type, @@ -1258,6 +1601,43 @@ struct fiemap_state { bool fs_enough; }; +static struct cl_object *lov_find_subobj(const struct lu_env *env, + struct lov_object *lov, + struct lov_stripe_md *lsm, + int index) +{ + struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev); + struct lov_thread_info *lti = lov_env_info(env); + struct lu_fid *ofid = <i->lti_fid; + struct lov_oinfo *oinfo; + struct cl_device *subdev; + int entry = lov_comp_entry(index); + int stripe = lov_comp_stripe(index); + int ost_idx; + int rc; + struct cl_object *result; + + if (lov->lo_type != LLT_COMP) + GOTO(out, result = NULL); + + if (entry >= lsm->lsm_entry_count || + stripe >= lsm->lsm_entries[entry]->lsme_stripe_count) + GOTO(out, result = NULL); + + oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe]; + ost_idx = oinfo->loi_ost_idx; + rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx); + if (rc != 0) + GOTO(out, result = NULL); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + result = lov_sub_find(env, subdev, ofid, NULL); +out: + if (result == NULL) + result = ERR_PTR(-EINVAL); + return result; +} + int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, struct lov_stripe_md *lsm, struct fiemap *fiemap, size_t *buflen, struct ll_fiemap_info_key *fmkey, @@ -1298,7 +1678,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, if (lun_start == lun_end) return 0; - req_fm_len = obd_object_end - lun_start; + req_fm_len = obd_object_end - lun_start + 1; fs->fs_fm->fm_length = 0; len_mapped_single_call = 0; @@ -1341,7 +1721,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj, fs->fs_fm->fm_mapped_extents = 1; fm_ext[0].fe_logical = lun_start; - fm_ext[0].fe_length = obd_object_end - lun_start; + fm_ext[0].fe_length = obd_object_end - lun_start + 1; fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; goto inactive_tgt; @@ -1456,8 +1836,11 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, ENTRY; lsm = lov_lsm_addref(cl2lov(obj)); - if (lsm == NULL) - RETURN(-ENODATA); + if (lsm == NULL) { + /* no extent: there is no object for mapping */ + fiemap->fm_mapped_extents = 0; + return 0; + } if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) { /** @@ -1471,6 +1854,10 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, GOTO(out_lsm, rc = -ENOTSUPP); } + /* No support for DOM layout yet. 
*/ + if (lsme_is_dom(lsm->lsm_entries[0])) + GOTO(out_lsm, rc = -ENOTSUPP); + if (lsm->lsm_is_released) { if (fiemap->fm_start < fmkey->lfik_oa.o_size) { /** @@ -1537,6 +1924,7 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj, if (start_entry == -1 || end_entry == -1) GOTO(out_fm_local, rc = -EINVAL); + /* TODO: rewrite it with lov_foreach_io_layout() */ for (entry = start_entry; entry <= end_entry; entry++) { lsme = lsm->lsm_entries[entry]; @@ -1666,6 +2054,13 @@ static loff_t lov_object_maxbytes(struct cl_object *obj) return maxbytes; } +static int lov_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_flush, true, env, obj, + lock); +} + static const struct cl_object_operations lov_ops = { .coo_page_init = lov_page_init, .coo_lock_init = lov_lock_init, @@ -1677,6 +2072,7 @@ static const struct cl_object_operations lov_ops = { .coo_layout_get = lov_object_layout_get, .coo_maxbytes = lov_object_maxbytes, .coo_fiemap = lov_object_fiemap, + .coo_object_flush = lov_object_flush }; static const struct lu_object_operations lov_lu_obj_ops = { @@ -1768,6 +2164,7 @@ int lov_read_and_clear_async_rc(struct cl_object *clob) } } } + fallthrough; case LLT_RELEASED: case LLT_EMPTY: break; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c index 3ff0a38a7e263..de2e6c47da8ee 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_offset.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_offset.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,12 +38,15 @@ #include "lov_internal.h" -static loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) +loff_t stripe_width(struct lov_stripe_md *lsm, unsigned int index) { struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; LASSERT(index < lsm->lsm_entry_count); + if (lsme_is_dom(entry)) + return (loff_t)entry->lsme_stripe_size; + return (loff_t)entry->lsme_stripe_size * entry->lsme_stripe_count; } @@ -55,10 +58,11 @@ u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, unsigned long stripe_size; loff_t swidth; loff_t lov_size; - ENTRY; - if (ost_size == 0) - RETURN(0); + ENTRY; + + if (ost_size == 0) + RETURN(0); swidth = stripe_width(lsm, index); @@ -69,7 +73,7 @@ u64 lov_stripe_size(struct lov_stripe_md *lsm, int index, u64 ost_size, else lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; - RETURN(lov_size); + RETURN(lov_size); } /** @@ -86,7 +90,8 @@ pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, return offset >> PAGE_SHIFT; } -/* we have an offset in file backed by an lov and want to find out where +/* + * we have an offset in file backed by an lov and want to find out where * that offset lands in our given stripe of the file. for the easy * case where the offset is within the stripe, we just have to scale the * offset down to make it relative to the stripe instead of the lov. 
@@ -133,7 +138,8 @@ pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, int index, * this function returns < 0 when the offset was "before" the stripe and * was moved forward to the start of the stripe in question; 0 when it * falls in the stripe and no shifting was done; > 0 when the offset - * was outside the stripe and was pulled back to its final byte. */ + * was outside the stripe and was pulled back to its final byte. + */ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, int stripeno, loff_t *obdoff) { @@ -141,12 +147,12 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, loff_t stripe_off; loff_t this_stripe; loff_t swidth; - int ret = 0; + int ret = 0; - if (lov_off == OBD_OBJECT_EOF) { - *obdoff = OBD_OBJECT_EOF; - return 0; - } + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } swidth = stripe_width(lsm, index); @@ -154,23 +160,24 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, int index, loff_t lov_off, stripe_off = lov_do_div64(lov_off, swidth); this_stripe = (loff_t)stripeno * ssize; - if (stripe_off < this_stripe) { - stripe_off = 0; - ret = -1; - } else { - stripe_off -= this_stripe; - - if (stripe_off >= ssize) { - stripe_off = ssize; - ret = 1; - } - } - - *obdoff = lov_off * ssize + stripe_off; - return ret; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; } -/* Given a whole-file size and a stripe number, give the file size which +/* + * Given a whole-file size and a stripe number, give the file size which * corresponds to the individual object of that stripe. * * This behaves basically in the same was as lov_stripe_offset, except that @@ -197,8 +204,8 @@ loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, loff_t this_stripe; loff_t swidth; - if (file_size == OBD_OBJECT_EOF) - return OBD_OBJECT_EOF; + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; swidth = stripe_width(lsm, index); @@ -206,35 +213,39 @@ loff_t lov_size_to_stripe(struct lov_stripe_md *lsm, int index, u64 file_size, stripe_off = lov_do_div64(file_size, swidth); this_stripe = (loff_t)stripeno * ssize; - if (stripe_off < this_stripe) { - /* Move to end of previous stripe, or zero */ - if (file_size > 0) { - file_size--; - stripe_off = ssize; - } else { - stripe_off = 0; - } - } else { - stripe_off -= this_stripe; - - if (stripe_off >= ssize) { - /* Clamp to end of this stripe */ - stripe_off = ssize; - } - } - - return (file_size * ssize + stripe_off); + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); } -/* given an extent in an lov and a stripe, calculate the extent of the stripe +/* + * given an extent in an lov and a stripe, calculate the extent of the stripe * that is contained within the lov extent. this returns true if the given - * stripe does intersect with the lov extent. */ + * stripe does intersect with the lov extent. + * + * Closed interval [@obd_start, @obd_end] will be returned. 
+ */ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, struct lu_extent *ext, u64 *obd_start, u64 *obd_end) { struct lov_stripe_md_entry *entry = lsm->lsm_entries[index]; u64 start, end; - int start_side, end_side; + int start_side, end_side; if (!lu_extent_is_overlapped(ext, &entry->lsme_extent)) return 0; @@ -250,24 +261,28 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno, CDEBUG(D_INODE, "[%lld->%lld] -> [(%d) %lld->%lld (%d)]\n", start, end, start_side, *obd_start, *obd_end, end_side); - /* this stripe doesn't intersect the file extent when neither - * start or the end intersected the stripe and obd_start and - * obd_end got rounded up to the save value. */ - if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) - return 0; - - /* as mentioned in the lov_stripe_offset commentary, end - * might have been shifted in the wrong direction. This - * happens when an end offset is before the stripe when viewed - * through the "mod stripe size" math. we detect it being shifted - * in the wrong direction and touch it up. - * interestingly, this can't underflow since end must be > start - * if we passed through the previous check. - * (should we assert for that somewhere?) */ - if (end_side != 0) - (*obd_end)--; - - return 1; + /* + * this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. + */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* + * as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) + */ + if (end_side != 0) + (*obd_end)--; + + return 1; } /* compute which stripe number "lov_off" will be written into */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c index dd29ff51dcc1c..6fe3c2ff5bd5b 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pack.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pack.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,9 +38,6 @@ #define DEBUG_SUBSYSTEM S_LOV -#include -#include - #include #include #include @@ -53,16 +50,16 @@ void lov_dump_lmm_common(int level, void *lmmp) { struct lov_mds_md *lmm = lmmp; - struct ost_id oi; + struct ost_id oi; lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); - CDEBUG_LIMIT(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", - POSTID(&oi), le32_to_cpu(lmm->lmm_magic), - le32_to_cpu(lmm->lmm_pattern)); - CDEBUG_LIMIT(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", - le32_to_cpu(lmm->lmm_stripe_size), - le16_to_cpu(lmm->lmm_stripe_count), - le16_to_cpu(lmm->lmm_layout_gen)); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); } static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, @@ -71,9 +68,8 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, int i; if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { - CDEBUG_LIMIT(level, - "bad stripe_count %u > max_stripe_count %u\n", - stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); return; } @@ -81,22 +77,22 @@ static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, struct ost_id oi; ostid_le_to_cpu(&lod->l_ost_oi, &oi); - CDEBUG_LIMIT(level, "stripe %u idx %u subobj "DOSTID"\n", i, - le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); } } void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) { - lov_dump_lmm_common(level, lmm); - lov_dump_lmm_objects(level, lmm->lmm_objects, - le16_to_cpu(lmm->lmm_stripe_count)); + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); } void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) { lov_dump_lmm_common(level, lmm); - CDEBUG_LIMIT(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); lov_dump_lmm_objects(level, lmm->lmm_objects, le16_to_cpu(lmm->lmm_stripe_count)); } @@ -114,8 +110,8 @@ void lov_dump_lmm(int level, void *lmm) lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm); break; default: - CDEBUG_LIMIT(level, "unrecognized lmm_magic %x, assuming %x\n", - magic, LOV_MAGIC_V1); + CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n", + magic, LOV_MAGIC_V1); lov_dump_lmm_common(level, lmm); break; } @@ -137,6 +133,7 @@ ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, struct lov_ost_data_v1 *lmm_objects; size_t lmm_size; unsigned int i; + ENTRY; lmm_size = lov_mds_md_size(lsm->lsm_entries[0]->lsme_stripe_count, @@ -147,7 +144,8 @@ ssize_t lov_lsm_pack_v1v3(const struct lov_stripe_md *lsm, void *buf, if (buf_size < lmm_size) RETURN(-ERANGE); - /* lmmv1 and lmmv3 point to the same struct and have the + /* + * lmmv1 and lmmv3 point to the same struct and have the * same first fields */ lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); @@ -195,6 +193,7 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, unsigned int offset; unsigned int size; unsigned int i; + ENTRY; if (lsm->lsm_magic == LOV_MAGIC_V1 || lsm->lsm_magic == LOV_MAGIC_V3) 
@@ -210,6 +209,8 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic); lcmv1->lcm_size = cpu_to_le32(lmm_size); lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen); + lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags); + lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count); lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count); offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count; @@ -224,6 +225,9 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, lcme->lcme_id = cpu_to_le32(lsme->lsme_id); lcme->lcme_flags = cpu_to_le32(lsme->lsme_flags); + if (lsme->lsme_flags & LCME_FL_NOSYNC) + lcme->lcme_timestamp = + cpu_to_le64(lsme->lsme_timestamp); lcme->lcme_extent.e_start = cpu_to_le64(lsme->lsme_extent.e_start); lcme->lcme_extent.e_end = @@ -286,8 +290,10 @@ __u16 lov_get_stripe_count(struct lov_obd *lov, __u32 magic, __u16 stripe_count) if (!stripe_count) stripe_count = 1; - /* stripe count is based on whether ldiskfs can handle - * larger EA sizes */ + /* + * stripe count is based on whether ldiskfs can handle + * larger EA sizes + */ if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && lov->lov_ocd.ocd_max_easize) max_stripes = lov_mds_md_max_stripe_count( @@ -313,7 +319,8 @@ int lov_free_memmd(struct lov_stripe_md **lsmp) return refc; } -/* Unpack LOV object metadata from disk storage. It is packed in LE byte +/* + * Unpack LOV object metadata from disk storage. It is packed in LE byte * order and is opaque to the networking layer. */ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, @@ -322,6 +329,7 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, const struct lsm_operations *op; struct lov_stripe_md *lsm; u32 magic; + ENTRY; if (buf_size < sizeof(magic)) @@ -329,7 +337,7 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, magic = le32_to_cpu(*(u32 *)buf); op = lsm_op_find(magic); - if (op == NULL) + if (!op) RETURN(ERR_PTR(-EINVAL)); lsm = op->lsm_unpackmd(lov, buf, buf_size); @@ -337,7 +345,8 @@ struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, void *buf, RETURN(lsm); } -/* Retrieve object striping information. +/* + * Retrieve object striping information. * * @lump is a pointer to an in-core struct with lmm_ost_count indicating * the maximum number of OST indices which will fit in the user buffer. 
@@ -353,10 +362,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ struct lov_mds_md *lmmk, *lmm; struct lov_user_md_v1 lum; - size_t lmmk_size; - ssize_t lmm_size, lum_size = 0; - static bool printed; - int rc = 0; + size_t lmmk_size, lum_size = 0; + ssize_t lmm_size; + int rc = 0; + ENTRY; if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3 && @@ -366,18 +375,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, GOTO(out, rc = -EIO); } - if (!printed) { - LCONSOLE_WARN("%s: using old ioctl(LL_IOC_LOV_GETSTRIPE) on " - DFID", use llapi_layout_get_by_path()\n", - current->comm, - PFID(&obj->lo_cl.co_lu.lo_header->loh_fid)); - printed = true; - } - lmmk_size = lov_comp_md_size(lsm); OBD_ALLOC_LARGE(lmmk, lmmk_size); - if (lmmk == NULL) + if (!lmmk) GOTO(out, rc = -ENOMEM); lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); @@ -397,8 +398,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, } } - /* Legacy appication passes limited buffer, we need to figure out - * the user buffer size by the passed in lmm_stripe_count. */ + /* + * Legacy appication passes limited buffer, we need to figure out + * the user buffer size by the passed in lmm_stripe_count. + */ if (copy_from_user(&lum, lump, sizeof(struct lov_user_md_v1))) GOTO(out_free, rc = -EFAULT); @@ -410,8 +413,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, if (lum_size != 0) { struct lov_mds_md *comp_md = lmmk; - /* Legacy app (ADIO for instance) treats the layout as V1/V3 - * blindly, we'd return a reasonable V1/V3 for them. */ + /* + * Legacy app (ADIO for instance) treats the layout as V1/V3 + * blindly, we'd return a reasonable V1/V3 for them. + */ if (lmmk->lmm_magic == LOV_MAGIC_COMP_V1) { struct lov_comp_md_v1 *comp_v1; struct cl_object *cl_obj; @@ -424,8 +429,10 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, cl_object_attr_get(env, cl_obj, &attr); cl_object_attr_unlock(cl_obj); - /* return the last instantiated component if file size - * is non-zero, otherwise, return the last component.*/ + /* + * return the last instantiated component if file size + * is non-zero, otherwise, return the last component. + */ comp_v1 = (struct lov_comp_md_v1 *)lmmk; i = attr.cat_size == 0 ? comp_v1->lcm_entry_count : 0; for (; i < comp_v1->lcm_entry_count; i++) { @@ -437,10 +444,11 @@ int lov_getstripe(const struct lu_env *env, struct lov_object *obj, i--; comp_md = (struct lov_mds_md *)((char *)comp_v1 + comp_v1->lcm_entries[i].lcme_offset); + lum_size = comp_v1->lcm_entries[i].lcme_size; } lmm = comp_md; - lmm_size = lum_size; + lmm_size = min(lum_size, lmmk_size); } else { lmm = lmmk; lmm_size = lmmk_size; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_page.c b/drivers/staging/lustrefsx/lustre/lov/lov_page.c index 869c0b8478760..34fbc66e47172 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_page.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,8 +56,8 @@ static int lov_comp_page_print(const struct lu_env *env, struct lov_page *lp = cl2lov_page(slice); return (*printer)(env, cookie, - LUSTRE_LOV_NAME"-page@%p, comp index: %x\n", - lp, lp->lps_index); + LUSTRE_LOV_NAME"-page@%p, comp index: %x, gen: %u\n", + lp, lp->lps_index, lp->lps_layout_gen); } static const struct cl_page_operations lov_comp_page_ops = { @@ -68,21 +68,22 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_page *page, pgoff_t index) { struct lov_object *loo = cl2lov(obj); - struct lov_io *lio = lov_env_io(env); - struct cl_object *subobj; - struct cl_object *o; + struct lov_io *lio = lov_env_io(env); + struct cl_object *subobj; + struct cl_object *o; struct lov_io_sub *sub; - struct lov_page *lpg = cl_object_page_slice(obj, page); + struct lov_page *lpg = cl_object_page_slice(obj, page); struct lov_layout_raid0 *r0; - loff_t offset; - loff_t suboff; - int entry; - int stripe; - int rc; + loff_t offset; + loff_t suboff; + int entry; + int stripe; + int rc; + ENTRY; offset = cl_offset(obj, index); - entry = lov_lsm_entry(loo->lo_lsm, offset); + entry = lov_io_layout_at(lio, offset); if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) { /* non-existing layout component */ lov_page_init_empty(env, obj, page, index); @@ -96,6 +97,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, LASSERT(rc == 0); lpg->lps_index = lov_comp_index(entry, stripe); + lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops); sub = lov_sub_get(env, lio, lpg->lps_index); @@ -105,7 +107,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj, subobj = lovsub2cl(r0->lo_sub[stripe]); list_for_each_entry(o, &subobj->co_lu.lo_header->loh_layers, co_lu.lo_linkage) { - if (o->co_ops->coo_page_init != NULL) { + if (o->co_ops->coo_page_init) { rc = o->co_ops->coo_page_init(sub->sub_env, o, page, cl_index(subobj, suboff)); if (rc != 0) @@ -120,9 +122,9 @@ static int lov_empty_page_print(const struct lu_env *env, const struct cl_page_slice *slice, void *cookie, lu_printer_t printer) { - struct lov_page *lp = cl2lov_page(slice); + struct lov_page *lp = cl2lov_page(slice); - return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p, empty.\n", lp); } static const struct cl_page_operations lov_empty_page_ops = { @@ -134,8 +136,10 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, { struct lov_page *lpg = cl_object_page_slice(obj, page); void *addr; + ENTRY; + lpg->lps_index = ~0; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops); addr = kmap(page->cp_vmpage); memset(addr, 0, cl_page_size(obj)); @@ -144,6 +148,14 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, RETURN(0); } +bool lov_page_is_empty(const struct cl_page *page) +{ + const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type); + + LASSERT(slice != NULL); + return slice->cpl_ops == &lov_empty_page_ops; +} + /** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c index 02b8899cb1b68..6173dbe1429ae 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -152,7 +152,6 @@ struct cfs_hash_ops pool_hash_operations = { }; #ifdef CONFIG_PROC_FS -/* ifdef needed for liblustre support */ /* * pool /proc seq_file methods */ @@ -182,14 +181,11 @@ static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) /* iterate to find a non empty entry */ prev_idx = iter->idx; - down_read(&pool_tgt_rw_sem(iter->pool)); iter->idx++; - if (iter->idx == pool_tgt_count(iter->pool)) { + if (iter->idx >= pool_tgt_count(iter->pool)) { iter->idx = prev_idx; /* we stay on the last entry */ - up_read(&pool_tgt_rw_sem(iter->pool)); return NULL; } - up_read(&pool_tgt_rw_sem(iter->pool)); (*pos)++; /* return != NULL to continue */ return iter; @@ -220,6 +216,7 @@ static void *pool_proc_start(struct seq_file *s, loff_t *pos) * we can free it at stop() */ /* /!\ do not forget to restore it to pool before freeing it */ s->private = iter; + down_read(&pool_tgt_rw_sem(pool)); if (*pos > 0) { loff_t i; void *ptr; @@ -241,6 +238,7 @@ static void pool_proc_stop(struct seq_file *s, void *v) * calling start() method (see seq_read() from fs/seq_file.c) * we have to free only if s->private is an iterator */ if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + up_read(&pool_tgt_rw_sem(iter->pool)); /* we restore s->private so next call to pool_proc_start() * will work */ s->private = iter->pool; @@ -259,9 +257,7 @@ static int pool_proc_show(struct seq_file *s, void *v) LASSERT(iter->pool != NULL); LASSERT(iter->idx <= pool_tgt_count(iter->pool)); - down_read(&pool_tgt_rw_sem(iter->pool)); tgt = pool_tgt(iter->pool, iter->idx); - up_read(&pool_tgt_rw_sem(iter->pool)); if (tgt) seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); @@ -287,7 +283,7 @@ static int pool_proc_open(struct inode *inode, struct file *file) return rc; } -static struct proc_ops pool_proc_operations = { +const static struct proc_ops pool_proc_operations = { .proc_open = pool_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -549,7 +545,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) /* search ost in lov array */ - obd_getref(obd); + lov_tgts_getref(obd); for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { if (!lov->lov_tgts[lov_idx]) continue; @@ -570,9 +566,10 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - obd_putref(obd); - lov_pool_putref(pool); - return rc; + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; } int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) @@ -592,7 +589,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) obd_str2uuid(&ost_uuid, ostname); - obd_getref(obd); + lov_tgts_getref(obd); /* search ost in lov array, to get index */ for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { if (!lov->lov_tgts[lov_idx]) @@ -614,7 +611,8 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - obd_putref(obd); - lov_pool_putref(pool); - return rc; + lov_tgts_putref(obd); + lov_pool_putref(pool); + + return rc; } diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_request.c b/drivers/staging/lustrefsx/lustre/lov/lov_request.c index fe74af4b7f82d..75e5c901fd91e 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_request.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_request.c @@ 
-35,8 +35,6 @@ #include #include -#include - #include "lov_internal.h" static void lov_init_set(struct lov_request_set *set) @@ -51,6 +49,7 @@ static void lov_finish_set(struct lov_request_set *set) { struct list_head *pos, *n; struct lov_request *req; + ENTRY; LASSERT(set != NULL); @@ -58,7 +57,7 @@ static void lov_finish_set(struct lov_request_set *set) req = list_entry(pos, struct lov_request, rq_link); list_del_init(&req->rq_link); - if (req->rq_oi.oi_osfs != NULL) + if (req->rq_oi.oi_osfs) OBD_FREE_PTR(req->rq_oi.oi_osfs); OBD_FREE_PTR(req); @@ -80,18 +79,18 @@ static void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) { list_add_tail(&req->rq_link, &set->set_list); - set->set_count++; - req->rq_rqset = set; + set->set_count++; + req->rq_rqset = set; } static int lov_check_set(struct lov_obd *lov, int idx) { int rc = 0; + mutex_lock(&lov->lov_lock); - if (lov->lov_tgts[idx] == NULL || - lov->lov_tgts[idx]->ltd_active || - (lov->lov_tgts[idx]->ltd_exp != NULL && + if (!lov->lov_tgts[idx] || lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp && class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) rc = 1; @@ -99,7 +98,8 @@ static int lov_check_set(struct lov_obd *lov, int idx) return rc; } -/* Check if the OSC connection exists and is active. +/* + * Check if the OSC connection exists and is active. * If the OSC has not yet had a chance to connect to the OST the first time, * wait once for it to connect instead of returning an error. */ @@ -108,19 +108,24 @@ static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) wait_queue_head_t waitq; struct l_wait_info lwi; struct lov_tgt_desc *tgt; + struct obd_import *imp = NULL; int rc = 0; mutex_lock(&lov->lov_lock); tgt = lov->lov_tgts[ost_idx]; - if (unlikely(tgt == NULL)) + if (unlikely(!tgt)) GOTO(out, rc = 0); if (likely(tgt->ltd_active)) GOTO(out, rc = 1); - if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) + if (tgt->ltd_exp) + imp = class_exp2cliimp(tgt->ltd_exp); + if (imp && imp->imp_connect_tried) + GOTO(out, rc = 0); + if (imp && imp->imp_state == LUSTRE_IMP_IDLE) GOTO(out, rc = 0); mutex_unlock(&lov->lov_lock); @@ -142,20 +147,20 @@ static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) #define LOV_U64_MAX ((__u64)~0ULL) #define LOV_SUM_MAX(tot, add) \ - do { \ - if ((tot) + (add) < (tot)) \ - (tot) = LOV_U64_MAX; \ - else \ - (tot) += (add); \ - } while(0) + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while (0) static int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) { - ENTRY; + ENTRY; - if (success) { - __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, + if (success) { + __u32 expected_stripes = lov_get_stripe_count(&obd->u.lov, LOV_MAGIC, 0); if (osfs->os_files != LOV_U64_MAX) lov_do_div64(osfs->os_files, expected_stripes); @@ -164,7 +169,7 @@ lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success) spin_lock(&obd->obd_osfs_lock); memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); - obd->obd_osfs_age = cfs_time_current_64(); + obd->obd_osfs_age = ktime_get_seconds(); spin_unlock(&obd->obd_osfs_lock); RETURN(0); } @@ -177,7 +182,7 @@ int lov_fini_statfs_set(struct lov_request_set *set) int rc = 0; ENTRY; - if (set == NULL) + if (!set) RETURN(0); if (atomic_read(&set->set_completes)) { @@ -194,84 +199,91 @@ static void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, int success) { - int shift = 0, quit 
= 0; - __u64 tmp; - - if (success == 0) { - memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); - } else { - if (osfs->os_bsize != lov_sfs->os_bsize) { - /* assume all block sizes are always powers of 2 */ - /* get the bits difference */ - tmp = osfs->os_bsize | lov_sfs->os_bsize; - for (shift = 0; shift <= 64; ++shift) { - if (tmp & 1) { - if (quit) - break; - else - quit = 1; - shift = 0; - } - tmp >>= 1; - } - } - - if (osfs->os_bsize < lov_sfs->os_bsize) { - osfs->os_bsize = lov_sfs->os_bsize; - - osfs->os_bfree >>= shift; - osfs->os_bavail >>= shift; - osfs->os_blocks >>= shift; - } else if (shift != 0) { - lov_sfs->os_bfree >>= shift; - lov_sfs->os_bavail >>= shift; - lov_sfs->os_blocks >>= shift; - } + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } #ifdef MIN_DF - /* Sandia requested that df (and so, statfs) only - returned minimal available space on - a single OST, so people would be able to - write this much data guaranteed. */ - if (osfs->os_bavail > lov_sfs->os_bavail) { - /* Presumably if new bavail is smaller, - new bfree is bigger as well */ - osfs->os_bfree = lov_sfs->os_bfree; - osfs->os_bavail = lov_sfs->os_bavail; - } + /* + * Sandia requested that df (and so, statfs) only + * returned minimal available space on + * a single OST, so people would be able to + * write this much data guaranteed. + */ + if (osfs->os_bavail > lov_sfs->os_bavail) { + /* + * Presumably if new bavail is smaller, + * new bfree is bigger as well + */ + osfs->os_bfree = lov_sfs->os_bfree; + osfs->os_bavail = lov_sfs->os_bavail; + } #else - osfs->os_bfree += lov_sfs->os_bfree; - osfs->os_bavail += lov_sfs->os_bavail; + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; #endif - osfs->os_blocks += lov_sfs->os_blocks; - /* XXX not sure about this one - depends on policy. - * - could be minimum if we always stripe on all OBDs - * (but that would be wrong for any other policy, - * if one of the OBDs has no more objects left) - * - could be sum if we stripe whole objects - * - could be average, just to give a nice number - * - * To give a "reasonable" (if not wholly accurate) - * number, we divide the total number of free objects - * by expected stripe count (watch out for overflow). - */ - LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); - LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); - } + osfs->os_blocks += lov_sfs->os_blocks; + /* + * XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). 
+ */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } } -/* The callback for osc_statfs_async that finilizes a request info when a - * response is received. */ +/* + * The callback for osc_statfs_async that finilizes a request info when a + * response is received. + */ static int cb_statfs_update(void *cookie, int rc) { - struct obd_info *oinfo = cookie; - struct lov_request *lovreq; - struct lov_request_set *set; - struct obd_statfs *osfs, *lov_sfs; - struct lov_obd *lov; - struct lov_tgt_desc *tgt; - struct obd_device *lovobd, *tgtobd; - int success; - ENTRY; + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + + ENTRY; lovreq = container_of(oinfo, struct lov_request, rq_oi); set = lovreq->rq_rqset; @@ -280,91 +292,101 @@ static int cb_statfs_update(void *cookie, int rc) osfs = set->set_oi->oi_osfs; lov_sfs = oinfo->oi_osfs; success = atomic_read(&set->set_success); - /* XXX: the same is done in lov_update_common_set, however - lovset->set_exp is not initialized. */ + /* + * XXX: the same is done in lov_update_common_set, however + * lovset->set_exp is not initialized. + */ lov_update_set(set, lovreq, rc); if (rc) GOTO(out, rc); - obd_getref(lovobd); - tgt = lov->lov_tgts[lovreq->rq_idx]; - if (!tgt || !tgt->ltd_active) - GOTO(out_update, rc); + lov_tgts_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); - tgtobd = class_exp2obd(tgt->ltd_exp); + tgtobd = class_exp2obd(tgt->ltd_exp); spin_lock(&tgtobd->obd_osfs_lock); memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) - tgtobd->obd_osfs_age = cfs_time_current_64(); + tgtobd->obd_osfs_age = ktime_get_seconds(); spin_unlock(&tgtobd->obd_osfs_lock); out_update: - lov_update_statfs(osfs, lov_sfs, success); - obd_putref(lovobd); - + lov_update_statfs(osfs, lov_sfs, success); + lov_tgts_putref(lovobd); out: RETURN(0); } int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, - struct lov_request_set **reqset) + struct lov_request_set **reqset) { - struct lov_request_set *set; - struct lov_obd *lov = &obd->u.lov; - int rc = 0, i; - ENTRY; + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + + ENTRY; - OBD_ALLOC(set, sizeof(*set)); - if (set == NULL) - RETURN(-ENOMEM); - lov_init_set(set); + OBD_ALLOC(set, sizeof(*set)); + if (!set) + RETURN(-ENOMEM); + lov_init_set(set); - set->set_obd = obd; - set->set_oi = oinfo; + set->set_obd = obd; + set->set_oi = oinfo; - /* We only get block data from the OBD */ - for (i = 0; i < lov->desc.ld_tgt_count; i++) { + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_tgt_desc *ltd = lov->lov_tgts[i]; struct lov_request *req; - if (lov->lov_tgts[i] == NULL || - (oinfo->oi_flags & OBD_STATFS_NODELAY && - !lov->lov_tgts[i]->ltd_active)) { + if (!ltd) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } - /* skip targets that have been explicitely disabled by the - * administrator */ - if (!lov->lov_tgts[i]->ltd_exp) { + /* + * skip targets that have been explicitely disabled by the + * administrator + */ + if (!ltd->ltd_exp) { CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); continue; } - if (!lov->lov_tgts[i]->ltd_active) + if (oinfo->oi_flags & 
OBD_STATFS_NODELAY && + class_exp2cliimp(ltd->ltd_exp)->imp_state != + LUSTRE_IMP_IDLE && !ltd->ltd_active) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + if (!ltd->ltd_active) lov_check_and_wait_active(lov, i); OBD_ALLOC(req, sizeof(*req)); - if (req == NULL) + if (!req) GOTO(out_set, rc = -ENOMEM); - OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); - if (req->rq_oi.oi_osfs == NULL) { - OBD_FREE(req, sizeof(*req)); - GOTO(out_set, rc = -ENOMEM); - } - - req->rq_idx = i; - req->rq_oi.oi_cb_up = cb_statfs_update; - req->rq_oi.oi_flags = oinfo->oi_flags; - - lov_set_add_req(req, set); - } - if (!set->set_count) - GOTO(out_set, rc = -EIO); - *reqset = set; - RETURN(rc); + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (!req->rq_oi.oi_osfs) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); out_set: - lov_fini_statfs_set(set); - RETURN(rc); + lov_fini_statfs_set(set); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c index 0ada9b5b9ce53..90a11e75393b9 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,33 +49,33 @@ */ static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) + const char *name, struct lu_device *next) { - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device_type *ldt; - int rc; - - ENTRY; - next->ld_site = d->ld_site; - ldt = next->ld_type; - LASSERT(ldt != NULL); - rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); - if (rc) { - next->ld_site = NULL; - RETURN(rc); - } - - lu_device_get(next); - lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); - lsd->acid_next = lu2cl_dev(next); - RETURN(rc); + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); } static struct lu_device *lovsub_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - struct lu_device *next; - struct lovsub_device *lsd; + struct lu_device *next; + struct lovsub_device *lsd; ENTRY; lsd = lu2lovsub_dev(d); @@ -87,8 +87,8 @@ static struct lu_device *lovsub_device_fini(const struct lu_env *env, static struct lu_device *lovsub_device_free(const struct lu_env *env, struct lu_device *d) { - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device *next = cl2lu_dev(lsd->acid_next); + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); if (atomic_read(&d->ld_ref) && d->ld_site) { LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); @@ 
-100,48 +100,48 @@ static struct lu_device *lovsub_device_free(const struct lu_env *env, } static const struct lu_device_operations lovsub_lu_ops = { - .ldo_object_alloc = lovsub_object_alloc, - .ldo_process_config = NULL, - .ldo_recovery_complete = NULL + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL }; static struct lu_device *lovsub_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) + struct lu_device_type *t, + struct lustre_cfg *cfg) { - struct lu_device *d; - struct lovsub_device *lsd; - - OBD_ALLOC_PTR(lsd); - if (lsd != NULL) { - int result; - - result = cl_device_init(&lsd->acid_cl, t); - if (result == 0) { - d = lovsub2lu_dev(lsd); - d->ld_ops = &lovsub_lu_ops; - } else - d = ERR_PTR(result); - } else - d = ERR_PTR(-ENOMEM); - return d; + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; } static const struct lu_device_type_operations lovsub_device_type_ops = { - .ldto_device_alloc = lovsub_device_alloc, - .ldto_device_free = lovsub_device_free, + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, - .ldto_device_init = lovsub_device_init, - .ldto_device_fini = lovsub_device_fini + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini }; #define LUSTRE_LOVSUB_NAME "lovsub" struct lu_device_type lovsub_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOVSUB_NAME, - .ldt_ops = &lovsub_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c deleted file mode 100644 index de8b5c72260d7..0000000000000 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_lock.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2016, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for LOVSUB layer. 
- * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub lock operations. - * - */ - -static void lovsub_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct lovsub_lock *lsl; - - ENTRY; - lsl = cl2lovsub_lock(slice); - OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); - EXIT; -} - -static const struct cl_lock_operations lovsub_lock_ops = { - .clo_fini = lovsub_lock_fini, -}; - -int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io) -{ - struct lovsub_lock *lsk; - int result; - - ENTRY; - OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS); - if (lsk != NULL) { - cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); - result = 0; - } else - result = -ENOMEM; - RETURN(result); -} - -/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c index 1471de7915162..d219356cb3ad3 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c +++ b/drivers/staging/lustrefsx/lustre/lov/lovsub_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,37 +49,39 @@ */ int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) + const struct lu_object_conf *conf) { - struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); - struct lu_object *below; - struct lu_device *under; - - int result; - - ENTRY; - under = &dev->acid_next->cd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); - if (below != NULL) { - lu_object_add(obj, below); - cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); - result = 0; - } else - result = -ENOMEM; - RETURN(result); + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), 0); + result = 0; + } else + result = -ENOMEM; + RETURN(result); } static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) { - struct lovsub_object *los = lu2lovsub(obj); - struct lov_object *lov = los->lso_super; - ENTRY; - - /* We can't assume lov was assigned here, because of the shadow - * object handling in lu_object_find. - */ - if (lov != NULL) { + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + ENTRY; + + /* + * We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. 
+ */ + if (lov) { int index = lov_comp_entry(los->lso_index); int stripe = lov_comp_stripe(los->lso_index); struct lov_layout_raid0 *r0 = lov_r0(lov, index); @@ -91,18 +93,18 @@ static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) spin_unlock(&r0->lo_sub_lock); } - lu_object_fini(obj); - lu_object_header_fini(&los->lso_header.coh_lu); - OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); - EXIT; + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; } static int lovsub_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) + lu_printer_t p, const struct lu_object *obj) { - struct lovsub_object *los = lu2lovsub(obj); + struct lovsub_object *los = lu2lovsub(obj); - return (*p)(env, cookie, "[%d]", los->lso_index); + return (*p)(env, cookie, "[%d]", los->lso_index); } static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, @@ -117,13 +119,13 @@ static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, } static int lovsub_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, - struct ost_lvb *lvb) + const struct cl_object *obj, + struct ost_lvb *lvb) { - struct lovsub_object *los = cl2lovsub(obj); + struct lovsub_object *los = cl2lovsub(obj); - ENTRY; - RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); } /** @@ -136,6 +138,7 @@ static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, { struct lovsub_object *subobj = cl2lovsub(obj); struct lov_stripe_md *lsm = subobj->lso_super->lo_lsm; + ENTRY; cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); @@ -151,20 +154,18 @@ static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, } static const struct cl_object_operations lovsub_ops = { - .coo_page_init = lovsub_page_init, - .coo_lock_init = lovsub_lock_init, .coo_attr_update = lovsub_attr_update, .coo_glimpse = lovsub_object_glimpse, .coo_req_attr_set = lovsub_req_attr_set }; static const struct lu_object_operations lovsub_lu_obj_ops = { - .loo_object_init = lovsub_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = lovsub_object_free, - .loo_object_print = lovsub_object_print, - .loo_object_invariant = NULL + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL }; struct lu_object *lovsub_object_alloc(const struct lu_env *env, @@ -176,7 +177,7 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env, ENTRY; OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS); - if (los != NULL) { + if (los) { struct cl_object_header *hdr; obj = lovsub2lu(los); diff --git a/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c b/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c deleted file mode 100644 index c10a3dfa38c1e..0000000000000 --- a/drivers/staging/lustrefsx/lustre/lov/lovsub_page.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2013, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for LOVSUB layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub page operations. - * - */ - -static void lovsub_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ -} - -static const struct cl_page_operations lovsub_page_ops = { - .cpo_fini = lovsub_page_fini -}; - -int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct lovsub_page *lsb = cl_object_page_slice(obj, page); - ENTRY; - - cl_page_slice_add(page, &lsb->lsb_cl, obj, index, &lovsub_page_ops); - RETURN(0); -} - -/** @} lov */ diff --git a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c index 41215c11998ef..f6eeebed9e2b0 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c +++ b/drivers/staging/lustrefsx/lustre/lov/lproc_lov.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,10 +35,9 @@ #include #include #include -#include +#include #include "lov_internal.h" -#ifdef CONFIG_PROC_FS static int lov_stripesize_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = (struct obd_device *)m->private; @@ -57,12 +56,12 @@ static ssize_t lov_stripesize_seq_write(struct file *file, { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct lov_desc *desc; - __s64 val; + s64 val; int rc; LASSERT(dev != NULL); desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -75,150 +74,135 @@ static ssize_t lov_stripesize_seq_write(struct file *file, } LPROC_SEQ_FOPS(lov_stripesize); -static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) +static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%lld\n", desc->ld_default_stripe_offset); - return 0; + return sprintf(buf, "%lld\n", desc->ld_default_stripe_offset); } -static ssize_t lov_stripeoffset_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - __s64 val; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + long val; int rc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtol(buf, 0, &val); if (rc) return rc; - if (val < -1) + if (val < -1 || val > LOV_MAX_STRIPE_COUNT) return -ERANGE; desc->ld_default_stripe_offset = val; return count; } -LPROC_SEQ_FOPS(lov_stripeoffset); +LUSTRE_RW_ATTR(stripeoffset); -static int lov_stripetype_seq_show(struct seq_file *m, void *v) +static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device* dev = (struct obd_device*)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_pattern); - return 0; + return sprintf(buf, "%u\n", desc->ld_pattern); } -static ssize_t lov_stripetype_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - int pattern, rc; - __s64 val; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + u32 pattern; + int rc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &pattern); if (rc) return rc; - if (val < INT_MIN || val > INT_MAX) - return -ERANGE; - pattern = val; lov_fix_desc_pattern(&pattern); 
desc->ld_pattern = pattern; return count; } -LPROC_SEQ_FOPS(lov_stripetype); +LUSTRE_RW_ATTR(stripetype); -static int lov_stripecount_seq_show(struct seq_file *m, void *v) +static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%d\n", - (__s16)(desc->ld_default_stripe_count + 1) - 1); - return 0; + return sprintf(buf, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); } -static ssize_t lov_stripecount_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; + int stripe_count; int rc; - __u32 stripe_count; - __s64 val; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoint(buffer, 0, &stripe_count); if (rc) return rc; - if (val < -1) + + if (stripe_count < -1) return -ERANGE; - stripe_count = val; lov_fix_desc_stripe_count(&stripe_count); desc->ld_default_stripe_count = stripe_count; return count; } -LPROC_SEQ_FOPS(lov_stripecount); +LUSTRE_RW_ATTR(stripecount); -static int lov_numobd_seq_show(struct seq_file *m, void *v) +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = (struct obd_device*)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_tgt_count); } -LPROC_SEQ_FOPS_RO(lov_numobd); +LUSTRE_RO_ATTR(numobd); -static int lov_activeobd_seq_show(struct seq_file *m, void *v) +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device* dev = (struct obd_device*)m->private; - struct lov_desc *desc; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_active_tgt_count); - return 0; + return sprintf(buf, "%u\n", desc->ld_active_tgt_count); } -LPROC_SEQ_FOPS_RO(lov_activeobd); +LUSTRE_RO_ATTR(activeobd); -static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) +static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct lov_obd *lov; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lov_desc *desc = &dev->u.lov.desc; - LASSERT(dev != NULL); - lov = &dev->u.lov; - seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); - return 0; + return sprintf(buf, "%s\n", desc->ld_uuid.uuid); } -LPROC_SEQ_FOPS_RO(lov_desc_uuid); +LUSTRE_RO_ATTR(desc_uuid); +#ifdef CONFIG_PROC_FS static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; @@ -251,6 +235,7 @@ static void 
*lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) static int lov_tgt_seq_show(struct seq_file *p, void *v) { struct lov_tgt_desc *tgt = v; + seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid), tgt->ltd_active ? "" : "IN"); @@ -269,10 +254,6 @@ static int lov_target_seq_open(struct inode *inode, struct file *file) struct seq_file *seq; int rc; - rc = LPROCFS_ENTRY_CHECK(inode); - if (rc < 0) - return rc; - rc = seq_open(file, &lov_tgt_sops); if (rc) return rc; @@ -282,47 +263,13 @@ static int lov_target_seq_open(struct inode *inode, struct file *file) return 0; } -LPROC_SEQ_FOPS_RO_TYPE(lov, uuid); -LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal); -LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree); -LPROC_SEQ_FOPS_RO_TYPE(lov, blksize); -LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal); -LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree); -LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail); - struct lprocfs_vars lprocfs_lov_obd_vars[] = { - { .name = "uuid", - .fops = &lov_uuid_fops }, - { .name = "stripesize", - .fops = &lov_stripesize_fops }, - { .name = "stripeoffset", - .fops = &lov_stripeoffset_fops }, - { .name = "stripecount", - .fops = &lov_stripecount_fops }, - { .name = "stripetype", - .fops = &lov_stripetype_fops }, - { .name = "numobd", - .fops = &lov_numobd_fops }, - { .name = "activeobd", - .fops = &lov_activeobd_fops }, - { .name = "filestotal", - .fops = &lov_filestotal_fops }, - { .name = "filesfree", - .fops = &lov_filesfree_fops }, - { .name = "blocksize", - .fops = &lov_blksize_fops }, - { .name = "kbytestotal", - .fops = &lov_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &lov_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &lov_kbytesavail_fops }, - { .name = "desc_uuid", - .fops = &lov_desc_uuid_fops }, + { .name = "stripesize", + .fops = &lov_stripesize_fops }, { NULL } }; -const struct proc_ops lov_proc_target_fops = { +static const struct proc_ops lov_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lov_target_seq_open, .proc_read = seq_read, @@ -330,3 +277,68 @@ const struct proc_ops lov_proc_target_fops = { .proc_release = lprocfs_seq_release, }; #endif /* CONFIG_PROC_FS */ + +static struct attribute *lov_attrs[] = { + &lustre_attr_activeobd.attr, + &lustre_attr_numobd.attr, + &lustre_attr_desc_uuid.attr, + &lustre_attr_stripeoffset.attr, + &lustre_attr_stripetype.attr, + &lustre_attr_stripecount.attr, + NULL, +}; + +int lov_tunables_init(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + struct obd_type *type; +#endif + int rc; + + obd->obd_vars = lprocfs_lov_obd_vars; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + /* If this is true then both client (lov) and server + * (lod) are on the same node. The lod layer if loaded + * first will register the lov proc directory. In that + * case obd->obd_type->typ_procroot will be not set. + * Instead we use type->typ_procsym as the parent. 
+ */ + type = class_search_type(LUSTRE_LOD_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } +#endif + obd->obd_ktype.default_attrs = lov_attrs; + rc = lprocfs_obd_setup(obd, false); + if (rc) + GOTO(out, rc); + +#ifdef CONFIG_PROC_FS + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444, + &lov_proc_target_fops, obd); + if (rc) + CWARN("%s: Error adding the target_obd file : rc %d\n", + obd->obd_name, rc); + + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lov->lov_pool_proc_entry)) { + rc = PTR_ERR(lov->lov_pool_proc_entry); + CERROR("%s: error setting up debugfs for pools : rc %d\n", + obd->obd_name, rc); + lov->lov_pool_proc_entry = NULL; + } +#endif /* CONFIG_FS_PROC */ +out: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/Makefile b/drivers/staging/lustrefsx/lustre/mdc/Makefile index e13d6af6f9949..7c9329681bdf2 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/Makefile +++ b/drivers/staging/lustrefsx/lustre/mdc/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_LUSTREFSX_FS) += mdc.o mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o -mdc-y += mdc_changelog.o +mdc-y += mdc_changelog.o mdc_dev.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c index 57cd679138950..0c2e79a2a336d 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c +++ b/drivers/staging/lustrefsx/lustre/mdc/lproc_mdc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,127 +31,280 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include #include #include #include - +#include +#include #include "mdc_internal.h" -#ifdef CONFIG_PROC_FS -static int mdc_active_seq_show(struct seq_file *m, void *v) +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; LPROCFS_CLIMP_CHECK(dev); - seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + len = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); LPROCFS_CLIMP_EXIT(dev); - return 0; + return len; } -static ssize_t mdc_active_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - dev = ((struct seq_file *)file->private_data)->private; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - if (val < 0 || val > 1) - return -ERANGE; /* opposite senses */ if (dev->u.cli.cl_import->imp_deactive == val) rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); else - CDEBUG(D_CONFIG, "activate %llu: ignoring repeat request\n", + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", val); return count; } -LPROC_SEQ_FOPS(mdc_active); +LUSTRE_RW_ATTR(active); -static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - __u32 max; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + ssize_t len; + u32 max; max = obd_get_max_rpcs_in_flight(&dev->u.cli); - seq_printf(m, "%u\n", max); + len = sprintf(buf, "%u\n", max); - return 0; + return len; } -static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *dev; - __s64 val; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - dev = ((struct seq_file *)file->private_data)->private; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; - if (val < 0 || val > UINT_MAX) - return -ERANGE; - rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + + return count; +} +LUSTRE_RW_ATTR(max_rpcs_in_flight); + +static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 max; + + max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); + return sprintf(buf, "%hu\n", max); +} + +static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + u16 val; + int rc; + + rc = kstrtou16(buffer, 10, &val); if (rc) return rc; + rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; + return 
count; } -LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight); +LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); -static int mdc_max_mod_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +static int mdc_max_dirty_mb_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = m->private; - __u16 max; + struct client_obd *cli = &dev->u.cli; + unsigned long val; - max = obd_get_max_mod_rpcs_in_flight(&dev->u.cli); - seq_printf(m, "%hu\n", max); + spin_lock(&cli->cl_loi_list_lock); + val = PAGES_TO_MiB(cli->cl_dirty_max_pages); + spin_unlock(&cli->cl_loi_list_lock); + seq_printf(m, "%lu\n", val); return 0; } -static ssize_t mdc_max_mod_rpcs_in_flight_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t mdc_max_dirty_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { - struct obd_device *dev = - ((struct seq_file *)file->private_data)->private; - __s64 val; + struct seq_file *sfl = file->private_data; + struct obd_device *dev = sfl->private; + struct client_obd *cli = &dev->u.cli; + s64 pages_number; int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); if (rc) return rc; - if (val < 0 || val > USHRT_MAX) + /* MB -> pages */ + pages_number = round_up(pages_number, 1024 * 1024) >> PAGE_SHIFT; + if (pages_number <= 0 || + pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || + pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; - rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); + spin_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max_pages = pages_number; + osc_wake_cache_waiters(cli); + spin_unlock(&cli->cl_loi_list_lock); + + return count; +} +LPROC_SEQ_FOPS(mdc_max_dirty_mb); + +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + + return sprintf(buf, "%lld\n", od->od_contention_time); +} + +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); if (rc) - count = rc; + return rc; + + od->od_contention_time = val; return count; } -LPROC_SEQ_FOPS(mdc_max_mod_rpcs_in_flight); +LUSTRE_RW_ATTR(contention_seconds); -static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +LUSTRE_ATTR(mds_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static int mdc_cached_mb_seq_show(struct seq_file *m, void *v) { - struct obd_device *dev = seq->private; + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - PAGE_SHIFT; + + seq_printf(m, "used_mb: %ld\n" + "busy_cnt: %ld\n" + "reclaim: %llu\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy), + cli->cl_lru_reclaim); - return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); + return 0; } +/* shrink the number of caching pages to a specific number */ +static ssize_t +mdc_cached_mb_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *sfl = file->private_data; + struct obd_device *dev = sfl->private; + struct client_obd 
*cli = &dev->u.cli; + __s64 pages_number; + long rc; + char kernbuf[128]; + + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - + kernbuf; + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); + if (rc) + return rc; + + pages_number >>= PAGE_SHIFT; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) { + struct lu_env *env; + __u16 refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + (void)osc_lru_shrink(env, cli, rc, true); + cl_env_put(env, &refcheck); + } + } + + return count; +} +LPROC_SEQ_FOPS(mdc_cached_mb); + +static int mdc_unstable_stats_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + long pages; + int mb; + + pages = atomic_long_read(&cli->cl_unstable_count); + mb = (pages * PAGE_SIZE) >> 20; + + seq_printf(m, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", pages, mb); + return 0; +} +LPROC_SEQ_FOPS_RO(mdc_unstable_stats); + static ssize_t mdc_rpc_stats_seq_write(struct file *file, const char __user *buf, size_t len, loff_t *off) @@ -162,22 +315,174 @@ static ssize_t mdc_rpc_stats_seq_write(struct file *file, lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + return len; } + +static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); + + spin_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "\nread RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + 
seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + i, r, pct(r, read_tot), pct(read_cum, read_tot), w, + pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} LPROC_SEQ_FOPS(mdc_rpc_stats); -LPROC_SEQ_FOPS_WO_TYPE(mdc, ping); +static int mdc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timespec64 now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + ktime_get_real_ts64(&now); + + seq_printf(seq, "snapshot_time: %lld.%09lu (secs.nsecs)\n", + (s64)now.tv_sec, now.tv_nsec); + seq_printf(seq, "lockless_write_bytes\t\t%llu\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t%llu\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t%llu\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t mdc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} +LPROC_SEQ_FOPS(mdc_stats); + +static int mdc_dom_min_repsize_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + + seq_printf(m, "%u\n", dev->u.cli.cl_dom_min_inline_repsize); + + return 0; +} + +static ssize_t mdc_dom_min_repsize_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct obd_device *dev; + unsigned int val; + int rc; + + dev = ((struct seq_file *)file->private_data)->private; + rc = kstrtouint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + if (val > MDC_DOM_MAX_INLINE_REPSIZE) + return -ERANGE; + + dev->u.cli.cl_dom_min_inline_repsize = val; + return count; +} +LPROC_SEQ_FOPS(mdc_dom_min_repsize); -LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid); LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize); -LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal); -LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree); -LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail); -LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal); -LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree); LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); LPROC_SEQ_FOPS_RO_TYPE(mdc, state); LPROC_SEQ_FOPS_RW_TYPE(mdc, obd_max_pages_per_rpc); @@ -185,35 +490,16 @@ LPROC_SEQ_FOPS_RW_TYPE(mdc, import); LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); struct lprocfs_vars lprocfs_mdc_obd_vars[] = { - { .name = "uuid", - .fops = &mdc_uuid_fops }, - { .name = "ping", - .fops = &mdc_ping_fops, - .proc_mode = 0222 }, { .name = "connect_flags", .fops = &mdc_connect_flags_fops 
}, - { .name = "blocksize", - .fops = &mdc_blksize_fops }, - { .name = "kbytestotal", - .fops = &mdc_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &mdc_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &mdc_kbytesavail_fops }, - { .name = "filestotal", - .fops = &mdc_filestotal_fops }, - { .name = "filesfree", - .fops = &mdc_filesfree_fops }, { .name = "mds_server_uuid", .fops = &mdc_server_uuid_fops }, - { .name = "mds_conn_uuid", - .fops = &mdc_conn_uuid_fops }, - { .name = "max_pages_per_rpc", - .fops = &mdc_obd_max_pages_per_rpc_fops }, - { .name = "max_rpcs_in_flight", - .fops = &mdc_max_rpcs_in_flight_fops }, - { .name = "max_mod_rpcs_in_flight", - .fops = &mdc_max_mod_rpcs_in_flight_fops }, + { .name = "max_pages_per_rpc", + .fops = &mdc_obd_max_pages_per_rpc_fops }, + { .name = "max_dirty_mb", + .fops = &mdc_max_dirty_mb_fops }, + { .name = "mdc_cached_mb", + .fops = &mdc_cached_mb_fops }, { .name = "timeouts", .fops = &mdc_timeouts_fops }, { .name = "import", @@ -224,8 +510,53 @@ struct lprocfs_vars lprocfs_mdc_obd_vars[] = { .fops = &mdc_pinger_recov_fops }, { .name = "rpc_stats", .fops = &mdc_rpc_stats_fops }, - { .name = "active", - .fops = &mdc_active_fops }, + { .name = "unstable_stats", + .fops = &mdc_unstable_stats_fops }, + { .name = "mdc_stats", + .fops = &mdc_stats_fops }, + { .name = "mdc_dom_min_repsize", + .fops = &mdc_dom_min_repsize_fops }, { NULL } }; -#endif /* CONFIG_PROC_FS */ + +static struct attribute *mdc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_max_mod_rpcs_in_flight.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_mds_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + NULL, +}; + +int mdc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = mdc_attrs; + obd->obd_vars = lprocfs_mdc_obd_vars; + + rc = lprocfs_obd_setup(obd, false); + if (rc) + goto out_failed; +#ifdef CONFIG_PROC_FS + rc = lprocfs_alloc_md_stats(obd, 0); + if (rc) { + lprocfs_obd_cleanup(obd); + goto out_failed; + } +#endif + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) { +#ifdef CONFIG_PROC_FS + lprocfs_free_md_stats(obd); +#endif + lprocfs_obd_cleanup(obd); + goto out_failed; + } + ptlrpc_lprocfs_register_obd(obd); + +out_failed: + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c index c99a3bacf24d6..1c8eb65110500 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_changelog.c @@ -23,6 +23,8 @@ * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies * Alternatives. * + * Copyright (c) 2017, Intel Corporation. 
+ * * Author: Henri Doreau */ @@ -31,9 +33,11 @@ #include #include #include -#include +#include +#include #include +#include #include "mdc_internal.h" @@ -55,38 +59,44 @@ static LIST_HEAD(chlg_registered_devices); struct chlg_registered_dev { /* Device name of the form "changelog-{MDTNAME}" */ - char ced_name[32]; - /* Misc device descriptor */ - struct miscdevice ced_misc; + char ced_name[32]; + /* changelog char device */ + struct cdev ced_cdev; + struct device *ced_device; /* OBDs referencing this device (multiple mount point) */ - struct list_head ced_obds; + struct list_head ced_obds; /* Reference counter for proper deregistration */ - struct kref ced_refs; + struct kref ced_refs; /* Link within the global chlg_registered_devices */ - struct list_head ced_link; + struct list_head ced_link; }; struct chlg_reader_state { /* Shortcut to the corresponding OBD device */ - struct obd_device *crs_obd; + struct obd_device *crs_obd; + /* the corresponding chlg_registered_dev */ + struct chlg_registered_dev *crs_ced; /* Producer thread (if any) */ - struct task_struct *crs_prod_task; + struct task_struct *crs_prod_task; /* An error occurred that prevents from reading further */ - bool crs_err; + int crs_err; /* EOF, no more records available */ - bool crs_eof; + bool crs_eof; /* Desired start position */ - __u64 crs_start_offset; + __u64 crs_start_offset; /* Wait queue for the catalog processing thread */ - wait_queue_head_t crs_waitq_prod; + wait_queue_head_t crs_waitq_prod; /* Wait queue for the record copy threads */ - wait_queue_head_t crs_waitq_cons; + wait_queue_head_t crs_waitq_cons; /* Mutex protecting crs_rec_count and crs_rec_queue */ - struct mutex crs_lock; + struct mutex crs_lock; /* Number of item in the list */ - __u64 crs_rec_count; + __u64 crs_rec_count; /* List of prefetched enqueued_record::enq_linkage_items */ - struct list_head crs_rec_queue; + struct list_head crs_rec_queue; + unsigned int crs_last_catidx; + unsigned int crs_last_idx; + bool crs_poll; }; struct chlg_rec_entry { @@ -103,6 +113,81 @@ enum { CDEV_CHLG_MAX_PREFETCH = 1024, }; +static DEFINE_IDR(chlg_minor_idr); +static DEFINE_SPINLOCK(chlg_minor_lock); + +static int chlg_minor_alloc(int *pminor) +{ + void *minor_allocated = (void *)-1; + int minor; + + idr_preload(GFP_KERNEL); + spin_lock(&chlg_minor_lock); + minor = idr_alloc(&chlg_minor_idr, minor_allocated, 0, + MDC_CHANGELOG_DEV_COUNT, GFP_NOWAIT); + spin_unlock(&chlg_minor_lock); + idr_preload_end(); + + if (minor < 0) + return minor; + + *pminor = minor; + return 0; +} + +static void chlg_minor_free(int minor) +{ + spin_lock(&chlg_minor_lock); + idr_remove(&chlg_minor_idr, minor); + spin_unlock(&chlg_minor_lock); +} + +static void chlg_device_release(struct device *dev) +{ + struct chlg_registered_dev *entry = dev_get_drvdata(dev); + + chlg_minor_free(MINOR(entry->ced_cdev.dev)); + OBD_FREE_PTR(entry); +} + +/** + * Deregister a changelog character device whose refcount has reached zero. 
+ */ +static void chlg_dev_clear(struct kref *kref) +{ + struct chlg_registered_dev *entry; + + ENTRY; + entry = container_of(kref, struct chlg_registered_dev, + ced_refs); + + list_del(&entry->ced_link); + cdev_del(&entry->ced_cdev); + device_destroy(mdc_changelog_class, entry->ced_cdev.dev); + EXIT; +} + +static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev) +{ + struct obd_device *obd; + + mutex_lock(&chlg_registered_dev_lock); + if (list_empty(&dev->ced_obds)) + return NULL; + + obd = list_first_entry(&dev->ced_obds, struct obd_device, + u.cli.cl_chg_dev_linkage); + class_incref(obd, "changelog", dev); + mutex_unlock(&chlg_registered_dev_lock); + return obd; +} + +static inline void chlg_obd_put(struct chlg_registered_dev *dev, + struct obd_device *obd) +{ + class_decref(obd, "changelog", dev); +} + /** * ChangeLog catalog processing callback invoked on each record. * If the current record is eligible to userland delivery, push @@ -122,7 +207,6 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, struct llog_changelog_rec *rec; struct chlg_reader_state *crs = data; struct chlg_rec_entry *enq; - struct l_wait_info lwi = { 0 }; size_t len; int rc; ENTRY; @@ -132,6 +216,9 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, rec = container_of(hdr, struct llog_changelog_rec, cr_hdr); + crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx; + crs->crs_last_idx = hdr->lrh_index; + if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { rc = -EINVAL; CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n", @@ -152,9 +239,9 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); - l_wait_event(crs->crs_waitq_prod, - (crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || - kthread_should_stop()), &lwi); + wait_event_interruptible(crs->crs_waitq_prod, + crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH || + kthread_should_stop()); if (kthread_should_stop()) RETURN(LLOG_PROC_BREAK); @@ -197,13 +284,23 @@ static void enq_record_delete(struct chlg_rec_entry *rec) static int chlg_load(void *args) { struct chlg_reader_state *crs = args; - struct obd_device *obd = crs->crs_obd; + struct chlg_registered_dev *ced = crs->crs_ced; + struct obd_device *obd = NULL; struct llog_ctxt *ctx = NULL; struct llog_handle *llh = NULL; - struct l_wait_info lwi = { 0 }; int rc; ENTRY; + crs->crs_last_catidx = -1; + crs->crs_last_idx = 0; + +again: + obd = chlg_obd_get(ced); + if (obd == NULL) + RETURN(-ENODEV); + + crs->crs_obd = obd; + ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); if (ctx == NULL) GOTO(err_out, rc = -ENOENT); @@ -216,24 +313,41 @@ static int chlg_load(void *args) GOTO(err_out, rc); } - rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT|LLOG_F_EXT_JOBID, NULL); + + rc = llog_init_handle(NULL, llh, + LLOG_F_IS_CAT | + LLOG_F_EXT_JOBID | + LLOG_F_EXT_EXTRA_FLAGS | + LLOG_F_EXT_X_UIDGID | + LLOG_F_EXT_X_NID | + LLOG_F_EXT_X_OMODE | + LLOG_F_EXT_X_XATTR, + NULL); if (rc) { CERROR("%s: fail to init llog handle: rc = %d\n", obd->obd_name, rc); GOTO(err_out, rc); } - rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, 0, 0); + rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, + crs->crs_last_catidx, crs->crs_last_idx); if (rc < 0) { CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc); GOTO(err_out, rc); } + if (!kthread_should_stop() && crs->crs_poll) { + llog_cat_close(NULL, llh); + llog_ctxt_put(ctx); + class_decref(obd, 
"changelog", crs); + schedule_timeout_interruptible(HZ); + goto again; + } crs->crs_eof = true; err_out: if (rc < 0) - crs->crs_err = true; + crs->crs_err = rc; wake_up_all(&crs->crs_waitq_cons); @@ -243,7 +357,9 @@ static int chlg_load(void *args) if (ctx != NULL) llog_ctxt_put(ctx); - l_wait_event(crs->crs_waitq_prod, kthread_should_stop(), &lwi); + crs->crs_obd = NULL; + chlg_obd_put(ced, obd); + wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop()); RETURN(rc); } @@ -266,17 +382,22 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, struct chlg_reader_state *crs = file->private_data; struct chlg_rec_entry *rec; struct chlg_rec_entry *tmp; - struct l_wait_info lwi = { 0 }; - ssize_t written_total = 0; + size_t written_total = 0; + ssize_t rc; LIST_HEAD(consumed); ENTRY; - if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) - RETURN(-EAGAIN); + if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) { + if (crs->crs_err < 0) + RETURN(crs->crs_err); + else if (crs->crs_eof) + RETURN(0); + else + RETURN(-EAGAIN); + } - l_wait_event(crs->crs_waitq_cons, - crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err, - &lwi); + rc = wait_event_interruptible(crs->crs_waitq_cons, + crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err); mutex_lock(&crs->crs_lock); list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) { @@ -284,8 +405,7 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, break; if (copy_to_user(buff, rec->enq_record, rec->enq_length)) { - if (written_total == 0) - written_total = -EFAULT; + rc = -EFAULT; break; } @@ -299,15 +419,19 @@ static ssize_t chlg_read(struct file *file, char __user *buff, size_t count, } mutex_unlock(&crs->crs_lock); - if (written_total > 0) + if (written_total > 0) { + rc = written_total; wake_up_all(&crs->crs_waitq_prod); + } else if (rc == 0) { + rc = crs->crs_err; + } list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage) enq_record_delete(rec); *ppos = crs->crs_start_offset; - RETURN(written_total); + RETURN(rc); } /** @@ -392,15 +516,23 @@ static loff_t chlg_llseek(struct file *file, loff_t off, int whence) */ static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record) { - struct obd_device *obd = crs->crs_obd; + struct obd_device *obd = NULL; struct changelog_setinfo cs = { .cs_recno = record, .cs_id = reader }; + int rc; + + obd = chlg_obd_get(crs->crs_ced); + if (obd == NULL) + return -ENODEV; + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); - return obd_set_info_async(NULL, obd->obd_self_export, - strlen(KEY_CHANGELOG_CLEAR), - KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL); + chlg_obd_put(crs->crs_ced, obd); + return rc; } /** Maximum changelog control command size */ @@ -449,31 +581,6 @@ static ssize_t chlg_write(struct file *file, const char __user *buff, return rc < 0 ? rc : count; } -/** - * Find the OBD device associated to a changelog character device. - * @param[in] cdev character device instance descriptor - * @return corresponding OBD device or NULL if none was found. 
- */ -static struct obd_device *chlg_obd_get(dev_t cdev) -{ - int minor = MINOR(cdev); - struct obd_device *obd = NULL; - struct chlg_registered_dev *curr; - - mutex_lock(&chlg_registered_dev_lock); - list_for_each_entry(curr, &chlg_registered_devices, ced_link) { - if (curr->ced_misc.minor == minor) { - /* take the first available OBD device attached */ - obd = list_first_entry(&curr->ced_obds, - struct obd_device, - u.cli.cl_chg_dev_linkage); - break; - } - } - mutex_unlock(&chlg_registered_dev_lock); - return obd; -} - /** * Open handler, initialize internal CRS state and spawn prefetch thread if * needed. @@ -484,19 +591,19 @@ static struct obd_device *chlg_obd_get(dev_t cdev) static int chlg_open(struct inode *inode, struct file *file) { struct chlg_reader_state *crs; - struct obd_device *obd = chlg_obd_get(inode->i_rdev); + struct chlg_registered_dev *dev; struct task_struct *task; int rc; ENTRY; - if (!obd) - RETURN(-ENODEV); + dev = container_of(inode->i_cdev, struct chlg_registered_dev, ced_cdev); OBD_ALLOC_PTR(crs); if (!crs) RETURN(-ENOMEM); - crs->crs_obd = obd; + kref_get(&dev->ced_refs); + crs->crs_ced = dev; crs->crs_err = false; crs->crs_eof = false; @@ -510,7 +617,7 @@ static int chlg_open(struct inode *inode, struct file *file) if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: cannot start changelog thread: rc = %d\n", - obd->obd_name, rc); + dev->ced_name, rc); GOTO(err_crs, rc); } crs->crs_prod_task = task; @@ -520,6 +627,7 @@ static int chlg_open(struct inode *inode, struct file *file) RETURN(0); err_crs: + kref_put(&dev->ced_refs, chlg_dev_clear); OBD_FREE_PTR(crs); return rc; } @@ -536,15 +644,18 @@ static int chlg_release(struct inode *inode, struct file *file) struct chlg_reader_state *crs = file->private_data; struct chlg_rec_entry *rec; struct chlg_rec_entry *tmp; + int rc = 0; if (crs->crs_prod_task) - kthread_stop(crs->crs_prod_task); + rc = kthread_stop(crs->crs_prod_task); list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) enq_record_delete(rec); + kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear); OBD_FREE_PTR(crs); - return 0; + + return rc; } /** @@ -572,6 +683,23 @@ static unsigned int chlg_poll(struct file *file, poll_table *wait) return mask; } +static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc; + + struct chlg_reader_state *crs = file->private_data; + switch (cmd) { + case OBD_IOC_CHLG_POLL: + crs->crs_poll = !!arg; + rc = 0; + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + static const struct file_operations chlg_fops = { .owner = THIS_MODULE, .llseek = chlg_llseek, @@ -580,17 +708,18 @@ static const struct file_operations chlg_fops = { .open = chlg_open, .release = chlg_release, .poll = chlg_poll, + .unlocked_ioctl = chlg_ioctl, }; /** * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600" * and returns a name of the form: "changelog-testfs-MDT0000". 
*/ -static void get_chlg_name(char *name, size_t name_len, struct obd_device *obd) +static void get_target_name(char *name, size_t name_len, struct obd_device *obd) { int i; - snprintf(name, name_len, "changelog-%s", obd->obd_name); + snprintf(name, name_len, "%s", obd->obd_name); /* Find the 2nd '-' from the end and truncate on it */ for (i = 0; i < 2; i++) { @@ -652,18 +781,16 @@ int mdc_changelog_cdev_init(struct obd_device *obd) { struct chlg_registered_dev *exist; struct chlg_registered_dev *entry; - int rc; + struct device *device; + dev_t dev; + int minor, rc; ENTRY; OBD_ALLOC_PTR(entry); if (entry == NULL) RETURN(-ENOMEM); - get_chlg_name(entry->ced_name, sizeof(entry->ced_name), obd); - - entry->ced_misc.minor = MISC_DYNAMIC_MINOR; - entry->ced_misc.name = entry->ced_name; - entry->ced_misc.fops = &chlg_fops; + get_target_name(entry->ced_name, sizeof(entry->ced_name), obd); kref_init(&entry->ced_refs); INIT_LIST_HEAD(&entry->ced_obds); @@ -677,15 +804,41 @@ int mdc_changelog_cdev_init(struct obd_device *obd) GOTO(out_unlock, rc = 0); } + list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); + list_add_tail(&entry->ced_link, &chlg_registered_devices); + /* Register new character device */ - rc = misc_register(&entry->ced_misc); - if (rc != 0) + cdev_init(&entry->ced_cdev, &chlg_fops); + entry->ced_cdev.owner = THIS_MODULE; + + rc = chlg_minor_alloc(&minor); + if (rc) GOTO(out_unlock, rc); - list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds); - list_add_tail(&entry->ced_link, &chlg_registered_devices); + dev = MKDEV(MAJOR(mdc_changelog_dev), minor); + rc = cdev_add(&entry->ced_cdev, dev, 1); + if (rc) + GOTO(out_minor, rc); + + device = device_create(mdc_changelog_class, NULL, dev, entry, "%s-%s", + MDC_CHANGELOG_DEV_NAME, entry->ced_name); + if (IS_ERR(device)) + GOTO(out_cdev, rc = PTR_ERR(device)); + + device->release = chlg_device_release; + entry->ced_device = device; entry = NULL; /* prevent it from being freed below */ + GOTO(out_unlock, rc = 0); + +out_cdev: + cdev_del(&entry->ced_cdev); + +out_minor: + chlg_minor_free(minor); + + list_del_init(&obd->u.cli.cl_chg_dev_linkage); + list_del(&entry->ced_link); out_unlock: mutex_unlock(&chlg_registered_dev_lock); @@ -694,23 +847,6 @@ int mdc_changelog_cdev_init(struct obd_device *obd) RETURN(rc); } -/** - * Deregister a changelog character device whose refcount has reached zero. - */ -static void chlg_dev_clear(struct kref *kref) -{ - struct chlg_registered_dev *entry = container_of(kref, - struct chlg_registered_dev, - ced_refs); - ENTRY; - - LASSERT(mutex_is_locked(&chlg_registered_dev_lock)); - list_del(&entry->ced_link); - misc_deregister(&entry->ced_misc); - OBD_FREE_PTR(entry); - EXIT; -} - /** * Release OBD, decrease reference count of the corresponding changelog device. */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c new file mode 100644 index 0000000000000..3606778434879 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_dev.c @@ -0,0 +1,1564 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * Implementation of cl_device, cl_req for MDC layer. + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_MDC + +#include +#include + +#include "mdc_internal.h" + +static void mdc_lock_build_policy(const struct lu_env *env, + union ldlm_policy_data *policy) +{ + memset(policy, 0, sizeof *policy); + policy->l_inodebits.bits = MDS_INODELOCK_DOM; +} + +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + return osc_ldlm_glimpse_ast(dlmlock, data); +} + +static void mdc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *lock, + struct osc_object *osc, + struct ldlm_enqueue_info *einfo) +{ + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); + einfo->ei_cb_bl = mdc_ldlm_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; + einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ +} + +static void mdc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); + +static int mdc_set_dom_lock_data(struct ldlm_lock *lock, void *data) +{ + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + lock_res_and_lock(lock); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + unlock_res_and_lock(lock); + + return set; +} + +int mdc_dom_lock_match(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, + struct lustre_handle *lockh, int unref) +{ + struct obd_device *obd = exp->exp_obd; + __u64 lflags = *flags; + enum ldlm_mode rc; + + ENTRY; + + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, mode, lockh, unref); + if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) + RETURN(rc); + + if (obj != NULL) { + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + if (mdc_set_dom_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + mdc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { + ldlm_lock_decref(lockh, rc); + rc = 0; + } + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} + +/** + * Finds an existing lock covering a page with given index. + * Copy of osc_obj_dlmlock_at_pgoff() but for DoM IBITS lock. 
+ */ +struct ldlm_lock *mdc_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + enum osc_dap_flags dap_flags) +{ + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + struct lustre_handle lockh; + struct ldlm_lock *lock = NULL; + enum ldlm_mode mode; + __u64 flags; + + ENTRY; + + fid_build_reg_res_name(lu_object_fid(osc2lu(obj)), resname); + mdc_lock_build_policy(env, policy); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (dap_flags & OSC_DAP_FL_TEST_LOCK) + flags |= LDLM_FL_TEST_LOCK; + +again: + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + mode = mdc_dom_lock_match(env, osc_export(obj), resname, LDLM_IBITS, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, + dap_flags & OSC_DAP_FL_CANCELING); + if (mode != 0) { + lock = ldlm_handle2lock(&lockh); + /* RACE: the lock is cancelled so let's try again */ + if (unlikely(lock == NULL)) + goto again; + } + + RETURN(lock); +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int mdc_check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK); + if (tmp != NULL) { + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +static int mdc_lock_discard_pages(const struct lu_env *env, + struct osc_object *osc, + pgoff_t start, pgoff_t end, + bool discard) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int res; + int result; + + ENTRY; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = discard ? 
osc_discard_cb : mdc_check_and_discard_cb; + info->oti_fn_index = info->oti_next_index = start; + do { + res = osc_page_gang_lookup(env, io, osc, info->oti_next_index, + end, cb, (void *)osc); + if (info->oti_next_index > end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} + +static int mdc_lock_flush(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, enum cl_lock_mode mode, + bool discard) +{ + int result = 0; + int rc; + + ENTRY; + + if (mode == CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, start, end, 1, + discard); + CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", + obj, start, end, result, + discard ? "discarded" : "written back"); + if (result > 0) + result = 0; + } + + rc = mdc_lock_discard_pages(env, obj, start, end, discard); + if (result == 0 && rc < 0) + result = rc; + + RETURN(result); +} + +void mdc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; + int rc; + + LASSERT(ols->ols_dlmlock == NULL); + rc = mdc_lock_flush(env, osc, descr->cld_start, descr->cld_end, + descr->cld_mode, 0); + if (rc != 0) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, rc); + + osc_lock_wake_waiters(env, osc, ols); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int mdc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + int flag) +{ + struct cl_object *obj = NULL; + int result = 0; + bool discard; + enum cl_lock_mode mode = CLM_READ; + + ENTRY; + + LASSERT(flag == LDLM_CB_CANCELING); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { + dlmlock->l_ast_data = NULL; + unlock_res_and_lock(dlmlock); + RETURN(0); + } + + discard = ldlm_is_discard_data(dlmlock); + if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) + mode = CLM_WRITE; + + if (dlmlock->l_ast_data != NULL) { + obj = osc2cl(dlmlock->l_ast_data); + dlmlock->l_ast_data = NULL; + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + + /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or + * the object has been destroyed. */ + if (obj != NULL) { + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + /* Destroy pages covered by the extent of the DLM lock */ + result = mdc_lock_flush(env, cl2osc(obj), cl_index(obj, 0), + CL_PAGE_EOF, mode, discard); + /* Losing a lock, set KMS to 0. + * NB: assumed that DOM lock covers whole data on MDT. + */ + /* losing a lock, update kms */ + lock_res_and_lock(dlmlock); + cl_object_attr_lock(obj); + attr->cat_kms = 0; + cl_object_attr_update(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + unlock_res_and_lock(dlmlock); + cl_object_put(env, obj); + } + RETURN(result); +} + +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag) +{ + int rc = 0; + + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc == -ENODATA) + rc = 0; + break; + } + case LDLM_CB_CANCELING: { + struct lu_env *env; + __u16 refcheck; + + /* + * This can be called in the context of outer IO, e.g., + * + * osc_enqueue_base()->... 
+ * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer + * context. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + rc = PTR_ERR(env); + break; + } + + rc = mdc_dlm_blocking_ast0(env, dlmlock, flag); + cl_env_put(env, &refcheck); + break; + } + default: + LBUG(); + } + RETURN(rc); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +void mdc_lock_lvb_update(const struct lu_env *env, struct osc_object *osc, + struct ldlm_lock *dlmlock, struct ost_lvb *lvb) +{ + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | + CAT_SIZE; + unsigned int setkms = 0; + + ENTRY; + + if (lvb == NULL) { + LASSERT(dlmlock != NULL); + lvb = &dlmlock->l_ost_lvb; + } + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (dlmlock != NULL) { + __u64 size; + + check_res_locked(dlmlock->l_resource); + size = lvb->lvb_size; + + if (size >= oinfo->loi_kms) { + valid |= CAT_KMS; + attr->cat_kms = size; + setkms = 1; + } + } + + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + + cl_object_attr_update(env, obj, attr, valid); + cl_object_attr_unlock(obj); + EXIT; +} + +static void mdc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, + struct lustre_handle *lockh) +{ + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(lockh, 0); + LASSERT(dlmlock != NULL); + + /* lock reference taken by ldlm_handle2lock_long() is + * owned by osc_lock and released in osc_lock_detach() + */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); + oscl->ols_has_ref = 1; + + LASSERT(oscl->ols_dlmlock == NULL); + oscl->ols_dlmlock = dlmlock; + + /* This may be a matched lock for glimpse request, do not hold + * lock reference in that case. */ + if (!oscl->ols_glimpse) { + /* hold a refc for non glimpse lock which will + * be released in osc_lock_cancel() */ + lustre_handle_copy(&oscl->ols_handle, lockh); + ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); + oscl->ols_hold = 1; + } + + /* Lock must have been granted. */ + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; + + /* extend the lock extent, otherwise it will have problem when + * we decide whether to grant a lockless lock. 
*/ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, 0); + descr->cld_end = CL_PAGE_EOF; + + /* no lvb update for matched lock */ + if (!ldlm_is_lvb_cached(dlmlock)) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + LASSERT(osc == dlmlock->l_ast_data); + mdc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); + } + } + unlock_res_and_lock(dlmlock); + + LASSERT(oscl->ols_state != OLS_GRANTED); + oscl->ols_state = OLS_GRANTED; + EXIT; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. + */ +static int mdc_lock_upcall(void *cookie, struct lustre_handle *lockh, + int errcode) +{ + struct osc_lock *oscl = cookie; + struct cl_lock_slice *slice = &oscl->ols_cl; + struct lu_env *env; + int rc; + + ENTRY; + + env = cl_env_percpu_get(); + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LASSERT(!IS_ERR(env)); + + rc = ldlm_error2errno(errcode); + if (oscl->ols_state == OLS_ENQUEUED) { + oscl->ols_state = OLS_UPCALL_RECEIVED; + } else if (oscl->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", oscl->ols_state); + LBUG(); + } + + CDEBUG(D_INODE, "rc %d, err %d\n", rc, errcode); + if (rc == 0) + mdc_lock_granted(env, oscl, lockh); + + /* Error handling, some errors are tolerable. */ + if (oscl->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops != oscl->ols_lockless_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, oscl, 1); + oscl->ols_state = OLS_GRANTED; + rc = 0; + } else if (oscl->ols_glimpse && rc == -ENAVAIL) { + LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); + mdc_lock_lvb_update(env, cl2osc(slice->cls_obj), + NULL, &oscl->ols_lvb); + /* Hide the error. */ + rc = 0; + } + + if (oscl->ols_owner != NULL) + cl_sync_io_note(env, oscl->ols_owner, rc); + cl_env_percpu_put(env); + + RETURN(rc); +} + +int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb) +{ + struct mdt_body *body; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!body) + RETURN(-EPROTO); + + lvb->lvb_mtime = body->mbo_mtime; + lvb->lvb_atime = body->mbo_atime; + lvb->lvb_ctime = body->mbo_ctime; + lvb->lvb_blocks = body->mbo_dom_blocks; + lvb->lvb_size = body->mbo_dom_size; + + RETURN(0); +} + +int mdc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, int errcode) +{ + struct osc_lock *ols = cookie; + struct ldlm_lock *lock; + int rc = 0; + + ENTRY; + + /* The request was created before ldlm_cli_enqueue call. */ + if (errcode == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + + rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(rep != NULL); + + rep->lock_policy_res2 = + ptlrpc_status_ntoh(rep->lock_policy_res2); + if (rep->lock_policy_res2) + errcode = rep->lock_policy_res2; + + rc = mdc_fill_lvb(req, &ols->ols_lvb); + *flags |= LDLM_FL_LVB_READY; + } else if (errcode == ELDLM_OK) { + /* Callers have references, should be valid always */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock); + + rc = mdc_fill_lvb(req, &lock->l_ost_lvb); + LDLM_LOCK_PUT(lock); + *flags |= LDLM_FL_LVB_READY; + } + + /* Call the update callback. 
*/ + rc = (*upcall)(cookie, lockh, rc < 0 ? rc : errcode); + + /* release the reference taken in ldlm_cli_enqueue() */ + if (errcode == ELDLM_LOCK_MATCHED) + errcode = ELDLM_OK; + if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) + ldlm_lock_decref(lockh, mode); + + RETURN(rc); +} + +int mdc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle *lockh = &aa->oa_lockh; + enum ldlm_mode mode = aa->oa_mode; + + ENTRY; + + LASSERT(!aa->oa_speculative); + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(lockh); + LASSERTF(lock != NULL, + "lockh %#llx, req %p, aa %p - client evicted?\n", + lockh->cookie, req, aa); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(lockh, mode); + + /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, + aa->oa_mode, aa->oa_flags, NULL, 0, + lockh, rc); + /* Complete mdc stuff. */ + rc = mdc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, + aa->oa_flags, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int mdc_enqueue_send(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, __u64 *flags, + union ldlm_policy_data *policy, + struct ost_lvb *lvb, int kms_valid, + osc_enqueue_upcall_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, int async) +{ + struct obd_device *obd = exp->exp_obd; + struct lustre_handle lockh = { 0 }; + struct ptlrpc_request *req = NULL; + struct ldlm_intent *lit; + enum ldlm_mode mode; + bool glimpse = *flags & LDLM_FL_HAS_INTENT; + __u64 match_flags = *flags; + int rc; + + ENTRY; + + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + + if (glimpse) + match_flags |= LDLM_FL_BLOCK_GRANTED; + /* DOM locking uses LDLM_FL_KMS_IGNORE to mark locks wich have no valid + * LVB information, e.g. canceled locks or locks of just pruned object, + * such locks should be skipped. + */ + mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, + einfo->ei_type, policy, mode, &lockh, 0); + if (mode) { + struct ldlm_lock *matched; + + if (*flags & LDLM_FL_TEST_LOCK) + RETURN(ELDLM_OK); + + matched = ldlm_handle2lock(&lockh); + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GLIMPSE_DDOS)) + ldlm_set_kms_ignore(matched); + + if (mdc_set_dom_lock_data(matched, einfo->ei_cbdata)) { + *flags |= LDLM_FL_LVB_READY; + + /* We already have a lock, and it's referenced. 
*/ + (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); + + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } + ldlm_lock_decref(&lockh, mode); + LDLM_LOCK_PUT(matched); + } + + if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) + RETURN(-ENOLCK); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_INTENT); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = glimpse ? IT_GLIMPSE : IT_BRW; + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* users of mdc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + /* All MDC IO locks are intents */ + *flags |= LDLM_FL_HAS_INTENT; + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, NULL, + 0, LVB_T_NONE, &lockh, async); + if (async) { + if (!rc) { + struct osc_enqueue_args *aa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; + lustre_handle_copy(&aa->oa_lockh, &lockh); + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = false; + aa->oa_flags = flags; + aa->oa_lvb = lvb; + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)mdc_enqueue_interpret; + ptlrpcd_add_req(req); + } else { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = mdc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, + flags, rc); + ptlrpc_req_finished(req); + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. 
+ */ +static int mdc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, struct cl_sync_io *anchor) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(slice->cls_obj); + struct osc_lock *oscl = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + struct ldlm_res_id *resname = &info->oti_resname; + union ldlm_policy_data *policy = &info->oti_policy; + osc_enqueue_upcall_f upcall = mdc_lock_upcall; + void *cookie = (void *)oscl; + bool async = false; + int result; + + ENTRY; + + LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, oscl); + + if (oscl->ols_state == OLS_GRANTED) + RETURN(0); + + /* Lockahead is not supported on MDT yet */ + if (oscl->ols_flags & LDLM_FL_NO_EXPANSION) { + result = -EOPNOTSUPP; + RETURN(result); + } + + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) + GOTO(enqueue_base, 0); + + if (oscl->ols_glimpse) { + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); + async = true; + GOTO(enqueue_base, 0); + } + + result = osc_lock_enqueue_wait(env, osc, oscl); + if (result < 0) + GOTO(out, result); + + /* we can grant lockless lock right after all conflicting locks + * are canceled. */ + if (osc_lock_is_lockless(oscl)) { + oscl->ols_state = OLS_GRANTED; + oio->oi_lockless = 1; + RETURN(0); + } + +enqueue_base: + oscl->ols_state = OLS_ENQUEUED; + if (anchor != NULL) { + atomic_inc(&anchor->csi_sync_nr); + oscl->ols_owner = anchor; + } + + /** + * DLM lock's ast data must be osc_object; + * DLM's enqueue callback set to osc_lock_upcall() with cookie as + * osc_lock. + */ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + mdc_lock_build_policy(env, policy); + LASSERT(!oscl->ols_speculative); + result = mdc_enqueue_send(env, osc_export(osc), resname, + &oscl->ols_flags, policy, + &oscl->ols_lvb, osc->oo_oinfo->loi_kms_valid, + upcall, cookie, &oscl->ols_einfo, async); + if (result == 0) { + if (osc_lock_is_lockless(oscl)) { + oio->oi_lockless = 1; + } else if (!async) { + LASSERT(oscl->ols_state == OLS_GRANTED); + LASSERT(oscl->ols_hold); + LASSERT(oscl->ols_dlmlock != NULL); + } + } +out: + if (result < 0) { + oscl->ols_state = OLS_CANCELLED; + osc_lock_wake_waiters(env, osc, oscl); + + if (anchor != NULL) + cl_sync_io_note(env, anchor, result); + } + RETURN(result); +} + +static const struct cl_lock_operations mdc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = mdc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +static const struct cl_lock_operations mdc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = mdc_lock_enqueue, + .clo_cancel = osc_lock_cancel, + .clo_print = osc_lock_print, +}; + +int mdc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct osc_lock *ols; + __u32 enqflags = lock->cll_descr.cld_enq_flags; + __u64 flags = osc_enq2ldlm_flags(enqflags); + + ENTRY; + + /* Ignore AGL for Data-on-MDT, stat returns size data */ + if ((enqflags & CEF_SPECULATIVE) != 0) + RETURN(0); + + OBD_SLAB_ALLOC_PTR_GFP(ols, osc_lock_kmem, GFP_NOFS); + if (unlikely(ols == NULL)) + RETURN(-ENOMEM); + + ols->ols_state = OLS_NEW; + spin_lock_init(&ols->ols_lock); + INIT_LIST_HEAD(&ols->ols_waiting_list); + INIT_LIST_HEAD(&ols->ols_wait_entry); + INIT_LIST_HEAD(&ols->ols_nextlock_oscobj); + ols->ols_lockless_ops = &mdc_lock_lockless_ops; + + ols->ols_flags 
= flags; + ols->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + + if (ols->ols_flags & LDLM_FL_HAS_INTENT) { + ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; + ols->ols_glimpse = 1; + } + mdc_lock_build_einfo(env, lock, cl2osc(obj), &ols->ols_einfo); + + cl_lock_slice_add(lock, &ols->ols_cl, obj, &mdc_lock_ops); + + if (!(enqflags & CEF_MUST)) + osc_lock_to_lockless(env, ols, (enqflags & CEF_NEVER)); + if (ols->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + ols->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) + osc_lock_set_writer(env, io, obj, ols); + + LDLM_DEBUG_NOLOCK("lock %p, mdc lock %p, flags %llx\n", + lock, ols, ols->ols_flags); + RETURN(0); +} + +/** + * IO operations. + * + * An implementation of cl_io_operations specific methods for MDC layer. + * + */ +static int mdc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, + pgoff_t index, struct lustre_handle *lh) +{ + struct ldlm_lock *lock; + + /* find DOM lock protecting object */ + lock = mdc_dlmlock_at_pgoff(env, osc, index, + OSC_DAP_FL_TEST_LOCK | + OSC_DAP_FL_CANCELING); + if (lock == NULL) { + struct ldlm_resource *res; + struct ldlm_res_id *resname; + + resname = &osc_env_info(env)->oti_resname; + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); + res = ldlm_resource_get(osc_export(osc)->exp_obd->obd_namespace, + NULL, resname, LDLM_IBITS, 0); + ldlm_resource_dump(D_ERROR, res); + libcfs_debug_dumpstack(NULL); + return -ENOENT; + } else { + *lh = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } + return 0; +} + +static int mdc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int rc; + + /* silently ignore non-truncate setattr for Data-on-MDT object */ + if (cl_io_is_trunc(io)) { + /* truncate cache dirty pages first */ + rc = osc_cache_truncate_start(env, cl2osc(obj), size, + &oio->oi_trunc); + if (rc < 0) + return rc; + } + + if (oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + if (rc == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_avalid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_avalid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_xvalid & OP_XVALID_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + rc = cl_object_attr_update(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + if (rc < 0) + return rc; + } + + if (!(ia_avalid & ATTR_SIZE)) + return 0; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + + oa->o_size = size; + 
oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS; + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } else { + rc = mdc_get_lock_handle(env, cl2osc(obj), CL_PAGE_EOF, + &oa->o_handle); + if (!rc) + oa->o_valid |= OBD_MD_FLHANDLE; + } + + init_completion(&cbargs->opc_sync); + + rc = osc_punch_send(osc_export(cl2osc(obj)), oa, + mdc_async_upcall, cbargs); + cbargs->opc_rpc_sent = rc == 0; + return rc; +} + +static int mdc_io_read_ahead(const struct lu_env *env, + const struct cl_io_slice *ios, + pgoff_t start, struct cl_read_ahead *ra) +{ + struct osc_object *osc = cl2osc(ios->cis_obj); + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = mdc_dlmlock_at_pgoff(env, osc, start, 0); + if (dlmlock == NULL) + RETURN(-ENODATA); + + if (dlmlock->l_req_mode != LCK_PR) { + struct lustre_handle lockh; + + ldlm_lock2handle(dlmlock, &lockh); + ldlm_lock_addref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, dlmlock->l_req_mode); + } + + ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; + ra->cra_end = CL_PAGE_EOF; + ra->cra_release = osc_read_ahead_release; + ra->cra_cbdata = dlmlock; + + RETURN(0); +} + +int mdc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + int result = 0; + + ENTRY; + + /* a MDC lock always covers whole object, do sync for whole + * possible range despite of supplied start/end values. + */ + result = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF); + if (result == 0) + result = rc; + /* Use OSC sync code because it is asynchronous. + * It is to be added into MDC and avoid the using of + * OST_SYNC at both MDC and MDT. 
+ */ + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +struct mdc_data_version_args { + struct osc_io *dva_oio; +}; + +static int +mdc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *arg, int rc) +{ + struct mdc_data_version_args *dva = arg; + struct osc_io *oio = dva->dva_oio; + const struct mdt_body *body; + + ENTRY; + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* Prepare OBDO from mdt_body for CLIO */ + oio->oi_oa.o_valid = body->mbo_valid; + oio->oi_oa.o_flags = body->mbo_flags; + oio->oi_oa.o_data_version = body->mbo_version; + oio->oi_oa.o_layout_version = body->mbo_layout_gen; + EXIT; +out: + oio->oi_cbarg.opc_rc = rc; + complete(&oio->oi_cbarg.opc_sync); + return 0; +} + +static int mdc_io_data_version_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + struct osc_object *obj = cl2osc(slice->cis_obj); + struct obd_export *exp = osc_export(obj); + struct ptlrpc_request *req; + struct mdt_body *body; + struct mdc_data_version_args *dva; + int rc; + + ENTRY; + + memset(&oio->oi_oa, 0, sizeof(oio->oi_oa)); + oio->oi_oa.o_oi.oi_fid = *lu_object_fid(osc2lu(obj)); + oio->oi_oa.o_valid = OBD_MD_FLID; + + init_completion(&cbargs->opc_sync); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + body->mbo_fid1 = *lu_object_fid(osc2lu(obj)); + body->mbo_valid = OBD_MD_FLID; + /* Indicate that data version is needed */ + body->mbo_valid |= OBD_MD_FLDATAVERSION; + body->mbo_flags = 0; + + if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { + body->mbo_valid |= OBD_MD_FLFLAGS; + body->mbo_flags |= OBD_FL_SRVLOCK; + if (dv->dv_flags & LL_DV_WR_FLUSH) + body->mbo_flags |= OBD_FL_FLUSH; + } + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = mdc_data_version_interpret; + CLASSERT(sizeof(*dva) <= sizeof(req->rq_async_args)); + dva = ptlrpc_req_async_args(req); + dva->dva_oio = oio; + + ptlrpcd_add_req(req); + + RETURN(0); +} + +static void mdc_io_data_version_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + ENTRY; + wait_for_completion(&cbargs->opc_sync); + + if (cbargs->opc_rc != 0) { + slice->cis_io->ci_result = cbargs->opc_rc; + } else { + slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; + } + + EXIT; +} + +static struct cl_io_operations mdc_io_ops = { + .op = { + [CIT_READ] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = 
osc_io_iter_fini, + .cio_start = osc_io_read_start, + }, + [CIT_WRITE] = { + .cio_iter_init = osc_io_write_iter_init, + .cio_iter_fini = osc_io_write_iter_fini, + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + }, + [CIT_SETATTR] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = mdc_io_setattr_start, + .cio_end = osc_io_setattr_end, + }, + [CIT_DATA_VERSION] = { + .cio_start = mdc_io_data_version_start, + .cio_end = mdc_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_iter_init = osc_io_iter_init, + .cio_iter_fini = osc_io_iter_fini, + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + }, + [CIT_FSYNC] = { + .cio_start = mdc_io_fsync_start, + .cio_end = osc_io_fsync_end, + }, + }, + .cio_read_ahead = mdc_io_read_ahead, + .cio_submit = osc_io_submit, + .cio_commit_async = osc_io_commit_async, +}; + +int mdc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &mdc_io_ops); + return 0; +} + +static void mdc_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) +{ + fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for MDC + * layer. MDC is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void mdc_req_attr_set(const struct lu_env *env, struct cl_object *obj, + struct cl_req_attr *attr) +{ + u64 flags = attr->cra_flags; + + /* Copy object FID to cl_attr */ + attr->cra_oa->o_oi.oi_fid = *lu_object_fid(&obj->co_lu); + + if (flags & OBD_MD_FLGROUP) + attr->cra_oa->o_valid |= OBD_MD_FLGROUP; + + if (flags & OBD_MD_FLID) + attr->cra_oa->o_valid |= OBD_MD_FLID; + + if (flags & OBD_MD_FLHANDLE) { + struct osc_page *opg; + + opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); + if (!opg->ops_srvlock) { + int rc; + + rc = mdc_get_lock_handle(env, cl2osc(obj), + osc_index(opg), + &attr->cra_oa->o_handle); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, + "uncovered page!\n"); + LBUG(); + } else { + attr->cra_oa->o_valid |= OBD_MD_FLHANDLE; + } + } + } +} + +static int mdc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + if (OST_LVB_IS_ERR(oinfo->loi_lvb.lvb_blocks)) + return OST_LVB_GET_ERR(oinfo->loi_lvb.lvb_blocks); + + return osc_attr_get(env, obj, attr); +} + +static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data) +{ + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = &lock->l_ost_lvb; + struct lov_oinfo *oinfo; + ENTRY; + + if (lock->l_ast_data == data) { + lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } + 
RETURN(LDLM_ITER_CONTINUE); +} + +int mdc_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + + /* DLM locks don't hold a reference of osc_object so we have to + * clear it before the object is being destroyed. */ + osc_build_res_name(osc, resname); + ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, + mdc_object_ast_clear, osc); + return 0; +} + +static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + /* if lock cancel is initiated from llite then it is combined + * lock with DOM bit and it may have no l_ast_data initialized yet, + * so init it here with given osc_object. + */ + mdc_set_dom_lock_data(lock, cl2osc(obj)); + RETURN(mdc_dlm_blocking_ast0(env, lock, LDLM_CB_CANCELING)); +} + +static const struct cl_object_operations mdc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = mdc_lock_init, + .coo_io_init = mdc_io_init, + .coo_attr_get = mdc_attr_get, + .coo_attr_update = osc_attr_update, + .coo_glimpse = osc_object_glimpse, + .coo_req_attr_set = mdc_req_attr_set, + .coo_prune = mdc_object_prune, + .coo_object_flush = mdc_object_flush +}; + +static const struct osc_object_operations mdc_object_ops = { + .oto_build_res_name = mdc_build_res_name, + .oto_dlmlock_at_pgoff = mdc_dlmlock_at_pgoff, +}; + +static int mdc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + + if (osc->oo_initialized) + return 0; + + osc->oo_initialized = true; + + return osc_object_init(env, obj, conf); +} + +static void mdc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + osc_object_free(env, obj); +} + +static const struct lu_object_operations mdc_lu_obj_ops = { + .loo_object_init = mdc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = mdc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *mdc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &mdc_ops; + obj->lo_ops = &mdc_lu_obj_ops; + osc->oo_obj_ops = &mdc_object_ops; + osc->oo_initialized = false; + } else { + obj = NULL; + } + return obj; +} + +static int mdc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + return mdc_process_config(d->ld_obd, 0, cfg); +} + +const struct lu_device_operations mdc_lu_ops = { + .ldo_object_alloc = mdc_object_alloc, + .ldo_process_config = mdc_cl_process_config, + .ldo_recovery_complete = NULL, +}; + +static struct lu_device *mdc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &mdc_lu_ops; + + /* Setup MDC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + if (obd == NULL) + RETURN(ERR_PTR(-ENODEV)); + + rc = mdc_setup(obd, cfg); + if (rc < 0) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; 
+ RETURN(d); +} + +static const struct lu_device_type_operations mdc_device_type_ops = { + .ldto_device_alloc = mdc_device_alloc, + .ldto_device_free = osc_device_free, + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type mdc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_MDC_NAME, + .ldt_ops = &mdc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h index 98773524caee9..c0df4152bf80f 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,9 +35,7 @@ #include -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_mdc_obd_vars[]; -#endif +int mdc_tunables_init(struct obd_device *obd); void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, u64 valid, size_t ea_size, u32 suppgid, u32 flags); @@ -58,6 +56,7 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, const void *secctx, size_t secctx_size); +void mdc_file_sepol_pack(struct ptlrpc_request *req); void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data); @@ -65,6 +64,8 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, size_t oldlen, const char *new, size_t newlen); +void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *name, size_t namelen); void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); /* mdc/mdc_locks.c */ @@ -95,6 +96,8 @@ int mdc_save_lovea(struct ptlrpc_request *req, /* mdc/mdc_request.c */ int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg); +int mdc_process_config(struct obd_device *obd, size_t len, void *buf); struct obd_client_handle; @@ -127,6 +130,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, void *ea, size_t ealen, struct ptlrpc_request **request); int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *data); int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, union ldlm_policy_data *policy, enum ldlm_mode mode, enum ldlm_cancel_flags flags, void *opaque); @@ -143,6 +147,11 @@ enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, enum ldlm_mode mode, struct lustre_handle *lockh); +#define MDC_CHANGELOG_DEV_COUNT LMV_MAX_STRIPE_COUNT +#define MDC_CHANGELOG_DEV_NAME "changelog" +extern struct class *mdc_changelog_class; +extern dev_t mdc_changelog_dev; + int mdc_changelog_cdev_init(struct obd_device *obd); void mdc_changelog_cdev_finish(struct obd_device *obd); @@ -163,4 +172,15 @@ static inline unsigned long hash_x_index(__u64 hash, int hash64) return ~0UL - 
(hash + !hash); } +/* mdc_dev.c */ +extern struct lu_device_type mdc_device_type; +int mdc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, int flag); +int mdc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data); +int mdc_fill_lvb(struct ptlrpc_request *req, struct ost_lvb *lvb); + +/* the minimum inline repsize should be PAGE_SIZE at least */ +#define MDC_DOM_DEF_INLINE_REPSIZE max(8192UL, PAGE_SIZE) +#define MDC_DOM_MAX_INLINE_REPSIZE XATTR_SIZE_MAX + #endif diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c index c93ec985f6581..dcc42508aca98 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_lib.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,6 @@ # include #endif #include -#include #include #include #include @@ -148,6 +147,22 @@ void mdc_file_secctx_pack(struct ptlrpc_request *req, const char *secctx_name, memcpy(buf, secctx, buf_size); } +void mdc_file_sepol_pack(struct ptlrpc_request *req) +{ + void *buf; + size_t buf_size; + + if (strlen(req->rq_sepol) == 0) + return; + + buf = req_capsule_client_get(&req->rq_pill, &RMF_SELINUX_POL); + buf_size = req_capsule_get_size(&req->rq_pill, &RMF_SELINUX_POL, + RCL_CLIENT); + + LASSERT(buf_size == strlen(req->rq_sepol) + 1); + snprintf(buf, strlen(req->rq_sepol) + 1, "%s", req->rq_sepol); +} + void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, const struct lu_fid *fid) { @@ -166,9 +181,9 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const void *data, size_t datalen, umode_t mode, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev) { - struct mdt_rec_create *rec; - char *tmp; - __u64 flags; + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); @@ -201,13 +216,19 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, mdc_file_secctx_pack(req, op_data->op_file_secctx_name, op_data->op_file_secctx, op_data->op_file_secctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } static inline __u64 mds_pack_open_flags(__u64 flags) { - __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | - MDS_OPEN_FL_INTERNAL)); + __u64 cr_flags = (flags & MDS_OPEN_FL_INTERNAL); + if (flags & FMODE_READ) + cr_flags |= MDS_FMODE_READ; + if (flags & FMODE_WRITE) + cr_flags |= MDS_FMODE_WRITE; if (flags & O_CREAT) cr_flags |= MDS_OPEN_CREAT; if (flags & O_EXCL) @@ -261,7 +282,7 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->cr_suppgid1 = op_data->op_suppgids[0]; rec->cr_suppgid2 = op_data->op_suppgids[1]; rec->cr_bias = op_data->op_bias; - rec->cr_old_handle = op_data->op_handle; + rec->cr_open_handle_old = op_data->op_open_handle; if (op_data->op_name) { mdc_pack_name(req, &RMF_NAME, op_data->op_name, @@ -274,6 +295,9 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, mdc_file_secctx_pack(req, op_data->op_file_secctx_name, op_data->op_file_secctx, op_data->op_file_secctx_size); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } 
if (lmm) { @@ -284,8 +308,9 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, set_mrc_cr_flags(rec, cr_flags); } -static inline __u64 attr_pack(unsigned int ia_valid) { - __u64 sa_valid = 0; +static inline u64 attr_pack(unsigned int ia_valid, enum op_xvalid ia_xvalid) +{ + u64 sa_valid = 0; if (ia_valid & ATTR_MODE) sa_valid |= MDS_ATTR_MODE; @@ -307,23 +332,27 @@ static inline __u64 attr_pack(unsigned int ia_valid) { sa_valid |= MDS_ATTR_MTIME_SET; if (ia_valid & ATTR_FORCE) sa_valid |= MDS_ATTR_FORCE; - if (ia_valid & ATTR_ATTR_FLAG) - sa_valid |= MDS_ATTR_ATTR_FLAG; - if (ia_valid & ATTR_KILL_SUID) - sa_valid |= MDS_ATTR_KILL_SUID; - if (ia_valid & ATTR_KILL_SGID) - sa_valid |= MDS_ATTR_KILL_SGID; - if (ia_valid & ATTR_CTIME_SET) - sa_valid |= MDS_ATTR_CTIME_SET; - if (ia_valid & ATTR_FROM_OPEN) - sa_valid |= MDS_ATTR_FROM_OPEN; - if (ia_valid & ATTR_BLOCKS) - sa_valid |= MDS_ATTR_BLOCKS; - if (ia_valid & MDS_OPEN_OWNEROVERRIDE) - /* NFSD hack (see bug 5781) */ - sa_valid |= MDS_OPEN_OWNEROVERRIDE; - if (ia_valid & MDS_ATTR_PROJID) + if (ia_xvalid & OP_XVALID_FLAGS) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_xvalid & OP_XVALID_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_xvalid & OP_XVALID_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_xvalid & OP_XVALID_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + if (ia_xvalid & OP_XVALID_PROJID) sa_valid |= MDS_ATTR_PROJID; + if (ia_xvalid & OP_XVALID_LAZYSIZE) + sa_valid |= MDS_ATTR_LSIZE; + if (ia_xvalid & OP_XVALID_LAZYBLOCKS) + sa_valid |= MDS_ATTR_LBLOCKS; return sa_valid; } @@ -337,7 +366,8 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, rec->sa_suppgid = -1; rec->sa_fid = op_data->op_fid1; - rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid, + op_data->op_xvalid); rec->sa_mode = op_data->op_attr.ia_mode; rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); @@ -361,7 +391,7 @@ static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, struct md_op_data *op_data) { - epoch->mio_handle = op_data->op_handle; + epoch->mio_open_handle = op_data->op_open_handle; epoch->mio_unused1 = 0; epoch->mio_unused2 = 0; epoch->mio_padding = 0; @@ -414,6 +444,9 @@ void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->ul_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) @@ -436,17 +469,19 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->lk_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); } -static void mdc_intent_close_pack(struct ptlrpc_request *req, +static void mdc_close_intent_pack(struct ptlrpc_request *req, struct md_op_data *op_data) { struct close_data *data; struct ldlm_lock *lock; enum mds_op_bias bias = op_data->op_bias; - if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | - MDS_RENAME_MIGRATE))) + if 
(!(bias & (MDS_CLOSE_INTENT | MDS_CLOSE_MIGRATE))) return; data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); @@ -461,44 +496,90 @@ static void mdc_intent_close_pack(struct ptlrpc_request *req, data->cd_data_version = op_data->op_data_version; data->cd_fid = op_data->op_fid2; + + if (bias & MDS_CLOSE_LAYOUT_SPLIT) { + data->cd_mirror_id = op_data->op_mirror_id; + } else if (bias & MDS_CLOSE_RESYNC_DONE) { + struct close_data_resync_done *sync = &data->cd_resync; + + CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved)); + sync->resync_count = op_data->op_data_size / sizeof(__u32); + if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + memcpy(sync->resync_ids_inline, op_data->op_data, + op_data->op_data_size); + } else { + size_t count = sync->resync_count; + + memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32), + op_data->op_data, count * sizeof(__u32)); + } + } } void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, size_t oldlen, const char *new, size_t newlen) { - struct mdt_rec_rename *rec; + struct mdt_rec_rename *rec; - CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - /* XXX do something about time, uid, gid */ - rec->rn_opcode = op_data->op_cli_flags & CLI_MIGRATE ? - REINT_MIGRATE : REINT_RENAME; - rec->rn_fsuid = op_data->op_fsuid; - rec->rn_fsgid = op_data->op_fsgid; - rec->rn_cap = op_data->op_cap; - rec->rn_suppgid1 = op_data->op_suppgids[0]; - rec->rn_suppgid2 = op_data->op_suppgids[1]; - rec->rn_fid1 = op_data->op_fid1; - rec->rn_fid2 = op_data->op_fid2; - rec->rn_time = op_data->op_mod_time; - rec->rn_mode = op_data->op_mode; - rec->rn_bias = op_data->op_bias; + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; mdc_pack_name(req, &RMF_NAME, old, oldlen); if (new != NULL) mdc_pack_name(req, &RMF_SYMTGT, new, newlen); - if (op_data->op_cli_flags & CLI_MIGRATE && - op_data->op_bias & MDS_RENAME_MIGRATE) { + /* pack SELinux policy info if any */ + mdc_file_sepol_pack(req); +} + +void mdc_migrate_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *name, size_t namelen) +{ + struct mdt_rec_rename *rec; + char *ea; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + rec->rn_opcode = REINT_MIGRATE; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid4; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_name(req, &RMF_NAME, name, namelen); + + if (op_data->op_bias & MDS_CLOSE_MIGRATE) { struct mdt_ioepoch *epoch; - mdc_intent_close_pack(req, op_data); + mdc_close_intent_pack(req, op_data); epoch = 
req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); mdc_ioepoch_pack(epoch, op_data); } + + ea = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(ea, op_data->op_data, op_data->op_data_size); } void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, @@ -508,8 +589,6 @@ void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags, &RMF_MDT_BODY); b->mbo_valid = valid; - if (op_data->op_bias & MDS_CHECK_SPLIT) - b->mbo_valid |= OBD_MD_FLCKSPLIT; if (op_data->op_bias & MDS_CROSS_REF) b->mbo_valid |= OBD_MD_FLCROSSREF; b->mbo_eadatasize = ea_size; @@ -547,5 +626,5 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->sa_valid &= ~MDS_ATTR_ATIME; mdc_ioepoch_pack(epoch, op_data); - mdc_intent_close_pack(req, op_data); + mdc_close_intent_pack(req, op_data); } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c index cb809c2ce4b89..1c1e54b87590f 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +43,7 @@ #include #include #include +#include #include "mdc_internal.h" @@ -244,7 +245,7 @@ int mdc_save_lovea(struct ptlrpc_request *req, static struct ptlrpc_request * mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, - struct md_op_data *op_data) + struct md_op_data *op_data, __u32 acl_bufsize) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); @@ -255,6 +256,8 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, int count = 0; enum ldlm_mode mode; int rc; + int repsize, repsize_estimate; + ENTRY; it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; @@ -263,12 +266,12 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, /* If inode is known, cancel conflicting OPEN locks. */ if (fid_is_sane(&op_data->op_fid2)) { if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ - if (it->it_flags & FMODE_WRITE) + if (it->it_flags & MDS_FMODE_WRITE) mode = LCK_EX; else mode = LCK_PR; } else { - if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + if (it->it_flags & (MDS_FMODE_WRITE | MDS_OPEN_TRUNC)) mode = LCK_CW; #ifdef FMODE_EXEC else if (it->it_flags & FMODE_EXEC) @@ -300,16 +303,32 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + if (cl_is_lov_delay_create(it->it_flags)) { + /* open(O_LOV_DELAY_CREATE) won't pack lmm */ + LASSERT(lmmsize == 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + } req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, RCL_CLIENT, op_data->op_file_secctx_name != NULL ? 
- strlen(op_data->op_file_secctx_name) + 1 : 0); + op_data->op_file_secctx_name_size : 0); req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, op_data->op_file_secctx_size); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); if (rc < 0) { ptlrpc_request_free(req); @@ -330,10 +349,71 @@ mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, obddev->u.cli.cl_max_mds_easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); - return req; + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (!(it->it_op & IT_CREAT) && it->it_op & IT_OPEN && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + + /** + * Inline buffer for possible data from Data-on-MDT files. + */ + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, RCL_SERVER, + sizeof(struct niobuf_remote)); + ptlrpc_request_set_replen(req); + + /* Get real repbuf allocated size as rounded up power of 2 */ + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size()); + /* Estimate free space for DoM files in repbuf */ + repsize_estimate = repsize - (req->rq_replen - + obddev->u.cli.cl_max_mds_easize + + sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(0, LOV_MAGIC_V3)); + + if (repsize_estimate < obddev->u.cli.cl_dom_min_inline_repsize) { + repsize = obddev->u.cli.cl_dom_min_inline_repsize - + repsize_estimate + sizeof(struct niobuf_remote); + req_capsule_set_size(&req->rq_pill, &RMF_NIOBUF_INLINE, + RCL_SERVER, + sizeof(struct niobuf_remote) + repsize); + ptlrpc_request_set_replen(req); + CDEBUG(D_INFO, "Increase repbuf by %d bytes, total: %d\n", + repsize, req->rq_replen); + repsize = size_roundup_power2(req->rq_replen + + lustre_msg_early_size()); + } + /* The only way to report real allocated repbuf size to the server + * is the lm_repsize but it must be set prior buffer allocation itself + * due to security reasons - it is part of buffer used in signature + * calculation (see LU-11414). Therefore the saved size is predicted + * value as rq_replen rounded to the next higher power of 2. + * Such estimation is safe. Though the final allocated buffer might + * be even larger, it is not possible to know that at this point. 
+ */ + req->rq_reqmsg->lm_repsize = repsize; + return req; } #define GA_DEFAULT_EA_NAME_LEN 20 @@ -349,7 +429,7 @@ mdc_intent_getxattr_pack(struct obd_export *exp, struct ldlm_intent *lit; int rc, count = 0; struct list_head cancels = LIST_HEAD_INIT(cancels); - u32 min_buf_size = 0; + u32 ea_vals_buf_size = GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM; ENTRY; @@ -358,6 +438,16 @@ mdc_intent_getxattr_pack(struct obd_export *exp, if (req == NULL) RETURN(ERR_PTR(-ENOMEM)); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -367,6 +457,8 @@ mdc_intent_getxattr_pack(struct obd_export *exp, /* pack the intent */ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); lit->opc = IT_GETXATTR; + CDEBUG(D_INFO, "%s: get xattrs for "DFID"\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1)); #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) /* If the supplied buffer is too small then the server will @@ -378,26 +470,25 @@ mdc_intent_getxattr_pack(struct obd_export *exp, * of LU-9417 when it would be *more* likely to crash the * server. See LU-9856. */ if (exp->exp_connect_data.ocd_version < OBD_OCD_VERSION(2, 10, 1, 0)) - min_buf_size = exp->exp_connect_data.ocd_max_easize; + ea_vals_buf_size = max_t(u32, ea_vals_buf_size, + exp->exp_connect_data.ocd_max_easize); #endif /* pack the intended request */ mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - max_t(u32, min_buf_size, - GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM), - -1, 0); + ea_vals_buf_size, -1, 0); + + /* get SELinux policy info if any */ + mdc_file_sepol_pack(req); req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, - max_t(u32, min_buf_size, - GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM)); + GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, - max_t(u32, min_buf_size, - GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM)); + ea_vals_buf_size); req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, - max_t(u32, min_buf_size, - sizeof(__u32) * GA_DEFAULT_EA_NUM)); + sizeof(u32) * GA_DEFAULT_EA_NUM); req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); @@ -406,46 +497,9 @@ mdc_intent_getxattr_pack(struct obd_export *exp, RETURN(req); } -static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *op_data) -{ - struct ptlrpc_request *req; - struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_intent *lit; - int rc; - ENTRY; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_UNLINK); - if (req == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; - - /* pack the intended request */ - mdc_unlink_pack(req, op_data); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obddev->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - RETURN(req); -} - -static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export 
*exp, - struct lookup_intent *it, - struct md_op_data *op_data) +static struct ptlrpc_request * +mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data, __u32 acl_bufsize) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); @@ -455,25 +509,38 @@ static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, struct ldlm_intent *lit; int rc; __u32 easize; + bool have_secctx = false; ENTRY; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_GETATTR); - if (req == NULL) - RETURN(ERR_PTR(-ENOMEM)); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); + /* send name of security xattr to get upon intent */ + if (it->it_op & (IT_LOOKUP | IT_GETATTR) && + req_capsule_has_field(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT) && + op_data->op_file_secctx_name_size > 0 && + op_data->op_file_secctx_name != NULL) { + have_secctx = true; + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME, + RCL_CLIENT, + op_data->op_file_secctx_name_size); + } - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - RETURN(ERR_PTR(rc)); - } + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; if (obddev->u.cli.cl_default_mds_easize > 0) easize = obddev->u.cli.cl_default_mds_easize; @@ -484,8 +551,27 @@ static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + + if (have_secctx) { + char *secctx_name; + + secctx_name = req_capsule_client_get(&req->rq_pill, + &RMF_FILE_SECCTX_NAME); + memcpy(secctx_name, op_data->op_file_secctx_name, + op_data->op_file_secctx_name_size); + + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, easize); + + CDEBUG(D_SEC, "packed '%.*s' as security xattr name\n", + op_data->op_file_secctx_name_size, + op_data->op_file_secctx_name); + } else { + req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, + RCL_SERVER, 0); + } + ptlrpc_request_set_replen(req); RETURN(req); } @@ -562,8 +648,10 @@ static int mdc_finish_enqueue(struct obd_export *exp, struct ldlm_request *lockreq; struct ldlm_reply *lockrep; struct ldlm_lock *lock; + struct mdt_body *body = NULL; void *lvb_data = NULL; __u32 lvb_len = 0; + ENTRY; LASSERT(rc >= 0); @@ -622,8 +710,6 @@ static int mdc_finish_enqueue(struct obd_export *exp, /* We know what to expect, so we do any byte flipping required here */ if (it_has_reply_body(it)) { - struct mdt_body *body; - body = req_capsule_server_get(pill, &RMF_MDT_BODY); if (body == NULL) { CERROR ("Can't swab mdt_body\n"); @@ -641,6 +727,12 @@ static int mdc_finish_enqueue(struct obd_export *exp, mdc_set_open_replay_data(NULL, NULL, it); } + if (it_disposition(it, 
DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + lprocfs_counter_incr(exp->exp_obd->obd_md_stats, + LPROC_MD_CREATE); + } + if (body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) { void *eadata; @@ -708,7 +800,10 @@ static int mdc_finish_enqueue(struct obd_export *exp, * client still does this checking in case it's talking with an old * server. - Jinshan */ lock = ldlm_handle2lock(lockh); - if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL && + if (lock == NULL) + RETURN(rc); + + if (ldlm_has_layout(lock) && lvb_data != NULL && !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) { void *lmm; @@ -716,10 +811,9 @@ static int mdc_finish_enqueue(struct obd_export *exp, ldlm_it2str(it->it_op), lvb_len); OBD_ALLOC_LARGE(lmm, lvb_len); - if (lmm == NULL) { - LDLM_LOCK_PUT(lock); - RETURN(-ENOMEM); - } + if (lmm == NULL) + GOTO(out_lock, rc = -ENOMEM); + memcpy(lmm, lvb_data, lvb_len); /* install lvb_data */ @@ -734,8 +828,24 @@ static int mdc_finish_enqueue(struct obd_export *exp, if (lmm != NULL) OBD_FREE_LARGE(lmm, lvb_len); } - if (lock != NULL) - LDLM_LOCK_PUT(lock); + + if (ldlm_has_dom(lock)) { + LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast); + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) { + LDLM_ERROR(lock, "%s: DoM lock without size.", + exp->exp_obd->obd_name); + GOTO(out_lock, rc = -EPROTO); + } + + LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu", + ldlm_it2str(it->it_op), body->mbo_dom_size); + + rc = mdc_fill_lvb(req, &lock->l_ost_lvb); + } +out_lock: + LDLM_LOCK_PUT(lock); RETURN(rc); } @@ -764,6 +874,8 @@ static int mdc_enqueue_base(struct obd_export *exp, .l_inodebits = { MDS_INODELOCK_XATTR } }; int generation, resends = 0; struct ldlm_reply *lockrep; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize; enum lvb_type lvb_type = 0; int rc; ENTRY; @@ -776,34 +888,37 @@ static int mdc_enqueue_base(struct obd_export *exp, LASSERT(policy == NULL); saved_flags |= LDLM_FL_HAS_INTENT; - if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + if (it->it_op & (IT_GETATTR | IT_READDIR)) policy = &update_policy; else if (it->it_op & IT_LAYOUT) policy = &layout_policy; - else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) + else if (it->it_op & IT_GETXATTR) policy = &getxattr_policy; else policy = &lookup_policy; } - generation = obddev->u.cli.cl_import->imp_generation; + generation = obddev->u.cli.cl_import->imp_generation; + if (!it || (it->it_op & (IT_OPEN | IT_CREAT))) + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + else + acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + resend: - flags = saved_flags; + flags = saved_flags; if (it == NULL) { /* The only way right now is FLOCK. 
*/ LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", einfo->ei_type); res_id.name[3] = LDLM_FLOCK; } else if (it->it_op & IT_OPEN) { - req = mdc_intent_open_pack(exp, it, op_data); - } else if (it->it_op & IT_UNLINK) { - req = mdc_intent_unlink_pack(exp, it, op_data); + req = mdc_intent_open_pack(exp, it, op_data, acl_bufsize); } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { - req = mdc_intent_getattr_pack(exp, it, op_data); + req = mdc_intent_getattr_pack(exp, it, op_data, acl_bufsize); } else if (it->it_op & IT_READDIR) { req = mdc_enqueue_pack(exp, 0); } else if (it->it_op & IT_LAYOUT) { - if (!imp_connect_lvb_type(class_exp2cliimp(exp))) + if (!imp_connect_lvb_type(imp)) RETURN(-EOPNOTSUPP); req = mdc_intent_layout_pack(exp, it, op_data); lvb_type = LVB_T_LAYOUT; @@ -832,18 +947,25 @@ static int mdc_enqueue_base(struct obd_export *exp, rc = obd_get_request_slot(&obddev->u.cli); if (rc != 0) { mdc_put_mod_rpc_slot(req, it); - mdc_clear_replay_flag(req, 0); - ptlrpc_req_finished(req); - RETURN(rc); - } - } + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing.*/ + if (einfo->ei_cb_gl == NULL) + einfo->ei_cb_gl = mdc_ldlm_glimpse_ast; - rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, 0, lvb_type, lockh, 0); - if (!it) { - /* For flock requests we immediatelly return without further - delay and let caller deal with the rest, since rest of - this function metadata processing makes no sense for flock + if (!it) { + /* For flock requests we immediatelly return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock requests anyway. 
But in case of problem during comms with Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we can not rely on caller and this mainly for F_UNLCKs @@ -898,6 +1020,15 @@ static int mdc_enqueue_base(struct obd_export *exp, } } + if ((int)lockrep->lock_policy_res2 == -ERANGE && + it->it_op & (IT_OPEN | IT_GETATTR | IT_LOOKUP) && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + mdc_clear_replay_flag(req, -ERANGE); + ptlrpc_req_finished(req); + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + goto resend; + } + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); if (rc < 0) { if (lustre_handle_is_used(lockh)) { @@ -1071,7 +1202,6 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, * but for old MDTs (< 2.4), permission is covered * by LOOKUP lock, so it needs to match all bits here.*/ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM; break; case IT_READDIR: @@ -1138,6 +1268,7 @@ int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, .ei_mode = it_to_lock_mode(it), .ei_cb_bl = cb_blocking, .ei_cb_cp = ldlm_completion_ast, + .ei_cb_gl = mdc_ldlm_glimpse_ast, }; struct lustre_handle lockh; int rc = 0; @@ -1254,7 +1385,10 @@ int mdc_intent_getattr_async(struct obd_export *exp, PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); fid_build_reg_res_name(&op_data->op_fid1, &res_id); - req = mdc_intent_getattr_pack(exp, it, op_data); + /* If the MDT return -ERANGE because of large ACL, then the sponsor + * of the async getattr RPC will handle that by itself. */ + req = mdc_intent_getattr_pack(exp, it, op_data, + LUSTRE_POSIX_ACL_MAX_SIZE_OLD); if (IS_ERR(req)) RETURN(PTR_ERR(req)); @@ -1264,6 +1398,13 @@ int mdc_intent_getattr_async(struct obd_export *exp, RETURN(rc); } + /* With Data-on-MDT the glimpse callback is needed too. + * It is set here in advance but not in mdc_finish_enqueue() + * to avoid possible races. It is safe to have glimpse handler + * for non-DOM locks and costs nothing.*/ + if (minfo->mi_einfo.ei_cb_gl == NULL) + minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast; + rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); if (rc < 0) { diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c index db2e665658746..096b20fd4847a 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_reint.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -201,6 +201,16 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX, RCL_CLIENT, op_data->op_file_secctx_size); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -275,9 +285,10 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, MDS_INODELOCK_UPDATE); if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && (fid_is_sane(&op_data->op_fid3))) + /* don't cancel DoM lock which may cause data flush */ count += mdc_resource_get_unused(exp, &op_data->op_fid3, &cancels, LCK_EX, - MDS_INODELOCK_FULL); + MDS_INODELOCK_ELC); req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_UNLINK); if (req == NULL) { @@ -288,6 +299,16 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -336,6 +357,16 @@ int mdc_link(struct obd_export *exp, struct md_op_data *op_data, req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, op_data->op_namelen + 1); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { ptlrpc_request_free(req); @@ -358,31 +389,32 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { struct list_head cancels = LIST_HEAD_INIT(cancels); - struct obd_device *obd = exp->exp_obd; - struct ptlrpc_request *req; - int count = 0, rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && - (fid_is_sane(&op_data->op_fid2))) - count += mdc_resource_get_unused(exp, &op_data->op_fid2, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && - (fid_is_sane(&op_data->op_fid3))) - count += mdc_resource_get_unused(exp, &op_data->op_fid3, - &cancels, LCK_EX, - MDS_INODELOCK_LOOKUP); - if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && - (fid_is_sane(&op_data->op_fid4))) - count += mdc_resource_get_unused(exp, &op_data->op_fid4, - &cancels, LCK_EX, - MDS_INODELOCK_FULL); + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_ELC); req 
= ptlrpc_request_alloc(class_exp2cliimp(exp), op_data->op_cli_flags & CLI_MIGRATE ? @@ -392,8 +424,21 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(-ENOMEM); } - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + if (op_data->op_cli_flags & CLI_MIGRATE) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + op_data->op_data_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? + strlen(req->rq_sepol) + 1 : 0); rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { @@ -401,34 +446,76 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } - if (op_data->op_cli_flags & CLI_MIGRATE && op_data->op_data != NULL) { - struct md_open_data *mod = op_data->op_data; + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + if (op_data->op_cli_flags & CLI_MIGRATE) + mdc_migrate_pack(req, op_data, old, oldlen); + else + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_default_mds_easize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} - LASSERTF(mod->mod_open_req != NULL && - mod->mod_open_req->rq_type != LI_POISON, - "POISONED open %p!\n", mod->mod_open_req); +int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data) +{ + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ptlrpc_request *req; + struct ldlm_lock *lock; + struct mdt_rec_resync *rec; + int count = 0, rc; + ENTRY; - DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); - /* We no longer want to preserve this open for replay even - * though the open was committed. 
b=3632, b=3633 */ - spin_lock(&mod->mod_open_req->rq_lock); - mod->mod_open_req->rq_replay = 0; - spin_unlock(&mod->mod_open_req->rq_lock); + if (op_data->op_flags & MF_MDC_CANCEL_FID1 && + fid_is_sane(&op_data->op_fid1)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RESYNC); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); } - if (exp_connect_cancelset(exp) && req) - ldlm_cli_cancel_list(&cancels, count, req, 0); + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } - mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + CLASSERT(sizeof(*rec) == sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->rs_opcode = REINT_RESYNC; + rec->rs_fsuid = op_data->op_fsuid; + rec->rs_fsgid = op_data->op_fsgid; + rec->rs_cap = op_data->op_cap; + rec->rs_fid = op_data->op_fid1; + rec->rs_bias = op_data->op_bias; + rec->rs_mirror_id = op_data->op_mirror_id; + + lock = ldlm_handle2lock(&op_data->op_lease_handle); + if (lock != NULL) { + rec->rs_lease_handle = lock->l_remote_handle; + LDLM_LOCK_PUT(lock); + } - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); ptlrpc_request_set_replen(req); rc = mdc_reint(req, LUSTRE_IMP_FULL); - *request = req; - if (rc == -ERESTARTSYS) - rc = 0; + if (rc == -ERESTARTSYS) + rc = 0; - RETURN(rc); + ptlrpc_req_finished(req); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c index 6c8da5866a8b9..5a29a285e5943 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,21 +41,23 @@ #ifdef HAVE_UIDGID_HEADER # include #endif +#include -#include +#include #include #include #include #include #include -#include +#include #include #include #include -#include +#include #include #include +#include #include "mdc_internal.h" @@ -191,20 +193,34 @@ static int mdc_getattr_common(struct obd_export *exp, RETURN(0); } +static void mdc_reset_acl_req(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_early_free_lock); + sptlrpc_cli_free_repbuf(req); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_reqdata_len = 0; + spin_unlock(&req->rq_early_free_lock); +} + static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct ptlrpc_request *req; - int rc; - ENTRY; + struct ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; /* Single MDS without an LMV case */ if (op_data->op_flags & MF_GET_MDT_IDX) { op_data->op_mds = 0; RETURN(0); } - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR); if (req == NULL) RETURN(-ENOMEM); @@ -214,33 +230,42 @@ static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } +again: mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, op_data->op_mode, -1, 0); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - ptlrpc_request_set_replen(req); + rc = mdc_getattr_common(exp, req); + if (rc) { + if (rc == -ERANGE && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + mdc_reset_acl_req(req); + goto again; + } - rc = mdc_getattr_common(exp, req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - RETURN(rc); + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); } static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct ptlrpc_request *req; - int rc; - ENTRY; + struct ptlrpc_request *req; + struct obd_import *imp = class_exp2cliimp(exp); + __u32 acl_bufsize = LUSTRE_POSIX_ACL_MAX_SIZE_OLD; + int rc; + ENTRY; - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_GETATTR_NAME); + *request = NULL; + req = ptlrpc_request_alloc(imp, &RQF_MDS_GETATTR_NAME); if (req == NULL) RETURN(-ENOMEM); @@ -253,9 +278,6 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } - mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - op_data->op_mode, op_data->op_suppgids[0], 0); - if (op_data->op_name) { char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == @@ -263,18 +285,29 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, memcpy(name, op_data->op_name, op_data->op_namelen); } - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - 
req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); +again: + mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, + op_data->op_mode, op_data->op_suppgids[0], 0); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + ptlrpc_request_set_replen(req); - rc = mdc_getattr_common(exp, req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - RETURN(rc); + rc = mdc_getattr_common(exp, req); + if (rc) { + if (rc == -ERANGE && + acl_bufsize != imp->imp_connect_data.ocd_max_easize) { + acl_bufsize = imp->imp_connect_data.ocd_max_easize; + mdc_reset_acl_req(req); + goto again; + } + + ptlrpc_req_finished(req); + } else { + *request = req; + } + + RETURN(rc); } static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, @@ -294,16 +327,25 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, if (req == NULL) RETURN(-ENOMEM); - if (xattr_name) { - xattr_namelen = strlen(xattr_name) + 1; - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - xattr_namelen); - } - if (input_size) { - LASSERT(input); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - input_size); - } + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(req); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + req_capsule_set_size(&req->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(req->rq_sepol) ? 
+ strlen(req->rq_sepol) + 1 : 0); /* Flush local XATTR locks to get rid of a possible cancel RPC */ if (opcode == MDS_REINT && fid_is_sane(fid) && @@ -333,11 +375,11 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, } } - if (opcode == MDS_REINT) { - struct mdt_rec_setxattr *rec; + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; - CLASSERT(sizeof(struct mdt_rec_setxattr) == - sizeof(struct mdt_rec_reint)); + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); rec->sx_opcode = REINT_SETXATTR; rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); @@ -363,6 +405,8 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, memcpy(tmp, input, input_size); } + mdc_file_sepol_pack(req); + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, output_size); @@ -385,26 +429,77 @@ static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, } static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *xattr_name, - const char *input, int input_size, int output_size, - int flags, __u32 suppgid, - struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, + const void *value, size_t value_size, + unsigned int xattr_flags, u32 suppgid, + struct ptlrpc_request **req) { + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRRM); + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, - fid, MDS_REINT, valid, xattr_name, - input, input_size, output_size, flags, - suppgid, request); + fid, MDS_REINT, obd_md_valid, name, + value, value_size, 0, xattr_flags, suppgid, + req); } static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 valid, const char *xattr_name, - const char *input, int input_size, int output_size, - int flags, struct ptlrpc_request **request) + u64 obd_md_valid, const char *name, size_t buf_size, + struct ptlrpc_request **req) { - return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, - fid, MDS_GETXATTR, valid, xattr_name, - input, input_size, output_size, flags, - -1, request); + struct mdt_body *body; + int rc; + + LASSERT(obd_md_valid == OBD_MD_FLXATTR || + obd_md_valid == OBD_MD_FLXATTRLS); + + CDEBUG(D_INFO, "%s: get xattr '%s' for "DFID"\n", + exp->exp_obd->obd_name, name, PFID(fid)); + rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, + obd_md_valid, name, NULL, 0, buf_size, 0, -1, + req); + if (rc < 0) + GOTO(out, rc); + + body = req_capsule_server_get(&(*req)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + /* only detect the xattr size */ + if (buf_size == 0) { + /* LU-11109: Older MDTs do not distinguish + * between nonexistent xattrs and zero length + * values in this case. Newer MDTs will return + * -ENODATA or set OBD_MD_FLXATTR. */ + GOTO(out, rc = body->mbo_eadatasize); + } + + if (body->mbo_eadatasize == 0) { + /* LU-11109: Newer MDTs set OBD_MD_FLXATTR on + * success so that we can distinguish between + * zero length value and nonexistent xattr. + * + * If OBD_MD_FLXATTR is not set then we keep + * the old behavior and return -ENODATA for + * getxattr() when mbo_eadatasize is 0. But + * -ENODATA only makes sense for getxattr() + * and not for listxattr(). 
*/ + if (body->mbo_valid & OBD_MD_FLXATTR) + GOTO(out, rc = 0); + else if (obd_md_valid == OBD_MD_FLXATTR) + GOTO(out, rc = -ENODATA); + else + GOTO(out, rc = 0); + } + + GOTO(out, rc = body->mbo_eadatasize); +out: + if (rc < 0) { + ptlrpc_req_finished(*req); + *req = NULL; + } + + return rc; } #ifdef CONFIG_FS_POSIX_ACL @@ -552,41 +647,41 @@ int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) void mdc_replay_open(struct ptlrpc_request *req) { - struct md_open_data *mod = req->rq_cb_data; - struct ptlrpc_request *close_req; - struct obd_client_handle *och; - struct lustre_handle old; - struct mdt_body *body; - ENTRY; + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old_open_handle = { }; + struct mdt_body *body; + ENTRY; - if (mod == NULL) { - DEBUG_REQ(D_ERROR, req, - "Can't properly replay without open data."); - EXIT; - return; - } + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + EXIT; + return; + } - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body != NULL); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); spin_lock(&req->rq_lock); och = mod->mod_och; - if (och && och->och_fh.cookie) + if (och && och->och_open_handle.cookie) req->rq_early_free_repbuf = 1; else req->rq_early_free_repbuf = 0; spin_unlock(&req->rq_lock); if (req->rq_early_free_repbuf) { - struct lustre_handle *file_fh; + struct lustre_handle *file_open_handle; LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); - file_fh = &och->och_fh; + file_open_handle = &och->och_open_handle; CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", - file_fh->cookie, body->mbo_handle.cookie); - old = *file_fh; - *file_fh = body->mbo_handle; + file_open_handle->cookie, body->mbo_open_handle.cookie); + old_open_handle = *file_open_handle; + *file_open_handle = body->mbo_open_handle; } close_req = mod->mod_close_req; @@ -600,10 +695,11 @@ void mdc_replay_open(struct ptlrpc_request *req) LASSERT(epoch); if (req->rq_early_free_repbuf) - LASSERT(!memcmp(&old, &epoch->mio_handle, sizeof(old))); + LASSERT(old_open_handle.cookie == + epoch->mio_open_handle.cookie); DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); - epoch->mio_handle = body->mbo_handle; + epoch->mio_open_handle = body->mbo_open_handle; } EXIT; } @@ -685,20 +781,20 @@ int mdc_set_open_replay_data(struct obd_export *exp, open_req->rq_commit_cb = mdc_commit_open; open_req->rq_early_free_repbuf = 1; spin_unlock(&open_req->rq_lock); - } + } rec->cr_fid2 = body->mbo_fid1; - rec->cr_ioepoch = body->mbo_ioepoch; - rec->cr_old_handle.cookie = body->mbo_handle.cookie; + rec->cr_open_handle_old = body->mbo_open_handle; open_req->rq_replay_cb = mdc_replay_open; if (!fid_is_sane(&body->mbo_fid1)) { - DEBUG_REQ(D_ERROR, open_req, "Saving replay request with " - "insane fid"); - LBUG(); - } + DEBUG_REQ(D_ERROR, open_req, + "saving replay request with insane FID " DFID, + PFID(&body->mbo_fid1)); + LBUG(); + } - DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); - RETURN(0); + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); } static void mdc_free_open(struct md_open_data *mod) @@ -742,7 +838,7 @@ int mdc_clear_open_replay_data(struct obd_export *exp, spin_lock(&mod->mod_open_req->rq_lock); if (mod->mod_och) - mod->mod_och->och_fh.cookie = 0; + mod->mod_och->och_open_handle.cookie = 0; mod->mod_open_req->rq_early_free_repbuf = 0; 
spin_unlock(&mod->mod_open_req->rq_lock); mdc_free_open(mod); @@ -760,23 +856,35 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct req_format *req_fmt; + size_t u32_count = 0; int rc; int saved_rc = 0; ENTRY; - if (op_data->op_bias & MDS_HSM_RELEASE) { - req_fmt = &RQF_MDS_INTENT_CLOSE; + CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + op_data->op_bias); + + if (op_data->op_bias & MDS_CLOSE_INTENT) { + req_fmt = &RQF_MDS_CLOSE_INTENT; + if (op_data->op_bias & MDS_HSM_RELEASE) { + /* allocate a FID for volatile file */ + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, + op_data); + if (rc < 0) { + CERROR("%s: "DFID" allocating FID: rc = %d\n", + obd->obd_name, PFID(&op_data->op_fid1), + rc); + /* save the errcode and proceed to close */ + saved_rc = rc; + } + } + if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) { + size_t count = op_data->op_data_size / sizeof(__u32); - /* allocate a FID for volatile file */ - rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc < 0) { - CERROR("%s: "DFID" failed to allocate FID: %d\n", - obd->obd_name, PFID(&op_data->op_fid1), rc); - /* save the errcode and proceed to close */ - saved_rc = rc; + if (count > INLINE_RESYNC_ARRAY_SIZE) + u32_count = count; } - } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) { - req_fmt = &RQF_MDS_INTENT_CLOSE; } else { req_fmt = &RQF_MDS_CLOSE; } @@ -814,6 +922,10 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, GOTO(out, rc = -ENOMEM); } + if (u32_count > 0) + req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, + u32_count * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); if (rc) { ptlrpc_request_free(req); @@ -827,6 +939,9 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, req->rq_request_portal = MDS_READPAGE_PORTAL; ptlrpc_at_set_req_timeout(req); + if (!(exp_connect_flags2(exp) & OBD_CONNECT2_LSOM)) + op_data->op_xvalid &= ~(OP_XVALID_LAZYSIZE | + OP_XVALID_LAZYBLOCKS); mdc_close_pack(req, op_data); @@ -1110,12 +1225,12 @@ static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) int i; for (i = 0; i < cfs_pgs; i++) { - struct lu_dirpage *dp = kmap(pages[i]); - struct lu_dirpage *first = dp; - struct lu_dirent *end_dirent = NULL; - struct lu_dirent *ent; - __u64 hash_end = le64_to_cpu(dp->ldp_hash_end); - __u32 flags = le32_to_cpu(dp->ldp_flags); + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; while (--lu_pgs > 0) { ent = lu_dirent_start(dp); @@ -1130,8 +1245,8 @@ static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) break; /* Save the hash and flags of this lu_dirpage. */ - hash_end = le64_to_cpu(dp->ldp_hash_end); - flags = le32_to_cpu(dp->ldp_flags); + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; /* Check if lu_dirpage contains no entries. 
*/ if (end_dirent == NULL) @@ -1429,33 +1544,48 @@ static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, goto out_unlock; } - static int mdc_statfs(const struct lu_env *env, struct obd_export *exp, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) + time64_t max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req; - struct obd_statfs *msfs; - struct obd_import *imp = NULL; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct req_format *fmt; + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + ENTRY; /* * Since the request might also come from lprocfs, so we need * sync this with client_disconnect_export Bug15684 */ down_read(&obd->u.cli.cl_sem); - if (obd->u.cli.cl_import) - imp = class_import_get(obd->u.cli.cl_import); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); up_read(&obd->u.cli.cl_sem); - if (!imp) - RETURN(-ENODEV); + if (!imp) + RETURN(-ENODEV); + + fmt = &RQF_MDS_STATFS; + if ((exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS) && + (flags & OBD_STATFS_SUM)) + fmt = &RQF_MDS_STATFS_NEW; + req = ptlrpc_request_alloc_pack(imp, fmt, LUSTRE_MDS_VERSION, + MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, - LUSTRE_MDS_VERSION, MDS_STATFS); - if (req == NULL) - GOTO(output, rc = -ENOMEM); + if ((flags & OBD_STATFS_SUM) && + (exp_connect_flags2(exp) & OBD_CONNECT2_SUM_STATFS)) { + /* request aggregated states */ + struct mdt_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + body->mbo_valid = OBD_MD_FLAGSTATFS; + } ptlrpc_request_set_replen(req); @@ -1571,29 +1701,53 @@ static int mdc_ioc_hsm_progress(struct obd_export *exp, ptlrpc_req_finished(req); return rc; } - -static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) +/** + * Send hsm_ct_register to MDS + * + * \param[in] imp import + * \param[in] archive_count if in bitmap format, it is the bitmap, + * else it is the count of archive_ids + * \param[in] archives if in bitmap format, it is NULL, + * else it is archive_id lists + */ +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archive_count, + __u32 *archives) { - __u32 *archive_mask; - struct ptlrpc_request *req; - int rc; + struct ptlrpc_request *req; + __u32 *archive_array; + size_t archives_size; + int rc; ENTRY; - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, - LUSTRE_MDS_VERSION, - MDS_HSM_CT_REGISTER); + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_CT_REGISTER); if (req == NULL) - GOTO(out, rc = -ENOMEM); + RETURN(-ENOMEM); + + if (archives != NULL) + archives_size = sizeof(*archive_array) * archive_count; + else + archives_size = sizeof(archive_count); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE, + RCL_CLIENT, archives_size); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_CT_REGISTER); + if (rc) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } mdc_pack_body(req, NULL, 0, 0, -1, 0); - /* Copy hsm_progress struct */ - archive_mask = req_capsule_client_get(&req->rq_pill, - &RMF_MDS_HSM_ARCHIVE); - if (archive_mask == NULL) + archive_array = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_array == NULL) GOTO(out, rc = -EPROTO); - *archive_mask = archives; + if (archives != NULL) + memcpy(archive_array, archives, archives_size); + else + 
*archive_array = archive_count; ptlrpc_request_set_replen(req); @@ -1977,7 +2131,7 @@ static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, GOTO(out, rc = -EFAULT); rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, 0); if (rc != 0) GOTO(out, rc); @@ -2061,6 +2215,13 @@ static int mdc_get_info_rpc(struct obd_export *exp, RCL_SERVER, vallen); ptlrpc_request_set_replen(req); + /* if server failed to resolve FID, and OI scrub not able to fix it, it + * will return -EINPROGRESS, ptlrpc_queue_wait() will keep retrying, + * set request interruptible to avoid deadlock. + */ + if (KEY_IS(KEY_FID2PATH)) + req->rq_allow_intr = 1; + rc = ptlrpc_queue_wait(req); /* -EREMOTE means the get_info result is partial, and it needs to * continue on another MDT, see fid2path part in lmv_iocontrol */ @@ -2114,9 +2275,8 @@ static void lustre_swab_kuch(struct kuc_hdr *l) static int mdc_ioc_hsm_ct_start(struct obd_export *exp, struct lustre_kernelcomm *lk) { - struct obd_import *imp = class_exp2cliimp(exp); - __u32 archive = lk->lk_data; - int rc = 0; + struct obd_import *imp = class_exp2cliimp(exp); + int rc = 0; if (lk->lk_group != KUC_GRP_HSM) { CERROR("Bad copytool group %d\n", lk->lk_group); @@ -2130,7 +2290,12 @@ static int mdc_ioc_hsm_ct_start(struct obd_export *exp, /* Unregister with the coordinator */ rc = mdc_ioc_hsm_ct_unregister(imp); } else { - rc = mdc_ioc_hsm_ct_register(imp, archive); + __u32 *archives = NULL; + + if ((lk->lk_flags & LK_FLG_DATANR) && lk->lk_data_count > 0) + archives = lk->lk_data; + + rc = mdc_ioc_hsm_ct_register(imp, lk->lk_data_count, archives); } return rc; @@ -2181,17 +2346,29 @@ static int mdc_hsm_copytool_send(const struct obd_uuid *uuid, */ static int mdc_hsm_ct_reregister(void *data, void *cb_arg) { - struct kkuc_ct_data *kcd = data; - struct obd_import *imp = (struct obd_import *)cb_arg; - int rc; + struct obd_import *imp = (struct obd_import *)cb_arg; + struct kkuc_ct_data *kcd = data; + __u32 *archives = NULL; + int rc; - if (kcd == NULL || kcd->kcd_magic != KKUC_CT_DATA_MAGIC) + if (kcd == NULL || + (kcd->kcd_magic != KKUC_CT_DATA_ARRAY_MAGIC && + kcd->kcd_magic != KKUC_CT_DATA_BITMAP_MAGIC)) return -EPROTO; - CDEBUG(D_HA, "%s: recover copytool registration to MDT (archive=%#x)\n", - imp->imp_obd->obd_name, kcd->kcd_archive); - rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_archive); + if (kcd->kcd_magic == KKUC_CT_DATA_BITMAP_MAGIC) { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive=%#x)\n", imp->imp_obd->obd_name, + kcd->kcd_nr_archives); + } else { + CDEBUG(D_HA, "%s: recover copytool registration to MDT " + "(archive nr = %u)\n", + imp->imp_obd->obd_name, kcd->kcd_nr_archives); + if (kcd->kcd_nr_archives != 0) + archives = kcd->kcd_archives; + } + rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_nr_archives, archives); /* ignore error if the copytool is already registered */ return (rc == -EEXIST) ? 
0 : rc; } @@ -2237,14 +2414,6 @@ static int mdc_set_info_async(const struct lu_env *env, keylen, key, vallen, val, set); RETURN(rc); } - if (KEY_IS(KEY_SPTLRPC_CONF)) { - sptlrpc_conf_client_adapt(exp->exp_obd); - RETURN(0); - } - if (KEY_IS(KEY_FLUSH_CTX)) { - sptlrpc_import_flush_my_ctx(imp); - RETURN(0); - } if (KEY_IS(KEY_CHANGELOG_CLEAR)) { rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, keylen, key, vallen, val, set); @@ -2263,8 +2432,8 @@ static int mdc_set_info_async(const struct lu_env *env, RETURN(0); } - CERROR("Unknown key %s\n", (char *)key); - RETURN(-EINVAL); + rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set); + RETURN(rc); } static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, @@ -2340,17 +2509,97 @@ static int mdc_fsync(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } +struct mdc_rmfid_args { + int *mra_rcs; + int mra_nr; +}; + +int mdc_rmfid_interpret(const struct lu_env *env, struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_rmfid_args *aa; + int *rcs, size; + ENTRY; + + if (!rc) { + aa = ptlrpc_req_async_args(req); + + size = req_capsule_get_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER); + LASSERT(size == sizeof(int) * aa->mra_nr); + rcs = req_capsule_server_get(&req->rq_pill, &RMF_RCS); + LASSERT(rcs); + LASSERT(aa->mra_rcs); + LASSERT(aa->mra_nr); + memcpy(aa->mra_rcs, rcs, size); + } + + RETURN(rc); +} + +static int mdc_rmfid(struct obd_export *exp, struct fid_array *fa, + int *rcs, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct mdc_rmfid_args *aa; + struct mdt_body *b; + struct lu_fid *tmp; + int rc, flen; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_RMFID); + if (req == NULL) + RETURN(-ENOMEM); + + flen = fa->fa_nr * sizeof(struct lu_fid); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_CLIENT, flen); + req_capsule_set_size(&req->rq_pill, &RMF_FID_ARRAY, + RCL_SERVER, flen); + req_capsule_set_size(&req->rq_pill, &RMF_RCS, + RCL_SERVER, fa->fa_nr * sizeof(__u32)); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_RMFID); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FID_ARRAY); + memcpy(tmp, fa->fa_fids, flen); + + mdc_pack_body(req, NULL, 0, 0, -1, 0); + b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + b->mbo_ctime = ktime_get_real_seconds(); + + ptlrpc_request_set_replen(req); + + LASSERT(rcs); + aa = ptlrpc_req_async_args(req); + aa->mra_rcs = rcs; + aa->mra_nr = fa->fa_nr; + req->rq_interpret_reply = mdc_rmfid_interpret; + + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + + RETURN(rc); +} + static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, enum obd_import_event event) { + struct client_obd *cli = &obd->u.cli; int rc = 0; LASSERT(imp->imp_obd == obd); switch (event) { - - case IMP_EVENT_INACTIVE: { - struct client_obd *cli = &obd->u.cli; + case IMP_EVENT_DISCON: + spin_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + spin_unlock(&cli->cl_loi_list_lock); + break; + case IMP_EVENT_INACTIVE: /* * Flush current sequence to make client obtain new one * from server in case of disconnect/reconnect. 
@@ -2362,12 +2611,28 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); break; - } case IMP_EVENT_INVALIDATE: { struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + __u16 refcheck; ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants. All pages go to failing rpcs due to + * the invalid import. + */ + osc_io_unplug(env, cli, NULL); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + osc_ldlm_resource_invalidate, + env, 0); + cl_env_put(env, &refcheck); + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + } else { + rc = PTR_ERR(env); + } break; } case IMP_EVENT_ACTIVE: @@ -2376,10 +2641,15 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, if (rc == 0) rc = mdc_kuc_reregister(imp); break; - case IMP_EVENT_OCD: + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (OCD_HAS_FLAG(ocd, GRANT)) + osc_init_grant(cli, ocd); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); break; - case IMP_EVENT_DISCON: + } case IMP_EVENT_DEACTIVATE: case IMP_EVENT_ACTIVATE: break; @@ -2428,6 +2698,12 @@ static int mdc_cancel_weight(struct ldlm_lock *lock) if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) RETURN(0); + /* Special case for DoM locks, cancel only unused and granted locks */ + if (ldlm_has_dom(lock) && + (lock->l_granted_mode != lock->l_req_mode || + osc_ldlm_weigh_ast(lock) != 0)) + RETURN(0); + RETURN(1); } @@ -2476,25 +2752,21 @@ static void mdc_llog_finish(struct obd_device *obd) EXIT; } -static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) { - int rc; + int rc; + ENTRY; - rc = ptlrpcd_addref(); + rc = osc_setup_common(obd, cfg); if (rc < 0) RETURN(rc); - rc = client_obd_setup(obd, cfg); - if (rc) - GOTO(err_ptlrpcd_decref, rc); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_mdc_obd_vars; - lprocfs_obd_setup(obd); - lprocfs_alloc_md_stats(obd, 0); -#endif - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); + rc = mdc_tunables_init(obd); + if (rc) + GOTO(err_osc_cleanup, rc); + + obd->u.cli.cl_dom_min_inline_repsize = MDC_DOM_DEF_INLINE_REPSIZE; ns_register_cancel(obd->obd_namespace, mdc_cancel_weight); @@ -2504,25 +2776,26 @@ static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) if (rc) { CERROR("%s: failed to setup llogging subsystems: rc = %d\n", obd->obd_name, rc); - GOTO(err_mdc_cleanup, rc); + GOTO(err_llog_cleanup, rc); } rc = mdc_changelog_cdev_init(obd); if (rc) { CERROR("%s: failed to setup changelog char device: rc = %d\n", obd->obd_name, rc); - GOTO(err_mdc_cleanup, rc); + GOTO(err_changelog_cleanup, rc); } - EXIT; -err_mdc_cleanup: - if (rc) - client_obd_cleanup(obd); + RETURN(rc); -err_ptlrpcd_decref: - if (rc) - ptlrpcd_decref(); - return rc; +err_changelog_cleanup: + mdc_llog_finish(obd); +err_llog_cleanup: + lprocfs_free_md_stats(obd); + ptlrpc_lprocfs_unregister_obd(obd); +err_osc_cleanup: + osc_cleanup_common(obd); + return rc; } /* Initialize the default and maximum LOV EA sizes. 
This allows @@ -2553,6 +2826,7 @@ static int mdc_precleanup(struct obd_device *obd) { ENTRY; + osc_precleanup_common(obd); mdc_changelog_cdev_finish(obd); obd_cleanup_client_import(obd); @@ -2564,16 +2838,16 @@ static int mdc_precleanup(struct obd_device *obd) static int mdc_cleanup(struct obd_device *obd) { - ptlrpcd_decref(); - - return client_obd_cleanup(obd); + return osc_cleanup_common(obd); } -static int mdc_process_config(struct obd_device *obd, size_t len, void *buf) +int mdc_process_config(struct obd_device *obd, size_t len, void *buf) { - struct lustre_cfg *lcfg = buf; - int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd); - return (rc > 0 ? 0: rc); + struct lustre_cfg *lcfg = buf; + size_t count = class_modify_config(lcfg, PARAM_MDC, + &obd->obd_kset.kobj); + + return count > 0 ? 0 : count; } static struct obd_ops mdc_obd_ops = { @@ -2584,7 +2858,8 @@ static struct obd_ops mdc_obd_ops = { .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, - .o_disconnect = client_disconnect_export, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, .o_iocontrol = mdc_iocontrol, .o_set_info_async = mdc_set_info_async, .o_statfs = mdc_statfs, @@ -2600,42 +2875,69 @@ static struct obd_ops mdc_obd_ops = { static struct md_ops mdc_md_ops = { .m_get_root = mdc_get_root, - .m_null_inode = mdc_null_inode, - .m_close = mdc_close, - .m_create = mdc_create, - .m_enqueue = mdc_enqueue, - .m_getattr = mdc_getattr, - .m_getattr_name = mdc_getattr_name, - .m_intent_lock = mdc_intent_lock, - .m_link = mdc_link, - .m_rename = mdc_rename, - .m_setattr = mdc_setattr, - .m_setxattr = mdc_setxattr, - .m_getxattr = mdc_getxattr, + .m_null_inode = mdc_null_inode, + .m_close = mdc_close, + .m_create = mdc_create, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, .m_fsync = mdc_fsync, + .m_file_resync = mdc_file_resync, .m_read_page = mdc_read_page, - .m_unlink = mdc_unlink, - .m_cancel_unused = mdc_cancel_unused, - .m_init_ea_size = mdc_init_ea_size, - .m_set_lock_data = mdc_set_lock_data, - .m_lock_match = mdc_lock_match, - .m_get_lustre_md = mdc_get_lustre_md, - .m_free_lustre_md = mdc_free_lustre_md, - .m_set_open_replay_data = mdc_set_open_replay_data, - .m_clear_open_replay_data = mdc_clear_open_replay_data, - .m_intent_getattr_async = mdc_intent_getattr_async, - .m_revalidate_lock = mdc_revalidate_lock + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock, + .m_rmfid = mdc_rmfid, }; +dev_t mdc_changelog_dev; +struct class *mdc_changelog_class; static int __init mdc_init(void) { - return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, - LUSTRE_MDC_NAME, NULL); + int rc = 0; + rc = alloc_chrdev_region(&mdc_changelog_dev, 0, + MDC_CHANGELOG_DEV_COUNT, + MDC_CHANGELOG_DEV_NAME); + if (rc) + return rc; + + mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); + if 
(IS_ERR(mdc_changelog_class)) { + rc = PTR_ERR(mdc_changelog_class); + goto out_dev; + } + + rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL, + LUSTRE_MDC_NAME, &mdc_device_type); + if (rc) + goto out_dev; + + return 0; + +out_dev: + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + return rc; } static void __exit mdc_exit(void) { - class_unregister_type(LUSTRE_MDC_NAME); + class_destroy(mdc_changelog_class); + unregister_chrdev_region(mdc_changelog_dev, MDC_CHANGELOG_DEV_COUNT); + class_unregister_type(LUSTRE_MDC_NAME); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c index ab1985d9d9d24..f277d3e489e70 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c +++ b/drivers/staging/lustrefsx/lustre/mgc/lproc_mgc.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,33 +39,26 @@ #ifdef CONFIG_PROC_FS -LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, import); -LPROC_SEQ_FOPS_RO_TYPE(mgc, state); +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, connect_flags); -LPROC_SEQ_FOPS_WO_TYPE(mgc, ping); +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, server_uuid); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, import); + +LDEBUGFS_SEQ_FOPS_RO_TYPE(mgc, state); static int mgc_ir_state_seq_show(struct seq_file *m, void *v) { return lprocfs_mgc_rd_ir_state(m, m->private); } -LPROC_SEQ_FOPS_RO(mgc_ir_state); -struct lprocfs_vars lprocfs_mgc_obd_vars[] = { - { .name = "uuid", - .fops = &mgc_uuid_fops }, - { .name = "ping", - .fops = &mgc_ping_fops, - .proc_mode = 0222 }, +LDEBUGFS_SEQ_FOPS_RO(mgc_ir_state); + +struct ldebugfs_vars ldebugfs_mgc_obd_vars[] = { { .name = "connect_flags", .fops = &mgc_connect_flags_fops }, { .name = "mgs_server_uuid", .fops = &mgc_server_uuid_fops }, - { .name = "mgs_conn_uuid", - .fops = &mgc_conn_uuid_fops }, { .name = "import", .fops = &mgc_import_fops }, { .name = "state", @@ -75,3 +68,28 @@ struct lprocfs_vars lprocfs_mgc_obd_vars[] = { { NULL } }; #endif /* CONFIG_PROC_FS */ + +LUSTRE_ATTR(mgs_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); + +static struct attribute *mgc_attrs[] = { + &lustre_attr_mgs_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + NULL, +}; + +int mgc_tunables_init(struct obd_device *obd) +{ + int rc; + + obd->obd_ktype.default_attrs = mgc_attrs; + obd->obd_debugfs_vars = ldebugfs_mgc_obd_vars; + rc = lprocfs_obd_setup(obd, true); + if (rc) + return rc; + + return sptlrpc_lprocfs_cliobd_attach(obd); +} diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h index 50b4b602e17eb..cd49fa2e47ffe 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,16 +34,13 @@ #define _MGC_INTERNAL_H #include -#include #include #include #include #include -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_mgc_obd_vars[]; +int mgc_tunables_init(struct obd_device *obd); int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); -#endif /* CONFIG_PROC_FS */ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); diff --git a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c index 2bd0f39dbed4b..ab588e1d100af 100644 --- a/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustrefsx/lustre/mgc/mgc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -120,7 +120,7 @@ EXPORT_SYMBOL(mgc_logname2resid); /********************** config llog list **********************/ static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list); -static DEFINE_SPINLOCK(config_list_lock); +static DEFINE_SPINLOCK(config_list_lock); /* protects config_llog_list */ /* Take a reference to a config log */ static int config_log_get(struct config_llog_data *cld) @@ -539,16 +539,15 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg) RETURN(rc); } -#ifdef CONFIG_PROC_FS int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) { struct obd_device *obd = data; struct obd_import *imp; struct obd_connect_data *ocd; struct config_llog_data *cld; - ENTRY; - LASSERT(obd != NULL); + ENTRY; + LASSERT(obd); LPROCFS_CLIMP_CHECK(obd); imp = obd->u.cli.cl_import; ocd = &imp->imp_connect_data; @@ -570,7 +569,6 @@ int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) LPROCFS_CLIMP_EXIT(obd); RETURN(0); } -#endif /* reenqueue any lost locks */ #define RQ_RUNNING 0x1 @@ -968,11 +966,9 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(err_cleanup, rc); } -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_mgc_obd_vars; - lprocfs_obd_setup(obd); -#endif - sptlrpc_lprocfs_cliobd_attach(obd); + rc = mgc_tunables_init(obd); + if (rc) + GOTO(err_sysfs, rc); if (atomic_inc_return(&mgc_count) == 1) { rq_state = 0; @@ -985,7 +981,7 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("%s: cannot start requeue thread: rc = %d; " "no more log updates\n", obd->obd_name, rc); - GOTO(err_cleanup, rc); + GOTO(err_sysfs, rc); } /* rc is the task_struct pointer of mgc_requeue_thread. */ rc = 0; @@ -994,6 +990,8 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) RETURN(rc); +err_sysfs: + lprocfs_obd_cleanup(obd); err_cleanup: client_obd_cleanup(obd); err_decref: @@ -1642,8 +1640,7 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, mgc_conn = class_exp2cliimp(cld->cld_mgcexp)->imp_connection; /* don't need to get local config */ - if (cld_is_nodemap(cld) && - (LNET_NETTYP(LNET_NIDNET(mgc_conn->c_peer.nid)) == LOLND)) + if (cld_is_nodemap(cld) && LNetIsPeerLocal(mgc_conn->c_peer.nid)) GOTO(out, rc = 0); /* allocate buffer for bulk transfer. 
@@ -1754,15 +1751,8 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, #ifdef HAVE_SERVER_SUPPORT /* config changed since first read RPC */ if (cld_is_nodemap(cld) && config_read_offset == 0) { - recent_nodemap = NULL; - nodemap_config_dealloc(new_config); - new_config = NULL; - CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); - - /* setting eof to false, we request config again */ - eof = false; - GOTO(out, rc = 0); + GOTO(out, rc = -EAGAIN); } #endif if (!eof) @@ -1770,13 +1760,7 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, GOTO(out, rc); } - mne_swab = !!ptlrpc_rep_need_swab(req); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* This import flag means the server did an extra swab of IR MNE - * records (fixed in LU-1252), reverse it here if needed. LU-1644 */ - if (unlikely(req->rq_import->imp_need_mne_swab)) - mne_swab = !mne_swab; -#endif + mne_swab = ptlrpc_rep_need_swab(req); /* When a nodemap config is received, we build a new nodemap config, * with new nodemap structs. We keep track of the most recently added @@ -2115,6 +2099,11 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) goto restart; } else { mutex_lock(&cld->cld_lock); + /* unlock/lock mutex, so check stopping again */ + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } spin_lock(&config_list_lock); cld->cld_lostlock = 1; spin_unlock(&config_list_lock); @@ -2160,6 +2149,12 @@ int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) CERROR("Can't drop cfg lock: %d\n", rcl); } + /* requeue nodemap lock immediately if transfer was interrupted */ + if (cld_is_nodemap(cld) && rc == -EAGAIN) { + mgc_requeue_add(cld); + rc = 0; + } + RETURN(rc); } @@ -2218,11 +2213,6 @@ static int mgc_process_config(struct obd_device *obd, size_t len, void *buf) break; } - /* COMPAT_146 */ - /* FIXME only set this for old logs! 
Right now this forces - us to always skip the "inside markers" check */ - cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; - rc = mgc_process_log(obd, cld); if (rc == 0 && cld->cld_recover != NULL) { if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> @@ -2293,7 +2283,7 @@ static struct obd_ops mgc_obd_ops = { static int __init mgc_init(void) { - return class_register_type(&mgc_obd_ops, NULL, true, NULL, + return class_register_type(&mgc_obd_ops, NULL, false, NULL, LUSTRE_MGC_NAME, NULL); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile index 57450ea2824c1..449ebf4b70c86 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/Makefile +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -1,16 +1,13 @@ obj-$(CONFIG_LUSTREFSX_FS) += obdclass.o -obdclass-linux-objs := linux-module.o linux-obdo.o linux-sysctl.o -obdclass-linux-objs := $(addprefix linux/,$(obdclass-linux-objs)) - -obdclass-y := $(obdclass-linux-objs) -obdclass-y += llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o -obdclass-y += class_obd.o debug.o genops.o uuid.o llog_ioctl.o +obdclass-y := llog.o llog_cat.o llog_obd.o llog_swab.o llog_osd.o +obdclass-y += class_obd.o debug.o genops.o llog_ioctl.o obdclass-y += lprocfs_status.o lprocfs_counters.o obdclass-y += lustre_handles.o lustre_peer.o local_storage.o -obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o +obdclass-y += statfs_pack.o obdo.o obd_config.o obd_mount.o obd_sysfs.o obdclass-y += lu_object.o dt_object.o obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o -obdclass-y += linkea.o kernelcomm.o +obdclass-y += linkea.o kernelcomm.o jobid.o +obdclass-y += integrity.o obd_cksum.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/acl.c b/drivers/staging/lustrefsx/lustre/obdclass/acl.c index 77ea22644e27b..599946f846ec3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/acl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/acl.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -49,20 +49,22 @@ #ifdef CONFIG_FS_POSIX_ACL static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, - posix_acl_xattr_entry *s) + posix_acl_xattr_entry *s) { - d->e_tag = le16_to_cpu(s->e_tag); - d->e_perm = le16_to_cpu(s->e_perm); - d->e_id = le32_to_cpu(s->e_id); + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); } -/*static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, - posix_acl_xattr_entry *s) +#if 0 +static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) { - d->e_tag = cpu_to_le16(s->e_tag); - d->e_perm = cpu_to_le16(s->e_perm); - d->e_id = cpu_to_le32(s->e_id); -}*/ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} +#endif /* * Check permission based on POSIX ACL. 
@@ -71,80 +73,79 @@ int lustre_posix_acl_permission(struct lu_ucred *mu, const struct lu_attr *la, int want, posix_acl_xattr_entry *entry, int count) { - posix_acl_xattr_entry *pa, *pe, *mask_obj; - posix_acl_xattr_entry ae, me; - int found = 0; + posix_acl_xattr_entry *pa, *pe, *mask_obj; + posix_acl_xattr_entry ae, me; + int found = 0; - if (count <= 0) - return -EACCES; + if (count <= 0) + return -EACCES; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { - lustre_posix_acl_le_to_cpu(&ae, pa); - switch (ae.e_tag) { - case ACL_USER_OBJ: - /* (May have been checked already) */ + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ if (la->la_uid == mu->uc_fsuid) goto check_perm; - break; - case ACL_USER: + break; + case ACL_USER: if (ae.e_id == mu->uc_fsuid) goto mask; - break; - case ACL_GROUP_OBJ: - if (lustre_in_group_p(mu, la->la_gid)) { - found = 1; - if ((ae.e_perm & want) == want) - goto mask; - } - break; - case ACL_GROUP: - if (lustre_in_group_p(mu, ae.e_id)) { - found = 1; - if ((ae.e_perm & want) == want) - goto mask; - } - break; - case ACL_MASK: - break; - case ACL_OTHER: - if (found) - return -EACCES; - else - goto check_perm; - default: - return -EIO; - } - } - return -EIO; + break; + case ACL_GROUP_OBJ: + if (lustre_in_group_p(mu, la->la_gid)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (lustre_in_group_p(mu, ae.e_id)) { + found = 1; + if ((ae.e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + goto check_perm; + default: + return -EIO; +} + } + return -EIO; mask: - for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { - lustre_posix_acl_le_to_cpu(&me, mask_obj); - if (me.e_tag == ACL_MASK) { - if ((ae.e_perm & me.e_perm & want) == want) - return 0; + for (mask_obj = pa + 1; mask_obj <= pe; mask_obj++) { + lustre_posix_acl_le_to_cpu(&me, mask_obj); + if (me.e_tag == ACL_MASK) { + if ((ae.e_perm & me.e_perm & want) == want) + return 0; - return -EACCES; - } - } + return -EACCES; + } + } check_perm: - if ((ae.e_perm & want) == want) - return 0; + if ((ae.e_perm & want) == want) + return 0; - return -EACCES; + return -EACCES; } EXPORT_SYMBOL(lustre_posix_acl_permission); /* * Modify the ACL for the chmod. */ -int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, __u32 mode, - int count) +int lustre_posix_acl_chmod_masq(posix_acl_xattr_entry *entry, u32 mode, + int count) { posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { switch (le16_to_cpu(pa->e_tag)) { case ACL_USER_OBJ: pa->e_perm = cpu_to_le16((mode & S_IRWXU) >> 6); @@ -187,8 +188,8 @@ lustre_posix_acl_equiv_mode(posix_acl_xattr_entry *entry, mode_t *mode_p, int count) { posix_acl_xattr_entry *pa, *pe; - mode_t mode = 0; - int not_equiv = 0; + mode_t mode = 0; + int not_equiv = 0; for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { __u16 perm = le16_to_cpu(pa->e_perm); @@ -224,19 +225,19 @@ EXPORT_SYMBOL(lustre_posix_acl_equiv_mode); /* * Modify acl when creating a new object. 
*/ -int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, __u32 *pmode, - int count) +int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, u32 *pmode, + int count) { - posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; - posix_acl_xattr_entry ae; - __u32 mode = *pmode; + posix_acl_xattr_entry *group_obj = NULL, *mask_obj = NULL, *pa, *pe; + posix_acl_xattr_entry ae; + u32 mode = *pmode; int not_equiv = 0; - for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { - lustre_posix_acl_le_to_cpu(&ae, pa); - switch (ae.e_tag) { - case ACL_USER_OBJ: - ae.e_perm &= (mode >> 6) | ~S_IRWXO; + for (pa = &entry[0], pe = &entry[count - 1]; pa <= pe; pa++) { + lustre_posix_acl_le_to_cpu(&ae, pa); + switch (ae.e_tag) { + case ACL_USER_OBJ: + ae.e_perm &= (mode >> 6) | ~(0007); pa->e_perm = cpu_to_le16(ae.e_perm); mode &= (ae.e_perm << 6) | ~S_IRWXU; break; @@ -244,39 +245,39 @@ int lustre_posix_acl_create_masq(posix_acl_xattr_entry *entry, __u32 *pmode, case ACL_GROUP: not_equiv = 1; break; - case ACL_GROUP_OBJ: + case ACL_GROUP_OBJ: group_obj = pa; - break; - case ACL_OTHER: - ae.e_perm &= mode | ~S_IRWXO; + break; + case ACL_OTHER: + ae.e_perm &= mode | ~(0007); pa->e_perm = cpu_to_le16(ae.e_perm); - mode &= ae.e_perm | ~S_IRWXO; - break; - case ACL_MASK: + mode &= ae.e_perm | ~(0007); + break; + case ACL_MASK: mask_obj = pa; not_equiv = 1; - break; + break; default: return -EIO; - } - } + } + } if (mask_obj) { ae.e_perm = le16_to_cpu(mask_obj->e_perm) & - ((mode >> 3) | ~S_IRWXO); + ((mode >> 3) | ~(0007)); mode &= (ae.e_perm << 3) | ~S_IRWXG; - mask_obj->e_perm = cpu_to_le16(ae.e_perm); + mask_obj->e_perm = cpu_to_le16(ae.e_perm); } else { if (!group_obj) return -EIO; ae.e_perm = le16_to_cpu(group_obj->e_perm) & - ((mode >> 3) | ~S_IRWXO); + ((mode >> 3) | ~(0007)); mode &= (ae.e_perm << 3) | ~S_IRWXG; - group_obj->e_perm = cpu_to_le16(ae.e_perm); + group_obj->e_perm = cpu_to_le16(ae.e_perm); } *pmode = (*pmode & ~S_IRWXUGO) | mode; - return not_equiv; + return not_equiv; } EXPORT_SYMBOL(lustre_posix_acl_create_masq); #endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h index 0f95caf310755..0c1276deb37bc 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c index fc22b2c89f17d..181ef89299b2d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
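In the create_masq hunk above, the symbolic S_IRWXO is spelled as the literal 0007; the two are the same octal value, so only the clamping arithmetic matters: each inherited ACL entry is narrowed to what the requested create mode allows, and the mode is narrowed in turn to what the entry allows. The following is a standalone sketch of the ACL_USER_OBJ branch with made-up input values, not the kernel helper itself.

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	unsigned int   mode     = 0640;	/* mode requested at create time */
	unsigned short acl_perm = 07;	/* rwx inherited from the default ACL */

	/* the owner entry may not grant more than the owner bits of mode */
	acl_perm &= (mode >> 6) | ~S_IRWXO;	/* ~S_IRWXO == ~(0007) */
	/* and the final mode may not grant more than the entry allows */
	mode &= (acl_perm << 6) | ~S_IRWXU;

	printf("acl_perm %o mode %o\n", acl_perm, mode);	/* prints 6 and 640 */
	return 0;
}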
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,7 +44,6 @@ #include #include #include "cl_internal.h" -#include /***************************************************************************** * @@ -122,6 +121,7 @@ void cl_io_fini(const struct lu_env *env, struct cl_io *io) /* Check ignore layout change conf */ LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, !io->ci_need_restart)); + case CIT_GLIMPSE: break; case CIT_LADVISE: break; @@ -188,9 +188,12 @@ EXPORT_SYMBOL(cl_io_sub_init); int cl_io_init(const struct lu_env *env, struct cl_io *io, enum cl_io_type iot, struct cl_object *obj) { - LASSERT(obj == cl_object_top(obj)); + LASSERT(obj == cl_object_top(obj)); - return cl_io_init0(env, io, iot, obj); + /* clear I/O restart from previous instance */ + io->ci_need_restart = 0; + + return cl_io_init0(env, io, iot, obj); } EXPORT_SYMBOL(cl_io_init); @@ -200,33 +203,24 @@ EXPORT_SYMBOL(cl_io_init); * \pre iot == CIT_READ || iot == CIT_WRITE */ int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, loff_t pos, size_t count) + enum cl_io_type iot, loff_t pos, size_t count) { LINVRNT(iot == CIT_READ || iot == CIT_WRITE); LINVRNT(io->ci_obj != NULL); ENTRY; - if (cfs_ptengine_weight(cl_io_engine) < 2) - io->ci_pio = 0; - LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, - "io %s range: [%llu, %llu) %s %s %s %s\n", - iot == CIT_READ ? "read" : "write", - pos, pos + count, - io->u.ci_rw.rw_nonblock ? "nonblock" : "block", - io->u.ci_rw.rw_append ? "append" : "-", - io->u.ci_rw.rw_sync ? "sync" : "-", - io->ci_pio ? "pio" : "-"); - - io->u.ci_rw.rw_range.cir_pos = pos; - io->u.ci_rw.rw_range.cir_count = count; - + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; RETURN(cl_io_init(env, io, iot, io->ci_obj)); } EXPORT_SYMBOL(cl_io_rw_init); static int cl_lock_descr_sort(const struct cl_lock_descr *d0, - const struct cl_lock_descr *d1) + const struct cl_lock_descr *d1) { return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), lu_object_fid(&d1->cld_obj->co_lu)); @@ -470,25 +464,25 @@ EXPORT_SYMBOL(cl_io_iter_fini); */ void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) { - const struct cl_io_slice *scan; + const struct cl_io_slice *scan; - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || - nob == 0); - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(cl_io_invariant(io)); + ENTRY; - ENTRY; + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); - io->u.ci_rw.rw_range.cir_pos += nob; - io->u.ci_rw.rw_range.cir_count -= nob; + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; - /* layers have to be notified. */ + /* layers have to be notified. 
*/ list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) scan->cis_iop->op[io->ci_type].cio_advance(env, scan, nob); } - EXIT; + EXIT; } /** @@ -687,6 +681,7 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; struct cl_page *pg; int rc; + ENTRY; cl_page_list_for_each(pg, &queue->c2_qin) { LASSERT(pg->cp_sync_io == NULL); @@ -715,7 +710,7 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, cl_page_list_for_each(pg, &queue->c2_qin) pg->cp_sync_io = NULL; } - return rc; + RETURN(rc); } EXPORT_SYMBOL(cl_io_submit_sync); @@ -738,53 +733,6 @@ int cl_io_cancel(const struct lu_env *env, struct cl_io *io, return result; } -static -struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) -{ - struct cl_io_pt *pt; - int rc; - - OBD_ALLOC(pt, sizeof(*pt)); - if (pt == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - pt->cip_next = NULL; - init_sync_kiocb(&pt->cip_iocb, io->u.ci_rw.rw_file); - pt->cip_iocb.ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - pt->cip_iocb.ki_left = count; -#elif defined(HAVE_KI_NBYTES) - pt->cip_iocb.ki_nbytes = count; -#endif - pt->cip_iter = io->u.ci_rw.rw_iter; - iov_iter_truncate(&pt->cip_iter, count); - pt->cip_file = io->u.ci_rw.rw_file; - pt->cip_iot = io->ci_type; - pt->cip_pos = pos; - pt->cip_count = count; - pt->cip_result = 0; - - rc = cfs_ptask_init(&pt->cip_task, io->u.ci_rw.rw_ptask, pt, - PTF_ORDERED | PTF_COMPLETE | - PTF_USER_MM | PTF_RETRY, smp_processor_id()); - if (rc) - GOTO(out_error, rc); - - CDEBUG(D_VFSTRACE, "submit %s range: [%llu, %llu)\n", - io->ci_type == CIT_READ ? "read" : "write", - pos, pos + count); - - rc = cfs_ptask_submit(&pt->cip_task, cl_io_engine); - if (rc) - GOTO(out_error, rc); - - RETURN(pt); - -out_error: - OBD_FREE(pt, sizeof(*pt)); - RETURN(ERR_PTR(rc)); -} - /** * Main io loop. * @@ -806,124 +754,50 @@ struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) */ int cl_io_loop(const struct lu_env *env, struct cl_io *io) { - struct cl_io_pt *pt = NULL, *head = NULL; - struct cl_io_pt **tail = &head; - loff_t pos; - size_t count; - size_t last_chunk_count = 0; - bool short_io = false; - int rc = 0; - ENTRY; + int result = 0; LINVRNT(cl_io_is_loopable(io)); + ENTRY; do { - io->ci_continue = 0; - - rc = cl_io_iter_init(env, io); - if (rc) { - cl_io_iter_fini(env, io); - break; - } - - pos = io->u.ci_rw.rw_range.cir_pos; - count = io->u.ci_rw.rw_range.cir_count; - - if (io->ci_pio) { - /* submit this range for parallel execution */ - pt = cl_io_submit_pt(io, pos, count); - if (IS_ERR(pt)) { - cl_io_iter_fini(env, io); - rc = PTR_ERR(pt); - break; - } - - *tail = pt; - tail = &pt->cip_next; - } else { - size_t nob = io->ci_nob; - - CDEBUG(D_VFSTRACE, - "execute type %u range: [%llu, %llu) nob: %zu %s\n", - io->ci_type, pos, pos + count, nob, - io->ci_continue ? "continue" : "stop"); + size_t nob; - rc = cl_io_lock(env, io); - if (rc) { - cl_io_iter_fini(env, io); - break; + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + ** - llite: ll_rw_stats_tally. 
+ */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); } - - /* - * Notify layers that locks has been taken, - * and do actual i/o. - * - * - llite: kms, short read; - * - llite: generic_file_read(); - */ - rc = cl_io_start(env, io); - - /* - * Send any remaining pending - * io, etc. - * - * - llite: ll_rw_stats_tally. - */ - cl_io_end(env, io); - cl_io_unlock(env, io); - - count = io->ci_nob - nob; - last_chunk_count = count; } - - cl_io_rw_advance(env, io, count); cl_io_iter_fini(env, io); - } while (!rc && io->ci_continue); - - CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n", - io->ci_type, io->ci_nob, rc, - io->ci_continue ? "continue" : "stop"); - - while (head != NULL) { - int rc2; - - pt = head; - head = head->cip_next; - - rc2 = cfs_ptask_wait_for(&pt->cip_task); - LASSERTF(!rc2, "wait for task error: %d\n", rc2); - - rc2 = cfs_ptask_result(&pt->cip_task); - CDEBUG(D_VFSTRACE, - "done %s range: [%llu, %llu) ret: %zd, rc: %d\n", - pt->cip_iot == CIT_READ ? "read" : "write", - pt->cip_pos, pt->cip_pos + pt->cip_count, - pt->cip_result, rc2); - if (rc2) - rc = rc ? rc : rc2; - if (!short_io) { - if (!rc2) /* IO is done by this task successfully */ - io->ci_nob += pt->cip_result; - if (pt->cip_result < pt->cip_count) { - /* short IO happened. - * Not necessary to be an error */ - CDEBUG(D_VFSTRACE, - "incomplete range: [%llu, %llu) " - "last_chunk_count: %zu\n", - pt->cip_pos, - pt->cip_pos + pt->cip_count, - last_chunk_count); - io->ci_nob -= last_chunk_count; - short_io = true; - } - } - OBD_FREE(pt, sizeof(*pt)); - } + } while (result == 0 && io->ci_continue); - CDEBUG(D_VFSTRACE, "return nob: %zu (%s io), rc: %d\n", - io->ci_nob, short_io ? "short" : "full", rc); + if (result == -EWOULDBLOCK && io->ci_ndelay) { + io->ci_need_restart = 1; + result = 0; + } - RETURN(rc < 0 ? rc : io->ci_result); + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); } EXPORT_SYMBOL(cl_io_loop); @@ -937,20 +811,20 @@ EXPORT_SYMBOL(cl_io_loop); * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() */ void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, - const struct cl_io_operations *ops) + struct cl_object *obj, + const struct cl_io_operations *ops) { struct list_head *linkage = &slice->cis_linkage; - LASSERT((linkage->prev == NULL && linkage->next == NULL) || + LASSERT((linkage->prev == NULL && linkage->next == NULL) || list_empty(linkage)); - ENTRY; + ENTRY; list_add_tail(linkage, &io->ci_layers); - slice->cis_io = io; - slice->cis_obj = obj; - slice->cis_iop = ops; - EXIT; + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; } EXPORT_SYMBOL(cl_io_slice_add); @@ -1145,6 +1019,7 @@ void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, cl_page_discard(env, io, page); EXIT; } +EXPORT_SYMBOL(cl_page_list_discard); /** * Initialize dual page queue. 
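The rewritten cl_io_loop() above reduces each iteration to a fixed phase order: iterator init, lock, start (the actual I/O), then end, unlock and advance in reverse, with iterator fini closing the round. The loop repeats while ci_continue is set, and an -EWOULDBLOCK result with ci_ndelay set is turned into a restart request rather than an error. Below is a generic sketch of that skeleton with stubbed phases; the io_* names are illustrative, not Lustre entry points.

#include <stdio.h>

struct io {
	long nob;		/* bytes transferred so far */
	int  continue_io;	/* another iteration wanted (ci_continue) */
	int  result;
};

static int  io_iter_init(struct io *io)	{ return 0; }
static int  io_lock(struct io *io)		{ return 0; }
static int  io_start(struct io *io)		{ io->nob += 4096; return 0; }
static void io_end(struct io *io)		{ }
static void io_unlock(struct io *io)		{ }
static void io_rw_advance(struct io *io, long nob) { /* move pos/count by nob */ }
static void io_iter_fini(struct io *io)		{ }

static int io_loop(struct io *io)
{
	int rc = 0;

	do {
		long nob = io->nob;

		io->continue_io = 0;
		rc = io_iter_init(io);
		if (rc == 0) {
			rc = io_lock(io);
			if (rc == 0) {
				rc = io_start(io);
				/* unwind in reverse even if start failed */
				io_end(io);
				io_unlock(io);
				io_rw_advance(io, io->nob - nob);
			}
		}
		io_iter_fini(io);
	} while (rc == 0 && io->continue_io);

	return rc < 0 ? rc : io->result;
}

int main(void)
{
	struct io io = { 0, 0, 0 };

	printf("rc %d, nob %ld\n", io_loop(&io), io.nob);
	return 0;
}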
diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c index e92dbaf4fda68..30c7186651dba 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_lock.c @@ -111,7 +111,10 @@ int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, INIT_LIST_HEAD(&lock->cll_layers); list_for_each_entry(scan, &obj->co_lu.lo_header->loh_layers, co_lu.lo_linkage) { - result = scan->co_ops->coo_lock_init(env, scan, lock, io); + if (scan->co_ops->coo_lock_init != NULL) + result = scan->co_ops->coo_lock_init(env, scan, lock, + io); + if (result != 0) { cl_lock_fini(env, lock); break; @@ -167,8 +170,8 @@ EXPORT_SYMBOL(cl_lock_cancel); int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, struct cl_lock *lock, struct cl_sync_io *anchor) { - const struct cl_lock_slice *slice; - int rc = -ENOSYS; + const struct cl_lock_slice *slice; + int rc = 0; ENTRY; @@ -200,7 +203,7 @@ int cl_lock_request(const struct lu_env *env, struct cl_io *io, if (rc < 0) RETURN(rc); - if ((enq_flags & CEF_ASYNC) && !(enq_flags & CEF_AGL)) { + if ((enq_flags & CEF_GLIMPSE) && !(enq_flags & CEF_SPECULATIVE)) { anchor = &cl_env_info(env)->clt_anchor; cl_sync_io_init(anchor, 1, cl_sync_io_end); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c index ddf97fc2cf057..5aa59de91b53e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -422,6 +422,24 @@ loff_t cl_object_maxbytes(struct cl_object *obj) } EXPORT_SYMBOL(cl_object_maxbytes); +int cl_object_flush(const struct lu_env *env, struct cl_object *obj, + struct ldlm_lock *lock) +{ + struct lu_object_header *top = obj->co_lu.lo_header; + int rc = 0; + ENTRY; + + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_object_flush) { + rc = obj->co_ops->coo_object_flush(env, obj, lock); + if (rc) + break; + } + } + RETURN(rc); +} +EXPORT_SYMBOL(cl_object_flush); + /** * Helper function removing all object locks, and marking object for * deletion. All object pages must have been deleted at this point. @@ -550,19 +568,16 @@ EXPORT_SYMBOL(cl_site_stats_print); /** * The most efficient way is to store cl_env pointer in task specific - * structures. On Linux, it wont' be easy to use task_struct->journal_info - * because Lustre code may call into other fs which has certain assumptions - * about journal_info. Currently following fields in task_struct are identified - * can be used for this purpose: - * - cl_env: for liblustre. - * - tux_info: ony on RedHat kernel. - * - ... + * structures. On Linux, it isn't easy to use task_struct->journal_info + * because Lustre code may call into other fs during memory reclaim, which + * has certain assumptions about journal_info. There are not currently any + * fields in task_struct that can be used for this purpose. * \note As long as we use task_struct to store cl_env, we assume that once * called into Lustre, we'll never call into the other part of the kernel * which will use those fields in task_struct without explicitly exiting * Lustre. 
* - * If there's no space in task_struct is available, hash will be used. + * Since there's no space in task_struct is available, hash will be used. * bz20044, bz22683. */ @@ -595,17 +610,20 @@ struct cl_env { void *ce_debug; }; +static void cl_env_inc(enum cache_stats_item item) +{ #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING -#define CL_ENV_INC(counter) atomic_inc(&cl_env_stats.cs_stats[CS_##counter]) - -#define CL_ENV_DEC(counter) do { \ - LASSERT(atomic_read(&cl_env_stats.cs_stats[CS_##counter]) > 0); \ - atomic_dec(&cl_env_stats.cs_stats[CS_##counter]); \ -} while (0) -#else -#define CL_ENV_INC(counter) -#define CL_ENV_DEC(counter) + atomic_inc(&cl_env_stats.cs_stats[item]); #endif +} + +static void cl_env_dec(enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + LASSERT(atomic_read(&cl_env_stats.cs_stats[item]) > 0); + atomic_dec(&cl_env_stats.cs_stats[item]); +#endif +} static void cl_env_init0(struct cl_env *cle, void *debug) { @@ -615,7 +633,7 @@ static void cl_env_init0(struct cl_env *cle, void *debug) cle->ce_ref = 1; cle->ce_debug = debug; - CL_ENV_INC(busy); + cl_env_inc(CS_busy); } static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) @@ -645,8 +663,8 @@ static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) OBD_SLAB_FREE_PTR(cle, cl_env_kmem); env = ERR_PTR(rc); } else { - CL_ENV_INC(create); - CL_ENV_INC(total); + cl_env_inc(CS_create); + cl_env_inc(CS_total); } } else env = ERR_PTR(-ENOMEM); @@ -655,10 +673,10 @@ static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) static void cl_env_fini(struct cl_env *cle) { - CL_ENV_DEC(total); - lu_context_fini(&cle->ce_lu.le_ctx); - lu_context_fini(&cle->ce_ses); - OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + cl_env_dec(CS_total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); } static struct lu_env *cl_env_obtain(void *debug) @@ -814,15 +832,15 @@ void cl_env_put(struct lu_env *env, __u16 *refcheck) if (--cle->ce_ref == 0) { int cpu = get_cpu(); - CL_ENV_DEC(busy); - cle->ce_debug = NULL; - cl_env_exit(cle); - /* - * Don't bother to take a lock here. - * - * Return environment to the cache only when it was allocated - * with the standard tags. - */ + cl_env_dec(CS_busy); + cle->ce_debug = NULL; + cl_env_exit(cle); + /* + * Don't bother to take a lock here. + * + * Return environment to the cache only when it was allocated + * with the standard tags. 
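The CL_ENV_INC/CL_ENV_DEC macros above become cl_env_inc()/cl_env_dec(): the helpers are always defined and their bodies compile away unless CONFIG_DEBUG_PAGESTATE_TRACKING is set, so call sites are type-checked in both configurations instead of expanding to nothing. A minimal sketch of the pattern follows; DEBUG_COUNTERS, bump() and drop() are illustrative stand-ins, and the kernel code uses atomic_t counters rather than plain ints.

#include <assert.h>
#include <stdio.h>

#define DEBUG_COUNTERS 1	/* stand-in for CONFIG_DEBUG_PAGESTATE_TRACKING */

static int stats[4];

static inline void bump(int item)
{
#if DEBUG_COUNTERS
	stats[item]++;
#endif
}

static inline void drop(int item)
{
#if DEBUG_COUNTERS
	assert(stats[item] > 0);
	stats[item]--;
#endif
}

int main(void)
{
	bump(0);
	bump(0);
	drop(0);
	printf("stats[0] = %d\n", stats[0]);	/* 1 while counters are compiled in */
	return 0;
}

With the macro form, a mistyped argument went unnoticed in non-debug builds; with the inline form the compiler still checks it even when the body is empty.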
+ */ if (cl_envs[cpu].cec_count < cl_envs_cached_max && (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD && (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) { @@ -844,13 +862,11 @@ EXPORT_SYMBOL(cl_env_put); */ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) { - ENTRY; lvb->lvb_size = attr->cat_size; lvb->lvb_mtime = attr->cat_mtime; lvb->lvb_atime = attr->cat_atime; lvb->lvb_ctime = attr->cat_ctime; lvb->lvb_blocks = attr->cat_blocks; - EXIT; } /** @@ -860,13 +876,11 @@ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) */ void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) { - ENTRY; attr->cat_size = lvb->lvb_size; attr->cat_mtime = lvb->lvb_mtime; attr->cat_atime = lvb->lvb_atime; attr->cat_ctime = lvb->lvb_ctime; attr->cat_blocks = lvb->lvb_blocks; - EXIT; } EXPORT_SYMBOL(cl_lvb2attr); @@ -951,7 +965,7 @@ void cl_env_percpu_put(struct lu_env *env) cle->ce_ref--; LASSERT(cle->ce_ref == 0); - CL_ENV_DEC(busy); + cl_env_dec(CS_busy); cle->ce_debug = NULL; put_cpu(); @@ -1043,8 +1057,6 @@ static struct lu_kmem_descr cl_object_caches[] = { } }; -struct cfs_ptask_engine *cl_io_engine; - /** * Global initialization of cl-data. Create kmem caches, register * lu_context_key's, etc. @@ -1072,17 +1084,8 @@ int cl_global_init(void) if (result) /* no cl_env_percpu_fini on error */ GOTO(out_keys, result); - cl_io_engine = cfs_ptengine_init("clio", cpu_online_mask); - if (IS_ERR(cl_io_engine)) { - result = PTR_ERR(cl_io_engine); - cl_io_engine = NULL; - GOTO(out_percpu, result); - } - return 0; -out_percpu: - cl_env_percpu_fini(); out_keys: lu_context_key_degister(&cl_key); out_kmem: @@ -1098,8 +1101,6 @@ int cl_global_init(void) */ void cl_global_fini(void) { - cfs_ptengine_fini(cl_io_engine); - cl_io_engine = NULL; cl_env_percpu_fini(); lu_context_key_degister(&cl_key); lu_kmem_fini(cl_object_caches); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c index 74f9225ec1d59..a1b1e130f31c6 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/cl_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,21 +74,37 @@ static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); #endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */ /* Disable page statistic by default due to huge performance penalty. 
*/ +static void cs_page_inc(const struct cl_object *obj, + enum cache_stats_item item) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]); +#endif +} + +static void cs_page_dec(const struct cl_object *obj, + enum cache_stats_item item) +{ #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING -#define CS_PAGE_INC(o, item) \ - atomic_inc(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) -#define CS_PAGE_DEC(o, item) \ - atomic_dec(&cl_object_site(o)->cs_pages.cs_stats[CS_##item]) -#define CS_PAGESTATE_INC(o, state) \ - atomic_inc(&cl_object_site(o)->cs_pages_state[state]) -#define CS_PAGESTATE_DEC(o, state) \ - atomic_dec(&cl_object_site(o)->cs_pages_state[state]) -#else -#define CS_PAGE_INC(o, item) -#define CS_PAGE_DEC(o, item) -#define CS_PAGESTATE_INC(o, state) -#define CS_PAGESTATE_DEC(o, state) + atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]); #endif +} + +static void cs_pagestate_inc(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_inc(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} + +static void cs_pagestate_dec(const struct cl_object *obj, + enum cl_page_state state) +{ +#ifdef CONFIG_DEBUG_PAGESTATE_TRACKING + atomic_dec(&cl_object_site(obj)->cs_pages_state[state]); +#endif +} /** * Internal version of cl_page_get(). @@ -126,7 +142,8 @@ cl_page_at_trusted(const struct cl_page *page, RETURN(NULL); } -static void cl_page_free(const struct lu_env *env, struct cl_page *page) +static void cl_page_free(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) { struct cl_object *obj = page->cp_obj; int pagesize = cl_object_header(obj)->coh_page_bufsize; @@ -143,10 +160,10 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page) struct cl_page_slice, cpl_linkage); list_del_init(page->cp_layers.next); if (unlikely(slice->cpl_ops->cpo_fini != NULL)) - slice->cpl_ops->cpo_fini(env, slice); + slice->cpl_ops->cpo_fini(env, slice, pvec); } - CS_PAGE_DEC(obj, total); - CS_PAGESTATE_DEC(obj, page->cp_state); + cs_page_dec(obj, CS_total); + cs_pagestate_dec(obj, page->cp_state); lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); cl_object_put(env, obj); lu_ref_fini(&page->cp_reference); @@ -196,16 +213,16 @@ struct cl_page *cl_page_alloc(const struct lu_env *env, ind); if (result != 0) { cl_page_delete0(env, page); - cl_page_free(env, page); + cl_page_free(env, page, NULL); page = ERR_PTR(result); break; } } } if (result == 0) { - CS_PAGE_INC(o, total); - CS_PAGE_INC(o, create); - CS_PAGESTATE_DEC(o, CPS_CACHED); + cs_page_inc(o, CS_total); + cs_page_inc(o, CS_create); + cs_pagestate_dec(o, CPS_CACHED); } } else { page = ERR_PTR(-ENOMEM); @@ -238,7 +255,7 @@ struct cl_page *cl_page_find(const struct lu_env *env, ENTRY; hdr = cl_object_header(o); - CS_PAGE_INC(o, lookup); + cs_page_inc(o, CS_lookup); CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); @@ -258,7 +275,7 @@ struct cl_page *cl_page_find(const struct lu_env *env, */ page = cl_vmpage_page(vmpage, o); if (page != NULL) { - CS_PAGE_INC(o, hit); + cs_page_inc(o, CS_hit); RETURN(page); } } @@ -328,8 +345,8 @@ static void cl_page_state_set0(const struct lu_env *env, PASSERT(env, page, page->cp_state == old); PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner != NULL)); - CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); - CS_PAGESTATE_INC(page->cp_obj, state); + cs_pagestate_dec(page->cp_obj, page->cp_state); + 
cs_pagestate_inc(page->cp_obj, state); cl_page_state_set_trust(page, state); EXIT; } @@ -357,15 +374,13 @@ void cl_page_get(struct cl_page *page) EXPORT_SYMBOL(cl_page_get); /** - * Releases a reference to a page. + * Releases a reference to a page, use the pagevec to release the pages + * in batch if provided. * - * When last reference is released, page is returned to the cache, unless it - * is in cl_page_state::CPS_FREEING state, in which case it is immediately - * destroyed. - * - * \see cl_object_put(), cl_lock_put(). + * Users need to do a final pagevec_release() to release any trailing pages. */ -void cl_page_put(const struct lu_env *env, struct cl_page *page) +void cl_pagevec_put(const struct lu_env *env, struct cl_page *page, + struct pagevec *pvec) { ENTRY; CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", @@ -381,11 +396,26 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page) * Page is no longer reachable by other threads. Tear * it down. */ - cl_page_free(env, page); + cl_page_free(env, page, pvec); } EXIT; } +EXPORT_SYMBOL(cl_pagevec_put); + +/** + * Releases a reference to a page, wrapper to cl_pagevec_put + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + cl_pagevec_put(env, page, NULL); +} EXPORT_SYMBOL(cl_page_put); /** @@ -788,6 +818,22 @@ int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) } EXPORT_SYMBOL(cl_page_is_vmlocked); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to) +{ + const struct cl_page_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_page_touch != NULL) + (*slice->cpl_ops->cpo_page_touch)(env, slice, to); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_touch); + static enum cl_page_state cl_req_type_state(enum cl_req_type crt) { ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c index b6576eb9b52e0..3cf9b86b2835a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/class_obd.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
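cl_page_put() above is now a thin wrapper around cl_pagevec_put(), which takes an optional pagevec so that callers tearing down many pages can defer the final release and issue a single pagevec_release() at the end. Here is a small sketch of that "optional batch" API shape; struct batch and the obj_* names are illustrative, not the Lustre types.

#include <stdio.h>

#define BATCH_MAX 8

struct batch {
	int n;
	int items[BATCH_MAX];
};

static void batch_flush(struct batch *b)
{
	if (b->n)
		printf("released %d objects in one go\n", b->n);
	b->n = 0;
}

static void obj_free(int id, struct batch *b)
{
	if (!b) {			/* immediate release */
		printf("released object %d\n", id);
		return;
	}
	b->items[b->n++] = id;		/* defer to the batch */
	if (b->n == BATCH_MAX)
		batch_flush(b);
}

/* wrapper equivalent to cl_page_put() calling cl_pagevec_put(..., NULL) */
static void obj_put(int id)
{
	obj_free(id, NULL);
}

int main(void)
{
	struct batch b = { 0 };
	int i;

	obj_put(42);			/* single release, no batching */

	for (i = 0; i < 5; i++)		/* bulk release through the batch */
		obj_free(i, &b);
	batch_flush(&b);		/* final flush, like pagevec_release() */
	return 0;
}

Passing a NULL batch keeps the old single-object behaviour, which is why existing cl_page_put() callers need no change.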
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,17 +42,16 @@ #include #include -#include +#include #include #include #include -#include #include #ifdef HAVE_SERVER_SUPPORT # include # include #endif /* HAVE_SERVER_SUPPORT */ -#include +#include #include "llog_internal.h" #ifdef CONFIG_PROC_FS @@ -70,6 +69,8 @@ unsigned int obd_dump_on_timeout; EXPORT_SYMBOL(obd_dump_on_timeout); unsigned int obd_dump_on_eviction; EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_lbug_on_eviction; +EXPORT_SYMBOL(obd_lbug_on_eviction); unsigned long obd_max_dirty_pages; EXPORT_SYMBOL(obd_max_dirty_pages); atomic_long_t obd_dirty_pages; @@ -97,92 +98,11 @@ EXPORT_SYMBOL(at_early_margin); int at_extra = 30; EXPORT_SYMBOL(at_extra); -atomic_long_t obd_dirty_transit_pages; -EXPORT_SYMBOL(obd_dirty_transit_pages); - -char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; - #ifdef CONFIG_PROC_FS struct lprocfs_stats *obd_memory = NULL; EXPORT_SYMBOL(obd_memory); #endif -char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; - -/* Get jobid of current process by reading the environment variable - * stored in between the "env_start" & "env_end" of task struct. - * - * TODO: - * It's better to cache the jobid for later use if there is any - * efficient way, the cl_env code probably could be reused for this - * purpose. - * - * If some job scheduler doesn't store jobid in the "env_start/end", - * then an upcall could be issued here to get the jobid by utilizing - * the userspace tools/api. Then, the jobid must be cached. - */ -int lustre_get_jobid(char *jobid) -{ - int jobid_len = LUSTRE_JOBID_SIZE; - char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 }; - int rc = 0; - ENTRY; - - /* Jobstats isn't enabled */ - if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) - GOTO(out, rc = 0); - - /* Whole node dedicated to single job */ - if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { - memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); - GOTO(out, rc = 0); - } - - /* Use process name + fsuid as jobid */ - if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", - current_comm(), - from_kuid(&init_user_ns, current_fsuid())); - GOTO(out, rc = 0); - } - - rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len); - if (rc) { - if (rc == -EOVERFLOW) { - /* For the PBS_JOBID and LOADL_STEP_ID keys (which are - * variable length strings instead of just numbers), it - * might make sense to keep the unique parts for JobID, - * instead of just returning an error. That means a - * larger temp buffer for cfs_get_environ(), then - * truncating the string at some separator to fit into - * the specified jobid_len. Fix later if needed. */ - static bool printed; - if (unlikely(!printed)) { - LCONSOLE_ERROR_MSG(0x16b, "%s value too large " - "for JobID buffer (%d)\n", - obd_jobid_var, jobid_len); - printed = true; - } - } else { - CDEBUG((rc == -ENOENT || rc == -EINVAL || - rc == -EDEADLK) ? D_INFO : D_ERROR, - "Get jobid for (%s) failed: rc = %d\n", - obd_jobid_var, rc); - } - } - -out: - if (rc != 0) - RETURN(rc); - - /* Only replace the job ID if it changed. 
*/ - if (strcmp(jobid, tmp_jobid) != 0) - memcpy(jobid, tmp_jobid, jobid_len); - - RETURN(0); -} -EXPORT_SYMBOL(lustre_get_jobid); - static int class_resolve_dev_name(__u32 len, const char *name) { int rc; @@ -212,6 +132,159 @@ static int class_resolve_dev_name(__u32 len, const char *name) RETURN(rc); } +#define OBD_MAX_IOCTL_BUFFER 8192 + +static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > BIT(30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > BIT(30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void __user *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int offset = 0; + + ENTRY; + if (copy_from_user(&hdr, arg, sizeof(hdr))) + RETURN(-EFAULT); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. 
LU-66 + */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (!*buf) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + if (copy_from_user(*buf, arg, hdr.ioc_len)) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EFAULT); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + + RETURN(0); +} +EXPORT_SYMBOL(obd_ioctl_getdata); + int class_handle_ioctl(unsigned int cmd, unsigned long arg) { char *buf = NULL; @@ -427,8 +500,57 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) RETURN(err); } /* class_handle_ioctl */ -#define OBD_INIT_CHECK -#ifdef OBD_INIT_CHECK +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + ENTRY; + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +struct miscdevice obd_psdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + static int obd_init_checks(void) { __u64 u64val, div64val; @@ -494,9 +616,6 @@ static int obd_init_checks(void) return ret; } -#else -#define obd_init_checks() do {} while(0) -#endif static int __init obdclass_init(void) { @@ -613,7 +732,6 @@ static int __init obdclass_init(void) lu_global_fini(); cleanup_class_procfs: - obd_sysctl_clean(); class_procfs_clean(); cleanup_caches: @@ -683,7 +801,6 @@ static void __exit obdclass_exit(void) lu_global_fini(); obd_cleanup_caches(); - obd_sysctl_clean(); class_procfs_clean(); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c index a48e7cbe7ec18..68952df7e1242 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/dt_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
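obd_ioctl_getdata() above follows a common pattern for variable-length ioctl arguments: copy only the fixed header first, check the version and bound ioc_len before allocating, copy the full buffer, and then cross-check the advertised inline lengths against what was actually received (obd_ioctl_is_invalid()). The following is a simplified userspace sketch of that two-stage validation, assuming a single inline buffer; fetch_user() stands in for copy_from_user() and all names here are illustrative.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_IOCTL_BUFFER 8192
#define IOCTL_VERSION    0x10004

struct ioc_hdr  { uint32_t version; uint32_t len; };
struct ioc_data { struct ioc_hdr hdr; uint32_t inllen1; };

static int fetch_user(void *dst, const void *user, size_t len)
{
	memcpy(dst, user, len);		/* kernel code would use copy_from_user() */
	return 0;
}

static int ioctl_getdata(char **buf, const void *arg)
{
	struct ioc_hdr hdr;
	struct ioc_data *data;

	if (fetch_user(&hdr, arg, sizeof(hdr)))
		return -EFAULT;
	if (hdr.version != IOCTL_VERSION)
		return -EINVAL;		/* userspace/kernel mismatch */
	if (hdr.len > MAX_IOCTL_BUFFER || hdr.len < sizeof(struct ioc_data))
		return -EINVAL;		/* bound the allocation by the header */

	*buf = malloc(hdr.len);
	if (!*buf)
		return -ENOMEM;
	if (fetch_user(*buf, arg, hdr.len)) {
		free(*buf);
		return -EFAULT;
	}

	data = (struct ioc_data *)*buf;
	/* the inline payload must fit in what the caller said it sent */
	if (sizeof(*data) + data->inllen1 > data->hdr.len) {
		free(*buf);
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	struct { struct ioc_data d; char payload[16]; } msg = {
		.d = { .hdr = { IOCTL_VERSION, sizeof(msg) },
		       .inllen1 = sizeof(msg.payload) },
	};
	char *buf = NULL;
	int rc = ioctl_getdata(&buf, &msg);

	printf("rc = %d\n", rc);	/* 0: header and lengths are consistent */
	free(buf);
	return 0;
}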
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,7 +40,7 @@ #define DEBUG_SUBSYSTEM S_CLASS #include -#include +#include #include /* fid_be_to_cpu() */ #include @@ -53,12 +53,13 @@ LU_KEY_INIT(dt_global, struct dt_thread_info); LU_KEY_FINI(dt_global, struct dt_thread_info); struct lu_context_key dt_key = { - .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, - .lct_init = dt_global_key_init, - .lct_fini = dt_global_key_fini + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini }; -/* no lock is necessary to protect the list, because call-backs +/* + * no lock is necessary to protect the list, because call-backs * are added during system startup. Please refer to "struct dt_device". */ void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) @@ -74,7 +75,7 @@ void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) EXPORT_SYMBOL(dt_txn_callback_del); int dt_txn_hook_start(const struct lu_env *env, - struct dt_device *dev, struct thandle *th) + struct dt_device *dev, struct thandle *th) { int rc = 0; struct dt_txn_callback *cb; @@ -89,9 +90,11 @@ int dt_txn_hook_start(const struct lu_env *env, !(cb->dtc_tag & env->le_ctx.lc_tags)) continue; - /* Usually dt_txn_hook_start is called from bottom device, + /* + * Usually dt_txn_hook_start is called from bottom device, * and if the thandle has th_top, then we need use top - * thandle for the callback in the top thandle layer */ + * thandle for the callback in the top thandle layer + */ if (th->th_top != NULL) dtc_th = th->th_top; @@ -105,9 +108,9 @@ EXPORT_SYMBOL(dt_txn_hook_start); int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) { - struct dt_device *dev = th->th_dev; + struct dt_device *dev = th->th_dev; struct dt_txn_callback *cb; - int rc = 0; + int rc = 0; if (th->th_local) return 0; @@ -122,9 +125,11 @@ int dt_txn_hook_stop(const struct lu_env *env, struct thandle *th) !(cb->dtc_tag & env->le_ctx.lc_tags)) continue; - /* Usually dt_txn_hook_stop is called from bottom device, + /* + * Usually dt_txn_hook_stop is called from bottom device, * and if the thandle has th_top, then we need use top - * thandle for the callback in the top thandle layer */ + * thandle for the callback in the top thandle layer + */ if (th->th_top != NULL) dtc_th = th->th_top; @@ -145,53 +150,53 @@ EXPORT_SYMBOL(dt_device_init); void dt_device_fini(struct dt_device *dev) { - lu_device_fini(&dev->dd_lu_dev); + lu_device_fini(&dev->dd_lu_dev); } EXPORT_SYMBOL(dt_device_fini); int dt_object_init(struct dt_object *obj, - struct lu_object_header *h, struct lu_device *d) + struct lu_object_header *h, struct lu_device *d) { - return lu_object_init(&obj->do_lu, h, d); + return lu_object_init(&obj->do_lu, h, d); } EXPORT_SYMBOL(dt_object_init); void dt_object_fini(struct dt_object *obj) { - lu_object_fini(&obj->do_lu); + lu_object_fini(&obj->do_lu); } EXPORT_SYMBOL(dt_object_fini); int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) { - if (obj->do_index_ops == NULL) - obj->do_ops->do_index_try(env, obj, &dt_directory_features); - return obj->do_index_ops != NULL; + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; } EXPORT_SYMBOL(dt_try_as_dir); enum dt_format_type dt_mode_to_dft(__u32 mode) { - enum dt_format_type result; - - switch (mode & S_IFMT) { - case S_IFDIR: - result = DFT_DIR; - break; - case 
S_IFREG: - result = DFT_REGULAR; - break; - case S_IFLNK: - result = DFT_SYM; - break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - result = DFT_NODE; - break; - default: + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: LASSERTF(0, "invalid mode %o\n", mode); result = 0; /* Just for satisfying compiler. */ break; @@ -214,8 +219,10 @@ int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, } EXPORT_SYMBOL(dt_lookup_dir); -/* this differs from dt_locate by top_dev as parameter - * but not one from lu_site */ +/* + * this differs from dt_locate by top_dev as parameter + * but not one from lu_site + */ struct dt_object *dt_locate_at(const struct lu_env *env, struct dt_device *dev, const struct lu_fid *fid, @@ -236,6 +243,7 @@ struct dt_object *dt_locate_at(const struct lu_env *env, return container_of0(n, struct dt_object, do_lu); } + lu_object_put(env, lo); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(dt_locate_at); @@ -269,28 +277,28 @@ static int dt_find_entry(const struct lu_env *env, const char *entry, * path component to \a entry_func. */ int dt_path_parser(const struct lu_env *env, - char *path, dt_entry_func_t entry_func, - void *data) + char *path, dt_entry_func_t entry_func, + void *data) { - char *e; - int rc = 0; - - while (1) { - e = strsep(&path, "/"); - if (e == NULL) - break; - - if (e[0] == 0) { - if (!path || path[0] == '\0') - break; - continue; - } - rc = entry_func(env, e, data); - if (rc) - break; - } - - return rc; + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; } struct dt_object * @@ -298,51 +306,50 @@ dt_store_resolve(const struct lu_env *env, struct dt_device *dt, const char *path, struct lu_fid *fid) { struct dt_thread_info *info = dt_info(env); - struct dt_find_hint *dfh = &info->dti_dfh; - struct dt_object *obj; - int result; + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + int result; - dfh->dfh_dt = dt; - dfh->dfh_fid = fid; + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; strlcpy(info->dti_buf, path, sizeof(info->dti_buf)); - result = dt->dd_ops->dt_root_get(env, dt, fid); - if (result == 0) { - obj = dt_locate(env, dt, fid); - if (!IS_ERR(obj)) { - dfh->dfh_o = obj; + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; result = dt_path_parser(env, info->dti_buf, dt_find_entry, dfh); - if (result != 0) - obj = ERR_PTR(result); - else - obj = dfh->dfh_o; - } - } else { - obj = ERR_PTR(result); - } - return obj; + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; } static struct dt_object *dt_reg_open(const struct lu_env *env, - struct dt_device *dt, - struct dt_object *p, - const char *name, - struct lu_fid *fid) + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) { - struct dt_object *o; - int result; + struct dt_object *o; + int result; - result = dt_lookup_dir(env, p, name, fid); - if (result == 0){ - o = dt_locate(env, dt, fid); - } - else - o = ERR_PTR(result); + result 
= dt_lookup_dir(env, p, name, fid); + if (result == 0) + o = dt_locate(env, dt, fid); + else + o = ERR_PTR(result); - return o; + return o; } /** @@ -369,47 +376,47 @@ struct dt_object *dt_store_open(const struct lu_env *env, struct dt_device *dt, } struct dt_object *dt_find_or_create(const struct lu_env *env, - struct dt_device *dt, - const struct lu_fid *fid, - struct dt_object_format *dof, - struct lu_attr *at) + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) { - struct dt_object *dto; - struct thandle *th; - int rc; + struct dt_object *dto; + struct thandle *th; + int rc; - ENTRY; + ENTRY; - dto = dt_locate(env, dt, fid); - if (IS_ERR(dto)) - RETURN(dto); + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); - LASSERT(dto != NULL); - if (dt_object_exists(dto)) - RETURN(dto); + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); - th = dt_trans_create(env, dt); - if (IS_ERR(th)) - GOTO(out, rc = PTR_ERR(th)); + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); - rc = dt_declare_create(env, dto, at, NULL, dof, th); - if (rc) - GOTO(trans_stop, rc); + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); - rc = dt_trans_start_local(env, dt, th); - if (rc) - GOTO(trans_stop, rc); + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); - dt_write_lock(env, dto, 0); - if (dt_object_exists(dto)) - GOTO(unlock, rc = 0); + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); - CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); - rc = dt_create(env, dto, at, NULL, dof, th); - if (rc) + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) GOTO(unlock, rc); - LASSERT(dt_object_exists(dto)); + LASSERT(dt_object_exists(dto)); unlock: dt_write_unlock(env, dto); trans_stop: @@ -427,16 +434,16 @@ EXPORT_SYMBOL(dt_find_or_create); /* dt class init function. */ int dt_global_init(void) { - int result; + int result; - LU_CONTEXT_KEY_INIT(&dt_key); - result = lu_context_key_register(&dt_key); - return result; + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; } void dt_global_fini(void) { - lu_context_key_degister(&dt_key); + lu_context_key_degister(&dt_key); } /** @@ -451,7 +458,7 @@ void dt_global_fini(void) * \retval -ve errno on failure */ int dt_read(const struct lu_env *env, struct dt_object *dt, - struct lu_buf *buf, loff_t *pos) + struct lu_buf *buf, loff_t *pos) { LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); return dt->do_body_ops->dbo_read(env, dt, buf, pos); @@ -486,7 +493,7 @@ int dt_record_read(const struct lu_env *env, struct dt_object *dt, EXPORT_SYMBOL(dt_record_read); int dt_record_write(const struct lu_env *env, struct dt_object *dt, - const struct lu_buf *buf, loff_t *pos, struct thandle *th) + const struct lu_buf *buf, loff_t *pos, struct thandle *th) { ssize_t size; @@ -495,7 +502,7 @@ int dt_record_write(const struct lu_env *env, struct dt_object *dt, LASSERT(dt->do_body_ops); LASSERT(dt->do_body_ops->dbo_write); - size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, 1); + size = dt->do_body_ops->dbo_write(env, dt, buf, pos, th); if (size < 0) return size; return (size == (ssize_t)buf->lb_len) ? 
0 : -EFAULT; @@ -503,53 +510,53 @@ int dt_record_write(const struct lu_env *env, struct dt_object *dt, EXPORT_SYMBOL(dt_record_write); int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, - struct thandle *th) + struct thandle *th) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; - LASSERT(o); - vbuf.lb_buf = NULL; - vbuf.lb_len = sizeof(dt_obj_version_t); - return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); } EXPORT_SYMBOL(dt_declare_version_set); void dt_version_set(const struct lu_env *env, struct dt_object *o, - dt_obj_version_t version, struct thandle *th) + dt_obj_version_t version, struct thandle *th) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; - int rc; + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; - LASSERT(o); - vbuf.lb_buf = &version; - vbuf.lb_len = sizeof(version); + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); rc = dt_xattr_set(env, o, &vbuf, xname, 0, th); - if (rc < 0) - CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); - return; + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; } EXPORT_SYMBOL(dt_version_set); dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) { - struct lu_buf vbuf; - char *xname = XATTR_NAME_VERSION; - dt_obj_version_t version; - int rc; - - LASSERT(o); - vbuf.lb_buf = &version; - vbuf.lb_len = sizeof(version); + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); rc = dt_xattr_get(env, o, &vbuf, xname); - if (rc != sizeof(version)) { - CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); - version = 0; - } - return version; + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; } EXPORT_SYMBOL(dt_version_get); @@ -568,8 +575,8 @@ const struct dt_index_features dt_lfsck_layout_orphan_features = { .dif_flags = 0, .dif_keysize_min = sizeof(struct lu_fid), .dif_keysize_max = sizeof(struct lu_fid), - .dif_recsize_min = sizeof(struct lu_orphan_rec_v2), - .dif_recsize_max = sizeof(struct lu_orphan_rec_v2), + .dif_recsize_min = sizeof(struct lu_orphan_rec_v3), + .dif_recsize_max = sizeof(struct lu_orphan_rec_v3), .dif_ptrsize = 4 }; EXPORT_SYMBOL(dt_lfsck_layout_orphan_features); @@ -642,8 +649,10 @@ const struct dt_index_features dt_nodemap_features = { }; EXPORT_SYMBOL(dt_nodemap_features); -/* helper function returning what dt_index_features structure should be used - * based on the FID sequence. This is used by OBD_IDX_READ RPC */ +/* + * helper function returning what dt_index_features structure should be used + * based on the FID sequence. 
This is used by OBD_IDX_READ RPC + */ static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, __u32 mode) { @@ -689,11 +698,15 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, size_t nob, const struct dt_it_ops *iops, struct dt_it *it, __u32 attr, void *arg) { - struct idx_info *ii = (struct idx_info *)arg; - struct lu_idxpage *lip = &lp->lp_idx; - char *entry; - size_t size; - int rc; + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + __u64 hash; + __u16 hashsize = 0; + __u16 keysize = 0; + __u16 recsize; + int rc; + ENTRY; if (nob < LIP_HDR_SIZE) @@ -704,20 +717,12 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, lip->lip_magic = LIP_MAGIC; nob -= LIP_HDR_SIZE; - /* compute size needed to store a key/record pair */ - size = ii->ii_recsize + ii->ii_keysize; - if ((ii->ii_flags & II_FL_NOHASH) == 0) - /* add hash if the client wants it */ - size += sizeof(__u64); + /* client wants to the 64-bit hash value associated with each record */ + if (!(ii->ii_flags & II_FL_NOHASH)) + hashsize = sizeof(hash); entry = lip->lip_entries; do { - char *tmp_entry = entry; - struct dt_key *key; - __u64 hash; - __u16 keysize; - __u16 recsize; - /* fetch 64-bit hash value */ hash = iops->store(env, it); ii->ii_hash_end = hash; @@ -727,56 +732,54 @@ static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, GOTO(out, rc = 0); } - if (nob < size) { - if (lip->lip_nr == 0) + if (!(ii->ii_flags & II_FL_NOKEY)) { + keysize = iops->key_size(env, it); + if (!(ii->ii_flags & II_FL_VARKEY) && + keysize != ii->ii_keysize) { + CERROR("keysize mismatch %hu != %hu.\n", + keysize, ii->ii_keysize); GOTO(out, rc = -EINVAL); - GOTO(out, rc = 0); - } - - if (!(ii->ii_flags & II_FL_NOHASH)) { - /* client wants to the 64-bit hash value associated with - * each record */ - memcpy(tmp_entry, &hash, sizeof(hash)); - tmp_entry += sizeof(hash); + } } - if (ii->ii_flags & II_FL_VARKEY) - keysize = iops->key_size(env, it); + /* and finally the record */ + if (ii->ii_flags & II_FL_VARREC) + recsize = iops->rec_size(env, it, attr); else - keysize = ii->ii_keysize; + recsize = ii->ii_recsize; - if (!(ii->ii_flags & II_FL_NOKEY)) { - /* then the key value */ - key = iops->key(env, it); - memcpy(tmp_entry, key, keysize); - tmp_entry += keysize; + if (nob < hashsize + keysize + recsize) { + if (lip->lip_nr == 0) + GOTO(out, rc = -E2BIG); + GOTO(out, rc = 0); } - /* and finally the record */ - rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); - if (rc != -ESTALE) { - if (rc != 0) - GOTO(out, rc); - + rc = iops->rec(env, it, + (struct dt_rec *)(entry + hashsize + keysize), + attr); + if (!rc) { + if (hashsize) + memcpy(entry, &hash, hashsize); + if (keysize) { + struct dt_key *key; + + key = iops->key(env, it); + memcpy(entry + hashsize, key, keysize); + } /* hash/key/record successfully copied! 
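The reworked dt_index_page_build() above computes the hash, key and record sizes per entry and copies an entry only when all three still fit in the remaining container space, returning -E2BIG only if not even the first entry fits. Below is a standalone sketch of that packing loop over variable-size entries; pack_entries() and the sample records are illustrative.

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct rec { unsigned long hash; const char *key; const char *val; };

/* returns the number of records packed, or -E2BIG if none fit */
static int pack_entries(char *buf, size_t nob, const struct rec *r, int nr)
{
	int packed = 0;

	for (; packed < nr; packed++, r++) {
		size_t hashsize = sizeof(r->hash);
		size_t keysize  = strlen(r->key) + 1;
		size_t recsize  = strlen(r->val) + 1;

		if (nob < hashsize + keysize + recsize) {
			if (packed == 0)
				return -E2BIG;	/* too small for even one entry */
			break;			/* page full, stop without error */
		}

		memcpy(buf, &r->hash, hashsize);
		memcpy(buf + hashsize, r->key, keysize);
		memcpy(buf + hashsize + keysize, r->val, recsize);
		buf += hashsize + keysize + recsize;
		nob -= hashsize + keysize + recsize;
	}
	return packed;
}

int main(void)
{
	char page[64];
	struct rec recs[] = {
		{ 1, "alpha", "one" }, { 2, "beta", "two" }, { 3, "gamma", "three" },
	};

	printf("packed %d records\n", pack_entries(page, sizeof(page), recs, 3));
	return 0;
}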
*/ lip->lip_nr++; if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) ii->ii_hash_start = hash; - - if (ii->ii_flags & II_FL_VARREC) - recsize = iops->rec_size(env, it, attr); - else - recsize = ii->ii_recsize; - - entry = tmp_entry + recsize; - nob -= size; + entry += hashsize + keysize + recsize; + nob -= hashsize + keysize + recsize; + } else if (rc != -ESTALE) { + GOTO(out, rc); } /* move on to the next record */ do { rc = iops->next(env, it); } while (rc == -ESTALE); - } while (rc == 0); GOTO(out, rc); @@ -809,10 +812,10 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, const struct lu_rdpg *rdpg, dt_index_page_build_t filler, void *arg) { - struct dt_it *it; - const struct dt_it_ops *iops; - size_t pageidx, nob, nlupgs = 0; - int rc; + struct dt_it *it; + const struct dt_it_ops *iops; + size_t pageidx, nob, nlupgs = 0; + int rc; ENTRY; LASSERT(rdpg->rp_pages != NULL); @@ -853,13 +856,15 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, GOTO(out, rc); } - /* Fill containers one after the other. There might be multiple + /* + * Fill containers one after the other. There might be multiple * containers per physical page. * * At this point and across for-loop: * rc == 0 -> ok, proceed. * rc > 0 -> end of index. - * rc < 0 -> error. */ + * rc < 0 -> error. + */ for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { union lu_page *lp; int i; @@ -915,8 +920,10 @@ int dt_index_read(const struct lu_env *env, struct dt_device *dev, int rc; ENTRY; - /* rp_count shouldn't be null and should be a multiple of the container - * size */ + /* + * rp_count shouldn't be null and should be a multiple of the container + * size + */ if (rdpg->rp_count == 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) RETURN(-EFAULT); @@ -1077,3 +1084,221 @@ int lprocfs_dt_filesfree_seq_show(struct seq_file *m, void *v) EXPORT_SYMBOL(lprocfs_dt_filesfree_seq_show); #endif /* CONFIG_PROC_FS */ + +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lu_device *lu = dt2lu_dev(dt); + + if (!lu->ld_obd) + return -ENODEV; + + return sprintf(buf, "%s\n", lu->ld_obd->obd_uuid.uuid); +} +LUSTRE_RO_ATTR(uuid); + +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%u\n", (unsigned) osfs.os_bsize); +} +LUSTRE_RO_ATTR(blocksize); + +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytestotal); + +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} 
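kbytestotal_show() and the other kbytes attributes above avoid a 64-bit multiply and divide by expressing the block size in KiB and doubling the block count once for every remaining factor of two. A worked sketch of that conversion, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

static uint64_t blocks_to_kbytes(uint64_t blocks, uint32_t bsize)
{
	uint32_t blk_size = bsize >> 10;	/* block size in KiB, e.g. 4096 -> 4 */
	uint64_t result = blocks;

	while (blk_size >>= 1)			/* one doubling per factor of two */
		result <<= 1;

	return result;
}

int main(void)
{
	/* 1,000,000 blocks of 4 KiB -> 4,000,000 KiB */
	printf("%llu KiB\n",
	       (unsigned long long)blocks_to_kbytes(1000000ULL, 4096));
	return 0;
}

The shift trick only works for power-of-two block sizes of at least 1 KiB; smaller or non-power-of-two sizes would need a real multiplication.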
+LUSTRE_RO_ATTR(kbytesfree); + +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + u32 blk_size; + u64 result; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + blk_size = osfs.os_bsize >> 10; + result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + return sprintf(buf, "%llu\n", result); +} +LUSTRE_RO_ATTR(kbytesavail); + +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_files); +} +LUSTRE_RO_ATTR(filestotal); + +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct obd_statfs osfs; + int rc; + + rc = dt_statfs(NULL, dt, &osfs); + if (rc) + return rc; + + return sprintf(buf, "%llu\n", osfs.os_ffree); +} +LUSTRE_RO_ATTR(filesfree); + +static const struct attribute *dt_def_attrs[] = { + &lustre_attr_uuid.attr, + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + NULL, +}; + +static void dt_sysfs_release(struct kobject *kobj) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + + complete(&dt->dd_kobj_unregister); +} + +int dt_tunables_fini(struct dt_device *dt) +{ + if (!dt) + return -EINVAL; + + if (!IS_ERR_OR_NULL(dt->dd_debugfs_entry)) + ldebugfs_remove(&dt->dd_debugfs_entry); + + if (dt->dd_def_attrs) + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + + kobject_put(&dt->dd_kobj); + wait_for_completion(&dt->dd_kobj_unregister); + + return 0; +} +EXPORT_SYMBOL(dt_tunables_fini); + +int dt_tunables_init(struct dt_device *dt, struct obd_type *type, + const char *name, struct ldebugfs_vars *list) +{ + int rc; + + dt->dd_ktype.sysfs_ops = &lustre_sysfs_ops; + dt->dd_ktype.release = dt_sysfs_release; + + init_completion(&dt->dd_kobj_unregister); + rc = kobject_init_and_add(&dt->dd_kobj, &dt->dd_ktype, type->typ_kobj, + "%s", name); + if (rc) + return rc; + + dt->dd_def_attrs = dt_def_attrs; + + rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); + if (rc) { + kobject_put(&dt->dd_kobj); + return rc; + } + + /* + * No need to register debugfs if no enteries. This allows us to + * choose between using dt_device or obd_device for debugfs. + */ + if (!list) + return rc; + + dt->dd_debugfs_entry = ldebugfs_register(name, + type->typ_debugfs_entry, + list, dt); + if (IS_ERR_OR_NULL(dt->dd_debugfs_entry)) { + rc = dt->dd_debugfs_entry ? PTR_ERR(dt->dd_debugfs_entry) + : -ENOMEM; + CERROR("%s: error %d setting up debugfs\n", + name, rc); + dt->dd_debugfs_entry = NULL; + sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); + kobject_put(&dt->dd_kobj); + return rc; + } + + return rc; +} +EXPORT_SYMBOL(dt_tunables_init); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/genops.c b/drivers/staging/lustrefsx/lustre/obdclass/genops.c index 2c8e4db905d01..bd9330daafd8a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/genops.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/genops.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. 
All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,8 +38,10 @@ #define DEBUG_SUBSYSTEM S_CLASS #include -#include +#include +#include #include +#include #include #include #include @@ -50,15 +52,9 @@ DEFINE_RWLOCK(obd_dev_lock); static struct obd_device *obd_devs[MAX_OBD_DEVICES]; static struct kmem_cache *obd_device_cachep; -struct kmem_cache *obdo_cachep; -EXPORT_SYMBOL(obdo_cachep); -static struct kmem_cache *import_cachep; -static LIST_HEAD(obd_zombie_imports); -static LIST_HEAD(obd_zombie_exports); -static DEFINE_SPINLOCK(obd_zombie_impexp_lock); +static struct workqueue_struct *zombie_wq; -static void obd_zombie_impexp_notify(void); static void obd_zombie_export_add(struct obd_export *exp); static void obd_zombie_import_add(struct obd_import *imp); static void print_export_data(struct obd_export *exp, @@ -162,18 +158,57 @@ void class_put_type(struct obd_type *type) spin_unlock(&type->obd_type_lock); } +static void class_sysfs_release(struct kobject *kobj) +{ + OBD_FREE(kobj, sizeof(*kobj)); +} + +static struct kobj_type class_ktype = { + .sysfs_ops = &lustre_sysfs_ops, + .release = class_sysfs_release, +}; + +struct kobject *class_setup_tunables(const char *name) +{ + struct kobject *kobj; + int rc; + +#ifdef HAVE_SERVER_SUPPORT + kobj = kset_find_obj(lustre_kset, name); + if (kobj) + return kobj; +#endif + OBD_ALLOC(kobj, sizeof(*kobj)); + if (!kobj) + return ERR_PTR(-ENOMEM); + + kobj->kset = lustre_kset; + kobject_init(kobj, &class_ktype); + rc = kobject_add(kobj, &lustre_kset->kobj, "%s", name); + if (rc) { + kobject_put(kobj); + return ERR_PTR(rc); + } + return kobj; +} +EXPORT_SYMBOL(class_setup_tunables); + #define CLASS_MAX_NAME 1024 -int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, - bool enable_proc, struct lprocfs_vars *vars, +int class_register_type(const struct obd_ops *dt_ops, + const struct md_ops *md_ops, + bool enable_proc, struct ldebugfs_vars *vars, const char *name, struct lu_device_type *ldt) { - struct obd_type *type; - int rc = 0; - ENTRY; + struct obd_type *type; +#ifdef HAVE_SERVER_SUPPORT + struct qstr dname; +#endif /* HAVE_SERVER_SUPPORT */ + int rc = 0; - /* sanity check */ - LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + ENTRY; + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); if (class_search_type(name)) { CDEBUG(D_IOCTL, "Type %s already registered\n", name); @@ -205,7 +240,7 @@ int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, if (enable_proc) { type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, - vars, type); + NULL, type); if (IS_ERR(type->typ_procroot)) { rc = PTR_ERR(type->typ_procroot); type->typ_procroot = NULL; @@ -213,20 +248,57 @@ int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, } } #endif - if (ldt != NULL) { - type->typ_lu = ldt; - rc = lu_device_type_init(ldt); - if (rc != 0) - GOTO (failed, rc); - } +#ifdef HAVE_SERVER_SUPPORT + dname.name = name; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, dname.name, + dname.len); + type->typ_debugfs_entry = d_lookup(debugfs_lustre_root, &dname); + if (type->typ_debugfs_entry) { + dput(type->typ_debugfs_entry); + type->typ_sym_filter = true; + goto dir_exist; + } +#endif /* HAVE_SERVER_SUPPORT */ + + type->typ_debugfs_entry = ldebugfs_register(type->typ_name, + 
debugfs_lustre_root, + vars, type); + if (IS_ERR_OR_NULL(type->typ_debugfs_entry)) { + rc = type->typ_debugfs_entry ? PTR_ERR(type->typ_debugfs_entry) + : -ENOMEM; + type->typ_debugfs_entry = NULL; + GOTO(failed, rc); + } +#ifdef HAVE_SERVER_SUPPORT +dir_exist: +#endif + type->typ_kobj = class_setup_tunables(type->typ_name); + if (IS_ERR(type->typ_kobj)) + GOTO(failed, rc = PTR_ERR(type->typ_kobj)); + + if (ldt) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc) { + kobject_put(type->typ_kobj); + GOTO(failed, rc); + } + } spin_lock(&obd_types_lock); list_add(&type->typ_chain, &obd_types); spin_unlock(&obd_types_lock); - RETURN (0); + RETURN(0); failed: +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + type->typ_debugfs_entry = NULL; +#endif + if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) + ldebugfs_remove(&type->typ_debugfs_entry); if (type->typ_name != NULL) { #ifdef CONFIG_PROC_FS if (type->typ_procroot != NULL) @@ -262,6 +334,8 @@ int class_unregister_type(const char *name) RETURN(-EBUSY); } + kobject_put(type->typ_kobj); + /* we do not use type->typ_procroot as for compatibility purposes * other modules can share names (i.e. lod can use lov entry). so * we can't reference pointer as it can get invalided when another @@ -272,6 +346,13 @@ int class_unregister_type(const char *name) if (type->typ_procsym != NULL) lprocfs_remove(&type->typ_procsym); #endif +#ifdef HAVE_SERVER_SUPPORT + if (type->typ_sym_filter) + type->typ_debugfs_entry = NULL; +#endif + if (!IS_ERR_OR_NULL(type->typ_debugfs_entry)) + ldebugfs_remove(&type->typ_debugfs_entry); + if (type->typ_lu) lu_device_type_fini(type->typ_lu); @@ -291,22 +372,20 @@ EXPORT_SYMBOL(class_unregister_type); /** * Create a new obd device. * - * Find an empty slot in ::obd_devs[], create a new obd device in it. + * Allocate the new obd_device and initialize it. * * \param[in] type_name obd device type string. * \param[in] name obd device name. + * \param[in] uuid obd device UUID * - * \retval NULL if create fails, otherwise return the obd device - * pointer created. 
+ * \retval newdev pointer to created obd_device + * \retval ERR_PTR(errno) on error */ -struct obd_device *class_newdev(const char *type_name, const char *name) +struct obd_device *class_newdev(const char *type_name, const char *name, + const char *uuid) { - struct obd_device *result = NULL; struct obd_device *newdev; struct obd_type *type = NULL; - int i; - int new_obd_minor = 0; - bool retried = false; ENTRY; if (strlen(name) >= MAX_OBD_NAME) { @@ -321,106 +400,197 @@ struct obd_device *class_newdev(const char *type_name, const char *name) } newdev = obd_device_alloc(); - if (newdev == NULL) - GOTO(out_type, result = ERR_PTR(-ENOMEM)); - + if (newdev == NULL) { + class_put_type(type); + RETURN(ERR_PTR(-ENOMEM)); + } LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1); + newdev->obd_type = type; + newdev->obd_minor = -1; + + rwlock_init(&newdev->obd_pool_lock); + newdev->obd_pool_limit = 0; + newdev->obd_pool_slv = 0; + + INIT_LIST_HEAD(&newdev->obd_exports); + INIT_LIST_HEAD(&newdev->obd_unlinked_exports); + INIT_LIST_HEAD(&newdev->obd_delayed_exports); + INIT_LIST_HEAD(&newdev->obd_exports_timed); + INIT_LIST_HEAD(&newdev->obd_nid_stats); + spin_lock_init(&newdev->obd_nid_lock); + spin_lock_init(&newdev->obd_dev_lock); + mutex_init(&newdev->obd_dev_mutex); + spin_lock_init(&newdev->obd_osfs_lock); + /* newdev->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + newdev->obd_osfs_age = ktime_get_seconds() - 1000; + + /* XXX belongs in setup not attach */ + init_rwsem(&newdev->obd_observer_link_sem); + /* recovery data */ + spin_lock_init(&newdev->obd_recovery_task_lock); + init_waitqueue_head(&newdev->obd_next_transno_waitq); + init_waitqueue_head(&newdev->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&newdev->obd_req_replay_queue); + INIT_LIST_HEAD(&newdev->obd_lock_replay_queue); + INIT_LIST_HEAD(&newdev->obd_final_req_queue); + INIT_LIST_HEAD(&newdev->obd_evict_list); + INIT_LIST_HEAD(&newdev->obd_lwp_list); + + llog_group_init(&newdev->obd_olg); + /* Detach drops this */ + atomic_set(&newdev->obd_refcount, 1); + lu_ref_init(&newdev->obd_reference); + lu_ref_add(&newdev->obd_reference, "newdev", newdev); + + newdev->obd_conn_inprogress = 0; + + strncpy(newdev->obd_uuid.uuid, uuid, UUID_MAX); + + CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n", + newdev->obd_name, newdev); + + return newdev; +} - again: - write_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd && (strcmp(name, obd->obd_name) == 0)) { +/** + * Free obd device. + * + * \param[in] obd obd_device to be freed + * + * \retval none + */ +void class_free_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; - if (!retried) { - write_unlock(&obd_dev_lock); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x " + "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERTF(atomic_read(&obd->obd_refcount) == 0, + "obd_refcount should be 0, not %d\n", + atomic_read(&obd->obd_refcount)); + LASSERT(obd_type != NULL); - /* the obd_device could be waited to be - * destroyed by the "obd_zombie_impexp_thread". 
- */ - obd_zombie_barrier(); - retried = true; - goto again; - } + CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n", + obd->obd_name, obd->obd_type->typ_name); - CERROR("Device %s already exists at %d, won't add\n", - name, i); - if (result) { - LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, - "%p obd_magic %08x != %08x\n", result, - result->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(result->obd_minor == new_obd_minor, - "%p obd_minor %d != %d\n", result, - result->obd_minor, new_obd_minor); - - obd_devs[result->obd_minor] = NULL; - result->obd_name[0]='\0'; - } - result = ERR_PTR(-EEXIST); - break; - } - if (!result && !obd) { - result = newdev; - result->obd_minor = i; - new_obd_minor = i; - result->obd_type = type; - strncpy(result->obd_name, name, - sizeof(result->obd_name) - 1); - obd_devs[i] = result; - } - } - write_unlock(&obd_dev_lock); - - if (result == NULL && i >= class_devno_max()) { - CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", - class_devno_max()); - GOTO(out, result = ERR_PTR(-EOVERFLOW)); - } + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + if (obd->obd_stopping) { + int err; - if (IS_ERR(result)) - GOTO(out, result); + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } - CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", - result->obd_name, result); + obd_device_free(obd); - RETURN(result); -out: - obd_device_free(newdev); -out_type: - class_put_type(type); - return result; + class_put_type(obd_type); } -void class_release_dev(struct obd_device *obd) +/** + * Unregister obd device. + * + * Free slot in obd_dev[] used by \a obd. + * + * \param[in] new_obd obd_device to be unregistered + * + * \retval none + */ +void class_unregister_device(struct obd_device *obd) { - struct obd_type *obd_type = obd->obd_type; - - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", - obd, obd->obd_minor, obd_devs[obd->obd_minor]); - LASSERT(obd_type != NULL); + write_lock(&obd_dev_lock); + if (obd->obd_minor >= 0) { + LASSERT(obd_devs[obd->obd_minor] == obd); + obd_devs[obd->obd_minor] = NULL; + obd->obd_minor = -1; + } + write_unlock(&obd_dev_lock); +} - CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", - obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); +/** + * Register obd device. + * + * Find free slot in obd_devs[], fills it with \a new_obd. + * + * \param[in] new_obd obd_device to be registered + * + * \retval 0 success + * \retval -EEXIST device with this name is registered + * \retval -EOVERFLOW obd_devs[] is full + */ +int class_register_device(struct obd_device *new_obd) +{ + int ret = 0; + int i; + int new_obd_minor = 0; + bool minor_assign = false; + bool retried = false; +again: write_lock(&obd_dev_lock); - obd_devs[obd->obd_minor] = NULL; + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd != NULL && + (strcmp(new_obd->obd_name, obd->obd_name) == 0)) { + + if (!retried) { + write_unlock(&obd_dev_lock); + + /* the obd_device could be waited to be + * destroyed by the "obd_zombie_impexp_thread". 
+ */ + obd_zombie_barrier(); + retried = true; + goto again; + } + + CERROR("%s: already exists, won't add\n", + obd->obd_name); + /* in case we found a free slot before duplicate */ + minor_assign = false; + ret = -EEXIST; + break; + } + if (!minor_assign && obd == NULL) { + new_obd_minor = i; + minor_assign = true; + } + } + + if (minor_assign) { + new_obd->obd_minor = new_obd_minor; + LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] " + "%p\n", new_obd_minor, obd_devs[new_obd_minor]); + obd_devs[new_obd_minor] = new_obd; + } else { + if (ret == 0) { + ret = -EOVERFLOW; + CERROR("%s: all %u/%u devices used, increase " + "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name, + i, class_devno_max(), ret); + } + } write_unlock(&obd_dev_lock); - obd_device_free(obd); - class_put_type(obd_type); + RETURN(ret); } -int class_name2dev(const char *name) +static int class_name2dev_nolock(const char *name) { int i; if (!name) return -1; - read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); @@ -429,17 +599,30 @@ int class_name2dev(const char *name) out any references */ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); if (obd->obd_attached) { - read_unlock(&obd_dev_lock); return i; } break; } } - read_unlock(&obd_dev_lock); return -1; } +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + i = class_name2dev_nolock(name); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_name2dev); + struct obd_device *class_name2obd(const char *name) { int dev = class_name2dev(name); @@ -450,25 +633,34 @@ struct obd_device *class_name2obd(const char *name) } EXPORT_SYMBOL(class_name2obd); -int class_uuid2dev(struct obd_uuid *uuid) +int class_uuid2dev_nolock(struct obd_uuid *uuid) { int i; - read_lock(&obd_dev_lock); for (i = 0; i < class_devno_max(); i++) { struct obd_device *obd = class_num2obd(i); if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - read_unlock(&obd_dev_lock); return i; } } - read_unlock(&obd_dev_lock); return -1; } +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + i = class_uuid2dev_nolock(uuid); + read_unlock(&obd_dev_lock); + + return i; +} +EXPORT_SYMBOL(class_uuid2dev); + struct obd_device *class_uuid2obd(struct obd_uuid *uuid) { int dev = class_uuid2dev(uuid); @@ -506,6 +698,40 @@ struct obd_device *class_num2obd(int num) return obd; } +/** + * Find obd in obd_dev[] by name or uuid. + * + * Increment obd's refcount if found. + * + * \param[in] str obd name or uuid + * + * \retval NULL if not found + * \retval target pointer to found obd_device + */ +struct obd_device *class_dev_by_str(const char *str) +{ + struct obd_device *target = NULL; + struct obd_uuid tgtuuid; + int rc; + + obd_str2uuid(&tgtuuid, str); + + read_lock(&obd_dev_lock); + rc = class_uuid2dev_nolock(&tgtuuid); + if (rc < 0) + rc = class_name2dev_nolock(str); + + if (rc >= 0) + target = class_num2obd(rc); + + if (target != NULL) + class_incref(target, "find", current); + read_unlock(&obd_dev_lock); + + RETURN(target); +} +EXPORT_SYMBOL(class_dev_by_str); + /** * Get obd devices count. 
Device in any * state are counted @@ -675,14 +901,6 @@ void obd_cleanup_caches(void) kmem_cache_destroy(obd_device_cachep); obd_device_cachep = NULL; } - if (obdo_cachep) { - kmem_cache_destroy(obdo_cachep); - obdo_cachep = NULL; - } - if (import_cachep) { - kmem_cache_destroy(import_cachep); - import_cachep = NULL; - } EXIT; } @@ -699,19 +917,6 @@ int obd_init_caches(void) if (!obd_device_cachep) GOTO(out, rc = -ENOMEM); - LASSERT(obdo_cachep == NULL); - obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), - 0, 0, NULL); - if (!obdo_cachep) - GOTO(out, rc = -ENOMEM); - - LASSERT(import_cachep == NULL); - import_cachep = kmem_cache_create("ll_import_cache", - sizeof(struct obd_import), - 0, 0, NULL); - if (!import_cachep) - GOTO(out, rc = -ENOMEM); - RETURN(0); out: obd_cleanup_caches(); @@ -748,18 +953,6 @@ struct obd_device *class_exp2obd(struct obd_export *exp) } EXPORT_SYMBOL(class_exp2obd); -struct obd_device *class_conn2obd(struct lustre_handle *conn) -{ - struct obd_export *export; - export = class_conn2export(conn); - if (export) { - struct obd_device *obd = export->exp_obd; - class_export_put(export); - return obd; - } - return NULL; -} - struct obd_import *class_exp2cliimp(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; @@ -769,14 +962,6 @@ struct obd_import *class_exp2cliimp(struct obd_export *exp) } EXPORT_SYMBOL(class_exp2cliimp); -struct obd_import *class_conn2cliimp(struct lustre_handle *conn) -{ - struct obd_device *obd = class_conn2obd(conn); - if (obd == NULL) - return NULL; - return obd->u.cli.cl_import; -} - /* Export management functions */ static void class_export_destroy(struct obd_export *exp) { @@ -798,7 +983,10 @@ static void class_export_destroy(struct obd_export *exp) LASSERT(list_empty(&exp->exp_req_replay_queue)); LASSERT(list_empty(&exp->exp_hp_rpcs)); obd_destroy_export(exp); - class_decref(obd, "export", exp); + /* self export doesn't hold a reference to an obd, although it + * exists until freeing of the obd */ + if (exp != obd->obd_self_export) + class_decref(obd, "export", exp); OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); EXIT; @@ -831,24 +1019,46 @@ void class_export_put(struct obd_export *exp) atomic_read(&exp->exp_refcount) - 1); if (atomic_dec_and_test(&exp->exp_refcount)) { - LASSERT(!list_empty(&exp->exp_obd_chain)); - LASSERT(list_empty(&exp->exp_stale_list)); + struct obd_device *obd = exp->exp_obd; + CDEBUG(D_IOCTL, "final put %p/%s\n", exp, exp->exp_client_uuid.uuid); /* release nid stat refererence */ lprocfs_exp_cleanup(exp); - obd_zombie_export_add(exp); + if (exp == obd->obd_self_export) { + /* self export should be destroyed without + * zombie thread as it doesn't hold a + * reference to obd and doesn't hold any + * resources */ + class_export_destroy(exp); + /* self export is destroyed, no class + * references exist and it is safe to free + * obd */ + class_free_dev(obd); + } else { + LASSERT(!list_empty(&exp->exp_obd_chain)); + obd_zombie_export_add(exp); + } + } } EXPORT_SYMBOL(class_export_put); +static void obd_zombie_exp_cull(struct work_struct *ws) +{ + struct obd_export *export; + + export = container_of(ws, struct obd_export, exp_zombie_work); + class_export_destroy(export); +} + /* Creates a new export, adds it to the hash table, and returns a * pointer to it. The refcount is 2: one for the hash reference, and * one for the pointer returned by this function. 
*/ -struct obd_export *class_new_export(struct obd_device *obd, - struct obd_uuid *cluuid) +struct obd_export *__class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid, bool is_self) { struct obd_export *export; struct cfs_hash *hash = NULL; @@ -862,6 +1072,7 @@ struct obd_export *class_new_export(struct obd_device *obd, export->exp_conn_cnt = 0; export->exp_lock_hash = NULL; export->exp_flock_hash = NULL; + /* 2 = class_handle_hash + last */ atomic_set(&export->exp_refcount, 2); atomic_set(&export->exp_rpc_count, 0); atomic_set(&export->exp_cb_count, 0); @@ -876,11 +1087,11 @@ struct obd_export *class_new_export(struct obd_device *obd, spin_lock_init(&export->exp_uncommitted_replies_lock); INIT_LIST_HEAD(&export->exp_uncommitted_replies); INIT_LIST_HEAD(&export->exp_req_replay_queue); - INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD_RCU(&export->exp_handle.h_link); INIT_LIST_HEAD(&export->exp_hp_rpcs); INIT_LIST_HEAD(&export->exp_reg_rpcs); class_handle_hash(&export->exp_handle, &export_handle_ops); - export->exp_last_request_time = cfs_time_current_sec(); + export->exp_last_request_time = ktime_get_real_seconds(); spin_lock_init(&export->exp_lock); spin_lock_init(&export->exp_rpc_lock); INIT_HLIST_NODE(&export->exp_uuid_hash); @@ -889,23 +1100,24 @@ struct obd_export *class_new_export(struct obd_device *obd, spin_lock_init(&export->exp_bl_list_lock); INIT_LIST_HEAD(&export->exp_bl_list); INIT_LIST_HEAD(&export->exp_stale_list); + INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); export->exp_sp_peer = LUSTRE_SP_ANY; export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; export->exp_client_uuid = *cluuid; obd_init_export(export); - spin_lock(&obd->obd_dev_lock); - /* shouldn't happen, but might race */ - if (obd->obd_stopping) - GOTO(exit_unlock, rc = -ENODEV); + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); - hash = cfs_hash_getref(obd->obd_uuid_hash); - if (hash == NULL) - GOTO(exit_unlock, rc = -ENODEV); - spin_unlock(&obd->obd_dev_lock); + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); - if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); if (rc != 0) { LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", @@ -917,17 +1129,24 @@ struct obd_export *class_new_export(struct obd_device *obd, at_init(&export->exp_bl_lock_at, obd_timeout, 0); spin_lock(&obd->obd_dev_lock); if (obd->obd_stopping) { - cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); - GOTO(exit_unlock, rc = -ENODEV); + if (hash) + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ESHUTDOWN); } - class_incref(obd, "export", export); - list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); - list_add_tail(&export->exp_obd_chain_timed, - &export->exp_obd->obd_exports_timed); - export->exp_obd->obd_num_exports++; + if (!is_self) { + class_incref(obd, "export", export); + list_add_tail(&export->exp_obd_chain_timed, + &obd->obd_exports_timed); + list_add(&export->exp_obd_chain, &obd->obd_exports); + obd->obd_num_exports++; + } else { + INIT_LIST_HEAD(&export->exp_obd_chain_timed); + INIT_LIST_HEAD(&export->exp_obd_chain); + } spin_unlock(&obd->obd_dev_lock); - cfs_hash_putref(hash); + if (hash) + cfs_hash_putref(hash); RETURN(export); exit_unlock: @@ -941,12 +1160,29 @@ struct 
obd_export *class_new_export(struct obd_device *obd, OBD_FREE_PTR(export); return ERR_PTR(rc); } + +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, false); +} EXPORT_SYMBOL(class_new_export); +struct obd_export *class_new_export_self(struct obd_device *obd, + struct obd_uuid *uuid) +{ + return __class_new_export(obd, uuid, true); +} + void class_unlink_export(struct obd_export *exp) { class_handle_unhash(&exp->exp_handle); + if (exp->exp_obd->obd_self_export == exp) { + class_export_put(exp); + return; + } + spin_lock(&exp->exp_obd->obd_dev_lock); /* delete an uuid-export hashitem from hashtables */ if (!hlist_unhashed(&exp->exp_uuid_hash)) @@ -981,7 +1217,7 @@ void class_unlink_export(struct obd_export *exp) EXPORT_SYMBOL(class_unlink_export); /* Import management functions */ -static void class_import_destroy(struct obd_import *imp) +static void obd_zombie_import_free(struct obd_import *imp) { ENTRY; @@ -1003,21 +1239,13 @@ static void class_import_destroy(struct obd_import *imp) } LASSERT(imp->imp_sec == NULL); + LASSERTF(atomic_read(&imp->imp_reqs) == 0, "%s: imp_reqs = %d\n", + imp->imp_obd->obd_name, atomic_read(&imp->imp_reqs)); class_decref(imp->imp_obd, "import", imp); - OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); - EXIT; -} - -static void import_handle_addref(void *import) -{ - class_import_get(import); + OBD_FREE_PTR(imp); + EXIT; } -static struct portals_handle_ops import_handle_ops = { - .hop_addref = import_handle_addref, - .hop_free = NULL, -}; - struct obd_import *class_import_get(struct obd_import *import) { atomic_inc(&import->imp_refcount); @@ -1032,7 +1260,6 @@ void class_import_put(struct obd_import *imp) { ENTRY; - LASSERT(list_empty(&imp->imp_zombie_chain)); LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, @@ -1044,8 +1271,6 @@ void class_import_put(struct obd_import *imp) obd_zombie_import_add(imp); } - /* catch possible import put race */ - LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); EXIT; } EXPORT_SYMBOL(class_import_put); @@ -1062,6 +1287,14 @@ static void init_imp_at(struct imp_at *at) { } } +static void obd_zombie_imp_cull(struct work_struct *ws) +{ + struct obd_import *import; + + import = container_of(ws, struct obd_import, imp_zombie_work); + obd_zombie_import_free(import); +} + struct obd_import *class_new_import(struct obd_device *obd) { struct obd_import *imp; @@ -1072,7 +1305,6 @@ struct obd_import *class_new_import(struct obd_device *obd) return NULL; INIT_LIST_HEAD(&imp->imp_pinger_chain); - INIT_LIST_HEAD(&imp->imp_zombie_chain); INIT_LIST_HEAD(&imp->imp_replay_list); INIT_LIST_HEAD(&imp->imp_sending_list); INIT_LIST_HEAD(&imp->imp_delayed_list); @@ -1086,20 +1318,21 @@ struct obd_import *class_new_import(struct obd_device *obd) imp->imp_obd = class_incref(obd, "import", imp); mutex_init(&imp->imp_sec_mutex); init_waitqueue_head(&imp->imp_recovery_waitq); + INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); - if (curr_pid_ns->child_reaper) + if (curr_pid_ns && curr_pid_ns->child_reaper) imp->imp_sec_refpid = curr_pid_ns->child_reaper->pid; else imp->imp_sec_refpid = 1; atomic_set(&imp->imp_refcount, 2); atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_reqs, 0); atomic_set(&imp->imp_inflight, 0); atomic_set(&imp->imp_replay_inflight, 0); + init_waitqueue_head(&imp->imp_replay_waitq); atomic_set(&imp->imp_inval_count, 0); INIT_LIST_HEAD(&imp->imp_conn_list); - 
INIT_LIST_HEAD(&imp->imp_handle.h_link); - class_handle_hash(&imp->imp_handle, &import_handle_ops); init_imp_at(&imp->imp_at); /* the default magic is V2, will be used in connect RPC, and @@ -1115,8 +1348,6 @@ void class_destroy_import(struct obd_import *import) LASSERT(import != NULL); LASSERT(import != LP_POISON); - class_handle_unhash(&import->imp_handle); - spin_lock(&import->imp_lock); import->imp_generation++; spin_unlock(&import->imp_lock); @@ -1329,7 +1560,7 @@ static void class_disconnect_export_list(struct list_head *list, class_export_get(exp); CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " - "last request at %ld\n", + "last request at %lld\n", exp->exp_obd->obd_name, obd_export_nid2str(exp), exp, exp->exp_last_request_time); /* release one export reference anyway */ @@ -1399,13 +1630,12 @@ void class_disconnect_stale_exports(struct obd_device *obd, spin_unlock(&exp->exp_lock); list_move(&exp->exp_obd_chain, &work_list); - evicted++; - CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", - obd->obd_name, exp->exp_client_uuid.uuid, - exp->exp_connection == NULL ? "" : - libcfs_nid2str(exp->exp_connection->c_peer.nid)); - print_export_data(exp, "EVICTING", 0, D_HA); - } + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp)); + print_export_data(exp, "EVICTING", 0, D_HA); + } spin_unlock(&obd->obd_dev_lock); if (evicted) @@ -1456,15 +1686,6 @@ void class_fail_export(struct obd_export *exp) } EXPORT_SYMBOL(class_fail_export); -char *obd_export_nid2str(struct obd_export *exp) -{ - if (exp->exp_connection != NULL) - return libcfs_nid2str(exp->exp_connection->c_peer.nid); - - return "(no nid)"; -} -EXPORT_SYMBOL(obd_export_nid2str); - int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) { struct cfs_hash *nid_hash; @@ -1602,10 +1823,6 @@ void dump_exports(struct obd_device *obd, int locks, int debug_level) list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) print_export_data(exp, "DELAYED", locks, debug_level); spin_unlock(&obd->obd_dev_lock); - spin_lock(&obd_zombie_impexp_lock); - list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) - print_export_data(exp, "ZOMBIE", locks, debug_level); - spin_unlock(&obd_zombie_impexp_lock); } void obd_exports_barrier(struct obd_device *obd) @@ -1632,83 +1849,6 @@ void obd_exports_barrier(struct obd_device *obd) } EXPORT_SYMBOL(obd_exports_barrier); -/* Total amount of zombies to be destroyed */ -static int zombies_count = 0; - -/** - * kill zombie imports and exports - */ -void obd_zombie_impexp_cull(void) -{ - struct obd_import *import; - struct obd_export *export; - ENTRY; - - do { - spin_lock(&obd_zombie_impexp_lock); - - import = NULL; - if (!list_empty(&obd_zombie_imports)) { - import = list_entry(obd_zombie_imports.next, - struct obd_import, - imp_zombie_chain); - list_del_init(&import->imp_zombie_chain); - } - - export = NULL; - if (!list_empty(&obd_zombie_exports)) { - export = list_entry(obd_zombie_exports.next, - struct obd_export, - exp_obd_chain); - list_del_init(&export->exp_obd_chain); - } - - spin_unlock(&obd_zombie_impexp_lock); - - if (import != NULL) { - class_import_destroy(import); - spin_lock(&obd_zombie_impexp_lock); - zombies_count--; - spin_unlock(&obd_zombie_impexp_lock); - } - - if (export != NULL) { - class_export_destroy(export); - spin_lock(&obd_zombie_impexp_lock); - zombies_count--; - spin_unlock(&obd_zombie_impexp_lock); - } - - cond_resched(); - } while (import != NULL || export != NULL); - 
EXIT; -} - -static DECLARE_COMPLETION(obd_zombie_start); -static DECLARE_COMPLETION(obd_zombie_stop); -static unsigned long obd_zombie_flags; -static DECLARE_WAIT_QUEUE_HEAD(obd_zombie_waitq); -static pid_t obd_zombie_pid; - -enum { - OBD_ZOMBIE_STOP = 0x0001, -}; - -/** - * check for work for kill zombie import/export thread. - */ -static int obd_zombie_impexp_check(void *arg) -{ - int rc; - - spin_lock(&obd_zombie_impexp_lock); - rc = (zombies_count == 0) && - !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); - spin_unlock(&obd_zombie_impexp_lock); - - RETURN(rc); -} - /** * Add export to the obd_zombe thread and notify it. */ @@ -1718,12 +1858,8 @@ static void obd_zombie_export_add(struct obd_export *exp) { LASSERT(!list_empty(&exp->exp_obd_chain)); list_del_init(&exp->exp_obd_chain); spin_unlock(&exp->exp_obd->obd_dev_lock); - spin_lock(&obd_zombie_impexp_lock); - zombies_count++; - list_add(&exp->exp_obd_chain, &obd_zombie_exports); - spin_unlock(&obd_zombie_impexp_lock); - obd_zombie_impexp_notify(); + queue_work(zombie_wq, &exp->exp_zombie_work); } /** @@ -1731,40 +1867,8 @@ static void obd_zombie_export_add(struct obd_export *exp) { */ static void obd_zombie_import_add(struct obd_import *imp) { LASSERT(imp->imp_sec == NULL); - spin_lock(&obd_zombie_impexp_lock); - LASSERT(list_empty(&imp->imp_zombie_chain)); - zombies_count++; - list_add(&imp->imp_zombie_chain, &obd_zombie_imports); - spin_unlock(&obd_zombie_impexp_lock); - obd_zombie_impexp_notify(); -} - -/** - * notify import/export destroy thread about new zombie. - */ -static void obd_zombie_impexp_notify(void) -{ - /* - * Make sure obd_zomebie_impexp_thread get this notification. - * It is possible this signal only get by obd_zombie_barrier, and - * barrier gulps this notification and sleeps away and hangs ensues - */ - wake_up_all(&obd_zombie_waitq); -} - -/** - * check whether obd_zombie is idle - */ -static int obd_zombie_is_idle(void) -{ - int rc; - - LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); - spin_lock(&obd_zombie_impexp_lock); - rc = (zombies_count == 0); - spin_unlock(&obd_zombie_impexp_lock); - return rc; + queue_work(zombie_wq, &imp->imp_zombie_work); } /** @@ -1772,12 +1876,7 @@ static int obd_zombie_is_idle(void) */ void obd_zombie_barrier(void) { - struct l_wait_info lwi = { 0 }; - - if (obd_zombie_pid == current_pid()) - /* don't wait for myself */ - return; - l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); + flush_workqueue(zombie_wq); } EXPORT_SYMBOL(obd_zombie_barrier); @@ -1852,58 +1951,24 @@ void obd_stale_export_adjust(struct obd_export *exp) } EXPORT_SYMBOL(obd_stale_export_adjust); -/** - * destroy zombie export/import thread. - */ -static int obd_zombie_impexp_thread(void *unused) -{ - unshare_fs_struct(); - complete(&obd_zombie_start); - - obd_zombie_pid = current_pid(); - - while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { - struct l_wait_info lwi = { 0 }; - - l_wait_event(obd_zombie_waitq, - !obd_zombie_impexp_check(NULL), &lwi); - obd_zombie_impexp_cull(); - - /* - * Notify obd_zombie_barrier callers that queues - * may be empty. 
- */ - wake_up(&obd_zombie_waitq); - } - - complete(&obd_zombie_stop); - - RETURN(0); -} - - /** * start destroy zombie import/export thread */ int obd_zombie_impexp_init(void) { - struct task_struct *task; - - task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); - if (IS_ERR(task)) - RETURN(PTR_ERR(task)); + zombie_wq = alloc_workqueue("obd_zombid", 0, 0); + if (!zombie_wq) + return -ENOMEM; - wait_for_completion(&obd_zombie_start); - RETURN(0); + return 0; } + /** * stop destroy zombie import/export thread */ void obd_zombie_impexp_stop(void) { - set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); - obd_zombie_impexp_notify(); - wait_for_completion(&obd_zombie_stop); + destroy_workqueue(zombie_wq); LASSERT(list_empty(&obd_stale_exports)); } @@ -1989,14 +2054,14 @@ int obd_get_request_slot(struct client_obd *cli) int rc; spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) { - cli->cl_r_in_flight++; + if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_rpcs_in_flight++; spin_unlock(&cli->cl_loi_list_lock); return 0; } init_waitqueue_head(&orsw.orsw_waitq); - list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list); + list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters); orsw.orsw_signaled = false; spin_unlock(&cli->cl_loi_list_lock); @@ -2012,7 +2077,7 @@ int obd_get_request_slot(struct client_obd *cli) if (rc != 0) { if (!orsw.orsw_signaled) { if (list_empty(&orsw.orsw_entry)) - cli->cl_r_in_flight--; + cli->cl_rpcs_in_flight--; else list_del(&orsw.orsw_entry); } @@ -2034,15 +2099,15 @@ void obd_put_request_slot(struct client_obd *cli) struct obd_request_slot_waiter *orsw; spin_lock(&cli->cl_loi_list_lock); - cli->cl_r_in_flight--; + cli->cl_rpcs_in_flight--; /* If there is free slot, wakeup the first waiter. */ - if (!list_empty(&cli->cl_loi_read_list) && - likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) { - orsw = list_entry(cli->cl_loi_read_list.next, + if (!list_empty(&cli->cl_flight_waiters) && + likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_entry(cli->cl_flight_waiters.next, struct obd_request_slot_waiter, orsw_entry); list_del_init(&orsw->orsw_entry); - cli->cl_r_in_flight++; + cli->cl_rpcs_in_flight++; wake_up(&orsw->orsw_waitq); } spin_unlock(&cli->cl_loi_list_lock); @@ -2061,20 +2126,21 @@ int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) __u32 old; int diff; int i; - char *typ_name; int rc; if (max > OBD_MAX_RIF_MAX || max < 1) return -ERANGE; - typ_name = cli->cl_import->imp_obd->obd_type->typ_name; - if (strcmp(typ_name, LUSTRE_MDC_NAME) == 0) { + CDEBUG(D_INFO, "%s: max = %hu max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, + cli->cl_max_mod_rpcs_in_flight, cli->cl_max_rpcs_in_flight); + + if (strcmp(cli->cl_import->imp_obd->obd_type->typ_name, + LUSTRE_MDC_NAME) == 0) { /* adjust max_mod_rpcs_in_flight to ensure it is always * strictly lower that max_rpcs_in_flight */ if (max < 2) { - CERROR("%s: cannot set max_rpcs_in_flight to 1 " - "because it must be higher than " - "max_mod_rpcs_in_flight value", + CERROR("%s: cannot set mdc.*.max_rpcs_in_flight=1\n", cli->cl_import->imp_obd->obd_name); return -ERANGE; } @@ -2088,17 +2154,19 @@ int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) spin_lock(&cli->cl_loi_list_lock); old = cli->cl_max_rpcs_in_flight; cli->cl_max_rpcs_in_flight = max; + client_adjust_max_dirty(cli); + diff = max - old; /* We increase the max_rpcs_in_flight, then wakeup some waiters. 
*/ for (i = 0; i < diff; i++) { - if (list_empty(&cli->cl_loi_read_list)) + if (list_empty(&cli->cl_flight_waiters)) break; - orsw = list_entry(cli->cl_loi_read_list.next, + orsw = list_entry(cli->cl_flight_waiters.next, struct obd_request_slot_waiter, orsw_entry); list_del_init(&orsw->orsw_entry); - cli->cl_r_in_flight++; + cli->cl_rpcs_in_flight++; wake_up(&orsw->orsw_waitq); } spin_unlock(&cli->cl_loi_list_lock); @@ -2115,32 +2183,50 @@ EXPORT_SYMBOL(obd_get_max_mod_rpcs_in_flight); int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) { - struct obd_connect_data *ocd; + struct obd_connect_data *ocd; __u16 maxmodrpcs; __u16 prev; if (max > OBD_MAX_RIF_MAX || max < 1) return -ERANGE; - /* cannot exceed or equal max_rpcs_in_flight */ + ocd = &cli->cl_import->imp_connect_data; + CDEBUG(D_INFO, "%s: max = %hu flags = %llx, max_mod = %u rif = %u\n", + cli->cl_import->imp_obd->obd_name, max, ocd->ocd_connect_flags, + ocd->ocd_maxmodrpcs, cli->cl_max_rpcs_in_flight); + + if (max == OBD_MAX_RIF_MAX) + max = OBD_MAX_RIF_MAX - 1; + + /* Cannot exceed or equal max_rpcs_in_flight. If we are asked to + * increase this value, also bump up max_rpcs_in_flight to match. + */ if (max >= cli->cl_max_rpcs_in_flight) { - CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " - "higher or equal to max_rpcs_in_flight value (%u)\n", - cli->cl_import->imp_obd->obd_name, - max, cli->cl_max_rpcs_in_flight); - return -ERANGE; + CDEBUG(D_INFO, + "%s: increasing max_rpcs_in_flight=%hu to allow larger max_mod_rpcs_in_flight=%u\n", + cli->cl_import->imp_obd->obd_name, max + 1, max); + obd_set_max_rpcs_in_flight(cli, max + 1); } - /* cannot exceed max modify RPCs in flight supported by the server */ - ocd = &cli->cl_import->imp_connect_data; - if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) + /* cannot exceed max modify RPCs in flight supported by the server, + * but verify ocd_connect_flags is at least initialized first. If + * not, allow it and fix value later in ptlrpc_connect_set_flags(). + */ + if (!ocd->ocd_connect_flags) { + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + } else if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) { maxmodrpcs = ocd->ocd_maxmodrpcs; - else + if (maxmodrpcs == 0) { /* connection not finished yet */ + maxmodrpcs = cli->cl_max_rpcs_in_flight - 1; + CDEBUG(D_INFO, + "%s: partial connect, assume maxmodrpcs=%hu\n", + cli->cl_import->imp_obd->obd_name, maxmodrpcs); + } + } else { maxmodrpcs = 1; + } if (max > maxmodrpcs) { - CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) " - "higher than max_mod_rpcs_per_client value (%hu) " - "returned by the server at connection\n", + CERROR("%s: can't set max_mod_rpcs_in_flight=%hu higher than ocd_maxmodrpcs=%hu returned by the server at connection\n", cli->cl_import->imp_obd->obd_name, max, maxmodrpcs); return -ERANGE; @@ -2161,8 +2247,6 @@ int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) } EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); - -#define pct(a, b) (b ? 
a * 100 / b : 0) int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq) { @@ -2188,7 +2272,7 @@ int obd_mod_rpc_stats_seq_show(struct client_obd *cli, for (i = 0; i < OBD_HIST_MAX; i++) { unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; mod_cum += mod; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu\n", + seq_printf(seq, "%d:\t\t%10lu %3u %3u\n", i, mod, pct(mod, mod_tot), pct(mod_cum, mod_tot)); if (mod_cum == mod_tot) @@ -2200,8 +2284,6 @@ int obd_mod_rpc_stats_seq_show(struct client_obd *cli, return 0; } EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); -#undef pct - /* The number of modify RPCs sent in parallel is limited * because the server has a finite number of slots per client to @@ -2243,7 +2325,7 @@ static inline bool obd_skip_mod_rpc_slot(const struct lookup_intent *it) if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || it->it_op == IT_READDIR || - (it->it_op == IT_LAYOUT && !(it->it_flags & FMODE_WRITE)))) + (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) return true; return false; } @@ -2297,8 +2379,9 @@ __u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, "opc %u, max %hu\n", cli->cl_import->imp_obd->obd_name, opc, max); - l_wait_event(cli->cl_mod_rpcs_waitq, - obd_mod_rpc_slot_avail(cli, close_req), &lwi); + l_wait_event_exclusive(cli->cl_mod_rpcs_waitq, + obd_mod_rpc_slot_avail(cli, close_req), + &lwi); } while (true); } EXPORT_SYMBOL(obd_get_mod_rpc_slot); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c index b45c6d6a55357..1fcbb2a839f9d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/idmap.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/idmap.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,15 +47,6 @@ #include #include -#define lustre_get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) - -#define lustre_put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - /* * groups_search() is copied from linux kernel! * A simple bsearch. 
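The hunk above and the one that follows are two halves of the same cleanup: the lustre_get/put_group_info() wrapper macros are removed here, and lustre_in_group_p() in the next hunk open-codes the identical group_info reference counting. Shown together for clarity (a before/after sketch drawn from the surrounding hunks, not additional patch content):

/* before: refcount handling hidden behind the removed macros */
lustre_get_group_info(group_info);
rc = lustre_groups_search(group_info, grp);
lustre_put_group_info(group_info);

/* after: the equivalent open-coded form used in lustre_in_group_p() */
atomic_inc(&group_info->usage);
rc = lustre_groups_search(group_info, grp);
if (atomic_dec_and_test(&group_info->usage))
	groups_free(group_info);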
@@ -110,12 +101,12 @@ EXPORT_SYMBOL(lustre_groups_from_list); /* a simple shell-metzner sort */ void lustre_groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; + int base, max, stride; + int gidsetsize = group_info->ngroups; - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; while (stride) { max = gidsetsize - stride; @@ -162,9 +153,10 @@ int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) if (!group_info) return 0; - lustre_get_group_info(group_info); + atomic_inc(&group_info->usage); rc = lustre_groups_search(group_info, grp); - lustre_put_group_info(group_info); + if (atomic_dec_and_test(&group_info->usage)) + groups_free(group_info); } return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/integrity.c b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c new file mode 100644 index 0000000000000..4a6d27aa6ae36 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/integrity.c @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. 
+ * + * General data integrity functions + */ +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_CRC_T10DIF) +__u16 obd_dif_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} +EXPORT_SYMBOL(obd_dif_crc_fn); + +__u16 obd_dif_ip_fn(void *data, unsigned int len) +{ + return ip_compute_csum(data, len); +} +EXPORT_SYMBOL(obd_dif_ip_fn); + +int obd_page_dif_generate_buffer(const char *obd_name, struct page *page, + __u32 offset, __u32 length, + __u16 *guard_start, int guard_number, + int *used_number, int sector_size, + obd_dif_csum_fn *fn) +{ + unsigned int i = offset; + unsigned int end = offset + length; + char *data_buf; + __u16 *guard_buf = guard_start; + unsigned int data_size; + int used = 0; + + data_buf = kmap(page) + offset; + while (i < end) { + if (used >= guard_number) { + CERROR("%s: unexpected used guard number of DIF %u/%u, " + "data length %u, sector size %u: rc = %d\n", + obd_name, used, guard_number, length, + sector_size, -E2BIG); + return -E2BIG; + } + data_size = min(round_up(i + 1, sector_size), end) - i; + *guard_buf = fn(data_buf, data_size); + guard_buf++; + data_buf += data_size; + i += data_size; + used++; + } + kunmap(page); + *used_number = used; + + return 0; +} +EXPORT_SYMBOL(obd_page_dif_generate_buffer); + +static int __obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type, + struct page *data_page, + int repeat_number) +{ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct ahash_request *req; + obd_dif_csum_fn *fn = NULL; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __u16 *guard_start; + int guard_number; + int used_number = 0; + int sector_size = 0; + __u32 cksum; + int rc = 0; + int rc2; + int used; + int i; + + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + if (!fn) + return -EINVAL; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < repeat_number; i++) { + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, data_page, 0, + PAGE_SIZE, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + } + kunmap(__page); + if (rc) + GOTO(out_final, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); +out_final: + rc2 = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + rc = rc ? 
rc : rc2; +out: + __free_page(__page); + + return rc; +} + +/** + * Array of T10PI checksum algorithm speed in MByte per second + */ +static int obd_t10_cksum_speeds[OBD_T10_CKSUM_MAX]; + +static enum obd_t10_cksum_type +obd_t10_cksum2type(enum cksum_types cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + return OBD_T10_CKSUM_IP512; + case OBD_CKSUM_T10IP4K: + return OBD_T10_CKSUM_IP4K; + case OBD_CKSUM_T10CRC512: + return OBD_T10_CKSUM_CRC512; + case OBD_CKSUM_T10CRC4K: + return OBD_T10_CKSUM_CRC4K; + default: + return OBD_T10_CKSUM_UNKNOWN; + } +} + +static const char *obd_t10_cksum_name(enum obd_t10_cksum_type index) +{ + DECLARE_CKSUM_NAME; + + /* Need to skip "crc32", "adler", "crc32c", "reserved" */ + return cksum_name[3 + index]; +} + +/** + * Compute the speed of specified T10PI checksum type + * + * Run a speed test on the given T10PI checksum on buffer using a 1MB buffer + * size. This is a reasonable buffer size for Lustre RPCs, even if the actual + * RPC size is larger or smaller. + * + * The speed is stored internally in the obd_t10_cksum_speeds[] array, and + * is available through the obd_t10_cksum_speed() function. + * + * This function needs to stay the same as cfs_crypto_performance_test() so + * that the speeds are comparable. And this function should reflect the real + * cost of the checksum calculation. + * + * \param[in] obd_name name of the OBD device + * \param[in] cksum_type checksum type (OBD_CKSUM_T10*) + */ +static void obd_t10_performance_test(const char *obd_name, + enum cksum_types cksum_type) +{ + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + const int buf_len = max(PAGE_SIZE, 1048576UL); + unsigned long bcount; + unsigned long start; + unsigned long end; + struct page *page; + int rc = 0; + void *buf; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + rc = -ENOMEM; + goto out; + } + + buf = kmap(page); + memset(buf, 0xAD, PAGE_SIZE); + kunmap(page); + + for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC / 4), + bcount = 0; time_before(jiffies, end) && rc == 0; bcount++) { + rc = __obd_t10_performance_test(obd_name, cksum_type, page, + buf_len / PAGE_SIZE); + if (rc) + break; + } + end = jiffies; + __free_page(page); +out: + if (rc) { + obd_t10_cksum_speeds[index] = rc; + CDEBUG(D_INFO, "%s: T10 checksum algorithm %s test error: " + "rc = %d\n", obd_name, obd_t10_cksum_name(index), rc); + } else { + unsigned long tmp; + + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + obd_t10_cksum_speeds[index] = (int)tmp; + CDEBUG(D_CONFIG, "%s: T10 checksum algorithm %s speed = %d " + "MB/s\n", obd_name, obd_t10_cksum_name(index), + obd_t10_cksum_speeds[index]); + } +} +#endif /* CONFIG_CRC_T10DIF */ + +int obd_t10_cksum_speed(const char *obd_name, + enum cksum_types cksum_type) +{ +#if IS_ENABLED(CONFIG_CRC_T10DIF) + enum obd_t10_cksum_type index = obd_t10_cksum2type(cksum_type); + + if (unlikely(obd_t10_cksum_speeds[index] == 0)) { + static DEFINE_MUTEX(obd_t10_cksum_speed_mutex); + + mutex_lock(&obd_t10_cksum_speed_mutex); + if (obd_t10_cksum_speeds[index] == 0) + obd_t10_performance_test(obd_name, cksum_type); + mutex_unlock(&obd_t10_cksum_speed_mutex); + } + + return obd_t10_cksum_speeds[index]; +#else /* !CONFIG_CRC_T10DIF */ + return 0; +#endif /* !CONFIG_CRC_T10DIF */ +} +EXPORT_SYMBOL(obd_t10_cksum_speed); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/jobid.c b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c new file mode 100644 index 
0000000000000..b7a08d495b2ce --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/jobid.c @@ -0,0 +1,575 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2017 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Store PID->JobID mappings + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#ifdef HAVE_UIDGID_HEADER +#include +#endif +#include + +#include +#include +#include +#include + +static struct cfs_hash *jobid_hash; +static struct cfs_hash_ops jobid_hash_ops; +spinlock_t jobid_hash_lock; + +#define RESCAN_INTERVAL 30 +#define DELETE_INTERVAL 300 + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u"; + +/** + * Structure to store a single PID->JobID mapping + */ +struct jobid_pid_map { + struct hlist_node jp_hash; + time64_t jp_time; + spinlock_t jp_lock; /* protects jp_jobid */ + char jp_jobid[LUSTRE_JOBID_SIZE]; + unsigned int jp_joblen; + atomic_t jp_refcount; + pid_t jp_pid; +}; + +/* + * Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/API. Then, the jobid must be cached. + */ +int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) +{ + int rc; + + rc = cfs_get_environ(jobid_var, jobid, jobid_len); + if (!rc) + goto out; + + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static ktime_t printed; + + if (unlikely(ktime_to_ns(printed) == 0 || + ktime_after(ktime_get(), + ktime_add_ns(printed, + 3600*24*NSEC_PER_SEC)))) { + LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", + obd_jobid_var, *jobid_len); + printed = ktime_get(); + } + + rc = 0; + } else { + CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? 
D_INFO : D_ERROR, + "jobid: get '%s' failed: rc = %d\n", + obd_jobid_var, rc); + } + +out: + return rc; +} + +/* + * jobid_should_free_item + * + * Each item is checked to see if it should be released + * Removed from hash table by caller + * Actually freed in jobid_put_locked + * + * Returns 1 if item is to be freed, 0 if it is to be kept + */ + +static int jobid_should_free_item(void *obj, void *data) +{ + char *jobid = data; + struct jobid_pid_map *pidmap = obj; + int rc = 0; + + if (obj == NULL) + return 0; + + if (jobid == NULL) { + WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); + return 1; + } + + spin_lock(&pidmap->jp_lock); + /* prevent newly inserted items from deleting */ + if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) + rc = 1; + else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) + rc = 1; + else if (strcmp(pidmap->jp_jobid, jobid) == 0) + rc = 1; + spin_unlock(&pidmap->jp_lock); + + return rc; +} + +/* + * jobid_name_is_valid + * + * Checks if the jobid is a Lustre process + * + * Returns true if jobid is valid + * Returns false if jobid looks like it's a Lustre process + */ +static bool jobid_name_is_valid(char *jobid) +{ + const char *const lustre_reserved[] = { "ll_ping", "ptlrpc", + "ldlm", "ll_sa", NULL }; + int i; + + if (jobid[0] == '\0') + return false; + + for (i = 0; lustre_reserved[i] != NULL; i++) { + if (strncmp(jobid, lustre_reserved[i], + strlen(lustre_reserved[i])) == 0) + return false; + } + return true; +} + +/* + * jobid_get_from_cache() + * + * Returns contents of jobid_var from process environment for current PID. + * This will be cached for some time to avoid overhead scanning environment. + * + * Return: -ENOMEM if allocating a new pidmap fails + * -ENOENT if no entry could be found + * +ve string length for success (something was returned in jobid) + */ +static int jobid_get_from_cache(char *jobid, size_t joblen) +{ + static time64_t last_expire; + bool expire_cache = false; + pid_t pid = current_pid(); + struct jobid_pid_map *pidmap = NULL; + time64_t now = ktime_get_real_seconds(); + int rc = 0; + ENTRY; + + LASSERT(jobid_hash != NULL); + + /* scan hash periodically to remove old PID entries from cache */ + spin_lock(&jobid_hash_lock); + if (unlikely(last_expire + DELETE_INTERVAL <= now)) { + expire_cache = true; + last_expire = now; + } + spin_unlock(&jobid_hash_lock); + + if (expire_cache) + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, + "intentionally_bad_jobid"); + + /* first try to find PID in the hash and use that value */ + pidmap = cfs_hash_lookup(jobid_hash, &pid); + if (pidmap == NULL) { + struct jobid_pid_map *pidmap2; + + OBD_ALLOC_PTR(pidmap); + if (pidmap == NULL) + GOTO(out, rc = -ENOMEM); + + pidmap->jp_pid = pid; + pidmap->jp_time = 0; + pidmap->jp_jobid[0] = '\0'; + spin_lock_init(&pidmap->jp_lock); + INIT_HLIST_NODE(&pidmap->jp_hash); + /* + * @pidmap might be reclaimed just after it is added into + * hash list, init @jp_refcount as 1 to make sure memory + * could be not freed during access. + */ + atomic_set(&pidmap->jp_refcount, 1); + + /* + * Add the newly created map to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * map. The object which exists in the hash will be returned. 
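jobid_get_from_cache() above allocates a candidate entry first and lets cfs_hash_findadd_unique() arbitrate the race: whichever thread inserts first wins, and the loser frees its allocation and uses the entry already in the hash. A user-space sketch of that allocate-then-insert-if-absent pattern; the toy table and every identifier below are invented, and hash collisions other than an exact PID match are ignored for brevity:

#include <pthread.h>
#include <stdlib.h>

struct entry { int pid; char jobid[64]; };

static struct entry *table[256];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *find_or_add(int pid)
{
        unsigned int slot = (unsigned int)pid % 256;
        struct entry *fresh = calloc(1, sizeof(*fresh));
        struct entry *winner;

        if (!fresh)
                return NULL;
        fresh->pid = pid;

        pthread_mutex_lock(&table_lock);
        if (table[slot] && table[slot]->pid == pid) {
                winner = table[slot];           /* lost the race: reuse existing */
                free(fresh);
        } else {
                table[slot] = fresh;            /* won the race: keep our entry */
                winner = fresh;
        }
        pthread_mutex_unlock(&table_lock);

        return winner;
}

int main(void)
{
        return find_or_add(42) ? 0 : 1;
}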
+ */ + pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, + &pidmap->jp_hash); + if (unlikely(pidmap != pidmap2)) { + CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n", + pid); + OBD_FREE_PTR(pidmap); + pidmap = pidmap2; + } + } + + /* + * If pidmap is old (this is always true for new entries) refresh it. + * If obd_jobid_var is not found, cache empty entry and try again + * later, to avoid repeat lookups for PID if obd_jobid_var missing. + */ + spin_lock(&pidmap->jp_lock); + if (pidmap->jp_time + RESCAN_INTERVAL <= now) { + char env_jobid[LUSTRE_JOBID_SIZE] = ""; + int env_len = sizeof(env_jobid); + + pidmap->jp_time = now; + + spin_unlock(&pidmap->jp_lock); + rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len); + + CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n", + pidmap->jp_pid, env_jobid); + spin_lock(&pidmap->jp_lock); + if (!rc) { + pidmap->jp_joblen = env_len; + strlcpy(pidmap->jp_jobid, env_jobid, + sizeof(pidmap->jp_jobid)); + rc = 0; + } else if (rc == -ENOENT) { + /* It might have been deleted, clear out old entry */ + pidmap->jp_joblen = 0; + pidmap->jp_jobid[0] = '\0'; + } + } + + /* + * Regardless of how pidmap was found, if it contains a valid entry + * use that for now. If there was a technical error (e.g. -ENOMEM) + * use the old cached value until it can be looked up again properly. + * If a cached missing entry was found, return -ENOENT. + */ + if (pidmap->jp_joblen) { + strlcpy(jobid, pidmap->jp_jobid, joblen); + joblen = pidmap->jp_joblen; + rc = 0; + } else if (!rc) { + rc = -ENOENT; + } + spin_unlock(&pidmap->jp_lock); + + cfs_hash_put(jobid_hash, &pidmap->jp_hash); + + EXIT; +out: + return rc < 0 ? rc : joblen; +} + +/* + * jobid_interpret_string() + * + * Interpret the jobfmt string to expand specified fields, like coredumps do: + * %e = executable + * %g = gid + * %h = hostname + * %j = jobid from environment + * %p = pid + * %u = uid + * + * Unknown escape strings are dropped. Other characters are copied through, + * excluding whitespace (to avoid making jobid parsing difficult). + * + * Return: -EOVERFLOW if the expanded string does not fit within @joblen + * 0 for success + */ +static int jobid_interpret_string(const char *jobfmt, char *jobid, + ssize_t joblen) +{ + char c; + + while ((c = *jobfmt++) && joblen > 1) { + char f; + int l; + + if (isspace(c)) /* Don't allow embedded spaces */ + continue; + + if (c != '%') { + *jobid = c; + joblen--; + jobid++; + continue; + } + + switch ((f = *jobfmt++)) { + case 'e': /* executable name */ + l = snprintf(jobid, joblen, "%s", current_comm()); + break; + case 'g': /* group ID */ + l = snprintf(jobid, joblen, "%u", + from_kgid(&init_user_ns, current_fsgid())); + break; + case 'h': /* hostname */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + break; + case 'j': /* jobid stored in process environment */ + l = jobid_get_from_cache(jobid, joblen); + if (l < 0) + l = 0; + break; + case 'p': /* process ID */ + l = snprintf(jobid, joblen, "%u", current_pid()); + break; + case 'u': /* user ID */ + l = snprintf(jobid, joblen, "%u", + from_kuid(&init_user_ns, current_fsuid())); + break; + case '\0': /* '%' at end of format string */ + l = 0; + goto out; + default: /* drop unknown %x format strings */ + l = 0; + break; + } + jobid += l; + joblen -= l; + } + /* + * This points at the end of the buffer, so long as jobid is always + * incremented the same amount as joblen is decremented. + */ +out: + jobid[joblen - 1] = '\0'; + + return joblen < 0 ? 
-EOVERFLOW : 0; +} + +/* + * Hash initialization, copied from server-side job stats bucket sizes + */ +#define HASH_JOBID_BKT_BITS 5 +#define HASH_JOBID_CUR_BITS 7 +#define HASH_JOBID_MAX_BITS 12 + +int jobid_cache_init(void) +{ + int rc = 0; + ENTRY; + + if (jobid_hash) + return 0; + + spin_lock_init(&jobid_hash_lock); + jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS, + HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, + 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &jobid_hash_ops, CFS_HASH_DEFAULT); + if (!jobid_hash) + rc = -ENOMEM; + + RETURN(rc); +} +EXPORT_SYMBOL(jobid_cache_init); + +void jobid_cache_fini(void) +{ + struct cfs_hash *tmp_hash; + ENTRY; + + spin_lock(&jobid_hash_lock); + tmp_hash = jobid_hash; + jobid_hash = NULL; + spin_unlock(&jobid_hash_lock); + + if (tmp_hash != NULL) { + cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); + cfs_hash_putref(tmp_hash); + } + + EXIT; +} +EXPORT_SYMBOL(jobid_cache_fini); + +/* + * Hash operations for pid<->jobid + */ +static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); +} + +static void *jobid_key(struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + return &pidmap->jp_pid; +} + +static int jobid_keycmp(const void *key, struct hlist_node *hnode) +{ + const pid_t *pid_key1; + const pid_t *pid_key2; + + LASSERT(key != NULL); + pid_key1 = (pid_t *)key; + pid_key2 = (pid_t *)jobid_key(hnode); + + return *pid_key1 == *pid_key2; +} + +static void *jobid_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct jobid_pid_map, jp_hash); +} + +static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + + atomic_inc(&pidmap->jp_refcount); +} + +static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_pid_map *pidmap; + + if (hnode == NULL) + return; + + pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash); + LASSERT(atomic_read(&pidmap->jp_refcount) > 0); + if (atomic_dec_and_test(&pidmap->jp_refcount)) { + CDEBUG(D_INFO, "Freeing: %d->%s\n", + pidmap->jp_pid, pidmap->jp_jobid); + + OBD_FREE_PTR(pidmap); + } +} + +static struct cfs_hash_ops jobid_hash_ops = { + .hs_hash = jobid_hashfn, + .hs_keycmp = jobid_keycmp, + .hs_key = jobid_key, + .hs_object = jobid_object, + .hs_get = jobid_get, + .hs_put = jobid_put_locked, + .hs_put_locked = jobid_put_locked, +}; + +/** + * Generate the job identifier string for this process for tracking purposes. + * + * Fill in @jobid string based on the value of obd_jobid_var: + * JOBSTATS_DISABLE: none + * JOBSTATS_NODELOCAL: content of obd_jobid_node (jobid_interpret_string()) + * JOBSTATS_PROCNAME_UID: process name/UID + * anything else: look up obd_jobid_var in the processes environment + * + * Return -ve error number, 0 on success. 
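jobid_interpret_string() above does coredump-style expansion, so with obd_jobid_name left at its default "%e.%u" a process named "myapp" running as UID 1000 ends up with the jobid "myapp.1000". A user-space sketch of the same expansion for a subset of the specifiers (%e, %p, %u only); all names here are invented and the kernel function above remains the authoritative version:

#include <ctype.h>
#include <stdio.h>
#include <unistd.h>

static int expand_jobid(const char *fmt, const char *comm,
                        char *out, size_t outlen)
{
        size_t used = 0;
        char c;

        while ((c = *fmt++) != '\0' && used + 1 < outlen) {
                int n = 0;

                if (isspace((unsigned char)c))          /* drop embedded spaces */
                        continue;
                if (c != '%') {
                        out[used++] = c;
                        continue;
                }
                switch (*fmt++) {
                case 'e':                               /* "executable" name */
                        n = snprintf(out + used, outlen - used, "%s", comm);
                        break;
                case 'p':                               /* process id */
                        n = snprintf(out + used, outlen - used, "%d",
                                     (int)getpid());
                        break;
                case 'u':                               /* user id */
                        n = snprintf(out + used, outlen - used, "%u",
                                     (unsigned int)getuid());
                        break;
                case '\0':                              /* '%' at end of format */
                        out[used] = '\0';
                        return 0;
                default:                                /* drop unknown %x */
                        break;
                }
                if (n > 0)
                        used += (size_t)n < outlen - used ?
                                (size_t)n : outlen - used - 1;
        }
        out[used] = '\0';
        return 0;
}

int main(void)
{
        char jobid[64];

        expand_jobid("%e.%u", "myapp", jobid, sizeof(jobid));
        printf("%s\n", jobid);                          /* e.g. "myapp.1000" */
        return 0;
}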
+ */ +int lustre_get_jobid(char *jobid, size_t joblen) +{ + int rc = 0; + ENTRY; + + if (unlikely(joblen < 2)) { + if (joblen == 1) + jobid[0] = '\0'; + RETURN(-EINVAL); + } + + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) { + /* Jobstats isn't enabled */ + memset(jobid, 0, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { + /* Whole node dedicated to single job */ + rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); + } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + rc = jobid_interpret_string("%e.%u", jobid, joblen); + } else if (jobid_name_is_valid(current_comm())) { + /* + * obd_jobid_var holds the jobid environment variable name. + * Skip initial check if obd_jobid_name already uses "%j", + * otherwise try just "%j" first, then fall back to whatever + * is in obd_jobid_name if obd_jobid_var is not found. + */ + rc = -EAGAIN; + if (!strnstr(obd_jobid_name, "%j", joblen)) + rc = jobid_get_from_cache(jobid, joblen); + + /* fall back to jobid_node if jobid_var not in environment */ + if (rc < 0) { + int rc2 = jobid_interpret_string(obd_jobid_name, + jobid, joblen); + if (!rc2) + rc = 0; + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +/* + * lustre_jobid_clear + * + * Search cache for JobID given by @find_jobid. + * If any entries in the hash table match the value, they are removed + */ +void lustre_jobid_clear(const char *find_jobid) +{ + char jobid[LUSTRE_JOBID_SIZE]; + char *end; + + if (jobid_hash == NULL) + return; + + strlcpy(jobid, find_jobid, sizeof(jobid)); + /* trim \n off the end of the incoming jobid */ + end = strchr(jobid, '\n'); + if (end && *end == '\n') + *end = '\0'; + + CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); + + CDEBUG(D_INFO, "%d items remain in jobID table\n", + atomic_read(&jobid_hash->hs_count)); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c index 79d176dcd3d53..7afb9484a8a69 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/kernelcomm.c @@ -23,7 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,8 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#define D_KUC D_OTHER + +#include #include #include @@ -73,7 +74,7 @@ int libcfs_kkuc_msg_put(struct file *filp, void *payload) if (rc < 0) CWARN("message send failed (%d)\n", rc); else - CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); + CDEBUG(D_HSM, "Sent message rc=%d, fp=%p\n", rc, filp); return rc; } @@ -142,7 +143,7 @@ int libcfs_kkuc_group_add(struct file *filp, const struct obd_uuid *uuid, list_add(®->kr_chain, &kkuc_groups[group]); up_write(&kg_sem); - CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + CDEBUG(D_HSM, "Added uid=%d fp=%p to group %d\n", uid, filp, group); return 0; } @@ -174,7 +175,7 @@ int libcfs_kkuc_group_rem(const struct obd_uuid *uuid, int uid, int group) if (obd_uuid_equals(uuid, ®->kr_uuid) && (uid == 0 || uid == reg->kr_uid)) { list_del(®->kr_chain); - CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", + CDEBUG(D_HSM, "Removed uid=%d fp=%p from group %d\n", reg->kr_uid, reg->kr_fp, group); if (reg->kr_fp != NULL) fput(reg->kr_fp); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c index a1bcc3d7de608..cf17a50999f8d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/linkea.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/linkea.c @@ -21,13 +21,12 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * Use is subject to license terms. * * Author: Di Wang */ -#include #include #include @@ -144,10 +143,11 @@ int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, reclen = lname->ln_namelen + sizeof(struct link_ea_entry); if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { /* Use 32-bits to save the overflow time, although it will - * shrink the cfs_time_current_sec() returned 64-bits value + * shrink the ktime_get_real_seconds() returned 64-bits value * to 32-bits value, it is still quite large and can be used - * for about 140 years. That is enough. */ - leh->leh_overflow_time = cfs_time_current_sec(); + * for about 140 years. That is enough. + */ + leh->leh_overflow_time = ktime_get_real_seconds(); if (unlikely(leh->leh_overflow_time == 0)) leh->leh_overflow_time++; @@ -236,7 +236,7 @@ int linkea_overflow_shrink(struct linkea_data *ldata) if (unlikely(leh->leh_reccount == 0)) return 0; - leh->leh_overflow_time = cfs_time_current_sec(); + leh->leh_overflow_time = ktime_get_real_seconds(); if (unlikely(leh->leh_overflow_time == 0)) leh->leh_overflow_time++; ldata->ld_reclen = 0; diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c deleted file mode 100644 index dabbf58057caf..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-module.c +++ /dev/null @@ -1,582 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
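The linkea change above swaps cfs_time_current_sec() for ktime_get_real_seconds() but keeps the truncation to 32 bits, and the "about 140 years" note in the comment still holds: an unsigned 32-bit seconds counter wraps only after 2^32 s. A quick check of the figure:

#include <stdio.h>

int main(void)
{
        /* 2^32 seconds expressed in 365.25-day years */
        printf("%.1f\n", 4294967296.0 / (365.25 * 24 * 3600));  /* ~136.1 */
        return 0;
}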
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2016, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/linux/linux-module.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) -{ - if (data->ioc_len > BIT(30)) { - CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen1 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen2 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen3 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen4 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { - CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { - CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { - CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { - CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { - CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { - CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); - return 1; - } - - if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { - CERROR("OBD ioctl: plen1 set but NULL pointer\n"); - return 1; - } - - if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { - CERROR("OBD ioctl: plen2 set but NULL pointer\n"); - return 1; - } - - if (obd_ioctl_packlen(data) > data->ioc_len) { - CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", - obd_ioctl_packlen(data), data->ioc_len); - return 1; - } - - return 0; -} - -/* buffer MUST be at least the size of obd_ioctl_hdr */ -int obd_ioctl_getdata(char **buf, int *len, void __user *arg) -{ - struct obd_ioctl_hdr hdr; - struct obd_ioctl_data *data; - int offset = 0; - ENTRY; - - if (copy_from_user(&hdr, arg, sizeof(hdr))) - RETURN(-EFAULT); - - if (hdr.ioc_version != OBD_IOCTL_VERSION) { - CERROR("Version mismatch kernel (%x) vs application (%x)\n", - OBD_IOCTL_VERSION, hdr.ioc_version); - RETURN(-EINVAL); - } - - if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { - CERROR("User buffer len %d exceeds %d max buffer\n", - hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); - 
RETURN(-EINVAL); - } - - if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { - CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); - RETURN(-EINVAL); - } - - /* When there are lots of processes calling vmalloc on multi-core - * system, the high lock contention will hurt performance badly, - * obdfilter-survey is an example, which relies on ioctl. So we'd - * better avoid vmalloc on ioctl path. LU-66 */ - OBD_ALLOC_LARGE(*buf, hdr.ioc_len); - if (*buf == NULL) { - CERROR("Cannot allocate control buffer of len %d\n", - hdr.ioc_len); - RETURN(-EINVAL); - } - *len = hdr.ioc_len; - data = (struct obd_ioctl_data *)*buf; - - if (copy_from_user(*buf, arg, hdr.ioc_len)) { - OBD_FREE_LARGE(*buf, hdr.ioc_len); - RETURN(-EFAULT); - } - - if (obd_ioctl_is_invalid(data)) { - CERROR("ioctl not correctly formatted\n"); - OBD_FREE_LARGE(*buf, hdr.ioc_len); - RETURN(-EINVAL); - } - - if (data->ioc_inllen1) { - data->ioc_inlbuf1 = &data->ioc_bulk[0]; - offset += cfs_size_round(data->ioc_inllen1); - } - - if (data->ioc_inllen2) { - data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen2); - } - - if (data->ioc_inllen3) { - data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen3); - } - - if (data->ioc_inllen4) - data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; - - RETURN(0); -} -EXPORT_SYMBOL(obd_ioctl_getdata); - -/* opening /dev/obd */ -static int obd_class_open(struct inode * inode, struct file * file) -{ - ENTRY; - - try_module_get(THIS_MODULE); - RETURN(0); -} - -/* closing /dev/obd */ -static int obd_class_release(struct inode * inode, struct file * file) -{ - ENTRY; - - module_put(THIS_MODULE); - RETURN(0); -} - -/* to control /dev/obd */ -static long obd_class_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - int err = 0; - ENTRY; - - /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ - if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) - RETURN(err = -EACCES); - if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ - RETURN(err = -ENOTTY); - - err = class_handle_ioctl(cmd, (unsigned long)arg); - - RETURN(err); -} - -/* declare character device */ -static struct file_operations obd_psdev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ - .open = obd_class_open, /* open */ - .release = obd_class_release, /* release */ -}; - -/* modules setup */ -struct miscdevice obd_psdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = OBD_DEV_NAME, - .fops = &obd_psdev_fops, -}; - -static ssize_t version_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); -} - -static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ -#ifdef ENABLE_PINGER - const char *state = "on"; -#else - const char *state = "off"; -#endif - return sprintf(buf, "%s\n", state); -} - -/** - * Check all obd devices health - * - * \param kobj - * \param buf [in] - * - * \retval number of characters printed if healthy - */ -static ssize_t -health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - bool healthy = true; - size_t len = 0; - int i; - - if (libcfs_catastrophe) { - len = sprintf(buf, "LBUG\n"); - healthy = false; - } - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd; - - obd = class_num2obd(i); - if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) - continue; - - 
LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - continue; - - class_incref(obd, __FUNCTION__, current); - read_unlock(&obd_dev_lock); - - if (obd_health_check(NULL, obd)) { - len = sprintf(buf, "device %s reported unhealthy\n", - obd->obd_name); - healthy = false; - } - class_decref(obd, __FUNCTION__, current); - read_lock(&obd_dev_lock); - } - read_unlock(&obd_dev_lock); - - if (healthy) - len = sprintf(buf, "healthy\n"); - else - len = sprintf(buf, "NOT HEALTHY\n"); - - return len; -} - -static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - int rc = 0; - - if (strlen(obd_jobid_var)) - rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); - return rc; -} - -static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) - return -EINVAL; - - memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); - - memcpy(obd_jobid_var, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_var[count - 1] == '\n') - obd_jobid_var[count - 1] = 0; - - return count; -} - -static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - int rc = 0; - - if (strlen(obd_jobid_node)) - rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_node); - return rc; -} - -static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - if (!count || count > LUSTRE_JOBID_SIZE) - return -EINVAL; - - /* clear previous value */ - memset(obd_jobid_node, 0, LUSTRE_JOBID_SIZE); - - memcpy(obd_jobid_node, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_node[count - 1] == '\n') { - /* Don't echo just a newline */ - if (count == 1) - return -EINVAL; - obd_jobid_node[count - 1] = 0; - } - - return count; -} - -/* Root for /sys/kernel/debug/lustre */ -struct dentry *debugfs_lustre_root; -EXPORT_SYMBOL_GPL(debugfs_lustre_root); - -#ifdef CONFIG_PROC_FS -/* Root for /proc/fs/lustre */ -struct proc_dir_entry *proc_lustre_root = NULL; -EXPORT_SYMBOL(proc_lustre_root); -#else -#define lprocfs_base NULL -#endif /* CONFIG_PROC_FS */ - -LUSTRE_RO_ATTR(version); -LUSTRE_RO_ATTR(pinger); -LUSTRE_RO_ATTR(health_check); -LUSTRE_RW_ATTR(jobid_var); -LUSTRE_RW_ATTR(jobid_name); - -static struct attribute *lustre_attrs[] = { - &lustre_attr_version.attr, - &lustre_attr_pinger.attr, - &lustre_attr_health_check.attr, - &lustre_attr_jobid_name.attr, - &lustre_attr_jobid_var.attr, - NULL, -}; - -static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) -{ - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static void obd_device_list_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - ++*pos; - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static int obd_device_list_seq_show(struct seq_file *p, void *v) -{ - loff_t index = *(loff_t *)v; - struct obd_device *obd = class_num2obd((int)index); - char *status; - - if (obd == NULL) - return 0; - - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - status = "ST"; - else if (obd->obd_inactive) - status = "IN"; - else if (obd->obd_set_up) - status = "UP"; - else if (obd->obd_attached) - status = "AT"; - else - status = "--"; - - seq_printf(p, "%3d %s %s %s %s %d\n", - (int)index, status, obd->obd_type->typ_name, - obd->obd_name, obd->obd_uuid.uuid, - 
atomic_read(&obd->obd_refcount)); - return 0; -} - -static const struct seq_operations obd_device_list_sops = { - .start = obd_device_list_seq_start, - .stop = obd_device_list_seq_stop, - .next = obd_device_list_seq_next, - .show = obd_device_list_seq_show, -}; - -static int obd_device_list_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int rc = seq_open(file, &obd_device_list_sops); - - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - return 0; -} - -static const struct file_operations obd_device_list_fops = { - .owner = THIS_MODULE, - .open = obd_device_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -struct kobject *lustre_kobj; -EXPORT_SYMBOL_GPL(lustre_kobj); - -static struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -int class_procfs_init(void) -{ - struct proc_dir_entry *entry; - struct dentry *file; - int rc = -ENOMEM; - ENTRY; - - lustre_kobj = kobject_create_and_add("lustre", fs_kobj); - if (lustre_kobj == NULL) - goto out; - - /* Create the files associated with this kobject */ - rc = sysfs_create_group(lustre_kobj, &lustre_attr_group); - if (rc) { - kobject_put(lustre_kobj); - goto out; - } - - rc = obd_sysctl_init(); - if (rc) { - kobject_put(lustre_kobj); - goto out; - } - - debugfs_lustre_root = debugfs_create_dir("lustre", NULL); - if (IS_ERR_OR_NULL(debugfs_lustre_root)) { - rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) - : -ENOMEM; - debugfs_lustre_root = NULL; - kobject_put(lustre_kobj); - goto out; - } - - file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, - &obd_device_list_fops); - if (IS_ERR_OR_NULL(file)) { - rc = file ? PTR_ERR(file) : -ENOMEM; - kobject_put(lustre_kobj); - goto out; - } - - entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); - if (IS_ERR(entry)) { - rc = PTR_ERR(entry); - CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); - kobject_put(lustre_kobj); - goto out; - } - - proc_lustre_root = entry; -out: - RETURN(rc); -} - -int class_procfs_clean(void) -{ - ENTRY; - - debugfs_remove_recursive(debugfs_lustre_root); - - debugfs_lustre_root = NULL; - - if (proc_lustre_root) - lprocfs_remove(&proc_lustre_root); - - kobject_put(lustre_kobj); - - RETURN(0); -} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c deleted file mode 100644 index 5f8e2b55d7258..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-obdo.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. 
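The removed linux-module.c wired a seq_file iterator into debugfs via class_procfs_init(). For reference, a minimal module using the same start/next/stop/show contract and debugfs registration; every identifier below is invented and this is only a sketch of the pattern, not Lustre code:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static const char *items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        return *pos < (loff_t)ARRAY_SIZE(items) ? pos : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        ++*pos;
        return *pos < (loff_t)ARRAY_SIZE(items) ? pos : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
        loff_t i = *(loff_t *)v;

        seq_printf(m, "%lld %s\n", i, items[i]);
        return 0;
}

static const struct seq_operations demo_sops = {
        .start = demo_start,
        .next  = demo_next,
        .stop  = demo_stop,
        .show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &demo_sops);
}

static const struct file_operations demo_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static struct dentry *demo_dir;

static int __init demo_init(void)
{
        demo_dir = debugfs_create_dir("seqfile_demo", NULL);
        debugfs_create_file("items", 0444, demo_dir, NULL, &demo_fops);
        return 0;
}

static void __exit demo_exit(void)
{
        debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");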
- * - * Copyright (c) 2012, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/linux/linux-obdo.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include /* for PAGE_SIZE */ -#include -#include - -/*FIXME: Just copy from obdo_from_inode*/ -void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) -{ - u64 newvalid = 0; - - if (valid & LA_ATIME) { - dst->o_atime = la->la_atime; - newvalid |= OBD_MD_FLATIME; - } - if (valid & LA_MTIME) { - dst->o_mtime = la->la_mtime; - newvalid |= OBD_MD_FLMTIME; - } - if (valid & LA_CTIME) { - dst->o_ctime = la->la_ctime; - newvalid |= OBD_MD_FLCTIME; - } - if (valid & LA_SIZE) { - dst->o_size = la->la_size; - newvalid |= OBD_MD_FLSIZE; - } - if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = la->la_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & LA_TYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (la->la_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & LA_MODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (la->la_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } - if (valid & LA_UID) { - dst->o_uid = la->la_uid; - newvalid |= OBD_MD_FLUID; - } - if (valid & LA_GID) { - dst->o_gid = la->la_gid; - newvalid |= OBD_MD_FLGID; - } - if (valid & LA_PROJID) { - dst->o_projid = la->la_projid; - newvalid |= OBD_MD_FLPROJID; - } - if (valid & LA_FLAGS) { - dst->o_flags = la->la_flags; - newvalid |= OBD_MD_FLFLAGS; - } - dst->o_valid |= newvalid; -} -EXPORT_SYMBOL(obdo_from_la); - -/*FIXME: Just copy from obdo_from_inode*/ -void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) -{ - u64 newvalid = 0; - - valid &= obdo->o_valid; - - if (valid & OBD_MD_FLATIME) { - dst->la_atime = obdo->o_atime; - newvalid |= LA_ATIME; - } - if (valid & OBD_MD_FLMTIME) { - dst->la_mtime = obdo->o_mtime; - newvalid |= LA_MTIME; - } - if (valid & OBD_MD_FLCTIME) { - dst->la_ctime = obdo->o_ctime; - newvalid |= LA_CTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->la_size = obdo->o_size; - newvalid |= LA_SIZE; - } - if (valid & OBD_MD_FLBLOCKS) { - dst->la_blocks = obdo->o_blocks; - newvalid |= LA_BLOCKS; - } - if (valid & OBD_MD_FLTYPE) { - dst->la_mode = (dst->la_mode & S_IALLUGO) | - (obdo->o_mode & S_IFMT); - newvalid |= LA_TYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->la_mode = (dst->la_mode & S_IFMT) | - (obdo->o_mode & S_IALLUGO); - newvalid |= LA_MODE; - } - if (valid & OBD_MD_FLUID) { - dst->la_uid = obdo->o_uid; - newvalid |= LA_UID; - } - if (valid & OBD_MD_FLGID) { - dst->la_gid = obdo->o_gid; - newvalid |= LA_GID; - } - if (valid & OBD_MD_FLPROJID) { - dst->la_projid = obdo->o_projid; - newvalid |= LA_PROJID; - } - if (valid & OBD_MD_FLFLAGS) { - dst->la_flags = obdo->o_flags; - newvalid |= LA_FLAGS; - } - dst->la_valid = newvalid; -} -EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c deleted file mode 100644 index e8016c77c7506..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/linux/linux-sysctl.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -struct static_lustre_uintvalue_attr { - struct { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct attribute *attr, - char *buf); - ssize_t (*store)(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len); - } u; - int *value; -}; - -static ssize_t static_uintvalue_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - - return sprintf(buf, "%d\n", *lattr->value); -} - -static ssize_t static_uintvalue_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - unsigned int val; - int rc; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - *lattr->value = val; - - return count; -} - -#define LUSTRE_STATIC_UINT_ATTR(name, value) \ -static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ - {__ATTR(name, 0644, \ - static_uintvalue_show, \ - static_uintvalue_store),\ - value } - -LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); - -static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", - obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); -} - -static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - - if (val > ((cfs_totalram_pages() / 10) * 9)) { - /* Somebody wants to assign too much memory to dirty pages */ - return -EINVAL; - } - - if (val < 4 << (20 - PAGE_SHIFT)) { - /* Less than 4 Mb for dirty cache is also bad */ - return -EINVAL; - } - - obd_max_dirty_pages = val; - - return count; -} -LUSTRE_RW_ATTR(max_dirty_mb); - -LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); -LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); -LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); -LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); -LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); -LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); - -#ifdef HAVE_SERVER_SUPPORT 
-LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); -LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); -#endif - -static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", obd_memory_sum()); -} -LUSTRE_RO_ATTR(memused); - -static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", obd_memory_max()); -} -LUSTRE_RO_ATTR(memused_max); - -static struct attribute *lustre_attrs[] = { - &lustre_sattr_timeout.u.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_sattr_debug_peer_on_timeout.u.attr, - &lustre_sattr_dump_on_timeout.u.attr, - &lustre_sattr_dump_on_eviction.u.attr, - &lustre_sattr_at_min.u.attr, - &lustre_sattr_at_max.u.attr, - &lustre_sattr_at_extra.u.attr, - &lustre_sattr_at_early_margin.u.attr, - &lustre_sattr_at_history.u.attr, - &lustre_attr_memused_max.attr, - &lustre_attr_memused.attr, -#ifdef HAVE_SERVER_SUPPORT - &lustre_sattr_ldlm_timeout.u.attr, - &lustre_sattr_bulk_timeout.u.attr, -#endif - NULL, -}; - -static struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -int obd_sysctl_init(void) -{ - return sysfs_create_group(lustre_kobj, &lustre_attr_group); -} - -void obd_sysctl_clean(void) -{ - sysfs_remove_group(lustre_kobj, &lustre_attr_group); -} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog.c b/drivers/staging/lustrefsx/lustre/obdclass/llog.c index 61c9a1d1f4e8a..e9228b33339f3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,8 +47,10 @@ #include #include #include +#include #include #include "llog_internal.h" + /* * Allocate a new log or catalog handle * Used inside llog_open(). 
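The removed linux-sysctl.c exposed its tunables through sysfs show/store pairs that parse with kstrtouint()/kstrtoul() and reject out-of-range values, as max_dirty_mb_store() does. A minimal sketch of that attribute pattern using a plain kobj_attribute; the names, bounds and sysfs location below are invented:

#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/fs.h>           /* fs_kobj */

static unsigned int demo_limit = 32;

static ssize_t demo_limit_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%u\n", demo_limit);
}

static ssize_t demo_limit_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t count)
{
        unsigned int val;
        int rc;

        rc = kstrtouint(buf, 10, &val);
        if (rc)
                return rc;
        if (val < 4 || val > 1024)      /* reject nonsense values */
                return -EINVAL;

        demo_limit = val;
        return count;
}

static struct kobj_attribute demo_limit_attr =
        __ATTR(demo_limit, 0644, demo_limit_show, demo_limit_store);

static struct kobject *demo_kobj;

static int __init demo_sysfs_init(void)
{
        demo_kobj = kobject_create_and_add("lustre_demo", fs_kobj);
        if (!demo_kobj)
                return -ENOMEM;
        return sysfs_create_file(demo_kobj, &demo_limit_attr.attr);
}

static void __exit demo_sysfs_exit(void)
{
        sysfs_remove_file(demo_kobj, &demo_limit_attr.attr);
        kobject_put(demo_kobj);
}

module_init(demo_sysfs_init);
module_exit(demo_sysfs_exit);
MODULE_LICENSE("GPL");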
@@ -63,6 +65,7 @@ static struct llog_handle *llog_alloc_handle(void) init_rwsem(&loghandle->lgh_lock); mutex_init(&loghandle->lgh_hdr_mutex); + init_rwsem(&loghandle->lgh_last_sem); INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); atomic_set(&loghandle->lgh_refcount, 1); @@ -89,16 +92,30 @@ static void llog_free_handle(struct llog_handle *loghandle) OBD_FREE_PTR(loghandle); } -void llog_handle_get(struct llog_handle *loghandle) +struct llog_handle *llog_handle_get(struct llog_handle *loghandle) { - atomic_inc(&loghandle->lgh_refcount); + if (atomic_inc_not_zero(&loghandle->lgh_refcount)) + return loghandle; + return NULL; } -void llog_handle_put(struct llog_handle *loghandle) +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle) { - LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); - if (atomic_dec_and_test(&loghandle->lgh_refcount)) + int rc = 0; + + if (atomic_dec_and_test(&loghandle->lgh_refcount)) { + struct llog_operations *lop; + + rc = llog_handle2ops(loghandle, &lop); + if (!rc) { + if (lop->lop_close) + rc = lop->lop_close(env, loghandle); + else + rc = -EOPNOTSUPP; + } llog_free_handle(loghandle); + } + return rc; } static int llog_declare_destroy(const struct lu_env *env, @@ -135,7 +152,7 @@ int llog_trans_destroy(const struct lu_env *env, struct llog_handle *handle, RETURN(-EOPNOTSUPP); LASSERT(handle->lgh_obj != NULL); - if (!dt_object_exists(handle->lgh_obj)) + if (!llog_exist(handle)) RETURN(0); rc = lop->lop_destroy(env, handle, th); @@ -164,11 +181,14 @@ int llog_destroy(const struct lu_env *env, struct llog_handle *handle) RETURN(rc); } - if (!dt_object_exists(handle->lgh_obj)) + if (!llog_exist(handle)) RETURN(0); dt = lu2dt_dev(handle->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -196,14 +216,21 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, { struct llog_thread_info *lgi = llog_info(env); struct dt_device *dt; - struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_log_hdr *llh; struct thandle *th; + __u32 tmp_lgc_index; int rc; int rc1; bool subtract_count = false; ENTRY; + LASSERT(loghandle != NULL); + LASSERT(loghandle->lgh_ctxt != NULL); + LASSERT(loghandle->lgh_obj != NULL); + + llh = loghandle->lgh_hdr; + CDEBUG(D_RPCTRACE, "Canceling %d in log "DFID"\n", index, PFID(&loghandle->lgh_id.lgl_oi.oi_fid)); @@ -212,12 +239,11 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, RETURN(-EINVAL); } - LASSERT(loghandle != NULL); - LASSERT(loghandle->lgh_ctxt != NULL); - LASSERT(loghandle->lgh_obj != NULL); - dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(0); + th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -247,12 +273,19 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, loghandle->lgh_hdr->llh_count--; subtract_count = true; + + /* Since llog_process_thread use lgi_cookie, it`s better to save them + * and restore after using + */ + tmp_lgc_index = lgi->lgi_cookie.lgc_index; /* Pass this index to llog_osd_write_rec(), which will use the index * to only update the necesary bitmap. 
*/ lgi->lgi_cookie.lgc_index = index; /* update header */ rc = llog_write_rec(env, loghandle, &llh->llh_hdr, &lgi->lgi_cookie, LLOG_HEADER_IDX, th); + lgi->lgi_cookie.lgc_index = tmp_lgc_index; + if (rc != 0) GOTO(out_unlock, rc); @@ -271,7 +304,7 @@ int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, * be accessed anymore, let's return 0 for now, and * the orphan will be handled by LFSCK. */ CERROR("%s: can't destroy empty llog "DFID": rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), PFID(&loghandle->lgh_id.lgl_oi.oi_fid), rc); GOTO(out_unlock, rc = 0); } @@ -366,7 +399,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, (llh->llh_flags & LLOG_F_IS_CAT && flags & LLOG_F_IS_PLAIN))) { CERROR("%s: llog type is %s but initializing %s\n", - handle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(handle), llh->llh_flags & LLOG_F_IS_CAT ? "catalog" : "plain", flags & LLOG_F_IS_CAT ? "catalog" : "plain"); @@ -386,7 +419,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, if (unlikely(uuid && !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { CERROR("%s: llog uuid mismatch: %s/%s\n", - handle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(handle), (char *)uuid->uuid, (char *)llh->llh_tgtuuid.uuid); GOTO(out, rc = -EEXIST); @@ -399,8 +432,8 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, llh->llh_flags |= LLOG_F_IS_FIXSIZE; } else if (!(flags & LLOG_F_IS_PLAIN)) { CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", - handle->lgh_ctxt->loc_obd->obd_name, - flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + loghandle2name(handle), flags, LLOG_F_IS_CAT, + LLOG_F_IS_PLAIN); rc = -EINVAL; } llh->llh_flags |= fmt; @@ -413,12 +446,37 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, } EXPORT_SYMBOL(llog_init_handle); +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec) +{ + int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len; + + if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { + CERROR("%s: record is too large: %d > %d\n", + loghandle2name(llh), rec->lrh_len, chunk_size); + return -EINVAL; + } + if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) { + CERROR("%s: index is too high: %d\n", + loghandle2name(llh), rec->lrh_index); + return -EINVAL; + } + if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) { + CERROR("%s: magic %x is bad\n", + loghandle2name(llh), rec->lrh_type); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(llog_verify_record); + static int llog_process_thread(void *arg) { struct llog_process_info *lpi = arg; struct llog_handle *loghandle = lpi->lpi_loghandle; struct llog_log_hdr *llh = loghandle->lgh_hdr; struct llog_process_cat_data *cd = lpi->lpi_catdata; + struct llog_thread_info *lti; char *buf; size_t chunk_size; __u64 cur_offset; @@ -426,12 +484,15 @@ static int llog_process_thread(void *arg) int saved_index = 0; int last_called_index = 0; bool repeated = false; + bool refresh_idx = false; ENTRY; if (llh == NULL) RETURN(-EINVAL); + lti = lpi->lpi_env == NULL ? 
NULL : llog_info(lpi->lpi_env); + cur_offset = chunk_size = llh->llh_hdr.lrh_len; /* expect chunk_size to be power of two */ LASSERT(is_power_of_2(chunk_size)); @@ -457,6 +518,7 @@ static int llog_process_thread(void *arg) unsigned int buf_offset = 0; bool partial_chunk; int lh_last_idx; + int synced_idx = 0; /* skip records not set in bitmap */ while (index <= last_index && @@ -474,7 +536,8 @@ static int llog_process_thread(void *arg) /* get the buf with our target record; avoid old garbage */ memset(buf, 0, chunk_size); /* the record index for outdated chunk data */ - lh_last_idx = loghandle->lgh_last_idx + 1; + /* it is safe to process buffer until saved lgh_last_idx */ + lh_last_idx = LLOG_HDR_TAIL(llh)->lrt_index; rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, index, &cur_offset, buf, chunk_size); if (repeated && rc) @@ -518,60 +581,72 @@ static int llog_process_thread(void *arg) CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", rec->lrh_type, rec->lrh_index); + if (index == (synced_idx + 1) && + synced_idx == LLOG_HDR_TAIL(llh)->lrt_index) + GOTO(out, rc = 0); + + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int) + (loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + } + /* the bitmap could be changed during processing * records from the chunk. For wrapped catalog * it means we can read deleted record and try to - * process it. Check this case and reread the chunk. */ - - /* for partial chunk the end of it is zeroed, check - * for index 0 to distinguish it. */ - if ((partial_chunk && rec->lrh_index == 0) || - (index == lh_last_idx && - lh_last_idx != (loghandle->lgh_last_idx + 1))) { - /* concurrent llog_add() might add new records - * while llog_processing, check this is not - * the case and re-read the current chunk - * otherwise. */ - int records; - /* lgh_last_idx could be less then index - * for catalog, if catalog is wrapped */ - if ((index > loghandle->lgh_last_idx && - !(loghandle->lgh_hdr->llh_flags & - LLOG_F_IS_CAT)) || repeated || - (loghandle->lgh_obj != NULL && - dt_object_remote(loghandle->lgh_obj))) - GOTO(out, rc = 0); - /* <2 records means no more records - * if the last record we processed was - * the final one, then the underlying - * object might have been destroyed yet. - * we better don't access that.. */ - mutex_lock(&loghandle->lgh_hdr_mutex); - records = loghandle->lgh_hdr->llh_count; - mutex_unlock(&loghandle->lgh_hdr_mutex); - if (records <= 1) - GOTO(out, rc = 0); - CDEBUG(D_OTHER, "Re-read last llog buffer for " - "new records, index %u, last %u\n", - index, loghandle->lgh_last_idx); + * process it. Check this case and reread the chunk. + * It is safe to process to lh_last_idx, including + * lh_last_idx if it was synced. We can not do <= + * comparison, cause for wrapped catalog lgh_last_idx + * could be less than index. So we detect last index + * for processing as index == lh_last_idx+1. But when + * catalog is wrapped and full lgh_last_idx=llh_cat_idx, + * the first processing index is llh_cat_idx+1.The + * exception is !(lgh_last_idx == llh_cat_idx && + * index == llh_cat_idx + 1), and after simplification + * it turns to + * lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index + * This exception is working for catalog only. 
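The re-read rule this comment describes (and the condition implemented just below) is easier to follow when written out as a predicate. An illustrative restatement, not code from the patch: the argument names mirror the locals in llog_process_thread(), with tail_idx standing for LLOG_HDR_TAIL(llh)->lrt_index.

/* Returns 1 when the current chunk should be re-read before processing
 * continues; see the three cases in the comment above. */
int llog_chunk_needs_reread(int index, int lh_last_idx, int synced_idx,
                            int tail_idx, int rec_index, int repeated)
{
        /* reached the saved last index, but it is not yet known to be
         * synced to disk */
        if (index == lh_last_idx && synced_idx != index)
                return 1;
        /* one past the saved last index while the header tail has already
         * moved on (wrapped catalog still being appended to) */
        if (index == lh_last_idx + 1 && lh_last_idx != tail_idx)
                return 1;
        /* zeroed record in a partial chunk, first pass over this buffer */
        if (rec_index == 0 && !repeated)
                return 1;
        return 0;
}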
+ */ + + if ((index == lh_last_idx && synced_idx != index) || + (index == (lh_last_idx + 1) && + lh_last_idx != LLOG_HDR_TAIL(llh)->lrt_index) || + (rec->lrh_index == 0 && !repeated)) { + /* save offset inside buffer for the re-read */ buf_offset = (char *)rec - (char *)buf; cur_offset = chunk_offset; repeated = true; + /* We need to be sure lgh_last_idx + * record was saved to disk + */ + down_read(&loghandle->lgh_last_sem); + synced_idx = LLOG_HDR_TAIL(llh)->lrt_index; + up_read(&loghandle->lgh_last_sem); + CDEBUG(D_OTHER, "synced_idx: %d\n", synced_idx); goto repeat; + } repeated = false; - if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) { - CWARN("%s: invalid length %d in llog "DFID - "record for index %d/%d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - rec->lrh_len, + rc = llog_verify_record(loghandle, rec); + if (rc) { + CERROR("%s: invalid record in llog "DFID + " record for index %d/%d: rc = %d\n", + loghandle2name(loghandle), PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - - GOTO(out, rc = -EINVAL); + rec->lrh_index, index, rc); + /* + * the block seem to be corrupted, let's try + * with the next one. reset rc to go to the + * next chunk. + */ + refresh_idx = true; + index = 0; + GOTO(repeat, rc = 0); } if (rec->lrh_index < index) { @@ -581,12 +656,22 @@ static int llog_process_thread(void *arg) } if (rec->lrh_index != index) { - CERROR("%s: "DFID" Invalid record: index %u" - " but expected %u\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - PFID(&loghandle->lgh_id.lgl_oi.oi_fid), - rec->lrh_index, index); - GOTO(out, rc = -ERANGE); + /* + * the last time we couldn't parse the block due + * to corruption, thus has no idea about the + * next index, take it from the block, once. + */ + if (refresh_idx) { + refresh_idx = false; + index = rec->lrh_index; + } else { + CERROR("%s: "DFID" Invalid record: index" + " %u but expected %u\n", + loghandle2name(loghandle), + PFID(&loghandle->lgh_id.lgl_oi.oi_fid), + rec->lrh_index, index); + GOTO(out, rc = -ERANGE); + } } CDEBUG(D_OTHER, @@ -594,15 +679,44 @@ static int llog_process_thread(void *arg) rec->lrh_index, rec->lrh_len, (int)(buf + chunk_size - (char *)rec)); - loghandle->lgh_cur_idx = rec->lrh_index; + /* lgh_cur_offset is used only at llog_test_3 */ loghandle->lgh_cur_offset = (char *)rec - (char *)buf + chunk_offset; /* if set, process the callback on this record */ if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { + struct llog_cookie *lgc; + __u64 tmp_off; + int tmp_idx; + + CDEBUG((llh->llh_flags & LLOG_F_IS_CAT ? + D_HA : D_OTHER), + "index: %d, lh_last_idx: %d " + "synced_idx: %d lgh_last_idx: %d\n", + index, lh_last_idx, synced_idx, + loghandle->lgh_last_idx); + + if (lti != NULL) { + lgc = <i->lgi_cookie; + /* store lu_env for recursive calls */ + tmp_off = lgc->lgc_offset; + tmp_idx = lgc->lgc_index; + + lgc->lgc_offset = (char *)rec - + (char *)buf + chunk_offset; + lgc->lgc_index = rec->lrh_index; + } + /* using lu_env for passing record offset to + * llog_write through various callbacks */ rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, lpi->lpi_cbdata); last_called_index = index; + + if (lti != NULL) { + lgc->lgc_offset = tmp_off; + lgc->lgc_index = tmp_idx; + } + if (rc == LLOG_PROC_BREAK) { GOTO(out, rc); } else if (rc == LLOG_DEL_RECORD) { @@ -627,6 +741,11 @@ static int llog_process_thread(void *arg) } out: + CDEBUG(D_HA, "stop processing %s "DOSTID":%x index %d count %d\n", + ((llh->llh_flags & LLOG_F_IS_CAT) ? 
"catalog" : "plain"), + POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, + index, llh->llh_count); + if (cd != NULL) cd->lpcd_last_idx = last_called_index; @@ -638,7 +757,7 @@ static int llog_process_thread(void *arg) * retry until the umount or abort recovery, see * lod_sub_recovery_thread() */ CERROR("%s retry remote llog process\n", - loghandle->lgh_ctxt->loc_obd->obd_name); + loghandle2name(loghandle)); rc = -EAGAIN; } else { /* something bad happened to the processing of a local @@ -647,7 +766,7 @@ static int llog_process_thread(void *arg) * discard any remaining bits in the header */ CERROR("%s: Local llog found corrupted #"DOSTID":%x" " %s index %d count %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, ((llh->llh_flags & LLOG_F_IS_CAT) ? "catalog" : @@ -687,7 +806,8 @@ static int llog_process_thread_daemonize(void *arg) * used outside of the kernel itself, because it calls * free_nsproxy() which is not exported by the kernel * (defined in kernel/nsproxy.c) */ - atomic_dec(&curr_ns->count); + if (curr_ns) + atomic_dec(&curr_ns->count); } task_unlock(lpi->lpi_reftask); @@ -742,7 +862,7 @@ int llog_process_or_fork(const struct lu_env *env, if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: cannot start thread: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, rc); + loghandle2name(loghandle), rc); GOTO(out_lpi, rc); } wait_for_completion(&lpi->lpi_completion); @@ -979,12 +1099,11 @@ int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, RETURN(-EPROTO); } else if (th == NULL) { CERROR("%s: missed transaction handle\n", - handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name); + loghandle2name(handle)); RETURN(-EPROTO); } else if (handle->lgh_hdr == NULL) { CERROR("%s: loghandle %p with no header\n", - handle->lgh_obj->do_lu.lo_dev->ld_obd->obd_name, - handle); + loghandle2name(handle), handle); RETURN(-EPROTO); } @@ -1073,6 +1192,9 @@ int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(d->dd_rdonly))) + RETURN(-EROFS); + th = dt_trans_create(env, d); if (IS_ERR(th)) GOTO(out, rc = PTR_ERR(th)); @@ -1140,7 +1262,8 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, { struct dt_device *dt; struct thandle *th; - int rc; + bool need_cookie; + int rc; ENTRY; @@ -1150,6 +1273,9 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + if (unlikely(unlikely(dt->dd_rdonly))) + RETURN(-EROFS); + th = dt_trans_create(env, dt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -1163,8 +1289,21 @@ int llog_write(const struct lu_env *env, struct llog_handle *loghandle, if (rc) GOTO(out_trans, rc); + need_cookie = !(idx == LLOG_HEADER_IDX || idx == LLOG_NEXT_IDX); + down_write(&loghandle->lgh_lock); - rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + if (need_cookie) { + struct llog_thread_info *lti = llog_info(env); + + /* cookie comes from llog_process_thread */ + rc = llog_write_rec(env, loghandle, rec, <i->lgi_cookie, + rec->lrh_index, th); + /* upper layer didn`t pass cookie so change rc */ + rc = (rc == 1 ? 
0 : rc); + } else { + rc = llog_write_rec(env, loghandle, rec, NULL, idx, th); + } + up_write(&loghandle->lgh_lock); out_trans: dt_trans_stop(env, dt, th); @@ -1211,20 +1350,7 @@ EXPORT_SYMBOL(llog_open); int llog_close(const struct lu_env *env, struct llog_handle *loghandle) { - struct llog_operations *lop; - int rc; - - ENTRY; - - rc = llog_handle2ops(loghandle, &lop); - if (rc) - GOTO(out, rc); - if (lop->lop_close == NULL) - GOTO(out, rc = -EOPNOTSUPP); - rc = lop->lop_close(env, loghandle); -out: - llog_handle_put(loghandle); - RETURN(rc); + return llog_handle_put(env, loghandle); } EXPORT_SYMBOL(llog_close); @@ -1348,8 +1474,9 @@ __u64 llog_size(const struct lu_env *env, struct llog_handle *llh) rc = llh->lgh_obj->do_ops->do_attr_get(env, llh->lgh_obj, &la); if (rc) { - CERROR("%s: attr_get failed, rc = %d\n", - llh->lgh_ctxt->loc_obd->obd_name, rc); + CERROR("%s: attr_get failed for "DFID": rc = %d\n", + loghandle2name(llh), PFID(&llh->lgh_id.lgl_oi.oi_fid), + rc); return 0; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c index e85e08bbd10c6..91f029052585e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_cat.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -88,13 +88,12 @@ static int llog_cat_new_log(const struct lu_env *env, if (cathandle->lgh_name == NULL) { CWARN("%s: there are no more free slots in catalog " DFID":%x\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), PFID(&cathandle->lgh_id.lgl_oi.oi_fid), cathandle->lgh_id.lgl_ogen); } else { CWARN("%s: there are no more free slots in " - "catalog %s\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + "catalog %s\n", loghandle2name(loghandle), cathandle->lgh_name); } RETURN(-ENOSPC); @@ -153,7 +152,7 @@ static int llog_cat_new_log(const struct lu_env *env, GOTO(out, rc = 0); } else if (rc != 0) { CERROR("%s: can't create new plain llog in catalog: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, rc); + loghandle2name(loghandle), rc); GOTO(out, rc); } @@ -213,11 +212,137 @@ static int llog_cat_new_log(const struct lu_env *env, loghandle->lgh_hdr->llh_flags &= ~LLOG_F_ZAP_WHEN_EMPTY; /* this is to mimic full log, so another llog_cat_current_log() * can skip it and ask for another onet */ - loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) + 1; + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(loghandle->lgh_hdr) + 1; llog_trans_destroy(env, loghandle, th); + if (handle != NULL) + dt_trans_stop(env, dt, handle); RETURN(rc); } +static int llog_cat_refresh(const struct lu_env *env, + struct llog_handle *cathandle) +{ + struct llog_handle *loghandle; + int rc; + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + if (!llog_exist(loghandle)) + continue; + + rc = llog_read_header(env, loghandle, NULL); + if (rc) + goto unlock; + } + + rc = llog_read_header(env, cathandle, NULL); +unlock: + up_write(&loghandle->lgh_lock); + + return rc; +} + +/* + * prepare current/next log for catalog. + * + * if \a *ploghandle is NULL, open it, and declare create, NB, if \a + * *ploghandle is remote, create it synchronously here, see comments + * below. 
+ * + * \a cathandle->lgh_lock is down_read-ed, it gets down_write-ed if \a + * *ploghandle has to be opened. + */ +static int llog_cat_prep_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle **ploghandle, + struct thandle *th) +{ + int rc; + int sem_upgraded; + +start: + rc = 0; + sem_upgraded = 0; + if (IS_ERR_OR_NULL(*ploghandle)) { + up_read(&cathandle->lgh_lock); + down_write(&cathandle->lgh_lock); + sem_upgraded = 1; + if (IS_ERR_OR_NULL(*ploghandle)) { + struct llog_handle *loghandle; + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (!rc) { + *ploghandle = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + if (rc) + GOTO(out, rc); + } + + rc = llog_exist(*ploghandle); + if (rc < 0) + GOTO(out, rc); + if (rc) + GOTO(out, rc = 0); + + if (dt_object_remote(cathandle->lgh_obj)) { + down_write_nested(&(*ploghandle)->lgh_lock, LLOGH_LOG); + if (!llog_exist(*ploghandle)) { + /* For remote operation, if we put the llog object + * creation in the current transaction, then the + * llog object will not be created on the remote + * target until the transaction stop, if other + * operations start before the transaction stop, + * and use the same llog object, will be dependent + * on the success of this transaction. So let's + * create the llog object synchronously here to + * remove the dependency. */ + rc = llog_cat_new_log(env, cathandle, *ploghandle, + NULL); + if (rc == -ESTALE) { + up_write(&(*ploghandle)->lgh_lock); + if (sem_upgraded) + up_write(&cathandle->lgh_lock); + else + up_read(&cathandle->lgh_lock); + + rc = llog_cat_refresh(env, cathandle); + down_read_nested(&cathandle->lgh_lock, + LLOGH_CAT); + if (rc) + return rc; + /* *ploghandle might become NULL, restart */ + goto start; + } + } + up_write(&(*ploghandle)->lgh_lock); + } else { + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; + + rc = llog_declare_create(env, *ploghandle, th); + if (rc) + GOTO(out, rc); + + lirec->lid_hdr.lrh_len = sizeof(*lirec); + rc = llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, + th); + } + +out: + if (sem_upgraded) { + up_write(&cathandle->lgh_lock); + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + if (rc == 0) + goto start; + } + return rc; +} + /* Open an existent log handle and add it to the open list. * This log handle will be closed when all of the records in it are removed. 
* @@ -249,14 +374,21 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { if (cgl->lgl_ogen != logid->lgl_ogen) { CWARN("%s: log "DFID" generation %x != %x\n", - loghandle->lgh_ctxt->loc_obd->obd_name, + loghandle2name(loghandle), PFID(&logid->lgl_oi.oi_fid), cgl->lgl_ogen, logid->lgl_ogen); continue; } + *res = llog_handle_get(loghandle); + if (!*res) { + CERROR("%s: log "DFID" refcount is zero!\n", + loghandle2name(loghandle), + PFID(&logid->lgl_oi.oi_fid)); + continue; + } loghandle->u.phd.phd_cat_handle = cathandle; up_write(&cathandle->lgh_lock); - GOTO(out, rc = 0); + RETURN(rc); } } up_write(&cathandle->lgh_lock); @@ -265,18 +397,20 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, LLOG_OPEN_EXISTS); if (rc < 0) { CERROR("%s: error opening log id "DFID":%x: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, - PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen, rc); + loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid), + logid->lgl_ogen, rc); RETURN(rc); } rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN | fmt, NULL); if (rc < 0) { llog_close(env, loghandle); - loghandle = NULL; + *res = NULL; RETURN(rc); } + *res = llog_handle_get(loghandle); + LASSERT(*res); down_write(&cathandle->lgh_lock); list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); up_write(&cathandle->lgh_lock); @@ -285,11 +419,7 @@ int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; loghandle->u.phd.phd_cookie.lgc_index = loghandle->lgh_hdr->llh_cat_idx; - EXIT; -out: - llog_handle_get(loghandle); - *res = loghandle; - return 0; + RETURN(0); } int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) @@ -314,8 +444,7 @@ int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) if (rc) CERROR("%s: failure destroying log during " "cleanup: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - rc); + loghandle2name(loghandle), rc); index = loghandle->u.phd.phd_cookie.lgc_index; llog_cat_cleanup(env, cathandle, NULL, index); @@ -401,7 +530,7 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, * meet this situation. 
*/ if (IS_ERR_OR_NULL(cathandle->u.chd.chd_next_log)) { CERROR("%s: next log does not exist!\n", - cathandle->lgh_ctxt->loc_obd->obd_name); + loghandle2name(cathandle)); loghandle = ERR_PTR(-EIO); if (cathandle->u.chd.chd_next_log == NULL) { /* Store the error in chd_next_log, so @@ -425,40 +554,6 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, RETURN(loghandle); } -static int llog_cat_update_header(const struct lu_env *env, - struct llog_handle *cathandle) -{ - struct llog_handle *loghandle; - int rc; - ENTRY; - - /* refresh llog */ - down_write(&cathandle->lgh_lock); - if (!cathandle->lgh_stale) { - up_write(&cathandle->lgh_lock); - RETURN(0); - } - list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, - u.phd.phd_entry) { - if (!llog_exist(loghandle)) - continue; - - rc = llog_read_header(env, loghandle, NULL); - if (rc != 0) { - up_write(&cathandle->lgh_lock); - GOTO(out, rc); - } - } - rc = llog_read_header(env, cathandle, NULL); - if (rc == 0) - cathandle->lgh_stale = 0; - up_write(&cathandle->lgh_lock); - if (rc != 0) - GOTO(out, rc); -out: - RETURN(rc); -} - /* Add a single record to the recovery log(s) using a catalog * Returns as llog_write_record * @@ -512,7 +607,7 @@ int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, if (retried++ == 0) GOTO(retry, rc); CERROR("%s: error on 2nd llog: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, rc); + loghandle2name(cathandle), rc); } RETURN(rc); @@ -523,167 +618,43 @@ int llog_cat_declare_add_rec(const struct lu_env *env, struct llog_handle *cathandle, struct llog_rec_hdr *rec, struct thandle *th) { - struct llog_thread_info *lgi = llog_info(env); - struct llog_logid_rec *lirec = &lgi->lgi_logid; - struct llog_handle *loghandle, *next; - int rc = 0; + int rc; ENTRY; - if (cathandle->u.chd.chd_current_log == NULL) { - /* declare new plain llog */ - down_write(&cathandle->lgh_lock); - if (cathandle->u.chd.chd_current_log == NULL) { - rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, - NULL, NULL, LLOG_OPEN_NEW); - if (rc == 0) { - cathandle->u.chd.chd_current_log = loghandle; - list_add_tail(&loghandle->u.phd.phd_entry, - &cathandle->u.chd.chd_head); - } - } - up_write(&cathandle->lgh_lock); - } else if (cathandle->u.chd.chd_next_log == NULL || - IS_ERR(cathandle->u.chd.chd_next_log)) { - /* declare next plain llog */ - down_write(&cathandle->lgh_lock); - if (cathandle->u.chd.chd_next_log == NULL || - IS_ERR(cathandle->u.chd.chd_next_log)) { - rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, - NULL, NULL, LLOG_OPEN_NEW); - if (rc == 0) { - cathandle->u.chd.chd_next_log = loghandle; - list_add_tail(&loghandle->u.phd.phd_entry, - &cathandle->u.chd.chd_head); - } - } - up_write(&cathandle->lgh_lock); - } +start: + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + rc = llog_cat_prep_log(env, cathandle, + &cathandle->u.chd.chd_current_log, th); if (rc) - GOTO(out, rc); + GOTO(unlock, rc); - lirec->lid_hdr.lrh_len = sizeof(*lirec); - - if (!llog_exist(cathandle->u.chd.chd_current_log)) { - if (dt_object_remote(cathandle->lgh_obj)) { - /* For remote operation, if we put the llog object - * creation in the current transaction, then the - * llog object will not be created on the remote - * target until the transaction stop, if other - * operations start before the transaction stop, - * and use the same llog object, will be dependent - * on the success of this transaction. So let's - * create the llog object synchronously here to - * remove the dependency. 
*/ -create_again: - down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); - loghandle = cathandle->u.chd.chd_current_log; - down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); - if (cathandle->lgh_stale) { - up_write(&loghandle->lgh_lock); - up_read(&cathandle->lgh_lock); - GOTO(out, rc = -EIO); - } - if (!llog_exist(loghandle)) { - rc = llog_cat_new_log(env, cathandle, loghandle, - NULL); - if (rc == -ESTALE) - cathandle->lgh_stale = 1; - } - up_write(&loghandle->lgh_lock); - up_read(&cathandle->lgh_lock); - if (rc == -ESTALE) { - rc = llog_cat_update_header(env, cathandle); - if (rc != 0) - GOTO(out, rc); - goto create_again; - } else if (rc < 0) { - GOTO(out, rc); - } - } else { - rc = llog_declare_create(env, - cathandle->u.chd.chd_current_log, th); - if (rc) - GOTO(out, rc); - llog_declare_write_rec(env, cathandle, - &lirec->lid_hdr, -1, th); - } - } + rc = llog_cat_prep_log(env, cathandle, &cathandle->u.chd.chd_next_log, + th); + if (rc) + GOTO(unlock, rc); -write_again: - /* declare records in the llogs */ rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, rec, -1, th); - if (rc == -ESTALE) { - down_write(&cathandle->lgh_lock); - if (cathandle->lgh_stale) { - up_write(&cathandle->lgh_lock); - GOTO(out, rc = -EIO); - } - - cathandle->lgh_stale = 1; - up_write(&cathandle->lgh_lock); - rc = llog_cat_update_header(env, cathandle); - if (rc != 0) - GOTO(out, rc); - goto write_again; - } else if (rc < 0) { - GOTO(out, rc); + if (rc == -ESTALE && dt_object_remote(cathandle->lgh_obj)) { + up_read(&cathandle->lgh_lock); + rc = llog_cat_refresh(env, cathandle); + if (rc) + RETURN(rc); + goto start; } - next = cathandle->u.chd.chd_next_log; - if (!IS_ERR_OR_NULL(next)) { - if (!llog_exist(next)) { - if (dt_object_remote(cathandle->lgh_obj)) { - /* For remote operation, if we put the llog - * object creation in the current transaction, - * then the llog object will not be created on - * the remote target until the transaction stop, - * if other operations start before the - * transaction stop, and use the same llog - * object, will be dependent on the success of - * this transaction. So let's create the llog - * object synchronously here to remove the - * dependency. */ - down_write_nested(&cathandle->lgh_lock, - LLOGH_CAT); - next = cathandle->u.chd.chd_next_log; - if (IS_ERR_OR_NULL(next)) { - /* Sigh, another thread just tried, - * let's fail as well */ - up_write(&cathandle->lgh_lock); - if (next == NULL) - rc = -EIO; - else - rc = PTR_ERR(next); - GOTO(out, rc); - } - - down_write_nested(&next->lgh_lock, LLOGH_LOG); - if (!llog_exist(next)) { - rc = llog_cat_new_log(env, cathandle, - next, NULL); - if (rc < 0) - cathandle->u.chd.chd_next_log = - ERR_PTR(rc); - } - up_write(&next->lgh_lock); - up_write(&cathandle->lgh_lock); - if (rc < 0) - GOTO(out, rc); - } else { - rc = llog_declare_create(env, next, th); - llog_declare_write_rec(env, cathandle, - &lirec->lid_hdr, -1, th); - } - } - /* XXX: we hope for declarations made for existing llog - * this might be not correct with some backends - * where declarations are expected against specific - * object like ZFS with full debugging enabled */ - /*llog_declare_write_rec(env, next, rec, -1, th);*/ - } -out: +#if 0 + /* + * XXX: we hope for declarations made for existing llog this might be + * not correct with some backends where declarations are expected + * against specific object like ZFS with full debugging enabled. 
+ */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_next_log, rec, -1, + th); +#endif +unlock: + up_read(&cathandle->lgh_lock); RETURN(rc); } EXPORT_SYMBOL(llog_cat_declare_add_rec); @@ -746,8 +717,7 @@ int llog_cat_cancel_records(const struct lu_env *env, rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); if (rc) { CDEBUG(D_HA, "%s: cannot find llog for handle "DFID":%x" - ": rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, + ": rc = %d\n", loghandle2name(cathandle), PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, rc); failed++; continue; @@ -762,8 +732,7 @@ int llog_cat_cancel_records(const struct lu_env *env, */ lrc = -ENOENT; CDEBUG(D_HA, "%s: llog "DFID":%x does not exist" - ": rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, + ": rc = %d\n", loghandle2name(cathandle), PFID(&lgl->lgl_oi.oi_fid), lgl->lgl_ogen, lrc); failed++; if (rc == 0) @@ -786,68 +755,86 @@ int llog_cat_cancel_records(const struct lu_env *env, if (rc == 0) rc = lrc; } - llog_handle_put(loghandle); + llog_handle_put(env, loghandle); } if (rc) CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, - rc); + loghandle2name(cathandle), failed, count, rc); RETURN(rc); } EXPORT_SYMBOL(llog_cat_cancel_records); -static int llog_cat_process_cb(const struct lu_env *env, - struct llog_handle *cat_llh, - struct llog_rec_hdr *rec, void *data) +static int llog_cat_process_common(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, + struct llog_handle **llhp) { - struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *llh; + struct llog_logid_rec *lir = container_of(rec, typeof(*lir), lid_hdr); struct llog_log_hdr *hdr; int rc; ENTRY; - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); + if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) { + rc = -EINVAL; + CWARN("%s: invalid record in catalog "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid), + cat_llh->lgh_id.lgl_ogen, rc); + RETURN(rc); } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " - DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog "DFID"\n", + PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), + PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id); if (rc) { - CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - if (rc == -ENOENT || rc == -ESTALE) { - /* After a server crash, a stub of index - * record in catlog could be kept, because - * plain log destroy + catlog index record - * deletion are not atomic. So we end up with - * an index but no actual record. Destroy the - * index and move on. */ - rc = llog_cat_cleanup(env, cat_llh, NULL, - rec->lrh_index); - } + /* After a server crash, a stub of index record in catlog could + * be kept, because plain log destroy + catlog index record + * deletion are not atomic. So we end up with an index but no + * actual record. Destroy the index and move on. 
*/ + if (rc == -ENOENT || rc == -ESTALE) + rc = LLOG_DEL_RECORD; + else if (rc) + CWARN("%s: can't find llog handle "DFID":%x: rc = %d\n", + loghandle2name(cat_llh), + PFID(&lir->lid_id.lgl_oi.oi_fid), + lir->lid_id.lgl_ogen, rc); RETURN(rc); } /* clean old empty llogs, do not consider current llog in use */ - /* ignore remote (lgh_obj=NULL) llogs */ - hdr = llh->lgh_hdr; + /* ignore remote (lgh_obj == NULL) llogs */ + hdr = (*llhp)->lgh_hdr; if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && hdr->llh_count == 1 && cat_llh->lgh_obj != NULL && - llh != cat_llh->u.chd.chd_current_log) { - rc = llog_destroy(env, llh); + *llhp != cat_llh->u.chd.chd_current_log) { + rc = llog_destroy(env, *llhp); if (rc) - CERROR("%s: fail to destroy empty log: rc = %d\n", - llh->lgh_ctxt->loc_obd->obd_name, rc); - GOTO(out, rc = LLOG_DEL_PLAIN); + CWARN("%s: can't destroy empty log "DFID": rc = %d\n", + loghandle2name((*llhp)), + PFID(&lir->lid_id.lgl_oi.oi_fid), rc); + rc = LLOG_DEL_PLAIN; } + RETURN(rc); +} + +static int llog_cat_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_handle *llh = NULL; + int rc; + + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); + if (rc) + GOTO(out, rc); + if (rec->lrh_index < d->lpd_startcat) { /* Skip processing of the logs until startcat */ rc = 0; @@ -864,13 +851,29 @@ static int llog_cat_process_cb(const struct lu_env *env, rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, NULL, false); } + if (rc == -ENOENT && (cat_llh->lgh_hdr->llh_flags & LLOG_F_RM_ON_ERR)) { + /* + * plain llog is reported corrupted, so better to just remove + * it if the caller is fine with that. + */ + CERROR("%s: remove corrupted/missing llog "DFID"\n", + loghandle2name(cat_llh), + PFID(&llh->lgh_id.lgl_oi.oi_fid)); + rc = LLOG_DEL_PLAIN; + } out: /* The empty plain log was destroyed while processing */ - if (rc == LLOG_DEL_PLAIN) + if (rc == LLOG_DEL_PLAIN) { rc = llog_cat_cleanup(env, cat_llh, llh, llh->u.phd.phd_cookie.lgc_index); - llog_handle_put(llh); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } + + if (llh) + llog_handle_put(env, llh); RETURN(rc); } @@ -880,43 +883,62 @@ int llog_cat_process_or_fork(const struct lu_env *env, llog_cb_t cb, void *data, int startcat, int startidx, bool fork) { - struct llog_process_data d; - struct llog_log_hdr *llh = cat_llh->lgh_hdr; - int rc; - ENTRY; + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; - LASSERT(llh->llh_flags & LLOG_F_IS_CAT); - d.lpd_data = data; - d.lpd_cb = cb; - d.lpd_startcat = startcat; - d.lpd_startidx = startidx; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = (startcat == LLOG_CAT_FIRST ? 
0 : startcat); + d.lpd_startidx = startidx; if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && llh->llh_count > 1) { struct llog_process_cat_data cd; CWARN("%s: catlog "DFID" crosses index zero\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, + loghandle2name(cat_llh), PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - - cd.lpcd_first_idx = llh->llh_cat_idx; - cd.lpcd_last_idx = 0; - rc = llog_process_or_fork(env, cat_llh, cat_cb, - &d, &cd, fork); - if (rc != 0) - RETURN(rc); - - cd.lpcd_first_idx = 0; + /*startcat = 0 is default value for general processing */ + if ((startcat != LLOG_CAT_FIRST && + startcat >= llh->llh_cat_idx) || !startcat) { + /* processing the catalog part at the end */ + cd.lpcd_first_idx = (startcat ? startcat : + llh->llh_cat_idx); + if (OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS)) + cd.lpcd_last_idx = cfs_fail_val; + else + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, cat_cb, + &d, &cd, fork); + /* Reset the startcat becasue it has already reached + * catalog bottom. + */ + startcat = 0; + if (rc != 0) + RETURN(rc); + } + /* processing the catalog part at the begining */ + cd.lpcd_first_idx = (startcat == LLOG_CAT_FIRST) ? 0 : startcat; + /* Note, the processing will stop at the lgh_last_idx value, + * and it could be increased during processing. So records + * between current lgh_last_idx and lgh_last_idx in future + * would left unprocessed. + */ cd.lpcd_last_idx = cat_llh->lgh_last_idx; rc = llog_process_or_fork(env, cat_llh, cat_cb, &d, &cd, fork); - } else { + } else { rc = llog_process_or_fork(env, cat_llh, cat_cb, &d, NULL, fork); - } + } - RETURN(rc); + RETURN(rc); } +EXPORT_SYMBOL(llog_cat_process_or_fork); int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, llog_cb_t cb, void *data, int startcat, int startidx) @@ -931,39 +953,33 @@ static int llog_cat_size_cb(const struct lu_env *env, struct llog_rec_hdr *rec, void *data) { struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *llh; - int rc; + struct llog_handle *llh = NULL; __u64 *cum_size = d->lpd_data; __u64 size; + int rc; ENTRY; - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("%s: invalid record in catalog, rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, -EINVAL); - RETURN(-EINVAL); - } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " - DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - rec->lrh_index, PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); + rc = llog_cat_process_common(env, cat_llh, rec, &llh); - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); - if (rc) { - CWARN("%s: cannot find handle for llog "DFID": rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - RETURN(0); - } - size = llog_size(env, llh); - *cum_size += size; + if (rc == LLOG_DEL_PLAIN) { + /* empty log was deleted, don't count it */ + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); + } else { + size = llog_size(env, llh); + *cum_size += size; - CDEBUG(D_INFO, "Add llog entry "DFID" size %llu\n", - PFID(&llh->lgh_id.lgl_oi.oi_fid), size); + CDEBUG(D_INFO, "Add llog entry "DFID" size=%llu, tot=%llu\n", + PFID(&llh->lgh_id.lgl_oi.oi_fid), size, *cum_size); + } - llog_handle_put(llh); + if (llh != NULL) + llog_handle_put(env, llh); RETURN(0); - } __u64 llog_cat_size(const struct lu_env *env, struct 
llog_handle *cat_llh) @@ -977,65 +993,58 @@ __u64 llog_cat_size(const struct lu_env *env, struct llog_handle *cat_llh) } EXPORT_SYMBOL(llog_cat_size); +/* currently returns the number of "free" entries in catalog, + * ie the available entries for a new plain LLOG file creation, + * even if catalog has wrapped + */ +__u32 llog_cat_free_space(struct llog_handle *cat_llh) +{ + /* simulate almost full Catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_CAT_FREE_RECORDS)) + return cfs_fail_val; + + if (cat_llh->lgh_hdr->llh_count == 1) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1; + + if (cat_llh->lgh_last_idx > cat_llh->lgh_hdr->llh_cat_idx) + return LLOG_HDR_BITMAP_SIZE(cat_llh->lgh_hdr) - 1 + + cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; + + /* catalog is presently wrapped */ + return cat_llh->lgh_hdr->llh_cat_idx - cat_llh->lgh_last_idx; +} +EXPORT_SYMBOL(llog_cat_free_space); + static int llog_cat_reverse_process_cb(const struct lu_env *env, struct llog_handle *cat_llh, struct llog_rec_hdr *rec, void *data) { struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; struct llog_handle *llh; - struct llog_log_hdr *hdr; int rc; - if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); - } - CDEBUG(D_HA, "processing log "DFID":%x at index %u of catalog " - DFID"\n", PFID(&lir->lid_id.lgl_oi.oi_fid), lir->lid_id.lgl_ogen, - le32_to_cpu(rec->lrh_index), - PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); - - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); - if (rc) { - CERROR("%s: cannot find handle for llog "DFID": rc = %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - PFID(&lir->lid_id.lgl_oi.oi_fid), rc); - if (rc == -ENOENT || rc == -ESTALE) { - /* After a server crash, a stub of index - * record in catlog could be kept, because - * plain log destroy + catlog index record - * deletion are not atomic. So we end up with - * an index but no actual record. Destroy the - * index and move on. 
*/ - rc = llog_cat_cleanup(env, cat_llh, NULL, - rec->lrh_index); - } - - RETURN(rc); - } + ENTRY; + rc = llog_cat_process_common(env, cat_llh, rec, &llh); - /* clean old empty llogs, do not consider current llog in use */ - hdr = llh->lgh_hdr; - if ((hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && - hdr->llh_count == 1 && - llh != cat_llh->u.chd.chd_current_log) { - rc = llog_destroy(env, llh); - if (rc) - CERROR("%s: fail to destroy empty log: rc = %d\n", - llh->lgh_ctxt->loc_obd->obd_name, rc); - GOTO(out, rc = LLOG_DEL_PLAIN); + /* The empty plain log was destroyed while processing */ + if (rc == LLOG_DEL_PLAIN) { + rc = llog_cat_cleanup(env, cat_llh, llh, + llh->u.phd.phd_cookie.lgc_index); + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); } + if (rc) + RETURN(rc); rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); -out: /* The empty plain was destroyed while processing */ if (rc == LLOG_DEL_PLAIN) rc = llog_cat_cleanup(env, cat_llh, llh, llh->u.phd.phd_cookie.lgc_index); - llog_handle_put(llh); + llog_handle_put(env, llh); RETURN(rc); } @@ -1056,7 +1065,7 @@ int llog_cat_reverse_process(const struct lu_env *env, if (llh->llh_cat_idx >= cat_llh->lgh_last_idx && llh->llh_count > 1) { CWARN("%s: catalog "DFID" crosses index zero\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, + loghandle2name(cat_llh), PFID(&cat_llh->lgh_id.lgl_oi.oi_fid)); cd.lpcd_first_idx = 0; @@ -1114,7 +1123,7 @@ static int llog_cat_set_first_idx(struct llog_handle *cathandle, int idx) } } - CDEBUG(D_RPCTRACE, "catlog "DFID" first idx %u, last_idx %u\n", + CDEBUG(D_HA, "catlog "DFID" first idx %u, last_idx %u\n", PFID(&cathandle->lgh_id.lgl_oi.oi_fid), llh->llh_cat_idx, cathandle->lgh_last_idx); } @@ -1127,11 +1136,13 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle *loghandle, int index) { int rc; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; LASSERT(index); if (loghandle != NULL) { /* remove destroyed llog from catalog list and * chd_current_log variable */ + fid = loghandle->lgh_id.lgl_oi.oi_fid; down_write(&cathandle->lgh_lock); if (cathandle->u.chd.chd_current_log == loghandle) cathandle->u.chd.chd_current_log = NULL; @@ -1150,7 +1161,9 @@ int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, llog_cat_set_first_idx(cathandle, index); rc = llog_cancel_rec(env, cathandle, index); if (rc == 0) - CDEBUG(D_HA, "cancel plain log at index %u of catalog "DFID"\n", - index, PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); + CDEBUG(D_HA, + "cancel plain log "DFID" at index %u of catalog "DFID"\n", + PFID(&fid), index, + PFID(&cathandle->lgh_id.lgl_oi.oi_fid)); return rc; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h index eb9526ad504d0..c42f13ea6824f 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -74,8 +74,8 @@ static inline struct llog_thread_info *llog_info(const struct lu_env *env) int llog_info_init(void); void llog_info_fini(void); -void llog_handle_get(struct llog_handle *loghandle); -void llog_handle_put(struct llog_handle *loghandle); +struct llog_handle *llog_handle_get(struct llog_handle *loghandle); +int llog_handle_put(const struct lu_env *env, struct llog_handle *loghandle); int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle **res, struct llog_logid *logid); int class_config_dump_handler(const struct lu_env *env, @@ -92,4 +92,9 @@ static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) { return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); } +int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec); +static inline char *loghandle2name(const struct llog_handle *lgh) +{ + return lgh->lgh_ctxt->loc_obd->obd_name; +} #endif diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c index 906e6e64ef4e6..276ffa8280c84 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_ioctl.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -33,14 +33,16 @@ #define DEBUG_SUBSYSTEM S_LOG #include -#include +#include #include #include "llog_internal.h" static int str2logid(struct llog_logid *logid, char *str, int len) { - char *start, *end, *endp; - __u64 id, seq; + unsigned long long id, seq; + char *start, *end; + u32 ogen; + int rc; ENTRY; start = str; @@ -56,10 +58,12 @@ static int str2logid(struct llog_logid *logid, char *str, int len) } #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 1, 53, 0) - /* logids used to be input in the form "#id#seq:ogen" before they + /* + * logids used to be input in the form "#id#seq:ogen" before they * were changed over to accept the FID [seq:oid:ver] format. * This is accepted for compatibility reasons, though I doubt - * anyone is actually using this for anything. */ + * anyone is actually using this for anything. 
+ */ if (start[0] != '#') RETURN(-EINVAL); @@ -71,34 +75,37 @@ static int str2logid(struct llog_logid *logid, char *str, int len) RETURN(-EINVAL); *end = '\0'; - id = simple_strtoull(start, &endp, 0); - if (endp != end) - RETURN(-EINVAL); + rc = kstrtoull(start, 0, &id); + if (rc) + RETURN(rc); - start = ++end; - if (start - str >= len - 1) - RETURN(-EINVAL); - end = strchr(start, '#'); - if (end == NULL || end == start) - RETURN(-EINVAL); + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); - *end = '\0'; - seq = simple_strtoull(start, &endp, 0); - if (endp != end) - RETURN(-EINVAL); + end = strchr(start, '#'); + if (!end || end == start) + RETURN(-EINVAL); + + *end = '\0'; + rc = kstrtoull(start, 0, &seq); + if (rc) + RETURN(rc); ostid_set_seq(&logid->lgl_oi, seq); if (ostid_set_id(&logid->lgl_oi, id)) RETURN(-EINVAL); start = ++end; - if (start - str >= len - 1) - RETURN(-EINVAL); - logid->lgl_ogen = simple_strtoul(start, &endp, 16); - if (*endp != '\0') + if (start - str >= len - 1) + RETURN(-EINVAL); + + rc = kstrtouint(start, 16, &ogen); + if (rc) RETURN(-EINVAL); + logid->lgl_ogen = ogen; - RETURN(0); + RETURN(0); #else RETURN(-EINVAL); #endif @@ -107,29 +114,31 @@ static int str2logid(struct llog_logid *logid, char *str, int len) static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + struct obd_ioctl_data *ioc_data = data; static int l, remains; static long from, to; - static char *out; - char *endp; - int cur_index, rc = 0; - - ENTRY; + static char *out; + int cur_index; + int rc = 0; + ENTRY; if (ioc_data && ioc_data->ioc_inllen1 > 0) { - l = 0; - remains = ioc_data->ioc_inllen4 + - cfs_size_round(ioc_data->ioc_inllen1) + - cfs_size_round(ioc_data->ioc_inllen2) + - cfs_size_round(ioc_data->ioc_inllen3); - from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - ioc_data->ioc_inllen1 = 0; - out = ioc_data->ioc_bulk; + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; } cur_index = rec->lrh_index; @@ -139,17 +148,17 @@ static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, RETURN(-LLOG_EEMPTY); if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *loghandle; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - l = snprintf(out, remains, "[index]: %05d [type]: " - "%02x [len]: %04d failed\n", - cur_index, rec->lrh_type, - rec->lrh_len); - } - if (handle->lgh_ctxt == NULL) - RETURN(-EOPNOTSUPP); + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); if (rc) { CDEBUG(D_IOCTL, "cannot find log "DFID":%x\n", @@ -158,16 +167,16 @@ static int llog_check_cb(const struct lu_env 
*env, struct llog_handle *handle, RETURN(rc); } rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); - llog_handle_put(loghandle); + llog_handle_put(env, loghandle); } else { bool ok; - switch (rec->lrh_type) { - case OST_SZ_REC: - case MDS_UNLINK_REC: + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: case MDS_UNLINK64_REC: - case MDS_SETATTR64_REC: - case OBD_CFG_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: case LLOG_GEN_REC: case LLOG_HDR_MAGIC: ok = true; @@ -194,43 +203,46 @@ static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + struct obd_ioctl_data *ioc_data = data; static int l, remains; static long from, to; - static char *out; - char *endp; - int cur_index; - - ENTRY; - if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) { - l = 0; - remains = ioc_data->ioc_inllen4 + - cfs_size_round(ioc_data->ioc_inllen1) + - cfs_size_round(ioc_data->ioc_inllen2) + - cfs_size_round(ioc_data->ioc_inllen3); - from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); - if (*endp != '\0') - RETURN(-EINVAL); - out = ioc_data->ioc_bulk; - ioc_data->ioc_inllen1 = 0; - } - - cur_index = rec->lrh_index; - if (cur_index < from) - RETURN(0); - if (to > 0 && cur_index > to) - RETURN(-LLOG_EEMPTY); - - if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - CERROR("invalid record in catalog\n"); - RETURN(-EINVAL); - } + static char *out; + int cur_index; + int rc; + + ENTRY; + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + round_up(ioc_data->ioc_inllen1, 8) + + round_up(ioc_data->ioc_inllen2, 8) + + round_up(ioc_data->ioc_inllen3, 8); + + rc = kstrtol(ioc_data->ioc_inlbuf2, 0, &from); + if (rc) + RETURN(rc); + + rc = kstrtol(ioc_data->ioc_inlbuf3, 0, &to); + if (rc) + RETURN(rc); + + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } l = snprintf(out, remains, "[index]: %05d [logid]: "DFID":%x\n", @@ -247,21 +259,21 @@ static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, l = snprintf(out, remains, "[index]: %05d [type]: %02x [len]: %04d\n", cur_index, rec->lrh_type, rec->lrh_len); - } - out += l; - remains -= l; - if (remains <= 0) { - CERROR("not enough space for print log records\n"); - RETURN(-LLOG_EEMPTY); - } - - RETURN(0); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); } static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, struct llog_logid *logid) { - struct llog_handle *log; - int rc; + struct llog_handle *log; + int rc; ENTRY; @@ -280,7 +292,7 @@ static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, } llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); out: - llog_handle_put(log); + 
llog_handle_put(env, log); RETURN(rc); } @@ -288,8 +300,8 @@ static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - int rc; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; ENTRY; if (rec->lrh_type != LLOG_LOGID_MAGIC) @@ -303,15 +315,16 @@ static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, struct obd_ioctl_data *data) { - struct llog_logid logid; - int rc = 0; - struct llog_handle *handle = NULL; - char *logname; + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + char *logname, start; ENTRY; logname = data->ioc_inlbuf1; - if (logname[0] == '#' || logname[0] == '[') { + start = logname[0]; + if (start == '#' || start == '[') { rc = str2logid(&logid, logname, data->ioc_inllen1); if (rc) RETURN(rc); @@ -319,8 +332,8 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, LLOG_OPEN_EXISTS); if (rc) RETURN(rc); - } else if (logname[0] == '$' || isalpha(logname[0])) { - if (logname[0] == '$') + } else if (start == '$' || isalpha(start) || isdigit(start)) { + if (start == '$') logname++; rc = llog_open(env, ctxt, &handle, NULL, logname, @@ -328,7 +341,10 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, if (rc) RETURN(rc); } else { - RETURN(-EINVAL); + rc = -EINVAL; + CDEBUG(D_INFO, "%s: invalid log name '%s': rc = %d\n", + ctxt->loc_obd->obd_name, logname, rc); + RETURN(rc); } rc = llog_init_handle(env, handle, 0, NULL); @@ -337,10 +353,10 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, switch (cmd) { case OBD_IOC_LLOG_INFO: { - int l; - int remains = data->ioc_inllen2 + + int l; + int remains = data->ioc_inllen2 + cfs_size_round(data->ioc_inllen1); - char *out = data->ioc_bulk; + char *out = data->ioc_bulk; l = snprintf(out, remains, "logid: "DFID":%x\n" @@ -382,11 +398,12 @@ int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, case OBD_IOC_LLOG_CANCEL: { struct llog_cookie cookie; struct llog_logid plain; - char *endp; + u32 lgc_index; - cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); - if (*endp != '\0') - GOTO(out_close, rc = -EINVAL); + rc = kstrtouint(data->ioc_inlbuf3, 0, &lgc_index); + if (rc) + GOTO(out_close, rc); + cookie.lgc_index = lgc_index; if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { rc = llog_cancel_rec(env, handle, cookie.lgc_index); @@ -453,11 +470,11 @@ int llog_catalog_list(const struct lu_env *env, struct dt_device *d, int count, struct obd_ioctl_data *data, const struct lu_fid *fid) { - int size, i; - struct llog_catid *idarray; - struct llog_logid *id; - char *out; - int l, remains, rc = 0; + int size, i; + struct llog_catid *idarray; + struct llog_logid *id; + char *out; + int l, remains, rc = 0; ENTRY; @@ -480,15 +497,28 @@ int llog_catalog_list(const struct lu_env *env, struct dt_device *d, out = data->ioc_bulk; remains = data->ioc_inllen1; - for (i = 0; i < count; i++) { + /* OBD_FAIL: fetch the catalog records from the specified one */ + if (OBD_FAIL_CHECK(OBD_FAIL_CATLIST)) + data->ioc_count = cfs_fail_val - 1; + for (i = data->ioc_count; i < count; i++) { id = &idarray[i].lci_logid; l = snprintf(out, remains, "catalog_log: "DFID":%x\n", - PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); + 
PFID(&id->lgl_oi.oi_fid), id->lgl_ogen); out += l; remains -= l; - if (remains <= 0) - break; + if (remains <= 0) { + if (remains < 0) { + /* the print is not complete */ + remains += l; + data->ioc_bulk[out - data->ioc_bulk - l] = '\0'; + data->ioc_count = i; + } else { + data->ioc_count = i++; + } + goto out; + } } + data->ioc_count = 0; out: OBD_FREE_LARGE(idarray, size); RETURN(rc); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c index a5cdc6e184185..1d1f953992301 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_obd.c @@ -40,36 +40,36 @@ /* helper functions for calling the llog obd methods */ static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) { - struct llog_ctxt *ctxt; + struct llog_ctxt *ctxt; - OBD_ALLOC_PTR(ctxt); - if (!ctxt) - return NULL; + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; - ctxt->loc_obd = obd; + ctxt->loc_obd = obd; atomic_set(&ctxt->loc_refcount, 1); - return ctxt; + return ctxt; } static void llog_ctxt_destroy(struct llog_ctxt *ctxt) { - if (ctxt->loc_exp) { - class_export_put(ctxt->loc_exp); - ctxt->loc_exp = NULL; - } - if (ctxt->loc_imp) { - class_import_put(ctxt->loc_imp); - ctxt->loc_imp = NULL; - } - OBD_FREE_PTR(ctxt); + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); } int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct obd_llog_group *olg = ctxt->loc_olg; - struct obd_device *obd; - int rc = 0; + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; spin_lock(&olg->olg_lock); if (!atomic_dec_and_test(&ctxt->loc_refcount)) { @@ -84,16 +84,18 @@ int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) /* sync with llog ctxt user thread */ spin_unlock(&obd->obd_dev_lock); - /* obd->obd_starting is needed for the case of cleanup - * in error case while obd is starting up. */ - LASSERTF(obd->obd_starting == 1 || - obd->obd_stopping == 1 || obd->obd_set_up == 0, - "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, - !!obd->obd_stopping, !!obd->obd_set_up); + /* + * obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
+ */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); - /* cleanup the llog ctxt here */ - if (CTXTP(ctxt, cleanup)) - rc = CTXTP(ctxt, cleanup)(env, ctxt); + /* cleanup the llog ctxt here */ + if (ctxt->loc_logops->lop_cleanup) + rc = ctxt->loc_logops->lop_cleanup(env, ctxt); llog_ctxt_destroy(ctxt); wake_up(&olg->olg_waitq); @@ -103,39 +105,40 @@ EXPORT_SYMBOL(__llog_ctxt_put); int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); - struct obd_llog_group *olg; - int rc, idx; - ENTRY; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + + ENTRY; - LASSERT(ctxt != NULL); - LASSERT(ctxt != LP_POISON); + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); - olg = ctxt->loc_olg; - LASSERT(olg != NULL); - LASSERT(olg != LP_POISON); + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); - idx = ctxt->loc_idx; + idx = ctxt->loc_idx; /* - * Banlance the ctxt get when calling llog_cleanup() - */ + * Banlance the ctxt get when calling llog_cleanup() + */ LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); LASSERT(atomic_read(&ctxt->loc_refcount) > 1); - llog_ctxt_put(ctxt); + llog_ctxt_put(ctxt); /* * Try to free the ctxt. */ rc = __llog_ctxt_put(env, ctxt); - if (rc) - CERROR("Error %d while cleaning up ctxt %p\n", - rc, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); - l_wait_event(olg->olg_waitq, - llog_group_ctxt_null(olg, idx), &lwi); + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_cleanup); @@ -143,23 +146,24 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, struct obd_llog_group *olg, int index, struct obd_device *disk_obd, struct llog_operations *op) { - struct llog_ctxt *ctxt; - int rc = 0; - ENTRY; + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; - if (index < 0 || index >= LLOG_MAX_CTXTS) - RETURN(-EINVAL); + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); - LASSERT(olg != NULL); + LASSERT(olg != NULL); - ctxt = llog_new_ctxt(obd); - if (!ctxt) - RETURN(-ENOMEM); + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); - ctxt->loc_obd = obd; - ctxt->loc_olg = olg; - ctxt->loc_idx = index; - ctxt->loc_logops = op; + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; mutex_init(&ctxt->loc_mutex); if (disk_obd != NULL) ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); @@ -169,11 +173,11 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; - rc = llog_group_set_ctxt(olg, ctxt, index); - if (rc) { - llog_ctxt_destroy(ctxt); - if (rc == -EEXIST) { - ctxt = llog_group_get_ctxt(olg, index); + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); if (ctxt) { CDEBUG(D_CONFIG, "%s: ctxt %d already set up\n", obd->obd_name, index); @@ -188,10 +192,10 @@ int llog_setup(const struct lu_env *env, struct obd_device *obd, LASSERT(ctxt->loc_logops == op); llog_ctxt_put(ctxt); } - rc = 0; - } - RETURN(rc); - } + rc = 0; + } + RETURN(rc); + } if (op->lop_setup) { if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) @@ -205,48 +209,28 @@ 
int llog_setup(const struct lu_env *env, struct obd_device *obd, obd->obd_name, index, op->lop_setup, rc); llog_group_clear_ctxt(olg, index); llog_ctxt_destroy(ctxt); - } else { - CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", - obd->obd_name, index); - ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; - } + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_setup); int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) { - int rc = 0; - ENTRY; + int rc = 0; - if (!ctxt) - RETURN(0); + ENTRY; + if (ctxt && ctxt->loc_logops->lop_sync) + rc = ctxt->loc_logops->lop_sync(ctxt, exp, flags); - if (CTXTP(ctxt, sync)) - rc = CTXTP(ctxt, sync)(ctxt, exp, flags); - - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(llog_sync); -int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags) -{ - int rc; - ENTRY; - - if (!ctxt) { - CERROR("No ctxt\n"); - RETURN(-ENODEV); - } - - CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); - rc = CTXTP(ctxt, cancel)(env, ctxt, cookies, flags); - RETURN(rc); -} -EXPORT_SYMBOL(llog_cancel); - /* context key constructor/destructor: llog_key_init, llog_key_fini */ LU_KEY_INIT_FINI(llog, struct llog_thread_info); /* context key: llog_thread_key */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c index ffa1ad0149b25..55088d417146d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_osd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,8 @@ #define DEBUG_SUBSYSTEM S_LOG +#include + #include #include #include @@ -124,8 +126,7 @@ static int llog_osd_create_new_object(const struct lu_env *env, static int llog_osd_exist(struct llog_handle *handle) { LASSERT(handle->lgh_obj); - return dt_object_exists(handle->lgh_obj) && - !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header); + return dt_object_exists(handle->lgh_obj) && !handle->lgh_destroyed; } static void *rec_tail(struct llog_rec_hdr *rec) @@ -362,7 +363,7 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, * the full llog record to write. This is * the beginning of buffer to write, the length * of buffer is stored in \a rec::lrh_len - * \param[out] reccookie pointer to the cookie to return back if needed. + * \param[in,out] reccookie pointer to the cookie to return back if needed. * It is used for further cancel of this llog * record. * \param[in] idx index of the llog record. If \a idx == -1 then @@ -490,26 +491,26 @@ static int llog_osd_write_rec(const struct lu_env *env, &lgi->lgi_off, th); RETURN(rc); - } else if (loghandle->lgh_cur_idx > 0) { + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; + } else if (reccookie != NULL && reccookie->lgc_index > 0) { /** - * The lgh_cur_offset can be used only if index is + * The lgc_offset can be used only if index is * the same. 
*/ - if (idx != loghandle->lgh_cur_idx) { + if (idx != reccookie->lgc_index) { CERROR("%s: modify index mismatch %d %d\n", o->do_lu.lo_dev->ld_obd->obd_name, idx, - loghandle->lgh_cur_idx); + reccookie->lgc_index); RETURN(-EFAULT); } - lgi->lgi_off = loghandle->lgh_cur_offset; + lgi->lgi_off = reccookie->lgc_offset; CDEBUG(D_OTHER, "modify record "DFID": idx:%u, " "len:%u offset %llu\n", PFID(&loghandle->lgh_id.lgl_oi.oi_fid), idx, rec->lrh_len, (long long)lgi->lgi_off); - } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { - lgi->lgi_off = llh->llh_hdr.lrh_len + - (idx - 1) * reclen; } else { /* This can be result of lgh_cur_idx is not set during * llog processing or llh_size is not set to proper @@ -590,6 +591,7 @@ static int llog_osd_write_rec(const struct lu_env *env, RETURN(-ENOSPC); } + down_write(&loghandle->lgh_last_sem); /* increment the last_idx along with llh_tail index, they should * be equal for a llog lifetime */ loghandle->lgh_last_idx++; @@ -673,6 +675,12 @@ static int llog_osd_write_rec(const struct lu_env *env, if (rc) GOTO(out, rc); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LLOG_PROCESS_TIMEOUT) && + cfs_fail_val == (unsigned int)(loghandle->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF)) { + OBD_RACE(OBD_FAIL_LLOG_PROCESS_TIMEOUT); + msleep(1 * MSEC_PER_SEC); + } /* computed index can be used to determine offset for fixed-size * records. This also allows to handle Catalog wrap around case */ if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { @@ -693,6 +701,8 @@ static int llog_osd_write_rec(const struct lu_env *env, if (rc < 0) GOTO(out, rc); + up_write(&loghandle->lgh_last_sem); + CDEBUG(D_HA, "added record "DFID".%u, %u off%llu\n", PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, lgi->lgi_off); @@ -726,6 +736,7 @@ static int llog_osd_write_rec(const struct lu_env *env, } LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; + up_write(&loghandle->lgh_last_sem); RETURN(rc); } @@ -781,19 +792,46 @@ static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, * big enough to handle the remapped records. It is also assumed that records * of a block have the same format (i.e.: the same features enabled). * - * \param[in,out] hdr Header of the block of records to remap. - * \param[in,out] last_hdr Last header, don't read past this point. - * \param[in] flags Flags describing the fields to keep. + * \param[in,out] hdr Header of the block of records to remap. + * \param[in,out] last_hdr Last header, don't read past this point. + * \param[in] flags Flags describing the fields to keep. + * \param[in] extra_flags Flags describing the extra fields to keep. 
*/ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, struct llog_rec_hdr *last_hdr, - enum changelog_rec_flags flags) + struct llog_handle *loghandle) { + enum changelog_rec_flags flags = CLF_SUPPORTED; + enum changelog_rec_extra_flags extra_flags = CLFE_SUPPORTED; + + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_XATTR)) + extra_flags &= ~CLFE_XATTR; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_OMODE)) + extra_flags &= ~CLFE_OPEN; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_NID)) + extra_flags &= ~CLFE_NID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_X_UIDGID)) + extra_flags &= ~CLFE_UIDGID; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_EXTRA_FLAGS)) + flags &= ~CLF_EXTRA_FLAGS; + if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) + flags &= ~CLF_JOBID; + + if (flags == CLF_SUPPORTED && extra_flags == CLFE_SUPPORTED) + return; + if (hdr->lrh_type != CHANGELOG_REC) return; do { struct changelog_rec *rec = (struct changelog_rec *)(hdr + 1); + enum changelog_rec_extra_flags xflag = CLFE_INVALID; + + if (flags & CLF_EXTRA_FLAGS && + rec->cr_flags & CLF_EXTRA_FLAGS) { + xflag = changelog_rec_extra_flags(rec)->cr_extra_flags & + extra_flags; + } if (unlikely(hdr->lrh_len == 0)) { /* It is corruption case, we cannot know the next rec, @@ -810,7 +848,7 @@ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, break; } - changelog_remap_rec(rec, rec->cr_flags & flags); + changelog_remap_rec(rec, rec->cr_flags & flags, xflag); hdr = llog_rec_hdr_next(hdr); /* Yield CPU to avoid soft-lockup if there are too many records * to be handled. */ @@ -864,7 +902,7 @@ static int llog_osd_next_block(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - LASSERT(dt_object_exists(o)); + LASSERT(llog_osd_exist(loghandle)); dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); @@ -928,9 +966,25 @@ static int llog_osd_next_block(const struct lu_env *env, rec = buf; if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) lustre_swab_llog_rec(rec); - tail = (struct llog_rec_tail *)((char *)buf + rc - sizeof(struct llog_rec_tail)); + + if (llog_verify_record(loghandle, rec)) { + /* + * the block seems corrupted. make a pad record so the + * caller can skip the block and try with the next one + */ + rec->lrh_len = rc; + rec->lrh_index = next_idx; + rec->lrh_type = LLOG_PAD_MAGIC; + + tail = rec_tail(rec); + tail->lrt_len = rc; + tail->lrt_index = next_idx; + + GOTO(out, rc = 0); + } + /* get the last record in block */ last_rec = (struct llog_rec_hdr *)((char *)buf + rc - tail->lrt_len); @@ -969,7 +1023,7 @@ static int llog_osd_next_block(const struct lu_env *env, /* sanity check that the start of the new buffer is no farther * than the record that we wanted. This shouldn't happen. 
*/ - if (rec->lrh_index > next_idx) { + if (next_idx && rec->lrh_index > next_idx) { if (!force_mini_rec && next_idx > last_idx) goto retry; @@ -980,9 +1034,7 @@ static int llog_osd_next_block(const struct lu_env *env, } /* Trim unsupported extensions for compat w/ older clients */ - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) - changelog_block_trim_ext(rec, last_rec, - CLF_VERSION | CLF_RENAME); + changelog_block_trim_ext(rec, last_rec, loghandle); GOTO(out, rc = 0); @@ -1040,7 +1092,7 @@ static int llog_osd_prev_block(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - LASSERT(dt_object_exists(o)); + LASSERT(llog_osd_exist(loghandle)); dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); @@ -1117,9 +1169,7 @@ static int llog_osd_prev_block(const struct lu_env *env, } /* Trim unsupported extensions for compat w/ older clients */ - if (!(loghandle->lgh_hdr->llh_flags & LLOG_F_EXT_JOBID)) - changelog_block_trim_ext(rec, last_rec, - CLF_VERSION | CLF_RENAME); + changelog_block_trim_ext(rec, last_rec, loghandle); GOTO(out, rc = 0); } @@ -1408,7 +1458,7 @@ llog_osd_regular_fid_add_name_entry(const struct lu_env *env, (struct dt_key *)name, th); } else { rc = dt_insert(env, dir, (struct dt_rec *)rec, - (struct dt_key *)name, th, 1); + (struct dt_key *)name, th); } dt_write_unlock(env, dir); @@ -1575,8 +1625,7 @@ static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, rec->rec_type = S_IFREG; dt_read_lock(env, llog_dir, 0); rc = dt_insert(env, llog_dir, (struct dt_rec *)rec, - (struct dt_key *)res->lgh_name, - th, 1); + (struct dt_key *)res->lgh_name, th); dt_read_unlock(env, llog_dir); dt_object_put(env, llog_dir); if (rc) @@ -1766,7 +1815,7 @@ static int llog_osd_destroy(const struct lu_env *env, LASSERT(o != NULL); dt_write_lock(env, o, 0); - if (!dt_object_exists(o)) + if (!llog_osd_exist(loghandle)) GOTO(out_unlock, rc = 0); if (loghandle->lgh_name) { @@ -1792,6 +1841,7 @@ static int llog_osd_destroy(const struct lu_env *env, if (rc < 0) GOTO(out_unlock, rc); + loghandle->lgh_destroyed = true; if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); if (rc < 0) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c index 3ab0b430fca14..c644efb64ac1f 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_swab.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -58,9 +58,9 @@ static void print_llogd_body(struct llogd_body *d) void lustre_swab_lu_fid(struct lu_fid *fid) { - __swab64s (&fid->f_seq); - __swab32s (&fid->f_oid); - __swab32s (&fid->f_ver); + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); } EXPORT_SYMBOL(lustre_swab_lu_fid); @@ -80,47 +80,47 @@ void lustre_swab_llog_id(struct llog_logid *log_id) { __swab64s(&log_id->lgl_oi.oi.oi_id); __swab64s(&log_id->lgl_oi.oi.oi_seq); - __swab32s(&log_id->lgl_ogen); + __swab32s(&log_id->lgl_ogen); } void lustre_swab_llogd_body (struct llogd_body *d) { - ENTRY; - print_llogd_body(d); + ENTRY; + print_llogd_body(d); lustre_swab_llog_id(&d->lgd_logid); - __swab32s (&d->lgd_ctxt_idx); - __swab32s (&d->lgd_llh_flags); - __swab32s (&d->lgd_index); - __swab32s (&d->lgd_saved_index); - __swab32s (&d->lgd_len); - __swab64s (&d->lgd_cur_offset); - print_llogd_body(d); - EXIT; + __swab32s(&d->lgd_ctxt_idx); + __swab32s(&d->lgd_llh_flags); + __swab32s(&d->lgd_index); + __swab32s(&d->lgd_saved_index); + __swab32s(&d->lgd_len); + __swab64s(&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; } EXPORT_SYMBOL(lustre_swab_llogd_body); void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) { - __swab64s (&d->lgdc_gen.mnt_cnt); - __swab64s (&d->lgdc_gen.conn_cnt); + __swab64s(&d->lgdc_gen.mnt_cnt); + __swab64s(&d->lgdc_gen.conn_cnt); lustre_swab_llog_id(&d->lgdc_logid); - __swab32s (&d->lgdc_ctxt_idx); + __swab32s(&d->lgdc_ctxt_idx); } EXPORT_SYMBOL(lustre_swab_llogd_conn_body); void lustre_swab_ll_fid(struct ll_fid *fid) { - __swab64s (&fid->id); - __swab32s (&fid->generation); - __swab32s (&fid->f_type); + __swab64s(&fid->id); + __swab32s(&fid->generation); + __swab32s(&fid->f_type); } void lustre_swab_lu_seq_range(struct lu_seq_range *range) { - __swab64s (&range->lsr_start); - __swab64s (&range->lsr_end); - __swab32s (&range->lsr_index); - __swab32s (&range->lsr_flags); + __swab64s(&range->lsr_start); + __swab64s(&range->lsr_end); + __swab32s(&range->lsr_index); + __swab32s(&range->lsr_flags); } EXPORT_SYMBOL(lustre_swab_lu_seq_range); @@ -143,32 +143,32 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) { struct llog_rec_tail *tail = NULL; - __swab32s(&rec->lrh_len); - __swab32s(&rec->lrh_index); - __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); __swab32s(&rec->lrh_id); - switch (rec->lrh_type) { + switch (rec->lrh_type) { case OST_SZ_REC: { - struct llog_size_change_rec *lsc = - (struct llog_size_change_rec *)rec; + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; - lustre_swab_ll_fid(&lsc->lsc_fid); - __swab32s(&lsc->lsc_ioepoch); + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); tail = &lsc->lsc_tail; - break; - } + break; + } case MDS_UNLINK_REC: { - struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; - __swab64s(&lur->lur_oid); - __swab32s(&lur->lur_oseq); - __swab32s(&lur->lur_count); + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); tail = &lur->lur_tail; - break; - } + break; + } case MDS_UNLINK64_REC: { struct llog_unlink64_rec *lur = @@ -199,8 +199,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) lustre_swab_lu_fid(&rnm->cr_sfid); lustre_swab_lu_fid(&rnm->cr_spfid); } - /* Because the tail follows a variable-length structure we need - * to compute its location at runtime */ + /* + * 
Because the tail follows a variable-length structure we need + * to compute its location at runtime + */ tail = (struct llog_rec_tail *)((char *)&cr->cr + changelog_rec_size(&cr->cr) + cr->cr.cr_namelen); @@ -209,14 +211,15 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) case CHANGELOG_USER_REC: { - struct llog_changelog_user_rec *cur = - (struct llog_changelog_user_rec*)rec; + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec *)rec; - __swab32s(&cur->cur_id); - __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + __swab32s(&cur->cur_time); tail = &cur->cur_tail; - break; - } + break; + } case HSM_AGENT_REC: { struct llog_agent_req_rec *arr = @@ -230,8 +233,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) __swab64s(&arr->arr_hai.hai_extent.offset); __swab64s(&arr->arr_hai.hai_extent.length); __swab64s(&arr->arr_hai.hai_gid); - /* no swabing for opaque data */ - /* hai_data[0]; */ + /* + * no swabing for opaque data + * hai_data[0]; + */ break; } @@ -252,6 +257,7 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) (struct llog_setattr64_rec_v2 *)rec; __swab32s(&lsr2->lsr_projid); + __swab32s(&lsr2->lsr_layout_version); tail = &lsr2->lsr_tail; } else { tail = &lsr->lsr_tail; @@ -291,8 +297,8 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) tail = &lgr->lgr_tail; break; } - case LLOG_PAD_MAGIC: - break; + case LLOG_PAD_MAGIC: + break; case UPDATE_REC: { struct llog_update_record *lur = @@ -312,10 +318,10 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) update_records_size(record)); break; } - default: - CERROR("Unknown llog rec type %#x swabbing rec %p\n", - rec->lrh_type, rec); - } + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } if (tail) { __swab32s(&tail->lrt_len); @@ -343,31 +349,33 @@ static void print_llog_hdr(struct llog_log_hdr *h) void lustre_swab_llog_hdr (struct llog_log_hdr *h) { - ENTRY; - print_llog_hdr(h); + ENTRY; + print_llog_hdr(h); lustre_swab_llog_rec(&h->llh_hdr); - print_llog_hdr(h); - EXIT; + print_llog_hdr(h); + EXIT; } EXPORT_SYMBOL(lustre_swab_llog_hdr); void print_lustre_cfg(struct lustre_cfg *lcfg) { - int i; - ENTRY; + int i; - if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ - return; + ENTRY; - CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); - CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; - CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); - CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); - CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); - CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", + libcfs_nid2str(lcfg->lcfg_nid)); CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) @@ -377,47 +385,48 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) lustre_cfg_string(lcfg, i)); } - EXIT; + EXIT; } EXPORT_SYMBOL(print_lustre_cfg); void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) { - int i; - ENTRY; - - __swab32s(&lcfg->lcfg_version); - - if 
(lcfg->lcfg_version != LUSTRE_CFG_VERSION) { - CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", - lcfg->lcfg_version, LUSTRE_CFG_VERSION); - EXIT; - return; - } - - __swab32s(&lcfg->lcfg_command); - __swab32s(&lcfg->lcfg_num); - __swab32s(&lcfg->lcfg_flags); - __swab64s(&lcfg->lcfg_nid); - __swab32s(&lcfg->lcfg_bufcount); - for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) - __swab32s(&lcfg->lcfg_buflens[i]); - - print_lustre_cfg(lcfg); - EXIT; - return; + int i; + + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; } /* used only for compatibility with old on-disk cfg_marker data */ struct cfg_marker32 { - __u32 cm_step; - __u32 cm_flags; - __u32 cm_vers; - __u32 padding; - __u32 cm_createtime; - __u32 cm_canceltime; - char cm_tgtname[MTI_NAME_MAXLEN]; - char cm_comment[MTI_NAME_MAXLEN]; + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; }; #define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ @@ -425,48 +434,51 @@ struct cfg_marker32 { void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) { - struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; - ENTRY; - - if (swab) { - __swab32s(&marker->cm_step); - __swab32s(&marker->cm_flags); - __swab32s(&marker->cm_vers); - } - if (size == sizeof(*cm32)) { - __u32 createtime, canceltime; - /* There was a problem with the original declaration of - * cfg_marker on 32-bit systems because it used time_t as - * a wire protocol structure, and didn't verify this in - * wirecheck. We now have to convert the offsets of the - * later fields in order to work on 32- and 64-bit systems. - * - * Fortunately, the cm_comment field has no functional use - * so can be sacrificed when converting the timestamp size. - * - * Overwrite fields from the end first, so they are not - * clobbered, and use memmove() instead of memcpy() because - * the source and target buffers overlap. 
bug 16771 */ - createtime = cm32->cm_createtime; - canceltime = cm32->cm_canceltime; - memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); - marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; - memmove(marker->cm_tgtname, cm32->cm_tgtname, - sizeof(marker->cm_tgtname)); - if (swab) { - __swab32s(&createtime); - __swab32s(&canceltime); - } - marker->cm_createtime = createtime; - marker->cm_canceltime = canceltime; - CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " - "for target %s, converting\n", - marker->cm_tgtname); - } else if (swab) { - __swab64s(&marker->cm_createtime); - __swab64s(&marker->cm_canceltime); - } - - EXIT; - return; + struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; + + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* + * There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 + */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, + "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c index 27f52aa15078b..f1517ceef7198 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/llog_test.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -39,6 +39,8 @@ #include #include +#include +#include #include #include @@ -47,15 +49,14 @@ /* This is slightly more than the number of records that can fit into a * single llog file, because the llog_log_header takes up some of the * space in the first block that cannot be used for the bitmap. 
*/ -#define LLOG_TEST_RECNUM (LLOG_MIN_CHUNK_SIZE * 8) - +static int llog_test_recnum = (LLOG_MIN_CHUNK_SIZE * 8); static int llog_test_rand; static struct obd_uuid uuid = { .uuid = "test_uuid" }; static struct llog_logid cat_logid; struct llog_mini_rec { - struct llog_rec_hdr lmr_hdr; - struct llog_rec_tail lmr_tail; + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; } __attribute__((packed)); static int verify_handle(char *test, struct llog_handle *llh, int num_recs) @@ -101,8 +102,8 @@ static int verify_handle(char *test, struct llog_handle *llh, int num_recs) static int llog_test_1(const struct lu_env *env, struct obd_device *obd, char *name) { - struct llog_handle *llh; - struct llog_ctxt *ctxt; + struct llog_handle *llh; + struct llog_ctxt *ctxt; int rc; int rc2; @@ -148,11 +149,11 @@ static int test_2_cancel_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_2(const struct lu_env *env, struct obd_device *obd, char *name, struct llog_handle **llh) { - struct llog_ctxt *ctxt; - struct llog_handle *lgh; - struct llog_logid logid; - int rc; - struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct llog_handle *lgh; + struct llog_logid logid; + int rc; + struct llog_mini_rec lmr; ENTRY; @@ -191,7 +192,7 @@ static int llog_test_2(const struct lu_env *env, struct obd_device *obd, logid = lgh->lgh_id; lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf02f02; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; /* Check llog header values are correct after record add/cancel */ CWARN("2b: write 1 llog records, check llh_count\n"); @@ -301,8 +302,10 @@ static int test3_check_n_add_cb(const struct lu_env *env, } else { size_t chunk_size = lgh->lgh_hdr->llh_hdr.lrh_len; - /* For variable size records the start offset is unknown, trust - * the first value and check others are consistent with it. */ + /* + * For variable size records the start offset is unknown, trust + * the first value and check others are consistent with it. + */ if (test_3_rec_off == 0) test_3_rec_off = lgh->lgh_cur_offset; @@ -337,8 +340,10 @@ static int test3_check_n_add_cb(const struct lu_env *env, if (rc < 0) CERROR("cb_test_3: cannot modify record while processing\n"); - /* Add new record to the llog at *last_rec position one by one to - * check that last block is re-read during processing */ + /* + * Add new record to the llog at *last_rec position one by one to + * check that last block is re-read during processing + */ if (cur_idx == *last_rec || cur_idx == (*last_rec + 1)) { rc = llog_write(env, lgh, rec, LLOG_NEXT_IDX); if (rc < 0) @@ -404,7 +409,8 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, llh->lgh_hdr->llh_size = sizeof(struct llog_gen_rec); llh->lgh_hdr->llh_flags |= LLOG_F_IS_FIXSIZE; - /* Fill the llog with 64-bytes records, use 1023 records, + /* + * Fill the llog with 64-bytes records, use 1023 records, * so last chunk will be partially full. Don't change this * value until record size is changed. */ @@ -466,14 +472,17 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, CWARN("3b: write 566 variable size llog records\n"); - /* Drop llh_size to 0 to mark llog as variable-size and write - * header to make this change permanent. */ + /* + * Drop llh_size to 0 to mark llog as variable-size and write + * header to make this change permanent. 
+ */ llh->lgh_hdr->llh_flags &= ~LLOG_F_IS_FIXSIZE; llog_write(env, llh, &llh->lgh_hdr->llh_hdr, LLOG_HEADER_IDX); hdr->lrh_type = OBD_CFG_REC; - /* there are 1025 64-bytes records in llog already, + /* + * there are 1025 64-bytes records in llog already, * the last chunk contains single record, i.e. 64 bytes. * Each pair of variable size records is 200 bytes, so * we will have the following distribution per chunks: @@ -566,15 +575,15 @@ static int llog_test_3(const struct lu_env *env, struct obd_device *obd, /* Test catalogue additions */ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *cath; - char name[10]; - int rc, rc2, i, buflen; - struct llog_mini_rec lmr; - struct llog_cookie cookie; - struct llog_ctxt *ctxt; - int num_recs = 0; - char *buf; - struct llog_rec_hdr *rec; + struct llog_handle *cath, *llh; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr *rec; ENTRY; @@ -582,7 +591,7 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; sprintf(name, "%x", llog_test_rand + 1); CWARN("4a: create a catalog log with name: %s\n", name); @@ -615,6 +624,18 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); + /* estimate the max number of record for the plain llog + * cause it depends on disk size + */ + llh = cath->u.chd.chd_current_log; + if (llh->lgh_max_size != 0) { + llog_test_recnum = (llh->lgh_max_size - + sizeof(struct llog_log_hdr)) / LLOG_MIN_REC_SIZE; + } + + if (llog_test_recnum >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) + llog_test_recnum = LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; + CWARN("4c: cancel 1 log record\n"); rc = llog_cat_cancel_records(env, cath, 1, &cookie); if (rc) { @@ -627,12 +648,12 @@ static int llog_test_4(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("4d: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("4d: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } num_recs++; @@ -680,8 +701,8 @@ static int cat_counter; static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *rec, void *data) { - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct lu_fid fid = {0}; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; if (rec->lrh_type != LLOG_LOGID_MAGIC) { CERROR("invalid record in catalog\n"); @@ -739,7 +760,7 @@ static int llog_cancel_rec_cb(const struct lu_env *env, llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); cancel_count++; - if (cancel_count == LLOG_TEST_RECNUM) + if (cancel_count == llog_test_recnum) RETURN(-LLOG_EEMPTY); RETURN(0); } @@ -747,11 +768,11 @@ static int llog_cancel_rec_cb(const struct lu_env *env, /* Test log and catalogue processing */ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - char name[10]; - int rc, rc2; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; + 
struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; ENTRY; @@ -759,7 +780,7 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; CWARN("5a: re-open catalog by id\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -786,7 +807,7 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("5c: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { @@ -857,14 +878,14 @@ static int llog_test_5(const struct lu_env *env, struct obd_device *obd) static int llog_test_6(const struct lu_env *env, struct obd_device *obd, char *name) { - struct obd_device *mgc_obd; - struct llog_ctxt *ctxt; - struct obd_uuid *mgs_uuid; - struct obd_export *exp; - struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; - struct llog_handle *llh = NULL; - struct llog_ctxt *nctxt; - int rc, rc2; + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); LASSERT(ctxt); @@ -973,9 +994,9 @@ static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct llog_handle *llh; - int rc = 0, i, process_count; - int num_recs = 0; + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; ENTRY; @@ -1058,8 +1079,8 @@ static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) /* Test all llog records writing and processing */ static int llog_test_7(const struct lu_env *env, struct obd_device *obd) { - struct llog_ctxt *ctxt; - int rc; + struct llog_ctxt *ctxt; + int rc; ENTRY; @@ -1158,61 +1179,6 @@ static int llog_test_7(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } -static int llog_truncate(const struct lu_env *env, struct dt_object *o) -{ - struct lu_attr la; - struct thandle *th; - struct dt_device *d; - int rc; - ENTRY; - - LASSERT(o); - d = lu2dt_dev(o->do_lu.lo_dev); - LASSERT(d); - - rc = dt_attr_get(env, o, &la); - if (rc) - RETURN(rc); - - CDEBUG(D_OTHER, "original size %llu\n", la.la_size); - rc = sizeof(struct llog_log_hdr) + sizeof(struct llog_mini_rec); - if (la.la_size < rc) { - CERROR("too small llog: %llu\n", la.la_size); - RETURN(0); - } - - /* drop 2 records */ - la.la_size = la.la_size - (sizeof(struct llog_mini_rec) * 2); - la.la_valid = LA_SIZE; - - th = dt_trans_create(env, d); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); - - rc = dt_declare_attr_set(env, o, &la, th); - if (rc) - GOTO(stop, rc); - - rc = dt_declare_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); - - rc = dt_trans_start_local(env, d, th); - if (rc) - GOTO(stop, rc); - - rc = dt_punch(env, o, la.la_size, OBD_OBJECT_EOF, th); - if (rc) - GOTO(stop, rc); - - rc = dt_attr_set(env, o, &la, th); - if (rc) - GOTO(stop, rc); - -stop: - dt_trans_stop(env, d, th); - - RETURN(rc); -} - static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *rec, void 
*data) { @@ -1222,13 +1188,13 @@ static int test_8_cb(const struct lu_env *env, struct llog_handle *llh, static int llog_test_8(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - char name[10]; - int rc, rc2, i; - int orig_counter; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; - struct dt_object *obj = NULL; + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2, i; + int orig_counter; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct dt_object *obj = NULL; ENTRY; @@ -1236,7 +1202,7 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; CWARN("8a: fill the first plain llog\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -1302,7 +1268,7 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) } } CWARN("8b: second llog "DFID"\n", - PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); + PFID(lu_object_fid(&llh->u.chd.chd_current_log->lgh_obj->do_lu))); rc2 = llog_cat_close(env, llh); if (rc2) { @@ -1312,8 +1278,10 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) GOTO(out_put, rc); } - CWARN("8c: drop two records from the first plain llog\n"); - llog_truncate(env, obj); + /* Here was 8c: drop two records from the first plain llog + * llog_truncate was bad idea cause it creates a wrong state, + * lgh_last_idx is wrong and two records belongs to zeroed buffer + */ CWARN("8d: count survived records\n"); rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); @@ -1335,9 +1303,9 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc); } - if (orig_counter + 200 - 2 != plain_counter) { + if (orig_counter + 200 != plain_counter) { CERROR("found %d records (expected %d)\n", plain_counter, - orig_counter + 200 - 2); + orig_counter + 200); rc = -EIO; } @@ -1360,9 +1328,9 @@ static int llog_test_8(const struct lu_env *env, struct obd_device *obd) static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) { - struct llog_handle *llh; - struct lu_fid fid; - int rc = 0; + struct llog_handle *llh; + struct lu_fid fid; + int rc = 0; ENTRY; @@ -1397,8 +1365,8 @@ static int llog_test_9_sub(const struct lu_env *env, struct llog_ctxt *ctxt) /* Prepare different types of llog records for llog_reader test*/ static int llog_test_9(const struct lu_env *env, struct obd_device *obd) { - struct llog_ctxt *ctxt; - int rc; + struct llog_ctxt *ctxt; + int rc; ENTRY; @@ -1454,17 +1422,80 @@ static int llog_test_9(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + struct task_struct *lpi_reftask; +}; + + +static int llog_test_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + int rc; + + rc = llog_cat_process_or_fork(NULL, lpi->lpi_loghandle, lpi->lpi_cb, + NULL, lpi->lpi_cbdata, 1, 0, true); + + complete(&lpi->lpi_completion); + + lpi->lpi_rc = rc; + if (rc) + CWARN("10h: Error during catalog processing %d\n", rc); + return rc; +} + +static int cat_check_old_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct 
llog_logid_rec *)rec; + struct lu_fid fid = {0}; + struct lu_fid *prev_fid = data; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + if (prev_fid->f_oid > fid.f_oid) { + CWARN("processing old record, fail\n"); + prev_fid->f_oid = 0xbad; + RETURN(-LLOG_EEMPTY); + } + + if (prev_fid->f_oid == 0) { + cfs_fail_loc = OBD_FAIL_ONCE | OBD_FAIL_LLOG_PROCESS_TIMEOUT; + cfs_fail_val = (unsigned int) (llh->lgh_id.lgl_oi.oi.oi_id & + 0xFFFFFFFF); + msleep(1 * MSEC_PER_SEC); + } + *prev_fid = fid; + + RETURN(0); +} + /* test catalog wrap around */ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *cath; - char name[10]; - int rc, rc2, i, enospc, eok; - struct llog_mini_rec lmr; - struct llog_ctxt *ctxt; - struct lu_attr la; - __u64 cat_max_size; - struct dt_device *dt; + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, enospc, eok; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + struct lu_attr la; + __u64 cat_max_size; + struct dt_device *dt; ENTRY; @@ -1472,7 +1503,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) LASSERT(ctxt); lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; - lmr.lmr_hdr.lrh_type = 0xf00f00; + lmr.lmr_hdr.lrh_type = LLOG_OP_MAGIC; snprintf(name, sizeof(name), "%x", llog_test_rand + 2); CWARN("10a: create a catalog log with name: %s\n", name); @@ -1490,9 +1521,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) cat_logid = cath->lgh_id; dt = lu2dt_dev(cath->lgh_obj->do_lu.lo_dev); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10c: sync failed: %d\n", rc); @@ -1503,12 +1536,12 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) cfs_fail_loc = CFS_FAIL_SKIP|OBD_FAIL_CAT_RECORDS; cfs_fail_val = 4; - CWARN("10b: write %d log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10b: write %d log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("10b: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } } @@ -1518,21 +1551,23 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10b: sync failed: %d\n", rc); GOTO(out, rc); } - CWARN("10c: write %d more log records\n", 2 * LLOG_TEST_RECNUM); - for (i = 0; i < 2 * LLOG_TEST_RECNUM; i++) { + CWARN("10c: write %d more log records\n", 2 * llog_test_recnum); + for (i = 0; i < 2 * llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc) { CERROR("10c: write %d 
records failed at #%d: %d\n", - 2*LLOG_TEST_RECNUM, i + 1, rc); + 2*llog_test_recnum, i + 1, rc); GOTO(out, rc); } } @@ -1542,29 +1577,35 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10c: sync failed: %d\n", rc); GOTO(out, rc); } - /* fill last allocated plain LLOG and reach -ENOSPC condition - * because no slot available in Catalog */ + /* + * fill last allocated plain LLOG and reach -ENOSPC condition + * because no slot available in Catalog + */ enospc = 0; eok = 0; - CWARN("10c: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10c: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10c: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } - /* after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC */ + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ if (rc == -ENOSPC) { enospc++; } else { @@ -1573,7 +1614,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { CERROR("10c: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1595,15 +1636,19 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } cat_max_size = la.la_size; - /* cancel all 1st plain llog records to empty it, this will also cause - * its catalog entry to be freed for next forced wrap in 10e */ - CWARN("10d: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + /* + * cancel all 1st plain llog records to empty it, this will also cause + * its catalog entry to be freed for next forced wrap in 10e + */ + CWARN("10d: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10d: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1626,9 +1671,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10d: sync failed: %d\n", rc); @@ -1637,16 +1684,18 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) enospc = 0; eok = 0; - CWARN("10e: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10e: write %d 
more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10e: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } - /* after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC */ + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ if (rc == -ENOSPC) { enospc++; } else { @@ -1655,7 +1704,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { CERROR("10e: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1666,13 +1715,14 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10e: print the catalog entries.. we expect 4\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { - CERROR("10d: process with cat_print_cb failed: %d\n", rc); + CERROR("10e: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); } if (cat_counter != 4) { - CERROR("10d: %d entries in catalog\n", cat_counter); + CERROR("10e: %d entries in catalog\n", cat_counter); GOTO(out, rc = -EINVAL); } @@ -1702,24 +1752,30 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10e: catalog successfully wrap around, last_idx %d, first %d\n", cath->lgh_last_idx, cath->lgh_hdr->llh_cat_idx); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10e: sync failed: %d\n", rc); GOTO(out, rc); } - /* cancel more records to free one more slot in Catalog - * see if it is re-allocated when adding more records */ - CWARN("10f: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + /* + * cancel more records to free one more slot in Catalog + * see if it is re-allocated when adding more records + */ + CWARN("10f: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10f: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1727,7 +1783,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10f: print the catalog entries.. 
we expect 3\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10f: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1742,9 +1799,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10f: sync failed: %d\n", rc); @@ -1753,16 +1812,18 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) enospc = 0; eok = 0; - CWARN("10f: write %d more log records\n", LLOG_TEST_RECNUM); - for (i = 0; i < LLOG_TEST_RECNUM; i++) { + CWARN("10f: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); if (rc && rc != -ENOSPC) { CERROR("10f: write %d records failed at #%d: %d\n", - LLOG_TEST_RECNUM, i + 1, rc); + llog_test_recnum, i + 1, rc); GOTO(out, rc); } - /* after last added plain LLOG has filled up, all new - * records add should fail with -ENOSPC */ + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ if (rc == -ENOSPC) { enospc++; } else { @@ -1771,7 +1832,7 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } } - if ((enospc == 0) && (enospc+eok != LLOG_TEST_RECNUM)) { + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { CERROR("10f: all last records adds should have failed with" " -ENOSPC\n"); GOTO(out, rc = -EINVAL); @@ -1806,9 +1867,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10f: sync failed: %d\n", rc); @@ -1817,16 +1880,18 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) /* will llh_cat_idx also successfully wrap ? */ - /* cancel all records in the plain LLOGs referenced by 2 last indexes in - * Catalog */ + /* + * cancel all records in the plain LLOGs referenced by 2 last indexes in + * Catalog + */ /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is + /* need to indicate error if for any reason llog_test_recnum is * not reached */ if (rc == 0) rc = -ERANGE; @@ -1835,7 +1900,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. 
we expect 3\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1850,9 +1916,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) if (rc) GOTO(out, rc); - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10g: sync failed: %d\n", rc); @@ -1860,13 +1928,15 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1874,7 +1944,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. we expect 2\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1897,9 +1968,11 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) GOTO(out, rc = -EINVAL); } - /* sync device to commit all recent LLOG changes to disk and avoid + /* + * sync device to commit all recent LLOG changes to disk and avoid * to consume a huge space with delayed journal commit callbacks - * particularly on low memory nodes or VMs */ + * particularly on low memory nodes or VMs + */ rc = dt_sync(env, dt); if (rc) { CERROR("10g: sync failed: %d\n", rc); @@ -1907,13 +1980,15 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) } /* cancel more records to free one more slot in Catalog */ - CWARN("10g: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + CWARN("10g: Cancel %d records, see one log zapped\n", llog_test_recnum); cancel_count = 0; rc = llog_cat_process(env, cath, llog_cancel_rec_cb, "foobar", 0, 0); if (rc != -LLOG_EEMPTY) { CERROR("10g: process with llog_cancel_rec_cb failed: %d\n", rc); - /* need to indicate error if for any reason LLOG_TEST_RECNUM is - * not reached */ + /* + * need to indicate error if for any reason llog_test_recnum is + * not reached + */ if (rc == 0) rc = -ERANGE; GOTO(out, rc); @@ -1921,7 +1996,8 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: print the catalog entries.. 
we expect 1\n"); cat_counter = 0; - rc = llog_process(env, cath, cat_print_cb, "test 10", NULL); + rc = llog_cat_process_or_fork(env, cath, cat_print_cb, NULL, "test 10", + 0, 0, false); if (rc) { CERROR("10g: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -1946,6 +2022,64 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) CWARN("10g: llh_cat_idx has also successfully wrapped!\n"); + /* + * catalog has only one valid entry other slots has outdated + * records. Trying to race the llog_thread_process with llog_add + * llog_thread_process read buffer and loop record on it. + * llog_add adds a record and mark a record in bitmap. + * llog_thread_process process record with old data. + */ + { + struct llog_process_info lpi; + struct lu_fid test_fid = {0}; + + lpi.lpi_loghandle = cath; + lpi.lpi_cb = cat_check_old_cb; + lpi.lpi_catdata = NULL; + lpi.lpi_cbdata = &test_fid; + init_completion(&lpi.lpi_completion); + + kthread_run(llog_test_process_thread, &lpi, "llog_test_process_thread"); + + msleep(1 * MSEC_PER_SEC / 2); + enospc = 0; + eok = 0; + CWARN("10h: write %d more log records\n", llog_test_recnum); + for (i = 0; i < llog_test_recnum; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL); + if (rc && rc != -ENOSPC) { + CERROR("10h: write %d records failed at #%d: %d\n", + llog_test_recnum, i + 1, rc); + GOTO(out, rc); + } + /* + * after last added plain LLOG has filled up, all new + * records add should fail with -ENOSPC + */ + if (rc == -ENOSPC) { + enospc++; + } else { + enospc = 0; + eok++; + } + } + + if ((enospc == 0) && (enospc+eok != llog_test_recnum)) { + CERROR("10h: all last records adds should have failed with" + " -ENOSPC\n"); + GOTO(out, rc = -EINVAL); + } + + CWARN("10h: wrote %d records then %d failed with ENOSPC\n", eok, + enospc); + + wait_for_completion(&lpi.lpi_completion); + + if (lpi.lpi_rc != 0) { + CERROR("10h: race happened, old record was processed\n"); + GOTO(out, rc = -EINVAL); + } + } out: cfs_fail_loc = 0; cfs_fail_val = 0; @@ -1962,15 +2096,17 @@ static int llog_test_10(const struct lu_env *env, struct obd_device *obd) RETURN(rc); } -/* ------------------------------------------------------------------------- +/* + * ------------------------------------------------------------------------- * Tests above, boring obd functions below - * ------------------------------------------------------------------------- */ + * ------------------------------------------------------------------------- + */ static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) { - struct llog_handle *llh = NULL; - struct llog_ctxt *ctxt; - int rc, err; - char name[10]; + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; ENTRY; ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); @@ -2032,9 +2168,9 @@ static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) static int llog_test_cleanup(struct obd_device *obd) { - struct obd_device *tgt; - struct lu_env env; - int rc; + struct obd_device *tgt; + struct lu_env env; + int rc; ENTRY; @@ -2052,32 +2188,32 @@ static int llog_test_cleanup(struct obd_device *obd) static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { - struct obd_device *tgt; - struct llog_ctxt *ctxt; - struct dt_object *o; - struct lu_env env; - struct lu_context test_session; - int rc; - - ENTRY; - - if (lcfg->lcfg_bufcount < 2) { - CERROR("requires a TARGET OBD name\n"); - RETURN(-EINVAL); - } + struct obd_device *tgt; + struct 
llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; - if (lcfg->lcfg_buflens[1] < 1) { - CERROR("requires a TARGET OBD name\n"); - RETURN(-EINVAL); - } + ENTRY; - /* disk obd */ - tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); - if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { - CERROR("target device not attached or not set up (%s)\n", - lustre_cfg_string(lcfg, 1)); - RETURN(-EINVAL); - } + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); if (rc) @@ -2126,14 +2262,14 @@ static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) } static struct obd_ops llog_obd_ops = { - .o_owner = THIS_MODULE, - .o_setup = llog_test_setup, - .o_cleanup = llog_test_cleanup, + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, }; static int __init llog_test_init(void) { - return class_register_type(&llog_obd_ops, NULL, true, NULL, + return class_register_type(&llog_obd_ops, NULL, false, NULL, "llog_test", NULL); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c index 89b227b0cfa09..04c25ebd88274 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/local_storage.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * lustre/obdclass/local_storage.c @@ -388,14 +388,14 @@ static struct dt_object *__local_file_create(const struct lu_env *env, rec->rec_fid = fid; /* Add "." and ".." 
for newly created dir */ rc = dt_insert(env, dto, (const struct dt_rec *)rec, - (const struct dt_key *)".", th, 1); + (const struct dt_key *)".", th); if (rc != 0) GOTO(destroy, rc); dt_ref_add(env, dto, th); rec->rec_fid = lu_object_fid(&parent->do_lu); rc = dt_insert(env, dto, (const struct dt_rec *)rec, - (const struct dt_key *)"..", th, 1); + (const struct dt_key *)"..", th); if (rc != 0) GOTO(destroy, rc); } @@ -404,7 +404,7 @@ static struct dt_object *__local_file_create(const struct lu_env *env, rec->rec_type = dto->do_lu.lo_header->loh_attr; dt_write_lock(env, parent, LOS_PARENT); rc = dt_insert(env, parent, (const struct dt_rec *)rec, - (const struct dt_key *)name, th, 1); + (const struct dt_key *)name, th); if (dti->dti_dof.dof_type == DFT_DIR) dt_ref_add(env, parent, th); dt_write_unlock(env, parent); @@ -684,7 +684,7 @@ int local_object_unlink(const struct lu_env *env, struct dt_device *dt, rec->rec_fid = &dti->dti_fid; rec->rec_type = dto->do_lu.lo_header->loh_attr; rc = dt_insert(env, parent, (const struct dt_rec *)rec, - (const struct dt_key *)name, th, 1); + (const struct dt_key *)name, th); GOTO(unlock, rc); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c index 00395af273593..37d749d199275 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_jobstats.c @@ -30,10 +30,8 @@ #define DEBUG_SUBSYSTEM S_CLASS - #include #include -#include #ifdef CONFIG_PROC_FS @@ -67,8 +65,8 @@ struct job_stat { struct hlist_node js_hash; /* hash struct for this jobid */ struct list_head js_list; /* on ojs_list, with ojs_lock */ atomic_t js_refcount; /* num users of this struct */ - char js_jobid[LUSTRE_JOBID_SIZE]; /* job name */ - time_t js_timestamp; /* seconds of most recent stat*/ + char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/ + time64_t js_timestamp; /* seconds of most recent stat*/ struct lprocfs_stats *js_stats; /* per-job statistics */ struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */ }; @@ -164,7 +162,7 @@ static int job_cleanup_iter_callback(struct cfs_hash *hs, struct cfs_hash_bd *bd, struct hlist_node *hnode, void *data) { - time_t oldest_time = *((time_t *)data); + time64_t oldest_time = *((time64_t *)data); struct job_stat *job; job = hlist_entry(hnode, struct job_stat, js_hash); @@ -193,8 +191,8 @@ static int job_cleanup_iter_callback(struct cfs_hash *hs, */ static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) { - time_t now = cfs_time_current_sec(); - time_t oldest; + time64_t now = ktime_get_real_seconds(); + time64_t oldest; if (likely(before >= 0)) { unsigned int cleanup_interval = stats->ojs_cleanup_interval; @@ -234,7 +232,7 @@ static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before) write_lock(&stats->ojs_lock); stats->ojs_cleaning = false; - stats->ojs_last_cleanup = cfs_time_current_sec(); + stats->ojs_last_cleanup = ktime_get_real_seconds(); write_unlock(&stats->ojs_lock); } @@ -254,8 +252,8 @@ static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) jobs->ojs_cntr_init_fn(job->js_stats); - memcpy(job->js_jobid, jobid, LUSTRE_JOBID_SIZE); - job->js_timestamp = cfs_time_current_sec(); + memcpy(job->js_jobid, jobid, sizeof(job->js_jobid)); + job->js_timestamp = ktime_get_real_seconds(); job->js_jobstats = jobs; INIT_HLIST_NODE(&job->js_hash); INIT_LIST_HEAD(&job->js_list); @@ -315,7 +313,7 @@ int lprocfs_job_stats_log(struct 
obd_device *obd, char *jobid, found: LASSERT(stats == job->js_jobstats); - job->js_timestamp = cfs_time_current_sec(); + job->js_timestamp = ktime_get_real_seconds(); lprocfs_counter_add(job->js_stats, event, amount); job_putref(job); @@ -444,7 +442,7 @@ static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) } seq_putc(p, '\n'); - seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); + seq_printf(p, " %-16s %lld\n", "snapshot_time:", job->js_timestamp); s = job->js_stats; for (i = 0; i < s->ls_num; i++) { @@ -515,7 +513,7 @@ static ssize_t lprocfs_jobstats_seq_write(struct file *file, if (stats->ojs_hash == NULL) return -ENODEV; - if (lprocfs_copy_from_user(file, jobid, buf, len)) + if (copy_from_user(jobid, buf, len)) return -EFAULT; jobid[len] = 0; @@ -615,7 +613,7 @@ int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, stats->ojs_cntr_num = cntr_num; stats->ojs_cntr_init_fn = init_fn; stats->ojs_cleanup_interval = 600; /* 10 mins by default */ - stats->ojs_last_cleanup = cfs_time_current_sec(); + stats->ojs_last_cleanup = ktime_get_real_seconds(); entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats, &lprocfs_jobstats_seq_fops); @@ -626,45 +624,38 @@ int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, RETURN(0); } EXPORT_SYMBOL(lprocfs_job_stats_init); +#endif /* CONFIG_PROC_FS*/ -int lprocfs_job_interval_seq_show(struct seq_file *m, void *data) +ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); struct obd_job_stats *stats; - if (obd == NULL) - return -ENODEV; - stats = &obd->u.obt.obt_jobstats; - seq_printf(m, "%d\n", stats->ojs_cleanup_interval); - return 0; + return scnprintf(buf, PAGE_SIZE, "%d\n", stats->ojs_cleanup_interval); } -EXPORT_SYMBOL(lprocfs_job_interval_seq_show); +EXPORT_SYMBOL(job_cleanup_interval_show); -ssize_t -lprocfs_job_interval_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ssize_t job_cleanup_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *obd; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); struct obd_job_stats *stats; + unsigned int val; int rc; - __s64 val; - - obd = ((struct seq_file *)file->private_data)->private; - if (obd == NULL) - return -ENODEV; stats = &obd->u.obt.obt_jobstats; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > UINT_MAX) - return -ERANGE; stats->ojs_cleanup_interval = val; lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval); return count; } -EXPORT_SYMBOL(lprocfs_job_interval_seq_write); -#endif /* CONFIG_PROC_FS*/ +EXPORT_SYMBOL(job_cleanup_interval_store); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index f3d2efc8403ba..8b8a12539da61 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,7 +38,6 @@ #include #include -#include #ifdef CONFIG_PROC_FS @@ -48,52 +47,15 @@ MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs #define MAX_STRING_SIZE 128 -static const struct file_operations lprocfs_kernel_dummy = {}; - -/* - * Awful hacks to mark procfs seq writes as going to kernel space. Used - * to be done with set_fs(KERNEL_DS), but that function is no more. - * This should only be called from class_process_proc_param(), which passes - * in a fake file structure. It should never, ever be used for anything else. - */ -void lprocfs_file_set_kernel(struct file *file) -{ - LASSERT(file->f_op == NULL); - file->f_op = &lprocfs_kernel_dummy; -} -EXPORT_SYMBOL(lprocfs_file_set_kernel); - -bool lprocfs_file_is_kernel(struct file *file) -{ - return (file->f_op == &lprocfs_kernel_dummy); -} -EXPORT_SYMBOL(lprocfs_file_is_kernel); - -unsigned long -lprocfs_copy_from_user(struct file *file, void *to, - const void __user *from, unsigned long n) -{ - unsigned long res; - - if (lprocfs_file_is_kernel(file)) { - memcpy(to, from, n); - res = 0; - } else - res = copy_from_user(to, from, n); - - return res; -} -EXPORT_SYMBOL(lprocfs_copy_from_user); - int lprocfs_single_release(struct inode *inode, struct file *file) { - return single_release(inode, file); + return single_release(inode, file); } EXPORT_SYMBOL(lprocfs_single_release); int lprocfs_seq_release(struct inode *inode, struct file *file) { - return seq_release(inode, file); + return seq_release(inode, file); } EXPORT_SYMBOL(lprocfs_seq_release); @@ -116,8 +78,8 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, struct proc_dir_entry *proc; umode_t mode; - if (root == NULL || name == NULL || fops == NULL) - return ERR_PTR(-EINVAL); + if (!root || !name || !fops) + return ERR_PTR(-EINVAL); mode = default_mode(fops); proc = proc_create_data(name, mode, root, fops, data); @@ -126,42 +88,43 @@ lprocfs_add_simple(struct proc_dir_entry *root, char *name, name); return ERR_PTR(-ENOMEM); } - return proc; + return proc; } EXPORT_SYMBOL(lprocfs_add_simple); struct proc_dir_entry *lprocfs_add_symlink(const char *name, - struct proc_dir_entry *parent, const char *format, ...) + struct proc_dir_entry *parent, + const char *format, ...) 
{ - struct proc_dir_entry *entry; - char *dest; - va_list ap; + struct proc_dir_entry *entry; + char *dest; + va_list ap; - if (parent == NULL || format == NULL) - return NULL; + if (!parent || !format) + return NULL; - OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); - if (dest == NULL) - return NULL; + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + return NULL; - va_start(ap, format); - vsnprintf(dest, MAX_STRING_SIZE, format, ap); - va_end(ap); + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); - entry = proc_symlink(name, parent, dest); - if (entry == NULL) + entry = proc_symlink(name, parent, dest); + if (!entry) CERROR("LprocFS: Could not create symbolic link from " "%s to %s\n", name, dest); - OBD_FREE(dest, MAX_STRING_SIZE + 1); - return entry; + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; } EXPORT_SYMBOL(lprocfs_add_symlink); static const struct file_operations ldebugfs_empty_ops = { }; int ldebugfs_add_vars(struct dentry *parent, struct ldebugfs_vars *list, - void *data) + void *data) { if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) return -EINVAL; @@ -206,10 +169,10 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, void *data) { - if (root == NULL || list == NULL) + if (!root || !list) return -EINVAL; - while (list->name != NULL) { + while (list->name) { struct proc_dir_entry *proc; umode_t mode = 0; @@ -220,7 +183,7 @@ lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, proc = proc_create_data(list->name, mode, root, list->fops ?: &lprocfs_empty_ops, list->data ?: data); - if (proc == NULL) + if (!proc) return -ENOMEM; list++; } @@ -230,7 +193,7 @@ EXPORT_SYMBOL(lprocfs_add_vars); void ldebugfs_remove(struct dentry **entryp) { - debugfs_remove(*entryp); + debugfs_remove_recursive(*entryp); *entryp = NULL; } EXPORT_SYMBOL_GPL(ldebugfs_remove); @@ -248,36 +211,38 @@ static void lprocfs_remove_nolock(struct proc_dir_entry **proot) struct proc_dir_entry *parent; *proot = NULL; - if (root == NULL || IS_ERR(root)) + if (!root || IS_ERR(root)) return; - parent = root->parent; - LASSERT(parent != NULL); + parent = root->parent; + LASSERT(parent != NULL); - while (1) { - while (temp->subdir != NULL) - temp = temp->subdir; + while (1) { + while (temp->subdir) + temp = temp->subdir; - rm_entry = temp; - temp = temp->parent; + rm_entry = temp; + temp = temp->parent; - /* Memory corruption once caused this to fail, and - without this LASSERT we would loop here forever. */ - LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, - "0x%p %s/%s len %d\n", rm_entry, temp->name, - rm_entry->name, (int)strlen(rm_entry->name)); + /* + * Memory corruption once caused this to fail, and + * without this LASSERT we would loop here forever. 
+ */ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, temp->name, + rm_entry->name, (int)strlen(rm_entry->name)); - remove_proc_entry(rm_entry->name, temp); - if (temp == parent) - break; - } + remove_proc_entry(rm_entry->name, temp); + if (temp == parent) + break; + } } int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry *t = NULL; - struct proc_dir_entry **p; - int len, busy = 0; + struct proc_dir_entry *t = NULL; + struct proc_dir_entry **p; + int len, busy = 0; LASSERT(parent != NULL); len = strlen(name); @@ -376,10 +341,10 @@ lprocfs_register(const char *name, struct proc_dir_entry *parent, struct proc_dir_entry *newchild; newchild = proc_mkdir(name, parent); - if (newchild == NULL) + if (!newchild) return ERR_PTR(-ENOMEM); - if (list != NULL) { + if (list) { int rc = lprocfs_add_vars(newchild, list, data); if (rc) { lprocfs_remove(&newchild); @@ -391,93 +356,6 @@ lprocfs_register(const char *name, struct proc_dir_entry *parent, EXPORT_SYMBOL(lprocfs_register); /* Generic callbacks */ -int lprocfs_uint_seq_show(struct seq_file *m, void *data) -{ - seq_printf(m, "%u\n", *(unsigned int *)data); - return 0; -} -EXPORT_SYMBOL(lprocfs_uint_seq_show); - -int lprocfs_wr_uint(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - unsigned *p = data; - char dummy[MAX_STRING_SIZE + 1]; - char *end; - unsigned long tmp; - - if (count >= sizeof(dummy)) - return -EINVAL; - - if (count == 0) - return 0; - - if (lprocfs_copy_from_user(file, dummy, buffer, count)) - return -EFAULT; - - dummy[count] = 0; - - tmp = simple_strtoul(dummy, &end, 0); - if (dummy == end) - return -EINVAL; - - *p = (unsigned int)tmp; - return count; -} -EXPORT_SYMBOL(lprocfs_wr_uint); - -ssize_t lprocfs_uint_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - int *data = ((struct seq_file *)file->private_data)->private; - int rc; - __s64 val = 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc < 0) - return rc; - - return lprocfs_wr_uint(file, buffer, count, data); -} -EXPORT_SYMBOL(lprocfs_uint_seq_write); - -int lprocfs_u64_seq_show(struct seq_file *m, void *data) -{ - LASSERT(data != NULL); - seq_printf(m, "%llu\n", *(__u64 *)data); - return 0; -} -EXPORT_SYMBOL(lprocfs_u64_seq_show); - -int lprocfs_atomic_seq_show(struct seq_file *m, void *data) -{ - atomic_t *atom = data; - LASSERT(atom != NULL); - seq_printf(m, "%d\n", atomic_read(atom)); - return 0; -} -EXPORT_SYMBOL(lprocfs_atomic_seq_show); - -ssize_t -lprocfs_atomic_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - atomic_t *atm = ((struct seq_file *)file->private_data)->private; - __s64 val = 0; - int rc; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); - if (rc < 0) - return rc; - - if (val <= 0 || val > INT_MAX) - return -ERANGE; - - atomic_set(atm, val); - return count; -} -EXPORT_SYMBOL(lprocfs_atomic_seq_write); - int lprocfs_uuid_seq_show(struct seq_file *m, void *data) { struct obd_device *obd = data; @@ -488,114 +366,163 @@ int lprocfs_uuid_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_uuid_seq_show); -int lprocfs_name_seq_show(struct seq_file *m, void *data) +static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = data; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(dev != NULL); - seq_printf(m, "%s\n", 
dev->obd_name); - return 0; + return sprintf(buf, "%s\n", obd->obd_uuid.uuid); } -EXPORT_SYMBOL(lprocfs_name_seq_show); +LUSTRE_RO_ATTR(uuid); -int lprocfs_blksize_seq_show(struct seq_file *m, void *data) +static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) - seq_printf(m, "%u\n", osfs.os_bsize); + return sprintf(buf, "%u\n", osfs.os_bsize); + return rc; } -EXPORT_SYMBOL(lprocfs_blksize_seq_show); +LUSTRE_RO_ATTR(blocksize); -int lprocfs_kbytestotal_seq_show(struct seq_file *m, void *data) +static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_blocks; while (blk_size >>= 1) result <<= 1; - seq_printf(m, "%llu\n", result); + return sprintf(buf, "%llu\n", result); } + return rc; } -EXPORT_SYMBOL(lprocfs_kbytestotal_seq_show); +LUSTRE_RO_ATTR(kbytestotal); -int lprocfs_kbytesfree_seq_show(struct seq_file *m, void *data) +static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; + u32 blk_size = osfs.os_bsize >> 10; + u64 result = osfs.os_bfree; while (blk_size >>= 1) result <<= 1; - seq_printf(m, "%llu\n", result); + return sprintf(buf, "%llu\n", result); } + return rc; } -EXPORT_SYMBOL(lprocfs_kbytesfree_seq_show); +LUSTRE_RO_ATTR(kbytesfree); -int lprocfs_kbytesavail_seq_show(struct seq_file *m, void *data) +static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; + u32 blk_size = osfs.os_bsize >> 10; + u64 result = 
osfs.os_bavail; while (blk_size >>= 1) result <<= 1; - seq_printf(m, "%llu\n", result); + return sprintf(buf, "%llu\n", result); } + return rc; } -EXPORT_SYMBOL(lprocfs_kbytesavail_seq_show); +LUSTRE_RO_ATTR(kbytesavail); -int lprocfs_filestotal_seq_show(struct seq_file *m, void *data) +static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) - seq_printf(m, "%llu\n", osfs.os_files); + return sprintf(buf, "%llu\n", osfs.os_files); + return rc; } -EXPORT_SYMBOL(lprocfs_filestotal_seq_show); +LUSTRE_RO_ATTR(filestotal); -int lprocfs_filesfree_seq_show(struct seq_file *m, void *data) +static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_statfs osfs; + int rc; + + rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS, + OBD_STATFS_NODELAY); if (!rc) - seq_printf(m, "%llu\n", osfs.os_ffree); + return sprintf(buf, "%llu\n", osfs.os_ffree); + return rc; } -EXPORT_SYMBOL(lprocfs_filesfree_seq_show); +LUSTRE_RO_ATTR(filesfree); + +ssize_t conn_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ptlrpc_connection *conn; + ssize_t count; + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + count = sprintf(buf, "%s\n", conn->c_remote_uuid.uuid); + else + count = sprintf(buf, "%s\n", ""); + + LPROCFS_CLIMP_EXIT(obd); + return count; +} +EXPORT_SYMBOL(conn_uuid_show); int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) { @@ -616,26 +543,6 @@ int lprocfs_server_uuid_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_server_uuid_seq_show); -int lprocfs_conn_uuid_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct ptlrpc_connection *conn; - int rc = 0; - - LASSERT(obd != NULL); - - LPROCFS_CLIMP_CHECK(obd); - conn = obd->u.cli.cl_import->imp_connection; - if (conn && obd->u.cli.cl_import) - seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); - else - seq_printf(m, "%s\n", ""); - - LPROCFS_CLIMP_EXIT(obd); - return rc; -} -EXPORT_SYMBOL(lprocfs_conn_uuid_seq_show); - /** add up per-cpu counters */ /** @@ -729,14 +636,14 @@ void lprocfs_stats_unlock(struct lprocfs_stats *stats, void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, struct lprocfs_counter *cnt) { - unsigned int num_entry; - struct lprocfs_counter *percpu_cntr; - int i; - unsigned long flags = 0; + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + int i; + unsigned long flags = 0; memset(cnt, 0, sizeof(*cnt)); - if (stats == NULL) { + if (!stats) { /* set count to 1 to avoid divide-by-zero errs in callers */ cnt->lc_count = 1; return; @@ -747,7 +654,7 @@ void 
lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_entry; i++) { - if (stats->ls_percpu[i] == NULL) + if (!stats->ls_percpu[i]) continue; percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); @@ -763,16 +670,6 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); } -/** - * Append a space separated list of current set flags to str. - */ -#define flag2str(flag) \ - do { \ - if (imp->imp_##flag) { \ - seq_printf(m, "%s" #flag, first ? "" : ", "); \ - first = false; \ - } \ - } while (0) static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) { bool first = true; @@ -782,19 +679,16 @@ static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) first = false; } - flag2str(invalid); - flag2str(deactive); - flag2str(replayable); - flag2str(delayed_recovery); - flag2str(no_lock_replay); - flag2str(vbr_failed); - flag2str(pingable); - flag2str(resend_replay); - flag2str(no_pinger_recover); - flag2str(need_mne_swab); - flag2str(connect_tried); + flag2str(imp, invalid); + flag2str(imp, deactive); + flag2str(imp, replayable); + flag2str(imp, delayed_recovery); + flag2str(imp, vbr_failed); + flag2str(imp, pingable); + flag2str(imp, resend_replay); + flag2str(imp, no_pinger_recover); + flag2str(imp, connect_tried); } -#undef flag2str static const char *obd_connect_names[] = { /* flags names */ @@ -858,17 +752,43 @@ static const char *obd_connect_names[] = { "multi_mod_rpcs", "dir_stripe", "subtree", - "lock_ahead", + "lockahead", "bulk_mbits", "compact_obdo", "second_flags", /* flags2 names */ - "file_secctx", + "file_secctx", /* 0x01 */ + "lockaheadv2", /* 0x02 */ + "dir_migrate", /* 0x04 */ + "sum_statfs", /* 0x08 */ + "overstriping", /* 0x10 */ + "flr", /* 0x20 */ + "wbc", /* 0x40 */ + "lock_convert", /* 0x80 */ + "archive_id_array", /* 0x100 */ + "increasing_xid", /* 0x200 */ + "selinux_policy", /* 0x400 */ + "lsom", /* 0x800 */ + "pcc", /* 0x1000 */ + "unknown", /* 0x2000 */ + "async_discard", /* 0x4000 */ + "client_encryption", /* 0x8000 */ + "fidmap", /* 0x10000 */ + "getattr_pfid", /* 0x20000 */ + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", + "mdll", /* 0x1000000000000000 */ + "mdll_auto_refresh", /* 0x2000000000000000 */ + "", "", NULL }; -static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, - __u64 flags2, const char *sep) +void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, __u64 flags2, + const char *sep) { bool first = true; __u64 mask; @@ -905,6 +825,7 @@ static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, first = false; } } +EXPORT_SYMBOL(obd_connect_seq_flags2str); int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, const char *sep) @@ -941,8 +862,8 @@ int obd_connect_flags2str(char *page, int count, __u64 flags, __u64 flags2, } EXPORT_SYMBOL(obd_connect_flags2str); -static void obd_connect_data_seqprint(struct seq_file *m, - struct obd_connect_data *ocd) +void +obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) { __u64 flags; @@ -998,16 +919,16 @@ static void obd_connect_data_seqprint(struct seq_file *m, int lprocfs_import_seq_show(struct seq_file *m, void *data) { - char nidstr[LNET_NIDSTR_SIZE]; - struct lprocfs_counter ret; - struct 
lprocfs_counter_header *header; - struct obd_device *obd = (struct obd_device *)data; - struct obd_import *imp; - struct obd_import_conn *conn; - struct obd_connect_data *ocd; - int j; - int k; - int rw = 0; + char nidstr[LNET_NIDSTR_SIZE]; + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + struct obd_connect_data *ocd; + int j; + int k; + int rw = 0; LASSERT(obd != NULL); LPROCFS_CLIMP_CHECK(obd); @@ -1041,7 +962,7 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) seq_printf(m, "%s%s", j ? ", " : "", nidstr); j++; } - if (imp->imp_connection != NULL) + if (imp->imp_connection) libcfs_nid2str_r(imp->imp_connection->c_peer.nid, nidstr, sizeof(nidstr)); else @@ -1050,14 +971,16 @@ int lprocfs_import_seq_show(struct seq_file *m, void *data) " current_connection: %s\n" " connection_attempts: %u\n" " generation: %u\n" - " in-progress_invalidations: %u\n", + " in-progress_invalidations: %u\n" + " idle: %lld sec\n", nidstr, imp->imp_conn_cnt, imp->imp_generation, - atomic_read(&imp->imp_inval_count)); + atomic_read(&imp->imp_inval_count), + ktime_get_real_seconds() - imp->imp_last_reply_time); spin_unlock(&imp->imp_lock); - if (obd->obd_svc_stats == NULL) + if (!obd->obd_svc_stats) goto out_climp; header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; @@ -1239,14 +1162,83 @@ int lprocfs_connect_flags_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_connect_flags_seq_show); -int -lprocfs_obd_setup(struct obd_device *obd) +static const struct attribute *obd_def_uuid_attrs[] = { + &lustre_attr_uuid.attr, + NULL, +}; + +static const struct attribute *obd_def_attrs[] = { + &lustre_attr_blocksize.attr, + &lustre_attr_kbytestotal.attr, + &lustre_attr_kbytesfree.attr, + &lustre_attr_kbytesavail.attr, + &lustre_attr_filestotal.attr, + &lustre_attr_filesfree.attr, + &lustre_attr_uuid.attr, + NULL, +}; + +static void obd_sysfs_release(struct kobject *kobj) { - int rc = 0; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - LASSERT(obd->obd_type->typ_procroot != NULL); + complete(&obd->obd_kobj_unregister); +} + +int lprocfs_obd_setup(struct obd_device *obd, bool uuid_only) +{ + struct ldebugfs_vars *debugfs_vars = NULL; + int rc; + + if (!obd || obd->obd_magic != OBD_DEVICE_MAGIC) + return -ENODEV; + + rc = kobject_set_name(&obd->obd_kset.kobj, "%s", obd->obd_name); + if (rc) + return rc; + + obd->obd_ktype.sysfs_ops = &lustre_sysfs_ops; + obd->obd_ktype.release = obd_sysfs_release; + + obd->obd_kset.kobj.parent = obd->obd_type->typ_kobj; + obd->obd_kset.kobj.ktype = &obd->obd_ktype; + init_completion(&obd->obd_kobj_unregister); + rc = kset_register(&obd->obd_kset); + if (rc) + return rc; + + if (uuid_only) + obd->obd_attrs = obd_def_uuid_attrs; + else + obd->obd_attrs = obd_def_attrs; + + rc = sysfs_create_files(&obd->obd_kset.kobj, obd->obd_attrs); + if (rc) { + kset_unregister(&obd->obd_kset); + return rc; + } + + if (!obd->obd_type->typ_procroot) + debugfs_vars = obd->obd_debugfs_vars; + obd->obd_debugfs_entry = ldebugfs_register(obd->obd_name, + obd->obd_type->typ_debugfs_entry, + debugfs_vars, obd); + if (IS_ERR_OR_NULL(obd->obd_debugfs_entry)) { + rc = obd->obd_debugfs_entry ? 
PTR_ERR(obd->obd_debugfs_entry) + : -ENOMEM; + CERROR("error %d setting up debugfs for %s\n", + rc, obd->obd_name); + obd->obd_debugfs_entry = NULL; + + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; + } + + if (obd->obd_proc_entry || !obd->obd_type->typ_procroot) + GOTO(already_registered, rc); obd->obd_proc_entry = lprocfs_register(obd->obd_name, obd->obd_type->typ_procroot, @@ -1255,42 +1247,66 @@ lprocfs_obd_setup(struct obd_device *obd) rc = PTR_ERR(obd->obd_proc_entry); CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); obd->obd_proc_entry = NULL; + + ldebugfs_remove(&obd->obd_debugfs_entry); + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + kset_unregister(&obd->obd_kset); + return rc; } +already_registered: return rc; } EXPORT_SYMBOL(lprocfs_obd_setup); int lprocfs_obd_cleanup(struct obd_device *obd) { - if (!obd) - return -EINVAL; - if (obd->obd_proc_exports_entry) { - /* Should be no exports left */ - lprocfs_remove(&obd->obd_proc_exports_entry); - obd->obd_proc_exports_entry = NULL; - } - if (obd->obd_proc_entry) { - lprocfs_remove(&obd->obd_proc_entry); - obd->obd_proc_entry = NULL; - } - return 0; + if (!obd) + return -EINVAL; + + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + + if (!IS_ERR_OR_NULL(obd->obd_debugfs_entry)) + ldebugfs_remove(&obd->obd_debugfs_entry); + + /* obd device never allocated a kset */ + if (!obd->obd_kset.kobj.state_initialized) + return 0; + + if (obd->obd_attrs) { + sysfs_remove_files(&obd->obd_kset.kobj, obd->obd_attrs); + obd->obd_attrs = NULL; + } + + kset_unregister(&obd->obd_kset); + wait_for_completion(&obd->obd_kobj_unregister); + return 0; } EXPORT_SYMBOL(lprocfs_obd_cleanup); int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) { - struct lprocfs_counter *cntr; - unsigned int percpusize; - int rc = -ENOMEM; - unsigned long flags = 0; - int i; + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; LASSERT(stats->ls_percpu[cpuid] == NULL); LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); percpusize = lprocfs_stats_counter_size(stats); LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); - if (stats->ls_percpu[cpuid] != NULL) { + if (stats->ls_percpu[cpuid]) { rc = 0; if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) @@ -1317,16 +1333,16 @@ int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) { - struct lprocfs_stats *stats; - unsigned int num_entry; - unsigned int percpusize = 0; - int i; + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; - if (num == 0) - return NULL; + if (num == 0) + return NULL; - if (lprocfs_no_percpu_stats != 0) - flags |= LPROCFS_STATS_FLAG_NOPERCPU; + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; if (flags & LPROCFS_STATS_FLAG_NOPERCPU) num_entry = 1; @@ -1335,7 +1351,7 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, /* alloc percpu pointers for all possible cpu slots */ LIBCFS_ALLOC(stats, offsetof(typeof(*stats), 
ls_percpu[num_entry])); - if (stats == NULL) + if (!stats) return NULL; stats->ls_num = num; @@ -1345,14 +1361,14 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, /* alloc num of counter headers */ LIBCFS_ALLOC(stats->ls_cnt_header, stats->ls_num * sizeof(struct lprocfs_counter_header)); - if (stats->ls_cnt_header == NULL) + if (!stats->ls_cnt_header) goto fail; if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { /* contains only one set counters */ percpusize = lprocfs_stats_counter_size(stats); LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); - if (stats->ls_percpu[0] == NULL) + if (!stats->ls_percpu[0]) goto fail; stats->ls_biggest_alloc_num = 1; } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { @@ -1377,9 +1393,9 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) unsigned int percpusize; unsigned int i; - if (stats == NULL || stats->ls_num == 0) - return; - *statsh = NULL; + if (!stats || stats->ls_num == 0) + return; + *statsh = NULL; if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) num_entry = 1; @@ -1388,9 +1404,9 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) percpusize = lprocfs_stats_counter_size(stats); for (i = 0; i < num_entry; i++) - if (stats->ls_percpu[i] != NULL) + if (stats->ls_percpu[i]) LIBCFS_FREE(stats->ls_percpu[i], percpusize); - if (stats->ls_cnt_header != NULL) + if (stats->ls_cnt_header) LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * sizeof(struct lprocfs_counter_header)); LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); @@ -1425,16 +1441,16 @@ EXPORT_SYMBOL(lprocfs_stats_collector); void lprocfs_clear_stats(struct lprocfs_stats *stats) { - struct lprocfs_counter *percpu_cntr; - int i; - int j; - unsigned int num_entry; - unsigned long flags = 0; + struct lprocfs_counter *percpu_cntr; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_entry; i++) { - if (stats->ls_percpu[i] == NULL) + if (!stats->ls_percpu[i]) continue; for (j = 0; j < stats->ls_num; j++) { percpu_cntr = lprocfs_stats_counter_get(stats, i, j); @@ -1456,12 +1472,12 @@ static ssize_t lprocfs_stats_seq_write(struct file *file, const char __user *buf, size_t len, loff_t *off) { - struct seq_file *seq = file->private_data; - struct lprocfs_stats *stats = seq->private; + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; - lprocfs_clear_stats(stats); + lprocfs_clear_stats(stats); - return len; + return len; } static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) @@ -1485,10 +1501,10 @@ static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) /* seq file export of one lprocfs counter */ static int lprocfs_stats_seq_show(struct seq_file *p, void *v) { - struct lprocfs_stats *stats = p->private; - struct lprocfs_counter_header *hdr; - struct lprocfs_counter ctr; - int idx = *(loff_t *)v; + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; if (idx == 0) { struct timespec64 now; @@ -1537,10 +1553,20 @@ static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) if (rc) return rc; seq = file->private_data; - seq->private = inode->i_private ? : PDE_DATA(inode); + seq->private = inode->i_private ? 
inode->i_private : PDE_DATA(inode); return 0; } +const struct file_operations ldebugfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +EXPORT_SYMBOL(ldebugfs_stats_seq_fops); + static const struct proc_ops lprocfs_stats_seq_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lprocfs_stats_seq_open, @@ -1550,15 +1576,6 @@ static const struct proc_ops lprocfs_stats_seq_fops = { .proc_release = lprocfs_seq_release, }; -static const struct file_operations ldebugfs_stats_seq_fops = { - .owner = THIS_MODULE, - .open = lprocfs_stats_seq_open, - .read = seq_read, - .write = lprocfs_stats_seq_write, - .llseek = seq_lseek, - .release = lprocfs_seq_release, -}; - int ldebugfs_register_stats(struct dentry *parent, const char *name, struct lprocfs_stats *stats) { @@ -1583,7 +1600,7 @@ int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, entry = proc_create_data(name, 0644, root, &lprocfs_stats_seq_fops, stats); - if (entry == NULL) + if (!entry) return -ENOMEM; return 0; } @@ -1592,11 +1609,11 @@ EXPORT_SYMBOL(lprocfs_register_stats); void lprocfs_counter_init(struct lprocfs_stats *stats, int index, unsigned conf, const char *name, const char *units) { - struct lprocfs_counter_header *header; - struct lprocfs_counter *percpu_cntr; - unsigned long flags = 0; - unsigned int i; - unsigned int num_cpu; + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; LASSERT(stats != NULL); @@ -1610,7 +1627,7 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_cpu; ++i) { - if (stats->ls_percpu[i] == NULL) + if (!stats->ls_percpu[i]) continue; percpu_cntr = lprocfs_stats_counter_get(stats, i, index); percpu_cntr->lc_count = 0; @@ -1625,49 +1642,23 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, } EXPORT_SYMBOL(lprocfs_counter_init); -/* Note that we only init md counters for ops whose offset is less - * than NUM_MD_STATS. This is explained in a comment in the definition - * of struct md_ops. 
*/ -#define LPROCFS_MD_OP_INIT(base, stats, op) \ - do { \ - unsigned int _idx = base + MD_COUNTER_OFFSET(op); \ - \ - if (MD_COUNTER_OFFSET(op) < NUM_MD_STATS) { \ - LASSERT(_idx < stats->ls_num); \ - lprocfs_counter_init(stats, _idx, 0, #op, "reqs"); \ - } \ - } while (0) - -void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) -{ - LPROCFS_MD_OP_INIT(num_private_stats, stats, get_root); - LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); - LPROCFS_MD_OP_INIT(num_private_stats, stats, close); - LPROCFS_MD_OP_INIT(num_private_stats, stats, create); - LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); - LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); - LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); - LPROCFS_MD_OP_INIT(num_private_stats, stats, link); - LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); - LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, fsync); - LPROCFS_MD_OP_INIT(num_private_stats, stats, read_page); - LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); - LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); - LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); - LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); - LPROCFS_MD_OP_INIT(num_private_stats, stats, merge_attr); - LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); - LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); - LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); - LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); - LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); - LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); - LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); -} +static const char * const mps_stats[] = { + [LPROC_MD_CLOSE] = "close", + [LPROC_MD_CREATE] = "create", + [LPROC_MD_ENQUEUE] = "enqueue", + [LPROC_MD_GETATTR] = "getattr", + [LPROC_MD_INTENT_LOCK] = "intent_lock", + [LPROC_MD_LINK] = "link", + [LPROC_MD_RENAME] = "rename", + [LPROC_MD_SETATTR] = "setattr", + [LPROC_MD_FSYNC] = "fsync", + [LPROC_MD_READ_PAGE] = "read_page", + [LPROC_MD_UNLINK] = "unlink", + [LPROC_MD_SETXATTR] = "setxattr", + [LPROC_MD_GETXATTR] = "getxattr", + [LPROC_MD_INTENT_GETATTR_ASYNC] = "intent_getattr_async", + [LPROC_MD_REVALIDATE_LOCK] = "revalidate_lock", +}; int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_private_stats) @@ -1676,11 +1667,8 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_stats; int rc, i; - CLASSERT(offsetof(struct md_ops, MD_STATS_FIRST_OP) == 0); - CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_FIRST_OP) == 0); - CLASSERT(_MD_COUNTER_OFFSET(MD_STATS_LAST_OP) > 0); - - /* TODO Ensure that this function is only used where + /* + * TODO Ensure that this function is only used where * appropriate by adding an assertion to the effect that * obd->obd_type->typ_md_ops is not NULL. 
We can't do this now * because mdt_procfs_init() uses this function to allocate @@ -1690,20 +1678,17 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, */ LASSERT(obd->obd_proc_entry != NULL); LASSERT(obd->obd_md_stats == NULL); - LASSERT(obd->obd_md_cntr_base == 0); - num_stats = NUM_MD_STATS + num_private_stats; + num_stats = ARRAY_SIZE(mps_stats) + num_private_stats; stats = lprocfs_alloc_stats(num_stats, 0); - if (stats == NULL) + if (!stats) return -ENOMEM; - lprocfs_init_mps_stats(num_private_stats, stats); - - for (i = num_private_stats; i < num_stats; i++) { - if (stats->ls_cnt_header[i].lc_name == NULL) { - CERROR("Missing md_stat initializer md_op " - "operation at offset %d. Aborting.\n", - i - num_private_stats); + for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { + lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); + if (!stats->ls_cnt_header[i].lc_name) { + CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", + i); LBUG(); } } @@ -1713,7 +1698,6 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, lprocfs_free_stats(&stats); } else { obd->obd_md_stats = stats; - obd->obd_md_cntr_base = num_private_stats; } return rc; @@ -1724,9 +1708,8 @@ void lprocfs_free_md_stats(struct obd_device *obd) { struct lprocfs_stats *stats = obd->obd_md_stats; - if (stats != NULL) { + if (stats) { obd->obd_md_stats = NULL; - obd->obd_md_cntr_base = 0; lprocfs_free_stats(&stats); } } @@ -1734,24 +1717,24 @@ EXPORT_SYMBOL(lprocfs_free_md_stats); void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { - lprocfs_counter_init(ldlm_stats, - LDLM_ENQUEUE - LDLM_FIRST_OPC, - 0, "ldlm_enqueue", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC, - 0, "ldlm_convert", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC, - 0, "ldlm_cancel", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_BL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_bl_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CP_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_cp_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_GL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_gl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); } EXPORT_SYMBOL(lprocfs_init_ldlm_stats); @@ -1762,7 +1745,7 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, { __s64 ret = 0; - if (lc == NULL || header == NULL) + if (!lc || !header) RETURN(0); switch (field) { @@ -1796,86 +1779,6 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, } EXPORT_SYMBOL(lprocfs_read_helper); -int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, - int mult) -{ - long decimal_val, frac_val; - int prtn; - - if (count < 10) - return -EINVAL; - - decimal_val = val / mult; - prtn = snprintf(buffer, count, "%ld", decimal_val); - frac_val = val % mult; - - if (prtn < (count - 4) && frac_val > 0) { - long temp_frac; - int i, temp_mult = 1, frac_bits = 0; - - temp_frac = frac_val * 10; - buffer[prtn++] = '.'; - 
while (frac_bits < 2 && (temp_frac / mult) < 1 ) { - /* only reserved 2 bits fraction */ - buffer[prtn++] ='0'; - temp_frac *= 10; - frac_bits++; - } - /* - * Need to think these cases : - * 1. #echo x.00 > /proc/xxx output result : x - * 2. #echo x.0x > /proc/xxx output result : x.0x - * 3. #echo x.x0 > /proc/xxx output result : x.x - * 4. #echo x.xx > /proc/xxx output result : x.xx - * Only reserved 2 bits fraction. - */ - for (i = 0; i < (5 - prtn); i++) - temp_mult *= 10; - - frac_bits = min((int)count - prtn, 3 - frac_bits); - prtn += snprintf(buffer + prtn, frac_bits, "%ld", - frac_val * temp_mult / mult); - - prtn--; - while(buffer[prtn] < '1' || buffer[prtn] > '9') { - prtn--; - if (buffer[prtn] == '.') { - prtn--; - break; - } - } - prtn++; - } - buffer[prtn++] ='\n'; - return prtn; -} -EXPORT_SYMBOL(lprocfs_read_frac_helper); - -int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) -{ - long decimal_val, frac_val; - - decimal_val = val / mult; - seq_printf(m, "%ld", decimal_val); - frac_val = val % mult; - - if (frac_val > 0) { - frac_val *= 100; - frac_val /= mult; - } - if (frac_val > 0) { - /* Three cases: x0, xx, 0x */ - if ((frac_val % 10) != 0) - seq_printf(m, ".%ld", frac_val); - else - seq_printf(m, ".%ld", frac_val / 10); - } - - seq_printf(m, "\n"); - return 0; -} -EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); - /* Obtains the conversion factor for the unit specified */ static int get_mult(char unit, __u64 *mult) { @@ -1886,19 +1789,19 @@ static int get_mult(char unit, __u64 *mult) case 'p': case 'P': units <<= 10; - /* Fall through */ + fallthrough; case 't': case 'T': units <<= 10; - /* Fall through */ + fallthrough; case 'g': case 'G': units <<= 10; - /* Fall through */ + fallthrough; case 'm': case 'M': units <<= 10; - /* Fall through */ + fallthrough; case 'k': case 'K': units <<= 10; @@ -2043,7 +1946,7 @@ static int str_to_u64_parse(char *buffer, unsigned long count, } /* the multiplier limits how large the value can be */ - wrap_indicator /= mult; + wrap_indicator = div64_u64(wrap_indicator, mult); if (strwhole) { rc = kstrtoull(strwhole, base, &whole); @@ -2094,8 +1997,7 @@ static int str_to_u64_parse(char *buffer, unsigned long count, * of the signed integer. */ static int str_to_s64_internal(const char __user *buffer, unsigned long count, - __s64 *val, __u64 def_mult, bool allow_units, - bool kernel_space) + __s64 *val, __u64 def_mult, bool allow_units) { char kernbuf[22]; __u64 tmp; @@ -2107,12 +2009,8 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (kernel_space) { - memcpy(kernbuf, buffer, count); - } else { - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - } + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; kernbuf[count] = '\0'; @@ -2138,29 +2036,6 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, return 0; } -/** - * Convert a user string into a signed 64 bit number. This function produces - * an error when the value parsed from the string underflows or - * overflows. This function accepts strings which contain digits and - * optionally a decimal or hex strings which are prefixed with "0x". 
- * - * \param[in] buffer string consisting of numbers and optionally a decimal - * \param[in] count buffer length - * \param[in] val if successful, the value represented by the string - * - * \retval 0 on success - * \retval negative number on error - */ -int lprocfs_str_to_s64(struct file *file, const char __user *buffer, - unsigned long count, __s64 *val) -{ - bool kernel_space; - - kernel_space = lprocfs_file_is_kernel(file); - return str_to_s64_internal(buffer, count, val, 1, false, kernel_space); -} -EXPORT_SYMBOL(lprocfs_str_to_s64); - /** * Convert a user string into a signed 64 bit number. This function produces * an error when the value parsed from the string times multiplier underflows or @@ -2178,12 +2053,11 @@ EXPORT_SYMBOL(lprocfs_str_to_s64); * \retval 0 on success * \retval negative number on error */ -int lprocfs_str_with_units_to_s64(struct file *file, const char __user *buffer, +int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit) { __u64 mult = 1; int rc; - bool kernel_space; if (defunit != '1') { rc = get_mult(defunit, &mult); @@ -2191,10 +2065,7 @@ int lprocfs_str_with_units_to_s64(struct file *file, const char __user *buffer, return rc; } - kernel_space = lprocfs_file_is_kernel(file); - - return str_to_s64_internal(buffer, count, val, mult, true, - kernel_space); + return str_to_s64_internal(buffer, count, val, mult, true); } EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); @@ -2228,7 +2099,7 @@ char *lprocfs_find_named_value(const char *buffer, const char *name, /* there is no strnstr() in rhel5 and ubuntu kernels */ val = lprocfs_strnstr(buffer, name, buflen); - if (val == NULL) + if (!val) return (char *)buffer; val += strlen(name); /* skip prefix */ @@ -2275,7 +2146,7 @@ int lprocfs_seq_create(struct proc_dir_entry *parent, entry = proc_create_data(name, mode, parent, seq_fops, data); - if (entry == NULL) + if (!entry) RETURN(-ENOMEM); RETURN(0); @@ -2317,12 +2188,12 @@ EXPORT_SYMBOL(lprocfs_oh_tally_log2); unsigned long lprocfs_oh_sum(struct obd_histogram *oh) { - unsigned long ret = 0; - int i; + unsigned long ret = 0; + int i; - for (i = 0; i < OBD_HIST_MAX; i++) - ret += oh->oh_buckets[i]; - return ret; + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; } EXPORT_SYMBOL(lprocfs_oh_sum); @@ -2379,9 +2250,9 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, struct client_obd *cli = &dev->u.cli; struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; int chunk_mask, rc; - __s64 val; + s64 val; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); if (rc) return rc; if (val < 0) @@ -2411,9 +2282,59 @@ ssize_t lprocfs_obd_max_pages_per_rpc_seq_write(struct file *file, } EXPORT_SYMBOL(lprocfs_obd_max_pages_per_rpc_seq_write); -int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, - unsigned long count, struct root_squash_info *squash, - char *name) +ssize_t short_io_bytes_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + rc = sprintf(buf, "%d\n", cli->cl_max_short_io_bytes); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_show); + +/* Used to catch people who think they're specifying pages. 
*/ +#define MIN_SHORT_IO_BYTES 64U + +ssize_t short_io_bytes_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + u32 val; + int rc; + + LPROCFS_CLIMP_CHECK(dev); + + rc = kstrtouint(buffer, 0, &val); + if (rc) + GOTO(out, rc); + + if (val && (val < MIN_SHORT_IO_BYTES || val > OBD_MAX_SHORT_IO_BYTES)) + GOTO(out, rc = -ERANGE); + + rc = count; + + spin_lock(&cli->cl_loi_list_lock); + if (val > (cli->cl_max_pages_per_rpc << PAGE_SHIFT)) + rc = -ERANGE; + else + cli->cl_max_short_io_bytes = val; + spin_unlock(&cli->cl_loi_list_lock); + +out: + LPROCFS_CLIMP_EXIT(dev); + return rc; +} +EXPORT_SYMBOL(short_io_bytes_store); + +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) { int rc; char kernbuf[64], *tmp, *errmsg; @@ -2424,7 +2345,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, errmsg = "string too long"; GOTO(failed_noprint, rc = -EINVAL); } - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { + if (copy_from_user(kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed_noprint, rc = -EFAULT); } @@ -2432,7 +2353,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, /* look for uid gid separator */ tmp = strchr(kernbuf, ':'); - if (tmp == NULL) { + if (!tmp) { errmsg = "needs uid:gid format"; GOTO(failed, rc = -EINVAL); } @@ -2459,7 +2380,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, RETURN(count); failed: - if (tmp != NULL) { + if (tmp) { tmp--; *tmp = ':'; } @@ -2474,8 +2395,7 @@ int lprocfs_wr_root_squash(struct file *file, const char __user *buffer, EXPORT_SYMBOL(lprocfs_wr_root_squash); -int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, - unsigned long count, +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, struct root_squash_info *squash, char *name) { int rc; @@ -2491,11 +2411,11 @@ int lprocfs_wr_nosquash_nids(struct file *file, const char __user *buffer, } OBD_ALLOC(kernbuf, count + 1); - if (kernbuf == NULL) { + if (!kernbuf) { errmsg = "no memory"; GOTO(failed, rc = -ENOMEM); } - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) { + if (copy_from_user(kernbuf, buffer, count)) { errmsg = "bad address"; GOTO(failed, rc = -EFAULT); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c index 6d78831dd37fe..4df66a941e535 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status_server.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, 2016, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,12 +34,57 @@ #define DEBUG_SUBSYSTEM S_CLASS +#include +#include #include #include -#include #include +#define MAX_STRING_SIZE 128 + +struct dentry *ldebugfs_add_symlink(const char *name, const char *target, + const char *format, ...) 
+{ + struct dentry *entry = NULL; + struct dentry *parent; + struct qstr dname; + va_list ap; + char *dest; + + if (!target || !format) + return NULL; + + dname.name = target; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, + dname.name, dname.len); + parent = d_lookup(debugfs_lustre_root, &dname); + if (!parent) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (!dest) + goto no_entry; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = debugfs_create_symlink(name, parent, dest); + if (IS_ERR_OR_NULL(entry)) { + CERROR("LdebugFS: Could not create symbolic link from %s to %s\n", + name, dest); + entry = NULL; + } + + OBD_FREE(dest, MAX_STRING_SIZE + 1); +no_entry: + dput(parent); + return entry; +} +EXPORT_SYMBOL(ldebugfs_add_symlink); + #ifdef CONFIG_PROC_FS int lprocfs_evict_client_open(struct inode *inode, struct file *f) @@ -79,7 +124,7 @@ lprocfs_evict_client_seq_write(struct file *file, const char __user *buffer, * bytes into kbuf, to ensure that the string is NUL-terminated. * UUID_MAX should include a trailing NUL already. */ - if (lprocfs_copy_from_user(file, kbuf, buffer, + if (copy_from_user(kbuf, buffer, min_t(unsigned long, BUFLEN - 1, count))) { count = -EFAULT; goto out; @@ -104,15 +149,108 @@ EXPORT_SYMBOL(lprocfs_evict_client_seq_write); #undef BUFLEN -int lprocfs_num_exports_seq_show(struct seq_file *m, void *data) +ssize_t num_exports_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = data; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(num_exports_show); + +static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) +{ + bool first = true; + + flag2str(exp, failed); + flag2str(exp, in_recovery); + flag2str(exp, disconnected); + flag2str(exp, connecting); - LASSERT(obd != NULL); - seq_printf(m, "%u\n", obd->obd_num_exports); return 0; } -EXPORT_SYMBOL(lprocfs_num_exports_seq_show); + +static int +lprocfs_exp_print_export_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) +{ + struct seq_file *m = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct obd_device *obd; + struct obd_connect_data *ocd; + + LASSERT(exp != NULL); + if (exp->exp_nid_stats == NULL) + goto out; + obd = exp->exp_obd; + ocd = &exp->exp_connect_data; + + seq_printf(m, "%s:\n" + " name: %s\n" + " client: %s\n" + " connect_flags: [ ", + obd_uuid2str(&exp->exp_client_uuid), + obd->obd_name, + obd_export_nid2str(exp)); + obd_connect_seq_flags2str(m, ocd->ocd_connect_flags, + ocd->ocd_connect_flags2, ", "); + seq_printf(m, " ]\n"); + obd_connect_data_seqprint(m, ocd); + seq_printf(m, " export_flags: [ "); + obd_export_flags2str(exp, m); + seq_printf(m, " ]\n"); + + if (obd->obd_type && + strcmp(obd->obd_type->typ_name, "obdfilter") == 0) { + struct filter_export_data *fed = &exp->exp_filter_data; + + seq_printf(m, " grant:\n"); + seq_printf(m, " granted: %ld\n", + fed->fed_ted.ted_grant); + seq_printf(m, " dirty: %ld\n", + fed->fed_ted.ted_dirty); + seq_printf(m, " pending: %ld\n", + fed->fed_ted.ted_pending); + } + +out: + return 0; +} + +/** + * RPC connections are composed of an import and an export. Using the + * lctl utility we can extract important information about the state. 
+ * The lprocfs_exp_export_seq_show routine displays the state information + * for the export. + * + * \param[in] m seq file + * \param[in] data unused + * + * \retval 0 on success + * + * The format of the export state information is like: + * a793e354-49c0-aa11-8c4f-a4f2b1a1a92b: + * name: MGS + * client: 10.211.55.10@tcp + * connect_flags: [ version, barrier, adaptive_timeouts, ... ] + * connect_data: + * flags: 0x2000011005002020 + * instance: 0 + * target_version: 2.10.51.0 + * export_flags: [ ... ] + * + */ +static int lprocfs_exp_export_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_export_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_export); static void lprocfs_free_client_stats(struct nid_stat *client_stat) { @@ -259,6 +397,30 @@ int lprocfs_exp_replydata_seq_show(struct seq_file *m, void *data) } LPROC_SEQ_FOPS_RO(lprocfs_exp_replydata); +int lprocfs_exp_print_fmd_count_seq(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = cb_data; + struct tg_export_data *ted = &exp->exp_target_data; + + seq_printf(m, "%d\n", ted->ted_fmd_count); + + return 0; +} + +int lprocfs_exp_fmd_count_seq_show(struct seq_file *m, void *data) +{ + struct nid_stat *stats = m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_fmd_count_seq, m); + return 0; +} +LPROC_SEQ_FOPS_RO(lprocfs_exp_fmd_count); + int lprocfs_nid_stats_clear_seq_show(struct seq_file *m, void *data) { seq_puts(m, "Write into this file to clear all nid stats and stale nid entries\n"); @@ -384,7 +546,8 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_nodemap_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("Error adding the nodemap file: rc = %d\n", rc); + CWARN("%s: error adding the nodemap file: rc = %d\n", + obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -392,7 +555,8 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_uuid_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("Error adding the NID stats file: rc = %d\n", rc); + CWARN("%s: error adding the NID stats file: rc = %d\n", + obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -400,7 +564,17 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_hash_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("Error adding the hash file: rc = %d\n", rc); + CWARN("%s: error adding the hash file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "export", + new_stat, &lprocfs_exp_export_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the export file: rc = %d\n", + obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -408,7 +582,16 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid) &lprocfs_exp_replydata_fops); if (IS_ERR(entry)) { rc = PTR_ERR(entry); - CWARN("%s: Error adding the reply_data file: rc = %d\n", + CWARN("%s: error adding the reply_data file: rc = %d\n", + obd->obd_name, rc); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "fmd_count", new_stat, + &lprocfs_exp_fmd_count_fops); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CWARN("%s: error adding the fmd_count file: rc 
= %d\n", obd->obd_name, rc); GOTO(destroy_new_ns, rc); } @@ -449,92 +632,24 @@ int lprocfs_exp_cleanup(struct obd_export *exp) return 0; } -#define LPROCFS_OBD_OP_INIT(base, stats, op) \ -do { \ - unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ - LASSERT(coffset < stats->ls_num); \ - lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ -} while (0) - -void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) -{ - LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); - - CLASSERT(NUM_OBD_STATS == OBD_COUNTER_OFFSET(putref) + 1); -} -EXPORT_SYMBOL(lprocfs_init_ops_stats); - -int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned int num_stats) { struct lprocfs_stats *stats; - unsigned int num_stats; - int rc, i; + int rc; LASSERT(obd->obd_stats == NULL); LASSERT(obd->obd_proc_entry != NULL); - LASSERT(obd->obd_cntr_base == 0); - num_stats = NUM_OBD_STATS + num_private_stats; stats = lprocfs_alloc_stats(num_stats, 0); if (stats == NULL) return -ENOMEM; - lprocfs_init_ops_stats(num_private_stats, stats); - - for (i = num_private_stats; i < num_stats; i++) { - /* If this LBUGs, it is likely that an obd - * operation was added to struct obd_ops in - * , and that the corresponding line item - * LPROCFS_OBD_OP_INIT(.., 
.., opname) - * is missing from the list above. */ - LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, - "Missing obd_stat initializer obd_op " - "operation at offset %d.\n", i - num_private_stats); - } rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); - if (rc < 0) { + if (rc < 0) lprocfs_free_stats(&stats); - } else { - obd->obd_stats = stats; - obd->obd_cntr_base = num_private_stats; - } + else + obd->obd_stats = stats; + return rc; } EXPORT_SYMBOL(lprocfs_alloc_obd_stats); @@ -569,7 +684,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) LASSERT(obd != NULL); seq_printf(m, "status: "); - if (obd->obd_max_recoverable_clients == 0) { + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { seq_printf(m, "INACTIVE\n"); goto out; } @@ -585,9 +700,9 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds() - obd->obd_recovery_start); /* Number of clients that have completed recovery */ seq_printf(m, "completed_clients: %d/%d\n", - obd->obd_max_recoverable_clients - + atomic_read(&obd->obd_max_recoverable_clients) - obd->obd_stale_clients, - obd->obd_max_recoverable_clients); + atomic_read(&obd->obd_max_recoverable_clients)); seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); seq_printf(m, "last_transno: %lld\n", @@ -643,7 +758,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds())); seq_printf(m, "connected_clients: %d/%d\n", atomic_read(&obd->obd_connected_clients), - obd->obd_max_recoverable_clients); + atomic_read(&obd->obd_max_recoverable_clients)); /* Number of clients that have completed recovery */ seq_printf(m, "req_replay_clients: %d\n", atomic_read(&obd->obd_req_replay_clients)); @@ -663,27 +778,25 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) } EXPORT_SYMBOL(lprocfs_recovery_status_seq_show); -int lprocfs_ir_factor_seq_show(struct seq_file *m, void *data) +ssize_t ir_factor_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - seq_printf(m, "%d\n", obd->obd_recovery_ir_factor); - return 0; + return scnprintf(buf, PAGE_SIZE, "%d\n", obd->obd_recovery_ir_factor); } -EXPORT_SYMBOL(lprocfs_ir_factor_seq_show); +EXPORT_SYMBOL(ir_factor_show); -ssize_t -lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ssize_t ir_factor_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + int val; int rc; - __s64 val; - LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoint(buffer, 10, &val); if (rc) return rc; @@ -693,7 +806,7 @@ lprocfs_ir_factor_seq_write(struct file *file, const char __user *buffer, obd->obd_recovery_ir_factor = val; return count; } -EXPORT_SYMBOL(lprocfs_ir_factor_seq_write); +EXPORT_SYMBOL(ir_factor_store); int lprocfs_checksum_dump_seq_show(struct seq_file *m, void *data) { @@ -711,93 +824,85 @@ lprocfs_checksum_dump_seq_write(struct file *file, const char __user *buffer, { struct seq_file *m = file->private_data; struct obd_device *obd = m->private; + bool val; int rc; - __s64 val; LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc 
= kstrtobool_from_user(buffer, count, &val); if (rc) return rc; - obd->obd_checksum_dump = !!val; + obd->obd_checksum_dump = val; return count; } EXPORT_SYMBOL(lprocfs_checksum_dump_seq_write); -int lprocfs_recovery_time_soft_seq_show(struct seq_file *m, void *data) +ssize_t recovery_time_soft_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - seq_printf(m, "%llu\n", obd->obd_recovery_timeout); - return 0; + return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_timeout); } -EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_show); +EXPORT_SYMBOL(recovery_time_soft_show); -ssize_t -lprocfs_recovery_time_soft_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ssize_t recovery_time_soft_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; obd->obd_recovery_timeout = val; return count; } -EXPORT_SYMBOL(lprocfs_recovery_time_soft_seq_write); +EXPORT_SYMBOL(recovery_time_soft_store); -int lprocfs_recovery_time_hard_seq_show(struct seq_file *m, void *data) +ssize_t recovery_time_hard_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - LASSERT(obd != NULL); - seq_printf(m, "%lld\n", obd->obd_recovery_time_hard); - return 0; + return scnprintf(buf, PAGE_SIZE, "%ld\n", obd->obd_recovery_time_hard); } -EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_show); +EXPORT_SYMBOL(recovery_time_hard_show); -ssize_t -lprocfs_recovery_time_hard_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ssize_t recovery_time_hard_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - LASSERT(obd != NULL); - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; obd->obd_recovery_time_hard = val; return count; } -EXPORT_SYMBOL(lprocfs_recovery_time_hard_seq_write); +EXPORT_SYMBOL(recovery_time_hard_store); -int lprocfs_target_instance_seq_show(struct seq_file *m, void *data) +ssize_t instance_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); struct obd_device_target *target = &obd->u.obt; - LASSERT(obd != NULL); LASSERT(target->obt_magic == OBT_MAGIC); - seq_printf(m, "%u\n", obd->u.obt.obt_instance); - return 0; + return scnprintf(buf, PAGE_SIZE, "%u\n", obd->u.obt.obt_instance); } -EXPORT_SYMBOL(lprocfs_target_instance_seq_show); +EXPORT_SYMBOL(instance_show); #endif /* CONFIG_PROC_FS*/ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c 
b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c index 21a137bad0bae..42e880e8a3948 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,6 +44,8 @@ #include #include #include /* hash_long() */ +#include +#include #include #include #include @@ -51,6 +53,28 @@ #include #include +struct lu_site_bkt_data { + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()) or initialized (lu_object_start()). + * It is used by lu_object_find() to wait before re-trying when + * object in the process of destruction is found in the hash table; + * or wait object to be initialized by the allocator. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_waitq; +}; + enum { LU_CACHE_PERCENT_MAX = 50, LU_CACHE_PERCENT_DEFAULT = 20 @@ -85,6 +109,18 @@ MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); static void lu_object_free(const struct lu_env *env, struct lu_object *o); static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); +wait_queue_head_t * +lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + struct cfs_hash_bd bd; + struct lu_site_bkt_data *bkt; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + return &bkt->lsb_waitq; +} +EXPORT_SYMBOL(lu_site_wq_from_fid); + /** * Decrease reference counter on object. If last reference is freed, return * object to the cache, unless lu_object_is_dying(o) holds. In the latter @@ -93,22 +129,18 @@ static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); void lu_object_put(const struct lu_env *env, struct lu_object *o) { struct lu_site_bkt_data *bkt; - struct lu_object_header *top; - struct lu_site *site; - struct lu_object *orig; + struct lu_object_header *top = o->lo_header; + struct lu_site *site = o->lo_dev->ld_site; + struct lu_object *orig = o; struct cfs_hash_bd bd; - const struct lu_fid *fid; - - top = o->lo_header; - site = o->lo_dev->ld_site; - orig = o; + const struct lu_fid *fid = lu_object_fid(o); + bool is_dying; /* * till we have full fids-on-OST implemented anonymous objects * are possible in OSP. such an object isn't listed in the site * so we should not remove it from the site. */ - fid = lu_object_fid(o); if (fid_is_zero(fid)) { LASSERT(top->loh_hash.next == NULL && top->loh_hash.pprev == NULL); @@ -126,13 +158,19 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o) cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + is_dying = lu_object_is_dying(top); if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { - if (lu_object_is_dying(top)) { + /* at this point the object reference is dropped and lock is + * not taken, so lu_object should not be touched because it + * can be freed by concurrent thread. 
Use local variable for
+ * check.
+ */
+ if (is_dying) {
 /*
 * somebody may be waiting for this, currently only
 * used for cl_object, see cl_object_put_last().
 */
- wake_up_all(&bkt->lsb_marche_funebre);
+ wake_up_all(&bkt->lsb_waitq);
 }
 return;
 }
@@ -146,15 +184,17 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o)
 o->lo_ops->loo_object_release(env, o);
 }
+ /* don't use local 'is_dying' here because it was taken without lock
+ * but here we need the latest actual value of it so check lu_object
+ * directly here.
+ */
 if (!lu_object_is_dying(top) &&
 (lu_object_exists(orig) || lu_object_is_cl(orig))) {
 LASSERT(list_empty(&top->loh_lru));
 list_add_tail(&top->loh_lru, &bkt->lsb_lru);
- bkt->lsb_lru_len++;
 percpu_counter_inc(&site->ls_lru_len_counter);
- CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, "
- "lru_len: %ld\n",
- o, site->ls_obj_hash, bkt, bkt->lsb_lru_len);
+ CDEBUG(D_INODE, "Add %p/%p to site lru. hash: %p, bkt: %p\n",
+ orig, top, site->ls_obj_hash, bkt);
 cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
 return;
 }
@@ -213,7 +253,6 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
 list_del_init(&top->loh_lru);
 bkt = cfs_hash_bd_extra_get(obj_hash, &bd);
- bkt->lsb_lru_len--;
 percpu_counter_dec(&site->ls_lru_len_counter);
 }
 cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
@@ -230,17 +269,9 @@ EXPORT_SYMBOL(lu_object_unhash);
 */
 static struct lu_object *lu_object_alloc(const struct lu_env *env,
 struct lu_device *dev,
- const struct lu_fid *f,
- const struct lu_object_conf *conf)
+ const struct lu_fid *f)
 {
- struct lu_object *scan;
 struct lu_object *top;
- struct list_head *layers;
- unsigned int init_mask = 0;
- unsigned int init_flag;
- int clean;
- int result;
- ENTRY;
 /*
 * Create top-level object slice. This will also create
@@ -248,15 +279,36 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env,
 */
 top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
 if (top == NULL)
- RETURN(ERR_PTR(-ENOMEM));
+ return ERR_PTR(-ENOMEM);
 if (IS_ERR(top))
- RETURN(top);
- /*
- * This is the only place where object fid is assigned. It's constant
- * after this point.
- */
- top->lo_header->loh_fid = *f;
- layers = &top->lo_header->loh_layers;
+ return top;
+ /*
+ * This is the only place where object fid is assigned. It's constant
+ * after this point.
+ */
+ top->lo_header->loh_fid = *f;
+
+ return top;
+}
+
+/**
+ * Initialize object.
+ *
+ * This is called after object hash insertion to avoid returning an object with
+ * stale attributes. 
+ */ +static int lu_object_start(const struct lu_env *env, struct lu_device *dev, + struct lu_object *top, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct list_head *layers; + unsigned int init_mask = 0; + unsigned int init_flag; + int clean; + int result; + + layers = &top->lo_header->loh_layers; do { /* @@ -271,10 +323,9 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, clean = 0; scan->lo_header = top->lo_header; result = scan->lo_ops->loo_object_init(env, scan, conf); - if (result != 0) { - lu_object_free(env, top); - RETURN(ERR_PTR(result)); - } + if (result) + return result; + init_mask |= init_flag; next: init_flag <<= 1; @@ -282,17 +333,18 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, } while (!clean); list_for_each_entry_reverse(scan, layers, lo_linkage) { - if (scan->lo_ops->loo_object_start != NULL) { - result = scan->lo_ops->loo_object_start(env, scan); - if (result != 0) { - lu_object_free(env, top); - RETURN(ERR_PTR(result)); - } - } - } + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result) + return result; + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); - lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); - RETURN(top); + set_bit(LU_OBJECT_INITED, &top->lo_header->loh_flags); + + return 0; } /** @@ -300,15 +352,15 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, */ static void lu_object_free(const struct lu_env *env, struct lu_object *o) { - struct lu_site_bkt_data *bkt; + wait_queue_head_t *wq; struct lu_site *site; struct lu_object *scan; struct list_head *layers; struct list_head splice; - site = o->lo_dev->ld_site; - layers = &o->lo_header->loh_layers; - bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); /* * First call ->loo_object_delete() method to release all resources. */ @@ -337,8 +389,8 @@ static void lu_object_free(const struct lu_env *env, struct lu_object *o) o->lo_ops->loo_object_free(env, o); } - if (waitqueue_active(&bkt->lsb_marche_funebre)) - wake_up_all(&bkt->lsb_marche_funebre); + if (waitqueue_active(wq)) + wake_up_all(wq); } /** @@ -399,7 +451,6 @@ int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, cfs_hash_bd_del_locked(s->ls_obj_hash, &bd2, &h->loh_hash); list_move(&h->loh_lru, &dispose); - bkt->lsb_lru_len--; percpu_counter_dec(&s->ls_lru_len_counter); if (did_sth == 0) did_sth = 1; @@ -591,7 +642,6 @@ static struct lu_object *htable_lookup(struct lu_site *s, const struct lu_fid *f, __u64 *version) { - struct lu_site_bkt_data *bkt; struct lu_object_header *h; struct hlist_node *hnode; __u64 ver = cfs_hash_bd_version_get(bd); @@ -600,7 +650,6 @@ static struct lu_object *htable_lookup(struct lu_site *s, return ERR_PTR(-ENOENT); *version = ver; - bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); /* cfs_hash_bd_peek_locked is a somehow "internal" function * of cfs_hash, it doesn't add refcount on object. 
*/
 hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
@@ -614,7 +663,6 @@ static struct lu_object *htable_lookup(struct lu_site *s,
 lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
 if (!list_empty(&h->loh_lru)) {
 list_del_init(&h->loh_lru);
- bkt->lsb_lru_len--;
 percpu_counter_dec(&s->ls_lru_len_counter);
 }
 return lu_object_top(h);
@@ -657,29 +705,6 @@ static void lu_object_limit(const struct lu_env *env,
 MIN(size - nr, LU_CACHE_NR_MAX_ADJUST), 0);
 }
-static struct lu_object *lu_object_new(const struct lu_env *env,
- struct lu_device *dev,
- const struct lu_fid *f,
- const struct lu_object_conf *conf)
-{
- struct lu_object *o;
- struct cfs_hash *hs;
- struct cfs_hash_bd bd;
-
- o = lu_object_alloc(env, dev, f, conf);
- if (unlikely(IS_ERR(o)))
- return o;
-
- hs = dev->ld_site->ls_obj_hash;
- cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
- cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
- cfs_hash_bd_unlock(hs, &bd, 1);
-
- lu_object_limit(env, dev);
-
- return o;
-}
-
 /**
 * Core logic of lu_object_find*() functions.
 *
@@ -697,7 +722,19 @@ struct lu_object *lu_object_find_at(const struct lu_env *env,
 struct lu_site *s;
 struct cfs_hash *hs;
 struct cfs_hash_bd bd;
+ struct lu_site_bkt_data *bkt;
+ struct l_wait_info lwi = { 0 };
 __u64 version = 0;
+ int rc;
+
+ ENTRY;
+
+ /* FID is from disk or network, zero FID is meaningless, return error
+ * early to avoid assertion in lu_object_put. If a zero FID is wanted,
+ * it should be allocated via lu_object_anon().
+ */
+ if (fid_is_zero(f))
+ RETURN(ERR_PTR(-EINVAL));
 /*
 * This uses standard index maintenance protocol:
@@ -716,46 +753,99 @@ struct lu_object *lu_object_find_at(const struct lu_env *env,
 * It is unnecessary to perform lookup-alloc-lookup-insert, instead,
 * just alloc and insert directly.
 *
- * If dying object is found during index search, add @waiter to the
- * site wait-queue and return ERR_PTR(-EAGAIN).
 */
- if (conf && conf->loc_flags & LOC_F_NEW)
- return lu_object_new(env, dev, f, conf);
-
 s = dev->ld_site;
 hs = s->ls_obj_hash;
- cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
- o = htable_lookup(s, &bd, f, &version);
- cfs_hash_bd_unlock(hs, &bd, 1);
- if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT)
- return o;
+
+ if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_OBD_ZERO_NLINK_RACE)))
+ lu_site_purge(env, s, -1);
+
+ cfs_hash_bd_get(hs, f, &bd);
+ bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+ if (!(conf && conf->loc_flags & LOC_F_NEW)) {
+ cfs_hash_bd_lock(hs, &bd, 1);
+ o = htable_lookup(s, &bd, f, &version);
+ cfs_hash_bd_unlock(hs, &bd, 1);
+
+ if (!IS_ERR(o)) {
+ if (likely(lu_object_is_inited(o->lo_header)))
+ RETURN(o);
+
+ l_wait_event(bkt->lsb_waitq,
+ lu_object_is_inited(o->lo_header) ||
+ lu_object_is_dying(o->lo_header), &lwi);
+
+ if (lu_object_is_dying(o->lo_header)) {
+ lu_object_put(env, o);
+
+ RETURN(ERR_PTR(-ENOENT));
+ }
+
+ RETURN(o);
+ }
+
+ if (PTR_ERR(o) != -ENOENT)
+ RETURN(o);
+ }
 /*
- * Allocate new object. This may result in rather complicated
- * operations, including fld queries, inode loading, etc.
+ * Allocate new object, NB, object is uninitialized in case object
+ * is changed between allocation and hash insertion, thus the object
+ * with stale attributes is returned. 
*/ - o = lu_object_alloc(env, dev, f, conf); - if (unlikely(IS_ERR(o))) - return o; + o = lu_object_alloc(env, dev, f); + if (IS_ERR(o)) + RETURN(o); LASSERT(lu_fid_eq(lu_object_fid(o), f)); + CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE); + cfs_hash_bd_lock(hs, &bd, 1); - shadow = htable_lookup(s, &bd, f, &version); - if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) { + if (conf && conf->loc_flags & LOC_F_NEW) + shadow = ERR_PTR(-ENOENT); + else + shadow = htable_lookup(s, &bd, f, &version); + if (likely(PTR_ERR(shadow) == -ENOENT)) { cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); cfs_hash_bd_unlock(hs, &bd, 1); + /* + * This may result in rather complicated operations, including + * fld queries, inode loading, etc. + */ + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_put_nocache(env, o); + RETURN(ERR_PTR(rc)); + } + + wake_up_all(&bkt->lsb_waitq); + lu_object_limit(env, dev); - return o; + RETURN(o); } lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); cfs_hash_bd_unlock(hs, &bd, 1); lu_object_free(env, o); - return shadow; + + if (!(conf && conf->loc_flags & LOC_F_NEW) && + !lu_object_is_inited(shadow->lo_header)) { + l_wait_event(bkt->lsb_waitq, + lu_object_is_inited(shadow->lo_header) || + lu_object_is_dying(shadow->lo_header), &lwi); + + if (lu_object_is_dying(shadow->lo_header)) { + lu_object_put(env, shadow); + + RETURN(ERR_PTR(-ENOENT)); + } + } + + RETURN(shadow); } EXPORT_SYMBOL(lu_object_find_at); @@ -1042,7 +1132,7 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); INIT_LIST_HEAD(&bkt->lsb_lru); - init_waitqueue_head(&bkt->lsb_marche_funebre); + init_waitqueue_head(&bkt->lsb_waitq); } s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); @@ -1386,7 +1476,8 @@ static void key_fini(struct lu_context *ctx, int index) key->lct_fini(ctx, key, ctx->lc_value[index]); lu_ref_del(&key->lct_reference, "ctx", ctx); - atomic_dec(&key->lct_used); + if (atomic_dec_and_test(&key->lct_used)) + wake_up_var(&key->lct_used); LASSERT(key->lct_owner != NULL); if ((ctx->lc_tags & LCT_NOREF) == 0) { @@ -1407,29 +1498,23 @@ void lu_context_key_degister(struct lu_context_key *key) lu_context_key_quiesce(key); - write_lock(&lu_keys_guard); - ++key_set_version; key_fini(&lu_shrink_env.le_ctx, key->lct_index); /** * Wait until all transient contexts referencing this key have * run lu_context_key::lct_fini() method. */ - while (atomic_read(&key->lct_used) > 1) { - write_unlock(&lu_keys_guard); - CDEBUG(D_INFO, "lu_context_key_degister: \"%s\" %p, %d\n", - key->lct_owner ? 
key->lct_owner->name : "", key, - atomic_read(&key->lct_used)); - schedule(); - write_lock(&lu_keys_guard); - } + atomic_dec(&key->lct_used); + wait_var_event(&key->lct_used, atomic_read(&key->lct_used) == 0); + + write_lock(&lu_keys_guard); if (lu_keys[key->lct_index]) { lu_keys[key->lct_index] = NULL; lu_ref_fini(&key->lct_reference); } write_unlock(&lu_keys_guard); - LASSERTF(atomic_read(&key->lct_used) == 1, + LASSERTF(atomic_read(&key->lct_used) == 0, "key has instances: %d\n", atomic_read(&key->lct_used)); } @@ -1893,6 +1978,119 @@ int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, } EXPORT_SYMBOL(lu_env_refill_by_tags); +#ifdef HAVE_SERVER_SUPPORT +struct lu_env_item { + struct task_struct *lei_task; /* rhashtable key */ + struct rhash_head lei_linkage; + struct lu_env *lei_env; + struct rcu_head lei_rcu_head; +}; + +static const struct rhashtable_params lu_env_rhash_params = { + .key_len = sizeof(struct task_struct *), + .key_offset = offsetof(struct lu_env_item, lei_task), + .head_offset = offsetof(struct lu_env_item, lei_linkage), +}; + +struct rhashtable lu_env_rhash; + +struct lu_env_percpu { + struct task_struct *lep_task; + struct lu_env *lep_env ____cacheline_aligned_in_smp; +}; + +static struct lu_env_percpu lu_env_percpu[NR_CPUS]; + +int lu_env_add(struct lu_env *env) +{ + struct lu_env_item *lei, *old; + + LASSERT(env); + + OBD_ALLOC_PTR(lei); + if (!lei) + return -ENOMEM; + + lei->lei_task = current; + lei->lei_env = env; + + old = rhashtable_lookup_get_insert_fast(&lu_env_rhash, + &lei->lei_linkage, + lu_env_rhash_params); + LASSERT(!old); + + return 0; +} +EXPORT_SYMBOL(lu_env_add); + +static void lu_env_item_free(struct rcu_head *head) +{ + struct lu_env_item *lei; + + lei = container_of(head, struct lu_env_item, lei_rcu_head); + OBD_FREE_PTR(lei); +} + +void lu_env_remove(struct lu_env *env) +{ + struct lu_env_item *lei; + const void *task = current; + int i; + + for_each_possible_cpu(i) { + if (lu_env_percpu[i].lep_env == env) { + LASSERT(lu_env_percpu[i].lep_task == task); + lu_env_percpu[i].lep_task = NULL; + lu_env_percpu[i].lep_env = NULL; + } + } + + /* The rcu_lock is not taking in this case since the key + * used is the actual task_struct. This implies that each + * object is only removed by the owning thread, so there + * can never be a race on a particular object. 
+ */ + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei && rhashtable_remove_fast(&lu_env_rhash, &lei->lei_linkage, + lu_env_rhash_params) == 0) + call_rcu(&lei->lei_rcu_head, lu_env_item_free); +} +EXPORT_SYMBOL(lu_env_remove); + +struct lu_env *lu_env_find(void) +{ + struct lu_env *env = NULL; + struct lu_env_item *lei; + const void *task = current; + int i = get_cpu(); + + if (lu_env_percpu[i].lep_task == current) { + env = lu_env_percpu[i].lep_env; + put_cpu(); + LASSERT(env); + return env; + } + + lei = rhashtable_lookup_fast(&lu_env_rhash, &task, + lu_env_rhash_params); + if (lei) { + env = lei->lei_env; + lu_env_percpu[i].lep_task = current; + lu_env_percpu[i].lep_env = env; + } + put_cpu(); + + return env; +} +EXPORT_SYMBOL(lu_env_find); +#define lu_env_rhash_init(rhash, params) rhashtable_init(rhash, params) +#define lu_env_rhash_destroy(rhash) rhashtable_destroy(rhash) +#else +#define lu_env_rhash_init(rhash, params) 0 +#define lu_env_rhash_destroy(rhash) do {} while (0) +#endif /* HAVE_SERVER_SUPPORT */ + static struct shrinker *lu_site_shrinker; typedef struct lu_site_stats{ @@ -1902,19 +2100,24 @@ typedef struct lu_site_stats{ unsigned lss_busy; } lu_site_stats_t; -static void lu_site_stats_get(struct cfs_hash *hs, +static void lu_site_stats_get(const struct lu_site *s, lu_site_stats_t *stats, int populated) { + struct cfs_hash *hs = s->ls_obj_hash; struct cfs_hash_bd bd; - unsigned int i; + unsigned int i; + /* + * percpu_counter_sum_positive() won't accept a const pointer + * as it does modify the struct by taking a spinlock + */ + struct lu_site *s2 = (struct lu_site *)s; + stats->lss_busy += cfs_hash_size_get(hs) - + percpu_counter_sum_positive(&s2->ls_lru_len_counter); cfs_hash_for_each_bucket(hs, &bd, i) { - struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); - struct hlist_head *hhead; + struct hlist_head *hhead; cfs_hash_bd_lock(hs, &bd, 1); - stats->lss_busy += - cfs_hash_bd_count_get(&bd) - bkt->lsb_lru_len; stats->lss_total += cfs_hash_bd_count_get(&bd); stats->lss_max_search = max((int)stats->lss_max_search, cfs_hash_bd_depmax_get(&bd)); @@ -2103,7 +2306,7 @@ void lu_context_keys_dump(void) */ int lu_global_init(void) { - int result; + int result; DEF_SHRINKER_VAR(shvar, lu_cache_shrink, lu_cache_shrink_count, lu_cache_shrink_scan); @@ -2138,6 +2341,8 @@ int lu_global_init(void) if (lu_site_shrinker == NULL) return -ENOMEM; + result = lu_env_rhash_init(&lu_env_rhash, &lu_env_rhash_params); + return result; } @@ -2161,6 +2366,8 @@ void lu_global_fini(void) lu_env_fini(&lu_shrink_env); up_write(&lu_sites_guard); + lu_env_rhash_destroy(&lu_env_rhash); + lu_ref_global_fini(); } @@ -2185,7 +2392,7 @@ int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m) lu_site_stats_t stats; memset(&stats, 0, sizeof(stats)); - lu_site_stats_get(s->ls_obj_hash, &stats, 1); + lu_site_stats_get(s, &stats, 1); seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", stats.lss_busy, @@ -2283,11 +2490,19 @@ struct lu_object *lu_object_anon(const struct lu_env *env, struct lu_device *dev, const struct lu_object_conf *conf) { - struct lu_fid fid; + struct lu_fid fid; struct lu_object *o; + int rc; fid_zero(&fid); - o = lu_object_alloc(env, dev, &fid, conf); + o = lu_object_alloc(env, dev, &fid); + if (!IS_ERR(o)) { + rc = lu_object_start(env, dev, o, conf); + if (rc) { + lu_object_free(env, o); + return ERR_PTR(rc); + } + } return o; } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c 
b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c index bef29033f30ee..e0a75791f1e6e 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_ref.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -65,14 +65,14 @@ static struct kmem_cache *lu_ref_link_kmem; static struct lu_kmem_descr lu_ref_caches[] = { - { - .ckd_cache = &lu_ref_link_kmem, - .ckd_name = "lu_ref_link_kmem", - .ckd_size = sizeof (struct lu_ref_link) - }, - { - .ckd_cache = NULL - } + { + .ckd_cache = &lu_ref_link_kmem, + .ckd_name = "lu_ref_link_kmem", + .ckd_size = sizeof(struct lu_ref_link) + }, + { + .ckd_cache = NULL + } }; /** @@ -90,18 +90,18 @@ static struct lu_ref lu_ref_marker = { void lu_ref_print(const struct lu_ref *ref) { - struct lu_ref_link *link; + struct lu_ref_link *link; - CERROR("lu_ref: %p %d %d %s:%d\n", - ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); + CERROR("lu_ref: %p %d %d %s:%d\n", + ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line); list_for_each_entry(link, &ref->lf_list, ll_linkage) { - CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); - } + CERROR(" link: %s %p\n", link->ll_scope, link->ll_source); + } } static int lu_ref_is_marker(const struct lu_ref *ref) { - return (ref == &lu_ref_marker); + return ref == &lu_ref_marker; } void lu_ref_print_all(void) @@ -146,19 +146,19 @@ void lu_ref_fini(struct lu_ref *ref) EXPORT_SYMBOL(lu_ref_fini); static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref, - int flags, - const char *scope, - const void *source) + int flags, + const char *scope, + const void *source) { - struct lu_ref_link *link; - - link = NULL; - if (lu_ref_link_kmem != NULL) { - OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); - if (link != NULL) { - link->ll_ref = ref; - link->ll_scope = scope; - link->ll_source = source; + struct lu_ref_link *link; + + link = NULL; + if (lu_ref_link_kmem != NULL) { + OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags); + if (link != NULL) { + link->ll_ref = ref; + link->ll_scope = scope; + link->ll_source = source; spin_lock(&ref->lf_guard); list_add_tail(&link->ll_linkage, &ref->lf_list); ref->lf_refs++; @@ -207,9 +207,10 @@ void lu_ref_add_atomic(struct lu_ref *ref, const char *scope, EXPORT_SYMBOL(lu_ref_add_atomic); static inline int lu_ref_link_eq(const struct lu_ref_link *link, - const char *scope, const void *source) + const char *scope, + const void *source) { - return link->ll_source == source && !strcmp(link->ll_scope, scope); + return link->ll_source == source && !strcmp(link->ll_scope, scope); } /** @@ -223,22 +224,22 @@ static unsigned lu_ref_chain_max_length = 127; static struct lu_ref_link *lu_ref_find(struct lu_ref *ref, const char *scope, const void *source) { - struct lu_ref_link *link; - unsigned iterations; + struct lu_ref_link *link; + unsigned int iterations; - iterations = 0; + iterations = 0; list_for_each_entry(link, &ref->lf_list, ll_linkage) { - ++iterations; - if (lu_ref_link_eq(link, scope, source)) { - if (iterations > lu_ref_chain_max_length) { - CWARN("Long lu_ref chain %d \"%s\":%p\n", - iterations, scope, source); - lu_ref_chain_max_length = iterations * 3 / 2; - } - return link; - } - } - return NULL; + ++iterations; + if (lu_ref_link_eq(link, scope, 
source)) { + if (iterations > lu_ref_chain_max_length) { + CWARN("Long lu_ref chain %d \"%s\":%p\n", + iterations, scope, source); + lu_ref_chain_max_length = iterations * 3 / 2; + } + return link; + } + } + return NULL; } void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source) @@ -302,10 +303,10 @@ static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos) static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) { - struct lu_ref *ref = p; - struct lu_ref *next; + struct lu_ref *ref = p; + struct lu_ref *next; - LASSERT(seq->private == p); + LASSERT(seq->private == p); LASSERT(!list_empty(&ref->lf_linkage)); spin_lock(&lu_ref_refs_guard); @@ -322,7 +323,7 @@ static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos) static void lu_ref_seq_stop(struct seq_file *seq, void *p) { - /* Nothing to do */ + /* Nothing to do */ } @@ -340,19 +341,19 @@ static int lu_ref_seq_show(struct seq_file *seq, void *p) /* print the entry */ spin_lock(&next->lf_guard); - seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", - next, next->lf_refs, next->lf_failed, - next->lf_func, next->lf_line); - if (next->lf_refs > 64) { - seq_printf(seq, " too many references, skip\n"); - } else { - struct lu_ref_link *link; - int i = 0; + seq_printf(seq, "lu_ref: %p %d %d %s:%d\n", + next, next->lf_refs, next->lf_failed, + next->lf_func, next->lf_line); + if (next->lf_refs > 64) { + seq_puts(seq, " too many references, skip\n"); + } else { + struct lu_ref_link *link; + int i = 0; list_for_each_entry(link, &next->lf_list, ll_linkage) - seq_printf(seq, " #%d link: %s %p\n", - i++, link->ll_scope, link->ll_source); - } + seq_printf(seq, " #%d link: %s %p\n", + i++, link->ll_scope, link->ll_source); + } spin_unlock(&next->lf_guard); spin_unlock(&lu_ref_refs_guard); @@ -360,10 +361,10 @@ static int lu_ref_seq_show(struct seq_file *seq, void *p) } static struct seq_operations lu_ref_seq_ops = { - .start = lu_ref_seq_start, - .stop = lu_ref_seq_stop, - .next = lu_ref_seq_next, - .show = lu_ref_seq_show + .start = lu_ref_seq_start, + .stop = lu_ref_seq_stop, + .next = lu_ref_seq_next, + .show = lu_ref_seq_show }; static int lu_ref_seq_open(struct inode *inode, struct file *file) @@ -380,15 +381,16 @@ static int lu_ref_seq_open(struct inode *inode, struct file *file) list_add(&marker->lf_linkage, &lu_ref_refs); spin_unlock(&lu_ref_refs_guard); - if (result == 0) { - struct seq_file *f = file->private_data; - f->private = marker; - } else { - seq_release(inode, file); - } - } + if (result == 0) { + struct seq_file *f = file->private_data; + + f->private = marker; + } else { + seq_release(inode, file); + } + } - return result; + return result; } static int lu_ref_seq_release(struct inode *inode, struct file *file) @@ -403,11 +405,11 @@ static int lu_ref_seq_release(struct inode *inode, struct file *file) } static struct file_operations lu_ref_dump_fops = { - .owner = THIS_MODULE, - .open = lu_ref_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lu_ref_seq_release + .owner = THIS_MODULE, + .open = lu_ref_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lu_ref_seq_release }; #endif /* CONFIG_PROC_FS */ @@ -419,26 +421,26 @@ int lu_ref_global_init(void) CDEBUG(D_CONSOLE, "lu_ref tracking is enabled. 
Performance isn't.\n"); - result = lu_kmem_init(lu_ref_caches); + result = lu_kmem_init(lu_ref_caches); #ifdef CONFIG_PROC_FS - if (result == 0) { - result = lprocfs_seq_create(proc_lustre_root, "lu_refs", - 0444, &lu_ref_dump_fops, NULL); - if (result) - lu_kmem_fini(lu_ref_caches); - } + if (result == 0) { + result = lprocfs_seq_create(proc_lustre_root, "lu_refs", + 0444, &lu_ref_dump_fops, NULL); + if (result) + lu_kmem_fini(lu_ref_caches); + } #endif /* CONFIG_PROC_FS */ - return result; + return result; } void lu_ref_global_fini(void) { #ifdef CONFIG_PROC_FS - lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); + lprocfs_remove_proc_entry("lu_refs", proc_lustre_root); #endif /* CONFIG_PROC_FS */ - lu_kmem_fini(lu_ref_caches); + lu_kmem_fini(lu_ref_caches); } #endif /* USE_LU_REF */ diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c index bd149ddf7a967..4161b2dabfd72 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_handles.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,7 +46,7 @@ static __u64 handle_base; static DEFINE_SPINLOCK(handle_base_lock); static struct handle_bucket { - spinlock_t lock; + spinlock_t lock; struct list_head head; } *handle_hash; @@ -60,16 +60,17 @@ static struct handle_bucket { void class_handle_hash(struct portals_handle *h, struct portals_handle_ops *ops) { - struct handle_bucket *bucket; - ENTRY; + struct handle_bucket *bucket; + + ENTRY; - LASSERT(h != NULL); + LASSERT(h != NULL); LASSERT(list_empty(&h->h_link)); - /* - * This is fast, but simplistic cookie generation algorithm, it will - * need a re-do at some point in the future for security. - */ + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ spin_lock(&handle_base_lock); handle_base += HANDLE_INCR; @@ -104,12 +105,12 @@ static void class_handle_unhash_nolock(struct portals_handle *h) { if (list_empty(&h->h_link)) { CERROR("removing an already-removed handle (%#llx)\n", - h->h_cookie); - return; - } + h->h_cookie); + return; + } CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", - h, h->h_cookie); + h, h->h_cookie); spin_lock(&h->h_lock); if (h->h_in == 0) { @@ -150,21 +151,24 @@ EXPORT_SYMBOL(class_handle_hash_back); void *class_handle2object(__u64 cookie, const void *owner) { - struct handle_bucket *bucket; - struct portals_handle *h; - void *retval = NULL; - ENTRY; + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + + ENTRY; - LASSERT(handle_hash != NULL); + LASSERT(handle_hash != NULL); - /* Be careful when you want to change this code. See the - * rcu_read_lock() definition on top this file. - jxiong */ - bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + /* + * Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong + */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); - rcu_read_lock(); - list_for_each_entry_rcu(h, &bucket->head, h_link) { + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { if (h->h_cookie != cookie || h->h_owner != owner) - continue; + continue; spin_lock(&h->h_lock); if (likely(h->h_in != 0)) { @@ -197,15 +201,15 @@ EXPORT_SYMBOL(class_handle_free_cb); int class_handle_init(void) { - struct handle_bucket *bucket; + struct handle_bucket *bucket; struct timespec64 ts; - int seed[2]; + int seed[2]; - LASSERT(handle_hash == NULL); + LASSERT(handle_hash == NULL); - OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); - if (handle_hash == NULL) - return -ENOMEM; + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; bucket--) { @@ -218,10 +222,10 @@ int class_handle_init(void) ktime_get_ts64(&ts); cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]); - cfs_get_random_bytes(&handle_base, sizeof(handle_base)); - LASSERT(handle_base != 0ULL); + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); - return 0; + return 0; } static int cleanup_all_handles(void) @@ -248,14 +252,15 @@ static int cleanup_all_handles(void) void class_handle_cleanup(void) { - int count; - LASSERT(handle_hash != NULL); + int count; + + LASSERT(handle_hash != NULL); - count = cleanup_all_handles(); + count = cleanup_all_handles(); - OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); - handle_hash = NULL; + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; - if (count != 0) - CERROR("handle_count at cleanup: %d\n", count); + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c index 95716e1ccac88..535d78eac5578 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lustre_peer.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -80,51 +80,51 @@ EXPORT_SYMBOL(lustre_uuid_to_peer); LNET will choose the best one. 
*/ int class_add_uuid(const char *uuid, __u64 nid) { - struct uuid_nid_data *data, *entry; - int found = 0; + struct uuid_nid_data *data, *entry; + int found = 0; - LASSERT(nid != 0); /* valid newconfig NID is never zero */ + LASSERT(nid != 0); /* valid newconfig NID is never zero */ - if (strlen(uuid) > UUID_MAX - 1) - return -EOVERFLOW; + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; - OBD_ALLOC_PTR(data); - if (data == NULL) - return -ENOMEM; + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; - obd_str2uuid(&data->un_uuid, uuid); - data->un_nids[0] = nid; - data->un_nid_count = 1; + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; spin_lock(&g_uuid_lock); list_for_each_entry(entry, &g_uuid_list, un_list) { - if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { - int i; - - found = 1; - for (i = 0; i < entry->un_nid_count; i++) - if (nid == entry->un_nids[i]) - break; - - if (i == entry->un_nid_count) { - LASSERT(entry->un_nid_count < NIDS_MAX); - entry->un_nids[entry->un_nid_count++] = nid; - } - break; - } - } - if (!found) + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) list_add(&data->un_list, &g_uuid_list); spin_unlock(&g_uuid_lock); - if (found) { - CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, - libcfs_nid2str(nid), entry->un_nid_count); - OBD_FREE(data, sizeof(*data)); - } else { - CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); - } - return 0; + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; } /* Delete the nids for one uuid if specified, otherwise delete all */ @@ -173,29 +173,30 @@ int class_del_uuid(const char *uuid) /* check if @nid exists in nid list of @uuid */ int class_check_uuid(struct obd_uuid *uuid, __u64 nid) { - struct uuid_nid_data *entry; - int found = 0; - ENTRY; + struct uuid_nid_data *entry; + int found = 0; - CDEBUG(D_INFO, "check if uuid %s has %s.\n", - obd_uuid2str(uuid), libcfs_nid2str(nid)); + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); spin_lock(&g_uuid_lock); list_for_each_entry(entry, &g_uuid_list, un_list) { - int i; + int i; - if (!obd_uuid_equals(&entry->un_uuid, uuid)) + if (!obd_uuid_equals(&entry->un_uuid, uuid)) continue; - /* found the uuid, check if it has @nid */ - for (i = 0; i < entry->un_nid_count; i++) { - if (entry->un_nids[i] == nid) { - found = 1; - break; - } - } - break; - } + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } spin_unlock(&g_uuid_lock); RETURN(found); } diff --git a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c index 85003937e7466..d0ca4f17b1cb3 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/md_attrs.c @@ -21,14 +21,11 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * Use is subject to license terms. 
* * Author: Johann Lombardi */ - -#include - #include #include #include @@ -50,9 +47,9 @@ void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, /* If a field is added in struct lustre_mdt_attrs, zero it explicitly * and change the test below. */ - LASSERT(sizeof(*lma) == - (offsetof(struct lustre_mdt_attrs, lma_self_fid) + - sizeof(lma->lma_self_fid))); + CLASSERT(sizeof(*lma) == + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); } EXPORT_SYMBOL(lustre_lma_init); @@ -114,6 +111,22 @@ void lustre_loa_swab(struct lustre_ost_attrs *loa, bool to_cpu) } EXPORT_SYMBOL(lustre_loa_swab); +/** + * Swab, if needed, SOM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the SOM structure to be swabbed. + */ +void lustre_som_swab(struct lustre_som_attrs *attrs) +{ +#ifdef __BIG_ENDIAN + __swab16s(&attrs->lsa_valid); + __swab64s(&attrs->lsa_size); + __swab64s(&attrs->lsa_blocks); +#endif +} +EXPORT_SYMBOL(lustre_som_swab); + /** * Swab, if needed, HSM structure which is stored on-disk in little-endian * order. diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c new file mode 100644 index 0000000000000..16e6f12f8a05c --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_cksum.c @@ -0,0 +1,149 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2018, DataDirect Networks Storage. + * Author: Li Xi. 
+ * + * Checksum functions + */ +#include +#include + +/* Server uses algos that perform at 50% or better of the Adler */ +enum cksum_types obd_cksum_types_supported_server(const char *obd_name) +{ + enum cksum_types ret = OBD_CKSUM_ADLER; + int base_speed; + + CDEBUG(D_INFO, "%s: checksum speed: crc %d, crc32c %d, adler %d, " + "t10ip512 %d, t10ip4k %d, t10crc512 %d, t10crc4k %d\n", + obd_name, + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512), + obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K)); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512) >= base_speed) + ret |= OBD_CKSUM_T10IP512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K) >= base_speed) + ret |= OBD_CKSUM_T10IP4K; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512) >= base_speed) + ret |= OBD_CKSUM_T10CRC512; + + if (obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K) >= base_speed) + ret |= OBD_CKSUM_T10CRC4K; + + return ret; +} +EXPORT_SYMBOL(obd_cksum_types_supported_server); + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. 
*/ +u32 obd_cksum_type_pack(const char *obd_name, enum cksum_types cksum_type) +{ + unsigned int performance = 0, tmp; + u32 flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + + if (cksum_type & OBD_CKSUM_T10IP512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP512; + } + } + + if (cksum_type & OBD_CKSUM_T10IP4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10IP4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10IP4K; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC512) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC512); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC512; + } + } + + if (cksum_type & OBD_CKSUM_T10CRC4K) { + tmp = obd_t10_cksum_speed(obd_name, OBD_CKSUM_T10CRC4K); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_T10CRC4K; + } + } + + if (unlikely(cksum_type && !(cksum_type & OBD_CKSUM_ALL))) + CWARN("%s: unknown cksum type %x\n", obd_name, cksum_type); + + return flag; +} +EXPORT_SYMBOL(obd_cksum_type_pack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c index 9f5fffd48bd61..a5b5dcfe572fe 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_config.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,14 +36,15 @@ #define DEBUG_SUBSYSTEM S_CLASS +#include #include #include #include #include -#include +#include #include -#include +#include #include #include "llog_internal.h" @@ -365,6 +366,7 @@ EXPORT_SYMBOL(lustre_cfg_string); */ int class_attach(struct lustre_cfg *lcfg) { + struct obd_export *exp; struct obd_device *obd = NULL; char *typename, *name, *uuid; int rc, len; @@ -381,90 +383,54 @@ int class_attach(struct lustre_cfg *lcfg) RETURN(-EINVAL); } name = lustre_cfg_string(lcfg, 0); - if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { CERROR("No UUID passed!\n"); RETURN(-EINVAL); } - uuid = lustre_cfg_string(lcfg, 2); - CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", - MKSTR(typename), MKSTR(name), MKSTR(uuid)); + uuid = lustre_cfg_string(lcfg, 2); + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("%s: uuid must be < %d bytes long\n", + name, (int)sizeof(obd->obd_uuid)); + RETURN(-EINVAL); + } - obd = class_newdev(typename, name); - if (IS_ERR(obd)) { - /* Already exists or out of obds */ - rc = PTR_ERR(obd); - obd = NULL; + obd = class_newdev(typename, name, uuid); + if (IS_ERR(obd)) { /* Already exists or out of obds */ + rc = PTR_ERR(obd); CERROR("Cannot create device %s of type %s : %d\n", name, typename, rc); - GOTO(out, rc); + RETURN(rc); } - LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", - name, typename); LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08X != %08X\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, "%p obd_name %s != %s\n", obd, obd->obd_name, name); - rwlock_init(&obd->obd_pool_lock); - obd->obd_pool_limit = 0; - obd->obd_pool_slv = 0; - - INIT_LIST_HEAD(&obd->obd_exports); - INIT_LIST_HEAD(&obd->obd_unlinked_exports); - INIT_LIST_HEAD(&obd->obd_delayed_exports); - INIT_LIST_HEAD(&obd->obd_exports_timed); - INIT_LIST_HEAD(&obd->obd_nid_stats); - spin_lock_init(&obd->obd_nid_lock); - spin_lock_init(&obd->obd_dev_lock); - mutex_init(&obd->obd_dev_mutex); - spin_lock_init(&obd->obd_osfs_lock); - /* obd->obd_osfs_age must be set to a value in the distant - * past to guarantee a fresh statfs is fetched on mount. 
*/ - obd->obd_osfs_age = cfs_time_shift_64(-1000); - - /* XXX belongs in setup not attach */ - init_rwsem(&obd->obd_observer_link_sem); - /* recovery data */ - spin_lock_init(&obd->obd_recovery_task_lock); - init_waitqueue_head(&obd->obd_next_transno_waitq); - init_waitqueue_head(&obd->obd_evict_inprogress_waitq); - INIT_LIST_HEAD(&obd->obd_req_replay_queue); - INIT_LIST_HEAD(&obd->obd_lock_replay_queue); - INIT_LIST_HEAD(&obd->obd_final_req_queue); - INIT_LIST_HEAD(&obd->obd_evict_list); - INIT_LIST_HEAD(&obd->obd_lwp_list); - - llog_group_init(&obd->obd_olg); - - obd->obd_conn_inprogress = 0; - - len = strlen(uuid); - if (len >= sizeof(obd->obd_uuid)) { - CERROR("uuid must be < %d bytes long\n", - (int)sizeof(obd->obd_uuid)); - GOTO(out, rc = -EINVAL); - } - memcpy(obd->obd_uuid.uuid, uuid, len); + exp = class_new_export_self(obd, &obd->obd_uuid); + if (IS_ERR(exp)) { + rc = PTR_ERR(exp); + class_free_dev(obd); + RETURN(rc); + } - /* Detach drops this */ - spin_lock(&obd->obd_dev_lock); - atomic_set(&obd->obd_refcount, 1); - spin_unlock(&obd->obd_dev_lock); - lu_ref_init(&obd->obd_reference); - lu_ref_add(&obd->obd_reference, "attach", obd); + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + rc = class_register_device(obd); + if (rc != 0) { + class_decref(obd, "newdev", obd); + RETURN(rc); + } - obd->obd_attached = 1; - CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); - RETURN(0); - out: - if (obd != NULL) { - class_release_dev(obd); - } - return rc; + + RETURN(0); } EXPORT_SYMBOL(class_attach); @@ -474,7 +440,6 @@ EXPORT_SYMBOL(class_attach); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { int err = 0; - struct obd_export *exp; ENTRY; LASSERT(obd != NULL); @@ -523,7 +488,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &uuid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_uuid_hash) - GOTO(err_hash, err = -ENOMEM); + GOTO(err_exit, err = -ENOMEM); /* create a nid-export lustre hash */ obd->obd_nid_hash = cfs_hash_create("NID_HASH", @@ -534,7 +499,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &nid_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_nid_hash) - GOTO(err_hash, err = -ENOMEM); + GOTO(err_exit, err = -ENOMEM); /* create a nid-stats lustre hash */ obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", @@ -544,8 +509,8 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, &nid_stat_hash_ops, CFS_HASH_DEFAULT); - if (!obd->obd_nid_stats_hash) - GOTO(err_hash, err = -ENOMEM); + if (!obd->obd_nid_stats_hash) + GOTO(err_exit, err = -ENOMEM); /* create a client_generation-export lustre hash */ obd->obd_gen_hash = cfs_hash_create("UUID_HASH", @@ -556,21 +521,13 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CFS_HASH_MAX_THETA, &gen_hash_ops, CFS_HASH_DEFAULT); if (!obd->obd_gen_hash) - GOTO(err_hash, err = -ENOMEM); - - exp = class_new_export(obd, &obd->obd_uuid); - if (IS_ERR(exp)) - GOTO(err_hash, err = PTR_ERR(exp)); + GOTO(err_exit, err = -ENOMEM); - obd->obd_self_export = exp; - list_del_init(&exp->exp_obd_chain_timed); - class_export_put(exp); - - err = obd_setup(obd, lcfg); - if (err) - GOTO(err_exp, err); + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exit, err); - obd->obd_set_up = 1; + 
obd->obd_set_up = 1; spin_lock(&obd->obd_dev_lock); /* cleanup drops this */ @@ -581,12 +538,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_name, obd->obd_uuid.uuid); RETURN(0); -err_exp: - if (obd->obd_self_export) { - class_unlink_export(obd->obd_self_export); - obd->obd_self_export = NULL; - } -err_hash: +err_exit: if (obd->obd_uuid_hash) { cfs_hash_putref(obd->obd_uuid_hash); obd->obd_uuid_hash = NULL; @@ -630,10 +582,14 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_attached = 0; spin_unlock(&obd->obd_dev_lock); + /* cleanup in progress. we don't like to find this device after now */ + class_unregister_device(obd); + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", obd->obd_name, obd->obd_uuid.uuid); - class_decref(obd, "attach", obd); + class_decref(obd, "newdev", obd); + RETURN(0); } EXPORT_SYMBOL(class_detach); @@ -663,6 +619,9 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) } /* Leave this on forever */ obd->obd_stopping = 1; + /* function can't return error after that point, so clear setup flag + * as early as possible to avoid finding via obd_devs / hash */ + obd->obd_set_up = 0; spin_unlock(&obd->obd_dev_lock); /* wait for already-arrived-connections to finish. */ @@ -695,17 +654,11 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) LASSERT(obd->obd_self_export); - /* The three references that should be remaining are the - * obd_self_export and the attach and setup references. */ - if (atomic_read(&obd->obd_refcount) > 3) { - /* refcounf - 3 might be the number of real exports - (excluding self export). But class_incref is called - by other things as well, so don't count on it. */ - CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", - obd->obd_name, atomic_read(&obd->obd_refcount) - 3); - dump_exports(obd, 0, D_HA); - class_disconnect_exports(obd); - } + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n", + obd->obd_name, obd->obd_num_exports, + atomic_read(&obd->obd_refcount) - 2); + dump_exports(obd, 0, D_HA); + class_disconnect_exports(obd); /* Precleanup, we must make sure all exports get destroyed. 
*/ err = obd_precleanup(obd); @@ -757,43 +710,27 @@ EXPORT_SYMBOL(class_incref); void class_decref(struct obd_device *obd, const char *scope, const void *source) { - int err; - int refs; + int last; - spin_lock(&obd->obd_dev_lock); - atomic_dec(&obd->obd_refcount); - refs = atomic_read(&obd->obd_refcount); - spin_unlock(&obd->obd_dev_lock); + CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); + + LASSERT(obd->obd_num_exports >= 0); + last = atomic_dec_and_test(&obd->obd_refcount); lu_ref_del(&obd->obd_reference, scope, source); - CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + if (last) { + struct obd_export *exp; - if ((refs == 1) && obd->obd_stopping) { + LASSERT(!obd->obd_attached); /* All exports have been destroyed; there should - be no more in-progress ops by this point.*/ - - spin_lock(&obd->obd_self_export->exp_lock); - obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); - spin_unlock(&obd->obd_self_export->exp_lock); - - /* note that we'll recurse into class_decref again */ - class_unlink_export(obd->obd_self_export); - return; - } + * be no more in-progress ops by this point.*/ + exp = obd->obd_self_export; - if (refs == 0) { - CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", - obd->obd_name, obd->obd_uuid.uuid); - LASSERT(!obd->obd_attached); - if (obd->obd_stopping) { - /* If we're not stopping, we were never set up */ - err = obd_cleanup(obd); - if (err) - CERROR("Cleanup %s returned %d\n", - obd->obd_name, err); + if (exp) { + exp->exp_flags |= exp_flags_from_obd(obd); + class_unlink_export(exp); } - - class_release_dev(obd); } } EXPORT_SYMBOL(class_decref); @@ -1011,40 +948,12 @@ void class_del_profiles(void) } EXPORT_SYMBOL(class_del_profiles); -static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) -{ - ENTRY; - if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) - at_min = val; - else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) - at_max = val; - else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) - at_extra = val; - else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) - at_early_margin = val; - else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) - at_history = val; - else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) - strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), - JOBSTATS_JOBID_VAR_MAX_LEN + 1); - else - RETURN(-EINVAL); - - CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); - RETURN(0); -} - - -/* We can't call ll_process_config or lquota_process_config directly because - * it lives in a module that must be loaded after this one. */ -static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +/* We can't call lquota_process_config directly because + * it lives in a module that must be loaded after this one. + */ +#ifdef HAVE_SERVER_SUPPORT static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; - -void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) -{ - client_process_config = cpc; -} -EXPORT_SYMBOL(lustre_register_client_process_config); +#endif /* HAVE_SERVER_SUPPORT */ /** * Rename the proc parameter in \a cfg with a new name \a new_name. 
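For reference, the reworked class_decref() above follows the standard last-reference idiom: every caller drops its reference unconditionally, and only the caller that sees the count reach zero unlinks the self-export and lets the device go away. The decrement and the zero test are now one atomic operation, so the obd_dev_lock round trip of the old code is no longer needed. A minimal user-space sketch of the same idiom, using C11 atomics instead of the kernel's atomic_t (the names below are illustrative, not Lustre API):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdlib.h>

  struct dev {
          atomic_int refcount;
          /* ... device state ... */
  };

  /* Drop one reference.  atomic_fetch_sub() returns the value before
   * the subtraction, so "== 1" means this caller released the final
   * reference and is the one that must free the object. */
  static bool dev_put(struct dev *d)
  {
          if (atomic_fetch_sub(&d->refcount, 1) == 1) {
                  free(d);
                  return true;
          }
          return false;
  }
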
@@ -1121,10 +1030,12 @@ struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, } EXPORT_SYMBOL(lustre_cfg_rename); -static int process_param2_config(struct lustre_cfg *lcfg) +static ssize_t process_param2_config(struct lustre_cfg *lcfg) { char *param = lustre_cfg_string(lcfg, 1); char *upcall = lustre_cfg_string(lcfg, 2); + struct kobject *kobj = NULL; + const char *subsys = param; char *argv[] = { [0] = "/usr/sbin/lctl", [1] = "set_param", @@ -1133,8 +1044,44 @@ static int process_param2_config(struct lustre_cfg *lcfg) }; ktime_t start; ktime_t end; - int rc; + size_t len; + int rc; + ENTRY; + print_lustre_cfg(lcfg); + + len = strcspn(param, ".="); + if (!len) + return -EINVAL; + + /* If we find '=' then its the top level sysfs directory */ + if (param[len] == '=') + return class_set_global(param); + + subsys = kstrndup(param, len, GFP_KERNEL); + if (!subsys) + return -ENOMEM; + + kobj = kset_find_obj(lustre_kset, subsys); + kfree(subsys); + if (kobj) { + char *value = param; + char *envp[3]; + int i; + + param = strsep(&value, "="); + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s", param); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = NULL; + + rc = kobject_uevent_env(kobj, KOBJ_CHANGE, envp); + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); + + kobject_put(kobj); + + RETURN(rc); + } /* Add upcall processing here. Now only lctl is supported */ if (strcmp(upcall, LCTL_UPCALL) != 0) { @@ -1160,11 +1107,13 @@ static int process_param2_config(struct lustre_cfg *lcfg) RETURN(rc); } +#ifdef HAVE_SERVER_SUPPORT void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) { quota_process_config = qpc; } EXPORT_SYMBOL(lustre_register_quota_process_config); +#endif /* HAVE_SERVER_SUPPORT */ /** Process configuration commands given in lustre_cfg form. * These may come from direct calls (e.g. class_manual_cleanup) @@ -1251,29 +1200,51 @@ int class_process_config(struct lustre_cfg *lcfg) } case LCFG_PARAM: { char *tmp; + /* llite has no obd */ - if ((class_match_param(lustre_cfg_string(lcfg, 1), - PARAM_LLITE, NULL) == 0) && - client_process_config) { - err = (*client_process_config)(lcfg); - GOTO(out, err); + if (class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, NULL) == 0) { + struct lustre_sb_info *lsi; + unsigned long addr; + ssize_t count; + + /* The instance name contains the sb: + * lustre-client-aacfe000 + */ + tmp = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!tmp || !*(++tmp)) + GOTO(out, err = -EINVAL); + + if (sscanf(tmp, "%lx", &addr) != 1) + GOTO(out, err = -EINVAL); + + lsi = s2lsi((struct super_block *)addr); + /* This better be a real Lustre superblock! */ + LASSERT(lsi->lsi_lmd->lmd_magic == LMD_MAGIC); + + count = class_modify_config(lcfg, PARAM_LLITE, + lsi->lsi_kobj); + err = count < 0 ? count : 0; + GOTO(out, err); } else if ((class_match_param(lustre_cfg_string(lcfg, 1), PARAM_SYS, &tmp) == 0)) { /* Global param settings */ - err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + err = class_set_global(tmp); /* * Client or server should not fail to mount if * it hits an unknown configuration parameter. 
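The parameter plumbing above comes down to plain string splitting: process_param2_config() cuts "subsys.name=value" at the first '.' or '=' with strcspn(), treats a bare "name=value" (no subsystem) as a top-level tunable, and otherwise separates name from value with strsep() before emitting the uevent. A standalone sketch of that split as an ordinary userspace program (strsep() here is the glibc/BSD function with the same behaviour as the kernel helper; the parameter string is only an example):

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          char buf[] = "llite.max_read_ahead_mb=64";   /* illustrative */
          size_t len = strcspn(buf, ".=");             /* length of "llite" */
          char *value = buf;
          char *name;

          if (buf[len] == '=') {
                  /* no subsystem prefix: a top-level tunable such as "timeout=40" */
                  printf("global: %s\n", buf);
                  return 0;
          }

          name = strsep(&value, "=");    /* name="llite.max_read_ahead_mb" */
          printf("subsys=%.*s name=%s value=%s\n", (int)len, buf, name, value);
          return 0;
  }

class_modify_config() further down does the matching on the receiving side: the text before '=' is compared against the kobject type's default_attrs table, and anything that does not match an attribute is turned into a uevent instead of being rejected.
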
*/ - if (err != 0) + if (err < 0) CWARN("Ignoring unknown param %s\n", tmp); GOTO(out, err = 0); +#ifdef HAVE_SERVER_SUPPORT } else if ((class_match_param(lustre_cfg_string(lcfg, 1), PARAM_QUOTA, &tmp) == 0) && quota_process_config) { err = (*quota_process_config)(lcfg); GOTO(out, err); +#endif /* HAVE_SERVER_SUPPORT */ } break; @@ -1294,7 +1265,6 @@ int class_process_config(struct lustre_cfg *lcfg) GOTO(out, err = -EINVAL); } - switch(lcfg->lcfg_command) { case LCFG_SETUP: { err = class_setup(obd, lcfg); @@ -1334,12 +1304,47 @@ int class_process_config(struct lustre_cfg *lcfg) err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); GOTO(out, err = 0); } - default: { - err = obd_process_config(obd, sizeof(*lcfg), lcfg); - GOTO(out, err); + /* Process config log ADD_MDC record twice to add MDC also to LOV + * for Data-on-MDT: + * + * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1 + * 4:lustre-MDT0000-mdc_UUID + */ + case LCFG_ADD_MDC: { + struct obd_device *lov_obd; + char *clilmv; + + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + if (err) + GOTO(out, err); + + /* make sure this is client LMV log entry */ + clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv"); + if (!clilmv) + GOTO(out, err); + + /* replace 'lmv' with 'lov' name to address LOV device and + * process llog record to add MDC there. */ + clilmv[4] = 'o'; + lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (lov_obd == NULL) { + err = -ENOENT; + CERROR("%s: Cannot find LOV by %s name, rc = %d\n", + obd->obd_name, lustre_cfg_string(lcfg, 0), err); + } else { + err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg); + } + /* restore 'lmv' name */ + clilmv[4] = 'm'; + GOTO(out, err); + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); } } + EXIT; out: if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { CWARN("Ignoring error %d on optional command %#x\n", err, @@ -1350,97 +1355,89 @@ int class_process_config(struct lustre_cfg *lcfg) } EXPORT_SYMBOL(class_process_config); -int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, - struct lustre_cfg *lcfg, void *data) +ssize_t class_modify_config(struct lustre_cfg *lcfg, const char *prefix, + struct kobject *kobj) { - struct lprocfs_vars *var; - struct file fakefile = {}; - struct seq_file fake_seqfile; - char *key, *sval; - int i, keylen, vallen; - int matched = 0, j = 0; - int rc = 0; - int skip = 0; - ENTRY; + struct kobj_type *typ; + ssize_t count = 0; + int i; if (lcfg->lcfg_command != LCFG_PARAM) { CERROR("Unknown command: %d\n", lcfg->lcfg_command); - RETURN(-EINVAL); + return -EINVAL; } - /* fake a seq file so that var->fops->proc_write can work... */ - lprocfs_file_set_kernel(&fakefile); - fakefile.private_data = &fake_seqfile; - fake_seqfile.private = data; - /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt - or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar - or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + typ = get_ktype(kobj); + if (!typ || !typ->default_attrs) + return -ENODEV; + + print_lustre_cfg(lcfg); + + /* + * e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 + */ for (i = 1; i < lcfg->lcfg_bufcount; i++) { + struct attribute *attr; + size_t keylen; + char *value; + char *key; + int j; + key = lustre_cfg_buf(lcfg, i); /* Strip off prefix */ if (class_match_param(key, prefix, &key)) /* If the prefix doesn't match, return error so we - * can pass it down the stack */ - RETURN(-ENOSYS); - sval = strchr(key, '='); - if (!sval || *(sval + 1) == 0) { + * can pass it down the stack + */ + return -EINVAL; + + value = strchr(key, '='); + if (!value || *(value + 1) == 0) { CERROR("%s: can't parse param '%s' (missing '=')\n", lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, i)); - /* rc = -EINVAL; continue parsing other params */ + /* continue parsing other params */ continue; } - keylen = sval - key; - sval++; - vallen = strlen(sval); - matched = 0; - j = 0; - /* Search proc entries */ - while (lvars[j].name) { - var = &lvars[j]; - if (class_match_param(key, var->name, NULL) == 0 && - keylen == strlen(var->name)) { - matched++; - rc = -EROFS; - - if (var->fops && var->fops->proc_write) { - rc = (var->fops->proc_write)(&fakefile, - sval, - vallen, - NULL); - } + keylen = value - key; + value++; + + attr = NULL; + for (j = 0; typ->default_attrs[j]; j++) { + if (!strncmp(typ->default_attrs[j]->name, key, + keylen)) { + attr = typ->default_attrs[j]; break; } - j++; } - if (!matched) { - /* It was upgraded from old MDT/OST device, - * ignore the obsolete "sec_level" parameter. */ - if (strncmp("sec_level", key, keylen) == 0) - continue; - CERROR("%s: unknown config parameter '%s'\n", - lustre_cfg_string(lcfg, 0), - lustre_cfg_string(lcfg, i)); - /* rc = -EINVAL; continue parsing other params */ - skip++; - } else if (rc < 0) { - CERROR("%s: error writing parameter '%s': rc = %d\n", - lustre_cfg_string(lcfg, 0), key, rc); - rc = 0; + if (!attr) { + char *envp[3]; + + envp[0] = kasprintf(GFP_KERNEL, "PARAM=%s.%s.%.*s", + kobject_name(kobj->parent), + kobject_name(kobj), + (int) keylen, key); + envp[1] = kasprintf(GFP_KERNEL, "SETTING=%s", value); + envp[2] = NULL; + + if (kobject_uevent_env(kobj, KOBJ_CHANGE, envp)) { + CERROR("%s: failed to send uevent %s\n", + kobject_name(kobj), key); + } + + for (i = 0; i < ARRAY_SIZE(envp); i++) + kfree(envp[i]); } else { - CDEBUG(D_CONFIG, "%s: set parameter '%s'\n", - lustre_cfg_string(lcfg, 0), key); + count += lustre_attr_store(kobj, attr, value, + strlen(value)); } } - - if (rc > 0) - rc = 0; - if (!rc && skip) - rc = skip; - RETURN(rc); + return count; } -EXPORT_SYMBOL(class_process_proc_param); +EXPORT_SYMBOL(class_modify_config); /* * Supplemental functions for config logs, it allocates lustre_cfg @@ -1542,9 +1539,9 @@ int class_config_llog_handler(const struct lu_env *env, } } /* A config command without a start marker before it is - illegal (post 146) */ - if (!(cfg->cfg_flags & CFG_F_COMPAT146) && - !(cfg->cfg_flags & CFG_F_MARKER) && + * illegal + */ + if (!(cfg->cfg_flags & CFG_F_MARKER) && (lcfg->lcfg_command != LCFG_MARKER)) { CWARN("Skip config outside markers, (inst: %016lx, uuid: %s, flags: %#x)\n", cfg->cfg_instance, @@ -1793,55 +1790,6 @@ int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, } EXPORT_SYMBOL(class_config_parse_llog); -static struct lcfg_type_data { - __u32 ltd_type; - char *ltd_name; - char *ltd_bufs[4]; -} lcfg_data_table[] = { - { LCFG_ATTACH, "attach", { "type", "UUID", "3", "4" } }, 
- { LCFG_DETACH, "detach", { "1", "2", "3", "4" } }, - { LCFG_SETUP, "setup", { "UUID", "node", "options", "failout" } }, - { LCFG_CLEANUP, "cleanup", { "1", "2", "3", "4" } }, - { LCFG_ADD_UUID, "add_uuid", { "node", "2", "3", "4" } }, - { LCFG_DEL_UUID, "del_uuid", { "1", "2", "3", "4" } }, - { LCFG_MOUNTOPT, "new_profile", { "name", "lov", "lmv", "4" } }, - { LCFG_DEL_MOUNTOPT, "del_mountopt", { "1", "2", "3", "4" } , }, - { LCFG_SET_TIMEOUT, "set_timeout", { "parameter", "2", "3", "4" } }, - { LCFG_SET_UPCALL, "set_upcall", { "1", "2", "3", "4" } }, - { LCFG_ADD_CONN, "add_conn", { "node", "2", "3", "4" } }, - { LCFG_DEL_CONN, "del_conn", { "1", "2", "3", "4" } }, - { LCFG_LOV_ADD_OBD, "add_osc", { "ost", "index", "gen", "UUID" } }, - { LCFG_LOV_DEL_OBD, "del_osc", { "1", "2", "3", "4" } }, - { LCFG_PARAM, "conf_param", { "parameter", "value", "3", "4" } }, - { LCFG_MARKER, "marker", { "1", "2", "3", "4" } }, - { LCFG_LOG_START, "log_start", { "1", "2", "3", "4" } }, - { LCFG_LOG_END, "log_end", { "1", "2", "3", "4" } }, - { LCFG_LOV_ADD_INA, "add_osc_inactive", { "1", "2", "3", "4" } }, - { LCFG_ADD_MDC, "add_mdc", { "mdt", "index", "gen", "UUID" } }, - { LCFG_DEL_MDC, "del_mdc", { "1", "2", "3", "4" } }, - { LCFG_SPTLRPC_CONF, "security", { "parameter", "2", "3", "4" } }, - { LCFG_POOL_NEW, "new_pool", { "fsname", "pool", "3", "4" } }, - { LCFG_POOL_ADD, "add_pool", { "fsname", "pool", "ost", "4" } }, - { LCFG_POOL_REM, "remove_pool", { "fsname", "pool", "ost", "4" } }, - { LCFG_POOL_DEL, "del_pool", { "fsname", "pool", "3", "4" } }, - { LCFG_SET_LDLM_TIMEOUT, "set_ldlm_timeout", - { "parameter", "2", "3", "4" } }, - { LCFG_SET_PARAM, "set_param", { "parameter", "value", "3", "4" } }, - { 0, NULL, { NULL, NULL, NULL, NULL } } -}; - -static struct lcfg_type_data *lcfg_cmd2data(__u32 cmd) -{ - int i = 0; - - while (lcfg_data_table[i].ltd_type != 0) { - if (lcfg_data_table[i].ltd_type == cmd) - return &lcfg_data_table[i]; - i++; - } - return NULL; -} - /** * Parse config record and output dump in supplied buffer. * diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c index e3390507d900e..3c7a51ffd38a1 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,11 +43,10 @@ #include #include -#include #include #include #include -#include +#include static int (*client_fill_super)(struct super_block *sb, struct vfsmount *mnt); @@ -220,7 +219,7 @@ int lustre_start_mgc(struct super_block *sb) struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; - struct obd_uuid *uuid; + struct obd_uuid *uuid = NULL; class_uuid_t uuidc; lnet_nid_t nid; char nidstr[LNET_NIDSTR_SIZE]; @@ -243,7 +242,7 @@ int lustre_start_mgc(struct super_block *sb) struct lnet_process_id id; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + if (id.nid == LNET_NID_LO_0) continue; nid = id.nid; i++; @@ -409,7 +408,6 @@ int lustre_start_mgc(struct super_block *sb) rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, niduuid, NULL, NULL); - OBD_FREE_PTR(uuid); if (rc) GOTO(out_free, rc); @@ -470,7 +468,7 @@ int lustre_start_mgc(struct super_block *sb) lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + rc = obd_connect(NULL, &exp, obd, uuid, data, NULL); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); @@ -485,6 +483,8 @@ int lustre_start_mgc(struct super_block *sb) out_free: mutex_unlock(&mgc_start_lock); + if (uuid) + OBD_FREE_PTR(uuid); if (data) OBD_FREE_PTR(data); if (mgcname) @@ -591,7 +591,7 @@ static struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) /* Default umount style */ lsi->lsi_flags = LSI_UMOUNT_FAILOVER; INIT_LIST_HEAD(&lsi->lsi_lwp_list); - spin_lock_init(&lsi->lsi_lwp_lock); + mutex_init(&lsi->lsi_lwp_mutex); RETURN(lsi); } @@ -1156,37 +1156,52 @@ static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) * make \a *endh point to the string starting with the delimiter. The commas * in expression list [...] will be skipped. 
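The behaviour this comment describes (find the first ',' or ':' that is not inside a "[...]" expression list) boils down to the short scan below; the rewritten lmd_find_delimiter() in the next hunk implements the same idea with strcspn() and additionally warns about malformed strings. Shown as a self-contained userspace sketch with an illustrative mount string:

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  /* Return true and set *endh to the first ',' or ':' outside "[...]". */
  static bool find_delimiter(const char *buf, const char **endh)
  {
          const char *c = buf;

          while (*c) {
                  if (*c == ',' || *c == ':') {
                          if (endh)
                                  *endh = c;
                          return true;
                  }
                  if (*c == '[') {
                          c = strchr(c, ']');
                          if (!c)
                                  return false;   /* unterminated list */
                  }
                  c++;
          }
          return false;
  }

  int main(void)
  {
          const char *opt = "10.0.0.[1-3]@tcp:/testfs";   /* illustrative */
          const char *end;

          if (find_delimiter(opt, &end))
                  printf("delimiter '%c' at offset %td\n", *end, end - opt);
          return 0;
  }
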
* - * \param[in] buf a delimiter-separated string - * \param[in] endh a pointer to a pointer that will point to the string - * starting with the delimiter + * @buf a delimiter-separated string + * @endh a pointer to a pointer that will point to the string + * starting with the delimiter * - * \retval 0 if delimiter is found - * \retval 1 if delimiter is not found + * RETURNS true if delimiter is found, false if delimiter is not found */ -static int lmd_find_delimiter(char *buf, char **endh) +static bool lmd_find_delimiter(char *buf, char **endh) { char *c = buf; - int skip = 0; - - if (buf == NULL) - return 1; + size_t pos; + bool found; + + if (!buf) + return false; +try_again: + if (*c == ',' || *c == ':') + return true; + + pos = strcspn(c, "[:,]"); + if (!pos) + return false; + + /* Not a valid mount string */ + if (*c == ']') { + CWARN("invalid mount string format\n"); + return false; + } - while (*c != '\0') { - if (*c == '[') - skip++; - else if (*c == ']') - skip--; + c += pos; + if (*c == '[') { + c = strchr(c, ']'); - if ((*c == ',' || *c == ':') && skip == 0) { - if (endh != NULL) - *endh = c; - return 0; + /* invalid mount string */ + if (!c) { + CWARN("invalid mount string format\n"); + return false; } - c++; + goto try_again; } - return 1; + found = *c != '\0'; + if (found && endh) + *endh = c; + + return found; } /** @@ -1215,7 +1230,7 @@ static int lmd_parse_nidlist(char *buf, char **endh) if (*buf == ' ' || *buf == '/' || *buf == '\0') return 1; - if (lmd_find_delimiter(buf, &endp) != 0) + if (!lmd_find_delimiter(buf, &endp)) endp = buf + strlen(buf); tmp = *endp; @@ -1360,9 +1375,8 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) } else if (strncmp(s1, "param=", 6) == 0) { size_t length, params_length; char *tail = s1; - if (lmd_find_delimiter(s1 + 6, &tail) != 0) - length = strlen(s1); - else { + + if (lmd_find_delimiter(s1 + 6, &tail)) { char *param_str = tail + 1; int supplementary = 1; while (lmd_parse_nidlist(param_str, @@ -1370,6 +1384,8 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) supplementary = 0; } length = param_str - s1 - supplementary; + } else { + length = strlen(s1); } length -= 6; params_length = strlen(lmd->lmd_params); @@ -1398,6 +1414,15 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) rc = lmd_parse_network(lmd, s1 + 8); if (rc) goto invalid; + + /* check if LNet dynamic peer discovery is activated */ + if (LNetGetPeerDiscoveryStatus()) { + CERROR("LNet Dynamic Peer Discovery is enabled " + "on this node. 
'network' mount option " + "cannot be taken into account.\n"); + goto invalid; + } + clear++; } @@ -1476,6 +1501,8 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) s1 = options + strlen(options) - 1; while (s1 >= options && (*s1 == ',' || *s1 == ' ')) *s1-- = 0; + while (*options && (*options == ',' || *options == ' ')) + options++; if (*options != 0) { /* Freed in lustre_free_lsi */ OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); @@ -1648,7 +1675,12 @@ static struct file_system_type lustre_fs_type = { .get_sb = lustre_get_sb, #endif .kill_sb = lustre_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, + .fs_flags = FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE | +#ifdef HAVE_SERVER_SUPPORT + FS_REQUIRES_DEV, +#else + 0, +#endif }; MODULE_ALIAS_FS("lustre"); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c index 6bec75198e190..b23a4ccf0bd9d 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_mount_server.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -52,14 +52,11 @@ #include #endif -#include -#include - #include #include -#include +#include #include -#include +#include #include #include @@ -511,7 +508,7 @@ struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) } snprintf(lwp_name, sizeof(lwp_name), "%s-MDT%04x", fsname, idx); - spin_lock(&lsi->lsi_lwp_lock); + mutex_lock(&lsi->lsi_lwp_mutex); list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { char *ptr = strstr(lwp->obd_name, lwp_name); @@ -520,7 +517,7 @@ struct obd_export *lustre_find_lwp_by_index(const char *dev, __u32 idx) break; } } - spin_unlock(&lsi->lsi_lwp_lock); + mutex_unlock(&lsi->lsi_lwp_mutex); err_lmi: server_put_mount(dev, false); @@ -681,9 +678,9 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); if (rc == 0) { obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; - spin_lock(&lsi->lsi_lwp_lock); + mutex_lock(&lsi->lsi_lwp_mutex); list_add_tail(&obd->obd_lwp_list, &lsi->lsi_lwp_list); - spin_unlock(&lsi->lsi_lwp_lock); + mutex_unlock(&lsi->lsi_lwp_mutex); } else { CERROR("%s: connect failed: rc = %d\n", lwpname, rc); } @@ -951,6 +948,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) if (bufs == NULL) GOTO(out, rc = -ENOMEM); + mutex_lock(&lsi->lsi_lwp_mutex); list_for_each_entry(lwp, &lsi->lsi_lwp_list, obd_lwp_list) { struct lustre_cfg *lcfg; @@ -963,8 +961,10 @@ static int lustre_disconnect_lwp(struct super_block *sb) lustre_cfg_bufs_set_string(bufs, 1, NULL); OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); - if (!lcfg) - GOTO(out, rc = -ENOMEM); + if (!lcfg) { + rc = -ENOMEM; + break; + } lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs); /* Disconnect import first. 
NULL is passed for the '@env', @@ -979,6 +979,7 @@ static int lustre_disconnect_lwp(struct super_block *sb) rc1 = rc; } } + mutex_unlock(&lsi->lsi_lwp_mutex); GOTO(out, rc); @@ -1004,18 +1005,23 @@ static int lustre_stop_lwp(struct super_block *sb) int rc1 = 0; ENTRY; + mutex_lock(&lsi->lsi_lwp_mutex); while (!list_empty(&lsi->lsi_lwp_list)) { lwp = list_entry(lsi->lsi_lwp_list.next, struct obd_device, obd_lwp_list); list_del_init(&lwp->obd_lwp_list); lwp->obd_force = 1; + mutex_unlock(&lsi->lsi_lwp_mutex); + rc = class_manual_cleanup(lwp); if (rc != 0) { CERROR("%s: fail to stop LWP: rc = %d\n", lwp->obd_name, rc); rc1 = rc; } + mutex_lock(&lsi->lsi_lwp_mutex); } + mutex_unlock(&lsi->lsi_lwp_mutex); RETURN(rc1 != 0 ? rc1 : rc); } @@ -1133,7 +1139,7 @@ static int server_lsi2mti(struct lustre_sb_info *lsi, mti->mti_nid_count = 0; while (LNetGetId(i++, &id) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + if (id.nid == LNET_NID_LO_0) continue; /* server use --servicenode param, only allow specified @@ -1682,6 +1688,63 @@ static int server_statfs(struct dentry *dentry, struct kstatfs *buf) RETURN(0); } +#ifdef HAVE_SUPEROPS_USE_DENTRY +int server_show_options(struct seq_file *seq, struct dentry *dentry) +#else +int server_show_options(struct seq_file *seq, struct vfsmount *vfs) +#endif +{ + struct lustre_sb_info *lsi; + struct lustre_mount_data *lmd; + +#ifdef HAVE_SUPEROPS_USE_DENTRY + LASSERT(seq != NULL && dentry != NULL); + lsi = s2lsi(dentry->d_sb); +#else + LASSERT(seq != NULL && vfs != NULL); + lsi = s2lsi(vfs->mnt_sb); +#endif + + lmd = lsi->lsi_lmd; + seq_printf(seq, ",svname=%s", lmd->lmd_profile); + + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + seq_puts(seq, ",abort_recov"); + + if (lmd->lmd_flags & LMD_FLG_NOIR) + seq_puts(seq, ",noir"); + + if (lmd->lmd_flags & LMD_FLG_NOSVC) + seq_puts(seq, ",nosvc"); + + if (lmd->lmd_flags & LMD_FLG_NOMGS) + seq_puts(seq, ",nomgs"); + + if (lmd->lmd_flags & LMD_FLG_NOSCRUB) + seq_puts(seq, ",noscrub"); + if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + seq_puts(seq, ",skip_lfsck"); + + if (lmd->lmd_flags & LMD_FLG_DEV_RDONLY) + seq_puts(seq, ",rdonly_dev"); + + if (lmd->lmd_flags & LMD_FLG_MGS) + seq_puts(seq, ",mgs"); + + if (lmd->lmd_mgs != NULL) + seq_printf(seq, ",mgsnode=%s", lmd->lmd_mgs); + + if (lmd->lmd_osd_type != NULL) + seq_printf(seq, ",osd=%s", lmd->lmd_osd_type); + + if (lmd->lmd_opts != NULL) { + seq_putc(seq, ','); + seq_puts(seq, lmd->lmd_opts); + } + + RETURN(0); +} + /** The operations we support directly on the superblock: * mount, umount, and df. 
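With .show_options wired into server_ops just below, a server mount now reports its Lustre-specific options through /proc/mounts instead of only the generic VFS flags. The exact set depends on the lmd flags at mount time; an MGS+MDT mounted on an ldiskfs OSD might show a line along these lines (device, mount point and values are illustrative):

  /dev/sdb /mnt/mdt lustre rw,svname=testfs-MDT0000,mgs,osd=osd-ldiskfs 0 0
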
*/ @@ -1689,6 +1752,7 @@ static struct super_operations server_ops = { .put_super = server_put_super, .umount_begin = server_umount_begin, /* umount -f */ .statfs = server_statfs, + .show_options = server_show_options, }; /* @@ -1716,6 +1780,43 @@ static ssize_t lustre_listxattr(struct dentry *d_entry, char *name, return -EOPNOTSUPP; } +static bool is_cmd_supported(unsigned int command) +{ + switch (command) { + case FITRIM: + return true; + default: + return false; + } + + return false; +} + +static long server_ioctl(struct file *filp, unsigned int command, + unsigned long arg) +{ + struct file active_filp; + struct inode *inode = file_inode(filp); + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct super_block *dd_sb = dt_mnt_sb_get(lsi->lsi_dt_dev); + struct inode *active_inode; + int err = -EOPNOTSUPP; + + if (IS_ERR(dd_sb) || !is_cmd_supported(command)) + return err; + + active_inode = igrab(dd_sb->s_root->d_inode); + if (!active_inode) + return -EACCES; + + active_filp.f_inode = active_inode; + if (active_inode->i_fop && active_inode->i_fop->unlocked_ioctl) + err = active_inode->i_fop->unlocked_ioctl(&active_filp, + command, arg); + iput(active_inode); + return err; +} + static const struct inode_operations server_inode_operations = { #ifdef HAVE_IOP_XATTR .setxattr = lustre_setxattr, @@ -1724,6 +1825,10 @@ static const struct inode_operations server_inode_operations = { .listxattr = lustre_listxattr, }; +static const struct file_operations server_file_operations = { + .unlocked_ioctl = server_ioctl, +}; + #define log2(n) ffz(~(n)) #define LUSTRE_SUPER_MAGIC 0x0BD00BD1 @@ -1752,6 +1857,7 @@ static int server_fill_super_common(struct super_block *sb) /* apparently we need to be a directory for the mount to finish */ root->i_mode = S_IFDIR; root->i_op = &server_inode_operations; + root->i_fop = &server_file_operations; sb->s_root = d_make_root(root); if (!sb->s_root) { CERROR("%s: can't make root dentry\n", sb->s_id); @@ -1764,10 +1870,10 @@ static int server_fill_super_common(struct super_block *sb) static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) { struct lustre_mount_data *lmd = lsi->lsi_lmd; - struct obd_device *obd; - struct dt_device_param p; - char flagstr[16]; - int rc; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[20 + 1 + 10 + 1]; + int rc; ENTRY; CDEBUG(D_MOUNT, @@ -1777,7 +1883,7 @@ static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); strcat(lsi->lsi_osd_uuid, "_UUID"); - sprintf(flagstr, "%lu:%lu", mflags, (unsigned long) lmd->lmd_flags); + snprintf(flagstr, sizeof(flagstr), "%lu:%u", mflags, lmd->lmd_flags); obd = class_name2obd(lsi->lsi_osd_obdname); if (obd == NULL) { @@ -1840,8 +1946,10 @@ int server_fill_super(struct super_block *sb) OBD_RACE(OBD_FAIL_TGT_MOUNT_RACE); rc = lsi_prepare(lsi); - if (rc) + if (rc) { + lustre_put_lsi(sb); RETURN(rc); + } /* Start low level OSD */ rc = osd_start(lsi, sb->s_flags); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c new file mode 100644 index 0000000000000..53b0b3130b717 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obd_sysfs.c @@ -0,0 +1,535 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_sysfs.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct static_lustre_uintvalue_attr { + struct { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len); + } u; + int *value; +}; + +static ssize_t static_uintvalue_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + + return sprintf(buf, "%d\n", *lattr->value); +} + +static ssize_t static_uintvalue_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct static_lustre_uintvalue_attr *lattr = (void *)attr; + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + *lattr->value = val; + + return count; +} + +#define LUSTRE_STATIC_UINT_ATTR(name, value) \ +static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ + { __ATTR(name, 0644, static_uintvalue_show, \ + static_uintvalue_store), value } + +LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); +LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); +LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); +LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); +LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); +LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); +LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); +LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); +LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); + +#ifdef HAVE_SERVER_SUPPORT +LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); +LUSTRE_STATIC_UINT_ATTR(bulk_timeout, &bulk_timeout); +#endif + +static ssize_t memused_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_sum()); +} +LUSTRE_RO_ATTR(memused); + +static ssize_t memused_max_show(struct kobject *kobj, struct attribute *attr, + 
char *buf) +{ + return sprintf(buf, "%llu\n", obd_memory_max()); +} +LUSTRE_RO_ATTR(memused_max); + +static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); +} + +static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ + + if (val > ((cfs_totalram_pages() / 10) * 9)) { + /* Somebody wants to assign too much memory to dirty pages */ + return -EINVAL; + } + + if (val < 4 << (20 - PAGE_SHIFT)) { + /* Less than 4 Mb for dirty cache is also bad */ + return -EINVAL; + } + + obd_max_dirty_pages = val; + + return count; +} +LUSTRE_RW_ATTR(max_dirty_mb); + +static ssize_t version_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); +} + +static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ +#ifdef ENABLE_PINGER + const char *state = "on"; +#else + const char *state = "off"; +#endif + return sprintf(buf, "%s\n", state); +} + +/** + * Check all obd devices health + * + * \param kobj + * \param buf [in] + * + * \retval number of characters printed if healthy + */ +static ssize_t +health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + bool healthy = true; + size_t len = 0; + int i; + + if (libcfs_catastrophe) { + len = sprintf(buf, "LBUG\n"); + healthy = false; + } + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + len = sprintf(buf, "device %s reported unhealthy\n", + obd->obd_name); + healthy = false; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (healthy) + len = sprintf(buf, "healthy\n"); + else + len = sprintf(buf, "NOT HEALTHY\n"); + + return len; +} + +static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_var)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); + return rc; +} + +static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + + memcpy(obd_jobid_var, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_var[count - 1] == '\n') + obd_jobid_var[count - 1] = 0; + + return count; +} + +static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int rc = 0; + + if (strlen(obd_jobid_name)) + rc = snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_name); + return rc; +} + +static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + if (!count || count > LUSTRE_JOBID_SIZE) + return -EINVAL; + + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0 && + !strchr(buffer, '%')) { + lustre_jobid_clear(buffer); + return count; + } + + /* clear previous value */ + 
memset(obd_jobid_name, 0, LUSTRE_JOBID_SIZE); + + memcpy(obd_jobid_name, buffer, count); + + /* Trim the trailing '\n' if any */ + if (obd_jobid_name[count - 1] == '\n') { + /* Don't echo just a newline */ + if (count == 1) + return -EINVAL; + obd_jobid_name[count - 1] = 0; + } + + return count; +} + +/* Root for /sys/kernel/debug/lustre */ +struct dentry *debugfs_lustre_root; +EXPORT_SYMBOL_GPL(debugfs_lustre_root); + +#ifdef CONFIG_PROC_FS +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root; +EXPORT_SYMBOL(proc_lustre_root); +#else +#define lprocfs_base NULL +#endif /* CONFIG_PROC_FS */ + +LUSTRE_RO_ATTR(version); +LUSTRE_RO_ATTR(pinger); +LUSTRE_RO_ATTR(health_check); +LUSTRE_RW_ATTR(jobid_var); +LUSTRE_RW_ATTR(jobid_name); + +static struct attribute *lustre_attrs[] = { + &lustre_attr_version.attr, + &lustre_attr_pinger.attr, + &lustre_attr_health_check.attr, + &lustre_attr_jobid_name.attr, + &lustre_attr_jobid_var.attr, + &lustre_sattr_timeout.u.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_sattr_debug_peer_on_timeout.u.attr, + &lustre_sattr_dump_on_timeout.u.attr, + &lustre_sattr_dump_on_eviction.u.attr, + &lustre_sattr_at_min.u.attr, + &lustre_sattr_at_max.u.attr, + &lustre_sattr_at_extra.u.attr, + &lustre_sattr_at_early_margin.u.attr, + &lustre_sattr_at_history.u.attr, + &lustre_attr_memused_max.attr, + &lustre_attr_memused.attr, +#ifdef HAVE_SERVER_SUPPORT + &lustre_sattr_ldlm_timeout.u.attr, + &lustre_sattr_bulk_timeout.u.attr, +#endif + &lustre_sattr_lbug_on_eviction.u.attr, + NULL, +}; + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + return 0; +} + +static const struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = inode->i_private; + return 0; +} + +static const struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct kset *lustre_kset; +EXPORT_SYMBOL_GPL(lustre_kset); + +static struct attribute_group lustre_attr_group = { + .attrs = lustre_attrs, +}; + +ssize_t class_set_global(const char *param) +{ + const char *value = strchr(param, '=') + 1; + size_t off = value - param - 1; + ssize_t count = -ENOENT; + int i; + + for (i = 
0; lustre_attrs[i]; i++) { + if (!strncmp(lustre_attrs[i]->name, param, off)) { + count = lustre_attr_store(&lustre_kset->kobj, + lustre_attrs[i], value, + strlen(value)); + break; + } + } + return count; +} + +int class_procfs_init(void) +{ + struct proc_dir_entry *entry; + struct dentry *file; + int rc = -ENOMEM; + + ENTRY; + + lustre_kset = kset_create_and_add("lustre", NULL, fs_kobj); + if (!lustre_kset) + goto out; + + /* Create the files associated with this kobject */ + rc = sysfs_create_group(&lustre_kset->kobj, &lustre_attr_group); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + rc = jobid_cache_init(); + if (rc) { + kset_unregister(lustre_kset); + goto out; + } + + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); + if (IS_ERR_OR_NULL(debugfs_lustre_root)) { + rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) + : -ENOMEM; + debugfs_lustre_root = NULL; + kset_unregister(lustre_kset); + goto out; + } + + file = debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, + &obd_device_list_fops); + if (IS_ERR_OR_NULL(file)) { + rc = file ? PTR_ERR(file) : -ENOMEM; + debugfs_remove(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + entry = lprocfs_register("fs/lustre", NULL, NULL, NULL); + if (IS_ERR(entry)) { + rc = PTR_ERR(entry); + CERROR("cannot create '/proc/fs/lustre': rc = %d\n", rc); + debugfs_remove_recursive(debugfs_lustre_root); + kset_unregister(lustre_kset); + goto out; + } + + proc_lustre_root = entry; +out: + RETURN(rc); +} + +int class_procfs_clean(void) +{ + ENTRY; + + debugfs_remove_recursive(debugfs_lustre_root); + + debugfs_lustre_root = NULL; + jobid_cache_fini(); + + if (proc_lustre_root) + lprocfs_remove(&proc_lustre_root); + + sysfs_remove_group(&lustre_kset->kobj, &lustre_attr_group); + + kset_unregister(lustre_kset); + + RETURN(0); +} diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c index 7d14851f799f0..0367cfd1bef67 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/obdo.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,15 +43,14 @@ # include #endif #include -#include #include void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) { - dst->o_parent_oid = fid_oid(parent); - dst->o_parent_seq = fid_seq(parent); - dst->o_parent_ver = fid_ver(parent); - dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLPARENT | OBD_MD_FLFID; } EXPORT_SYMBOL(obdo_set_parent_fid); @@ -62,8 +61,10 @@ void obdo_set_o_projid(struct obdo *dst, u32 projid) } EXPORT_SYMBOL(obdo_set_o_projid); -/* WARNING: the file systems must take care not to tinker with - attributes they don't manage (such as blocks). */ +/* + * WARNING: the file systems must take care not to tinker with + * attributes they don't manage (such as blocks). 
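To summarize where class_procfs_init() above puts things: the kset registered under fs_kobj becomes /sys/fs/lustre and carries the attribute group (version, health_check, jobid_var, jobid_name, timeout, max_dirty_mb and the other static tunables), the debugfs directory becomes /sys/kernel/debug/lustre (with debugfs mounted in its usual place) and holds the read-only 'devices' list showing index, status (ST/IN/UP/AT/--), type, name, uuid and refcount per device, and /proc/fs/lustre stays around for the remaining lprocfs entries. One conversion worth spelling out is max_dirty_mb_store(): with 4 KiB pages PAGE_SHIFT is 12, so one MiB is 1 << (20 - 12) = 256 pages; writing 512 stores 131072 pages in obd_max_dirty_pages, and the store rejects values above nine tenths of totalram_pages or below 4 MiB. Quick smoke test once the module is loaded (paths as created above, the written value is only an example):

  cat /sys/fs/lustre/version
  echo 512 > /sys/fs/lustre/max_dirty_mb
  cat /sys/kernel/debug/lustre/devices
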
+ */ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) { u64 newvalid = 0; @@ -73,40 +74,40 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, u64 valid) valid, (s64) src->i_mtime.tv_sec, (s64) src->i_ctime.tv_sec); - if (valid & OBD_MD_FLATIME) { + if (valid & OBD_MD_FLATIME) { dst->o_atime = src->i_atime.tv_sec; - newvalid |= OBD_MD_FLATIME; - } - if (valid & OBD_MD_FLMTIME) { + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { dst->o_mtime = src->i_mtime.tv_sec; - newvalid |= OBD_MD_FLMTIME; - } - if (valid & OBD_MD_FLCTIME) { + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { dst->o_ctime = src->i_ctime.tv_sec; - newvalid |= OBD_MD_FLCTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->o_size = i_size_read(src); - newvalid |= OBD_MD_FLSIZE; - } - if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = src->i_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ dst->o_blksize = 1U << src->i_blkbits; - newvalid |= OBD_MD_FLBLKSZ; - } - if (valid & OBD_MD_FLTYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (src->i_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (src->i_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } if (valid & OBD_MD_FLUID) { dst->o_uid = from_kuid(&init_user_ns, src->i_uid); newvalid |= OBD_MD_FLUID; @@ -126,39 +127,39 @@ EXPORT_SYMBOL(obdo_from_inode); void obdo_cpy_md(struct obdo *dst, const struct obdo *src, u64 valid) { CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n", - POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); - if (valid & OBD_MD_FLATIME) - dst->o_atime = src->o_atime; - if (valid & OBD_MD_FLMTIME) - dst->o_mtime = src->o_mtime; - if (valid & OBD_MD_FLCTIME) - dst->o_ctime = src->o_ctime; - if (valid & OBD_MD_FLSIZE) - dst->o_size = src->o_size; - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ - dst->o_blocks = src->o_blocks; - if (valid & OBD_MD_FLBLKSZ) - dst->o_blksize = src->o_blksize; - if (valid & OBD_MD_FLTYPE) - dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); - if (valid & OBD_MD_FLMODE) - dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); - if (valid & OBD_MD_FLUID) - dst->o_uid = src->o_uid; - if (valid & OBD_MD_FLGID) - dst->o_gid = src->o_gid; - if (valid & OBD_MD_FLFLAGS) - dst->o_flags = src->o_flags; - if (valid & OBD_MD_FLFID) { - dst->o_parent_seq = src->o_parent_seq; - dst->o_parent_ver = src->o_parent_ver; - } - if (valid & OBD_MD_FLGENER) - dst->o_parent_oid = src->o_parent_oid; - if (valid & OBD_MD_FLHANDLE) - dst->o_handle = src->o_handle; - - dst->o_valid |= valid; + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; 
+ if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLPARENT) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + + dst->o_valid |= valid; } EXPORT_SYMBOL(obdo_cpy_md); @@ -168,39 +169,48 @@ void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) ostid_set_seq_mdt0(&ioobj->ioo_oid); - /* Since 2.4 this does not contain o_mode in the low 16 bits. - * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + /* + * Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs + */ ioobj->ioo_max_brw = 0; } EXPORT_SYMBOL(obdo_to_ioobj); -/** +/* * Create an obdo to send over the wire */ void lustre_set_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *wobdo, - const struct obdo *lobdo) + struct obdo *wobdo, + const struct obdo *lobdo) { *wobdo = *lobdo; if (ocd == NULL) return; + if (!(wobdo->o_valid & OBD_MD_FLUID)) + wobdo->o_uid = from_kuid(&init_user_ns, current_uid()); + if (!(wobdo->o_valid & OBD_MD_FLGID)) + wobdo->o_gid = from_kgid(&init_user_ns, current_gid()); + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { - /* Currently OBD_FL_OSTID will only be used when 2.4 echo - * client communicate with pre-2.4 server */ + /* + * Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server + */ wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); } } EXPORT_SYMBOL(lustre_set_wire_obdo); -/** +/* * Create a local obdo from a wire based odbo */ void lustre_get_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *lobdo, - const struct obdo *wobdo) + struct obdo *lobdo, + const struct obdo *wobdo) { *lobdo = *wobdo; if (ocd == NULL) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c new file mode 100644 index 0000000000000..0f7f474f7fbb9 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/obdo_server.c @@ -0,0 +1,156 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2017, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include /* for PAGE_SIZE */ +#include + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, const struct lu_attr *la, u64 valid) +{ + u64 newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & LA_PROJID) { + dst->o_projid = la->la_projid; + newvalid |= OBD_MD_FLPROJID; + } + if (valid & LA_FLAGS) { + dst->o_flags = la->la_flags; + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, const struct obdo *obdo, u64 valid) +{ + u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + if (valid & OBD_MD_FLPROJID) { + dst->la_projid = obdo->o_projid; + newvalid |= LA_PROJID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->la_flags = obdo->o_flags; + newvalid |= LA_FLAGS; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/scrub.c b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c new file mode 100644 
index 0000000000000..b2e93c6dcc408 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/scrub.c @@ -0,0 +1,1216 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. + */ +/* + * lustre/obdclass/scrub.c + * + * The OI scrub is used for checking and (re)building Object Index files + * that are usually backend special. Here are some general scrub related + * functions that can be shared by different backends for OI scrub. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LFSCK + +#include +#include +#include +#include + +static inline struct dt_device *scrub_obj2dev(struct dt_object *obj) +{ + return container_of0(obj->do_lu.lo_dev, struct dt_device, dd_lu_dev); +} + +static void scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = le64_to_cpu(src->sf_flags); + des->sf_magic = le32_to_cpu(src->sf_magic); + des->sf_status = le16_to_cpu(src->sf_status); + des->sf_param = le16_to_cpu(src->sf_param); + des->sf_time_last_complete = + le64_to_cpu(src->sf_time_last_complete); + des->sf_time_latest_start = + le64_to_cpu(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + le64_to_cpu(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + le64_to_cpu(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + le64_to_cpu(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + le64_to_cpu(src->sf_pos_first_inconsistent); + des->sf_items_checked = + le64_to_cpu(src->sf_items_checked); + des->sf_items_updated = + le64_to_cpu(src->sf_items_updated); + des->sf_items_failed = + le64_to_cpu(src->sf_items_failed); + des->sf_items_updated_prior = + le64_to_cpu(src->sf_items_updated_prior); + des->sf_run_time = le32_to_cpu(src->sf_run_time); + des->sf_success_count = le32_to_cpu(src->sf_success_count); + des->sf_oi_count = le16_to_cpu(src->sf_oi_count); + des->sf_internal_flags = le16_to_cpu(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void scrub_file_to_le(struct scrub_file *des, struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = cpu_to_le64(src->sf_flags); + des->sf_magic = cpu_to_le32(src->sf_magic); + des->sf_status = cpu_to_le16(src->sf_status); + des->sf_param = cpu_to_le16(src->sf_param); + des->sf_time_last_complete = + cpu_to_le64(src->sf_time_last_complete); + des->sf_time_latest_start = + cpu_to_le64(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + cpu_to_le64(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + cpu_to_le64(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + cpu_to_le64(src->sf_pos_last_checkpoint); + 
des->sf_pos_first_inconsistent = + cpu_to_le64(src->sf_pos_first_inconsistent); + des->sf_items_checked = + cpu_to_le64(src->sf_items_checked); + des->sf_items_updated = + cpu_to_le64(src->sf_items_updated); + des->sf_items_failed = + cpu_to_le64(src->sf_items_failed); + des->sf_items_updated_prior = + cpu_to_le64(src->sf_items_updated_prior); + des->sf_run_time = cpu_to_le32(src->sf_run_time); + des->sf_success_count = cpu_to_le32(src->sf_success_count); + des->sf_oi_count = cpu_to_le16(src->sf_oi_count); + des->sf_internal_flags = cpu_to_le16(src->sf_internal_flags); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +void scrub_file_init(struct lustre_scrub *scrub, __u8 *uuid) +{ + struct scrub_file *sf = &scrub->os_file; + + memset(sf, 0, sizeof(*sf)); + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_magic = SCRUB_MAGIC_V1; + sf->sf_status = SS_INIT; +} +EXPORT_SYMBOL(scrub_file_init); + +void scrub_file_reset(struct lustre_scrub *scrub, __u8 *uuid, __u64 flags) +{ + struct scrub_file *sf = &scrub->os_file; + + CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " + "%#llx, add flags = %#llx\n", + scrub->os_name, sf->sf_flags, flags); + + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_status = SS_INIT; + sf->sf_flags |= flags; + sf->sf_flags &= ~SF_AUTO; + sf->sf_run_time = 0; + sf->sf_time_latest_start = 0; + sf->sf_time_last_checkpoint = 0; + sf->sf_pos_latest_start = 0; + sf->sf_pos_last_checkpoint = 0; + sf->sf_pos_first_inconsistent = 0; + sf->sf_items_checked = 0; + sf->sf_items_updated = 0; + sf->sf_items_failed = 0; + sf->sf_items_noscrub = 0; + sf->sf_items_igif = 0; + if (!scrub->os_in_join) + sf->sf_items_updated_prior = 0; +} +EXPORT_SYMBOL(scrub_file_reset); + +int scrub_file_load(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + struct lu_buf buf = { + .lb_buf = &scrub->os_file_disk, + .lb_len = sizeof(scrub->os_file_disk) + }; + loff_t pos = 0; + int rc; + + rc = dt_read(env, scrub->os_obj, &buf, &pos); + /* failure */ + if (rc < 0) { + CERROR("%s: fail to load scrub file: rc = %d\n", + scrub->os_name, rc); + return rc; + } + + /* empty */ + if (!rc) + return -ENOENT; + + /* corrupted */ + if (rc < buf.lb_len) { + CDEBUG(D_LFSCK, "%s: fail to load scrub file, " + "expected = %d: rc = %d\n", + scrub->os_name, (int)buf.lb_len, rc); + return -EFAULT; + } + + scrub_file_to_cpu(sf, &scrub->os_file_disk); + if (sf->sf_magic != SCRUB_MAGIC_V1) { + CDEBUG(D_LFSCK, "%s: invalid scrub magic 0x%x != 0x%x\n", + scrub->os_name, sf->sf_magic, SCRUB_MAGIC_V1); + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(scrub_file_load); + +int scrub_file_store(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file_disk; + struct dt_object *obj = scrub->os_obj; + struct dt_device *dev = scrub_obj2dev(obj); + struct lu_buf buf = { + .lb_buf = sf, + .lb_len = sizeof(*sf) + }; + struct thandle *th; + loff_t pos = 0; + int rc; + ENTRY; + + /* Skip store under rdonly mode. 
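+	 * A read-only backend cannot persist the scrub state, so the store is reported as successful without writing anything.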
*/ + if (dev->dd_rdonly) + RETURN(0); + + scrub_file_to_le(sf, &scrub->os_file); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &buf, pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &buf, &pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +log: + if (rc) + CERROR("%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + else + CDEBUG(D_LFSCK, "%s: store scrub file: rc = %d\n", + scrub->os_name, rc); + + scrub->os_time_last_checkpoint = ktime_get_seconds(); + scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + + SCRUB_CHECKPOINT_INTERVAL; + return rc; +} +EXPORT_SYMBOL(scrub_file_store); + +int scrub_checkpoint(const struct lu_env *env, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + time64_t now = ktime_get_seconds(); + int rc; + + if (likely(now < scrub->os_time_next_checkpoint || + scrub->os_new_checked == 0)) + return 0; + + CDEBUG(D_LFSCK, "%s: OI scrub checkpoint at pos %llu\n", + scrub->os_name, scrub->os_pos_current); + + down_write(&scrub->os_rwsem); + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + sf->sf_time_last_checkpoint = ktime_get_real_seconds(); + sf->sf_run_time += now - scrub->os_time_last_checkpoint; + rc = scrub_file_store(env, scrub); + up_write(&scrub->os_rwsem); + + return rc; +} +EXPORT_SYMBOL(scrub_checkpoint); + +int scrub_start(int (*threadfn)(void *data), struct lustre_scrub *scrub, + void *data, __u32 flags) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + struct task_struct *task; + int rc; + ENTRY; + +again: + /* os_lock: sync status between stop and scrub thread */ + spin_lock(&scrub->os_lock); + if (thread_is_running(thread)) { + spin_unlock(&scrub->os_lock); + RETURN(-EALREADY); + } + + if (unlikely(thread_is_stopping(thread))) { + spin_unlock(&scrub->os_lock); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + goto again; + } + spin_unlock(&scrub->os_lock); + + if (scrub->os_file.sf_status == SS_COMPLETED) { + if (!(flags & SS_SET_FAILOUT)) + flags |= SS_CLEAR_FAILOUT; + + if (!(flags & SS_SET_DRYRUN)) + flags |= SS_CLEAR_DRYRUN; + + flags |= SS_RESET; + } + + scrub->os_start_flags = flags; + thread_set_flags(thread, 0); + task = kthread_run(threadfn, data, "OI_scrub"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start iteration thread: rc = %d\n", + scrub->os_name, rc); + RETURN(rc); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + RETURN(0); +} +EXPORT_SYMBOL(scrub_start); + +void scrub_stop(struct lustre_scrub *scrub) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + + /* os_lock: sync status between stop and scrub thread */ + spin_lock(&scrub->os_lock); + if (!thread_is_init(thread) && !thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&scrub->os_lock); + wake_up_all(&thread->t_ctl_waitq); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + /* Do not skip the last lock/unlock, which can guarantee that + * the caller cannot return until the OI scrub thread exit. 
*/ + spin_lock(&scrub->os_lock); + } + spin_unlock(&scrub->os_lock); +} +EXPORT_SYMBOL(scrub_stop); + +const char *scrub_status_names[] = { + "init", + "scanning", + "completed", + "failed", + "stopped", + "paused", + "crashed", + NULL +}; + +const char *scrub_flags_names[] = { + "recreated", + "inconsistent", + "auto", + "upgrade", + NULL +}; + +const char *scrub_param_names[] = { + "failout", + "dryrun", + NULL +}; + +static void scrub_bits_dump(struct seq_file *m, int bits, const char *names[], + const char *prefix) +{ + int flag; + int i; + + seq_printf(m, "%s:%c", prefix, bits != 0 ? ' ' : '\n'); + + for (i = 0, flag = 1; bits != 0; i++, flag = 1 << i) { + if (flag & bits) { + bits &= ~flag; + seq_printf(m, "%s%c", names[i], + bits != 0 ? ',' : '\n'); + } + } +} + +static void scrub_time_dump(struct seq_file *m, time64_t time, + const char *prefix) +{ + if (time != 0) + seq_printf(m, "%s: %llu seconds\n", prefix, + ktime_get_real_seconds() - time); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +static void scrub_pos_dump(struct seq_file *m, __u64 pos, const char *prefix) +{ + if (pos != 0) + seq_printf(m, "%s: %llu\n", prefix, pos); + else + seq_printf(m, "%s: N/A\n", prefix); +} + +void scrub_dump(struct seq_file *m, struct lustre_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + u64 checked; + s64 speed; + + down_read(&scrub->os_rwsem); + seq_printf(m, "name: OI_scrub\n" + "magic: 0x%x\n" + "oi_files: %d\n" + "status: %s\n", + sf->sf_magic, (int)sf->sf_oi_count, + scrub_status_names[sf->sf_status]); + + scrub_bits_dump(m, sf->sf_flags, scrub_flags_names, "flags"); + + scrub_bits_dump(m, sf->sf_param, scrub_param_names, "param"); + + scrub_time_dump(m, sf->sf_time_last_complete, + "time_since_last_completed"); + + scrub_time_dump(m, sf->sf_time_latest_start, + "time_since_latest_start"); + + scrub_time_dump(m, sf->sf_time_last_checkpoint, + "time_since_last_checkpoint"); + + scrub_pos_dump(m, sf->sf_pos_latest_start, + "latest_start_position"); + + scrub_pos_dump(m, sf->sf_pos_last_checkpoint, + "last_checkpoint_position"); + + scrub_pos_dump(m, sf->sf_pos_first_inconsistent, + "first_failure_position"); + + checked = sf->sf_items_checked + scrub->os_new_checked; + seq_printf(m, "checked: %llu\n" + "%s: %llu\n" + "failed: %llu\n" + "prior_%s: %llu\n" + "noscrub: %llu\n" + "igif: %llu\n" + "success_count: %u\n", + checked, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated, sf->sf_items_failed, + sf->sf_param & SP_DRYRUN ? "inconsistent" : "updated", + sf->sf_items_updated_prior, sf->sf_items_noscrub, + sf->sf_items_igif, sf->sf_success_count); + + speed = checked; + if (thread_is_running(&scrub->os_thread)) { + s64 new_checked = scrub->os_new_checked; + time64_t duration; + time64_t rtime; + + /* Since the time resolution is in seconds for new system + * or small devices it ismore likely that duration will be + * zero which will lead to inaccurate results. + */ + duration = ktime_get_seconds() - + scrub->os_time_last_checkpoint; + if (duration != 0) + new_checked = div_s64(new_checked, duration); + + rtime = sf->sf_run_time + duration; + if (rtime != 0) + speed = div_s64(speed, rtime); + + seq_printf(m, "run_time: %lld seconds\n" + "average_speed: %lld objects/sec\n" + "real-time_speed: %lld objects/sec\n" + "current_position: %llu\n" + "scrub_in_prior: %s\n" + "scrub_full_speed: %s\n" + "partial_scan: %s\n", + rtime, speed, new_checked, + scrub->os_pos_current, + scrub->os_in_prior ? "yes" : "no", + scrub->os_full_speed ? 
"yes" : "no", + scrub->os_partial_scan ? "yes" : "no"); + } else { + if (sf->sf_run_time != 0) + speed = div_s64(speed, sf->sf_run_time); + seq_printf(m, "run_time: %ld seconds\n" + "average_speed: %lld objects/sec\n" + "real-time_speed: N/A\n" + "current_position: N/A\n", + sf->sf_run_time, speed); + } + + up_read(&scrub->os_rwsem); +} +EXPORT_SYMBOL(scrub_dump); + +int lustre_liru_new(struct list_head *head, const struct lu_fid *pfid, + const struct lu_fid *cfid, __u64 child, + const char *name, int namelen) +{ + struct lustre_index_restore_unit *liru; + int len = sizeof(*liru) + namelen + 1; + + OBD_ALLOC(liru, len); + if (!liru) + return -ENOMEM; + + INIT_LIST_HEAD(&liru->liru_link); + liru->liru_pfid = *pfid; + liru->liru_cfid = *cfid; + liru->liru_clid = child; + liru->liru_len = len; + memcpy(liru->liru_name, name, namelen); + liru->liru_name[namelen] = 0; + list_add_tail(&liru->liru_link, head); + + return 0; +} +EXPORT_SYMBOL(lustre_liru_new); + +int lustre_index_register(struct dt_device *dev, const char *devname, + struct list_head *head, spinlock_t *lock, int *guard, + const struct lu_fid *fid, + __u32 keysize, __u32 recsize) +{ + struct lustre_index_backup_unit *libu, *pos; + int rc = 0; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN(1); + + OBD_ALLOC_PTR(libu); + if (!libu) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&libu->libu_link); + libu->libu_keysize = keysize; + libu->libu_recsize = recsize; + libu->libu_fid = *fid; + + spin_lock(lock); + if (unlikely(*guard)) { + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(1); + } + + list_for_each_entry_reverse(pos, head, libu_link) { + rc = lu_fid_cmp(&pos->libu_fid, fid); + if (rc < 0) { + list_add(&libu->libu_link, &pos->libu_link); + spin_unlock(lock); + + RETURN(0); + } + + if (!rc) { + /* Registered already. But the former registered one + * has different keysize/recsize. It may because that + * the former values are from disk and corrupted, then + * replace it with new values. */ + if (unlikely(keysize != pos->libu_keysize || + recsize != pos->libu_recsize)) { + CWARN("%s: the index "DFID" has registered " + "with %u/%u, may be invalid, replace " + "with %u/%u\n", + devname, PFID(fid), pos->libu_keysize, + pos->libu_recsize, keysize, recsize); + + pos->libu_keysize = keysize; + pos->libu_recsize = recsize; + } else { + rc = 1; + } + + spin_unlock(lock); + OBD_FREE_PTR(libu); + + RETURN(rc); + } + } + + list_add(&libu->libu_link, head); + spin_unlock(lock); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_index_register); + +static void lustre_index_degister(struct list_head *head, spinlock_t *lock, + const struct lu_fid *fid) +{ + struct lustre_index_backup_unit *libu; + int rc = -ENOENT; + + spin_lock(lock); + list_for_each_entry_reverse(libu, head, libu_link) { + rc = lu_fid_cmp(&libu->libu_fid, fid); + /* NOT registered. 
*/ + if (rc < 0) + break; + + if (!rc) { + list_del(&libu->libu_link); + break; + } + } + spin_unlock(lock); + + if (!rc) + OBD_FREE_PTR(libu); +} + +static void +lustre_index_backup_make_header(struct lustre_index_backup_header *header, + __u32 keysize, __u32 recsize, + const struct lu_fid *fid, __u32 count) +{ + memset(header, 0, sizeof(*header)); + header->libh_magic = cpu_to_le32(INDEX_BACKUP_MAGIC_V1); + header->libh_count = cpu_to_le32(count); + header->libh_keysize = cpu_to_le32(keysize); + header->libh_recsize = cpu_to_le32(recsize); + fid_cpu_to_le(&header->libh_owner, fid); +} + +static int lustre_index_backup_body(const struct lu_env *env, + struct dt_object *obj, loff_t *pos, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = buf, + .lb_len = bufsize + }; + int rc; + ENTRY; + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, *pos, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, pos, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_header(const struct lu_env *env, + struct dt_object *obj, + const struct lu_fid *tgt_fid, + __u32 keysize, __u32 recsize, + void *buf, int bufsize, int count) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_index_backup_header *header = buf; + struct lu_attr *la = buf; + struct thandle *th; + struct lu_buf lbuf = { + .lb_buf = header, + .lb_len = sizeof(*header) + }; + loff_t size = sizeof(*header) + (keysize + recsize) * count; + loff_t pos = 0; + int rc; + bool punch = false; + ENTRY; + + LASSERT(sizeof(*la) <= bufsize); + LASSERT(sizeof(*header) <= bufsize); + + rc = dt_attr_get(env, obj, la); + if (rc) + RETURN(rc); + + if (la->la_size > size) + punch = true; + + lustre_index_backup_make_header(header, keysize, recsize, + tgt_fid, count); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, obj, &lbuf, pos, th); + if (rc) + GOTO(stop, rc); + + if (punch) { + rc = dt_declare_punch(env, obj, size, OBD_OBJECT_EOF, th); + if (rc) + GOTO(stop, rc); + } + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_record_write(env, obj, &lbuf, &pos, th); + if (!rc && punch) + rc = dt_punch(env, obj, size, OBD_OBJECT_EOF, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_update_lma(const struct lu_env *env, + struct dt_object *obj, + void *buf, int bufsize) +{ + struct dt_device *dev = lu2dt_dev(obj->do_lu.lo_dev); + struct lustre_mdt_attrs *lma = buf; + struct lu_buf lbuf = { + .lb_buf = lma, + .lb_len = sizeof(struct lustre_ost_attrs) + }; + struct thandle *th; + int fl = LU_XATTR_REPLACE; + int rc; + ENTRY; + + LASSERT(bufsize >= lbuf.lb_len); + + rc = dt_xattr_get(env, obj, &lbuf, XATTR_NAME_LMA); + if (unlikely(rc == -ENODATA)) { + fl = LU_XATTR_CREATE; + lustre_lma_init(lma, lu_object_fid(&obj->do_lu), + LMAC_IDX_BACKUP, 0); + rc = sizeof(*lma); + } else if (rc < sizeof(*lma)) { + RETURN(rc < 0 ? 
rc : -EFAULT); + } else { + lustre_lma_swab(lma); + if (lma->lma_compat & LMAC_IDX_BACKUP) + RETURN(0); + + lma->lma_compat |= LMAC_IDX_BACKUP; + } + + lustre_lma_swab(lma); + lbuf.lb_len = rc; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + RETURN(rc); + + rc = dt_declare_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, &lbuf, XATTR_NAME_LMA, fl, th); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + return rc; +} + +static int lustre_index_backup_one(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + struct lustre_index_backup_unit *libu, + char *buf, int bufsize) +{ + struct dt_device *dev = scrub_obj2dev(parent); + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + const struct dt_it_ops *iops; + struct dt_it *di; + loff_t pos = sizeof(struct lustre_index_backup_header); + int count = 0; + int size = 0; + int rc; + ENTRY; + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &libu->libu_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + if (!dt_object_exists(tgt_obj)) + GOTO(out, rc = 0); + + if (!tgt_obj->do_index_ops) { + struct dt_index_features feat; + + feat.dif_flags = DT_IND_UPDATE; + feat.dif_keysize_min = libu->libu_keysize; + feat.dif_keysize_max = libu->libu_keysize; + feat.dif_recsize_min = libu->libu_recsize; + feat.dif_recsize_max = libu->libu_recsize; + feat.dif_ptrsize = 4; + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, &feat); + if (rc) + GOTO(out, rc); + } + + lustre_fid2lbx(buf, &libu->libu_fid, bufsize); + bak_obj = local_file_find_or_create(env, los, parent, buf, + S_IFREG | S_IRUGO | S_IWUSR); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? 
PTR_ERR(bak_obj) : -ENOENT); + + iops = &tgt_obj->do_index_ops->dio_it; + di = iops->init(env, tgt_obj, 0); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + rc = iops->load(env, di, 0); + if (!rc) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + while (!rc) { + void *key; + void *rec; + + key = iops->key(env, di); + memcpy(&buf[size], key, libu->libu_keysize); + size += libu->libu_keysize; + rec = &buf[size]; + rc = iops->rec(env, di, rec, 0); + if (rc) + GOTO(fini, rc); + + size += libu->libu_recsize; + count++; + if (size + libu->libu_keysize + libu->libu_recsize > bufsize) { + rc = lustre_index_backup_body(env, bak_obj, &pos, + buf, size); + if (rc) + GOTO(fini, rc); + + size = 0; + } + + rc = iops->next(env, di); + } + + if (rc >= 0 && size > 0) + rc = lustre_index_backup_body(env, bak_obj, &pos, buf, size); + + if (rc < 0) + GOTO(fini, rc); + + rc = lustre_index_backup_header(env, bak_obj, &libu->libu_fid, + libu->libu_keysize, libu->libu_recsize, + buf, bufsize, count); + if (!rc) + rc = lustre_index_update_lma(env, tgt_obj, buf, bufsize); + + if (!rc && OBD_FAIL_CHECK(OBD_FAIL_OSD_INDEX_CRASH)) { + LASSERT(bufsize >= 512); + + pos = 0; + memset(buf, 0, 512); + lustre_index_backup_body(env, tgt_obj, &pos, buf, 512); + } + + GOTO(fini, rc); + +fini: + iops->fini(env, di); +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + return rc; +} + +void lustre_index_backup(const struct lu_env *env, struct dt_device *dev, + const char *devname, struct list_head *head, + spinlock_t *lock, int *guard, bool backup) +{ + struct lustre_index_backup_unit *libu; + struct local_oid_storage *los = NULL; + struct dt_object *parent = NULL; + char *buf = NULL; + struct lu_fid fid; + int rc; + ENTRY; + + if (dev->dd_rdonly || *guard) + RETURN_EXIT; + + spin_lock(lock); + *guard = 1; + spin_unlock(lock); + + if (list_empty(head)) + RETURN_EXIT; + + /* Handle kinds of failures during mount process. */ + if (!dev->dd_lu_dev.ld_site || !dev->dd_lu_dev.ld_site->ls_top_dev) + backup = false; + + if (backup) { + OBD_ALLOC_LARGE(buf, INDEX_BACKUP_BUFSIZE); + if (!buf) { + backup = false; + goto scan; + } + + lu_local_obj_fid(&fid, INDEX_BACKUP_OID); + parent = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + &fid, NULL)); + if (IS_ERR_OR_NULL(parent)) { + CERROR("%s: failed to locate backup dir: rc = %ld\n", + devname, parent ? 
PTR_ERR(parent) : -ENOENT); + backup = false; + goto scan; + } + + lu_local_name_obj_fid(&fid, 1); + rc = local_oid_storage_init(env, dev, &fid, &los); + if (rc) { + CERROR("%s: failed to init local storage: rc = %d\n", + devname, rc); + backup = false; + } + } + +scan: + spin_lock(lock); + while (!list_empty(head)) { + libu = list_entry(head->next, + struct lustre_index_backup_unit, libu_link); + list_del_init(&libu->libu_link); + spin_unlock(lock); + + if (backup) { + rc = lustre_index_backup_one(env, los, parent, libu, + buf, INDEX_BACKUP_BUFSIZE); + CDEBUG(D_WARNING, "%s: backup index "DFID": rc = %d\n", + devname, PFID(&libu->libu_fid), rc); + } + + OBD_FREE_PTR(libu); + spin_lock(lock); + } + spin_unlock(lock); + + if (los) + local_oid_storage_fini(env, los); + if (parent) + dt_object_put_nocache(env, parent); + if (buf) + OBD_FREE_LARGE(buf, INDEX_BACKUP_BUFSIZE); + + EXIT; +} +EXPORT_SYMBOL(lustre_index_backup); + +int lustre_index_restore(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *parent_fid, + const struct lu_fid *tgt_fid, + const struct lu_fid *bak_fid, const char *name, + struct list_head *head, spinlock_t *lock, + char *buf, int bufsize) +{ + struct dt_object *parent_obj = NULL; + struct dt_object *tgt_obj = NULL; + struct dt_object *bak_obj = NULL; + struct lustre_index_backup_header *header; + struct dt_index_features *feat; + struct dt_object_format *dof; + struct lu_attr *la; + struct thandle *th; + struct lu_object_conf conf; + struct dt_insert_rec ent; + struct lu_buf lbuf; + struct lu_fid tfid; + loff_t pos = 0; + __u32 keysize; + __u32 recsize; + __u32 pairsize; + int count; + int rc; + bool registered = false; + ENTRY; + + LASSERT(bufsize >= sizeof(*la) + sizeof(*dof) + + sizeof(*feat) + sizeof(*header)); + + memset(buf, 0, bufsize); + la = (struct lu_attr *)buf; + dof = (void *)la + sizeof(*la); + feat = (void *)dof + sizeof(*dof); + header = (void *)feat + sizeof(*feat); + lbuf.lb_buf = header; + lbuf.lb_len = sizeof(*header); + + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, NULL)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + bak_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + bak_fid, NULL)); + if (IS_ERR_OR_NULL(bak_obj)) + GOTO(out, rc = bak_obj ? PTR_ERR(bak_obj) : -ENOENT); + + if (!dt_object_exists(bak_obj)) + GOTO(out, rc = -ENOENT); + + parent_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + parent_fid, NULL)); + if (IS_ERR_OR_NULL(parent_obj)) + GOTO(out, rc = parent_obj ? PTR_ERR(parent_obj) : -ENOENT); + + LASSERT(dt_object_exists(parent_obj)); + + if (unlikely(!dt_try_as_dir(env, parent_obj))) + GOTO(out, rc = -ENOTDIR); + + rc = dt_attr_get(env, tgt_obj, la); + if (rc) + GOTO(out, rc); + + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + if (rc) + GOTO(out, rc); + + if (le32_to_cpu(header->libh_magic) != INDEX_BACKUP_MAGIC_V1) + GOTO(out, rc = -EINVAL); + + fid_le_to_cpu(&tfid, &header->libh_owner); + if (unlikely(!lu_fid_eq(tgt_fid, &tfid))) + GOTO(out, rc = -EINVAL); + + keysize = le32_to_cpu(header->libh_keysize); + recsize = le32_to_cpu(header->libh_recsize); + pairsize = keysize + recsize; + + memset(feat, 0, sizeof(*feat)); + feat->dif_flags = DT_IND_UPDATE; + feat->dif_keysize_min = feat->dif_keysize_max = keysize; + feat->dif_recsize_min = feat->dif_recsize_max = recsize; + feat->dif_ptrsize = 4; + + /* T1: remove old name entry and destroy old index. 
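+	 * The restore is performed as a sequence of separate local transactions: T1 removes the stale name entry and destroys the old index object, T2 re-creates the index and re-inserts the name entry, and each backed-up record is then re-inserted under its own transaction (Tn below).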
*/ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_delete(env, parent_obj, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_destroy(env, tgt_obj, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_delete(env, parent_obj, (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_destroy(env, tgt_obj, th); + dt_write_unlock(env, tgt_obj); + dt_trans_stop(env, dev, th); + if (rc) + GOTO(out, rc); + + la->la_valid = LA_MODE | LA_UID | LA_GID; + conf.loc_flags = LOC_F_NEW; + dof->u.dof_idx.di_feat = feat; + dof->dof_type = DFT_INDEX; + ent.rec_type = S_IFREG; + ent.rec_fid = tgt_fid; + + /* Drop cache before re-create it. */ + dt_object_put_nocache(env, tgt_obj); + tgt_obj = lu2dt(lu_object_find_slice(env, &dev->dd_lu_dev, + tgt_fid, &conf)); + if (IS_ERR_OR_NULL(tgt_obj)) + GOTO(out, rc = tgt_obj ? PTR_ERR(tgt_obj) : -ENOENT); + + LASSERT(!dt_object_exists(tgt_obj)); + + /* T2: create new index and insert new name entry. */ + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, tgt_obj, la, NULL, dof, th); + if (rc) + GOTO(stop, rc); + + rc = dt_declare_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + dt_write_lock(env, tgt_obj, 0); + rc = dt_create(env, tgt_obj, la, NULL, dof, th); + dt_write_unlock(env, tgt_obj); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, parent_obj, (const struct dt_rec *)&ent, + (const struct dt_key *)name, th); + dt_trans_stop(env, dev, th); + /* Some index name may has been inserted by OSD + * automatically when create the index object. */ + if (unlikely(rc == -EEXIST)) + rc = 0; + if (rc) + GOTO(out, rc); + + /* The new index will register via index_try. */ + rc = tgt_obj->do_ops->do_index_try(env, tgt_obj, feat); + if (rc) + GOTO(out, rc); + + registered = true; + count = le32_to_cpu(header->libh_count); + while (!rc && count > 0) { + int size = pairsize * count; + int items = count; + int i; + + if (size > bufsize) { + items = bufsize / pairsize; + size = pairsize * items; + } + + lbuf.lb_buf = buf; + lbuf.lb_len = size; + rc = dt_record_read(env, bak_obj, &lbuf, &pos); + for (i = 0; i < items && !rc; i++) { + void *key = &buf[i * pairsize]; + void *rec = &buf[i * pairsize + keysize]; + + /* Tn: restore the records. */ + th = dt_trans_create(env, dev); + if (!th) + GOTO(out, rc = -ENOMEM); + + rc = dt_declare_insert(env, tgt_obj, rec, key, th); + if (rc) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(stop, rc); + + rc = dt_insert(env, tgt_obj, rec, key, th); + if (unlikely(rc == -EEXIST)) + rc = 0; + + dt_trans_stop(env, dev, th); + } + + count -= items; + } + + GOTO(out, rc); + +stop: + dt_trans_stop(env, dev, th); + if (rc && registered) + /* Degister the index to avoid overwriting the backup. 
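+		 * Otherwise a later lustre_index_backup() pass could overwrite the good backup with the partially restored index.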
*/ + lustre_index_degister(head, lock, tgt_fid); + +out: + if (!IS_ERR_OR_NULL(tgt_obj)) + dt_object_put_nocache(env, tgt_obj); + if (!IS_ERR_OR_NULL(bak_obj)) + dt_object_put_nocache(env, bak_obj); + if (!IS_ERR_OR_NULL(parent_obj)) + dt_object_put_nocache(env, parent_obj); + return rc; +} +EXPORT_SYMBOL(lustre_index_restore); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c index 2a36051e52356..9c52f8094e9fe 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/statfs_pack.c @@ -46,28 +46,28 @@ void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) { - memset(osfs, 0, sizeof(*osfs)); - osfs->os_type = sfs->f_type; - osfs->os_blocks = sfs->f_blocks; - osfs->os_bfree = sfs->f_bfree; - osfs->os_bavail = sfs->f_bavail; - osfs->os_files = sfs->f_files; - osfs->os_ffree = sfs->f_ffree; - osfs->os_bsize = sfs->f_bsize; - osfs->os_namelen = sfs->f_namelen; + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; } EXPORT_SYMBOL(statfs_pack); void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) { - memset(sfs, 0, sizeof(*sfs)); - sfs->f_type = osfs->os_type; - sfs->f_blocks = osfs->os_blocks; - sfs->f_bfree = osfs->os_bfree; - sfs->f_bavail = osfs->os_bavail; - sfs->f_files = osfs->os_files; - sfs->f_ffree = osfs->os_ffree; - sfs->f_bsize = osfs->os_bsize; - sfs->f_namelen = osfs->os_namelen; + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; } EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c index 2112733e50c54..5622410784d7a 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/upcall_cache.c @@ -35,9 +35,8 @@ */ #define DEBUG_SUBSYSTEM S_SEC -#include #include -#include +#include #include static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, @@ -115,14 +114,14 @@ static inline void put_entry(struct upcall_cache *cache, static int check_unlink_entry(struct upcall_cache *cache, struct upcall_cache_entry *entry) { - if (UC_CACHE_IS_VALID(entry) && - cfs_time_before(cfs_time_current(), entry->ue_expire)) + time64_t now = ktime_get_seconds(); + + if (UC_CACHE_IS_VALID(entry) && now < entry->ue_expire) return 0; if (UC_CACHE_IS_ACQUIRING(entry)) { if (entry->ue_acquire_expire == 0 || - cfs_time_before(cfs_time_current(), - entry->ue_acquire_expire)) + now < entry->ue_acquire_expire) return 0; UC_CACHE_SET_EXPIRED(entry); @@ -198,8 +197,8 @@ struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, spin_unlock(&cache->uc_lock); rc = refresh_entry(cache, entry); spin_lock(&cache->uc_lock); - entry->ue_acquire_expire = - cfs_time_shift(cache->uc_acquire_expire); + entry->ue_acquire_expire = ktime_get_seconds() + + cache->uc_acquire_expire; if (rc < 0) { UC_CACHE_CLEAR_ACQUIRING(entry); UC_CACHE_SET_INVALID(entry); @@ -340,7 +339,7 @@ int 
upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, if (rc) GOTO(out, rc); - entry->ue_expire = cfs_time_shift(cache->uc_entry_expire); + entry->ue_expire = ktime_get_seconds() + cache->uc_entry_expire; UC_CACHE_SET_VALID(entry); CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key %llu\n", cache->uc_name, entry, entry->ue_key); @@ -400,10 +399,10 @@ void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) if (found) { CWARN("%s: flush entry %p: key %llu, ref %d, fl %x, " - "cur %lu, ex %ld/%ld\n", + "cur %lld, ex %lld/%lld\n", cache->uc_name, entry, entry->ue_key, atomic_read(&entry->ue_refcount), entry->ue_flags, - cfs_time_current_sec(), entry->ue_acquire_expire, + ktime_get_real_seconds(), entry->ue_acquire_expire, entry->ue_expire); UC_CACHE_SET_EXPIRED(entry); if (!atomic_read(&entry->ue_refcount)) diff --git a/drivers/staging/lustrefsx/lustre/obdclass/uuid.c b/drivers/staging/lustrefsx/lustre/obdclass/uuid.c deleted file mode 100644 index cc0092687511b..0000000000000 --- a/drivers/staging/lustrefsx/lustre/obdclass/uuid.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/obdclass/uuid.c - * - * Public include file for the UUID library - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -static inline size_t consume(size_t nob, __u8 **ptr) -{ - size_t value; - - LASSERT(nob <= sizeof(value)); - - for (value = 0; nob > 0; --nob) - value = (value << 8) | *((*ptr)++); - return value; -} - -#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) - -static void uuid_unpack(class_uuid_t in, __u16 *uu, size_t nr) -{ - __u8 *ptr = in; - - LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t)); - - while (nr-- > 0) - CONSUME(uu[nr], &ptr); -} - -void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) -{ - /* uu as an array of __u16's */ - __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; - - CLASSERT(ARRAY_SIZE(uuid) == 8); - - uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); - sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", - uuid[0], uuid[1], uuid[2], uuid[3], - uuid[4], uuid[5], uuid[6], uuid[7]); -} -EXPORT_SYMBOL(class_uuid_unparse); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo.c b/drivers/staging/lustrefsx/lustre/obdecho/echo.c index de7fd77920392..0f97a830f9b37 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -59,6 +59,21 @@ enum { LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1 }; +struct echo_srv_device { + struct lu_device esd_dev; + struct lu_target esd_lut; +}; + +static inline struct echo_srv_device *echo_srv_dev(struct lu_device *d) +{ + return container_of0(d, struct echo_srv_device, esd_dev); +} + +static inline struct obd_device *echo_srv_obd(struct echo_srv_device *esd) +{ + return esd->esd_dev.ld_obd; +} + static int echo_connect(const struct lu_env *env, struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, @@ -115,115 +130,6 @@ static u64 echo_next_id(struct obd_device *obddev) return id; } -static int echo_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - return -EINVAL; - } - - if (!(oa->o_mode & S_IFMT)) { - CERROR("echo obd: no type!\n"); - return -ENOENT; - } - - if (!(oa->o_valid & OBD_MD_FLTYPE)) { - CERROR("invalid o_valid %#llx\n", oa->o_valid); - return -EINVAL; - } - - ostid_set_seq_echo(&oa->o_oi); - if (ostid_set_id(&oa->o_oi, echo_next_id(obd))) { - CERROR("Bad %llu to set " DOSTID "\n", - echo_next_id(obd), POSTID(&oa->o_oi)); - return -EINVAL; - } - oa->o_valid = OBD_MD_FLID; - - return 0; -} - -static int echo_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - - ENTRY; - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); - RETURN(-EINVAL); - } - - if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || - ostid_id(&oa->o_oi) < ECHO_INIT_OID) { - CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi)); - RETURN(-EINVAL); - } - - RETURN(0); -} - -static int echo_getattr(const 
struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - u64 id = ostid_id(&oa->o_oi); - - ENTRY; - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); - RETURN(-EINVAL); - } - - obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid); - ostid_set_seq_echo(&oa->o_oi); - if (ostid_set_id(&oa->o_oi, id)) { - CERROR("Bad %llu to set " DOSTID "\n", - id, POSTID(&oa->o_oi)); - RETURN(-EINVAL); - } - - RETURN(0); -} - -static int echo_setattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct obd_device *obd = class_exp2obd(exp); - - ENTRY; - if (!obd) { - CERROR("invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - if (!(oa->o_valid & OBD_MD_FLID)) { - CERROR("obdo missing FLID valid flag: %#llx\n", oa->o_valid); - RETURN(-EINVAL); - } - - obd->u.echo.eo_oa = *oa; - - RETURN(0); -} - static void echo_page_debug_setup(struct page *page, int rw, u64 id, __u64 offset, int len) @@ -548,41 +454,317 @@ static int echo_commitrw(const struct lu_env *env, int cmd, LPROC_SEQ_FOPS_RO_TYPE(echo, uuid); static struct lprocfs_vars lprocfs_echo_obd_vars[] = { - { .name = "uuid", - .fops = &echo_uuid_fops }, + { .name = "uuid", + .fops = &echo_uuid_fops }, { NULL } }; -static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, +}; + +/** + * Echo Server request handler for OST_CREATE RPC. + * + * This is part of request processing. Its simulates the object + * creation on OST. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_create_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + struct obdo *rep_oa; + + ENTRY; + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + if (!(oa->o_mode & S_IFMT)) { + CERROR("%s: no type is set in obdo!\n", + tsi->tsi_exp->exp_obd->obd_name); + RETURN(-ENOENT); + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("%s: invalid o_valid in obdo: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + rep_oa = &repbody->oa; + + if (!fid_seq_is_echo(ostid_seq(&oa->o_oi))) { + CERROR("%s: invalid seq %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, ostid_seq(&oa->o_oi)); + return -EINVAL; + } + + ostid_set_seq_echo(&rep_oa->o_oi); + ostid_set_id(&rep_oa->o_oi, echo_next_id(obd)); + + CDEBUG(D_INFO, "%s: Create object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&rep_oa->o_oi)); + + rep_oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_DESTROY RPC. + * + * This is Echo Server part of request handling. It simulates the objects + * destroy on OST. 
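+ * No object is actually removed: the handler only validates the request and echoes the object id back in the reply.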
+ * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_destroy_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + u64 oid; + + ENTRY; + + oid = ostid_id(&oa->o_oi); + LASSERT(oid != 0); + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("%s: bad objid to destroy: "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + CDEBUG(D_INFO, "%s: Destroy object "DOSTID"\n", + tsi->tsi_exp->exp_obd->obd_name, POSTID(&oa->o_oi)); + + repbody->oa.o_oi = oa->o_oi; + RETURN(0); +} + +/** + * Echo Server request handler for OST_GETATTR RPC. + * + * This is Echo Server part of request handling. It returns an object + * attributes to the client. All objects have the same attributes in + * Echo Server. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_getattr_hdl(struct tgt_session_info *tsi) +{ + const struct obdo *oa = &tsi->tsi_ost_body->oa; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, oa->o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + repbody->oa.o_oi = oa->o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obdo_cpy_md(&repbody->oa, &obd->u.echo.eo_oa, oa->o_valid); + + repbody->oa.o_valid |= OBD_MD_FLFLAGS; + repbody->oa.o_flags = OBD_FL_FLUSH; + + RETURN(0); +} + +/** + * Echo Server request handler for OST_SETATTR RPC. + * + * This is Echo Server part of request handling. It sets common + * attributes from request to the Echo Server objects. + * + * \param[in] tsi target session environment for this request + * + * \retval 0 if successful + * \retval negative value on error + */ +static int esd_setattr_hdl(struct tgt_session_info *tsi) +{ + struct ost_body *body = tsi->tsi_ost_body; + struct obd_device *obd = tsi->tsi_exp->exp_obd; + struct ost_body *repbody; + + ENTRY; + + if (!(body->oa.o_valid & OBD_MD_FLID)) { + CERROR("%s: obdo missing FLID valid flag: %#llx\n", + tsi->tsi_exp->exp_obd->obd_name, + body->oa.o_valid); + RETURN(-EINVAL); + } + + repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (repbody == NULL) + RETURN(-ENOMEM); + + repbody->oa.o_oi = body->oa.o_oi; + repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obd->u.echo.eo_oa = body->oa; + + RETURN(0); +} + +#define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET +#define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET +#define OST_BRW_READ OST_READ +#define OST_BRW_WRITE OST_WRITE + +/** + * Table of Echo Server specific request handlers + * + * This table contains all opcodes accepted by Echo Server and + * specifies handlers for them. The tgt_request_handler() + * uses such table from each target to process incoming + * requests. 
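+ * Object RPCs (OST_CREATE, OST_DESTROY, OST_GETATTR, OST_SETATTR) are dispatched to the esd_*_hdl() handlers above, while bulk I/O reuses the generic tgt_brw_read()/tgt_brw_write() handlers.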
+ */ +static struct tgt_handler esd_tgt_handlers[] = { +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_CONNECT, tgt_connect, + &RQF_CONNECT, LUSTRE_OBD_VERSION), +TGT_RPC_HANDLER(OST_FIRST_OPC, 0, OST_DISCONNECT, tgt_disconnect, + &RQF_OST_DISCONNECT, LUSTRE_OBD_VERSION), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_GETATTR, esd_getattr_hdl), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR, OST_SETATTR, + esd_setattr_hdl), +TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_CREATE, esd_create_hdl), +TGT_OST_HDL(HABEO_REFERO | MUTABOR, OST_DESTROY, esd_destroy_hdl), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read), +TGT_OST_HDL(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write), +}; + +static struct tgt_opc_slice esd_common_slice[] = { + { + .tos_opc_start = OST_FIRST_OPC, + .tos_opc_end = OST_LAST_OPC, + .tos_hs = esd_tgt_handlers + }, + { + .tos_opc_start = OBD_FIRST_OPC, + .tos_opc_end = OBD_LAST_OPC, + .tos_hs = tgt_obd_handlers + }, + { + .tos_opc_start = LDLM_FIRST_OPC, + .tos_opc_end = LDLM_LAST_OPC, + .tos_hs = tgt_dlm_handlers + }, + { + .tos_opc_start = SEC_FIRST_OPC, + .tos_opc_end = SEC_LAST_OPC, + .tos_hs = tgt_sec_ctx_handlers + }, + { + .tos_hs = NULL + } +}; + +/** + * lu_device_operations matrix for ECHO SRV device is NULL, + * this device is just serving incoming requests immediately + * without building a stack of lu_devices. + */ +static struct lu_device_operations echo_srv_lu_ops = { 0 }; + +/** + * Initialize Echo Server device with parameters in the config log \a cfg. + * + * This is the main starting point of Echo Server initialization. It fills all + * parameters with their initial values and starts Echo Server. + * + * \param[in] env execution environment + * \param[in] m Echo Server device + * \param[in] ldt LU device type of Echo Server + * \param[in] cfg configuration log + * + * \retval 0 if successful + * \retval negative value on error + */ +static int echo_srv_init0(const struct lu_env *env, + struct echo_srv_device *esd, + struct lu_device_type *ldt, struct lustre_cfg *cfg) { - int rc; - __u64 lock_flags = 0; - struct ldlm_res_id res_id = {.name = {1}}; - char ns_name[48]; + const char *dev = lustre_cfg_string(cfg, 0); + struct obd_device *obd; + char ns_name[48]; + int rc; + ENTRY; - obd->u.echo.eo_obt.obt_magic = OBT_MAGIC; + obd = class_name2obd(dev); + if (obd == NULL) { + CERROR("Cannot find obd with name %s\n", dev); + RETURN(-ENODEV); + } + spin_lock_init(&obd->u.echo.eo_lock); - obd->u.echo.eo_lastino = ECHO_INIT_OID; - - sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid); - obd->obd_namespace = ldlm_namespace_new(obd, ns_name, - LDLM_NAMESPACE_SERVER, - LDLM_NAMESPACE_MODEST, - LDLM_NS_TYPE_OST); - if (obd->obd_namespace == NULL) { - LBUG(); - RETURN(-ENOMEM); - } + obd->u.echo.eo_lastino = ECHO_INIT_OID; - rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN, - NULL, LCK_NL, &lock_flags, NULL, - ldlm_completion_ast, NULL, NULL, 0, - LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock); - LASSERT (rc == ELDLM_OK); + esd->esd_dev.ld_ops = &echo_srv_lu_ops; + esd->esd_dev.ld_obd = obd; + /* set this lu_device to obd, because error handling need it */ + obd->obd_lu_dev = &esd->esd_dev; + + /* No connection accepted until configurations will finish */ + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 1; + spin_unlock(&obd->obd_dev_lock); + + /* non-replayable target */ + obd->obd_replayable = 0; + + snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + 
LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (obd->obd_namespace == NULL) + RETURN(-ENOMEM); obd->obd_vars = lprocfs_echo_obd_vars; - if (lprocfs_obd_setup(obd) == 0 && + if (!lprocfs_obd_setup(obd, true) && lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, LPROCFS_CNTR_AVGMINMAX, @@ -594,48 +776,158 @@ static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, "echo_ldlm_cb_client", &obd->obd_ldlm_client); - RETURN(0); + + rc = tgt_init(env, &esd->esd_lut, obd, NULL, esd_common_slice, + OBD_FAIL_OST_ALL_REQUEST_NET, + OBD_FAIL_OST_ALL_REPLY_NET); + if (rc) + GOTO(err_out, rc); + + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + spin_unlock(&obd->obd_dev_lock); + + RETURN(0); + +err_out: + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + RETURN(rc); } -static int echo_cleanup(struct obd_device *obd) +/** + * Stop the Echo Server device. + * + * This function stops the Echo Server device and all its subsystems. + * This is the end of Echo Server lifecycle. + * + * \param[in] env execution environment + * \param[in] esd ESD device + */ +static void echo_srv_fini(const struct lu_env *env, + struct echo_srv_device *esd) { + struct obd_device *obd = echo_srv_obd(esd); + struct lu_device *d = &esd->esd_dev; int leaked; + ENTRY; - lprocfs_obd_cleanup(obd); - lprocfs_free_obd_stats(obd); + class_disconnect_exports(obd); + if (obd->obd_namespace != NULL) + ldlm_namespace_free_prior(obd->obd_namespace, NULL, + obd->obd_force); - ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL); + obd_exports_barrier(obd); + obd_zombie_barrier(); - /* XXX Bug 3413; wait for a bit to ensure the BL callback has - * happened before calling ldlm_namespace_free() */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); + tgt_fini(env, &esd->esd_lut); - ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); - obd->obd_namespace = NULL; + if (obd->obd_namespace != NULL) { + ldlm_namespace_free_post(obd->obd_namespace); + obd->obd_namespace = NULL; + } + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); leaked = atomic_read(&obd->u.echo.eo_prep); if (leaked != 0) CERROR("%d prep/commitrw pages leaked\n", leaked); - RETURN(0); + LASSERT(atomic_read(&d->ld_ref) == 0); + EXIT; } -struct obd_ops echo_obd_ops = { - .o_owner = THIS_MODULE, - .o_connect = echo_connect, - .o_disconnect = echo_disconnect, - .o_init_export = echo_init_export, - .o_destroy_export = echo_destroy_export, - .o_create = echo_create, - .o_destroy = echo_destroy, - .o_getattr = echo_getattr, - .o_setattr = echo_setattr, - .o_preprw = echo_preprw, - .o_commitrw = echo_commitrw, - .o_setup = echo_setup, - .o_cleanup = echo_cleanup +/** + * Implementation of lu_device_type_operations::ldto_device_fini. + * + * Finalize device. Dual to echo_srv_device_init(). It is called from + * obd_precleanup() and stops the current device. + * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + ENTRY; + echo_srv_fini(env, echo_srv_dev(d)); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_free. + * + * Free Echo Server device. Dual to echo_srv_device_alloc(). 
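+ * It finalizes the embedded lu_device and frees the echo_srv_device allocation.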
+ * + * \param[in] env execution environment + * \param[in] d LU device of ESD + * + * \retval NULL + */ +static struct lu_device *echo_srv_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_srv_device *esd = echo_srv_dev(d); + + lu_device_fini(&esd->esd_dev); + OBD_FREE_PTR(esd); + RETURN(NULL); +} + +/** + * Implementation of lu_device_type_operations::ldto_device_alloc. + * + * This function allocates the new Echo Server device. It is called from + * obd_setup() if OBD device had lu_device_type defined. + * + * \param[in] env execution environment + * \param[in] t lu_device_type of ESD device + * \param[in] cfg configuration log + * + * \retval pointer to the lu_device of just allocated OFD + * \retval ERR_PTR of return value on error + */ +static struct lu_device *echo_srv_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct echo_srv_device *esd; + struct lu_device *l; + int rc; + + OBD_ALLOC_PTR(esd); + if (esd == NULL) + return ERR_PTR(-ENOMEM); + + l = &esd->esd_dev; + lu_device_init(l, t); + rc = echo_srv_init0(env, esd, t, cfg); + if (rc != 0) { + echo_srv_device_free(env, l); + l = ERR_PTR(rc); + } + + return l; +} + +static const struct lu_device_type_operations echo_srv_type_ops = { + .ldto_device_alloc = echo_srv_device_alloc, + .ldto_device_free = echo_srv_device_free, + .ldto_device_fini = echo_srv_device_fini +}; + +struct lu_device_type echo_srv_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_ECHO_NAME, + .ldt_ops = &echo_srv_type_ops, + .ldt_ctx_tags = LCT_DT_THREAD, }; void echo_persistent_pages_fini(void) diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c index 26065b110e592..b9357e77b980f 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_client.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #ifdef HAVE_SERVER_SUPPORT # include @@ -328,7 +328,8 @@ static void echo_page_completion(const struct lu_env *env, } static void echo_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) + struct cl_page_slice *slice, + struct pagevec *pvec) { struct echo_object *eco = cl2echo_obj(slice->cpl_obj); ENTRY; @@ -506,11 +507,18 @@ static int echo_object_init(const struct lu_env *env, struct lu_object *obj, RETURN(0); } -static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +static void echo_object_delete(const struct lu_env *env, struct lu_object *obj) { - struct echo_object *eco = cl2echo_obj(lu2cl(obj)); - struct echo_client_obd *ec = eco->eo_dev->ed_ec; - ENTRY; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec; + + ENTRY; + + /* object delete called unconditolally - layer init or not */ + if (eco->eo_dev == NULL) + return; + + ec = eco->eo_dev->ed_ec; LASSERT(atomic_read(&eco->eo_npages) == 0); @@ -518,11 +526,18 @@ static void echo_object_free(const struct lu_env *env, struct lu_object *obj) list_del_init(&eco->eo_obj_chain); spin_unlock(&ec->ec_lock); - lu_object_fini(obj); - lu_object_header_fini(obj->lo_header); - if (eco->eo_oinfo != NULL) OBD_FREE_PTR(eco->eo_oinfo); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + + ENTRY; + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); OBD_SLAB_FREE_PTR(eco, echo_object_kmem); EXIT; @@ -537,12 +552,12 @@ static int echo_object_print(const struct lu_env *env, void *cookie, } static const struct lu_object_operations echo_lu_obj_ops = { - .loo_object_init = echo_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = echo_object_free, - .loo_object_print = echo_object_print, - .loo_object_invariant = NULL + .loo_object_init = echo_object_init, + .loo_object_delete = echo_object_delete, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL }; /** @} echo_lu_ops */ @@ -962,19 +977,18 @@ static struct lu_device *echo_device_alloc(const struct lu_env *env, CERROR("Cleanup obd device %s error(%d)\n", obd->obd_name, rc2); } - /* Fall through */ + fallthrough; case 3: echo_site_fini(env, ed); - /* Fall through */ + fallthrough; case 2: cl_device_fini(&ed->ed_cl); - /* Fall through */ + fallthrough; case 1: OBD_FREE_PTR(ed); - /* Fall through */ + fallthrough; case 0: - /* Fall through */ default: break; } @@ -1714,7 +1728,7 @@ static int echo_create_md_object(const struct lu_env *env, memset(spec, 0, sizeof(*spec)); echo_set_lmm_size(env, ld, ma); if (stripe_count != 0) { - spec->sp_cr_flags |= FMODE_WRITE; + spec->sp_cr_flags |= MDS_FMODE_WRITE; if (stripe_count != -1) { if (S_ISDIR(mode)) { struct lmv_user_md *lmu; @@ -1742,7 +1756,7 @@ static int echo_create_md_object(const struct lu_env *env, ma->ma_attr.la_mode = mode; ma->ma_attr.la_valid = LA_CTIME | LA_MODE; - ma->ma_attr.la_ctime = cfs_time_current_64(); + ma->ma_attr.la_ctime = ktime_get_real_seconds(); if (name != NULL) { lname->ln_name = name; @@ -2085,7 +2099,7 @@ static int echo_destroy_object(const struct lu_env *env, memset(ma, 0, sizeof(*ma)); ma->ma_attr.la_mode = mode; ma->ma_attr.la_valid = LA_CTIME; - ma->ma_attr.la_ctime = cfs_time_current_64(); + 
ma->ma_attr.la_ctime = ktime_get_real_seconds(); ma->ma_need = MA_INODE; ma->ma_valid = 0; @@ -2579,11 +2593,11 @@ static int echo_client_prep_commit(const struct lu_env *env, u64 offset, u64 count, u64 batch, int async) { - struct obd_ioobj ioo; - struct niobuf_local *lnb; - struct niobuf_remote rnb; - u64 off; - u64 npages, tot_pages, apc; + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote rnb; + u64 off; + u64 npages, tot_pages, apc; int i, ret = 0, brw_flags = 0; ENTRY; @@ -2594,7 +2608,7 @@ static int echo_client_prep_commit(const struct lu_env *env, apc = npages = batch >> PAGE_SHIFT; tot_pages = count >> PAGE_SHIFT; - OBD_ALLOC(lnb, apc * sizeof(struct niobuf_local)); + OBD_ALLOC_LARGE(lnb, apc * sizeof(struct niobuf_local)); if (lnb == NULL) RETURN(-ENOMEM); @@ -2660,7 +2674,7 @@ static int echo_client_prep_commit(const struct lu_env *env, } out: - OBD_FREE(lnb, apc * sizeof(struct niobuf_local)); + OBD_FREE_LARGE(lnb, apc * sizeof(struct niobuf_local)); RETURN(ret); } @@ -2703,7 +2717,7 @@ static int echo_client_brw_ioctl(const struct lu_env *env, int rw, switch (test_mode) { case 1: - /* fall through */ + fallthrough; case 2: rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset, data->ioc_count, async); @@ -2762,6 +2776,9 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = lu_env_init(env, LCT_DT_THREAD); if (rc) GOTO(out_alloc, rc = -ENOMEM); + lu_env_add(env); + if (rc) + GOTO(out_env_fini, rc = -ENOMEM); #ifdef HAVE_SERVER_SUPPORT env->le_ses = &echo_session; @@ -2886,7 +2903,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, GOTO (out, rc = -EPERM); rw = OBD_BRW_WRITE; - /* fall through */ + fallthrough; case OBD_IOC_BRW_READ: rc = echo_client_brw_ioctl(env, rw, exp, data); GOTO(out, rc); @@ -2903,6 +2920,8 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, lu_context_fini(env->le_ses); out_env: #endif + lu_env_remove(env); +out_env_fini: lu_env_fini(env); out_alloc: OBD_FREE_PTR(env); @@ -3072,15 +3091,15 @@ static int __init obdecho_init(void) goto failed_0; rc = class_register_type(&echo_obd_ops, NULL, true, NULL, - LUSTRE_ECHO_NAME, NULL); + LUSTRE_ECHO_NAME, &echo_srv_type); if (rc != 0) goto failed_1; # endif rc = lu_kmem_init(echo_caches); if (rc == 0) { - rc = class_register_type(&echo_client_obd_ops, NULL, true, NULL, - LUSTRE_ECHO_CLIENT_NAME, + rc = class_register_type(&echo_client_obd_ops, NULL, false, + NULL, LUSTRE_ECHO_CLIENT_NAME, &echo_device_type); if (rc) lu_kmem_fini(echo_caches); diff --git a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h index 8c72c40ebb767..469d68e94f02f 100644 --- a/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h +++ b/drivers/staging/lustrefsx/lustre/obdecho/echo_internal.h @@ -45,6 +45,7 @@ #ifdef HAVE_SERVER_SUPPORT extern struct obd_ops echo_obd_ops; +extern struct lu_device_type echo_srv_type; int echo_persistent_pages_init(void); void echo_persistent_pages_fini(void); #endif /* HAVE_SERVER_SUPPORT */ diff --git a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c index d6123c61af113..ab8cfca3601eb 100644 --- a/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustrefsx/lustre/osc/lproc_osc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. 
+ * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,69 +37,78 @@ #include #include #include +#include + #include "osc_internal.h" -#ifdef CONFIG_PROC_FS -static int osc_active_seq_show(struct seq_file *m, void *v) +static ssize_t active_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + int rc; LPROCFS_CLIMP_CHECK(dev); - seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive); + rc = sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); LPROCFS_CLIMP_EXIT(dev); - return 0; + return rc; } -static ssize_t osc_active_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t active_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - if (val < 0 || val > 1) - return -ERANGE; /* opposite senses */ if (dev->u.cli.cl_import->imp_deactive == val) rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); else - CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", - (int)val); + CDEBUG(D_CONFIG, "activate %u: ignoring repeat request\n", + (unsigned int)val); return count; } -LPROC_SEQ_FOPS(osc_active); +LUSTRE_RW_ATTR(active); -static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v) +static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + ssize_t len; spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight); + len = sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); spin_unlock(&cli->cl_loi_list_lock); - return 0; + return len; } -static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; - int rc; int adding, added, req_count; - __s64 val; + unsigned int val; + int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 1 || val > OSC_MAX_RIF_MAX) + + if (val == 0 || val > OSC_MAX_RIF_MAX) return -ERANGE; LPROCFS_CLIMP_CHECK(dev); @@ -126,41 +135,42 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, LPROCFS_CLIMP_EXIT(dev); return count; } -LPROC_SEQ_FOPS(osc_max_rpcs_in_flight); +LUSTRE_RW_ATTR(max_rpcs_in_flight); -static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v) +static ssize_t max_dirty_mb_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - long val; - int mult; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = 
&dev->u.cli; + unsigned long val; spin_lock(&cli->cl_loi_list_lock); - val = cli->cl_dirty_max_pages; + val = PAGES_TO_MiB(cli->cl_dirty_max_pages); spin_unlock(&cli->cl_loi_list_lock); - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_seq_read_frac_helper(m, val, mult); + return sprintf(buf, "%lu\n", val); } -static ssize_t osc_max_dirty_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t max_dirty_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + unsigned long pages_number, max_dirty_mb; int rc; - __s64 pages_number; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = kstrtoul(buffer, 10, &max_dirty_mb); if (rc) return rc; - pages_number >>= PAGE_SHIFT; + pages_number = MiB_TO_PAGES(max_dirty_mb); - if (pages_number <= 0 || - pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || + if (pages_number >= MiB_TO_PAGES(OSC_MAX_DIRTY_MB_MAX) || pages_number > cfs_totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; @@ -171,7 +181,12 @@ static ssize_t osc_max_dirty_mb_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(osc_max_dirty_mb); +LUSTRE_RW_ATTR(max_dirty_mb); + +LUSTRE_ATTR(ost_conn_uuid, 0444, conn_uuid_show, NULL); +LUSTRE_RO_ATTR(conn_uuid); + +LUSTRE_RW_ATTR(ping); static int osc_cached_mb_seq_show(struct seq_file *m, void *v) { @@ -191,9 +206,9 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v) } /* shrink the number of caching pages to a specific number */ -static ssize_t -osc_cached_mb_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t osc_cached_mb_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; @@ -204,14 +219,13 @@ osc_cached_mb_seq_write(struct file *file, const char __user *buffer, if (count >= sizeof(kernbuf)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = 0; buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - kernbuf; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, - &pages_number, 'M'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &pages_number, 'M'); if (rc) return rc; @@ -234,19 +248,25 @@ osc_cached_mb_seq_write(struct file *file, const char __user *buffer, return count; } + LPROC_SEQ_FOPS(osc_cached_mb); -static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v) +static ssize_t cur_dirty_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + ssize_t len; spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); + len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); spin_unlock(&cli->cl_loi_list_lock); - return 0; + + return len; } -LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes); +LUSTRE_RO_ATTR(cur_dirty_bytes); static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v) { @@ -265,17 +285,17 @@ static ssize_t 
osc_cur_grant_bytes_seq_write(struct file *file, { struct obd_device *obd = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &obd->u.cli; - int rc; - __s64 val; + s64 val; + int rc; if (obd == NULL) return 0; - rc = lprocfs_str_with_units_to_s64(file, buffer, count, &val, '1'); + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); if (rc) return rc; if (val < 0) - return -ERANGE; + return val; /* this is only for shrinking grant */ spin_lock(&cli->cl_loi_list_lock); @@ -290,102 +310,89 @@ static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) rc = osc_shrink_grant_to_target(cli, val); LPROCFS_CLIMP_EXIT(obd); - if (rc) - return rc; - return count; -} -LPROC_SEQ_FOPS(osc_cur_grant_bytes); - -static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%lu\n", cli->cl_lost_grant); - spin_unlock(&cli->cl_loi_list_lock); - return 0; + return rc ? rc : count; } -LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); +LPROC_SEQ_FOPS(osc_cur_grant_bytes); -static int osc_cur_dirty_grant_bytes_seq_show(struct seq_file *m, void *v) +static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); struct client_obd *cli = &dev->u.cli; + ssize_t len; spin_lock(&cli->cl_loi_list_lock); - seq_printf(m, "%lu\n", cli->cl_dirty_grant); + len = sprintf(buf, "%lu\n", cli->cl_lost_grant); spin_unlock(&cli->cl_loi_list_lock); - return 0; + return len; } -LPROC_SEQ_FOPS_RO(osc_cur_dirty_grant_bytes); +LUSTRE_RO_ATTR(cur_lost_grant_bytes); -static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) +static ssize_t grant_shrink_interval_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - if (obd == NULL) - return 0; - seq_printf(m, "%d\n", - obd->u.cli.cl_grant_shrink_interval); - return 0; + return sprintf(buf, "%lld\n", obd->u.cli.cl_grant_shrink_interval); } -static ssize_t osc_grant_shrink_interval_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t grant_shrink_interval_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - if (obd == NULL) - return 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val <= 0 || val > INT_MAX) + if (val == 0) return -ERANGE; obd->u.cli.cl_grant_shrink_interval = val; return count; } -LPROC_SEQ_FOPS(osc_grant_shrink_interval); +LUSTRE_RW_ATTR(grant_shrink_interval); -static int osc_checksum_seq_show(struct seq_file *m, void *v) +static ssize_t checksums_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - if (obd == NULL) - return 0; - - seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 
1 : 0); - return 0; + return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); } -static ssize_t osc_checksum_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t checksums_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - if (obd == NULL) - return 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - obd->u.cli.cl_checksum = !!val; + obd->u.cli.cl_checksum = val; return count; } -LPROC_SEQ_FOPS(osc_checksum); +LUSTRE_RW_ATTR(checksums); static int osc_checksum_type_seq_show(struct seq_file *m, void *v) { @@ -422,7 +429,7 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, if (count > sizeof(kernbuf) - 1) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; if (count > 0 && kernbuf[count - 1] == '\n') kernbuf[count - 1] = '\0'; @@ -441,139 +448,147 @@ static ssize_t osc_checksum_type_seq_write(struct file *file, } LPROC_SEQ_FOPS(osc_checksum_type); -static int osc_resend_count_seq_show(struct seq_file *m, void *v) +static ssize_t resend_count_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends)); - return 0; + return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); } -static ssize_t osc_resend_count_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t resend_count_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 10, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -EINVAL; - atomic_set(&obd->u.cli.cl_resends, val); return count; } -LPROC_SEQ_FOPS(osc_resend_count); +LUSTRE_RW_ATTR(resend_count); -static int osc_checksum_dump_seq_show(struct seq_file *m, void *v) +static ssize_t checksum_dump_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - - if (obd == NULL) - return 0; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); - seq_printf(m, "%d\n", obd->u.cli.cl_checksum_dump ? 1 : 0); - return 0; + return sprintf(buf, "%d\n", obd->u.cli.cl_checksum_dump ? 
1 : 0); } -static ssize_t osc_checksum_dump_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t checksum_dump_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; int rc; - __s64 val; - obd = ((struct seq_file *)file->private_data)->private; - if (obd == NULL) - return 0; - - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - obd->u.cli.cl_checksum_dump = (val ? 1 : 0); + obd->u.cli.cl_checksum_dump = val; return count; } -LPROC_SEQ_FOPS(osc_checksum_dump); +LUSTRE_RW_ATTR(checksum_dump); -static int osc_contention_seconds_seq_show(struct seq_file *m, void *v) +static ssize_t contention_seconds_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); - seq_printf(m, "%u\n", od->od_contention_time); - return 0; + return sprintf(buf, "%lld\n", od->od_contention_time); } -static ssize_t osc_contention_seconds_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t contention_seconds_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + unsigned int val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; od->od_contention_time = val; return count; } -LPROC_SEQ_FOPS(osc_contention_seconds); +LUSTRE_RW_ATTR(contention_seconds); -static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v) +static ssize_t lockless_truncate_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); - seq_printf(m, "%u\n", od->od_lockless_truncate); - return 0; + return sprintf(buf, "%u\n", od->od_lockless_truncate); } -static ssize_t osc_lockless_truncate_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t lockless_truncate_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct osc_device *od = obd2osc_dev(obd); + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct osc_device *od = obd2osc_dev(obd); + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool(buffer, &val); if (rc) return rc; - if (val < 0) - return -ERANGE; - od->od_lockless_truncate = !!val; + od->od_lockless_truncate = val; return count; } -LPROC_SEQ_FOPS(osc_lockless_truncate); +LUSTRE_RW_ATTR(lockless_truncate); -static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v) +static ssize_t 
destroys_in_flight_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *obd = m->private; - seq_printf(m, "%u\n", - atomic_read(&obd->u.cli.cl_destroy_in_flight)); - return 0; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); } -LPROC_SEQ_FOPS_RO(osc_destroys_in_flight); +LUSTRE_RO_ATTR(destroys_in_flight); LPROC_SEQ_FOPS_RW_TYPE(osc, obd_max_pages_per_rpc); +LUSTRE_RW_ATTR(short_io_bytes); + +#ifdef CONFIG_PROC_FS static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) { struct obd_device *dev = m->private; @@ -591,84 +606,154 @@ static int osc_unstable_stats_seq_show(struct seq_file *m, void *v) } LPROC_SEQ_FOPS_RO(osc_unstable_stats); -LPROC_SEQ_FOPS_RO_TYPE(osc, uuid); +static ssize_t idle_timeout_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &obd->u.cli; + int ret; + + LPROCFS_CLIMP_CHECK(obd); + ret = sprintf(buf, "%u\n", cli->cl_import->imp_idle_timeout); + LPROCFS_CLIMP_EXIT(obd); + + return ret; +} + +static ssize_t idle_timeout_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request *req; + unsigned int idle_debug = 0; + unsigned int val; + int rc; + + if (strncmp(buffer, "debug", 5) == 0) { + idle_debug = D_CONSOLE; + } else if (strncmp(buffer, "nodebug", 6) == 0) { + idle_debug = D_HA; + } else { + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + if (val > CONNECTION_SWITCH_MAX) + return -ERANGE; + } + + LPROCFS_CLIMP_CHECK(dev); + if (idle_debug) { + cli->cl_import->imp_idle_debug = idle_debug; + } else { + if (!val) { + /* initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, + &RQF_OST_STATFS); + if (req != NULL) + ptlrpc_req_finished(req); + } + cli->cl_import->imp_idle_timeout = val; + } + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_RW_ATTR(idle_timeout); + +static ssize_t idle_connect_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request *req; + + LPROCFS_CLIMP_CHECK(dev); + /* to initiate the connection if it's in IDLE state */ + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_STATFS); + if (req) + ptlrpc_req_finished(req); + ptlrpc_pinger_force(cli->cl_import); + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_WO_ATTR(idle_connect); + +static ssize_t grant_shrink_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + ssize_t len; + + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + len = snprintf(buf, PAGE_SIZE, "%d\n", + !imp->imp_grant_shrink_disabled && + OCD_HAS_FLAG(&imp->imp_connect_data, GRANT_SHRINK)); + LPROCFS_CLIMP_EXIT(obd); + + return len; +} + +static ssize_t grant_shrink_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct obd_import *imp; + bool val; + int rc; + + if (dev == 
NULL) + return 0; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + LPROCFS_CLIMP_CHECK(dev); + + imp = dev->u.cli.cl_import; + spin_lock(&imp->imp_lock); + imp->imp_grant_shrink_disabled = !val; + spin_unlock(&imp->imp_lock); + + LPROCFS_CLIMP_EXIT(dev); + + return count; +} +LUSTRE_RW_ATTR(grant_shrink); + LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(osc, blksize); -LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal); -LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree); -LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail); -LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal); -LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree); LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); LPROC_SEQ_FOPS_RO_TYPE(osc, state); -LPROC_SEQ_FOPS_WO_TYPE(osc, ping); - LPROC_SEQ_FOPS_RW_TYPE(osc, import); LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); struct lprocfs_vars lprocfs_osc_obd_vars[] = { - { .name = "uuid", - .fops = &osc_uuid_fops }, - { .name = "ping", - .fops = &osc_ping_fops, - .proc_mode = 0222 }, { .name = "connect_flags", .fops = &osc_connect_flags_fops }, - { .name = "blocksize", - .fops = &osc_blksize_fops }, - { .name = "kbytestotal", - .fops = &osc_kbytestotal_fops }, - { .name = "kbytesfree", - .fops = &osc_kbytesfree_fops }, - { .name = "kbytesavail", - .fops = &osc_kbytesavail_fops }, - { .name = "filestotal", - .fops = &osc_filestotal_fops }, - { .name = "filesfree", - .fops = &osc_filesfree_fops }, { .name = "ost_server_uuid", .fops = &osc_server_uuid_fops }, - { .name = "ost_conn_uuid", - .fops = &osc_conn_uuid_fops }, - { .name = "active", - .fops = &osc_active_fops }, - { .name = "max_pages_per_rpc", - .fops = &osc_obd_max_pages_per_rpc_fops }, - { .name = "max_rpcs_in_flight", - .fops = &osc_max_rpcs_in_flight_fops }, - { .name = "destroys_in_flight", - .fops = &osc_destroys_in_flight_fops }, - { .name = "max_dirty_mb", - .fops = &osc_max_dirty_mb_fops }, + { .name = "max_pages_per_rpc", + .fops = &osc_obd_max_pages_per_rpc_fops }, { .name = "osc_cached_mb", .fops = &osc_cached_mb_fops }, - { .name = "cur_dirty_bytes", - .fops = &osc_cur_dirty_bytes_fops }, - { .name = "cur_grant_bytes", - .fops = &osc_cur_grant_bytes_fops }, - { .name = "cur_lost_grant_bytes", - .fops = &osc_cur_lost_grant_bytes_fops }, - { .name = "cur_dirty_grant_bytes", - .fops = &osc_cur_dirty_grant_bytes_fops }, - { .name = "grant_shrink_interval", - .fops = &osc_grant_shrink_interval_fops }, - { .name = "checksums", - .fops = &osc_checksum_fops }, + { .name = "cur_grant_bytes", + .fops = &osc_cur_grant_bytes_fops }, { .name = "checksum_type", .fops = &osc_checksum_type_fops }, - { .name = "checksum_dump", - .fops = &osc_checksum_dump_fops }, - { .name = "resend_count", - .fops = &osc_resend_count_fops }, { .name = "timeouts", .fops = &osc_timeouts_fops }, - { .name = "contention_seconds", - .fops = &osc_contention_seconds_fops }, - { .name = "lockless_truncate", - .fops = &osc_lockless_truncate_fops }, { .name = "import", .fops = &osc_import_fops }, { .name = "state", @@ -680,8 +765,6 @@ struct lprocfs_vars lprocfs_osc_obd_vars[] = { { NULL } }; -#define pct(a,b) (b ? 
a * 100 / b : 0) - static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) { struct timespec64 now; @@ -720,7 +803,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", 1 << i, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), @@ -743,7 +826,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", i, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), @@ -766,10 +849,10 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; read_cum += r; write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - (i == 0) ? 0 : 1 << (i - 1), - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); + seq_printf(seq, "%d:\t\t%10lu %3u %3u | %10lu %3u %3u\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); if (read_cum == read_tot && write_cum == write_tot) break; } @@ -778,7 +861,6 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) return 0; } -#undef pct static ssize_t osc_rpc_stats_seq_write(struct file *file, const char __user *buf, @@ -832,7 +914,7 @@ static ssize_t osc_stats_seq_write(struct file *file, LPROC_SEQ_FOPS(osc_stats); -int lproc_osc_attach_seqstat(struct obd_device *dev) +int lprocfs_osc_attach_seqstat(struct obd_device *dev) { int rc; @@ -845,3 +927,77 @@ int lproc_osc_attach_seqstat(struct obd_device *dev) return rc; } #endif /* CONFIG_PROC_FS */ + +static struct attribute *osc_attrs[] = { + &lustre_attr_active.attr, + &lustre_attr_checksums.attr, + &lustre_attr_checksum_dump.attr, + &lustre_attr_contention_seconds.attr, + &lustre_attr_cur_dirty_bytes.attr, + &lustre_attr_cur_lost_grant_bytes.attr, + &lustre_attr_destroys_in_flight.attr, + &lustre_attr_grant_shrink_interval.attr, + &lustre_attr_lockless_truncate.attr, + &lustre_attr_max_dirty_mb.attr, + &lustre_attr_max_rpcs_in_flight.attr, + &lustre_attr_short_io_bytes.attr, + &lustre_attr_resend_count.attr, + &lustre_attr_ost_conn_uuid.attr, + &lustre_attr_conn_uuid.attr, + &lustre_attr_ping.attr, + &lustre_attr_idle_timeout.attr, + &lustre_attr_idle_connect.attr, + &lustre_attr_grant_shrink.attr, + NULL, +}; + +int osc_tunables_init(struct obd_device *obd) +{ +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + struct obd_type *type; +#endif + int rc; + + obd->obd_vars = lprocfs_osc_obd_vars; +#if defined(CONFIG_PROC_FS) && defined(HAVE_SERVER_SUPPORT) + /* If this is true then both client (osc) and server (osp) are on the + * same node. The osp layer if loaded first will register the osc proc + * directory. In that case this obd_device will be attached its proc + * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. 
+ */ + type = class_search_type(LUSTRE_OSP_NAME); + if (type && type->typ_procsym) { + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + type->typ_procsym, + obd->obd_vars, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + obd->obd_proc_entry = NULL; + } + } +#endif + obd->obd_ktype.default_attrs = osc_attrs; + rc = lprocfs_obd_setup(obd, false); + if (rc) + return rc; +#ifdef CONFIG_PROC_FS + /* If the basic OSC proc tree construction succeeded then + * lets do the rest. + */ + rc = lprocfs_osc_attach_seqstat(obd); + if (rc) + goto obd_cleanup; + +#endif /* CONFIG_PROC_FS */ + rc = sptlrpc_lprocfs_cliobd_attach(obd); + if (rc) + goto obd_cleanup; + + ptlrpc_lprocfs_register_obd(obd); +obd_cleanup: + if (rc) + lprocfs_obd_cleanup(obd); + return rc; +} diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c index 178340e255ac9..4bae208d145f5 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_cache.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_cache.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * */ /* @@ -37,7 +37,9 @@ #define DEBUG_SUBSYSTEM S_OSC -#include "osc_cl_internal.h" +#include +#include + #include "osc_internal.h" static int extent_debug; /* set it to be true for more debug */ @@ -214,7 +216,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, GOTO(out, rc = 60); if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) GOTO(out, rc = 65); - /* Fall through */ + fallthrough; default: if (atomic_read(&ext->oe_users) > 0) GOTO(out, rc = 70); @@ -226,7 +228,9 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, if (ext->oe_sync && ext->oe_grants > 0) GOTO(out, rc = 90); - if (ext->oe_dlmlock != NULL && !ldlm_is_failed(ext->oe_dlmlock)) { + if (ext->oe_dlmlock != NULL && + ext->oe_dlmlock->l_resource->lr_type == LDLM_EXTENT && + !ldlm_is_failed(ext->oe_dlmlock)) { struct ldlm_extent *extent; extent = &ext->oe_dlmlock->l_policy_data.l_extent; @@ -592,7 +596,10 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) if (grant > 0) osc_unreserve_grant(cli, 0, grant); - if (ext->oe_urgent) + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); else if (ext->oe_nr_pages == ext->oe_mppr) { @@ -697,7 +704,7 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); - if (chunk > ext_chk_end + 1) + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) break; /* if covering by different locks, no chance to match */ @@ -974,6 +981,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, struct client_obd *cli = osc_cli(obj); struct osc_async_page *oap; struct osc_async_page *tmp; + struct pagevec *pvec; int pages_in_chunk = 0; int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; @@ -995,9 +1003,11 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, if (IS_ERR(env)) RETURN(PTR_ERR(env)); - io = &osc_env_info(env)->oti_io; + io = osc_env_thread_io(env); io->ci_obj = cl_object_top(osc2cl(obj)); io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + 
ll_pagevec_init(pvec, 0); rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); if (rc < 0) GOTO(out, rc); @@ -1035,11 +1045,13 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, } lu_ref_del(&page->cp_reference, "truncate", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pvec); --ext->oe_nr_pages; ++nr_pages; } + pagevec_release(pvec); + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, ext->oe_nr_pages == 0), ext, "trunc_index %lu, partial %d\n", trunc_index, partial); @@ -1284,7 +1296,7 @@ static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, ENTRY; result = cl_page_make_ready(env, page, CRT_WRITE); if (result == 0) - opg->ops_submit_time = cfs_time_current(); + opg->ops_submit_time = ktime_get(); RETURN(result); } @@ -1295,7 +1307,6 @@ static int osc_refresh_count(const struct lu_env *env, pgoff_t index = osc_index(oap2osc(oap)); struct cl_object *obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int result; loff_t kms; @@ -1341,7 +1352,7 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, /* Clear opg->ops_transfer_pinned before VM lock is released. */ opg->ops_transfer_pinned = 0; - opg->ops_submit_time = 0; + opg->ops_submit_time = ktime_set(0, 0); srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; /* statistic */ @@ -1392,7 +1403,6 @@ static void osc_consume_write_grant(struct client_obd *cli, { assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - atomic_long_inc(&obd_dirty_pages); cli->cl_dirty_pages++; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", @@ -1416,11 +1426,6 @@ static void osc_release_write_grant(struct client_obd *cli, pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_long_dec(&obd_dirty_pages); cli->cl_dirty_pages--; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - atomic_long_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit--; - } EXIT; } @@ -1526,7 +1531,7 @@ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) */ static int osc_enter_cache_try(struct client_obd *cli, struct osc_async_page *oap, - int bytes, int transient) + int bytes) { int rc; @@ -1536,31 +1541,38 @@ static int osc_enter_cache_try(struct client_obd *cli, if (rc < 0) return 0; - if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && - 1 + atomic_long_read(&obd_dirty_pages) <= obd_max_dirty_pages) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit++; - atomic_long_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + return 1; } - rc = 1; - } else { - __osc_unreserve_grant(cli, bytes, bytes); - rc = 0; + atomic_long_dec(&obd_dirty_pages); } - return rc; + __osc_unreserve_grant(cli, bytes, bytes); + return 0; } -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. 
+ */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) { - int rc; - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - return rc; + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); } +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1571,15 +1583,23 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; - struct l_wait_info lwi; - int rc = -EDQUOT; - ENTRY; + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), - NULL, LWI_ON_SIGNAL_NOOP, NULL); + ENTRY; OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); @@ -1594,76 +1614,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* Hopefully normal case - cache space and write credits available */ - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - GOTO(out, rc = 0); - } - - /* We can get here for two reasons: too many dirty pages in cache, or + /* + * We can wait here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. 
*/ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc < 0) { - /* l_wait_event is interrupted by signal or timed out */ - list_del_init(&ocw.ocw_entry); - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. + */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: + } else { OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " - "due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; + wake_up_all(&cli->cl_cache_waiters); } EXIT; out: @@ -1671,41 +1655,6 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, RETURN(rc); } -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - list_del_init(&ocw->ocw_entry); - - ocw->ocw_rc = -EDQUOT; - /* we can't dirty more */ - if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) || - (1 + atomic_long_read(&obd_dirty_pages) > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld " - "osc max %ld, sys max %ld\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages, - obd_max_dirty_pages); - goto wakeup; - } - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) - ocw->ocw_rc = 0; -wakeup: - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } - - EXIT; -} - static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = !!list_empty(&osc->oo_hp_exts); @@ -1745,8 +1694,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. 
as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. + */ + if (waitqueue_active(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -1968,6 +1918,7 @@ static int try_to_add_extent_for_io(struct client_obd *cli, if (tmp->oe_srvlock != ext->oe_srvlock || !tmp->oe_grants != !ext->oe_grants || + tmp->oe_ndelay != ext->oe_ndelay || tmp->oe_no_merge || ext->oe_no_merge) RETURN(0); @@ -2043,7 +1994,6 @@ static unsigned int get_write_extents(struct osc_object *obj, while (!list_empty(&obj->oo_hp_exts)) { ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, oe_link); - LASSERT(ext->oe_state == OES_CACHE); if (!try_to_add_extent_for_io(cli, ext, &data)) return data.erd_page_count; EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); @@ -2229,8 +2179,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!list_empty(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); @@ -2261,7 +2212,12 @@ __must_hold(&cli->cl_loi_list_lock) OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - if (osc_max_rpc_in_flight(cli, osc)) { + /* even if we have reached our max in flight RPCs, we still + * allow all high-priority RPCs through to prevent their + * starvation and leading to server evicting us for not + * writing out pages in a timely manner LU-13131 */ + if (osc_max_rpc_in_flight(cli, osc) && + list_empty(&osc->oo_hp_exts)) { __osc_list_maint(cli, osc); break; } @@ -2316,8 +2272,8 @@ __must_hold(&cli->cl_loi_list_lock) } } -static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc, int async) +int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, int async) { int rc = 0; @@ -2335,18 +2291,7 @@ static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, } return rc; } - -static int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, struct osc_object *osc) -{ - return osc_io_unplug0(env, cli, osc, 1); -} - -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) -{ - (void)osc_io_unplug0(env, cli, osc, 0); -} +EXPORT_SYMBOL(osc_io_unplug0); int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, struct page *page, loff_t offset) @@ -2366,9 +2311,6 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap->oap_obj_off = offset; LASSERT(!(offset & ~PAGE_MASK)); - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - INIT_LIST_HEAD(&oap->oap_pending_item); INIT_LIST_HEAD(&oap->oap_rpc_item); @@ -2377,6 +2319,7 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap, page, oap->oap_obj_off); RETURN(0); } +EXPORT_SYMBOL(osc_prep_async_page); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, struct osc_page *ops) @@ -2407,7 +2350,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before 
the page is queued. */ brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + if (oio->oi_cap_sys_resource || io->ci_noquota) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } @@ -2463,7 +2406,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* it doesn't need any grant to dirty this page */ spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants, 0); + rc = osc_enter_cache_try(cli, oap, grants); spin_unlock(&cli->cl_loi_list_lock); if (rc == 0) { /* try failed */ grants = 0; @@ -2540,7 +2483,11 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, ++ext->oe_nr_pages; list_add_tail(&oap->oap_pending_item, &ext->oe_pages); osc_object_unlock(osc); + + if (!ext->oe_layout_version) + ext->oe_layout_version = io->ci_layout_version; } + RETURN(rc); } @@ -2726,8 +2673,9 @@ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) RETURN(rc); } -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags) +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags) { struct client_obd *cli = osc_cli(obj); struct osc_extent *ext; @@ -2765,7 +2713,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, RETURN(-ENOMEM); } - ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_rw = !!(brw_flags & OBD_BRW_READ); ext->oe_sync = 1; ext->oe_no_merge = !can_merge; ext->oe_urgent = 1; @@ -2773,15 +2721,52 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, ext->oe_end = ext->oe_max_end = end; ext->oe_obj = obj; ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + if (brw_flags & OBD_BRW_NOCACHE && !ext->oe_rw) { /* direct io write */ + int grants; + int ppc; + + ppc = 1 << (cli->cl_chunkbits - PAGE_SHIFT); + grants = cli->cl_grant_extent_tax; + grants += (1 << cli->cl_chunkbits) * + ((page_count + ppc - 1) / ppc); + + spin_lock(&cli->cl_loi_list_lock); + if (osc_reserve_grant(cli, grants) == 0) { + list_for_each_entry(oap, list, oap_pending_item) { + osc_consume_write_grant(cli, + &oap->oap_brw_page); + atomic_long_inc(&obd_dirty_pages); + } + __osc_unreserve_grant(cli, grants, 0); + ext->oe_grants = grants; + } + spin_unlock(&cli->cl_loi_list_lock); + } ext->oe_nr_pages = page_count; ext->oe_mppr = mppr; list_splice_init(list, &ext->oe_pages); + ext->oe_layout_version = io->ci_layout_version; osc_object_lock(obj); /* Reuse the initial refcount for RPC, don't drop it */ osc_extent_state_set(ext, OES_LOCK_DONE); - if (cmd & OBD_BRW_WRITE) { - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + if (!ext->oe_rw) { /* write */ + if (!ext->oe_srvlock) { + /* The most likely case here is from lack of grants + * so we are either out of quota or out of space. + * Since this means we are holding locks across + * potentially multi-striped IO, we must send out + * everything out instantly to avoid prolonged + * waits resulting in lock eviction (likely since + * the extended wait in osc_cache_enter() did not + * yield any additional grant due to a timeout. 
+ * LU-13131 */ + ext->oe_hp = 1; + list_add_tail(&ext->oe_link, &obj->oo_hp_exts); + } else { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + } osc_update_pending(obj, OBD_BRW_WRITE, page_count); } else { list_add_tail(&ext->oe_link, &obj->oo_reading_exts); @@ -2919,6 +2904,7 @@ int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, } RETURN(result); } +EXPORT_SYMBOL(osc_cache_truncate_start); /** * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. @@ -3005,6 +2991,7 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, OSC_IO_DEBUG(obj, "sync file range.\n"); RETURN(result); } +EXPORT_SYMBOL(osc_cache_wait_range); /** * Called to write out a range of osc object. @@ -3044,7 +3031,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, EASSERT(!ext->oe_hp, ext); ext->oe_hp = 1; list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent) { + } else if (!ext->oe_urgent && !ext->oe_hp) { ext->oe_urgent = 1; list = &obj->oo_urgent_exts; } @@ -3052,10 +3039,25 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, list_move_tail(&ext->oe_link, list); unplug = true; } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; /* the only discarder is lock cancelling, so - * [start, end] must contain this extent */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); osc_extent_state_set(ext, OES_LOCKING); ext->oe_owner = current; list_move_tail(&ext->oe_link, @@ -3121,6 +3123,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); RETURN(result); } +EXPORT_SYMBOL(osc_cache_writeback_range); /** * Returns a list of pages by a given [start, end] of \a obj. @@ -3139,6 +3142,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, osc_page_gang_cbt cb, void *cbdata) { struct osc_page *ops; + struct pagevec *pagevec; void **pvec; pgoff_t idx; unsigned int nr; @@ -3150,6 +3154,8 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, idx = start; pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); spin_lock(&osc->oo_tree_lock); while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, idx, OTI_PVEC_SIZE)) > 0) { @@ -3196,8 +3202,10 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, page = ops->ops_cl.cpl_page; lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pagevec); } + pagevec_release(pagevec); + if (nr < OTI_PVEC_SIZE || end_of_region) break; @@ -3213,6 +3221,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, spin_unlock(&osc->oo_tree_lock); RETURN(res); } +EXPORT_SYMBOL(osc_page_gang_lookup); /** * Check if page @page is covered by an extra lock or discard it. 
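/*
 * Illustrative sketch (userspace C, pthreads): the rewritten
 * osc_enter_cache() above no longer queues an osc_cache_waiter; it sleeps
 * on cli->cl_cache_waiters until osc_enter_cache_try() succeeds or no
 * dirty pages/writes remain, with a timeout of roughly half the eviction
 * window, dropping cl_loi_list_lock (and unplugging queued IO) around each
 * sleep.  The model below shows only that drop-lock / timed-sleep /
 * retake-lock pattern; every name here is hypothetical and this is not the
 * kernel wait_event_idle_exclusive_timeout_cmd() API.
 */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct cache_model {
	pthread_mutex_t lock;        /* stands in for cl_loi_list_lock  */
	pthread_cond_t  waitq;       /* stands in for cl_cache_waiters  */
	long            avail_pages; /* stands in for grant/dirty room  */
};

static bool try_reserve(struct cache_model *c, long pages)
{
	if (c->avail_pages >= pages) {  /* like osc_enter_cache_try() */
		c->avail_pages -= pages;
		return true;
	}
	return false;
}

/* Returns 0 on success, -1 on timeout (the "fall back to sync i/o" case). */
static int enter_cache(struct cache_model *c, long pages, int timeout_sec)
{
	struct timespec deadline;
	int rc = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_sec;

	pthread_mutex_lock(&c->lock);
	while (!try_reserve(c, pages)) {
		/* pthread_cond_timedwait() releases the lock while asleep
		 * and retakes it before returning, mirroring the unlock/
		 * unplug and re-lock commands passed to the wait macro. */
		rc = pthread_cond_timedwait(&c->waitq, &c->lock, &deadline);
		if (rc != 0) {		/* ETIMEDOUT */
			rc = -1;
			break;
		}
	}
	pthread_mutex_unlock(&c->lock);
	return rc;
}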
@@ -3255,8 +3264,8 @@ static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, return CLP_GANG_OKAY; } -static int discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) +int osc_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) { struct osc_thread_info *info = osc_env_info(env); struct cl_page *page = ops->ops_cl.cpl_page; @@ -3278,6 +3287,7 @@ static int discard_cb(const struct lu_env *env, struct cl_io *io, return CLP_GANG_OKAY; } +EXPORT_SYMBOL(osc_discard_cb); /** * Discard pages protected by the given lock. This function traverses radix @@ -3291,7 +3301,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, pgoff_t start, pgoff_t end, bool discard) { struct osc_thread_info *info = osc_env_info(env); - struct cl_io *io = &info->oti_io; + struct cl_io *io = osc_env_thread_io(env); osc_page_gang_cbt cb; int res; int result; @@ -3304,7 +3314,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, if (result != 0) GOTO(out, result); - cb = discard ? discard_cb : check_and_discard_cb; + cb = discard ? osc_discard_cb : check_and_discard_cb; info->oti_fn_index = info->oti_next_index = start; do { res = osc_page_gang_lookup(env, io, osc, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c index c06a5deb339b7..cbddab5c0f319 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_dev.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_dev.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,19 +38,24 @@ /* class_name2obd() */ #include +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" -/** \addtogroup osc - * @{ +/** \addtogroup osc + * @{ */ struct kmem_cache *osc_lock_kmem; +EXPORT_SYMBOL(osc_lock_kmem); struct kmem_cache *osc_object_kmem; +EXPORT_SYMBOL(osc_object_kmem); + struct kmem_cache *osc_thread_kmem; struct kmem_cache *osc_session_kmem; struct kmem_cache *osc_extent_kmem; struct kmem_cache *osc_quota_kmem; +struct kmem_cache *osc_obdo_kmem; struct lu_kmem_descr osc_caches[] = { { @@ -84,21 +89,15 @@ struct lu_kmem_descr osc_caches[] = { .ckd_size = sizeof(struct osc_quota_info) }, { - .ckd_cache = NULL - } + .ckd_cache = &osc_obdo_kmem, + .ckd_name = "osc_obdo_kmem", + .ckd_size = sizeof(struct obdo) + }, + { + .ckd_cache = NULL + } }; -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct lu_device *osc2lu_dev(struct osc_device *osc) -{ - return &osc->od_cl.cd_lu_dev; -} - /***************************************************************************** * * Osc device and device type functions. @@ -130,6 +129,7 @@ struct lu_context_key osc_key = { .lct_init = osc_key_init, .lct_fini = osc_key_fini }; +EXPORT_SYMBOL(osc_key); static void *osc_session_init(const struct lu_context *ctx, struct lu_context_key *key) @@ -154,6 +154,7 @@ struct lu_context_key osc_session_key = { .lct_init = osc_session_init, .lct_fini = osc_session_fini }; +EXPORT_SYMBOL(osc_session_key); /* type constructor/destructor: osc_type_{init,fini,start,stop}(). 
*/ LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); @@ -171,27 +172,30 @@ static const struct lu_device_operations osc_lu_ops = { .ldo_recovery_complete = NULL }; -static int osc_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) +int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) { RETURN(0); } +EXPORT_SYMBOL(osc_device_init); -static struct lu_device *osc_device_fini(const struct lu_env *env, - struct lu_device *d) +struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) { return NULL; } +EXPORT_SYMBOL(osc_device_fini); -static struct lu_device *osc_device_free(const struct lu_env *env, - struct lu_device *d) +struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) { - struct osc_device *od = lu2osc_dev(d); + struct osc_device *od = lu2osc_dev(d); - cl_device_fini(lu2cl_dev(d)); - OBD_FREE_PTR(od); - return NULL; + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; } +EXPORT_SYMBOL(osc_device_free); static struct lu_device *osc_device_alloc(const struct lu_env *env, struct lu_device_type *t, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h index 24766263514a6..519a4d1f4b57e 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_internal.h +++ b/drivers/staging/lustrefsx/lustre/osc/osc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,93 +35,45 @@ #define OAP_MAGIC 8675309 +#include +#include + extern atomic_t osc_pool_req_count; extern unsigned int osc_reqpool_maxreqcount; extern struct ptlrpc_request_pool *osc_rq_pool; -struct lu_env; - -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - page is added to an rpc */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - to give the caller a chance to update - or cancel the size of the io */ - ASYNC_HP = 0x10, -}; - -struct osc_async_page { - int oap_magic; - unsigned short oap_cmd; - unsigned short oap_interrupted:1; - - struct list_head oap_pending_item; - struct list_head oap_rpc_item; - - loff_t oap_obj_off; - unsigned oap_page_off; - enum async_flags oap_async_flags; - - struct brw_page oap_brw_page; - - struct ptlrpc_request *oap_request; - struct client_obd *oap_cli; - struct osc_object *oap_obj; - - spinlock_t oap_lock; -}; - -#define oap_page oap_brw_page.pg -#define oap_count oap_brw_page.count -#define oap_brw_flags oap_brw_page.flag - -static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) -{ - return (struct osc_async_page *)container_of(pga, struct osc_async_page, - oap_brw_page); -} - -struct osc_cache_waiter { - struct list_head ocw_entry; - wait_queue_head_t ocw_waitq; - struct osc_async_page *ocw_oap; - int ocw_grant; - int ocw_rc; -}; - -void osc_wake_cache_waiters(struct client_obd *cli); int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); void osc_update_next_shrink(struct client_obd *cli); - -/* - * cl integration. 
- */ -#include +int lru_queue_work(const struct lu_env *env, void *data); +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, bool discard); extern struct ptlrpc_request_set *PTLRPCD_SET; -typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, - int rc); +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb); int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, void *cookie, struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl); + struct ptlrpc_request_set *rqset, int async, + bool speculative); -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, struct lustre_handle *lockh, int unref); int osc_setattr_async(struct obd_export *exp, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie, struct ptlrpc_request_set *rqset); -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); int osc_sync_base(struct osc_object *obj, struct obdo *oa, obd_enqueue_update_f upcall, void *cookie, struct ptlrpc_request_set *rqset); @@ -132,8 +84,6 @@ int osc_ladvise_base(struct obd_export *exp, struct obdo *oa, int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, struct list_head *ext_list, int cmd); -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force); unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); @@ -144,15 +94,36 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); int osc_cleanup(struct obd_device *obd); int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); -#ifdef CONFIG_PROC_FS -extern struct lprocfs_vars lprocfs_osc_obd_vars[]; -int lproc_osc_attach_seqstat(struct obd_device *dev); -#else -static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} -#endif +int osc_tunables_init(struct obd_device *obd); extern struct lu_device_type osc_device_type; +static inline struct cl_io *osc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &osc_env_info(env)->oti_io; + + memset(io, 0, sizeof(*io)); + return io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +int osc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int osc_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); 
+struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + static inline int osc_recoverable_error(int rc) { return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || @@ -174,41 +145,13 @@ static inline char *cli_name(struct client_obd *cli) ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif -struct osc_device { - struct cl_device od_cl; - struct obd_export *od_exp; - - /* Write stats is actually protected by client_obd's lock. */ - struct osc_stats { - uint64_t os_lockless_writes; /* by bytes */ - uint64_t os_lockless_reads; /* by bytes */ - uint64_t os_lockless_truncates; /* by times */ - } od_stats; - - /* configuration item(s) */ - int od_contention_time; - int od_lockless_truncate; -}; - -static inline struct osc_device *obd2osc_dev(const struct obd_device *d) -{ - return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); -} - -extern struct kmem_cache *osc_quota_kmem; -struct osc_quota_info { - /** linkage for quota hash table */ - struct hlist_node oqi_hash; - u32 oqi_id; -}; - struct osc_async_args { struct obd_info *aa_oi; }; int osc_quota_setup(struct obd_device *obd); int osc_quota_cleanup(struct obd_device *obd); -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], u64 valid, u32 flags); int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); int osc_quotactl(struct obd_device *unused, struct obd_export *exp, @@ -216,24 +159,14 @@ int osc_quotactl(struct obd_device *unused, struct obd_export *exp, void osc_inc_unstable_pages(struct ptlrpc_request *req); void osc_dec_unstable_pages(struct ptlrpc_request *req); bool osc_over_unstable_soft_limit(struct client_obd *cli); -/** - * Bit flags for osc_dlm_lock_at_pageoff(). - */ -enum osc_dap_flags { - /** - * Just check if the desired lock exists, it won't hold reference - * count on lock. - */ - OSC_DAP_FL_TEST_LOCK = 1 << 0, - /** - * Return the lock even if it is being canceled. 
- */ - OSC_DAP_FL_CANCELING = 1 << 1 -}; -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags flags); -void osc_pack_req_body(struct ptlrpc_request *req, struct obdo *oa); +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to); + +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags flags); + int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); /** osc shrink list to link all osc client obd */ @@ -245,4 +178,14 @@ extern unsigned long osc_cache_shrink_count(struct shrinker *sk, extern unsigned long osc_cache_shrink_scan(struct shrinker *sk, struct shrink_control *sc); +static inline void osc_set_io_portal(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + + /* Distinguish OSC from MDC here to use OST or MDS portal */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IBITS)) + req->rq_request_portal = MDS_IO_PORTAL; + else + req->rq_request_portal = OST_IO_PORTAL; +} #endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_io.c b/drivers/staging/lustrefsx/lustre/osc/osc_io.c index 38fe2532829fd..4a51b9912d72f 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_io.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,27 +38,14 @@ #define DEBUG_SUBSYSTEM S_OSC #include +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" -/** \addtogroup osc - * @{ +/** \addtogroup osc + * @{ */ -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct osc_io *cl2osc_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); - LINVRNT(oio == osc_env_io(env)); - return oio; -} - /***************************************************************************** * * io operations. @@ -69,8 +56,7 @@ static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) { } -static void osc_read_ahead_release(const struct lu_env *env, - void *cbdata) +void osc_read_ahead_release(const struct lu_env *env, void *cbdata) { struct ldlm_lock *dlmlock = cbdata; struct lustre_handle lockh; @@ -79,6 +65,7 @@ static void osc_read_ahead_release(const struct lu_env *env, ldlm_lock_decref(&lockh, LCK_PR); LDLM_LOCK_PUT(dlmlock); } +EXPORT_SYMBOL(osc_read_ahead_release); static int osc_io_read_ahead(const struct lu_env *env, const struct cl_io_slice *ios, @@ -117,9 +104,8 @@ static int osc_io_read_ahead(const struct lu_env *env, * or, if page is already submitted, changes osc flags through * osc_set_async_flags(). 
*/ -static int osc_io_submit(const struct lu_env *env, - const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) +int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) { struct cl_page *page; struct cl_page *tmp; @@ -133,7 +119,6 @@ static int osc_io_submit(const struct lu_env *env, struct cl_page_list *qout = &queue->c2_qout; unsigned int queued = 0; int result = 0; - int cmd; int brw_flags; unsigned int max_pages; @@ -145,8 +130,14 @@ static int osc_io_submit(const struct lu_env *env, cli = osc_cli(osc); max_pages = cli->cl_max_pages_per_rpc; - cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + if (crt == CRT_READ && ios->cis_io->ci_ndelay) + brw_flags |= OBD_BRW_NDELAY; + + page = cl_page_list_first(qin); + if (page->cp_type == CPT_TRANSIENT) + brw_flags |= OBD_BRW_NOCACHE; /* * NOTE: here @page is a top-level page. This is done to avoid @@ -200,7 +191,7 @@ static int osc_io_submit(const struct lu_env *env, if (++queued == max_pages) { queued = 0; - result = osc_queue_sync_pages(env, osc, &list, cmd, + result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); if (result < 0) break; @@ -208,7 +199,7 @@ static int osc_io_submit(const struct lu_env *env, } if (queued > 0) - result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); + result = osc_queue_sync_pages(env, io, osc, &list, brw_flags); /* Update c/mtime for sync write. LU-7310 */ if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) { @@ -224,36 +215,31 @@ static int osc_io_submit(const struct lu_env *env, CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); return qout->pl_nr > 0 ? 0 : result; } +EXPORT_SYMBOL(osc_io_submit); /** - * This is called when a page is accessed within file in a way that creates - * new page, if one were missing (i.e., if there were a hole at that place in - * the file, or accessed page is beyond the current file size). + * This is called to update the attributes when modifying a specific page, + * both when making new pages and when doing updates to existing cached pages. * * Expand stripe KMS if necessary. */ -static void osc_page_touch_at(const struct lu_env *env, - struct cl_object *obj, pgoff_t idx, size_t to) +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to) { - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int valid; - __u64 kms; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; - /* offset within stripe */ - kms = cl_offset(obj, idx) + to; + ENTRY; - cl_object_attr_lock(obj); - /* - * XXX old code used - * - * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); - * - * here - */ + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", - kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, - loi->loi_lvb.lvb_size); + kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); valid = CAT_MTIME | CAT_CTIME; @@ -267,12 +253,14 @@ static void osc_page_touch_at(const struct lu_env *env, } cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); + + EXIT; } -static int osc_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *qin, int from, int to, - cl_commit_cbt cb) +int osc_io_commit_async(const struct lu_env *env, + const struct cl_io_slice *ios, + struct cl_page_list *qin, int from, int to, + cl_commit_cbt cb) { struct cl_io *io = ios->cis_io; struct osc_io *oio = cl2osc_io(env, ios); @@ -306,6 +294,9 @@ static int osc_io_commit_async(const struct lu_env *env, opg = osc_cl_page_osc(page, osc); oap = &opg->ops_oap; + LASSERTF(osc == oap->oap_obj, + "obj mismatch: %p / %p\n", osc, oap->oap_obj); + if (!list_empty(&oap->oap_rpc_item)) { CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", oap, opg); @@ -341,29 +332,47 @@ static int osc_io_commit_async(const struct lu_env *env, CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); RETURN(result); } +EXPORT_SYMBOL(osc_io_commit_async); -static int osc_io_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) +static bool osc_import_not_healthy(struct obd_import *imp) +{ + return imp->imp_invalid || imp->imp_deactive || + !(imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE); +} + +int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { struct osc_object *osc = cl2osc(ios->cis_obj); struct obd_import *imp = osc_cli(osc)->cl_import; + struct osc_io *oio = osc_env_io(env); int rc = -EIO; + ENTRY; spin_lock(&imp->imp_lock); - if (likely(!imp->imp_invalid)) { - struct osc_io *oio = osc_env_io(env); - + /** + * check whether this OSC device is available for non-delay read, + * fast switching mirror if we haven't tried all mirrors. 
+ */ + if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay && + !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) { + rc = -EWOULDBLOCK; + } else if (likely(!imp->imp_invalid)) { atomic_inc(&osc->oo_nr_ios); oio->oi_is_active = 1; rc = 0; } spin_unlock(&imp->imp_lock); - return rc; + if (cfs_capable(CFS_CAP_SYS_RESOURCE)) + oio->oi_cap_sys_resource = 1; + + RETURN(rc); } +EXPORT_SYMBOL(osc_io_iter_init); -static int osc_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) +int osc_io_write_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; struct osc_io *oio = osc_env_io(env); @@ -374,17 +383,18 @@ static int osc_io_write_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(osc_io_iter_init(env, ios)); - npages = io->u.ci_rw.rw_range.cir_count >> PAGE_SHIFT; - if (io->u.ci_rw.rw_range.cir_pos & ~PAGE_MASK) + npages = io->u.ci_rw.crw_count >> PAGE_SHIFT; + if (io->u.ci_rw.crw_pos & ~PAGE_MASK) ++npages; oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); RETURN(osc_io_iter_init(env, ios)); } +EXPORT_SYMBOL(osc_io_write_iter_init); -static void osc_io_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) +void osc_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) { struct osc_io *oio = osc_env_io(env); @@ -397,9 +407,10 @@ static void osc_io_iter_fini(const struct lu_env *env, wake_up_all(&osc->oo_io_waitq); } } +EXPORT_SYMBOL(osc_io_iter_fini); -static void osc_io_write_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) +void osc_io_write_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) { struct osc_io *oio = osc_env_io(env); struct osc_object *osc = cl2osc(ios->cis_obj); @@ -412,9 +423,9 @@ static void osc_io_write_iter_fini(const struct lu_env *env, osc_io_iter_fini(env, ios); } +EXPORT_SYMBOL(osc_io_write_iter_fini); -static int osc_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) +int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios) { struct cl_io *io; struct cl_fault_io *fio; @@ -434,6 +445,8 @@ static int osc_io_fault_start(const struct lu_env *env, fio->ft_index, fio->ft_nob); RETURN(0); } +EXPORT_SYMBOL(osc_io_fault_start); + static int osc_async_upcall(void *a, int rc) { @@ -497,10 +510,11 @@ static int osc_io_setattr_start(const struct lu_env *env, struct obdo *oa = &oio->oi_oa; struct osc_async_cbargs *cbargs = &oio->oi_cbarg; __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - unsigned int ia_valid = io->u.ci_setattr.sa_valid; - int result = 0; - ENTRY; + unsigned int ia_avalid = io->u.ci_setattr.sa_avalid; + enum op_xvalid ia_xvalid = io->u.ci_setattr.sa_xvalid; + int result = 0; + ENTRY; /* truncate cache dirty pages first */ if (cl_io_is_trunc(io)) result = osc_cache_truncate_start(env, cl2osc(obj), size, @@ -513,19 +527,20 @@ static int osc_io_setattr_start(const struct lu_env *env, struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; unsigned int cl_valid = 0; - if (ia_valid & ATTR_SIZE) { - attr->cat_size = attr->cat_kms = size; + if (ia_avalid & ATTR_SIZE) { + attr->cat_size = size; + attr->cat_kms = size; cl_valid = (CAT_SIZE | CAT_KMS); } - if (ia_valid & ATTR_MTIME_SET) { + if (ia_avalid & ATTR_MTIME_SET) { attr->cat_mtime = lvb->lvb_mtime; cl_valid |= CAT_MTIME; } - if (ia_valid & ATTR_ATIME_SET) { + if (ia_avalid & ATTR_ATIME_SET) { attr->cat_atime = lvb->lvb_atime; cl_valid |= CAT_ATIME; } - if (ia_valid & 
ATTR_CTIME_SET) { + if (ia_xvalid & OP_XVALID_CTIME_SET) { attr->cat_ctime = lvb->lvb_ctime; cl_valid |= CAT_CTIME; } @@ -542,42 +557,47 @@ static int osc_io_setattr_start(const struct lu_env *env, oa->o_layout = io->u.ci_setattr.sa_layout; oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLOSTLAYOUT; - if (ia_valid & ATTR_CTIME) { + if (ia_avalid & ATTR_CTIME) { oa->o_valid |= OBD_MD_FLCTIME; oa->o_ctime = attr->cat_ctime; } - if (ia_valid & ATTR_ATIME) { + if (ia_avalid & ATTR_ATIME) { oa->o_valid |= OBD_MD_FLATIME; oa->o_atime = attr->cat_atime; } - if (ia_valid & ATTR_MTIME) { + if (ia_avalid & ATTR_MTIME) { oa->o_valid |= OBD_MD_FLMTIME; oa->o_mtime = attr->cat_mtime; } - if (ia_valid & ATTR_SIZE) { - oa->o_size = size; - oa->o_blocks = OBD_OBJECT_EOF; - oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - - if (oio->oi_lockless) { - oa->o_flags = OBD_FL_SRVLOCK; - oa->o_valid |= OBD_MD_FLFLAGS; - } - } else { - LASSERT(oio->oi_lockless == 0); - } + if (ia_avalid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + + if (io->ci_layout_version > 0) { + /* verify layout version */ + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + oa->o_layout_version = io->ci_layout_version; + } + } else { + LASSERT(oio->oi_lockless == 0); + } - if (ia_valid & ATTR_ATTR_FLAG) { + if (ia_xvalid & OP_XVALID_FLAGS) { oa->o_flags = io->u.ci_setattr.sa_attr_flags; oa->o_valid |= OBD_MD_FLFLAGS; } init_completion(&cbargs->opc_sync); - if (ia_valid & ATTR_SIZE) - result = osc_punch_base(osc_export(cl2osc(obj)), - oa, osc_async_upcall, - cbargs, PTLRPCD_SET); + if (ia_avalid & ATTR_SIZE) + result = osc_punch_send(osc_export(cl2osc(obj)), + oa, osc_async_upcall, cbargs); else result = osc_setattr_async(osc_export(cl2osc(obj)), oa, osc_async_upcall, @@ -589,37 +609,50 @@ static int osc_io_setattr_start(const struct lu_env *env, RETURN(result); } -static void osc_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *slice) +void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_io *io = slice->cis_io; struct osc_io *oio = cl2osc_io(env, slice); struct cl_object *obj = slice->cis_obj; struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - int result = 0; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + unsigned int cl_valid = 0; + int result = 0; if (cbargs->opc_rpc_sent) { wait_for_completion(&cbargs->opc_sync); result = io->ci_result = cbargs->opc_rc; } - if (result == 0) { - if (oio->oi_lockless) { - /* lockless truncate */ - struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - - LASSERT(cl_io_is_trunc(io)); - /* XXX: Need a lock. */ - osd->od_stats.os_lockless_truncates++; - } - } + + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. 
*/ + osd->od_stats.os_lockless_truncates++; + } + } if (cl_io_is_trunc(io)) { __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + cl_object_attr_lock(obj); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + cl_valid |= CAT_BLOCKS; + } + + cl_object_attr_update(env, obj, attr, cl_valid); + cl_object_attr_unlock(obj); osc_trunc_check(env, io, oio, size); osc_cache_truncate_end(env, oio->oi_trunc); oio->oi_trunc = NULL; } } +EXPORT_SYMBOL(osc_io_setattr_end); struct osc_data_version_args { struct osc_io *dva_oio; @@ -716,18 +749,23 @@ static void osc_io_data_version_end(const struct lu_env *env, if (cbargs->opc_rc != 0) { slice->cis_io->ci_result = cbargs->opc_rc; - } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) { - slice->cis_io->ci_result = -EOPNOTSUPP; } else { - dv->dv_data_version = oio->oi_oa.o_data_version; slice->cis_io->ci_result = 0; + if (!(oio->oi_oa.o_valid & + (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION))) + slice->cis_io->ci_result = -ENOTSUPP; + + if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION) + dv->dv_layout_version = oio->oi_oa.o_layout_version; + if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION) + dv->dv_data_version = oio->oi_oa.o_data_version; } EXIT; } -static int osc_io_read_start(const struct lu_env *env, - const struct cl_io_slice *slice) +int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_object *obj = slice->cis_obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -743,9 +781,10 @@ static int osc_io_read_start(const struct lu_env *env, RETURN(rc); } +EXPORT_SYMBOL(osc_io_read_start); -static int osc_io_write_start(const struct lu_env *env, - const struct cl_io_slice *slice) +int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_object *obj = slice->cis_obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -760,9 +799,10 @@ static int osc_io_write_start(const struct lu_env *env, RETURN(rc); } +EXPORT_SYMBOL(osc_io_write_start); -static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, - struct cl_fsync_io *fio) +int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) { struct osc_io *oio = osc_env_io(env); struct obdo *oa = &oio->oi_oa; @@ -787,9 +827,10 @@ static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); RETURN(rc); } +EXPORT_SYMBOL(osc_fsync_ost); -static int osc_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *slice) +int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_io *io = slice->cis_io; struct cl_fsync_io *fio = &io->u.ci_fsync; @@ -828,8 +869,8 @@ static int osc_io_fsync_start(const struct lu_env *env, RETURN(result); } -static void osc_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *slice) +void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) { struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; struct cl_object *obj = slice->cis_obj; @@ -849,6 +890,7 @@ static void osc_io_fsync_end(const struct lu_env *env, } slice->cis_io->ci_result = result; } +EXPORT_SYMBOL(osc_io_fsync_end); static int osc_io_ladvise_start(const struct lu_env *env, const struct cl_io_slice *slice) @@ -920,8 +962,7 @@ static void osc_io_ladvise_end(const struct lu_env *env, slice->cis_io->ci_result = result; } -static void osc_io_end(const struct lu_env *env, - const struct cl_io_slice *slice) 
+void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice) { struct osc_io *oio = cl2osc_io(env, slice); @@ -930,6 +971,7 @@ static void osc_io_end(const struct lu_env *env, oio->oi_active = NULL; } } +EXPORT_SYMBOL(osc_io_end); static const struct cl_io_operations osc_io_ops = { .op = { diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c index 6d53b5b80c580..dd956fd8532b2 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_lock.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_lock.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -37,32 +37,16 @@ #define DEBUG_SUBSYSTEM S_OSC -#include /* fid_build_reg_res_name() */ #include +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" /** \addtogroup osc * @{ */ -/***************************************************************************** - * - * Type conversions. - * - */ - -static const struct cl_lock_operations osc_lock_ops; -static const struct cl_lock_operations osc_lock_lockless_ops; -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force); - -int osc_lock_is_lockless(const struct osc_lock *olck) -{ - return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); -} - /** * Returns a weak pointer to the ldlm lock identified by a handle. Returned * pointer cannot be dereferenced, as lock is not protected from concurrent @@ -122,7 +106,7 @@ static int osc_lock_invariant(struct osc_lock *ols) if (! ergo(ols->ols_state == OLS_GRANTED, olock != NULL && - olock->l_req_mode == olock->l_granted_mode && + ldlm_is_granted(olock) && ols->ols_hold)) return 0; return 1; @@ -134,8 +118,7 @@ static int osc_lock_invariant(struct osc_lock *ols) * */ -static void osc_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) +void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) { struct osc_lock *ols = cl2osc_lock(slice); @@ -144,6 +127,7 @@ static void osc_lock_fini(const struct lu_env *env, OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); } +EXPORT_SYMBOL(osc_lock_fini); static void osc_lock_build_policy(const struct lu_env *env, const struct cl_lock *lock, @@ -155,44 +139,22 @@ static void osc_lock_build_policy(const struct lu_env *env, policy->l_extent.gid = d->cld_gid; } -static __u64 osc_enq2ldlm_flags(__u32 enqflags) -{ - __u64 result = 0; - - LASSERT((enqflags & ~CEF_MASK) == 0); - - if (enqflags & CEF_NONBLOCK) - result |= LDLM_FL_BLOCK_NOWAIT; - if (enqflags & CEF_ASYNC) - result |= LDLM_FL_HAS_INTENT; - if (enqflags & CEF_DISCARD_DATA) - result |= LDLM_FL_AST_DISCARD_DATA; - if (enqflags & CEF_PEEK) - result |= LDLM_FL_TEST_LOCK; - if (enqflags & CEF_LOCK_MATCH) - result |= LDLM_FL_MATCH_LOCK; - return result; -} - /** * Updates object attributes from a lock value block (lvb) received together * with the DLM lock reply from the server. Copy of osc_update_enqueue() * logic. * - * This can be optimized to not update attributes when lock is a result of a - * local match. - * * Called under lock and resource spin-locks. 
*/ -static void osc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb) +void osc_lock_lvb_update(const struct lu_env *env, + struct osc_object *osc, + struct ldlm_lock *dlmlock, + struct ost_lvb *lvb) { - struct cl_object *obj = osc2cl(osc); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned valid; + struct cl_object *obj = osc2cl(osc); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned valid, setkms = 0; ENTRY; @@ -217,19 +179,23 @@ static void osc_lock_lvb_update(const struct lu_env *env, if (size > dlmlock->l_policy_data.l_extent.end) size = dlmlock->l_policy_data.l_extent.end + 1; if (size >= oinfo->loi_kms) { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu" - ", kms=%llu", lvb->lvb_size, size); valid |= CAT_KMS; attr->cat_kms = size; - } else { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=" - "%llu; leaving kms=%llu, end=%llu", - lvb->lvb_size, oinfo->loi_kms, - dlmlock->l_policy_data.l_extent.end); + setkms = 1; } ldlm_lock_allow_match_locked(dlmlock); } + /* The size should not be less than the kms */ + if (attr->cat_size < oinfo->loi_kms) + attr->cat_size = oinfo->loi_kms; + + LDLM_DEBUG(dlmlock, "acquired size %llu, setting rss=%llu;%s " + "kms=%llu, end=%llu", lvb->lvb_size, attr->cat_size, + setkms ? "" : " leaving", + setkms ? attr->cat_kms : oinfo->loi_kms, + dlmlock ? dlmlock->l_policy_data.l_extent.end : -1ull); + cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); @@ -237,8 +203,9 @@ static void osc_lock_lvb_update(const struct lu_env *env, } static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, - struct lustre_handle *lockh, bool lvb_update) + struct lustre_handle *lockh) { + struct osc_object *osc = cl2osc(oscl->ols_cl.cls_obj); struct ldlm_lock *dlmlock; dlmlock = ldlm_handle2lock_long(lockh, 0); @@ -265,7 +232,7 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, /* Lock must have been granted. */ lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + if (ldlm_is_granted(dlmlock)) { struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; @@ -277,10 +244,11 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, descr->cld_gid = ext->gid; /* no lvb update for matched lock */ - if (lvb_update) { + if (!ldlm_is_lvb_cached(dlmlock)) { LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj), - dlmlock, NULL); + LASSERT(osc == dlmlock->l_ast_data); + osc_lock_lvb_update(env, osc, dlmlock, NULL); + ldlm_set_lvb_cached(dlmlock); } LINVRNT(osc_lock_invariant(oscl)); } @@ -320,7 +288,7 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, } if (rc == 0) - osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK); + osc_lock_granted(env, oscl, lockh); /* Error handling, some errors are tolerable. */ if (oscl->ols_locklessable && rc == -EUSERS) { @@ -328,7 +296,7 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, * lockless lock. */ osc_object_set_contended(cl2osc(slice->cls_obj)); - LASSERT(slice->cls_ops == &osc_lock_ops); + LASSERT(slice->cls_ops != oscl->ols_lockless_ops); /* Change this lock to ldlmlock-less lock. 
*/ osc_lock_to_lockless(env, oscl, 1); @@ -340,6 +308,8 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, NULL, &oscl->ols_lvb); /* Hide the error. */ rc = 0; + } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) { + rc = -EWOULDBLOCK; } if (oscl->ols_owner != NULL) @@ -349,8 +319,9 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, RETURN(rc); } -static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, - int errcode) +static int osc_lock_upcall_speculative(void *cookie, + struct lustre_handle *lockh, + int errcode) { struct osc_object *osc = cookie; struct ldlm_lock *dlmlock; @@ -371,9 +342,10 @@ static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, LASSERT(dlmlock != NULL); lock_res_and_lock(dlmlock); - LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + LASSERT(ldlm_is_granted(dlmlock)); - /* there is no osc_lock associated with AGL lock */ + /* there is no osc_lock associated with speculative locks + * thus no need to set LDLM_FL_LVB_CACHED */ osc_lock_lvb_update(env, osc, dlmlock, NULL); unlock_res_and_lock(dlmlock); @@ -409,7 +381,12 @@ static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, rc = 0; } - rc2 = osc_lock_discard_pages(env, obj, start, end, discard); + /* + * Do not try to match other locks with CLM_WRITE since we already + * know there're none + */ + rc2 = osc_lock_discard_pages(env, obj, start, end, + mode == CLM_WRITE || discard); if (rc == 0 && rc2 < 0) rc = rc2; @@ -434,7 +411,7 @@ static int osc_dlm_blocking_ast0(const struct lu_env *env, LASSERT(flag == LDLM_CB_CANCELING); lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { + if (!ldlm_is_granted(dlmlock)) { dlmlock->l_ast_data = NULL; unlock_res_and_lock(dlmlock); RETURN(0); @@ -574,13 +551,17 @@ static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, RETURN(result); } -static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) { struct ptlrpc_request *req = data; struct lu_env *env; struct ost_lvb *lvb; struct req_capsule *cap; struct cl_object *obj = NULL; + struct ldlm_resource *res = dlmlock->l_resource; + struct ldlm_match_data matchdata = { 0 }; + union ldlm_policy_data policy; + enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR; int result; __u16 refcheck; @@ -592,13 +573,40 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) if (IS_ERR(env)) GOTO(out, result = PTR_ERR(env)); + policy.l_extent.start = 0; + policy.l_extent.end = LUSTRE_EOF; - lock_res_and_lock(dlmlock); - if (dlmlock->l_ast_data != NULL) { - obj = osc2cl(dlmlock->l_ast_data); - cl_object_get(obj); + matchdata.lmd_mode = &mode; + matchdata.lmd_policy = &policy; + matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING; + matchdata.lmd_unref = 1; + matchdata.lmd_has_ast_data = true; + + LDLM_LOCK_GET(dlmlock); + + /* If any dlmlock has l_ast_data set, we must find it or we risk + * missing a size update done under a different lock. 
+ */ + while (dlmlock) { + lock_res_and_lock(dlmlock); + if (dlmlock->l_ast_data) { + obj = osc2cl(dlmlock->l_ast_data); + cl_object_get(obj); + } + unlock_res_and_lock(dlmlock); + LDLM_LOCK_RELEASE(dlmlock); + + dlmlock = NULL; + + if (obj == NULL && res->lr_type == LDLM_EXTENT) { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA)) + break; + + lock_res(res); + dlmlock = search_itree(res, &matchdata); + unlock_res(res); + } } - unlock_res_and_lock(dlmlock); if (obj != NULL) { /* Do not grab the mutex of cl_lock for glimpse. @@ -636,15 +644,15 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) req->rq_status = result; RETURN(result); } +EXPORT_SYMBOL(osc_ldlm_glimpse_ast); static int weigh_cb(const struct lu_env *env, struct cl_io *io, struct osc_page *ops, void *cbdata) { struct cl_page *page = ops->ops_cl.cpl_page; - if (cl_page_is_vmlocked(env, page) - || PageDirty(page->cp_vmpage) || PageWriteback(page->cp_vmpage) - ) + if (cl_page_is_vmlocked(env, page) || PageDirty(page->cp_vmpage) || + PageWriteback(page->cp_vmpage)) return CLP_GANG_ABORT; *(pgoff_t *)cbdata = osc_index(ops) + 1; @@ -653,12 +661,13 @@ static int weigh_cb(const struct lu_env *env, struct cl_io *io, static unsigned long osc_lock_weight(const struct lu_env *env, struct osc_object *oscobj, - struct ldlm_extent *extent) + loff_t start, loff_t end) { - struct cl_io *io = &osc_env_info(env)->oti_io; + struct cl_io *io = osc_env_thread_io(env); struct cl_object *obj = cl_object_top(&oscobj->oo_cl); - pgoff_t page_index; - int result; + pgoff_t page_index; + int result; + ENTRY; io->ci_obj = obj; @@ -667,11 +676,10 @@ static unsigned long osc_lock_weight(const struct lu_env *env, if (result != 0) RETURN(result); - page_index = cl_index(obj, extent->start); + page_index = cl_index(obj, start); do { result = osc_page_gang_lookup(env, io, oscobj, - page_index, - cl_index(obj, extent->end), + page_index, cl_index(obj, end), weigh_cb, (void *)&page_index); if (result == CLP_GANG_ABORT) break; @@ -688,12 +696,13 @@ static unsigned long osc_lock_weight(const struct lu_env *env, */ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) { - struct lu_env *env; - struct osc_object *obj; - struct osc_lock *oscl; - unsigned long weight; - bool found = false; - __u16 refcheck; + struct lu_env *env; + struct osc_object *obj; + struct osc_lock *oscl; + unsigned long weight; + bool found = false; + __u16 refcheck; + ENTRY; might_sleep(); @@ -709,7 +718,9 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) /* Mostly because lack of memory, do not eliminate this lock */ RETURN(1); - LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT || + dlmlock->l_resource->lr_type == LDLM_IBITS); + lock_res_and_lock(dlmlock); obj = dlmlock->l_ast_data; if (obj) @@ -721,9 +732,10 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) spin_lock(&obj->oo_ol_spin); list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { - if (oscl->ols_dlmlock != NULL && oscl->ols_dlmlock != dlmlock) - continue; - found = true; + if (oscl->ols_dlmlock == dlmlock) { + found = true; + break; + } } spin_unlock(&obj->oo_ol_spin); if (found) { @@ -733,7 +745,18 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) GOTO(out, weight = 1); } - weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent); + if (dlmlock->l_resource->lr_type == LDLM_EXTENT) + weight = osc_lock_weight(env, obj, + dlmlock->l_policy_data.l_extent.start, + 
dlmlock->l_policy_data.l_extent.end); + else if (ldlm_has_dom(dlmlock)) + weight = osc_lock_weight(env, obj, 0, OBD_OBJECT_EOF); + /* The DOM bit can be cancelled at any time; in that case, we know + * there are no pages, so just return weight of 0 + */ + else + weight = 0; + EXIT; out: @@ -743,6 +766,7 @@ unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) cl_env_put(env, &refcheck); return weight; } +EXPORT_SYMBOL(osc_ldlm_weigh_ast); static void osc_lock_build_einfo(const struct lu_env *env, const struct cl_lock *lock, @@ -769,46 +793,46 @@ static void osc_lock_build_einfo(const struct lu_env *env, * Additional policy can be implemented here, e.g., never do lockless-io * for large extents. */ -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force) +void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) { - struct cl_lock_slice *slice = &ols->ols_cl; - - LASSERT(ols->ols_state == OLS_NEW || - ols->ols_state == OLS_UPCALL_RECEIVED); - - if (force) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } else { - struct osc_io *oio = osc_env_io(env); - struct cl_io *io = oio->oi_cl.cis_io; - struct cl_object *obj = slice->cls_obj; - struct osc_object *oob = cl2osc(obj); - const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - struct obd_connect_data *ocd; - - LASSERT(io->ci_lockreq == CILR_MANDATORY || - io->ci_lockreq == CILR_MAYBE || - io->ci_lockreq == CILR_NEVER); - - ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; - ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && - (io->ci_lockreq == CILR_MAYBE) && - (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); - if (io->ci_lockreq == CILR_NEVER || - /* lockless IO */ - (ols->ols_locklessable && osc_object_is_contended(oob)) || - /* lockless truncate */ - (cl_io_is_trunc(io) && - (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && - osd->od_lockless_truncate)) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } - } - LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); + struct cl_lock_slice *slice = &ols->ols_cl; + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } else { + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & + OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && osd->od_lockless_truncate && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK))) { + ols->ols_locklessable = 1; + slice->cls_ops = ols->ols_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); } +EXPORT_SYMBOL(osc_lock_to_lockless); static bool osc_lock_compatible(const struct osc_lock *qing, const struct osc_lock *qed) @@ -816,7 +840,7 @@ static bool osc_lock_compatible(const struct osc_lock *qing, struct cl_lock_descr *qed_descr 
= &qed->ols_cl.cls_lock->cll_descr; struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; - if (qed->ols_glimpse) + if (qed->ols_glimpse || qed->ols_speculative) return true; if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) @@ -833,9 +857,8 @@ static bool osc_lock_compatible(const struct osc_lock *qing, return false; } -static void osc_lock_wake_waiters(const struct lu_env *env, - struct osc_object *osc, - struct osc_lock *oscl) +void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc, + struct osc_lock *oscl) { spin_lock(&osc->oo_ol_spin); list_del_init(&oscl->ols_nextlock_oscobj); @@ -853,14 +876,16 @@ static void osc_lock_wake_waiters(const struct lu_env *env, } spin_unlock(&oscl->ols_lock); } +EXPORT_SYMBOL(osc_lock_wake_waiters); -static int osc_lock_enqueue_wait(const struct lu_env *env, - struct osc_object *obj, struct osc_lock *oscl) +int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj, + struct osc_lock *oscl) { struct osc_lock *tmp_oscl; struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; int rc = 0; + ENTRY; spin_lock(&obj->oo_ol_spin); @@ -911,6 +936,7 @@ static int osc_lock_enqueue_wait(const struct lu_env *env, RETURN(rc); } +EXPORT_SYMBOL(osc_lock_enqueue_wait); /** * Implementation of cl_lock_operations::clo_enqueue() method for osc @@ -934,6 +960,7 @@ static int osc_lock_enqueue(const struct lu_env *env, struct osc_io *oio = osc_env_io(env); struct osc_object *osc = cl2osc(slice->cls_obj); struct osc_lock *oscl = cl2osc_lock(slice); + struct obd_export *exp = osc_export(osc); struct cl_lock *lock = slice->cls_lock; struct ldlm_res_id *resname = &info->oti_resname; union ldlm_policy_data *policy = &info->oti_policy; @@ -950,11 +977,22 @@ static int osc_lock_enqueue(const struct lu_env *env, if (oscl->ols_state == OLS_GRANTED) RETURN(0); + if ((oscl->ols_flags & LDLM_FL_NO_EXPANSION) && + !(exp_connect_lockahead_old(exp) || exp_connect_lockahead(exp))) { + result = -EOPNOTSUPP; + CERROR("%s: server does not support lockahead/locknoexpand:" + "rc = %d\n", exp->exp_obd->obd_name, result); + RETURN(result); + } + if (oscl->ols_flags & LDLM_FL_TEST_LOCK) GOTO(enqueue_base, 0); - if (oscl->ols_glimpse) { - LASSERT(equi(oscl->ols_agl, anchor == NULL)); + /* For glimpse and/or speculative locks, do not wait for reply from + * server on LDLM request */ + if (oscl->ols_glimpse || oscl->ols_speculative) { + /* Speculative and glimpse locks do not have an anchor */ + LASSERT(equi(oscl->ols_speculative, anchor == NULL)); async = true; GOTO(enqueue_base, 0); } @@ -980,25 +1018,30 @@ static int osc_lock_enqueue(const struct lu_env *env, /** * DLM lock's ast data must be osc_object; - * if glimpse or AGL lock, async of osc_enqueue_base() must be true, + * if glimpse or speculative lock, async of osc_enqueue_base() + * must be true + * + * For non-speculative locks: * DLM's enqueue callback set to osc_lock_upcall() with cookie as * osc_lock. 
+ * For speculative locks: + * osc_lock_upcall_speculative & cookie is the osc object, since + * there is no osc_lock */ ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); osc_lock_build_policy(env, lock, policy); - if (oscl->ols_agl) { + if (oscl->ols_speculative) { oscl->ols_einfo.ei_cbdata = NULL; /* hold a reference for callback */ cl_object_get(osc2cl(osc)); - upcall = osc_lock_upcall_agl; + upcall = osc_lock_upcall_speculative; cookie = osc; } - result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags, + result = osc_enqueue_base(exp, resname, &oscl->ols_flags, policy, &oscl->ols_lvb, - osc->oo_oinfo->loi_kms_valid, upcall, cookie, &oscl->ols_einfo, PTLRPCD_SET, async, - oscl->ols_agl); + oscl->ols_speculative); if (result == 0) { if (osc_lock_is_lockless(oscl)) { oio->oi_lockless = 1; @@ -1007,9 +1050,12 @@ static int osc_lock_enqueue(const struct lu_env *env, LASSERT(oscl->ols_hold); LASSERT(oscl->ols_dlmlock != NULL); } - } else if (oscl->ols_agl) { + } else if (oscl->ols_speculative) { cl_object_put(env, osc2cl(osc)); - result = 0; + if (oscl->ols_glimpse) { + /* hide error for AGL request */ + result = 0; + } } out: @@ -1067,8 +1113,8 @@ static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) * * - cancels ldlm lock (ldlm_cli_cancel()). */ -static void osc_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) +void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) { struct osc_object *obj = cl2osc(slice->cls_obj); struct osc_lock *oscl = cl2osc_lock(slice); @@ -1084,9 +1130,10 @@ static void osc_lock_cancel(const struct lu_env *env, osc_lock_wake_waiters(env, obj, oscl); EXIT; } +EXPORT_SYMBOL(osc_lock_cancel); -static int osc_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) +int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) { struct osc_lock *lock = cl2osc_lock(slice); @@ -1096,6 +1143,7 @@ static int osc_lock_print(const struct lu_env *env, void *cookie, osc_lvb_print(env, cookie, p, &lock->ols_lvb); return 0; } +EXPORT_SYMBOL(osc_lock_print); static const struct cl_lock_operations osc_lock_ops = { .clo_fini = osc_lock_fini, @@ -1129,9 +1177,8 @@ static const struct cl_lock_operations osc_lock_lockless_ops = { .clo_print = osc_lock_print }; -static void osc_lock_set_writer(const struct lu_env *env, - const struct cl_io *io, - struct cl_object *obj, struct osc_lock *oscl) +void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, + struct cl_object *obj, struct osc_lock *oscl) { struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; pgoff_t io_start; @@ -1141,9 +1188,9 @@ static void osc_lock_set_writer(const struct lu_env *env, return; if (likely(io->ci_type == CIT_WRITE)) { - io_start = cl_index(obj, io->u.ci_rw.rw_range.cir_pos); - io_end = cl_index(obj, io->u.ci_rw.rw_range.cir_pos + - io->u.ci_rw.rw_range.cir_count - 1); + io_start = cl_index(obj, io->u.ci_rw.crw_pos); + io_end = cl_index(obj, io->u.ci_rw.crw_pos + + io->u.ci_rw.crw_count - 1); } else { LASSERT(cl_io_is_mkwrite(io)); io_start = io_end = io->u.ci_fault.ft_index; @@ -1159,6 +1206,7 @@ static void osc_lock_set_writer(const struct lu_env *env, oio->oi_write_osclock = oscl; } } +EXPORT_SYMBOL(osc_lock_set_writer); int osc_lock_init(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, @@ -1176,15 +1224,23 @@ int osc_lock_init(const struct lu_env *env, 
INIT_LIST_HEAD(&oscl->ols_waiting_list); INIT_LIST_HEAD(&oscl->ols_wait_entry); INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); + oscl->ols_lockless_ops = &osc_lock_lockless_ops; + + /* Speculative lock requests must be either no_expand or glimpse + * request (CEF_GLIMPSE). non-glimpse no_expand speculative extent + * locks will break ofd_intent_cb. (see comment there)*/ + LASSERT(ergo((enqflags & CEF_SPECULATIVE) != 0, + (enqflags & (CEF_LOCK_NO_EXPAND | CEF_GLIMPSE)) != 0)); oscl->ols_flags = osc_enq2ldlm_flags(enqflags); - oscl->ols_agl = !!(enqflags & CEF_AGL); - if (oscl->ols_agl) - oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT; + oscl->ols_speculative = !!(enqflags & CEF_SPECULATIVE); + if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; oscl->ols_glimpse = 1; } + if (io->ci_ndelay && cl_object_same(io->ci_obj, obj)) + oscl->ols_flags |= LDLM_FL_NDELAY; osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); @@ -1208,9 +1264,10 @@ int osc_lock_init(const struct lu_env *env, * Finds an existing lock covering given index and optionally different from a * given \a except lock. */ -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags dap_flags) +struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, + struct osc_object *obj, + pgoff_t index, + enum osc_dap_flags dap_flags) { struct osc_thread_info *info = osc_env_info(env); struct ldlm_res_id *resname = &info->oti_resname; @@ -1234,9 +1291,9 @@ struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, * with a uniq gid and it conflicts with all other lock modes too */ again: - mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, - LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh, - dap_flags & OSC_DAP_FL_CANCELING); + mode = osc_match_base(env, osc_export(obj), resname, LDLM_EXTENT, + policy, LCK_PR | LCK_PW | LCK_GROUP, &flags, + obj, &lockh, dap_flags & OSC_DAP_FL_CANCELING); if (mode != 0) { lock = ldlm_handle2lock(&lockh); /* RACE: the lock is cancelled so let's try again */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_object.c b/drivers/staging/lustrefsx/lustre/osc/osc_object.c index 052f8bc90525c..a99747cecf011 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_object.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_object.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,8 +36,9 @@ */ #define DEBUG_SUBSYSTEM S_OSC +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" /** \addtogroup osc * @{ @@ -45,34 +46,27 @@ /***************************************************************************** * - * Type conversions. + * Object operations. * */ - -static struct lu_object *osc2lu(struct osc_object *osc) -{ - return &osc->oo_cl.co_lu; -} - -static struct osc_object *lu2osc(const struct lu_object *obj) +static void osc_obj_build_res_name(struct osc_object *osc, + struct ldlm_res_id *resname) { - LINVRNT(osc_is_object(obj)); - return container_of0(obj, struct osc_object, oo_cl.co_lu); + ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); } -/***************************************************************************** - * - * Object operations. 
- * - */ +static const struct osc_object_operations osc_object_ops = { + .oto_build_res_name = osc_obj_build_res_name, + .oto_dlmlock_at_pgoff = osc_obj_dlmlock_at_pgoff, +}; -static int osc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) +int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) { struct osc_object *osc = lu2osc(obj); const struct cl_object_conf *cconf = lu2cl_conf(conf); - osc->oo_oinfo = cconf->u.coc_oinfo; + osc->oo_oinfo = cconf->u.coc_oinfo; #ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK mutex_init(&osc->oo_debug_mutex); #endif @@ -96,12 +90,15 @@ static int osc_object_init(const struct lu_env *env, struct lu_object *obj, atomic_set(&osc->oo_nr_ios, 0); init_waitqueue_head(&osc->oo_io_waitq); + LASSERT(osc->oo_obj_ops != NULL); + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); return 0; } +EXPORT_SYMBOL(osc_object_init); -static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +void osc_object_free(const struct lu_env *env, struct lu_object *obj) { struct osc_object *osc = lu2osc(obj); @@ -123,22 +120,24 @@ static void osc_object_free(const struct lu_env *env, struct lu_object *obj) lu_object_fini(obj); OBD_SLAB_FREE_PTR(osc, osc_object_kmem); } +EXPORT_SYMBOL(osc_object_free); int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb) + lu_printer_t p, const struct ost_lvb *lvb) { return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu " "ctime: %llu blocks: %llu", lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); } +EXPORT_SYMBOL(osc_lvb_print); -static int osc_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) +int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) { - struct osc_object *osc = lu2osc(obj); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct osc_async_rc *ar = &oinfo->loi_ar; + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; (*p)(env, cookie, "id: "DOSTID" " "idx: %d gen: %d kms_valid: %u kms %llu " @@ -149,20 +148,22 @@ static int osc_object_print(const struct lu_env *env, void *cookie, osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); return 0; } +EXPORT_SYMBOL(osc_object_print); -static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) +int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) { - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - cl_lvb2attr(attr, &oinfo->loi_lvb); - attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; - return 0; + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; + return 0; } +EXPORT_SYMBOL(osc_attr_get); -static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) { struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; struct ost_lvb *lvb = &oinfo->loi_lvb; @@ -184,39 +185,66 @@ static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, } return 0; } +EXPORT_SYMBOL(osc_attr_update); -static int osc_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb) +int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj, + struct ost_lvb *lvb) { - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - ENTRY; - lvb->lvb_size = oinfo->loi_kms; - lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; - RETURN(0); + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + return 0; } +EXPORT_SYMBOL(osc_object_glimpse); static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) { + struct osc_object *osc = (struct osc_object *)data; + struct ost_lvb *lvb = lock->l_lvb_data; + struct lov_oinfo *oinfo; ENTRY; - if (lock->l_ast_data == data) + if (lock->l_ast_data == data) { lock->l_ast_data = NULL; + + LASSERT(osc != NULL); + LASSERT(osc->oo_oinfo != NULL); + LASSERT(lvb != NULL); + + /* Updates lvb in lock by the cached oinfo */ + oinfo = osc->oo_oinfo; + + LDLM_DEBUG(lock, "update lock size %llu blocks %llu [cma]time: " + "%llu %llu %llu by oinfo size %llu blocks %llu " + "[cma]time %llu %llu %llu", lvb->lvb_size, + lvb->lvb_blocks, lvb->lvb_ctime, lvb->lvb_mtime, + lvb->lvb_atime, oinfo->loi_lvb.lvb_size, + oinfo->loi_lvb.lvb_blocks, oinfo->loi_lvb.lvb_ctime, + oinfo->loi_lvb.lvb_mtime, oinfo->loi_lvb.lvb_atime); + LASSERT(oinfo->loi_lvb.lvb_size >= oinfo->loi_kms); + + cl_object_attr_lock(&osc->oo_cl); + memcpy(lvb, &oinfo->loi_lvb, sizeof(oinfo->loi_lvb)); + cl_object_attr_unlock(&osc->oo_cl); + ldlm_clear_lvb_cached(lock); + } RETURN(LDLM_ITER_CONTINUE); } -static int osc_object_prune(const struct lu_env *env, struct cl_object *obj) +int osc_object_prune(const struct lu_env *env, struct cl_object *obj) { - struct osc_object *osc = cl2osc(obj); - struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; + struct osc_object *osc = cl2osc(obj); + struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; /* DLM locks don't hold a reference of osc_object so we have to * clear it before the object is being destroyed. 
*/ - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); + osc_build_res_name(osc, resname); ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, osc_object_ast_clear, osc); return 0; } +EXPORT_SYMBOL(osc_object_prune); static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, struct ll_fiemap_info_key *fmkey, @@ -303,24 +331,11 @@ static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, RETURN(rc); } -void osc_object_set_contended(struct osc_object *obj) -{ - obj->oo_contention_time = cfs_time_current(); - /* mb(); */ - obj->oo_contended = 1; -} - -void osc_object_clear_contended(struct osc_object *obj) -{ - obj->oo_contended = 0; -} - int osc_object_is_contended(struct osc_object *obj) { - struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); - int osc_contention_time = dev->od_contention_time; - cfs_time_t cur_time = cfs_time_current(); - cfs_time_t retry_time; + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + time64_t osc_contention_time = dev->od_contention_time; + ktime_t retry_time; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) return 1; @@ -328,18 +343,19 @@ int osc_object_is_contended(struct osc_object *obj) if (!obj->oo_contended) return 0; - /* - * I like copy-paste. the code is copied from - * ll_file_is_contended. - */ - retry_time = cfs_time_add(obj->oo_contention_time, - cfs_time_seconds(osc_contention_time)); - if (cfs_time_after(cur_time, retry_time)) { - osc_object_clear_contended(obj); - return 0; - } - return 1; + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = ktime_add_ns(obj->oo_contention_time, + osc_contention_time * NSEC_PER_SEC); + if (ktime_after(ktime_get(), retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; } +EXPORT_SYMBOL(osc_object_is_contended); /** * Implementation of struct cl_object_operations::coo_req_attr_set() for osc @@ -452,6 +468,7 @@ struct lu_object *osc_object_alloc(const struct lu_env *env, lu_object_init(obj, NULL, dev); osc->oo_cl.co_ops = &osc_ops; obj->lo_ops = &osc_lu_obj_ops; + osc->oo_obj_ops = &osc_object_ops; } else obj = NULL; return obj; @@ -478,5 +495,5 @@ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) RETURN(0); } - +EXPORT_SYMBOL(osc_object_invalidate); /** @} osc */ diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_page.c b/drivers/staging/lustrefsx/lustre/osc/osc_page.c index c89d11333357d..a37c185772a00 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_page.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_page.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,8 +36,9 @@ */ #define DEBUG_SUBSYSTEM S_OSC +#include -#include "osc_cl_internal.h" +#include "osc_internal.h" static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); @@ -118,12 +119,12 @@ static const char *osc_list(struct list_head *head) return list_empty(head) ? 
"-" : "+"; } -static inline cfs_time_t osc_submit_duration(struct osc_page *opg) +static inline s64 osc_submit_duration(struct osc_page *opg) { - if (opg->ops_submit_time == 0) - return 0; + if (ktime_to_ns(opg->ops_submit_time) == 0) + return 0; - return (cfs_time_current() - opg->ops_submit_time); + return ktime_ms_delta(ktime_get(), opg->ops_submit_time); } static int osc_page_print(const struct lu_env *env, @@ -138,8 +139,8 @@ static int osc_page_print(const struct lu_env *env, return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p %lu: " "1< %#x %d %u %s %s > " "2< %lld %u %u %#x %#x | %p %p %p > " - "3< %d %lu %d > " - "4< %d %d %d %lu %s | %s %s %s %s > " + "3< %d %lld %d > " + "4< %d %d %d %lu %c | %s %s %s %s > " "5< %s %s %s %s | %d %s | %d %s %s>\n", opg, osc_index(opg), /* 1 */ @@ -158,7 +159,7 @@ static int osc_page_print(const struct lu_env *env, cli->cl_r_in_flight, cli->cl_w_in_flight, cli->cl_max_rpcs_in_flight, cli->cl_avail_grant, - osc_list(&cli->cl_cache_waiters), + waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-', osc_list(&cli->cl_loi_ready_list), osc_list(&cli->cl_loi_hp_ready_list), osc_list(&cli->cl_loi_write_list), @@ -254,12 +255,22 @@ static int osc_page_flush(const struct lu_env *env, RETURN(rc); } +static void osc_page_touch(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct cl_object *obj = opg->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, osc_index(opg), to); +} + static const struct cl_page_operations osc_page_ops = { .cpo_print = osc_page_print, .cpo_delete = osc_page_delete, .cpo_clip = osc_page_clip, .cpo_cancel = osc_page_cancel, - .cpo_flush = osc_page_flush + .cpo_flush = osc_page_flush, + .cpo_page_touch = osc_page_touch, }; int osc_page_init(const struct lu_env *env, struct cl_object *obj, @@ -307,6 +318,7 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj, return result; } +EXPORT_SYMBOL(osc_page_init); /** * Helper function called by osc_io_submit() for every page in an immediate @@ -315,6 +327,7 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj, void osc_page_submit(const struct lu_env *env, struct osc_page *opg, enum cl_req_type crt, int brw_flags) { + struct osc_io *oio = osc_env_io(env); struct osc_async_page *oap = &opg->ops_oap; LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " @@ -327,12 +340,12 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg, oap->oap_count = opg->ops_to - opg->ops_from; oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + if (oio->oi_cap_sys_resource) { oap->oap_brw_flags |= OBD_BRW_NOQUOTA; oap->oap_cmd |= OBD_BRW_NOQUOTA; } - opg->ops_submit_time = cfs_time_current(); + opg->ops_submit_time = ktime_get(); osc_page_transfer_get(opg, "transfer\0imm"); osc_page_transfer_add(env, opg, crt); } @@ -516,19 +529,22 @@ static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) static void discard_pagevec(const struct lu_env *env, struct cl_io *io, struct cl_page **pvec, int max_index) { - int i; + struct pagevec *pagevec = &osc_env_info(env)->oti_pagevec; + int i; - for (i = 0; i < max_index; i++) { - struct cl_page *page = pvec[i]; + ll_pagevec_init(pagevec, 0); + for (i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; LASSERT(cl_page_is_owned(page, io)); cl_page_delete(env, page); cl_page_discard(env, io, page); cl_page_disown(env, io, page); - cl_page_put(env, page); + cl_pagevec_put(env, page, pagevec); 
- pvec[i] = NULL; - } + pvec[i] = NULL; + } + pagevec_release(pagevec); } /** @@ -588,7 +604,7 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; - io = &osc_env_info(env)->oti_io; + io = osc_env_thread_io(env); spin_lock(&cli->cl_lru_list_lock); if (force) @@ -690,6 +706,7 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } RETURN(count > 0 ? count : rc); } +EXPORT_SYMBOL(osc_lru_shrink); /** * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least @@ -782,6 +799,7 @@ static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); struct osc_io *oio = osc_env_io(env); int rc = 0; + ENTRY; if (cli->cl_cache == NULL) /* shall not be in LRU */ @@ -887,17 +905,27 @@ void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) #endif static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa, int factor) { - int page_count = desc->bd_iov_count; + int page_count; void *zone = NULL; int count = 0; int i; - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + if (desc != NULL) { + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + page_count = desc->bd_iov_count; + } else { + page_count = aa->aa_page_count; + } for (i = 0; i < page_count; i++) { - void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); + void *pz; + if (desc) + pz = page_zone(BD_GET_KIOV(desc, i).kiov_page); + else + pz = page_zone(aa->aa_ppga[i]->pg); if (likely(pz == zone)) { ++count; @@ -916,14 +944,16 @@ static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, mod_zone_page_state(zone, NR_WRITEBACK, factor * count); } -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) { - unstable_page_accounting(desc, 1); + unstable_page_accounting(desc, aa, 1); } -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + struct osc_brw_async_args *aa) { - unstable_page_accounting(desc, -1); + unstable_page_accounting(desc, aa, -1); } /** @@ -940,12 +970,19 @@ static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) void osc_dec_unstable_pages(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - int page_count = desc->bd_iov_count; + int page_count; long unstable_count; + if (desc) + page_count = desc->bd_iov_count; + else + page_count = aa->aa_page_count; + LASSERT(page_count >= 0); - dec_unstable_page_accounting(desc); + + dec_unstable_page_accounting(desc, aa); unstable_count = atomic_long_sub_return(page_count, &cli->cl_unstable_count); @@ -967,14 +1004,20 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req) void osc_inc_unstable_pages(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - long page_count = desc->bd_iov_count; + long page_count; /* No unstable page tracking */ if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check) return; - add_unstable_page_accounting(desc); + if (desc) + page_count = 
desc->bd_iov_count; + else + page_count = aa->aa_page_count; + + add_unstable_page_accounting(desc, aa); atomic_long_add(page_count, &cli->cl_unstable_count); atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c index 7dcbbd79a5de0..a0aaae784515a 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_quota.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_quota.c @@ -23,12 +23,14 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * * Code originally extracted from quota directory */ #include +#include + #include "osc_internal.h" static inline struct osc_quota_info *osc_oqi_alloc(u32 id) @@ -94,7 +96,7 @@ static inline u32 fl_quota_flag(int qtype) } } -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], +int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], u64 valid, u32 flags) { int type; @@ -105,6 +107,17 @@ int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], if ((valid & (OBD_MD_FLALLQUOTA)) == 0) RETURN(0); + mutex_lock(&cli->cl_quota_mutex); + /* still mark the quots is running out for the old request, because it + * could be processed after the new request at OST, the side effect is + * the following request will be processed synchronously, but it will + * not break the quota enforcement. */ + if (cli->cl_quota_last_xid > xid && !(flags & OBD_FL_NO_QUOTA_ALL)) + GOTO(out_unlock, rc); + + if (cli->cl_quota_last_xid < xid) + cli->cl_quota_last_xid = xid; + for (type = 0; type < LL_MAXQUOTAS; type++) { struct osc_quota_info *oqi; @@ -151,6 +164,8 @@ int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], } } +out_unlock: + mutex_unlock(&cli->cl_quota_mutex); RETURN(rc); } @@ -230,6 +245,8 @@ int osc_quota_setup(struct obd_device *obd) int i, type; ENTRY; + mutex_init(&cli->cl_quota_mutex); + for (type = 0; type < LL_MAXQUOTAS; type++) { cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", HASH_QUOTA_CUR_BITS, diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c index b50f4d6ee5019..80695d5805915 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_request.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
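 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The osc_quota_setdq() hunk above serializes quota-flag updates with
 * cl_quota_mutex and keeps cl_quota_last_xid as a high-water mark so
 * that a reply belonging to an older request cannot overwrite state
 * set by a newer one.  Reduced to its core (helper name hypothetical):
 *
 *     static bool quota_reply_is_stale(struct client_obd *cli, __u64 xid)
 *     {
 *             bool stale;
 *
 *             mutex_lock(&cli->cl_quota_mutex);
 *             stale = xid < cli->cl_quota_last_xid;
 *             if (!stale)
 *                     cli->cl_quota_last_xid = xid;
 *             mutex_unlock(&cli->cl_quota_mutex);
 *             return stale;
 *     }
 *
 * The real code additionally lets OBD_FL_NO_QUOTA_ALL replies through
 * even when they arrive out of order.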
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -32,24 +32,21 @@ #define DEBUG_SUBSYSTEM S_OSC -#include - -#include - +#include #include #include #include #include #include -#include +#include #include #include -#include +#include #include #include #include +#include -#include "osc_cl_internal.h" #include "osc_internal.h" atomic_t osc_pool_req_count; @@ -60,17 +57,8 @@ struct ptlrpc_request_pool *osc_rq_pool; static unsigned int osc_reqpool_mem_max = 5; module_param(osc_reqpool_mem_max, uint, 0444); -struct osc_brw_async_args { - struct obdo *aa_oa; - int aa_requested_nob; - int aa_nio_count; - u32 aa_page_count; - int aa_resends; - struct brw_page **aa_ppga; - struct client_obd *aa_cli; - struct list_head aa_oaps; - struct list_head aa_exts; -}; +static int osc_idle_timeout = 20; +module_param(osc_idle_timeout, uint, 0644); #define osc_grant_args osc_brw_async_args @@ -93,18 +81,6 @@ struct osc_ladvise_args { void *la_cookie; }; -struct osc_enqueue_args { - struct obd_export *oa_exp; - enum ldlm_type oa_type; - enum ldlm_mode oa_mode; - __u64 *oa_flags; - osc_enqueue_upcall_f oa_upcall; - void *oa_cookie; - struct ost_lvb *oa_lvb; - struct lustre_handle oa_lockh; - unsigned int oa_agl:1; -}; - static void osc_release_ppga(struct brw_page **ppga, size_t count); static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *data, int rc); @@ -410,31 +386,34 @@ static int osc_create(const struct lu_env *env, struct obd_export *exp, RETURN(rc); } -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) +int osc_punch_send(struct obd_export *exp, struct obdo *oa, + obd_enqueue_update_f upcall, void *cookie) { - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - struct ost_body *body; - int rc; - ENTRY; + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct obd_import *imp = class_exp2cliimp(exp); + struct ost_body *body; + int rc; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); - if (req == NULL) - RETURN(-ENOMEM); + ENTRY; - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); + req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc < 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + + lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa); ptlrpc_request_set_replen(req); @@ -444,13 +423,12 @@ int osc_punch_base(struct obd_export *exp, struct obdo *oa, sa->sa_oa = oa; sa->sa_upcall = upcall; sa->sa_cookie = cookie; - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); + + ptlrpcd_add_req(req); RETURN(0); } +EXPORT_SYMBOL(osc_punch_send); static int osc_sync_interpret(const struct lu_env *env, struct ptlrpc_request *req, @@ -673,21 +651,18 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_dirty = cli->cl_dirty_grant; else oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; - if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > - cli->cl_dirty_max_pages)) { - CERROR("dirty %lu 
- %lu > dirty_max %lu\n", - cli->cl_dirty_pages, cli->cl_dirty_transit, + if (unlikely(cli->cl_dirty_pages > cli->cl_dirty_max_pages)) { + CERROR("dirty %lu > dirty_max %lu\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages); oa->o_undirty = 0; - } else if (unlikely(atomic_long_read(&obd_dirty_pages) - - atomic_long_read(&obd_dirty_transit_pages) > + } else if (unlikely(atomic_long_read(&obd_dirty_pages) > (long)(obd_max_dirty_pages + 1))) { /* The atomic_read() allowing the atomic_inc() are * not covered by a lock thus they may safely race and trip * this CERROR() unless we add in a small fudge factor (+1). */ - CERROR("%s: dirty %ld - %ld > system dirty_max %ld\n", + CERROR("%s: dirty %ld > system dirty_max %ld\n", cli_name(cli), atomic_long_read(&obd_dirty_pages), - atomic_long_read(&obd_dirty_transit_pages), obd_max_dirty_pages); oa->o_undirty = 0; } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > @@ -716,23 +691,33 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, /* Do not ask for more than OBD_MAX_GRANT - a margin for server * to add extent tax, etc. */ - oa->o_undirty = min(undirty, OBD_MAX_GRANT - - (PTLRPC_MAX_BRW_PAGES << PAGE_SHIFT)*4UL); + oa->o_undirty = min(undirty, OBD_MAX_GRANT & + ~(PTLRPC_MAX_BRW_SIZE * 4UL)); } oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; - oa->o_dropped = cli->cl_lost_grant; - cli->cl_lost_grant = 0; + /* o_dropped AKA o_misc is 32 bits, but cl_lost_grant is 64 bits */ + if (cli->cl_lost_grant > INT_MAX) { + CDEBUG(D_CACHE, + "%s: avoided o_dropped overflow: cl_lost_grant %lu\n", + cli_name(cli), cli->cl_lost_grant); + oa->o_dropped = INT_MAX; + } else { + oa->o_dropped = cli->cl_lost_grant; + } + cli->cl_lost_grant -= oa->o_dropped; spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", - oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); + CDEBUG(D_CACHE, "%s: dirty: %llu undirty: %u dropped %u grant: %llu" + " cl_lost_grant %lu\n", cli_name(cli), oa->o_dirty, + oa->o_undirty, oa->o_dropped, oa->o_grant, cli->cl_lost_grant); } void osc_update_next_shrink(struct client_obd *cli) { - cli->cl_next_shrink_grant = - cfs_time_shift(cli->cl_grant_shrink_interval); - CDEBUG(D_CACHE, "next time %ld to shrink grant \n", - cli->cl_next_shrink_grant); + cli->cl_next_shrink_grant = ktime_get_seconds() + + cli->cl_grant_shrink_interval; + + CDEBUG(D_CACHE, "next time %lld to shrink grant\n", + cli->cl_next_shrink_grant); } static void __osc_update_grant(struct client_obd *cli, u64 grant) @@ -750,30 +735,36 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body) } } -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set); +/** + * grant thread data for shrinking space. 
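 * The structure is walked by a single delayed work item (see
 * osc_grant_work_handler() below), which replaces the old pinger-based
 * TIMEOUT_GRANT callback list.
 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The lifecycle of that work item follows the usual workqueue pattern;
 * "delay_ms" and "scan_clients_and_shrink" below are placeholders for
 * the computed time until the earliest cl_next_shrink_grant deadline
 * and for the walk over gtd_clients:
 *
 *     static struct delayed_work work;
 *
 *     static void handler(struct work_struct *data)
 *     {
 *             scan_clients_and_shrink();
 *             schedule_delayed_work(&work, msecs_to_jiffies(delay_ms));
 *     }
 *
 *     INIT_DELAYED_WORK(&work, handler);
 *     schedule_work(&work.work);
 *
 * Teardown is a single cancel_delayed_work_sync(&work); this is what
 * osc_start_grant_work() and osc_stop_grant_work() below do.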
+ */ +struct grant_thread_data { + struct list_head gtd_clients; + struct mutex gtd_mutex; + unsigned long gtd_stopped:1; +}; +static struct grant_thread_data client_gtd; static int osc_shrink_grant_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *aa, int rc) + struct ptlrpc_request *req, + void *aa, int rc) { - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; - struct ost_body *body; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; + struct ost_body *body; - if (rc != 0) { - __osc_update_grant(cli, oa->o_grant); - GOTO(out, rc); - } + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + GOTO(out, rc); + } - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - osc_update_grant(cli, body); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); out: - OBDO_FREE(oa); - return rc; + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); + oa = NULL; + return rc; } static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) @@ -833,6 +824,11 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) osc_announce_cached(cli, &body->oa, 0); spin_lock(&cli->cl_loi_list_lock); + if (target_bytes >= cli->cl_avail_grant) { + /* available grant has changed since target calculation */ + spin_unlock(&cli->cl_loi_list_lock); + GOTO(out_free, rc = 0); + } body->oa.o_grant = cli->cl_avail_grant - target_bytes; cli->cl_avail_grant = target_bytes; spin_unlock(&cli->cl_loi_list_lock); @@ -848,20 +844,25 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) sizeof(*body), body, NULL); if (rc != 0) __osc_update_grant(cli, body->oa.o_grant); +out_free: OBD_FREE_PTR(body); RETURN(rc); } static int osc_should_shrink_grant(struct client_obd *client) { - cfs_time_t time = cfs_time_current(); - cfs_time_t next_shrink = client->cl_next_shrink_grant; + time64_t next_shrink = client->cl_next_shrink_grant; - if ((client->cl_import->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_GRANT_SHRINK) == 0) - return 0; + if (client->cl_import == NULL) + return 0; - if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + if (!OCD_HAS_FLAG(&client->cl_import->imp_connect_data, GRANT_SHRINK) || + client->cl_import->imp_grant_shrink_disabled) { + osc_update_next_shrink(client); + return 0; + } + + if (ktime_get_seconds() >= next_shrink - 5) { /* Get the current RPC size directly, instead of going via: * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) * Keep comment here so that it can be found by searching. 
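 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The shrink deadline is now kept in wall-clock seconds (time64_t) and
 * fires up to five seconds early, replacing the old jiffies/CFS_TICK
 * arithmetic; the check above condenses to (helper name hypothetical):
 *
 *     static bool shrink_due(struct client_obd *cli)
 *     {
 *             return ktime_get_seconds() >= cli->cl_next_shrink_grant - 5;
 *     }
 *
 * Separately, osc_shrink_grant_to_target() above re-checks
 * target_bytes against cl_avail_grant after re-taking
 * cl_loi_list_lock, because the available grant may have changed
 * between computing the target and building the request.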
*/ @@ -876,41 +877,88 @@ static int osc_should_shrink_grant(struct client_obd *client) return 0; } -static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) +#define GRANT_SHRINK_RPC_BATCH 100 + +static struct delayed_work work; + +static void osc_grant_work_handler(struct work_struct *data) { - struct client_obd *client; + struct client_obd *cli; + int rpc_sent; + bool init_next_shrink = true; + time64_t next_shrink = ktime_get_seconds() + GRANT_SHRINK_INTERVAL; + + rpc_sent = 0; + mutex_lock(&client_gtd.gtd_mutex); + list_for_each_entry(cli, &client_gtd.gtd_clients, + cl_grant_chain) { + if (rpc_sent < GRANT_SHRINK_RPC_BATCH && + osc_should_shrink_grant(cli)) { + osc_shrink_grant(cli); + rpc_sent++; + } - list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { - if (osc_should_shrink_grant(client)) - osc_shrink_grant(client); + if (!init_next_shrink) { + if (cli->cl_next_shrink_grant < next_shrink && + cli->cl_next_shrink_grant > ktime_get_seconds()) + next_shrink = cli->cl_next_shrink_grant; + } else { + init_next_shrink = false; + next_shrink = cli->cl_next_shrink_grant; + } } - return 0; + mutex_unlock(&client_gtd.gtd_mutex); + + if (client_gtd.gtd_stopped == 1) + return; + + if (next_shrink > ktime_get_seconds()) + schedule_delayed_work(&work, msecs_to_jiffies( + (next_shrink - ktime_get_seconds()) * + MSEC_PER_SEC)); + else + schedule_work(&work.work); } -static int osc_add_shrink_grant(struct client_obd *client) +/** + * Start grant thread for returing grant to server for idle clients. + */ +static int osc_start_grant_work(void) { - int rc; + client_gtd.gtd_stopped = 0; + mutex_init(&client_gtd.gtd_mutex); + INIT_LIST_HEAD(&client_gtd.gtd_clients); + + INIT_DELAYED_WORK(&work, osc_grant_work_handler); + schedule_work(&work.work); - rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, - TIMEOUT_GRANT, - osc_grant_shrink_grant_cb, NULL, - &client->cl_grant_shrink_list); - if (rc) { - CERROR("add grant client %s error %d\n", cli_name(client), rc); - return rc; - } - CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client)); - osc_update_next_shrink(client); return 0; } -static int osc_del_shrink_grant(struct client_obd *client) +static void osc_stop_grant_work(void) +{ + client_gtd.gtd_stopped = 1; + cancel_delayed_work_sync(&work); +} + +static void osc_add_grant_list(struct client_obd *client) { - return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, - TIMEOUT_GRANT); + mutex_lock(&client_gtd.gtd_mutex); + list_add(&client->cl_grant_chain, &client_gtd.gtd_clients); + mutex_unlock(&client_gtd.gtd_mutex); } -static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +static void osc_del_grant_list(struct client_obd *client) +{ + if (list_empty(&client->cl_grant_chain)) + return; + + mutex_lock(&client_gtd.gtd_mutex); + list_del_init(&client->cl_grant_chain); + mutex_unlock(&client_gtd.gtd_mutex); +} + +void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) { /* * ocd_grant is the total grant amount we're expect to hold: if we've @@ -924,12 +972,19 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) spin_lock(&cli->cl_loi_list_lock); cli->cl_avail_grant = ocd->ocd_grant; if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { - cli->cl_avail_grant -= cli->cl_reserved_grant; + unsigned long consumed = cli->cl_reserved_grant; + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) - cli->cl_avail_grant -= cli->cl_dirty_grant; + consumed += cli->cl_dirty_grant; else - 
cli->cl_avail_grant -= - cli->cl_dirty_pages << PAGE_SHIFT; + consumed += cli->cl_dirty_pages << PAGE_SHIFT; + if (cli->cl_avail_grant < consumed) { + CERROR("%s: granted %ld but already consumed %ld\n", + cli_name(cli), cli->cl_avail_grant, consumed); + cli->cl_avail_grant = 0; + } else { + cli->cl_avail_grant -= consumed; + } } if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { @@ -963,10 +1018,10 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, cli->cl_max_extent_pages); - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && - list_empty(&cli->cl_grant_shrink_list)) - osc_add_shrink_grant(cli); + if (OCD_HAS_FLAG(ocd, GRANT_SHRINK) && list_empty(&cli->cl_grant_chain)) + osc_add_grant_list(cli); } +EXPORT_SYMBOL(osc_init_grant); /* We assume that the reason this OSC got a short read is because it read * beyond the end of a stripe file; i.e. lustre is reading a sparse file @@ -1033,8 +1088,8 @@ static int check_write_rcs(struct ptlrpc_request *req, return(-EPROTO); } } - - if (req->rq_bulk->bd_nob_transferred != requested_nob) { + if (req->rq_bulk != NULL && + req->rq_bulk->bd_nob_transferred != requested_nob) { CERROR("Unexpected # bytes transferred: %d (requested %d)\n", req->rq_bulk->bd_nob_transferred, requested_nob); return(-EPROTO); @@ -1046,9 +1101,9 @@ static int check_write_rcs(struct ptlrpc_request *req, static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) { if (p1->flag != p2->flag) { - unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | - OBD_BRW_SYNC | OBD_BRW_ASYNC | - OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); + unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_SYNC | + OBD_BRW_ASYNC | OBD_BRW_NOQUOTA | + OBD_BRW_SOFT_SYNC); /* warn if we try to combine flags that we don't know to be * safe to combine */ @@ -1063,23 +1118,128 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) return (p1->off + p1->count == p2->off); } -static u32 osc_checksum_bulk(int nob, size_t pg_count, +#if IS_ENABLED(CONFIG_CRC_T10DIF) +static int osc_checksum_bulk_t10pi(const char *obd_name, int nob, + size_t pg_count, struct brw_page **pga, + int opc, obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum) +{ + struct ahash_request *req; + /* Used Adler as the default checksum type on top of DIF tags */ + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + struct page *__page; + unsigned char *buffer; + __u16 *guard_start; + unsigned int bufsize; + int guard_number; + int used_number = 0; + int used; + u32 cksum; + int rc = 0; + int i = 0; + + LASSERT(pg_count > 0); + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("%s: unable to initialize checksum hash %s: rc = %d\n", + obd_name, cfs_crypto_hash_name(cfs_alg), rc); + GOTO(out, rc); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + while (nob > 0 && pg_count > 0) { + unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~PAGE_MASK; + + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); + kunmap(pga[i]->pg); + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + rc = obd_page_dif_generate_buffer(obd_name, pga[i]->pg, + pga[i]->off & ~PAGE_MASK, + count, + guard_start + used_number, + guard_number - used_number, + &used, sector_size, + fn); + if (rc) + break; + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + nob -= pga[i]->count; + pg_count--; + i++; + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} +#else /* !CONFIG_CRC_T10DIF */ +#define obd_dif_ip_fn NULL +#define obd_dif_crc_fn NULL +#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum) \ + -EOPNOTSUPP +#endif /* CONFIG_CRC_T10DIF */ + +static int osc_checksum_bulk(int nob, size_t pg_count, struct brw_page **pga, int opc, - cksum_type_t cksum_type) + enum cksum_types cksum_type, + u32 *cksum) { - u32 cksum; int i = 0; - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; unsigned int bufsize; unsigned char cfs_alg = cksum_obd2cfs(cksum_type); LASSERT(pg_count > 0); - hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(hdesc)) { + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { CERROR("Unable to initialize checksum hash %s\n", cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(hdesc); + return PTR_ERR(req); } while (nob > 0 && pg_count > 0) { @@ -1095,7 +1255,7 @@ static u32 osc_checksum_bulk(int nob, size_t pg_count, memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); kunmap(pga[i]->pg); } - cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + cfs_crypto_hash_update_page(req, pga[i]->pg, pga[i]->off & ~PAGE_MASK, count); LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", @@ -1106,15 +1266,38 @@ static u32 osc_checksum_bulk(int nob, size_t pg_count, i++; } - bufsize = sizeof(cksum); - cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + bufsize = sizeof(*cksum); + cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); /* For sending we only compute the wrong checksum instead * of corrupting the data so it is still correct on a redo */ if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - cksum++; + (*cksum)++; - return cksum; + return 0; +} + +static int osc_checksum_bulk_rw(const char *obd_name, + enum cksum_types cksum_type, + int nob, size_t pg_count, + struct brw_page **pga, int opc, + u32 *check_sum) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga, + opc, fn, sector_size, check_sum); + else + rc = 
osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type, + check_sum); + + RETURN(rc); } static int @@ -1127,10 +1310,12 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, struct ost_body *body; struct obd_ioobj *ioobj; struct niobuf_remote *niobuf; - int niocount, i, requested_nob, opc, rc; + int niocount, i, requested_nob, opc, rc, short_io_size = 0; struct osc_brw_async_args *aa; struct req_capsule *pill; struct brw_page *pg_prev; + void *short_io_buf; + const char *obd_name = cli->cl_import->imp_obd->obd_name; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) @@ -1161,17 +1346,38 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, niocount * sizeof(*niobuf)); + for (i = 0; i < page_count; i++) + short_io_size += pga[i]->count; + + /* Check if read/write is small enough to be a short io. */ + if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 || + !imp_connect_shortio(cli->cl_import)) + short_io_size = 0; + + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT, + opc == OST_READ ? 0 : short_io_size); + if (opc == OST_READ) + req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER, + short_io_size); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); if (rc) { ptlrpc_request_free(req); RETURN(rc); } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); + osc_set_io_portal(req); + + ptlrpc_at_set_req_timeout(req); /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own * retry logic */ req->rq_no_retry_einprogress = 1; + if (short_io_size != 0) { + desc = NULL; + short_io_buf = NULL; + goto no_bulk; + } + desc = ptlrpc_prep_bulk_imp(req, page_count, cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : @@ -1183,7 +1389,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, if (desc == NULL) GOTO(out, rc = -ENOMEM); /* NB request now owns desc and will free it when it gets freed */ - +no_bulk: body = req_capsule_client_get(pill, &RMF_OST_BODY); ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); @@ -1191,6 +1397,15 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); + /* For READ and WRITE, we can't fill o_uid and o_gid using from_kuid() + * and from_kgid(), because they are asynchronous. Fortunately, variable + * oa contains valid o_uid and o_gid in these two operations. + * Besides, filling o_uid and o_gid is enough for nrs-tbf, see LU-9658. + * OBD_MD_FLUID and OBD_MD_FLUID is not set in order to avoid breaking + * other process logic */ + body->oa.o_uid = oa->o_uid; + body->oa.o_gid = oa->o_gid; + obdo_to_ioobj(oa, ioobj); ioobj->ioo_bufcnt = niocount; /* The high bits of ioo_max_brw tells server _maximum_ number of bulks @@ -1198,7 +1413,26 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, * when the RPC is finally sent in ptlrpc_register_bulk(). It sends * "max - 1" for old client compatibility sending "0", and also so the * the actual maximum is a power-of-two number, not one less. 
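 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * Earlier in this hunk the request is switched to "short I/O" when the
 * payload is small: the data travels inline in the RMF_SHORT_IO buffer
 * and no bulk descriptor is set up.  The decision condenses to the
 * following (helper name hypothetical):
 *
 *     static bool use_short_io(struct client_obd *cli, unsigned int nob,
 *                              int niocount)
 *     {
 *             return nob <= cli->cl_max_short_io_bytes && niocount <= 1 &&
 *                    imp_connect_shortio(cli->cl_import);
 *     }
 *
 * Everything else in the function keys off short_io_size being zero or
 * not: zero means the traditional ptlrpc_prep_bulk_imp() path is used.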
LU-1431 */ - ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + if (desc != NULL) + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + else /* short io */ + ioobj_max_brw_set(ioobj, 0); + + if (short_io_size != 0) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHORT_IO; + CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n", + short_io_size); + if (opc == OST_WRITE) { + short_io_buf = req_capsule_client_get(pill, + &RMF_SHORT_IO); + LASSERT(short_io_buf != NULL); + } + } + LASSERT(page_count > 0); pg_prev = pga[0]; for (requested_nob = i = 0; i < page_count; i++, niobuf++) { @@ -1223,9 +1457,19 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, pg_prev->pg->index, pg_prev->off); LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == (pg->flag & OBD_BRW_SRVLOCK)); - - desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count); - requested_nob += pg->count; + if (short_io_size != 0 && opc == OST_WRITE) { + unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0); + + LASSERT(short_io_size >= requested_nob + pg->count); + memcpy(short_io_buf + requested_nob, + ptr + poff, + pg->count); + ll_kunmap_atomic(ptr, KM_USER0); + } else if (short_io_size == 0) { + desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, + pg->count); + } + requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { niobuf--; @@ -1261,22 +1505,31 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { /* store cl_cksum_type in a local variable since * it can be changed via lprocfs */ - cksum_type_t cksum_type = cli->cl_cksum_type; + enum cksum_types cksum_type = cli->cl_cksum_type; if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; - body->oa.o_flags |= cksum_type_pack(cksum_type); + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - body->oa.o_cksum = osc_checksum_bulk(requested_nob, - page_count, pga, - OST_WRITE, - cksum_type); + + rc = osc_checksum_bulk_rw(obd_name, cksum_type, + requested_nob, page_count, + pga, OST_WRITE, + &body->oa.o_cksum); + if (rc < 0) { + CDEBUG(D_PAGE, "failed to checksum, rc = %d\n", + rc); + GOTO(out, rc); + } CDEBUG(D_PAGE, "checksum at write origin: %x\n", body->oa.o_cksum); + /* save this in 'oa', too, for later checking */ oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - oa->o_flags |= cksum_type_pack(cksum_type); + oa->o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); } else { /* clear out the checksum flag, in case this is a * resend but cl_checksum is no longer set. 
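 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * For a short-I/O write (see the per-page loop above), the page
 * payload is copied straight into the request's RMF_SHORT_IO buffer
 * instead of being attached to a bulk descriptor.  Stripped of the
 * Lustre compatibility wrappers (ll_kmap_atomic/KM_USER0), one page
 * is copied like this (helper name hypothetical):
 *
 *     static void short_io_copy_in(char *dst, size_t dst_off,
 *                                  struct brw_page *pg)
 *     {
 *             char *ptr = kmap_atomic(pg->pg);
 *
 *             memcpy(dst + dst_off, ptr + (pg->off & ~PAGE_MASK),
 *                    pg->count);
 *             kunmap_atomic(ptr);
 *     }
 *
 * dst is the RMF_SHORT_IO capsule buffer and dst_off the running
 * requested_nob offset.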
b=11238 */ @@ -1291,26 +1544,27 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; - body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); + body->oa.o_flags |= obd_cksum_type_pack(obd_name, + cli->cl_cksum_type); body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - } + } /* Client cksum has been already copied to wire obdo in previous * lustre_set_wire_obdo(), and in the case a bulk-read is being * resent due to cksum error, this will allow Server to * check+dump pages on its side */ } - ptlrpc_request_set_replen(req); + ptlrpc_request_set_replen(req); - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oa = oa; - aa->aa_requested_nob = requested_nob; - aa->aa_nio_count = niocount; - aa->aa_page_count = page_count; - aa->aa_resends = 0; - aa->aa_ppga = pga; - aa->aa_cli = cli; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; INIT_LIST_HEAD(&aa->aa_oaps); *reqp = req; @@ -1389,13 +1643,17 @@ static void dump_all_bulk_pages(struct obdo *oa, __u32 page_count, } static int -check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, - __u32 client_cksum, __u32 server_cksum, - struct osc_brw_async_args *aa) -{ - __u32 new_cksum; - char *msg; - cksum_type_t cksum_type; +check_write_checksum(struct obdo *oa, const struct lnet_process_id *peer, + __u32 client_cksum, __u32 server_cksum, + struct osc_brw_async_args *aa) +{ + const char *obd_name = aa->aa_cli->cl_import->imp_obd->obd_name; + enum cksum_types cksum_type; + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + __u32 new_cksum; + char *msg; + int rc; if (server_cksum == client_cksum) { CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); @@ -1406,12 +1664,43 @@ check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, dump_all_bulk_pages(oa, aa->aa_page_count, aa->aa_ppga, server_cksum, client_cksum); - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); - new_cksum = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, - aa->aa_ppga, OST_WRITE, cksum_type); + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
+ oa->o_flags : 0); + + switch (cksum_type) { + case OBD_CKSUM_T10IP512: + fn = obd_dif_ip_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10IP4K: + fn = obd_dif_ip_fn; + sector_size = 4096; + break; + case OBD_CKSUM_T10CRC512: + fn = obd_dif_crc_fn; + sector_size = 512; + break; + case OBD_CKSUM_T10CRC4K: + fn = obd_dif_crc_fn; + sector_size = 4096; + break; + default: + break; + } + + if (fn) + rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + OST_WRITE, fn, sector_size, + &new_cksum); + else + rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count, + aa->aa_ppga, OST_WRITE, cksum_type, + &new_cksum); - if (cksum_type != cksum_type_unpack(aa->aa_oa->o_flags)) + if (rc < 0) + msg = "failed to calculate the client write checksum"; + else if (cksum_type != obd_cksum_type_unpack(aa->aa_oa->o_flags)) msg = "the server did not use the checksum type specified in " "the original request - likely a protocol problem"; else if (new_cksum == server_cksum) @@ -1427,15 +1716,15 @@ check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, DFID " object "DOSTID" extent [%llu-%llu], original " "client csum %x (type %x), server csum %x (type %x)," " client csum now %x\n", - aa->aa_cli->cl_import->imp_obd->obd_name, - msg, libcfs_nid2str(peer->nid), + obd_name, msg, libcfs_nid2str(peer->nid), oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, POSTID(&oa->o_oi), aa->aa_ppga[0]->off, aa->aa_ppga[aa->aa_page_count - 1]->off + aa->aa_ppga[aa->aa_page_count-1]->count - 1, - client_cksum, cksum_type_unpack(aa->aa_oa->o_flags), + client_cksum, + obd_cksum_type_unpack(aa->aa_oa->o_flags), server_cksum, cksum_type, new_cksum); return 1; } @@ -1443,11 +1732,12 @@ check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, /* Note rc enters this function as number of bytes transferred */ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) { - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + struct client_obd *cli = aa->aa_cli; + const char *obd_name = cli->cl_import->imp_obd->obd_name; const struct lnet_process_id *peer = - &req->rq_import->imp_connection->c_peer; - struct client_obd *cli = aa->aa_cli; - struct ost_body *body; + &req->rq_import->imp_connection->c_peer; + struct ost_body *body; u32 client_cksum = 0; ENTRY; @@ -1472,7 +1762,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) CDEBUG(D_QUOTA, "setdq for [%u %u %u] with valid %#llx, flags %x\n", body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, body->oa.o_valid, body->oa.o_flags); - osc_quota_setdq(cli, qid, body->oa.o_valid, + osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, body->oa.o_flags); } @@ -1489,9 +1779,9 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) CERROR("Unexpected +ve rc %d\n", rc); RETURN(-EPROTO); } - LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); - if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + if (req->rq_bulk != NULL && + sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) RETURN(-EAGAIN); if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && @@ -1506,8 +1796,14 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) /* The rest of this function executes only for OST_READs */ - /* if unwrap_bulk failed, return -EAGAIN to retry */ - rc = sptlrpc_cli_unwrap_bulk_read(req, 
req->rq_bulk, rc); + if (req->rq_bulk == NULL) { + rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO, + RCL_SERVER); + LASSERT(rc == req->rq_status); + } else { + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + } if (rc < 0) GOTO(out, rc = -EAGAIN); @@ -1517,12 +1813,41 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) RETURN(-EPROTO); } - if (rc != req->rq_bulk->bd_nob_transferred) { + if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) { CERROR ("Unexpected rc %d (%d transferred)\n", rc, req->rq_bulk->bd_nob_transferred); return (-EPROTO); } + if (req->rq_bulk == NULL) { + /* short io */ + int nob, pg_count, i = 0; + unsigned char *buf; + + CDEBUG(D_CACHE, "Using short io read, size %d\n", rc); + pg_count = aa->aa_page_count; + buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO, + rc); + nob = rc; + while (nob > 0 && pg_count > 0) { + unsigned char *ptr; + int count = aa->aa_ppga[i]->count > nob ? + nob : aa->aa_ppga[i]->count; + + CDEBUG(D_CACHE, "page %p count %d\n", + aa->aa_ppga[i]->pg, count); + ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0); + memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf, + count); + ll_kunmap_atomic((void *) ptr, KM_USER0); + + buf += count; + nob -= count; + i++; + pg_count--; + } + } + if (rc < aa->aa_requested_nob) handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); @@ -1531,15 +1856,19 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) u32 server_cksum = body->oa.o_cksum; char *via = ""; char *router = ""; - cksum_type_t cksum_type; - - cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? - body->oa.o_flags : 0); - client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, - aa->aa_ppga, OST_READ, - cksum_type); - - if (peer->nid != req->rq_bulk->bd_sender) { + enum cksum_types cksum_type; + u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ? + body->oa.o_flags : 0; + + cksum_type = obd_cksum_type_unpack(o_flags); + rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc, + aa->aa_page_count, aa->aa_ppga, + OST_READ, &client_cksum); + if (rc < 0) + GOTO(out, rc); + + if (req->rq_bulk != NULL && + peer->nid != req->rq_bulk->bd_sender) { via = " via "; router = libcfs_nid2str(req->rq_bulk->bd_sender); } @@ -1559,7 +1888,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) "%s%s%s inode "DFID" object "DOSTID " extent [%llu-%llu], client %x, " "server %x, cksum_type %x\n", - req->rq_import->imp_obd->obd_name, + obd_name, libcfs_nid2str(peer->nid), via, router, clbody->oa.o_valid & OBD_MD_FLFID ? @@ -1713,13 +2042,14 @@ static int brw_interpret(const struct lu_env *env, struct osc_extent *ext; struct osc_extent *tmp; struct client_obd *cli = aa->aa_cli; + unsigned long transferred = 0; ENTRY; rc = osc_brw_fini_request(req, rc); CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); /* When server return -EINPROGRESS, client should always retry * regardless of the number of times the bulk was resent already. 
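 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The corresponding read side is handled above in
 * osc_brw_fini_request(): when rq_bulk is NULL the server has returned
 * the data inline in RMF_SHORT_IO, and it is copied out into the
 * brw_page array.  The loop reduces to (helper name hypothetical):
 *
 *     static void short_io_copy_out(unsigned char *buf, int nob,
 *                                   struct brw_page **ppga, int count)
 *     {
 *             int i;
 *
 *             for (i = 0; nob > 0 && count > 0; i++, count--) {
 *                     int len = min_t(int, ppga[i]->count, nob);
 *                     char *ptr = kmap_atomic(ppga[i]->pg);
 *
 *                     memcpy(ptr + (ppga[i]->off & ~PAGE_MASK), buf, len);
 *                     kunmap_atomic(ptr);
 *                     buf += len;
 *                     nob -= len;
 *             }
 *     }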
*/ - if (osc_recoverable_error(rc)) { + if (osc_recoverable_error(rc) && !req->rq_no_delay) { if (req->rq_import_generation != req->rq_import->imp_generation) { CDEBUG(D_HA, "%s: resend cross eviction for object: " @@ -1793,20 +2123,26 @@ static int brw_interpret(const struct lu_env *env, cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); } - OBDO_FREE(aa->aa_oa); + OBD_SLAB_FREE_PTR(aa->aa_oa, osc_obdo_kmem); + aa->aa_oa = NULL; if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) osc_inc_unstable_pages(req); list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 1, rc); + osc_extent_finish(env, ext, 1, + rc && req->rq_no_delay ? -EWOULDBLOCK : rc); } LASSERT(list_empty(&aa->aa_exts)); LASSERT(list_empty(&aa->aa_oaps)); + transferred = (req->rq_bulk == NULL ? /* short io */ + aa->aa_requested_nob : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + ptlrpc_lprocfs_brw(req, transferred); spin_lock(&cli->cl_loi_list_lock); /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters @@ -1864,9 +2200,11 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, int page_count = 0; bool soft_sync = false; bool interrupted = false; + bool ndelay = false; int i; int grant = 0; int rc; + __u32 layout_version = 0; struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); struct ost_body *body; ENTRY; @@ -1878,6 +2216,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, mem_tight |= ext->oe_memalloc; grant += ext->oe_grants; page_count += ext->oe_nr_pages; + layout_version = MAX(layout_version, ext->oe_layout_version); if (obj == NULL) obj = ext->oe_obj; } @@ -1890,7 +2229,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, if (pga == NULL) GOTO(out, rc = -ENOMEM); - OBDO_ALLOC(oa); + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); if (oa == NULL) GOTO(out, rc = -ENOMEM); @@ -1920,6 +2259,8 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, if (oap->oap_interrupted) interrupted = true; } + if (ext->oe_ndelay) + ndelay = true; } /* first page in the list */ @@ -1933,8 +2274,16 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, crattr->cra_oa = oa; cl_req_attr_set(env, osc2cl(obj), crattr); - if (cmd == OBD_BRW_WRITE) + if (cmd == OBD_BRW_WRITE) { oa->o_grant_used = grant; + if (layout_version > 0) { + CDEBUG(D_LAYOUT, DFID": write with layout version %u\n", + PFID(&oa->o_oi.oi_fid), layout_version); + + oa->o_layout_version = layout_version; + oa->o_valid |= OBD_MD_LAYOUT_VERSION; + } + } sort_brw_pages(pga, page_count); rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); @@ -1949,6 +2298,12 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, oap->oap_request = ptlrpc_request_addref(req); if (interrupted && !req->rq_intr) ptlrpc_mark_interrupted(req); + if (ndelay) { + req->rq_no_resend = req->rq_no_delay = 1; + /* probably set a shorter timeout value. + * to handle ETIMEDOUT in brw_interpret() correctly. */ + /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */ + } /* Need to update the timestamps after the request is built in case * we race with setattr (locally or in queue at OST). If OST gets @@ -1957,7 +2312,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, * way to do this in a single call. 
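 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * Because a short-I/O request has no bulk descriptor, brw_interpret()
 * above derives the byte count for the stats from the async args when
 * rq_bulk is NULL (helper name hypothetical):
 *
 *     static unsigned long brw_transferred(struct ptlrpc_request *req,
 *                                          struct osc_brw_async_args *aa)
 *     {
 *             return req->rq_bulk ? req->rq_bulk->bd_nob_transferred
 *                                 : aa->aa_requested_nob;
 *     }
 *
 * The result is fed to ptlrpc_lprocfs_brw() in place of the former
 * unconditional rq_bulk dereference.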
bug 10150 */ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); crattr->cra_oa = &body->oa; - crattr->cra_flags = OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME; + crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; cl_req_attr_set(env, osc2cl(obj), crattr); lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); @@ -2002,7 +2357,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, LASSERT(req == NULL); if (oa) - OBDO_FREE(oa); + OBD_SLAB_FREE_PTR(oa, osc_obdo_kmem); if (pga) OBD_FREE(pga, sizeof(*pga) * page_count); /* this should happen rarely and is pretty bad, it makes the @@ -2035,10 +2390,10 @@ static int osc_set_lock_data(struct ldlm_lock *lock, void *data) return set; } -static int osc_enqueue_fini(struct ptlrpc_request *req, - osc_enqueue_upcall_f upcall, void *cookie, - struct lustre_handle *lockh, enum ldlm_mode mode, - __u64 *flags, int agl, int errcode) +int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, + void *cookie, struct lustre_handle *lockh, + enum ldlm_mode mode, __u64 *flags, bool speculative, + int errcode) { bool intent = *flags & LDLM_FL_HAS_INTENT; int rc; @@ -2055,7 +2410,7 @@ static int osc_enqueue_fini(struct ptlrpc_request *req, ptlrpc_status_ntoh(rep->lock_policy_res1); if (rep->lock_policy_res1) errcode = rep->lock_policy_res1; - if (!agl) + if (!speculative) *flags |= LDLM_FL_LVB_READY; } else if (errcode == ELDLM_OK) { *flags |= LDLM_FL_LVB_READY; @@ -2070,12 +2425,11 @@ static int osc_enqueue_fini(struct ptlrpc_request *req, if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) ldlm_lock_decref(lockh, mode); - RETURN(rc); + RETURN(rc); } -static int osc_enqueue_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_enqueue_args *aa, int rc) +int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) { struct ldlm_lock *lock; struct lustre_handle *lockh = &aa->oa_lockh; @@ -2105,7 +2459,7 @@ static int osc_enqueue_interpret(const struct lu_env *env, /* Let CP AST to grant the lock first. */ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); - if (aa->oa_agl) { + if (aa->oa_speculative) { LASSERT(aa->oa_lvb == NULL); LASSERT(aa->oa_flags == NULL); aa->oa_flags = &flags; @@ -2117,9 +2471,9 @@ static int osc_enqueue_interpret(const struct lu_env *env, lockh, rc); /* Complete osc stuff. */ rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, - aa->oa_flags, aa->oa_agl, rc); + aa->oa_flags, aa->oa_speculative, rc); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); ldlm_lock_decref(lockh, mode); LDLM_LOCK_PUT(lock); @@ -2137,10 +2491,10 @@ struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; * release locks just after they are obtained. 
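 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * In the enqueue hunks above, the old "agl" flag is generalized to
 * "speculative": a speculative enqueue (AGL or lockahead) has no I/O
 * waiting on the result, so the async args are wired up without an
 * LVB or flags buffer for the upcall:
 *
 *     aa->oa_speculative = speculative;
 *     aa->oa_lvb   = speculative ? NULL : lvb;
 *     aa->oa_flags = speculative ? NULL : flags;
 *
 * Only the non-speculative path publishes LDLM_FL_LVB_READY back to
 * the caller in osc_enqueue_fini().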
*/ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, void *cookie, - struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl) + struct ost_lvb *lvb, osc_enqueue_upcall_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset, int async, + bool speculative) { struct obd_device *obd = exp->exp_obd; struct lustre_handle lockh = { 0 }; @@ -2156,15 +2510,6 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; policy->l_extent.end |= ~PAGE_MASK; - /* - * kms is not valid when either object is completely fresh (so that no - * locks are cached), or object was evicted. In the latter case cached - * lock cannot be used, because it would prime inode state with - * potentially stale LVB. - */ - if (!kms_valid) - goto no_match; - /* Next, search for already existing extent locks that will cover us */ /* If we're trying to read, we also search for an existing PW lock. The * VFS and page cache already protect us locally, so lots of readers/ @@ -2180,7 +2525,10 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, mode = einfo->ei_mode; if (einfo->ei_mode == LCK_PR) mode |= LCK_PW; - if (agl == 0) + /* Normal lock requests must wait for the LVB to be ready before + * matching a lock; speculative lock requests do not need to, + * because they will not actually use the lock. */ + if (!speculative) match_flags |= LDLM_FL_LVB_READY; if (intent != 0) match_flags |= LDLM_FL_BLOCK_GRANTED; @@ -2193,13 +2541,22 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, RETURN(ELDLM_OK); matched = ldlm_handle2lock(&lockh); - if (agl) { - /* AGL enqueues DLM locks speculatively. Therefore if - * it already exists a DLM lock, it wll just inform the - * caller to cancel the AGL process for this stripe. */ + if (speculative) { + /* This DLM lock request is speculative, and does not + * have an associated IO request. 
Therefore if there + * is already a DLM lock, it wll just inform the + * caller to cancel the request for this stripe.*/ + lock_res_and_lock(matched); + if (ldlm_extent_equal(&policy->l_extent, + &matched->l_policy_data.l_extent)) + rc = -EEXIST; + else + rc = -ECANCELED; + unlock_res_and_lock(matched); + ldlm_lock_decref(&lockh, mode); LDLM_LOCK_PUT(matched); - RETURN(-ECANCELED); + RETURN(rc); } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { *flags |= LDLM_FL_LVB_READY; @@ -2215,7 +2572,6 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, } } -no_match: if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) RETURN(-ENOLCK); @@ -2246,20 +2602,20 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, struct osc_enqueue_args *aa; CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); aa = ptlrpc_req_async_args(req); - aa->oa_exp = exp; - aa->oa_mode = einfo->ei_mode; - aa->oa_type = einfo->ei_type; + aa->oa_exp = exp; + aa->oa_mode = einfo->ei_mode; + aa->oa_type = einfo->ei_type; lustre_handle_copy(&aa->oa_lockh, &lockh); - aa->oa_upcall = upcall; - aa->oa_cookie = cookie; - aa->oa_agl = !!agl; - if (!agl) { + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_speculative = speculative; + if (!speculative) { aa->oa_flags = flags; aa->oa_lvb = lvb; } else { - /* AGL is essentially to enqueue an DLM lock - * in advance, so we don't care about the - * result of AGL enqueue. */ + /* speculative locks are essentially to enqueue + * a DLM lock in advance, so we don't care + * about the result of the enqueue. */ aa->oa_lvb = NULL; aa->oa_flags = NULL; } @@ -2277,16 +2633,17 @@ int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, } rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, - flags, agl, rc); + flags, speculative, rc); if (intent) ptlrpc_req_finished(req); RETURN(rc); } -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, +int osc_match_base(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, enum ldlm_type type, + union ldlm_policy_data *policy, enum ldlm_mode mode, + __u64 *flags, struct osc_object *obj, struct lustre_handle *lockh, int unref) { struct obd_device *obd = exp->exp_obd; @@ -2314,11 +2671,19 @@ int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, if (rc == 0 || lflags & LDLM_FL_TEST_LOCK) RETURN(rc); - if (data != NULL) { + if (obj != NULL) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); LASSERT(lock != NULL); - if (!osc_set_lock_data(lock, data)) { + if (osc_set_lock_data(lock, obj)) { + lock_res_and_lock(lock); + if (!ldlm_is_lvb_cached(lock)) { + LASSERT(lock->l_ast_data == obj); + osc_lock_lvb_update(env, obj, lock, NULL); + ldlm_set_lvb_cached(lock); + } + unlock_res_and_lock(lock); + } else { ldlm_lock_decref(lockh, rc); rc = 0; } @@ -2361,13 +2726,13 @@ static int osc_statfs_interpret(const struct lu_env *env, } static int osc_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, __u64 max_age, + struct obd_info *oinfo, time64_t max_age, struct ptlrpc_request_set *rqset) { struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req; struct osc_async_args *aa; - int rc; + int rc; ENTRY; /* We could possibly pass max_age in the request (as an absolute @@ -2385,34 +2750,35 @@ static int osc_statfs_async(struct obd_export *exp, ptlrpc_request_free(req); RETURN(rc); } - 
ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); - if (oinfo->oi_flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; - CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; - ptlrpc_set_add_req(rqset, req); - RETURN(0); + ptlrpc_set_add_req(rqset, req); + RETURN(0); } static int osc_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) + struct obd_statfs *osfs, time64_t max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct obd_statfs *msfs; - struct ptlrpc_request *req; - struct obd_import *imp = NULL; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + ENTRY; + /*Since the request might also come from lprocfs, so we need *sync this with client_disconnect_export Bug15684*/ @@ -2423,49 +2789,48 @@ static int osc_statfs(const struct lu_env *env, struct obd_export *exp, if (!imp) RETURN(-ENODEV); - /* We could possibly pass max_age in the request (as an absolute - * timestamp or a "seconds.usec ago") so the target can avoid doing - * extra calls into the filesystem if that isn't necessary (e.g. - * during mount that would help a bit). Having relative timestamps - * is not so great if request processing is slow, while absolute - * timestamps are not ideal because they need time synchronization. */ - req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. 
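 *
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The OBD_STATFS_NODELAY handling above is what the rather terse
 * in-code comment means: a statfs triggered from procfs must not block
 * behind a dead or recovering OST, so the request is marked to fail
 * fast instead of waiting:
 *
 *     if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
 *             req->rq_no_resend = 1;
 *             req->rq_no_delay  = 1;
 *     }
 *
 * The same guard appears in the synchronous osc_statfs() below, and
 * max_age is now typed as time64_t in line with the timekeeping
 * cleanups elsewhere in this patch.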
*/ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); - class_import_put(imp); + class_import_put(imp); - if (req == NULL) - RETURN(-ENOMEM); + if (req == NULL) + RETURN(-ENOMEM); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); - if (rc) { - ptlrpc_request_free(req); - RETURN(rc); - } - ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); - if (flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } - rc = ptlrpc_queue_wait(req); - if (rc) - GOTO(out, rc); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); - msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); - if (msfs == NULL) { - GOTO(out, rc = -EPROTO); - } + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); - *osfs = *msfs; + *osfs = *msfs; - EXIT; - out: - ptlrpc_req_finished(req); - return rc; + EXIT; +out: + ptlrpc_req_finished(req); + return rc; } static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, @@ -2505,10 +2870,9 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, return err; } -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set) +int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + u32 keylen, void *key, u32 vallen, void *val, + struct ptlrpc_request_set *set) { struct ptlrpc_request *req; struct obd_device *obd = exp->exp_obd; @@ -2595,23 +2959,23 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? 
&RMF_OST_BODY : &RMF_SETINFO_VAL); - memcpy(tmp, val, vallen); + memcpy(tmp, val, vallen); if (KEY_IS(KEY_GRANT_SHRINK)) { - struct osc_grant_args *aa; - struct obdo *oa; - - CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - OBDO_ALLOC(oa); - if (!oa) { - ptlrpc_req_finished(req); - RETURN(-ENOMEM); - } - *oa = ((struct ost_body *)val)->oa; - aa->aa_oa = oa; - req->rq_interpret_reply = osc_shrink_grant_interpret; - } + struct osc_grant_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBD_SLAB_ALLOC_PTR_GFP(oa, osc_obdo_kmem, GFP_NOFS); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } ptlrpc_request_set_replen(req); if (!KEY_IS(KEY_GRANT_SHRINK)) { @@ -2624,25 +2988,27 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, RETURN(0); } +EXPORT_SYMBOL(osc_set_info_async); -static int osc_reconnect(const struct lu_env *env, - struct obd_export *exp, struct obd_device *obd, - struct obd_uuid *cluuid, - struct obd_connect_data *data, - void *localdata) +int osc_reconnect(const struct lu_env *env, struct obd_export *exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) { - struct client_obd *cli = &obd->u.cli; + struct client_obd *cli = &obd->u.cli; - if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { - long lost_grant; + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; long grant; spin_lock(&cli->cl_loi_list_lock); grant = cli->cl_avail_grant + cli->cl_reserved_grant; - if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) { + /* restore ocd_grant_blkbits as client page bits */ + data->ocd_grant_blkbits = PAGE_SHIFT; grant += cli->cl_dirty_grant; - else + } else { grant += cli->cl_dirty_pages << PAGE_SHIFT; + } data->ocd_grant = grant ? : 2 * cli_brw_size(obd); lost_grant = cli->cl_lost_grant; cli->cl_lost_grant = 0; @@ -2655,37 +3021,36 @@ static int osc_reconnect(const struct lu_env *env, RETURN(0); } +EXPORT_SYMBOL(osc_reconnect); -static int osc_disconnect(struct obd_export *exp) +int osc_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); int rc; - rc = client_disconnect_export(exp); - /** - * Initially we put del_shrink_grant before disconnect_export, but it - * causes the following problem if setup (connect) and cleanup - * (disconnect) are tangled together. - * connect p1 disconnect p2 - * ptlrpc_connect_import - * ............... class_manual_cleanup - * osc_disconnect - * del_shrink_grant - * ptlrpc_connect_interrupt - * init_grant_shrink - * add this client to shrink list - * cleanup_osc - * Bang! pinger trigger the shrink. - * So the osc should be disconnected from the shrink list, after we - * are sure the import has been destroyed. BUG18662 - */ - if (obd->u.cli.cl_import == NULL) - osc_del_shrink_grant(&obd->u.cli); - return rc; -} - -static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, - struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg) + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. 
+ * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * osc_init_grant + * add this client to shrink list + * cleanup_osc + * Bang! grant shrink thread trigger the shrink. BUG18662 + */ + osc_del_grant_list(&obd->u.cli); + return rc; +} +EXPORT_SYMBOL(osc_disconnect); + +int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) { struct lu_env *env = arg; struct ldlm_resource *res = cfs_hash_object(hs, hnode); @@ -2714,6 +3079,7 @@ static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, RETURN(0); } +EXPORT_SYMBOL(osc_ldlm_resource_invalidate); static int osc_import_event(struct obd_device *obd, struct obd_import *imp, @@ -2804,7 +3170,7 @@ static int osc_cancel_weight(struct ldlm_lock *lock) * Cancel all unused and granted extent lock. */ if (lock->l_resource->lr_type == LDLM_EXTENT && - lock->l_granted_mode == lock->l_req_mode && + ldlm_is_granted(lock) && osc_ldlm_weigh_ast(lock) == 0) RETURN(1); @@ -2821,15 +3187,12 @@ static int brw_queue_work(const struct lu_env *env, void *data) RETURN(0); } -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) { struct client_obd *cli = &obd->u.cli; - struct obd_type *type; - void *handler; - int rc; - int adding; - int added; - int req_count; + void *handler; + int rc; + ENTRY; rc = ptlrpcd_addref(); @@ -2840,9 +3203,10 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(out_ptlrpcd, rc); + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); if (IS_ERR(handler)) - GOTO(out_client_setup, rc = PTR_ERR(handler)); + GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler)); cli->cl_writeback_work = handler; handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); @@ -2855,36 +3219,43 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(out_ptlrpcd_work, rc); cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + osc_update_next_shrink(cli); -#ifdef CONFIG_PROC_FS - obd->obd_vars = lprocfs_osc_obd_vars; -#endif - /* If this is true then both client (osc) and server (osp) are on the - * same node. The osp layer if loaded first will register the osc proc - * directory. In that case this obd_device will be attached its proc - * tree to type->typ_procsym instead of obd->obd_type->typ_procroot. */ - type = class_search_type(LUSTRE_OSP_NAME); - if (type && type->typ_procsym) { - obd->obd_proc_entry = lprocfs_register(obd->obd_name, - type->typ_procsym, - obd->obd_vars, obd); - if (IS_ERR(obd->obd_proc_entry)) { - rc = PTR_ERR(obd->obd_proc_entry); - CERROR("error %d setting up lprocfs for %s\n", rc, - obd->obd_name); - obd->obd_proc_entry = NULL; - } - } else { - rc = lprocfs_obd_setup(obd); - } + RETURN(rc); - /* If the basic OSC proc tree construction succeeded then - * lets do the rest. 
*/ - if (rc == 0) { - lproc_osc_attach_seqstat(obd); - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); +out_ptlrpcd_work: + if (cli->cl_writeback_work != NULL) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; } + if (cli->cl_lru_work != NULL) { + ptlrpcd_destroy_work(cli->cl_lru_work); + cli->cl_lru_work = NULL; + } + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); +} +EXPORT_SYMBOL(osc_setup_common); + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obd->u.cli; + int adding; + int added; + int req_count; + int rc; + + ENTRY; + + rc = osc_setup_common(obd, lcfg); + if (rc < 0) + RETURN(rc); + + rc = osc_tunables_init(obd); + if (rc) + RETURN(rc); /* * We try to control the total number of requests with a upper limit @@ -2901,32 +3272,18 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) atomic_add(added, &osc_pool_req_count); } - INIT_LIST_HEAD(&cli->cl_grant_shrink_list); ns_register_cancel(obd->obd_namespace, osc_cancel_weight); spin_lock(&osc_shrink_lock); list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); spin_unlock(&osc_shrink_lock); + cli->cl_import->imp_idle_timeout = osc_idle_timeout; + cli->cl_import->imp_idle_debug = D_HA; RETURN(0); - -out_ptlrpcd_work: - if (cli->cl_writeback_work != NULL) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - if (cli->cl_lru_work != NULL) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; - } -out_client_setup: - client_obd_cleanup(obd); -out_ptlrpcd: - ptlrpcd_decref(); - RETURN(rc); } -static int osc_precleanup(struct obd_device *obd) +int osc_precleanup_common(struct obd_device *obd) { struct client_obd *cli = &obd->u.cli; ENTRY; @@ -2952,11 +3309,21 @@ static int osc_precleanup(struct obd_device *obd) } obd_cleanup_client_import(obd); + RETURN(0); +} +EXPORT_SYMBOL(osc_precleanup_common); + +static int osc_precleanup(struct obd_device *obd) +{ + ENTRY; + + osc_precleanup_common(obd); + ptlrpc_lprocfs_unregister_obd(obd); RETURN(0); } -int osc_cleanup(struct obd_device *obd) +int osc_cleanup_common(struct obd_device *obd) { struct client_obd *cli = &obd->u.cli; int rc; @@ -2986,11 +3353,13 @@ int osc_cleanup(struct obd_device *obd) ptlrpcd_decref(); RETURN(rc); } +EXPORT_SYMBOL(osc_cleanup_common); int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) { - int rc = class_process_proc_param(PARAM_OSC, obd->obd_vars, lcfg, obd); - return rc > 0 ? 0: rc; + ssize_t count = class_modify_config(lcfg, PARAM_OSC, + &obd->obd_kset.kobj); + return count > 0 ? 
0 : count; } static int osc_process_config(struct obd_device *obd, size_t len, void *buf) @@ -3002,7 +3371,7 @@ static struct obd_ops osc_obd_ops = { .o_owner = THIS_MODULE, .o_setup = osc_setup, .o_precleanup = osc_precleanup, - .o_cleanup = osc_cleanup, + .o_cleanup = osc_cleanup_common, .o_add_conn = client_import_add_conn, .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, @@ -3095,19 +3464,28 @@ static int __init osc_init(void) osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, ptlrpc_add_rqs_to_pool); - if (osc_rq_pool != NULL) - GOTO(out, rc); - rc = -ENOMEM; + if (osc_rq_pool == NULL) + GOTO(out_type, rc = -ENOMEM); + + rc = osc_start_grant_work(); + if (rc != 0) + GOTO(out_req_pool, rc); + + RETURN(rc); + +out_req_pool: + ptlrpc_free_rq_pool(osc_rq_pool); out_type: class_unregister_type(LUSTRE_OSC_NAME); out_kmem: lu_kmem_fini(osc_caches); -out: + RETURN(rc); } static void __exit osc_exit(void) { + osc_stop_grant_work(); remove_shrinker(osc_cache_shrinker); class_unregister_type(LUSTRE_OSC_NAME); lu_kmem_fini(osc_caches); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile index bf464b8d7b53b..f192313597822 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/Makefile @@ -15,7 +15,7 @@ ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o -ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_crr.o nrs_orr.o ptlrpc_objs += nrs_tbf.o nrs_delay.o errno.o diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c index 9642a5644009f..b9888d92b1fd8 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/client.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -34,6 +34,7 @@ #define DEBUG_SUBSYSTEM S_RPC +#include #include #include #include @@ -126,6 +127,12 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, (ptlrpc_is_bulk_desc_kvec(type) && ops->add_iov_frag != NULL)); + if (max_brw > PTLRPC_BULK_OPS_COUNT) + RETURN(NULL); + + if (nfrags > LNET_MAX_IOV * max_brw) + RETURN(NULL); + OBD_ALLOC_PTR(desc); if (desc == NULL) return NULL; @@ -148,6 +155,7 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned nfrags, unsigned max_brw, desc->bd_portal = portal; desc->bd_type = type; desc->bd_md_count = 0; + desc->bd_nob_last = LNET_MTU; desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *) ops; LASSERT(max_brw > 0); desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); @@ -214,7 +222,15 @@ void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); kiov = &BD_GET_KIOV(desc, desc->bd_iov_count); + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + desc->bd_nob_last += len; desc->bd_nob += len; if (pin) @@ -240,7 +256,15 @@ int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type)); iovec = &BD_GET_KVEC(desc, desc->bd_iov_count); + if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || + ((desc->bd_nob_last + len) > LNET_MTU)) { + desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; + desc->bd_md_count++; + desc->bd_nob_last = 0; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); + } + desc->bd_nob_last += len; desc->bd_nob += len; iovec->iov_base = frag; @@ -258,7 +282,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ - LASSERT(desc->bd_md_count == 0); /* network hands off */ + LASSERT(desc->bd_refs == 0); /* network hands off */ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); LASSERT(desc->bd_frag_ops != NULL); @@ -353,7 +377,7 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) /* Adjust expected network latency */ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time) + timeout_t service_timeout) { unsigned int nl, oldnl; struct imp_at *at; @@ -361,8 +385,9 @@ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, LASSERT(req->rq_import); - if (service_time > now - req->rq_sent + 3) { - /* bz16408, however, this can also happen if early reply + if (service_timeout > now - req->rq_sent + 3) { + /* + * b=16408, however, this can also happen if early reply * is lost and client RPC is expired and resent, early reply * or reply of original RPC can still be fit in reply buffer * of resent RPC, now client is measuring time from the @@ -372,13 +397,13 @@ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? 
D_ADAPTTO : D_WARNING, "Reported service time %u > total measured time %lld\n", - service_time, now - req->rq_sent); + service_timeout, now - req->rq_sent); return; } /* Network latency is total time less server processing time */ nl = max_t(int, now - req->rq_sent - - service_time, 0) + 1; /* st rounding */ + service_timeout, 0) + 1; /* st rounding */ at = &req->rq_import->imp_at; oldnl = at_measured(&at->iat_net_latency, nl); @@ -419,6 +444,7 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) __must_hold(&req->rq_lock) { struct ptlrpc_request *early_req; + timeout_t service_timeout; time64_t olddl; int rc; @@ -448,8 +474,8 @@ __must_hold(&req->rq_lock) lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); /* Network latency can be adjusted, it is pure network delays */ - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(early_req->rq_repmsg)); + service_timeout = lustre_msg_get_service_timeout(early_req->rq_repmsg); + ptlrpc_at_adj_net_latency(req, service_timeout); sptlrpc_cli_finish_early_reply(early_req); @@ -777,6 +803,7 @@ int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, LASSERT(!request->rq_pool); sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); out_free: + atomic_dec(&imp->imp_reqs); class_import_put(imp); return rc; @@ -845,6 +872,7 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, LASSERT(imp->imp_client != LP_POISON); request->rq_import = class_import_get(imp); + atomic_inc(&imp->imp_reqs); } else { CERROR("request allocation out of memory\n"); } @@ -852,6 +880,33 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, return request; } +static int ptlrpc_reconnect_if_idle(struct obd_import *imp) +{ + int rc; + + /* + * initiate connection if needed when the import has been + * referenced by the new request to avoid races with disconnect. + * serialize this check against conditional state=IDLE + * in ptlrpc_disconnect_idle_interpret() + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + imp->imp_state = LUSTRE_IMP_NEW; + + /* connect_import_locked releases imp_lock */ + rc = ptlrpc_connect_import_locked(imp); + if (rc) + return rc; + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + return 0; +} + /** * Helper function for creating a request. 
* Calls __ptlrpc_request_alloc to allocate new request sturcture and inits @@ -863,11 +918,21 @@ ptlrpc_request_alloc_internal(struct obd_import *imp, struct ptlrpc_request_pool * pool, const struct req_format *format) { - struct ptlrpc_request *request; + struct ptlrpc_request *request; - request = __ptlrpc_request_alloc(imp, pool); - if (request == NULL) - return NULL; + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + /* don't make expensive check for idling connection + * if it's already connected */ + if (unlikely(imp->imp_state != LUSTRE_IMP_FULL)) { + if (ptlrpc_reconnect_if_idle(imp) < 0) { + atomic_dec(&imp->imp_reqs); + ptlrpc_request_free(request); + return NULL; + } + } req_capsule_init(&request->rq_pill, request, RCL_CLIENT); req_capsule_set(&request->rq_pill, format); @@ -956,7 +1021,6 @@ struct ptlrpc_request_set *ptlrpc_prep_set(void) atomic_set(&set->set_remaining, 0); spin_lock_init(&set->set_new_req_lock); INIT_LIST_HEAD(&set->set_new_requests); - INIT_LIST_HEAD(&set->set_cblist); set->set_max_inflight = UINT_MAX; set->set_producer = NULL; set->set_producer_arg = NULL; @@ -1051,27 +1115,6 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *set) } EXPORT_SYMBOL(ptlrpc_set_destroy); -/** - * Add a callback function \a fn to the set. - * This function would be called when all requests on this set are completed. - * The function will be passed \a data argument. - */ -int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, - set_interpreter_func fn, void *data) -{ - struct ptlrpc_set_cbdata *cbdata; - - OBD_ALLOC_PTR(cbdata); - if (cbdata == NULL) - RETURN(-ENOMEM); - - cbdata->psc_interpret = fn; - cbdata->psc_data = data; - list_add_tail(&cbdata->psc_item, &set->set_cblist); - - RETURN(0); -} - /** * Add a new request to the general purpose request set. * Assumes request reference from the caller. @@ -1079,6 +1122,7 @@ int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, void ptlrpc_set_add_req(struct ptlrpc_request_set *set, struct ptlrpc_request *req) { + LASSERT(req->rq_import->imp_state != LUSTRE_IMP_IDLE); LASSERT(list_empty(&req->rq_set_chain)); if (req->rq_allow_intr) @@ -1088,7 +1132,7 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set, list_add_tail(&req->rq_set_chain, &set->set_requests); req->rq_set = set; atomic_inc(&set->set_remaining); - req->rq_queued_time = cfs_time_current(); + req->rq_queued_time = ktime_get_seconds(); if (req->rq_reqmsg != NULL) lustre_msg_set_jobid(req->rq_reqmsg, NULL); @@ -1119,7 +1163,7 @@ void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, * The set takes over the caller's request reference. 
*/ req->rq_set = set; - req->rq_queued_time = cfs_time_current(); + req->rq_queued_time = ktime_get_seconds(); list_add_tail(&req->rq_set_chain, &set->set_new_requests); count = atomic_inc_return(&set->set_new_count); spin_unlock(&set->set_new_req_lock); @@ -1155,17 +1199,19 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, LASSERT (status != NULL); *status = 0; - if (req->rq_ctx_init || req->rq_ctx_fini) { - /* always allow ctx init/fini rpc go through */ - } else if (imp->imp_state == LUSTRE_IMP_NEW) { - DEBUG_REQ(D_ERROR, req, "Uninitialized import."); - *status = -EIO; + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { - /* pings may safely race with umount */ - DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? + unsigned int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* pings or MDS-equivalent STATFS may safely race with umount */ + DEBUG_REQ((opc == OBD_PING || opc == OST_STATFS) ? D_HA : D_ERROR, req, "IMP_CLOSED "); *status = -EIO; - } else if (ptlrpc_send_limit_expired(req)) { + } else if (ptlrpc_send_limit_expired(req)) { /* probably doesn't need to be a D_ERROR after initial testing*/ DEBUG_REQ(D_HA, req, "send limit expired "); *status = -ETIMEDOUT; @@ -1188,7 +1234,9 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, if (atomic_read(&imp->imp_inval_count) != 0) { DEBUG_REQ(D_ERROR, req, "invalidate in flight"); *status = -EIO; - } else if (req->rq_no_delay) { + } else if (req->rq_no_delay && + imp->imp_generation != imp->imp_initiated_at) { + /* ignore nodelay for requests initiating connections */ *status = -EWOULDBLOCK; } else if (req->rq_allow_replay && (imp->imp_state == LUSTRE_IMP_REPLAY || @@ -1213,16 +1261,12 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, * \retval false if no message should be printed * \retval true if console message should be printed */ -static bool ptlrpc_console_allow(struct ptlrpc_request *req) +static bool ptlrpc_console_allow(struct ptlrpc_request *req, __u32 opc, int err) { - __u32 opc; - LASSERT(req->rq_reqmsg != NULL); - opc = lustre_msg_get_opc(req->rq_reqmsg); /* Suppress particular reconnect errors which are to be expected. */ if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { - int err; /* Suppress timed out reconnect requests */ if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || @@ -1232,12 +1276,20 @@ static bool ptlrpc_console_allow(struct ptlrpc_request *req) /* Suppress most unavailable/again reconnect requests, but * print occasionally so it is clear client is trying to * connect to a server where no target is running. 
*/ - err = lustre_msg_get_status(req->rq_repmsg); if ((err == -ENODEV || err == -EAGAIN) && req->rq_import->imp_conn_cnt % 30 != 20) return false; } + if (opc == LDLM_ENQUEUE && err == -EAGAIN) + /* -EAGAIN is normal when using POSIX flocks */ + return false; + + if (opc == OBD_PING && (err == -ENODEV || err == -ENOTCONN) && + (req->rq_xid & 0xf) != 10) + /* Suppress most ping requests, they may fail occasionally */ + return false; + return true; } @@ -1256,9 +1308,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) lnet_nid_t nid = imp->imp_connection->c_peer.nid; __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - /* -EAGAIN is normal when using POSIX flocks */ - if (ptlrpc_console_allow(req) && - !(opc == LDLM_ENQUEUE && err == -EAGAIN)) + if (ptlrpc_console_allow(req, opc, err)) LCONSOLE_ERROR_MSG(0x11, "%s: operation %s to node %s " "failed: rc = %d\n", imp->imp_obd->obd_name, @@ -1429,8 +1479,8 @@ static int after_reply(struct ptlrpc_request *req) if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_timeout(req->rq_repmsg)); rc = ptlrpc_check_status(req); imp->imp_connect_error = rc; @@ -1557,8 +1607,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) req->rq_waiting = 1; spin_unlock(&req->rq_lock); - DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " - "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), + DEBUG_REQ(D_HA, req, "req waiting for recovery: (%s != %s)", ptlrpc_import_state_name(req->rq_send_state), ptlrpc_import_state_name(imp->imp_state)); LASSERT(list_empty(&req->rq_list)); @@ -1616,8 +1665,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) " %s:%s:%d:%llu:%s:%d\n", current_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - libcfs_nid2str(imp->imp_connection->c_peer.nid), - lustre_msg_get_opc(req->rq_reqmsg)); + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg)); rc = ptl_send_rpc(req, 0); if (rc == -ENOMEM) { @@ -1871,8 +1919,11 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_unlock(&imp->imp_lock); GOTO(interpret, req->rq_status); } + /* ignore on just initiated connections */ if (ptlrpc_no_resend(req) && - !req->rq_wait_ctx) { + !req->rq_wait_ctx && + imp->imp_generation != + imp->imp_initiated_at) { req->rq_status = -ENOTCONN; ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); @@ -2043,7 +2094,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - libcfs_nid2str(imp->imp_connection->c_peer.nid), + obd_import_nid2str(imp), lustre_msg_get_opc(req->rq_reqmsg)); spin_lock(&imp->imp_lock); @@ -2100,6 +2151,7 @@ EXPORT_SYMBOL(ptlrpc_check_set); int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) { struct obd_import *imp = req->rq_import; + unsigned int debug_mask = D_RPCTRACE; int rc = 0; ENTRY; @@ -2107,12 +2159,15 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) req->rq_timedout = 1; spin_unlock(&req->rq_lock); - DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent %lld/real %lld]", - req->rq_net_err ? 
"failed due to network error" : - ((req->rq_real_sent == 0 || + if (ptlrpc_console_allow(req, lustre_msg_get_opc(req->rq_reqmsg), + lustre_msg_get_status(req->rq_reqmsg))) + debug_mask = D_WARNING; + DEBUG_REQ(debug_mask, req, "Request sent has %s: [sent %lld/real %lld]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || req->rq_real_sent < req->rq_sent || req->rq_real_sent >= req->rq_deadline) ? - "timed out for sent delay" : "timed out for slow reply"), + "timed out for sent delay" : "timed out for slow reply"), (s64)req->rq_sent, (s64)req->rq_real_sent); if (imp != NULL && obd_debug_peer_on_timeout) @@ -2253,7 +2308,7 @@ static void ptlrpc_interrupted_set(void *data) /** * Get the smallest timeout in the set; this does NOT set a timeout. */ -int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) { struct list_head *tmp; time64_t now = ktime_get_real_seconds(); @@ -2306,13 +2361,14 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) * error or otherwise be interrupted). * Returns 0 on success or error code otherwise. */ -int ptlrpc_set_wait(struct ptlrpc_request_set *set) +int ptlrpc_set_wait(const struct lu_env *env, struct ptlrpc_request_set *set) { - struct list_head *tmp; - struct ptlrpc_request *req; - struct l_wait_info lwi; - int rc, timeout; - ENTRY; + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + time64_t timeout; + int rc; + ENTRY; if (set->set_producer) (void)ptlrpc_set_producer(set); @@ -2327,13 +2383,13 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) if (list_empty(&set->set_requests)) RETURN(0); - do { - timeout = ptlrpc_set_next_timeout(set); + do { + timeout = ptlrpc_set_next_timeout(set); /* wait until all complete, interrupted, or an in-flight * req times out */ - CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", - set, timeout); + CDEBUG(D_RPCTRACE, "set %p going to sleep for %lld seconds\n", + set, timeout); if ((timeout == 0 && !signal_pending(current)) || set->set_allow_intr) @@ -2354,7 +2410,8 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? 
timeout : 1), ptlrpc_expired_set, set); - rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); + rc = l_wait_event(set->set_waitq, + ptlrpc_check_set(NULL, set), &lwi); /* LU-769 - if we ignored the signal because it was already * pending when we started, we need to handle it now or we risk @@ -2405,25 +2462,7 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) rc = req->rq_status; } - if (set->set_interpret != NULL) { - int (*interpreter)(struct ptlrpc_request_set *set,void *,int) = - set->set_interpret; - rc = interpreter (set, set->set_arg, rc); - } else { - struct ptlrpc_set_cbdata *cbdata, *n; - int err; - - list_for_each_entry_safe(cbdata, n, - &set->set_cblist, psc_item) { - list_del_init(&cbdata->psc_item); - err = cbdata->psc_interpret(set, cbdata->psc_data, rc); - if (err && !rc) - rc = err; - OBD_FREE_PTR(cbdata); - } - } - - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(ptlrpc_set_wait); @@ -2473,9 +2512,13 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) sptlrpc_cli_free_repbuf(request); if (request->rq_import != NULL) { - class_import_put(request->rq_import); - request->rq_import = NULL; - } + if (!ptlrpcd_check_work(request)) { + LASSERT(atomic_read(&request->rq_import->imp_reqs) > 0); + atomic_dec(&request->rq_import->imp_reqs); + } + class_import_put(request->rq_import); + request->rq_import = NULL; + } if (request->rq_bulk != NULL) ptlrpc_free_bulk(request->rq_bulk); @@ -2679,8 +2722,11 @@ void ptlrpc_request_committed(struct ptlrpc_request *req, int force) return; } - if (force || req->rq_transno <= imp->imp_peer_committed_transno) + if (force || req->rq_transno <= imp->imp_peer_committed_transno) { + if (imp->imp_replay_cursor == &req->rq_replay_list) + imp->imp_replay_cursor = req->rq_replay_list.next; ptlrpc_free_request(req); + } spin_unlock(&imp->imp_lock); } @@ -2792,7 +2838,7 @@ void ptlrpc_cleanup_client(struct obd_import *imp) */ void ptlrpc_resend_req(struct ptlrpc_request *req) { - DEBUG_REQ(D_HA, req, "going to resend"); + DEBUG_REQ(D_HA, req, "going to resend"); spin_lock(&req->rq_lock); /* Request got reply but linked to the import list still. 
@@ -2803,14 +2849,13 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) return; } - lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); - req->rq_status = -EAGAIN; + req->rq_status = -EAGAIN; - req->rq_resend = 1; - req->rq_net_err = 0; - req->rq_timedout = 0; + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; - ptlrpc_client_wake_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); } @@ -2920,13 +2965,13 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) /* for distributed debugging */ lustre_msg_set_status(req->rq_reqmsg, current_pid()); - /* add a ref for the set (see comment in ptlrpc_set_add_req) */ - ptlrpc_request_addref(req); - ptlrpc_set_add_req(set, req); - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(NULL, set); + ptlrpc_set_destroy(set); - RETURN(rc); + RETURN(rc); } EXPORT_SYMBOL(ptlrpc_queue_wait); @@ -2966,7 +3011,6 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); spin_lock(&imp->imp_lock); imp->imp_vbr_failed = 1; - imp->imp_no_lock_replay = 1; spin_unlock(&imp->imp_lock); lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); } else { @@ -2980,9 +3024,6 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, } spin_lock(&imp->imp_lock); - /** if replays by version then gap occur on server, no trust to locks */ - if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) - imp->imp_no_lock_replay = 1; imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); spin_unlock(&imp->imp_lock); LASSERT(imp->imp_last_replay_transno); @@ -3081,14 +3122,15 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, */ int ptlrpc_replay_req(struct ptlrpc_request *req) { - struct ptlrpc_replay_async_args *aa; - ENTRY; + struct ptlrpc_replay_async_args *aa; - LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + ENTRY; - LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - memset(aa, 0, sizeof *aa); + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof(*aa)); /* Prepare request to be resent with ptlrpcd */ aa->praa_old_state = req->rq_send_state; @@ -3104,8 +3146,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) /* Tell server the net_latency, so the server can calculate how long * it should wait for next replay */ - lustre_msg_set_service_time(req->rq_reqmsg, - ptlrpc_at_get_net_latency(req)); + lustre_msg_set_service_timeout(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); DEBUG_REQ(D_HA, req, "REPLAY"); atomic_inc(&req->rq_import->imp_replay_inflight); @@ -3126,11 +3168,12 @@ void ptlrpc_abort_inflight(struct obd_import *imp) struct list_head *tmp, *n; ENTRY; - /* Make sure that no new requests get processed for this import. + /* + * Make sure that no new requests get processed for this import. * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing * this flag and then putting requests on sending_list or delayed_list. */ - spin_lock(&imp->imp_lock); + assert_spin_locked(&imp->imp_lock); /* XXX locking? Maybe we should remove each request with the list * locked? 
Also, how do we know if the requests on the list are @@ -3172,8 +3215,6 @@ void ptlrpc_abort_inflight(struct obd_import *imp) if (imp->imp_replayable) ptlrpc_free_committed(imp); - spin_unlock(&imp->imp_lock); - EXIT; } @@ -3313,8 +3354,7 @@ void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req) /* For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so * that server can infer the number of bulks that were prepared, * see LU-1431 */ - req->rq_mbits += ((bd->bd_iov_count + LNET_MAX_IOV - 1) / - LNET_MAX_IOV) - 1; + req->rq_mbits += bd->bd_md_count - 1; /* Set rq_xid as rq_mbits to indicate the final bulk for the old * server which does not support OBD_CONNECT_BULK_MBITS. LU-6808. @@ -3442,7 +3482,7 @@ void *ptlrpcd_alloc_work(struct obd_import *imp, req->rq_no_delay = req->rq_no_resend = 1; req->rq_pill.rc_fmt = (void *)&worker_format; - CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args)); + CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); args = ptlrpc_req_async_args(req); args->cb = cb; args->cbdata = cbdata; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c index fb302c70d08be..a3d31a853244c 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/errno.c @@ -26,9 +26,10 @@ */ #include -#include +#include #ifdef LUSTRE_TRANSLATE_ERRNOS +#include /* * The two translation tables below must define a one-to-one mapping between @@ -185,7 +186,20 @@ static int lustre_errno_hton_mapping[] = { [ESERVERFAULT] = LUSTRE_ESERVERFAULT, [EBADTYPE] = LUSTRE_EBADTYPE, [EJUKEBOX] = LUSTRE_EJUKEBOX, - [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED + [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS]= ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE }; static int lustre_errno_ntoh_mapping[] = { @@ -331,7 +345,20 @@ static int lustre_errno_ntoh_mapping[] = { [LUSTRE_ESERVERFAULT] = ESERVERFAULT, [LUSTRE_EBADTYPE] = EBADTYPE, [LUSTRE_EJUKEBOX] = EJUKEBOX, - [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED + [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, + + /* + * The ELDLM errors are Lustre specific errors whose ranges + * lie in the middle of the above system errors. The ELDLM + * numbers must be preserved to avoid LU-9793. + */ + [ELDLM_LOCK_CHANGED] = ELDLM_LOCK_CHANGED, + [ELDLM_LOCK_ABORTED] = ELDLM_LOCK_ABORTED, + [ELDLM_LOCK_REPLACED] = ELDLM_LOCK_REPLACED, + [ELDLM_NO_LOCK_DATA] = ELDLM_NO_LOCK_DATA, + [ELDLM_LOCK_WOULDBLOCK] = ELDLM_LOCK_WOULDBLOCK, + [ELDLM_NAMESPACE_EXISTS] = ELDLM_NAMESPACE_EXISTS, + [ELDLM_BAD_NAMESPACE] = ELDLM_BAD_NAMESPACE }; unsigned int lustre_errno_hton(unsigned int h) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c index 28533cca19a32..6c713b22b94ae 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/events.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/events.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. 
+ * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,6 +56,11 @@ void request_out_callback(struct lnet_event *ev) DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + /* Do not update imp_next_ping for connection request */ + if (lustre_msg_get_opc(req->rq_reqmsg) != + req->rq_import->imp_connect_op) + ptlrpc_pinger_sending_on_import(req->rq_import); + sptlrpc_request_out_callback(req); spin_lock(&req->rq_lock); @@ -161,12 +166,13 @@ void reply_in_callback(struct lnet_event *ev) ev->mlength, ev->offset, req->rq_replen); } - req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); out_wake: - /* NB don't unlock till after wakeup; req can disappear under us - * since we don't have our own ref */ - ptlrpc_client_wake_req(req); + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); EXIT; } @@ -200,8 +206,8 @@ void client_bulk_callback(struct lnet_event *ev) spin_lock(&desc->bd_lock); req = desc->bd_req; - LASSERT(desc->bd_md_count > 0); - desc->bd_md_count--; + LASSERT(desc->bd_refs > 0); + desc->bd_refs--; if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { desc->bd_nob_transferred += ev->mlength; @@ -218,7 +224,7 @@ void client_bulk_callback(struct lnet_event *ev) /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ - if (desc->bd_md_count == 0) + if (desc->bd_refs == 0) ptlrpc_client_wake_req(desc->bd_req); spin_unlock(&desc->bd_lock); @@ -450,7 +456,7 @@ void server_bulk_callback(struct lnet_event *ev) spin_lock(&desc->bd_lock); - LASSERT(desc->bd_md_count > 0); + LASSERT(desc->bd_refs > 0); if ((ev->type == LNET_EVENT_ACK || ev->type == LNET_EVENT_REPLY) && @@ -466,9 +472,9 @@ void server_bulk_callback(struct lnet_event *ev) desc->bd_failure = 1; if (ev->unlinked) { - desc->bd_md_count--; + desc->bd_refs--; /* This is the last callback no matter what... */ - if (desc->bd_md_count == 0) + if (desc->bd_refs == 0) wake_up(&desc->bd_waitq); } @@ -500,14 +506,14 @@ static void ptlrpc_master_callback(struct lnet_event *ev) int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct lnet_process_id *peer, lnet_nid_t *self) { - int best_dist = 0; - __u32 best_order = 0; - int count = 0; - int rc = -ENOENT; - int dist; - __u32 order; - lnet_nid_t dst_nid; - lnet_nid_t src_nid; + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; peer->pid = LNET_PID_LUSTRE; @@ -522,7 +528,7 @@ int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, continue; if (dist == 0) { /* local! 
use loopback LND */ - peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); + peer->nid = *self = LNET_NID_LO_0; rc = 0; break; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h index a5bbaea6065d3..a5f203e215389 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_api.h @@ -21,10 +21,16 @@ struct gss_api_mech; +typedef int (*digest_hash)( + struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs); + /* The mechanism-independent gss-api context: */ struct gss_ctx { - struct gss_api_mech *mech_type; - void *internal_ctx_id; + struct gss_api_mech *mech_type; + void *internal_ctx_id; + digest_hash hash_func; }; #define GSS_C_NO_BUFFER ((rawobj_t) 0) @@ -44,7 +50,7 @@ __u32 lgss_copy_reverse_context( struct gss_ctx **ctx_new); __u32 lgss_inquire_context( struct gss_ctx *ctx, - unsigned long *endtime); + time64_t *endtime); __u32 lgss_get_mic( struct gss_ctx *ctx, int msgcnt, @@ -119,7 +125,7 @@ struct gss_api_ops { struct gss_ctx *ctx_new); __u32 (*gss_inquire_context)( struct gss_ctx *ctx, - unsigned long *endtime); + time64_t *endtime); __u32 (*gss_get_mic)( struct gss_ctx *ctx, int msgcnt, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c index 3f703372d272f..041dd12dac593 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_bulk.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c index d1fa9200452ba..70d4711c67a96 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -60,82 +59,85 @@ static int ctx_init_pack_request(struct obd_import *imp, - struct ptlrpc_request *req, - int lustre_srv, - uid_t uid, gid_t gid, - long token_size, - char __user *token) + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) { - struct lustre_msg *msg = req->rq_reqbuf; - struct gss_sec *gsec; - struct gss_header *ghdr; - struct ptlrpc_user_desc *pud; - __u32 *p, size, offset = 2; - rawobj_t obj; - - LASSERT(msg->lm_bufcount <= 4); - LASSERT(req->rq_cli_ctx); - LASSERT(req->rq_cli_ctx->cc_sec); - - /* gss hdr */ - ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); - ghdr->gh_version = PTLRPC_GSS_VERSION; - ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; - ghdr->gh_flags = 0; - ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; - ghdr->gh_seq = 0; - ghdr->gh_svc = SPTLRPC_SVC_NULL; - ghdr->gh_handle.len = 0; - - /* fix the user desc */ - if (req->rq_pack_udesc) { - ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; - - pud = lustre_msg_buf(msg, offset, sizeof(*pud)); - LASSERT(pud); - pud->pud_uid = pud->pud_fsuid = uid; - pud->pud_gid = pud->pud_fsgid = gid; - pud->pud_cap = 0; - pud->pud_ngroups = 0; - offset++; - } + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = 
lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } - /* security payload */ - p = lustre_msg_buf(msg, offset, 0); - size = msg->lm_buflens[offset]; - LASSERT(p); - - /* 1. lustre svc type */ - LASSERT(size > 4); - *p++ = cpu_to_le32(lustre_srv); - size -= 4; - - /* 2. target uuid */ - obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; - obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; - if (rawobj_serialize(&obj, &p, &size)) - LBUG(); - - /* 3. reverse context handle. actually only needed by root user, - * but we send it anyway. */ - gsec = sec2gsec(req->rq_cli_ctx->cc_sec); - obj.len = sizeof(gsec->gs_rvs_hdl); - obj.data = (__u8 *) &gsec->gs_rvs_hdl; - if (rawobj_serialize(&obj, &p, &size)) - LBUG(); - - /* 4. now the token */ - LASSERT(size >= (sizeof(__u32) + token_size)); - *p++ = cpu_to_le32(((__u32) token_size)); + /* new clients are expected to set KCSUM flag */ + ghdr->gh_flags |= LUSTRE_GSS_PACK_KCSUM; + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. 
now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); if (copy_from_user(p, token, token_size)) { - CERROR("can't copy token\n"); - return -EFAULT; - } - size -= sizeof(__u32) + cfs_size_round4(token_size); + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + cfs_size_round4(token_size); - req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, - msg->lm_buflens[offset] - size, 0); - return 0; + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; } static diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c index 17fd9cf3c00c1..7be412d2d4a72 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.c @@ -55,12 +55,12 @@ #include "gss_internal.h" #include "gss_crypto.h" -int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, const int alg_mode) { int rc; - kb->kb_tfm = crypto_alloc_blkcipher(alg_name, alg_mode, 0); + kb->kb_tfm = crypto_alloc_sync_skcipher(alg_name, alg_mode, 0); if (IS_ERR(kb->kb_tfm)) { rc = PTR_ERR(kb->kb_tfm); kb->kb_tfm = NULL; @@ -69,8 +69,8 @@ int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, return rc; } - rc = crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, - kb->kb_key.len); + rc = crypto_sync_skcipher_setkey(kb->kb_tfm, kb->kb_key.data, + kb->kb_key.len); if (rc) { CERROR("failed to set %s key, len %d, rc = %d\n", alg_name, kb->kb_key.len, rc); @@ -84,7 +84,7 @@ void gss_keyblock_free(struct gss_keyblock *kb) { rawobj_free(&kb->kb_key); if (kb->kb_tfm) - crypto_free_blkcipher(kb->kb_tfm); + crypto_free_sync_skcipher(kb->kb_tfm); } int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb) @@ -226,86 +226,76 @@ void gss_teardown_sgtable(struct sg_table *sgt) sg_free_table(sgt); } -int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, - const void *in, void *out, size_t length) +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length) { - struct blkcipher_desc desc; struct scatterlist sg; struct sg_table sg_out; __u8 local_iv[16] = {0}; __u32 ret = -EINVAL; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(tfm); - desc.tfm = tfm; - desc.info = local_iv; - desc.flags = 0; - if (length % crypto_blkcipher_blocksize(tfm) != 0) { + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) { CERROR("output length %zu mismatch blocksize %d\n", - length, crypto_blkcipher_blocksize(tfm)); + length, crypto_sync_skcipher_blocksize(tfm)); goto out; } - if (crypto_blkcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { - CERROR("iv size too large %d\n", crypto_blkcipher_ivsize(tfm)); + if (crypto_sync_skcipher_ivsize(tfm) > ARRAY_SIZE(local_iv)) { + CERROR("iv size too large %d\n", + crypto_sync_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); - memcpy(out, in, length); + if (in != out) + memmove(out, in, length); ret = gss_setup_sgtable(&sg_out, &sg, out, length); if (ret != 0) goto out; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, length, local_iv); + if (decrypt) - ret = 
crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length); + ret = crypto_skcipher_decrypt_iv(req, &sg, &sg, length); else - ret = crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length); + ret = crypto_skcipher_encrypt_iv(req, &sg, &sg, length); + skcipher_request_zero(req); gss_teardown_sgtable(&sg_out); out: return ret; } -int gss_digest_hmac(struct crypto_hash *tfm, - rawobj_t *key, - rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum) +int gss_digest_hash(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs) { - struct hash_desc desc = { - .tfm = tfm, - .flags = 0, - }; struct scatterlist sg[1]; struct sg_table sgt; + int rc = 0; int i; - int rc; - - rc = crypto_hash_setkey(tfm, key->data, key->len); - if (rc) - return rc; - - rc = crypto_hash_init(&desc); - if (rc) - return rc; for (i = 0; i < msgcnt; i++) { if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); - if (rc != 0) - return rc; - rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) return rc; + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } for (i = 0; i < iovcnt; i++) { @@ -315,59 +305,50 @@ int gss_digest_hmac(struct crypto_hash *tfm, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, iovs[i].kiov_offset); - rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); + rc = crypto_ahash_update(req); if (rc) return rc; } if (hdr) { - rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); - if (rc != 0) - return rc; - rc = crypto_hash_update(&desc, sg, sizeof(hdr->len)); + rc = gss_setup_sgtable(&sgt, sg, hdr->data, hdr->len); if (rc) return rc; + ahash_request_set_crypt(req, sg, NULL, hdr->len); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } - return crypto_hash_final(&desc, cksum->data); + return rc; } -int gss_digest_norm(struct crypto_hash *tfm, - struct gss_keyblock *kb, - rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, - int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum) +int gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs) { - struct hash_desc desc; struct scatterlist sg[1]; struct sg_table sgt; - int i; - int rc; - - LASSERT(kb->kb_tfm); - desc.tfm = tfm; - desc.flags = 0; - - rc = crypto_hash_init(&desc); - if (rc) - return rc; + int rc = 0; + int i; for (i = 0; i < msgcnt; i++) { if (msgs[i].len == 0) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); - if (rc != 0) - return rc; - - rc = crypto_hash_update(&desc, sg, msgs[i].len); if (rc) return rc; + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } for (i = 0; i < iovcnt; i++) { @@ -377,29 +358,26 @@ int gss_digest_norm(struct crypto_hash *tfm, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, iovs[i].kiov_offset); - rc = crypto_hash_update(&desc, sg, iovs[i].kiov_len); + + ahash_request_set_crypt(req, sg, NULL, iovs[i].kiov_len); + rc = crypto_ahash_update(req); if (rc) return rc; } if (hdr) { - rc = gss_setup_sgtable(&sgt, sg, hdr, sizeof(*hdr)); - if (rc != 0) - return rc; - - rc = crypto_hash_update(&desc, sg, sizeof(*hdr)); + rc = gss_setup_sgtable(&sgt, sg, &(hdr->len), sizeof(hdr->len)); if (rc) return rc; + 
ahash_request_set_crypt(req, sg, NULL, sizeof(hdr->len)); + rc = crypto_ahash_update(req); gss_teardown_sgtable(&sgt); + if (rc) + return rc; } - rc = crypto_hash_final(&desc, cksum->data); - if (rc) - return rc; - - return gss_crypt_generic(kb->kb_tfm, 0, NULL, cksum->data, - cksum->data, cksum->len); + return rc; } int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) @@ -422,11 +400,10 @@ int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize) return 0; } -int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, int enc) { - struct blkcipher_desc desc; struct scatterlist src; struct scatterlist dst; struct sg_table sg_dst; @@ -434,12 +411,13 @@ int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, __u8 *buf; __u32 datalen = 0; int i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + ENTRY; buf = outobj->data; - desc.tfm = tfm; - desc.info = iv; - desc.flags = 0; + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); for (i = 0; i < inobj_cnt; i++) { LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); @@ -456,35 +434,30 @@ int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, RETURN(rc); } - if (iv) { - if (enc) - rc = crypto_blkcipher_encrypt_iv(&desc, &dst, - &src, - src.length); - else - rc = crypto_blkcipher_decrypt_iv(&desc, &dst, - &src, - src.length); - } else { - if (enc) - rc = crypto_blkcipher_encrypt(&desc, &dst, &src, - src.length); - else - rc = crypto_blkcipher_decrypt(&desc, &dst, &src, - src.length); - } + skcipher_request_set_crypt(req, &src, &dst, src.length, iv); + if (!iv) + skcipher_request_set_crypt_iv(req); + + if (enc) + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, + src.length); + else + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, + src.length); gss_teardown_sgtable(&sg_src); gss_teardown_sgtable(&sg_dst); if (rc) { CERROR("encrypt error %d\n", rc); + skcipher_request_zero(req); RETURN(rc); } datalen += inobjs[i].len; buf += inobjs[i].len; } + skcipher_request_zero(req); outobj->len = datalen; RETURN(0); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h index ad15cdedd66d5..7ed680a4c8430 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_crypto.h @@ -1,14 +1,79 @@ #ifndef PTLRPC_GSS_CRYPTO_H #define PTLRPC_GSS_CRYPTO_H +#include + #include "gss_internal.h" +#include + +/* + * linux v4.19-rc2-66-gb350bee5ea0f + * crypto: skcipher - Introduce crypto_sync_skcipher + * + * crypto_sync_skcipher will replace crypto_blkcipher so start using + * crypto_sync_skcipher and provide wrappers for older kernels + */ +#ifdef SYNC_SKCIPHER_REQUEST_ON_STACK + +#define crypto_skcipher_encrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_encrypt((desc)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, blocksize) \ + crypto_skcipher_decrypt((desc)) + +#define skcipher_request_set_crypt_iv(d) + +#else /* ! 
SYNC_SKCIPHER_REQUEST_ON_STACK */ + +#define crypto_sync_skcipher crypto_blkcipher + +#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ + struct blkcipher_desc __##name##_obj, *name = (void *)&__##name##_obj + +#define skcipher_request_set_sync_tfm(d, _tfm) \ + do { (d)->tfm = _tfm; } while (0) + +#define skcipher_request_set_callback(d, f, c, data) \ + do { (d)->flags = f; } while (0) + +#define skcipher_request_set_crypt(d, src, dst, cryptlen, iv) \ + do { (d)->info = iv; } while (0) + +#define skcipher_request_set_crypt_iv(d) \ + do { (d)->info = crypto_blkcipher_crt((d)->tfm)->iv; } while (0) + +#define crypto_sync_skcipher_blocksize(tfm) \ + crypto_blkcipher_blocksize((tfm)) + +#define crypto_sync_skcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey((tfm), (key), (keylen)) + +#define crypto_alloc_sync_skcipher(name, type, mask) \ + crypto_alloc_blkcipher((name), (type), (mask)) + +#define crypto_free_sync_skcipher(tfm) \ + crypto_free_blkcipher((tfm)) + +#define crypto_sync_skcipher_ivsize(tfm) \ + crypto_blkcipher_ivsize((tfm)) + +#define crypto_skcipher_encrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_encrypt_iv((desc), (dst), (src), (len)) + +#define crypto_skcipher_decrypt_iv(desc, dst, src, len) \ + crypto_blkcipher_decrypt_iv((desc), (dst), (src), (len)) + +#define skcipher_request_zero(req) /* nop */ + +#endif /* SYNC_SKCIPHER_REQUEST_ON_STACK */ + struct gss_keyblock { - rawobj_t kb_key; - struct crypto_blkcipher *kb_tfm; + rawobj_t kb_key; + struct crypto_sync_skcipher *kb_tfm; }; -int gss_keyblock_init(struct gss_keyblock *kb, char *alg_name, +int gss_keyblock_init(struct gss_keyblock *kb, const char *alg_name, const int alg_mode); void gss_keyblock_free(struct gss_keyblock *kb); int gss_keyblock_dup(struct gss_keyblock *new, struct gss_keyblock *kb); @@ -19,16 +84,15 @@ int gss_get_keyblock(char **ptr, const char *end, struct gss_keyblock *kb, int gss_setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, const void *buf, unsigned int buf_len); void gss_teardown_sgtable(struct sg_table *sgt); -int gss_crypt_generic(struct crypto_blkcipher *tfm, int decrypt, const void *iv, - const void *in, void *out, size_t length); -int gss_digest_hmac(struct crypto_hash *tfm, rawobj_t *key, rawobj_t *hdr, - int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum); -int gss_digest_norm(struct crypto_hash *tfm, struct gss_keyblock *kb, - rawobj_t *hdr, int msgcnt, rawobj_t *msgs, int iovcnt, - lnet_kiov_t *iovs, rawobj_t *cksum); +int gss_crypt_generic(struct crypto_sync_skcipher *tfm, int decrypt, + const void *iv, const void *in, void *out, size_t length); +int gss_digest_hash(struct ahash_request *req, rawobj_t *hdr, + int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs); +int gss_digest_hash_compat(struct ahash_request *req, + rawobj_t *hdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs); int gss_add_padding(rawobj_t *msg, int msg_buflen, int blocksize); -int gss_crypt_rawobjs(struct crypto_blkcipher *tfm, __u8 *iv, +int gss_crypt_rawobjs(struct crypto_sync_skcipher *tfm, __u8 *iv, int inobj_cnt, rawobj_t *inobjs, rawobj_t *outobj, int enc); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c index 3c4e63b992bee..23506f89d67c2 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_generic_token.c @@ -50,7 +50,6 @@ #include #include #include -#include 
#include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h index eb86ba1627103..c49a54021688f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_internal.h @@ -11,7 +11,8 @@ #ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ #define __PTLRPC_GSS_GSS_INTERNAL_H_ -#include +#include +#include #include /* @@ -72,17 +73,16 @@ int buffer_extract_bytes(const void **buf, __u32 *buflen, */ #define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ -static inline -unsigned long gss_round_ctx_expiry(unsigned long expiry, - unsigned long sec_flags) +static inline time64_t gss_round_ctx_expiry(time64_t expiry, + unsigned long sec_flags) { - if (sec_flags & PTLRPC_SEC_FL_REVERSE) - return expiry; + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; - if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) - return expiry - __TIMEOUT_DELTA; + if (ktime_get_real_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; - return expiry; + return expiry; } /* @@ -117,8 +117,9 @@ enum ptlrpc_gss_tgt { }; enum ptlrpc_gss_header_flags { - LUSTRE_GSS_PACK_BULK = 1, - LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, + LUSTRE_GSS_PACK_KCSUM = 4, }; static inline @@ -286,9 +287,9 @@ struct gss_cli_ctx { }; struct gss_cli_ctx_keyring { - struct gss_cli_ctx gck_base; - struct key *gck_key; - struct timer_list *gck_timer; + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list gck_timer; }; struct gss_sec { @@ -357,6 +358,14 @@ static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); } +#ifdef HAVE_CACHE_HASH_SPINLOCK +# define sunrpc_cache_lookup(c, i, h) sunrpc_cache_lookup_rcu((c), (i), (h)) +# define cache_read_lock(cdetail) spin_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) spin_unlock(&((cdetail)->hash_lock)) +#else /* ! 
HAVE_CACHE_HASH_SPINLOCK */ +# define cache_read_lock(cdetail) read_lock(&((cdetail)->hash_lock)) +# define cache_read_unlock(cdetail) read_unlock(&((cdetail)->hash_lock)) +#endif #define GSS_CTX_INIT_MAX_LEN (1024) @@ -509,6 +518,7 @@ void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); int __init gss_init_svc_upcall(void); void gss_exit_svc_upcall(void); +extern unsigned int krb5_allow_old_client_csum; /* lproc_gss.c */ void gss_stat_oos_record_cli(int behind); @@ -554,4 +564,13 @@ void __dbg_memdump(char *name, void *ptr, int size) OBD_FREE(buf, bufsize); } +static inline unsigned int ll_read_key_usage(struct key *key) +{ +#ifdef HAVE_KEY_USAGE_REFCOUNT + return refcount_read(&key->usage); +#else + return atomic_read(&key->usage); +#endif +} + #endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c index 81aad1ffea6e2..845269c8acec3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_keyring.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include @@ -60,6 +60,10 @@ #include "gss_internal.h" #include "gss_api.h" +#ifdef HAVE_GET_REQUEST_KEY_AUTH +#include +#endif + static struct ptlrpc_sec_policy gss_policy_keyring; static struct ptlrpc_ctx_ops gss_keyring_ctxops; static struct key_type gss_key_type; @@ -82,45 +86,6 @@ static int sec_install_rctx_kr(struct ptlrpc_sec *sec, * internal helpers * ****************************************/ -#define DUMP_PROCESS_KEYRINGS(tsk) \ -{ \ - CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): " \ - "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n", \ - tsk->comm, tsk->pid, tsk->uid, tsk->fsuid, \ - tsk->parent->comm, tsk->parent->pid, \ - tsk->parent->uid, tsk->parent->fsuid, \ - tsk->request_key_auth ? \ - tsk->request_key_auth->serial : 0, \ - key_cred(tsk)->thread_keyring ? \ - key_cred(tsk)->thread_keyring->serial : 0, \ - key_tgcred(tsk)->process_keyring ? \ - key_tgcred(tsk)->process_keyring->serial : 0, \ - key_tgcred(tsk)->session_keyring ? \ - key_tgcred(tsk)->session_keyring->serial : 0, \ - key_cred(tsk)->user->uid_keyring ? \ - key_cred(tsk)->user->uid_keyring->serial : 0, \ - key_cred(tsk)->user->session_keyring ? \ - key_cred(tsk)->user->session_keyring->serial : 0, \ - key_cred(tsk)->jit_keyring \ - ); \ -} - -#define DUMP_KEY(key) \ -{ \ - CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n", \ - key, key->serial, atomic_read(&key->usage), \ - key->uid, key->gid, \ - key->description ? 
key->description : "n/a" \ - ); \ -} - -#define key_cred(tsk) ((tsk)->cred) -#ifdef HAVE_CRED_TGCRED -#define key_tgcred(tsk) ((tsk)->cred->tgcred) -#else -#define key_tgcred(tsk) key_cred(tsk) -#endif - static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) { #ifdef HAVE_KEYRING_UPCALL_SERIALIZED @@ -140,10 +105,12 @@ static inline void key_revoke_locked(struct key *key) set_bit(KEY_FLAG_REVOKED, &key->flags); } -static void ctx_upcall_timeout_kr(unsigned long data) +static void ctx_upcall_timeout_kr(cfs_timer_cb_arg_t data) { - struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data; - struct key *key = ctx2gctx_keyring(ctx)->gck_key; + struct gss_cli_ctx_keyring *gctx_kr = cfs_from_timer(gctx_kr, + data, gck_timer); + struct ptlrpc_cli_ctx *ctx = &(gctx_kr->gck_base.gc_base); + struct key *key = gctx_kr->gck_key; CWARN("ctx %p, key %p\n", ctx, key); @@ -153,22 +120,18 @@ static void ctx_upcall_timeout_kr(unsigned long data) key_revoke_locked(key); } -static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout) +static void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, time64_t timeout) { struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); - struct timer_list *timer = gctx_kr->gck_timer; + struct timer_list *timer = &gctx_kr->gck_timer; LASSERT(timer); - CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout); - timeout = msecs_to_jiffies(timeout * MSEC_PER_SEC) + - cfs_time_current(); - - init_timer(timer); - timer->expires = timeout; - timer->data = (unsigned long ) ctx; - timer->function = ctx_upcall_timeout_kr; + CDEBUG(D_SEC, "ctx %p: start timer %llds\n", ctx, timeout); + cfs_timer_setup(timer, ctx_upcall_timeout_kr, + (unsigned long)gctx_kr, 0); + timer->expires = cfs_time_seconds(timeout) + jiffies; add_timer(timer); } @@ -179,47 +142,34 @@ static void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) { struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); - struct timer_list *timer = gctx_kr->gck_timer; - - if (timer == NULL) - return; + struct timer_list *timer = &gctx_kr->gck_timer; CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); - gctx_kr->gck_timer = NULL; - del_singleshot_timer_sync(timer); - - OBD_FREE_PTR(timer); } static struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, struct vfs_cred *vcred) { - struct ptlrpc_cli_ctx *ctx; - struct gss_cli_ctx_keyring *gctx_kr; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; - OBD_ALLOC_PTR(gctx_kr); - if (gctx_kr == NULL) - return NULL; + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; - OBD_ALLOC_PTR(gctx_kr->gck_timer); - if (gctx_kr->gck_timer == NULL) { - OBD_FREE_PTR(gctx_kr); - return NULL; - } - init_timer(gctx_kr->gck_timer); + cfs_timer_setup(&gctx_kr->gck_timer, NULL, 0, 0); - ctx = &gctx_kr->gck_base.gc_base; + ctx = &gctx_kr->gck_base.gc_base; - if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { - OBD_FREE_PTR(gctx_kr->gck_timer); - OBD_FREE_PTR(gctx_kr); - return NULL; - } + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } - ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT; + ctx->cc_expire = ktime_get_real_seconds() + KEYRING_UPCALL_TIMEOUT; clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); atomic_inc(&ctx->cc_refcount); /* for the caller */ @@ -241,7 +191,6 @@ static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) LASSERT(gctx_kr->gck_key == NULL); ctx_clear_timer_kr(ctx); - LASSERT(gctx_kr->gck_timer == NULL); if 
(gss_cli_ctx_fini_common(sec, ctx)) return; @@ -388,7 +337,7 @@ static int key_set_payload(struct key *key, unsigned int index, static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) { LASSERT(atomic_read(&ctx->cc_refcount) > 0); - LASSERT(atomic_read(&key->usage) > 0); + LASSERT(ll_read_key_usage(key) > 0); LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); LASSERT(!key_get_payload(key, 0)); @@ -561,17 +510,17 @@ void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *new_ctx, struct key *key) { - struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); - struct hlist_node __maybe_unused *hnode; - struct ptlrpc_cli_ctx *ctx; - cfs_time_t now; - ENTRY; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *hnode; + struct ptlrpc_cli_ctx *ctx; + time64_t now; - LASSERT(sec_is_reverse(sec)); + ENTRY; + LASSERT(sec_is_reverse(sec)); spin_lock(&sec->ps_lock); - now = cfs_time_current_sec(); + now = ktime_get_real_seconds(); /* set all existing ctxs short expiry */ cfs_hlist_for_each_entry(ctx, hnode, &gsec_kr->gsk_clist, cc_cache) { @@ -666,39 +615,104 @@ static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) return 0; } +/* + * kernel 5.3: commit 0f44e4d976f96c6439da0d6717238efa4b91196e + * keys: Move the user and user-session keyrings to the user_namespace + * + * When lookup_user_key is available use the kernel API rather than directly + * accessing the uid_keyring and session_keyring via the current process + * credentials. + */ +#ifdef HAVE_LOOKUP_USER_KEY + +/* from Linux security/keys/internal.h: */ +#ifndef KEY_LOOKUP_FOR_UNLINK +#define KEY_LOOKUP_FOR_UNLINK 0x04 +#endif + +static struct key *_user_key(key_serial_t id) +{ + key_ref_t ref; + + might_sleep(); + ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + if (IS_ERR(ref)) + return NULL; + return key_ref_to_ptr(ref); +} + +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_SESSION_KEYRING); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return _user_key(KEY_SPEC_USER_KEYRING); +} +#else +static inline struct key *get_user_session_keyring(const struct cred *cred) +{ + return key_get(cred->user->session_keyring); +} + +static inline struct key *get_user_keyring(const struct cred *cred) +{ + return key_get(cred->user->uid_keyring); +} +#endif + /* * unlink request key from it's ring, which is linked during request_key(). * sadly, we have to 'guess' which keyring it's linked to. * - * FIXME this code is fragile, depend on how request_key_link() is implemented. + * FIXME this code is fragile, it depends on how request_key() is implemented. 
*/ static void request_key_unlink(struct key *key) { - struct task_struct *tsk = current; - struct key *ring; + const struct cred *cred = current_cred(); + struct key *ring = NULL; - switch (key_cred(tsk)->jit_keyring) { + switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: +#ifdef HAVE_GET_REQUEST_KEY_AUTH + if (cred->request_key_auth) { + struct request_key_auth *rka; + struct key *authkey = cred->request_key_auth; + + down_read(&authkey->sem); + rka = get_request_key_auth(authkey); + if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) + ring = key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (ring) + break; + } +#endif + fallthrough; case KEY_REQKEY_DEFL_THREAD_KEYRING: - ring = key_get(key_cred(tsk)->thread_keyring); + ring = key_get(cred->thread_keyring); if (ring) break; + fallthrough; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ring = key_get(key_tgcred(tsk)->process_keyring); + ring = key_get(cred->process_keyring); if (ring) break; + fallthrough; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); - ring = key_get(rcu_dereference(key_tgcred(tsk) - ->session_keyring)); + ring = key_get(rcu_dereference(cred->session_keyring)); rcu_read_unlock(); if (ring) break; + fallthrough; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - ring = key_get(key_cred(tsk)->user->session_keyring); + ring = get_user_session_keyring(cred); break; case KEY_REQKEY_DEFL_USER_KEYRING: - ring = key_get(key_cred(tsk)->user->uid_keyring); + ring = get_user_keyring(cred); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: @@ -863,7 +877,7 @@ struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, if (likely(ctx)) { LASSERT(atomic_read(&ctx->cc_refcount) >= 1); LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); - LASSERT(atomic_read(&key->usage) >= 2); + LASSERT(ll_read_key_usage(key) >= 2); /* simply take a ref and return. it's upper layer's * responsibility to detect & replace dead ctx. */ @@ -1067,13 +1081,13 @@ void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) static int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) { - struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); - struct hlist_node __maybe_unused *pos, *next; - struct ptlrpc_cli_ctx *ctx; - struct gss_cli_ctx *gctx; - time_t now = cfs_time_current_sec(); - ENTRY; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node __maybe_unused *pos, *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time64_t now = ktime_get_real_seconds(); + ENTRY; spin_lock(&sec->ps_lock); cfs_hlist_for_each_entry_safe(ctx, pos, next, &gsec_kr->gsk_clist, cc_cache) { @@ -1093,9 +1107,8 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) snprintf(mech, sizeof(mech), "N/A"); mech[sizeof(mech) - 1] = '\0'; - seq_printf(seq, "%p: uid %u, ref %d, expire %lu(%+ld), fl %s, " - "seq %d, win %u, key %08x(ref %d), " - "hdl %#llx:%#llx, mech: %s\n", + seq_printf(seq, + "%p: uid %u, ref %d, expire %lld(%+lld), fl %s, seq %d, win %u, key %08x(ref %d), hdl %#llx:%#llx, mech: %s\n", ctx, ctx->cc_vcred.vc_uid, atomic_read(&ctx->cc_refcount), ctx->cc_expire, @@ -1104,7 +1117,7 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) atomic_read(&gctx->gc_seq), gctx->gc_win, key ? key->serial : 0, - key ? atomic_read(&key->usage) : 0, + key ? 
ll_read_key_usage(key) : 0, gss_handle_to_u64(&gctx->gc_handle), gss_handle_to_u64(&gctx->gc_svc_handle), mech); @@ -1121,8 +1134,16 @@ int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) static int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) { - /* upcall is already on the way */ - return 0; + /* upcall is already on the way */ + struct gss_cli_ctx *gctx = ctx ? ctx2gctx(ctx) : NULL; + + /* record latest sequence number in buddy svcctx */ + if (gctx && !rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + return gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32)atomic_read(&gctx->gc_seq)); + } + return 0; } static @@ -1325,15 +1346,15 @@ int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) * the session keyring is created upon upcall, and don't change all * the way until upcall finished, so rcu lock is not needed here. */ - LASSERT(key_tgcred(current)->session_keyring); + LASSERT(current_cred()->session_keyring); lockdep_off(); - rc = key_link(key_tgcred(current)->session_keyring, key); + rc = key_link(current_cred()->session_keyring, key); lockdep_on(); if (unlikely(rc)) { CERROR("failed to link key %08x to keyring %08x: %d\n", key->serial, - key_tgcred(current)->session_keyring->serial, rc); + current_cred()->session_keyring->serial, rc); RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h index 97ad55e3025c0..611160458d9b1 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5.h @@ -80,7 +80,7 @@ struct krb5_ctx { kc_cfx:1, kc_seed_init:1, kc_have_acceptor_subkey:1; - __s32 kc_endtime; + time64_t kc_endtime; __u8 kc_seed[16]; __u64 kc_seq_send; __u64 kc_seq_recv; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c index 000d7a8e87b47..bd3a94ba162b3 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include @@ -95,18 +94,20 @@ static struct krb5_enctype enctypes[] = { .ke_hash_size = 16, .ke_conf_size = 8, }, +#ifdef HAVE_DES3_SUPPORT [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ .ke_dispname = "des3-hmac-sha1", .ke_enc_name = "cbc(des3_ede)", - .ke_hash_name = "hmac(sha1)", + .ke_hash_name = "sha1", .ke_hash_size = 20, .ke_conf_size = 8, .ke_hash_hmac = 1, }, +#endif [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ .ke_dispname = "aes128-cts-hmac-sha1-96", .ke_enc_name = "cbc(aes)", - .ke_hash_name = "hmac(sha1)", + .ke_hash_name = "sha1", .ke_hash_size = 12, .ke_conf_size = 16, .ke_hash_hmac = 1, @@ -114,7 +115,7 @@ static struct krb5_enctype enctypes[] = { [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ .ke_dispname = "aes256-cts-hmac-sha1-96", .ke_enc_name = "cbc(aes)", - .ke_hash_name = "hmac(sha1)", + .ke_hash_name = "sha1", .ke_hash_size = 12, .ke_conf_size = 16, .ke_hash_hmac = 1, @@ -122,33 +123,31 @@ static struct krb5_enctype enctypes[] = { [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ .ke_dispname = "arcfour-hmac-md5", .ke_enc_name = "ecb(arc4)", - .ke_hash_name = "hmac(md5)", + .ke_hash_name = "md5", .ke_hash_size = 16, .ke_conf_size = 8, .ke_hash_hmac = 1, } }; -#define MAX_ENCTYPES sizeof(enctypes)/sizeof(struct krb5_enctype) - static const char * enctype2str(__u32 
enctype) { - if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname) - return enctypes[enctype].ke_dispname; + if (enctype < ARRAY_SIZE(enctypes) && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; - return "unknown"; + return "unknown"; } static int krb5_init_keys(struct krb5_ctx *kctx) { - struct krb5_enctype *ke; + struct krb5_enctype *ke; - if (kctx->kc_enctype >= MAX_ENCTYPES || - enctypes[kctx->kc_enctype].ke_hash_size == 0) { - CERROR("unsupported enctype %x\n", kctx->kc_enctype); - return -1; - } + if (kctx->kc_enctype >= ARRAY_SIZE(enctypes) || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } ke = &enctypes[kctx->kc_enctype]; @@ -197,8 +196,13 @@ __u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) gss_get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) goto out_err; - /* end time */ - if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + /* end time. While kc_endtime might be 64 bit, the krb5 API + * still uses 32 bits. To delay the 2038 problem, treat the incoming + * value as a u32, which gives us until 2106. See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) goto out_err; /* seq send */ @@ -262,8 +266,13 @@ __u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) { unsigned int tmp_uint, keysize; - /* end time */ - if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + /* end time. While kc_endtime might be 64 bit, the krb5 API + * still uses 32 bits. To delay the 2038 problem, treat the incoming + * value as a u32, which gives us until 2106. See the link for details: + * + * http://web.mit.edu/kerberos/www/krb5-current/doc/appdev/y2038.html + */ + if (gss_get_bytes(&p, end, &kctx->kc_endtime, sizeof(u32))) goto out_err; /* flags */ @@ -411,11 +420,11 @@ __u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, static __u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, - unsigned long *endtime) + time64_t *endtime) { struct krb5_ctx *kctx = gctx->internal_ctx_id; - *endtime = (unsigned long)((__u32) kctx->kc_endtime); + *endtime = kctx->kc_endtime; return GSS_S_COMPLETE; } @@ -438,41 +447,66 @@ __s32 krb5_make_checksum(__u32 enctype, struct krb5_header *khdr, int msgcnt, rawobj_t *msgs, int iovcnt, lnet_kiov_t *iovs, - rawobj_t *cksum) + rawobj_t *cksum, + digest_hash hash_func) { - struct krb5_enctype *ke = &enctypes[enctype]; - struct crypto_hash *tfm; - rawobj_t hdr; - __u32 code = GSS_S_FAILURE; - int rc; - - if (!(tfm = crypto_alloc_hash(ke->ke_hash_name, 0, 0))) { - CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name); - return GSS_S_FAILURE; - } + struct krb5_enctype *ke = &enctypes[enctype]; + struct ahash_request *req = NULL; + enum cfs_crypto_hash_alg hash_algo; + rawobj_t hdr; + int rc; + + hash_algo = cfs_crypto_hash_alg(ke->ke_hash_name); + + /* For the cbc(des) case we want md5 instead of hmac(md5) */ + if (strcmp(ke->ke_enc_name, "cbc(des)")) + req = cfs_crypto_hash_init(hash_algo, kb->kb_key.data, + kb->kb_key.len); + else + req = cfs_crypto_hash_init(hash_algo, NULL, 0); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + CERROR("failed to alloc hash %s : rc = %d\n", + ke->ke_hash_name, rc); + goto out_no_hash; + } - cksum->len = crypto_hash_digestsize(tfm); - OBD_ALLOC_LARGE(cksum->data, cksum->len); - if (!cksum->data) { - cksum->len = 0; - goto out_tfm; - } + cksum->len = 
cfs_crypto_hash_digestsize(hash_algo); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + rc = -ENOMEM; + goto out_free_hash; + } hdr.data = (__u8 *)khdr; hdr.len = sizeof(*khdr); - if (ke->ke_hash_hmac) - rc = gss_digest_hmac(tfm, &kb->kb_key, - &hdr, msgcnt, msgs, iovcnt, iovs, cksum); - else - rc = gss_digest_norm(tfm, kb, - &hdr, msgcnt, msgs, iovcnt, iovs, cksum); + if (!hash_func) { + rc = -EPROTO; + CERROR("hash function for %s undefined\n", + ke->ke_hash_name); + goto out_free_hash; + } + rc = hash_func(req, &hdr, msgcnt, msgs, iovcnt, iovs); + if (rc) + goto out_free_hash; + + if (!ke->ke_hash_hmac) { + LASSERT(kb->kb_tfm); + + cfs_crypto_hash_final(req, cksum->data, &cksum->len); + rc = gss_crypt_generic(kb->kb_tfm, 0, NULL, + cksum->data, cksum->data, + cksum->len); + goto out_no_hash; + } - if (rc == 0) - code = GSS_S_COMPLETE; -out_tfm: - crypto_free_hash(tfm); - return code; +out_free_hash: + if (req) + cfs_crypto_hash_final(req, cksum->data, &cksum->len); +out_no_hash: + return rc ? GSS_S_FAILURE : GSS_S_COMPLETE; } static void fill_krb5_header(struct krb5_ctx *kctx, @@ -545,118 +579,118 @@ static __u32 verify_krb5_header(struct krb5_ctx *kctx, static __u32 gss_get_mic_kerberos(struct gss_ctx *gctx, - int msgcnt, - rawobj_t *msgs, - int iovcnt, - lnet_kiov_t *iovs, - rawobj_t *token) + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - rawobj_t cksum = RAWOBJ_EMPTY; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 0); - - /* checksum */ - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) - return GSS_S_FAILURE; + fill_krb5_header(kctx, khdr, 0); - LASSERT(cksum.len >= ke->ke_hash_size); - LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); - memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, khdr, + msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); - token->len = sizeof(*khdr) + ke->ke_hash_size; - rawobj_free(&cksum); - return GSS_S_COMPLETE; + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; } static __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, - int msgcnt, - rawobj_t *msgs, - int iovcnt, - lnet_kiov_t *iovs, - rawobj_t *token) + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - rawobj_t cksum = RAWOBJ_EMPTY; - __u32 major; - - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + struct krb5_ctx *kctx = gctx->internal_ctx_id; + 
struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 0); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } - - if (token->len < sizeof(*khdr) + ke->ke_hash_size) { - CERROR("short signature: %u, require %d\n", - token->len, (int) sizeof(*khdr) + ke->ke_hash_size); - return GSS_S_FAILURE; - } + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + goto out; + } - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) { - CERROR("failed to make checksum\n"); - return GSS_S_FAILURE; - } + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + GOTO(out, major = GSS_S_FAILURE); + } - LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - rawobj_free(&cksum); - return GSS_S_BAD_SIG; - } + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); - rawobj_free(&cksum); - return GSS_S_COMPLETE; + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + GOTO(out_free_cksum, major = GSS_S_BAD_SIG); + } + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); +out: + return major; } /* * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. 
*/ static -int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, - struct krb5_header *khdr, - char *confounder, - struct ptlrpc_bulk_desc *desc, - rawobj_t *cipher, - int adj_nob) +int krb5_encrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) { - struct blkcipher_desc ciph_desc; - __u8 local_iv[16] = {0}; - struct scatterlist src, dst; - struct sg_table sg_src, sg_dst; - int blocksize, i, rc, nob = 0; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int blocksize, i, rc, nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_iov_count); LASSERT(GET_ENC_KIOV(desc)); - blocksize = crypto_blkcipher_blocksize(tfm); - LASSERT(blocksize > 1); - LASSERT(cipher->len == blocksize + sizeof(*khdr)); - - ciph_desc.tfm = tfm; - ciph_desc.info = local_iv; - ciph_desc.flags = 0; + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); - /* encrypt confounder */ + /* encrypt confounder */ rc = gss_setup_sgtable(&sg_src, &src, confounder, blocksize); if (rc != 0) return rc; @@ -666,20 +700,24 @@ int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, gss_teardown_sgtable(&sg_src); return rc; } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); - rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, - sg_src.sgl, blocksize); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); - if (rc) { - CERROR("error to encrypt confounder: %d\n", rc); - return rc; - } + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } - /* encrypt clear pages */ - for (i = 0; i < desc->bd_iov_count; i++) { + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { sg_init_table(&src, 1); sg_set_page(&src, BD_GET_KIOV(desc, i).kiov_page, (BD_GET_KIOV(desc, i).kiov_len + @@ -695,28 +733,36 @@ int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, BD_GET_ENC_KIOV(desc, i).kiov_offset = dst.offset; BD_GET_ENC_KIOV(desc, i).kiov_len = dst.length; - rc = crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, - src.length); - if (rc) { - CERROR("error to encrypt page: %d\n", rc); - return rc; - } - } + skcipher_request_set_crypt(req, &src, &dst, + src.length, local_iv); + rc = crypto_skcipher_encrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } + } - /* encrypt krb5 header */ + /* encrypt krb5 header */ rc = gss_setup_sgtable(&sg_src, &src, khdr, sizeof(*khdr)); - if (rc != 0) + if (rc != 0) { + skcipher_request_zero(req); return rc; + } rc = gss_setup_sgtable(&sg_dst, &dst, cipher->data + blocksize, sizeof(*khdr)); if (rc != 0) { gss_teardown_sgtable(&sg_src); + skcipher_request_zero(req); return rc; } - rc = crypto_blkcipher_encrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, - sizeof(*khdr)); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + sizeof(*khdr), local_iv); + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); + skcipher_request_zero(req); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); @@ -751,39 +797,35 
@@ int krb5_encrypt_bulk(struct crypto_blkcipher *tfm, * should have been done by prep_bulk(). */ static -int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, - struct krb5_header *khdr, - struct ptlrpc_bulk_desc *desc, - rawobj_t *cipher, - rawobj_t *plain, - int adj_nob) +int krb5_decrypt_bulk(struct crypto_sync_skcipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) { - struct blkcipher_desc ciph_desc; - __u8 local_iv[16] = {0}; - struct scatterlist src, dst; - struct sg_table sg_src, sg_dst; - int ct_nob = 0, pt_nob = 0; - int blocksize, i, rc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + struct sg_table sg_src, sg_dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_iov_count); LASSERT(GET_ENC_KIOV(desc)); - LASSERT(desc->bd_nob_transferred); - - blocksize = crypto_blkcipher_blocksize(tfm); - LASSERT(blocksize > 1); - LASSERT(cipher->len == blocksize + sizeof(*khdr)); + LASSERT(desc->bd_nob_transferred); - ciph_desc.tfm = tfm; - ciph_desc.info = local_iv; - ciph_desc.flags = 0; + blocksize = crypto_sync_skcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); - if (desc->bd_nob_transferred % blocksize) { - CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); - return -EPROTO; - } + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } - /* decrypt head (confounder) */ + /* decrypt head (confounder) */ rc = gss_setup_sgtable(&sg_src, &src, cipher->data, blocksize); if (rc != 0) return rc; @@ -794,27 +836,31 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, return rc; } - rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, - sg_src.sgl, blocksize); + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + blocksize, local_iv); + + rc = crypto_skcipher_encrypt_iv(req, sg_dst.sgl, sg_src.sgl, blocksize); gss_teardown_sgtable(&sg_dst); gss_teardown_sgtable(&sg_src); - if (rc) { - CERROR("error to decrypt confounder: %d\n", rc); - return rc; - } + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + skcipher_request_zero(req); + return rc; + } for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; i++) { - if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize - != 0 || - BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize - != 0) { + if (BD_GET_ENC_KIOV(desc, i).kiov_offset % blocksize != 0 || + BD_GET_ENC_KIOV(desc, i).kiov_len % blocksize != 0) { CERROR("page %d: odd offset %u len %u, blocksize %d\n", i, BD_GET_ENC_KIOV(desc, i).kiov_offset, BD_GET_ENC_KIOV(desc, i).kiov_len, blocksize); + skcipher_request_zero(req); return -EFAULT; } @@ -851,12 +897,14 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, sg_assign_page(&dst, BD_GET_KIOV(desc, i).kiov_page); - rc = crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, - src.length); - if (rc) { - CERROR("error to decrypt page: %d\n", rc); - return rc; - } + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, &dst, &src, src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + skcipher_request_zero(req); + return rc; + } if (BD_GET_KIOV(desc, i).kiov_len % blocksize 
!= 0) { memcpy(page_address(BD_GET_KIOV(desc, i).kiov_page) + @@ -871,24 +919,26 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, pt_nob += BD_GET_KIOV(desc, i).kiov_len; } - if (unlikely(ct_nob != desc->bd_nob_transferred)) { - CERROR("%d cipher text transferred but only %d decrypted\n", - desc->bd_nob_transferred, ct_nob); - return -EFAULT; - } + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + skcipher_request_zero(req); + return -EFAULT; + } - if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { - CERROR("%d plain text expected but only %d received\n", - desc->bd_nob, pt_nob); - return -EFAULT; - } + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + skcipher_request_zero(req); + return -EFAULT; + } /* if needed, clear up the rest unused iovs */ if (adj_nob) while (i < desc->bd_iov_count) BD_GET_KIOV(desc, i++).kiov_len = 0; - /* decrypt tail (krb5 header) */ + /* decrypt tail (krb5 header) */ rc = gss_setup_sgtable(&sg_src, &src, cipher->data + blocksize, sizeof(*khdr)); if (rc != 0) @@ -901,166 +951,170 @@ int krb5_decrypt_bulk(struct crypto_blkcipher *tfm, return rc; } - rc = crypto_blkcipher_decrypt_iv(&ciph_desc, sg_dst.sgl, sg_src.sgl, - sizeof(*khdr)); - + skcipher_request_set_crypt(req, sg_src.sgl, sg_dst.sgl, + src.length, local_iv); + rc = crypto_skcipher_decrypt_iv(req, sg_dst.sgl, sg_src.sgl, + sizeof(*khdr)); gss_teardown_sgtable(&sg_src); gss_teardown_sgtable(&sg_dst); - if (rc) { - CERROR("error to decrypt tail: %d\n", rc); - return rc; - } + skcipher_request_zero(req); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } - if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { - CERROR("krb5 header doesn't match\n"); - return -EACCES; - } + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } - return 0; + return 0; } static __u32 gss_wrap_kerberos(struct gss_ctx *gctx, - rawobj_t *gsshdr, - rawobj_t *msg, - int msg_buflen, - rawobj_t *token) + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[3], cipher; - __u8 conf[GSS_MAX_CIPHER_BLOCK]; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; __u8 local_iv[16] = {0}; - int rc = 0; - - LASSERT(ke); - LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); - LASSERT(kctx->kc_keye.kb_tfm == NULL || - ke->ke_conf_size >= - crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm)); - - /* - * final token format: - * --------------------------------------------------- - * | krb5 header | cipher text | checksum (16 bytes) | - * --------------------------------------------------- - */ - - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + u32 major; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * 
--------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 1); + fill_krb5_header(kctx, khdr, 1); - /* generate confounder */ - cfs_get_random_bytes(conf, ke->ke_conf_size); + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); - /* get encryption blocksize. note kc_keye might not associated with - * a tfm, currently only for arcfour-hmac */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } - LASSERT(blocksize <= ke->ke_conf_size); + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize( + kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); /* padding the message */ if (gss_add_padding(msg, msg_buflen, blocksize)) return GSS_S_FAILURE; - /* - * clear text layout for checksum: - * ------------------------------------------------------ - * | confounder | gss header | clear msgs | krb5 header | - * ------------------------------------------------------ - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; - data_desc[1].data = gsshdr->data; - data_desc[1].len = gsshdr->len; - data_desc[2].data = msg->data; - data_desc[2].len = msg->len; - - /* compute checksum */ - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, data_desc, 0, NULL, &cksum)) - return GSS_S_FAILURE; - LASSERT(cksum.len >= ke->ke_hash_size); - - /* - * clear text layout for encryption: - * ----------------------------------------- - * | confounder | clear msgs | krb5 header | - * ----------------------------------------- - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; - data_desc[1].data = msg->data; - data_desc[1].len = msg->len; - data_desc[2].data = (__u8 *) khdr; - data_desc[2].len = sizeof(*khdr); - - /* cipher text will be directly inplace */ + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum, + gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ cipher.data = (__u8 *)(khdr + 1); - 
cipher.len = token->len - sizeof(*khdr); - LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - rawobj_t arc4_keye; - struct crypto_blkcipher *arc4_tfm; + rawobj_t arc4_keye = RAWOBJ_EMPTY; + struct crypto_sync_skcipher *arc4_tfm; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { CERROR("failed to obtain arc4 enc key\n"); - GOTO(arc4_out, rc = -EACCES); + GOTO(arc4_out_key, rc = -EACCES); } - arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); if (IS_ERR(arc4_tfm)) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); GOTO(arc4_out_key, rc = -EACCES); } - if (crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, - arc4_keye.len)) { - CERROR("failed to set arc4 key, len %d\n", - arc4_keye.len); - GOTO(arc4_out_tfm, rc = -EACCES); - } + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } rc = gss_crypt_rawobjs(arc4_tfm, NULL, 3, data_desc, &cipher, 1); arc4_out_tfm: - crypto_free_blkcipher(arc4_tfm); + crypto_free_sync_skcipher(arc4_tfm); arc4_out_key: - rawobj_free(&arc4_keye); -arc4_out: - do {} while(0); /* just to avoid compile warning */ - } else { + rawobj_free(&arc4_keye); + } else { rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 3, data_desc, &cipher, 1); - } - - if (rc != 0) { - rawobj_free(&cksum); - return GSS_S_FAILURE; - } - - /* fill in checksum */ - LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); - memcpy((char *)(khdr + 1) + cipher.len, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - rawobj_free(&cksum); + } - /* final token length */ - token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; - return GSS_S_COMPLETE; + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; } static @@ -1075,7 +1129,7 @@ __u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, LASSERT(GET_ENC_KIOV(desc)); LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); for (i = 0; i < desc->bd_iov_count; i++) { LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page); @@ -1101,375 +1155,377 @@ __u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, static __u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - rawobj_t *token, int adj_nob) + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[1], cipher; - __u8 conf[GSS_MAX_CIPHER_BLOCK]; - int rc = 0; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; 
+ int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + u32 major; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(ke); - LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); - - /* - * final token format: - * -------------------------------------------------- - * | krb5 header | head/tail cipher text | checksum | - * -------------------------------------------------- - */ - - /* fill krb5 header */ - LASSERT(token->len >= sizeof(*khdr)); + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *)token->data; - fill_krb5_header(kctx, khdr, 1); + fill_krb5_header(kctx, khdr, 1); - /* generate confounder */ - cfs_get_random_bytes(conf, ke->ke_conf_size); + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); - /* get encryption blocksize. note kc_keye might not associated with - * a tfm, currently only for arcfour-hmac */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } - /* - * we assume the size of krb5_header (16 bytes) must be n * blocksize. - * the bulk token size would be exactly (sizeof(krb5_header) + - * blocksize + sizeof(krb5_header) + hashsize) - */ - LASSERT(blocksize <= ke->ke_conf_size); - LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); - LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16); - - /* - * clear text layout for checksum: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. 
+ * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksz <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksz + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; /* compute checksum */ if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, khdr, 1, data_desc, desc->bd_iov_count, GET_KIOV(desc), - &cksum)) - return GSS_S_FAILURE; + &cksum, gctx->hash_func)) + GOTO(out_free_cksum, major = GSS_S_FAILURE); LASSERT(cksum.len >= ke->ke_hash_size); - /* - * clear text layout for encryption: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - * | | | - * ---------- (cipher pages) | - * result token: | | - * ------------------------------------------- - * | krb5 header | cipher text | cipher text | - * ------------------------------------------- - */ - data_desc[0].data = conf; - data_desc[0].len = ke->ke_conf_size; + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; cipher.data = (__u8 *)(khdr + 1); - cipher.len = blocksize + sizeof(*khdr); - - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LBUG(); - rc = 0; - } else { - rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, - conf, desc, &cipher, adj_nob); - } + cipher.len = blocksz + sizeof(*khdr); - if (rc != 0) { - rawobj_free(&cksum); - return GSS_S_FAILURE; - } - - /* fill in checksum */ - LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); - memcpy((char *)(khdr + 1) + cipher.len, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size); - rawobj_free(&cksum); - - /* final token length */ - token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; - return GSS_S_COMPLETE; + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + if (rc) + GOTO(out_free_cksum, major = GSS_S_FAILURE); + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + major = GSS_S_COMPLETE; +out_free_cksum: + rawobj_free(&cksum); + return major; } static __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, - rawobj_t *gsshdr, - rawobj_t *token, - rawobj_t *msg) + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - unsigned char *tmpbuf; - int blocksize, bodysize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t cipher_in, plain_out; - rawobj_t hash_objs[3]; - int rc = 0; - __u32 major; 
+ struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksz, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; __u8 local_iv[16] = {0}; - LASSERT(ke); + LASSERT(ke); - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 1); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - /* block size */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } - /* expected token layout: - * ---------------------------------------- - * | krb5 header | cipher text | checksum | - * ---------------------------------------- - */ - bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; - if (bodysize % blocksize) { - CERROR("odd bodysize %d\n", bodysize); - return GSS_S_DEFECTIVE_TOKEN; - } + if (bodysize % blocksz) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } - if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { - CERROR("incomplete token: bodysize %d\n", bodysize); - return GSS_S_DEFECTIVE_TOKEN; - } + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } - if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { - CERROR("buffer too small: %u, require %d\n", - msg->len, bodysize - ke->ke_conf_size); - return GSS_S_FAILURE; - } + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } - /* decrypting */ - OBD_ALLOC_LARGE(tmpbuf, bodysize); - if (!tmpbuf) - return GSS_S_FAILURE; + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; - major = GSS_S_FAILURE; + major = GSS_S_FAILURE; cipher_in.data = (__u8 *)(khdr + 1); - cipher_in.len = bodysize; - plain_out.data = tmpbuf; - plain_out.len = bodysize; + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { rawobj_t arc4_keye; - struct crypto_blkcipher *arc4_tfm; + struct crypto_sync_skcipher *arc4_tfm; cksum.data = token->data + token->len - ke->ke_hash_size; cksum.len = ke->ke_hash_size; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye, + gctx->hash_func)) { CERROR("failed 
to obtain arc4 enc key\n"); GOTO(arc4_out, rc = -EACCES); } - arc4_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + arc4_tfm = crypto_alloc_sync_skcipher("ecb(arc4)", 0, 0); if (IS_ERR(arc4_tfm)) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); GOTO(arc4_out_key, rc = -EACCES); } - if (crypto_blkcipher_setkey(arc4_tfm, - arc4_keye.data, arc4_keye.len)) { - CERROR("failed to set arc4 key, len %d\n", - arc4_keye.len); - GOTO(arc4_out_tfm, rc = -EACCES); - } + if (crypto_sync_skcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } rc = gss_crypt_rawobjs(arc4_tfm, NULL, 1, &cipher_in, &plain_out, 0); arc4_out_tfm: - crypto_free_blkcipher(arc4_tfm); + crypto_free_sync_skcipher(arc4_tfm); arc4_out_key: - rawobj_free(&arc4_keye); + rawobj_free(&arc4_keye); arc4_out: - cksum = RAWOBJ_EMPTY; - } else { + cksum = RAWOBJ_EMPTY; + } else { rc = gss_crypt_rawobjs(kctx->kc_keye.kb_tfm, local_iv, 1, &cipher_in, &plain_out, 0); - } + } - if (rc != 0) { - CERROR("error decrypt\n"); - goto out_free; - } - LASSERT(plain_out.len == bodysize); - - /* expected clear text layout: - * ----------------------------------------- - * | confounder | clear msgs | krb5 header | - * ----------------------------------------- - */ - - /* verify krb5 header in token is not modified */ - if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), - sizeof(*khdr))) { - CERROR("decrypted krb5 header mismatch\n"); - goto out_free; - } + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); - /* verify checksum, compose clear text as layout: - * ------------------------------------------------------ - * | confounder | gss header | clear msgs | krb5 header | - * ------------------------------------------------------ - */ - hash_objs[0].len = ke->ke_conf_size; - hash_objs[0].data = plain_out.data; - hash_objs[1].len = gsshdr->len; - hash_objs[1].data = gsshdr->data; - hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); - hash_objs[2].data = plain_out.data + ke->ke_conf_size; - if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, hash_objs, 0, NULL, &cksum)) - goto out_free; - - LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp((char *)(khdr + 1) + bodysize, - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - goto out_free; - } + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum, + gctx->hash_func)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + 
if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } - msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); - memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); - major = GSS_S_COMPLETE; + major = GSS_S_COMPLETE; out_free: - OBD_FREE_LARGE(tmpbuf, bodysize); - rawobj_free(&cksum); - return major; + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; } static __u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - rawobj_t *token, int adj_nob) + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) { - struct krb5_ctx *kctx = gctx->internal_ctx_id; - struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; - struct krb5_header *khdr; - int blocksize; - rawobj_t cksum = RAWOBJ_EMPTY; - rawobj_t cipher, plain; - rawobj_t data_desc[1]; - int rc; - __u32 major; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksz; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(ke); + LASSERT(ke); - if (token->len < sizeof(*khdr)) { - CERROR("short signature: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } khdr = (struct krb5_header *)token->data; - major = verify_krb5_header(kctx, khdr, 1); - if (major != GSS_S_COMPLETE) { - CERROR("bad krb5 header\n"); - return major; - } - - /* block size */ - if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { - LASSERT(kctx->kc_keye.kb_tfm == NULL); - blocksize = 1; - LBUG(); - } else { - LASSERT(kctx->kc_keye.kb_tfm); - blocksize = crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); - } - LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); - - /* - * token format is expected as: - * ----------------------------------------------- - * | krb5 header | head/tail cipher text | cksum | - * ----------------------------------------------- - */ - if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) + - ke->ke_hash_size) { - CERROR("short token size: %u\n", token->len); - return GSS_S_DEFECTIVE_TOKEN; - } + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } - cipher.data = (__u8 *) (khdr + 1); - cipher.len = blocksize + sizeof(*khdr); - plain.data = cipher.data; - plain.len = cipher.len; + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksz = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksz = crypto_sync_skcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksz && sizeof(*khdr) % blocksz == 0); - rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, - desc, &cipher, &plain, adj_nob); - if (rc) - return GSS_S_DEFECTIVE_TOKEN; + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksz + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return 
GSS_S_DEFECTIVE_TOKEN; + } - /* - * verify checksum, compose clear text as layout: - * ------------------------------------------ - * | confounder | clear pages | krb5 header | - * ------------------------------------------ - */ - data_desc[0].data = plain.data; - data_desc[0].len = blocksize; + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksz + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksz; if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, khdr, 1, data_desc, desc->bd_iov_count, GET_KIOV(desc), - &cksum)) + &cksum, gctx->hash_func)) return GSS_S_FAILURE; LASSERT(cksum.len >= ke->ke_hash_size); - if (memcmp(plain.data + blocksize + sizeof(*khdr), - cksum.data + cksum.len - ke->ke_hash_size, - ke->ke_hash_size)) { - CERROR("checksum mismatch\n"); - rawobj_free(&cksum); - return GSS_S_BAD_SIG; - } + if (memcmp(plain.data + blocksz + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } - rawobj_free(&cksum); - return GSS_S_COMPLETE; + rawobj_free(&cksum); + return GSS_S_COMPLETE; } int gss_display_kerberos(struct gss_ctx *ctx, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c index be66ffde266d4..3ee125f1070bf 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_mech_switch.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -60,6 +59,7 @@ #include "gss_err.h" #include "gss_internal.h" #include "gss_api.h" +#include "gss_crypto.h" static struct list_head registered_mechs = LIST_HEAD_INIT(registered_mechs); static DEFINE_SPINLOCK(registered_mechs_lock); @@ -69,7 +69,7 @@ int lgss_mech_register(struct gss_api_mech *gm) spin_lock(®istered_mechs_lock); list_add(&gm->gm_list, ®istered_mechs); spin_unlock(®istered_mechs_lock); - CWARN("Register %s mechanism\n", gm->gm_name); + CDEBUG(D_SEC, "register %s mechanism\n", gm->gm_name); return 0; } @@ -78,7 +78,7 @@ void lgss_mech_unregister(struct gss_api_mech *gm) spin_lock(®istered_mechs_lock); list_del(&gm->gm_list); spin_unlock(®istered_mechs_lock); - CWARN("Unregister %s mechanism\n", gm->gm_name); + CDEBUG(D_SEC, "Unregister %s mechanism\n", gm->gm_name); } @@ -148,50 +148,52 @@ __u32 lgss_import_sec_context(rawobj_t *input_token, struct gss_api_mech *mech, struct gss_ctx **ctx_id) { - OBD_ALLOC_PTR(*ctx_id); - if (*ctx_id == NULL) - return GSS_S_FAILURE; + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; - (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->mech_type = lgss_mech_get(mech); + (*ctx_id)->hash_func = gss_digest_hash; - LASSERT(mech); - LASSERT(mech->gm_ops); - LASSERT(mech->gm_ops->gss_import_sec_context); - return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); } __u32 
lgss_copy_reverse_context(struct gss_ctx *ctx_id, - struct gss_ctx **ctx_id_new) + struct gss_ctx **ctx_id_new) { - struct gss_api_mech *mech = ctx_id->mech_type; - __u32 major; + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; - LASSERT(mech); + LASSERT(mech); - OBD_ALLOC_PTR(*ctx_id_new); - if (*ctx_id_new == NULL) - return GSS_S_FAILURE; + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; - (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + (*ctx_id_new)->hash_func = ctx_id->hash_func; - LASSERT(mech); - LASSERT(mech->gm_ops); - LASSERT(mech->gm_ops->gss_copy_reverse_context); + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); - major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); - if (major != GSS_S_COMPLETE) { - lgss_mech_put(mech); - OBD_FREE_PTR(*ctx_id_new); - *ctx_id_new = NULL; - } - return major; + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; } /* * this interface is much simplified, currently we only need endtime. */ __u32 lgss_inquire_context(struct gss_ctx *context_handle, - unsigned long *endtime) + time64_t *endtime) { LASSERT(context_handle); LASSERT(context_handle->mech_type); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c index fddd3ed3443c1..1e946f8ba2aff 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_null_mech.c @@ -92,10 +92,10 @@ __u32 gss_copy_reverse_context_null(struct gss_ctx *gss_context_old, static __u32 gss_inquire_context_null(struct gss_ctx *gss_context, - unsigned long *endtime) + time64_t *endtime) { /* quick timeout for testing purposes */ - *endtime = cfs_time_current_sec() + 60; + *endtime = ktime_get_real_seconds() + 60; return GSS_S_COMPLETE; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c index 016d455040972..5e1e7caa1aae6 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_pipefs.c @@ -62,7 +62,7 @@ struct rpc_clnt; /* for rpc_pipefs */ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c index fd1b071d6f549..69e92bcb28311 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_sk_mech.c @@ -39,7 +39,6 @@ #include #include #include -#include #include "gss_err.h" #include "gss_crypto.h" @@ -62,14 +61,14 @@ #define SK_IV_REV_START (1ULL << 63) struct sk_ctx { - __u16 sc_hmac; - __u16 sc_crypt; - __u32 sc_expire; - __u32 sc_host_random; - __u32 sc_peer_random; - atomic64_t sc_iv; - rawobj_t sc_hmac_key; - struct gss_keyblock sc_session_kb; + enum cfs_crypto_crypt_alg sc_crypt; + enum cfs_crypto_hash_alg sc_hmac; + __u32 sc_expire; + __u32 sc_host_random; + __u32 sc_peer_random; + atomic64_t sc_iv; + rawobj_t sc_hmac_key; + struct gss_keyblock sc_session_kb; }; struct sk_hdr { @@ -88,24 +87,6 @@ struct sk_wire { rawobj_t skw_hmac; }; -static struct sk_crypt_type sk_crypt_types[] = { - [SK_CRYPT_AES256_CTR] = { - .sct_name = 
"ctr(aes)", - .sct_bytes = 32, - }, -}; - -static struct sk_hmac_type sk_hmac_types[] = { - [SK_HMAC_SHA256] = { - .sht_name = "hmac(sha256)", - .sht_bytes = 32, - }, - [SK_HMAC_SHA512] = { - .sht_name = "hmac(sha512)", - .sht_bytes = 64, - }, -}; - static inline unsigned long sk_block_mask(unsigned long len, int blocksize) { return (len + blocksize - 1) & (~(blocksize - 1)); @@ -148,22 +129,18 @@ void sk_construct_rfc3686_iv(__u8 *iv, __u32 nonce, __u64 partial_iv) memcpy(iv, &ctr, sizeof(ctr)); } -static int sk_init_keys(struct sk_ctx *skc) -{ - return gss_keyblock_init(&skc->sc_session_kb, - sk_crypt_types[skc->sc_crypt].sct_name, 0); -} - static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) { char *ptr = inbuf->data; char *end = inbuf->data + inbuf->len; - __u32 tmp; + char sk_hmac[CRYPTO_MAX_ALG_NAME]; + char sk_crypt[CRYPTO_MAX_ALG_NAME]; + u32 tmp; /* see sk_serialize_kctx() for format from userspace side */ /* 1. Version */ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { - CERROR("Failed to read shared key interface version"); + CERROR("Failed to read shared key interface version\n"); return -1; } if (tmp != SK_INTERFACE_VERSION) { @@ -172,49 +149,55 @@ static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) } /* 2. HMAC type */ - if (gss_get_bytes(&ptr, end, &skc->sc_hmac, sizeof(skc->sc_hmac))) { - CERROR("Failed to read HMAC algorithm type"); + if (gss_get_bytes(&ptr, end, &sk_hmac, sizeof(sk_hmac))) { + CERROR("Failed to read HMAC algorithm type\n"); return -1; } - if (skc->sc_hmac <= SK_HMAC_EMPTY || skc->sc_hmac >= SK_HMAC_MAX) { - CERROR("Invalid hmac type: %d\n", skc->sc_hmac); + + skc->sc_hmac = cfs_crypto_hash_alg(sk_hmac); + if (skc->sc_hmac != CFS_HASH_ALG_NULL && + skc->sc_hmac != CFS_HASH_ALG_SHA256 && + skc->sc_hmac != CFS_HASH_ALG_SHA512) { + CERROR("Invalid hmac type: %s\n", sk_hmac); return -1; } /* 3. crypt type */ - if (gss_get_bytes(&ptr, end, &skc->sc_crypt, sizeof(skc->sc_crypt))) { - CERROR("Failed to read crypt algorithm type"); + if (gss_get_bytes(&ptr, end, &sk_crypt, sizeof(sk_crypt))) { + CERROR("Failed to read crypt algorithm type\n"); return -1; } - if (skc->sc_crypt <= SK_CRYPT_EMPTY || skc->sc_crypt >= SK_CRYPT_MAX) { - CERROR("Invalid crypt type: %d\n", skc->sc_crypt); + + skc->sc_crypt = cfs_crypto_crypt_alg(sk_crypt); + if (skc->sc_crypt == CFS_CRYPT_ALG_UNKNOWN) { + CERROR("Invalid crypt type: %s\n", sk_crypt); return -1; } /* 4. expiration time */ if (gss_get_bytes(&ptr, end, &tmp, sizeof(tmp))) { - CERROR("Failed to read context expiration time"); + CERROR("Failed to read context expiration time\n"); return -1; } - skc->sc_expire = tmp + cfs_time_current_sec(); + skc->sc_expire = tmp + ktime_get_real_seconds(); /* 5. host random is used as nonce for encryption */ if (gss_get_bytes(&ptr, end, &skc->sc_host_random, sizeof(skc->sc_host_random))) { - CERROR("Failed to read host random "); + CERROR("Failed to read host random\n"); return -1; } /* 6. peer random is used as nonce for decryption */ if (gss_get_bytes(&ptr, end, &skc->sc_peer_random, sizeof(skc->sc_peer_random))) { - CERROR("Failed to read peer random "); + CERROR("Failed to read peer random\n"); return -1; } /* 7. HMAC key */ if (gss_get_rawobj(&ptr, end, &skc->sc_hmac_key)) { - CERROR("Failed to read HMAC key"); + CERROR("Failed to read HMAC key\n"); return -1; } if (skc->sc_hmac_key.len <= SK_MIN_SIZE) { @@ -225,7 +208,7 @@ static int sk_fill_context(rawobj_t *inbuf, struct sk_ctx *skc) /* 8. 
Session key, can be empty if not using privacy mode */ if (gss_get_rawobj(&ptr, end, &skc->sc_session_kb.kb_key)) { - CERROR("Failed to read session key"); + CERROR("Failed to read session key\n"); return -1; } @@ -263,13 +246,14 @@ __u32 gss_import_sec_context_sk(rawobj_t *inbuf, struct gss_ctx *gss_context) /* Only privacy mode needs to initialize keys */ if (skc->sc_session_kb.kb_key.len > 0) { privacy = true; - if (sk_init_keys(skc)) + if (gss_keyblock_init(&skc->sc_session_kb, + cfs_crypto_crypt_name(skc->sc_crypt), 0)) goto out_err; } gss_context->internal_ctx_id = skc; CDEBUG(D_SEC, "successfully imported sk%s context\n", - privacy ? "pi" : "i"); + privacy ? " (with privacy)" : ""); return GSS_S_COMPLETE; @@ -304,7 +288,9 @@ __u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, /* Only privacy mode needs to initialize keys */ if (skc_new->sc_session_kb.kb_key.len > 0) - if (sk_init_keys(skc_new)) + if (gss_keyblock_init(&skc_new->sc_session_kb, + cfs_crypto_crypt_name(skc_new->sc_crypt), + 0)) goto out_err; gss_context_new->internal_ctx_id = skc_new; @@ -319,7 +305,7 @@ __u32 gss_copy_reverse_context_sk(struct gss_ctx *gss_context_old, static __u32 gss_inquire_context_sk(struct gss_ctx *gss_context, - unsigned long *endtime) + time64_t *endtime) { struct sk_ctx *skc = gss_context->internal_ctx_id; @@ -328,24 +314,32 @@ __u32 gss_inquire_context_sk(struct gss_ctx *gss_context, } static -__u32 sk_make_hmac(char *alg_name, rawobj_t *key, int msg_count, rawobj_t *msgs, - int iov_count, lnet_kiov_t *iovs, rawobj_t *token) +u32 sk_make_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, int msg_count, + rawobj_t *msgs, int iov_count, lnet_kiov_t *iovs, + rawobj_t *token, digest_hash hash_func) { - struct crypto_hash *tfm; - int rc; + struct ahash_request *req; + int rc2, rc; - tfm = crypto_alloc_hash(alg_name, 0, 0); - if (IS_ERR(tfm)) - return GSS_S_FAILURE; + req = cfs_crypto_hash_init(algo, key->data, key->len); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto out_init_failed; + } - rc = GSS_S_FAILURE; - LASSERT(token->len >= crypto_hash_digestsize(tfm)); - if (!gss_digest_hmac(tfm, key, NULL, msg_count, msgs, iov_count, iovs, - token)) - rc = GSS_S_COMPLETE; - crypto_free_hash(tfm); - return rc; + if (hash_func) + rc2 = hash_func(req, NULL, msg_count, msgs, iov_count, + iovs); + else + rc2 = gss_digest_hash(req, NULL, msg_count, msgs, iov_count, + iovs); + + rc = cfs_crypto_hash_final(req, token->data, &token->len); + if (!rc && rc2) + rc = rc2; +out_init_failed: + return rc ? 
GSS_S_FAILURE : GSS_S_COMPLETE; } static @@ -357,20 +351,22 @@ __u32 gss_get_mic_sk(struct gss_ctx *gss_context, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - return sk_make_hmac(sk_hmac_types[skc->sc_hmac].sht_name, + + return sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, message_count, messages, - iov_count, iovs, token); + iov_count, iovs, token, gss_context->hash_func); } static -__u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, - rawobj_t *messages, int iov_count, lnet_kiov_t *iovs, - rawobj_t *token) +u32 sk_verify_hmac(enum cfs_crypto_hash_alg algo, rawobj_t *key, + int message_count, rawobj_t *messages, + int iov_count, lnet_kiov_t *iovs, + rawobj_t *token, digest_hash hash_func) { rawobj_t checksum = RAWOBJ_EMPTY; __u32 rc = GSS_S_FAILURE; - checksum.len = sht->sht_bytes; + checksum.len = cfs_crypto_hash_digestsize(algo); if (token->len < checksum.len) { CDEBUG(D_SEC, "Token received too short, expected %d " "received %d\n", token->len, checksum.len); @@ -381,8 +377,9 @@ __u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, if (!checksum.data) return rc; - if (sk_make_hmac(sht->sht_name, key, message_count, messages, - iov_count, iovs, &checksum)) { + if (sk_make_hmac(algo, key, message_count, + messages, iov_count, iovs, &checksum, + hash_func)) { CDEBUG(D_SEC, "Failed to create checksum to validate\n"); goto cleanup; } @@ -405,23 +402,19 @@ __u32 sk_verify_hmac(struct sk_hmac_type *sht, rawobj_t *key, int message_count, * to decrypt up to the number of bytes actually specified from the sender * (bd_nob) otherwise the calulated HMAC will be incorrect. */ static -__u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, rawobj_t *key, - int msgcnt, rawobj_t *msgs, int iovcnt, - lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) +u32 sk_verify_bulk_hmac(enum cfs_crypto_hash_alg sc_hmac, rawobj_t *key, + int msgcnt, rawobj_t *msgs, int iovcnt, + lnet_kiov_t *iovs, int iov_bytes, rawobj_t *token) { rawobj_t checksum = RAWOBJ_EMPTY; - struct crypto_hash *tfm; - struct hash_desc desc = { - .tfm = NULL, - .flags = 0, - }; + struct ahash_request *req; struct scatterlist sg[1]; + int rc = 0; struct sg_table sgt; int bytes; int i; - int rc = GSS_S_FAILURE; - checksum.len = sht->sht_bytes; + checksum.len = cfs_crypto_hash_digestsize(sc_hmac); if (token->len < checksum.len) { CDEBUG(D_SEC, "Token received too short, expected %d " "received %d\n", token->len, checksum.len); @@ -430,33 +423,24 @@ __u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, rawobj_t *key, OBD_ALLOC_LARGE(checksum.data, checksum.len); if (!checksum.data) - return rc; + return GSS_S_FAILURE; - tfm = crypto_alloc_hash(sht->sht_name, 0, 0); - if (IS_ERR(tfm)) + req = cfs_crypto_hash_init(sc_hmac, key->data, key->len); + if (IS_ERR(req)) { + rc = GSS_S_FAILURE; goto cleanup; - - desc.tfm = tfm; - - LASSERT(token->len >= crypto_hash_digestsize(tfm)); - - rc = crypto_hash_setkey(tfm, key->data, key->len); - if (rc) - goto hash_cleanup; - - rc = crypto_hash_init(&desc); - if (rc) - goto hash_cleanup; + } for (i = 0; i < msgcnt; i++) { - if (msgs[i].len == 0) + if (!msgs[i].len) continue; rc = gss_setup_sgtable(&sgt, sg, msgs[i].data, msgs[i].len); if (rc != 0) goto hash_cleanup; - rc = crypto_hash_update(&desc, sg, msgs[i].len); + ahash_request_set_crypt(req, sg, NULL, msgs[i].len); + rc = crypto_ahash_update(req); if (rc) { gss_teardown_sgtable(&sgt); goto hash_cleanup; @@ -475,22 +459,21 @@ __u32 sk_verify_bulk_hmac(struct sk_hmac_type *sht, 
rawobj_t *key, sg_init_table(sg, 1); sg_set_page(&sg[0], iovs[i].kiov_page, bytes, iovs[i].kiov_offset); - rc = crypto_hash_update(&desc, sg, bytes); + ahash_request_set_crypt(req, sg, NULL, bytes); + rc = crypto_ahash_update(req); if (rc) goto hash_cleanup; } - crypto_hash_final(&desc, checksum.data); +hash_cleanup: + cfs_crypto_hash_final(req, checksum.data, &checksum.len); + if (rc) + goto cleanup; - if (memcmp(token->data, checksum.data, checksum.len)) { + if (memcmp(token->data, checksum.data, checksum.len)) rc = GSS_S_BAD_SIG; - goto hash_cleanup; - } - - rc = GSS_S_COMPLETE; - -hash_cleanup: - crypto_free_hash(tfm); + else + rc = GSS_S_COMPLETE; cleanup: OBD_FREE_LARGE(checksum.data, checksum.len); @@ -507,8 +490,10 @@ __u32 gss_verify_mic_sk(struct gss_ctx *gss_context, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - return sk_verify_hmac(&sk_hmac_types[skc->sc_hmac], &skc->sc_hmac_key, - message_count, messages, iov_count, iovs, token); + + return sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, + message_count, messages, iov_count, iovs, token, + gss_context->hash_func); } static @@ -517,7 +502,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, rawobj_t *token) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr skh; rawobj_t msgbufs[3]; @@ -526,7 +511,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, LASSERT(skc->sc_session_kb.kb_tfm); - blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); if (gss_add_padding(message, message_buffer_length, blocksize)) return GSS_S_FAILURE; @@ -541,7 +526,7 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; if (gss_crypt_rawobjs(skc->sc_session_kb.kb_tfm, local_iv, 1, message, &skw.skw_cipher, 1)) return GSS_S_FAILURE; @@ -552,9 +537,10 @@ __u32 gss_wrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, msgbufs[2] = skw.skw_cipher; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; - if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 3, msgbufs, 0, - NULL, &skw.skw_hmac)) + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, + 3, msgbufs, 0, NULL, &skw.skw_hmac, + gss_context->hash_func)) return GSS_S_FAILURE; token->len = skw.skw_header.len + skw.skw_cipher.len + skw.skw_hmac.len; @@ -567,7 +553,7 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, rawobj_t *token, rawobj_t *message) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr *skh; rawobj_t msgbufs[3]; @@ -577,17 +563,17 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, LASSERT(skc->sc_session_kb.kb_tfm); - if (token->len < sizeof(skh) + sht->sht_bytes) + if (token->len < sizeof(skh) + sht_bytes) return GSS_S_DEFECTIVE_TOKEN; skw.skw_header.data = token->data; skw.skw_header.len = sizeof(struct sk_hdr); 
skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; + skw.skw_hmac.len = sht_bytes; - blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); if (skw.skw_cipher.len % blocksize != 0) return GSS_S_DEFECTIVE_TOKEN; @@ -600,8 +586,8 @@ __u32 gss_unwrap_sk(struct gss_ctx *gss_context, rawobj_t *gss_header, msgbufs[0] = skw.skw_header; msgbufs[1] = *gss_header; msgbufs[2] = skw.skw_cipher; - rc = sk_verify_hmac(sht, &skc->sc_hmac_key, 3, msgbufs, 0, NULL, - &skw.skw_hmac); + rc = sk_verify_hmac(skc->sc_hmac, &skc->sc_hmac_key, 3, msgbufs, + 0, NULL, &skw.skw_hmac, gss_context->hash_func); if (rc) return rc; @@ -623,7 +609,7 @@ __u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, int i; LASSERT(skc->sc_session_kb.kb_tfm); - blocksize = crypto_blkcipher_blocksize(skc->sc_session_kb.kb_tfm); + blocksize = crypto_sync_skcipher_blocksize(skc->sc_session_kb.kb_tfm); for (i = 0; i < desc->bd_iov_count; i++) { if (BD_GET_KIOV(desc, i).kiov_offset & blocksize) { @@ -641,27 +627,26 @@ __u32 gss_prep_bulk_sk(struct gss_ctx *gss_context, return GSS_S_COMPLETE; } -static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, +static __u32 sk_encrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, int adj_nob) { - struct blkcipher_desc cdesc = { - .tfm = tfm, - .info = iv, - .flags = 0, - }; struct scatterlist ptxt; struct scatterlist ctxt; int blocksize; int i; int rc; int nob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - blocksize = crypto_blkcipher_blocksize(tfm); + blocksize = crypto_sync_skcipher_blocksize(tfm); sg_init_table(&ptxt, 1); sg_init_table(&ctxt, 1); + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + for (i = 0; i < desc->bd_iov_count; i++) { sg_set_page(&ptxt, BD_GET_KIOV(desc, i).kiov_page, sk_block_mask(BD_GET_KIOV(desc, i).kiov_len, @@ -675,13 +660,15 @@ static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, BD_GET_ENC_KIOV(desc, i).kiov_offset = ctxt.offset; BD_GET_ENC_KIOV(desc, i).kiov_len = ctxt.length; - rc = crypto_blkcipher_encrypt_iv(&cdesc, &ctxt, &ptxt, - ptxt.length); + skcipher_request_set_crypt(req, &ptxt, &ctxt, ptxt.length, iv); + rc = crypto_skcipher_encrypt_iv(req, &ctxt, &ptxt, ptxt.length); if (rc) { CERROR("failed to encrypt page: %d\n", rc); + skcipher_request_zero(req); return rc; } } + skcipher_request_zero(req); if (adj_nob) desc->bd_nob = nob; @@ -689,15 +676,10 @@ static __u32 sk_encrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, return 0; } -static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, +static __u32 sk_decrypt_bulk(struct crypto_sync_skcipher *tfm, __u8 *iv, struct ptlrpc_bulk_desc *desc, rawobj_t *cipher, int adj_nob) { - struct blkcipher_desc cdesc = { - .tfm = tfm, - .info = iv, - .flags = 0, - }; struct scatterlist ptxt; struct scatterlist ctxt; int blocksize; @@ -705,17 +687,21 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, int rc; int pnob = 0; int cnob = 0; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); sg_init_table(&ptxt, 1); sg_init_table(&ctxt, 1); - blocksize = crypto_blkcipher_blocksize(tfm); + blocksize = crypto_sync_skcipher_blocksize(tfm); if 
(desc->bd_nob_transferred % blocksize != 0) { CERROR("Transfer not a multiple of block size: %d\n", desc->bd_nob_transferred); return GSS_S_DEFECTIVE_TOKEN; } + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + for (i = 0; i < desc->bd_iov_count && cnob < desc->bd_nob_transferred; i++) { lnet_kiov_t *piov = &BD_GET_KIOV(desc, i); @@ -724,6 +710,7 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, if (ciov->kiov_offset % blocksize != 0 || ciov->kiov_len % blocksize != 0) { CERROR("Invalid bulk descriptor vector\n"); + skcipher_request_zero(req); return GSS_S_DEFECTIVE_TOKEN; } @@ -747,6 +734,7 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, if (ciov->kiov_len + cnob > desc->bd_nob_transferred || piov->kiov_len > ciov->kiov_len) { CERROR("Invalid decrypted length\n"); + skcipher_request_zero(req); return GSS_S_FAILURE; } } @@ -765,10 +753,11 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, if (piov->kiov_len % blocksize == 0) sg_assign_page(&ptxt, piov->kiov_page); - rc = crypto_blkcipher_decrypt_iv(&cdesc, &ptxt, &ctxt, - ctxt.length); + skcipher_request_set_crypt(req, &ctxt, &ptxt, ptxt.length, iv); + rc = crypto_skcipher_decrypt_iv(req, &ptxt, &ctxt, ptxt.length); if (rc) { CERROR("Decryption failed for page: %d\n", rc); + skcipher_request_zero(req); return GSS_S_FAILURE; } @@ -783,6 +772,7 @@ static __u32 sk_decrypt_bulk(struct crypto_blkcipher *tfm, __u8 *iv, cnob += ciov->kiov_len; pnob += piov->kiov_len; } + skcipher_request_zero(req); /* if needed, clear up the rest unused iovs */ if (adj_nob) @@ -810,7 +800,7 @@ __u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, int adj_nob) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr skh; __u8 local_iv[SK_IV_SIZE]; @@ -827,15 +817,16 @@ __u32 gss_wrap_bulk_sk(struct gss_ctx *gss_context, sk_construct_rfc3686_iv(local_iv, skc->sc_host_random, skh.skh_iv); skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; if (sk_encrypt_bulk(skc->sc_session_kb.kb_tfm, local_iv, desc, &skw.skw_cipher, adj_nob)) return GSS_S_FAILURE; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; - if (sk_make_hmac(sht->sht_name, &skc->sc_hmac_key, 1, &skw.skw_cipher, - desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac)) + skw.skw_hmac.len = sht_bytes; + if (sk_make_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, &skw.skw_cipher, + desc->bd_iov_count, GET_ENC_KIOV(desc), &skw.skw_hmac, + gss_context->hash_func)) return GSS_S_FAILURE; return GSS_S_COMPLETE; @@ -847,7 +838,7 @@ __u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, rawobj_t *token, int adj_nob) { struct sk_ctx *skc = gss_context->internal_ctx_id; - struct sk_hmac_type *sht = &sk_hmac_types[skc->sc_hmac]; + size_t sht_bytes = cfs_crypto_hash_digestsize(skc->sc_hmac); struct sk_wire skw; struct sk_hdr *skh; __u8 local_iv[SK_IV_SIZE]; @@ -855,25 +846,25 @@ __u32 gss_unwrap_bulk_sk(struct gss_ctx *gss_context, LASSERT(skc->sc_session_kb.kb_tfm); - if (token->len < sizeof(skh) + sht->sht_bytes) + if (token->len < sizeof(skh) + sht_bytes) return GSS_S_DEFECTIVE_TOKEN; skw.skw_header.data = token->data; skw.skw_header.len = sizeof(struct sk_hdr); 
skw.skw_cipher.data = skw.skw_header.data + skw.skw_header.len; - skw.skw_cipher.len = token->len - skw.skw_header.len - sht->sht_bytes; + skw.skw_cipher.len = token->len - skw.skw_header.len - sht_bytes; skw.skw_hmac.data = skw.skw_cipher.data + skw.skw_cipher.len; - skw.skw_hmac.len = sht->sht_bytes; + skw.skw_hmac.len = sht_bytes; skh = (struct sk_hdr *)skw.skw_header.data; rc = sk_verify_header(skh); if (rc != GSS_S_COMPLETE) return rc; - rc = sk_verify_bulk_hmac(&sk_hmac_types[skc->sc_hmac], - &skc->sc_hmac_key, 1, &skw.skw_cipher, - desc->bd_iov_count, GET_ENC_KIOV(desc), - desc->bd_nob, &skw.skw_hmac); + rc = sk_verify_bulk_hmac(skc->sc_hmac, &skc->sc_hmac_key, 1, + &skw.skw_cipher, desc->bd_iov_count, + GET_ENC_KIOV(desc), desc->bd_nob, + &skw.skw_hmac); if (rc) return rc; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c index 4798711dbe983..2202e3f56f8c5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -60,7 +60,6 @@ #include #include #include -#include #include #include #include @@ -69,12 +68,15 @@ #include "gss_err.h" #include "gss_internal.h" #include "gss_api.h" +#include "gss_crypto.h" #define GSS_SVC_UPCALL_TIMEOUT (20) static spinlock_t __ctx_index_lock; static __u64 __ctx_index; +unsigned int krb5_allow_old_client_csum; + __u64 gss_get_next_ctx_index(void) { __u64 idx; @@ -160,6 +162,18 @@ static struct cache_detail rsi_cache; static struct rsi *rsi_update(struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct rsi *item); +#ifdef HAVE_CACHE_DETAIL_WRITERS +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->writers); +} +#else +static inline int channel_users(struct cache_detail *cd) +{ + return atomic_read(&cd->readers); +} +#endif + static inline int rsi_hash(struct rsi *item) { return hash_mem((char *)item->in_handle.data, item->in_handle.len, @@ -299,10 +313,9 @@ static struct cache_head *rsi_alloc(void) static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { char *buf = mesg; - char *ep; int len; struct rsi rsii, *rsip = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; ENTRY; @@ -341,18 +354,21 @@ static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) if (len <= 0) goto out; - /* major */ - rsii.major_status = simple_strtol(buf, &ep, 10); - if (*ep) - goto out; + /* major */ + status = kstrtoint(buf, 10, &rsii.major_status); + if (status) + goto out; - /* minor */ - len = qword_get(&mesg, buf, mlen); - if (len <= 0) - goto out; - rsii.minor_status = simple_strtol(buf, &ep, 10); - if (*ep) - goto out; + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) { + status = -EINVAL; + goto out; + } + + status = kstrtoint(buf, 10, &rsii.minor_status); + if (status) + goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); @@ -544,7 +560,7 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) char *buf = mesg; int len, rv, tmp_int; struct rsc rsci, *rscp = NULL; - time_t expiry; + time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; @@ -649,8 +665,7 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) /* currently the expiry time passed down from user-space * is invalid, here we retrive it from mech. 
*/ - if (lgss_inquire_context(rsci.ctx.gsc_mechctx, - (unsigned long *)&ctx_expiry)) { + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { CERROR("unable to get expire time, drop it\n"); goto out; } @@ -720,85 +735,6 @@ static struct rsc *rsc_update(struct rsc *new, struct rsc *old) * rsc cache flush * ****************************************/ -typedef int rsc_entry_match(struct rsc *rscp, long data); - -static void rsc_flush(rsc_entry_match *match, long data) -{ -#ifdef HAVE_CACHE_HEAD_HLIST - struct cache_head *ch = NULL; - struct hlist_head *head; -#else - struct cache_head **ch; -#endif - struct rsc *rscp; - int n; - ENTRY; - - write_lock(&rsc_cache.hash_lock); - for (n = 0; n < RSC_HASHMAX; n++) { -#ifdef HAVE_CACHE_HEAD_HLIST - head = &rsc_cache.hash_table[n]; - hlist_for_each_entry(ch, head, cache_list) { - rscp = container_of(ch, struct rsc, h); -#else - for (ch = &rsc_cache.hash_table[n]; *ch;) { - rscp = container_of(*ch, struct rsc, h); -#endif - - if (!match(rscp, data)) { -#ifndef HAVE_CACHE_HEAD_HLIST - ch = &((*ch)->next); -#endif - continue; - } - - /* it seems simply set NEGATIVE doesn't work */ -#ifdef HAVE_CACHE_HEAD_HLIST - hlist_del_init(&ch->cache_list); -#else - *ch = (*ch)->next; - rscp->h.next = NULL; -#endif - cache_get(&rscp->h); - set_bit(CACHE_NEGATIVE, &rscp->h.flags); - COMPAT_RSC_PUT(&rscp->h, &rsc_cache); - rsc_cache.entries--; - } - } - write_unlock(&rsc_cache.hash_lock); - EXIT; -} - -static int match_uid(struct rsc *rscp, long uid) -{ - if ((int) uid == -1) - return 1; - return ((int) rscp->ctx.gsc_uid == (int) uid); -} - -static int match_target(struct rsc *rscp, long target) -{ - return (rscp->target == (struct obd_device *) target); -} - -static inline void rsc_flush_uid(int uid) -{ - if (uid == -1) - CWARN("flush all gss contexts...\n"); - - rsc_flush(match_uid, (long) uid); -} - -static inline void rsc_flush_target(struct obd_device *target) -{ - rsc_flush(match_target, (long) target); -} - -void gss_secsvc_flush(struct obd_device *target) -{ - rsc_flush_target(target); -} - static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) { struct rsc rsci; @@ -822,7 +758,7 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, struct gss_cli_ctx *gctx) { struct rsc rsci, *rscp = NULL; - unsigned long ctx_expiry; + time64_t ctx_expiry; __u32 major; int rc; ENTRY; @@ -846,7 +782,7 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, CERROR("unable to get expire time, drop it\n"); GOTO(out, rc = -EINVAL); } - rsci.h.expiry_time = (time_t) ctx_expiry; + rsci.h.expiry_time = ctx_expiry; switch (imp->imp_obd->u.cli.cl_sp_to) { case LUSTRE_SP_MDT: @@ -857,6 +793,13 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, break; case LUSTRE_SP_CLI: rsci.ctx.gsc_usr_root = 1; + break; + case LUSTRE_SP_MGS: + /* by convention, all 3 set to 1 means MGS */ + rsci.ctx.gsc_usr_mds = 1; + rsci.ctx.gsc_usr_oss = 1; + rsci.ctx.gsc_usr_root = 1; + break; default: break; } @@ -884,15 +827,15 @@ int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) { - const cfs_time_t expire = 20; - struct rsc *rscp; + const time64_t expire = 20; + struct rsc *rscp; rscp = gss_svc_searchbyctx(handle); if (rscp) { CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", &rscp->ctx, rscp); - rscp->h.expiry_time = cfs_time_current_sec() + expire; + rscp->h.expiry_time = ktime_get_real_seconds() + expire; COMPAT_RSC_PUT(&rscp->h, &rsc_cache); } return 0; @@ -946,7 +889,11 @@ int 
gss_svc_upcall_handle_init(struct ptlrpc_request *req, memset(&rsikey, 0, sizeof(rsikey)); rsikey.lustre_svc = lustre_svc; - rsikey.nid = (__u64) req->rq_peer.nid; + /* In case of MR, rq_peer is not the NID from which request is received, + * but primary NID of peer. + * So we need rq_source, which contains the NID actually in use. + */ + rsikey.nid = (__u64) req->rq_source.nid; nodemap_test_nid(req->rq_peer.nid, rsikey.nm_name, sizeof(rsikey.nm_name)); @@ -991,11 +938,11 @@ int gss_svc_upcall_handle_init(struct ptlrpc_request *req, if (first_check) { first_check = 0; - read_lock(&rsi_cache.hash_lock); + cache_read_lock(&rsi_cache); valid = test_bit(CACHE_VALID, &rsip->h.flags); if (valid == 0) set_current_state(TASK_INTERRUPTIBLE); - read_unlock(&rsi_cache.hash_lock); + cache_read_unlock(&rsi_cache); if (valid == 0) { unsigned long jiffies; @@ -1044,6 +991,20 @@ int gss_svc_upcall_handle_init(struct ptlrpc_request *req, grctx->src_ctx = &rsci->ctx; } + if (gw->gw_flags & LUSTRE_GSS_PACK_KCSUM) { + grctx->src_ctx->gsc_mechctx->hash_func = gss_digest_hash; + } else if (!strcmp(grctx->src_ctx->gsc_mechctx->mech_type->gm_name, + "krb5") && + !krb5_allow_old_client_csum) { + CWARN("%s: deny connection from '%s' due to missing 'krb_csum' feature, set 'sptlrpc.gss.krb5_allow_old_client_csum=1' to allow, but recommend client upgrade: rc = %d\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid), + -EPROTO); + GOTO(out, rc = SECSVC_DROP); + } else { + grctx->src_ctx->gsc_mechctx->hash_func = + gss_digest_hash_compat; + } + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { CERROR("failed duplicate reverse handle\n"); GOTO(out, rc); @@ -1172,17 +1133,18 @@ int __init gss_init_svc_upcall(void) /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open * the init upcall channel, otherwise there's big chance that the first * upcall issued before the channel be opened thus nfsv4 cache code will - * drop the request direclty, thus lead to unnecessary recovery time. - * here we wait at miximum 1.5 seconds. */ + * drop the request directly, thus lead to unnecessary recovery time. + * Here we wait at minimum 1.5 seconds. 
+ */ for (i = 0; i < 6; i++) { - if (atomic_read(&rsi_cache.readers) > 0) + if (channel_users(&rsi_cache) > 0) break; set_current_state(TASK_UNINTERRUPTIBLE); - LASSERT(msecs_to_jiffies(MSEC_PER_SEC) >= 4); + LASSERT(msecs_to_jiffies(MSEC_PER_SEC / 4) > 0); schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC / 4)); } - if (atomic_read(&rsi_cache.readers) == 0) + if (channel_users(&rsi_cache) == 0) CWARN("Init channel is not opened by lsvcgssd, following " "request might be dropped until lsvcgssd is active\n"); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c index 610f0b38c8d4f..f2943207b34fd 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/lproc_gss.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -133,7 +132,29 @@ static const struct file_operations gss_proc_secinit = { .write = gss_proc_write_secinit, }; -static struct lprocfs_vars gss_lprocfs_vars[] = { +int sptlrpc_krb5_allow_old_client_csum_seq_show(struct seq_file *m, void *data) +{ + seq_printf(m, "%u\n", krb5_allow_old_client_csum); + return 0; +} + +ssize_t sptlrpc_krb5_allow_old_client_csum_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + krb5_allow_old_client_csum = val; + return count; +} +LPROC_SEQ_FOPS(sptlrpc_krb5_allow_old_client_csum); + +static struct ldebugfs_vars gss_debugfs_vars[] = { { .name = "replays", .fops = &gss_proc_oos_fops }, { .name = "init_channel", @@ -142,6 +163,12 @@ static struct lprocfs_vars gss_lprocfs_vars[] = { { NULL } }; +static struct lprocfs_vars gss_lprocfs_vars[] = { + { .name = "krb5_allow_old_client_csum", + .fops = &sptlrpc_krb5_allow_old_client_csum_fops }, + { NULL } +}; + /* * for userspace helper lgss_keyring. * @@ -159,14 +186,14 @@ static ssize_t gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { + unsigned int val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(buffer, count, &val); + rc = kstrtouint_from_user(buffer, count, 0, &val); if (rc < 0) return rc; - if (val < 0 || val > 4) + if (val > 4) return -ERANGE; gss_lk_debug_level = val; @@ -175,7 +202,7 @@ gss_lk_proc_dl_seq_write(struct file *file, const char __user *buffer, } LPROC_SEQ_FOPS(gss_lk_proc_dl); -static struct lprocfs_vars gss_lk_lprocfs_vars[] = { +static struct ldebugfs_vars gss_lk_debugfs_vars[] = { { .name = "debug_level", .fops = &gss_lk_proc_dl_fops }, { NULL } @@ -209,7 +236,7 @@ int gss_init_lproc(void) } gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root, - gss_lk_lprocfs_vars, NULL); + gss_lk_debugfs_vars, NULL); if (IS_ERR(gss_proc_lk)) { rc = PTR_ERR(gss_proc_lk); gss_proc_lk = NULL; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c index bee52f3751356..17e8f0a258c6d 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/gss/sec_gss.c @@ -3,7 +3,7 @@ * * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. 
* * Author: Eric Mei */ @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -309,11 +308,11 @@ int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) if (!ctx->cc_early_expire) clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); - CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n", + CWARN("ctx %p(%u->%s) get expired: %lld(%+llds)\n", ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), ctx->cc_expire, ctx->cc_expire == 0 ? 0 : - cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + ctx->cc_expire - ktime_get_real_seconds()); sptlrpc_cli_ctx_wakeup(ctx); return 1; @@ -336,7 +335,7 @@ int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) return 0; /* check real expiration */ - if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec())) + if (ctx->cc_expire > ktime_get_real_seconds()) return 0; cli_ctx_expire(ctx); @@ -345,8 +344,8 @@ int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) { - struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; - unsigned long ctx_expiry; + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + time64_t ctx_expiry; if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { CERROR("ctx %p(%u): unable to inquire, expire it now\n", @@ -365,17 +364,17 @@ void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) if (sec_is_reverse(ctx->cc_sec)) { CWARN("server installed reverse ctx %p idx %#llx, " - "expiry %lu(%+lds)\n", ctx, + "expiry %lld(%+llds)\n", ctx, gss_handle_to_u64(&gctx->gc_handle), ctx->cc_expire, - cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + ctx->cc_expire - ktime_get_real_seconds()); } else { CWARN("client refreshed ctx %p idx %#llx (%u->%s), " - "expiry %lu(%+lds)\n", ctx, + "expiry %lld(%+llds)\n", ctx, gss_handle_to_u64(&gctx->gc_handle), ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), ctx->cc_expire, - cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + ctx->cc_expire - ktime_get_real_seconds()); /* install reverse svc ctx for root context */ if (ctx->cc_vcred.vc_uid == 0) @@ -535,7 +534,7 @@ int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set) switch (rc) { case -1: gss_stat_oos_record_svc(1, 1); - /* fall through */ + fallthrough; case 0: goto exit; } @@ -1103,6 +1102,9 @@ int gss_sec_create_common(struct gss_sec *gsec, sec->ps_import = class_import_get(imp); spin_lock_init(&sec->ps_lock); INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_sepol_mtime = ktime_set(0, 0); + sec->ps_sepol_checknext = ktime_set(0, 0); + sec->ps_sepol[0] = '\0'; if (!svcctx) { sec->ps_gc_interval = GSS_GC_INTERVAL; @@ -2055,16 +2057,17 @@ int gss_svc_handle_init(struct ptlrpc_request *req, if (rc != SECSVC_OK) RETURN(rc); - if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || - grctx->src_ctx->gsc_usr_root) - CWARN("create svc ctx %p: user from %s authenticated as %s\n", - grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), - grctx->src_ctx->gsc_usr_mds ? "mds" : - (grctx->src_ctx->gsc_usr_oss ? "oss" : "root")); - else - CWARN("create svc ctx %p: accept user %u from %s\n", - grctx->src_ctx, grctx->src_ctx->gsc_uid, - libcfs_nid2str(req->rq_peer.nid)); + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_root ? "root" : + (grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? 
"oss" : "null"))); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { if (reqbuf->lm_bufcount < 4) { diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c index 827a989f1e139..46d92bf4ed2d0 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/import.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/import.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,10 +56,10 @@ struct ptlrpc_connect_async_args { /** * Updates import \a imp current state to provided \a state value - * Helper function. Must be called under imp_lock. + * Helper function. */ -static void __import_set_state(struct obd_import *imp, - enum lustre_imp_state state) +static void import_set_state_nolock(struct obd_import *imp, + enum lustre_imp_state state) { switch (state) { case LUSTRE_IMP_CLOSED: @@ -72,7 +72,20 @@ static void __import_set_state(struct obd_import *imp, break; default: imp->imp_replay_state = LUSTRE_IMP_REPLAY; + break; } + + /* A CLOSED import should remain so. */ + if (imp->imp_state == LUSTRE_IMP_CLOSED) + return; + + if (imp->imp_state != LUSTRE_IMP_NEW) { + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", + imp, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(imp->imp_state), + ptlrpc_import_state_name(state)); + } + imp->imp_state = state; imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = @@ -81,28 +94,17 @@ static void __import_set_state(struct obd_import *imp, IMP_STATE_HIST_LEN; } -/* A CLOSED import should remain so. */ -#define IMPORT_SET_STATE_NOLOCK(imp, state) \ -do { \ - if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ - CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ - imp, obd2cli_tgt(imp->imp_obd), \ - ptlrpc_import_state_name(imp->imp_state), \ - ptlrpc_import_state_name(state)); \ - __import_set_state(imp, state); \ - } \ -} while(0) - -#define IMPORT_SET_STATE(imp, state) \ -do { \ - spin_lock(&imp->imp_lock); \ - IMPORT_SET_STATE_NOLOCK(imp, state); \ - spin_unlock(&imp->imp_lock); \ -} while(0) +static void import_set_state(struct obd_import *imp, + enum lustre_imp_state new_state) +{ + spin_lock(&imp->imp_lock); + import_set_state_nolock(imp, new_state); + spin_unlock(&imp->imp_lock); +} void ptlrpc_import_enter_resend(struct obd_import *imp) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); } EXPORT_SYMBOL(ptlrpc_import_enter_resend); @@ -146,6 +148,21 @@ void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) *uuid_len -= strlen(UUID_STR); } +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) +{ + ENTRY; + + assert_spin_locked(&imp->imp_lock); + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + + ptlrpc_abort_inflight(imp); + + EXIT; +} + /** * Returns true if import was FULL, false if import was already not * connected. 
@@ -156,8 +173,10 @@ void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) * bulk requests) and if one has already caused a reconnection * (increasing the import->conn_cnt) the older failure should * not also cause a reconnection. If zero it forces a reconnect. + * @invalid - set import invalid flag */ -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) +int ptlrpc_set_import_discon(struct obd_import *imp, + __u32 conn_cnt, bool invalid) { int rc = 0; @@ -167,31 +186,43 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { char *target_start; int target_len; + bool inact = false; deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); if (imp->imp_replayable) { LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " "lost; in progress operations using this " "service will wait for recovery to complete\n", imp->imp_obd->obd_name, target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } else { - LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " - "%.*s (at %s) was lost; in progress " - "operations using this service will fail\n", - imp->imp_obd->obd_name, - target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + obd_import_nid2str(imp)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, target_len, target_start, + obd_import_nid2str(imp)); + if (invalid) { + CDEBUG(D_HA, "import %s@%s for %s not " + "replayable, auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } + } spin_unlock(&imp->imp_lock); if (obd_dump_on_timeout) libcfs_debug_dumplog(); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); rc = 1; } else { spin_unlock(&imp->imp_lock); @@ -206,23 +237,6 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) return rc; } -/* Must be called with imp_lock held! */ -static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) -{ - ENTRY; - assert_spin_locked(&imp->imp_lock); - - CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); - imp->imp_invalid = 1; - imp->imp_generation++; - spin_unlock(&imp->imp_lock); - - ptlrpc_abort_inflight(imp); - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); - - EXIT; -} - /* * This acts as a barrier; all existing requests are rejected, and * no new requests will be accepted until the import is valid again. 
@@ -230,14 +244,17 @@ static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) void ptlrpc_deactivate_import(struct obd_import *imp) { spin_lock(&imp->imp_lock); - ptlrpc_deactivate_and_unlock_import(imp); + ptlrpc_deactivate_import_nolock(imp); + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); } EXPORT_SYMBOL(ptlrpc_deactivate_import); -static unsigned int -ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) +static time64_t ptlrpc_inflight_deadline(struct ptlrpc_request *req, + time64_t now) { - long dl; + time64_t dl; if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || (req->rq_phase == RQ_PHASE_BULK) || @@ -258,12 +275,12 @@ ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) return dl - now; } -static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +static time64_t ptlrpc_inflight_timeout(struct obd_import *imp) { time64_t now = ktime_get_real_seconds(); struct list_head *tmp, *n; struct ptlrpc_request *req; - unsigned int timeout = 0; + time64_t timeout = 0; spin_lock(&imp->imp_lock); list_for_each_safe(tmp, n, &imp->imp_sending_list) { @@ -285,7 +302,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) struct list_head *tmp, *n; struct ptlrpc_request *req; struct l_wait_info lwi; - unsigned int timeout; + time64_t timeout; int rc; atomic_inc(&imp->imp_inval_count); @@ -305,30 +322,35 @@ void ptlrpc_invalidate_import(struct obd_import *imp) * unlink. We can't do anything before that because there is really * no guarantee that some rdma transfer is not in progress right now. */ do { + long timeout_jiffies; + /* Calculate max timeout for waiting on rpcs to error * out. Use obd_timeout if calculated value is smaller - * than it. */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { - timeout = ptlrpc_inflight_timeout(imp); - timeout += timeout / 3; - - if (timeout == 0) - timeout = obd_timeout; - } else { - /* decrease the interval to increase race condition */ - timeout = 1; - } + * than it. + */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += div_u64(timeout, 3); + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } - CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n", - timeout); + CDEBUG(D_RPCTRACE, "Sleeping %llds for inflight to error out\n", + timeout); /* Wait for all requests to error out and call completion * callbacks. Cap it at obd_timeout -- these should all - * have been locally cancelled by ptlrpc_abort_inflight. */ - lwi = LWI_TIMEOUT_INTERVAL( - cfs_timeout_cap(cfs_time_seconds(timeout)), - (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, - NULL, NULL); + * have been locally cancelled by ptlrpc_abort_inflight. + */ + timeout_jiffies = max_t(long, cfs_time_seconds(timeout), 1); + lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + (timeout > 1) ? 
cfs_time_seconds(1) : + cfs_time_seconds(1) / 2, + NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), &lwi); @@ -396,17 +418,23 @@ void ptlrpc_invalidate_import(struct obd_import *imp) EXPORT_SYMBOL(ptlrpc_invalidate_import); /* unset imp_invalid */ -void ptlrpc_activate_import(struct obd_import *imp) +void ptlrpc_activate_import(struct obd_import *imp, bool set_state_full) { struct obd_device *obd = imp->imp_obd; spin_lock(&imp->imp_lock); if (imp->imp_deactive != 0) { + LASSERT(imp->imp_state != LUSTRE_IMP_FULL); + if (imp->imp_state != LUSTRE_IMP_DISCON) + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); spin_unlock(&imp->imp_lock); return; } + if (set_state_full) + import_set_state_nolock(imp, LUSTRE_IMP_FULL); imp->imp_invalid = 0; + spin_unlock(&imp->imp_lock); obd_import_event(obd, imp, IMP_EVENT_ACTIVE); } @@ -428,45 +456,36 @@ EXPORT_SYMBOL(ptlrpc_pinger_force); void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) { - ENTRY; - - LASSERT(!imp->imp_dlm_fake); + ENTRY; - if (ptlrpc_set_import_discon(imp, conn_cnt)) { - if (!imp->imp_replayable) { - CDEBUG(D_HA, "import %s@%s for %s not replayable, " - "auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import(imp); - } + LASSERT(!imp->imp_dlm_fake); + if (ptlrpc_set_import_discon(imp, conn_cnt, true)) ptlrpc_pinger_force(imp); - } + EXIT; } int ptlrpc_reconnect_import(struct obd_import *imp) { #ifdef ENABLE_PINGER + long timeout_jiffies = cfs_time_seconds(obd_timeout); struct l_wait_info lwi; - int secs = cfs_time_seconds(obd_timeout); int rc; ptlrpc_pinger_force(imp); CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", - obd2cli_tgt(imp->imp_obd), secs); + obd2cli_tgt(imp->imp_obd), obd_timeout); - lwi = LWI_TIMEOUT(secs, NULL, NULL); + lwi = LWI_TIMEOUT(timeout_jiffies, NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(imp->imp_state)); return rc; #else - ptlrpc_set_import_discon(imp, 0); + ptlrpc_set_import_discon(imp, 0, false); /* Force a new connect attempt */ ptlrpc_invalidate_import(imp); /* Do a fresh connect next time by zeroing the handle */ @@ -487,7 +506,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp) /* Allow reconnect attempts */ imp->imp_obd->obd_no_recov = 0; /* Remove 'invalid' flag */ - ptlrpc_activate_import(imp); + ptlrpc_activate_import(imp, false); /* Attempt a new connect */ ptlrpc_recover_import(imp, NULL, 0); return 0; @@ -518,7 +537,7 @@ static int import_select_connection(struct obd_import *imp) } list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { - CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n", + CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n", imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); @@ -526,8 +545,7 @@ static int import_select_connection(struct obd_import *imp) /* If we have not tried this connection since the last successful attempt, go with this one */ if ((conn->oic_last_attempt == 0) || - cfs_time_beforeq_64(conn->oic_last_attempt, - imp->imp_last_success_conn)) { + conn->oic_last_attempt <= imp->imp_last_success_conn) { imp_conn = conn; tried_all = 0; break; @@ -538,8 +556,7 @@ static int import_select_connection(struct obd_import *imp) least recently used */ if (!imp_conn) imp_conn = conn; - else if 
(cfs_time_before_64(conn->oic_last_attempt, - imp_conn->oic_last_attempt)) + else if (imp_conn->oic_last_attempt > conn->oic_last_attempt) imp_conn = conn; } @@ -568,7 +585,7 @@ static int import_select_connection(struct obd_import *imp) "to %ds\n", imp->imp_obd->obd_name, at_get(at)); } - imp_conn->oic_last_attempt = cfs_time_current_64(); + imp_conn->oic_last_attempt = ktime_get_seconds(); /* switch connection, don't mind if it's same as the current one */ if (imp->imp_connection) @@ -639,29 +656,41 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) return 0; } +int ptlrpc_connect_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + return ptlrpc_connect_import_locked(imp); +} + /** * Attempt to (re)connect import \a imp. This includes all preparations, * initializing CONNECT RPC request and passing it to ptlrpcd for * actual sending. + * + * Assumes imp->imp_lock is held, and releases it. + * * Returns 0 on success or error code. */ -int ptlrpc_connect_import(struct obd_import *imp) +int ptlrpc_connect_import_locked(struct obd_import *imp) { struct obd_device *obd = imp->imp_obd; int initial_connect = 0; int set_transno = 0; __u64 committed_before_reconnect = 0; struct ptlrpc_request *request; + struct obd_connect_data ocd; char *bufs[] = { NULL, obd2cli_tgt(imp->imp_obd), obd->obd_uuid.uuid, (char *)&imp->imp_dlm_handle, - (char *)&imp->imp_connect_data }; + (char *)&ocd, + NULL }; struct ptlrpc_connect_async_args *aa; int rc; ENTRY; - spin_lock(&imp->imp_lock); + assert_spin_locked(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { spin_unlock(&imp->imp_lock); CERROR("can't connect to a closed import\n"); @@ -678,7 +707,7 @@ int ptlrpc_connect_import(struct obd_import *imp) RETURN(-EALREADY); } - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); imp->imp_conn_cnt++; imp->imp_resend_replay = 0; @@ -702,15 +731,16 @@ int ptlrpc_connect_import(struct obd_import *imp) /* Reset connect flags to the originally requested flags, in case * the server is updated on-the-fly we will get the new features. */ - imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; - imp->imp_connect_data.ocd_connect_flags2 = imp->imp_connect_flags2_orig; + ocd = imp->imp_connect_data; + ocd.ocd_connect_flags = imp->imp_connect_flags_orig; + ocd.ocd_connect_flags2 = imp->imp_connect_flags2_orig; /* Reset ocd_version each time so the server knows the exact versions */ - imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; + ocd.ocd_version = LUSTRE_VERSION_CODE; imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, - &obd->obd_uuid, &imp->imp_connect_data, NULL); + &obd->obd_uuid, &ocd, NULL); if (rc) GOTO(out, rc); @@ -718,6 +748,19 @@ int ptlrpc_connect_import(struct obd_import *imp) if (request == NULL) GOTO(out, rc = -ENOMEM); + /* get SELinux policy info if any */ + rc = sptlrpc_get_sepol(request); + if (rc < 0) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + bufs[5] = request->rq_sepol; + + req_capsule_set_size(&request->rq_pill, &RMF_SELINUX_POL, RCL_CLIENT, + strlen(request->rq_sepol) ? 
+ strlen(request->rq_sepol) + 1 : 0); + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, imp->imp_connect_op, bufs, NULL); if (rc) { @@ -727,8 +770,8 @@ int ptlrpc_connect_import(struct obd_import *imp) /* Report the rpc service time to the server so that it knows how long * to wait for clients to join recovery */ - lustre_msg_set_service_time(request->rq_reqmsg, - at_timeout2est(request->rq_timeout)); + lustre_msg_set_service_timeout(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); /* The amount of time we give the server to process the connect req. * import_select_connection will increase the net latency on @@ -771,7 +814,7 @@ int ptlrpc_connect_import(struct obd_import *imp) rc = 0; out: if (rc != 0) - IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + import_set_state(imp, LUSTRE_IMP_DISCON); RETURN(rc); } @@ -795,9 +838,9 @@ static int ptlrpc_busy_reconnect(int rc) } static int ptlrpc_connect_set_flags(struct obd_import *imp, - struct obd_connect_data *ocd, - __u64 old_connect_flags, - struct obd_export *exp, int init_connect) + struct obd_connect_data *ocd, + __u64 old_connect_flags, + struct obd_export *exp, int init_connect) { static bool warned; struct client_obd *cli = &imp->imp_obd->u.cli; @@ -811,7 +854,6 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, spin_unlock(&imp->imp_lock); - if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && (ocd->ocd_version > LUSTRE_VERSION_CODE + LUSTRE_VERSION_OFFSET_WARN || @@ -822,7 +864,7 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, const char *older = "older than client. " "Consider upgrading server"; const char *newer = "newer than client. " - "Consider recompiling application"; + "Consider upgrading client"; LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) " "is much %s (%s)\n", @@ -836,37 +878,18 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, warned = true; } -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* Check if server has LU-1252 fix applied to not always swab - * the IR MNE entries. Do this only once per connection. This - * fixup is version-limited, because we don't want to carry the - * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we - * need interop with unpatched 2.2 servers. For newer servers, - * the client will do MNE swabbing only as needed. LU-1644 */ - if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && - OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && - strcmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MGC_NAME) == 0)) - imp->imp_need_mne_swab = 1; - else /* clear if server was upgraded since last connect */ - imp->imp_need_mne_swab = 0; -#endif - if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { /* We sent to the server ocd_cksum_types with bits set * for algorithms we understand. 
The server masked off * the checksum types it doesn't support */ if ((ocd->ocd_cksum_types & - cksum_types_supported_client()) == 0) { + obd_cksum_types_supported_client()) == 0) { LCONSOLE_ERROR("The negotiation of the checksum " "alogrithm to use with server %s " "failed (%x/%x)\n", obd2cli_tgt(imp->imp_obd), ocd->ocd_cksum_types, - cksum_types_supported_client()); + obd_cksum_types_supported_client()); return -EPROTO; } else { cli->cl_supp_cksum_types = ocd->ocd_cksum_types; @@ -876,7 +899,8 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, * Enforce ADLER for backward compatibility*/ cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; } - cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); + cli->cl_cksum_type = obd_cksum_type_select(imp->imp_obd->obd_name, + cli->cl_supp_cksum_types); if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) cli->cl_max_pages_per_rpc = @@ -905,13 +929,17 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp, * this leads to losing user settings done before such as * disable lru_resize, etc. */ if (old_connect_flags != exp_connect_flags(exp) || init_connect) { + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + __u64 changed_flags; + + changed_flags = + ns->ns_connect_flags ^ ns->ns_orig_connect_flags; CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " "flags: %#llx\n", imp->imp_obd->obd_name, ocd->ocd_connect_flags); - imp->imp_obd->obd_namespace->ns_connect_flags = - ocd->ocd_connect_flags; - imp->imp_obd->obd_namespace->ns_orig_connect_flags = - ocd->ocd_connect_flags; + ns->ns_connect_flags = (ns->ns_connect_flags & changed_flags) | + (ocd->ocd_connect_flags & ~changed_flags); + ns->ns_orig_connect_flags = ocd->ocd_connect_flags; } if (ocd->ocd_connect_flags & OBD_CONNECT_AT) @@ -977,6 +1005,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, struct obd_import *imp = request->rq_import; struct lustre_handle old_hdl; __u64 old_connect_flags; + timeout_t service_timeout; int msg_flags; struct obd_connect_data *ocd; struct obd_export *exp = NULL; @@ -991,11 +1020,25 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, } if (rc) { + struct ptlrpc_request *free_req; + struct ptlrpc_request *tmp; + + /* abort all delayed requests initiated connection */ + list_for_each_entry_safe(free_req, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&free_req->rq_lock); + if (free_req->rq_no_resend) { + free_req->rq_err = 1; + free_req->rq_status = -EIO; + ptlrpc_client_wake_req(free_req); + } + spin_unlock(&free_req->rq_lock); + } + /* if this reconnect to busy export - not need select new target * for connecting*/ imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); spin_unlock(&imp->imp_lock); - ptlrpc_maybe_ping_import_soon(imp); GOTO(out, rc); } @@ -1095,10 +1138,11 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_obd->obd_self_export->exp_connect_data = *ocd; /* The net statistics after (re-)connect is not valid anymore, - * because may reflect other routing, etc. */ + * because may reflect other routing, etc. 
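/*
 * The checksum handshake above boils down to a small mask negotiation:
 * the client advertises the types it supports, the server returns the
 * subset it also supports, and the connection is refused (-EPROTO) when
 * that subset is empty; otherwise the client picks one of the common
 * types.  A compact sketch with plain bitmasks; the flag values and the
 * preference order are illustrative, not the real OBD_CKSUM_* constants
 * or the policy of obd_cksum_type_select().
 */
#include <stdint.h>

#define CK_SKETCH_CRC32         0x1u
#define CK_SKETCH_ADLER         0x2u
#define CK_SKETCH_CRC32C        0x4u

/* Returns the chosen type, or 0 when client and server share nothing. */
static uint32_t negotiate_cksum_sketch(uint32_t client_types,
                                       uint32_t server_types)
{
        uint32_t common = client_types & server_types;

        if (common == 0)
                return 0;               /* caller treats this as -EPROTO */
        if (common & CK_SKETCH_CRC32C)
                return CK_SKETCH_CRC32C;
        if (common & CK_SKETCH_ADLER)
                return CK_SKETCH_ADLER;
        return CK_SKETCH_CRC32;
}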
+ */ + service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg); at_reinit(&imp->imp_at.iat_net_latency, 0, 0); - ptlrpc_at_adj_net_latency(request, - lustre_msg_get_service_time(request->rq_repmsg)); + ptlrpc_at_adj_net_latency(request, service_timeout); /* Import flags should be updated before waking import at FULL state */ rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, @@ -1115,12 +1159,10 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, spin_lock(&imp->imp_lock); if (msg_flags & MSG_CONNECT_REPLAYABLE) { imp->imp_replayable = 1; - spin_unlock(&imp->imp_lock); CDEBUG(D_HA, "connected to replayable target: %s\n", obd2cli_tgt(imp->imp_obd)); } else { imp->imp_replayable = 0; - spin_unlock(&imp->imp_lock); } /* if applies, adjust the imp->imp_msg_magic here @@ -1135,10 +1177,11 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, if (msg_flags & MSG_CONNECT_RECOVERING) { CDEBUG(D_HA, "connect to %s during recovery\n", obd2cli_tgt(imp->imp_obd)); - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + import_set_state_nolock(imp, LUSTRE_IMP_REPLAY_LOCKS); + spin_unlock(&imp->imp_lock); } else { - IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); + spin_unlock(&imp->imp_lock); + ptlrpc_activate_import(imp, true); } GOTO(finish, rc = 0); @@ -1196,7 +1239,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); if (!(MSG_CONNECT_RECOVERING & msg_flags)) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + import_set_state(imp, LUSTRE_IMP_EVICTED); GOTO(finish, rc = 0); } @@ -1209,7 +1252,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, if (imp->imp_invalid) { CDEBUG(D_HA, "%s: reconnected but import is invalid; " "marking evicted\n", imp->imp_obd->obd_name); - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + import_set_state(imp, LUSTRE_IMP_EVICTED); } else if (MSG_CONNECT_RECOVERING & msg_flags) { CDEBUG(D_HA, "%s: reconnected to %s during replay\n", imp->imp_obd->obd_name, @@ -1219,9 +1262,9 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_resend_replay = 1; spin_unlock(&imp->imp_lock); - IMPORT_SET_STATE(imp, imp->imp_replay_state); + import_set_state(imp, imp->imp_replay_state); } else { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); } } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { LASSERT(imp->imp_replayable); @@ -1229,13 +1272,13 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); imp->imp_last_replay_transno = 0; imp->imp_replay_cursor = &imp->imp_committed_list; - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); - } else { + import_set_state(imp, LUSTRE_IMP_REPLAY); + } else { DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" " not set: %x)", imp->imp_obd->obd_name, msg_flags); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + import_set_state(imp, LUSTRE_IMP_EVICTED); } /* Sanity checks for a reconnected import. 
*/ @@ -1272,40 +1315,45 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, } out: + if (exp != NULL) + class_export_put(exp); + spin_lock(&imp->imp_lock); imp->imp_connected = 0; imp->imp_connect_tried = 1; - spin_unlock(&imp->imp_lock); - if (exp != NULL) - class_export_put(exp); - - if (rc != 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); - if (rc == -EACCES) { - /* - * Give up trying to reconnect - * EACCES means client has no permission for connection - */ - imp->imp_obd->obd_no_recov = 1; - ptlrpc_deactivate_import(imp); - } + if (rc != 0) { + bool inact = false; + time64_t now = ktime_get_seconds(); + time64_t next_connect; + + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } else if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) { + spin_unlock(&imp->imp_lock); + RETURN(-EPROTO); + } - if (rc == -EPROTO) { - struct obd_connect_data *ocd; - - /* reply message might not be ready */ - if (request->rq_repmsg == NULL) - RETURN(-EPROTO); - - ocd = req_capsule_server_get(&request->rq_pill, - &RMF_CONNECT_DATA); - if (ocd && - (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - (ocd->ocd_version != LUSTRE_VERSION_CODE)) { - /* Actually servers are only supposed to refuse - connection from liblustre clients, so we should - never see this from VFS context */ + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + /* Servers are not supposed to refuse connections from + * clients based on version, only connection feature + * flags. We should never see this from llite, but it + * may be useful for debugging in the future. */ + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { LCONSOLE_ERROR_MSG(0x16a, "Server %s version " "(%d.%d.%d.%d)" " refused connection from this client " @@ -1317,17 +1365,59 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, OBD_OCD_VERSION_PATCH(ocd->ocd_version), OBD_OCD_VERSION_FIX(ocd->ocd_version), LUSTRE_VERSION_STRING); - ptlrpc_deactivate_import(imp); - IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); - } - RETURN(-EPROTO); - } + ptlrpc_deactivate_import_nolock(imp); + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); + inact = true; + } + } else if (rc == -ENODEV || rc == -ETIMEDOUT) { + /* ENODEV means there is no service, force reconnection + * to a pair if attempt happen ptlrpc_next_reconnect + * before now. ETIMEDOUT could be set during network + * error and do not guarantee request deadline happened. + */ + struct obd_import_conn *conn; + time64_t reconnect_time; + + /* Same as ptlrpc_next_reconnect, but in past */ + reconnect_time = now - INITIAL_CONNECT_TIMEOUT; + list_for_each_entry(conn, &imp->imp_conn_list, + oic_item) { + if (conn->oic_last_attempt <= reconnect_time) { + imp->imp_force_verify = 1; + break; + } + } + } + + next_connect = imp->imp_conn_current->oic_last_attempt + + (request->rq_deadline - request->rq_sent); + spin_unlock(&imp->imp_lock); + + if (inact) + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + if (rc == -EPROTO) + RETURN(rc); + + /* adjust imp_next_ping to request deadline + 1 and reschedule + * a pinger if import lost processing during CONNECTING or far + * away from request deadline. 
It could happen when connection + * was initiated outside of pinger, like + * ptlrpc_set_import_discon(). + */ + if (!imp->imp_force_verify && (imp->imp_next_ping <= now || + imp->imp_next_ping > next_connect)) { + imp->imp_next_ping = max(now, next_connect) + 1; + ptlrpc_pinger_wake_up(); + } ptlrpc_maybe_ping_import_soon(imp); CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", obd2cli_tgt(imp->imp_obd), (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } else { + spin_unlock(&imp->imp_lock); } wake_up_all(&imp->imp_recovery_waitq); @@ -1376,8 +1466,8 @@ static int signal_completed_replay(struct obd_import *imp) if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) RETURN(0); - LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); - atomic_inc(&imp->imp_replay_inflight); + if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1)) + RETURN(0); req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, OBD_PING); @@ -1422,7 +1512,7 @@ static int ptlrpc_invalidate_import_thread(void *data) libcfs_debug_dumplog(); } - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); ptlrpc_import_recovery_state_machine(imp); class_import_put(imp); @@ -1458,6 +1548,8 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) ENTRY; if (imp->imp_state == LUSTRE_IMP_EVICTED) { + struct task_struct *task; + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); /* Don't care about MGC eviction */ @@ -1468,6 +1560,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) "using this service will fail.\n", imp->imp_obd->obd_name, target_len, target_start); + LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction"); } CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", obd2cli_tgt(imp->imp_obd), @@ -1477,24 +1570,22 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) imp->imp_vbr_failed = 0; spin_unlock(&imp->imp_lock); - { - struct task_struct *task; /* bug 17802: XXX client_disconnect_export vs connect request * race. if client is evicted at this time then we start * invalidate thread without reference to import and import can * be freed at same time. 
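/*
 * The failure path above nudges the pinger so that a connect retry is
 * neither forgotten nor pushed past the window the failed request
 * implied: the next attempt should come no earlier than "now" and no
 * later than last_attempt + (deadline - sent).  The same arithmetic as
 * a standalone helper, with plain seconds and simplified parameter
 * names standing in for the imp_* / rq_* fields.
 */
#include <stdbool.h>
#include <stdint.h>

static int64_t max_i64(int64_t a, int64_t b)
{
        return a > b ? a : b;
}

static int64_t reschedule_ping_sketch(int64_t now, int64_t next_ping,
                                      int64_t last_attempt, int64_t deadline,
                                      int64_t sent, bool force_verify)
{
        int64_t next_connect = last_attempt + (deadline - sent);

        /* only adjust when the current schedule is stale or too far out */
        if (!force_verify && (next_ping <= now || next_ping > next_connect))
                next_ping = max_i64(now, next_connect) + 1;

        return next_ping;
}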
*/ class_import_get(imp); task = kthread_run(ptlrpc_invalidate_import_thread, imp, - "ll_imp_inval"); + "ll_imp_inval"); if (IS_ERR(task)) { class_import_put(imp); - CERROR("error starting invalidate thread: %d\n", rc); rc = PTR_ERR(task); + CERROR("%s: can't start invalidate thread: rc = %d\n", + imp->imp_obd->obd_name, rc); } else { rc = 0; } RETURN(rc); - } } if (imp->imp_state == LUSTRE_IMP_REPLAY) { @@ -1503,7 +1594,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) rc = ptlrpc_replay_next(imp, &inflight); if (inflight == 0 && atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + import_set_state(imp, LUSTRE_IMP_REPLAY_LOCKS); rc = ldlm_replay_locks(imp); if (rc) GOTO(out, rc); @@ -1513,7 +1604,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { if (atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); + import_set_state(imp, LUSTRE_IMP_REPLAY_WAIT); rc = signal_completed_replay(imp); if (rc) GOTO(out, rc); @@ -1522,24 +1613,28 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { if (atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + import_set_state(imp, LUSTRE_IMP_RECOVER); } } - if (imp->imp_state == LUSTRE_IMP_RECOVER) { + if (imp->imp_state == LUSTRE_IMP_RECOVER) { struct ptlrpc_connection *conn = imp->imp_connection; - rc = ptlrpc_resend(imp); - if (rc) - GOTO(out, rc); - IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); - - LCONSOLE_INFO("%s: Connection restored to %s (at %s)\n", - imp->imp_obd->obd_name, - obd_uuid2str(&conn->c_remote_uuid), - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + ptlrpc_activate_import(imp, true); + + CDEBUG_LIMIT(imp->imp_was_idle ? 
+ imp->imp_idle_debug : D_CONSOLE, + "%s: Connection restored to %s (at %s)\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + obd_import_nid2str(imp)); + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 0; + spin_unlock(&imp->imp_lock); + } if (imp->imp_state == LUSTRE_IMP_FULL) { wake_up_all(&imp->imp_recovery_waitq); @@ -1550,15 +1645,12 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) RETURN(rc); } -int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) { struct ptlrpc_request *req; int rq_opc, rc = 0; ENTRY; - if (imp->imp_obd->obd_force) - GOTO(set_state, rc); - switch (imp->imp_connect_op) { case OST_CONNECT: rq_opc = OST_DISCONNECT; @@ -1575,26 +1667,67 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) "(connect_op %d): rc = %d\n", imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), imp->imp_connect_op, rc); - RETURN(rc); + RETURN(ERR_PTR(rc)); } - if (ptlrpc_import_in_recovery(imp)) { - struct l_wait_info lwi; - cfs_duration_t timeout; + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); - if (AT_OFF) { - if (imp->imp_server_timeout) - timeout = cfs_time_seconds(obd_timeout / 2); - else - timeout = cfs_time_seconds(obd_timeout); - } else { - int idx = import_at_get_index(imp, - imp->imp_client->cli_request_portal); - timeout = cfs_time_seconds( - at_get(&imp->imp_at.iat_service_estimate[idx])); + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(timeout_t, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + import_set_state(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + + RETURN(req); +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rc = 0; + ENTRY; + + if (imp->imp_obd->obd_force) + GOTO(set_state, rc); + + /* probably the import has been disconnected already being idle */ + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_IDLE) + GOTO(out, rc); + spin_unlock(&imp->imp_lock); + + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + long timeout_jiffies; + time64_t timeout; + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = obd_timeout >> 1; + else + timeout = obd_timeout; + } else { + u32 req_portal; + int idx; + + req_portal = imp->imp_client->cli_request_portal; + idx = import_at_get_index(imp, req_portal); + timeout = at_get(&imp->imp_at.iat_service_estimate[idx]); } - lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), + timeout_jiffies = cfs_time_seconds(timeout); + lwi = LWI_TIMEOUT_INTR(max_t(long, timeout_jiffies, 1), back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); @@ -1606,33 +1739,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) GOTO(out, rc); spin_unlock(&imp->imp_lock); - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, - LUSTRE_OBD_VERSION, rq_opc); - if (req) { - /* We are disconnecting, do not retry a failed DISCONNECT rpc if - * it fails. 
We can get through the above with a down server - * if the client doesn't know the server is gone yet. */ - req->rq_no_resend = 1; - - /* We want client umounts to happen quickly, no matter the - server state... */ - req->rq_timeout = min_t(int, req->rq_timeout, - INITIAL_CONNECT_TIMEOUT); - - IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); - req->rq_send_state = LUSTRE_IMP_CONNECTING; - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - } + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + GOTO(set_state, rc = PTR_ERR(req)); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); set_state: spin_lock(&imp->imp_lock); out: if (noclose) - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + import_set_state_nolock(imp, LUSTRE_IMP_DISCON); else - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); spin_unlock(&imp->imp_lock); @@ -1642,16 +1761,116 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) } EXPORT_SYMBOL(ptlrpc_disconnect_import); +static void ptlrpc_reset_reqs_generation(struct obd_import *imp) +{ + struct ptlrpc_request *old, *tmp; + + /* tag all resendable requests generated before disconnection + * notice this code is part of disconnect-at-idle path only */ + list_for_each_entry_safe(old, tmp, &imp->imp_delayed_list, + rq_list) { + spin_lock(&old->rq_lock); + if (old->rq_import_generation == imp->imp_generation - 1 && + !old->rq_no_resend) + old->rq_import_generation = imp->imp_generation; + spin_unlock(&old->rq_lock); + } +} + +static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *data, int rc) +{ + struct obd_import *imp = req->rq_import; + int connect = 0; + + DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d ", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_refcount), rc); + + spin_lock(&imp->imp_lock); + /* DISCONNECT reply can be late and another connection can just + * be initiated. so we have to abort disconnection. 
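/*
 * ptlrpc_reset_reqs_generation() above exists so that requests queued
 * while the idle disconnect was in flight are not failed for carrying a
 * stale generation once the import reconnects: resendable requests
 * tagged with the previous generation are simply re-tagged with the
 * current one.  A reduced sketch over a plain array; the struct and
 * field names are stand-ins for ptlrpc_request / imp_delayed_list.
 */
struct delayed_req_sketch {
        int generation;
        int no_resend;
};

static void reset_reqs_generation_sketch(struct delayed_req_sketch *reqs,
                                         int nr, int imp_generation)
{
        int i;

        for (i = 0; i < nr; i++) {
                /* only re-tag requests from the generation being retired */
                if (reqs[i].generation == imp_generation - 1 &&
                    !reqs[i].no_resend)
                        reqs[i].generation = imp_generation;
        }
}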
*/ + if (req->rq_import_generation == imp->imp_generation && + imp->imp_state != LUSTRE_IMP_CLOSED) { + LASSERTF(imp->imp_state == LUSTRE_IMP_CONNECTING, + "%s\n", ptlrpc_import_state_name(imp->imp_state)); + memset(&imp->imp_remote_handle, 0, + sizeof(imp->imp_remote_handle)); + /* take our DISCONNECT into account */ + if (atomic_read(&imp->imp_reqs) > 1) { + imp->imp_generation++; + imp->imp_initiated_at = imp->imp_generation; + import_set_state_nolock(imp, LUSTRE_IMP_NEW); + ptlrpc_reset_reqs_generation(imp); + connect = 1; + } else { + /* do not expose transient IDLE state */ + import_set_state_nolock(imp, LUSTRE_IMP_IDLE); + } + } + + if (connect) { + rc = ptlrpc_connect_import_locked(imp); + if (rc >= 0) + ptlrpc_pinger_add_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + + return 0; +} + +int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (imp->imp_obd->obd_force) + RETURN(0); + + if (ptlrpc_import_in_recovery(imp)) + RETURN(0); + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + RETURN(0); + } + spin_unlock(&imp->imp_lock); + + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", + imp->imp_obd->obd_name, + ktime_get_real_seconds() - imp->imp_last_reply_time); + + /* don't make noise at reconnection */ + spin_lock(&imp->imp_lock); + imp->imp_was_idle = 1; + spin_unlock(&imp->imp_lock); + + req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; + ptlrpcd_add_req(req); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); + void ptlrpc_cleanup_imp(struct obd_import *imp) { ENTRY; spin_lock(&imp->imp_lock); - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + + import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); imp->imp_generation++; - spin_unlock(&imp->imp_lock); ptlrpc_abort_inflight(imp); + spin_unlock(&imp->imp_lock); + EXIT; } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c index d720645bafc16..7db9465a3569f 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
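/*
 * Reduced to its decision, the interpret callback above does one of two
 * things when the DISCONNECT reply arrives: if any request other than
 * the DISCONNECT itself is still accounted against the import, the idle
 * shutdown is abandoned (new generation, immediate reconnect); only a
 * truly quiet import is parked in the IDLE state.  Field names below
 * are simplified stand-ins for imp_reqs / imp_generation.
 */
#include <stdbool.h>

struct idle_import_sketch {
        int  inflight_reqs;     /* counts the DISCONNECT request too */
        int  generation;
        bool idle;
};

/* Returns true when the import should reconnect right away. */
static bool finish_idle_disconnect_sketch(struct idle_import_sketch *imp)
{
        if (imp->inflight_reqs > 1) {
                imp->generation++;      /* retire the old generation */
                imp->idle = false;
                return true;
        }
        imp->idle = true;               /* nothing pending: stay disconnected */
        return false;
}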
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -46,18 +46,16 @@ #include -#include - #include #include #include -#include #include #include /* struct ptlrpc_request, lustre_msg* */ #include #include +#include /* * RQFs (see below) refer to two struct req_msg_field arrays describing the @@ -90,11 +88,6 @@ static const struct req_msg_field *mgs_config_read_server[] = { &RMF_MGS_CONFIG_RES }; -static const struct req_msg_field *log_cancel_client[] = { - &RMF_PTLRPC_BODY, - &RMF_LOGCOOKIES -}; - static const struct req_msg_field *mdt_body_only[] = { &RMF_PTLRPC_BODY, &RMF_MDT_BODY @@ -137,12 +130,13 @@ static const struct req_msg_field *mdt_close_client[] = { &RMF_CAPA1 }; -static const struct req_msg_field *mdt_intent_close_client[] = { +static const struct req_msg_field *mdt_close_intent_client[] = { &RMF_PTLRPC_BODY, &RMF_MDT_EPOCH, &RMF_REC_REINT, &RMF_CAPA1, - &RMF_CLOSE_DATA + &RMF_CLOSE_DATA, + &RMF_U32 }; static const struct req_msg_field *obd_statfs_server[] = { @@ -218,7 +212,8 @@ static const struct req_msg_field *mds_reint_create_acl_client[] = { &RMF_EADATA, &RMF_DLM_REQ, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_create_sym_client[] = { @@ -229,7 +224,8 @@ static const struct req_msg_field *mds_reint_create_sym_client[] = { &RMF_SYMTGT, &RMF_DLM_REQ, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_open_client[] = { @@ -240,7 +236,8 @@ static const struct req_msg_field *mds_reint_open_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_open_server[] = { @@ -253,30 +250,33 @@ static const struct req_msg_field *mds_reint_open_server[] = { }; static const struct req_msg_field *mds_reint_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_DLM_REQ + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_link_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_DLM_REQ + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_rename_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_SYMTGT, - &RMF_DLM_REQ + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_reint_migrate_client[] = { @@ -287,8 +287,10 @@ static const struct req_msg_field *mds_reint_migrate_client[] = { &RMF_NAME, &RMF_SYMTGT, &RMF_DLM_REQ, + &RMF_SELINUX_POL, &RMF_MDT_EPOCH, - &RMF_CLOSE_DATA + &RMF_CLOSE_DATA, + &RMF_EADATA }; static const struct req_msg_field *mds_last_unlink_server[] = { @@ -316,6 +318,13 @@ static const struct req_msg_field *mds_reint_setxattr_client[] = { &RMF_CAPA1, &RMF_NAME, &RMF_EADATA, + &RMF_DLM_REQ, + &RMF_SELINUX_POL +}; + +static const struct req_msg_field *mds_reint_resync[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, &RMF_DLM_REQ }; @@ -328,12 +337,28 @@ static const struct req_msg_field *mdt_swap_layouts[] = { &RMF_DLM_REQ }; +static const struct req_msg_field *mds_rmfid_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_CAPA1, + 
&RMF_CAPA2, +}; + +static const struct req_msg_field *mds_rmfid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_FID_ARRAY, + &RMF_RCS, +}; + static const struct req_msg_field *obd_connect_client[] = { - &RMF_PTLRPC_BODY, - &RMF_TGTUUID, - &RMF_CLUUID, - &RMF_CONN, - &RMF_CONNECT_DATA + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA, + &RMF_SELINUX_POL }; static const struct req_msg_field *obd_connect_server[] = { @@ -425,32 +450,37 @@ static const struct req_msg_field *ldlm_intent_layout_client[] = { &RMF_LAYOUT_INTENT, &RMF_EADATA /* for new layout to be set up */ }; + static const struct req_msg_field *ldlm_intent_open_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2 + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NIOBUF_INLINE, + &RMF_FILE_SECCTX }; static const struct req_msg_field *ldlm_intent_getattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ - &RMF_CAPA1, - &RMF_NAME + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_FILE_SECCTX_NAME }; static const struct req_msg_field *ldlm_intent_getattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1 + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_FILE_SECCTX }; static const struct req_msg_field *ldlm_intent_create_client[] = { @@ -462,7 +492,8 @@ static const struct req_msg_field *ldlm_intent_create_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_open_client[] = { @@ -475,16 +506,8 @@ static const struct req_msg_field *ldlm_intent_open_client[] = { &RMF_NAME, &RMF_EADATA, &RMF_FILE_SECCTX_NAME, - &RMF_FILE_SECCTX -}; - -static const struct req_msg_field *ldlm_intent_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ - &RMF_CAPA1, - &RMF_NAME + &RMF_FILE_SECCTX, + &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_getxattr_client[] = { @@ -493,6 +516,7 @@ static const struct req_msg_field *ldlm_intent_getxattr_client[] = { &RMF_LDLM_INTENT, &RMF_MDT_BODY, &RMF_CAPA1, + &RMF_SELINUX_POL }; static const struct req_msg_field *ldlm_intent_getxattr_server[] = { @@ -513,11 +537,12 @@ static const struct req_msg_field *mds_get_root_client[] = { }; static const struct req_msg_field *mds_getxattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_SELINUX_POL }; static const struct req_msg_field *mds_getxattr_server[] = { @@ -571,11 +596,6 @@ static const struct req_msg_field *llog_log_hdr_only[] = { &RMF_LLOG_LOG_HDR }; -static const struct req_msg_field *llogd_conn_body_only[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOGD_CONN_BODY -}; - static const struct req_msg_field *llog_origin_handle_next_block_server[] = { &RMF_PTLRPC_BODY, &RMF_LLOGD_BODY, @@ -612,16 +632,18 @@ static const struct req_msg_field *ost_destroy_client[] = { static const struct req_msg_field *ost_brw_client[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_OBD_IOOBJ, - 
&RMF_NIOBUF_REMOTE, - &RMF_CAPA1 + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1, + &RMF_SHORT_IO }; static const struct req_msg_field *ost_brw_read_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_SHORT_IO }; static const struct req_msg_field *ost_brw_write_server[] = { @@ -729,43 +751,45 @@ static const struct req_msg_field *obd_lfsck_reply[] = { }; static struct req_format *req_formats[] = { - &RQF_OBD_PING, - &RQF_OBD_SET_INFO, + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, &RQF_OBD_IDX_READ, - &RQF_SEC_CTX, - &RQF_MGS_TARGET_REG, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 13, 53, 0) - &RQF_MGS_SET_INFO, + &RQF_MGS_SET_INFO, #endif - &RQF_MGS_CONFIG_READ, - &RQF_SEQ_QUERY, - &RQF_FLD_QUERY, + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, &RQF_FLD_READ, - &RQF_MDS_CONNECT, - &RQF_MDS_DISCONNECT, - &RQF_MDS_GET_INFO, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, &RQF_MDS_GET_ROOT, - &RQF_MDS_STATFS, - &RQF_MDS_GETATTR, - &RQF_MDS_GETATTR_NAME, - &RQF_MDS_GETXATTR, - &RQF_MDS_SYNC, - &RQF_MDS_CLOSE, - &RQF_MDS_INTENT_CLOSE, + &RQF_MDS_STATFS, + &RQF_MDS_STATFS_NEW, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_CLOSE_INTENT, &RQF_MDS_READPAGE, &RQF_MDS_REINT, &RQF_MDS_REINT_CREATE, &RQF_MDS_REINT_CREATE_ACL, - &RQF_MDS_REINT_CREATE_SLAVE, - &RQF_MDS_REINT_CREATE_SYM, - &RQF_MDS_REINT_OPEN, - &RQF_MDS_REINT_UNLINK, - &RQF_MDS_REINT_LINK, - &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, &RQF_MDS_REINT_MIGRATE, - &RQF_MDS_REINT_SETATTR, - &RQF_MDS_REINT_SETXATTR, - &RQF_MDS_QUOTACTL, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_REINT_RESYNC, + &RQF_MDS_QUOTACTL, &RQF_MDS_HSM_PROGRESS, &RQF_MDS_HSM_CT_REGISTER, &RQF_MDS_HSM_CT_UNREGISTER, @@ -774,22 +798,23 @@ static struct req_format *req_formats[] = { &RQF_MDS_HSM_ACTION, &RQF_MDS_HSM_REQUEST, &RQF_MDS_SWAP_LAYOUTS, + &RQF_MDS_RMFID, &RQF_OUT_UPDATE, - &RQF_OST_CONNECT, - &RQF_OST_DISCONNECT, - &RQF_OST_QUOTACTL, - &RQF_OST_GETATTR, - &RQF_OST_SETATTR, - &RQF_OST_CREATE, - &RQF_OST_PUNCH, - &RQF_OST_SYNC, - &RQF_OST_DESTROY, - &RQF_OST_BRW_READ, - &RQF_OST_BRW_WRITE, - &RQF_OST_STATFS, - &RQF_OST_SET_GRANT_INFO, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, &RQF_OST_GET_INFO, - &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_ID, &RQF_OST_GET_INFO_LAST_FID, &RQF_OST_SET_INFO_LAST_FID, &RQF_OST_GET_INFO_FIEMAP, @@ -799,27 +824,23 @@ static struct req_format *req_formats[] = { &RQF_LDLM_CONVERT, &RQF_LDLM_CANCEL, &RQF_LDLM_CALLBACK, - &RQF_LDLM_CP_CALLBACK, - &RQF_LDLM_BL_CALLBACK, - &RQF_LDLM_GL_CALLBACK, - &RQF_LDLM_GL_DESC_CALLBACK, - &RQF_LDLM_INTENT, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_CALLBACK_DESC, + &RQF_LDLM_INTENT, &RQF_LDLM_INTENT_BASIC, - &RQF_LDLM_INTENT_LAYOUT, - &RQF_LDLM_INTENT_GETATTR, - &RQF_LDLM_INTENT_OPEN, - &RQF_LDLM_INTENT_CREATE, - &RQF_LDLM_INTENT_UNLINK, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, &RQF_LDLM_INTENT_GETXATTR, 
&RQF_LDLM_INTENT_QUOTA, &RQF_QUOTA_DQACQ, - &RQF_LOG_CANCEL, - &RQF_LLOG_ORIGIN_HANDLE_CREATE, - &RQF_LLOG_ORIGIN_HANDLE_DESTROY, - &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, - &RQF_LLOG_ORIGIN_CONNECT, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, &RQF_CONNECT, &RQF_LFSCK_NOTIFY, &RQF_LFSCK_QUERY, @@ -901,8 +922,8 @@ struct req_msg_field RMF_MGS_CONFIG_RES = EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); struct req_msg_field RMF_U32 = - DEFINE_MSGF("generic u32", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); + DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY, + sizeof(__u32), lustre_swab_generic_32s, NULL); EXPORT_SYMBOL(RMF_U32); struct req_msg_field RMF_SETINFO_VAL = @@ -988,6 +1009,10 @@ struct req_msg_field RMF_NAME = DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); EXPORT_SYMBOL(RMF_NAME); +struct req_msg_field RMF_FID_ARRAY = + DEFINE_MSGF("fid_array", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_FID_ARRAY); + struct req_msg_field RMF_SYMTGT = DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); EXPORT_SYMBOL(RMF_SYMTGT); @@ -1011,7 +1036,7 @@ struct req_msg_field RMF_FILE_SECCTX_NAME = EXPORT_SYMBOL(RMF_FILE_SECCTX_NAME); struct req_msg_field RMF_FILE_SECCTX = - DEFINE_MSGF("file_secctx", 0, -1, NULL, NULL); + DEFINE_MSGF("file_secctx", RMF_F_NO_SIZE_CHECK, -1, NULL, NULL); EXPORT_SYMBOL(RMF_FILE_SECCTX); struct req_msg_field RMF_LLOGD_BODY = @@ -1098,13 +1123,11 @@ struct req_msg_field RMF_LOGCOOKIES = EXPORT_SYMBOL(RMF_LOGCOOKIES); struct req_msg_field RMF_CAPA1 = - DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), - lustre_swab_lustre_capa, NULL); + DEFINE_MSGF("capa", 0, 0, NULL, NULL); EXPORT_SYMBOL(RMF_CAPA1); struct req_msg_field RMF_CAPA2 = - DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), - lustre_swab_lustre_capa, NULL); + DEFINE_MSGF("capa", 0, 0, NULL, NULL); EXPORT_SYMBOL(RMF_CAPA2); struct req_msg_field RMF_LAYOUT_INTENT = @@ -1113,6 +1136,10 @@ struct req_msg_field RMF_LAYOUT_INTENT = NULL); EXPORT_SYMBOL(RMF_LAYOUT_INTENT); +struct req_msg_field RMF_SELINUX_POL = + DEFINE_MSGF("selinux_pol", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SELINUX_POL); + /* * OST request field. 
*/ @@ -1133,9 +1160,15 @@ struct req_msg_field RMF_NIOBUF_REMOTE = dump_rniobuf); EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); +struct req_msg_field RMF_NIOBUF_INLINE = + DEFINE_MSGF("niobuf_inline", RMF_F_NO_SIZE_CHECK, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_INLINE); + struct req_msg_field RMF_RCS = - DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), - lustre_swab_generic_32s, dump_rcs); + DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); EXPORT_SYMBOL(RMF_RCS); struct req_msg_field RMF_EAVALS_LENS = @@ -1159,8 +1192,8 @@ struct req_msg_field RMF_OST_ID = EXPORT_SYMBOL(RMF_OST_ID); struct req_msg_field RMF_FIEMAP_KEY = - DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), - lustre_swab_fiemap, NULL); + DEFINE_MSGF("fiemap_key", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap_info_key, NULL); EXPORT_SYMBOL(RMF_FIEMAP_KEY); struct req_msg_field RMF_FIEMAP_VAL = @@ -1171,6 +1204,9 @@ struct req_msg_field RMF_IDX_INFO = DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), lustre_swab_idx_info, NULL); EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_SHORT_IO = + DEFINE_MSGF("short_io", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SHORT_IO); struct req_msg_field RMF_HSM_USER_STATE = DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), lustre_swab_hsm_user_state, NULL); @@ -1198,7 +1234,7 @@ struct req_msg_field RMF_MDS_HSM_USER_ITEM = EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); struct req_msg_field RMF_MDS_HSM_ARCHIVE = - DEFINE_MSGF("hsm_archive", 0, + DEFINE_MSGF("hsm_archive", RMF_F_STRUCT_ARRAY, sizeof(__u32), lustre_swab_generic_32s, NULL); EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); @@ -1344,10 +1380,6 @@ struct req_format RQF_FLD_READ = DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server); EXPORT_SYMBOL(RQF_FLD_READ); -struct req_format RQF_LOG_CANCEL = - DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); -EXPORT_SYMBOL(RQF_LOG_CANCEL); - struct req_format RQF_MDS_QUOTACTL = DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); EXPORT_SYMBOL(RQF_MDS_QUOTACTL); @@ -1371,9 +1403,13 @@ struct req_format RQF_MDS_GET_ROOT = EXPORT_SYMBOL(RQF_MDS_GET_ROOT); struct req_format RQF_MDS_STATFS = - DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); EXPORT_SYMBOL(RQF_MDS_STATFS); +struct req_format RQF_MDS_STATFS_NEW = + DEFINE_REQ_FMT0("MDS_STATFS_NEW", mdt_body_only, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS_NEW); + struct req_format RQF_MDS_SYNC = DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_SYNC); @@ -1451,6 +1487,10 @@ struct req_format RQF_MDS_REINT_SETXATTR = mds_reint_setxattr_client, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); +struct req_format RQF_MDS_REINT_RESYNC = + DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC); + struct req_format RQF_MDS_CONNECT = DEFINE_REQ_FMT0("MDS_CONNECT", obd_connect_client, obd_connect_server); @@ -1506,10 +1546,10 @@ struct req_format RQF_LDLM_GL_CALLBACK = ldlm_gl_callback_server); EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); -struct req_format RQF_LDLM_GL_DESC_CALLBACK = +struct req_format RQF_LDLM_GL_CALLBACK_DESC = DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, ldlm_gl_callback_server); -EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK_DESC); struct req_format RQF_LDLM_INTENT_BASIC = 
DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", @@ -1522,7 +1562,7 @@ struct req_format RQF_LDLM_INTENT = EXPORT_SYMBOL(RQF_LDLM_INTENT); struct req_format RQF_LDLM_INTENT_LAYOUT = - DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT", ldlm_intent_layout_client, ldlm_enqueue_lvb_server); EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); @@ -1541,11 +1581,6 @@ struct req_format RQF_LDLM_INTENT_CREATE = ldlm_intent_create_client, ldlm_intent_getattr_server); EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); -struct req_format RQF_LDLM_INTENT_UNLINK = - DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", - ldlm_intent_unlink_client, ldlm_intent_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); - struct req_format RQF_LDLM_INTENT_GETXATTR = DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", ldlm_intent_getxattr_client, @@ -1557,10 +1592,10 @@ struct req_format RQF_MDS_CLOSE = mdt_close_client, mds_last_unlink_server); EXPORT_SYMBOL(RQF_MDS_CLOSE); -struct req_format RQF_MDS_INTENT_CLOSE = - DEFINE_REQ_FMT0("MDS_CLOSE", - mdt_intent_close_client, mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE); +struct req_format RQF_MDS_CLOSE_INTENT = + DEFINE_REQ_FMT0("MDS_CLOSE_INTENT", + mdt_close_intent_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE_INTENT); struct req_format RQF_MDS_READPAGE = DEFINE_REQ_FMT0("MDS_READPAGE", @@ -1601,16 +1636,16 @@ struct req_format RQF_MDS_SWAP_LAYOUTS = mdt_swap_layouts, empty); EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); +struct req_format RQF_MDS_RMFID = + DEFINE_REQ_FMT0("MDS_RMFID", mds_rmfid_client, + mds_rmfid_server); +EXPORT_SYMBOL(RQF_MDS_RMFID); + struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", llog_origin_handle_create_client, llogd_body_only); EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); -struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", - llogd_body_only, llogd_body_only); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); - struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", llogd_body_only, llog_origin_handle_next_block_server); @@ -1626,10 +1661,6 @@ struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = llogd_body_only, llog_log_hdr_only); EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); -struct req_format RQF_LLOG_ORIGIN_CONNECT = - DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); - struct req_format RQF_CONNECT = DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); EXPORT_SYMBOL(RQF_CONNECT); @@ -2340,12 +2371,13 @@ __u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, if (size == 0) return size; - for (; i < fmt->rf_fields[loc].nr; ++i) - if (fmt->rf_fields[loc].d[i]->rmf_size != -1) - size += cfs_size_round(fmt->rf_fields[loc].d[i]-> - rmf_size); - return size; + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; } +EXPORT_SYMBOL(req_capsule_fmt_size); /** * Changes the format of an RPC. 
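/*
 * req_capsule_fmt_size() above sums the rounded sizes of the fixed-size
 * fields of a format; fields with rmf_size == -1 are variable and get
 * sized only when the request is packed.  The same loop in isolation,
 * assuming the usual 8-byte rounding of cfs_size_round() and a minimal
 * field descriptor in place of struct req_msg_field.
 */
#include <stddef.h>

struct msg_field_sketch {
        const char *name;
        int         size;      /* -1: variable, sized at pack time */
};

static size_t size_round8(size_t val)
{
        return (val + 7) & ~(size_t)7;
}

static size_t fmt_fixed_size_sketch(const struct msg_field_sketch *fields,
                                    int nr, size_t base)
{
        size_t size = base;     /* header contribution computed by the caller */
        int i;

        if (size == 0)
                return 0;       /* mirrors the early return above */

        for (i = 0; i < nr; i++)
                if (fields[i].size != -1)
                        size += size_round8((size_t)fields[i].size);
        return size;
}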
@@ -2539,3 +2571,46 @@ int req_capsule_server_grow(struct req_capsule *pill, return 0; } EXPORT_SYMBOL(req_capsule_server_grow); + +int req_check_sepol(struct req_capsule *pill) +{ + int rc = 0; +#ifdef HAVE_SERVER_SUPPORT + struct obd_export *export; + struct lu_nodemap *nm = NULL; + const char *sepol = NULL; + const char *nm_sepol = NULL; + + if (!pill->rc_req) + return -EPROTO; + + export = pill->rc_req->rq_export; + if (!export || !exp_connect_sepol(export) || + !req_capsule_has_field(pill, &RMF_SELINUX_POL, RCL_CLIENT)) + goto nm; + + if (req_capsule_get_size(pill, &RMF_SELINUX_POL, RCL_CLIENT) == 0) + goto nm; + + sepol = req_capsule_client_get(pill, &RMF_SELINUX_POL); + CDEBUG(D_SEC, "retrieved sepol %s\n", sepol); + +nm: + if (export) { + nm = nodemap_get_from_exp(export); + if (!IS_ERR_OR_NULL(nm)) { + nm_sepol = nodemap_get_sepol(nm); + if (nm_sepol && nm_sepol[0]) + if (sepol == NULL || + strcmp(sepol, nm_sepol) != 0) + rc = -EACCES; + } + } + + if (!IS_ERR_OR_NULL(nm)) + nodemap_putref(nm); +#endif + + return rc; +} +EXPORT_SYMBOL(req_check_sepol); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c index a39db55028dc5..0f149b692362c 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_client.c @@ -136,41 +136,6 @@ static int llog_client_open(const struct lu_env *env, return rc; } -static int llog_client_destroy(const struct lu_env *env, - struct llog_handle *loghandle, - struct thandle *th) -{ - struct obd_import *imp; - struct ptlrpc_request *req = NULL; - struct llogd_body *body; - int rc; - ENTRY; - - LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); - req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, - LUSTRE_LOG_VERSION, - LLOG_ORIGIN_HANDLE_DESTROY); - if (req == NULL) - GOTO(err_exit, rc =-ENOMEM); - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - body->lgd_logid = loghandle->lgh_id; - body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; - - if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) - CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, - body->lgd_llh_flags); - - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - - ptlrpc_req_finished(req); -err_exit: - LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); - RETURN(rc); -} - - static int llog_client_next_block(const struct lu_env *env, struct llog_handle *loghandle, int *cur_idx, int next_idx, @@ -368,7 +333,6 @@ struct llog_operations llog_client_ops = { .lop_prev_block = llog_client_prev_block, .lop_read_header = llog_client_read_header, .lop_open = llog_client_open, - .lop_destroy = llog_client_destroy, .lop_close = llog_client_close, }; EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c index 4864b499120df..ca91a1c9491ac 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/llog_server.c @@ -111,45 +111,6 @@ int llog_origin_handle_open(struct ptlrpc_request *req) return rc; } -int llog_origin_handle_destroy(struct ptlrpc_request *req) -{ - struct llogd_body *body; - struct llog_logid *logid = NULL; - struct llog_ctxt *ctxt; - int rc; - - ENTRY; - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - if (body == NULL) - RETURN(err_serious(-EFAULT)); - - rc = req_capsule_server_pack(&req->rq_pill); - if (rc < 0) - RETURN(err_serious(-ENOMEM)); - - if 
(ostid_id(&body->lgd_logid.lgl_oi) > 0) - logid = &body->lgd_logid; - - if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) - CERROR("%s: wrong llog flags %x\n", - req->rq_export->exp_obd->obd_name, body->lgd_llh_flags); - - if (body->lgd_ctxt_idx >= LLOG_MAX_CTXTS) { - CDEBUG(D_WARNING, "%s: bad ctxt ID: idx=%d\n", - req->rq_export->exp_obd->obd_name, body->lgd_ctxt_idx); - RETURN(-EPROTO); - } - - ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); - if (ctxt == NULL) - RETURN(-ENODEV); - - rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL); - llog_ctxt_put(ctxt); - RETURN(rc); -} - int llog_origin_handle_next_block(struct ptlrpc_request *req) { struct llog_handle *loghandle; @@ -324,15 +285,3 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req) llog_ctxt_put(ctxt); return rc; } - -int llog_origin_handle_close(struct ptlrpc_request *req) -{ - int rc; - - ENTRY; - - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) - RETURN(err_serious(-ENOMEM)); - RETURN(0); -} diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c index 933183a83dbb3..bf7d4164cc071 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/lproc_ptlrpc.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include "ptlrpc_internal.h" @@ -96,6 +95,7 @@ static struct ll_rpc_opcode { { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { MDS_RMFID, "mds_rmfid" }, { LDLM_ENQUEUE, "ldlm_enqueue" }, { LDLM_CONVERT, "ldlm_convert" }, { LDLM_CANCEL, "ldlm_cancel" }, @@ -110,17 +110,17 @@ static struct ll_rpc_opcode { { MGS_TARGET_DEL, "mgs_target_del" }, { MGS_SET_INFO, "mgs_set_info" }, { MGS_CONFIG_READ, "mgs_config_read" }, - { OBD_PING, "obd_ping" }, - { OBD_LOG_CANCEL, "llog_cancel" }, - { OBD_QC_CALLBACK, "obd_quota_callback" }, - { OBD_IDX_READ, "dt_index_read" }, + { OBD_PING, "obd_ping" }, + { 401, /* was OBD_LOG_CANCEL */ "llog_cancel" }, + { 402, /* was OBD_QC_CALLBACK */ "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, - { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, - { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, - { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, - { LLOG_CATINFO, "llog_catinfo" }, + { 504, /*LLOG_ORIGIN_HANDLE_WRITE_REC*/"llog_origin_handle_write_rec" }, + { 505, /* was LLOG_ORIGIN_HANDLE_CLOSE */ "llog_origin_handle_close" }, + { 506, /* was LLOG_ORIGIN_CONNECT */ "llog_origin_connect" }, + { 507, /* was LLOG_CATINFO */ "llog_catinfo" }, { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, { QUOTA_DQACQ, "quota_acquire" }, @@ -140,20 +140,21 @@ static struct ll_eopcode { __u32 opcode; const char *opname; } ll_eopcode_table[EXTRA_LAST_OPC] = { - { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, - { LDLM_PLAIN_ENQUEUE, 
"ldlm_plain_enqueue" }, - { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, - { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, - { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, - { MDS_REINT_SETATTR, "mds_reint_setattr" }, - { MDS_REINT_CREATE, "mds_reint_create" }, - { MDS_REINT_LINK, "mds_reint_link" }, - { MDS_REINT_UNLINK, "mds_reint_unlink" }, - { MDS_REINT_RENAME, "mds_reint_rename" }, - { MDS_REINT_OPEN, "mds_reint_open" }, - { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, - { BRW_READ_BYTES, "read_bytes" }, - { BRW_WRITE_BYTES, "write_bytes" }, + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { MDS_REINT_RESYNC, "mds_reint_resync" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, }; const char *ll_opcode2str(__u32 opcode) @@ -194,32 +195,33 @@ static const char *ll_eopcode2str(__u32 opcode) return ll_eopcode_table[opcode].opname; } -#ifdef CONFIG_PROC_FS -static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, - char *name, struct proc_dir_entry **procroot_ret, - struct lprocfs_stats **stats_ret) +static void +ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, + struct dentry **debugfs_root_ret, + struct lprocfs_stats **stats_ret) { - struct proc_dir_entry *svc_procroot; + struct dentry *svc_debugfs_entry; struct lprocfs_stats *svc_stats; int i, rc; unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV; - LASSERT(*procroot_ret == NULL); - LASSERT(*stats_ret == NULL); + LASSERT(!*debugfs_root_ret); + LASSERT(!*stats_ret); - svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,0); - if (svc_stats == NULL) + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, + 0); + if (!svc_stats) return; if (dir) { - svc_procroot = lprocfs_register(dir, root, NULL, NULL); - if (IS_ERR(svc_procroot)) { + svc_debugfs_entry = ldebugfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_debugfs_entry)) { lprocfs_free_stats(&svc_stats); return; } } else { - svc_procroot = root; + svc_debugfs_entry = root; } lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, @@ -235,7 +237,7 @@ static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, for (i = 0; i < EXTRA_LAST_OPC; i++) { char *units; - switch(i) { + switch (i) { case BRW_WRITE_BYTES: case BRW_READ_BYTES: units = "bytes"; @@ -255,14 +257,14 @@ static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, ll_opcode2str(opcode), "usec"); } - rc = lprocfs_register_stats(svc_procroot, name, svc_stats); + rc = ldebugfs_register_stats(svc_debugfs_entry, name, svc_stats); if (rc < 0) { if (dir) - lprocfs_remove(&svc_procroot); + ldebugfs_remove(&svc_debugfs_entry); lprocfs_free_stats(&svc_stats); } else { if (dir) - *procroot_ret = svc_procroot; + *debugfs_root_ret = svc_debugfs_entry; *stats_ret = svc_stats; } } @@ -281,7 +283,9 @@ ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v) seq_printf(m, "%d\n", total); return 0; } -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); + + 
+LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); static int ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) @@ -305,11 +309,12 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, { struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; + unsigned long long val; + unsigned long long limit; int bufpages; - __s64 val; int rc; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoull_from_user(buffer, count, 0, &val); if (rc < 0) return rc; @@ -318,10 +323,15 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, /* This sanity check is more of an insanity check; we can still * hose a kernel by allowing the request history to grow too - * far. */ - bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> + * far. The roundup to the next power of two is an empirical way + * to take care that request buffer is allocated in Slab and thus + * will be upgraded */ + bufpages = (roundup_pow_of_two(svc->srv_buf_size) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > cfs_totalram_pages() / (2 * bufpages)) + limit = cfs_totalram_pages() / (2 * bufpages); + /* do not allow history to consume more than half max number of rqbds */ + if ((svc->srv_nrqbds_max == 0 && val > limit) || + (svc->srv_nrqbds_max != 0 && val > svc->srv_nrqbds_max / 2)) return -ERANGE; spin_lock(&svc->srv_lock); @@ -336,28 +346,64 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); static int -ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n) +ptlrpc_lprocfs_req_buffers_max_seq_show(struct seq_file *m, void *n) { struct ptlrpc_service *svc = m->private; - seq_printf(m, "%d\n", - svc->srv_nthrs_cpt_init * svc->srv_ncpts); + seq_printf(m, "%d\n", svc->srv_nrqbds_max); return 0; } static ssize_t -ptlrpc_lprocfs_threads_min_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +ptlrpc_lprocfs_req_buffers_max_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - __s64 val; - int rc = lprocfs_str_to_s64(file, buffer, count, &val); + int val; + int rc; + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc < 0) + return rc; + + if (val < svc->srv_nbuf_per_group && val != 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + svc->srv_nrqbds_max = (uint)val; + + spin_unlock(&svc->srv_lock); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_req_buffers_max); + +static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); +} + +static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; @@ -376,44 +422,43 @@ ptlrpc_lprocfs_threads_min_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min); +LUSTRE_RW_ATTR(threads_min); -static int -ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n) +static ssize_t threads_started_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct 
ptlrpc_service *svc = m->private; - struct ptlrpc_service_part *svcpt; - int total = 0; - int i; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; ptlrpc_service_for_each_part(svcpt, i, svc) total += svcpt->scp_nthrs_running; - seq_printf(m, "%d\n", total); - return 0; + return sprintf(buf, "%d\n", total); } -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started); +LUSTRE_RO_ATTR(threads_started); -static int -ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n) +static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct ptlrpc_service *svc = m->private; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); - seq_printf(m, "%d\n", - svc->srv_nthrs_cpt_limit * svc->srv_ncpts); - return 0; + return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); } -static ssize_t -ptlrpc_lprocfs_threads_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) +static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct ptlrpc_service *svc = m->private; - __s64 val; - int rc = lprocfs_str_to_s64(file, buffer, count, &val); + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + unsigned long val; + int rc; + rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; @@ -432,7 +477,7 @@ ptlrpc_lprocfs_threads_max_seq_write(struct file *file, return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max); +LUSTRE_RW_ATTR(threads_max); /** * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. @@ -472,7 +517,7 @@ void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, LASSERT(info != NULL); assert_spin_locked(&policy->pol_nrs->nrs_lock); - LASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); + CLASSERT(sizeof(info->pi_arg) == sizeof(policy->pol_arg)); memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); memcpy(info->pi_arg, policy->pol_arg, sizeof(policy->pol_arg)); @@ -555,20 +600,39 @@ static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) * sanity-check the values we get. */ } else { - LASSERT(strncmp(infos[pol_idx].pi_name, - tmp.pi_name, - NRS_POL_NAME_MAX) == 0); - LASSERT(strncmp(infos[pol_idx].pi_arg, - tmp.pi_arg, - sizeof(tmp.pi_arg)) == 0); + if (strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_name: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } + if (strncmp(infos[pol_idx].pi_arg, + tmp.pi_arg, + sizeof(tmp.pi_arg)) != 0) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_arg: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } /** - * Not asserting ptlrpc_nrs_pol_info::pi_state, + * Not checking ptlrpc_nrs_pol_info::pi_state, * because it may be different between * instances of the same policy in different * service partitions. 
*/ - LASSERT(infos[pol_idx].pi_fallback == - tmp.pi_fallback); + + if (infos[pol_idx].pi_fallback != + tmp.pi_fallback) { + spin_unlock(&nrs->nrs_lock); + rc = -EINVAL; + CERROR("%s: failed to check pi_fallback: rc = %d\n", + svc->srv_thread_name, rc); + GOTO(out, rc); + } } infos[pol_idx].pi_req_queued += tmp.pi_req_queued; @@ -692,7 +756,7 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, */ cmd_copy = cmd; - if (lprocfs_copy_from_user(file, cmd, buffer, count)) + if (copy_from_user(cmd, buffer, count)) GOTO(out, rc = -EFAULT); cmd[count] = '\0'; @@ -747,7 +811,8 @@ ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char __user *buffer, RETURN(rc < 0 ? rc : count); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs); /** @} nrs */ @@ -867,10 +932,12 @@ ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) if (i > cpt) /* make up the lowest position for this CPT */ *pos = PTLRPC_REQ_CPT2POS(svc, i); + mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, PTLRPC_REQ_POS2SEQ(svc, *pos)); spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); if (rc == 0) { *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); srhi->srhi_idx = i; @@ -912,9 +979,11 @@ ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); } + mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); if (rc == 0) { *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); srhi->srhi_idx = i; @@ -945,6 +1014,7 @@ void target_print_req(void *seq_file, struct ptlrpc_request *req) case RQ_PHASE_INTERPRET: /* being handled, so basic msg swabbed, and opc is valid * but racing with mds_handle() */ + fallthrough; case RQ_PHASE_COMPLETE: /* been handled by mds_handle() reply state possibly still * volatile */ @@ -968,6 +1038,7 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) svcpt = svc->srv_parts[srhi->srhi_idx]; + mutex_lock(&svcpt->scp_mutex); spin_lock(&svcpt->scp_lock); rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); @@ -1008,6 +1079,8 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) } spin_unlock(&svcpt->scp_lock); + mutex_unlock(&svcpt->scp_mutex); + return rc; } @@ -1032,7 +1105,7 @@ ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) return rc; seqf = file->private_data; - seqf->private = PDE_DATA(inode); + seqf->private = inode->i_private; return 0; } @@ -1066,98 +1139,130 @@ static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) return 0; } -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); -static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v) +LDEBUGFS_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); + +static ssize_t high_priority_ratio_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct ptlrpc_service *svc = m->private; - seq_printf(m, "%d\n", svc->srv_hpreq_ratio); - return 0; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); } -static ssize_t -ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t high_priority_ratio_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t 
count) { - struct seq_file *m = file->private_data; - struct ptlrpc_service *svc = m->private; + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); int rc; - __s64 val; + unsigned long val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtoul(buffer, 10, &val); if (rc < 0) return rc; - if (val < 0 || val > INT_MAX) - return -ERANGE; - spin_lock(&svc->srv_lock); svc->srv_hpreq_ratio = val; spin_unlock(&svc->srv_lock); return count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio); +LUSTRE_RW_ATTR(high_priority_ratio); + +static struct attribute *ptlrpc_svc_attrs[] = { + &lustre_attr_threads_min.attr, + &lustre_attr_threads_started.attr, + &lustre_attr_threads_max.attr, + &lustre_attr_high_priority_ratio.attr, + NULL, +}; -void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, - struct ptlrpc_service *svc) +static void ptlrpc_sysfs_svc_release(struct kobject *kobj) { - struct lprocfs_vars lproc_vars[] = { - { .name = "high_priority_ratio", - .fops = &ptlrpc_lprocfs_hp_ratio_fops, - .data = svc }, + struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, + srv_kobj); + + complete(&svc->srv_kobj_unregister); +} + +static struct kobj_type ptlrpc_svc_ktype = { + .default_attrs = ptlrpc_svc_attrs, + .sysfs_ops = &lustre_sysfs_ops, + .release = ptlrpc_sysfs_svc_release, +}; + +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) +{ + /* Let's see if we had a chance at initialization first */ + if (svc->srv_kobj.kset) { + kobject_put(&svc->srv_kobj); + wait_for_completion(&svc->srv_kobj_unregister); + } +} + +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc) +{ + svc->srv_kobj.kset = parent; + init_completion(&svc->srv_kobj_unregister); + return kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, + &parent->kobj, "%s", svc->srv_name); +} + +void ptlrpc_ldebugfs_register_service(struct dentry *entry, + struct ptlrpc_service *svc) +{ + struct ldebugfs_vars ldebugfs_vars[] = { { .name = "req_buffer_history_len", .fops = &ptlrpc_lprocfs_req_history_len_fops, .data = svc }, { .name = "req_buffer_history_max", .fops = &ptlrpc_lprocfs_req_history_max_fops, .data = svc }, - { .name = "threads_min", - .fops = &ptlrpc_lprocfs_threads_min_fops, - .data = svc }, - { .name = "threads_max", - .fops = &ptlrpc_lprocfs_threads_max_fops, - .data = svc }, - { .name = "threads_started", - .fops = &ptlrpc_lprocfs_threads_started_fops, - .data = svc }, { .name = "timeouts", .fops = &ptlrpc_lprocfs_timeouts_fops, .data = svc }, { .name = "nrs_policies", .fops = &ptlrpc_lprocfs_nrs_fops, .data = svc }, + { .name = "req_buffers_max", + .fops = &ptlrpc_lprocfs_req_buffers_max_fops, + .data = svc }, { NULL } }; - static struct proc_ops req_history_fops = { - PROC_OWNER(THIS_MODULE) - .proc_open = ptlrpc_lprocfs_svc_req_history_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = lprocfs_seq_release, + static struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, }; int rc; - ptlrpc_lprocfs_register(entry, svc->srv_name, - "stats", &svc->srv_procroot, - &svc->srv_stats); - if (svc->srv_procroot == NULL) + ptlrpc_ldebugfs_register(entry, svc->srv_name, "stats", + &svc->srv_debugfs_entry, &svc->srv_stats); + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return; - lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + 
ldebugfs_add_vars(svc->srv_debugfs_entry, ldebugfs_vars, NULL); - rc = lprocfs_seq_create(svc->srv_procroot, "req_history", - 0400, &req_history_fops, svc); + rc = ldebugfs_seq_create(svc->srv_debugfs_entry, "req_history", + 0400, &req_history_fops, svc); if (rc) CWARN("Error adding the req_history file\n"); } void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) { - ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", - &obddev->obd_svc_procroot, - &obddev->obd_svc_stats); + ptlrpc_ldebugfs_register(obddev->obd_debugfs_entry, NULL, "stats", + &obddev->obd_svc_debugfs_entry, + &obddev->obd_svc_stats); } EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); @@ -1205,8 +1310,8 @@ EXPORT_SYMBOL(ptlrpc_lprocfs_brw); void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) { - if (svc->srv_procroot != NULL) - lprocfs_remove(&svc->srv_procroot); + if (!IS_ERR_OR_NULL(svc->srv_debugfs_entry)) + ldebugfs_remove(&svc->srv_debugfs_entry); if (svc->srv_stats) lprocfs_free_stats(&svc->srv_stats); @@ -1219,48 +1324,53 @@ void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) */ lprocfs_obd_cleanup(obd); - if (obd->obd_svc_procroot) - lprocfs_remove(&obd->obd_svc_procroot); + if (!IS_ERR_OR_NULL(obd->obd_svc_debugfs_entry)) + ldebugfs_remove(&obd->obd_svc_debugfs_entry); if (obd->obd_svc_stats) lprocfs_free_stats(&obd->obd_svc_stats); } EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); -ssize_t -lprocfs_ping_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ssize_t ping_show(struct kobject *kobj, struct attribute *attr, + char *buffer) { - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; - struct ptlrpc_request *req; - int rc; - ENTRY; + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ptlrpc_request *req; + int rc; + ENTRY; LPROCFS_CLIMP_CHECK(obd); req = ptlrpc_prep_ping(obd->u.cli.cl_import); LPROCFS_CLIMP_EXIT(obd); - if (req == NULL) + if (!req) RETURN(-ENOMEM); req->rq_send_state = LUSTRE_IMP_FULL; rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - if (rc >= 0) - RETURN(count); + RETURN(rc); } -EXPORT_SYMBOL(lprocfs_ping_seq_write); +EXPORT_SYMBOL(ping_show); + +/* kept for older verison of tools. */ +ssize_t ping_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + return ping_show(kobj, attr, (char *)buffer); +} +EXPORT_SYMBOL(ping_store); /* Write the connection UUID to this file to attempt to connect to that node. * The connection UUID is a node's primary NID. For example, * "echo connection=192.168.0.1@tcp0::instance > .../import". 
*/ ssize_t -lprocfs_import_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +ldebugfs_import_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) { struct seq_file *m = file->private_data; struct obd_device *obd = m->private; @@ -1279,7 +1389,7 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, if (kbuf == NULL) return -ENOMEM; - if (lprocfs_copy_from_user(file, kbuf, buffer, count)) + if (copy_from_user(kbuf, buffer, count)) GOTO(out, count = -EFAULT); kbuf[count] = 0; @@ -1291,14 +1401,14 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, uuid = kbuf + prefix_len; ptr = strstr(uuid, "::"); if (ptr) { - __u32 inst; - char *endptr; + u32 inst; + int rc; *ptr = 0; do_reconn = 0; ptr += 2; /* Skip :: */ - inst = simple_strtol(ptr, &endptr, 10); - if (*endptr) { + rc = kstrtouint(ptr, 10, &inst); + if (rc) { CERROR("config: wrong instance # %s\n", ptr); } else if (inst != imp->imp_connect_data.ocd_instance) { CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted " @@ -1320,7 +1430,7 @@ lprocfs_import_seq_write(struct file *file, const char __user *buffer, OBD_FREE(kbuf, count + 1); return count; } -EXPORT_SYMBOL(lprocfs_import_seq_write); +EXPORT_SYMBOL(ldebugfs_import_seq_write); int lprocfs_pinger_recov_seq_show(struct seq_file *m, void *n) { @@ -1342,16 +1452,13 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, struct obd_device *obd = m->private; struct client_obd *cli = &obd->u.cli; struct obd_import *imp = cli->cl_import; + bool val; int rc; - __s64 val; - rc = lprocfs_str_to_s64(file, buffer, count, &val); + rc = kstrtobool_from_user(buffer, count, &val); if (rc < 0) return rc; - if (val != 0 && val != 1) - return -ERANGE; - LPROCFS_CLIMP_CHECK(obd); spin_lock(&imp->imp_lock); imp->imp_no_pinger_recover = !val; @@ -1360,5 +1467,3 @@ lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, return count; } EXPORT_SYMBOL(lprocfs_pinger_recov_seq_write); - -#endif /* CONFIG_PROC_FS */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c index 999869000c35b..f6e0f57e2c785 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/niobuf.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -167,7 +167,6 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) RETURN(0); /* NB no locking required until desc is on the network */ - LASSERT(desc->bd_md_count == 0); LASSERT(ptlrpc_is_bulk_op_active(desc->bd_type)); LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); @@ -190,7 +189,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) mbits = desc->bd_req->rq_mbits & ~((__u64)desc->bd_md_max_brw - 1); total_md = desc->bd_req->rq_mbits - mbits + 1; - desc->bd_md_count = total_md; + desc->bd_refs = total_md; desc->bd_failure = 0; md.user_ptr = &desc->bd_cbid; @@ -231,7 +230,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) desc->bd_portal, mbits, 0, 0); else rc = LNetGet(self_nid, desc->bd_mds[posted_md], - peer_id, desc->bd_portal, mbits, 0); + peer_id, desc->bd_portal, mbits, 0, false); posted_md++; if (rc != 0) { @@ -248,9 +247,9 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) * event this creates will signal completion with failure, * so we return SUCCESS here! */ spin_lock(&desc->bd_lock); - desc->bd_md_count -= total_md - posted_md; + desc->bd_refs -= total_md - posted_md; spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_md_count >= 0); + LASSERT(desc->bd_refs >= 0); mdunlink_iterate_helper(desc->bd_mds, posted_md); RETURN(0); @@ -327,7 +326,6 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) /* NB no locking required until desc is on the network */ LASSERT(desc->bd_nob > 0); - LASSERT(desc->bd_md_count == 0); LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); LASSERT(desc->bd_req != NULL); @@ -349,9 +347,9 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); LASSERT(desc->bd_cbid.cbid_arg == desc); - total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; + total_md = desc->bd_md_count; /* rq_mbits is matchbits of the final bulk */ - mbits = req->rq_mbits - total_md + 1; + mbits = req->rq_mbits - desc->bd_md_count + 1; LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), "first mbits = x%llu, last mbits = x%llu\n", @@ -364,19 +362,25 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) desc->bd_registered = 1; desc->bd_last_mbits = mbits; - desc->bd_md_count = total_md; + desc->bd_refs = total_md; md.user_ptr = &desc->bd_cbid; md.eq_handle = ptlrpc_eq_h; md.threshold = 1; /* PUT or GET */ - for (posted_md = 0; posted_md < total_md; posted_md++, mbits++) { + for (posted_md = 0; posted_md < desc->bd_md_count; + posted_md++, mbits++) { md.options = PTLRPC_MD_OPTIONS | (ptlrpc_is_bulk_op_get(desc->bd_type) ? 
LNET_MD_OP_GET : LNET_MD_OP_PUT); ptlrpc_fill_bulk_md(&md, desc, posted_md); - rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, + if (posted_md > 0 && posted_md + 1 == desc->bd_md_count && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_ATTACH)) { + rc = -ENOMEM; + } else { + rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, LNET_UNLINK, LNET_INS_AFTER, &me_h); + } if (rc != 0) { CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", desc->bd_import->imp_obd->obd_name, mbits, @@ -400,24 +404,26 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) if (rc != 0) { LASSERT(rc == -ENOMEM); spin_lock(&desc->bd_lock); - desc->bd_md_count -= total_md - posted_md; + desc->bd_refs -= total_md - posted_md; spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_md_count >= 0); + LASSERT(desc->bd_refs >= 0); mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); req->rq_status = -ENOMEM; + desc->bd_registered = 0; RETURN(-ENOMEM); } spin_lock(&desc->bd_lock); /* Holler if peer manages to touch buffers before he knows the mbits */ - if (desc->bd_md_count != total_md) + if (desc->bd_refs != total_md) CWARN("%s: Peer %s touched %d buffers while I registered\n", desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), - total_md - desc->bd_md_count); + total_md - desc->bd_refs); spin_unlock(&desc->bd_lock); - CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, " - "mbits x%#llx-%#llx, portal %u\n", desc->bd_md_count, + CDEBUG(D_NET, + "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", + desc->bd_refs, ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", desc->bd_iov_count, desc->bd_nob, desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); @@ -492,9 +498,11 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) { struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; struct ptlrpc_service *svc = svcpt->scp_service; - int service_time = max_t(int, ktime_get_real_seconds() - - req->rq_arrival_time.tv_sec, 1); + timeout_t service_timeout; + service_timeout = clamp_t(timeout_t, ktime_get_real_seconds() - + req->rq_arrival_time.tv_sec, 1, + (AT_OFF ? obd_timeout * 3 / 2 : at_max)); if (!(flags & PTLRPC_REPLY_EARLY) && (req->rq_type != PTL_RPC_MSG_ERR) && (req->rq_reqmsg != NULL) && @@ -503,7 +511,8 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { /* early replies, errors and recovery requests don't count * toward our service time estimate */ - int oldse = at_measured(&svcpt->scp_at_estimate, service_time); + int oldse = at_measured(&svcpt->scp_at_estimate, + service_timeout); if (oldse != 0) { DEBUG_REQ(D_ADAPTTO, req, @@ -513,7 +522,7 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) } } /* Report actual service time for client latency calc */ - lustre_msg_set_service_time(req->rq_repmsg, service_time); + lustre_msg_set_service_timeout(req->rq_repmsg, service_timeout); /* Report service time estimate for future client reqs, but report 0 * (to be ignored by client) if it's an error reply during recovery. 
* b=15815 @@ -780,8 +789,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_resend_cb != NULL) request->rq_resend_cb(request, &request->rq_async_args); } - if (request->rq_memalloc) - mpflag = cfs_memory_pressure_get_and_set(); + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); rc = sptlrpc_cli_wrap_request(request); if (rc) @@ -791,7 +800,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_bulk != NULL) { rc = ptlrpc_register_bulk (request); if (rc != 0) - GOTO(out, rc); + GOTO(cleanup_bulk, rc); /* * All the mds in the request will have the same cpt * encoded in the cookie. So we can just get the first @@ -813,13 +822,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) spin_lock(&request->rq_lock); request->rq_err = 1; spin_unlock(&request->rq_lock); - request->rq_status = rc; - GOTO(cleanup_bulk, rc); - } - } else { - request->rq_repdata = NULL; - request->rq_repmsg = NULL; - } + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ connection->c_peer, request->rq_xid, 0, @@ -893,8 +902,6 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_deadline = request->rq_sent + request->rq_timeout + ptlrpc_at_get_net_latency(request); - ptlrpc_pinger_sending_on_import(imp); - DEBUG_REQ(D_INFO, request, "send flg=%x", lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, @@ -912,18 +919,20 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) GOTO(out, rc); cleanup_me: - /* MEUnlink is safe; the PUT didn't even get off the ground, and - * nobody apart from the PUT's target has the right nid+XID to - * access the reply buffer. */ - rc2 = LNetMEUnlink(reply_me_h); - LASSERT (rc2 == 0); - /* UNLINKED callback called synchronously */ - LASSERT(!request->rq_receiving_reply); + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); cleanup_bulk: - /* We do sync unlink here as there was no real transfer here so - * the chance to have long unlink to sluggish net is smaller here. */ + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. 
*/ ptlrpc_unregister_bulk(request, 0); + if (request->rq_bulk != NULL) + request->rq_bulk->bd_registered = 0; out: if (rc == -ENOMEM) { /* set rq_sent so that this request is treated @@ -944,7 +953,10 @@ EXPORT_SYMBOL(ptl_send_rpc); int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) { struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; - static struct lnet_process_id match_id = {LNET_NID_ANY, LNET_PID_ANY}; + static struct lnet_process_id match_id = { + .nid = LNET_NID_ANY, + .pid = LNET_PID_ANY + }; int rc; struct lnet_md md; struct lnet_handle_me me_h; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h index 851bdc0dc354a..6d6b9d7a04541 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nodemap_internal.h @@ -22,7 +22,7 @@ /* * Copyright (C) 2013, Trustees of Indiana University * - * Copyright (c) 2013, 2014, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Author: Joshua Walgenbach */ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c index 7423e981d9e37..94d21d42f87df 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_crr.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ @@ -610,10 +610,8 @@ static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy, libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round); } -#ifdef CONFIG_PROC_FS - /** - * lprocfs interface + * debugfs interface */ /** @@ -718,7 +716,7 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -731,7 +729,9 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, &count_copy); if (val != kernbuf) { - quantum_reg = simple_strtol(val, NULL, 10); + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_REG; } @@ -747,7 +747,9 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, if (!nrs_svc_has_hp(svc)) return -ENODEV; - quantum_hp = simple_strtol(val, NULL, 10); + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_HP; } @@ -757,10 +759,9 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, * value */ if (queue == 0) { - if (!isdigit(kernbuf[0])) - return -EINVAL; - - quantum_reg = simple_strtol(kernbuf, NULL, 10); + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; queue = PTLRPC_NRS_QUEUE_REG; @@ -808,7 +809,8 @@ ptlrpc_lprocfs_nrs_crrn_quantum_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); /** * Initializes a CRR-N policy's lprocfs interface for service \a svc @@ -820,34 +822,19 @@ LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_crrn_quantum); */ static int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc) { - struct lprocfs_vars nrs_crrn_lprocfs_vars[] = { + struct ldebugfs_vars nrs_crrn_lprocfs_vars[] = { { .name = "nrs_crrn_quantum", .fops = &ptlrpc_lprocfs_nrs_crrn_quantum_fops, .data = svc }, { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; - return lprocfs_add_vars(svc->srv_procroot, nrs_crrn_lprocfs_vars, NULL); -} - -/** - * Cleans up a CRR-N policy's lprocfs interface for service \a svc - * - * \param[in] svc the service - */ -static void nrs_crrn_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_crrn_quantum", svc->srv_procroot); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_crrn_lprocfs_vars, NULL); } -#endif /* CONFIG_PROC_FS */ - /** * CRR-N policy operations */ @@ -861,10 +848,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = { .op_req_enqueue = nrs_crrn_req_add, .op_req_dequeue = nrs_crrn_req_del, .op_req_stop = nrs_crrn_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_crrn_lprocfs_init, - .op_lprocfs_fini = nrs_crrn_lprocfs_fini, -#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c index 403b74efe6415..c8a1e6637d261 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_delay.c @@ -362,11 +362,9 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, } /** - * lprocfs interface + * debugfs interface */ -#ifdef CONFIG_PROC_FS - /* nrs_delay_min and nrs_delay_max are bounded by these values */ #define LPROCFS_NRS_DELAY_LOWER_BOUND 0 #define LPROCFS_NRS_DELAY_UPPER_BOUND 65535 @@ -419,7 +417,7 @@ static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy, * Helper for delay's seq_write functions. 
*/ static ssize_t -lprocfs_nrs_delay_seq_write_common(struct file *file, const char __user *buffer, +lprocfs_nrs_delay_seq_write_common(const char __user *buffer, unsigned int bufsize, size_t count, const char *var_name, unsigned int min_val, unsigned int max_val, @@ -443,7 +441,7 @@ lprocfs_nrs_delay_seq_write_common(struct file *file, const char __user *buffer, if (kernbuf == NULL) return -ENOMEM; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) GOTO(free_kernbuf, rc = -EFAULT); tmpsize = strlen("reg_") + strlen(var_name) + 1; @@ -598,7 +596,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(file, buffer, + return lprocfs_nrs_delay_seq_write_common(buffer, LPROCFS_NRS_DELAY_MIN_SIZE, count, LPROCFS_NRS_DELAY_MIN_NAME, @@ -607,7 +605,7 @@ ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_MIN, false); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min); /** * Retrieves the value of the maximum delay for delay policy instances on both @@ -681,7 +679,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(file, buffer, + return lprocfs_nrs_delay_seq_write_common(buffer, LPROCFS_NRS_DELAY_MAX_SIZE, count, LPROCFS_NRS_DELAY_MAX_NAME, @@ -690,7 +688,7 @@ ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_MAX, false); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max); /** * Retrieves the value of the percentage of requests which should be delayed @@ -765,7 +763,7 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, struct seq_file *m = file->private_data; struct ptlrpc_service *svc = m->private; - return lprocfs_nrs_delay_seq_write_common(file, buffer, + return lprocfs_nrs_delay_seq_write_common(buffer, LPROCFS_NRS_DELAY_PCT_SIZE, count, LPROCFS_NRS_DELAY_PCT_NAME, @@ -774,11 +772,12 @@ ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file, svc, NRS_POL_NAME_DELAY, NRS_CTL_DELAY_WR_PCT, false); } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct); static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) { - struct lprocfs_vars nrs_delay_lprocfs_vars[] = { + struct ldebugfs_vars nrs_delay_lprocfs_vars[] = { { .name = "nrs_delay_min", .fops = &ptlrpc_lprocfs_nrs_delay_min_fops, .data = svc }, @@ -791,25 +790,13 @@ static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; - return lprocfs_add_vars(svc->srv_procroot, nrs_delay_lprocfs_vars, - NULL); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_delay_lprocfs_vars, + NULL); } -static void nrs_delay_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_delay_min", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_delay_max", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_delay_pct", svc->srv_procroot); -} - -#endif /* CONFIG_PROC_FS */ - /** * Delay policy operations */ @@ -822,10 +809,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = { .op_req_enqueue = nrs_delay_req_add, 
.op_req_dequeue = nrs_delay_req_del, .op_req_stop = nrs_delay_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_delay_lprocfs_init, - .op_lprocfs_fini = nrs_delay_lprocfs_fini, -#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c index 96c3a6593d2dd..8b8e092dd8209 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_orr.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ @@ -45,7 +45,6 @@ #include #include #include -#include #include #include "ptlrpc_internal.h" @@ -1161,11 +1160,9 @@ static void nrs_orr_req_stop(struct ptlrpc_nrs_policy *policy, } /** - * lprocfs interface + * debugfs interface */ -#ifdef CONFIG_PROC_FS - /** * This allows to bundle the policy name into the lprocfs_vars::data pointer * so that lprocfs read/write functions can be used by both the ORR and TRR @@ -1297,7 +1294,7 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1310,8 +1307,9 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, val = lprocfs_find_named_value(kernbuf, NRS_LPROCFS_QUANTUM_NAME_REG, &count_copy); if (val != kernbuf) { - quantum_reg = simple_strtol(val, NULL, 10); - + rc = kstrtol(val, 10, &quantum_reg); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_REG; } @@ -1326,7 +1324,9 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, if (!nrs_svc_has_hp(svc)) return -ENODEV; - quantum_hp = simple_strtol(val, NULL, 10); + rc = kstrtol(val, 10, &quantum_hp); + if (rc) + return rc; queue |= PTLRPC_NRS_QUEUE_HP; } @@ -1336,10 +1336,9 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, * value */ if (queue == 0) { - if (!isdigit(kernbuf[0])) - return -EINVAL; - - quantum_reg = simple_strtol(kernbuf, NULL, 10); + rc = kstrtol(kernbuf, 10, &quantum_reg); + if (rc) + return rc; queue = PTLRPC_NRS_QUEUE_REG; @@ -1387,7 +1386,8 @@ ptlrpc_lprocfs_nrs_orr_quantum_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_quantum); #define LPROCFS_NRS_OFF_NAME_REG "reg_offset_type:" #define LPROCFS_NRS_OFF_NAME_HP "hp_offset_type:" @@ -1512,7 +1512,7 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1607,7 +1607,8 @@ ptlrpc_lprocfs_nrs_orr_offset_type_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? 
-ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_offset_type); #define NRS_LPROCFS_REQ_SUPP_NAME_REG "reg_supported:" #define NRS_LPROCFS_REQ_SUPP_NAME_HP "hp_supported:" @@ -1772,7 +1773,7 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, if (count > (sizeof(kernbuf) - 1)) return -EINVAL; - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) return -EFAULT; kernbuf[count] = '\0'; @@ -1858,13 +1859,14 @@ ptlrpc_lprocfs_nrs_orr_supported_seq_write(struct file *file, return rc == -ENODEV && rc2 == -ENODEV ? -ENODEV : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_orr_supported); static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) { int i; - struct lprocfs_vars nrs_orr_lprocfs_vars[] = { + struct ldebugfs_vars nrs_orr_lprocfs_vars[] = { { .name = "nrs_orr_quantum", .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, { .name = "nrs_orr_offset_type", @@ -1874,7 +1876,7 @@ static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; lprocfs_orr_data.svc = svc; @@ -1882,21 +1884,10 @@ static int nrs_orr_lprocfs_init(struct ptlrpc_service *svc) for (i = 0; i < ARRAY_SIZE(nrs_orr_lprocfs_vars); i++) nrs_orr_lprocfs_vars[i].data = &lprocfs_orr_data; - return lprocfs_add_vars(svc->srv_procroot, nrs_orr_lprocfs_vars, NULL); -} - -static void nrs_orr_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_orr_quantum", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_orr_offset_type", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_orr_supported", svc->srv_procroot); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_orr_lprocfs_vars, + NULL); } -#endif /* CONFIG_PROC_FS */ - static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { .op_policy_init = nrs_orr_init, .op_policy_start = nrs_orr_start, @@ -1908,10 +1899,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_orr_ops = { .op_req_enqueue = nrs_orr_req_add, .op_req_dequeue = nrs_orr_req_del, .op_req_stop = nrs_orr_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_orr_lprocfs_init, - .op_lprocfs_fini = nrs_orr_lprocfs_fini, -#endif }; struct ptlrpc_nrs_pol_conf nrs_conf_orr = { @@ -1926,14 +1914,11 @@ struct ptlrpc_nrs_pol_conf nrs_conf_orr = { * * TRR reuses much of the functions and data structures of ORR */ - -#ifdef CONFIG_PROC_FS - static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) { int i; - struct lprocfs_vars nrs_trr_lprocfs_vars[] = { + struct ldebugfs_vars nrs_trr_lprocfs_vars[] = { { .name = "nrs_trr_quantum", .fops = &ptlrpc_lprocfs_nrs_orr_quantum_fops }, { .name = "nrs_trr_offset_type", @@ -1943,7 +1928,7 @@ static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; lprocfs_trr_data.svc = svc; @@ -1951,21 +1936,10 @@ static int nrs_trr_lprocfs_init(struct ptlrpc_service *svc) for (i = 0; i < ARRAY_SIZE(nrs_trr_lprocfs_vars); i++) nrs_trr_lprocfs_vars[i].data = &lprocfs_trr_data; - return lprocfs_add_vars(svc->srv_procroot, nrs_trr_lprocfs_vars, NULL); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_trr_lprocfs_vars, + NULL); } -static void nrs_trr_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - 
lprocfs_remove_proc_entry("nrs_trr_quantum", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_trr_offset_type", svc->srv_procroot); - lprocfs_remove_proc_entry("nrs_trr_supported", svc->srv_procroot); -} - -#endif /* CONFIG_PROC_FS */ - /** * Reuse much of the ORR functionality for TRR. */ @@ -1980,10 +1954,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_trr_ops = { .op_req_enqueue = nrs_orr_req_add, .op_req_dequeue = nrs_orr_req_del, .op_req_stop = nrs_orr_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_trr_lprocfs_init, - .op_lprocfs_fini = nrs_trr_lprocfs_fini, -#endif }; struct ptlrpc_nrs_pol_conf nrs_conf_trr = { diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c index a81485554013b..07710bdb7bfd9 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/nrs_tbf.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "ptlrpc_internal.h" /** @@ -300,6 +301,7 @@ nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name)); rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_flags = start->u.tc_start.ts_rule_flags; rule->tr_nsecs = NSEC_PER_SEC; do_div(rule->tr_nsecs, rule->tr_rpc_rate); rule->tr_depth = tbf_depth; @@ -521,11 +523,9 @@ tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) cli1 = container_of(e1, struct nrs_tbf_client, tc_node); cli2 = container_of(e2, struct nrs_tbf_client, tc_node); - if (cli1->tc_check_time + cli1->tc_nsecs < - cli2->tc_check_time + cli2->tc_nsecs) + if (cli1->tc_deadline < cli2->tc_deadline) return 1; - else if (cli1->tc_check_time + cli1->tc_nsecs > - cli2->tc_check_time + cli2->tc_nsecs) + else if (cli1->tc_deadline > cli2->tc_deadline) return 0; if (cli1->tc_check_time < cli2->tc_check_time) @@ -570,7 +570,7 @@ static void *nrs_tbf_jobid_hop_key(struct hlist_node *hnode) return cli->tc_jobid; } -static void *nrs_tbf_jobid_hop_object(struct hlist_node *hnode) +static void *nrs_tbf_hop_object(struct hlist_node *hnode) { return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); } @@ -609,7 +609,7 @@ static struct cfs_hash_ops nrs_tbf_jobid_hash_ops = { .hs_hash = nrs_tbf_jobid_hop_hash, .hs_keycmp = nrs_tbf_jobid_hop_keycmp, .hs_key = nrs_tbf_jobid_hop_key, - .hs_object = nrs_tbf_jobid_hop_object, + .hs_object = nrs_tbf_hop_object, .hs_get = nrs_tbf_jobid_hop_get, .hs_put = nrs_tbf_jobid_hop_put, .hs_put_locked = nrs_tbf_jobid_hop_put, @@ -1071,11 +1071,6 @@ static void *nrs_tbf_nid_hop_key(struct hlist_node *hnode) return &cli->tc_nid; } -static void *nrs_tbf_nid_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); -} - static void nrs_tbf_nid_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) { struct nrs_tbf_client *cli = hlist_entry(hnode, @@ -1111,7 +1106,7 @@ static struct cfs_hash_ops nrs_tbf_nid_hash_ops = { .hs_hash = nrs_tbf_nid_hop_hash, .hs_keycmp = nrs_tbf_nid_hop_keycmp, .hs_key = nrs_tbf_nid_hop_key, - .hs_object = nrs_tbf_nid_hop_object, + .hs_object = nrs_tbf_hop_object, .hs_get = nrs_tbf_nid_hop_get, .hs_put = nrs_tbf_nid_hop_put, .hs_put_locked = nrs_tbf_nid_hop_put, @@ -1307,11 +1302,6 @@ static void *nrs_tbf_hop_key(struct hlist_node *hnode) return cli->tc_key; } -static void *nrs_tbf_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); -} - static void nrs_tbf_hop_get(struct cfs_hash *hs, struct hlist_node 
*hnode) { struct nrs_tbf_client *cli = hlist_entry(hnode, @@ -1415,23 +1405,263 @@ nrs_tbf_cli_hash_lookup(struct cfs_hash *hs, struct cfs_hash_bd *bd, return cli; } +/** + * ONLY opcode presented in this function will be checked in + * nrs_tbf_id_cli_set(). That means, we can add or remove an + * opcode to enable or disable requests handled in nrs_tbf + */ +static struct req_format *req_fmt(__u32 opcode) +{ + switch (opcode) { + case OST_GETATTR: + return &RQF_OST_GETATTR; + case OST_SETATTR: + return &RQF_OST_SETATTR; + case OST_READ: + return &RQF_OST_BRW_READ; + case OST_WRITE: + return &RQF_OST_BRW_WRITE; + /* FIXME: OST_CREATE and OST_DESTROY comes from MDS + * in most case. Should they be removed? */ + case OST_CREATE: + return &RQF_OST_CREATE; + case OST_DESTROY: + return &RQF_OST_DESTROY; + case OST_PUNCH: + return &RQF_OST_PUNCH; + case OST_SYNC: + return &RQF_OST_SYNC; + case OST_LADVISE: + return &RQF_OST_LADVISE; + case MDS_GETATTR: + return &RQF_MDS_GETATTR; + case MDS_GETATTR_NAME: + return &RQF_MDS_GETATTR_NAME; + /* close is skipped to avoid LDLM cancel slowness */ +#if 0 + case MDS_CLOSE: + return &RQF_MDS_CLOSE; +#endif + case MDS_REINT: + return &RQF_MDS_REINT; + case MDS_READPAGE: + return &RQF_MDS_READPAGE; + case MDS_GET_ROOT: + return &RQF_MDS_GET_ROOT; + case MDS_STATFS: + return &RQF_MDS_STATFS; + case MDS_SYNC: + return &RQF_MDS_SYNC; + case MDS_QUOTACTL: + return &RQF_MDS_QUOTACTL; + case MDS_GETXATTR: + return &RQF_MDS_GETXATTR; + case MDS_GET_INFO: + return &RQF_MDS_GET_INFO; + /* HSM op is skipped */ +#if 0 + case MDS_HSM_STATE_GET: + return &RQF_MDS_HSM_STATE_GET; + case MDS_HSM_STATE_SET: + return &RQF_MDS_HSM_STATE_SET; + case MDS_HSM_ACTION: + return &RQF_MDS_HSM_ACTION; + case MDS_HSM_CT_REGISTER: + return &RQF_MDS_HSM_CT_REGISTER; + case MDS_HSM_CT_UNREGISTER: + return &RQF_MDS_HSM_CT_UNREGISTER; +#endif + case MDS_SWAP_LAYOUTS: + return &RQF_MDS_SWAP_LAYOUTS; + case LDLM_ENQUEUE: + return &RQF_LDLM_ENQUEUE; + default: + return NULL; + } +} + +static struct req_format *intent_req_fmt(__u32 it_opc) +{ + if (it_opc & (IT_OPEN | IT_CREAT)) + return &RQF_LDLM_INTENT_OPEN; + else if (it_opc & (IT_GETATTR | IT_LOOKUP)) + return &RQF_LDLM_INTENT_GETATTR; + else if (it_opc & IT_GETXATTR) + return &RQF_LDLM_INTENT_GETXATTR; + else if (it_opc & (IT_GLIMPSE | IT_BRW)) + return &RQF_LDLM_INTENT; + else + return NULL; +} + +static int ost_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + if (body != NULL) { + id->ti_uid = body->oa.o_uid; + id->ti_gid = body->oa.o_gid; + return 0; + } + + return -EINVAL; +} + +static void unpack_ugid_from_mdt_body(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + + /* TODO: nodemaping feature converts {ug}id from individual + * clients to the actual ones of the file system. Some work + * may be needed to fix this. 
*/ + id->ti_uid = b->mbo_uid; + id->ti_gid = b->mbo_gid; +} + +static void unpack_ugid_from_mdt_rec_reint(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct mdt_rec_reint *rec; + + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + /* use the fs{ug}id as {ug}id of the process */ + id->ti_uid = rec->rr_fsuid; + id->ti_gid = rec->rr_fsgid; +} + +static int mdt_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + int rc = 0; + + switch (opc) { + case MDS_GETATTR: + case MDS_GETATTR_NAME: + case MDS_GET_ROOT: + case MDS_READPAGE: + case MDS_SYNC: + case MDS_GETXATTR: + case MDS_HSM_STATE_GET ... MDS_SWAP_LAYOUTS: + unpack_ugid_from_mdt_body(req, id); + break; + case MDS_CLOSE: + case MDS_REINT: + unpack_ugid_from_mdt_rec_reint(req, id); + break; + default: + rc = -EINVAL; + break; + } + return rc; +} + +static int ldlm_tbf_id_cli_set(struct ptlrpc_request *req, + struct tbf_id *id) +{ + struct ldlm_intent *lit; + struct req_format *fmt; + + if (req->rq_reqmsg->lm_bufcount <= DLM_INTENT_IT_OFF) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, &RQF_LDLM_INTENT_BASIC); + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + if (lit == NULL) + return -EINVAL; + + fmt = intent_req_fmt(lit->opc); + if (fmt == NULL) + return -EINVAL; + + req_capsule_extend(&req->rq_pill, fmt); + + if (lit->opc & (IT_GETXATTR | IT_GETATTR | IT_LOOKUP)) + unpack_ugid_from_mdt_body(req, id); + else if (lit->opc & (IT_OPEN | IT_OPEN | IT_GLIMPSE | IT_BRW)) + unpack_ugid_from_mdt_rec_reint(req, id); + else + return -EINVAL; + return 0; +} + +static int nrs_tbf_id_cli_set(struct ptlrpc_request *req, struct tbf_id *id, + enum nrs_tbf_flag ti_type) +{ + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct req_format *fmt = req_fmt(opc); + bool fmt_unset = false; + int rc; + + memset(id, 0, sizeof(struct tbf_id)); + id->ti_type = ti_type; + + if (fmt == NULL) + return -EINVAL; + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + if (req->rq_pill.rc_fmt == NULL) { + req_capsule_set(&req->rq_pill, fmt); + fmt_unset = true; + } + + if (opc < OST_LAST_OPC) + rc = ost_tbf_id_cli_set(req, id); + else if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC) + rc = mdt_tbf_id_cli_set(req, id); + else if (opc == LDLM_ENQUEUE) + rc = ldlm_tbf_id_cli_set(req, id); + else + rc = -EINVAL; + + /* restore it to the initialized state */ + if (fmt_unset) + req->rq_pill.rc_fmt = NULL; + return rc; +} + +static inline void nrs_tbf_cli_gen_key(struct nrs_tbf_client *cli, + struct ptlrpc_request *req, + char *keystr, size_t keystr_sz) +{ + const char *jobid; + u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + struct tbf_id id; + + nrs_tbf_id_cli_set(req, &id, NRS_TBF_FLAG_UID | NRS_TBF_FLAG_GID); + jobid = lustre_msg_get_jobid(req->rq_reqmsg); + if (jobid == NULL) + jobid = NRS_TBF_JOBID_NULL; + + snprintf(keystr, keystr_sz, "%s_%s_%d_%u_%u", jobid, + libcfs_nid2str(req->rq_peer.nid), opc, id.ti_uid, + id.ti_gid); + + if (cli) { + INIT_LIST_HEAD(&cli->tc_lru); + strlcpy(cli->tc_key, keystr, sizeof(cli->tc_key)); + strlcpy(cli->tc_jobid, jobid, sizeof(cli->tc_jobid)); + cli->tc_nid = req->rq_peer.nid; + cli->tc_opcode = opc; + cli->tc_id = id; + } +} + static struct nrs_tbf_client * nrs_tbf_cli_find(struct nrs_tbf_head *head, struct ptlrpc_request *req) { struct nrs_tbf_client *cli; struct cfs_hash *hs = head->th_cli_hash; struct cfs_hash_bd bd; - char keystr[NRS_TBF_KEY_LEN] = { '\0' }; - const char *jobid; - __u32 opc; + char 
keystr[NRS_TBF_KEY_LEN]; - jobid = lustre_msg_get_jobid(req->rq_reqmsg); - if (jobid == NULL) - jobid = NRS_TBF_JOBID_NULL; - opc = lustre_msg_get_opc(req->rq_reqmsg); - snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, - libcfs_nid2str(req->rq_peer.nid), opc); - LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); + nrs_tbf_cli_gen_key(NULL, req, keystr, sizeof(keystr)); cfs_hash_bd_get_and_lock(hs, (void *)keystr, &bd, 1); cli = nrs_tbf_cli_hash_lookup(hs, &bd, keystr); cfs_hash_bd_unlock(hs, &bd, 1); @@ -1506,22 +1736,19 @@ nrs_tbf_generic_cli_init(struct nrs_tbf_client *cli, struct ptlrpc_request *req) { char keystr[NRS_TBF_KEY_LEN]; - const char *jobid; - __u32 opc; - jobid = lustre_msg_get_jobid(req->rq_reqmsg); - if (jobid == NULL) - jobid = NRS_TBF_JOBID_NULL; - opc = lustre_msg_get_opc(req->rq_reqmsg); - snprintf(keystr, sizeof(keystr), "%s_%s_%d", jobid, - libcfs_nid2str(req->rq_peer.nid), opc); + nrs_tbf_cli_gen_key(cli, req, keystr, sizeof(keystr)); +} - LASSERT(strlen(keystr) < NRS_TBF_KEY_LEN); - INIT_LIST_HEAD(&cli->tc_lru); - memcpy(cli->tc_key, keystr, strlen(keystr)); - memcpy(cli->tc_jobid, jobid, strlen(jobid)); - cli->tc_nid = req->rq_peer.nid; - cli->tc_opcode = opc; +static void +nrs_tbf_id_list_free(struct list_head *uid_list) +{ + struct nrs_tbf_id *nti_id, *n; + + list_for_each_entry_safe(nti_id, n, uid_list, nti_linkage) { + list_del_init(&nti_id->nti_linkage); + OBD_FREE_PTR(nti_id); + } } static void @@ -1539,6 +1766,10 @@ nrs_tbf_expression_free(struct nrs_tbf_expression *expr) case NRS_TBF_FIELD_OPCODE: CFS_FREE_BITMAP(expr->te_opcodes); break; + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + nrs_tbf_id_list_free(&expr->te_cond); + break; default: LBUG(); } @@ -1598,6 +1829,9 @@ nrs_tbf_check_field(struct cfs_lstr *field, char *str) static int nrs_tbf_opcode_list_parse(char *str, int len, struct cfs_bitmap **bitmaptr); +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif); static int nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) @@ -1637,8 +1871,23 @@ nrs_tbf_expression_parse(struct cfs_lstr *src, struct list_head *cond_list) &expr->te_opcodes) < 0) GOTO(out, rc = -EINVAL); expr->te_field = NRS_TBF_FIELD_OPCODE; - } else + } else if (nrs_tbf_check_field(&field, "uid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_UID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_UID; + } else if (nrs_tbf_check_field(&field, "gid")) { + if (nrs_tbf_id_list_parse(src->ls_str, + src->ls_len, + &expr->te_cond, + NRS_TBF_FLAG_GID) < 0) + GOTO(out, rc = -EINVAL); + expr->te_field = NRS_TBF_FIELD_GID; + } else { GOTO(out, rc = -EINVAL); + } list_add_tail(&expr->te_linkage, cond_list); return 0; @@ -1719,6 +1968,9 @@ nrs_tbf_generic_parse(struct nrs_tbf_cmd *cmd, const char *id) return rc; } +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id); + static int nrs_tbf_expression_match(struct nrs_tbf_expression *expr, struct nrs_tbf_rule *rule, @@ -1731,6 +1983,9 @@ nrs_tbf_expression_match(struct nrs_tbf_expression *expr, return nrs_tbf_jobid_list_match(&expr->te_cond, cli->tc_jobid); case NRS_TBF_FIELD_OPCODE: return cfs_bitmap_check(expr->te_opcodes, cli->tc_opcode); + case NRS_TBF_FIELD_UID: + case NRS_TBF_FIELD_GID: + return nrs_tbf_id_list_match(&expr->te_cond, cli->tc_id); default: return 0; } @@ -1868,11 +2123,6 @@ static void *nrs_tbf_opcode_hop_key(struct hlist_node *hnode) return &cli->tc_opcode; } -static 
void *nrs_tbf_opcode_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct nrs_tbf_client, tc_hnode); -} - static void nrs_tbf_opcode_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) { @@ -1911,7 +2161,7 @@ static struct cfs_hash_ops nrs_tbf_opcode_hash_ops = { .hs_hash = nrs_tbf_opcode_hop_hash, .hs_keycmp = nrs_tbf_opcode_hop_keycmp, .hs_key = nrs_tbf_opcode_hop_key, - .hs_object = nrs_tbf_opcode_hop_object, + .hs_object = nrs_tbf_hop_object, .hs_get = nrs_tbf_opcode_hop_get, .hs_put = nrs_tbf_opcode_hop_put, .hs_put_locked = nrs_tbf_opcode_hop_put, @@ -2127,6 +2377,340 @@ struct nrs_tbf_ops nrs_tbf_opcode_ops = { .o_rule_fini = nrs_tbf_opcode_rule_fini, }; +static unsigned nrs_tbf_id_hop_hash(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(struct tbf_id), mask); +} + +static int nrs_tbf_id_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + const struct tbf_id *opc = key; + enum nrs_tbf_flag ntf; + struct nrs_tbf_client *cli = hlist_entry(hnode, struct nrs_tbf_client, + tc_hnode); + ntf = opc->ti_type & cli->tc_id.ti_type; + if ((ntf & NRS_TBF_FLAG_UID) && opc->ti_uid != cli->tc_id.ti_uid) + return 0; + + if ((ntf & NRS_TBF_FLAG_GID) && opc->ti_gid != cli->tc_id.ti_gid) + return 0; + + return 1; +} + +static void *nrs_tbf_id_hop_key(struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + return &cli->tc_id; +} + +static void nrs_tbf_id_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_inc(&cli->tc_ref); +} + +static void nrs_tbf_id_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + atomic_dec(&cli->tc_ref); +} + +static void +nrs_tbf_id_hop_exit(struct cfs_hash *hs, struct hlist_node *hnode) + +{ + struct nrs_tbf_client *cli = hlist_entry(hnode, + struct nrs_tbf_client, + tc_hnode); + + LASSERT(atomic_read(&cli->tc_ref) == 0); + nrs_tbf_cli_fini(cli); +} + +static struct cfs_hash_ops nrs_tbf_id_hash_ops = { + .hs_hash = nrs_tbf_id_hop_hash, + .hs_keycmp = nrs_tbf_id_hop_keycmp, + .hs_key = nrs_tbf_id_hop_key, + .hs_object = nrs_tbf_hop_object, + .hs_get = nrs_tbf_id_hop_get, + .hs_put = nrs_tbf_id_hop_put, + .hs_put_locked = nrs_tbf_id_hop_put, + .hs_exit = nrs_tbf_id_hop_exit, +}; + +static int +nrs_tbf_id_startup(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_head *head) +{ + struct nrs_tbf_cmd start; + int rc; + + head->th_cli_hash = cfs_hash_create("nrs_tbf_id_hash", + NRS_TBF_NID_BITS, + NRS_TBF_NID_BITS, + NRS_TBF_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nrs_tbf_id_hash_ops, + CFS_HASH_RW_BKTLOCK); + if (head->th_cli_hash == NULL) + return -ENOMEM; + + memset(&start, 0, sizeof(start)); + start.u.tc_start.ts_ids_str = "*"; + start.u.tc_start.ts_rpc_rate = tbf_rate; + start.u.tc_start.ts_rule_flags = NTRS_DEFAULT; + start.tc_name = NRS_TBF_DEFAULT_RULE; + INIT_LIST_HEAD(&start.u.tc_start.ts_ids); + rc = nrs_tbf_rule_start(policy, head, &start); + if (rc) { + cfs_hash_putref(head->th_cli_hash); + head->th_cli_hash = NULL; + } + + return rc; +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_find(struct nrs_tbf_head *head, + struct ptlrpc_request *req) +{ + struct tbf_id id; + + LASSERT(head->th_type_flag == NRS_TBF_FLAG_UID || + head->th_type_flag == NRS_TBF_FLAG_GID); + + nrs_tbf_id_cli_set(req, &id, 
head->th_type_flag); + return cfs_hash_lookup(head->th_cli_hash, &id); +} + +static struct nrs_tbf_client * +nrs_tbf_id_cli_findadd(struct nrs_tbf_head *head, + struct nrs_tbf_client *cli) +{ + return cfs_hash_findadd_unique(head->th_cli_hash, &cli->tc_id, + &cli->tc_hnode); +} + +static void +nrs_tbf_uid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_UID); +} + +static void +nrs_tbf_gid_cli_init(struct nrs_tbf_client *cli, + struct ptlrpc_request *req) +{ + nrs_tbf_id_cli_set(req, &cli->tc_id, NRS_TBF_FLAG_GID); +} + +static int +nrs_tbf_id_list_match(struct list_head *id_list, struct tbf_id id) +{ + struct nrs_tbf_id *nti_id; + enum nrs_tbf_flag flag; + + list_for_each_entry(nti_id, id_list, nti_linkage) { + flag = id.ti_type & nti_id->nti_id.ti_type; + if (!flag) + continue; + + if ((flag & NRS_TBF_FLAG_UID) && + (id.ti_uid != nti_id->nti_id.ti_uid)) + continue; + + if ((flag & NRS_TBF_FLAG_GID) && + (id.ti_gid != nti_id->nti_id.ti_gid)) + continue; + + return 1; + } + return 0; +} + +static int +nrs_tbf_id_rule_match(struct nrs_tbf_rule *rule, + struct nrs_tbf_client *cli) +{ + return nrs_tbf_id_list_match(&rule->tr_ids, cli->tc_id); +} + +static void nrs_tbf_id_cmd_fini(struct nrs_tbf_cmd *cmd) +{ + nrs_tbf_id_list_free(&cmd->u.tc_start.ts_ids); + + if (cmd->u.tc_start.ts_ids_str) + OBD_FREE(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str) + 1); +} + +static int +nrs_tbf_id_list_parse(char *str, int len, struct list_head *id_list, + enum nrs_tbf_flag tif) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc = 0; + struct tbf_id id = { 0 }; + ENTRY; + + if (tif != NRS_TBF_FLAG_UID && tif != NRS_TBF_FLAG_GID) + RETURN(-EINVAL); + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(id_list); + while (src.ls_str) { + struct nrs_tbf_id *nti_id; + + if (cfs_gettok(&src, ' ', &res) == 0) + GOTO(out, rc = -EINVAL); + + id.ti_type = tif; + if (tif == NRS_TBF_FLAG_UID) { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_uid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } else { + if (!cfs_str2num_check(res.ls_str, res.ls_len, + &id.ti_gid, 0, (u32)~0U)) + GOTO(out, rc = -EINVAL); + } + + OBD_ALLOC_PTR(nti_id); + if (nti_id == NULL) + GOTO(out, rc = -ENOMEM); + + nti_id->nti_id = id; + list_add_tail(&nti_id->nti_linkage, id_list); + } +out: + if (rc) + nrs_tbf_id_list_free(id_list); + RETURN(rc); +} + +static int nrs_tbf_ug_id_parse(struct nrs_tbf_cmd *cmd, char *id) +{ + struct cfs_lstr src; + int rc; + enum nrs_tbf_flag tif; + + tif = cmd->u.tc_start.ts_valid_type; + + src.ls_str = id; + src.ls_len = strlen(id); + + rc = nrs_tbf_check_id_value(&src, + tif == NRS_TBF_FLAG_UID ? 
"uid" : "gid"); + if (rc) + return rc; + + OBD_ALLOC(cmd->u.tc_start.ts_ids_str, src.ls_len + 1); + if (cmd->u.tc_start.ts_ids_str == NULL) + return -ENOMEM; + + strlcpy(cmd->u.tc_start.ts_ids_str, src.ls_str, src.ls_len + 1); + + rc = nrs_tbf_id_list_parse(cmd->u.tc_start.ts_ids_str, + strlen(cmd->u.tc_start.ts_ids_str), + &cmd->u.tc_start.ts_ids, tif); + if (rc) + nrs_tbf_id_cmd_fini(cmd); + + return rc; +} + +static int +nrs_tbf_id_rule_init(struct ptlrpc_nrs_policy *policy, + struct nrs_tbf_rule *rule, + struct nrs_tbf_cmd *start) +{ + struct nrs_tbf_head *head = rule->tr_head; + int rc = 0; + enum nrs_tbf_flag tif = head->th_type_flag; + int ids_len = strlen(start->u.tc_start.ts_ids_str) + 1; + + LASSERT(start->u.tc_start.ts_ids_str); + INIT_LIST_HEAD(&rule->tr_ids); + + OBD_ALLOC(rule->tr_ids_str, ids_len); + if (rule->tr_ids_str == NULL) + return -ENOMEM; + + strlcpy(rule->tr_ids_str, start->u.tc_start.ts_ids_str, + ids_len); + + if (!list_empty(&start->u.tc_start.ts_ids)) { + rc = nrs_tbf_id_list_parse(rule->tr_ids_str, + strlen(rule->tr_ids_str), + &rule->tr_ids, tif); + if (rc) + CERROR("%ss {%s} illegal\n", + tif == NRS_TBF_FLAG_UID ? "uid" : "gid", + rule->tr_ids_str); + } + if (rc) { + OBD_FREE(rule->tr_ids_str, ids_len); + rule->tr_ids_str = NULL; + } + return rc; +} + +static int +nrs_tbf_id_rule_dump(struct nrs_tbf_rule *rule, struct seq_file *m) +{ + seq_printf(m, "%s {%s} %llu, ref %d\n", rule->tr_name, + rule->tr_ids_str, rule->tr_rpc_rate, + atomic_read(&rule->tr_ref) - 1); + return 0; +} + +static void nrs_tbf_id_rule_fini(struct nrs_tbf_rule *rule) +{ + nrs_tbf_id_list_free(&rule->tr_ids); + if (rule->tr_ids_str != NULL) + OBD_FREE(rule->tr_ids_str, strlen(rule->tr_ids_str) + 1); +} + +struct nrs_tbf_ops nrs_tbf_uid_ops = { + .o_name = NRS_TBF_TYPE_UID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_uid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + +struct nrs_tbf_ops nrs_tbf_gid_ops = { + .o_name = NRS_TBF_TYPE_GID, + .o_startup = nrs_tbf_id_startup, + .o_cli_find = nrs_tbf_id_cli_find, + .o_cli_findadd = nrs_tbf_id_cli_findadd, + .o_cli_put = nrs_tbf_nid_cli_put, + .o_cli_init = nrs_tbf_gid_cli_init, + .o_rule_init = nrs_tbf_id_rule_init, + .o_rule_dump = nrs_tbf_id_rule_dump, + .o_rule_match = nrs_tbf_id_rule_match, + .o_rule_fini = nrs_tbf_id_rule_fini, +}; + static struct nrs_tbf_type nrs_tbf_types[] = { { .ntt_name = NRS_TBF_TYPE_JOBID, @@ -2148,6 +2732,16 @@ static struct nrs_tbf_type nrs_tbf_types[] = { .ntt_flag = NRS_TBF_FLAG_GENERIC, .ntt_ops = &nrs_tbf_generic_ops, }, + { + .ntt_name = NRS_TBF_TYPE_UID, + .ntt_flag = NRS_TBF_FLAG_UID, + .ntt_ops = &nrs_tbf_uid_ops, + }, + { + .ntt_name = NRS_TBF_TYPE_GID, + .ntt_flag = NRS_TBF_FLAG_GID, + .ntt_ops = &nrs_tbf_gid_ops, + }, }; /** @@ -2476,10 +3070,12 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request, nr_u.tbf.tr_list); } else { + struct nrs_tbf_rule *rule = cli->tc_rule; __u64 now = ktime_to_ns(ktime_get()); __u64 passed; __u64 ntoken; __u64 deadline; + __u64 old_resid = 0; deadline = cli->tc_check_time + cli->tc_nsecs; @@ -2487,9 +3083,19 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, passed = now - cli->tc_check_time; ntoken = passed * cli->tc_rpc_rate; 
do_div(ntoken, NSEC_PER_SEC); + ntoken += cli->tc_ntoken; - if (ntoken > cli->tc_depth) + if (rule->tr_flags & NTRS_REALTIME) { + LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); + old_resid = cli->tc_nsecs_resid; + cli->tc_nsecs_resid += passed % cli->tc_nsecs; + if (cli->tc_nsecs_resid > cli->tc_nsecs) { + ntoken++; + cli->tc_nsecs_resid -= cli->tc_nsecs; + } + } else if (ntoken > cli->tc_depth) ntoken = cli->tc_depth; + if (ntoken > 0) { struct ptlrpc_request *req; nrq = list_entry(cli->tc_list.next, @@ -2507,6 +3113,8 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, &cli->tc_node); cli->tc_in_heap = false; } else { + if (!(rule->tr_flags & NTRS_REALTIME)) + cli->tc_deadline = now + cli->tc_nsecs; cfs_binheap_relocate(head->th_binheap, &cli->tc_node); } @@ -2520,6 +3128,15 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, } else { ktime_t time; + if (rule->tr_flags & NTRS_REALTIME) { + cli->tc_deadline = deadline; + cli->tc_nsecs_resid = old_resid; + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + if (node != cfs_binheap_root(head->th_binheap)) + return nrs_tbf_req_get(policy, + peek, force); + } policy->pol_nrs->nrs_throttling = 1; head->th_deadline = deadline; time = ktime_set(0, 0); @@ -2555,6 +3172,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head, th_res); if (list_empty(&cli->tc_list)) { LASSERT(!cli->tc_in_heap); + cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node); if (rc == 0) { cli->tc_in_heap = true; @@ -2562,8 +3180,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, list_add_tail(&nrq->nr_u.tbf.tr_list, &cli->tc_list); if (policy->pol_nrs->nrs_throttling) { - __u64 deadline = cli->tc_check_time + - cli->tc_nsecs; + __u64 deadline = cli->tc_deadline; if ((head->th_deadline > deadline) && (hrtimer_try_to_cancel(&head->th_timer) >= 0)) { @@ -2649,10 +3266,8 @@ static void nrs_tbf_req_stop(struct ptlrpc_nrs_policy *policy, nrq->nr_u.tbf.tr_sequence); } -#ifdef CONFIG_PROC_FS - /** - * lprocfs interface + * debugfs interface */ /** @@ -2719,6 +3334,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_show(struct seq_file *m, void *data) static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) { int rc; + ENTRY; switch (cmd->u.tc_start.ts_valid_type) { case NRS_TBF_FLAG_JOBID: @@ -2733,24 +3349,41 @@ static int nrs_tbf_id_parse(struct nrs_tbf_cmd *cmd, char *token) case NRS_TBF_FLAG_GENERIC: rc = nrs_tbf_generic_parse(cmd, token); break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + rc = nrs_tbf_ug_id_parse(cmd, token); + break; default: RETURN(-EINVAL); } - return rc; + RETURN(rc); } static void nrs_tbf_cmd_fini(struct nrs_tbf_cmd *cmd) { if (cmd->tc_cmd == NRS_CTL_TBF_START_RULE) { - if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_JOBID) + switch (cmd->u.tc_start.ts_valid_type) { + case NRS_TBF_FLAG_JOBID: nrs_tbf_jobid_cmd_fini(cmd); - else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_NID) + break; + case NRS_TBF_FLAG_NID: nrs_tbf_nid_cmd_fini(cmd); - else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_OPCODE) + break; + case NRS_TBF_FLAG_OPCODE: nrs_tbf_opcode_cmd_fini(cmd); - else if (cmd->u.tc_start.ts_valid_type == NRS_TBF_FLAG_GENERIC) + break; + case NRS_TBF_FLAG_GENERIC: nrs_tbf_generic_cmd_fini(cmd); + break; + case NRS_TBF_FLAG_UID: + case NRS_TBF_FLAG_GID: + nrs_tbf_id_cmd_fini(cmd); + break; + default: + CWARN("unknown NRS_TBF_FLAGS:0x%x\n", + cmd->u.tc_start.ts_valid_type); + } } 
} @@ -2804,6 +3437,15 @@ nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) cmd->u.tc_change.tc_next_name = val; else return -EINVAL; + } else if (strcmp(key, "realtime") == 0) { + unsigned long realtime; + + rc = kstrtoul(val, 10, &realtime); + if (rc) + return rc; + + if (realtime > 0) + cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; } else { return -EINVAL; } @@ -2965,7 +3607,7 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, if (count > LPROCFS_WR_NRS_TBF_MAX_CMD - 1) GOTO(out_free_kernbuff, rc = -EINVAL); - if (lprocfs_copy_from_user(file, kernbuf, buffer, count)) + if (copy_from_user(kernbuf, buffer, count)) GOTO(out_free_kernbuff, rc = -EFAULT); val = kernbuf; @@ -3013,7 +3655,8 @@ ptlrpc_lprocfs_nrs_tbf_rule_seq_write(struct file *file, out: return rc ? rc : count; } -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); + +LDEBUGFS_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); /** * Initializes a TBF policy's lprocfs interface for service \a svc @@ -3025,34 +3668,20 @@ LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_tbf_rule); */ static int nrs_tbf_lprocfs_init(struct ptlrpc_service *svc) { - struct lprocfs_vars nrs_tbf_lprocfs_vars[] = { + struct ldebugfs_vars nrs_tbf_lprocfs_vars[] = { { .name = "nrs_tbf_rule", .fops = &ptlrpc_lprocfs_nrs_tbf_rule_fops, .data = svc }, { NULL } }; - if (svc->srv_procroot == NULL) + if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) return 0; - return lprocfs_add_vars(svc->srv_procroot, nrs_tbf_lprocfs_vars, NULL); + return ldebugfs_add_vars(svc->srv_debugfs_entry, nrs_tbf_lprocfs_vars, + NULL); } -/** - * Cleans up a TBF policy's lprocfs interface for service \a svc - * - * \param[in] svc the service - */ -static void nrs_tbf_lprocfs_fini(struct ptlrpc_service *svc) -{ - if (svc->srv_procroot == NULL) - return; - - lprocfs_remove_proc_entry("nrs_tbf_rule", svc->srv_procroot); -} - -#endif /* CONFIG_PROC_FS */ - /** * TBF policy operations */ @@ -3066,10 +3695,7 @@ static const struct ptlrpc_nrs_pol_ops nrs_tbf_ops = { .op_req_enqueue = nrs_tbf_req_add, .op_req_dequeue = nrs_tbf_req_del, .op_req_stop = nrs_tbf_req_stop, -#ifdef CONFIG_PROC_FS .op_lprocfs_init = nrs_tbf_lprocfs_init, - .op_lprocfs_fini = nrs_tbf_lprocfs_fini, -#endif }; /** diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c index 3e97aa6332ed3..81d7ba5812233 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,8 +42,6 @@ #include -#include - #include #include #include @@ -62,13 +60,15 @@ static inline __u32 lustre_msg_hdr_size_v2(__u32 count) __u32 lustre_msg_hdr_size(__u32 magic, __u32 count) { - switch (magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_hdr_size_v2(count); - default: - LASSERTF(0, "incorrect message magic: %08x\n", magic); + LASSERT(count > 0); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); return 0; - } + } } void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, @@ -80,25 +80,26 @@ void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, lustre_set_rep_swabbed(req, index); } -int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - __u32 index) +bool ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + __u32 index) { - if (inout) - return (ptlrpc_req_need_swab(req) && - !lustre_req_swabbed(req, index)); - else - return (ptlrpc_rep_need_swab(req) && - !lustre_rep_swabbed(req, index)); + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + + return (ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index)); } static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, - __u32 version) + enum lustre_msg_version version) { - __u32 ver = lustre_msg_get_version(msg); - return (ver & LUSTRE_VERSION_MASK) != version; + enum lustre_msg_version ver = lustre_msg_get_version(msg); + + return (ver & LUSTRE_VERSION_MASK) != version; } -int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) +int lustre_msg_check_version(struct lustre_msg *msg, + enum lustre_msg_version version) { #define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 switch (msg->lm_magic) { @@ -136,13 +137,14 @@ EXPORT_SYMBOL(lustre_msg_early_size); __u32 lustre_msg_size_v2(int count, __u32 *lengths) { __u32 size; - int i; + int i; - size = lustre_msg_hdr_size_v2(count); - for (i = 0; i < count; i++) - size += cfs_size_round(lengths[i]); + LASSERT(count > 0); + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); - return size; + return size; } EXPORT_SYMBOL(lustre_msg_size_v2); @@ -185,22 +187,25 @@ __u32 lustre_packed_msg_size(struct lustre_msg *msg) return 0; } } +EXPORT_SYMBOL(lustre_packed_msg_size); void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, - char **bufs) + char **bufs) { - char *ptr; - int i; + char *ptr; + int i; - msg->lm_bufcount = count; - /* XXX: lm_secflvr uninitialized here */ - msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + LASSERT(count > 0); - for (i = 0; i < count; i++) - msg->lm_buflens[i] = lens[i]; + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; - if (bufs == NULL) - return; + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; ptr = (char *)msg + lustre_msg_hdr_size_v2(count); for (i = 0; i < count; i++) { @@ -327,24 +332,25 @@ void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) } int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - __u32 *lens, char **bufs, int flags) + __u32 *lens, char **bufs, int flags) { - struct ptlrpc_reply_state *rs; - int msg_len, rc; - ENTRY; + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; - LASSERT(req->rq_reply_state == NULL); + LASSERT(req->rq_reply_state == NULL); + 
LASSERT(count > 0); - if ((flags & LPRFL_EARLY_REPLY) == 0) { + if ((flags & LPRFL_EARLY_REPLY) == 0) { spin_lock(&req->rq_lock); req->rq_packed_final = 1; spin_unlock(&req->rq_lock); - } + } - msg_len = lustre_msg_size_v2(count, lens); - rc = sptlrpc_svc_alloc_rs(req, msg_len); - if (rc) - RETURN(rc); + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); rs = req->rq_reply_state; atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ @@ -356,16 +362,16 @@ int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, INIT_LIST_HEAD(&rs->rs_list); spin_lock_init(&rs->rs_lock); - req->rq_replen = msg_len; - req->rq_reply_state = rs; - req->rq_repmsg = rs->rs_msg; + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; - lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); - lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); - PTLRPC_RS_DEBUG_LRU_ADD(rs); + PTLRPC_RS_DEBUG_LRU_ADD(rs); - RETURN(0); + RETURN(0); } EXPORT_SYMBOL(lustre_pack_reply_v2); @@ -409,28 +415,29 @@ void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, __u32 n, __u32 min_size) { __u32 i, offset, buflen, bufcount; - LASSERT(m != NULL); + LASSERT(m != NULL); + LASSERT(m->lm_bufcount > 0); - bufcount = m->lm_bufcount; - if (unlikely(n >= bufcount)) { - CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", - m, n, bufcount); - return NULL; - } + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } - buflen = m->lm_buflens[n]; - if (unlikely(buflen < min_size)) { - CERROR("msg %p buffer[%d] size %d too small " - "(required %d, opc=%d)\n", m, n, buflen, min_size, - n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); - return NULL; - } + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); + return NULL; + } - offset = lustre_msg_hdr_size_v2(bufcount); - for (i = 0; i < n; i++) - offset += cfs_size_round(m->lm_buflens[i]); + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); - return (char *)m + offset; + return (char *)m + offset; } void *lustre_msg_buf(struct lustre_msg *m, __u32 n, __u32 min_size) @@ -523,52 +530,60 @@ void lustre_free_reply_state(struct ptlrpc_reply_state *rs) static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) { - int swabbed, required_len, i; + int swabbed, required_len, i, buflen; - /* Now we know the sender speaks my language. */ - required_len = lustre_msg_hdr_size_v2(0); - if (len < required_len) { - /* can't even look inside the message */ - CERROR("message length %d too small for lustre_msg\n", len); - return -EINVAL; - } + /* Now we know the sender speaks my language. 
*/ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } - swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); - - if (swabbed) { - __swab32s(&m->lm_magic); - __swab32s(&m->lm_bufcount); - __swab32s(&m->lm_secflvr); - __swab32s(&m->lm_repsize); - __swab32s(&m->lm_cksum); - __swab32s(&m->lm_flags); - CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); - CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); - } + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); - required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); - if (len < required_len) { - /* didn't receive all the buffer lengths */ - CERROR ("message length %d too small for %d buflens\n", - len, m->lm_bufcount); - return -EINVAL; - } + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } - for (i = 0; i < m->lm_bufcount; i++) { - if (swabbed) - __swab32s(&m->lm_buflens[i]); - required_len += cfs_size_round(m->lm_buflens[i]); - } + if (m->lm_bufcount == 0 || m->lm_bufcount > PTLRPC_MAX_BUFCOUNT) { + CERROR("message bufcount %d is not valid\n", m->lm_bufcount); + return -EINVAL; + } + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } - if (len < required_len) { - CERROR("len: %d, required_len %d\n", len, required_len); - CERROR("bufcount: %d\n", m->lm_bufcount); - for (i = 0; i < m->lm_bufcount; i++) - CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); - return -EINVAL; - } + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + buflen = cfs_size_round(m->lm_buflens[i]); + if (buflen < 0 || buflen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer %d length %d is not valid\n", i, buflen); + return -EINVAL; + } + required_len += buflen; + } + if (len < required_len || required_len > PTLRPC_MAX_BUFLEN) { + CERROR("len: %d, required_len %d, bufcount: %d\n", + len, required_len, m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } - return swabbed; + return swabbed; } int __lustre_unpack_msg(struct lustre_msg *m, int len) @@ -757,6 +772,11 @@ char *lustre_msg_string(struct lustre_msg *m, __u32 index, __u32 max_len) "msg %p buffer[%d] len %d\n", m, index, blen); return NULL; } + if (blen > PTLRPC_MAX_BUFLEN) { + CERROR("buffer length of msg %p buffer[%d] is invalid(%d)\n", + m, index, blen); + return NULL; + } if (max_len == 0) { if (slen != blen - 1) { @@ -802,7 +822,7 @@ static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) sizeof(struct ptlrpc_body_v2)); } -__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +enum lustre_msghdr lustre_msghdr_get_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: @@ -836,7 +856,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* Fall through */ + fallthrough; default: /* flags might be printed in debug code while message * uninitialized */ @@ -880,7 +900,8 @@ void lustre_msg_clear_flags(struct lustre_msg *msg, __u32 
flags) case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); + pb->pb_flags &= ~flags; + return; } default: @@ -899,7 +920,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* Fall through */ + fallthrough; default: return 0; } @@ -955,7 +976,7 @@ __u32 lustre_msg_get_type(struct lustre_msg *msg) } EXPORT_SYMBOL(lustre_msg_get_type); -__u32 lustre_msg_get_version(struct lustre_msg *msg) +enum lustre_msg_version lustre_msg_get_version(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -1104,7 +1125,7 @@ int lustre_msg_get_status(struct lustre_msg *msg) return pb->pb_status; CERROR("invalid msg %p: no ptlrpc body!\n", msg); } - /* Fall through */ + fallthrough; default: /* status might be printed in debug code while message * uninitialized */ @@ -1214,11 +1235,12 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg) } } -__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +timeout_t lustre_msg_get_timeout(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1231,11 +1253,12 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg) } } -__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +timeout_t lustre_msg_get_service_timeout(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (pb == NULL) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; @@ -1465,11 +1488,13 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) } } -void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +void lustre_msg_set_timeout(struct lustre_msg *msg, timeout_t timeout) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(timeout >= 0); LASSERTF(pb != NULL, "invalid msg %p: no ptlrpc body!\n", msg); pb->pb_timeout = timeout; return; @@ -1479,13 +1504,16 @@ void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) } } -void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +void lustre_msg_set_service_timeout(struct lustre_msg *msg, + timeout_t service_timeout) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + + LASSERT(service_timeout >= 0); LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_service_time = service_time; + pb->pb_service_time = service_timeout; return; } default: @@ -1511,9 +1539,9 @@ void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); if (jobid != NULL) - memcpy(pb->pb_jobid, jobid, LUSTRE_JOBID_SIZE); + memcpy(pb->pb_jobid, jobid, sizeof(pb->pb_jobid)); else if (pb->pb_jobid[0] == '\0') - lustre_get_jobid(pb->pb_jobid); + lustre_get_jobid(pb->pb_jobid, sizeof(pb->pb_jobid)); return; } default: @@ -1618,39 +1646,40 @@ EXPORT_SYMBOL(do_set_info_async); /* byte flipping routines for all wire types declared in * lustre_idl.h implemented here. 
*/ -void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) -{ - __swab32s (&b->pb_type); - __swab32s (&b->pb_version); - __swab32s (&b->pb_opc); - __swab32s (&b->pb_status); - __swab64s (&b->pb_last_xid); - __swab16s (&b->pb_tag); - __swab64s (&b->pb_last_committed); - __swab64s (&b->pb_transno); - __swab32s (&b->pb_flags); - __swab32s (&b->pb_op_flags); - __swab32s (&b->pb_conn_cnt); - __swab32s (&b->pb_timeout); - __swab32s (&b->pb_service_time); - __swab32s (&b->pb_limit); - __swab64s (&b->pb_slv); - __swab64s (&b->pb_pre_versions[0]); - __swab64s (&b->pb_pre_versions[1]); - __swab64s (&b->pb_pre_versions[2]); - __swab64s (&b->pb_pre_versions[3]); - __swab64s(&b->pb_mbits); - CLASSERT(offsetof(typeof(*b), pb_padding0) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding1) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding64_0) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding64_1) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding64_2) != 0); +void lustre_swab_ptlrpc_body(struct ptlrpc_body *body) +{ + __swab32s(&body->pb_type); + __swab32s(&body->pb_version); + __swab32s(&body->pb_opc); + __swab32s(&body->pb_status); + __swab64s(&body->pb_last_xid); + __swab16s(&body->pb_tag); + CLASSERT(offsetof(typeof(*body), pb_padding0) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding1) != 0); + __swab64s(&body->pb_last_committed); + __swab64s(&body->pb_transno); + __swab32s(&body->pb_flags); + __swab32s(&body->pb_op_flags); + __swab32s(&body->pb_conn_cnt); + __swab32s(&body->pb_timeout); + __swab32s(&body->pb_service_time); + __swab32s(&body->pb_limit); + __swab64s(&body->pb_slv); + __swab64s(&body->pb_pre_versions[0]); + __swab64s(&body->pb_pre_versions[1]); + __swab64s(&body->pb_pre_versions[2]); + __swab64s(&body->pb_pre_versions[3]); + __swab64s(&body->pb_mbits); + CLASSERT(offsetof(typeof(*body), pb_padding64_0) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding64_1) != 0); + CLASSERT(offsetof(typeof(*body), pb_padding64_2) != 0); /* While we need to maintain compatibility between * clients and servers without ptlrpc_body_v2 (< 2.3) * do not swab any fields beyond pb_jobid, as we are * using this swab function for both ptlrpc_body * and ptlrpc_body_v2. 
*/ - CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); + /* pb_jobid is an ASCII string and should not be swabbed */ + CLASSERT(offsetof(typeof(*body), pb_jobid) != 0); } void lustre_swab_connect(struct obd_connect_data *ocd) @@ -1730,7 +1759,7 @@ void lustre_swab_obdo (struct obdo *o) __swab32s(&o->o_stripe_idx); __swab32s(&o->o_parent_ver); lustre_swab_ost_layout(&o->o_layout); - CLASSERT(offsetof(typeof(*o), o_padding_3) != 0); + __swab32s(&o->o_layout_version); __swab32s(&o->o_uid_h); __swab32s(&o->o_gid_h); __swab64s(&o->o_data_version); @@ -1744,26 +1773,26 @@ EXPORT_SYMBOL(lustre_swab_obdo); void lustre_swab_obd_statfs (struct obd_statfs *os) { - __swab64s (&os->os_type); - __swab64s (&os->os_blocks); - __swab64s (&os->os_bfree); - __swab64s (&os->os_bavail); - __swab64s (&os->os_files); - __swab64s (&os->os_ffree); - /* no need to swab os_fsid */ - __swab32s (&os->os_bsize); - __swab32s (&os->os_namelen); - __swab64s (&os->os_maxbytes); - __swab32s (&os->os_state); - CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); - CLASSERT(offsetof(typeof(*os), os_spare2) != 0); - CLASSERT(offsetof(typeof(*os), os_spare3) != 0); - CLASSERT(offsetof(typeof(*os), os_spare4) != 0); - CLASSERT(offsetof(typeof(*os), os_spare5) != 0); - CLASSERT(offsetof(typeof(*os), os_spare6) != 0); - CLASSERT(offsetof(typeof(*os), os_spare7) != 0); - CLASSERT(offsetof(typeof(*os), os_spare8) != 0); - CLASSERT(offsetof(typeof(*os), os_spare9) != 0); + __swab64s(&os->os_type); + __swab64s(&os->os_blocks); + __swab64s(&os->os_bfree); + __swab64s(&os->os_bavail); + __swab64s(&os->os_files); + __swab64s(&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s(&os->os_bsize); + __swab32s(&os->os_namelen); + __swab64s(&os->os_maxbytes); + __swab32s(&os->os_state); + __swab32s(&os->os_fprecreated); + __swab32s(&os->os_granted); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); } void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) @@ -1868,7 +1897,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab64s(&b->mbo_atime); __swab64s(&b->mbo_ctime); __swab64s(&b->mbo_blocks); - __swab64s(&b->mbo_ioepoch); + __swab64s(&b->mbo_version); __swab64s(&b->mbo_t_state); __swab32s(&b->mbo_fsuid); __swab32s(&b->mbo_fsgid); @@ -1879,7 +1908,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab32s(&b->mbo_flags); __swab32s(&b->mbo_rdev); __swab32s(&b->mbo_nlink); - CLASSERT(offsetof(typeof(*b), mbo_unused2) != 0); + __swab32s(&b->mbo_layout_gen); __swab32s(&b->mbo_suppgid); __swab32s(&b->mbo_eadatasize); __swab32s(&b->mbo_aclsize); @@ -1888,8 +1917,8 @@ void lustre_swab_mdt_body (struct mdt_body *b) __swab32s(&b->mbo_uid_h); __swab32s(&b->mbo_gid_h); __swab32s(&b->mbo_projid); - CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0); - CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0); + __swab64s(&b->mbo_dom_size); + __swab64s(&b->mbo_dom_blocks); CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0); CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0); @@ -1897,7 +1926,7 @@ void lustre_swab_mdt_body (struct mdt_body *b) void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) { - /* mio_handle is opaque */ + /* mio_open_handle is opaque */ CLASSERT(offsetof(typeof(*b), mio_unused1) != 
0); CLASSERT(offsetof(typeof(*b), mio_unused2) != 0); CLASSERT(offsetof(typeof(*b), mio_padding) != 0); @@ -1905,38 +1934,39 @@ void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) void lustre_swab_mgs_target_info(struct mgs_target_info *mti) { - int i; - __swab32s(&mti->mti_lustre_ver); - __swab32s(&mti->mti_stripe_index); - __swab32s(&mti->mti_config_ver); - __swab32s(&mti->mti_flags); - __swab32s(&mti->mti_instance); - __swab32s(&mti->mti_nid_count); - CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); - for (i = 0; i < MTI_NIDS_MAX; i++) - __swab64s(&mti->mti_nids[i]); + int i; + + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); } void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) { __u8 i; - __swab64s(&entry->mne_version); - __swab32s(&entry->mne_instance); - __swab32s(&entry->mne_index); - __swab32s(&entry->mne_length); - - /* mne_nid_(count|type) must be one byte size because we're gonna - * access it w/o swapping. */ - CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); - CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); - - /* remove this assertion if ipv6 is supported. */ - LASSERT(entry->mne_nid_type == 0); - for (i = 0; i < entry->mne_nid_count; i++) { - CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); - __swab64s(&entry->u.nids[i]); - } + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. 
*/ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } } EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); @@ -2003,21 +2033,32 @@ static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) __swab32s(&fm_extent->fe_device); } +static void lustre_swab_fiemap_hdr(struct fiemap *fiemap) +{ + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); +} + void lustre_swab_fiemap(struct fiemap *fiemap) { __u32 i; - __swab64s(&fiemap->fm_start); - __swab64s(&fiemap->fm_length); - __swab32s(&fiemap->fm_flags); - __swab32s(&fiemap->fm_mapped_extents); - __swab32s(&fiemap->fm_extent_count); - __swab32s(&fiemap->fm_reserved); + lustre_swab_fiemap_hdr(fiemap); for (i = 0; i < fiemap->fm_mapped_extents; i++) lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); } +void lustre_swab_fiemap_info_key(struct ll_fiemap_info_key *fiemap_info) +{ + lustre_swab_obdo(&fiemap_info->lfik_oa); + lustre_swab_fiemap_hdr(&fiemap_info->lfik_fiemap); +} + void lustre_swab_idx_info(struct idx_info *ii) { __swab32s(&ii->ii_magic); @@ -2065,6 +2106,7 @@ void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) __swab32s(&rr->rr_flags); __swab32s(&rr->rr_flags_h); __swab32s(&rr->rr_umask); + __swab16s(&rr->rr_mirror_id); CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); }; @@ -2119,14 +2161,37 @@ void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) } EXPORT_SYMBOL(lustre_swab_lmv_mds_md); +void lustre_swab_lmv_user_md_objects(struct lmv_user_mds_data *lmd, + int stripe_count) +{ + int i; + + for (i = 0; i < stripe_count; i++) + __swab32s(&(lmd[i].lum_mds)); +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md_objects); + + void lustre_swab_lmv_user_md(struct lmv_user_md *lum) { + __u32 count = lum->lum_stripe_count; + __swab32s(&lum->lum_magic); __swab32s(&lum->lum_stripe_count); __swab32s(&lum->lum_stripe_offset); __swab32s(&lum->lum_hash_type); __swab32s(&lum->lum_type); CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); + switch (lum->lum_magic) { + case LMV_USER_MAGIC_SPECIFIC: + count = lum->lum_stripe_count; + fallthrough; + case __swab32(LMV_USER_MAGIC_SPECIFIC): + lustre_swab_lmv_user_md_objects(lum->lum_objects, count); + break; + default: + break; + } } EXPORT_SYMBOL(lustre_swab_lmv_user_md); @@ -2186,6 +2251,7 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen); CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags); CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count); + CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count); for (i = 0; i < comp_v1->lcm_entry_count; i++) { struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i]; @@ -2194,6 +2260,9 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum, CDEBUG(lvl, "\tentry %d:\n", i); CDEBUG(lvl, "\tlcme_id: %#x\n", ent->lcme_id); CDEBUG(lvl, "\tlcme_flags: %#x\n", ent->lcme_flags); + if (ent->lcme_flags & LCME_FL_NOSYNC) + CDEBUG(lvl, "\tlcme_timestamp: %llu\n", + ent->lcme_timestamp); CDEBUG(lvl, "\tlcme_extent.e_start: %llu\n", ent->lcme_extent.e_start); CDEBUG(lvl, "\tlcme_extent.e_end: %llu\n", @@ -2267,6 +2336,7 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) __swab32s(&lum->lcm_layout_gen); __swab16s(&lum->lcm_flags); 
__swab16s(&lum->lcm_entry_count); + __swab16s(&lum->lcm_mirror_count); CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0); CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0); @@ -2281,11 +2351,13 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) } __swab32s(&ent->lcme_id); __swab32s(&ent->lcme_flags); + __swab64s(&ent->lcme_timestamp); __swab64s(&ent->lcme_extent.e_start); __swab64s(&ent->lcme_extent.e_end); __swab32s(&ent->lcme_offset); __swab32s(&ent->lcme_size); - CLASSERT(offsetof(typeof(*ent), lcme_padding) != 0); + __swab32s(&ent->lcme_layout_gen); + CLASSERT(offsetof(typeof(*ent), lcme_padding_1) != 0); v1 = (struct lov_user_md_v1 *)((char *)lum + off); stripe_count = v1->lmm_stripe_count; @@ -2314,20 +2386,6 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum) } EXPORT_SYMBOL(lustre_swab_lov_comp_md_v1); -void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) -{ - ENTRY; - CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); - __swab32s(&lmm->lmm_magic); - __swab32s(&lmm->lmm_pattern); - lustre_swab_lmm_oi(&lmm->lmm_oi); - __swab32s(&lmm->lmm_stripe_size); - __swab16s(&lmm->lmm_stripe_count); - __swab16s(&lmm->lmm_layout_gen); - EXIT; -} -EXPORT_SYMBOL(lustre_swab_lov_mds_md); - void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, int stripe_count) { @@ -2342,6 +2400,83 @@ void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, } EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); +void lustre_swab_lov_user_md(struct lov_user_md *lum, size_t size) +{ + struct lov_user_md_v1 *v1; + struct lov_user_md_v3 *v3; + __u16 stripe_count; + ENTRY; + + CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); + switch (lum->lmm_magic) { + case __swab32(LOV_MAGIC_V1): + case LOV_USER_MAGIC_V1: + { + v1 = (struct lov_user_md_v1 *)lum; + stripe_count = v1->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V1) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v1(v1); + if (size > sizeof(*v1)) + lustre_swab_lov_user_md_objects(v1->lmm_objects, + stripe_count); + + break; + } + case __swab32(LOV_MAGIC_V3): + case LOV_USER_MAGIC_V3: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_V3) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + if (size > sizeof(*v3)) + lustre_swab_lov_user_md_objects(v3->lmm_objects, + stripe_count); + break; + } + case __swab32(LOV_USER_MAGIC_SPECIFIC): + case LOV_USER_MAGIC_SPECIFIC: + { + v3 = (struct lov_user_md_v3 *)lum; + stripe_count = v3->lmm_stripe_count; + + if (lum->lmm_magic != LOV_USER_MAGIC_SPECIFIC) + __swab16s(&stripe_count); + + lustre_swab_lov_user_md_v3(v3); + lustre_swab_lov_user_md_objects(v3->lmm_objects, stripe_count); + break; + } + case __swab32(LOV_MAGIC_COMP_V1): + case LOV_USER_MAGIC_COMP_V1: + lustre_swab_lov_comp_md_v1((struct lov_comp_md_v1 *)lum); + break; + default: + CDEBUG(D_IOCTL, "Invalid LOV magic %08x\n", lum->lmm_magic); + } +} +EXPORT_SYMBOL(lustre_swab_lov_user_md); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) { int i; @@ -2435,54 +2570,51 @@ void dump_obdo(struct obdo *oa) if (valid & OBD_MD_FLFID) CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = 
%#llx\n", oa->o_parent_seq); - if (valid & OBD_MD_FLSIZE) + if (valid & OBD_MD_FLSIZE) CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); - if (valid & OBD_MD_FLMTIME) + if (valid & OBD_MD_FLMTIME) CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); - if (valid & OBD_MD_FLATIME) + if (valid & OBD_MD_FLATIME) CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); - if (valid & OBD_MD_FLCTIME) + if (valid & OBD_MD_FLCTIME) CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); - if (valid & OBD_MD_FLGRANT) + if (valid & OBD_MD_FLGRANT) CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); - if (valid & OBD_MD_FLBLKSZ) - CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); - if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) - CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", - oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | - (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); - if (valid & OBD_MD_FLFLAGS) - CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); - if (valid & OBD_MD_FLNLINK) - CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); - else if (valid & OBD_MD_FLCKSUM) - CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", - oa->o_nlink); - if (valid & OBD_MD_FLGENER) - CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", - oa->o_parent_oid); - if (valid & OBD_MD_FLEPOCH) - CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", - oa->o_ioepoch); - if (valid & OBD_MD_FLFID) { - CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", - oa->o_stripe_idx); - CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", - oa->o_parent_ver); - } - if (valid & OBD_MD_FLHANDLE) + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLPARENT) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", - oa->o_handle.cookie); + oa->o_handle.cookie); } void dump_ost_body(struct ost_body *ob) @@ -2629,12 +2761,17 @@ void lustre_swab_hsm_user_item(struct hsm_user_item *hui) lustre_swab_hsm_extent(&hui->hui_extent); } +void lustre_swab_lu_extent(struct lu_extent *le) +{ + __swab64s(&le->e_start); + __swab64s(&le->e_end); +} + void lustre_swab_layout_intent(struct layout_intent *li) { __swab32s(&li->li_opc); __swab32s(&li->li_flags); - __swab64s(&li->li_start); - __swab64s(&li->li_end); + lustre_swab_lu_extent(&li->li_extent); } void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) @@ -2746,6 +2883,19 @@ void lustre_swab_close_data(struct close_data *cd) __swab64s(&cd->cd_data_version); } +void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync) +{ + int i; + + __swab32s(&resync->resync_count); + /* after swab, resync_count must in CPU endian */ + if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) { + for (i = 0; i < resync->resync_count; i++) + __swab32s(&resync->resync_ids_inline[i]); + } +} +EXPORT_SYMBOL(lustre_swab_close_data_resync_done); + void lustre_swab_lfsck_request(struct lfsck_request *lr) { __swab32s(&lr->lr_event); @@ -2797,6 +2947,18 @@ void lustre_swab_orphan_ent_v2(struct lu_orphan_ent_v2 *ent) } EXPORT_SYMBOL(lustre_swab_orphan_ent_v2); +void lustre_swab_orphan_ent_v3(struct lu_orphan_ent_v3 *ent) +{ + lustre_swab_lu_fid(&ent->loe_key); + lustre_swab_orphan_rec(&ent->loe_rec.lor_rec); + lustre_swab_ost_layout(&ent->loe_rec.lor_layout); + __swab32s(&ent->loe_rec.lor_layout_version); + __swab32s(&ent->loe_rec.lor_range); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_1) != 0); + CLASSERT(offsetof(typeof(ent->loe_rec), lor_padding_2) != 0); +} +EXPORT_SYMBOL(lustre_swab_orphan_ent_v3); + void lustre_swab_ladvise(struct lu_ladvise *ladvise) { __swab16s(&ladvise->lla_advice); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c index 51e17e2c2b459..d0c8fa7a1e6ac 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pers.c @@ -44,6 +44,8 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, int mdidx) { + unsigned int start = desc->bd_mds_off[mdidx]; + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); LASSERT(mdidx < desc->bd_md_max_brw); @@ -51,23 +53,34 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | LNET_MD_PHYS))); - md->length = max(0, 
desc->bd_iov_count - mdidx * LNET_MAX_IOV); - md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); + /* just send a lnet header */ + if (mdidx >= desc->bd_md_count) { + if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) + md->options |= LNET_MD_KIOV; + else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) + md->options |= LNET_MD_IOVEC; + md->length = 0; + md->start = NULL; + return; + } + + if (mdidx == (desc->bd_md_count - 1)) + md->length = desc->bd_iov_count - start; + else + md->length = desc->bd_mds_off[mdidx + 1] - start; if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) { md->options |= LNET_MD_KIOV; if (GET_ENC_KIOV(desc)) - md->start = &BD_GET_ENC_KIOV(desc, mdidx * - LNET_MAX_IOV); + md->start = &BD_GET_ENC_KIOV(desc, start); else - md->start = &BD_GET_KIOV(desc, mdidx * LNET_MAX_IOV); + md->start = &BD_GET_KIOV(desc, start); } else if (ptlrpc_is_bulk_desc_kvec(desc->bd_type)) { md->options |= LNET_MD_IOVEC; if (GET_ENC_KVEC(desc)) - md->start = &BD_GET_ENC_KVEC(desc, mdidx * - LNET_MAX_IOV); + md->start = &BD_GET_ENC_KVEC(desc, start); else - md->start = &BD_GET_KVEC(desc, mdidx * LNET_MAX_IOV); + md->start = &BD_GET_KVEC(desc, start); } } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c index 15fb0965241eb..d965c0838d8d5 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pinger.c @@ -37,6 +37,7 @@ #define DEBUG_SUBSYSTEM S_RPC #include +#include #include #include #include "ptlrpc_internal.h" @@ -48,8 +49,6 @@ MODULE_PARM_DESC(suppress_pings, "Suppress pings"); struct mutex pinger_mutex; static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports); -static struct list_head timeout_list = - LIST_HEAD_INIT(timeout_list); int ptlrpc_pinger_suppress_pings() { @@ -91,11 +90,51 @@ int ptlrpc_obd_ping(struct obd_device *obd) } EXPORT_SYMBOL(ptlrpc_obd_ping); +static bool ptlrpc_check_import_is_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + time64_t now; + + if (!imp->imp_idle_timeout) + return false; + + if (atomic_read(&imp->imp_reqs) > 0) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + now = ktime_get_real_seconds(); + if (now - imp->imp_last_reply_time < imp->imp_idle_timeout) + return false; + + return true; +} + +static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ +#ifdef CONFIG_LUSTRE_FS_PINGER + time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + + if (imp->imp_state == LUSTRE_IMP_DISCON) { + time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = ktime_get_seconds() + time; +#endif /* CONFIG_LUSTRE_FS_PINGER */ +} + static int ptlrpc_ping(struct obd_import *imp) { struct ptlrpc_request *req; ENTRY; + if (ptlrpc_check_import_is_idle(imp)) + RETURN(ptlrpc_disconnect_and_idle_import(imp)); + req = ptlrpc_prep_ping(imp); if (req == NULL) { CERROR("OOM trying to ping %s->%s\n", @@ -106,28 +145,20 @@ static int ptlrpc_ping(struct obd_import *imp) DEBUG_REQ(D_INFO, req, "pinging %s->%s", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* Updating imp_next_ping early, it allows pinger_check_timeout to + * see an actual time for next awake. request_out_callback update + * happens at another thread, and ptlrpc_pinger_main may sleep + * already. 
+ */ + ptlrpc_update_next_ping(imp, 0); ptlrpcd_add_req(req); RETURN(0); } -static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) -{ -#ifdef ENABLE_PINGER - int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; - if (imp->imp_state == LUSTRE_IMP_DISCON) { - int dtime = max_t(int, CONNECTION_SWITCH_MIN, - AT_OFF ? 0 : - at_get(&imp->imp_at.iat_net_latency)); - time = min(time, dtime); - } - imp->imp_next_ping = cfs_time_shift(time); -#endif /* ENABLE_PINGER */ -} - void ptlrpc_ping_import_soon(struct obd_import *imp) { - imp->imp_next_ping = cfs_time_current(); + imp->imp_next_ping = ktime_get_seconds(); } static inline int imp_is_deactive(struct obd_import *imp) @@ -136,34 +167,36 @@ static inline int imp_is_deactive(struct obd_import *imp) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); } -static inline int ptlrpc_next_reconnect(struct obd_import *imp) +static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp) { - if (imp->imp_server_timeout) - return cfs_time_shift(obd_timeout / 2); - else - return cfs_time_shift(obd_timeout); + return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT; } -static cfs_duration_t pinger_check_timeout(cfs_time_t time) +static s32 pinger_check_timeout(time64_t time) { - struct timeout_item *item; - cfs_time_t timeout = PING_INTERVAL; + s32 timeout = PING_INTERVAL; + s32 next_timeout; + time64_t now; + struct list_head *iter; + struct obd_import *imp; - /* This list is sorted in increasing timeout order */ mutex_lock(&pinger_mutex); - list_for_each_entry(item, &timeout_list, ti_chain) { - int ti_timeout = item->ti_timeout; - if (timeout > ti_timeout) - timeout = ti_timeout; - break; + now = ktime_get_seconds(); + /* Process imports to find a nearest next ping */ + list_for_each(iter, &pinger_imports) { + imp = list_entry(iter, struct obd_import, imp_pinger_chain); + if (!imp->imp_pingable || imp->imp_next_ping < now) + continue; + next_timeout = imp->imp_next_ping - now; + /* make sure imp_next_ping in the future from time */ + if (next_timeout > (now - time) && timeout > next_timeout) + timeout = next_timeout; } mutex_unlock(&pinger_mutex); - return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), - cfs_time_current()); + return timeout - (now - time); } - static bool ir_up; void ptlrpc_pinger_ir_up(void) @@ -181,7 +214,7 @@ void ptlrpc_pinger_ir_down(void) EXPORT_SYMBOL(ptlrpc_pinger_ir_down); static void ptlrpc_pinger_process_import(struct obd_import *imp, - unsigned long this_ping) + time64_t this_ping) { int level; int force; @@ -200,16 +233,13 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, imp->imp_force_verify = 0; - if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && - !force) { + if (imp->imp_next_ping - 5 >= this_ping && !force) { spin_unlock(&imp->imp_lock); return; } imp->imp_force_next_verify = 0; - spin_unlock(&imp->imp_lock); - CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, "%s->%s: level %s/%u " "force %u force_next %u deactive %u pingable %u suppress %u\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), @@ -219,130 +249,91 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { /* wait for a while before trying recovery again */ imp->imp_next_ping = ptlrpc_next_reconnect(imp); + spin_unlock(&imp->imp_lock); if (!imp->imp_no_pinger_recover) ptlrpc_initiate_recovery(imp); - } else if (level != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov || - imp_is_deactive(imp)) { + } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { CDEBUG(D_HA, "%s->%s: not pinging (in recovery " "or recovery disabled: %s)\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), ptlrpc_import_state_name(level)); - if (force) { - spin_lock(&imp->imp_lock); + if (force) imp->imp_force_verify = 1; - spin_unlock(&imp->imp_lock); - } + spin_unlock(&imp->imp_lock); } else if ((imp->imp_pingable && !suppress) || force_next || force) { + spin_unlock(&imp->imp_lock); ptlrpc_ping(imp); + } else { + spin_unlock(&imp->imp_lock); } } -static int ptlrpc_pinger_main(void *arg) -{ - struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; - ENTRY; +static struct workqueue_struct *pinger_wq; +static void ptlrpc_pinger_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); - /* Record that the thread is running */ - thread_set_flags(thread, SVC_RUNNING); - wake_up(&thread->t_ctl_waitq); +static void ptlrpc_pinger_main(struct work_struct *ws) +{ + time64_t this_ping, time_after_ping; + s32 time_to_next_wake; + struct obd_import *imp; + struct list_head *iter; - /* And now, loop forever, pinging as needed. */ - while (1) { - cfs_time_t this_ping = cfs_time_current(); - struct l_wait_info lwi; - cfs_duration_t time_to_next_wake; - struct timeout_item *item; - struct list_head *iter; + do { + this_ping = ktime_get_seconds(); mutex_lock(&pinger_mutex); - list_for_each_entry(item, &timeout_list, ti_chain) - item->ti_cb(item, item->ti_cb_data); list_for_each(iter, &pinger_imports) { - struct obd_import *imp = list_entry(iter, - struct obd_import, - imp_pinger_chain); - - ptlrpc_pinger_process_import(imp, this_ping); - /* obd_timeout might have changed */ - if (imp->imp_pingable && imp->imp_next_ping && - cfs_time_after(imp->imp_next_ping, - cfs_time_add(this_ping, - cfs_time_seconds(PING_INTERVAL)))) - ptlrpc_update_next_ping(imp, 0); - } + imp = list_entry(iter, struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + imp->imp_next_ping > this_ping + PING_INTERVAL) + ptlrpc_update_next_ping(imp, 0); + } mutex_unlock(&pinger_mutex); - /* update memory usage info */ - obd_update_maxusage(); - - /* Wait until the next ping time, or until we're stopped. */ - time_to_next_wake = pinger_check_timeout(this_ping); - /* The ping sent by ptlrpc_send_rpc may get sent out - say .01 second after this. - ptlrpc_pinger_sending_on_import will then set the - next ping time to next_ping + .01 sec, which means - we will SKIP the next ping at next_ping, and the - ping will get sent 2 timeouts from now! Beware. 
*/ - CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (%ld)\n", - time_to_next_wake, - cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL))); - if (time_to_next_wake > 0) { - lwi = LWI_TIMEOUT(max_t(cfs_duration_t, - time_to_next_wake, - cfs_time_seconds(1)), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopping(thread) || - thread_is_event(thread), - &lwi); - if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { - EXIT; - break; - } else { - /* woken after adding import to reset timer */ - thread_test_and_clear_flags(thread, SVC_EVENT); - } - } - } - - thread_set_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); - return 0; + time_after_ping = ktime_get_seconds(); + /* update memory usage info */ + obd_update_maxusage(); + + if ((ktime_get_seconds() - this_ping - 3) > PING_INTERVAL) + CDEBUG(D_HA, "long time to ping: %lld, %lld, %lld\n", + this_ping, time_after_ping, ktime_get_seconds()); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + * say .01 second after this. + * ptlrpc_pinger_sending_on_import will then set the + * next ping time to next_ping + .01 sec, which means + * we will SKIP the next ping at next_ping, and the + * ping will get sent 2 timeouts from now! Beware. */ + CDEBUG(D_INFO, "next wakeup in %d (%lld)\n", + time_to_next_wake, this_ping + PING_INTERVAL); + } while (time_to_next_wake <= 0); + + queue_delayed_work(pinger_wq, &ping_work, + cfs_time_seconds(max(time_to_next_wake, 1))); } -static struct ptlrpc_thread pinger_thread; - int ptlrpc_start_pinger(void) { - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - int rc; -#ifndef ENABLE_PINGER - return 0; -#endif - ENTRY; - - if (!thread_is_init(&pinger_thread) && - !thread_is_stopped(&pinger_thread)) - RETURN(-EALREADY); - - init_waitqueue_head(&pinger_thread.t_ctl_waitq); - - strcpy(pinger_thread.t_name, "ll_ping"); +#ifdef ENABLE_PINGER + if (pinger_wq) + return -EALREADY; - task = kthread_run(ptlrpc_pinger_main, &pinger_thread, - pinger_thread.t_name); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("cannot start pinger thread: rc = %d\n", rc); - RETURN(rc); + pinger_wq = alloc_workqueue("ptlrpc_pinger", 0, 1); + if (!pinger_wq) { + CERROR("cannot start pinger workqueue\n"); + return -ENOMEM; } - l_wait_event(pinger_thread.t_ctl_waitq, - thread_is_running(&pinger_thread), &lwi); + queue_delayed_work(pinger_wq, &ping_work, 0); if (suppress_pings) CWARN("Pings will be suppressed at the request of the " @@ -350,32 +341,21 @@ int ptlrpc_start_pinger(void) "additional requirements described in the manual. 
" "(Search for the \"suppress_pings\" kernel module " "parameter.)\n"); - - RETURN(0); +#endif + return 0; } -int ptlrpc_pinger_remove_timeouts(void); - int ptlrpc_stop_pinger(void) { - struct l_wait_info lwi = { 0 }; -#ifndef ENABLE_PINGER - return 0; -#endif - ENTRY; - - if (thread_is_init(&pinger_thread) || - thread_is_stopped(&pinger_thread)) - RETURN(-EALREADY); - - ptlrpc_pinger_remove_timeouts(); - - thread_set_flags(&pinger_thread, SVC_STOPPING); - wake_up(&pinger_thread.t_ctl_waitq); +#ifdef ENABLE_PINGER + if (!pinger_wq) + return -EALREADY; - l_wait_event(pinger_thread.t_ctl_waitq, - thread_is_stopped(&pinger_thread), &lwi); - RETURN(0); + cancel_delayed_work_sync(&ping_work); + destroy_workqueue(pinger_wq); + pinger_wq = NULL; +#endif + return 0; } void ptlrpc_pinger_sending_on_import(struct obd_import *imp) @@ -440,129 +420,10 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) } EXPORT_SYMBOL(ptlrpc_pinger_del_import); -/** - * Register a timeout callback to the pinger list, and the callback will - * be called when timeout happens. - */ -static struct timeout_item *ptlrpc_new_timeout(int time, - enum timeout_event event, - timeout_cb_t cb, void *data) -{ - struct timeout_item *ti; - - OBD_ALLOC_PTR(ti); - if (!ti) - return(NULL); - - INIT_LIST_HEAD(&ti->ti_obd_list); - INIT_LIST_HEAD(&ti->ti_chain); - ti->ti_timeout = time; - ti->ti_event = event; - ti->ti_cb = cb; - ti->ti_cb_data = data; - - return ti; -} - -/** - * Register timeout event on the the pinger thread. - * Note: the timeout list is an sorted list with increased timeout value. - */ -static struct timeout_item* -ptlrpc_pinger_register_timeout(int time, enum timeout_event event, - timeout_cb_t cb, void *data) -{ - struct timeout_item *item, *tmp; - - LASSERT(mutex_is_locked(&pinger_mutex)); - - list_for_each_entry(item, &timeout_list, ti_chain) - if (item->ti_event == event) - goto out; - - item = ptlrpc_new_timeout(time, event, cb, data); - if (item) { - list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { - if (tmp->ti_timeout < time) { - list_add(&item->ti_chain, &tmp->ti_chain); - goto out; - } - } - list_add(&item->ti_chain, &timeout_list); - } -out: - return item; -} - -/* Add a client_obd to the timeout event list, when timeout(@time) - * happens, the callback(@cb) will be called. 
- */ -int ptlrpc_add_timeout_client(int time, enum timeout_event event, - timeout_cb_t cb, void *data, - struct list_head *obd_list) -{ - struct timeout_item *ti; - - mutex_lock(&pinger_mutex); - ti = ptlrpc_pinger_register_timeout(time, event, cb, data); - if (!ti) { - mutex_unlock(&pinger_mutex); - return (-EINVAL); - } - list_add(obd_list, &ti->ti_obd_list); - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_add_timeout_client); - -int ptlrpc_del_timeout_client(struct list_head *obd_list, - enum timeout_event event) -{ - struct timeout_item *ti = NULL, *item; - - if (list_empty(obd_list)) - return 0; - mutex_lock(&pinger_mutex); - list_del_init(obd_list); - /** - * If there are no obd attached to the timeout event - * list, remove this timeout event from the pinger - */ - list_for_each_entry(item, &timeout_list, ti_chain) { - if (item->ti_event == event) { - ti = item; - break; - } - } - LASSERTF(ti != NULL, "ti is NULL !\n"); - if (list_empty(&ti->ti_obd_list)) { - list_del(&ti->ti_chain); - OBD_FREE_PTR(ti); - } - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_del_timeout_client); - -int ptlrpc_pinger_remove_timeouts(void) -{ - struct timeout_item *item, *tmp; - - mutex_lock(&pinger_mutex); - list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { - LASSERT(list_empty(&item->ti_obd_list)); - list_del(&item->ti_chain); - OBD_FREE_PTR(item); - } - mutex_unlock(&pinger_mutex); - return 0; -} - void ptlrpc_pinger_wake_up() { #ifdef ENABLE_PINGER - thread_add_flags(&pinger_thread, SVC_EVENT); - wake_up(&pinger_thread.t_ctl_waitq); + mod_delayed_work(pinger_wq, &ping_work, 0); #endif } @@ -600,12 +461,12 @@ int ping_evictor_wake(struct obd_export *exp) static int ping_evictor_main(void *arg) { - struct obd_device *obd; - struct obd_export *exp; - struct l_wait_info lwi = { 0 }; - time_t expire_time; - ENTRY; + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time64_t expire_time; + ENTRY; unshare_fs_struct(); CDEBUG(D_HA, "Starting Ping Evictor\n"); @@ -626,9 +487,9 @@ static int ping_evictor_main(void *arg) obd_evict_list); spin_unlock(&pet_lock); - expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; + expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT; - CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n", obd->obd_name, expire_time); /* Exports can't be deleted out of the list while we hold @@ -644,19 +505,19 @@ static int ping_evictor_main(void *arg) class_export_get(exp); spin_unlock(&obd->obd_dev_lock); LCONSOLE_WARN("%s: haven't heard from client %s" - " (at %s) in %ld seconds. I think" + " (at %s) in %lld seconds. I think" " it's dead, and I am evicting" - " it. exp %p, cur %ld expire %ld" - " last %ld\n", + " it. 
exp %p, cur %lld expire %lld" + " last %lld\n", obd->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), - (long)(cfs_time_current_sec() - - exp->exp_last_request_time), - exp, (long)cfs_time_current_sec(), - (long)expire_time, - (long)exp->exp_last_request_time); - CDEBUG(D_HA, "Last request was at %ld\n", + ktime_get_real_seconds() - + exp->exp_last_request_time, + exp, ktime_get_real_seconds(), + expire_time, + exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %lld\n", exp->exp_last_request_time); class_fail_export(exp); class_export_put(exp); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h index cfd1de5bb3d45..41b9a268d52a6 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpc_internal.h @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -69,7 +69,7 @@ int ptlrpcd_start(struct ptlrpcd_ctl *pc); /* client.c */ void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time); + timeout_t service_timeout); struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, enum ptlrpc_bulk_op_type type, unsigned portal, @@ -83,7 +83,7 @@ void ptlrpc_init_xid(void); void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, struct ptlrpc_request *req); int ptlrpc_expired_set(void *data); -int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +time64_t ptlrpc_set_next_timeout(struct ptlrpc_request_set *); void ptlrpc_resend_req(struct ptlrpc_request *request); void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req); void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); @@ -97,7 +97,8 @@ void ptlrpc_exit_portals(void); void ptlrpc_request_handle_notconn(struct ptlrpc_request *); void lustre_assert_wire_constants(void); int ptlrpc_import_in_recovery(struct obd_import *imp); -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt, + bool invalid); void ptlrpc_handle_failed_import(struct obd_import *imp); int ptlrpc_replay_next(struct obd_import *imp, int *inflight); void ptlrpc_initiate_recovery(struct obd_import *imp); @@ -105,15 +106,18 @@ void ptlrpc_initiate_recovery(struct obd_import *imp); int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); +int ptlrpc_sysfs_register_service(struct kset *parent, + struct ptlrpc_service *svc); +void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc); + +void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry, + struct ptlrpc_service *svc); #ifdef CONFIG_PROC_FS -void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, - struct ptlrpc_service *svc); void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, long q_usec, long work_usec); #else -#define ptlrpc_lprocfs_register_service(params...) do{}while(0) #define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) #define ptlrpc_lprocfs_rpc_sent(params...) 
do{}while(0) #define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0) diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c index 0532c4d22d8bd..b98d082660628 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/ptlrpcd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -212,7 +212,7 @@ void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) LASSERT(req->rq_phase == RQ_PHASE_NEW); req->rq_set = new; - req->rq_queued_time = cfs_time_current(); + req->rq_queued_time = ktime_get_seconds(); } spin_lock(&new->set_new_req_lock); @@ -476,7 +476,7 @@ static int ptlrpcd(void *arg) */ do { struct l_wait_info lwi; - int timeout; + time64_t timeout; timeout = ptlrpc_set_next_timeout(set); lwi = LWI_TIMEOUT(cfs_time_seconds(timeout), @@ -503,11 +503,11 @@ static int ptlrpcd(void *arg) */ } while (exit < 2); - /* - * Wait for inflight requests to drain. - */ + /* + * Wait for inflight requests to drain. + */ if (!list_empty(&set->set_requests)) - ptlrpc_set_wait(set); + ptlrpc_set_wait(&env, set); lu_context_fini(&env.le_ctx); lu_context_fini(env.le_ses); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c index aacb929beae23..c923ab9386901 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/recover.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -228,30 +228,22 @@ void ptlrpc_wake_delayed(struct obd_import *imp) void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) { - struct obd_import *imp = failed_req->rq_import; - ENTRY; + struct obd_import *imp = failed_req->rq_import; + int conn = lustre_msg_get_conn_cnt(failed_req->rq_reqmsg); + ENTRY; - CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - - if (ptlrpc_set_import_discon(imp, - lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { - if (!imp->imp_replayable) { - CDEBUG(D_HA, "import %s@%s for %s not replayable, " - "auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import(imp); - } - /* to control recovery via lctl {disable|enable}_recovery */ - if (imp->imp_deactive == 0) - ptlrpc_connect_import(imp); - } + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, conn, true)) { + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } - /* Wait for recovery to complete and resend. If evicted, then - this request will be errored out later.*/ + /* Wait for recovery to complete and resend. 
If evicted, then + this request will be errored out later.*/ spin_lock(&failed_req->rq_lock); if (!failed_req->rq_no_resend) failed_req->rq_resend = 1; @@ -261,7 +253,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) } /** - * Administratively active/deactive a client. + * Administratively active/deactive a client. * This should only be called by the ioctl interface, currently * - the lctl deactivate and activate commands * - echo 0/1 >> /proc/osc/XXX/active @@ -320,21 +312,21 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) atomic_read(&imp->imp_inval_count)) rc = -EINVAL; spin_unlock(&imp->imp_lock); - if (rc) - GOTO(out, rc); + if (rc) + GOTO(out, rc); - /* force import to be disconnected. */ - ptlrpc_set_import_discon(imp, 0); + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0, false); - if (new_uuid) { - struct obd_uuid uuid; + if (new_uuid) { + struct obd_uuid uuid; - /* intruct import to use new uuid */ - obd_str2uuid(&uuid, new_uuid); - rc = import_set_conn_priority(imp, &uuid); - if (rc) - GOTO(out, rc); - } + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } /* Check if reconnect is already in progress */ spin_lock(&imp->imp_lock); @@ -354,9 +346,9 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) if (!async) { struct l_wait_info lwi; - int secs = cfs_time_seconds(obd_timeout); + long secs = cfs_time_seconds(obd_timeout); - CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + CDEBUG(D_HA, "%s: recovery started, waiting %lu seconds\n", obd2cli_tgt(imp->imp_obd), secs); lwi = LWI_TIMEOUT(secs, NULL, NULL); @@ -377,9 +369,8 @@ int ptlrpc_import_in_recovery(struct obd_import *imp) int in_recovery = 1; spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL || - imp->imp_state == LUSTRE_IMP_CLOSED || - imp->imp_state == LUSTRE_IMP_DISCON || + if (imp->imp_state <= LUSTRE_IMP_DISCON || + imp->imp_state >= LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov) in_recovery = 0; spin_unlock(&imp->imp_lock); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c index 92d39ece51d16..78c07fcefec3a 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +43,10 @@ #include #include +#ifdef HAVE_LINUX_SELINUX_IS_ENABLED +#include +#endif + #include #include #include @@ -54,6 +58,10 @@ #include "ptlrpc_internal.h" +static int send_sepol; +module_param(send_sepol, int, 0644); +MODULE_PARM_DESC(send_sepol, "Client sends SELinux policy status"); + /*********************************************** * policy registers * ***********************************************/ @@ -402,11 +410,12 @@ static int import_sec_validate_get(struct obd_import *imp, } *sec = sptlrpc_import_sec_ref(imp); - /* Only output an error when the import is still active */ if (*sec == NULL) { - if (list_empty(&imp->imp_zombie_chain)) + /* Only output an error when the import is still active */ + if (!test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&imp->imp_zombie_work))) CERROR("import %p (%s) with no sec\n", - imp, ptlrpc_import_state_name(imp->imp_state)); + imp, ptlrpc_import_state_name(imp->imp_state)); return -EACCES; } @@ -709,12 +718,12 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) sptlrpc_sec_put(sec); if (cli_ctx_is_eternal(ctx)) - RETURN(0); + RETURN(0); if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { - LASSERT(ctx->cc_ops->refresh); - ctx->cc_ops->refresh(ctx); - } + if (ctx->cc_ops->refresh) + ctx->cc_ops->refresh(ctx); + } LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); LASSERT(ctx->cc_ops->validate); @@ -836,7 +845,30 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) RETURN(rc); } - goto again; + goto again; +} + +/* Bring ptlrpc_sec context up-to-date */ +int sptlrpc_export_update_ctx(struct obd_export *exp) +{ + struct obd_import *imp = exp ? exp->exp_imp_reverse : NULL; + struct ptlrpc_sec *sec = NULL; + struct ptlrpc_cli_ctx *ctx = NULL; + int rc = 0; + + if (imp) + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + } + + if (ctx) { + if (ctx->cc_ops->refresh) + rc = ctx->cc_ops->refresh(ctx); + sptlrpc_cli_ctx_put(ctx, 1); + } + return rc; } /** @@ -1726,6 +1758,7 @@ void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) req->rq_repmsg = NULL; EXIT; } +EXPORT_SYMBOL(sptlrpc_cli_free_repbuf); int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, struct ptlrpc_cli_ctx *ctx) @@ -1747,6 +1780,128 @@ int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, return policy->sp_sops->install_rctx(imp, ctx); } +/* Get SELinux policy info from userspace */ +static int sepol_helper(struct obd_import *imp) +{ + char mtime_str[21] = { 0 }, mode_str[2] = { 0 }; + char *argv[] = { + [0] = "/usr/sbin/l_getsepol", + [1] = "-o", + [2] = NULL, /* obd type */ + [3] = "-n", + [4] = NULL, /* obd name */ + [5] = "-t", + [6] = mtime_str, /* policy mtime */ + [7] = "-m", + [8] = mode_str, /* enforcing mode */ + [9] = NULL + }; + char *envp[] = { + [0] = "HOME=/", + [1] = "PATH=/sbin:/usr/sbin", + [2] = NULL + }; + signed short ret; + int rc = 0; + + if (imp == NULL || imp->imp_obd == NULL || + imp->imp_obd->obd_type == NULL) { + rc = -EINVAL; + } else { + argv[2] = imp->imp_obd->obd_type->typ_name; + argv[4] = imp->imp_obd->obd_name; + spin_lock(&imp->imp_sec->ps_lock); + if (ktime_to_ns(imp->imp_sec->ps_sepol_mtime) == 0 && + imp->imp_sec->ps_sepol[0] == '\0') { + /* ps_sepol has not been initialized */ + argv[5] = NULL; + argv[7] = NULL; + } else { + time64_t mtime_ms; + + mtime_ms = ktime_to_ms(imp->imp_sec->ps_sepol_mtime); + snprintf(mtime_str, sizeof(mtime_str), "%lld", + mtime_ms 
/ MSEC_PER_SEC); + mode_str[0] = imp->imp_sec->ps_sepol[0]; + } + spin_unlock(&imp->imp_sec->ps_lock); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + rc = ret>>8; + } + + return rc; +} + +static inline int sptlrpc_sepol_needs_check(struct ptlrpc_sec *imp_sec) +{ + ktime_t checknext; + + if (send_sepol == 0 || !selinux_is_enabled()) + return 0; + + if (send_sepol == -1) + /* send_sepol == -1 means fetch sepol status every time */ + return 1; + + spin_lock(&imp_sec->ps_lock); + checknext = imp_sec->ps_sepol_checknext; + spin_unlock(&imp_sec->ps_lock); + + /* next check is too far in time, please update */ + if (ktime_after(checknext, + ktime_add(ktime_get(), ktime_set(send_sepol, 0)))) + goto setnext; + + if (ktime_before(ktime_get(), checknext)) + /* too early to fetch sepol status */ + return 0; + +setnext: + /* define new sepol_checknext time */ + spin_lock(&imp_sec->ps_lock); + imp_sec->ps_sepol_checknext = ktime_add(ktime_get(), + ktime_set(send_sepol, 0)); + spin_unlock(&imp_sec->ps_lock); + + return 1; +} + +int sptlrpc_get_sepol(struct ptlrpc_request *req) +{ + struct ptlrpc_sec *imp_sec = req->rq_import->imp_sec; + int rc = 0; + + ENTRY; + + (req->rq_sepol)[0] = '\0'; + +#ifndef HAVE_SELINUX + if (unlikely(send_sepol != 0)) + CDEBUG(D_SEC, "Client cannot report SELinux status, " + "it was not built against libselinux.\n"); + RETURN(0); +#endif + + if (send_sepol == 0 || !selinux_is_enabled()) + RETURN(0); + + if (imp_sec == NULL) + RETURN(-EINVAL); + + /* Retrieve SELinux status info */ + if (sptlrpc_sepol_needs_check(imp_sec)) + rc = sepol_helper(req->rq_import); + if (likely(rc == 0)) { + spin_lock(&imp_sec->ps_lock); + memcpy(req->rq_sepol, imp_sec->ps_sepol, + sizeof(req->rq_sepol)); + spin_unlock(&imp_sec->ps_lock); + } + + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_get_sepol); + /**************************************** * server side security * ****************************************/ diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c index 42841f0c0aaf1..216c2f2a0820b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_bulk.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -36,7 +36,7 @@ #define DEBUG_SUBSYSTEM S_SEC -#include +#include #include #include @@ -114,7 +114,7 @@ static struct ptlrpc_enc_page_pool { unsigned long epp_st_missings; /* # of cache missing */ unsigned long epp_st_lowfree; /* lowest free pages reached */ unsigned int epp_st_max_wqlen; /* highest waitqueue length */ - cfs_time_t epp_st_max_wait; /* in jeffies */ + ktime_t epp_st_max_wait; /* in nanoseconds */ unsigned long epp_st_outofmem; /* # of out of mem requests */ /* * pointers to pools, may be vmalloc'd @@ -143,8 +143,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "total pages: %lu\n" "total free: %lu\n" "idle index: %lu/100\n" - "last shrink: %lds\n" - "last access: %lds\n" + "last shrink: %llds\n" + "last access: %llds\n" "max pages reached: %lu\n" "grows: %u\n" "grows failure: %u\n" @@ -153,7 +153,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "cache missing: %lu\n" "low free mark: %lu\n" "max waitqueue depth: %u\n" - "max wait time: %ld/%lu\n" + "max wait time ms: %lld\n" "out of mem: %lu\n", cfs_totalram_pages(), PAGES_PER_POOL, page_pools.epp_max_pages, @@ -161,8 +161,8 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_total_pages, page_pools.epp_free_pages, page_pools.epp_idle_idx, - (long)(ktime_get_seconds() - page_pools.epp_last_shrink), - (long)(ktime_get_seconds() - page_pools.epp_last_access), + ktime_get_seconds() - page_pools.epp_last_shrink, + ktime_get_seconds() - page_pools.epp_last_access, page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, @@ -171,8 +171,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_st_missings, page_pools.epp_st_lowfree, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, - msecs_to_jiffies(MSEC_PER_SEC), + ktime_to_ms(page_pools.epp_st_max_wait), page_pools.epp_st_outofmem); spin_unlock(&page_pools.epp_lock); @@ -234,7 +233,7 @@ static unsigned long enc_pools_shrink_count(struct shrinker *s, * if no pool access for a long time, we consider it's fully idle. * a little race here is fine. */ - if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > CACHE_QUIESCENT_PERIOD)) { spin_lock(&page_pools.epp_lock); page_pools.epp_idle_idx = IDLE_IDX_MAX; @@ -265,7 +264,7 @@ static unsigned long enc_pools_shrink_scan(struct shrinker *s, (long)sc->nr_to_scan, page_pools.epp_free_pages); page_pools.epp_st_shrinks++; - page_pools.epp_last_shrink = ktime_get_real_seconds(); + page_pools.epp_last_shrink = ktime_get_seconds(); } spin_unlock(&page_pools.epp_lock); @@ -273,7 +272,7 @@ static unsigned long enc_pools_shrink_scan(struct shrinker *s, * if no pool access for a long time, we consider it's fully idle. * a little race here is fine. 
*/ - if (unlikely(ktime_get_real_seconds() - page_pools.epp_last_access > + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > CACHE_QUIESCENT_PERIOD)) { spin_lock(&page_pools.epp_lock); page_pools.epp_idle_idx = IDLE_IDX_MAX; @@ -542,11 +541,11 @@ EXPORT_SYMBOL(pool_is_at_full_capacity); int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) { wait_queue_entry_t waitlink; - unsigned long this_idle = -1; - cfs_time_t tick = 0; - long now; - int p_idx, g_idx; - int i; + unsigned long this_idle = -1; + u64 tick_ns = 0; + time64_t now; + int p_idx, g_idx; + int i; LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); LASSERT(desc->bd_iov_count > 0); @@ -566,8 +565,8 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) page_pools.epp_st_access++; again: if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { - if (tick == 0) - tick = cfs_time_current(); + if (tick_ns == 0) + tick_ns = ktime_get_ns(); now = ktime_get_real_seconds(); @@ -625,12 +624,13 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) goto again; } - /* record max wait time */ - if (unlikely(tick != 0)) { - tick = cfs_time_current() - tick; - if (tick > page_pools.epp_st_max_wait) - page_pools.epp_st_max_wait = tick; - } + /* record max wait time */ + if (unlikely(tick_ns)) { + ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); + + if (ktime_after(tick, page_pools.epp_st_max_wait)) + page_pools.epp_st_max_wait = tick; + } /* proceed with rest of allocation */ page_pools.epp_free_pages -= desc->bd_iov_count; @@ -664,7 +664,7 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) this_idle) / (IDLE_IDX_WEIGHT + 1); - page_pools.epp_last_access = ktime_get_real_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); spin_unlock(&page_pools.epp_lock); return 0; @@ -789,8 +789,8 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_growing = 0; page_pools.epp_idle_idx = 0; - page_pools.epp_last_shrink = ktime_get_real_seconds(); - page_pools.epp_last_access = ktime_get_real_seconds(); + page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); spin_lock_init(&page_pools.epp_lock); page_pools.epp_total_pages = 0; @@ -804,7 +804,7 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_st_missings = 0; page_pools.epp_st_lowfree = 0; page_pools.epp_st_max_wqlen = 0; - page_pools.epp_st_max_wait = 0; + page_pools.epp_st_max_wait = ktime_set(0, 0); page_pools.epp_st_outofmem = 0; enc_pools_alloc(); @@ -838,13 +838,12 @@ void sptlrpc_enc_pool_fini(void) if (page_pools.epp_st_access > 0) { CDEBUG(D_SEC, - "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%lu, out of mem %lu\n", + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, page_pools.epp_st_shrinks, page_pools.epp_st_access, page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, - msecs_to_jiffies(MSEC_PER_SEC), + ktime_to_ms(page_pools.epp_st_max_wait), page_pools.epp_st_outofmem); } } @@ -917,7 +916,7 @@ EXPORT_SYMBOL(bulk_sec_desc_unpack); int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, void *buf, int buflen) { - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; int hashsize; unsigned int bufsize; int i, err; @@ -926,17 +925,17 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, 
LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); LASSERT(buflen >= 4); - hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); - if (IS_ERR(hdesc)) { + req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(req)) { CERROR("Unable to initialize checksum hash %s\n", cfs_crypto_hash_name(cfs_hash_alg_id[alg])); - return PTR_ERR(hdesc); + return PTR_ERR(req); } hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); for (i = 0; i < desc->bd_iov_count; i++) { - cfs_crypto_hash_update_page(hdesc, + cfs_crypto_hash_update_page(req, BD_GET_KIOV(desc, i).kiov_page, BD_GET_KIOV(desc, i).kiov_offset & ~PAGE_MASK, @@ -949,11 +948,11 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, bufsize = sizeof(hashbuf); LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", bufsize, hashsize); - err = cfs_crypto_hash_final(hdesc, hashbuf, &bufsize); + err = cfs_crypto_hash_final(req, hashbuf, &bufsize); memcpy(buf, hashbuf, buflen); } else { bufsize = buflen; - err = cfs_crypto_hash_final(hdesc, buf, &bufsize); + err = cfs_crypto_hash_final(req, buf, &bufsize); } return err; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c index 550abeafceea0..b661ff8696530 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_config.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include "ptlrpc_internal.h" diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c index 766b21d10c20c..dc9f38c7036ba 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_ctx.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
*/ #define DEBUG_SUBSYSTEM S_FILTER diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c index f8ec60b1adb01..042a632390cfe 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_gc.c @@ -36,7 +36,7 @@ #define DEBUG_SUBSYSTEM S_SEC -#include +#include #include #include @@ -48,7 +48,6 @@ #define SEC_GC_INTERVAL (30 * 60) - static struct mutex sec_gc_mutex; static spinlock_t sec_gc_list_lock; static struct list_head sec_gc_list; @@ -56,10 +55,8 @@ static struct list_head sec_gc_list; static spinlock_t sec_gc_ctx_list_lock; static struct list_head sec_gc_ctx_list; -static struct ptlrpc_thread sec_gc_thread; static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); - void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) { LASSERT(sec->ps_policy->sp_cops->gc_ctx); @@ -98,6 +95,9 @@ void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); } +static void sec_gc_main(struct work_struct *ws); +static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); + void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) { LASSERT(list_empty(&ctx->cc_gc_chain)); @@ -108,8 +108,7 @@ void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); spin_unlock(&sec_gc_ctx_list_lock); - thread_add_flags(&sec_gc_thread, SVC_SIGNAL); - wake_up(&sec_gc_thread.t_ctl_waitq); + mod_delayed_work(system_wq, &sec_gc_work, 0); } EXPORT_SYMBOL(sptlrpc_gc_add_ctx); @@ -156,68 +155,41 @@ static void sec_do_gc(struct ptlrpc_sec *sec) sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; } -static int sec_gc_main(void *arg) +static void sec_gc_main(struct work_struct *ws) { - struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; - struct l_wait_info lwi; - - unshare_fs_struct(); - - /* Record that the thread is running */ - thread_set_flags(thread, SVC_RUNNING); - wake_up(&thread->t_ctl_waitq); + struct ptlrpc_sec *sec; - while (1) { - struct ptlrpc_sec *sec; - - thread_clear_flags(thread, SVC_SIGNAL); - sec_process_ctx_list(); + sec_process_ctx_list(); again: - /* go through sec list do gc. - * FIXME here we iterate through the whole list each time which - * is not optimal. we perhaps want to use balanced binary tree - * to trace each sec as order of expiry time. - * another issue here is we wakeup as fixed interval instead of - * according to each sec's expiry time */ - mutex_lock(&sec_gc_mutex); - list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { - /* if someone is waiting to be deleted, let it - * proceed as soon as possible. */ - if (atomic_read(&sec_gc_wait_del)) { - CDEBUG(D_SEC, "deletion pending, start over\n"); - mutex_unlock(&sec_gc_mutex); - goto again; - } - - sec_do_gc(sec); + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time + */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. 
+ */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; } - mutex_unlock(&sec_gc_mutex); - - /* check ctx list again before sleep */ - sec_process_ctx_list(); - - lwi = LWI_TIMEOUT(msecs_to_jiffies(SEC_GC_INTERVAL * - MSEC_PER_SEC), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopping(thread) || - thread_is_signal(thread), - &lwi); - if (thread_test_and_clear_flags(thread, SVC_STOPPING)) - break; + sec_do_gc(sec); } + mutex_unlock(&sec_gc_mutex); - thread_set_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - return 0; + /* check ctx list again before sleep */ + sec_process_ctx_list(); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); } int sptlrpc_gc_init(void) { - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - mutex_init(&sec_gc_mutex); spin_lock_init(&sec_gc_list_lock); spin_lock_init(&sec_gc_ctx_list_lock); @@ -225,28 +197,11 @@ int sptlrpc_gc_init(void) INIT_LIST_HEAD(&sec_gc_list); INIT_LIST_HEAD(&sec_gc_ctx_list); - /* initialize thread control */ - memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); - init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); - - task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); - if (IS_ERR(task)) { - CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); - return PTR_ERR(task); - } - - l_wait_event(sec_gc_thread.t_ctl_waitq, - thread_is_running(&sec_gc_thread), &lwi); + schedule_delayed_work(&sec_gc_work, cfs_time_seconds(SEC_GC_INTERVAL)); return 0; } void sptlrpc_gc_fini(void) { - struct l_wait_info lwi = { 0 }; - - thread_set_flags(&sec_gc_thread, SVC_STOPPING); - wake_up(&sec_gc_thread.t_ctl_waitq); - - l_wait_event(sec_gc_thread.t_ctl_waitq, - thread_is_stopped(&sec_gc_thread), &lwi); + cancel_delayed_work_sync(&sec_gc_work); } diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c index 96acb183270e4..4f8efe44aa678 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_lproc.c @@ -110,7 +110,8 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) out: return 0; } -LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_info_lprocfs); static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) { @@ -136,11 +137,81 @@ static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) out: return 0; } -LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +LDEBUGFS_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +static ssize_t +ldebugfs_sptlrpc_sepol_seq_write(struct file *file, const char __user *buffer, + size_t count, void *data) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct obd_import *imp = cli->cl_import; + struct sepol_downcall_data *param; + int size = sizeof(*param); + int rc = 0; + + if (count < size) { + CERROR("%s: invalid data count = %lu, size = %d\n", + dev->obd_name, (unsigned long) count, size); + return -EINVAL; + } + + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + CERROR("%s: bad sepol data\n", dev->obd_name); + GOTO(out, rc = -EFAULT); + } + + if (param->sdd_magic != SEPOL_DOWNCALL_MAGIC) { + CERROR("%s: sepol downcall bad params\n", + dev->obd_name); + GOTO(out, rc = -EINVAL); + } + + if (param->sdd_sepol_len == 0 || + param->sdd_sepol_len >= 
sizeof(imp->imp_sec->ps_sepol)) { + CERROR("%s: invalid sepol data returned\n", + dev->obd_name); + GOTO(out, rc = -EINVAL); + } + rc = param->sdd_sepol_len; /* save sdd_sepol_len */ + OBD_FREE(param, size); + size = offsetof(struct sepol_downcall_data, + sdd_sepol[rc]); + + /* alloc again with real size */ + rc = 0; + OBD_ALLOC(param, size); + if (param == NULL) + return -ENOMEM; + + if (copy_from_user(param, buffer, size)) { + CERROR("%s: bad sepol data\n", dev->obd_name); + GOTO(out, rc = -EFAULT); + } + + spin_lock(&imp->imp_sec->ps_lock); + snprintf(imp->imp_sec->ps_sepol, param->sdd_sepol_len + 1, "%s", + param->sdd_sepol); + imp->imp_sec->ps_sepol_mtime = ktime_set(param->sdd_sepol_mtime, 0); + spin_unlock(&imp->imp_sec->ps_lock); + +out: + if (param != NULL) + OBD_FREE(param, size); + + return rc ? rc : count; +} +LDEBUGFS_FOPS_WR_ONLY(srpc, sptlrpc_sepol); int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) { - int rc; + int rc; if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && @@ -152,23 +223,31 @@ int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) return -EINVAL; } - rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, - &sptlrpc_info_lprocfs_fops, dev); - if (rc) { - CERROR("create proc entry srpc_info for %s: %d\n", - dev->obd_name, rc); - return rc; - } + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } - rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, - &sptlrpc_ctxs_lprocfs_fops, dev); - if (rc) { - CERROR("create proc entry srpc_contexts for %s: %d\n", - dev->obd_name, rc); - return rc; - } + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_contexts", + 0444, &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } - return 0; + rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "srpc_sepol", + 0200, &srpc_sptlrpc_sepol_fops, dev); + if (rc) { + CERROR("create proc entry srpc_sepol for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; } EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c index 52af519a291d7..a17a4e182233e 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_null.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2014, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -63,14 +63,7 @@ void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) static inline enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) { - return (msg->lm_secflvr >> 24) & 0xFF; -} - -static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) -{ - /* should never reach here */ - LBUG(); - return 0; + return (msg->lm_secflvr >> 24) & 0xFF; } static @@ -370,11 +363,9 @@ int null_authorize(struct ptlrpc_request *req) } static struct ptlrpc_ctx_ops null_ctx_ops = { - .refresh = null_ctx_refresh, - .sign = null_ctx_sign, - .verify = null_ctx_verify, + .sign = null_ctx_sign, + .verify = null_ctx_verify, }; - static struct ptlrpc_sec_cops null_sec_cops = { .create_sec = null_create_sec, .destroy_sec = null_destroy_sec, diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c index a0f192cecf633..dea70d160b54e 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/sec_plain.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -215,12 +215,12 @@ int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) static int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_repdata; - struct plain_header *phdr; - __u32 cksum; - int swabbed; - ENTRY; + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + bool swabbed; + ENTRY; if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); RETURN(-EPROTO); @@ -723,16 +723,15 @@ static struct ptlrpc_svc_ctx plain_svc_ctx = { .sc_policy = &plain_policy, }; -static -int plain_accept(struct ptlrpc_request *req) +static int plain_accept(struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_reqbuf; - struct plain_header *phdr; - int swabbed; - ENTRY; + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + bool swabbed; - LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == - SPTLRPC_POLICY_PLAIN); + ENTRY; + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c index 6e3172cdeb5a7..6373c36865f3d 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/service.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/service.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2016, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -31,13 +31,15 @@ */ #define DEBUG_SUBSYSTEM S_RPC + #include #include #include #include #include -#include +#include #include "ptlrpc_internal.h" +#include /* The following are visible and mutable through /sys/module/ptlrpc */ int test_req_buffer_pressure = 0; @@ -139,7 +141,9 @@ ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) for (i = 0; i < svc->srv_nbuf_per_group; i++) { /* NB: another thread might have recycled enough rqbds, we * need to make sure it wouldn't over-allocate, see LU-1212. */ - if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max)) break; rqbd = ptlrpc_alloc_rqbd(svcpt); @@ -479,7 +483,7 @@ static void ptlrpc_at_timer(cfs_timer_cb_arg_t data) svcpt = cfs_from_timer(svcpt, data, scp_at_timer); svcpt->scp_at_check = 1; - svcpt->scp_at_checktime = cfs_time_current(); + svcpt->scp_at_checktime = ktime_get(); wake_up(&svcpt->scp_waitq); } @@ -602,6 +606,7 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, /* rqbd and incoming request queue */ spin_lock_init(&svcpt->scp_lock); + mutex_init(&svcpt->scp_mutex); INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); INIT_LIST_HEAD(&svcpt->scp_req_incoming); @@ -683,7 +688,8 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc, */ struct ptlrpc_service * ptlrpc_register_service(struct ptlrpc_service_conf *conf, - struct proc_dir_entry *proc_entry) + struct kset *parent, + struct dentry *debugfs_entry) { struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; struct ptlrpc_service *service; @@ -705,7 +711,13 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, if (cptable == NULL) cptable = cfs_cpt_table; - if (!conf->psc_thr.tc_cpu_affinity) { + if (conf->psc_thr.tc_cpu_bind > 1) { + CERROR("%s: Invalid cpu bind value %d, only 1 or 0 allowed\n", + conf->psc_name, conf->psc_thr.tc_cpu_bind); + RETURN(ERR_PTR(-EINVAL)); + } + + if (!cconf->cc_affinity) { ncpts = 1; } else { ncpts = cfs_cpt_number(cptable); @@ -744,6 +756,7 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, service->srv_cptable = cptable; service->srv_cpts = cpts; service->srv_ncpts = ncpts; + service->srv_cpt_bind = conf->psc_thr.tc_cpu_bind; service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) @@ -758,6 +771,9 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, /* buffer configuration */ service->srv_nbuf_per_group = test_req_buffer_pressure ? 1 : conf->psc_buf.bc_nbufs; + /* do not limit max number of rqbds by default */ + service->srv_nrqbds_max = 0; + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD; service->srv_buf_size = conf->psc_buf.bc_buf_size; @@ -776,7 +792,7 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, service->srv_ops = conf->psc_ops; for (i = 0; i < ncpts; i++) { - if (!conf->psc_thr.tc_cpu_affinity) + if (!cconf->cc_affinity) cpt = CFS_CPT_ANY; else cpt = cpts != NULL ? 
cpts[i] : i; @@ -800,8 +816,14 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, list_add(&service->srv_list, &ptlrpc_all_services); mutex_unlock(&ptlrpc_all_services_mutex); - if (proc_entry != NULL) - ptlrpc_lprocfs_register_service(proc_entry, service); + if (parent) { + rc = ptlrpc_sysfs_register_service(parent, service); + if (rc) + GOTO(failed, rc); + } + + if (debugfs_entry != NULL) + ptlrpc_ldebugfs_register_service(debugfs_entry, service); rc = ptlrpc_service_nrs_setup(service); if (rc != 0) @@ -939,8 +961,10 @@ void ptlrpc_server_drop_request(struct ptlrpc_request *req) */ LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0); if (svcpt->scp_nrqbds_posted >= - svc->srv_nbuf_per_group && - !test_req_buffer_pressure) { + svc->srv_nbuf_per_group || + (svc->srv_nrqbds_max != 0 && + svcpt->scp_nrqbds_total > svc->srv_nrqbds_max) || + test_req_buffer_pressure) { /* like in ptlrpc_free_rqbd() */ svcpt->scp_nrqbds_total--; OBD_FREE_LARGE(rqbd->rqbd_buffer, @@ -977,18 +1001,18 @@ void ptlrpc_request_change_export(struct ptlrpc_request *req, if (req->rq_export != NULL) { LASSERT(!list_empty(&req->rq_exp_list)); /* remove rq_exp_list from last export */ - spin_lock_bh(&req->rq_export->exp_rpc_lock); + spin_lock(&req->rq_export->exp_rpc_lock); list_del_init(&req->rq_exp_list); - spin_unlock_bh(&req->rq_export->exp_rpc_lock); + spin_unlock(&req->rq_export->exp_rpc_lock); /* export has one reference already, so it`s safe to * add req to export queue here and get another * reference for request later */ - spin_lock_bh(&export->exp_rpc_lock); + spin_lock(&export->exp_rpc_lock); if (req->rq_ops != NULL) /* hp request */ list_add(&req->rq_exp_list, &export->exp_hp_rpcs); else list_add(&req->rq_exp_list, &export->exp_reg_rpcs); - spin_unlock_bh(&export->exp_rpc_lock); + spin_unlock(&export->exp_rpc_lock); class_export_rpc_dec(req->rq_export); class_export_put(req->rq_export); @@ -1041,10 +1065,10 @@ static void ptlrpc_server_finish_active_request( * This function is only called when some export receives a message (i.e., * the network is up.) */ -void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay) { - struct obd_export *oldest_exp; - time_t oldest_time, new_time; + struct obd_export *oldest_exp; + time64_t oldest_time, new_time; ENTRY; @@ -1057,7 +1081,7 @@ void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) will make it to the top of the list. */ /* Do not pay attention on 1sec or smaller renewals. */ - new_time = cfs_time_current_sec() + extra_delay; + new_time = ktime_get_real_seconds() + extra_delay; if (exp->exp_last_request_time + 1 /*second */ >= new_time) RETURN_EXIT; @@ -1088,33 +1112,35 @@ void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) return; } - /* Note - racing to start/reset the obd_eviction timer is safe */ - if (exp->exp_obd->obd_eviction_timer == 0) { - /* Check if the oldest entry is expired. */ - if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + - extra_delay)) { - /* We need a second timer, in case the net was down and - * it just came back. Since the pinger may skip every - * other PING_INTERVAL (see note in ptlrpc_pinger_main), - * we better wait for 3. 
*/ - exp->exp_obd->obd_eviction_timer = - cfs_time_current_sec() + 3 * PING_INTERVAL; - CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", - exp->exp_obd->obd_name, - obd_export_nid2str(oldest_exp), oldest_time); - } - } else { - if (cfs_time_current_sec() > - (exp->exp_obd->obd_eviction_timer + extra_delay)) { - /* The evictor won't evict anyone who we've heard from - * recently, so we don't have to check before we start - * it. */ - if (!ping_evictor_wake(exp)) - exp->exp_obd->obd_eviction_timer = 0; - } - } + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (ktime_get_real_seconds() > + oldest_time + PING_EVICT_TIMEOUT + extra_delay) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. + */ + exp->exp_obd->obd_eviction_timer = + ktime_get_real_seconds() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (ktime_get_real_seconds() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. + */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } - EXIT; + EXIT; } /** @@ -1166,7 +1192,7 @@ static int ptlrpc_check_req(struct ptlrpc_request *req) static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) { struct ptlrpc_at_array *array = &svcpt->scp_at_array; - __s32 next; + time64_t next; if (array->paa_count == 0) { del_timer(&svcpt->scp_at_timer); @@ -1174,13 +1200,14 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) } /* Set timer for closest deadline */ - next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - - at_early_margin); + next = array->paa_deadline - ktime_get_real_seconds() - + at_early_margin; if (next <= 0) { ptlrpc_at_timer(cfs_timer_cb_arg(svcpt, scp_at_timer)); } else { - mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); - CDEBUG(D_INFO, "armed %s at %+ds\n", + mod_timer(&svcpt->scp_at_timer, + jiffies + nsecs_to_jiffies(next * NSEC_PER_SEC)); + CDEBUG(D_INFO, "armed %s at %+llds\n", svcpt->scp_service->srv_name, next); } } @@ -1432,16 +1459,16 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) __u32 index, count; time64_t deadline; time64_t now = ktime_get_real_seconds(); - cfs_duration_t delay; - int first, counter = 0; - ENTRY; + s64 delay; + int first, counter = 0; + ENTRY; spin_lock(&svcpt->scp_at_lock); if (svcpt->scp_at_check == 0) { spin_unlock(&svcpt->scp_at_lock); RETURN(0); } - delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); + delay = ktime_ms_delta(ktime_get(), svcpt->scp_at_checktime); svcpt->scp_at_check = 0; if (array->paa_count == 0) { @@ -1477,14 +1504,18 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) break; } - ptlrpc_at_remove_timed(rq); /** * ptlrpc_server_drop_request() may drop * refcount to 0 already. 
Let's check this and * don't add entry to work_list */ - if (likely(atomic_inc_not_zero(&rq->rq_refcount))) + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) { + ptlrpc_at_remove_timed(rq); list_add(&rq->rq_timed_list, &work_list); + } else { + ptlrpc_at_remove_timed(rq); + } + counter++; } @@ -1505,7 +1536,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) LCONSOLE_WARN("%s: This server is not able to keep up with " "request traffic (cpu-bound).\n", svcpt->scp_service->srv_name); - CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n", + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%lld\n", counter, svcpt->scp_nreqs_incoming, svcpt->scp_nreqs_active, at_get(&svcpt->scp_at_estimate), delay); @@ -1529,18 +1560,14 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) /* Check if we are already handling earlier incarnation of this request. * Called under &req->rq_export->exp_rpc_lock locked */ -static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +static struct ptlrpc_request* +ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) { struct ptlrpc_request *tmp = NULL; if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) || (atomic_read(&req->rq_export->exp_rpc_count) == 0)) - return 0; - - /* bulk request are aborted upon reconnect, don't try to - * find a match */ - if (req->rq_bulk_write || req->rq_bulk_read) - return 0; + return NULL; /* This list should not be longer than max_requests in * flights on the client, so it is not all that long. @@ -1558,12 +1585,12 @@ static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) if (tmp->rq_xid == req->rq_xid) goto found; } - return 0; + return NULL; found: DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); DEBUG_REQ(D_HA, tmp, "Request being processed"); - return -EBUSY; + return tmp; } /** @@ -1617,9 +1644,9 @@ static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) if (req->rq_ops && req->rq_ops->hpreq_fini) req->rq_ops->hpreq_fini(req); - spin_lock_bh(&req->rq_export->exp_rpc_lock); + spin_lock(&req->rq_export->exp_rpc_lock); list_del_init(&req->rq_exp_list); - spin_unlock_bh(&req->rq_export->exp_rpc_lock); + spin_unlock(&req->rq_export->exp_rpc_lock); } EXIT; } @@ -1653,6 +1680,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, { int rc; bool hp; + struct ptlrpc_request *orig; ENTRY; rc = ptlrpc_server_hpreq_init(svcpt, req); @@ -1662,18 +1690,43 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, hp = rc > 0; ptlrpc_nrs_req_initialize(svcpt, req, hp); - if (req->rq_export != NULL) { + while (req->rq_export != NULL) { struct obd_export *exp = req->rq_export; /* do search for duplicated xid and the adding to the list * atomically */ spin_lock_bh(&exp->exp_rpc_lock); - rc = ptlrpc_server_check_resend_in_progress(req); - if (rc < 0) { + orig = ptlrpc_server_check_resend_in_progress(req); + if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) { + spin_unlock_bh(&exp->exp_rpc_lock); + + OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + msleep(4 * MSEC_PER_SEC); + continue; + } + if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) { + bool linked; + spin_unlock_bh(&exp->exp_rpc_lock); + /* + * When the client resend request and the server has + * the previous copy of it, we need to update deadlines, + * to be sure that the client and the server have equal + * request deadlines. 
+ */ + + spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + linked = orig->rq_at_linked; + if (likely(linked)) + ptlrpc_at_remove_timed(orig); + spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + orig->rq_deadline = req->rq_deadline; + if (likely(linked)) + ptlrpc_at_add_timed(orig); + ptlrpc_server_drop_request(orig); ptlrpc_nrs_req_finalize(req); - RETURN(rc); + RETURN(-EBUSY); } if (hp || req->rq_ops != NULL) @@ -1681,6 +1734,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, else list_add(&req->rq_exp_list, &exp->exp_reg_rpcs); spin_unlock_bh(&exp->exp_rpc_lock); + break; } /* the current thread is not the processing thread for this request @@ -2064,7 +2118,8 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, if (unlikely(ptlrpc_check_req(request))) goto put_conn; ptlrpc_update_export_timer(request->rq_export, - timediff_usecs >> 19); + div_u64(timediff_usecs, + USEC_PER_SEC / 2)); } /* Discard requests queued for longer than the deadline. @@ -2151,7 +2206,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, DEBUG_REQ(D_ADAPTTO, request, "sent %d early replies before finishing in %llds", request->rq_early_count, - arrived_usecs / USEC_PER_SEC); + div_u64(arrived_usecs, USEC_PER_SEC)); } ptlrpc_server_finish_active_request(svcpt, request); @@ -2239,7 +2294,7 @@ ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) while (nlocks-- > 0) { lock = ack_locks[nlocks]; - ldlm_lock_downgrade(lock, LCK_COS); + ldlm_lock_mode_downgrade(lock, LCK_COS); LDLM_LOCK_PUT(lock); } RETURN(0); @@ -2453,40 +2508,39 @@ static int ptlrpc_main(void *arg) thread->t_pid = current_pid(); unshare_fs_struct(); - /* NB: we will call cfs_cpt_bind() for all threads, because we - * might want to run lustre server only on a subset of system CPUs, - * in that case ->scp_cpt is CFS_CPT_ANY */ - rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); - if (rc != 0) { - CWARN("%s: failed to bind %s on CPT %d\n", - svc->srv_name, thread->t_name, svcpt->scp_cpt); + if (svc->srv_cpt_bind) { + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } } ginfo = groups_alloc(0); - if (!ginfo) { - rc = -ENOMEM; - goto out; - } + if (!ginfo) + GOTO(out, rc = -ENOMEM); set_current_groups(ginfo); put_group_info(ginfo); if (svc->srv_ops.so_thr_init != NULL) { rc = svc->srv_ops.so_thr_init(thread); - if (rc) - goto out; - } - OBD_ALLOC_PTR(env); - if (env == NULL) { - rc = -ENOMEM; - goto out_srv_fini; - } + if (rc) + GOTO(out, rc); + } - rc = lu_context_init(&env->le_ctx, - svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); - if (rc) - goto out_srv_fini; + OBD_ALLOC_PTR(env); + if (env == NULL) + GOTO(out_srv_fini, rc = -ENOMEM); + rc = lu_env_add(env); + if (rc) + GOTO(out_env, rc); + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + GOTO(out_env_remove, rc); thread->t_env = env; env->le_ctx.lc_thread = thread; @@ -2499,15 +2553,13 @@ static int ptlrpc_main(void *arg) CERROR("Failed to post rqbd for %s on CPT %d: %d\n", svc->srv_name, svcpt->scp_cpt, rc); - goto out_srv_fini; + GOTO(out_ctx_fini, rc); } /* Alloc reply state structure for this one */ OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); - if (!rs) { - rc = -ENOMEM; - goto out_srv_fini; - } + if (!rs) + GOTO(out_ctx_fini, rc = -ENOMEM); spin_lock(&svcpt->scp_lock); @@ -2553,6 +2605,9 @@ static int ptlrpc_main(void *arg) /* reset le_ses to initial state */ env->le_ses = NULL; 
+ /* Refill the context before execution to make sure + * all thread keys are allocated */ + lu_env_refill(env); /* Process all incoming reqs before handling any */ if (ptlrpc_server_request_incoming(svcpt)) { lu_context_enter(&env->le_ctx); @@ -2588,17 +2643,18 @@ static int ptlrpc_main(void *arg) lc_watchdog_delete(thread->t_watchdog); thread->t_watchdog = NULL; +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env_remove: + lu_env_remove(env); +out_env: + OBD_FREE_PTR(env); out_srv_fini: /* * deconstruct service specific state created by ptlrpc_start_thread() */ if (svc->srv_ops.so_thr_done != NULL) svc->srv_ops.so_thr_done(thread); - - if (env != NULL) { - lu_context_fini(&env->le_ctx); - OBD_FREE_PTR(env); - } out: CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", thread, thread->t_pid, thread->t_id, rc); @@ -2644,8 +2700,13 @@ static int ptlrpc_hr_main(void *arg) struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; struct list_head replies; + struct lu_env *env; int rc; + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&replies); unshare_fs_struct(); @@ -2659,6 +2720,15 @@ static int ptlrpc_hr_main(void *arg) threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); } + rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD | + LCT_REMEMBER | LCT_NOREF); + if (rc) + GOTO(out_env, rc); + + rc = lu_env_add(env); + if (rc) + GOTO(out_ctx_fini, rc); + atomic_inc(&hrp->hrp_nstarted); wake_up(&ptlrpc_hr.hr_waitq); @@ -2672,13 +2742,22 @@ static int ptlrpc_hr_main(void *arg) struct ptlrpc_reply_state, rs_list); list_del_init(&rs->rs_list); + /* refill keys if needed */ + lu_env_refill(env); + lu_context_enter(&env->le_ctx); ptlrpc_handle_rs(rs); + lu_context_exit(&env->le_ctx); } } atomic_inc(&hrp->hrp_nstopped); wake_up(&ptlrpc_hr.hr_waitq); + lu_env_remove(env); +out_ctx_fini: + lu_context_fini(&env->le_ctx); +out_env: + OBD_FREE_PTR(env); return 0; } @@ -3243,6 +3322,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) ptlrpc_service_nrs_cleanup(service); ptlrpc_lprocfs_unregister_service(service); + ptlrpc_sysfs_unregister_service(service); ptlrpc_service_free(service); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c index 3a9daf899c26e..7f9fb09ee4ffd 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wirehdr.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2014, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,5 +40,7 @@ #include #include #include -#include #include +#include +#include + diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c index 3fdaf4e78ff65..78e5b21335b69 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -23,7 +23,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,15 +40,16 @@ #include #include #include -#include #include +#include +#include + + void lustre_assert_wire_constants(void) { - /* Wire protocol assertions generated by 'wirecheck' - * (make -C lustre/utils newwiretest) - * running on Linux centss05 2.6.32.431.29.2.el6_lustre #1 SMP Tue Sep 23 16:06:38 CDT 2014 x - * with gcc version 4.4.7 20120313 (Red Hat 4.4.7-4) (GCC) */ - + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + */ /* Constants... */ LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", @@ -174,7 +175,9 @@ void lustre_assert_wire_constants(void) (long long)MDS_HSM_CT_UNREGISTER); LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", (long long)MDS_SWAP_LAYOUTS); - LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", + LASSERTF(MDS_RMFID == 62, "found %lld\n", + (long long)MDS_RMFID); + LASSERTF(MDS_LAST_OPC == 63, "found %lld\n", (long long)MDS_LAST_OPC); LASSERTF(REINT_SETATTR == 1, "found %lld\n", (long long)REINT_SETATTR); @@ -194,7 +197,7 @@ void lustre_assert_wire_constants(void) (long long)REINT_RMENTRY); LASSERTF(REINT_MIGRATE == 9, "found %lld\n", (long long)REINT_MIGRATE); - LASSERTF(REINT_MAX == 10, "found %lld\n", + LASSERTF(REINT_MAX == 11, "found %lld\n", (long long)REINT_MAX); LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)DISP_IT_EXECD); @@ -252,9 +255,14 @@ void lustre_assert_wire_constants(void) (long long)MDS_ATTR_FROM_OPEN); LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", (long long)MDS_ATTR_BLOCKS); - LASSERTF(MDS_ATTR_PROJID == 0x0000000000010000ULL, "found 0x%.16llxULL\n", (long long)MDS_ATTR_PROJID); + LASSERTF(MDS_ATTR_LSIZE == 0x0000000000020000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LSIZE); + LASSERTF(MDS_ATTR_LBLOCKS == 0x0000000000040000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_LBLOCKS); + LASSERTF(MDS_ATTR_OVERRIDE == 0x0000000002000000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_OVERRIDE); LASSERTF(FLD_QUERY == 900, "found %lld\n", (long long)FLD_QUERY); LASSERTF(FLD_READ == 901, "found %lld\n", @@ -339,10 +347,6 @@ void lustre_assert_wire_constants(void) CLASSERT(LQUOTA_RES_DT == 2); LASSERTF(OBD_PING == 400, "found %lld\n", (long long)OBD_PING); - LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", - (long long)OBD_LOG_CANCEL); - LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", - (long long)OBD_QC_CALLBACK); LASSERTF(OBD_IDX_READ == 403, "found %lld\n", (long long)OBD_IDX_READ); LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", @@ -365,6 +369,8 @@ void lustre_assert_wire_constants(void) (long long)MGS_TARGET_DEL); LASSERTF(MGS_SET_INFO == 255, "found %lld\n", (long long)MGS_SET_INFO); + LASSERTF(MGS_CONFIG_READ == 256, "found %lld\n", + (long long)MGS_CONFIG_READ); LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", (long long)MGS_LAST_OPC); LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", @@ -500,6 +506,30 @@ void lustre_assert_wire_constants(void) (long long)OUT_PUNCH); LASSERTF(OUT_READ == 15, "found %lld\n", (long long)OUT_READ); + LASSERTF(OUT_NOOP == 16, "found %lld\n", + (long long)OUT_NOOP); + LASSERTF(OUT_XATTR_LIST == 17, "found %lld\n", + (long long)OUT_XATTR_LIST); + + /* Checks for struct lustre_som_attrs */ + LASSERTF((int)sizeof(struct lustre_som_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_som_attrs)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_valid)); + 
LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_valid)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_reserved) == 2, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_reserved)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved) == 6, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_reserved)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_size) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_size)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_size)); + LASSERTF((int)offsetof(struct lustre_som_attrs, lsa_blocks) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_som_attrs, lsa_blocks)); + LASSERTF((int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_som_attrs *)0)->lsa_blocks)); /* Checks for struct hsm_attrs */ LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", @@ -656,6 +686,78 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", (long long)(int)sizeof(union lu_page)); + /* Checks for struct lu_ladvise */ + LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_ladvise)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_advice)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value1)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value2)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_start)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_end)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value3)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); + LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_ladvise, lla_value4)); + LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long 
long)LU_LADVISE_WILLREAD); + LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", + (long long)LU_LADVISE_DONTNEED); + + /* Checks for struct ladvise_hdr */ + LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", + (long long)(int)sizeof(struct ladvise_hdr)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_count)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_value3)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); + LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", + (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); + LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + LASSERTF(LF_ASYNC == 1, "found %lld\n", + (long long)LF_ASYNC); + LASSERTF(LADVISE_MAGIC == 450829536, "found %lld\n", + (long long)LADVISE_MAGIC); + /* Checks for struct lustre_handle */ LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", (long long)(int)sizeof(struct lustre_handle)); @@ -703,10 +805,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); - LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2); - LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2_SWABBED); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0bd00bd3UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xd30bd00bUL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MSG_MAGIC_V2_SWABBED); /* Checks for struct ptlrpc_body */ LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", @@ -921,42 +1023,30 @@ void lustre_assert_wire_constants(void) (long long)DLM_REPLY_REC_OFF); LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", (long 
long)MSG_PTLRPC_HEADER_OFF); - LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", - PTLRPC_MSG_VERSION); - LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", - LUSTRE_VERSION_MASK); - LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", - LUSTRE_OBD_VERSION); - LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", - LUSTRE_MDS_VERSION); - LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", - LUSTRE_OST_VERSION); - LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", - LUSTRE_DLM_VERSION); - LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", - LUSTRE_LOG_VERSION); - LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", - LUSTRE_MGS_VERSION); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003UL, "found 0x%.8xUL\n", + (unsigned)PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000UL, "found 0x%.8xUL\n", + (unsigned)LUSTRE_MGS_VERSION); LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", (long long)MSGHDR_AT_SUPPORT); LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", (long long)MSGHDR_CKSUM_INCOMPAT18); - LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", - (unsigned)MSG_OP_FLAG_MASK); - LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", - (long long)MSG_OP_FLAG_SHIFT); - LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", - (unsigned)MSG_GEN_FLAG_MASK); - LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)MSG_LAST_REPLAY); LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)MSG_RESENT); LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)MSG_REPLAY); - LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)MSG_DELAY_REPLAY); - LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)MSG_VERSION_REPLAY); LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", (unsigned)MSG_REQ_REPLAY_DONE); LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", @@ -971,8 +1061,6 @@ void lustre_assert_wire_constants(void) (unsigned)MSG_CONNECT_LIBCLIENT); LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", (unsigned)MSG_CONNECT_INITIAL); - LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_ASYNC); LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", (unsigned)MSG_CONNECT_NEXT_VER); LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", @@ -1229,8 +1317,8 @@ void lustre_assert_wire_constants(void) OBD_CONNECT_DIR_STRIPE); LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", OBD_CONNECT_SUBTREE); - LASSERTF(OBD_CONNECT_LOCK_AHEAD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LOCK_AHEAD); + LASSERTF(OBD_CONNECT_LOCKAHEAD_OLD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOCKAHEAD_OLD); LASSERTF(OBD_CONNECT_BULK_MBITS == 0x2000000000000000ULL, "found 0x%.16llxULL\n", 
OBD_CONNECT_BULK_MBITS); LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", @@ -1239,12 +1327,52 @@ void lustre_assert_wire_constants(void) OBD_CONNECT_FLAGS2); LASSERTF(OBD_CONNECT2_FILE_SECCTX == 0x1ULL, "found 0x%.16llxULL\n", OBD_CONNECT2_FILE_SECCTX); + LASSERTF(OBD_CONNECT2_LOCKAHEAD == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCKAHEAD); + LASSERTF(OBD_CONNECT2_DIR_MIGRATE == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_DIR_MIGRATE); + LASSERTF(OBD_CONNECT2_FLR == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FLR); + LASSERTF(OBD_CONNECT2_WBC_INTENTS == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_WBC_INTENTS); + LASSERTF(OBD_CONNECT2_LOCK_CONVERT == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LOCK_CONVERT); + LASSERTF(OBD_CONNECT2_ARCHIVE_ID_ARRAY == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ARCHIVE_ID_ARRAY); + LASSERTF(OBD_CONNECT2_SELINUX_POLICY == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_SELINUX_POLICY); + LASSERTF(OBD_CONNECT2_LSOM == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_LSOM); + LASSERTF(OBD_CONNECT2_ASYNC_DISCARD == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ASYNC_DISCARD); + LASSERTF(OBD_CONNECT2_ENCRYPT == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_ENCRYPT); + LASSERTF(OBD_CONNECT2_FIDMAP== 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_FIDMAP); + LASSERTF(OBD_CONNECT2_GETATTR_PFID== 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_GETATTR_PFID); + LASSERTF(OBD_CONNECT2_MDLL == 0x1000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL); + LASSERTF(OBD_CONNECT2_MDLL_AUTO_REFRESH == 0x2000000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_AUTO_REFRESH); LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_CRC32); LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_ADLER); LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)OBD_CKSUM_CRC32C); + LASSERTF(OBD_CKSUM_RESERVED == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_RESERVED); + LASSERTF(OBD_CKSUM_T10IP512 == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP512); + LASSERTF(OBD_CKSUM_T10IP4K == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10IP4K); + LASSERTF(OBD_CKSUM_T10CRC512 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC512); + LASSERTF(OBD_CKSUM_T10CRC4K == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10CRC4K); + LASSERTF(OBD_CKSUM_T10_TOP == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_T10_TOP); /* Checks for struct ost_layout */ LASSERTF((int)sizeof(struct ost_layout) == 28, "found %lld\n", @@ -1361,10 +1489,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obdo, o_layout)); LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n", (long long)(int)sizeof(((struct obdo *)0)->o_layout)); - LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_padding_3)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_padding_3)); + LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_layout_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_layout_version)); LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", (long 
long)(int)offsetof(struct obdo, o_uid_h)); LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", @@ -1419,8 +1547,8 @@ void lustre_assert_wire_constants(void) OBD_MD_FLFLAGS); LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", OBD_MD_FLNLINK); - LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGENER); + LASSERTF(OBD_MD_FLPARENT == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLPARENT); LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", OBD_MD_FLRDEV); LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", @@ -1431,14 +1559,10 @@ void lustre_assert_wire_constants(void) OBD_MD_FLHANDLE); LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", OBD_MD_FLCKSUM); - LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLQOS); LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLGROUP); LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLFID); - LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLEPOCH); LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLGRANT); LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", @@ -1451,8 +1575,6 @@ void lustre_assert_wire_constants(void) OBD_MD_FLMODEASIZE); LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", OBD_MD_MDS); - LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", - OBD_MD_REINT); LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", OBD_MD_MEA); LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), "found 0x%.16llxULL\n", @@ -1465,12 +1587,6 @@ void lustre_assert_wire_constants(void) OBD_MD_FLXATTRRM); LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLACL); - LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLMDSCAPA); - LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLOSSCAPA); - LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLCKSPLIT); LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLCROSSREF); LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", @@ -1483,7 +1599,6 @@ void lustre_assert_wire_constants(void) OBD_MD_DEFAULT_MEA); LASSERTF(OBD_MD_FLOSTLAYOUT == (0x0080000000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLOSTLAYOUT); - LASSERTF(OBD_MD_FLPROJID == (0x0100000000000000ULL), "found 0x%.16llxULL\n", OBD_MD_FLPROJID); CLASSERT(OBD_FL_INLINEDATA == 0x00000001); @@ -1500,7 +1615,10 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); - CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); + CLASSERT(OBD_FL_CKSUM_T10IP512 == 0x00005000); + CLASSERT(OBD_FL_CKSUM_T10IP4K == 0x00006000); + CLASSERT(OBD_FL_CKSUM_T10CRC512 == 0x00007000); + CLASSERT(OBD_FL_CKSUM_T10CRC4K == 0x00008000); CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); CLASSERT(OBD_FL_MMAP == 0x00040000); @@ -1599,8 +1717,8 @@ void lustre_assert_wire_constants(void) (unsigned)LOV_PATTERN_RAID0); LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)LOV_PATTERN_RAID1); - LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_FIRST); + LASSERTF(LOV_PATTERN_MDT == 
0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_MDT); LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", (unsigned)LOV_PATTERN_CMOBD); @@ -1627,12 +1745,22 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_size)); LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size) == 4, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_size)); - LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding) == 32, "found %lld\n", - (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding)); - LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_layout_gen)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_layout_gen)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_timestamp)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_timestamp)); + LASSERTF((int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_entry_v1, lcme_padding_1)); + LASSERTF((int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding_1)); LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n", (unsigned)LCME_FL_INIT); + LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n", + (unsigned)LCME_FL_NEG); /* Checks for struct lov_comp_md_v1 */ LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n", @@ -1657,9 +1785,13 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count)); LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count)); - LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n", + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count)); + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count)); + LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n", (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1)); - LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n", + LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1)); LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n", (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2)); @@ -1670,6 +1802,14 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n", (long long)(int)sizeof(((struct lov_comp_md_v1 
*)0)->lcm_entries[0])); CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0)); + LASSERTF(LCM_FL_NONE == 0, "found %lld\n", + (long long)LCM_FL_NONE); + LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n", + (long long)LCM_FL_RDONLY); + LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n", + (long long)LCM_FL_WRITE_PENDING); + LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n", + (long long)LCM_FL_SYNC_PENDING); /* Checks for struct lmv_mds_md_v1 */ LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", @@ -1694,13 +1834,17 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding1) == 20, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding1)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 24, "found %lld\n", + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_offset)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_offset)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_migrate_hash)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_migrate_hash)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 28, "found %lld\n", (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 8, "found %lld\n", + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 4, "found %lld\n", (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); @@ -1741,6 +1885,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_bavail)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_files)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_files)); LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_ffree)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", @@ -1757,6 +1905,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_namelen)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_maxbytes)); + 
LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes)); LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_state)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", @@ -1765,10 +1917,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare2)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); + LASSERTF((int)offsetof(struct obd_statfs, os_granted) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_granted)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_granted) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_granted)); LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", (long long)(int)offsetof(struct obd_statfs, os_spare3)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", @@ -1797,6 +1949,20 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_statfs, os_spare9)); LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + LASSERTF(OS_STATE_DEGRADED == 0x1, "found %lld\n", + (long long)OS_STATE_DEGRADED); + LASSERTF(OS_STATE_READONLY == 0x2, "found %lld\n", + (long long)OS_STATE_READONLY); + LASSERTF(OS_STATE_NOPRECREATE == 0x4, "found %lld\n", + (long long)OS_STATE_NOPRECREATE); + LASSERTF(OS_STATE_ENOSPC == 0x20, "found %lld\n", + (long long)OS_STATE_ENOSPC); + LASSERTF(OS_STATE_ENOINO == 0x40, "found %lld\n", + (long long)OS_STATE_ENOINO); + LASSERTF(OS_STATE_SUM == 0x100, "found %lld\n", + (long long)OS_STATE_SUM); + LASSERTF(OS_STATE_NONROT == 0x200, "found %lld\n", + (long long)OS_STATE_NONROT); /* Checks for struct obd_ioobj */ LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", @@ -2127,6 +2293,33 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + LASSERTF(MDS_CROSS_REF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MDS_CROSS_REF); + LASSERTF(MDS_PERM_BYPASS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MDS_PERM_BYPASS); + LASSERTF(MDS_QUOTA_IGNORE == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MDS_QUOTA_IGNORE); + LASSERTF(MDS_KEEP_ORPHAN == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MDS_KEEP_ORPHAN); + LASSERTF(MDS_RECOV_OPEN == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MDS_RECOV_OPEN); + LASSERTF(MDS_DATA_MODIFIED == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)MDS_DATA_MODIFIED); + LASSERTF(MDS_CREATE_VOLATILE == 0x00000400UL, "found 0x%.8xUL\n", + (unsigned)MDS_CREATE_VOLATILE); + LASSERTF(MDS_OWNEROVERRIDE == 0x00000800UL, "found 0x%.8xUL\n", + (unsigned)MDS_OWNEROVERRIDE); + LASSERTF(MDS_HSM_RELEASE == 0x00001000UL, "found 0x%.8xUL\n", + (unsigned)MDS_HSM_RELEASE); + LASSERTF(MDS_CLOSE_LAYOUT_SWAP == 0x00004000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SWAP); + 
LASSERTF(MDS_CLOSE_LAYOUT_MERGE == 0x00008000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_MERGE); + LASSERTF(MDS_CLOSE_RESYNC_DONE == 0x00010000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_RESYNC_DONE); + LASSERTF(MDS_CLOSE_LAYOUT_SPLIT == 0x00020000UL, "found 0x%.8xUL\n", + (unsigned)MDS_CLOSE_LAYOUT_SPLIT); + /* Checks for struct mdt_body */ LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", (long long)(int)sizeof(struct mdt_body)); @@ -2138,10 +2331,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_fid2)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); - LASSERTF((int)offsetof(struct mdt_body, mbo_handle) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_handle)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_open_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_open_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_open_handle)); LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_valid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", @@ -2166,6 +2359,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_blocks)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_version) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_version)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_version)); LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_t_state)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", @@ -2206,10 +2403,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_nlink)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); - LASSERTF((int)offsetof(struct mdt_body, mbo_unused2) == 140, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_unused2)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_layout_gen) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_layout_gen)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_layout_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_layout_gen)); LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", @@ -2242,14 +2439,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_body, mbo_projid)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_projid) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_body 
*)0)->mbo_projid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_6) == 176, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_6)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_6) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_6)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_7) == 184, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_7)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_7) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_7)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_size) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_dom_blocks) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_dom_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_dom_blocks)); LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", @@ -2268,8 +2465,6 @@ void lustre_assert_wire_constants(void) MDS_FMODE_EXEC); LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", MDS_OPEN_CREATED); - LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", - MDS_OPEN_CROSS); LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", MDS_OPEN_CREAT); LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", @@ -2332,14 +2527,20 @@ void lustre_assert_wire_constants(void) MDS_INODELOCK_OPEN); LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", MDS_INODELOCK_LAYOUT); + LASSERTF(MDS_INODELOCK_PERM == 0x000010, "found 0x%.8x\n", + MDS_INODELOCK_PERM); + LASSERTF(MDS_INODELOCK_XATTR == 0x000020, "found 0x%.8x\n", + MDS_INODELOCK_XATTR); + LASSERTF(MDS_INODELOCK_DOM == 0x000040, "found 0x%.8x\n", + MDS_INODELOCK_DOM); /* Checks for struct mdt_ioepoch */ LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", (long long)(int)sizeof(struct mdt_ioepoch)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_handle) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_handle)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, mio_open_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, mio_open_handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_open_handle)); LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", @@ -2508,10 +2709,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", - (long 
long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_open_handle_old) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_open_handle_old)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_open_handle_old)); LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", (long long)(int)offsetof(struct mdt_rec_create, cr_time)); LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", @@ -2945,6 +3146,102 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + /* Checks for struct mdt_rec_resync */ + LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_resync)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync 
*)0)->rs_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7)); + 
LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 134, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9)); + LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9)); + /* Checks for struct mdt_rec_reint */ LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", (long long)(int)sizeof(struct mdt_rec_reint)); @@ -3036,9 +3333,13 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mirror_id) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mirror_id)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id) == 2, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mirror_id)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 134, "found %lld\n", (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 2, "found %lld\n", (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); /* Checks for struct lmv_desc */ @@ -3164,12 +3465,16 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); /* Checks for struct ldlm_inodebits */ - LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", + LASSERTF((int)sizeof(struct ldlm_inodebits) == 16, "found %lld\n", (long long)(int)sizeof(struct ldlm_inodebits)); LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", (long long)(int)offsetof(struct ldlm_inodebits, bits)); LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, try_bits) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, try_bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->try_bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->try_bits)); /* Checks for struct ldlm_flock_wire */ LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", @@ -3212,24 +3517,14 @@ void lustre_assert_wire_constants(void) (long long)IT_GETATTR); LASSERTF(IT_LOOKUP == 16, "found %lld\n", (long long)IT_LOOKUP); - LASSERTF(IT_UNLINK == 32, "found %lld\n", - (long 
long)IT_UNLINK); - LASSERTF(IT_TRUNC == 64, "found %lld\n", - (long long)IT_TRUNC); LASSERTF(IT_GETXATTR == 128, "found %lld\n", (long long)IT_GETXATTR); - LASSERTF(IT_EXEC == 256, "found %lld\n", - (long long)IT_EXEC); - LASSERTF(IT_PIN == 512, "found %lld\n", - (long long)IT_PIN); LASSERTF(IT_LAYOUT == 1024, "found %lld\n", (long long)IT_LAYOUT); LASSERTF(IT_QUOTA_DQACQ == 2048, "found %lld\n", (long long)IT_QUOTA_DQACQ); LASSERTF(IT_QUOTA_CONN == 4096, "found %lld\n", (long long)IT_QUOTA_CONN); - LASSERTF(IT_SETXATTR == 8192, "found %lld\n", - (long long)IT_SETXATTR); /* Checks for struct ldlm_resource_desc */ LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", @@ -3702,14 +3997,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_projid) == 56, "found %lld\n", (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_projid)); LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_projid)); - LASSERTF((int)offsetof(struct llog_setattr64_rec_v2, lsr_tail) == 80, "found %lld\n", - (long long)(int)offsetof(struct llog_setattr64_rec_v2, lsr_tail)); - LASSERTF((int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail) == 8, "found %lld\n", - (long long)(int)sizeof(((struct llog_setattr64_rec_v2 *)0)->lsr_tail)); /* Checks for struct llog_size_change_rec */ LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", @@ -3838,10 +4133,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); - LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", - (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding)); - LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_time) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_time)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_time)); LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", @@ -3949,12 +4244,7 @@ void lustre_assert_wire_constants(void) CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); - CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); - 
CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); - CLASSERT(LLOG_ORIGIN_CONNECT == 506); - CLASSERT(LLOG_CATINFO == 507); CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); - CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); CLASSERT(LLOG_FIRST_OPC == 501); CLASSERT(LLOG_LAST_OPC == 510); CLASSERT(LLOG_CONFIG_ORIG_CTXT == 0); @@ -4426,14 +4716,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct layout_intent, li_flags)); LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); - LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_start)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_start)); - LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_end)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_end)); + LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_extent)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_extent)); LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", (long long)LAYOUT_INTENT_ACCESS); LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", @@ -5089,12 +5375,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lfsck_request, lr_padding_3)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_3) == 8, "found %lld\n", (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_3)); +#ifdef HAVE_SERVER_SUPPORT LASSERTF(LFSCK_TYPE_SCRUB == 0x00000000UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_SCRUB); LASSERTF(LFSCK_TYPE_LAYOUT == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_LAYOUT); LASSERTF(LFSCK_TYPE_NAMESPACE == 0x00000004UL, "found 0x%.8xUL\n", (unsigned)LFSCK_TYPE_NAMESPACE); +#endif LASSERTF(LE_LASTID_REBUILDING == 1, "found %lld\n", (long long)LE_LASTID_REBUILDING); LASSERTF(LE_LASTID_REBUILT == 2, "found %lld\n", @@ -5157,7 +5445,7 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct update_params *)0)->up_params)); /* Checks for struct update_op */ - LASSERTF((int)sizeof(struct update_op) == 24, "found %lld\n", + LASSERTF((int)sizeof(struct update_op) == 20, "found %lld\n", (long long)(int)sizeof(struct update_op)); LASSERTF((int)offsetof(struct update_op, uop_fid) == 0, "found %lld\n", (long long)(int)offsetof(struct update_op, uop_fid)); @@ -5226,75 +5514,145 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct llog_update_record *)0)->lur_update_rec) == 32, "found %lld\n", (long long)(int)sizeof(((struct llog_update_record *)0)->lur_update_rec)); - /* Checks for struct lu_ladvise */ - LASSERTF((int)sizeof(struct lu_ladvise) == 32, "found %lld\n", - (long long)(int)sizeof(struct lu_ladvise)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_advice) == 0, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_advice)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_advice) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_advice)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value1) == 2, "found %lld\n", - (long 
long)(int)offsetof(struct lu_ladvise, lla_value1)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value1) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value1)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value2) == 4, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value2)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value2)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_start) == 8, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_start)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_start)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_end) == 16, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_end)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_end)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value3) == 24, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value3)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value3)); - LASSERTF((int)offsetof(struct lu_ladvise, lla_value4) == 28, "found %lld\n", - (long long)(int)offsetof(struct lu_ladvise, lla_value4)); - LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); - LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", - (long long)LU_LADVISE_WILLREAD); - LASSERTF(LU_LADVISE_DONTNEED == 2, "found %lld\n", - (long long)LU_LADVISE_DONTNEED); - - /* Checks for struct ladvise_hdr */ - LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n", - LADVISE_MAGIC); - LASSERTF((int)sizeof(struct ladvise_hdr) == 32, "found %lld\n", - (long long)(int)sizeof(struct ladvise_hdr)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_magic)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_magic)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_count) == 4, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_count)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_count)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_flags) == 8, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_flags)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_flags) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_flags)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value1) == 16, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value1)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value1)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value2) == 20, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_value2)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value2)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_value3) == 24, "found %lld\n", - (long 
long)(int)offsetof(struct ladvise_hdr, lah_value3)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_value3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_value3)); - LASSERTF((int)offsetof(struct ladvise_hdr, lah_advise) == 32, "found %lld\n", - (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); - LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", - (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); - LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)LF_ASYNC); + /* Checks for struct lustre_cfg */ + LASSERTF((int)sizeof(struct lustre_cfg) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_cfg)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_version)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_version)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_command) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_command)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_command) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_command)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_num) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_num)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_num) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_num)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_flags)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_flags)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nid) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nid)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nid)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_nal) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_nal)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_nal) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_nal)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_bufcount) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_bufcount)); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_bufcount)); + LASSERTF((int)offsetof(struct lustre_cfg, lcfg_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_cfg, lcfg_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_cfg *)0)->lcfg_buflens[0])); + LASSERTF(LCFG_ATTACH == 0x000cf001UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ATTACH); + LASSERTF(LCFG_DETACH == 0x000cf002UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DETACH); + LASSERTF(LCFG_SETUP == 0x000cf003UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SETUP); + LASSERTF(LCFG_CLEANUP == 0x000cf004UL, "found 0x%.8xUL\n", + (unsigned)LCFG_CLEANUP); + LASSERTF(LCFG_ADD_UUID == 0x000cf005UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_UUID); + LASSERTF(LCFG_DEL_UUID == 0x000cf006UL, "found 0x%.8xUL\n", + 
(unsigned)LCFG_DEL_UUID); + LASSERTF(LCFG_MOUNTOPT == 0x000cf007UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MOUNTOPT); + LASSERTF(LCFG_DEL_MOUNTOPT == 0x000cf008UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MOUNTOPT); + LASSERTF(LCFG_SET_TIMEOUT == 0x000cf009UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_TIMEOUT); + LASSERTF(LCFG_SET_UPCALL == 0x000cf00aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_UPCALL); + LASSERTF(LCFG_ADD_CONN == 0x000cf00bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_CONN); + LASSERTF(LCFG_DEL_CONN == 0x000cf00cUL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_CONN); + LASSERTF(LCFG_LOV_ADD_OBD == 0x000cf00dUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_OBD); + LASSERTF(LCFG_LOV_DEL_OBD == 0x000cf00eUL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_DEL_OBD); + LASSERTF(LCFG_PARAM == 0x000cf00fUL, "found 0x%.8xUL\n", + (unsigned)LCFG_PARAM); + LASSERTF(LCFG_MARKER == 0x000cf010UL, "found 0x%.8xUL\n", + (unsigned)LCFG_MARKER); + LASSERTF(LCFG_LOG_START == 0x000ce011UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_START); + LASSERTF(LCFG_LOG_END == 0x000ce012UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOG_END); + LASSERTF(LCFG_LOV_ADD_INA == 0x000ce013UL, "found 0x%.8xUL\n", + (unsigned)LCFG_LOV_ADD_INA); + LASSERTF(LCFG_ADD_MDC == 0x000cf014UL, "found 0x%.8xUL\n", + (unsigned)LCFG_ADD_MDC); + LASSERTF(LCFG_DEL_MDC == 0x000cf015UL, "found 0x%.8xUL\n", + (unsigned)LCFG_DEL_MDC); + LASSERTF(LCFG_SPTLRPC_CONF == 0x000ce016UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SPTLRPC_CONF); + LASSERTF(LCFG_POOL_NEW == 0x000ce020UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_NEW); + LASSERTF(LCFG_POOL_ADD == 0x000ce021UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_ADD); + LASSERTF(LCFG_POOL_REM == 0x000ce022UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_REM); + LASSERTF(LCFG_POOL_DEL == 0x000ce023UL, "found 0x%.8xUL\n", + (unsigned)LCFG_POOL_DEL); + LASSERTF(LCFG_SET_LDLM_TIMEOUT == 0x000ce030UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_LDLM_TIMEOUT); + LASSERTF(LCFG_PRE_CLEANUP == 0x000cf031UL, "found 0x%.8xUL\n", + (unsigned)LCFG_PRE_CLEANUP); + LASSERTF(LCFG_SET_PARAM == 0x000ce032UL, "found 0x%.8xUL\n", + (unsigned)LCFG_SET_PARAM); + LASSERTF(LCFG_NODEMAP_ADD == 0x000ce040UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD); + LASSERTF(LCFG_NODEMAP_DEL == 0x000ce041UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL); + LASSERTF(LCFG_NODEMAP_ADD_RANGE == 0x000ce042UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_RANGE); + LASSERTF(LCFG_NODEMAP_DEL_RANGE == 0x000ce043UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_RANGE); + LASSERTF(LCFG_NODEMAP_ADD_UIDMAP == 0x000ce044UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_UIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_UIDMAP == 0x000ce045UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_UIDMAP); + LASSERTF(LCFG_NODEMAP_ADD_GIDMAP == 0x000ce046UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_GIDMAP); + LASSERTF(LCFG_NODEMAP_DEL_GIDMAP == 0x000ce047UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_GIDMAP); + LASSERTF(LCFG_NODEMAP_ACTIVATE == 0x000ce048UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ACTIVATE); + LASSERTF(LCFG_NODEMAP_ADMIN == 0x000ce049UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADMIN); + LASSERTF(LCFG_NODEMAP_TRUSTED == 0x000ce050UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TRUSTED); + LASSERTF(LCFG_NODEMAP_SQUASH_UID == 0x000ce051UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_UID); + LASSERTF(LCFG_NODEMAP_SQUASH_GID == 0x000ce052UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SQUASH_GID); + 
LASSERTF(LCFG_NODEMAP_ADD_SHKEY == 0x000ce053UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_ADD_SHKEY); + LASSERTF(LCFG_NODEMAP_DEL_SHKEY == 0x000ce054UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DEL_SHKEY); + LASSERTF(LCFG_NODEMAP_TEST_NID == 0x000ce055UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_NID); + LASSERTF(LCFG_NODEMAP_TEST_ID == 0x000ce056UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_TEST_ID); + LASSERTF(LCFG_NODEMAP_SET_FILESET == 0x000ce057UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_FILESET); + LASSERTF(LCFG_NODEMAP_DENY_UNKNOWN == 0x000ce058UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_DENY_UNKNOWN); + LASSERTF(LCFG_NODEMAP_MAP_MODE == 0x000ce059UL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_MAP_MODE); + LASSERTF(LCFG_NODEMAP_AUDIT_MODE == 0x000ce05aUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_AUDIT_MODE); + LASSERTF(LCFG_NODEMAP_SET_SEPOL == 0x000ce05bUL, "found 0x%.8xUL\n", + (unsigned)LCFG_NODEMAP_SET_SEPOL); + LASSERTF(PORTALS_CFG_TYPE == 1, "found %lld\n", + (long long)PORTALS_CFG_TYPE); + LASSERTF(LUSTRE_CFG_TYPE == 123, "found %lld\n", + (long long)LUSTRE_CFG_TYPE); } diff --git a/drivers/staging/lustrefsx/lustre/target/barrier.c b/drivers/staging/lustrefsx/lustre/target/barrier.c index 6145e0e37a711..54b3e567b3605 100644 --- a/drivers/staging/lustrefsx/lustre/target/barrier.c +++ b/drivers/staging/lustrefsx/lustre/target/barrier.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2017, Intel Corporation. * * lustre/target/barrier.c * @@ -35,12 +35,11 @@ #include -#include #include #include #include #include -#include +#include static LIST_HEAD(barrier_instance_list); static DEFINE_SPINLOCK(barrier_instance_lock); @@ -53,7 +52,7 @@ struct barrier_instance { rwlock_t bi_rwlock; struct percpu_counter bi_writers; atomic_t bi_ref; - time_t bi_deadline; + time64_t bi_deadline; __u32 bi_status; }; @@ -173,7 +172,7 @@ static void barrier_set(struct barrier_instance *barrier, __u32 status) static int barrier_freeze(const struct lu_env *env, struct barrier_instance *barrier, bool phase1) { - int left; + time64_t left; int rc = 0; __s64 inflight = 0; ENTRY; @@ -195,7 +194,7 @@ static int barrier_freeze(const struct lu_env *env, LASSERT(barrier->bi_deadline != 0); - left = barrier->bi_deadline - cfs_time_current_sec(); + left = barrier->bi_deadline - ktime_get_real_seconds(); if (left <= 0) RETURN(1); @@ -214,8 +213,7 @@ static int barrier_freeze(const struct lu_env *env, if (rc) RETURN(rc); - if (cfs_time_beforeq(barrier->bi_deadline, - cfs_time_current_sec())) + if (ktime_get_real_seconds() > barrier->bi_deadline) RETURN(1); } @@ -252,7 +250,7 @@ bool barrier_entry(struct dt_device *key) if (likely(barrier->bi_status != BS_FREEZING_P1 && barrier->bi_status != BS_FREEZING_P2 && barrier->bi_status != BS_FROZEN) || - cfs_time_beforeq(barrier->bi_deadline, cfs_time_current_sec())) { + ktime_get_real_seconds() > barrier->bi_deadline) { percpu_counter_inc(&barrier->bi_writers); entered = true; } @@ -292,7 +290,7 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) ENTRY; /* glimpse on barrier locks always packs a glimpse descriptor */ - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_DESC_CALLBACK); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC); desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC); if (!desc) GOTO(out, rc = -EPROTO); @@ -326,8 +324,8 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) if 
(OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE)) GOTO(fini, rc = -EINVAL); - barrier->bi_deadline = cfs_time_current_sec() + - desc->lgbd_timeout; + barrier->bi_deadline = ktime_get_real_seconds() + + desc->lgbd_timeout; rc = barrier_freeze(&env, barrier, desc->lgbd_status == BS_FREEZING_P1); break; @@ -358,7 +356,7 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req) lvb->lvb_index = barrier_dev_idx(barrier); CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, " - "deadline %lu: rc = %d\n", barrier_barrier2name(barrier), + "deadline %lld: rc = %d\n", barrier_barrier2name(barrier), lvb->lvb_status, barrier->bi_deadline, rc); barrier_instance_put(barrier); diff --git a/drivers/staging/lustrefsx/lustre/target/out_handler.c b/drivers/staging/lustrefsx/lustre/target/out_handler.c index c342ae41f95c0..a238f588e0cd1 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_handler.c +++ b/drivers/staging/lustrefsx/lustre/target/out_handler.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. * * lustre/target/out_handler.c * @@ -52,7 +52,7 @@ static void out_reconstruct(const struct lu_env *env, struct dt_device *dt, struct object_update_reply *reply, int index) { - CDEBUG(D_INFO, "%s: fork reply reply %p index %d: rc = %d\n", + CDEBUG(D_HA, "%s: fork reply reply %p index %d: rc = %d\n", dt_obd_name(dt), reply, index, 0); object_update_result_insert(reply, NULL, 0, index, 0); @@ -65,16 +65,10 @@ typedef void (*out_reconstruct_t)(const struct lu_env *env, struct object_update_reply *reply, int index); -static inline int out_check_resent(const struct lu_env *env, - struct dt_device *dt, - struct dt_object *obj, - struct ptlrpc_request *req, - out_reconstruct_t reconstruct, - struct object_update_reply *reply, - int index) +static inline bool out_check_resent(struct ptlrpc_request *req) { if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))) - return 0; + return false; if (req_xid_is_last(req)) { struct lsd_client_data *lcd; @@ -90,14 +84,12 @@ static inline int out_check_resent(const struct lu_env *env, lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); lustre_msg_set_status(req->rq_repmsg, req->rq_status); - DEBUG_REQ(D_RPCTRACE, req, "restoring resent RPC"); - - reconstruct(env, dt, obj, reply, index); - return 1; + DEBUG_REQ(D_HA, req, "reconstruct resent RPC"); + return true; } - DEBUG_REQ(D_HA, req, "no reply for RESENT req (have %lld)", - req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); - return 0; + DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld", + req->rq_export->exp_target_data.ted_lcd->lcd_last_xid); + return false; } static int out_create(struct tgt_session_info *tsi) @@ -289,10 +281,62 @@ static int out_xattr_get(struct tgt_session_info *tsi) } else if (lbuf->lb_buf) { lbuf->lb_len = rc; } - - CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d: rc = %d\n", + CDEBUG(D_INFO, "%s: "DFID" get xattr %s len %d\n", tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), - name, (int)lbuf->lb_len, rc); + name, rc); + + GOTO(out, rc); + +out: + object_update_result_insert(reply, lbuf->lb_buf, lbuf->lb_len, idx, rc); + RETURN(0); +} + +static int out_xattr_list(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct tgt_thread_info *tti = tgt_th_info(env); + struct lu_buf *lbuf = &tti->tti_buf; + struct object_update_reply *reply = tti->tti_u.update.tti_update_reply; + struct dt_object *obj = 
tti->tti_u.update.tti_dt_object; + struct object_update_result *update_result; + int idx = tti->tti_u.update.tti_update_reply_index; + int rc; + + ENTRY; + + if (!lu_object_exists(&obj->do_lu)) { + set_bit(LU_OBJECT_HEARD_BANSHEE, + &obj->do_lu.lo_header->loh_flags); + RETURN(-ENOENT); + } + + update_result = object_update_result_get(reply, 0, NULL); + if (!update_result) { + rc = -EPROTO; + CERROR("%s: empty buf for xattr list: rc = %d\n", + tgt_name(tsi->tsi_tgt), rc); + RETURN(rc); + } + + lbuf->lb_len = (int)tti->tti_u.update.tti_update->ou_result_size; + lbuf->lb_buf = update_result->our_data; + if (lbuf->lb_len == 0) + lbuf->lb_buf = 0; + + dt_read_lock(env, obj, MOR_TGT_CHILD); + rc = dt_xattr_list(env, obj, lbuf); + dt_read_unlock(env, obj); + if (rc <= 0) { + lbuf->lb_len = 0; + if (unlikely(!rc)) + rc = -ENODATA; + } else if (lbuf->lb_buf) { + lbuf->lb_len = rc; + } + + CDEBUG(D_INFO, "%s: "DFID" list xattr len %d\n", + tgt_name(tsi->tsi_tgt), PFID(lu_object_fid(&obj->do_lu)), rc); /* Since we directly use update_result->our_data as the lbuf->lb_buf, * then use NULL for result_insert to avoid unnecessary memory copy. */ @@ -759,6 +803,8 @@ static struct tgt_handler out_update_ops[] = { DEF_OUT_HNDL(OUT_WRITE, "out_write", MUTABOR | HABEO_REFERO, out_write), DEF_OUT_HNDL(OUT_READ, "out_read", HABEO_REFERO, out_read), DEF_OUT_HNDL(OUT_NOOP, "out_noop", HABEO_REFERO, out_noop), + DEF_OUT_HNDL(OUT_XATTR_LIST, "out_xattr_list", HABEO_REFERO, + out_xattr_list), }; static struct tgt_handler *out_handler_find(__u32 opc) @@ -917,6 +963,8 @@ int out_handle(struct tgt_session_info *tsi) int rc1 = 0; int ouh_size, reply_size; int updates; + bool need_reconstruct; + ENTRY; req_capsule_set(pill, &RQF_OUT_UPDATE); @@ -1054,6 +1102,8 @@ int out_handle(struct tgt_session_info *tsi) tti->tti_u.update.tti_update_reply = reply; tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi)); + need_reconstruct = out_check_resent(pill->rc_req); + /* Walk through updates in the request to execute them */ for (i = 0; i < update_buf_count; i++) { struct tgt_handler *h; @@ -1101,12 +1151,19 @@ int out_handle(struct tgt_session_info *tsi) /* Check resend case only for modifying RPC */ if (h->th_flags & MUTABOR) { - struct ptlrpc_request *req = tgt_ses_req(tsi); + /* sanity check for last XID changing */ + if (unlikely(!need_reconstruct && + req_xid_is_last(pill->rc_req))) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "unexpected last XID change"); + GOTO(next, rc = -EINVAL); + } - if (out_check_resent(env, dt, dt_obj, req, - out_reconstruct, reply, - reply_index)) + if (need_reconstruct) { + out_reconstruct(env, dt, dt_obj, reply, + reply_index); GOTO(next, rc = 0); + } if (dt->dd_rdonly) GOTO(next, rc = -EROFS); @@ -1115,6 +1172,10 @@ int out_handle(struct tgt_session_info *tsi) /* start transaction for modification RPC only */ if (h->th_flags & MUTABOR && current_batchid == -1) { current_batchid = update->ou_batchid; + + if (reply_index == 0) + CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE); + rc = out_tx_start(env, dt, ta, tsi->tsi_exp); if (rc != 0) GOTO(next, rc); diff --git a/drivers/staging/lustrefsx/lustre/target/out_lib.c b/drivers/staging/lustrefsx/lustre/target/out_lib.c index c267ed20bf485..e8ebf95f4786c 100644 --- a/drivers/staging/lustrefsx/lustre/target/out_lib.c +++ b/drivers/staging/lustrefsx/lustre/target/out_lib.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, 2015, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. 
*/ /* * lustre/target/out_lib.c @@ -53,6 +53,7 @@ const char *update_op_str(__u16 opc) [OUT_ATTR_GET] = "attr_get", [OUT_XATTR_SET] = "xattr_set", [OUT_XATTR_GET] = "xattr_get", + [OUT_XATTR_LIST] = "xattr_list", [OUT_INDEX_LOOKUP] = "lookup", [OUT_INDEX_INSERT] = "insert", [OUT_INDEX_DELETE] = "delete", @@ -102,7 +103,7 @@ int out_update_header_pack(const struct lu_env *env, unsigned int i; size_t update_size; - if (((reply_size + 7) >> 3) >= 1ULL << 16) + if (reply_size >= LNET_MTU) return -EINVAL; /* Check whether the packing exceeding the maxima update length */ @@ -404,6 +405,15 @@ int out_xattr_get_pack(const struct lu_env *env, struct object_update *update, } EXPORT_SYMBOL(out_xattr_get_pack); +int out_xattr_list_pack(const struct lu_env *env, struct object_update *update, + size_t *max_update_size, const struct lu_fid *fid, + const int bufsize) +{ + return out_update_pack(env, update, max_update_size, OUT_XATTR_LIST, + fid, 0, NULL, NULL, bufsize); +} +EXPORT_SYMBOL(out_xattr_list_pack); + int out_read_pack(const struct lu_env *env, struct object_update *update, size_t *max_update_size, const struct lu_fid *fid, size_t size, loff_t pos) @@ -588,6 +598,10 @@ int out_create_add_exec(const struct lu_env *env, struct dt_object *obj, struct tx_arg *arg; int rc; + /* LU-13653: ignore quota for DNE directory creation */ + if (dof->dof_type == DFT_DIR) + th->th_ignore_quota = 1; + rc = dt_declare_create(env, obj, attr, NULL, dof, th); if (rc != 0) return rc; @@ -657,6 +671,10 @@ int out_attr_set_add_exec(const struct lu_env *env, struct dt_object *dt_obj, if (rc != 0) return rc; + if (attr->la_valid & LA_FLAGS && + attr->la_flags & LUSTRE_SET_SYNC_FL) + th->th_sync |= 1; + arg = tx_add_exec(ta, out_tx_attr_set_exec, out_tx_attr_set_undo, file, line); if (IS_ERR(arg)) @@ -797,8 +815,7 @@ static int out_tx_xattr_set_exec(const struct lu_env *env, lu_buf_free(&tbuf); if (update) { - leh->leh_overflow_time = - cfs_time_current_sec(); + leh->leh_overflow_time = ktime_get_real_seconds(); if (unlikely(!leh->leh_overflow_time)) leh->leh_overflow_time++; } @@ -1060,7 +1077,7 @@ static int out_obj_index_insert(const struct lu_env *env, return -ENOTDIR; dt_write_lock(env, dt_obj, MOR_TGT_CHILD); - rc = dt_insert(env, dt_obj, rec, key, th, 0); + rc = dt_insert(env, dt_obj, rec, key, th); dt_write_unlock(env, dt_obj); return rc; diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c new file mode 100644 index 0000000000000..afbf668e38a70 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/target/tgt_fmd.c @@ -0,0 +1,363 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. 
+ * + * Copyright (c) 2012, 2014, Intel Corporation. + * + * Copyright (c) 2019, DDN Storage Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/target/tgt_fmd.c + * + * This file provides functions to handle Filter Modification Data (FMD). + * The FMD is responsible for file attributes to be applied in + * Transaction ID (XID) order, so older requests can't re-write newer + * attributes. + * + * FMD is organized as per-client list and identified by FID of object. Each + * FMD stores FID of object and the highest received XID of modification + * request for this object. + * + * FMD can expire if there are no updates for a long time to keep the list + * reasonably small. + * + * Author: Andreas Dilger + * Author: Mike Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include "tgt_internal.h" + +/** + * Drop FMD reference and free it if reference drops to zero. + * + * Must be called with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +static inline void tgt_fmd_put_nolock(struct obd_export *exp, + struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + assert_spin_locked(&ted->ted_fmd_lock); + if (--fmd->fmd_refcount == 0) { + ted->ted_fmd_count--; + list_del(&fmd->fmd_list); + OBD_SLAB_FREE_PTR(fmd, tgt_fmd_kmem); + } +} + +/** + * Wrapper to drop FMD reference with ted_fmd_lock held. + * + * \param[in] exp OBD export + * \param[in] fmd FMD to put + */ +void tgt_fmd_put(struct obd_export *exp, struct tgt_fmd_data *fmd) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_put_nolock(exp, fmd); /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Expire FMD entries. + * + * Expire entries from the FMD list if there are too many + * of them or they are too old. + * + * This function must be called with ted_fmd_lock held. + * + * The \a keep FMD is not to be expired in any case. This parameter is used + * by ofd_fmd_find_nolock() to prohibit a FMD that was just found from + * expiring. + * + * \param[in] exp OBD export + * \param[in] keep FMD to keep always + */ +static void tgt_fmd_expire_nolock(struct obd_export *exp, + struct tgt_fmd_data *keep) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + struct tgt_fmd_data *fmd, *tmp; + + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + if (fmd == keep) + break; + + if (now < fmd->fmd_expire && + ted->ted_fmd_count < lut->lut_fmd_max_num) + break; + + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); /* list reference */ + } +} + +/** + * Expire FMD entries. + * + * This is a wrapper to call ofd_fmd_expire_nolock() with the required lock. + * + * \param[in] exp OBD export + */ +void tgt_fmd_expire(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + + spin_lock(&ted->ted_fmd_lock); + tgt_fmd_expire_nolock(exp, NULL); + spin_unlock(&ted->ted_fmd_lock); +} + +/** + * Find FMD by specified FID. + * + * Function finds FMD entry by FID in the tg_export_data::ted_fmd_list. + * + * Caller must hold tg_export_data::ted_fmd_lock and take FMD reference. 
+ * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL is FMD is not found + */ +static struct tgt_fmd_data *tgt_fmd_find_nolock(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd; + struct lu_target *lut = exp->exp_obd->u.obt.obt_lut; + time64_t now = ktime_get_seconds(); + + assert_spin_locked(&ted->ted_fmd_lock); + + list_for_each_entry_reverse(fmd, &ted->ted_fmd_list, fmd_list) { + if (lu_fid_eq(&fmd->fmd_fid, fid)) { + found = fmd; + list_move_tail(&fmd->fmd_list, &ted->ted_fmd_list); + fmd->fmd_expire = now + lut->lut_fmd_max_age; + break; + } + } + + tgt_fmd_expire_nolock(exp, found); + + return found; +} + +/** + * Find FMD by specified FID with locking. + * + * Wrapper to the ofd_fmd_find_nolock() with correct locks. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_find(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) + fmd->fmd_refcount++; /* caller reference */ + spin_unlock(&ted->ted_fmd_lock); + + return fmd; +} + +/** + * Find FMD by FID or create a new one if none is found. + * + * It is possible for this function to return NULL under memory pressure, + * or if the passed FID is zero (which will only cause old entries to expire). + * Currently this is not fatal because any FMD state is transient and + * may also be freed when it gets sufficiently old. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * + * \retval struct tgt_fmd_data found by FID + * \retval NULL indicates FMD is not found + */ +struct tgt_fmd_data *tgt_fmd_get(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *found = NULL, *fmd_new = NULL; + + OBD_SLAB_ALLOC_PTR(fmd_new, tgt_fmd_kmem); + + spin_lock(&ted->ted_fmd_lock); + found = tgt_fmd_find_nolock(exp, fid); + if (fmd_new) { + if (!found) { + list_add_tail(&fmd_new->fmd_list, &ted->ted_fmd_list); + fmd_new->fmd_fid = *fid; + fmd_new->fmd_refcount++; /* list reference */ + found = fmd_new; + ted->ted_fmd_count++; + } else { + OBD_SLAB_FREE_PTR(fmd_new, tgt_fmd_kmem); + } + } + if (found) { + found->fmd_refcount++; /* caller reference */ + found->fmd_expire = ktime_get_seconds() + + class_exp2tgt(exp)->lut_fmd_max_age; + } else { + LCONSOLE_WARN("%s: cannot allocate FMD for "DFID + ", timestamps may be out of sync\n", + exp->exp_obd->obd_name, PFID(fid)); + } + spin_unlock(&ted->ted_fmd_lock); + + return found; +} + +#ifdef DO_FMD_DROP +/** + * Drop FMD list reference so it will disappear when last reference is dropped + * to zero. + * + * This function is called from ofd_destroy() and may only affect + * the one client that is doing the unlink and at worst we have an stale entry + * referencing an object that should never be used again. + * + * NB: this function is used only if DO_FMD_DROP is defined. It is not + * currently defined, so FMD drop doesn't happen and FMD are dropped only + * when expired. 
+ * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to drop + */ +void tgt_fmd_drop(struct obd_export *exp, const struct lu_fid *fid) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL; + + spin_lock(&ted->ted_fmd_lock); + fmd = tgt_fmd_find_nolock(exp, fid); + if (fmd) { + list_del_init(&fmd->fmd_list); + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); +} +EXPORT_SYMBOL(tgt_fmd_drop); +#endif + +/** + * Remove all entries from FMD list. + * + * Cleanup function to free all FMD enries on the given export. + * + * \param[in] exp OBD export + */ +void tgt_fmd_cleanup(struct obd_export *exp) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct tgt_fmd_data *fmd = NULL, *tmp; + + spin_lock(&ted->ted_fmd_lock); + list_for_each_entry_safe(fmd, tmp, &ted->ted_fmd_list, fmd_list) { + list_del_init(&fmd->fmd_list); + if (fmd->fmd_refcount > 1) { + CDEBUG(D_INFO, + "fmd %p still referenced (refcount = %d)\n", + fmd, fmd->fmd_refcount); + } + tgt_fmd_put_nolock(exp, fmd); + } + spin_unlock(&ted->ted_fmd_lock); + LASSERT(list_empty(&exp->exp_target_data.ted_fmd_list)); +} + +/** + * Update FMD with the latest request XID. + * + * Save a new setattr/punch XID in FMD if exists. + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + */ +void tgt_fmd_update(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + + fmd = tgt_fmd_get(exp, fid); + if (fmd) { + if (fmd->fmd_mactime_xid < xid) + fmd->fmd_mactime_xid = xid; + tgt_fmd_put(exp, fmd); + } +} +EXPORT_SYMBOL(tgt_fmd_update); + +/** + * Chech that time can be updated by the request with given XID. + * + * Check FMD XID if exists to be less than supplied XID + * + * \param[in] exp OBD export + * \param[in] fid FID of FMD to find + * \param[in] xid request XID + * + * \retval true if FMD has no greater XID, so time attr can be updated + */ +bool tgt_fmd_check(struct obd_export *exp, const struct lu_fid *fid, __u64 xid) +{ + struct tgt_fmd_data *fmd; + bool can_update = true; + + fmd = tgt_fmd_find(exp, fid); + if (fmd) { + can_update = fmd->fmd_mactime_xid < xid; + tgt_fmd_put(exp, fmd); + } + + return can_update; +} +EXPORT_SYMBOL(tgt_fmd_check); + diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c index 083e40020f1fc..3c5eec062cb4e 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_grant.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_grant.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * lustre/target/tgt_grant.c @@ -71,7 +71,7 @@ * Author: Johann Lombardi */ -#define DEBUG_SUBSYSTEM S_FILTER +#define DEBUG_SUBSYSTEM S_CLASS #include #include @@ -138,11 +138,6 @@ static int tgt_check_export_grants(struct obd_export *exp, u64 *dirty, struct tg_export_data *ted = &exp->exp_target_data; int level = D_CACHE; - if (exp->exp_obd->obd_self_export == exp) - CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " - "%ld\n", exp->exp_obd->obd_name, ted->ted_grant, - ted->ted_pending, ted->ted_dirty); - if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0) level = D_ERROR; CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n", @@ -188,6 +183,7 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) struct lu_target *lut = obd->u.obt.obt_lut; struct tg_grants_data *tgd = &lut->lut_tgd; struct obd_export *exp; + struct tg_export_data *ted; u64 maxsize; u64 tot_dirty = 0; u64 tot_pending = 0; @@ -209,6 +205,15 @@ void tgt_grant_sanity_check(struct obd_device *obd, const char *func) spin_lock(&obd->obd_dev_lock); spin_lock(&tgd->tgd_grant_lock); + exp = obd->obd_self_export; + ted = &exp->exp_target_data; + CDEBUG(D_CACHE, "%s: processing self export: %ld %ld " + "%ld\n", obd->obd_name, ted->ted_grant, + ted->ted_pending, ted->ted_dirty); + tot_granted += ted->ted_grant + ted->ted_pending; + tot_pending += ted->ted_pending; + tot_dirty += ted->ted_dirty; + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending, &tot_granted, maxsize); @@ -275,14 +280,14 @@ EXPORT_SYMBOL(tgt_grant_sanity_check); * \retval negative value on error */ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, - struct obd_statfs *osfs, __u64 max_age, int *from_cache) + struct obd_statfs *osfs, time64_t max_age, int *from_cache) { struct tg_grants_data *tgd = &lut->lut_tgd; int rc = 0; ENTRY; spin_lock(&tgd->tgd_osfs_lock); - if (cfs_time_before_64(tgd->tgd_osfs_age, max_age) || max_age == 0) { + if (tgd->tgd_osfs_age < max_age || max_age == 0) { u64 unstable; /* statfs data are too old, get up-to-date one. 
@@ -308,6 +313,8 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, if (unlikely(rc)) GOTO(out, rc); + osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX); + spin_lock(&tgd->tgd_grant_lock); spin_lock(&tgd->tgd_osfs_lock); /* calculate how much space was written while we released the @@ -337,7 +344,7 @@ int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut, /* finally udpate cached statfs data */ tgd->tgd_osfs = *osfs; - tgd->tgd_osfs_age = cfs_time_current_64(); + tgd->tgd_osfs_age = ktime_get_seconds(); tgd->tgd_statfs_inflight--; /* stop tracking */ if (tgd->tgd_statfs_inflight == 0) @@ -383,13 +390,13 @@ static void tgt_grant_statfs(const struct lu_env *env, struct obd_export *exp, struct tg_grants_data *tgd = &lut->lut_tgd; struct tgt_thread_info *tti; struct obd_statfs *osfs; - __u64 max_age; - int rc; + time64_t max_age; + int rc; if (force) max_age = 0; /* get fresh statfs data */ else - max_age = cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS); + max_age = ktime_get_seconds() - OBD_STATFS_CACHE_SECONDS; tti = tgt_th_info(env); osfs = &tti->tti_u.osfs; @@ -428,6 +435,7 @@ static u64 tgt_grant_space_left(struct obd_export *exp) u64 left; u64 avail; u64 unstable; + u64 reserved; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -438,7 +446,8 @@ static u64 tgt_grant_space_left(struct obd_export *exp) unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */ spin_unlock(&tgd->tgd_osfs_lock); - tot_granted = tgd->tgd_tot_granted; + reserved = left * tgd->tgd_reserved_pcnt / 100; + tot_granted = tgd->tgd_tot_granted + reserved; if (left < tot_granted) { int mask = (left + unstable < @@ -490,8 +499,7 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, struct tg_export_data *ted = &exp->exp_target_data; struct obd_device *obd = exp->exp_obd; struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; - long dirty; - long dropped; + long long dirty, dropped; ENTRY; assert_spin_locked(&tgd->tgd_grant_lock); @@ -515,10 +523,19 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, /* inflate grant counters if required */ if (!exp_grant_param_supp(exp)) { + u64 tmp; oa->o_grant = tgt_grant_inflate(tgd, oa->o_grant); oa->o_dirty = tgt_grant_inflate(tgd, oa->o_dirty); - oa->o_dropped = tgt_grant_inflate(tgd, (u64)oa->o_dropped); - oa->o_undirty = tgt_grant_inflate(tgd, oa->o_undirty); + /* inflation can bump client's wish to >4GB which doesn't fit + * 32bit o_undirty, limit that .. 
*/ + tmp = tgt_grant_inflate(tgd, oa->o_undirty); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_undirty = tmp; + tmp = tgt_grant_inflate(tgd, oa->o_dropped); + if (tmp >= OBD_MAX_GRANT) + tmp = OBD_MAX_GRANT & ~(1ULL << tgd->tgd_blockbits); + oa->o_dropped = tmp; } dirty = oa->o_dirty; @@ -533,13 +550,13 @@ static void tgt_grant_incoming(const struct lu_env *env, struct obd_export *exp, tgd->tgd_tot_dirty += dirty - ted->ted_dirty; if (ted->ted_grant < dropped) { CDEBUG(D_CACHE, - "%s: cli %s/%p reports %lu dropped > grant %lu\n", + "%s: cli %s/%p reports %llu dropped > grant %lu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, ted->ted_grant); dropped = 0; } if (tgd->tgd_tot_granted < dropped) { - CERROR("%s: cli %s/%p reports %lu dropped > tot_grant %llu\n", + CERROR("%s: cli %s/%p reports %llu dropped > tot_grant %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, dropped, tgd->tgd_tot_granted); dropped = 0; @@ -588,6 +605,14 @@ static void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa, grant_shrink = oa->o_grant; + if (ted->ted_grant < grant_shrink) { + CDEBUG(D_CACHE, + "%s: cli %s/%p wants %lu shrinked > grant %lu\n", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + grant_shrink, ted->ted_grant); + grant_shrink = ted->ted_grant; + } + ted->ted_grant -= grant_shrink; tgd->tgd_tot_granted -= grant_shrink; @@ -859,6 +884,7 @@ static void tgt_grant_check(const struct lu_env *env, struct obd_export *exp, * have * \param[in] left remaining free space with granted space taken * out + * \param[in] chunk grant allocation unit * \param[in] conservative if set to true, the server should be cautious * and limit how much space is granted back to the * client. Otherwise, the server should try hard to @@ -877,6 +903,9 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_NO_GRANT)) + RETURN(0); + /* When tgd_grant_compat_disable is set, we don't grant any space to * clients not supporting OBD_CONNECT_GRANT_PARAM. 
* Otherwise, space granted to such a client is inflated since it @@ -928,18 +957,19 @@ static long tgt_grant_alloc(struct obd_export *exp, u64 curgrant, * client would like to have by more than grants for 2 full * RPCs */ + if (want + chunk <= ted->ted_grant) + RETURN(0); if (ted->ted_grant + grant > want + chunk) grant = want + chunk - ted->ted_grant; tgd->tgd_tot_granted += grant; ted->ted_grant += grant; - if (ted->ted_grant < 0) { + if (unlikely(ted->ted_grant < 0 || ted->ted_grant > want + chunk)) { CERROR("%s: cli %s/%p grant %ld want %llu current %llu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, ted->ted_grant, want, curgrant); spin_unlock(&tgd->tgd_grant_lock); - LBUG(); } CDEBUG(D_CACHE, @@ -1053,28 +1083,51 @@ EXPORT_SYMBOL(tgt_grant_connect); void tgt_grant_discard(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; - struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + struct lu_target *lut = class_exp2tgt(exp); struct tg_export_data *ted = &exp->exp_target_data; + struct tg_grants_data *tgd; + + if (!lut) + return; + tgd = &lut->lut_tgd; spin_lock(&tgd->tgd_grant_lock); - LASSERTF(tgd->tgd_tot_granted >= ted->ted_grant, - "%s: tot_granted %llu cli %s/%p ted_grant %ld\n", - obd->obd_name, tgd->tgd_tot_granted, - exp->exp_client_uuid.uuid, exp, ted->ted_grant); - tgd->tgd_tot_granted -= ted->ted_grant; + if (unlikely(tgd->tgd_tot_granted < ted->ted_grant || + tgd->tgd_tot_dirty < ted->ted_dirty)) { + struct obd_export *e; + u64 ttg = 0; + u64 ttd = 0; + + list_for_each_entry(e, &obd->obd_exports, exp_obd_chain) { + LASSERT(exp != e); + ttg += e->exp_target_data.ted_grant; + ttg += e->exp_target_data.ted_pending; + ttd += e->exp_target_data.ted_dirty; + } + if (tgd->tgd_tot_granted < ted->ted_grant) + CERROR("%s: cli %s/%p: tot_granted %llu < ted_grant %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_granted, ted->ted_grant, ttg); + if (tgd->tgd_tot_dirty < ted->ted_dirty) + CERROR("%s: cli %s/%p: tot_dirty %llu < ted_dirty %ld, corrected to %llu", + obd->obd_name, exp->exp_client_uuid.uuid, exp, + tgd->tgd_tot_dirty, ted->ted_dirty, ttd); + tgd->tgd_tot_granted = ttg; + tgd->tgd_tot_dirty = ttd; + } else { + tgd->tgd_tot_granted -= ted->ted_grant; + tgd->tgd_tot_dirty -= ted->ted_dirty; + } ted->ted_grant = 0; - LASSERTF(tgd->tgd_tot_pending >= ted->ted_pending, - "%s: tot_pending %llu cli %s/%p ted_pending %ld\n", - obd->obd_name, tgd->tgd_tot_pending, - exp->exp_client_uuid.uuid, exp, ted->ted_pending); + ted->ted_dirty = 0; + + if (tgd->tgd_tot_pending < ted->ted_pending) { + CERROR("%s: tot_pending %llu < cli %s/%p ted_pending %ld\n", + obd->obd_name, tgd->tgd_tot_pending, + exp->exp_client_uuid.uuid, exp, ted->ted_pending); + } /* tgd_tot_pending is handled in tgt_grant_commit as bulk * commmits */ - LASSERTF(tgd->tgd_tot_dirty >= ted->ted_dirty, - "%s: tot_dirty %llu cli %s/%p ted_dirty %ld\n", - obd->obd_name, tgd->tgd_tot_dirty, - exp->exp_client_uuid.uuid, exp, ted->ted_dirty); - tgd->tgd_tot_dirty -= ted->ted_dirty; - ted->ted_dirty = 0; spin_unlock(&tgd->tgd_grant_lock); } EXPORT_SYMBOL(tgt_grant_discard); @@ -1509,3 +1562,131 @@ int tgt_grant_commit_cb_add(struct thandle *th, struct obd_export *exp, RETURN(rc); } EXPORT_SYMBOL(tgt_grant_commit_cb_add); + +/** + * Show estimate of total amount of dirty data on clients. 
+ * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_dirty_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_dirty); +} +EXPORT_SYMBOL(tot_dirty_show); + +/** + * Show total amount of space granted to clients. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_granted_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_granted); +} +EXPORT_SYMBOL(tot_granted_show); + +/** + * Show total amount of space used by IO in progress. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: 0 on success + * negative value on error + */ +ssize_t tot_pending_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd; + + tgd = &obd->u.obt.obt_lut->lut_tgd; + return scnprintf(buf, PAGE_SIZE, "%llu\n", tgd->tgd_tot_pending); +} +EXPORT_SYMBOL(tot_pending_show); + +/** + * Show if grants compatibility mode is disabled. + * + * When tgd_grant_compat_disable is set, we don't grant any space to clients + * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such + * a client is inflated since it consumes PAGE_SIZE of grant space per + * block, (i.e. typically 4kB units), but underlaying file system might have + * block size bigger than page size, e.g. ZFS. See LU-2049 for details. + * + * @kobj kobject embedded in obd_device + * @attr unused + * @buf buf used by sysfs to print out data + * + * Return: string length of @buf output on success + */ +ssize_t grant_compat_disable_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + + return scnprintf(buf, PAGE_SIZE, "%u\n", tgd->tgd_grant_compat_disable); +} +EXPORT_SYMBOL(grant_compat_disable_show); + +/** + * Change grant compatibility mode. + * + * Setting tgd_grant_compat_disable prohibit any space granting to clients + * not supporting OBD_CONNECT_GRANT_PARAM. See details above. 
+ * + * @kobj kobject embedded in obd_device + * @attr unused + * @buffer string which represents mode + * 1: disable compatibility mode + * 0: enable compatibility mode + * @count @buffer length + * + * Return: @count on success + * negative number on error + */ +ssize_t grant_compat_disable_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd; + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + tgd->tgd_grant_compat_disable = val; + + return count; +} +EXPORT_SYMBOL(grant_compat_disable_store); diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c index d2113af69436b..2ec6d01e60d91 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_handler.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_handler.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, 2016, Intel Corporation. + * Copyright (c) 2013, 2017, Intel Corporation. */ /* * lustre/target/tgt_handler.c @@ -343,10 +343,13 @@ static int tgt_request_preprocess(struct tgt_session_info *tsi, dlm_req = req_capsule_client_get(pill, &RMF_DLM_REQ); if (dlm_req != NULL) { + union ldlm_wire_policy_data *policy = + &dlm_req->lock_desc.l_policy_data; + if (unlikely(dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS && - dlm_req->lock_desc.l_policy_data.\ - l_inodebits.bits == 0)) { + (policy->l_inodebits.bits | + policy->l_inodebits.try_bits) == 0)) { /* * Lock without inodebits makes no sense and * will oops later in ldlm. If client miss to @@ -431,6 +434,20 @@ static int tgt_handle_request0(struct tgt_session_info *tsi, &RMF_ACL, RCL_SERVER, LUSTRE_POSIX_ACL_MAX_SIZE_OLD); + if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER)) { + struct niobuf_remote *remote_nb = + req_capsule_client_get(tsi->tsi_pill, + &RMF_NIOBUF_REMOTE); + struct ost_body *body = tsi->tsi_ost_body; + + req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO, + RCL_SERVER, + (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) ? + remote_nb[0].rnb_len : 0); + } + rc = req_capsule_server_pack(tsi->tsi_pill); } @@ -596,8 +613,14 @@ static struct tgt_handler *tgt_handler_find_check(struct ptlrpc_request *req) /* opcode was not found in slice */ if (unlikely(s->tos_hs == NULL)) { - CERROR("%s: no handlers for opcode 0x%x\n", tgt_name(tgt), - opc); + static bool printed; + + /* don't print error messages for known unhandled RPCs */ + if (opc != OST_FALLOCATE && opc != OST_SEEK && !printed) { + CERROR("%s: no handler for opcode 0x%x from %s\n", + tgt_name(tgt), opc, libcfs_id2str(req->rq_peer)); + printed = true; + } RETURN(ERR_PTR(-ENOTSUPP)); } @@ -645,6 +668,19 @@ static int process_req_last_xid(struct ptlrpc_request *req) RETURN(-EPROTO); } + /* The "last_xid" is the minimum xid among unreplied requests, + * if the request is from the previous connection, its xid can + * still be larger than "exp_last_xid", then the above check of + * xid is not enough to determine whether the request is delayed. 
+ * + * For example, if some replay request was delayed and caused + * timeout at client and the replay is restarted, the delayed + * replay request will have the larger xid than "exp_last_xid" + */ + if (req->rq_export->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg)) + RETURN(-ESTALE); + /* try to release in-memory reply data */ if (tgt_is_multimodrpcs_client(req->rq_export)) { tgt_handle_received_xid(req->rq_export, @@ -671,8 +707,18 @@ int tgt_request_handle(struct ptlrpc_request *req) bool is_connect = false; ENTRY; - /* Refill the context, to make sure all thread keys are allocated */ - lu_env_refill(req->rq_svc_thread->t_env); + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 0 && + lustre_msg_get_opc(msg) != OBD_PING && + lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) { + struct l_wait_info lwi = { 0 }; + + cfs_fail_val = 1; + cfs_race_state = 0; + l_wait_event(cfs_race_waitq, (cfs_race_state == 1), + &lwi); + } + } req_capsule_init(&req->rq_pill, req, RCL_SERVER); tsi->tsi_pill = &req->rq_pill; @@ -836,9 +882,9 @@ EXPORT_SYMBOL(tgt_counter_incr); int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp) { - struct lu_target *tgt = class_exp2tgt(exp); - struct sptlrpc_flavor flvr; - int rc = 0; + struct lu_target *tgt = class_exp2tgt(exp); + struct sptlrpc_flavor flvr; + int rc = 0; LASSERT(tgt); LASSERT(tgt->lut_obd); @@ -863,13 +909,13 @@ int tgt_connect_check_sptlrpc(struct ptlrpc_request *req, struct obd_export *exp exp->exp_sp_peer = req->rq_sp_from; exp->exp_flvr = flvr; - /* when on mgs, if no restriction is set, or if client - * is loopback, allow any flavor */ + /* when on mgs, if no restriction is set, or if the client + * NID is on the local node, allow any flavor + */ if ((strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MGS_NAME) == 0) && (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_NULL || - LNET_NETTYP(LNET_NIDNET(exp->exp_connection->c_peer.nid)) - == LOLND)) + LNetIsPeerLocal(exp->exp_connection->c_peer.nid))) exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY; if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && @@ -949,9 +995,19 @@ int tgt_connect(struct tgt_session_info *tsi) reply = req_capsule_server_get(tsi->tsi_pill, &RMF_CONNECT_DATA); spin_lock(&tsi->tsi_exp->exp_lock); *exp_connect_flags_ptr(tsi->tsi_exp) = reply->ocd_connect_flags; + if (reply->ocd_connect_flags & OBD_CONNECT_FLAGS2) + *exp_connect_flags2_ptr(tsi->tsi_exp) = + reply->ocd_connect_flags2; tsi->tsi_exp->exp_connect_data.ocd_brw_size = reply->ocd_brw_size; spin_unlock(&tsi->tsi_exp->exp_lock); + if (strcmp(tsi->tsi_exp->exp_obd->obd_type->typ_name, + LUSTRE_MDT_NAME) == 0) { + rc = req_check_sepol(tsi->tsi_pill); + if (rc) + GOTO(out, rc); + } + RETURN(0); out: obd_disconnect(class_export_get(tsi->tsi_exp)); @@ -965,6 +1021,8 @@ int tgt_disconnect(struct tgt_session_info *tsi) ENTRY; + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DISCONNECT_DELAY, cfs_fail_val); + rc = target_handle_disconnect(tgt_ses_req(tsi)); if (rc) RETURN(err_serious(rc)); @@ -982,7 +1040,16 @@ int tgt_obd_ping(struct tgt_session_info *tsi) ENTRY; - rc = target_handle_ping(tgt_ses_req(tsi)); + /* The target-specific part of OBD_PING request handling. + * It controls Filter Modification Data (FMD) expiration each time + * PING is received. + * + * Valid only for replayable targets, e.g. 
MDT and OFD + */ + if (tsi->tsi_exp->exp_obd->obd_replayable) + tgt_fmd_expire(tsi->tsi_exp); + + rc = req_capsule_server_pack(tsi->tsi_pill); if (rc) RETURN(err_serious(rc)); @@ -1152,7 +1219,6 @@ static int tgt_obd_idx_read(struct tgt_session_info *tsi) struct tgt_handler tgt_obd_handlers[] = { TGT_OBD_HDL (0, OBD_PING, tgt_obd_ping), -TGT_OBD_HDL_VAR(0, OBD_LOG_CANCEL, tgt_obd_log_cancel), TGT_OBD_HDL (0, OBD_IDX_READ, tgt_obd_idx_read) }; EXPORT_SYMBOL(tgt_obd_handlers); @@ -1216,8 +1282,8 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, if (flag == LDLM_CB_CANCELING && (lock->l_granted_mode & (LCK_EX | LCK_PW | LCK_GROUP)) && - (tgt->lut_sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL || - (tgt->lut_sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL && + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_ALWAYS || + (tgt->lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING && ldlm_is_cbpending(lock))) && ((exp_connect_flags(lock->l_export) & OBD_CONNECT_MDS_MDS) || lock->l_resource->lr_type == LDLM_EXTENT)) { @@ -1226,7 +1292,7 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, rc = lu_env_init(&env, LCT_DT_THREAD); if (unlikely(rc != 0)) - RETURN(rc); + GOTO(err, rc); ost_fid_from_resid(&fid, &lock->l_resource->lr_name, tgt->lut_lsd.lsd_osd_index); @@ -1257,7 +1323,7 @@ static int tgt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, err_env: lu_env_fini(&env); } - +err: rc = ldlm_server_blocking_ast(lock, desc, data, flag); RETURN(rc); } @@ -1329,7 +1395,7 @@ int tgt_cp_callback(struct tgt_session_info *tsi) /* generic LDLM target handler */ struct tgt_handler tgt_dlm_handlers[] = { TGT_DLM_HDL (HABEO_CLAVIS, LDLM_ENQUEUE, tgt_enqueue), -TGT_DLM_HDL_VAR(HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), +TGT_DLM_HDL (HABEO_CLAVIS, LDLM_CONVERT, tgt_convert), TGT_DLM_HDL_VAR(0, LDLM_BL_CALLBACK, tgt_bl_callback), TGT_DLM_HDL_VAR(0, LDLM_CP_CALLBACK, tgt_cp_callback) }; @@ -1350,30 +1416,6 @@ int tgt_llog_open(struct tgt_session_info *tsi) } EXPORT_SYMBOL(tgt_llog_open); -int tgt_llog_close(struct tgt_session_info *tsi) -{ - int rc; - - ENTRY; - - rc = llog_origin_handle_close(tgt_ses_req(tsi)); - - RETURN(rc); -} -EXPORT_SYMBOL(tgt_llog_close); - - -int tgt_llog_destroy(struct tgt_session_info *tsi) -{ - int rc; - - ENTRY; - - rc = llog_origin_handle_destroy(tgt_ses_req(tsi)); - - RETURN(rc); -} - int tgt_llog_read_header(struct tgt_session_info *tsi) { int rc; @@ -1416,8 +1458,6 @@ TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_CREATE, tgt_llog_open), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, tgt_llog_next_block), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_READ_HEADER, tgt_llog_read_header), TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_PREV_BLOCK, tgt_llog_prev_block), -TGT_LLOG_HDL (0, LLOG_ORIGIN_HANDLE_DESTROY, tgt_llog_destroy), -TGT_LLOG_HDL_VAR(0, LLOG_ORIGIN_HANDLE_CLOSE, tgt_llog_close), }; EXPORT_SYMBOL(tgt_llog_handlers); @@ -1567,13 +1607,48 @@ void tgt_io_thread_done(struct ptlrpc_thread *thread) EXIT; } EXPORT_SYMBOL(tgt_io_thread_done); + +/** + * Helper function for getting Data-on-MDT file server DLM lock + * if asked by client. 
+ */ +int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, + struct lustre_handle *lh, int mode, __u64 *flags) +{ + union ldlm_policy_data policy = { + .l_inodebits.bits = MDS_INODELOCK_DOM, + }; + int rc; + + ENTRY; + + LASSERT(lh != NULL); + LASSERT(ns != NULL); + LASSERT(!lustre_handle_is_used(lh)); + + rc = ldlm_cli_enqueue_local(NULL, ns, res_id, LDLM_IBITS, &policy, mode, + flags, ldlm_blocking_ast, + ldlm_completion_ast, ldlm_glimpse_ast, + NULL, 0, LVB_T_NONE, NULL, lh); + + RETURN(rc == ELDLM_OK ? 0 : -EIO); +} +EXPORT_SYMBOL(tgt_mdt_data_lock); + +void tgt_mdt_data_unlock(struct lustre_handle *lh, enum ldlm_mode mode) +{ + LASSERT(lustre_handle_is_used(lh)); + ldlm_lock_decref(lh, mode); +} +EXPORT_SYMBOL(tgt_mdt_data_unlock); + /** * Helper function for getting server side [start, start+count] DLM lock * if asked by client. */ -int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - __u64 start, __u64 end, struct lustre_handle *lh, - int mode, __u64 *flags) +int tgt_extent_lock(const struct lu_env *env, struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, __u64 start, __u64 end, + struct lustre_handle *lh, int mode, __u64 *flags) { union ldlm_policy_data policy; int rc; @@ -1596,8 +1671,8 @@ int tgt_extent_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, else policy.l_extent.end = end | ~PAGE_MASK; - rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_EXTENT, &policy, mode, - flags, ldlm_blocking_ast, + rc = ldlm_cli_enqueue_local(env, ns, res_id, LDLM_EXTENT, &policy, + mode, flags, ldlm_blocking_ast, ldlm_completion_ast, ldlm_glimpse_ast, NULL, 0, LVB_T_NONE, NULL, lh); RETURN(rc == ELDLM_OK ? 0 : -EIO); @@ -1611,13 +1686,16 @@ void tgt_extent_unlock(struct lustre_handle *lh, enum ldlm_mode mode) } EXPORT_SYMBOL(tgt_extent_unlock); -int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, - struct obd_ioobj *obj, struct niobuf_remote *nb, - struct lustre_handle *lh, enum ldlm_mode mode) +static int tgt_brw_lock(const struct lu_env *env, struct obd_export *exp, + struct ldlm_res_id *res_id, struct obd_ioobj *obj, + struct niobuf_remote *nb, struct lustre_handle *lh, + enum ldlm_mode mode) { + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; __u64 flags = 0; int nrbufs = obj->ioo_bufcnt; int i; + int rc; ENTRY; @@ -1634,14 +1712,19 @@ int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK)) RETURN(-EFAULT); - RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset, - nb[nrbufs - 1].rnb_offset + - nb[nrbufs - 1].rnb_len - 1, - lh, mode, &flags)); + /* MDT IO for data-on-mdt */ + if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS) + rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags); + else + rc = tgt_extent_lock(env, ns, res_id, nb[0].rnb_offset, + nb[nrbufs - 1].rnb_offset + + nb[nrbufs - 1].rnb_len - 1, + lh, mode, &flags); + RETURN(rc); } -void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, - struct lustre_handle *lh, enum ldlm_mode mode) +static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, + struct lustre_handle *lh, enum ldlm_mode mode) { ENTRY; @@ -1654,86 +1737,82 @@ void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob, tgt_extent_unlock(lh, mode); EXIT; } - -static __u32 tgt_checksum_bulk(struct lu_target *tgt, - struct ptlrpc_bulk_desc *desc, int opc, - cksum_type_t cksum_type) +static int tgt_checksum_niobuf(struct lu_target *tgt, + struct niobuf_local *local_nb, int 
npages, + int opc, enum cksum_types cksum_type, + __u32 *cksum) { - struct cfs_crypto_hash_desc *hdesc; + struct ahash_request *req; unsigned int bufsize; int i, err; unsigned char cfs_alg = cksum_obd2cfs(cksum_type); - __u32 cksum; - - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(hdesc)) { + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { CERROR("%s: unable to initialize checksum hash %s\n", tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(hdesc); + return PTR_ERR(req); } CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg)); - for (i = 0; i < desc->bd_iov_count; i++) { + for (i = 0; i < npages; i++) { /* corrupt the data before we compute the checksum, to * simulate a client->OST data error */ if (i == 0 && opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { - int off = BD_GET_KIOV(desc, i).kiov_offset & - ~PAGE_MASK; - int len = BD_GET_KIOV(desc, i).kiov_len; + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; struct page *np = tgt_page_to_corrupt; - char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off; if (np) { - char *ptr2 = kmap(np) + off; + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); - memcpy(ptr2, ptr, len); - memcpy(ptr2, "bad3", min(4, len)); - kunmap(np); + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); /* LU-8376 to preserve original index for * display in dump_all_bulk_pages() */ - np->index = BD_GET_KIOV(desc, - i).kiov_page->index; + np->index = i; - BD_GET_KIOV(desc, i).kiov_page = np; + cfs_crypto_hash_update_page(req, np, off, + len); + continue; } else { CERROR("%s: can't alloc page for corruption\n", tgt_name(tgt)); } } - cfs_crypto_hash_update_page(hdesc, - BD_GET_KIOV(desc, i).kiov_page, - BD_GET_KIOV(desc, i).kiov_offset & - ~PAGE_MASK, - BD_GET_KIOV(desc, i).kiov_len); + cfs_crypto_hash_update_page(req, local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); /* corrupt the data after we compute the checksum, to * simulate an OST->client data error */ if (i == 0 && opc == OST_READ && OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) { - int off = BD_GET_KIOV(desc, i).kiov_offset - & ~PAGE_MASK; - int len = BD_GET_KIOV(desc, i).kiov_len; + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; struct page *np = tgt_page_to_corrupt; - char *ptr = - kmap(BD_GET_KIOV(desc, i).kiov_page) + off; if (np) { - char *ptr2 = kmap(np) + off; + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); - memcpy(ptr2, ptr, len); - memcpy(ptr2, "bad4", min(4, len)); - kunmap(np); + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); /* LU-8376 to preserve original index for * display in dump_all_bulk_pages() */ - np->index = BD_GET_KIOV(desc, - i).kiov_page->index; + np->index = i; - BD_GET_KIOV(desc, i).kiov_page = np; + cfs_crypto_hash_update_page(req, np, off, + len); + continue; } else { CERROR("%s: can't alloc page for corruption\n", tgt_name(tgt)); @@ -1741,17 +1820,17 @@ static __u32 tgt_checksum_bulk(struct lu_target *tgt, } } - bufsize = sizeof(cksum); - err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + bufsize = sizeof(*cksum); + err = cfs_crypto_hash_final(req, (unsigned char *)cksum, &bufsize); - return 
cksum; + return 0; } char dbgcksum_file_name[PATH_MAX]; static void dump_all_bulk_pages(struct obdo *oa, int count, - lnet_kiov_t *iov, __u32 server_cksum, - __u32 client_cksum) + struct niobuf_local *local_nb, + __u32 server_cksum, __u32 client_cksum) { struct file *filp; int rc, i; @@ -1768,9 +1847,9 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, - (__u64)iov[0].kiov_page->index << PAGE_SHIFT, - ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) + - iov[count - 1].kiov_len - 1, client_cksum, server_cksum); + local_nb[0].lnb_file_offset, + local_nb[count-1].lnb_file_offset + + local_nb[count-1].lnb_len - 1, client_cksum, server_cksum); filp = filp_open(dbgcksum_file_name, O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600); if (IS_ERR(filp)) { @@ -1786,8 +1865,8 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, } for (i = 0; i < count; i++) { - len = iov[i].kiov_len; - buf = kmap(iov[i].kiov_page); + len = local_nb[i].lnb_len; + buf = kmap(local_nb[i].lnb_page); while (len != 0) { rc = cfs_kernel_write(filp, buf, len, &filp->f_pos); if (rc < 0) { @@ -1800,7 +1879,7 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, CDEBUG(D_INFO, "%s: wrote %d bytes\n", dbgcksum_file_name, rc); } - kunmap(iov[i].kiov_page); + kunmap(local_nb[i].lnb_page); } rc = ll_vfs_fsync_range(filp, 0, LLONG_MAX, 1); @@ -1810,13 +1889,15 @@ static void dump_all_bulk_pages(struct obdo *oa, int count, return; } -static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, - const lnet_process_id_t *peer, +static int check_read_checksum(struct niobuf_local *local_nb, int npages, + struct obd_export *exp, struct obdo *oa, + const struct lnet_process_id *peer, __u32 client_cksum, __u32 server_cksum, - cksum_type_t server_cksum_type) + enum cksum_types server_cksum_type) { char *msg; - cksum_type_t cksum_type; + enum cksum_types cksum_type; + loff_t start, end; /* unlikely to happen and only if resend does not occur due to cksum * control failure on Client */ @@ -1826,13 +1907,12 @@ static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, return 0; } - if (desc->bd_export->exp_obd->obd_checksum_dump) - dump_all_bulk_pages(oa, desc->bd_iov_count, - &BD_GET_KIOV(desc, 0), server_cksum, + if (exp->exp_obd->obd_checksum_dump) + dump_all_bulk_pages(oa, npages, local_nb, server_cksum, client_cksum); - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); + cksum_type = obd_cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); if (cksum_type != server_cksum_type) msg = "the server may have not used the checksum type specified" @@ -1840,24 +1920,237 @@ static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa, else msg = "should have changed on the client or in transit"; + start = local_nb[0].lnb_file_offset; + end = local_nb[npages-1].lnb_file_offset + + local_nb[npages-1].lnb_len - 1; + LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode " DFID " object "DOSTID" extent [%llu-%llu], client returned csum" " %x (type %x), server csum %x (type %x)\n", - desc->bd_export->exp_obd->obd_name, + exp->exp_obd->obd_name, msg, libcfs_nid2str(peer->nid), oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, POSTID(&oa->o_oi), - (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT, - ((__u64)BD_GET_KIOV(desc, - desc->bd_iov_count - 1).kiov_page->index - << PAGE_SHIFT) + - BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1, - client_cksum, cksum_type, server_cksum, server_cksum_type); + start, end, client_cksum, cksum_type, server_cksum, + server_cksum_type); + return 1; } +static int tgt_pages2shortio(struct niobuf_local *local, int npages, + unsigned char *buf, int size) +{ + int i, off, len, copied = size; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + if (len > size) + return -EINVAL; + + ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); + memcpy(buf + off, ptr, len); + ll_kunmap_atomic(ptr, KM_USER0); + buf += len; + size -= len; + } + return copied - size; +} + +static int tgt_checksum_niobuf_t10pi(struct lu_target *tgt, + struct niobuf_local *local_nb, + int npages, int opc, + obd_dif_csum_fn *fn, + int sector_size, + u32 *check_sum) +{ + enum cksum_types t10_cksum_type = tgt->lut_dt_conf.ddp_t10_cksum_type; + unsigned char cfs_alg = cksum_obd2cfs(OBD_CKSUM_T10_TOP); + const char *obd_name = tgt->lut_obd->obd_name; + struct ahash_request *req; + unsigned int bufsize; + unsigned char *buffer; + struct page *__page; + __u16 *guard_start; + int guard_number; + int used_number = 0; + __u32 cksum; + int rc = 0; + int used; + int i; + + __page = alloc_page(GFP_KERNEL); + if (__page == NULL) + return -ENOMEM; + + req = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(req)) { + CERROR("%s: unable to initialize checksum hash %s\n", + tgt_name(tgt), cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(req); + } + + buffer = kmap(__page); + guard_start = (__u16 *)buffer; + guard_number = PAGE_SIZE / sizeof(*guard_start); + for (i = 0; i < npages; i++) { + /* corrupt the data before we compute the checksum, to + * simulate a client->OST data error */ + if (i == 0 && opc == OST_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad3", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + + /* + * The left guard number should be able to hold checksums of a + * whole page + */ + if (t10_cksum_type && opc == OST_READ && + local_nb[i].lnb_guard_disk) { + used = DIV_ROUND_UP(local_nb[i].lnb_len, sector_size); + if (used > (guard_number - used_number)) { + rc = -E2BIG; + break; + } + memcpy(guard_start + used_number, + local_nb[i].lnb_guards, + used * sizeof(*local_nb[i].lnb_guards)); + } else { + rc = obd_page_dif_generate_buffer(obd_name, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len, guard_start + used_number, + guard_number - used_number, &used, sector_size, + fn); + if (rc) + break; + } + + LASSERT(used <= MAX_GUARD_NUMBER); + /* + * If disk support T10PI checksum, copy guards to local_nb. 
+ * If the write is partial page, do not use the guards for bio + * submission since the data might not be full-sector. The bio + * guards will be generated later based on the full sectors. If + * the sector size is 512B rather than 4 KB, or the page size + * is larger than 4KB, this might drop some useful guards for + * partial page write, but it will only add minimal extra time + * of checksum calculation. + */ + if (t10_cksum_type && opc == OST_WRITE && + local_nb[i].lnb_len == PAGE_SIZE) { + local_nb[i].lnb_guard_rpc = 1; + memcpy(local_nb[i].lnb_guards, + guard_start + used_number, + used * sizeof(*local_nb[i].lnb_guards)); + } + + used_number += used; + if (used_number == guard_number) { + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + used_number = 0; + } + + /* corrupt the data after we compute the checksum, to + * simulate an OST->client data error */ + if (unlikely(i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND))) { + int off = local_nb[i].lnb_page_offset & ~PAGE_MASK; + int len = local_nb[i].lnb_len; + struct page *np = tgt_page_to_corrupt; + + if (np) { + char *ptr = ll_kmap_atomic(local_nb[i].lnb_page, + KM_USER0); + char *ptr2 = page_address(np); + + memcpy(ptr2 + off, ptr + off, len); + memcpy(ptr2 + off, "bad4", min(4, len)); + ll_kunmap_atomic(ptr, KM_USER0); + + /* LU-8376 to preserve original index for + * display in dump_all_bulk_pages() */ + np->index = i; + + cfs_crypto_hash_update_page(req, np, off, + len); + continue; + } else { + CERROR("%s: can't alloc page for corruption\n", + tgt_name(tgt)); + } + } + } + kunmap(__page); + if (rc) + GOTO(out, rc); + + if (used_number != 0) + cfs_crypto_hash_update_page(req, __page, 0, + used_number * sizeof(*guard_start)); + + bufsize = sizeof(cksum); + rc = cfs_crypto_hash_final(req, (unsigned char *)&cksum, &bufsize); + + if (rc == 0) + *check_sum = cksum; +out: + __free_page(__page); + return rc; +} + +static int tgt_checksum_niobuf_rw(struct lu_target *tgt, + enum cksum_types cksum_type, + struct niobuf_local *local_nb, + int npages, int opc, u32 *check_sum) +{ + obd_dif_csum_fn *fn = NULL; + int sector_size = 0; + int rc; + + ENTRY; + obd_t10_cksum2dif(cksum_type, &fn, §or_size); + + if (fn) + rc = tgt_checksum_niobuf_t10pi(tgt, local_nb, npages, + opc, fn, sector_size, + check_sum); + else + rc = tgt_checksum_niobuf(tgt, local_nb, npages, opc, + cksum_type, check_sum); + RETURN(rc); +} + int tgt_brw_read(struct tgt_session_info *tsi) { struct ptlrpc_request *req = tgt_ses_req(tsi); @@ -1869,12 +2162,15 @@ int tgt_brw_read(struct tgt_session_info *tsi) struct ost_body *body, *repbody; struct l_wait_info lwi; struct lustre_handle lockh = { 0 }; - int npages, nob = 0, rc, i, no_reply = 0; + int npages, nob = 0, rc, i, no_reply = 0, + npages_read; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + const char *obd_name = exp->exp_obd->obd_name; ENTRY; - if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { CERROR("%s: deny read request from %s to portal %u\n", tgt_name(tsi->tsi_tgt), obd_export_nid2str(req->rq_export), @@ -1917,8 +2213,8 @@ int tgt_brw_read(struct tgt_session_info *tsi) local_nb = tbc->local; - rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, - remote_nb, &lockh, LCK_PR); + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PR); if (rc != 0) RETURN(rc); @@ 
-1936,6 +2232,17 @@ int tgt_brw_read(struct tgt_session_info *tsi) GOTO(out_lock, rc = -ETIMEDOUT); } + /* + * Because we already sync grant info with client when + * reconnect, grant info will be cleared for resent req, + * otherwise, outdated grant count in the rpc would de-sync + * grant counters in case of shrink + */ + if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info"); + body->oa.o_valid &= ~OBD_MD_FLGRANT; + } + repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); repbody->oa = body->oa; @@ -1945,33 +2252,42 @@ int tgt_brw_read(struct tgt_session_info *tsi) if (rc != 0) GOTO(out_lock, rc); - desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), - PTLRPC_BULK_PUT_SOURCE | - PTLRPC_BULK_BUF_KIOV, - OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_nopin_ops); - if (desc == NULL) - GOTO(out_commitrw, rc = -ENOMEM); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_PUT_SOURCE | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(out_commitrw, rc = -ENOMEM); + } nob = 0; + npages_read = npages; for (i = 0; i < npages; i++) { int page_rc = local_nb[i].lnb_rc; if (page_rc < 0) { rc = page_rc; + npages_read = i; break; } nob += page_rc; - if (page_rc != 0) { /* some data! */ + if (page_rc != 0 && desc != NULL) { /* some data! */ LASSERT(local_nb[i].lnb_page != NULL); desc->bd_frag_ops->add_kiov_frag (desc, local_nb[i].lnb_page, - local_nb[i].lnb_page_offset, + local_nb[i].lnb_page_offset & ~PAGE_MASK, page_rc); } if (page_rc != local_nb[i].lnb_len) { /* short read */ + local_nb[i].lnb_len = page_rc; + npages_read = i + (page_rc != 0 ? 1 : 0); /* All subsequent pages should be 0 */ while (++i < npages) LASSERT(local_nb[i].lnb_rc == 0); @@ -1983,14 +2299,19 @@ int tgt_brw_read(struct tgt_session_info *tsi) rc = -E2BIG; if (body->oa.o_valid & OBD_MD_FLCKSUM) { - cksum_type_t cksum_type = - cksum_type_unpack(body->oa.o_valid & OBD_MD_FLFLAGS ? - body->oa.o_flags : 0); + u32 flag = body->oa.o_valid & OBD_MD_FLFLAGS ? 
+ body->oa.o_flags : 0; + enum cksum_types cksum_type = obd_cksum_type_unpack(flag); - repbody->oa.o_flags = cksum_type_pack(cksum_type); + repbody->oa.o_flags = obd_cksum_type_pack(obd_name, + cksum_type); repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, - OST_READ, cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages_read, OST_READ, + &repbody->oa.o_cksum); + if (rc < 0) + GOTO(out_commitrw, rc); CDEBUG(D_PAGE, "checksum at read origin: %x\n", repbody->oa.o_cksum); @@ -1999,21 +2320,46 @@ int tgt_brw_read(struct tgt_session_info *tsi) * zero-cksum case) */ if ((body->oa.o_valid & OBD_MD_FLFLAGS) && (body->oa.o_flags & OBD_FL_RECOV_RESEND)) - check_read_checksum(desc, &body->oa, &req->rq_peer, + check_read_checksum(local_nb, npages_read, exp, + &body->oa, &req->rq_peer, body->oa.o_cksum, repbody->oa.o_cksum, cksum_type); } else { repbody->oa.o_valid = 0; } + if (body->oa.o_valid & OBD_MD_FLGRANT) + repbody->oa.o_valid |= OBD_MD_FLGRANT; /* We're finishing using body->oa as an input variable */ /* Check if client was evicted while we were doing i/o before touching * network */ - if (likely(rc == 0 && - !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) && - !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) { - rc = target_bulk_io(exp, desc, &lwi); + if (rc == 0) { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned char *short_io_buf; + int short_io_size; + + short_io_buf = req_capsule_server_get(&req->rq_pill, + &RMF_SHORT_IO); + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_SERVER); + rc = tgt_pages2shortio(local_nb, npages_read, + short_io_buf, short_io_size); + if (rc >= 0) + req_capsule_shrink(&req->rq_pill, + &RMF_SHORT_IO, rc, + RCL_SERVER); + rc = rc > 0 ? 0 : rc; + } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) { + rc = target_bulk_io(exp, desc, &lwi); + } no_reply = rc != 0; + } else { + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) + req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0, + RCL_SERVER); } out_commitrw: @@ -2036,13 +2382,15 @@ int tgt_brw_read(struct tgt_session_info *tsi) ptlrpc_req_drop_rs(req); LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " "client will retry: rc %d\n", - exp->exp_obd->obd_name, + obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), rc); } /* send a bulk after reply to simulate a network delay or reordering - * by a router */ - if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) { + * by a router - Note that !desc implies short io, so there is no bulk + * to reorder. */ + if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) && + desc) { wait_queue_head_t waitq; struct l_wait_info lwi1; @@ -2059,6 +2407,32 @@ int tgt_brw_read(struct tgt_session_info *tsi) } EXPORT_SYMBOL(tgt_brw_read); +static int tgt_shortio2pages(struct niobuf_local *local, int npages, + unsigned char *buf, unsigned int size) +{ + int i, off, len; + char *ptr; + + for (i = 0; i < npages; i++) { + off = local[i].lnb_page_offset & ~PAGE_MASK; + len = local[i].lnb_len; + + if (len == 0) + continue; + + CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n", + i, off, len, size); + ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0); + if (ptr == NULL) + return -EINVAL; + memcpy(ptr + off, buf, len < size ? 
len : size); + ll_kunmap_atomic(ptr, KM_USER0); + buf += len; + size -= len; + } + return 0; +} + static void tgt_warn_on_cksum(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc, struct niobuf_local *local_nb, int npages, @@ -2073,14 +2447,13 @@ static void tgt_warn_on_cksum(struct ptlrpc_request *req, body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); LASSERT(body != NULL); - if (req->rq_peer.nid != desc->bd_sender) { + if (desc && req->rq_peer.nid != desc->bd_sender) { via = " via "; router = libcfs_nid2str(desc->bd_sender); } if (exp->exp_obd->obd_checksum_dump) - dump_all_bulk_pages(&body->oa, desc->bd_iov_count, - &BD_GET_KIOV(desc, 0), server_cksum, + dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum, client_cksum); if (mmap) { @@ -2121,14 +2494,16 @@ int tgt_brw_write(struct tgt_session_info *tsi) __u32 *rcs; int objcount, niocount, npages; int rc, i, j; - cksum_type_t cksum_type = OBD_CKSUM_CRC32; + enum cksum_types cksum_type = OBD_CKSUM_CRC32; bool no_reply = false, mmap; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; bool wait_sync = false; + const char *obd_name = exp->exp_obd->obd_name; ENTRY; - if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) { + if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL && + ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) { CERROR("%s: deny write request from %s to portal %u\n", tgt_name(tsi->tsi_tgt), obd_export_nid2str(req->rq_export), @@ -2152,6 +2527,9 @@ int tgt_brw_write(struct tgt_session_info *tsi) CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ? cfs_fail_val : (obd_timeout + 1) / 4); + /* Delay write commit to show stale size information */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val); + /* There must be big cache in current thread to process this request * if it is NULL then something went wrong and it wasn't allocated, * report -ENOMEM in that case */ @@ -2192,8 +2570,8 @@ int tgt_brw_write(struct tgt_session_info *tsi) local_nb = tbc->local; - rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo, - remote_nb, &lockh, LCK_PW); + rc = tgt_brw_lock(tsi->tsi_env, exp, &tsi->tsi_resid, ioo, remote_nb, + &lockh, LCK_PW); if (rc != 0) GOTO(out, rc); @@ -2230,26 +2608,46 @@ int tgt_brw_write(struct tgt_session_info *tsi) objcount, ioo, remote_nb, &npages, local_nb); if (rc < 0) GOTO(out_lock, rc); + if (body->oa.o_valid & OBD_MD_FLFLAGS && + body->oa.o_flags & OBD_FL_SHORT_IO) { + unsigned int short_io_size; + unsigned char *short_io_buf; + + short_io_size = req_capsule_get_size(&req->rq_pill, + &RMF_SHORT_IO, + RCL_CLIENT); + short_io_buf = req_capsule_client_get(&req->rq_pill, + &RMF_SHORT_IO); + CDEBUG(D_INFO, "Client use short io for data transfer," + " size = %d\n", short_io_size); + + /* Copy short io buf to pages */ + rc = tgt_shortio2pages(local_nb, npages, short_io_buf, + short_io_size); + desc = NULL; + } else { + desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), + PTLRPC_BULK_GET_SINK | + PTLRPC_BULK_BUF_KIOV, + OST_BULK_PORTAL, + &ptlrpc_bulk_kiov_nopin_ops); + if (desc == NULL) + GOTO(skip_transfer, rc = -ENOMEM); + + /* NB Having prepped, we must commit... 
*/ + for (i = 0; i < npages; i++) + desc->bd_frag_ops->add_kiov_frag(desc, + local_nb[i].lnb_page, + local_nb[i].lnb_page_offset & ~PAGE_MASK, + local_nb[i].lnb_len); + + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(skip_transfer, rc); - desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), - PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV, - OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_nopin_ops); - if (desc == NULL) - GOTO(skip_transfer, rc = -ENOMEM); - - /* NB Having prepped, we must commit... */ - for (i = 0; i < npages; i++) - desc->bd_frag_ops->add_kiov_frag(desc, - local_nb[i].lnb_page, - local_nb[i].lnb_page_offset, - local_nb[i].lnb_len); - - rc = sptlrpc_svc_prep_bulk(req, desc); - if (rc != 0) - GOTO(skip_transfer, rc); + rc = target_bulk_io(exp, desc, &lwi); + } - rc = target_bulk_io(exp, desc, &lwi); no_reply = rc != 0; skip_transfer: @@ -2257,13 +2655,19 @@ int tgt_brw_write(struct tgt_session_info *tsi) static int cksum_counter; if (body->oa.o_valid & OBD_MD_FLFLAGS) - cksum_type = cksum_type_unpack(body->oa.o_flags); + cksum_type = obd_cksum_type_unpack(body->oa.o_flags); repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL; - repbody->oa.o_flags |= cksum_type_pack(cksum_type); - repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc, - OST_WRITE, cksum_type); + repbody->oa.o_flags |= obd_cksum_type_pack(obd_name, + cksum_type); + + rc = tgt_checksum_niobuf_rw(tsi->tsi_tgt, cksum_type, + local_nb, npages, OST_WRITE, + &repbody->oa.o_cksum); + if (rc < 0) + GOTO(out_commitrw, rc); + cksum_counter++; if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) { @@ -2282,6 +2686,7 @@ int tgt_brw_write(struct tgt_session_info *tsi) } } +out_commitrw: /* Must commit after prep above in all cases */ rc = obd_commitrw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo, remote_nb, npages, local_nb, rc); @@ -2337,7 +2742,7 @@ int tgt_brw_write(struct tgt_session_info *tsi) if (!exp->exp_obd->obd_no_transno) LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s)," " client will retry: rc = %d\n", - exp->exp_obd->obd_name, + obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), rc); } diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h index 981e2ab9f9ade..ac7c3c17feb9d 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_internal.h +++ b/drivers/staging/lustrefsx/lustre/target/tgt_internal.h @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. 
*/ /* * lustre/target/tgt_internal.h @@ -35,7 +35,6 @@ #define _TG_INTERNAL_H #include -#include #include #include #include @@ -288,4 +287,19 @@ int top_trans_create_tmt(const struct lu_env *env, void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno); void barrier_init(void); void barrier_fini(void); + +/* FMD tracking data */ +struct tgt_fmd_data { + struct list_head fmd_list; /* linked to tgt_fmd_list */ + struct lu_fid fmd_fid; /* FID being written to */ + __u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */ + time64_t fmd_expire; /* time when the fmd should expire */ + int fmd_refcount; /* reference counter - list holds 1 */ +}; + +/* tgt_fmd.c */ +extern struct kmem_cache *tgt_fmd_kmem; +void tgt_fmd_expire(struct obd_export *exp); +void tgt_fmd_cleanup(struct obd_export *exp); + #endif /* _TG_INTERNAL_H */ diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c index c7aecdf2171ea..0d2fde1be1bc3 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_lastrcvd.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -148,6 +148,13 @@ static int tgt_clear_reply_slot(struct lu_target *lut, int idx) int chunk; int b; + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; b = idx % LUT_REPLY_SLOTS_PER_CHUNK; @@ -388,6 +395,8 @@ int tgt_client_alloc(struct obd_export *exp) spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + spin_lock_init(&exp->exp_target_data.ted_fmd_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); if (exp->exp_target_data.ted_lcd == NULL) @@ -411,6 +420,8 @@ void tgt_client_free(struct obd_export *exp) LASSERT(exp != exp->exp_obd->obd_self_export); + tgt_fmd_cleanup(exp); + /* free reply data */ mutex_lock(&ted->ted_lcd_lock); list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { @@ -833,7 +844,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt) * - there is no client to recover or the recovery was aborted */ if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && - (tgt->lut_obd->obd_max_recoverable_clients == 0 || + (atomic_read(&tgt->lut_obd->obd_max_recoverable_clients) == 0 || tgt->lut_obd->obd_abort_recovery)) tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; @@ -1517,7 +1528,7 @@ static int tgt_clients_data_init(const struct lu_env *env, exp->exp_connecting = 0; exp->exp_in_recovery = 0; spin_unlock(&exp->exp_lock); - obd->obd_max_recoverable_clients++; + atomic_inc(&obd->obd_max_recoverable_clients); if (tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && @@ -1889,7 +1900,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) unsigned long reply_data_size; int rc; struct lsd_reply_header *lrh = NULL; - struct lsd_client_data *lcd = NULL; struct tg_reply_data *trd = NULL; int idx; loff_t off; @@ -1938,10 +1948,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) if (hash == NULL) 
GOTO(out, rc = -ENODEV); - OBD_ALLOC_PTR(lcd); - if (lcd == NULL) - GOTO(out, rc = -ENOMEM); - OBD_ALLOC_PTR(trd); if (trd == NULL) GOTO(out, rc = -ENOMEM); @@ -1993,6 +1999,13 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) /* update export last committed transation */ exp->exp_last_committed = max(exp->exp_last_committed, lrd->lrd_transno); + /* Update lcd_last_transno as well for check in + * tgt_release_reply_data() or the latest client + * transno can be lost. + */ + ted->ted_lcd->lcd_last_transno = + max(ted->ted_lcd->lcd_last_transno, + exp->exp_last_committed); mutex_unlock(&ted->ted_lcd_lock); class_export_put(exp); @@ -2024,8 +2037,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) out: if (hash != NULL) cfs_hash_putref(hash); - if (lcd != NULL) - OBD_FREE_PTR(lcd); if (trd != NULL) OBD_FREE_PTR(trd); if (lrh != NULL) diff --git a/drivers/staging/lustrefsx/lustre/target/tgt_main.c b/drivers/staging/lustrefsx/lustre/target/tgt_main.c index 12f9fdc1c2138..ce158941f9c06 100644 --- a/drivers/staging/lustrefsx/lustre/target/tgt_main.c +++ b/drivers/staging/lustrefsx/lustre/target/tgt_main.c @@ -21,7 +21,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * lustre/target/tgt_main.c @@ -37,6 +37,243 @@ #include "tgt_internal.h" #include "../ptlrpc/ptlrpc_internal.h" +/* This must be longer than the longest string below */ +#define SYNC_STATES_MAXLEN 16 +static char *sync_lock_cancel_states[] = { + [SYNC_LOCK_CANCEL_NEVER] = "never", + [SYNC_LOCK_CANCEL_BLOCKING] = "blocking", + [SYNC_LOCK_CANCEL_ALWAYS] = "always", +}; + +/** + * Show policy for handling dirty data under a lock being cancelled. + * + * \param[in] kobj sysfs kobject + * \param[in] attr sysfs attribute + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + + return sprintf(buf, "%s\n", + sync_lock_cancel_states[tgt->lut_sync_lock_cancel]); +} +EXPORT_SYMBOL(sync_lock_cancel_show); + +/** + * Change policy for handling dirty data under a lock being cancelled. + * + * This variable defines what action target takes upon lock cancel + * There are three possible modes: + * 1) never - never do sync upon lock cancel. This can lead to data + * inconsistencies if both the OST and client crash while writing a file + * that is also concurrently being read by another client. In these cases, + * this may allow the file data to "rewind" to an earlier state. + * 2) blocking - do sync only if there is blocking lock, e.g. 
if another + * client is trying to access this same object + * 3) always - do sync always + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *tgt = obd->u.obt.obt_lut; + int val = -1; + enum tgt_sync_lock_cancel slc; + + if (count == 0 || count >= SYNC_STATES_MAXLEN) + return -EINVAL; + + for (slc = 0; slc < ARRAY_SIZE(sync_lock_cancel_states); slc++) { + if (strcmp(buffer, sync_lock_cancel_states[slc]) == 0) { + val = slc; + break; + } + } + + /* Legacy numeric codes */ + if (val == -1) { + int rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + } + + if (val < 0 || val > 2) + return -EINVAL; + + spin_lock(&tgt->lut_flags_lock); + tgt->lut_sync_lock_cancel = val; + spin_unlock(&tgt->lut_flags_lock); + return count; +} +EXPORT_SYMBOL(sync_lock_cancel_store); +LUSTRE_RW_ATTR(sync_lock_cancel); + +/** + * Show maximum number of Filter Modification Data (FMD) maintained. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%u\n", lut->lut_fmd_max_num); +} + +/** + * Change number of FMDs maintained by target. + * + * This defines how large the list of FMDs can be. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative value on error + */ +ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + int val, rc; + + rc = kstrtoint(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) + return -EINVAL; + + lut->lut_fmd_max_num = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_count); + +/** + * Show the maximum age of FMD data in seconds. + * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * + * \retval 0 and buffer filled with data on success + * \retval negative value on error + */ +ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + + return sprintf(buf, "%lld\n", lut->lut_fmd_max_age); +} + +/** + * Set the maximum age of FMD data in seconds. + * + * This defines how long FMD data stays in the FMD list. 
+ * + * \param[in] kobj kobject + * \param[in] attr attribute to show + * \param[in] buf buffer for data + * \param[in] count buffer size + * + * \retval \a count on success + * \retval negative number on error + */ +ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lu_target *lut = obd->u.obt.obt_lut; + time64_t val; + int rc; + + rc = kstrtoll(buffer, 0, &val); + if (rc) + return rc; + + if (val < 1 || val > 65536) /* ~ 18 hour max */ + return -EINVAL; + + lut->lut_fmd_max_age = val; + + return count; +} +LUSTRE_RW_ATTR(tgt_fmd_seconds); + +/* These two aliases are old names and kept for compatibility, they were + * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'. + * This change was made in Lustre 2.13, so these aliases can be removed + * when back compatibility is not needed with any Lustre version prior 2.13 + */ +static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count, + 0644, tgt_fmd_count_show, tgt_fmd_count_store); +static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds, + 0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store); + +static const struct attribute *tgt_attrs[] = { + &lustre_attr_sync_lock_cancel.attr, + &lustre_attr_tgt_fmd_count.attr, + &lustre_attr_tgt_fmd_seconds.attr, + &tgt_fmd_count_compat.attr, + &tgt_fmd_seconds_compat.attr, + NULL, +}; + +int tgt_tunables_init(struct lu_target *lut) +{ + int rc; + + rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs); + if (!rc) + lut->lut_attrs = tgt_attrs; + return rc; +} +EXPORT_SYMBOL(tgt_tunables_init); + +void tgt_tunables_fini(struct lu_target *lut) +{ + if (lut->lut_attrs) { + sysfs_remove_files(&lut->lut_obd->obd_kset.kobj, + lut->lut_attrs); + lut->lut_attrs = NULL; + } +} +EXPORT_SYMBOL(tgt_tunables_fini); + /* * Save cross-MDT lock in lut_slc_locks. 
* @@ -152,6 +389,8 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, struct lu_attr attr; struct lu_fid fid; struct dt_object *o; + struct tg_grants_data *tgd = &lut->lut_tgd; + struct obd_statfs *osfs; int i, rc = 0; ENTRY; @@ -179,7 +418,7 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset); spin_lock_init(&lut->lut_flags_lock); - lut->lut_sync_lock_cancel = NEVER_SYNC_ON_CANCEL; + lut->lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER; spin_lock_init(&lut->lut_slc_locks_guard); INIT_LIST_HEAD(&lut->lut_slc_locks); @@ -188,6 +427,38 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (!obd->obd_replayable) RETURN(0); + /* initialize grant and statfs data in target */ + dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf); + + /* statfs data */ + spin_lock_init(&tgd->tgd_osfs_lock); + tgd->tgd_osfs_age = ktime_get_seconds() - 1000; + tgd->tgd_osfs_unstable = 0; + tgd->tgd_statfs_inflight = 0; + tgd->tgd_osfs_inflight = 0; + + /* grant data */ + spin_lock_init(&tgd->tgd_grant_lock); + tgd->tgd_tot_dirty = 0; + tgd->tgd_tot_granted = 0; + tgd->tgd_tot_pending = 0; + tgd->tgd_grant_compat_disable = 0; + + /* populate cached statfs data */ + osfs = &tgt_th_info(env)->tti_u.osfs; + rc = tgt_statfs_internal(env, lut, osfs, 0, NULL); + if (rc != 0) { + CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut), + rc); + GOTO(out, rc); + } + if (!is_power_of_2(osfs->os_bsize)) { + CERROR("%s: blocksize (%d) is not a power of 2\n", + tgt_name(lut), osfs->os_bsize); + GOTO(out, rc = -EPROTO); + } + tgd->tgd_blockbits = fls(osfs->os_bsize) - 1; + spin_lock_init(&lut->lut_translock); spin_lock_init(&lut->lut_client_bitmap_lock); @@ -225,6 +496,11 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb); lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut; + lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT; + lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT; + + atomic_set(&lut->lut_sync_count, 0); + /* reply_data is supported by MDT targets only for now */ if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0) RETURN(0); @@ -254,8 +530,6 @@ int tgt_init(const struct lu_env *env, struct lu_target *lut, if (rc < 0) GOTO(out, rc); - atomic_set(&lut->lut_sync_count, 0); - RETURN(0); out: @@ -337,8 +611,44 @@ void tgt_fini(const struct lu_env *env, struct lu_target *lut) } EXPORT_SYMBOL(tgt_fini); +static struct kmem_cache *tgt_thread_kmem; +static struct kmem_cache *tgt_session_kmem; +struct kmem_cache *tgt_fmd_kmem; + +static struct lu_kmem_descr tgt_caches[] = { + { + .ckd_cache = &tgt_thread_kmem, + .ckd_name = "tgt_thread_kmem", + .ckd_size = sizeof(struct tgt_thread_info), + }, + { + .ckd_cache = &tgt_session_kmem, + .ckd_name = "tgt_session_kmem", + .ckd_size = sizeof(struct tgt_session_info) + }, + { + .ckd_cache = &tgt_fmd_kmem, + .ckd_name = "tgt_fmd_cache", + .ckd_size = sizeof(struct tgt_fmd_data) + }, + { + .ckd_cache = NULL + } +}; + + /* context key constructor/destructor: tg_key_init, tg_key_fini */ -LU_KEY_INIT(tgt, struct tgt_thread_info); +static void *tgt_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_thread_info *thread; + + OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS); + if (thread == NULL) + return ERR_PTR(-ENOMEM); + + return thread; +} static void tgt_key_fini(const struct lu_context *ctx, struct lu_context_key *key, void *data) @@ -355,7 +665,7 @@ static void tgt_key_fini(const 
struct lu_context *ctx, if (args->ta_args != NULL) OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) * args->ta_alloc_args); - OBD_FREE_PTR(info); + OBD_SLAB_FREE_PTR(info, tgt_thread_kmem); } static void tgt_key_exit(const struct lu_context *ctx, @@ -377,8 +687,25 @@ struct lu_context_key tgt_thread_key = { LU_KEY_INIT_GENERIC(tgt); -/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */ -LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info); +static void *tgt_ses_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct tgt_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS); + if (session == NULL) + return ERR_PTR(-ENOMEM); + + return session; +} + +static void tgt_ses_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct tgt_session_info *session = data; + + OBD_SLAB_FREE_PTR(session, tgt_session_kmem); +} /* context key: tgt_session_key */ struct lu_context_key tgt_session_key = { @@ -401,8 +728,13 @@ struct page *tgt_page_to_corrupt; int tgt_mod_init(void) { + int result; ENTRY; + result = lu_kmem_init(tgt_caches); + if (result != 0) + RETURN(result); + tgt_page_to_corrupt = alloc_page(GFP_KERNEL); tgt_key_init_generic(&tgt_thread_key, NULL); @@ -426,5 +758,7 @@ void tgt_mod_exit(void) lu_context_key_degister(&tgt_thread_key); lu_context_key_degister(&tgt_session_key); update_info_fini(); + + lu_kmem_fini(tgt_caches); } diff --git a/drivers/staging/lustrefsx/lustre/target/update_records.c b/drivers/staging/lustrefsx/lustre/target/update_records.c index a36d554525507..5fb706c5090a5 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_records.c +++ b/drivers/staging/lustrefsx/lustre/target/update_records.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* diff --git a/drivers/staging/lustrefsx/lustre/target/update_recovery.c b/drivers/staging/lustrefsx/lustre/target/update_recovery.c index 3769d09d19282..ac47105a633b9 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_recovery.c +++ b/drivers/staging/lustrefsx/lustre/target/update_recovery.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2016, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* diff --git a/drivers/staging/lustrefsx/lustre/target/update_trans.c b/drivers/staging/lustrefsx/lustre/target/update_trans.c index 6c3e41438347c..b8150fa5c694c 100644 --- a/drivers/staging/lustrefsx/lustre/target/update_trans.c +++ b/drivers/staging/lustrefsx/lustre/target/update_trans.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2015, 2016, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. 
*/ /* * lustre/target/update_trans.c @@ -82,9 +82,11 @@ static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { struct sub_thandle_cookie *stc; - CDEBUG(mask, "st %p obd %s committed %d stopped %d sub_th %p\n", + CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d " + "result %d sub_th %p\n", st, st->st_dt->dd_lu_dev.ld_obd->obd_name, - st->st_committed, st->st_stopped, st->st_sub_th); + st->st_committed, st->st_started, st->st_stopped, + st->st_result, st->st_sub_th); list_for_each_entry(stc, &st->st_cookie_list, stc_list) { CDEBUG(mask, " cookie "DFID".%u\n", @@ -526,6 +528,7 @@ static void sub_trans_stop_cb(struct lu_env *env, struct top_multiple_thandle *tmt = cb->dcb_data; ENTRY; + spin_lock(&tmt->tmt_sub_lock); list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { if (st->st_stopped) continue; @@ -536,6 +539,7 @@ static void sub_trans_stop_cb(struct lu_env *env, break; } } + spin_unlock(&tmt->tmt_sub_lock); wake_up(&tmt->tmt_stop_waitq); RETURN_EXIT; @@ -1016,6 +1020,8 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, sub_trans_commit_cb_internal(tmt, master_st->st_sub_th, rc); if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); th->th_result = rc; GOTO(stop_other_trans, rc); } else if (tur != NULL && tur->tur_update_records != NULL) { @@ -1053,6 +1059,9 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, rc = sub_updates_write(env, lur, st); if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, + rc); th->th_result = rc; break; } @@ -1072,8 +1081,12 @@ int top_trans_stop(const struct lu_env *env, struct dt_device *master_dev, st->st_sub_th->th_result = th->th_result; rc = dt_trans_stop(env, st->st_sub_th->th_dev, st->st_sub_th); - if (unlikely(rc < 0 && th->th_result == 0)) - th->th_result = rc; + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, rc); + if (th->th_result == 0) + th->th_result = rc; + } } rc = top_trans_wait_result(top_th); diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index 4d27f6dfa46fb..003391836fc68 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -23,15 +23,9 @@ /* extened attributes for ldiskfs */ #undef CONFIG_LDISKFS_FS_XATTR -/* Max LNET payload */ -#undef CONFIG_LNET_MAX_PAYLOAD - /* enable invariant checking */ #undef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK -/* IOCTL Buffer Size */ -#undef CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER - /* kernel has cpu affinity support */ #undef CPU_AFFINITY @@ -56,9 +50,15 @@ /* do data checksums */ #undef ENABLE_CHECKSUM +/* enable flock by default */ +#undef ENABLE_FLOCK + /* Use the Pinger */ #undef ENABLE_PINGER +/* aes-sha2 is supported by krb5 */ +#undef HAVE_AES_SHA2_SUPPORT + /* Define to 1 if you have the header file. 
*/ #undef HAVE_ASM_TYPES_H @@ -77,6 +77,12 @@ /* 'bio_integrity_enabled' is available */ #undef HAVE_BIO_INTEGRITY_ENABLED +/* kernel has bio_integrity_prep_fn */ +#undef HAVE_BIO_INTEGRITY_PREP_FN + +/* bio_integrity_payload.bip_iter exist */ +#undef HAVE_BIP_ITER_BIO_INTEGRITY_PAYLOAD + /* 'bi_bdev' is available */ #undef HAVE_BI_BDEV @@ -101,9 +107,18 @@ /* blk_queue_max_segments is defined */ #undef HAVE_BLK_QUEUE_MAX_SEGMENTS +/* kernel hash_64() is broken */ +#undef HAVE_BROKEN_HASH_64 + /* kernel has struct bvec_iter */ #undef HAVE_BVEC_ITER +/* struct cache_detail has writers */ +#undef HAVE_CACHE_DETAIL_WRITERS + +/* if cache_detail->hash_lock is a spinlock */ +#undef HAVE_CACHE_HASH_SPINLOCK + /* cache_head has hlist cache_list */ #undef HAVE_CACHE_HEAD_HLIST @@ -116,24 +131,27 @@ /* kernel has clean_bdev_aliases */ #undef HAVE_CLEAN_BDEV_ALIASES +/* 'clear_and_wake_up_bit' is available */ +#undef HAVE_CLEAR_AND_WAKE_UP_BIT + /* have clear_inode */ #undef HAVE_CLEAR_INODE /* compat rdma found */ #undef HAVE_COMPAT_RDMA -/* cpumap_print_to_pagebuf is available */ -#undef HAVE_CPUMASK_PRINT_TO_PAGEBUF +/* 'cpu_read_lock' exist */ +#undef HAVE_CPUS_READ_LOCK /* kernel compiled with CRC32 functions */ #undef HAVE_CRC32 -/* struct cred has member tgcred */ -#undef HAVE_CRED_TGCRED - /* crypto hash helper functions are available */ #undef HAVE_CRYPTO_HASH_HELPERS +/* 'CRYPTO_MAX_ALG_NAME' is 128 */ +#undef HAVE_CRYPTO_MAX_ALG_NAME_128 + /* current_time() has replaced CURRENT_TIME */ #undef HAVE_CURRENT_TIME @@ -152,6 +170,9 @@ /* dentry_open uses struct path as first argument */ #undef HAVE_DENTRY_OPEN_USE_PATH +/* DES3 enctype is supported by krb5 */ +#undef HAVE_DES3_SUPPORT + /* direct_IO need 2 arguments */ #undef HAVE_DIRECTIO_2ARGS @@ -233,6 +254,9 @@ /* d_delete first parameter declared is not const */ #undef HAVE_D_DELETE_CONST +/* d_hash_and_lookup is exported by the kernel */ +#undef HAVE_D_HASH_AND_LOOKUP + /* have d_make_root */ #undef HAVE_D_MAKE_ROOT @@ -320,15 +344,18 @@ /* Define to 1 if you have the `gethostbyname' function. */ #undef HAVE_GETHOSTBYNAME +/* 'get_acl' has a rcu argument */ +#undef HAVE_GET_ACL_RCU_ARG + +/* get_request_key_auth() is available */ +#undef HAVE_GET_REQUEST_KEY_AUTH + /* get_user_pages takes 6 arguments */ #undef HAVE_GET_USER_PAGES_6ARG /* get_user_pages takes gup_flags in arguments */ #undef HAVE_GET_USER_PAGES_GUP_FLAGS -/* get_user_pages takes gup_flags in arguments with 7 args */ -#undef HAVE_GET_USER_PAGES_GUP_FLAGS_7ARGS - /* struct group_info has member gid */ #undef HAVE_GROUP_INFO_GID @@ -341,6 +368,9 @@ /* Define this if the Kerberos GSS library supports gss_krb5_ccache_name */ #undef HAVE_GSS_KRB5_CCACHE_NAME +/* '__rhashtable_insert_fast()' returns int */ +#undef HAVE_HASHTABLE_INSERT_FAST_RETURN_INT + /* Define this if you have Heimdal Kerberos libraries */ #undef HAVE_HEIMDAL @@ -389,6 +419,9 @@ /* if ib_sg_dma_address wrapper exists */ #undef HAVE_IB_SG_DMA_ADDRESS +/* INIT_LIST_HEAD_RCU exists */ +#undef HAVE_INIT_LIST_HEAD_RCU + /* inode_operations .getattr member function can gather advance stats */ #undef HAVE_INODEOPS_ENHANCED_GETATTR @@ -413,6 +446,15 @@ /* inode_operations->permission has two args */ #undef HAVE_INODE_PERMISION_2ARGS +/* inode times are using timespec64 */ +#undef HAVE_INODE_TIMESPEC64 + +/* blk_integrity.interval exist */ +#undef HAVE_INTERVAL_BLK_INTEGRITY + +/* blk_integrity.interval_exp exist */ +#undef HAVE_INTERVAL_EXP_BLK_INTEGRITY + /* Define to 1 if you have the header file. 
*/ #undef HAVE_INTTYPES_H @@ -422,6 +464,9 @@ /* have in_compat_syscall */ #undef HAVE_IN_COMPAT_SYSCALL +/* 'in_dev_for_each_ifa_rtnl' is defined */ +#undef HAVE_IN_DEV_FOR_EACH_IFA_RTNL + /* inode_operations->rename need flags as argument */ #undef HAVE_IOPS_RENAME_WITH_FLAGS @@ -443,7 +488,7 @@ /* inode_operations has {get,set,remove}xattr members */ #undef HAVE_IOP_XATTR -/* if iov_iter has member type */ +/* if iov_iter has member iter_type */ #undef HAVE_IOV_ITER_HAS_TYPE_MEMBER /* iov_iter_init handles directional tag */ @@ -461,18 +506,27 @@ /* is_sxid is defined */ #undef HAVE_IS_SXID +/* 'iterate_shared' is available */ +#undef HAVE_ITERATE_SHARED + /* struct address_space has i_pages */ #undef HAVE_I_PAGES /* i_uid_read is present */ #undef HAVE_I_UID_READ -/* jiffies_to_timespec64() is available */ -#undef HAVE_JIFFIES_TO_TIMESPEC64 +/* kallsyms_lookup_name is exported by kernel */ +#undef HAVE_KALLSYMS_LOOKUP_NAME /* kernel_locked is defined */ #undef HAVE_KERNEL_LOCKED +/* 'kernel_param_[un]lock' is available */ +#undef HAVE_KERNEL_PARAM_LOCK + +/* 'struct kernel_param_ops' is available */ +#undef HAVE_KERNEL_PARAM_OPS + /* kernel_setsockopt still in use */ #undef HAVE_KERNEL_SETSOCKOPT @@ -491,6 +545,9 @@ /* key_type->instantiate has two args */ #undef HAVE_KEY_TYPE_INSTANTIATE_2ARGS +/* key.usage is of type refcount_t */ +#undef HAVE_KEY_USAGE_REFCOUNT + /* ki_left exist */ #undef HAVE_KIOCB_KI_LEFT @@ -519,12 +576,15 @@ available */ #undef HAVE_KRB5_GET_INIT_CREDS_OPT_SET_ADDRESSLESS +/* kset_find_obj is exported by the kernel */ +#undef HAVE_KSET_FIND_OBJ + +/* kernel has kstrtobool_from_user */ +#undef HAVE_KSTRTOBOOL_FROM_USER + /* kernel has kstrtoul */ #undef HAVE_KSTRTOUL -/* kernel has ksys_close */ -#undef HAVE_KSYS_CLOSE - /* kthread_worker found */ #undef HAVE_KTHREAD_WORK @@ -552,6 +612,9 @@ /* 'ktime_get_ts64' is available */ #undef HAVE_KTIME_GET_TS64 +/* 'ktime_ms_delta' is available */ +#undef HAVE_KTIME_MS_DELTA + /* 'ktime_to_timespec64' is available */ #undef HAVE_KTIME_TO_TIMESPEC64 @@ -579,20 +642,14 @@ /* readline library is available */ #undef HAVE_LIBREADLINE -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_RANDOM_H +/* linux/rhashtable.h is present */ +#undef HAVE_LINUX_RHASHTABLE_H /* if linux/selinux.h exists */ #undef HAVE_LINUX_SELINUX_IS_ENABLED -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_TYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_UNISTD_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_LINUX_VERSION_H +/* linux/stdarg.h is present */ +#undef HAVE_LINUX_STDARG_HEADER /* lock_manager_operations has lm_compare_owner */ #undef HAVE_LM_COMPARE_OWNER @@ -603,6 +660,9 @@ /* kernel has locks_lock_file_wait */ #undef HAVE_LOCKS_LOCK_FILE_WAIT +/* lookup_user_key() is available */ +#undef HAVE_LOOKUP_USER_KEY + /* kernel has LOOP_CTL_GET_FREE */ #undef HAVE_LOOP_CTL_GET_FREE @@ -631,6 +691,9 @@ /* kernel module loading is possible */ #undef HAVE_MODULE_LOADING_SUPPORT +/* locking module param is supported */ +#undef HAVE_MODULE_PARAM_LOCKING + /* Define to 1 if you have the `name_to_handle_at' function. */ #undef HAVE_NAME_TO_HANDLE_AT @@ -640,21 +703,36 @@ /* cancel_dirty_page with one arguement is available */ #undef HAVE_NEW_CANCEL_DIRTY_PAGE +/* DEFINE_TIMER uses only 2 arguements */ +#undef HAVE_NEW_DEFINE_TIMER + /* 'kernel_write' aligns with read/write helpers */ #undef HAVE_NEW_KERNEL_WRITE /* NR_UNSTABLE_NFS is still in use. 
*/ #undef HAVE_NR_UNSTABLE_NFS +/* ns_to_timespec64() is available */ +#undef HAVE_NS_TO_TIMESPEC64 + /* with oldsize */ #undef HAVE_OLDSIZE_TRUNCATE_PAGECACHE +/* openssl-devel is present */ +#undef HAVE_OPENSSL_GETSEPOL + /* OpenSSL HMAC functions needed for SSK */ #undef HAVE_OPENSSL_SSK /* 'pagevec_init' takes one parameter */ #undef HAVE_PAGEVEC_INIT_ONE_PARAM +/* linux/panic_notifier.h is present */ +#undef HAVE_PANIC_NOTIFIER_H + +/* 'param_set_uint_minmax' is available */ +#undef HAVE_PARAM_SET_UINT_MINMAX + /* have PCLMULQDQ instruction */ #undef HAVE_PCLMULQDQ @@ -673,6 +751,9 @@ /* posix_acl_valid takes struct user_namespace */ #undef HAVE_POSIX_ACL_VALID_USER_NS +/* 'prepare_to_wait_event' is available */ +#undef HAVE_PREPARE_TO_WAIT_EVENT + /* struct proc_ops exists */ #undef HAVE_PROC_OPS @@ -685,12 +766,18 @@ /* inode->i_nlink is protected from direct modification */ #undef HAVE_PROTECT_I_NLINK +/* 'PTR_ERR_OR_ZERO' exist */ +#undef HAVE_PTR_ERR_OR_ZERO + /* have quota64 */ #undef HAVE_QUOTA64 /* radix_tree_exceptional_entry exist */ #undef HAVE_RADIX_EXCEPTION_ENTRY +/* rdma_connect_locked is defined */ +#undef HAVE_RDMA_CONNECT_LOCKED + /* rdma_create_id wants 4 args */ #undef HAVE_RDMA_CREATE_ID_4ARG @@ -700,15 +787,24 @@ /* rdma_reject has 4 arguments */ #undef HAVE_RDMA_REJECT_4ARGS -/* reinit_completion is exist */ -#undef HAVE_REINIT_COMPLETION - /* kernel export remove_from_page_cache */ #undef HAVE_REMOVE_FROM_PAGE_CACHE /* remove_proc_subtree is defined */ #undef HAVE_REMOVE_PROC_SUBTREE +/* rhashtable_lookup() is available */ +#undef HAVE_RHASHTABLE_LOOKUP + +/* rhashtable_lookup_get_insert_fast() is available */ +#undef HAVE_RHASHTABLE_LOOKUP_GET_INSERT_FAST + +/* struct rhltable exist */ +#undef HAVE_RHLTABLE + +/* save_stack_trace_tsk is exported */ +#undef HAVE_SAVE_STACK_TRACE_TSK + /* Have sa_spill_alloc in ZFS */ #undef HAVE_SA_SPILL_ALLOC @@ -733,6 +829,9 @@ /* security_inode_init_security takes a 'struct qstr' parameter */ #undef HAVE_SECURITY_IINITSEC_QSTR +/* security_inode_listsecurity() is available/exported */ +#undef HAVE_SECURITY_INODE_LISTSECURITY + /* security_release_secctx has 1 arg. */ #undef HAVE_SEC_RELEASE_SECCTX_1ARG @@ -776,36 +875,27 @@ /* Have spa_maxblocksize in ZFS */ #undef HAVE_SPA_MAXBLOCKSIZE -/* spinlock_t is defined */ -#undef HAVE_SPINLOCK_T - /* struct stacktrace_ops exists */ #undef HAVE_STACKTRACE_OPS /* stacktrace_ops.warning is exist */ #undef HAVE_STACKTRACE_WARNING -/* stack_trace_print() exists */ -#undef HAVE_STACK_TRACE_PRINT - /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the header file. */ #undef HAVE_STDLIB_H +/* stringhash.h is present */ +#undef HAVE_STRINGHASH + /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H -/* Define to 1 if you have the `strlcat' function. */ -#undef HAVE_STRLCAT - -/* Define to 1 if you have the `strlcpy' function. */ -#undef HAVE_STRLCPY - /* Define to 1 if you have the `strnlen' function. */ #undef HAVE_STRNLEN @@ -833,9 +923,6 @@ /* ctl_table has ctl_name field */ #undef HAVE_SYSCTL_CTLNAME -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_IOCTL_H - /* Define to 1 if you have . */ #undef HAVE_SYS_QUOTA_H @@ -845,6 +932,9 @@ /* Define to 1 if you have the header file. 
*/ #undef HAVE_SYS_TYPES_H +/* task_is_running() is defined */ +#undef HAVE_TASK_IS_RUNNING + /* tcp_sendpage use socket as first parameter */ #undef HAVE_TCP_SENDPAGE_USE_SOCKET @@ -866,9 +956,6 @@ /* 'timespec64_to_ktime' is available */ #undef HAVE_TIMESPEC64_TO_KTIME -/* have_time_t */ -#undef HAVE_TIME_T - /* topology_sibling_cpumask is available */ #undef HAVE_TOPOLOGY_SIBLING_CPUMASK @@ -920,9 +1007,18 @@ /* 'struct vm_operations' remove struct vm_area_struct argument */ #undef HAVE_VM_OPS_USE_VM_FAULT_ONLY +/* wait_bit.h is present */ +#undef HAVE_WAIT_BIT_HEADER_H + /* 'wait_queue_entry_t' is available */ #undef HAVE_WAIT_QUEUE_ENTRY +/* linux wait_queue_head_t list_head is name head */ +#undef HAVE_WAIT_QUEUE_ENTRY_LIST + +/* 'wait_var_event' is available */ +#undef HAVE_WAIT_VAR_EVENT + /* flags field exist */ #undef HAVE_XATTR_HANDLER_FLAGS @@ -947,9 +1043,18 @@ /* Have zap_remove_by_dnode() in ZFS */ #undef HAVE_ZAP_REMOVE_ADD_BY_DNODE +/* Have inode_timespec_t */ +#undef HAVE_ZFS_INODE_TIMESPEC + +/* Have multihost protection in ZFS */ +#undef HAVE_ZFS_MULTIHOST + /* Enable zfs osd */ #undef HAVE_ZFS_OSD +/* Have zfs_refcount_add */ +#undef HAVE_ZFS_REFCOUNT_ADD + /* __add_wait_queue_exclusive exists */ #undef HAVE___ADD_WAIT_QUEUE_EXCLUSIVE @@ -999,6 +1104,9 @@ /* need pclmulqdq based crc32 */ #undef NEED_CRC32_ACCEL +/* 'ktime_get_ns' is not available */ +#undef NEED_KTIME_GET_NS + /* 'ktime_get_real_ns' is not available */ #undef NEED_KTIME_GET_REAL_NS @@ -1029,9 +1137,6 @@ /* name of parallel fsck program */ #undef PFSCK -/* proc handler methods use __user */ -#undef PROC_HANDLER_USE_USER_ATTR - /* enable randomly alloc failure */ #undef RANDOM_FAIL_ALLOC From d67b15edc4dfd7b197c2acf293108358c62ea78b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 19:11:49 +0000 Subject: [PATCH 495/737] mm/damon/dbgfs: fix memory leak when using debugfs_lookup() When calling debugfs_lookup() the result must have dput() called on it, otherwise the memory will leak over time. Fix this up by properly calling dput(). 
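For context, the rule being enforced here is that a successful debugfs_lookup() takes a reference on the returned dentry, and the caller must drop it with dput() on every exit path. A minimal sketch of that pairing, with an illustrative directory name and caller that are not part of this patch:

	/* Minimal sketch: pair every successful debugfs_lookup() with dput(). */
	#include <linux/debugfs.h>
	#include <linux/dcache.h>

	static int lookup_and_use(struct dentry *root, const char *name)
	{
		struct dentry *dir;

		dir = debugfs_lookup(name, root);	/* takes a reference */
		if (!dir)
			return -ENOENT;

		/* ... use dir ... */

		dput(dir);	/* drop the reference on every exit path */
		return 0;
	}

In dbgfs_rm_context() below, the error paths are reworked into 'goto out_dput' style labels so the reference is dropped no matter which allocation fails.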
Link: https://lkml.kernel.org/r/20220902191149.112434-1-sj@kernel.org Fixes: 75c1c2b53c78b ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: Greg Kroah-Hartman Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 53ba8b1e619ca..89075fa4e8a9a 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -853,6 +853,7 @@ static int dbgfs_rm_context(char *name) struct dentry *root, *dir, **new_dirs; struct damon_ctx **new_ctxs; int i, j; + int ret = 0; if (damon_nr_running_ctxs()) return -EBUSY; @@ -867,14 +868,16 @@ static int dbgfs_rm_context(char *name) new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; + if (!new_dirs) { + ret = -ENOMEM; + goto out_dput; + } new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), GFP_KERNEL); if (!new_ctxs) { - kfree(new_dirs); - return -ENOMEM; + ret = -ENOMEM; + goto out_new_dirs; } for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { @@ -894,7 +897,13 @@ static int dbgfs_rm_context(char *name) dbgfs_ctxs = new_ctxs; dbgfs_nr_ctxs--; - return 0; + goto out_dput; + +out_new_dirs: + kfree(new_dirs); +out_dput: + dput(dir); + return ret; } static ssize_t dbgfs_rm_context_write(struct file *file, From 766fd6c7dd5db201009752f48b85fcf7df3a4946 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Mon, 26 Sep 2022 16:06:11 +0000 Subject: [PATCH 496/737] damon/sysfs: fix possible memleak on damon_sysfs_add_target When damon_sysfs_add_target couldn't find proper task, New allocated damon_target structure isn't registered yet, So, it's impossible to free new allocated one by damon_sysfs_destroy_targets. By calling damon_add_target as soon as allocating new target, Fix this possible memory leak. Link: https://lkml.kernel.org/r/20220926160611.48536-1-sj@kernel.org Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring") Signed-off-by: Levi Yun Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: [5.17.x] Signed-off-by: Andrew Morton Signed-off-by: SeongJae Park --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 09f9e8ca3d1fa..5b5ee3308d71b 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2181,13 +2181,13 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; + damon_add_target(ctx, t); if (ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; } - damon_add_target(ctx, t); err = damon_sysfs_set_regions(t, sys_target->regions); if (err) goto destroy_targets_out; From 60ee0b0f8afa443e6e4b945c52cd09e3591c6a53 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:39 +0000 Subject: [PATCH 497/737] bpf: Allow LSM programs to use bpf spin locks commit 9e7a4d9831e836eb03dedab89902277ee94eb7a6 upstream. Usage of spin locks was not allowed for tracing programs due to insufficient preemption checks. The verifier does not currently prevent LSM programs from using spin locks, but the helpers are not exposed via bpf_lsm_func_proto. Based on the discussion in [1], non-sleepable LSM programs should be able to use bpf_spin_{lock, unlock}. 
Sleepable LSM programs can be preempted which means that allowng spin locks will need more work (disabling preemption and the verifier ensuring that no sleepable helpers are called when a spin lock is held). [1]: https://lore.kernel.org/bpf/20201103153132.2717326-1-kpsingh@chromium.org/T/#md601a053229287659071600d3483523f752cd2fb Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-2-kpsingh@chromium.org Signed-off-by: Kuniyuki Iwashima --- kernel/bpf/bpf_lsm.c | 4 ++++ kernel/bpf/verifier.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 56cc5a915f670..132f52369a46d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,6 +63,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f58b8506ddf33..765b0b5ceb2f7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10399,11 +10399,21 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog_type) || - prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; + if (map_value_has_spin_lock(map)) { + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (prog->aux->sleepable) { + verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && From 7781b515726669ee165c019e0adaf6fb67130b51 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:40 +0000 Subject: [PATCH 498/737] bpf: Implement task local storage commit 4cf1bc1f10452065a29d576fc5693fc4fab5b919 upstream. Similar to bpf_local_storage for sockets and inodes add local storage for task_struct. The life-cycle of storage is managed with the life-cycle of the task_struct. i.e. the storage is destroyed along with the owning task with a callback to the bpf_task_storage_free from the task_free LSM hook. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. The userspace map operations can be done by using a pid fd as a key passed to the lookup, update and delete operations. 
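As a rough illustration of the intended BPF-side usage (not part of this patch; the map name, LSM hook, and value type are hypothetical), a non-sleepable LSM program could keep a per-task counter like this:

	// SPDX-License-Identifier: GPL-2.0
	/* Hypothetical sketch of BPF_MAP_TYPE_TASK_STORAGE usage from a BPF LSM program. */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} exec_count SEC(".maps");

	SEC("lsm/bprm_committed_creds")
	int BPF_PROG(count_execs, struct linux_binprm *bprm)
	{
		__u64 *val;

		/* Get (or create) the current task's slot in the map. */
		val = bpf_task_storage_get(&exec_count, bpf_get_current_task_btf(),
					   0, BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (val)
			__sync_fetch_and_add(val, 1);
		return 0;
	}

	char _license[] SEC("license") = "GPL";

From userspace, the same map can then be inspected with a pidfd as the lookup key, matching the pid-fd based map operations described above.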
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-3-kpsingh@chromium.org Signed-off-by: Kuniyuki Iwashima --- include/linux/bpf_lsm.h | 23 +++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 39 ++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 4 + kernel/bpf/bpf_task_storage.c | 315 +++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 ++ security/bpf/hooks.c | 2 + tools/include/uapi/linux/bpf.h | 39 ++++ 10 files changed, 436 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_task_storage.c diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87e..73226181b7448 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a8137bb6dd3c2..e256d6ef4765b 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 12c03ebcf81e0..956003b905b38 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3753,6 +3754,42 @@ union bpf_attr { * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. 
+ * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * * struct task_struct *bpf_get_current_task_btf(void) * Description * Return a BTF pointer to the "current" task. @@ -3918,6 +3955,8 @@ union bpf_attr { FN(per_cpu_ptr), \ FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ FN(get_current_task_btf), \ /* */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c1b9f71ee6aac..d1249340fd6ba 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_i obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 132f52369a46d..c4898ca2fb594 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -67,6 +67,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 0000000000000..39a45fba4fb03 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. 
+ */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. 
+ */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c8777e574cbaf..3f3b2a26a9743 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -785,7 +785,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 765b0b5ceb2f7..a05b404373a4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4968,6 +4968,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -5052,6 +5057,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 788667d582ae5..e5971fa74fd74 100644 
--- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -12,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { #include #undef LSM_HOOK LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), + LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; static int __init bpf_lsm_init(void) @@ -23,6 +24,7 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), + .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8d8653a334530..a09181d1039f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3753,6 +3754,42 @@ union bpf_attr { * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * * struct task_struct *bpf_get_current_task_btf(void) * Description * Return a BTF pointer to the "current" task. @@ -3918,6 +3955,8 @@ union bpf_attr { FN(per_cpu_ptr), \ FN(this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ FN(get_current_task_btf), \ /* */ From 8357514b153011b32a66ca50debafedf3e71cd62 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 3 Oct 2022 13:59:47 +0100 Subject: [PATCH 499/737] io_uring/af_unix: defer registered files gc to io_uring release Instead of putting io_uring's registered files in unix_gc() we want it to be done by io_uring itself. The trick here is to consider io_uring registered files for cycle detection but not actually putting them down. Because io_uring can't register other ring instances, this will remove all refs to the ring file triggering the ->release path and clean up with io_ring_ctx_free(). 
Cc: stable@vger.kernel.org Fixes: 6b06314c47e1 ("io_uring: add file set registration") Reported-and-tested-by: David Bouman Signed-off-by: Pavel Begunkov Signed-off-by: Thadeu Lima de Souza Cascardo [axboe: add kerneldoc comment to skb, fold in skb leak fix] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92eb4769b0a35..b04f7bb67e564 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -8189,6 +8189,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; + skb->scm_io_uring = 1; skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); From 114bbaee8072d930b73ae89a029e7befadf1e0e9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 31 May 2022 10:04:21 +0800 Subject: [PATCH 500/737] mm/damon: remove obsolete comments of kdamond_stop Since commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") delete kdamond_stop and change to use kthread stop mechanism, these obsolete comments should be removed accordingly. Link: https://lkml.kernel.org/r/20220531020421.46849-1-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7c62da31ce4b5..2765c7d99beb3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -397,7 +397,6 @@ struct damon_callback { * detail. * * @kdamond: Kernel thread who does the monitoring. - * @kdamond_stop: Notifies whether kdamond should stop. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * * For each monitoring context, one kernel thread for the monitoring is @@ -406,14 +405,14 @@ struct damon_callback { * Once started, the monitoring thread runs until explicitly required to be * terminated or every monitoring target is invalid. The validity of the * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by writing non-zero to - * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. - * Therefore, users can know whether the monitoring is ongoing or terminated by - * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from - * outside of the monitoring thread must be protected by @kdamond_lock. - * - * Note that the monitoring thread protects only @kdamond and @kdamond_stop via - * @kdamond_lock. Accesses to other fields must be protected by themselves. + * termination can also be explicitly requested by calling damon_stop(). + * The thread sets @kdamond to NULL when it terminates. Therefore, users can + * know whether the monitoring is ongoing or terminated by reading @kdamond. + * Reads and writes to @kdamond from outside of the monitoring thread must + * be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond via @kdamond_lock. + * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. 
From 53177b0099110c61c9ac13fb0ab8af9078ea0339 Mon Sep 17 00:00:00 2001 From: Gautam Date: Sun, 26 Jun 2022 01:22:45 +0530 Subject: [PATCH 501/737] kselftests/damon: add support for cases where debugfs cannot be read The kernel is in lockdown mode when secureboot is enabled and hence debugfs cannot be used. Add support for this and other general cases where debugfs cannot be read and communicate the same to the user before running tests. Signed-off-by: Gautam Reviewed-by: SeongJae Park Signed-off-by: Shuah Khan --- tools/testing/selftests/damon/_chk_dependency.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 0189db81550be..0328ac0b5a5ed 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -26,3 +26,13 @@ do exit 1 fi done + +permission_error="Operation not permitted" +for f in attrs target_ids monitor_on +do + status=$( cat "$DBGFS/$f" 2>&1 ) + if [ "${status#*$permission_error}" != "$status" ]; then + echo "Permission for reading $DBGFS/$f denied; maybe secureboot enabled?" + exit $ksft_skip + fi +done From 5ca90582a8ccb02a16140963c3f214efc9888e9a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:05 +0000 Subject: [PATCH 502/737] Docs/admin-guide/damon/reclaim: remove a paragraph that been obsolete due to online tuning support Patch series "mm/damon: trivial cleanups". This patchset contains trivial cleansups for DAMON code. This patch (of 6): Commit 81a84182c343 ("Docs/admin-guide/mm/damon/reclaim: document 'commit_inputs' parameter") has documented the 'commit_inputs' parameter which allows online parameter update, but it didn't remove a paragraph saying the online parameter update is impossible. This commit removes the obsolete paragraph. Link: https://lkml.kernel.org/r/20220606182310.48781-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220606182310.48781-2-sj@kernel.org Fixes: 81a84182c343 ("Docs/admin-guide/mm/damon/reclaim: document 'commit_inputs' parameter") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 46306f1f34b1a..6510baa911097 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -48,12 +48,6 @@ DAMON_RECLAIM utilizes module parameters. That is, you can put ``damon_reclaim.=`` on the kernel boot command line or write proper values to ``/sys/modules/damon_reclaim/parameters/`` files. -Note that the parameter values except ``enabled`` are applied only when -DAMON_RECLAIM starts. Therefore, if you want to apply new parameter values in -runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable -it via ``enabled`` parameter file. Writing of the new values to proper -parameter values should be done before the re-enablement. - Below are the description of each parameter. enabled From 7d621902c372f253700c7fa971cc74fe1a6ce979 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:06 +0000 Subject: [PATCH 503/737] mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h The function for knowing if given monitoring context's targets will have pid or not is defined and used in dbgfs only. However, the logic is also needed for sysfs. 
This commit moves the code to damon.h and makes both dbgfs and sysfs to use it. Link: https://lkml.kernel.org/r/20220606182310.48781-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/dbgfs.c | 15 +++++---------- mm/damon/sysfs.c | 8 +++----- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2765c7d99beb3..b9aae19fab3e9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -525,6 +525,12 @@ bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); +static inline bool damon_target_has_pid(const struct damon_ctx *ctx) +{ + return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; +} + + int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 89075fa4e8a9a..74751f332e429 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,11 +275,6 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, return ret; } -static inline bool target_has_pid(const struct damon_ctx *ctx) -{ - return ctx->ops.id == DAMON_OPS_VADDR; -} - static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; @@ -288,7 +283,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) int rc; damon_for_each_target(t, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) /* Show pid numbers to debugfs users */ id = pid_vnr(t->pid); else @@ -415,7 +410,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -425,11 +420,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, if (!t) { damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) t->pid = pids[i]; damon_add_target(ctx, t); } @@ -722,7 +717,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!target_has_pid(ctx)) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 5b5ee3308d71b..110f4becb4d2b 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2136,8 +2136,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -2182,8 +2181,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; damon_add_target(ctx, t); - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) { + if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; @@ -2210,7 +2208,7 @@ static struct damon_target *damon_sysfs_existing_target( struct pid *pid; struct damon_target *t; - if (ctx->ops.id == 
DAMON_OPS_PADDR) { + if (!damon_target_has_pid(ctx)) { /* Up to only one target for paddr could exist */ damon_for_each_target(t, ctx) return t; From bae123e7da841ad018b3680efcd57b039b12feaf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:07 +0000 Subject: [PATCH 504/737] mm/damon/reclaim: deduplicate 'commit_inputs' handling DAMON_RECLAIM's handling of 'commit_inputs' parameter is duplicated in 'after_aggregation()' and 'after_wmarks_check()' callbacks. This commit deduplicates the code for better maintenance. Link: https://lkml.kernel.org/r/20220606182310.48781-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 0b3c7396cb90a..9a402b76a2764 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -403,10 +403,21 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_RECLAIM (default: disabled)"); +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; - int err = 0; /* update the stats parameter */ damon_for_each_scheme(s, c) { @@ -417,22 +428,12 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) nr_quota_exceeds = s->stat.qt_exceeds; } - if (commit_inputs) { - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - } - return err; + return damon_reclaim_handle_commit_inputs(); } static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) { - int err = 0; - - if (commit_inputs) { - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - } - return err; + return damon_reclaim_handle_commit_inputs(); } static int __init damon_reclaim_init(void) From f13edef637ae314f7332aebc627184c7ac65678c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:08 +0000 Subject: [PATCH 505/737] mm/damon/sysfs: deduplicate inputs applying DAMON sysfs interface's DAMON context building and its online parameter update have duplicated code. This commit removes the duplicate. Link: https://lkml.kernel.org/r/20220606182310.48781-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 59 ++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 110f4becb4d2b..2b70e3144d7bd 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2357,6 +2357,23 @@ static inline bool damon_sysfs_kdamond_running( damon_sysfs_ctx_running(kdamond->damon_ctx); } +static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, + struct damon_sysfs_context *sys_ctx) +{ + int err; + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + return err; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + return err; + return damon_sysfs_set_schemes(ctx, sys_ctx->schemes); +} + /* * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. * @kdamond: The kobject wrapper for the associated kdamond. 
@@ -2365,31 +2382,14 @@ static inline bool damon_sysfs_kdamond_running( */ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_context *sys_ctx; - int err = 0; - if (!damon_sysfs_kdamond_running(kdamond)) return -EINVAL; /* TODO: Support multiple contexts per kdamond */ if (kdamond->contexts->nr != 1) return -EINVAL; - sys_ctx = kdamond->contexts->contexts_arr[0]; - - err = damon_select_ops(ctx, sys_ctx->ops_id); - if (err) - return err; - err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); - if (err) - return err; - err = damon_sysfs_set_targets(ctx, sys_ctx->targets); - if (err) - return err; - err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); - if (err) - return err; - return err; + return damon_sysfs_apply_inputs(kdamond->damon_ctx, + kdamond->contexts->contexts_arr[0]); } /* @@ -2436,27 +2436,16 @@ static struct damon_ctx *damon_sysfs_build_ctx( if (!ctx) return ERR_PTR(-ENOMEM); - err = damon_select_ops(ctx, sys_ctx->ops_id); - if (err) - goto out; - err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); - if (err) - goto out; - err = damon_sysfs_set_targets(ctx, sys_ctx->targets); - if (err) - goto out; - err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); - if (err) - goto out; + err = damon_sysfs_apply_inputs(ctx, sys_ctx); + if (err) { + damon_destroy_ctx(ctx); + return ERR_PTR(err); + } ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; - -out: - damon_destroy_ctx(ctx); - return ERR_PTR(err); } static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) From de21643b38d1318fc43befdfaa2f56b13909f6db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:09 +0000 Subject: [PATCH 506/737] mm/damon/reclaim: make 'enabled' checking timer simpler DAMON_RECLAIM's 'enabled' parameter store callback ('enabled_store()') schedules the parameter check timer ('damon_reclaim_timer') if the parameter is set as 'Y'. Then, the timer schedules itself to check if user has set the parameter as 'N'. It's unnecessarily complex. This commit makes it simpler by making the parameter store callback to schedule the timer regardless of the parameter value and disabling the timer's self scheduling. 
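The resulting shape is roughly the sketch below (names are illustrative, not the module's actual symbols): the parameter store callback always kicks the delayed work, and the worker alone reconciles module state with the parameter value.

	#include <linux/module.h>
	#include <linux/moduleparam.h>
	#include <linux/workqueue.h>

	static bool enabled __read_mostly;

	static void param_check_fn(struct work_struct *work)
	{
		/* reconcile module state with the current 'enabled' value */
	}
	static DECLARE_DELAYED_WORK(param_check_work, param_check_fn);

	static int enabled_param_set(const char *val, const struct kernel_param *kp)
	{
		int rc = param_set_bool(val, kp);

		if (rc < 0)
			return rc;
		/* always let the worker see the (possibly unchanged) value */
		schedule_delayed_work(&param_check_work, 0);
		return 0;
	}

	static const struct kernel_param_ops enabled_param_ops = {
		.set = enabled_param_set,
		.get = param_get_bool,
	};
	module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);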
Link: https://lkml.kernel.org/r/20220606182310.48781-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9a402b76a2764..1288ee0b5e86c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -353,7 +353,6 @@ static int damon_reclaim_turn(bool on) return 0; } -#define ENABLE_CHECK_INTERVAL_MS 1000 static struct delayed_work damon_reclaim_timer; static void damon_reclaim_timer_fn(struct work_struct *work) { @@ -367,10 +366,6 @@ static void damon_reclaim_timer_fn(struct work_struct *work) else enabled = last_enabled; } - - if (enabled) - schedule_delayed_work(&damon_reclaim_timer, - msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); @@ -388,9 +383,7 @@ static int enabled_store(const char *val, if (!damon_reclaim_initialized) return rc; - if (enabled) - schedule_delayed_work(&damon_reclaim_timer, 0); - + schedule_delayed_work(&damon_reclaim_timer, 0); return 0; } From ac6e6eedcdffcfe9805db773596478eb12535778 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:10 +0000 Subject: [PATCH 507/737] mm/damon/reclaim: add 'damon_reclaim_' prefix to 'enabled_store()' This commit adds 'damon_reclaim_' prefix to 'enabled_store()', so that we can distinguish it easily from the stack trace using 'faddr2line.sh' like tools. Link: https://lkml.kernel.org/r/20220606182310.48781-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1288ee0b5e86c..a7faf51b4bd4a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -371,7 +371,7 @@ static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); static bool damon_reclaim_initialized; -static int enabled_store(const char *val, +static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { int rc = param_set_bool(val, kp); @@ -388,7 +388,7 @@ static int enabled_store(const char *val, } static const struct kernel_param_ops enabled_param_ops = { - .set = enabled_store, + .set = damon_reclaim_enabled_store, .get = param_get_bool, }; From ab09f3ff21b009f95509940c7789d255242095d3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:53 +0000 Subject: [PATCH 508/737] mm/damon/dbgfs: add and use mappings between 'schemes' action inputs and 'damos_action' values Patch series "Extend DAMOS for Proactive LRU-lists Sorting". Introduction ============ In short, this patchset 1) extends DAMON-based Operation Schemes (DAMOS) for low overhead data access pattern based LRU-lists sorting, and 2) implements a static kernel module for easy use of conservatively-tuned version of that using the extended DAMOS capability. Background ---------- As page-granularity access checking overhead could be significant on huge systems, LRU lists are normally not proactively sorted but partially and reactively sorted for special events including specific user requests, system calls and memory pressure. As a result, LRU lists are sometimes not so perfectly prepared to be used as a trustworthy access pattern source for some situations including reclamation target pages selection under sudden memory pressure. 
DAMON-based Proactive LRU-lists Sorting
---------------------------------------

Because DAMON can identify access patterns with best-effort accuracy while inducing only a user-specified range of overhead, using DAMON for Proactive LRU-lists Sorting (PLRUS) could be helpful for this situation. The idea is quite simple. Find hot pages and cold pages using DAMON, and prioritize hot pages while deprioritizing cold pages on their LRU-lists.

This patchset extends DAMON to support such schemes by introducing a couple of new DAMOS actions for prioritizing and deprioritizing memory regions of specific access patterns on their LRU-lists. In detail, this patchset simply uses the 'mark_page_accessed()' and 'deactivate_page()' functions for prioritization and deprioritization of pages on their LRU lists, respectively.

To make the scheme easy to use without complex tuning for common situations, this patchset further implements a static kernel module called 'DAMON_LRU_SORT' using the extended DAMOS functionality. It proactively sorts LRU-lists using DAMON with conservatively chosen default hotness/coldness thresholds and a small CPU usage quota limit. That is, under its default parameters the module will do no harm for common situations, but will provide some level of benefit for systems having a clear hot/cold access pattern under memory pressure, while consuming only a small portion of CPU time.

Related Works
-------------

Proactive reclamation is well known to be helpful for reducing performance drops caused by non-optimal reclamation target selection. However, proactive reclamation is not the best option for some cases, because it could incur additional I/O. For example, it could be prohibitive for systems using storage devices whose total number of writes is limited, or cloud block storage that charges for every I/O.

Some proactive reclamation approaches[1,2] induce a level of memory pressure using memcg files or swappiness while monitoring PSI. As reclamation target selection still relies on the original LRU-lists mechanism, using DAMON-based proactive reclamation before inducing that memory pressure could allow more memory saving with the same level of performance overhead, or less performance overhead with the same level of memory saving.

[1] https://blogs.oracle.com/linux/post/anticipating-your-memory-needs
[2] https://www.pdl.cmu.edu/ftp/NVM/tmo_asplos22.pdf

Evaluation
==========

In short, PLRUS achieves 10% memory PSI (some) reduction, 14% major page faults reduction, and 3.74% speedup under memory pressure.

Setup
-----

To show the effect of PLRUS, I run PARSEC3 and SPLASH-2X benchmarks under the below variant systems and measure a few metrics including the runtime of each workload, the number of system-wide major page faults, and system-wide memory PSI (some).

- orig: v5.18-rc4 based mm-unstable kernel + this patchset, but no DAMON scheme applied.
- mprs: Same as 'orig' but artificial memory pressure is induced.
- plrus: Same as 'mprs' but a radically tuned PLRUS scheme is applied to the entire physical address space of the system.

For the artificial memory pressure, I set 'memory.limit_in_bytes' to 75% of the running workload's peak RSS, wait 1 second, remove the pressure by setting it to 200% of the peak RSS, wait 10 seconds, and repeat the procedure until the workload finishes[1]. I use a zram based swap device. The tests are automated[2].
[1] https://github.com/awslabs/damon-tests/blob/next/perf/runners/back/0009_memcg_pressure.sh
[2] https://github.com/awslabs/damon-tests/blob/next/perf/full_once_config.sh

Radically Tuned PLRUS
---------------------

To show the effect of PLRUS on the PARSEC3/SPLASH-2X workloads, which run for only a short time, we use a radically tuned version of PLRUS. The version asks DAMON to do the proactive LRU-lists sorting as below.

1. Find any memory regions that have shown some accesses (approximately >=20 accesses per 100 samplings) and prioritize pages of the regions on their LRU lists using up to 2% CPU time. Under the CPU time limit, prioritize regions having higher access frequency, and that have kept that access frequency longer, first.

2. Find any memory regions that have shown no access for at least 5 seconds and deprioritize pages of the regions on their LRU lists using up to 2% CPU time. Under the CPU time limit, deprioritize regions not accessed for a longer time first.

Results
-------

I repeat the tests 25 times and calculate the average of the measured numbers. The results are as below:

    metric               orig       mprs        plrus       plrus/mprs
    runtime_seconds      190.06     292.83      281.87      0.96
    pgmajfaults          852.55     8769420.00  7525040.00  0.86
    memory_psi_some_us   106911.00  6943420.00  6220920.00  0.90

The first row is the legend. In each following row, the first cell names the metric the rest of the row shows; the second, third, and fourth cells show that metric under the configs named in the legend, and the fifth cell shows the metric under 'plrus' divided by the metric under 'mprs'. The second row shows the averaged runtime of the workloads in seconds. The third row shows the number of system-wide major page faults while the test was ongoing. The fourth row shows the system-wide memory pressure stall for some processes in microseconds while the test was ongoing.

In short, PLRUS achieves 10% memory PSI (some) reduction, 14% major page faults reduction, and 3.74% speedup under memory pressure. We also confirmed the CPU usage of kdamond was 2.61% of a single CPU, which is below 4% as expected.

Sequence of Patches
===================

The first and second patches clean up the DAMON debugfs interface and the DAMOS_PAGEOUT handling code of the physical address space monitoring operations implementation for easier extension of the code. The third and fourth patches implement a new DAMOS action called 'lru_prio', which prioritizes pages under memory regions which have a user-specified access pattern, and document it, respectively.
Link: https://lkml.kernel.org/r/20220613192301.8817-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220613192301.8817-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 64 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 74751f332e429..4e51466c4e74d 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -97,6 +97,31 @@ static ssize_t dbgfs_attrs_write(struct file *file, return ret; } +/* + * Return corresponding dbgfs' scheme action value (int) for the given + * damos_action if the given damos_action value is valid and supported by + * dbgfs, negative error code otherwise. + */ +static int damos_action_to_dbgfs_scheme_action(enum damos_action action) +{ + switch (action) { + case DAMOS_WILLNEED: + return 0; + case DAMOS_COLD: + return 1; + case DAMOS_PAGEOUT: + return 2; + case DAMOS_HUGEPAGE: + return 3; + case DAMOS_NOHUGEPAGE: + return 4; + case DAMOS_STAT: + return 5; + default: + return -EINVAL; + } +} + static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) { struct damos *s; @@ -109,7 +134,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action, + damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, s->quota.weight_sz, @@ -160,18 +185,27 @@ static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) kfree(schemes); } -static bool damos_action_valid(int action) +/* + * Return corresponding damos_action for the given dbgfs input for a scheme + * action if the input is valid, negative error code otherwise. 
+ */
+static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
 {
-	switch (action) {
-	case DAMOS_WILLNEED:
-	case DAMOS_COLD:
-	case DAMOS_PAGEOUT:
-	case DAMOS_HUGEPAGE:
-	case DAMOS_NOHUGEPAGE:
-	case DAMOS_STAT:
-		return true;
+	switch (dbgfs_action) {
+	case 0:
+		return DAMOS_WILLNEED;
+	case 1:
+		return DAMOS_COLD;
+	case 2:
+		return DAMOS_PAGEOUT;
+	case 3:
+		return DAMOS_HUGEPAGE;
+	case 4:
+		return DAMOS_NOHUGEPAGE;
+	case 5:
+		return DAMOS_STAT;
 	default:
-		return false;
+		return -EINVAL;
 	}
 }
@@ -189,7 +223,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
 	int pos = 0, parsed, ret;
 	unsigned long min_sz, max_sz;
 	unsigned int min_nr_a, max_nr_a, min_age, max_age;
-	unsigned int action;
+	unsigned int action_input;
+	enum damos_action action;

 	schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
 			GFP_KERNEL);
@@ -204,7 +239,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
 		ret = sscanf(&str[pos],
 				"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
 				&min_sz, &max_sz, &min_nr_a, &max_nr_a,
-				&min_age, &max_age, &action, &quota.ms,
+				&min_age, &max_age, &action_input, &quota.ms,
 				&quota.sz, &quota.reset_interval,
 				&quota.weight_sz, &quota.weight_nr_accesses,
 				&quota.weight_age, &wmarks.metric,
@@ -212,7 +247,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
 				&wmarks.low, &parsed);
 		if (ret != 18)
 			break;
-		if (!damos_action_valid(action))
+		action = dbgfs_scheme_action_to_damos_action(action_input);
+		if ((int)action < 0)
 			goto fail;
 		if (min_sz > max_sz || min_nr_a > max_nr_a ||
 		    min_age > max_age)

From 2c615e67da555bc70505c43adb533f3c229b2904 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 13 Jun 2022 19:22:55 +0000
Subject: [PATCH 509/737] mm/damon/paddr: use a separate function for
 'DAMOS_PAGEOUT' handling

This commit moves code for 'DAMOS_PAGEOUT' handling of the physical
address space monitoring operations set to a separate function so that
its caller, 'damon_pa_apply_scheme()', can be more easily extended for
additional DAMOS actions later.
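For reference, the shape the caller takes once this split is in place is
roughly the following (a sketch assembled from the hunks of this patch and the
later 'lru_prio'/'lru_deprio' patches of this series, not an additional
change):

    static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
                    struct damon_target *t, struct damon_region *r,
                    struct damos *scheme)
    {
            switch (scheme->action) {
            case DAMOS_PAGEOUT:
                    return damon_pa_pageout(r);             /* this patch */
            case DAMOS_LRU_PRIO:
                    return damon_pa_mark_accessed(r);       /* a later patch */
            case DAMOS_LRU_DEPRIO:
                    return damon_pa_deactivate_pages(r);    /* a later patch */
            default:
                    break;
            }
            return 0;       /* unsupported action: nothing applied */
    }

Each per-action helper returns the number of bytes it actually applied the
action to, which the DAMOS core accounts in the scheme statistics.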
Link: https://lkml.kernel.org/r/20220613192301.8817-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 208fb369f22eb..f10335972ab5e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -208,16 +208,11 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) +static unsigned long damon_pa_pageout(struct damon_region *r) { unsigned long addr, applied; LIST_HEAD(page_list); - if (scheme->action != DAMOS_PAGEOUT) - return 0; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -242,6 +237,19 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return applied * PAGE_SIZE; } +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pa_pageout(r); + default: + break; + } + return 0; +} + static int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme) From ceec91fb7d30ffa4952b31dcd2f59042fbfcb2f8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:56 +0000 Subject: [PATCH 510/737] mm/damon/schemes: add 'LRU_PRIO' DAMOS action This commit adds a new DAMOS action called 'LRU_PRIO' for the physical address space. The action prioritizes pages in the memory regions of the user-specified target access pattern on their LRU lists. This is hence supposed to be used for frequently accessed (hot) memory regions so that hot pages could be more likely protected under memory pressure. Internally, it simply calls 'mark_page_accessed()'. Link: https://lkml.kernel.org/r/20220613192301.8817-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/ops-common.c | 42 ++++++++++++++++++++++++++++++++++++++++++ mm/damon/ops-common.h | 2 ++ mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 1 + 5 files changed, 67 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index b9aae19fab3e9..4c64e03e94d82 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -86,6 +86,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. 
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -95,6 +96,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_LRU_PRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 10ef20b2003f5..b1335de200e77 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, /* Return coldness of the region */ return DAMOS_MAX_SCORE - hotness; } + +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + return hotness; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index e790cb5f8fe05..52329ff361cd0 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index f10335972ab5e..93f1ab354f684 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -237,6 +237,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } +static unsigned long damon_pa_mark_accessed(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + mark_page_accessed(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -244,6 +260,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pa_pageout(r); + case DAMOS_LRU_PRIO: + return damon_pa_mark_accessed(r); default: break; } @@ -257,6 +275,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pageout_score(context, r, scheme); + case DAMOS_LRU_PRIO: + return damon_hot_score(context, r, scheme); default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c 
index 2b70e3144d7bd..7b4d75ecd5c5d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -762,6 +762,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "pageout", "hugepage", "nohugepage", + "lru_prio", "stat", }; From 6bbe643ae89131e7958238387b8cacb83e2df355 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:57 +0000 Subject: [PATCH 511/737] Docs/admin-guide/damon/sysfs: document 'LRU_PRIO' scheme action This commit documents the 'lru_prio' scheme action for DAMON sysfs interface. Link: https://lkml.kernel.org/r/20220613192301.8817-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1bb7b72414b24..af4e15ee81cdc 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -264,6 +264,7 @@ that can be written to and read from the file and their meaning are as below. - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - ``lru_prio``: Prioritize the region on its LRU lists. - ``stat``: Do nothing but count the statistics schemes//access_pattern/ From 71f679a953d89105b9e34ec4fa21b5dba7b324c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:58 +0000 Subject: [PATCH 512/737] mm/damon/schemes: add 'LRU_DEPRIO' action This commit adds a new DAMON-based operation scheme action called 'LRU_DEPRIO' for physical address space. The action deprioritizes pages in the memory area of the target access pattern on their LRU lists. This is hence supposed to be used for rarely accessed (cold) memory regions so that cold pages could be more likely reclaimed first under memory pressure. Internally, it simply calls 'lru_deactivate()'. Using this with 'LRU_PRIO' action for hot pages, users can proactively sort LRU lists based on the access pattern. That is, it can make the LRU lists somewhat more trustworthy source of access temperature. As a result, efficiency of LRU-lists based mechanisms including the reclamation target selection could be improved. Link: https://lkml.kernel.org/r/20220613192301.8817-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 1 + 3 files changed, 23 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 4c64e03e94d82..7b1f4a4882308 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -87,6 +87,7 @@ struct damon_target { * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. + * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. 
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -97,6 +98,7 @@ enum damos_action { DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, + DAMOS_LRU_DEPRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 93f1ab354f684..46565f67dd3f9 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -253,6 +253,22 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) return applied * PAGE_SIZE; } +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + deactivate_page(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -262,6 +278,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_pageout(r); case DAMOS_LRU_PRIO: return damon_pa_mark_accessed(r); + case DAMOS_LRU_DEPRIO: + return damon_pa_deactivate_pages(r); default: break; } @@ -277,6 +295,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return damon_pageout_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_pageout_score(context, r, scheme); default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7b4d75ecd5c5d..bdef9682d0a00 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -763,6 +763,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "hugepage", "nohugepage", "lru_prio", + "lru_deprio", "stat", }; From 96dbeb51197029d91b95818e145b4d743c437941 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:59 +0000 Subject: [PATCH 513/737] Docs/admin-guide/damon/sysfs: document 'LRU_DEPRIO' scheme action This commit documents the 'LRU_DEPRIO' scheme action for DAMON sysfs interface.` Link: https://lkml.kernel.org/r/20220613192301.8817-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index af4e15ee81cdc..d822bf6355ce7 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -265,6 +265,7 @@ that can be written to and read from the file and their meaning are as below. - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - ``lru_prio``: Prioritize the region on its LRU lists. + - ``lru_deprio``: Deprioritize the region on its LRU lists. - ``stat``: Do nothing but count the statistics schemes//access_pattern/ From 87fd67d2b4f084e40aa299d11e183ad3419d2e19 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:23:00 +0000 Subject: [PATCH 514/737] mm/damon: introduce DAMON-based LRU-lists Sorting Users can do data access-aware LRU-lists sorting using 'LRU_PRIO' and 'LRU_DEPRIO' DAMOS actions. However, finding best parameters including the hotness/coldness thresholds, CPU quota, and watermarks could be challenging for some users. 
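As a concrete illustration of that translation burden, the snippet below (a
standalone userspace sketch, not kernel code) mirrors the arithmetic that
damon_lru_sort_apply_parameters() in this patch performs to turn the intuitive
module parameters into DAMON's internal thresholds, using the module's default
values:

    #include <stdio.h>

    /* Defaults of the module parameters introduced by this patch. */
    static const unsigned long sample_interval = 5000;        /* 5 ms, in usecs */
    static const unsigned long aggr_interval = 100000;        /* 100 ms, in usecs */
    static const unsigned long hot_thres_access_freq = 500;   /* 50%, in permil */
    static const unsigned long cold_min_age = 120000000;      /* 120 s, in usecs */

    int main(void)
    {
            /* aggr_interval / sample_interval is the maximum nr_accesses. */
            unsigned long max_nr_accesses = aggr_interval / sample_interval;   /* 20 */
            unsigned long hot_thres =
                    max_nr_accesses * hot_thres_access_freq / 1000;            /* 10 */
            unsigned long cold_thres = cold_min_age / aggr_interval;           /* 1200 */

            printf("hot: nr_accesses >= %lu, cold: unaccessed for >= %lu aggregation intervals\n",
                   hot_thres, cold_thres);
            return 0;
    }

Choosing such thresholds, together with the CPU time quota and the watermarks,
is exactly what the module's conservative defaults are meant to spare users
from.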
To make the scheme easy to be used without complex tuning for common situations, this commit implements a static kernel module called 'DAMON_LRU_SORT' using the 'LRU_PRIO' and 'LRU_DEPRIO' DAMOS actions. It proactively sorts LRU-lists using DAMON with conservatively chosen default values of the parameters. That is, the module under its default parameters will make no harm for common situations but provide some level of efficiency improvements for systems having clear hot/cold access pattern under a level of memory pressure while consuming only a limited small portion of CPU time. Link: https://lkml.kernel.org/r/20220613192301.8817-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 8 + mm/damon/Makefile | 1 + mm/damon/lru_sort.c | 546 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 555 insertions(+) create mode 100644 mm/damon/lru_sort.c diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 9b559c76d6dd1..66265e3a9c659 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -92,4 +92,12 @@ config DAMON_RECLAIM reclamation under light memory pressure, while the traditional page scanning-based reclamation is used for heavy pressure. +config DAMON_LRU_SORT + bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)" + depends on DAMON_PADDR + help + This builds the DAMON-based LRU-lists sorting subsystem. It tries to + protect frequently accessed (hot) pages while rarely accessed (cold) + pages reclaimed first under memory pressure. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index dbf7190b4144a..3e6b8ad73858a 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c new file mode 100644 index 0000000000000..c276736a071c4 --- /dev/null +++ b/mm/damon/lru_sort.c @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based LRU-lists Sorting + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-lru-sort: " fmt + +#include +#include +#include +#include +#include + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_lru_sort." + +/* + * Enable or disable DAMON_LRU_SORT. + * + * You can enable DAMON_LRU_SORT by setting the value of this parameter as + * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. Note that + * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the + * watermarks-based activation condition. Refer to below descriptions for the + * watermarks parameter for this. + */ +static bool enabled __read_mostly; + +/* + * Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_LRU_SORT is running are not + * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT + * reads values of parametrs except ``enabled`` again. Once the re-reading is + * done, this parameter is set as ``N``. If invalid parameters are found while + * the re-reading, DAMON_LRU_SORT will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + +/* + * Access frequency threshold for hot memory regions identification in permil. 
+ * + * If a memory region is accessed in frequency of this or higher, + * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the + * LRU list, so that it could not be reclaimed under memory pressure. 50% by + * default. + */ +static unsigned long hot_thres_access_freq = 500; +module_param(hot_thres_access_freq, ulong, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT + * identifies the region as cold, and mark it as unaccessed on the LRU list, so + * that it could be reclaimed first under memory pressure. 120 seconds by + * default. + */ +static unsigned long cold_min_age __read_mostly = 120000000; +module_param(cold_min_age, ulong, 0600); + +/* + * Limit of time for trying the LRU lists sorting in milliseconds. + * + * DAMON_LRU_SORT tries to use only up to this time within a time window + * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used + * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the + * limit is disabled. + * + * 10 ms by default. + */ +static unsigned long quota_ms __read_mostly = 10; +module_param(quota_ms, ulong, 0600); + +/* + * The time quota charge reset interval in milliseconds. + * + * The charge reset interval for the quota of time (quota_ms). That is, + * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms + * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is + * enabled but inactive due to its watermarks rule. 5 seconds by default. + */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically + * checks the watermarks. 200 (20%) by default. + */ +static unsigned long wmarks_high __read_mostly = 200; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring + * and the LRU-lists sorting. 150 (15%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 150; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks + * the watermarks. 50 (5%) by default. + */ +static unsigned long wmarks_low __read_mostly = 50; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the hot/cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. 
+ * + * The aggregation interval of DAMON for the hot/cold memory monitoring. + * Please refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. + * + * The minimal number of monitoring regions of DAMON for the hot/cold memory + * monitoring. This can be used to set lower-bound of the monitoring quality. + * But, setting this too high could result in increased monitoring overhead. + * Please refer to the DAMON documentation for more detail. 10 by default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the hot/cold memory + * monitoring. This can be used to set upper-bound of the monitoring overhead. + * However, setting this too low could result in bad monitoring quality. + * Please refer to the DAMON documentation for more detail. 1000 by default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +/* + * Number of hot memory regions that tried to be LRU-sorted. + */ +static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; +module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); + +/* + * Total bytes of hot memory regions that tried to be LRU-sorted. + */ +static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; +module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); + +/* + * Number of hot memory regions that successfully be LRU-sorted. + */ +static unsigned long nr_lru_sorted_hot_regions __read_mostly; +module_param(nr_lru_sorted_hot_regions, ulong, 0400); + +/* + * Total bytes of hot memory regions that successfully be LRU-sorted. + */ +static unsigned long bytes_lru_sorted_hot_regions __read_mostly; +module_param(bytes_lru_sorted_hot_regions, ulong, 0400); + +/* + * Number of times that the time quota limit for hot regions have exceeded + */ +static unsigned long nr_hot_quota_exceeds __read_mostly; +module_param(nr_hot_quota_exceeds, ulong, 0400); + +/* + * Number of cold memory regions that tried to be LRU-sorted. + */ +static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; +module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); + +/* + * Total bytes of cold memory regions that tried to be LRU-sorted. 
+ */ +static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; +module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); + +/* + * Number of cold memory regions that successfully be LRU-sorted. + */ +static unsigned long nr_lru_sorted_cold_regions __read_mostly; +module_param(nr_lru_sorted_cold_regions, ulong, 0400); + +/* + * Total bytes of cold memory regions that successfully be LRU-sorted. + */ +static unsigned long bytes_lru_sorted_cold_regions __read_mostly; +module_param(bytes_lru_sorted_cold_regions, ulong, 0400); + +/* + * Number of times that the time quota limit for cold regions have exceeded + */ +static unsigned long nr_cold_quota_exceeds __read_mostly; +module_param(nr_cold_quota_exceeds, ulong, 0400); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +struct damon_lru_sort_ram_walk_arg { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_lru_sort_ram_walk_arg *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool get_monitoring_region(unsigned long *start, unsigned long *end) +{ + struct damon_lru_sort_ram_walk_arg arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +/* Create a DAMON-based operation scheme for hot memory regions */ +static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try LRU-lists sorting of hot pages for more than half + * of quota_ms milliseconds within quota_reset_interval_ms. + */ + .ms = quota_ms / 2, + .sz = 0, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and accessed for more than the threshold */ + hot_thres, UINT_MAX, + /* no matter its age */ + 0, UINT_MAX, + /* prioritize those on LRU lists, as soon as found */ + DAMOS_LRU_PRIO, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &wmarks); + + return scheme; +} + +/* Create a DAMON-based operation scheme for cold memory regions */ +static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try LRU-lists sorting of cold pages for more than + * half of quota_ms milliseconds within + * quota_reset_interval_ms. + */ + .ms = quota_ms / 2, + .sz = 0, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, mark colder regions not accessed first. 
*/ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1, + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and not accessed at all */ + 0, 0, + /* for cold_thres or more micro-seconds, and */ + cold_thres, UINT_MAX, + /* mark those as not accessed, as soon as found */ + DAMOS_LRU_DEPRIO, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &wmarks); + + return scheme; +} + +static int damon_lru_sort_apply_parameters(void) +{ + struct damos *scheme, *next_scheme; + struct damon_addr_range addr_range; + unsigned int hot_thres, cold_thres; + int err = 0; + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + /* free previously set schemes */ + damon_for_each_scheme_safe(scheme, next_scheme, ctx) + damon_destroy_scheme(scheme); + + /* aggr_interval / sample_interval is the maximum nr_accesses */ + hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / + 1000; + scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + cold_thres = cold_min_age / aggr_interval; + scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + addr_range.start = monitor_region_start; + addr_range.end = monitor_region_end; + return damon_set_regions(target, &addr_range, 1); +} + +static int damon_lru_sort_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_lru_sort_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static struct delayed_work damon_lru_sort_timer; +static void damon_lru_sort_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_lru_sort_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } +} +static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); + +static bool damon_lru_sort_initialized; + +static int damon_lru_sort_enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (!damon_lru_sort_initialized) + return rc; + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_lru_sort_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_LRU_SORT (default: disabled)"); + +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) { + nr_lru_sort_tried_hot_regions = s->stat.nr_tried; + bytes_lru_sort_tried_hot_regions = 
s->stat.sz_tried; + nr_lru_sorted_hot_regions = s->stat.nr_applied; + bytes_lru_sorted_hot_regions = s->stat.sz_applied; + nr_hot_quota_exceeds = s->stat.qt_exceeds; + } else if (s->action == DAMOS_LRU_DEPRIO) { + nr_lru_sort_tried_cold_regions = s->stat.nr_tried; + bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; + nr_lru_sorted_cold_regions = s->stat.nr_applied; + bytes_lru_sorted_cold_regions = s->stat.sz_applied; + nr_cold_quota_exceeds = s->stat.qt_exceeds; + } + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) +{ + return damon_lru_sort_handle_commit_inputs(); +} + +static int __init damon_lru_sort_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + return -EINVAL; + + ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; + ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + damon_lru_sort_initialized = true; + return 0; +} + +module_init(damon_lru_sort_init); From be7268051dad024d8f7297cd5284a914619ea493 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:23:01 +0000 Subject: [PATCH 515/737] Docs/admin-guide/damon: add a document for DAMON_LRU_SORT This commit documents the usage of DAMON_LRU_SORT for admins. Link: https://lkml.kernel.org/r/20220613192301.8817-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 1 + .../admin-guide/mm/damon/lru_sort.rst | 294 ++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/lru_sort.rst diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 61aff88347f3c..53762770e0e44 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -14,3 +14,4 @@ optimize those. start usage reclaim + lru_sort diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst new file mode 100644 index 0000000000000..c09cace806516 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -0,0 +1,294 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================= +DAMON-based LRU-lists Sorting +============================= + +DAMON-based LRU-lists Sorting (DAMON_LRU_SORT) is a static kernel module that +aimed to be used for proactive and lightweight data access pattern based +(de)prioritization of pages on their LRU-lists for making LRU-lists a more +trusworthy data access pattern source. + +Where Proactive LRU-lists Sorting is Required? +============================================== + +As page-granularity access checking overhead could be significant on huge +systems, LRU lists are normally not proactively sorted but partially and +reactively sorted for special events including specific user requests, system +calls and memory pressure. As a result, LRU lists are sometimes not so +perfectly prepared to be used as a trustworthy access pattern source for some +situations including reclamation target pages selection under sudden memory +pressure. 
+ +Because DAMON can identify access patterns of best-effort accuracy while +inducing only user-specified range of overhead, proactively running +DAMON_LRU_SORT could be helpful for making LRU lists more trustworthy access +pattern source with low and controlled overhead. + +How It Works? +============= + +DAMON_LRU_SORT finds hot pages (pages of memory regions that showing access +rates that higher than a user-specified threshold) and cold pages (pages of +memory regions that showing no access for a time that longer than a +user-specified threshold) using DAMON, and prioritizes hot pages while +deprioritizing cold pages on their LRU-lists. To avoid it consuming too much +CPU for the prioritizations, a CPU time usage limit can be configured. Under +the limit, it prioritizes and deprioritizes more hot and cold pages first, +respectively. System administrators can also configure under what situation +this scheme should automatically activated and deactivated with three memory +pressure watermarks. + +Its default parameters for hotness/coldness thresholds and CPU quota limit are +conservatively chosen. That is, the module under its default parameters could +be widely used without harm for common situations while providing a level of +benefits for systems having clear hot/cold access patterns under memory +pressure while consuming only a limited small portion of CPU time. + +Interface: Module Parameters +============================ + +To use this feature, you should first ensure your system is running on a kernel +that is built with ``CONFIG_DAMON_LRU_SORT=y``. + +To let sysadmins enable or disable it and tune for the given system, +DAMON_LRU_SORT utilizes module parameters. That is, you can put +``damon_lru_sort.=`` on the kernel boot command line or write +proper values to ``/sys/modules/damon_lru_sort/parameters/`` files. + +Below are the description of each parameter. + +enabled +------- + +Enable or disable DAMON_LRU_SORT. + +You can enable DAMON_LRU_SORT by setting the value of this parameter as ``Y``. +Setting it as ``N`` disables DAMON_LRU_SORT. Note that DAMON_LRU_SORT could do +no real monitoring and LRU-lists sorting due to the watermarks-based activation +condition. Refer to below descriptions for the watermarks parameter for this. + +commit_inputs +------------- + +Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. + +Input parameters that updated while DAMON_LRU_SORT is running are not applied +by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads values +of parametrs except ``enabled`` again. Once the re-reading is done, this +parameter is set as ``N``. If invalid parameters are found while the +re-reading, DAMON_LRU_SORT will be disabled. + +hot_thres_access_freq +--------------------- + +Access frequency threshold for hot memory regions identification in permil. + +If a memory region is accessed in frequency of this or higher, DAMON_LRU_SORT +identifies the region as hot, and mark it as accessed on the LRU list, so that +it could not be reclaimed under memory pressure. 50% by default. + +cold_min_age +------------ + +Time threshold for cold memory regions identification in microseconds. + +If a memory region is not accessed for this or longer time, DAMON_LRU_SORT +identifies the region as cold, and mark it as unaccessed on the LRU list, so +that it could be reclaimed first under memory pressure. 120 seconds by +default. + +quota_ms +-------- + +Limit of time for trying the LRU lists sorting in milliseconds. 
+ +DAMON_LRU_SORT tries to use only up to this time within a time window +(quota_reset_interval_ms) for trying LRU lists sorting. This can be used +for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the +limit is disabled. + +10 ms by default. + +quota_reset_interval_ms +----------------------- + +The time quota charge reset interval in milliseconds. + +The charge reset interval for the quota of time (quota_ms). That is, +DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms +milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. + +1 second by default. + +wmarks_interval +--------------- + +The watermarks check time interval in microseconds. + +Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is +enabled but inactive due to its watermarks rule. 5 seconds by default. + +wmarks_high +----------- + +Free memory rate (per thousand) for the high watermark. + +If free memory of the system in bytes per thousand bytes is higher than this, +DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the +watermarks. 200 (20%) by default. + +wmarks_mid +---------- + +Free memory rate (per thousand) for the middle watermark. + +If free memory of the system in bytes per thousand bytes is between this and +the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring and +the LRU-lists sorting. 150 (15%) by default. + +wmarks_low +---------- + +Free memory rate (per thousand) for the low watermark. + +If free memory of the system in bytes per thousand bytes is lower than this, +DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the +watermarks. 50 (5%) by default. + +sample_interval +--------------- + +Sampling interval for the monitoring in microseconds. + +The sampling interval of DAMON for the cold memory monitoring. Please refer to +the DAMON documentation (:doc:`usage`) for more detail. 5ms by default. + +aggr_interval +------------- + +Aggregation interval for the monitoring in microseconds. + +The aggregation interval of DAMON for the cold memory monitoring. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. 100ms by +default. + +min_nr_regions +-------------- + +Minimum number of monitoring regions. + +The minimal number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set lower-bound of the monitoring quality. +But, setting this too high could result in increased monitoring overhead. +Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by +default. + +max_nr_regions +-------------- + +Maximum number of monitoring regions. + +The maximum number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set upper-bound of the monitoring overhead. +However, setting this too low could result in bad monitoring quality. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. 1000 by +defaults. + +monitor_region_start +-------------------- + +Start of target memory region in physical address. + +The start physical address of memory region that DAMON_LRU_SORT will do work +against. By default, biggest System RAM is used as the region. + +monitor_region_end +------------------ + +End of target memory region in physical address. + +The end physical address of memory region that DAMON_LRU_SORT will do work +against. By default, biggest System RAM is used as the region. + +kdamond_pid +----------- + +PID of the DAMON thread. 
+ +If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. Else, +-1. + +nr_lru_sort_tried_hot_regions +----------------------------- + +Number of hot memory regions that tried to be LRU-sorted. + +bytes_lru_sort_tried_hot_regions +-------------------------------- + +Total bytes of hot memory regions that tried to be LRU-sorted. + +nr_lru_sorted_hot_regions +------------------------- + +Number of hot memory regions that successfully be LRU-sorted. + +bytes_lru_sorted_hot_regions +---------------------------- + +Total bytes of hot memory regions that successfully be LRU-sorted. + +nr_hot_quota_exceeds +-------------------- + +Number of times that the time quota limit for hot regions have exceeded. + +nr_lru_sort_tried_cold_regions +------------------------------ + +Number of cold memory regions that tried to be LRU-sorted. + +bytes_lru_sort_tried_cold_regions +--------------------------------- + +Total bytes of cold memory regions that tried to be LRU-sorted. + +nr_lru_sorted_cold_regions +-------------------------- + +Number of cold memory regions that successfully be LRU-sorted. + +bytes_lru_sorted_cold_regions +----------------------------- + +Total bytes of cold memory regions that successfully be LRU-sorted. + +nr_cold_quota_exceeds +--------------------- + +Number of times that the time quota limit for cold regions have exceeded. + +Example +======= + +Below runtime example commands make DAMON_LRU_SORT to find memory regions +having >=50% access frequency and LRU-prioritize while LRU-deprioritizing +memory regions that not accessed for 120 seconds. The prioritization and +deprioritization is limited to be done using only up to 1% CPU time to avoid +DAMON_LRU_SORT consuming too much CPU time for the (de)prioritization. It also +asks DAMON_LRU_SORT to do nothing if the system's free memory rate is more than +50%, but start the real works if it becomes lower than 40%. If DAMON_RECLAIM +doesn't make progress and therefore the free memory rate becomes lower than +20%, it asks DAMON_LRU_SORT to do nothing again, so that we can fall back to +the LRU-list based page granularity reclamation. :: + + # cd /sys/modules/damon_lru_sort/parameters + # echo 500 > hot_thres_access_freq + # echo 120000000 > cold_min_age + # echo 10 > quota_ms + # echo 1000 > quota_reset_interval_ms + # echo 500 > wmarks_high + # echo 400 > wmarks_mid + # echo 200 > wmarks_low + # echo Y > enabled From f8e15637689312b818f1e28be49c136492cada59 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 14 Jul 2022 17:04:58 +0000 Subject: [PATCH 516/737] mm/damon/lru_sort: fix potential memory leak in damon_lru_sort_init() damon_lru_sort_init() returns an error when damon_select_ops() fails without freeing 'ctx' which allocated before. This commit fixes the potential memory leak by freeing 'ctx' under the situation. 
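The fix below frees 'ctx' directly at the failing damon_select_ops() branch.
For reference, an equivalent and common way to structure such init error
paths, shown here only as a sketch of the same function with a shared unwind
label (not what the patch applies), would be:

    static int __init damon_lru_sort_init(void)
    {
            int err = -ENOMEM;

            ctx = damon_new_ctx();
            if (!ctx)
                    return -ENOMEM;

            if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
                    err = -EINVAL;
                    goto out_destroy_ctx;
            }

            ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
            ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;

            target = damon_new_target();
            if (!target)
                    goto out_destroy_ctx;
            damon_add_target(ctx, target);

            schedule_delayed_work(&damon_lru_sort_timer, 0);

            damon_lru_sort_initialized = true;
            return 0;

    out_destroy_ctx:
            damon_destroy_ctx(ctx);
            return err;
    }

Either way, every failure after damon_new_ctx() succeeds now releases the
context, which is the leak this patch closes.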
Link: https://lkml.kernel.org/r/20220714170458.49727-1-sj@kernel.org Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index c276736a071c4..9de6f00a71c5d 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -524,8 +524,10 @@ static int __init damon_lru_sort_init(void) if (!ctx) return -ENOMEM; - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); return -EINVAL; + } ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; From 2f286d6fe474bda4db6bec129b79d211ff6dcfae Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 23 Aug 2022 19:40:53 +0800 Subject: [PATCH 517/737] Docs/admin-guide/mm/damon/usage: fix the example code snip The workflow example code is not working since it got the file names wrong. So fix this. Fixes: b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") Reviewed-by: SeongJae Park Signed-off-by: Kairui Song Link: https://lore.kernel.org/r/20220823114053.53305-1-ryncsn@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/mm/damon/usage.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d822bf6355ce7..bbee99f2f681b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -50,10 +50,10 @@ For a short example, users can monitor the virtual address space of a given workload as below. :: # cd /sys/kernel/mm/damon/admin/ - # echo 1 > kdamonds/nr && echo 1 > kdamonds/0/contexts/nr + # echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts # echo vaddr > kdamonds/0/contexts/0/operations - # echo 1 > kdamonds/0/contexts/0/targets/nr - # echo $(pidof ) > kdamonds/0/contexts/0/targets/0/pid + # echo 1 > kdamonds/0/contexts/0/targets/nr_targets + # echo $(pidof ) > kdamonds/0/contexts/0/targets/0/pid_target # echo on > kdamonds/0/state Files Hierarchy @@ -366,12 +366,12 @@ memory rate becomes larger than 60%, or lower than 30%". :: # echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes # cd kdamonds/0/contexts/0/schemes/0 # # set the basic access pattern and the action - # echo 4096 > access_patterns/sz/min - # echo 8192 > access_patterns/sz/max - # echo 0 > access_patterns/nr_accesses/min - # echo 5 > access_patterns/nr_accesses/max - # echo 10 > access_patterns/age/min - # echo 20 > access_patterns/age/max + # echo 4096 > access_pattern/sz/min + # echo 8192 > access_pattern/sz/max + # echo 0 > access_pattern/nr_accesses/min + # echo 5 > access_pattern/nr_accesses/max + # echo 10 > access_pattern/age/min + # echo 20 > access_pattern/age/max # echo pageout > action # # set quotas # echo 10 > quotas/ms From 6fb5096b7284f3efec25dd73ba0275a983be263e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 18 Aug 2022 15:37:43 +0800 Subject: [PATCH 518/737] mm/damon: validate if the pmd entry is present before accessing pmd_huge() is used to validate if the pmd entry is mapped by a huge page, also including the case of non-present (migration or hwpoisoned) pmd entry on arm64 or x86 architectures. 
This means that pmd_pfn() can not get the correct pfn number for a non-present pmd entry, which will cause damon_get_page() to get an incorrect page struct (also may be NULL by pfn_to_online_page()), making the access statistics incorrect. This means that the DAMON may make incorrect decision according to the incorrect statistics, for example, DAMON may can not reclaim cold page in time due to this cold page was regarded as accessed mistakenly if DAMOS_PAGEOUT operation is specified. Moreover it does not make sense that we still waste time to get the page of the non-present entry. Just treat it as not-accessed and skip it, which maintains consistency with non-present pte level entries. So add pmd entry present validation to fix the above issues. Link: https://lkml.kernel.org/r/58b1d1f5fbda7db49ca886d9ef6783e3dcbbbc98.1660805030.git.baolin.wang@linux.alibaba.com Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95d..1d16c6c796386 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -304,6 +304,11 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, if (pmd_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + if (pmd_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); @@ -431,6 +436,11 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + if (!pmd_huge(*pmd)) { spin_unlock(ptl); goto regular_page; From 4b674fd27b63737635212100b3a7e741f5fb44fd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:01 -0700 Subject: [PATCH 519/737] docs/vm/damon: remove broken reference DAMON commits listing script had typos, so it was missing three DAMON documentation fixes. For the reason, the three fixes were not backported on this tree. This and following two commits backport those. [1] https://git.kernel.org/sj/damon-hack/c/721ba248f9d5 This patch (of 3): commit 876d0aac2e3af10fbaf1c7a814840c71e470dc5c upstream. Building DAMON documents warns for a reference to nonexisting doc, as below: $ time make htmldocs [...] Documentation/vm/damon/index.rst:24: WARNING: toctree contains reference to nonexisting document 'vm/damon/plans' This fixes the warning by removing the wrong reference. Link: https://lkml.kernel.org/r/20210917123958.3819-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: SeongJae Park --- Documentation/vm/damon/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst index a2858baf3bf1d..48c0bbff98b2f 100644 --- a/Documentation/vm/damon/index.rst +++ b/Documentation/vm/damon/index.rst @@ -27,4 +27,3 @@ workloads and systems. faq design api - plans From e5a5845e9a514597d1da89ed86ac5cd00576e389 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:12 -0700 Subject: [PATCH 520/737] Docs/vm/damon: call low level monitoring primitives the operations commit 561f4fc4972443f1273f7abbd8270fd949e6584b upstream. 
Patch series "Docs/damon: Update documents for better consistency". Some of DAMON document are not properly updated for latest version. This patchset updates such parts. This patch (of 3): DAMON code calls the low level monitoring primitives implementations the monitoring operations. The documentation would have no problem at still calling those primitives implementation because there is no real difference in the concepts, but making it more consistent with the code would make it better. This commit therefore convert sentences in the doc specifically pointing the implementations of the primitives to call it monitoring operations. Link: https://lkml.kernel.org/r/20220222170100.17068-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220222170100.17068-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: SeongJae Park --- Documentation/vm/damon/design.rst | 24 ++++++++++++------------ Documentation/vm/damon/faq.rst | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst index 60b2c22d4e104..35de31c25e3a3 100644 --- a/Documentation/vm/damon/design.rst +++ b/Documentation/vm/damon/design.rst @@ -13,12 +13,13 @@ primitives that dependent on and optimized for the target address space. On the other hand, the accuracy and overhead tradeoff mechanism, which is the core of DAMON, is in the pure logic space. DAMON separates the two parts in different layers and defines its interface to allow various low level -primitives implementations configurable with the core logic. +primitives implementations configurable with the core logic. We call the low +level primitives implementations monitoring operations. Due to this separated design and the configurable interface, users can extend -DAMON for any address space by configuring the core logics with appropriate low -level primitive implementations. If appropriate one is not provided, users can -implement the primitives on their own. +DAMON for any address space by configuring the core logics with appropriate +monitoring operations. If appropriate one is not provided, users can implement +the operations on their own. For example, physical memory, virtual memory, swap space, those for specific processes, NUMA nodes, files, and backing memory devices would be supportable. @@ -26,25 +27,24 @@ Also, if some architectures or devices support special optimized access check primitives, those will be easily configurable. -Reference Implementations of Address Space Specific Primitives -============================================================== +Reference Implementations of Address Space Specific Monitoring Operations +========================================================================= -The low level primitives for the fundamental access monitoring are defined in -two parts: +The monitoring operations are defined in two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. -DAMON currently provides the implementations of the primitives for the physical +DAMON currently provides the implementations of the operations for the physical and virtual address spaces. Below two subsections describe how those work. VMA-based Target Address Range Construction ------------------------------------------- -This is only for the virtual address space primitives implementation. 
That for -the physical address space simply asks users to manually set the monitoring -target address ranges. +This is only for the virtual address space monitoring operations +implementation. That for the physical address space simply asks users to +manually set the monitoring target address ranges. Only small parts in the super-huge virtual address space of the processes are mapped to the physical memory and accessed. Thus, tracking the unmapped diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst index 11aea40eb328c..dde7e2414ee60 100644 --- a/Documentation/vm/damon/faq.rst +++ b/Documentation/vm/damon/faq.rst @@ -31,7 +31,7 @@ Does DAMON support virtual memory only? ======================================= No. The core of the DAMON is address space independent. The address space -specific low level primitive parts including monitoring target regions +specific monitoring operations including monitoring target regions constructions and actual access checks can be implemented and configured on the DAMON core by the users. In this way, DAMON users can monitor any address space with any access check technique. From cdbbf41e1fe402f12e12e6101703c00513cdf2ce Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Mar 2022 14:49:15 -0700 Subject: [PATCH 521/737] Docs/vm/damon/design: update DAMON-Idle Page Tracking interference handling commit 742cc2bfce5a94ad629a3a0bd408ef61c8be2826 upstream. In DAMON's early development stage, before it was merged into the mainline, it was first designed to work exclusively with Idle page tracking to avoid any interference between the two. Later, but still before it was merged into the mainline, because Idle page tracking is fully under the control of sysadmins, we made resolving the conflict the responsibility of sysadmins. The document was not updated for the change, though. This commit updates the document accordingly. Link: https://lkml.kernel.org/r/20220222170100.17068-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/damon/design.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst index 35de31c25e3a3..0cff6fac6b7e8 100644 --- a/Documentation/vm/damon/design.rst +++ b/Documentation/vm/damon/design.rst @@ -84,9 +84,10 @@ table having a mapping to the address. In this way, the implementations find and clear the bit(s) for next sampling target address and checks whether the bit(s) set again after one sampling period. This could disturb other kernel subsystems using the Accessed bits, namely Idle page tracking and the reclaim -logic. To avoid such disturbances, DAMON makes it mutually exclusive with Idle -page tracking and uses ``PG_idle`` and ``PG_young`` page flags to solve the -conflict with the reclaim logic, as Idle page tracking does. +logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling +the interference is the responsibility of sysadmins. However, it solves the +conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, +as Idle page tracking does. Address Space Independent Core Mechanisms From d41d66e1fb277b8e45c9353d3eb37c36cbfe4378 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Nov 2022 17:28:24 +0000 Subject: [PATCH 522/737] drivers/amazon/net/efa: Add missed object file to compile The efa driver's Makefile is missing the object for efa_gdr.c. Add it.
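In kbuild, a composite module lists its constituent objects in the <name>-y variable, so any source file left out of that list is simply never compiled into the module. The resulting drivers/amazon/net/efa/Makefile, reproduced from the diff that follows, is:

	obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o
	efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_verbs.o
	efa-$(CONFIG_SYSFS) += efa_sysfs.o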
Signed-off-by: SeongJae Park --- drivers/amazon/net/efa/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/amazon/net/efa/Makefile b/drivers/amazon/net/efa/Makefile index 9c4acbe2942a4..edf5ed12716cd 100644 --- a/drivers/amazon/net/efa/Makefile +++ b/drivers/amazon/net/efa/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o -efa-y := efa_com.o efa_com_cmd.o efa_main.o efa_verbs.o +efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_verbs.o efa-$(CONFIG_SYSFS) += efa_sysfs.o From e0ef71c45361e29cd291b122282af0da145197b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Nov 2022 17:53:31 +0000 Subject: [PATCH 523/737] drivers/amazon/net/efa: update to 2.1.0 Update EFA Amazon downstream driver to 2.1.0, to support new features of Elastic Fabric Adapter. In detail, copy '*.{c,h}' files from the EFA github repo's 'src/' directory and 'build/src/config.h' that auto-generated from the github repo on a machine running AL 5.10 kernel to 'drivers/amazon/net/efa/', and adds new EFA object files to build in the Makefile. [1] https://github.com/amzn/amzn-drivers/releases/tag/efa_linux_2.1.0 Signed-off-by: SeongJae Park --- drivers/amazon/net/efa/Makefile | 3 +- drivers/amazon/net/efa/config.h | 5 +- drivers/amazon/net/efa/efa-abi.h | 20 +- drivers/amazon/net/efa/efa.h | 48 +- drivers/amazon/net/efa/efa_admin_cmds_defs.h | 106 +++- drivers/amazon/net/efa/efa_admin_defs.h | 41 ++ drivers/amazon/net/efa/efa_com.c | 164 ++++++ drivers/amazon/net/efa/efa_com.h | 38 +- drivers/amazon/net/efa/efa_com_cmd.c | 40 +- drivers/amazon/net/efa/efa_com_cmd.h | 11 +- drivers/amazon/net/efa/efa_gdr.c | 208 ++++---- drivers/amazon/net/efa/efa_gdr.h | 47 -- drivers/amazon/net/efa/efa_io_defs.h | 289 +++++++++++ drivers/amazon/net/efa/efa_main.c | 239 +++++++-- drivers/amazon/net/efa/efa_neuron.c | 176 +++++++ drivers/amazon/net/efa/efa_p2p.c | 121 +++++ drivers/amazon/net/efa/efa_p2p.h | 57 +++ drivers/amazon/net/efa/efa_regs_defs.h | 7 +- drivers/amazon/net/efa/efa_sysfs.c | 38 +- drivers/amazon/net/efa/efa_verbs.c | 493 +++++++++++++------ drivers/amazon/net/efa/kcompat.h | 56 ++- drivers/amazon/net/efa/neuron_p2p.h | 43 ++ 22 files changed, 1850 insertions(+), 400 deletions(-) delete mode 100644 drivers/amazon/net/efa/efa_gdr.h create mode 100644 drivers/amazon/net/efa/efa_io_defs.h create mode 100644 drivers/amazon/net/efa/efa_neuron.c create mode 100644 drivers/amazon/net/efa/efa_p2p.c create mode 100644 drivers/amazon/net/efa/efa_p2p.h create mode 100644 drivers/amazon/net/efa/neuron_p2p.h diff --git a/drivers/amazon/net/efa/Makefile b/drivers/amazon/net/efa/Makefile index edf5ed12716cd..4399f594a93bf 100644 --- a/drivers/amazon/net/efa/Makefile +++ b/drivers/amazon/net/efa/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_AMAZON_EFA_INFINIBAND) += efa.o -efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_verbs.o +efa-y := efa_com.o efa_com_cmd.o efa_gdr.o efa_main.o efa_neuron.o efa_p2p.o +efa-y += efa_verbs.o efa-$(CONFIG_SYSFS) += efa_sysfs.o diff --git a/drivers/amazon/net/efa/config.h b/drivers/amazon/net/efa/config.h index 53a5eb2eeaecb..b86d2e69cd96c 100644 --- a/drivers/amazon/net/efa/config.h +++ b/drivers/amazon/net/efa/config.h @@ -17,8 +17,8 @@ #define HAVE_DEREG_MR_UDATA 1 #define HAVE_DESTROY_CQ_UDATA 1 #define HAVE_DESTROY_QP_UDATA 1 -#define HAVE_UPSTREAM_EFA 1 #define HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE 1 +#define HAVE_UPSTREAM_EFA 1 #define HAVE_IB_DEVICE_OPS_COMMON 1 #define HAVE_CQ_CORE_ALLOCATION 1 #define HAVE_IB_PORT_PHYS_STATE_LINK_UP 1 
@@ -43,3 +43,6 @@ #define HAVE_RDMA_UMEM_FOR_EACH_DMA_BLOCK 1 #define HAVE_IB_UMEM_NUM_DMA_BLOCKS 1 #define HAVE_IB_REGISTER_DEVICE_DMA_DEVICE_PARAM 1 +#define HAVE_SYSFS_EMIT 1 +#define HAVE_XARRAY 1 +#define HAVE_EFA_P2P 1 \ No newline at end of file diff --git a/drivers/amazon/net/efa/efa-abi.h b/drivers/amazon/net/efa/efa-abi.h index f89fbb5b1e8d5..163ac79556d68 100644 --- a/drivers/amazon/net/efa/efa-abi.h +++ b/drivers/amazon/net/efa/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -52,11 +52,21 @@ struct efa_ibv_alloc_pd_resp { __u8 reserved_30[2]; }; +enum { + EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL = 1 << 0, + EFA_CREATE_CQ_WITH_SGID = 1 << 1, +}; + struct efa_ibv_create_cq { __u32 comp_mask; __u32 cq_entry_size; __u16 num_sub_cqs; - __u8 reserved_50[6]; + __u8 flags; + __u8 reserved_58[5]; +}; + +enum { + EFA_CREATE_CQ_RESP_DB_OFF = 1 << 0, }; struct efa_ibv_create_cq_resp { @@ -65,7 +75,9 @@ struct efa_ibv_create_cq_resp { __aligned_u64 q_mmap_key; __aligned_u64 q_mmap_size; __u16 cq_idx; - __u8 reserved_d0[6]; + __u8 reserved_d0[2]; + __u32 db_off; + __aligned_u64 db_mmap_key; }; enum { @@ -106,6 +118,8 @@ struct efa_ibv_create_ah_resp { enum { EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, + EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, + EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, }; struct efa_ibv_ex_query_device_resp { diff --git a/drivers/amazon/net/efa/efa.h b/drivers/amazon/net/efa/efa.h index 019cbd632710b..34ccbac76b451 100644 --- a/drivers/amazon/net/efa/efa.h +++ b/drivers/amazon/net/efa/efa.h @@ -22,14 +22,14 @@ #define EFA_IRQNAME_SIZE 40 -/* 1 for AENQ + ADMIN */ -#define EFA_NUM_MSIX_VEC 1 #define EFA_MGMNT_MSIX_VEC_IDX 0 +#define EFA_COMP_EQS_VEC_BASE 1 struct efa_irq { irq_handler_t handler; void *data; u32 irqn; + u32 vector; cpumask_t affinity_hint_mask; char name[EFA_IRQNAME_SIZE]; }; @@ -63,6 +63,19 @@ struct efa_dev { struct efa_irq admin_irq; struct efa_stats stats; + + /* Array of completion EQs */ + struct efa_eq *eqs; + unsigned int neqs; + +#ifdef HAVE_XARRAY + /* Only stores CQs with interrupts enabled */ + struct xarray cqs_xa; +#else + /* If xarray isn't available keep an array of all possible CQs */ + struct efa_cq *cqs_arr[BIT(sizeof_field(struct efa_admin_create_cq_resp, + cq_idx) * 8)]; +#endif }; struct efa_ucontext { @@ -84,9 +97,9 @@ struct efa_pd { struct efa_mr { struct ib_mr ibmr; struct ib_umem *umem; -#ifdef HAVE_EFA_GDR - struct efa_nvmem *nvmem; - u64 nvmem_ticket; +#ifdef HAVE_EFA_P2P + struct efa_p2pmem *p2pmem; + u64 p2p_ticket; #endif }; @@ -96,8 +109,11 @@ struct efa_cq { dma_addr_t dma_addr; void *cpu_addr; struct rdma_user_mmap_entry *mmap_entry; + struct rdma_user_mmap_entry *db_mmap_entry; size_t size; u16 cq_idx; + /* NULL when no interrupts requested */ + struct efa_eq *eq; }; struct efa_qp { @@ -128,6 +144,11 @@ struct efa_ah { u8 id[EFA_GID_SIZE]; }; +struct efa_eq { + struct efa_com_eq eeq; + struct efa_irq irq; +}; + int efa_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *udata); @@ -164,9 +185,14 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); #else int efa_destroy_qp(struct ib_qp *ibqp); #endif -struct ib_qp *efa_create_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr 
*init_attr, - struct ib_udata *udata); +#ifdef HAVE_QP_CORE_ALLOCATION +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#else +struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +#endif #ifdef HAVE_IB_INT_DESTROY_CQ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); #elif defined(HAVE_IB_VOID_DESTROY_CQ) @@ -193,6 +219,12 @@ struct ib_cq *efa_kzalloc_cq(struct ib_device *ibdev, struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +#ifdef HAVE_MR_DMABUF +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata); +#endif #ifdef HAVE_DEREG_MR_UDATA int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); #else diff --git a/drivers/amazon/net/efa/efa_admin_cmds_defs.h b/drivers/amazon/net/efa/efa_admin_cmds_defs.h index fa38b34eddb88..d4b9226088bd0 100644 --- a/drivers/amazon/net/efa/efa_admin_cmds_defs.h +++ b/drivers/amazon/net/efa/efa_admin_cmds_defs.h @@ -28,7 +28,9 @@ enum efa_admin_aq_opcode { EFA_ADMIN_DEALLOC_PD = 15, EFA_ADMIN_ALLOC_UAR = 16, EFA_ADMIN_DEALLOC_UAR = 17, - EFA_ADMIN_MAX_OPCODE = 17, + EFA_ADMIN_CREATE_EQ = 18, + EFA_ADMIN_DESTROY_EQ = 19, + EFA_ADMIN_MAX_OPCODE = 19, }; enum efa_admin_aq_feature_id { @@ -38,6 +40,7 @@ enum efa_admin_aq_feature_id { EFA_ADMIN_QUEUE_ATTR = 4, EFA_ADMIN_HW_HINTS = 5, EFA_ADMIN_HOST_INFO = 6, + EFA_ADMIN_EVENT_QUEUE_ATTR = 7, }; /* QP transport type */ @@ -430,8 +433,8 @@ struct efa_admin_create_cq_cmd { /* * 4:0 : reserved5 - MBZ * 5 : interrupt_mode_enabled - if set, cq operates - * in interrupt mode (i.e. CQ events and MSI-X are - * generated), otherwise - polling + * in interrupt mode (i.e. CQ events and EQ elements + * are generated), otherwise - polling * 6 : virt - If set, ring base address is virtual * (IOVA returned by MR registration) * 7 : reserved6 - MBZ @@ -441,15 +444,21 @@ struct efa_admin_create_cq_cmd { /* * 4:0 : cq_entry_size_words - size of CQ entry in * 32-bit words, valid values: 4, 8. - * 7:5 : reserved7 - MBZ + * 5 : set_src_addr - If set, source address will be + * filled on RX completions from unknown senders. + * Requires 8 words CQ entry size. + * 7:6 : reserved7 - MBZ */ u8 cq_caps_2; /* completion queue depth in # of entries. must be power of 2 */ u16 cq_depth; - /* msix vector assigned to this cq */ - u32 msix_vector_idx; + /* EQ number assigned to this cq */ + u16 eqn; + + /* MBZ */ + u16 reserved; /* * CQ ring base address, virtual or physical depending on 'virt' @@ -480,6 +489,15 @@ struct efa_admin_create_cq_resp { /* actual cq depth in number of entries */ u16 cq_actual_depth; + + /* CQ doorbell address, as offset to PCIe DB BAR */ + u32 db_offset; + + /* + * 0 : db_valid - If set, doorbell offset is valid. + * Always set when interrupts are requested. 
+ */ + u32 flags; }; struct efa_admin_destroy_cq_cmd { @@ -669,6 +687,17 @@ struct efa_admin_feature_queue_attr_desc { u16 max_tx_batch; }; +struct efa_admin_event_queue_attr_desc { + /* The maximum number of event queues supported */ + u32 max_eq; + + /* Maximum number of EQEs per Event Queue */ + u32 max_eq_depth; + + /* Supported events bitmask */ + u32 event_bitmask; +}; + struct efa_admin_feature_aenq_desc { /* bitmask for AENQ groups the device can report */ u32 supported_groups; @@ -727,6 +756,8 @@ struct efa_admin_get_feature_resp { struct efa_admin_feature_queue_attr_desc queue_attr; + struct efa_admin_event_queue_attr_desc event_queue_attr; + struct efa_admin_hw_hints hw_hints; } u; }; @@ -810,6 +841,60 @@ struct efa_admin_dealloc_uar_resp { struct efa_admin_acq_common_desc acq_common_desc; }; +struct efa_admin_create_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* Size of the EQ in entries, must be power of 2 */ + u16 depth; + + /* MSI-X table entry index */ + u8 msix_vec; + + /* + * 4:0 : entry_size_words - size of EQ entry in + * 32-bit words + * 7:5 : reserved - MBZ + */ + u8 caps; + + /* EQ ring base address */ + struct efa_common_mem_addr ba; + + /* + * Enabled events on this EQ + * 0 : completion_events - Enable completion events + * 31:1 : reserved - MBZ + */ + u32 event_bitmask; + + /* MBZ */ + u32 reserved; +}; + +struct efa_admin_create_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_cmd { + struct efa_admin_aq_common_desc aq_common_descriptor; + + /* EQ number */ + u16 eqn; + + /* MBZ */ + u16 reserved; +}; + +struct efa_admin_destroy_eq_resp { + struct efa_admin_acq_common_desc acq_common_desc; +}; + /* asynchronous event notification groups */ enum efa_admin_aenq_group { EFA_ADMIN_FATAL_ERROR = 1, @@ -898,11 +983,20 @@ struct efa_admin_host_info { #define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5) #define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK BIT(6) #define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR_MASK BIT(5) + +/* create_cq_resp */ +#define EFA_ADMIN_CREATE_CQ_RESP_DB_VALID_MASK BIT(0) /* feature_device_attr_desc */ #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) +/* create_eq_cmd */ +#define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) +#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) +#define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) + /* host_info */ #define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK GENMASK(7, 0) #define EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK GENMASK(15, 8) diff --git a/drivers/amazon/net/efa/efa_admin_defs.h b/drivers/amazon/net/efa/efa_admin_defs.h index 78ff9389ae256..83f20c38a8400 100644 --- a/drivers/amazon/net/efa/efa_admin_defs.h +++ b/drivers/amazon/net/efa/efa_admin_defs.h @@ -118,6 +118,43 @@ struct efa_admin_aenq_entry { u32 inline_data_w4[12]; }; +enum efa_admin_eqe_event_type { + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION = 0, +}; + +/* Completion event */ +struct efa_admin_comp_event { + /* CQ number */ + u16 cqn; + + /* MBZ */ + u16 reserved; + + /* MBZ */ + u32 reserved2; +}; + +/* Event Queue Element */ +struct efa_admin_eqe { + /* + * 0 : phase + * 8:1 : event_type - Event type + * 31:9 : reserved - MBZ + */ + u32 common; + + /* MBZ */ + u32 reserved; + + union { + /* Event data */ + u32 event_data[2]; + + /* 
Completion Event */ + struct efa_admin_comp_event comp_event; + } u; +}; + /* aq_common_desc */ #define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK GENMASK(11, 0) #define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK BIT(0) @@ -131,4 +168,8 @@ struct efa_admin_aenq_entry { /* aenq_common_desc */ #define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0) +/* eqe */ +#define EFA_ADMIN_EQE_PHASE_MASK BIT(0) +#define EFA_ADMIN_EQE_EVENT_TYPE_MASK GENMASK(8, 1) + #endif /* _EFA_ADMIN_H_ */ diff --git a/drivers/amazon/net/efa/efa_com.c b/drivers/amazon/net/efa/efa_com.c index 22793b3959593..d0b13097a0967 100644 --- a/drivers/amazon/net/efa/efa_com.c +++ b/drivers/amazon/net/efa/efa_com.c @@ -56,11 +56,19 @@ static const char *efa_com_cmd_str(u8 cmd) EFA_CMD_STR_CASE(DEALLOC_PD); EFA_CMD_STR_CASE(ALLOC_UAR); EFA_CMD_STR_CASE(DEALLOC_UAR); + EFA_CMD_STR_CASE(CREATE_EQ); + EFA_CMD_STR_CASE(DESTROY_EQ); default: return "unknown command opcode"; } #undef EFA_CMD_STR_CASE } +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low) +{ + *addr_low = lower_32_bits(addr); + *addr_high = upper_32_bits(addr); +} + static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset) { struct efa_com_mmio_read *mmio_read = &edev->mmio_read; @@ -1085,3 +1093,159 @@ int efa_com_dev_reset(struct efa_com_dev *edev, return 0; } + +static int efa_com_create_eq(struct efa_com_dev *edev, + struct efa_com_create_eq_params *params, + struct efa_com_create_eq_result *result) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_create_eq_resp resp = {}; + struct efa_admin_create_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_CREATE_EQ; + EFA_SET(&cmd.caps, EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS, + params->entry_size_in_bytes / 4); + cmd.depth = params->depth; + cmd.event_bitmask = params->event_bitmask; + cmd.msix_vec = params->msix_vec; + + efa_com_set_dma_addr(params->dma_addr, &cmd.ba.mem_addr_high, + &cmd.ba.mem_addr_low); + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) { + ibdev_err_ratelimited(edev->efa_dev, + "Failed to create eq[%d]\n", err); + return err; + } + + result->eqn = resp.eqn; + + return 0; +} + +static void efa_com_destroy_eq(struct efa_com_dev *edev, + struct efa_com_destroy_eq_params *params) +{ + struct efa_com_admin_queue *aq = &edev->aq; + struct efa_admin_destroy_eq_resp resp = {}; + struct efa_admin_destroy_eq_cmd cmd = {}; + int err; + + cmd.aq_common_descriptor.opcode = EFA_ADMIN_DESTROY_EQ; + cmd.eqn = params->eqn; + + err = efa_com_cmd_exec(aq, + (struct efa_admin_aq_entry *)&cmd, + sizeof(cmd), + (struct efa_admin_acq_entry *)&resp, + sizeof(resp)); + if (err) + ibdev_err_ratelimited(edev->efa_dev, + "Failed to destroy EQ-%u [%d]\n", cmd.eqn, + err); +} + +static void efa_com_arm_eq(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + u32 val = 0; + + EFA_SET(&val, EFA_REGS_EQ_DB_EQN, eeq->eqn); + EFA_SET(&val, EFA_REGS_EQ_DB_ARM, 1); + + writel(val, edev->reg_bar + EFA_REGS_EQ_DB_OFF); +} + +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq) +{ + struct efa_admin_eqe *eqe; + u32 processed = 0; + u8 phase; + u32 ci; + + ci = eeq->cc & (eeq->depth - 1); + phase = eeq->phase; + eqe = &eeq->eqes[ci]; + + /* Go over all the events */ + while ((READ_ONCE(eqe->common) & EFA_ADMIN_EQE_PHASE_MASK) == phase) { + /* + * Do not read the rest of the completion entry before the + * phase bit was validated + */ + dma_rmb(); 
+ + eeq->cb(eeq, eqe); + + /* Get next event entry */ + ci++; + processed++; + + if (ci == eeq->depth) { + ci = 0; + phase = !phase; + } + + eqe = &eeq->eqes[ci]; + } + + eeq->cc += processed; + eeq->phase = phase; + efa_com_arm_eq(eeq->edev, eeq); +} + +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq) +{ + struct efa_com_destroy_eq_params params = { + .eqn = eeq->eqn, + }; + + efa_com_destroy_eq(edev, ¶ms); + dma_free_coherent(edev->dmadev, eeq->depth * sizeof(*eeq->eqes), + eeq->eqes, eeq->dma_addr); +} + +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec) +{ + struct efa_com_create_eq_params params = {}; + struct efa_com_create_eq_result result = {}; + int err; + + params.depth = depth; + params.entry_size_in_bytes = sizeof(*eeq->eqes); + EFA_SET(¶ms.event_bitmask, + EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS, 1); + params.msix_vec = msix_vec; + + eeq->eqes = dma_alloc_coherent(edev->dmadev, + params.depth * sizeof(*eeq->eqes), + ¶ms.dma_addr, GFP_KERNEL); + if (!eeq->eqes) + return -ENOMEM; + + err = efa_com_create_eq(edev, ¶ms, &result); + if (err) + goto err_free_coherent; + + eeq->eqn = result.eqn; + eeq->edev = edev; + eeq->dma_addr = params.dma_addr; + eeq->phase = 1; + eeq->depth = params.depth; + eeq->cb = cb; + efa_com_arm_eq(edev, eeq); + + return 0; + +err_free_coherent: + dma_free_coherent(edev->dmadev, params.depth * sizeof(*eeq->eqes), + eeq->eqes, params.dma_addr); + return err; +} diff --git a/drivers/amazon/net/efa/efa_com.h b/drivers/amazon/net/efa/efa_com.h index 3857ec3359f0d..bced7c3981792 100644 --- a/drivers/amazon/net/efa/efa_com.h +++ b/drivers/amazon/net/efa/efa_com.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_COM_H_ @@ -81,6 +81,9 @@ struct efa_com_admin_queue { }; struct efa_aenq_handlers; +struct efa_com_eq; +typedef void (*efa_eqe_handler)(struct efa_com_eq *eeq, + struct efa_admin_eqe *eqe); struct efa_com_aenq { struct efa_admin_aenq_entry *entries; @@ -113,6 +116,33 @@ struct efa_com_dev { struct efa_com_mmio_read mmio_read; }; +struct efa_com_eq { + struct efa_com_dev *edev; + struct efa_admin_eqe *eqes; + dma_addr_t dma_addr; + u32 cc; /* Consumer counter */ + u16 eqn; + u16 depth; + u8 phase; + efa_eqe_handler cb; +}; + +struct efa_com_create_eq_params { + dma_addr_t dma_addr; + u32 event_bitmask; + u16 depth; + u8 entry_size_in_bytes; + u8 msix_vec; +}; + +struct efa_com_create_eq_result { + u16 eqn; +}; + +struct efa_com_destroy_eq_params { + u16 eqn; +}; + typedef void (*efa_aenq_handler)(void *data, struct efa_admin_aenq_entry *aenq_e); @@ -122,9 +152,13 @@ struct efa_aenq_handlers { efa_aenq_handler unimplemented_handler; }; +void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); int efa_com_admin_init(struct efa_com_dev *edev, struct efa_aenq_handlers *aenq_handlers); void efa_com_admin_destroy(struct efa_com_dev *edev); +int efa_com_eq_init(struct efa_com_dev *edev, struct efa_com_eq *eeq, + efa_eqe_handler cb, u16 depth, u8 msix_vec); +void efa_com_eq_destroy(struct efa_com_dev *edev, struct efa_com_eq *eeq); int efa_com_dev_reset(struct efa_com_dev *edev, enum efa_regs_reset_reason_types reset_reason); void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling); @@ -141,5 +175,7 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq, struct efa_admin_acq_entry *comp, size_t comp_size); void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data); +void efa_com_eq_comp_intr_handler(struct efa_com_dev *edev, + struct efa_com_eq *eeq); #endif /* _EFA_COM_H_ */ diff --git a/drivers/amazon/net/efa/efa_com_cmd.c b/drivers/amazon/net/efa/efa_com_cmd.c index 315ab45612ad3..e107c354bc349 100644 --- a/drivers/amazon/net/efa/efa_com_cmd.c +++ b/drivers/amazon/net/efa/efa_com_cmd.c @@ -1,17 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_com.h" #include "efa_com_cmd.h" -void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low) -{ - *addr_low = lower_32_bits(addr); - *addr_high = upper_32_bits(addr); -} - int efa_com_create_qp(struct efa_com_dev *edev, struct efa_com_create_qp_params *params, struct efa_com_create_qp_result *res) @@ -157,7 +151,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, struct efa_com_create_cq_params *params, struct efa_com_create_cq_result *result) { - struct efa_admin_create_cq_resp cmd_completion; + struct efa_admin_create_cq_resp cmd_completion = {}; struct efa_admin_create_cq_cmd create_cmd = {}; struct efa_com_admin_queue *aq = &edev->aq; int err; @@ -169,7 +163,15 @@ int efa_com_create_cq(struct efa_com_dev *edev, create_cmd.cq_depth = params->cq_depth; create_cmd.num_sub_cqs = params->num_sub_cqs; create_cmd.uar = params->uarn; - + if (params->interrupt_mode_enabled) { + EFA_SET(&create_cmd.cq_caps_1, + EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED, 1); + create_cmd.eqn = params->eqn; + } + if (params->set_src_addr) { + EFA_SET(&create_cmd.cq_caps_2, + EFA_ADMIN_CREATE_CQ_CMD_SET_SRC_ADDR, 1); + } efa_com_set_dma_addr(params->dma_addr, &create_cmd.cq_ba.mem_addr_high, &create_cmd.cq_ba.mem_addr_low); @@ -187,6 +189,9 @@ int efa_com_create_cq(struct efa_com_dev *edev, result->cq_idx = cmd_completion.cq_idx; result->actual_depth = params->cq_depth; + result->db_off = cmd_completion.db_offset; + result->db_valid = EFA_GET(&cmd_completion.flags, + EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); return 0; } @@ -498,6 +503,23 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, sizeof(resp.u.network_attr.addr)); result->mtu = resp.u.network_attr.mtu; + if (efa_com_check_supported_feature_id(edev, + EFA_ADMIN_EVENT_QUEUE_ATTR)) { + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_EVENT_QUEUE_ATTR); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get event queue attributes %d\n", + err); + return err; + } + + result->max_eq = resp.u.event_queue_attr.max_eq; + result->max_eq_depth = resp.u.event_queue_attr.max_eq_depth; + result->event_bitmask = resp.u.event_queue_attr.event_bitmask; + } + return 0; } diff --git a/drivers/amazon/net/efa/efa_com_cmd.h b/drivers/amazon/net/efa/efa_com_cmd.h index eea4ebfbe6ec3..0898ad5bc3405 100644 --- a/drivers/amazon/net/efa/efa_com_cmd.h +++ b/drivers/amazon/net/efa/efa_com_cmd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_COM_CMD_H_ @@ -73,7 +73,10 @@ struct efa_com_create_cq_params { u16 cq_depth; u16 num_sub_cqs; u16 uarn; + u16 eqn; u8 entry_size_in_bytes; + u8 interrupt_mode_enabled : 1; + u8 set_src_addr : 1; }; struct efa_com_create_cq_result { @@ -81,6 +84,8 @@ struct efa_com_create_cq_result { u16 cq_idx; /* actual cq depth in # of entries */ u16 actual_depth; + u32 db_off; + bool db_valid; }; struct efa_com_destroy_cq_params { @@ -125,6 +130,9 @@ struct efa_com_get_device_attr_result { u32 max_llq_size; u32 max_rdma_size; u32 device_caps; + u32 max_eq; + u32 max_eq_depth; + u32 event_bitmask; /* EQ events bitmask */ u16 sub_cqs_per_cq; u16 max_sq_sge; u16 max_rq_sge; @@ -260,7 +268,6 @@ union efa_com_get_stats_result { struct efa_com_rdma_read_stats rdma_read_stats; }; -void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low); int efa_com_create_qp(struct efa_com_dev *edev, struct efa_com_create_qp_params *params, struct efa_com_create_qp_result *res); diff --git a/drivers/amazon/net/efa/efa_gdr.c b/drivers/amazon/net/efa/efa_gdr.c index 2bcd4bec66704..24f8a082d10d5 100644 --- a/drivers/amazon/net/efa/efa_gdr.c +++ b/drivers/amazon/net/efa/efa_gdr.c @@ -5,29 +5,41 @@ #include -#include "efa_gdr.h" +#include "efa_p2p.h" +#include "nv-p2p.h" #define GPU_PAGE_SHIFT 16 #define GPU_PAGE_SIZE BIT_ULL(GPU_PAGE_SHIFT) -static struct mutex nvmem_list_lock; -static struct list_head nvmem_list; -static atomic64_t next_nvmem_ticket; +struct efa_nvmem_ops { + int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + u64 length, struct nvidia_p2p_page_table **page_table, + void (*free_callback)(void *data), void *data); + int (*dma_map_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping **dma_mapping); + int (*put_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, + struct nvidia_p2p_page_table *page_table); + int (*dma_unmap_pages)(struct pci_dev *peer, + struct nvidia_p2p_page_table *page_table, + struct nvidia_p2p_dma_mapping *dma_mapping); +}; + +struct efa_nvmem { + struct efa_p2pmem p2pmem; + struct efa_nvmem_ops ops; + struct nvidia_p2p_page_table *pgtbl; + struct nvidia_p2p_dma_mapping *dma_mapping; + u64 virt_start; +}; -void nvmem_init(void) +static unsigned int nvmem_pgsz(struct efa_dev *dev, struct efa_p2pmem *p2pmem) { - mutex_init(&nvmem_list_lock); - INIT_LIST_HEAD(&nvmem_list); - /* - * Ideally, first ticket would be zero, but that would make callback - * data NULL which is invalid. 
- */ - atomic64_set(&next_nvmem_ticket, 1); -} + struct efa_nvmem *nvmem; -static int nvmem_pgsz(enum nvidia_p2p_page_size_type pgszt) -{ - switch (pgszt) { + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + switch (nvmem->pgtbl->page_size) { case NVIDIA_P2P_PAGE_SIZE_4KB: return SZ_4K; case NVIDIA_P2P_PAGE_SIZE_64KB: @@ -39,20 +51,7 @@ static int nvmem_pgsz(enum nvidia_p2p_page_size_type pgszt) } } -static struct efa_nvmem *ticket_to_nvmem(u64 ticket) -{ - struct efa_nvmem *nvmem; - - lockdep_assert_held(&nvmem_list_lock); - list_for_each_entry(nvmem, &nvmem_list, list) { - if (nvmem->ticket == ticket) - return nvmem; - } - - return NULL; -} - -int nvmem_get_fp(struct efa_nvmem *nvmem) +static int nvmem_get_fp(struct efa_nvmem *nvmem) { nvmem->ops.get_pages = symbol_get(nvidia_p2p_get_pages); if (!nvmem->ops.get_pages) @@ -82,7 +81,7 @@ int nvmem_get_fp(struct efa_nvmem *nvmem) return -EINVAL; } -void nvmem_put_fp(void) +static void nvmem_put_fp(void) { symbol_put(nvidia_p2p_dma_unmap_pages); symbol_put(nvidia_p2p_dma_map_pages); @@ -90,69 +89,19 @@ void nvmem_put_fp(void) symbol_put(nvidia_p2p_get_pages); } -static void nvmem_release(struct efa_dev *dev, struct efa_nvmem *nvmem) -{ - if (nvmem->dma_mapping) - nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, - nvmem->dma_mapping); - - if (nvmem->pgtbl) - nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); -} - -int nvmem_put(u64 ticket, bool in_cb) -{ - struct efa_com_dereg_mr_params params = {}; - struct efa_nvmem *nvmem; - struct efa_dev *dev; - int err; - - mutex_lock(&nvmem_list_lock); - nvmem = ticket_to_nvmem(ticket); - if (!nvmem) { - pr_debug("Ticket %llu not found in the nvmem list\n", ticket); - mutex_unlock(&nvmem_list_lock); - return 0; - } - - dev = nvmem->dev; - if (nvmem->needs_dereg) { - params.l_key = nvmem->lkey; - err = efa_com_dereg_mr(&dev->edev, ¶ms); - if (err) { - mutex_unlock(&nvmem_list_lock); - return err; - } - nvmem->needs_dereg = false; - } - - if (in_cb) { - mutex_unlock(&nvmem_list_lock); - return 0; - } - - list_del(&nvmem->list); - mutex_unlock(&nvmem_list_lock); - nvmem_release(dev, nvmem); - nvmem_put_fp(); - kfree(nvmem); - - return 0; -} - static void nvmem_free_cb(void *data) { pr_debug("Free callback ticket %llu\n", (u64)data); - nvmem_put((u64)data, true); + efa_p2p_put((u64)data, true); } static int nvmem_get_pages(struct efa_dev *dev, struct efa_nvmem *nvmem, - u64 addr, u64 size) + u64 addr, u64 size, u64 ticket) { int err; err = nvmem->ops.get_pages(0, 0, addr, size, &nvmem->pgtbl, - nvmem_free_cb, (void *)nvmem->ticket); + nvmem_free_cb, (void *)ticket); if (err) { ibdev_dbg(&dev->ibdev, "nvidia_p2p_get_pages failed %d\n", err); return err; @@ -193,11 +142,12 @@ static int nvmem_dma_map(struct efa_dev *dev, struct efa_nvmem *nvmem) return 0; } -struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, - u64 length, unsigned int *pgsz) +static struct efa_p2pmem *nvmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) { struct efa_nvmem *nvmem; u64 virt_start; + u64 virt_end; u64 pinsz; int err; @@ -205,11 +155,9 @@ struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, if (!nvmem) return NULL; - nvmem->ticket = atomic64_fetch_inc(&next_nvmem_ticket); - mr->nvmem_ticket = nvmem->ticket; - nvmem->dev = dev; virt_start = ALIGN_DOWN(start, GPU_PAGE_SIZE); - pinsz = start + length - virt_start; + virt_end = ALIGN(start + length, GPU_PAGE_SIZE); + pinsz = virt_end - virt_start; nvmem->virt_start = virt_start; err = 
nvmem_get_fp(nvmem); @@ -217,30 +165,19 @@ struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, /* Nvidia module is not loaded */ goto err_free; - err = nvmem_get_pages(dev, nvmem, virt_start, pinsz); - if (err) { - /* Most likely cpu pages */ + err = nvmem_get_pages(dev, nvmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ goto err_put_fp; - } err = nvmem_dma_map(dev, nvmem); if (err) goto err_put; - *pgsz = nvmem_pgsz(nvmem->pgtbl->page_size); - if (!*pgsz) - goto err_unmap; - - mutex_lock(&nvmem_list_lock); - list_add(&nvmem->list, &nvmem_list); - mutex_unlock(&nvmem_list_lock); - - return nvmem; + return &nvmem->p2pmem; -err_unmap: - nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, nvmem->dma_mapping); err_put: - nvmem->ops.put_pages(0, 0, start, nvmem->pgtbl); + nvmem->ops.put_pages(0, 0, virt_start, nvmem->pgtbl); err_put_fp: nvmem_put_fp(); err_free: @@ -248,14 +185,67 @@ struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, return NULL; } -int nvmem_to_page_list(struct efa_dev *dev, struct efa_nvmem *nvmem, - u64 *page_list) +static int nvmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) { - struct nvidia_p2p_dma_mapping *dma_mapping = nvmem->dma_mapping; + struct nvidia_p2p_dma_mapping *dma_mapping; + struct efa_nvmem *nvmem; int i; + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + dma_mapping = nvmem->dma_mapping; + for (i = 0; i < dma_mapping->entries; i++) page_list[i] = dma_mapping->dma_addresses[i]; return 0; } + +static void nvmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_nvmem *nvmem; + + nvmem = container_of(p2pmem, struct efa_nvmem, p2pmem); + + if (!in_cb) { + nvmem->ops.dma_unmap_pages(dev->pdev, nvmem->pgtbl, + nvmem->dma_mapping); + nvmem->ops.put_pages(0, 0, nvmem->virt_start, nvmem->pgtbl); + } + + nvmem_put_fp(); + kfree(nvmem); +} + +bool nvmem_is_supported(void) +{ + struct efa_nvmem dummynv = {}; + + if (nvmem_get_fp(&dummynv)) + return false; + nvmem_put_fp(); + + return true; +} + +struct nvmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct nvmem_provider prov = { + .p2p = { + .ops = { + .try_get = nvmem_get, + .to_page_list = nvmem_to_page_list, + .release = nvmem_release, + .get_page_size = nvmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NVMEM, + }, +}; + +const struct efa_p2p_provider *nvmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_gdr.h b/drivers/amazon/net/efa/efa_gdr.h deleted file mode 100644 index faa743c09c945..0000000000000 --- a/drivers/amazon/net/efa/efa_gdr.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ -/* - * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
- */ - -#ifndef _EFA_GDR_H_ -#define _EFA_GDR_H_ - -#include "efa.h" -#include "nv-p2p.h" - -struct efa_nvmem_ops { - int (*get_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, - u64 length, struct nvidia_p2p_page_table **page_table, - void (*free_callback)(void *data), void *data); - int (*dma_map_pages)(struct pci_dev *peer, - struct nvidia_p2p_page_table *page_table, - struct nvidia_p2p_dma_mapping **dma_mapping); - int (*put_pages)(u64 p2p_token, u32 va_space, u64 virtual_address, - struct nvidia_p2p_page_table *page_table); - int (*dma_unmap_pages)(struct pci_dev *peer, - struct nvidia_p2p_page_table *page_table, - struct nvidia_p2p_dma_mapping *dma_mapping); -}; - -struct efa_nvmem { - struct efa_dev *dev; - struct efa_nvmem_ops ops; - struct nvidia_p2p_page_table *pgtbl; - struct nvidia_p2p_dma_mapping *dma_mapping; - u64 virt_start; - u64 ticket; - u32 lkey; - bool needs_dereg; - struct list_head list; /* member of nvmem_list */ -}; - -void nvmem_init(void); -int nvmem_get_fp(struct efa_nvmem *nvmem); -void nvmem_put_fp(void); -struct efa_nvmem *nvmem_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, - u64 length, unsigned int *pgsz); -int nvmem_to_page_list(struct efa_dev *dev, struct efa_nvmem *nvmem, - u64 *page_list); -int nvmem_put(u64 ticket, bool in_cb); - -#endif /* _EFA_GDR_H_ */ diff --git a/drivers/amazon/net/efa/efa_io_defs.h b/drivers/amazon/net/efa/efa_io_defs.h new file mode 100644 index 0000000000000..17ba8984b11e9 --- /dev/null +++ b/drivers/amazon/net/efa/efa_io_defs.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_IO_H_ +#define _EFA_IO_H_ + +#define EFA_IO_TX_DESC_NUM_BUFS 2 +#define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 + +enum efa_io_queue_type { + /* send queue (of a QP) */ + EFA_IO_SEND_QUEUE = 1, + /* recv queue (of a QP) */ + EFA_IO_RECV_QUEUE = 2, +}; + +enum efa_io_send_op_type { + /* send message */ + EFA_IO_SEND = 0, + /* RDMA read */ + EFA_IO_RDMA_READ = 1, +}; + +enum efa_io_comp_status { + /* Successful completion */ + EFA_IO_COMP_STATUS_OK = 0, + /* Flushed during QP destroy */ + EFA_IO_COMP_STATUS_FLUSHED = 1, + /* Internal QP error */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, + /* Bad operation type */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Bad AH */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, + /* LKEY not registered or does not match IOVA */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, + /* Message too long */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, + /* Destination ENI is down or does not run EFA */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, + /* Connection was reset by remote side */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, + /* Bad dest QP number (QP does not exist or is in error state) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9, + /* Destination resource not ready (no WQEs posted on RQ) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10, + /* Receiver SGL too short */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, + /* Unexpected status returned by responder */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, + /* Unresponsive remote - detected locally */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, +}; + +struct efa_io_tx_meta_desc { + /* Verbs-generated Request ID */ + u16 req_id; + + /* + * control flags + * 3:0 : op_type - operation 
type: send/rdma/fast mem + * ops/etc + * 4 : has_imm - immediate_data field carries valid + * data. + * 5 : inline_msg - inline mode - inline message data + * follows this descriptor (no buffer descriptors). + * Note that it is different from immediate data + * 6 : meta_extension - Extended metadata. MBZ + * 7 : meta_desc - Indicates metadata descriptor. + * Must be set. + */ + u8 ctrl1; + + /* + * control flags + * 0 : phase + * 1 : reserved25 - MBZ + * 2 : first - Indicates first descriptor in + * transaction. Must be set. + * 3 : last - Indicates last descriptor in + * transaction. Must be set. + * 4 : comp_req - Indicates whether completion should + * be posted, after packet is transmitted. Valid only + * for the first descriptor + * 7:5 : reserved29 - MBZ + */ + u8 ctrl2; + + u16 dest_qp_num; + + /* + * If inline_msg bit is set, length of inline message in bytes, + * otherwise length of SGL (number of buffers). + */ + u16 length; + + /* + * immediate data: if has_imm is set, then this field is included + * within Tx message and reported in remote Rx completion. + */ + u32 immediate_data; + + u16 ah; + + u16 reserved; + + /* Queue key */ + u32 qkey; + + u8 reserved2[12]; +}; + +/* + * Tx queue buffer descriptor, for any transport type. Preceded by metadata + * descriptor. + */ +struct efa_io_tx_buf_desc { + /* length in bytes */ + u32 length; + + /* + * 23:0 : lkey - local memory translation key + * 31:24 : reserved - MBZ + */ + u32 lkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_remote_mem_addr { + /* length in bytes */ + u32 length; + + /* remote memory translation key */ + u32 rkey; + + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer address bits[63:32] */ + u32 buf_addr_hi; +}; + +struct efa_io_rdma_req { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; +}; + +/* + * Tx WQE, composed of tx meta descriptors followed by either tx buffer + * descriptors or inline data + */ +struct efa_io_tx_wqe { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + u8 inline_data[32]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req rdma_req; + } data; +}; + +/* + * Rx buffer descriptor; RX WQE is composed of one or more RX buffer + * descriptors. + */ +struct efa_io_rx_desc { + /* Buffer address bits[31:0] */ + u32 buf_addr_lo; + + /* Buffer Pointer[63:32] */ + u32 buf_addr_hi; + + /* Verbs-generated request id. */ + u16 req_id; + + /* Length in bytes. */ + u16 length; + + /* + * LKey and control flags + * 23:0 : lkey + * 29:24 : reserved - MBZ + * 30 : first - Indicates first descriptor in WQE + * 31 : last - Indicates last descriptor in WQE + */ + u32 lkey_ctrl; +}; + +/* Common IO completion descriptor */ +struct efa_io_cdesc_common { + /* + * verbs-generated request ID, as provided in the completed tx or rx + * descriptor. 
+ */ + u16 req_id; + + u8 status; + + /* + * flags + * 0 : phase - Phase bit + * 2:1 : q_type - enum efa_io_queue_type: send/recv + * 3 : has_imm - indicates that immediate data is + * present - for RX completions only + * 7:4 : reserved28 - MBZ + */ + u8 flags; + + /* local QP number */ + u16 qp_num; + + /* Transferred length */ + u16 length; +}; + +/* Tx completion descriptor */ +struct efa_io_tx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; +}; + +/* Rx Completion Descriptor */ +struct efa_io_rx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; + + /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ + u16 ah; + + u16 src_qp_num; + + /* Immediate data */ + u32 imm; +}; + +/* Extended Rx Completion Descriptor */ +struct efa_io_rx_cdesc_ex { + /* Base RX completion info */ + struct efa_io_rx_cdesc rx_cdesc_base; + + /* + * Valid only in case of unknown AH (0xFFFF) and CQ set_src_addr is + * enabled. + */ + u8 src_addr[16]; +}; + +/* tx_meta_desc */ +#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0) +#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4) +#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5) +#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6) +#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7) +#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0) +#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2) +#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3) +#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4) + +/* tx_buf_desc */ +#define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) + +/* rx_desc */ +#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) +#define EFA_IO_RX_DESC_FIRST_MASK BIT(30) +#define EFA_IO_RX_DESC_LAST_MASK BIT(31) + +/* cdesc_common */ +#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0) +#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) +#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) + +#endif /* _EFA_IO_H_ */ diff --git a/drivers/amazon/net/efa/efa_main.c b/drivers/amazon/net/efa/efa_main.c index 1d1b94e800cd9..34a8e13273556 100644 --- a/drivers/amazon/net/efa/efa_main.c +++ b/drivers/amazon/net/efa/efa_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include @@ -13,8 +13,8 @@ #include "efa.h" #include "efa_sysfs.h" -#ifdef HAVE_EFA_GDR -#include "efa_gdr.h" +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" #endif #ifndef HAVE_PCI_VENDOR_ID_AMAZON @@ -22,16 +22,18 @@ #endif #define PCI_DEV_ID_EFA0_VF 0xefa0 #define PCI_DEV_ID_EFA1_VF 0xefa1 +#define PCI_DEV_ID_EFA2_VF 0xefa2 static const struct pci_device_id efa_pci_tbl[] = { { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) }, { } }; -#define DRV_MODULE_VER_MAJOR 1 -#define DRV_MODULE_VER_MINOR 14 -#define DRV_MODULE_VER_SUBMINOR 1 +#define DRV_MODULE_VER_MAJOR 2 +#define DRV_MODULE_VER_MINOR 1 +#define DRV_MODULE_VER_SUBMINOR 0 #ifndef DRV_MODULE_VERSION #define DRV_MODULE_VERSION \ @@ -49,9 +51,6 @@ MODULE_AUTHOR("Amazon.com, Inc. 
or its affiliates"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION(DEVICE_NAME); MODULE_DEVICE_TABLE(pci, efa_pci_tbl); -#ifdef HAVE_EFA_GDR -MODULE_INFO(gdr, "Y"); -#endif #define EFA_REG_BAR 0 #define EFA_MEM_BAR 2 @@ -94,6 +93,51 @@ static void efa_release_bars(struct efa_dev *dev, int bars_mask) pci_release_selected_regions(pdev, release_bars); } +static void efa_process_comp_eqe(struct efa_dev *dev, struct efa_admin_eqe *eqe) +{ + u16 cqn = eqe->u.comp_event.cqn; + struct efa_cq *cq; + +#ifdef HAVE_XARRAY + /* Safe to load as we're in irq and removal calls synchronize_irq() */ + cq = xa_load(&dev->cqs_xa, cqn); +#else + cq = dev->cqs_arr[cqn]; +#endif + if (unlikely(!cq)) { + ibdev_err_ratelimited(&dev->ibdev, + "Completion event on non-existent CQ[%u]", + cqn); + return; + } + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +static void efa_process_eqe(struct efa_com_eq *eeq, struct efa_admin_eqe *eqe) +{ + struct efa_dev *dev = container_of(eeq->edev, struct efa_dev, edev); + + if (likely(EFA_GET(&eqe->common, EFA_ADMIN_EQE_EVENT_TYPE) == + EFA_ADMIN_EQE_EVENT_TYPE_COMPLETION)) + efa_process_comp_eqe(dev, eqe); + else + ibdev_err_ratelimited(&dev->ibdev, + "Unknown event type received %lu", + EFA_GET(&eqe->common, + EFA_ADMIN_EQE_EVENT_TYPE)); +} + +static irqreturn_t efa_intr_msix_comp(int irq, void *data) +{ + struct efa_eq *eq = data; + struct efa_com_dev *edev = eq->eeq.edev; + + efa_com_eq_comp_intr_handler(edev, &eq->eeq); + + return IRQ_HANDLED; +} + static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data) { struct efa_dev *dev = data; @@ -104,26 +148,43 @@ static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data) return IRQ_HANDLED; } -static int efa_request_mgmnt_irq(struct efa_dev *dev) +static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) { - struct efa_irq *irq; int err; - irq = &dev->admin_irq; err = request_irq(irq->irqn, irq->handler, 0, irq->name, irq->data); if (err) { - dev_err(&dev->pdev->dev, "Failed to request admin irq (%d)\n", - err); + dev_err(&dev->pdev->dev, "Failed to request irq %s (%d)\n", + irq->name, err); return err; } - dev_dbg(&dev->pdev->dev, "Set affinity hint of mgmnt irq to %*pbl (irq vector: %d)\n", - nr_cpumask_bits, &irq->affinity_hint_mask, irq->irqn); irq_set_affinity_hint(irq->irqn, &irq->affinity_hint_mask); return 0; } +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, + int vector) +{ + u32 cpu; + + cpu = vector - EFA_COMP_EQS_VEC_BASE; + snprintf(eq->irq.name, EFA_IRQNAME_SIZE, "efa-comp%d@pci:%s", cpu, + pci_name(dev->pdev)); + eq->irq.handler = efa_intr_msix_comp; + eq->irq.data = eq; + eq->irq.vector = vector; + eq->irq.irqn = pci_irq_vector(dev->pdev, vector); + cpumask_set_cpu(cpu, &eq->irq.affinity_hint_mask); +} + +static void efa_free_irq(struct efa_dev *dev, struct efa_irq *irq) +{ + irq_set_affinity_hint(irq->irqn, NULL); + free_irq(irq->irqn, irq->data); +} + static void efa_setup_mgmnt_irq(struct efa_dev *dev) { u32 cpu; @@ -132,8 +193,9 @@ static void efa_setup_mgmnt_irq(struct efa_dev *dev) "efa-mgmnt@pci:%s", pci_name(dev->pdev)); dev->admin_irq.handler = efa_intr_msix_mgmnt; dev->admin_irq.data = dev; - dev->admin_irq.irqn = - pci_irq_vector(dev->pdev, dev->admin_msix_vector_idx); + dev->admin_irq.vector = dev->admin_msix_vector_idx; + dev->admin_irq.irqn = pci_irq_vector(dev->pdev, + dev->admin_msix_vector_idx); cpu = cpumask_first(cpu_online_mask); cpumask_set_cpu(cpu, &dev->admin_irq.affinity_hint_mask); @@ -142,20 +204,11 @@ static void 
efa_setup_mgmnt_irq(struct efa_dev *dev) dev->admin_irq.name); } -static void efa_free_mgmnt_irq(struct efa_dev *dev) -{ - struct efa_irq *irq; - - irq = &dev->admin_irq; - irq_set_affinity_hint(irq->irqn, NULL); - free_irq(irq->irqn, irq->data); -} - static int efa_set_mgmnt_irq(struct efa_dev *dev) { efa_setup_mgmnt_irq(dev); - return efa_request_mgmnt_irq(dev); + return efa_request_irq(dev, &dev->admin_irq); } static int efa_request_doorbell_bar(struct efa_dev *dev) @@ -233,11 +286,11 @@ static void efa_set_host_info(struct efa_dev *dev) if (!hinf) return; - strlcpy(hinf->os_dist_str, utsname()->release, - min(sizeof(hinf->os_dist_str), sizeof(utsname()->release))); + strscpy(hinf->os_dist_str, utsname()->release, + sizeof(hinf->os_dist_str)); hinf->os_type = EFA_ADMIN_OS_LINUX; - strlcpy(hinf->kernel_ver_str, utsname()->version, - min(sizeof(hinf->kernel_ver_str), sizeof(utsname()->version))); + strscpy(hinf->kernel_ver_str, utsname()->version, + sizeof(hinf->kernel_ver_str)); hinf->kernel_ver = LINUX_VERSION_CODE; EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, DRV_MODULE_VER_MAJOR); @@ -256,7 +309,7 @@ static void efa_set_host_info(struct efa_dev *dev) EFA_COMMON_SPEC_VERSION_MAJOR); EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, EFA_COMMON_SPEC_VERSION_MINOR); -#ifdef HAVE_EFA_GDR +#ifdef HAVE_EFA_P2P EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 1); #endif @@ -266,6 +319,72 @@ static void efa_set_host_info(struct efa_dev *dev) dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma); } +static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) +{ + efa_com_eq_destroy(&dev->edev, &eq->eeq); + efa_free_irq(dev, &eq->irq); +} + +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +{ + int err; + + efa_setup_comp_irq(dev, eq, msix_vec); + err = efa_request_irq(dev, &eq->irq); + if (err) + return err; + + err = efa_com_eq_init(&dev->edev, &eq->eeq, efa_process_eqe, + dev->dev_attr.max_eq_depth, msix_vec); + if (err) + goto err_free_comp_irq; + + return 0; + +err_free_comp_irq: + efa_free_irq(dev, &eq->irq); + return err; +} + +static int efa_create_eqs(struct efa_dev *dev) +{ + unsigned int neqs = dev->dev_attr.max_eq; + int err; + int i; + + neqs = min_t(unsigned int, neqs, num_online_cpus()); + dev->neqs = neqs; + dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); + if (!dev->eqs) + return -ENOMEM; + + for (i = 0; i < neqs; i++) { + err = efa_create_eq(dev, &dev->eqs[i], + i + EFA_COMP_EQS_VEC_BASE); + if (err) + goto err_destroy_eqs; + } + + return 0; + +err_destroy_eqs: + for (i--; i >= 0; i--) + efa_destroy_eq(dev, &dev->eqs[i]); + kfree(dev->eqs); + + return err; +} + +static void efa_destroy_eqs(struct efa_dev *dev) +{ + int i; + + for (i = 0; i < dev->neqs; i++) + efa_destroy_eq(dev, &dev->eqs[i]); + + kfree(dev->eqs); +} + #ifdef HAVE_IB_DEV_OPS static const struct ib_device_ops efa_dev_ops = { #ifdef HAVE_IB_DEVICE_OPS_COMMON @@ -290,17 +409,23 @@ static const struct ib_device_ops efa_dev_ops = { #else .alloc_ucontext = efa_kzalloc_ucontext, #endif +#ifndef HAVE_UVERBS_CMD_MASK_NOT_NEEDED #ifdef HAVE_AH_CORE_ALLOCATION .create_ah = efa_create_ah, #else .create_ah = efa_kzalloc_ah, #endif +#endif #ifdef HAVE_CQ_CORE_ALLOCATION .create_cq = efa_create_cq, #else .create_cq = efa_kzalloc_cq, #endif +#ifdef HAVE_QP_CORE_ALLOCATION .create_qp = efa_create_qp, +#else + .create_qp = efa_kzalloc_qp, +#endif #ifdef HAVE_UVERBS_CMD_MASK_NOT_NEEDED .create_user_ah = efa_create_ah, #endif @@ -332,6 +457,9 @@ static const 
struct ib_device_ops efa_dev_ops = { .query_port = efa_query_port, .query_qp = efa_query_qp, .reg_user_mr = efa_reg_mr, +#ifdef HAVE_MR_DMABUF + .reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf, +#endif #ifndef HAVE_NO_KVERBS_DRIVERS .req_notify_cq = efa_req_notify_cq, #endif @@ -345,6 +473,9 @@ static const struct ib_device_ops efa_dev_ops = { #ifdef HAVE_PD_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), #endif +#ifdef HAVE_QP_CORE_ALLOCATION + INIT_RDMA_OBJ_SIZE(ib_qp, efa_qp, ibqp), +#endif #ifdef HAVE_UCONTEXT_CORE_ALLOCATION INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), #endif @@ -379,11 +510,15 @@ static int efa_ib_device_add(struct efa_dev *dev) if (err) goto err_release_doorbell_bar; + err = efa_create_eqs(dev); + if (err) + goto err_release_doorbell_bar; + efa_set_host_info(dev); dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; dev->ibdev.phys_port_cnt = 1; - dev->ibdev.num_comp_vectors = 1; + dev->ibdev.num_comp_vectors = dev->neqs ?: 1; #ifdef HAVE_DEV_PARENT dev->ibdev.dev.parent = &pdev->dev; #else @@ -430,7 +565,7 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.alloc_ucontext = efa_kzalloc_ucontext; dev->ibdev.create_ah = efa_kzalloc_ah; dev->ibdev.create_cq = efa_kzalloc_cq; - dev->ibdev.create_qp = efa_create_qp; + dev->ibdev.create_qp = efa_kzalloc_qp; dev->ibdev.dealloc_pd = efa_dealloc_pd; dev->ibdev.dealloc_ucontext = efa_dealloc_ucontext; dev->ibdev.dereg_mr = efa_dereg_mr; @@ -462,18 +597,20 @@ static int efa_ib_device_add(struct efa_dev *dev) #elif defined(HAVE_IB_REGISTER_DEVICE_NAME_PARAM) err = ib_register_device(&dev->ibdev, "efa_%d", NULL); #else - strlcpy(dev->ibdev.name, "efa_%d", + strscpy(dev->ibdev.name, "efa_%d", sizeof(dev->ibdev.name)); err = ib_register_device(&dev->ibdev, NULL); #endif if (err) - goto err_release_doorbell_bar; + goto err_destroy_eqs; ibdev_info(&dev->ibdev, "IB device registered\n"); return 0; +err_destroy_eqs: + efa_destroy_eqs(dev); err_release_doorbell_bar: efa_release_doorbell_bar(dev); return err; @@ -481,9 +618,10 @@ static int efa_ib_device_add(struct efa_dev *dev) static void efa_ib_device_remove(struct efa_dev *dev) { - efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); ibdev_info(&dev->ibdev, "Unregister ib device\n"); ib_unregister_device(&dev->ibdev); + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); efa_release_doorbell_bar(dev); } @@ -496,8 +634,12 @@ static int efa_enable_msix(struct efa_dev *dev) { int msix_vecs, irq_num; - /* Reserve the max msix vectors we might need */ - msix_vecs = EFA_NUM_MSIX_VEC; + /* + * Reserve the max msix vectors we might need, one vector is reserved + * for admin. 
+ */ + msix_vecs = min_t(int, pci_msix_vec_count(dev->pdev), + num_online_cpus() + 1); dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", msix_vecs); @@ -546,6 +688,7 @@ static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev) dev_err(&pdev->dev, "dma_set_mask_and_coherent failed %d\n", err); return err; } + dma_set_max_seg_size(&pdev->dev, UINT_MAX); return 0; } @@ -581,6 +724,11 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) edev->efa_dev = dev; edev->dmadev = &pdev->dev; dev->pdev = pdev; +#ifdef HAVE_XARRAY + xa_init(&dev->cqs_xa); +#else + memset(dev->cqs_arr, 0, sizeof(dev->cqs_arr)); +#endif bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); @@ -642,7 +790,7 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) err_admin_destroy: efa_com_admin_destroy(edev); err_free_mgmnt_irq: - efa_free_mgmnt_irq(dev); + efa_free_irq(dev, &dev->admin_irq); err_disable_msix: efa_disable_msix(dev); err_reg_read_destroy: @@ -666,11 +814,14 @@ static void efa_remove_device(struct pci_dev *pdev) edev = &dev->edev; efa_sysfs_destroy(dev); efa_com_admin_destroy(edev); - efa_free_mgmnt_irq(dev); + efa_free_irq(dev, &dev->admin_irq); efa_disable_msix(dev); efa_com_mmio_reg_read_destroy(edev); devm_iounmap(&pdev->dev, edev->reg_bar); efa_release_bars(dev, EFA_BASE_BAR_MASK); +#ifdef HAVE_XARRAY + xa_destroy(&dev->cqs_xa); +#endif ib_dealloc_device(&dev->ibdev); pci_disable_device(pdev); } @@ -722,8 +873,8 @@ static int __init efa_init(void) return err; } -#ifdef HAVE_EFA_GDR - nvmem_init(); +#ifdef HAVE_EFA_P2P + efa_p2p_init(); #endif return 0; diff --git a/drivers/amazon/net/efa/efa_neuron.c b/drivers/amazon/net/efa/efa_neuron.c new file mode 100644 index 0000000000000..ec2644e3079c4 --- /dev/null +++ b/drivers/amazon/net/efa/efa_neuron.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include + +#include "efa_p2p.h" +#include "neuron_p2p.h" + +#define NEURON_PAGE_SHIFT 12 +#define NEURON_PAGE_SIZE BIT_ULL(NEURON_PAGE_SHIFT) + +struct efa_neuronmem_ops { + int (*register_va)(u64 virtual_address, u64 length, + struct neuron_p2p_va_info **vainfo, + void (*free_callback)(void *data), + void *data); + int (*unregister_va)(struct neuron_p2p_va_info *vainfo); +}; + +struct efa_neuronmem { + struct efa_p2pmem p2pmem; + struct efa_neuronmem_ops ops; + struct neuron_p2p_va_info *va_info; + u64 virt_start; +}; + +static unsigned int neuronmem_pgsz(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + struct efa_neuronmem *neuronmem; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + return BIT(neuronmem->va_info->shift_page_size); +} + +static int neuronmem_get_fp(struct efa_neuronmem *neuronmem) +{ + neuronmem->ops.register_va = symbol_get(neuron_p2p_register_va); + if (!neuronmem->ops.register_va) + goto err_out; + + neuronmem->ops.unregister_va = symbol_get(neuron_p2p_unregister_va); + if (!neuronmem->ops.unregister_va) + goto err_put_register_va; + + return 0; + +err_put_register_va: + symbol_put(neuron_p2p_register_va); +err_out: + return -EINVAL; +} + +static void neuronmem_put_fp(void) +{ + symbol_put(neuron_p2p_unregister_va); + symbol_put(neuron_p2p_register_va); +} + +static void neuronmem_free_cb(void *data) +{ + pr_debug("Free callback ticket %llu\n", (u64)data); + efa_p2p_put((u64)data, true); +} + +static int neuronmem_register_va(struct efa_dev *dev, struct efa_neuronmem *neuronmem, + u64 addr, u64 size, u64 ticket) +{ + int err; + + err = neuronmem->ops.register_va(addr, size, &neuronmem->va_info, + neuronmem_free_cb, (void *)ticket); + if (err) { + ibdev_dbg(&dev->ibdev, "neuron_p2p_register_va failed %d\n", err); + return err; + } + + return 0; +} + +static struct efa_p2pmem *neuronmem_get(struct efa_dev *dev, u64 ticket, u64 start, + u64 length) +{ + struct efa_neuronmem *neuronmem; + u64 virt_start; + u64 virt_end; + u64 pinsz; + int err; + + neuronmem = kzalloc(sizeof(*neuronmem), GFP_KERNEL); + if (!neuronmem) + return NULL; + + virt_start = ALIGN_DOWN(start, NEURON_PAGE_SIZE); + virt_end = ALIGN(start + length, NEURON_PAGE_SIZE); + pinsz = virt_end - virt_start; + neuronmem->virt_start = virt_start; + + err = neuronmem_get_fp(neuronmem); + if (err) + /* Neuron module is not loaded */ + goto err_free; + + err = neuronmem_register_va(dev, neuronmem, virt_start, pinsz, ticket); + if (err) + /* Most likely not our pages */ + goto err_put_fp; + + return &neuronmem->p2pmem; + +err_put_fp: + neuronmem_put_fp(); +err_free: + kfree(neuronmem); + return NULL; +} + +static int neuronmem_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + struct neuron_p2p_page_info *pg_info; + struct neuron_p2p_va_info *va_info; + struct efa_neuronmem *neuronmem; + int ent_idx, pa_idx; + int pg_idx = 0; + u64 pa; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + va_info = neuronmem->va_info; + + for (ent_idx = 0; ent_idx < va_info->entries; ent_idx++) { + pg_info = va_info->page_info + ent_idx; + pa = pg_info->physical_address; + for (pa_idx = 0; pa_idx < pg_info->page_count; pa_idx++) { + page_list[pg_idx++] = pa; + pa += BIT(va_info->shift_page_size); + } + } + + return 0; +} + +static void neuronmem_release(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb) +{ + struct efa_neuronmem *neuronmem; + + neuronmem = container_of(p2pmem, struct efa_neuronmem, p2pmem); + + 
neuronmem->ops.unregister_va(neuronmem->va_info); + neuronmem_put_fp(); + kfree(neuronmem); +} + +struct neuronmem_provider { + struct efa_p2p_provider p2p; +}; + +static const struct neuronmem_provider prov = { + .p2p = { + .ops = { + .try_get = neuronmem_get, + .to_page_list = neuronmem_to_page_list, + .release = neuronmem_release, + .get_page_size = neuronmem_pgsz, + }, + .type = EFA_P2P_PROVIDER_NEURON, + }, +}; + +const struct efa_p2p_provider *neuronmem_get_provider(void) +{ + return &prov.p2p; +} diff --git a/drivers/amazon/net/efa/efa_p2p.c b/drivers/amazon/net/efa/efa_p2p.c new file mode 100644 index 0000000000000..9daf101288f43 --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "efa_p2p.h" + +static struct mutex p2p_list_lock; +static struct list_head p2p_list; +static atomic64_t next_p2p_ticket; + +static const struct efa_p2p_provider *prov_arr[EFA_P2P_PROVIDER_MAX]; + +/* Register all providers here */ +static void p2p_providers_init(void) +{ + prov_arr[EFA_P2P_PROVIDER_NVMEM] = nvmem_get_provider(); + prov_arr[EFA_P2P_PROVIDER_NEURON] = neuronmem_get_provider(); +} + +void efa_p2p_init(void) +{ + mutex_init(&p2p_list_lock); + INIT_LIST_HEAD(&p2p_list); + /* + * Ideally, first ticket would be zero, but that would make callback + * data NULL which is invalid. + */ + atomic64_set(&next_p2p_ticket, 1); + + p2p_providers_init(); +} + +static struct efa_p2pmem *ticket_to_p2p(u64 ticket) +{ + struct efa_p2pmem *p2pmem; + + lockdep_assert_held(&p2p_list_lock); + list_for_each_entry(p2pmem, &p2p_list, list) { + if (p2pmem->ticket == ticket) + return p2pmem; + } + + return NULL; +} + +int efa_p2p_put(u64 ticket, bool in_cb) +{ + struct efa_com_dereg_mr_params params = {}; + struct efa_p2pmem *p2pmem; + struct efa_dev *dev; + int err; + + mutex_lock(&p2p_list_lock); + p2pmem = ticket_to_p2p(ticket); + if (!p2pmem) { + pr_debug("Ticket %llu not found in the p2pmem list\n", ticket); + mutex_unlock(&p2p_list_lock); + return 0; + } + + dev = p2pmem->dev; + if (p2pmem->needs_dereg) { + params.l_key = p2pmem->lkey; + err = efa_com_dereg_mr(&dev->edev, ¶ms); + if (err) { + mutex_unlock(&p2p_list_lock); + return err; + } + p2pmem->needs_dereg = false; + } + + list_del(&p2pmem->list); + mutex_unlock(&p2p_list_lock); + p2pmem->prov->ops.release(dev, p2pmem, in_cb); + + return 0; +} + +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length) +{ + const struct efa_p2p_provider *prov; + struct efa_p2pmem *p2pmem; + u64 ticket; + int i; + + ticket = atomic64_fetch_inc(&next_p2p_ticket); + for (i = 0; i < EFA_P2P_PROVIDER_MAX; i++) { + prov = prov_arr[i]; + p2pmem = prov->ops.try_get(dev, ticket, start, length); + if (p2pmem) + break; + } + if (!p2pmem) + /* No provider was found, most likely cpu pages */ + return NULL; + + p2pmem->dev = dev; + p2pmem->ticket = ticket; + p2pmem->prov = prov; + mr->p2p_ticket = p2pmem->ticket; + + mutex_lock(&p2p_list_lock); + list_add(&p2pmem->list, &p2p_list); + mutex_unlock(&p2p_list_lock); + + return p2pmem; +} + +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list) +{ + return p2pmem->prov->ops.to_page_list(dev, p2pmem, page_list); +} + +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem) +{ + return p2pmem->prov->ops.get_page_size(dev, p2pmem); +} diff --git 
a/drivers/amazon/net/efa/efa_p2p.h b/drivers/amazon/net/efa/efa_p2p.h new file mode 100644 index 0000000000000..89ee7a9935c11 --- /dev/null +++ b/drivers/amazon/net/efa/efa_p2p.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef _EFA_P2P_H_ +#define _EFA_P2P_H_ + +#include "efa.h" + +struct efa_p2p_ops { + struct efa_p2pmem *(*try_get)(struct efa_dev *dev, u64 ticket, u64 start, + u64 length); + int (*to_page_list)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list); + void (*release)(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + bool in_cb); + unsigned int (*get_page_size)(struct efa_dev *dev, + struct efa_p2pmem *p2pmem); +}; + +enum efa_p2p_prov { + EFA_P2P_PROVIDER_NVMEM, + EFA_P2P_PROVIDER_NEURON, + EFA_P2P_PROVIDER_MAX, +}; + +struct efa_p2p_provider { + const struct efa_p2p_ops ops; + enum efa_p2p_prov type; +}; + +struct efa_p2pmem { + struct efa_dev *dev; + const struct efa_p2p_provider *prov; + u64 ticket; + u32 lkey; + bool needs_dereg; + struct list_head list; /* member of efa_p2p_list */ +}; + +void efa_p2p_init(void); +struct efa_p2pmem *efa_p2p_get(struct efa_dev *dev, struct efa_mr *mr, u64 start, + u64 length); +unsigned int efa_p2p_get_page_size(struct efa_dev *dev, + struct efa_p2pmem *p2pmem); +int efa_p2p_to_page_list(struct efa_dev *dev, struct efa_p2pmem *p2pmem, + u64 *page_list); +int efa_p2p_put(u64 ticket, bool in_cb); + +/* Provider specific stuff go here */ +const struct efa_p2p_provider *nvmem_get_provider(void); +bool nvmem_is_supported(void); + +const struct efa_p2p_provider *neuronmem_get_provider(void); + +#endif /* _EFA_P2P_H_ */ diff --git a/drivers/amazon/net/efa/efa_regs_defs.h b/drivers/amazon/net/efa/efa_regs_defs.h index 4017982fe13b0..714ae62588004 100644 --- a/drivers/amazon/net/efa/efa_regs_defs.h +++ b/drivers/amazon/net/efa/efa_regs_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_REGS_H_ @@ -42,6 +42,7 @@ enum efa_regs_reset_reason_types { #define EFA_REGS_MMIO_REG_READ_OFF 0x5c #define EFA_REGS_MMIO_RESP_LO_OFF 0x60 #define EFA_REGS_MMIO_RESP_HI_OFF 0x64 +#define EFA_REGS_EQ_DB_OFF 0x68 /* version register */ #define EFA_REGS_VERSION_MINOR_VERSION_MASK 0xff @@ -93,4 +94,8 @@ enum efa_regs_reset_reason_types { #define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK 0xffff #define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK 0xffff0000 +/* eq_db register */ +#define EFA_REGS_EQ_DB_EQN_MASK 0xffff +#define EFA_REGS_EQ_DB_ARM_MASK 0x80000000 + #endif /* _EFA_REGS_H_ */ diff --git a/drivers/amazon/net/efa/efa_sysfs.c b/drivers/amazon/net/efa/efa_sysfs.c index c9026c9cfff0f..8e8b2bd210db1 100644 --- a/drivers/amazon/net/efa/efa_sysfs.c +++ b/drivers/amazon/net/efa/efa_sysfs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_sysfs.h" @@ -9,19 +9,35 @@ #include #include -#ifdef HAVE_EFA_GDR -#include "efa_gdr.h" +#ifndef HAVE_SYSFS_EMIT +#include + +static int sysfs_emit(char *buf, const char *fmt, ...) 
+{ + va_list args; + int len; + + if (!buf) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf, PAGE_SIZE, fmt, args); + va_end(args); + + return len; +} +#endif + +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" static ssize_t gdr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct efa_nvmem dummynv = {}; - - if (nvmem_get_fp(&dummynv)) - return sprintf(buf, "0\n"); - nvmem_put_fp(); + if (nvmem_is_supported()) + return sysfs_emit(buf, "1\n"); - return sprintf(buf, "1\n"); + return sysfs_emit(buf, "0\n"); } static DEVICE_ATTR_RO(gdr); @@ -29,7 +45,7 @@ static DEVICE_ATTR_RO(gdr); int efa_sysfs_init(struct efa_dev *dev) { -#ifdef HAVE_EFA_GDR +#ifdef HAVE_EFA_P2P struct device *device = &dev->pdev->dev; if (device_create_file(device, &dev_attr_gdr)) @@ -40,7 +56,7 @@ int efa_sysfs_init(struct efa_dev *dev) void efa_sysfs_destroy(struct efa_dev *dev) { -#ifdef HAVE_EFA_GDR +#ifdef HAVE_EFA_P2P device_remove_file(&dev->pdev->dev, &dev_attr_gdr); #endif } diff --git a/drivers/amazon/net/efa/efa_verbs.c b/drivers/amazon/net/efa/efa_verbs.c index b27c2f5b0fd2a..c9535ee90108b 100644 --- a/drivers/amazon/net/efa/efa_verbs.c +++ b/drivers/amazon/net/efa/efa_verbs.c @@ -1,9 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "kcompat.h" +#ifdef HAVE_MR_DMABUF +#include +#include +#endif #include #include @@ -16,9 +20,10 @@ #endif #include "efa.h" +#include "efa_io_defs.h" -#ifdef HAVE_EFA_GDR -#include "efa_gdr.h" +#ifdef HAVE_EFA_P2P +#include "efa_p2p.h" #endif enum { @@ -70,13 +75,23 @@ struct efa_user_mmap_entry { op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \ #define EFA_STATS_ENUM(ename, name) ename, -#define EFA_STATS_STR(ename, name) [ename] = name, +#ifdef HAVE_STAT_DESC_STRUCT +#define EFA_STATS_STR(ename, nam) \ + [ename].name = nam, +#else +#define EFA_STATS_STR(ename, nam) \ + [ename] = nam, +#endif enum efa_hw_device_stats { EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM) }; -static const char *const efa_device_stats_names[] = { +#ifdef HAVE_STAT_DESC_STRUCT +static const struct rdma_stat_desc efa_device_stats_descs[] = { +#else +static const char *const efa_device_stats_descs[] = { +#endif EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR) }; @@ -84,7 +99,11 @@ enum efa_hw_port_stats { EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM) }; -static const char *const efa_port_stats_names[] = { +#ifdef HAVE_STAT_DESC_STRUCT +static const struct rdma_stat_desc efa_port_stats_descs[] = { +#else +static const char *const efa_port_stats_descs[] = { +#endif EFA_DEFINE_PORT_STATS(EFA_STATS_STR) }; @@ -325,12 +344,16 @@ int efa_query_device(struct ib_device *ibdev, resp.max_rq_wr = dev_attr->max_rq_depth; resp.max_rdma_size = dev_attr->max_rdma_size; + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID; if (EFA_DEV_CAP(dev, RDMA_READ)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ; if (EFA_DEV_CAP(dev, RNR_RETRY)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY; + if (dev->neqs) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); if (err) { @@ -601,7 +624,9 @@ int efa_destroy_qp(struct ib_qp *ibqp) qp->rq_size, DMA_TO_DEVICE); } +#ifndef HAVE_QP_CORE_ALLOCATION kfree(qp); +#endif return 0; } @@ -802,17 +827,16 @@ static int efa_qp_validate_attr(struct efa_dev *dev, return 0; } -struct ib_qp *efa_create_qp(struct 
ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) +int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { struct efa_com_create_qp_params create_qp_params = {}; struct efa_com_create_qp_result create_qp_resp; - struct efa_dev *dev = to_edev(ibpd->device); + struct efa_dev *dev = to_edev(ibqp->device); struct efa_ibv_create_qp_resp resp = {}; struct efa_ibv_create_qp cmd = {}; + struct efa_qp *qp = to_eqp(ibqp); struct efa_ucontext *ucontext; - struct efa_qp *qp; int err; #ifndef HAVE_NO_KVERBS_DRIVERS @@ -827,8 +851,8 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, ibucontext); #else - ucontext = ibpd->uobject ? to_eucontext(ibpd->uobject->context) : - NULL; + ucontext = ibqp->pd->uobject ? to_eucontext(ibqp->pd->uobject->context) : + NULL; #endif err = efa_qp_validate_cap(dev, init_attr); @@ -870,14 +894,8 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, goto err_out; } - qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) { - err = -ENOMEM; - goto err_out; - } - create_qp_params.uarn = ucontext->uarn; - create_qp_params.pd = to_epd(ibpd)->pdn; + create_qp_params.pd = to_epd(ibqp->pd)->pdn; if (init_attr->qp_type == IB_QPT_UD) { create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD; @@ -888,7 +906,7 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, "Unsupported qp type %d driver qp type %d\n", init_attr->qp_type, cmd.driver_qp_type); err = -EOPNOTSUPP; - goto err_free_qp; + goto err_out; } ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n", @@ -906,7 +924,7 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, qp->rq_size, DMA_TO_DEVICE); if (!qp->rq_cpu_addr) { err = -ENOMEM; - goto err_free_qp; + goto err_out; } ibdev_dbg(&dev->ibdev, @@ -933,7 +951,6 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, qp->qp_handle = create_qp_resp.qp_handle; qp->ibqp.qp_num = create_qp_resp.qp_num; - qp->ibqp.qp_type = init_attr->qp_type; qp->max_send_wr = init_attr->cap.max_send_wr; qp->max_recv_wr = init_attr->cap.max_recv_wr; qp->max_send_sge = init_attr->cap.max_send_sge; @@ -953,7 +970,7 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num); - return &qp->ibqp; + return 0; err_remove_mmap_entries: efa_qp_user_mmap_entries_remove(qp); @@ -963,12 +980,42 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, if (qp->rq_size) efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr, qp->rq_size, DMA_TO_DEVICE); +err_out: + atomic64_inc(&dev->stats.create_qp_err); + return err; +} + +#ifndef HAVE_QP_CORE_ALLOCATION +struct ib_qp *efa_kzalloc_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_qp *qp; + int err; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + atomic64_inc(&dev->stats.create_qp_err); + err = -ENOMEM; + goto err_out; + } + + qp->ibqp.device = ibpd->device; + qp->ibqp.pd = ibpd; + qp->ibqp.qp_type = init_attr->qp_type; + err = efa_create_qp(&qp->ibqp, init_attr, udata); + if (err) + goto err_free_qp; + + return &qp->ibqp; + err_free_qp: kfree(qp); err_out: - atomic64_inc(&dev->stats.create_qp_err); return ERR_PTR(err); } +#endif static const struct { int valid; @@ -1216,6 +1263,12 @@ static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) return efa_com_destroy_cq(&dev->edev, ¶ms); } +static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq) +{ + 
rdma_user_mmap_entry_remove(cq->db_mmap_entry); + rdma_user_mmap_entry_remove(cq->mmap_entry); +} + #if defined(HAVE_IB_VOID_DESTROY_CQ) || defined(HAVE_IB_INT_DESTROY_CQ) #ifdef HAVE_IB_INT_DESTROY_CQ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) @@ -1230,8 +1283,16 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); - rdma_user_mmap_entry_remove(cq->mmap_entry); + efa_cq_user_mmap_entries_remove(cq); efa_destroy_cq_idx(dev, cq->cq_idx); + if (cq->eq) { +#ifdef HAVE_XARRAY + xa_erase(&dev->cqs_xa, cq->cq_idx); +#else + dev->cqs_arr[cq->cq_idx] = NULL; +#endif + synchronize_irq(cq->eq->irq.irqn); + } efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); #ifndef HAVE_CQ_CORE_ALLOCATION @@ -1256,11 +1317,19 @@ int efa_destroy_cq(struct ib_cq *ibcq) "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); - rdma_user_mmap_entry_remove(cq->mmap_entry); + efa_cq_user_mmap_entries_remove(cq); err = efa_destroy_cq_idx(dev, cq->cq_idx); if (err) return err; + if (cq->eq) { +#ifdef HAVE_XARRAY + xa_erase(&dev->cqs_xa, cq->cq_idx); +#else + dev->cqs_arr[cq->cq_idx] = NULL; +#endif + synchronize_irq(cq->eq->irq.irqn); + } efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); @@ -1269,8 +1338,14 @@ int efa_destroy_cq(struct ib_cq *ibcq) } #endif +static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec) +{ + return &dev->eqs[vec]; +} + static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, - struct efa_ibv_create_cq_resp *resp) + struct efa_ibv_create_cq_resp *resp, + bool db_valid) { resp->q_mmap_size = cq->size; cq->mmap_entry = efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, @@ -1280,6 +1355,21 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, if (!cq->mmap_entry) return -ENOMEM; + if (db_valid) { + cq->db_mmap_entry = + efa_user_mmap_entry_insert(&cq->ucontext->ibucontext, + dev->db_bar_addr + resp->db_off, + PAGE_SIZE, EFA_MMAP_IO_NC, + &resp->db_mmap_key); + if (!cq->db_mmap_entry) { + rdma_user_mmap_entry_remove(cq->mmap_entry); + return -ENOMEM; + } + + resp->db_off &= ~PAGE_MASK; + resp->comp_mask |= EFA_CREATE_CQ_RESP_DB_OFF; + } + return 0; } @@ -1292,14 +1382,15 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, #else struct efa_ucontext *ucontext = to_ecq(ibcq)->ucontext; #endif + struct efa_com_create_cq_params params = {}; struct efa_ibv_create_cq_resp resp = {}; - struct efa_com_create_cq_params params; struct efa_com_create_cq_result result; struct ib_device *ibdev = ibcq->device; struct efa_dev *dev = to_edev(ibdev); struct efa_ibv_create_cq cmd = {}; struct efa_cq *cq = to_ecq(ibcq); int entries = attr->cqe; + bool set_src_addr; int err; ibdev_dbg(ibdev, "create_cq entries %d\n", entries); @@ -1346,14 +1437,16 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_out; } - if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) { + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_58)) { ibdev_dbg(ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; goto err_out; } - if (!cmd.cq_entry_size) { + set_src_addr = !!(cmd.flags & EFA_CREATE_CQ_WITH_SGID); + if ((cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc_ex)) && + (set_src_addr || cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc))) { 
ibdev_dbg(ibdev, "Invalid entry size [%u]\n", cmd.cq_entry_size); err = -EINVAL; @@ -1382,29 +1475,50 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, params.dma_addr = cq->dma_addr; params.entry_size_in_bytes = cmd.cq_entry_size; params.num_sub_cqs = cmd.num_sub_cqs; + params.set_src_addr = set_src_addr; + if (cmd.flags & EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL) { + cq->eq = efa_vec2eq(dev, attr->comp_vector); + params.eqn = cq->eq->eeq.eqn; + params.interrupt_mode_enabled = true; + } + err = efa_com_create_cq(&dev->edev, ¶ms, &result); if (err) goto err_free_mapped; + resp.db_off = result.db_off; resp.cq_idx = result.cq_idx; cq->cq_idx = result.cq_idx; cq->ibcq.cqe = result.actual_depth; WARN_ON_ONCE(entries != result.actual_depth); - err = cq_mmap_entries_setup(dev, cq, &resp); + err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid); if (err) { ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", cq->cq_idx); goto err_destroy_cq; } + if (cq->eq) { +#ifdef HAVE_XARRAY + err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL)); +#else + dev->cqs_arr[cq->cq_idx] = cq; +#endif + if (err) { + ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n", + cq->cq_idx); + goto err_remove_mmap; + } + } + if (udata->outlen) { err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); if (err) { ibdev_dbg(ibdev, "Failed to copy udata for create_cq\n"); - goto err_remove_mmap; + goto err_xa_erase; } } @@ -1413,8 +1527,15 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return 0; +err_xa_erase: + if (cq->eq) +#ifdef HAVE_XARRAY + xa_erase(&dev->cqs_xa, cq->cq_idx); +#else + dev->cqs_arr[cq->cq_idx] = NULL; +#endif err_remove_mmap: - rdma_user_mmap_entry_remove(cq->mmap_entry); + efa_cq_user_mmap_entries_remove(cq); err_destroy_cq: efa_destroy_cq_idx(dev, cq->cq_idx); err_free_mapped: @@ -1797,7 +1918,7 @@ static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl) /* create a page buffer list from a mapped user memory region */ static int pbl_create(struct efa_dev *dev, struct pbl_context *pbl, -#ifdef HAVE_EFA_GDR +#ifdef HAVE_EFA_P2P struct efa_mr *mr, #else struct ib_umem *umem, @@ -1814,12 +1935,12 @@ static int pbl_create(struct efa_dev *dev, if (is_vmalloc_addr(pbl->pbl_buf)) { pbl->physically_continuous = 0; -#ifdef HAVE_EFA_GDR - if (mr->umem) +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, hp_shift); - else - err = nvmem_to_page_list(dev, mr->nvmem, pbl->pbl_buf); #else err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, hp_shift); @@ -1832,12 +1953,12 @@ static int pbl_create(struct efa_dev *dev, goto err_free; } else { pbl->physically_continuous = 1; -#ifdef HAVE_EFA_GDR - if (mr->umem) +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, pbl->pbl_buf); + else err = umem_to_page_list(dev, mr->umem, pbl->pbl_buf, hp_cnt, hp_shift); - else - err = nvmem_to_page_list(dev, mr->nvmem, pbl->pbl_buf); #else err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, hp_shift); @@ -1878,13 +1999,13 @@ static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, int err; params->inline_pbl = 1; -#ifdef HAVE_EFA_GDR - if (mr->umem) +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + err = efa_p2p_to_page_list(dev, mr->p2pmem, + params->pbl.inline_pbl_array); + else err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, 
params->page_num, params->page_shift); - else - err = nvmem_to_page_list(dev, mr->nvmem, - params->pbl.inline_pbl_array); #else err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array, params->page_num, params->page_shift); @@ -1905,7 +2026,7 @@ static int efa_create_pbl(struct efa_dev *dev, { int err; -#ifdef HAVE_EFA_GDR +#ifdef HAVE_EFA_P2P err = pbl_create(dev, pbl, mr, params->page_num, params->page_shift); #else @@ -1985,25 +2106,17 @@ static unsigned long efa_cont_pages(struct ib_umem *umem, } #endif -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, - u64 virt_addr, int access_flags, - struct ib_udata *udata) +static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, + struct ib_udata *udata) { struct efa_dev *dev = to_edev(ibpd->device); - struct efa_com_reg_mr_params params = {}; - struct efa_com_reg_mr_result result = {}; - struct pbl_context pbl; int supp_access_flags; - unsigned int pg_sz; struct efa_mr *mr; - int inline_size; - int err; #ifndef HAVE_NO_KVERBS_DRIVERS if (!udata) { ibdev_dbg(&dev->ibdev, "udata is NULL\n"); - err = -EOPNOTSUPP; - goto err_out; + return ERR_PTR(-EINVAL); } #endif @@ -2011,8 +2124,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); - err = -EINVAL; - goto err_out; + return ERR_PTR(-EINVAL); } supp_access_flags = @@ -2026,103 +2138,65 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, ibdev_dbg(&dev->ibdev, "Unsupported access flags[%#x], supported[%#x]\n", access_flags, supp_access_flags); - err = -EOPNOTSUPP; - goto err_out; + return ERR_PTR(-EOPNOTSUPP); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) { - err = -ENOMEM; - goto err_out; - } + if (!mr) + return ERR_PTR(-ENOMEM); -#ifdef HAVE_EFA_GDR - mr->nvmem = nvmem_get(dev, mr, start, length, &pg_sz); - if (!mr->nvmem) { -#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM - mr->umem = ib_umem_get(ibpd->device, start, length, - access_flags); -#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) - mr->umem = ib_umem_get(udata, start, length, access_flags); -#elif defined(HAVE_IB_UMEM_GET_UDATA) - mr->umem = ib_umem_get(udata, start, length, access_flags, 0); -#else - mr->umem = ib_umem_get(ibpd->uobject->context, start, length, - access_flags, 0); -#endif - if (IS_ERR(mr->umem)) { - err = PTR_ERR(mr->umem); - ibdev_dbg(&dev->ibdev, - "Failed to pin and map user space memory[%d]\n", - err); - goto err_free; - } + return mr; +} -#ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE - pg_sz = ib_umem_find_best_pgsz(mr->umem, - dev->dev_attr.page_size_cap, - virt_addr); - if (!pg_sz) { - err = -EOPNOTSUPP; - ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", - dev->dev_attr.page_size_cap); - goto err_unmap; - } -#else - pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap, - virt_addr); -#endif - } -#else /* !defined(HAVE_EFA_GDR) */ -#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM - mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); -#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) - mr->umem = ib_umem_get(udata, start, length, access_flags); -#elif defined(HAVE_IB_UMEM_GET_UDATA) - mr->umem = ib_umem_get(udata, start, length, access_flags, 0); -#else - mr->umem = ib_umem_get(ibpd->uobject->context, start, length, - access_flags, 0); -#endif - if (IS_ERR(mr->umem)) { - err = PTR_ERR(mr->umem); - ibdev_dbg(&dev->ibdev, - "Failed to pin and map user space memory[%d]\n", err); - 
goto err_free; - } -#endif /* defined(HAVE_EFA_GDR) */ +static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, + u64 length, u64 virt_addr, int access_flags) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_com_reg_mr_params params = {}; + struct efa_com_reg_mr_result result = {}; + struct pbl_context pbl; + unsigned int pg_sz; + int inline_size; + int err; params.pd = to_epd(ibpd)->pdn; params.iova = virt_addr; params.mr_length_in_bytes = length; params.permissions = access_flags; -#ifndef HAVE_EFA_GDR +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) { + pg_sz = efa_p2p_get_page_size(dev, mr->p2pmem); + goto skip_umem_pg_sz; + } +#endif + #ifdef HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE pg_sz = ib_umem_find_best_pgsz(mr->umem, dev->dev_attr.page_size_cap, virt_addr); if (!pg_sz) { - err = -EOPNOTSUPP; ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", dev->dev_attr.page_size_cap); - goto err_unmap; + return -EOPNOTSUPP; } #else pg_sz = efa_cont_pages(mr->umem, dev->dev_attr.page_size_cap, virt_addr); #endif /* defined(HAVE_IB_UMEM_FIND_SINGLE_PG_SIZE) */ -#endif /* !defined(HAVE_EFA_GDR) */ +#ifdef HAVE_EFA_P2P +skip_umem_pg_sz: +#endif params.page_shift = order_base_2(pg_sz); #ifdef HAVE_IB_UMEM_NUM_DMA_BLOCKS -#ifdef HAVE_EFA_GDR - if (mr->umem) - params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); - else +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) params.page_num = DIV_ROUND_UP(length + (virt_addr & (pg_sz - 1)), pg_sz); + else + params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); #else params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz); #endif @@ -2139,21 +2213,21 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, if (params.page_num <= inline_size) { err = efa_create_inline_pbl(dev, mr, ¶ms); if (err) - goto err_unmap; + return err; err = efa_com_register_mr(&dev->edev, ¶ms, &result); if (err) - goto err_unmap; + return err; } else { err = efa_create_pbl(dev, &pbl, mr, ¶ms); if (err) - goto err_unmap; + return err; err = efa_com_register_mr(&dev->edev, ¶ms, &result); pbl_destroy(dev, &pbl); if (err) - goto err_unmap; + return err; } mr->ibmr.lkey = result.l_key; @@ -2161,20 +2235,116 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, #ifdef HAVE_IB_MR_LENGTH mr->ibmr.length = length; #endif -#ifdef HAVE_EFA_GDR - if (mr->nvmem) { - mr->nvmem->lkey = result.l_key; - mr->nvmem->needs_dereg = true; +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) { + mr->p2pmem->lkey = result.l_key; + mr->p2pmem->needs_dereg = true; } #endif ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey); + return 0; +} + +#ifdef HAVE_MR_DMABUF +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct ib_umem_dmabuf *umem_dmabuf; + struct efa_mr *mr; + int err; + + mr = efa_alloc_mr(ibpd, access_flags, udata); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto err_out; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length, fd, + access_flags); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err); + goto err_free; + } + + mr->umem = &umem_dmabuf->umem; + err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); + if (err) + goto err_release; + + return &mr->ibmr; + +err_release: +#ifndef HAVE_IB_UMEM_DMABUF_PINNED + 
dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + dma_buf_unpin(umem_dmabuf->attach); + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); +#endif + ib_umem_release(mr->umem); +err_free: + kfree(mr); +err_out: + atomic64_inc(&dev->stats.reg_mr_err); + return ERR_PTR(err); +} +#endif + +struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct efa_dev *dev = to_edev(ibpd->device); + struct efa_mr *mr; + int err; + + mr = efa_alloc_mr(ibpd, access_flags, udata); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto err_out; + } + +#ifdef HAVE_IB_UMEM_GET_DEVICE_PARAM + mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_NO_DMASYNC) + mr->umem = ib_umem_get(udata, start, length, access_flags); +#elif defined(HAVE_IB_UMEM_GET_UDATA) + mr->umem = ib_umem_get(udata, start, length, access_flags, 0); +#else + mr->umem = ib_umem_get(ibpd->uobject->context, start, length, + access_flags, 0); +#endif + if (IS_ERR(mr->umem)) { +#ifdef HAVE_EFA_P2P + mr->p2pmem = efa_p2p_get(dev, mr, start, length); + if (mr->p2pmem) { + /* Avoid referencing an error-pointer later on */ + mr->umem = NULL; + goto reg_mr; + } +#endif + err = PTR_ERR(mr->umem); + ibdev_dbg(&dev->ibdev, + "Failed to pin and map user space memory[%d]\n", err); + goto err_free; + } + +#ifdef HAVE_EFA_P2P +reg_mr: +#endif + err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags); + if (err) + goto err_release; + return &mr->ibmr; -err_unmap: -#ifdef HAVE_EFA_GDR - if (mr->nvmem) - nvmem_put(mr->nvmem->ticket, false); +err_release: +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) + efa_p2p_put(mr->p2pmem->ticket, false); else ib_umem_release(mr->umem); #else @@ -2200,9 +2370,9 @@ int efa_dereg_mr(struct ib_mr *ibmr) ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey); -#ifdef HAVE_EFA_GDR - if (mr->nvmem){ - err = nvmem_put(mr->nvmem_ticket, false); +#ifdef HAVE_EFA_P2P + if (mr->p2pmem) { + err = efa_p2p_put(mr->p2p_ticket, false); if (err) return err; @@ -2215,6 +2385,17 @@ int efa_dereg_mr(struct ib_mr *ibmr) if (err) return err; +#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED) + if (mr->umem->is_dmabuf) { + struct ib_umem_dmabuf *umem_dmabuf; + + umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + dma_buf_unpin(umem_dmabuf->attach); + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + } +#endif + ib_umem_release(mr->umem); kfree(mr); @@ -2630,7 +2811,7 @@ int efa_destroy_ah(struct ib_ah *ibah) { struct efa_dev *dev = to_edev(ibah->pd->device); struct efa_ah *ah = to_eah(ibah); -#ifndef HAVE_AH_CORE_ALLOCATION +#if !defined(HAVE_AH_CORE_ALLOCATION) && !defined(HAVE_AH_CORE_ALLOCATION_DESTROY_RC) int err; #endif @@ -2666,27 +2847,27 @@ int efa_destroy_ah(struct ib_ah *ibah) struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev, port_t port_num) { - return rdma_alloc_hw_stats_struct(efa_port_stats_names, - ARRAY_SIZE(efa_port_stats_names), + return rdma_alloc_hw_stats_struct(efa_port_stats_descs, + ARRAY_SIZE(efa_port_stats_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); } struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev) { - return rdma_alloc_hw_stats_struct(efa_device_stats_names, - ARRAY_SIZE(efa_device_stats_names), + return rdma_alloc_hw_stats_struct(efa_device_stats_descs, + ARRAY_SIZE(efa_device_stats_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); } #else struct rdma_hw_stats 
*efa_alloc_hw_stats(struct ib_device *ibdev, port_t port_num) { if (port_num) - return rdma_alloc_hw_stats_struct(efa_port_stats_names, - ARRAY_SIZE(efa_port_stats_names), + return rdma_alloc_hw_stats_struct(efa_port_stats_descs, + ARRAY_SIZE(efa_port_stats_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); else - return rdma_alloc_hw_stats_struct(efa_device_stats_names, - ARRAY_SIZE(efa_device_stats_names), + return rdma_alloc_hw_stats_struct(efa_device_stats_descs, + ARRAY_SIZE(efa_device_stats_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); } #endif @@ -2712,7 +2893,7 @@ static int efa_fill_device_stats(struct efa_dev *dev, stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err); stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err); - return ARRAY_SIZE(efa_device_stats_names); + return ARRAY_SIZE(efa_device_stats_descs); } static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, @@ -2761,7 +2942,7 @@ static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats, stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err; stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes; - return ARRAY_SIZE(efa_port_stats_names); + return ARRAY_SIZE(efa_port_stats_descs); } int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, diff --git a/drivers/amazon/net/efa/kcompat.h b/drivers/amazon/net/efa/kcompat.h index d0887952d8c92..713dcc00b394c 100644 --- a/drivers/amazon/net/efa/kcompat.h +++ b/drivers/amazon/net/efa/kcompat.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _KCOMPAT_H_ @@ -186,4 +186,58 @@ typedef u32 port_t; typedef u8 port_t; #endif +#if defined(HAVE_MR_DMABUF) && !defined(HAVE_IB_UMEM_DMABUF_PINNED) +#include +#include +#include + +static inline void +ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + + ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, + "Invalidate callback should not be called when memory is pinned\n"); +} + +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { + .allow_peer2peer = true, + .move_notify = ib_umem_dmabuf_unsupported_move_notify, +}; + +static inline +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); + if (IS_ERR(umem_dmabuf)) + return umem_dmabuf; + + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + err = dma_buf_pin(umem_dmabuf->attach); + if (err) + goto err_release; + + err = ib_umem_dmabuf_map_pages(umem_dmabuf); + if (err) + goto err_unpin; + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + + return umem_dmabuf; + +err_unpin: + dma_buf_unpin(umem_dmabuf->attach); +err_release: + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); +} +#endif /* !HAVE_IB_UMEM_DMABUF_PINNED */ + #endif /* _KCOMPAT_H_ */ diff --git a/drivers/amazon/net/efa/neuron_p2p.h b/drivers/amazon/net/efa/neuron_p2p.h new file mode 100644 index 0000000000000..a1ce44003463f --- /dev/null +++ b/drivers/amazon/net/efa/neuron_p2p.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 
2021 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __NEURON_P2P_H__ +#define __NEURON_P2P_H__ + +struct neuron_p2p_page_info { + u64 physical_address; // PA's that map to the VA (page aligned as defined in va_info) + u32 page_count; // page count each page is shift_page_size size +}; + +struct neuron_p2p_va_info { + void *virtual_address; // Virtual address for which the PA's need to be obtained + u64 size; // The actual size of the memory pointed by the virtual_address + u32 shift_page_size; // log2 of the page size + u32 device_index; // Neuron Device index. + u32 entries; // Number of page_info entries + struct neuron_p2p_page_info page_info[]; +}; + +/** Given the virtual address and length returns the physical address + * + * @param[in] virtual_address - Virtual address of device memory + * @param[in] length - Length of the memory + * @param[out] va_info - Set of physical addresses + * @param[in] free_callback - Callback function to be called. This will be called with a lock held. + * @param[in] data - Data to be used for the callback + * + * @return 0 - Success. + */ +int neuron_p2p_register_va(u64 virtual_address, u64 length, struct neuron_p2p_va_info **vainfo, void (*free_callback) (void *data), void *data); + +/** Give the pa, release the pa from being used by third-party device + * + * @param[in] va_info - Set of physical addresses + * + * @return 0 - Success. + */ +int neuron_p2p_unregister_va(struct neuron_p2p_va_info *vainfo); + +#endif From 914ecf67312c12b219743958a1835b4adc216755 Mon Sep 17 00:00:00 2001 From: Andrew Panyaki Date: Wed, 9 Nov 2022 18:31:18 +0000 Subject: [PATCH 524/737] lustre: update to AmazonFSxLustreClient v2.12.8-fsx6 Signed-off-by: Andrew Panyaki --- drivers/staging/lustrefsx/config.h | 21 +- .../lustrefsx/libcfs/include/libcfs/bitmap.h | 3 +- .../staging/lustrefsx/libcfs/libcfs/fail.c | 19 +- drivers/staging/lustrefsx/list | 26 - .../lustrefsx/lustre/include/lprocfs_status.h | 1 + .../lustrefsx/lustre/include/lu_object.h | 200 ++ .../lustre/include/lustre/lustreapi.h | 45 +- .../lustrefsx/lustre/include/lustre_compat.h | 7 + .../lustrefsx/lustre/include/lustre_export.h | 5 + .../lustrefsx/lustre/include/lustre_lmv.h | 71 +- .../lustre/include/lustre_req_layout.h | 1 + .../lustrefsx/lustre/include/md_object.h | 2 + .../staging/lustrefsx/lustre/include/obd.h | 86 +- .../lustrefsx/lustre/include/obd_class.h | 17 +- .../include/uapi/linux/lustre/lustre_idl.h | 26 +- .../include/uapi/linux/lustre/lustre_user.h | 84 +- .../staging/lustrefsx/lustre/llite/dcache.c | 17 +- drivers/staging/lustrefsx/lustre/llite/dir.c | 155 +- drivers/staging/lustrefsx/lustre/llite/file.c | 60 +- .../lustrefsx/lustre/llite/llite_internal.h | 45 +- .../lustrefsx/lustre/llite/llite_lib.c | 257 ++- .../lustrefsx/lustre/llite/lproc_llite.c | 46 + .../staging/lustrefsx/lustre/llite/namei.c | 141 +- .../lustrefsx/lustre/llite/statahead.c | 5 +- .../staging/lustrefsx/lustre/lmv/lmv_fld.c | 14 +- .../staging/lustrefsx/lustre/lmv/lmv_intent.c | 56 +- .../lustrefsx/lustre/lmv/lmv_internal.h | 125 +- .../staging/lustrefsx/lustre/lmv/lmv_obd.c | 2030 +++++++++-------- .../staging/lustrefsx/lustre/lmv/lproc_lmv.c | 175 +- .../lustrefsx/lustre/lov/lov_internal.h | 14 +- .../staging/lustrefsx/lustre/lov/lov_pool.c | 10 +- .../staging/lustrefsx/lustre/mdc/mdc_locks.c | 21 +- .../lustrefsx/lustre/mdc/mdc_request.c | 123 +- .../lustrefsx/lustre/obdclass/Makefile | 1 + .../lustre/obdclass/lprocfs_status.c | 44 +- .../lustrefsx/lustre/obdclass/lu_tgt_descs.c | 682 ++++++ 
.../lustrefsx/lustre/osc/osc_request.c | 16 + .../staging/lustrefsx/lustre/ptlrpc/layout.c | 8 +- .../lustrefsx/lustre/ptlrpc/pack_generic.c | 5 +- .../lustrefsx/lustre/ptlrpc/wiretest.c | 7 +- drivers/staging/lustrefsx/undef.h | 3 + 41 files changed, 3216 insertions(+), 1458 deletions(-) delete mode 100644 drivers/staging/lustrefsx/list create mode 100644 drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c diff --git a/drivers/staging/lustrefsx/config.h b/drivers/staging/lustrefsx/config.h index 98806290010a0..9a97870d7fcc7 100644 --- a/drivers/staging/lustrefsx/config.h +++ b/drivers/staging/lustrefsx/config.h @@ -505,6 +505,9 @@ /* if iov_iter_type exists */ #define HAVE_IOV_ITER_TYPE 1 +/* is_root_inode defined */ +#define HAVE_IS_ROOT_INODE 1 + /* is_sxid is defined */ #define HAVE_IS_SXID 1 @@ -772,7 +775,7 @@ #define HAVE_PTR_ERR_OR_ZERO 1 /* have quota64 */ -#define HAVE_QUOTA64 1 +/* #undef HAVE_QUOTA64 */ /* radix_tree_exceptional_entry exist */ /* #undef HAVE_RADIX_EXCEPTION_ENTRY */ @@ -1089,7 +1092,7 @@ #define LUSTRE_PATCH 8 /* A copy of PACKAGE_VERSION */ -#define LUSTRE_VERSION_STRING "2.12.8" +#define LUSTRE_VERSION_STRING "2.12.8_163_g540d104" /* maximum number of MDS threads */ /* #undef MDS_MAX_THREADS */ @@ -1125,7 +1128,7 @@ #define PACKAGE_NAME "Lustre" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Lustre 2.12.8" +#define PACKAGE_STRING "Lustre 2.12.8_163_g540d104" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "lustre" @@ -1134,7 +1137,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.12.8" +#define PACKAGE_VERSION "2.12.8_163_g540d104" /* name of parallel fsck program */ #define PFSCK "fsck" @@ -1175,16 +1178,16 @@ /* #undef USE_LU_REF */ /* Version number of package */ -#define VERSION "2.12.8" +#define VERSION "2.12.8_163_g540d104" /* zfs fix version */ -#define ZFS_FIX 0 +/* #undef ZFS_FIX */ /* zfs major version */ -#define ZFS_MAJOR +/* #undef ZFS_MAJOR */ /* zfs minor version */ -#define ZFS_MINOR +/* #undef ZFS_MINOR */ /* zfs patch version */ -#define ZFS_PATCH +/* #undef ZFS_PATCH */ diff --git a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h index 1763da296244d..b4782c4b51094 100644 --- a/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h +++ b/drivers/staging/lustrefsx/libcfs/include/libcfs/bitmap.h @@ -41,8 +41,7 @@ struct cfs_bitmap { }; #define CFS_BITMAP_SIZE(nbits) \ - (((nbits / BITS_PER_LONG) + 1) * sizeof(long) + \ - sizeof(struct cfs_bitmap)) + (BITS_TO_LONGS(nbits) * sizeof(long) + sizeof(struct cfs_bitmap)) static inline struct cfs_bitmap *CFS_ALLOCATE_BITMAP(int size) diff --git a/drivers/staging/lustrefsx/libcfs/libcfs/fail.c b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c index 12addb20803f3..13d31ab16fdf4 100644 --- a/drivers/staging/lustrefsx/libcfs/libcfs/fail.c +++ b/drivers/staging/lustrefsx/libcfs/libcfs/fail.c @@ -121,16 +121,23 @@ EXPORT_SYMBOL(__cfs_fail_check_set); int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) { + ktime_t till = ktime_add_ms(ktime_get(), ms); int ret = 0; ret = __cfs_fail_check_set(id, value, set); if (ret && likely(ms > 0)) { - CERROR("cfs_fail_timeout id %x sleeping for %dms\n", - id, ms); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(ms) / 1000); - set_current_state(TASK_RUNNING); - CERROR("cfs_fail_timeout id %x awake\n", id); + CERROR("cfs_fail_timeout 
id %x sleeping for %dms\n", id, ms); + while (ktime_before(ktime_get(), till)) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000) / 10); + set_current_state(TASK_RUNNING); + if (!cfs_fail_loc) { + CERROR("cfs_fail_timeout interrupted\n"); + break; + } + } + if (cfs_fail_loc) + CERROR("cfs_fail_timeout id %x awake\n", id); } return ret; } diff --git a/drivers/staging/lustrefsx/list b/drivers/staging/lustrefsx/list deleted file mode 100644 index c69f3dc259a22..0000000000000 --- a/drivers/staging/lustrefsx/list +++ /dev/null @@ -1,26 +0,0 @@ -./lustre/osc/Makefile -./lustre/fid/Makefile -./lustre/mdc/Makefile -./lustre/Makefile -./lustre/ptlrpc/Makefile -./lustre/obdclass/Makefile -./lustre/llite/Makefile -./lustre/obdecho/Makefile -./lustre/lov/Makefile -./lustre/lmv/Makefile -./lustre/mgc/Makefile -./lustre/fld/Makefile -./Makefile.rules -./libcfs/libcfs/Makefile -./libcfs/Makefile -./Makefile -./lnet/selftest/Makefile -./lnet/Makefile -./lnet/lnet/Makefile -./lnet/klnds/socklnd/Makefile -./lnet/klnds/Makefile -./lnet/klnds/o2iblnd/Makefile -./lustre/Kconfig -./Kconfig -./libcfs/Kconfig -./lnet/Kconfig diff --git a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h index 85b66b3af7126..6d032dfec8029 100644 --- a/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustrefsx/lustre/include/lprocfs_status.h @@ -632,6 +632,7 @@ extern ssize_t lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); +int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit); extern int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit); diff --git a/drivers/staging/lustrefsx/lustre/include/lu_object.h b/drivers/staging/lustrefsx/lustre/include/lu_object.h index 93218aa30e7ff..7734bab329e89 100644 --- a/drivers/staging/lustrefsx/lustre/include/lu_object.h +++ b/drivers/staging/lustrefsx/lustre/include/lu_object.h @@ -1430,5 +1430,205 @@ static inline bool lu_object_is_cl(const struct lu_object *o) return lu_device_is_cl(o->lo_dev); } +/* Generic subset of tgts */ +struct lu_tgt_pool { + __u32 *op_array; /* array of index of + * lov_obd->lov_tgts */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ +}; + +/* bitflags used in rr / qos allocation */ +enum lq_flag { + LQ_DIRTY = 0, /* recalc qos data */ + LQ_SAME_SPACE, /* the OSTs all have approx. 
+ * the same space avail */ + LQ_RESET, /* zero current penalties */ +}; + +/* round-robin QoS data for LOD/LMV */ +struct lu_qos_rr { + spinlock_t lqr_alloc; /* protect allocation index */ + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx;/* aliasing for start_idx */ + int lqr_start_count;/* reseed counter */ + struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_flags; +}; + +/* QoS data per MDS/OSS */ +struct lu_svr_qos { + struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lsq_svr_list; /* link to lq_svr_list */ + __u64 lsq_bavail; /* total bytes avail on svr */ + __u64 lsq_iavail; /* total inode avail on svr */ + __u64 lsq_penalty; /* current penalty */ + __u64 lsq_penalty_per_obj; /* penalty decrease + * every obj*/ + time64_t lsq_used; /* last used time, seconds */ + __u32 lsq_tgt_count; /* number of tgts on this svr */ + __u32 lsq_id; /* unique svr id */ +}; + +/* QoS data per MDT/OST */ +struct lu_tgt_qos { + struct lu_svr_qos *ltq_svr; /* svr info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease + * every obj*/ + __u64 ltq_avail; /* bytes/inode avail */ + __u64 ltq_weight; /* net weighting */ + time64_t ltq_used; /* last used time, seconds */ + bool ltq_usable:1; /* usable for striping */ +}; + +/* target descriptor */ +struct lu_tgt_desc { + union { + struct dt_device *ltd_tgt; + struct obd_device *ltd_obd; + }; + struct obd_export *ltd_exp; + struct obd_uuid ltd_uuid; + __u32 ltd_index; + __u32 ltd_gen; + struct list_head ltd_kill; + struct ptlrpc_thread *ltd_recovery_thread; + struct mutex ltd_fid_mutex; + struct lu_tgt_qos ltd_qos; /* qos info per target */ + struct obd_statfs ltd_statfs; + time64_t ltd_statfs_age; + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1, /* should this target be deleted */ + ltd_got_update_log:1, /* Already got update log */ + ltd_connecting:1; /* target is connecting */ +}; + +/* number of pointers at 1st level */ +#define TGT_PTRS (PAGE_SIZE / sizeof(void *)) +/* number of pointers at 2nd level */ +#define TGT_PTRS_PER_BLOCK (PAGE_SIZE / sizeof(void *)) + +struct lu_tgt_desc_idx { + struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; +}; + +/* QoS data for LOD/LMV */ +struct lu_qos { + struct list_head lq_svr_list; /* lu_svr_qos list */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_svr_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lu_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_flags; +#if 0 + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the servers all have approx. 
+ * the same space avail */ + lq_reset:1; /* zero current penalties */ +#endif +}; + +struct lu_tgt_descs { + union { + struct lov_desc ltd_lov_desc; + struct lmv_desc ltd_lmv_desc; + }; + /* list of known TGTs */ + struct lu_tgt_desc_idx *ltd_tgt_idx[TGT_PTRS]; + /* Size of the lu_tgts array, granted to be a power of 2 */ + __u32 ltd_tgts_size; + /* bitmap of TGTs available */ + struct cfs_bitmap *ltd_tgt_bitmap; + /* TGTs scheduled to be deleted */ + __u32 ltd_death_row; + /* Table refcount used for delayed deletion */ + int ltd_refcount; + /* mutex to serialize concurrent updates to the tgt table */ + struct mutex ltd_mutex; + /* read/write semaphore used for array relocation */ + struct rw_semaphore ltd_rw_sem; + /* QoS */ + struct lu_qos ltd_qos; + /* all tgts in a packed array */ + struct lu_tgt_pool ltd_tgt_pool; + /* true if tgt is MDT */ + bool ltd_is_mdt; +}; + +#define LTD_TGT(ltd, index) \ + (ltd)->ltd_tgt_idx[(index) / \ + TGT_PTRS_PER_BLOCK]->ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] + +u64 lu_prandom_u64_max(u64 ep_ro); +void lu_qos_rr_init(struct lu_qos_rr *lqr); +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); + +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt); + +/** + * Whether MDT inode and space usages are balanced. + */ +static inline bool ltd_qos_is_balanced(struct lu_tgt_descs *ltd) +{ + return !test_bit(LQ_DIRTY, <d->ltd_qos.lq_flags) && + test_bit(LQ_SAME_SPACE, <d->ltd_qos.lq_flags); +} + +/** + * Whether QoS data is up-to-date and QoS can be applied. + */ +static inline bool ltd_qos_is_usable(struct lu_tgt_descs *ltd) +{ + if (ltd_qos_is_balanced(ltd)) + return false; + + if (ltd->ltd_lov_desc.ld_active_tgt_count < 2) + return false; + + return true; +} + +static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) +{ + int index; + + index = find_first_bit(ltd->ltd_tgt_bitmap->data, + ltd->ltd_tgt_bitmap->size); + return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL; +} + +static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + int index; + + if (!tgt) + return NULL; + + index = tgt->ltd_index; + LASSERT(index < ltd->ltd_tgt_bitmap->size); + index = find_next_bit(ltd->ltd_tgt_bitmap->data, + ltd->ltd_tgt_bitmap->size, index + 1); + return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL; +} + +#define ltd_foreach_tgt(ltd, tgt) \ + for (tgt = ltd_first_tgt(ltd); tgt; tgt = ltd_next_tgt(ltd, tgt)) + +#define ltd_foreach_tgt_safe(ltd, tgt, tmp) \ + for (tgt = ltd_first_tgt(ltd), tmp = ltd_next_tgt(ltd, tgt); tgt; \ + tgt = tmp, tmp = ltd_next_tgt(ltd, tgt)) + /** @} lu */ #endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h index 518a00d089e36..6e61cd98ad4ff 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre/lustreapi.h @@ -129,6 +129,9 @@ struct llapi_stripe_param { /* Number of stripes. 
Size of lsp_osts[] if lsp_specific is true.*/ int lsp_stripe_count; bool lsp_is_specific; + bool lsp_is_create; + __u8 lsp_max_inherit; + __u8 lsp_max_inherit_rr; __u32 lsp_osts[0]; }; @@ -158,24 +161,27 @@ void llapi_set_command_name(const char *cmd); void llapi_clear_command_name(void); enum llapi_layout_verbose { - VERBOSE_STRIPE_COUNT = 0x1, - VERBOSE_STRIPE_SIZE = 0x2, - VERBOSE_STRIPE_OFFSET = 0x4, - VERBOSE_POOL = 0x8, - VERBOSE_DETAIL = 0x10, - VERBOSE_OBJID = 0x20, - VERBOSE_GENERATION = 0x40, - VERBOSE_MDTINDEX = 0x80, - VERBOSE_PATTERN = 0x100, - VERBOSE_COMP_COUNT = 0x200, - VERBOSE_COMP_FLAGS = 0x400, - VERBOSE_COMP_START = 0x800, - VERBOSE_COMP_END = 0x1000, - VERBOSE_COMP_ID = 0x2000, - VERBOSE_DFID = 0x4000, - VERBOSE_HASH_TYPE = 0x8000, - VERBOSE_MIRROR_COUNT = 0x10000, - VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_STRIPE_COUNT = 0x1, + VERBOSE_STRIPE_SIZE = 0x2, + VERBOSE_STRIPE_OFFSET = 0x4, + VERBOSE_POOL = 0x8, + VERBOSE_DETAIL = 0x10, + VERBOSE_OBJID = 0x20, + VERBOSE_GENERATION = 0x40, + VERBOSE_MDTINDEX = 0x80, + VERBOSE_PATTERN = 0x100, + VERBOSE_COMP_COUNT = 0x200, + VERBOSE_COMP_FLAGS = 0x400, + VERBOSE_COMP_START = 0x800, + VERBOSE_COMP_END = 0x1000, + VERBOSE_COMP_ID = 0x2000, + VERBOSE_DFID = 0x4000, + VERBOSE_HASH_TYPE = 0x8000, + VERBOSE_MIRROR_COUNT = 0x10000, + VERBOSE_MIRROR_ID = 0x20000, + VERBOSE_EXT_SIZE = 0x40000, + VERBOSE_INHERIT = 0x80000, + VERBOSE_INHERIT_RR = 0x100000, VERBOSE_DEFAULT = VERBOSE_STRIPE_COUNT | VERBOSE_STRIPE_SIZE | VERBOSE_STRIPE_OFFSET | VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION | @@ -183,7 +189,8 @@ enum llapi_layout_verbose { VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | VERBOSE_COMP_START | VERBOSE_COMP_END | VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT | - VERBOSE_MIRROR_ID + VERBOSE_MIRROR_ID | VERBOSE_EXT_SIZE | + VERBOSE_INHERIT | VERBOSE_INHERIT_RR }; /* Compatibility with original names */ #define VERBOSE_SIZE VERBOSE_STRIPE_SIZE diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h index 71af35a8f839e..6306734c9c575 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_compat.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_compat.h @@ -875,4 +875,11 @@ static inline int ll_vfs_removexattr(struct dentry *dentry, struct inode *inode, #define ll_set_acl(ns, inode, acl, type) ll_set_acl(inode, acl, type) #endif +#ifndef HAVE_IS_ROOT_INODE +static inline bool is_root_inode(struct inode *inode) +{ + return inode == inode->i_sb->s_root->d_inode; +} +#endif + #endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_export.h b/drivers/staging/lustrefsx/lustre/include/lustre_export.h index 4fb3a9c4b2d18..aa627e60ffd8c 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_export.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_export.h @@ -454,6 +454,11 @@ static inline int exp_connect_flr(struct obd_export *exp) return !!(exp_connect_flags2(exp) & OBD_CONNECT2_FLR); } +static inline int exp_bypass_mdll(struct obd_export *exp) +{ + return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL_BYPASS); +} + static inline int exp_mdll(struct obd_export *exp) { return !!(exp_connect_flags2(exp) & OBD_CONNECT2_MDLL); diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h index d5fb751524b0b..091b80c59c7d9 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_lmv.h @@ -45,6 +45,8 
@@ struct lmv_stripe_md { __u32 lsm_md_stripe_count; __u32 lsm_md_master_mdt_index; __u32 lsm_md_hash_type; + __u8 lsm_md_max_inherit; + __u8 lsm_md_max_inherit_rr; __u32 lsm_md_layout_version; __u32 lsm_md_migrate_offset; __u32 lsm_md_migrate_hash; @@ -54,6 +56,29 @@ struct lmv_stripe_md { struct lmv_oinfo lsm_md_oinfo[0]; }; +static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC; +} + +static inline bool lmv_dir_migrating(const struct lmv_stripe_md *lsm) +{ + return lmv_dir_striped(lsm) && + lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; +} + +static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lmv_dir_striped(lsm)) + return false; + + if (lmv_dir_migrating(lsm) && + lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset <= 1) + return false; + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); +} + static inline bool lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) { @@ -64,6 +89,8 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) lsm1->lsm_md_master_mdt_index != lsm2->lsm_md_master_mdt_index || lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || + lsm1->lsm_md_max_inherit != lsm2->lsm_md_max_inherit || + lsm1->lsm_md_max_inherit_rr != lsm2->lsm_md_max_inherit_rr || lsm1->lsm_md_layout_version != lsm2->lsm_md_layout_version || lsm1->lsm_md_migrate_offset != @@ -74,10 +101,18 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) lsm2->lsm_md_pool_name) != 0) return false; - for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { - if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, - &lsm2->lsm_md_oinfo[idx].lmo_fid)) - return false; + if (lmv_dir_striped(lsm1)) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, + &lsm2->lsm_md_oinfo[idx].lmo_fid)) + return false; + } + } else if (lsm1->lsm_md_magic == LMV_USER_MAGIC_SPECIFIC) { + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (lsm1->lsm_md_oinfo[idx].lmo_mds != + lsm2->lsm_md_oinfo[idx].lmo_mds) + return false; + } } return true; @@ -87,12 +122,20 @@ static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) { int i; - CDEBUG(mask, "magic %#x stripe count %d master mdt %d hash type %#x " - "version %d migrate offset %d migrate hash %#x pool %s\n", - lsm->lsm_md_magic, lsm->lsm_md_stripe_count, - lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type, - lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset, - lsm->lsm_md_migrate_hash, lsm->lsm_md_pool_name); + /* If lsm_md_magic == LMV_MAGIC_FOREIGN pool_name may not be a null + * terminated string so only print LOV_MAXPOOLNAME bytes. 
+ */ + CDEBUG(mask, + "magic %#x stripe count %d master mdt %d hash type %#x max-inherit %hhu max-inherit-rr %hhu version %d migrate offset %d migrate hash %#x pool %.*s\n", + lsm->lsm_md_magic, lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, lsm->lsm_md_max_inherit, + lsm->lsm_md_max_inherit_rr, lsm->lsm_md_layout_version, + lsm->lsm_md_migrate_offset, lsm->lsm_md_migrate_hash, + LOV_MAXPOOLNAME, lsm->lsm_md_pool_name); + + if (!lmv_dir_striped(lsm)) + return; for (i = 0; i < lsm->lsm_md_stripe_count; i++) CDEBUG(mask, "stripe[%d] "DFID"\n", @@ -114,6 +157,8 @@ static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, le32_to_cpu(lmv_src->lmv_master_mdt_index); lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); + if (lmv_src->lmv_stripe_count > LMV_MAX_STRIPE_COUNT) + return; for (i = 0; i < lmv_src->lmv_stripe_count; i++) fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], &lmv_src->lmv_stripe_fids[i]); @@ -185,10 +230,10 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, return idx; } -static inline bool lmv_is_known_hash_type(__u32 type) +static inline bool lmv_magic_supported(__u32 lum_magic) { - return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || - (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; + return lum_magic == LMV_USER_MAGIC || + lum_magic == LMV_USER_MAGIC_SPECIFIC; } #endif diff --git a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h index 7b6c03b195624..8b2c9240660a5 100644 --- a/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h +++ b/drivers/staging/lustrefsx/lustre/include/lustre_req_layout.h @@ -271,6 +271,7 @@ extern struct req_msg_field RMF_DLM_GL_DESC; extern struct req_msg_field RMF_LDLM_INTENT; extern struct req_msg_field RMF_LAYOUT_INTENT; extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_DEFAULT_MDT_MD; extern struct req_msg_field RMF_REC_REINT; extern struct req_msg_field RMF_EADATA; extern struct req_msg_field RMF_EAVALS; diff --git a/drivers/staging/lustrefsx/lustre/include/md_object.h b/drivers/staging/lustrefsx/lustre/include/md_object.h index a5f994e36d50b..d84f07e054201 100644 --- a/drivers/staging/lustrefsx/lustre/include/md_object.h +++ b/drivers/staging/lustrefsx/lustre/include/md_object.h @@ -130,9 +130,11 @@ struct md_attr { struct md_som ma_som; struct lov_mds_md *ma_lmm; union lmv_mds_md *ma_lmv; + struct lmv_user_md *ma_default_lmv; void *ma_acl; int ma_lmm_size; int ma_lmv_size; + int ma_default_lmv_size; int ma_acl_size; int ma_enable_chprojid_gid; }; diff --git a/drivers/staging/lustrefsx/lustre/include/obd.h b/drivers/staging/lustrefsx/lustre/include/obd.h index b80a98332d6d5..62f751a44d0fb 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd.h +++ b/drivers/staging/lustrefsx/lustre/include/obd.h @@ -87,6 +87,8 @@ typedef int (*obd_enqueue_update_f)(void *cookie, int rc); struct obd_info { /* OBD_STATFS_* flags */ __u64 oi_flags; + struct obd_device *oi_obd; + struct lu_tgt_desc *oi_tgt; /* statfs data specific for every OSC, if needed at all. 
*/ struct obd_statfs *oi_osfs; /* An update callback which is called to update some data on upper @@ -367,15 +369,6 @@ struct echo_client_obd { __u64 ec_unique; }; -/* Generic subset of OSTs */ -struct ost_pool { - __u32 *op_array; /* array of index of - lov_obd->lov_tgts */ - unsigned int op_count; /* number of OSTs in the array */ - unsigned int op_size; /* allocated size of lp_array */ - struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ -}; - /* allow statfs data caching for 1 second */ #define OBD_STATFS_CACHE_SECONDS 1 /* arbitrary maximum. larger would be useless, allows catching bogus input */ @@ -383,17 +376,7 @@ struct ost_pool { /* By default, don't do time based negative cache invalidation */ #define OBD_NEG_CACHE_TIMEOUT_DEFAULT_SECS (-1) /* seconds */ -struct lov_tgt_desc { - struct list_head ltd_kill; - struct obd_uuid ltd_uuid; - struct obd_device *ltd_obd; - struct obd_export *ltd_exp; - __u32 ltd_gen; - __u32 ltd_index; /* index in lov_obd->tgts */ - unsigned long ltd_active:1,/* is this target up for requests */ - ltd_activate:1,/* should target be activated */ - ltd_reap:1; /* should this target be deleted */ -}; +#define lov_tgt_desc lu_tgt_desc struct lov_md_tgt_desc { struct obd_device *lmtd_mdc; @@ -403,7 +386,7 @@ struct lov_md_tgt_desc { struct lov_obd { struct lov_desc desc; struct lov_tgt_desc **lov_tgts; /* sparse array */ - struct ost_pool lov_packed; /* all OSTs in a packed + struct lu_tgt_pool lov_packed; /* all OSTs in a packed array */ struct mutex lov_lock; struct obd_connect_data lov_ocd; @@ -428,33 +411,29 @@ struct lov_obd { struct kobject *lov_tgts_kobj; }; -struct lmv_tgt_desc { - struct obd_uuid ltd_uuid; - struct obd_device *ltd_obd; - struct obd_export *ltd_exp; - __u32 ltd_idx; - struct mutex ltd_fid_mutex; - unsigned long ltd_active:1; /* target up for requests */ -}; +#define lmv_tgt_desc lu_tgt_desc struct lmv_obd { struct lu_client_fld lmv_fld; spinlock_t lmv_lock; - struct lmv_desc desc; - struct mutex lmv_init_mutex; int connected; int max_easize; int max_def_easize; u32 lmv_statfs_start; - u32 tgts_size; /* size of tgts array */ - struct lmv_tgt_desc **tgts; + struct lu_tgt_descs lmv_mdt_descs; struct obd_connect_data conn_data; struct kobject *lmv_tgts_kobj; + void *lmv_cache; + + __u32 lmv_qos_rr_index; }; +#define lmv_mdt_count lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count +#define lmv_qos lmv_mdt_descs.ltd_qos + /* Minimum sector size is 512 */ #define MAX_GUARD_NUMBER (PAGE_SIZE / 512) @@ -820,12 +799,14 @@ static inline int it_to_lock_mode(struct lookup_intent *it) } enum md_op_flags { - MF_MDC_CANCEL_FID1 = 1 << 0, - MF_MDC_CANCEL_FID2 = 1 << 1, - MF_MDC_CANCEL_FID3 = 1 << 2, - MF_MDC_CANCEL_FID4 = 1 << 3, - MF_GET_MDT_IDX = 1 << 4, - MF_GETATTR_BY_FID = 1 << 5, + MF_MDC_CANCEL_FID1 = BIT(0), + MF_MDC_CANCEL_FID2 = BIT(1), + MF_MDC_CANCEL_FID3 = BIT(2), + MF_MDC_CANCEL_FID4 = BIT(3), + MF_GET_MDT_IDX = BIT(4), + MF_GETATTR_BY_FID = BIT(5), + MF_QOS_MKDIR = BIT(6), + MF_RR_MKDIR = BIT(7), }; enum md_cli_flags { @@ -836,6 +817,14 @@ enum md_cli_flags { CLI_MIGRATE = 1 << 4, }; +enum md_op_code { + LUSTRE_OPC_MKDIR = 1, + LUSTRE_OPC_SYMLINK, + LUSTRE_OPC_MKNOD, + LUSTRE_OPC_CREATE, + LUSTRE_OPC_ANY, +}; + /** * GETXATTR is not included as only a couple of fields in the reply body * is filled, but not FID which is needed for common intent handling in @@ -853,6 +842,7 @@ struct md_op_data { struct lu_fid op_fid4; /* to the operation locks. 
*/ u32 op_mds; /* what mds server open will go to */ __u32 op_mode; + enum md_op_code op_code; struct lustre_handle op_open_handle; s64 op_mod_time; const char *op_name; @@ -861,6 +851,7 @@ struct md_op_data { struct rw_semaphore *op_mea2_sem; struct lmv_stripe_md *op_mea1; struct lmv_stripe_md *op_mea2; + struct lmv_stripe_md *op_default_mea1; /* default LMV */ __u32 op_suppgids[2]; __u32 op_fsuid; __u32 op_fsgid; @@ -894,13 +885,14 @@ struct md_op_data { void *op_file_secctx; __u32 op_file_secctx_size; - /* default stripe offset */ - __u32 op_default_stripe_offset; - __u32 op_projid; - /* Used by readdir */ - unsigned int op_max_pages; + union { + /* Used by readdir */ + unsigned int op_max_pages; + /* mkdir */ + unsigned short op_dir_depth; + }; __u16 op_mirror_id; @@ -1035,7 +1027,11 @@ struct obd_ops { struct lustre_md { struct mdt_body *body; struct lu_buf layout; - struct lmv_stripe_md *lmv; + union { + struct lmv_stripe_md *lmv; + struct lmv_foreign_md *lfm; + }; + struct lmv_stripe_md *default_lmv; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *posix_acl; #endif diff --git a/drivers/staging/lustrefsx/lustre/include/obd_class.h b/drivers/staging/lustrefsx/lustre/include/obd_class.h index f44c4dc42a4ea..b579fc995babb 100644 --- a/drivers/staging/lustrefsx/lustre/include/obd_class.h +++ b/drivers/staging/lustrefsx/lustre/include/obd_class.h @@ -1020,21 +1020,8 @@ static inline int obd_statfs_async(struct obd_export *exp, CDEBUG(D_SUPER, "%s: age %lld, max_age %lld\n", obd->obd_name, obd->obd_osfs_age, max_age); - if (obd->obd_osfs_age < max_age) { - rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); - } else { - CDEBUG(D_SUPER, - "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, - obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); - spin_lock(&obd->obd_osfs_lock); - memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); - spin_unlock(&obd->obd_osfs_lock); - oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; - if (oinfo->oi_cb_up) - oinfo->oi_cb_up(oinfo, 0); - } + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h index 33462dd2d5e01..43e97f14b3e42 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -836,6 +836,7 @@ struct ptlrpc_body_v2 { #define OBD_CONNECT2_FIDMAP 0x10000ULL /* FID map */ #define OBD_CONNECT2_GETATTR_PFID 0x20000ULL /* pack parent FID in getattr */ /* risk of forwards incompatibility with upstream - use high order bits to mitigate */ +#define OBD_CONNECT2_MDLL_BYPASS 0x800000000000000ULL /* disable metadata lazy load */ #define OBD_CONNECT2_MDLL 0x1000000000000000ULL /* enable metadata lazy load */ #define OBD_CONNECT2_MDLL_AUTO_REFRESH 0x2000000000000000ULL /* enable metadata lazy load auto-refresh */ /* XXX README XXX: @@ -896,6 +897,7 @@ struct ptlrpc_body_v2 { OBD_CONNECT2_LSOM | \ OBD_CONNECT2_ASYNC_DISCARD | \ OBD_CONNECT2_GETATTR_PFID | \ + OBD_CONNECT2_MDLL_BYPASS | \ OBD_CONNECT2_MDLL | \ OBD_CONNECT2_MDLL_AUTO_REFRESH) @@ -2130,6 +2132,8 @@ struct mdt_rec_reint { __u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ }; +#define LMV_DESC_QOS_MAXAGE_DEFAULT 60 /* Seconds */ + /* lmv structures */ struct lmv_desc { __u32 ld_tgt_count; /* how many MDS's */ @@ -2179,28 
+2183,6 @@ struct lmv_mds_md_v1 { /* #define LMV_USER_MAGIC 0x0CD30CD0 */ #define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ -/* Right now only the lower part(0-16bits) of lmv_hash_type is being used, - * and the higher part will be the flag to indicate the status of object, - * for example the object is being migrated. And the hash function - * might be interpreted differently with different flags. */ -#define LMV_HASH_TYPE_MASK 0x0000ffff - -#define LMV_HASH_FLAG_MIGRATION 0x80000000 - -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 12, 53, 0) -/* Since lustre 2.8, this flag will not be needed, instead this DEAD - * and orphan flags will be stored in LMA (see LMAI_ORPHAN) - * Keep this flag just for LFSCK, because it still might meet such - * flag when it checks the old FS */ -#define LMV_HASH_FLAG_DEAD 0x40000000 -#endif -#define LMV_HASH_FLAG_BAD_TYPE 0x20000000 - -/* The striped directory has ever lost its master LMV EA, then LFSCK - * re-generated it. This flag is used to indicate such case. It is an - * on-disk flag. */ -#define LMV_HASH_FLAG_LOST_LMV 0x10000000 - /** * The FNV-1a hash algorithm is as follows: * hash = FNV_offset_basis diff --git a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h index 470c97e577cc0..d1172a637fcee 100644 --- a/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/drivers/staging/lustrefsx/lustre/include/uapi/linux/lustre/lustre_user.h @@ -705,6 +705,15 @@ struct fsxattr { #define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ #define LOV_PATTERN_DEFAULT 0xffffffff +#define LOV_OFFSET_DEFAULT ((__u16)-1) +#define LMV_OFFSET_DEFAULT ((__u32)-1) + +#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17 +#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5 + +#define LOV_QOS_DEF_PRIO_FREE 90 +#define LMV_QOS_DEF_PRIO_FREE 90 + static inline bool lov_pattern_supported(__u32 pattern) { return (pattern & ~LOV_PATTERN_F_RELEASED) == LOV_PATTERN_RAID0 || @@ -939,9 +948,34 @@ enum lmv_hash_type { LMV_HASH_TYPE_MAX, }; +#define LMV_HASH_TYPE_DEFAULT LMV_HASH_TYPE_FNV_1A_64 + #define LMV_HASH_NAME_ALL_CHARS "all_char" #define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" +/* not real hash type, but exposed to user as "space" hash type */ +#define LMV_HASH_NAME_SPACE "space" + +/* Right now only the lower part(0-16bits) of lmv_hash_type is being used, + * and the higher part will be the flag to indicate the status of object, + * for example the object is being migrated. And the hash function + * might be interpreted differently with different flags. */ +#define LMV_HASH_TYPE_MASK 0x0000ffff + +static inline bool lmv_is_known_hash_type(__u32 type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; +} + +/* The striped directory has ever lost its master LMV EA, then LFSCK + * re-generated it. This flag is used to indicate such case. It is an + * on-disk flag. 
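/*
 * A minimal userspace sketch of how the lmv hash-type word handled in the
 * lustre_user.h changes here is interpreted: the low 16 bits
 * (LMV_HASH_TYPE_MASK) select the hash function, while the high bits carry
 * status flags such as LMV_HASH_FLAG_MIGRATION, so a directory that is
 * being migrated still reports a known hash function once the flag bits
 * are masked off.  The numeric hash-function values used below are
 * assumptions made for the example only.
 */
#include <stdio.h>

#define DEMO_HASH_TYPE_MASK	0x0000ffffU
#define DEMO_HASH_ALL_CHARS	1U	/* assumed numeric value */
#define DEMO_HASH_FNV_1A_64	2U	/* assumed numeric value */
#define DEMO_FLAG_MIGRATION	0x80000000U

/* mirrors lmv_is_known_hash_type(): compare only the masked function bits */
static int demo_is_known_hash_type(unsigned int type)
{
	unsigned int fn = type & DEMO_HASH_TYPE_MASK;

	return fn == DEMO_HASH_FNV_1A_64 || fn == DEMO_HASH_ALL_CHARS;
}

int main(void)
{
	unsigned int migrating = DEMO_HASH_FNV_1A_64 | DEMO_FLAG_MIGRATION;

	/* prints "known=1 migrating=1" */
	printf("known=%d migrating=%d\n",
	       demo_is_known_hash_type(migrating),
	       !!(migrating & DEMO_FLAG_MIGRATION));
	return 0;
}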
*/ +#define LMV_HASH_FLAG_LOST_LMV 0x10000000 + +#define LMV_HASH_FLAG_BAD_TYPE 0x20000000 +#define LMV_HASH_FLAG_MIGRATION 0x80000000 + extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; /* Got this according to how get LOV_MAX_STRIPE_COUNT, see above, @@ -949,18 +983,62 @@ extern char *mdt_hash_name[LMV_HASH_TYPE_MAX]; #define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ #define lmv_user_md lmv_user_md_v1 struct lmv_user_md_v1 { - __u32 lum_magic; /* must be the first field */ + __u32 lum_magic; /* must be the first field */ __u32 lum_stripe_count; /* dirstripe count */ __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ __u32 lum_hash_type; /* Dir stripe policy */ - __u32 lum_type; /* LMV type: default or normal */ - __u32 lum_padding1; + __u32 lum_type; /* LMV type: default */ + __u8 lum_max_inherit; /* inherit depth of default LMV */ + __u8 lum_max_inherit_rr; /* inherit depth of default LMV to round-robin mkdir */ + __u16 lum_padding1; __u32 lum_padding2; __u32 lum_padding3; char lum_pool_name[LOV_MAXPOOLNAME + 1]; struct lmv_user_mds_data lum_objects[0]; } __attribute__((packed)); +/* + * NB, historically default layout didn't set type, but use XATTR name to differ + * from normal layout, for backward compatibility, define LMV_TYPE_DEFAULT 0x0, + * and still use the same method. + */ +enum lmv_type { + LMV_TYPE_DEFAULT = 0x0000, +}; + +/* lum_max_inherit will be decreased by 1 after each inheritance if it's not + * LMV_INHERIT_UNLIMITED or > LMV_INHERIT_MAX. + */ +enum { + /* for historical reason, 0 means unlimited inheritance */ + LMV_INHERIT_UNLIMITED = 0, + /* unlimited lum_max_inherit by default */ + LMV_INHERIT_DEFAULT = 0, + /* not inherit any more */ + LMV_INHERIT_END = 1, + /* max inherit depth */ + LMV_INHERIT_MAX = 250, + /* [251, 254] are reserved */ + /* not set, or when inherit depth goes beyond end, */ + LMV_INHERIT_NONE = 255, +}; + +enum { + /* not set, or when inherit_rr depth goes beyond end, */ + LMV_INHERIT_RR_NONE = 0, + /* disable lum_max_inherit_rr by default */ + LMV_INHERIT_RR_DEFAULT = 0, + /* not inherit any more */ + LMV_INHERIT_RR_END = 1, + /* default inherit_rr of ROOT */ + LMV_INHERIT_RR_ROOT = 3, + /* max inherit depth */ + LMV_INHERIT_RR_MAX = 250, + /* [251, 254] are reserved */ + /* unlimited inheritance */ + LMV_INHERIT_RR_UNLIMITED = 255, +}; + static inline int lmv_user_md_size(int stripes, int lmm_magic) { int size = sizeof(struct lmv_user_md); diff --git a/drivers/staging/lustrefsx/lustre/llite/dcache.c b/drivers/staging/lustrefsx/lustre/llite/dcache.c index e0cd72b79e265..801cfc988b273 100644 --- a/drivers/staging/lustrefsx/lustre/llite/dcache.c +++ b/drivers/staging/lustrefsx/lustre/llite/dcache.c @@ -262,7 +262,10 @@ int ll_revalidate_it_finish(struct ptlrpc_request *request, struct lookup_intent *it, struct dentry *de) { - int rc = 0; + struct inode *inode = de->d_inode; + __u64 bits = 0; + int rc = 0; + ENTRY; if (!request) @@ -272,6 +275,18 @@ int ll_revalidate_it_finish(struct ptlrpc_request *request, RETURN(-ENOENT); rc = ll_prep_inode(&de->d_inode, request, NULL, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(inode)->ll_md_exp, inode, it, + &bits); + if (bits & MDS_INODELOCK_LOOKUP) { + ll_update_dir_depth(de->d_parent->d_inode, inode); + rc = ll_d_init(de); + if (rc < 0) + RETURN(rc); + d_lustre_revalidate(de); + } RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/llite/dir.c b/drivers/staging/lustrefsx/lustre/llite/dir.c index a6200132a22db..ce74bfb18dc57 100644 --- 
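/*
 * A small sketch of one way to read the lum_max_inherit rule stated in the
 * comment above: the depth counter drops by one for every level of
 * inheritance, except for LMV_INHERIT_UNLIMITED (0) and the reserved
 * values above LMV_INHERIT_MAX, and once it falls to LMV_INHERIT_END the
 * default layout is no longer passed further down.  This helper is an
 * illustration of that bookkeeping, not kernel code.
 */
#include <stdio.h>

enum {
	DEMO_INHERIT_UNLIMITED	= 0,
	DEMO_INHERIT_END	= 1,
	DEMO_INHERIT_MAX	= 250,
	DEMO_INHERIT_NONE	= 255,
};

/* value a subdirectory would record after one more level of inheritance */
static unsigned char demo_inherit_step(unsigned char max_inherit)
{
	if (max_inherit == DEMO_INHERIT_UNLIMITED ||
	    max_inherit > DEMO_INHERIT_MAX)
		return max_inherit;		/* left unchanged */
	if (max_inherit <= DEMO_INHERIT_END)
		return DEMO_INHERIT_NONE;	/* inheritance stops here */
	return max_inherit - 1;
}

int main(void)
{
	unsigned char v = 3;	/* default LMV inherited for 3 levels */
	int depth;

	for (depth = 1; depth <= 4; depth++) {
		v = demo_inherit_step(v);
		printf("level %d -> max_inherit %u\n", depth, v);
	}
	return 0;
}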
a/drivers/staging/lustrefsx/lustre/llite/dir.c +++ b/drivers/staging/lustrefsx/lustre/llite/dir.c @@ -60,29 +60,38 @@ #include "llite_internal.h" -static void ll_check_and_trigger_restore(struct inode *dir) +static int ll_check_and_trigger_restore(struct inode *dir) { struct ll_sb_info *sbi = ll_i2sbi(dir); + const int max_retry = atomic_read(&sbi->ll_dir_restore_max_retry_count); + int retry_count = 0; u32 hus_states; __u32 gen = 0; int rc; - if (!(sbi && (sbi->ll_flags & LL_SBI_MDLL))) - return; + /* Skip restore if server does not support or if disabled */ + if (!exp_mdll(sbi->ll_md_exp) || exp_bypass_mdll(sbi->ll_md_exp)) + return 0; + /* * TODO-MDLL: * use API that does a cached read instead of * going to the mdt for getting the hsm state. * Tracked with Simba-21644 */ +try_again: rc = ll_get_hsm_state(dir, &hus_states); if (rc == 0 && (hus_states & HS_RELEASED)) { - CDEBUG(D_HSM, "MDLL Calling ll_layout_restore for dir "DFID"\n", - PFID(ll_inode2fid(dir))); + CDEBUG(D_HSM, + "MDLL Calling ll_layout_restore for dir "DFID" retry: %d" + "\n", PFID(ll_inode2fid(dir)), retry_count); rc = ll_layout_restore(dir, 0, OBD_OBJECT_EOF); if (rc) { CERROR("MDLL ll_layout_restore ("DFID") error rc: %d\n", PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + if (max_retry == 0) + goto out_exit; } else { CDEBUG(D_HSM, "MDLL Restore triggered for dir "DFID"\n", PFID(ll_inode2fid(dir))); @@ -90,7 +99,46 @@ static void ll_check_and_trigger_restore(struct inode *dir) CDEBUG(D_HSM, "MDLL Restore done for dir "DFID"\n", PFID(ll_inode2fid(dir))); } + /* If the max_retry is set to 0, then the behavior would be + * without a retry. There wont be any check for the hsm state + * after the completed restore. This case would be similar to + * the behaviour without this retry changes. The default + * value of the max_retry would be 1. + * A value of -1 would retry indefinitely. + */ + /* In case of an mdt restart, the ll_layout_refresh would + * return back only after the mdt has restarted and the + * existing network connection gets a reset. When the retry + * happens, the mdt would be up and running. + * Ideally the directory restore would be done with a single + * retry if the mdt does not crash/restart again. 
+ */ + if ((max_retry < 0) || + (max_retry >= 0 && retry_count < max_retry)) { + retry_count++; + goto try_again; + } else if (max_retry > 0 && retry_count >= max_retry) { + rc = ll_get_hsm_state(dir, &hus_states); + if (rc == 0 && (hus_states & HS_RELEASED)) { + CDEBUG(D_HSM, + "MDLL reached max retry %d for ("DFID")" + "hsm_state: %d\n", + retry_count, PFID(ll_inode2fid(dir)), + hus_states); + rc = -EAGAIN; + goto out_exit; + } + } } + if (rc != 0) { + CDEBUG(D_HSM, + "MDLL error calling ll_get_hsm_state for dir "DFID" rc: " + "%d\n", PFID(ll_inode2fid(dir)), rc); + rc = -EAGAIN; + } + +out_exit: + return rc; } /* @@ -181,7 +229,9 @@ struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, struct page *page; int rc; - ll_check_and_trigger_restore(dir); + rc = ll_check_and_trigger_restore(dir); + if (rc != 0) + return ERR_PTR(rc); cb_op.md_blocking_ast = ll_md_blocking_ast; rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); @@ -198,8 +248,7 @@ void ll_release_page(struct inode *inode, struct page *page, /* Always remove the page for striped dir, because the page is * built from temporarily in LMV layer */ - if (inode != NULL && S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md != NULL) { + if (inode && ll_dir_striped(inode)) { __free_page(page); return; } @@ -376,7 +425,7 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) */ GOTO(out, rc = 0); - if (unlikely(ll_i2info(inode)->lli_lsm_md != NULL)) { + if (unlikely(ll_dir_striped(inode))) { /* * This is only needed for striped dir to fill .., * see lmv_read_page() @@ -521,6 +570,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); + op_data->op_dir_depth = ll_i2info(parent)->lli_dir_depth; + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { /* selinux_dentry_init_security() uses dentry->d_parent and name * to determine the security context for the file. 
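/*
 * A minimal sketch of the retry policy used by the directory-restore path
 * above: a tunable value of -1 keeps retrying for as long as the directory
 * is still released, 0 means a single attempt with no retry, and a
 * positive value N allows at most N retries.  Only the counting is
 * modelled here; the HSM state probe is simulated with a flag.
 */
#include <stdio.h>
#include <stdbool.h>

/* mirrors the "(max_retry < 0) || (retry_count < max_retry)" test above */
static bool demo_should_retry(int max_retry, int retry_count)
{
	return max_retry < 0 || retry_count < max_retry;
}

int main(void)
{
	int max_retry = 2;		/* e.g. the value of the lproc tunable */
	int retry_count = 0;
	bool still_released = true;	/* pretend the restore keeps failing */

	while (still_released && demo_should_retry(max_retry, retry_count)) {
		retry_count++;
		printf("restore retry %d of at most %d\n",
		       retry_count, max_retry);
	}
	printf("gave up after %d retries\n", retry_count);
	return 0;
}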
So our fake @@ -700,27 +751,23 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, RETURN(rc); } -static int ll_dir_get_default_layout(struct inode *inode, void **plmm, - int *plmm_size, - struct ptlrpc_request **request, u64 valid, - enum get_default_layout_type type) +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct mdt_body *body; struct lov_mds_md *lmm = NULL; struct ptlrpc_request *req = NULL; - int rc, lmm_size; + int lmm_size = OBD_MAX_DEFAULT_EA_SIZE; struct md_op_data *op_data; struct lu_fid fid; - ENTRY; + int rc; - rc = ll_get_default_mdsize(sbi, &lmm_size); - if (rc) - RETURN(rc); + ENTRY; - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, lmm_size, LUSTRE_OPC_ANY, - NULL); + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, lmm_size, + LUSTRE_OPC_ANY, NULL); if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); @@ -813,7 +860,7 @@ int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size, rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, &req, valid, 0); if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) && - !(valid & (OBD_MD_MEA|OBD_MD_DEFAULT_MEA)) && root_request != NULL){ + !(valid & OBD_MD_MEA) && root_request != NULL) { int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size, &root_req, valid, GET_DEFAULT_LAYOUT_ROOT); @@ -948,6 +995,11 @@ static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) /* Store in the hsm_copy for later copytool use. * Always modified even if no lsm. */ copy->hc_data_version = data_version; + + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; } progress: @@ -1054,6 +1106,10 @@ static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) GOTO(progress, rc); } + } else if (copy->hc_hai.hai_action == HSMA_IMPORT) { + + /* IMPORT sends its progress using alloc fid when possible */ + hpk.hpk_fid = copy->hc_hai.hai_dfid; } progress: @@ -1547,7 +1603,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1))) RETURN(-EFAULT); - if (inode->i_sb->s_root == file_dentry(file)) + if (is_root_inode(inode)) set_default = 1; switch (lumv1.lmm_magic) { @@ -1617,6 +1673,59 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (lmmsize > sizeof(*ulmv)) GOTO(finish_req, rc = -EINVAL); + if (root_request != NULL) { + struct lmv_user_md *lum; + struct ll_inode_info *lli; + + lum = (struct lmv_user_md *)lmm; + lli = ll_i2info(inode); + if (lum->lum_max_inherit != + LMV_INHERIT_UNLIMITED) { + if (lum->lum_max_inherit == + LMV_INHERIT_NONE || + lum->lum_max_inherit < + LMV_INHERIT_END || + lum->lum_max_inherit > + LMV_INHERIT_MAX || + lum->lum_max_inherit < + lli->lli_dir_depth) + GOTO(finish_req, rc = -ENODATA); + + if (lum->lum_max_inherit == + lli->lli_dir_depth) { + lum->lum_max_inherit = + LMV_INHERIT_NONE; + lum->lum_max_inherit_rr = + LMV_INHERIT_RR_NONE; + goto out_copy; + } + + lum->lum_max_inherit -= + lli->lli_dir_depth; + } + + if (lum->lum_max_inherit_rr != + LMV_INHERIT_RR_UNLIMITED) { + if (lum->lum_max_inherit_rr == + LMV_INHERIT_NONE || + lum->lum_max_inherit_rr < + LMV_INHERIT_RR_END || + lum->lum_max_inherit_rr > + LMV_INHERIT_RR_MAX || + lum->lum_max_inherit_rr 
<= + lli->lli_dir_depth) { + lum->lum_max_inherit_rr = + LMV_INHERIT_RR_NONE; + goto out_copy; + } + + if (lum->lum_max_inherit_rr > + lli->lli_dir_depth) + lum->lum_max_inherit_rr -= + lli->lli_dir_depth; + } + } +out_copy: if (copy_to_user(ulmv, lmm, lmmsize)) GOTO(finish_req, rc = -EFAULT); diff --git a/drivers/staging/lustrefsx/lustre/llite/file.c b/drivers/staging/lustrefsx/lustre/llite/file.c index a37308caf619d..7c1a3f0741cc7 100644 --- a/drivers/staging/lustrefsx/lustre/llite/file.c +++ b/drivers/staging/lustrefsx/lustre/llite/file.c @@ -370,8 +370,8 @@ int ll_file_release(struct inode *inode, struct file *file) if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) ll_deauthorize_statahead(inode, fd); - if (inode->i_sb->s_root == file_dentry(file)) { - LUSTRE_FPRIVATE(file) = NULL; + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = NULL; ll_file_data_put(fd); RETURN(0); } @@ -610,8 +610,10 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, * of kernel will deal with that later. */ ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); - if (bits & MDS_INODELOCK_LOOKUP) + if (bits & MDS_INODELOCK_LOOKUP) { d_lustre_revalidate(de); + ll_update_dir_depth(parent->d_inode, de->d_inode); + } /* if DoM bit returned along with LAYOUT bit then there * can be read-on-open data returned. @@ -719,10 +721,10 @@ int ll_file_open(struct inode *inode, struct file *file) if (S_ISDIR(inode->i_mode)) ll_authorize_statahead(inode, fd); - if (inode->i_sb->s_root == file_dentry(file)) { - LUSTRE_FPRIVATE(file) = fd; + if (is_root_inode(inode)) { + LUSTRE_FPRIVATE(file) = fd; RETURN(0); - } + } if (!it || !it->it_disposition) { CDEBUG(D_HSM, "MDLL file->f_flags=0x%x/0%o\n", @@ -2315,26 +2317,26 @@ static int ll_put_grouplock(struct inode *inode, struct file *file, */ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) { - struct inode *inode = dentry->d_inode; - struct obd_client_handle *och; - int rc; - ENTRY; + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; - LASSERT(inode); + LASSERT(inode); - /* Root ? Do nothing. */ - if (dentry->d_inode->i_sb->s_root == dentry) - RETURN(0); + /* Root ? Do nothing. */ + if (is_root_inode(inode)) + RETURN(0); - /* No open handle to close? Move away */ - if (!it_disposition(it, DISP_OPEN_OPEN)) - RETURN(0); + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); - LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); - OBD_ALLOC(och, sizeof(*och)); - if (!och) - GOTO(out, rc = -ENOMEM); + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); if (rc) @@ -4259,7 +4261,7 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & OBD_CONNECT2_DIR_MIGRATE)) { if (le32_to_cpu(lum->lum_stripe_count) > 1 || - ll_i2info(child_inode)->lli_lsm_md) { + ll_dir_striped(child_inode)) { CERROR("%s: MDT doesn't support stripe directory " "migration!\n", ll_get_fsname(parent->i_sb, NULL, 0)); @@ -4272,7 +4274,7 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, * by checking the migrate FID against the FID of the * filesystem root. 
*/ - if (child_inode == parent->i_sb->s_root->d_inode) + if (is_root_inode(child_inode)) GOTO(out_iput, rc = -EINVAL); op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, @@ -4445,8 +4447,7 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) /* If it is striped directory, and there is bad stripe * Let's revalidate the dentry again, instead of returning * error */ - if (S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md != NULL) + if (ll_dir_striped(inode)) return 0; /* This path cannot be hit for regular files unless in @@ -4535,6 +4536,10 @@ static int ll_merge_md_attr(struct inode *inode) int rc; LASSERT(lli->lli_lsm_md != NULL); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + RETURN(0); + down_read(&lli->lli_lsm_sem); rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, &attr, ll_md_blocking_ast); @@ -4600,8 +4605,7 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) } } else { /* If object isn't regular a file then don't validate size. */ - if (S_ISDIR(inode->i_mode) && - lli->lli_lsm_md != NULL) { + if (ll_dir_striped(inode)) { rc = ll_merge_md_attr(inode); if (rc < 0) RETURN(rc); @@ -4831,7 +4835,7 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) /* as root inode are NOT getting validated in lookup operation, * need to do it before permission check. */ - if (inode == inode->i_sb->s_root->d_inode) { + if (is_root_inode(inode)) { rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP); if (rc) RETURN(rc); diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h index 83f0e616d83c5..ab9c99eb6139e 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_internal.h +++ b/drivers/staging/lustrefsx/lustre/llite/llite_internal.h @@ -158,24 +158,22 @@ struct ll_inode_info { /* "opendir_pid" is the token when lookup/revalid * -- I am the owner of dir statahead. */ pid_t lli_opendir_pid; + /* directory depth to ROOT */ + unsigned short lli_dir_depth; /* stat will try to access statahead entries or start * statahead if this flag is set, and this flag will be * set upon dir open, and cleared when dir is closed, * statahead hit ratio is too low, or start statahead * thread failed. */ - unsigned int lli_sa_enabled:1; + unsigned short lli_sa_enabled:1; /* generation for statahead */ unsigned int lli_sa_generation; /* rw lock protects lli_lsm_md */ struct rw_semaphore lli_lsm_sem; /* directory stripe information */ struct lmv_stripe_md *lli_lsm_md; - /* default directory stripe offset. This is extracted - * from the "dmv" xattr in order to decide which MDT to - * create a subdirectory on. The MDS itself fetches - * "dmv" and gets the rest of the default layout itself - * (count, hash, etc). */ - __u32 lli_def_stripe_offset; + /* directory default LMV */ + struct lmv_stripe_md *lli_default_lsm_md; }; /* for non-directory */ @@ -469,6 +467,7 @@ enum stats_track_type { #define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ #define LL_SBI_MDLL_AUTO_REFRESH 0x10000000 /* enable metadata lazy load */ #define LL_SBI_MDLL 0x20000000 /* enable metadata lazy load auto-refresh */ +#define LL_SBI_MDLL_BYPASS 0x40000000 /* disable metadata lazy load auto-refresh */ #define LL_SBI_FLAGS { \ "nolck", \ @@ -594,6 +593,16 @@ struct ll_sb_info { */ int ll_neg_dentry_timeout; + /* + * MDLL directory restore retry count + * This would determine the number of times the restore would be + * retried before returning error to the client. 
The retry would + * be based on the released bit of the directory. + * A value of -1 would retry indefinitely. + */ +#define LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT 1 + atomic_t ll_dir_restore_max_retry_count; + struct kset ll_kset; /* sysfs object */ struct completion ll_kobj_unregister; }; @@ -955,6 +964,9 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); +int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid, + enum get_default_layout_type type); int ll_dir_getstripe_default(struct inode *inode, void **lmmp, int *lmm_size, struct ptlrpc_request **request, struct ptlrpc_request **root_request, u64 valid); @@ -1006,6 +1018,7 @@ int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, u32 flags); int ll_update_inode(struct inode *inode, struct lustre_md *md); void ll_update_inode_flags(struct inode *inode, int ext_flags); +void ll_update_dir_depth(struct inode *dir, struct inode *inode); int ll_read_inode2(struct inode *inode, void *opaque); void ll_delete_inode(struct inode *inode); int ll_iocontrol(struct inode *inode, struct file *file, @@ -1026,19 +1039,12 @@ int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); -enum { - LUSTRE_OPC_MKDIR = 0, - LUSTRE_OPC_SYMLINK = 1, - LUSTRE_OPC_MKNOD = 2, - LUSTRE_OPC_CREATE = 3, - LUSTRE_OPC_ANY = 5, -}; - void ll_unlock_md_op_lsm(struct md_op_data *op_data); struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, const char *name, size_t namelen, - __u32 mode, __u32 opc, void *data); + __u32 mode, enum md_op_code opc, + void *data); void ll_finish_md_op_data(struct md_op_data *op_data); int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); @@ -1216,6 +1222,13 @@ static inline struct lu_fid *ll_inode2fid(struct inode *inode) return fid; } +static inline bool ll_dir_striped(struct inode *inode) +{ + LASSERT(inode); + return S_ISDIR(inode->i_mode) && + lmv_dir_striped(ll_i2info(inode)->lli_lsm_md); +} + static inline loff_t ll_file_maxbytes(struct inode *inode) { struct cl_object *obj = ll_i2info(inode)->lli_clob; diff --git a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c index c10a3ebfabafe..d030796c71a0f 100644 --- a/drivers/staging/lustrefsx/lustre/llite/llite_lib.c +++ b/drivers/staging/lustrefsx/lustre/llite/llite_lib.c @@ -148,6 +148,9 @@ static struct ll_sb_info *ll_init_sbi(void) INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); init_rwsem(&sbi->ll_squash.rsi_sem); + atomic_set(&sbi->ll_dir_restore_max_retry_count, + LL_MDLL_DIR_RESTORE_DEF_RETRY_COUNT); + RETURN(sbi); } @@ -234,16 +237,15 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT2_ARCHIVE_ID_ARRAY | OBD_CONNECT2_LSOM | OBD_CONNECT2_ASYNC_DISCARD | - OBD_CONNECT2_GETATTR_PFID; - if (sbi->ll_flags & LL_SBI_MDLL) - data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; - - if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) - data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; + OBD_CONNECT2_GETATTR_PFID | + OBD_CONNECT2_MDLL; if (sbi->ll_flags & LL_SBI_MDLL) 
data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL; + if (sbi->ll_flags & LL_SBI_MDLL_BYPASS) + data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_BYPASS; + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) data->ocd_connect_flags2 |= OBD_CONNECT2_MDLL_AUTO_REFRESH; @@ -857,6 +859,11 @@ static int ll_options(char *options, struct ll_sb_info *sbi) *flags = (*flags & ~LL_SBI_MDLL_AUTO_REFRESH) | tmp; goto next; } + tmp = ll_set_opt("mdll_bypass", s1, LL_SBI_MDLL_BYPASS); + if (tmp) { + *flags = (*flags & ~LL_SBI_MDLL_BYPASS) | tmp; + goto next; + } tmp = ll_set_opt("mdll", s1, LL_SBI_MDLL); if (tmp) { *flags = (*flags & ~LL_SBI_MDLL) | tmp; @@ -997,7 +1004,6 @@ void ll_lli_init(struct ll_inode_info *lli) spin_lock_init(&lli->lli_sa_lock); lli->lli_opendir_pid = 0; lli->lli_sa_enabled = 0; - lli->lli_def_stripe_offset = -1; init_rwsem(&lli->lli_lsm_sem); } else { mutex_init(&lli->lli_size_mutex); @@ -1300,10 +1306,15 @@ void ll_dir_clear_lsm_md(struct inode *inode) LASSERT(S_ISDIR(inode->i_mode)); - if (lli->lli_lsm_md != NULL) { + if (lli->lli_lsm_md) { lmv_free_memmd(lli->lli_lsm_md); lli->lli_lsm_md = NULL; } + + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } } static struct inode *ll_iget_anon_dir(struct super_block *sb, @@ -1373,6 +1384,9 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) PFID(&lli->lli_fid)); lsm_md_dump(D_INODE, lsm); + if (!lmv_dir_striped(lsm)) + goto out; + /* XXX sigh, this lsm_root initialization should be in * LMV layer, but it needs ll_iget right now, so we * put this here right now. */ @@ -1400,17 +1414,56 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) return rc; } } - +out: lli->lli_lsm_md = lsm; return 0; } +static void ll_update_default_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + ENTRY; + + if (!md->default_lmv) { + /* clear default lsm */ + if (lli->lli_default_lsm_md) { + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) { + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + } + up_write(&lli->lli_lsm_sem); + } + RETURN_EXIT; + } + + if (lli->lli_default_lsm_md) { + /* do nonthing if default lsm isn't changed */ + down_read(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md && + lsm_md_eq(lli->lli_default_lsm_md, md->default_lmv)) { + up_read(&lli->lli_lsm_sem); + RETURN_EXIT; + } + up_read(&lli->lli_lsm_sem); + } + + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + lmv_free_memmd(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = md->default_lmv; + lsm_md_dump(D_INODE, md->default_lmv); + md->default_lmv = NULL; + up_write(&lli->lli_lsm_sem); + RETURN_EXIT; +} + static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); struct lmv_stripe_md *lsm = md->lmv; - struct cl_attr *attr; int rc = 0; ENTRY; @@ -1419,6 +1472,10 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, PFID(ll_inode2fid(inode))); + /* update default LMV */ + if (md->default_lmv) + ll_update_default_lsm_md(inode, md); + /* * no striped information from request, lustre_md from req does not * include stripeEA, see ll_md_setattr() @@ -1440,47 +1497,58 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) * which means layout is changed, this happens in dir split/merge and * lfsck. 
*/ - if (lli->lli_lsm_md && - lsm->lsm_md_layout_version <= - lli->lli_lsm_md->lsm_md_layout_version) { - CERROR("%s: "DFID" dir layout mismatch:\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid)); - lsm_md_dump(D_ERROR, lli->lli_lsm_md); - lsm_md_dump(D_ERROR, lsm); - GOTO(unlock, rc = -EINVAL); - } - - up_read(&lli->lli_lsm_sem); - down_write(&lli->lli_lsm_sem); - /* clear existing lsm */ - if (lli->lli_lsm_md) { - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - } + if (lli->lli_lsm_md && !lsm_md_eq(lli->lli_lsm_md, lsm)) { + if (lmv_dir_striped(lli->lli_lsm_md) && + lsm->lsm_md_layout_version <= + lli->lli_lsm_md->lsm_md_layout_version) { + CERROR("%s: "DFID" dir layout mismatch:\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid)); + lsm_md_dump(D_ERROR, lli->lli_lsm_md); + lsm_md_dump(D_ERROR, lsm); + GOTO(unlock, rc = -EINVAL); + } - rc = ll_init_lsm_md(inode, md); - up_write(&lli->lli_lsm_sem); - if (rc) - RETURN(rc); - - /* set md->lmv to NULL, so the following free lustre_md will not free - * this lsm. - */ - md->lmv = NULL; + /* layout changed, switch to write lock */ + up_read(&lli->lli_lsm_sem); + down_write(&lli->lli_lsm_sem); + ll_dir_clear_lsm_md(inode); + } + + /* set directory layout */ + if (!lli->lli_lsm_md) { + struct cl_attr *attr; + + rc = ll_init_lsm_md(inode, md); + up_write(&lli->lli_lsm_sem); + if (rc != 0) + RETURN(rc); + + /* set md->lmv to NULL, so the following free lustre_md + * will not free this lsm */ + md->lmv = NULL; + + /* + * md_merge_attr() may take long, since lsm is already set, + * switch to read lock. + */ + down_read(&lli->lli_lsm_sem); + + if (!lmv_dir_striped(lli->lli_lsm_md)) + GOTO(unlock, rc); + + OBD_ALLOC_PTR(attr); + if (attr == NULL) + GOTO(unlock, rc = -ENOMEM); + + /* validate the lsm */ + rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, + ll_md_blocking_ast); + if (rc != 0) { + OBD_FREE_PTR(attr); + GOTO(unlock, rc); + } - /* md_merge_attr() may take long, since lsm is already set, switch to - * read lock. - */ - down_read(&lli->lli_lsm_sem); - OBD_ALLOC_PTR(attr); - if (!attr) - GOTO(unlock, rc = -ENOMEM); - - /* validate the lsm */ - rc = md_merge_attr(ll_i2mdexp(inode), lli->lli_lsm_md, attr, - ll_md_blocking_ast); - if (!rc) { if (md->body->mbo_valid & OBD_MD_FLNLINK) md->body->mbo_nlink = attr->cat_nlink; if (md->body->mbo_valid & OBD_MD_FLSIZE) @@ -1491,10 +1559,10 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) md->body->mbo_ctime = attr->cat_ctime; if (md->body->mbo_valid & OBD_MD_FLMTIME) md->body->mbo_mtime = attr->cat_mtime; + + OBD_FREE_PTR(attr); } - OBD_FREE_PTR(attr); - GOTO(unlock, rc); unlock: up_read(&lli->lli_lsm_sem); @@ -2078,6 +2146,23 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) return 0; } +/* update directory depth to ROOT, called after LOOKUP lock is fetched. */ +void ll_update_dir_depth(struct inode *dir, struct inode *inode) +{ + struct ll_inode_info *lli; + + if (!S_ISDIR(inode->i_mode)) + return; + + if (inode == dir) + return; + + lli = ll_i2info(inode); + lli->lli_dir_depth = ll_i2info(dir)->lli_dir_depth + 1; + CDEBUG(D_INODE, DFID" depth %hu\n", + PFID(&lli->lli_fid), lli->lli_dir_depth); +} + int ll_read_inode2(struct inode *inode, void *opaque) { struct lustre_md *md = opaque; @@ -2413,12 +2498,49 @@ void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) EXIT; } +/* set filesystem-wide default LMV for subdir mount if it's enabled on ROOT. 
*/ +static int ll_fileset_default_lmv_fixup(struct inode *inode, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + union lmv_mds_md *lmm = NULL; + int size = 0; + int rc; + + LASSERT(is_root_inode(inode)); + LASSERT(!fid_is_root(&sbi->ll_root_fid)); + LASSERT(!md->default_lmv); + + rc = ll_dir_get_default_layout(inode, (void **)&lmm, &size, &req, + OBD_MD_DEFAULT_MEA, + GET_DEFAULT_LAYOUT_ROOT); + if (rc && rc != -ENODATA) + GOTO(out, rc); + + rc = 0; + if (lmm && size) { + rc = md_unpackmd(sbi->ll_md_exp, &md->default_lmv, lmm, size); + if (rc < 0) + GOTO(out, rc); + + rc = 0; + } + EXIT; +out: + if (req) + ptlrpc_req_finished(req); + return rc; +} + int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, struct super_block *sb, struct lookup_intent *it) { struct ll_sb_info *sbi = NULL; struct lustre_md md = { NULL }; + bool default_lmv_deleted = false; int rc; + ENTRY; LASSERT(*inode || sb); @@ -2428,6 +2550,24 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, if (rc != 0) GOTO(cleanup, rc); + /* + * clear default_lmv only if intent_getattr reply doesn't contain it. + * but it needs to be done after iget, check this early because + * ll_update_lsm_md() may change md. + */ + if (it && (it->it_op & (IT_LOOKUP | IT_GETATTR)) && + S_ISDIR(md.body->mbo_mode) && !md.default_lmv) { + if (unlikely(*inode && is_root_inode(*inode) && + !fid_is_root(&sbi->ll_root_fid))) { + rc = ll_fileset_default_lmv_fixup(*inode, &md); + if (rc) + GOTO(out, rc); + } + + if (!md.default_lmv) + default_lmv_deleted = true; + } + if (*inode) { rc = ll_update_inode(*inode, &md); if (rc != 0) @@ -2491,6 +2631,9 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, LDLM_LOCK_PUT(lock); } + if (default_lmv_deleted) + ll_update_default_lsm_md(*inode, &md); + GOTO(out, rc = 0); out: @@ -2569,7 +2712,8 @@ void ll_unlock_md_op_lsm(struct md_op_data *op_data) struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, const char *name, size_t namelen, - __u32 mode, __u32 opc, void *data) + __u32 mode, enum md_op_code opc, + void *data) { LASSERT(i1 != NULL); @@ -2595,15 +2739,13 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, ll_i2gids(op_data->op_suppgids, i1, i2); op_data->op_fid1 = *ll_inode2fid(i1); - op_data->op_default_stripe_offset = -1; + op_data->op_code = opc; if (S_ISDIR(i1->i_mode)) { down_read(&ll_i2info(i1)->lli_lsm_sem); op_data->op_mea1_sem = &ll_i2info(i1)->lli_lsm_sem; op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; - if (opc == LUSTRE_OPC_MKDIR) - op_data->op_default_stripe_offset = - ll_i2info(i1)->lli_def_stripe_offset; + op_data->op_default_mea1 = ll_i2info(i1)->lli_default_lsm_md; } if (i2) { @@ -2696,6 +2838,9 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbi->ll_flags & LL_SBI_MDLL) seq_puts(seq, ",mdll"); + if (sbi->ll_flags & LL_SBI_MDLL_BYPASS) + seq_puts(seq, ",mdll_bypass"); + if (sbi->ll_flags & LL_SBI_MDLL_AUTO_REFRESH) seq_puts(seq, ",mdll_auto_refresh"); diff --git a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c index 10f058b8256dd..b2c0e28dd658a 100644 --- a/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustrefsx/lustre/llite/lproc_llite.c @@ -1285,6 +1285,50 @@ static ssize_t ll_nosquash_nids_seq_write(struct file *file, LDEBUGFS_SEQ_FOPS(ll_nosquash_nids); +static int 
ll_mdll_dir_restore_max_retry_count_seq_show(struct seq_file *m, + void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%d\n", + atomic_read(&sbi->ll_dir_restore_max_retry_count)); + + return 0; +} + +static ssize_t +ll_mdll_dir_restore_max_retry_count_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + /* + * Right now there is no limitation set on the retry count. + * This is done as we dont know what the right max limit + * would be. The max value would depend on the number of + * files in the directory that is being restored and as well + * if the mdt keeps restarting. The client calls are + * interruptible and can be used to break from long retries. + */ + if (val < -1) + return -EINVAL; + + atomic_set(&sbi->ll_dir_restore_max_retry_count, val); + + return count; +} + +LDEBUGFS_SEQ_FOPS(ll_mdll_dir_restore_max_retry_count); + struct ldebugfs_vars lprocfs_llite_obd_vars[] = { { .name = "site", .fops = &ll_site_stats_fops }, @@ -1306,6 +1350,8 @@ struct ldebugfs_vars lprocfs_llite_obd_vars[] = { .fops = &ll_root_squash_fops }, { .name = "nosquash_nids", .fops = &ll_nosquash_nids_fops }, + { .name = "mdll_dir_restore_max_retry_count", + .fops = &ll_mdll_dir_restore_max_retry_count_fops }, { NULL } }; diff --git a/drivers/staging/lustrefsx/lustre/llite/namei.c b/drivers/staging/lustrefsx/lustre/llite/namei.c index 61349f5cb65e5..3856943101ab4 100644 --- a/drivers/staging/lustrefsx/lustre/llite/namei.c +++ b/drivers/staging/lustrefsx/lustre/llite/namei.c @@ -223,6 +223,7 @@ static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) { struct inode *inode = ll_inode_from_resource_lock(lock); + struct ll_inode_info *lli; __u64 bits = to_cancel; int rc; @@ -249,8 +250,6 @@ static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) } if (bits & MDS_INODELOCK_XATTR) { - if (S_ISDIR(inode->i_mode)) - ll_i2info(inode)->lli_def_stripe_offset = -1; ll_xattr_cache_destroy(inode); bits &= ~MDS_INODELOCK_XATTR; } @@ -309,15 +308,12 @@ static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) PFID(ll_inode2fid(inode)), rc); } - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); + lli = ll_i2info(inode); + if (bits & MDS_INODELOCK_UPDATE) lli->lli_update_atime = 1; - } if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); - CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " "pfid = "DFID"\n", PFID(ll_inode2fid(inode)), lli, PFID(&lli->lli_pfid)); @@ -370,8 +366,8 @@ static void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && inode->i_sb->s_root != NULL && - inode != inode->i_sb->s_root->d_inode) - ll_invalidate_aliases(inode); + !is_root_inode(inode)) + ll_invalidate_aliases(inode); if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) forget_all_cached_acls(inode); @@ -701,8 +697,10 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, if (!it_disposition(it, DISP_LOOKUP_NEG)) { /* we have lookup look - unhide dentry */ - if (bits & MDS_INODELOCK_LOOKUP) + if (bits & MDS_INODELOCK_LOOKUP) { 
d_lustre_revalidate(*de); + ll_update_dir_depth(parent, (*de)->d_inode); + } } else if (!it_disposition(it, DISP_OPEN_CREATE)) { /* * If file was created on the server, the dentry is revalidated @@ -715,7 +713,7 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, struct lu_fid fid = ll_i2info(parent)->lli_fid; /* If it is striped directory, get the real stripe parent */ - if (unlikely(ll_i2info(parent)->lli_lsm_md != NULL)) { + if (unlikely(ll_dir_striped(parent))) { rc = md_get_fid_from_lsm(ll_i2mdexp(parent), ll_i2info(parent)->lli_lsm_md, (*de)->d_name.name, @@ -1252,8 +1250,10 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, } ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); - if (bits & MDS_INODELOCK_LOOKUP) + if (bits & MDS_INODELOCK_LOOKUP) { d_lustre_revalidate(dentry); + ll_update_dir_depth(dir, inode); + } RETURN(0); } @@ -1278,6 +1278,58 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode) inode->i_ctime.tv_sec = body->mbo_ctime; } +/* once default LMV (space balanced) is set on ROOT, it should take effect if + * default LMV is not set on parent directory. + */ +static void ll_qos_mkdir_prep(struct md_op_data *op_data, struct inode *dir) +{ + struct inode *root = dir->i_sb->s_root->d_inode; + struct ll_inode_info *rlli = ll_i2info(root); + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_stripe_md *lsm; + + op_data->op_dir_depth = lli->lli_dir_depth; + + /* parent directory is striped */ + if (unlikely(lli->lli_lsm_md)) + return; + + /* default LMV set on parent directory */ + if (unlikely(lli->lli_default_lsm_md)) + return; + + /* parent is ROOT */ + if (unlikely(dir == root)) + return; + + /* default LMV not set on ROOT */ + if (!rlli->lli_default_lsm_md) + return; + + down_read(&rlli->lli_lsm_sem); + lsm = rlli->lli_default_lsm_md; + if (!lsm) + goto unlock; + + /* not space balanced */ + if (lsm->lsm_md_master_mdt_index != LMV_OFFSET_DEFAULT) + goto unlock; + + if (lsm->lsm_md_max_inherit != LMV_INHERIT_NONE && + (lsm->lsm_md_max_inherit == LMV_INHERIT_UNLIMITED || + lsm->lsm_md_max_inherit >= lli->lli_dir_depth)) { + op_data->op_flags |= MF_QOS_MKDIR; + if (lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE && + (lsm->lsm_md_max_inherit_rr == LMV_INHERIT_RR_UNLIMITED || + lsm->lsm_md_max_inherit_rr >= lli->lli_dir_depth)) + op_data->op_flags |= MF_RR_MKDIR; + CDEBUG(D_INODE, DFID" requests qos mkdir %#x\n", + PFID(&lli->lli_fid), op_data->op_flags); + } +unlock: + up_read(&rlli->lli_lsm_sem); +} + static int ll_new_node(struct inode *dir, struct dentry *dchild, const char *tgt, umode_t mode, int rdev, __u32 opc) { @@ -1299,6 +1351,9 @@ static int ll_new_node(struct inode *dir, struct dentry *dchild, if (IS_ERR(op_data)) GOTO(err_exit, err = PTR_ERR(op_data)); + if (S_ISDIR(mode)) + ll_qos_mkdir_prep(op_data, dir); + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { err = ll_dentry_init_security(dchild, mode, &dchild->d_name, &op_data->op_file_secctx_name, @@ -1312,13 +1367,11 @@ static int ll_new_node(struct inode *dir, struct dentry *dchild, from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid()), cfs_curproc_cap_pack(), rdev, &request); - if (err < 0 && err != -EREMOTE) - GOTO(err_exit, err); - - /* If the client doesn't know where to create a subdirectory (or - * in case of a race that sends the RPC to the wrong MDS), the - * MDS will return -EREMOTE and the client will fetch the layout - * of the directory, then create the directory on the right MDT. 
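/*
 * A minimal sketch of the eligibility test in ll_qos_mkdir_prep() above:
 * when ROOT carries a space-balanced default LMV, a new subdirectory is
 * flagged for QoS (and possibly round-robin) MDT selection only while the
 * ROOT max-inherit depth still covers the parent's distance from ROOT.
 * The LMV_INHERIT_* constants and the flag bit positions mirror values
 * added elsewhere in this patch; the demo_* names are local to the
 * example.
 */
#include <stdio.h>

#define DEMO_INHERIT_NONE		255
#define DEMO_INHERIT_UNLIMITED		0
#define DEMO_INHERIT_RR_NONE		0
#define DEMO_INHERIT_RR_UNLIMITED	255

#define DEMO_MF_QOS_MKDIR	(1U << 6)
#define DEMO_MF_RR_MKDIR	(1U << 7)

static unsigned int demo_mkdir_flags(unsigned char max_inherit,
				     unsigned char max_inherit_rr,
				     unsigned short dir_depth)
{
	unsigned int flags = 0;

	if (max_inherit != DEMO_INHERIT_NONE &&
	    (max_inherit == DEMO_INHERIT_UNLIMITED ||
	     max_inherit >= dir_depth)) {
		flags |= DEMO_MF_QOS_MKDIR;
		if (max_inherit_rr != DEMO_INHERIT_RR_NONE &&
		    (max_inherit_rr == DEMO_INHERIT_RR_UNLIMITED ||
		     max_inherit_rr >= dir_depth))
			flags |= DEMO_MF_RR_MKDIR;
	}
	return flags;
}

int main(void)
{
	/* ROOT default: QoS mkdir inherited for 3 levels, round-robin too */
	printf("depth 2 -> flags %#x\n", demo_mkdir_flags(3, 3, 2));
	printf("depth 5 -> flags %#x\n", demo_mkdir_flags(3, 3, 5));
	return 0;
}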
*/ +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 14, 58, 0) + /* + * server < 2.12.58 doesn't pack default LMV in intent_getattr reply, + * fetch default LMV here. + */ if (unlikely(err == -EREMOTE)) { struct ll_inode_info *lli = ll_i2info(dir); struct lmv_user_md *lum; @@ -1327,27 +1380,61 @@ static int ll_new_node(struct inode *dir, struct dentry *dchild, ptlrpc_req_finished(request); request = NULL; + ll_finish_md_op_data(op_data); + op_data = NULL; err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, OBD_MD_DEFAULT_MEA); if (err2 == 0) { - /* Update stripe_offset and retry */ - lli->lli_def_stripe_offset = lum->lum_stripe_offset; - } else if (err2 == -ENODATA && - lli->lli_def_stripe_offset != -1) { - /* If there are no default stripe EA on the MDT, but the + struct lustre_md md = { NULL }; + + md.body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + if (!md.body) + GOTO(err_exit, err = -EPROTO); + + OBD_ALLOC_PTR(md.default_lmv); + if (!md.default_lmv) + GOTO(err_exit, err = -ENOMEM); + + md.default_lmv->lsm_md_magic = lum->lum_magic; + md.default_lmv->lsm_md_stripe_count = + lum->lum_stripe_count; + md.default_lmv->lsm_md_master_mdt_index = + lum->lum_stripe_offset; + md.default_lmv->lsm_md_hash_type = lum->lum_hash_type; + md.default_lmv->lsm_md_max_inherit = + lum->lum_max_inherit; + md.default_lmv->lsm_md_max_inherit_rr = + lum->lum_max_inherit_rr; + + err = ll_update_inode(dir, &md); + md_free_lustre_md(sbi->ll_md_exp, &md); + if (err) + GOTO(err_exit, err); + } else if (err2 == -ENODATA && lli->lli_default_lsm_md) { + /* + * If there are no default stripe EA on the MDT, but the * client has default stripe, then it probably means - * default stripe EA has just been deleted. */ - lli->lli_def_stripe_offset = -1; + * default stripe EA has just been deleted. 
+ */ + down_write(&lli->lli_lsm_sem); + if (lli->lli_default_lsm_md) + OBD_FREE_PTR(lli->lli_default_lsm_md); + lli->lli_default_lsm_md = NULL; + up_write(&lli->lli_lsm_sem); } else { GOTO(err_exit, err); } ptlrpc_req_finished(request); request = NULL; - ll_finish_md_op_data(op_data); goto again; } +#endif + + if (err < 0) + GOTO(err_exit, err); ll_update_times(request, dir); diff --git a/drivers/staging/lustrefsx/lustre/llite/statahead.c b/drivers/staging/lustrefsx/lustre/llite/statahead.c index 397712909b3f4..e4886ca12f025 100644 --- a/drivers/staging/lustrefsx/lustre/llite/statahead.c +++ b/drivers/staging/lustrefsx/lustre/llite/statahead.c @@ -1520,8 +1520,11 @@ static int revalidate_statahead_dentry(struct inode *dir, } if ((bits & MDS_INODELOCK_LOOKUP) && - d_lustre_invalid(*dentryp)) + d_lustre_invalid(*dentryp)) { d_lustre_revalidate(*dentryp); + ll_update_dir_depth(dir, (*dentryp)->d_inode); + } + ll_intent_release(&it); } } diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c index b439d87ae9348..e95930edf1251 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_fld.c @@ -73,11 +73,11 @@ int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", *mds, PFID(fid)); - if (*mds >= lmv->desc.ld_tgt_count) { - CERROR("FLD lookup got invalid mds #%x (max: %x) " - "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count, - PFID(fid)); - rc = -EINVAL; - } - RETURN(rc); + if (*mds >= lmv->lmv_mdt_descs.ltd_tgts_size) { + rc = -EINVAL; + CERROR("%s: FLD lookup got invalid mds #%x (max: %x) for fid="DFID": rc = %d\n", + obd->obd_name, *mds, lmv->lmv_mdt_descs.ltd_tgts_size, + PFID(fid), rc); + } + RETURN(rc); } diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c index 24c616b4b6cd9..fade4c9e9c31a 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_intent.c @@ -86,7 +86,7 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, LASSERT(fid_is_sane(&body->mbo_fid1)); - tgt = lmv_find_target(lmv, &body->mbo_fid1); + tgt = lmv_fid2tgt(lmv, &body->mbo_fid1); if (IS_ERR(tgt)) GOTO(out, rc = PTR_ERR(tgt)); @@ -106,7 +106,7 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, op_data->op_bias = MDS_CROSS_REF; CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n", - PFID(&body->mbo_fid1), tgt->ltd_idx); + PFID(&body->mbo_fid1), tgt->ltd_index); /* ask for security context upon intent */ if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) && @@ -207,12 +207,12 @@ int lmv_revalidate_slaves(struct obd_export *exp, */ op_data->op_bias = MDS_CROSS_REF; - tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); - if (IS_ERR(tgt)) - GOTO(cleanup, rc = PTR_ERR(tgt)); + tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[i].lmo_mds); + if (!tgt) + GOTO(cleanup, rc = -ENODEV); CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n", - PFID(&fid), tgt->ltd_idx); + PFID(&fid), tgt->ltd_index); if (req != NULL) { ptlrpc_req_finished(req); @@ -299,31 +299,18 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { /* don't allow create under dir with bad hash */ - if (lmv_is_dir_bad_hash(op_data->op_mea1)) + if (lmv_dir_bad_hash(op_data->op_mea1)) RETURN(-EBADF); - if 
(lmv_is_dir_migrating(op_data->op_mea1)) { + if (lmv_dir_migrating(op_data->op_mea1)) { if (flags & O_EXCL) { /* * open(O_CREAT | O_EXCL) needs to check * existing name, which should be done on both - * old and new layout, to avoid creating new - * file under old layout, check old layout on + * old and new layout, check old layout on * client side. */ - tgt = lmv_locate_tgt(lmv, op_data, - &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - rc = md_getattr_name(tgt->ltd_exp, op_data, - reqp); - if (!rc) { - ptlrpc_req_finished(*reqp); - *reqp = NULL; - RETURN(-EEXIST); - } - + rc = lmv_migrate_existence_check(lmv, op_data); if (rc != -ENOENT) RETURN(rc); @@ -346,20 +333,20 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, /* for striped directory, we can't know parent stripe fid * without name, but we can set it to child fid, and MDT * will obtain it from linkea in open in such case. */ - if (op_data->op_mea1 != NULL) + if (lmv_dir_striped(op_data->op_mea1)) op_data->op_fid1 = op_data->op_fid2; - tgt = lmv_find_target(lmv, &op_data->op_fid2); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - op_data->op_mds = tgt->ltd_idx; + op_data->op_mds = tgt->ltd_index; } else { LASSERT(fid_is_sane(&op_data->op_fid1)); LASSERT(fid_is_zero(&op_data->op_fid2)); LASSERT(op_data->op_name != NULL); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } @@ -378,7 +365,7 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID"," " name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), - PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx); + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_index); rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, extra_lock_flags); @@ -455,7 +442,7 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, LASSERT(op_data->op_name); if (op_data->op_namelen != 1 || strncmp(op_data->op_name, "/", 1) != 0) { - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } @@ -468,16 +455,17 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, op_data->op_namelen = 0; /* getattr request is sent to MDT where fid2 inode is */ - tgt = lmv_find_target(lmv, &op_data->op_fid2); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); } else if (op_data->op_name) { /* getattr by name */ - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (!fid_is_sane(&op_data->op_fid2)) fid_zero(&op_data->op_fid2); } else { /* old way to getattr by FID, parent FID not packed */ - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); } + if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -485,7 +473,7 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, ", name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), op_data->op_name ? 
op_data->op_name : "", - tgt->ltd_idx); + tgt->ltd_index); op_data->op_bias &= ~MDS_CROSS_REF; @@ -497,7 +485,7 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, if (*reqp == NULL) { /* If RPC happens, lsm information will be revalidated * during update_inode process (see ll_update_lsm_md) */ - if (op_data->op_mea2 != NULL) { + if (lmv_dir_striped(op_data->op_mea2)) { rc = lmv_revalidate_slaves(exp, op_data->op_mea2, cb_blocking, extra_lock_flags); diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h index 0ad743244e93e..84a6d98f44c46 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_internal.h @@ -49,7 +49,6 @@ int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, void *, int); int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); -int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); @@ -60,62 +59,91 @@ int lmv_revalidate_slaves(struct obd_export *exp, int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **preq); +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate); + +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt); static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) { return container_of0(lmv, struct obd_device, u.lmv); } -static inline struct lmv_tgt_desc * -lmv_get_target(struct lmv_obd *lmv, u32 mdt_idx, int *index) +static inline struct lu_tgt_desc * +lmv_tgt(struct lmv_obd *lmv, __u32 index) { - int i; + return index < lmv->lmv_mdt_descs.ltd_tgt_bitmap->size ? 
+ LTD_TGT(&lmv->lmv_mdt_descs, index) : NULL; +} - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (lmv->tgts[i] == NULL) - continue; +static inline bool +lmv_mdt0_inited(struct lmv_obd *lmv) +{ + return lmv->lmv_mdt_descs.ltd_tgt_bitmap->size > 0 && + cfs_bitmap_check(lmv->lmv_mdt_descs.ltd_tgt_bitmap, 0); +} - if (lmv->tgts[i]->ltd_idx == mdt_idx) { - if (index != NULL) - *index = i; - return lmv->tgts[i]; - } - } +#define lmv_foreach_tgt(lmv, tgt) ltd_foreach_tgt(&(lmv)->lmv_mdt_descs, tgt) - return ERR_PTR(-ENODEV); +#define lmv_foreach_tgt_safe(lmv, tgt, tmp) \ + ltd_foreach_tgt_safe(&(lmv)->lmv_mdt_descs, tgt, tmp) + +static inline +struct lu_tgt_desc *lmv_first_connected_tgt(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *tgt; + + tgt = ltd_first_tgt(&lmv->lmv_mdt_descs); + while (tgt && !tgt->ltd_exp) + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + + return tgt; +} + +static inline +struct lu_tgt_desc *lmv_next_connected_tgt(struct lmv_obd *lmv, + struct lu_tgt_desc *tgt) +{ + do { + tgt = ltd_next_tgt(&lmv->lmv_mdt_descs, tgt); + } while (tgt && !tgt->ltd_exp); + + return tgt; } +#define lmv_foreach_connected_tgt(lmv, tgt) \ + for (tgt = lmv_first_connected_tgt(lmv); tgt; \ + tgt = lmv_next_connected_tgt(lmv, tgt)) + static inline int -lmv_find_target_index(struct lmv_obd *lmv, const struct lu_fid *fid) +lmv_fid2tgt_index(struct lmv_obd *lmv, const struct lu_fid *fid) { - struct lmv_tgt_desc *ltd; - u32 mdt_idx = 0; - int index = 0; - - if (lmv->desc.ld_tgt_count > 1) { - int rc; - rc = lmv_fld_lookup(lmv, fid, &mdt_idx); - if (rc < 0) - return rc; - } + u32 mdt_idx; + int rc; - ltd = lmv_get_target(lmv, mdt_idx, &index); - if (IS_ERR(ltd)) - return PTR_ERR(ltd); + if (lmv->lmv_mdt_count < 2) + return 0; - return index; + rc = lmv_fld_lookup(lmv, fid, &mdt_idx); + if (rc < 0) + return rc; + + return mdt_idx; } static inline struct lmv_tgt_desc * -lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +lmv_fid2tgt(struct lmv_obd *lmv, const struct lu_fid *fid) { + struct lu_tgt_desc *tgt; int index; - index = lmv_find_target_index(lmv, fid); + index = lmv_fid2tgt_index(lmv, fid); if (index < 0) return ERR_PTR(index); - return lmv->tgts[index]; + tgt = lmv_tgt(lmv, index); + + return tgt ? tgt : ERR_PTR(-ENODEV); } static inline int lmv_stripe_md_size(int stripe_count) @@ -134,6 +162,8 @@ lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, __u32 stripe_count = lsm->lsm_md_stripe_count; int stripe_index; + LASSERT(lmv_dir_striped(lsm)); + if (hash_type & LMV_HASH_FLAG_MIGRATION) { if (post_migrate) { hash_type &= ~LMV_HASH_FLAG_MIGRATION; @@ -164,26 +194,6 @@ lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, return &lsm->lsm_md_oinfo[stripe_index]; } -static inline bool lmv_is_dir_migrating(const struct lmv_stripe_md *lsm) -{ - return lsm ? 
lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION : false; -} - -static inline bool lmv_is_dir_bad_hash(const struct lmv_stripe_md *lsm) -{ - if (!lsm) - return false; - - if (lmv_is_dir_migrating(lsm)) { - if (lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset > 1) - return !lmv_is_known_hash_type( - lsm->lsm_md_migrate_hash); - return false; - } - - return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); -} - static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) { const struct lmv_stripe_md *lsm = op_data->op_mea1; @@ -191,12 +201,12 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) if (!lsm) return false; - if (lmv_is_dir_migrating(lsm) && !op_data->op_post_migrate) { + if (lmv_dir_migrating(lsm) && !op_data->op_post_migrate) { op_data->op_post_migrate = true; return true; } - if (lmv_is_dir_bad_hash(lsm) && + if (lmv_dir_bad_hash(lsm) && op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { op_data->op_stripe_index++; return true; @@ -206,9 +216,10 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) } struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, - struct md_op_data *op_data, - struct lu_fid *fid); + struct md_op_data *op_data); +int lmv_migrate_existence_check(struct lmv_obd *lmv, + struct md_op_data *op_data); + /* lproc_lmv.c */ int lmv_tunables_init(struct obd_device *obd); - #endif diff --git a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c index 078f6e2a59aad..dce03d45f43e1 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lmv_obd.c @@ -60,15 +60,15 @@ static int lmv_check_connect(struct obd_device *obd); -static void lmv_activate_target(struct lmv_obd *lmv, - struct lmv_tgt_desc *tgt, - int activate) +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate) { - if (tgt->ltd_active == activate) - return; + if (tgt->ltd_active == activate) + return; - tgt->ltd_active = activate; - lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); + tgt->ltd_active = activate; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count += + (activate ? 
1 : -1); tgt->ltd_exp->exp_obd->obd_inactive = !activate; } @@ -84,50 +84,47 @@ static int lmv_set_mdc_active(struct lmv_obd *lmv, const struct obd_uuid *uuid, int activate) { - struct lmv_tgt_desc *tgt = NULL; - struct obd_device *obd; - __u32 i; - int rc = 0; + struct lu_tgt_desc *tgt = NULL; + struct obd_device *obd; + int rc = 0; + ENTRY; CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", lmv, uuid->uuid, activate); spin_lock(&lmv->lmv_lock); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; - - CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i, - tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); + lmv_foreach_connected_tgt(lmv, tgt) { + CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", + tgt->ltd_index, tgt->ltd_uuid.uuid, + tgt->ltd_exp->exp_handle.h_cookie); if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) break; } - if (i == lmv->desc.ld_tgt_count) - GOTO(out_lmv_lock, rc = -EINVAL); + if (!tgt) + GOTO(out_lmv_lock, rc = -EINVAL); - obd = class_exp2obd(tgt->ltd_exp); - if (obd == NULL) - GOTO(out_lmv_lock, rc = -ENOTCONN); + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) + GOTO(out_lmv_lock, rc = -ENOTCONN); - CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", - obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, - obd->obd_type->typ_name, i); - LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, tgt->ltd_index); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); - if (tgt->ltd_active == activate) { - CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, - activate ? "" : "in"); - GOTO(out_lmv_lock, rc); - } + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + GOTO(out_lmv_lock, rc); + } - CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, - activate ? "" : "in"); - lmv_activate_target(lmv, tgt, activate); - EXIT; + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? "" : "in"); + lmv_activate_target(lmv, tgt, activate); + EXIT; out_lmv_lock: spin_unlock(&lmv->lmv_lock); @@ -136,8 +133,8 @@ static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *lmv_get_uuid(struct obd_export *exp) { - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); return (tgt == NULL) ? 
NULL : obd_get_uuid(tgt->ltd_exp); } @@ -240,21 +237,22 @@ static int lmv_connect(const struct lu_env *env, static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, __u32 def_easize) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - __u32 i; - int rc = 0; - int change = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int change = 0; + int rc = 0; + ENTRY; - if (lmv->max_easize < easize) { - lmv->max_easize = easize; - change = 1; - } - if (lmv->max_def_easize < def_easize) { - lmv->max_def_easize = def_easize; - change = 1; - } + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } if (change == 0) RETURN(0); @@ -262,20 +260,14 @@ static int lmv_init_ea_size(struct obd_export *exp, __u32 easize, if (lmv->connected == 0) RETURN(0); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - if (tgt == NULL || tgt->ltd_exp == NULL) { - CWARN("%s: NULL export for %d\n", obd->obd_name, i); - continue; - } + lmv_foreach_connected_tgt(lmv, tgt) { if (!tgt->ltd_active) continue; rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); if (rc) { CERROR("%s: obd_init_ea_size() failed on MDT target %d:" - " rc = %d\n", obd->obd_name, i, rc); + " rc = %d\n", obd->obd_name, tgt->ltd_index, rc); break; } } @@ -293,21 +285,21 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) int rc; ENTRY; - mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, - &obd->obd_uuid); - if (!mdc_obd) { - CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); - RETURN(-EINVAL); - } + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n", mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, tgt->ltd_uuid.uuid, obd->obd_uuid.uuid); - if (!mdc_obd->obd_set_up) { - CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); - RETURN(-EINVAL); - } + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid, &lmv->conn_data, NULL); @@ -323,19 +315,19 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) if (rc) RETURN(rc); - target.ft_srv = NULL; - target.ft_exp = mdc_exp; - target.ft_idx = tgt->ltd_idx; + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_index; - fld_client_add_target(&lmv->lmv_fld, &target); + fld_client_add_target(&lmv->lmv_fld, &target); - rc = obd_register_observer(mdc_obd, obd); - if (rc) { - obd_disconnect(mdc_exp); - CERROR("target %s register_observer error %d\n", - tgt->ltd_uuid.uuid, rc); - RETURN(rc); - } + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } if (obd->obd_observer) { /* @@ -351,14 +343,22 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) tgt->ltd_active = 1; tgt->ltd_exp = mdc_exp; - lmv->desc.ld_active_tgt_count++; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count++; md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); + rc = lu_qos_add_tgt(&lmv->lmv_qos, tgt); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + 
CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, atomic_read(&obd->obd_refcount)); + lmv_statfs_check_update(obd, tgt); + if (lmv->lmv_tgts_kobj) /* Even if we failed to create the link, that's fine */ rc = sysfs_create_link(lmv->lmv_tgts_kobj, @@ -367,25 +367,23 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) RETURN(0); } -static void lmv_del_target(struct lmv_obd *lmv, int index) +static void lmv_del_target(struct lmv_obd *lmv, struct lu_tgt_desc *tgt) { - if (lmv->tgts[index] == NULL) - return; - - OBD_FREE_PTR(lmv->tgts[index]); - lmv->tgts[index] = NULL; - return; + LASSERT(tgt); + ltd_del_tgt(&lmv->lmv_mdt_descs, tgt); + OBD_FREE_PTR(tgt); } static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, __u32 index, int gen) { struct obd_device *mdc_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int orig_tgt_count = 0; - int rc = 0; - ENTRY; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct lu_tgt_descs *ltd = &lmv->lmv_mdt_descs; + int rc = 0; + + ENTRY; CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, @@ -396,159 +394,98 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, RETURN(-EINVAL); } - mutex_lock(&lmv->lmv_init_mutex); - if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { - tgt = lmv->tgts[index]; - CERROR("%s: UUID %s already assigned at LMV target index %d:" - " rc = %d\n", obd->obd_name, - obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); - mutex_unlock(&lmv->lmv_init_mutex); - RETURN(-EEXIST); - } - - if (index >= lmv->tgts_size) { - /* We need to reallocate the lmv target array. */ - struct lmv_tgt_desc **newtgts, **old = NULL; - __u32 newsize = 1; - __u32 oldsize = 0; - - while (newsize < index + 1) - newsize = newsize << 1; - OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); - if (newtgts == NULL) { - mutex_unlock(&lmv->lmv_init_mutex); - RETURN(-ENOMEM); - } - - if (lmv->tgts_size) { - memcpy(newtgts, lmv->tgts, - sizeof(*newtgts) * lmv->tgts_size); - old = lmv->tgts; - oldsize = lmv->tgts_size; - } - - lmv->tgts = newtgts; - lmv->tgts_size = newsize; - smp_rmb(); - if (old) - OBD_FREE(old, sizeof(*old) * oldsize); - - CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts, - lmv->tgts_size); - } - OBD_ALLOC_PTR(tgt); - if (!tgt) { - mutex_unlock(&lmv->lmv_init_mutex); + if (!tgt) RETURN(-ENOMEM); - } mutex_init(&tgt->ltd_fid_mutex); - tgt->ltd_idx = index; + tgt->ltd_index = index; tgt->ltd_uuid = *uuidp; tgt->ltd_active = 0; - lmv->tgts[index] = tgt; - if (index >= lmv->desc.ld_tgt_count) { - orig_tgt_count = lmv->desc.ld_tgt_count; - lmv->desc.ld_tgt_count = index + 1; - } - if (lmv->connected == 0) { + mutex_lock(&ltd->ltd_mutex); + rc = ltd_add_tgt(ltd, tgt); + mutex_unlock(&ltd->ltd_mutex); + + if (rc) + GOTO(out_tgt, rc); + + if (!lmv->connected) /* lmv_check_connect() will connect this target. 
*/ - mutex_unlock(&lmv->lmv_init_mutex); RETURN(0); - } - /* Otherwise let's connect it ourselves */ - mutex_unlock(&lmv->lmv_init_mutex); rc = lmv_connect_mdc(obd, tgt); - if (rc != 0) { - spin_lock(&lmv->lmv_lock); - if (lmv->desc.ld_tgt_count == index + 1) - lmv->desc.ld_tgt_count = orig_tgt_count; - memset(tgt, 0, sizeof(*tgt)); - spin_unlock(&lmv->lmv_lock); - } else { + if (!rc) { int easize = sizeof(struct lmv_stripe_md) + - lmv->desc.ld_tgt_count * sizeof(struct lu_fid); + lmv->lmv_mdt_count * sizeof(struct lu_fid); + lmv_init_ea_size(obd->obd_self_export, easize, 0); } RETURN(rc); + +out_tgt: + OBD_FREE_PTR(tgt); + return rc; } static int lmv_check_connect(struct obd_device *obd) { - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - __u32 i; - int rc; - int easize; - ENTRY; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int easize; + int rc; - if (lmv->connected) - RETURN(0); + ENTRY; - mutex_lock(&lmv->lmv_init_mutex); - if (lmv->connected) { - mutex_unlock(&lmv->lmv_init_mutex); - RETURN(0); - } + if (lmv->connected) + RETURN(0); - if (lmv->desc.ld_tgt_count == 0) { - mutex_unlock(&lmv->lmv_init_mutex); - CERROR("%s: no targets configured.\n", obd->obd_name); - RETURN(-EINVAL); - } + mutex_lock(&lmv->lmv_mdt_descs.ltd_mutex); + if (lmv->connected) + GOTO(unlock, rc = 0); - LASSERT(lmv->tgts != NULL); + if (!lmv->lmv_mdt_count) { + CERROR("%s: no targets configured: rc = -EINVAL\n", + obd->obd_name); + GOTO(unlock, rc = -EINVAL); + } - if (lmv->tgts[0] == NULL) { - mutex_unlock(&lmv->lmv_init_mutex); - CERROR("%s: no target configured for index 0.\n", + if (!lmv_mdt0_inited(lmv)) { + CERROR("%s: no target configured for index 0: rc = -EINVAL.\n", obd->obd_name); - RETURN(-EINVAL); + GOTO(unlock, rc = -EINVAL); } CDEBUG(D_CONFIG, "Time to connect %s to %s\n", obd->obd_uuid.uuid, obd->obd_name); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv->tgts[i]; - if (tgt == NULL) - continue; + lmv_foreach_tgt(lmv, tgt) { rc = lmv_connect_mdc(obd, tgt); if (rc) GOTO(out_disc, rc); } lmv->connected = 1; - easize = lmv_mds_md_size(lmv->desc.ld_tgt_count, LMV_MAGIC); + easize = lmv_mds_md_size(lmv->lmv_mdt_count, LMV_MAGIC); lmv_init_ea_size(obd->obd_self_export, easize, 0); - mutex_unlock(&lmv->lmv_init_mutex); - RETURN(0); + EXIT; +unlock: + mutex_unlock(&lmv->lmv_mdt_descs.ltd_mutex); + + return rc; - out_disc: - while (i-- > 0) { - int rc2; - tgt = lmv->tgts[i]; - if (tgt == NULL) +out_disc: + lmv_foreach_tgt(lmv, tgt) { + tgt->ltd_active = 0; + if (!tgt->ltd_exp) continue; - tgt->ltd_active = 0; - if (tgt->ltd_exp) { - --lmv->desc.ld_active_tgt_count; - rc2 = obd_disconnect(tgt->ltd_exp); - if (rc2) { - CERROR("LMV target %s disconnect on " - "MDC idx %d: error %d\n", - tgt->ltd_uuid.uuid, i, rc2); - } - } - } - mutex_unlock(&lmv->lmv_init_mutex); + --lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count; + obd_disconnect(tgt->ltd_exp); + } - RETURN(rc); + goto unlock; } static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) @@ -597,33 +534,22 @@ static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) static int lmv_disconnect(struct obd_export *exp) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - int rc; - __u32 i; - ENTRY; - - if (!lmv->tgts) - goto out_local; + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (lmv->tgts[i] == NULL 
|| lmv->tgts[i]->ltd_exp == NULL) - continue; + ENTRY; - lmv_disconnect_mdc(obd, lmv->tgts[i]); - } + lmv_foreach_connected_tgt(lmv, tgt) + lmv_disconnect_mdc(obd, tgt); if (lmv->lmv_tgts_kobj) kobject_put(lmv->lmv_tgts_kobj); -out_local: - /* - * This is the case when no real connection is established by - * lmv_check_connect(). - */ - if (!lmv->connected) - class_export_put(exp); - rc = class_disconnect(exp); + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); lmv->connected = 0; RETURN(rc); @@ -632,17 +558,17 @@ static int lmv_disconnect(struct obd_export *exp) static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void __user *uarg) { - struct obd_device *obddev = class_exp2obd(exp); - struct lmv_obd *lmv = &obddev->u.lmv; + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; struct getinfo_fid2path *gf; - struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *tgt; struct getinfo_fid2path *remote_gf = NULL; - struct lu_fid root_fid; - int remote_gf_size = 0; - int rc; + struct lu_fid root_fid; + int remote_gf_size = 0; + int rc; gf = karg; - tgt = lmv_find_target(lmv, &gf->gf_fid); + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -700,7 +626,7 @@ static int lmv_fid2path(struct obd_export *exp, int len, void *karg, GOTO(out_fid2path, rc = -EINVAL); } - tgt = lmv_find_target(lmv, &gf->gf_fid); + tgt = lmv_fid2tgt(lmv, &gf->gf_fid); if (IS_ERR(tgt)) GOTO(out_fid2path, rc = -EINVAL); @@ -722,13 +648,13 @@ static int lmv_hsm_req_count(struct lmv_obd *lmv, const struct hsm_user_request *hur, const struct lmv_tgt_desc *tgt_mds) { - __u32 i; - int nr = 0; - struct lmv_tgt_desc *curr_tgt; + struct lmv_tgt_desc *curr_tgt; + __u32 i; + int nr = 0; /* count how many requests must be sent to the given target */ for (i = 0; i < hur->hur_request.hr_itemcount; i++) { - curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid); + curr_tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[i].hui_fid); if (IS_ERR(curr_tgt)) RETURN(PTR_ERR(curr_tgt)); if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) @@ -742,15 +668,14 @@ static int lmv_hsm_req_build(struct lmv_obd *lmv, const struct lmv_tgt_desc *tgt_mds, struct hsm_user_request *hur_out) { - __u32 i, nr_out; - struct lmv_tgt_desc *curr_tgt; + __u32 i, nr_out; + struct lmv_tgt_desc *curr_tgt; /* build the hsm_user_request for the given target */ hur_out->hur_request = hur_in->hur_request; nr_out = 0; for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) { - curr_tgt = lmv_find_target(lmv, - &hur_in->hur_user_item[i].hui_fid); + curr_tgt = lmv_fid2tgt(lmv, &hur_in->hur_user_item[i].hui_fid); if (IS_ERR(curr_tgt)) RETURN(PTR_ERR(curr_tgt)); if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) { @@ -771,20 +696,16 @@ static int lmv_hsm_ct_unregister(struct obd_device *obd, unsigned int cmd, void __user *uarg) { struct lmv_obd *lmv = &obd->u.lmv; - __u32 i; - int rc; + struct lu_tgt_desc *tgt; + int rc; + ENTRY; /* unregister request (call from llapi_hsm_copytool_fini) */ - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; + lmv_foreach_connected_tgt(lmv, tgt) /* best effort: try to clean as much as possible * (continue on error) */ obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); - } /* Whatever the result, remove copytool from kuc groups. 
* Unreached coordinators will get EPIPE on next requests @@ -801,12 +722,14 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, { struct lmv_obd *lmv = &obd->u.lmv; struct file *filp; - __u32 i, j; - int err; bool any_set = false; struct kkuc_ct_data *kcd; size_t kcd_size; + struct lu_tgt_desc *tgt; + __u32 i; + int err; int rc = 0; + ENTRY; filp = fget(lk->lk_wfd); @@ -842,12 +765,7 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, /* All or nothing: try to register to all MDS. * In case of failure, unregister from previous MDS, * except if it because of inactive target. */ - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; - + lmv_foreach_connected_tgt(lmv, tgt) { err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); if (err) { if (tgt->ltd_active) { @@ -855,14 +773,16 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, CERROR("%s: iocontrol MDC %s on MDT" " idx %d cmd %x: err = %d\n", lmv2obd_dev(lmv)->obd_name, - tgt->ltd_uuid.uuid, i, cmd, err); + tgt->ltd_uuid.uuid, tgt->ltd_index, cmd, + err); rc = err; lk->lk_flags |= LK_FLG_STOP; + i = tgt->ltd_index; /* unregister from previous MDS */ - for (j = 0; j < i; j++) { - tgt = lmv->tgts[j]; - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; + lmv_foreach_connected_tgt(lmv, tgt) { + if (tgt->ltd_index >= i) + break; + obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg); } @@ -890,37 +810,35 @@ static int lmv_hsm_ct_register(struct obd_device *obd, unsigned int cmd, return rc; } - - - static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void __user *uarg) { - struct obd_device *obddev = class_exp2obd(exp); - struct lmv_obd *lmv = &obddev->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - __u32 i = 0; - int rc = 0; - int set = 0; - __u32 count = lmv->desc.ld_tgt_count; + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct lu_tgt_desc *tgt = NULL; + int set = 0; + __u32 count = lmv->lmv_mdt_count; + int rc = 0; + ENTRY; - if (count == 0) - RETURN(-ENOTTY); + if (count == 0) + RETURN(-ENOTTY); + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; - switch (cmd) { - case IOC_OBD_STATFS: { - struct obd_ioctl_data *data = karg; - struct obd_device *mdc_obd; - struct obd_statfs stat_buf = {0}; - __u32 index; + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); - memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); - if ((index >= count)) - RETURN(-ENODEV); + if (index >= lmv->lmv_mdt_descs.ltd_tgts_size) + RETURN(-ENODEV); - tgt = lmv->tgts[index]; - if (tgt == NULL || !tgt->ltd_active) + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_active) RETURN(-ENODATA); mdc_obd = class_exp2obd(tgt->ltd_exp); @@ -943,59 +861,50 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, (int) sizeof(stat_buf)))) RETURN(-EFAULT); break; - } - case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl = karg; - struct obd_quotactl *oqctl; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; if (qctl->qc_valid == QC_MDTIDX) { - if (count <= qctl->qc_idx) - RETURN(-EINVAL); - - tgt = lmv->tgts[qctl->qc_idx]; - if (tgt == NULL || tgt->ltd_exp == NULL) - RETURN(-EINVAL); + tgt = lmv_tgt(lmv, qctl->qc_idx); } else if (qctl->qc_valid == QC_UUID) { - for (i = 0; i < 
count; i++) { - tgt = lmv->tgts[i]; - if (tgt == NULL) - continue; + lmv_foreach_tgt(lmv, tgt) { if (!obd_uuid_equals(&tgt->ltd_uuid, &qctl->obd_uuid)) continue; - if (tgt->ltd_exp == NULL) - RETURN(-EINVAL); + if (!tgt->ltd_exp) + RETURN(-EINVAL); - break; - } - } else { - RETURN(-EINVAL); - } + break; + } + } else { + RETURN(-EINVAL); + } - if (i >= count) - RETURN(-EAGAIN); + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); - LASSERT(tgt != NULL && tgt->ltd_exp != NULL); - OBD_ALLOC_PTR(oqctl); - if (!oqctl) - RETURN(-ENOMEM); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); - QCTL_COPY(oqctl, qctl); - rc = obd_quotactl(tgt->ltd_exp, oqctl); - if (rc == 0) { - QCTL_COPY(qctl, oqctl); - qctl->qc_valid = QC_MDTIDX; - qctl->obd_uuid = tgt->ltd_uuid; - } - OBD_FREE_PTR(oqctl); - break; - } + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } case LL_IOC_GET_CONNECT_FLAGS: { - tgt = lmv->tgts[0]; - if (tgt == NULL || tgt->ltd_exp == NULL) - RETURN(-ENODATA); - rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); + tgt = lmv_tgt(lmv, 0); + rc = -ENODATA; + if (tgt && tgt->ltd_exp) + rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); break; } case LL_IOC_FID2MDTIDX: { @@ -1018,9 +927,9 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, case LL_IOC_HSM_STATE_GET: case LL_IOC_HSM_STATE_SET: case LL_IOC_HSM_ACTION: { - struct md_op_data *op_data = karg; + struct md_op_data *op_data = karg; - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1033,7 +942,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, case LL_IOC_HSM_PROGRESS: { const struct hsm_progress_kernel *hpk = karg; - tgt = lmv_find_target(lmv, &hpk->hpk_fid); + tgt = lmv_fid2tgt(lmv, &hpk->hpk_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); @@ -1050,22 +959,17 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, * or if there is a single MDS, no need to split * the request. 
*/ if (reqcount == 1 || count == 1) { - tgt = lmv_find_target(lmv, - &hur->hur_user_item[0].hui_fid); + tgt = lmv_fid2tgt(lmv, &hur->hur_user_item[0].hui_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); } else { /* split fid list to their respective MDS */ - for (i = 0; i < count; i++) { + lmv_foreach_connected_tgt(lmv, tgt) { int nr, rc1; size_t reqlen; struct hsm_user_request *req; - tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; - nr = lmv_hsm_req_count(lmv, hur, tgt); if (nr < 0) RETURN(nr); @@ -1093,14 +997,14 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, break; } case LL_IOC_LOV_SWAP_LAYOUTS: { - struct md_op_data *op_data = karg; - struct lmv_tgt_desc *tgt1, *tgt2; + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; - tgt1 = lmv_find_target(lmv, &op_data->op_fid1); + tgt1 = lmv_fid2tgt(lmv, &op_data->op_fid1); if (IS_ERR(tgt1)) RETURN(PTR_ERR(tgt1)); - tgt2 = lmv_find_target(lmv, &op_data->op_fid2); + tgt2 = lmv_fid2tgt(lmv, &op_data->op_fid2); if (IS_ERR(tgt2)) RETURN(PTR_ERR(tgt2)); @@ -1108,7 +1012,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(-EINVAL); /* only files on same MDT can have their layouts swapped */ - if (tgt1->ltd_idx != tgt2->ltd_idx) + if (tgt1->ltd_index != tgt2->ltd_index) RETURN(-EPERM); rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); @@ -1123,13 +1027,10 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, break; } default: - for (i = 0; i < count; i++) { + lmv_foreach_connected_tgt(lmv, tgt) { struct obd_device *mdc_obd; int err; - tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; /* ll_umount_begin() sets force flag but for lmv, not * mdc. Let's pass it through */ mdc_obd = class_exp2obd(tgt->ltd_exp); @@ -1139,153 +1040,98 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, if (tgt->ltd_active) { CERROR("error: iocontrol MDC %s on MDT" " idx %d cmd %x: err = %d\n", - tgt->ltd_uuid.uuid, i, cmd, err); + tgt->ltd_uuid.uuid, + tgt->ltd_index, cmd, err); if (!rc) rc = err; } } else set = 1; - } - if (!set && !rc) - rc = -EIO; - } - RETURN(rc); + } + if (!set && !rc) + rc = -EIO; + } + RETURN(rc); } -/** - * This is _inode_ placement policy function (not name). - */ -static int lmv_placement_policy(struct obd_device *obd, - struct md_op_data *op_data, u32 *mds) +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) { - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_user_md *lum; + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; ENTRY; - LASSERT(mds != NULL); - - if (lmv->desc.ld_tgt_count == 1) { - *mds = 0; - RETURN(0); - } + LASSERT(op_data); + LASSERT(fid); - lum = op_data->op_data; - /* Choose MDS by - * 1. See if the stripe offset is specified by lum. - * 2. Then check if there is default stripe offset. - * 3. Finally choose MDS by name hash if the parent - * is striped directory. (see lmv_locate_tgt()). 
*/ - if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && - le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { - *mds = le32_to_cpu(lum->lum_stripe_offset); - } else if (op_data->op_default_stripe_offset != (__u32)-1) { - *mds = op_data->op_default_stripe_offset; - op_data->op_mds = *mds; - /* Correct the stripe offset in lum */ - if (lum != NULL) - lum->lum_stripe_offset = cpu_to_le32(*mds); - } else { - *mds = op_data->op_mds; - } + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); - RETURN(0); -} - -int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) -{ - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; - - tgt = lmv_get_target(lmv, mds, NULL); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + if (!tgt->ltd_active || !tgt->ltd_exp) + RETURN(-ENODEV); /* * New seq alloc and FLD setup should be atomic. Otherwise we may find * on server that seq in new allocated fid is not yet known. */ mutex_lock(&tgt->ltd_fid_mutex); - - if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) - GOTO(out, rc = -ENODEV); - - /* - * Asking underlying tgt layer to allocate new fid. - */ rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); + mutex_unlock(&tgt->ltd_fid_mutex); if (rc > 0) { LASSERT(fid_is_sane(fid)); rc = 0; } - EXIT; -out: - mutex_unlock(&tgt->ltd_fid_mutex); - return rc; -} - -int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, - struct lu_fid *fid, struct md_op_data *op_data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - u32 mds = 0; - int rc; - ENTRY; - - LASSERT(op_data != NULL); - LASSERT(fid != NULL); - - rc = lmv_placement_policy(obd, op_data, &mds); - if (rc) { - CERROR("Can't get target for allocating fid, " - "rc %d\n", rc); - RETURN(rc); - } - - rc = __lmv_fid_alloc(lmv, fid, mds); - if (rc) { - CERROR("Can't alloc new fid, rc %d\n", rc); - RETURN(rc); - } - - RETURN(rc); + RETURN(rc); } static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { - struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_obd *lmv = &obd->u.lmv; struct lmv_desc *desc; - int rc; - ENTRY; + struct lnet_process_id lnet_id; + int i = 0; + int rc; - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { - CERROR("LMV setup requires a descriptor\n"); - RETURN(-EINVAL); - } + ENTRY; - desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); - if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { - CERROR("Lmv descriptor size wrong: %d > %d\n", - (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); - RETURN(-EINVAL); - } + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + RETURN(-EINVAL); + } - lmv->tgts_size = 32U; - OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); - if (lmv->tgts == NULL) - RETURN(-ENOMEM); + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } - obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); - lmv->desc.ld_tgt_count = 0; - lmv->desc.ld_active_tgt_count = 0; + obd_str2uuid(&lmv->lmv_mdt_descs.ltd_lmv_desc.ld_uuid, + desc->ld_uuid.uuid); + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = + LMV_DESC_QOS_MAXAGE_DEFAULT; lmv->max_def_easize = 0; lmv->max_easize = 0; spin_lock_init(&lmv->lmv_lock); - mutex_init(&lmv->lmv_init_mutex); + + /* + * initialize rr_index to lower 32bit of netid, so that client + * 
can distribute subdirs evenly from the beginning. + */ + while (LNetGetId(i++, &lnet_id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) { + lmv->lmv_qos_rr_index = (u32)lnet_id.nid; + break; + } + } rc = lmv_tunables_init(obd); if (rc) @@ -1294,33 +1140,30 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, LUSTRE_CLI_FLD_HASH_DHT); - if (rc) { + if (rc) CERROR("Can't init FLD, err %d\n", rc); - GOTO(out, rc); - } - RETURN(0); + rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs, true); + if (rc) + CWARN("%s: error initialize target table: rc = %d\n", + obd->obd_name, rc); -out: - return rc; + RETURN(rc); } static int lmv_cleanup(struct obd_device *obd) { - struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + struct lu_tgt_desc *tmp; + ENTRY; fld_client_fini(&lmv->lmv_fld); - if (lmv->tgts != NULL) { - int i; - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (lmv->tgts[i] == NULL) - continue; - lmv_del_target(lmv, i); - } - OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); - lmv->tgts_size = 0; - } + lmv_foreach_tgt_safe(lmv, tgt, tmp) + lmv_del_target(lmv, tgt); + lu_tgt_descs_fini(&lmv->lmv_mdt_descs); + RETURN(0); } @@ -1363,7 +1206,7 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) if (flags & OBD_STATFS_FOR_MDT0) return 0; - if (lmv->lmv_statfs_start || lmv->desc.ld_tgt_count == 1) + if (lmv->lmv_statfs_start || lmv->lmv_mdt_count == 1) return lmv->lmv_statfs_start; /* choose initial MDT for this client */ @@ -1376,8 +1219,8 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) /* We dont need a full 64-bit modulus, just enough * to distribute the requests across MDTs evenly. 
*/ - lmv->lmv_statfs_start = - (u32)lnet_id.nid % lmv->desc.ld_tgt_count; + lmv->lmv_statfs_start = (u32)lnet_id.nid % + lmv->lmv_mdt_count; break; } } @@ -1388,31 +1231,33 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, struct obd_statfs *osfs, time64_t max_age, __u32 flags) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_statfs *temp; - int rc = 0; - __u32 i, idx; + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + struct lu_tgt_desc *tgt; + __u32 i; + __u32 idx; + int rc = 0; + ENTRY; - OBD_ALLOC(temp, sizeof(*temp)); - if (temp == NULL) - RETURN(-ENOMEM); + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + RETURN(-ENOMEM); /* distribute statfs among MDTs */ idx = lmv_select_statfs_mdt(lmv, flags); - for (i = 0; i < lmv->desc.ld_tgt_count; i++, idx++) { - idx = idx % lmv->desc.ld_tgt_count; - if (lmv->tgts[idx] == NULL || lmv->tgts[idx]->ltd_exp == NULL) + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) { + idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, idx); + if (!tgt || !tgt->ltd_exp) continue; - rc = obd_statfs(env, lmv->tgts[idx]->ltd_exp, temp, - max_age, flags); + rc = obd_statfs(env, tgt->ltd_exp, temp, max_age, flags); if (rc) { CERROR("%s: can't stat MDS #%d: rc = %d\n", - lmv->tgts[idx]->ltd_exp->exp_obd->obd_name, i, - rc); + tgt->ltd_exp->exp_obd->obd_name, i, rc); GOTO(out_free_temp, rc); } @@ -1438,23 +1283,70 @@ static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, osfs->os_files += temp->os_files; osfs->os_granted += temp->os_granted; } - } + } - EXIT; + EXIT; out_free_temp: - OBD_FREE(temp, sizeof(*temp)); - return rc; + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct obd_device *obd = oinfo->oi_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = oinfo->oi_tgt; + struct obd_statfs *osfs = oinfo->oi_osfs; + + /* + * NB: don't deactivate TGT upon error, because we may not trigger async + * statfs any longer, then there is no chance to activate TGT. 
+ */ + if (!rc) { + spin_lock(&lmv->lmv_lock); + tgt->ltd_statfs = *osfs; + tgt->ltd_statfs_age = ktime_get_seconds(); + spin_unlock(&lmv->lmv_lock); + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + } + + return rc; +} + +/* update tgt statfs async if it's ld_qos_maxage old */ +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct obd_info oinfo = { + .oi_obd = obd, + .oi_tgt = tgt, + .oi_cb_up = lmv_statfs_update, + }; + int rc; + + if (ktime_get_seconds() - tgt->ltd_statfs_age < + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage) + return 0; + + rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL); + + return rc; } static int lmv_get_root(struct obd_export *exp, const char *fileset, struct lu_fid *fid) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - int rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + + ENTRY; - rc = md_get_root(lmv->tgts[0]->ltd_exp, fileset, fid); + if (!tgt) + RETURN(-ENODEV); + + rc = md_get_root(tgt->ltd_exp, fileset, fid); RETURN(rc); } @@ -1462,15 +1354,16 @@ static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, u64 obd_md_valid, const char *name, size_t buf_size, struct ptlrpc_request **req) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req); @@ -1483,15 +1376,16 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, unsigned int xattr_flags, u32 suppgid, struct ptlrpc_request **req) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); rc = md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, value, value_size, xattr_flags, suppgid, req); @@ -1500,84 +1394,186 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, } static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) + struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + ENTRY; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); if (op_data->op_flags & MF_GET_MDT_IDX) { - op_data->op_mds = tgt->ltd_idx; + op_data->op_mds = tgt->ltd_index; RETURN(0); } - rc = md_getattr(tgt->ltd_exp, op_data, request); + rc = md_getattr(tgt->ltd_exp, op_data, request); - RETURN(rc); + RETURN(rc); } static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd 
*lmv = &obd->u.lmv; - __u32 i; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_tgt_desc *tgt; + + ENTRY; - CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); /* * With DNE every object can have two locks in different namespaces: * lookup lock in space of MDT storing direntry and update/open lock in * space of MDT storing inode. */ - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) - continue; - md_null_inode(lmv->tgts[i]->ltd_exp, fid); - } + lmv_foreach_connected_tgt(lmv, tgt) + md_null_inode(tgt->ltd_exp, fid); RETURN(0); } static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, - struct md_open_data *mod, struct ptlrpc_request **request) + struct md_open_data *mod, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + + ENTRY; + + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); + rc = md_close(tgt->ltd_exp, op_data, mod, request); + RETURN(rc); +} + +static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt, + unsigned short dir_depth) +{ + struct lu_tgt_desc *tgt, *cur = NULL; + __u64 total_avail = 0; + __u64 total_weight = 0; + __u64 cur_weight = 0; + int total_usable = 0; + __u64 rand; + int rc; + + ENTRY; + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + RETURN(ERR_PTR(-EAGAIN)); + + down_write(&lmv->lmv_qos.lq_rw_sem); + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); + + rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs); + if (rc) + GOTO(unlock, tgt = ERR_PTR(rc)); + + lmv_foreach_tgt(lmv, tgt) { + if (!tgt->ltd_exp || !tgt->ltd_active) { + tgt->ltd_qos.ltq_usable = 0; + continue; + } + + tgt->ltd_qos.ltq_usable = 1; + lu_tgt_qos_weight_calc(tgt); + if (tgt->ltd_index == *mdt) + cur = tgt; + total_avail += tgt->ltd_qos.ltq_avail; + total_weight += tgt->ltd_qos.ltq_weight; + total_usable++; + } + + /* if current MDT has above-average space, within range of the QOS + * threshold, stay on the same MDT to avoid creating needless remote + * MDT directories. It's more likely for low level directories + * "16 / (dir_depth + 10)" is the factor to make it more unlikely for + * top level directories, while more likely for low levels. 
+ */ + rand = total_avail * 16 / (total_usable * (dir_depth + 10)); + if (cur && cur->ltd_qos.ltq_avail >= rand) { + tgt = cur; + GOTO(unlock, rc = 0); + } + + rand = lu_prandom_u64_max(total_weight); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_qos.ltq_usable) + continue; + + cur_weight += tgt->ltd_qos.ltq_weight; + if (cur_weight < rand) + continue; + + *mdt = tgt->ltd_index; + ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight); + GOTO(unlock, rc = 0); + } + + /* no proper target found */ + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); +unlock: + up_write(&lmv->lmv_qos.lq_rw_sem); + + return tgt; +} + +static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) +{ + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc); + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++) { + index = (i + lmv->lmv_qos_rr_index) % + lmv->lmv_mdt_descs.ltd_tgts_size; + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + *mdt = tgt->ltd_index; + lmv->lmv_qos_rr_index = (*mdt + 1) % + lmv->lmv_mdt_descs.ltd_tgts_size; + spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + RETURN(tgt); + } + spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); - CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); - rc = md_close(tgt->ltd_exp, op_data, mod, request); - RETURN(rc); + RETURN(ERR_PTR(-ENODEV)); } -struct lmv_tgt_desc* -__lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, u32 *mds, - bool post_migrate) +static struct lmv_tgt_desc * +lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + __u32 *mds, bool post_migrate) { struct lmv_tgt_desc *tgt; const struct lmv_oinfo *oinfo; - if (lsm == NULL || namelen == 0) { - tgt = lmv_find_target(lmv, fid); + if (!lmv_dir_striped(lsm) || !namelen) { + tgt = lmv_fid2tgt(lmv, fid); if (IS_ERR(tgt)) return tgt; - LASSERT(mds); - *mds = tgt->ltd_idx; + *mds = tgt->ltd_index; return tgt; } @@ -1592,89 +1588,230 @@ __lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, return ERR_CAST(oinfo); } - if (fid != NULL) - *fid = oinfo->lmo_fid; - if (mds != NULL) - *mds = oinfo->lmo_mds; /* check stripe FID is sane */ if (!fid_is_sane(&oinfo->lmo_fid)) return ERR_PTR(-ENODEV); + *fid = oinfo->lmo_fid; + *mds = oinfo->lmo_mds; + + tgt = lmv_tgt(lmv, oinfo->lmo_mds); + + CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid)); + + return tgt ? tgt : ERR_PTR(-ENODEV); +} + +/** + * Locate MDT of op_data->op_fid1 + * + * For striped directory, it will locate the stripe by name hash, if hash_type + * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' + * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' + * indicates whether old or new layout is used to locate. + * + * For plain direcotry, it just locate the MDT of op_data->op_fid1. + * + * \param[in] lmv LMV device + * \param[in] op_data client MD stack parameters, name, namelen + * mds_num etc. + * + * retval pointer to the lmv_tgt_desc if succeed. + * ERR_PTR(errno) if failed. 
+ */ +struct lmv_tgt_desc * +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; + + /* During creating VOLATILE file, it should honor the mdt + * index if the file under striped dir is being restored, see + * ct_restore(). */ + if (op_data->op_bias & MDS_CREATE_VOLATILE && + op_data->op_mds != LMV_OFFSET_DEFAULT) { + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + return ERR_PTR(-ENODEV); + + if (lmv_dir_striped(lsm)) { + int i; + + /* refill the right parent fid */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + oinfo = &lsm->lsm_md_oinfo[i]; + if (oinfo->lmo_mds == op_data->op_mds) { + op_data->op_fid1 = oinfo->lmo_fid; + break; + } + } + + if (i == lsm->lsm_md_stripe_count) + op_data->op_fid1 = lsm->lsm_md_oinfo[0].lmo_fid; + } + } else if (lmv_dir_bad_hash(lsm)) { + LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); + oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_tgt(lmv, oinfo->lmo_mds); + if (!tgt) + return ERR_PTR(-ENODEV); + } else { + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_post_migrate); + } + + return tgt; +} + +/* Locate MDT of op_data->op_fid2 for link/rename */ +static struct lmv_tgt_desc * +lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_tgt_desc *tgt; + int rc; + + LASSERT(op_data->op_name); + if (lmv_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + struct ptlrpc_request *request = NULL; + + /* + * avoid creating new file under old layout of migrating + * directory, check it here. 
+ */ + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea2, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid2, &op_data->op_mds, false); + if (IS_ERR(tgt)) + RETURN(tgt); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(request); + RETURN(ERR_PTR(-EEXIST)); + } + + if (rc != -ENOENT) + RETURN(ERR_PTR(rc)); + } + + return lmv_locate_tgt_by_name(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); +} + +int lmv_migrate_existence_check(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lu_tgt_desc *tgt; + struct ptlrpc_request *request; + int rc; + + LASSERT(lmv_dir_migrating(op_data->op_mea1)); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + if (!rc) { + ptlrpc_req_finished(request); + return -EEXIST; + } + + return rc; +} + +static inline bool lmv_op_user_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + return (op_data->op_cli_flags & CLI_SET_MEA) && lum && + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC && + le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT; +} + +static inline bool lmv_op_default_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; + + return (op_data->op_flags & MF_QOS_MKDIR) || + (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT); +} + +/* mkdir by QoS in three cases: + * 1. ROOT default LMV is space balanced. + * 2. 'lfs mkdir -i -1' + * 3. parent default LMV master_mdt_index is -1 + * + * NB, mkdir by QoS only if parent is not striped, this is to avoid remote + * directories under striped directory. + */ +static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data) +{ + if (op_data->op_code != LUSTRE_OPC_MKDIR) + return false; + + if (lmv_dir_striped(op_data->op_mea1)) + return false; - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + if (lmv_op_user_qos_mkdir(op_data)) + return true; - CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, - PFID(&oinfo->lmo_fid)); + if (lmv_op_default_qos_mkdir(op_data)) + return true; - return tgt; + return false; } - -/** - * Locate mdt by fid or name - * - * For striped directory, it will locate the stripe by name hash, if hash_type - * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' - * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' - * indicates whether old or new layout is used to locate. - * - * For normal direcotry, it will locate MDS by FID directly. - * - * \param[in] lmv LMV device - * \param[in/out] op_data client MD stack parameters, name, namelen etc, - * op_mds and op_fid1 will be updated if op_mea1 - * indicates fid1 represents a striped directory. - * \param[out] fid object FID used to locate MDS. - * - * retval pointer to the lmv_tgt_desc if succeed. - * ERR_PTR(errno) if failed. +/* if parent default LMV is space balanced, and + * 1. max_inherit_rr is set + * 2. or parent is ROOT + * mkdir roundrobin. Or if parent doesn't have default LMV, while ROOT default + * LMV requests roundrobin mkdir, do the same. + * NB, this needs to check server is balanced, which is done by caller. 
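Read together, the lmv_op_*_mkdir() helpers above and the branch in lmv_create() further down form a precedence ladder for choosing the MDT of a new directory. The self-contained sketch below condenses that ladder; struct mkdir_req and its boolean fields are loose stand-ins for the md_op_data and default-LMV state the real predicates inspect, and the strings only name which path would be taken.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the state the lmv_op_*_mkdir() helpers inspect. */
struct mkdir_req {
    bool user_set_mdt;      /* 'lfs mkdir -i <n>' with a real index */
    bool default_set_mdt;   /* parent default LMV names a master MDT */
    bool user_qos;          /* 'lfs mkdir -i -1' */
    bool default_qos;       /* ROOT/parent default LMV is space balanced */
    bool default_rr;        /* max_inherit_rr set or parent is ROOT */
    bool parent_striped;    /* QoS mkdir is skipped under striped parents */
};

static const char *pick_mdt(const struct mkdir_req *r, bool mdts_balanced)
{
    if (r->user_set_mdt)
        return "MDT named by lum_stripe_offset";
    if (r->default_set_mdt)
        return "MDT named by the parent default LMV";
    if ((r->user_qos || r->default_qos) && !r->parent_striped) {
        if (!mdts_balanced)
            return "QoS pick (weighted by free space)";
        if (r->user_qos || r->default_rr)
            return "round-robin pick";
    }
    return "parent directory's MDT";
}

int main(void)
{
    struct mkdir_req qos_req = { .user_qos = true };
    struct mkdir_req plain_req = { 0 };

    printf("'lfs mkdir -i -1', balanced space : %s\n", pick_mdt(&qos_req, true));
    printf("'lfs mkdir -i -1', imbalanced     : %s\n", pick_mdt(&qos_req, false));
    printf("plain mkdir                       : %s\n", pick_mdt(&plain_req, true));
    return 0;
}

With balanced MDTs an 'lfs mkdir -i -1' request still falls through to round-robin while a plain mkdir stays on the parent's MDT, which is the behaviour the -EAGAIN fallback in lmv_create() encodes.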
*/ -struct lmv_tgt_desc* -lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid) +static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_oinfo *oinfo; - struct lmv_tgt_desc *tgt; + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; - /* During creating VOLATILE file, it should honor the mdt - * index if the file under striped dir is being restored, see - * ct_restore(). */ - if (op_data->op_bias & MDS_CREATE_VOLATILE && - (int)op_data->op_mds != -1) { - tgt = lmv_get_target(lmv, op_data->op_mds, NULL); - if (IS_ERR(tgt)) - return tgt; - - if (lsm) { - int i; + if (!lmv_op_default_qos_mkdir(op_data)) + return false; - /* refill the right parent fid */ - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - oinfo = &lsm->lsm_md_oinfo[i]; - if (oinfo->lmo_mds == op_data->op_mds) { - *fid = oinfo->lmo_fid; - break; - } - } + return (op_data->op_flags & MF_RR_MKDIR) || + (lsm && lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE) || + fid_is_root(&op_data->op_fid1); +} - if (i == lsm->lsm_md_stripe_count) - *fid = lsm->lsm_md_oinfo[0].lmo_fid; - } - } else if (lmv_is_dir_bad_hash(lsm)) { - LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); - oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; +/* 'lfs mkdir -i ' */ +static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; - *fid = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - } else { - tgt = __lmv_locate_tgt(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds, - op_data->op_post_migrate); - } + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_cli_flags & CLI_SET_MEA && lum && + (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) && + le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} - return tgt; +/* parent default LMV master_mdt_index is not -1. */ +static inline bool +lmv_op_default_specific_mkdir(const struct md_op_data *op_data) +{ + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_default_mea1 && + op_data->op_default_mea1->lsm_md_master_mdt_index != + LMV_OFFSET_DEFAULT; } int lmv_create(struct obd_export *exp, struct md_op_data *op_data, @@ -1682,67 +1819,87 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; - if (!lmv->desc.ld_active_tgt_count) + if (!lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count) RETURN(-EIO); - if (lmv_is_dir_bad_hash(op_data->op_mea1)) + if (lmv_dir_bad_hash(op_data->op_mea1)) RETURN(-EBADF); - if (lmv_is_dir_migrating(op_data->op_mea1)) { + if (lmv_dir_migrating(op_data->op_mea1)) { /* * if parent is migrating, create() needs to lookup existing - * name, to avoid creating new file under old layout of - * migrating directory, check old layout here. + * name in both old and new layout, check old layout on client. 
*/ - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } - + rc = lmv_migrate_existence_check(lmv, op_data); if (rc != -ENOENT) RETURN(rc); op_data->op_post_migrate = true; } - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - CDEBUG(D_INODE, "CREATE name '%.*s' on "DFID" -> mds #%x\n", - (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), op_data->op_mds); - - rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc) - RETURN(rc); - - if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { - /* Send the create request to the MDT where the object - * will be located */ - tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (lmv_op_user_specific_mkdir(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_default_specific_mkdir(op_data)) { + op_data->op_mds = + op_data->op_default_mea1->lsm_md_master_mdt_index; + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_qos_mkdir(op_data)) { + struct lmv_tgt_desc *tmp = tgt; + + tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds, + op_data->op_dir_depth); + if (tgt == ERR_PTR(-EAGAIN)) { + if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) && + !lmv_op_default_rr_mkdir(op_data) && + !lmv_op_user_qos_mkdir(op_data)) + /* if it's not necessary, don't create remote + * directory. + */ + tgt = tmp; + else + tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds); + } if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - op_data->op_mds = tgt->ltd_idx; - } else { - CDEBUG(D_CONFIG, "Server doesn't support striped dirs\n"); + /* + * only update statfs after QoS mkdir, this means the cached + * statfs may be stale, and current mkdir may not follow QoS + * accurately, but it's not serious, and avoids periodic statfs + * when client doesn't mkdir by QoS. 
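lmv_statfs_check_update() itself is defined earlier in the patch and is not shown in this hunk; what the comment above relies on is an age-gated refresh of each target's cached statfs, bounded by the qos_maxage tunable exported from lproc_lmv.c below. The following is only a minimal userspace model of that pattern, with placeholder types and a synchronous update standing in for the asynchronous OBD_STATFS the client would really send.

#include <stdio.h>
#include <time.h>

struct tgt_statfs {
    unsigned long long bavail;  /* cached free blocks */
    time_t age;                 /* when the cache was last refreshed */
};

/* Refresh the cached statfs only if it is older than 'maxage' seconds.
 * In the kernel this would fire an asynchronous statfs RPC instead. */
static void statfs_check_update(struct tgt_statfs *st, time_t maxage,
                                unsigned long long fresh_bavail)
{
    time_t now = time(NULL);

    if (now - st->age < maxage)
        return;                 /* still fresh enough for QoS decisions */

    st->bavail = fresh_bavail;
    st->age = now;
}

int main(void)
{
    struct tgt_statfs st = { .bavail = 1000, .age = time(NULL) };

    statfs_check_update(&st, 60, 500);  /* too recent: keeps the cached 1000 */
    printf("bavail = %llu\n", st.bavail);
    return 0;
}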
+ */ + lmv_statfs_check_update(obd, tgt); } - CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n", - PFID(&op_data->op_fid2), op_data->op_mds); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid2), PFID(&op_data->op_fid1), + op_data->op_mds); op_data->op_flags |= MF_MDC_CANCEL_FID1; rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, @@ -1760,20 +1917,21 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, const union ldlm_policy_data *policy, struct md_op_data *op_data, struct lustre_handle *lockh, __u64 extra_lock_flags) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; CDEBUG(D_INODE, "ENQUEUE on "DFID"\n", PFID(&op_data->op_fid1)); - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n", - PFID(&op_data->op_fid1), tgt->ltd_idx); + PFID(&op_data->op_fid1), tgt->ltd_index); rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh, extra_lock_flags); @@ -1794,13 +1952,13 @@ lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, ENTRY; retry: - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), tgt->ltd_idx); + PFID(&op_data->op_fid1), tgt->ltd_index); rc = md_getattr_name(tgt->ltd_exp, op_data, preq); if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { @@ -1851,12 +2009,12 @@ static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, RETURN(0); if (tgt == NULL) { - tgt = lmv_find_target(lmv, fid); + tgt = lmv_fid2tgt(lmv, fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } - if (tgt->ltd_idx != op_tgt) { + if (tgt->ltd_index != op_tgt) { CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); policy.l_inodebits.bits = bits; rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, @@ -1895,39 +2053,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; - - /* - * avoid creating new file under old layout of migrating - * directory, check it here. 
- */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, false); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } - - if (rc != -ENOENT) - RETURN(rc); - } - - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, true); + tgt = lmv_locate_tgt2(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1935,7 +2061,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, * Cancel UPDATE lock on child (fid1). */ op_data->op_flags |= MF_MDC_CANCEL_FID2; - rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc != 0) RETURN(rc); @@ -1971,11 +2097,11 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); + parent_tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); if (IS_ERR(parent_tgt)) RETURN(PTR_ERR(parent_tgt)); - if (lsm) { + if (lmv_dir_striped(lsm)) { __u32 hash_type = lsm->lsm_md_hash_type; __u32 stripe_count = lsm->lsm_md_stripe_count; @@ -1983,7 +2109,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, * old stripes are appended after new stripes for migrating * directory. */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + if (lmv_dir_migrating(lsm)) { hash_type = lsm->lsm_md_migrate_hash; stripe_count -= lsm->lsm_md_migrate_offset; } @@ -1993,21 +2119,20 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (rc < 0) RETURN(rc); - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) + if (lmv_dir_migrating(lsm)) rc += lsm->lsm_md_migrate_offset; /* save it in fid4 temporarily for early cancel */ op_data->op_fid4 = lsm->lsm_md_oinfo[rc].lmo_fid; - sp_tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[rc].lmo_mds, - NULL); - if (IS_ERR(sp_tgt)) - RETURN(PTR_ERR(sp_tgt)); + sp_tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[rc].lmo_mds); + if (!sp_tgt) + RETURN(-ENODEV); /* * if parent is being migrated too, fill op_fid2 with target * stripe fid, otherwise the target stripe is not created yet. 
*/ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + if (lmv_dir_migrating(lsm)) { hash_type = lsm->lsm_md_hash_type & ~LMV_HASH_FLAG_MIGRATION; stripe_count = lsm->lsm_md_migrate_offset; @@ -2018,24 +2143,32 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); op_data->op_fid2 = lsm->lsm_md_oinfo[rc].lmo_fid; - tp_tgt = lmv_get_target(lmv, - lsm->lsm_md_oinfo[rc].lmo_mds, - NULL); - if (IS_ERR(tp_tgt)) - RETURN(PTR_ERR(tp_tgt)); + tp_tgt = lmv_tgt(lmv, lsm->lsm_md_oinfo[rc].lmo_mds); + if (!tp_tgt) + RETURN(-ENODEV); } } else { sp_tgt = parent_tgt; } - child_tgt = lmv_find_target(lmv, &op_data->op_fid3); + child_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); if (IS_ERR(child_tgt)) RETURN(PTR_ERR(child_tgt)); - if (!S_ISDIR(op_data->op_mode) && tp_tgt) - rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx); - else - rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); + /* for directory, migrate to MDT specified by lum_stripe_offset; + * otherwise migrate to the target stripe of parent, but parent + * directory may have finished migration (normally current file too), + * allocate FID on MDT lum_stripe_offset, and server will check + * whether file was migrated already. + */ + if (S_ISDIR(op_data->op_mode) || !tp_tgt) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + } else { + op_data->op_mds = tp_tgt->ltd_index; + } + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); if (rc) RETURN(rc); @@ -2051,7 +2184,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, */ if (S_ISDIR(op_data->op_mode) && (exp_connect_flags2(exp) & OBD_CONNECT2_DIR_MIGRATE)) { - tgt = lmv_find_target(lmv, &target_fid); + tgt = lmv_fid2tgt(lmv, &target_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } else { @@ -2059,7 +2192,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, } /* cancel UPDATE lock of parent master object */ - rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc) RETURN(rc); @@ -2084,14 +2217,14 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fid4 = target_fid; /* cancel UPDATE locks of target parent */ - rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); if (rc) RETURN(rc); /* cancel LOOKUP lock of source if source is remote object */ if (child_tgt != sp_tgt) { - rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); if (rc) @@ -2099,7 +2232,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, } /* cancel ELC locks of source */ - rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); if (rc) RETURN(rc); @@ -2136,44 +2269,10 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; - - /* - * we avoid creating new file 
under old layout of migrating - * directory, if there is an existing file with new name under - * old layout, we can't unlink file in old layout and rename to - * new layout in one transaction, so return -EBUSY here.` - */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, - false); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - op_data->op_name = new; - op_data->op_namelen = newlen; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - op_data->op_name = NULL; - op_data->op_namelen = 0; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EBUSY); - } - - if (rc != -ENOENT) - RETURN(rc); - } + op_data->op_name = new; + op_data->op_namelen = newlen; - /* rename to new layout for migrating directory */ - tp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, true); + tp_tgt = lmv_locate_tgt2(lmv, op_data); if (IS_ERR(tp_tgt)) RETURN(PTR_ERR(tp_tgt)); @@ -2183,7 +2282,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, * target child does not exist, then it will send the request to the * target parent */ if (fid_is_sane(&op_data->op_fid4)) { - tgt = lmv_find_target(lmv, &op_data->op_fid4); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } else { @@ -2193,7 +2292,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, op_data->op_flags |= MF_MDC_CANCEL_FID4; /* cancel UPDATE locks of target parent */ - rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); if (rc != 0) RETURN(rc); @@ -2202,7 +2301,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, /* cancel LOOKUP lock of target on target parent */ if (tgt != tp_tgt) { rc = lmv_early_cancel(exp, tp_tgt, op_data, - tgt->ltd_idx, LCK_EX, + tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID4); if (rc != 0) @@ -2211,27 +2310,27 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, } if (fid_is_sane(&op_data->op_fid3)) { - src_tgt = lmv_find_target(lmv, &op_data->op_fid3); + src_tgt = lmv_fid2tgt(lmv, &op_data->op_fid3); if (IS_ERR(src_tgt)) RETURN(PTR_ERR(src_tgt)); /* cancel ELC locks of source */ - rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx, + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); if (rc != 0) RETURN(rc); } + op_data->op_name = old; + op_data->op_namelen = oldlen; retry: - sp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea1, old, oldlen, - &op_data->op_fid1, &op_data->op_mds, - op_data->op_post_migrate); + sp_tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(sp_tgt)) RETURN(PTR_ERR(sp_tgt)); /* cancel UPDATE locks of source parent */ - rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc != 0) RETURN(rc); @@ -2240,7 +2339,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, /* cancel LOOKUP lock of source on source parent */ if (src_tgt != sp_tgt) { rc = lmv_early_cancel(exp, sp_tgt, op_data, - tgt->ltd_idx, LCK_EX, + tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); if (rc != 
0) @@ -2277,7 +2376,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, ptlrpc_req_finished(*request); *request = NULL; - tgt = lmv_find_target(lmv, &op_data->op_fid4); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid4); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2285,7 +2384,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, /* cancel LOOKUP lock of target on target parent */ if (tgt != tp_tgt) { rc = lmv_early_cancel(exp, tp_tgt, op_data, - tgt->ltd_idx, LCK_EX, + tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID4); if (rc != 0) @@ -2299,10 +2398,11 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, void *ea, size_t ealen, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc = 0; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + ENTRY; CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x/0x%x\n", @@ -2310,7 +2410,7 @@ static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, op_data->op_xvalid); op_data->op_flags |= MF_MDC_CANCEL_FID1; - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2322,13 +2422,14 @@ static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; - tgt = lmv_find_target(lmv, fid); + tgt = lmv_fid2tgt(lmv, fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2437,9 +2538,9 @@ static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, break; } - tgt = lmv_get_target(ctxt->ldc_lmv, oinfo->lmo_mds, NULL); - if (IS_ERR(tgt)) { - rc = PTR_ERR(tgt); + tgt = lmv_tgt(ctxt->ldc_lmv, oinfo->lmo_mds); + if (!tgt) { + rc = -ENODEV; break; } @@ -2480,17 +2581,18 @@ static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; rc = lmv_check_connect(obd); if (rc != 0) RETURN(rc); - tgt = lmv_find_target(lmv, &data->op_fid1); + tgt = lmv_fid2tgt(lmv, &data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2696,19 +2798,19 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, struct md_callback *cb_op, __u64 offset, struct page **ppage) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; - if (unlikely(lsm != NULL)) { + if (unlikely(lmv_dir_striped(op_data->op_mea1))) { rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); RETURN(rc); } - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_fid2tgt(lmv, 
&op_data->op_fid1); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2758,12 +2860,12 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, op_data->op_cap = cfs_curproc_cap_pack(); retry: - parent_tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + parent_tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(parent_tgt)) RETURN(PTR_ERR(parent_tgt)); if (likely(!fid_is_zero(&op_data->op_fid2))) { - tgt = lmv_find_target(lmv, &op_data->op_fid2); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } else { @@ -2780,17 +2882,18 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; if (parent_tgt != tgt) - rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); - rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); if (rc) RETURN(rc); CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", - PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + tgt->ltd_index); rc = md_unlink(tgt->ltd_exp, op_data, request); if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { @@ -2815,7 +2918,7 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, ptlrpc_req_finished(*request); *request = NULL; - tgt = lmv_find_target(lmv, &op_data->op_fid2); + tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2851,31 +2954,24 @@ static int lmv_precleanup(struct obd_device *obd) static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, __u32 keylen, void *key, __u32 *vallen, void *val) { - struct obd_device *obd; - struct lmv_obd *lmv; - int rc = 0; - ENTRY; - - obd = class_exp2obd(exp); - if (obd == NULL) { - CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } + struct obd_device *obd; + struct lmv_obd *lmv; + struct lu_tgt_desc *tgt; + int rc = 0; - lmv = &obd->u.lmv; - if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { - int i; + ENTRY; - LASSERT(*vallen == sizeof(__u32)); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - /* - * All tgts should be connected when this gets called. - */ - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + LASSERT(*vallen == sizeof(__u32)); + lmv_foreach_connected_tgt(lmv, tgt) { if (!obd_get_info(env, tgt->ltd_exp, keylen, key, vallen, val)) RETURN(0); @@ -2888,18 +2984,21 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, * Forwarding this request to first MDS, it should know LOV * desc. 
*/ - rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key, - vallen, val); + tgt = lmv_tgt(lmv, 0); + if (!tgt) + RETURN(-ENODEV); + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, vallen, val); if (!rc && KEY_IS(KEY_CONN_DATA)) exp->exp_connect_data = *(struct obd_connect_data *)val; - RETURN(rc); - } else if (KEY_IS(KEY_TGT_COUNT)) { - *((int *)val) = lmv->desc.ld_tgt_count; - RETURN(0); - } + RETURN(rc); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->lmv_mdt_descs.ltd_tgts_size; + RETURN(0); + } - CDEBUG(D_IOCTL, "Invalid key\n"); - RETURN(-EINVAL); + CDEBUG(D_IOCTL, "Invalid key\n"); + RETURN(-EINVAL); } static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, @@ -2908,7 +3007,8 @@ static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, struct obd_device *obddev = class_exp2obd(exp); struct ptlrpc_request_set *set = _set; struct lmv_obd *lmv = &obddev->u.lmv; - int tgt_count = lmv->desc.ld_tgt_count; + int tgt_count = lmv->lmv_mdt_count; + struct lu_tgt_desc *tgt; struct fid_array *fat, **fas = NULL; int i, rc, **rcs = NULL; @@ -2950,11 +3050,11 @@ static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, fat->fa_fids[fat->fa_nr++] = fa->fa_fids[i]; } - for (i = 0; i < tgt_count; i++) { - fat = fas[i]; + lmv_foreach_connected_tgt(lmv, tgt) { + fat = fas[tgt->ltd_index]; if (!fat || fat->fa_nr == 0) continue; - rc = md_rmfid(lmv->tgts[i]->ltd_exp, fat, rcs[i], set); + rc = md_rmfid(tgt->ltd_exp, fat, rcs[tgt->ltd_index], set); } rc = ptlrpc_set_wait(NULL, set); @@ -3028,14 +3128,9 @@ int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || KEY_IS(KEY_DEFAULT_EASIZE)) { - int i, err = 0; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv->tgts[i]; - - if (tgt == NULL || tgt->ltd_exp == NULL) - continue; + int err = 0; + lmv_foreach_connected_tgt(lmv, tgt) { err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, vallen, val, set); if (err && rc == 0) @@ -3087,7 +3182,7 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, * set default value -1, so lmv_locate_tgt() knows this stripe * target is not initialized. 
*/ - lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1; + lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT; if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) continue; @@ -3106,6 +3201,21 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, RETURN(rc); } +static inline int lmv_unpack_user_md(struct obd_export *exp, + struct lmv_stripe_md *lsm, + const struct lmv_user_md *lmu) +{ + lsm->lsm_md_magic = le32_to_cpu(lmu->lum_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmu->lum_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmu->lum_stripe_offset); + lsm->lsm_md_hash_type = le32_to_cpu(lmu->lum_hash_type); + lsm->lsm_md_max_inherit = lmu->lum_max_inherit; + lsm->lsm_md_max_inherit_rr = lmu->lum_max_inherit_rr; + lsm->lsm_md_pool_name[LOV_MAXPOOLNAME] = 0; + + return 0; +} + static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, const union lmv_mds_md *lmm, size_t lmm_size) { @@ -3122,11 +3232,15 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, if (lsm != NULL && lmm == NULL) { int i; - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - if (lsm->lsm_md_oinfo[i].lmo_root) - iput(lsm->lsm_md_oinfo[i].lmo_root); + if (lmv_dir_striped(lsm)) { + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + if (lsm->lsm_md_oinfo[i].lmo_root) + iput(lsm->lsm_md_oinfo[i].lmo_root); + } + lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + } else { + lsm_size = lmv_stripe_md_size(0); } - lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); OBD_FREE(lsm, lsm_size); *lsmp = NULL; RETURN(0); @@ -3153,7 +3267,6 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, */ lsm_size = lmv_stripe_md_size(0); - lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); if (lsm == NULL) { OBD_ALLOC(lsm, lsm_size); if (lsm == NULL) @@ -3166,6 +3279,9 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, case LMV_MAGIC_V1: rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); break; + case LMV_USER_MAGIC: + rc = lmv_unpack_user_md(exp, lsm, &lmm->lmv_user_md); + break; default: CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic)); @@ -3193,17 +3309,16 @@ static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, void *opaque) { struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lu_tgt_desc *tgt; + int err; int rc = 0; - __u32 i; + ENTRY; LASSERT(fid != NULL); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - int err; - - if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_active) continue; err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags, @@ -3218,9 +3333,10 @@ static int lmv_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, void *data, __u64 *bits) { - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - int rc; + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + int rc; + ENTRY; if (tgt == NULL || tgt->ltd_exp == NULL) @@ -3234,37 +3350,38 @@ enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags, union ldlm_policy_data *policy, enum ldlm_mode mode, struct lustre_handle *lockh) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - enum ldlm_mode rc; - int tgt; - int i; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + enum ldlm_mode 
rc; + struct lu_tgt_desc *tgt; + int i; + int index; + ENTRY; CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); - /* + /* * With DNE every object can have two locks in different namespaces: * lookup lock in space of MDT storing direntry and update/open lock in * space of MDT storing inode. Try the MDT that the FID maps to first, * since this can be easily found, and only try others if that fails. */ - for (i = 0, tgt = lmv_find_target_index(lmv, fid); - i < lmv->desc.ld_tgt_count; - i++, tgt = (tgt + 1) % lmv->desc.ld_tgt_count) { - if (tgt < 0) { + for (i = 0, index = lmv_fid2tgt_index(lmv, fid); + i < lmv->lmv_mdt_descs.ltd_tgts_size; + i++, index = (index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size) { + if (index < 0) { CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", - obd->obd_name, PFID(fid), tgt); - tgt = 0; + obd->obd_name, PFID(fid), index); + index = 0; } - if (lmv->tgts[tgt] == NULL || - lmv->tgts[tgt]->ltd_exp == NULL || - lmv->tgts[tgt]->ltd_active == 0) + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) continue; - rc = md_lock_match(lmv->tgts[tgt]->ltd_exp, flags, fid, - type, policy, mode, lockh); + rc = md_lock_match(tgt->ltd_exp, flags, fid, type, policy, mode, + lockh); if (rc) RETURN(rc); } @@ -3276,41 +3393,47 @@ int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, struct obd_export *dt_exp, struct obd_export *md_exp, struct lustre_md *md) { - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); - if (tgt == NULL || tgt->ltd_exp == NULL) - RETURN(-EINVAL); + if (!tgt || !tgt->ltd_exp) + return -EINVAL; - return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md); + return md_get_lustre_md(tgt->ltd_exp, req, dt_exp, md_exp, md); } int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + ENTRY; + if (md->default_lmv) { + lmv_free_memmd(md->default_lmv); + md->default_lmv = NULL; + } if (md->lmv != NULL) { lmv_free_memmd(md->lmv); md->lmv = NULL; } - if (tgt == NULL || tgt->ltd_exp == NULL) + if (!tgt || !tgt->ltd_exp) RETURN(-EINVAL); - RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md)); + RETURN(md_free_lustre_md(tgt->ltd_exp, md)); } int lmv_set_open_replay_data(struct obd_export *exp, struct obd_client_handle *och, struct lookup_intent *it) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ENTRY; - tgt = lmv_find_target(lmv, &och->och_fid); + tgt = lmv_fid2tgt(lmv, &och->och_fid); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -3318,18 +3441,19 @@ int lmv_set_open_replay_data(struct obd_export *exp, } int lmv_clear_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och) + struct obd_client_handle *och) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + + ENTRY; - tgt = lmv_find_target(lmv, &och->och_fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + tgt = lmv_fid2tgt(lmv, 
&och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); - RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); + RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); } int lmv_intent_getattr_async(struct obd_export *exp, @@ -3347,11 +3471,11 @@ int lmv_intent_getattr_async(struct obd_export *exp, if (!fid_is_sane(&op_data->op_fid2)) RETURN(-EINVAL); - ptgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + ptgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(ptgt)) RETURN(PTR_ERR(ptgt)); - ctgt = lmv_find_target(lmv, &op_data->op_fid2); + ctgt = lmv_fid2tgt(lmv, &op_data->op_fid2); if (IS_ERR(ctgt)) RETURN(PTR_ERR(ctgt)); @@ -3367,20 +3491,21 @@ int lmv_intent_getattr_async(struct obd_export *exp, } int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, - struct lu_fid *fid, __u64 *bits) + struct lu_fid *fid, __u64 *bits) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - ENTRY; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + ENTRY; + + tgt = lmv_fid2tgt(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); - rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); - RETURN(rc); + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + RETURN(rc); } int lmv_get_fid_from_lsm(struct obd_export *exp, @@ -3389,7 +3514,8 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, { const struct lmv_oinfo *oinfo; - LASSERT(lsm != NULL); + LASSERT(lmv_dir_striped(lsm)); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); if (IS_ERR(oinfo)) return PTR_ERR(oinfo); @@ -3407,49 +3533,46 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, struct obd_quotactl *oqctl) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - int rc = 0; - __u32 i; - __u64 curspace, curinodes; + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv_tgt(lmv, 0); + __u64 curspace, curinodes; + int rc = 0; + ENTRY; - if (tgt == NULL || - tgt->ltd_exp == NULL || - !tgt->ltd_active || - lmv->desc.ld_tgt_count == 0) { + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) { CERROR("master lmv inactive\n"); RETURN(-EIO); } - if (oqctl->qc_cmd != Q_GETOQUOTA) { - rc = obd_quotactl(tgt->ltd_exp, oqctl); - RETURN(rc); - } + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); + } - curspace = curinodes = 0; - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + curspace = curinodes = 0; + lmv_foreach_connected_tgt(lmv, tgt) { int err; - tgt = lmv->tgts[i]; - if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) + if (!tgt->ltd_active) continue; - err = obd_quotactl(tgt->ltd_exp, oqctl); - if (err) { - CERROR("getquota on mdt %d failed. %d\n", i, err); - if (!rc) - rc = err; - } else { - curspace += oqctl->qc_dqblk.dqb_curspace; - curinodes += oqctl->qc_dqblk.dqb_curinodes; - } - } - oqctl->qc_dqblk.dqb_curspace = curspace; - oqctl->qc_dqblk.dqb_curinodes = curinodes; + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. 
%d\n", + tgt->ltd_index, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; - RETURN(rc); + RETURN(rc); } static int lmv_merge_attr(struct obd_export *exp, @@ -3460,6 +3583,9 @@ static int lmv_merge_attr(struct obd_export *exp, int rc; int i; + if (!lmv_dir_striped(lsm)) + return 0; + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); if (rc < 0) return rc; diff --git a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c index dc35e7d9d9e66..aed88d0f74157 100644 --- a/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustrefsx/lustre/lmv/lproc_lmv.c @@ -44,10 +44,8 @@ static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - struct lmv_desc *desc; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_tgt_count); + return sprintf(buf, "%u\n", dev->u.lmv.lmv_mdt_count); } LUSTRE_RO_ATTR(numobd); @@ -56,10 +54,9 @@ static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - struct lmv_desc *desc; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_active_tgt_count); + return sprintf(buf, "%u\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count); } LUSTRE_RO_ATTR(activeobd); @@ -68,26 +65,154 @@ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - struct lmv_desc *desc; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%s\n", desc->ld_uuid.uuid); + return sprintf(buf, "%s\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_uuid.uuid); } LUSTRE_RO_ATTR(desc_uuid); +static ssize_t qos_maxage_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage); +} + +static ssize_t qos_maxage_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); + if (rc) + return rc; + + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = val; + + return count; +} +LUSTRE_RW_ATTR(qos_maxage); + +static ssize_t qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u%%\n", + (dev->u.lmv.lmv_qos.lq_prio_free * 100 + 255) >> 8); +} + +static ssize_t qos_prio_free_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &dev->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + 
lmv->lmv_qos.lq_prio_free = (val << 8) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + set_bit(LQ_RESET, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_prio_free); + +static ssize_t qos_threshold_rr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return sprintf(buf, "%u%%\n", + (dev->u.lmv.lmv_qos.lq_threshold_rr * 100 + 255) >> 8); +} + +static ssize_t qos_threshold_rr_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *dev = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct lmv_obd *lmv = &dev->u.lmv; + char buf[6], *tmp; + unsigned int val; + int rc; + + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + if (val > 100) + return -EINVAL; + + lmv->lmv_qos.lq_threshold_rr = (val << 8) / 100; + set_bit(LQ_DIRTY, &lmv->lmv_qos.lq_flags); + + return count; +} +LUSTRE_RW_ATTR(qos_threshold_rr); + #ifdef CONFIG_PROC_FS static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) { - struct obd_device *dev = p->private; - struct lmv_obd *lmv = &dev->u.lmv; + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + struct lu_tgt_desc *tgt; + + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; - while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos]) - return lmv->tgts[*pos]; ++*pos; } - return NULL; + return NULL; } static void lmv_tgt_seq_stop(struct seq_file *p, void *v) @@ -96,17 +221,20 @@ static void lmv_tgt_seq_stop(struct seq_file *p, void *v) static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) { - struct obd_device *dev = p->private; - struct lmv_obd *lmv = &dev->u.lmv; + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + struct lu_tgt_desc *tgt; ++*pos; - while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos]) - return lmv->tgts[*pos]; + while (*pos < lmv->lmv_mdt_descs.ltd_tgts_size) { + tgt = lmv_tgt(lmv, (__u32)*pos); + if (tgt) + return tgt; + ++*pos; } - return NULL; + return NULL; } static int lmv_tgt_seq_show(struct seq_file *p, void *v) @@ -117,7 +245,7 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v) return 0; seq_printf(p, "%u: %s %sACTIVE\n", - tgt->ltd_idx, tgt->ltd_uuid.uuid, + tgt->ltd_index, tgt->ltd_uuid.uuid, tgt->ltd_active ? 
"" : "IN"); return 0; } @@ -156,6 +284,9 @@ static struct attribute *lmv_attrs[] = { &lustre_attr_activeobd.attr, &lustre_attr_desc_uuid.attr, &lustre_attr_numobd.attr, + &lustre_attr_qos_maxage.attr, + &lustre_attr_qos_prio_free.attr, + &lustre_attr_qos_threshold_rr.attr, NULL, }; diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h index a1cbea9a5c4d4..c4ea3804db4ba 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_internal.h +++ b/drivers/staging/lustrefsx/lustre/lov/lov_internal.h @@ -221,7 +221,7 @@ void lsm_free(struct lov_stripe_md *lsm); struct pool_desc { char pool_name[LOV_MAXPOOLNAME + 1]; - struct ost_pool pool_obds; + struct lu_tgt_pool pool_obds; atomic_t pool_refcount; struct hlist_node pool_hash; /* access by poolname */ struct list_head pool_list; /* serial access */ @@ -321,12 +321,12 @@ extern struct lu_device_type lov_device_type; /* pools */ extern struct cfs_hash_ops pool_hash_operations; -/* ost_pool methods */ -int lov_ost_pool_init(struct ost_pool *op, unsigned int count); -int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); -int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); -int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); -int lov_ost_pool_free(struct ost_pool *op); +/* lu_tgt_pool methods */ +int lov_ost_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lov_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); +int lov_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx); +int lov_ost_pool_free(struct lu_tgt_pool *op); /* high level pool methods */ int lov_pool_new(struct obd_device *obd, char *poolname); diff --git a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c index 6173dbe1429ae..225ba9391cf19 100644 --- a/drivers/staging/lustrefsx/lustre/lov/lov_pool.c +++ b/drivers/staging/lustrefsx/lustre/lov/lov_pool.c @@ -314,7 +314,7 @@ void lov_dump_pool(int level, struct pool_desc *pool) } #define LOV_POOL_INIT_COUNT 2 -int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +int lov_ost_pool_init(struct lu_tgt_pool *op, unsigned int count) { ENTRY; @@ -334,7 +334,7 @@ int lov_ost_pool_init(struct ost_pool *op, unsigned int count) } /* Caller must hold write op_rwlock */ -int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +int lov_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) { __u32 *new; __u32 new_size; @@ -358,7 +358,7 @@ int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) return 0; } -int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +int lov_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) { int rc = 0, i; ENTRY; @@ -383,7 +383,7 @@ int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) return rc; } -int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +int lov_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx) { int i; ENTRY; @@ -405,7 +405,7 @@ int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) RETURN(-EINVAL); } -int lov_ost_pool_free(struct ost_pool *op) +int lov_ost_pool_free(struct lu_tgt_pool *op) { ENTRY; diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c index 1c1e54b87590f..11021e8d89dc6 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c +++ 
b/drivers/staging/lustrefsx/lustre/mdc/mdc_locks.c @@ -501,15 +501,16 @@ static struct ptlrpc_request * mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, struct md_op_data *op_data, __u32 acl_bufsize) { - struct ptlrpc_request *req; - struct obd_device *obddev = class_exp2obd(exp); - u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | - OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA | - OBD_MD_MEA | OBD_MD_FLACL; - struct ldlm_intent *lit; - int rc; - __u32 easize; - bool have_secctx = false; + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE | + OBD_MD_FLDIREA | OBD_MD_MEA | OBD_MD_FLACL | + OBD_MD_DEFAULT_MEA; + struct ldlm_intent *lit; + __u32 easize; + bool have_secctx = false; + int rc; + ENTRY; req = ptlrpc_request_alloc(class_exp2cliimp(exp), @@ -552,6 +553,8 @@ mdc_intent_getattr_pack(struct obd_export *exp, struct lookup_intent *it, req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, acl_bufsize); + req_capsule_set_size(&req->rq_pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER, + sizeof(struct lmv_user_md)); if (have_secctx) { char *secctx_name; diff --git a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c index 5a29a285e5943..c91b65eddf39b 100644 --- a/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustrefsx/lustre/mdc/mdc_request.c @@ -586,14 +586,21 @@ int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, GOTO(out, rc = -EPROTO); } - lmv_size = md->body->mbo_eadatasize; - if (lmv_size == 0) { - CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " - "but eadatasize 0\n"); - RETURN(-EPROTO); + if (md_exp->exp_obd->obd_type->typ_lu == &mdc_device_type) { + CERROR("%s: no LMV, upgrading from old version?\n", + md_exp->exp_obd->obd_name); + + GOTO(out_acl, rc = 0); } if (md->body->mbo_valid & OBD_MD_MEA) { + lmv_size = md->body->mbo_eadatasize; + if (lmv_size == 0) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " + "but eadatasize 0\n"); + RETURN(-EPROTO); + } + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmv_size); if (lmv == NULL) @@ -602,17 +609,33 @@ int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, rc = md_unpackmd(md_exp, &md->lmv, lmv, lmv_size); if (rc < 0) GOTO(out, rc); + } - if (rc < (typeof(rc))sizeof(*md->lmv)) { - CDEBUG(D_INFO, "size too small: " - "rc < sizeof(*md->lmv) (%d < %d)\n", - rc, (int)sizeof(*md->lmv)); + /* since 2.12.58 intent_getattr fetches default LMV */ + if (md->body->mbo_valid & OBD_MD_DEFAULT_MEA) { + lmv_size = sizeof(struct lmv_user_md); + lmv = req_capsule_server_sized_get(pill, + &RMF_DEFAULT_MDT_MD, + lmv_size); + if (!lmv) + GOTO(out, rc = -EPROTO); + + rc = md_unpackmd(md_exp, &md->default_lmv, lmv, + lmv_size); + if (rc < 0) + GOTO(out, rc); + + if (rc < (int)sizeof(*md->default_lmv)) { + CDEBUG(D_INFO, + "default lmv size too small: %d < %d\n", + rc, (int)sizeof(*md->default_lmv)); GOTO(out, rc = -EPROTO); } } - } + } rc = 0; +out_acl: if (md->body->mbo_valid & OBD_MD_FLACL) { /* for ACL, it's possible that FLACL is set but aclsize is zero. 
* only when aclsize != 0 there's an actual segment for ACL @@ -1544,6 +1567,54 @@ static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, goto out_unlock; } +static int mdc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *args, int rc) +{ + struct obd_info *oinfo = args; + struct obd_statfs *osfs; + + if (!rc) { + osfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (!osfs) + return -EPROTO; + + oinfo->oi_osfs = osfs; + + CDEBUG(D_CACHE, "blocks=%llu free=%llu avail=%llu " + "objects=%llu free=%llu state=%x\n", + osfs->os_blocks, osfs->os_bfree, osfs->os_bavail, + osfs->os_files, osfs->os_ffree, osfs->os_state); + } + + oinfo->oi_cb_up(oinfo, rc); + + return rc; +} + +static int mdc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, time64_t max_age, + struct ptlrpc_request_set *unused) +{ + struct ptlrpc_request *req; + struct obd_info *aa; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) + return -ENOMEM; + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = mdc_statfs_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + *aa = *oinfo; + + ptlrpcd_add_req(req); + + return 0; +} + static int mdc_statfs(const struct lu_env *env, struct obd_export *exp, struct obd_statfs *osfs, time64_t max_age, __u32 flags) @@ -2851,26 +2922,26 @@ int mdc_process_config(struct obd_device *obd, size_t len, void *buf) } static struct obd_ops mdc_obd_ops = { - .o_owner = THIS_MODULE, - .o_setup = mdc_setup, - .o_precleanup = mdc_precleanup, - .o_cleanup = mdc_cleanup, - .o_add_conn = client_import_add_conn, - .o_del_conn = client_import_del_conn, - .o_connect = client_connect_import, + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, .o_reconnect = osc_reconnect, .o_disconnect = osc_disconnect, - .o_iocontrol = mdc_iocontrol, - .o_set_info_async = mdc_set_info_async, - .o_statfs = mdc_statfs, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_statfs_async = mdc_statfs_async, .o_fid_init = client_fid_init, .o_fid_fini = client_fid_fini, - .o_fid_alloc = mdc_fid_alloc, - .o_import_event = mdc_import_event, - .o_get_info = mdc_get_info, - .o_process_config = mdc_process_config, - .o_get_uuid = mdc_get_uuid, - .o_quotactl = mdc_quotactl, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_get_info = mdc_get_info, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, }; static struct md_ops mdc_md_ops = { diff --git a/drivers/staging/lustrefsx/lustre/obdclass/Makefile b/drivers/staging/lustrefsx/lustre/obdclass/Makefile index 449ebf4b70c86..b2db59390dd4b 100644 --- a/drivers/staging/lustrefsx/lustre/obdclass/Makefile +++ b/drivers/staging/lustrefsx/lustre/obdclass/Makefile @@ -9,5 +9,6 @@ obdclass-y += lu_object.o dt_object.o obdclass-y += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o obdclass-y += linkea.o kernelcomm.o jobid.o obdclass-y += integrity.o obd_cksum.o +obdclass-y += lu_tgt_descs.o include $(srctree)/drivers/staging/lustrefsx/Makefile.rules diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c index 8b8a12539da61..6cad0a93e7d11 100644 --- 
a/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustrefsx/lustre/obdclass/lprocfs_status.c @@ -780,7 +780,8 @@ static const char *obd_connect_names[] = { "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", - "", "", + "", + "mdll_bypass", /* 0x800000000000000 */ "mdll", /* 0x1000000000000000 */ "mdll_auto_refresh", /* 0x2000000000000000 */ "", "", @@ -1996,34 +1997,31 @@ static int str_to_u64_parse(char *buffer, unsigned long count, * have a unit as the last character. The function handles overflow/underflow * of the signed integer. */ -static int str_to_s64_internal(const char __user *buffer, unsigned long count, - __s64 *val, __u64 def_mult, bool allow_units) +int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit) { - char kernbuf[22]; + __u64 mult = 1; __u64 tmp; unsigned int offset = 0; int signed sign = 1; __u64 max = LLONG_MAX; int rc = 0; - if (count > (sizeof(kernbuf) - 1)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - - kernbuf[count] = '\0'; + if (defunit != '1') { + rc = get_mult(defunit, &mult); + if (rc) + return rc; + } /* keep track of our sign */ - if (*kernbuf == '-') { + if (*buffer == '-') { sign = -1; offset++; /* equivalent to max = -LLONG_MIN, avoids overflow */ max++; } - rc = str_to_u64_parse(kernbuf + offset, count - offset, - &tmp, def_mult, allow_units); + rc = str_to_u64_parse(buffer + offset, count - offset, + &tmp, mult, true); if (rc) return rc; @@ -2035,6 +2033,7 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, return 0; } +EXPORT_SYMBOL(lu_str_to_s64); /** * Convert a user string into a signed 64 bit number. This function produces @@ -2056,16 +2055,17 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit) { - __u64 mult = 1; - int rc; + char kernbuf[22]; - if (defunit != '1') { - rc = get_mult(defunit, &mult); - if (rc) - return rc; - } + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; - return str_to_s64_internal(buffer, count, val, mult, true); + return lu_str_to_s64(kernbuf, count, val, defunit); } EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); diff --git a/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c new file mode 100644 index 0000000000000..893a971e486c5 --- /dev/null +++ b/drivers/staging/lustrefsx/lustre/obdclass/lu_tgt_descs.c @@ -0,0 +1,682 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/obdclass/lu_tgt_descs.c + * + * Lustre target descriptions + * These are the only exported functions, they provide some generic + * infrastructure for target description management used by LOD/LMV + * + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include /* hash_long() */ +#include +#include +#include +#include +#include +#include + +/** + * lu_prandom_u64_max - returns a pseudo-random u64 number in interval + * [0, ep_ro) + * + * \param[in] ep_ro right open interval endpoint + * + * \retval a pseudo-random 64-bit number that is in interval [0, ep_ro). + */ +u64 lu_prandom_u64_max(u64 ep_ro) +{ + u64 rand = 0; + + if (ep_ro) { +#if BITS_PER_LONG == 32 + /* + * If ep_ro > 32-bit, first generate the high + * 32 bits of the random number, then add in the low + * 32 bits (truncated to the upper limit, if needed) + */ + if (ep_ro > 0xffffffffULL) + rand = prandom_u32_max((u32)(ep_ro >> 32)) << 32; + + if (rand == (ep_ro & 0xffffffff00000000ULL)) + rand |= prandom_u32_max((u32)ep_ro); + else + rand |= prandom_u32(); +#else + rand = ((u64)prandom_u32() << 32 | prandom_u32()) % ep_ro; +#endif + } + + return rand; +} +EXPORT_SYMBOL(lu_prandom_u64_max); + +void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + set_bit(LQ_DIRTY, &lqr->lqr_flags); +} +EXPORT_SYMBOL(lu_qos_rr_init); + +/** + * Add a new target to Quality of Service (QoS) target table. + * + * Add a new MDT/OST target to the structure representing an OSS. Resort the + * list of known MDSs/OSSs by the number of MDTs/OSTs attached to each MDS/OSS. + * The MDS/OSS list is protected internally and no external locking is required. + * + * \param[in] qos lu_qos data + * \param[in] tgt target description + * + * \retval 0 on success + * \retval -ENOMEM on error + */ +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt) +{ + struct lu_svr_qos *svr = NULL; + struct lu_svr_qos *tempsvr; + struct obd_export *exp = tgt->ltd_exp; + int found = 0; + __u32 id = 0; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + /* + * a bit hacky approach to learn NID of corresponding connection + * but there is no official API to access information like this + * with OSD API. + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (obd_uuid_equals(&svr->lsq_uuid, + &exp->exp_connection->c_remote_uuid)) { + found++; + break; + } + if (svr->lsq_id > id) + id = svr->lsq_id; + } + + if (!found) { + OBD_ALLOC_PTR(svr); + if (!svr) + GOTO(out, rc = -ENOMEM); + memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid, + sizeof(svr->lsq_uuid)); + ++id; + svr->lsq_id = id; + } else { + /* Assume we have to move this one */ + list_del(&svr->lsq_svr_list); + } + + svr->lsq_tgt_count++; + tgt->ltd_qos.ltq_svr = svr; + + CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n", + obd_uuid2str(&tgt->ltd_uuid), obd_uuid2str(&svr->lsq_uuid), + svr->lsq_tgt_count); + + /* + * Add sorted by # of tgts. Find the first entry that we're + * bigger than... + */ + list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count) + break; + } + /* + * ...and add before it. If we're the first or smallest, tempsvr + * points to the list head, and we add to the end. 
+ */ + list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list); + + set_bit(LQ_DIRTY, &qos->lq_flags); + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} +EXPORT_SYMBOL(lu_qos_add_tgt); + +/** + * Remove MDT/OST target from QoS table. + * + * Removes given MDT/OST target from QoS table and releases related + * MDS/OSS structure if no target remain on the MDS/OSS. + * + * \param[in] qos lu_qos data + * \param[in] ltd target description + * + * \retval 0 on success + * \retval -ENOENT if no server was found + */ +static int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd) +{ + struct lu_svr_qos *svr; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + svr = ltd->ltd_qos.ltq_svr; + if (!svr) + GOTO(out, rc = -ENOENT); + + svr->lsq_tgt_count--; + if (svr->lsq_tgt_count == 0) { + CDEBUG(D_OTHER, "removing server %s\n", + obd_uuid2str(&svr->lsq_uuid)); + list_del(&svr->lsq_svr_list); + ltd->ltd_qos.ltq_svr = NULL; + OBD_FREE_PTR(svr); + } + + set_bit(LQ_DIRTY, &qos->lq_flags); + set_bit(LQ_DIRTY, &qos->lq_rr.lqr_flags); +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} + +static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) +{ + struct obd_statfs *statfs = &tgt->ltd_statfs; + + return statfs->os_bavail * statfs->os_bsize; +} + +static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) +{ + return tgt->ltd_statfs.os_ffree; +} + +/** + * Calculate weight for a given tgt. + * + * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server + * penalties. See ltd_qos_penalties_calc() for how penalties are calculated. + * + * \param[in] tgt target descriptor + */ +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt) +{ + struct lu_tgt_qos *ltq = &tgt->ltd_qos; + __u64 penalty; + + ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) * + (tgt_statfs_iavail(tgt) >> 8); + penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; + if (ltq->ltq_avail < penalty) + ltq->ltq_weight = 0; + else + ltq->ltq_weight = ltq->ltq_avail - penalty; +} +EXPORT_SYMBOL(lu_tgt_qos_weight_calc); + +/** + * Allocate and initialize target table. + * + * A helper function to initialize the target table and allocate + * a bitmap of the available targets. + * + * \param[in] ltd target's table to initialize + * \param[in] is_mdt target table for MDTs + * + * \retval 0 on success + * \retval negative negated errno on error + **/ +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt) +{ + mutex_init(<d->ltd_mutex); + init_rwsem(<d->ltd_rw_sem); + + /* + * the tgt array and bitmap are allocated/grown dynamically as tgts are + * added to the LOD/LMV, see lu_tgt_descs_add() + */ + ltd->ltd_tgt_bitmap = CFS_ALLOCATE_BITMAP(BITS_PER_LONG); + if (!ltd->ltd_tgt_bitmap) + return -ENOMEM; + + ltd->ltd_tgts_size = BITS_PER_LONG; + ltd->ltd_death_row = 0; + ltd->ltd_refcount = 0; + + /* Set up allocation policy (QoS and RR) */ + INIT_LIST_HEAD(<d->ltd_qos.lq_svr_list); + init_rwsem(<d->ltd_qos.lq_rw_sem); + set_bit(LQ_DIRTY, <d->ltd_qos.lq_flags); + set_bit(LQ_RESET, <d->ltd_qos.lq_flags); + ltd->ltd_is_mdt = is_mdt; + + /* MDT imbalance threshold is low to balance across MDTs + * relatively quickly, because each directory may result + * in a large number of files/subdirs created therein. 
+ */ + if (is_mdt) { + ltd->ltd_lmv_desc.ld_pattern = LMV_HASH_TYPE_DEFAULT; + ltd->ltd_qos.lq_prio_free = LMV_QOS_DEF_PRIO_FREE * 256 / 100; + ltd->ltd_qos.lq_threshold_rr = + LMV_QOS_DEF_THRESHOLD_RR_PCT * 256 / 100; + } else { + ltd->ltd_qos.lq_prio_free = LOV_QOS_DEF_PRIO_FREE * 256 / 100; + ltd->ltd_qos.lq_threshold_rr = + LOV_QOS_DEF_THRESHOLD_RR_PCT * 256 / 100; + } + + return 0; +} +EXPORT_SYMBOL(lu_tgt_descs_init); + +/** + * Free bitmap and target table pages. + * + * \param[in] ltd target table + */ +void lu_tgt_descs_fini(struct lu_tgt_descs *ltd) +{ + int i; + + CFS_FREE_BITMAP(ltd->ltd_tgt_bitmap); + for (i = 0; i < TGT_PTRS; i++) { + if (ltd->ltd_tgt_idx[i]) + OBD_FREE_PTR(ltd->ltd_tgt_idx[i]); + } + ltd->ltd_tgts_size = 0; +} +EXPORT_SYMBOL(lu_tgt_descs_fini); + +/** + * Expand size of target table. + * + * When the target table is full, we have to extend the table. To do so, + * we allocate new memory with some reserve, move data from the old table + * to the new one and release memory consumed by the old table. + * + * \param[in] ltd target table + * \param[in] newsize new size of the table + * + * \retval 0 on success + * \retval -ENOMEM if reallocation failed + */ +static int lu_tgt_descs_resize(struct lu_tgt_descs *ltd, __u32 newsize) +{ + struct cfs_bitmap *new_bitmap, *old_bitmap = NULL; + + /* someone else has already resize the array */ + if (newsize <= ltd->ltd_tgts_size) + return 0; + + new_bitmap = CFS_ALLOCATE_BITMAP(newsize); + if (!new_bitmap) + return -ENOMEM; + + if (ltd->ltd_tgts_size > 0) { + /* the bitmap already exists, copy data from old one */ + cfs_bitmap_copy(new_bitmap, ltd->ltd_tgt_bitmap); + old_bitmap = ltd->ltd_tgt_bitmap; + } + + ltd->ltd_tgts_size = newsize; + ltd->ltd_tgt_bitmap = new_bitmap; + + if (old_bitmap) + CFS_FREE_BITMAP(old_bitmap); + + CDEBUG(D_CONFIG, "tgt size: %d\n", ltd->ltd_tgts_size); + + return 0; +} + +/** + * Add new target to target table. + * + * Extend target table if it's full, update target table and bitmap. + * Notice we need to take ltd_rw_sem exclusively before entry to ensure + * atomic switch. 
+ * + * \param[in] ltd target table + * \param[in] tgt new target desc + * + * \retval 0 on success + * \retval -ENOMEM if reallocation failed + * -EEXIST if target existed + */ +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + __u32 index = tgt->ltd_index; + int rc; + + ENTRY; + + if (index >= ltd->ltd_tgts_size) { + __u32 newsize = 1; + + while (newsize < index + 1) + newsize = newsize << 1; + + rc = lu_tgt_descs_resize(ltd, newsize); + if (rc) + RETURN(rc); + } else if (cfs_bitmap_check(ltd->ltd_tgt_bitmap, index)) { + RETURN(-EEXIST); + } + + if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) { + OBD_ALLOC_PTR(ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK]); + if (ltd->ltd_tgt_idx[index / TGT_PTRS_PER_BLOCK] == NULL) + RETURN(-ENOMEM); + } + + LTD_TGT(ltd, tgt->ltd_index) = tgt; + cfs_bitmap_set(ltd->ltd_tgt_bitmap, tgt->ltd_index); + + ltd->ltd_lov_desc.ld_tgt_count++; + if (tgt->ltd_active) + ltd->ltd_lov_desc.ld_active_tgt_count++; + + RETURN(0); +} +EXPORT_SYMBOL(ltd_add_tgt); + +/** + * Delete target from target table + */ +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + lu_qos_del_tgt(<d->ltd_qos, tgt); + LTD_TGT(ltd, tgt->ltd_index) = NULL; + cfs_bitmap_clear(ltd->ltd_tgt_bitmap, tgt->ltd_index); + ltd->ltd_lov_desc.ld_tgt_count--; + if (tgt->ltd_active) + ltd->ltd_lov_desc.ld_active_tgt_count--; +} +EXPORT_SYMBOL(ltd_del_tgt); + +/** + * Calculate penalties per-tgt and per-server + * + * Re-calculate penalties when the configuration changes, active targets + * change and after statfs refresh (all these are reflected by lq_dirty flag). + * On every tgt and server: decay the penalty by half for every 8x the update + * interval that the device has been idle. That gives lots of time for the + * statfs information to be updated (which the penalty is only a proxy for), + * and avoids penalizing server/tgt under light load. + * See lu_qos_tgt_weight_calc() for how penalties are factored into the weight. + * + * \param[in] ltd lu_tgt_descs + * + * \retval 0 on success + * \retval -EAGAIN the number of tgt isn't enough or all tgt spaces are + * almost the same + */ +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd) +{ + struct lu_qos *qos = <d->ltd_qos; + struct lov_desc *desc = <d->ltd_lov_desc; + struct lu_tgt_desc *tgt; + struct lu_svr_qos *svr; + __u64 ba_max, ba_min, ba; + __u64 ia_max, ia_min, ia = 1; + __u32 num_active; + int prio_wide; + time64_t now, age; + int rc; + + ENTRY; + + if (!test_bit(LQ_DIRTY, &qos->lq_flags)) + GOTO(out, rc = 0); + + num_active = desc->ld_active_tgt_count - 1; + if (num_active < 1) + GOTO(out, rc = -EAGAIN); + + /* find bavail on each server */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + svr->lsq_bavail = 0; + /* if inode is not counted, set to 1 to ignore */ + svr->lsq_iavail = ltd->ltd_is_mdt ? 0 : 1; + } + qos->lq_active_svr_count = 0; + + /* + * How badly user wants to select targets "widely" (not recently chosen + * and not on recent MDS's). As opposed to "freely" (free space avail.) 
+ * 0-256 + */ + prio_wide = 256 - qos->lq_prio_free; + + ba_min = (__u64)(-1); + ba_max = 0; + ia_min = (__u64)(-1); + ia_max = 0; + now = ktime_get_real_seconds(); + + /* Calculate server penalty per object */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + /* when inode is counted, bavail >> 16 to avoid overflow */ + ba = tgt_statfs_bavail(tgt); + if (ltd->ltd_is_mdt) + ba >>= 16; + else + ba >>= 8; + if (!ba) + continue; + + ba_min = min(ba, ba_min); + ba_max = max(ba, ba_max); + + /* Count the number of usable servers */ + if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0) + qos->lq_active_svr_count++; + tgt->ltd_qos.ltq_svr->lsq_bavail += ba; + + if (ltd->ltd_is_mdt) { + /* iavail >> 8 to avoid overflow */ + ia = tgt_statfs_iavail(tgt) >> 8; + if (!ia) + continue; + + ia_min = min(ia, ia_min); + ia_max = max(ia, ia_max); + + tgt->ltd_qos.ltq_svr->lsq_iavail += ia; + } + + /* + * per-tgt penalty is + * prio * bavail * iavail / (num_tgt - 1) / 2 + */ + tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active); + tgt->ltd_qos.ltq_penalty_per_obj >>= 1; + + age = (now - tgt->ltd_qos.ltq_used) >> 3; + if (test_bit(LQ_RESET, &qos->lq_flags) || + age > 32 * desc->ld_qos_maxage) + tgt->ltd_qos.ltq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay tgt penalty. */ + tgt->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage; + } + + num_active = qos->lq_active_svr_count - 1; + if (num_active < 1) { + /* + * If there's only 1 server, we can't penalize it, so instead + * we have to double the tgt penalty + */ + num_active = 1; + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + tgt->ltd_qos.ltq_penalty_per_obj <<= 1; + } + } + + /* + * Per-server penalty is + * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2 + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + ba = svr->lsq_bavail; + ia = svr->lsq_iavail; + svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(svr->lsq_penalty_per_obj, + svr->lsq_tgt_count * num_active); + svr->lsq_penalty_per_obj >>= 1; + + age = (now - svr->lsq_used) >> 3; + if (test_bit(LQ_RESET, &qos->lq_flags) || + age > 32 * desc->ld_qos_maxage) + svr->lsq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay server penalty. */ + svr->lsq_penalty >>= age / desc->ld_qos_maxage; + } + + clear_bit(LQ_DIRTY, &qos->lq_flags); + clear_bit(LQ_RESET, &qos->lq_flags); + + /* + * If each tgt has almost same free space, do rr allocation for better + * creation performance + */ + clear_bit(LQ_SAME_SPACE, &qos->lq_flags); + if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min && + (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) { + set_bit(LQ_SAME_SPACE, &qos->lq_flags); + /* Reset weights for the next time we enter qos mode */ + set_bit(LQ_RESET, &qos->lq_flags); + } + rc = 0; + +out: + if (!rc && test_bit(LQ_SAME_SPACE, &qos->lq_flags)) + RETURN(-EAGAIN); + + RETURN(rc); +} +EXPORT_SYMBOL(ltd_qos_penalties_calc); + +/** + * Re-calculate penalties and weights of all tgts. + * + * The function is called when some target was used for a new object. In + * this case we should re-calculate all the weights to keep new allocations + * balanced well. 
+ * + * \param[in] ltd lu_tgt_descs + * \param[in] tgt recently used tgt + * \param[out] total_wt new total weight for the pool + * + * \retval 0 + */ +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt) +{ + struct lu_qos *qos = <d->ltd_qos; + struct lu_tgt_qos *ltq; + struct lu_svr_qos *svr; + + ENTRY; + + ltq = &tgt->ltd_qos; + LASSERT(ltq); + + /* Don't allocate on this device anymore, until the next alloc_qos */ + ltq->ltq_usable = 0; + + svr = ltq->ltq_svr; + + /* + * Decay old penalty by half (we're adding max penalty, and don't + * want it to run away.) + */ + ltq->ltq_penalty >>= 1; + svr->lsq_penalty >>= 1; + + /* mark the server and tgt as recently used */ + ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); + + /* Set max penalties for this tgt and server */ + ltq->ltq_penalty += ltq->ltq_penalty_per_obj * + ltd->ltd_lov_desc.ld_active_tgt_count; + svr->lsq_penalty += svr->lsq_penalty_per_obj * + qos->lq_active_svr_count; + + /* Decrease all MDS penalties */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_penalty < svr->lsq_penalty_per_obj) + svr->lsq_penalty = 0; + else + svr->lsq_penalty -= svr->lsq_penalty_per_obj; + } + + *total_wt = 0; + /* Decrease all tgt penalties */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + ltq = &tgt->ltd_qos; + if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) + ltq->ltq_penalty = 0; + else + ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; + + lu_tgt_qos_weight_calc(tgt); + + /* Recalc the total weight of usable osts */ + if (ltq->ltq_usable) + *total_wt += ltq->ltq_weight; + + CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n", + tgt->ltd_index, ltq->ltq_usable, + tgt_statfs_bavail(tgt) >> 16, + tgt_statfs_iavail(tgt) >> 8, + ltq->ltq_penalty_per_obj >> 10, + ltq->ltq_penalty >> 10, + ltq->ltq_svr->lsq_penalty_per_obj >> 10, + ltq->ltq_svr->lsq_penalty >> 10, + ltq->ltq_weight >> 10); + } + + RETURN(0); +} +EXPORT_SYMBOL(ltd_qos_update); diff --git a/drivers/staging/lustrefsx/lustre/osc/osc_request.c b/drivers/staging/lustrefsx/lustre/osc/osc_request.c index 80695d5805915..9a3c3fb092209 100644 --- a/drivers/staging/lustrefsx/lustre/osc/osc_request.c +++ b/drivers/staging/lustrefsx/lustre/osc/osc_request.c @@ -2735,6 +2735,22 @@ static int osc_statfs_async(struct obd_export *exp, int rc; ENTRY; + if (obd->obd_osfs_age >= max_age) { + CDEBUG(D_SUPER, + "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + + RETURN(0); + } + /* We could possibly pass max_age in the request (as an absolute * timestamp or a "seconds.usec ago") so the target can avoid doing * extra calls into the filesystem if that isn't necessary (e.g. 
diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c index 7db9465a3569f..0f9667e4e578b 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/layout.c @@ -480,7 +480,8 @@ static const struct req_msg_field *ldlm_intent_getattr_server[] = { &RMF_MDT_MD, &RMF_ACL, &RMF_CAPA1, - &RMF_FILE_SECCTX + &RMF_FILE_SECCTX, + &RMF_DEFAULT_MDT_MD }; static const struct req_msg_field *ldlm_intent_create_client[] = { @@ -1100,6 +1101,11 @@ struct req_msg_field RMF_MDT_MD = DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); EXPORT_SYMBOL(RMF_MDT_MD); +struct req_msg_field RMF_DEFAULT_MDT_MD = + DEFINE_MSGF("default_mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, + NULL); +EXPORT_SYMBOL(RMF_DEFAULT_MDT_MD); + struct req_msg_field RMF_REC_REINT = DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), lustre_swab_mdt_rec_reint, NULL); diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c index 81d7ba5812233..d688f34b933b7 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/pack_generic.c @@ -2181,7 +2181,10 @@ void lustre_swab_lmv_user_md(struct lmv_user_md *lum) __swab32s(&lum->lum_stripe_offset); __swab32s(&lum->lum_hash_type); __swab32s(&lum->lum_type); - CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); + /* lum_max_inherit and lum_max_inherit_rr do not need to be swabbed */ + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding1) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding2) == 0); + BUILD_BUG_ON(offsetof(typeof(*lum), lum_padding3) == 0); switch (lum->lum_magic) { case LMV_USER_MAGIC_SPECIFIC: count = lum->lum_stripe_count; diff --git a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c index 78e5b21335b69..b4e5d7430d949 100644 --- a/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c +++ b/drivers/staging/lustrefsx/lustre/ptlrpc/wiretest.c @@ -1351,6 +1351,8 @@ void lustre_assert_wire_constants(void) OBD_CONNECT2_FIDMAP); LASSERTF(OBD_CONNECT2_GETATTR_PFID== 0x20000ULL, "found 0x%.16llxULL\n", OBD_CONNECT2_GETATTR_PFID); + LASSERTF(OBD_CONNECT2_MDLL_BYPASS == OBD_CONNECT2_MDLL_BYPASS, "found 0x%.16llxULL\n", + OBD_CONNECT2_MDLL_BYPASS); LASSERTF(OBD_CONNECT2_MDLL == 0x1000000000000000ULL, "found 0x%.16llxULL\n", OBD_CONNECT2_MDLL); LASSERTF(OBD_CONNECT2_MDLL_AUTO_REFRESH == 0x2000000000000000ULL, "found 0x%.16llxULL\n", @@ -1861,10 +1863,9 @@ void lustre_assert_wire_constants(void) CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0); CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0); CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); - CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); - CLASSERT(LMV_HASH_FLAG_DEAD == 0x40000000); - CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000); + CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); + CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); /* Checks for struct obd_statfs */ LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", diff --git a/drivers/staging/lustrefsx/undef.h b/drivers/staging/lustrefsx/undef.h index 003391836fc68..366a6d168b2c4 100644 --- a/drivers/staging/lustrefsx/undef.h +++ b/drivers/staging/lustrefsx/undef.h @@ -503,6 +503,9 @@ /* if iov_iter_type exists */ #undef HAVE_IOV_ITER_TYPE +/* is_root_inode defined */ +#undef HAVE_IS_ROOT_INODE + /* is_sxid is defined */ #undef HAVE_IS_SXID From 
2584ecf1154a32ce9c05802b012f65d20ee2403d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Nov 2022 16:50:00 +0000 Subject: [PATCH 525/737] mm/damon/dbgfs: check if rm_contexts input is for a real context commit 1de09a7281edecfdba19b3a07417f6d65243ab5f upstream. A user could write a name of a file under 'damon/' debugfs directory, which is not a user-created context, to 'rm_contexts' file. In the case, 'dbgfs_rm_context()' just assumes it's the valid DAMON context directory only if a file of the name exist. As a result, invalid memory access could happen as below. Fix the bug by checking if the given input is for a directory. This check can filter out non-context inputs because directories under 'damon/' debugfs directory can be created via only 'mk_contexts' file. This bug has found by syzbot[1]. [1] https://lore.kernel.org/damon/000000000000ede3ac05ec4abf8e@google.com/ Link: https://lkml.kernel.org/r/20221107165001.5717-2-sj@kernel.org Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: SeongJae Park Reported-by: syzbot+6087eafb76a94c4ac9eb@syzkaller.appspotmail.com Cc: [5.15.x] Signed-off-by: Andrew Morton (cherry picked from commit 1de09a7281edecfdba19b3a07417f6d65243ab5f) --- mm/damon/dbgfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 4e51466c4e74d..dafe7e71329b8 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -882,6 +882,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file, static int dbgfs_rm_context(char *name) { struct dentry *root, *dir, **new_dirs; + struct inode *inode; struct damon_ctx **new_ctxs; int i, j; int ret = 0; @@ -897,6 +898,12 @@ static int dbgfs_rm_context(char *name) if (!dir) return -ENOENT; + inode = d_inode(dir); + if (!S_ISDIR(inode->i_mode)) { + ret = -EINVAL; + goto out_dput; + } + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), GFP_KERNEL); if (!new_dirs) { From a0f31efde6ef6ae0f4102f62dc28fa217f97536d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 14 Nov 2022 17:55:52 +0000 Subject: [PATCH 526/737] mm/damon/sysfs-schemes: skip stats update if the scheme directory is removed commit 8468b486612c808c9e337708d66a435498f1735c upstream. A DAMON sysfs interface user can start DAMON with a scheme, remove the sysfs directory for the scheme, and then ask update of the scheme's stats. Because the schemes stats update logic isn't aware of the situation, it results in an invalid memory access. Fix the bug by checking if the scheme sysfs directory exists. 
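Condensed from the hunk below (an illustrative sketch, not a replacement for the diff): the fix simply bounds the scheme index by the number of scheme sysfs directories that still exist before dereferencing the sysfs objects, so the stats walk stops as soon as the user-visible directories run out:

	schemes_idx = 0;
	damon_for_each_scheme(scheme, ctx) {
		/* user could have removed the scheme sysfs dir */
		if (schemes_idx >= sysfs_schemes->nr)
			break;
		sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
		/* ... copy scheme->stat counters into sysfs_stats ... */
	}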
Link: https://lkml.kernel.org/r/20221114175552.1951-1-sj@kernel.org Fixes: 0ac32b8affb5 ("mm/damon/sysfs: support DAMOS stats") Signed-off-by: SeongJae Park Cc: [v5.18] Signed-off-by: Andrew Morton (cherry picked from commit 8468b486612c808c9e337708d66a435498f1735c) --- mm/damon/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index bdef9682d0a00..b4b9614eecbed 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2342,6 +2342,10 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_stats *sysfs_stats; + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; sysfs_stats->nr_tried = scheme->stat.nr_tried; sysfs_stats->sz_tried = scheme->stat.sz_tried; From 8c29e2762df4b1caa6b789c8540fbe44a044a4f1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 28 Nov 2022 13:35:58 +0000 Subject: [PATCH 527/737] arm64: errata: Fix KVM Spectre-v2 mitigation selection for Cortex-A57/A72 Both the Spectre-v2 and Spectre-BHB mitigations involve running a sequence immediately after exiting a guest, before any branches. In the stable kernels these sequences are built by copying templates into an empty vector slot. For Spectre-BHB, Cortex-A57 and A72 require the branchy loop with k=8. If Spectre-v2 needs mitigating at the same time, a firmware call to EL3 is needed. The work EL3 does at this point is also enough to mitigate Spectre-BHB. When enabling the Spectre-BHB mitigation, spectre_bhb_enable_mitigation() should check if a slot has already been allocated for Spectre-v2, meaning no work is needed for Spectre-BHB. This check was missed in the earlier backport, add it. Fixes: e192c8baa69a ("arm64: Mitigate spectre style branch history side channels") Signed-off-by: James Morse --- arch/arm64/kernel/proton-pack.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 013e430acb1fd..93fc7f55ae839 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1083,7 +1083,13 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { switch (spectre_bhb_loop_affected(SCOPE_SYSTEM)) { case 8: - kvm_setup_bhb_slot(__spectre_bhb_loop_k8); + /* + * A57/A72-r0 will already have selected the + * spectre-indirect vector, which is sufficient + * for BHB too. + */ + if (!__this_cpu_read(bp_hardening_data.fn)) + kvm_setup_bhb_slot(__spectre_bhb_loop_k8); break; case 24: kvm_setup_bhb_slot(__spectre_bhb_loop_k24); From c2e9fd6e11a1dfbad3b4eeffccf68ad4a61d219a Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 8 Sep 2022 19:14:43 +0000 Subject: [PATCH 528/737] mm/damon: introduce struct damos_access_pattern commit f5a79d7c0c87c8d88bb5e3f3c898258fdf1b3b05 upstream. damon_new_scheme() has too many parameters, so introduce struct damos_access_pattern to simplify it. In additon, we can't use a bpf trace kprobe that has more than 5 parameters. 
Link: https://lkml.kernel.org/r/20220908191443.129534-1-sj@kernel.org Signed-off-by: Yajun Deng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit f5a79d7c0c87c8d88bb5e3f3c898258fdf1b3b05) --- include/linux/damon.h | 37 ++++++++++++++++++---------------- mm/damon/core.c | 31 ++++++++++++++--------------- mm/damon/dbgfs.c | 27 +++++++++++++++---------- mm/damon/lru_sort.c | 46 ++++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 23 +++++++++++++--------- mm/damon/sysfs.c | 17 +++++++++++----- 6 files changed, 106 insertions(+), 75 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7b1f4a4882308..98e622c34d44f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -216,13 +216,26 @@ struct damos_stat { }; /** - * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -230,10 +243,8 @@ struct damos_stat { * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the - * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, - * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to - * those. To avoid consuming too much CPU time or IO resources for the - * &action, "a is used. + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, "a is used. * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the @@ -248,12 +259,7 @@ struct damos_stat { * &action is applied. 
*/ struct damos { - unsigned long min_sz_region; - unsigned long max_sz_region; - unsigned int min_nr_accesses; - unsigned int max_nr_accesses; - unsigned int min_age_region; - unsigned int max_age_region; + struct damos_access_pattern pattern; enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; @@ -501,12 +507,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks); +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7d25dc582fe34..7d5a9ae6f4ac9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -230,24 +230,21 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks) +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->min_sz_region = min_sz_region; - scheme->max_sz_region = max_sz_region; - scheme->min_nr_accesses = min_nr_accesses; - scheme->max_nr_accesses = max_nr_accesses; - scheme->min_age_region = min_age_region; - scheme->max_age_region = max_age_region; + scheme->pattern.min_sz_region = pattern->min_sz_region; + scheme->pattern.max_sz_region = pattern->max_sz_region; + scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; + scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; + scheme->pattern.min_age_region = pattern->min_age_region; + scheme->pattern.max_age_region = pattern->max_age_region; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -667,10 +664,12 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) unsigned long sz; sz = r->ar.end - r->ar.start; - return s->min_sz_region <= sz && sz <= s->max_sz_region && - s->min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->max_nr_accesses && - s->min_age_region <= r->age && r->age <= s->max_age_region; + return s->pattern.min_sz_region <= sz && + sz <= s->pattern.max_sz_region && + s->pattern.min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_age_region <= r->age && + r->age <= s->pattern.max_age_region; } static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index dafe7e71329b8..61214cb9a5d3c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -131,9 +131,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) 
damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->min_sz_region, s->max_sz_region, - s->min_nr_accesses, s->max_nr_accesses, - s->min_age_region, s->max_age_region, + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, @@ -221,8 +224,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, struct damos *scheme, **schemes; const int max_nr_schemes = 256; int pos = 0, parsed, ret; - unsigned long min_sz, max_sz; - unsigned int min_nr_a, max_nr_a, min_age, max_age; unsigned int action_input; enum damos_action action; @@ -233,13 +234,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; struct damos_quota quota = {}; struct damos_watermarks wmarks; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action_input, "a.ms, + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, "a.weight_age, &wmarks.metric, @@ -251,7 +257,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if ((int)action < 0) goto fail; - if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + if (pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > pattern.max_age_region) goto fail; if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || @@ -259,8 +267,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, "a, &wmarks); + scheme = damon_new_scheme(&pattern, action, "a, &wmarks); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9de6f00a71c5d..0184ed4828b7e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -293,6 +293,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and accessed for more than the threshold */ + .min_nr_accesses = hot_thres, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -313,26 +324,31 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .weight_nr_accesses = 1, .weight_age = 0, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and accessed for more than the threshold */ - hot_thres, UINT_MAX, - /* no matter its age */ - 0, UINT_MAX, + + return 
damon_new_scheme( + &pattern, /* prioritize those on LRU lists, as soon as found */ DAMOS_LRU_PRIO, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = cold_thres, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -354,21 +370,15 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .weight_nr_accesses = 0, .weight_age = 1, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for cold_thres or more micro-seconds, and */ - cold_thres, UINT_MAX, + + return damon_new_scheme( + &pattern, /* mark those as not accessed, as soon as found */ DAMOS_LRU_DEPRIO, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a7faf51b4bd4a..5aeca0b9e88ec 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -264,6 +264,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) static struct damos *damon_reclaim_new_scheme(void) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / aggr_interval, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -284,21 +295,15 @@ static struct damos *damon_reclaim_new_scheme(void) .weight_nr_accesses = 0, .weight_age = 1 }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for min_age or more micro-seconds, and */ - min_age / aggr_interval, UINT_MAX, + + return damon_new_scheme( + &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ "a, /* (De)activate this according to the watermarks. 
*/ &wmarks); - - return scheme; } static int damon_reclaim_apply_parameters(void) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index b4b9614eecbed..ec88644c51df7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2259,11 +2259,20 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { - struct damon_sysfs_access_pattern *pattern = + struct damon_sysfs_access_pattern *access_pattern = sysfs_scheme->access_pattern; struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; struct damos_quota quota = { .ms = sysfs_quotas->ms, .sz = sysfs_quotas->sz, @@ -2280,10 +2289,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(pattern->sz->min, pattern->sz->max, - pattern->nr_accesses->min, pattern->nr_accesses->max, - pattern->age->min, pattern->age->max, - sysfs_scheme->action, "a, &wmarks); + return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + &wmarks); } static int damon_sysfs_set_schemes(struct damon_ctx *ctx, From ca8346cd8beeab10121d7e739f0e8d882bb1e0f7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 22 Nov 2022 19:48:31 +0000 Subject: [PATCH 529/737] mm/damon/sysfs: fix wrong empty schemes assumption under online tuning in damon_sysfs_set_schemes() commit 95bc35f9bee5220dad4e8567654ab3288a181639 upstream. Commit da87878010e5 ("mm/damon/sysfs: support online inputs update") made 'damon_sysfs_set_schemes()' to be called for running DAMON context, which could have schemes. In the case, DAMON sysfs interface is supposed to update, remove, or add schemes to reflect the sysfs files. However, the code is assuming the DAMON context wouldn't have schemes at all, and therefore creates and adds new schemes. As a result, the code doesn't work as intended for online schemes tuning and could have more than expected memory footprint. The schemes are all in the DAMON context, so it doesn't leak the memory, though. Remove the wrong asssumption (the DAMON context wouldn't have schemes) in 'damon_sysfs_set_schemes()' to fix the bug. 
Link: https://lkml.kernel.org/r/20221122194831.3472-1-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton (cherry picked from commit 95bc35f9bee5220dad4e8567654ab3288a181639) --- mm/damon/sysfs.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ec88644c51df7..1b782ca413965 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2293,12 +2293,54 @@ static struct damos *damon_sysfs_mk_scheme( &wmarks); } +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; + scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; +} + static int damon_sysfs_set_schemes(struct damon_ctx *ctx, struct damon_sysfs_schemes *sysfs_schemes) { - int i; + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } - for (i = 0; i < sysfs_schemes->nr; i++) { + for (; i < sysfs_schemes->nr; i++) { struct damos *scheme, *next; scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); From e96689e4a671e547fad54ca5a4a408a9fbe77ef4 Mon Sep 17 00:00:00 2001 From: Kenneth Lee Date: Mon, 8 Aug 2022 15:00:19 -0700 Subject: [PATCH 530/737] mm/damon/dbgfs: use kmalloc for allocating only one element Use kmalloc(...) rather than kmalloc_array(1, ...) because the number of elements we are specifying in this case is 1, kmalloc would accomplish the same thing and we can simplify. 
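Condensed from the hunk below (a sketch of the new flow, not the full diff): the setter now walks the schemes already attached to the running context, updates the first sysfs_schemes->nr of them in place, destroys any extras, and only afterwards allocates new schemes for sysfs entries with no kernel counterpart yet:

	i = 0;
	damon_for_each_scheme_safe(scheme, next, ctx) {
		if (i < sysfs_schemes->nr)
			damon_sysfs_update_scheme(scheme,
					sysfs_schemes->schemes_arr[i]);
		else
			damon_destroy_scheme(scheme);
		i++;
	}
	/* then create and add schemes for the remaining sysfs entries, if any */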
Link: https://lkml.kernel.org/r/20220808220019.1680469-1-klee33@uw.edu Signed-off-by: Kenneth Lee Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 61214cb9a5d3c..124577db70124 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1067,7 +1067,7 @@ static int __init __damon_dbgfs_init(void) fops[i]); dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); if (!dbgfs_dirs) { debugfs_remove(dbgfs_root); return -ENOMEM; From 8ba46804d1fd8fb5804052b3210d5aa28b11439d Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 13 Aug 2022 23:19:03 +0800 Subject: [PATCH 531/737] mm/damon/core: simplify the parameter passing for region split operation The parameter 'struct damon_ctx *ctx' is unnecessary in damon region split operation, so we can remove it. Link: https://lkml.kernel.org/r/1660403943-29124-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 6 +++--- mm/damon/core.c | 21 +++++++++------------ 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 573669566f846..45db79d28fdc3 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -126,7 +126,7 @@ static void damon_test_split_at(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); - damon_split_region_at(c, t, r, 25); + damon_split_region_at(t, r, 25); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); @@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(c, t, 2); + damon_split_regions_of(t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(c, t, 4); + damon_split_regions_of(t, 4); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7d5a9ae6f4ac9..57450a0160abf 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -655,9 +655,8 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } -static void damon_split_region_at(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - unsigned long sz_r); +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r); static bool __damos_valid_target(struct damon_region *r, struct damos *s) { @@ -725,7 +724,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; sz = DAMON_MIN_REGION; } - damon_split_region_at(c, t, r, sz); + damon_split_region_at(t, r, sz); r = damon_next_region(r); sz = r->ar.end - r->ar.start; } @@ -744,7 +743,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, DAMON_MIN_REGION); if (!sz) goto update_stat; - damon_split_region_at(c, t, r, sz); + damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); sz_applied = c->ops.apply_scheme(c, t, r, s); @@ -927,9 +926,8 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, * r the region to be split * sz_r size of the first sub-region that will be made */ -static void 
damon_split_region_at(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - unsigned long sz_r) +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r) { struct damon_region *new; @@ -946,8 +944,7 @@ static void damon_split_region_at(struct damon_ctx *ctx, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_ctx *ctx, - struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -968,7 +965,7 @@ static void damon_split_regions_of(struct damon_ctx *ctx, if (sz_sub == 0 || sz_sub >= sz_region) continue; - damon_split_region_at(ctx, t, r, sz_sub); + damon_split_region_at(t, r, sz_sub); sz_region = sz_sub; } } @@ -1003,7 +1000,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(ctx, t, nr_subregions); + damon_split_regions_of(t, nr_subregions); last_nr_regions = nr_regions; } From 9f9bb3e3452a06455f9c98056464132c6b249139 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 18 Aug 2022 15:37:44 +0800 Subject: [PATCH 532/737] mm/damon: replace pmd_huge() with pmd_trans_huge() for THP pmd_huge() is usually used to indicate a pmd level hugetlb. However a pmd mapped huge page can only be THP in damon_mkold_pmd_entry() or damon_young_pmd_entry(), so replace pmd_huge() with pmd_trans_huge() in this case to make the code more readable according to the discussion [1]. [1] https://lore.kernel.org/all/098c1480-416d-bca9-cedb-ca495df69b64@linux.alibaba.com/ Link: https://lkml.kernel.org/r/a9e010ca5d299e18d740c7c52290ecb6a014dde6.1660805030.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: SeongJae Park Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 1d16c6c796386..cc04d467ba23d 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -302,14 +302,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, pte_t *pte; spinlock_t *ptl; - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); if (!pmd_present(*pmd)) { spin_unlock(ptl); return 0; } - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); return 0; @@ -434,14 +434,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); if (!pmd_present(*pmd)) { spin_unlock(ptl); return 0; } - if (!pmd_huge(*pmd)) { + if (!pmd_trans_huge(*pmd)) { spin_unlock(ptl); goto regular_page; } From 6a22438d9eb5c1a41bdd6ffd615f0f8ead10b020 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 27 Aug 2022 17:02:50 +0800 Subject: [PATCH 533/737] mm/damon: simplify the parameter passing for 'check_accesses' Patch series "mm/damon: Simplify the damon regions access check", v2. This patchset simplifies the operations when checking the damon regions accesses. This patch (of 2): The parameter 'struct damon_ctx *ctx' isn't used in the functions __damon_{p,v}a_check_access(), so we can remove it and simplify the parameter passing. 
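In short, per the hunks below, this is a signature cleanup only; call sites drop the unused context argument:

	__damon_pa_check_access(r);	/* was: __damon_pa_check_access(ctx, r) */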
Link: https://lkml.kernel.org/r/1661590971-20893-1-git-send-email-kaixuxia@tencent.com Link: https://lkml.kernel.org/r/1661590971-20893-2-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 ++--- mm/damon/vaddr.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 46565f67dd3f9..8dbac307ad5d4 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -170,8 +170,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) return result.accessed; } -static void __damon_pa_check_access(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; @@ -200,7 +199,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(ctx, r); + __damon_pa_check_access(r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index cc04d467ba23d..34a72f5e13f5f 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -542,8 +542,8 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void __damon_va_check_access(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_check_access(struct mm_struct *mm, + struct damon_region *r) { static struct mm_struct *last_mm; static unsigned long last_addr; @@ -578,7 +578,7 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - __damon_va_check_access(ctx, mm, r); + __damon_va_check_access(mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); From 0d95abf9412c8152cfb0aaa4590f658d43adf108 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 27 Aug 2022 17:02:51 +0800 Subject: [PATCH 534/737] mm/damon/vaddr: remove comparison between mm and last_mm when checking region accesses The damon regions that belong to the same damon target have the same 'struct mm_struct *mm', so it's unnecessary to compare the mm and last_mm objects among the damon regions in one damon target when checking accesses. But the check is necessary when the target changed in '__damon_va_check_accesses()', so we can simplify the whole operation by using the bool 'same_target' to indicate whether the target changed. 
Link: https://lkml.kernel.org/r/1661590971-20893-3-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 34a72f5e13f5f..a8505ad47c609 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -543,15 +543,14 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * r the region to be checked */ static void __damon_va_check_access(struct mm_struct *mm, - struct damon_region *r) + struct damon_region *r, bool same_target) { - static struct mm_struct *last_mm; static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == ALIGN_DOWN(r->sampling_addr, last_page_sz))) { if (last_accessed) r->nr_accesses++; @@ -562,7 +561,6 @@ static void __damon_va_check_access(struct mm_struct *mm, if (last_accessed) r->nr_accesses++; - last_mm = mm; last_addr = r->sampling_addr; } @@ -572,14 +570,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; + bool same_target; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; + same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(mm, r); + __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; } mmput(mm); } From a343f7c9598646385002a75cc9816c21f24d2329 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 29 Aug 2022 17:46:06 +0800 Subject: [PATCH 535/737] mm/damon: get the hotness from damon_hot_score() in damon_pageout_score() We can get the hotness value from damon_hot_score() directly in damon_pageout_score() function and improve the code readability. 
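As a toy illustration of the refactor described above, the pageout (cold) score can be expressed as the complement of the hot score instead of re-deriving the whole formula. The weighting below is a made-up stand-in, not the real DAMON scoring math.

#include <stdio.h>

#define MAX_SCORE 99    /* stand-in for DAMOS_MAX_SCORE */

static int hot_score(int nr_accesses, int age)
{
    /* toy weighting; the real formula mixes frequency and age subscores */
    int hotness = nr_accesses * 10 - age;

    if (hotness < 0)
        hotness = 0;
    if (hotness > MAX_SCORE)
        hotness = MAX_SCORE;
    return hotness;
}

static int pageout_score(int nr_accesses, int age)
{
    /* coldness is simply the complement of hotness */
    return MAX_SCORE - hot_score(nr_accesses, age);
}

int main(void)
{
    printf("hot=%d cold=%d\n", hot_score(5, 3), pageout_score(5, 3));
    return 0;
}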
Link: https://lkml.kernel.org/r/1661766366-20998-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 46 ++++++------------------------------------- 1 file changed, 6 insertions(+), 40 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index b1335de200e77..f599838b5f648 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -88,7 +88,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #define DAMON_MAX_SUBSCORE (100) #define DAMON_MAX_AGE_IN_LOG (32) -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { unsigned int max_nr_accesses; @@ -127,48 +127,14 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, */ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - /* Return coldness of the region */ - return DAMOS_MAX_SCORE - hotness; + return hotness; } -int damon_hot_score(struct damon_ctx *c, struct damon_region *r, +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { - unsigned int max_nr_accesses; - int freq_subscore; - unsigned int age_in_sec; - int age_in_log, age_subscore; - unsigned int freq_weight = s->quota.weight_nr_accesses; - unsigned int age_weight = s->quota.weight_age; - int hotness; - - max_nr_accesses = c->aggr_interval / c->sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + int hotness = damon_hot_score(c, r, s); - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; - - /* If frequency is 0, higher age means it's colder */ - if (freq_subscore == 0) - age_in_log *= -1; - - /* - * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. - * Scale it to be in [0, 100] and set it as age subscore. - */ - age_in_log += DAMON_MAX_AGE_IN_LOG; - age_subscore = age_in_log * DAMON_MAX_SUBSCORE / - DAMON_MAX_AGE_IN_LOG / 2; - - hotness = (freq_weight * freq_subscore + age_weight * age_subscore); - if (freq_weight + age_weight) - hotness /= freq_weight + age_weight; - /* - * Transform it to fit in [0, DAMOS_MAX_SCORE] - */ - hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - - return hotness; + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; } From 7867aca53771b0117f8cd8fcd93ae97203717dc3 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 4 Sep 2022 22:36:06 +0800 Subject: [PATCH 536/737] mm/damon/sysfs: simplify the judgement whether kdamonds are busy It is unnecessary to get the number of the running kdamond to judge whether kdamonds are busy. Here we can use the damon_sysfs_kdamond_running() helper and return -EBUSY directly when finding a running kdamond. Meanwhile, merging with the judgement that a kdamond has current sysfs command callback request to make the code more clear. 
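A minimal sketch of the shape this refactor gives the check: return on the first busy entry instead of counting them all and comparing against zero. The types and names here are illustrative only, not the actual sysfs code.

#include <stdbool.h>
#include <stdio.h>

struct kdamond { bool running; };

static bool kdamonds_busy(const struct kdamond *kds, int nr)
{
    for (int i = 0; i < nr; i++) {
        /* one running kdamond is enough; no need to count the rest */
        if (kds[i].running)
            return true;
    }
    return false;
}

int main(void)
{
    struct kdamond kds[] = { { false }, { true }, { false } };

    printf("busy=%d\n", kdamonds_busy(kds, 3));
    return 0;
}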
Link: https://lkml.kernel.org/r/1662302166-13216-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1b782ca413965..7213c02c3d258 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2710,23 +2710,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) kdamonds->kdamonds_arr = NULL; } -static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, int nr_kdamonds) { - int nr_running_ctxs = 0; int i; for (i = 0; i < nr_kdamonds; i++) { - struct damon_ctx *ctx = kdamonds[i]->damon_ctx; - - if (!ctx) - continue; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - nr_running_ctxs++; - mutex_unlock(&ctx->kdamond_lock); + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; } - return nr_running_ctxs; + + return false; } static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, @@ -2735,15 +2730,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; int err, i; - if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; - for (i = 0; i < kdamonds->nr; i++) { - if (damon_sysfs_cmd_request.kdamond == - kdamonds->kdamonds_arr[i]) - return -EBUSY; - } - damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; From e9dca61ce7635f6987803b19469a90a8709f30a4 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 6 Sep 2022 23:18:47 +0800 Subject: [PATCH 537/737] mm/damon/core: iterate the regions list from current point in damon_set_regions() We iterate the whole regions list every time to get the first/last regions intersecting with the specific range in damon_set_regions(), in order to add new region or resize existing regions to fit in the specific range. Actually, it is unnecessary to iterate the new added regions and the front regions that have been checked. Just iterate the regions list from the current point using list_for_each_entry_from() every time to improve performance. 
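The idea is simply to keep the scan cursor across iterations instead of restarting from the list head for every range, which works because both the regions and the ranges are sorted by address. A user-space analogue with arrays (illustrative only; the kernel code uses list_for_each_entry_from() on the regions list):

#include <stdio.h>

struct region { unsigned long start, end; };

int main(void)
{
    struct region regions[] = { { 0, 10 }, { 20, 30 }, { 40, 50 } };
    unsigned long range_starts[] = { 5, 25 };
    int cursor = 0;    /* kept across ranges, like damon_for_each_region_from() */

    for (int i = 0; i < 2; i++) {
        for (; cursor < 3; cursor++) {
            if (regions[cursor].end > range_starts[i]) {
                printf("range %d starts in region [%lu, %lu)\n",
                       i, regions[cursor].start, regions[cursor].end);
                break;
            }
        }
    }
    return 0;
}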
The kunit tests passed: [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 Link: https://lkml.kernel.org/r/1662477527-13003-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ mm/damon/core.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 98e622c34d44f..90f20675da22a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -469,9 +469,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) return list_last_entry(&t->regions_list, struct damon_region, list); } +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + #define damon_for_each_region_safe(r, next, t) \ list_for_each_entry_safe(r, next, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 57450a0160abf..bae41990f4227 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -195,6 +195,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; @@ -202,7 +203,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, range = &ranges[i]; /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { + damon_for_each_region_from(r, t) { if (damon_intersect(r, range)) { if (!first) first = r; From 1495a71074de696cf1561cb4d4bc684502e13c39 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Wed, 7 Sep 2022 16:41:16 +0800 Subject: [PATCH 538/737] mm/damon: simplify damon_ctx check in damon_sysfs_before_terminate In damon_sysfs_before_terminate(), it needs to check whether ctx->ops.id supports 'DAMON_OPS_VADDR' or 'DAMON_OPS_FVADDR', there we can use damon_target_has_pid() instead. Link: https://lkml.kernel.org/r/20220907084116.62053-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7213c02c3d258..02a6cdc17ef89 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2358,7 +2358,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); From 3ccd35a15f6be2edcefb450d9287789503f88133 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 8 Sep 2022 11:13:17 +0800 Subject: [PATCH 539/737] mm/damon/vaddr: add a comment for 'default' case in damon_va_apply_scheme() The switch case 'DAMOS_STAT' and switch case 'default' have same return value in damon_va_apply_scheme(), and the 'default' case is for DAMOS actions that not supported by 'vaddr'. It might make sense to add a comment here. 
[akpm@linux-foundation.org: fx comment grammar] Link: https://lkml.kernel.org/r/1662606797-23534-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a8505ad47c609..03e33d8b1991d 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -657,6 +657,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_STAT: return 0; default: + /* + * DAMOS actions that are not yet supported by 'vaddr'. + */ return 0; } From 0f2ddef7cac580db89277463fc039fe1393a1d56 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:55 +0000 Subject: [PATCH 540/737] selftest/damon: add a test for duplicate context dirs creation Patch series "mm/damon: minor fixes and cleanups". This patchset contains minor fixes and cleanups for DAMON including - selftest for a bug we found before (Patch 1), - fix of region holes in vaddr corner case and a kunit test for it (Patches 2 and 3), and - documents/Kconfig updates for title wordsmithing (Patch 4) and more aggressive DAMON debugfs interface deprecation announcement (Patches 5-7). This patch (of 7): Commit d26f60703606 ("mm/damon/dbgfs: avoid duplicate context directory creation") fixes a bug which could result in memory leak and DAMON disablement. This commit adds a selftest for verifying the fix and avoid regression. Link: https://lkml.kernel.org/r/20220909202901.57977-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../debugfs_duplicate_context_creation.sh | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 0470c5f3e6906..a1fa2eff8192f 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,6 +6,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 0000000000000..4a76e37ef16b1 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! 
echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 From 19c3a0c9aa2b9a44b1f459fd83fd065aafe9cff0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:56 +0000 Subject: [PATCH 541/737] mm/damon/core: avoid holes in newly set monitoring target ranges When there are two or more non-contiguous regions intersecting with given new ranges, 'damon_set_regions()' does not fill the holes. This commit makes the function to fill the holes with newly created regions. [sj@kernel.org: handle error from 'damon_fill_regions_holes()'] Link: https://lkml.kernel.org/r/20220913215420.57761-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-3-sj@kernel.org Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: SeongJae Park Reported-by: Yun Levi Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index bae41990f4227..5ad31d2feae40 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -168,6 +168,30 @@ static bool damon_intersect(struct damon_region *r, return !(r->ar.end <= re->start || re->end <= r->ar.start); } +/* + * Fill holes in regions with new regions. + */ +static int damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, r, next, t); + } + } + return 0; +} + /* * damon_set_regions() - Set regions of a target for given address ranges. * @t: the given target. @@ -184,6 +208,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, { struct damon_region *r, *next; unsigned int i; + int err; /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { @@ -226,6 +251,11 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; } } return 0; From 20fc56037994c57f687cc9fdc987ac392dc2203b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:57 +0000 Subject: [PATCH 542/737] mm/damon/core-test: test damon_set_regions Preceding commit fixes a bug in 'damon_set_regions()', which allows holes in the new monitoring target ranges. This commit adds a kunit test case for the problem to avoid any regression. 
Link: https://lkml.kernel.org/r/20220909202901.57977-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 45db79d28fdc3..3db9b73687562 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); } +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); + damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), {}, }; From 1c3eb3c6ecda5ca21eed2bba3e0ae76d4cf48134 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:58 +0000 Subject: [PATCH 543/737] Docs/admin-guide/mm/damon: rename the title of the document The title of the DAMON document for admin-guide, 'Monitoring Data Accesses', could confuse readers in some ways. First of all, DAMON is not the only single way for data access monitoring. And the document is for not only the data access monitoring but also data access pattern based memory management optimizations (DAMOS). This commit updates the title to 'DAMON: Data Access MONitor', which more explicitly explains what the document describes. Link: https://lkml.kernel.org/r/20220909202901.57977-5-sj@kernel.org Fixes: c4ba6014aec3 ("Documentation: add documents for DAMON") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 53762770e0e44..b4d029f418a91 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -======================== -Monitoring Data Accesses -======================== +========================== +DAMON: Data Access MONitor +========================== :doc:`DAMON ` allows light-weight data access monitoring. 
Using DAMON, users can analyze the memory access patterns of their systems and From 3732981cbd32cef906557af776ce936b848bc888 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:59 +0000 Subject: [PATCH 544/737] mm/damon/Kconfig: notify debugfs deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note to the config menu of DAMON debugfs interface. Link: https://lkml.kernel.org/r/20220909202901.57977-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 66265e3a9c659..7821fcb3f2586 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -68,6 +68,9 @@ config DAMON_DBGFS If unsure, say N. + This will be removed after >5.15.y LTS kernel is released, so users + should move to the sysfs interface (DAMON_SYSFS). + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y From 143de8fee0fd779312636c365f3549244c2ce094 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:00 +0000 Subject: [PATCH 545/737] Docs/admin-guide/mm/damon/start: mention the dependency as sysfs instead of debugfs 'Getting Started' document of DAMON says DAMON user-space tool, damo[1], is using DAMON debugfs interface, and therefore it needs to ensure debugfs is mounted. However, the latest version of the tool is using DAMON sysfs interface. Moreover, DAMON debugfs interface is going to be deprecated as announced by commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface"). This commit therefore update the document to tell readers about DAMON sysfs interface dependency instead and never mention about debugfs interface, which will be deprecated. [1] https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20220909202901.57977-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/start.rst | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 4d5ca2c46288a..9f88afc734da4 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at https://github.com/awslabs/damo. The examples below assume that ``damo`` is on your ``$PATH``. It's not mandatory, though. -Because DAMO is using the debugfs interface (refer to :doc:`usage` for the -detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as -below:: - - # mount -t debugfs none /sys/kernel/debug/ - -or append the following line to your ``/etc/fstab`` file so that your system -can automatically mount debugfs upon booting:: - - debugfs /sys/kernel/debug debugfs defaults 0 0 +Because DAMO is using the sysfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure :doc:`sysfs ` is +mounted. 
Recording Data Access Patterns From 78112660bc666164565e33197ed45e838b78b23b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:01 +0000 Subject: [PATCH 546/737] Docs/admin-guide/mm/damon/usage: note DAMON debugfs interface deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note at the beginning of the DAMON debugfs interface usage document. Link: https://lkml.kernel.org/r/20220909202901.57977-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index bbee99f2f681b..6e0402f84a5e9 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -393,6 +393,11 @@ the files as above. Above is only for an example. debugfs Interface ================= +.. note:: + + DAMON debugfs interface will be removed after next LTS kernel is released, so + users should move to the :ref:`sysfs interface `. + DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and ``rm_contexts`` under its debugfs directory, ``/damon/``. From d1386b648fa7cd0f5c6d71dace1082fe9ee006ee Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 9 Sep 2022 21:36:06 +0000 Subject: [PATCH 547/737] mm/damon: remove duplicate get_monitoring_region() definitions In lru_sort.c and reclaim.c, they are all defining get_monitoring_region() function, there is no need to define it separately. As 'get_monitoring_region()' is not a 'static' function anymore, we try to use a prefix to distinguish with other functions, so there rename it to 'damon_find_biggest_system_ram'. Link: https://lkml.kernel.org/r/20220909213606.136221-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ mm/damon/lru_sort.c | 37 ++----------------------------------- mm/damon/reclaim.c | 37 ++----------------------------------- 4 files changed, 46 insertions(+), 70 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 90f20675da22a..016b6c9c03d62 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -549,6 +549,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); + #endif /* CONFIG_DAMON */ #endif /* _DAMON_H */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 5ad31d2feae40..2437c61b0bc0b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,4 +1245,44 @@ static int kdamond_fn(void *data) return 0; } +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). 
+ */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + #include "core-test.h" diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 0184ed4828b7e..8415e18fcf0ef 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -257,39 +257,6 @@ module_param(nr_cold_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_lru_sort_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_lru_sort_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_lru_sort_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -414,8 +381,8 @@ static int damon_lru_sort_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 5aeca0b9e88ec..fe7bc0c55ecb3 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -229,39 +229,6 @@ module_param(nr_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_reclaim_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_reclaim_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. 
- */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_reclaim_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - static struct damos *damon_reclaim_new_scheme(void) { struct damos_access_pattern pattern = { @@ -328,8 +295,8 @@ static int damon_reclaim_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; From 5cdf9cdac64605ea4dd4217e9062eec5211e90c7 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 8 Sep 2022 16:19:32 +0800 Subject: [PATCH 548/737] mm/damon/sysfs: change few functions execute order There's no need to run container_of() as early as we do. The compiler figures this out, but the resulting code is more readable. Link: https://lkml.kernel.org/r/20220908081932.77370-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 02a6cdc17ef89..a981d4dc7d782 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1031,8 +1031,7 @@ static ssize_t nr_schemes_show(struct kobject *kobj, static ssize_t nr_schemes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); + struct damon_sysfs_schemes *schemes; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1040,6 +1039,8 @@ static ssize_t nr_schemes_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_schemes_add_dirs(schemes, nr); @@ -1237,8 +1238,7 @@ static ssize_t nr_regions_show(struct kobject *kobj, static ssize_t nr_regions_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_regions *regions = container_of(kobj, - struct damon_sysfs_regions, kobj); + struct damon_sysfs_regions *regions; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1246,6 +1246,8 @@ static ssize_t nr_regions_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_regions_add_dirs(regions, nr); @@ -1440,8 +1442,7 @@ static ssize_t nr_targets_show(struct kobject *kobj, static ssize_t nr_targets_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_targets *targets = container_of(kobj, - struct damon_sysfs_targets, kobj); + struct damon_sysfs_targets *targets; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1449,6 +1450,8 @@ static ssize_t nr_targets_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_targets_add_dirs(targets, nr); @@ -1962,8 +1965,7 @@ static ssize_t nr_contexts_show(struct 
kobject *kobj, static ssize_t nr_contexts_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_contexts *contexts = container_of(kobj, - struct damon_sysfs_contexts, kobj); + struct damon_sysfs_contexts *contexts; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -1973,6 +1975,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj, if (nr < 0 || 1 < nr) return -EINVAL; + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_contexts_add_dirs(contexts, nr); @@ -2783,8 +2786,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj, static ssize_t nr_kdamonds_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, - struct damon_sysfs_kdamonds, kobj); + struct damon_sysfs_kdamonds *kdamonds; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -2793,6 +2795,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); From eeaff002b950db17a26e82dde4f7477b8ae2d4b7 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 12 Sep 2022 23:11:53 +0800 Subject: [PATCH 549/737] mm/damon/sysfs: use the wrapper directly to check if the kdamond is running We can use the 'damon_sysfs_kdamond_running()' wrapper directly to check if the kdamond is running in 'damon_sysfs_turn_damon_on()'. Link: https://lkml.kernel.org/r/1662995513-24489-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a981d4dc7d782..05dc217b85204 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2511,8 +2511,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) struct damon_ctx *ctx; int err; - if (kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx)) + if (damon_sysfs_kdamond_running(kdamond)) return -EBUSY; if (damon_sysfs_cmd_request.kdamond == kdamond) return -EBUSY; From e7e6a917b3e13716aeafaaeb9fcab068dad88961 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 12 Sep 2022 22:39:03 +0800 Subject: [PATCH 550/737] mm/damon: improve damon_new_region strategy Kdamond is implemented as a periodical split-merge pattern, which will create and destroy regions possibly at high frequency (hundreds or even thousands of per sec), depending on the number of regions and aggregation period. In that case, kmalloc and kfree could bring speed and space overheads, which can be improved by using a private kmem cache. 
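A rough user-space analogue of the dedicated-cache idea: keep freed objects on a free list and hand them back out instead of calling malloc()/free() for every region. This only illustrates the motivation; the actual patch uses the kernel slab allocator via KMEM_CACHE(), and the names below are made up.

#include <stdio.h>
#include <stdlib.h>

struct region {
    unsigned long start, end;
    struct region *next_free;    /* used only while the object sits on the free list */
};

static struct region *free_list;

static struct region *region_alloc(void)
{
    struct region *r = free_list;

    if (r) {
        free_list = r->next_free;
        return r;    /* reuse a previously freed object */
    }
    return malloc(sizeof(*r));
}

static void region_free(struct region *r)
{
    r->next_free = free_list;
    free_list = r;
}

int main(void)
{
    struct region *a = region_alloc();

    region_free(a);
    printf("reused=%d\n", region_alloc() == a);    /* prints 1 */
    return 0;
}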
[set_pte_at@outlook.com: creating kmem cache for damon regions by KMEM_CACHE()] Link: https://lkml.kernel.org/r/Message-ID: Link: https://lkml.kernel.org/r/TYCP286MB2323DA1894FA55BB9CF90978CA449@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Dawei Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2437c61b0bc0b..c9ec2de845b32 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -29,6 +29,8 @@ static bool running_exclusive_ctxs; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; +static struct kmem_cache *damon_region_cache __ro_after_init; + /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ static bool __damon_is_registered_ops(enum damon_ops_id id) { @@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; - region = kmalloc(sizeof(*region), GFP_KERNEL); + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; @@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t) static void damon_free_region(struct damon_region *r) { - kfree(r); + kmem_cache_free(damon_region_cache, r); } void damon_destroy_region(struct damon_region *r, struct damon_target *t) @@ -1285,4 +1287,17 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +static int __init damon_init(void) +{ + damon_region_cache = KMEM_CACHE(damon_region, 0); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + #include "core-test.h" From 760d1ff461e33a843f19f0da23125ca06e021fbf Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Sun, 11 Sep 2022 08:59:17 +0800 Subject: [PATCH 551/737] mm/damon: simplify scheme create in damon_lru_sort_apply_parameters In damon_lru_sort_apply_parameters(), we can use damon_set_schemes() to replace the way of creating the first 'scheme' in original code, this makes the code look cleaner. 
Link: https://lkml.kernel.org/r/20220911005917.835-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8415e18fcf0ef..307ba71adcfa9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,7 +350,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme, *next_scheme; + struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -360,17 +360,15 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* free previously set schemes */ - damon_for_each_scheme_safe(scheme, next_scheme, ctx) - damon_destroy_scheme(scheme); - /* aggr_interval / sample_interval is the maximum nr_accesses */ hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - damon_add_scheme(ctx, scheme); + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + return err; cold_thres = cold_min_age / aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); From bdc11e05ff1b4c10b03be31dd041a6523f09cfa8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:28 +0000 Subject: [PATCH 552/737] mm/damon/paddr: make supported DAMOS actions of paddr clear Patch series "mm/damon: cleanup code". DAMON code was not so clean from the beginning, but it has been too much nowadays, especially due to the duplicates in DAMON_RECLAIM and DAMON_LRU_SORT. This patchset cleans some of the mess. This patch (of 22): The 'switch-case' statement in 'damon_va_apply_scheme()' function provides a 'case' for every supported DAMOS action while all not-yet-supported DAMOS actions fall through the 'default' case, and comment it so that people can easily know which actions are supported. Its counterpart in 'paddr', 'damon_pa_apply_scheme()', however, doesn't. This commit makes the 'paddr' side function follows the pattern of 'vaddr' for better readability and consistency. Link: https://lkml.kernel.org/r/20220913174449.50645-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220913174449.50645-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 8dbac307ad5d4..f8f526fdcb4ee 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -279,7 +279,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_mark_accessed(r); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r); + case DAMOS_STAT: + break; default: + /* DAMOS actions that not yet supported by 'paddr'. */ break; } return 0; From 45b2c56f99c2bfae7a0f3d064cd41306a6bb3aa2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:29 +0000 Subject: [PATCH 553/737] mm/damon/paddr: deduplicate damon_pa_{mark_accessed,deactivate_pages}() The bodies of damon_pa_{mark_accessed,deactivate_pages}() contains duplicates. This commit factors out the common part to a separate function and removes the duplicates. 
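A small sketch of the deduplication pattern used here: two near-identical loops collapse into one helper that takes a boolean selector. The functions below are toy stand-ins for mark_page_accessed()/deactivate_page(), not the kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

static void touch(int page)      { printf("mark page %d accessed\n", page); }
static void deactivate(int page) { printf("deactivate page %d\n", page); }

static int walk_pages(int nr, bool mark_accessed)
{
    int applied = 0;

    for (int page = 0; page < nr; page++) {
        /* the only difference between the two former loops */
        if (mark_accessed)
            touch(page);
        else
            deactivate(page);
        applied++;
    }
    return applied;
}

int main(void)
{
    printf("applied=%d\n", walk_pages(2, true));
    printf("applied=%d\n", walk_pages(2, false));
    return 0;
}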
Link: https://lkml.kernel.org/r/20220913174449.50645-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index f8f526fdcb4ee..962cfba432630 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -236,7 +236,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, bool mark_accessed) { unsigned long addr, applied = 0; @@ -245,27 +246,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) if (!page) continue; - mark_page_accessed(page); + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); put_page(page); applied++; } return applied * PAGE_SIZE; } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r) { - unsigned long addr, applied = 0; - - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); + return damon_pa_mark_accessed_or_deactivate(r, true); +} - if (!page) - continue; - deactivate_page(page); - put_page(page); - applied++; - } - return applied * PAGE_SIZE; +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + return damon_pa_mark_accessed_or_deactivate(r, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, From c262a2e141de77e029b6ad07a0559c85c57b28ee Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:30 +0000 Subject: [PATCH 554/737] mm/damon/core: copy struct-to-struct instead of field-to-field in damon_new_scheme() The function for new 'struct damos' creation, 'damon_new_scheme()', copies each field of the struct one by one, though it could simply copied via struct to struct. This commit replaces the unnecessarily verbose field-to-field copies with struct-to-struct copies to make code simple and short. 
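The refactor relies on the fact that C struct assignment copies every member at once. A toy example with assumed field names, not the real damos types:

#include <stdio.h>

struct pattern { unsigned long min_sz, max_sz, min_accesses, max_accesses; };
struct scheme  { struct pattern pattern; int action; };

int main(void)
{
    struct pattern p = { 4096, 1 << 20, 0, 10 };
    struct scheme s;

    s.pattern = p;    /* one assignment instead of four field copies */
    s.action = 0;
    printf("%lu %lu\n", s.pattern.min_sz, s.pattern.max_accesses);
    return 0;
}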
Link: https://lkml.kernel.org/r/20220913174449.50645-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c9ec2de845b32..a564f83e9efe7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -272,22 +272,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->pattern.min_sz_region = pattern->min_sz_region; - scheme->pattern.max_sz_region = pattern->max_sz_region; - scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; - scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; - scheme->pattern.min_age_region = pattern->min_age_region; - scheme->pattern.max_age_region = pattern->max_age_region; + scheme->pattern = *pattern; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota.ms = quota->ms; - scheme->quota.sz = quota->sz; - scheme->quota.reset_interval = quota->reset_interval; - scheme->quota.weight_sz = quota->weight_sz; - scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; - scheme->quota.weight_age = quota->weight_age; + scheme->quota = *quota; + /* caller might not zero-initialized the private fileds */ scheme->quota.total_charged_sz = 0; scheme->quota.total_charged_ns = 0; scheme->quota.esz = 0; @@ -296,11 +287,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->quota.charge_target_from = NULL; scheme->quota.charge_addr_from = 0; - scheme->wmarks.metric = wmarks->metric; - scheme->wmarks.interval = wmarks->interval; - scheme->wmarks.high = wmarks->high; - scheme->wmarks.mid = wmarks->mid; - scheme->wmarks.low = wmarks->low; + scheme->wmarks = *wmarks; scheme->wmarks.activated = true; return scheme; From 0cbef87be7b480ffb99e4969e91c784ac367a934 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:31 +0000 Subject: [PATCH 555/737] mm/damon/core: factor out 'damos_quota' private fileds initialization The 'struct damos' creation function, 'damon_new_scheme()', does initialization of private fileds of 'struct damos_quota' in it. As its verbose and makes the function unnecessarily long, this commit factors it out to separate function. 
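The shape of the resulting code is "initialize the private fields, return the pointer, and let the caller copy-assign in one expression". A toy sketch with made-up field names:

#include <stdio.h>

struct quota { unsigned long sz; unsigned long charged_sz; };

static struct quota *quota_init_priv(struct quota *q)
{
    q->charged_sz = 0;    /* the caller may not have zero-initialized this */
    return q;
}

int main(void)
{
    struct quota user_quota = { .sz = 4096, .charged_sz = 123 };
    struct quota effective;

    effective = *(quota_init_priv(&user_quota));    /* init and copy in one expression */
    printf("%lu %lu\n", effective.sz, effective.charged_sz);
    return 0;
}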
Link: https://lkml.kernel.org/r/20220913174449.50645-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index a564f83e9efe7..6d9f4c2dee35c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,19 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks) @@ -277,15 +290,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *quota; - /* caller might not zero-initialized the private fileds */ - scheme->quota.total_charged_sz = 0; - scheme->quota.total_charged_ns = 0; - scheme->quota.esz = 0; - scheme->quota.charged_sz = 0; - scheme->quota.charged_from = 0; - scheme->quota.charge_target_from = NULL; - scheme->quota.charge_addr_from = 0; + scheme->quota = *(damos_quota_init_priv(quota)); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; From c82667790c2fe448039e26d7891588ca4fda6e36 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:32 +0000 Subject: [PATCH 556/737] mm/damon/core: use a dedicated struct for monitoring attributes DAMON monitoring attributes are directly defined as fields of 'struct damon_ctx'. This makes 'struct damon_ctx' a little long and complicated. This commit defines and uses a struct, 'struct damon_attrs', which is dedicated for only the monitoring attributes to make the purpose of the five values clearer and simplify 'struct damon_ctx'. Link: https://lkml.kernel.org/r/20220913174449.50645-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++++++++---------- mm/damon/core.c | 34 +++++++++++++++++----------------- mm/damon/dbgfs.c | 6 +++--- mm/damon/ops-common.c | 4 ++-- mm/damon/vaddr.c | 4 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 016b6c9c03d62..2ceee8b07726b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -389,13 +389,15 @@ struct damon_callback { }; /** - * struct damon_ctx - Represents a context for each monitoring. This is the - * main interface that allows users to set the attributes and get the results - * of the monitoring. + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. 
It aggregates and keeps the access information (number of accesses to @@ -405,7 +407,21 @@ struct damon_callback { * @ops_update_interval. All time intervals are in micro-seconds. * Please refer to &struct damon_operations and &struct damon_callback for more * detail. + */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. * + * @attrs: Monitoring attributes for accuracy/overhead control. * @kdamond: Kernel thread who does the monitoring. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * @@ -427,15 +443,11 @@ struct damon_callback { * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @min_nr_regions: The minimum number of adaptive monitoring regions. - * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { - unsigned long sample_interval; - unsigned long aggr_interval; - unsigned long ops_update_interval; + struct damon_attrs attrs; /* private: internal use only */ struct timespec64 last_aggregation; @@ -448,8 +460,6 @@ struct damon_ctx { struct damon_operations ops; struct damon_callback callback; - unsigned long min_nr_regions; - unsigned long max_nr_regions; struct list_head adaptive_targets; struct list_head schemes; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6d9f4c2dee35c..bbd4c2d991dda 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -382,17 +382,17 @@ struct damon_ctx *damon_new_ctx(void) if (!ctx) return NULL; - ctx->sample_interval = 5 * 1000; - ctx->aggr_interval = 100 * 1000; - ctx->ops_update_interval = 60 * 1000 * 1000; + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); - ctx->min_nr_regions = 10; - ctx->max_nr_regions = 1000; + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -448,11 +448,11 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, if (min_nr_reg > max_nr_reg) return -EINVAL; - ctx->sample_interval = sample_int; - ctx->aggr_interval = aggr_int; - ctx->ops_update_interval = ops_upd_int; - ctx->min_nr_regions = min_nr_reg; - ctx->max_nr_regions = max_nr_reg; + ctx->attrs.sample_interval = sample_int; + ctx->attrs.aggr_interval = aggr_int; + ctx->attrs.ops_update_interval = ops_upd_int; + ctx->attrs.min_nr_regions = min_nr_reg; + ctx->attrs.max_nr_regions = max_nr_reg; return 0; } @@ -507,8 +507,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) sz += r->ar.end - r->ar.start; } - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; @@ -657,7 +657,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline, static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) { return 
damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->aggr_interval); + ctx->attrs.aggr_interval); } /* @@ -1016,12 +1016,12 @@ static void kdamond_split_regions(struct damon_ctx *ctx) damon_for_each_target(t, ctx) nr_regions += damon_nr_regions(t); - if (nr_regions > ctx->max_nr_regions / 2) + if (nr_regions > ctx->attrs.max_nr_regions / 2) return; /* Maybe the middle of the region has different access frequency */ if (last_nr_regions == nr_regions && - nr_regions < ctx->max_nr_regions / 3) + nr_regions < ctx->attrs.max_nr_regions / 3) nr_subregions = 3; damon_for_each_target(t, ctx) @@ -1039,7 +1039,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) static bool kdamond_need_update_operations(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->ops_update_interval); + ctx->attrs.ops_update_interval); } /* @@ -1188,7 +1188,7 @@ static int kdamond_fn(void *data) continue; } - kdamond_usleep(ctx->sample_interval); + kdamond_usleep(ctx->attrs.sample_interval); if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 124577db70124..51ccb85b3b1db 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->sample_interval, ctx->aggr_interval, - ctx->ops_update_interval, ctx->min_nr_regions, - ctx->max_nr_regions); + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); mutex_unlock(&ctx->kdamond_lock); return simple_read_from_buffer(buf, count, ppos, kbuf, ret); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index f599838b5f648..9310df72e1c54 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -99,10 +99,10 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->aggr_interval / c->sample_interval; + max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; age_in_log++, age_in_sec >>= 1) ; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 03e33d8b1991d..c9e3d462db54e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -250,8 +250,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; From 6d15e06e5bed1ce9a098f5aa5a50722e02ac02a7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:33 +0000 Subject: [PATCH 557/737] mm/damon/core: reduce parameters for damon_set_attrs() Number of parameters for 'damon_set_attrs()' is six. As it could be confusing and verbose, this commit reduces the number by receiving single pointer to a 'struct damon_attrs'. 
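This is the classic parameter-object refactor: pass one struct pointer instead of five scalars, then copy the whole struct after validation. The toy version below mirrors the field names from the commit message but is not the kernel interface; the error value is an assumption.

#include <stdio.h>

struct attrs {
    unsigned long sample_interval;
    unsigned long aggr_interval;
    unsigned long ops_update_interval;
    unsigned long min_nr_regions;
    unsigned long max_nr_regions;
};

struct ctx { struct attrs attrs; };

static int set_attrs(struct ctx *ctx, const struct attrs *attrs)
{
    if (attrs->min_nr_regions < 3 ||
        attrs->min_nr_regions > attrs->max_nr_regions)
        return -1;    /* toy error code; the kernel returns -EINVAL */
    ctx->attrs = *attrs;    /* a single struct copy replaces five assignments */
    return 0;
}

int main(void)
{
    struct ctx c;
    struct attrs a = { 5000, 100000, 0, 10, 1000 };

    printf("set_attrs=%d\n", set_attrs(&c, &a));
    return 0;
}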
Link: https://lkml.kernel.org/r/20220913174449.50645-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- mm/damon/core.c | 21 +++++---------------- mm/damon/dbgfs.c | 9 ++++++--- mm/damon/lru_sort.c | 10 ++++++++-- mm/damon/reclaim.c | 10 ++++++++-- mm/damon/sysfs.c | 12 ++++++++---- 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2ceee8b07726b..c5dc0c77c7722 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -540,9 +540,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); diff --git a/mm/damon/core.c b/mm/damon/core.c index bbd4c2d991dda..29635a82cb691 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -428,32 +428,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx) /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context - * @sample_int: time interval between samplings - * @aggr_int: time interval between aggregations - * @ops_upd_int: time interval between monitoring operations updates - * @min_nr_reg: minimal number of regions - * @max_nr_reg: maximum number of regions + * @attrs: monitoring attributes * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. 
*/ -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg) +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { - if (min_nr_reg < 3) + if (attrs->min_nr_regions < 3) return -EINVAL; - if (min_nr_reg > max_nr_reg) + if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; - ctx->attrs.sample_interval = sample_int; - ctx->attrs.aggr_interval = aggr_int; - ctx->attrs.ops_update_interval = ops_upd_int; - ctx->attrs.min_nr_regions = min_nr_reg; - ctx->attrs.max_nr_regions = max_nr_reg; - + ctx->attrs = *attrs; return 0; } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 51ccb85b3b1db..8d688a84e52e7 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - unsigned long s, a, r, minr, maxr; + struct damon_attrs attrs; char *kbuf; ssize_t ret; @@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file, return PTR_ERR(kbuf); if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &s, &a, &r, &minr, &maxr) != 5) { + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { ret = -EINVAL; goto out; } @@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + ret = damon_set_attrs(ctx, &attrs); if (!ret) ret = count; unlock_out: diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 307ba71adcfa9..6d5f83965276f 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,13 +350,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index fe7bc0c55ecb3..bc841efbab45e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -275,12 +275,18 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 05dc217b85204..6c45cd78dc505 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2130,10 +2130,14 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; - - return damon_set_attrs(ctx, sys_intervals->sample_us, - sys_intervals->aggr_us, 
sys_intervals->update_us, - sys_nr_regions->min, sys_nr_regions->max); + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); } static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) From 5fd565c2b249b441b5e318661aa1c22cefdd24ed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:34 +0000 Subject: [PATCH 558/737] mm/damon/reclaim: use 'struct damon_attrs' for storing parameters for it DAMON_RECLAIM receives monitoring attributes by parameters one by one to separate variables, and then combine those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc841efbab45e..d35a00d8dde2d 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -129,14 +129,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_reclaim_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the cold memory monitoring. Please refer * to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -144,8 +152,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the cold memory monitoring. Please * refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -155,8 +163,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -166,8 +174,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. 
@@ -239,7 +247,8 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / aggr_interval, + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; struct damos_watermarks wmarks = { @@ -275,18 +284,11 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); if (err) return err; From 1d6c4056f3d958658447042c0d47e1e175c4aa91 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:35 +0000 Subject: [PATCH 559/737] mm/damon/lru_sort: use 'struct damon_attrs' for storing parameters for it DAMON_LRU_SORT receives monitoring attributes by parameters one by one to separate variables, and then combines those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 6d5f83965276f..ade985b836527 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -127,14 +127,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the hot/cold memory monitoring. Please * refer to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -142,8 +150,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the hot/cold memory monitoring. * Please refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -153,8 +161,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. 
@@ -164,8 +172,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. @@ -350,25 +358,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) return err; /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / - 1000; + hot_thres = damon_lru_sort_mon_attrs.aggr_interval / + damon_lru_sort_mon_attrs.sample_interval * + hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; @@ -376,7 +378,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - cold_thres = cold_min_age / aggr_interval; + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!scheme) return -ENOMEM; From 906db850fc983daf656c48978a88fbb937def538 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:36 +0000 Subject: [PATCH 560/737] mm/damon: implement a monitoring attributes module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for monitoring attributes that have the same names. This commit implements a macro for generating such module parameters so that we can reuse it later. Link: https://lkml.kernel.org/r/20220913174449.50645-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mm/damon/modules-common.h diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 0000000000000..0abd0636bc649 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); From 0704bba60c62e06c320e17a70044bb4cf0934ed0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:37 +0000 Subject: [PATCH 561/737] mm/damon/lru_sort: use monitoring attributes parameters generator macro This commit makes DAMON_LRU_SORT generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates.
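For reference, a sketch of what the single generator invocation below expands to, written out by hand from the macro added in the previous patch:

    DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
    /* is equivalent to: */
    module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, ulong, 0600);
    module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, 0600);
    module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, ulong, 0600);
    module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, ulong, 0600);

The user-visible parameter names and their 0600 permissions are unchanged; the macro only removes the per-module boilerplate.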
Link: https://lkml.kernel.org/r/20220913174449.50645-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 47 +++++---------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index ade985b836527..e95626acee6f9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -128,52 +130,13 @@ static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_lru_sort_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the hot/cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the hot/cold memory monitoring. - * Please refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); /* * Start of the target memory region in physical address. From 5ad0ec4cf784ac8cf4d8ad20900d10a2ed7c3e92 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:38 +0000 Subject: [PATCH 562/737] mm/damon/reclaim: use monitoring attributes parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates. 
Link: https://lkml.kernel.org/r/20220913174449.50645-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 47 +++++----------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index d35a00d8dde2d..48326bef20f51 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -130,52 +132,13 @@ static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_reclaim_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the cold memory monitoring. Please refer - * to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); /* * Start of the target memory region in physical address. From 9f23fce1026587a2d13d839a2a99f6f71ec3df71 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:39 +0000 Subject: [PATCH 563/737] mm/damon/modules-common: implement a watermarks module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for watermarks that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. 
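As a usage sketch, mirroring how the following patches consume it, a module defines its watermarks once and invokes the macro next to the definition:

    struct damos_watermarks damon_reclaim_wmarks = {
        .metric = DAMOS_WMARK_FREE_MEM_RATE,
        .interval = 5000000,    /* 5 seconds */
        .high = 500,            /* 50 percent */
        .mid = 400,             /* 40 percent */
        .low = 200,             /* 20 percent */
    };
    DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks);

which exposes wmarks_interval, wmarks_high, wmarks_mid and wmarks_low as 0600 module parameters. Note that the macro as added here spells 'wmarks->interval' and 'wmarks.lowulong'; the modules-common.h hunk of the next patch adjusts these to 'wmarks.interval' and 'wmarks.low, ulong'.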
Link: https://lkml.kernel.org/r/20220913174449.50645-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 0abd0636bc649..1370590a37d18 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -16,3 +16,10 @@ 0600); \ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks->interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.lowulong, 0600); From 6e9f5f1d64c276d9fb6ff073e0fc55d02687f480 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:40 +0000 Subject: [PATCH 564/737] mm/damon/lru_sort: use watermarks parameters generator macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 64 ++++++--------------------------------- mm/damon/modules-common.h | 4 +-- 2 files changed, 12 insertions(+), 56 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index e95626acee6f9..20760b39b50a4 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -90,44 +90,14 @@ module_param(quota_ms, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically - * checks the watermarks. 200 (20%) by default. - */ -static unsigned long wmarks_high __read_mostly = 200; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring - * and the LRU-lists sorting. 150 (15%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 150; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks - * the watermarks. 50 (5%) by default. 
- */ -static unsigned long wmarks_low __read_mostly = 50; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); static struct damon_attrs damon_lru_sort_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -242,13 +212,6 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of hot pages for more than half @@ -270,7 +233,7 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -287,13 +250,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of cold pages for more than @@ -316,7 +272,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 1370590a37d18..4c2ce84869d58 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -18,8 +18,8 @@ 0600); #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ - module_param_named(wmarks_interval, wmarks->interval, ulong, \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ - module_param_named(wmarks_low, wmarks.lowulong, 0600); + module_param_named(wmarks_low, wmarks.low, ulong, 0600); From 627f6511d85f8ca1351f314f4b9682ac5360ac1b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:41 +0000 Subject: [PATCH 565/737] mm/damon/reclaim: use watermarks parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 56 ++++++++-------------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 48326bef20f51..7f845f617dc56 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -91,45 +91,14 @@ module_param(quota_sz, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. 
- * - * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically - * checks the watermarks. 500 (50%) by default. - */ -static unsigned long wmarks_high __read_mostly = 500; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring - * and the reclaiming. 400 (40%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 400; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks - * the watermarks. In the case, the system falls back to the LRU-based page - * granularity reclamation logic. 200 (20%) by default. - */ -static unsigned long wmarks_low __read_mostly = 200; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_reclaim_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 500, /* 50 percent */ + .mid = 400, /* 40 percent */ + .low = 200, /* 20 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks); static struct damon_attrs damon_reclaim_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -214,13 +183,6 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try reclamation for more than quota_ms milliseconds @@ -242,7 +204,7 @@ static struct damos *damon_reclaim_new_scheme(void) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_reclaim_wmarks); } static int damon_reclaim_apply_parameters(void) From b02e909e6478b1798122d584eae7e2ba838f14b4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:42 +0000 Subject: [PATCH 566/737] mm/damon/modules-common: implement a stats parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS statistics that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. 
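For reference, a sketch of what one invocation generates, using the DAMON_RECLAIM instance from the next patch:

    static struct damos_stat damon_reclaim_stat;
    DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
            reclaim_tried_regions, reclaimed_regions, quota_exceeds);
    /* read-only (0400) parameters:
     *   nr_reclaim_tried_regions    <- damon_reclaim_stat.nr_tried
     *   bytes_reclaim_tried_regions <- damon_reclaim_stat.sz_tried
     *   nr_reclaimed_regions        <- damon_reclaim_stat.nr_applied
     *   bytes_reclaimed_regions     <- damon_reclaim_stat.sz_applied
     *   quota_exceeds               <- damon_reclaim_stat.qt_exceeds
     */

As added here, the last parameter is generated from 'qt_exceed_name' without the 'nr_' prefix; a later modules-common.h hunk in this series switches it to 'nr_##qt_exceed_name' so the existing 'nr_quota_exceeds' name is kept.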
Link: https://lkml.kernel.org/r/20220913174449.50645-16-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 4c2ce84869d58..ed973e0770ae9 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -23,3 +23,15 @@ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); From df9a5605d48f8e4d4d2790f4e353e6a0e6622a9a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:43 +0000 Subject: [PATCH 567/737] mm/damon/reclaim: use stat parameters generator This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-17-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7f845f617dc56..1ef8353ac15af 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -136,35 +136,9 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of memory regions that tried to be reclaimed. - */ -static unsigned long nr_reclaim_tried_regions __read_mostly; -module_param(nr_reclaim_tried_regions, ulong, 0400); - -/* - * Total bytes of memory regions that tried to be reclaimed. - */ -static unsigned long bytes_reclaim_tried_regions __read_mostly; -module_param(bytes_reclaim_tried_regions, ulong, 0400); - -/* - * Number of memory regions that successfully be reclaimed. - */ -static unsigned long nr_reclaimed_regions __read_mostly; -module_param(nr_reclaimed_regions, ulong, 0400); - -/* - * Total bytes of memory regions that successfully be reclaimed. 
- */ -static unsigned long bytes_reclaimed_regions __read_mostly; -module_param(bytes_reclaimed_regions, ulong, 0400); - -/* - * Number of times that the time/space quota limits have exceeded - */ -static unsigned long nr_quota_exceeds __read_mostly; -module_param(nr_quota_exceeds, ulong, 0400); +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -318,13 +292,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) struct damos *s; /* update the stats parameter */ - damon_for_each_scheme(s, c) { - nr_reclaim_tried_regions = s->stat.nr_tried; - bytes_reclaim_tried_regions = s->stat.sz_tried; - nr_reclaimed_regions = s->stat.nr_applied; - bytes_reclaimed_regions = s->stat.sz_applied; - nr_quota_exceeds = s->stat.qt_exceeds; - } + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; return damon_reclaim_handle_commit_inputs(); } From d0fef289efc585f10d34fa1b4bd01346bb5d60f0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:44 +0000 Subject: [PATCH 568/737] mm/damon/lru_sort: use stat generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-18-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 83 +++++++-------------------------------------- 1 file changed, 12 insertions(+), 71 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 20760b39b50a4..13a752aed2720 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,65 +135,15 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; -module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; -module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Number of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_hot_regions __read_mostly; -module_param(nr_lru_sorted_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_hot_regions __read_mostly; -module_param(bytes_lru_sorted_hot_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for hot regions have exceeded - */ -static unsigned long nr_hot_quota_exceeds __read_mostly; -module_param(nr_hot_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); -/* - * Number of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; -module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that tried to be LRU-sorted. 
- */ -static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; -module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Number of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_cold_regions __read_mostly; -module_param(nr_lru_sorted_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_cold_regions __read_mostly; -module_param(bytes_lru_sorted_cold_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for cold regions have exceeded - */ -static unsigned long nr_cold_quota_exceeds __read_mostly; -module_param(nr_cold_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -397,19 +347,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c) /* update the stats parameter */ damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) { - nr_lru_sort_tried_hot_regions = s->stat.nr_tried; - bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; - nr_lru_sorted_hot_regions = s->stat.nr_applied; - bytes_lru_sorted_hot_regions = s->stat.sz_applied; - nr_hot_quota_exceeds = s->stat.qt_exceeds; - } else if (s->action == DAMOS_LRU_DEPRIO) { - nr_lru_sort_tried_cold_regions = s->stat.nr_tried; - bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; - nr_lru_sorted_cold_regions = s->stat.nr_applied; - bytes_lru_sorted_cold_regions = s->stat.sz_applied; - nr_cold_quota_exceeds = s->stat.qt_exceeds; - } + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; } return damon_lru_sort_handle_commit_inputs(); From 26a3e342cbe51fa411fb46e9faec816a49c209ae Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:45 +0000 Subject: [PATCH 569/737] mm/damon/modules-common: implement a damos quota params generator DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS quotas that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. 
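As a usage sketch, matching how DAMON_RECLAIM consumes the macro later in this series, the module keeps its quota in a struct damos_quota and lets the generator expose the tunable fields (the weight fields are left out of this sketch):

    static struct damos_quota damon_reclaim_quota = {
        .ms = 10,                   /* time quota: 10 ms */
        .sz = 128 * 1024 * 1024,    /* size quota: 128 MiB */
        .reset_interval = 1000,     /* charge window: 1 second */
    };
    DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota);
    /* exposes quota_ms, quota_sz and quota_reset_interval_ms (all 0600) */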
Link: https://lkml.kernel.org/r/20220913174449.50645-19-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index ed973e0770ae9..3e99810b46899 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,6 +17,12 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_sz, quota.sz, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ @@ -33,5 +39,5 @@ 0400); \ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ 0400); \ - module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); From 08661b656aca63555678f21967df6f6331ba32cf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:46 +0000 Subject: [PATCH 570/737] mm/damon/modules-common: implement damos time quota params generator DAMON_LRU_SORT have module parameters for DAMOS time quota only but size quota. This commit implements a macro for generating the module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-20-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 3e99810b46899..5a4921851d326 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,12 +17,15 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); -#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ module_param_named(quota_ms, quota.ms, ulong, 0600); \ - module_param_named(quota_sz, quota.sz, ulong, 0600); \ module_param_named(quota_reset_interval_ms, \ quota.reset_interval, ulong, 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ From 4e0f46441be9d9e5c7677cbd521aa7e96a197324 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:47 +0000 Subject: [PATCH 571/737] mm/damon/reclaim: use the quota params generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS quotas using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-21-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 64 +++++++++------------------------------------- 1 file changed, 12 insertions(+), 52 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1ef8353ac15af..1acf808e16242 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -52,44 +52,17 @@ module_param(commit_inputs, bool, 0600); static unsigned long min_age __read_mostly = 120000000; module_param(min_age, ulong, 0600); -/* - * Limit of time for trying the reclamation in milliseconds. 
- * - * DAMON_RECLAIM tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be - * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, - * the limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * Limit of size of memory for the reclamation in bytes. - * - * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a - * time window (quota_reset_interval_ms) and makes no more than this limit is - * tried. This can be used for limiting consumption of CPU and IO. If this - * value is zero, the limit is disabled. - * - * 128 MiB by default. - */ -static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; -module_param(quota_sz, ulong, 0600); - -/* - * The time/size quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms) and size - * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than - * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms - * milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_reclaim_quota = { + /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */ + .ms = 10, + .sz = 128 * 1024 * 1024, + .reset_interval = 1000, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 +}; +DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -157,26 +130,13 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try reclamation for more than quota_ms milliseconds - * or quota_sz bytes within quota_reset_interval_ms. - */ - .ms = quota_ms, - .sz = quota_sz, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, page out older regions first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1 - }; return damon_new_scheme( &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ - "a, + &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ &damon_reclaim_wmarks); } From 5476f704893b57c7268605d0b2bbd0b96c48ae4f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:48 +0000 Subject: [PATCH 572/737] mm/damon/lru_sort: use quotas param generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-22-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 70 ++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 13a752aed2720..8d9c3d1fd6bef 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -65,30 +65,17 @@ module_param(hot_thres_access_freq, ulong, 0600); static unsigned long cold_min_age __read_mostly = 120000000; module_param(cold_min_age, ulong, 0600); -/* - * Limit of time for trying the LRU lists sorting in milliseconds. 
- * - * DAMON_LRU_SORT tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used - * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the - * limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * The time quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms). That is, - * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms - * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -162,19 +149,10 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of hot pages for more than half - * of quota_ms milliseconds within quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark hotter regions accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 1, - .weight_age = 0, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, @@ -200,20 +178,10 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of cold pages for more than - * half of quota_ms milliseconds within - * quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark colder regions not accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for cold pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, From 39e22361759ff6baf21888c9432efb29a8d56f11 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:49 +0000 Subject: [PATCH 573/737] mm/damon/lru_sort: deduplicate hot/cold schemes generators damon_lru_sort_new_{hot,cold}_scheme() have quite a lot of duplicates. This commit factors out the duplicate to a separate function and use it for reducing the duplicate. 
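With the shared helper, both schemes draw on the same damon_lru_sort_quota and each halves the time budget before use, so with the default quota_ms of 10 the hot scheme and the cold scheme each get roughly 5 ms of sorting work per quota_reset_interval_ms (1 second by default), matching the behaviour of the two hand-written quota blocks being removed.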
Link: https://lkml.kernel.org/r/20220913174449.50645-23-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8d9c3d1fd6bef..07a0908963fd0 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,6 +135,25 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, static struct damon_ctx *ctx; static struct damon_target *target; +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) +{ + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; + + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &damon_lru_sort_wmarks); +} + /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -149,19 +168,8 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - - /* Use half of total quota for hot pages sorting */ - quota.ms = quota.ms / 2; - return damon_new_scheme( - &pattern, - /* prioritize those on LRU lists, as soon as found */ - DAMOS_LRU_PRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -178,19 +186,8 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - /* Use half of total quota for cold pages sorting */ - quota.ms = quota.ms / 2; - - return damon_new_scheme( - &pattern, - /* mark those as not accessed, as soon as found */ - DAMOS_LRU_DEPRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } static int damon_lru_sort_apply_parameters(void) From 9eedb884dccb0befa92910c7a4da888fc205d779 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:24 +0800 Subject: [PATCH 574/737] mm/damon: simplify the parameter passing for 'prepare_access_checks' Patch series "mm/damon: code simplifications and cleanups". This patchset contains some code simplifications and cleanups for DAMON. This patch (of 4): The parameter 'struct damon_ctx *ctx' isn't used in the functions __damon_{p,v}a_prepare_access_check(), so we can remove it and simplify the parameter passing. 
Link: https://lkml.kernel.org/r/1663060287-30201-1-git-send-email-kaixuxia@tencent.com Link: https://lkml.kernel.org/r/1663060287-30201-2-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 ++--- mm/damon/vaddr.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 962cfba432630..be7952b7264cf 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -65,8 +65,7 @@ static void damon_pa_mkold(unsigned long paddr) put_page(page); } -static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -80,7 +79,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(ctx, r); + __damon_pa_prepare_access_check(r); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c9e3d462db54e..8794ce445db05 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -396,8 +396,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void __damon_va_prepare_access_check(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -415,7 +415,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(mm, r); mmput(mm); } } From 8111887776e9ecaaedc4137627689f63028bc96c Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:25 +0800 Subject: [PATCH 575/737] mm/damon/sysfs: simplify the variable 'pid' assignment operation We can initialize the variable 'pid' with '-1' in pid_show() to simplify the variable assignment operation and make the code more readable. Link: https://lkml.kernel.org/r/1663060287-30201-3-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6c45cd78dc505..a782cf7dc8ebc 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2638,19 +2638,16 @@ static ssize_t pid_show(struct kobject *kobj, struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); struct damon_ctx *ctx; - int pid; + int pid = -1; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; ctx = kdamond->damon_ctx; - if (!ctx) { - pid = -1; + if (!ctx) goto out; - } + mutex_lock(&ctx->kdamond_lock); - if (!ctx->kdamond) - pid = -1; - else + if (ctx->kdamond) pid = ctx->kdamond->pid; mutex_unlock(&ctx->kdamond_lock); out: From 033dfe6557c678cab6a9187edda111b826f54ada Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:26 +0800 Subject: [PATCH 576/737] mm/damon/core: simplify the kdamond stop mechanism by removing 'done' When the 'kdamond_wait_activation()' function or 'after_sampling()' or 'after_aggregation()' DAMON callbacks return an error, it is unnecessary to use bool 'done' to check if kdamond should be finished. 
This commit simplifies the kdamond stop mechanism by removing 'done' and breaking out of the while loop directly in those cases. Link: https://lkml.kernel.org/r/1663060287-30201-4-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 29635a82cb691..a843673c11cfc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1152,30 +1152,25 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - done = true; + goto done; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) { - done = true; - continue; - } + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) { - done = true; - continue; - } + ctx->callback.after_sampling(ctx)) + break; kdamond_usleep(ctx->attrs.sample_interval); @@ -1187,10 +1182,8 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) { - done = true; - continue; - } + ctx->callback.after_aggregation(ctx)) + break; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1204,6 +1197,7 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); } } +done: damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) damon_destroy_region(r, t); From febb65141e19be4340aee7945f4a0b575ee98c44 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 22:22:36 +0800 Subject: [PATCH 577/737] mm/damon/sysfs: avoid calling damon_target_has_pid() repeatedly In damon_sysfs_destroy_targets(), we call damon_target_has_pid() to check whether the 'ctx' includes a valid pid, but there is no need to call damon_target_has_pid() repeatedly; calling it once is enough.
[xhao@linux.alibaba.com: more simplified code calls damon_target_has_pid()] Link: https://lkml.kernel.org/r/20220916133535.7428-1-xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/20220915142237.92529-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a782cf7dc8ebc..313780193b109 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2143,9 +2143,10 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next; + bool has_pid = damon_target_has_pid(ctx); damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) + if (has_pid) put_pid(t->pid); damon_destroy_target(t); } From 5301b5d5c6b0192afac639e8ca99408a4354cdb5 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 13:30:41 +0000 Subject: [PATCH 578/737] mm/damon: simplify scheme create in lru_sort.c In damon_lru_sort_new_hot_scheme() and damon_lru_sort_new_cold_scheme(), they have so much in common, so we can combine them into a single function, and we just need to distinguish their differences. [yangyingliang@huawei.com: change damon_lru_sort_stub_pattern to static] Link: https://lkml.kernel.org/r/20220917121228.1889699-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20220915133041.71819-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Signed-off-by: Yang Yingliang Suggested-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 07a0908963fd0..a91c1e364fc7b 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -132,6 +132,18 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, lru_sort_tried_cold_regions, lru_sorted_cold_regions, cold_quota_exceeds); +static struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; + static struct damon_ctx *ctx; static struct damon_target *target; @@ -157,36 +169,19 @@ static struct damos *damon_lru_sort_new_scheme( /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and accessed for more than the threshold */ - .min_nr_accesses = hot_thres, - .max_nr_accesses = UINT_MAX, - /* no matter its age */ - .min_age_region = 0, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.min_nr_accesses = hot_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* 
and not accessed at all */ - .min_nr_accesses = 0, - .max_nr_accesses = 0, - /* for min_age or more micro-seconds */ - .min_age_region = cold_thres, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } From a0a7299a1b8cf648f0d5e38cebd8c27b91dbba58 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 15 Sep 2022 19:33:41 +0800 Subject: [PATCH 579/737] mm/damon: use 'struct damon_target *' instead of 'void *' in target_valid() We could use 'struct damon_target *' directly instead of 'void *' in target_valid() operation to make code simple. Link: https://lkml.kernel.org/r/1663241621-13293-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/vaddr.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c5dc0c77c7722..1dda8d0068e54 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -346,7 +346,7 @@ struct damon_operations { unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - bool (*target_valid)(void *target); + bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 8794ce445db05..29c897a92ec20 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -592,9 +592,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -static bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(struct damon_target *t) { - struct damon_target *t = target; struct task_struct *task; task = damon_get_task_struct(t); From f11f6c06937fb41cfd32f35fcaffb06c7bb9a844 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:23 +0800 Subject: [PATCH 580/737] mm/damon/reclaim: change damon_reclaim_wmarks to static damon_reclaim_wmarks is only used in reclaim.c now, change it to static. Link: https://lkml.kernel.org/r/20220915021024.4177940-1-yangyingliang@huawei.com Fixes: 89dd02d8abd1 ("mm/damon/reclaim: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1acf808e16242..039fa55e0ae9c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -64,7 +64,7 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); -struct damos_watermarks damon_reclaim_wmarks = { +static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 500, /* 50 percent */ From ecedb3ef7f645bf0d5879b04290b9b9ea9f5a07c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:24 +0800 Subject: [PATCH 581/737] mm/damon/lru_sort: change damon_lru_sort_wmarks to static damon_lru_sort_wmarks is only used in lru_sort.c now, change it to static. 
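The lru_sort scheme-creation change earlier in this series leans on a plain C idiom worth spelling out: keep one fully initialized template struct, copy it by value, then override only the fields that differ between the hot and cold cases. A standalone sketch of the idiom follows; the struct and field names are simplified stand-ins, not the real struct damos_access_pattern.

#include <limits.h>
#include <stdio.h>

struct pattern {
	unsigned long min_sz, max_sz;
	unsigned int min_accesses, max_accesses;
	unsigned int min_age, max_age;
};

/* One "match anything" template, mirroring damon_lru_sort_stub_pattern. */
static const struct pattern stub = {
	.min_sz = 4096, .max_sz = ULONG_MAX,
	.min_accesses = 0, .max_accesses = UINT_MAX,
	.min_age = 0, .max_age = UINT_MAX,
};

static struct pattern hot_pattern(unsigned int hot_thres)
{
	struct pattern p = stub;	/* struct copy, then specialize */

	p.min_accesses = hot_thres;
	return p;
}

static struct pattern cold_pattern(unsigned int cold_thres)
{
	struct pattern p = stub;

	p.max_accesses = 0;
	p.min_age = cold_thres;
	return p;
}

int main(void)
{
	struct pattern h = hot_pattern(10), c = cold_pattern(20);

	printf("hot: min_accesses=%u  cold: min_age=%u\n",
	       h.min_accesses, c.min_age);
	return 0;
}

The duplicated comments in the two removed initializers disappear because the shared invariants now live in exactly one place.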
Link: https://lkml.kernel.org/r/20220915021024.4177940-2-yangyingliang@huawei.com Fixes: 189aa3d58206 ("mm/damon/lru_sort: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a91c1e364fc7b..4a40054ba03bf 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -77,7 +77,7 @@ static struct damos_quota damon_lru_sort_quota = { }; DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); -struct damos_watermarks damon_lru_sort_wmarks = { +static struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 200, /* 20 percent */ From b28dae488a5c696cfab2bcc66519713814e8194f Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 16 Sep 2022 23:20:35 +0800 Subject: [PATCH 582/737] mm/damon: return void from damon_set_schemes() There is no point in returning an int from damon_set_schemes(). It always returns 0 which is meaningless for the caller, so change it to return void directly. Link: https://lkml.kernel.org/r/1663341635-12675-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/core.c | 5 +---- mm/damon/dbgfs.c | 8 +++----- mm/damon/lru_sort.c | 4 +--- mm/damon/reclaim.c | 4 +--- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1dda8d0068e54..e7808a84675fb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -541,7 +541,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); -int damon_set_schemes(struct damon_ctx *ctx, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); diff --git a/mm/damon/core.c b/mm/damon/core.c index a843673c11cfc..9c80c6eb00c24 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -454,10 +454,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) * * This function should not be called while the kdamond of the context is * running. - * - * Return: 0 if success, or negative error code otherwise. 
*/ -int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes) { struct damos *s, *next; @@ -467,7 +465,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, damon_destroy_scheme(s); for (i = 0; i < nr_schemes; i++) damon_add_scheme(ctx, schemes[i]); - return 0; } /** diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 8d688a84e52e7..b3f454a5c6828 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -307,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - ret = damon_set_schemes(ctx, schemes, nr_schemes); - if (!ret) { - ret = count; - nr_schemes = 0; - } + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; unlock_out: mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 4a40054ba03bf..d7eb72b41cb67 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -203,9 +203,7 @@ static int damon_lru_sort_apply_parameters(void) scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 039fa55e0ae9c..3d59ab11b7b39 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -155,9 +155,7 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); if (monitor_region_start > monitor_region_end) return -EINVAL; From 9b44c6e1122a9b8315bc5a9bced9d522b9f1710b Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 17 Sep 2022 21:56:54 +0800 Subject: [PATCH 583/737] mm/damon: rename damon_pageout_score() to damon_cold_score() In the beginning there is only one damos_action 'DAMOS_PAGEOUT' that need to get the coldness score of a region for a scheme, which using damon_pageout_score() to do that. But now there are also other damos_action actions need the coldness score, so rename it to damon_cold_score() to make more sense. 
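To make the rename above concrete: the cold score is derived from the hot score (roughly its complement on a fixed score scale), which is why DAMOS_PAGEOUT and DAMOS_LRU_DEPRIO can share one helper. The sketch below illustrates that relationship only; the score scale and the weighting inside hot_score() are made up, not the exact kernel formula.

#include <stdio.h>

#define MAX_SCORE 100

/* Toy hotness: more accesses and more age while accessed => hotter.
 * The real damon_hot_score() weights frequency, age and region size. */
static int hot_score(unsigned int nr_accesses, unsigned int age)
{
	int score = (int)(nr_accesses * 2 + age);

	return score > MAX_SCORE ? MAX_SCORE : score;
}

/* Coldness expressed through hotness, the same dependency the renamed
 * damon_cold_score() keeps by calling damon_hot_score() internally. */
static int cold_score(unsigned int nr_accesses, unsigned int age)
{
	return MAX_SCORE - hot_score(nr_accesses, age);
}

int main(void)
{
	printf("hot=%d cold=%d\n", hot_score(30, 5), cold_score(30, 5));
	printf("hot=%d cold=%d\n", hot_score(0, 50), cold_score(0, 50));
	return 0;
}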
Link: https://lkml.kernel.org/r/1663423014-28907-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/ops-common.h | 2 +- mm/damon/paddr.c | 4 ++-- mm/damon/vaddr.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 9310df72e1c54..75409601f9349 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,7 +130,7 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, return hotness; } -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { int hotness = damon_hot_score(c, r, s); diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 52329ff361cd0..8d82d37222042 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index be7952b7264cf..2f0196bbf0b22 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -291,11 +291,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 29c897a92ec20..95444ee3f802a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -672,7 +672,7 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } From 672225b3dc19e6a0934cbfe1fcc327afc4f73a1b Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 20 Sep 2022 16:35:30 +0000 Subject: [PATCH 584/737] mm/damon/sysfs: return 'err' value when call kstrtoul() failed We had better return the 'err' value when calling kstrtoul() failed, so the user will know why it really fails, there do little change, let it return the 'err' value when failed. 
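The pattern the sysfs change above settles on is simple but easy to get wrong: a ->store() handler should hand the parser's own error code (for instance -ERANGE rather than a blanket -EINVAL) back to user space, and the 'err ? err : count' form keeps that to one line. Below is a user-space model of the same shape; parse_ulong() is a hypothetical stand-in for kstrtoul(), not the kernel function.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for kstrtoul(): 0 on success, -EINVAL for junk input,
 * -ERANGE for overflow. */
static int parse_ulong(const char *buf, unsigned long *out)
{
	char *end;

	errno = 0;
	*out = strtoul(buf, &end, 0);
	if (end == buf || *end != '\0')
		return -EINVAL;
	if (errno == ERANGE)
		return -ERANGE;
	return 0;
}

static unsigned long interval_us;

/* Mirrors a sysfs ->store(): return the byte count on success, or the
 * parser's error code -- not always -EINVAL. */
static long interval_us_store(const char *buf, size_t count)
{
	int err = parse_ulong(buf, &interval_us);

	return err ? err : (long)count;
}

int main(void)
{
	printf("%ld\n", interval_us_store("5000000", 7));                /* 7 */
	printf("%ld\n", interval_us_store("banana", 6));                 /* -EINVAL */
	printf("%ld\n", interval_us_store("999999999999999999999", 21)); /* -ERANGE */
	return 0;
}

With the old code, the second and third writes would have been indistinguishable to the caller.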
Link: https://lkml.kernel.org/r/6329ebe0.050a0220.ec4bd.297cSMTPIN_ADDED_BROKEN@mx.google.com Suggested-by: SeongJae Park Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 313780193b109..07e5f1bdf025f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -58,7 +58,7 @@ static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &min); if (err) - return -EINVAL; + return err; range->min = min; return count; @@ -83,7 +83,7 @@ static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &max); if (err) - return -EINVAL; + return err; range->max = max; return count; @@ -291,9 +291,7 @@ static ssize_t interval_us_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->interval_us); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t high_show(struct kobject *kobj, @@ -312,9 +310,7 @@ static ssize_t high_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->high); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t mid_show(struct kobject *kobj, @@ -333,9 +329,7 @@ static ssize_t mid_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->mid); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t low_show(struct kobject *kobj, @@ -354,9 +348,7 @@ static ssize_t low_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->low); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_watermarks_release(struct kobject *kobj) @@ -437,9 +429,7 @@ static ssize_t sz_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->sz); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t nr_accesses_permil_show(struct kobject *kobj, @@ -458,9 +448,7 @@ static ssize_t nr_accesses_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->nr_accesses); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t age_permil_show(struct kobject *kobj, @@ -479,9 +467,7 @@ static ssize_t age_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->age); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_weights_release(struct kobject *kobj) @@ -1111,9 +1097,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, ®ion->start); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1132,9 +1116,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, ®ion->end); - if (err) - return -EINVAL; - return count; + return err ? 
err : count; } static void damon_sysfs_region_release(struct kobject *kobj) @@ -1528,7 +1510,7 @@ static ssize_t sample_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->sample_us = us; return count; @@ -1552,7 +1534,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->aggr_us = us; return count; @@ -1576,7 +1558,7 @@ static ssize_t update_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->update_us = us; return count; From 7f779c4bd23faa01da3945b9ca43b078cea63f8e Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Sep 2022 16:53:22 +0000 Subject: [PATCH 585/737] mm/damon: deduplicate damon_{reclaim,lru_sort}_apply_parameters() The bodies of damon_{reclaim,lru_sort}_apply_parameters() contain duplicates. This commit adds a common function damon_set_region_biggest_system_ram_default() to remove the duplicates. Link: https://lkml.kernel.org/r/6329f00d.a70a0220.9bb29.3678SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Kaixu Xia Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 35 ++++++++++++++++++++++++++++++++++- mm/damon/lru_sort.c | 13 +++---------- mm/damon/reclaim.c | 13 +++---------- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e7808a84675fb..ed5470f50babd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -557,7 +557,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 9c80c6eb00c24..4de8c7c529794 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,7 +1245,8 @@ static int walk_system_ram(struct resource *res, void *arg) * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. */ -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) { struct damon_system_ram_region arg = {}; @@ -1259,6 +1260,38 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index d7eb72b41cb67..efbc2bda8b9cd 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -188,7 +188,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -211,15 +210,9 @@ static int damon_lru_sort_apply_parameters(void) return -ENOMEM; damon_add_scheme(ctx, scheme); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_lru_sort_turn(bool on) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3d59ab11b7b39..162c9b1ca00fd 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -144,7 +144,6 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -157,15 +156,9 @@ static int damon_reclaim_apply_parameters(void) return -ENOMEM; damon_set_schemes(ctx, &scheme, 1); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_reclaim_turn(bool on) From 57de028b474076121be79010cedd2d6ead6e3a96 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 2 Oct 2022 19:31:30 +0000 Subject: [PATCH 586/737] mm/damon/core: initialize damon_target->list in damon_new_target() 'struct damon_target' creation function, 'damon_new_target()' is not initializing its '->list' field, unlike other DAMON structs creator functions such as 'damon_new_region()'. Normal users of 'damon_new_target()' initializes the field by adding the target to DAMON context's targets list, but some code could access the uninitialized field. This commit avoids the case by initializing the field in 'damon_new_target()'. 
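The fix above matters because an uninitialized list_head holds garbage pointers: list_empty() and any traversal are only well defined once ->next and ->prev point back at the node itself. The following self-contained sketch shows that invariant with a minimal circular-list node; it models the idea, not the kernel's <linux/list.h>.

#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void init_list_head(struct list_head *h)
{
	h->next = h;	/* an empty list points at itself ... */
	h->prev = h;	/* ... so emptiness checks and traversal are defined */
}

static bool list_empty(const struct list_head *h)
{
	return h->next == h;
}

struct target {
	int id;
	struct list_head list;	/* link into a context's target list */
};

int main(void)
{
	struct target t;

	t.id = 1;
	/* Without this call t.list.next is indeterminate, so list_empty()
	 * would read garbage -- the situation the patch closes off by
	 * initializing the field in the constructor. */
	init_list_head(&t.list);

	printf("empty=%d\n", list_empty(&t.list));
	return 0;
}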
Link: https://lkml.kernel.org/r/20221002193130.8227-1-sj@kernel.org Fixes: f23b8eee1871 ("mm/damon/core: implement region-based sampling") Signed-off-by: SeongJae Park Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/damon/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 4de8c7c529794..8e1ab38d0f1f7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -335,6 +335,7 @@ struct damon_target *damon_new_target(void) t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); + INIT_LIST_HEAD(&t->list); return t; } From 66bccccb23ab9b3a39c3205ffe7ae67bed190824 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:45 +0800 Subject: [PATCH 587/737] mm/damon: move sz_damon_region to damon_sz_region Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because in many places, we can to use this func. Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 9 ++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index ed5470f50babd..620ada094c3b2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t) return list_first_entry(&t->regions_list, struct damon_region, list); } +static inline unsigned long damon_sz_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} + + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8e1ab38d0f1f7..624ec64993896 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -865,18 +865,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -static inline unsigned long sz_damon_region(struct damon_region *r) -{ - return r->ar.end - r->ar.start; -} - /* * Merge two adjacent regions into one region */ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { - unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); @@ -905,7 +900,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && - sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else prev = r; From aab5be7d35c63e0e881afdf97b6976096f926780 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:46 +0800 Subject: [PATCH 588/737] mm/damon: use damon_sz_region() in appropriate place In many places we can use damon_sz_region() to instead of "r->ar.end - r->ar.start". 
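As background for the helper being rolled out in these two patches: region size feeds several formulas, most visibly the merge step, where two adjacent regions are folded into one whose nr_accesses is the size-weighted average of the pair. The standalone sketch below uses the same helper shape and the same averaging arithmetic as the hunks above, with the struct layout simplified from the kernel's damon_region.

#include <stdio.h>

struct addr_range { unsigned long start, end; };

struct region {
	struct addr_range ar;
	unsigned int nr_accesses;
};

/* Same shape as damon_sz_region(), the helper the series switches to. */
static unsigned long sz_region(const struct region *r)
{
	return r->ar.end - r->ar.start;
}

/* Size-weighted average when merging two adjacent regions, mirroring the
 * arithmetic visible in damon_merge_two_regions(). */
static void merge(struct region *l, const struct region *r)
{
	unsigned long sz_l = sz_region(l), sz_r = sz_region(r);

	l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
			 (sz_l + sz_r);
	l->ar.end = r->ar.end;
}

int main(void)
{
	struct region a = { { 0x1000, 0x3000 }, 10 };	/* 8 KiB, 10 accesses */
	struct region b = { { 0x3000, 0x7000 }, 4 };	/* 16 KiB, 4 accesses */

	merge(&a, &b);
	printf("merged: [%#lx, %#lx) nr_accesses=%u\n",
	       a.ar.start, a.ar.end, a.nr_accesses);	/* (10*8K + 4*16K)/24K = 6 */
	return 0;
}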
Link: https://lkml.kernel.org/r/20220927001946.85375-2-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 17 ++++++++--------- mm/damon/vaddr.c | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 624ec64993896..36d098d06c558 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -491,7 +491,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - sz += r->ar.end - r->ar.start; + sz += damon_sz_region(r); } if (ctx->attrs.min_nr_regions) @@ -674,7 +674,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); return s->pattern.min_sz_region <= sz && sz <= s->pattern.max_sz_region && s->pattern.min_nr_accesses <= r->nr_accesses && @@ -702,7 +702,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = r->ar.end - r->ar.start; + unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -731,14 +731,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, sz = ALIGN_DOWN(quota->charge_addr_from - r->ar.start, DAMON_MIN_REGION); if (!sz) { - if (r->ar.end - r->ar.start <= - DAMON_MIN_REGION) + if (damon_sz_region(r) <= + DAMON_MIN_REGION) continue; sz = DAMON_MIN_REGION; } damon_split_region_at(t, r, sz); r = damon_next_region(r); - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); } quota->charge_target_from = NULL; quota->charge_addr_from = 0; @@ -843,8 +843,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) continue; score = c->ops.get_scheme_score( c, t, r, s); - quota->histogram[score] += - r->ar.end - r->ar.start; + quota->histogram[score] += damon_sz_region(r); if (score > max_score) max_score = score; } @@ -958,7 +957,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) int i; damon_for_each_region_safe(r, next, t) { - sz_region = r->ar.end - r->ar.start; + sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && sz_region > 2 * DAMON_MIN_REGION; i++) { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 95444ee3f802a..4c953e4701f05 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, return -EINVAL; orig_end = r->ar.end; - sz_orig = r->ar.end - r->ar.start; + sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) @@ -617,7 +617,7 @@ static unsigned long damos_madvise(struct damon_target *target, { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); - unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); unsigned long applied; mm = damon_get_mm(target); From 9d3f2a04a319d7afac3c63d56893c6168f7999a8 Mon Sep 17 00:00:00 2001 From: Andrew Panyakin Date: Wed, 18 Jan 2023 20:46:21 +0000 Subject: [PATCH 589/737] ENA: Update to v2.8.1 Source: https://github.com/amzn/amzn-drivers/ Change Log: ## r2.8.1 release notes **New Features** * Add extended metrics mechanism support * Add conntrack customer metric to ethtool **Bug Fixes** * Fix compilation issues on SLES 15 SP4 * Fix compilation errors in RHEL 8.7, 9.0 * Configure TX rings mem policy in reset flow **Minor Changes** 
* Add napi_build_skb support * Add napi_consume_skb * Align ena_alloc_map_page signature * Move from strlcpy with unused retval to strscpy * Add status check for strscpy calls * Backport napi_alloc_skb usage Signed-off-by: Andrew Panyakin --- drivers/amazon/net/ena/ena_admin_defs.h | 31 ++++ drivers/amazon/net/ena/ena_com.c | 157 ++++++++++++++++---- drivers/amazon/net/ena/ena_com.h | 59 ++++++++ drivers/amazon/net/ena/ena_devlink.c | 2 + drivers/amazon/net/ena/ena_ethtool.c | 187 +++++++++++++++++------- drivers/amazon/net/ena/ena_netdev.c | 77 ++++++---- drivers/amazon/net/ena/ena_netdev.h | 4 +- drivers/amazon/net/ena/kcompat.h | 102 +++++++++++-- 8 files changed, 496 insertions(+), 123 deletions(-) diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index a52f588445039..b3a9f1aec52b3 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -10,6 +10,21 @@ #define ENA_ADMIN_RSS_KEY_PARTS 10 +#define ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK 0x3F +#define ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK 0x1F + + /* customer metrics - in correlation with + * ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + */ +enum ena_admin_customer_metrics_id { + ENA_ADMIN_BW_IN_ALLOWANCE_EXCEEDED = 0, + ENA_ADMIN_BW_OUT_ALLOWANCE_EXCEEDED = 1, + ENA_ADMIN_PPS_ALLOWANCE_EXCEEDED = 2, + ENA_ADMIN_CONNTRACK_ALLOWANCE_EXCEEDED = 3, + ENA_ADMIN_LINKLOCAL_ALLOWANCE_EXCEEDED = 4, + ENA_ADMIN_CONNTRACK_ALLOWANCE_AVAILABLE = 5, +}; + enum ena_admin_aq_opcode { ENA_ADMIN_CREATE_SQ = 1, ENA_ADMIN_DESTROY_SQ = 2, @@ -59,6 +74,7 @@ enum ena_admin_aq_caps_id { ENA_ADMIN_ENI_STATS = 0, /* ENA SRD customer metrics */ ENA_ADMIN_ENA_SRD_INFO = 1, + ENA_ADMIN_CUSTOMER_METRICS = 2, }; enum ena_admin_placement_policy_type { @@ -109,6 +125,8 @@ enum ena_admin_get_stats_type { ENA_ADMIN_GET_STATS_TYPE_ENI = 2, /* extra HW stats for ENA SRD */ ENA_ADMIN_GET_STATS_TYPE_ENA_SRD = 3, + ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS = 4, + }; enum ena_admin_get_stats_scope { @@ -387,6 +405,9 @@ struct ena_admin_aq_get_stats_cmd { * stats of other device */ u16 device_id; + + /* a bitmap representing the requested metric values */ + u64 requested_metrics; }; /* Basic Statistics Command. */ @@ -469,6 +490,14 @@ struct ena_admin_ena_srd_info { struct ena_admin_ena_srd_stats ena_srd_stats; }; +/* Customer Metrics Command. 
*/ +struct ena_admin_customer_metrics { + /* A bitmap representing the reported customer metrics according to + * the order they are reported + */ + u64 reported_metrics; +}; + struct ena_admin_acq_get_stats_resp { struct ena_admin_acq_common_desc acq_common_desc; @@ -480,6 +509,8 @@ struct ena_admin_acq_get_stats_resp { struct ena_admin_eni_stats eni_stats; struct ena_admin_ena_srd_info ena_srd_info; + + struct ena_admin_customer_metrics customer_metrics; } u; }; diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 520dad1e549af..9bd064ff0f6c7 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -2176,6 +2176,58 @@ int ena_com_get_link_params(struct ena_com_dev *ena_dev, return ena_com_get_feature(ena_dev, resp, ENA_ADMIN_LINK_CONFIG, 0); } +static int ena_get_dev_stats(struct ena_com_dev *ena_dev, + struct ena_com_stats_ctx *ctx, + enum ena_admin_get_stats_type type) +{ + struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; + struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; + struct ena_com_admin_queue *admin_queue; + int ret; + + admin_queue = &ena_dev->admin_queue; + + get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; + get_cmd->aq_common_descriptor.flags = 0; + get_cmd->type = type; + + ret = ena_com_execute_admin_command(admin_queue, + (struct ena_admin_aq_entry *)get_cmd, + sizeof(*get_cmd), + (struct ena_admin_acq_entry *)get_resp, + sizeof(*get_resp)); + + if (unlikely(ret)) + netdev_err(ena_dev->net_device, + "Failed to get stats. error: %d\n", ret); + + return ret; +} + +static void ena_com_set_supported_customer_metrics(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics; + struct ena_com_stats_ctx ctx; + int ret; + + customer_metrics = &ena_dev->customer_metrics; + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + customer_metrics->supported_metrics = ENA_ADMIN_CUSTOMER_METRICS_MIN_SUPPORT_MASK; + return; + } + + memset(&ctx, 0x0, sizeof(ctx)); + ctx.get_cmd.requested_metrics = ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + customer_metrics->supported_metrics = + ctx.get_resp.u.customer_metrics.reported_metrics; + else + netdev_err(ena_dev->net_device, + "Failed to query customer metrics support. error: %d\n", + ret); +} + int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, struct ena_com_dev_get_features_ctx *get_feat_ctx) { @@ -2259,6 +2311,8 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev, else return rc; + ena_com_set_supported_customer_metrics(ena_dev); + return 0; } @@ -2413,34 +2467,6 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev, return 0; } -static int ena_get_dev_stats(struct ena_com_dev *ena_dev, - struct ena_com_stats_ctx *ctx, - enum ena_admin_get_stats_type type) -{ - struct ena_admin_aq_get_stats_cmd *get_cmd = &ctx->get_cmd; - struct ena_admin_acq_get_stats_resp *get_resp = &ctx->get_resp; - struct ena_com_admin_queue *admin_queue; - int ret; - - admin_queue = &ena_dev->admin_queue; - - get_cmd->aq_common_descriptor.opcode = ENA_ADMIN_GET_STATS; - get_cmd->aq_common_descriptor.flags = 0; - get_cmd->type = type; - - ret = ena_com_execute_admin_command(admin_queue, - (struct ena_admin_aq_entry *)get_cmd, - sizeof(*get_cmd), - (struct ena_admin_acq_entry *)get_resp, - sizeof(*get_resp)); - - if (unlikely(ret)) - netdev_err(ena_dev->net_device, - "Failed to get stats. 
error: %d\n", ret); - - return ret; -} - int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, struct ena_admin_eni_stats *stats) { @@ -2500,6 +2526,53 @@ int ena_com_get_dev_basic_stats(struct ena_com_dev *ena_dev, return ret; } +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len) +{ + struct ena_admin_aq_get_stats_cmd *get_cmd; + struct ena_com_stats_ctx ctx; + int ret; + + if (unlikely(len > ena_dev->customer_metrics.buffer_len)) { + netdev_err(ena_dev->net_device, + "Invalid buffer size %u. The given buffer is too big.\n", + len); + return -EINVAL; + } + + if (!ena_com_get_cap(ena_dev, ENA_ADMIN_CUSTOMER_METRICS)) { + netdev_err(ena_dev->net_device, "Capability %d not supported.\n", + ENA_ADMIN_CUSTOMER_METRICS); + return -EOPNOTSUPP; + } + + if (!ena_dev->customer_metrics.supported_metrics) { + netdev_err(ena_dev->net_device, + "No supported customer metrics.\n"); + return -EOPNOTSUPP; + } + + get_cmd = &ctx.get_cmd; + memset(&ctx, 0x0, sizeof(ctx)); + ret = ena_com_mem_addr_set(ena_dev, + &get_cmd->u.control_buffer.address, + ena_dev->customer_metrics.buffer_dma_addr); + if (unlikely(ret)) { + netdev_err(ena_dev->net_device, "Memory address set failed.\n"); + return ret; + } + + get_cmd->u.control_buffer.length = ena_dev->customer_metrics.buffer_len; + get_cmd->requested_metrics = ena_dev->customer_metrics.supported_metrics; + ret = ena_get_dev_stats(ena_dev, &ctx, ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS); + if (likely(ret == 0)) + memcpy(buffer, ena_dev->customer_metrics.buffer_virt_addr, len); + else + netdev_err(ena_dev->net_device, + "Failed to get customer metrics. error: %d\n", ret); + + return ret; +} + int ena_com_set_dev_mtu(struct ena_com_dev *ena_dev, u32 mtu) { struct ena_com_admin_queue *admin_queue; @@ -3052,6 +3125,22 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, return 0; } +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + customer_metrics->buffer_len = ENA_CUSTOMER_METRICS_BUFFER_SIZE; + customer_metrics->buffer_virt_addr = + dma_zalloc_coherent(ena_dev->dmadev, + customer_metrics->buffer_len, + &customer_metrics->buffer_dma_addr, + GFP_KERNEL); + if (!customer_metrics->buffer_virt_addr) + return -ENOMEM; + + return 0; +} + void ena_com_delete_host_info(struct ena_com_dev *ena_dev) { struct ena_host_attribute *host_attr = &ena_dev->host_attr; @@ -3075,6 +3164,18 @@ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev) } } +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev) +{ + struct ena_customer_metrics *customer_metrics = &ena_dev->customer_metrics; + + if (customer_metrics->buffer_virt_addr) { + dma_free_coherent(ena_dev->dmadev, customer_metrics->buffer_len, + customer_metrics->buffer_virt_addr, + customer_metrics->buffer_dma_addr); + customer_metrics->buffer_virt_addr = NULL; + } +} + int ena_com_set_host_attributes(struct ena_com_dev *ena_dev) { struct ena_host_attribute *host_attr = &ena_dev->host_attr; diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index ab17ba125ca3c..3fd86b6f14e6b 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -42,6 +42,8 @@ #define ADMIN_CQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_acq_entry)) #define ADMIN_AENQ_SIZE(depth) ((depth) * sizeof(struct ena_admin_aenq_entry)) +#define ENA_CUSTOMER_METRICS_BUFFER_SIZE 512 + 
/*****************************************************************************/ /*****************************************************************************/ /* ENA adaptive interrupt moderation settings */ @@ -328,6 +330,16 @@ struct ena_rss { }; +struct ena_customer_metrics { + /* in correlation with ENA_ADMIN_CUSTOMER_METRICS_SUPPORT_MASK + * and ena_admin_customer_metrics_id + */ + u64 supported_metrics; + dma_addr_t buffer_dma_addr; + void *buffer_virt_addr; + u32 buffer_len; +}; + struct ena_host_attribute { /* Debug area */ u8 *debug_area_virt_addr; @@ -379,6 +391,8 @@ struct ena_com_dev { struct ena_intr_moder_entry *intr_moder_tbl; struct ena_com_llq_info llq_info; + + struct ena_customer_metrics customer_metrics; }; struct ena_com_dev_get_features_ctx { @@ -702,6 +716,15 @@ int ena_com_get_eni_stats(struct ena_com_dev *ena_dev, int ena_com_get_ena_srd_info(struct ena_com_dev *ena_dev, struct ena_admin_ena_srd_info *info); +/* ena_com_get_customer_metrics - Get customer metrics for network interface + * @ena_dev: ENA communication layer struct + * @buffer: buffer for returned customer metrics + * @len: size of the buffer + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_get_customer_metrics(struct ena_com_dev *ena_dev, char *buffer, u32 len); + /* ena_com_set_dev_mtu - Configure the device mtu. * @ena_dev: ENA communication layer struct * @mtu: mtu value @@ -912,6 +935,13 @@ int ena_com_allocate_host_info(struct ena_com_dev *ena_dev); int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, u32 debug_area_size); +/* ena_com_allocate_customer_metrics_buffer - Allocate customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * @return: 0 on Success and negative value otherwise. + */ +int ena_com_allocate_customer_metrics_buffer(struct ena_com_dev *ena_dev); + /* ena_com_delete_debug_area - Free the debug area resources. * @ena_dev: ENA communication layer struct * @@ -926,6 +956,13 @@ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); */ void ena_com_delete_host_info(struct ena_com_dev *ena_dev); +/* ena_com_delete_customer_metrics_buffer - Free the customer metrics resources. + * @ena_dev: ENA communication layer struct + * + * Free the allocated customer metrics area. + */ +void ena_com_delete_customer_metrics_buffer(struct ena_com_dev *ena_dev); + /* ena_com_set_host_attributes - Update the device with the host * attributes (debug area and host info) base address. * @ena_dev: ENA communication layer struct @@ -1082,6 +1119,28 @@ static inline bool ena_com_get_cap(struct ena_com_dev *ena_dev, return !!(ena_dev->capabilities & BIT(cap_id)); } +/* ena_com_get_customer_metric_support - query whether device supports a given customer metric. + * @ena_dev: ENA communication layer struct + * @metric_id: enum value representing the customer metric + * + * @return - true if customer metric is supported or false otherwise + */ +static inline bool ena_com_get_customer_metric_support(struct ena_com_dev *ena_dev, + enum ena_admin_customer_metrics_id metric_id) +{ + return !!(ena_dev->customer_metrics.supported_metrics & BIT(metric_id)); +} + +/* ena_com_get_customer_metric_count - return the number of supported customer metrics. 
+ * @ena_dev: ENA communication layer struct + * + * @return - the number of supported customer metrics + */ +static inline int ena_com_get_customer_metric_count(struct ena_com_dev *ena_dev) +{ + return hweight64(ena_dev->customer_metrics.supported_metrics); +} + /* ena_com_update_intr_reg - Prepare interrupt register * @intr_reg: interrupt register to update. * @rx_delay_interval: Rx interval in usecs diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c index 68b02270786c7..fce8d6c795a8b 100644 --- a/drivers/amazon/net/ena/ena_devlink.c +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -3,6 +3,8 @@ * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include "linux/pci.h" + #include "ena_devlink.h" #ifdef ENA_DEVLINK_SUPPORT diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 08f7ee8fc151c..797ca14a28b3a 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -16,6 +16,10 @@ struct ena_stats { int stat_offset; }; +struct ena_hw_metrics { + char name[ETH_GSTRING_LEN]; +}; + #define ENA_STAT_ENA_COM_ADMIN_ENTRY(stat) { \ .name = #stat, \ .stat_offset = offsetof(struct ena_com_stats_admin, stat) / sizeof(u64) \ @@ -56,6 +60,10 @@ struct ena_stats { .stat_offset = offsetof(struct ena_admin_ena_srd_info, flags) / sizeof(u64) \ } +#define ENA_METRIC_ENI_ENTRY(stat) { \ + .name = #stat \ +} + static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(tx_timeout), ENA_STAT_GLOBAL_ENTRY(suspend), @@ -67,6 +75,9 @@ static const struct ena_stats ena_stats_global_strings[] = { ENA_STAT_GLOBAL_ENTRY(reset_fail), }; +/* A partial list of hw stats. Used when admin command + * with type ENA_ADMIN_GET_STATS_TYPE_CUSTOMER_METRICS is not supported + */ static const struct ena_stats ena_stats_eni_strings[] = { ENA_STAT_ENI_ENTRY(bw_in_allowance_exceeded), ENA_STAT_ENI_ENTRY(bw_out_allowance_exceeded), @@ -75,6 +86,15 @@ static const struct ena_stats ena_stats_eni_strings[] = { ENA_STAT_ENI_ENTRY(linklocal_allowance_exceeded), }; +static const struct ena_hw_metrics ena_hw_stats_strings[] = { + ENA_METRIC_ENI_ENTRY(bw_in_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(bw_out_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(pps_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(linklocal_allowance_exceeded), + ENA_METRIC_ENI_ENTRY(conntrack_allowance_available), +}; + static const struct ena_stats ena_srd_info_strings[] = { ENA_STAT_ENA_SRD_MODE_ENTRY(ena_srd_mode), ENA_STAT_ENA_SRD_ENTRY(ena_srd_tx_pkts), @@ -163,6 +183,7 @@ static const struct ena_stats ena_stats_ena_com_phc_strings[] = { #define ENA_STATS_ARRAY_ENA_COM_PHC ARRAY_SIZE(ena_stats_ena_com_phc_strings) #define ENA_STATS_ARRAY_ENI ARRAY_SIZE(ena_stats_eni_strings) #define ENA_STATS_ARRAY_ENA_SRD ARRAY_SIZE(ena_srd_info_strings) +#define ENA_METRICS_ARRAY_ENI ARRAY_SIZE(ena_hw_stats_strings) static const char ena_priv_flags_strings[][ETH_GSTRING_LEN] = { #define ENA_PRIV_FLAGS_LPC BIT(0) @@ -177,9 +198,61 @@ static void ena_safe_update_stat(u64 *src, u64 *dst, unsigned int start; do { - start = u64_stats_fetch_begin_irq(syncp); + start = ena_u64_stats_fetch_begin(syncp); *(dst) = *src; - } while (u64_stats_fetch_retry_irq(syncp, start)); + } while (ena_u64_stats_fetch_retry(syncp, start)); +} + + +static void ena_metrics_stats(struct ena_adapter *adapter, u64 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_stats 
*ena_stats; + u64 *ptr; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + u32 supported_metrics_count; + int len; + + supported_metrics_count = ena_com_get_customer_metric_count(dev); + len = supported_metrics_count * sizeof(u64); + + /* Fill the data buffer, and advance its pointer */ + ena_com_get_customer_metrics(adapter->ena_dev, (char *)(*data), len); + (*data) += supported_metrics_count; + + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); + /* Updating regardless of rc - once we told ethtool how many stats we have + * it will print that much stats. We can't leave holes in the stats + */ + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + + ptr = (u64 *)&adapter->eni_stats + + ena_stats->stat_offset; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); + /* Get ENA SRD mode */ + ptr = (u64 *)&adapter->ena_srd_info; + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + /* Wrapped within an outer struct - need to accommodate an + * additional offset of the ENA SRD mode that was already processed + */ + ptr = (u64 *)&adapter->ena_srd_info + + ena_stats->stat_offset + 1; + + ena_safe_update_stat(ptr, (*data)++, &adapter->syncp); + } + } } static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) @@ -263,39 +336,8 @@ static void ena_get_stats(struct ena_adapter *adapter, ena_safe_update_stat(ptr, data++, &adapter->syncp); } - if (hw_stats_needed) { - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { - ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_stats); - /* Updating regardless of rc - once we told ethtool how many stats we have - * it will print that much stats. 
We can't leave holes in the stats - */ - for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { - ena_stats = &ena_stats_eni_strings[i]; - - ptr = (u64 *)&adapter->eni_stats + - ena_stats->stat_offset; - - ena_safe_update_stat(ptr, data++, &adapter->syncp); - } - } - - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { - ena_com_get_ena_srd_info(adapter->ena_dev, &adapter->ena_srd_info); - /* Get ENA SRD mode */ - ptr = (u64 *)&adapter->ena_srd_info; - ena_safe_update_stat(ptr, data++, &adapter->syncp); - for (i = 1; i < ENA_STATS_ARRAY_ENA_SRD; i++) { - ena_stats = &ena_srd_info_strings[i]; - /* Wrapped within an outer struct - need to accommodate an - * additional offset of the ENA SRD mode that was already processed - */ - ptr = (u64 *)&adapter->ena_srd_info + - ena_stats->stat_offset + 1; - - ena_safe_update_stat(ptr, data++, &adapter->syncp); - } - } - } + if (hw_stats_needed) + ena_metrics_stats(adapter, &data); ena_queue_stats(adapter, &data); ena_com_admin_queue_stats(adapter, &data); @@ -343,8 +385,16 @@ static int ena_get_sw_stats_count(struct ena_adapter *adapter) static int ena_get_hw_stats_count(struct ena_adapter *adapter) { - return ENA_STATS_ARRAY_ENI * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS) + - ENA_STATS_ARRAY_ENA_SRD * ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); + struct ena_com_dev *dev = adapter->ena_dev; + int count = ENA_STATS_ARRAY_ENA_SRD * + ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO); + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) + count += ena_com_get_customer_metric_count(dev); + else if (ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)) + count += ENA_STATS_ARRAY_ENI; + + return count; } int ena_get_sset_count(struct net_device *netdev, int sset) @@ -362,6 +412,35 @@ int ena_get_sset_count(struct net_device *netdev, int sset) return -EOPNOTSUPP; } +static void ena_metrics_stats_strings(struct ena_adapter *adapter, u8 **data) +{ + struct ena_com_dev *dev = adapter->ena_dev; + const struct ena_hw_metrics *ena_metrics; + const struct ena_stats *ena_stats; + int i; + + if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) { + for (i = 0; i < ENA_METRICS_ARRAY_ENI; i++) { + if (ena_com_get_customer_metric_support(dev, i)) { + ena_metrics = &ena_hw_stats_strings[i]; + ethtool_sprintf(data, ena_metrics->name); + } + } + } else if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { + for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { + ena_stats = &ena_stats_eni_strings[i]; + ethtool_sprintf(data, ena_stats->name); + } + } + + if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { + for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { + ena_stats = &ena_srd_info_strings[i]; + ethtool_sprintf(data, ena_stats->name); + } + } +} + static void ena_queue_strings(struct ena_adapter *adapter, u8 **data) { const struct ena_stats *ena_stats; @@ -430,20 +509,8 @@ static void ena_get_strings(struct ena_adapter *adapter, ethtool_sprintf(&data, ena_stats->name); } - if (hw_stats_needed) { - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENI_STATS)) { - for (i = 0; i < ENA_STATS_ARRAY_ENI; i++) { - ena_stats = &ena_stats_eni_strings[i]; - ethtool_sprintf(&data, ena_stats->name); - } - } - if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) { - for (i = 0; i < ENA_STATS_ARRAY_ENA_SRD; i++) { - ena_stats = &ena_srd_info_strings[i]; - ethtool_sprintf(&data, ena_stats->name); - } - } - } + if (hw_stats_needed) + ena_metrics_stats_strings(adapter, &data); ena_queue_strings(adapter, &data); ena_com_admin_strings(&data); 
@@ -644,11 +711,23 @@ static void ena_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct ena_adapter *adapter = netdev_priv(dev); + ssize_t ret = 0; + + ret = strscpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + if (ret < 0) + netif_info(adapter, drv, dev, + "module name will be truncated, status = %zd\n", ret); + + ret = strscpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); + if (ret < 0) + netif_info(adapter, drv, dev, + "module version will be truncated, status = %zd\n", ret); - strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_MODULE_GENERATION, sizeof(info->version)); - strlcpy(info->bus_info, pci_name(adapter->pdev), + ret = strscpy(info->bus_info, pci_name(adapter->pdev), sizeof(info->bus_info)); + if (ret < 0) + netif_info(adapter, drv, dev, + "bus info will be truncated, status = %zd\n", ret); info->n_priv_flags = ENA_PRIV_FLAGS_NR; } diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index fbb96d864d8c3..0595bb82a6eb6 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -57,7 +57,7 @@ MODULE_PARM_DESC(debug, "Debug level (-1=default,0=none,...,16=all)"); static int rx_queue_size = ENA_DEFAULT_RING_SIZE; module_param(rx_queue_size, int, 0444); -MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Max value is 8K\n"); +MODULE_PARM_DESC(rx_queue_size, "Rx queue size. The size should be a power of 2. Depending on instance type, max value can be up to 16K\n"); static int force_large_llq_header = 0; module_param(force_large_llq_header, int, 0444); @@ -600,7 +600,8 @@ static void ena_free_all_io_rx_resources(struct ena_adapter *adapter) ena_free_rx_resources(adapter, i); } -struct page *ena_alloc_map_page(struct ena_ring *rx_ring, dma_addr_t *dma) +struct page *ena_alloc_map_page(struct ena_ring *rx_ring, + dma_addr_t *dma) { struct page *page; @@ -869,7 +870,7 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) ena_unmap_tx_buff(tx_ring, tx_info); - dev_kfree_skb_any(tx_info->skb); + napi_consume_skb(tx_info->skb, 0); } netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid)); @@ -1001,7 +1002,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) skb); tx_bytes += tx_info->total_tx_size; - dev_kfree_skb(skb); + napi_consume_skb(skb, budget); tx_pkts++; total_done += tx_info->tx_descs; @@ -1050,15 +1051,15 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, #ifdef ENA_LINEAR_FRAG_SUPPORTED if (!first_frag) - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + skb = napi_alloc_skb(rx_ring->napi, len); else - skb = build_skb(first_frag, len); + skb = ena_build_skb(first_frag, len); #else if (!first_frag) - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, len); + skb = napi_alloc_skb(rx_ring->napi, len); else - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, - ENA_SKB_PULL_MIN_LEN); + skb = napi_alloc_skb(rx_ring->napi, + ENA_SKB_PULL_MIN_LEN); #endif /* ENA_LINEAR_FRAG_SUPPORTED */ if (unlikely(!skb)) { @@ -2036,10 +2037,7 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, napi_handler = ena_xdp_io_poll; #endif /* ENA_XDP_SUPPORT */ - netif_napi_add(adapter->netdev, - &napi->napi, - napi_handler, - NAPI_POLL_WEIGHT); + ena_netif_napi_add(adapter->netdev, &napi->napi, napi_handler); #ifdef ENA_BUSY_POLL_SUPPORT napi_hash_add(&adapter->ena_napi[i].napi); @@ -3006,7 +3004,7 @@ static netdev_tx_t 
ena_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = NULL; error_drop_packet: - dev_kfree_skb(skb); + napi_consume_skb(skb, 0); return NETDEV_TX_OK; } @@ -3078,6 +3076,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd { struct device *dev = &pdev->dev; struct ena_admin_host_info *host_info; + ssize_t ret; int rc; /* Allocate only the host info */ @@ -3092,8 +3091,11 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd host_info->bdf = (pdev->bus->number << 8) | pdev->devfn; host_info->os_type = ENA_ADMIN_OS_LINUX; host_info->kernel_ver = LINUX_VERSION_CODE; - strlcpy(host_info->kernel_ver_str, utsname()->version, + ret = strscpy(host_info->kernel_ver_str, utsname()->version, sizeof(host_info->kernel_ver_str) - 1); + if (ret < 0) + dev_info(dev, + "kernel version string will be truncated, status = %zd\n", ret); host_info->os_dist = 0; strncpy(host_info->os_dist_str, utsname()->release, sizeof(host_info->os_dist_str) - 1); @@ -3195,10 +3197,10 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, tx_ring = &adapter->tx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); packets = tx_ring->tx_stats.cnt; bytes = tx_ring->tx_stats.bytes; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->tx_packets += packets; stats->tx_bytes += bytes; @@ -3210,21 +3212,21 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, rx_ring = &adapter->rx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&rx_ring->syncp); + start = ena_u64_stats_fetch_begin(&rx_ring->syncp); packets = rx_ring->rx_stats.cnt; bytes = rx_ring->rx_stats.bytes; xdp_rx_drops += ena_ring_xdp_drops_cnt(rx_ring); - } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&rx_ring->syncp, start)); stats->rx_packets += packets; stats->rx_bytes += bytes; } do { - start = u64_stats_fetch_begin_irq(&adapter->syncp); + start = ena_u64_stats_fetch_begin(&adapter->syncp); rx_drops = adapter->dev_stats.rx_drops; tx_drops = adapter->dev_stats.tx_drops; - } while (u64_stats_fetch_retry_irq(&adapter->syncp, start)); + } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); stats->rx_dropped = rx_drops + xdp_rx_drops; stats->tx_dropped = tx_drops; @@ -3261,10 +3263,10 @@ static struct net_device_stats *ena_get_stats(struct net_device *netdev) tx_ring = &adapter->tx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); packets = (unsigned long)tx_ring->tx_stats.cnt; bytes = (unsigned long)tx_ring->tx_stats.bytes; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->tx_packets += packets; stats->tx_bytes += bytes; @@ -3272,19 +3274,19 @@ static struct net_device_stats *ena_get_stats(struct net_device *netdev) rx_ring = &adapter->rx_ring[i]; do { - start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); packets = (unsigned long)rx_ring->rx_stats.cnt; bytes = (unsigned long)rx_ring->rx_stats.bytes; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->rx_packets += packets; stats->rx_bytes += bytes; } do { - start = 
u64_stats_fetch_begin_irq(&tx_ring->syncp); + start = ena_u64_stats_fetch_begin(&tx_ring->syncp); rx_drops = (unsigned long)adapter->dev_stats.rx_drops; - } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + } while (ena_u64_stats_fetch_retry(&tx_ring->syncp, start)); stats->rx_dropped = rx_drops; @@ -3699,8 +3701,9 @@ int ena_restore_device(struct ena_adapter *adapter) struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_com_dev *ena_dev = adapter->ena_dev; struct pci_dev *pdev = adapter->pdev; + struct ena_ring *txr; + int rc, count, i; bool wd_state; - int rc; set_bit(ENA_FLAG_ONGOING_RESET, &adapter->flags); rc = ena_device_init(adapter, adapter->pdev, &get_feat_ctx, &wd_state); @@ -3710,6 +3713,12 @@ int ena_restore_device(struct ena_adapter *adapter) } adapter->wd_state = wd_state; + count = adapter->xdp_num_queues + adapter->num_io_queues; + for (i = 0 ; i < count; i++) { + txr = &adapter->tx_ring[i]; + txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + } + rc = ena_device_validate_params(adapter, &get_feat_ctx); if (rc) { dev_err(&pdev->dev, "Validation of device parameters failed\n"); @@ -4507,10 +4516,16 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->large_llq_header_enabled = !!force_large_llq_header; + rc = ena_com_allocate_customer_metrics_buffer(ena_dev); + if (rc) { + netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); + goto err_netdev_destroy; + } + devlink = ena_devlink_alloc(adapter); if (!devlink) { netdev_err(netdev, "ena_devlink_alloc failed\n"); - goto err_netdev_destroy; + goto err_metrics_destroy; } rc = ena_map_llq_mem_bar(pdev, ena_dev, bars); @@ -4671,6 +4686,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_devlink_free(devlink); err_netdev_destroy: free_netdev(netdev); +err_metrics_destroy: + ena_com_delete_customer_metrics_buffer(ena_dev); err_free_region: ena_release_bars(ena_dev, pdev); err_free_ena_dev: @@ -4737,6 +4754,8 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_com_delete_host_info(ena_dev); + ena_com_delete_customer_metrics_buffer(ena_dev); + ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 7b373cf6545e9..60409fa4a4b98 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 0 +#define DRV_MODULE_GEN_SUBMINOR 1 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -383,10 +383,12 @@ struct ena_adapter { u32 num_io_queues; u32 max_num_io_queues; + /* Local page cache size when it's enabled */ u32 configured_lpc_size; /* Current Local page cache size */ u32 used_lpc_size; + #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) struct msix_entry *msix_entries; #endif diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index fd7e80d0347ba..8e7aab52fb507 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -73,6 +73,7 @@ Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 #include #include #include +#include #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) #include @@ -503,6 +504,25 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync #endif +static inline bool ena_u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) + return u64_stats_fetch_retry_irq(syncp, start); +#else + return u64_stats_fetch_retry(syncp, start); +#endif +} + +static inline unsigned int ena_u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) + return u64_stats_fetch_begin_irq(syncp); +#else + return u64_stats_fetch_begin(syncp); +#endif +} + #if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) && \ !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)))) @@ -714,7 +734,9 @@ do { \ #endif #if defined(CONFIG_NET_DEVLINK) && \ - (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0)) + (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))) #define ENA_DEVLINK_RELOAD_ENABLING_REQUIRED #endif @@ -728,15 +750,20 @@ do { \ #define ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) #define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) #define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) #define ENA_DEVLINK_CONFIGURE_AFTER_REGISTER #endif @@ -839,16 +866,31 @@ static inline int numa_mem_id(void) #define fallthrough do {} while (0) /* fallthrough */ #endif -#ifndef NAPI_POLL_WEIGHT -#define NAPI_POLL_WEIGHT 64 -#endif - #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0) #define AF_XDP_BUSY_POLL_SUPPORTED #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) #define ENA_LINEAR_FRAG_SUPPORTED +static __always_inline struct sk_buff* +ena_build_skb(void *data, unsigned int frag_size) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0) + return napi_build_skb(data, frag_size); +#else + return build_skb(data, frag_size); +#endif +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0) && \ + !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 3)) && \ + !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4, 2, 0, 42)) +static __always_inline +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + dev_kfree_skb_any(skb); +} #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) @@ -892,7 +934,8 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ - 
!(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6))
+	!(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \
+	!(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4))
 static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
 {
 	memcpy(dev->dev_addr, addr, ETH_ALEN);
@@ -900,11 +943,15 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
 #endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \
-	(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6))
+	(defined(RHEL_RELEASE_CODE) && \
+	 RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6) && \
+	 RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0)) || \
+	(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4))
 #define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED
 #endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \
+	(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(8, 7))
 #define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE
 #endif
@@ -984,4 +1031,37 @@ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
 #endif /* CONFIG_PTP_1588_CLOCK */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)) && \
+	!(RHEL_RELEASE_CODE && \
+	  (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 2)))
+static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
+					     unsigned int length)
+{
+	return netdev_alloc_skb_ip_align(napi->dev, length);
+}
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)) && \
+	!(RHEL_RELEASE_CODE && \
+	  (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 7)))
+static inline ssize_t strscpy(char *dest, const char *src, size_t count)
+{
+	return (ssize_t)strlcpy(dest, src, count);
+}
+#endif
+
+static inline void ena_netif_napi_add(struct net_device *dev,
+				      struct napi_struct *napi,
+				      int (*poll)(struct napi_struct *, int))
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)
+#ifndef NAPI_POLL_WEIGHT
+#define NAPI_POLL_WEIGHT 64
+#endif
+	netif_napi_add(dev, napi, poll, NAPI_POLL_WEIGHT);
+#else
+	netif_napi_add(dev, napi, poll);
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */
+}
+
 #endif /* _KCOMPAT_H_ */

From 0ae44930c2b34fb04b2d70805217585dd5d788b3 Mon Sep 17 00:00:00 2001
From: Andrew Panyakin
Date: Fri, 10 Feb 2023 21:21:53 +0000
Subject: [PATCH 590/737] ENA: Update to v2.8.3

Source: https://github.com/amzn/amzn-drivers/

Change Log:

## r2.8.3 release notes

**New Features**
* PHC module param enablement
* PHC devlink param enablement
* Add hint for interrupt moderation for the device
* Change initial static RX interrupt moderation interval
* Enable DIM by default on all CPU Architectures

**Bug Fixes**
* DMA sync for CPU before accessing buffer
* Fix ena_probe destroy order
* Validate completion descriptors consistency
* Fix TX packets missing completion counter

**Minor Changes**
* Compilation fixes for RHEL 9.0, 9.1 and SLES 15SP4
* PHC info dynamic allocation
* Publish devlink reload for RHEL 9.0 and 9.1
* Add ENA Express documentation

## r2.8.2 release notes

**Bug Fixes**
* Fix devlink large LLQ config not fully applied

Signed-off-by: Andrew Panyakin
---
 drivers/amazon/net/ena/ena_com.c | 11 +--
 drivers/amazon/net/ena/ena_com.h | 15 +++-
 drivers/amazon/net/ena/ena_devlink.c | 80 ++++++++++++++++-
 drivers/amazon/net/ena/ena_devlink.h | 11 ++-
 drivers/amazon/net/ena/ena_eth_com.c | 38 +++++---
 drivers/amazon/net/ena/ena_eth_io_defs.h | 5 +-
drivers/amazon/net/ena/ena_ethtool.c | 22 +++-- drivers/amazon/net/ena/ena_netdev.c | 106 +++++++++++++++++------ drivers/amazon/net/ena/ena_netdev.h | 19 ++-- drivers/amazon/net/ena/ena_phc.c | 101 ++++++++++++++------- drivers/amazon/net/ena/ena_phc.h | 19 ++-- drivers/amazon/net/ena/ena_regs_defs.h | 1 + drivers/amazon/net/ena/ena_xdp.h | 1 + drivers/amazon/net/ena/kcompat.h | 17 +++- 14 files changed, 335 insertions(+), 111 deletions(-) diff --git a/drivers/amazon/net/ena/ena_com.c b/drivers/amazon/net/ena/ena_com.c index 9bd064ff0f6c7..889d3412a72df 100644 --- a/drivers/amazon/net/ena/ena_com.c +++ b/drivers/amazon/net/ena/ena_com.c @@ -1823,8 +1823,8 @@ int ena_com_phc_config(struct ena_com_dev *ena_dev) return ret; } - phc->enabled = true; - netdev_dbg(ena_dev->net_device, "PHC is enabled\n"); + phc->active = true; + netdev_dbg(ena_dev->net_device, "PHC is active in the device\n"); return ret; } @@ -1833,7 +1833,7 @@ void ena_com_phc_destroy(struct ena_com_dev *ena_dev) { struct ena_com_phc_info *phc = &ena_dev->phc; - phc->enabled = false; + phc->active = false; /* In case PHC is not supported by the device, silently exiting */ if (!phc->virt_addr) @@ -1855,8 +1855,9 @@ int ena_com_phc_get(struct ena_com_dev *ena_dev, u64 *timestamp) ktime_t block_time; int ret = 0; - if (!phc->enabled) { - netdev_err(ena_dev->net_device, "PHC feature is not enabled\n"); + if (!phc->active) { + netdev_err(ena_dev->net_device, + "PHC feature is not active in the device\n"); return -EOPNOTSUPP; } diff --git a/drivers/amazon/net/ena/ena_com.h b/drivers/amazon/net/ena/ena_com.h index 3fd86b6f14e6b..f44e59176e459 100644 --- a/drivers/amazon/net/ena/ena_com.h +++ b/drivers/amazon/net/ena/ena_com.h @@ -49,7 +49,7 @@ /* ENA adaptive interrupt moderation settings */ #define ENA_INTR_INITIAL_TX_INTERVAL_USECS 64 -#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0 +#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 20 #define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 #define ENA_HASH_KEY_SIZE 40 @@ -302,8 +302,8 @@ struct ena_com_phc_info { /* Request id sent to the device */ u16 req_id; - /* True if PHC is enabled */ - bool enabled; + /* True if PHC is active in the device */ + bool active; /* PHC shared memory - memory handle */ @@ -1146,13 +1146,16 @@ static inline int ena_com_get_customer_metric_count(struct ena_com_dev *ena_dev) * @rx_delay_interval: Rx interval in usecs * @tx_delay_interval: Tx interval in usecs * @unmask: unmask enable/disable + * @no_moderation_update: 0 - Indicates that any of the TX/RX intervals was + * updated, 1 - otherwise * * Prepare interrupt update register with the supplied parameters. 
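+ *
+ * A minimal usage sketch (illustrative only, not taken from the driver itself;
+ * the 20/64 usec values simply mirror the initial moderation defaults above):
+ *
+ *	struct ena_eth_io_intr_reg intr_reg;
+ *
+ *	ena_com_update_intr_reg(&intr_reg, 20, 64, true, false);
+ *
+ * The call above requests a 20 usec RX delay and a 64 usec TX delay, unmasks
+ * the interrupt, and (no_moderation_update == false) reports that the
+ * moderation intervals have been updated.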
*/ static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, u32 rx_delay_interval, u32 tx_delay_interval, - bool unmask) + bool unmask, + bool no_moderation_update) { intr_reg->intr_control = 0; intr_reg->intr_control |= rx_delay_interval & @@ -1164,6 +1167,10 @@ static inline void ena_com_update_intr_reg(struct ena_eth_io_intr_reg *intr_reg, if (unmask) intr_reg->intr_control |= ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK; + + intr_reg->intr_control |= + (((u32)no_moderation_update) << ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT) & + ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK; } static inline u8 *ena_com_get_next_bounce_buffer(struct ena_com_io_bounce_buffer_control *bounce_buf_ctrl) diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c index fce8d6c795a8b..f140d024ef166 100644 --- a/drivers/amazon/net/ena/ena_devlink.c +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -1,12 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "linux/pci.h" #include "ena_devlink.h" #ifdef ENA_DEVLINK_SUPPORT +#ifdef ENA_PHC_SUPPORT +#include "ena_phc.h" + +static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); +#endif /* ENA_PHC_SUPPORT */ static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, union devlink_param_value val, @@ -15,6 +22,9 @@ static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, enum ena_devlink_param_id { ENA_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, +#ifdef ENA_PHC_SUPPORT + ENA_DEVLINK_PARAM_ID_PHC_ENABLE, +#endif /* ENA_PHC_SUPPORT */ }; static const struct devlink_param ena_devlink_params[] = { @@ -22,6 +32,12 @@ static const struct devlink_param ena_devlink_params[] = { "large_llq_header", DEVLINK_PARAM_TYPE_BOOL, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, ena_devlink_llq_header_validate), +#ifdef ENA_PHC_SUPPORT + DEVLINK_PARAM_DRIVER(ENA_DEVLINK_PARAM_ID_PHC_ENABLE, + "phc_enable", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, ena_devlink_phc_enable_validate), + #endif /* ENA_PHC_SUPPORT */ }; static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, @@ -47,6 +63,25 @@ static int ena_devlink_llq_header_validate(struct devlink *devlink, u32 id, return 0; } +#ifdef ENA_PHC_SUPPORT +static int ena_devlink_phc_enable_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ena_adapter *adapter = ENA_DEVLINK_PRIV(devlink); + + if (!val.vbool) + return 0; + + if (!ena_com_phc_supported(adapter->ena_dev)) { + NL_SET_ERR_MSG_MOD(extack, "Device doesn't support PHC"); + return -EOPNOTSUPP; + } + + return 0; +} + +#endif /* ENA_PHC_SUPPORT */ #ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER /* Determines if ena_devlink_register has been called. 
* Prefer to check if the driver enabled reloading capabilities, but fallback @@ -82,6 +117,16 @@ void ena_devlink_params_get(struct devlink *devlink) } adapter->large_llq_header_enabled = val.vbool; +#ifdef ENA_PHC_SUPPORT + + err = devlink_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); + if (err) { + netdev_err(adapter->netdev, "Failed to query PHC param\n"); + return; + } + + ena_phc_enable(adapter, val.vbool); +#endif /* ENA_PHC_SUPPORT */ } void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) @@ -100,6 +145,22 @@ void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) value); } +#ifdef ENA_PHC_SUPPORT +void ena_devlink_disable_phc_param(struct devlink *devlink) +{ + union devlink_param_value value; + +#ifdef ENA_DEVLINK_CONFIGURE_AFTER_REGISTER + /* If devlink params aren't registered, don't access them */ + if (!ena_is_devlink_params_registered(devlink)) + return; + +#endif + value.vbool = false; + devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); +} + +#endif /* ENA_PHC_SUPPORT */ static int ena_devlink_reload_down(struct devlink *devlink, #ifdef ENA_DEVLINK_RELOAD_NS_CHANGE_SUPPORT bool netns_change, @@ -164,11 +225,11 @@ static int ena_devlink_reload_up(struct devlink *devlink, rtnl_lock(); /* Check that no other routine initialized the device (e.g. * ena_fw_reset_device()). Also we're under devlink_mutex here, - * so devink (and ena_adapter with it) isn't freed under our - * feet. + * so devlink isn't freed under our feet. */ if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags)) err = ena_restore_device(adapter); + rtnl_unlock(); #ifdef ENA_DEVLINK_RELOAD_LIMIT_AND_ACTION_SUPPORT @@ -220,9 +281,18 @@ static int ena_devlink_configure_params(struct devlink *devlink) ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, value); +#ifdef ENA_PHC_SUPPORT + value.vbool = ena_phc_is_enabled(adapter); + devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + +#endif /* ENA_PHC_SUPPORT */ #ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED devlink_set_features(devlink, DEVLINK_F_RELOAD); +#endif +#ifdef ENA_DEVLINK_PUBLISH_REQUIRED + devlink_params_publish(devlink); + #endif #ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED devlink_reload_enable(devlink); @@ -270,6 +340,10 @@ static void ena_devlink_configure_params_clean(struct devlink *devlink) #ifdef ENA_DEVLINK_RELOAD_ENABLING_REQUIRED devlink_reload_disable(devlink); +#endif +#ifdef ENA_DEVLINK_PUBLISH_REQUIRED + devlink_params_unpublish(devlink); + #endif devlink_params_unregister(devlink, ena_devlink_params, ARRAY_SIZE(ena_devlink_params)); diff --git a/drivers/amazon/net/ena/ena_devlink.h b/drivers/amazon/net/ena/ena_devlink.h index 8a047654b2f52..85c05cba00bd1 100644 --- a/drivers/amazon/net/ena/ena_devlink.h +++ b/drivers/amazon/net/ena/ena_devlink.h @@ -1,6 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* - * Copyright 2015-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2015-2023 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef DEVLINK_H @@ -12,9 +12,8 @@ #endif #ifdef ENA_DEVLINK_SUPPORT - #define ENA_DEVLINK_PRIV(devlink) \ - (*(struct ena_adapter **) devlink_priv(devlink)) + (*(struct ena_adapter **)devlink_priv(devlink)) struct devlink *ena_devlink_alloc(struct ena_adapter *adapter); void ena_devlink_free(struct devlink *devlink); @@ -22,9 +21,9 @@ void ena_devlink_register(struct devlink *devlink, struct device *dev); void ena_devlink_unregister(struct devlink *devlink); void ena_devlink_params_get(struct devlink *devlink); void ena_devlink_disable_large_llq_header_param(struct devlink *devlink); +void ena_devlink_disable_phc_param(struct devlink *devlink); #else /* ENA_DEVLINK_SUPPORT */ - #ifdef ENA_NO_DEVLINK_HEADERS struct devlink {}; #endif @@ -39,7 +38,7 @@ static inline void ena_devlink_register(struct devlink *devlink, struct device * static inline void ena_devlink_unregister(struct devlink *devlink) { } static inline void ena_devlink_params_get(struct devlink *devlink) { } static inline void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) { } +static inline void ena_devlink_disable_phc_param(struct devlink *devlink) { } #endif /* ENA_DEVLINK_SUPPORT */ - #endif /* DEVLINK_H */ diff --git a/drivers/amazon/net/ena/ena_eth_com.c b/drivers/amazon/net/ena/ena_eth_com.c index f9f886289b970..50afe66efb57a 100644 --- a/drivers/amazon/net/ena/ena_eth_com.c +++ b/drivers/amazon/net/ena/ena_eth_com.c @@ -233,31 +233,43 @@ static struct ena_eth_io_rx_cdesc_base * idx * io_cq->cdesc_entry_size_in_bytes); } -static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, - u16 *first_cdesc_idx) +static int ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, + u16 *first_cdesc_idx, + u16 *num_descs) { + u16 count = io_cq->cur_rx_pkt_cdesc_count, head_masked; struct ena_eth_io_rx_cdesc_base *cdesc; - u16 count = 0, head_masked; u32 last = 0; do { + u32 status; + cdesc = ena_com_get_next_rx_cdesc(io_cq); if (!cdesc) break; + status = READ_ONCE(cdesc->status); ena_com_cq_inc_head(io_cq); + if (unlikely((status & ENA_ETH_IO_RX_CDESC_BASE_FIRST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_FIRST_SHIFT && count != 0)) { + struct ena_com_dev *dev = ena_com_io_cq_to_ena_dev(io_cq); + + netdev_err(dev->net_device, + "First bit is on in descriptor #%d on q_id: %d, req_id: %u\n", + count, io_cq->qid, cdesc->req_id); + return -EFAULT; + } count++; - last = (READ_ONCE(cdesc->status) & - ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> - ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; + last = (status & ENA_ETH_IO_RX_CDESC_BASE_LAST_MASK) >> + ENA_ETH_IO_RX_CDESC_BASE_LAST_SHIFT; } while (!last); if (last) { *first_cdesc_idx = io_cq->cur_rx_pkt_cdesc_start_idx; - count += io_cq->cur_rx_pkt_cdesc_count; head_masked = io_cq->head & (io_cq->q_depth - 1); + *num_descs = count; io_cq->cur_rx_pkt_cdesc_count = 0; io_cq->cur_rx_pkt_cdesc_start_idx = head_masked; @@ -265,11 +277,11 @@ static u16 ena_com_cdesc_rx_pkt_get(struct ena_com_io_cq *io_cq, "ENA q_id: %d packets were completed. 
first desc idx %u descs# %d\n", io_cq->qid, *first_cdesc_idx, count); } else { - io_cq->cur_rx_pkt_cdesc_count += count; - count = 0; + io_cq->cur_rx_pkt_cdesc_count = count; + *num_descs = 0; } - return count; + return 0; } static int ena_com_create_meta(struct ena_com_io_sq *io_sq, @@ -546,10 +558,14 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, u16 cdesc_idx = 0; u16 nb_hw_desc; u16 i = 0; + int rc; WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); - nb_hw_desc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx); + rc = ena_com_cdesc_rx_pkt_get(io_cq, &cdesc_idx, &nb_hw_desc); + if (unlikely(rc != 0)) + return -EFAULT; + if (nb_hw_desc == 0) { ena_rx_ctx->descs = nb_hw_desc; return 0; diff --git a/drivers/amazon/net/ena/ena_eth_io_defs.h b/drivers/amazon/net/ena/ena_eth_io_defs.h index 332ac0d28ac7a..a4d6d0ee0193c 100755 --- a/drivers/amazon/net/ena/ena_eth_io_defs.h +++ b/drivers/amazon/net/ena/ena_eth_io_defs.h @@ -261,7 +261,8 @@ struct ena_eth_io_intr_reg { /* 14:0 : rx_intr_delay * 29:15 : tx_intr_delay * 30 : intr_unmask - * 31 : reserved + * 31 : no_moderation_update - 0 - moderation + * updated, 1 - moderation not updated */ u32 intr_control; }; @@ -381,6 +382,8 @@ struct ena_eth_io_numa_node_cfg_reg { #define ENA_ETH_IO_INTR_REG_TX_INTR_DELAY_MASK GENMASK(29, 15) #define ENA_ETH_IO_INTR_REG_INTR_UNMASK_SHIFT 30 #define ENA_ETH_IO_INTR_REG_INTR_UNMASK_MASK BIT(30) +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_SHIFT 31 +#define ENA_ETH_IO_INTR_REG_NO_MODERATION_UPDATE_MASK BIT(31) /* numa_node_cfg_reg */ #define ENA_ETH_IO_NUMA_NODE_CFG_REG_NUMA_MASK GENMASK(7, 0) diff --git a/drivers/amazon/net/ena/ena_ethtool.c b/drivers/amazon/net/ena/ena_ethtool.c index 797ca14a28b3a..f09801591d840 100644 --- a/drivers/amazon/net/ena/ena_ethtool.c +++ b/drivers/amazon/net/ena/ena_ethtool.c @@ -342,9 +342,8 @@ static void ena_get_stats(struct ena_adapter *adapter, ena_queue_stats(adapter, &data); ena_com_admin_queue_stats(adapter, &data); - if (ena_phc_enabled(adapter)) { + if (ena_phc_is_active(adapter)) ena_com_phc_stats(adapter, &data); - } } static void ena_get_ethtool_stats(struct net_device *netdev, @@ -377,7 +376,7 @@ static int ena_get_sw_stats_count(struct ena_adapter *adapter) + adapter->xdp_num_queues * ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM_ADMIN; - if (ena_phc_enabled(adapter)) + if (ena_phc_is_active(adapter)) count += ENA_STATS_ARRAY_ENA_COM_PHC; return count; @@ -515,9 +514,8 @@ static void ena_get_strings(struct ena_adapter *adapter, ena_queue_strings(adapter, &data); ena_com_admin_strings(&data); - if (ena_phc_enabled(adapter)) { + if (ena_phc_is_active(adapter)) ena_com_phc_strings(&data); - } } static void ena_get_ethtool_strings(struct net_device *netdev, @@ -637,8 +635,11 @@ static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter * val = ena_com_get_nonadaptive_moderation_interval_tx(adapter->ena_dev); - for (i = 0; i < adapter->num_io_queues; i++) - adapter->tx_ring[i].smoothed_interval = val; + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->tx_ring[i].interrupt_interval_changed = + adapter->tx_ring[i].interrupt_interval != val; + adapter->tx_ring[i].interrupt_interval = val; + } } static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) @@ -648,8 +649,11 @@ static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter * val = ena_com_get_nonadaptive_moderation_interval_rx(adapter->ena_dev); - for (i = 0; i < adapter->num_io_queues; 
i++) - adapter->rx_ring[i].smoothed_interval = val; + for (i = 0; i < adapter->num_io_queues; i++) { + adapter->rx_ring[i].interrupt_interval_changed = + adapter->rx_ring[i].interrupt_interval != val; + adapter->rx_ring[i].interrupt_interval = val; + } } static int ena_set_coalesce(struct net_device *net_dev, diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 0595bb82a6eb6..5308f35e29f5a 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -75,6 +75,12 @@ static int lpc_size = ENA_LPC_DEFAULT_MULTIPLIER; module_param(lpc_size, uint, 0444); MODULE_PARM_DESC(lpc_size, "Each local page cache (lpc) holds N * 1024 pages. This parameter sets N which is rounded up to a multiplier of 2. If zero, the page cache is disabled. Max: 32\n"); +#ifdef ENA_PHC_SUPPORT +static int phc_enable = 0; +module_param(phc_enable, uint, 0444); +MODULE_PARM_DESC(phc_enable, "Enable PHC.\n"); + +#endif /* ENA_PHC_SUPPORT */ static struct ena_aenq_handlers aenq_handlers; static struct workqueue_struct *ena_wq; @@ -319,8 +325,10 @@ void ena_init_io_rings(struct ena_adapter *adapter, txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; txr->sgl_size = adapter->max_tx_sgl_size; txr->enable_bql = enable_bql; - txr->smoothed_interval = + txr->interrupt_interval = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev); + /* Initial value, mark as true */ + txr->interrupt_interval_changed = true; txr->disable_meta_caching = adapter->disable_meta_caching; #ifdef ENA_XDP_SUPPORT spin_lock_init(&txr->xdp_tx_lock); @@ -335,8 +343,10 @@ void ena_init_io_rings(struct ena_adapter *adapter, rxr->ring_size = adapter->requested_rx_ring_size; rxr->rx_copybreak = adapter->rx_copybreak; rxr->sgl_size = adapter->max_rx_sgl_size; - rxr->smoothed_interval = + rxr->interrupt_interval = ena_com_get_nonadaptive_moderation_interval_rx(ena_dev); + /* Initial value, mark as true */ + rxr->interrupt_interval_changed = true; rxr->empty_rx_queue = 0; rxr->rx_headroom = NET_SKB_PAD; adapter->ena_napi[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; @@ -1103,6 +1113,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, struct ena_rx_buffer *rx_info; struct ena_adapter *adapter; int page_offset, pkt_offset; + dma_addr_t pre_reuse_paddr; u16 len, req_id, buf = 0; bool reuse_rx_buf_page; struct sk_buff *skb; @@ -1168,12 +1179,19 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + /* If XDP isn't loaded try to reuse part of the RX buffer */ reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); if (!reuse_rx_buf_page) ena_unmap_rx_buff(rx_ring, rx_info); + else + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) @@ -1226,11 +1244,18 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom); page_offset = rx_info->page_offset; + pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr); + reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); if (!reuse_rx_buf_page) ena_unmap_rx_buff(rx_ring, rx_info); + else + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, 
page_offset + buf_offset, len, buf_len); @@ -1528,6 +1553,8 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ena_increase_stat(&rx_ring->rx_stats.bad_desc_num, 1, &rx_ring->syncp); ena_reset_device(adapter, ENA_REGS_RESET_TOO_MANY_RX_DESCS); + } else if (rc == -EFAULT) { + ena_reset_device(adapter, ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED); } else { ena_increase_stat(&rx_ring->rx_stats.bad_req_id, 1, &rx_ring->syncp); @@ -1543,7 +1570,10 @@ static void ena_dim_work(struct work_struct *w) net_dim_get_rx_moderation(dim->mode, dim->profile_ix); struct ena_napi *ena_napi = container_of(dim, struct ena_napi, dim); - ena_napi->rx_ring->smoothed_interval = cur_moder.usec; + ena_napi->rx_ring->interrupt_interval = cur_moder.usec; + /* DIM will schedule the work in case there was a change in the profile. */ + ena_napi->rx_ring->interrupt_interval_changed = true; + dim->state = DIM_START_MEASURE; } @@ -1570,27 +1600,33 @@ static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring) { + u32 rx_interval = tx_ring->interrupt_interval; struct ena_eth_io_intr_reg intr_reg; -#ifdef ENA_XDP_SUPPORT - u32 rx_interval = tx_ring->smoothed_interval; -#else - u32 rx_interval = 0; -#endif + bool no_moderation_update = true; + /* Rx ring can be NULL when for XDP tx queues which don't have an * accompanying rx_ring pair. */ - if (rx_ring) + if (rx_ring) { rx_interval = ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev) ? - rx_ring->smoothed_interval : + rx_ring->interrupt_interval : ena_com_get_nonadaptive_moderation_interval_rx(rx_ring->ena_dev); + no_moderation_update &= !rx_ring->interrupt_interval_changed; + rx_ring->interrupt_interval_changed = false; + } + + no_moderation_update &= !tx_ring->interrupt_interval_changed; + tx_ring->interrupt_interval_changed = false; + /* Update intr register: rx intr delay, * tx intr delay and interrupt unmask */ ena_com_update_intr_reg(&intr_reg, rx_interval, - tx_ring->smoothed_interval, - true); + tx_ring->interrupt_interval, + true, + no_moderation_update); ena_increase_stat(&tx_ring->tx_stats.unmask_interrupt, 1, &tx_ring->syncp); @@ -2439,14 +2475,12 @@ int ena_up(struct ena_adapter *adapter) */ ena_init_napi_in_range(adapter, 0, io_queue_count); -#ifdef CONFIG_ARM64 - /* enable DIM by default on ARM machines, also needs to happen - * before enabling IRQs since DIM is ran from napi routine + /* Enabling DIM needs to happen before enabling IRQs since DIM + * is run from napi routine */ if (ena_com_interrupt_moderation_supported(adapter->ena_dev)) ena_com_enable_adaptive_moderation(adapter->ena_dev); -#endif rc = ena_request_io_irq(adapter); if (rc) goto err_req_irq; @@ -3407,7 +3441,7 @@ static void set_default_llq_configurations(struct ena_adapter *adapter, ENA_ADMIN_LIST_ENTRY_SIZE_256B); if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && - adapter->large_llq_header_enabled) { + adapter->large_llq_header_enabled) { llq_config->llq_ring_entry_size = ENA_ADMIN_LIST_ENTRY_SIZE_256B; llq_config->llq_ring_entry_size_value = 256; } else { @@ -3717,6 +3751,7 @@ int ena_restore_device(struct ena_adapter *adapter) for (i = 0 ; i < count; i++) { txr = &adapter->tx_ring[i]; txr->tx_mem_queue_type = ena_dev->tx_mem_queue_type; + txr->tx_max_header_size = ena_dev->tx_max_header_size; } rc = ena_device_validate_params(adapter, &get_feat_ctx); @@ -3880,8 +3915,6 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter 
*adapter, reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } - missed_tx++; - if (tx_buf->print_once) continue; @@ -3889,6 +3922,7 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, "TX hasn't completed, qid %d, index %d. %u usecs from last napi execution, napi scheduled: %d\n", tx_ring->qid, i, time_since_last_napi, napi_scheduled); + missed_tx++; tx_buf->print_once = 1; } } @@ -4388,10 +4422,12 @@ static int ena_calc_io_queue_size(struct ena_adapter *adapter, if ((llq->entry_size_ctrl_supported & ENA_ADMIN_LIST_ENTRY_SIZE_256B) && (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)) { max_tx_queue_size /= 2; - dev_info(&adapter->pdev->dev, "Forcing large headers and decreasing maximum TX queue size to %d\n", + dev_info(&adapter->pdev->dev, + "Forcing large headers and decreasing maximum TX queue size to %d\n", max_tx_queue_size); } else { - dev_err(&adapter->pdev->dev, "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); + dev_err(&adapter->pdev->dev, + "Forcing large headers failed: LLQ is disabled or device does not support large headers\n"); adapter->large_llq_header_enabled = false; ena_devlink_disable_large_llq_header_param(adapter->devlink); @@ -4514,12 +4550,22 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, adapter); + rc = ena_phc_alloc(adapter); + if (rc) { + netdev_err(netdev, "ena_phc_alloc failed\n"); + goto err_netdev_destroy; + } + adapter->large_llq_header_enabled = !!force_large_llq_header; +#ifdef ENA_PHC_SUPPORT + ena_phc_enable(adapter, !!phc_enable); + +#endif /* ENA_PHC_SUPPORT */ rc = ena_com_allocate_customer_metrics_buffer(ena_dev); if (rc) { netdev_err(netdev, "ena_com_allocate_customer_metrics_buffer failed\n"); - goto err_netdev_destroy; + goto err_free_phc; } devlink = ena_devlink_alloc(adapter); @@ -4684,10 +4730,12 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_com_admin_destroy(ena_dev); err_devlink_destroy: ena_devlink_free(devlink); -err_netdev_destroy: - free_netdev(netdev); err_metrics_destroy: ena_com_delete_customer_metrics_buffer(ena_dev); +err_free_phc: + ena_phc_free(adapter); +err_netdev_destroy: + free_netdev(netdev); err_free_region: ena_release_bars(ena_dev, pdev); err_free_ena_dev: @@ -4756,6 +4804,8 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_com_delete_customer_metrics_buffer(ena_dev); + ena_phc_free(adapter); + ena_release_bars(ena_dev, pdev); pci_disable_device(pdev); @@ -4873,13 +4923,19 @@ static struct pci_driver ena_pci_driver = { static int __init ena_init(void) { + int ret; + ena_wq = create_singlethread_workqueue(DRV_MODULE_NAME); if (!ena_wq) { pr_err("Failed to create workqueue\n"); return -ENOMEM; } - return pci_register_driver(&ena_pci_driver); + ret = pci_register_driver(&ena_pci_driver); + if (ret) + destroy_workqueue(ena_wq); + + return ret; } static void __exit ena_cleanup(void) diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 60409fa4a4b98..5098ac28966c5 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 1 +#define DRV_MODULE_GEN_SUBMINOR 3 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -125,8 +125,10 @@ struct ena_page_cache; +#ifdef ENA_PHC_SUPPORT struct ena_phc_info; +#endif struct ena_irq { irq_handler_t handler; 
void *data; @@ -320,8 +322,13 @@ struct ena_ring { enum ena_admin_placement_policy_type tx_mem_queue_type; struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; - u32 smoothed_interval; - u32 per_napi_packets; + u32 interrupt_interval; + /* Indicates whether interrupt interval has changed since previous set. + * This flag will be kept up, until cleared by the routine which updates + * the device with the modified interrupt interval value. + */ + bool interrupt_interval_changed; + u32 per_napi_packets; u16 non_empty_napi_events; struct u64_stats_sync syncp; union { @@ -421,6 +428,10 @@ struct ena_adapter { unsigned long missing_tx_completion_to; char name[ENA_NAME_MAX_LEN]; +#ifdef ENA_PHC_SUPPORT + + struct ena_phc_info *phc_info; +#endif unsigned long flags; /* TX */ @@ -459,8 +470,6 @@ struct ena_adapter { #endif u32 xdp_first_ring; u32 xdp_num_queues; - - struct ena_phc_info *phc_info; }; void ena_set_ethtool_ops(struct net_device *netdev); diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c index 46e21d3202a1b..8b89ae9efb4ec 100644 --- a/drivers/amazon/net/ena/ena_phc.c +++ b/drivers/amazon/net/ena/ena_phc.c @@ -3,6 +3,7 @@ * Copyright 2015-2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include "ena_devlink.h" #include "ena_phc.h" #ifdef ENA_PHC_SUPPORT @@ -17,7 +18,8 @@ static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) return -EOPNOTSUPP; } -static int ena_phc_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, int on) +static int ena_phc_feature_enable(struct ptp_clock_info *clock_info, struct ptp_clock_request *rq, + int on) { return -EOPNOTSUPP; } @@ -120,9 +122,38 @@ static struct ptp_clock_info ena_ptp_clock_info = { .gettime = ena_phc_gettime, .settime = ena_phc_settime, #endif /* ENA_PHC_SUPPORT_GETTIME64 */ - .enable = ena_phc_enable, + .enable = ena_phc_feature_enable, }; +/* Enable/Disable PHC by the kernel, affects on the next init flow */ +void ena_phc_enable(struct ena_adapter *adapter, bool enable) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + if (!phc_info) { + netdev_err(adapter->netdev, "phc_info is not allocated\n"); + return; + } + + phc_info->enabled = enable; +} + +/* Check if PHC is enabled by the kernel */ +bool ena_phc_is_enabled(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->enabled); +} + +/* PHC is activated if ptp clock is registered in the kernel */ +bool ena_phc_is_active(struct ena_adapter *adapter) +{ + struct ena_phc_info *phc_info = adapter->phc_info; + + return (phc_info && phc_info->clock); +} + static int ena_phc_register(struct ena_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; @@ -154,19 +185,34 @@ static int ena_phc_register(struct ena_adapter *adapter) return rc; } -bool ena_phc_enabled(struct ena_adapter *adapter) +static void ena_phc_unregister(struct ena_adapter *adapter) { struct ena_phc_info *phc_info = adapter->phc_info; - return (phc_info && phc_info->clock); + if (ena_phc_is_active(adapter)) { + ptp_clock_unregister(phc_info->clock); + phc_info->clock = NULL; + } } -static void ena_phc_unregister(struct ena_adapter *adapter) +int ena_phc_alloc(struct ena_adapter *adapter) { - struct ena_phc_info *phc_info = adapter->phc_info; + /* Allocate driver specific PHC info */ + adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); + if (unlikely(!adapter->phc_info)) { + netdev_err(adapter->netdev, "Failed to alloc phc_info\n"); + return -ENOMEM; + } - if 
(ena_phc_enabled(adapter)) - ptp_clock_unregister(phc_info->clock); + return 0; +} + +void ena_phc_free(struct ena_adapter *adapter) +{ + if (adapter->phc_info) { + vfree(adapter->phc_info); + adapter->phc_info = NULL; + } } int ena_phc_init(struct ena_adapter *adapter) @@ -175,13 +221,19 @@ int ena_phc_init(struct ena_adapter *adapter) struct net_device *netdev = adapter->netdev; int rc = -EOPNOTSUPP; - /* Validate phc feature is supported in the device */ + /* Validate PHC feature is supported in the device */ if (!ena_com_phc_supported(ena_dev)) { - netdev_dbg(netdev, "PHC feature is not supported\n"); + netdev_dbg(netdev, "PHC feature is not supported by the device\n"); + goto err_ena_com_phc_init; + } + + /* Validate PHC feature is enabled by the kernel */ + if (!ena_phc_is_enabled(adapter)) { + netdev_dbg(netdev, "PHC feature is not enabled by the kernel\n"); goto err_ena_com_phc_init; } - /* Allocate and initialize device specific PHC info */ + /* Initialize device specific PHC info */ rc = ena_com_phc_init(ena_dev); if (unlikely(rc)) { netdev_err(netdev, "Failed to init phc, error: %d\n", rc); @@ -195,50 +247,33 @@ int ena_phc_init(struct ena_adapter *adapter) goto err_ena_com_phc_config; } - /* Allocate and initialize driver specific PHC info */ - adapter->phc_info = vzalloc(sizeof(*adapter->phc_info)); - if (unlikely(!adapter->phc_info)) { - rc = -ENOMEM; - netdev_err(netdev, "Failed to alloc phc_info, error: %d\n", rc); - goto err_ena_com_phc_config; - } - /* Register to PTP class driver */ rc = ena_phc_register(adapter); if (unlikely(rc)) { netdev_err(netdev, "Failed to register phc, error: %d\n", rc); - goto err_ena_phc_register; + goto err_ena_com_phc_config; } return 0; -err_ena_phc_register: - vfree(adapter->phc_info); - adapter->phc_info = NULL; err_ena_com_phc_config: ena_com_phc_destroy(ena_dev); err_ena_com_phc_init: + ena_phc_enable(adapter, false); + ena_devlink_disable_phc_param(adapter->devlink); return rc; } void ena_phc_destroy(struct ena_adapter *adapter) { ena_phc_unregister(adapter); - - if (likely(adapter->phc_info)) { - vfree(adapter->phc_info); - adapter->phc_info = NULL; - } - ena_com_phc_destroy(adapter->ena_dev); } int ena_phc_get_index(struct ena_adapter *adapter) { - struct ena_phc_info *phc_info = adapter->phc_info; - - if (ena_phc_enabled(adapter)) - return ptp_clock_index(phc_info->clock); + if (ena_phc_is_active(adapter)) + return ptp_clock_index(adapter->phc_info->clock); return -1; } diff --git a/drivers/amazon/net/ena/ena_phc.h b/drivers/amazon/net/ena/ena_phc.h index f08ff473bd1e4..bb644d5f928fa 100644 --- a/drivers/amazon/net/ena/ena_phc.h +++ b/drivers/amazon/net/ena/ena_phc.h @@ -6,8 +6,6 @@ #ifndef ENA_PHC_H #define ENA_PHC_H -#include "ena_netdev.h" - #ifdef ENA_PHC_SUPPORT #include @@ -24,20 +22,29 @@ struct ena_phc_info { /* PHC lock */ spinlock_t lock; + + /* Enabled by kernel */ + bool enabled; }; -bool ena_phc_enabled(struct ena_adapter *adapter); +void ena_phc_enable(struct ena_adapter *adapter, bool enable); +bool ena_phc_is_enabled(struct ena_adapter *adapter); +bool ena_phc_is_active(struct ena_adapter *adapter); int ena_phc_get_index(struct ena_adapter *adapter); int ena_phc_init(struct ena_adapter *adapter); void ena_phc_destroy(struct ena_adapter *adapter); - +int ena_phc_alloc(struct ena_adapter *adapter); +void ena_phc_free(struct ena_adapter *adapter); #else /* ENA_PHC_SUPPORT */ -static inline bool ena_phc_enabled(struct ena_adapter *adapter) {return false; } +static inline void ena_phc_enable(struct ena_adapter *adapter, 
bool enable) { } +static inline bool ena_phc_is_enabled(struct ena_adapter *adapter) { return false; } +static inline bool ena_phc_is_active(struct ena_adapter *adapter) { return false; } static inline int ena_phc_get_index(struct ena_adapter *adapter) { return -1; } static inline int ena_phc_init(struct ena_adapter *adapter) { return 0; } static inline void ena_phc_destroy(struct ena_adapter *adapter) { } - +static inline int ena_phc_alloc(struct ena_adapter *adapter) { return 0; } +static inline void ena_phc_free(struct ena_adapter *adapter) { } #endif /* ENA_PHC_SUPPORT */ #endif /* ENA_PHC_H */ diff --git a/drivers/amazon/net/ena/ena_regs_defs.h b/drivers/amazon/net/ena/ena_regs_defs.h index ded18aa5162bc..bdbbc8b18df63 100755 --- a/drivers/amazon/net/ena/ena_regs_defs.h +++ b/drivers/amazon/net/ena/ena_regs_defs.h @@ -22,6 +22,7 @@ enum ena_regs_reset_reason_types { ENA_REGS_RESET_GENERIC = 13, ENA_REGS_RESET_MISS_INTERRUPT = 14, ENA_REGS_RESET_SUSPECTED_POLL_STARVATION = 15, + ENA_REGS_RESET_RX_DESCRIPTOR_MALFORMED = 16, ENA_REGS_RESET_LAST, }; diff --git a/drivers/amazon/net/ena/ena_xdp.h b/drivers/amazon/net/ena/ena_xdp.h index f6b60c0e5d7c6..dde8f9053f707 100644 --- a/drivers/amazon/net/ena/ena_xdp.h +++ b/drivers/amazon/net/ena/ena_xdp.h @@ -40,6 +40,7 @@ enum ENA_XDP_ACTIONS { ENA_XDP_REDIRECT = BIT(1), ENA_XDP_DROP = BIT(2) }; + #define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) int ena_setup_and_create_all_xdp_queues(struct ena_adapter *adapter); diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 8e7aab52fb507..fd44a3ebe0414 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -727,6 +727,14 @@ do { \ #define ENA_NO_DEVLINK_HEADERS #endif +#if defined(CONFIG_NET_DEVLINK) && \ + (KERNEL_VERSION(5, 1, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && \ + !((SUSE_VERSION != 0) && (SUSE_VERSION == 15 && (SUSE_PATCHLEVEL < 2 || SUSE_PATCHLEVEL >= 4))) && \ + !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE > UBUNTU_VERSION(5, 16, 0, 0)) && \ + !(RHEL_RELEASE_CODE)) +#define ENA_DEVLINK_PUBLISH_REQUIRED +#endif + #if defined(CONFIG_NET_DEVLINK) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) || \ (SUSE_VERSION && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 18))) @@ -757,7 +765,8 @@ do { \ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) || \ - (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ + (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) #define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED #endif @@ -945,13 +954,15 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) || \ (defined(RHEL_RELEASE_CODE) && \ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6) && \ - RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0)) || \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) #define ENA_EXTENDED_COALESCE_UAPI_WITH_CQE_SUPPORTED #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(8, 7)) + (defined(RHEL_RELEASE_CODE) && \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) && \ + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) #define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE #endif From 
0e0bbd266e2b6c8250735d7711286089e8fb5427 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino
Date: Fri, 3 Feb 2023 18:27:52 +0000
Subject: [PATCH 591/737] Add mpi3mr 8.2.1.0.0

The driver comes from the Broadcom download page:
https://www.broadcom.com/support/download-search?dk=9750-8i&pa=&pf=&pg=&pn=&po
(Go down to "Driver" section).

There's a more recent version of the driver (8.3.0.0), but the customer
has requested 8.2.1.0.0 for now, as instructed by Broadcom. Updating the
driver version should be trivial though.

Signed-off-by: Luiz Capitulino
---
 Documentation/scsi/mpi3mr.rst | 38 +
 drivers/scsi/Kconfig | 1 +
 drivers/scsi/Makefile | 1 +
 drivers/scsi/mpi3mr/GPL_license.txt | 340 +
 drivers/scsi/mpi3mr/Kconfig | 8 +
 drivers/scsi/mpi3mr/Makefile | 9 +
 drivers/scsi/mpi3mr/mpi/mpi30_api.h | 8 +
 drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h | 2258 +++++++
 drivers/scsi/mpi3mr/mpi/mpi30_image.h | 259 +
 drivers/scsi/mpi3mr/mpi/mpi30_init.h | 164 +
 drivers/scsi/mpi3mr/mpi/mpi30_ioc.h | 1021 +++
 drivers/scsi/mpi3mr/mpi/mpi30_pci.h | 43 +
 drivers/scsi/mpi3mr/mpi/mpi30_raid.h | 7 +
 drivers/scsi/mpi3mr/mpi/mpi30_sas.h | 45 +
 drivers/scsi/mpi3mr/mpi/mpi30_targ.h | 194 +
 drivers/scsi/mpi3mr/mpi/mpi30_tool.h | 289 +
 drivers/scsi/mpi3mr/mpi/mpi30_transport.h | 454 ++
 drivers/scsi/mpi3mr/mpi/mpi30_type.h | 7 +
 drivers/scsi/mpi3mr/mpi3mr.h | 1536 +++++
 drivers/scsi/mpi3mr/mpi3mr_app.c | 3346 ++++++++++
 drivers/scsi/mpi3mr/mpi3mr_app.h | 450 ++
 drivers/scsi/mpi3mr/mpi3mr_debug.h | 179 +
 drivers/scsi/mpi3mr/mpi3mr_debugfs.c | 224 +
 drivers/scsi/mpi3mr/mpi3mr_fw.c | 6778 ++++++++++++++++++++
 drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h | 112 +
 drivers/scsi/mpi3mr/mpi3mr_os.c | 6019 +++++++++++++++++
 drivers/scsi/mpi3mr/mpi3mr_transport.c | 3374 ++++++++++
 27 files changed, 27164 insertions(+)
 create mode 100644 Documentation/scsi/mpi3mr.rst
 create mode 100644 drivers/scsi/mpi3mr/GPL_license.txt
 create mode 100644 drivers/scsi/mpi3mr/Kconfig
 create mode 100644 drivers/scsi/mpi3mr/Makefile
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_api.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_image.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_init.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_ioc.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_pci.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_raid.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_sas.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_targ.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_tool.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_transport.h
 create mode 100644 drivers/scsi/mpi3mr/mpi/mpi30_type.h
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr.h
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_app.c
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_app.h
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_debug.h
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_debugfs.c
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_fw.c
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_os.c
 create mode 100644 drivers/scsi/mpi3mr/mpi3mr_transport.c

diff --git a/Documentation/scsi/mpi3mr.rst b/Documentation/scsi/mpi3mr.rst
new file mode 100644
index 0000000000000..6b39a91093198
--- /dev/null
+++ b/Documentation/scsi/mpi3mr.rst
@@ -0,0 +1,38 @@
+This file lists the module parameters supported by the mpi3mr driver and their use.
+
+poll_queues: Number of queues for io_uring poll mode (allowed values: 0 to 126, default=0).
+The mpi3mr driver supports io_uring on kernel versions >= 5.13, and this module parameter allows the user to designate one or more queues as poll_queues.
+Poll queues are disabled by default, hence the default value of 0.
+
+enable_segqueue: Enable segmented operational request & reply queues in the supported controllers (allowed values: 0 and 1, default = 1)
+Certain controllers managed by the mpi3mr driver can create operational request and reply queues backed by non-contiguous (segmented) memory.
+When set to 1, this option allows the driver to create the queues with segmented memory on the supported controllers.
+When set to 0, the driver creates the queues with contiguous memory.
+On controllers that do not support segmented queue creation, the driver always uses contiguous memory for queue creation, irrespective of this module parameter value.
+
+drv_dbg_level: Driver diagnostic buffer level (allowed values: 0, 1 and 2, default=1).
+The mpi3mr driver supports saving some of the driver/kernel log messages from the dmesg log into the controller's persistent memory when certain fault conditions occur in the controller.
+This feature helps to preserve information that could otherwise be lost, for example when the OS drive sits behind a controller that goes into fault.
+This option, when set to
+0 (disabled): disables the saving of messages into the controller's persistent memory.
+1 (minidump): captures the prints related to the specific faulting controller instance, up to the available persistent memory size.
+2 (fulldump): captures the minidump and, in addition, captures the complete dmesg logs, up to the available persistent memory size.
+
+logging_level: Enable additional debug prints in the driver (allowed values: 0 to 0x7fffffff, default=0)
+The mpi3mr driver logs only the mandatorily required information by default to avoid cluttering the kernel log.
+Additional debug prints can be enabled dynamically by providing the logging level through this module parameter, or by changing the logging level value through sysfs on a per-controller basis.
+The logging level set through the module parameter applies to all the controllers managed by the driver.
+To turn off the additional logging, the logging level has to be set to 0.
+The logging level is a bitmap and the individual values can be found in the "mpi3mr_debug.h" file. Setting this value to 0xFFFF will turn on the pertinent logs required to support debugging of many generic issues.
+
+enable_dif: Enable Data Integrity Format (DIF) for the supported drives (allowed values: 0 and 1, default=1)
+The controllers managed by the mpi3mr driver are capable of generating, checking and removing Protection Information (PI) for the drives which support DIF.
+The driver by default enables the feature in the controller and lets the kernel know that the driver and controller are capable of doing the PI generation and checking.
+When this parameter is set to 0, the driver will inform the kernel that the driver and controllers are not capable of supporting DIF.
+
+enable_dix: Enable Data Integrity Extension (DIX) for the supported drives (allowed values: 0 and 1, default=0)
+The controllers managed by the mpi3mr driver and the driver itself are capable of passing the Protection Information (PI) from the upper layers in the operating system for the DIF supported drives.
+By default, this capability is disabled in the controller and not exposed to the OS by the driver. +When this parameter is set to 0, the driver will inform the kernel about the DIX capabilities supported by the driver and controller and will handle the I/O requests sent with PI. + + diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 6524e1fe54d2e..79677ed29b1fe 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -482,6 +482,7 @@ config SCSI_ARCMSR source "drivers/scsi/esas2r/Kconfig" source "drivers/scsi/megaraid/Kconfig.megaraid" source "drivers/scsi/mpt3sas/Kconfig" +source "drivers/scsi/mpi3mr/Kconfig" source "drivers/scsi/smartpqi/Kconfig" source "drivers/scsi/ufs/Kconfig" diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index c00e3dd57990c..c5c1249c1166c 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_MEGARAID_LEGACY) += megaraid.o obj-$(CONFIG_MEGARAID_NEWGEN) += megaraid/ obj-$(CONFIG_MEGARAID_SAS) += megaraid/ obj-$(CONFIG_SCSI_MPT3SAS) += mpt3sas/ +obj-$(CONFIG_SCSI_MPI3MR) += mpi3mr/ obj-$(CONFIG_SCSI_UFSHCD) += ufs/ obj-$(CONFIG_SCSI_ACARD) += atp870u.o obj-$(CONFIG_SCSI_SUNESP) += esp_scsi.o sun_esp.o diff --git a/drivers/scsi/mpi3mr/GPL_license.txt b/drivers/scsi/mpi3mr/GPL_license.txt new file mode 100644 index 0000000000000..3912109b5cd65 --- /dev/null +++ b/drivers/scsi/mpi3mr/GPL_license.txt @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/drivers/scsi/mpi3mr/Kconfig b/drivers/scsi/mpi3mr/Kconfig new file mode 100644 index 0000000000000..d9846c03effc0 --- /dev/null +++ b/drivers/scsi/mpi3mr/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config SCSI_MPI3MR + tristate "Broadcom MPI3 Storage Controller Device Driver" + depends on PCI && SCSI + help + This driver supports Broadcom's Unified MPI3 based Storage & RAID Controllers. + diff --git a/drivers/scsi/mpi3mr/Makefile b/drivers/scsi/mpi3mr/Makefile new file mode 100644 index 0000000000000..06e44afa0b189 --- /dev/null +++ b/drivers/scsi/mpi3mr/Makefile @@ -0,0 +1,9 @@ +# mpi3mr makefile +obj-$(CONFIG_SCSI_MPI3MR) += mpi3mr.o +mpi3mr-y += mpi3mr_os.o \ + mpi3mr_fw.o \ + mpi3mr_app.o \ + mpi3mr_debugfs.o \ + mpi3mr_transport.o + + diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_api.h b/drivers/scsi/mpi3mr/mpi/mpi30_api.h new file mode 100644 index 0000000000000..1a13a68e44d6b --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_api.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2019-2022 Broadcom Inc. All rights reserved. 
+ * + */ +#ifndef MPI30_API_H +#define MPI30_API_H 1 +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h new file mode 100644 index 0000000000000..2500844772156 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h @@ -0,0 +1,2258 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_CNFG_H +#define MPI30_CNFG_H 1 +#define MPI3_CONFIG_PAGETYPE_IO_UNIT (0x00) +#define MPI3_CONFIG_PAGETYPE_MANUFACTURING (0x01) +#define MPI3_CONFIG_PAGETYPE_IOC (0x02) +#define MPI3_CONFIG_PAGETYPE_DRIVER (0x03) +#define MPI3_CONFIG_PAGETYPE_SECURITY (0x04) +#define MPI3_CONFIG_PAGETYPE_ENCLOSURE (0x11) +#define MPI3_CONFIG_PAGETYPE_DEVICE (0x12) +#define MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT (0x20) +#define MPI3_CONFIG_PAGETYPE_SAS_EXPANDER (0x21) +#define MPI3_CONFIG_PAGETYPE_SAS_PHY (0x23) +#define MPI3_CONFIG_PAGETYPE_SAS_PORT (0x24) +#define MPI3_CONFIG_PAGETYPE_PCIE_IO_UNIT (0x30) +#define MPI3_CONFIG_PAGETYPE_PCIE_SWITCH (0x31) +#define MPI3_CONFIG_PAGETYPE_PCIE_LINK (0x33) +#define MPI3_CONFIG_PAGEATTR_MASK (0xf0) +#define MPI3_CONFIG_PAGEATTR_READ_ONLY (0x00) +#define MPI3_CONFIG_PAGEATTR_CHANGEABLE (0x10) +#define MPI3_CONFIG_PAGEATTR_PERSISTENT (0x20) +#define MPI3_CONFIG_ACTION_PAGE_HEADER (0x00) +#define MPI3_CONFIG_ACTION_READ_DEFAULT (0x01) +#define MPI3_CONFIG_ACTION_READ_CURRENT (0x02) +#define MPI3_CONFIG_ACTION_WRITE_CURRENT (0x03) +#define MPI3_CONFIG_ACTION_READ_PERSISTENT (0x04) +#define MPI3_CONFIG_ACTION_WRITE_PERSISTENT (0x05) +#define MPI3_DEVICE_PGAD_FORM_MASK (0xf0000000) +#define MPI3_DEVICE_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_DEVICE_PGAD_FORM_HANDLE (0x20000000) +#define MPI3_DEVICE_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_SAS_EXPAND_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SAS_EXPAND_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM (0x10000000) +#define MPI3_SAS_EXPAND_PGAD_FORM_HANDLE (0x20000000) +#define MPI3_SAS_EXPAND_PGAD_PHYNUM_MASK (0x00ff0000) +#define MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT (16) +#define MPI3_SAS_EXPAND_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_SAS_PHY_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER (0x00000000) +#define MPI3_SAS_PHY_PGAD_PHY_NUMBER_MASK (0x000000ff) +#define MPI3_SASPORT_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SASPORT_PGAD_FORM_GET_NEXT_PORT (0x00000000) +#define MPI3_SASPORT_PGAD_FORM_PORT_NUM (0x10000000) +#define MPI3_SASPORT_PGAD_PORT_NUMBER_MASK (0x000000ff) +#define MPI3_ENCLOS_PGAD_FORM_MASK (0xf0000000) +#define MPI3_ENCLOS_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_ENCLOS_PGAD_FORM_HANDLE (0x10000000) +#define MPI3_ENCLOS_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_PCIE_SWITCH_PGAD_FORM_MASK (0xf0000000) +#define MPI3_PCIE_SWITCH_PGAD_FORM_GET_NEXT_HANDLE (0x00000000) +#define MPI3_PCIE_SWITCH_PGAD_FORM_HANDLE_PORT_NUM (0x10000000) +#define MPI3_PCIE_SWITCH_PGAD_FORM_HANDLE (0x20000000) +#define MPI3_PCIE_SWITCH_PGAD_PORTNUM_MASK (0x00ff0000) +#define MPI3_PCIE_SWITCH_PGAD_PORTNUM_SHIFT (16) +#define MPI3_PCIE_SWITCH_PGAD_HANDLE_MASK (0x0000ffff) +#define MPI3_PCIE_LINK_PGAD_FORM_MASK (0xf0000000) +#define MPI3_PCIE_LINK_PGAD_FORM_GET_NEXT_LINK (0x00000000) +#define MPI3_PCIE_LINK_PGAD_FORM_LINK_NUM (0x10000000) +#define MPI3_PCIE_LINK_PGAD_LINKNUM_MASK (0x000000ff) +#define MPI3_SECURITY_PGAD_FORM_MASK (0xf0000000) +#define MPI3_SECURITY_PGAD_FORM_GET_NEXT_SLOT (0x00000000) +#define MPI3_SECURITY_PGAD_FORM_SOT_NUM 
(0x10000000) +#define MPI3_SECURITY_PGAD_SLOT_GROUP_MASK (0x0000ff00) +#define MPI3_SECURITY_PGAD_SLOT_MASK (0x000000ff) +struct mpi3_config_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 page_version; + u8 page_number; + u8 page_type; + u8 action; + __le32 page_address; + __le16 page_length; + __le16 reserved16; + __le32 reserved18[2]; + union mpi3_sge_union sgl; +}; +struct mpi3_config_page_header { + u8 page_version; + u8 reserved01; + u8 page_number; + u8 page_attribute; + __le16 page_length; + u8 page_type; + u8 reserved07; +}; +#define MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK (0xf0) +#define MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT (4) +#define MPI3_SAS_NEG_LINK_RATE_PHYSICAL_MASK (0x0f) +#define MPI3_SAS_NEG_LINK_RATE_UNKNOWN_LINK_RATE (0x00) +#define MPI3_SAS_NEG_LINK_RATE_PHY_DISABLED (0x01) +#define MPI3_SAS_NEG_LINK_RATE_NEGOTIATION_FAILED (0x02) +#define MPI3_SAS_NEG_LINK_RATE_SATA_OOB_COMPLETE (0x03) +#define MPI3_SAS_NEG_LINK_RATE_PORT_SELECTOR (0x04) +#define MPI3_SAS_NEG_LINK_RATE_SMP_RESET_IN_PROGRESS (0x05) +#define MPI3_SAS_NEG_LINK_RATE_UNSUPPORTED_PHY (0x06) +#define MPI3_SAS_NEG_LINK_RATE_1_5 (0x08) +#define MPI3_SAS_NEG_LINK_RATE_3_0 (0x09) +#define MPI3_SAS_NEG_LINK_RATE_6_0 (0x0a) +#define MPI3_SAS_NEG_LINK_RATE_12_0 (0x0b) +#define MPI3_SAS_NEG_LINK_RATE_22_5 (0x0c) +#define MPI3_SAS_APHYINFO_INSIDE_ZPSDS_PERSISTENT (0x00000040) +#define MPI3_SAS_APHYINFO_REQUESTED_INSIDE_ZPSDS (0x00000020) +#define MPI3_SAS_APHYINFO_BREAK_REPLY_CAPABLE (0x00000010) +#define MPI3_SAS_APHYINFO_REASON_MASK (0x0000000f) +#define MPI3_SAS_APHYINFO_REASON_UNKNOWN (0x00000000) +#define MPI3_SAS_APHYINFO_REASON_POWER_ON (0x00000001) +#define MPI3_SAS_APHYINFO_REASON_HARD_RESET (0x00000002) +#define MPI3_SAS_APHYINFO_REASON_SMP_PHY_CONTROL (0x00000003) +#define MPI3_SAS_APHYINFO_REASON_LOSS_OF_SYNC (0x00000004) +#define MPI3_SAS_APHYINFO_REASON_MULTIPLEXING_SEQ (0x00000005) +#define MPI3_SAS_APHYINFO_REASON_IT_NEXUS_LOSS_TIMER (0x00000006) +#define MPI3_SAS_APHYINFO_REASON_BREAK_TIMEOUT (0x00000007) +#define MPI3_SAS_APHYINFO_REASON_PHY_TEST_STOPPED (0x00000008) +#define MPI3_SAS_APHYINFO_REASON_EXP_REDUCED_FUNC (0x00000009) +#define MPI3_SAS_PHYINFO_STATUS_MASK (0xc0000000) +#define MPI3_SAS_PHYINFO_STATUS_SHIFT (30) +#define MPI3_SAS_PHYINFO_STATUS_ACCESSIBLE (0x00000000) +#define MPI3_SAS_PHYINFO_STATUS_NOT_EXIST (0x40000000) +#define MPI3_SAS_PHYINFO_STATUS_VACANT (0x80000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_MASK (0x18000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_ACTIVE (0x00000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_PARTIAL (0x08000000) +#define MPI3_SAS_PHYINFO_PHY_POWER_CONDITION_SLUMBER (0x10000000) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_CHANGED_MASK (0x04000000) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_CHANGED_SHIFT (26) +#define MPI3_SAS_PHYINFO_INSIDE_ZPSDS_PERSISTENT_MASK (0x02000000) +#define MPI3_SAS_PHYINFO_INSIDE_ZPSDS_PERSISTENT_SHIFT (25) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_MASK (0x01000000) +#define MPI3_SAS_PHYINFO_REQUESTED_INSIDE_ZPSDS_SHIFT (24) +#define MPI3_SAS_PHYINFO_ZONE_GROUP_PERSISTENT (0x00400000) +#define MPI3_SAS_PHYINFO_INSIDE_ZPSDS_WITHIN (0x00200000) +#define MPI3_SAS_PHYINFO_ZONING_ENABLED (0x00100000) +#define MPI3_SAS_PHYINFO_REASON_MASK (0x000f0000) +#define MPI3_SAS_PHYINFO_REASON_UNKNOWN (0x00000000) +#define MPI3_SAS_PHYINFO_REASON_POWER_ON (0x00010000) +#define 
MPI3_SAS_PHYINFO_REASON_HARD_RESET (0x00020000) +#define MPI3_SAS_PHYINFO_REASON_SMP_PHY_CONTROL (0x00030000) +#define MPI3_SAS_PHYINFO_REASON_LOSS_OF_SYNC (0x00040000) +#define MPI3_SAS_PHYINFO_REASON_MULTIPLEXING_SEQ (0x00050000) +#define MPI3_SAS_PHYINFO_REASON_IT_NEXUS_LOSS_TIMER (0x00060000) +#define MPI3_SAS_PHYINFO_REASON_BREAK_TIMEOUT (0x00070000) +#define MPI3_SAS_PHYINFO_REASON_PHY_TEST_STOPPED (0x00080000) +#define MPI3_SAS_PHYINFO_REASON_EXP_REDUCED_FUNC (0x00090000) +#define MPI3_SAS_PHYINFO_SATA_PORT_ACTIVE (0x00004000) +#define MPI3_SAS_PHYINFO_SATA_PORT_SELECTOR_PRESENT (0x00002000) +#define MPI3_SAS_PHYINFO_VIRTUAL_PHY (0x00001000) +#define MPI3_SAS_PHYINFO_PARTIAL_PATHWAY_TIME_MASK (0x00000f00) +#define MPI3_SAS_PHYINFO_PARTIAL_PATHWAY_TIME_SHIFT (8) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_MASK (0x000000f0) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_DIRECT (0x00000000) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_SUBTRACTIVE (0x00000010) +#define MPI3_SAS_PHYINFO_ROUTING_ATTRIBUTE_TABLE (0x00000020) +#define MPI3_SAS_PRATE_MAX_RATE_MASK (0xf0) +#define MPI3_SAS_PRATE_MAX_RATE_NOT_PROGRAMMABLE (0x00) +#define MPI3_SAS_PRATE_MAX_RATE_1_5 (0x80) +#define MPI3_SAS_PRATE_MAX_RATE_3_0 (0x90) +#define MPI3_SAS_PRATE_MAX_RATE_6_0 (0xa0) +#define MPI3_SAS_PRATE_MAX_RATE_12_0 (0xb0) +#define MPI3_SAS_PRATE_MAX_RATE_22_5 (0xc0) +#define MPI3_SAS_PRATE_MIN_RATE_MASK (0x0f) +#define MPI3_SAS_PRATE_MIN_RATE_NOT_PROGRAMMABLE (0x00) +#define MPI3_SAS_PRATE_MIN_RATE_1_5 (0x08) +#define MPI3_SAS_PRATE_MIN_RATE_3_0 (0x09) +#define MPI3_SAS_PRATE_MIN_RATE_6_0 (0x0a) +#define MPI3_SAS_PRATE_MIN_RATE_12_0 (0x0b) +#define MPI3_SAS_PRATE_MIN_RATE_22_5 (0x0c) +#define MPI3_SAS_HWRATE_MAX_RATE_MASK (0xf0) +#define MPI3_SAS_HWRATE_MAX_RATE_1_5 (0x80) +#define MPI3_SAS_HWRATE_MAX_RATE_3_0 (0x90) +#define MPI3_SAS_HWRATE_MAX_RATE_6_0 (0xa0) +#define MPI3_SAS_HWRATE_MAX_RATE_12_0 (0xb0) +#define MPI3_SAS_HWRATE_MAX_RATE_22_5 (0xc0) +#define MPI3_SAS_HWRATE_MIN_RATE_MASK (0x0f) +#define MPI3_SAS_HWRATE_MIN_RATE_1_5 (0x08) +#define MPI3_SAS_HWRATE_MIN_RATE_3_0 (0x09) +#define MPI3_SAS_HWRATE_MIN_RATE_6_0 (0x0a) +#define MPI3_SAS_HWRATE_MIN_RATE_12_0 (0x0b) +#define MPI3_SAS_HWRATE_MIN_RATE_22_5 (0x0c) +#define MPI3_SLOT_INVALID (0xffff) +#define MPI3_SLOT_INDEX_INVALID (0xffff) +#define MPI3_LINK_CHANGE_COUNT_INVALID (0xffff) +#define MPI3_RATE_CHANGE_COUNT_INVALID (0xffff) +#define MPI3_TEMP_SENSOR_LOCATION_INTERNAL (0x0) +#define MPI3_TEMP_SENSOR_LOCATION_INLET (0x1) +#define MPI3_TEMP_SENSOR_LOCATION_OUTLET (0x2) +#define MPI3_TEMP_SENSOR_LOCATION_DRAM (0x3) +#define MPI3_MFGPAGE_VENDORID_BROADCOM (0x1000) +#define MPI3_MFGPAGE_DEVID_SAS4116 (0x00a5) +struct mpi3_man_page0 { + struct mpi3_config_page_header header; + u8 chip_revision[8]; + u8 chip_name[32]; + u8 board_name[32]; + u8 board_assembly[32]; + u8 board_tracer_number[32]; + __le32 board_power; + __le32 reserved94; + __le32 reserved98; + u8 oem; + u8 profile_identifier; + __le16 flags; + u8 board_mfg_day; + u8 board_mfg_month; + __le16 board_mfg_year; + u8 board_rework_day; + u8 board_rework_month; + __le16 board_rework_year; + u8 board_revision[8]; + u8 e_pack_fru[16]; + u8 product_name[256]; +}; +#define MPI3_MAN0_PAGEVERSION (0x00) +#define MPI3_MAN0_FLAGS_SWITCH_PRESENT (0x0002) +#define MPI3_MAN0_FLAGS_EXPANDER_PRESENT (0x0001) +#define MPI3_MAN1_VPD_SIZE (512) +struct mpi3_man_page1 { + struct mpi3_config_page_header header; + __le32 reserved08[2]; + u8 vpd[MPI3_MAN1_VPD_SIZE]; +}; +#define MPI3_MAN1_PAGEVERSION (0x00) +struct 
mpi3_man_page2 { + struct mpi3_config_page_header header; + u8 flags; + u8 reserved09[3]; + __le32 reserved0c[3]; + u8 oem_board_tracer_number[32]; +}; +#define MPI3_MAN2_PAGEVERSION (0x00) +#define MPI3_MAN2_FLAGS_TRACER_PRESENT (0x01) +struct mpi3_man5_phy_entry { + __le64 ioc_wwid; + __le64 device_name; + __le64 sata_wwid; +}; +#ifndef MPI3_MAN5_PHY_MAX +#define MPI3_MAN5_PHY_MAX (1) +#endif +struct mpi3_man_page5 { + struct mpi3_config_page_header header; + u8 num_phys; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_man5_phy_entry phy[MPI3_MAN5_PHY_MAX]; +}; +#define MPI3_MAN5_PAGEVERSION (0x00) +struct mpi3_man6_gpio_entry { + u8 function_code; + u8 function_flags; + __le16 flags; + u8 param1; + u8 param2; + __le16 reserved06; + __le32 param3; +}; +#define MPI3_MAN6_GPIO_FUNCTION_GENERIC (0x00) +#define MPI3_MAN6_GPIO_FUNCTION_ALTERNATE (0x01) +#define MPI3_MAN6_GPIO_FUNCTION_EXT_INTERRUPT (0x02) +#define MPI3_MAN6_GPIO_FUNCTION_GLOBAL_ACTIVITY (0x03) +#define MPI3_MAN6_GPIO_FUNCTION_OVER_TEMPERATURE (0x04) +#define MPI3_MAN6_GPIO_FUNCTION_PORT_STATUS_GREEN (0x05) +#define MPI3_MAN6_GPIO_FUNCTION_PORT_STATUS_YELLOW (0x06) +#define MPI3_MAN6_GPIO_FUNCTION_CABLE_MANAGEMENT (0x07) +#define MPI3_MAN6_GPIO_FUNCTION_BKPLANE_MGMT_TYPE (0x08) +#define MPI3_MAN6_GPIO_FUNCTION_ISTWI_RESET (0x0a) +#define MPI3_MAN6_GPIO_FUNCTION_BACKEND_PCIE_RESET (0x0b) +#define MPI3_MAN6_GPIO_FUNCTION_GLOBAL_FAULT (0x0c) +#define MPI3_MAN6_GPIO_FUNCTION_PBLP_STATUS_CHANGE (0x0d) +#define MPI3_MAN6_GPIO_FUNCTION_EPACK_ONLINE (0x0e) +#define MPI3_MAN6_GPIO_FUNCTION_EPACK_FAULT (0x0f) +#define MPI3_MAN6_GPIO_FUNCTION_CTRL_TYPE (0x10) +#define MPI3_MAN6_GPIO_FUNCTION_LICENSE (0x11) +#define MPI3_MAN6_GPIO_FUNCTION_REFCLK_CONTROL (0x12) +#define MPI3_MAN6_GPIO_FUNCTION_BACKEND_PCIE_RESET_CLAMP (0x13) +#define MPI3_MAN6_GPIO_FUNCTION_AUXILIARY_POWER (0x14) +#define MPI3_MAN6_GPIO_FUNCTION_RAID_DATA_CACHE_DIRTY (0x15) +#define MPI3_MAN6_GPIO_FUNCTION_BOARD_FAN_CONTROL (0x16) +#define MPI3_MAN6_GPIO_FUNCTION_BOARD_FAN_FAULT (0x17) +#define MPI3_MAN6_GPIO_FUNCTION_POWER_BRAKE (0x18) +#define MPI3_MAN6_GPIO_ISTWI_RESET_FUNCTIONFLAGS_DEVSELECT_MASK (0x01) +#define MPI3_MAN6_GPIO_ISTWI_RESET_FUNCTIONFLAGS_DEVSELECT_ISTWI (0x00) +#define MPI3_MAN6_GPIO_ISTWI_RESET_FUNCTIONFLAGS_DEVSELECT_RECEPTACLEID (0x01) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_MASK (0xf0) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_GENERIC (0x00) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_CABLE_MGMT (0x10) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_SOURCE_ACTIVE_CABLE_OVERCURRENT (0x20) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_MASK (0x01) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_EDGE (0x00) +#define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_LEVEL (0x01) +#define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ALL_UP (0x00) +#define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ONE_OR_MORE_UP (0x01) +#define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_MODULE_PRESENT (0x00) +#define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_ACTIVE_CABLE_ENABLE (0x01) +#define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_CABLE_MGMT_ENABLE (0x02) +#define MPI3_MAN6_GPIO_LICENSE_PARAM1_TYPE_IBUTTON (0x00) +#define MPI3_MAN6_GPIO_FLAGS_SLEW_RATE_MASK (0x0100) +#define MPI3_MAN6_GPIO_FLAGS_SLEW_RATE_FAST_EDGE (0x0100) +#define MPI3_MAN6_GPIO_FLAGS_SLEW_RATE_SLOW_EDGE (0x0000) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_MASK (0x00c0) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_100OHM (0x0000) +#define 
MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_66OHM (0x0040) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_50OHM (0x0080) +#define MPI3_MAN6_GPIO_FLAGS_DRIVE_STRENGTH_33OHM (0x00c0) +#define MPI3_MAN6_GPIO_FLAGS_ALT_DATA_SEL_MASK (0x0030) +#define MPI3_MAN6_GPIO_FLAGS_ALT_DATA_SEL_SHIFT (4) +#define MPI3_MAN6_GPIO_FLAGS_ACTIVE_HIGH (0x0008) +#define MPI3_MAN6_GPIO_FLAGS_BI_DIR_ENABLED (0x0004) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_MASK (0x0003) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_INPUT (0x0000) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_OPEN_DRAIN_OUTPUT (0x0001) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_OPEN_SOURCE_OUTPUT (0x0002) +#define MPI3_MAN6_GPIO_FLAGS_DIRECTION_PUSH_PULL_OUTPUT (0x0003) +#ifndef MPI3_MAN6_GPIO_MAX +#define MPI3_MAN6_GPIO_MAX (1) +#endif +struct mpi3_man_page6 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_gpio; + u8 reserved0d[3]; + struct mpi3_man6_gpio_entry gpio[MPI3_MAN6_GPIO_MAX]; +}; +#define MPI3_MAN6_PAGEVERSION (0x00) +#define MPI3_MAN6_FLAGS_HEARTBEAT_LED_DISABLED (0x0001) +struct mpi3_man7_receptacle_info { + __le32 name[4]; + u8 location; + u8 connector_type; + u8 ped_clk; + u8 connector_id; + __le32 reserved14; +}; +#define MPI3_MAN7_LOCATION_UNKNOWN (0x00) +#define MPI3_MAN7_LOCATION_INTERNAL (0x01) +#define MPI3_MAN7_LOCATION_EXTERNAL (0x02) +#define MPI3_MAN7_LOCATION_VIRTUAL (0x03) +#define MPI3_MAN7_PEDCLK_ROUTING_MASK (0x10) +#define MPI3_MAN7_PEDCLK_ROUTING_DIRECT (0x00) +#define MPI3_MAN7_PEDCLK_ROUTING_CLOCK_BUFFER (0x10) +#define MPI3_MAN7_PEDCLK_ID_MASK (0x0f) +#ifndef MPI3_MAN7_RECEPTACLE_INFO_MAX +#define MPI3_MAN7_RECEPTACLE_INFO_MAX (1) +#endif +struct mpi3_man_page7 { + struct mpi3_config_page_header header; + __le32 flags; + u8 num_receptacles; + u8 reserved0d[3]; + __le32 enclosure_name[4]; + struct mpi3_man7_receptacle_info receptacle_info[MPI3_MAN7_RECEPTACLE_INFO_MAX]; +}; +#define MPI3_MAN7_PAGEVERSION (0x00) +#define MPI3_MAN7_FLAGS_BASE_ENCLOSURE_LEVEL_MASK (0x01) +#define MPI3_MAN7_FLAGS_BASE_ENCLOSURE_LEVEL_0 (0x00) +#define MPI3_MAN7_FLAGS_BASE_ENCLOSURE_LEVEL_1 (0x01) +struct mpi3_man8_phy_info { + u8 receptacle_id; + u8 connector_lane; + __le16 reserved02; + __le16 slotx1; + __le16 slotx2; + __le16 slotx4; + __le16 reserved0a; + __le32 reserved0c; +}; +#define MPI3_MAN8_PHY_INFO_RECEPTACLE_ID_HOST_PHY (0xff) +#ifndef MPI3_MAN8_PHY_INFO_MAX +#define MPI3_MAN8_PHY_INFO_MAX (1) +#endif +struct mpi3_man_page8 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phys; + u8 reserved0d[3]; + struct mpi3_man8_phy_info phy_info[MPI3_MAN8_PHY_INFO_MAX]; +}; +#define MPI3_MAN8_PAGEVERSION (0x00) +struct mpi3_man9_rsrc_entry { + __le32 maximum; + __le32 decrement; + __le32 minimum; + __le32 actual; +}; +enum mpi3_man9_resources { + MPI3_MAN9_RSRC_OUTSTANDING_REQS = 0, + MPI3_MAN9_RSRC_TARGET_CMDS = 1, + MPI3_MAN9_RSRC_RESERVED02 = 2, + MPI3_MAN9_RSRC_NVME = 3, + MPI3_MAN9_RSRC_INITIATORS = 4, + MPI3_MAN9_RSRC_VDS = 5, + MPI3_MAN9_RSRC_ENCLOSURES = 6, + MPI3_MAN9_RSRC_ENCLOSURE_PHYS = 7, + MPI3_MAN9_RSRC_EXPANDERS = 8, + MPI3_MAN9_RSRC_PCIE_SWITCHES = 9, + MPI3_MAN9_RSRC_RESERVED10 = 10, + MPI3_MAN9_RSRC_HOST_PD_DRIVES = 11, + MPI3_MAN9_RSRC_ADV_HOST_PD_DRIVES = 12, + MPI3_MAN9_RSRC_RAID_PD_DRIVES = 13, + MPI3_MAN9_RSRC_DRV_DIAG_BUF = 14, + MPI3_MAN9_RSRC_NAMESPACE_COUNT = 15, + MPI3_MAN9_RSRC_NUM_RESOURCES +}; +#define MPI3_MAN9_MIN_OUTSTANDING_REQS (1) +#define MPI3_MAN9_MAX_OUTSTANDING_REQS (65000) +#define MPI3_MAN9_MIN_TARGET_CMDS (0) +#define MPI3_MAN9_MAX_TARGET_CMDS (65535) 
+#define MPI3_MAN9_MIN_NVME_TARGETS (0) +#define MPI3_MAN9_MIN_INITIATORS (0) +#define MPI3_MAN9_MIN_VDS (0) +#define MPI3_MAN9_MIN_ENCLOSURES (1) +#define MPI3_MAN9_MAX_ENCLOSURES (65535) +#define MPI3_MAN9_MIN_ENCLOSURE_PHYS (0) +#define MPI3_MAN9_MIN_EXPANDERS (0) +#define MPI3_MAN9_MAX_EXPANDERS (65535) +#define MPI3_MAN9_MIN_PCIE_SWITCHES (0) +#define MPI3_MAN9_MIN_HOST_PD_DRIVES (0) +#define MPI3_MAN9_ADV_HOST_PD_DRIVES (0) +#define MPI3_MAN9_RAID_PD_DRIVES (0) +#define MPI3_MAN9_DRIVER_DIAG_BUFFER (0) +#define MPI3_MAN9_MIN_NAMESPACE_COUNT (1) +#define MPI3_MAN9_MIN_EXPANDERS (0) +#define MPI3_MAN9_MAX_EXPANDERS (65535) +struct mpi3_man_page9 { + struct mpi3_config_page_header header; + u8 num_resources; + u8 reserved09; + __le16 reserved0a; + __le32 reserved0c; + __le32 reserved10; + __le32 reserved14; + __le32 reserved18; + __le32 reserved1c; + struct mpi3_man9_rsrc_entry resource[MPI3_MAN9_RSRC_NUM_RESOURCES]; +}; +#define MPI3_MAN9_PAGEVERSION (0x00) +struct mpi3_man10_istwi_ctrlr_entry { + __le16 slave_address; + __le16 flags; + u8 scl_low_override; + u8 scl_high_override; + __le16 reserved06; +}; +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_BUS_SPEED_MASK (0x000c) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_BUS_SPEED_100K (0x0000) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_BUS_SPEED_400K (0x0004) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_SLAVE_ENABLED (0x0002) +#define MPI3_MAN10_ISTWI_CTRLR_FLAGS_MASTER_ENABLED (0x0001) +#ifndef MPI3_MAN10_ISTWI_CTRLR_MAX +#define MPI3_MAN10_ISTWI_CTRLR_MAX (1) +#endif +struct mpi3_man_page10 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_istwi_ctrl; + u8 reserved0d[3]; + struct mpi3_man10_istwi_ctrlr_entry istwi_controller[MPI3_MAN10_ISTWI_CTRLR_MAX]; +}; +#define MPI3_MAN10_PAGEVERSION (0x00) +struct mpi3_man11_mux_device_format { + u8 max_channel; + u8 reserved01[3]; + __le32 reserved04; +}; +struct mpi3_man11_temp_sensor_device_format { + u8 type; + u8 reserved01[3]; + u8 temp_channel[4]; +}; +#define MPI3_MAN11_TEMP_SENSOR_TYPE_MAX6654 (0x00) +#define MPI3_MAN11_TEMP_SENSOR_TYPE_EMC1442 (0x01) +#define MPI3_MAN11_TEMP_SENSOR_TYPE_ADT7476 (0x02) +#define MPI3_MAN11_TEMP_SENSOR_TYPE_SE97B (0x03) +#define MPI3_MAN11_TEMP_SENSOR_CHANNEL_LOCATION_MASK (0xe0) +#define MPI3_MAN11_TEMP_SENSOR_CHANNEL_LOCATION_SHIFT (5) +#define MPI3_MAN11_TEMP_SENSOR_CHANNEL_ENABLED (0x01) +struct mpi3_man11_seeprom_device_format { + u8 size; + u8 page_write_size; + __le16 reserved02; + __le32 reserved04; +}; +#define MPI3_MAN11_SEEPROM_SIZE_1KBITS (0x01) +#define MPI3_MAN11_SEEPROM_SIZE_2KBITS (0x02) +#define MPI3_MAN11_SEEPROM_SIZE_4KBITS (0x03) +#define MPI3_MAN11_SEEPROM_SIZE_8KBITS (0x04) +#define MPI3_MAN11_SEEPROM_SIZE_16KBITS (0x05) +#define MPI3_MAN11_SEEPROM_SIZE_32KBITS (0x06) +#define MPI3_MAN11_SEEPROM_SIZE_64KBITS (0x07) +#define MPI3_MAN11_SEEPROM_SIZE_128KBITS (0x08) +struct mpi3_man11_ddr_spd_device_format { + u8 channel; + u8 reserved01[3]; + __le32 reserved04; +}; +struct mpi3_man11_cable_mgmt_device_format { + u8 type; + u8 receptacle_id; + __le16 reserved02; + __le32 reserved04; +}; +#define MPI3_MAN11_CABLE_MGMT_TYPE_SFF_8636 (0x00) +struct mpi3_man11_bkplane_spec_ubm_format { + __le16 flags; + __le16 reserved02; +}; +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_REFCLK_POLICY_ALWAYS_ENABLED (0x0200) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_FORCE_POLLING (0x0100) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_MAX_FRU_MASK (0x00f0) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_MAX_FRU_SHIFT (4) +#define MPI3_MAN11_BKPLANE_UBM_FLAGS_POLL_INTERVAL_MASK (0x000f) 
+#define MPI3_MAN11_BKPLANE_UBM_FLAGS_POLL_INTERVAL_SHIFT (0) +struct mpi3_man11_bkplane_spec_non_ubm_format { + __le16 flags; + u8 reserved02; + u8 type; +}; +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_GROUP_MASK (0xf000) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_GROUP_SHIFT (12) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_REFCLK_POLICY_ALWAYS_ENABLED (0x0200) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_MASK (0x00c0) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_4 (0x0000) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_2 (0x0040) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_LINKWIDTH_1 (0x0080) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_PRESENCE_DETECT_MASK (0x0030) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_PRESENCE_DETECT_GPIO (0x0000) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_PRESENCE_DETECT_REG (0x0010) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_POLL_INTERVAL_MASK (0x000f) +#define MPI3_MAN11_BKPLANE_NON_UBM_FLAGS_POLL_INTERVAL_SHIFT (0) +#define MPI3_MAN11_BKPLANE_NON_UBM_TYPE_VPP (0x00) +union mpi3_man11_bkplane_spec_format { + struct mpi3_man11_bkplane_spec_ubm_format ubm; + struct mpi3_man11_bkplane_spec_non_ubm_format non_ubm; +}; +struct mpi3_man11_bkplane_mgmt_device_format { + u8 type; + u8 receptacle_id; + u8 reset_info; + u8 reserved03; + union mpi3_man11_bkplane_spec_format backplane_mgmt_specific; +}; +#define MPI3_MAN11_BKPLANE_MGMT_TYPE_UBM (0x00) +#define MPI3_MAN11_BKPLANE_MGMT_TYPE_NON_UBM (0x01) +#define MPI3_MAN11_BACKPLANE_RESETINFO_ASSERT_TIME_MASK (0xf0) +#define MPI3_MAN11_BACKPLANE_RESETINFO_ASSERT_TIME_SHIFT (4) +#define MPI3_MAN11_BACKPLANE_RESETINFO_READY_TIME_MASK (0x0f) +#define MPI3_MAN11_BACKPLANE_RESETINFO_READY_TIME_SHIFT (0) +struct mpi3_man11_gas_gauge_device_format { + u8 type; + u8 reserved01[3]; + __le32 reserved04; +}; +#define MPI3_MAN11_GAS_GAUGE_TYPE_STANDARD (0x00) +struct mpi3_man11_mgmt_ctrlr_device_format { + __le32 reserved00; + __le32 reserved04; +}; +struct mpi3_man11_board_fan_device_format { + u8 flags; + u8 reserved01; + u8 min_fan_speed; + u8 max_fan_speed; + __le32 reserved04; +}; +#define MPI3_MAN11_BOARD_FAN_FLAGS_FAN_CTRLR_TYPE_MASK (0x07) +#define MPI3_MAN11_BOARD_FAN_FLAGS_FAN_CTRLR_TYPE_AMC6821 (0x00) +union mpi3_man11_device_specific_format { + struct mpi3_man11_mux_device_format mux; + struct mpi3_man11_temp_sensor_device_format temp_sensor; + struct mpi3_man11_seeprom_device_format seeprom; + struct mpi3_man11_ddr_spd_device_format ddr_spd; + struct mpi3_man11_cable_mgmt_device_format cable_mgmt; + struct mpi3_man11_bkplane_mgmt_device_format bkplane_mgmt; + struct mpi3_man11_gas_gauge_device_format gas_gauge; + struct mpi3_man11_mgmt_ctrlr_device_format mgmt_controller; + struct mpi3_man11_board_fan_device_format board_fan; + __le32 words[2]; +}; +struct mpi3_man11_istwi_device_format { + u8 device_type; + u8 controller; + u8 reserved02; + u8 flags; + __le16 device_address; + u8 mux_channel; + u8 mux_index; + union mpi3_man11_device_specific_format device_specific; +}; +#define MPI3_MAN11_ISTWI_DEVTYPE_MUX (0x00) +#define MPI3_MAN11_ISTWI_DEVTYPE_TEMP_SENSOR (0x01) +#define MPI3_MAN11_ISTWI_DEVTYPE_SEEPROM (0x02) +#define MPI3_MAN11_ISTWI_DEVTYPE_DDR_SPD (0x03) +#define MPI3_MAN11_ISTWI_DEVTYPE_CABLE_MGMT (0x04) +#define MPI3_MAN11_ISTWI_DEVTYPE_BACKPLANE_MGMT (0x05) +#define MPI3_MAN11_ISTWI_DEVTYPE_GAS_GAUGE (0x06) +#define MPI3_MAN11_ISTWI_DEVTYPE_MGMT_CONTROLLER (0x07) +#define MPI3_MAN11_ISTWI_DEVTYPE_BOARD_FAN (0x08) +#define MPI3_MAN11_ISTWI_FLAGS_MUX_PRESENT (0x01) +#ifndef MPI3_MAN11_ISTWI_DEVICE_MAX 
+#define MPI3_MAN11_ISTWI_DEVICE_MAX (1) +#endif +struct mpi3_man_page11 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_istwi_dev; + u8 reserved0d[3]; + struct mpi3_man11_istwi_device_format istwi_device[MPI3_MAN11_ISTWI_DEVICE_MAX]; +}; +#define MPI3_MAN11_PAGEVERSION (0x00) +#ifndef MPI3_MAN12_NUM_SGPIO_MAX +#define MPI3_MAN12_NUM_SGPIO_MAX (1) +#endif +struct mpi3_man12_sgpio_info { + u8 slot_count; + u8 reserved01[3]; + __le32 reserved04; + u8 phy_order[32]; +}; +struct mpi3_man_page12 { + struct mpi3_config_page_header header; + __le32 flags; + __le32 s_clock_freq; + __le32 activity_modulation; + u8 num_sgpio; + u8 reserved15[3]; + __le32 reserved18; + __le32 reserved1c; + __le32 pattern[8]; + struct mpi3_man12_sgpio_info sgpio_info[MPI3_MAN12_NUM_SGPIO_MAX]; +}; +#define MPI3_MAN12_PAGEVERSION (0x00) +#define MPI3_MAN12_FLAGS_ERROR_PRESENCE_ENABLED (0x0400) +#define MPI3_MAN12_FLAGS_ACTIVITY_INVERT_ENABLED (0x0200) +#define MPI3_MAN12_FLAGS_GROUP_ID_DISABLED (0x0100) +#define MPI3_MAN12_FLAGS_SIO_CLK_FILTER_ENABLED (0x0004) +#define MPI3_MAN12_FLAGS_SCLOCK_SLOAD_TYPE_MASK (0x0002) +#define MPI3_MAN12_FLAGS_SCLOCK_SLOAD_TYPE_PUSH_PULL (0x0000) +#define MPI3_MAN12_FLAGS_SCLOCK_SLOAD_TYPE_OPEN_DRAIN (0x0002) +#define MPI3_MAN12_FLAGS_SDATAOUT_TYPE_MASK (0x0001) +#define MPI3_MAN12_FLAGS_SDATAOUT_TYPE_PUSH_PULL (0x0000) +#define MPI3_MAN12_FLAGS_SDATAOUT_TYPE_OPEN_DRAIN (0x0001) +#define MPI3_MAN12_SIO_CLK_FREQ_MIN (32) +#define MPI3_MAN12_SIO_CLK_FREQ_MAX (100000) +#define MPI3_MAN12_ACTIVITY_MODULATION_FORCE_OFF_MASK (0x0000f000) +#define MPI3_MAN12_ACTIVITY_MODULATION_FORCE_OFF_SHIFT (12) +#define MPI3_MAN12_ACTIVITY_MODULATION_MAX_ON_MASK (0x00000f00) +#define MPI3_MAN12_ACTIVITY_MODULATION_MAX_ON_SHIFT (8) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_OFF_MASK (0x000000f0) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_OFF_SHIFT (4) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_ON_MASK (0x0000000f) +#define MPI3_MAN12_ACTIVITY_MODULATION_STRETCH_ON_SHIFT (0) +#define MPI3_MAN12_PATTERN_RATE_MASK (0xe0000000) +#define MPI3_MAN12_PATTERN_RATE_2_HZ (0x00000000) +#define MPI3_MAN12_PATTERN_RATE_4_HZ (0x20000000) +#define MPI3_MAN12_PATTERN_RATE_8_HZ (0x40000000) +#define MPI3_MAN12_PATTERN_RATE_16_HZ (0x60000000) +#define MPI3_MAN12_PATTERN_RATE_10_HZ (0x80000000) +#define MPI3_MAN12_PATTERN_RATE_20_HZ (0xa0000000) +#define MPI3_MAN12_PATTERN_RATE_40_HZ (0xc0000000) +#define MPI3_MAN12_PATTERN_LENGTH_MASK (0x1f000000) +#define MPI3_MAN12_PATTERN_LENGTH_SHIFT (24) +#define MPI3_MAN12_PATTERN_BIT_PATTERN_MASK (0x00ffffff) +#define MPI3_MAN12_PATTERN_BIT_PATTERN_SHIFT (0) +#ifndef MPI3_MAN13_NUM_TRANSLATION_MAX +#define MPI3_MAN13_NUM_TRANSLATION_MAX (1) +#endif +struct mpi3_man13_translation_info { + __le32 slot_status; + __le32 mask; + u8 activity; + u8 locate; + u8 error; + u8 reserved0b; +}; +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_FAULT (0x20000000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DEVICE_OFF (0x10000000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DEVICE_ACTIVITY (0x00800000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DO_NOT_REMOVE (0x00400000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_DEVICE_MISSING (0x00100000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_INSERT (0x00080000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_REMOVAL (0x00040000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_IDENTIFY (0x00020000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_OK (0x00008000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_RESERVED_DEVICE (0x00004000) +#define 
MPI3_MAN13_TRANSLATION_SLOTSTATUS_HOT_SPARE (0x00002000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_CONSISTENCY_CHECK (0x00001000) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_IN_CRITICAL_ARRAY (0x00000800) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_IN_FAILED_ARRAY (0x00000400) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_REBUILD_REMAP (0x00000200) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_REBUILD_REMAP_ABORT (0x00000100) +#define MPI3_MAN13_TRANSLATION_SLOTSTATUS_PREDICTED_FAILURE (0x00000040) +#define MPI3_MAN13_BLINK_PATTERN_FORCE_OFF (0x00) +#define MPI3_MAN13_BLINK_PATTERN_FORCE_ON (0x01) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_0 (0x02) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_1 (0x03) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_2 (0x04) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_3 (0x05) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_4 (0x06) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_5 (0x07) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_6 (0x08) +#define MPI3_MAN13_BLINK_PATTERN_PATTERN_7 (0x09) +#define MPI3_MAN13_BLINK_PATTERN_ACTIVITY (0x0a) +#define MPI3_MAN13_BLINK_PATTERN_ACTIVITY_TRAIL (0x0b) +struct mpi3_man_page13 { + struct mpi3_config_page_header header; + u8 num_trans; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_man13_translation_info translation[MPI3_MAN13_NUM_TRANSLATION_MAX]; +}; +#define MPI3_MAN13_PAGEVERSION (0x00) +struct mpi3_man_page14 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_slot_groups; + u8 num_slots; + __le16 max_cert_chain_length; + __le32 sealed_slots; + __le32 populated_slots; + __le32 mgmt_pt_updatable_slots; +}; +#define MPI3_MAN14_PAGEVERSION (0x00) +#define MPI3_MAN14_NUMSLOTS_MAX (32) +#ifndef MPI3_MAN15_VERSION_RECORD_MAX +#define MPI3_MAN15_VERSION_RECORD_MAX 1 +#endif +struct mpi3_man15_version_record { + __le16 spdm_version; + __le16 reserved02; +}; +struct mpi3_man_page15 { + struct mpi3_config_page_header header; + u8 num_version_records; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_man15_version_record version_record[MPI3_MAN15_VERSION_RECORD_MAX]; +}; +#define MPI3_MAN15_PAGEVERSION (0x00) +#ifndef MPI3_MAN16_CERT_ALGO_MAX +#define MPI3_MAN16_CERT_ALGO_MAX 1 +#endif +struct mpi3_man16_certificate_algorithm { + u8 slot_group; + u8 reserved01[3]; + __le32 base_asym_algo; + __le32 base_hash_algo; + __le32 reserved0c[3]; +}; +struct mpi3_man_page16 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_cert_algos; + u8 reserved0d[3]; + struct mpi3_man16_certificate_algorithm certificate_algorithm[MPI3_MAN16_CERT_ALGO_MAX]; +}; +#define MPI3_MAN16_PAGEVERSION (0x00) +#ifndef MPI3_MAN17_HASH_ALGORITHM_MAX +#define MPI3_MAN17_HASH_ALGORITHM_MAX 1 +#endif +struct mpi3_man17_hash_algorithm { + u8 meas_specification; + u8 reserved01[3]; + __le32 measurement_hash_algo; + __le32 reserved08[2]; +}; +struct mpi3_man_page17 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_hash_algos; + u8 reserved0d[3]; + struct mpi3_man17_hash_algorithm hash_algorithm[MPI3_MAN17_HASH_ALGORITHM_MAX]; +}; +#define MPI3_MAN17_PAGEVERSION (0x00) +struct mpi3_man_page20 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 nonpremium_features; + u8 allowed_personalities; + u8 reserved11[3]; +}; +#define MPI3_MAN20_PAGEVERSION (0x00) +#define MPI3_MAN20_ALLOWEDPERSON_RAID_MASK (0x02) +#define MPI3_MAN20_ALLOWEDPERSON_RAID_ALLOWED (0x02) +#define MPI3_MAN20_ALLOWEDPERSON_RAID_NOT_ALLOWED (0x00) +#define MPI3_MAN20_ALLOWEDPERSON_EHBA_MASK (0x01) +#define 
MPI3_MAN20_ALLOWEDPERSON_EHBA_ALLOWED (0x01) +#define MPI3_MAN20_ALLOWEDPERSON_EHBA_NOT_ALLOWED (0x00) +#define MPI3_MAN20_NONPREMUIM_DISABLE_PD_DEGRADED_MASK (0x01) +#define MPI3_MAN20_NONPREMUIM_DISABLE_PD_DEGRADED_ENABLED (0x00) +#define MPI3_MAN20_NONPREMUIM_DISABLE_PD_DEGRADED_DISABLED (0x01) +struct mpi3_man_page21 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 flags; +}; +#define MPI3_MAN21_PAGEVERSION (0x00) +#define MPI3_MAN21_FLAGS_HOST_METADATA_CAPABILITY_MASK (0x80) +#define MPI3_MAN21_FLAGS_HOST_METADATA_CAPABILITY_ENABLED (0x80) +#define MPI3_MAN21_FLAGS_HOST_METADATA_CAPABILITY_DISABLED (0x00) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_MASK (0x60) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_BLOCK (0x00) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_ALLOW (0x20) +#define MPI3_MAN21_FLAGS_UNCERTIFIED_DRIVES_WARN (0x40) +#define MPI3_MAN21_FLAGS_BLOCK_SSD_WR_CACHE_CHANGE_MASK (0x08) +#define MPI3_MAN21_FLAGS_BLOCK_SSD_WR_CACHE_CHANGE_ALLOW (0x00) +#define MPI3_MAN21_FLAGS_BLOCK_SSD_WR_CACHE_CHANGE_PREVENT (0x08) +#define MPI3_MAN21_FLAGS_SES_VPD_ASSOC_MASK (0x01) +#define MPI3_MAN21_FLAGS_SES_VPD_ASSOC_DEFAULT (0x00) +#define MPI3_MAN21_FLAGS_SES_VPD_ASSOC_OEM_SPECIFIC (0x01) +#ifndef MPI3_MAN_PROD_SPECIFIC_MAX +#define MPI3_MAN_PROD_SPECIFIC_MAX (1) +#endif +struct mpi3_man_page_product_specific { + struct mpi3_config_page_header header; + __le32 product_specific_info[MPI3_MAN_PROD_SPECIFIC_MAX]; +}; +struct mpi3_io_unit_page0 { + struct mpi3_config_page_header header; + __le64 unique_value; + __le32 nvdata_version_default; + __le32 nvdata_version_persistent; +}; +#define MPI3_IOUNIT0_PAGEVERSION (0x00) +struct mpi3_io_unit_page1 { + struct mpi3_config_page_header header; + __le32 flags; + u8 dmd_io_delay; + u8 dmd_report_pcie; + u8 dmd_report_sata; + u8 dmd_report_sas; +}; +#define MPI3_IOUNIT1_PAGEVERSION (0x00) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_MASK (0x00000030) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_ENABLE (0x00000000) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_DISABLE (0x00000010) +#define MPI3_IOUNIT1_FLAGS_NVME_WRITE_CACHE_NO_MODIFY (0x00000020) +#define MPI3_IOUNIT1_FLAGS_ATA_SECURITY_FREEZE_LOCK (0x00000008) +#define MPI3_IOUNIT1_FLAGS_WRITE_SAME_BUFFER (0x00000004) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_MASK (0x00000003) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_ENABLE (0x00000000) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_DISABLE (0x00000001) +#define MPI3_IOUNIT1_FLAGS_SATA_WRITE_CACHE_UNCHANGED (0x00000002) +#define MPI3_IOUNIT1_DMD_REPORT_DELAY_TIME_MASK (0x7f) +#define MPI3_IOUNIT1_DMD_REPORT_UNIT_16_SEC (0x80) +#ifndef MPI3_IO_UNIT2_GPIO_VAL_MAX +#define MPI3_IO_UNIT2_GPIO_VAL_MAX (1) +#endif +struct mpi3_io_unit_page2 { + struct mpi3_config_page_header header; + u8 gpio_count; + u8 reserved09[3]; + __le16 gpio_val[MPI3_IO_UNIT2_GPIO_VAL_MAX]; +}; +#define MPI3_IOUNIT2_PAGEVERSION (0x00) +#define MPI3_IOUNIT2_GPIO_FUNCTION_MASK (0xfffc) +#define MPI3_IOUNIT2_GPIO_FUNCTION_SHIFT (2) +#define MPI3_IOUNIT2_GPIO_SETTING_MASK (0x0001) +#define MPI3_IOUNIT2_GPIO_SETTING_OFF (0x0000) +#define MPI3_IOUNIT2_GPIO_SETTING_ON (0x0001) +struct mpi3_io_unit3_sensor { + __le16 flags; + u8 threshold_margin; + u8 reserved03; + __le16 threshold[3]; + __le16 reserved0a; + __le32 reserved0c; + __le32 reserved10; + __le32 reserved14; +}; +#define MPI3_IOUNIT3_SENSOR_FLAGS_FATAL_EVENT_ENABLED (0x0010) +#define MPI3_IOUNIT3_SENSOR_FLAGS_FATAL_ACTION_ENABLED (0x0008) +#define MPI3_IOUNIT3_SENSOR_FLAGS_CRITICAL_EVENT_ENABLED 
(0x0004) +#define MPI3_IOUNIT3_SENSOR_FLAGS_CRITICAL_ACTION_ENABLED (0x0002) +#define MPI3_IOUNIT3_SENSOR_FLAGS_WARNING_EVENT_ENABLED (0x0001) +#ifndef MPI3_IO_UNIT3_SENSOR_MAX +#define MPI3_IO_UNIT3_SENSOR_MAX (1) +#endif +struct mpi3_io_unit_page3 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_sensors; + u8 nominal_poll_interval; + u8 warning_poll_interval; + u8 reserved0f; + struct mpi3_io_unit3_sensor sensor[MPI3_IO_UNIT3_SENSOR_MAX]; +}; +#define MPI3_IOUNIT3_PAGEVERSION (0x00) +struct mpi3_io_unit4_sensor { + __le16 current_temperature; + __le16 reserved02; + u8 flags; + u8 reserved05[3]; + __le16 istwi_index; + u8 channel; + u8 reserved0b; + __le32 reserved0c; +}; +#define MPI3_IOUNIT4_SENSOR_FLAGS_LOC_MASK (0xe0) +#define MPI3_IOUNIT4_SENSOR_FLAGS_LOC_SHIFT (5) +#define MPI3_IOUNIT4_SENSOR_FLAGS_TEMP_VALID (0x01) +#define MPI3_IOUNIT4_SENSOR_ISTWI_INDEX_INTERNAL (0xffff) +#define MPI3_IOUNIT4_SENSOR_CHANNEL_RESERVED (0xff) +#ifndef MPI3_IO_UNIT4_SENSOR_MAX +#define MPI3_IO_UNIT4_SENSOR_MAX (1) +#endif +struct mpi3_io_unit_page4 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_sensors; + u8 reserved0d[3]; + struct mpi3_io_unit4_sensor sensor[MPI3_IO_UNIT4_SENSOR_MAX]; +}; +#define MPI3_IOUNIT4_PAGEVERSION (0x00) +struct mpi3_io_unit5_spinup_group { + u8 max_target_spinup; + u8 spinup_delay; + u8 spinup_flags; + u8 reserved03; +}; +#define MPI3_IOUNIT5_SPINUP_FLAGS_DISABLE (0x01) +#ifndef MPI3_IO_UNIT5_PHY_MAX +#define MPI3_IO_UNIT5_PHY_MAX (4) +#endif +struct mpi3_io_unit_page5 { + struct mpi3_config_page_header header; + struct mpi3_io_unit5_spinup_group spinup_group_parameters[4]; + __le32 reserved18; + __le32 reserved1c; + __le16 device_shutdown; + __le16 reserved22; + u8 pcie_device_wait_time; + u8 sata_device_wait_time; + u8 spinup_encl_drive_count; + u8 spinup_encl_delay; + u8 num_phys; + u8 pe_initial_spinup_delay; + u8 topology_stable_time; + u8 flags; + u8 phy[MPI3_IO_UNIT5_PHY_MAX]; +}; +#define MPI3_IOUNIT5_PAGEVERSION (0x00) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_NO_ACTION (0x00) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_DIRECT_ATTACHED (0x01) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_EXPANDER_ATTACHED (0x02) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SWITCH_ATTACHED (0x02) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_DIRECT_AND_EXPANDER (0x03) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_DIRECT_AND_SWITCH (0x03) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_HDD_MASK (0x0300) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_HDD_SHIFT (8) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAS_HDD_MASK (0x00c0) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAS_HDD_SHIFT (6) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_NVME_SSD_MASK (0x0030) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_NVME_SSD_SHIFT (4) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_SSD_MASK (0x000c) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SATA_SSD_SHIFT (2) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAS_SSD_MASK (0x0003) +#define MPI3_IOUNIT5_DEVICE_SHUTDOWN_SAA_SSD_SHIFT (0) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_MASK (0x0c) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_NOT_SUPPORTED (0x00) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_OS_CONTROLLED (0x04) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_APP_CONTROLLED (0x08) +#define MPI3_IOUNIT5_FLAGS_SATAPUIS_BLOCKED (0x0c) +#define MPI3_IOUNIT5_FLAGS_POWER_CAPABLE_SPINUP (0x02) +#define MPI3_IOUNIT5_FLAGS_AUTO_PORT_ENABLE (0x01) +#define MPI3_IOUNIT5_PHY_SPINUP_GROUP_MASK (0x03) +struct mpi3_io_unit_page6 { + struct mpi3_config_page_header header; + __le32 board_power_requirement; + __le32 
pci_slot_power_allocation; + u8 flags; + u8 reserved11[3]; +}; +#define MPI3_IOUNIT6_PAGEVERSION (0x00) +#define MPI3_IOUNIT6_FLAGS_ACT_CABLE_PWR_EXC (0x01) +#ifndef MPI3_IOUNIT8_DIGEST_MAX +#define MPI3_IOUNIT8_DIGEST_MAX (1) +#endif +union mpi3_iounit8_digest { + __le32 dword[16]; + __le16 word[32]; + u8 byte[64]; +}; +struct mpi3_io_unit_page8 { + struct mpi3_config_page_header header; + u8 sb_mode; + u8 sb_state; + __le16 reserved0a; + u8 num_slots; + u8 slots_available; + u8 current_key_encryption_algo; + u8 key_digest_hash_algo; + __le32 reserved10[2]; + __le32 current_key[128]; + union mpi3_iounit8_digest digest[MPI3_IOUNIT8_DIGEST_MAX]; +}; +#define MPI3_IOUNIT8_PAGEVERSION (0x00) +#define MPI3_IOUNIT8_SBMODE_SECURE_DEBUG (0x04) +#define MPI3_IOUNIT8_SBMODE_HARD_SECURE (0x02) +#define MPI3_IOUNIT8_SBMODE_CONFIG_SECURE (0x01) +#define MPI3_IOUNIT8_SBSTATE_KEY_UPDATE_PENDING (0x02) +#define MPI3_IOUNIT8_SBSTATE_SECURE_BOOT_ENABLED (0x01) +struct mpi3_io_unit_page9 { + struct mpi3_config_page_header header; + __le32 flags; + __le16 first_device; + __le16 reserved0e; +}; +#define MPI3_IOUNIT9_PAGEVERSION (0x00) +#define MPI3_IOUNIT9_FLAGS_VDFIRST_ENABLED (0x01) +#define MPI3_IOUNIT9_FIRSTDEVICE_UNKNOWN (0xffff) +struct mpi3_io_unit_page10 { + struct mpi3_config_page_header header; + u8 flags; + u8 reserved09[3]; + __le32 silicon_id; + u8 fw_version_minor; + u8 fw_version_major; + u8 hw_version_minor; + u8 hw_version_major; + u8 part_number[16]; +}; +#define MPI3_IOUNIT10_PAGEVERSION (0x00) +#define MPI3_IOUNIT10_FLAGS_VALID (0x01) +#define MPI3_IOUNIT10_FLAGS_ACTIVEID_MASK (0x02) +#define MPI3_IOUNIT10_FLAGS_ACTIVEID_FIRST_REGION (0x00) +#define MPI3_IOUNIT10_FLAGS_ACTIVEID_SECOND_REGION (0x02) +#define MPI3_IOUNIT10_FLAGS_PBLP_EXPECTED (0x80) +#ifndef MPI3_IOUNIT11_PROFILE_MAX +#define MPI3_IOUNIT11_PROFILE_MAX (1) +#endif +struct mpi3_iounit11_profile { + u8 profile_identifier; + u8 reserved01[3]; + __le16 max_vds; + __le16 max_host_pds; + __le16 max_adv_host_pds; + __le16 max_raid_pds; + __le16 max_nvme; + __le16 max_outstanding_requests; + __le16 subsystem_id; + __le16 reserved12; + __le32 reserved14[2]; +}; +struct mpi3_io_unit_page11 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_profiles; + u8 current_profile_identifier; + __le16 reserved0e; + struct mpi3_iounit11_profile profile[MPI3_IOUNIT11_PROFILE_MAX]; +}; +#define MPI3_IOUNIT11_PAGEVERSION (0x00) +#ifndef MPI3_IOUNIT12_BUCKET_MAX +#define MPI3_IOUNIT12_BUCKET_MAX (1) +#endif +struct mpi3_iounit12_bucket { + u8 coalescing_depth; + u8 coalescing_timeout; + __le16 io_count_low_boundary; + __le32 reserved04; +}; +struct mpi3_io_unit_page12 { + struct mpi3_config_page_header header; + __le32 flags; + __le32 reserved0c[4]; + u8 num_buckets; + u8 reserved1d[3]; + struct mpi3_iounit12_bucket bucket[MPI3_IOUNIT12_BUCKET_MAX]; +}; +#define MPI3_IOUNIT12_PAGEVERSION (0x00) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_MASK (0x00000300) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_SHIFT (8) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_8 (0x00000000) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_16 (0x00000100) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_32 (0x00000200) +#define MPI3_IOUNIT12_FLAGS_NUMPASSES_64 (0x00000300) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_MASK (0x00000003) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_DISABLED (0x00000000) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_500US (0x00000001) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_1MS (0x00000002) +#define MPI3_IOUNIT12_FLAGS_PASSPERIOD_2MS (0x00000003) +struct mpi3_ioc_page0 { + 
struct mpi3_config_page_header header; + __le32 reserved08; + __le16 vendor_id; + __le16 device_id; + u8 revision_id; + u8 reserved11[3]; + __le32 class_code; + __le16 subsystem_vendor_id; + __le16 subsystem_id; +}; +#define MPI3_IOC0_PAGEVERSION (0x00) +struct mpi3_ioc_page1 { + struct mpi3_config_page_header header; + __le32 coalescing_timeout; + u8 coalescing_depth; + u8 obsolete; + __le16 reserved0e; +}; +#define MPI3_IOC1_PAGEVERSION (0x00) +#ifndef MPI3_IOC2_EVENTMASK_WORDS +#define MPI3_IOC2_EVENTMASK_WORDS (4) +#endif +struct mpi3_ioc_page2 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le16 sas_broadcast_primitive_masks; + __le16 sas_notify_primitive_masks; + __le32 event_masks[MPI3_IOC2_EVENTMASK_WORDS]; +}; +#define MPI3_IOC2_PAGEVERSION (0x00) +#define MPI3_DRIVER_FLAGS_ADMINRAIDPD_BLOCKED (0x0010) +#define MPI3_DRIVER_FLAGS_OOBRAIDPD_BLOCKED (0x0008) +#define MPI3_DRIVER_FLAGS_OOBRAIDVD_BLOCKED (0x0004) +#define MPI3_DRIVER_FLAGS_OOBADVHOSTPD_BLOCKED (0x0002) +#define MPI3_DRIVER_FLAGS_OOBHOSTPD_BLOCKED (0x0001) +struct mpi3_allowed_cmd_scsi { + __le16 service_action; + u8 operation_code; + u8 command_flags; +}; +struct mpi3_allowed_cmd_ata { + u8 subcommand; + u8 reserved01; + u8 command; + u8 command_flags; +}; +struct mpi3_allowed_cmd_nvme { + u8 reserved00; + u8 nvme_cmd_flags; + u8 op_code; + u8 command_flags; +}; +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_SUBQ_TYPE_MASK (0x80) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_SUBQ_TYPE_IO (0x00) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_SUBQ_TYPE_ADMIN (0x80) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_CMDSET_MASK (0x3f) +#define MPI3_DRIVER_ALLOWEDCMD_NVMECMDFLAGS_CMDSET_NVM (0x00) +union mpi3_allowed_cmd { + struct mpi3_allowed_cmd_scsi scsi; + struct mpi3_allowed_cmd_ata ata; + struct mpi3_allowed_cmd_nvme nvme; +}; +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_ADMINRAIDPD_BLOCKED (0x20) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBRAIDPD_BLOCKED (0x10) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBRAIDVD_BLOCKED (0x08) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBADVHOSTPD_BLOCKED (0x04) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_OOBHOSTPD_BLOCKED (0x02) +#define MPI3_DRIVER_ALLOWEDCMD_CMDFLAGS_CHECKSUBCMD_ENABLED (0x01) +#ifndef MPI3_ALLOWED_CMDS_MAX +#define MPI3_ALLOWED_CMDS_MAX (1) +#endif +struct mpi3_driver_page0 { + struct mpi3_config_page_header header; + __le32 bsd_options; + u8 ssu_timeout; + u8 io_timeout; + u8 tur_retries; + u8 tur_interval; + u8 reserved10; + u8 security_key_timeout; + __le16 reserved12; + __le32 reserved14; + __le32 reserved18; +}; +#define MPI3_DRIVER0_PAGEVERSION (0x00) +#define MPI3_DRIVER0_BSDOPTS_DIS_HII_CONFIG_UTIL (0x00000004) +#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_MASK (0x00000003) +#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_AND_DEVS (0x00000000) +#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_ONLY (0x00000001) +struct mpi3_driver_page1 { + struct mpi3_config_page_header header; + __le32 flags; + __le32 reserved0c; + __le16 host_diag_trace_max_size; + __le16 host_diag_trace_min_size; + __le16 host_diag_trace_decrement_size; + __le16 reserved16; + __le16 host_diag_fw_max_size; + __le16 host_diag_fw_min_size; + __le16 host_diag_fw_decrement_size; + __le16 reserved1e; + __le16 host_diag_driver_max_size; + __le16 host_diag_driver_min_size; + __le16 host_diag_driver_decrement_size; + __le16 reserved26; +}; +#define MPI3_DRIVER1_PAGEVERSION (0x00) +#ifndef MPI3_DRIVER2_TRIGGER_MAX +#define MPI3_DRIVER2_TRIGGER_MAX (1) +#endif +struct mpi3_driver2_trigger_event { + u8 
type; + u8 flags; + u8 reserved02; + u8 event; + __le32 reserved04[3]; +}; +struct mpi3_driver2_trigger_scsi_sense { + u8 type; + u8 flags; + __le16 reserved02; + u8 ascq; + u8 asc; + u8 sense_key; + u8 reserved07; + __le32 reserved08[2]; +}; +#define MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASCQ_MATCH_ALL (0xff) +#define MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASC_MATCH_ALL (0xff) +#define MPI3_DRIVER2_TRIGGER_SCSI_SENSE_SENSE_KEY_MATCH_ALL (0xff) +struct mpi3_driver2_trigger_reply { + u8 type; + u8 flags; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 ioc_log_info_mask; + __le32 reserved0c; +}; +#define MPI3_DRIVER2_TRIGGER_REPLY_IOCSTATUS_MATCH_ALL (0xffff) +union mpi3_driver2_trigger_element { + struct mpi3_driver2_trigger_event event; + struct mpi3_driver2_trigger_scsi_sense scsi_sense; + struct mpi3_driver2_trigger_reply reply; +}; +#define MPI3_DRIVER2_TRIGGER_TYPE_EVENT (0x00) +#define MPI3_DRIVER2_TRIGGER_TYPE_SCSI_SENSE (0x01) +#define MPI3_DRIVER2_TRIGGER_TYPE_REPLY (0x02) +#define MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_TRACE_RELEASE (0x02) +#define MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_FW_RELEASE (0x01) +struct mpi3_driver_page2 { + struct mpi3_config_page_header header; + __le64 master_trigger; + __le32 reserved10[3]; + u8 num_triggers; + u8 reserved1d[3]; + union mpi3_driver2_trigger_element trigger[MPI3_DRIVER2_TRIGGER_MAX]; +}; +#define MPI3_DRIVER2_PAGEVERSION (0x00) +#define MPI3_DRIVER2_MASTERTRIGGER_DIAG_TRACE_RELEASE (0x8000000000000000ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_DIAG_FW_RELEASE (0x4000000000000000ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_SNAPDUMP (0x2000000000000000ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_DEVICE_REMOVAL_ENABLED (0x0000000000000004ULL) +#define MPI3_DRIVER2_MASTERTRIGGER_TASK_MANAGEMENT_ENABLED (0x0000000000000002ULL) +struct mpi3_driver_page10 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_allowed_commands; + u8 reserved0d[3]; + union mpi3_allowed_cmd allowed_command[MPI3_ALLOWED_CMDS_MAX]; +}; +#define MPI3_DRIVER10_PAGEVERSION (0x00) +struct mpi3_driver_page20 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_allowed_commands; + u8 reserved0d[3]; + union mpi3_allowed_cmd allowed_command[MPI3_ALLOWED_CMDS_MAX]; +}; +#define MPI3_DRIVER20_PAGEVERSION (0x00) +struct mpi3_driver_page30 { + struct mpi3_config_page_header header; + __le16 flags; + __le16 reserved0a; + u8 num_allowed_commands; + u8 reserved0d[3]; + union mpi3_allowed_cmd allowed_command[MPI3_ALLOWED_CMDS_MAX]; +}; +#define MPI3_DRIVER30_PAGEVERSION (0x00) +union mpi3_security_mac { + __le32 dword[16]; + __le16 word[32]; + u8 byte[64]; +}; +union mpi3_security_nonce { + __le32 dword[16]; + __le16 word[32]; + u8 byte[64]; +}; +union mpi3_security0_cert_chain { + __le32 dword[1024]; + __le16 word[2048]; + u8 byte[4096]; +}; +struct mpi3_security_page0 { + struct mpi3_config_page_header header; + u8 slot_num_group; + u8 slot_num; + __le16 cert_chain_length; + u8 cert_chain_flags; + u8 reserved0d[3]; + __le32 base_asym_algo; + __le32 base_hash_algo; + __le32 reserved18[4]; + union mpi3_security_mac mac; + union mpi3_security_nonce nonce; + union mpi3_security0_cert_chain certificate_chain; +}; +#define MPI3_SECURITY0_PAGEVERSION (0x00) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_MASK (0x0e) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_UNUSED (0x00) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_CERBERUS (0x02) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_AUTH_API_SPDM (0x04) +#define MPI3_SECURITY0_CERTCHAIN_FLAGS_SEALED 
(0x01) +#ifndef MPI3_SECURITY1_KEY_RECORD_MAX +#define MPI3_SECURITY1_KEY_RECORD_MAX 1 +#endif +#ifndef MPI3_SECURITY1_PAD_MAX +#define MPI3_SECURITY1_PAD_MAX 1 +#endif +union mpi3_security1_key_data { + __le32 dword[128]; + __le16 word[256]; + u8 byte[512]; +}; +struct mpi3_security1_key_record { + u8 flags; + u8 consumer; + __le16 key_data_size; + __le32 additional_key_data; + __le32 reserved08[2]; + union mpi3_security1_key_data key_data; +}; +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_MASK (0x1f) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_NOT_VALID (0x00) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_HMAC (0x01) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_AES (0x02) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_ECDSA_PRIVATE (0x03) +#define MPI3_SECURITY1_KEY_RECORD_FLAGS_TYPE_ECDSA_PUBLIC (0x04) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_NOT_VALID (0x00) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_SAFESTORE (0x01) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_CERT_CHAIN (0x02) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_DEVICE_KEY (0x03) +#define MPI3_SECURITY1_KEY_RECORD_CONSUMER_CACHE_OFFLOAD (0x04) +struct mpi3_security_page1 { + struct mpi3_config_page_header header; + __le32 reserved08[2]; + union mpi3_security_mac mac; + union mpi3_security_nonce nonce; + u8 num_keys; + u8 reserved91[3]; + __le32 reserved94[3]; + struct mpi3_security1_key_record key_record[MPI3_SECURITY1_KEY_RECORD_MAX]; + u8 pad[MPI3_SECURITY1_PAD_MAX]; +}; +#define MPI3_SECURITY1_PAGEVERSION (0x00) +struct mpi3_sas_io_unit0_phy_data { + u8 io_unit_port; + u8 port_flags; + u8 phy_flags; + u8 negotiated_link_rate; + __le16 controller_phy_device_info; + __le16 reserved06; + __le16 attached_dev_handle; + __le16 controller_dev_handle; + __le32 discovery_status; + __le32 reserved10; +}; +#ifndef MPI3_SAS_IO_UNIT0_PHY_MAX +#define MPI3_SAS_IO_UNIT0_PHY_MAX (1) +#endif +struct mpi3_sas_io_unit_page0 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phys; + u8 init_status; + __le16 reserved0e; + struct mpi3_sas_io_unit0_phy_data phy_data[MPI3_SAS_IO_UNIT0_PHY_MAX]; +}; +#define MPI3_SASIOUNIT0_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT0_INITSTATUS_NO_ERRORS (0x00) +#define MPI3_SASIOUNIT0_INITSTATUS_NEEDS_INITIALIZATION (0x01) +#define MPI3_SASIOUNIT0_INITSTATUS_NO_TARGETS_ALLOCATED (0x02) +#define MPI3_SASIOUNIT0_INITSTATUS_BAD_NUM_PHYS (0x04) +#define MPI3_SASIOUNIT0_INITSTATUS_UNSUPPORTED_CONFIG (0x05) +#define MPI3_SASIOUNIT0_INITSTATUS_HOST_PHYS_ENABLED (0x06) +#define MPI3_SASIOUNIT0_INITSTATUS_PRODUCT_SPECIFIC_MIN (0xf0) +#define MPI3_SASIOUNIT0_INITSTATUS_PRODUCT_SPECIFIC_MAX (0xff) +#define MPI3_SASIOUNIT0_PORTFLAGS_DISC_IN_PROGRESS (0x08) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_MASK (0x03) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_IOUNIT1 (0x00) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_DYNAMIC (0x01) +#define MPI3_SASIOUNIT0_PORTFLAGS_AUTO_PORT_CONFIG_BACKPLANE (0x02) +#define MPI3_SASIOUNIT0_PHYFLAGS_INIT_PERSIST_CONNECT (0x40) +#define MPI3_SASIOUNIT0_PHYFLAGS_TARG_PERSIST_CONNECT (0x20) +#define MPI3_SASIOUNIT0_PHYFLAGS_PHY_DISABLED (0x08) +#define MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY (0x02) +#define MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY (0x01) +struct mpi3_sas_io_unit1_phy_data { + u8 io_unit_port; + u8 port_flags; + u8 phy_flags; + u8 max_min_link_rate; + __le16 controller_phy_device_info; + __le16 max_target_port_connect_time; + __le32 reserved08; +}; +#ifndef MPI3_SAS_IO_UNIT1_PHY_MAX +#define MPI3_SAS_IO_UNIT1_PHY_MAX (1) +#endif +struct 
mpi3_sas_io_unit_page1 { + struct mpi3_config_page_header header; + __le16 control_flags; + __le16 sas_narrow_max_queue_depth; + __le16 additional_control_flags; + __le16 sas_wide_max_queue_depth; + u8 num_phys; + u8 sata_max_q_depth; + __le16 reserved12; + struct mpi3_sas_io_unit1_phy_data phy_data[MPI3_SAS_IO_UNIT1_PHY_MAX]; +}; +#define MPI3_SASIOUNIT1_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT1_CONTROL_CONTROLLER_DEVICE_SELF_TEST (0x8000) +#define MPI3_SASIOUNIT1_CONTROL_SATA_SW_PRESERVE (0x1000) +#define MPI3_SASIOUNIT1_CONTROL_SATA_48BIT_LBA_REQUIRED (0x0080) +#define MPI3_SASIOUNIT1_CONTROL_SATA_SMART_REQUIRED (0x0040) +#define MPI3_SASIOUNIT1_CONTROL_SATA_NCQ_REQUIRED (0x0020) +#define MPI3_SASIOUNIT1_CONTROL_SATA_FUA_REQUIRED (0x0010) +#define MPI3_SASIOUNIT1_CONTROL_TABLE_SUBTRACTIVE_ILLEGAL (0x0008) +#define MPI3_SASIOUNIT1_CONTROL_SUBTRACTIVE_ILLEGAL (0x0004) +#define MPI3_SASIOUNIT1_CONTROL_FIRST_LVL_DISC_ONLY (0x0002) +#define MPI3_SASIOUNIT1_CONTROL_HARD_RESET_MASK (0x0001) +#define MPI3_SASIOUNIT1_CONTROL_HARD_RESET_DEVICE_NAME (0x0000) +#define MPI3_SASIOUNIT1_CONTROL_HARD_RESET_SAS_ADDRESS (0x0001) +#define MPI3_SASIOUNIT1_ACONTROL_DA_PERSIST_CONNECT (0x0100) +#define MPI3_SASIOUNIT1_ACONTROL_MULTI_PORT_DOMAIN_ILLEGAL (0x0080) +#define MPI3_SASIOUNIT1_ACONTROL_SATA_ASYNCHROUNOUS_NOTIFICATION (0x0040) +#define MPI3_SASIOUNIT1_ACONTROL_INVALID_TOPOLOGY_CORRECTION (0x0020) +#define MPI3_SASIOUNIT1_ACONTROL_PORT_ENABLE_ONLY_SATA_LINK_RESET (0x0010) +#define MPI3_SASIOUNIT1_ACONTROL_OTHER_AFFILIATION_SATA_LINK_RESET (0x0008) +#define MPI3_SASIOUNIT1_ACONTROL_SELF_AFFILIATION_SATA_LINK_RESET (0x0004) +#define MPI3_SASIOUNIT1_ACONTROL_NO_AFFILIATION_SATA_LINK_RESET (0x0002) +#define MPI3_SASIOUNIT1_ACONTROL_ALLOW_TABLE_TO_TABLE (0x0001) +#define MPI3_SASIOUNIT1_PORT_FLAGS_AUTO_PORT_CONFIG (0x01) +#define MPI3_SASIOUNIT1_PHYFLAGS_INIT_PERSIST_CONNECT (0x40) +#define MPI3_SASIOUNIT1_PHYFLAGS_TARG_PERSIST_CONNECT (0x20) +#define MPI3_SASIOUNIT1_PHYFLAGS_PHY_DISABLE (0x08) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_MASK (0xf0) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_SHIFT (4) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_6_0 (0xa0) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_12_0 (0xb0) +#define MPI3_SASIOUNIT1_MMLR_MAX_RATE_22_5 (0xc0) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_MASK (0x0f) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_6_0 (0x0a) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_12_0 (0x0b) +#define MPI3_SASIOUNIT1_MMLR_MIN_RATE_22_5 (0x0c) +struct mpi3_sas_io_unit2_phy_pm_settings { + u8 control_flags; + u8 reserved01; + __le16 inactivity_timer_exponent; + u8 sata_partial_timeout; + u8 reserved05; + u8 sata_slumber_timeout; + u8 reserved07; + u8 sas_partial_timeout; + u8 reserved09; + u8 sas_slumber_timeout; + u8 reserved0b; +}; +#ifndef MPI3_SAS_IO_UNIT2_PHY_MAX +#define MPI3_SAS_IO_UNIT2_PHY_MAX (1) +#endif +struct mpi3_sas_io_unit_page2 { + struct mpi3_config_page_header header; + u8 num_phys; + u8 reserved09[3]; + __le32 reserved0c; + struct mpi3_sas_io_unit2_phy_pm_settings sas_phy_power_management_settings[MPI3_SAS_IO_UNIT2_PHY_MAX]; +}; +#define MPI3_SASIOUNIT2_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT2_CONTROL_SAS_SLUMBER_ENABLE (0x08) +#define MPI3_SASIOUNIT2_CONTROL_SAS_PARTIAL_ENABLE (0x04) +#define MPI3_SASIOUNIT2_CONTROL_SATA_SLUMBER_ENABLE (0x02) +#define MPI3_SASIOUNIT2_CONTROL_SATA_PARTIAL_ENABLE (0x01) +#define MPI3_SASIOUNIT2_ITE_SAS_SLUMBER_MASK (0x7000) +#define MPI3_SASIOUNIT2_ITE_SAS_SLUMBER_SHIFT (12) +#define MPI3_SASIOUNIT2_ITE_SAS_PARTIAL_MASK (0x0700) +#define 
MPI3_SASIOUNIT2_ITE_SAS_PARTIAL_SHIFT (8) +#define MPI3_SASIOUNIT2_ITE_SATA_SLUMBER_MASK (0x0070) +#define MPI3_SASIOUNIT2_ITE_SATA_SLUMBER_SHIFT (4) +#define MPI3_SASIOUNIT2_ITE_SATA_PARTIAL_MASK (0x0007) +#define MPI3_SASIOUNIT2_ITE_SATA_PARTIAL_SHIFT (0) +#define MPI3_SASIOUNIT2_ITE_EXP_TEN_SECONDS (7) +#define MPI3_SASIOUNIT2_ITE_EXP_ONE_SECOND (6) +#define MPI3_SASIOUNIT2_ITE_EXP_HUNDRED_MILLISECONDS (5) +#define MPI3_SASIOUNIT2_ITE_EXP_TEN_MILLISECONDS (4) +#define MPI3_SASIOUNIT2_ITE_EXP_ONE_MILLISECOND (3) +#define MPI3_SASIOUNIT2_ITE_EXP_HUNDRED_MICROSECONDS (2) +#define MPI3_SASIOUNIT2_ITE_EXP_TEN_MICROSECONDS (1) +#define MPI3_SASIOUNIT2_ITE_EXP_ONE_MICROSECOND (0) +struct mpi3_sas_io_unit_page3 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 power_management_capabilities; +}; +#define MPI3_SASIOUNIT3_PAGEVERSION (0x00) +#define MPI3_SASIOUNIT3_PM_HOST_SAS_SLUMBER_MODE (0x00000800) +#define MPI3_SASIOUNIT3_PM_HOST_SAS_PARTIAL_MODE (0x00000400) +#define MPI3_SASIOUNIT3_PM_HOST_SATA_SLUMBER_MODE (0x00000200) +#define MPI3_SASIOUNIT3_PM_HOST_SATA_PARTIAL_MODE (0x00000100) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SAS_SLUMBER_MODE (0x00000008) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SAS_PARTIAL_MODE (0x00000004) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SATA_SLUMBER_MODE (0x00000002) +#define MPI3_SASIOUNIT3_PM_IOUNIT_SATA_PARTIAL_MODE (0x00000001) +struct mpi3_sas_expander_page0 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 report_gen_length; + __le16 enclosure_handle; + __le32 reserved0c; + __le64 sas_address; + __le32 discovery_status; + __le16 dev_handle; + __le16 parent_dev_handle; + __le16 expander_change_count; + __le16 expander_route_indexes; + u8 num_phys; + u8 sas_level; + __le16 flags; + __le16 stp_bus_inactivity_time_limit; + __le16 stp_max_connect_time_limit; + __le16 stp_smp_nexus_loss_time; + __le16 max_num_routed_sas_addresses; + __le64 active_zone_manager_sas_address; + __le16 zone_lock_inactivity_limit; + __le16 reserved3a; + u8 time_to_reduced_func; + u8 initial_time_to_reduced_func; + u8 max_reduced_func_time; + u8 exp_status; +}; +#define MPI3_SASEXPANDER0_PAGEVERSION (0x00) +#define MPI3_SASEXPANDER0_FLAGS_REDUCED_FUNCTIONALITY (0x2000) +#define MPI3_SASEXPANDER0_FLAGS_ZONE_LOCKED (0x1000) +#define MPI3_SASEXPANDER0_FLAGS_SUPPORTED_PHYSICAL_PRES (0x0800) +#define MPI3_SASEXPANDER0_FLAGS_ASSERTED_PHYSICAL_PRES (0x0400) +#define MPI3_SASEXPANDER0_FLAGS_ZONING_SUPPORT (0x0200) +#define MPI3_SASEXPANDER0_FLAGS_ENABLED_ZONING (0x0100) +#define MPI3_SASEXPANDER0_FLAGS_TABLE_TO_TABLE_SUPPORT (0x0080) +#define MPI3_SASEXPANDER0_FLAGS_CONNECTOR_END_DEVICE (0x0010) +#define MPI3_SASEXPANDER0_FLAGS_OTHERS_CONFIG (0x0004) +#define MPI3_SASEXPANDER0_FLAGS_CONFIG_IN_PROGRESS (0x0002) +#define MPI3_SASEXPANDER0_FLAGS_ROUTE_TABLE_CONFIG (0x0001) +#define MPI3_SASEXPANDER0_ES_NOT_RESPONDING (0x02) +#define MPI3_SASEXPANDER0_ES_RESPONDING (0x03) +#define MPI3_SASEXPANDER0_ES_DELAY_NOT_RESPONDING (0x04) +struct mpi3_sas_expander_page1 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 reserved09[3]; + u8 num_phys; + u8 phy; + __le16 num_table_entries_programmed; + u8 programmed_link_rate; + u8 hw_link_rate; + __le16 attached_dev_handle; + __le32 phy_info; + __le16 attached_device_info; + __le16 reserved1a; + __le16 expander_dev_handle; + u8 change_count; + u8 negotiated_link_rate; + u8 phy_identifier; + u8 attached_phy_identifier; + u8 reserved22; + u8 discovery_info; + __le32 attached_phy_info; + u8 zone_group; + u8 self_config_status; 
+ __le16 reserved2a; + __le16 slot; + __le16 slot_index; +}; +#define MPI3_SASEXPANDER1_PAGEVERSION (0x00) +#define MPI3_SASEXPANDER1_DISCINFO_BAD_PHY_DISABLED (0x04) +#define MPI3_SASEXPANDER1_DISCINFO_LINK_STATUS_CHANGE (0x02) +#define MPI3_SASEXPANDER1_DISCINFO_NO_ROUTING_ENTRIES (0x01) +#ifndef MPI3_SASEXPANDER2_MAX_NUM_PHYS +#define MPI3_SASEXPANDER2_MAX_NUM_PHYS (1) +#endif +struct mpi3_sasexpander2_phy_element { + u8 link_change_count; + u8 reserved01; + __le16 rate_change_count; + __le32 reserved04; +}; +struct mpi3_sas_expander_page2 { + struct mpi3_config_page_header header; + u8 num_phys; + u8 reserved09; + __le16 dev_handle; + __le32 reserved0c; + struct mpi3_sasexpander2_phy_element phy[MPI3_SASEXPANDER2_MAX_NUM_PHYS]; +}; +#define MPI3_SASEXPANDER2_PAGEVERSION (0x00) +struct mpi3_sas_port_page0 { + struct mpi3_config_page_header header; + u8 port_number; + u8 reserved09; + u8 port_width; + u8 reserved0b; + u8 zone_group; + u8 reserved0d[3]; + __le64 sas_address; + __le16 device_info; + __le16 reserved1a; + __le32 reserved1c; +}; +#define MPI3_SASPORT0_PAGEVERSION (0x00) +struct mpi3_sas_phy_page0 { + struct mpi3_config_page_header header; + __le16 owner_dev_handle; + __le16 reserved0a; + __le16 attached_dev_handle; + u8 attached_phy_identifier; + u8 reserved0f; + __le32 attached_phy_info; + u8 programmed_link_rate; + u8 hw_link_rate; + u8 change_count; + u8 flags; + __le32 phy_info; + u8 negotiated_link_rate; + u8 reserved1d[3]; + __le16 slot; + __le16 slot_index; +}; +#define MPI3_SASPHY0_PAGEVERSION (0x00) +#define MPI3_SASPHY0_FLAGS_SGPIO_DIRECT_ATTACH_ENC (0x01) +struct mpi3_sas_phy_page1 { + struct mpi3_config_page_header header; + __le32 reserved08; + __le32 invalid_dword_count; + __le32 running_disparity_error_count; + __le32 loss_dword_synch_count; + __le32 phy_reset_problem_count; +}; +#define MPI3_SASPHY1_PAGEVERSION (0x00) +struct mpi3_sas_phy2_phy_event { + u8 phy_event_code; + u8 reserved01[3]; + __le32 phy_event_info; +}; +#ifndef MPI3_SAS_PHY2_PHY_EVENT_MAX +#define MPI3_SAS_PHY2_PHY_EVENT_MAX (1) +#endif +struct mpi3_sas_phy_page2 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phy_events; + u8 reserved0d[3]; + struct mpi3_sas_phy2_phy_event phy_event[MPI3_SAS_PHY2_PHY_EVENT_MAX]; +}; +#define MPI3_SASPHY2_PAGEVERSION (0x00) +struct mpi3_sas_phy3_phy_event_config { + u8 phy_event_code; + u8 reserved01[3]; + u8 counter_type; + u8 threshold_window; + u8 time_units; + u8 reserved07; + __le32 event_threshold; + __le16 threshold_flags; + __le16 reserved0e; +}; +#define MPI3_SASPHY3_EVENT_CODE_NO_EVENT (0x00) +#define MPI3_SASPHY3_EVENT_CODE_INVALID_DWORD (0x01) +#define MPI3_SASPHY3_EVENT_CODE_RUNNING_DISPARITY_ERROR (0x02) +#define MPI3_SASPHY3_EVENT_CODE_LOSS_DWORD_SYNC (0x03) +#define MPI3_SASPHY3_EVENT_CODE_PHY_RESET_PROBLEM (0x04) +#define MPI3_SASPHY3_EVENT_CODE_ELASTICITY_BUF_OVERFLOW (0x05) +#define MPI3_SASPHY3_EVENT_CODE_RX_ERROR (0x06) +#define MPI3_SASPHY3_EVENT_CODE_INV_SPL_PACKETS (0x07) +#define MPI3_SASPHY3_EVENT_CODE_LOSS_SPL_PACKET_SYNC (0x08) +#define MPI3_SASPHY3_EVENT_CODE_RX_ADDR_FRAME_ERROR (0x20) +#define MPI3_SASPHY3_EVENT_CODE_TX_AC_OPEN_REJECT (0x21) +#define MPI3_SASPHY3_EVENT_CODE_RX_AC_OPEN_REJECT (0x22) +#define MPI3_SASPHY3_EVENT_CODE_TX_RC_OPEN_REJECT (0x23) +#define MPI3_SASPHY3_EVENT_CODE_RX_RC_OPEN_REJECT (0x24) +#define MPI3_SASPHY3_EVENT_CODE_RX_AIP_PARTIAL_WAITING_ON (0x25) +#define MPI3_SASPHY3_EVENT_CODE_RX_AIP_CONNECT_WAITING_ON (0x26) +#define MPI3_SASPHY3_EVENT_CODE_TX_BREAK (0x27) +#define 
MPI3_SASPHY3_EVENT_CODE_RX_BREAK (0x28) +#define MPI3_SASPHY3_EVENT_CODE_BREAK_TIMEOUT (0x29) +#define MPI3_SASPHY3_EVENT_CODE_CONNECTION (0x2a) +#define MPI3_SASPHY3_EVENT_CODE_PEAKTX_PATHWAY_BLOCKED (0x2b) +#define MPI3_SASPHY3_EVENT_CODE_PEAKTX_ARB_WAIT_TIME (0x2c) +#define MPI3_SASPHY3_EVENT_CODE_PEAK_ARB_WAIT_TIME (0x2d) +#define MPI3_SASPHY3_EVENT_CODE_PEAK_CONNECT_TIME (0x2e) +#define MPI3_SASPHY3_EVENT_CODE_PERSIST_CONN (0x2f) +#define MPI3_SASPHY3_EVENT_CODE_TX_SSP_FRAMES (0x40) +#define MPI3_SASPHY3_EVENT_CODE_RX_SSP_FRAMES (0x41) +#define MPI3_SASPHY3_EVENT_CODE_TX_SSP_ERROR_FRAMES (0x42) +#define MPI3_SASPHY3_EVENT_CODE_RX_SSP_ERROR_FRAMES (0x43) +#define MPI3_SASPHY3_EVENT_CODE_TX_CREDIT_BLOCKED (0x44) +#define MPI3_SASPHY3_EVENT_CODE_RX_CREDIT_BLOCKED (0x45) +#define MPI3_SASPHY3_EVENT_CODE_TX_SATA_FRAMES (0x50) +#define MPI3_SASPHY3_EVENT_CODE_RX_SATA_FRAMES (0x51) +#define MPI3_SASPHY3_EVENT_CODE_SATA_OVERFLOW (0x52) +#define MPI3_SASPHY3_EVENT_CODE_TX_SMP_FRAMES (0x60) +#define MPI3_SASPHY3_EVENT_CODE_RX_SMP_FRAMES (0x61) +#define MPI3_SASPHY3_EVENT_CODE_RX_SMP_ERROR_FRAMES (0x63) +#define MPI3_SASPHY3_EVENT_CODE_HOTPLUG_TIMEOUT (0xd0) +#define MPI3_SASPHY3_EVENT_CODE_MISALIGNED_MUX_PRIMITIVE (0xd1) +#define MPI3_SASPHY3_EVENT_CODE_RX_AIP (0xd2) +#define MPI3_SASPHY3_EVENT_CODE_LCARB_WAIT_TIME (0xd3) +#define MPI3_SASPHY3_EVENT_CODE_RCVD_CONN_RESP_WAIT_TIME (0xd4) +#define MPI3_SASPHY3_EVENT_CODE_LCCONN_TIME (0xd5) +#define MPI3_SASPHY3_EVENT_CODE_SSP_TX_START_TRANSMIT (0xd6) +#define MPI3_SASPHY3_EVENT_CODE_SATA_TX_START (0xd7) +#define MPI3_SASPHY3_EVENT_CODE_SMP_TX_START_TRANSMT (0xd8) +#define MPI3_SASPHY3_EVENT_CODE_TX_SMP_BREAK_CONN (0xd9) +#define MPI3_SASPHY3_EVENT_CODE_SSP_RX_START_RECEIVE (0xda) +#define MPI3_SASPHY3_EVENT_CODE_SATA_RX_START_RECEIVE (0xdb) +#define MPI3_SASPHY3_EVENT_CODE_SMP_RX_START_RECEIVE (0xdc) +#define MPI3_SASPHY3_COUNTER_TYPE_WRAPPING (0x00) +#define MPI3_SASPHY3_COUNTER_TYPE_SATURATING (0x01) +#define MPI3_SASPHY3_COUNTER_TYPE_PEAK_VALUE (0x02) +#define MPI3_SASPHY3_TIME_UNITS_10_MICROSECONDS (0x00) +#define MPI3_SASPHY3_TIME_UNITS_100_MICROSECONDS (0x01) +#define MPI3_SASPHY3_TIME_UNITS_1_MILLISECOND (0x02) +#define MPI3_SASPHY3_TIME_UNITS_10_MILLISECONDS (0x03) +#define MPI3_SASPHY3_TFLAGS_PHY_RESET (0x0002) +#define MPI3_SASPHY3_TFLAGS_EVENT_NOTIFY (0x0001) +#ifndef MPI3_SAS_PHY3_PHY_EVENT_MAX +#define MPI3_SAS_PHY3_PHY_EVENT_MAX (1) +#endif +struct mpi3_sas_phy_page3 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phy_events; + u8 reserved0d[3]; + struct mpi3_sas_phy3_phy_event_config phy_event_config[MPI3_SAS_PHY3_PHY_EVENT_MAX]; +}; +#define MPI3_SASPHY3_PAGEVERSION (0x00) +struct mpi3_sas_phy_page4 { + struct mpi3_config_page_header header; + u8 reserved08[3]; + u8 flags; + u8 initial_frame[28]; +}; +#define MPI3_SASPHY4_PAGEVERSION (0x00) +#define MPI3_SASPHY4_FLAGS_FRAME_VALID (0x02) +#define MPI3_SASPHY4_FLAGS_SATA_FRAME (0x01) +#define MPI3_PCIE_LINK_RETIMERS_MASK (0x30) +#define MPI3_PCIE_LINK_RETIMERS_SHIFT (4) +#define MPI3_PCIE_NEG_LINK_RATE_MASK (0x0f) +#define MPI3_PCIE_NEG_LINK_RATE_UNKNOWN (0x00) +#define MPI3_PCIE_NEG_LINK_RATE_PHY_DISABLED (0x01) +#define MPI3_PCIE_NEG_LINK_RATE_2_5 (0x02) +#define MPI3_PCIE_NEG_LINK_RATE_5_0 (0x03) +#define MPI3_PCIE_NEG_LINK_RATE_8_0 (0x04) +#define MPI3_PCIE_NEG_LINK_RATE_16_0 (0x05) +#define MPI3_PCIE_NEG_LINK_RATE_32_0 (0x06) +#define MPI3_PCIE_ASPM_ENABLE_NONE (0x0) +#define MPI3_PCIE_ASPM_ENABLE_L0S (0x1) +#define MPI3_PCIE_ASPM_ENABLE_L1 (0x2) 
+#define MPI3_PCIE_ASPM_ENABLE_L0S_L1 (0x3) +#define MPI3_PCIE_ASPM_SUPPORT_NONE (0x0) +#define MPI3_PCIE_ASPM_SUPPORT_L0S (0x1) +#define MPI3_PCIE_ASPM_SUPPORT_L1 (0x2) +#define MPI3_PCIE_ASPM_SUPPORT_L0S_L1 (0x3) +struct mpi3_pcie_io_unit0_phy_data { + u8 link; + u8 link_flags; + u8 phy_flags; + u8 negotiated_link_rate; + __le16 attached_dev_handle; + __le16 controller_dev_handle; + __le32 enumeration_status; + u8 io_unit_port; + u8 reserved0d[3]; +}; +#define MPI3_PCIEIOUNIT0_LINKFLAGS_CONFIG_SOURCE_MASK (0x10) +#define MPI3_PCIEIOUNIT0_LINKFLAGS_CONFIG_SOURCE_IOUNIT1 (0x00) +#define MPI3_PCIEIOUNIT0_LINKFLAGS_CONFIG_SOURCE_BKPLANE (0x10) +#define MPI3_PCIEIOUNIT0_LINKFLAGS_ENUM_IN_PROGRESS (0x08) +#define MPI3_PCIEIOUNIT0_PHYFLAGS_PHY_DISABLED (0x08) +#define MPI3_PCIEIOUNIT0_PHYFLAGS_HOST_PHY (0x01) +#define MPI3_PCIEIOUNIT0_ES_MAX_SWITCH_DEPTH_EXCEEDED (0x80000000) +#define MPI3_PCIEIOUNIT0_ES_MAX_SWITCHES_EXCEEDED (0x40000000) +#define MPI3_PCIEIOUNIT0_ES_MAX_ENDPOINTS_EXCEEDED (0x20000000) +#define MPI3_PCIEIOUNIT0_ES_INSUFFICIENT_RESOURCES (0x10000000) +#ifndef MPI3_PCIE_IO_UNIT0_PHY_MAX +#define MPI3_PCIE_IO_UNIT0_PHY_MAX (1) +#endif +struct mpi3_pcie_io_unit_page0 { + struct mpi3_config_page_header header; + __le32 reserved08; + u8 num_phys; + u8 init_status; + u8 aspm; + u8 reserved0f; + struct mpi3_pcie_io_unit0_phy_data phy_data[MPI3_PCIE_IO_UNIT0_PHY_MAX]; +}; +#define MPI3_PCIEIOUNIT0_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT0_INITSTATUS_NO_ERRORS (0x00) +#define MPI3_PCIEIOUNIT0_INITSTATUS_NEEDS_INITIALIZATION (0x01) +#define MPI3_PCIEIOUNIT0_INITSTATUS_NO_TARGETS_ALLOCATED (0x02) +#define MPI3_PCIEIOUNIT0_INITSTATUS_RESOURCE_ALLOC_FAILED (0x03) +#define MPI3_PCIEIOUNIT0_INITSTATUS_BAD_NUM_PHYS (0x04) +#define MPI3_PCIEIOUNIT0_INITSTATUS_UNSUPPORTED_CONFIG (0x05) +#define MPI3_PCIEIOUNIT0_INITSTATUS_HOST_PORT_MISMATCH (0x06) +#define MPI3_PCIEIOUNIT0_INITSTATUS_PHYS_NOT_CONSECUTIVE (0x07) +#define MPI3_PCIEIOUNIT0_INITSTATUS_BAD_CLOCKING_MODE (0x08) +#define MPI3_PCIEIOUNIT0_INITSTATUS_PROD_SPEC_START (0xf0) +#define MPI3_PCIEIOUNIT0_INITSTATUS_PROD_SPEC_END (0xff) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_STATES_MASK (0xc0) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_STATES_SHIFT (6) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_STATES_MASK (0x30) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_STATES_SHIFT (4) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_SUPPORT_MASK (0x0c) +#define MPI3_PCIEIOUNIT0_ASPM_SWITCH_SUPPORT_SHIFT (2) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_SUPPORT_MASK (0x03) +#define MPI3_PCIEIOUNIT0_ASPM_DIRECT_SUPPORT_SHIFT (0) +struct mpi3_pcie_io_unit1_phy_data { + u8 link; + u8 link_flags; + u8 phy_flags; + u8 max_min_link_rate; + __le32 reserved04; + __le32 reserved08; +}; +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_MASK (0x03) +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_DIS_SEPARATE_REFCLK (0x00) +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_EN_SRIS (0x01) +#define MPI3_PCIEIOUNIT1_LINKFLAGS_PCIE_CLK_MODE_EN_SRNS (0x02) +#define MPI3_PCIEIOUNIT1_PHYFLAGS_PHY_DISABLE (0x08) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_MASK (0xf0) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_SHIFT (4) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_2_5 (0x20) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_5_0 (0x30) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_8_0 (0x40) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_16_0 (0x50) +#define MPI3_PCIEIOUNIT1_MMLR_MAX_RATE_32_0 (0x60) +#ifndef MPI3_PCIE_IO_UNIT1_PHY_MAX +#define MPI3_PCIE_IO_UNIT1_PHY_MAX (1) +#endif +struct mpi3_pcie_io_unit_page1 { + struct 
mpi3_config_page_header header; + __le32 control_flags; + __le32 reserved0c; + u8 num_phys; + u8 reserved11; + u8 aspm; + u8 reserved13; + struct mpi3_pcie_io_unit1_phy_data phy_data[MPI3_PCIE_IO_UNIT1_PHY_MAX]; +}; +#define MPI3_PCIEIOUNIT1_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_OVERRIDE_DISABLE (0x80) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_DISABLE (0x40) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_MASK (0x30) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SHIFT (4) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SRIS_SRNS_DISABLED (0x00) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SRIS_ENABLED (0x10) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_CLOCK_OVERRIDE_MODE_SRNS_ENABLED (0x20) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MASK (0x0f) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_2_5 (0x02) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_5_0 (0x03) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_8_0 (0x04) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_16_0 (0x05) +#define MPI3_PCIEIOUNIT1_CONTROL_FLAGS_LINK_RATE_OVERRIDE_MAX_32_0 (0x06) +#define MPI3_PCIEIOUNIT1_ASPM_SWITCH_MASK (0x0c) +#define MPI3_PCIEIOUNIT1_ASPM_SWITCH_SHIFT (2) +#define MPI3_PCIEIOUNIT1_ASPM_DIRECT_MASK (0x03) +#define MPI3_PCIEIOUNIT1_ASPM_DIRECT_SHIFT (0) +struct mpi3_pcie_io_unit_page2 { + struct mpi3_config_page_header header; + __le16 nvme_max_q_dx1; + __le16 nvme_max_q_dx2; + u8 nvme_abort_to; + u8 reserved0d; + __le16 nvme_max_q_dx4; +}; +#define MPI3_PCIEIOUNIT2_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT3_ERROR_RECEIVER_ERROR (0) +#define MPI3_PCIEIOUNIT3_ERROR_RECOVERY (1) +#define MPI3_PCIEIOUNIT3_ERROR_CORRECTABLE_ERROR_MSG (2) +#define MPI3_PCIEIOUNIT3_ERROR_BAD_DLLP (3) +#define MPI3_PCIEIOUNIT3_ERROR_BAD_TLP (4) +#define MPI3_PCIEIOUNIT3_NUM_ERROR_INDEX (5) +struct mpi3_pcie_io_unit3_error { + __le16 threshold_count; + __le16 reserved02; +}; +struct mpi3_pcie_io_unit_page3 { + struct mpi3_config_page_header header; + u8 threshold_window; + u8 threshold_action; + u8 escalation_count; + u8 escalation_action; + u8 num_errors; + u8 reserved0d[3]; + struct mpi3_pcie_io_unit3_error error[MPI3_PCIEIOUNIT3_NUM_ERROR_INDEX]; +}; +#define MPI3_PCIEIOUNIT3_PAGEVERSION (0x00) +#define MPI3_PCIEIOUNIT3_ACTION_NO_ACTION (0x00) +#define MPI3_PCIEIOUNIT3_ACTION_HOT_RESET (0x01) +#define MPI3_PCIEIOUNIT3_ACTION_REDUCE_LINK_RATE_ONLY (0x02) +#define MPI3_PCIEIOUNIT3_ACTION_REDUCE_LINK_RATE_NO_ACCESS (0x03) +struct mpi3_pcie_switch_page0 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 switch_status; + u8 reserved0a[2]; + __le16 dev_handle; + __le16 parent_dev_handle; + u8 num_ports; + u8 pcie_level; + __le16 reserved12; + __le32 reserved14; + __le32 reserved18; + __le32 reserved1c; +}; +#define MPI3_PCIESWITCH0_PAGEVERSION (0x00) +#define MPI3_PCIESWITCH0_SS_NOT_RESPONDING (0x02) +#define MPI3_PCIESWITCH0_SS_RESPONDING (0x03) +#define MPI3_PCIESWITCH0_SS_DELAY_NOT_RESPONDING (0x04) +struct mpi3_pcie_switch_page1 { + struct mpi3_config_page_header header; + u8 io_unit_port; + u8 flags; + __le16 reserved0a; + u8 num_ports; + u8 port_num; + __le16 attached_dev_handle; + __le16 switch_dev_handle; + u8 negotiated_port_width; + u8 negotiated_link_rate; + __le16 slot; + __le16 slot_index; + __le32 reserved18; +}; +#define MPI3_PCIESWITCH1_PAGEVERSION (0x00) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSTATE_MASK (0x0c) +#define 
MPI3_PCIESWITCH1_FLAGS_ASPMSTATE_SHIFT (2) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSUPPORT_MASK (0x03) +#define MPI3_PCIESWITCH1_FLAGS_ASPMSUPPORT_SHIFT (0) +#ifndef MPI3_PCIESWITCH2_MAX_NUM_PORTS +#define MPI3_PCIESWITCH2_MAX_NUM_PORTS (1) +#endif +struct mpi3_pcieswitch2_port_element { + __le16 link_change_count; + __le16 rate_change_count; + __le32 reserved04; +}; +struct mpi3_pcie_switch_page2 { + struct mpi3_config_page_header header; + u8 num_ports; + u8 reserved09; + __le16 dev_handle; + __le32 reserved0c; + struct mpi3_pcieswitch2_port_element port[MPI3_PCIESWITCH2_MAX_NUM_PORTS]; +}; +#define MPI3_PCIESWITCH2_PAGEVERSION (0x00) +struct mpi3_pcie_link_page0 { + struct mpi3_config_page_header header; + u8 link; + u8 reserved09[3]; + __le32 reserved0c; + __le32 receiver_error_count; + __le32 recovery_count; + __le32 corr_error_msg_count; + __le32 non_fatal_error_msg_count; + __le32 fatal_error_msg_count; + __le32 non_fatal_error_count; + __le32 fatal_error_count; + __le32 bad_dllp_count; + __le32 bad_tlp_count; +}; +#define MPI3_PCIELINK0_PAGEVERSION (0x00) +struct mpi3_enclosure_page0 { + struct mpi3_config_page_header header; + __le64 enclosure_logical_id; + __le16 flags; + __le16 enclosure_handle; + __le16 num_slots; + __le16 reserved16; + u8 io_unit_port; + u8 enclosure_level; + __le16 sep_dev_handle; + u8 chassis_slot; + u8 reserved1d[3]; +}; +#define MPI3_ENCLOSURE0_PAGEVERSION (0x00) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_MASK (0xc000) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_VIRTUAL (0x0000) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_SAS (0x4000) +#define MPI3_ENCLS0_FLAGS_ENCL_TYPE_PCIE (0x8000) +#define MPI3_ENCLS0_FLAGS_CHASSIS_SLOT_VALID (0x0020) +#define MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT_MASK (0x0010) +#define MPI3_ENCLS0_FLAGS_ENCL_DEV_NOT_FOUND (0x0000) +#define MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT (0x0010) +#define MPI3_ENCLS0_FLAGS_MNG_MASK (0x000f) +#define MPI3_ENCLS0_FLAGS_MNG_UNKNOWN (0x0000) +#define MPI3_ENCLS0_FLAGS_MNG_IOC_SES (0x0001) +#define MPI3_ENCLS0_FLAGS_MNG_SES_ENCLOSURE (0x0002) +#define MPI3_DEVICE_DEVFORM_SAS_SATA (0x00) +#define MPI3_DEVICE_DEVFORM_PCIE (0x01) +#define MPI3_DEVICE_DEVFORM_VD (0x02) +struct mpi3_device0_sas_sata_format { + __le64 sas_address; + __le16 flags; + __le16 device_info; + u8 phy_num; + u8 attached_phy_identifier; + u8 max_port_connections; + u8 zone_group; +}; +#define MPI3_DEVICE0_SASSATA_FLAGS_WRITE_SAME_UNMAP_NCQ (0x0400) +#define MPI3_DEVICE0_SASSATA_FLAGS_SLUMBER_CAP (0x0200) +#define MPI3_DEVICE0_SASSATA_FLAGS_PARTIAL_CAP (0x0100) +#define MPI3_DEVICE0_SASSATA_FLAGS_ASYNC_NOTIFY (0x0080) +#define MPI3_DEVICE0_SASSATA_FLAGS_SW_PRESERVE (0x0040) +#define MPI3_DEVICE0_SASSATA_FLAGS_UNSUPP_DEV (0x0020) +#define MPI3_DEVICE0_SASSATA_FLAGS_48BIT_LBA (0x0010) +#define MPI3_DEVICE0_SASSATA_FLAGS_SMART_SUPP (0x0008) +#define MPI3_DEVICE0_SASSATA_FLAGS_NCQ_SUPP (0x0004) +#define MPI3_DEVICE0_SASSATA_FLAGS_FUA_SUPP (0x0002) +#define MPI3_DEVICE0_SASSATA_FLAGS_PERSIST_CAP (0x0001) +struct mpi3_device0_pcie_format { + u8 supported_link_rates; + u8 max_port_width; + u8 negotiated_port_width; + u8 negotiated_link_rate; + u8 port_num; + u8 controller_reset_to; + __le16 device_info; + __le32 maximum_data_transfer_size; + __le32 capabilities; + __le16 noiob; + u8 nvme_abort_to; + u8 page_size; + __le16 shutdown_latency; + u8 recovery_info; + u8 reserved17; +}; +#define MPI3_DEVICE0_PCIE_LINK_RATE_32_0_SUPP (0x10) +#define MPI3_DEVICE0_PCIE_LINK_RATE_16_0_SUPP (0x08) +#define MPI3_DEVICE0_PCIE_LINK_RATE_8_0_SUPP (0x04) +#define 
MPI3_DEVICE0_PCIE_LINK_RATE_5_0_SUPP (0x02) +#define MPI3_DEVICE0_PCIE_LINK_RATE_2_5_SUPP (0x01) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK (0x0007) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NO_DEVICE (0x0000) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE (0x0001) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_SWITCH_DEVICE (0x0002) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_SCSI_DEVICE (0x0003) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_ASPM_MASK (0x0030) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_ASPM_SHIFT (4) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_MASK (0x00c0) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_SHIFT (6) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_0 (0x0000) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_1 (0x0040) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_2 (0x0080) +#define MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_3 (0x00c0) +#define MPI3_DEVICE0_PCIE_CAP_SGL_EXTRA_LENGTH_SUPPORTED (0x00000020) +#define MPI3_DEVICE0_PCIE_CAP_METADATA_SEPARATED (0x00000010) +#define MPI3_DEVICE0_PCIE_CAP_SGL_DWORD_ALIGN_REQUIRED (0x00000008) +#define MPI3_DEVICE0_PCIE_CAP_SGL_FORMAT_SGL (0x00000004) +#define MPI3_DEVICE0_PCIE_CAP_SGL_FORMAT_PRP (0x00000000) +#define MPI3_DEVICE0_PCIE_CAP_BIT_BUCKET_SGL_SUPP (0x00000002) +#define MPI3_DEVICE0_PCIE_CAP_SGL_SUPP (0x00000001) +#define MPI3_DEVICE0_PCIE_CAP_ASPM_MASK (0x000000c0) +#define MPI3_DEVICE0_PCIE_CAP_ASPM_SHIFT (6) +#define MPI3_DEVICE0_PCIE_RECOVER_METHOD_MASK (0xe0) +#define MPI3_DEVICE0_PCIE_RECOVER_METHOD_NS_MGMT (0x00) +#define MPI3_DEVICE0_PCIE_RECOVER_METHOD_FORMAT (0x20) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_MASK (0x1f) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_NO_NS (0x00) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_NO_NSID_1 (0x01) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_TOO_MANY_NS (0x02) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_PROTECTION (0x03) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_METADATA_SZ (0x04) +#define MPI3_DEVICE0_PCIE_RECOVER_REASON_LBA_DATA_SZ (0x05) +struct mpi3_device0_vd_format { + u8 vd_state; + u8 raid_level; + __le16 device_info; + __le16 flags; + __le16 io_throttle_group; + __le16 io_throttle_group_low; + __le16 io_throttle_group_high; + __le32 reserved0c; +}; +#define MPI3_DEVICE0_VD_STATE_OFFLINE (0x00) +#define MPI3_DEVICE0_VD_STATE_PARTIALLY_DEGRADED (0x01) +#define MPI3_DEVICE0_VD_STATE_DEGRADED (0x02) +#define MPI3_DEVICE0_VD_STATE_OPTIMAL (0x03) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_0 (0) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_1 (1) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_5 (5) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_6 (6) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_10 (10) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_50 (50) +#define MPI3_DEVICE0_VD_RAIDLEVEL_RAID_60 (60) +#define MPI3_DEVICE0_VD_DEVICE_INFO_HDD (0x0010) +#define MPI3_DEVICE0_VD_DEVICE_INFO_SSD (0x0008) +#define MPI3_DEVICE0_VD_DEVICE_INFO_NVME (0x0004) +#define MPI3_DEVICE0_VD_DEVICE_INFO_SATA (0x0002) +#define MPI3_DEVICE0_VD_DEVICE_INFO_SAS (0x0001) +#define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK (0xf000) +#define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_SHIFT (12) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_MASK (0x0003) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_NONE (0x0000) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_HOST (0x0001) +#define MPI3_DEVICE0_VD_FLAGS_METADATA_MODE_IOC (0x0002) +union mpi3_device0_dev_spec_format { + struct mpi3_device0_sas_sata_format sas_sata_format; + struct mpi3_device0_pcie_format pcie_format; + struct mpi3_device0_vd_format vd_format; +}; +struct 
mpi3_device_page0 { + struct mpi3_config_page_header header; + __le16 dev_handle; + __le16 parent_dev_handle; + __le16 slot; + __le16 enclosure_handle; + __le64 wwid; + __le16 persistent_id; + u8 io_unit_port; + u8 access_status; + __le16 flags; + __le16 reserved1e; + __le16 slot_index; + __le16 queue_depth; + u8 reserved24[3]; + u8 device_form; + union mpi3_device0_dev_spec_format device_specific; +}; +#define MPI3_DEVICE0_PAGEVERSION (0x00) +#define MPI3_DEVICE0_PARENT_INVALID (0xffff) +#define MPI3_DEVICE0_ENCLOSURE_HANDLE_NO_ENCLOSURE (0x0000) +#define MPI3_DEVICE0_WWID_INVALID (0xffffffffffffffff) +#define MPI3_DEVICE0_PERSISTENTID_INVALID (0xffff) +#define MPI3_DEVICE0_IOUNITPORT_INVALID (0xff) +#define MPI3_DEVICE0_ASTATUS_NO_ERRORS (0x00) +#define MPI3_DEVICE0_ASTATUS_NEEDS_INITIALIZATION (0x01) +#define MPI3_DEVICE0_ASTATUS_CAP_UNSUPPORTED (0x02) +#define MPI3_DEVICE0_ASTATUS_DEVICE_BLOCKED (0x03) +#define MPI3_DEVICE0_ASTATUS_UNAUTHORIZED (0x04) +#define MPI3_DEVICE0_ASTATUS_DEVICE_MISSING_DELAY (0x05) +#define MPI3_DEVICE0_ASTATUS_PREPARE (0x06) +#define MPI3_DEVICE0_ASTATUS_SAFE_MODE (0x07) +#define MPI3_DEVICE0_ASTATUS_GENERIC_MAX (0x0f) +#define MPI3_DEVICE0_ASTATUS_SAS_UNKNOWN (0x10) +#define MPI3_DEVICE0_ASTATUS_ROUTE_NOT_ADDRESSABLE (0x11) +#define MPI3_DEVICE0_ASTATUS_SMP_ERROR_NOT_ADDRESSABLE (0x12) +#define MPI3_DEVICE0_ASTATUS_SAS_MAX (0x1f) +#define MPI3_DEVICE0_ASTATUS_SIF_UNKNOWN (0x20) +#define MPI3_DEVICE0_ASTATUS_SIF_AFFILIATION_CONFLICT (0x21) +#define MPI3_DEVICE0_ASTATUS_SIF_DIAG (0x22) +#define MPI3_DEVICE0_ASTATUS_SIF_IDENTIFICATION (0x23) +#define MPI3_DEVICE0_ASTATUS_SIF_CHECK_POWER (0x24) +#define MPI3_DEVICE0_ASTATUS_SIF_PIO_SN (0x25) +#define MPI3_DEVICE0_ASTATUS_SIF_MDMA_SN (0x26) +#define MPI3_DEVICE0_ASTATUS_SIF_UDMA_SN (0x27) +#define MPI3_DEVICE0_ASTATUS_SIF_ZONING_VIOLATION (0x28) +#define MPI3_DEVICE0_ASTATUS_SIF_NOT_ADDRESSABLE (0x29) +#define MPI3_DEVICE0_ASTATUS_SIF_MAX (0x2f) +#define MPI3_DEVICE0_ASTATUS_PCIE_UNKNOWN (0x30) +#define MPI3_DEVICE0_ASTATUS_PCIE_MEM_SPACE_ACCESS (0x31) +#define MPI3_DEVICE0_ASTATUS_PCIE_UNSUPPORTED (0x32) +#define MPI3_DEVICE0_ASTATUS_PCIE_MSIX_REQUIRED (0x33) +#define MPI3_DEVICE0_ASTATUS_PCIE_ECRC_REQUIRED (0x34) +#define MPI3_DEVICE0_ASTATUS_PCIE_MAX (0x3f) +#define MPI3_DEVICE0_ASTATUS_NVME_UNKNOWN (0x40) +#define MPI3_DEVICE0_ASTATUS_NVME_READY_TIMEOUT (0x41) +#define MPI3_DEVICE0_ASTATUS_NVME_DEVCFG_UNSUPPORTED (0x42) +#define MPI3_DEVICE0_ASTATUS_NVME_IDENTIFY_FAILED (0x43) +#define MPI3_DEVICE0_ASTATUS_NVME_QCONFIG_FAILED (0x44) +#define MPI3_DEVICE0_ASTATUS_NVME_QCREATION_FAILED (0x45) +#define MPI3_DEVICE0_ASTATUS_NVME_EVENTCFG_FAILED (0x46) +#define MPI3_DEVICE0_ASTATUS_NVME_GET_FEATURE_STAT_FAILED (0x47) +#define MPI3_DEVICE0_ASTATUS_NVME_IDLE_TIMEOUT (0x48) +#define MPI3_DEVICE0_ASTATUS_NVME_CTRL_FAILURE_STATUS (0x49) +#define MPI3_DEVICE0_ASTATUS_NVME_INSUFFICIENT_POWER (0x4a) +#define MPI3_DEVICE0_ASTATUS_NVME_DOORBELL_STRIDE (0x4b) +#define MPI3_DEVICE0_ASTATUS_NVME_MEM_PAGE_MIN_SIZE (0x4c) +#define MPI3_DEVICE0_ASTATUS_NVME_MEMORY_ALLOCATION (0x4d) +#define MPI3_DEVICE0_ASTATUS_NVME_COMPLETION_TIME (0x4e) +#define MPI3_DEVICE0_ASTATUS_NVME_BAR (0x4f) +#define MPI3_DEVICE0_ASTATUS_NVME_NS_DESCRIPTOR (0x50) +#define MPI3_DEVICE0_ASTATUS_NVME_INCOMPATIBLE_SETTINGS (0x51) +#define MPI3_DEVICE0_ASTATUS_NVME_TOO_MANY_ERRORS (0x52) +#define MPI3_DEVICE0_ASTATUS_NVME_MAX (0x5f) +#define MPI3_DEVICE0_ASTATUS_VD_UNKNOWN (0x80) +#define MPI3_DEVICE0_ASTATUS_VD_MAX (0x8f) +#define 
MPI3_DEVICE0_FLAGS_CONTROLLER_DEV_HANDLE (0x0080) +#define MPI3_DEVICE0_FLAGS_IO_THROTTLING_REQUIRED (0x0010) +#define MPI3_DEVICE0_FLAGS_HIDDEN (0x0008) +#define MPI3_DEVICE0_FLAGS_ATT_METHOD_VIRTUAL (0x0004) +#define MPI3_DEVICE0_FLAGS_ATT_METHOD_DIR_ATTACHED (0x0002) +#define MPI3_DEVICE0_FLAGS_DEVICE_PRESENT (0x0001) +#define MPI3_DEVICE0_QUEUE_DEPTH_NOT_APPLICABLE (0x0000) +struct mpi3_device1_sas_sata_format { + __le32 reserved00; +}; +struct mpi3_device1_pcie_format { + __le16 vendor_id; + __le16 device_id; + __le16 subsystem_vendor_id; + __le16 subsystem_id; + __le32 reserved08; + u8 revision_id; + u8 reserved0d; + __le16 pci_parameters; +}; +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_128B (0x0) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_256B (0x1) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_512B (0x2) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_1024B (0x3) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_2048B (0x4) +#define MPI3_DEVICE1_PCIE_PARAMS_DATA_SIZE_4096B (0x5) +#define MPI3_DEVICE1_PCIE_PARAMS_MAX_READ_REQ_MASK (0x01c0) +#define MPI3_DEVICE1_PCIE_PARAMS_MAX_READ_REQ_SHIFT (6) +#define MPI3_DEVICE1_PCIE_PARAMS_CURR_MAX_PAYLOAD_MASK (0x0038) +#define MPI3_DEVICE1_PCIE_PARAMS_CURR_MAX_PAYLOAD_SHIFT (3) +#define MPI3_DEVICE1_PCIE_PARAMS_SUPP_MAX_PAYLOAD_MASK (0x0007) +#define MPI3_DEVICE1_PCIE_PARAMS_SUPP_MAX_PAYLOAD_SHIFT (0) +struct mpi3_device1_vd_format { + __le32 reserved00; +}; +union mpi3_device1_dev_spec_format { + struct mpi3_device1_sas_sata_format sas_sata_format; + struct mpi3_device1_pcie_format pcie_format; + struct mpi3_device1_vd_format vd_format; +}; +struct mpi3_device_page1 { + struct mpi3_config_page_header header; + __le16 dev_handle; + __le16 reserved0a; + __le16 link_change_count; + __le16 rate_change_count; + __le16 tm_count; + __le16 reserved12; + __le32 reserved14[10]; + u8 reserved3c[3]; + u8 device_form; + union mpi3_device1_dev_spec_format device_specific; +}; +#define MPI3_DEVICE1_PAGEVERSION (0x00) +#define MPI3_DEVICE1_COUNTER_MAX (0xfffe) +#define MPI3_DEVICE1_COUNTER_INVALID (0xffff) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_image.h b/drivers/scsi/mpi3mr/mpi/mpi30_image.h new file mode 100644 index 0000000000000..0d329eb74e083 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_image.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_IMAGE_H +#define MPI30_IMAGE_H 1 +struct mpi3_comp_image_version { + __le16 build_num; + __le16 customer_id; + u8 phase_minor; + u8 phase_major; + u8 gen_minor; + u8 gen_major; +}; +struct mpi3_hash_exclusion_format { + __le32 offset; + __le32 size; +}; +#define MPI3_IMAGE_HASH_EXCUSION_NUM (4) +struct mpi3_component_image_header { + __le32 signature0; + __le32 load_address; + __le32 data_size; + __le32 start_offset; + __le32 signature1; + __le32 flash_offset; + __le32 image_size; + __le32 version_string_offset; + __le32 build_date_string_offset; + __le32 build_time_string_offset; + __le32 environment_variable_offset; + __le32 application_specific; + __le32 signature2; + __le32 header_size; + __le32 crc; + __le32 flags; + __le32 secondary_flash_offset; + __le32 etp_offset; + __le32 etp_size; + union mpi3_version_union rmc_interface_version; + union mpi3_version_union etp_interface_version; + struct mpi3_comp_image_version component_image_version; + struct mpi3_hash_exclusion_format hash_exclusion[MPI3_IMAGE_HASH_EXCUSION_NUM]; + __le32 next_image_header_offset; + union mpi3_version_union security_version; + __le32 reserved84[31]; +}; +#define MPI3_IMAGE_HEADER_SIGNATURE0_MPI3 (0xeb00003e) +#define MPI3_IMAGE_HEADER_LOAD_ADDRESS_INVALID (0x00000000) +#define MPI3_IMAGE_HEADER_SIGNATURE1_APPLICATION (0x20505041) +#define MPI3_IMAGE_HEADER_SIGNATURE1_FIRST_MUTABLE (0x20434d46) +#define MPI3_IMAGE_HEADER_SIGNATURE1_BSP (0x20505342) +#define MPI3_IMAGE_HEADER_SIGNATURE1_ROM_BIOS (0x534f4942) +#define MPI3_IMAGE_HEADER_SIGNATURE1_HII_X64 (0x4d494948) +#define MPI3_IMAGE_HEADER_SIGNATURE1_HII_ARM (0x41494948) +#define MPI3_IMAGE_HEADER_SIGNATURE1_CPLD (0x444c5043) +#define MPI3_IMAGE_HEADER_SIGNATURE1_SPD (0x20445053) +#define MPI3_IMAGE_HEADER_SIGNATURE1_GAS_GAUGE (0x20534147) +#define MPI3_IMAGE_HEADER_SIGNATURE1_PBLP (0x504c4250) +#define MPI3_IMAGE_HEADER_SIGNATURE1_MANIFEST (0x464e414d) +#define MPI3_IMAGE_HEADER_SIGNATURE1_OEM (0x204d454f) +#define MPI3_IMAGE_HEADER_SIGNATURE2_VALUE (0x50584546) +#define MPI3_IMAGE_HEADER_FLAGS_DEVICE_KEY_BASIS_MASK (0x00000030) +#define MPI3_IMAGE_HEADER_FLAGS_DEVICE_KEY_BASIS_CDI (0x00000000) +#define MPI3_IMAGE_HEADER_FLAGS_DEVICE_KEY_BASIS_DI (0x00000010) +#define MPI3_IMAGE_HEADER_FLAGS_SIGNED_NVDATA (0x00000008) +#define MPI3_IMAGE_HEADER_FLAGS_REQUIRES_ACTIVATION (0x00000004) +#define MPI3_IMAGE_HEADER_FLAGS_COMPRESSED (0x00000002) +#define MPI3_IMAGE_HEADER_FLAGS_FLASH (0x00000001) +#define MPI3_IMAGE_HEADER_SIGNATURE0_OFFSET (0x00) +#define MPI3_IMAGE_HEADER_LOAD_ADDRESS_OFFSET (0x04) +#define MPI3_IMAGE_HEADER_DATA_SIZE_OFFSET (0x08) +#define MPI3_IMAGE_HEADER_START_OFFSET_OFFSET (0x0c) +#define MPI3_IMAGE_HEADER_SIGNATURE1_OFFSET (0x10) +#define MPI3_IMAGE_HEADER_FLASH_OFFSET_OFFSET (0x14) +#define MPI3_IMAGE_HEADER_FLASH_SIZE_OFFSET (0x18) +#define MPI3_IMAGE_HEADER_VERSION_STRING_OFFSET_OFFSET (0x1c) +#define MPI3_IMAGE_HEADER_BUILD_DATE_STRING_OFFSET_OFFSET (0x20) +#define MPI3_IMAGE_HEADER_BUILD_TIME_OFFSET_OFFSET (0x24) +#define MPI3_IMAGE_HEADER_ENVIROMENT_VAR_OFFSET_OFFSET (0x28) +#define MPI3_IMAGE_HEADER_APPLICATION_SPECIFIC_OFFSET (0x2c) +#define MPI3_IMAGE_HEADER_SIGNATURE2_OFFSET (0x30) +#define MPI3_IMAGE_HEADER_HEADER_SIZE_OFFSET (0x34) +#define MPI3_IMAGE_HEADER_CRC_OFFSET (0x38) +#define MPI3_IMAGE_HEADER_FLAGS_OFFSET (0x3c) +#define MPI3_IMAGE_HEADER_SECONDARY_FLASH_OFFSET_OFFSET (0x40) +#define MPI3_IMAGE_HEADER_ETP_OFFSET_OFFSET (0x44) +#define MPI3_IMAGE_HEADER_ETP_SIZE_OFFSET (0x48) +#define 
MPI3_IMAGE_HEADER_RMC_INTERFACE_VER_OFFSET (0x4c) +#define MPI3_IMAGE_HEADER_ETP_INTERFACE_VER_OFFSET (0x50) +#define MPI3_IMAGE_HEADER_COMPONENT_IMAGE_VER_OFFSET (0x54) +#define MPI3_IMAGE_HEADER_HASH_EXCLUSION_OFFSET (0x5c) +#define MPI3_IMAGE_HEADER_NEXT_IMAGE_HEADER_OFFSET_OFFSET (0x7c) +#define MPI3_IMAGE_HEADER_SIZE (0x100) +#ifndef MPI3_CI_MANIFEST_MPI_MAX +#define MPI3_CI_MANIFEST_MPI_MAX (1) +#endif +struct mpi3_ci_manifest_mpi_comp_image_ref { + __le32 signature1; + __le32 reserved04[3]; + struct mpi3_comp_image_version component_image_version; + __le32 component_image_version_string_offset; + __le32 crc; +}; +struct mpi3_ci_manifest_mpi { + u8 manifest_type; + u8 reserved01[3]; + __le32 reserved04[3]; + u8 num_image_references; + u8 release_level; + __le16 reserved12; + __le16 reserved14; + __le16 flags; + __le32 reserved18[2]; + __le16 vendor_id; + __le16 device_id; + __le16 subsystem_vendor_id; + __le16 subsystem_id; + __le32 reserved28[2]; + union mpi3_version_union package_security_version; + __le32 reserved34; + struct mpi3_comp_image_version package_version; + __le32 package_version_string_offset; + __le32 package_build_date_string_offset; + __le32 package_build_time_string_offset; + __le32 reserved4c; + __le32 diag_authorization_identifier[16]; + struct mpi3_ci_manifest_mpi_comp_image_ref component_image_ref[MPI3_CI_MANIFEST_MPI_MAX]; +}; +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_DEV (0x00) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_PREALPHA (0x10) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_ALPHA (0x20) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_BETA (0x30) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_RC (0x40) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_GCA (0x50) +#define MPI3_CI_MANIFEST_MPI_RELEASE_LEVEL_POINT (0x60) +#define MPI3_CI_MANIFEST_MPI_FLAGS_DIAG_AUTHORIZATION (0x01) +#define MPI3_CI_MANIFEST_MPI_SUBSYSTEMID_IGNORED (0xffff) +#define MPI3_CI_MANIFEST_MPI_PKG_VER_STR_OFF_UNSPECIFIED (0x00000000) +#define MPI3_CI_MANIFEST_MPI_PKG_BUILD_DATE_STR_OFF_UNSPECIFIED (0x00000000) +#define MPI3_CI_MANIFEST_MPI_PKG_BUILD_TIME_STR_OFF_UNSPECIFIED (0x00000000) +union mpi3_ci_manifest { + struct mpi3_ci_manifest_mpi mpi; + __le32 dword[1]; +}; +#define MPI3_CI_MANIFEST_TYPE_MPI (0x00) +struct mpi3_extended_image_header { + u8 image_type; + u8 reserved01[3]; + __le32 checksum; + __le32 image_size; + __le32 next_image_header_offset; + __le32 reserved10[4]; + __le32 identify_string[8]; +}; +#define MPI3_EXT_IMAGE_IMAGETYPE_OFFSET (0x00) +#define MPI3_EXT_IMAGE_IMAGESIZE_OFFSET (0x08) +#define MPI3_EXT_IMAGE_NEXTIMAGE_OFFSET (0x0c) +#define MPI3_EXT_IMAGE_HEADER_SIZE (0x40) +#define MPI3_EXT_IMAGE_TYPE_UNSPECIFIED (0x00) +#define MPI3_EXT_IMAGE_TYPE_NVDATA (0x03) +#define MPI3_EXT_IMAGE_TYPE_SUPPORTED_DEVICES (0x07) +#define MPI3_EXT_IMAGE_TYPE_ENCRYPTED_HASH (0x09) +#define MPI3_EXT_IMAGE_TYPE_RDE (0x0a) +#define MPI3_EXT_IMAGE_TYPE_AUXILIARY_PROCESSOR (0x0b) +#define MPI3_EXT_IMAGE_TYPE_MIN_PRODUCT_SPECIFIC (0x80) +#define MPI3_EXT_IMAGE_TYPE_MAX_PRODUCT_SPECIFIC (0xff) +struct mpi3_supported_device { + __le16 device_id; + __le16 vendor_id; + __le16 device_id_mask; + __le16 reserved06; + u8 low_pci_rev; + u8 high_pci_rev; + __le16 reserved0a; + __le32 reserved0c; +}; +#ifndef MPI3_SUPPORTED_DEVICE_MAX +#define MPI3_SUPPORTED_DEVICE_MAX (1) +#endif +struct mpi3_supported_devices_data { + u8 image_version; + u8 reserved01; + u8 num_devices; + u8 reserved03; + __le32 reserved04; + struct mpi3_supported_device supported_device[MPI3_SUPPORTED_DEVICE_MAX]; +}; +#ifndef 
MPI3_ENCRYPTED_HASH_MAX +#define MPI3_ENCRYPTED_HASH_MAX (1) +#endif +struct mpi3_encrypted_hash_entry { + u8 hash_image_type; + u8 hash_algorithm; + u8 encryption_algorithm; + u8 reserved03; + __le32 reserved04; + __le32 encrypted_hash[MPI3_ENCRYPTED_HASH_MAX]; +}; +#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_SIGNATURE (0x03) +#define MPI3_HASH_ALGORITHM_VERSION_MASK (0xe0) +#define MPI3_HASH_ALGORITHM_VERSION_NONE (0x00) +#define MPI3_HASH_ALGORITHM_VERSION_SHA1 (0x20) +#define MPI3_HASH_ALGORITHM_VERSION_SHA2 (0x40) +#define MPI3_HASH_ALGORITHM_VERSION_SHA3 (0x60) +#define MPI3_HASH_ALGORITHM_SIZE_MASK (0x1f) +#define MPI3_HASH_ALGORITHM_SIZE_UNUSED (0x00) +#define MPI3_HASH_ALGORITHM_SIZE_SHA256 (0x01) +#define MPI3_HASH_ALGORITHM_SIZE_SHA512 (0x02) +#define MPI3_HASH_ALGORITHM_SIZE_SHA384 (0x03) +#define MPI3_ENCRYPTION_ALGORITHM_UNUSED (0x00) +#define MPI3_ENCRYPTION_ALGORITHM_RSA256 (0x01) +#define MPI3_ENCRYPTION_ALGORITHM_RSA512 (0x02) +#define MPI3_ENCRYPTION_ALGORITHM_RSA1024 (0x03) +#define MPI3_ENCRYPTION_ALGORITHM_RSA2048 (0x04) +#define MPI3_ENCRYPTION_ALGORITHM_RSA4096 (0x05) +#define MPI3_ENCRYPTION_ALGORITHM_RSA3072 (0x06) +#ifndef MPI3_PUBLIC_KEY_MAX +#define MPI3_PUBLIC_KEY_MAX (1) +#endif +struct mpi3_encrypted_key_with_hash_entry { + u8 hash_image_type; + u8 hash_algorithm; + u8 encryption_algorithm; + u8 reserved03; + __le32 reserved04; + __le32 public_key[MPI3_PUBLIC_KEY_MAX]; +}; +#ifndef MPI3_ENCRYPTED_HASH_ENTRY_MAX +#define MPI3_ENCRYPTED_HASH_ENTRY_MAX (1) +#endif +struct mpi3_encrypted_hash_data { + u8 image_version; + u8 num_hash; + __le16 reserved02; + __le32 reserved04; + struct mpi3_encrypted_hash_entry encrypted_hash_entry[MPI3_ENCRYPTED_HASH_ENTRY_MAX]; +}; +#ifndef MPI3_AUX_PROC_DATA_MAX +#define MPI3_AUX_PROC_DATA_MAX (1) +#endif +struct mpi3_aux_processor_data { + u8 boot_method; + u8 num_load_addr; + u8 reserved02; + u8 type; + __le32 version; + __le32 load_address[8]; + __le32 reserved28[22]; + __le32 aux_processor_data[MPI3_AUX_PROC_DATA_MAX]; +}; +#define MPI3_AUX_PROC_DATA_OFFSET (0x80) +#define MPI3_AUXPROCESSOR_BOOT_METHOD_MO_MSG (0x00) +#define MPI3_AUXPROCESSOR_BOOT_METHOD_MO_DOORBELL (0x01) +#define MPI3_AUXPROCESSOR_BOOT_METHOD_COMPONENT (0x02) +#define MPI3_AUXPROCESSOR_TYPE_ARM_A15 (0x00) +#define MPI3_AUXPROCESSOR_TYPE_ARM_M0 (0x01) +#define MPI3_AUXPROCESSOR_TYPE_ARM_R4 (0x02) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_init.h b/drivers/scsi/mpi3mr/mpi/mpi30_init.h new file mode 100644 index 0000000000000..f3ae04ac86608 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_init.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_INIT_H +#define MPI30_INIT_H 1 +struct mpi3_scsi_io_cdb_eedp32 { + u8 cdb[20]; + __be32 primary_reference_tag; + __le16 primary_application_tag; + __le16 primary_application_tag_mask; + __le32 transfer_length; +}; +union mpi3_scsi_io_cdb_union { + u8 cdb32[32]; + struct mpi3_scsi_io_cdb_eedp32 eedp32; + struct mpi3_sge_common sge; +}; +struct mpi3_scsi_io_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le32 flags; + __le32 skip_count; + __le32 data_length; + u8 lun[8]; + union mpi3_scsi_io_cdb_union cdb; + union mpi3_sge_union sgl[4]; +}; +#define MPI3_SCSIIO_MSGFLAGS_METASGL_VALID (0x80) +#define MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE (0x40) +#define MPI3_SCSIIO_FLAGS_LARGE_CDB (0x60000000) +#define MPI3_SCSIIO_FLAGS_CDB_16_OR_LESS (0x00000000) +#define MPI3_SCSIIO_FLAGS_CDB_GREATER_THAN_16 (0x20000000) +#define MPI3_SCSIIO_FLAGS_CDB_IN_SEPARATE_BUFFER (0x40000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_MASK (0x07000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_SIMPLEQ (0x00000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_HEADOFQ (0x01000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_ORDEREDQ (0x02000000) +#define MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_ACAQ (0x04000000) +#define MPI3_SCSIIO_FLAGS_CMDPRI_MASK (0x00f00000) +#define MPI3_SCSIIO_FLAGS_CMDPRI_SHIFT (20) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_MASK (0x000c0000) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_NO_DATA_TRANSFER (0x00000000) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_WRITE (0x00040000) +#define MPI3_SCSIIO_FLAGS_DATADIRECTION_READ (0x00080000) +#define MPI3_SCSIIO_FLAGS_DMAOPERATION_MASK (0x00030000) +#define MPI3_SCSIIO_FLAGS_DMAOPERATION_HOST_PI (0x00010000) +#define MPI3_SCSIIO_FLAGS_DIVERT_REASON_MASK (0x000000f0) +#define MPI3_SCSIIO_FLAGS_DIVERT_REASON_IO_THROTTLING (0x00000010) +#define MPI3_SCSIIO_FLAGS_DIVERT_REASON_PROD_SPECIFIC (0x00000080) +#define MPI3_SCSIIO_METASGL_INDEX (3) +struct mpi3_scsi_io_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 scsi_status; + u8 scsi_state; + __le16 dev_handle; + __le32 transfer_count; + __le32 sense_count; + __le32 response_data; + __le16 task_tag; + __le16 scsi_status_qualifier; + __le32 eedp_error_offset; + __le16 eedp_observed_app_tag; + __le16 eedp_observed_guard; + __le32 eedp_observed_ref_tag; + __le64 sense_data_buffer_address; +}; +#define MPI3_SCSIIO_REPLY_MSGFLAGS_REFTAG_OBSERVED_VALID (0x01) +#define MPI3_SCSIIO_REPLY_MSGFLAGS_APPTAG_OBSERVED_VALID (0x02) +#define MPI3_SCSIIO_REPLY_MSGFLAGS_GUARD_OBSERVED_VALID (0x04) +#define MPI3_SCSI_STATUS_GOOD (0x00) +#define MPI3_SCSI_STATUS_CHECK_CONDITION (0x02) +#define MPI3_SCSI_STATUS_CONDITION_MET (0x04) +#define MPI3_SCSI_STATUS_BUSY (0x08) +#define MPI3_SCSI_STATUS_INTERMEDIATE (0x10) +#define MPI3_SCSI_STATUS_INTERMEDIATE_CONDMET (0x14) +#define MPI3_SCSI_STATUS_RESERVATION_CONFLICT (0x18) +#define MPI3_SCSI_STATUS_COMMAND_TERMINATED (0x22) +#define MPI3_SCSI_STATUS_TASK_SET_FULL (0x28) +#define MPI3_SCSI_STATUS_ACA_ACTIVE (0x30) +#define MPI3_SCSI_STATUS_TASK_ABORTED (0x40) +#define MPI3_SCSI_STATE_SENSE_MASK (0x03) +#define MPI3_SCSI_STATE_SENSE_VALID (0x00) +#define MPI3_SCSI_STATE_SENSE_FAILED (0x01) +#define MPI3_SCSI_STATE_SENSE_BUFF_Q_EMPTY (0x02) +#define MPI3_SCSI_STATE_SENSE_NOT_AVAILABLE (0x03) +#define 
MPI3_SCSI_STATE_NO_SCSI_STATUS (0x04) +#define MPI3_SCSI_STATE_TERMINATED (0x08) +#define MPI3_SCSI_STATE_RESPONSE_DATA_VALID (0x10) +#define MPI3_SCSI_RSP_RESPONSECODE_MASK (0x000000ff) +#define MPI3_SCSI_RSP_RESPONSECODE_SHIFT (0) +#define MPI3_SCSI_RSP_ARI2_MASK (0x0000ff00) +#define MPI3_SCSI_RSP_ARI2_SHIFT (8) +#define MPI3_SCSI_RSP_ARI1_MASK (0x00ff0000) +#define MPI3_SCSI_RSP_ARI1_SHIFT (16) +#define MPI3_SCSI_RSP_ARI0_MASK (0xff000000) +#define MPI3_SCSI_RSP_ARI0_SHIFT (24) +#define MPI3_SCSI_TASKTAG_UNKNOWN (0xffff) +struct mpi3_scsi_task_mgmt_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le16 task_host_tag; + u8 task_type; + u8 reserved0f; + __le16 task_request_queue_id; + __le16 reserved12; + __le32 reserved14; + u8 lun[8]; +}; +#define MPI3_SCSITASKMGMT_MSGFLAGS_DO_NOT_SEND_TASK_IU (0x08) +#define MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK (0x01) +#define MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK_SET (0x02) +#define MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET (0x03) +#define MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET (0x05) +#define MPI3_SCSITASKMGMT_TASKTYPE_CLEAR_TASK_SET (0x06) +#define MPI3_SCSITASKMGMT_TASKTYPE_QUERY_TASK (0x07) +#define MPI3_SCSITASKMGMT_TASKTYPE_CLEAR_ACA (0x08) +#define MPI3_SCSITASKMGMT_TASKTYPE_QUERY_TASK_SET (0x09) +#define MPI3_SCSITASKMGMT_TASKTYPE_QUERY_ASYNC_EVENT (0x0a) +#define MPI3_SCSITASKMGMT_TASKTYPE_I_T_NEXUS_RESET (0x0b) +struct mpi3_scsi_task_mgmt_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 termination_count; + __le32 response_data; + __le32 reserved18; +}; +#define MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE (0x00) +#define MPI3_SCSITASKMGMT_RSPCODE_INVALID_FRAME (0x02) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_FUNCTION_NOT_SUPPORTED (0x04) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_FAILED (0x05) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_SUCCEEDED (0x08) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_INVALID_LUN (0x09) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_OVERLAPPED_TAG (0x0a) +#define MPI3_SCSITASKMGMT_RSPCODE_IO_QUEUED_ON_IOC (0x80) +#define MPI3_SCSITASKMGMT_RSPCODE_TM_NVME_DENIED (0x81) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h new file mode 100644 index 0000000000000..9fb27cfcf28b4 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h @@ -0,0 +1,1021 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_IOC_H +#define MPI30_IOC_H 1 +struct mpi3_ioc_init_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + union mpi3_version_union mpi_version; + __le64 time_stamp; + u8 reserved18; + u8 who_init; + __le16 reserved1a; + __le16 reply_free_queue_depth; + __le16 reserved1e; + __le64 reply_free_queue_address; + __le32 reserved28; + __le16 sense_buffer_free_queue_depth; + __le16 sense_buffer_length; + __le64 sense_buffer_free_queue_address; + __le64 driver_information_address; +}; +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_MASK (0x03) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_NOT_USED (0x00) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_SEPARATED (0x01) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_INLINE (0x02) +#define MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_BOTH (0x03) +#define MPI3_WHOINIT_NOT_INITIALIZED (0x00) +#define MPI3_WHOINIT_ROM_BIOS (0x02) +#define MPI3_WHOINIT_HOST_DRIVER (0x03) +#define MPI3_WHOINIT_MANUFACTURER (0x04) +struct mpi3_driver_info_layout { + __le32 information_length; + u8 driver_signature[12]; + u8 os_name[16]; + u8 os_version[12]; + u8 driver_name[20]; + u8 driver_version[32]; + u8 driver_release_date[20]; + __le32 driver_capabilities; +}; +struct mpi3_ioc_facts_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le32 reserved0c; + union mpi3_sge_union sgl; +}; +struct mpi3_ioc_facts_data { + __le16 ioc_facts_data_length; + __le16 reserved02; + union mpi3_version_union mpi_version; + struct mpi3_comp_image_version fw_version; + __le32 ioc_capabilities; + u8 ioc_number; + u8 who_init; + __le16 max_msix_vectors; + __le16 max_outstanding_requests; + __le16 product_id; + __le16 ioc_request_frame_size; + __le16 reply_frame_size; + __le16 ioc_exceptions; + __le16 max_persistent_id; + u8 sge_modifier_mask; + u8 sge_modifier_value; + u8 sge_modifier_shift; + u8 protocol_flags; + __le16 max_sas_initiators; + __le16 max_data_length; + __le16 max_sas_expanders; + __le16 max_enclosures; + __le16 min_dev_handle; + __le16 max_dev_handle; + __le16 max_pcie_switches; + __le16 max_nvme; + __le16 reserved38; + __le16 max_vds; + __le16 max_host_pds; + __le16 max_adv_host_pds; + __le16 max_raid_pds; + __le16 max_posted_cmd_buffers; + __le32 flags; + __le16 max_operational_request_queues; + __le16 max_operational_reply_queues; + __le16 shutdown_timeout; + __le16 reserved4e; + __le32 diag_trace_size; + __le32 diag_fw_size; + __le32 diag_driver_size; + u8 max_host_pd_ns_count; + u8 max_adv_host_pd_ns_count; + u8 max_raidpd_ns_count; + u8 max_devices_per_throttle_group; + __le16 io_throttle_data_length; + __le16 max_io_throttle_group; + __le16 io_throttle_low; + __le16 io_throttle_high; +}; +#define MPI3_IOCFACTS_CAPABILITY_NON_SUPERVISOR_MASK (0x80000000) +#define MPI3_IOCFACTS_CAPABILITY_SUPERVISOR_IOC (0x00000000) +#define MPI3_IOCFACTS_CAPABILITY_NON_SUPERVISOR_IOC (0x80000000) +#define MPI3_IOCFACTS_CAPABILITY_INT_COALESCE_MASK (0x00000600) +#define MPI3_IOCFACTS_CAPABILITY_INT_COALESCE_FIXED_THRESHOLD (0x00000000) +#define MPI3_IOCFACTS_CAPABILITY_INT_COALESCE_OUTSTANDING_IO (0x00000200) +#define MPI3_IOCFACTS_CAPABILITY_COMPLETE_RESET_CAPABLE (0x00000100) +#define MPI3_IOCFACTS_CAPABILITY_SEG_DIAG_TRACE_ENABLED (0x00000080) +#define MPI3_IOCFACTS_CAPABILITY_SEG_DIAG_FW_ENABLED (0x00000040) +#define 
MPI3_IOCFACTS_CAPABILITY_SEG_DIAG_DRIVER_ENABLED (0x00000020) +#define MPI3_IOCFACTS_CAPABILITY_ADVANCED_HOST_PD_ENABLED (0x00000010) +#define MPI3_IOCFACTS_CAPABILITY_RAID_CAPABLE (0x00000008) +#define MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED (0x00000002) +#define MPI3_IOCFACTS_CAPABILITY_COALESCE_CTRL_SUPPORTED (0x00000001) +#define MPI3_IOCFACTS_PID_TYPE_MASK (0xf000) +#define MPI3_IOCFACTS_PID_TYPE_SHIFT (12) +#define MPI3_IOCFACTS_PID_PRODUCT_MASK (0x0f00) +#define MPI3_IOCFACTS_PID_PRODUCT_SHIFT (8) +#define MPI3_IOCFACTS_PID_FAMILY_MASK (0x00ff) +#define MPI3_IOCFACTS_PID_FAMILY_SHIFT (0) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_REKEY (0x2000) +#define MPI3_IOCFACTS_EXCEPT_SAS_DISABLED (0x1000) +#define MPI3_IOCFACTS_EXCEPT_SAFE_MODE (0x0800) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_MASK (0x0700) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_NONE (0x0000) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_LOCAL_VIA_MGMT (0x0100) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_EXT_VIA_MGMT (0x0200) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_DRIVE_EXT_VIA_MGMT (0x0300) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_LOCAL_VIA_OOB (0x0400) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_EXT_VIA_OOB (0x0500) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_KEY_DRIVE_EXT_VIA_OOB (0x0600) +#define MPI3_IOCFACTS_EXCEPT_PCIE_DISABLED (0x0080) +#define MPI3_IOCFACTS_EXCEPT_PARTIAL_MEMORY_FAILURE (0x0040) +#define MPI3_IOCFACTS_EXCEPT_MANUFACT_CHECKSUM_FAIL (0x0020) +#define MPI3_IOCFACTS_EXCEPT_FW_CHECKSUM_FAIL (0x0010) +#define MPI3_IOCFACTS_EXCEPT_CONFIG_CHECKSUM_FAIL (0x0008) +#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_MASK (0x0001) +#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_PRIMARY (0x0000) +#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_SECONDARY (0x0001) +#define MPI3_IOCFACTS_PROTOCOL_SAS (0x0010) +#define MPI3_IOCFACTS_PROTOCOL_SATA (0x0008) +#define MPI3_IOCFACTS_PROTOCOL_NVME (0x0004) +#define MPI3_IOCFACTS_PROTOCOL_SCSI_INITIATOR (0x0002) +#define MPI3_IOCFACTS_PROTOCOL_SCSI_TARGET (0x0001) +#define MPI3_IOCFACTS_MAX_DATA_LENGTH_NOT_REPORTED (0x0000) +#define MPI3_IOCFACTS_FLAGS_SIGNED_NVDATA_REQUIRED (0x00010000) +#define MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_MASK (0x0000ff00) +#define MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_SHIFT (8) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_MASK (0x00000030) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_NOT_STARTED (0x00000000) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_IN_PROGRESS (0x00000010) +#define MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_COMPLETE (0x00000020) +#define MPI3_IOCFACTS_FLAGS_PERSONALITY_MASK (0x0000000f) +#define MPI3_IOCFACTS_FLAGS_PERSONALITY_EHBA (0x00000000) +#define MPI3_IOCFACTS_FLAGS_PERSONALITY_RAID_DDR (0x00000002) +#define MPI3_IOCFACTS_IO_THROTTLE_DATA_LENGTH_NOT_REQUIRED (0x0000) +#define MPI3_IOCFACTS_MAX_IO_THROTTLE_GROUP_NOT_REQUIRED (0x0000) +struct mpi3_mgmt_passthrough_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le32 reserved0c[5]; + union mpi3_sge_union command_sgl; + union mpi3_sge_union response_sgl; +}; +struct mpi3_create_request_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 flags; + u8 burst; + __le16 size; + __le16 queue_id; + __le16 reply_queue_id; + __le16 reserved12; + __le32 reserved14; + __le64 base_address; +}; +#define MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_MASK (0x80) +#define 
MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_SEGMENTED (0x80) +#define MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_CONTIGUOUS (0x00) +#define MPI3_CREATE_REQUEST_QUEUE_SIZE_MINIMUM (2) +struct mpi3_delete_request_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 queue_id; +}; +struct mpi3_create_reply_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 flags; + u8 reserved0b; + __le16 size; + __le16 queue_id; + __le16 msix_index; + __le16 reserved12; + __le32 reserved14; + __le64 base_address; +}; +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_SEGMENTED_MASK (0x80) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_SEGMENTED_SEGMENTED (0x80) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_SEGMENTED_CONTIGUOUS (0x00) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_COALESCE_DISABLE (0x02) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_MASK (0x01) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_DISABLE (0x00) +#define MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_ENABLE (0x01) +#define MPI3_CREATE_REPLY_QUEUE_SIZE_MINIMUM (2) +struct mpi3_delete_reply_queue_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 queue_id; +}; +struct mpi3_port_enable_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; +}; +#define MPI3_EVENT_LOG_DATA (0x01) +#define MPI3_EVENT_CHANGE (0x02) +#define MPI3_EVENT_GPIO_INTERRUPT (0x04) +#define MPI3_EVENT_CABLE_MGMT (0x06) +#define MPI3_EVENT_DEVICE_ADDED (0x07) +#define MPI3_EVENT_DEVICE_INFO_CHANGED (0x08) +#define MPI3_EVENT_PREPARE_FOR_RESET (0x09) +#define MPI3_EVENT_COMP_IMAGE_ACT_START (0x0a) +#define MPI3_EVENT_ENCL_DEVICE_ADDED (0x0b) +#define MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE (0x0c) +#define MPI3_EVENT_DEVICE_STATUS_CHANGE (0x0d) +#define MPI3_EVENT_ENERGY_PACK_CHANGE (0x0e) +#define MPI3_EVENT_SAS_DISCOVERY (0x11) +#define MPI3_EVENT_SAS_BROADCAST_PRIMITIVE (0x12) +#define MPI3_EVENT_SAS_NOTIFY_PRIMITIVE (0x13) +#define MPI3_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE (0x14) +#define MPI3_EVENT_SAS_INIT_TABLE_OVERFLOW (0x15) +#define MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST (0x16) +#define MPI3_EVENT_SAS_PHY_COUNTER (0x18) +#define MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR (0x19) +#define MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST (0x20) +#define MPI3_EVENT_PCIE_ENUMERATION (0x22) +#define MPI3_EVENT_PCIE_ERROR_THRESHOLD (0x23) +#define MPI3_EVENT_HARD_RESET_RECEIVED (0x40) +#define MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE (0x50) +#define MPI3_EVENT_MIN_PRODUCT_SPECIFIC (0x60) +#define MPI3_EVENT_MAX_PRODUCT_SPECIFIC (0x7f) +#define MPI3_EVENT_NOTIFY_EVENTMASK_WORDS (4) +struct mpi3_event_notification_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le16 sas_broadcast_primitive_masks; + __le16 sas_notify_primitive_masks; + __le32 event_masks[MPI3_EVENT_NOTIFY_EVENTMASK_WORDS]; +}; +struct mpi3_event_notification_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 event_data_length; + u8 event; + __le16 
ioc_change_count; + __le32 event_context; + __le32 event_data[1]; +}; +#define MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK (0x01) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_REQUIRED (0x01) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_NOT_REQUIRED (0x00) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_EVENT_ORIGINALITY_MASK (0x02) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_EVENT_ORIGINALITY_ORIGINAL (0x00) +#define MPI3_EVENT_NOTIFY_MSGFLAGS_EVENT_ORIGINALITY_REPLAY (0x02) +struct mpi3_event_data_gpio_interrupt { + u8 gpio_num; + u8 reserved01[3]; +}; +struct mpi3_event_data_cable_management { + __le32 active_cable_power_requirement; + u8 status; + u8 receptacle_id; + __le16 reserved06; +}; +#define MPI3_EVENT_CABLE_MGMT_ACT_CABLE_PWR_INVALID (0xffffffff) +#define MPI3_EVENT_CABLE_MGMT_STATUS_INSUFFICIENT_POWER (0x00) +#define MPI3_EVENT_CABLE_MGMT_STATUS_PRESENT (0x01) +#define MPI3_EVENT_CABLE_MGMT_STATUS_DEGRADED (0x02) +struct mpi3_event_ack_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 event; + u8 reserved0d[3]; + __le32 event_context; +}; +struct mpi3_event_data_prepare_for_reset { + u8 reason_code; + u8 reserved01; + __le16 reserved02; +}; +#define MPI3_EVENT_PREPARE_RESET_RC_START (0x01) +#define MPI3_EVENT_PREPARE_RESET_RC_ABORT (0x02) +struct mpi3_event_data_comp_image_activation { + __le32 reserved00; +}; +struct mpi3_event_data_device_status_change { + __le16 task_tag; + u8 reason_code; + u8 io_unit_port; + __le16 parent_dev_handle; + __le16 dev_handle; + __le64 wwid; + u8 lun[8]; +}; +#define MPI3_EVENT_DEV_STAT_RC_MOVED (0x01) +#define MPI3_EVENT_DEV_STAT_RC_HIDDEN (0x02) +#define MPI3_EVENT_DEV_STAT_RC_NOT_HIDDEN (0x03) +#define MPI3_EVENT_DEV_STAT_RC_ASYNC_NOTIFICATION (0x04) +#define MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_STRT (0x20) +#define MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_CMP (0x21) +#define MPI3_EVENT_DEV_STAT_RC_INT_TASK_ABORT_STRT (0x22) +#define MPI3_EVENT_DEV_STAT_RC_INT_TASK_ABORT_CMP (0x23) +#define MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_STRT (0x24) +#define MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_CMP (0x25) +#define MPI3_EVENT_DEV_STAT_RC_PCIE_HOT_RESET_FAILED (0x30) +#define MPI3_EVENT_DEV_STAT_RC_EXPANDER_REDUCED_FUNC_STRT (0x40) +#define MPI3_EVENT_DEV_STAT_RC_EXPANDER_REDUCED_FUNC_CMP (0x41) +#define MPI3_EVENT_DEV_STAT_RC_VD_NOT_RESPONDING (0x50) +struct mpi3_event_data_energy_pack_change { + __le32 reserved00; + __le16 shutdown_timeout; + __le16 reserved06; +}; +struct mpi3_event_data_sas_discovery { + u8 flags; + u8 reason_code; + u8 io_unit_port; + u8 reserved03; + __le32 discovery_status; +}; +#define MPI3_EVENT_SAS_DISC_FLAGS_DEVICE_CHANGE (0x02) +#define MPI3_EVENT_SAS_DISC_FLAGS_IN_PROGRESS (0x01) +#define MPI3_EVENT_SAS_DISC_RC_STARTED (0x01) +#define MPI3_EVENT_SAS_DISC_RC_COMPLETED (0x02) +#define MPI3_SAS_DISC_STATUS_MAX_ENCLOSURES_EXCEED (0x80000000) +#define MPI3_SAS_DISC_STATUS_MAX_EXPANDERS_EXCEED (0x40000000) +#define MPI3_SAS_DISC_STATUS_MAX_DEVICES_EXCEED (0x20000000) +#define MPI3_SAS_DISC_STATUS_MAX_TOPO_PHYS_EXCEED (0x10000000) +#define MPI3_SAS_DISC_STATUS_INVALID_CEI (0x00010000) +#define MPI3_SAS_DISC_STATUS_FECEI_MISMATCH (0x00008000) +#define MPI3_SAS_DISC_STATUS_MULTIPLE_DEVICES_IN_SLOT (0x00004000) +#define MPI3_SAS_DISC_STATUS_NECEI_MISMATCH (0x00002000) +#define MPI3_SAS_DISC_STATUS_TOO_MANY_SLOTS (0x00001000) +#define MPI3_SAS_DISC_STATUS_EXP_MULTI_SUBTRACTIVE (0x00000800) +#define MPI3_SAS_DISC_STATUS_MULTI_PORT_DOMAIN 
(0x00000400) +#define MPI3_SAS_DISC_STATUS_TABLE_TO_SUBTRACTIVE_LINK (0x00000200) +#define MPI3_SAS_DISC_STATUS_UNSUPPORTED_DEVICE (0x00000100) +#define MPI3_SAS_DISC_STATUS_TABLE_LINK (0x00000080) +#define MPI3_SAS_DISC_STATUS_SUBTRACTIVE_LINK (0x00000040) +#define MPI3_SAS_DISC_STATUS_SMP_CRC_ERROR (0x00000020) +#define MPI3_SAS_DISC_STATUS_SMP_FUNCTION_FAILED (0x00000010) +#define MPI3_SAS_DISC_STATUS_SMP_TIMEOUT (0x00000008) +#define MPI3_SAS_DISC_STATUS_MULTIPLE_PORTS (0x00000004) +#define MPI3_SAS_DISC_STATUS_INVALID_SAS_ADDRESS (0x00000002) +#define MPI3_SAS_DISC_STATUS_LOOP_DETECTED (0x00000001) +struct mpi3_event_data_sas_broadcast_primitive { + u8 phy_num; + u8 io_unit_port; + u8 port_width; + u8 primitive; +}; +#define MPI3_EVENT_BROADCAST_PRIMITIVE_CHANGE (0x01) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_SES (0x02) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_EXPANDER (0x03) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_ASYNCHRONOUS_EVENT (0x04) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_RESERVED3 (0x05) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_RESERVED4 (0x06) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_CHANGE0_RESERVED (0x07) +#define MPI3_EVENT_BROADCAST_PRIMITIVE_CHANGE1_RESERVED (0x08) +struct mpi3_event_data_sas_notify_primitive { + u8 phy_num; + u8 io_unit_port; + u8 reserved02; + u8 primitive; +}; +#define MPI3_EVENT_NOTIFY_PRIMITIVE_ENABLE_SPINUP (0x01) +#define MPI3_EVENT_NOTIFY_PRIMITIVE_POWER_LOSS_EXPECTED (0x02) +#define MPI3_EVENT_NOTIFY_PRIMITIVE_RESERVED1 (0x03) +#define MPI3_EVENT_NOTIFY_PRIMITIVE_RESERVED2 (0x04) +#ifndef MPI3_EVENT_SAS_TOPO_PHY_COUNT +#define MPI3_EVENT_SAS_TOPO_PHY_COUNT (1) +#endif +struct mpi3_event_sas_topo_phy_entry { + __le16 attached_dev_handle; + u8 link_rate; + u8 status; +}; +#define MPI3_EVENT_SAS_TOPO_LR_CURRENT_MASK (0xf0) +#define MPI3_EVENT_SAS_TOPO_LR_CURRENT_SHIFT (4) +#define MPI3_EVENT_SAS_TOPO_LR_PREV_MASK (0x0f) +#define MPI3_EVENT_SAS_TOPO_LR_PREV_SHIFT (0) +#define MPI3_EVENT_SAS_TOPO_LR_UNKNOWN_LINK_RATE (0x00) +#define MPI3_EVENT_SAS_TOPO_LR_PHY_DISABLED (0x01) +#define MPI3_EVENT_SAS_TOPO_LR_NEGOTIATION_FAILED (0x02) +#define MPI3_EVENT_SAS_TOPO_LR_SATA_OOB_COMPLETE (0x03) +#define MPI3_EVENT_SAS_TOPO_LR_PORT_SELECTOR (0x04) +#define MPI3_EVENT_SAS_TOPO_LR_SMP_RESET_IN_PROGRESS (0x05) +#define MPI3_EVENT_SAS_TOPO_LR_UNSUPPORTED_PHY (0x06) +#define MPI3_EVENT_SAS_TOPO_LR_RATE_6_0 (0x0a) +#define MPI3_EVENT_SAS_TOPO_LR_RATE_12_0 (0x0b) +#define MPI3_EVENT_SAS_TOPO_LR_RATE_22_5 (0x0c) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_MASK (0xc0) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_SHIFT (6) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_ACCESSIBLE (0x00) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_NO_EXIST (0x40) +#define MPI3_EVENT_SAS_TOPO_PHY_STATUS_VACANT (0x80) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_MASK (0x0f) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING (0x02) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED (0x03) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_NO_CHANGE (0x04) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_DELAY_NOT_RESPONDING (0x05) +#define MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING (0x06) +struct mpi3_event_data_sas_topology_change_list { + __le16 enclosure_handle; + __le16 expander_dev_handle; + u8 num_phys; + u8 reserved05[3]; + u8 num_entries; + u8 start_phy_num; + u8 exp_status; + u8 io_unit_port; + struct mpi3_event_sas_topo_phy_entry phy_entry[MPI3_EVENT_SAS_TOPO_PHY_COUNT]; +}; +#define MPI3_EVENT_SAS_TOPO_ES_NO_EXPANDER (0x00) +#define MPI3_EVENT_SAS_TOPO_ES_NOT_RESPONDING (0x02) +#define MPI3_EVENT_SAS_TOPO_ES_RESPONDING (0x03) +#define 
MPI3_EVENT_SAS_TOPO_ES_DELAY_NOT_RESPONDING (0x04) +struct mpi3_event_data_sas_phy_counter { + __le64 time_stamp; + __le32 reserved08; + u8 phy_event_code; + u8 phy_num; + __le16 reserved0e; + __le32 phy_event_info; + u8 counter_type; + u8 threshold_window; + u8 time_units; + u8 reserved17; + __le32 event_threshold; + __le16 threshold_flags; + __le16 reserved1e; +}; +struct mpi3_event_data_sas_device_disc_err { + __le16 dev_handle; + u8 reason_code; + u8 io_unit_port; + __le32 reserved04; + __le64 sas_address; +}; +#define MPI3_EVENT_SAS_DISC_ERR_RC_SMP_FAILED (0x01) +#define MPI3_EVENT_SAS_DISC_ERR_RC_SMP_TIMEOUT (0x02) +struct mpi3_event_data_pcie_enumeration { + u8 flags; + u8 reason_code; + u8 io_unit_port; + u8 reserved03; + __le32 enumeration_status; +}; +#define MPI3_EVENT_PCIE_ENUM_FLAGS_DEVICE_CHANGE (0x02) +#define MPI3_EVENT_PCIE_ENUM_FLAGS_IN_PROGRESS (0x01) +#define MPI3_EVENT_PCIE_ENUM_RC_STARTED (0x01) +#define MPI3_EVENT_PCIE_ENUM_RC_COMPLETED (0x02) +#define MPI3_EVENT_PCIE_ENUM_ES_MAX_SWITCH_DEPTH_EXCEED (0x80000000) +#define MPI3_EVENT_PCIE_ENUM_ES_MAX_SWITCHES_EXCEED (0x40000000) +#define MPI3_EVENT_PCIE_ENUM_ES_MAX_DEVICES_EXCEED (0x20000000) +#define MPI3_EVENT_PCIE_ENUM_ES_RESOURCES_EXHAUSTED (0x10000000) +#ifndef MPI3_EVENT_PCIE_TOPO_PORT_COUNT +#define MPI3_EVENT_PCIE_TOPO_PORT_COUNT (1) +#endif +struct mpi3_event_pcie_topo_port_entry { + __le16 attached_dev_handle; + u8 port_status; + u8 reserved03; + u8 current_port_info; + u8 reserved05; + u8 previous_port_info; + u8 reserved07; +}; +#define MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING (0x02) +#define MPI3_EVENT_PCIE_TOPO_PS_PORT_CHANGED (0x03) +#define MPI3_EVENT_PCIE_TOPO_PS_NO_CHANGE (0x04) +#define MPI3_EVENT_PCIE_TOPO_PS_DELAY_NOT_RESPONDING (0x05) +#define MPI3_EVENT_PCIE_TOPO_PS_RESPONDING (0x06) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_MASK (0xf0) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_UNKNOWN (0x00) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_1 (0x10) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_2 (0x20) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_4 (0x30) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_8 (0x40) +#define MPI3_EVENT_PCIE_TOPO_PI_LANES_16 (0x50) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_MASK (0x0f) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_UNKNOWN (0x00) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_DISABLED (0x01) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_2_5 (0x02) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_5_0 (0x03) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_8_0 (0x04) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_16_0 (0x05) +#define MPI3_EVENT_PCIE_TOPO_PI_RATE_32_0 (0x06) +struct mpi3_event_data_pcie_topology_change_list { + __le16 enclosure_handle; + __le16 switch_dev_handle; + u8 num_ports; + u8 reserved05[3]; + u8 num_entries; + u8 start_port_num; + u8 switch_status; + u8 io_unit_port; + __le32 reserved0c; + struct mpi3_event_pcie_topo_port_entry port_entry[MPI3_EVENT_PCIE_TOPO_PORT_COUNT]; +}; +#define MPI3_EVENT_PCIE_TOPO_SS_NO_PCIE_SWITCH (0x00) +#define MPI3_EVENT_PCIE_TOPO_SS_NOT_RESPONDING (0x02) +#define MPI3_EVENT_PCIE_TOPO_SS_RESPONDING (0x03) +#define MPI3_EVENT_PCIE_TOPO_SS_DELAY_NOT_RESPONDING (0x04) +struct mpi3_event_data_pcie_error_threshold { + __le64 timestamp; + u8 reason_code; + u8 port; + __le16 switch_dev_handle; + u8 error; + u8 action; + __le16 threshold_count; + __le16 attached_dev_handle; + __le16 reserved12; +}; +#define MPI3_EVENT_PCI_ERROR_RC_THRESHOLD_EXCEEDED (0x00) +#define MPI3_EVENT_PCI_ERROR_RC_ESCALATION (0x01) +struct mpi3_event_data_sas_init_dev_status_change { + u8 reason_code; + u8 io_unit_port; + __le16 
dev_handle; + __le32 reserved04; + __le64 sas_address; +}; +#define MPI3_EVENT_SAS_INIT_RC_ADDED (0x01) +#define MPI3_EVENT_SAS_INIT_RC_NOT_RESPONDING (0x02) +struct mpi3_event_data_sas_init_table_overflow { + __le16 max_init; + __le16 current_init; + __le32 reserved04; + __le64 sas_address; +}; +struct mpi3_event_data_hard_reset_received { + u8 reserved00; + u8 io_unit_port; + __le16 reserved02; +}; +struct mpi3_event_data_diag_buffer_status_change { + u8 type; + u8 reason_code; + __le16 reserved02; + __le32 reserved04; +}; +#define MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RELEASED (0x01) +#define MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_PAUSED (0x02) +#define MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RESUMED (0x03) +#define MPI3_PEL_LOCALE_FLAGS_NON_BLOCKING_BOOT_EVENT (0x0200) +#define MPI3_PEL_LOCALE_FLAGS_BLOCKING_BOOT_EVENT (0x0100) +#define MPI3_PEL_LOCALE_FLAGS_PCIE (0x0080) +#define MPI3_PEL_LOCALE_FLAGS_CONFIGURATION (0x0040) +#define MPI3_PEL_LOCALE_FLAGS_CONTROLER (0x0020) +#define MPI3_PEL_LOCALE_FLAGS_SAS (0x0010) +#define MPI3_PEL_LOCALE_FLAGS_EPACK (0x0008) +#define MPI3_PEL_LOCALE_FLAGS_ENCLOSURE (0x0004) +#define MPI3_PEL_LOCALE_FLAGS_PD (0x0002) +#define MPI3_PEL_LOCALE_FLAGS_VD (0x0001) +#define MPI3_PEL_CLASS_DEBUG (0x00) +#define MPI3_PEL_CLASS_PROGRESS (0x01) +#define MPI3_PEL_CLASS_INFORMATIONAL (0x02) +#define MPI3_PEL_CLASS_WARNING (0x03) +#define MPI3_PEL_CLASS_CRITICAL (0x04) +#define MPI3_PEL_CLASS_FATAL (0x05) +#define MPI3_PEL_CLASS_FAULT (0x06) +#define MPI3_PEL_CLEARTYPE_CLEAR (0x00) +#define MPI3_PEL_WAITTIME_INFINITE_WAIT (0x00) +#define MPI3_PEL_ACTION_GET_SEQNUM (0x01) +#define MPI3_PEL_ACTION_MARK_CLEAR (0x02) +#define MPI3_PEL_ACTION_GET_LOG (0x03) +#define MPI3_PEL_ACTION_GET_COUNT (0x04) +#define MPI3_PEL_ACTION_WAIT (0x05) +#define MPI3_PEL_ACTION_ABORT (0x06) +#define MPI3_PEL_ACTION_GET_PRINT_STRINGS (0x07) +#define MPI3_PEL_ACTION_ACKNOWLEDGE (0x08) +#define MPI3_PEL_STATUS_SUCCESS (0x00) +#define MPI3_PEL_STATUS_NOT_FOUND (0x01) +#define MPI3_PEL_STATUS_ABORTED (0x02) +#define MPI3_PEL_STATUS_NOT_READY (0x03) +struct mpi3_pel_seq { + __le32 newest; + __le32 oldest; + __le32 clear; + __le32 shutdown; + __le32 boot; + __le32 last_acknowledged; +}; +struct mpi3_pel_entry { + __le64 time_stamp; + __le32 sequence_number; + __le16 log_code; + __le16 arg_type; + __le16 locale; + u8 class; + u8 flags; + u8 ext_num; + u8 num_exts; + u8 arg_data_size; + u8 fixed_format_strings_size; + __le32 reserved18[2]; + __le32 pel_info[24]; +}; +#define MPI3_PEL_FLAGS_COMPLETE_RESET_NEEDED (0x02) +#define MPI3_PEL_FLAGS_ACK_NEEDED (0x01) +struct mpi3_pel_list { + __le32 log_count; + __le32 reserved04; + struct mpi3_pel_entry entry[1]; +}; +struct mpi3_pel_arg_map { + u8 arg_type; + u8 length; + __le16 start_location; +}; +#define MPI3_PEL_ARG_MAP_ARG_TYPE_APPEND_STRING (0x00) +#define MPI3_PEL_ARG_MAP_ARG_TYPE_INTEGER (0x01) +#define MPI3_PEL_ARG_MAP_ARG_TYPE_STRING (0x02) +#define MPI3_PEL_ARG_MAP_ARG_TYPE_BIT_FIELD (0x03) +struct mpi3_pel_print_string { + __le16 log_code; + __le16 string_length; + u8 num_arg_map; + u8 reserved05[3]; + struct mpi3_pel_arg_map arg_map[1]; +}; +struct mpi3_pel_print_string_list { + __le32 num_print_strings; + __le32 residual_bytes_remain; + __le32 reserved08[2]; + struct mpi3_pel_print_string print_string[1]; +}; +#ifndef MPI3_PEL_ACTION_SPECIFIC_MAX +#define MPI3_PEL_ACTION_SPECIFIC_MAX (1) +#endif +struct mpi3_pel_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 
msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 action_specific[MPI3_PEL_ACTION_SPECIFIC_MAX]; +}; +struct mpi3_pel_req_action_get_sequence_numbers { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 reserved0c[5]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_clear_log_marker { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + u8 clear_type; + u8 reserved0d[3]; +}; +struct mpi3_pel_req_action_get_log { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 starting_sequence_number; + __le16 locale; + u8 class; + u8 reserved13; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_get_count { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 starting_sequence_number; + __le16 locale; + u8 class; + u8 reserved13; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_wait { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 starting_sequence_number; + __le16 locale; + u8 class; + u8 reserved13; + __le16 wait_time; + __le16 reserved16; + __le32 reserved18[2]; +}; +struct mpi3_pel_req_action_abort { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 reserved0c; + __le16 abort_host_tag; + __le16 reserved12; + __le32 reserved14; +}; +struct mpi3_pel_req_action_get_print_strings { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 reserved0c; + __le16 start_log_code; + __le16 reserved12; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_pel_req_action_acknowledge { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 sequence_number; + __le32 reserved10; +}; +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_MASK (0x03) +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_NO_GUIDANCE (0x00) +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_CONTINUE_OP (0x01) +#define MPI3_PELACKNOWLEDGE_MSGFLAGS_SAFE_MODE_EXIT_TRANSITION_TO_FAULT (0x02) +struct mpi3_pel_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 action; + u8 reserved11; + __le16 reserved12; + __le16 pe_log_status; + __le16 reserved16; + __le32 transfer_length; +}; +struct mpi3_ci_download_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 action; + u8 reserved0b; + __le32 signature1; + __le32 total_image_size; + __le32 image_offset; + __le32 segment_size; + __le32 reserved1c; 
+ union mpi3_sge_union sgl; +}; +#define MPI3_CI_DOWNLOAD_MSGFLAGS_LAST_SEGMENT (0x80) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_FORCE_FMC_ENABLE (0x40) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_SIGNED_NVDATA (0x20) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_MASK (0x03) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_FAST (0x00) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_MEDIUM (0x01) +#define MPI3_CI_DOWNLOAD_MSGFLAGS_WRITE_CACHE_FLUSH_SLOW (0x02) +#define MPI3_CI_DOWNLOAD_ACTION_DOWNLOAD (0x01) +#define MPI3_CI_DOWNLOAD_ACTION_ONLINE_ACTIVATION (0x02) +#define MPI3_CI_DOWNLOAD_ACTION_OFFLINE_ACTIVATION (0x03) +#define MPI3_CI_DOWNLOAD_ACTION_GET_STATUS (0x04) +#define MPI3_CI_DOWNLOAD_ACTION_CANCEL_OFFLINE_ACTIVATION (0x05) +struct mpi3_ci_download_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 flags; + u8 cache_dirty; + u8 pending_count; + u8 reserved13; +}; +#define MPI3_CI_DOWNLOAD_FLAGS_DOWNLOAD_IN_PROGRESS (0x80) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_FAILURE (0x40) +#define MPI3_CI_DOWNLOAD_FLAGS_OFFLINE_ACTIVATION_REQUIRED (0x20) +#define MPI3_CI_DOWNLOAD_FLAGS_KEY_UPDATE_PENDING (0x10) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_MASK (0x0e) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_NOT_NEEDED (0x00) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_AWAITING (0x02) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_ONLINE_PENDING (0x04) +#define MPI3_CI_DOWNLOAD_FLAGS_ACTIVATION_STATUS_OFFLINE_PENDING (0x06) +#define MPI3_CI_DOWNLOAD_FLAGS_COMPATIBLE (0x01) +struct mpi3_ci_upload_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + __le32 signature1; + __le32 reserved10; + __le32 image_offset; + __le32 segment_size; + __le32 reserved1c; + union mpi3_sge_union sgl; +}; +#define MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_MASK (0x01) +#define MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_PRIMARY (0x00) +#define MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_SECONDARY (0x01) +#define MPI3_CI_UPLOAD_MSGFLAGS_FORMAT_MASK (0x02) +#define MPI3_CI_UPLOAD_MSGFLAGS_FORMAT_FLASH (0x00) +#define MPI3_CI_UPLOAD_MSGFLAGS_FORMAT_EXECUTABLE (0x02) +#define MPI3_CTRL_OP_FORCE_FULL_DISCOVERY (0x01) +#define MPI3_CTRL_OP_LOOKUP_MAPPING (0x02) +#define MPI3_CTRL_OP_UPDATE_TIMESTAMP (0x04) +#define MPI3_CTRL_OP_GET_TIMESTAMP (0x05) +#define MPI3_CTRL_OP_GET_IOC_CHANGE_COUNT (0x06) +#define MPI3_CTRL_OP_CHANGE_PROFILE (0x07) +#define MPI3_CTRL_OP_REMOVE_DEVICE (0x10) +#define MPI3_CTRL_OP_CLOSE_PERSISTENT_CONNECTION (0x11) +#define MPI3_CTRL_OP_HIDDEN_ACK (0x12) +#define MPI3_CTRL_OP_CLEAR_DEVICE_COUNTERS (0x13) +#define MPI3_CTRL_OP_SEND_SAS_PRIMITIVE (0x20) +#define MPI3_CTRL_OP_SAS_PHY_CONTROL (0x21) +#define MPI3_CTRL_OP_READ_INTERNAL_BUS (0x23) +#define MPI3_CTRL_OP_WRITE_INTERNAL_BUS (0x24) +#define MPI3_CTRL_OP_PCIE_LINK_CONTROL (0x30) +#define MPI3_CTRL_OP_LOOKUP_MAPPING_PARAM8_LOOKUP_METHOD_INDEX (0x00) +#define MPI3_CTRL_OP_UPDATE_TIMESTAMP_PARAM64_TIMESTAMP_INDEX (0x00) +#define MPI3_CTRL_OP_CHANGE_PROFILE_PARAM8_PROFILE_ID_INDEX (0x00) +#define MPI3_CTRL_OP_REMOVE_DEVICE_PARAM16_DEVHANDLE_INDEX (0x00) +#define MPI3_CTRL_OP_CLOSE_PERSIST_CONN_PARAM16_DEVHANDLE_INDEX (0x00) +#define MPI3_CTRL_OP_HIDDEN_ACK_PARAM16_DEVHANDLE_INDEX (0x00) +#define MPI3_CTRL_OP_CLEAR_DEVICE_COUNTERS_PARAM16_DEVHANDLE_INDEX (0x00) +#define 
MPI3_CTRL_OP_SEND_SAS_PRIM_PARAM8_PHY_INDEX (0x00) +#define MPI3_CTRL_OP_SEND_SAS_PRIM_PARAM8_PRIMSEQ_INDEX (0x01) +#define MPI3_CTRL_OP_SEND_SAS_PRIM_PARAM32_PRIMITIVE_INDEX (0x00) +#define MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_ACTION_INDEX (0x00) +#define MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_PHY_INDEX (0x01) +#define MPI3_CTRL_OP_READ_INTERNAL_BUS_PARAM64_ADDRESS_INDEX (0x00) +#define MPI3_CTRL_OP_WRITE_INTERNAL_BUS_PARAM64_ADDRESS_INDEX (0x00) +#define MPI3_CTRL_OP_WRITE_INTERNAL_BUS_PARAM32_VALUE_INDEX (0x00) +#define MPI3_CTRL_OP_PCIE_LINK_CONTROL_PARAM8_ACTION_INDEX (0x00) +#define MPI3_CTRL_OP_PCIE_LINK_CONTROL_PARAM8_LINK_INDEX (0x01) +#define MPI3_CTRL_LOOKUP_METHOD_WWID_ADDRESS (0x01) +#define MPI3_CTRL_LOOKUP_METHOD_ENCLOSURE_SLOT (0x02) +#define MPI3_CTRL_LOOKUP_METHOD_SAS_DEVICE_NAME (0x03) +#define MPI3_CTRL_LOOKUP_METHOD_PERSISTENT_ID (0x04) +#define MPI3_CTRL_LOOKUP_METHOD_WWIDADDR_PARAM16_DEVH_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_WWIDADDR_PARAM64_WWID_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_ENCLSLOT_PARAM16_SLOTNUM_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_ENCLSLOT_PARAM64_ENCLOSURELID_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_SASDEVNAME_PARAM16_DEVH_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_SASDEVNAME_PARAM64_DEVNAME_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_PERSISTID_PARAM16_DEVH_INDEX (0) +#define MPI3_CTRL_LOOKUP_METHOD_PERSISTID_PARAM16_PERSISTENT_ID_INDEX (1) +#define MPI3_CTRL_LOOKUP_METHOD_VALUE16_DEVH_INDEX (0) +#define MPI3_CTRL_GET_TIMESTAMP_VALUE64_TIMESTAMP_INDEX (0) +#define MPI3_CTRL_GET_IOC_CHANGE_COUNT_VALUE16_CHANGECOUNT_INDEX (0) +#define MPI3_CTRL_READ_INTERNAL_BUS_VALUE32_VALUE_INDEX (0) +#define MPI3_CTRL_PRIMFLAGS_SINGLE (0x01) +#define MPI3_CTRL_PRIMFLAGS_TRIPLE (0x03) +#define MPI3_CTRL_PRIMFLAGS_REDUNDANT (0x06) +#define MPI3_CTRL_ACTION_NOP (0x00) +#define MPI3_CTRL_ACTION_LINK_RESET (0x01) +#define MPI3_CTRL_ACTION_HARD_RESET (0x02) +#define MPI3_CTRL_ACTION_CLEAR_ERROR_LOG (0x05) +struct mpi3_iounit_control_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 reserved0a; + u8 operation; + __le32 reserved0c; + __le64 param64[2]; + __le32 param32[4]; + __le16 param16[4]; + u8 param8[8]; +}; +struct mpi3_iounit_control_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le64 value64[2]; + __le32 value32[4]; + __le16 value16[4]; + u8 value8[8]; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_pci.h b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h new file mode 100644 index 0000000000000..3daa16efcc3a3 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_PCI_H +#define MPI30_PCI_H 1 +#ifndef MPI3_NVME_ENCAP_CMD_MAX +#define MPI3_NVME_ENCAP_CMD_MAX (1) +#endif +struct mpi3_nvme_encapsulated_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le16 encapsulated_command_length; + __le16 flags; + __le32 data_length; + __le32 reserved14[3]; + __le32 command[MPI3_NVME_ENCAP_CMD_MAX]; +}; +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_MASK (0x0002) +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_FAIL_ONLY (0x0000) +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_ALL (0x0002) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_MASK (0x0001) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0001) +struct mpi3_nvme_encapsulated_error_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 nvme_completion_entry[4]; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_raid.h b/drivers/scsi/mpi3mr/mpi/mpi30_raid.h new file mode 100644 index 0000000000000..7ce3c00d4fbd0 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_raid.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_RAID_H +#define MPI30_RAID_H 1 +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_sas.h b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h new file mode 100644 index 0000000000000..78d8e0ad26757 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_SAS_H +#define MPI30_SAS_H 1 +#define MPI3_SAS_DEVICE_INFO_SSP_TARGET (0x00000100) +#define MPI3_SAS_DEVICE_INFO_STP_SATA_TARGET (0x00000080) +#define MPI3_SAS_DEVICE_INFO_SMP_TARGET (0x00000040) +#define MPI3_SAS_DEVICE_INFO_SSP_INITIATOR (0x00000020) +#define MPI3_SAS_DEVICE_INFO_STP_INITIATOR (0x00000010) +#define MPI3_SAS_DEVICE_INFO_SMP_INITIATOR (0x00000008) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK (0x00000007) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_NO_DEVICE (0x00000000) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE (0x00000001) +#define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER (0x00000002) +struct mpi3_smp_passthrough_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 reserved0a; + u8 io_unit_port; + __le32 reserved0c[3]; + __le64 sas_address; + struct mpi3_sge_common request_sge; + struct mpi3_sge_common response_sge; +}; +struct mpi3_smp_passthrough_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le16 response_data_length; + __le16 reserved12; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_targ.h b/drivers/scsi/mpi3mr/mpi/mpi30_targ.h new file mode 100644 index 0000000000000..9fa30ca941f10 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_targ.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_TARG_H +#define MPI30_TARG_H 1 +struct mpi3_target_ssp_cmd_buffer { + u8 frame_type; + u8 reserved01; + __le16 initiator_connection_tag; + __le32 hashed_source_sas_address; + __le16 reserved08; + __le16 flags; + __le32 reserved0c; + __le16 tag; + __le16 target_port_transfer_tag; + __le32 data_offset; + u8 logical_unit_number[8]; + u8 reserved20; + u8 task_attribute; + u8 reserved22; + u8 additional_cdb_length; + u8 cdb[16]; +}; +struct mpi3_target_ssp_task_buffer { + u8 frame_type; + u8 reserved01; + __le16 initiator_connection_tag; + __le32 hashed_source_sas_address; + __le16 reserved08; + __le16 flags; + __le32 reserved0c; + __le16 tag; + __le16 target_port_transfer_tag; + __le32 data_offset; + u8 logical_unit_number[8]; + __le16 reserved20; + u8 task_management_function; + u8 reserved23; + __le16 managed_task_tag; + __le16 reserved26; + __le32 reserved28[3]; +}; +#define MPI3_TARGET_FRAME_TYPE_COMMAND (0x06) +#define MPI3_TARGET_FRAME_TYPE_TASK (0x16) +#define MPI3_TARGET_HASHED_SAS_ADDRESS_MASK (0xffffff00) +#define MPI3_TARGET_HASHED_SAS_ADDRESS_SHIFT (8) +struct mpi3_target_cmd_buf_post_base_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 buffer_post_flags; + u8 reserved0b; + __le16 min_reply_queue_id; + __le16 max_reply_queue_id; + __le64 base_address; + __le16 cmd_buffer_length; + __le16 total_cmd_buffers; + __le32 reserved1c; +}; +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_MASK (0x0c) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_SYSTEM (0x00) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_IOCUDP (0x04) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_DLAS_IOCCTL (0x08) +#define MPI3_CMD_BUF_POST_BASE_FLAGS_AUTO_POST_ALL (0x01) +#define MPI3_CMD_BUF_POST_BASE_MIN_BUF_LENGTH (0x34) +#define MPI3_CMD_BUF_POST_BASE_MAX_BUF_LENGTH (0x3fc) +struct mpi3_target_cmd_buf_post_list_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 cmd_buffer_count; + u8 reserved0d[3]; + __le16 io_index[2]; +}; +struct mpi3_target_cmd_buf_post_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 cmd_buffer_count; + u8 reserved11[3]; + __le16 io_index[2]; +}; +struct mpi3_target_assist_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le32 flags; + __le16 reserved10; + __le16 queue_tag; + __le16 io_index; + __le16 initiator_connection_tag; + __le32 skip_count; + __le32 data_length; + __le32 port_transfer_length; + __le32 primary_reference_tag; + __le16 primary_application_tag; + __le16 primary_application_tag_mask; + __le32 relative_offset; + union mpi3_sge_union sgl[5]; +}; +#define MPI3_TARGET_ASSIST_MSGFLAGS_METASGL_VALID (0x80) +#define MPI3_TARGET_ASSIST_FLAGS_REPOST_CMD_BUFFER (0x00200000) +#define MPI3_TARGET_ASSIST_FLAGS_AUTO_STATUS (0x00100000) +#define MPI3_TARGET_ASSIST_FLAGS_DATADIRECTION_MASK (0x000c0000) +#define MPI3_TARGET_ASSIST_FLAGS_DATADIRECTION_WRITE (0x00040000) +#define MPI3_TARGET_ASSIST_FLAGS_DATADIRECTION_READ (0x00080000) +#define MPI3_TARGET_ASSIST_FLAGS_DMAOPERATION_MASK (0x00030000) +#define MPI3_TARGET_ASSIST_FLAGS_DMAOPERATION_HOST_PI (0x00010000) +#define 
MPI3_TARGET_ASSIST_METASGL_INDEX (4) +struct mpi3_target_status_send_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 dev_handle; + __le16 response_iu_length; + __le16 flags; + __le16 reserved10; + __le16 queue_tag; + __le16 io_index; + __le16 initiator_connection_tag; + __le32 ioc_use_only18[6]; + __le32 ioc_use_only30[4]; + union mpi3_sge_union sgl; +}; +#define MPI3_TSS_FLAGS_REPOST_CMD_BUFFER (0x0020) +#define MPI3_TSS_FLAGS_AUTO_SEND_GOOD_STATUS (0x0010) +struct mpi3_target_standard_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 transfer_count; +}; +struct mpi3_target_mode_abort_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 abort_type; + u8 reserved0b; + __le16 request_queue_id_to_abort; + __le16 host_tag_to_abort; + __le16 dev_handle; + __le16 reserved12; +}; +#define MPI3_TARGET_MODE_ABORT_ALL_CMD_BUFFERS (0x00) +#define MPI3_TARGET_MODE_ABORT_EXACT_IO_REQUEST (0x01) +#define MPI3_TARGET_MODE_ABORT_ALL_COMMANDS (0x02) +struct mpi3_target_mode_abort_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 abort_count; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_tool.h b/drivers/scsi/mpi3mr/mpi/mpi30_tool.h new file mode 100644 index 0000000000000..04c12874cea13 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_tool.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. 
+ */ +#ifndef MPI30_TOOL_H +#define MPI30_TOOL_H 1 +struct mpi3_tool_clean_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + __le32 area; +}; +#define MPI3_TOOLBOX_TOOL_CLEAN (0x01) +#define MPI3_TOOLBOX_TOOL_ISTWI_READ_WRITE (0x02) +#define MPI3_TOOLBOX_TOOL_DIAGNOSTIC_CLI (0x03) +#define MPI3_TOOLBOX_TOOL_LANE_MARGINING (0x04) +#define MPI3_TOOLBOX_TOOL_RECOVER_DEVICE (0x05) +#define MPI3_TOOLBOX_TOOL_LOOPBACK (0x06) +#define MPI3_TOOLBOX_CLEAN_AREA_BIOS_BOOT_SERVICES (0x00000008) +#define MPI3_TOOLBOX_CLEAN_AREA_ALL_BUT_MFG (0x00000002) +#define MPI3_TOOLBOX_CLEAN_AREA_NVSTORE (0x00000001) +struct mpi3_tool_istwi_read_write_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 flags; + u8 dev_index; + u8 action; + __le16 reserved0e; + __le16 tx_data_length; + __le16 rx_data_length; + __le32 reserved14[3]; + struct mpi3_man11_istwi_device_format istwi_device; + union mpi3_sge_union sgl; +}; +#define MPI3_TOOLBOX_ISTWI_FLAGS_AUTO_RESERVE_RELEASE (0x80) +#define MPI3_TOOLBOX_ISTWI_FLAGS_ADDRESS_MODE_MASK (0x04) +#define MPI3_TOOLBOX_ISTWI_FLAGS_ADDRESS_MODE_DEVINDEX (0x00) +#define MPI3_TOOLBOX_ISTWI_FLAGS_ADDRESS_MODE_DEVICE_FIELD (0x04) +#define MPI3_TOOLBOX_ISTWI_FLAGS_PAGE_ADDRESS_MASK (0x03) +#define MPI3_TOOLBOX_ISTWI_ACTION_RESERVE_BUS (0x00) +#define MPI3_TOOLBOX_ISTWI_ACTION_RELEASE_BUS (0x01) +#define MPI3_TOOLBOX_ISTWI_ACTION_RESET (0x02) +#define MPI3_TOOLBOX_ISTWI_ACTION_READ_DATA (0x03) +#define MPI3_TOOLBOX_ISTWI_ACTION_WRITE_DATA (0x04) +#define MPI3_TOOLBOX_ISTWI_ACTION_SEQUENCE (0x05) +struct mpi3_tool_istwi_read_write_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le16 istwi_status; + __le16 reserved12; + __le16 tx_data_count; + __le16 rx_data_count; +}; +struct mpi3_tool_diagnostic_cli_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + __le32 command_data_length; + __le32 response_data_length; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +struct mpi3_tool_diagnostic_cli_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 returned_data_length; +}; +struct mpi3_tool_lane_margin_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + u8 action; + u8 switch_port; + __le16 dev_handle; + u8 start_lane; + u8 num_lanes; + __le16 reserved12; + __le32 reserved14[3]; + union mpi3_sge_union sgl; +}; +#define MPI3_TOOLBOX_LM_ACTION_ENTER (0x00) +#define MPI3_TOOLBOX_LM_ACTION_EXIT (0x01) +#define MPI3_TOOLBOX_LM_ACTION_READ (0x02) +#define MPI3_TOOLBOX_LM_ACTION_WRITE (0x03) +struct mpi3_lane_margin_element { + __le16 control; + __le16 status; +}; +struct mpi3_tool_lane_margin_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le32 
returned_data_length; +}; +struct mpi3_tool_recover_device_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + u8 action; + u8 reserved0d; + __le16 dev_handle; +}; +#define MPI3_TOOLBOX_RD_ACTION_START (0x01) +#define MPI3_TOOLBOX_RD_ACTION_GET_STATUS (0x02) +#define MPI3_TOOLBOX_RD_ACTION_ABORT (0x03) +struct mpi3_tool_recover_device_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + u8 status; + u8 reserved11; + __le16 reserved1c; +}; +#define MPI3_TOOLBOX_RD_STATUS_NOT_NEEDED (0x01) +#define MPI3_TOOLBOX_RD_STATUS_NEEDED (0x02) +#define MPI3_TOOLBOX_RD_STATUS_IN_PROGRESS (0x03) +#define MPI3_TOOLBOX_RD_STATUS_ABORTING (0x04) +struct mpi3_tool_loopback_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + u8 tool; + u8 reserved0b; + __le32 reserved0c; + __le64 phys; +}; +struct mpi3_tool_loopback_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le64 tested_phys; + __le64 failed_phys; +}; +struct mpi3_diag_buffer_post_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 type; + u8 reserved0d; + __le16 reserved0e; + __le64 address; + __le32 length; + __le32 reserved1c; +}; +#define MPI3_DIAG_BUFFER_POST_MSGFLAGS_SEGMENTED (0x01) +#define MPI3_DIAG_BUFFER_TYPE_TRACE (0x01) +#define MPI3_DIAG_BUFFER_TYPE_FW (0x02) +#define MPI3_DIAG_BUFFER_TYPE_DRIVER (0x10) +#define MPI3_DIAG_BUFFER_TYPE_FDL (0x20) +#define MPI3_DIAG_BUFFER_MIN_PRODUCT_SPECIFIC (0xf0) +#define MPI3_DIAG_BUFFER_MAX_PRODUCT_SPECIFIC (0xff) +struct mpi3_driver_buffer_header { + __le32 signature; + __le16 header_size; + __le16 rtt_file_header_offset; + __le32 flags; + __le32 circular_buffer_size; + __le32 logical_buffer_end; + __le32 logical_buffer_start; + __le32 ioc_use_only18[2]; + __le32 reserved20[760]; + __le32 reserved_rttrace[256]; +}; +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_SIGNATURE_CIRCULAR (0x43495243) +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_MASK (0x00000003) +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII (0x00000000) +#define MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_RTTRACE (0x00000001) +struct mpi3_diag_buffer_manage_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 type; + u8 action; + __le16 reserved0e; +}; +#define MPI3_DIAG_BUFFER_ACTION_RELEASE (0x01) +#define MPI3_DIAG_BUFFER_ACTION_PAUSE (0x02) +#define MPI3_DIAG_BUFFER_ACTION_RESUME (0x03) +struct mpi3_diag_buffer_upload_request { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 reserved0a; + u8 type; + u8 flags; + __le16 reserved0e; + __le64 context; + __le32 reserved18; + __le32 reserved1c; + union mpi3_sge_union sgl; +}; +#define MPI3_DIAG_BUFFER_UPLOAD_FLAGS_FORMAT_MASK (0x01) +#define MPI3_DIAG_BUFFER_UPLOAD_FLAGS_FORMAT_DECODED 
(0x00) +#define MPI3_DIAG_BUFFER_UPLOAD_FLAGS_FORMAT_ENCODED (0x01) +struct mpi3_diag_buffer_upload_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; + __le64 context; + __le32 returned_data_length; + __le32 reserved1c; +}; +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h new file mode 100644 index 0000000000000..fd6989c208e21 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h @@ -0,0 +1,454 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_TRANSPORT_H +#define MPI30_TRANSPORT_H 1 +struct mpi3_version_struct { + u8 dev; + u8 unit; + u8 minor; + u8 major; +}; +union mpi3_version_union { + struct mpi3_version_struct mpi3_version; + __le32 word; +}; +#define MPI3_VERSION_MAJOR (3) +#define MPI3_VERSION_MINOR (0) +#define MPI3_VERSION_UNIT (25) +#define MPI3_VERSION_DEV (0) +#define MPI3_DEVHANDLE_INVALID (0xffff) +struct mpi3_sysif_oper_queue_indexes { + __le16 producer_index; + __le16 reserved02; + __le16 consumer_index; + __le16 reserved06; +}; +struct mpi3_sysif_registers { + __le64 ioc_information; + union mpi3_version_union version; + __le32 reserved0c[2]; + __le32 ioc_configuration; + __le32 reserved18; + __le32 ioc_status; + __le32 reserved20; + __le32 admin_queue_num_entries; + __le64 admin_request_queue_address; + __le64 admin_reply_queue_address; + __le32 reserved38[2]; + __le32 coalesce_control; + __le32 reserved44[1007]; + __le16 admin_request_queue_pi; + __le16 reserved1002; + __le16 admin_reply_queue_ci; + __le16 reserved1006; + struct mpi3_sysif_oper_queue_indexes oper_queue_indexes[383]; + __le32 reserved1c00; + __le32 write_sequence; + __le32 host_diagnostic; + __le32 reserved1c0c; + __le32 fault; + __le32 fault_info[3]; + __le32 reserved1c20[4]; + __le64 hcb_address; + __le32 hcb_size; + __le32 reserved1c3c; + __le32 reply_free_host_index; + __le32 sense_buffer_free_host_index; + __le32 reserved1c48[2]; + __le64 diag_rw_data; + __le64 diag_rw_address; + __le16 diag_rw_control; + __le16 diag_rw_status; + __le32 reserved1c64[35]; + __le32 scratchpad[4]; + __le32 reserved1d00[192]; + __le32 device_assigned_registers[2048]; +}; +#define MPI3_SYSIF_IOC_INFO_LOW_OFFSET (0x00000000) +#define MPI3_SYSIF_IOC_INFO_HIGH_OFFSET (0x00000004) +#define MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_MASK (0xff000000) +#define MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_SHIFT (24) +#define MPI3_SYSIF_IOC_INFO_LOW_HCB_DISABLED (0x00000001) +#define MPI3_SYSIF_IOC_CONFIG_OFFSET (0x00000014) +#define MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ (0x00f00000) +#define MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ_SHIFT (20) +#define MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ (0x000f0000) +#define MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ_SHIFT (16) +#define MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_MASK (0x0000c000) +#define MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NO (0x00000000) +#define MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NORMAL (0x00004000) +#define MPI3_SYSIF_IOC_CONFIG_DEVICE_SHUTDOWN_SEND_REQ (0x00002000) +#define MPI3_SYSIF_IOC_CONFIG_DIAG_SAVE (0x00000010) +#define MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC (0x00000001) +#define MPI3_SYSIF_IOC_STATUS_OFFSET (0x0000001c) +#define MPI3_SYSIF_IOC_STATUS_RESET_HISTORY (0x00000010) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK (0x0000000c) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_SHIFT (0x00000002) +#define 
MPI3_SYSIF_IOC_STATUS_SHUTDOWN_NONE (0x00000000) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS (0x00000004) +#define MPI3_SYSIF_IOC_STATUS_SHUTDOWN_COMPLETE (0x00000008) +#define MPI3_SYSIF_IOC_STATUS_FAULT (0x00000002) +#define MPI3_SYSIF_IOC_STATUS_READY (0x00000001) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_OFFSET (0x00000024) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REQ_MASK (0x0fff) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REPLY_OFFSET (0x00000026) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REPLY_MASK (0x0fff0000) +#define MPI3_SYSIF_ADMIN_Q_NUM_ENTRIES_REPLY_SHIFT (16) +#define MPI3_SYSIF_ADMIN_REQ_Q_ADDR_LOW_OFFSET (0x00000028) +#define MPI3_SYSIF_ADMIN_REQ_Q_ADDR_HIGH_OFFSET (0x0000002c) +#define MPI3_SYSIF_ADMIN_REPLY_Q_ADDR_LOW_OFFSET (0x00000030) +#define MPI3_SYSIF_ADMIN_REPLY_Q_ADDR_HIGH_OFFSET (0x00000034) +#define MPI3_SYSIF_COALESCE_CONTROL_OFFSET (0x00000040) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_MASK (0xc0000000) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_NO_CHANGE (0x00000000) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_DISABLE (0x40000000) +#define MPI3_SYSIF_COALESCE_CONTROL_ENABLE_ENABLE (0xc0000000) +#define MPI3_SYSIF_COALESCE_CONTROL_VALID (0x20000000) +#define MPI3_SYSIF_COALESCE_CONTROL_MSIX_IDX_MASK (0x01ff0000) +#define MPI3_SYSIF_COALESCE_CONTROL_MSIX_IDX_SHIFT (16) +#define MPI3_SYSIF_COALESCE_CONTROL_TIMEOUT_MASK (0x0000ff00) +#define MPI3_SYSIF_COALESCE_CONTROL_TIMEOUT_SHIFT (8) +#define MPI3_SYSIF_COALESCE_CONTROL_DEPTH_MASK (0x000000ff) +#define MPI3_SYSIF_COALESCE_CONTROL_DEPTH_SHIFT (0) +#define MPI3_SYSIF_ADMIN_REQ_Q_PI_OFFSET (0x00001000) +#define MPI3_SYSIF_ADMIN_REPLY_Q_CI_OFFSET (0x00001004) +#define MPI3_SYSIF_OPER_REQ_Q_PI_OFFSET (0x00001008) +#define MPI3_SYSIF_OPER_REQ_Q_N_PI_OFFSET(N) (MPI3_SYSIF_OPER_REQ_Q_PI_OFFSET + (((N)-1)*8)) +#define MPI3_SYSIF_OPER_REPLY_Q_CI_OFFSET (0x0000100c) +#define MPI3_SYSIF_OPER_REPLY_Q_N_CI_OFFSET(N) (MPI3_SYSIF_OPER_REPLY_Q_CI_OFFSET + (((N)-1)*8)) +#define MPI3_SYSIF_WRITE_SEQUENCE_OFFSET (0x00001c04) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_MASK (0x0000000f) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_FLUSH (0x0) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_1ST (0xf) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_2ND (0x4) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_3RD (0xb) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_4TH (0x2) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_5TH (0x7) +#define MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_6TH (0xd) +#define MPI3_SYSIF_HOST_DIAG_OFFSET (0x00001c08) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_MASK (0x00000700) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_NO_RESET (0x00000000) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET (0x00000100) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_HOST_CONTROL_BOOT_RESET (0x00000200) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_COMPLETE_RESET (0x00000300) +#define MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT (0x00000700) +#define MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS (0x00000080) +#define MPI3_SYSIF_HOST_DIAG_SECURE_BOOT (0x00000040) +#define MPI3_SYSIF_HOST_DIAG_CLEAR_INVALID_FW_IMAGE (0x00000020) +#define MPI3_SYSIF_HOST_DIAG_INVALID_FW_IMAGE (0x00000010) +#define MPI3_SYSIF_HOST_DIAG_HCBENABLE (0x00000008) +#define MPI3_SYSIF_HOST_DIAG_HCBMODE (0x00000004) +#define MPI3_SYSIF_HOST_DIAG_DIAG_RW_ENABLE (0x00000002) +#define MPI3_SYSIF_HOST_DIAG_DIAG_WRITE_ENABLE (0x00000001) +#define MPI3_SYSIF_FAULT_OFFSET (0x00001c10) +#define MPI3_SYSIF_FAULT_FUNC_AREA_MASK (0xff000000) +#define MPI3_SYSIF_FAULT_FUNC_AREA_SHIFT (24) +#define 
MPI3_SYSIF_FAULT_FUNC_AREA_MPI_DEFINED (0x00000000) +#define MPI3_SYSIF_FAULT_CODE_MASK (0x0000ffff) +#define MPI3_SYSIF_FAULT_CODE_DIAG_FAULT_RESET (0x0000f000) +#define MPI3_SYSIF_FAULT_CODE_CI_ACTIVATION_RESET (0x0000f001) +#define MPI3_SYSIF_FAULT_CODE_SOFT_RESET_IN_PROGRESS (0x0000f002) +#define MPI3_SYSIF_FAULT_CODE_COMPLETE_RESET_NEEDED (0x0000f003) +#define MPI3_SYSIF_FAULT_CODE_SOFT_RESET_NEEDED (0x0000f004) +#define MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED (0x0000f005) +#define MPI3_SYSIF_FAULT_CODE_TEMP_THRESHOLD_EXCEEDED (0x0000f006) +#define MPI3_SYSIF_FAULT_INFO0_OFFSET (0x00001c14) +#define MPI3_SYSIF_FAULT_INFO1_OFFSET (0x00001c18) +#define MPI3_SYSIF_FAULT_INFO2_OFFSET (0x00001c1c) +#define MPI3_SYSIF_HCB_ADDRESS_LOW_OFFSET (0x00001c30) +#define MPI3_SYSIF_HCB_ADDRESS_HIGH_OFFSET (0x00001c34) +#define MPI3_SYSIF_HCB_SIZE_OFFSET (0x00001c38) +#define MPI3_SYSIF_HCB_SIZE_SIZE_MASK (0xfffff000) +#define MPI3_SYSIF_HCB_SIZE_SIZE_SHIFT (12) +#define MPI3_SYSIF_HCB_SIZE_HCDW_ENABLE (0x00000001) +#define MPI3_SYSIF_REPLY_FREE_HOST_INDEX_OFFSET (0x00001c40) +#define MPI3_SYSIF_SENSE_BUF_FREE_HOST_INDEX_OFFSET (0x00001c44) +#define MPI3_SYSIF_DIAG_RW_DATA_LOW_OFFSET (0x00001c50) +#define MPI3_SYSIF_DIAG_RW_DATA_HIGH_OFFSET (0x00001c54) +#define MPI3_SYSIF_DIAG_RW_ADDRESS_LOW_OFFSET (0x00001c58) +#define MPI3_SYSIF_DIAG_RW_ADDRESS_HIGH_OFFSET (0x00001c5c) +#define MPI3_SYSIF_DIAG_RW_CONTROL_OFFSET (0x00001c60) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_MASK (0x00000030) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_1BYTE (0x00000000) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_2BYTES (0x00000010) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_4BYTES (0x00000020) +#define MPI3_SYSIF_DIAG_RW_CONTROL_LEN_8BYTES (0x00000030) +#define MPI3_SYSIF_DIAG_RW_CONTROL_RESET (0x00000004) +#define MPI3_SYSIF_DIAG_RW_CONTROL_DIR_MASK (0x00000002) +#define MPI3_SYSIF_DIAG_RW_CONTROL_DIR_READ (0x00000000) +#define MPI3_SYSIF_DIAG_RW_CONTROL_DIR_WRITE (0x00000002) +#define MPI3_SYSIF_DIAG_RW_CONTROL_START (0x00000001) +#define MPI3_SYSIF_DIAG_RW_STATUS_OFFSET (0x00001c62) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_MASK (0x0000000e) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_SUCCESS (0x00000000) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_INV_ADDR (0x00000002) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_ACC_ERR (0x00000004) +#define MPI3_SYSIF_DIAG_RW_STATUS_STATUS_PAR_ERR (0x00000006) +#define MPI3_SYSIF_DIAG_RW_STATUS_BUSY (0x00000001) +#define MPI3_SYSIF_SCRATCHPAD0_OFFSET (0x00001cf0) +#define MPI3_SYSIF_SCRATCHPAD1_OFFSET (0x00001cf4) +#define MPI3_SYSIF_SCRATCHPAD2_OFFSET (0x00001cf8) +#define MPI3_SYSIF_SCRATCHPAD3_OFFSET (0x00001cfc) +#define MPI3_SYSIF_DEVICE_ASSIGNED_REGS_OFFSET (0x00002000) +#define MPI3_SYSIF_DIAG_SAVE_TIMEOUT (60) +struct mpi3_default_reply_descriptor { + __le32 descriptor_type_dependent1[2]; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 descriptor_type_dependent2; + __le16 reply_flags; +}; +#define MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK (0x0001) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_MASK (0xf000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_ADDRESS_REPLY (0x0000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_SUCCESS (0x1000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_TARGET_COMMAND_BUFFER (0x2000) +#define MPI3_REPLY_DESCRIPT_FLAGS_TYPE_STATUS (0x3000) +#define MPI3_REPLY_DESCRIPT_REQUEST_QUEUE_ID_INVALID (0xffff) +struct mpi3_address_reply_descriptor { + __le64 reply_frame_address; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 reserved0c; + __le16 reply_flags; +}; +struct 
mpi3_success_reply_descriptor { + __le32 reserved00[2]; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 host_tag; + __le16 reply_flags; +}; +struct mpi3_target_command_buffer_reply_descriptor { + __le32 reserved00; + __le16 initiator_dev_handle; + u8 phy_num; + u8 reserved07; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 io_index; + __le16 reply_flags; +}; +struct mpi3_status_reply_descriptor { + __le16 ioc_status; + __le16 reserved02; + __le32 ioc_log_info; + __le16 request_queue_ci; + __le16 request_queue_id; + __le16 host_tag; + __le16 reply_flags; +}; +#define MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL (0x8000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK (0x7fff) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_TYPE_MASK (0xf0000000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_TYPE_NO_INFO (0x00000000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_TYPE_SAS (0x30000000) +#define MPI3_REPLY_DESCRIPT_STATUS_IOCLOGINFO_DATA_MASK (0x0fffffff) +union mpi3_reply_descriptors_union { + struct mpi3_default_reply_descriptor default_reply; + struct mpi3_address_reply_descriptor address_reply; + struct mpi3_success_reply_descriptor success; + struct mpi3_target_command_buffer_reply_descriptor target_command_buffer; + struct mpi3_status_reply_descriptor status; + __le32 words[4]; +}; +struct mpi3_sge_common { + __le64 address; + __le32 length; + u8 reserved0c[3]; + u8 flags; +}; +struct mpi3_sge_bit_bucket { + __le64 reserved00; + __le32 length; + u8 reserved0c[3]; + u8 flags; +}; +struct mpi3_sge_extended_eedp { + u8 user_data_size; + u8 reserved01; + __le16 eedp_flags; + __le32 secondary_reference_tag; + __le16 secondary_application_tag; + __le16 application_tag_translation_mask; + __le16 reserved0c; + u8 extended_operation; + u8 flags; +}; +union mpi3_sge_union { + struct mpi3_sge_common simple; + struct mpi3_sge_common chain; + struct mpi3_sge_common last_chain; + struct mpi3_sge_bit_bucket bit_bucket; + struct mpi3_sge_extended_eedp eedp; + __le32 words[4]; +}; +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_MASK (0xf0) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE (0x00) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_BIT_BUCKET (0x10) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_CHAIN (0x20) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_LAST_CHAIN (0x30) +#define MPI3_SGE_FLAGS_ELEMENT_TYPE_EXTENDED (0xf0) +#define MPI3_SGE_FLAGS_END_OF_LIST (0x08) +#define MPI3_SGE_FLAGS_END_OF_BUFFER (0x04) +#define MPI3_SGE_FLAGS_DLAS_MASK (0x03) +#define MPI3_SGE_FLAGS_DLAS_SYSTEM (0x00) +#define MPI3_SGE_FLAGS_DLAS_IOC_UDP (0x01) +#define MPI3_SGE_FLAGS_DLAS_IOC_CTL (0x02) +#define MPI3_SGE_EXT_OPER_EEDP (0x00) +#define MPI3_EEDPFLAGS_INCR_PRI_REF_TAG (0x8000) +#define MPI3_EEDPFLAGS_INCR_SEC_REF_TAG (0x4000) +#define MPI3_EEDPFLAGS_INCR_PRI_APP_TAG (0x2000) +#define MPI3_EEDPFLAGS_INCR_SEC_APP_TAG (0x1000) +#define MPI3_EEDPFLAGS_ESC_PASSTHROUGH (0x0800) +#define MPI3_EEDPFLAGS_CHK_REF_TAG (0x0400) +#define MPI3_EEDPFLAGS_CHK_APP_TAG (0x0200) +#define MPI3_EEDPFLAGS_CHK_GUARD (0x0100) +#define MPI3_EEDPFLAGS_ESC_MODE_MASK (0x00c0) +#define MPI3_EEDPFLAGS_ESC_MODE_DO_NOT_DISABLE (0x0040) +#define MPI3_EEDPFLAGS_ESC_MODE_APPTAG_DISABLE (0x0080) +#define MPI3_EEDPFLAGS_ESC_MODE_APPTAG_REFTAG_DISABLE (0x00c0) +#define MPI3_EEDPFLAGS_HOST_GUARD_MASK (0x0030) +#define MPI3_EEDPFLAGS_HOST_GUARD_T10_CRC (0x0000) +#define MPI3_EEDPFLAGS_HOST_GUARD_IP_CHKSUM (0x0010) +#define MPI3_EEDPFLAGS_HOST_GUARD_OEM_SPECIFIC (0x0020) +#define MPI3_EEDPFLAGS_PT_REF_TAG (0x0008) +#define 
MPI3_EEDPFLAGS_EEDP_OP_MASK (0x0007) +#define MPI3_EEDPFLAGS_EEDP_OP_CHECK (0x0001) +#define MPI3_EEDPFLAGS_EEDP_OP_STRIP (0x0002) +#define MPI3_EEDPFLAGS_EEDP_OP_CHECK_REMOVE (0x0003) +#define MPI3_EEDPFLAGS_EEDP_OP_INSERT (0x0004) +#define MPI3_EEDPFLAGS_EEDP_OP_REPLACE (0x0006) +#define MPI3_EEDPFLAGS_EEDP_OP_CHECK_REGEN (0x0007) +#define MPI3_EEDP_UDS_512 (0x01) +#define MPI3_EEDP_UDS_520 (0x02) +#define MPI3_EEDP_UDS_4080 (0x03) +#define MPI3_EEDP_UDS_4088 (0x04) +#define MPI3_EEDP_UDS_4096 (0x05) +#define MPI3_EEDP_UDS_4104 (0x06) +#define MPI3_EEDP_UDS_4160 (0x07) +struct mpi3_request_header { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 change_count; + __le16 function_dependent; +}; +struct mpi3_default_reply { + __le16 host_tag; + u8 ioc_use_only02; + u8 function; + __le16 ioc_use_only04; + u8 ioc_use_only06; + u8 msg_flags; + __le16 ioc_use_only08; + __le16 ioc_status; + __le32 ioc_log_info; +}; +#define MPI3_HOST_TAG_INVALID (0xffff) +#define MPI3_FUNCTION_IOC_FACTS (0x01) +#define MPI3_FUNCTION_IOC_INIT (0x02) +#define MPI3_FUNCTION_PORT_ENABLE (0x03) +#define MPI3_FUNCTION_EVENT_NOTIFICATION (0x04) +#define MPI3_FUNCTION_EVENT_ACK (0x05) +#define MPI3_FUNCTION_CI_DOWNLOAD (0x06) +#define MPI3_FUNCTION_CI_UPLOAD (0x07) +#define MPI3_FUNCTION_IO_UNIT_CONTROL (0x08) +#define MPI3_FUNCTION_PERSISTENT_EVENT_LOG (0x09) +#define MPI3_FUNCTION_MGMT_PASSTHROUGH (0x0a) +#define MPI3_FUNCTION_CONFIG (0x10) +#define MPI3_FUNCTION_SCSI_IO (0x20) +#define MPI3_FUNCTION_SCSI_TASK_MGMT (0x21) +#define MPI3_FUNCTION_SMP_PASSTHROUGH (0x22) +#define MPI3_FUNCTION_NVME_ENCAPSULATED (0x24) +#define MPI3_FUNCTION_TARGET_ASSIST (0x30) +#define MPI3_FUNCTION_TARGET_STATUS_SEND (0x31) +#define MPI3_FUNCTION_TARGET_MODE_ABORT (0x32) +#define MPI3_FUNCTION_TARGET_CMD_BUF_POST_BASE (0x33) +#define MPI3_FUNCTION_TARGET_CMD_BUF_POST_LIST (0x34) +#define MPI3_FUNCTION_CREATE_REQUEST_QUEUE (0x70) +#define MPI3_FUNCTION_DELETE_REQUEST_QUEUE (0x71) +#define MPI3_FUNCTION_CREATE_REPLY_QUEUE (0x72) +#define MPI3_FUNCTION_DELETE_REPLY_QUEUE (0x73) +#define MPI3_FUNCTION_TOOLBOX (0x80) +#define MPI3_FUNCTION_DIAG_BUFFER_POST (0x81) +#define MPI3_FUNCTION_DIAG_BUFFER_MANAGE (0x82) +#define MPI3_FUNCTION_DIAG_BUFFER_UPLOAD (0x83) +#define MPI3_FUNCTION_MIN_IOC_USE_ONLY (0xc0) +#define MPI3_FUNCTION_MAX_IOC_USE_ONLY (0xef) +#define MPI3_FUNCTION_MIN_PRODUCT_SPECIFIC (0xf0) +#define MPI3_FUNCTION_MAX_PRODUCT_SPECIFIC (0xff) +#define MPI3_IOCSTATUS_LOG_INFO_AVAIL_MASK (0x8000) +#define MPI3_IOCSTATUS_LOG_INFO_AVAILABLE (0x8000) +#define MPI3_IOCSTATUS_STATUS_MASK (0x7fff) +#define MPI3_IOCSTATUS_SUCCESS (0x0000) +#define MPI3_IOCSTATUS_INVALID_FUNCTION (0x0001) +#define MPI3_IOCSTATUS_BUSY (0x0002) +#define MPI3_IOCSTATUS_INVALID_SGL (0x0003) +#define MPI3_IOCSTATUS_INTERNAL_ERROR (0x0004) +#define MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES (0x0006) +#define MPI3_IOCSTATUS_INVALID_FIELD (0x0007) +#define MPI3_IOCSTATUS_INVALID_STATE (0x0008) +#define MPI3_IOCSTATUS_INSUFFICIENT_POWER (0x000a) +#define MPI3_IOCSTATUS_INVALID_CHANGE_COUNT (0x000b) +#define MPI3_IOCSTATUS_ALLOWED_CMD_BLOCK (0x000c) +#define MPI3_IOCSTATUS_SUPERVISOR_ONLY (0x000d) +#define MPI3_IOCSTATUS_FAILURE (0x001f) +#define MPI3_IOCSTATUS_CONFIG_INVALID_ACTION (0x0020) +#define MPI3_IOCSTATUS_CONFIG_INVALID_TYPE (0x0021) +#define MPI3_IOCSTATUS_CONFIG_INVALID_PAGE (0x0022) +#define MPI3_IOCSTATUS_CONFIG_INVALID_DATA (0x0023) +#define MPI3_IOCSTATUS_CONFIG_NO_DEFAULTS (0x0024) 
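The ioc_status word defined for struct mpi3_default_reply above packs a log-info-available flag into bit 15 and the actual status code into the low 15 bits; ioc_log_info is only meaningful when that flag is set. As a minimal sketch only, assuming the usual kernel types and byte-order helpers are already in scope, and with the helper name and calling convention invented purely for illustration (they are not part of this header or this patch), a caller might split the two like this:

/*
 * Illustrative sketch, not from the original header: separate the
 * status code from the log-info-available flag in a default reply.
 * Uses only the MPI3_IOCSTATUS_* masks defined above.
 */
static inline bool example_mpi3_reply_is_success(const struct mpi3_default_reply *reply,
						 u32 *log_info)
{
	u16 ioc_status = le16_to_cpu(reply->ioc_status);

	/* ioc_log_info is valid only when the firmware sets bit 15 */
	if (log_info && (ioc_status & MPI3_IOCSTATUS_LOG_INFO_AVAILABLE))
		*log_info = le32_to_cpu(reply->ioc_log_info);

	return (ioc_status & MPI3_IOCSTATUS_STATUS_MASK) == MPI3_IOCSTATUS_SUCCESS;
}

The MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_* masks earlier in this file apply the same 1-bit/15-bit split to the ioc_status carried in a status reply descriptor.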
+#define MPI3_IOCSTATUS_CONFIG_CANT_COMMIT (0x0025) +#define MPI3_IOCSTATUS_SCSI_RECOVERED_ERROR (0x0040) +#define MPI3_IOCSTATUS_SCSI_TM_NOT_SUPPORTED (0x0041) +#define MPI3_IOCSTATUS_SCSI_INVALID_DEVHANDLE (0x0042) +#define MPI3_IOCSTATUS_SCSI_DEVICE_NOT_THERE (0x0043) +#define MPI3_IOCSTATUS_SCSI_DATA_OVERRUN (0x0044) +#define MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN (0x0045) +#define MPI3_IOCSTATUS_SCSI_IO_DATA_ERROR (0x0046) +#define MPI3_IOCSTATUS_SCSI_PROTOCOL_ERROR (0x0047) +#define MPI3_IOCSTATUS_SCSI_TASK_TERMINATED (0x0048) +#define MPI3_IOCSTATUS_SCSI_RESIDUAL_MISMATCH (0x0049) +#define MPI3_IOCSTATUS_SCSI_TASK_MGMT_FAILED (0x004a) +#define MPI3_IOCSTATUS_SCSI_IOC_TERMINATED (0x004b) +#define MPI3_IOCSTATUS_SCSI_EXT_TERMINATED (0x004c) +#define MPI3_IOCSTATUS_EEDP_GUARD_ERROR (0x004d) +#define MPI3_IOCSTATUS_EEDP_REF_TAG_ERROR (0x004e) +#define MPI3_IOCSTATUS_EEDP_APP_TAG_ERROR (0x004f) +#define MPI3_IOCSTATUS_TARGET_INVALID_IO_INDEX (0x0062) +#define MPI3_IOCSTATUS_TARGET_ABORTED (0x0063) +#define MPI3_IOCSTATUS_TARGET_NO_CONN_RETRYABLE (0x0064) +#define MPI3_IOCSTATUS_TARGET_NO_CONNECTION (0x0065) +#define MPI3_IOCSTATUS_TARGET_XFER_COUNT_MISMATCH (0x006a) +#define MPI3_IOCSTATUS_TARGET_DATA_OFFSET_ERROR (0x006d) +#define MPI3_IOCSTATUS_TARGET_TOO_MUCH_WRITE_DATA (0x006e) +#define MPI3_IOCSTATUS_TARGET_IU_TOO_SHORT (0x006f) +#define MPI3_IOCSTATUS_TARGET_ACK_NAK_TIMEOUT (0x0070) +#define MPI3_IOCSTATUS_TARGET_NAK_RECEIVED (0x0071) +#define MPI3_IOCSTATUS_SAS_SMP_REQUEST_FAILED (0x0090) +#define MPI3_IOCSTATUS_SAS_SMP_DATA_OVERRUN (0x0091) +#define MPI3_IOCSTATUS_DIAGNOSTIC_RELEASED (0x00a0) +#define MPI3_IOCSTATUS_CI_UNSUPPORTED (0x00b0) +#define MPI3_IOCSTATUS_CI_UPDATE_SEQUENCE (0x00b1) +#define MPI3_IOCSTATUS_CI_VALIDATION_FAILED (0x00b2) +#define MPI3_IOCSTATUS_CI_KEY_UPDATE_PENDING (0x00b3) +#define MPI3_IOCSTATUS_CI_KEY_UPDATE_NOT_POSSIBLE (0x00b4) +#define MPI3_IOCSTATUS_SECURITY_KEY_REQUIRED (0x00c0) +#define MPI3_IOCSTATUS_SECURITY_VIOLATION (0x00c1) +#define MPI3_IOCSTATUS_INVALID_QUEUE_ID (0x0f00) +#define MPI3_IOCSTATUS_INVALID_QUEUE_SIZE (0x0f01) +#define MPI3_IOCSTATUS_INVALID_MSIX_VECTOR (0x0f02) +#define MPI3_IOCSTATUS_INVALID_REPLY_QUEUE_ID (0x0f03) +#define MPI3_IOCSTATUS_INVALID_QUEUE_DELETION (0x0f04) +#define MPI3_IOCLOGINFO_TYPE_MASK (0xf0000000) +#define MPI3_IOCLOGINFO_TYPE_SHIFT (28) +#define MPI3_IOCLOGINFO_TYPE_NONE (0x0) +#define MPI3_IOCLOGINFO_TYPE_SAS (0x3) +#define MPI3_IOCLOGINFO_LOG_DATA_MASK (0x0fffffff) +#endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_type.h b/drivers/scsi/mpi3mr/mpi/mpi30_type.h new file mode 100644 index 0000000000000..36ec6a76d1a97 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi/mpi30_type.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016-2022 Broadcom Inc. All rights reserved. + */ +#ifndef MPI30_TYPE_H +#define MPI30_TYPE_H 1 +#endif diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h new file mode 100644 index 0000000000000..f668f4b8ef9d8 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -0,0 +1,1536 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. 
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#ifndef MPI3MR_H_INCLUDED +#define MPI3MR_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR > 2)) || \ + (defined(CONFIG_SUSE_KERNEL) && \ + ((CONFIG_SUSE_VERSION == 15) && (CONFIG_SUSE_PATCHLEVEL >= 3))) || \ + (LINUX_VERSION_CODE > KERNEL_VERSION(5, 4, 0))) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "mpi/mpi30_transport.h" +#include "mpi/mpi30_cnfg.h" +#include "mpi/mpi30_image.h" +#include "mpi/mpi30_init.h" +#include "mpi/mpi30_ioc.h" +#include "mpi/mpi30_sas.h" +#include "mpi/mpi30_tool.h" +#include "mpi/mpi30_pci.h" +#include "mpi3mr_debug.h" +#include "mpi3mr_kernel_compat.h" + +/* Global list and lock for storing multiple adapters managed by the driver */ +extern spinlock_t mrioc_list_lock; +extern struct list_head mrioc_list; +extern atomic64_t event_counter; + +#define MPI3MR_DRIVER_VERSION "8.2.1.0.0" +#define MPI3MR_DRIVER_RELDATE "03-August-2022" + +#define MPI3MR_DRIVER_NAME "mpi3mr" +#define MPI3MR_DRIVER_LICENSE "GPL" +#define MPI3MR_DRIVER_AUTHOR "Broadcom Inc. " +#define MPI3MR_DRIVER_DESC "MPI3 Storage Controller Device Driver" + +#define MPI3MR_NAME_LENGTH 32 +#define IOCNAME "%s: " + +/* + * The maximum transfer size supported in single I/O is 1M and that is 2K + * in 512-byte sized sectors +*/ +#define MPI3MR_MAX_SECTORS 2048 + +/* Definitions for internal SGL and Chain SGL buffers */ +#define MPI3MR_PAGE_SIZE_4K 4096 +#define MPI3MR_CHAINSGE_SIZE MPI3MR_PAGE_SIZE_4K +#define MPI3MR_SG_DEPTH (MPI3MR_PAGE_SIZE_4K / \ + sizeof(struct mpi3_sge_common)) + +/* Definitions for MAX values for shost */ +#define MPI3MR_MAX_CMDS_LUN 128 +#define MPI3MR_MAX_CDB_LENGTH 32 + +/* Admin queue management definitions */ +#define MPI3MR_ADMIN_REQ_Q_SIZE (2 * MPI3MR_PAGE_SIZE_4K) +#define MPI3MR_ADMIN_REPLY_Q_SIZE (4 * MPI3MR_PAGE_SIZE_4K) +#define MPI3MR_ADMIN_REQ_FRAME_SZ 128 +#define MPI3MR_ADMIN_REPLY_FRAME_SZ 16 + +/* Operational queue management definitions */ +#define MPI3MR_OP_REQ_Q_QD 512 +#define MPI3MR_OP_REP_Q_QD 1024 +#define MPI3MR_OP_REP_Q_QD4K 4096 +#define MPI3MR_OP_REQ_Q_SEG_SIZE 4096 +#define MPI3MR_OP_REP_Q_SEG_SIZE 4096 +#define MPI3MR_MAX_SEG_LIST_SIZE 4096 + +/* Reserved Host Tag definitions */ +#define MPI3MR_HOSTTAG_INVALID 0xFFFF +#define MPI3MR_HOSTTAG_INITCMDS 1 +#define MPI3MR_HOSTTAG_BSG_CMDS 2 +#define MPI3MR_HOSTTAG_PEL_ABORT 3 +#define MPI3MR_HOSTTAG_PEL_WAIT 4 +#define MPI3MR_HOSTTAG_BLK_TMS 5 +#define MPI3MR_HOSTTAG_CFG_CMDS 6 +#define MPI3MR_HOSTTAG_TRANSPORT_CMDS 7 + +#define MPI3MR_NUM_DEVRMCMD 16 +#define MPI3MR_HOSTTAG_DEVRMCMD_MIN (MPI3MR_HOSTTAG_TRANSPORT_CMDS + 1) +#define MPI3MR_HOSTTAG_DEVRMCMD_MAX (MPI3MR_HOSTTAG_DEVRMCMD_MIN + \ + MPI3MR_NUM_DEVRMCMD - 1) + +#define MPI3MR_NUM_EVTACKCMD 4 +#define MPI3MR_HOSTTAG_EVTACKCMD_MIN (MPI3MR_HOSTTAG_DEVRMCMD_MAX + 1) +#define MPI3MR_HOSTTAG_EVTACKCMD_MAX (MPI3MR_HOSTTAG_EVTACKCMD_MIN + \ + MPI3MR_NUM_EVTACKCMD - 1) + +#define MPI3MR_NUM_SYSFS_TM 32 +#define MPI3MR_HOSTTAG_SYSFS_TM_MIN (MPI3MR_HOSTTAG_EVTACKCMD_MAX + 1) +#define MPI3MR_HOSTTAG_SYSFS_TM_MAX (MPI3MR_HOSTTAG_SYSFS_TM_MIN + \ + MPI3MR_NUM_SYSFS_TM - 1) + +#define MPI3MR_INTERNALCMDS_RESVD MPI3MR_HOSTTAG_SYSFS_TM_MAX + +/* Reduced resource count 
definition for crash kernel */ +#define MPI3MR_HOST_IOS_KDUMP 128 + +/* command/controller interaction timeout definitions in seconds */ +#define MPI3MR_INTADMCMD_TIMEOUT 60 +#define MPI3MR_PORTENABLE_TIMEOUT 300 +#define MPI3MR_PORTENABLE_POLL_INTERVAL 5 +#define MPI3MR_ABORTTM_TIMEOUT 60 +#define MPI3MR_RESETTM_TIMEOUT 60 +#define MPI3MR_TSUPDATE_INTERVAL 900 +#define MPI3MR_DEFAULT_SHUTDOWN_TIME 120 +#define MPI3MR_RAID_ERRREC_RESET_TIMEOUT 180 +#define MPI3MR_RESET_HOST_IOWAIT_TIMEOUT 5 +#define MPI3MR_PREPARE_FOR_RESET_TIMEOUT 180 +#define MPI3MR_RESET_ACK_TIMEOUT 30 + +#define MPI3MR_RESET_TOPOLOGY_SETTLE_TIME 10 + +#define MPI3MR_SCMD_TIMEOUT (60 * HZ) +#define MPI3MR_EH_SCMD_TIMEOUT (60 * HZ) + +#define MPI3MR_WATCHDOG_INTERVAL 1000 /* in milli seconds */ + +#define MPI3MR_DEFAULT_CFG_PAGE_SZ 1024 /*bytes*/ + +/* Internal admin command state definitions*/ +#define MPI3MR_CMD_NOTUSED 0x8000 +#define MPI3MR_CMD_COMPLETE 0x0001 +#define MPI3MR_CMD_PENDING 0x0002 +#define MPI3MR_CMD_REPLY_VALID 0x0004 +#define MPI3MR_CMD_RESET 0x0008 + +/* Definitions for Event replies and sense buffer allocated per controller */ +#define MPI3MR_NUM_EVT_REPLIES 32 +#define MPI3MR_SENSE_BUF_SZ 256 +#define MPI3MR_SENSEBUF_FACTOR 3 + +/* Invalid target device handle */ +#define MPI3MR_INVALID_DEV_HANDLE 0xFFFF + +/* Controller Reset related definitions */ +#define MPI3MR_HOSTDIAG_UNLOCK_RETRY_COUNT 5 +#define MPI3MR_MAX_RESET_RETRY_COUNT 3 + +#define MPI3MR_RI_MASK_RESPCODE (0x000000FF) + +#define MPI3MR_DEFAULT_MDTS (128 * 1024) +#define MPI3MR_DEFAULT_PGSZEXP (12) +#define MPI3MR_MAX_APP_XFER_SIZE (1 * 1024 * 1024) +#define MPI3MR_MAX_APP_XFER_SECTORS (2048 + 512) +#define MPI3MR_MAX_APP_XFER_SEGMENTS 512 + + +/* Command retry count definitions */ +#define MPI3MR_DEV_RMHS_RETRY_COUNT 3 +#define MPI3MR_PEL_RETRY_COUNT 3 + +/* Default target device queue depth */ +#define MPI3MR_DEFAULT_SDEV_QD 32 + +/* Definitions for Threaded IRQ poll*/ +#define MPI3MR_IRQ_POLL_SLEEP 2 +#define MPI3MR_IRQ_POLL_TRIGGER_IOCOUNT 8 + +/* Definitions for the controller security status*/ +#define MPI3MR_CTLR_SECURITY_STATUS_MASK 0x0C +#define MPI3MR_CTLR_SECURE_DBG_STATUS_MASK 0x02 + +#define MPI3MR_INVALID_DEVICE 0x00 +#define MPI3MR_CONFIG_SECURE_DEVICE 0x04 +#define MPI3MR_HARD_SECURE_DEVICE 0x08 +#define MPI3MR_TAMPERED_DEVICE 0x0C + +#define MPI3MR_DEFAULT_HDB_MAX_SZ (4 * 1024 * 1024) +#define MPI3MR_DEFAULT_HDB_DEC_SZ (1 * 1024 * 1024) +#define MPI3MR_DEFAULT_HDB_MIN_SZ (2 * 1024 * 1024) +#define MPI3MR_MAX_NUM_HDB 2 + +/* Driver Host Diag Buffer (drv_db) */ +#define MPI3MR_MIN_DIAG_HOST_BUFFER_SZ (32 * 1024) + \ + sizeof(struct mpi3_driver_buffer_header) +#define MPI3MR_DEFAULT_DIAG_HOST_BUFFER_SZ (512 * 1024) + \ + sizeof(struct mpi3_driver_buffer_header) +#define MPI3MR_UEFI_DIAG_HOST_BUFFER_OFFSET (16 * 1024) + +/* SGE Flag definition */ +#define MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST \ + (MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE | MPI3_SGE_FLAGS_DLAS_SYSTEM | \ + MPI3_SGE_FLAGS_END_OF_LIST) + +/* MSI Index from Reply Queue Index */ +#define REPLY_QUEUE_IDX_TO_MSIX_IDX(qidx, offset) (qidx + offset) + +/* HBA port flags*/ +#define MPI3MR_HBA_PORT_FLAG_DIRTY 0x01 +#define MPI3MR_HBA_PORT_FLAG_NEW 0x02 + +/* Driver diag buffer levels */ +enum mpi3mr_drv_db_level { + MRIOC_DRV_DB_DISABLED = 0, + MRIOC_DRV_DB_MINI = 1, + MRIOC_DRV_DB_FULL = 2, +}; +/* IOC State definitions */ +enum mpi3mr_iocstate { + MRIOC_STATE_READY = 1, + MRIOC_STATE_RESET, + MRIOC_STATE_FAULT, + MRIOC_STATE_BECOMING_READY, + 
MRIOC_STATE_RESET_REQUESTED, + MRIOC_STATE_UNRECOVERABLE, +}; + +/* Reset reason code definitions*/ +enum mpi3mr_reset_reason { + MPI3MR_RESET_FROM_BRINGUP = 1, + MPI3MR_RESET_FROM_FAULT_WATCH = 2, + MPI3MR_RESET_FROM_APP = 3, + MPI3MR_RESET_FROM_EH_HOS = 4, + MPI3MR_RESET_FROM_TM_TIMEOUT = 5, + MPI3MR_RESET_FROM_APP_TIMEOUT = 6, + MPI3MR_RESET_FROM_MUR_FAILURE = 7, + MPI3MR_RESET_FROM_CTLR_CLEANUP = 8, + MPI3MR_RESET_FROM_CIACTIV_FAULT = 9, + MPI3MR_RESET_FROM_PE_TIMEOUT = 10, + MPI3MR_RESET_FROM_TSU_TIMEOUT = 11, + MPI3MR_RESET_FROM_DELREQQ_TIMEOUT = 12, + MPI3MR_RESET_FROM_DELREPQ_TIMEOUT = 13, + MPI3MR_RESET_FROM_CREATEREPQ_TIMEOUT = 14, + MPI3MR_RESET_FROM_CREATEREQQ_TIMEOUT = 15, + MPI3MR_RESET_FROM_IOCFACTS_TIMEOUT = 16, + MPI3MR_RESET_FROM_IOCINIT_TIMEOUT = 17, + MPI3MR_RESET_FROM_EVTNOTIFY_TIMEOUT = 18, + MPI3MR_RESET_FROM_EVTACK_TIMEOUT = 19, + MPI3MR_RESET_FROM_CIACTVRST_TIMER = 20, + MPI3MR_RESET_FROM_GETPKGVER_TIMEOUT = 21, + MPI3MR_RESET_FROM_PELABORT_TIMEOUT = 22, + MPI3MR_RESET_FROM_SYSFS = 23, + MPI3MR_RESET_FROM_SYSFS_TIMEOUT = 24, + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT = 25, + MPI3MR_RESET_FROM_DIAG_BUFFER_RELEASE_TIMEOUT = 26, + MPI3MR_RESET_FROM_FIRMWARE = 27, + MPI3MR_RESET_FROM_DIAG_BUFFER_UPLOAD_TIMEOUT = 28, + MPI3MR_RESET_FROM_CFG_REQ_TIMEOUT = 29, + MPI3MR_RESET_FROM_SAS_TRANSPORT_TIMEOUT = 30, + MPI3MR_RESET_FROM_TRIGGER = 31, +}; + +/* Queue type definitions */ +enum queue_type { + MPI3MR_DEFAULT_QUEUE = 0, + MPI3MR_POLL_QUEUE, +}; + +/** + * struct mpi3mr_compimg_ver - replica of component image + * version defined in mpi30_image.h in host endianness + * + */ +struct mpi3mr_compimg_ver { + u16 build_num; + u16 cust_id; + u8 ph_minor; + u8 ph_major; + u8 gen_minor; + u8 gen_major; +}; + +/** + * struct mpi3mr_ioc_facts - replica of IOC facts data defined + * in mpi30_ioc.h in host endianness + * + */ +struct mpi3mr_ioc_facts { + u32 ioc_capabilities; + struct mpi3mr_compimg_ver fw_ver; + u32 mpi_version; + u32 diag_trace_sz; + u32 diag_fw_sz; + u32 diag_drvr_sz; + u16 max_reqs; + u16 product_id; + u16 op_req_sz; + u16 reply_sz; + u16 exceptions; + u16 max_perids; + u16 max_sasexpanders; + u16 max_sasinitiators; + u16 max_enclosures; + u16 max_pcie_switches; + u16 max_nvme; + u16 max_vds; + u16 max_hpds; + u16 max_advhpds; + u16 max_raid_pds; + u16 min_devhandle; + u16 max_devhandle; + u16 max_op_req_q; + u16 max_op_reply_q; + u16 shutdown_timeout; + u16 max_msix_vectors; + u8 ioc_num; + u8 who_init; + u8 personality; + u8 dma_mask; + u8 protocol_flags; + u8 sge_mod_mask; + u8 sge_mod_value; + u8 sge_mod_shift; + u8 max_dev_per_tg; + u16 max_io_throttle_group; + u16 io_throttle_data_length; + u16 io_throttle_low; + u16 io_throttle_high; + +}; + +/** + * struct mpi3mr_fwevt - Firmware event structure. 
+ * + * @list: list head + * @work: Work structure + * @mrioc: Adapter instance reference + * @event_id: MPI3 firmware event ID + * @send_ack: Event acknowledgment required or not + * @process_event: Bottomhalf processing required or not + * @event_context: Event context to send in Ack + * @event_data_size: size of the event data in bytes + * @pending_at_sml: waiting for device add/remove API to complete + * @discard: discard this event + * @ref_count: kref count + * @event_data: Actual MPI3 event data + */ +struct mpi3mr_fwevt { + struct list_head list; + struct work_struct work; + struct mpi3mr_ioc *mrioc; + u16 event_id; + bool send_ack; + bool process_event; + u32 event_context; + u16 event_data_size; + bool pending_at_sml; + bool discard; + struct kref ref_count; + char event_data[0] __aligned(4); +}; + +/** + * struct segments - memory descriptor structure to store + * virtual and dma addresses for operational queue segments. + * + * @segment: virtual address + * @segment_dma: dma address + */ +struct segments { + void *segment; + dma_addr_t segment_dma; +}; + +/** + * struct op_req_qinfo - Operational Request Queue Information + * + * @ci: consumer index + * @pi: producer index + * @num_request: Maximum number of entries in the queue + * @qid: Queue Id starting from 1 + * @reply_qid: Associated reply queue Id + * @num_segments: Number of discontiguous memory segments + * @segment_qd: Depth of each segments + * @q_lock: Concurrent queue access lock + * @q_segments: Segment descriptor pointer + * @q_segment_list: Segment list base virtual address + * @q_segment_list_dma: Segment list base DMA address + * @last_full_host_tag: hosttag of last IO returned to SML + * due to queue full + * @qfull_io_count: Number of IOs returned back to SML + * due to queue full + * @qfull_instances: Total queue full occurences. One occurence + * starts with queue full detection and ends + * with queue full breaks. 
+ * @dbgfs_req_queue: Per request queue debugfs directory + */ +struct op_req_qinfo { + u16 ci; + u16 pi; + u16 num_requests; + u16 qid; + u16 reply_qid; + u16 num_segments; + u16 segment_qd; + spinlock_t q_lock; + struct segments *q_segments; + void *q_segment_list; + dma_addr_t q_segment_list_dma; + + u16 last_full_host_tag; + u64 qfull_io_count; + u32 qfull_instances; +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_req_queue; +#endif +}; + +/** + * struct op_reply_qinfo - Operational Reply Queue Information + * + * @ci: consumer index + * @qid: Queue Id starting from 1 + * @num_replies: Maximum number of entries in the queue + * @num_segments: Number of discontiguous memory segments + * @segment_qd: Depth of each segments + * @q_segments: Segment descriptor pointer + * @q_segment_list: Segment list base virtual address + * @q_segment_list_dma: Segment list base DMA address + * @ephase: Expected phased identifier for the reply queue + * @pend_ios: Number of IOs pending in HW for this queue + * @enable_irq_poll: Flag to indicate polling is enabled + * @in_use: Queue is handled by poll/ISR + * @qtype: Type of queue (types defined in enum queue_type) + */ +struct op_reply_qinfo { + u16 ci; + u16 qid; + u16 num_replies; + u16 num_segments; + u16 segment_qd; + struct segments *q_segments; + void *q_segment_list; + dma_addr_t q_segment_list_dma; + u8 ephase; + atomic_t pend_ios; + bool enable_irq_poll; + atomic_t in_use; + enum queue_type qtype; +}; + +/** + * struct mpi3mr_intr_info - Interrupt cookie information + * + * @mrioc: Adapter instance reference + * @msix_index: MSIx index + * @op_reply_q: Associated operational reply queue + * @name: Dev name for the irq claiming device + */ +struct mpi3mr_intr_info { + struct mpi3mr_ioc *mrioc; + u16 msix_index; + struct op_reply_qinfo *op_reply_q; + char name[MPI3MR_NAME_LENGTH]; +}; + + +/** + * struct mpi3mr_throttle_group_info - Throttle group info + * + * @io_divert: Flag indicates io divert is on or off for the TG + * @needs_qd_reduction: Flag to indicate QD reduction is needed + * @qd_reduction: Queue Depth reduction in units of 10% + * @fw_qd: QueueDepth value reported by the firmware + * @modified_qd: Modified QueueDepth value due to throttling + * @id: Throttle Group ID. 
+ * @high: High limit to turn on throttling in 512 byte blocks + * @low: Low limit to turn off throttling in 512 byte blocks + * @pend_large_data_sz: Counter to track pending large data + */ +struct mpi3mr_throttle_group_info { + u8 io_divert; + u8 need_qd_reduction; + u8 qd_reduction; + u16 fw_qd; + u16 modified_qd; + u16 id; + u32 high; + u32 low; + atomic_t pend_large_data_sz; +}; + +/** + * struct mpi3mr_hba_port - HBA's port information + * @port_id: Port number + * @sas_address: SAS address of this port's attached device + * @phy_mask: HBA PHY's belonging to this port + * @flags: HBA port flags + */ +struct mpi3mr_hba_port { + struct list_head list; + u8 port_id; + u8 flags; +}; + +/** + * struct mpi3mr_sas_port - Internal SAS port information + * @port_list: List of ports belonging to a SAS node + * @num_phys: Number of phys associated with port + * @marked_responding: used while refresing the sas ports + * @lowest_phy: lowest phy ID of current sas port + * @phy_mask: phy_mask of current sas port + * @hba_port: HBA port entry + * @remote_identify: Attached device identification + * @rphy: SAS transport layer rphy object + * @port: SAS transport layer port object + * @phy_list: mpi3mr_sas_phy objects belonging to this port + */ +struct mpi3mr_sas_port { + struct list_head port_list; + u8 num_phys; + u8 marked_responding; + int lowest_phy; + u32 phy_mask; + struct mpi3mr_hba_port *hba_port; + struct sas_identify remote_identify; + struct sas_rphy *rphy; + struct sas_port *port; + struct list_head phy_list; +}; + +/** + * struct mpi3mr_sas_phy - Internal SAS Phy information + * @port_siblings: List of phys belonging to a port + * @identify: Phy identification + * @remote_identify: Attached device identification + * @phy: SAS transport layer Phy object + * @phy_id: Unique phy id within a port + * @handle: Firmware device handle for this phy + * @attached_handle: Firmware device handle for attached device + * @phy_belongs_to_port: Flag to indicate phy belongs to port + @hba_port: HBA port entry + */ +struct mpi3mr_sas_phy { + struct list_head port_siblings; + struct sas_identify identify; + struct sas_identify remote_identify; + struct sas_phy *phy; + u8 phy_id; + u16 handle; + u16 attached_handle; + u8 phy_belongs_to_port; + struct mpi3mr_hba_port *hba_port; +}; + +/** + * struct mpi3mr_sas_node - SAS host/expander information + * @list: List of sas nodes in a controller + * @parent_dev: Parent device class + * @num_phys: Number phys belonging to sas_node + * @sas_address: SAS address of sas_node + * @handle: Firmware device handle for this sas_host/expander + * @sas_address_parent: SAS address of parent expander or host + * @enclosure_handle: Firmware handle of enclosure of this node + * @device_info: Capabilities of this sas_host/expander + * @non_responding: used to refresh the expander devices during reset + * @host_node: Flag to indicate this is a host_node + * @hba_port: HBA port entry + * @phy: A list of phys that make up this sas_host/expander + * @sas_port_list: List of internal ports of this node + * @rphy: sas_rphy object of this expander node + */ +struct mpi3mr_sas_node { + struct list_head list; + struct device *parent_dev; + u8 num_phys; + u64 sas_address; + u16 handle; + u64 sas_address_parent; + u16 enclosure_handle; + u64 enclosure_logical_id; + u8 non_responding; + u8 host_node; + struct mpi3mr_hba_port *hba_port; + struct mpi3mr_sas_phy *phy; + struct list_head sas_port_list; + struct sas_rphy *rphy; +}; + +/** + * struct mpi3mr_enclosure_node - enclosure information 
+ * @list: List of enclosures + * @pg0: Enclosure page 0; + */ +struct mpi3mr_enclosure_node { + struct list_head list; + struct mpi3_enclosure_page0 pg0; +}; + +/** + * struct tgt_dev_sas_sata - SAS/SATA device specific + * information cached from firmware given data + * + * @sas_address: World wide unique SAS address + * @sas_address_parent: Sas address of parent expander or host + * @dev_info: Device information bits + * @phy_id: Phy identifier provided in device page 0 + * @phy_id: Attached phy identifier provided in device page 0 + * @sas_transport_attached: Is this device exposed to transport + * @pend_sas_rphy_add: Flag to check device is in process of add + * @hba_port: HBA port entry + * @rphy: SAS transport layer rphy object + */ +struct tgt_dev_sas_sata { + u64 sas_address; + u64 sas_address_parent; + u16 dev_info; + u8 phy_id; + u8 attached_phy_id; + u8 sas_transport_attached; + u8 pend_sas_rphy_add; + struct mpi3mr_hba_port *hba_port; + struct sas_rphy *rphy; +}; + +/** + * struct trigger_event_data - store trigger related + * information. + * + * @trace_hdb: Trace diag buffer descriptor reference + * @fw_hdb: FW diag buffer descriptor reference + * @trigger_type: Trigger type + * @trigger_specific_data: Trigger specific data + * @snapdump: Snapdump enable or disable flag + */ +struct trigger_event_data { + struct diag_buffer_desc *trace_hdb; + struct diag_buffer_desc *fw_hdb; + u8 trigger_type; + u64 trigger_specific_data; + bool snapdump; +}; + +/** + * struct tgt_dev_pcie - PCIe device specific information cached + * from firmware given data + * + * @mdts: Maximum data transfer size + * @capb: Device capabilities + * @pgsz: Device page size + * @abort_to: Timeout for abort TM + * @reset_to: Timeout for Target/LUN reset TM + * @dev_info: Device information bits + */ +struct tgt_dev_pcie { + u32 mdts; + u16 capb; + u8 pgsz; + u8 abort_to; + u8 reset_to; + u16 dev_info; +}; + +/** + * struct tgt_dev_vd - virtual device specific information + * cached from firmware given data + * + * @state: State of the VD + * @qd_reduction: Queue Depth reduction in units of 10% + * @tg_id: VDs throttle group ID + * @high: High limit to turn on throttling in 512 byte blocks + * @low: Low limit to turn off throttling in 512 byte blocks + * @tg: Pointer to throttle group info + */ +struct tgt_dev_vd { + u8 state; + u8 tg_qd_reduction; + u16 tg_id; + u32 tg_high; + u32 tg_low; + struct mpi3mr_throttle_group_info *tg; +}; + + +/** + * union _form_spec_inf - union of device specific information + */ +union _form_spec_inf { + struct tgt_dev_sas_sata sas_sata_inf; + struct tgt_dev_pcie pcie_inf; + struct tgt_dev_vd vd_inf; +}; + + +/** + * struct mpi3mr_tgt_dev - target device data structure + * + * @list: List pointer + * @starget: Scsi_target pointer + * @dev_handle: FW device handle + * @parent_handle: FW parent device handle + * @slot: Slot number + * @encl_handle: FW enclosure handle + * @perst_id: FW assigned Persistent ID + * @devpg0_flag: Device Page0 flag + * @dev_type: SAS/SATA/PCIE device type + * @is_hidden: Should be exposed to upper layers or not + * @host_exposed: Already exposed to host or not + * @io_unit_port: IO Unit port ID + * @non_stl: Is this device not to be attached with SAS TL + * @io_throttle_enabled: I/O throttling needed or not + * @q_depth: Device specific Queue Depth + * @wwid: World wide ID + * @enclosure_logical_id: Enclosure logical identifier + * @dev_spec: Device type specific information + * @ref_count: Reference count + */ +struct mpi3mr_tgt_dev { + struct 
list_head list; + struct scsi_target *starget; + u16 dev_handle; + u16 parent_handle; + u16 slot; + u16 encl_handle; + u16 perst_id; + u16 devpg0_flag; + u8 dev_type; + u8 is_hidden; + u8 host_exposed; + u8 io_unit_port; + u8 non_stl; + u8 io_throttle_enabled; + u16 q_depth; + u64 wwid; + u64 enclosure_logical_id; + union _form_spec_inf dev_spec; + struct kref ref_count; +}; + +/** + * mpi3mr_tgtdev_get - k reference incrementor + * @s: Target device reference + * + * Increment target device reference count. + */ +static inline void mpi3mr_tgtdev_get(struct mpi3mr_tgt_dev *s) +{ + kref_get(&s->ref_count); +} + +/** + * mpi3mr_free_tgtdev - target device memory dealloctor + * @r: k reference pointer of the target device + * + * Free target device memory when no reference. + */ +static inline void mpi3mr_free_tgtdev(struct kref *r) +{ + kfree(container_of(r, struct mpi3mr_tgt_dev, ref_count)); +} + +/** + * mpi3mr_tgtdev_put - k reference decrementor + * @s: Target device reference + * + * Decrement target device reference count. + */ +static inline void mpi3mr_tgtdev_put(struct mpi3mr_tgt_dev *s) +{ + kref_put(&s->ref_count, mpi3mr_free_tgtdev); +} + +/** + * struct mpi3mr_stgt_priv_data - SCSI target private structure + * + * @starget: Scsi_target pointer + * @dev_handle: FW device handle + * @perst_id: FW assigned Persistent ID + * @num_luns: Number of Logical Units + * @block_io: I/O blocked to the device or not + * @dev_removed: Device removed in the Firmware + * @dev_removedelay: Device is waiting to be removed in FW + * @dev_type: Device type + * @dev_nvme_dif: Device is NVMe DIF enabled + * @io_throttle_enabled: I/O throttling needed or not + * @io_divert: Flag indicates io divert is on or off for the dev + * @throttle_group: Pointer to throttle group info + * @tgt_dev: Internal target device pointer + * @pend_count: Counter to track pending I/Os during error + * handling + */ +struct mpi3mr_stgt_priv_data { + struct scsi_target *starget; + u16 dev_handle; + u16 perst_id; + u32 num_luns; + atomic_t block_io; + u8 dev_removed; + u8 dev_removedelay; + u8 dev_type; + u8 dev_nvme_dif; + u8 io_throttle_enabled; + u8 io_divert; + struct mpi3mr_throttle_group_info *throttle_group; + struct mpi3mr_tgt_dev *tgt_dev; + u32 pend_count; +}; + +/** + * struct mpi3mr_sdev_priv_data - SCSI device private structure + * + * @tgt_priv_data: Scsi_target private data pointer + * @lun_id: LUN ID of the device + * @ncq_prio_enable: NCQ priority enable for SATA device + * @pend_count: Counter to track pending I/Os during error + * handling + */ +struct mpi3mr_sdev_priv_data { + struct mpi3mr_stgt_priv_data *tgt_priv_data; + u32 lun_id; + u8 ncq_prio_enable; + u32 pend_count; +}; + +/** + * struct mpi3mr_drv_cmd - Internal command tracker + * + * @mutex: Command mutex + * @done: Completor for wakeup + * @reply: Firmware reply for internal commands + * @sensebuf: Sensebuf for SCSI IO commands + * @iou_rc: IO Unit control reason code + * @state: Command State + * @dev_handle: Firmware handle for device specific commands + * @ioc_status: IOC status from the firmware + * @ioc_loginfo:IOC log info from the firmware + * @is_waiting: Is the command issued in block mode + * @is_sense: Is Sense data present + * @retry_count: Retry count for retriable commands + * @host_tag: Host tag used by the command + * @callback: Callback for non blocking commands + */ +struct mpi3mr_drv_cmd { + struct mutex mutex; + struct completion done; + void *reply; + u8 *sensebuf; + u8 iou_rc; + u16 state; + u16 dev_handle; + u16 
ioc_status; + u32 ioc_loginfo; + u8 is_waiting; + u8 is_sense; + u8 retry_count; + u16 host_tag; + void (*callback)(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); +}; + + +/** + * struct chain_element - memory descriptor structure to store + * virtual and dma addresses for chain elements. + * + * @addr: virtual address + * @dma_addr: dma address + */ +struct chain_element { + void *addr; + dma_addr_t dma_addr; +}; + +/** + * struct scmd_priv - SCSI command private data + * + * @host_tag: Host tag specific to operational queue + * @in_lld_scope: Command in LLD scope or not + * @meta_sg_valid: DIX command with meta data SGL or not + * @scmd: SCSI Command pointer + * @req_q_idx: Operational request queue index + * @chain_idx: Chain frame index + * @meta_chain_idx: Chain frame index of meta data SGL + * @mpi3mr_scsiio_req: MPI SCSI IO request + */ +struct scmd_priv { + u16 host_tag; + u8 in_lld_scope; + u8 meta_sg_valid; + struct scsi_cmnd *scmd; + u16 req_q_idx; + int chain_idx; + int meta_chain_idx; + u8 mpi3mr_scsiio_req[MPI3MR_ADMIN_REQ_FRAME_SZ]; +}; + +/** + * struct diag_buffer_desc - memory descriptor structure to + * store virtual, dma addresses, size, buffer status for host + * diagnostic buffers. + * + * @type: Buffer type + * @trigger_data: Trigger data + * @trigger_type: Trigger type + * @status: Buffer status + * @size: Buffer size + * @addr: Virtual address + * @dma_addr: Buffer DMA address + */ +struct diag_buffer_desc { + u8 type; + u64 trigger_data; + u8 trigger_type; + u8 status; + u32 size; + void *addr; + dma_addr_t dma_addr; +}; + +/** + * struct dma_memory_desc - memory descriptor structure to store + * virtual address, dma address and size for any generic dma + * memory allocations in the driver. + * + * @size: Buffer size + * @addr: Virtual address + * @dma_addr: DMA address + */ +struct dma_memory_desc { + u32 size; + void *addr; + dma_addr_t dma_addr; +}; + + +/** + * struct mpi3mr_ioc - Adapter anchor structure stored in shost + * private data + * + * @list: List pointer + * @pdev: PCI device pointer + * @shost: Scsi_Host pointer + * @id: Controller ID + * @cpu_count: Number of online CPUs + * @dbgfs_adapter: Debugfs directory per controller + * @name: Controller ASCII name + * @driver_name: Driver ASCII name + * @sysif_regs: System interface registers virtual address + * @sysif_regs_phys: System interface registers physical address + * @bars: PCI BARS + * @dma_mask: DMA mask + * @msix_count: Number of MSIX vectors used + * @intr_enabled: Is interrupts enabled + * @num_admin_req: Number of admin requests + * @admin_req_q_sz: Admin request queue size + * @admin_req_pi: Admin request queue producer index + * @admin_req_ci: Admin request queue consumer index + * @admin_req_base: Admin request queue base virtual address + * @admin_req_dma: Admin request queue base dma address + * @admin_req_lock: Admin queue access lock + * @num_admin_replies: Number of admin replies + * @admin_reply_q_sz: Admin reply queue size + * @admin_reply_ci: Admin reply queue consumer index + * @admin_reply_ephase:Admin reply queue expected phase + * @admin_reply_base: Admin reply queue base virtual address + * @admin_reply_dma: Admin reply queue base dma address + * @admin_reply_q_in_use: Queue is handled by poll/ISR + * @ready_timeout: Controller ready timeout + * @intr_info: Interrupt cookie pointer + * @intr_info_count: Number of interrupt cookies + * @is_intr_info_set: Flag to indicate intr info is setup + * @num_queues: Number of operational queues + * @is_segqueue_enabled: 
Flag to indicate segmented q is enabled + * @num_op_req_q: Number of operational request queues + * @req_qinfo: Operational request queue info pointer + * @num_op_reply_q: Number of operational reply queues + * @op_reply_qinfo: Operational reply queue info pointer + * @init_cmds: Command tracker for initialization commands + * @cfg_cmds: Command tracker for configuration requests + * @facts: Cached IOC facts data + * @op_reply_desc_sz: Operational reply descriptor size + * @num_reply_bufs: Number of reply buffers allocated + * @reply_buf_pool: Reply buffer pool + * @reply_buf: Reply buffer base virtual address + * @reply_buf_dma: Reply buffer DMA address + * @reply_buf_dma_max_address: Reply DMA address max limit + * @reply_free_qsz: Reply free queue size + * @reply_sz: Cached Reply size reported in IOC facts + * @reply_free_q_pool: Reply free queue pool + * @reply_free_q: Reply free queue base virtual address + * @reply_free_q_dma: Reply free queue base DMA address + * @reply_free_queue_lock: Reply free queue lock + * @reply_free_queue_host_index: Reply free queue host index + * @num_sense_bufs: Number of sense buffers + * @sense_buf_pool: Sense buffer pool + * @sense_buf: Sense buffer base virtual address + * @sense_buf_dma: Sense buffer base DMA address + * @sense_buf_q_sz: Sense buffer queue size + * @sense_buf_q_pool: Sense buffer queue pool + * @sense_buf_q: Sense buffer queue virtual address + * @sense_buf_q_dma: Sense buffer queue DMA address + * @sbq_lock: Sense buffer queue lock + * @sbq_host_index: Sense buffer queuehost index + * @event_masks: Event mask bitmap + * @fwevt_worker_name: Firmware event worker thread name + * @fwevt_worker_thread: Firmware event worker thread + * @fwevt_lock: Firmware event lock + * @fwevt_list: Firmware event list + * @watchdog_work_q_name: Fault watchdog worker thread name + * @watchdog_work_q: Fault watchdog worker thread + * @watchdog_work: Fault watchdog work + * @watchdog_lock: Fault watchdog lock + * @is_driver_loading: Is driver still loading + * @scan_started: Async scan started + * @scan_failed: Asycn scan failed + * @stop_drv_processing: Stop all command processing + * @device_refresh_on: Don't process the events untill devices are refreshed + * @max_host_ios: Maximum host I/O count + * @tgtdev_lock: Target device list lock + * @tgtdev_list: Target device lock + * @chain_buf_count: Chain buffer count + * @chain_buf_pool: Chain buffer pool + * @chain_sgl_list: Chain SGL list + * @chain_bitmap_sz: Chain buffer allocator bitmap size + * @chain_bitmap: Chain buffer allocator bitmap + * @chain_buf_lock: Chain buffer list lock + * @bsg_cmds: Command tracker for BSG command + * @host_tm_cmds: Command tracker for task management commands + * @dev_rmhs_cmds: Command tracker for device removal commands + * @evtack_cmds: Command tracker for event ack commands + * @sysfs_tm_cmds: Command tracker for sysfs TM commands + * @devrem_bitmap_sz: Device removal bitmap size + * @devrem_bitmap: Device removal bitmap + * @dev_handle_bitmap_sz: Device handle bitmap size + * @removepend_bitmap: Remove pending bitmap + * @delayed_rmhs_list: Delayed device removal list + * @evtack_cmds_bitmap_sz: Event Ack bitmap size + * @evtack_cmds_bitmap: Event Ack bitmap + * @delayed_evtack_cmds_list: Delayed event acknowledgment list + * @ts_update_counter: Timestamp update counter + * @reset_in_progress: Reset in progress flag + * @unrecoverable: Controller unrecoverable flag + * @block_bsgs: Block BSG flag + * @prev_reset_result: Result of previous reset + * @reset_mutex: 
Controller reset mutex + * @reset_waitq: Controller reset wait queue + * @prepare_for_reset: Prepare for reset event received + * @prepare_for_reset_timeout_counter: Prepare for reset timeout + * @prp_list_virt: NVMe encapsulated PRP list virtual base + * @prp_list_dma: NVMe encapsulated PRP list DMA + * @prp_sz: NVME encapsulated PRP list size + * @diagsave_timeout: Diagnostic information save timeout + * @logging_level: Controller debug logging level + * @flush_io_count: I/O count to flush after reset + * @current_event: Firmware event currently in process + * @driver_info: Driver, Kernel, OS information to firmware + * @change_count: Topology change count + * @pel_enabled: Persistent Event Log(PEL) enabled or not + * @pel_abort_requested: PEL abort is requested or not + * @pel_class: PEL Class identifier + * @pel_locale: PEL Locale identifier + * @pel_cmds: Command tracker for PEL wait command + * @pel_abort_cmd: Command tracker for PEL abort command + * @pel_newest_seqnum: Newest PEL sequenece number + * @pel_seqnum_virt: PEL sequence number virtual address + * @pel_seqnum_dma: PEL sequence number DMA address + * @pel_seqnum_sz: PEL sequenece number size + * @op_reply_q_offset: Operational reply queue offset with MSIx + * @sysfs_tm_pending: Pending TMs issued through SysFS + * @sysfs_tm_issued: TMs issued through SysFS + * @sysfs_tm_terminated_io_count:I/Os terminated by SysFS TMs + * @sysfs_pending_tm_wq: SysFS TM pending work queue + * @diag_buffers: Host diagnostic buffers + * @reply_trigger_present: Reply trigger present flag + * @event_trigger_present: Event trigger present flag + * @scsisense_trigger_present: Scsi sense trigger present flag + * @snapdump_trigger_active: Snapdump trigger active flag + * @fw_release_trigger_active: Fw release trigger active flag + * @trace_release_trigger_active: Trace trigger active flag + * @driver_pg2: Driver page 2 pointer + * @dump: kmsg dumper interface for snapdump + * @drv_diag_buffer: Diagnostic host buffer virtual address + * @drv_diag_buffer_dma: Diagnostic host buffer DMA address + * @drv_diag_buffer_sz: Diagnostic host buffer size + * @default_qcount: Total Default queues + * @active_poll_qcount: Currently active poll queue count + * @requested_poll_qcount: User requested poll queue count + * @check_xprotect_nvme: Flag to check xprotect for nvme or not + * @skip_uefi_snapdump: Skip copying UEFI logs into snapdump + * @logdata_buf: Circular buffer to store log data entries + * @logdata_buf_idx: Index of entry in buffer to store + * @logdata_entry_sz: log data entry size + * @adm_req_q_bar_writeq_lock: Admin request queue lock + * @adm_reply_q_bar_writeq_lock: Admin reply queue lock + * @pend_ios: Pending IO Count + * @cfg_page: Default memory for configuration pages + * @cfg_page_dma: Configuration page DMA address + * @cfg_page_sz: Default configuration page memory size + * @sas_transport_enabled: SAS transport enabled or not + * @scsi_device_channel: Channel ID for SCSI devices + * @transport_cmds: Command tracker for SAS transport commands + * @sas_hba: SAS node for the controller + * @sas_expander_list: SAS node list of expanders + * @sas_node_lock: Lock to protect SAS node list + * @hba_port_table_list: List of HBA Ports + * @enclosure_list: List of Enclosure objects + * @pend_large_data_sz: Counter to track pending large data + * @io_throttle_data_length: I/O size to track in 512b blocks + * @io_throttle_high: I/O size to start throttle in 512b blocks + * @io_throttle_low: I/O size to stop throttle in 512b blocks + * 
@num_io_throttle_group: Maximum number of throttle groups + * @throttle_groups: Pointer to throttle group info structures + * @bsg_dev: BSG device structure + * @bsg_queue: Request queue for BSG device + */ +struct mpi3mr_ioc { + struct list_head list; + struct pci_dev *pdev; + struct Scsi_Host *shost; + u8 id; + int cpu_count; + +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_adapter; + struct dentry *dmesg_dump; + struct dentry *uefi_logs_dump; +#endif + + char name[MPI3MR_NAME_LENGTH]; + char driver_name[MPI3MR_NAME_LENGTH]; + + volatile struct mpi3_sysif_registers __iomem *sysif_regs; + resource_size_t sysif_regs_phys; + int bars; + u64 dma_mask; + + u16 msix_count; + u8 intr_enabled; + + u16 num_admin_req; + u32 admin_req_q_sz; + u16 admin_req_pi; + u16 admin_req_ci; + void *admin_req_base; + dma_addr_t admin_req_dma; + spinlock_t admin_req_lock; + + u16 num_admin_replies; + u32 admin_reply_q_sz; + u16 admin_reply_ci; + u8 admin_reply_ephase; + void *admin_reply_base; + dma_addr_t admin_reply_dma; + atomic_t admin_reply_q_in_use; + + u32 ready_timeout; + + struct mpi3mr_intr_info *intr_info; + u16 intr_info_count; + bool is_intr_info_set; + + u16 num_queues; + bool is_segqueue_enabled; + u16 num_op_req_q; + struct op_req_qinfo *req_qinfo; + + u16 num_op_reply_q; + struct op_reply_qinfo *op_reply_qinfo; + + struct mpi3mr_drv_cmd init_cmds; + struct mpi3mr_drv_cmd cfg_cmds; + struct mpi3mr_ioc_facts facts; + u16 op_reply_desc_sz; + + u32 num_reply_bufs; + struct dma_pool *reply_buf_pool; + u8 *reply_buf; + dma_addr_t reply_buf_dma; + dma_addr_t reply_buf_dma_max_address; + + u16 reply_free_qsz; + u16 reply_sz; + struct dma_pool *reply_free_q_pool; + __le64 *reply_free_q; + dma_addr_t reply_free_q_dma; + spinlock_t reply_free_queue_lock; + u32 reply_free_queue_host_index; + + u32 num_sense_bufs; + struct dma_pool *sense_buf_pool; + u8 *sense_buf; + dma_addr_t sense_buf_dma; + + u16 sense_buf_q_sz; + struct dma_pool *sense_buf_q_pool; + __le64 *sense_buf_q; + dma_addr_t sense_buf_q_dma; + spinlock_t sbq_lock; + u32 sbq_host_index; + + u32 event_masks[MPI3_EVENT_NOTIFY_EVENTMASK_WORDS]; + + char fwevt_worker_name[MPI3MR_NAME_LENGTH]; + struct workqueue_struct *fwevt_worker_thread; + spinlock_t fwevt_lock; + struct list_head fwevt_list; + + char watchdog_work_q_name[20]; + struct workqueue_struct *watchdog_work_q; + struct delayed_work watchdog_work; + spinlock_t watchdog_lock; + + u8 is_driver_loading; + u8 scan_started; + u16 scan_failed; + u8 stop_drv_processing; + u8 device_refresh_on; + + u16 max_host_ios; + + spinlock_t tgtdev_lock; + struct list_head tgtdev_list; + + u32 chain_buf_count; + struct dma_pool *chain_buf_pool; + struct chain_element *chain_sgl_list; + u16 chain_bitmap_sz; + void *chain_bitmap; + spinlock_t chain_buf_lock; + + struct mpi3mr_drv_cmd bsg_cmds; + struct mpi3mr_drv_cmd host_tm_cmds; + struct mpi3mr_drv_cmd dev_rmhs_cmds[MPI3MR_NUM_DEVRMCMD]; + struct mpi3mr_drv_cmd evtack_cmds[MPI3MR_NUM_EVTACKCMD]; + struct mpi3mr_drv_cmd sysfs_tm_cmds[MPI3MR_NUM_SYSFS_TM]; + + u16 devrem_bitmap_sz; + void *devrem_bitmap; + u16 dev_handle_bitmap_sz; + void *removepend_bitmap; + struct list_head delayed_rmhs_list; + u16 evtack_cmds_bitmap_sz; + void *evtack_cmds_bitmap; + struct list_head delayed_evtack_cmds_list; + + u32 ts_update_counter; + + u8 reset_in_progress; + u8 unrecoverable; + u8 block_bsgs; + int prev_reset_result; + struct mutex reset_mutex; + wait_queue_head_t reset_waitq; + + u8 prepare_for_reset; + u16 prepare_for_reset_timeout_counter; + + void *prp_list_virt; 
+ dma_addr_t prp_list_dma; + u32 prp_sz; + + u16 diagsave_timeout; + int logging_level; + u16 flush_io_count; + + struct mpi3mr_fwevt *current_event; + struct mpi3_driver_info_layout driver_info; + u16 change_count; + + u8 pel_enabled; + u8 pel_abort_requested; + u8 pel_class; + u16 pel_locale; + struct mpi3mr_drv_cmd pel_cmds; + struct mpi3mr_drv_cmd pel_abort_cmd; + + u32 pel_newest_seqnum; + void *pel_seqnum_virt; + dma_addr_t pel_seqnum_dma; + u32 pel_seqnum_sz; + u16 op_reply_q_offset; + + atomic_t sysfs_tm_pending; + u16 sysfs_tm_issued; + u16 sysfs_tm_terminated_io_count; + wait_queue_head_t sysfs_pending_tm_wq; + + struct diag_buffer_desc diag_buffers[MPI3MR_MAX_NUM_HDB]; + bool reply_trigger_present; + bool event_trigger_present; + bool scsisense_trigger_present; + bool snapdump_trigger_active; + bool fw_release_trigger_active; + bool trace_release_trigger_active; + struct mpi3_driver_page2 *driver_pg2; + spinlock_t trigger_lock; + + struct mpi3mr_kmsg_dumper dump; + void *drv_diag_buffer; + dma_addr_t drv_diag_buffer_dma; + u32 drv_diag_buffer_sz; + + void *uefi_logs; + u32 uefi_logs_sz; + + u16 default_qcount; + u16 active_poll_qcount; + u16 requested_poll_qcount; + + bool check_xprotect_nvme; + bool skip_uefi_snapdump; + + u8 *logdata_buf; + u16 logdata_buf_idx; + u16 logdata_entry_sz; + spinlock_t adm_req_q_bar_writeq_lock; + spinlock_t adm_reply_q_bar_writeq_lock; + +#if defined(IO_COUNTER_SUPPORT) + atomic_t pend_ios; +#endif + + void *cfg_page; + dma_addr_t cfg_page_dma; + u16 cfg_page_sz; + + u8 sas_transport_enabled; + u8 scsi_device_channel; + struct mpi3mr_drv_cmd transport_cmds; + struct mpi3mr_sas_node sas_hba; + struct list_head sas_expander_list; + spinlock_t sas_node_lock; + struct list_head hba_port_table_list; + struct list_head enclosure_list; + + atomic_t pend_large_data_sz; + u32 io_throttle_data_length; + u32 io_throttle_high; + u32 io_throttle_low; + u16 num_io_throttle_group; + struct mpi3mr_throttle_group_info *throttle_groups; + + struct device *bsg_dev; + struct request_queue *bsg_queue; +}; + +int mpi3mr_setup_resources(struct mpi3mr_ioc *mrioc); +void mpi3mr_cleanup_resources(struct mpi3mr_ioc *mrioc); +int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc); +int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume); +void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc); +int mpi3mr_issue_port_enable(struct mpi3mr_ioc *mrioc, u8 async); +int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req, + u16 admin_req_sz, u8 ignore_reset); +int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc, + struct op_req_qinfo *opreqq, u8 *req); +void mpi3mr_add_sg_single(void *paddr, u8 flags, u32 length, + dma_addr_t dma_addr); +void mpi3mr_build_zero_len_sge(void *paddr); +void *mpi3mr_get_sensebuf_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr); +void *mpi3mr_get_reply_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr); +void mpi3mr_repost_sense_buf(struct mpi3mr_ioc *mrioc, + u64 sense_buf_dma); +void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); +void mpi3mr_memset_buffers(struct mpi3mr_ioc *mrioc); +void mpi3mr_free_mem(struct mpi3mr_ioc *mrioc); + +void mpi3mr_os_handle_events(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply); +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_handle( + struct mpi3mr_ioc *mrioc, u16 handle); +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_perst_id( + struct mpi3mr_ioc *mrioc, u16 persist_id); +struct mpi3mr_tgt_dev 
*mpi3mr_get_tgtdev_from_tgtpriv( + struct mpi3mr_ioc *mrioc, struct mpi3mr_stgt_priv_data *tgt_priv); +void mpi3mr_process_op_reply_desc(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply_descriptor *reply_desc, u64 *reply_dma, u16 qidx); + +void mpi3mr_start_watchdog(struct mpi3mr_ioc *mrioc); +void mpi3mr_stop_watchdog(struct mpi3mr_ioc *mrioc); + +int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc, + u32 reset_reason, u8 snapdump); +int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, + u16 handle, uint lun, u16 htag, ulong timeout, + struct mpi3mr_drv_cmd *drv_cmd, + u8 *resp_code, struct scsi_cmnd *scmd); +int mpi3mr_diagfault_reset_handler(struct mpi3mr_ioc *mrioc, + u32 reset_reason); +void mpi3mr_ioc_disable_intr(struct mpi3mr_ioc *mrioc); +void mpi3mr_ioc_enable_intr(struct mpi3mr_ioc *mrioc); + +enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc); +int mpi3mr_process_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + u32 event_ctx); +int mpi3mr_pel_get_seqnum_post(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); + +void mpi3mr_wait_for_host_io(struct mpi3mr_ioc *mrioc, u32 timeout); +void mpi3mr_cleanup_fwevt_list(struct mpi3mr_ioc *mrioc); +void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc); +void mpi3mr_invalidate_devhandles(struct mpi3mr_ioc *mrioc); +void mpi3mr_refresh_tgtdevs(struct mpi3mr_ioc *mrioc); +void mpi3mr_flush_delayed_cmd_lists(struct mpi3mr_ioc *mrioc); + +void mpi3mr_bsg_init(struct mpi3mr_ioc *mrioc); +void mpi3mr_bsg_exit(struct mpi3mr_ioc *mrioc); +void mpi3mr_app_save_logdata(struct mpi3mr_ioc *mrioc, char *event_data, + u16 event_data_size); +int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc, + struct op_reply_qinfo *op_reply_q); + +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) +extern struct device_attribute *mpi3mr_host_attrs[]; +extern struct device_attribute *mpi3mr_dev_attrs[]; +#else +extern const struct attribute_group *mpi3mr_host_groups[]; +extern const struct attribute_group *mpi3mr_dev_groups[]; +#endif + +u8 mpi3mr_scsih_ncq_prio_supp(struct scsi_device *sdev); + +void mpi3mr_print_fault_info(struct mpi3mr_ioc *mrioc); + +int mpi3mr_post_diag_bufs(struct mpi3mr_ioc *mrioc); +void mpi3mr_release_diag_bufs(struct mpi3mr_ioc *mrioc, u8 skip_rel_action); +void mpi3mr_hdb_trigger_data_event(struct mpi3mr_ioc *mrioc, + struct trigger_event_data *event_data); +void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc); +int mpi3mr_refresh_trigger(struct mpi3mr_ioc *mrioc, u8 page_type); +void mpi3mr_master_trigger(struct mpi3mr_ioc *mrioc, u64 trigger_data); +void mpi3mr_scsisense_trigger(struct mpi3mr_ioc *mrioc, u8 senseky, u8 asc, + u8 ascq); +void mpi3mr_event_trigger(struct mpi3mr_ioc *mrioc, u8 event); +void mpi3mr_reply_trigger(struct mpi3mr_ioc *mrioc, u16 iocstatus, + u32 iocloginfo); +void mpi3mr_hdbstatuschg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply); +int mpi3mr_issue_diag_buf_release(struct mpi3mr_ioc *mrioc, + struct diag_buffer_desc *diag_buffer); +void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code); + +extern struct sas_function_template mpi3mr_transport_functions; +extern struct scsi_transport_template *mpi3mr_transport_template; + +int mpi3mr_cfg_get_dev_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_device_page0 *dev_pg0, u16 pg_sz, u32 form, u32 form_spec); +int mpi3mr_cfg_get_sas_phy_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page0 *phy_pg0, u16 pg_sz, u32 form, + u32 form_spec); +int 
mpi3mr_cfg_get_sas_phy_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page1 *phy_pg1, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_sas_exp_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page0 *exp_pg0, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_sas_exp_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page1 *exp_pg1, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_enclosure_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_enclosure_page0 *encl_pg0, u16 pg_sz, u32 form, + u32 form_spec); +int mpi3mr_cfg_get_sas_io_unit_pg0(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0, u16 pg_sz); +int mpi3mr_cfg_get_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz); +int mpi3mr_cfg_set_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz); +int mpi3mr_cfg_get_driver_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page1 *driver_pg1, u16 pg_sz); +int mpi3mr_cfg_get_driver_pg2(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page2 *driver_pg2, u16 pg_sz, u8 page_type); +void mpi3mr_remove_tgtdev_from_host(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev); +struct mpi3mr_enclosure_node *mpi3mr_enclosure_find_by_handle( + struct mpi3mr_ioc*mrioc, u16 handle); +u8 mpi3mr_is_expander_device(u16 device_info); +int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle); +void mpi3mr_expander_remove(struct mpi3mr_ioc *mrioc, u64 sas_address, + struct mpi3mr_hba_port *hba_port); +struct mpi3mr_sas_node *__mpi3mr_expander_find_by_handle(struct mpi3mr_ioc + *mrioc, u16 handle); +struct mpi3mr_hba_port * mpi3mr_get_hba_port_by_id(struct mpi3mr_ioc *mrioc, + u8 port_id, u8 skip_dirty_flag); +void mpi3mr_sas_host_refresh(struct mpi3mr_ioc *mrioc); +void mpi3mr_sas_host_add(struct mpi3mr_ioc *mrioc); +int mpi3mr_report_tgtdev_to_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev); +void mpi3mr_remove_tgtdev_from_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev); +struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_addr_and_rphy( + struct mpi3mr_ioc *mrioc, u64 sas_address, struct sas_rphy *rphy); +void mpi3mr_update_links(struct mpi3mr_ioc *mrioc, + u64 sas_address_parent, u16 handle, u8 phy_number, u8 link_rate, + struct mpi3mr_hba_port *hba_port); +void mpi3mr_refresh_sas_ports(struct mpi3mr_ioc *mrioc); +void mpi3mr_refresh_expanders(struct mpi3mr_ioc *mrioc); +void mpi3mr_add_event_wait_for_device_refresh(struct mpi3mr_ioc *mrioc); +void mpi3mr_free_enclosure_list(struct mpi3mr_ioc *mrioc); + +void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc); +void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc); +void mpi3mr_set_trigger_data_in_hdb(struct diag_buffer_desc *hdb, + u8 type, u64 data, bool force); +void mpi3mr_set_trigger_data_in_all_hdb(struct mpi3mr_ioc *mrioc, + u8 type, u64 data, bool force); +int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc); +inline void mpi3mr_print_discard_event_notice(struct mpi3mr_ioc *mrioc, + bool device_add); +#endif /*MPI3MR_H_INCLUDED*/ diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c new file mode 100644 index 0000000000000..c7c35b6c41384 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_app.c @@ -0,0 +1,3346 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * 
Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" +#include "mpi3mr_app.h" + +/* SysFS task management type definitions*/ +enum mpi3mr_sysfs_tm { + MPI3MR_SYSFS_TM_SOFT_RESET = 1, + MPI3MR_SYSFS_TM_ABORT_TASK = 3, + MPI3MR_SYSFS_TM_TARGET_RESET, + MPI3MR_SYSFS_TM_LUN_RESET, + MPI3MR_SYSFS_TM_ABORT_TASK_SET, + MPI3MR_SYSFS_TM_DIAG_FAULT_RESET, +}; + +#define MPI3MR_SYSFS_TM_TIMEOUT 120 /*in seconds*/ + +/* Encapsulated NVMe command definitions */ +#define MPI3MR_NVME_PRP_SIZE 8 /* PRP size */ +#define MPI3MR_NVME_CMD_PRP1_OFFSET 24 /* PRP1 offset in NVMe cmd */ +#define MPI3MR_NVME_CMD_PRP2_OFFSET 32 /* PRP2 offset in NVMe cmd */ +#define MPI3MR_NVME_CMD_SGL_OFFSET 24 /* SGL offset in NVMe cmd */ +#define MPI3MR_NVME_DATA_FORMAT_PRP 0 +#define MPI3MR_NVME_DATA_FORMAT_SGL1 1 +#define MPI3MR_NVME_DATA_FORMAT_SGL2 2 + +/** + * struct mpi3mr_nvme_pt_sge - Structure to store SGEs for NVMe + * Encapsulated commands. + * + * @base_addr: Physical address + * @length: SGE length + * @rsvd: Reserved + * @rsvd1: Reserved + * @sgl_type: sgl type + */ +struct mpi3mr_nvme_pt_sge { + u64 base_addr; + u32 length; + u16 rsvd; + u8 rsvd1; + u8 sgl_type; +}; + + +/** + * struct mpi3mr_buf_map - local structure to + * track kernel and user buffers associated with an BSG + * structure. + * + * @bsg_buf: BSG buffer virtual address + * @bsg_buf_len: BSG buffer length + * @kern_buf: Kernel buffer virtual address + * @kern_buf_len: Kernel buffer length + * @kern_buf_dma: Kernel buffer DMA address + * @data_dir: Data direction. + * @is_dma: Whether DMA transfer applies to the buffer type + */ +struct mpi3mr_buf_map { + void *bsg_buf; + u32 bsg_buf_len; + void *kern_buf; + u32 kern_buf_len; + dma_addr_t kern_buf_dma; + u8 data_dir; + bool is_dma; +}; + +/** + * mpi3mr_diag_buffer_for_type - returns buffer desc for type + * @mrioc: Adapter instance reference + * @buffer_type: Diagnostic buffer type + * + * Identifies matching diag descriptor from mrioc for given diag + * buffer type. + * + * Return: diag buffer descriptor on success, NULL on failures. 
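+ *
+ * Usage sketch (illustrative only):
+ *
+ *   trace_hdb = mpi3mr_diag_buffer_for_type(mrioc,
+ *       MPI3_DIAG_BUFFER_TYPE_TRACE);
+ *   if (trace_hdb && trace_hdb->addr)
+ *       ...the trace HDB is allocated and can be used...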
+ */ + +static inline struct diag_buffer_desc * +mpi3mr_diag_buffer_for_type(struct mpi3mr_ioc *mrioc, u8 buf_type) +{ + u8 i; + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + if (mrioc->diag_buffers[i].type == buf_type) + return &mrioc->diag_buffers[i]; + } + return NULL; +} + + /** + * mpi3mr_set_trigger_data_in_hdb - Updates HDB trigger type and + * trigger data + * + * @hdb: HDB pointer + * @type: Trigger type + * @data: Trigger data + * @force: Trigger overwrite flag + * + * Updates trigger type and trigger data based on parameter + * passed to this function + * + * Return: Nothing + */ +void mpi3mr_set_trigger_data_in_hdb(struct diag_buffer_desc *hdb, + u8 type, u64 data, bool force) +{ + if ((!force) && (hdb->trigger_type != MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN)) + return; + hdb->trigger_type = type; + hdb->trigger_data = data; +} + + /** + * mpi3mr_set_trigger_data_in_all_hdb - Updates HDB trigger type + * and trigger data for all HDB + * + * @type: Trigger type + * @data: Trigger data + * @force: Trigger overwrite flag + * + * Updates trigger type and trigger data based on parameter + * passed to this function + * + * Return: Nothing + */ +void mpi3mr_set_trigger_data_in_all_hdb(struct mpi3mr_ioc *mrioc, + u8 type, u64 data, bool force) +{ + struct diag_buffer_desc *hdb = NULL; + + hdb = mpi3mr_diag_buffer_for_type(mrioc, MPI3_DIAG_BUFFER_TYPE_TRACE); + if (hdb) + mpi3mr_set_trigger_data_in_hdb(hdb, type, data, force); + hdb = mpi3mr_diag_buffer_for_type(mrioc, MPI3_DIAG_BUFFER_TYPE_FW); + if (hdb) + mpi3mr_set_trigger_data_in_hdb(hdb, type, data, force); +} + + /** + * mpi3mr_hdbstatuschg_evt_th - HDB status change evt tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Modifies the status of the applicable diag buffer descriptors + * + * Return: Nothing + */ +void mpi3mr_hdbstatuschg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_diag_buffer_status_change *evtdata; + struct diag_buffer_desc *diag_buffer; + + evtdata = (struct mpi3_event_data_diag_buffer_status_change *) + event_reply->event_data; + + diag_buffer = mpi3mr_diag_buffer_for_type(mrioc, evtdata->type); + if (!diag_buffer) + return; + if ((diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED) && + (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED)) + return; + switch (evtdata->reason_code) { + case MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RELEASED: + { + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_RELEASED; + mpi3mr_set_trigger_data_in_hdb(diag_buffer, + MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED, 0, 0); + atomic64_inc(&event_counter); + break; + } + case MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_RESUMED: + { + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED; + break; + } + case MPI3_EVENT_DIAG_BUFFER_STATUS_CHANGE_RC_PAUSED: + { + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED; + break; + } + default: + dprint_event_th(mrioc, "%s: unknown reason_code(%d)\n", + __func__, evtdata->reason_code); + break; + } +} + +/** + * mpi3mr_alloc_diag_bufs - Allocate memory for diag buffers + * @mrioc: Adapter instance reference + * + * This functions checks whether the driver defined buffer sizes + * are greater than IOCFacts provided controller local buffer + * sizes and if the driver defined sizes are more then the + * driver allocates the specific buffer by reading driver page1 + * + * Return: Nothing. 
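+ *
+ * Note: the sizes read from driver page 1 are in KB and are scaled by
+ * 1024 below. If a coherent DMA allocation of the requested size
+ * fails, the size is reduced by the corresponding decrement size and
+ * retried until the allocation succeeds or the size drops below the
+ * minimum size.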
+ */ +void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc) +{ + struct diag_buffer_desc *diag_buffer; + struct mpi3_driver_page1 driver_pg1; + u32 trace_dec_size, trace_min_size, fw_dec_size, fw_min_size, + trace_size, fw_size; + u16 pg_sz = sizeof(driver_pg1); + int retval = 0; + + if (mrioc->diag_buffers[0].addr || mrioc->diag_buffers[1].addr) + return; + + retval = mpi3mr_cfg_get_driver_pg1(mrioc, &driver_pg1, pg_sz); + if (retval) { + ioc_warn(mrioc, "%s: driver page 1 read failed, allocating " + "default trace/fw diag buffer sizes\n", __func__); + trace_size = fw_size = MPI3MR_DEFAULT_HDB_MAX_SZ; + trace_dec_size = fw_dec_size = MPI3MR_DEFAULT_HDB_DEC_SZ; + trace_min_size = fw_min_size = MPI3MR_DEFAULT_HDB_MIN_SZ; + + } else { + trace_size = driver_pg1.host_diag_trace_max_size * 1024; + trace_dec_size = driver_pg1.host_diag_trace_decrement_size + * 1024; + trace_min_size = driver_pg1.host_diag_trace_min_size * 1024; + fw_size = driver_pg1.host_diag_fw_max_size * 1024; + fw_dec_size = driver_pg1.host_diag_fw_decrement_size * 1024; + fw_min_size = driver_pg1.host_diag_fw_min_size * 1024; + if ((trace_size == 0) && (fw_size == 0)) { + dprint_init(mrioc, "%s:Invalid buffer sizes read from " + "driver page1 tracesize = %dKB," + "fwsize = %dKB\n", + __func__, trace_size, fw_size); + return; + } + } + +retry_trace: + + diag_buffer = &mrioc->diag_buffers[0]; + diag_buffer->type = MPI3_DIAG_BUFFER_TYPE_TRACE; + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED; + if ((mrioc->facts.diag_trace_sz < trace_size) && (trace_size >= + trace_min_size)) { + diag_buffer->addr = dma_zalloc_coherent(&mrioc->pdev->dev, + trace_size, &diag_buffer->dma_addr, GFP_KERNEL); + if (diag_buffer->addr) { + dprint_init(mrioc, "%s: host diag trace memory " + "allocated = %dKB\n", __func__, trace_size / 1024); + diag_buffer->size = trace_size; + } else { + trace_size -= trace_dec_size; + goto retry_trace; + } + } + +retry_fw: + + diag_buffer = &mrioc->diag_buffers[1]; + + diag_buffer->type = MPI3_DIAG_BUFFER_TYPE_FW; + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED; + if ((mrioc->facts.diag_fw_sz < fw_size) && (fw_size >= fw_min_size)) { + diag_buffer->addr = dma_zalloc_coherent(&mrioc->pdev->dev, + fw_size, &diag_buffer->dma_addr, GFP_KERNEL); + if (diag_buffer->addr) { + dprint_init(mrioc, "%s: host diag fw memory " + "allocated = %dKB\n", __func__, fw_size / 1024); + diag_buffer->size = fw_size; + } else { + fw_size -= fw_dec_size; + goto retry_fw; + } + } +} + +/** + * mpi3mr_issue_diag_buf_post - Send diag buffer post req + * @mrioc: Adapter instance reference + * @diag_buffer: Diagnostic buffer descriptor + * + * Issue diagnostic buffer post MPI request through admin queue + * and wait for the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
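+ *
+ * The request is posted through the init_cmds tracker using host tag
+ * MPI3MR_HOSTTAG_INITCMDS; a timeout escalates through
+ * mpi3mr_check_rh_fault_ioc() and any failure restores the buffer
+ * status to its previous value.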
+ */ +static int mpi3mr_issue_diag_buf_post(struct mpi3mr_ioc *mrioc, + struct diag_buffer_desc *diag_buffer) +{ + struct mpi3_diag_buffer_post_request diag_buf_post_req; + u8 prev_status; + int retval = 0; + + memset(&diag_buf_post_req, 0, sizeof(diag_buf_post_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + dprint_bsg_err(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->init_cmds.mutex); + return -1; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_buf_post_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_buf_post_req.function = MPI3_FUNCTION_DIAG_BUFFER_POST; + diag_buf_post_req.type = diag_buffer->type; + diag_buf_post_req.address = le64_to_cpu(diag_buffer->dma_addr); + diag_buf_post_req.length = le32_to_cpu(diag_buffer->size); + + dprint_bsg_info(mrioc, "%s: posting diag buffer type %d\n", __func__, + diag_buffer->type); + prev_status = diag_buffer->status; + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED; + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_buf_post_req, + sizeof(diag_buf_post_req), 1); + if (retval) { + dprint_bsg_err(mrioc, "%s: admin request post failed\n", + __func__); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + mrioc->init_cmds.is_waiting = 0; + dprint_bsg_err(mrioc, "%s: command timedout\n", __func__); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "%s: command failed, buffer_type (%d) ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, diag_buffer->type, + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + dprint_bsg_info(mrioc, "%s: diag buffer type %d posted successfully\n", + __func__, diag_buffer->type); + +out_unlock: + if (retval) + diag_buffer->status = prev_status; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; +} + +/** + * mpi3mr_post_diag_bufs - Post diag buffers to the controller + * @mrioc: Adapter instance reference + * + * This function calls helper function to post both trace and + * firmware buffers to the controller. + * + * Return: None + */ +int mpi3mr_post_diag_bufs(struct mpi3mr_ioc *mrioc) +{ + u8 i; + struct diag_buffer_desc *diag_buffer; + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + if (!(diag_buffer->addr)) + continue; + if (mpi3mr_issue_diag_buf_post(mrioc, diag_buffer)) + return -1; + } + return 0; +} + +/** + * mpi3mr_issue_diag_buf_release - Send diag buffer release req + * @mrioc: Adapter instance reference + * @diag_buffer: Diagnostic buffer descriptor + * + * Issue diagnostic buffer manage MPI request with release + * action request through admin queue and wait for the + * completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
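+ *
+ * The release is skipped (and success returned) when the buffer is
+ * not currently in a posted state. If posting the admin request
+ * fails, the trigger data recorded in the HDB descriptor is reset to
+ * the unknown trigger type.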
+ */ +int mpi3mr_issue_diag_buf_release(struct mpi3mr_ioc *mrioc, + struct diag_buffer_desc *diag_buffer) +{ + struct mpi3_diag_buffer_manage_request diag_buf_manage_req; + int retval = 0; + + if ((diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED) && + (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED)) + return retval; + + memset(&diag_buf_manage_req, 0, sizeof(diag_buf_manage_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + dprint_reset(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->init_cmds.mutex); + return -1; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_buf_manage_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_buf_manage_req.function = MPI3_FUNCTION_DIAG_BUFFER_MANAGE; + diag_buf_manage_req.type = diag_buffer->type; + diag_buf_manage_req.action = MPI3_DIAG_BUFFER_ACTION_RELEASE; + + + dprint_reset(mrioc, "%s: releasing diag buffer type %d\n", __func__, + diag_buffer->type); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_buf_manage_req, + sizeof(diag_buf_manage_req), 1); + if (retval) { + dprint_reset(mrioc, "%s: admin request post failed\n", __func__); + mpi3mr_set_trigger_data_in_hdb(diag_buffer, + MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN, 0, 1); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + mrioc->init_cmds.is_waiting = 0; + dprint_reset(mrioc, "%s: command timedout\n", __func__); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_RELEASE_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_reset(mrioc, + "%s: command failed, buffer_type (%d) ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, diag_buffer->type, + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + dprint_reset(mrioc, "%s: diag buffer type %d released successfully\n", + __func__, diag_buffer->type); + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; +} + +/** + * mpi3mr_process_trigger - Generic HDB Trigger handler + * @mrioc: Adapter instance reference + * @trigger_type: Trigger type + * @trigger_data: Trigger data + * @trigger_flags: Trigger flags + * + * This function checks validity of HDB, triggers and based on + * trigger information, creates an event to be processed in the + * firmware event worker thread . 
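+ *
+ * Triggers are ignored while a snapdump trigger is active, when both
+ * the trace and firmware release triggers have already fired, when no
+ * HDB is allocated, when driver page 2 is unavailable or, for element
+ * triggers, when driver page 2 defines no trigger elements.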
+ * + * This function should be called with trigger spinlock held + * + * Return: Nothing + */ +static void mpi3mr_process_trigger(struct mpi3mr_ioc *mrioc, u8 trigger_type, + u64 trigger_data, u8 trigger_flags) +{ + struct trigger_event_data event_data; + struct diag_buffer_desc *trace_hdb = NULL; + struct diag_buffer_desc *fw_hdb = NULL; + u64 master_trigger; + + trace_hdb = mpi3mr_diag_buffer_for_type(mrioc, + MPI3_DIAG_BUFFER_TYPE_TRACE); + fw_hdb = mpi3mr_diag_buffer_for_type(mrioc, MPI3_DIAG_BUFFER_TYPE_FW); + + if (mrioc->snapdump_trigger_active || (mrioc->fw_release_trigger_active + && mrioc->trace_release_trigger_active) || + (!trace_hdb && !fw_hdb) || (!mrioc->driver_pg2) || + ((trigger_type == MPI3MR_HDB_TRIGGER_TYPE_ELEMENT) + && (!mrioc->driver_pg2->num_triggers))) + return; + memset(&event_data, 0, sizeof(event_data)); + event_data.trigger_type = trigger_type; + event_data.trigger_specific_data = trigger_data; + master_trigger = le64_to_cpu(mrioc->driver_pg2->master_trigger); + + if (master_trigger & MPI3_DRIVER2_MASTERTRIGGER_SNAPDUMP) { + event_data.snapdump = true; + event_data.trace_hdb = trace_hdb; + event_data.fw_hdb = fw_hdb; + mrioc->snapdump_trigger_active = true; + } else if (trigger_type == MPI3MR_HDB_TRIGGER_TYPE_MASTER) { + if ((trace_hdb) && (master_trigger & + MPI3_DRIVER2_MASTERTRIGGER_DIAG_TRACE_RELEASE) && + (!mrioc->trace_release_trigger_active)) { + event_data.trace_hdb = trace_hdb; + mrioc->trace_release_trigger_active = true; + } + if ((fw_hdb) && (master_trigger & + MPI3_DRIVER2_MASTERTRIGGER_DIAG_FW_RELEASE) && + (!mrioc->fw_release_trigger_active)) { + event_data.fw_hdb = fw_hdb; + mrioc->fw_release_trigger_active = true; + } + } else if (trigger_type == MPI3MR_HDB_TRIGGER_TYPE_ELEMENT) { + if ((trace_hdb) && (trigger_flags & + MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_TRACE_RELEASE) && + (!mrioc->trace_release_trigger_active)) { + event_data.trace_hdb = trace_hdb; + mrioc->trace_release_trigger_active = true; + } + if ((fw_hdb) && (trigger_flags & + MPI3_DRIVER2_TRIGGER_FLAGS_DIAG_FW_RELEASE) && + (!mrioc->fw_release_trigger_active)) { + event_data.fw_hdb = fw_hdb; + mrioc->fw_release_trigger_active = true; + } + } + + if (event_data.trace_hdb || event_data.fw_hdb) + mpi3mr_hdb_trigger_data_event(mrioc, &event_data); +} + +/** + * mpi3mr_master_trigger - Master HDB trigger handler + * @mrioc: Adapter instance reference + * @trigger_data: Trigger data + * + * This function checks whether the given master trigger is + * enabled in the driver page 2 and if so calls generic trigger + * handler to queue event for HDB release. 
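+ *
+ * The trigger_data is expected to carry an MPI3_DRIVER2_MASTERTRIGGER_*
+ * condition bit, which is matched against the master trigger mask
+ * cached from driver page 2 under the trigger spinlock.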
+ * + * Return: Nothing + */ +void mpi3mr_master_trigger(struct mpi3mr_ioc *mrioc, u64 trigger_data) +{ + u64 master_trigger; + unsigned long flags; + + spin_lock_irqsave(&mrioc->trigger_lock, flags); + master_trigger = le64_to_cpu(mrioc->driver_pg2->master_trigger); + if (master_trigger & trigger_data) + mpi3mr_process_trigger(mrioc, MPI3MR_HDB_TRIGGER_TYPE_MASTER, + trigger_data, 0); + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); +} +/** + * mpi3mr_scsisense_trigger - SCSI sense HDB trigger handler + * @mrioc: Adapter instance reference + * @sensekey: Sense Key + * @asc: Additional Sense Code + * @ascq: Additional Sense Code Qualifier + * + * This function compares SCSI sense trigger values with driver + * page 2 values and calls generic trigger handler to release + * HDBs if match found + * + * Return: Nothing + */ +void mpi3mr_scsisense_trigger(struct mpi3mr_ioc *mrioc, u8 sensekey, u8 asc, + u8 ascq) +{ + struct mpi3_driver2_trigger_scsi_sense *scsi_sense_trigger = NULL; + u64 i = 0; + unsigned long flags; + u8 num_triggers, trigger_flags; + + if (mrioc->scsisense_trigger_present) { + spin_lock_irqsave(&mrioc->trigger_lock, flags); + scsi_sense_trigger = (struct mpi3_driver2_trigger_scsi_sense *) + mrioc->driver_pg2->trigger; + num_triggers = mrioc->driver_pg2->num_triggers; + for (i = 0; i < num_triggers; i++, scsi_sense_trigger++) { + if (scsi_sense_trigger->type != + MPI3_DRIVER2_TRIGGER_TYPE_SCSI_SENSE) + continue; + if (!(scsi_sense_trigger->sense_key == + MPI3_DRIVER2_TRIGGER_SCSI_SENSE_SENSE_KEY_MATCH_ALL + || scsi_sense_trigger->sense_key == sensekey)) + continue; + if (!(scsi_sense_trigger->asc == + MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASC_MATCH_ALL || + scsi_sense_trigger->asc == asc)) + continue; + if (!(scsi_sense_trigger->ascq == + MPI3_DRIVER2_TRIGGER_SCSI_SENSE_ASCQ_MATCH_ALL || + scsi_sense_trigger->ascq == ascq)) + continue; + trigger_flags = scsi_sense_trigger->flags; + mpi3mr_process_trigger(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_ELEMENT, i, trigger_flags); + break; + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} +/** + * mpi3mr_event_trigger - MPI event HDB trigger handler + * @mrioc: Adapter instance reference + * @event: MPI Event + * + * This function compares event trigger values with driver page + * 2 values and calls generic trigger handler to release + * HDBs if match found. 
+ * + * Return: Nothing + */ +void mpi3mr_event_trigger(struct mpi3mr_ioc *mrioc, u8 event) +{ + struct mpi3_driver2_trigger_event *event_trigger = NULL; + u64 i = 0; + unsigned long flags; + u8 num_triggers, trigger_flags; + + if (mrioc->event_trigger_present) { + spin_lock_irqsave(&mrioc->trigger_lock, flags); + event_trigger = (struct mpi3_driver2_trigger_event *) + mrioc->driver_pg2->trigger; + num_triggers = mrioc->driver_pg2->num_triggers; + + for (i = 0; i < num_triggers; i++, event_trigger++) { + if (event_trigger->type != + MPI3_DRIVER2_TRIGGER_TYPE_EVENT) + continue; + if (event_trigger->event != event) + continue; + trigger_flags = event_trigger->flags; + mpi3mr_process_trigger(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_ELEMENT, i, trigger_flags); + break; + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} +/** + * mpi3mr_reply_trigger - MPI Reply HDB trigger handler + * @mrioc: Adapter instance reference + * @iocstatus: Masked value of IOC Status from MPI Reply + * @iocloginfo: IOC Log Info from MPI Reply + * + * This function compares IOC status and IOC log info trigger + * values with driver page 2 values and calls generic trigger + * handler to release HDBs if match found. + * + * Return: Nothing + */ +void mpi3mr_reply_trigger(struct mpi3mr_ioc *mrioc, u16 ioc_status, + u32 ioc_loginfo) +{ + struct mpi3_driver2_trigger_reply *reply_trigger = NULL; + u64 i = 0; + unsigned long flags; + u8 num_triggers, trigger_flags; + + if (mrioc->reply_trigger_present) { + spin_lock_irqsave(&mrioc->trigger_lock, flags); + reply_trigger = (struct mpi3_driver2_trigger_reply *) + mrioc->driver_pg2->trigger; + num_triggers = mrioc->driver_pg2->num_triggers; + for (i = 0; i < num_triggers; i++, reply_trigger++) { + if (reply_trigger->type != + MPI3_DRIVER2_TRIGGER_TYPE_REPLY) + continue; + if ((le16_to_cpu(reply_trigger->ioc_status) != + ioc_status) + && (le16_to_cpu(reply_trigger->ioc_status) != + MPI3_DRIVER2_TRIGGER_REPLY_IOCSTATUS_MATCH_ALL)) + continue; + if ((le32_to_cpu(reply_trigger->ioc_log_info) != + (le32_to_cpu(reply_trigger->ioc_log_info_mask) & + ioc_loginfo))) + continue; + trigger_flags = reply_trigger->flags; + mpi3mr_process_trigger(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_ELEMENT, i, trigger_flags); + break; + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} +/** + * mpi3mr_get_num_trigger - Gets number of HDB triggers + * @mrioc: Adapter instance reference + * @num_triggers: Number of triggers + * @page_action: Page action + * + * This function reads number of triggers by reading driver page + * 2 + * + * Return: 0 on success and proper error codes on failure + */ +static int mpi3mr_get_num_trigger(struct mpi3mr_ioc *mrioc, u8 *num_triggers, + u8 page_action) +{ + struct mpi3_driver_page2 drvr_page2; + int retval = 0; + + *num_triggers = 0; + + retval = mpi3mr_cfg_get_driver_pg2(mrioc, &drvr_page2, + sizeof(struct mpi3_driver_page2), page_action); + + if (retval) { + dprint_init(mrioc, "%s: driver page 2 read failed\n", __func__); + return retval; + } + *num_triggers = drvr_page2.num_triggers; + return retval; +} +/** + * mpi3mr_refresh_trigger - Handler for Refresh trigger BSG + * @mrioc: Adapter instance reference + * @page_action: Page action + * + * This function caches the driver page 2 in the driver's memory + * by reading driver page 2 from the controller for a given page + * type and updates the HDB trigger values + * + * Return: 0 on success and proper error codes on failure + */ +int mpi3mr_refresh_trigger(struct mpi3mr_ioc *mrioc, u8 page_action) +{ + 
u16 pg_sz = sizeof(struct mpi3_driver_page2); + struct mpi3_driver_page2 *drvr_page2 = NULL; + u8 trigger_type, num_triggers; + int retval; + int i = 0; + unsigned long flags; + + retval = mpi3mr_get_num_trigger(mrioc, &num_triggers, page_action); + + if (retval) + goto out; + + pg_sz = offsetof(struct mpi3_driver_page2, trigger) + + (num_triggers * sizeof(union mpi3_driver2_trigger_element)); + drvr_page2 = kzalloc(pg_sz, GFP_KERNEL); + if (!drvr_page2) { + retval = -ENOMEM; + goto out; + } + + retval = mpi3mr_cfg_get_driver_pg2(mrioc, drvr_page2, pg_sz, page_action); + if (retval) { + dprint_init(mrioc, "%s: driver page 2 read failed\n", __func__); + kfree(drvr_page2); + goto out; + } + spin_lock_irqsave(&mrioc->trigger_lock, flags); + kfree(mrioc->driver_pg2); + mrioc->driver_pg2 = drvr_page2; + mrioc->reply_trigger_present = false; + mrioc->event_trigger_present = false; + mrioc->scsisense_trigger_present = false; + + for (i = 0; (i < mrioc->driver_pg2->num_triggers); i++) { + trigger_type = mrioc->driver_pg2->trigger[i].event.type; + switch (trigger_type) { + case MPI3_DRIVER2_TRIGGER_TYPE_REPLY: + mrioc->reply_trigger_present = true; + break; + case MPI3_DRIVER2_TRIGGER_TYPE_EVENT: + mrioc->event_trigger_present = true; + break; + case MPI3_DRIVER2_TRIGGER_TYPE_SCSI_SENSE: + mrioc->scsisense_trigger_present = true; + break; + default: + break; + } + } + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); +out: + return retval; +} + +/** + * mpi3mr_release_diag_bufs - Release diag buffers + * @mrioc: Adapter instance reference + * @skip_rel_action: Skip release action and set buffer state + * + * This function calls helper function to release both trace and + * firmware buffers from the controller. + * + * Return: None + */ +void mpi3mr_release_diag_bufs(struct mpi3mr_ioc *mrioc, u8 skip_rel_action) +{ + u8 i; + struct diag_buffer_desc *diag_buffer; + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + if (!(diag_buffer->addr)) + continue; + if (diag_buffer->status == MPI3MR_HDB_BUFSTATUS_RELEASED) + continue; + if (!skip_rel_action) + mpi3mr_issue_diag_buf_release(mrioc, diag_buffer); + diag_buffer->status = MPI3MR_HDB_BUFSTATUS_RELEASED; + atomic64_inc(&event_counter); + } +} +/** + * mpi3mr_bsg_pel_abort - sends PEL abort request + * @mrioc: Adapter instance reference + * + * This function sends PEL abort request to the firmware through + * admin request queue. 
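+ *
+ * The abort targets the outstanding PEL wait request identified by
+ * MPI3MR_HOSTTAG_PEL_WAIT and is tracked through the dedicated
+ * pel_abort_cmd slot; a timeout escalates to a soft reset of the
+ * controller.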
+ * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_bsg_pel_abort(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_pel_req_action_abort pel_abort_req; + struct mpi3_pel_reply *pel_reply; + int retval = 0; + u16 pe_log_status; + + if (mrioc->reset_in_progress) { + dprint_bsg_err(mrioc, "%s: reset in progress\n", __func__); + return -1; + } + if (mrioc->block_bsgs) { + dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__); + return -1; + } + + memset(&pel_abort_req, 0, sizeof(pel_abort_req)); + mutex_lock(&mrioc->pel_abort_cmd.mutex); + if (mrioc->pel_abort_cmd.state & MPI3MR_CMD_PENDING) { + dprint_bsg_err(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->pel_abort_cmd.mutex); + return -1; + } + mrioc->pel_abort_cmd.state = MPI3MR_CMD_PENDING; + mrioc->pel_abort_cmd.is_waiting = 1; + mrioc->pel_abort_cmd.callback = NULL; + pel_abort_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_ABORT); + pel_abort_req.function = MPI3_FUNCTION_PERSISTENT_EVENT_LOG; + pel_abort_req.action = MPI3_PEL_ACTION_ABORT; + pel_abort_req.abort_host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_WAIT); + + mrioc->pel_abort_requested = 1; + init_completion(&mrioc->pel_abort_cmd.done); + retval = mpi3mr_admin_request_post(mrioc, &pel_abort_req, + sizeof(pel_abort_req), 0); + if (retval) { + retval = -1; + dprint_bsg_err(mrioc, "%s: admin request post failed\n", + __func__); + mrioc->pel_abort_requested = 0; + goto out_unlock; + } + + wait_for_completion_timeout(&mrioc->pel_abort_cmd.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->pel_abort_cmd.state & MPI3MR_CMD_COMPLETE)) { + mrioc->pel_abort_cmd.is_waiting = 0; + dprint_bsg_err(mrioc, "%s: command timedout\n", __func__); + if (!(mrioc->pel_abort_cmd.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_PELABORT_TIMEOUT, 1); + retval = -1; + goto out_unlock; + } + if ((mrioc->pel_abort_cmd.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "%s: command failed, ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, (mrioc->pel_abort_cmd.ioc_status & + MPI3_IOCSTATUS_STATUS_MASK), + mrioc->pel_abort_cmd.ioc_loginfo); + retval = -1; + goto out_unlock; + } + if (mrioc->pel_abort_cmd.state & MPI3MR_CMD_REPLY_VALID) { + pel_reply = (struct mpi3_pel_reply *)mrioc->pel_abort_cmd.reply; + pe_log_status = le16_to_cpu(pel_reply->pe_log_status); + if (pe_log_status != MPI3_PEL_STATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "%s: command failed, pel_status(0x%04x)\n", + __func__, pe_log_status); + retval = -1; + } + } + +out_unlock: + mrioc->pel_abort_cmd.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->pel_abort_cmd.mutex); + return retval; +} +/** + * mpi3mr_bsg_verify_adapter - verify adapter number is valid + * @ioc_number: Adapter number + * @mriocpp: Pointer to hold per adapter instance + * + * This function checks whether given adapter number matches + * with an adapter id in the driver's list and if so fills + * pointer to the per adapter instance in mriocpp else set that + * to NULL. + * + * Return: Nothing. 
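+ *
+ * Usage sketch (illustrative only):
+ *
+ *   struct mpi3mr_ioc *mrioc = NULL;
+ *
+ *   mpi3mr_bsg_verify_adapter(ioc_number, &mrioc);
+ *   if (!mrioc)
+ *       return -ENODEV;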
+ */ +static void mpi3mr_bsg_verify_adapter(int ioc_number, + struct mpi3mr_ioc **mriocpp) +{ + struct mpi3mr_ioc *mrioc; + + spin_lock(&mrioc_list_lock); + list_for_each_entry(mrioc, &mrioc_list, list) { + if (mrioc->id != ioc_number) + continue; + spin_unlock(&mrioc_list_lock); + *mriocpp = mrioc; + return; + } + spin_unlock(&mrioc_list_lock); + *mriocpp = NULL; +} + + +/** + * mpi3mr_bsg_refresh_hdb_triggers - Refresh HDB trigger data + * @mrioc: Adapter instance reference + * @job: BSG Job pointer + * + * This function reads the controller trigger config page as + * defined by the input page type and refreshes the driver's + * local trigger information structures with the controller's + * config page data. + * + * Return: 0 on success and proper error codes on failure + */ +static long +mpi3mr_bsg_refresh_hdb_triggers(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + struct mpi3mr_bsg_out_refresh_hdb_triggers refresh_triggers; + uint32_t data_out_sz; + u8 page_action; + long rval = -EINVAL; + + data_out_sz = job->request_payload.payload_len; + + if (data_out_sz != sizeof(refresh_triggers)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return rval; + } + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &refresh_triggers, sizeof(refresh_triggers)); + + switch (refresh_triggers.page_type) { + case MPI3MR_HDB_REFRESH_TYPE_CURRENT: + page_action = MPI3_CONFIG_ACTION_READ_CURRENT; + break; + case MPI3MR_HDB_REFRESH_TYPE_DEFAULT: + page_action = MPI3_CONFIG_ACTION_READ_DEFAULT; + break; + case MPI3MR_HDB_HDB_REFRESH_TYPE_PERSISTENT: + page_action = MPI3_CONFIG_ACTION_READ_PERSISTENT; + break; + default: + dprint_bsg_err(mrioc, + "%s: unsupported refresh trigger, page_type %d\n", + __func__, refresh_triggers.page_type); + return rval; + } + rval = mpi3mr_refresh_trigger(mrioc, page_action); + + return rval; +} + +/** + * mpi3mr_bsg_upload_hdb - Upload a specific HDB to user space + * @mrioc: Adapter instance reference + * @job: BSG Job pointer + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_upload_hdb(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + struct mpi3mr_bsg_out_upload_hdb upload_hdb; + struct diag_buffer_desc *diag_buffer; + uint32_t data_out_size; + uint32_t data_in_size; + + data_out_size = job->request_payload.payload_len; + data_in_size = job->reply_payload.payload_len; + + if (data_out_size != sizeof(upload_hdb)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return -EINVAL; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &upload_hdb, sizeof(upload_hdb)); + + if ((!upload_hdb.length) || (data_in_size != upload_hdb.length)) { + dprint_bsg_err(mrioc, "%s: invalid length argument\n", + __func__); + return -EINVAL; + } + diag_buffer = mpi3mr_diag_buffer_for_type(mrioc, upload_hdb.buf_type); + if ((!diag_buffer) || (!diag_buffer->addr)) { + dprint_bsg_err(mrioc, "%s: invalid buffer type %d\n", + __func__, upload_hdb.buf_type); + return -EINVAL; + } + + if ((diag_buffer->status != MPI3MR_HDB_BUFSTATUS_RELEASED) && + (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED)) { + dprint_bsg_err(mrioc, + "%s: invalid buffer status %d for type %d\n", + __func__, diag_buffer->status, upload_hdb.buf_type); + return -EINVAL; + } + + if ((upload_hdb.start_offset + upload_hdb.length) > diag_buffer->size) { + dprint_bsg_err(mrioc, + "%s: invalid start offset %d, length %d for type %d\n", + __func__, 
upload_hdb.start_offset, upload_hdb.length, + upload_hdb.buf_type); + return -EINVAL; + } + if (job->reply_payload.payload_len >= upload_hdb.length) { + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, + (diag_buffer->addr + upload_hdb.start_offset), upload_hdb.length); + return 0; + } + return -EFAULT; +} + + +/** + * mpi3mr_bsg_repost_hdb - Re-post HDB + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * + * This function retrieves the HDB descriptor corresponding to a + * given buffer type and if the HDB is in released status then + * posts the HDB with the firmware. + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_repost_hdb(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + struct mpi3mr_bsg_out_repost_hdb repost_hdb; + struct diag_buffer_desc *diag_buffer; + uint32_t data_out_sz; + + data_out_sz = job->request_payload.payload_len; + + if (data_out_sz != sizeof(repost_hdb)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return -EINVAL; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &repost_hdb, sizeof(repost_hdb)); + + diag_buffer = mpi3mr_diag_buffer_for_type(mrioc, repost_hdb.buf_type); + if ((!diag_buffer) || (!diag_buffer->addr)) { + dprint_bsg_err(mrioc, "%s: invalid buffer type %d\n", + __func__, repost_hdb.buf_type); + return -EINVAL; + } + + if (diag_buffer->status != MPI3MR_HDB_BUFSTATUS_RELEASED) { + dprint_bsg_err(mrioc, + "%s: invalid buffer status %d for type %d\n", + __func__, diag_buffer->status, repost_hdb.buf_type); + return -EINVAL; + } + + if (mpi3mr_issue_diag_buf_post(mrioc, diag_buffer)) { + dprint_bsg_err(mrioc, "%s: post failed for type %d\n", + __func__, repost_hdb.buf_type); + return -EFAULT; + } + mpi3mr_set_trigger_data_in_hdb(diag_buffer, + MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN, 0, 1); + + return 0; +} + +/** + * mpi3mr_bsg_query_hdb - Handler for query HDB command + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * + * This function prepares and copies the host diagnostic buffer + * entries to the user buffer. 
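+ *
+ * The reply is a struct mpi3mr_bsg_in_hdb_status carrying the number
+ * of HDB types followed by one struct mpi3mr_hdb_entry per buffer,
+ * with each entry's size reported in KB.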
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_query_hdb(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = 0; + struct mpi3mr_bsg_in_hdb_status *hbd_status; + struct mpi3mr_hdb_entry *hbd_status_entry; + u32 length, min_length; + u8 i; + struct diag_buffer_desc *diag_buffer; + uint32_t data_in_sz = 0; + + data_in_sz = job->request_payload.payload_len; + + length = (sizeof(*hbd_status) + ((MPI3MR_MAX_NUM_HDB - 1) * + sizeof(*hbd_status_entry))); + hbd_status = kmalloc(length, GFP_KERNEL); + if (!hbd_status) + return -ENOMEM; + hbd_status_entry = &hbd_status->entry[0]; + + hbd_status->num_hdb_types = MPI3MR_MAX_NUM_HDB; + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + hbd_status_entry->buf_type = diag_buffer->type; + hbd_status_entry->status = diag_buffer->status; + hbd_status_entry->trigger_type = diag_buffer->trigger_type; + hbd_status_entry->trigger_data = diag_buffer->trigger_data; + hbd_status_entry->size = (diag_buffer->size / 1024); + hbd_status_entry++; + } + + if (data_in_sz < 4) { + dprint_bsg_err(mrioc, "%s: invalid size passed\n", __func__); + rval = -EINVAL; + goto out; + } + min_length = min(data_in_sz, length); + if (job->request_payload.payload_len >= min_length) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + hbd_status, min_length); + rval = 0; + } +out: + kfree(hbd_status); + return rval; +} +/** + * mpi3mr_enable_logdata - Handler for log data enable + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function enables log data caching in the driver if not + * already enabled and return the maximum number of log data + * entries that can be cached in the driver. + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_enable_logdata(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_logdata_enable logdata_enable; + + if (mrioc->logdata_buf) + goto copy_user_data; + + mrioc->logdata_entry_sz = + (mrioc->reply_sz - (sizeof(struct mpi3_event_notification_reply) - 4)) + + MPI3MR_BSG_LOGDATA_ENTRY_HEADER_SZ; + mrioc->logdata_buf_idx = 0; + + mrioc->logdata_buf = kcalloc(MPI3MR_BSG_LOGDATA_MAX_ENTRIES, + mrioc->logdata_entry_sz, GFP_KERNEL); + if (!mrioc->logdata_buf) + return -ENOMEM; + +copy_user_data: + memset(&logdata_enable, 0, sizeof(logdata_enable)); + logdata_enable.max_entries = + MPI3MR_BSG_LOGDATA_MAX_ENTRIES; + if (job->request_payload.payload_len >= sizeof(logdata_enable)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &logdata_enable, sizeof(logdata_enable)); + rval = 0; + } + return rval; +} +/** + * mpi3mr_get_logdata - Handler for get log data + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * This function copies the log data entries to the user buffer + * when log caching is enabled in the driver. 
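+ *
+ * The number of entries copied is the user payload length divided by
+ * logdata_entry_sz, capped at MPI3MR_BSG_LOGDATA_MAX_ENTRIES; the
+ * request fails if log data caching was never enabled or the payload
+ * cannot hold even a single entry.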
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_get_logdata(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + u16 num_entries, sz, entry_sz = mrioc->logdata_entry_sz; + + if ((!mrioc->logdata_buf) || (job->request_payload.payload_len < entry_sz)) + return -EINVAL; + + num_entries = job->request_payload.payload_len / entry_sz; + if (num_entries > MPI3MR_BSG_LOGDATA_MAX_ENTRIES) + num_entries = MPI3MR_BSG_LOGDATA_MAX_ENTRIES; + sz = num_entries * entry_sz; + + if (job->request_payload.payload_len >= sz) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + mrioc->logdata_buf, sz); + return 0; + } + return -EINVAL; +} + +/** + * mpi3mr_bsg_pel_enable - Handler for PEL enable driver + * @mrioc: Adapter instance reference + * @job: BSG job pointer + * + * This function is the handler for PEL enable driver. + * Validates the application given class and locale and if + * requires aborts the existing PEL wait request and/or issues + * new PEL wait request to the firmware and returns. + * + * Return: 0 on success and proper error codes on failure. + */ +static long mpi3mr_bsg_pel_enable(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_bsg_out_pel_enable pel_enable; + u8 issue_pel_wait; + u8 tmp_class; + u16 tmp_locale; + + if (job->request_payload.payload_len != sizeof(pel_enable)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return rval; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &pel_enable, sizeof(pel_enable)); + + if (pel_enable.pel_class > MPI3_PEL_CLASS_FAULT) { + dprint_bsg_err(mrioc, "%s: out of range class %d sent\n", + __func__, pel_enable.pel_class); + rval = 0; + goto out; + } + if (!mrioc->pel_enabled) + issue_pel_wait = 1; + else { + if ((mrioc->pel_class <= pel_enable.pel_class) && + !((mrioc->pel_locale & pel_enable.pel_locale) ^ + pel_enable.pel_locale)) { + issue_pel_wait = 0; + rval = 0; + } else { + pel_enable.pel_locale |= mrioc->pel_locale; + + if (mrioc->pel_class < pel_enable.pel_class) + pel_enable.pel_class = mrioc->pel_class; + + rval = mpi3mr_bsg_pel_abort(mrioc); + if (rval) { + dprint_bsg_err(mrioc, + "%s: pel_abort failed, status(%ld)\n", + __func__, rval); + goto out; + } + issue_pel_wait = 1; + } + } + if (issue_pel_wait) { + tmp_class = mrioc->pel_class; + tmp_locale = mrioc->pel_locale; + mrioc->pel_class = pel_enable.pel_class; + mrioc->pel_locale = pel_enable.pel_locale; + mrioc->pel_enabled = 1; + rval = mpi3mr_pel_get_seqnum_post(mrioc, NULL); + if (rval) { + mrioc->pel_class = tmp_class; + mrioc->pel_locale = tmp_locale; + mrioc->pel_enabled = 0; + dprint_bsg_err(mrioc, + "%s: pel get sequence number failed, status(%ld)\n", + __func__, rval); + } + } + +out: + return rval; +} + +/** + * mpi3mr_get_all_tgt_info - Get all target information + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function copies the driver managed target devices device + * handle, persistent ID, bus ID and taret ID to the user + * provided buffer for the specific controller. This function + * also provides the number of devices managed by the driver for + * the specific controller. 
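+ *
+ * If the user payload is exactly sizeof(u32), or no devices are
+ * present, only the device count is returned; otherwise the count is
+ * followed by an array of struct mpi3mr_device_map_info entries, one
+ * per driver managed target device.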
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_get_all_tgt_info(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + u16 num_devices = 0, i = 0, size; + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_device_map_info *devmap_info = NULL; + struct mpi3mr_all_tgt_info *alltgt_info = NULL; + uint32_t min_entrylen = 0, kern_entrylen = 0, usr_entrylen = 0; + + if (job->request_payload.payload_len < sizeof(u32)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + return rval; + } + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + num_devices++; + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + if ((job->request_payload.payload_len == sizeof(u32)) || + list_empty(&mrioc->tgtdev_list)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &num_devices, sizeof(num_devices)); + return 0; + } + + kern_entrylen = (num_devices - 1) * sizeof(*devmap_info); + size = sizeof(*alltgt_info) + kern_entrylen; + alltgt_info = kzalloc(size, GFP_KERNEL); + if (!alltgt_info) + return -ENOMEM; + + devmap_info = alltgt_info->dmi; + memset((u8 *)devmap_info, 0xFF, (kern_entrylen + sizeof(*devmap_info))); + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) { + if (i < num_devices) { + devmap_info[i].handle = tgtdev->dev_handle; + devmap_info[i].perst_id = tgtdev->perst_id; + if (tgtdev->host_exposed && tgtdev->starget) { + devmap_info[i].target_id = tgtdev->starget->id; + devmap_info[i].bus_id = + tgtdev->starget->channel; + } + i++; + } + } + num_devices = i; + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + memcpy(&alltgt_info->num_devices, &num_devices, sizeof(num_devices)); + + usr_entrylen = (job->request_payload.payload_len - sizeof(u32)) / sizeof(*devmap_info); + usr_entrylen *= sizeof(*devmap_info); + min_entrylen = min(usr_entrylen, kern_entrylen); + if (min_entrylen && (!memcpy(&alltgt_info->dmi, devmap_info, min_entrylen))) { + dprint_bsg_err(mrioc, "%s:%d: device map info copy failed\n", + __func__, __LINE__); + rval = -EFAULT; + goto out; + } + + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + alltgt_info, job->request_payload.payload_len); + rval = 0; +out: + kfree(alltgt_info); + return rval; +} +/** + * mpi3mr_get_change_count - Get topology change count + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function copies the toplogy change count provided by the + * driver in events and cached in the driver to the user + * provided buffer for the specific controller. 
+ * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_get_change_count(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_change_count chgcnt; + + memset(&chgcnt, 0, sizeof(chgcnt)); + chgcnt.change_count = mrioc->change_count; + if (job->request_payload.payload_len >= sizeof(chgcnt)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &chgcnt, sizeof(chgcnt)); + rval = 0; + } + return rval; +} + +/** + * mpi3mr_bsg_adp_reset - Issue controller reset + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function identifies the user provided reset type and + * issues approporiate reset to the controller and wait for that + * to complete and reinitialize the controller and then returns + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_adp_reset(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + long rval = -EINVAL; + u8 save_snapdump; + struct mpi3mr_bsg_adp_reset adpreset; + + if (job->request_payload.payload_len != + sizeof(adpreset)) { + dprint_bsg_err(mrioc, "%s: invalid size argument\n", + __func__); + goto out; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &adpreset, sizeof(adpreset)); + + switch (adpreset.reset_type) { + case MPI3MR_BSG_ADPRESET_SOFT: + save_snapdump = 0; + break; + case MPI3MR_BSG_ADPRESET_DIAG_FAULT: + save_snapdump = 1; + break; + default: + dprint_bsg_err(mrioc, "%s: unknown reset_type(%d)\n", + __func__, adpreset.reset_type); + goto out; + } + + rval = mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_APP, + save_snapdump); + + if (rval) + dprint_bsg_err(mrioc, + "%s: reset handler returned error(%ld) for reset type %d\n", + __func__, rval, adpreset.reset_type); +out: + return rval; +} + +/** + * mpi3mr_bsg_populate_adpinfo - Get adapter info command handler + * @mrioc: Adapter instance reference + * @job: BSG job reference + * + * This function provides adapter information for the given + * controller + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_populate_adpinfo(struct mpi3mr_ioc *mrioc, + struct bsg_job *job) +{ + enum mpi3mr_iocstate ioc_state; + struct mpi3mr_bsg_in_adpinfo adpinfo; + + memset(&adpinfo, 0, sizeof(adpinfo)); + adpinfo.adp_type = MPI3MR_BSG_ADPTYPE_AVGFAMILY; + adpinfo.pci_dev_id = mrioc->pdev->device; + adpinfo.pci_dev_hw_rev = mrioc->pdev->revision; + adpinfo.pci_subsys_dev_id = mrioc->pdev->subsystem_device; + adpinfo.pci_subsys_ven_id = mrioc->pdev->subsystem_vendor; + adpinfo.pci_bus = mrioc->pdev->bus->number; + adpinfo.pci_dev = PCI_SLOT(mrioc->pdev->devfn); + adpinfo.pci_func = PCI_FUNC(mrioc->pdev->devfn); + adpinfo.pci_seg_id = pci_domain_nr(mrioc->pdev->bus); + adpinfo.app_intfc_ver = MPI3MR_IOCTL_VERSION; + + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state == MRIOC_STATE_UNRECOVERABLE) + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE; + else if ((mrioc->reset_in_progress) || (mrioc->block_bsgs)) + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET; + else if (ioc_state == MRIOC_STATE_FAULT) + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_FAULT; + else + adpinfo.adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL; + + memcpy((u8 *)&adpinfo.driver_info, (u8 *)&mrioc->driver_info, + sizeof(adpinfo.driver_info)); + + if (job->request_payload.payload_len >= sizeof(adpinfo)) { + sg_copy_from_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + &adpinfo, 
sizeof(adpinfo)); + return 0; + } + return -EINVAL; +} + +/** + * mpi3mr_bsg_process_drv_cmds - Driver Command handler + * @job: BSG job reference + * + * This function is the top level handler for driver commands, + * this does basic validation of the buffer and identifies the + * opcode and switches to correct sub handler. + * + * Return: 0 on success and proper error codes on failure + */ +static long mpi3mr_bsg_process_drv_cmds(struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_ioc *mrioc = NULL; + struct mpi3mr_bsg_packet *bsg_req = NULL; + struct mpi3mr_bsg_drv_cmd *drvrcmd = NULL; + + bsg_req = job->request; + drvrcmd = &bsg_req->cmd.drvrcmd; + + mpi3mr_bsg_verify_adapter(drvrcmd->mrioc_id, &mrioc); + if (!mrioc) + return -ENODEV; + + if (drvrcmd->opcode == MPI3MR_DRVBSG_OPCODE_ADPINFO) { + rval = mpi3mr_bsg_populate_adpinfo(mrioc, job); + return rval; + } + + if (mutex_lock_interruptible(&mrioc->bsg_cmds.mutex)) + return -ERESTARTSYS; + + switch (drvrcmd->opcode) { + case MPI3MR_DRVBSG_OPCODE_ADPRESET: + rval = mpi3mr_bsg_adp_reset(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_ALLTGTDEVINFO: + rval = mpi3mr_get_all_tgt_info(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_GETCHGCNT: + rval = mpi3mr_get_change_count(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_LOGDATAENABLE: + rval = mpi3mr_enable_logdata(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_GETLOGDATA: + rval = mpi3mr_get_logdata(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_PELENABLE: + rval = mpi3mr_bsg_pel_enable(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_QUERY_HDB: + rval = mpi3mr_bsg_query_hdb(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_REPOST_HDB: + rval = mpi3mr_bsg_repost_hdb(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_UPLOAD_HDB: + rval = mpi3mr_bsg_upload_hdb(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_REFRESH_HDB_TRIGGERS: + rval = mpi3mr_bsg_refresh_hdb_triggers(mrioc, job); + break; + case MPI3MR_DRVBSG_OPCODE_UNKNOWN: + default: + pr_err("%s: unsupported driver command opcode %d\n", + MPI3MR_DRIVER_NAME, drvrcmd->opcode); + break; + } + mutex_unlock(&mrioc->bsg_cmds.mutex); + return rval; +} +/** + * mpi3mr_bsg_build_sgl - SGL construction for MPI commands + * @mpi_req: MPI request + * @sgl_offset: offset to start sgl in the MPI request + * @drv_bufs: DMA address of the buffers to be placed in sgl + * @bufcnt: Number of DMA buffers + * @is_rmc: Does the buffer list has management command buffer + * @is_rmr: Does the buffer list has management response buffer + * @num_datasges: Number of data buffers in the list + * + * This function places the DMA address of the given buffers in + * proper format as SGEs in the given MPI request. 
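+ *
+ * For management commands the first buffer is placed in the request's
+ * command SGL and the optional response buffer in the response SGL;
+ * the remaining DMA buffers are laid out as simple system SGEs with
+ * MPI3_SGE_FLAGS_END_OF_LIST set on the final element.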
+ * + * Return: Nothing + */ +static void mpi3mr_bsg_build_sgl(u8 *mpi_req, uint32_t sgl_offset, + struct mpi3mr_buf_map *drv_bufs, u8 bufcnt, u8 is_rmc, + u8 is_rmr, u8 num_datasges) +{ + u8 *sgl = (mpi_req + sgl_offset), count = 0; + struct mpi3_mgmt_passthrough_request *rmgmt_req = + (struct mpi3_mgmt_passthrough_request *)mpi_req; + struct mpi3mr_buf_map *drv_buf_iter = drv_bufs; + u8 sgl_flags, sgl_flags_last; + + sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE | + MPI3_SGE_FLAGS_DLAS_SYSTEM | MPI3_SGE_FLAGS_END_OF_BUFFER; + sgl_flags_last = sgl_flags | MPI3_SGE_FLAGS_END_OF_LIST; + + if (is_rmc) { + mpi3mr_add_sg_single(&rmgmt_req->command_sgl, + sgl_flags_last, drv_buf_iter->kern_buf_len, + drv_buf_iter->kern_buf_dma); + sgl = (u8 *)drv_buf_iter->kern_buf + drv_buf_iter->bsg_buf_len; + drv_buf_iter++; + count++; + if (is_rmr) { + mpi3mr_add_sg_single(&rmgmt_req->response_sgl, + sgl_flags_last, drv_buf_iter->kern_buf_len, + drv_buf_iter->kern_buf_dma); + drv_buf_iter++; + count++; + } else + mpi3mr_build_zero_len_sge( + &rmgmt_req->response_sgl); + } + if (!num_datasges) { + mpi3mr_build_zero_len_sge(sgl); + return; + } + for (; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + if (num_datasges == 1 || !is_rmc) + mpi3mr_add_sg_single(sgl, sgl_flags_last, + drv_buf_iter->kern_buf_len, drv_buf_iter->kern_buf_dma); + else + mpi3mr_add_sg_single(sgl, sgl_flags, + drv_buf_iter->kern_buf_len, drv_buf_iter->kern_buf_dma); + sgl += sizeof(struct mpi3_sge_common); + num_datasges--; + } +} + +/** + * mpi3mr_get_nvme_data_fmt - returns the NVMe data format + * @nvme_encap_request: NVMe encapsulated MPI request + * + * This function returns the type of the data format specified + * in user provided NVMe command in NVMe encapsulated request. + * + * Return: Data format of the NVMe command (PRP/SGL etc) + */ +static unsigned int mpi3mr_get_nvme_data_fmt( + struct mpi3_nvme_encapsulated_request *nvme_encap_request) +{ + u8 format = 0; + + format = ((nvme_encap_request->command[0] & 0xc000) >> 14); + return format; + +} + +/** + * mpi3mr_build_nvme_sgl - SGL constructor for NVME + * encapsulated request + * @mrioc: Adapter instance reference + * @nvme_encap_request: NVMe encapsulated MPI request + * @drv_bufs: DMA address of the buffers to be placed in sgl + * @bufcnt: Number of DMA buffers + * + * This function places the DMA address of the given buffers in + * proper format as SGEs in the given NVMe encapsulated request. + * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_build_nvme_sgl(struct mpi3mr_ioc *mrioc, + struct mpi3_nvme_encapsulated_request *nvme_encap_request, + struct mpi3mr_buf_map *drv_bufs, u8 bufcnt) +{ + struct mpi3mr_nvme_pt_sge *nvme_sgl; + u64 sgl_ptr; + u8 count; + size_t length = 0; + struct mpi3mr_buf_map *drv_buf_iter = drv_bufs; + u64 sgemod_mask = ((u64)((mrioc->facts.sge_mod_mask) << + mrioc->facts.sge_mod_shift) << 32); + u64 sgemod_val = ((u64)(mrioc->facts.sge_mod_value) << + mrioc->facts.sge_mod_shift) << 32; + + /* + * Not all commands require a data transfer. If no data, just return + * without constructing any sgl. 
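+	 * The first DMA capable buffer found below supplies the address
+	 * and length for the single SGL descriptor written at
+	 * MPI3MR_NVME_CMD_SGL_OFFSET within the encapsulated NVMe command.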
+ */ + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + sgl_ptr = (u64)drv_buf_iter->kern_buf_dma; + length = drv_buf_iter->kern_buf_len; + break; + } + if (!length) + return 0; + + if (sgl_ptr & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: SGL address collides with SGE modifier\n", + __func__); + return -1; + } + + sgl_ptr &= ~sgemod_mask; + sgl_ptr |= sgemod_val; + nvme_sgl = (struct mpi3mr_nvme_pt_sge *) + ((u8 *)(nvme_encap_request->command) + MPI3MR_NVME_CMD_SGL_OFFSET); + memset(nvme_sgl, 0, sizeof(struct mpi3mr_nvme_pt_sge)); + nvme_sgl->base_addr = sgl_ptr; + nvme_sgl->length = length; + return 0; +} + +/** + * mpi3mr_build_nvme_prp - PRP constructor for NVME + * encapsulated request + * @mrioc: Adapter instance reference + * @nvme_encap_request: NVMe encapsulated MPI request + * @drv_bufs: DMA address of the buffers to be placed in SGL + * @bufcnt: Number of DMA buffers + * + * This function places the DMA address of the given buffers in + * proper format as PRP entries in the given NVMe encapsulated + * request. + * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_build_nvme_prp(struct mpi3mr_ioc *mrioc, + struct mpi3_nvme_encapsulated_request *nvme_encap_request, + struct mpi3mr_buf_map *drv_bufs, u8 bufcnt) +{ + int prp_size = MPI3MR_NVME_PRP_SIZE; + __le64 *prp_entry, *prp1_entry, *prp2_entry; + __le64 *prp_page; + dma_addr_t prp_entry_dma, prp_page_dma, dma_addr; + u32 offset, entry_len, dev_pgsz; + u32 page_mask_result, page_mask; + size_t length = 0; + u8 count; + struct mpi3mr_buf_map *drv_buf_iter = drv_bufs; + u64 sgemod_mask = ((u64)((mrioc->facts.sge_mod_mask) << + mrioc->facts.sge_mod_shift) << 32); + u64 sgemod_val = ((u64)(mrioc->facts.sge_mod_value) << + mrioc->facts.sge_mod_shift) << 32; + u16 dev_handle = nvme_encap_request->dev_handle; + struct mpi3mr_tgt_dev *tgtdev; + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_bsg_err(mrioc, "%s: invalid device handle 0x%04x\n", + __func__, dev_handle); + return -1; + } + + if (tgtdev->dev_spec.pcie_inf.pgsz == 0) { + dprint_bsg_err(mrioc, + "%s: NVMe device page size is zero for handle 0x%04x\n", + __func__, dev_handle); + mpi3mr_tgtdev_put(tgtdev); + return -1; + } + + dev_pgsz = 1 << (tgtdev->dev_spec.pcie_inf.pgsz); + mpi3mr_tgtdev_put(tgtdev); + + /* + * Not all commands require a data transfer. If no data, just return + * without constructing any PRP. + */ + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + dma_addr = drv_buf_iter->kern_buf_dma; + length = drv_buf_iter->kern_buf_len; + break; + } + + if (!length) + return 0; + + mrioc->prp_sz = 0; + mrioc->prp_list_virt = dma_zalloc_coherent(&mrioc->pdev->dev, + dev_pgsz, &mrioc->prp_list_dma, GFP_KERNEL); + + if (!mrioc->prp_list_virt) + return -1; + mrioc->prp_sz = dev_pgsz; + + /* + * Set pointers to PRP1 and PRP2, which are in the NVMe command. + * PRP1 is located at a 24 byte offset from the start of the NVMe + * command. Then set the current PRP entry pointer to PRP1. + */ + prp1_entry = (__le64 *)((u8 *)(nvme_encap_request->command) + + MPI3MR_NVME_CMD_PRP1_OFFSET); + prp2_entry = (__le64 *)((u8 *)(nvme_encap_request->command) + + MPI3MR_NVME_CMD_PRP2_OFFSET); + prp_entry = prp1_entry; + /* + * For the PRP entries, use the specially allocated buffer of + * contiguous memory. 
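+	 * Rough example, assuming a 4K device page size and a 12K transfer:
+	 * PRP1 receives the (possibly unaligned) address of the first chunk,
+	 * PRP2 receives the DMA address of this PRP list page, and the list
+	 * then holds the page-aligned addresses of the remaining chunks.
+	 * Every entry is masked and or-ed with the controller's SGE
+	 * modifier bits before being written.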
+ */ + prp_page = (__le64 *)mrioc->prp_list_virt; + prp_page_dma = mrioc->prp_list_dma; + + /* + * Check if we are within 1 entry of a page boundary we don't + * want our first entry to be a PRP List entry. + */ + page_mask = dev_pgsz - 1; + page_mask_result = (uintptr_t)((u8 *)prp_page + prp_size) & page_mask; + if (!page_mask_result) { + dprint_bsg_err(mrioc, "%s: PRP page is not page aligned\n", + __func__); + goto err_out; + } + + /* + * Set PRP physical pointer, which initially points to the current PRP + * DMA memory page. + */ + prp_entry_dma = prp_page_dma; + + + /* Loop while the length is not zero. */ + while (length) { + page_mask_result = (prp_entry_dma + prp_size) & page_mask; + if (!page_mask_result && (length > dev_pgsz)) { + dprint_bsg_err(mrioc, + "%s: single PRP page is not sufficient\n", + __func__); + goto err_out; + } + + /* Need to handle if entry will be part of a page. */ + offset = dma_addr & page_mask; + entry_len = dev_pgsz - offset; + + if (prp_entry == prp1_entry) { + /* + * Must fill in the first PRP pointer (PRP1) before + * moving on. + */ + *prp1_entry = cpu_to_le64(dma_addr); + if (*prp1_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP1 address collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp1_entry &= ~sgemod_mask; + *prp1_entry |= sgemod_val; + + /* + * Now point to the second PRP entry within the + * command (PRP2). + */ + prp_entry = prp2_entry; + } else if (prp_entry == prp2_entry) { + /* + * Should the PRP2 entry be a PRP List pointer or just + * a regular PRP pointer? If there is more than one + * more page of data, must use a PRP List pointer. + */ + if (length > dev_pgsz) { + /* + * PRP2 will contain a PRP List pointer because + * more PRP's are needed with this command. The + * list will start at the beginning of the + * contiguous buffer. + */ + *prp2_entry = cpu_to_le64(prp_entry_dma); + if (*prp2_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP list address collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp2_entry &= ~sgemod_mask; + *prp2_entry |= sgemod_val; + + /* + * The next PRP Entry will be the start of the + * first PRP List. + */ + prp_entry = prp_page; + continue; + } else { + /* + * After this, the PRP Entries are complete. + * This command uses 2 PRP's and no PRP list. + */ + *prp2_entry = cpu_to_le64(dma_addr); + if (*prp2_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP2 collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp2_entry &= ~sgemod_mask; + *prp2_entry |= sgemod_val; + } + } else { + /* + * Put entry in list and bump the addresses. + * + * After PRP1 and PRP2 are filled in, this will fill in + * all remaining PRP entries in a PRP List, one per + * each time through the loop. + */ + *prp_entry = cpu_to_le64(dma_addr); + if (*prp1_entry & sgemod_mask) { + dprint_bsg_err(mrioc, + "%s: PRP address collides with SGE modifier\n", + __func__); + goto err_out; + } + *prp_entry &= ~sgemod_mask; + *prp_entry |= sgemod_val; + prp_entry++; + prp_entry_dma++; + } + + /* + * Bump the phys address of the command's data buffer by the + * entry_len. + */ + dma_addr += entry_len; + + /* decrement length accounting for last partial page. 
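+		 * e.g. with entry_len = 4096 and only 1000 bytes left, length
+		 * is clamped to zero instead of wrapping.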
*/ + if (entry_len > length) + length = 0; + else + length -= entry_len; + } + return 0; +err_out: + if (mrioc->prp_list_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->prp_sz, + mrioc->prp_list_virt, mrioc->prp_list_dma); + mrioc->prp_list_virt = NULL; + } + return -1; +} + +/** + * mpi3mr_bsg_process_mpt_cmds - MPI Pass through BSG handler + * @job: BSG job reference + * + * This function is the top level handler for MPI Pass through + * command, this does basic validation of the input data buffers, + * identifies the given buffer types and MPI command, allocates + * DMAable memory for user given buffers, construstcs SGL + * properly and passes the command to the firmware. + * + * Once the MPI command is completed the driver copies the data + * if any and reply, sense information to user provided buffers. + * If the command is timed out then issues controller reset + * prior to returning. + * + * Return: 0 on success and proper error codes on failure + */ + +static long mpi3mr_bsg_process_mpt_cmds(struct bsg_job *job) +{ + long rval = -EINVAL; + + struct mpi3mr_ioc *mrioc = NULL; + u8 *mpi_req = NULL, *sense_buff_k = NULL; + u8 mpi_msg_size = 0; + struct mpi3mr_bsg_packet *bsg_req = NULL; + struct mpi3mr_bsg_mptcmd *karg; + struct mpi3mr_buf_entry *buf_entries = NULL; + struct mpi3mr_buf_map *drv_bufs = NULL, *drv_buf_iter = NULL; + u8 count, bufcnt = 0, is_rmcb = 0, is_rmrb = 0, din_cnt = 0, dout_cnt = 0; + u8 invalid_be = 0, erb_offset = 0xFF, mpirep_offset = 0xFF, sg_entries = 0; + u8 block_io = 0, nvme_fmt = 0, resp_code = 0; + struct mpi3_request_header *mpi_header = NULL; + struct mpi3_status_reply_descriptor *status_desc; + struct mpi3_scsi_task_mgmt_request *tm_req; + u32 erbsz = MPI3MR_SENSE_BUF_SZ, tmplen; + u16 dev_handle; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *stgt_priv = NULL; + struct mpi3mr_bsg_in_reply_buf *bsg_reply_buf = NULL; + u32 din_size = 0, dout_size = 0; + u8 *din_buf = NULL, *dout_buf = NULL; + u8 *sgl_iter = NULL, *sgl_din_iter = NULL, *sgl_dout_iter = NULL; + + bsg_req = job->request; + karg = (struct mpi3mr_bsg_mptcmd *)&bsg_req->cmd.mptcmd; + + mpi3mr_bsg_verify_adapter(karg->mrioc_id, &mrioc); + if (!mrioc) + return -ENODEV; + + if (karg->timeout < MPI3MR_APP_DEFAULT_TIMEOUT) + karg->timeout = MPI3MR_APP_DEFAULT_TIMEOUT; + + mpi_req = kzalloc(MPI3MR_ADMIN_REQ_FRAME_SZ, GFP_KERNEL); + if (!mpi_req) + return -ENOMEM; + mpi_header = (struct mpi3_request_header *)mpi_req; + + bufcnt = karg->buf_entry_list.num_of_entries; + drv_bufs = kzalloc((sizeof(*drv_bufs) * bufcnt), GFP_KERNEL); + if (!drv_bufs) { + rval = -ENOMEM; + goto out; + } + + dout_buf = (uint8_t *)kzalloc(job->request_payload.payload_len, + GFP_KERNEL); + if (!dout_buf) { + rval = -ENOMEM; + goto out; + } + + din_buf = (uint8_t *)kzalloc(job->reply_payload.payload_len, + GFP_KERNEL); + if (!din_buf) { + rval = -ENOMEM; + goto out; + } + + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, + dout_buf, job->request_payload.payload_len); + + buf_entries = karg->buf_entry_list.buf_entry; + sgl_din_iter = din_buf; + sgl_dout_iter = dout_buf; + drv_buf_iter = drv_bufs; + + for (count = 0; count < bufcnt; count++, buf_entries++, drv_buf_iter++) { + + switch (buf_entries->buf_type) { + case MPI3MR_BSG_BUFTYPE_RAIDMGMT_CMD: + sgl_iter = sgl_dout_iter; + drv_buf_iter->data_dir = DATA_OUT; + drv_buf_iter->is_dma = true; + is_rmcb = 1; + if (count != 0) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_RAIDMGMT_RESP: + sgl_iter = sgl_din_iter; + 
drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = true; + is_rmrb = 1; + if (count != 1 || !is_rmcb) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_DATA_IN: + sgl_iter = sgl_din_iter; + drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = true; + din_cnt++; + din_size += drv_buf_iter->bsg_buf_len; + if ((din_cnt > 1) && !is_rmcb) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_DATA_OUT: + sgl_iter = sgl_dout_iter; + drv_buf_iter->data_dir = DATA_OUT; + drv_buf_iter->is_dma = true; + dout_cnt++; + dout_size += drv_buf_iter->bsg_buf_len; + if ((dout_cnt > 1) && !is_rmcb) + invalid_be = 1; + break; + case MPI3MR_BSG_BUFTYPE_MPI_REPLY: + sgl_iter = sgl_din_iter; + drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = false; + mpirep_offset = count; + break; + case MPI3MR_BSG_BUFTYPE_ERR_RESPONSE: + sgl_iter = sgl_din_iter; + drv_buf_iter->data_dir = DATA_IN; + drv_buf_iter->is_dma = false; + erb_offset = count; + break; + case MPI3MR_BSG_BUFTYPE_MPI_REQUEST: + sgl_iter = sgl_dout_iter; + drv_buf_iter->data_dir = DATA_OUT; + drv_buf_iter->is_dma = false; + mpi_msg_size = buf_entries->buf_len; + if ((!mpi_msg_size || (mpi_msg_size % 4)) || + (mpi_msg_size > MPI3MR_ADMIN_REQ_FRAME_SZ)) { + dprint_bsg_err(mrioc, "%s: invalid MPI message size\n", + __func__); + rval = -EINVAL; + goto out; + } + memcpy(mpi_req, sgl_iter, buf_entries->buf_len); + break; + default: + invalid_be = 1; + break; + } + if (invalid_be) { + dprint_bsg_err(mrioc, "%s: invalid buffer entries passed\n", + __func__); + rval = -EINVAL; + goto out; + } + + if ((drv_buf_iter->data_dir == DATA_OUT)) { + sgl_dout_iter += buf_entries->buf_len; + if (sgl_dout_iter > (dout_buf + job->request_payload.payload_len)) { + dprint_bsg_err(mrioc, "%s: data_out buffer length mismatch\n", + __func__); + rval = -EINVAL; + goto out; + } + } else { + sgl_din_iter += buf_entries->buf_len; + if (sgl_din_iter > (din_buf + job->reply_payload.payload_len)) { + dprint_bsg_err(mrioc, "%s: data_in buffer length mismatch\n", + __func__); + rval = -EINVAL; + goto out; + } + } + + drv_buf_iter->bsg_buf = sgl_iter; + drv_buf_iter->bsg_buf_len = buf_entries->buf_len; + + } + if (!is_rmcb && (dout_cnt || din_cnt)) { + sg_entries = dout_cnt + din_cnt; + if (((mpi_msg_size) + (sg_entries * + sizeof(struct mpi3_sge_common))) > MPI3MR_ADMIN_REQ_FRAME_SZ) { + dprint_bsg_err(mrioc, + "%s:%d: invalid message size passed\n", + __func__, __LINE__); + rval = -EINVAL; + goto out; + } + } + if (din_size > MPI3MR_MAX_APP_XFER_SIZE) { + dprint_bsg_err(mrioc, + "%s:%d: invalid data transfer size passed for function 0x%x din_size=%d\n", + __func__, __LINE__, mpi_header->function, din_size); + rval = -EINVAL; + goto out; + } + if (dout_size > MPI3MR_MAX_APP_XFER_SIZE) { + dprint_bsg_err(mrioc, + "%s:%d: invalid data transfer size passed for function 0x%x dout_size = %d\n", + __func__, __LINE__, mpi_header->function, dout_size); + rval = -EINVAL; + goto out; + } + + drv_buf_iter = drv_bufs; + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + + drv_buf_iter->kern_buf_len = drv_buf_iter->bsg_buf_len; + if (is_rmcb && !count) + drv_buf_iter->kern_buf_len += ((dout_cnt + din_cnt) * + sizeof(struct mpi3_sge_common)); + + if (!drv_buf_iter->kern_buf_len) + continue; + + drv_buf_iter->kern_buf = dma_zalloc_coherent(&mrioc->pdev->dev, + drv_buf_iter->kern_buf_len, &drv_buf_iter->kern_buf_dma, + GFP_KERNEL); + if (!drv_buf_iter->kern_buf) { + rval = -ENOMEM; + goto out; + } + if 
((drv_buf_iter->data_dir == DATA_OUT)) { + tmplen = min(drv_buf_iter->kern_buf_len, + drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->kern_buf, drv_buf_iter->bsg_buf, tmplen); + } + } + + if (erb_offset != 0xFF) { + sense_buff_k = kzalloc(erbsz, GFP_KERNEL); + if (!sense_buff_k) { + rval = -ENOMEM; + goto out; + } + } + + if (mutex_lock_interruptible(&mrioc->bsg_cmds.mutex)) { + rval = -ERESTARTSYS; + goto out; + } + if (mrioc->bsg_cmds.state & MPI3MR_CMD_PENDING) { + rval = -EAGAIN; + dprint_bsg_err(mrioc, "%s: command is in use\n", __func__); + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + if (mrioc->unrecoverable) { + dprint_bsg_err(mrioc, "%s: unrecoverable controller\n", + __func__); + rval = -EFAULT; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + if (mrioc->reset_in_progress) { + dprint_bsg_err(mrioc, "%s: reset in progress\n", __func__); + rval = -EAGAIN; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + if (mrioc->block_bsgs) { + dprint_bsg_err(mrioc, "%s: bsgs are blocked\n", __func__); + rval = -EAGAIN; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + + if (mpi_header->function != MPI3_FUNCTION_NVME_ENCAPSULATED) { + mpi3mr_bsg_build_sgl(mpi_req, (mpi_msg_size), + drv_bufs, bufcnt, is_rmcb, is_rmrb, + (dout_cnt + din_cnt)); + } + + if (mpi_header->function == MPI3_FUNCTION_NVME_ENCAPSULATED) { + nvme_fmt = mpi3mr_get_nvme_data_fmt( + (struct mpi3_nvme_encapsulated_request *)mpi_req); + if (nvme_fmt == MPI3MR_NVME_DATA_FORMAT_PRP) { + if (mpi3mr_build_nvme_prp(mrioc, + (struct mpi3_nvme_encapsulated_request *)mpi_req, + drv_bufs, bufcnt)) { + rval = -ENOMEM; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + } else if (nvme_fmt == MPI3MR_NVME_DATA_FORMAT_SGL1 || + nvme_fmt == MPI3MR_NVME_DATA_FORMAT_SGL2) { + if (mpi3mr_build_nvme_sgl(mrioc, + (struct mpi3_nvme_encapsulated_request *)mpi_req, + drv_bufs, bufcnt)) { + rval = -EINVAL; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + } else { + dprint_bsg_err(mrioc, + "%s:invalid NVMe command format\n", __func__); + rval = -EINVAL; + mutex_unlock(&mrioc->bsg_cmds.mutex); + goto out; + } + } + if (mpi_header->function == MPI3_FUNCTION_SCSI_TASK_MGMT) { + tm_req = (struct mpi3_scsi_task_mgmt_request *)mpi_req; + if (tm_req->task_type != + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK) { + dev_handle = tm_req->dev_handle; + block_io = 1; + } + } + if (block_io) { + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) { + stgt_priv = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + atomic_inc(&stgt_priv->block_io); + mpi3mr_tgtdev_put(tgtdev); + } + } + + mrioc->bsg_cmds.state = MPI3MR_CMD_PENDING; + mrioc->bsg_cmds.is_waiting = 1; + mrioc->bsg_cmds.callback = NULL; + mrioc->bsg_cmds.is_sense = 0; + mrioc->bsg_cmds.sensebuf = sense_buff_k; + memset(mrioc->bsg_cmds.reply, 0, mrioc->reply_sz); + mpi_header->host_tag = cpu_to_le16(MPI3MR_HOSTTAG_BSG_CMDS); + if (mrioc->logging_level & MPI3_DEBUG_BSG_INFO) { + dprint_bsg_info(mrioc, + "%s: posting bsg request to the controller\n", __func__); + dprint_dump(mpi_req, MPI3MR_ADMIN_REQ_FRAME_SZ, + "bsg_mpi3_req"); + if (mpi_header->function == MPI3_FUNCTION_MGMT_PASSTHROUGH) { + drv_buf_iter = &drv_bufs[0]; + dprint_dump(drv_buf_iter->kern_buf, + drv_buf_iter->kern_buf_len, "mpi3_mgmt_req"); + } + } + + init_completion(&mrioc->bsg_cmds.done); + rval = mpi3mr_admin_request_post(mrioc, mpi_req, + MPI3MR_ADMIN_REQ_FRAME_SZ, 0); + + + if (rval) { + 
mrioc->bsg_cmds.is_waiting = 0; + dprint_bsg_err(mrioc, + "%s: posting bsg request is failed\n", __func__); + rval = -EAGAIN; + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->bsg_cmds.done, + (karg->timeout * HZ)); + if (block_io && stgt_priv) + atomic_dec(&stgt_priv->block_io); + if (!(mrioc->bsg_cmds.state & MPI3MR_CMD_COMPLETE)) { + mrioc->bsg_cmds.is_waiting = 0; + rval = -EAGAIN; + if (mrioc->bsg_cmds.state & MPI3MR_CMD_RESET) + goto out_unlock; + dprint_bsg_err(mrioc, + "%s: bsg request timedout after %d seconds\n", __func__, + karg->timeout); + if (mrioc->logging_level & MPI3_DEBUG_BSG_ERROR) { + dprint_dump(mpi_req, MPI3MR_ADMIN_REQ_FRAME_SZ, + "bsg_mpi3_req"); + if (mpi_header->function == + MPI3_FUNCTION_MGMT_PASSTHROUGH) { + drv_buf_iter = &drv_bufs[0]; + dprint_dump(drv_buf_iter->kern_buf, + drv_buf_iter->kern_buf_len, "mpi3_mgmt_req"); + } + } + if ((mpi_header->function == MPI3_FUNCTION_NVME_ENCAPSULATED) || + (mpi_header->function == MPI3_FUNCTION_SCSI_IO)) + mpi3mr_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET, + mpi_header->function_dependent, 0, + MPI3MR_HOSTTAG_BLK_TMS, MPI3MR_RESETTM_TIMEOUT, + &mrioc->host_tm_cmds, &resp_code, NULL); + if (!(mrioc->bsg_cmds.state & MPI3MR_CMD_COMPLETE) && + !(mrioc->bsg_cmds.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_APP_TIMEOUT, 1); + goto out_unlock; + } + dprint_bsg_info(mrioc, "%s: bsg request is completed\n", __func__); + + if (mrioc->prp_list_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->prp_sz, + mrioc->prp_list_virt, mrioc->prp_list_dma); + mrioc->prp_list_virt = NULL; + } + + if ((mrioc->bsg_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_info(mrioc, + "%s: command failed, ioc_status(0x%04x) log_info(0x%08x)\n", + __func__, + (mrioc->bsg_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->bsg_cmds.ioc_loginfo); + } + + if ((mpirep_offset != 0xFF) && + drv_bufs[mpirep_offset].bsg_buf_len) { + drv_buf_iter = &drv_bufs[mpirep_offset]; + drv_buf_iter->kern_buf_len = (sizeof(*bsg_reply_buf) - 1 + + mrioc->reply_sz); + bsg_reply_buf = kzalloc(drv_buf_iter->kern_buf_len, GFP_KERNEL); + + if (!bsg_reply_buf) { + rval = -ENOMEM; + goto out_unlock; + } + if (mrioc->bsg_cmds.state & MPI3MR_CMD_REPLY_VALID) { + bsg_reply_buf->mpi_reply_type = + MPI3MR_BSG_MPI_REPLY_BUFTYPE_ADDRESS; + memcpy(bsg_reply_buf->reply_buf, + mrioc->bsg_cmds.reply, mrioc->reply_sz); + } else { + bsg_reply_buf->mpi_reply_type = + MPI3MR_BSG_MPI_REPLY_BUFTYPE_STATUS; + status_desc = (struct mpi3_status_reply_descriptor *) + bsg_reply_buf->reply_buf; + status_desc->ioc_status = mrioc->bsg_cmds.ioc_status; + status_desc->ioc_log_info = mrioc->bsg_cmds.ioc_loginfo; + } + tmplen = min(drv_buf_iter->kern_buf_len, + drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->bsg_buf, bsg_reply_buf, tmplen); + } + + if (erb_offset != 0xFF && mrioc->bsg_cmds.sensebuf && + mrioc->bsg_cmds.is_sense) { + drv_buf_iter = &drv_bufs[erb_offset]; + tmplen = min(erbsz, drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->bsg_buf, sense_buff_k, tmplen); + } + + drv_buf_iter = drv_bufs; + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->is_dma == false) + continue; + if (drv_buf_iter->data_dir == DATA_IN) { + tmplen = min(drv_buf_iter->kern_buf_len, + drv_buf_iter->bsg_buf_len); + memcpy(drv_buf_iter->bsg_buf, + drv_buf_iter->kern_buf, tmplen); + } + } + +out_unlock: + if (din_buf) { + job->reply_payload_rcv_len = + 
sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, + din_buf, job->reply_payload.payload_len); + } + mrioc->bsg_cmds.is_sense = 0; + mrioc->bsg_cmds.sensebuf = NULL; + mrioc->bsg_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->bsg_cmds.mutex); +out: + kfree(sense_buff_k); + kfree(dout_buf); + kfree(din_buf); + kfree(mpi_req); + if (drv_bufs) { + drv_buf_iter = drv_bufs; + for (count = 0; count < bufcnt; count++, drv_buf_iter++) { + if (drv_buf_iter->kern_buf && drv_buf_iter->kern_buf_dma) + dma_free_coherent(&mrioc->pdev->dev, + drv_buf_iter->kern_buf_len, + drv_buf_iter->kern_buf, + drv_buf_iter->kern_buf_dma); + } + kfree(drv_bufs); + } + kfree(bsg_reply_buf); + return rval; +} + +/** + * mpi3mr_bsg_request - bsg request entry point + * @job: BSG job reference + * + * This the the drivers entry point for bsg requests coming + * bsg layer + * + * Return: 0 on success and proper error codes on failure + */ +int mpi3mr_bsg_request(struct bsg_job *job) +{ + long rval = -EINVAL; + struct mpi3mr_bsg_packet *bsg_req = job->request; + + switch (bsg_req->cmd_type) { + case MPI3MRDRVCMD: + rval = mpi3mr_bsg_process_drv_cmds(job); + break; + case MPI3MRMPTCMD: + rval = mpi3mr_bsg_process_mpt_cmds(job); + break; + default: + pr_err("%s: unsupported BSG command(0x%08x)\n", + MPI3MR_DRIVER_NAME, bsg_req->cmd_type); + break; + } + + bsg_job_done(job, rval, job->reply_payload_rcv_len); + + return 0; +} + +/** + * mpi3mr_app_save_logdata - Save Log Data events + * @mrioc: Adapter instance reference + * @event_data: event data associated with log data event + * @event_data_size: event data size to copy + * + * If log data event caching is enabled by the applicatiobns, + * then this function saves the log data in the circular queue + * and Sends async signal SIGIO to indicate there is an async + * event from the firmware to the event monitoring applications. + * + * Return:Nothing + */ +void mpi3mr_app_save_logdata(struct mpi3mr_ioc *mrioc, char *event_data, + u16 event_data_size) +{ + u32 index = mrioc->logdata_buf_idx, sz; + struct mpi3mr_logdata_entry *entry; + + if (!(mrioc->logdata_buf)) + return; + + entry = (struct mpi3mr_logdata_entry *) + (mrioc->logdata_buf + (index * mrioc->logdata_entry_sz)); + entry->valid_entry = 1; + sz = min(mrioc->logdata_entry_sz, event_data_size); + memcpy(entry->data, event_data, sz); + mrioc->logdata_buf_idx = + ((++index) % MPI3MR_BSG_LOGDATA_MAX_ENTRIES); + atomic64_inc(&event_counter); +} + +/** + * mpi3mr_bsg_exit - de-registration from bsg layer + * + * This will be called during driver unload and all + * bsg resources allocated during load will be freed. 
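+ *
+ * The bsg request queue is removed before the device node is deleted
+ * and freed, i.e. the reverse of the order used in mpi3mr_bsg_init().
+ *
+ * @mrioc: Adapter instance reference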
+ * + * Return:Nothing + */ +void mpi3mr_bsg_exit(struct mpi3mr_ioc *mrioc) +{ + if (!mrioc->bsg_queue) + return; + + bsg_remove_queue(mrioc->bsg_queue); + mrioc->bsg_queue = NULL; + + device_del(mrioc->bsg_dev); + kfree(mrioc->bsg_dev); + return; +} + +/** + * mpi3mr_bsg_node_release -release bsg device node + * @dev: bsg device node + * + * decrements bsg dev reference count + * + * Return:Nothing + */ +void mpi3mr_bsg_node_release(struct device *dev) +{ + put_device(dev); + return; +} + +/** + * mpi3mr_bsg_init - registration with bsg layer + * + * This will be called during driver load and it will + * register driver with bsg layer + * + * Return:Nothing + */ +void mpi3mr_bsg_init(struct mpi3mr_ioc *mrioc) +{ + mrioc->bsg_dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!mrioc->bsg_dev) { + ioc_err(mrioc, "bsg device mem allocation failed\n"); + return; + } + + device_initialize(mrioc->bsg_dev); + dev_set_name(mrioc->bsg_dev, "mpi3mrctl%u", mrioc->id); + + if (device_add(mrioc->bsg_dev)) { + ioc_err(mrioc, "%s: bsg device add failed\n", + dev_name(mrioc->bsg_dev)); + goto err_device_add; + } + + mrioc->bsg_dev->release = mpi3mr_bsg_node_release; + + mrioc->bsg_queue = bsg_setup_queue(mrioc->bsg_dev, dev_name(mrioc->bsg_dev), + mpi3mr_bsg_request, NULL, 0); + if (!mrioc->bsg_queue) { + ioc_err(mrioc, "%s: bsg registration failed\n", + dev_name(mrioc->bsg_dev)); + goto err_setup_queue; + } + + blk_queue_max_segments(mrioc->bsg_queue, MPI3MR_MAX_APP_XFER_SEGMENTS); + blk_queue_max_hw_sectors(mrioc->bsg_queue, MPI3MR_MAX_APP_XFER_SECTORS); + + return; + +err_setup_queue: + device_del(mrioc->bsg_dev); + +err_device_add: + kfree(mrioc->bsg_dev); + return; +} + +/* + * SCSI Host attributes under sysfs + */ +/** + * version_fw_show - SysFS callback for firmware version read + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying firmware version + */ +static ssize_t +version_fw_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3mr_compimg_ver *fwver = &mrioc->facts.fw_ver; + + return snprintf(buf, PAGE_SIZE, "%d.%d.%d.%d.%05d-%05d\n", + fwver->gen_major, fwver->gen_minor, fwver->ph_major, + fwver->ph_minor, fwver->cust_id, fwver->build_num); +} +static DEVICE_ATTR_RO(version_fw); + +/** + * fw_queue_depth_show - SysFS callback for firmware max cmds + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying firmware max commands + */ +static ssize_t +fw_queue_depth_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + return snprintf(buf, PAGE_SIZE, "%d\n", mrioc->facts.max_reqs); +} +static DEVICE_ATTR_RO(fw_queue_depth); + +/** + * op_req_q_count_show - SysFS callback for request queue count + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying request queue count + */ +static ssize_t +op_req_q_count_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + return snprintf(buf, PAGE_SIZE, "%d\n", mrioc->num_op_req_q); +} +static DEVICE_ATTR_RO(op_req_q_count); + +/** + * reply_queue_count_show - SysFS callback for reply queue 
count + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying reply queue count + */ +static ssize_t +reply_queue_count_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + return snprintf(buf, PAGE_SIZE, "%d\n", mrioc->num_op_reply_q); +} + +static DEVICE_ATTR_RO(reply_queue_count); + +/** + * mpi3mr_app_logging_level_show - Show controller debug level + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * A sysfs 'read/write' shost attribute, to show the current + * debug log level used by the driver for the specific + * controller. + * + * Return: snprintf() return + */ +static ssize_t +mpi3mr_app_logging_level_show(struct device *dev, + struct device_attribute *attr, char *buf) + +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + return snprintf(buf, PAGE_SIZE, "%08xh\n", mrioc->logging_level); +} + +/** + * mpi3mr_app_logging_level_store- Change controller debug level + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * @count: size of the buffer + * + * A sysfs 'read/write' shost attribute, to change the current + * debug log level used by the driver for the specific + * controller. + * + * Return: strlen() return + */ +static ssize_t +mpi3mr_app_logging_level_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + int val = 0; + + if (sscanf(buf, "%x", &val) != 1) + return -EINVAL; + + mrioc->logging_level = val; + ioc_info(mrioc, "logging_level=%08xh\n", mrioc->logging_level); + return strlen(buf); +} +static DEVICE_ATTR(logging_level, 0644, + mpi3mr_app_logging_level_show, + mpi3mr_app_logging_level_store); + +/** + * adapter_state_show - SysFS callback for adapter state show + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying adapter state + */ +static ssize_t +adp_state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + enum mpi3mr_iocstate ioc_state; + uint8_t adp_state; + + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state == MRIOC_STATE_UNRECOVERABLE) + adp_state = MPI3MR_BSG_ADPSTATE_UNRECOVERABLE; + else if ((mrioc->reset_in_progress) || (mrioc->block_bsgs)) + adp_state = MPI3MR_BSG_ADPSTATE_IN_RESET; + else if (ioc_state == MRIOC_STATE_FAULT) + adp_state = MPI3MR_BSG_ADPSTATE_FAULT; + else + adp_state = MPI3MR_BSG_ADPSTATE_OPERATIONAL; + + return snprintf(buf, PAGE_SIZE, "%u\n", adp_state); +} +static DEVICE_ATTR_RO(adp_state); + + +/** + * mpi3mr_app_complete_tm - SysFS TM completion callback + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is a call back handler for the TM requests issued to the + * firmware through SysFS interface in non blocking mode. This + * functions wakes up pending TM wait queue event when all TMs + * issued are completed. 
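+ *
+ * Each completion decrements sysfs_tm_pending and, when a reply is
+ * available, adds its termination_count to
+ * sysfs_tm_terminated_io_count; the waiter in mpi3mr_app_tm_sysfs()
+ * is woken only once the pending count drops to zero.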
+ * + * Return: nothing + */ +static void mpi3mr_app_complete_tm(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_scsi_task_mgmt_reply *tm_reply = NULL; + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_SYSFS_TM_MIN; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) { + tm_reply = (struct mpi3_scsi_task_mgmt_reply *)drv_cmd->reply; + ioc_info(mrioc, + "%s:TM[%d]: completed, handle(0x%04x), ioc_status(0x%04x), log_info(0x%08x), termination_count(%d), response_code(0x%02x)\n", + __func__, cmd_idx+1, drv_cmd->dev_handle, + drv_cmd->ioc_status, drv_cmd->ioc_loginfo, + le32_to_cpu(tm_reply->termination_count), + (le32_to_cpu(tm_reply->response_data) & + MPI3MR_RI_MASK_RESPCODE)); + mrioc->sysfs_tm_terminated_io_count += + le32_to_cpu(tm_reply->termination_count); + } +clear_drv_cmd: + atomic_dec(&mrioc->sysfs_tm_pending); + if (!atomic_read(&mrioc->sysfs_tm_pending)) + wake_up(&mrioc->sysfs_pending_tm_wq); + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + return; +} + +/** + * mpi3mr_app_issue_tm - sends Task Management request + * @mrioc: Adapter instance reference + * @tm_type: Task Management type + * @handle: Firmware device handle + * @lun: lun ID + * @cmd_priv: SCSI command private data + * + * This function sends Task Management request to the firmware + * through admin request queue. + * + * Return: 0 on success, -1 on failure + */ +static int mpi3mr_app_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, + u16 handle, uint lun, struct scmd_priv *cmd_priv) +{ + struct mpi3_scsi_task_mgmt_request tm_req; + int r = -1; + struct op_req_qinfo *op_req_q = NULL; + struct mpi3mr_drv_cmd *drv_cmd; + + if ((mrioc->unrecoverable) || (mrioc->reset_in_progress) || + (mrioc->sysfs_tm_issued >= MPI3MR_NUM_SYSFS_TM)) { + return r; + } + drv_cmd = &mrioc->sysfs_tm_cmds[mrioc->sysfs_tm_issued]; + + if (drv_cmd->state & MPI3MR_CMD_PENDING) + return r; + + memset(&tm_req, 0, sizeof(tm_req)); + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_app_complete_tm; + drv_cmd->dev_handle = handle; + tm_req.dev_handle = cpu_to_le16(handle); + tm_req.host_tag = cpu_to_le16(drv_cmd->host_tag); + tm_req.function = MPI3_FUNCTION_SCSI_TASK_MGMT; + tm_req.task_type = tm_type; + + int_to_scsilun(lun, (struct scsi_lun *)tm_req.lun); + + if (cmd_priv) { + op_req_q = &mrioc->req_qinfo[cmd_priv->req_q_idx]; + tm_req.task_host_tag = cpu_to_le16(cmd_priv->host_tag); + tm_req.task_request_queue_id = cpu_to_le16(op_req_q->qid); + } + + ioc_info(mrioc, "%s: TM[%d] type (0x%02x) issued for handle (0x%04x)\n", + __func__, mrioc->sysfs_tm_issued + 1, tm_type, handle); + atomic_inc(&mrioc->sysfs_tm_pending); + r = mpi3mr_admin_request_post(mrioc, &tm_req, sizeof(tm_req), 1); + if (r) { + atomic_dec(&mrioc->sysfs_tm_pending); + ioc_err(mrioc, "%s : posting TM[%d] failed\n", __func__, + mrioc->sysfs_tm_issued + 1); + } else + mrioc->sysfs_tm_issued++; + + return r; +} + +/** + * mpi3mr_app_issue_abort_task - sends Task Abort + * @rq: Block I/O request + * @data: Adapter instance + * @reserved: Unused + * + * This function sends Abort Task Management request to the + * firmware, this is iterator callback for every I/O. 
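+ *
+ * It is invoked through blk_mq_tagset_busy_iter() from
+ * mpi3mr_app_tm_sysfs(); an abort is only issued for commands that
+ * are still in LLD scope and while fewer than MPI3MR_NUM_SYSFS_TM
+ * task management requests have been sent.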
+ * + * Return: 0 on success, -1 on failure + */ +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_app_issue_abort_task( + struct request *rq, void *data, bool reserved) +{ + struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv = NULL; + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u16 dev_handle; + + if (scmd) { + sdev_priv_data = scmd->device->hostdata; + if (sdev_priv_data && sdev_priv_data->tgt_priv_data) { + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + priv = scsi_cmd_priv(scmd); + } + } + + if (priv && priv->in_lld_scope && + (mrioc->sysfs_tm_issued < MPI3MR_NUM_SYSFS_TM)) { + mpi3mr_app_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK, dev_handle, + sdev_priv_data->lun_id, priv); + BLK_ITER_CALLBACK_RET_VAL(true); + } + + BLK_ITER_CALLBACK_RET_VAL(false); +} + +/** + * mpi3mr_app_tm_sysfs - sends TM of given type + * @mrioc: Adapter instance reference + * @tm_type: Task Management type + * + * This function checks TM type and issue appropriate number of + * specific TM to the devices/IO requests under the scope of the + * TM and the controller and waits the TM requests to complete. + * If TM requests are not completed within predefined timeout + * then issues controller reset + * + * Return: Nothing + */ +static void mpi3mr_app_tm_sysfs(struct mpi3mr_ioc *mrioc, u8 tm_type) +{ + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct scsi_device *sdev; + unsigned long flags, r; + + if ((mrioc->unrecoverable) || (mrioc->reset_in_progress)) + return; + + init_waitqueue_head(&mrioc->sysfs_pending_tm_wq); + atomic_set(&mrioc->sysfs_tm_pending, 0); + mrioc->sysfs_tm_issued = 0; + mrioc->sysfs_tm_terminated_io_count = 0; + + scsi_block_requests(mrioc->shost); + + switch (tm_type) { + case MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK: + blk_mq_tagset_busy_iter(&mrioc->shost->tag_set, + mpi3mr_app_issue_abort_task, (void *)mrioc); + break; + + case MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET: + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (mrioc->sysfs_tm_issued < MPI3MR_NUM_SYSFS_TM) + mpi3mr_app_issue_tm(mrioc, tm_type, + tgtdev->dev_handle, 0, NULL); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + break; + + case MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET: + case MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK_SET: + shost_for_each_device(sdev, mrioc->shost) { + /* wait for free hpr message frames */ + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data || + !sdev_priv_data->tgt_priv_data) + continue; + stgt_priv_data = sdev_priv_data->tgt_priv_data; + if (mrioc->sysfs_tm_issued < MPI3MR_NUM_SYSFS_TM) + mpi3mr_app_issue_tm(mrioc, tm_type, + stgt_priv_data->dev_handle, + sdev_priv_data->lun_id, NULL); + } + break; + } + scsi_unblock_requests(mrioc->shost); + + if (atomic_read(&mrioc->sysfs_tm_pending)) { + r = wait_event_timeout(mrioc->sysfs_pending_tm_wq, + !atomic_read(&mrioc->sysfs_tm_pending), + MPI3MR_SYSFS_TM_TIMEOUT*HZ); + if (!r) { + ioc_err(mrioc, + "%s: %d TM requests timed out\n", __func__, + atomic_read(&mrioc->sysfs_tm_pending)); + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_SYSFS_TIMEOUT, 1); + } + } + + ioc_info(mrioc, "%s: task management requests issued(%d)\n", __func__, + mrioc->sysfs_tm_issued); + ioc_info(mrioc, "%s: number of IOs terminated(%d)\n", 
__func__, + mrioc->sysfs_tm_terminated_io_count); +} + +/** + * mpi3mr_app_task_management_store- Issue a TM/controller reset + * @cdev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * @count: size of the buffer + * + * A sysfs 'read/write' shost attribute, to issue a set of TMs + * or a controller reset to validate the controller firmware for + * user applications. + * + * Return: strlen() return + */ +static ssize_t +mpi3mr_app_task_management_store(struct device *cdev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(cdev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + int opcode = 0; + + if (sscanf(buf, "%d", &opcode) != 1) + return -EINVAL; + if (mrioc->unrecoverable) + return -EINVAL; + + switch (opcode) { + + case MPI3MR_SYSFS_TM_SOFT_RESET: + scsi_block_requests(mrioc->shost); + ioc_info(mrioc, "%s: soft reset issued, status=%s\n", __func__, + ((!mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_SYSFS, + 0)) ? "SUCCESS" : "FAILED")); + scsi_unblock_requests(mrioc->shost); + break; + + case MPI3MR_SYSFS_TM_DIAG_FAULT_RESET: + scsi_block_requests(mrioc->shost); + ioc_info(mrioc, "%s: diag fault reset issued, status=%s\n", + __func__, + ((!mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_SYSFS, + 1)) ? "SUCCESS" : "FAILED")); + scsi_unblock_requests(mrioc->shost); + break; + + case MPI3MR_SYSFS_TM_ABORT_TASK: + ioc_info(mrioc, "%s: abort task issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK); + break; + + case MPI3MR_SYSFS_TM_TARGET_RESET: + ioc_info(mrioc, "%s: target reset issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET); + break; + + case MPI3MR_SYSFS_TM_LUN_RESET: + ioc_info(mrioc, "%s: lun reset issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET); + break; + + case MPI3MR_SYSFS_TM_ABORT_TASK_SET: + ioc_info(mrioc, "%s: abort task set issued\n", __func__); + mpi3mr_app_tm_sysfs(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK_SET); + break; + + default: + ioc_warn(mrioc, "%s: unsupported opcode(%d)\n", + __func__, opcode); + break; + }; + + return strlen(buf); +} +static DEVICE_ATTR(task_management, 0200, NULL, + mpi3mr_app_task_management_store); + +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) +struct device_attribute *mpi3mr_host_attrs[] = { + &dev_attr_version_fw, + &dev_attr_fw_queue_depth, + &dev_attr_op_req_q_count, + &dev_attr_reply_queue_count, + &dev_attr_logging_level, + &dev_attr_adp_state, + &dev_attr_task_management, + NULL, +}; +#else +static struct attribute *mpi3mr_host_attrs[] = { + &dev_attr_version_fw.attr, + &dev_attr_fw_queue_depth.attr, + &dev_attr_op_req_q_count.attr, + &dev_attr_reply_queue_count.attr, + &dev_attr_logging_level.attr, + &dev_attr_adp_state.attr, + &dev_attr_task_management.attr, + NULL, +}; + +static const struct attribute_group mpi3mr_host_attr_group = { + .attrs = mpi3mr_host_attrs +}; + +const struct attribute_group *mpi3mr_host_groups[] = { + &mpi3mr_host_attr_group, + NULL, +}; +#endif + +/* + * SCSI Device attributes under sysfs + */ + +/** + * mpi3mr_scsih_ncq_prio_supp - Check ncq priority is supported + * @sdev: scsi device struct + * + * This function returns whether the given sdev is capable for + * setting NCQ priority or not. 
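+ *
+ * The check reads the ATA Information VPD page (0x89); byte 213
+ * bit 4 is assumed to correspond to IDENTIFY DEVICE word 76 bit 12
+ * (NCQ priority information supported) within the identify data
+ * embedded in that page.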
+ * + * Return: 0 when device doesn't support NCQ, 1 otherwise + */ +u8 mpi3mr_scsih_ncq_prio_supp(struct scsi_device *sdev) +{ + unsigned char *buf; + u8 ncq_prio_supp = 0; + + if (!scsi_device_supports_vpd(sdev)) + return ncq_prio_supp; + + buf = kmalloc(SCSI_VPD_PG_LEN, GFP_KERNEL); + if (!buf) + return ncq_prio_supp; + + if (!scsi_get_vpd_page(sdev, 0x89, buf, SCSI_VPD_PG_LEN)) + ncq_prio_supp = (buf[213] >> 4) & 1; + + kfree(buf); + return ncq_prio_supp; +} + +/** + * mpi3mr_app_device_ncq_prio_enable_show - NCQ priority value + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * A sysfs 'read/write' sdev attribute, to display NCQ priority + * value. only works with SATA. + * + * Return: snprintf() return + */ +static ssize_t +mpi3mr_app_device_ncq_prio_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + + return snprintf(buf, PAGE_SIZE, "%d\n", + sdev_priv_data->ncq_prio_enable); +} + +/** + * mpi3mr_app_device_ncq_prio_enable_store - NCQ priority change + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * @count: size of the buffer + * + * A sysfs 'read/write' sdev attribute, to store NCQ priority + * value. only works with SATA. + * + * Return: strlen() return + */ +static ssize_t +mpi3mr_app_device_ncq_prio_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + int ncq_prio_enable = 0; + + if (sscanf(buf, "%d", &ncq_prio_enable) != 1) + return -EINVAL; + + if (!mpi3mr_scsih_ncq_prio_supp(sdev)) + return -EINVAL; + + sdev_priv_data->ncq_prio_enable = ncq_prio_enable; + return strlen(buf); +} +static DEVICE_ATTR(sata_ncq_prio_enable, 0644, + mpi3mr_app_device_ncq_prio_enable_show, + mpi3mr_app_device_ncq_prio_enable_store); + +/** + * sas_address_show - SysFS callback for dev SASaddress display + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying SAS address of the + * specific SAS/SATA end device. + */ +static ssize_t +sas_address_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct mpi3mr_stgt_priv_data *tgt_priv_data; + struct mpi3mr_tgt_dev *tgtdev; + + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data) + return 0; + + tgt_priv_data = sdev_priv_data->tgt_priv_data; + if (!tgt_priv_data) + return 0; + tgtdev = tgt_priv_data->tgt_dev; + if (!tgtdev || tgtdev->dev_type != MPI3_DEVICE_DEVFORM_SAS_SATA) + return 0; + return snprintf(buf, PAGE_SIZE, "0x%016llx\n", + (unsigned long long)tgtdev->dev_spec.sas_sata_inf.sas_address); +} + +static DEVICE_ATTR_RO(sas_address); + +/** + * device_handle_show - SysFS callback for device handle display + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying firmware internal + * device handle of the specific device. 
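+ *
+ * For example, reading this attribute (typically exposed as
+ * /sys/class/scsi_device/<h:c:t:l>/device/device_handle, path given
+ * for illustration only) prints the handle as a 0x%04x value.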
+ */ +static ssize_t +device_handle_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct mpi3mr_stgt_priv_data *tgt_priv_data; + struct mpi3mr_tgt_dev *tgtdev; + + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data) + return 0; + + tgt_priv_data = sdev_priv_data->tgt_priv_data; + if (!tgt_priv_data) + return 0; + tgtdev = tgt_priv_data->tgt_dev; + if (!tgtdev) + return 0; + return snprintf(buf, PAGE_SIZE, "0x%04x\n", tgtdev->dev_handle); +} + +static DEVICE_ATTR_RO(device_handle); + +/** + * persistent_id_show - SysFS callback for persisten ID display + * @dev: class device + * @attr: Device attributes + * @buf: Buffer to copy + * + * Return: snprintf() return after copying persistent ID of the + * of the specific device. + */ +static ssize_t +persistent_id_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct mpi3mr_stgt_priv_data *tgt_priv_data; + struct mpi3mr_tgt_dev *tgtdev; + + sdev_priv_data = sdev->hostdata; + if (!sdev_priv_data) + return 0; + + tgt_priv_data = sdev_priv_data->tgt_priv_data; + if (!tgt_priv_data) + return 0; + tgtdev = tgt_priv_data->tgt_dev; + if (!tgtdev) + return 0; + return snprintf(buf, PAGE_SIZE, "%d\n", tgtdev->perst_id); +} +static DEVICE_ATTR_RO(persistent_id); + +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) +struct device_attribute *mpi3mr_dev_attrs[] = { + &dev_attr_sata_ncq_prio_enable, + &dev_attr_sas_address, + &dev_attr_device_handle, + &dev_attr_persistent_id, + NULL, +}; +#else +static struct attribute *mpi3mr_dev_attrs[] = { + &dev_attr_sata_ncq_prio_enable.attr, + &dev_attr_sas_address.attr, + &dev_attr_device_handle.attr, + &dev_attr_persistent_id.attr, + NULL, +}; + +static const struct attribute_group mpi3mr_dev_attr_group = { + .attrs = mpi3mr_dev_attrs +}; + +const struct attribute_group *mpi3mr_dev_groups[] = { + &mpi3mr_dev_attr_group, + NULL, +}; +#endif + diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.h b/drivers/scsi/mpi3mr/mpi3mr_app.h new file mode 100644 index 0000000000000..b68269336d357 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_app.h @@ -0,0 +1,450 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. 
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#ifndef MPI3MR_APP_H_INCLUDED +#define MPI3MR_APP_H_INCLUDED + +#include + +/* Definitions for BSG commands */ +#define MPI3MR_DEV_NAME "mpi3mrctl" + +#define MPI3MR_IOCTL_VERSION 0x06 + +#define MPI3MR_APP_DEFAULT_TIMEOUT (60) /*seconds*/ + +#define MPI3MR_BSG_ADPTYPE_UNKNOWN 0 +#define MPI3MR_BSG_ADPTYPE_AVGFAMILY 1 + +#define MPI3MR_BSG_ADPSTATE_UNKNOWN 0 +#define MPI3MR_BSG_ADPSTATE_OPERATIONAL 1 +#define MPI3MR_BSG_ADPSTATE_FAULT 2 +#define MPI3MR_BSG_ADPSTATE_IN_RESET 3 +#define MPI3MR_BSG_ADPSTATE_UNRECOVERABLE 4 + +#define MPI3MR_BSG_ADPRESET_UNKNOWN 0 +#define MPI3MR_BSG_ADPRESET_SOFT 1 +#define MPI3MR_BSG_ADPRESET_DIAG_FAULT 2 + +#define MPI3MR_BSG_LOGDATA_MAX_ENTRIES 400 +#define MPI3MR_BSG_LOGDATA_ENTRY_HEADER_SZ 4 + +#define MPI3MR_DRVBSG_OPCODE_UNKNOWN 0 +#define MPI3MR_DRVBSG_OPCODE_ADPINFO 1 +#define MPI3MR_DRVBSG_OPCODE_ADPRESET 2 +#define MPI3MR_DRVBSG_OPCODE_ALLTGTDEVINFO 4 +#define MPI3MR_DRVBSG_OPCODE_GETCHGCNT 5 +#define MPI3MR_DRVBSG_OPCODE_LOGDATAENABLE 6 +#define MPI3MR_DRVBSG_OPCODE_PELENABLE 7 +#define MPI3MR_DRVBSG_OPCODE_GETLOGDATA 8 +#define MPI3MR_DRVBSG_OPCODE_QUERY_HDB 9 +#define MPI3MR_DRVBSG_OPCODE_REPOST_HDB 10 +#define MPI3MR_DRVBSG_OPCODE_UPLOAD_HDB 11 +#define MPI3MR_DRVBSG_OPCODE_REFRESH_HDB_TRIGGERS 12 + + +#define MPI3MR_BSG_BUFTYPE_UNKNOWN 0 +#define MPI3MR_BSG_BUFTYPE_RAIDMGMT_CMD 1 +#define MPI3MR_BSG_BUFTYPE_RAIDMGMT_RESP 2 +#define MPI3MR_BSG_BUFTYPE_DATA_IN 3 +#define MPI3MR_BSG_BUFTYPE_DATA_OUT 4 +#define MPI3MR_BSG_BUFTYPE_MPI_REPLY 5 +#define MPI3MR_BSG_BUFTYPE_ERR_RESPONSE 6 +#define MPI3MR_BSG_BUFTYPE_MPI_REQUEST 0xFE + +#define MPI3MR_BSG_MPI_REPLY_BUFTYPE_UNKNOWN 0 +#define MPI3MR_BSG_MPI_REPLY_BUFTYPE_STATUS 1 +#define MPI3MR_BSG_MPI_REPLY_BUFTYPE_ADDRESS 2 + +#define MPI3MR_HDB_BUFTYPE_UNKNOWN 0 +#define MPI3MR_HDB_BUFTYPE_TRACE 1 +#define MPI3MR_HDB_BUFTYPE_FIRMWARE 2 +#define MPI3MR_HDB_BUFTYPE_RESERVED 3 + +#define MPI3MR_HDB_BUFSTATUS_UNKNOWN 0 +#define MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED 1 +#define MPI3MR_HDB_BUFSTATUS_POSTED_UNPAUSED 2 +#define MPI3MR_HDB_BUFSTATUS_POSTED_PAUSED 3 +#define MPI3MR_HDB_BUFSTATUS_RELEASED 4 + +#define MPI3MR_HDB_TRIGGER_TYPE_UNKNOWN 0 +#define MPI3MR_HDB_TRIGGER_TYPE_FAULT 1 +#define MPI3MR_HDB_TRIGGER_TYPE_ELEMENT 2 +#define MPI3MR_HDB_TRIGGER_TYPE_MASTER 3 +#define MPI3MR_HDB_TRIGGER_TYPE_SOFT_RESET 4 +#define MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED 5 + +#define MPI3MR_HDB_REFRESH_TYPE_RESERVED 0 +#define MPI3MR_HDB_REFRESH_TYPE_CURRENT 1 +#define MPI3MR_HDB_REFRESH_TYPE_DEFAULT 2 +#define MPI3MR_HDB_HDB_REFRESH_TYPE_PERSISTENT 3 + +/* Supported BSG commands */ +enum command { + MPI3MRDRVCMD = 1, + MPI3MRMPTCMD = 2, +}; + +/* Data direction definitions */ +enum data_direction { + DATA_IN = 1, + DATA_OUT = 2, +}; + +/** + * struct mpi3mr_bsg_in_adpinfo - Adapter information request + * data returned by the driver. 
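+ *
+ * @adp_state: Adapter state (see the MPI3MR_BSG_ADPSTATE_* defines)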
+ * + * @adp_type: Adapter type + * @rsvd1: Reserved + * @pci_dev_id: PCI device ID of the adapter + * @pci_dev_hw_rev: PCI revision of the adapter + * @pci_subsys_dev_id: PCI subsystem device ID of the adapter + * @pci_subsys_ven_id: PCI subsystem vendor ID of the adapter + * @pci_dev: PCI device + * @pci_func: PCI function + * @pci_bus: PCI bus + * @rsvd2: Reserved + * @pci_seg_id: PCI segment ID + * @app_intfc_ver: version of the application interface definition + * @rsvd3: Reserved + * @rsvd4: Reserved + * @rsvd5: Reserved + * @driver_info: Driver Information (Version/Name) + */ +struct mpi3mr_bsg_in_adpinfo { + uint32_t adp_type; + uint32_t rsvd1; + uint32_t pci_dev_id; + uint32_t pci_dev_hw_rev; + uint32_t pci_subsys_dev_id; + uint32_t pci_subsys_ven_id; + uint32_t pci_dev:5; + uint32_t pci_func:3; + uint32_t pci_bus:8; + uint16_t rsvd2; + uint32_t pci_seg_id; + uint32_t app_intfc_ver; + uint8_t adp_state; + uint8_t rsvd3; + uint16_t rsvd4; + uint32_t rsvd5[2]; + struct mpi3_driver_info_layout driver_info; +}; + +/** + * struct mpi3mr_bsg_adp_reset - Adapter reset request + * payload data to the driver. + * + * @reset_type: Reset type + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_adp_reset { + uint8_t reset_type; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_change_count - Topology change count + * returned by the driver. + * + * @change_count: Topology change count + * @rsvd: Reserved + */ +struct mpi3mr_change_count { + uint16_t change_count; + uint16_t rsvd; +}; + +/** + * struct mpi3mr_device_map_info - Target device mapping + * information + * + * @handle: Firmware device handle + * @perst_id: Persistent ID assigned by the firmware + * @target_id: Target ID assigned by the driver + * @bus_id: Bus ID assigned by the driver + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_device_map_info { + uint16_t handle; + uint16_t perst_id; + uint32_t target_id; + uint8_t bus_id; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_all_tgt_info - Target device mapping + * information returned by the driver + * + * @num_devices: The number of devices in driver's inventory + * @rsvd1: Reserved + * @rsvd2: Reserved + * @dmi: Variable length array of mapping information of targets + */ +struct mpi3mr_all_tgt_info { + uint16_t num_devices; //The number of devices in driver's inventory + uint16_t rsvd1; + uint32_t rsvd2; + struct mpi3mr_device_map_info dmi[1]; //Variable length Array +}; + +/** + * struct mpi3mr_logdata_enable - Number of log data + * entries saved by the driver returned as payload data for + * enable logdata BSG request by the driver. + * + * @max_entries: Number of log data entries cached by the driver + * @rsvd: Reserved + */ +struct mpi3mr_logdata_enable { + uint16_t max_entries; + uint16_t rsvd; +}; + +/** + * struct mpi3mr_bsg_out_pel_enable - PEL enable request payload + * data to the driver. + * + * @pel_locale: PEL locale to the firmware + * @pel_class: PEL class to the firmware + * @rsvd: Reserved + */ +struct mpi3mr_bsg_out_pel_enable { + uint16_t pel_locale; + uint8_t pel_class; + uint8_t rsvd; +}; + +/** + * struct mpi3mr_logdata_entry - Log data entry cached by the + * driver. 
+ * + * @valid_entry: Is the entry valid + * @rsvd1: Reserved + * @rsvd2: Reserved + * @data: Log entry data of controller specific size + */ +struct mpi3mr_logdata_entry { + uint8_t valid_entry; + uint8_t rsvd1; + uint16_t rsvd2; + uint8_t data[1]; //Variable length Array +}; + +/** + * struct mpi3mr_bsg_in_log_data - Log data entries saved by + * the driver returned as payload data for Get logdata request + * by the driver. + * + * @entry: Log data entry + */ +struct mpi3mr_bsg_in_log_data { + struct mpi3mr_logdata_entry entry[1]; //Variable length Array +}; + +/** + * struct mpi3mr_hdb_entry - host diag buffer entry. + * + * @buf_type: Buffer type + * @status: Buffer status + * @trigger_type: Trigger type + * @rsvd1: Reserved + * @size: Buffer size + * @rsvd2: Reserved + * @trigger_data: Trigger specific data + * @rsvd3: Reserved + * @rsvd4: Reserved + */ +struct mpi3mr_hdb_entry { + uint8_t buf_type; + uint8_t status; + uint8_t trigger_type; + uint8_t rsvd1; + uint16_t size; + uint16_t rsvd2; + uint64_t trigger_data; + uint32_t rsvd3; + uint32_t rsvd4; +}; + + +/** + * struct mpi3mr_bsg_in_hdb_status - This structure contains + * return data for the BSG request to retrieve the number of host + * diagnostic buffers supported by the driver and their current + * status and additional status specific data if any in forms of + * multiple hdb entries. + * + * @num_hdb_types: Number of host diag buffer types supported + * @rsvd1: Reserved + * @rsvd2: Reserved + * @rsvd3: Reserved + * @entry: Diag buffer status entry + */ +struct mpi3mr_bsg_in_hdb_status { + uint8_t num_hdb_types; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t rsvd3; + struct mpi3mr_hdb_entry entry[1]; //Variable length Array +}; + +/** + * struct mpi3mr_bsg_out_repost_hdb - Repost host diagnostic + * buffer request payload data to the driver. + * + * @buf_type: Buffer type + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_out_repost_hdb { + uint8_t buf_type; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_bsg_out_upload_hdb - Upload host diagnostic + * buffer request payload data to the driver. + * + * @buf_type: Buffer type + * @rsvd1: Reserved + * @rsvd2: Reserved + * @start_offset: Start offset of the buffer from where to copy + * @length: Length of the buffer to copy + */ +struct mpi3mr_bsg_out_upload_hdb { + uint8_t buf_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t start_offset; + uint32_t length; +}; + +/** + * struct mpi3mr_bsg_out_refresh_hdb_triggers - Refresh host + * diagnostic buffer triggers request payload data to the driver. + * + * @page_type: Page type + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_out_refresh_hdb_triggers { + uint8_t page_type; + uint8_t rsvd1; + uint16_t rsvd2; +}; + +/** + * struct mpi3mr_bsg_drv_cmd - Generic bsg data + * structure for all driver specific requests. + * + * @mrioc_id: Controller ID + * @opcode: Driver specific opcode + * @rsvd1: Reserved + * @rsvd2: Reserved + */ +struct mpi3mr_bsg_drv_cmd { + uint8_t mrioc_id; + uint8_t opcode; + uint16_t rsvd1; + uint32_t rsvd2[4]; +}; +/** + * struct mpi3mr_bsg_in_reply_buf - MPI reply buffer returned + * for MPI Passthrough request . 
+ * + * @mpi_reply_type: Type of MPI reply + * @rsvd1: Reserved + * @rsvd2: Reserved + * @reply_buf: Variable Length buffer based on mpirep type + */ +struct mpi3mr_bsg_in_reply_buf { + uint8_t mpi_reply_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint8_t reply_buf[1]; /*Variable Length buffer based on mpirep type*/ +}; +/** + * struct mpi3mr_buf_entry - User buffer descriptor for MPI + * Passthrough requests. + * + * @buf_type: Buffer type + * @rsvd1: Reserved + * @rsvd2: Reserved + * @buf_len: Buffer length + */ +struct mpi3mr_buf_entry { + uint8_t buf_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t buf_len; +}; +/** + * struct mpi3mr_bsg_buf_entry_list - list of user buffer + * descriptor for MPI Passthrough requests. + * + * @num_of_entries: Number of buffer descriptors + * @rsvd1: Reserved + * @rsvd2: Reserved + * @rsvd3: Reserved + * @buf_entry: Variable length array of buffer descriptors + */ +struct mpi3mr_buf_entry_list { + uint8_t num_of_entries; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t rsvd3; + struct mpi3mr_buf_entry buf_entry[1]; +}; +/** + * struct mpi3mr_bsg_mptcmd - Generic bsg data + * structure for all MPI Passthrough requests . + * + * @mrioc_id: Controller ID + * @rsvd1: Reserved + * @timeout: MPI request timeout + * @buf_entry_list: Buffer descriptor list + */ +struct mpi3mr_bsg_mptcmd { + uint8_t mrioc_id; + uint8_t rsvd1; + uint16_t timeout; + uint32_t rsvd2; + struct mpi3mr_buf_entry_list buf_entry_list; +}; + +/** + * struct mpi3mr_bsg_packet - Generic bsg data + * structure for all supported requests . + * + * @cmd_type: represents drvrcmd or mptcmd + * @rsvd1: Reserved + * @rsvd2: Reserved + * @drvrcmd: driver request structure + * @mptcmd: mpt request structure + */ +struct mpi3mr_bsg_packet { + uint8_t cmd_type; + uint8_t rsvd1; + uint16_t rsvd2; + uint32_t rsvd3; + union { + struct mpi3mr_bsg_drv_cmd drvrcmd; + struct mpi3mr_bsg_mptcmd mptcmd; + } cmd; +}; + +#endif diff --git a/drivers/scsi/mpi3mr/mpi3mr_debug.h b/drivers/scsi/mpi3mr/mpi3mr_debug.h new file mode 100644 index 0000000000000..010dfcfac94b2 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_debug.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#ifndef MPI3SAS_DEBUG_H_INCLUDED + +#define MPI3SAS_DEBUG_H_INCLUDED + +/* + * debug levels + */ + +#define MPI3_DEBUG_EVENT 0x00000001 +#define MPI3_DEBUG_EVENT_WORK_TASK 0x00000002 +#define MPI3_DEBUG_INIT 0x00000004 +#define MPI3_DEBUG_EXIT 0x00000008 +#define MPI3_DEBUG_TM 0x00000010 +#define MPI3_DEBUG_RESET 0x00000020 +#define MPI3_DEBUG_SCSI_ERROR 0x00000040 +#define MPI3_DEBUG_REPLY 0x00000080 +#define MPI3_DEBUG_CFG_ERROR 0x00000100 +#define MPI3_DEBUG_TRANSPORT_ERROR 0x00000200 +#define MPI3_DEBUG_BSG_ERROR 0x00008000 +#define MPI3_DEBUG_BSG_INFO 0x00010000 +#define MPI3_DEBUG_SCSI_INFO 0x00020000 +#define MPI3_DEBUG_CFG_INFO 0x00040000 +#define MPI3_DEBUG_TRANSPORT_INFO 0x00080000 +#define MPI3_DEBUG 0x01000000 +#define MPI3_DEBUG_SG 0x02000000 + + +/* + * debug macros + */ + +#define ioc_err(ioc, fmt, ...) \ + pr_err("%s: " fmt, (ioc)->name, ##__VA_ARGS__) +#define ioc_notice(ioc, fmt, ...) \ + pr_notice("%s: " fmt, (ioc)->name, ##__VA_ARGS__) +#define ioc_warn(ioc, fmt, ...) \ + pr_warn("%s: " fmt, (ioc)->name, ##__VA_ARGS__) +#define ioc_info(ioc, fmt, ...) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__) + +#define dprint(ioc, fmt, ...) 
\ + do { \ + if (ioc->logging_level & MPI3_DEBUG) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_event_th(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_EVENT) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_event_bh(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_init(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_INIT) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_exit(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_EXIT) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_tm(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_TM) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_reply(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_REPLY) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_reset(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_RESET) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_scsi_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_SCSI_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_scsi_err(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_SCSI_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_scsi_command(ioc, SCMD, LOG_LEVEL) \ + do { \ + if (ioc->logging_level & LOG_LEVEL) \ + scsi_print_command(SCMD); \ + } while (0) + + +#define dprint_bsg_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_BSG_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_bsg_err(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_BSG_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_cfg_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_CFG_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_cfg_err(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_CFG_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) +#define dprint_transport_info(ioc, fmt, ...) \ + do { \ + if (ioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#define dprint_transport_err(ioc, fmt, ...) 
\ + do { \ + if (ioc->logging_level & MPI3_DEBUG_TRANSPORT_ERROR) \ + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__); \ + } while (0) + +#endif /* MPT3SAS_DEBUG_H_INCLUDED */ + +/** + * dprint_dump - print contents of a memory buffer + * @req: Pointer to a memory buffer + * @sz: Memory buffer size + * @namestr: Name String to identify the buffer type + */ +static inline void +dprint_dump(void *req, int sz, const char *name_string) +{ + int i; + __le32 *mfp = (__le32 *)req; + sz = sz/4; + + if (name_string) + pr_info("%s:\n\t", name_string); + else + pr_info("request:\n\t"); + for (i = 0; i < sz; i++) { + if (i && ((i % 8) == 0)) + pr_info("\n\t"); + pr_info("%08x ", le32_to_cpu(mfp[i])); + } + pr_info("\n"); +} + + diff --git a/drivers/scsi/mpi3mr/mpi3mr_debugfs.c b/drivers/scsi/mpi3mr/mpi3mr_debugfs.c new file mode 100644 index 0000000000000..62201826bcd9e --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_debugfs.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +#include "mpi3mr.h" + +#ifdef CONFIG_DEBUG_FS +#include + +struct dentry *mpi3mr_dbgfs_root; + +struct mpi3mr_debugfs_buffer { + void *buf; + u32 len; +}; + +static ssize_t +mpi3mr_debugfs_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) + +{ + struct mpi3mr_debugfs_buffer *debug = filp->private_data; + + if (!debug || !debug->buf) + return 0; + + return simple_read_from_buffer(ubuf, cnt, ppos, debug->buf, debug->len); +} + +static int +mpi3mr_debugfs_dmesg_open(struct inode *inode, struct file *file) +{ + struct mpi3mr_ioc *mrioc = inode->i_private; + struct mpi3mr_debugfs_buffer *debug; + + if (!mrioc->drv_diag_buffer) + return -EPERM; + + debug = kzalloc(sizeof(struct mpi3mr_debugfs_buffer), GFP_KERNEL); + if (!debug) + return -ENOMEM; + + debug->buf = (void *)mrioc->drv_diag_buffer + sizeof(struct mpi3_driver_buffer_header); + debug->len = mrioc->drv_diag_buffer_sz - sizeof(struct mpi3_driver_buffer_header); + + file->private_data = debug; + + return 0; +} + +static int +mpi3mr_debugfs_uefi_logs_open(struct inode *inode, struct file *file) +{ + struct mpi3mr_ioc *mrioc = inode->i_private; + struct mpi3mr_debugfs_buffer *debug; + + if (!mrioc->uefi_logs) + return -EPERM; + + debug = kzalloc(sizeof(struct mpi3mr_debugfs_buffer), GFP_KERNEL); + if (!debug) + return -ENOMEM; + + debug->buf = (void *)mrioc->uefi_logs; + debug->len = mrioc->uefi_logs_sz; + + file->private_data = debug; + + return 0; +} +static int +mpi3mr_debugfs_release(struct inode *inode, struct file *file) +{ + struct mpi3mr_debug_buffer *debug = file->private_data; + + if (!debug) + return 0; + + file->private_data = NULL; + kfree(debug); + return 0; +} + +static const struct file_operations mpi3mr_debugfs_dmesg_fops = { + .owner = THIS_MODULE, + .open = mpi3mr_debugfs_dmesg_open, + .read = mpi3mr_debugfs_read, + .release = mpi3mr_debugfs_release, +}; + +static const struct file_operations mpi3mr_debugfs_uefi_logs_fops = { + .owner = THIS_MODULE, + .open = mpi3mr_debugfs_uefi_logs_open, + .read = mpi3mr_debugfs_read, + .release = mpi3mr_debugfs_release, +}; + +/* + * mpi3mr_init_debugfs : Create debugfs root for mpi3mr driver + */ +void mpi3mr_init_debugfs(void) +{ + mpi3mr_dbgfs_root = debugfs_create_dir(MPI3MR_DRIVER_NAME, NULL); + if (!mpi3mr_dbgfs_root) + 
pr_info("Cannot create debugfs root\n"); +} + +/* + * mpi3mr_exit_debugfs : Remove debugfs root for mpi3mr driver + */ +void mpi3mr_exit_debugfs(void) +{ + debugfs_remove_recursive(mpi3mr_dbgfs_root); +} + +/* + * mpi3mr_setup_debugfs : Setup debugfs per adapter + * mrioc: Soft instance of adapter + */ +void +mpi3mr_setup_debugfs(struct mpi3mr_ioc *mrioc) +{ + char name[64]; + int i; + + snprintf(name, sizeof(name), "scsi_host%d", mrioc->shost->host_no); + + if (!mrioc->dbgfs_adapter) { + mrioc->dbgfs_adapter = + debugfs_create_dir(name, mpi3mr_dbgfs_root); + + if (!mrioc->dbgfs_adapter) { + ioc_err(mrioc, + "failed to create per adapter debugfs directory\n"); + return; + } + } + + for (i = 0; i < mrioc->num_queues; i++) { + snprintf(name, sizeof(name), "queue%d", mrioc->req_qinfo[i].qid); + mrioc->req_qinfo[i].dbgfs_req_queue = + debugfs_create_dir(name, mrioc->dbgfs_adapter); + + if (!mrioc->req_qinfo[i].dbgfs_req_queue) { + ioc_err(mrioc, + "failed to create per request queue debugfs directory\n"); + debugfs_remove_recursive(mrioc->dbgfs_adapter); + mrioc->dbgfs_adapter = NULL; + return; + } + + debugfs_create_u32("qfull_instances", 0444, + mrioc->req_qinfo[i].dbgfs_req_queue, + &mrioc->req_qinfo[i].qfull_instances); + + debugfs_create_u64("qfull_io_count", 0644, + mrioc->req_qinfo[i].dbgfs_req_queue, + &mrioc->req_qinfo[i].qfull_io_count); + } + + /* This interface to dump system logs in host space is for test/verify purpose only */ + snprintf(name, sizeof(name), "dmesg"); + mrioc->dmesg_dump = + debugfs_create_file(name, 0444, + mrioc->dbgfs_adapter, + mrioc, &mpi3mr_debugfs_dmesg_fops); + if (!mrioc->dmesg_dump) { + ioc_err(mrioc, "cannot create dmesg debugfs file\n"); + debugfs_remove(mrioc->dbgfs_adapter); + } + + snprintf(name, sizeof(name), "uefi_logs"); + mrioc->uefi_logs_dump = + debugfs_create_file(name, 0444, + mrioc->dbgfs_adapter, + mrioc, &mpi3mr_debugfs_uefi_logs_fops); + if (!mrioc->uefi_logs_dump) { + ioc_err(mrioc, "cannot create uefi debugfs file\n"); + debugfs_remove(mrioc->dbgfs_adapter); + } +} + +/* + * mpi3mr_destroy_debugfs : Destroy debugfs per adapter + * mrioc: Soft instance of adapter + */ +void mpi3mr_destroy_debugfs(struct mpi3mr_ioc *mrioc) +{ + debugfs_remove_recursive(mrioc->dbgfs_adapter); + mrioc->dbgfs_adapter = NULL; +} + +#else +void mpi3mr_init_debugfs(void) +{ +} +void mpi3mr_exit_debugfs(void) +{ +} +void mpi3mr_setup_debugfs(struct mpi3mr_ioc *mrioc) +{ +} +void mpi3mr_destroy_debugfs(struct mpi3mr_ioc *mrioc) +{ +} +#endif /*CONFIG_DEBUG_FS*/ diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c new file mode 100644 index 0000000000000..20bee864977c5 --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -0,0 +1,6778 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" +#include "mpi3mr_app.h" + +int poll_queues; +module_param(poll_queues, int, 0444); +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) +MODULE_PARM_DESC(poll_queues, "Number of queues for io_uring poll mode. 
(Range 1 - 126)"); +#else +MODULE_PARM_DESC(poll_queues, "This parameter is unused in this version of the kernel (Try kernel >= 5.13)"); +#endif + +bool enable_segqueue = true; +module_param(enable_segqueue, bool, 0444); +MODULE_PARM_DESC(enable_segqueue, + "Enable segmented operational request & reply queues in supported controllers (Default = 1)"); + +int drv_db_level = 1; +module_param(drv_db_level, int, 0444); +MODULE_PARM_DESC(drv_db_level, "Driver diagnostic buffer level(Default=1).\n\t\t" + "options:\n\t\t" + "0 = disabled: Driver diagnostic buffer not captured\n\t\t" + "1 = minidump: Driver diagnostic buffer captures prints\n\t\t" + "related to specific mrioc instance\n\t\t" + "2 = fulldump: Driver diagnostic buffer captures prints\n\t\t" + "related to specific mrioc instance and complete dmesg logs" + ); + +extern int enable_dix; + +static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd); + +#if defined(writeq) && defined(CONFIG_64BIT) +static inline void mpi3mr_writeq(__u64 b, volatile void __iomem *addr, + spinlock_t *write_queue_lock) +{ + writeq(b, addr); +} +#else +static inline void mpi3mr_writeq(__u64 b, volatile void __iomem *addr, + spinlock_t *write_queue_lock) +{ + __u64 data_out = b; + unsigned long flags; + + spin_lock_irqsave(write_queue_lock, flags); + + writel((u32)(data_out), addr); + writel((u32)(data_out >> 32), (addr + 4)); + + spin_unlock_irqrestore(write_queue_lock, flags); +} +#endif + +#if defined(readq) && defined(CONFIG_64BIT) +static inline __u64 mpi3mr_readq(const volatile void __iomem *addr) +{ + return readq(addr); +} +#else +static inline __u64 mpi3mr_readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + return low + ((u64)high << 32); +} +#endif +/** + * mpi3mr_check_req_qfull - Check request queue is full or not + * @op_req_q: Operational reply queue info + * + * Return: true when queue full, false otherwise. + */ +static inline bool +mpi3mr_check_req_qfull(struct op_req_qinfo *op_req_q) +{ + u16 pi, ci, max_entries; + bool is_qfull = false; + + pi = op_req_q->pi; + ci = READ_ONCE(op_req_q->ci); + max_entries = op_req_q->num_requests; + + if ((ci == (pi + 1)) || ((!ci) && (pi == (max_entries - 1)))) + is_qfull = true; + + return is_qfull; +} + +/** + * mpi3mr_sync_irqs - Synchronize all IRQs + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +static void mpi3mr_sync_irqs(struct mpi3mr_ioc *mrioc) +{ + u16 i, max_vectors; + + max_vectors = mrioc->intr_info_count; + + for (i = 0; i < max_vectors; i++) + synchronize_irq(pci_irq_vector(mrioc->pdev, i)); +} + +/** + * mpi3mr_ioc_disable_intr - Disable controller interrupts + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +void mpi3mr_ioc_disable_intr(struct mpi3mr_ioc *mrioc) +{ + mrioc->intr_enabled = 0; + mpi3mr_sync_irqs(mrioc); +} + +/** + * mpi3mr_ioc_enable_intr - Enable controller interrupts + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +void mpi3mr_ioc_enable_intr(struct mpi3mr_ioc *mrioc) +{ + mrioc->intr_enabled = 1; +} + +/** + * mpi3mr_cleanup_isr - Cleanup IRQs + * @mrioc: Adapter instance reference + * + * Disable interrupts, Free all IRQs, free memory for interrupt + * information and free IRQ vectors. + * + * Return: Nothing. 
+ */ +static void mpi3mr_cleanup_isr(struct mpi3mr_ioc *mrioc) +{ + u16 i; + + mpi3mr_ioc_disable_intr(mrioc); + + if (!mrioc->intr_info) + return; + + for (i = 0; i < mrioc->intr_info_count; i++) + free_irq(pci_irq_vector(mrioc->pdev, i), + (mrioc->intr_info + i)); + + kfree(mrioc->intr_info); + mrioc->intr_info = NULL; + mrioc->intr_info_count = 0; + mrioc->is_intr_info_set = false; + pci_free_irq_vectors(mrioc->pdev); +} + +/** + * mpi3mr_add_sg_single - Build a scatter gather element(sge) + * @paddr: SGE address + * @flags: SGE flags + * @length: SGE length + * @dma_addr: DMA address + * + * Set the SGE element in the given paddr. + * + * Return: Nothing. + */ +void mpi3mr_add_sg_single(void *paddr, u8 flags, u32 length, + dma_addr_t dma_addr) +{ + struct mpi3_sge_common *sgel = paddr; + + sgel->flags = flags; + sgel->length = cpu_to_le32(length); + sgel->address = cpu_to_le64(dma_addr); +} + +/** + * mpi3mr_build_zero_len_sge - Build zero length SGE + * @paddr: SGE address + * + * Set the length of SGE as 0 and address as all FFs to indicate + * this is a zero length SGE (for no data transfer). + * + * Return: Nothing. + */ +void mpi3mr_build_zero_len_sge(void *paddr) +{ + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + mpi3mr_add_sg_single(paddr, sgl_flags, 0, -1); + +} + +/** + * mpi3mr_get_reply_virt_addr - Map reply buffer DMA address + * @mrioc: Adapter instance reference + * @phys_addr: reply buffer DMA address + * + * Map reply buffer DMA address to virtual address. + * + * Return: NULL on failure, virtual address on success. + */ +void *mpi3mr_get_reply_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr) +{ + if (!phys_addr) + return NULL; + + if ((phys_addr < mrioc->reply_buf_dma) || + (phys_addr > mrioc->reply_buf_dma_max_address)) + return NULL; + + return mrioc->reply_buf + (phys_addr - mrioc->reply_buf_dma); +} + +/** + * mpi3mr_get_sensebuf_virt_addr - Map sense buffer DMA address + * @mrioc: Adapter instance reference + * @phys_addr: Sense buffer DMA address + * + * Map sense buffer DMA address to virtual address. + * + * Return: NULL on failure, virtual address on success. + */ +void *mpi3mr_get_sensebuf_virt_addr(struct mpi3mr_ioc *mrioc, + dma_addr_t phys_addr) +{ + if (!phys_addr) + return NULL; + + return mrioc->sense_buf + (phys_addr - mrioc->sense_buf_dma); +} +/** + * mpi3mr_repost_reply_buf - Post replybuffer to queue + * @mrioc: Adapter instance reference + * @reply_dma: Reply buffer DMA address + * + * Store the reply buffer DMA address into a free element in the + * reply buffer free queue and write the host index to the + * reply_free_host_index to let the hardware know a free reply + * buffer is available. + * + * Return: Nothing. + */ +static void mpi3mr_repost_reply_buf(struct mpi3mr_ioc *mrioc, + u64 reply_dma) +{ + u32 old_idx = 0; + unsigned long flags; + + spin_lock_irqsave(&mrioc->reply_free_queue_lock, flags); + old_idx = mrioc->reply_free_queue_host_index; + mrioc->reply_free_queue_host_index = ( + (mrioc->reply_free_queue_host_index == + (mrioc->reply_free_qsz - 1)) ? 
0 : + (mrioc->reply_free_queue_host_index + 1)); + mrioc->reply_free_q[old_idx] = cpu_to_le64(reply_dma); + writel(mrioc->reply_free_queue_host_index, + &mrioc->sysif_regs->reply_free_host_index); + spin_unlock_irqrestore(&mrioc->reply_free_queue_lock, flags); +} + +/** + * mpi3mr_repost_sense_buf - Post sensebuffer to queue + * @mrioc: Adapter instance reference + * @sense_buf_dma: Sense buffer DMA address + * + * Store the sense buffer DMA address into a free element in the + * sense buffer free queue and write the host index to the + * sense_buffer_free_host_index to let the hardware know a free + * buffer is available. + * + * Return: Nothing. + */ +void mpi3mr_repost_sense_buf(struct mpi3mr_ioc *mrioc, + u64 sense_buf_dma) +{ + u32 old_idx = 0; + unsigned long flags; + + spin_lock_irqsave(&mrioc->sbq_lock, flags); + old_idx = mrioc->sbq_host_index; + mrioc->sbq_host_index = ((mrioc->sbq_host_index == + (mrioc->sense_buf_q_sz - 1)) ? 0 : + (mrioc->sbq_host_index + 1)); + mrioc->sense_buf_q[old_idx] = cpu_to_le64(sense_buf_dma); + writel(mrioc->sbq_host_index, + &mrioc->sysif_regs->sense_buffer_free_host_index); + spin_unlock_irqrestore(&mrioc->sbq_lock, flags); +} + + +/** + * mpi3mr_print_event_data - Print event details + * @mrioc: Adapter instance reference + * @event_reply: MPI3 event + * + * Prints the event details when debug level is enabled to print + * events. + * + * Return: Nothing. + */ +static void mpi3mr_print_event_data(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + char *desc = NULL; + u16 event; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT)) + return; + event = event_reply->event; + + switch (event) { + case MPI3_EVENT_LOG_DATA: + desc = "Log Data"; + break; + case MPI3_EVENT_CHANGE: + desc = "Event Change"; + break; + case MPI3_EVENT_GPIO_INTERRUPT: + desc = "GPIO Interrupt"; + break; + case MPI3_EVENT_CABLE_MGMT: + desc = "Cable Management"; + break; + case MPI3_EVENT_ENERGY_PACK_CHANGE: + desc = "Energy Pack Change"; + break; + case MPI3_EVENT_DEVICE_ADDED: + { + struct mpi3_device_page0 *event_data = + (struct mpi3_device_page0 *)event_reply->event_data; + ioc_info(mrioc, "Device Added: handle(0x%04x), perst_id(%d), form(0x%02x)\n", + le16_to_cpu(event_data->dev_handle), + le16_to_cpu(event_data->persistent_id), + event_data->device_form); + return; + } + case MPI3_EVENT_DEVICE_INFO_CHANGED: + { + struct mpi3_device_page0 *event_data = + (struct mpi3_device_page0 *)event_reply->event_data; + ioc_info(mrioc, "Device Info Changed: handle(0x%04x), perst_id(%d), form(0x%02x)\n", + le16_to_cpu(event_data->dev_handle), + le16_to_cpu(event_data->persistent_id), + event_data->device_form); + return; + } + case MPI3_EVENT_DEVICE_STATUS_CHANGE: + { + struct mpi3_event_data_device_status_change *event_data = + (struct mpi3_event_data_device_status_change *) + event_reply->event_data; + ioc_info(mrioc, "Device Status Change: handle(0x%04x), reason_code(0x%02x)\n", + le16_to_cpu(event_data->dev_handle), event_data->reason_code); + return; + } + case MPI3_EVENT_SAS_DISCOVERY: + { + struct mpi3_event_data_sas_discovery *event_data = + (struct mpi3_event_data_sas_discovery *) + event_reply->event_data; + ioc_info(mrioc, "SAS Discovery: (%s) status (0x%08x)", + (event_data->reason_code == MPI3_EVENT_SAS_DISC_RC_STARTED) + ? 
"start" : "stop", + le32_to_cpu(event_data->discovery_status)); + return; + } + case MPI3_EVENT_SAS_BROADCAST_PRIMITIVE: + desc = "SAS Broadcast Primitive"; + break; + case MPI3_EVENT_SAS_NOTIFY_PRIMITIVE: + desc = "SAS Notify Primitive"; + break; + case MPI3_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE: + desc = "SAS Init Device Status Change"; + break; + case MPI3_EVENT_SAS_INIT_TABLE_OVERFLOW: + desc = "SAS Init Table Overflow"; + break; + case MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST: + desc = "SAS Topology Change List"; + break; + case MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE: + desc = "Enclosure Device Status Change"; + break; + case MPI3_EVENT_ENCL_DEVICE_ADDED: + desc = "Enclosure Added"; + break; + case MPI3_EVENT_HARD_RESET_RECEIVED: + desc = "Hard Reset Received"; + break; + case MPI3_EVENT_SAS_PHY_COUNTER: + desc = "SAS PHY Counter"; + break; + case MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR: + desc = "SAS Device Discovery Error"; + break; + case MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: + desc = "PCIE Topology Change List"; + break; + case MPI3_EVENT_PCIE_ENUMERATION: + { + struct mpi3_event_data_pcie_enumeration *event_data = + (struct mpi3_event_data_pcie_enumeration *) + event_reply->event_data; + ioc_info(mrioc, "PCIE Enumeration: (%s)", + (event_data->reason_code == + MPI3_EVENT_PCIE_ENUM_RC_STARTED) ? "start" : "stop"); + if (event_data->enumeration_status) + ioc_info(mrioc, "enumeration_status(0x%08x)\n", + le32_to_cpu(event_data->enumeration_status)); + return; + } + case MPI3_EVENT_PREPARE_FOR_RESET: + desc = "Prepare For Reset"; + break; + case MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE: + desc = "Diagnostic Buffer Status Change"; + break; + } + + if (!desc) + return; + + ioc_info(mrioc, "%s\n", desc); +} + +/** + * mpi3mr_handle_events - Handle events + * @mrioc: Adapter instance reference + * @def_reply: MPI3 default reply + * + * Prints the event details and call the consumer of the events. + * + * Return: Nothing. + */ +static void mpi3mr_handle_events(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply *def_reply) +{ + struct mpi3_event_notification_reply *event_reply = + (struct mpi3_event_notification_reply *)def_reply; + + mrioc->change_count = le16_to_cpu(event_reply->ioc_change_count); + mpi3mr_print_event_data(mrioc, event_reply); + + mpi3mr_os_handle_events(mrioc, event_reply); +} + +/** + * mpi3mr_get_drv_cmd - Get driver command from host tag + * @mrioc: Adapter instance reference + * @host_tag: Host tag + * @def_reply: MPI3 default reply + * + * Checks the host tag and if it is driver's internal identify + * the corresponding command tracker reference and return. If + * the hosttag is invalid then it is an MPI3 event and event + * processing routine is called and NULL is returned. If the + * host tag is unindentifiable then also NULL is returned. + * + * Return: Null for events/failure or internal command tracker. 
+ */ +static struct mpi3mr_drv_cmd * +mpi3mr_get_drv_cmd(struct mpi3mr_ioc *mrioc, u16 host_tag, + struct mpi3_default_reply *def_reply) +{ + u16 idx; + + switch (host_tag) { + case MPI3MR_HOSTTAG_INITCMDS: + return &mrioc->init_cmds; + case MPI3MR_HOSTTAG_CFG_CMDS: + return &mrioc->cfg_cmds; + case MPI3MR_HOSTTAG_BSG_CMDS: + return &mrioc->bsg_cmds; + case MPI3MR_HOSTTAG_BLK_TMS: + return &mrioc->host_tm_cmds; + case MPI3MR_HOSTTAG_PEL_ABORT: + return &mrioc->pel_abort_cmd; + case MPI3MR_HOSTTAG_PEL_WAIT: + return &mrioc->pel_cmds; + case MPI3MR_HOSTTAG_TRANSPORT_CMDS: + return &mrioc->transport_cmds; + case MPI3MR_HOSTTAG_INVALID: + if (def_reply && def_reply->function == + MPI3_FUNCTION_EVENT_NOTIFICATION) + mpi3mr_handle_events(mrioc, def_reply); + return NULL; + default: + break; + } + if (host_tag >= MPI3MR_HOSTTAG_DEVRMCMD_MIN && + host_tag <= MPI3MR_HOSTTAG_DEVRMCMD_MAX) { + idx = host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + return &mrioc->dev_rmhs_cmds[idx]; + } + if (host_tag >= MPI3MR_HOSTTAG_SYSFS_TM_MIN && + host_tag <= MPI3MR_HOSTTAG_SYSFS_TM_MAX) { + idx = host_tag - MPI3MR_HOSTTAG_SYSFS_TM_MIN; + return &mrioc->sysfs_tm_cmds[idx]; + } + if (host_tag >= MPI3MR_HOSTTAG_EVTACKCMD_MIN && + host_tag <= MPI3MR_HOSTTAG_EVTACKCMD_MAX) { + idx = host_tag - MPI3MR_HOSTTAG_EVTACKCMD_MIN; + return &mrioc->evtack_cmds[idx]; + } + + return NULL; +} + +/** + * mpi3mr_process_admin_reply_desc - Admin reply descriptor + * handler + * @mrioc: Adapter instance reference + * @reply_desc: Reply descriptor + * @reply_dma: Place holder for reply frames dma + * + * Checks the type of the reply descriptor and infer the + * descriptor as defined in MPI3.0 specification and wake any of + * the functions waiting for the reply. + * + * Return: Nothing. + */ +static void mpi3mr_process_admin_reply_desc(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply_descriptor *reply_desc, u64 *reply_dma) +{ + u16 reply_desc_type, host_tag = 0; + u16 ioc_status = MPI3_IOCSTATUS_SUCCESS; + u32 ioc_loginfo = 0; + struct mpi3_status_reply_descriptor *status_desc; + struct mpi3_address_reply_descriptor *addr_desc; + struct mpi3_success_reply_descriptor *success_desc; + struct mpi3_default_reply *def_reply = NULL; + struct mpi3mr_drv_cmd *cmdptr = NULL; + struct mpi3_scsi_io_reply *scsi_reply; + u8 *sense_buf = NULL; + + *reply_dma = 0; + reply_desc_type = le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_TYPE_MASK; + switch (reply_desc_type) { + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_STATUS: + status_desc = (struct mpi3_status_reply_descriptor *)reply_desc; + host_tag = le16_to_cpu(status_desc->host_tag); + ioc_status = le16_to_cpu(status_desc->ioc_status); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(status_desc->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_ADDRESS_REPLY: + addr_desc = (struct mpi3_address_reply_descriptor *)reply_desc; + *reply_dma = le64_to_cpu(addr_desc->reply_frame_address); + def_reply = mpi3mr_get_reply_virt_addr(mrioc, *reply_dma); + if (!def_reply) + goto out; + host_tag = le16_to_cpu(def_reply->host_tag); + ioc_status = le16_to_cpu(def_reply->ioc_status); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(def_reply->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + if (def_reply->function == 
MPI3_FUNCTION_SCSI_IO) { + scsi_reply = (struct mpi3_scsi_io_reply *)def_reply; + sense_buf = mpi3mr_get_sensebuf_virt_addr(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); + } + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_SUCCESS: + success_desc = (struct mpi3_success_reply_descriptor *) + reply_desc; + host_tag = le16_to_cpu(success_desc->host_tag); + break; + default: + break; + } + + cmdptr = mpi3mr_get_drv_cmd(mrioc, host_tag, def_reply); + if (cmdptr) { + if (cmdptr->state & MPI3MR_CMD_PENDING) { + cmdptr->state |= MPI3MR_CMD_COMPLETE; + cmdptr->ioc_loginfo = ioc_loginfo; + cmdptr->ioc_status = ioc_status; + cmdptr->state &= ~MPI3MR_CMD_PENDING; + if (def_reply) { + cmdptr->state |= MPI3MR_CMD_REPLY_VALID; + memcpy((u8 *)cmdptr->reply, (u8 *)def_reply, + mrioc->reply_sz); + } + if (sense_buf && cmdptr->sensebuf) { + cmdptr->is_sense = 1; + memcpy(cmdptr->sensebuf, sense_buf, + MPI3MR_SENSE_BUF_SZ); + } + if (cmdptr->is_waiting) { + complete(&cmdptr->done); + cmdptr->is_waiting = 0; + } else if (cmdptr->callback) + cmdptr->callback(mrioc, cmdptr); + } + } +out: + if (sense_buf) + mpi3mr_repost_sense_buf(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); +} + +/** + * mpi3mr_process_admin_reply_q - Admin reply queue handler + * @mrioc: Adapter instance reference + * + * Checks the admin reply queue and drains the reply queue until + * the queue is empty and process the individual reply + * descriptors. Post the controller with proper consumer index. + * + * Return: Number of reply descriptors processed. + */ +int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc) +{ + u32 exp_phase = mrioc->admin_reply_ephase; + u32 admin_reply_ci = mrioc->admin_reply_ci; + u32 num_admin_replies = 0; + u64 reply_dma = 0; + struct mpi3_default_reply_descriptor *reply_desc; + + if (!atomic_add_unless(&mrioc->admin_reply_q_in_use, 1, 1)) + return 0; + + reply_desc = (struct mpi3_default_reply_descriptor *) + mrioc->admin_reply_base + admin_reply_ci; + + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) { + atomic_dec(&mrioc->admin_reply_q_in_use); + return 0; + } + + do { + if (mrioc->unrecoverable) + break; + mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci); + mpi3mr_process_admin_reply_desc(mrioc, reply_desc, &reply_dma); + if (reply_dma) + mpi3mr_repost_reply_buf(mrioc, reply_dma); + num_admin_replies++; + if (++admin_reply_ci == mrioc->num_admin_replies) { + admin_reply_ci = 0; + exp_phase ^= 1; + } + reply_desc = + (struct mpi3_default_reply_descriptor *) + mrioc->admin_reply_base + admin_reply_ci; + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) + break; + } while (1); + + writel(admin_reply_ci, &mrioc->sysif_regs->admin_reply_queue_ci); + mrioc->admin_reply_ci = admin_reply_ci; + mrioc->admin_reply_ephase = exp_phase; + atomic_dec(&mrioc->admin_reply_q_in_use); + + return num_admin_replies; +} + +/** + * mpi3mr_get_reply_desc - Get reply descriptor + * @op_reply_q: Operational reply queue info + * @reply_ci: Operational reply queue consumer index + * + * Get reply descriptor frame corresponding to a operational + * reply queue's consumer index + * + * Return: Reply descriptor address + */ +static inline struct mpi3_default_reply_descriptor * +mpi3mr_get_reply_desc(struct op_reply_qinfo *op_reply_q, u32 reply_ci) +{ + void *segment_base_addr; + struct segments *segments = op_reply_q->q_segments; + struct 
mpi3_default_reply_descriptor *reply_desc = NULL; + + segment_base_addr = + segments[reply_ci / op_reply_q->segment_qd].segment; + reply_desc = (struct mpi3_default_reply_descriptor *)segment_base_addr + + (reply_ci % op_reply_q->segment_qd); + return reply_desc; +} + +/** + * mpi3mr_process_op_reply_q - Operational reply queue handler + * @mrioc: Adapter instance reference + * @op_reply_q: Operational reply queue info + * + * Checks the specific operational reply queue and drains the + * reply queue entries until the queue is empty and process the + * individual reply descriptors. + * + * Return: 0 if queue is already processed,or number of reply + * descriptors processed. + */ +int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc, + struct op_reply_qinfo *op_reply_q) +{ + struct op_req_qinfo *op_req_q; + u32 exp_phase; + u32 reply_ci; + u32 num_op_reply = 0; + u64 reply_dma = 0; + struct mpi3_default_reply_descriptor *reply_desc; + u16 req_q_idx = 0, reply_qidx; + + reply_qidx = op_reply_q->qid - 1; + + if (!atomic_add_unless(&op_reply_q->in_use, 1, 1)) + return 0; + + exp_phase = op_reply_q->ephase; + reply_ci = op_reply_q->ci; + + reply_desc = mpi3mr_get_reply_desc(op_reply_q, reply_ci); + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) { + atomic_dec(&op_reply_q->in_use); + return 0; + } + + do { + if (mrioc->unrecoverable) + break; + req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1; + op_req_q = &mrioc->req_qinfo[req_q_idx]; + + WRITE_ONCE(op_req_q->ci, + le16_to_cpu(reply_desc->request_queue_ci)); + + mpi3mr_process_op_reply_desc(mrioc, reply_desc, &reply_dma, + reply_qidx); + atomic_dec(&op_reply_q->pend_ios); + +#if defined(IO_COUNTER_SUPPORT) + atomic_dec(&mrioc->pend_ios); +#endif + if (reply_dma) + mpi3mr_repost_reply_buf(mrioc, reply_dma); + num_op_reply++; + + if (++reply_ci == op_reply_q->num_replies) { + reply_ci = 0; + exp_phase ^= 1; + } + + reply_desc = mpi3mr_get_reply_desc(op_reply_q, reply_ci); + + if ((le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase) + break; +#ifndef CONFIG_PREEMPT_RT + /* + * Exit completion loop to avoid CPU lockup + * Ensure remaining completion happens from threaded ISR. + */ + if (num_op_reply > mrioc->max_host_ios) { + op_reply_q->enable_irq_poll = true; + break; + } +#endif + } while (1); + + + writel(reply_ci, + &mrioc->sysif_regs->oper_queue_indexes[reply_qidx].consumer_index); + op_reply_q->ci = reply_ci; + op_reply_q->ephase = exp_phase; + atomic_dec(&op_reply_q->in_use); + + return num_op_reply; +} + +#if (KERNEL_VERSION(5, 12, 0) <= LINUX_VERSION_CODE) +/** + * mpi3mr_blk_mq_poll - Operational reply queue handler + * @shost: SCSI Host reference + * @queue_num: Request queue number (w.r.t OS it is hardware context number) + * + * Checks the specific operational reply queue and drains the + * reply queue entries until the queue is empty and process the + * individual reply descriptors. + * + * Return: 0 if queue is already processed,or number of reply + * descriptors processed. 
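+ *
+ * This services the blk-mq poll path (io_uring polled I/O): the
+ * hardware context number selects the matching entry in
+ * op_reply_qinfo[], and polling is skipped while a reset is in
+ * progress, a reset is being prepared or the controller is
+ * unrecoverable.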
+ */ +int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) +{ + int num_entries = 0; + struct mpi3mr_ioc *mrioc; + + mrioc = (struct mpi3mr_ioc *)shost->hostdata; + + if ((mrioc->reset_in_progress || mrioc->prepare_for_reset + || mrioc->unrecoverable)) + return 0; + + num_entries = mpi3mr_process_op_reply_q(mrioc, + &mrioc->op_reply_qinfo[queue_num]); + + return num_entries; +} +#else +int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) +{ + return 0; +} +#endif +/** + * mpi3mr_isr_primary - Interrupt Handler worker + * @irq: IRQ + * @privdata: Interrupt info + * + * Checks for the MSIx index, if it is 0 calls admin reply queue + * processing routine. If the MSIx has an associated operational + * reply queue, the operational reply processing routine is + * called too. + * + * Return: IRQ_NONE or IRQ_HANDLED + */ +static irqreturn_t mpi3mr_isr_primary(int irq, void *privdata) +{ + struct mpi3mr_intr_info *intr_info = privdata; + struct mpi3mr_ioc *mrioc; + u16 midx; + u32 num_admin_replies = 0, num_op_reply = 0; + + if (!intr_info) + return IRQ_NONE; + + mrioc = intr_info->mrioc; + + if (!mrioc->intr_enabled) + return IRQ_NONE; + + midx = intr_info->msix_index; + + if (!midx) + num_admin_replies = mpi3mr_process_admin_reply_q(mrioc); + if (intr_info->op_reply_q) + num_op_reply = mpi3mr_process_op_reply_q(mrioc, + intr_info->op_reply_q); + + if (num_admin_replies || num_op_reply) + return IRQ_HANDLED; + else + return IRQ_NONE; +} + +#ifndef CONFIG_PREEMPT_RT +/** + * mpi3mr_isr - Interrupt Handler + * @irq: IRQ + * @privdata: Interrupt info + * + * Executes reply queue draining and processing of reply + * descriptors by calling mpi3mr_isr_primary and if more replies + * are exepcted, schedule an IRQ polling thread. + * + * Return: IRQ_NONE or IRQ_HANDLED or IRQ_WAKE_THREAD + */ +static irqreturn_t mpi3mr_isr(int irq, void *privdata) +{ + struct mpi3mr_intr_info *intr_info = privdata; + struct mpi3mr_ioc *mrioc; + u16 midx; + int ret; + + if (!intr_info) + return IRQ_NONE; + + mrioc = intr_info->mrioc; + midx = intr_info->msix_index; + /* Call primary ISR routine */ + ret = mpi3mr_isr_primary(irq, privdata); + + /* + * If more IOs are expected, schedule IRQ polling thread. + * Otherwise exit from ISR. + */ + if (!intr_info->op_reply_q) + return ret; + + if (!intr_info->op_reply_q->enable_irq_poll || + !atomic_read(&intr_info->op_reply_q->pend_ios)) + return ret; + + disable_irq_nosync(pci_irq_vector(mrioc->pdev, midx)); + + return IRQ_WAKE_THREAD; +} + +/** + * mpi3mr_isr_poll - ISR thread. Reply queue polling routine + * @irq: IRQ + * @privdata: Interrupt info + * + * Threaded ISR, polls for pending I/O completions in a loop + * until pending I/Os present or controller queue depth I/Os are + * processed. 
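+ *
+ * Between passes the thread sleeps for at least MPI3MR_IRQ_POLL_SLEEP
+ * microseconds; on exit it clears enable_irq_poll and re-enables the
+ * MSI-x vector that mpi3mr_isr() had disabled before returning
+ * IRQ_WAKE_THREAD.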
+ * + * Return: IRQ_NONE or IRQ_HANDLED + */ +static irqreturn_t mpi3mr_isr_poll(int irq, void *privdata) +{ + struct mpi3mr_intr_info *intr_info = privdata; + struct mpi3mr_ioc *mrioc; + u16 midx; + u32 num_op_reply = 0; + + if (!intr_info || !intr_info->op_reply_q) + return IRQ_NONE; + + mrioc = intr_info->mrioc; + midx = intr_info->msix_index; + + /* Poll for pending IOs completions */ + do { + if (!mrioc->intr_enabled || mrioc->unrecoverable) + break; + + if (!midx) + mpi3mr_process_admin_reply_q(mrioc); + if (intr_info->op_reply_q) + num_op_reply += + mpi3mr_process_op_reply_q(mrioc, + intr_info->op_reply_q); + + usleep_range(MPI3MR_IRQ_POLL_SLEEP, 10 * MPI3MR_IRQ_POLL_SLEEP); + + } while (atomic_read(&intr_info->op_reply_q->pend_ios) && + (num_op_reply < mrioc->max_host_ios)); + + /*SP2DO - There can be some IO timeout/pause if driver exit above loop + * because of num_orep < mrioc->max_host_ios check. + * It will happen only if application has stopped IO activity and above + * check this just prior to application stopped. Very difficult to + * reproduce. Investigate and fix this area later. + */ + intr_info->op_reply_q->enable_irq_poll = false; + enable_irq(pci_irq_vector(mrioc->pdev, midx)); + + return IRQ_HANDLED; +} +#endif + +/** + * mpi3mr_request_irq - Request IRQ and register ISR + * @mrioc: Adapter instance reference + * @index: IRQ vector index + * + * Request threaded ISR with primary ISR and secondary + * + * Return: 0 on success and non zero on failures. + */ +static inline int mpi3mr_request_irq(struct mpi3mr_ioc *mrioc, u16 index) +{ + struct pci_dev *pdev = mrioc->pdev; + struct mpi3mr_intr_info *intr_info = mrioc->intr_info + index; + int retval = 0; + + intr_info->mrioc = mrioc; + intr_info->msix_index = index; + intr_info->op_reply_q = NULL; + + snprintf(intr_info->name, MPI3MR_NAME_LENGTH, "%s%d-msix%d", + mrioc->driver_name, mrioc->id, index); + +#ifndef CONFIG_PREEMPT_RT + retval = request_threaded_irq(pci_irq_vector(pdev, index), mpi3mr_isr, + mpi3mr_isr_poll, IRQF_SHARED, intr_info->name, intr_info); +#else + retval = request_threaded_irq(pci_irq_vector(pdev, index), mpi3mr_isr_primary, + NULL, IRQF_SHARED, intr_info->name, intr_info); +#endif + if (retval) { + ioc_err(mrioc, "%s: unable to allocate interrupt %d!\n", + intr_info->name, pci_irq_vector(pdev, index)); + return retval; + } + + return retval; +} + +static void mpi3mr_calc_poll_queues(struct mpi3mr_ioc *mrioc, u16 max_vectors) +{ + if (!mrioc->requested_poll_qcount) + return; + + /* Reserved for Admin and Default Queue */ + if (max_vectors > 2 && + (mrioc->requested_poll_qcount < max_vectors - 2)) { + ioc_info(mrioc, + "enabled polled queues (%d) msix (%d)\n", + mrioc->requested_poll_qcount, max_vectors); + } else { + ioc_info(mrioc, + "disabled polled queues (%d) msix (%d) because of no resources for default queue\n", + mrioc->requested_poll_qcount, max_vectors); + mrioc->requested_poll_qcount = 0; + } +} + +/** + * mpi3mr_setup_isr - Setup ISR for the controller + * @mrioc: Adapter instance reference + * @setup_one: Request one IRQ or more + * + * Allocate IRQ vectors and call mpi3mr_request_irq to setup ISR + * + * Return: 0 on success and non zero on failures. 
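+ *
+ * When more than one vector is requested the budget is
+ *   max_vectors = min(cpu_count + 1 + requested_poll_qcount, msix_count)
+ * with one pre-vector reserved for the admin/default queue and the
+ * requested poll queues kept out of affinity spreading through
+ * irq_affinity post_vectors. For example, a 16-core host with
+ * poll_queues=2 and 128 available MSI-x vectors requests up to
+ * 16 + 1 + 2 = 19 vectors.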
+ */ +static int mpi3mr_setup_isr(struct mpi3mr_ioc *mrioc, u8 setup_one) +{ + unsigned int irq_flags = PCI_IRQ_MSIX; + int max_vectors, min_vec; + int retval; + int i; + struct irq_affinity desc = { .pre_vectors = 1, .post_vectors = 1 }; + + if (mrioc->is_intr_info_set) + return 0; + + mpi3mr_cleanup_isr(mrioc); + + if (setup_one || reset_devices) { + max_vectors = 1; + retval = pci_alloc_irq_vectors(mrioc->pdev, + 1, max_vectors, irq_flags); + if (retval < 0) { + ioc_err(mrioc, "cannot allocate irq vectors, ret %d\n", + retval); + goto out_failed; + } + } else { + max_vectors = + min_t(int, mrioc->cpu_count + 1 + + mrioc->requested_poll_qcount, mrioc->msix_count); + + mpi3mr_calc_poll_queues(mrioc, max_vectors); + + ioc_info(mrioc, + "MSI-X vectors supported: %d, no of cores: %d,", + mrioc->msix_count, mrioc->cpu_count); + ioc_info(mrioc, + "MSI-x vectors requested: %d poll_queues %d\n", + max_vectors, mrioc->requested_poll_qcount); + + desc.post_vectors = mrioc->requested_poll_qcount; + min_vec = desc.pre_vectors + desc.post_vectors; + irq_flags |= PCI_IRQ_AFFINITY | PCI_IRQ_ALL_TYPES; + + retval = pci_alloc_irq_vectors_affinity(mrioc->pdev, + min_vec, max_vectors, irq_flags, &desc); + + if (retval < 0) { + ioc_err(mrioc, "cannot allocate irq vectors, ret %d\n", + retval); + goto out_failed; + } + + + /* + * If only one MSI-x is allocated, then MSI-x 0 will be shared + * between Admin queue and operational queue + */ + if (retval == min_vec) + mrioc->op_reply_q_offset = 0; + else if (retval != (max_vectors)) { + ioc_info(mrioc, + "allocated vectors (%d) are less than configured (%d)\n", + retval, max_vectors); + } + + max_vectors = retval; + mrioc->op_reply_q_offset = (max_vectors > 1) ? 1 : 0; + + mpi3mr_calc_poll_queues(mrioc, max_vectors); + + } + + mrioc->intr_info = kzalloc(sizeof(struct mpi3mr_intr_info)*max_vectors, + GFP_KERNEL); + if (!mrioc->intr_info) { + retval = -ENOMEM; + pci_free_irq_vectors(mrioc->pdev); + goto out_failed; + } + for (i = 0; i < max_vectors; i++) { + retval = mpi3mr_request_irq(mrioc, i); + if (retval) { + mrioc->intr_info_count = i; + goto out_failed; + } + } + if (reset_devices || !setup_one) + mrioc->is_intr_info_set = true; + mrioc->intr_info_count = max_vectors; + mpi3mr_ioc_enable_intr(mrioc); + return 0; +out_failed: + mpi3mr_cleanup_isr(mrioc); + + return retval; +} + +static const struct { + enum mpi3mr_drv_db_level value; + char *name; +} mpi3mr_drv_db[] = { + { MRIOC_DRV_DB_DISABLED, "disabled (uefi dump is enabled)" }, + { MRIOC_DRV_DB_MINI, "minidump" }, + { MRIOC_DRV_DB_FULL, "fulldump" }, +}; +static const char *mpi3mr_drv_db_name(enum mpi3mr_drv_db_level drv_db_level) +{ + int i; + char *name = NULL; + + /* Start with Disabled */ + name = mpi3mr_drv_db[0].name; + + for (i = 0; i < ARRAY_SIZE(mpi3mr_drv_db); i++) { + if (mpi3mr_drv_db[i].value == drv_db_level) { + name = mpi3mr_drv_db[i].name; + break; + } + } + return name; +} + +static const struct { + enum mpi3mr_iocstate value; + char *name; +} mrioc_states[] = { + { MRIOC_STATE_READY, "ready" }, + { MRIOC_STATE_FAULT, "fault" }, + { MRIOC_STATE_RESET, "reset" }, + { MRIOC_STATE_BECOMING_READY, "becoming ready" }, + { MRIOC_STATE_RESET_REQUESTED, "reset requested" }, + { MRIOC_STATE_UNRECOVERABLE, "unrecoverable error" }, +}; + +static const char *mpi3mr_iocstate_name(enum mpi3mr_iocstate mrioc_state) +{ + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(mrioc_states); i++) { + if (mrioc_states[i].value == mrioc_state) { + name = mrioc_states[i].name; + break; + } + } + return 
name; +} + +/* Reset reason to name mapper structure*/ +static const struct { + enum mpi3mr_reset_reason value; + char *name; +} mpi3mr_reset_reason_codes[] = { + { MPI3MR_RESET_FROM_BRINGUP, "bringup" }, + { MPI3MR_RESET_FROM_FAULT_WATCH, "fault" }, + { MPI3MR_RESET_FROM_APP, "application invocation" }, + { MPI3MR_RESET_FROM_EH_HOS, "host reset from the OS" }, + { MPI3MR_RESET_FROM_TM_TIMEOUT, "task management timeout" }, + { MPI3MR_RESET_FROM_APP_TIMEOUT, "application command timeout" }, + { MPI3MR_RESET_FROM_MUR_FAILURE, "message unit reset failure" }, + { MPI3MR_RESET_FROM_CTLR_CLEANUP, "controller cleanup" }, + { MPI3MR_RESET_FROM_CIACTIV_FAULT, "component image activation fault" }, + { MPI3MR_RESET_FROM_PE_TIMEOUT, "port enable timeout" }, + { MPI3MR_RESET_FROM_TSU_TIMEOUT, "time stamp update timeout" }, + { MPI3MR_RESET_FROM_DELREQQ_TIMEOUT, "delete request queue timeout" }, + { MPI3MR_RESET_FROM_DELREPQ_TIMEOUT, "delete reply queue timeout" }, + { + MPI3MR_RESET_FROM_CREATEREPQ_TIMEOUT, + "create request queue timeout" + }, + { + MPI3MR_RESET_FROM_CREATEREQQ_TIMEOUT, + "create reply queue timeout" + }, + { MPI3MR_RESET_FROM_IOCFACTS_TIMEOUT, "ioc_facts timeout" }, + { MPI3MR_RESET_FROM_IOCINIT_TIMEOUT, "ioc_init timeout" }, + { MPI3MR_RESET_FROM_EVTNOTIFY_TIMEOUT, "event notification timeout" }, + { MPI3MR_RESET_FROM_EVTACK_TIMEOUT, "event acknowledgment timeout" }, + { + MPI3MR_RESET_FROM_CIACTVRST_TIMER, + "component image activation timeout" + }, + { + MPI3MR_RESET_FROM_GETPKGVER_TIMEOUT, + "get package version timeout" + }, + { + MPI3MR_RESET_FROM_PELABORT_TIMEOUT, + "persistent event log abort timeout" + }, + { MPI3MR_RESET_FROM_SYSFS, "sysfs invocation" }, + { MPI3MR_RESET_FROM_SYSFS_TIMEOUT, "sysfs task management timeout" }, + { + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT, + "diagnostic buffer post timeout" + }, + { + MPI3MR_RESET_FROM_DIAG_BUFFER_RELEASE_TIMEOUT, + "diagnostic buffer release timeout" + }, + { MPI3MR_RESET_FROM_FIRMWARE, "firmware asynchronus reset" }, + { + MPI3MR_RESET_FROM_DIAG_BUFFER_UPLOAD_TIMEOUT, + "diagnostic buffer upload timeout" + }, + { MPI3MR_RESET_FROM_CFG_REQ_TIMEOUT, "configuration request timeout"}, + { + MPI3MR_RESET_FROM_SAS_TRANSPORT_TIMEOUT, + "timeout of a SAS transport layer request" + }, + { MPI3MR_RESET_FROM_TRIGGER, "automatic firmware diagnostic trigger"}, +}; + +/** + * mpi3mr_reset_rc_name - get reset reason code name + * @reason_code: reset reason code value + * + * Map reset reason to an NULL terminated ASCII string + * + * Return: name corresponding to reset reason value or NULL. + */ +static const char *mpi3mr_reset_rc_name(enum mpi3mr_reset_reason reason_code) +{ + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(mpi3mr_reset_reason_codes); i++) { + if (mpi3mr_reset_reason_codes[i].value == reason_code) { + name = mpi3mr_reset_reason_codes[i].name; + break; + } + } + return name; +} + +/* Reset type to name mapper structure*/ +static const struct { + u16 reset_type; + char *name; +} mpi3mr_reset_types[] = { + { MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, "soft" }, + { MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, "diag fault" }, +}; + +/** + * mpi3mr_reset_type_name - get reset type name + * @reset_type: reset type value + * + * Map reset type to an NULL terminated ASCII string + * + * Return: name corresponding to reset type value or NULL. 
+ */ +static const char *mpi3mr_reset_type_name(u16 reset_type) +{ + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(mpi3mr_reset_types); i++) { + if (mpi3mr_reset_types[i].reset_type == reset_type) { + name = mpi3mr_reset_types[i].name; + break; + } + } + return name; +} + +/** + * mpi3mr_print_fault_info - Display fault information + * @mrioc: Adapter instance reference + * + * Display the controller fault information if there is a + * controller fault. + * + * Return: Nothing. + */ +void mpi3mr_print_fault_info(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_status, code, code1, code2, code3; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + + if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) { + code = readl(&mrioc->sysif_regs->fault); + code1 = readl(&mrioc->sysif_regs->fault_info[0]); + code2 = readl(&mrioc->sysif_regs->fault_info[1]); + code3 = readl(&mrioc->sysif_regs->fault_info[2]); + + ioc_info(mrioc, + "fault code(0x%08X): additional code: (0x%08X:0x%08X:0x%08X)\n", + code, code1, code2, code3); + } +} + +/** + * mpi3mr_get_iocstate - Get IOC State + * @mrioc: Adapter instance reference + * + * Return a proper IOC state enum based on the IOC status and + * IOC configuration and unrecoverable state of the controller. + * + * Return: Current IOC state. + */ +enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_status, ioc_config; + u8 ready, enabled; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + + if (mrioc->unrecoverable) + return MRIOC_STATE_UNRECOVERABLE; + if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) + return MRIOC_STATE_FAULT; + + ready = (ioc_status & MPI3_SYSIF_IOC_STATUS_READY); + enabled = (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC); + + if (ready && enabled) + return MRIOC_STATE_READY; + if ((!ready) && (!enabled)) + return MRIOC_STATE_RESET; + if ((!ready) && (enabled)) + return MRIOC_STATE_BECOMING_READY; + + return MRIOC_STATE_RESET_REQUESTED; +} + +/** + * mpi3mr_do_uefi_dump - copy uefi logs + * @mrioc: Adapter instance reference + * + * Return: next available location in driver diag buffer. 
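+ *
+ * The UEFI log area is treated as a circular buffer: when its header
+ * carries the signature 0x43495243 the copy length is bounded by the
+ * advertised circular_buffer_size, otherwise it is capped at
+ * MPI3MR_UEFI_DIAG_HOST_BUFFER_OFFSET. In both cases the header in
+ * the driver diag buffer is rebuilt before the next free offset is
+ * returned.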
+ */ +static int mpi3mr_do_uefi_dump(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_driver_buffer_header *drv_buff_header = NULL; + int pos_uefi_dump = 0, pos_uefi_end = 0; + u32 data_len; + + if (!mrioc->uefi_logs) + return pos_uefi_dump; + + data_len = mrioc->uefi_logs_sz; + memcpy(mrioc->drv_diag_buffer, mrioc->uefi_logs, data_len); + drv_buff_header = + (struct mpi3_driver_buffer_header *)mrioc->drv_diag_buffer; + + pos_uefi_dump = sizeof(struct mpi3_driver_buffer_header); + if (drv_buff_header->signature == 0x43495243) { + pos_uefi_end = + min_t(int, + data_len - sizeof(struct mpi3_driver_buffer_header), + drv_buff_header->circular_buffer_size - 1); + ioc_info(mrioc, + "UEFI logs has valid header size %d\n", + drv_buff_header->circular_buffer_size); + pos_uefi_dump += pos_uefi_end; + } else { + pos_uefi_dump += + min_t(int, data_len, + MPI3MR_UEFI_DIAG_HOST_BUFFER_OFFSET); + ioc_info(mrioc, "UEFI logs has invalid header\n"); + } + + drv_buff_header->signature = 0x43495243; + drv_buff_header->logical_buffer_start = 0; + drv_buff_header->circular_buffer_size = mrioc->drv_diag_buffer_sz + - sizeof(struct mpi3_driver_buffer_header); + drv_buff_header->flags = + MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII; + drv_buff_header->logical_buffer_end = + pos_uefi_dump - sizeof(struct mpi3_driver_buffer_header); + ioc_info(mrioc, "UEFI Logs offset 0x%04x logical_buffer_end 0x%04x\n", + pos_uefi_dump, drv_buff_header->logical_buffer_end); + + return pos_uefi_dump; +} + +/** + * mpi3mr_do_mini_dump - copy system logs associated with mrioc. + * @mrioc: Adapter instance reference + * @prev_offset: offset returned from previous operation + * + * Read system logs and search for pattern mpi3mr%d and copy the lines + * into driver diag buffer + * + * Return: next available location in driver diag buffer. + */ +static int mpi3mr_do_mini_dump(struct mpi3mr_ioc *mrioc, int prev_offset) +{ + int n = 0, lines, pos_mini_dump; + struct mpi3mr_kmsg_dumper dumper; + size_t len; + char buf[201]; + char *mini_start = "<6> Minidump start\n"; + char *mini_end = "<6> Minidump end\n"; + + struct mpi3_driver_buffer_header *drv_buff_header = NULL; + + dumper = mrioc->dump; + mpi3mr_set_dumper_active(&dumper); + + kmsg_dump_rewind(&dumper.kdumper); + while (kmsg_dump_get_line(&dumper.kdumper, 1, NULL, 0, NULL)) + n++; + + lines = n; + kmsg_dump_rewind(&dumper.kdumper); + + drv_buff_header = (struct mpi3_driver_buffer_header *)mrioc->drv_diag_buffer; + drv_buff_header->signature = 0x43495243; + drv_buff_header->logical_buffer_start = 0; + drv_buff_header->circular_buffer_size = + mrioc->drv_diag_buffer_sz - sizeof(struct mpi3_driver_buffer_header); + drv_buff_header->flags = + MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII; + pos_mini_dump = + prev_offset ? prev_offset : sizeof(struct mpi3_driver_buffer_header); + + if ((pos_mini_dump + strlen(mini_start) + < mrioc->drv_diag_buffer_sz)) { + sprintf((char *)mrioc->drv_diag_buffer + pos_mini_dump, + "%s\n", mini_start); + pos_mini_dump += strlen(mini_start); + } else { + ioc_info(mrioc, "driver diag buffer is full. 
minidump is not started\n"); + goto out; + } + + while (kmsg_dump_get_line(&dumper.kdumper, 1, buf, sizeof(buf), &len)) { + if (!lines--) + break; + if (strstr(buf, mrioc->name) && + ((pos_mini_dump + len + strlen(mini_end)) + < mrioc->drv_diag_buffer_sz)) { + sprintf((char *)mrioc->drv_diag_buffer + + pos_mini_dump, "%s", buf); + pos_mini_dump += len; + } + } + + if ((pos_mini_dump + strlen(mini_end) + < mrioc->drv_diag_buffer_sz)) { + sprintf((char *)mrioc->drv_diag_buffer + pos_mini_dump, + "%s\n", mini_end); + pos_mini_dump += strlen(mini_end); + } + +out: + drv_buff_header->logical_buffer_end = + pos_mini_dump - sizeof(struct mpi3_driver_buffer_header); + + ioc_info(mrioc, "driver diag buffer base_address(including 4K header) 0x%016llx, end_address 0x%016llx\n", + (unsigned long long)mrioc->drv_diag_buffer_dma, + (unsigned long long)mrioc->drv_diag_buffer_dma + + mrioc->drv_diag_buffer_sz); + ioc_info(mrioc, "logical_buffer end_address 0x%016llx, logical_buffer_end 0x%08x\n", + (unsigned long long)mrioc->drv_diag_buffer_dma + + drv_buff_header->logical_buffer_end, + drv_buff_header->logical_buffer_end); + + return pos_mini_dump; +} + +/** + * mpi3mr_do_dump - copy system logs into driver diag buffer. + * @mrioc: Adapter instance reference + * + * Return: Nothing. + */ +static void mpi3mr_do_dump(struct mpi3mr_ioc *mrioc) +{ + int offset = 0, uefi_offset = 0; + size_t dump_size; + struct mpi3_driver_buffer_header *drv_buff_header = NULL; + + if (!mrioc->drv_diag_buffer) + return; + + memset(mrioc->drv_diag_buffer, 0, mrioc->drv_diag_buffer_sz); + + /* Copy uefi boot logs */ + if (mrioc->skip_uefi_snapdump == false) + uefi_offset = mpi3mr_do_uefi_dump(mrioc); + else + mrioc->skip_uefi_snapdump = true; + + if (drv_db_level == MRIOC_DRV_DB_DISABLED) + return; + + /* Copy controller specific logs */ + offset += mpi3mr_do_mini_dump(mrioc, uefi_offset); + if (drv_db_level != MRIOC_DRV_DB_FULL) + return; + + mpi3mr_set_dumper_active(&mrioc->dump); + kmsg_dump_rewind(&mrioc->dump.kdumper); + kmsg_dump_get_buffer(&mrioc->dump.kdumper, true, + mrioc->drv_diag_buffer + offset, + mrioc->drv_diag_buffer_sz - offset, &dump_size); + + drv_buff_header = (struct mpi3_driver_buffer_header *) + mrioc->drv_diag_buffer; + drv_buff_header->logical_buffer_end += dump_size; + ioc_info(mrioc, "logical_buffer end_address(0x%016llx), logical_buffer_end(0x%08x)\n", + (unsigned long long)mrioc->drv_diag_buffer_dma + + drv_buff_header->logical_buffer_end, + drv_buff_header->logical_buffer_end); +} + +/** + * mpi3mr_clear_reset_history - clear reset history + * @mrioc: Adapter instance reference + * + * Write the reset history bit in IOC status to clear the bit, + * if it is already set. + * + * Return: Nothing. + */ +static inline void mpi3mr_clear_reset_history(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_status; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) + writel(ioc_status, &mrioc->sysif_regs->ioc_status); + +} + +/** + * mpi3mr_issue_and_process_mur - Message Unit Reset handler + * @mrioc: Adapter instance reference + * @reset_reason: Reset reason code + * + * Issue Message Unit Reset to the controller and wait for it to + * be complete. + * + * Return: 0 on success, -1 on failure. 
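+ *
+ * The reset reason is written to scratchpad[0] and
+ * MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC is cleared in the IOC
+ * configuration register; completion is then polled every 100ms, so
+ * the MPI3MR_RESET_ACK_TIMEOUT * 10 iteration loop bounds the wait to
+ * roughly MPI3MR_RESET_ACK_TIMEOUT seconds.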
+ */ +static int mpi3mr_issue_and_process_mur(struct mpi3mr_ioc *mrioc, + u32 reset_reason) +{ + u32 ioc_config, timeout, ioc_status; + int retval = -1; + + ioc_info(mrioc, "issuing message unit reset(MUR)\n"); + if (mrioc->unrecoverable) { + ioc_info(mrioc, "controller is unrecoverable message unit reset is not issued\n"); + return retval; + } + mpi3mr_clear_reset_history(mrioc); + writel(reset_reason, &mrioc->sysif_regs->scratchpad[0]); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config &= ~MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC; + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + + timeout = MPI3MR_RESET_ACK_TIMEOUT * 10; + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY)) { + mpi3mr_clear_reset_history(mrioc); + break; + } + if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) { + mpi3mr_print_fault_info(mrioc); + break; + } + msleep(100); + } while (--timeout); + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + if (timeout && !((ioc_status & MPI3_SYSIF_IOC_STATUS_READY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) || + (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC))) + retval = 0; + + ioc_info(mrioc, "ioc_status/ioc_config after %s message unit reset is (0x%x)/(0x%x)\n", + (!retval) ? "successful" : "failed", ioc_status, ioc_config); + return retval; +} + + +/** + * mpi3mr_soft_reset_success - Check softreset is success or not + * @ioc_status: IOC status register value + * @ioc_config: IOC config register value + * + * Check whether the soft reset is successful or not based on + * IOC status and IOC config register values. + * + * Return: True when the soft reset is success, false otherwise. + */ +static inline bool +mpi3mr_soft_reset_success(u32 ioc_status, u32 ioc_config) +{ + if (!((ioc_status & MPI3_SYSIF_IOC_STATUS_READY) || + (ioc_config & MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC))) + return true; + return false; +} + +/** + * mpi3mr_diagfault_success - Check diag fault is success or not + * @mrioc: Adapter reference + * @ioc_status: IOC status register value + * + * Check whether the controller hit diag reset fault code. + * + * Return: True when there is diag fault, false otherwise. + */ +static inline bool mpi3mr_diagfault_success(struct mpi3mr_ioc *mrioc, + u32 ioc_status) +{ + u32 fault; + + if (!(ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) + return false; + fault = readl(&mrioc->sysif_regs->fault) & MPI3_SYSIF_FAULT_CODE_MASK; + if (fault == MPI3_SYSIF_FAULT_CODE_DIAG_FAULT_RESET) { + mpi3mr_print_fault_info(mrioc); + return true; + } + return false; +} + +/** + * mpi3mr_set_diagsave - Set diag save bit for snapdump + * @mrioc: Adapter reference + * + * Set diag save bit in IOC configuration register to enable + * snapdump. + * + * Return: Nothing. + */ +static inline void mpi3mr_set_diagsave(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_config; + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config |= MPI3_SYSIF_IOC_CONFIG_DIAG_SAVE; + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); +} + +/** + * mpi3mr_issue_reset - Issue reset to the controller + * @mrioc: Adapter reference + * @reset_type: Reset type + * @reset_reason: Reset reason code + * + * Unlock the host diagnostic registers and write the specific + * reset type to that, wait for reset acknowledgment from the + * controller, if the reset is not successful retry for the + * predefined number of times. + * + * Return: 0 on success, non-zero on failure. 
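+ *
+ * Write access to the host diagnostic register is gated by a magic
+ * key sequence: the FLUSH key followed by the 1ST through 6TH key
+ * values are written to write_sequence until DIAG_WRITE_ENABLE reads
+ * back set, retrying up to MPI3MR_HOSTDIAG_UNLOCK_RETRY_COUNT times
+ * before the controller is marked unrecoverable.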
+ */ +static int mpi3mr_issue_reset(struct mpi3mr_ioc *mrioc, u16 reset_type, + u32 reset_reason) +{ + int retval = -1; + u8 unlock_retry_count = 0; + u32 host_diagnostic, ioc_status, ioc_config; + u32 timeout = MPI3MR_RESET_ACK_TIMEOUT * 10; + + if ((reset_type != MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET) && + (reset_type != MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT)) + return retval; + if (mrioc->unrecoverable) + return retval; + if (reset_reason == MPI3MR_RESET_FROM_FIRMWARE) { + retval = 0; + return retval; + } + + ioc_info(mrioc, "%s reset due to %s(0x%x)\n", + mpi3mr_reset_type_name(reset_type), + mpi3mr_reset_rc_name(reset_reason), reset_reason); + + + mpi3mr_clear_reset_history(mrioc); + do { + ioc_info(mrioc, + "writing magic sequence to unlock host diag register (retry=%d)\n", + ++unlock_retry_count); + if (unlock_retry_count >= MPI3MR_HOSTDIAG_UNLOCK_RETRY_COUNT) { + ioc_err(mrioc, + "%s reset failed due to unlock failure, host_diagnostic(0x%08x)\n", + mpi3mr_reset_type_name(reset_type), + host_diagnostic); + mrioc->unrecoverable = 1; + return retval; + } + + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_FLUSH, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_1ST, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_2ND, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_3RD, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_4TH, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_5TH, + &mrioc->sysif_regs->write_sequence); + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_6TH, + &mrioc->sysif_regs->write_sequence); + usleep_range(1000, 1100); + host_diagnostic = readl(&mrioc->sysif_regs->host_diagnostic); + ioc_info(mrioc, + "wrote magic sequence: retry_count(%d), host_diagnostic(0x%08x)\n", + unlock_retry_count, host_diagnostic); + } while (!(host_diagnostic & MPI3_SYSIF_HOST_DIAG_DIAG_WRITE_ENABLE)); + + writel(reset_reason, &mrioc->sysif_regs->scratchpad[0]); + writel(host_diagnostic | reset_type, + &mrioc->sysif_regs->host_diagnostic); + switch (reset_type) { + case MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET: + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = + readl(&mrioc->sysif_regs->ioc_configuration); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) + && mpi3mr_soft_reset_success(ioc_status, ioc_config) + ) { + mpi3mr_clear_reset_history(mrioc); + retval = 0; + break; + } + msleep(100); + } while (--timeout); + mpi3mr_print_fault_info(mrioc); + break; + case MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT: + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (mpi3mr_diagfault_success(mrioc, ioc_status)) { + retval = 0; + break; + } + msleep(100); + } while (--timeout); + break; + default: + break; + } + + writel(MPI3_SYSIF_WRITE_SEQUENCE_KEY_VALUE_2ND, + &mrioc->sysif_regs->write_sequence); + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_info(mrioc, + "ioc_status/ioc_config after %s reset is (0x%x)/(0x%x)\n", + (!retval)?"successful":"failed", ioc_status, + ioc_config); + if (retval) + mrioc->unrecoverable = 1; + return retval; +} + +/** + * mpi3mr_admin_request_post - Post request to admin queue + * @mrioc: Adapter reference + * @admin_req: MPI3 request + * @admin_req_sz: Request size + * @ignore_reset: Ignore reset in process + * + * Post the MPI3 request into admin request queue and + * 
inform the controller, if the queue is full return + * appropriate error. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req, + u16 admin_req_sz, u8 ignore_reset) +{ + u16 areq_pi = 0, areq_ci = 0, max_entries = 0; + int retval = 0; + unsigned long flags; + u8 *areq_entry; + + if (mrioc->unrecoverable) { + ioc_err(mrioc, "admin request queue submission failed due to unrecoverable controller\n"); + return -EFAULT; + } + + spin_lock_irqsave(&mrioc->admin_req_lock, flags); + areq_pi = mrioc->admin_req_pi; + areq_ci = mrioc->admin_req_ci; + max_entries = mrioc->num_admin_req; + if ((areq_ci == (areq_pi + 1)) || ((!areq_ci) && + (areq_pi == (max_entries - 1)))) { + ioc_err(mrioc, "admin request queue submission failed due to queue full\n"); + retval = -EAGAIN; + goto out; + } + if (!ignore_reset && mrioc->reset_in_progress) { + ioc_err(mrioc, "admin request queue submission failed due to reset in progress\n"); + retval = -EAGAIN; + goto out; + } + areq_entry = (u8 *)mrioc->admin_req_base + + (areq_pi * MPI3MR_ADMIN_REQ_FRAME_SZ); + memset(areq_entry, 0, MPI3MR_ADMIN_REQ_FRAME_SZ); + memcpy(areq_entry, (u8 *)admin_req, admin_req_sz); + + if (++areq_pi == max_entries) + areq_pi = 0; + mrioc->admin_req_pi = areq_pi; + + writel(mrioc->admin_req_pi, &mrioc->sysif_regs->admin_request_queue_pi); + +out: + spin_unlock_irqrestore(&mrioc->admin_req_lock, flags); + + return retval; +} + +/** + * mpi3mr_op_request_post - Post request to operational queue + * @mrioc: Adapter reference + * @op_req_q: Operational request queue info + * @req: MPI3 request + * + * Post the MPI3 request into operational request queue and + * inform the controller, if the queue is full return + * appropriate error. + * + * Return: 0 on success, non-zero on failure. 
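+ *
+ * Note: when the request queue is found full, the paired operational
+ * reply queue is processed once to reclaim entries before the
+ * submission is failed with -EAGAIN.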
+ */
+int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc,
+	struct op_req_qinfo *op_req_q, u8 *req)
+{
+	u16 pi = 0, max_entries, reply_qidx = 0, midx;
+	int retval = 0;
+	unsigned long flags;
+	u8 *req_entry;
+	void *segment_base_addr;
+	u16 req_sz = mrioc->facts.op_req_sz;
+	struct mpi3_scsi_io_request *scsiio_req =
+		(struct mpi3_scsi_io_request *)req;
+	struct segments *segments = op_req_q->q_segments;
+
+	reply_qidx = op_req_q->reply_qid - 1;
+
+	if (mrioc->unrecoverable)
+		return -EFAULT;
+
+	spin_lock_irqsave(&op_req_q->q_lock, flags);
+	pi = op_req_q->pi;
+	max_entries = op_req_q->num_requests;
+
+	if (mpi3mr_check_req_qfull(op_req_q)) {
+		midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(
+			reply_qidx, mrioc->op_reply_q_offset);
+		mpi3mr_process_op_reply_q(mrioc,
+			mrioc->intr_info[midx].op_reply_q);
+
+		if (mpi3mr_check_req_qfull(op_req_q)) {
+			if (op_req_q->last_full_host_tag ==
+				MPI3MR_HOSTTAG_INVALID)
+				op_req_q->qfull_instances++;
+
+			op_req_q->last_full_host_tag = scsiio_req->host_tag;
+			op_req_q->qfull_io_count++;
+			retval = -EAGAIN;
+			goto out;
+		}
+	}
+
+	if (op_req_q->last_full_host_tag != MPI3MR_HOSTTAG_INVALID)
+		op_req_q->last_full_host_tag = MPI3MR_HOSTTAG_INVALID;
+
+	if (mrioc->reset_in_progress) {
+		ioc_err(mrioc, "operational request queue submission failed due to reset in progress\n");
+		retval = -EAGAIN;
+		goto out;
+	}
+
+	segment_base_addr = segments[pi / op_req_q->segment_qd].segment;
+	req_entry = (u8 *)segment_base_addr +
+		((pi % op_req_q->segment_qd) * req_sz);
+
+	memset(req_entry, 0, req_sz);
+	memcpy(req_entry, req, MPI3MR_ADMIN_REQ_FRAME_SZ);
+
+	if (++pi == max_entries)
+		pi = 0;
+	op_req_q->pi = pi;
+
+#ifndef CONFIG_PREEMPT_RT
+	if (atomic_inc_return(&mrioc->op_reply_qinfo[reply_qidx].pend_ios)
+	    > MPI3MR_IRQ_POLL_TRIGGER_IOCOUNT)
+		mrioc->op_reply_qinfo[reply_qidx].enable_irq_poll = true;
+#else
+	atomic_inc_return(&mrioc->op_reply_qinfo[reply_qidx].pend_ios);
+#endif
+
+#if defined(IO_COUNTER_SUPPORT)
+	atomic_inc(&mrioc->pend_ios);
+#endif
+
+	writel(op_req_q->pi,
+	    &mrioc->sysif_regs->oper_queue_indexes[reply_qidx].producer_index);
+
+out:
+	spin_unlock_irqrestore(&op_req_q->q_lock, flags);
+	return retval;
+}
+
+/**
+ * mpi3mr_check_rh_fault_ioc - check reset history and fault
+ * controller
+ * @mrioc: Adapter instance reference
+ * @reason_code: reason code for the fault.
+ *
+ * This routine will save snapdump and fault the controller with
+ * the given reason code if it is not already in the fault state or
+ * asynchronously reset. This will be used to handle
+ * initialization time faults/resets/timeouts as in those cases
+ * immediate soft reset invocation is not required.
+ *
+ * Return: None.
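+ *
+ * Note: after raising the diag fault this routine waits up to
+ * MPI3_SYSIF_DIAG_SAVE_TIMEOUT seconds for the SAVE_IN_PROGRESS bit
+ * in the host diagnostic register to clear.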
+ */
+void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code)
+{
+	u32 ioc_status, host_diagnostic, timeout, fault;
+
+	if (mrioc->unrecoverable) {
+		ioc_err(mrioc, "controller is unrecoverable\n");
+		return;
+	}
+
+	if (!pci_device_is_present(mrioc->pdev))
+	{
+		mrioc->unrecoverable = 1;
+		ioc_err(mrioc, "controller is not present\n");
+		return;
+	}
+	ioc_status = readl(&mrioc->sysif_regs->ioc_status);
+	if (ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) {
+		mpi3mr_set_trigger_data_in_all_hdb(mrioc,
+		    MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED, 0, 0);
+		return;
+	} else if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) {
+		fault = readl(&mrioc->sysif_regs->fault);
+		mpi3mr_set_trigger_data_in_all_hdb(mrioc,
+		    MPI3MR_HDB_TRIGGER_TYPE_FAULT, fault, 0);
+		mpi3mr_print_fault_info(mrioc);
+		return;
+	}
+	mpi3mr_set_diagsave(mrioc);
+	mpi3mr_issue_reset(mrioc, MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT,
+	    reason_code);
+	fault = readl(&mrioc->sysif_regs->fault);
+	mpi3mr_set_trigger_data_in_all_hdb(mrioc, MPI3MR_HDB_TRIGGER_TYPE_FAULT,
+	    fault, 0);
+	timeout = MPI3_SYSIF_DIAG_SAVE_TIMEOUT * 10;
+	do {
+		host_diagnostic = readl(&mrioc->sysif_regs->host_diagnostic);
+		if (!(host_diagnostic & MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS))
+			break;
+		msleep(100);
+	} while (--timeout);
+}
+
+/**
+ * mpi3mr_sync_timestamp - Issue time stamp sync request
+ * @mrioc: Adapter reference
+ *
+ * Issue IO Unit Control MPI request to synchronize firmware
+ * timestamp with host time.
+ *
+ * Return: 0 on success, non-zero on failure.
+ */
+static int mpi3mr_sync_timestamp(struct mpi3mr_ioc *mrioc)
+{
+	ktime_t current_time;
+	struct mpi3_iounit_control_request iou_ctrl;
+	int retval = 0;
+
+	memset(&iou_ctrl, 0, sizeof(iou_ctrl));
+	mutex_lock(&mrioc->init_cmds.mutex);
+	if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) {
+		retval = -1;
+		ioc_err(mrioc, "sending time stamp update failed due to command in use\n");
+		mutex_unlock(&mrioc->init_cmds.mutex);
+		goto out;
+	}
+	mrioc->init_cmds.state = MPI3MR_CMD_PENDING;
+	mrioc->init_cmds.is_waiting = 1;
+	mrioc->init_cmds.callback = NULL;
+	iou_ctrl.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS);
+	iou_ctrl.function = MPI3_FUNCTION_IO_UNIT_CONTROL;
+	iou_ctrl.operation = MPI3_CTRL_OP_UPDATE_TIMESTAMP;
+	current_time = ktime_get_real();
+	iou_ctrl.param64[0] = cpu_to_le64(ktime_to_ms(current_time));
+
+	init_completion(&mrioc->init_cmds.done);
+	retval = mpi3mr_admin_request_post(mrioc, &iou_ctrl,
+	    sizeof(iou_ctrl), 0);
+	if (retval) {
+		ioc_err(mrioc, "posting time stamp update failed\n");
+		goto out_unlock;
+	}
+
+	wait_for_completion_timeout(&mrioc->init_cmds.done,
+	    (MPI3MR_INTADMCMD_TIMEOUT * HZ));
+	if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) {
+		ioc_err(mrioc, "time stamp update timed out\n");
+		mrioc->init_cmds.is_waiting = 0;
+		if (!(mrioc->init_cmds.state & MPI3MR_CMD_RESET))
+			mpi3mr_soft_reset_handler(mrioc,
+			    MPI3MR_RESET_FROM_TSU_TIMEOUT, 1);
+		retval = -1;
+		goto out_unlock;
+	}
+	if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK)
+	    != MPI3_IOCSTATUS_SUCCESS) {
+		ioc_err(mrioc,
+		    "time stamp update returned with ioc_status(0x%04x), log_info(0x%08x)\n",
+		    (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK),
+		    mrioc->init_cmds.ioc_loginfo);
+		retval = -1;
+		goto out_unlock;
+	}
+
+out_unlock:
+	mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED;
+	mutex_unlock(&mrioc->init_cmds.mutex);
+
+out:
+	return retval;
+}
+
+/**
+ * mpi3mr_print_pkg_ver - display controller fw package version
+ * @mrioc: Adapter reference
+ *
+ * 
Retrieve firmware package version from the component image + * manifest of the controller flash and display it. + * + * Return: 0 on success and non-zero on failure. + */ +static int mpi3mr_print_pkg_ver(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_ci_upload_request ci_upload; + int retval = -1; + void *data = NULL; + dma_addr_t data_dma; + struct mpi3_ci_manifest_mpi *manifest; + u32 data_len = sizeof(struct mpi3_ci_manifest_mpi); + + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + data = dma_zalloc_coherent(&mrioc->pdev->dev, data_len, &data_dma, + GFP_KERNEL); + if (!data) + return -ENOMEM; + + memset(&ci_upload, 0, sizeof(ci_upload)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, + "issue ci manifest upload failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + ci_upload.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + ci_upload.function = MPI3_FUNCTION_CI_UPLOAD; + ci_upload.msg_flags = MPI3_CI_UPLOAD_MSGFLAGS_LOCATION_PRIMARY; + ci_upload.signature1 = MPI3_IMAGE_HEADER_SIGNATURE1_MANIFEST; + ci_upload.image_offset = MPI3_IMAGE_HEADER_SIZE; + ci_upload.segment_size = data_len; + + mpi3mr_add_sg_single(&ci_upload.sgl, sgl_flags, data_len, + data_dma); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &ci_upload, + sizeof(ci_upload), 1); + if (retval) { + ioc_err(mrioc, "issue ci manifest upload failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "issue ci manifest upload timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_GETPKGVER_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + == MPI3_IOCSTATUS_SUCCESS) { + manifest = (struct mpi3_ci_manifest_mpi *) data; + if (manifest->manifest_type == MPI3_CI_MANIFEST_TYPE_MPI) { + ioc_info(mrioc, + "firmware package version(%d.%d.%d.%d.%05d-%05d)\n", + manifest->package_version.gen_major, + manifest->package_version.gen_minor, + manifest->package_version.phase_major, + manifest->package_version.phase_minor, + manifest->package_version.customer_id, + manifest->package_version.build_num); + } + } + retval = 0; +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + if (data) + dma_free_coherent(&mrioc->pdev->dev, data_len, data, + data_dma); + return retval; +} + +/** + * mpi3mr_upload_drv_diag_buffer - upload the driver diag log + * @mrioc: Adapter reference + * + * Uploads the driver buffer to driver internal memory from the + * firmware which might have UEFI boot Services log and cache + * the returned data length from the upload into the per adapter + * structure. + * + * Return: Nothing. 
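+ *
+ * Note: at most half of the driver diag buffer, further capped by the
+ * firmware reported diag_drvr_sz, is requested for the UEFI boot
+ * services logs.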
+ */ +static void +mpi3mr_upload_drv_diag_buffer(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_diag_buffer_upload_request diag_upload; + struct mpi3_diag_buffer_upload_reply *diag_upload_reply; + int retval = 0; + dma_addr_t data_dma; + void *data; + u32 data_len; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + data_dma = mrioc->drv_diag_buffer_dma; + data = mrioc->drv_diag_buffer; + + /* At max half of the diag buffer can be used for uefi logs */ + data_len = min_t(int, mrioc->facts.diag_drvr_sz, + mrioc->drv_diag_buffer_sz/2); + + memset(&diag_upload, 0, sizeof(diag_upload)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, "sending driver diag upload failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + return; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_upload.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_upload.function = MPI3_FUNCTION_DIAG_BUFFER_UPLOAD; + diag_upload.msg_flags = 0; + diag_upload.change_count = 0; + diag_upload.type = MPI3_DIAG_BUFFER_TYPE_DRIVER; + diag_upload.flags = + MPI3_DRIVER_DIAG_BUFFER_HEADER_FLAGS_CIRCULAR_BUF_FORMAT_ASCII; + diag_upload.context = 0; + + mpi3mr_add_sg_single(&diag_upload.sgl, sgl_flags, data_len, + data_dma); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_upload, + sizeof(diag_upload), 1); + if (retval) { + ioc_err(mrioc, "posting driver diag upload failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "driver diag upload timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_UPLOAD_TIMEOUT); + goto out_unlock; + } + + mrioc->uefi_logs_sz = 0; + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "driver diag upload returned with ioc_status(0x%04x),log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + } else { + diag_upload_reply = (struct mpi3_diag_buffer_upload_reply *) + mrioc->init_cmds.reply; + mrioc->uefi_logs_sz = + le32_to_cpu(diag_upload_reply->returned_data_length); + if (mrioc->uefi_logs_sz) { + mrioc->uefi_logs = vzalloc(mrioc->uefi_logs_sz); + if (!mrioc->uefi_logs) + mrioc->uefi_logs_sz = 0; + else + memcpy(mrioc->uefi_logs, data, data_len); + } + ioc_info(mrioc, + "driver diag buffer upload is success size drv/fw/final(%d/%d/%d)\n", + data_len, + le32_to_cpu(diag_upload_reply->returned_data_length), + mrioc->uefi_logs_sz); + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return; +} + +/** + * mpi3mr_watchdog_work - watchdog thread to monitor faults + * @work: work struct + * + * Watch dog work periodically executed (1 second interval) to + * monitor firmware fault and to issue periodic timer sync to + * the firmware. + * + * Return: Nothing. 
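+ *
+ * Note: the work re-queues itself every MPI3MR_WATCHDOG_INTERVAL
+ * milliseconds and stops re-arming when a reset is in progress or
+ * the controller becomes unrecoverable.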
+ */ +static void mpi3mr_watchdog_work(struct work_struct *work) +{ + struct mpi3mr_ioc *mrioc = + container_of(work, struct mpi3mr_ioc, watchdog_work.work); + unsigned long flags; + enum mpi3mr_iocstate ioc_state; + u32 fault, host_diagnostic, ioc_status; + u32 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH; + + if (mrioc->reset_in_progress) + return; + + if (!mrioc->unrecoverable && !pci_device_is_present(mrioc->pdev)) { + ioc_err(mrioc, "watchdog could not detect the controller\n"); + mrioc->unrecoverable = 1; + } + + if (mrioc->unrecoverable) { + ioc_err(mrioc, "flush pending commands for unrecoverable controller\n"); + mpi3mr_flush_cmds_for_unrecovered_controller(mrioc); + return; + } + + if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) { + /* No need to capture uefi snapdump + * after certain time elapsed. */ + mrioc->skip_uefi_snapdump = true; + mrioc->ts_update_counter = 0; + mpi3mr_sync_timestamp(mrioc); + } + + if ((mrioc->prepare_for_reset) && + ((mrioc->prepare_for_reset_timeout_counter++) >= + MPI3MR_PREPARE_FOR_RESET_TIMEOUT)) { + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_CIACTVRST_TIMER, 1); + return; + } + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if (ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) { + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FW_RELEASED, 0, 0); + mpi3mr_soft_reset_handler(mrioc, MPI3MR_RESET_FROM_FIRMWARE, 0); + return; + } + + /*Check for fault state every one second and issue Soft reset*/ + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state != MRIOC_STATE_FAULT) + goto schedule_work; + + fault = readl(&mrioc->sysif_regs->fault); + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FAULT, fault, 0); + fault = fault & MPI3_SYSIF_FAULT_CODE_MASK; + host_diagnostic = readl(&mrioc->sysif_regs->host_diagnostic); + if (host_diagnostic & MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS) { + if (!mrioc->diagsave_timeout) { + mpi3mr_print_fault_info(mrioc); + ioc_warn(mrioc, "diag save in progress\n"); + mpi3mr_do_dump(mrioc); + } + if ((mrioc->diagsave_timeout++) <= MPI3_SYSIF_DIAG_SAVE_TIMEOUT) + goto schedule_work; + } + + mpi3mr_print_fault_info(mrioc); + mrioc->diagsave_timeout = 0; + + switch (fault) { + case MPI3_SYSIF_FAULT_CODE_COMPLETE_RESET_NEEDED: + case MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED: + ioc_warn(mrioc, + "controller requires system power cycle, marking controller as unrecoverable\n"); + mrioc->unrecoverable = 1; + goto schedule_work; + case MPI3_SYSIF_FAULT_CODE_SOFT_RESET_IN_PROGRESS: + return; + case MPI3_SYSIF_FAULT_CODE_CI_ACTIVATION_RESET: + reset_reason = MPI3MR_RESET_FROM_CIACTIV_FAULT; + break; + default: + break; + } + mpi3mr_soft_reset_handler(mrioc, reset_reason, 0); + return; + +schedule_work: + spin_lock_irqsave(&mrioc->watchdog_lock, flags); + if (mrioc->watchdog_work_q) + queue_delayed_work(mrioc->watchdog_work_q, + &mrioc->watchdog_work, + msecs_to_jiffies(MPI3MR_WATCHDOG_INTERVAL)); + spin_unlock_irqrestore(&mrioc->watchdog_lock, flags); + return; +} + +/** + * mpi3mr_start_watchdog - Start watchdog + * @mrioc: Adapter instance reference + * + * Create and start the watchdog thread to monitor controller + * faults. + * + * Return: Nothing. 
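+ *
+ * Note: the watchdog runs on a dedicated single threaded workqueue
+ * named watchdog_<controller name><id>.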
+ */ +void mpi3mr_start_watchdog(struct mpi3mr_ioc *mrioc) +{ + if (mrioc->watchdog_work_q) + return; + + INIT_DELAYED_WORK(&mrioc->watchdog_work, mpi3mr_watchdog_work); + snprintf(mrioc->watchdog_work_q_name, + sizeof(mrioc->watchdog_work_q_name), "watchdog_%s%d", mrioc->name, + mrioc->id); + mrioc->watchdog_work_q = + create_singlethread_workqueue(mrioc->watchdog_work_q_name); + if (!mrioc->watchdog_work_q) { + ioc_err(mrioc, "%s: failed (line=%d)\n", __func__, __LINE__); + return; + } + + if (mrioc->watchdog_work_q) + queue_delayed_work(mrioc->watchdog_work_q, + &mrioc->watchdog_work, + msecs_to_jiffies(MPI3MR_WATCHDOG_INTERVAL)); +} + +/** + * mpi3mr_stop_watchdog - Stop watchdog + * @mrioc: Adapter instance reference + * + * Stop the watchdog thread created to monitor controller + * faults. + * + * Return: Nothing. + */ +void mpi3mr_stop_watchdog(struct mpi3mr_ioc *mrioc) +{ + unsigned long flags; + struct workqueue_struct *wq; + + spin_lock_irqsave(&mrioc->watchdog_lock, flags); + wq = mrioc->watchdog_work_q; + mrioc->watchdog_work_q = NULL; + spin_unlock_irqrestore(&mrioc->watchdog_lock, flags); + if (wq) { + if (!cancel_delayed_work_sync(&mrioc->watchdog_work)) + flush_workqueue(wq); + destroy_workqueue(wq); + } +} + + +/** + * mpi3mr_free_op_req_q_segments - free request memory segments + * @mrioc: Adapter instance reference + * @q_idx: operational request queue index + * + * Free memory segments allocated for operational request queue + * + * Return: Nothing. + */ +static void mpi3mr_free_op_req_q_segments(struct mpi3mr_ioc *mrioc, u16 q_idx) +{ + u16 j; + int size; + struct segments *segments; + + segments = mrioc->req_qinfo[q_idx].q_segments; + if (!segments) + return; + + if (mrioc->is_segqueue_enabled) { + size = MPI3MR_OP_REQ_Q_SEG_SIZE; + if (mrioc->req_qinfo[q_idx].q_segment_list) { + dma_free_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + mrioc->req_qinfo[q_idx].q_segment_list, + mrioc->req_qinfo[q_idx].q_segment_list_dma); + mrioc->req_qinfo[q_idx].q_segment_list = NULL; + } + } else + size = mrioc->req_qinfo[q_idx].segment_qd * + mrioc->facts.op_req_sz; + + for (j = 0; j < mrioc->req_qinfo[q_idx].num_segments; j++) { + if (!segments[j].segment) + continue; + dma_free_coherent(&mrioc->pdev->dev, + size, segments[j].segment, segments[j].segment_dma); + segments[j].segment = NULL; + } + kfree(mrioc->req_qinfo[q_idx].q_segments); + mrioc->req_qinfo[q_idx].q_segments = NULL; + mrioc->req_qinfo[q_idx].qid = 0; +} + +/** + * mpi3mr_free_op_reply_q_segments - free reply memory segments + * @mrioc: Adapter instance reference + * @q_idx: operational reply queue index + * + * Free memory segments allocated for operational reply queue + * + * Return: Nothing. 
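+ *
+ * Note: when segmented queues are enabled, the segment list is freed
+ * along with the individual reply queue segments.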
+ */ +static void mpi3mr_free_op_reply_q_segments(struct mpi3mr_ioc *mrioc, u16 q_idx) +{ + u16 j; + int size; + struct segments *segments; + + segments = mrioc->op_reply_qinfo[q_idx].q_segments; + if (!segments) + return; + + if (mrioc->is_segqueue_enabled) { + size = MPI3MR_OP_REP_Q_SEG_SIZE; + if (mrioc->op_reply_qinfo[q_idx].q_segment_list) { + dma_free_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + mrioc->op_reply_qinfo[q_idx].q_segment_list, + mrioc->op_reply_qinfo[q_idx].q_segment_list_dma); + mrioc->op_reply_qinfo[q_idx].q_segment_list = NULL; + } + } else + size = mrioc->op_reply_qinfo[q_idx].segment_qd * + mrioc->op_reply_desc_sz; + + for (j = 0; j < mrioc->op_reply_qinfo[q_idx].num_segments; j++) { + if (!segments[j].segment) + continue; + dma_free_coherent(&mrioc->pdev->dev, + size, segments[j].segment, segments[j].segment_dma); + segments[j].segment = NULL; + } + + kfree(mrioc->op_reply_qinfo[q_idx].q_segments); + mrioc->op_reply_qinfo[q_idx].q_segments = NULL; + mrioc->op_reply_qinfo[q_idx].qid = 0; +} + + +/** + * mpi3mr_delete_op_reply_q - delete operational reply queue + * @mrioc: Adapter instance reference + * @qidx: operational reply queue index + * + * Delete operational reply queue by issuing MPI request + * through admin queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_delete_op_reply_q(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct mpi3_delete_reply_queue_request delq_req; + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + int retval = 0; + u16 reply_qid = 0, midx; + + reply_qid = op_reply_q->qid; + + midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(qidx, mrioc->op_reply_q_offset); + + if (!reply_qid) { + retval = -1; + ioc_err(mrioc, "delete reply queue failed due to invalid reply queue id\n"); + goto out; + } + + (op_reply_q->qtype == MPI3MR_DEFAULT_QUEUE) ? 
mrioc->default_qcount-- : + mrioc->active_poll_qcount--; + + memset(&delq_req, 0, sizeof(delq_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending delete reply queue failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + delq_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + delq_req.function = MPI3_FUNCTION_DELETE_REPLY_QUEUE; + delq_req.queue_id = cpu_to_le16(reply_qid); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &delq_req, sizeof(delq_req), + 1); + if (retval) { + ioc_err(mrioc, "posting delete reply queue failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "delete reply queue timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DELREPQ_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "delete reply queue returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + mrioc->intr_info[midx].op_reply_q = NULL; + + mpi3mr_free_op_reply_q_segments(mrioc, qidx); +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + +/** + * mpi3mr_alloc_op_reply_q_segments -Alloc segmented reply pool + * @mrioc: Adapter instance reference + * @qidx: request queue index + * + * Allocate segmented memory pools for operational reply + * queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_alloc_op_reply_q_segments(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + int i, size; + u64 *q_segment_list_entry = NULL; + struct segments *segments; + + if (mrioc->is_segqueue_enabled) { + op_reply_q->segment_qd = + MPI3MR_OP_REP_Q_SEG_SIZE / mrioc->op_reply_desc_sz; + + size = MPI3MR_OP_REP_Q_SEG_SIZE; + + op_reply_q->q_segment_list = + dma_zalloc_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + &op_reply_q->q_segment_list_dma, GFP_KERNEL); + if (!op_reply_q->q_segment_list) + return -ENOMEM; + q_segment_list_entry = (u64 *)op_reply_q->q_segment_list; + } else { + op_reply_q->segment_qd = op_reply_q->num_replies; + size = op_reply_q->num_replies * mrioc->op_reply_desc_sz; + } + + op_reply_q->num_segments = DIV_ROUND_UP(op_reply_q->num_replies, + op_reply_q->segment_qd); + + op_reply_q->q_segments = kcalloc(op_reply_q->num_segments, + sizeof(struct segments), GFP_KERNEL); + if (!op_reply_q->q_segments) + return -ENOMEM; + + segments = op_reply_q->q_segments; + for (i = 0; i < op_reply_q->num_segments; i++) { + segments[i].segment = + dma_zalloc_coherent(&mrioc->pdev->dev, + size, &segments[i].segment_dma, GFP_KERNEL); + if (!segments[i].segment) + return -ENOMEM; + if (mrioc->is_segqueue_enabled) + q_segment_list_entry[i] = + (unsigned long)segments[i].segment_dma; + } + + return 0; +} + +/** + * mpi3mr_alloc_op_req_q_segments - Alloc segmented req pool. 
+ * @mrioc: Adapter instance reference + * @qidx: request queue index + * + * Allocate segmented memory pools for operational request + * queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_alloc_op_req_q_segments(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_req_qinfo *op_req_q = mrioc->req_qinfo + qidx; + int i, size; + u64 *q_segment_list_entry = NULL; + struct segments *segments; + + if (mrioc->is_segqueue_enabled) { + op_req_q->segment_qd = + MPI3MR_OP_REQ_Q_SEG_SIZE / mrioc->facts.op_req_sz; + + size = MPI3MR_OP_REQ_Q_SEG_SIZE; + + op_req_q->q_segment_list = + dma_zalloc_coherent(&mrioc->pdev->dev, + MPI3MR_MAX_SEG_LIST_SIZE, + &op_req_q->q_segment_list_dma, GFP_KERNEL); + if (!op_req_q->q_segment_list) + return -ENOMEM; + q_segment_list_entry = (u64 *)op_req_q->q_segment_list; + + } else { + op_req_q->segment_qd = op_req_q->num_requests; + size = op_req_q->num_requests * mrioc->facts.op_req_sz; + } + + op_req_q->num_segments = DIV_ROUND_UP(op_req_q->num_requests, + op_req_q->segment_qd); + + op_req_q->q_segments = kcalloc(op_req_q->num_segments, + sizeof(struct segments), GFP_KERNEL); + if (!op_req_q->q_segments) + return -ENOMEM; + + segments = op_req_q->q_segments; + for (i = 0; i < op_req_q->num_segments; i++) { + segments[i].segment = + dma_zalloc_coherent(&mrioc->pdev->dev, + size, &segments[i].segment_dma, GFP_KERNEL); + if (!segments[i].segment) + return -ENOMEM; + if (mrioc->is_segqueue_enabled) + q_segment_list_entry[i] = + (unsigned long)segments[i].segment_dma; + } + + return 0; +} + +/** + * mpi3mr_create_op_reply_q - create operational reply queue + * @mrioc: Adapter instance reference + * @qidx: operational reply queue index + * + * Create operational reply queue by issuing MPI request + * through admin queue. + * + * Return: 0 on success, non-zero on failure. 
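+ *
+ * Note: reply queues mapped beyond the interrupt serviced MSI-x
+ * vectors are created as poll queues without the interrupt enable
+ * flag.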
+ */ +static int mpi3mr_create_op_reply_q(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct mpi3_create_reply_queue_request create_req; + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + int retval = 0; + u16 reply_qid = 0, midx; + + reply_qid = op_reply_q->qid; + + midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(qidx, mrioc->op_reply_q_offset); + + if (reply_qid) { + retval = -1; + ioc_err(mrioc, "create reply queue failed due to duplicate qid(%d)\n", + reply_qid); + + return retval; + } + + reply_qid = qidx + 1; + op_reply_q->num_replies = MPI3MR_OP_REP_Q_QD; + if (!mrioc->pdev->revision) + op_reply_q->num_replies = MPI3MR_OP_REP_Q_QD4K; + op_reply_q->ci = 0; + op_reply_q->ephase = 1; + atomic_set(&op_reply_q->pend_ios, 0); + atomic_set(&op_reply_q->in_use, 0); + op_reply_q->enable_irq_poll = false; + + if (!op_reply_q->q_segments) { + retval = mpi3mr_alloc_op_reply_q_segments(mrioc, qidx); + if (retval) { + mpi3mr_free_op_reply_q_segments(mrioc, qidx); + goto out; + } + } + + memset(&create_req, 0, sizeof(create_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending create reply queue failed due to command in use\n"); + goto out_unlock; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + create_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + create_req.function = MPI3_FUNCTION_CREATE_REPLY_QUEUE; + create_req.queue_id = cpu_to_le16(reply_qid); + + if (midx < (mrioc->intr_info_count - mrioc->requested_poll_qcount)) + op_reply_q->qtype = MPI3MR_DEFAULT_QUEUE; + else + op_reply_q->qtype = MPI3MR_POLL_QUEUE; + + if (op_reply_q->qtype == MPI3MR_DEFAULT_QUEUE) { + create_req.flags = + MPI3_CREATE_REPLY_QUEUE_FLAGS_INT_ENABLE_ENABLE; + create_req.msix_index = + cpu_to_le16(mrioc->intr_info[midx].msix_index); + } else { + create_req.msix_index = cpu_to_le16(mrioc->intr_info_count - 1); + ioc_info(mrioc, "create reply queue(polled): for qid(%d), midx(%d)\n", + reply_qid, midx); + if (!mrioc->active_poll_qcount) + disable_irq_nosync(pci_irq_vector(mrioc->pdev, + mrioc->intr_info_count - 1)); + } + + if (mrioc->is_segqueue_enabled) { + create_req.flags |= + MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_SEGMENTED; + create_req.base_address = cpu_to_le64( + op_reply_q->q_segment_list_dma); + } else + create_req.base_address = cpu_to_le64( + op_reply_q->q_segments[0].segment_dma); + + create_req.size = cpu_to_le16(op_reply_q->num_replies); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &create_req, + sizeof(create_req), 1); + if (retval) { + ioc_err(mrioc, "posting create reply queue failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "create reply queue timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_CREATEREPQ_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "create reply queue returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + op_reply_q->qid = reply_qid; + if (midx < mrioc->intr_info_count) + mrioc->intr_info[midx].op_reply_q = op_reply_q; + + (op_reply_q->qtype == 
MPI3MR_DEFAULT_QUEUE) ? mrioc->default_qcount++ : + mrioc->active_poll_qcount++; + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + + return retval; +} + +/** + * mpi3mr_create_op_req_q - create operational request queue + * @mrioc: Adapter instance reference + * @idx: operational request queue index + * @reply_qid: Reply queue ID + * + * Create operational request queue by issuing MPI request + * through admin queue. + * + * Return: 0 on success, non-zero on failure. + */ +static int mpi3mr_create_op_req_q(struct mpi3mr_ioc *mrioc, u16 idx, + u16 reply_qid) +{ + struct mpi3_create_request_queue_request create_req; + struct op_req_qinfo *op_req_q = mrioc->req_qinfo + idx; + int retval = 0; + u16 req_qid = 0; + + + req_qid = op_req_q->qid; + + if (req_qid) { + retval = -1; + ioc_err(mrioc, "create request queue failed due to duplicate qid(%d)\n", + req_qid); + + return retval; + } + req_qid = idx + 1; + + op_req_q->num_requests = MPI3MR_OP_REQ_Q_QD; + op_req_q->ci = 0; + op_req_q->pi = 0; + op_req_q->reply_qid = reply_qid; + op_req_q->last_full_host_tag = MPI3MR_HOSTTAG_INVALID; + op_req_q->qfull_io_count = 0; + op_req_q->qfull_instances = 0; + spin_lock_init(&op_req_q->q_lock); + + if (!op_req_q->q_segments) { + retval = mpi3mr_alloc_op_req_q_segments(mrioc, idx); + if (retval) { + mpi3mr_free_op_req_q_segments(mrioc, idx); + goto out; + } + } + + memset(&create_req, 0, sizeof(create_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending create request queue failed due to command in use\n"); + goto out_unlock; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + create_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + create_req.function = MPI3_FUNCTION_CREATE_REQUEST_QUEUE; + create_req.queue_id = cpu_to_le16(req_qid); + if (mrioc->is_segqueue_enabled) { + create_req.flags = + MPI3_CREATE_REQUEST_QUEUE_FLAGS_SEGMENTED_SEGMENTED; + create_req.base_address = cpu_to_le64( + op_req_q->q_segment_list_dma); + } else + create_req.base_address = cpu_to_le64( + op_req_q->q_segments[0].segment_dma); + create_req.reply_queue_id = cpu_to_le16(reply_qid); + create_req.size = cpu_to_le16(op_req_q->num_requests); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &create_req, + sizeof(create_req), 1); + if (retval) { + ioc_err(mrioc, "posting create request queue failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "create request queue timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_CREATEREQQ_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "create request queue returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + op_req_q->qid = req_qid; + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + + return retval; +} + +/** + * mpi3mr_create_op_queues - create operational queue pairs + * @mrioc: Adapter instance reference + * + * Allocate memory for operational queue meta data and call 
+ * create request and reply queue functions. + * + * Return: 0 on success, non-zero on failures. + */ +static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u16 num_queues = 0, i = 0, msix_count_op_q = 1; + + num_queues = min_t(int, mrioc->facts.max_op_reply_q, + mrioc->facts.max_op_req_q); + + msix_count_op_q = + mrioc->intr_info_count - mrioc->op_reply_q_offset; + if (!mrioc->num_queues) + mrioc->num_queues = min_t(int, num_queues, msix_count_op_q); + /* + * During reset set the num_queues to the number of queues + * that was set before the reset. + */ + num_queues = mrioc->num_op_reply_q ? + mrioc->num_op_reply_q : mrioc->num_queues; + ioc_info(mrioc, "trying to create %d operational queue pairs\n", + num_queues); + + if (!mrioc->req_qinfo) { + mrioc->req_qinfo = kcalloc(num_queues, + sizeof(struct op_req_qinfo), GFP_KERNEL); + if (!mrioc->req_qinfo) { + retval = -1; + goto out_failed; + } + + mrioc->op_reply_qinfo = kzalloc(sizeof(struct op_reply_qinfo) * + num_queues, GFP_KERNEL); + if (!mrioc->op_reply_qinfo) { + retval = -1; + goto out_failed; + } + } + + if (mrioc->is_segqueue_enabled) + ioc_info(mrioc, + "allocating operational queues through segmented queues\n"); + + for (i = 0; i < num_queues; i++) { + if (mpi3mr_create_op_reply_q(mrioc, i)) { + ioc_err(mrioc, + "cannot create operational reply queue %d\n", i); + break; + } + if (mpi3mr_create_op_req_q(mrioc, i, + mrioc->op_reply_qinfo[i].qid)) { + ioc_err(mrioc, + "cannot create operational request queue %d\n", i); + mpi3mr_delete_op_reply_q(mrioc, i); + break; + } + } + + if (i == 0) { + /* Not even one queue is created successfully*/ + retval = -1; + goto out_failed; + } + mrioc->num_op_reply_q = mrioc->num_op_req_q = i; + ioc_info(mrioc, "successfully created %d operational queue pairs(default/polled) queue = (%d/%d)\n", + mrioc->num_op_reply_q, mrioc->default_qcount, + mrioc->active_poll_qcount); + + return retval; +out_failed: + kfree(mrioc->req_qinfo); + mrioc->req_qinfo = NULL; + + kfree(mrioc->op_reply_qinfo); + mrioc->op_reply_qinfo = NULL; + + + return retval; +} + +/** + * mpi3mr_setup_admin_qpair - Setup admin queue pair + * @mrioc: Adapter instance reference + * + * Allocate memory for admin queue pair if required and register + * the admin queue with the controller. + * + * Return: 0 on success, non-zero on failures. 
+ */ +static int mpi3mr_setup_admin_qpair(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 num_admin_entries = 0; + + mrioc->admin_req_q_sz = MPI3MR_ADMIN_REQ_Q_SIZE; + mrioc->num_admin_req = mrioc->admin_req_q_sz / + MPI3MR_ADMIN_REQ_FRAME_SZ; + mrioc->admin_req_ci = mrioc->admin_req_pi = 0; + mrioc->admin_req_base = NULL; + + mrioc->admin_reply_q_sz = MPI3MR_ADMIN_REPLY_Q_SIZE; + mrioc->num_admin_replies = mrioc->admin_reply_q_sz / + MPI3MR_ADMIN_REPLY_FRAME_SZ; + mrioc->admin_reply_ci = 0; + mrioc->admin_reply_ephase = 1; + mrioc->admin_reply_base = NULL; + atomic_set(&mrioc->admin_reply_q_in_use, 0); + + if (!mrioc->admin_req_base) { + mrioc->admin_req_base = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->admin_req_q_sz, &mrioc->admin_req_dma, GFP_KERNEL); + + if (!mrioc->admin_req_base) { + retval = -1; + goto out_failed; + } + + mrioc->admin_reply_base = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->admin_reply_q_sz, &mrioc->admin_reply_dma, + GFP_KERNEL); + + if (!mrioc->admin_reply_base) { + retval = -1; + goto out_failed; + } + + } + + num_admin_entries = (mrioc->num_admin_replies << 16) | + (mrioc->num_admin_req); + writel(num_admin_entries, &mrioc->sysif_regs->admin_queue_num_entries); + mpi3mr_writeq(mrioc->admin_req_dma, + &mrioc->sysif_regs->admin_request_queue_address, + &mrioc->adm_req_q_bar_writeq_lock); + mpi3mr_writeq(mrioc->admin_reply_dma, + &mrioc->sysif_regs->admin_reply_queue_address, + &mrioc->adm_reply_q_bar_writeq_lock); + writel(mrioc->admin_req_pi, &mrioc->sysif_regs->admin_request_queue_pi); + writel(mrioc->admin_reply_ci, &mrioc->sysif_regs->admin_reply_queue_ci); + return retval; + +out_failed: + + if (mrioc->admin_reply_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_reply_q_sz, + mrioc->admin_reply_base, mrioc->admin_reply_dma); + mrioc->admin_reply_base = NULL; + } + if (mrioc->admin_req_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_req_q_sz, + mrioc->admin_req_base, mrioc->admin_req_dma); + mrioc->admin_req_base = NULL; + } + return retval; +} + +/** + * mpi3mr_process_factsdata - Process IOC facts data + * @mrioc: Adapter instance reference + * @facts_data: IOC facts data pointer + * + * Convert IOC facts data into cpu endianness and cache it in + * the driver . + * + * Return: Nothing. 
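+ *
+ * Note: the operational request and reply descriptor sizes are
+ * derived from the IOC configuration register, and the request frame
+ * size is cross checked against the value reported in the facts data.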
+ */ +static void mpi3mr_process_factsdata(struct mpi3mr_ioc *mrioc, + struct mpi3_ioc_facts_data *facts_data) +{ + u32 ioc_config, req_sz, facts_flags; + + if ((le16_to_cpu(facts_data->ioc_facts_data_length)) != + (sizeof(*facts_data) / 4)) + ioc_warn(mrioc, + "ioc_facts data length mismatch driver_sz(%ld), firmware_sz(%d)\n", + sizeof(*facts_data), + le16_to_cpu(facts_data->ioc_facts_data_length) * 4); + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + req_sz = 1 << ((ioc_config & MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ) >> + MPI3_SYSIF_IOC_CONFIG_OPER_REQ_ENT_SZ_SHIFT); + if (le16_to_cpu(facts_data->ioc_request_frame_size) != (req_sz/4)) + ioc_warn(mrioc, + "ioc_facts request frame size mismatch hardware_size(%d), firmware_sz(%d)\n", + req_sz / 4, + le16_to_cpu(facts_data->ioc_request_frame_size)); + + memset(&mrioc->facts, 0, sizeof(mrioc->facts)); + + facts_flags = le32_to_cpu(facts_data->flags); + mrioc->facts.op_req_sz = req_sz; + mrioc->op_reply_desc_sz = 1 << ((ioc_config & + MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ) >> + MPI3_SYSIF_IOC_CONFIG_OPER_RPY_ENT_SZ_SHIFT); + + mrioc->facts.ioc_num = facts_data->ioc_number; + mrioc->facts.who_init = facts_data->who_init; + mrioc->facts.max_msix_vectors = le16_to_cpu(facts_data->max_msix_vectors); + mrioc->facts.personality = (facts_flags & + MPI3_IOCFACTS_FLAGS_PERSONALITY_MASK); + mrioc->facts.dma_mask = (facts_flags & + MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_MASK) >> + MPI3_IOCFACTS_FLAGS_DMA_ADDRESS_WIDTH_SHIFT; + mrioc->facts.protocol_flags = facts_data->protocol_flags; + mrioc->facts.mpi_version = le32_to_cpu(facts_data->mpi_version.word); + mrioc->facts.max_reqs = + le16_to_cpu(facts_data->max_outstanding_requests); + mrioc->facts.product_id = le16_to_cpu(facts_data->product_id); + mrioc->facts.reply_sz = le16_to_cpu(facts_data->reply_frame_size) * 4; + mrioc->facts.exceptions = le16_to_cpu(facts_data->ioc_exceptions); + mrioc->facts.max_perids = le16_to_cpu(facts_data->max_persistent_id); + mrioc->facts.max_vds = le16_to_cpu(facts_data->max_vds); + mrioc->facts.max_hpds = le16_to_cpu(facts_data->max_host_pds); + mrioc->facts.max_advhpds = le16_to_cpu(facts_data->max_adv_host_pds); + mrioc->facts.max_raid_pds = le16_to_cpu(facts_data->max_raid_pds); + mrioc->facts.max_nvme = le16_to_cpu(facts_data->max_nvme); + mrioc->facts.max_pcie_switches = + le16_to_cpu(facts_data->max_pcie_switches); + mrioc->facts.max_sasexpanders = + le16_to_cpu(facts_data->max_sas_expanders); + mrioc->facts.max_sasinitiators = + le16_to_cpu(facts_data->max_sas_initiators); + mrioc->facts.max_enclosures = le16_to_cpu(facts_data->max_enclosures); + mrioc->facts.min_devhandle = le16_to_cpu(facts_data->min_dev_handle); + mrioc->facts.max_devhandle = le16_to_cpu(facts_data->max_dev_handle); + mrioc->facts.max_op_req_q = + le16_to_cpu(facts_data->max_operational_request_queues); + mrioc->facts.max_op_reply_q = + le16_to_cpu(facts_data->max_operational_reply_queues); + mrioc->facts.ioc_capabilities = + le32_to_cpu(facts_data->ioc_capabilities); + mrioc->facts.fw_ver.build_num = + le16_to_cpu(facts_data->fw_version.build_num); + mrioc->facts.fw_ver.cust_id = + le16_to_cpu(facts_data->fw_version.customer_id); + mrioc->facts.fw_ver.ph_minor = facts_data->fw_version.phase_minor; + mrioc->facts.fw_ver.ph_major = facts_data->fw_version.phase_major; + mrioc->facts.fw_ver.gen_minor = facts_data->fw_version.gen_minor; + mrioc->facts.fw_ver.gen_major = facts_data->fw_version.gen_major; + mrioc->msix_count = min_t(int, mrioc->msix_count, + 
mrioc->facts.max_msix_vectors); + mrioc->facts.sge_mod_mask = facts_data->sge_modifier_mask; + mrioc->facts.sge_mod_value = facts_data->sge_modifier_value; + mrioc->facts.sge_mod_shift = facts_data->sge_modifier_shift; + mrioc->facts.shutdown_timeout = + le16_to_cpu(facts_data->shutdown_timeout); + mrioc->facts.diag_trace_sz = + le32_to_cpu(facts_data->diag_trace_size); + mrioc->facts.diag_fw_sz = + le32_to_cpu(facts_data->diag_fw_size); + mrioc->facts.diag_drvr_sz = le32_to_cpu(facts_data->diag_driver_size); + + mrioc->facts.max_dev_per_tg = + facts_data->max_devices_per_throttle_group; + mrioc->facts.io_throttle_data_length = + le16_to_cpu(facts_data->io_throttle_data_length); + mrioc->facts.max_io_throttle_group = + le16_to_cpu(facts_data->max_io_throttle_group); + mrioc->facts.io_throttle_low = le16_to_cpu(facts_data->io_throttle_low); + mrioc->facts.io_throttle_high = + le16_to_cpu(facts_data->io_throttle_high); + + /*Store in 512b block count*/ + if (mrioc->facts.io_throttle_data_length) + mrioc->io_throttle_data_length = + (mrioc->facts.io_throttle_data_length * 2 * 4); + else + /* set the length to 1MB + 1K to disable throttle*/ + mrioc->io_throttle_data_length = MPI3MR_MAX_SECTORS + 2; + + mrioc->io_throttle_high = (mrioc->facts.io_throttle_high * 2 * 1024); + mrioc->io_throttle_low = (mrioc->facts.io_throttle_low * 2 * 1024); + + ioc_info(mrioc, + "ioc_num(%d), max_op_req_queues (%d), max_op_reply_queues(%d), max_requests(%d), max_msix_vectors(%d)\n", + mrioc->facts.ioc_num, mrioc->facts.max_op_req_q, + mrioc->facts.max_op_reply_q, mrioc->facts.max_reqs, + mrioc->facts.max_msix_vectors); + ioc_info(mrioc, + "max_device_handles(%d), min_device_handles(%d), max_perst_ids(%d)\n", + mrioc->facts.max_devhandle, mrioc->facts.min_devhandle, + mrioc->facts.max_perids); + ioc_info(mrioc, + "sge_modifier_mask(0x%02x), sge_modifier_value(0x%02x), sge_modifier_shift(0x%02x)\n", + mrioc->facts.sge_mod_mask, mrioc->facts.sge_mod_value, + mrioc->facts.sge_mod_shift); + ioc_info(mrioc, "dma_mask(%d), initial_port_enable_status(0x%02x)\n", + mrioc->facts.dma_mask, (facts_flags & + MPI3_IOCFACTS_FLAGS_INITIAL_PORT_ENABLE_MASK)); + ioc_info(mrioc, + "diag_trace_sz(%dKB), diag_fw_size(%dKB), diag_drvr_sizez(%dKB)\n", + mrioc->facts.diag_trace_sz / 1024, mrioc->facts.diag_fw_sz / 1024, + mrioc->facts.diag_drvr_sz / 1024); + ioc_info(mrioc, + "max_dev_per_throttle_group(%d), max_throttle_groups(%d), io_throttle_data_len(%dKiB), io_throttle_high(%dMiB), io_throttle_low(%dMiB)\n", + mrioc->facts.max_dev_per_tg, mrioc->facts.max_io_throttle_group, + mrioc->facts.io_throttle_data_length * 4, + mrioc->facts.io_throttle_high, mrioc->facts.io_throttle_low); + +} + +/** + * mpi3mr_issue_iocfacts - Send IOC Facts + * @mrioc: Adapter instance reference + * @facts_data: IOC facts data pointer + * + * Issue IOC Facts MPI request through admin queue and wait for + * the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
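+ *
+ * Note: the facts data is fetched into a coherent DMA buffer and is
+ * converted to CPU endianness by mpi3mr_process_factsdata() before
+ * this function returns.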
+ */ +static int mpi3mr_issue_iocfacts(struct mpi3mr_ioc *mrioc, + struct mpi3_ioc_facts_data *facts_data) +{ + struct mpi3_ioc_facts_request iocfacts_req; + void *data = NULL; + dma_addr_t data_dma; + u32 data_len = sizeof(*facts_data); + int retval = 0; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + data = dma_zalloc_coherent(&mrioc->pdev->dev, data_len, &data_dma, + GFP_KERNEL); + + if (!data) { + retval = -1; + goto out; + } + + memset(&iocfacts_req, 0, sizeof(iocfacts_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "getting ioc_facts failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + iocfacts_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + iocfacts_req.function = MPI3_FUNCTION_IOC_FACTS; + + mpi3mr_add_sg_single(&iocfacts_req.sgl, sgl_flags, data_len, + data_dma); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &iocfacts_req, + sizeof(iocfacts_req), 1); + if (retval) { + ioc_err(mrioc, "posting ioc_facts request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "ioc_facts timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_IOCFACTS_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "ioc_facts returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + memcpy(facts_data, (u8 *)data, data_len); + mpi3mr_process_factsdata(mrioc, facts_data); +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + if (data) + dma_free_coherent(&mrioc->pdev->dev, data_len, data, data_dma); + + return retval; +} + +/** + * mpi3mr_check_reset_dma_mask - Process IOC facts data + * @mrioc: Adapter instance reference + * + * Check whether the new DMA mask requested through IOCFacts by + * firmware needs to be set, if so set it . + * + * Return: 0 on success, non-zero on failure. + */ +static inline int mpi3mr_check_reset_dma_mask(struct mpi3mr_ioc *mrioc) +{ + struct pci_dev *pdev = mrioc->pdev; + int r; + u64 facts_dma_mask = DMA_BIT_MASK(mrioc->facts.dma_mask); + + if (!mrioc->facts.dma_mask || (mrioc->dma_mask <= facts_dma_mask)) + return 0; + + ioc_info(mrioc, "changing DMA mask from 0x%016llX to 0x%016llX\n", + mrioc->dma_mask, facts_dma_mask); + + r = dma_set_mask_and_coherent(&pdev->dev, facts_dma_mask); + if (r) { + ioc_err(mrioc, "setting DMA mask to 0x%016llX failed: %d\n", + facts_dma_mask, r); + return r; + } + mrioc->dma_mask = facts_dma_mask; + return r; +} + + +/** + * mpi3mr_alloc_reply_sense_bufs - Send IOC Init + * @mrioc: Adapter instance reference + * + * Allocate and initialize the reply free buffers, sense + * buffers, reply free queue and sense buffer queue. + * + * Return: 0 on success, non-zero on failures. 
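+ *
+ * Note: reply buffers are carved from a 16 byte aligned DMA pool,
+ * the reply free queue and sense buffer queue from 8 byte aligned
+ * pools and sense buffers from a 4 byte aligned pool.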
+ */ +static int mpi3mr_alloc_reply_sense_bufs(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 sz, i; + + + if (mrioc->init_cmds.reply) + return retval; + + mrioc->init_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->init_cmds.reply) + goto out_failed; + + mrioc->bsg_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->bsg_cmds.reply) + goto out_failed; + + mrioc->host_tm_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->host_tm_cmds.reply) + goto out_failed; + + mrioc->pel_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->pel_cmds.reply) + goto out_failed; + + mrioc->pel_abort_cmd.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->pel_abort_cmd.reply) + goto out_failed; + + mrioc->transport_cmds.reply = kzalloc(mrioc->reply_sz, GFP_KERNEL); + if (!mrioc->transport_cmds.reply) + goto out_failed; + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) { + mrioc->dev_rmhs_cmds[i].reply = kzalloc(mrioc->reply_sz, + GFP_KERNEL); + if (!mrioc->dev_rmhs_cmds[i].reply) + goto out_failed; + } + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) { + mrioc->sysfs_tm_cmds[i].reply = kzalloc(mrioc->reply_sz, + GFP_KERNEL); + if (!mrioc->sysfs_tm_cmds[i].reply) + goto out_failed; + } + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) { + mrioc->evtack_cmds[i].reply = kzalloc(mrioc->reply_sz, + GFP_KERNEL); + if (!mrioc->evtack_cmds[i].reply) + goto out_failed; + } + mrioc->dev_handle_bitmap_sz = mrioc->facts.max_devhandle / 8; + if (mrioc->facts.max_devhandle % 8) + mrioc->dev_handle_bitmap_sz++; + mrioc->removepend_bitmap = kzalloc(mrioc->dev_handle_bitmap_sz, + GFP_KERNEL); + if (!mrioc->removepend_bitmap) + goto out_failed; + + mrioc->devrem_bitmap_sz = MPI3MR_NUM_DEVRMCMD / 8; + if (MPI3MR_NUM_DEVRMCMD % 8) + mrioc->devrem_bitmap_sz++; + mrioc->devrem_bitmap = kzalloc(mrioc->devrem_bitmap_sz, + GFP_KERNEL); + if (!mrioc->devrem_bitmap) + goto out_failed; + + mrioc->evtack_cmds_bitmap_sz = MPI3MR_NUM_EVTACKCMD / 8; + if (MPI3MR_NUM_EVTACKCMD % 8) + mrioc->evtack_cmds_bitmap_sz++; + mrioc->evtack_cmds_bitmap = kzalloc(mrioc->evtack_cmds_bitmap_sz, + GFP_KERNEL); + if (!mrioc->evtack_cmds_bitmap) + goto out_failed; + + mrioc->num_reply_bufs = mrioc->facts.max_reqs + MPI3MR_NUM_EVT_REPLIES; + mrioc->reply_free_qsz = mrioc->num_reply_bufs + 1; + mrioc->num_sense_bufs = mrioc->facts.max_reqs / MPI3MR_SENSEBUF_FACTOR; + mrioc->sense_buf_q_sz = mrioc->num_sense_bufs + 1; + + /* reply buffer pool, 16 byte align */ + sz = mrioc->num_reply_bufs * mrioc->reply_sz; + mrioc->reply_buf_pool = dma_pool_create("reply_buf pool", + &mrioc->pdev->dev, sz, 16, 0); + if (!mrioc->reply_buf_pool) { + ioc_err(mrioc, "reply buf pool: dma_pool_create failed\n"); + goto out_failed; + } + + mrioc->reply_buf = dma_pool_zalloc(mrioc->reply_buf_pool, GFP_KERNEL, + &mrioc->reply_buf_dma); + if (!mrioc->reply_buf) + goto out_failed; + + mrioc->reply_buf_dma_max_address = mrioc->reply_buf_dma + sz; + + /* reply free queue, 8 byte align */ + sz = mrioc->reply_free_qsz * 8; + mrioc->reply_free_q_pool = dma_pool_create("reply_free_q pool", + &mrioc->pdev->dev, sz, 8, 0); + if (!mrioc->reply_free_q_pool) { + ioc_err(mrioc, "reply_free_q pool: dma_pool_create failed\n"); + goto out_failed; + } + mrioc->reply_free_q = dma_pool_zalloc(mrioc->reply_free_q_pool, + GFP_KERNEL, &mrioc->reply_free_q_dma); + if (!mrioc->reply_free_q) + goto out_failed; + + /* sense buffer pool, 4 byte align */ + sz = mrioc->num_sense_bufs * MPI3MR_SENSE_BUF_SZ; + mrioc->sense_buf_pool = dma_pool_create("sense_buf 
pool", + &mrioc->pdev->dev, sz, 4, 0); + if (!mrioc->sense_buf_pool) { + ioc_err(mrioc, "sense_buf pool: dma_pool_create failed\n"); + goto out_failed; + } + mrioc->sense_buf = dma_pool_zalloc(mrioc->sense_buf_pool, GFP_KERNEL, + &mrioc->sense_buf_dma); + if (!mrioc->sense_buf) + goto out_failed; + + /* sense buffer queue, 8 byte align */ + sz = mrioc->sense_buf_q_sz * 8; + mrioc->sense_buf_q_pool = dma_pool_create("sense_buf_q pool", + &mrioc->pdev->dev, sz, 8, 0); + if (!mrioc->sense_buf_q_pool) { + ioc_err(mrioc, "sense_buf_q pool: dma_pool_create failed\n"); + goto out_failed; + } + mrioc->sense_buf_q = dma_pool_zalloc(mrioc->sense_buf_q_pool, + GFP_KERNEL, &mrioc->sense_buf_q_dma); + if (!mrioc->sense_buf_q) + goto out_failed; + + return retval; + +out_failed: + retval = -1; + return retval; +} + +/** + * mpimr_initialize_reply_sbuf_queues - initialize reply sense + * buffers + * @mrioc: Adapter instance reference + * + * Helper function to initialize reply and sense buffers along + * with some debug prints. + * + * Return: None. + */ +static void mpimr_initialize_reply_sbuf_queues(struct mpi3mr_ioc *mrioc) +{ + u32 sz, i; + dma_addr_t phy_addr; + + sz = mrioc->num_reply_bufs * mrioc->reply_sz; + ioc_info(mrioc, + "reply buf pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), reply_dma(0x%llx)\n", + mrioc->reply_buf, mrioc->num_reply_bufs, mrioc->reply_sz, + (sz / 1024), (unsigned long long)mrioc->reply_buf_dma); + sz = mrioc->reply_free_qsz * 8; + ioc_info(mrioc, + "reply_free_q pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), reply_dma(0x%llx)\n", + mrioc->reply_free_q, mrioc->reply_free_qsz, 8, (sz / 1024), + (unsigned long long)mrioc->reply_free_q_dma); + sz = mrioc->num_sense_bufs * MPI3MR_SENSE_BUF_SZ; + ioc_info(mrioc, + "sense_buf pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), sense_dma(0x%llx)\n", + mrioc->sense_buf, mrioc->num_sense_bufs, MPI3MR_SENSE_BUF_SZ, + (sz / 1024), (unsigned long long)mrioc->sense_buf_dma); + sz = mrioc->sense_buf_q_sz * 8; + ioc_info(mrioc, + "sense_buf_q pool(0x%p): depth(%d), frame_size(%d), pool_size(%d kB), sense_dma(0x%llx)\n", + mrioc->sense_buf_q, mrioc->sense_buf_q_sz, 8, (sz / 1024), + (unsigned long long)mrioc->sense_buf_q_dma); + + /* initialize Reply buffer Queue */ + for (i = 0, phy_addr = mrioc->reply_buf_dma; + i < mrioc->num_reply_bufs; i++, phy_addr += mrioc->reply_sz) + mrioc->reply_free_q[i] = cpu_to_le64(phy_addr); + mrioc->reply_free_q[i] = cpu_to_le64(0); + + /* initialize Sense Buffer Queue */ + for (i = 0, phy_addr = mrioc->sense_buf_dma; + i < mrioc->num_sense_bufs; i++, phy_addr += MPI3MR_SENSE_BUF_SZ) + mrioc->sense_buf_q[i] = cpu_to_le64(phy_addr); + mrioc->sense_buf_q[i] = cpu_to_le64(0); +} + +/** + * mpi3mr_issue_iocinit - Send IOC Init + * @mrioc: Adapter instance reference + * + * Issue IOC Init MPI request through admin queue and wait for + * the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. 
+ */ +static int mpi3mr_issue_iocinit(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_ioc_init_request iocinit_req; + struct mpi3_driver_info_layout *drv_info; + dma_addr_t data_dma; + u32 data_len = sizeof(*drv_info); + int retval = 0; + ktime_t current_time; + + drv_info = dma_zalloc_coherent(&mrioc->pdev->dev, data_len, &data_dma, + GFP_KERNEL); + if (!drv_info) { + retval = -1; + goto out; + } + mpimr_initialize_reply_sbuf_queues(mrioc); + + drv_info->information_length = cpu_to_le32(data_len); + strscpy(drv_info->driver_signature, "Broadcom", sizeof(drv_info->driver_signature)); + strscpy(drv_info->os_name, utsname()->sysname, sizeof(drv_info->os_name)); + strscpy(drv_info->os_version, utsname()->release, sizeof(drv_info->os_version)); + strscpy(drv_info->driver_name, MPI3MR_DRIVER_NAME, sizeof(drv_info->driver_name)); + strscpy(drv_info->driver_version, MPI3MR_DRIVER_VERSION, sizeof(drv_info->driver_version)); + strscpy(drv_info->driver_release_date, MPI3MR_DRIVER_RELDATE, + sizeof(drv_info->driver_release_date)); + drv_info->driver_capabilities = 0; + memcpy((u8 *)&mrioc->driver_info, (u8 *)drv_info, + sizeof(mrioc->driver_info)); + + memset(&iocinit_req, 0, sizeof(iocinit_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending ioc_init failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + iocinit_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + iocinit_req.function = MPI3_FUNCTION_IOC_INIT; + iocinit_req.mpi_version.mpi3_version.dev = MPI3_VERSION_DEV; + iocinit_req.mpi_version.mpi3_version.unit = MPI3_VERSION_UNIT; + iocinit_req.mpi_version.mpi3_version.major = MPI3_VERSION_MAJOR; + iocinit_req.mpi_version.mpi3_version.minor = MPI3_VERSION_MINOR; + iocinit_req.who_init = MPI3_WHOINIT_HOST_DRIVER; + iocinit_req.reply_free_queue_depth = cpu_to_le16(mrioc->reply_free_qsz); + iocinit_req.reply_free_queue_address = + cpu_to_le64(mrioc->reply_free_q_dma); + iocinit_req.sense_buffer_length = cpu_to_le16(MPI3MR_SENSE_BUF_SZ); + iocinit_req.sense_buffer_free_queue_depth = + cpu_to_le16(mrioc->sense_buf_q_sz); + iocinit_req.sense_buffer_free_queue_address = + cpu_to_le64(mrioc->sense_buf_q_dma); + iocinit_req.driver_information_address = cpu_to_le64(data_dma); + + current_time = ktime_get_real(); + iocinit_req.time_stamp = cpu_to_le64(ktime_to_ms(current_time)); + + if (enable_dix) + iocinit_req.msg_flags |= + MPI3_IOCINIT_MSGFLAGS_HOSTMETADATA_SEPARATED; + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &iocinit_req, + sizeof(iocinit_req), 1); + if (retval) { + ioc_err(mrioc, "posting ioc_init failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_IOCINIT_TIMEOUT); + ioc_err(mrioc, "ioc_init timed out\n"); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "ioc_init returned with ioc_status(0x%04x) log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + + mrioc->reply_free_queue_host_index = mrioc->num_reply_bufs; + 
writel(mrioc->reply_free_queue_host_index, + &mrioc->sysif_regs->reply_free_host_index); + + mrioc->sbq_host_index = mrioc->num_sense_bufs; + writel(mrioc->sbq_host_index, + &mrioc->sysif_regs->sense_buffer_free_host_index); +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + +out: + if (drv_info) + dma_free_coherent(&mrioc->pdev->dev, data_len, drv_info, + data_dma); + + return retval; +} + +/** + * mpi3mr_unmask_events - Unmask events in event mask bitmap + * @mrioc: Adapter instance reference + * @event: MPI event ID + * + * Un mask the specific event by resetting the event_mask + * bitmap. + * + * Return: 0 on success, non-zero on failures. + */ +static void mpi3mr_unmask_events(struct mpi3mr_ioc *mrioc, u16 event) +{ + u32 desired_event; + u8 word; + + if (event >= 128) + return; + + desired_event = (1 << (event % 32)); + word = event / 32; + + mrioc->event_masks[word] &= ~desired_event; +} + +/** + * mpi3mr_issue_event_notification - Send event notification + * @mrioc: Adapter instance reference + * + * Issue event notification MPI request through admin queue and + * wait for the completion of it or time out. + * + * Return: 0 on success, non-zero on failures. + */ +static int mpi3mr_issue_event_notification(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_event_notification_request evtnotify_req; + int retval = 0; + u8 i; + + memset(&evtnotify_req, 0, sizeof(evtnotify_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending event notification failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + evtnotify_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + evtnotify_req.function = MPI3_FUNCTION_EVENT_NOTIFICATION; + for (i = 0; i < MPI3_EVENT_NOTIFY_EVENTMASK_WORDS; i++) + evtnotify_req.event_masks[i] = + cpu_to_le32(mrioc->event_masks[i]); + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &evtnotify_req, + sizeof(evtnotify_req), 1); + if (retval) { + ioc_err(mrioc, "posting event notification failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "event notification timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_EVTNOTIFY_TIMEOUT); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "event notification returned with ioc_tatus(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + +/** + * mpi3mr_process_event_ack - Process event acknowledgment + * @mrioc: Adapter instance reference + * @event: MPI3 event ID + * @event_ctx: event context + * + * Send event acknowledgment through admin queue and wait for + * it to complete. + * + * Return: 0 on success, non-zero on failures. 
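
mpi3mr_unmask_events() above treats mrioc->event_masks[] as a 128-bit mask split across 32-bit words: everything starts masked and the driver clears one bit per event it wants delivered. Below is a small standalone sketch of that bookkeeping only, not driver code; the event codes used are made up.

/* Standalone illustration only, not part of this patch. */
#include <stdint.h>
#include <stdio.h>

#define EVENTMASK_WORDS 4                 /* 4 x 32 bits = 128 event codes */

static uint32_t event_masks[EVENTMASK_WORDS];

static void unmask_event(uint8_t event)
{
	if (event >= EVENTMASK_WORDS * 32)
		return;
	/* clear one bit: word index = event / 32, bit index = event % 32 */
	event_masks[event / 32] &= ~(1u << (event % 32));
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < EVENTMASK_WORDS; i++)
		event_masks[i] = 0xffffffffu;     /* start with every event masked */

	unmask_event(0x03);                       /* hypothetical event codes */
	unmask_event(0x21);

	for (i = 0; i < EVENTMASK_WORDS; i++)
		printf("word %u = 0x%08x\n", i, (unsigned int)event_masks[i]);
	return 0;
}
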
+ */ +int mpi3mr_process_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + u32 event_ctx) +{ + struct mpi3_event_ack_request evtack_req; + int retval = 0; + + memset(&evtack_req, 0, sizeof(evtack_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending blocking event ack failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + evtack_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + evtack_req.function = MPI3_FUNCTION_EVENT_ACK; + evtack_req.event = event; + evtack_req.event_context = cpu_to_le32(event_ctx); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &evtack_req, + sizeof(evtack_req), 1); + if (retval) { + ioc_err(mrioc, "posting event ack request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "blocking event ack request timed out\n"); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_RESET)) + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_EVTACK_TIMEOUT, 1); + retval = -1; + goto out_unlock; + } + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, + "blocking event ack returned with ioc_status(0x%04x), log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + retval = -1; + goto out_unlock; + } + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + + +/** + * mpi3mr_alloc_chain_bufs - Allocate chain buffers + * @mrioc: Adapter instance reference + * + * Allocate chain buffers and set a bitmap to indicate free + * chain buffers. Chain buffers are used to pass the SGE + * information along with MPI3 SCSI IO requests for host I/O. 
+ * + * Return: 0 on success, non-zero on failure + */ +static int mpi3mr_alloc_chain_bufs(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 sz, i; + u16 num_chains; + + if (mrioc->chain_sgl_list) + return retval; + + num_chains = mrioc->max_host_ios; + + if (enable_dix) + num_chains *= 2; + + mrioc->chain_buf_count = num_chains; + sz = sizeof(struct chain_element) * num_chains; + mrioc->chain_sgl_list = kzalloc(sz, GFP_KERNEL); + if (!mrioc->chain_sgl_list) + goto out_failed; + + sz = MPI3MR_CHAINSGE_SIZE; + mrioc->chain_buf_pool = dma_pool_create("chain_buf pool", + &mrioc->pdev->dev, sz, 16, 0); + if (!mrioc->chain_buf_pool) { + ioc_err(mrioc, "chain buf pool: dma_pool_create failed\n"); + goto out_failed; + } + + for (i = 0; i < num_chains; i++) { + mrioc->chain_sgl_list[i].addr = + dma_pool_zalloc(mrioc->chain_buf_pool, GFP_KERNEL, + &mrioc->chain_sgl_list[i].dma_addr); + + if (!mrioc->chain_sgl_list[i].addr) + goto out_failed; + } + mrioc->chain_bitmap_sz = num_chains / 8; + if (num_chains % 8) + mrioc->chain_bitmap_sz++; + mrioc->chain_bitmap = kzalloc(mrioc->chain_bitmap_sz, GFP_KERNEL); + if (!mrioc->chain_bitmap) + goto out_failed; + return retval; +out_failed: + retval = -1; + return retval; +} + + +/** + * mpi3mr_port_enable_complete - Mark port enable complete + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Call back for asynchronous port enable request sets the + * driver command to indicate port enable request is complete. + * + * Return: Nothing + */ +static void mpi3mr_port_enable_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + drv_cmd->callback = NULL; + mrioc->scan_started = 0; + if (drv_cmd->state & MPI3MR_CMD_RESET) + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + else + mrioc->scan_failed = drv_cmd->ioc_status; + drv_cmd->state = MPI3MR_CMD_NOTUSED; +} + +/** + * mpi3mr_issue_port_enable - Issue Port Enable + * @mrioc: Adapter instance reference + * @async: Flag to wait for completion or not + * + * Issue Port Enable MPI request through admin queue and if the + * async flag is not set wait for the completion of the port + * enable or time out. + * + * Return: 0 on success, non-zero on failures. 
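
The chain bookkeeping above sizes its byte-array bitmap by dividing the chain count by 8 and rounding up, and doubles the chain count when enable_dix is set, presumably so each I/O can also carry a protection-information chain. A user-space sketch of just that sizing arithmetic, with illustrative values rather than driver state:

/* Standalone illustration only, not part of this patch. */
#include <stdio.h>

static unsigned int bitmap_bytes(unsigned int nbits)
{
	return (nbits + 7) / 8;   /* same result as the /8 plus %8 test above */
}

int main(void)
{
	unsigned int host_ios = 1024;     /* illustrative queue depth */
	unsigned int chains = host_ios;
	int dix_enabled = 1;              /* stand-in for the enable_dix knob */

	if (dix_enabled)
		chains *= 2;              /* assumed: one extra chain per I/O when DIX is on */

	printf("%u chains need a %u byte bitmap\n", chains, bitmap_bytes(chains));
	return 0;
}
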
+ */ +int mpi3mr_issue_port_enable(struct mpi3mr_ioc *mrioc, u8 async) +{ + struct mpi3_port_enable_request pe_req; + int retval = 0; + u32 pe_timeout = MPI3MR_PORTENABLE_TIMEOUT; + + memset(&pe_req, 0, sizeof(pe_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending port enable failed due to command is in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + if (async) { + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = mpi3mr_port_enable_complete; + } else { + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + init_completion(&mrioc->init_cmds.done); + } + pe_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + pe_req.function = MPI3_FUNCTION_PORT_ENABLE; + + retval = mpi3mr_admin_request_post(mrioc, &pe_req, sizeof(pe_req), 1); + if (retval) { + ioc_err(mrioc, "posting port enable failed\n"); + goto out_unlock; + } + if (async) { + mutex_unlock(&mrioc->init_cmds.mutex); + goto out; + } + + wait_for_completion_timeout(&mrioc->init_cmds.done, (pe_timeout * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "port enable timed out\n"); + retval = -1; + mpi3mr_check_rh_fault_ioc(mrioc, MPI3MR_RESET_FROM_PE_TIMEOUT); + goto out_unlock; + } + mpi3mr_port_enable_complete(mrioc, &mrioc->init_cmds); + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); +out: + return retval; +} + +/* Protocol type to name mapper structure */ +static const struct { + u8 protocol; + char *name; +} mpi3mr_protocols[] = { + { MPI3_IOCFACTS_PROTOCOL_SCSI_INITIATOR, "Initiator" }, + { MPI3_IOCFACTS_PROTOCOL_SCSI_TARGET, "Target" }, + { MPI3_IOCFACTS_PROTOCOL_NVME, "NVMe attachment" }, +}; + +/* Capability to name mapper structure */ +static const struct { + u32 capability; + char *name; +} mpi3mr_capabilities[] = { + { MPI3_IOCFACTS_CAPABILITY_RAID_CAPABLE, "RAID" }, + { MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED, "MultiPath" }, +}; + +/** + * mpi3mr_print_ioc_info - Display controller information + * @mrioc: Adapter instance reference + * + * Display controller personality, capability, supported + * protocols etc. 
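
mpi3mr_print_ioc_info() below walks the protocol and capability tables and appends a name to a fixed-size buffer for every flag that is set, comma-separating the entries. The following is a standalone sketch of that table-driven formatting; the flag values and buffer size are invented and do not come from the MPI3 headers.

/* Standalone illustration only, not part of this patch; flag values invented. */
#include <stdio.h>

static const struct {
	unsigned int flag;
	const char *name;
} caps[] = {
	{ 0x1, "RAID" },
	{ 0x2, "MultiPath" },
};

int main(void)
{
	unsigned int ioc_caps = 0x3;      /* pretend both capability bits are set */
	char buf[64] = "";
	int used = 0;
	unsigned int i;

	for (i = 0; i < sizeof(caps) / sizeof(caps[0]); i++) {
		if (!(ioc_caps & caps[i].flag))
			continue;
		if (used >= (int)sizeof(buf))
			break;
		/* comma-separate every entry after the first one */
		used += snprintf(buf + used, sizeof(buf) - used, "%s%s",
				 used ? "," : "", caps[i].name);
	}
	printf("Capabilities=(%s)\n", buf);
	return 0;
}
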
+ * + * Return: Nothing + */ +static void +mpi3mr_print_ioc_info(struct mpi3mr_ioc *mrioc) +{ + int i = 0, bytes_wrote = 0; + char personality[16]; + char protocol[50] = {0}; + char capabilities[100] = {0}; + bool is_string_nonempty = false; + struct mpi3mr_compimg_ver *fwver = &mrioc->facts.fw_ver; + + switch (mrioc->facts.personality) { + case MPI3_IOCFACTS_FLAGS_PERSONALITY_EHBA: + strncpy(personality, "Enhanced HBA", sizeof(personality)); + break; + case MPI3_IOCFACTS_FLAGS_PERSONALITY_RAID_DDR: + strncpy(personality, "RAID", sizeof(personality)); + break; + default: + strncpy(personality, "Unknown", sizeof(personality)); + break; + } + + ioc_info(mrioc, "running in %s Personality", personality); + + ioc_info(mrioc, "firmware version(%d.%d.%d.%d.%05d-%05d)\n", + fwver->gen_major, fwver->gen_minor, fwver->ph_major, + fwver->ph_minor, fwver->cust_id, fwver->build_num); + + for (i = 0; i < ARRAY_SIZE(mpi3mr_protocols); i++) { + if (mrioc->facts.protocol_flags & + mpi3mr_protocols[i].protocol) { + if (is_string_nonempty && + (bytes_wrote < sizeof(protocol))) + bytes_wrote += snprintf(protocol + bytes_wrote, + (sizeof(protocol) - bytes_wrote), ","); + + if (bytes_wrote < sizeof(protocol)) + bytes_wrote += snprintf(protocol + bytes_wrote, + (sizeof(protocol) - bytes_wrote), "%s", + mpi3mr_protocols[i].name); + is_string_nonempty = true; + } + } + + bytes_wrote = 0; + is_string_nonempty = false; + for (i = 0; i < ARRAY_SIZE(mpi3mr_capabilities); i++) { + if (mrioc->facts.ioc_capabilities & + mpi3mr_capabilities[i].capability) { + if (is_string_nonempty && + (bytes_wrote < sizeof(capabilities))) + bytes_wrote += snprintf(capabilities + + bytes_wrote, + (sizeof(capabilities) - bytes_wrote), ","); + + if (bytes_wrote < sizeof(capabilities)) + bytes_wrote += snprintf(capabilities + + bytes_wrote, + (sizeof(capabilities) - bytes_wrote), "%s", + mpi3mr_capabilities[i].name); + is_string_nonempty = true; + } + } + + ioc_info(mrioc, "Protocol=(%s), Capabilities=(%s)\n", + protocol, capabilities); +} + +/** + * mpi3mr_cleanup_resources - Free PCI resources + * @mrioc: Adapter instance reference + * + * Unmap PCI device memory and disable PCI device. + * + * Return: 0 on success and non-zero on failure. + */ +void mpi3mr_cleanup_resources(struct mpi3mr_ioc *mrioc) +{ + struct pci_dev *pdev = mrioc->pdev; + + mpi3mr_cleanup_isr(mrioc); + + if (mrioc->sysif_regs) { + iounmap((void __iomem *)mrioc->sysif_regs); + mrioc->sysif_regs = NULL; + } + + if (pci_is_enabled(pdev)) { + if (mrioc->bars) + pci_release_selected_regions(pdev, mrioc->bars); + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } +} + +/** + * mpi3mr_setup_resources - Enable PCI resources + * @mrioc: Adapter instance reference + * + * Enable PCI device memory, MSI-x registers and set DMA mask. + * + * Return: 0 on success and non-zero on failure. + */ +int mpi3mr_setup_resources(struct mpi3mr_ioc *mrioc) +{ + struct pci_dev *pdev = mrioc->pdev; + u32 memap_sz = 0; + int i, retval = 0, capb = 0; + u16 message_control; + u64 dma_mask = mrioc->dma_mask ? mrioc->dma_mask : + (((dma_get_required_mask(&pdev->dev) > DMA_BIT_MASK(32)) && + (sizeof(dma_addr_t) > 4)) ? 
DMA_BIT_MASK(64) : DMA_BIT_MASK(32)); + + if (pci_enable_device_mem(pdev)) { + ioc_err(mrioc, "pci_enable_device_mem: failed\n"); + retval = -ENODEV; + goto out_failed; + } + + capb = pci_find_capability(pdev, PCI_CAP_ID_MSIX); + if (!capb) { + ioc_err(mrioc, "unable to find MSI-X Capabilities\n"); + retval = -ENODEV; + goto out_failed; + } + mrioc->bars = pci_select_bars(pdev, IORESOURCE_MEM); + + if (pci_request_selected_regions(pdev, mrioc->bars, + mrioc->driver_name)) { + ioc_err(mrioc, "pci_request_selected_regions: failed\n"); + retval = -ENODEV; + goto out_failed; + } + + for (i = 0; (i < DEVICE_COUNT_RESOURCE); i++) { + if (pci_resource_flags(pdev, i) & IORESOURCE_MEM) { + mrioc->sysif_regs_phys = pci_resource_start(pdev, i); + memap_sz = pci_resource_len(pdev, i); + mrioc->sysif_regs = + ioremap(mrioc->sysif_regs_phys, memap_sz); + break; + } + } + + pci_enable_pcie_error_reporting(pdev); + + pci_set_master(pdev); + + retval = dma_set_mask_and_coherent(&pdev->dev, dma_mask); + if (retval) { + if (dma_mask != DMA_BIT_MASK(32)) { + ioc_warn(mrioc, "setting 64 bit DMA mask failed\n"); + dma_mask = DMA_BIT_MASK(32); + retval = dma_set_mask_and_coherent(&pdev->dev, + dma_mask); + } + if (retval) { + mrioc->dma_mask = 0; + ioc_err(mrioc, "setting 32 bit DMA mask also failed\n"); + goto out_failed; + } + } + mrioc->dma_mask = dma_mask; + + if (!mrioc->sysif_regs) { + ioc_err(mrioc, + "unable to map adapter memory or resource not found\n"); + retval = -EINVAL; + goto out_failed; + } + + pci_read_config_word(pdev, capb + 2, &message_control); + mrioc->msix_count = (message_control & 0x3FF) + 1; + + pci_save_state(pdev); + + pci_set_drvdata(pdev, mrioc->shost); + + mpi3mr_ioc_disable_intr(mrioc); + + ioc_info(mrioc, "iomem(0x%016llx), mapped(0x%p), size(%d)\n", + (unsigned long long)mrioc->sysif_regs_phys, + mrioc->sysif_regs, memap_sz); + ioc_info(mrioc, "number of MSI-X vectors found in capabilities: (%d)\n", + mrioc->msix_count); + +#if (KERNEL_VERSION(5, 12, 0) <= LINUX_VERSION_CODE) + if (!reset_devices && poll_queues > 0) + mrioc->requested_poll_qcount = min_t(int, poll_queues, + mrioc->msix_count - 2); +#endif + return retval; + +out_failed: + mpi3mr_cleanup_resources(mrioc); + return retval; +} + +/** + * mpi3mr_alloc_issue_host_diag_buf - Allocate and send host diag buffer + * @mrioc: Adapter instance reference + * + * Issue diagnostic buffer post (unconditional) MPI request through admin queue + * and wait for the completion of it or time out. 
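
mpi3mr_setup_resources() above derives the MSI-X vector count from the Message Control word of the MSI-X capability: the table-size field is zero-based, hence the +1. A tiny standalone sketch of that decode, mirroring the 0x3FF mask used above; the register value is invented.

/* Standalone illustration only, not part of this patch; register value invented. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t message_control = 0x803f;   /* hypothetical MSI-X Message Control word */
	unsigned int msix_count = (message_control & 0x3ff) + 1;

	printf("table size field %u -> %u MSI-X vectors\n",
	       (unsigned int)(message_control & 0x3ff), msix_count);
	return 0;
}
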
+ * + * Return: 0 on success non-zero on failure + */ +static int mpi3mr_alloc_issue_host_diag_buf(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_diag_buffer_post_request diag_buf_post_req; + dma_addr_t buf_dma_addr; + u32 buf_sz; + int retval = -1; + + ioc_info(mrioc, "driver diag buffer level = %s.\n", + mpi3mr_drv_db_name(drv_db_level)); + + if (!mrioc->drv_diag_buffer) { + mrioc->drv_diag_buffer_sz = + MPI3MR_DEFAULT_DIAG_HOST_BUFFER_SZ; + mrioc->drv_diag_buffer = + dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->drv_diag_buffer_sz, + &mrioc->drv_diag_buffer_dma, GFP_KERNEL); + if (!mrioc->drv_diag_buffer) { + mrioc->drv_diag_buffer_sz = + MPI3MR_MIN_DIAG_HOST_BUFFER_SZ; + mrioc->drv_diag_buffer = + dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->drv_diag_buffer_sz, + &mrioc->drv_diag_buffer_dma, GFP_KERNEL); + } + if (!mrioc->drv_diag_buffer) { + ioc_warn(mrioc, "%s:%d:failed to allocate buffer\n", + __func__, __LINE__); + mrioc->drv_diag_buffer_sz = 0; + return retval; + } + /* TBD - memset to Zero once feature is stable */ + memset(mrioc->drv_diag_buffer, 0x55, mrioc->drv_diag_buffer_sz); + mpi3mr_upload_drv_diag_buffer(mrioc); + } + + buf_dma_addr = mrioc->drv_diag_buffer_dma; + buf_sz = mrioc->drv_diag_buffer_sz; + + memset(&diag_buf_post_req, 0, sizeof(diag_buf_post_req)); + mutex_lock(&mrioc->init_cmds.mutex); + if (mrioc->init_cmds.state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, "sending driver diag buffer post is failed due to command in use\n"); + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; + } + mrioc->init_cmds.state = MPI3MR_CMD_PENDING; + mrioc->init_cmds.is_waiting = 1; + mrioc->init_cmds.callback = NULL; + diag_buf_post_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INITCMDS); + diag_buf_post_req.function = MPI3_FUNCTION_DIAG_BUFFER_POST; + diag_buf_post_req.type = MPI3_DIAG_BUFFER_TYPE_DRIVER; + diag_buf_post_req.address = le64_to_cpu(buf_dma_addr); + diag_buf_post_req.length = le32_to_cpu(buf_sz); + + init_completion(&mrioc->init_cmds.done); + retval = mpi3mr_admin_request_post(mrioc, &diag_buf_post_req, + sizeof(diag_buf_post_req), 1); + if (retval) { + ioc_err(mrioc, "posting driver diag buffer failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->init_cmds.done, + (MPI3MR_INTADMCMD_TIMEOUT * HZ)); + if (!(mrioc->init_cmds.state & MPI3MR_CMD_COMPLETE)) { + ioc_err(mrioc, "posting driver diag buffer timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_DIAG_BUFFER_POST_TIMEOUT); + retval = -1; + goto out_unlock; + } + retval = 0; + if ((mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK) + != MPI3_IOCSTATUS_SUCCESS) + ioc_warn(mrioc, + "driver diag buffer post returned with ioc_status(0x%04x) log_info(0x%08x)\n", + (mrioc->init_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + mrioc->init_cmds.ioc_loginfo); + else + ioc_info(mrioc, "driver diag buffer of size %dKB posted successfully\n", + mrioc->drv_diag_buffer_sz / 1024); + +out_unlock: + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->init_cmds.mutex); + return retval; +} + +/** + * mpi3mr_revalidate_factsdata - validate IOCFacts parameters + * during reset/resume + * @mrioc: Adapter instance reference + * + * Return zero if the new IOCFacts parameters value is compatible with + * older values else return -EPERM + */ +static int +mpi3mr_revalidate_factsdata(struct mpi3mr_ioc *mrioc) +{ + u16 dev_handle_bitmap_sz; + void *removepend_bitmap; + + if (mrioc->facts.reply_sz > mrioc->reply_sz) { + ioc_err(mrioc, + "cannot increase reply size from %d to %d\n", 
+ mrioc->reply_sz, mrioc->facts.reply_sz); + return -EPERM; + } + if (mrioc->num_io_throttle_group != mrioc->facts.max_io_throttle_group) + { + ioc_err(mrioc, + "max io throttle group doesn't match old(%d), new(%d)\n", + mrioc->num_io_throttle_group, + mrioc->facts.max_io_throttle_group); + return -EPERM; + } + + + if (mrioc->facts.max_op_reply_q < mrioc->num_op_reply_q) { + ioc_err(mrioc, + "cannot reduce number of operational reply queues from %d to %d\n", + mrioc->num_op_reply_q, + mrioc->facts.max_op_reply_q); + return -EPERM; + } + + if (mrioc->facts.max_op_req_q < mrioc->num_op_req_q) { + ioc_err(mrioc, + "cannot reduce number of operational request queues from %d to %d\n", + mrioc->num_op_req_q, mrioc->facts.max_op_req_q); + return -EPERM; + } + + if ((mrioc->sas_transport_enabled) && (mrioc->facts.ioc_capabilities & + MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED)) + ioc_err(mrioc, + "critical error: multipath capability is enabled at the " + "controller while sas transport support is enabled at the " + "driver, please reboot the system or reload the driver\n"); + + dev_handle_bitmap_sz = mrioc->facts.max_devhandle / 8; + if (mrioc->facts.max_devhandle % 8) + dev_handle_bitmap_sz++; + if (dev_handle_bitmap_sz > mrioc->dev_handle_bitmap_sz) { + removepend_bitmap = krealloc(mrioc->removepend_bitmap, + dev_handle_bitmap_sz, GFP_KERNEL); + if (!removepend_bitmap) { + ioc_err(mrioc, + "failed to increase removepend_bitmap sz from: %d to %d\n", + mrioc->dev_handle_bitmap_sz, dev_handle_bitmap_sz); + return -EPERM; + } + memset(removepend_bitmap + mrioc->dev_handle_bitmap_sz, 0, + dev_handle_bitmap_sz - mrioc->dev_handle_bitmap_sz); + mrioc->removepend_bitmap = removepend_bitmap; + ioc_info(mrioc, + "increased dev_handle_bitmap_sz from %d to %d\n", + mrioc->dev_handle_bitmap_sz, dev_handle_bitmap_sz); + mrioc->dev_handle_bitmap_sz = dev_handle_bitmap_sz; + } + + return 0; +} + +/** + * mpi3mr_bring_ioc_ready - Bring controller to ready state + * @mrioc: Adapter instance reference + * + * Set Enable IOC bit in IOC configuration register and wait for + * the controller to become ready. + * + * Return: 0 on success, appropriate error on failure. 
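
When IOCFacts reports a larger max_devhandle after a reset, the code above grows removepend_bitmap with krealloc() and then zeroes only the newly added bytes, since krealloc() does not clear the grown region. Below is a user-space sketch of the same grow-and-zero pattern with realloc() standing in for krealloc(); the sizes are illustrative.

/* Standalone illustration only, not part of this patch; realloc() stands in for krealloc(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	unsigned int old_sz = 16, new_sz = 24;   /* bitmap sizes in bytes, invented */
	unsigned char *bitmap = calloc(old_sz, 1);
	unsigned char *grown;

	if (!bitmap)
		return 1;
	bitmap[0] = 0x5a;                        /* pretend some handles are pending */

	grown = realloc(bitmap, new_sz);
	if (!grown) {                            /* on failure the old bitmap stays valid */
		free(bitmap);
		return 1;
	}
	memset(grown + old_sz, 0, new_sz - old_sz);  /* only the new tail needs zeroing */
	bitmap = grown;

	printf("first byte still 0x%02x after growing %u -> %u bytes\n",
	       bitmap[0], old_sz, new_sz);
	free(bitmap);
	return 0;
}
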
+ */ +static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_config, ioc_status, timeout; + int retval = 0; + enum mpi3mr_iocstate ioc_state; + u64 base_info; + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + base_info = mpi3mr_readq(&mrioc->sysif_regs->ioc_information); + ioc_info(mrioc, "ioc_status(0x%08x), ioc_config(0x%08x), ioc_info(0x%016llx) at the bringup\n", + ioc_status, ioc_config, base_info); + + /*The timeout value is in 2sec unit, changing it to seconds*/ + mrioc->ready_timeout = + ((base_info & MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_MASK) >> + MPI3_SYSIF_IOC_INFO_LOW_TIMEOUT_SHIFT) * 2; + + ioc_info(mrioc, "ready timeout: %d seconds\n", mrioc->ready_timeout); + + ioc_state = mpi3mr_get_iocstate(mrioc); + ioc_info(mrioc, "controller is in %s state during detection\n", + mpi3mr_iocstate_name(ioc_state)); + + if (ioc_state == MRIOC_STATE_BECOMING_READY || + ioc_state == MRIOC_STATE_RESET_REQUESTED) { + timeout = mrioc->ready_timeout * 10; + do { + msleep(100); + } while (--timeout); + if (!pci_device_is_present(mrioc->pdev)) + { + mrioc->unrecoverable = 1; + ioc_err(mrioc, "controller is not present while waiting to reset\n"); + goto out_device_not_present; + } + + ioc_state = mpi3mr_get_iocstate(mrioc); + ioc_info(mrioc, + "controller is in %s state after waiting to reset\n", + mpi3mr_iocstate_name(ioc_state)); + } + + if (ioc_state == MRIOC_STATE_READY) { + ioc_info(mrioc, "issuing message unit reset (MUR) to bring to reset state\n"); + retval = mpi3mr_issue_and_process_mur(mrioc, + MPI3MR_RESET_FROM_BRINGUP); + ioc_state = mpi3mr_get_iocstate(mrioc); + if (retval) + ioc_err(mrioc, + "message unit reset failed with error %d current state %s\n", + retval, mpi3mr_iocstate_name(ioc_state)); + } + if (ioc_state != MRIOC_STATE_RESET) { + mpi3mr_print_fault_info(mrioc); + ioc_info(mrioc, "issuing soft reset to bring to reset state\n"); + retval = mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, + MPI3MR_RESET_FROM_BRINGUP); + if (retval) { + ioc_err(mrioc, + "soft reset failed with error %d\n", retval); + goto out_failed; + } + } + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state != MRIOC_STATE_RESET) { + ioc_err(mrioc, + "cannot bring controller to reset state, current state: %s\n", + mpi3mr_iocstate_name(ioc_state)); + goto out_failed; + } + mpi3mr_clear_reset_history(mrioc); + retval = mpi3mr_setup_admin_qpair(mrioc); + if (retval) { + ioc_err(mrioc, "failed to setup admin queues: error %d\n", + retval); + goto out_failed; + } + + ioc_info(mrioc, "bringing controller to ready state\n"); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config |= MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC; + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + + timeout = mrioc->ready_timeout * 10; + do { + ioc_state = mpi3mr_get_iocstate(mrioc); + if (ioc_state == MRIOC_STATE_READY) + { + ioc_info(mrioc, + "successfully transistioned to %s state\n", + mpi3mr_iocstate_name(ioc_state)); + return 0; + } + if (!pci_device_is_present(mrioc->pdev)) + { + mrioc->unrecoverable = 1; + ioc_err(mrioc, "controller is not present at the bringup\n"); + goto out_device_not_present; + } + msleep(100); + } while (--timeout); + +out_failed: + ioc_state = mpi3mr_get_iocstate(mrioc); + ioc_err(mrioc, + "failed to bring to ready state, current state: %s\n", + mpi3mr_iocstate_name(ioc_state)); +out_device_not_present: + retval = -1; + return retval; +} + +/** + * mpi3mr_enable_events - Enable 
required events + * @mrioc: Adapter instance reference + * + * This routine unmasks the events required by the driver by + * sending the appropriate event mask bitmap through an event + * notification request. + * + * Return: 0 on success and non-zero on failure. + */ +int mpi3mr_enable_events(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u32 i; + + for (i = 0; i < MPI3_EVENT_NOTIFY_EVENTMASK_WORDS; i++) + mrioc->event_masks[i] = -1; + + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DEVICE_ADDED); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DEVICE_INFO_CHANGED); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DEVICE_STATUS_CHANGE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_ENCL_DEVICE_ADDED); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_DISCOVERY); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_SAS_BROADCAST_PRIMITIVE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_PCIE_ENUMERATION); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_PREPARE_FOR_RESET); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_CABLE_MGMT); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_ENERGY_PACK_CHANGE); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_LOG_DATA); + mpi3mr_unmask_events(mrioc, MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE); + + retval = mpi3mr_issue_event_notification(mrioc); + if (retval) + ioc_err(mrioc, "failed to issue event notification %d\n", + retval); + return retval; +} + + +/** + * mpi3mr_init_ioc - Initialize the controller + * @mrioc: Adapter instance reference + * + * This is the controller initialization routine, executed from + * the pci probe callback. It creates admin and operational reply + * queue pairs, allocates the required memory for the reply pool + * and sense buffer pool, issues the IOC init request to the + * firmware and unmasks the events. + * + * Return: 0 on success and non-zero on failure.
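
mpi3mr_init_ioc() below (and mpi3mr_reinit_ioc() later) wrap the whole bring-up in a bounded retry: recoverable failures clear the driver's scratch buffers with mpi3mr_memset_buffers() and jump back to retry_init at most twice, while hard failures skip the retry and mark the controller unrecoverable. A minimal standalone sketch of that control flow only; the failing step is simulated.

/* Standalone illustration only, not part of this patch; the failing step is simulated. */
#include <stdio.h>

static int flaky_step(int attempt)
{
	return attempt < 3 ? -1 : 0;   /* fail twice, succeed on the third try */
}

int main(void)
{
	int retval, retry = 0, attempt = 0;

retry_init:
	attempt++;
	retval = flaky_step(attempt);
	if (retval) {
		if (retry < 2) {
			retry++;
			printf("retrying initialization, retry_count:%d\n", retry);
			/* the driver clears its buffers here before retrying */
			goto retry_init;
		}
		printf("initialization failed\n");
		return 1;
	}
	printf("initialization completed on attempt %d\n", attempt);
	return 0;
}
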
+ */ +int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc) +{ + int retval = 0; + u8 retry = 0; + struct mpi3_ioc_facts_data facts_data; + u32 sz; + +#if defined(IO_COUNTER_SUPPORT) + atomic_set(&mrioc->pend_ios, 0); +#endif + +retry_init: + dprint_init(mrioc, "bringing up the controller to ready state\n"); + retval = mpi3mr_bring_ioc_ready(mrioc); + if (retval) { + ioc_err(mrioc, "failed to bring to ready state\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "setting up single ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 1); + if (retval) { + ioc_err(mrioc, "failed to setup ISR\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "getting ioc_facts\n"); + retval = mpi3mr_issue_iocfacts(mrioc, &facts_data); + if (retval) { + ioc_err(mrioc, "failed to get ioc_facts\n"); + goto out_failed; + } + + mrioc->max_host_ios = mrioc->facts.max_reqs - MPI3MR_INTERNALCMDS_RESVD; + if (!(mrioc->facts.ioc_capabilities & + MPI3_IOCFACTS_CAPABILITY_MULTIPATH_ENABLED)) { + mrioc->sas_transport_enabled = 1; + mrioc->scsi_device_channel = 1; + mrioc->shost->max_channel = 1; + mrioc->shost->transportt = mpi3mr_transport_template; + } + + mrioc->num_io_throttle_group = mrioc->facts.max_io_throttle_group; + atomic_set(&mrioc->pend_large_data_sz, 0); + + if (reset_devices) + mrioc->max_host_ios = min_t(int, mrioc->max_host_ios, + MPI3MR_HOST_IOS_KDUMP); + + mrioc->reply_sz = mrioc->facts.reply_sz; + + dprint_init(mrioc, "check and reset dma mask\n"); + retval = mpi3mr_check_reset_dma_mask(mrioc); + if (retval) { + ioc_err(mrioc, "resetting dma mask failed\n"); + goto out_failed_noretry; + } + + mpi3mr_print_ioc_info(mrioc); + + dprint_init(mrioc, "allocating config page buffers\n"); + mrioc->cfg_page = dma_zalloc_coherent(&mrioc->pdev->dev, + MPI3MR_DEFAULT_CFG_PAGE_SZ, &mrioc->cfg_page_dma, GFP_KERNEL); + if (!mrioc->cfg_page) + goto out_failed_noretry; + + mrioc->cfg_page_sz = MPI3MR_DEFAULT_CFG_PAGE_SZ; + + dprint_init(mrioc, "allocating host diag buffers\n"); + mpi3mr_alloc_diag_bufs(mrioc); + + dprint_init(mrioc, "posting host diag buffers\n"); + retval = mpi3mr_post_diag_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post host diag buffers\n"); + goto out_failed; + } + + dprint_init(mrioc, "allocating reply and sense buffers\n"); + retval = mpi3mr_alloc_reply_sense_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to allocate reply and sense buffers\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "allocating chain buffers\n"); + retval = mpi3mr_alloc_chain_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to allocate chain buffers\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "sending ioc_init\n"); + retval = mpi3mr_issue_iocinit(mrioc); + if (retval) { + ioc_err(mrioc, "failed to send ioc_init\n"); + goto out_failed; + } + + dprint_init(mrioc, "getting package version\n"); + retval = mpi3mr_print_pkg_ver(mrioc); + if (retval) { + ioc_err(mrioc, "failed to get package version\n"); + goto out_failed; + } + + dprint_init(mrioc, "setting up multiple ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 0); + if (retval) { + ioc_err(mrioc, "failed to re-setup ISR\n"); + goto out_failed_noretry; + } + + dprint_init(mrioc, "creating operational queue pairs\n"); + retval = mpi3mr_create_op_queues(mrioc); + if (retval) { + ioc_err(mrioc, "failed to create operational queue pairs\n"); + goto out_failed; + } + + if (!mrioc->pel_seqnum_virt) { + dprint_init(mrioc, "allocating memory for pel_seqnum_virt\n"); + mrioc->pel_seqnum_sz = sizeof(struct mpi3_pel_seq); + 
mrioc->pel_seqnum_virt = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->pel_seqnum_sz, &mrioc->pel_seqnum_dma, + GFP_KERNEL); + if (!mrioc->pel_seqnum_virt) + goto out_failed_noretry; + } + + if (!mrioc->throttle_groups && mrioc->num_io_throttle_group) { + dprint_init(mrioc, "allocating memory for throttle groups\n"); + sz = sizeof(struct mpi3mr_throttle_group_info); + mrioc->throttle_groups = (struct mpi3mr_throttle_group_info *) + kcalloc(mrioc->num_io_throttle_group, + sz, GFP_KERNEL); + if (!mrioc->throttle_groups) + goto out_failed_noretry; + } + + dprint_init(mrioc, "enabling events\n"); + retval = mpi3mr_enable_events(mrioc); + if (retval) { + ioc_err(mrioc, "failed to enable events\n"); + goto out_failed; + } + retval = mpi3mr_refresh_trigger(mrioc, MPI3_CONFIG_ACTION_READ_CURRENT); + if (retval) { + ioc_err(mrioc, "failed to refresh triggers\n"); + goto out_failed; + } + if (mrioc->facts.diag_drvr_sz) { + dprint_reset(mrioc, "posting driver diag buffer\n"); + retval = mpi3mr_alloc_issue_host_diag_buf(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post driver diag buffer\n"); + goto out_failed; + } + } + + ioc_info(mrioc, "controller initialization completed successfully\n"); + return retval; +out_failed: + if (retry < 2) { + retry++; + ioc_warn(mrioc, "retrying controller initialization, retry_count:%d\n", + retry); + mpi3mr_memset_buffers(mrioc); + goto retry_init; + } +out_failed_noretry: + ioc_err(mrioc, "controller initialization failed\n"); + mpi3mr_issue_reset(mrioc, MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, + MPI3MR_RESET_FROM_CTLR_CLEANUP); + mrioc->unrecoverable = 1; + return retval; +} + +/** + * mpi3mr_reinit_ioc - Re-Initialize the controller + * @mrioc: Adapter instance reference + * @is_resume: Called from resume or reset path + * + * This the controller re-initialization routine, executed from + * the soft reset handler or resume callback. creates + * operational reply queue pairs, allocate required memory for + * reply pool, sense buffer pool, issue IOC init request to the + * firmware, unmask the events and issue port enable to discover + * SAS/SATA/NVMe devices and RAID volumes. + * + * Return: 0 on success and non-zero on failure. 
+ */ +int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume) +{ + int retval = 0; + u8 retry = 0; + struct mpi3_ioc_facts_data facts_data; + u32 pe_timeout, ioc_status; + +retry_init: + pe_timeout = + ( MPI3MR_PORTENABLE_TIMEOUT / MPI3MR_PORTENABLE_POLL_INTERVAL); + dprint_reset(mrioc, "bringing up the controller to ready state\n"); + retval = mpi3mr_bring_ioc_ready(mrioc); + if (retval) { + ioc_err(mrioc, "failed to bring to ready state\n"); + goto out_failed_noretry; + } + + if (is_resume) { + dprint_reset(mrioc, "setting up single ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 1); + if (retval) { + ioc_err(mrioc, "failed to setup ISR\n"); + goto out_failed_noretry; + } + } else + mpi3mr_ioc_enable_intr(mrioc); + + dprint_reset(mrioc, "getting ioc_facts\n"); + retval = mpi3mr_issue_iocfacts(mrioc, &facts_data); + if (retval) { + ioc_err(mrioc, "failed to get ioc_facts\n"); + goto out_failed; + } + + dprint_reset(mrioc, "validating ioc_facts\n"); + retval = mpi3mr_revalidate_factsdata(mrioc); + if (retval) { + ioc_err(mrioc, "failed to revalidate ioc_facts data\n"); + goto out_failed_noretry; + } + + mpi3mr_print_ioc_info(mrioc); + + if (is_resume) { + dprint_reset(mrioc, "posting host diag buffers\n"); + retval = mpi3mr_post_diag_bufs(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post host diag buffers\n"); + goto out_failed; + } + } + + dprint_reset(mrioc, "sending ioc_init\n"); + retval = mpi3mr_issue_iocinit(mrioc); + if (retval) { + ioc_err(mrioc, "failed to send ioc_init\n"); + goto out_failed; + } + + dprint_reset(mrioc, "getting package version\n"); + retval = mpi3mr_print_pkg_ver(mrioc); + if (retval) { + ioc_err(mrioc, "failed to get package version\n"); + goto out_failed; + } + + if (is_resume) { + dprint_reset(mrioc, "setting up multiple ISR\n"); + retval = mpi3mr_setup_isr(mrioc, 0); + if (retval) { + ioc_err(mrioc, "failed to re-setup ISR\n"); + goto out_failed_noretry; + } + } + + dprint_reset(mrioc, "creating operational queue pairs\n"); + retval = mpi3mr_create_op_queues(mrioc); + if (retval) { + ioc_err(mrioc, "failed to create operational queue pairs\n"); + goto out_failed; + } + + if (mpi3mr_use_blk_mq(mrioc->shost) && + (mrioc->shost->nr_hw_queues > mrioc->num_op_reply_q)) { + ioc_err(mrioc, + "cannot create minimum number of operatioanl queues expected:%d created:%d\n", + mrioc->shost->nr_hw_queues, mrioc->num_op_reply_q); + goto out_failed_noretry; + } + + if (!mrioc->pel_seqnum_virt) { + dprint_reset(mrioc, "allocating memory for pel_seqnum_virt\n"); + mrioc->pel_seqnum_sz = sizeof(struct mpi3_pel_seq); + mrioc->pel_seqnum_virt = dma_zalloc_coherent(&mrioc->pdev->dev, + mrioc->pel_seqnum_sz, &mrioc->pel_seqnum_dma, + GFP_KERNEL); + if (!mrioc->pel_seqnum_virt) + goto out_failed_noretry; + } + + dprint_reset(mrioc, "enabling events\n"); + retval = mpi3mr_enable_events(mrioc); + if (retval) { + ioc_err(mrioc, "failed to enable events\n"); + goto out_failed; + } + + mrioc->device_refresh_on = 1; + mpi3mr_add_event_wait_for_device_refresh(mrioc); + + ioc_info(mrioc, "sending port enable\n"); + retval = mpi3mr_issue_port_enable(mrioc, 1); + if (retval) { + ioc_err(mrioc, "failed to issue port enable\n"); + goto out_failed; + } + do { + ssleep(MPI3MR_PORTENABLE_POLL_INTERVAL); + if (mrioc->init_cmds.state == MPI3MR_CMD_NOTUSED) + break; + if (!pci_device_is_present(mrioc->pdev)) + mrioc->unrecoverable = 1; + if (mrioc->unrecoverable) + goto out_failed_noretry; + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & 
MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) { + mpi3mr_print_fault_info(mrioc); + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + goto out_failed; + } + } while (--pe_timeout); + + if (!pe_timeout) { + ioc_err(mrioc, "port enable timed out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_PE_TIMEOUT); + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + goto out_failed; + } else if (mrioc->scan_failed){ + ioc_err(mrioc, + "port enable failed with status=0x%04x\n", + mrioc->scan_failed); + } else + ioc_info(mrioc, "port enable completed successfully\n"); + + if (mrioc->facts.diag_drvr_sz) { + dprint_reset(mrioc, "posting driver diag buffer\n"); + retval = mpi3mr_alloc_issue_host_diag_buf(mrioc); + if (retval) { + ioc_err(mrioc, "failed to post driver diag buffer\n"); + goto out_failed; + } + } + + ioc_info(mrioc, "controller %s completed successfully\n", + (is_resume)?"resume":"re-initialization"); + return retval; +out_failed: + if (retry < 2) { + retry++; + ioc_warn(mrioc, "retrying controller %s, retry_count:%d\n", + (is_resume)?"resume":"re-initialization", retry); + mpi3mr_memset_buffers(mrioc); + goto retry_init; + } +out_failed_noretry: + ioc_err(mrioc, "controller %s is failed\n", + (is_resume)?"resume":"re-initialization"); + mpi3mr_issue_reset(mrioc, MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, + MPI3MR_RESET_FROM_CTLR_CLEANUP); + mrioc->unrecoverable = 1; + return retval; +} + +/** + * mpi3mr_memset_op_reply_q_buffers - memset the operational reply queue's + * segments + * @mrioc: Adapter instance reference + * @qidx: Operational reply queue index + * + * Return: Nothing. + */ +static void mpi3mr_memset_op_reply_q_buffers(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_reply_qinfo *op_reply_q = mrioc->op_reply_qinfo + qidx; + struct segments *segments; + int i, size; + + if (!op_reply_q->q_segments) + return; + + size = op_reply_q->segment_qd * mrioc->op_reply_desc_sz; + segments = op_reply_q->q_segments; + for (i = 0; i < op_reply_q->num_segments; i++) + memset(segments[i].segment, 0, size); +} + +/** + * mpi3mr_memset_op_req_q_buffers - memset the operational request queue's + * segments + * @mrioc: Adapter instance reference + * @qidx: Operational request queue index + * + * Return: Nothing. + */ +static void mpi3mr_memset_op_req_q_buffers(struct mpi3mr_ioc *mrioc, u16 qidx) +{ + struct op_req_qinfo *op_req_q = mrioc->req_qinfo + qidx; + struct segments *segments; + int i, size; + + if (!op_req_q->q_segments) + return; + + size = op_req_q->segment_qd * mrioc->facts.op_req_sz; + segments = op_req_q->q_segments; + for (i = 0; i < op_req_q->num_segments; i++) + memset(segments[i].segment, 0, size); +} + +/** + * mpi3mr_memset_buffers - memset memory for a controller + * @mrioc: Adapter instance reference + * + * clear all the memory allocated for a controller, typically + * called post reset to reuse the memory allocated during the + * controller init. + * + * Return: Nothing. 
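
The port-enable wait in mpi3mr_reinit_ioc() above polls at a fixed interval with a countdown derived from the total timeout divided by the poll interval, and tells timeout apart from completion by whether the counter reached zero. Below is a standalone sketch of that bounded-polling idiom; the times are shortened and the completion condition is simulated.

/* Standalone illustration only, not part of this patch; times and completion are simulated. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const unsigned int total_sec = 10, interval_sec = 1;  /* invented timeouts */
	unsigned int timeout = total_sec / interval_sec;
	int done = 0, polls = 0;

	do {
		sleep(interval_sec);
		polls++;
		if (polls == 3)         /* stand-in for "init command completed" */
			done = 1;
		if (done)
			break;
	} while (--timeout);

	if (!timeout && !done)
		printf("timed out\n");       /* counter exhausted without completion */
	else
		printf("completed after %d polls\n", polls);
	return 0;
}
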
+ */ +void mpi3mr_memset_buffers(struct mpi3mr_ioc *mrioc) +{ + u16 i; + struct mpi3mr_throttle_group_info *tg; + + mrioc->change_count = 0; + mrioc->active_poll_qcount = 0; + mrioc->default_qcount = 0; + if (mrioc->admin_req_base) + memset(mrioc->admin_req_base, 0, mrioc->admin_req_q_sz); + if (mrioc->admin_reply_base) + memset(mrioc->admin_reply_base, 0, mrioc->admin_reply_q_sz); + atomic_set(&mrioc->admin_reply_q_in_use, 0); + + if (mrioc->init_cmds.reply) { + memset(mrioc->init_cmds.reply, 0, + sizeof(*mrioc->init_cmds.reply)); + memset(mrioc->bsg_cmds.reply, 0, + sizeof(*mrioc->bsg_cmds.reply)); + memset(mrioc->host_tm_cmds.reply, 0, + sizeof(*mrioc->host_tm_cmds.reply)); + memset(mrioc->pel_cmds.reply, 0, + sizeof(*mrioc->pel_cmds.reply)); + memset(mrioc->pel_abort_cmd.reply, 0, + sizeof(*mrioc->pel_abort_cmd.reply)); + memset(mrioc->transport_cmds.reply, 0, + sizeof(*mrioc->transport_cmds.reply)); + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) + memset(mrioc->dev_rmhs_cmds[i].reply, 0, + sizeof(*mrioc->dev_rmhs_cmds[i].reply)); + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) + memset(mrioc->sysfs_tm_cmds[i].reply, 0, + sizeof(*mrioc->sysfs_tm_cmds[i].reply)); + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) + memset(mrioc->evtack_cmds[i].reply, 0, + sizeof(*mrioc->evtack_cmds[i].reply)); + memset(mrioc->removepend_bitmap, 0, + mrioc->dev_handle_bitmap_sz); + memset(mrioc->devrem_bitmap, 0, mrioc->devrem_bitmap_sz); + memset(mrioc->evtack_cmds_bitmap, 0, + mrioc->evtack_cmds_bitmap_sz); + } + + for (i = 0; i < mrioc->num_queues; i++) { + mrioc->op_reply_qinfo[i].qid = 0; + mrioc->op_reply_qinfo[i].ci = 0; + mrioc->op_reply_qinfo[i].num_replies = 0; + mrioc->op_reply_qinfo[i].ephase = 0; + atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0); + atomic_set(&mrioc->op_reply_qinfo[i].in_use, 0); + mpi3mr_memset_op_reply_q_buffers(mrioc, i); + + mrioc->req_qinfo[i].ci = 0; + mrioc->req_qinfo[i].pi = 0; + mrioc->req_qinfo[i].num_requests = 0; + mrioc->req_qinfo[i].qid = 0; + mrioc->req_qinfo[i].reply_qid = 0; + spin_lock_init(&mrioc->req_qinfo[i].q_lock); + mrioc->req_qinfo[i].last_full_host_tag = 0; + mpi3mr_memset_op_req_q_buffers(mrioc, i); + } + + atomic_set(&mrioc->pend_large_data_sz, 0); + if (mrioc->throttle_groups) { + tg = mrioc->throttle_groups; + for (i = 0; i < mrioc->num_io_throttle_group; i++, tg++) { + tg->id = 0; + tg->fw_qd = 0; + tg->modified_qd = 0; + tg->io_divert= 0; + tg->need_qd_reduction= 0; + tg->high = 0; + tg->low = 0; + tg->qd_reduction= 0; + atomic_set(&tg->pend_large_data_sz, 0); + } + } +} + +/** + * mpi3mr_free_mem - Free memory allocated for a controller + * @mrioc: Adapter instance reference + * + * Free all the memory allocated for a controller. + * + * Return: Nothing. 
+ */ +void mpi3mr_free_mem(struct mpi3mr_ioc *mrioc) +{ + u16 i; + struct mpi3mr_intr_info *intr_info; + struct diag_buffer_desc *diag_buffer; + + dprint_exit(mrioc, "freeing up memory allocated for the controller\n"); + + mpi3mr_free_enclosure_list(mrioc); + + if (mrioc->sense_buf_pool) { + if (mrioc->sense_buf) + dma_pool_free(mrioc->sense_buf_pool, mrioc->sense_buf, + mrioc->sense_buf_dma); + dma_pool_destroy(mrioc->sense_buf_pool); + mrioc->sense_buf = NULL; + mrioc->sense_buf_pool = NULL; + } + if (mrioc->sense_buf_q_pool) { + if (mrioc->sense_buf_q) + dma_pool_free(mrioc->sense_buf_q_pool, + mrioc->sense_buf_q, mrioc->sense_buf_q_dma); + dma_pool_destroy(mrioc->sense_buf_q_pool); + mrioc->sense_buf_q = NULL; + mrioc->sense_buf_q_pool = NULL; + } + + if (mrioc->reply_buf_pool) { + if (mrioc->reply_buf) + dma_pool_free(mrioc->reply_buf_pool, mrioc->reply_buf, + mrioc->reply_buf_dma); + dma_pool_destroy(mrioc->reply_buf_pool); + mrioc->reply_buf = NULL; + mrioc->reply_buf_pool = NULL; + } + if (mrioc->reply_free_q_pool) { + if (mrioc->reply_free_q) + dma_pool_free(mrioc->reply_free_q_pool, + mrioc->reply_free_q, mrioc->reply_free_q_dma); + dma_pool_destroy(mrioc->reply_free_q_pool); + mrioc->reply_free_q = NULL; + mrioc->reply_free_q_pool = NULL; + } + + for (i = 0; i < mrioc->num_op_req_q; i++) + mpi3mr_free_op_req_q_segments(mrioc, i); + + for (i = 0; i < mrioc->num_op_reply_q; i++) + mpi3mr_free_op_reply_q_segments(mrioc, i); + + for (i = 0; i < mrioc->intr_info_count; i++) { + intr_info = mrioc->intr_info + i; + intr_info->op_reply_q = NULL; + } + + kfree(mrioc->req_qinfo); + mrioc->req_qinfo = NULL; + mrioc->num_op_req_q = 0; + + kfree(mrioc->op_reply_qinfo); + mrioc->op_reply_qinfo = NULL; + mrioc->num_op_reply_q = 0; + + kfree(mrioc->init_cmds.reply); + mrioc->init_cmds.reply = NULL; + + kfree(mrioc->bsg_cmds.reply); + mrioc->bsg_cmds.reply = NULL; + + kfree(mrioc->host_tm_cmds.reply); + mrioc->host_tm_cmds.reply = NULL; + + kfree(mrioc->pel_cmds.reply); + mrioc->pel_cmds.reply = NULL; + + kfree(mrioc->pel_abort_cmd.reply); + mrioc->pel_abort_cmd.reply = NULL; + + kfree(mrioc->transport_cmds.reply); + mrioc->transport_cmds.reply = NULL; + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) { + kfree(mrioc->dev_rmhs_cmds[i].reply); + mrioc->dev_rmhs_cmds[i].reply = NULL; + } + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) { + kfree(mrioc->sysfs_tm_cmds[i].reply); + mrioc->sysfs_tm_cmds[i].reply = NULL; + } + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) { + kfree(mrioc->evtack_cmds[i].reply); + mrioc->evtack_cmds[i].reply = NULL; + } + + kfree(mrioc->removepend_bitmap); + mrioc->removepend_bitmap = NULL; + + kfree(mrioc->devrem_bitmap); + mrioc->devrem_bitmap = NULL; + + kfree(mrioc->evtack_cmds_bitmap); + mrioc->evtack_cmds_bitmap = NULL; + + kfree(mrioc->chain_bitmap); + mrioc->chain_bitmap = NULL; + + if (mrioc->chain_buf_pool) { + for (i = 0; i < mrioc->chain_buf_count; i++) { + if (mrioc->chain_sgl_list[i].addr) { + dma_pool_free(mrioc->chain_buf_pool, + mrioc->chain_sgl_list[i].addr, + mrioc->chain_sgl_list[i].dma_addr); + mrioc->chain_sgl_list[i].addr = NULL; + } + } + dma_pool_destroy(mrioc->chain_buf_pool); + mrioc->chain_buf_pool = NULL; + } + + kfree(mrioc->chain_sgl_list); + mrioc->chain_sgl_list = NULL; + + if (mrioc->admin_reply_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_reply_q_sz, + mrioc->admin_reply_base, mrioc->admin_reply_dma); + mrioc->admin_reply_base = NULL; + } + if (mrioc->admin_req_base) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->admin_req_q_sz, + 
mrioc->admin_req_base, mrioc->admin_req_dma); + mrioc->admin_req_base = NULL; + } + + if (mrioc->prp_list_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->prp_sz, + mrioc->prp_list_virt, mrioc->prp_list_dma); + mrioc->prp_list_virt = NULL; + } + + if (mrioc->pel_seqnum_virt) { + dma_free_coherent(&mrioc->pdev->dev, mrioc->pel_seqnum_sz, + mrioc->pel_seqnum_virt, mrioc->pel_seqnum_dma); + mrioc->pel_seqnum_virt = NULL; + } + + for (i = 0; i < MPI3MR_MAX_NUM_HDB; i++) { + diag_buffer = &mrioc->diag_buffers[i]; + if (diag_buffer->addr) { + dma_free_coherent(&mrioc->pdev->dev, + diag_buffer->size, diag_buffer->addr, + diag_buffer->dma_addr); + diag_buffer->addr = NULL; + diag_buffer->size = 0; + diag_buffer->type = 0; + diag_buffer->status = 0; + } + } + + if (mrioc->drv_diag_buffer) { + dma_free_coherent(&mrioc->pdev->dev, + mrioc->drv_diag_buffer_sz, mrioc->drv_diag_buffer, + mrioc->drv_diag_buffer_dma); + mrioc->drv_diag_buffer = NULL; + mrioc->drv_diag_buffer_sz = 0; + } + + if (mrioc->cfg_page) { + dma_free_coherent(&mrioc->pdev->dev, + mrioc->cfg_page_sz, mrioc->cfg_page, + mrioc->cfg_page_dma); + mrioc->cfg_page = NULL; + } + + vfree(mrioc->uefi_logs); + kfree(mrioc->logdata_buf); + mrioc->logdata_buf = NULL; + kfree(mrioc->driver_pg2); + mrioc->driver_pg2 = NULL; + dprint_exit(mrioc, "freed up memory allocated for the controller\n"); +} + +/** + * mpi3mr_issue_ioc_shutdown - shutdown controller + * @mrioc: Adapter instance reference + * + * Send shutodwn notification to the controller and wait for the + * shutdown_timeout for it to be completed. + * + * Return: Nothing. + */ +static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc) +{ + u32 ioc_config, ioc_status; + u8 retval = 1; + u32 timeout = MPI3MR_DEFAULT_SHUTDOWN_TIME * 10; + + ioc_info(mrioc, "sending shutdown notification\n"); + if (mrioc->unrecoverable) { + ioc_warn(mrioc, + "controller is unrecoverable, shutdown not issued\n"); + return; + } + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK) + == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS) { + ioc_warn(mrioc, "shutdown already in progress\n"); + return; + } + + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + ioc_config |= MPI3_SYSIF_IOC_CONFIG_SHUTDOWN_NORMAL; + ioc_config |= MPI3_SYSIF_IOC_CONFIG_DEVICE_SHUTDOWN_SEND_REQ; + + writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + + if (mrioc->facts.shutdown_timeout) + timeout = mrioc->facts.shutdown_timeout * 10; + + do { + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK) + == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_COMPLETE) { + retval = 0; + break; + } + msleep(100); + } while (--timeout); + + + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); + + if (retval) { + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_SHUTDOWN_MASK) + == MPI3_SYSIF_IOC_STATUS_SHUTDOWN_IN_PROGRESS) + ioc_warn(mrioc, + "shutdown still in progress after timeout\n"); + } + + ioc_info(mrioc, + "ioc_status/ioc_config after %s shutdown is (0x%x)/(0x%x)\n", + (!retval)?"successful":"failed", ioc_status, + ioc_config); +} + +/** + * mpi3mr_cleanup_ioc - Cleanup controller + * @mrioc: Adapter instance reference + + * controller cleanup handler, Message unit reset or soft reset + * and shutdown notification is issued to the controller. + * + * Return: Nothing. 
+ */ +void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc) +{ + enum mpi3mr_iocstate ioc_state; + + dprint_exit(mrioc, "cleaning up the controller\n"); + + mpi3mr_ioc_disable_intr(mrioc); + + ioc_state = mpi3mr_get_iocstate(mrioc); + + if ((!mrioc->unrecoverable) && (!mrioc->reset_in_progress) && + (ioc_state == MRIOC_STATE_READY)) { + if (mrioc->is_segqueue_enabled && !mrioc->pdev->revision) + mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, + MPI3MR_RESET_FROM_CTLR_CLEANUP); + else if (mpi3mr_issue_and_process_mur(mrioc, + MPI3MR_RESET_FROM_CTLR_CLEANUP)) + mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, + MPI3MR_RESET_FROM_MUR_FAILURE); + mpi3mr_issue_ioc_shutdown(mrioc); + } + dprint_exit(mrioc, "controller cleanup completed\n"); +} + +/** + * mpi3mr_drv_cmd_comp_reset - Flush a internal driver command + * @mrioc: Adapter instance reference + * @cmdptr: Internal command tracker + * + * Complete an internal driver commands with state indicating it + * is completed due to reset. + * + * Return: Nothing. + */ +static inline void mpi3mr_drv_cmd_comp_reset(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *cmdptr) +{ + if (cmdptr->state & MPI3MR_CMD_PENDING) { + cmdptr->state |= MPI3MR_CMD_RESET; + cmdptr->state &= ~MPI3MR_CMD_PENDING; + if (cmdptr->is_waiting) { + complete(&cmdptr->done); + cmdptr->is_waiting = 0; + } else if (cmdptr->callback) + cmdptr->callback(mrioc, cmdptr); + } +} + +/** + * mpi3mr_flush_drv_cmds - Flush internal driver commands + * @mrioc: Adapter instance reference + * + * Flush all internal driver commands post reset + * + * Return: Nothing. + */ +void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_drv_cmd *cmdptr; + u8 i; + + dprint_reset(mrioc, "flushing internal commands\n"); + cmdptr = &mrioc->init_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->cfg_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->bsg_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->host_tm_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) { + cmdptr = &mrioc->dev_rmhs_cmds[i]; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + } + + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) { + cmdptr = &mrioc->sysfs_tm_cmds[i]; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + } + + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) { + cmdptr = &mrioc->evtack_cmds[i]; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + } + + cmdptr = &mrioc->pel_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->pel_abort_cmd; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + cmdptr = &mrioc->transport_cmds; + mpi3mr_drv_cmd_comp_reset(mrioc, cmdptr); + + init_waitqueue_head(&mrioc->sysfs_pending_tm_wq); + atomic_set(&mrioc->sysfs_tm_pending, 0); + mrioc->sysfs_tm_issued = 0; + mrioc->sysfs_tm_terminated_io_count = 0; +} + +/** + * mpi3mr_free_enclosure_list - release enclosures + * @mrioc: Adapter instance reference + * + * Free memory allocated during encloure add. + * + * Return nothing. 
+ */ +void mpi3mr_free_enclosure_list(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_enclosure_node *enclosure_dev, *enclosure_dev_next; + + list_for_each_entry_safe(enclosure_dev, + enclosure_dev_next, &mrioc->enclosure_list, list) { + list_del(&enclosure_dev->list); + kfree(enclosure_dev); + } +} + +/** + * mpi3mr_pel_wait_post - Issue PEL Wait + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issue PEL Wait MPI request through admin queue and return. + * + * Return: Nothing. + */ +static void mpi3mr_pel_wait_post(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_req_action_wait pel_wait; + + mrioc->pel_abort_requested = false; + + memset(&pel_wait, 0, sizeof(pel_wait)); + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_pel_wait_complete; + drv_cmd->ioc_status = 0; + drv_cmd->ioc_loginfo = 0; + pel_wait.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_WAIT); + pel_wait.function = MPI3_FUNCTION_PERSISTENT_EVENT_LOG; + pel_wait.action = MPI3_PEL_ACTION_WAIT; + pel_wait.starting_sequence_number = cpu_to_le32(mrioc->pel_newest_seqnum); + pel_wait.locale = cpu_to_le16(mrioc->pel_locale); + pel_wait.class = cpu_to_le16(mrioc->pel_class); + pel_wait.wait_time = MPI3_PEL_WAITTIME_INFINITE_WAIT; + dprint_bsg_info(mrioc, "sending pel_wait seqnum(%d), class(%d), locale(0x%08x)\n", + mrioc->pel_newest_seqnum, mrioc->pel_class, mrioc->pel_locale); + + if (mpi3mr_admin_request_post(mrioc, &pel_wait, sizeof(pel_wait), 0)) { + dprint_bsg_err(mrioc, + "Issuing PELWait: Admin post failed\n"); + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; + mrioc->pel_enabled = false; + } + return; +} + +/** + * mpi3mr_pel_get_seqnum_post - Issue PEL Get Sequence number + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issue PEL get sequence number MPI request through admin queue + * and return. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_pel_get_seqnum_post(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_req_action_get_sequence_numbers pel_getseq_req; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + int retval = 0; + + memset(&pel_getseq_req, 0, sizeof(pel_getseq_req)); + mrioc->pel_cmds.state = MPI3MR_CMD_PENDING; + mrioc->pel_cmds.is_waiting = 0; + mrioc->pel_cmds.ioc_status = 0; + mrioc->pel_cmds.ioc_loginfo = 0; + mrioc->pel_cmds.callback = mpi3mr_pel_get_seqnum_complete; + pel_getseq_req.host_tag = cpu_to_le16(MPI3MR_HOSTTAG_PEL_WAIT); + pel_getseq_req.function = MPI3_FUNCTION_PERSISTENT_EVENT_LOG; + pel_getseq_req.action = MPI3_PEL_ACTION_GET_SEQNUM; + mpi3mr_add_sg_single(&pel_getseq_req.sgl, sgl_flags, + mrioc->pel_seqnum_sz, mrioc->pel_seqnum_dma); + + retval = mpi3mr_admin_request_post(mrioc, &pel_getseq_req, + sizeof(pel_getseq_req), 0); + if (retval) { + if (drv_cmd) { + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; + } + mrioc->pel_enabled = false; + } + + return retval; +} + +/** + * mpi3mr_pel_wait_complete - PELWait Completion callback + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is a callback handler for the PELWait request and + * firmware completes a PELWait request when it is aborted or a + * new PEL entry is available. 
This sends AEN to the application + * and if the PELwait completion is not due to PELAbort then + * this will send a request for new PEL Sequence number + * + * Return: Nothing. + */ +static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_reply *pel_reply = NULL; + u16 ioc_status, pe_log_status; + bool do_retry = false; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto cleanup_drv_cmd; + + ioc_status = drv_cmd->ioc_status & MPI3_IOCSTATUS_STATUS_MASK; + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "%s: Failed ioc_status(0x%04x) Loginfo(0x%08x)\n", + __func__, ioc_status, drv_cmd->ioc_loginfo); + dprint_bsg_err(mrioc, + "pel_wait: failed with ioc_status(0x%04x), log_info(0x%08x)\n", + ioc_status, drv_cmd->ioc_loginfo); + do_retry = true; + } + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) + pel_reply = (struct mpi3_pel_reply *)drv_cmd->reply; + + if (!pel_reply) { + dprint_bsg_err(mrioc, + "pel_wait: failed due to no reply\n"); + goto out_failed; + } + + pe_log_status = le16_to_cpu(pel_reply->pe_log_status); + if ((pe_log_status != MPI3_PEL_STATUS_SUCCESS) && + (pe_log_status != MPI3_PEL_STATUS_ABORTED)) { + ioc_err(mrioc, "%s: Failed pe_log_status(0x%04x)\n", + __func__, pe_log_status); + dprint_bsg_err(mrioc, + "pel_wait: failed due to pel_log_status(0x%04x)\n", + pe_log_status); + do_retry = true; + } + + if (do_retry) { + if (drv_cmd->retry_count < MPI3MR_PEL_RETRY_COUNT) { + drv_cmd->retry_count++; + dprint_bsg_err(mrioc, "pel_wait: retrying(%d)\n", + drv_cmd->retry_count); + mpi3mr_pel_wait_post(mrioc, drv_cmd); + return; + } + dprint_bsg_err(mrioc, + "pel_wait: failed after all retries(%d)\n", + drv_cmd->retry_count); + goto out_failed; + } + atomic64_inc(&event_counter); + if (!mrioc->pel_abort_requested) { + mrioc->pel_cmds.retry_count = 0; + mpi3mr_pel_get_seqnum_post(mrioc, &mrioc->pel_cmds); + } + + return; +out_failed: + mrioc->pel_enabled = false; +cleanup_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; +} + +/** + * mpi3mr_pel_get_seqnum_complete - PELGetSeqNum Completion callback + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is a callback handler for the PEL get sequence number + * request and a new PEL wait request will be issued to the + * firmware from this + * + * Return: Nothing. 
+ */ +void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_pel_reply *pel_reply = NULL; + struct mpi3_pel_seq *pel_seqnum_virt; + u16 ioc_status; + bool do_retry = false; + + pel_seqnum_virt = (struct mpi3_pel_seq *)mrioc->pel_seqnum_virt; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto cleanup_drv_cmd; + + ioc_status = drv_cmd->ioc_status & MPI3_IOCSTATUS_STATUS_MASK; + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed with ioc_status(0x%04x), log_info(0x%08x)\n", + ioc_status, drv_cmd->ioc_loginfo); + do_retry = true; + } + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) + pel_reply = (struct mpi3_pel_reply *)drv_cmd->reply; + if (!pel_reply) { + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed due to no reply\n"); + goto out_failed; + } + + if (le16_to_cpu(pel_reply->pe_log_status) != MPI3_PEL_STATUS_SUCCESS) { + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed due to pel_log_status(0x%04x)\n", + le16_to_cpu(pel_reply->pe_log_status)); + do_retry = true; + } + + if (do_retry) { + if (drv_cmd->retry_count < MPI3MR_PEL_RETRY_COUNT) { + drv_cmd->retry_count++; + dprint_bsg_err(mrioc, + "pel_get_seqnum: retrying(%d)\n", + drv_cmd->retry_count); + mpi3mr_pel_get_seqnum_post(mrioc, drv_cmd); + return; + } + + dprint_bsg_err(mrioc, + "pel_get_seqnum: failed after all retries(%d)\n", + drv_cmd->retry_count); + goto out_failed; + } + mrioc->pel_newest_seqnum = le32_to_cpu(pel_seqnum_virt->newest) + 1; + drv_cmd->retry_count = 0; + mpi3mr_pel_wait_post(mrioc, drv_cmd); + + return; +out_failed: + mrioc->pel_enabled = false; +cleanup_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; +} + +/** + * mpi3mr_soft_reset_handler - Reset the controller + * @mrioc: Adapter instance reference + * @reset_reason: Reset reason code + * @snapdump: Flag to generate snapdump in firmware or not + * + * This is an handler for recovering controller by issuing soft + * reset or diag fault reset. This is a blocking function and + * when one reset is executed if any other resets they will be + * blocked. All BSGs/IO will be blocked during the reset. If + * controller reset is successful then the controller will be + * reinitalized, otherwise the controller will be marked as not + * recoverable + * + * If snapdump bit is set, the controller is issued with diag + * fault reset so that the firmware can create a snap dump and + * post that the firmware will result in F000 fault and the + * driver will issue soft reset to recover from that. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc, + u32 reset_reason, u8 snapdump) +{ + int retval = 0, i; + unsigned long flags; + u32 host_diagnostic, timeout = MPI3_SYSIF_DIAG_SAVE_TIMEOUT * 10; + u32 fault; + + /* Block the reset handler until diag save in progress*/ + dprint_reset(mrioc, + "soft_reset_handler: check and block on diagsave_timeout(%d)\n", + mrioc->diagsave_timeout); + while (mrioc->diagsave_timeout) + ssleep(1); + /* + * Block new resets until the currently executing one is finished and + * return the status of the existing reset for all blocked resets + */ + dprint_reset(mrioc, "soft_reset_handler: acquiring reset_mutex\n"); + if (!mutex_trylock(&mrioc->reset_mutex)) { + ioc_info(mrioc, + "controller reset triggered by %s is blocked due to another reset in progress\n", + mpi3mr_reset_rc_name(reset_reason)); + do { + ssleep(1); + } while (mrioc->reset_in_progress == 1); + ioc_info(mrioc, + "returning previous reset result(%d) for the reset triggered by %s\n", + mrioc->prev_reset_result, + mpi3mr_reset_rc_name(reset_reason)); + return mrioc->prev_reset_result; + } + ioc_info(mrioc, "controller reset is triggered by %s\n", + mpi3mr_reset_rc_name(reset_reason)); + + mrioc->device_refresh_on = 0; + mrioc->reset_in_progress = 1; + mrioc->block_bsgs = 1; + mrioc->prev_reset_result = -1; + + if ((!snapdump) && (reset_reason != MPI3MR_RESET_FROM_FAULT_WATCH) && + (reset_reason != MPI3MR_RESET_FROM_FIRMWARE) && + (reset_reason != MPI3MR_RESET_FROM_CIACTIV_FAULT)) { + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_SOFT_RESET, 0, 0); + dprint_reset(mrioc, + "soft_reset_handler: releasing host diagnostic buffers\n"); + mpi3mr_release_diag_bufs(mrioc, 0); + for (i = 0; i < MPI3_EVENT_NOTIFY_EVENTMASK_WORDS; i++) + mrioc->event_masks[i] = -1; + dprint_reset(mrioc, "soft_reset_handler: masking events\n"); + mpi3mr_issue_event_notification(mrioc); + } + + mpi3mr_wait_for_host_io(mrioc, MPI3MR_RESET_HOST_IOWAIT_TIMEOUT); + + mpi3mr_ioc_disable_intr(mrioc); + + if (snapdump) { + dprint_reset(mrioc, + "soft_reset_handler: saving snapdump\n"); + mpi3mr_do_dump(mrioc); + mpi3mr_set_diagsave(mrioc); + retval = mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, reset_reason); + if (!retval) { + fault = readl(&mrioc->sysif_regs->fault); + do { + host_diagnostic = + readl(&mrioc->sysif_regs->host_diagnostic); + if (!(host_diagnostic & + MPI3_SYSIF_HOST_DIAG_SAVE_IN_PROGRESS)) + break; + msleep(100); + } while (--timeout); + mpi3mr_set_trigger_data_in_all_hdb(mrioc, + MPI3MR_HDB_TRIGGER_TYPE_FAULT, fault, 0); + } + } + retval = mpi3mr_issue_reset(mrioc, + MPI3_SYSIF_HOST_DIAG_RESET_ACTION_SOFT_RESET, reset_reason); + if (retval) { + ioc_err(mrioc, + "failed to issue soft reset to the controller\n"); + goto out; + } + + mpi3mr_flush_delayed_cmd_lists(mrioc); + mpi3mr_flush_drv_cmds(mrioc); + memset(mrioc->devrem_bitmap, 0, mrioc->devrem_bitmap_sz); + memset(mrioc->removepend_bitmap, 0, mrioc->dev_handle_bitmap_sz); + memset(mrioc->evtack_cmds_bitmap, 0, mrioc->evtack_cmds_bitmap_sz); + mpi3mr_flush_host_io(mrioc); + mpi3mr_cleanup_fwevt_list(mrioc); + mpi3mr_invalidate_devhandles(mrioc); + mpi3mr_free_enclosure_list(mrioc); + + if (mrioc->prepare_for_reset) { + mrioc->prepare_for_reset = 0; + mrioc->prepare_for_reset_timeout_counter = 0; + } + +#if defined(IO_COUNTER_SUPPORT) + atomic_set(&mrioc->pend_ios, 0); +#endif + mpi3mr_memset_buffers(mrioc); + mpi3mr_release_diag_bufs(mrioc, 1); + mrioc->fw_release_trigger_active = false; + 
mrioc->trace_release_trigger_active = false;
+ mrioc->snapdump_trigger_active = false;
+ mpi3mr_set_trigger_data_in_all_hdb(mrioc,
+ MPI3MR_HDB_TRIGGER_TYPE_SOFT_RESET, 0, 0);
+
+ dprint_reset(mrioc,
+ "soft_reset_handler: reinitializing the controller\n");
+ retval = mpi3mr_reinit_ioc(mrioc, 0);
+ if (retval) {
+ ioc_err(mrioc, "reinitialization after soft reset failed\n");
+ goto out;
+ }
+ dprint_reset(mrioc,
+ "soft_reset_handler: waiting for device events to settle\n");
+ ssleep(10);
+
+out:
+ if (!retval) {
+ mrioc->diagsave_timeout = 0;
+ mrioc->reset_in_progress = 0;
+ mrioc->pel_abort_requested = 0;
+ if (mrioc->pel_enabled) {
+ mrioc->pel_cmds.retry_count = 0;
+ mpi3mr_pel_wait_post(mrioc, &mrioc->pel_cmds);
+ }
+
+ mrioc->device_refresh_on = 0;
+
+ mrioc->ts_update_counter = 0;
+ spin_lock_irqsave(&mrioc->watchdog_lock, flags);
+ if (mrioc->watchdog_work_q)
+ queue_delayed_work(mrioc->watchdog_work_q,
+ &mrioc->watchdog_work,
+ msecs_to_jiffies(MPI3MR_WATCHDOG_INTERVAL));
+ spin_unlock_irqrestore(&mrioc->watchdog_lock, flags);
+ mrioc->block_bsgs = 0;
+ if (mrioc->pel_enabled)
+ atomic64_inc(&event_counter);
+ } else {
+ dprint_reset(mrioc,
+ "soft_reset_handler failed, marking controller as unrecoverable\n");
+ mpi3mr_issue_reset(mrioc,
+ MPI3_SYSIF_HOST_DIAG_RESET_ACTION_DIAG_FAULT, reset_reason);
+ mrioc->device_refresh_on = 0;
+ mrioc->unrecoverable = 1;
+ mrioc->reset_in_progress = 0;
+ retval = -1;
+ mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
+ }
+ mrioc->prev_reset_result = retval;
+ mutex_unlock(&mrioc->reset_mutex);
+ ioc_info(mrioc, "controller reset is %s\n",
+ ((retval == 0) ? "successful" : "failed"));
+ return retval;
+}
+
+
+/**
+ * mpi3mr_free_config_dma_memory - free memory for config page
+ * @mrioc: Adapter instance reference
+ * @mem_desc: memory descriptor structure
+ *
+ * Check whether the size of the buffer specified by the memory
+ * descriptor is greater than the default page size; if so, then
+ * free the memory pointed to by the descriptor.
+ *
+ * Return: Nothing.
+ */
+static void mpi3mr_free_config_dma_memory(struct mpi3mr_ioc *mrioc,
+ struct dma_memory_desc *mem_desc)
+{
+ if ((mem_desc->size > mrioc->cfg_page_sz) && mem_desc->addr) {
+ dma_free_coherent(&mrioc->pdev->dev, mem_desc->size,
+ mem_desc->addr, mem_desc->dma_addr);
+ mem_desc->addr = NULL;
+ }
+}
+
+
+
+/**
+ * mpi3mr_alloc_config_dma_memory - Alloc memory for config page
+ * @mrioc: Adapter instance reference
+ * @mem_desc: Memory descriptor to hold dma memory info
+ *
+ * This function allocates new dmaable memory or provides the
+ * default config page dmaable memory based on the memory size
+ * described by the descriptor.
+ *
+ * Return: 0 on success, non-zero on failure.
+ */
+static int mpi3mr_alloc_config_dma_memory(struct mpi3mr_ioc *mrioc,
+ struct dma_memory_desc *mem_desc)
+{
+ if (mem_desc->size > mrioc->cfg_page_sz) {
+ mem_desc->addr = dma_zalloc_coherent(&mrioc->pdev->dev,
+ mem_desc->size, &mem_desc->dma_addr, GFP_KERNEL);
+ if (!mem_desc->addr)
+ return -ENOMEM;
+ } else {
+ mem_desc->addr = mrioc->cfg_page;
+ mem_desc->dma_addr = mrioc->cfg_page_dma;
+ memset(mem_desc->addr, 0, mrioc->cfg_page_sz);
+ }
+ return 0;
+}
+
+
+/**
+ * mpi3mr_post_cfg_req - Issue config requests and wait
+ * @mrioc: Adapter instance reference
+ * @cfg_req: Configuration request
+ * @timeout: Timeout in seconds
+ * @ioc_status: Pointer to return ioc status
+ *
+ * A generic function for posting an MPI3 configuration request to
+ * the firmware. This blocks for the completion of the request for
+ * timeout seconds and if the request times out this function
+ * faults the controller with a proper reason code.
+ *
+ * On successful completion of the request this function returns
+ * the appropriate ioc status from the firmware back to the caller.
+ *
+ * Return: 0 on success, non-zero on failure.
+ */
+static int mpi3mr_post_cfg_req(struct mpi3mr_ioc *mrioc,
+ struct mpi3_config_request *cfg_req, int timeout, u16 *ioc_status)
+{
+ int retval = 0;
+
+ mutex_lock(&mrioc->cfg_cmds.mutex);
+ if (mrioc->cfg_cmds.state & MPI3MR_CMD_PENDING) {
+ retval = -1;
+ ioc_err(mrioc, "sending config request failed due to command in use\n");
+ mutex_unlock(&mrioc->cfg_cmds.mutex);
+ goto out;
+ }
+ mrioc->cfg_cmds.state = MPI3MR_CMD_PENDING;
+ mrioc->cfg_cmds.is_waiting = 1;
+ mrioc->cfg_cmds.callback = NULL;
+ mrioc->cfg_cmds.ioc_status = 0;
+ mrioc->cfg_cmds.ioc_loginfo = 0;
+
+ cfg_req->host_tag = cpu_to_le16(MPI3MR_HOSTTAG_CFG_CMDS);
+ cfg_req->function = MPI3_FUNCTION_CONFIG;
+
+ init_completion(&mrioc->cfg_cmds.done);
+ dprint_cfg_info(mrioc, "posting config request\n");
+ if (mrioc->logging_level & MPI3_DEBUG_CFG_INFO)
+ dprint_dump(cfg_req, sizeof(struct mpi3_config_request),
+ "mpi3_cfg_req");
+ retval = mpi3mr_admin_request_post(mrioc, cfg_req, sizeof(*cfg_req), 1);
+ if (retval) {
+ ioc_err(mrioc, "posting config request failed\n");
+ goto out_unlock;
+ }
+ wait_for_completion_timeout(&mrioc->cfg_cmds.done, (timeout * HZ));
+ if (!(mrioc->cfg_cmds.state & MPI3MR_CMD_COMPLETE)) {
+ mpi3mr_check_rh_fault_ioc(mrioc,
+ MPI3MR_RESET_FROM_CFG_REQ_TIMEOUT);
+ ioc_err(mrioc, "config request timed out\n");
+ retval = -1;
+ goto out_unlock;
+ }
+ *ioc_status = mrioc->cfg_cmds.ioc_status & MPI3_IOCSTATUS_STATUS_MASK;
+ if ((*ioc_status) != MPI3_IOCSTATUS_SUCCESS)
+ dprint_cfg_err(mrioc,
+ "cfg_page request returned with ioc_status(0x%04x), log_info(0x%08x)\n",
+ *ioc_status, mrioc->cfg_cmds.ioc_loginfo);
+
+out_unlock:
+ mrioc->cfg_cmds.state = MPI3MR_CMD_NOTUSED;
+ mutex_unlock(&mrioc->cfg_cmds.mutex);
+
+out:
+ return retval;
+}
+
+/**
+ * mpi3mr_process_cfg_req - config page request processor
+ * @mrioc: Adapter instance reference
+ * @cfg_req: Configuration request
+ * @cfg_hdr: Configuration page header
+ * @timeout: Timeout in seconds
+ * @ioc_status: Pointer to return ioc status
+ * @cfg_buf: Memory pointer to copy config page or header
+ * @cfg_buf_sz: Size of the memory to get config page or header
+ *
+ * This is the handler for config page read, write and config page
+ * header read operations.
+ *
+ * This function expects the cfg_req to be populated with page
+ * type, page number, action for the header read and with page
+ * address for all other operations.
+ *
+ * The cfg_hdr can be passed as null for reading the required header
+ * details; for read/write pages the cfg_hdr should point to a valid
+ * configuration page header.
+ *
+ * This allocates dmaable memory based on the size of the config
+ * buffer and sets the SGE of the cfg_req.
+ *
+ * For write actions, the config page data has to be passed in
+ * the cfg_buf and the size of the data has to be mentioned in the
+ * cfg_buf_sz.
+ *
+ * For read/header actions, on successful completion of the
+ * request with a successful ioc_status the data will be copied
+ * into the cfg_buf, limited to the minimum of the actual page size
+ * and cfg_buf_sz.
+ *
+ *
+ * Return: 0 on success, non-zero on failure.
+ */ +static int mpi3mr_process_cfg_req(struct mpi3mr_ioc *mrioc, + struct mpi3_config_request *cfg_req, + struct mpi3_config_page_header *cfg_hdr, int timeout, u16 *ioc_status, + void *cfg_buf, u32 cfg_buf_sz) +{ + struct dma_memory_desc mem_desc; + int retval = -1; + u8 invalid_action = 0; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + + memset(&mem_desc, 0, sizeof(struct dma_memory_desc)); + + if (cfg_req->action == MPI3_CONFIG_ACTION_PAGE_HEADER) + mem_desc.size = sizeof(struct mpi3_config_page_header); + else { + if (!cfg_hdr) { + ioc_err(mrioc, "null config header passed for config action(%d), page_type(0x%02x), page_num(%d)\n", + cfg_req->action, cfg_req->page_type, + cfg_req->page_number); + goto out; + } + switch (cfg_hdr->page_attribute & MPI3_CONFIG_PAGEATTR_MASK) { + case MPI3_CONFIG_PAGEATTR_READ_ONLY: + if (cfg_req->action + != MPI3_CONFIG_ACTION_READ_CURRENT) + invalid_action = 1; + break; + case MPI3_CONFIG_PAGEATTR_CHANGEABLE: + if ((cfg_req->action == + MPI3_CONFIG_ACTION_READ_PERSISTENT) || + (cfg_req->action == + MPI3_CONFIG_ACTION_WRITE_PERSISTENT)) + invalid_action = 1; + break; + case MPI3_CONFIG_PAGEATTR_PERSISTENT: + default: + break; + } + if (invalid_action) { + ioc_err(mrioc, + "config action(%d) is not allowed for page_type(0x%02x), page_num(%d) with page_attribute(0x%02x)\n", + cfg_req->action, cfg_req->page_type, + cfg_req->page_number, cfg_hdr->page_attribute); + goto out; + } + mem_desc.size = le16_to_cpu(cfg_hdr->page_length) * 4; + cfg_req->page_length = cfg_hdr->page_length; + cfg_req->page_version = cfg_hdr->page_version; + } + if (mpi3mr_alloc_config_dma_memory(mrioc, &mem_desc)) + goto out; + + mpi3mr_add_sg_single(&cfg_req->sgl, sgl_flags, mem_desc.size, + mem_desc.dma_addr); + + if ((cfg_req->action == MPI3_CONFIG_ACTION_WRITE_PERSISTENT) || + (cfg_req->action == MPI3_CONFIG_ACTION_WRITE_CURRENT)) { + memcpy(mem_desc.addr, cfg_buf, min_t(u16, mem_desc.size, + cfg_buf_sz)); + dprint_cfg_info(mrioc, "config buffer to be written\n"); + if (mrioc->logging_level & MPI3_DEBUG_CFG_INFO) + dprint_dump(mem_desc.addr, mem_desc.size, "cfg_buf"); + } + + if (mpi3mr_post_cfg_req(mrioc, cfg_req, timeout, ioc_status)) + goto out; + + retval = 0; + if ((*ioc_status == MPI3_IOCSTATUS_SUCCESS) && + (cfg_req->action != MPI3_CONFIG_ACTION_WRITE_PERSISTENT) && + (cfg_req->action != MPI3_CONFIG_ACTION_WRITE_CURRENT)) { + memcpy(cfg_buf, mem_desc.addr, min_t(u16, mem_desc.size, + cfg_buf_sz)); + dprint_cfg_info(mrioc, "config buffer read\n"); + if (mrioc->logging_level & MPI3_DEBUG_CFG_INFO) + dprint_dump(mem_desc.addr, mem_desc.size, "cfg_buf"); + } + +out: + mpi3mr_free_config_dma_memory(mrioc, &mem_desc); + return retval; +} + +/** + * mpi3mr_cfg_get_dev_pg0 - Read current device page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @dev_pg0: Pointer to return device page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like device handle + * + * This is handler for config page read for a specific device + * page0. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_dev_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_device_page0 *dev_pg0, u16 pg_sz, u32 form, u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(dev_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_DEVICE; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "device page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_DEVICE_PGAD_FORM_MASK) | + (form_spec & MPI3_DEVICE_PGAD_HANDLE_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, dev_pg0, pg_sz)) { + ioc_err(mrioc, "device page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + + +/** + * mpi3mr_cfg_get_sas_phy_pg0 - Read current SAS Phy page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @phy_pg0: Pointer to return SAS Phy page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like phy number + * + * This is handler for config page read for a specific SAS Phy + * page0. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_phy_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page0 *phy_pg0, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(phy_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_PHY; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas phy page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas phy page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_PHY_PGAD_FORM_MASK) | + (form_spec & MPI3_SAS_PHY_PGAD_PHY_NUMBER_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, phy_pg0, pg_sz)) { + ioc_err(mrioc, "sas phy page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_sas_phy_pg1 - Read current SAS Phy page1 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @phy_pg1: Pointer to return SAS Phy page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like phy number + * + * This is handler for config page read for a specific SAS Phy + * page1. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_phy_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_phy_page1 *phy_pg1, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(phy_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_PHY; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas phy page1 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas phy page1 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_PHY_PGAD_FORM_MASK) | + (form_spec & MPI3_SAS_PHY_PGAD_PHY_NUMBER_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, phy_pg1, pg_sz)) { + ioc_err(mrioc, "sas phy page1 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + + +/** + * mpi3mr_cfg_get_sas_exp_pg0 - Read current SAS Expander page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @exp_pg0: Pointer to return SAS Expander page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like device handle + * + * This is handler for config page read for a specific SAS + * Expander page0. The ioc_status has the controller returned + * ioc_status. This routine doesn't check ioc_status to decide + * whether the page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_exp_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page0 *exp_pg0, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(exp_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_EXPANDER; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "expander page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "expander page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_EXPAND_PGAD_FORM_MASK) | + (form_spec & (MPI3_SAS_EXPAND_PGAD_PHYNUM_MASK | + MPI3_SAS_EXPAND_PGAD_HANDLE_MASK))); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, exp_pg0, pg_sz)) { + ioc_err(mrioc, "expander page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_sas_exp_pg1 - Read current SAS Expander page1 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @exp_pg1: Pointer to return SAS Expander page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like phy number + * + * This is handler for config page read for a specific SAS + * Expander page1. The ioc_status has the controller returned + * ioc_status. This routine doesn't check ioc_status to decide + * whether the page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_sas_exp_pg1(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_sas_expander_page1 *exp_pg1, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(exp_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_EXPANDER; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "expander page1 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "expander page1 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_SAS_EXPAND_PGAD_FORM_MASK) | + (form_spec & (MPI3_SAS_EXPAND_PGAD_PHYNUM_MASK | + MPI3_SAS_EXPAND_PGAD_HANDLE_MASK))); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, exp_pg1, pg_sz)) { + ioc_err(mrioc, "expander page1 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_enclosure_pg0 - Read current Enclosure page0 + * @mrioc: Adapter instance reference + * @ioc_status: Pointer to return ioc status + * @encl_pg0: Pointer to return Enclosure page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * @form: The form to be used for addressing the page + * @form_spec: Form specific information like device handle + * + * This is handler for config page read for a specific Enclosure + * page0. The ioc_status has the controller returned ioc_status. + * This routine doesn't check ioc_status to decide whether the + * page read is success or not and it is the callers + * responsibility. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_enclosure_pg0(struct mpi3mr_ioc *mrioc, u16 *ioc_status, + struct mpi3_enclosure_page0 *encl_pg0, u16 pg_sz, u32 form, + u32 form_spec) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u32 page_address; + + memset(encl_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_ENCLOSURE; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "enclosure page0 header read failed \n"); + goto out_failed; + } + if (*ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "enclosure page0 header read failed with ioc_status(0x%04x)\n", + *ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + page_address = ((form & MPI3_ENCLOS_PGAD_FORM_MASK) | + (form_spec & MPI3_ENCLOS_PGAD_HANDLE_MASK)); + cfg_req.page_address = cpu_to_le32(page_address); + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, ioc_status, encl_pg0, pg_sz)) { + ioc_err(mrioc, "enclosure page0 read failed \n"); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + + +/** + * mpi3mr_cfg_get_sas_io_unit_pg0 - Read current SASIOUnit page0 + * @mrioc: Adapter instance reference + * @sas_io_unit_pg0: Pointer to return SAS IO Unit page 0 + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page read for the SAS IO Unit + * page0. This routine checks ioc_status to decide whether the + * page read is success or not. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_cfg_get_sas_io_unit_pg0(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(sas_io_unit_pg0, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT; + cfg_req.page_number = 0; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas io unit page0 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page0 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg0, pg_sz)) { + ioc_err(mrioc, "sas io unit page0 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page0 read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_sas_io_unit_pg1 - Read current SASIOUnit page1 + * @mrioc: Adapter instance reference + * @sas_io_unit_pg1: Pointer to return SAS IO Unit page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page read for the SAS IO Unit + * page1. 
This routine checks ioc_status to decide whether the + * page read is success or not. + * + * Return: 0 on success, non-zero on failure. + */ +int mpi3mr_cfg_get_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(sas_io_unit_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas io unit page1 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg1, pg_sz)) { + ioc_err(mrioc, "sas io unit page1 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_set_sas_io_unit_pg1 - Write SASIOUnit page1 + * @mrioc: Adapter instance reference + * @sas_io_unit_pg1: Pointer to the SAS IO Unit page 1 to write + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page write for the SAS IO Unit + * page1. This routine checks ioc_status to decide whether the + * page read is success or not. This will modify both current + * and persistent page. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_set_sas_io_unit_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_SAS_IO_UNIT; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "sas io unit page1 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_WRITE_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg1, pg_sz)) { + ioc_err(mrioc, "sas io unit page1 write current failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 write current failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + + cfg_req.action = MPI3_CONFIG_ACTION_WRITE_PERSISTENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, sas_io_unit_pg1, pg_sz)) { + ioc_err(mrioc, "sas io unit page1 write persistent failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "sas io unit page1 write persistent failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} + +/** + * mpi3mr_cfg_get_driver_pg1 - Read current Driver page1 + * @mrioc: Adapter instance reference + * @driver_pg1: Pointer to return Driver page 1 + * @pg_sz: Size of the memory allocated to the page pointer + * + * This is handler for config page read for the Driver page1. + * This routine checks ioc_status to decide whether the page + * read is success or not. + * + * Return: 0 on success, non-zero on failure. 
+ */ +int mpi3mr_cfg_get_driver_pg1(struct mpi3mr_ioc *mrioc, + struct mpi3_driver_page1 *driver_pg1, u16 pg_sz) +{ + struct mpi3_config_page_header cfg_hdr; + struct mpi3_config_request cfg_req; + u16 ioc_status = 0; + + memset(driver_pg1, 0, pg_sz); + memset(&cfg_hdr, 0, sizeof(cfg_hdr)); + memset(&cfg_req, 0, sizeof(cfg_req)); + + cfg_req.function = MPI3_FUNCTION_CONFIG; + cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER; + cfg_req.page_type = MPI3_CONFIG_PAGETYPE_DRIVER; + cfg_req.page_number = 1; + cfg_req.page_address = 0; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) { + ioc_err(mrioc, "driver page1 header read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "driver page1 header read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + cfg_req.action = MPI3_CONFIG_ACTION_READ_CURRENT; + + if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr, + MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, driver_pg1, pg_sz)) { + ioc_err(mrioc, "driver page1 read failed \n"); + goto out_failed; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "driver page1 read failed with ioc_status(0x%04x)\n", + ioc_status); + goto out_failed; + } + return 0; +out_failed: + return -1; +} +/** + * mpi3mr_cfg_get_driver_pg2 - Read current driver page2 + * @mrioc: Adapter instance reference + * @driver_pg2: Pointer to return driver page 2 + * @pg_sz: Size of the memory allocated to the page pointer + * @page_action: Page action + * + * This is handler for config page read for the driver page2. + * This routine checks ioc_status to decide whether the page + * read is success or not. + * + * Return: 0 on success, non-zero on failure. 
+ */
+int mpi3mr_cfg_get_driver_pg2(struct mpi3mr_ioc *mrioc,
+ struct mpi3_driver_page2 *driver_pg2, u16 pg_sz, u8 page_action)
+{
+ struct mpi3_config_page_header cfg_hdr;
+ struct mpi3_config_request cfg_req;
+ u16 ioc_status = 0;
+
+ memset(driver_pg2, 0, pg_sz);
+ memset(&cfg_hdr, 0, sizeof(cfg_hdr));
+ memset(&cfg_req, 0, sizeof(cfg_req));
+
+ cfg_req.function = MPI3_FUNCTION_CONFIG;
+ cfg_req.action = MPI3_CONFIG_ACTION_PAGE_HEADER;
+ cfg_req.page_type = MPI3_CONFIG_PAGETYPE_DRIVER;
+ cfg_req.page_number = 2;
+ cfg_req.page_address = 0;
+ cfg_req.page_version = MPI3_DRIVER2_PAGEVERSION;
+
+ if (mpi3mr_process_cfg_req(mrioc, &cfg_req, NULL,
+ MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, &cfg_hdr, sizeof(cfg_hdr))) {
+ ioc_err(mrioc, "driver page2 header read failed \n");
+ goto out_failed;
+ }
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "driver page2 header read failed with "
+ "ioc_status(0x%04x)\n",
+ ioc_status);
+ goto out_failed;
+ }
+ cfg_req.action = page_action;
+
+ if (mpi3mr_process_cfg_req(mrioc, &cfg_req, &cfg_hdr,
+ MPI3MR_INTADMCMD_TIMEOUT, &ioc_status, driver_pg2, pg_sz)) {
+ ioc_err(mrioc, "driver page2 read failed \n");
+ goto out_failed;
+ }
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "driver page2 read failed with "
+ "ioc_status(0x%04x)\n",
+ ioc_status);
+ goto out_failed;
+ }
+ return 0;
+out_failed:
+ return -1;
+}
+
diff --git a/drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h b/drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h
new file mode 100644
index 0000000000000..6fa51466a911a
--- /dev/null
+++ b/drivers/scsi/mpi3mr/mpi3mr_kernel_compat.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Broadcom MPI3 Storage Controllers
+ *
+ * Copyright (C) 2017-2022 Broadcom Inc.
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com)
+ *
+ */
+#include <linux/version.h>
+
+struct mpi3mr_kmsg_dumper {
+#if ((KERNEL_VERSION(5,13,0) <= LINUX_VERSION_CODE) || \
+ (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR >= 6)))
+ struct kmsg_dump_iter kdumper;
+#else
+ struct kmsg_dumper kdumper;
+#endif
+};
+
+static inline void mpi3mr_set_dumper_active(struct mpi3mr_kmsg_dumper *dumper)
+{
+#if ((KERNEL_VERSION(5,13,0) <= LINUX_VERSION_CODE) || \
+ (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR >= 6)))
+ return;
+#else
+ dumper->kdumper.active = true;
+ return;
+#endif
+}
+
+#if (KERNEL_VERSION(5,15,0) <= LINUX_VERSION_CODE)
+#define SCMD_GET_REQUEST(scmd) scsi_cmd_to_rq(scmd)
+#else
+#define SCMD_GET_REQUEST(scmd) scmd->request
+#endif
+
+#if (KERNEL_VERSION(5,16,0) <= LINUX_VERSION_CODE)
+#define SCMD_DONE(scmd) scsi_done(scmd)
+#else
+#define SCMD_DONE(scmd) scmd->scsi_done(scmd)
+#endif
+
+static inline u32 mpi3mr_kc_prot_ref_tag(struct scsi_cmnd *scmd)
+{
+#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0)) || \
+ (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \
+ (defined(CONFIG_SUSE_KERNEL) && ((CONFIG_SUSE_VERSION == 15) && \
+ (CONFIG_SUSE_PATCHLEVEL >= 1))))
+ return t10_pi_ref_tag(SCMD_GET_REQUEST(scmd));
+#else
+ return scsi_prot_ref_tag(scmd);
+#endif
+}
+
+static inline bool mpi3mr_use_blk_mq(struct Scsi_Host *shost)
+{
+#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)))
+ return true;
+#else
+ return shost_use_blk_mq(shost);
+#endif
+}
+
+/* Revisit enabling Shared HostTag for RHEL8x kernels */
+#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8) && (RHEL_MINOR >= 6)) || \
+ (LINUX_VERSION_CODE >= KERNEL_VERSION(5,16,0)))
+#define HOST_TAGSET_SUPPORT
+#endif
+
+#if (defined(SCMD_STATE_INFLIGHT) && !defined(HOST_TAGSET_SUPPORT))
+#define IO_COUNTER_SUPPORT
+#endif
+
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0))
+#define dma_zalloc_coherent dma_alloc_coherent
+#endif
+
+#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)) || \
+ (defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)))
+#define BLK_ITER_CALLBACK_RET_TYPE bool
+#define BLK_ITER_CALLBACK_RET_VAL(x) return x
+#else
+#define BLK_ITER_CALLBACK_RET_TYPE void
+#define BLK_ITER_CALLBACK_RET_VAL(x) return
+#endif
+
+/**
+ * mpi3mr_scsi_build_sense - build sense data
+ * @scmd: scsi command object
+ * @desc: Sense format (non zero == descriptor format,
+ * 0 == fixed format)
+ * @key: Sense key
+ * @asc: Additional sense code
+ * @ascq: Additional sense code qualifier
+ **/
+static inline void mpi3mr_scsi_build_sense(struct scsi_cmnd *scmd,
+ int desc, u8 key, u8 asc, u8 ascq)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,14,0))
+ scsi_build_sense_buffer(desc, scmd->sense_buffer, key, asc, ascq);
+ scmd->result = (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
+ set_host_byte(scmd, DID_OK);
+#else
+ scsi_build_sense(scmd, desc, key, asc, ascq);
+#endif
+}
+
+#ifndef fallthrough
+#define fallthrough
+#endif
diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
new file mode 100644
index 0000000000000..4718a01aa2563
--- /dev/null
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -0,0 +1,6019 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Broadcom MPI3 Storage Controllers
+ *
+ * Copyright (C) 2017-2022 Broadcom Inc.
+ * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" + +/* Global driver scope variables */ +LIST_HEAD(mrioc_list); +DEFINE_SPINLOCK(mrioc_list_lock); +static int mrioc_ids; +static int warn_non_secure_ctlr; +atomic64_t event_counter; + +MODULE_AUTHOR(MPI3MR_DRIVER_AUTHOR); +MODULE_DESCRIPTION(MPI3MR_DRIVER_DESC); +MODULE_LICENSE(MPI3MR_DRIVER_LICENSE); +MODULE_VERSION(MPI3MR_DRIVER_VERSION); + +/* Module parameters*/ +static int logging_level; +module_param(logging_level, int, 0444); +MODULE_PARM_DESC(logging_level, + " Enable additional logging info (default=0)"); + +static bool enable_dif = true; +module_param(enable_dif, bool, 0444); +MODULE_PARM_DESC(enable_dif, + "Enable Data Integrity Format (DIF) support (Default = 1)"); + +bool enable_dix; +module_param(enable_dix, bool, 0444); +MODULE_PARM_DESC(enable_dix, + "Enable Data Integrity Extensions (DIX) support (Default = 0)"); + +extern bool enable_segqueue; +extern int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num); +extern struct dentry *mpi3mr_debugfs_root; +extern void mpi3mr_init_debugfs(void); +extern void mpi3mr_exit_debugfs(void); +extern void mpi3mr_setup_debugfs(struct mpi3mr_ioc *mrioc); +extern void mpi3mr_destroy_debugfs(struct mpi3mr_ioc *mrioc); + +/* Forward declarations*/ +static int mpi3mr_change_queue_depth(struct scsi_device *sdev, + int q_depth); +static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle, + struct mpi3mr_drv_cmd *cmdparam, u8 iou_rc); +static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + struct mpi3mr_drv_cmd *cmdparam, u32 event_ctx); +static void mpi3mr_fwevt_worker(struct work_struct *work); + +#define MPI3MR_DRIVER_EVENT_PROCESS_TRIGGER (0xFFFD) +#define MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH (0xFFFE) +#define MPI3MR_DRIVER_EVENT_TG_QD_REDUCTION (0xFFFF) + +/** + * struct delayed_dev_rmhs_node - Delayed device removal node + * + * @list: list head + * @handle: Device handle + * @iou_rc: IO Unit Control Reason Code + */ +struct delayed_dev_rmhs_node { + struct list_head list; + u16 handle; + u8 iou_rc; +}; + +/** + * struct delayed_evt_ack_node - Delayed event ack node + * + * @list: list head + * @event: MPI3 event ID + * @event_ctx: event context + */ +struct delayed_evt_ack_node { + struct list_head list; + u8 event; + u32 event_ctx; +}; + +/** + * mpi3mr_fwevt_free - firmware event memory dealloctor + * @r: k reference pointer of the firmware event + * + * Free firmware event memory when no reference. + */ +static void mpi3mr_fwevt_free(struct kref *r) +{ + kfree(container_of(r, struct mpi3mr_fwevt, ref_count)); +} + +/** + * mpi3mr_fwevt_get - k reference incrementor + * @fwevt: Firmware event reference + * + * Increment firmware event reference count. + */ +static void mpi3mr_fwevt_get(struct mpi3mr_fwevt *fwevt) +{ + kref_get(&fwevt->ref_count); +} + +/** + * mpi3mr_fwevt_put - k reference decrementor + * @fwevt: Firmware event reference + * + * decrement firmware event reference count. + */ +static void mpi3mr_fwevt_put(struct mpi3mr_fwevt *fwevt) +{ + kref_put(&fwevt->ref_count, mpi3mr_fwevt_free); +} + +/** + * mpi3mr_alloc_fwevt - Allocate firmware event + * @len: length of firmware event data to allocate + * + * Allocate firmware event with required length and initialize + * the reference counter. + * + * Return: firmware event reference. 
+ */ +static struct mpi3mr_fwevt *mpi3mr_alloc_fwevt(int len) +{ + struct mpi3mr_fwevt *fwevt; + + fwevt = kzalloc(sizeof(*fwevt) + len, GFP_ATOMIC); + if (!fwevt) + return NULL; + + kref_init(&fwevt->ref_count); + return fwevt; +} + +/** + * mpi3mr_fwevt_add_to_list - Add firmware event to the list + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Add the given firmware event to the firmware event list. + * + * Return: Nothing. + */ +static void mpi3mr_fwevt_add_to_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + unsigned long flags; + + if (!mrioc->fwevt_worker_thread) + return; + + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + /* get fwevt reference count while adding it to fwevt_list */ + mpi3mr_fwevt_get(fwevt); + INIT_LIST_HEAD(&fwevt->list); + list_add_tail(&fwevt->list, &mrioc->fwevt_list); + INIT_WORK(&fwevt->work, mpi3mr_fwevt_worker); + /* get fwevt reference count while enqueueing it to worker queue */ + mpi3mr_fwevt_get(fwevt); + queue_work(mrioc->fwevt_worker_thread, &fwevt->work); + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); +} +/** + * mpi3mr_hdb_trigger_data_event - Add hdb trigger data event to + * the list + * @mrioc: Adapter instance reference + * @event_data: Event data + * + * Add the given hdb trigger data event to the firmware event + * list. + * + * Return: Nothing. + */ +void mpi3mr_hdb_trigger_data_event(struct mpi3mr_ioc *mrioc, + struct trigger_event_data *event_data) +{ + struct mpi3mr_fwevt *fwevt; + u16 sz = sizeof(*event_data); + + fwevt = mpi3mr_alloc_fwevt(sz); + if (!fwevt) { + ioc_warn(mrioc, "failed to queue hdb trigger data event\n"); + return; + } + + fwevt->mrioc = mrioc; + fwevt->event_id = MPI3MR_DRIVER_EVENT_PROCESS_TRIGGER; + fwevt->send_ack = 0; + fwevt->process_event = 1; + fwevt->event_context = 0; + fwevt->event_data_size = sz; + memcpy(fwevt->event_data, event_data, sz); + + mpi3mr_fwevt_add_to_list(mrioc, fwevt); +} + +/** + * mpi3mr_fwevt_del_from_list - Delete firmware event from list + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Delete the given firmware event from the firmware event list. + * + * Return: Nothing. + */ +static void mpi3mr_fwevt_del_from_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + if (!list_empty(&fwevt->list)) { + list_del_init(&fwevt->list); + /* + * Put fwevt reference count after + * removing it from fwevt_list + */ + mpi3mr_fwevt_put(fwevt); + } + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); +} + +/** + * mpi3mr_dequeue_fwevt - Dequeue firmware event from the list + * @mrioc: Adapter instance reference + * + * Dequeue a firmware event from the firmware event list. + * + * Return: firmware event. + */ +static struct mpi3mr_fwevt *mpi3mr_dequeue_fwevt( + struct mpi3mr_ioc *mrioc) +{ + unsigned long flags; + struct mpi3mr_fwevt *fwevt = NULL; + + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + if (!list_empty(&mrioc->fwevt_list)) { + fwevt = list_first_entry(&mrioc->fwevt_list, + struct mpi3mr_fwevt, list); + list_del_init(&fwevt->list); + /* + * Put fwevt reference count after + * removing it from fwevt_list + */ + mpi3mr_fwevt_put(fwevt); + } + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); + + return fwevt; +} + +/** + * mpi3mr_cleanup_fwevt_list - Cleanup firmware event list + * @mrioc: Adapter instance reference + * + * Flush all pending firmware events from the firmware event + * list. + * + * Return: Nothing. 
+ */ +void mpi3mr_cleanup_fwevt_list(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_fwevt *fwevt = NULL; + + if ((list_empty(&mrioc->fwevt_list) && !mrioc->current_event) || + !mrioc->fwevt_worker_thread) + return; + dprint_reset(mrioc, "flushing firmware events\n"); + while ((fwevt = mpi3mr_dequeue_fwevt(mrioc))) { + /* + * Wait on the fwevt to complete. If this returns 1, then + * the event was never executed, and we need a put for the + * reference the work had on the fwevt. + */ + if (cancel_work_sync(&fwevt->work)) { + /* + * Put fwevt reference count after + * dequeuing it from worker queue + */ + mpi3mr_fwevt_put(fwevt); + /* + * Put fwevt reference count to neutralize + * kref_init increment + */ + mpi3mr_fwevt_put(fwevt); + } + } + if (mrioc->current_event) { + fwevt = mrioc->current_event; + /* + * Don't call cancel_work_sync() API for the + * fwevt work if the controller reset is + * get called as part of processing the + * same fwevt work (or) when worker thread is + * waiting for device add/remove APIs to complete. + * Otherwise we will see deadlock. + */ + if (current_work() == &fwevt->work || fwevt->pending_at_sml) { + fwevt->discard = 1; + return; + } + + /* + * Wait on the fwevt to complete. If this returns 1, then + * the event was never executed, and we need a put for the + * reference the work had on the fwevt. + * + * If it did execute, we wait for it to finish, and the put will + * happen from mpi3mr_process_fwevt() + */ + if (cancel_work_sync(&fwevt->work)) { + /* + * Put fwevt reference count after + * dequeuing it from worker queue + */ + mpi3mr_fwevt_put(fwevt); + /* + * Put fwevt reference count to neutralize + * kref_init increment + */ + mpi3mr_fwevt_put(fwevt); + } + } +} + +/** + * mpi3mr_queue_qd_reduction_event -Queue TG QD reduction event + * @mrioc: Adapter instance reference + * @tg: Throttle group information pointer + * + * Accessor to queue on synthetically generated driver event to + * the event worker thread, the driver event will be used to + * reduce the QD of all VDs in the TG from the worker thread. + * + * Return: None. + */ +static void mpi3mr_queue_qd_reduction_event(struct mpi3mr_ioc *mrioc, + struct mpi3mr_throttle_group_info *tg) +{ + struct mpi3mr_fwevt *fwevt; + u16 sz = sizeof(struct mpi3mr_throttle_group_info *); + + /* If the QD reduction event is already queued due to throttle and if + the QD is not restored through device info change event + then dont queue further reduction events*/ + if (tg->fw_qd != tg->modified_qd) + return; + + fwevt = mpi3mr_alloc_fwevt(sz); + if (!fwevt) { + ioc_warn(mrioc,"failed to queue TG QD reduction event\n"); + return; + } + *(__le64 *)fwevt->event_data = (__le64)tg; + fwevt->mrioc = mrioc; + fwevt->event_id = MPI3MR_DRIVER_EVENT_TG_QD_REDUCTION; + fwevt->send_ack = 0; + fwevt->process_event = 1; + fwevt->event_context = 0; + fwevt->event_data_size = sz; + tg->modified_qd = max_t(u16, (tg->fw_qd * tg->qd_reduction) / 10, 8); + + dprint_event_bh(mrioc, "qd reduction event queued for tg_id(%d)\n", + tg->id); + mpi3mr_fwevt_add_to_list(mrioc, fwevt); +} + +/** + * mpi3mr_host_tag_for_scmd - Get host tag for a scmd + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * + * Calculate the host tag based on block tag for a given scmd. + * + * Return: Valid host tag or MPI3MR_HOSTTAG_INVALID. 
+ */ +static u16 mpi3mr_host_tag_for_scmd(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd) +{ + struct scmd_priv *priv = NULL; + u32 unique_tag; + u16 host_tag, hw_queue; + + unique_tag = blk_mq_unique_tag(SCMD_GET_REQUEST(scmd)); + + if (mpi3mr_use_blk_mq(mrioc->shost)) { + hw_queue = blk_mq_unique_tag_to_hwq(unique_tag); + if (hw_queue >= mrioc->num_op_reply_q) + return MPI3MR_HOSTTAG_INVALID; + host_tag = blk_mq_unique_tag_to_tag(unique_tag); + } else { + hw_queue = raw_smp_processor_id() % mrioc->num_op_reply_q; + host_tag = unique_tag & 0xFFFF; + } + + if (WARN_ON(host_tag >= mrioc->max_host_ios)) + return MPI3MR_HOSTTAG_INVALID; + + priv = scsi_cmd_priv(scmd); + /*host_tag 0 is invalid hence incrementing by 1*/ + priv->host_tag = host_tag + 1; + priv->scmd = scmd; + priv->in_lld_scope = 1; + priv->req_q_idx = hw_queue; + priv->meta_chain_idx = -1; + priv->chain_idx = -1; + priv->meta_sg_valid = 0; + return priv->host_tag; +} + +/** + * mpi3mr_scmd_from_host_tag - Get SCSI command from host tag + * @mrioc: Adapter instance reference + * @host_tag: Host tag + * @qidx: Operational queue index + * + * Identify the block tag from the host tag and queue index and + * retrieve associated scsi command using scsi_host_find_tag(). + * + * Return: SCSI command reference or NULL. + */ +static struct scsi_cmnd *mpi3mr_scmd_from_host_tag( + struct mpi3mr_ioc *mrioc, u16 host_tag, u16 qidx) +{ + struct scsi_cmnd *scmd = NULL; + struct scmd_priv *priv = NULL; + u32 unique_tag = host_tag - 1; + + if (WARN_ON(host_tag > mrioc->max_host_ios)) + goto out; + + if (mpi3mr_use_blk_mq(mrioc->shost)) + unique_tag |= (qidx << BLK_MQ_UNIQUE_TAG_BITS); + + scmd = scsi_host_find_tag(mrioc->shost, unique_tag); + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + scmd = NULL; + } +out: + return scmd; +} + +/** + * mpi3mr_clear_scmd_priv - Cleanup SCSI command private date + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * + * Invalidate the SCSI command private data to mark the command + * is not in LLD scope anymore. + * + * Return: Nothing. + */ +static void mpi3mr_clear_scmd_priv(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd) +{ + struct scmd_priv *priv = NULL; + + priv = scsi_cmd_priv(scmd); + + if (WARN_ON(priv->in_lld_scope == 0)) + return; + priv->host_tag = MPI3MR_HOSTTAG_INVALID; + priv->req_q_idx = 0xFFFF; + priv->scmd = NULL; + priv->in_lld_scope = 0; + priv->meta_sg_valid = 0; + if (priv->chain_idx >= 0) { + clear_bit(priv->chain_idx, mrioc->chain_bitmap); + priv->chain_idx = -1; + } + if (priv->meta_chain_idx >= 0) { + clear_bit(priv->meta_chain_idx, mrioc->chain_bitmap); + priv->meta_chain_idx = -1; + } +} + +/** + * mpi3mr_invalidate_devhandles -Invalidate device handles + * @mrioc: Adapter instance reference + * + * Invalidate the device handles in the target device structures + * . Called post reset prior to reinitializing the controller. + * + * Return: Nothing. 
+ */ +void mpi3mr_invalidate_devhandles(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *tgt_priv; + + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) { + tgtdev->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + tgt_priv->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + tgt_priv->io_throttle_enabled = 0; + tgt_priv->io_divert = 0; + tgt_priv->throttle_group = NULL; + if (tgtdev->host_exposed) + atomic_set(&tgt_priv->block_io, 1); + } + } +} + +/** + * mpi3mr_print_scmd - print individual SCSI command + * @rq: Block request + * @data: Adapter instance reference + * @reserved: N/A. Currently not used + * + * Print the SCSI command details if it is in LLD scope. + * + * Return: true always. + */ +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_print_scmd(struct request *rq, + void *data, bool reserved) +{ + struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv = NULL; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + dprint_reset(mrioc, "host_tag=%d, qid=%d\n", priv->host_tag, + priv->req_q_idx + 1); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_RESET); + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_flush_scmd - Flush individual SCSI command + * @rq: Block request + * @data: Adapter instance reference + * @reserved: N/A. Currently not used + * + * Return the SCSI command to the upper layers if it is in LLD + * scope. + * + * Return: true always. + */ + +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_flush_scmd(struct request *rq, + void *data, bool reserved) +{ + struct mpi3mr_ioc *mrioc = (struct mpi3mr_ioc *)data; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv = NULL; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + + if (priv->meta_sg_valid) + dma_unmap_sg(&mrioc->pdev->dev, scsi_prot_sglist(scmd), + scsi_prot_sg_count(scmd), scmd->sc_data_direction); + mpi3mr_clear_scmd_priv(mrioc, scmd); + scsi_dma_unmap(scmd); + if (mrioc->unrecoverable) + set_host_byte(scmd, DID_NO_CONNECT); + else + set_host_byte(scmd, DID_REQUEUE); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_RESET); + SCMD_DONE(scmd); + mrioc->flush_io_count++; + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_count_dev_pending - Count commands pending for a lun + * @rq: Block request + * @data: SCSI device reference + * @reserved: Unused + * + * This is an iterator function called for each SCSI command in + * a host and if the command is pending in the LLD for the + * specific device(lun) then device specific pending I/O counter + * is updated in the device structure. + * + * Return: true always. 
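+ *
+ * Note: like mpi3mr_flush_scmd() above, this is meant to be used as a
+ * blk_mq_tagset_busy_iter() callback; the caller presumably resets
+ * pend_count in the device private data before starting the walk.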
+ */ + +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_count_dev_pending(struct request *rq, + void *data, bool reserved) +{ + struct scsi_device *sdev = (struct scsi_device *)data; + struct mpi3mr_sdev_priv_data *sdev_priv_data = sdev->hostdata; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + if (scmd->device == sdev) + sdev_priv_data->pend_count++; + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_count_tgt_pending - Count commands pending for target + * @rq: Block request + * @data: SCSI target reference + * @reserved: Unused + * + * This is an iterator function called for each SCSI command in + * a host and if the command is pending in the LLD for the + * specific target then target specific pending I/O counter is + * updated in the target structure. + * + * Return: true always. + */ + +static BLK_ITER_CALLBACK_RET_TYPE mpi3mr_count_tgt_pending(struct request *rq, + void *data, bool reserved) +{ + struct scsi_target *starget = (struct scsi_target *)data; + struct mpi3mr_stgt_priv_data *stgt_priv_data = starget->hostdata; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + struct scmd_priv *priv; + + if (scmd) { + priv = scsi_cmd_priv(scmd); + if (!priv->in_lld_scope) + goto out; + if (scmd->device && (scsi_target(scmd->device) == starget)) + stgt_priv_data->pend_count++; + } + +out: + BLK_ITER_CALLBACK_RET_VAL(true); +} + +/** + * mpi3mr_flush_host_io - Flush host I/Os + * @mrioc: Adapter instance reference + * + * Flush all of the pending I/Os by calling + * blk_mq_tagset_busy_iter() for each possible tag. This is + * executed post controller reset + * + * Return: Nothing. + */ +void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc) +{ + struct Scsi_Host *shost = mrioc->shost; + + mrioc->flush_io_count = 0; + ioc_info(mrioc, "flushing host I/O cmds post reset\n"); + blk_mq_tagset_busy_iter(&shost->tag_set, + mpi3mr_flush_scmd, (void *)mrioc); + ioc_info(mrioc, "flushed %d host I/O cmds\n", mrioc->flush_io_count); +} + +/** + * mpi3mr_flush_cmds_for_unrecovered_controller- Flush all pend cmds + * @mrioc: Adapter instance reference + * + * This function waits for currently running IO poll threads to + * exit and then flushes all host I/Os and any internal pending + * cmds. This is executed after controller is marked as + * unrecoverable. + * + * Return: Nothing. + */ +void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc) +{ + struct Scsi_Host *shost = mrioc->shost; + int i; + + if (!mrioc->unrecoverable) + return; + + if (mrioc->op_reply_qinfo) + { + for (i = 0; i < mrioc->num_queues; i++) { + while (atomic_read(&mrioc->op_reply_qinfo[i].in_use)) + udelay(500); + atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0); + } + } + mrioc->flush_io_count = 0; + blk_mq_tagset_busy_iter(&shost->tag_set, + mpi3mr_flush_scmd, (void *)mrioc); + mpi3mr_flush_delayed_cmd_lists(mrioc); + mpi3mr_flush_drv_cmds(mrioc); +} + +/** + * mpi3mr_alloc_tgtdev - target device allocator + * @void: No arguments + * + * Allocate target device instance and initialize the reference + * count + * + * Return: target device instance. 
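+ *
+ * Note: the device is returned with its kref initialized to one, and
+ * every reference handed out by the get accessors below must be
+ * dropped with mpi3mr_tgtdev_put().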
+ */ +static struct mpi3mr_tgt_dev *mpi3mr_alloc_tgtdev(void) +{ + struct mpi3mr_tgt_dev *tgtdev; + + tgtdev = kzalloc(sizeof(*tgtdev), GFP_ATOMIC); + if (!tgtdev) + return NULL; + kref_init(&tgtdev->ref_count); + return tgtdev; +} + +/** + * mpi3mr_tgtdev_add_to_list -Add tgtdevice to the list + * @mrioc: Adapter instance reference + * @tgtdev: Target device + * + * Add the target device to the target device list + * + * Return: Nothing. + */ +static void mpi3mr_tgtdev_add_to_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + mpi3mr_tgtdev_get(tgtdev); + INIT_LIST_HEAD(&tgtdev->list); + list_add_tail(&tgtdev->list, &mrioc->tgtdev_list); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * mpi3mr_tgtdev_del_from_list -Delete tgtdevice from the list + * @mrioc: Adapter instance reference + * @tgtdev: Target device + * + * Remove the target device from the target device list + * + * Return: Nothing. + */ +static void mpi3mr_tgtdev_del_from_list(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (!list_empty(&tgtdev->list)) { + list_del_init(&tgtdev->list); + mpi3mr_tgtdev_put(tgtdev); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * __mpi3mr_get_tgtdev_by_handle -Get tgtdev from device handle + * @mrioc: Adapter instance reference + * @handle: Device handle + * + * Accessor to retrieve target device from the device handle. + * Non Lock version + * + * Return: Target device reference. + */ +static struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_handle( + struct mpi3mr_ioc *mrioc, u16 handle) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->dev_handle == handle) + goto found_tgtdev; + return NULL; + +found_tgtdev: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_by_handle -Get tgtdev from device handle + * @mrioc: Adapter instance reference + * @handle: Device handle + * + * Accessor to retrieve target device from the device handle. + * Lock version + * + * Return: Target device reference. + */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_handle( + struct mpi3mr_ioc *mrioc, u16 handle) +{ + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_handle(mrioc, handle); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return tgtdev; +} + +/** + * __mpi3mr_get_tgtdev_by_perst_id -Get tgtdev from persist ID + * @mrioc: Adapter instance reference + * @persist_id: Persistent ID + * + * Accessor to retrieve target device from the Persistent ID. + * Non Lock version + * + * Return: Target device reference. + */ +static struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_perst_id( + struct mpi3mr_ioc *mrioc, u16 persist_id) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->perst_id == persist_id) + goto found_tgtdev; + return NULL; + +found_tgtdev: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_by_perst_id -Get tgtdev from persistent ID + * @mrioc: Adapter instance reference + * @persist_id: Persistent ID + * + * Accessor to retrieve target device from the Persistent ID. + * Lock version + * + * Return: Target device reference. 
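+ *
+ * Note: the __ prefixed lookup helpers assert that tgtdev_lock is
+ * already held, while the lock versions acquire tgtdev_lock
+ * themselves; both return the device with an additional reference
+ * that the caller must release with mpi3mr_tgtdev_put().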
+ */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_perst_id( + struct mpi3mr_ioc *mrioc, u16 persist_id) +{ + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, persist_id); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return tgtdev; +} + + +/** + * __mpi3mr_get_tgtdev_from_tgtpriv -Get tgtdev from tgt private + * @mrioc: Adapter instance reference + * @tgt_priv: Target private data + * + * Accessor to return target device from the target private + * data. Non Lock version + * + * Return: Target device reference. + */ +static struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_from_tgtpriv( + struct mpi3mr_ioc *mrioc, struct mpi3mr_stgt_priv_data *tgt_priv) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + tgtdev = tgt_priv->tgt_dev; + if (tgtdev) + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_from_tgtpriv -Get tgtdev from tgt priv data + * @mrioc: Adapter instance reference + * @tgt_priv: Target private data + * + * Accessor to return target device from the target private + * data. Lock version + * + * Return: Target device reference. + */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_from_tgtpriv( + struct mpi3mr_ioc *mrioc, struct mpi3mr_stgt_priv_data *tgt_priv) +{ + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_from_tgtpriv(mrioc, tgt_priv); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return tgtdev; +} + +/** + * mpi3mr_set_io_divert_for_all_vd_in_tg -set divert for TG VDs + * @mrioc: Adapter instance reference + * @tg: Throttle group information pointer + * @divert_value: 1 or 0 + * + * Accessor to set io_divert flag for each device associated + * with the given throttle group with the given value. + * + * Return: None. + */ +static void mpi3mr_set_io_divert_for_all_vd_in_tg(struct mpi3mr_ioc *mrioc, + struct mpi3mr_throttle_group_info *tg, u8 divert_value) +{ + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *tgt_priv; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + if (tgt_priv->throttle_group == tg) + tgt_priv->io_divert = divert_value; + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * mpi3mr_print_discard_event_notice - print discarded evt info + * + * @mrioc: Adapter instance reference + * @device_add: true for device add event and false for device removal event + * + * Print notice related to post processing of discarded device + * event after controller reset. + * + * Return: None. + */ +inline void mpi3mr_print_discard_event_notice(struct mpi3mr_ioc *mrioc, + bool device_add) +{ + ioc_notice(mrioc, + "Device %s was under process before the reset and completed after reset\n", + (device_add ? "addition" : "removal")); + ioc_notice(mrioc, + "Verify whether the exposed devices are matched with attached devices for correctness\n"); +} + +/** + * mpi3mr_remove_tgtdev_from_host - Remove dev from upper layers + * @mrioc: Adapter instance reference + * @tgtdev: Target device structure + * + * Checks whether the device is exposed to upper layers and if + * it is then remove the device from upper layers by calling + * scsi_remove_target(). + * + * Return: 0 on success, non zero on failure. 
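+ *
+ * Note: while scsi_remove_target() is in progress the current
+ * firmware event is marked pending_at_sml, so a controller reset
+ * running in parallel flags the event as discarded instead of
+ * cancelling the work (see mpi3mr_cleanup_fwevt_list()); the late
+ * completion is then only reported through
+ * mpi3mr_print_discard_event_notice().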
+ */ +void mpi3mr_remove_tgtdev_from_host(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + struct mpi3mr_stgt_priv_data *tgt_priv; + + + ioc_info(mrioc, "removing handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, tgtdev->perst_id); + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + atomic_set(&tgt_priv->block_io, 0); + tgt_priv->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + } + if (!mrioc->sas_transport_enabled || (tgtdev->dev_type != + MPI3_DEVICE_DEVFORM_SAS_SATA) || tgtdev->non_stl) { + if (tgtdev->starget) { + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + scsi_remove_target(&tgtdev->starget->dev); + tgtdev->host_exposed = 0; + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) { + mpi3mr_print_discard_event_notice(mrioc, + false); + return; + } + } + } + } else if (tgtdev->starget) + mpi3mr_remove_tgtdev_from_sas_transport(mrioc, tgtdev); + mpi3mr_master_trigger(mrioc, + MPI3_DRIVER2_MASTERTRIGGER_DEVICE_REMOVAL_ENABLED); + + ioc_info(mrioc, "removed handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, tgtdev->perst_id); +} + + +/** + * mpi3mr_report_tgtdev_to_host - Expose device to upper layers + * @mrioc: Adapter instance reference + * @perst_id: Persistent ID of the device + * + * Checks whether the device can be exposed to upper layers and + * if it is not then expose the device to upper layers by + * calling scsi_scan_target(). + * + * Return: 0 on success, non zero on failure. + */ +static int mpi3mr_report_tgtdev_to_host(struct mpi3mr_ioc *mrioc, + u16 perst_id) +{ + int retval = 0; + struct mpi3mr_tgt_dev *tgtdev; + + if (mrioc->reset_in_progress) + return -1; + tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id); + if (!tgtdev) { + retval = -1; + goto out; + } + if (tgtdev->is_hidden || tgtdev->host_exposed) { + retval = -1; + goto out; + } + if (!mrioc->sas_transport_enabled || (tgtdev->dev_type != + MPI3_DEVICE_DEVFORM_SAS_SATA) || tgtdev->non_stl){ + tgtdev->host_exposed = 1; + ioc_info(mrioc, + "exposing target device with handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, perst_id); + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + scsi_scan_target(&mrioc->shost->shost_gendev, + mrioc->scsi_device_channel, tgtdev->perst_id, + SCAN_WILD_CARD, SCSI_SCAN_INITIAL); + if (!tgtdev->starget) { + ioc_err(mrioc, + "exposing target device with handle(0x%04x), perst_id(%d) failed\n", + tgtdev->dev_handle, perst_id); + tgtdev->host_exposed = 0; + } + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) { + mpi3mr_print_discard_event_notice(mrioc, true); + goto out; + } + } + dprint_event_bh(mrioc, + "exposed target device with handle(0x%04x), perst_id(%d)\n", + tgtdev->dev_handle, perst_id); + goto out; + } else + mpi3mr_report_tgtdev_to_sas_transport(mrioc, tgtdev); + + + +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + + return retval; +} + +/** + * mpi3mr_update_sdev - Update SCSI device information + * @sdev: SCSI device reference + * @data: target device reference + * + * This is an iterator function called for each SCSI device in a + * target to update the target specific information into each + * SCSI device. + * + * Return: Nothing. 
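+ *
+ * Note: for NVMe type PCIe devices the request queue limits are
+ * derived from the cached device page0 data, e.g. the maximum
+ * transfer size is MDTS (in bytes) divided by the 512 byte block
+ * layer sector size and the virt boundary comes from the reported
+ * page size (MPI3MR_DEFAULT_PGSZEXP when the page size is zero).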
+ */ +static void +mpi3mr_update_sdev(struct scsi_device *sdev, void *data) +{ + struct mpi3mr_tgt_dev *tgtdev; + + tgtdev = (struct mpi3mr_tgt_dev *)data; + if (!tgtdev) + return; + + mpi3mr_change_queue_depth(sdev, tgtdev->q_depth); + switch (tgtdev->dev_type) { + case MPI3_DEVICE_DEVFORM_PCIE: + /*The block layer hw sector size = 512*/ + if ((tgtdev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) == + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) { + blk_queue_max_hw_sectors(sdev->request_queue, + tgtdev->dev_spec.pcie_inf.mdts / 512); + if (tgtdev->dev_spec.pcie_inf.pgsz == 0) + blk_queue_virt_boundary(sdev->request_queue, + ((1 << MPI3MR_DEFAULT_PGSZEXP) - 1)); + else + blk_queue_virt_boundary(sdev->request_queue, + ((1 << tgtdev->dev_spec.pcie_inf.pgsz) - 1)); + } + + break; + default: + break; + } +} + +/** + * mpi3mr_refresh_tgtdevs - Refresh target device exposure + * @mrioc: Adapter instance reference + * + * This is executed post controller reset to identify any + * missing devices during reset and remove from the upper layers + * or expose any newly detected device to the upper layers. + * + * Return: Nothing. + */ + +void mpi3mr_refresh_tgtdevs(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_tgt_dev *tgtdev, *tgtdev_next; + + dprint_reset(mrioc, "refresh target devices: check for removals\n"); + list_for_each_entry_safe(tgtdev, tgtdev_next, &mrioc->tgtdev_list, + list) { + if (tgtdev->dev_handle == MPI3MR_INVALID_DEV_HANDLE) { + dprint_reset(mrioc, "removing target device with perst_id(%d)\n", + tgtdev->perst_id); + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } + } + + dprint_reset(mrioc, "refresh target devices: check for additions\n"); + tgtdev = NULL; + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) { + if ((tgtdev->dev_handle != MPI3MR_INVALID_DEV_HANDLE) && + !tgtdev->is_hidden && !tgtdev->host_exposed) + mpi3mr_report_tgtdev_to_host(mrioc, tgtdev->perst_id); + } + dprint_reset(mrioc, "refresh target devices: done\n"); +} + + +/** + * mpi3mr_debug_dump_devpg0 - Dump device page0 + * @mrioc: Adapter instance reference + * @dev_pg0: Device page 0. + * + * Prints pertinent details of the device page 0. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_debug_dump_devpg0(struct mpi3mr_ioc *mrioc, struct mpi3_device_page0 *dev_pg0) +{ + + if (!(mrioc->logging_level & + (MPI3_DEBUG_EVENT | MPI3_DEBUG_EVENT_WORK_TASK))) + return; + + ioc_info(mrioc, + "device_pg0: handle(0x%04x), perst_id(%d), wwid(0x%016llx), encl_handle(0x%04x), slot(%d)\n", + le16_to_cpu(dev_pg0->dev_handle), + le16_to_cpu(dev_pg0->persistent_id), + le64_to_cpu(dev_pg0->wwid), le16_to_cpu(dev_pg0->enclosure_handle), + le16_to_cpu(dev_pg0->slot)); + ioc_info(mrioc, "device_pg0: access_status(0x%02x), flags(0x%04x), device_form(0x%02x), queue_depth(%d)\n", + dev_pg0->access_status, le16_to_cpu(dev_pg0->flags), + dev_pg0->device_form, le16_to_cpu(dev_pg0->queue_depth)); + ioc_info(mrioc, "device_pg0: parent_handle(0x%04x), iounit_port(%d)\n", + le16_to_cpu(dev_pg0->parent_dev_handle), dev_pg0->io_unit_port); + + switch (dev_pg0->device_form) { + case MPI3_DEVICE_DEVFORM_SAS_SATA: + { + struct mpi3_device0_sas_sata_format *sasinf = + &dev_pg0->device_specific.sas_sata_format; + ioc_info(mrioc, + "device_pg0: sas_sata: sas_address(0x%016llx),flags(0x%04x), device_info(0x%04x), phy_num(%d), attached_phy_id(%d)\n", + le64_to_cpu(sasinf->sas_address), + le16_to_cpu(sasinf->flags), + le16_to_cpu(sasinf->device_info), sasinf->phy_num, + sasinf->attached_phy_identifier); + break; + } + case MPI3_DEVICE_DEVFORM_PCIE: + { + struct mpi3_device0_pcie_format *pcieinf = + &dev_pg0->device_specific.pcie_format; + ioc_info(mrioc, + "device_pg0: pcie: port_num(%d), device_info(0x%04x), mdts(%d), page_sz(0x%02x)\n", + pcieinf->port_num, le16_to_cpu(pcieinf->device_info), + le32_to_cpu(pcieinf->maximum_data_transfer_size), + pcieinf->page_size); + ioc_info(mrioc, + "device_pg0: pcie: abort_timeout(%d), reset_timeout(%d)\n", + pcieinf->nvme_abort_to, pcieinf->controller_reset_to); + break; + } + case MPI3_DEVICE_DEVFORM_VD: + { + struct mpi3_device0_vd_format *vdinf = + &dev_pg0->device_specific.vd_format; + + ioc_info(mrioc, + "device_pg0: vd: state(0x%02x), raid_level(%d), flags(0x%04x), device_info(0x%04x)\n", + vdinf->vd_state, vdinf->raid_level, + le16_to_cpu(vdinf->flags), + le16_to_cpu(vdinf->device_info)); + ioc_info(mrioc, + "device_pg0: vd: tg_id(%d), high(%dMiB), low(%dMiB), qd_reduction_factor(%d)\n", + vdinf->io_throttle_group, + le16_to_cpu(vdinf->io_throttle_group_high), + le16_to_cpu(vdinf->io_throttle_group_low), + ((le16_to_cpu(vdinf->flags) & + MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK) >> 12)); + + } + default: + break; + } +} +/** + * mpi3mr_update_tgtdev - DevStatusChange evt bottomhalf + * @mrioc: Adapter instance reference + * @tgtdev: Target device internal structure + * @dev_pg0: New device page0 + * @is_added: Flag to indicate the device is just added + * + * Update the information from the device page0 into the driver + * cached target device structure. + * + * Return: Nothing. 
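+ *
+ * Note: for VD devices this also refreshes the corresponding
+ * throttle group entry (high/low watermarks, QD reduction factor)
+ * and links it into the target's private data so that I/O
+ * throttling and the QD reduction event handler act on the same
+ * group.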
+ */ +static void mpi3mr_update_tgtdev(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev, struct mpi3_device_page0 *dev_pg0, + bool is_added) +{ + u16 flags = 0; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + struct mpi3mr_enclosure_node *enclosure_dev = NULL; + + mpi3mr_debug_dump_devpg0(mrioc, dev_pg0); + + tgtdev->perst_id = le16_to_cpu(dev_pg0->persistent_id); + tgtdev->dev_handle = le16_to_cpu(dev_pg0->dev_handle); + tgtdev->dev_type = dev_pg0->device_form; + tgtdev->io_unit_port = dev_pg0->io_unit_port; + tgtdev->encl_handle = le16_to_cpu(dev_pg0->enclosure_handle); + tgtdev->parent_handle = le16_to_cpu(dev_pg0->parent_dev_handle); + tgtdev->slot = le16_to_cpu(dev_pg0->slot); + tgtdev->q_depth = le16_to_cpu(dev_pg0->queue_depth); + tgtdev->wwid = le64_to_cpu(dev_pg0->wwid); + tgtdev->devpg0_flag = le16_to_cpu(dev_pg0->flags); + + if (tgtdev->encl_handle) + enclosure_dev = mpi3mr_enclosure_find_by_handle(mrioc, + tgtdev->encl_handle); + if (enclosure_dev) + tgtdev->enclosure_logical_id = le64_to_cpu( + enclosure_dev->pg0.enclosure_logical_id); + + flags = tgtdev->devpg0_flag; + + tgtdev->is_hidden = (flags & MPI3_DEVICE0_FLAGS_HIDDEN); + + if (is_added == true) + tgtdev->io_throttle_enabled = + (flags & MPI3_DEVICE0_FLAGS_IO_THROTTLING_REQUIRED) ? 1 : 0; + + + if (tgtdev->starget && tgtdev->starget->hostdata) { + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + scsi_tgt_priv_data->perst_id = tgtdev->perst_id; + scsi_tgt_priv_data->dev_handle = tgtdev->dev_handle; + scsi_tgt_priv_data->dev_type = tgtdev->dev_type; + scsi_tgt_priv_data->io_throttle_enabled = + tgtdev->io_throttle_enabled; + if (is_added == true) + atomic_set(&scsi_tgt_priv_data->block_io, 0); + } + + switch (dev_pg0->access_status) { + case MPI3_DEVICE0_ASTATUS_NO_ERRORS: + case MPI3_DEVICE0_ASTATUS_PREPARE: + case MPI3_DEVICE0_ASTATUS_NEEDS_INITIALIZATION: + case MPI3_DEVICE0_ASTATUS_DEVICE_MISSING_DELAY: + break; + default: + tgtdev->is_hidden = 1; + break; + } + + switch (tgtdev->dev_type) { + case MPI3_DEVICE_DEVFORM_SAS_SATA: + { + struct mpi3_device0_sas_sata_format *sasinf = + &dev_pg0->device_specific.sas_sata_format; + u16 dev_info = le16_to_cpu(sasinf->device_info); + + tgtdev->dev_spec.sas_sata_inf.dev_info = dev_info; + tgtdev->dev_spec.sas_sata_inf.sas_address = + le64_to_cpu(sasinf->sas_address); + tgtdev->dev_spec.sas_sata_inf.phy_id = sasinf->phy_num; + tgtdev->dev_spec.sas_sata_inf.attached_phy_id = + sasinf->attached_phy_identifier; + if ((dev_info & MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK) + != MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE) + tgtdev->is_hidden = 1; + else if (!(dev_info & + (MPI3_SAS_DEVICE_INFO_STP_SATA_TARGET | + MPI3_SAS_DEVICE_INFO_SSP_TARGET))) + tgtdev->is_hidden = 1; + + if (((tgtdev->devpg0_flag & + MPI3_DEVICE0_FLAGS_ATT_METHOD_DIR_ATTACHED) + && (tgtdev->devpg0_flag & + MPI3_DEVICE0_FLAGS_ATT_METHOD_VIRTUAL)) || + (tgtdev->parent_handle == 0xFFFF)) + tgtdev->non_stl = 1; + if (tgtdev->dev_spec.sas_sata_inf.hba_port) + tgtdev->dev_spec.sas_sata_inf.hba_port->port_id = dev_pg0->io_unit_port; + break; + } + case MPI3_DEVICE_DEVFORM_PCIE: + { + struct mpi3_device0_pcie_format *pcieinf = + &dev_pg0->device_specific.pcie_format; + u16 dev_info = le16_to_cpu(pcieinf->device_info); + + tgtdev->dev_spec.pcie_inf.dev_info = dev_info; + tgtdev->dev_spec.pcie_inf.capb = + le32_to_cpu(pcieinf->capabilities); + tgtdev->dev_spec.pcie_inf.mdts = MPI3MR_DEFAULT_MDTS; + if (dev_pg0->access_status == MPI3_DEVICE0_ASTATUS_NO_ERRORS) { + 
tgtdev->dev_spec.pcie_inf.mdts = + le32_to_cpu(pcieinf->maximum_data_transfer_size); + tgtdev->dev_spec.pcie_inf.pgsz = pcieinf->page_size; + tgtdev->dev_spec.pcie_inf.reset_to = + max_t(u8, pcieinf->controller_reset_to, + MPI3MR_INTADMCMD_TIMEOUT); + tgtdev->dev_spec.pcie_inf.abort_to = + max_t(u8, pcieinf->nvme_abort_to, + MPI3MR_INTADMCMD_TIMEOUT); + } + if (tgtdev->dev_spec.pcie_inf.mdts > (1024 * 1024)) + tgtdev->dev_spec.pcie_inf.mdts = (1024 * 1024); + if (((dev_info & MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) != + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) && + ((dev_info & MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) != + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_SCSI_DEVICE)) + tgtdev->is_hidden = 1; + tgtdev->non_stl = 1; + break; + } + case MPI3_DEVICE_DEVFORM_VD: + { + struct mpi3_device0_vd_format *vdinf = + &dev_pg0->device_specific.vd_format; + struct mpi3mr_throttle_group_info *tg = NULL; + + tgtdev->dev_spec.vd_inf.state = vdinf->vd_state; + if (vdinf->vd_state == MPI3_DEVICE0_VD_STATE_OFFLINE) + tgtdev->is_hidden = 1; + tgtdev->non_stl = 1; + tgtdev->dev_spec.vd_inf.tg_id = vdinf->io_throttle_group; + tgtdev->dev_spec.vd_inf.tg_high = + le16_to_cpu(vdinf->io_throttle_group_high) * 2048; + tgtdev->dev_spec.vd_inf.tg_low = + le16_to_cpu(vdinf->io_throttle_group_low) * 2048; + tgtdev->dev_spec.vd_inf.tg_qd_reduction = + ((le16_to_cpu(vdinf->flags) & + MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK) >> 12); + if (vdinf->io_throttle_group < mrioc->num_io_throttle_group) { + tg = mrioc->throttle_groups + vdinf->io_throttle_group; + tg->id = vdinf->io_throttle_group; + tg->high = tgtdev->dev_spec.vd_inf.tg_high; + tg->low = tgtdev->dev_spec.vd_inf.tg_low; + tg->qd_reduction = + tgtdev->dev_spec.vd_inf.tg_qd_reduction; + if (is_added == true) + tg->fw_qd = tgtdev->q_depth; + tg->modified_qd = tgtdev->q_depth; + } + tgtdev->dev_spec.vd_inf.tg = tg; + if (scsi_tgt_priv_data) + scsi_tgt_priv_data->throttle_group = tg; + break; + } + default: + break; + } + +} + +/** + * mpi3mr_devstatuschg_evt_bh - DevStatusChange evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event + * + * Process Device status Change event and based on device's new + * information, either expose the device to the upper layers, or + * remove the device from upper layers. + * + * Return: Nothing. 
+ */ +static void mpi3mr_devstatuschg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + u16 dev_handle = 0; + u8 uhide = 0, delete = 0, cleanup = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3_event_data_device_status_change *evtdata = + (struct mpi3_event_data_device_status_change *)fwevt->event_data; + + + dev_handle = le16_to_cpu(evtdata->dev_handle); + dprint_event_bh(mrioc, + "processing device status change event bottom half for handle(0x%04x), rc(0x%02x)\n", + dev_handle, evtdata->reason_code); + switch (evtdata->reason_code) { + case MPI3_EVENT_DEV_STAT_RC_HIDDEN: + delete = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_NOT_HIDDEN: + uhide = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_VD_NOT_RESPONDING: + delete = 1; + cleanup = 1; + break; + default: + break; + } + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_event_bh(mrioc, + "processing device status change event bottom half, cannot identify target device for handle(0x%04x), rc(0x%02x)\n", + dev_handle, evtdata->reason_code); + goto out; + } + if (uhide) { + tgtdev->is_hidden = 0; + if (!tgtdev->host_exposed) + mpi3mr_report_tgtdev_to_host(mrioc, tgtdev->perst_id); + } + if (tgtdev->starget && tgtdev->starget->hostdata) { + if (delete) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + } + if (cleanup) { + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } + +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + +} + +/** + * mpi3mr_devinfochg_evt_bh - DeviceInfoChange evt bottomhalf + * @mrioc: Adapter instance reference + * @dev_pg0: New device page0 + * + * Process Device Info Change event and based on device's new + * information, either expose the device to the upper layers, or + * remove the device from upper layers or update the details of + * the device. + * + * Return: Nothing. 
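+ *
+ * Note: the refreshed page0 data may toggle is_hidden, in which case
+ * the device is exposed to or removed from the upper layers, and for
+ * devices that stay exposed the per-sdev queue depth and limits are
+ * re-applied through mpi3mr_update_sdev().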
+ */ +static void mpi3mr_devinfochg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3_device_page0 *dev_pg0) +{ + struct mpi3mr_tgt_dev *tgtdev = NULL; + u16 dev_handle = 0, perst_id = 0; + + perst_id = le16_to_cpu(dev_pg0->persistent_id); + dev_handle = le16_to_cpu(dev_pg0->dev_handle); + + dprint_event_bh(mrioc, + "processing device info change event bottom half for handle(0x%04x), perst_id(%d)\n", + dev_handle, perst_id); + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_event_bh(mrioc, + "cannot identify target device for device info change event handle(0x%04x), perst_id(%d)\n", + dev_handle, perst_id); + goto out; + } + mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, false); + if (!tgtdev->is_hidden && !tgtdev->host_exposed) + mpi3mr_report_tgtdev_to_host(mrioc, perst_id); + if (tgtdev->is_hidden && tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + if (!tgtdev->is_hidden && tgtdev->host_exposed && tgtdev->starget) + starget_for_each_device(tgtdev->starget, (void *) tgtdev, + mpi3mr_update_sdev); +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + +} + +/** + * mpi3mr_process_trigger_data_event_bh - Process trigger event + * data + * @mrioc: Adapter instance reference + * @event_data: Event data + * @trigger_type: Trigger type + * + * This function releases diage buffers or issues diag fault + * based on trigger conditions + * + * Return: Nothing + */ +static void mpi3mr_process_trigger_data_event_bh(struct mpi3mr_ioc *mrioc, + struct trigger_event_data *event_data) +{ + struct diag_buffer_desc *trace_hdb = event_data->trace_hdb; + struct diag_buffer_desc *fw_hdb = event_data->fw_hdb; + unsigned long flags; + u8 trigger_type = event_data->trigger_type; + u64 trigger_data = event_data->trigger_specific_data; + + if (event_data->snapdump) { + if (trace_hdb) + mpi3mr_set_trigger_data_in_hdb(trace_hdb, trigger_type, + trigger_data, 1); + if (fw_hdb) + mpi3mr_set_trigger_data_in_hdb(fw_hdb, trigger_type, + trigger_data, 1); + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_TRIGGER, 1); + return; + } + + if (trace_hdb) { + mpi3mr_set_trigger_data_in_hdb(trace_hdb, trigger_type, + trigger_data, 1); + mpi3mr_issue_diag_buf_release(mrioc, trace_hdb); + spin_lock_irqsave(&mrioc->trigger_lock, flags); + mrioc->trace_release_trigger_active = false; + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } + if (fw_hdb) { + mpi3mr_set_trigger_data_in_hdb(fw_hdb, trigger_type, + trigger_data, 1); + mpi3mr_issue_diag_buf_release(mrioc, fw_hdb); + spin_lock_irqsave(&mrioc->trigger_lock, flags); + mrioc->fw_release_trigger_active = false; + spin_unlock_irqrestore(&mrioc->trigger_lock, flags); + } +} + +/** + * mpi3mr_encldev_add_chg_evt_debug - debug for enclosure event + * @mrioc: Adapter instance reference + * @encl_pg0: Enclosure page 0. + * @is_added: Added event or not + * + * Return nothing. 
+ */ +static void mpi3mr_encldev_add_chg_evt_debug(struct mpi3mr_ioc *mrioc, + struct mpi3_enclosure_page0 *encl_pg0, u8 is_added) +{ + char *reason_str = NULL; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK)) + return; + + if (is_added) + reason_str = "enclosure added"; + else + reason_str = "enclosure dev status changed"; + + ioc_info(mrioc, "%s: handle(0x%04x), enclosure logical id(0x%016llx)" + " number of slots(%d), port(%d), flags(0x%04x), present(%d)\n", + reason_str, le16_to_cpu(encl_pg0->enclosure_handle), + (unsigned long long)le64_to_cpu(encl_pg0->enclosure_logical_id), + le16_to_cpu(encl_pg0->num_slots), encl_pg0->io_unit_port, + le16_to_cpu(encl_pg0->flags), + ((le16_to_cpu(encl_pg0->flags) & + MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT_MASK) >> 4)); +} + +/** + * mpi3mr_encldev_add_chg_evt_bh - Enclosure evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Prints information about the Enclosure device status or + * Enclosure add events if logging is enabled and add or remove + * the enclosure from the controller's internal list of + * enclosures. + * + * Return: Nothing. + */ +static void mpi3mr_encldev_add_chg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3mr_enclosure_node *enclosure_dev = NULL; + struct mpi3_enclosure_page0 *encl_pg0; + u16 encl_handle; + u8 added, present; + + encl_pg0= (struct mpi3_enclosure_page0 *) fwevt->event_data; + added = (fwevt->event_id == MPI3_EVENT_ENCL_DEVICE_ADDED) ? 1 : 0; + mpi3mr_encldev_add_chg_evt_debug(mrioc, encl_pg0, added); + + + encl_handle= le16_to_cpu(encl_pg0->enclosure_handle); + present = ((le16_to_cpu(encl_pg0->flags) & + MPI3_ENCLS0_FLAGS_ENCL_DEV_PRESENT_MASK) >> 4); + + if (encl_handle) + enclosure_dev = mpi3mr_enclosure_find_by_handle(mrioc, + encl_handle); + if (!enclosure_dev && present) { + enclosure_dev = + kzalloc(sizeof(struct mpi3mr_enclosure_node), + GFP_KERNEL); + if (!enclosure_dev) + return; + list_add_tail(&enclosure_dev->list, + &mrioc->enclosure_list); + } + if (enclosure_dev) { + if (!present) { + list_del(&enclosure_dev->list); + kfree(enclosure_dev); + } else + memcpy(&enclosure_dev->pg0, encl_pg0, + sizeof(enclosure_dev->pg0)); + + } +} + +/** + * mpi3mr_sastopochg_evt_debug - SASTopoChange details + * @mrioc: Adapter instance reference + * @event_data: SAS topology change list event data + * + * Prints information about the SAS topology change event. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_sastopochg_evt_debug(struct mpi3mr_ioc *mrioc, + struct mpi3_event_data_sas_topology_change_list *event_data) +{ + int i; + u16 handle; + u8 reason_code, phy_number; + char *status_str = NULL; + u8 link_rate, prev_link_rate; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK)) + return; + + switch (event_data->exp_status) { + case MPI3_EVENT_SAS_TOPO_ES_NOT_RESPONDING: + status_str = "remove"; + break; + case MPI3_EVENT_SAS_TOPO_ES_RESPONDING: + status_str = "responding"; + break; + case MPI3_EVENT_SAS_TOPO_ES_DELAY_NOT_RESPONDING: + status_str = "remove delay"; + break; + case MPI3_EVENT_SAS_TOPO_ES_NO_EXPANDER: + status_str = "direct attached"; + break; + default: + status_str = "unknown status"; + break; + } + ioc_info(mrioc, "%s :sas topology change: (%s)\n", + __func__, status_str); + ioc_info(mrioc, + "%s :\texpander_handle(0x%04x), port(%d), enclosure_handle(0x%04x) start_phy(%02d), num_entries(%d)\n", + __func__, le16_to_cpu(event_data->expander_dev_handle), + event_data->io_unit_port, + le16_to_cpu(event_data->enclosure_handle), + event_data->start_phy_num, event_data->num_entries); + for (i = 0; i < event_data->num_entries; i++) { + handle = + le16_to_cpu(event_data->phy_entry[i].attached_dev_handle); + if (!handle) + continue; + phy_number = event_data->start_phy_num + i; + reason_code = event_data->phy_entry[i].status & + MPI3_EVENT_SAS_TOPO_PHY_RC_MASK; + switch (reason_code) { + case MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING: + status_str = "target remove"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_DELAY_NOT_RESPONDING: + status_str = "delay target remove"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED: + status_str = "link status change"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_NO_CHANGE: + status_str = "link status no change"; + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING: + status_str = "target responding"; + break; + default: + status_str = "unknown"; + break; + } + link_rate = event_data->phy_entry[i].link_rate >> 4; + prev_link_rate = event_data->phy_entry[i].link_rate & 0xF; + ioc_info(mrioc, + "%s :\tphy(%02d), attached_handle(0x%04x): %s: link rate: new(0x%02x), old(0x%02x)\n", + __func__, phy_number, handle, status_str, link_rate, + prev_link_rate); + } +} + +/** + * mpi3mr_sastopochg_evt_bh - SASTopologyChange evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Prints information about the SAS topology change event and + * for "not responding" event code, removes the device from the + * upper layers. + * + * Return: Nothing. 
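+ *
+ * Note: in addition, for responding entries whose negotiated link
+ * rate changed, the SAS transport layer links are refreshed via
+ * mpi3mr_update_links(), and an expander reported as not responding
+ * is torn down with mpi3mr_expander_remove().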
+ */ +static void mpi3mr_sastopochg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3_event_data_sas_topology_change_list *event_data = + (struct mpi3_event_data_sas_topology_change_list *)fwevt->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + u64 exp_sas_address = 0, parent_sas_address = 0; + struct mpi3mr_hba_port *hba_port = NULL; + struct mpi3mr_sas_node *sas_expander = NULL; + unsigned long flags; + u8 link_rate, prev_link_rate, parent_phy_number; + + mpi3mr_sastopochg_evt_debug(mrioc, event_data); + if (mrioc->sas_transport_enabled) { + hba_port = mpi3mr_get_hba_port_by_id(mrioc, + event_data->io_unit_port, 0); + if (le16_to_cpu(event_data->expander_dev_handle)) { + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + sas_expander = __mpi3mr_expander_find_by_handle(mrioc, + le16_to_cpu(event_data->expander_dev_handle)); + if (sas_expander) { + exp_sas_address = sas_expander->sas_address; + hba_port = sas_expander->hba_port; + } + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + parent_sas_address = exp_sas_address; + } else + parent_sas_address = mrioc->sas_hba.sas_address; + } + + for (i = 0; i < event_data->num_entries; i++) { + if (fwevt->discard) + return; + handle = + le16_to_cpu(event_data->phy_entry[i].attached_dev_handle); + if (!handle) + continue; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (!tgtdev) + continue; + + reason_code = event_data->phy_entry[i].status & + MPI3_EVENT_SAS_TOPO_PHY_RC_MASK; + + switch (reason_code) { + case MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING: + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING: + case MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED: + case MPI3_EVENT_SAS_TOPO_PHY_RC_NO_CHANGE: + { + if (!mrioc->sas_transport_enabled || tgtdev->non_stl + || tgtdev->is_hidden) + break; + link_rate = event_data->phy_entry[i].link_rate >> 4; + prev_link_rate = event_data->phy_entry[i].link_rate + & 0xF; + if (link_rate == prev_link_rate) + break; + if (!parent_sas_address) + break; + parent_phy_number = event_data->start_phy_num + i; + mpi3mr_update_links(mrioc, parent_sas_address, handle, + parent_phy_number, link_rate, hba_port); + } + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } + if (mrioc->sas_transport_enabled && (event_data->exp_status == + MPI3_EVENT_SAS_TOPO_ES_NOT_RESPONDING)) { + if (sas_expander) + mpi3mr_expander_remove(mrioc, exp_sas_address, + hba_port); + } + +} + +/** + * mpi3mr_pcietopochg_evt_debug - PCIeTopoChange details + * @mrioc: Adapter instance reference + * @event_data: PCIe topology change list event data + * + * Prints information about the PCIe topology change event. + * + * Return: Nothing. 
+ */ +static void +mpi3mr_pcietopochg_evt_debug(struct mpi3mr_ioc *mrioc, + struct mpi3_event_data_pcie_topology_change_list *event_data) +{ + int i; + u16 handle; + u16 reason_code; + u8 port_number; + char *status_str = NULL; + u8 link_rate, prev_link_rate; + + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT_WORK_TASK)) + return; + + switch (event_data->switch_status) { + case MPI3_EVENT_PCIE_TOPO_SS_NOT_RESPONDING: + status_str = "remove"; + break; + case MPI3_EVENT_PCIE_TOPO_SS_RESPONDING: + status_str = "responding"; + break; + case MPI3_EVENT_PCIE_TOPO_SS_DELAY_NOT_RESPONDING: + status_str = "remove delay"; + break; + case MPI3_EVENT_PCIE_TOPO_SS_NO_PCIE_SWITCH: + status_str = "direct attached"; + break; + default: + status_str = "unknown status"; + break; + } + ioc_info(mrioc, "%s :pcie topology change: (%s)\n", + __func__, status_str); + ioc_info(mrioc, + "%s :\tswitch_handle(0x%04x), enclosure_handle(0x%04x) start_port(%02d), num_entries(%d)\n", + __func__, le16_to_cpu(event_data->switch_dev_handle), + le16_to_cpu(event_data->enclosure_handle), + event_data->start_port_num, event_data->num_entries); + for (i = 0; i < event_data->num_entries; i++) { + handle = + le16_to_cpu(event_data->port_entry[i].attached_dev_handle); + if (!handle) + continue; + port_number = event_data->start_port_num + i; + reason_code = event_data->port_entry[i].port_status; + switch (reason_code) { + case MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING: + status_str = "target remove"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_DELAY_NOT_RESPONDING: + status_str = "delay target remove"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_PORT_CHANGED: + status_str = "link status change"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_NO_CHANGE: + status_str = "link status no change"; + break; + case MPI3_EVENT_PCIE_TOPO_PS_RESPONDING: + status_str = "target responding"; + break; + default: + status_str = "unknown"; + break; + } + link_rate = event_data->port_entry[i].current_port_info & + MPI3_EVENT_PCIE_TOPO_PI_RATE_MASK; + prev_link_rate = event_data->port_entry[i].previous_port_info & + MPI3_EVENT_PCIE_TOPO_PI_RATE_MASK; + ioc_info(mrioc, + "%s :\tport(%02d), attached_handle(0x%04x): %s: link rate: new(0x%02x), old(0x%02x)\n", + __func__, port_number, handle, status_str, link_rate, + prev_link_rate); + } +} + +/** + * mpi3mr_pcietopochg_evt_bh - PCIeTopologyChange evt bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Prints information about the PCIe topology change event and + * for "not responding" event code, removes the device from the + * upper layers. + * + * Return: Nothing. 
+ */ +static void mpi3mr_pcietopochg_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3_event_data_pcie_topology_change_list *event_data = + (struct mpi3_event_data_pcie_topology_change_list *) + fwevt->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + + mpi3mr_pcietopochg_evt_debug(mrioc, event_data); + + for (i = 0; i < event_data->num_entries; i++) { + if (fwevt->discard) + return; + handle = + le16_to_cpu(event_data->port_entry[i].attached_dev_handle); + if (!handle) + continue; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (!tgtdev) + continue; + + reason_code = event_data->port_entry[i].port_status; + + switch (reason_code) { + case MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING: + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + break; + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } +} + +/** + * mpi3mr_logdata_evt_bh - Log data event bottomhalf + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Extracts the event data and calls application interfacing + * function to process the event further. + * + * Return: Nothing. + */ +static void mpi3mr_logdata_evt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + mpi3mr_app_save_logdata(mrioc, fwevt->event_data, + fwevt->event_data_size); +} + +/** + * mpi3mr_update_sdev_qd - Update SCSI device queue depath + * @sdev: SCSI device reference + * @data: Queue depth reference + * + * This is an iterator function called for each SCSI device in a + * target to update the QD of each SCSI device. + * + * Return: Nothing. + */ +static void mpi3mr_update_sdev_qd(struct scsi_device *sdev, void *data) +{ + u16 *q_depth = (u16 *)data; + scsi_change_queue_depth(sdev, (int)*q_depth); + sdev->max_queue_depth = sdev->queue_depth; +} +/** + * mpi3mr_set_qd_for_all_vd_in_tg -set QD for TG VDs + * @mrioc: Adapter instance reference + * @tg: Throttle group information pointer + * + * Accessor to reduce QD for each device associated with the + * given throttle group. + * + * Return: None. + */ +static void mpi3mr_set_qd_for_all_vd_in_tg(struct mpi3mr_ioc *mrioc, + struct mpi3mr_throttle_group_info *tg) +{ + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev; + struct mpi3mr_stgt_priv_data *tgt_priv; + + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if (tgtdev->starget && tgtdev->starget->hostdata) { + tgt_priv = tgtdev->starget->hostdata; + if (tgt_priv->throttle_group == tg) { + dprint_event_bh(mrioc, + "updating qd due to throttling for persist_id(%d) original_qd(%d), reduced_qd (%d)\n", + tgt_priv->perst_id, tgtdev->q_depth, + tg->modified_qd); + starget_for_each_device(tgtdev->starget, + (void *)&tg->modified_qd, + mpi3mr_update_sdev_qd); + } + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); +} + +/** + * mpi3mr_fwevt_bh - Firmware event bottomhalf handler + * @mrioc: Adapter instance reference + * @fwevt: Firmware event reference + * + * Identifies the firmware event and calls corresponding bottom + * half handler and sends event acknowledgment if required. + * + * Return: Nothing. 
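+ *
+ * Note: each queued event carries two references, one from kref_init
+ * and one taken when it was added to the worker queue; this handler
+ * drops the kref_init reference on exit and the worker drops the
+ * queue reference once the handler returns.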
+ */ +static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc, + struct mpi3mr_fwevt *fwevt) +{ + struct mpi3_device_page0 *dev_pg0 = NULL; + u16 perst_id, handle, dev_info; + struct mpi3_device0_sas_sata_format *sasinf = NULL; + struct mpi3mr_throttle_group_info *tg; + + mpi3mr_fwevt_del_from_list(mrioc, fwevt); + mrioc->current_event = fwevt; + + if (mrioc->stop_drv_processing) { + dprint_event_bh(mrioc, "ignoring event(0x%02x) in the bottom half handler due to stop_drv_processing\n", + fwevt->event_id); + goto out; + } + if (mrioc->unrecoverable) { + dprint_event_bh(mrioc, "ignoring event(0x%02x) in the bottom half handler due to unrecoverable controller\n", + fwevt->event_id); + goto out; + } + + if (!fwevt->process_event) + goto evt_ack; + + dprint_event_bh(mrioc, "processing event(0x%02x) in the bottom half handler\n", + fwevt->event_id); + switch (fwevt->event_id) { + case MPI3_EVENT_DEVICE_ADDED: + { + dev_pg0 = (struct mpi3_device_page0 *)fwevt->event_data; + perst_id = le16_to_cpu(dev_pg0->persistent_id); + handle = le16_to_cpu(dev_pg0->dev_handle); + if (perst_id != MPI3_DEVICE0_PERSISTENTID_INVALID) + mpi3mr_report_tgtdev_to_host(mrioc, perst_id); + else if (mrioc->sas_transport_enabled && + (dev_pg0->device_form == + MPI3_DEVICE_DEVFORM_SAS_SATA)) { + sasinf = &dev_pg0->device_specific.sas_sata_format; + dev_info = le16_to_cpu(sasinf->device_info); + if (!mrioc->sas_hba.num_phys) + mpi3mr_sas_host_add(mrioc); + else + mpi3mr_sas_host_refresh(mrioc); + if (mpi3mr_is_expander_device(dev_info)) + mpi3mr_expander_add(mrioc, handle); + } + break; + } + case MPI3_EVENT_DEVICE_INFO_CHANGED: + { + dev_pg0 = (struct mpi3_device_page0 *)fwevt->event_data; + perst_id = le16_to_cpu(dev_pg0->persistent_id); + if (perst_id != MPI3_DEVICE0_PERSISTENTID_INVALID) + mpi3mr_devinfochg_evt_bh(mrioc, dev_pg0); + break; + } + case MPI3_EVENT_DEVICE_STATUS_CHANGE: + { + mpi3mr_devstatuschg_evt_bh(mrioc, fwevt); + break; + } + case MPI3_EVENT_ENCL_DEVICE_ADDED: + case MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE: + { + mpi3mr_encldev_add_chg_evt_bh(mrioc, fwevt); + break; + } + + case MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST: + { + mpi3mr_sastopochg_evt_bh(mrioc, fwevt); + break; + } + case MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: + { + mpi3mr_pcietopochg_evt_bh(mrioc, fwevt); + break; + } + case MPI3_EVENT_LOG_DATA: + { + mpi3mr_logdata_evt_bh(mrioc, fwevt); + break; + } + case MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH: + { + while (mrioc->device_refresh_on) { + msleep(500); + } + dprint_event_bh(mrioc, + "scan for non responding and newly added devices after soft reset started\n"); + if (mrioc->sas_transport_enabled) { + mpi3mr_refresh_sas_ports(mrioc); + mpi3mr_refresh_expanders(mrioc); + } + mpi3mr_refresh_tgtdevs(mrioc); + ioc_info(mrioc, + "scan for non responding and newly added devices after soft reset completed\n"); + break; + } + case MPI3MR_DRIVER_EVENT_TG_QD_REDUCTION: + { + tg = (struct mpi3mr_throttle_group_info *) + (*(__le64 *)fwevt->event_data); + dprint_event_bh(mrioc, + "qd reduction event processed for tg_id(%d) reduction_needed(%d)\n", + tg->id, tg->need_qd_reduction); + if (tg->need_qd_reduction) { + mpi3mr_set_qd_for_all_vd_in_tg(mrioc, tg); + tg->need_qd_reduction = 0; + } + break; + } + case MPI3MR_DRIVER_EVENT_PROCESS_TRIGGER: + { + mpi3mr_process_trigger_data_event_bh(mrioc, + (struct trigger_event_data *)fwevt->event_data); + break; + } + default: + break; + } + +evt_ack: + if (fwevt->send_ack) + mpi3mr_process_event_ack(mrioc, fwevt->event_id, + fwevt->event_context); +out: + /* Put 
fwevt reference count to neutralize kref_init increment */ + mpi3mr_fwevt_put(fwevt); + mrioc->current_event = NULL; + +} + +/** + * mpi3mr_fwevt_worker - Firmware event worker + * @work: Work struct containing firmware event + * + * Extracts the firmware event and calls mpi3mr_fwevt_bh. + * + * Return: Nothing. + */ +static void mpi3mr_fwevt_worker(struct work_struct *work) +{ + struct mpi3mr_fwevt *fwevt = container_of(work, struct mpi3mr_fwevt, + work); + mpi3mr_fwevt_bh(fwevt->mrioc, fwevt); + /* + * Put fwevt reference count after + * dequeuing it from worker queue + */ + mpi3mr_fwevt_put(fwevt); +} + + +/** + * mpi3mr_create_tgtdev - Create and add a target device + * @mrioc: Adapter instance reference + * @dev_pg0: Device Page 0 data + * + * If the device specified by the device page 0 data is not + * present in the driver's internal list, allocate the memory + * for the device, populate the data and add to the list, else + * update the device data. The key is persistent ID. + * + * Return: 0 on success, -ENOMEM on memory allocation failure + */ +static int mpi3mr_create_tgtdev(struct mpi3mr_ioc *mrioc, + struct mpi3_device_page0 *dev_pg0) +{ + int retval = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + u16 perst_id = 0; + + perst_id = le16_to_cpu(dev_pg0->persistent_id); + if (perst_id == MPI3_DEVICE0_PERSISTENTID_INVALID) + return retval; + + tgtdev = mpi3mr_get_tgtdev_by_perst_id(mrioc, perst_id); + if (tgtdev) { + mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, true); + mpi3mr_tgtdev_put(tgtdev); + } else { + tgtdev = mpi3mr_alloc_tgtdev(); + if (!tgtdev) + return -ENOMEM; + mpi3mr_update_tgtdev(mrioc, tgtdev, dev_pg0, true); + mpi3mr_tgtdev_add_to_list(mrioc, tgtdev); + } + + return retval; +} + +/** + * mpi3mr_flush_delayed_cmd_lists - Flush pending commands + * @mrioc: Adapter instance reference + * + * Flush pending commands in the delayed lists due to a + * controller reset or driver removal as a cleanup. + * + * Return: Nothing + */ +void mpi3mr_flush_delayed_cmd_lists(struct mpi3mr_ioc *mrioc) +{ + struct delayed_dev_rmhs_node *_rmhs_node; + struct delayed_evt_ack_node *_evtack_node; + + dprint_reset(mrioc, "flushing delayed dev_remove_hs commands\n"); + while (!list_empty(&mrioc->delayed_rmhs_list)) { + _rmhs_node = list_entry(mrioc->delayed_rmhs_list.next, + struct delayed_dev_rmhs_node, list); + list_del(&_rmhs_node->list); + kfree(_rmhs_node); + } + dprint_reset(mrioc, "flushing delayed event ack commands\n"); + while (!list_empty(&mrioc->delayed_evtack_cmds_list)) { + _evtack_node = list_entry(mrioc->delayed_evtack_cmds_list.next, + struct delayed_evt_ack_node, list); + list_del(&_evtack_node->list); + kfree(_evtack_node); + } +} + +/** + * mpi3mr_dev_rmhs_complete_iou - Device removal IOUC completion + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issues a target reset TM to the firmware from the device + * removal TM pend list or retry the removal handshake sequence + * based on the IOU control request IOC status. 
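+ *
+ * Note: this completes the second and final step of the removal
+ * handshake (target reset TM followed by an IO unit control
+ * request); on success the handle is cleared from removepend_bitmap,
+ * and any removal postponed on delayed_rmhs_list is started next,
+ * reusing this command tracker.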
+ * + * Return: Nothing + */ +static void mpi3mr_dev_rmhs_complete_iou(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + struct delayed_dev_rmhs_node *delayed_dev_rmhs = NULL; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): iounit control completed for handle(0x%04x), rc(%d), ioc_status(0x%04x), loginfo(0x%08x)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc, + drv_cmd->ioc_status, drv_cmd->ioc_loginfo); + + if (drv_cmd->ioc_status != MPI3_IOCSTATUS_SUCCESS) { + if (drv_cmd->retry_count < MPI3MR_DEV_RMHS_RETRY_COUNT) { + drv_cmd->retry_count++; + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): retrying for handle(0x%04x), rc(%d), ioc_status(0x%04x), loginfo(0x%08x), retry_count(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc, + drv_cmd->ioc_status, drv_cmd->ioc_loginfo, + drv_cmd->retry_count); + mpi3mr_dev_rmhs_send_tm(mrioc, drv_cmd->dev_handle, + drv_cmd, drv_cmd->iou_rc); + return; + } + ioc_err(mrioc, + "dev_remove_hs: cmd_idx(%d): failed for handle(0x%04x), rc(%d) after all retries(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc, + drv_cmd->retry_count); + } else { + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): completed successfully for handle(0x%04x), rc(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + clear_bit(drv_cmd->dev_handle, mrioc->removepend_bitmap); + } + + if (!list_empty(&mrioc->delayed_rmhs_list)) { + delayed_dev_rmhs = list_entry(mrioc->delayed_rmhs_list.next, + struct delayed_dev_rmhs_node, list); + drv_cmd->dev_handle = delayed_dev_rmhs->handle; + drv_cmd->retry_count = 0; + drv_cmd->iou_rc = delayed_dev_rmhs->iou_rc; + mpi3mr_dev_rmhs_send_tm(mrioc, drv_cmd->dev_handle, drv_cmd, + drv_cmd->iou_rc); + list_del(&delayed_dev_rmhs->list); + kfree(delayed_dev_rmhs); + return; + } + +clear_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->retry_count = 0; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + clear_bit(cmd_idx, mrioc->devrem_bitmap); +} + +/** + * mpi3mr_dev_rmhs_complete_tm - Device removal TM completion + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * Issues a target reset TM to the firmware from the device + * removal TM pend list or issue IO Unit Control request as + * part of device removal or hidden acknowledgment handshake. 
+ * + * Return: Nothing + */ +static void mpi3mr_dev_rmhs_complete_tm(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + struct mpi3_iounit_control_request iou_ctrl; + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + struct mpi3_scsi_task_mgmt_reply *tm_reply = NULL; + int retval; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + if (drv_cmd->state & MPI3MR_CMD_REPLY_VALID) + tm_reply = (struct mpi3_scsi_task_mgmt_reply *)drv_cmd->reply; + + if (tm_reply) + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): target reset completed for handle(0x%04x), ioc_status(0x%04x), log_info(0x%08x), termination_count(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->ioc_status, + drv_cmd->ioc_loginfo, + le32_to_cpu(tm_reply->termination_count)); + + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): sending iounit control for handle(0x%04x) rc(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + + memset(&iou_ctrl, 0, sizeof(iou_ctrl)); + + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_dev_rmhs_complete_iou; + iou_ctrl.operation = drv_cmd->iou_rc; + iou_ctrl.param16[0] = cpu_to_le16(drv_cmd->dev_handle); + iou_ctrl.host_tag = cpu_to_le16(drv_cmd->host_tag); + iou_ctrl.function = MPI3_FUNCTION_IO_UNIT_CONTROL; + + retval = mpi3mr_admin_request_post(mrioc, &iou_ctrl, sizeof(iou_ctrl), + 1); + if (retval) { + ioc_err(mrioc, + "dev_remove_hs: cmd_idx(%d): posting iounit control for handle(0x%04x) rc(%d) failed\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + goto clear_drv_cmd; + } + ioc_info(mrioc, + "dev_remove_hs: cmd_idx(%d): posted iounit control for handle(0x%04x) rc(%d)\n", + cmd_idx, drv_cmd->dev_handle, drv_cmd->iou_rc); + + return; +clear_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + drv_cmd->retry_count = 0; + clear_bit(cmd_idx, mrioc->devrem_bitmap); +} + +/** + * mpi3mr_dev_rmhs_send_tm - Issue TM for device removal + * @mrioc: Adapter instance reference + * @handle: Device handle + * @cmdparam: Internal command tracker + * @iou_rc: IO unit reason code + * + * Issues a target reset TM to the firmware or add it to a pend + * list as part of device removal or hidden acknowledgment + * handshake. 
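+ *
+ * Note: only MPI3MR_NUM_DEVRMCMD command trackers exist for this
+ * handshake; when none is free the request is queued (GFP_ATOMIC) on
+ * delayed_rmhs_list and replayed from mpi3mr_dev_rmhs_complete_iou()
+ * once an earlier handshake finishes.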
+ * + * Return: Nothing + */ +static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle, + struct mpi3mr_drv_cmd *cmdparam, u8 iou_rc) +{ + struct mpi3_scsi_task_mgmt_request tm_req; + int retval = 0; + u16 cmd_idx = MPI3MR_NUM_DEVRMCMD; + u8 retrycount = 5; + struct mpi3mr_drv_cmd *drv_cmd = cmdparam; + struct delayed_dev_rmhs_node *delayed_dev_rmhs = NULL; + + if (drv_cmd) { + ioc_info(mrioc, + "dev_remove_hs: sending delayed target reset for handle(0x%04x) rc(%d)\n", + handle, iou_rc); + goto issue_cmd; + } + ioc_info(mrioc, + "dev_remove_hs: sending target reset for handle(0x%04x) rc(%d)\n", + handle, iou_rc); + + do { + cmd_idx = find_first_zero_bit(mrioc->devrem_bitmap, + MPI3MR_NUM_DEVRMCMD); + if (cmd_idx < MPI3MR_NUM_DEVRMCMD) { + if (!test_and_set_bit(cmd_idx, mrioc->devrem_bitmap)) + break; + cmd_idx = MPI3MR_NUM_DEVRMCMD; + } + } while (retrycount--); + + if (cmd_idx >= MPI3MR_NUM_DEVRMCMD) { + delayed_dev_rmhs = kzalloc(sizeof(*delayed_dev_rmhs), + GFP_ATOMIC); + if (!delayed_dev_rmhs) + return; + INIT_LIST_HEAD(&delayed_dev_rmhs->list); + delayed_dev_rmhs->handle = handle; + delayed_dev_rmhs->iou_rc = iou_rc; + list_add_tail(&delayed_dev_rmhs->list, + &mrioc->delayed_rmhs_list); + ioc_info(mrioc, + "dev_remove_hs: target reset for handle(0x%04x) rc(%d) is postponed\n", + handle, iou_rc); + return; + } + drv_cmd = &mrioc->dev_rmhs_cmds[cmd_idx]; + +issue_cmd: + cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_DEVRMCMD_MIN; + + memset(&tm_req, 0, sizeof(tm_req)); + if (drv_cmd->state & MPI3MR_CMD_PENDING) { + ioc_err(mrioc, + "dev_remove_hs: sending target reset for handle(0x%04x) rc(%d) is failed due to command in use\n", + handle, iou_rc); + goto out; + } + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_dev_rmhs_complete_tm; + drv_cmd->dev_handle = handle; + drv_cmd->iou_rc = iou_rc; + tm_req.dev_handle = cpu_to_le16(handle); + tm_req.task_type = MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET; + tm_req.host_tag = cpu_to_le16(drv_cmd->host_tag); + tm_req.task_host_tag = cpu_to_le16(MPI3MR_HOSTTAG_INVALID); + tm_req.function = MPI3_FUNCTION_SCSI_TASK_MGMT; + + set_bit(handle, mrioc->removepend_bitmap); + retval = mpi3mr_admin_request_post(mrioc, &tm_req, sizeof(tm_req), 1); + if (retval) { + ioc_err(mrioc, + "dev_remove_hs: posting target reset for handle(0x%04x) rc(%d) is failed\n", + handle, iou_rc); + goto out_failed; + } + ioc_info(mrioc, + "dev_remove_hs: posted target reset for handle(0x%04x) rc(%d) with cmd_idx(%d)\n", + handle, iou_rc, cmd_idx); +out: + return; +out_failed: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + drv_cmd->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + drv_cmd->retry_count = 0; + clear_bit(cmd_idx, mrioc->devrem_bitmap); +} + + +/** + * mpi3mr_complete_evt_ack - event ack request completion + * @mrioc: Adapter instance reference + * @drv_cmd: Internal command tracker + * + * This is the completion handler for non blocking event + * acknowledgment sent to the firmware and this will issue any + * pending event acknowledgment request. 
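+ *
+ * If delayed_evtack_cmds_list is not empty, the tracker is reused
+ * straight away to send the next queued acknowledgment; otherwise its
+ * slot in evtack_cmds_bitmap is released.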
+ * + * Return: Nothing + */ +static void mpi3mr_complete_evt_ack(struct mpi3mr_ioc *mrioc, + struct mpi3mr_drv_cmd *drv_cmd) +{ + u16 cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_EVTACKCMD_MIN; + struct delayed_evt_ack_node *delayed_evtack = NULL; + + if (drv_cmd->state & MPI3MR_CMD_RESET) + goto clear_drv_cmd; + + if (drv_cmd->ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_event_th(mrioc, + "immediate event ack failed with ioc_status(0x%04x) log_info(0x%08x)\n", + (drv_cmd->ioc_status & MPI3_IOCSTATUS_STATUS_MASK), + drv_cmd->ioc_loginfo); + } + + if (!list_empty(&mrioc->delayed_evtack_cmds_list)) { + delayed_evtack = + list_entry(mrioc->delayed_evtack_cmds_list.next, + struct delayed_evt_ack_node, list); + mpi3mr_send_event_ack(mrioc, delayed_evtack->event, drv_cmd, + delayed_evtack->event_ctx); + list_del(&delayed_evtack->list); + kfree(delayed_evtack); + return; + } +clear_drv_cmd: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + clear_bit(cmd_idx, mrioc->evtack_cmds_bitmap); +} + +/** + * mpi3mr_send_event_ack - Issue event acknwoledgment request + * @mrioc: Adapter instance reference + * @event: MPI3 event id + * @cmdparam: Internal command tracker + * @event_ctx: event context + * + * Issues event acknowledgment request to the firmware if there + * is a free command to send the event ack else it to a pend + * list so that it will be processed on a completion of a prior + * event acknowledgment . + * + * Return: Nothing + */ +static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event, + struct mpi3mr_drv_cmd *cmdparam, u32 event_ctx) +{ + struct mpi3_event_ack_request evtack_req; + int retval = 0; + u8 retrycount = 5; + u16 cmd_idx = MPI3MR_NUM_EVTACKCMD; + struct mpi3mr_drv_cmd *drv_cmd = cmdparam; + struct delayed_evt_ack_node *delayed_evtack = NULL; + + if (drv_cmd) { + dprint_event_th(mrioc, + "sending delayed event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n", + event, event_ctx); + goto issue_cmd; + } + dprint_event_th(mrioc, + "sending event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n", + event, event_ctx); + do { + cmd_idx = find_first_zero_bit(mrioc->evtack_cmds_bitmap, + MPI3MR_NUM_EVTACKCMD); + if (cmd_idx < MPI3MR_NUM_EVTACKCMD) { + if (!test_and_set_bit(cmd_idx, + mrioc->evtack_cmds_bitmap)) + break; + cmd_idx = MPI3MR_NUM_EVTACKCMD; + } + } while (retrycount--); + + if (cmd_idx >= MPI3MR_NUM_EVTACKCMD) { + delayed_evtack = kzalloc(sizeof(*delayed_evtack), + GFP_ATOMIC); + if (!delayed_evtack) + return; + INIT_LIST_HEAD(&delayed_evtack->list); + delayed_evtack->event = event; + delayed_evtack->event_ctx = event_ctx; + list_add_tail(&delayed_evtack->list, + &mrioc->delayed_evtack_cmds_list); + dprint_event_th(mrioc, + "event ack in the top half for event(0x%02x), event_ctx(0x%08x) is postponed\n", + event, event_ctx); + return; + } + drv_cmd = &mrioc->evtack_cmds[cmd_idx]; + +issue_cmd: + cmd_idx = drv_cmd->host_tag - MPI3MR_HOSTTAG_EVTACKCMD_MIN; + + memset(&evtack_req, 0, sizeof(evtack_req)); + if (drv_cmd->state & MPI3MR_CMD_PENDING) { + dprint_event_th(mrioc, + "sending event ack failed due to command in use\n"); + goto out; + } + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 0; + drv_cmd->callback = mpi3mr_complete_evt_ack; + evtack_req.host_tag = cpu_to_le16(drv_cmd->host_tag); + evtack_req.function = MPI3_FUNCTION_EVENT_ACK; + evtack_req.event = event; + evtack_req.event_context = cpu_to_le32(event_ctx); + retval = mpi3mr_admin_request_post(mrioc, &evtack_req, + sizeof(evtack_req), 1); + 
if (retval) { + dprint_event_th(mrioc, + "posting event ack request is failed\n"); + goto out_failed; + } + + dprint_event_th(mrioc, + "event ack in the top half for event(0x%02x), event_ctx(0x%08x) is posted\n", + event, event_ctx); +out: + return; +out_failed: + drv_cmd->state = MPI3MR_CMD_NOTUSED; + drv_cmd->callback = NULL; + clear_bit(cmd_idx, mrioc->evtack_cmds_bitmap); +} + +/** + * mpi3mr_pcietopochg_evt_th - PCIETopologyChange event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Checks for the reason code and based on that either block I/O + * to device, or unblock I/O to the device, or start the device + * removal handshake with reason as remove with the firmware for + * PCIe devices. + * + * Return: Nothing + */ +static void mpi3mr_pcietopochg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_pcie_topology_change_list *topo_evt = + (struct mpi3_event_data_pcie_topology_change_list *) event_reply->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + + for (i = 0; i < topo_evt->num_entries; i++) { + handle = le16_to_cpu(topo_evt->port_entry[i].attached_dev_handle); + if (!handle) + continue; + reason_code = topo_evt->port_entry[i].port_status; + scsi_tgt_priv_data = NULL; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + switch (reason_code) { + case MPI3_EVENT_PCIE_TOPO_PS_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removed = 1; + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + } + mpi3mr_dev_rmhs_send_tm(mrioc, handle, NULL, + MPI3_CTRL_OP_REMOVE_DEVICE); + break; + case MPI3_EVENT_PCIE_TOPO_PS_DELAY_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removedelay = 1; + atomic_inc(&scsi_tgt_priv_data->block_io); + } + break; + case MPI3_EVENT_PCIE_TOPO_PS_RESPONDING: + if (scsi_tgt_priv_data && + scsi_tgt_priv_data->dev_removedelay) { + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_dec_if_positive + (&scsi_tgt_priv_data->block_io); + } + break; + case MPI3_EVENT_PCIE_TOPO_PS_PORT_CHANGED: + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } +} + +/** + * mpi3mr_sastopochg_evt_th - SASTopologyChange event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Checks for the reason code and based on that either block I/O + * to device, or unblock I/O to the device, or start the device + * removal handshake with reason as remove with the firmware for + * SAS/SATA devices. 
+ * + * Return: Nothing + */ +static void mpi3mr_sastopochg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_sas_topology_change_list *topo_evt = + (struct mpi3_event_data_sas_topology_change_list *) + event_reply->event_data; + int i; + u16 handle; + u8 reason_code; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + + for (i = 0; i < topo_evt->num_entries; i++) { + handle = le16_to_cpu(topo_evt->phy_entry[i].attached_dev_handle); + if (!handle) + continue; + reason_code = topo_evt->phy_entry[i].status & + MPI3_EVENT_SAS_TOPO_PHY_RC_MASK; + scsi_tgt_priv_data = NULL; + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + switch (reason_code) { + case MPI3_EVENT_SAS_TOPO_PHY_RC_TARG_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removed = 1; + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + } + mpi3mr_dev_rmhs_send_tm(mrioc, handle, NULL, + MPI3_CTRL_OP_REMOVE_DEVICE); + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_DELAY_NOT_RESPONDING: + if (scsi_tgt_priv_data) { + scsi_tgt_priv_data->dev_removedelay = 1; + atomic_inc(&scsi_tgt_priv_data->block_io); + } + break; + case MPI3_EVENT_SAS_TOPO_PHY_RC_RESPONDING: + if (scsi_tgt_priv_data && + scsi_tgt_priv_data->dev_removedelay) { + scsi_tgt_priv_data->dev_removedelay = 0; + atomic_dec_if_positive + (&scsi_tgt_priv_data->block_io); + } + case MPI3_EVENT_SAS_TOPO_PHY_RC_PHY_CHANGED: + default: + break; + } + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + } + +} + +/** + * mpi3mr_devstatuschg_evt_th - DeviceStatusChange event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Checks for the reason code and based on that either block I/O + * to device, or unblock I/O to the device, or start the device + * removal handshake with reason as remove/hide acknowledgment + * with the firmware. 
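+ *
+ * In outline: the internal device reset and IT nexus reset "start"
+ * reason codes block I/O to the device, the matching "complete" codes
+ * unblock it, RC_HIDDEN marks the device hidden and starts a
+ * hidden-acknowledgment handshake, and RC_VD_NOT_RESPONDING starts a
+ * removal handshake.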
+ * + * Return: Nothing + */ +static void mpi3mr_devstatuschg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + u16 dev_handle = 0; + u8 ublock = 0, block = 0, hide = 0, delete = 0, remove = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + struct mpi3_event_data_device_status_change *evtdata = + (struct mpi3_event_data_device_status_change *) + event_reply->event_data; + + if (mrioc->stop_drv_processing) + goto out; + + dev_handle = le16_to_cpu(evtdata->dev_handle); + dprint_event_th(mrioc, + "device status change event top half with rc(0x%02x) for handle(0x%04x)\n", + evtdata->reason_code, dev_handle); + + switch (evtdata->reason_code) { + case MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_STRT: + case MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_STRT: + block = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_HIDDEN: + delete = 1; + hide = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_VD_NOT_RESPONDING: + delete = 1; + remove = 1; + break; + case MPI3_EVENT_DEV_STAT_RC_INT_DEVICE_RESET_CMP: + case MPI3_EVENT_DEV_STAT_RC_INT_IT_NEXUS_RESET_CMP: + ublock = 1; + break; + default: + break; + } + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, dev_handle); + if (!tgtdev) { + dprint_event_th(mrioc, + "processing device status change event could not identify device for handle(0x%04x)\n", + dev_handle); + goto out; + } + if (hide) + tgtdev->is_hidden = hide; + if (tgtdev->starget && tgtdev->starget->hostdata) { + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + if (block) + atomic_inc(&scsi_tgt_priv_data->block_io); + if (delete) + scsi_tgt_priv_data->dev_removed = 1; + if (ublock) + atomic_dec_if_positive(&scsi_tgt_priv_data->block_io); + } + if (remove) + mpi3mr_dev_rmhs_send_tm(mrioc, dev_handle, NULL, + MPI3_CTRL_OP_REMOVE_DEVICE); + if (hide) + mpi3mr_dev_rmhs_send_tm(mrioc, dev_handle, NULL, + MPI3_CTRL_OP_HIDDEN_ACK); + +out: + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); + +} + +/** + * mpi3mr_preparereset_evt_th - Prepare for reset event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Blocks and unblocks host level I/O based on the reason code + * + * Return: Nothing + */ +static void mpi3mr_preparereset_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_prepare_for_reset *evtdata = + (struct mpi3_event_data_prepare_for_reset *)event_reply->event_data; + + if (evtdata->reason_code == MPI3_EVENT_PREPARE_RESET_RC_START) { + dprint_event_th(mrioc, + "prepare for reset event top half with rc=start\n"); + if (mrioc->prepare_for_reset) + return; + mrioc->prepare_for_reset = 1; + mrioc->prepare_for_reset_timeout_counter = 0; + } else if (evtdata->reason_code == MPI3_EVENT_PREPARE_RESET_RC_ABORT) { + dprint_event_th(mrioc, + "prepare for reset top half with rc=abort\n"); + mrioc->prepare_for_reset = 0; + mrioc->prepare_for_reset_timeout_counter = 0; + } + if ((event_reply->msg_flags & MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK) + == MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_REQUIRED) + mpi3mr_send_event_ack(mrioc, event_reply->event, NULL, + le32_to_cpu(event_reply->event_context)); +} + +/** + * mpi3mr_energypackchg_evt_th - Energy pack change event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Identifies the new shutdown timeout value and update. 
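+ * A zero shutdown timeout reported by the event is ignored; otherwise
+ * the value replaces mrioc->facts.shutdown_timeout.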
+ * + * Return: Nothing + */ +static void mpi3mr_energypackchg_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_energy_pack_change *evtdata = + (struct mpi3_event_data_energy_pack_change *) + event_reply->event_data; + u16 shutdown_timeout = le16_to_cpu(evtdata->shutdown_timeout); + + if (shutdown_timeout <= 0) { + dprint_event_th(mrioc, + "invalid shutdown timeout(%d) in the energy pack change event\n", + shutdown_timeout); + return; + } + + dprint_event_th(mrioc, + "previous shutdown timeout(%d), new shutdown timeout(%d) in the energy pack change event\n", + mrioc->facts.shutdown_timeout, shutdown_timeout); + mrioc->facts.shutdown_timeout = shutdown_timeout; +} + + +/** + * mpi3mr_cablemgmt_evt_th - Cable management event tophalf + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Displays Cable manegemt event details. + * + * Return: Nothing + */ +static void mpi3mr_cablemgmt_evt_th(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + struct mpi3_event_data_cable_management *evtdata = + (struct mpi3_event_data_cable_management *)event_reply->event_data; + + switch (evtdata->status) { + case MPI3_EVENT_CABLE_MGMT_STATUS_INSUFFICIENT_POWER: + { + ioc_info(mrioc, "An active cable with receptacle_id %d cannot be powered.\n" + "Devices connected to this cable are not detected.\n" + "This cable requires %d mW of power.\n", + evtdata->receptacle_id, + le32_to_cpu(evtdata->active_cable_power_requirement)); + break; + } + case MPI3_EVENT_CABLE_MGMT_STATUS_DEGRADED: + { + ioc_info(mrioc, "A cable with receptacle_id %d is not running at optimal speed\n", + evtdata->receptacle_id); + break; + } + default: + break; + } +} + +/** + * mpi3mr_add_event_wait_for_device_refresh - Add Wait for Device Refresh Event + * @mrioc: Adapter instance reference + * + * Add driver specific event to make sure that the driver won't process the + * events until all the devices are refreshed during soft reset. + * + * Return: Nothing + */ +void mpi3mr_add_event_wait_for_device_refresh(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_fwevt *fwevt = NULL; + + fwevt = mpi3mr_alloc_fwevt(0); + if (!fwevt) + { + dprint_event_th(mrioc, + "failed to schedule bottom half handler for event(0x%02x)\n", + MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH); + return; + } + fwevt->mrioc = mrioc; + fwevt->event_id = MPI3MR_DRIVER_EVENT_WAIT_FOR_DEVICES_TO_REFRESH; + fwevt->send_ack = 0; + fwevt->process_event = 1; + fwevt->event_context = 0; + fwevt->event_data_size = 0; + mpi3mr_fwevt_add_to_list(mrioc, fwevt); +} + +/** + * mpi3mr_os_handle_events - Firmware event handler + * @mrioc: Adapter instance reference + * @event_reply: event data + * + * Identify whether the event has to handled and acknowledged + * and either process the event in the tophalf and/or schedule a + * bottom half through mpi3mr_fwevt_worker. 
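+ *
+ * Some events (for example PREPARE_FOR_RESET, ENERGY_PACK_CHANGE and
+ * CABLE_MGMT) are handled entirely in the top half; anything that needs
+ * bottom-half processing or a firmware acknowledgment is copied into an
+ * mpi3mr_fwevt and queued for the firmware event worker.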
+ * + * Return: Nothing + */ +void mpi3mr_os_handle_events(struct mpi3mr_ioc *mrioc, + struct mpi3_event_notification_reply *event_reply) +{ + u8 evt_type; + u16 sz; + struct mpi3mr_fwevt *fwevt = NULL; + bool ack_req = 0, process_event_bh = 0; + + if (mrioc->stop_drv_processing) + return; + + if ((event_reply->msg_flags & MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK) + == MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_REQUIRED) + ack_req = 1; + + evt_type = event_reply->event; + mpi3mr_event_trigger(mrioc, event_reply->event); + + switch (evt_type) { + case MPI3_EVENT_DEVICE_ADDED: + { + struct mpi3_device_page0 *dev_pg0 = + (struct mpi3_device_page0 *)event_reply->event_data; + if (mpi3mr_create_tgtdev(mrioc, dev_pg0)) + dprint_event_th(mrioc, + "failed to process device added event for handle(0x%04x), perst_id(%d) in the event top half handler\n", + le16_to_cpu(dev_pg0->dev_handle), + le16_to_cpu(dev_pg0->persistent_id)); + else + process_event_bh = 1; + break; + } + case MPI3_EVENT_DEVICE_STATUS_CHANGE: + { + process_event_bh = 1; + mpi3mr_devstatuschg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_SAS_TOPOLOGY_CHANGE_LIST: + { + process_event_bh = 1; + mpi3mr_sastopochg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_PCIE_TOPOLOGY_CHANGE_LIST: + { + process_event_bh = 1; + mpi3mr_pcietopochg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_PREPARE_FOR_RESET: + { + mpi3mr_preparereset_evt_th(mrioc, event_reply); + ack_req = 0; + break; + } + case MPI3_EVENT_DIAGNOSTIC_BUFFER_STATUS_CHANGE: + { + mpi3mr_hdbstatuschg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_DEVICE_INFO_CHANGED: + case MPI3_EVENT_LOG_DATA: + case MPI3_EVENT_ENCL_DEVICE_STATUS_CHANGE: + case MPI3_EVENT_ENCL_DEVICE_ADDED: + { + process_event_bh = 1; + break; + } + case MPI3_EVENT_ENERGY_PACK_CHANGE: + { + mpi3mr_energypackchg_evt_th(mrioc, event_reply); + break; + } + case MPI3_EVENT_CABLE_MGMT: + { + mpi3mr_cablemgmt_evt_th(mrioc, event_reply); + break; + } + + case MPI3_EVENT_SAS_DISCOVERY: + case MPI3_EVENT_SAS_DEVICE_DISCOVERY_ERROR: + case MPI3_EVENT_SAS_BROADCAST_PRIMITIVE: + case MPI3_EVENT_PCIE_ENUMERATION: + break; + default: + break; + } + if (process_event_bh || ack_req) { + dprint_event_th(mrioc, + "scheduling bottom half handler for event(0x%02x), ack_required=%d\n", + evt_type, ack_req); + sz = event_reply->event_data_length * 4; + fwevt = mpi3mr_alloc_fwevt(sz); + if (!fwevt) + { + dprint_event_th(mrioc, + "failed to schedule bottom half handler for event(0x%02x), ack_required=%d\n", + evt_type, ack_req); + return; + } + memcpy(fwevt->event_data, event_reply->event_data, sz); + fwevt->mrioc = mrioc; + fwevt->event_id = evt_type; + fwevt->send_ack = ack_req; + fwevt->process_event = process_event_bh; + fwevt->event_context = + le32_to_cpu(event_reply->event_context); + fwevt->event_data_size = sz; + mpi3mr_fwevt_add_to_list(mrioc, fwevt); + } +} + +/** + * mpi3mr_get_fw_pending_ios - Calculate pending I/O count + * @mrioc: Adapter instance reference + * + * Calculate the pending I/Os for the controller and return. 
+ * + * Return: Number of pending I/Os + */ +static inline int mpi3mr_get_fw_pending_ios(struct mpi3mr_ioc *mrioc) +{ + u16 i; + uint pend_ios = 0; + + for (i = 0; i < mrioc->num_op_reply_q; i++) + pend_ios += atomic_read(&mrioc->op_reply_qinfo[i].pend_ios); + return pend_ios; +} + +/** + * mpi3mr_wait_for_host_io - block for I/Os to complete + * @mrioc: Adapter instance reference + * @timeout: time out in seconds + * Waits for pending I/Os for the given adapter to complete or + * to hit the timeout. + * + * Return: Nothing + */ +void mpi3mr_wait_for_host_io(struct mpi3mr_ioc *mrioc, u32 timeout) +{ + enum mpi3mr_iocstate iocstate; + int i = 0; + + iocstate = mpi3mr_get_iocstate(mrioc); + if (iocstate != MRIOC_STATE_READY) + return; + + if (!mpi3mr_get_fw_pending_ios(mrioc)) + return; + ioc_info(mrioc, + "waiting for maximum of %d seconds prior to reset for %d pending I/Os to complete\n", + timeout, mpi3mr_get_fw_pending_ios(mrioc)); + + for (i = 0; i < timeout; i++) { + if (!mpi3mr_get_fw_pending_ios(mrioc)) + break; + iocstate = mpi3mr_get_iocstate(mrioc); + if (iocstate != MRIOC_STATE_READY) + break; + msleep(1000); + } + + ioc_info(mrioc, "pending I/Os after wait is: %d\n", + mpi3mr_get_fw_pending_ios(mrioc)); +} + +/** + * mpi3mr_setup_nvme_eedp - Setup DIF info for NVMe IO request + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * @scsiio_flags: Pointer to MPI3 SCSI IO Flags + * + * Identifies the protection information flags from the SCSI + * command and set appropriate flags in the MPI3 SCSI IO request + * for the I/Os issued to the NVMe drives. + * + * Return: Nothing + */ +static void mpi3mr_setup_nvme_eedp(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req, + u32 *scsiio_flags) +{ + unsigned char prot_op = scsi_get_prot_op(scmd); + u8 host_md = 0, opcode = scmd->cmnd[0], sa = scmd->cmnd[9], xprt = 0; + + + if ((prot_op == SCSI_PROT_READ_PASS) || + (prot_op == SCSI_PROT_WRITE_PASS)) { + host_md = 1; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + } + + if (!mrioc->check_xprotect_nvme) + return; + + if (!((opcode == READ_10) || (opcode == WRITE_10) || + (opcode == READ_12) || (opcode == WRITE_12) || + (opcode == READ_16) || (opcode == WRITE_16) || + ((opcode == VARIABLE_LENGTH_CMD) && + ((sa == READ_32) || (sa == WRITE_32))))) + return; + if (opcode == VARIABLE_LENGTH_CMD) + xprt = scmd->cmnd[10] & 0xe0; + else + xprt = scmd->cmnd[1] & 0xe0; + if (!xprt) { + scsiio_req->msg_flags &= ~MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + scsiio_req->msg_flags |= + MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE; + *scsiio_flags |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_PROD_SPECIFIC; + } else if (!host_md) { + scsiio_req->msg_flags |= + MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE; + *scsiio_flags |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_PROD_SPECIFIC; + } +} +/** + * mpi3mr_setup_sas_eedp - Setup EEDP information for SAS IO Req + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * + * Identifies the protection information flags from the SCSI + * command and set appropriate flags in the MPI3 SCSI IO + * request. 
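+ *
+ * The protection operation selects the basic EEDP operation (insert,
+ * check-and-remove or check), the DIF type decides how the reference
+ * and guard tags are handled, and the logical block size is translated
+ * to the matching MPI3_EEDP_UDS_* user data size encoding.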
+ * + * Return: Nothing + */ +static void mpi3mr_setup_sas_eedp(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req) +{ + u16 eedp_flags = 0; + unsigned char prot_op = scsi_get_prot_op(scmd); + unsigned char prot_type = scsi_get_prot_type(scmd); + + scsiio_req->sgl[0].eedp.flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE; + switch (prot_op) { + case SCSI_PROT_NORMAL: + return; + case SCSI_PROT_READ_STRIP: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_CHECK_REMOVE; + break; + case SCSI_PROT_WRITE_INSERT: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_INSERT; + break; + case SCSI_PROT_READ_INSERT: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_INSERT; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + break; + case SCSI_PROT_WRITE_STRIP: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_CHECK_REMOVE; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + eedp_flags = MPI3_EEDPFLAGS_EEDP_OP_CHECK | + MPI3_EEDPFLAGS_CHK_REF_TAG | MPI3_EEDPFLAGS_CHK_APP_TAG | + MPI3_EEDPFLAGS_CHK_GUARD; + scsiio_req->msg_flags |= MPI3_SCSIIO_MSGFLAGS_METASGL_VALID; + break; + default: + return; + } + + switch (prot_type) { + case SCSI_PROT_DIF_TYPE0: + eedp_flags |= MPI3_EEDPFLAGS_INCR_PRI_REF_TAG; + scsiio_req->cdb.eedp32.primary_reference_tag = + cpu_to_be32(mpi3mr_kc_prot_ref_tag(scmd)); + break; + case SCSI_PROT_DIF_TYPE1: + case SCSI_PROT_DIF_TYPE2: + eedp_flags |= MPI3_EEDPFLAGS_INCR_PRI_REF_TAG | + MPI3_EEDPFLAGS_ESC_MODE_APPTAG_DISABLE | + MPI3_EEDPFLAGS_CHK_GUARD; + scsiio_req->cdb.eedp32.primary_reference_tag = + cpu_to_be32(mpi3mr_kc_prot_ref_tag(scmd)); + break; + case SCSI_PROT_DIF_TYPE3: + eedp_flags |= MPI3_EEDPFLAGS_CHK_GUARD | + MPI3_EEDPFLAGS_ESC_MODE_APPTAG_DISABLE; + break; + + default: + scsiio_req->msg_flags &= ~(MPI3_SCSIIO_MSGFLAGS_METASGL_VALID); + return; + } + + switch (scmd->device->sector_size) { + case 512: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_512; + break; + case 520: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_520; + break; + case 4080: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4080; + break; + case 4088: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4088; + break; + case 4096: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4096; + break; + case 4104: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4104; + break; + case 4160: + scsiio_req->sgl[0].eedp.user_data_size = MPI3_EEDP_UDS_4160; + break; + default: + break; + } + + scsiio_req->sgl[0].eedp.eedp_flags = cpu_to_le16(eedp_flags); + scsiio_req->sgl[0].eedp.flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_EXTENDED; +} + + + +/** + * mpi3mr_build_sense_buffer - Map sense information + * @desc: Sense type + * @buf: Sense buffer to populate + * @key: Sense key + * @asc: Additional sense code + * @ascq: Additional sense code qualifier + * + * Maps the given sense information into either descriptor or + * fixed format sense data. 
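+ *
+ * Descriptor format (0x72) places the key, ASC and ASCQ in bytes 1-3;
+ * fixed format (0x70) places the key in byte 2, an additional length of
+ * 0xa in byte 7 and the ASC/ASCQ in bytes 12 and 13.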
+ * + * Return: Nothing + */ +static inline void mpi3mr_build_sense_buffer(int desc, u8 *buf, u8 key, + u8 asc, u8 ascq) +{ + if (desc) { + buf[0] = 0x72; /* descriptor, current */ + buf[1] = key; + buf[2] = asc; + buf[3] = ascq; + buf[7] = 0; + } else { + buf[0] = 0x70; /* fixed, current */ + buf[2] = key; + buf[7] = 0xa; + buf[12] = asc; + buf[13] = ascq; + } +} + +/** + * mpi3mr_map_eedp_error - Map EEDP errors from IOC status + * @scmd: SCSI command reference + * @ioc_status: status of MPI3 request + * + * Maps the EEDP error status of the SCSI IO request to sense + * data. + * + * Return: Nothing + */ +static void mpi3mr_map_eedp_error(struct scsi_cmnd *scmd, + u16 ioc_status) +{ + u8 ascq = 0; + + switch (ioc_status) { + case MPI3_IOCSTATUS_EEDP_GUARD_ERROR: + ascq = 0x01; + break; + case MPI3_IOCSTATUS_EEDP_APP_TAG_ERROR: + ascq = 0x02; + break; + case MPI3_IOCSTATUS_EEDP_REF_TAG_ERROR: + ascq = 0x03; + break; + default: + ascq = 0x00; + break; + } + + mpi3mr_scsi_build_sense(scmd, 0, ILLEGAL_REQUEST, 0x10, ascq); + set_host_byte(scmd, DID_ABORT); +} + +/** + * mpi3mr_process_op_reply_desc - reply descriptor handler + * @mrioc: Adapter instance reference + * @reply_desc: Operational reply descriptor + * @reply_dma: place holder for reply DMA address + * @qidx: Operational queue index + * + * Process the operational reply descriptor and identifies the + * descriptor type. Based on the descriptor map the MPI3 request + * status to a SCSI command status and calls scsi_done call + * back. + * + * Return: Nothing + */ +void mpi3mr_process_op_reply_desc(struct mpi3mr_ioc *mrioc, + struct mpi3_default_reply_descriptor *reply_desc, u64 *reply_dma, + u16 qidx) +{ + u16 reply_desc_type, host_tag = 0; + u16 ioc_status = MPI3_IOCSTATUS_SUCCESS; + u32 ioc_loginfo = 0; + struct mpi3_status_reply_descriptor *status_desc = NULL; + struct mpi3_address_reply_descriptor *addr_desc = NULL; + struct mpi3_success_reply_descriptor *success_desc = NULL; + struct mpi3_scsi_io_reply *scsi_reply = NULL; + struct scsi_cmnd *scmd = NULL; + struct scmd_priv *priv = NULL; + u8 *sense_buf = NULL; + u8 scsi_state = 0, scsi_status = 0, sense_state = 0; + u32 xfer_count = 0, sense_count = 0, resp_data = 0; + u16 dev_handle = 0xFFFF; + struct scsi_sense_hdr sshdr; + struct mpi3mr_stgt_priv_data *stgt_priv_data = NULL; + struct mpi3mr_sdev_priv_data *sdev_priv_data = NULL; + u32 ioc_pend_data_len = 0, tg_pend_data_len = 0, data_len_blks = 0; + struct mpi3mr_throttle_group_info *tg = NULL; + u8 throttle_enabled_dev = 0; + + *reply_dma = 0; + reply_desc_type = le16_to_cpu(reply_desc->reply_flags) & + MPI3_REPLY_DESCRIPT_FLAGS_TYPE_MASK; + switch (reply_desc_type) { + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_STATUS: + status_desc = (struct mpi3_status_reply_descriptor *)reply_desc; + host_tag = le16_to_cpu(status_desc->host_tag); + ioc_status = le16_to_cpu(status_desc->ioc_status); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(status_desc->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_ADDRESS_REPLY: + addr_desc = (struct mpi3_address_reply_descriptor *)reply_desc; + *reply_dma = le64_to_cpu(addr_desc->reply_frame_address); + scsi_reply = mpi3mr_get_reply_virt_addr(mrioc, + *reply_dma); + if (!scsi_reply) { + ioc_err(mrioc, "NULL address reply is received, qidx %d\n", + qidx); + goto out; + } + host_tag = le16_to_cpu(scsi_reply->host_tag); 
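+		/*
+		 * Address replies carry a full mpi3_scsi_io_reply frame:
+		 * unpack the SCSI status/state, transfer and sense counts,
+		 * and look up the sense buffer by its DMA address so it can
+		 * be copied to scmd->sense_buffer and reposted at "out:".
+		 */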
+ ioc_status = le16_to_cpu(scsi_reply->ioc_status); + scsi_status = scsi_reply->scsi_status; + scsi_state = scsi_reply->scsi_state; + dev_handle = le16_to_cpu(scsi_reply->dev_handle); + sense_state = (scsi_state & MPI3_SCSI_STATE_SENSE_MASK); + xfer_count = le32_to_cpu(scsi_reply->transfer_count); + sense_count = le32_to_cpu(scsi_reply->sense_count); + resp_data = le32_to_cpu(scsi_reply->response_data); + sense_buf = mpi3mr_get_sensebuf_virt_addr(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); + if (ioc_status & + MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_LOGINFOAVAIL) + ioc_loginfo = le32_to_cpu(scsi_reply->ioc_log_info); + ioc_status &= MPI3_REPLY_DESCRIPT_STATUS_IOCSTATUS_STATUS_MASK; + if (sense_state == MPI3_SCSI_STATE_SENSE_BUFF_Q_EMPTY) + ioc_err(mrioc, + "controller cannot transfer sense data due to empty sense buffer queue\n"); + if (sense_buf) { + scsi_normalize_sense(sense_buf, sense_count, &sshdr); + mpi3mr_scsisense_trigger(mrioc, sshdr.sense_key, + sshdr.asc, sshdr.ascq); + } + mpi3mr_reply_trigger(mrioc, ioc_status, ioc_loginfo); + break; + case MPI3_REPLY_DESCRIPT_FLAGS_TYPE_SUCCESS: + success_desc = (struct mpi3_success_reply_descriptor *) + reply_desc; + host_tag = le16_to_cpu(success_desc->host_tag); + break; + default: + break; + } + + scmd = mpi3mr_scmd_from_host_tag(mrioc, host_tag, qidx); + if (!scmd) { + ioc_err(mrioc, "cannot identify scmd for host_tag %d\n", + host_tag); + goto out; + } + priv = scsi_cmd_priv(scmd); + + data_len_blks = scsi_bufflen(scmd) >> 9; + sdev_priv_data = scmd->device->hostdata; + if (sdev_priv_data) { + stgt_priv_data = sdev_priv_data->tgt_priv_data; + if (stgt_priv_data) { + tg = stgt_priv_data->throttle_group; + throttle_enabled_dev = + stgt_priv_data->io_throttle_enabled; + } + } + if (unlikely((data_len_blks >= mrioc->io_throttle_data_length) && + throttle_enabled_dev)) { + ioc_pend_data_len = atomic_sub_return(data_len_blks, + &mrioc->pend_large_data_sz); + if (tg) { + tg_pend_data_len = atomic_sub_return(data_len_blks, + &tg->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large vd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), tg_pending(%d), ioc_low(%d), tg_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + tg_pend_data_len, + mrioc->io_throttle_low, + tg->low); +#endif + if (tg->io_divert && ((ioc_pend_data_len <= + mrioc->io_throttle_low) && + (tg_pend_data_len <= tg->low))) { + tg->io_divert = 0; + mpi3mr_set_io_divert_for_all_vd_in_tg( + mrioc, tg, 0); + } + } else { +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large pd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), ioc_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + mrioc->io_throttle_low); +#endif + if ( ioc_pend_data_len <= mrioc->io_throttle_low) + stgt_priv_data->io_divert = 0; + } + } else if (unlikely((stgt_priv_data && stgt_priv_data->io_divert))) { + ioc_pend_data_len = atomic_read(&mrioc->pend_large_data_sz); + if (!tg) { +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "pd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), ioc_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + mrioc->io_throttle_low); +#endif + if ( ioc_pend_data_len <= mrioc->io_throttle_low) + stgt_priv_data->io_divert = 0; + + } else if 
(ioc_pend_data_len <= mrioc->io_throttle_low) { + tg_pend_data_len = atomic_read(&tg->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "vd_io completion persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), tg_pending(%d), ioc_low(%d), tg_low(%d)\n", + stgt_priv_data->perst_id, + stgt_priv_data->dev_handle, + data_len_blks, ioc_pend_data_len, + tg_pend_data_len, + mrioc->io_throttle_low, + tg->low); +#endif + if (tg->io_divert && (tg_pend_data_len <= tg->low)) { + tg->io_divert = 0; + mpi3mr_set_io_divert_for_all_vd_in_tg( + mrioc, tg, 0); + } + + } + } + + if (success_desc) { + set_host_byte(scmd, DID_OK); + goto out_success; + } + scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_count); + if (ioc_status == MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN && + xfer_count == 0 && (scsi_status == MPI3_SCSI_STATUS_BUSY || + scsi_status == MPI3_SCSI_STATUS_RESERVATION_CONFLICT || + scsi_status == MPI3_SCSI_STATUS_TASK_SET_FULL)) + ioc_status = MPI3_IOCSTATUS_SUCCESS; + + if ((sense_state == MPI3_SCSI_STATE_SENSE_VALID) && sense_count && + sense_buf) { + u32 sz = min_t(u32, SCSI_SENSE_BUFFERSIZE, sense_count); + + memcpy(scmd->sense_buffer, sense_buf, sz); + } + + switch (ioc_status) { + case MPI3_IOCSTATUS_BUSY: + case MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES: + scmd->result = SAM_STAT_BUSY; + break; + case MPI3_IOCSTATUS_SCSI_DEVICE_NOT_THERE: + set_host_byte(scmd, DID_NO_CONNECT); + break; + case MPI3_IOCSTATUS_SCSI_IOC_TERMINATED: + set_host_byte(scmd, DID_SOFT_ERROR); + break; + case MPI3_IOCSTATUS_SCSI_TASK_TERMINATED: + case MPI3_IOCSTATUS_SCSI_EXT_TERMINATED: + set_host_byte(scmd, DID_RESET); + break; + case MPI3_IOCSTATUS_SCSI_RESIDUAL_MISMATCH: + if ((xfer_count == 0) || (scmd->underflow > xfer_count)) + set_host_byte(scmd, DID_SOFT_ERROR); + else { + scmd->result |= scsi_status; + set_host_byte(scmd, DID_OK); + } + break; + case MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN: + scmd->result |= scsi_status; + set_host_byte(scmd, DID_OK); + if (sense_state == MPI3_SCSI_STATE_SENSE_VALID) + break; + if (xfer_count < scmd->underflow) { + if (scsi_status == SAM_STAT_BUSY) + scmd->result |= SAM_STAT_BUSY; + else + set_host_byte(scmd, DID_SOFT_ERROR); + } else if ((scsi_state & (MPI3_SCSI_STATE_NO_SCSI_STATUS)) || + (sense_state != MPI3_SCSI_STATE_SENSE_NOT_AVAILABLE)) + set_host_byte(scmd, DID_SOFT_ERROR); + else if (scsi_state & MPI3_SCSI_STATE_TERMINATED) + set_host_byte(scmd, DID_RESET); + break; + case MPI3_IOCSTATUS_SCSI_DATA_OVERRUN: + scsi_set_resid(scmd, 0); + /* fall through */ + fallthrough; + case MPI3_IOCSTATUS_SCSI_RECOVERED_ERROR: + case MPI3_IOCSTATUS_SUCCESS: + scmd->result |= scsi_status; + set_host_byte(scmd, DID_OK); + if ((scsi_state & (MPI3_SCSI_STATE_NO_SCSI_STATUS)) || + (sense_state == MPI3_SCSI_STATE_SENSE_FAILED) || + (sense_state == MPI3_SCSI_STATE_SENSE_BUFF_Q_EMPTY)) + set_host_byte(scmd, DID_SOFT_ERROR); + else if (scsi_state & MPI3_SCSI_STATE_TERMINATED) + set_host_byte(scmd, DID_RESET); + break; + case MPI3_IOCSTATUS_EEDP_GUARD_ERROR: + case MPI3_IOCSTATUS_EEDP_REF_TAG_ERROR: + case MPI3_IOCSTATUS_EEDP_APP_TAG_ERROR: + mpi3mr_map_eedp_error(scmd, ioc_status); + break; + case MPI3_IOCSTATUS_SCSI_PROTOCOL_ERROR: + case MPI3_IOCSTATUS_INVALID_FUNCTION: + case MPI3_IOCSTATUS_INVALID_SGL: + case MPI3_IOCSTATUS_INTERNAL_ERROR: + case MPI3_IOCSTATUS_INVALID_FIELD: + case MPI3_IOCSTATUS_INVALID_STATE: + case MPI3_IOCSTATUS_SCSI_IO_DATA_ERROR: + case MPI3_IOCSTATUS_SCSI_TASK_MGMT_FAILED: + case MPI3_IOCSTATUS_INSUFFICIENT_POWER: + default: + 
set_host_byte(scmd, DID_SOFT_ERROR); + break; + } + + if ((mrioc->logging_level & MPI3_DEBUG_SCSI_ERROR) && + (scmd->result != (DID_OK << 16)) && (scmd->cmnd[0] != ATA_12) && + (scmd->cmnd[0] != ATA_16)) { + ioc_info(mrioc, + "host_tag(%d): qid(%d): command issued to handle(0x%04x) returned with ioc_status(0x%04x), log_info(0x%08x), scsi_state(0x%02x), scsi_status(0x%02x), xfer_count(%d), resp_data(0x%08x) scmd_result(0x%08x)\n", + host_tag, priv->req_q_idx+1, dev_handle, ioc_status, + ioc_loginfo, scsi_state, scsi_status, xfer_count, + resp_data, scmd->result); + if (sense_buf) + ioc_info(mrioc, + "host_tag(%d): qid(%d): sense_count(%d), sense_key(0x%x), ASC(0x%x,) ASCQ(0x%x)\n", + host_tag, priv->req_q_idx+1, sense_count, + sshdr.sense_key, sshdr.asc, sshdr.ascq); + scsi_print_command(scmd); + } +out_success: + if (priv->meta_sg_valid) { + dma_unmap_sg(&mrioc->pdev->dev, scsi_prot_sglist(scmd), + scsi_prot_sg_count(scmd), scmd->sc_data_direction); + } + mpi3mr_clear_scmd_priv(mrioc, scmd); + scsi_dma_unmap(scmd); + SCMD_DONE(scmd); +out: + if (sense_buf) + mpi3mr_repost_sense_buf(mrioc, + le64_to_cpu(scsi_reply->sense_data_buffer_address)); +} + +/** + * mpi3mr_get_chain_idx - get free chain buffer index + * @mrioc: Adapter instance reference + * + * Try to get a free chain buffer index from the free pool. + * + * Return: -1 on failure or the free chain buffer index + */ +static int mpi3mr_get_chain_idx(struct mpi3mr_ioc *mrioc) +{ + u8 retry_count = 5; + int cmd_idx = -1; + + do { + spin_lock(&mrioc->chain_buf_lock); + cmd_idx = find_first_zero_bit(mrioc->chain_bitmap, + mrioc->chain_buf_count); + if (cmd_idx < mrioc->chain_buf_count) { + set_bit(cmd_idx, mrioc->chain_bitmap); + spin_unlock(&mrioc->chain_buf_lock); + break; + } + spin_unlock(&mrioc->chain_buf_lock); + cmd_idx = -1; + } while (retry_count--); + return cmd_idx; +} + +/** + * mpi3mr_prepare_sg_scmd - build scatter gather list + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * + * This function maps SCSI command's data and protection SGEs to + * MPI request SGEs. If required additional 4K chain buffer is + * used to send the SGEs. 
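+ *
+ * The number of SGEs that fit inline in the request frame is roughly:
+ *
+ *	sges_in_segment = (mrioc->facts.op_req_sz -
+ *		offsetof(struct mpi3_scsi_io_request, sgl)) /
+ *		sizeof(struct mpi3_sge_common);
+ *
+ * minus one each for an EEDP extended SGE and a reserved meta SGE. When
+ * the scatterlist is longer than that, the last inline slot becomes a
+ * LAST_CHAIN SGE pointing at a buffer taken from chain_sgl_list via
+ * mpi3mr_get_chain_idx().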
+ * + * Return: 0 on success, -ENOMEM on dma_map_sg failure + */ +static int mpi3mr_prepare_sg_scmd(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req) +{ + dma_addr_t chain_dma; + struct scatterlist *sg_scmd; + void *sg_local, *chain; + u32 chain_length; + int sges_left, chain_idx; + u32 sges_in_segment; + u8 simple_sgl_flags; + u8 simple_sgl_flags_last; + u8 last_chain_sgl_flags; + struct chain_element *chain_req; + struct scmd_priv *priv = NULL; + u32 meta_sg = le32_to_cpu(scsiio_req->flags) & + MPI3_SCSIIO_FLAGS_DMAOPERATION_HOST_PI; + + priv = scsi_cmd_priv(scmd); + + simple_sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE | + MPI3_SGE_FLAGS_DLAS_SYSTEM; + simple_sgl_flags_last = simple_sgl_flags | + MPI3_SGE_FLAGS_END_OF_LIST; + last_chain_sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_LAST_CHAIN | + MPI3_SGE_FLAGS_DLAS_SYSTEM; + + if (meta_sg) + sg_local = &scsiio_req->sgl[MPI3_SCSIIO_METASGL_INDEX]; + else + sg_local = &scsiio_req->sgl; + + if (!scsiio_req->data_length && !meta_sg) { + mpi3mr_build_zero_len_sge(sg_local); + return 0; + } + + if (meta_sg) { + sg_scmd = scsi_prot_sglist(scmd); + sges_left = dma_map_sg(&mrioc->pdev->dev, + scsi_prot_sglist(scmd), + scsi_prot_sg_count(scmd), + scmd->sc_data_direction); + priv->meta_sg_valid = 1; /* To unmap meta sg DMA */ + } else { + sg_scmd = scsi_sglist(scmd); + sges_left = scsi_dma_map(scmd); + } + + if (sges_left < 0) + return -ENOMEM; + + if (sges_left > MPI3MR_SG_DEPTH) { + pr_err_ratelimited( + "sd %s: scsi_dma_map returned unsupported sge count %d!\n", + dev_name(&scmd->device->sdev_gendev), sges_left); + return -ENOMEM; + } + + sges_in_segment = (mrioc->facts.op_req_sz - + offsetof(struct mpi3_scsi_io_request, sgl)) / + sizeof(struct mpi3_sge_common); + + if (scsiio_req->sgl[0].eedp.flags == + MPI3_SGE_FLAGS_ELEMENT_TYPE_EXTENDED && !meta_sg) { + sg_local += sizeof(struct mpi3_sge_common); + sges_in_segment--; + /* Reserve 1st segment (scsiio_req->sgl[0]) for eedp */ + } + + if (scsiio_req->msg_flags == + MPI3_SCSIIO_MSGFLAGS_METASGL_VALID && !meta_sg) { + sges_in_segment--; + /* Reserve last segment (scsiio_req->sgl[3]) for meta sg */ + } + + if (meta_sg) + sges_in_segment = 1; + + if (sges_left <= sges_in_segment) + goto fill_in_last_segment; + + /* fill in main message segment when there is a chain following */ + while (sges_in_segment > 1) { + mpi3mr_add_sg_single(sg_local, simple_sgl_flags, + sg_dma_len(sg_scmd), sg_dma_address(sg_scmd)); + sg_scmd = sg_next(sg_scmd); + sg_local += sizeof(struct mpi3_sge_common); + sges_left--; + sges_in_segment--; + } + + chain_idx = mpi3mr_get_chain_idx(mrioc); + if (chain_idx < 0) + return -1; + chain_req = &mrioc->chain_sgl_list[chain_idx]; + if (meta_sg) + priv->meta_chain_idx = chain_idx; + else + priv->chain_idx = chain_idx; + + chain = chain_req->addr; + chain_dma = chain_req->dma_addr; + sges_in_segment = sges_left; + chain_length = sges_in_segment * sizeof(struct mpi3_sge_common); + + mpi3mr_add_sg_single(sg_local, last_chain_sgl_flags, + chain_length, chain_dma); + + sg_local = chain; + +fill_in_last_segment: + while (sges_left > 0) { + if (sges_left == 1) + mpi3mr_add_sg_single(sg_local, + simple_sgl_flags_last, sg_dma_len(sg_scmd), + sg_dma_address(sg_scmd)); + else + mpi3mr_add_sg_single(sg_local, simple_sgl_flags, + sg_dma_len(sg_scmd), sg_dma_address(sg_scmd)); + sg_scmd = sg_next(sg_scmd); + sg_local += sizeof(struct mpi3_sge_common); + sges_left--; + } + + return 0; +} + +/** + * mpi3mr_build_sg_scmd - build scatter gather list for SCSI 
IO + * @mrioc: Adapter instance reference + * @scmd: SCSI command reference + * @scsiio_req: MPI3 SCSI IO request + * + * This function calls mpi3mr_prepare_sg_scmd for constructing + * both data SGEs and protection information SGEs in the MPI + * format from the SCSI Command as appropriate . + * + * Return: return value of mpi3mr_prepare_sg_scmd. + */ +static int mpi3mr_build_sg_scmd(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd, struct mpi3_scsi_io_request *scsiio_req) +{ + int ret; + + ret = mpi3mr_prepare_sg_scmd(mrioc, scmd, scsiio_req); + if (ret) + return ret; + + if (scsiio_req->msg_flags == MPI3_SCSIIO_MSGFLAGS_METASGL_VALID) { + /* There is a valid meta sg */ + scsiio_req->flags |= + cpu_to_le32(MPI3_SCSIIO_FLAGS_DMAOPERATION_HOST_PI); + ret = mpi3mr_prepare_sg_scmd(mrioc, scmd, scsiio_req); + } + + return ret; +} + +/** + * mpi3mr_tm_response_name - get TM response as a string + * @resp_code: TM response code + * + * Convert known task management response code as a readable + * string. + * + * Return: response code string. + */ +static const char* mpi3mr_tm_response_name(u8 resp_code) +{ + char *desc; + + switch (resp_code) { + case MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE: + desc = "task management request completed"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_INVALID_FRAME: + desc = "invalid frame"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_FUNCTION_NOT_SUPPORTED: + desc = "task management request not supported"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_FAILED: + desc = "task management request failed"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_SUCCEEDED: + desc = "task management request succeeded"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_INVALID_LUN: + desc = "invalid LUN"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_OVERLAPPED_TAG: + desc = "overlapped tag attempted"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_IO_QUEUED_ON_IOC: + desc = "task queued, however not sent to target"; + break; + case MPI3_SCSITASKMGMT_RSPCODE_TM_NVME_DENIED: + desc = "task management request denied by NVMe device"; + break; + default: + desc = "unknown"; + break; + } + + return desc; +} + +inline void mpi3mr_poll_pend_io_completions(struct mpi3mr_ioc *mrioc) +{ + int i; + int num_of_reply_queues = + mrioc->num_op_reply_q + mrioc->op_reply_q_offset; + + for (i = mrioc->op_reply_q_offset; i < num_of_reply_queues; i++) + mpi3mr_process_op_reply_q(mrioc, + mrioc->intr_info[i].op_reply_q); +} + +/** + * mpi3mr_issue_tm - Issue Task Management request + * @mrioc: Adapter instance reference + * @tm_type: Task Management type + * @handle: Device handle + * @lun: lun ID + * @htag: Host tag of the TM request + * @timeout: TM timeout value + * @drv_cmd: Internal command tracker + * @resp_code: Response code place holder + * @scmd: SCSI command + * + * Issues a Task Management Request to the controller for a + * specified target, lun and command and wait for its completion + * and check TM response. Recover the TM if it timed out by + * issuing controller reset. 
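+ *
+ * On a timeout the command tracker is recovered by a soft reset
+ * (MPI3MR_RESET_FROM_TM_TIMEOUT); on success the operational reply
+ * queues and the admin reply queue are polled so that I/Os terminated
+ * by the TM are completed before pend_count is recounted.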
+ * + * Return: 0 on success, non-zero on errors + */ +int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, + u16 handle, uint lun, u16 htag, ulong timeout, + struct mpi3mr_drv_cmd *drv_cmd, + u8 *resp_code, struct scsi_cmnd *scmd) +{ + struct mpi3_scsi_task_mgmt_request tm_req; + struct mpi3_scsi_task_mgmt_reply *tm_reply = NULL; + int retval = 0; + struct mpi3mr_tgt_dev *tgtdev = NULL; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data = NULL; + struct op_req_qinfo *op_req_q = NULL; + struct scmd_priv *cmd_priv = NULL; + struct scsi_device *sdev = NULL; + struct mpi3mr_sdev_priv_data *sdev_priv_data = NULL; + + if (mrioc->unrecoverable) { + retval = -1; + dprint_tm(mrioc, "sending task management failed due to unrecoverable controller\n"); + goto out; + } + + memset(&tm_req, 0, sizeof(tm_req)); + mutex_lock(&drv_cmd->mutex); + if (drv_cmd->state & MPI3MR_CMD_PENDING) { + retval = -1; + dprint_tm(mrioc, "sending task management failed due to command in use\n"); + mutex_unlock(&drv_cmd->mutex); + goto out; + } + if (mrioc->reset_in_progress) { + retval = -1; + dprint_tm(mrioc, "sending task management failed due to controller reset\n"); + mutex_unlock(&drv_cmd->mutex); + goto out; + } + + drv_cmd->state = MPI3MR_CMD_PENDING; + drv_cmd->is_waiting = 1; + drv_cmd->callback = NULL; + tm_req.dev_handle = cpu_to_le16(handle); + tm_req.task_type = tm_type; + tm_req.host_tag = cpu_to_le16(htag); + + int_to_scsilun(lun, (struct scsi_lun *)tm_req.lun); + tm_req.function = MPI3_FUNCTION_SCSI_TASK_MGMT; + + tgtdev = mpi3mr_get_tgtdev_by_handle(mrioc, handle); + + if (scmd) { + if (tm_type == MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK) { + cmd_priv = scsi_cmd_priv(scmd); + if (!cmd_priv) + goto out_unlock; + op_req_q = &mrioc->req_qinfo[cmd_priv->req_q_idx]; + tm_req.task_host_tag = cpu_to_le16(cmd_priv->host_tag); + tm_req.task_request_queue_id = + cpu_to_le16(op_req_q->qid); + } + + sdev = scmd->device; + sdev_priv_data = sdev->hostdata; + scsi_tgt_priv_data = ((sdev_priv_data) ? 
+ sdev_priv_data->tgt_priv_data : NULL); + } else { + if (tgtdev && tgtdev->starget && tgtdev->starget->hostdata) + scsi_tgt_priv_data = (struct mpi3mr_stgt_priv_data *) + tgtdev->starget->hostdata; + } + + if (scsi_tgt_priv_data) + atomic_inc(&scsi_tgt_priv_data->block_io); + + if (tgtdev && (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_PCIE)) { + if (cmd_priv && tgtdev->dev_spec.pcie_inf.abort_to) + timeout = tgtdev->dev_spec.pcie_inf.abort_to; + else if (!cmd_priv && tgtdev->dev_spec.pcie_inf.reset_to) + timeout = tgtdev->dev_spec.pcie_inf.reset_to; + } + + dprint_tm(mrioc, "posting task management request: type(%d), handle(0x%04x)\n", + tm_type, handle); + init_completion(&drv_cmd->done); + retval = mpi3mr_admin_request_post(mrioc, &tm_req, sizeof(tm_req), 1); + if (retval) { + dprint_tm(mrioc, "posting task management request is failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&drv_cmd->done, (timeout * HZ)); + + if (!(drv_cmd->state & MPI3MR_CMD_COMPLETE)) { + drv_cmd->is_waiting = 0; + retval = -1; + if (!(drv_cmd->state & MPI3MR_CMD_RESET)) { + dprint_tm(mrioc, + "task management request timed out after %ld seconds\n", + timeout); + if (mrioc->logging_level & MPI3_DEBUG_TM) + dprint_dump(&tm_req, sizeof(tm_req), + "mpi3_task_mgmt_req"); + mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_TM_TIMEOUT, 1); + } + goto out_unlock; + } + + if (!(drv_cmd->state & MPI3MR_CMD_REPLY_VALID)) { + dprint_tm(mrioc, "invalid task management reply message\n"); + retval = -1; + goto out_unlock; + } + + tm_reply = (struct mpi3_scsi_task_mgmt_reply *)drv_cmd->reply; + + switch (drv_cmd->ioc_status) { + case MPI3_IOCSTATUS_SUCCESS: + *resp_code = le32_to_cpu(tm_reply->response_data) & + MPI3MR_RI_MASK_RESPCODE; + break; + case MPI3_IOCSTATUS_SCSI_IOC_TERMINATED: + *resp_code = MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE; + break; + default: + dprint_tm(mrioc, + "task management request to handle(0x%04x) is failed with ioc_status(0x%04x) log_info(0x%08x)\n", + handle, drv_cmd->ioc_status, drv_cmd->ioc_loginfo); + retval = -1; + goto out_unlock; + } + + switch (*resp_code) { + case MPI3_SCSITASKMGMT_RSPCODE_TM_SUCCEEDED: + case MPI3_SCSITASKMGMT_RSPCODE_TM_COMPLETE: + break; + case MPI3_SCSITASKMGMT_RSPCODE_IO_QUEUED_ON_IOC: + if (tm_type != MPI3_SCSITASKMGMT_TASKTYPE_QUERY_TASK) + retval = -1; + break; + default: + retval = -1; + break; + } + + dprint_tm(mrioc, + "task management request type(%d) completed for handle(0x%04x) with ioc_status(0x%04x), log_info(0x%08x), termination_count(%d), response:%s(0x%x)\n", + tm_type, handle, drv_cmd->ioc_status, drv_cmd->ioc_loginfo, + le32_to_cpu(tm_reply->termination_count), + mpi3mr_tm_response_name(*resp_code), *resp_code); + + if (!retval) { + mpi3mr_ioc_disable_intr(mrioc); + mpi3mr_poll_pend_io_completions(mrioc); + mpi3mr_ioc_enable_intr(mrioc); + mpi3mr_poll_pend_io_completions(mrioc); + mpi3mr_process_admin_reply_q(mrioc); + } + switch (tm_type) { + case MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET: + if (!scsi_tgt_priv_data) + break; + scsi_tgt_priv_data->pend_count = 0; + blk_mq_tagset_busy_iter(&mrioc->shost->tag_set, + mpi3mr_count_tgt_pending, + (void *)scsi_tgt_priv_data->starget); + break; + case MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET: + if (!sdev_priv_data) + break; + sdev_priv_data->pend_count = 0; + blk_mq_tagset_busy_iter(&mrioc->shost->tag_set, + mpi3mr_count_dev_pending, (void *)sdev); + break; + default: + break; + } + mpi3mr_master_trigger(mrioc, + MPI3_DRIVER2_MASTERTRIGGER_TASK_MANAGEMENT_ENABLED); + +out_unlock: + 
drv_cmd->state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&drv_cmd->mutex); + if (scsi_tgt_priv_data) + atomic_dec_if_positive(&scsi_tgt_priv_data->block_io); + if (tgtdev) + mpi3mr_tgtdev_put(tgtdev); +out: + return retval; +} + +/** + * mpi3mr_bios_param - BIOS param callback + * @sdev: SCSI device reference + * @bdev: Block device reference + * @capacity: Capacity in logical sectors + * @params: Parameter array + * + * Just the parameters with heads/sectors/cylinders. + * + * Return: 0 always + */ +static int mpi3mr_bios_param(struct scsi_device *sdev, + struct block_device *bdev, sector_t capacity, int params[]) +{ + int heads; + int sectors; + sector_t cylinders; + ulong dummy; + + heads = 64; + sectors = 32; + + dummy = heads * sectors; + cylinders = capacity; + sector_div(cylinders, dummy); + + if ((ulong)capacity >= 0x200000) { + heads = 255; + sectors = 63; + dummy = heads * sectors; + cylinders = capacity; + sector_div(cylinders, dummy); + } + + params[0] = heads; + params[1] = sectors; + params[2] = cylinders; + return 0; +} + +#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \ + (KERNEL_VERSION(5, 0, 0) <= LINUX_VERSION_CODE)) +static int mpi3mr_map_queues(struct Scsi_Host *shost) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + int i, qoff, offset; + struct blk_mq_queue_map *map = NULL; + + offset = mrioc->op_reply_q_offset; + + for (i = 0, qoff = 0; i < HCTX_MAX_TYPES; i++) { + map = &shost->tag_set.map[i]; + + map->nr_queues = 0; + + if (i == HCTX_TYPE_DEFAULT) + map->nr_queues = mrioc->default_qcount; + else if (i == HCTX_TYPE_POLL) + map->nr_queues = mrioc->active_poll_qcount; + + if (!map->nr_queues) { + BUG_ON(i == HCTX_TYPE_DEFAULT); + continue; + } + + /* + * The poll queue(s) doesn't have an IRQ (and hence IRQ + * affinity), so use the regular blk-mq cpu mapping + */ + map->queue_offset = qoff; + if (i != HCTX_TYPE_POLL) + blk_mq_pci_map_queues(map, mrioc->pdev, offset); + else + blk_mq_map_queues(map); + + qoff += map->nr_queues; + offset += map->nr_queues; + } + + return 0; + +} + +#endif + +/** + * mpi3mr_print_pending_host_io - print pending I/Os + * @mrioc: Adapter instance reference + * + * Print number of pending I/Os and each I/O details prior to + * reset for debug purpose. + * + * Return: Nothing + */ +static void mpi3mr_print_pending_host_io(struct mpi3mr_ioc *mrioc) +{ + struct Scsi_Host *shost = mrioc->shost; + + ioc_info(mrioc, "number of pending I/O requests prior to reset: %d\n", + mpi3mr_get_fw_pending_ios(mrioc)); + blk_mq_tagset_busy_iter(&shost->tag_set, + mpi3mr_print_scmd, (void *)mrioc); +} + +/** + * mpi3mr_eh_host_reset - Host reset error handling callback + * @scmd: SCSI command reference + * + * Issue controller reset if the scmd is for a Physical Device, + * if the scmd is for RAID volume, then wait for + * MPI3MR_RAID_ERRREC_RESET_TIMEOUT and checks whether any + * pending I/Os prior to issuing reset to the controller. + * + * Return: SUCCESS of successful reset else FAILED + */ +static int mpi3mr_eh_host_reset(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u8 dev_type = MPI3_DEVICE_DEVFORM_VD; + int retval = FAILED, ret; + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting host reset! 
scmd(%p)\n", mrioc->name, scmd); + + sdev_priv_data = scmd->device->hostdata; + if (sdev_priv_data && sdev_priv_data->tgt_priv_data) { + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_type = stgt_priv_data->dev_type; + } + + if (dev_type == MPI3_DEVICE_DEVFORM_VD) { + mpi3mr_wait_for_host_io(mrioc, + MPI3MR_RAID_ERRREC_RESET_TIMEOUT); + if (!mpi3mr_get_fw_pending_ios(mrioc)) { + while (mrioc->reset_in_progress || + mrioc->prepare_for_reset) + ssleep(1); + retval = SUCCESS; + goto out; + } + } + mpi3mr_print_pending_host_io(mrioc); + + ret = mpi3mr_soft_reset_handler(mrioc, + MPI3MR_RESET_FROM_EH_HOS, 1); + if (ret) + goto out; + + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: host reset is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_eh_target_reset - Target reset error handling callback + * @scmd: SCSI command reference + * + * Issue Target reset Task Management and verify the scmd is + * terminated successfully and return status accordingly. + * + * Return: SUCCESS of successful termination of the scmd else + * FAILED + */ +static int mpi3mr_eh_target_reset(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u16 dev_handle; + u8 resp_code = 0; + int retval = FAILED, ret = 0; + + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting target reset! scmd(%p)\n", mrioc->name, scmd); + scsi_print_command(scmd); + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + sdev_printk(KERN_INFO, scmd->device, + "%s: target is not available, target reset is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + if (stgt_priv_data->dev_removed) { + sdev_printk(KERN_INFO, scmd->device, + "%s:target(handle = 0x%04x) is removed, target reset is not issued\n", + mrioc->name, dev_handle); + retval = FAILED; + goto out; + } + sdev_printk(KERN_INFO, scmd->device, + "%s: target reset is issued to handle(0x%04x)\n", + mrioc->name, dev_handle); + + ret = mpi3mr_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_TARGET_RESET, dev_handle, + sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS, + MPI3MR_RESETTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code, scmd); + + if (ret) + goto out; + + if (stgt_priv_data->pend_count) { + sdev_printk(KERN_INFO, scmd->device, + "%s: target has %d pending commands, target reset is failed\n", + mrioc->name, stgt_priv_data->pend_count); + goto out; + } + + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: target reset is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_eh_dev_reset- Device reset error handling callback + * @scmd: SCSI command reference + * + * Issue lun reset Task Management and verify the scmd is + * terminated successfully and return status accordingly. 
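+ *
+ * The LUN reset is sent through mpi3mr_issue_tm() with the
+ * MPI3MR_HOSTTAG_BLK_TMS host tag and the shared host_tm_cmds tracker;
+ * the handler reports FAILED if the device still has pending commands
+ * (sdev_priv_data->pend_count) after the TM completes.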
+ * + * Return: SUCCESS of successful termination of the scmd else + * FAILED + */ +static int mpi3mr_eh_dev_reset(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + u16 dev_handle; + u8 resp_code = 0; + int retval = FAILED, ret = 0; + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting device(LUN) reset! scmd(%p)\n", mrioc->name, scmd); + scsi_print_command(scmd); + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device is not available, device(LUN) reset is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + if (stgt_priv_data->dev_removed) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device(handle = 0x%04x) is removed, device(LUN) reset is not issued\n", + mrioc->name, dev_handle); + retval = FAILED; + goto out; + } + sdev_printk(KERN_INFO, scmd->device, + "%s: device(LUN) reset is issued to handle(0x%04x)\n", + mrioc->name, dev_handle); + + ret = mpi3mr_issue_tm(mrioc, + MPI3_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET, dev_handle, + sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS, + MPI3MR_RESETTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code, scmd); + + if (ret) + goto out; + + if (sdev_priv_data->pend_count) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device has %d pending commands, device(LUN) reset is failed\n", + mrioc->name, sdev_priv_data->pend_count); + goto out; + } + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: device(LUN) reset is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_eh_abort- Abort error handling callback + * @scmd: SCSI command reference + * + * Issue Abort Task Management if the command is in LLD scope + * and verify if it is aborted successfully and return status + * accordingly. + * + * Return: SUCCESS of successful abort the scmd else FAILED + */ +static int mpi3mr_eh_abort(struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(scmd->device->host); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct scmd_priv *cmd_priv; + u16 dev_handle; + u8 resp_code = 0; + int retval = FAILED, ret = 0; + + sdev_printk(KERN_INFO, scmd->device, + "%s: attempting abort task! 
scmd(%p)\n", mrioc->name, scmd); + scsi_print_command(scmd); + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device is not available, abort task is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + dev_handle = stgt_priv_data->dev_handle; + if (stgt_priv_data->dev_removed) { + sdev_printk(KERN_INFO, scmd->device, + "%s: device(handle = 0x%04x) is removed, abort task is not issued\n", + mrioc->name, dev_handle); + retval = FAILED; + goto out; + } + + sdev_printk(KERN_INFO, scmd->device, + "%s: scmd(%p) to be aborted is issued to handle(0x%04x)\n", + mrioc->name, scmd, dev_handle); + + cmd_priv = scsi_cmd_priv(scmd); + if (!cmd_priv->in_lld_scope || + cmd_priv->host_tag == MPI3MR_HOSTTAG_INVALID) { + sdev_printk(KERN_INFO, scmd->device, + "%s: scmd is not in LLD scope, abort task is not issued\n", + mrioc->name); + retval = SUCCESS; + goto out; + } + + ret = mpi3mr_issue_tm(mrioc, MPI3_SCSITASKMGMT_TASKTYPE_ABORT_TASK, + dev_handle, sdev_priv_data->lun_id, MPI3MR_HOSTTAG_BLK_TMS, + MPI3MR_ABORTTM_TIMEOUT, &mrioc->host_tm_cmds, &resp_code, scmd); + + if (ret) + goto out; + + if (cmd_priv->in_lld_scope) { + sdev_printk(KERN_INFO, scmd->device, + "%s: scmd was not terminated, abort task is failed\n", + mrioc->name); + goto out; + } + retval = SUCCESS; +out: + sdev_printk(KERN_INFO, scmd->device, + "%s: abort task is %s for scmd(%p)\n", mrioc->name, + ((retval == SUCCESS) ? "SUCCESS" : "FAILED"), scmd); + + return retval; +} + +/** + * mpi3mr_change_queue_depth- Change QD callback handler + * @sdev: SCSI device reference + * @q_depth: Queue depth + * + * Validate and limit QD and call scsi_change_queue_depth. + * + * Return: return value of scsi_change_queue_depth + */ +static int mpi3mr_change_queue_depth(struct scsi_device *sdev, + int q_depth) +{ + struct scsi_target *starget = scsi_target(sdev); + struct Scsi_Host *shost = dev_to_shost(&starget->dev); + int retval = 0; + + if (!sdev->tagged_supported) + q_depth = 1; + if (q_depth > shost->can_queue) + q_depth = shost->can_queue; + else if (!q_depth) + q_depth = MPI3MR_DEFAULT_SDEV_QD; + retval = scsi_change_queue_depth(sdev, q_depth); + sdev->max_queue_depth = sdev->queue_depth; + + return retval; +} + +/** + * mpi3mr_scan_start - Scan start callback handler + * @shost: SCSI host reference + * + * Issue port enable request asynchronously. + * + * Return: Nothing + */ +static void mpi3mr_scan_start(struct Scsi_Host *shost) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + + mrioc->scan_started = 1; + ioc_info(mrioc, "scan started, issuing port enable\n"); + if (mpi3mr_issue_port_enable(mrioc, 1)) { + ioc_err(mrioc, "issuing port enable failed\n"); + mrioc->scan_started = 0; + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + } + +} + +/** + * mpi3mr_scan_finished - Scan finished callback handler + * @shost: SCSI host reference + * @time: Jiffies from the scan start + * + * Checks whether the port enable is completed or timedout or + * failed and set the scan status accordingly after taking any + * recovery if required. 
+ * + * Return: 1 on scan finished or timed out, 0 for in progress + */ +static int mpi3mr_scan_finished(struct Scsi_Host *shost, + unsigned long time) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + u32 pe_timeout = MPI3MR_PORTENABLE_TIMEOUT; + u32 ioc_status = readl(&mrioc->sysif_regs->ioc_status); + + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) { + ioc_err(mrioc, "port enable failed due to fault or reset\n"); + mpi3mr_print_fault_info(mrioc); + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + mrioc->scan_started = 0; + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + } + + if (time >= (pe_timeout * HZ)) { + ioc_err(mrioc, "port enable failed due to time out\n"); + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_PE_TIMEOUT); + mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR; + mrioc->scan_started = 0; + mrioc->init_cmds.is_waiting = 0; + mrioc->init_cmds.callback = NULL; + mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED; + } + + if (mrioc->scan_started) + return 0; + + if (mrioc->scan_failed) { + ioc_err(mrioc, + "port enable failed with status=0x%04x\n", + mrioc->scan_failed); + } else + ioc_info(mrioc, "port enable is successfully completed\n"); + + mpi3mr_start_watchdog(mrioc); + mrioc->is_driver_loading = 0; + mrioc->block_bsgs = 0; + return 1; +} + +/** + * mpi3mr_slave_destroy - Slave destroy callback handler + * @sdev: SCSI device reference + * + * Cleanup and free per device(lun) private data. + * + * Return: Nothing. + */ +static void mpi3mr_slave_destroy(struct scsi_device *sdev) +{ + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev = NULL; + unsigned long flags; + struct scsi_target *starget; + struct sas_rphy *rphy = NULL; + + if (!sdev->hostdata) + return; + + starget = scsi_target(sdev); + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + scsi_tgt_priv_data = starget->hostdata; + + scsi_tgt_priv_data->num_luns--; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (starget->channel == mrioc->scsi_device_channel) + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + } + + if (tgt_dev && (!scsi_tgt_priv_data->num_luns)) + tgt_dev->starget = NULL; + if (tgt_dev) + mpi3mr_tgtdev_put(tgt_dev); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + kfree(sdev->hostdata); + sdev->hostdata = NULL; +} + +/** + * mpi3mr_target_destroy - Target destroy callback handler + * @starget: SCSI target reference + * + * Cleanup and free per target private data. + * + * Return: Nothing. 
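mpi3mr_slave_destroy above and mpi3mr_target_destroy below undo the wiring done by the corresponding alloc callbacks; in particular, the target device's back-pointer to the SCSI target is only cleared once the last LUN on that target goes away. A small standalone model of that bookkeeping (hypothetical names, not driver code):

#include <stdio.h>
#include <stddef.h>

struct model_target {
        unsigned int num_luns;
        void *starget;          /* back-pointer, valid while any LUN exists */
};

static void model_lun_destroy(struct model_target *tgt)
{
        if (tgt->num_luns)
                tgt->num_luns--;
        if (!tgt->num_luns)
                tgt->starget = NULL;    /* last LUN gone: drop the back-pointer */
}

int main(void)
{
        int scsi_target_stub;
        struct model_target t = { .num_luns = 2, .starget = &scsi_target_stub };

        model_lun_destroy(&t);
        printf("luns=%u starget=%s\n", t.num_luns, t.starget ? "set" : "NULL");
        model_lun_destroy(&t);
        printf("luns=%u starget=%s\n", t.num_luns, t.starget ? "set" : "NULL");
        return 0;
}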
+ */ +static void mpi3mr_target_destroy(struct scsi_target *starget) +{ + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev; + unsigned long flags; + + if (!starget->hostdata) + return; + + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + scsi_tgt_priv_data = starget->hostdata; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgt_dev = __mpi3mr_get_tgtdev_from_tgtpriv(mrioc, scsi_tgt_priv_data); + if (tgt_dev && (tgt_dev->starget == starget) && + (tgt_dev->perst_id == starget->id)) + tgt_dev->starget = NULL; + if (tgt_dev) { + scsi_tgt_priv_data->tgt_dev = NULL; + scsi_tgt_priv_data->perst_id = 0; + mpi3mr_tgtdev_put(tgt_dev); + mpi3mr_tgtdev_put(tgt_dev); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + kfree(starget->hostdata); + starget->hostdata = NULL; + +} + +/** + * mpi3mr_slave_configure - Slave configure callback handler + * @sdev: SCSI device reference + * + * Configure queue depth, max hardware sectors and virt boundary + * as required + * + * Return: 0 always. + */ +static int mpi3mr_slave_configure(struct scsi_device *sdev) +{ + struct scsi_target *starget; + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_tgt_dev *tgt_dev = NULL; + unsigned long flags; + int retval = 0; + struct sas_rphy *rphy = NULL; + + starget = scsi_target(sdev); + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (starget->channel == mrioc->scsi_device_channel) + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + if (!tgt_dev) + return -ENXIO; + + sdev->eh_timeout = MPI3MR_EH_SCMD_TIMEOUT; + blk_queue_rq_timeout(sdev->request_queue, MPI3MR_SCMD_TIMEOUT); + + mpi3mr_change_queue_depth(sdev, tgt_dev->q_depth); + switch (tgt_dev->dev_type) { + case MPI3_DEVICE_DEVFORM_PCIE: + /*The block layer hw sector size = 512*/ + if ((tgt_dev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) == + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) { + blk_queue_max_hw_sectors(sdev->request_queue, + tgt_dev->dev_spec.pcie_inf.mdts / 512); + if (tgt_dev->dev_spec.pcie_inf.pgsz == 0) + blk_queue_virt_boundary(sdev->request_queue, + ((1 << MPI3MR_DEFAULT_PGSZEXP) - 1)); + else + blk_queue_virt_boundary(sdev->request_queue, + ((1 << tgt_dev->dev_spec.pcie_inf.pgsz) + - 1)); + } + break; + default: + break; + } + mpi3mr_tgtdev_put(tgt_dev); + + return retval; +} + +/** + * mpi3mr_slave_alloc -Slave alloc callback handler + * @sdev: SCSI device reference + * + * Allocate per device(lun) private data and initialize it. + * + * Return: 0 on success -ENOMEM on memory allocation failure. 
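For NVMe-class PCIe devices, mpi3mr_slave_configure above derives the block-layer limits from controller-reported values: the maximum data transfer size (held in bytes in this driver's device structure) is converted to 512-byte sectors, and the SGL virt boundary mask comes from the device page-size exponent, falling back to a default exponent when the device reports 0. A standalone sketch of just that arithmetic; the default exponent below is a stand-in for MPI3MR_DEFAULT_PGSZEXP, not its real value.

#include <stdio.h>

#define MODEL_DEFAULT_PGSZ_EXP  12      /* stand-in for MPI3MR_DEFAULT_PGSZEXP */

struct nvme_queue_limits {
        unsigned int max_hw_sectors;    /* in 512-byte units */
        unsigned long virt_boundary;    /* SGL address boundary mask */
};

static struct nvme_queue_limits nvme_limits(unsigned int mdts_bytes,
                                            unsigned int pgsz_exp)
{
        struct nvme_queue_limits lim;

        lim.max_hw_sectors = mdts_bytes / 512;
        if (!pgsz_exp)
                pgsz_exp = MODEL_DEFAULT_PGSZ_EXP;
        lim.virt_boundary = (1UL << pgsz_exp) - 1;
        return lim;
}

int main(void)
{
        struct nvme_queue_limits lim = nvme_limits(1024 * 1024, 0);

        printf("max_hw_sectors=%u virt_boundary=0x%lx\n",
               lim.max_hw_sectors, lim.virt_boundary);  /* 2048, 0xfff */
        return 0;
}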
+ */ +static int mpi3mr_slave_alloc(struct scsi_device *sdev) +{ + struct Scsi_Host *shost; + struct mpi3mr_ioc *mrioc; + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev = NULL; + struct mpi3mr_sdev_priv_data *scsi_dev_priv_data; + unsigned long flags; + struct scsi_target *starget; + int retval = 0; + struct sas_rphy *rphy = NULL; + + starget = scsi_target(sdev); + shost = dev_to_shost(&starget->dev); + mrioc = shost_priv(shost); + scsi_tgt_priv_data = starget->hostdata; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + + if (starget->channel == mrioc->scsi_device_channel) + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + } + + if (tgt_dev) { + if (tgt_dev->starget == NULL) + tgt_dev->starget = starget; + mpi3mr_tgtdev_put(tgt_dev); + retval = 0; + } else { + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return -ENXIO; + } + + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + scsi_dev_priv_data = kzalloc(sizeof(*scsi_dev_priv_data), GFP_KERNEL); + if (!scsi_dev_priv_data) + return -ENOMEM; + + scsi_dev_priv_data->lun_id = sdev->lun; + scsi_dev_priv_data->tgt_priv_data = scsi_tgt_priv_data; + sdev->hostdata = scsi_dev_priv_data; + + scsi_tgt_priv_data->num_luns++; + + return retval; +} + +/** + * mpi3mr_target_alloc - Target alloc callback handler + * @starget: SCSI target reference + * + * Allocate per target private data and initialize it. + * + * Return: 0 on success -ENOMEM on memory allocation failure. + */ +static int mpi3mr_target_alloc(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(&starget->dev); + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3mr_stgt_priv_data *scsi_tgt_priv_data; + struct mpi3mr_tgt_dev *tgt_dev; + unsigned long flags; + int retval = 0; + struct sas_rphy *rphy = NULL; + + scsi_tgt_priv_data = kzalloc(sizeof(*scsi_tgt_priv_data), GFP_KERNEL); + if (!scsi_tgt_priv_data) + return -ENOMEM; + + starget->hostdata = scsi_tgt_priv_data; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + if (starget->channel == mrioc->scsi_device_channel) { + tgt_dev = __mpi3mr_get_tgtdev_by_perst_id(mrioc, starget->id); + if (tgt_dev && !tgt_dev->is_hidden) { + scsi_tgt_priv_data->starget = starget; + scsi_tgt_priv_data->dev_handle = tgt_dev->dev_handle; + scsi_tgt_priv_data->perst_id = tgt_dev->perst_id; + scsi_tgt_priv_data->dev_type = tgt_dev->dev_type; + scsi_tgt_priv_data->tgt_dev = tgt_dev; + tgt_dev->starget = starget; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + retval = 0; + if ((tgt_dev->dev_type == + MPI3_DEVICE_DEVFORM_PCIE) && + ((tgt_dev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_MASK) == + MPI3_DEVICE0_PCIE_DEVICE_INFO_TYPE_NVME_DEVICE) && + ((tgt_dev->dev_spec.pcie_inf.dev_info & + MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_MASK) != + MPI3_DEVICE0_PCIE_DEVICE_INFO_PITYPE_0)) + scsi_tgt_priv_data->dev_nvme_dif = 1; + scsi_tgt_priv_data->io_throttle_enabled = + tgt_dev->io_throttle_enabled; + if (tgt_dev->dev_type == MPI3_DEVICE_DEVFORM_VD) + scsi_tgt_priv_data->throttle_group = + tgt_dev->dev_spec.vd_inf.tg; + } else + retval = -ENXIO; + } else if (mrioc->sas_transport_enabled && !starget->channel) { + rphy = dev_to_rphy(starget->dev.parent); + tgt_dev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if 
(tgt_dev && !tgt_dev->is_hidden && !tgt_dev->non_stl && + (tgt_dev->dev_type == MPI3_DEVICE_DEVFORM_SAS_SATA)) { + scsi_tgt_priv_data->starget = starget; + scsi_tgt_priv_data->dev_handle = tgt_dev->dev_handle; + scsi_tgt_priv_data->perst_id = tgt_dev->perst_id; + scsi_tgt_priv_data->dev_type = tgt_dev->dev_type; + scsi_tgt_priv_data->tgt_dev = tgt_dev; + scsi_tgt_priv_data->io_throttle_enabled = + tgt_dev->io_throttle_enabled; + tgt_dev->starget = starget; + atomic_set(&scsi_tgt_priv_data->block_io, 0); + retval = 0; + } else + retval = -ENXIO; + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + return retval; +} + +/** + * mpi3mr_check_return_unmap - Whether an unmap is allowed + * @mrioc: Adapter instance reference + * @scmd: SCSI Command reference + * + * The controller hardware cannot handle certain unmap commands + * for NVMe drives, this routine checks those and return true + * and completes the SCSI command with proper status and sense + * data. + * + * Return: TRUE for not allowed unmap, FALSE otherwise. + */ +static bool mpi3mr_check_return_unmap(struct mpi3mr_ioc *mrioc, + struct scsi_cmnd *scmd) +{ + unsigned char *buf; + u16 param_len, desc_len, trunc_param_len; + + trunc_param_len = param_len = get_unaligned_be16(scmd->cmnd + 7); + + if (!mrioc->pdev->revision) { + if (!param_len) { + dprint_scsi_err(mrioc, "CDB received with zero parameter length\n"); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + set_host_byte(scmd, DID_OK); + SCMD_DONE(scmd); + return true; + } + + if (param_len < 24) { + dprint_scsi_err(mrioc, + "CDB received with invalid param_len: %d\n", + param_len); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x1A, 0); + SCMD_DONE(scmd); + return true; + } + if (param_len != scsi_bufflen(scmd)) { + dprint_scsi_err(mrioc, + "CDB received with param_len: %d bufflen: %d\n", + param_len, scsi_bufflen(scmd)); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x1A, 0); + SCMD_DONE(scmd); + return true; + } + buf = kzalloc(scsi_bufflen(scmd), GFP_ATOMIC); + if (!buf) { + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x55, 0x03); + SCMD_DONE(scmd); + return true; + } + scsi_sg_copy_to_buffer(scmd, buf, scsi_bufflen(scmd)); + desc_len = get_unaligned_be16(&buf[2]); + + if (desc_len < 16) { + dprint_scsi_err(mrioc, + "invalid descriptor length in parameter list: %d\n", + desc_len); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + mpi3mr_scsi_build_sense(scmd, 0, + ILLEGAL_REQUEST, 0x26, 0); + SCMD_DONE(scmd); + kfree(buf); + return true; + } + + if (param_len > (desc_len + 8)) { + trunc_param_len = desc_len + 8; + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + dprint_scsi_err(mrioc, + "truncating param_len(%d) to desc_len+8(%d)\n", + param_len, trunc_param_len); + put_unaligned_be16(trunc_param_len, scmd->cmnd + 7); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + } + + kfree(buf); + } else { + if ((param_len > 24) && ((param_len - 8) & 0xF)) { + trunc_param_len -= (param_len - 8) & 0xF; + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + dprint_scsi_err(mrioc, + "truncating param_len from (%d) to (%d)\n", + param_len, trunc_param_len); + put_unaligned_be16(trunc_param_len, scmd->cmnd + 7); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_ERROR); + } + } + return false; +} + +/** + * mpi3mr_allow_scmd_to_fw - Command is allowed during shutdown + * @scmd: SCSI Command 
reference + * + * Checks whether a cdb is allowed during shutdown or not. + * + * Return: TRUE for allowed commands, FALSE otherwise. + */ + +inline bool mpi3mr_allow_scmd_to_fw(struct scsi_cmnd *scmd) +{ + switch (scmd->cmnd[0]) { + case SYNCHRONIZE_CACHE: + case START_STOP: + return true; + default: + return false; + } +} + +/** + * mpi3mr_qcmd - I/O request dispatcher + * @shost: SCSI Host reference + * @scmd: SCSI Command reference + * + * Issues the SCSI Command as an MPI3 request. + * + * Return: 0 on successful queueing of the request or if the + * request is completed with failure. + * SCSI_MLQUEUE_DEVICE_BUSY when the device is busy. + * SCSI_MLQUEUE_HOST_BUSY when the host queue is full. + */ +static int mpi3mr_qcmd(struct Scsi_Host *shost, + struct scsi_cmnd *scmd) +{ + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3mr_stgt_priv_data *stgt_priv_data; + struct mpi3mr_sdev_priv_data *sdev_priv_data; + struct scmd_priv *scmd_priv_data = NULL; + struct mpi3_scsi_io_request *scsiio_req = NULL; + struct op_req_qinfo *op_req_q = NULL; + int retval = 0; + u16 dev_handle; + u16 host_tag; + u32 scsiio_flags = 0, data_len_blks = 0; + struct request *rq = SCMD_GET_REQUEST(scmd); + int iprio_class; + u8 is_pcie_dev = 0; + struct chain_element *chain_req; + u32 tracked_io_sz = 0; + u32 ioc_pend_data_len = 0, tg_pend_data_len = 0; + struct mpi3mr_throttle_group_info *tg = NULL; + + + dprint_scsi_info(mrioc, "qcmd invoked for scmd(%p)\n", scmd); + dprint_scsi_command(mrioc, scmd, MPI3_DEBUG_SCSI_INFO); + + if (mrioc->unrecoverable) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + + sdev_priv_data = scmd->device->hostdata; + if (!sdev_priv_data || !sdev_priv_data->tgt_priv_data) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + + if (mrioc->stop_drv_processing && + !(mpi3mr_allow_scmd_to_fw(scmd))) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + + if (mrioc->reset_in_progress || mrioc->prepare_for_reset) { + retval = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + + stgt_priv_data = sdev_priv_data->tgt_priv_data; + + if (atomic_read(&stgt_priv_data->block_io)) { + if (mrioc->stop_drv_processing) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + retval = SCSI_MLQUEUE_DEVICE_BUSY; + goto out; + } + + dev_handle = stgt_priv_data->dev_handle; + if (dev_handle == MPI3MR_INVALID_DEV_HANDLE) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + if (stgt_priv_data->dev_removed) { + set_host_byte(scmd, DID_NO_CONNECT); + SCMD_DONE(scmd); + goto out; + } + +#if defined(IO_COUNTER_SUPPORT) + if (atomic_read(&mrioc->pend_ios) >= shost->can_queue) { + retval = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } +#endif + + if (stgt_priv_data->dev_type == MPI3_DEVICE_DEVFORM_PCIE) + is_pcie_dev = 1; + if ((scmd->cmnd[0] == UNMAP) && is_pcie_dev && + (mrioc->pdev->device == MPI3_MFGPAGE_DEVID_SAS4116) && + mpi3mr_check_return_unmap(mrioc, scmd)) + goto out; + + host_tag = mpi3mr_host_tag_for_scmd(mrioc, scmd); + if (host_tag == MPI3MR_HOSTTAG_INVALID) { + set_host_byte(scmd, DID_ERROR); + SCMD_DONE(scmd); + goto out; + } + + if (scmd->sc_data_direction == DMA_FROM_DEVICE) + scsiio_flags = MPI3_SCSIIO_FLAGS_DATADIRECTION_READ; + else if (scmd->sc_data_direction == DMA_TO_DEVICE) + scsiio_flags = MPI3_SCSIIO_FLAGS_DATADIRECTION_WRITE; + else + scsiio_flags = MPI3_SCSIIO_FLAGS_DATADIRECTION_NO_DATA_TRANSFER; + + scsiio_flags |= MPI3_SCSIIO_FLAGS_TASKATTRIBUTE_SIMPLEQ; + + if 
(sdev_priv_data->ncq_prio_enable) { + iprio_class = IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); + if (iprio_class == IOPRIO_CLASS_RT) + scsiio_flags |= 1 << MPI3_SCSIIO_FLAGS_CMDPRI_SHIFT; + } + + if (scmd->cmd_len > 16) + scsiio_flags |= MPI3_SCSIIO_FLAGS_CDB_GREATER_THAN_16; + + scmd_priv_data = scsi_cmd_priv(scmd); + memset(scmd_priv_data->mpi3mr_scsiio_req, 0, MPI3MR_ADMIN_REQ_FRAME_SZ); + scsiio_req = (struct mpi3_scsi_io_request *) + scmd_priv_data->mpi3mr_scsiio_req; + scsiio_req->function = MPI3_FUNCTION_SCSI_IO; + scsiio_req->host_tag = cpu_to_le16(host_tag); + + if (!is_pcie_dev) + mpi3mr_setup_sas_eedp(mrioc, scmd, scsiio_req); + else if (stgt_priv_data->dev_nvme_dif) + mpi3mr_setup_nvme_eedp(mrioc, scmd, scsiio_req, &scsiio_flags); + + memcpy(scsiio_req->cdb.cdb32, scmd->cmnd, scmd->cmd_len); + scsiio_req->data_length = cpu_to_le32(scsi_bufflen(scmd)); + scsiio_req->dev_handle = cpu_to_le16(dev_handle); + + int_to_scsilun(sdev_priv_data->lun_id, + (struct scsi_lun *)scsiio_req->lun); + + if (mpi3mr_build_sg_scmd(mrioc, scmd, scsiio_req)) { + mpi3mr_clear_scmd_priv(mrioc, scmd); + retval = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + if (mrioc->logging_level & MPI3_DEBUG_SG) { + dprint_dump(scmd_priv_data->mpi3mr_scsiio_req, + MPI3MR_ADMIN_REQ_FRAME_SZ, "mpi3_scsi_io_req"); + if (scmd_priv_data->chain_idx >= 0) { + chain_req = + &mrioc->chain_sgl_list[scmd_priv_data->chain_idx]; + dprint_dump(chain_req->addr, MPI3MR_CHAINSGE_SIZE, + "chain_sge"); + } + if (scmd_priv_data->meta_chain_idx > 0) { + chain_req = + &mrioc->chain_sgl_list[scmd_priv_data->meta_chain_idx]; + ioc_info(mrioc, "meta SGE\n"); + dprint_dump(chain_req->addr, MPI3MR_CHAINSGE_SIZE, + "meta_chain_sge"); + } + } + op_req_q = &mrioc->req_qinfo[scmd_priv_data->req_q_idx]; + data_len_blks = scsi_bufflen(scmd) >> 9; + if ((data_len_blks >= mrioc->io_throttle_data_length) && + stgt_priv_data->io_throttle_enabled) { + tracked_io_sz = data_len_blks; + tg = stgt_priv_data->throttle_group; + if (tg) { + ioc_pend_data_len = atomic_add_return(data_len_blks, + &mrioc->pend_large_data_sz); + tg_pend_data_len = atomic_add_return(data_len_blks, + &tg->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large vd_io persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), tg_pending(%d), ioc_high(%d), tg_high(%d)\n", + stgt_priv_data->perst_id, dev_handle, + data_len_blks, ioc_pend_data_len, + tg_pend_data_len, mrioc->io_throttle_high, + tg->high); +#endif + if (!tg->io_divert && ((ioc_pend_data_len >= + mrioc->io_throttle_high) || + (tg_pend_data_len >= tg->high))) { + tg->io_divert = 1; + tg->need_qd_reduction = 1; + mpi3mr_set_io_divert_for_all_vd_in_tg(mrioc, + tg, 1); + mpi3mr_queue_qd_reduction_event(mrioc, tg); + } + } else { + ioc_pend_data_len = atomic_add_return(data_len_blks, + &mrioc->pend_large_data_sz); +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) + ioc_info(mrioc, + "large pd_io persist_id(%d), handle(0x%04x), data_len(%d), ioc_pending(%d), ioc_high(%d)\n", + stgt_priv_data->perst_id, dev_handle, + data_len_blks, ioc_pend_data_len, + mrioc->io_throttle_high); +#endif + if ( ioc_pend_data_len >= mrioc->io_throttle_high) + stgt_priv_data->io_divert = 1; + } + } + + if (stgt_priv_data->io_divert) { +#ifdef THROTTLE_LOGGING + if (printk_ratelimit()) { + scsi_print_command(scmd); + ioc_info(mrioc, "setting divert flag for host_tag(%d), qid(%d)\n", + host_tag, scmd_priv_data->req_q_idx); + } +#endif + scsiio_req->msg_flags |= + MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE; + 
scsiio_flags |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_IO_THROTTLING; + } + scsiio_req->flags = cpu_to_le32(scsiio_flags); + + if (mpi3mr_op_request_post(mrioc, op_req_q, + scmd_priv_data->mpi3mr_scsiio_req)) { + mpi3mr_clear_scmd_priv(mrioc, scmd); + retval = SCSI_MLQUEUE_HOST_BUSY; + if (tracked_io_sz) { + atomic_sub(tracked_io_sz, &mrioc->pend_large_data_sz); + if (tg) + atomic_sub(tracked_io_sz, + &tg->pend_large_data_sz); + } + goto out; + } + dprint_scsi_info(mrioc, "sent scmd(%p) to the controller\n", scmd); + +out: + return retval; +} + +static struct scsi_host_template mpi3mr_driver_template = { + .module = THIS_MODULE, + .name = "MPI3 Storage Controller", + .proc_name = MPI3MR_DRIVER_NAME, + .queuecommand = mpi3mr_qcmd, + .target_alloc = mpi3mr_target_alloc, + .slave_alloc = mpi3mr_slave_alloc, + .slave_configure = mpi3mr_slave_configure, + .target_destroy = mpi3mr_target_destroy, + .slave_destroy = mpi3mr_slave_destroy, + .scan_finished = mpi3mr_scan_finished, + .scan_start = mpi3mr_scan_start, + .change_queue_depth = mpi3mr_change_queue_depth, + .eh_abort_handler = mpi3mr_eh_abort, + .eh_device_reset_handler = mpi3mr_eh_dev_reset, + .eh_target_reset_handler = mpi3mr_eh_target_reset, + .eh_host_reset_handler = mpi3mr_eh_host_reset, + .bios_param = mpi3mr_bios_param, +#if ((defined(RHEL_MAJOR) && (RHEL_MAJOR == 8)) || \ + (KERNEL_VERSION(5, 0, 0) <= LINUX_VERSION_CODE)) + .map_queues = mpi3mr_map_queues, +#endif +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) + .mq_poll = mpi3mr_blk_mq_poll, +#endif + .no_write_same = 1, + .can_queue = 1, + .this_id = -1, + .sg_tablesize = MPI3MR_SG_DEPTH, + .max_sectors = MPI3MR_MAX_SECTORS, + .cmd_per_lun = MPI3MR_MAX_CMDS_LUN, +#if (KERNEL_VERSION(5, 0, 0) > LINUX_VERSION_CODE) + .use_clustering = ENABLE_CLUSTERING, +#endif +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) + .max_segment_size = 0xffffffff, +#endif +#if (KERNEL_VERSION(5, 16, 0) > LINUX_VERSION_CODE) + .shost_attrs = mpi3mr_host_attrs, + .sdev_attrs = mpi3mr_dev_attrs, +#else + .shost_groups = mpi3mr_host_groups, + .sdev_groups = mpi3mr_dev_groups, +#endif + .track_queue_depth = 1, + .cmd_size = sizeof(struct scmd_priv), +}; + + +/** + * mpi3mr_init_drv_cmd - Initialize internal command tracker + * @cmdptr: Internal command tracker + * @host_tag: Host tag used for the specific command + * + * Initialize the internal command tracker structure with + * specified host tag. + * + * Return: Nothing. + */ +static inline void mpi3mr_init_drv_cmd(struct mpi3mr_drv_cmd *cmdptr, + u16 host_tag) +{ + mutex_init(&cmdptr->mutex); + cmdptr->reply = NULL; + cmdptr->state = MPI3MR_CMD_NOTUSED; + cmdptr->dev_handle = MPI3MR_INVALID_DEV_HANDLE; + cmdptr->host_tag = host_tag; +} + +/** + * osintfc_mrioc_security_status -Check controller secure status + * @pdev: PCI device instance + * + * Read the Device Serial Number capability from PCI config + * space and decide whether the controller is secure or not. + * + * Return: 0 on success, non-zero on failure. 
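osintfc_mrioc_security_status below reads one dword of the PCIe Device Serial Number extended capability and splits it into a controller-security field and a secure-debug field; only configurable-secure and hard-secure parts, and only with secure debug inactive, are allowed to bind. A userspace model of that classification follows; the mask and field encodings here are illustrative stand-ins, not the real MPI3MR_* definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative field encodings; the real masks live in the driver headers. */
#define MODEL_SECURITY_MASK     0x000000ffu
#define MODEL_DEBUG_MASK        0x0000ff00u
#define MODEL_HARD_SECURE       0x01u
#define MODEL_CONFIG_SECURE     0x02u
#define MODEL_TAMPERED          0x03u

/* 0: controller may be used, -1: refuse to bind */
static int security_status_model(uint32_t cap_dword)
{
        uint32_t status = cap_dword & MODEL_SECURITY_MASK;
        uint32_t debug  = cap_dword & MODEL_DEBUG_MASK;

        if (status != MODEL_HARD_SECURE && status != MODEL_CONFIG_SECURE)
                return -1;              /* invalid or tampered controller */
        if (debug)
                return -1;              /* secure debug is active */
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               security_status_model(MODEL_HARD_SECURE),            /*  0 */
               security_status_model(MODEL_TAMPERED),               /* -1 */
               security_status_model(MODEL_CONFIG_SECURE | 0x100)); /* -1 */
        return 0;
}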
+ */ +static int +osintfc_mrioc_security_status(struct pci_dev *pdev) +{ + u32 cap_data; + int base; + u32 ctlr_status; + u32 debug_status; + int retval = 0; + + base = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN); + if (!base) { + dev_err(&pdev->dev, "PCI_EXT_CAP_ID_DSN is not supported\n"); + return -1; + } + + pci_read_config_dword(pdev, base + 4, &cap_data); + + debug_status = cap_data & MPI3MR_CTLR_SECURE_DBG_STATUS_MASK; + ctlr_status = cap_data & MPI3MR_CTLR_SECURITY_STATUS_MASK; + + switch (ctlr_status) { + case MPI3MR_INVALID_DEVICE: + dev_err(&pdev->dev, + "non secure controller (Invalid) is detected: DID: 0x%x: SVID: 0x%x: SDID: 0x%x\n", + pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); + retval = -1; + break; + case MPI3MR_CONFIG_SECURE_DEVICE: + if (!debug_status) + dev_info(&pdev->dev, + "configurable secure controller is detected\n"); + break; + case MPI3MR_HARD_SECURE_DEVICE: + break; + case MPI3MR_TAMPERED_DEVICE: + dev_err(&pdev->dev, + "non secure controller (Tampered) is detected: DID: 0x%x: SVID: 0x%x: SDID: 0x%x\n", + pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); + retval = -1; + break; + default: + retval = -1; + break; + } + + if (!retval && debug_status) { + dev_err(&pdev->dev, + "non secure controller (Secure Debug) is detected: DID: 0x%x: SVID: 0x%x: SDID: 0x%x\n", + pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); + retval = -1; + } + + return retval; +} + +/** + * mpi3mr_probe - PCI probe callback + * @pdev: PCI device instance + * @id: PCI device ID details + * + * controller initialization routine. Checks the security status + * of the controller and if it is invalid or tampered return the + * probe without initializing the controller. Otherwise, + * allocate per adapter instance through shost_priv and + * initialize controller specific data structures, initialize + * the controller hardware, add shost to the SCSI subsystem. + * + * Return: 0 on success, non-zero on failure. 
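mpi3mr_probe below uses the usual kernel goto-unwind idiom: each setup step that fails jumps to a label that undoes only the steps already completed, in reverse order. A compact standalone illustration of the same idiom, with hypothetical step names rather than the driver's:

#include <stdbool.h>
#include <stdio.h>

static bool setup_a(void) { puts("setup A"); return true; }
static bool setup_b(void) { puts("setup B"); return true; }
static bool setup_c(void) { puts("setup C"); return false; }   /* simulate failure */
static void undo_a(void)  { puts("undo A"); }
static void undo_b(void)  { puts("undo B"); }

static int probe_model(void)
{
        int ret = -1;

        if (!setup_a())
                goto out;
        if (!setup_b())
                goto err_undo_a;
        if (!setup_c())
                goto err_undo_b;        /* C failed: unwind B, then A */
        return 0;

err_undo_b:
        undo_b();
err_undo_a:
        undo_a();
out:
        return ret;
}

int main(void)
{
        printf("probe_model() = %d\n", probe_model());
        return 0;
}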
+ */ + +static int +mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mpi3mr_ioc *mrioc = NULL; + struct Scsi_Host *shost = NULL; + int retval = 0, i, prot_mask = 0; + + if (osintfc_mrioc_security_status(pdev)) { + warn_non_secure_ctlr = 1; + return 1; /* For Invalid and Tampered device */ + } + + shost = scsi_host_alloc(&mpi3mr_driver_template, + sizeof(struct mpi3mr_ioc)); + if (!shost) { + retval = -ENODEV; + goto shost_failed; + } + + mrioc = shost_priv(shost); + mrioc->id = mrioc_ids++; + if (!pdev->revision) + mrioc->is_segqueue_enabled = false; + else + mrioc->is_segqueue_enabled = enable_segqueue; + sprintf(mrioc->driver_name, "%s", MPI3MR_DRIVER_NAME); + sprintf(mrioc->name, "%s%d", mrioc->driver_name, mrioc->id); + dev_info(&pdev->dev, "PCI device is: %s\n", mrioc->name); + INIT_LIST_HEAD(&mrioc->list); + spin_lock(&mrioc_list_lock); + list_add_tail(&mrioc->list, &mrioc_list); + spin_unlock(&mrioc_list_lock); + + spin_lock_init(&mrioc->admin_req_lock); + spin_lock_init(&mrioc->reply_free_queue_lock); + spin_lock_init(&mrioc->sbq_lock); + spin_lock_init(&mrioc->fwevt_lock); + spin_lock_init(&mrioc->tgtdev_lock); + spin_lock_init(&mrioc->watchdog_lock); + spin_lock_init(&mrioc->chain_buf_lock); + spin_lock_init(&mrioc->adm_req_q_bar_writeq_lock); + spin_lock_init(&mrioc->adm_reply_q_bar_writeq_lock); + spin_lock_init(&mrioc->sas_node_lock); + spin_lock_init(&mrioc->trigger_lock); + + INIT_LIST_HEAD(&mrioc->fwevt_list); + INIT_LIST_HEAD(&mrioc->tgtdev_list); + INIT_LIST_HEAD(&mrioc->delayed_rmhs_list); + INIT_LIST_HEAD(&mrioc->delayed_evtack_cmds_list); + INIT_LIST_HEAD(&mrioc->sas_expander_list); + INIT_LIST_HEAD(&mrioc->hba_port_table_list); + INIT_LIST_HEAD(&mrioc->enclosure_list); + + mutex_init(&mrioc->reset_mutex); + + mpi3mr_init_drv_cmd(&mrioc->init_cmds, MPI3MR_HOSTTAG_INITCMDS); + mpi3mr_init_drv_cmd(&mrioc->cfg_cmds, MPI3MR_HOSTTAG_CFG_CMDS); + mpi3mr_init_drv_cmd(&mrioc->bsg_cmds, MPI3MR_HOSTTAG_BSG_CMDS); + mpi3mr_init_drv_cmd(&mrioc->host_tm_cmds, MPI3MR_HOSTTAG_BLK_TMS); + mpi3mr_init_drv_cmd(&mrioc->pel_abort_cmd, MPI3MR_HOSTTAG_PEL_ABORT); + mpi3mr_init_drv_cmd(&mrioc->pel_cmds, MPI3MR_HOSTTAG_PEL_WAIT); + mpi3mr_init_drv_cmd(&mrioc->transport_cmds, + MPI3MR_HOSTTAG_TRANSPORT_CMDS); + + for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++) + mpi3mr_init_drv_cmd(&mrioc->dev_rmhs_cmds[i], + MPI3MR_HOSTTAG_DEVRMCMD_MIN + i); + + for (i = 0; i < MPI3MR_NUM_SYSFS_TM; i++) + mpi3mr_init_drv_cmd(&mrioc->sysfs_tm_cmds[i], + MPI3MR_HOSTTAG_SYSFS_TM_MIN + i); + + for (i = 0; i < MPI3MR_NUM_EVTACKCMD; i++) + mpi3mr_init_drv_cmd(&mrioc->evtack_cmds[i], + MPI3MR_HOSTTAG_EVTACKCMD_MIN + i); + + init_waitqueue_head(&mrioc->reset_waitq); + + mrioc->logging_level = logging_level; + mrioc->shost = shost; + mrioc->pdev = pdev; + mrioc->block_bsgs = 1; + + /* init shost parameters */ + shost->max_cmd_len = MPI3MR_MAX_CDB_LENGTH; + shost->max_lun = -1; + shost->unique_id = mrioc->id; + + shost->max_channel = 0; + shost->max_id = 0xFFFFFFFF; + +#if defined(HOST_TAGSET_SUPPORT) + shost->host_tagset = 1; +#endif + + if (enable_dix) { + prot_mask = SHOST_DIF_TYPE1_PROTECTION + | SHOST_DIF_TYPE2_PROTECTION + | SHOST_DIF_TYPE3_PROTECTION + | SHOST_DIX_TYPE1_PROTECTION + | SHOST_DIX_TYPE2_PROTECTION + | SHOST_DIX_TYPE3_PROTECTION; + enable_dif = true; + } else if (enable_dif) + prot_mask = SHOST_DIF_TYPE1_PROTECTION + | SHOST_DIF_TYPE2_PROTECTION + | SHOST_DIF_TYPE3_PROTECTION; + else + prot_mask = 0; + + scsi_host_set_prot(shost, prot_mask); + + if (enable_dix && 
(pdev->device == MPI3_MFGPAGE_DEVID_SAS4116) && + pdev->revision) + mrioc->check_xprotect_nvme = true; + else + mrioc->check_xprotect_nvme = false; + + ioc_info(mrioc, + "host protection capabilities enabled %s%s%s%s%s%s\n", + (prot_mask & SHOST_DIF_TYPE1_PROTECTION) ? " DIF1" : "", + (prot_mask & SHOST_DIF_TYPE2_PROTECTION) ? " DIF2" : "", + (prot_mask & SHOST_DIF_TYPE3_PROTECTION) ? " DIF3" : "", + (prot_mask & SHOST_DIX_TYPE1_PROTECTION) ? " DIX1" : "", + (prot_mask & SHOST_DIX_TYPE2_PROTECTION) ? " DIX2" : "", + (prot_mask & SHOST_DIX_TYPE3_PROTECTION) ? " DIX3" : ""); + + scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); + + snprintf(mrioc->fwevt_worker_name, sizeof(mrioc->fwevt_worker_name), + "%s%d_fwevt_wrkr", mrioc->driver_name, mrioc->id); + mrioc->fwevt_worker_thread = alloc_ordered_workqueue( + mrioc->fwevt_worker_name, 0); + if (!mrioc->fwevt_worker_thread) { + ioc_err(mrioc, "firmware worker thread creation failed\n"); + retval = -ENODEV; + goto fwevt_thread_failed; + } + + mrioc->is_driver_loading = 1; + mrioc->cpu_count = num_online_cpus(); + + if (mpi3mr_setup_resources(mrioc)) { + ioc_err(mrioc, "setup resources failed\n"); + retval = -ENODEV; + goto resource_alloc_failed; + } + if (mpi3mr_init_ioc(mrioc)) { + ioc_err(mrioc, "initializing IOC failed\n"); + retval = -ENODEV; + goto init_ioc_failed; + } + + shost->nr_hw_queues = 1; + if (mpi3mr_use_blk_mq(mrioc->shost)) { + shost->nr_hw_queues = mrioc->num_op_reply_q; +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) + if (mrioc->active_poll_qcount) + shost->nr_maps = 3; +#endif + } + + shost->can_queue = mrioc->max_host_ios; + shost->sg_tablesize = MPI3MR_SG_DEPTH; + shost->max_id = mrioc->facts.max_perids + 1; + + retval = scsi_add_host(shost, &pdev->dev); + if (retval) { + ioc_err(mrioc, "scsi_add_host failed error:%d\n", retval); + goto addhost_failed; + } + + scsi_scan_host(shost); + mpi3mr_setup_debugfs(mrioc); + mpi3mr_bsg_init(mrioc); + return retval; + +addhost_failed: + mpi3mr_stop_watchdog(mrioc); + mpi3mr_cleanup_ioc(mrioc); +init_ioc_failed: + mpi3mr_free_mem(mrioc); + mpi3mr_cleanup_resources(mrioc); +resource_alloc_failed: + destroy_workqueue(mrioc->fwevt_worker_thread); +fwevt_thread_failed: + spin_lock(&mrioc_list_lock); + list_del(&mrioc->list); + spin_unlock(&mrioc_list_lock); + scsi_host_put(shost); +shost_failed: + return retval; +} + +/** + * mpi3mr_remove - PCI remove callback + * @pdev: PCI device instance + * + * Cleanup the IOC by issuing MUR and shutdown notification. + * Free up all memory and resources associated with the + * controllerand target devices, unregister the shost. + * + * Return: Nothing. 
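Earlier in the probe path above, the SCSI protection capabilities are chosen from two module parameters: enable_dix turns on all DIF and DIX types (and implies DIF), enable_dif alone turns on only the DIF types, and otherwise no protection is advertised. A small model of that selection; the bit values below are stand-ins, not the kernel's SHOST_* constants.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in bits; the kernel uses SHOST_DIF_TYPE*_PROTECTION and friends. */
#define M_DIF_T1 0x01
#define M_DIF_T2 0x02
#define M_DIF_T3 0x04
#define M_DIX_T1 0x08
#define M_DIX_T2 0x10
#define M_DIX_T3 0x20

static unsigned int prot_mask_model(bool enable_dix, bool enable_dif)
{
        if (enable_dix)
                return M_DIF_T1 | M_DIF_T2 | M_DIF_T3 |
                       M_DIX_T1 | M_DIX_T2 | M_DIX_T3;
        if (enable_dif)
                return M_DIF_T1 | M_DIF_T2 | M_DIF_T3;
        return 0;
}

int main(void)
{
        printf("dix:  0x%02x\n", prot_mask_model(true, false));   /* 0x3f */
        printf("dif:  0x%02x\n", prot_mask_model(false, true));   /* 0x07 */
        printf("none: 0x%02x\n", prot_mask_model(false, false));  /* 0x00 */
        return 0;
}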
+ */ +static void mpi3mr_remove(struct pci_dev *pdev) +{ + struct Scsi_Host *shost = pci_get_drvdata(pdev); + struct mpi3mr_ioc *mrioc; + struct workqueue_struct *wq; + unsigned long flags; + struct mpi3mr_tgt_dev *tgtdev, *tgtdev_next; + + if (!shost) + return; + + mrioc = shost_priv(shost); + while (mrioc->reset_in_progress || mrioc->is_driver_loading) + ssleep(1); + if (!pci_device_is_present(mrioc->pdev)) { + mrioc->unrecoverable = 1; + mpi3mr_flush_cmds_for_unrecovered_controller(mrioc); + } + + mpi3mr_bsg_exit(mrioc); + mpi3mr_destroy_debugfs(mrioc); + mrioc->stop_drv_processing = 1; + + mpi3mr_cleanup_fwevt_list(mrioc); + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + wq = mrioc->fwevt_worker_thread; + mrioc->fwevt_worker_thread = NULL; + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); + if (wq) + destroy_workqueue(wq); + + if (mrioc->sas_transport_enabled) + sas_remove_host(shost); + scsi_remove_host(shost); + + list_for_each_entry_safe(tgtdev, tgtdev_next, &mrioc->tgtdev_list, + list) { + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_del_from_list(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } + mpi3mr_stop_watchdog(mrioc); + mpi3mr_cleanup_ioc(mrioc); + mpi3mr_free_mem(mrioc); + mpi3mr_cleanup_resources(mrioc); + + spin_lock(&mrioc_list_lock); + list_del(&mrioc->list); + spin_unlock(&mrioc_list_lock); + + scsi_host_put(shost); +} + +/** + * mpi3mr_suspend - PCI shutdown callback + * @pdev: PCI device instance + * + * Cleanup the IOC by issuing MUR and shutdown notification. + * Free up all memory and resources associated with the + * controller + * + * Return: Nothing. + */ +static void mpi3mr_shutdown(struct pci_dev *pdev) +{ + struct Scsi_Host *shost = pci_get_drvdata(pdev); + struct mpi3mr_ioc *mrioc; + struct workqueue_struct *wq; + unsigned long flags; + + if (!shost) + return; + + mrioc = shost_priv(shost); + while (mrioc->reset_in_progress || mrioc->is_driver_loading) + ssleep(1); + mrioc->stop_drv_processing = 1; + + mpi3mr_cleanup_fwevt_list(mrioc); + spin_lock_irqsave(&mrioc->fwevt_lock, flags); + wq = mrioc->fwevt_worker_thread; + mrioc->fwevt_worker_thread = NULL; + spin_unlock_irqrestore(&mrioc->fwevt_lock, flags); + if (wq) + destroy_workqueue(wq); + + mpi3mr_stop_watchdog(mrioc); + mpi3mr_cleanup_ioc(mrioc); + mpi3mr_cleanup_resources(mrioc); + +} + +/** + * mpi3mr_suspend - PCI power management suspend callback + * @dev: Device struct + * + * Change the power state to the given value and cleanup the IOC + * by issuing MUR and shutdown notification + * + * Return: 0 always. 
+ */ +static int __maybe_unused +mpi3mr_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct Scsi_Host *shost = pci_get_drvdata(pdev); + struct mpi3mr_ioc *mrioc; + + if (!shost) + return 0; + + mrioc = shost_priv(shost); + while (mrioc->reset_in_progress || mrioc->is_driver_loading) + ssleep(1); + mrioc->stop_drv_processing = 1; + mpi3mr_cleanup_fwevt_list(mrioc); + scsi_block_requests(shost); + mpi3mr_stop_watchdog(mrioc); + mpi3mr_cleanup_ioc(mrioc); + + ioc_info(mrioc, + "suspending controller pdev=0x%p, slot=%s, entering operating state\n", + pdev, pci_name(pdev)); + mpi3mr_cleanup_resources(mrioc); + + return 0; +} + +/** + * mpi3mr_resume - PCI power management resume callback + * @dev: Device struct + * + * Restore the power state to D0 and reinitialize the controller + * and resume I/O operations to the target devices + * + * Return: 0 on success, non-zero on failure + */ +static int __maybe_unused +mpi3mr_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct Scsi_Host *shost = pci_get_drvdata(pdev); + struct mpi3mr_ioc *mrioc; + pci_power_t device_state = pdev->current_state; + int r; + + if (!shost) + return 0; + + mrioc = shost_priv(shost); + + ioc_info(mrioc, + "resuming controller pdev=0x%p, slot=%s, previous operating state [D%d]\n", + pdev, pci_name(pdev), device_state); + mrioc->pdev = pdev; + mrioc->cpu_count = num_online_cpus(); + r = mpi3mr_setup_resources(mrioc); + if (r) { + ioc_err(mrioc, "setup resoruces failed[%d]\n", r); + return r; + } + + mrioc->stop_drv_processing = 0; + mpi3mr_invalidate_devhandles(mrioc); + mpi3mr_free_enclosure_list(mrioc); + mpi3mr_memset_buffers(mrioc); + r = mpi3mr_reinit_ioc(mrioc, 1); + if (r) { + ioc_err(mrioc, "resuming controller failed[%d]\n", r); + return r; + } + ssleep(MPI3MR_RESET_TOPOLOGY_SETTLE_TIME); + scsi_unblock_requests(shost); + mrioc->device_refresh_on = 0; + mpi3mr_start_watchdog(mrioc); + + return 0; +} + +static ssize_t event_counter_show(struct device_driver *dd, char *buf) +{ + return sprintf(buf, "%llu\n", atomic64_read(&event_counter)); +} +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)) +static DRIVER_ATTR_RO(event_counter); +#else +static DRIVER_ATTR(version, S_IRUGO, event_counter_show, NULL); +#endif + +/** + * mpi3mr_pcierr_detected - PCI error detected callback + * @pdev: PCI device instance + * @state: channel state + * + * Template function, need to implement actual handling + * + * Return: PCI_ERS_RESULT_NEED_RESET + */ +static pci_ers_result_t +mpi3mr_pcierr_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + dev_info(&pdev->dev, "%s: callback invoked state(%d)\n", __func__, + state); + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * mpi3mr_pcierr_slot_reset - PCI error recovery slot reset + * @pdev: PCI device instance + * + * Template function, need to implement actual handling + * + * Return: PCI_ERS_RESULT_DISCONNECT + */ +static pci_ers_result_t mpi3mr_pcierr_slot_reset(struct pci_dev *pdev) +{ + dev_info(&pdev->dev, "%s: callback invoked\n", __func__); + return PCI_ERS_RESULT_DISCONNECT; +} + +/** + * mpi3mr_pcierr_mmio_enabled - PCI error recovery resume + * callback + * @pdev: PCI device instance + * + * Template function, need to implement actual handling + * + * Return: Nothing. 
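The PCI error-recovery callbacks around this point are wired into struct pci_error_handlers a little further down. In the documented AER flow, the core first calls error_detected and, depending on its return value, either calls mmio_enabled (PCI_ERS_RESULT_CAN_RECOVER) or slot_reset (PCI_ERS_RESULT_NEED_RESET) before finishing with resume; since this patch leaves the callbacks as templates, the following is only a standalone model of that dispatch, with local stand-ins for the pci_ers_result_t values.

#include <stdio.h>

/* Stand-ins for the pci_ers_result_t values used by the AER core. */
enum model_ers { ERS_CAN_RECOVER, ERS_NEED_RESET, ERS_DISCONNECT };

/* Which handler the AER core invokes after error_detected() returns. */
static const char *next_aer_step(enum model_ers detected)
{
        switch (detected) {
        case ERS_CAN_RECOVER:
                return "mmio_enabled, then resume";
        case ERS_NEED_RESET:
                return "slot_reset, then resume";
        case ERS_DISCONNECT:
        default:
                return "none (device is taken offline)";
        }
}

int main(void)
{
        printf("CAN_RECOVER -> %s\n", next_aer_step(ERS_CAN_RECOVER));
        printf("NEED_RESET  -> %s\n", next_aer_step(ERS_NEED_RESET));
        printf("DISCONNECT  -> %s\n", next_aer_step(ERS_DISCONNECT));
        return 0;
}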
+ */ +static void mpi3mr_pcierr_resume(struct pci_dev *pdev) +{ + dev_info(&pdev->dev, "%s: callback invoked\n", __func__); +} + +/** + * mpi3mr_pcierr_mmio_enabled - PCI error recovery callback + * @pdev: PCI device instance + * + * Template function, need to implement actual handling + * + * Return: PCI_ERS_RESULT_RECOVERED + */ +static pci_ers_result_t mpi3mr_pcierr_mmio_enabled(struct pci_dev *pdev) +{ +/* + * This is called only if _pcierr_error_detected returns + * PCI_ERS_RESULT_CAN_RECOVER. Read/Write to the device still works and + * there is no need to reset the slot + */ + dev_info(&pdev->dev, "%s: callback invoked\n", __func__); + return PCI_ERS_RESULT_RECOVERED; +} + +static const struct pci_device_id mpi3mr_pci_id_table[] = { + { + PCI_DEVICE_SUB(MPI3_MFGPAGE_VENDORID_BROADCOM, + MPI3_MFGPAGE_DEVID_SAS4116, PCI_ANY_ID, PCI_ANY_ID) + }, + { 0 } +}; +MODULE_DEVICE_TABLE(pci, mpi3mr_pci_id_table); + +static struct pci_error_handlers mpi3mr_err_handler = { + .error_detected = mpi3mr_pcierr_detected, + .mmio_enabled = mpi3mr_pcierr_mmio_enabled, + .slot_reset = mpi3mr_pcierr_slot_reset, + .resume = mpi3mr_pcierr_resume, +}; + +static SIMPLE_DEV_PM_OPS(mpi3mr_pm_ops, mpi3mr_suspend, mpi3mr_resume); + +static struct pci_driver mpi3mr_pci_driver = { + .name = MPI3MR_DRIVER_NAME, + .id_table = mpi3mr_pci_id_table, + .probe = mpi3mr_probe, + .remove = mpi3mr_remove, + .shutdown = mpi3mr_shutdown, + .err_handler = &mpi3mr_err_handler, + .driver.pm = &mpi3mr_pm_ops, +}; + +/** + * mpi3mr_init - Module init entry point + * @void: No argument + * Registers character driver interface and PCI driver. + * + * Return: Success or failure of PCI driver registration + */ +static int __init mpi3mr_init(void) +{ + int ret_val; + + pr_info("Loading %s version %s\n", MPI3MR_DRIVER_NAME, + MPI3MR_DRIVER_VERSION); + + mpi3mr_transport_template = + sas_attach_transport(&mpi3mr_transport_functions); + if (!mpi3mr_transport_template) { + pr_err("%s failed to load due to sas transport attach failure\n", + MPI3MR_DRIVER_NAME); + return -ENODEV; + } + + mpi3mr_init_debugfs(); + + ret_val = pci_register_driver(&mpi3mr_pci_driver); + if (ret_val) { + pr_err("%s failed to load due to pci register driver failure\n", + MPI3MR_DRIVER_NAME); + goto err_pci_reg_fail; + } + + ret_val = driver_create_file(&mpi3mr_pci_driver.driver, + &driver_attr_event_counter); + if (ret_val) + goto err_event_counter; + + return ret_val; + +err_event_counter: + pci_unregister_driver(&mpi3mr_pci_driver); + +err_pci_reg_fail: + mpi3mr_exit_debugfs(); + sas_release_transport(mpi3mr_transport_template); + + return ret_val; +} + + +/** + * mpi3mr_exit - Module unload entry point + * @void: No argument + * + * Registers character driver interface and PCI driver. 
+ * + * Return: Nothing + */ +static void __exit mpi3mr_exit(void) +{ + if (warn_non_secure_ctlr) + pr_warn( + "Unloading %s version %s while managing a non secure controller\n", + MPI3MR_DRIVER_NAME, MPI3MR_DRIVER_VERSION); + else + pr_info("Unloading %s version %s\n", MPI3MR_DRIVER_NAME, + MPI3MR_DRIVER_VERSION); + + driver_remove_file(&mpi3mr_pci_driver.driver, &driver_attr_event_counter); + pci_unregister_driver(&mpi3mr_pci_driver); + mpi3mr_exit_debugfs(); + sas_release_transport(mpi3mr_transport_template); +} + +module_init(mpi3mr_init); +module_exit(mpi3mr_exit); diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c new file mode 100644 index 0000000000000..def047d6e231c --- /dev/null +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c @@ -0,0 +1,3374 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for Broadcom MPI3 Storage Controllers + * + * Copyright (C) 2017-2022 Broadcom Inc. + * (mailto: mpi3mr-linuxdrv.pdl@broadcom.com) + * + */ + +#include "mpi3mr.h" + +#define MPI3MR_MAX_PHYSICAL_PHYS 32 + +static void mpi3mr_expander_node_remove(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *sas_expander); + +/** + * mpi3mr_post_transport_req - Issue transport requests and wait + * @mrioc: Adapter instance reference + * @request: Properly populated MPI3 request + * @request_sz: Size of the MPI3 request + * @reply: Pointer to return MPI3 reply + * @reply_sz: Size of the MPI3 reply buffer + * @timeout: Timeout in seconds + * @ioc_status: Pointer to return ioc status + * + * A generic function for posting MPI3 requests from the SAS + * transport layer that uses transport command infrastructure. + * This blocks for the completion of request for timeout seconds + * and if the request times out this function faults the + * controller with proper reason code. + * + * On successful completion of the request this function returns + * appropriate ioc status from the firmware back to the caller. + * + * Return: 0 on success, non-zero on failure. 
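mpi3mr_post_transport_req below serializes transport requests on a per-controller command slot: it refuses to run if the slot is already pending, posts the admin request, waits on a completion with a timeout, and if the completion never fires it faults the controller and returns an error. A reduced model of that wait-and-check decision; the state bits and the "wait" are stand-ins for the driver's command state and completion primitive.

#include <stdbool.h>
#include <stdio.h>

#define M_CMD_PENDING   0x01
#define M_CMD_COMPLETE  0x02

/*
 * Returns 0 when the request completed within the timeout, -1 when the
 * slot was busy or the wait expired (the driver additionally faults the
 * controller in the timeout case).
 */
static int post_request_model(unsigned int *slot_state, bool completed_in_time)
{
        if (*slot_state & M_CMD_PENDING)
                return -1;                      /* another request in flight */

        *slot_state |= M_CMD_PENDING;
        if (completed_in_time)
                *slot_state |= M_CMD_COMPLETE;  /* set by the reply path */

        if (!(*slot_state & M_CMD_COMPLETE)) {
                *slot_state = 0;
                return -1;                      /* timed out: fault + fail */
        }
        *slot_state = 0;
        return 0;
}

int main(void)
{
        unsigned int state = 0;

        printf("%d\n", post_request_model(&state, true));   /*  0 */
        printf("%d\n", post_request_model(&state, false));  /* -1 */
        return 0;
}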
+ */ +static int mpi3mr_post_transport_req(struct mpi3mr_ioc *mrioc, void *request, + u16 request_sz, void *reply, u16 reply_sz, int timeout, + u16 *ioc_status) +{ + int retval = 0; + + mutex_lock(&mrioc->transport_cmds.mutex); + if (mrioc->transport_cmds.state & MPI3MR_CMD_PENDING) { + retval = -1; + ioc_err(mrioc, "sending transport request failed due to command in use\n"); + mutex_unlock(&mrioc->transport_cmds.mutex); + goto out; + } + mrioc->transport_cmds.state = MPI3MR_CMD_PENDING; + mrioc->transport_cmds.is_waiting = 1; + mrioc->transport_cmds.callback = NULL; + mrioc->transport_cmds.ioc_status = 0; + mrioc->transport_cmds.ioc_loginfo = 0; + + init_completion(&mrioc->transport_cmds.done); + dprint_cfg_info(mrioc, "posting transport request\n"); + if (mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO) + dprint_dump(request, request_sz,"transport_req"); + retval = mpi3mr_admin_request_post(mrioc, request, request_sz, 1); + if (retval) { + ioc_err(mrioc, "posting transport request failed\n"); + goto out_unlock; + } + wait_for_completion_timeout(&mrioc->transport_cmds.done, + (timeout * HZ)); + if (!(mrioc->transport_cmds.state & MPI3MR_CMD_COMPLETE)) { + mpi3mr_check_rh_fault_ioc(mrioc, + MPI3MR_RESET_FROM_SAS_TRANSPORT_TIMEOUT); + ioc_err(mrioc, "transport request timed out\n"); + retval = -1; + goto out_unlock; + } + *ioc_status = mrioc->transport_cmds.ioc_status & + MPI3_IOCSTATUS_STATUS_MASK; + if ((*ioc_status) != MPI3_IOCSTATUS_SUCCESS) + dprint_transport_err(mrioc, + "transport request returned with ioc_status(0x%04x), log_info(0x%08x)\n", + *ioc_status, mrioc->transport_cmds.ioc_loginfo); + + if ((reply) && (mrioc->transport_cmds.state & MPI3MR_CMD_REPLY_VALID)) + memcpy((u8 *)reply, mrioc->transport_cmds.reply, reply_sz); + +out_unlock: + mrioc->transport_cmds.state = MPI3MR_CMD_NOTUSED; + mutex_unlock(&mrioc->transport_cmds.mutex); + +out: + return retval; +} + +/** + * __mpi3mr_expander_find_by_handle - expander search by handle + * @mrioc: Adapter instance reference + * @handle: Firmware device handle of the expander + * + * Context: The caller should acquire sas_node_lock + * + * This searches for expander device based on handle, then + * returns the sas_node object. + * + * Return: Expander sas_node object reference or NULL + */ +struct mpi3mr_sas_node *__mpi3mr_expander_find_by_handle(struct mpi3mr_ioc + *mrioc, u16 handle) +{ + struct mpi3mr_sas_node *sas_expander, *r; + + r = NULL; + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, list) { + if (sas_expander->handle != handle) + continue; + r = sas_expander; + goto out; + } + out: + return r; +} + +/** + * mpi3mr_enclosure_find_by_handle - enclosure search by handle + * @mrioc: Adapter instance reference + * @handle: Firmware device handle of the enclosure + + * This searches for enclosure device based on handle, then returns the + * enclosure object. + * + * Return: Enclosure object reference or NULL + */ +struct mpi3mr_enclosure_node *mpi3mr_enclosure_find_by_handle( + struct mpi3mr_ioc*mrioc, u16 handle) +{ + struct mpi3mr_enclosure_node *enclosure_dev, *r; + r = NULL; + + list_for_each_entry(enclosure_dev, &mrioc->enclosure_list, list) { + if (le16_to_cpu(enclosure_dev->pg0.enclosure_handle) != handle) + continue; + r = enclosure_dev; + goto out; + } +out: + return r; +} + + +/** + * mpi3mr_expander_node_add - insert an expander to the list. + * @mrioc: Adapter instance reference + * @sas_expander: Expander sas node + * Context: This function will acquire sas_node_lock. 
+ * + * Adding new object to the ioc->sas_expander_list. + * + * Return: None. + */ +static void mpi3mr_expander_node_add(struct mpi3mr_ioc*mrioc, + struct mpi3mr_sas_node *sas_expander) +{ + unsigned long flags; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_add_tail(&sas_expander->list, &mrioc->sas_expander_list); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); +} + +/** + * mpi3mr_is_sas_exp_device - if device is an expander + * @device_info: Bitfield providing information about the device + * + * Return: 1 if the device is expander device, else 0. + */ +u8 mpi3mr_is_expander_device(u16 device_info) +{ + if ((device_info & MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK) == + MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER) + return 1; + else + return 0; +} + + +/** + * mpi3mr_get_sas_address - retrieve sas_address for handle + * @mrioc: Adapter instance reference + * @handle: Firmware device handle + * @sas_address: Address to hold sas address + * + * This function issues device page0 read for a given device + * handle and gets the SAS address and return it back + * + * Return: 0 for success, non-zero for failure + */ +static int mpi3mr_get_sas_address(struct mpi3mr_ioc *mrioc, u16 handle, + u64 *sas_address) +{ + struct mpi3_device_page0 dev_pg0; + u16 ioc_status; + struct mpi3_device0_sas_sata_format *sasinf; + + *sas_address = 0; + + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0, + sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, + handle))) { + ioc_err(mrioc, "%s: device page0 read failed\n", __func__); + return -ENXIO; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page read failed for handle(0x%04x), with ioc_status(0x%04x) failure at %s:%d/%s()!\n", + handle, ioc_status, __FILE__, __LINE__, __func__); + return -ENXIO; + } + + if (le16_to_cpu(dev_pg0.flags) & + MPI3_DEVICE0_FLAGS_CONTROLLER_DEV_HANDLE) + *sas_address = mrioc->sas_hba.sas_address; + else if (dev_pg0.device_form == MPI3_DEVICE_DEVFORM_SAS_SATA) { + sasinf = &dev_pg0.device_specific.sas_sata_format; + *sas_address = le64_to_cpu(sasinf->sas_address); + } else { + ioc_err(mrioc, "%s: device_form(%d) is not SAS_SATA\n", + __func__, dev_pg0.device_form); + return -ENXIO; + } + return 0; +} + +/** + * __mpi3mr_get_tgtdev_by_addr - target device search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the device + * @hba_port: HBA port entry + * + * This searches for target device from sas address and hba port + * pointer then return mpi3mr_tgt_dev object. + * + * Return: Valid tget_dev or NULL + */ +struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_addr(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if ((tgtdev->dev_type == MPI3_DEVICE_DEVFORM_SAS_SATA) && + (tgtdev->dev_spec.sas_sata_inf.sas_address == sas_address) + && (tgtdev->dev_spec.sas_sata_inf.hba_port == hba_port)) + goto found_device; + return NULL; +found_device: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + +/** + * mpi3mr_get_tgtdev_by_addr - target device search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the device + * @hba_port: HBA port entry + * + * This searches for target device from sas address and hba port + * pointer then return mpi3mr_tgt_dev object. + * + * Context: This function will acquire tgtdev_lock and will + * release before returning the mpi3mr_tgt_dev object. 
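The target-device lookups around this point return the mpi3mr_tgt_dev with an extra reference taken while tgtdev_lock is held, so the object cannot disappear between the unlock and the caller's use; the caller drops that reference with mpi3mr_tgtdev_put() when done. A lock-free userspace model of just the get/put accounting, for illustration only:

#include <stdio.h>
#include <stddef.h>

struct model_tgtdev {
        unsigned int refcount;
        int freed;
};

static void model_get(struct model_tgtdev *t) { t->refcount++; }

static void model_put(struct model_tgtdev *t)
{
        if (--t->refcount == 0)
                t->freed = 1;   /* last reference gone: object released */
}

/* Lookup returns the object with an extra reference already taken. */
static struct model_tgtdev *model_lookup(struct model_tgtdev *t)
{
        model_get(t);
        return t;
}

int main(void)
{
        struct model_tgtdev dev = { .refcount = 1, .freed = 0 };
        struct model_tgtdev *found = model_lookup(&dev);

        printf("after lookup: ref=%u\n", found->refcount);  /* 2 */
        model_put(found);                                   /* caller's put */
        model_put(&dev);                                    /* list's reference */
        printf("freed=%d\n", dev.freed);                    /* 1 */
        return 0;
}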
+ * + * Return: Valid tget_dev or NULL + */ +struct mpi3mr_tgt_dev *mpi3mr_get_tgtdev_by_addr(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + + if(!hba_port) + goto out; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr(mrioc, sas_address, hba_port); + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + +out: + return tgtdev; +} + +/** + * mpi3mr_remove_device_by_sas_address - remove the device + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the device + * @hba_port: HBA port entry + * + * This searches for target device using sas address and hba + * port pointer then removes it from the OS. + * + * Return: None + */ +void mpi3mr_remove_device_by_sas_address(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + u8 was_on_tgtdev_list = 0; + + if(!hba_port) + return; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr(mrioc, + sas_address, hba_port); + if (tgtdev) { + if (!list_empty(&tgtdev->list)) { + list_del_init(&tgtdev->list); + was_on_tgtdev_list = 1; + mpi3mr_tgtdev_put(tgtdev); + } + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + /*SP2DO -- Needs revisit of the removal logic*/ + if (was_on_tgtdev_list) { + if (tgtdev->host_exposed) + mpi3mr_remove_tgtdev_from_host(mrioc, tgtdev); + mpi3mr_tgtdev_put(tgtdev); + } +} + +/** + * __mpi3mr_get_tgtdev_by_addr_and_rphy - target device search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the device + * @rphy: SAS transport layer rphy object + * + * This searches for target device from sas address and rphy + * pointer then return mpi3mr_tgt_dev object. + * + * Return: Valid tget_dev or NULL + */ +struct mpi3mr_tgt_dev *__mpi3mr_get_tgtdev_by_addr_and_rphy( + struct mpi3mr_ioc *mrioc, u64 sas_address, struct sas_rphy *rphy) +{ + struct mpi3mr_tgt_dev *tgtdev; + + assert_spin_locked(&mrioc->tgtdev_lock); + + list_for_each_entry(tgtdev, &mrioc->tgtdev_list, list) + if ((tgtdev->dev_type == MPI3_DEVICE_DEVFORM_SAS_SATA) && + (tgtdev->dev_spec.sas_sata_inf.sas_address == sas_address) + && (tgtdev->dev_spec.sas_sata_inf.rphy == rphy)) + goto found_device; + return NULL; +found_device: + mpi3mr_tgtdev_get(tgtdev); + return tgtdev; +} + + + +/** + * mpi3mr_expander_find_by_sas_address - sas expander search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of expander + * @hba_port: HBA port entry + * + * Return: A valid SAS expander node or NULL. + * + */ +struct mpi3mr_sas_node *mpi3mr_expander_find_by_sas_address( + struct mpi3mr_ioc *mrioc, u64 sas_address, + struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_sas_node *sas_expander, *r=NULL; + if (!hba_port) + goto out; + + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, list) { + if ((sas_expander->sas_address != sas_address) || + (sas_expander->hba_port != hba_port)) + continue; + r = sas_expander; + goto out; + } +out: + return r; +} + +/** + * __mpi3mr_sas_node_find_by_sas_address - sas node search + * @mrioc: Adapter instance reference + * @sas_address: SAS address of expander or sas host + * @hba_port: HBA port entry + * Context: Caller should acquire mrioc->sas_node_lock. 
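__mpi3mr_sas_node_find_by_sas_address below resolves a SAS address in two steps: if it matches the controller's own SAS address, the HBA's sas node is returned directly; otherwise the expander list is searched with the address and HBA port as the key. A trimmed-down model of that resolution with hypothetical types and example addresses, and with no locking shown:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct model_node {
        uint64_t sas_address;
        int port_id;
};

/* expanders[] stands in for mrioc->sas_expander_list. */
static const struct model_node *find_sas_node(const struct model_node *hba,
                                              const struct model_node *expanders,
                                              size_t nr, uint64_t addr, int port)
{
        size_t i;

        if (addr == hba->sas_address)
                return hba;                     /* direct attached to the HBA */
        for (i = 0; i < nr; i++)
                if (expanders[i].sas_address == addr &&
                    expanders[i].port_id == port)
                        return &expanders[i];
        return NULL;
}

int main(void)
{
        struct model_node hba   = { 0x5000ccab01234567ULL, 0 };
        struct model_node exp[] = { { 0x500605b000aabbccULL, 1 } };
        const struct model_node *n;

        n = find_sas_node(&hba, exp, 1, hba.sas_address, 0);
        printf("controller address -> %s\n", n == &hba ? "HBA node" : "other");
        n = find_sas_node(&hba, exp, 1, 0x500605b000aabbccULL, 1);
        printf("expander address   -> %s\n", n == &exp[0] ? "expander[0]" : "NULL");
        return 0;
}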
+ *
+ * If the SAS address indicates the device is direct attached to
+ * the controller (controller's SAS address) then the SAS node
+ * associated with the controller is returned back else the SAS
+ * address and hba port are used to identify the exact expander
+ * and the associated sas_node object is returned. If there is
+ * no match, NULL is returned.
+ *
+ * Return: A valid SAS node or NULL.
+ *
+ */
+static struct mpi3mr_sas_node *__mpi3mr_sas_node_find_by_sas_address(
+	struct mpi3mr_ioc *mrioc, u64 sas_address,
+	struct mpi3mr_hba_port *hba_port)
+{
+
+	if (mrioc->sas_hba.sas_address == sas_address)
+		return &mrioc->sas_hba;
+	return mpi3mr_expander_find_by_sas_address(mrioc, sas_address,
+	    hba_port);
+}
+
+/**
+ * mpi3mr_get_port_id_by_sas_phy - Get port ID of the given phy
+ * @phy: SAS transport layer phy object
+ *
+ * Return: Port number for valid ID else 0xFF
+ */
+static inline u8 mpi3mr_get_port_id_by_sas_phy(struct sas_phy *phy)
+{
+	u8 port_id = 0xFF;
+
+	struct mpi3mr_hba_port *hba_port = phy->hostdata;
+	if (hba_port)
+		port_id = hba_port->port_id;
+
+	return port_id;
+}
+
+
+/**
+ * mpi3mr_parent_present - Is parent present for a phy
+ * @mrioc: Adapter instance reference
+ * @phy: SAS transport layer phy object
+ *
+ * Return: 0 if parent is present else non-zero
+ */
+static int mpi3mr_parent_present(struct mpi3mr_ioc *mrioc, struct sas_phy *phy)
+{
+
+	unsigned long flags;
+	struct mpi3mr_hba_port *hba_port = phy->hostdata;
+
+	spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+	if (__mpi3mr_sas_node_find_by_sas_address(mrioc,
+	    phy->identify.sas_address,
+	    hba_port) == NULL) {
+		spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+		return -1;
+	}
+	spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+	return 0;
+}
+
+
+/**
+ * mpi3mr_get_hba_port_by_id - find hba port by id
+ * @mrioc: Adapter instance reference
+ * @port_id: Port ID to search
+ * @skip_dirty_flag: Skip dirty ports that match port_id
+ *
+ * Return: mpi3mr_hba_port reference for the matched port
+ */
+struct mpi3mr_hba_port *mpi3mr_get_hba_port_by_id(struct mpi3mr_ioc *mrioc,
+	u8 port_id, u8 skip_dirty_flag)
+{
+
+	struct mpi3mr_hba_port *port, *port_next;
+
+	list_for_each_entry_safe(port, port_next,
+	    &mrioc->hba_port_table_list, list) {
+		if (port->port_id != port_id)
+			continue;
+		if (!skip_dirty_flag && (port->flags &
+		    MPI3MR_HBA_PORT_FLAG_DIRTY))
+			continue;
+		return port;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * mpi3mr_get_port_id_by_rphy - Get Port number from SAS rphy
+ *
+ * @mrioc: Adapter instance reference
+ * @rphy: SAS transport layer remote phy object
+ *
+ * Retrieves the HBA port number to which the device pointed
+ * to by the rphy object is attached.
+ *
+ * Return: Valid port number on success else 0xFF.
+ */ +u8 mpi3mr_get_port_id_by_rphy(struct mpi3mr_ioc *mrioc, struct sas_rphy *rphy) +{ + struct mpi3mr_sas_node *sas_expander; + struct mpi3mr_tgt_dev *tgtdev; + unsigned long flags; + u8 port_id = 0xFF; + + if (!rphy) + return port_id; + + if (rphy->identify.device_type == SAS_EDGE_EXPANDER_DEVICE || + rphy->identify.device_type == SAS_FANOUT_EXPANDER_DEVICE) { + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, + list) { + if (sas_expander->rphy == rphy) { + port_id = sas_expander->hba_port->port_id; + break; + } + } + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + } else if (rphy->identify.device_type == SAS_END_DEVICE) { + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + + tgtdev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if (tgtdev) { + port_id = + tgtdev->dev_spec.sas_sata_inf.hba_port->port_id; + mpi3mr_tgtdev_put(tgtdev); + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + } + return port_id; +} + +/** + * mpi3mr_convert_phy_link_rate - + * @link_rate: link rate as defined in the MPI header + * + * Convert link_rate from mpi format into sas_transport layer + * form. + * + * Return: A valid SAS transport layer defined link rate + */ +static enum sas_linkrate mpi3mr_convert_phy_link_rate(u8 link_rate) +{ + enum sas_linkrate rc; + + switch (link_rate) { + case MPI3_SAS_NEG_LINK_RATE_1_5: + rc = SAS_LINK_RATE_1_5_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_3_0: + rc = SAS_LINK_RATE_3_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_6_0: + rc = SAS_LINK_RATE_6_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_12_0: + rc = SAS_LINK_RATE_12_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_22_5: + /*TODO: Once SAS TL included define for 22.5 replace this*/ + rc = SAS_LINK_RATE_12_0_GBPS; + break; + case MPI3_SAS_NEG_LINK_RATE_PHY_DISABLED: + rc = SAS_PHY_DISABLED; + break; + case MPI3_SAS_NEG_LINK_RATE_NEGOTIATION_FAILED: + rc = SAS_LINK_RATE_FAILED; + break; + case MPI3_SAS_NEG_LINK_RATE_PORT_SELECTOR: + rc = SAS_SATA_PORT_SELECTOR; + break; + case MPI3_SAS_NEG_LINK_RATE_SMP_RESET_IN_PROGRESS: + rc = SAS_PHY_RESET_IN_PROGRESS; + break; + default: + case MPI3_SAS_NEG_LINK_RATE_SATA_OOB_COMPLETE: + case MPI3_SAS_NEG_LINK_RATE_UNKNOWN_LINK_RATE: + rc = SAS_LINK_RATE_UNKNOWN; + break; + } + return rc; +} + +/** + * mpi3mr_set_identify - set identify for phys and end devices + * @mrioc: Adapter instance reference + * @handle: Firmware device handle + * @identify: SAS transport layer's identify info + * + * Populates sas identify info for a specific device. + * + * Return: 0 for success, non-zero for failure. 
+ */ +static int mpi3mr_set_identify(struct mpi3mr_ioc *mrioc, u16 handle, + struct sas_identify *identify) +{ + + struct mpi3_device_page0 device_pg0; + struct mpi3_device0_sas_sata_format *sasinf; + u16 device_info; + u16 ioc_status; + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &device_pg0, + sizeof(device_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, handle))) { + ioc_err(mrioc, "%s: device page0 read failed\n", __func__); + return -ENXIO; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page read failed for handle(0x%04x), with ioc_status(0x%04x) failure at %s:%d/%s()!\n", + handle, ioc_status, __FILE__, __LINE__, __func__); + return -EIO; + } + + memset(identify, 0, sizeof(struct sas_identify)); + sasinf = &device_pg0.device_specific.sas_sata_format; + device_info = le16_to_cpu(sasinf->device_info); + + /* sas_address */ + identify->sas_address = le64_to_cpu(sasinf->sas_address); + + /* phy number of the parent device this device is linked to */ + identify->phy_identifier = sasinf->phy_num; + + /* device_type */ + switch (device_info & MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK) { + case MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_NO_DEVICE: + identify->device_type = SAS_PHY_UNUSED; + break; + case MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE: + identify->device_type = SAS_END_DEVICE; + break; + case MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER: + identify->device_type = SAS_EDGE_EXPANDER_DEVICE; + break; + /* MPI3.0 doesnt have define for FANOUT expander*/ + } + + /* initiator_port_protocols */ + if (device_info & MPI3_SAS_DEVICE_INFO_SSP_INITIATOR) + identify->initiator_port_protocols |= SAS_PROTOCOL_SSP; + /* MPI3.0 doesnt have define for SATA INIT so setting both here*/ + if (device_info & MPI3_SAS_DEVICE_INFO_STP_INITIATOR) + identify->initiator_port_protocols |= (SAS_PROTOCOL_STP | + SAS_PROTOCOL_SATA); + if (device_info & MPI3_SAS_DEVICE_INFO_SMP_INITIATOR) + identify->initiator_port_protocols |= SAS_PROTOCOL_SMP; + + /* target_port_protocols */ + if (device_info & MPI3_SAS_DEVICE_INFO_SSP_TARGET) + identify->target_port_protocols |= SAS_PROTOCOL_SSP; + /* MPI3.0 doesnt have define for STP Target so setting both here*/ + if (device_info & MPI3_SAS_DEVICE_INFO_STP_SATA_TARGET) + identify->target_port_protocols |= (SAS_PROTOCOL_STP | + SAS_PROTOCOL_SATA); + if (device_info & MPI3_SAS_DEVICE_INFO_SMP_TARGET) + identify->target_port_protocols |= SAS_PROTOCOL_SMP; + return 0; +} + +/* report manufacture request structure */ +struct rep_manu_request { + u8 smp_frame_type; + u8 function; + u8 reserved; + u8 request_length; +}; + +/* report manufacture reply structure */ +struct rep_manu_reply { + u8 smp_frame_type; /* 0x41 */ + u8 function; /* 0x01 */ + u8 function_result; + u8 response_length; + u16 expander_change_count; + u8 reserved0[2]; + u8 sas_format; + u8 reserved2[3]; + u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN]; + u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN]; + u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN]; + u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN]; + u16 component_id; + u8 component_revision_id; + u8 reserved3; + u8 vendor_specific[8]; +}; + +/** + * mpi3mr_report_manufacture - obtain SMP report_manufacture + * @mrioc: Adapter instance reference + * @sas_address: SAS address of the expander device + * @edev: SAS transport layer sas_expander_device object + * @port_id: ID of the HBA port + * + * Fills in the sas_expander_device with 
manufacturing info. + * + * Return: 0 for success, non-zero for failure. + */ +static int mpi3mr_report_manufacture(struct mpi3mr_ioc *mrioc, + u64 sas_address, struct sas_expander_device *edev, u8 port_id) +{ + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + struct rep_manu_reply *manufacture_reply; + struct rep_manu_request *manufacture_request; + int rc = 0; + void *psge; + void *data_out = NULL; + dma_addr_t data_out_dma; + dma_addr_t data_in_dma; + size_t data_in_sz; + size_t data_out_sz; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u16 ioc_status; + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + data_out_sz = sizeof(struct rep_manu_request); + data_in_sz = sizeof(struct rep_manu_reply); + data_out = dma_zalloc_coherent(&mrioc->pdev->dev, + data_out_sz + data_in_sz, &data_out_dma, GFP_KERNEL); + if (!data_out) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", __FILE__, + __LINE__, __func__); + rc = -ENOMEM; + goto out; + } + + data_in_dma = data_out_dma + data_out_sz; + manufacture_reply = data_out + data_out_sz; + + manufacture_request = data_out; + manufacture_request->smp_frame_type = 0x40; + manufacture_request->function = 1; + manufacture_request->reserved = 0; + manufacture_request->request_length = 0; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) port_id; + mpi_request.sas_address = cpu_to_le64(sas_address); + + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_out_sz, data_out_dma); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_in_sz, data_in_dma); + + dprint_transport_info(mrioc, + "sending report manufacturer SMP request to sas_address(0x%016llx), port(%d)\n", + (unsigned long long)sas_address, port_id); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto out; + + dprint_transport_info(mrioc, + "report manufacturer SMP request completed with ioc_status(0x%04x)\n", + ioc_status); + + if (ioc_status == MPI3_IOCSTATUS_SUCCESS) { + u8 *tmp; + + dprint_transport_info(mrioc, + "report manufacturer - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + if (le16_to_cpu(mpi_reply.response_data_length) != + sizeof(struct rep_manu_reply)) + goto out; + + strscpy(edev->vendor_id, manufacture_reply->vendor_id, + SAS_EXPANDER_VENDOR_ID_LEN); + strscpy(edev->product_id, manufacture_reply->product_id, + SAS_EXPANDER_PRODUCT_ID_LEN); + strscpy(edev->product_rev, manufacture_reply->product_rev, + SAS_EXPANDER_PRODUCT_REV_LEN); + edev->level = manufacture_reply->sas_format & 1; + if (edev->level) { + strscpy(edev->component_vendor_id, + manufacture_reply->component_vendor_id, + SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN); + tmp = (u8 *)&manufacture_reply->component_id; + edev->component_id = tmp[0] << 8 | tmp[1]; + edev->component_revision_id = + manufacture_reply->component_revision_id; + } + } + +out: + if (data_out) + dma_free_coherent(&mrioc->pdev->dev, data_out_sz + data_in_sz, + data_out, data_out_dma); + + return rc; +} + + +/** + * mpi3mr_delete_sas_port - helper function to 
remove a port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_port: Internal Port object
+ *
+ * Return: None.
+ */
+static void mpi3mr_delete_sas_port(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_port *mr_sas_port)
+{
+
+	u64 sas_address = mr_sas_port->remote_identify.sas_address;
+	struct mpi3mr_hba_port *hba_port = mr_sas_port->hba_port;
+	enum sas_device_type device_type =
+	    mr_sas_port->remote_identify.device_type;
+
+	dev_printk(KERN_INFO, &mr_sas_port->port->dev,
+	    "remove: sas_address(0x%016llx)\n",
+	    (unsigned long long) sas_address);
+
+	if (device_type == SAS_END_DEVICE)
+		mpi3mr_remove_device_by_sas_address(mrioc, sas_address,
+		    hba_port);
+
+	else if (device_type == SAS_EDGE_EXPANDER_DEVICE ||
+	    device_type == SAS_FANOUT_EXPANDER_DEVICE)
+		mpi3mr_expander_remove(mrioc, sas_address, hba_port);
+}
+
+/**
+ * mpi3mr_delete_sas_phy - Remove a single phy from port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_port: Internal Port object
+ * @mr_sas_phy: Internal Phy object
+ *
+ * Return: None.
+ */
+static void mpi3mr_delete_sas_phy(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_port *mr_sas_port,
+	struct mpi3mr_sas_phy *mr_sas_phy)
+{
+
+	u64 sas_address = mr_sas_port->remote_identify.sas_address;
+
+	dev_printk(KERN_INFO, &mr_sas_phy->phy->dev,
+	    "remove: sas_address(0x%016llx), phy(%d)\n",
+	    (unsigned long long) sas_address, mr_sas_phy->phy_id);
+
+	list_del(&mr_sas_phy->port_siblings);
+	mr_sas_port->num_phys--;
+	mr_sas_port->phy_mask &= ~(1 << mr_sas_phy->phy_id);
+	if (mr_sas_port->lowest_phy == mr_sas_phy->phy_id)
+		mr_sas_port->lowest_phy = ffs(mr_sas_port->phy_mask) - 1;
+	sas_port_delete_phy(mr_sas_port->port, mr_sas_phy->phy);
+	mr_sas_phy->phy_belongs_to_port = 0;
+}
+
+/**
+ * mpi3mr_add_sas_phy - Adding a single phy to a port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_port: Internal Port object
+ * @mr_sas_phy: Internal Phy object
+ *
+ * Return: None.
+ */
+static void mpi3mr_add_sas_phy(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_port *mr_sas_port,
+	struct mpi3mr_sas_phy *mr_sas_phy)
+{
+	u64 sas_address = mr_sas_port->remote_identify.sas_address;
+
+	dev_printk(KERN_INFO, &mr_sas_phy->phy->dev,
+	    "add: sas_address(0x%016llx), phy(%d)\n", (unsigned long long)
+	    sas_address, mr_sas_phy->phy_id);
+
+	list_add_tail(&mr_sas_phy->port_siblings, &mr_sas_port->phy_list);
+	mr_sas_port->num_phys++;
+	mr_sas_port->phy_mask |= (1 << mr_sas_phy->phy_id);
+	if (mr_sas_phy->phy_id < mr_sas_port->lowest_phy)
+		mr_sas_port->lowest_phy = ffs(mr_sas_port->phy_mask) - 1;
+	sas_port_add_phy(mr_sas_port->port, mr_sas_phy->phy);
+	mr_sas_phy->phy_belongs_to_port = 1;
+}
+
+/**
+ * mpi3mr_add_phy_to_an_existing_port - add phy to existing port
+ * @mrioc: Adapter instance reference
+ * @mr_sas_node: Internal sas node object (expander or host)
+ * @mr_sas_phy: Internal Phy object
+ * @sas_address: SAS address of device/expander where the phy needs
+ * to be added to
+ * @hba_port: HBA port entry
+ *
+ * Return: None.
+ */ +void mpi3mr_add_phy_to_an_existing_port(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *mr_sas_node, struct mpi3mr_sas_phy *mr_sas_phy, + u64 sas_address, struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_sas_port *mr_sas_port; + struct mpi3mr_sas_phy *srch_phy; + + if (mr_sas_phy->phy_belongs_to_port == 1) + return; + + if (!hba_port) + return; + + list_for_each_entry(mr_sas_port, &mr_sas_node->sas_port_list, + port_list) { + if (mr_sas_port->remote_identify.sas_address != + sas_address) + continue; + if (mr_sas_port->hba_port != hba_port) + continue; + list_for_each_entry(srch_phy, &mr_sas_port->phy_list, + port_siblings) { + if (srch_phy == mr_sas_phy) + return; + } + mpi3mr_add_sas_phy(mrioc, mr_sas_port, mr_sas_phy); + return; + } +} + +/** + * mpi3mr_del_phy_from_an_existing_port - del phy from a port + * @mrioc: Adapter instance reference + * @mr_sas_node: Internal sas node object (expander or host) + * @mr_sas_phy: Internal Phy object + * + * Return: None. + */ +void mpi3mr_del_phy_from_an_existing_port(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *mr_sas_node, struct mpi3mr_sas_phy *mr_sas_phy) +{ + struct mpi3mr_sas_port *mr_sas_port, *next; + struct mpi3mr_sas_phy *srch_phy; + + if (mr_sas_phy->phy_belongs_to_port == 0) + return; + + list_for_each_entry_safe(mr_sas_port, next, &mr_sas_node->sas_port_list, + port_list) { + list_for_each_entry(srch_phy, &mr_sas_port->phy_list, + port_siblings) { + if (srch_phy != mr_sas_phy) + continue; + if ((mr_sas_port->num_phys == 1) && + !mrioc->reset_in_progress) + mpi3mr_delete_sas_port(mrioc, mr_sas_port); + else + mpi3mr_delete_sas_phy(mrioc, mr_sas_port, + mr_sas_phy); + return; + } + } +} + +/** + * mpi3mr_sas_phy_sanity_check - sanity check while adding port + * @mrioc: Adapter instance reference + * @mr_sas_node: Internal sas node object (expander or host) + * @sas_address: SAS address of device/expander + * @hba_port: HBA port entry + * + * Verifies whether the Phys attached to a device with the given + * SAS address already belongs to an existing sas port if so + * will remove those phys from the sas port + * + * Return: None. + */ +static void mpi3mr_sas_port_sanity_check(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *mr_sas_node, u64 sas_address, + struct mpi3mr_hba_port *hba_port) +{ + int i; + + for (i = 0; i < mr_sas_node->num_phys; i++) { + if ((mr_sas_node->phy[i].remote_identify.sas_address != + sas_address) || (mr_sas_node->phy[i].hba_port != hba_port)) + continue; + if (mr_sas_node->phy[i].phy_belongs_to_port == 1) + mpi3mr_del_phy_from_an_existing_port(mrioc, + mr_sas_node, &mr_sas_node->phy[i]); + } +} + +/** + * mpi3mr_sas_port_add - Expose the SAS device to the SAS TL + * @mrioc: Adapter instance reference + * @handle: Firmware device handle of the attached device + * @sas_address_parent: sas address of parent expander or host + * @hba_port: HBA port entry + * + * This function creates a new sas port object for the given end + * device matching sas address and hba_port and adds it to the + * sas_node's sas_port_list and expose the attached sas device + * to the SAS transport layer through sas_rphy_add. + * + * Returns a valid mpi3mr_sas_port reference or NULL. 
+ */ +struct mpi3mr_sas_port * mpi3mr_sas_port_add(struct mpi3mr_ioc *mrioc, + u16 handle, u64 sas_address_parent, struct mpi3mr_hba_port *hba_port) +{ + + struct mpi3mr_sas_phy *mr_sas_phy, *next; + struct mpi3mr_sas_port *mr_sas_port; + unsigned long flags; + struct mpi3mr_sas_node *mr_sas_node; + struct sas_rphy *rphy; + struct mpi3mr_tgt_dev *tgtdev = NULL; + int i; + struct sas_port *port; + + if (!hba_port) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return NULL; + } + + mr_sas_port = kzalloc(sizeof(struct mpi3mr_sas_port), GFP_KERNEL); + if (!mr_sas_port) + return NULL; + + INIT_LIST_HEAD(&mr_sas_port->port_list); + INIT_LIST_HEAD(&mr_sas_port->phy_list); + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + mr_sas_node = __mpi3mr_sas_node_find_by_sas_address(mrioc, + sas_address_parent, hba_port); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (!mr_sas_node) { + ioc_err(mrioc, "%s:could not find parent sas_address(0x%016llx)!\n", + __func__, (unsigned long long)sas_address_parent); + goto out_fail; + } + + if ((mpi3mr_set_identify(mrioc, handle, + &mr_sas_port->remote_identify))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + if (mr_sas_port->remote_identify.device_type == SAS_PHY_UNUSED) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + mr_sas_port->hba_port = hba_port; + mpi3mr_sas_port_sanity_check(mrioc, mr_sas_node, + mr_sas_port->remote_identify.sas_address, hba_port); + + for (i = 0; i < mr_sas_node->num_phys; i++) { + if ((mr_sas_node->phy[i].remote_identify.sas_address != + mr_sas_port->remote_identify.sas_address) || + (mr_sas_node->phy[i].hba_port != hba_port)) + continue; + list_add_tail(&mr_sas_node->phy[i].port_siblings, + &mr_sas_port->phy_list); + mr_sas_port->num_phys++; + mr_sas_port->phy_mask |= (1 << i); + } + + if (!mr_sas_port->num_phys) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + mr_sas_port->lowest_phy = ffs(mr_sas_port->phy_mask) - 1; + + if (mr_sas_port->remote_identify.device_type == SAS_END_DEVICE) { + tgtdev = mpi3mr_get_tgtdev_by_addr(mrioc, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port); + + if (!tgtdev) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + tgtdev->dev_spec.sas_sata_inf.pend_sas_rphy_add = 1; + } + + if (!mr_sas_node->parent_dev) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + port = sas_port_alloc_num(mr_sas_node->parent_dev); + if ((sas_port_add(port))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; + } + + list_for_each_entry(mr_sas_phy, &mr_sas_port->phy_list, + port_siblings) { + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &port->dev, "add: handle(0x%04x)" + ", sas_address(0x%016llx), phy(%d)\n", handle, + (unsigned long long) + mr_sas_port->remote_identify.sas_address, + mr_sas_phy->phy_id); + sas_port_add_phy(port, mr_sas_phy->phy); + mr_sas_phy->phy_belongs_to_port = 1; + mr_sas_phy->hba_port = hba_port; + } + + mr_sas_port->port = port; + if (mr_sas_port->remote_identify.device_type == SAS_END_DEVICE) { + rphy = sas_end_device_alloc(port); + tgtdev->dev_spec.sas_sata_inf.rphy=rphy; + } else { + rphy = sas_expander_alloc(port, + mr_sas_port->remote_identify.device_type); + } + rphy->identify = 
mr_sas_port->remote_identify; + + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + + if ((sas_rphy_add(rphy))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + } + + if (mr_sas_port->remote_identify.device_type == SAS_END_DEVICE) { + tgtdev->dev_spec.sas_sata_inf.pend_sas_rphy_add = 0; + tgtdev->dev_spec.sas_sata_inf.sas_transport_attached = 1; + mpi3mr_tgtdev_put(tgtdev); + } + + dev_printk(KERN_INFO, &rphy->dev, + "%s: added: handle(0x%04x), sas_address(0x%016llx)\n", + __func__, handle, (unsigned long long) + mr_sas_port->remote_identify.sas_address); + + mr_sas_port->rphy = rphy; + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_add_tail(&mr_sas_port->port_list, &mr_sas_node->sas_port_list); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) + mpi3mr_print_discard_event_notice(mrioc, true); + } + + /* fill in report manufacture */ + if (mr_sas_port->remote_identify.device_type == + SAS_EDGE_EXPANDER_DEVICE || + mr_sas_port->remote_identify.device_type == + SAS_FANOUT_EXPANDER_DEVICE) + mpi3mr_report_manufacture(mrioc, + mr_sas_port->remote_identify.sas_address, + rphy_to_expander_device(rphy), hba_port->port_id); + + return mr_sas_port; + + out_fail: + list_for_each_entry_safe(mr_sas_phy, next, &mr_sas_port->phy_list, + port_siblings) + list_del(&mr_sas_phy->port_siblings); + kfree(mr_sas_port); + return NULL; +} + +/** + * mpi3mr_sas_port_remove - remove port from the list + * @mrioc: Adapter instance reference + * @sas_address: SAS address of attached device + * @sas_address_parent: SAS address of parent expander or host + * @hba_port: HBA port entry + * + * Removing object and freeing associated memory from the + * sas_port_list. 
+ * + * Return: None + */ +void mpi3mr_sas_port_remove(struct mpi3mr_ioc *mrioc, u64 sas_address, + u64 sas_address_parent, struct mpi3mr_hba_port *hba_port) +{ + int i; + unsigned long flags; + struct mpi3mr_sas_port *mr_sas_port, *next; + struct mpi3mr_sas_node *mr_sas_node; + u8 found = 0; + struct mpi3mr_sas_phy *mr_sas_phy, *next_phy; + struct mpi3mr_hba_port *srch_port, *hba_port_next=NULL; + + + if (!hba_port) + return; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + mr_sas_node = __mpi3mr_sas_node_find_by_sas_address(mrioc, + sas_address_parent, hba_port); + if (!mr_sas_node) { + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + return; + } + list_for_each_entry_safe(mr_sas_port, next, &mr_sas_node->sas_port_list, + port_list) { + if (mr_sas_port->remote_identify.sas_address != sas_address) + continue; + if (mr_sas_port->hba_port != hba_port) + continue; + found = 1; + list_del(&mr_sas_port->port_list); + goto out; + } + + out: + if (!found) { + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + return; + } + + if (mr_sas_node->host_node) { + list_for_each_entry_safe(srch_port, hba_port_next, + &mrioc->hba_port_table_list, list) { + if (srch_port != hba_port) + continue; + ioc_info(mrioc, + "removing hba_port entry: %p port: %d from hba_port list\n", + srch_port, srch_port->port_id); + list_del(&hba_port->list); + kfree(hba_port); + break; + } + } + + for (i = 0; i < mr_sas_node->num_phys; i++) { + if (mr_sas_node->phy[i].remote_identify.sas_address == + sas_address) + memset(&mr_sas_node->phy[i].remote_identify, 0 , + sizeof(struct sas_identify)); + } + + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (mrioc->current_event) + mrioc->current_event->pending_at_sml = 1; + + list_for_each_entry_safe(mr_sas_phy, next_phy, + &mr_sas_port->phy_list, port_siblings) { + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &mr_sas_port->port->dev, + "remove: sas_address(0x%016llx), phy(%d)\n", + (unsigned long long) + mr_sas_port->remote_identify.sas_address, + mr_sas_phy->phy_id); + mr_sas_phy->phy_belongs_to_port = 0; + if(!mrioc->stop_drv_processing) + sas_port_delete_phy(mr_sas_port->port, + mr_sas_phy->phy); + list_del(&mr_sas_phy->port_siblings); + } + if(!mrioc->stop_drv_processing) + sas_port_delete(mr_sas_port->port); + + ioc_info(mrioc, "%s: removed sas_address(0x%016llx)\n", + __func__, (unsigned long long)sas_address); + + if (mrioc->current_event) { + mrioc->current_event->pending_at_sml = 0; + if (mrioc->current_event->discard) + mpi3mr_print_discard_event_notice(mrioc, false); + } + + kfree(mr_sas_port); +} + +/** + * mpi3mr_add_host_phy - report sas_host phy to SAS transport + * @mrioc: Adapter instance reference + * @mr_sas_phy: Intenal Phy object + * @phy_pg0: SAS phy page 0 + * @parent_dev: Prent device class object + * + * Return: 0 for success, non-zero for failure. 
+ */
+int mpi3mr_add_host_phy(struct mpi3mr_ioc *mrioc,
+	struct mpi3mr_sas_phy *mr_sas_phy, struct mpi3_sas_phy_page0 phy_pg0,
+	struct device *parent_dev)
+{
+	struct sas_phy *phy;
+	int phy_index = mr_sas_phy->phy_id;
+
+	INIT_LIST_HEAD(&mr_sas_phy->port_siblings);
+	phy = sas_phy_alloc(parent_dev, phy_index);
+	if (!phy) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		return -1;
+	}
+	if ((mpi3mr_set_identify(mrioc, mr_sas_phy->handle,
+	    &mr_sas_phy->identify))) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		sas_phy_free(phy);
+		return -1;
+	}
+	phy->identify = mr_sas_phy->identify;
+	mr_sas_phy->attached_handle = le16_to_cpu(phy_pg0.attached_dev_handle);
+	if (mr_sas_phy->attached_handle)
+		mpi3mr_set_identify(mrioc, mr_sas_phy->attached_handle,
+		    &mr_sas_phy->remote_identify);
+	phy->identify.phy_identifier = mr_sas_phy->phy_id;
+	phy->negotiated_linkrate = mpi3mr_convert_phy_link_rate(
+	    (phy_pg0.negotiated_link_rate &
+	    MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+	    MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT);
+	phy->minimum_linkrate_hw = mpi3mr_convert_phy_link_rate(
+	    phy_pg0.hw_link_rate & MPI3_SAS_HWRATE_MIN_RATE_MASK);
+	phy->maximum_linkrate_hw = mpi3mr_convert_phy_link_rate(
+	    phy_pg0.hw_link_rate >> 4);
+	phy->minimum_linkrate = mpi3mr_convert_phy_link_rate(
+	    phy_pg0.programmed_link_rate & MPI3_SAS_PRATE_MIN_RATE_MASK);
+	phy->maximum_linkrate = mpi3mr_convert_phy_link_rate(
+	    phy_pg0.programmed_link_rate >> 4);
+	phy->hostdata = mr_sas_phy->hba_port;
+
+	if ((sas_phy_add(phy))) {
+		ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+		    __FILE__, __LINE__, __func__);
+		sas_phy_free(phy);
+		return -1;
+	}
+	if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO))
+		dev_printk(KERN_INFO, &phy->dev,
+		    "add: handle(0x%04x), sas_address(0x%016llx)\n"
+		    "\tattached_handle(0x%04x), sas_address(0x%016llx)\n",
+		    mr_sas_phy->handle, (unsigned long long)
+		    mr_sas_phy->identify.sas_address,
+		    mr_sas_phy->attached_handle,
+		    (unsigned long long)
+		    mr_sas_phy->remote_identify.sas_address);
+	mr_sas_phy->phy = phy;
+	return 0;
+}
+
+/**
+ * mpi3mr_add_expander_phy - report expander phy to transport
+ * @mrioc: Adapter instance reference
+ * @mr_sas_phy: Internal Phy object
+ * @expander_pg1: SAS Expander page 1
+ * @parent_dev: Parent device class object
+ *
+ * Return: 0 for success, non-zero for failure.
+ */ +int mpi3mr_add_expander_phy(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_phy *mr_sas_phy, + struct mpi3_sas_expander_page1 expander_pg1, + struct device *parent_dev) +{ + struct sas_phy *phy; + int phy_index = mr_sas_phy->phy_id; + + INIT_LIST_HEAD(&mr_sas_phy->port_siblings); + phy = sas_phy_alloc(parent_dev, phy_index); + if (!phy) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + if ((mpi3mr_set_identify(mrioc, mr_sas_phy->handle, + &mr_sas_phy->identify))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + sas_phy_free(phy); + return -1; + } + phy->identify = mr_sas_phy->identify; + mr_sas_phy->attached_handle = + le16_to_cpu(expander_pg1.attached_dev_handle); + if (mr_sas_phy->attached_handle) + mpi3mr_set_identify(mrioc, mr_sas_phy->attached_handle, + &mr_sas_phy->remote_identify); + phy->identify.phy_identifier = mr_sas_phy->phy_id; + phy->negotiated_linkrate = mpi3mr_convert_phy_link_rate( + (expander_pg1.negotiated_link_rate & + MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >> + MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT); + phy->minimum_linkrate_hw = mpi3mr_convert_phy_link_rate( + expander_pg1.hw_link_rate & MPI3_SAS_HWRATE_MIN_RATE_MASK); + phy->maximum_linkrate_hw = mpi3mr_convert_phy_link_rate( + expander_pg1.hw_link_rate >> 4); + phy->minimum_linkrate = mpi3mr_convert_phy_link_rate( + expander_pg1.programmed_link_rate & MPI3_SAS_PRATE_MIN_RATE_MASK); + phy->maximum_linkrate = mpi3mr_convert_phy_link_rate( + expander_pg1.programmed_link_rate >> 4); + phy->hostdata = mr_sas_phy->hba_port; + + if ((sas_phy_add(phy))) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + sas_phy_free(phy); + return -1; + } + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &phy->dev, + "add: handle(0x%04x), sas_address(0x%016llx)\n" + "\tattached_handle(0x%04x), sas_address(0x%016llx)\n", + mr_sas_phy->handle, (unsigned long long) + mr_sas_phy->identify.sas_address, + mr_sas_phy->attached_handle, + (unsigned long long) + mr_sas_phy->remote_identify.sas_address); + mr_sas_phy->phy = phy; + return 0; +} + + +/** + * mpi3mr_update_sas_links - refreshing SAS phy link changes + * @mrioc: Adapter instance reference + * @sas_address_parent: SAS address of parent expander or host + * @handle: Firmware device handle of attached device + * @phy_number: Phy number + * @link_rate: New link rate + * @hba_port: HBA port entry + * + * Return: None. 
+ */ +void mpi3mr_update_links(struct mpi3mr_ioc *mrioc, + u64 sas_address_parent, u16 handle, u8 phy_number, u8 link_rate, + struct mpi3mr_hba_port *hba_port) +{ + unsigned long flags; + struct mpi3mr_sas_node *mr_sas_node; + struct mpi3mr_sas_phy *mr_sas_phy; + + if (mrioc->reset_in_progress) + return; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + mr_sas_node = __mpi3mr_sas_node_find_by_sas_address(mrioc, + sas_address_parent, hba_port); + if (!mr_sas_node) { + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + return; + } + + mr_sas_phy = &mr_sas_node->phy[phy_number]; + mr_sas_phy->attached_handle = handle; + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + if (handle && (link_rate >= MPI3_SAS_NEG_LINK_RATE_1_5)) { + mpi3mr_set_identify(mrioc, handle, + &mr_sas_phy->remote_identify); + mpi3mr_add_phy_to_an_existing_port(mrioc, mr_sas_node, + mr_sas_phy, mr_sas_phy->remote_identify.sas_address, + hba_port); + } else + memset(&mr_sas_phy->remote_identify, 0 , sizeof(struct + sas_identify)); + + if (mr_sas_phy->phy) + mr_sas_phy->phy->negotiated_linkrate = + mpi3mr_convert_phy_link_rate(link_rate); + + if ((mrioc->logging_level & MPI3_DEBUG_TRANSPORT_INFO)) + dev_printk(KERN_INFO, &mr_sas_phy->phy->dev, + "refresh: parent sas_address(0x%016llx),\n" + "\tlink_rate(0x%02x), phy(%d)\n" + "\tattached_handle(0x%04x), sas_address(0x%016llx)\n", + (unsigned long long)sas_address_parent, + link_rate, phy_number, handle, (unsigned long long) + mr_sas_phy->remote_identify.sas_address); +} + +static inline struct mpi3mr_ioc *phy_to_mrioc(struct sas_phy *phy) +{ + struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); + return shost_priv(shost); +} + +static inline struct mpi3mr_ioc *rphy_to_mrioc(struct sas_rphy *rphy) +{ + struct Scsi_Host *shost = dev_to_shost(rphy->dev.parent->parent); + return shost_priv(shost); +} + +/* report phy error log structure */ +struct phy_error_log_request { + u8 smp_frame_type; /* 0x40 */ + u8 function; /* 0x11 */ + u8 allocated_response_length; + u8 request_length; /* 02 */ + u8 reserved_1[5]; + u8 phy_identifier; + u8 reserved_2[2]; +}; + +/* report phy error log reply structure */ +struct phy_error_log_reply { + u8 smp_frame_type; /* 0x41 */ + u8 function; /* 0x11 */ + u8 function_result; + u8 response_length; + __be16 expander_change_count; + u8 reserved_1[3]; + u8 phy_identifier; + u8 reserved_2[2]; + __be32 invalid_dword; + __be32 running_disparity_error; + __be32 loss_of_dword_sync; + __be32 phy_reset_problem; +}; + + +/** + * mpi3mr_get_expander_phy_error_log - return expander counters: + * @mrioc: Adapter instance reference + * @phy: The SAS transport layer phy object + * + * Return: 0 for success, non-zero for failure. 
+ * + */ +static int mpi3mr_get_expander_phy_error_log(struct mpi3mr_ioc *mrioc, + struct sas_phy *phy) +{ + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + struct phy_error_log_request *phy_error_log_request; + struct phy_error_log_reply *phy_error_log_reply; + int rc; + void *psge; + void *data_out = NULL; + dma_addr_t data_out_dma, data_in_dma; + u32 data_out_sz, data_in_sz, sz; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u16 ioc_status; + + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + + data_out_sz = sizeof(struct phy_error_log_request); + data_in_sz = sizeof(struct phy_error_log_reply); + sz = data_out_sz + data_in_sz; + data_out = dma_zalloc_coherent(&mrioc->pdev->dev, sz, &data_out_dma, + GFP_KERNEL); + if (!data_out) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", __FILE__, + __LINE__, __func__); + rc = -ENOMEM; + goto out; + } + + data_in_dma = data_out_dma + data_out_sz; + phy_error_log_reply = data_out + data_out_sz; + + rc = -EINVAL; + memset(data_out, 0, sz); + phy_error_log_request = data_out; + phy_error_log_request->smp_frame_type = 0x40; + phy_error_log_request->function = 0x11; + phy_error_log_request->request_length = 2; + phy_error_log_request->allocated_response_length = 0; + phy_error_log_request->phy_identifier = phy->number; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) mpi3mr_get_port_id_by_sas_phy(phy); + mpi_request.sas_address = cpu_to_le64(phy->identify.sas_address); + + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_out_sz, data_out_dma); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_in_sz, data_in_dma); + + dprint_transport_info(mrioc, + "sending phy error log SMP request to sas_address(0x%016llx), phy_id(%d)\n", + (unsigned long long)phy->identify.sas_address, phy->number); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto out; + + dprint_transport_info(mrioc, + "phy error log SMP request completed with ioc_status(0x%04x)\n", + ioc_status); + + + if (ioc_status == MPI3_IOCSTATUS_SUCCESS) { + dprint_transport_info(mrioc, + "phy error log - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + if (le16_to_cpu(mpi_reply.response_data_length) != + sizeof(struct phy_error_log_reply)) + goto out; + + + dprint_transport_info(mrioc, + "phy error log - function_result(%d)\n", + phy_error_log_reply->function_result); + + phy->invalid_dword_count = + be32_to_cpu(phy_error_log_reply->invalid_dword); + phy->running_disparity_error_count = + be32_to_cpu(phy_error_log_reply->running_disparity_error); + phy->loss_of_dword_sync_count = + be32_to_cpu(phy_error_log_reply->loss_of_dword_sync); + phy->phy_reset_problem_count = + be32_to_cpu(phy_error_log_reply->phy_reset_problem); + rc = 0; + } + +out: + if (data_out) + dma_free_coherent(&mrioc->pdev->dev, sz, data_out, + data_out_dma); + + return rc; +} + + +/** + * mpi3mr_transport_get_linkerrors - return phy error counters + * @phy: The SAS transport layer phy object + * + * This function retrieves the phy 
error log information of the + * HBA or expander for which the phy belongs to + * + * Return: 0 for success, non-zero for failure. + * + */ +static int mpi3mr_transport_get_linkerrors(struct sas_phy *phy) +{ + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_sas_phy_page1 phy_pg1; + int rc = 0; + u16 ioc_status; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) + return mpi3mr_get_expander_phy_error_log(mrioc, phy); + + memset(&phy_pg1, 0, sizeof(struct mpi3_sas_phy_page1)); + /* get hba phy error logs */ + if ((mpi3mr_cfg_get_sas_phy_pg1(mrioc, &ioc_status, &phy_pg1, + sizeof(struct mpi3_sas_phy_page1), + MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, phy->number))) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -ENXIO; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -ENXIO; + } + phy->invalid_dword_count = le32_to_cpu(phy_pg1.invalid_dword_count); + phy->running_disparity_error_count = + le32_to_cpu(phy_pg1.running_disparity_error_count); + phy->loss_of_dword_sync_count = + le32_to_cpu(phy_pg1.loss_dword_synch_count); + phy->phy_reset_problem_count = + le32_to_cpu(phy_pg1.phy_reset_problem_count); + return 0; +} + + +/** + * mpi3mr_transport_get_enclosure_identifier - Get Enclosure ID + * @rphy: The SAS transport layer remote phy object + * @identifier: Enclosure identifier to be returned + * + * Returns the enclosure id for the device pointed by the remote + * phy object. + * + * Return: 0 on success or -ENXIO + */ +static int +mpi3mr_transport_get_enclosure_identifier(struct sas_rphy *rphy, + u64 *identifier) +{ + struct mpi3mr_ioc *mrioc = rphy_to_mrioc(rphy); + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if (tgtdev) { + *identifier = + tgtdev->enclosure_logical_id; + rc = 0; + mpi3mr_tgtdev_put(tgtdev); + } else { + *identifier = 0; + rc = -ENXIO; + } + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + return rc; +} + +/** + * mpi3mr_transport_get_bay_identifier - Get bay ID + * @rphy: The SAS transport layer remote phy object + * + * Returns the slot id for the device pointed by the remote phy + * object. 
+ * + * Return: Valid slot ID on success or -ENXIO + */ +static int +mpi3mr_transport_get_bay_identifier(struct sas_rphy *rphy) +{ + + struct mpi3mr_ioc *mrioc = rphy_to_mrioc(rphy); + struct mpi3mr_tgt_dev *tgtdev = NULL; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mrioc->tgtdev_lock, flags); + tgtdev = __mpi3mr_get_tgtdev_by_addr_and_rphy(mrioc, + rphy->identify.sas_address, rphy); + if (tgtdev) { + rc = tgtdev->slot; + mpi3mr_tgtdev_put(tgtdev); + } else + rc = -ENXIO; + spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); + + return rc; +} + +/* phy control request structure */ +struct phy_control_request { + u8 smp_frame_type; /* 0x40 */ + u8 function; /* 0x91 */ + u8 allocated_response_length; + u8 request_length; /* 0x09 */ + u16 expander_change_count; + u8 reserved_1[3]; + u8 phy_identifier; + u8 phy_operation; + u8 reserved_2[13]; + u64 attached_device_name; + u8 programmed_min_physical_link_rate; + u8 programmed_max_physical_link_rate; + u8 reserved_3[6]; +}; + +/* phy control reply structure */ +struct phy_control_reply { + u8 smp_frame_type; /* 0x41 */ + u8 function; /* 0x11 */ + u8 function_result; + u8 response_length; +}; + +#define SMP_PHY_CONTROL_LINK_RESET (0x01) +#define SMP_PHY_CONTROL_HARD_RESET (0x02) +#define SMP_PHY_CONTROL_DISABLE (0x03) + +/** + * mpi3mr_expander_phy_control - expander phy control + * @mrioc: Adapter instance reference + * @phy: The SAS transport layer phy object + * @phy_operation: The phy operation to be executed + * + * Issues SMP passthru phy control reuest to execute a specific + * phy operation for a given expander device. + * + * Return: 0 for success, non-zero for failure. + * + */ +static int +mpi3mr_expander_phy_control(struct mpi3mr_ioc *mrioc, + struct sas_phy *phy, u8 phy_operation) +{ + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + struct phy_control_request *phy_control_request; + struct phy_control_reply *phy_control_reply; + int rc; + void *psge; + void *data_out = NULL; + dma_addr_t data_out_dma; + dma_addr_t data_in_dma; + size_t data_in_sz; + size_t data_out_sz; + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u16 ioc_status; + u16 sz; + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + return -EFAULT; + } + + + data_out_sz = sizeof(struct phy_control_request); + data_in_sz = sizeof(struct phy_control_reply); + sz = data_out_sz + data_in_sz; + data_out = dma_zalloc_coherent(&mrioc->pdev->dev, sz, &data_out_dma, + GFP_KERNEL); + if (!data_out) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", __FILE__, + __LINE__, __func__); + rc = -ENOMEM; + goto out; + } + + data_in_dma = data_out_dma + data_out_sz; + phy_control_reply = data_out + data_out_sz; + + rc = -EINVAL; + memset(data_out, 0, sz); + + phy_control_request = data_out; + phy_control_request->smp_frame_type = 0x40; + phy_control_request->function = 0x91; + phy_control_request->request_length = 9; + phy_control_request->allocated_response_length = 0; + phy_control_request->phy_identifier = phy->number; + phy_control_request->phy_operation = phy_operation; + phy_control_request->programmed_min_physical_link_rate = + phy->minimum_linkrate << 4; + phy_control_request->programmed_max_physical_link_rate = + phy->maximum_linkrate << 4; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = 
MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) mpi3mr_get_port_id_by_sas_phy(phy); + mpi_request.sas_address = cpu_to_le64(phy->identify.sas_address); + + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_out_sz, data_out_dma); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, data_in_sz, data_in_dma); + + dprint_transport_info(mrioc, + "sending phy control SMP request to sas_address(0x%016llx), phy_id(%d) opcode(%d)\n", + (unsigned long long)phy->identify.sas_address, phy->number, + phy_operation); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto out; + + dprint_transport_info(mrioc, + "phy control SMP request completed with ioc_status(0x%04x)\n", + ioc_status); + + + if (ioc_status == MPI3_IOCSTATUS_SUCCESS) { + dprint_transport_info(mrioc, + "phy control - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + if (le16_to_cpu(mpi_reply.response_data_length) != + sizeof(struct phy_control_reply)) + goto out; + dprint_transport_info(mrioc, + "phy control - function_result(%d)\n", + phy_control_reply->function_result); + rc = 0; + } + out: + if (data_out) + dma_free_coherent(&mrioc->pdev->dev, sz, data_out, + data_out_dma); + + return rc; +} + +/** + * mpi3mr_transport_phy_reset - Reset a given phy + * @phy: The SAS transport layer phy object + * @hard_reset: Flag to indicate the type of reset + * + * Return: 0 for success, non-zero for failure. + */ +static int +mpi3mr_transport_phy_reset(struct sas_phy *phy, int hard_reset) +{ + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_iounit_control_request mpi_request; + struct mpi3_iounit_control_reply mpi_reply; + u16 request_sz = sizeof(struct mpi3_iounit_control_request); + u16 reply_sz = sizeof(struct mpi3_iounit_control_reply); + int rc = 0; + u16 ioc_status; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + /* handle expander phys */ + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) + return mpi3mr_expander_phy_control(mrioc, phy, + (hard_reset == 1) ? SMP_PHY_CONTROL_HARD_RESET : + SMP_PHY_CONTROL_LINK_RESET); + + /* handle hba phys */ + memset(&mpi_request, 0, request_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_IO_UNIT_CONTROL; + mpi_request.operation = MPI3_CTRL_OP_SAS_PHY_CONTROL; + mpi_request.param8[MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_ACTION_INDEX] = + (hard_reset ? 
MPI3_CTRL_ACTION_HARD_RESET : + MPI3_CTRL_ACTION_LINK_RESET); + mpi_request.param8[MPI3_CTRL_OP_SAS_PHY_CONTROL_PARAM8_PHY_INDEX] = + phy->number; + + dprint_transport_info(mrioc, + "sending phy reset request to sas_address(0x%016llx), phy_id(%d) hard_reset(%d)\n", + (unsigned long long)phy->identify.sas_address, phy->number, + hard_reset); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) { + rc = -EAGAIN; + goto out; + } + + dprint_transport_info(mrioc, + "phy reset request completed with ioc_status(0x%04x)\n", + ioc_status); +out: + return rc; +} + +/** + * mpi3mr_transport_phy_enable - enable/disable phys + * @phy: The SAS transport layer phy object + * @enable: flag to enable/disable, enable phy when true + * + * This function enables/disables a given by executing required + * configuration page changes or expander phy control command + * + * Return: 0 for success, non-zero for failure. + */ +static int +mpi3mr_transport_phy_enable(struct sas_phy *phy, int enable) +{ + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1 = NULL; + u16 sz; + int rc = 0; + int i, discovery_active; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + /* handle expander phys */ + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) + return mpi3mr_expander_phy_control(mrioc, phy, + (enable == 1) ? SMP_PHY_CONTROL_LINK_RESET : + SMP_PHY_CONTROL_DISABLE); + + /* handle hba phys */ + + /* read sas_iounit page 0 */ + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) { + rc = -ENOMEM; + goto out; + } + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + /* unable to enable/disable phys when when discovery is active */ + for (i = 0, discovery_active = 0; i < mrioc->sas_hba.num_phys ; i++) { + if (sas_io_unit_pg0->phy_data[i].port_flags & + MPI3_SASIOUNIT0_PORTFLAGS_DISC_IN_PROGRESS) { + ioc_err(mrioc, "discovery is active on " + "port = %d, phy = %d: unable to enable/disable " + "phys, try again later!\n", + sas_io_unit_pg0->phy_data[i].io_unit_port, i); + discovery_active = 1; + } + } + + if (discovery_active) { + rc = -EAGAIN; + goto out; + } + + if ((sas_io_unit_pg0->phy_data[phy->number].phy_flags & + ( MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY | + MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY))) + { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + + /* read sas_iounit page 1 */ + sz = offsetof(struct mpi3_sas_io_unit_page1, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit1_phy_data)); + sas_io_unit_pg1 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg1) { + rc = -ENOMEM; + goto out; + } + + if (mpi3mr_cfg_get_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + if (enable) + sas_io_unit_pg1->phy_data[phy->number].phy_flags + &= ~MPI3_SASIOUNIT1_PHYFLAGS_PHY_DISABLE; + else + sas_io_unit_pg1->phy_data[phy->number].phy_flags + |= MPI3_SASIOUNIT1_PHYFLAGS_PHY_DISABLE; + + mpi3mr_cfg_set_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz); + + /* link reset */ + if 
(enable) + mpi3mr_transport_phy_reset(phy, 0); + + out: + kfree(sas_io_unit_pg1); + kfree(sas_io_unit_pg0); + return rc; +} + +/** + * mpi3mr_transport_phy_speed - set phy min/max speed + * @phy: The SAS transport later phy object + * @rates: Rates defined as in sas_phy_linkrates + * + * This function sets the the link rates given in the rates + * argument to the given phy by executing required configuration + * page changes or expander phy control command + * + * Return: 0 for success, non-zero for failure. + */ +static int +mpi3mr_transport_phy_speed(struct sas_phy *phy, struct sas_phy_linkrates *rates) +{ + + struct mpi3mr_ioc *mrioc = phy_to_mrioc(phy); + struct mpi3_sas_io_unit_page1 *sas_io_unit_pg1 = NULL; + struct mpi3_sas_phy_page0 phy_pg0; + u16 sz, ioc_status; + int rc = 0; + + rc = mpi3mr_parent_present(mrioc, phy); + if (rc) + return rc; + + if (!rates->minimum_linkrate) + rates->minimum_linkrate = phy->minimum_linkrate; + else if (rates->minimum_linkrate < phy->minimum_linkrate_hw) + rates->minimum_linkrate = phy->minimum_linkrate_hw; + + if (!rates->maximum_linkrate) + rates->maximum_linkrate = phy->maximum_linkrate; + else if (rates->maximum_linkrate > phy->maximum_linkrate_hw) + rates->maximum_linkrate = phy->maximum_linkrate_hw; + + /* handle expander phys */ + if (phy->identify.sas_address != mrioc->sas_hba.sas_address) { + phy->minimum_linkrate = rates->minimum_linkrate; + phy->maximum_linkrate = rates->maximum_linkrate; + return mpi3mr_expander_phy_control(mrioc, phy, + SMP_PHY_CONTROL_LINK_RESET); + } + + /* handle hba phys */ + + /* sas_iounit page 1 */ + sz = offsetof(struct mpi3_sas_io_unit_page1, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit1_phy_data)); + sas_io_unit_pg1 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg1) { + rc = -ENOMEM; + goto out; + } + + if (mpi3mr_cfg_get_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + sas_io_unit_pg1->phy_data[phy->number].max_min_link_rate = + (rates->minimum_linkrate + (rates->maximum_linkrate << 4)); + + if (mpi3mr_cfg_set_sas_io_unit_pg1(mrioc, sas_io_unit_pg1, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + rc = -ENXIO; + goto out; + } + + /* link reset */ + mpi3mr_transport_phy_reset(phy, 0); + + /* read phy page 0, then update the rates in the sas transport phy */ + if (!mpi3mr_cfg_get_sas_phy_pg0(mrioc, &ioc_status, &phy_pg0, + sizeof(struct mpi3_sas_phy_page0), + MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, phy->number) && + (ioc_status == MPI3_IOCSTATUS_SUCCESS)) { + phy->minimum_linkrate = mpi3mr_convert_phy_link_rate( + phy_pg0.programmed_link_rate & + MPI3_SAS_PRATE_MIN_RATE_MASK); + phy->maximum_linkrate = mpi3mr_convert_phy_link_rate( + phy_pg0.programmed_link_rate >> 4); + phy->negotiated_linkrate = + mpi3mr_convert_phy_link_rate( + (phy_pg0.negotiated_link_rate & + MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) + >> MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT); + } + +out: + kfree(sas_io_unit_pg1); + return rc; +} + + +/** + * mpi3mr_map_smp_buffer - map BSG dma buffer + * @dev: Generic device reference + * @buf: BSG buffer pointer + * @dma_addr: Phyiscal address holder + * @dma_len: Mapped DMA buffer length. 
+ * @p: Virtual address holder + * + * This function maps the DMAable buffer + * + * Return: 0 on success, non-zero on failure + */ + +static int +mpi3mr_map_smp_buffer(struct device *dev, struct bsg_buffer *buf, + dma_addr_t *dma_addr, size_t *dma_len, void **p) +{ + /* Check if the request is split across multiple segments */ + if (buf->sg_cnt > 1) { + *p = dma_zalloc_coherent(dev, buf->payload_len, dma_addr, + GFP_KERNEL); + if (!*p) + return -ENOMEM; + *dma_len = buf->payload_len; + } else { + if (!dma_map_sg(dev, buf->sg_list, 1, DMA_BIDIRECTIONAL)) + return -ENOMEM; + *dma_addr = sg_dma_address(buf->sg_list); + *dma_len = sg_dma_len(buf->sg_list); + *p = NULL; + } + + return 0; +} + +/** + * mpi3mr_unmap_smp_buffer - unmap BSG dma buffer + * @dev: Generic device reference + * @buf: BSG buffer pointer + * @dma_addr: Phyiscal address to be unmapped + * @p: Virtual address + * + * This function unmaps the DMAable buffer + */ + +static void +mpi3mr_unmap_smp_buffer(struct device *dev, struct bsg_buffer *buf, + dma_addr_t dma_addr, void *p) +{ + if (p) + dma_free_coherent(dev, buf->payload_len, p, dma_addr); + else + dma_unmap_sg(dev, buf->sg_list, 1, DMA_BIDIRECTIONAL); +} + +/** + * mpi3mr_transport_smp_handler - handler for smp passthru + * @job: BSG job reference + * @shost: SCSI host object reference + * @rphy: SAS transport rphy object pointing the expander + * + * This is used primarily by smp utils for sending the SMP + * commands to the expanders attached to the controller + */ +static void +mpi3mr_transport_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) +{ + + struct mpi3mr_ioc *mrioc = shost_priv(shost); + struct mpi3_smp_passthrough_request mpi_request; + struct mpi3_smp_passthrough_reply mpi_reply; + int rc; + void *psge; + dma_addr_t dma_addr_in; + dma_addr_t dma_addr_out; + void *addr_in = NULL; + void *addr_out = NULL; + size_t dma_len_in; + size_t dma_len_out; + unsigned int reslen = 0; + u16 request_sz = sizeof(struct mpi3_smp_passthrough_request); + u16 reply_sz = sizeof(struct mpi3_smp_passthrough_reply); + u8 sgl_flags = MPI3MR_SGEFLAGS_SYSTEM_SIMPLE_END_OF_LIST; + u16 ioc_status; + + + if (mrioc->reset_in_progress) { + ioc_err(mrioc, "%s: host reset in progress!\n", __func__); + rc = -EFAULT; + goto out; + } + + rc = mpi3mr_map_smp_buffer(&mrioc->pdev->dev, &job->request_payload, + &dma_addr_out, &dma_len_out, &addr_out); + if (rc) + goto out; + + if (addr_out) + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, addr_out, + job->request_payload.payload_len); + + rc = mpi3mr_map_smp_buffer(&mrioc->pdev->dev, &job->reply_payload, + &dma_addr_in, &dma_len_in, &addr_in); + if (rc) + goto unmap_out; + + memset(&mpi_request, 0, request_sz); + memset(&mpi_reply, 0, reply_sz); + mpi_request.host_tag = MPI3MR_HOSTTAG_TRANSPORT_CMDS; + mpi_request.function = MPI3_FUNCTION_SMP_PASSTHROUGH; + mpi_request.io_unit_port = (u8) mpi3mr_get_port_id_by_rphy(mrioc, rphy); + mpi_request.sas_address = ((rphy) ? 
+ cpu_to_le64(rphy->identify.sas_address) : + cpu_to_le64(mrioc->sas_hba.sas_address)); + psge = &mpi_request.request_sge; + mpi3mr_add_sg_single(psge, sgl_flags, dma_len_out - 4, dma_addr_out); + + psge = &mpi_request.response_sge; + mpi3mr_add_sg_single(psge, sgl_flags, dma_len_in - 4, dma_addr_in); + + + dprint_transport_info(mrioc, "sending SMP request \n"); + + if (mpi3mr_post_transport_req(mrioc, &mpi_request, request_sz, + &mpi_reply, reply_sz, MPI3MR_INTADMCMD_TIMEOUT, &ioc_status)) + goto unmap_in; + + dprint_transport_info(mrioc, + "SMP request completed with ioc_status(0x%04x)\n", ioc_status); + + + dprint_transport_info(mrioc, + "SMP request - reply data transfer size(%d)\n", + le16_to_cpu(mpi_reply.response_data_length)); + + memcpy(job->reply, &mpi_reply, reply_sz); + job->reply_len = reply_sz; + reslen = le16_to_cpu(mpi_reply.response_data_length); + + if (addr_in) + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, addr_in, + job->reply_payload.payload_len); + + rc = 0; +unmap_in: + mpi3mr_unmap_smp_buffer(&mrioc->pdev->dev, &job->reply_payload, + dma_addr_in, addr_in); +unmap_out: + mpi3mr_unmap_smp_buffer(&mrioc->pdev->dev, &job->request_payload, + dma_addr_out, addr_out); +out: + bsg_job_done(job, rc, reslen); + +} + + + +struct sas_function_template mpi3mr_transport_functions = { + .get_linkerrors = mpi3mr_transport_get_linkerrors, + .get_enclosure_identifier = mpi3mr_transport_get_enclosure_identifier, + .get_bay_identifier = mpi3mr_transport_get_bay_identifier, + .phy_reset = mpi3mr_transport_phy_reset, + .phy_enable = mpi3mr_transport_phy_enable, + .set_phy_speed = mpi3mr_transport_phy_speed, + .smp_handler = mpi3mr_transport_smp_handler, +}; + +struct scsi_transport_template *mpi3mr_transport_template; + +/** + * struct host_port - host port details + * @sas_address: SAS Address of the attached device + * @phy_mask: phy mask of host port + * @handle: Device Handle of attached device + * @iounit_port_id: port ID + * @used: host port is already matched with sas port from sas_port_list + * lowest_phy: lowest phy ID of host port + */ +struct host_port { + u64 sas_address; + u32 phy_mask; + u16 handle; + u8 iounit_port_id; + u8 used; + u8 lowest_phy; +}; + + +/** + * mpi3mr_update_mr_sas_port - update sas port objects during reset + * @mrioc: Adapter instance reference + * @h_port: host_port object + * @mr_sas_port: sas_port objects which needs to be updated + * + * Update the port ID of sas port object. Also add the phys if new phys got + * added to current sas port and remove the phys if some phys are moved + * out of the current sas port. + * + * Return: Nothing. 
+ */ +void +mpi3mr_update_mr_sas_port(struct mpi3mr_ioc *mrioc, struct host_port *h_port, + struct mpi3mr_sas_port *mr_sas_port) +{ + struct mpi3mr_sas_phy *mr_sas_phy; + u32 phy_mask_xor, phys_to_be_added, phys_to_be_removed; + int i; + + h_port->used = 1; + mr_sas_port->marked_responding = 1; + + dev_printk(KERN_INFO, &mr_sas_port->port->dev, + "sas_address(0x%016llx), old: port_id %d phy_mask 0x%x, new: port_id %d phy_mask:0x%x\n", + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port->port_id, mr_sas_port->phy_mask, + h_port->iounit_port_id, h_port->phy_mask); + + mr_sas_port->hba_port->port_id = h_port->iounit_port_id; + mr_sas_port->hba_port->flags &= ~MPI3MR_HBA_PORT_FLAG_DIRTY; + + /* Get the newly added phys bit map & removed phys bit map */ + phy_mask_xor = mr_sas_port->phy_mask ^ h_port->phy_mask; + phys_to_be_added = h_port->phy_mask & phy_mask_xor; + phys_to_be_removed = mr_sas_port->phy_mask & phy_mask_xor; + + /* Register these new phys to current mr_sas_port's port. + * if these phys are previously registered with another port + * then delete these phys from that port first. + */ + for_each_set_bit(i, (ulong *) &phys_to_be_added, BITS_PER_TYPE(u32)) { + mr_sas_phy = &mrioc->sas_hba.phy[i]; + if (mr_sas_phy->phy_belongs_to_port) + mpi3mr_del_phy_from_an_existing_port(mrioc, + &mrioc->sas_hba, mr_sas_phy); + mpi3mr_add_phy_to_an_existing_port(mrioc, + &mrioc->sas_hba, mr_sas_phy, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port); + } + + /* Delete the phys which are not part of current mr_sas_port's port. */ + for_each_set_bit(i, (ulong *) &phys_to_be_removed, BITS_PER_TYPE(u32)) { + mr_sas_phy = &mrioc->sas_hba.phy[i]; + if (mr_sas_phy->phy_belongs_to_port) + mpi3mr_del_phy_from_an_existing_port(mrioc, + &mrioc->sas_hba, mr_sas_phy); + } +} + +/** + * mpi3mr_refresh_sas_ports - update host's sas ports during reset + * @mrioc: Adapter instance reference + * + * Update the host's sas ports during reset by checking whether + * sas ports are still intact or not. Add/remove phys if any hba + * phys are (moved in)/(moved out) of sas port. Also update + * io_unit_port if it got changed during reset. + * + * Return: Nothing. 
+ */ +void +mpi3mr_refresh_sas_ports(struct mpi3mr_ioc *mrioc) +{ + struct host_port h_port[32]; + int i, j, found, host_port_count = 0, port_idx, num_phys; + u16 sz, attached_handle, ioc_status; + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + struct mpi3_device_page0 dev_pg0; + struct mpi3_device0_sas_sata_format *sasinf; + struct mpi3mr_sas_port *mr_sas_port; + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + /* Create a new expander port table */ + num_phys = min_t(int, + mrioc->sas_hba.num_phys, MPI3MR_MAX_PHYSICAL_PHYS); + for (i = 0; i < num_phys; i++) { + attached_handle = le16_to_cpu( + sas_io_unit_pg0->phy_data[i].attached_dev_handle); + if (!attached_handle) + continue; + found = 0; + for (j = 0; j < host_port_count; j++) { + if (h_port[j].handle == attached_handle) { + h_port[j].phy_mask |= (1 << i); + found = 1; + break; + } + } + if (found) + continue; + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0, + sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, + attached_handle))) { + dprint_reset(mrioc, + "failed to read dev_pg0 for handle(0x%04x) at %s:%d/%s()!\n", + attached_handle, __FILE__, __LINE__, __func__); + continue; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_reset(mrioc, + "ioc_status(0x%x) while reading dev_pg0 for handle(0x%04x) at %s:%d/%s()!\n", + ioc_status, attached_handle, + __FILE__, __LINE__, __func__); + continue; + } + sasinf = &dev_pg0.device_specific.sas_sata_format; + + port_idx = host_port_count; + h_port[port_idx].sas_address = le64_to_cpu(sasinf->sas_address); + h_port[port_idx].handle = attached_handle; + h_port[port_idx].phy_mask = (1 << i); + h_port[port_idx].iounit_port_id = sas_io_unit_pg0->phy_data[i].io_unit_port; + h_port[port_idx].lowest_phy = sasinf->phy_num; + h_port[port_idx].used = 0; + host_port_count++; + } + + if (!host_port_count) + goto out; + + if (mrioc->logging_level & MPI3_DEBUG_RESET) { + ioc_info(mrioc, "Host port details before reset\n"); + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + ioc_info(mrioc, + "port_id:%d, sas_address:(0x%016llx), phy_mask:(0x%x), lowest phy id:%d\n", + mr_sas_port->hba_port->port_id, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->phy_mask, mr_sas_port->lowest_phy); + } + mr_sas_port = NULL; + ioc_info(mrioc, "Host port details after reset\n"); + for (i = 0; i < host_port_count; i++) { + ioc_info(mrioc, + "port_id:%d, sas_address:(0x%016llx), phy_mask:(0x%x), lowest phy id:%d\n", + h_port[i].iounit_port_id, h_port[i].sas_address, + h_port[i].phy_mask, h_port[i].lowest_phy); + } + } + + /* mark all host sas port entries as dirty */ + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + mr_sas_port->marked_responding = 0; + mr_sas_port->hba_port->flags |= MPI3MR_HBA_PORT_FLAG_DIRTY; + } + + /* First check for matching lowest phy */ + for (i = 0; i < host_port_count; i++) { + mr_sas_port = NULL; + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + if (mr_sas_port->marked_responding) + continue; + if (h_port[i].sas_address != mr_sas_port->remote_identify.sas_address) + continue; + if (h_port[i].lowest_phy == mr_sas_port->lowest_phy) { + 
mpi3mr_update_mr_sas_port(mrioc, &h_port[i], mr_sas_port); + break; + } + } + } + + /* In case if lowest phy is got enabled or disabled during reset */ + for (i = 0; i < host_port_count; i++) { + if (h_port[i].used) + continue; + mr_sas_port = NULL; + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + if (mr_sas_port->marked_responding) + continue; + if (h_port[i].sas_address != mr_sas_port->remote_identify.sas_address) + continue; + if (h_port[i].phy_mask & mr_sas_port->phy_mask) { + mpi3mr_update_mr_sas_port(mrioc, &h_port[i], mr_sas_port); + break; + } + } + } + + /* In case if expander cable is removed & connected to another HBA port during reset */ + for (i = 0; i < host_port_count; i++) { + if (h_port[i].used) + continue; + mr_sas_port = NULL; + list_for_each_entry(mr_sas_port, &mrioc->sas_hba.sas_port_list, + port_list) { + if (mr_sas_port->marked_responding) + continue; + if (h_port[i].sas_address != mr_sas_port->remote_identify.sas_address) + continue; + mpi3mr_update_mr_sas_port(mrioc, &h_port[i], mr_sas_port); + break; + } + } +out: + kfree(sas_io_unit_pg0); +} + +/** + * mpi3mr_refresh_expanders - Refresh expander device exposure + * @mrioc: Adapter instance reference + * + * This is executed post controller reset to identify any + * missing expander devices during reset and remove from the upper layers + * or expose any newly detected expander device to the upper layers. + * + * Return: Nothing. + */ +void +mpi3mr_refresh_expanders(struct mpi3mr_ioc *mrioc) +{ + struct mpi3mr_sas_node *sas_expander, *sas_expander_next; + struct mpi3_sas_expander_page0 expander_pg0; + u16 ioc_status, handle; + u64 sas_address; + int i; + unsigned long flags; + struct mpi3mr_hba_port *hba_port; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_for_each_entry(sas_expander, &mrioc->sas_expander_list, list) { + sas_expander->non_responding = 1; + } + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + sas_expander = NULL; + + handle = 0xffff; + + /* Search for responding expander devices and add them if they are newly got added */ + while (true) { + if ((mpi3mr_cfg_get_sas_exp_pg0(mrioc, &ioc_status, &expander_pg0, + sizeof(struct mpi3_sas_expander_page0), + MPI3_SAS_EXPAND_PGAD_FORM_GET_NEXT_HANDLE, handle))) { + dprint_reset(mrioc, + "failed to read exp pg0 for handle(0x%04x) at %s:%d/%s()!\n", + handle, __FILE__, __LINE__, __func__); + break; + } + + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + dprint_reset(mrioc, + "ioc_status(0x%x) while reading exp pg0 for handle:(0x%04x), %s:%d/%s()!\n", + ioc_status, handle, __FILE__, __LINE__, __func__); + break; + } + + handle = le16_to_cpu(expander_pg0.dev_handle); + sas_address = le64_to_cpu(expander_pg0.sas_address); + hba_port = mpi3mr_get_hba_port_by_id(mrioc, expander_pg0.io_unit_port, 0); + + if (!hba_port) { + mpi3mr_sas_host_refresh(mrioc); + mpi3mr_expander_add(mrioc, handle); + continue; + } + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + sas_expander = + mpi3mr_expander_find_by_sas_address(mrioc, + sas_address, hba_port); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + if (!sas_expander) { + mpi3mr_sas_host_refresh(mrioc); + mpi3mr_expander_add(mrioc, handle); + continue; + } + + sas_expander->non_responding = 0; + if (sas_expander->handle == handle) + continue; + + sas_expander->handle = handle; + for (i = 0 ; i < sas_expander->num_phys ; i++) + sas_expander->phy[i].handle = handle; + } + + /* Delete non responding expander devices and the corresponding hba_port ( if + * the 
non responding expander device's parent device is host node.
+ */
+
+ sas_expander = NULL;
+ spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+ list_for_each_entry_safe_reverse(sas_expander, sas_expander_next,
+ &mrioc->sas_expander_list, list) {
+ if (sas_expander->non_responding) {
+ spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+ mpi3mr_expander_node_remove(mrioc, sas_expander);
+ spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+}
+
+/**
+ * mpi3mr_expander_add - Create expander object
+ * @mrioc: Adapter instance reference
+ * @handle: Expander firmware device handle
+ *
+ * This funcion creating expander object, stored in
+ * sas_expander_list and expose it to the SAS transport
+ * layer.
+ *
+ * Return: 0 for success, non-zero for failure.
+ */
+int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle)
+{
+ struct mpi3mr_sas_node *sas_expander;
+ struct mpi3mr_enclosure_node *enclosure_dev;
+ struct mpi3_sas_expander_page0 expander_pg0;
+ struct mpi3_sas_expander_page1 expander_pg1;
+ u16 ioc_status, parent_handle, temp_handle;
+ u64 sas_address, sas_address_parent = 0;
+ int i;
+ unsigned long flags;
+ u8 port_id, link_rate;
+ struct mpi3mr_sas_port *mr_sas_port = NULL;
+ struct mpi3mr_hba_port *hba_port;
+ u32 phynum_handle;
+
+ int rc = 0;
+
+ if (!handle)
+ return -1;
+
+ if (mrioc->reset_in_progress)
+ return -1;
+
+ if ((mpi3mr_cfg_get_sas_exp_pg0(mrioc, &ioc_status, &expander_pg0,
+ sizeof(expander_pg0), MPI3_SAS_EXPAND_PGAD_FORM_HANDLE, handle))) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ return -1;
+ }
+
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ return -1;
+ }
+
+ parent_handle = le16_to_cpu(expander_pg0.parent_dev_handle);
+ if (mpi3mr_get_sas_address(mrioc, parent_handle, &sas_address_parent)
+ != 0) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ return -1;
+ }
+
+ port_id = expander_pg0.io_unit_port;
+ hba_port = mpi3mr_get_hba_port_by_id(mrioc,port_id, 0);
+ if (!hba_port) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ return -1;
+ }
+ if (sas_address_parent != mrioc->sas_hba.sas_address) {
+ spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+ sas_expander =
+ mpi3mr_expander_find_by_sas_address(mrioc,
+ sas_address_parent, hba_port);
+ spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+ if (!sas_expander) {
+ rc = mpi3mr_expander_add(mrioc, parent_handle);
+ if (rc != 0)
+ return rc;
+ } else {
+ /*When there is a parent expander present, update it's
+ phys where child expander is connected with the link
+ speed, attached dev handle and sas address*/
+ for (i = 0 ; i < sas_expander->num_phys ; i++) {
+ phynum_handle =
+ (i << MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT) |
+ parent_handle;
+ if ((mpi3mr_cfg_get_sas_exp_pg1(mrioc,
+ &ioc_status, &expander_pg1,
+ sizeof(expander_pg1),
+ MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM,
+ phynum_handle))) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ return rc;
+ }
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ return rc;
+ }
+ temp_handle = le16_to_cpu(
+ expander_pg1.attached_dev_handle);
+ if (temp_handle != handle)
+ continue;
+ link_rate = (expander_pg1.negotiated_link_rate &
+ MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+ MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT;
+ mpi3mr_update_links(mrioc, sas_address_parent,
+ handle, i, link_rate, hba_port);
+ }
+ }
+ }
+
+ spin_lock_irqsave(&mrioc->sas_node_lock, flags);
+ sas_address = le64_to_cpu(expander_pg0.sas_address);
+ sas_expander = mpi3mr_expander_find_by_sas_address(mrioc,
+ sas_address, hba_port);
+ spin_unlock_irqrestore(&mrioc->sas_node_lock, flags);
+
+ if (sas_expander)
+ return 0;
+
+ sas_expander = kzalloc(sizeof(struct mpi3mr_sas_node),
+ GFP_KERNEL);
+ if (!sas_expander) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ return -1;
+ }
+
+ sas_expander->handle = handle;
+ sas_expander->num_phys =
expander_pg0.num_phys;
+ sas_expander->sas_address_parent = sas_address_parent;
+ sas_expander->sas_address = sas_address;
+ sas_expander->hba_port = hba_port;
+
+ ioc_info(mrioc, "expander_add: handle(0x%04x),"
+ " parent(0x%04x), sas_addr(0x%016llx), phys(%d)\n",
+ handle, parent_handle, (unsigned long long)
+ sas_expander->sas_address, sas_expander->num_phys);
+
+ if (!sas_expander->num_phys) {
+ rc = -1;
+ goto out_fail;
+ }
+ sas_expander->phy = kcalloc(sas_expander->num_phys,
+ sizeof(struct mpi3mr_sas_phy), GFP_KERNEL);
+ if (!sas_expander->phy) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ goto out_fail;
+ }
+
+ INIT_LIST_HEAD(&sas_expander->sas_port_list);
+ mr_sas_port = mpi3mr_sas_port_add(mrioc, handle, sas_address_parent,
+ sas_expander->hba_port);
+ if (!mr_sas_port) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ goto out_fail;
+ }
+ sas_expander->parent_dev = &mr_sas_port->rphy->dev;
+ sas_expander->rphy = mr_sas_port->rphy;
+
+ for (i = 0 ; i < sas_expander->num_phys ; i++) {
+ phynum_handle = (i << MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT) |
+ handle;
+ if ((mpi3mr_cfg_get_sas_exp_pg1(mrioc, &ioc_status,
+ &expander_pg1, sizeof(expander_pg1),
+ MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM,
+ phynum_handle))) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ goto out_fail;
+ }
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ goto out_fail;
+ }
+
+ sas_expander->phy[i].handle = handle;
+ sas_expander->phy[i].phy_id = i;
+ sas_expander->phy[i].hba_port = hba_port;
+
+ if ((mpi3mr_add_expander_phy(mrioc, &sas_expander->phy[i],
+ expander_pg1, sas_expander->parent_dev))) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ rc = -1;
+ goto out_fail;
+ }
+ }
+
+ if (sas_expander->enclosure_handle) {
+ enclosure_dev =
+ mpi3mr_enclosure_find_by_handle(mrioc,
+ sas_expander->enclosure_handle);
+ if (enclosure_dev)
+ sas_expander->enclosure_logical_id = le64_to_cpu(
+ enclosure_dev->pg0.enclosure_logical_id);
+ }
+
+ mpi3mr_expander_node_add(mrioc, sas_expander);
+ return 0;
+
+out_fail:
+
+ if (mr_sas_port)
+ mpi3mr_sas_port_remove(mrioc,
+ sas_expander->sas_address,
+ sas_address_parent, sas_expander->hba_port);
+ if (sas_expander->phy)
+ kfree(sas_expander->phy);
+ if (sas_expander)
+ kfree(sas_expander);
+ return rc;
+}
+
+/**
+ * mpi3mr_expander_node_remove - recursive removal of expander.
+ * @mrioc: Adapter instance reference
+ * @sas_expander: Expander device object
+ *
+ * Removes expander object and freeing associated memory from
+ * the sas_expander_list and removes the same from SAS TL, if
+ * one of the attached device is an expander then it recursively
+ * removes the expander device too.
+ *
+ * Return nothing.
+ */ +static void mpi3mr_expander_node_remove(struct mpi3mr_ioc *mrioc, + struct mpi3mr_sas_node *sas_expander) +{ + struct mpi3mr_sas_port *mr_sas_port, *next; + unsigned long flags; + u8 port_id; + + /* remove sibling ports attached to this expander */ + list_for_each_entry_safe(mr_sas_port, next, + &sas_expander->sas_port_list, port_list) { + if (mrioc->reset_in_progress) + return; + if (mr_sas_port->remote_identify.device_type == + SAS_END_DEVICE) + mpi3mr_remove_device_by_sas_address(mrioc, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port); + else if (mr_sas_port->remote_identify.device_type == + SAS_EDGE_EXPANDER_DEVICE || + mr_sas_port->remote_identify.device_type == + SAS_FANOUT_EXPANDER_DEVICE) + mpi3mr_expander_remove(mrioc, + mr_sas_port->remote_identify.sas_address, + mr_sas_port->hba_port); + } + port_id = sas_expander->hba_port->port_id; + mpi3mr_sas_port_remove(mrioc, sas_expander->sas_address, + sas_expander->sas_address_parent, sas_expander->hba_port); + + ioc_info(mrioc, "expander_remove: handle(0x%04x), sas_addr(0x%016llx)," + "port:%d\n", sas_expander->handle, + (unsigned long long)sas_expander->sas_address, port_id); + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + list_del(&sas_expander->list); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + + kfree(sas_expander->phy); + kfree(sas_expander); +} + + +/** + * mpi3mr_expander_remove - Remove expander object + * @mrioc: Adapter instance reference + * @sas_address: Remove expander sas_address + * @hba_port: HBA port reference + * + * This funcion remove expander object, stored in + * mrioc->sas_expander_list and removes it from the SAS TL by + * calling mpi3mr_expander_node_remove(). + * + * Return: None + */ +void mpi3mr_expander_remove(struct mpi3mr_ioc *mrioc, u64 sas_address, + struct mpi3mr_hba_port *hba_port) +{ + struct mpi3mr_sas_node *sas_expander; + unsigned long flags; + + if (mrioc->reset_in_progress) + return; + + if (!hba_port) + return; + + spin_lock_irqsave(&mrioc->sas_node_lock, flags); + sas_expander = mpi3mr_expander_find_by_sas_address(mrioc, sas_address, + hba_port); + spin_unlock_irqrestore(&mrioc->sas_node_lock, flags); + if (sas_expander) + mpi3mr_expander_node_remove(mrioc, sas_expander); + +} + +/** + * mpi3mr_sas_host_refresh - refreshing sas host object contents + * @mrioc: Adapter instance reference + * + * This function refreshes the controllers phy information and + * updates the SAS transport layer with updated information, + * this is excecuted for eeach device addition or device info + * change events + * + * Return: None. 
+ */ +void mpi3mr_sas_host_refresh(struct mpi3mr_ioc *mrioc) +{ + int i; + u8 link_rate; + u16 sz, port_id, attached_handle; + struct mpi3mr_hba_port *hba_port; + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + + dprint_transport_info(mrioc, + "updating handles for sas_host(0x%016llx)\n", + (unsigned long long)mrioc->sas_hba.sas_address); + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (mrioc->sas_hba.num_phys * + sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + mrioc->sas_hba.handle = 0; + for (i = 0; i < mrioc->sas_hba.num_phys; i++) { + if (sas_io_unit_pg0->phy_data[i].phy_flags & + (MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY | + MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY)) + continue; + link_rate = + sas_io_unit_pg0->phy_data[i].negotiated_link_rate >> 4; + if (!mrioc->sas_hba.handle) + mrioc->sas_hba.handle = le16_to_cpu(sas_io_unit_pg0-> + phy_data[i].controller_dev_handle); + port_id = sas_io_unit_pg0->phy_data[i].io_unit_port; + if (!(mpi3mr_get_hba_port_by_id(mrioc, port_id, 0))) { + hba_port = kzalloc(sizeof(struct mpi3mr_hba_port), + GFP_KERNEL); + if (!hba_port) + goto out; + hba_port->port_id = port_id; + ioc_info(mrioc, "hba_port entry: %p," + " port: %d is added to hba_port list\n", + hba_port, hba_port->port_id); + if (mrioc->reset_in_progress) + hba_port->flags = MPI3MR_HBA_PORT_FLAG_NEW; + list_add_tail(&hba_port->list, + &mrioc->hba_port_table_list); + } + + mrioc->sas_hba.phy[i].handle = mrioc->sas_hba.handle; + attached_handle = le16_to_cpu(sas_io_unit_pg0->phy_data[i]. + attached_dev_handle); + if (attached_handle && link_rate < MPI3_SAS_NEG_LINK_RATE_1_5) + link_rate = MPI3_SAS_NEG_LINK_RATE_1_5; + mrioc->sas_hba.phy[i].hba_port = + mpi3mr_get_hba_port_by_id(mrioc,port_id, 0); + mpi3mr_update_links(mrioc, mrioc->sas_hba.sas_address, + attached_handle, i, link_rate, + mrioc->sas_hba.phy[i].hba_port); + } + out: + kfree(sas_io_unit_pg0); +} + +/** + * mpi3mr_sas_host_add - create sas host object + * @mrioc: Adapter instance reference + * + * This function creates the controllers phy information and + * updates the SAS transport layer with updated information, + * this is excecuted for first device addition or device info + * change event. + * + * Return: None. 
+ */ +void mpi3mr_sas_host_add(struct mpi3mr_ioc *mrioc) +{ + int i; + u16 sz, num_phys = 1, port_id, ioc_status; + struct mpi3mr_hba_port *hba_port; + struct mpi3_sas_io_unit_page0 *sas_io_unit_pg0 = NULL; + struct mpi3_sas_phy_page0 phy_pg0; + struct mpi3_device_page0 dev_pg0; + struct mpi3_enclosure_page0 encl_pg0; + struct mpi3_device0_sas_sata_format *sasinf; + + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (num_phys * sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + num_phys = sas_io_unit_pg0->num_phys; + kfree(sas_io_unit_pg0); + + mrioc->sas_hba.host_node = 1; + INIT_LIST_HEAD(&mrioc->sas_hba.sas_port_list); + mrioc->sas_hba.parent_dev = &mrioc->shost->shost_gendev; + mrioc->sas_hba.phy = kcalloc(num_phys, + sizeof(struct mpi3mr_sas_phy), GFP_KERNEL); + if (!mrioc->sas_hba.phy) + return; + + mrioc->sas_hba.num_phys = num_phys; + + sz = offsetof(struct mpi3_sas_io_unit_page0, phy_data) + + (num_phys * sizeof(struct mpi3_sas_io_unit0_phy_data)); + sas_io_unit_pg0 = kzalloc(sz, GFP_KERNEL); + if (!sas_io_unit_pg0) + return; + + if (mpi3mr_cfg_get_sas_io_unit_pg0(mrioc, sas_io_unit_pg0, sz)) { + ioc_err(mrioc,"failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + mrioc->sas_hba.handle = 0; + for (i = 0; i < mrioc->sas_hba.num_phys; i++) { + if (sas_io_unit_pg0->phy_data[i].phy_flags & + (MPI3_SASIOUNIT0_PHYFLAGS_HOST_PHY | + MPI3_SASIOUNIT0_PHYFLAGS_VIRTUAL_PHY)) + continue; + if (mpi3mr_cfg_get_sas_phy_pg0(mrioc, &ioc_status, &phy_pg0, + sizeof(struct mpi3_sas_phy_page0), + MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, i)) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out; + } + + if (!mrioc->sas_hba.handle) + mrioc->sas_hba.handle = le16_to_cpu(sas_io_unit_pg0-> + phy_data[i].controller_dev_handle); + port_id = sas_io_unit_pg0->phy_data[i].io_unit_port; + + if (!(mpi3mr_get_hba_port_by_id(mrioc, port_id, 0))) { + hba_port = kzalloc(sizeof(struct mpi3mr_hba_port), + GFP_KERNEL); + if (!hba_port) + goto out; + hba_port->port_id = port_id; + ioc_info(mrioc, "hba_port entry: %p," + " port: %d is added to hba_port list\n", + hba_port, hba_port->port_id); + list_add_tail(&hba_port->list, + &mrioc->hba_port_table_list); + } + + mrioc->sas_hba.phy[i].handle = mrioc->sas_hba.handle; + mrioc->sas_hba.phy[i].phy_id = i; + mrioc->sas_hba.phy[i].hba_port = + mpi3mr_get_hba_port_by_id(mrioc, port_id, 0); + mpi3mr_add_host_phy(mrioc, &mrioc->sas_hba.phy[i], + phy_pg0, mrioc->sas_hba.parent_dev); + } + if ((mpi3mr_cfg_get_dev_pg0(mrioc, &ioc_status, &dev_pg0, + sizeof(dev_pg0), MPI3_DEVICE_PGAD_FORM_HANDLE, + mrioc->sas_hba.handle))) { + ioc_err(mrioc, "%s: device page0 read failed\n", __func__); + goto out; + } + if (ioc_status != MPI3_IOCSTATUS_SUCCESS) { + ioc_err(mrioc, "device page read failed for handle(0x%04x), with ioc_status(0x%04x) failure at %s:%d/%s()!\n", + mrioc->sas_hba.handle, ioc_status, __FILE__, __LINE__, + __func__); + goto out; + } + mrioc->sas_hba.enclosure_handle = + le16_to_cpu(dev_pg0.enclosure_handle); + sasinf = &dev_pg0.device_specific.sas_sata_format; + mrioc->sas_hba.sas_address = + le64_to_cpu(sasinf->sas_address); + 
ioc_info(mrioc, "host_add: handle(0x%04x), "
+ "sas_addr(0x%016llx), phys(%d)\n",
+ mrioc->sas_hba.handle,
+ (unsigned long long) mrioc->sas_hba.sas_address,
+ mrioc->sas_hba.num_phys);
+
+ if (mrioc->sas_hba.enclosure_handle) {
+ if (!(mpi3mr_cfg_get_enclosure_pg0(mrioc, &ioc_status,
+ &encl_pg0, sizeof(dev_pg0),
+ MPI3_ENCLOS_PGAD_FORM_HANDLE,
+ mrioc->sas_hba.enclosure_handle)) &&
+ (ioc_status == MPI3_IOCSTATUS_SUCCESS))
+ mrioc->sas_hba.enclosure_logical_id =
+ le64_to_cpu(encl_pg0.enclosure_logical_id);
+ }
+
+out:
+ kfree(sas_io_unit_pg0);
+}
+
+/**
+ * mpi3mr_get_sas_negotiated_logical_linkrate - get linkrate
+ * @mrioc: Adapter instance reference
+ * @tgtdev: Target device
+ *
+ * This function identifies whether the target device is
+ * attached directly or through expander and issues sas phy
+ * page0 or expander phy page1 and gets the link rate, if there
+ * is any faiulre in reading the pages then this returns link
+ * rate of 1.5.
+ *
+ * Return: logical link rate.
+ */
+static u8 mpi3mr_get_sas_negotiated_logical_linkrate(struct mpi3mr_ioc *mrioc,
+ struct mpi3mr_tgt_dev *tgtdev)
+{
+ u8 link_rate = MPI3_SAS_NEG_LINK_RATE_1_5, phy_number;
+ struct mpi3_sas_expander_page1 expander_pg1;
+ struct mpi3_sas_phy_page0 phy_pg0;
+ u32 phynum_handle;
+ u16 ioc_status;
+
+ phy_number = tgtdev->dev_spec.sas_sata_inf.phy_id;
+ if (!(tgtdev->devpg0_flag & MPI3_DEVICE0_FLAGS_ATT_METHOD_DIR_ATTACHED))
+ {
+ phynum_handle = ((phy_number << MPI3_SAS_EXPAND_PGAD_PHYNUM_SHIFT)
+ | tgtdev->parent_handle);
+ if (mpi3mr_cfg_get_sas_exp_pg1(mrioc, &ioc_status,
+ &expander_pg1, sizeof(expander_pg1),
+ MPI3_SAS_EXPAND_PGAD_FORM_HANDLE_PHY_NUM,
+ phynum_handle)) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ goto out;
+ }
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ goto out;
+ }
+ link_rate = (expander_pg1.negotiated_link_rate &
+ MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+ MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT;
+ goto out;
+ }
+ if (mpi3mr_cfg_get_sas_phy_pg0(mrioc, &ioc_status, &phy_pg0,
+ sizeof(struct mpi3_sas_phy_page0),
+ MPI3_SAS_PHY_PGAD_FORM_PHY_NUMBER, phy_number)) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ goto out;
+ }
+ if (ioc_status != MPI3_IOCSTATUS_SUCCESS) {
+ ioc_err(mrioc, "failure at %s:%d/%s()!\n",
+ __FILE__, __LINE__, __func__);
+ goto out;
+ }
+ link_rate = (phy_pg0.negotiated_link_rate &
+ MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >>
+ MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT;
+out:
+ return link_rate;
+}
+
+/**
+ * mpi3mr_report_tgtdev_to_sas_transport - expose dev to SAS TL
+ * @mrioc: Adapter instance reference
+ * @tgtdev: Target device
+ *
+ * This function function exposes the target device after
+ * preparing host_phy, setting up link rate etc.
+ *
+ * Return: 0 on success, non-zero for failure.
+ */ +int mpi3mr_report_tgtdev_to_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + int retval = 0; + u8 link_rate, parent_phy_number; + u64 sas_address_parent, sas_address; + struct mpi3mr_hba_port *hba_port; + u8 port_id; + + if ((tgtdev->dev_type != MPI3_DEVICE_DEVFORM_SAS_SATA) || + !mrioc->sas_transport_enabled) + return -1; + + sas_address = tgtdev->dev_spec.sas_sata_inf.sas_address; + if (!mrioc->sas_hba.num_phys) + mpi3mr_sas_host_add(mrioc); + else + mpi3mr_sas_host_refresh(mrioc); + + if (mpi3mr_get_sas_address(mrioc, tgtdev->parent_handle, + &sas_address_parent) != 0) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + tgtdev->dev_spec.sas_sata_inf.sas_address_parent = sas_address_parent; + + parent_phy_number = tgtdev->dev_spec.sas_sata_inf.phy_id; + port_id = tgtdev->io_unit_port; + + hba_port = mpi3mr_get_hba_port_by_id(mrioc, port_id, 0); + if (!hba_port) { + ioc_err(mrioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return -1; + } + tgtdev->dev_spec.sas_sata_inf.hba_port = hba_port; + + link_rate = mpi3mr_get_sas_negotiated_logical_linkrate(mrioc, tgtdev); + + mpi3mr_update_links(mrioc, sas_address_parent, tgtdev->dev_handle, + parent_phy_number, link_rate, hba_port); + + tgtdev->host_exposed = 1; + if (!mpi3mr_sas_port_add(mrioc, tgtdev->dev_handle, + sas_address_parent, hba_port)) { + tgtdev->host_exposed = 0; + retval = -1; + } else if ((!tgtdev->starget)) { + if (!mrioc->is_driver_loading) + mpi3mr_sas_port_remove(mrioc, sas_address, + sas_address_parent, hba_port); + tgtdev->host_exposed = 0; + retval = -1; + } + return retval; +} + +/** + * mpi3mr_remove_tgtdev_from_sas_transport - remove from SAS TL + * @mrioc: Adapter instance reference + * @tgtdev: Target device + * + * This function function removes the target device + * + * Return: None. + */ +void mpi3mr_remove_tgtdev_from_sas_transport(struct mpi3mr_ioc *mrioc, + struct mpi3mr_tgt_dev *tgtdev) +{ + u64 sas_address_parent, sas_address; + struct mpi3mr_hba_port *hba_port; + + if ((tgtdev->dev_type != MPI3_DEVICE_DEVFORM_SAS_SATA) || + !mrioc->sas_transport_enabled) + return; + + hba_port = tgtdev->dev_spec.sas_sata_inf.hba_port; + sas_address = tgtdev->dev_spec.sas_sata_inf.sas_address; + sas_address_parent = tgtdev->dev_spec.sas_sata_inf.sas_address_parent; + mpi3mr_sas_port_remove(mrioc, sas_address, sas_address_parent, + hba_port); + tgtdev->host_exposed = 0; +} + + From 61a288408ef4ba9841469298e95b59e80aac0f74 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:52 +1100 Subject: [PATCH 592/737] x86/smp: Add a per-cpu view of SMT state A new field smt_active in cpuinfo_x86 identifies if the current core/cpu is in SMT mode or not. This is helpful when the system has some of its cores with threads offlined and can be used for cases where action is taken based on the state of SMT. The upcoming support for paranoid L1D flush will make use of this information. 
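
As an illustrative sketch only (not part of this patch): a consumer can
key decisions off the new field via the per-cpu accessor that the later
L1D flush patches in this series use. do_paranoid_work() below is a
hypothetical placeholder, not an existing kernel function:

	/* Only act when this CPU has no active SMT sibling */
	if (!this_cpu_read(cpu_info.smt_active))
		do_paranoid_work();	/* hypothetical helper */
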
Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-2-sblbir@amazon.com (cherry picked from commit c52787b590634646d4da3d8f23c4532ba050d40d) Signed-off-by: Luiz Capitulino --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/smpboot.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2dd9b661a5fd5..5ecf1d6a20dca 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -136,6 +136,8 @@ struct cpuinfo_x86 { u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; + /* Is SMT active on this core? */ + bool smt_active; u32 microcode; /* Address space bits used by the cache internally */ u8 x86_cache_bits; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d2403da17842b..9aece7f7cd081 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -616,6 +616,9 @@ void set_cpu_sibling_map(int cpu) if (threads > __max_smt_threads) __max_smt_threads = threads; + for_each_cpu(i, topology_sibling_cpumask(cpu)) + cpu_data(i).smt_active = threads > 1; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -1560,8 +1563,13 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, topology_die_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) + cpu_data(sibling).smt_active = false; + } + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); From 26bffcddb1170128b185a5056ff00a5f324ec79d Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:53 +1100 Subject: [PATCH 593/737] x86/mm: Refactor cond_ibpb() to support other use cases cond_ibpb() has the necessary bits required to track the previous mm in switch_mm_irqs_off(). This can be reused for other use cases like L1D flushing on context switch. [ Luiz: small conflict resolved in arch/x86/mm/tlb.c because mm_mangle_tif_spec_ib() inline keyword was removed on upstream ] Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-3-sblbir@amazon.com (cherry picked from commit 371b09c6fdc436f2c7bb67fc90df5eec8ce90f06) Signed-off-by: Luiz Capitulino --- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 53 ++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8c87a2e0b660c..a927d40664df7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -83,7 +83,7 @@ struct tlb_state { /* Last user mm for optimizing IBPB */ union { struct mm_struct *last_user_mm; - unsigned long last_user_mm_ibpb; + unsigned long last_user_mm_spec; }; u16 loaded_mm_asid; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 569ac1d57f55a..67fe38b33a793 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -42,10 +42,14 @@ */ /* - * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is - * stored in cpu_tlb_state.last_user_mm_ibpb. 
+ * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB) + +/* Bits to set when tlbstate and flush is (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB /* * The x86 feature is called PCID (Process Context IDentifier). It is similar @@ -316,20 +320,29 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; - unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; - return (unsigned long)next->mm | ibpb; + return (unsigned long)next->mm | spec_bits; } -static void cond_ibpb(struct task_struct *next) +static void cond_mitigation(struct task_struct *next) { + unsigned long prev_mm, next_mm; + if (!next || !next->mm) return; + next_mm = mm_mangle_tif_spec_bits(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); + /* + * Avoid user/user BTB poisoning by flushing the branch predictor + * when switching between processes. This stops one process from + * doing Spectre-v2 attacks on another. + * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id @@ -339,8 +352,6 @@ static void cond_ibpb(struct task_struct *next) * exposed data is not really interesting. */ if (static_branch_likely(&switch_mm_cond_ibpb)) { - unsigned long prev_mm, next_mm; - /* * This is a bit more complex than the always mode because * it has to handle two cases: @@ -370,20 +381,14 @@ static void cond_ibpb(struct task_struct *next) * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in - * cpu_tlbstate.last_user_mm_ibpb for comparison. - */ - next_mm = mm_mangle_tif_spec_ib(next); - prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); - - /* + * cpu_tlbstate.last_user_mm_spec for comparison. + * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set. */ if (next_mm != prev_mm && (next_mm | prev_mm) & LAST_USER_MM_IBPB) indirect_branch_prediction_barrier(); - - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); } if (static_branch_unlikely(&switch_mm_always_ibpb)) { @@ -392,11 +397,12 @@ static void cond_ibpb(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != + (unsigned long)next->mm) indirect_branch_prediction_barrier(); - this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); - } } + + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); } #ifdef CONFIG_PERF_EVENTS @@ -524,11 +530,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, need_flush = true; } else { /* - * Avoid user/user BTB poisoning by flushing the branch - * predictor when switching between processes. This stops - * one process from doing Spectre-v2 attacks on another. + * Apply process to process speculation vulnerability + * mitigations if applicable. 
*/ - cond_ibpb(tsk); + cond_mitigation(tsk); /* * Stop remote flushes for the previous mm. @@ -636,7 +641,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); From 21eb5cd0403410dc9aa3e5f055406a4bf507cdcf Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 26 Apr 2021 21:59:11 +0200 Subject: [PATCH 594/737] sched: Add task_work callback for paranoid L1D flush The upcoming paranoid L1D flush infrastructure allows to conditionally (opt-in) flush L1D in switch_mm() as a defense against potential new side channels or for paranoia reasons. As the flush makes only sense when a task runs on a non-SMT enabled core, because SMT siblings share L1, the switch_mm() logic will kill a task which is flagged for L1D flush when it is running on a SMT thread. Add a taskwork callback so switch_mm() can queue a SIG_KILL command which is invoked when the task tries to return to user space. [ Luiz: Minor conflict in arch/Kconfig because 5.10 doesn't have ARCH_HAS_ELFCORE_COMPAT ] Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-1-sblbir@amazon.com (cherry picked from commit 58e106e725eed59896b9141a1c9a917d2f67962a) Signed-off-by: Luiz Capitulino --- arch/Kconfig | 3 +++ include/linux/sched.h | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 240277d5626c8..e5549cc65fbc0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1065,6 +1065,9 @@ config ARCH_SPLIT_ARG64 If a 32-bit architecture requires 64-bit arguments to be split into pairs of 32-bit arguments, select this option. +config ARCH_HAS_PARANOID_L1D_FLUSH + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/linux/sched.h b/include/linux/sched.h index 5da4b3c89f636..2f2bcaa7e7d78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1357,6 +1357,16 @@ struct task_struct { int mce_count; #endif +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH + /* + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores + */ + struct callback_head l1d_flush_kill; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. From 72d9cf077068bc830a90800a90055df8663ca499 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 26 Apr 2021 22:09:43 +0200 Subject: [PATCH 595/737] x86/process: Make room for TIF_SPEC_L1D_FLUSH The upcoming support for paranoid L1D flush in switch_mm() requires that TIF_SPEC_IB and the new TIF_SPEC_L1D_FLUSH are two consecutive bits in thread_info::flags. Move TIF_SPEC_FORCE_UPDATE to a spare bit to make room for the new one. 
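
For reference, a short sketch of why the two bits must be adjacent: the
switch_mm() code added later in this series derives the per-mm
speculation bits by shifting the thread_info flags down by TIF_SPEC_IB
and masking, so TIF_SPEC_IB must land in bit 0 and the upcoming
TIF_SPEC_L1D_FLUSH in bit 1 of the mangled mm pointer:

	unsigned long spec_bits = (task_thread_info(next)->flags >>
				   TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

	/* Enforced by a later patch in this series */
	BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1);
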
Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-1-sblbir@amazon.com (cherry picked from commit 8aacd1eab53ec853c2d29cdc9b64e9dc87d2a519) Signed-off-by: Luiz Capitulino --- arch/x86/include/asm/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 012c8ee93b67f..1f7144c1af23d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -84,7 +84,6 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -97,6 +96,7 @@ struct thread_info { #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ @@ -114,7 +114,6 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) @@ -126,6 +125,7 @@ struct thread_info { #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) From 2a67b566931d02a93331181e5dc86c5345c645dd Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 26 Apr 2021 21:42:30 +0200 Subject: [PATCH 596/737] x86/mm: Prepare for opt-in based L1D flush in switch_mm() The goal of this is to allow tasks that want to protect sensitive information, against e.g. the recently found snoop assisted data sampling vulnerabilites, to flush their L1D on being switched out. This protects their data from being snooped or leaked via side channels after the task has context switched out. This could also be used to wipe L1D when an untrusted task is switched in, but that's not a really well defined scenario while the opt-in variant is clearly defined. The mechanism is default disabled and can be enabled on the kernel command line. Prepare for the actual prctl based opt-in: 1) Provide the necessary setup functionality similar to the other mitigations and enable the static branch when the command line option is set and the CPU provides support for hardware assisted L1D flushing. Software based L1D flush is not supported because it's CPU model specific and not really well defined. 
This does not come with a sysfs file like the other mitigations because it is not bound to any specific vulnerability. Support has to be queried via the prctl(2) interface. 2) Add TIF_SPEC_L1D_FLUSH next to L1D_SPEC_IB so the two bits can be mangled into the mm pointer in one go which allows to reuse the existing mechanism in switch_mm() for the conditional IBPB speculation barrier efficiently. 3) Add the L1D flush specific functionality which flushes L1D when the outgoing task opted in. Also check whether the incoming task has requested L1D flush and if so validate that it is not accidentaly running on an SMT sibling as this makes the whole excercise moot because SMT siblings share L1D which opens tons of other attack vectors. If that happens schedule task work which signals the incoming task on return to user/guest with SIGBUS as this is part of the paranoid L1D flush contract. [ Luiz: resolved small conflicts in arch/x86/include/asm/nospec-branch.h and arch/x86/kernel/cpu/bugs.c. Both were due to 7462cd2443bc from linux-5.10.y stable tree ] [ Hailmo: resolved conflicts when rebasing onto 5.10.190 - this collided with the introduction of GDS (288a2f6) and SRSO (3f9b710) ] Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-1-sblbir@amazon.com (cherry picked from commit b5f06f64e269f9820cd5ad9e9a98afa6c8914b7a) Signed-off-by: Luiz Capitulino --- arch/x86/Kconfig | 1 + arch/x86/include/asm/nospec-branch.h | 2 + arch/x86/include/asm/thread_info.h | 2 + arch/x86/kernel/cpu/bugs.c | 37 ++++++++++++++++++ arch/x86/mm/tlb.c | 58 +++++++++++++++++++++++++++- 5 files changed, 98 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b451b3ff9c351..76424a04383d7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -104,6 +104,7 @@ config X86 select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 + select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ec556ae20545c..6a1011989e471 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -351,6 +351,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); #include diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1f7144c1af23d..753053cd50380 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -84,6 +84,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -114,6 +115,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) +#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) #define _TIF_USER_RETURN_NOTIFY (1 << 
TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 035b4ba4e5feb..3f0e89d20d533 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -45,6 +45,7 @@ static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init l1d_flush_select_mitigation(void); static void __init gds_select_mitigation(void); static void __init srso_select_mitigation(void); @@ -114,6 +115,13 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); +/* + * Controls whether l1d flush based mitigations are enabled, + * based on hw features and admin setting via boot parameter + * defaults to false + */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + /* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); EXPORT_SYMBOL_GPL(mmio_stale_data_clear); @@ -155,6 +163,7 @@ void __init cpu_select_mitigations(void) l1tf_select_mitigation(); md_clear_select_mitigation(); srbds_select_mitigation(); + l1d_flush_select_mitigation(); gds_select_mitigation(); srso_select_mitigation(); } @@ -614,6 +623,34 @@ static int __init srbds_parse_cmdline(char *str) } early_param("srbds", srbds_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "L1D Flush : " fmt + +enum l1d_flush_mitigations { + L1D_FLUSH_OFF = 0, + L1D_FLUSH_ON, +}; + +static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; + +static void __init l1d_flush_select_mitigation(void) +{ + if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + return; + + static_branch_enable(&switch_mm_cond_l1d_flush); + pr_info("Conditional flush on switch_mm() enabled\n"); +} + +static int __init l1d_flush_parse_cmdline(char *str) +{ + if (!strcmp(str, "on")) + l1d_flush_mitigation = L1D_FLUSH_ON; + + return 0; +} +early_param("l1d_flush", l1d_flush_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "GDS: " fmt diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 67fe38b33a793..e8489bee0a6bc 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,11 +8,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include "mm_internal.h" @@ -42,11 +44,12 @@ */ /* - * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL -#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB) +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) /* Bits to set when tlbstate and flush is (re)initialized */ #define LAST_USER_MM_INIT LAST_USER_MM_IBPB @@ -320,11 +323,52 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested. 
+ */ +static void l1d_flush_force_sigbus(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, + struct task_struct *next) +{ + /* Flush L1D if the outgoing task requests it */ + if (prev_mm & LAST_USER_MM_L1D_FLUSH) + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + + /* Check whether the incoming task opted in for L1D flush */ + if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) + return; + + /* + * Validate that it is not running on an SMT sibling as this would + * make the excercise pointless because the siblings share L1D. If + * it runs on a SMT sibling, notify it with SIGBUS on return to + * user/guest + */ + if (this_cpu_read(cpu_info.smt_active)) { + clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); + next->l1d_flush_kill.func = l1d_flush_force_sigbus; + task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); + } +} + static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; + /* + * Ensure that the bit shift above works as expected and the two flags + * end up in bit 0 and 1. + */ + BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); + return (unsigned long)next->mm | spec_bits; } @@ -402,6 +446,16 @@ static void cond_mitigation(struct task_struct *next) indirect_branch_prediction_barrier(); } + if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { + /* + * Flush L1D when the outgoing task requested it and/or + * check whether the incoming task requested L1D flushing + * and ended up on an SMT sibling. + */ + if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) + l1d_flush_evaluate(prev_mm, next_mm, next); + } + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); } From 2965bb9e7e053382de6075d448375d423990e956 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:55 +1100 Subject: [PATCH 597/737] x86, prctl: Hook L1D flushing in via prctl Use the existing PR_GET/SET_SPECULATION_CTRL API to expose the L1D flush capability. For L1D flushing PR_SPEC_FORCE_DISABLE and PR_SPEC_DISABLE_NOEXEC are not supported. Enabling L1D flush does not check if the task is running on an SMT enabled core, rather a check is done at runtime (at the time of flush), if the task runs on a SMT sibling then the task is sent a SIGBUS which is executed before the task returns to user space or to a guest. This is better than the other alternatives of: a. Ensuring strict affinity of the task (hard to enforce without further changes in the scheduler) b. Silently skipping flush for tasks that move to SMT enabled cores. Hook up the core prctl and implement the x86 specific parts which in turn makes it functional. 
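
A minimal user-space usage sketch, matching the invocations documented
in spec_ctrl.rst later in this series. PR_SPEC_L1D_FLUSH is defined as
2 by this series; the fallback define below is only needed with older
userspace headers:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SPEC_L1D_FLUSH
	#define PR_SPEC_L1D_FLUSH 2
	#endif

	int main(void)
	{
		/* Opt this task in to L1D flush on context switch out.
		 * Fails (EPERM) unless the kernel was booted with
		 * l1d_flush=on on hardware with X86_FEATURE_FLUSH_L1D.
		 */
		if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH,
			  PR_SPEC_ENABLE, 0, 0))
			perror("prctl");

		/* Returns PR_SPEC_PRCTL | PR_SPEC_ENABLE when active */
		printf("state: %d\n", prctl(PR_GET_SPECULATION_CTRL,
					    PR_SPEC_L1D_FLUSH, 0, 0, 0));
		return 0;
	}
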
Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-5-sblbir@amazon.com (cherry picked from commit e893bb1bb4d2eb635eba61e5d9c5135d96855773) Signed-off-by: Luiz Capitulino --- arch/x86/kernel/cpu/bugs.c | 33 +++++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 1 + 2 files changed, 34 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3f0e89d20d533..3a53339624152 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1933,6 +1933,24 @@ static void task_update_spec_tif(struct task_struct *tsk) speculation_ctrl_update_current(); } +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && @@ -2044,6 +2062,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } @@ -2060,6 +2080,17 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) } #endif +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { @@ -2110,6 +2141,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); default: return -ENODEV; } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a4..943e0f34565c1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -213,6 +213,7 @@ struct prctl_mm_map { /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 # define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) From 65cf13eb1e3d3da75e27d0ebee70fbf8ea51307f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:56 +1100 Subject: [PATCH 598/737] Documentation: Add L1D flushing Documentation Add documentation of l1d flushing, explain the need for the feature and how it can be used. 
[ Luiz: resolved minor conflict in Documentation/admin-guide/hw-vuln/index.rst ] Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-6-sblbir@amazon.com (cherry picked from commit b7fe54f6c2d437082dcbecfbd832f38edd9caaf4) Signed-off-by: Luiz Capitulino --- Documentation/admin-guide/hw-vuln/index.rst | 1 + .../admin-guide/hw-vuln/l1d_flush.rst | 69 +++++++++++++++++++ .../admin-guide/kernel-parameters.txt | 17 +++++ Documentation/userspace-api/spec_ctrl.rst | 8 +++ 4 files changed, 95 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/l1d_flush.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 84742be223ff8..7c3cf24cf6728 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -15,6 +15,7 @@ are configurable at compile, boot or run time. tsx_async_abort multihit.rst special-register-buffer-data-sampling.rst + l1d_flush.rst processor_mmio_stale_data.rst gather_data_sampling.rst srso diff --git a/Documentation/admin-guide/hw-vuln/l1d_flush.rst b/Documentation/admin-guide/hw-vuln/l1d_flush.rst new file mode 100644 index 0000000000000..210020bc3f568 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst @@ -0,0 +1,69 @@ +L1D Flushing +============ + +With an increasing number of vulnerabilities being reported around data +leaks from the Level 1 Data cache (L1D) the kernel provides an opt-in +mechanism to flush the L1D cache on context switch. + +This mechanism can be used to address e.g. CVE-2020-0550. For applications +the mechanism keeps them safe from vulnerabilities, related to leaks +(snooping of) from the L1D cache. + + +Related CVEs +------------ +The following CVEs can be addressed by this +mechanism + + ============= ======================== ================== + CVE-2020-0550 Improper Data Forwarding OS related aspects + ============= ======================== ================== + +Usage Guidelines +---------------- + +Please see document: :ref:`Documentation/userspace-api/spec_ctrl.rst +` for details. + +**NOTE**: The feature is disabled by default, applications need to +specifically opt into the feature to enable it. + +Mitigation +---------- + +When PR_SET_L1D_FLUSH is enabled for a task a flush of the L1D cache is +performed when the task is scheduled out and the incoming task belongs to a +different process and therefore to a different address space. + +If the underlying CPU supports L1D flushing in hardware, the hardware +mechanism is used, software fallback for the mitigation, is not supported. + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows to control the L1D flush mitigations at boot +time with the option "l1d_flush=". The valid arguments for this option are: + + ============ ============================================================= + on Enables the prctl interface, applications trying to use + the prctl() will fail with an error if l1d_flush is not + enabled + ============ ============================================================= + +By default the mechanism is disabled. + +Limitations +----------- + +The mechanism does not mitigate L1D data leaks between tasks belonging to +different processes which are concurrently executing on sibling threads of +a physical CPU core when SMT is enabled on the system. 
+ +This can be addressed by controlled placement of processes on physical CPU +cores or by disabling SMT. See the relevant chapter in the L1TF mitigation +document: :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst `. + +**NOTE** : The opt-in of a task for L1D flushing works only when the task's +affinity is limited to cores running in non-SMT mode. If a task which +requested L1D flushing is scheduled on a SMT-enabled core the kernel sends +a SIGBUS to the task. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c94f7228032a7..3bc110326a647 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2381,6 +2381,23 @@ feature (tagged TLBs) on capable Intel chips. Default is 1 (enabled) + l1d_flush= [X86,INTEL] + Control mitigation for L1D based snooping vulnerability. + + Certain CPUs are vulnerable to an exploit against CPU + internal buffers which can forward information to a + disclosure gadget under certain conditions. + + In vulnerable processors, the speculatively + forwarded data can be used in a cache side channel + attack, to access data to which the attacker does + not have direct access. + + This parameter controls the mitigation. The + options are: + + on - enable the interface for the mitigation + l1tf= [X86] Control mitigation of the L1TF vulnerability on affected CPUs diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 7ddd8f667459b..5e8ed9eef9aa8 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -106,3 +106,11 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); + +- PR_SPEC_L1D_FLUSH: Flush L1D Cache on context switch out of the task + (works only when tasks run on non SMT cores) + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_DISABLE, 0, 0); From 23290edddfd95811a055a87b019e19044456a5b2 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Tue, 10 Jan 2023 19:16:01 +0000 Subject: [PATCH 599/737] add L1D software flush interface This implementation is taken from the following functions from KVM: o arch/x86/kvm/vmx/vmx.c::vmx_setup_l1d_flush() o arch/x86/kvm/vmx/vmx.c::vmx_l1d_flush() As we did in the AL2 5.4 version, this version doesn't populate the TLB and only performs the cache filling part. As opposed to sharing the L1D flush interface with KVM like we did in the AL2 5.4 version, I chose to duplicate the code for the following reasons: 1. We don't touch KVM, which causes less code churn, less chances of regressions for other customers and less need for additional testing 2. The downstream-only part of this series becomes self-contained, making it easier to rebase and easier to drop if/when necessary The main disadvantages of duplicating the code are: 1. If L1D flushing and KVM are used at the same time, we'll consume 4 additional pages 2. We can miss fixups to the KVM implementation. 
I don't think this is a huge deal because the KVM code has been in use for years without additional fixups Signed-off-by: Luiz Capitulino --- arch/x86/include/asm/l1d_flush.h | 7 ++++ arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/l1d_flush.c | 61 ++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 arch/x86/include/asm/l1d_flush.h create mode 100644 arch/x86/kernel/l1d_flush.c diff --git a/arch/x86/include/asm/l1d_flush.h b/arch/x86/include/asm/l1d_flush.h new file mode 100644 index 0000000000000..fdb798e634614 --- /dev/null +++ b/arch/x86/include/asm/l1d_flush.h @@ -0,0 +1,7 @@ +#ifndef _L1D_FLUSH_H +#define _L1D_FLUSH_H + +void l1d_flush_init(void); +void l1d_flush_sw(void); + +#endif /* _L1D_FLUSH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c06f3a961d647..1419853536995 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -169,3 +169,5 @@ ifeq ($(CONFIG_X86_64),y) endif obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o + +obj-y += l1d_flush.o diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c new file mode 100644 index 0000000000000..f40efc5095d72 --- /dev/null +++ b/arch/x86/kernel/l1d_flush.c @@ -0,0 +1,61 @@ +/* + * This software-based L1D flush implementation is taken from the following + * functions from KVM: + * + * o arch/x86/kvm/vmx/vmx.c::vmx_setup_l1d_flush() + * o arch/x86/kvm/vmx/vmx.c::vmx_l1d_flush() + * + * As we did in the AL2 5.4 version this version doesn't populate the + * TLB and only performs the cache filling part. + */ +#include +#include + +#define L1D_CACHE_ORDER 4 + +static void *l1d_flush_pages; + +void __init l1d_flush_init(void) +{ + struct page *page; + int i; + + /* + * This allocation for l1d_flush_pages is not tied to a task's + * lifetime and so should not be charged to a memcg. + */ + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + BUG_ON(!page); + + l1d_flush_pages = page_address(page); + + /* + * The original implementation in vmx_l1d_flush() does this + * initialization to protect against KSM for nested Virt. + * Let's keep it just in case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } +} + +void l1d_flush_sw(void) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + BUG_ON(!l1d_flush_pages); + + asm volatile( + /* Fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (l1d_flush_pages), + [size] "r" (size) + : "eax", "ecx"); +} From 56ff809011eff5700bd2314c017bc6f651d8fcdb Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Tue, 10 Jan 2023 19:16:31 +0000 Subject: [PATCH 600/737] use L1D software flush interface Fall back to software flush if the CPU doesn't support L1D flush in hardware. 
Signed-off-by: Luiz Capitulino --- arch/x86/kernel/cpu/bugs.c | 6 +++++- arch/x86/mm/tlb.c | 13 ++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3a53339624152..c09192504eafe 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cpu.h" @@ -635,9 +636,12 @@ static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OF static void __init l1d_flush_select_mitigation(void) { - if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + if (!l1d_flush_mitigation) return; + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + l1d_flush_init(); + static_branch_enable(&switch_mm_cond_l1d_flush); pr_info("Conditional flush on switch_mm() enabled\n"); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e8489bee0a6bc..318f35ac99d84 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -334,12 +335,22 @@ static void l1d_flush_force_sigbus(struct callback_head *ch) force_sig(SIGBUS); } +static void l1d_do_flush(void) +{ + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + l1d_flush_sw(); +} + static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, struct task_struct *next) { /* Flush L1D if the outgoing task requests it */ if (prev_mm & LAST_USER_MM_L1D_FLUSH) - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + l1d_do_flush(); /* Check whether the incoming task opted in for L1D flush */ if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) From 904530312ae6ff1ef84917205828ca377510f9a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:32 +0000 Subject: [PATCH 601/737] mm/damon/core: split out DAMOS-charged region skip logic into a new function Patch series "mm/damon: cleanup and refactoring code", v2. This patchset cleans up and refactors a range of DAMON code including the core, DAMON sysfs interface, and DAMON modules, for better readability and convenient future feature implementations. In detail, this patchset splits unnecessarily long and complex functions in core into smaller functions (patches 1-4). Then, it cleans up the DAMON sysfs interface by using more type-safe code (patch 5) and removing unnecessary function parameters (patch 6). Further, it refactor the code by distributing the code into multiple files (patches 7-10). Last two patches (patches 11 and 12) deduplicates and remove unnecessary header inclusion in DAMON modules (reclaim and lru_sort). This patch (of 12): The DAMOS action applying function, 'damon_do_apply_schemes()', is quite long and not so simple. Split out the already quota-charged region skip code, which is not a small amount of simple code, into a new function with some comments for better readability. 
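As a toy illustration (userspace, made-up addresses, not kernel code), the split point computed by the new damos_skip_charged_region() helper below for a partially charged region works out as follows, assuming DAMON_MIN_REGION is 4096:

#include <stdio.h>

#define DAMON_MIN_REGION 4096UL
#define ALIGN_DOWN(x, a)  ((x) & ~((a) - 1))	/* a must be a power of two */

int main(void)
{
	unsigned long start = 0x10000, end = 0x30000;	/* hypothetical region */
	unsigned long charge_addr_from = 0x1c500;	/* charging stopped here in the last window */
	unsigned long sz_to_skip = ALIGN_DOWN(charge_addr_from - start, DAMON_MIN_REGION);

	if (!sz_to_skip)
		sz_to_skip = DAMON_MIN_REGION;	/* never split below the minimum region size */

	printf("skip  [%#lx, %#lx)  (charged in the previous window)\n",
	       start, start + sz_to_skip);
	printf("apply [%#lx, %#lx)  (the action is applied to this part)\n",
	       start + sz_to_skip, end);
	return 0;
}
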
Link: https://lkml.kernel.org/r/20221026225943.100429-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221026225943.100429-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 96 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 36d098d06c558..06b50ede9cc62 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -694,6 +694,67 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } +/* + * damos_skip_charged_region() - Check if the given region or starting part of + * it is already charged for the DAMOS quota. + * @t: The target of the region. + * @rp: The pointer to the region. + * @s: The scheme to be applied. + * + * If a quota of a scheme has exceeded in a quota charge window, the scheme's + * action would applied to only a part of the target access pattern fulfilling + * regions. To avoid applying the scheme action to only already applied + * regions, DAMON skips applying the scheme action to the regions that charged + * in the previous charge window. + * + * This function checks if a given region should be skipped or not for the + * reason. If only the starting part of the region has previously charged, + * this function splits the region into two so that the second one covers the + * area that not charged in the previous charge widnow and saves the second + * region in *rp and returns false, so that the caller can apply DAMON action + * to the second one. + * + * Return: true if the region should be entirely skipped, false otherwise. + */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -702,7 +763,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = damon_sz_region(r); + unsigned long sz; struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -713,41 +774,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - /* Skip previously charged regions */ - if (quota->charge_target_from) { - if (t != quota->charge_target_from) - continue; - if (r == damon_last_region(t)) { - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - continue; - } - if (quota->charge_addr_from && - r->ar.end <= quota->charge_addr_from) - continue; 
- - if (quota->charge_addr_from && r->ar.start < - quota->charge_addr_from) { - sz = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); - if (!sz) { - if (damon_sz_region(r) <= - DAMON_MIN_REGION) - continue; - sz = DAMON_MIN_REGION; - } - damon_split_region_at(t, r, sz); - r = damon_next_region(r); - sz = damon_sz_region(r); - } - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - } + if (damos_skip_charged_region(t, &r, s)) + continue; if (!damos_valid_target(c, t, r, s)) continue; /* Apply the scheme */ + sz = damon_sz_region(r); if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { From 4f22ce0e8c848c511e0d31b517ecfad686118175 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:33 +0000 Subject: [PATCH 602/737] mm/damon/core: split damos application logic into a new function The DAMOS action applying function, 'damon_do_apply_schemes()', is still long and not easy to read. Split out the code for applying a single action to a single region into a new function for better readability. Link: https://lkml.kernel.org/r/20221026225943.100429-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 73 ++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 06b50ede9cc62..c1a912bc46ae8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,44 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -763,9 +801,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz; - struct timespec64 begin, end; - unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -780,37 +815,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (!damos_valid_target(c, t, r, s)) continue; - /* Apply the scheme */ - sz = damon_sz_region(r); - if (c->ops.apply_scheme) { - if (quota->esz && - quota->charged_sz + sz > quota->esz) { - sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); - if (!sz) - goto update_stat; - damon_split_region_at(t, r, sz); - } - ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); - ktime_get_coarse_ts64(&end); - quota->total_charged_ns += 
timespec64_to_ns(&end) - - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { - quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; - } - } - if (s->action != DAMOS_STAT) - r->age = 0; - -update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_apply_scheme(c, t, r, s); } } From 146b56684d19500dc9ae074456331d186e6b10dd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:34 +0000 Subject: [PATCH 603/737] mm/damon/core: split out scheme stat update logic into a new function The function for applying a given DAMON scheme action to a given DAMON region, 'damos_apply_scheme()' is not quite short. Make it better to read by splitting out the stat update logic into a new function. Link: https://lkml.kernel.org/r/20221026225943.100429-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c1a912bc46ae8..3a810c6e26bc6 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,16 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -786,11 +796,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_update_stat(s, sz, sz_applied); } static void damon_do_apply_schemes(struct damon_ctx *c, From 01e5abc1aeeabfefc94717b19b3c9c1aeca42f6b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:35 +0000 Subject: [PATCH 604/737] mm/damon/core: split out scheme quota adjustment logic into a new function DAMOS quota adjustment logic in 'kdamond_apply_schemes()', has some amount of code, and the logic is not so straightforward. Split it out to a new function for better readability. 
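For a rough sense of what the split-out quota adjustment does, here is a userspace toy (made-up sizes and scores, not kernel code) of the score-histogram walk that derives the minimum score limit from the effective quota, mirroring the loop in the new damos_adjust_quota() helper below:

#include <stdio.h>

int main(void)
{
	unsigned long histogram[100] = { 0 };	/* bytes of matching regions per score */
	unsigned long esz = 3000;		/* effective quota for this charge window */
	unsigned long cumulated_sz = 0;
	unsigned int score, max_score = 90;

	histogram[90] = 1000;	/* made-up region sizes */
	histogram[70] = 1500;
	histogram[50] = 2000;

	/* Walk from the hottest score down until the quota is covered. */
	for (score = max_score; ; score--) {
		cumulated_sz += histogram[score];
		if (cumulated_sz >= esz || !score)
			break;
	}

	/* Regions scoring below this limit do not get the action this window. */
	printf("min_score = %u\n", score);	/* prints 50 for these numbers */
	return 0;
}
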
Link: https://lkml.kernel.org/r/20221026225943.100429-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 91 ++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3a810c6e26bc6..80d5937fe3373 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -848,59 +848,64 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } -static void kdamond_apply_schemes(struct damon_ctx *c) +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) { + struct damos_quota *quota = &s->quota; struct damon_target *t; - struct damon_region *r, *next_r; - struct damos *s; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; - damon_for_each_scheme(s, c) { - struct damos_quota *quota = &s->quota; - unsigned long cumulated_sz; - unsigned int score, max_score = 0; + if (!quota->ms && !quota->sz) + return; - if (!s->wmarks.activated) - continue; + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } - if (!quota->ms && !quota->sz) - continue; + if (!c->ops.get_scheme_score) + return; - /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + - msecs_to_jiffies( - quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) - s->stat.qt_exceeds++; - quota->total_charged_sz += quota->charged_sz; - quota->charged_from = jiffies; - quota->charged_sz = 0; - damos_set_effective_quota(quota); + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; } + } - if (!c->ops.get_scheme_score) - continue; + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} - /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - if (!__damos_valid_target(r, s)) - continue; - score = c->ops.get_scheme_score( - c, t, r, s); - quota->histogram[score] += damon_sz_region(r); - if (score > max_score) - max_score = score; - } - } +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; - /* Set the min score limit */ - for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; - if (cumulated_sz >= quota->esz || !score) - break; - } - quota->min_score = score; + damon_for_each_scheme(s, c) { + if (!s->wmarks.activated) + continue; + + damos_adjust_quota(c, s); } damon_for_each_target(t, c) { From d9b893527311032ff45efe4900c4282edd3a4b68 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:36 +0000 Subject: [PATCH 605/737] mm/damon/sysfs: use damon_addr_range for region's start and end values DAMON has a struct 
for each address range but DAMON sysfs interface is using the low type (unsigned long) for storing the start and end addresses of regions. Use the dedicated struct for better type safety. Link: https://lkml.kernel.org/r/20221026225943.100429-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 07e5f1bdf025f..a5ef503d84445 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1062,13 +1062,11 @@ static struct kobj_type damon_sysfs_schemes_ktype = { struct damon_sysfs_region { struct kobject kobj; - unsigned long start; - unsigned long end; + struct damon_addr_range ar; }; static struct damon_sysfs_region *damon_sysfs_region_alloc( - unsigned long start, - unsigned long end) + struct damon_addr_range ar) { struct damon_sysfs_region *region = kmalloc(sizeof(*region), GFP_KERNEL); @@ -1076,8 +1074,7 @@ static struct damon_sysfs_region *damon_sysfs_region_alloc( if (!region) return NULL; region->kobj = (struct kobject){}; - region->start = start; - region->end = end; + region->ar = ar; return region; } @@ -1087,7 +1084,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->start); + return sysfs_emit(buf, "%lu\n", region->ar.start); } static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1095,7 +1092,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->start); + int err = kstrtoul(buf, 0, ®ion->ar.start); return err ? err : count; } @@ -1106,7 +1103,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->end); + return sysfs_emit(buf, "%lu\n", region->ar.end); } static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1114,7 +1111,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->end); + int err = kstrtoul(buf, 0, ®ion->ar.end); return err ? 
err : count; } @@ -1187,7 +1184,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, regions->regions_arr = regions_arr; for (i = 0; i < nr_regions; i++) { - region = damon_sysfs_region_alloc(0, 0); + region = damon_sysfs_region_alloc((struct damon_addr_range){}); if (!region) { damon_sysfs_regions_rm_dirs(regions); return -ENOMEM; @@ -2147,11 +2144,11 @@ static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_region *sys_region = sysfs_regions->regions_arr[i]; - if (sys_region->start > sys_region->end) + if (sys_region->ar.start > sys_region->ar.end) goto out; - ranges[i].start = sys_region->start; - ranges[i].end = sys_region->end; + ranges[i].start = sys_region->ar.start; + ranges[i].end = sys_region->ar.end; if (i == 0) continue; if (ranges[i - 1].end > ranges[i].start) From eb83806cdd711161397b7ffad5c126112774d297 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:37 +0000 Subject: [PATCH 606/737] mm/damon/sysfs: remove parameters of damon_sysfs_region_alloc() 'damon_sysfs_region_alloc()' is always called with zero-filled 'struct damon_addr_range', because the start and end addresses should set by users. Remove unnecessary parameters of the function and simplify the body by using 'kzalloc()'. Link: https://lkml.kernel.org/r/20221026225943.100429-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a5ef503d84445..f3d7b34ea0ab8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1065,17 +1065,9 @@ struct damon_sysfs_region { struct damon_addr_range ar; }; -static struct damon_sysfs_region *damon_sysfs_region_alloc( - struct damon_addr_range ar) +static struct damon_sysfs_region *damon_sysfs_region_alloc(void) { - struct damon_sysfs_region *region = kmalloc(sizeof(*region), - GFP_KERNEL); - - if (!region) - return NULL; - region->kobj = (struct kobject){}; - region->ar = ar; - return region; + return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL); } static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1184,7 +1176,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, regions->regions_arr = regions_arr; for (i = 0; i < nr_regions; i++) { - region = damon_sysfs_region_alloc((struct damon_addr_range){}); + region = damon_sysfs_region_alloc(); if (!region) { damon_sysfs_regions_rm_dirs(regions); return -ENOMEM; From 1dcc01bf75ef6d945bb7614910081d7c28e8dd46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:38 +0000 Subject: [PATCH 607/737] mm/damon/sysfs: move sysfs_lock to common module DAMON sysfs interface is implemented in a single file, sysfs.c, which has about 2,800 lines of code. As the interface is hierarchical and some of the code can be reused by different hierarchies, it would make more sense to split out the implementation into common parts and different parts in multiple files. As the beginning of the work, create files for common code and move the global mutex for directories modifications protection into the new file. 
Link: https://lkml.kernel.org/r/20221026225943.100429-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.c | 11 +++++++++++ mm/damon/sysfs-common.h | 11 +++++++++++ mm/damon/sysfs.c | 4 +--- 4 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 mm/damon/sysfs-common.c create mode 100644 mm/damon/sysfs-common.h diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 3e6b8ad73858a..f8d535a6253ba 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 0000000000000..9dc743868d5b6 --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 0000000000000..745a918b94f5d --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include +#include + +extern struct mutex damon_sysfs_lock; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f3d7b34ea0ab8..a847b9159718a 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -5,13 +5,11 @@ * Copyright (c) 2022 SeongJae Park */ -#include -#include #include #include #include -static DEFINE_MUTEX(damon_sysfs_lock); +#include "sysfs-common.h" /* * unsigned long range directory From 47d3e38b6ceb892672a8d210b52d4241304cae6f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:39 +0000 Subject: [PATCH 608/737] mm/damon/sysfs: move unsigned long range directory to common module The implementation of unsigned long type range directories can be reused by multiple DAMON sysfs directories including those for DAMON-based Operation Schemes and the range of number of monitoring regions. Move the code into the files for DAMON sysfs common logics. 
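As a hedged sketch of the reuse this move enables (the directory name, parent kobject and wrapper function here are hypothetical, not part of the patch), another DAMON sysfs directory could build a min/max range subdirectory from the shared helpers like this:

#include "sysfs-common.h"

/* Hypothetical example: add a "nr_regions" min/max range directory under an
 * existing parent kobject, reusing the helpers moved to sysfs-common. */
static int example_add_nr_regions_range(struct kobject *parent,
		struct damon_sysfs_ul_range **range_ptr)
{
	struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(10, 1000);
	int err;

	if (!range)
		return -ENOMEM;
	err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
			parent, "nr_regions");
	if (err) {
		kobject_put(&range->kobj);
		return err;
	}
	*range_ptr = range;
	return 0;
}
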
Link: https://lkml.kernel.org/r/20221026225943.100429-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.c | 96 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs-common.h | 13 ++++++ mm/damon/sysfs.c | 100 ---------------------------------------- 3 files changed, 109 insertions(+), 100 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 9dc743868d5b6..52bebf242f742 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -5,7 +5,103 @@ * Author: SeongJae Park */ +#include + #include "sysfs-common.h" DEFINE_MUTEX(damon_sysfs_lock); +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 745a918b94f5d..56e6a99e353b7 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -9,3 +9,16 @@ #include extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a847b9159718a..6774a669962e7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,106 +11,6 @@ 
#include "sysfs-common.h" -/* - * unsigned long range directory - */ - -struct damon_sysfs_ul_range { - struct kobject kobj; - unsigned long min; - unsigned long max; -}; - -static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( - unsigned long min, - unsigned long max) -{ - struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), - GFP_KERNEL); - - if (!range) - return NULL; - range->kobj = (struct kobject){}; - range->min = min; - range->max = max; - - return range; -} - -static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->min); -} - -static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long min; - int err; - - err = kstrtoul(buf, 0, &min); - if (err) - return err; - - range->min = min; - return count; -} - -static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->max); -} - -static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long max; - int err; - - err = kstrtoul(buf, 0, &max); - if (err) - return err; - - range->max = max; - return count; -} - -static void damon_sysfs_ul_range_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); -} - -static struct kobj_attribute damon_sysfs_ul_range_min_attr = - __ATTR_RW_MODE(min, 0600); - -static struct kobj_attribute damon_sysfs_ul_range_max_attr = - __ATTR_RW_MODE(max, 0600); - -static struct attribute *damon_sysfs_ul_range_attrs[] = { - &damon_sysfs_ul_range_min_attr.attr, - &damon_sysfs_ul_range_max_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_ul_range); - -static struct kobj_type damon_sysfs_ul_range_ktype = { - .release = damon_sysfs_ul_range_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_ul_range_groups, -}; - /* * schemes/stats directory */ From 4e530c0138811ffc82da7cea12a1959dd7ad3b8a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:40 +0000 Subject: [PATCH 609/737] mm/damon/sysfs: split out kdamond-independent schemes stats update logic into a new function 'damon_sysfs_schemes_update_stats()' is coupled with both damon_sysfs_kdamond and damon_sysfs_schemes. It's a wide range of types dependency. It makes splitting the logics a little bit distracting. Split the function so that each function is coupled with smaller range of types. Link: https://lkml.kernel.org/r/20221026225943.100429-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6774a669962e7..836df19a7d86c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2246,25 +2246,13 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } -/* - * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. 
- * @kdamond: The kobject wrapper that associated to the kdamond thread. - * - * This function reads the schemes stats of specific kdamond and update the - * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. - */ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_schemes *sysfs_schemes; struct damos *scheme; int schemes_idx = 0; - if (!ctx) - return -EINVAL; - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_stats *sysfs_stats; @@ -2279,6 +2267,25 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) sysfs_stats->sz_applied = scheme->stat.sz_applied; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } +} + +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes stats of specific kdamond and update the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } From 74694420fd500d90e706353ba3cf0ec658f2b2c9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:41 +0000 Subject: [PATCH 610/737] mm/damon/sysfs: split out schemes directory implementation to separate file DAMON sysfs interface for 'schemes' directory is implemented using about one thousand lines of code. It has no strong dependency with other parts of its file, so split it out to another file for better code management. 
Link: https://lkml.kernel.org/r/20221026225943.100429-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.h | 22 + mm/damon/sysfs-schemes.c | 1068 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 1064 ------------------------------------- 4 files changed, 1091 insertions(+), 1065 deletions(-) create mode 100644 mm/damon/sysfs-schemes.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f8d535a6253ba..1e86f5253d7ff 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 56e6a99e353b7..4626b27844047 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -22,3 +22,25 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( void damon_sysfs_ul_range_release(struct kobject *kobj); extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 0000000000000..9509d5c1e7fce --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1068 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include + +#include "sysfs-common.h" + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t 
qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = 
kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, 
char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} 
+ +static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = 
damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "lru_prio", + "lru_deprio", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put(&quotas->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype,
&scheme->kobj, + "watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + return 0; + +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + 
int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + return damon_new_scheme(&pattern, sysfs_scheme->action, &quota, + &wmarks); +} + +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ +
struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; + scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; +} + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 836df19a7d86c..284daf274b3ed 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,949 +11,6 @@ #include "sysfs-common.h" -/* - * schemes/stats directory - */ - -struct damon_sysfs_stats { - struct kobject kobj; - unsigned long nr_tried; - unsigned long sz_tried; - unsigned long nr_applied; - unsigned long sz_applied; - unsigned long qt_exceeds; -}; - -static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); -} - -static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_tried); -} - -static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - 
struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_tried); -} - -static ssize_t nr_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_applied); -} - -static ssize_t sz_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_applied); -} - -static ssize_t qt_exceeds_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); -} - -static void damon_sysfs_stats_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); -} - -static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = - __ATTR_RO_MODE(nr_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = - __ATTR_RO_MODE(sz_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = - __ATTR_RO_MODE(nr_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = - __ATTR_RO_MODE(sz_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = - __ATTR_RO_MODE(qt_exceeds, 0400); - -static struct attribute *damon_sysfs_stats_attrs[] = { - &damon_sysfs_stats_nr_tried_attr.attr, - &damon_sysfs_stats_sz_tried_attr.attr, - &damon_sysfs_stats_nr_applied_attr.attr, - &damon_sysfs_stats_sz_applied_attr.attr, - &damon_sysfs_stats_qt_exceeds_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_stats); - -static struct kobj_type damon_sysfs_stats_ktype = { - .release = damon_sysfs_stats_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_stats_groups, -}; - -/* - * watermarks directory - */ - -struct damon_sysfs_watermarks { - struct kobject kobj; - enum damos_wmark_metric metric; - unsigned long interval_us; - unsigned long high; - unsigned long mid; - unsigned long low; -}; - -static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( - enum damos_wmark_metric metric, unsigned long interval_us, - unsigned long high, unsigned long mid, unsigned long low) -{ - struct damon_sysfs_watermarks *watermarks = kmalloc( - sizeof(*watermarks), GFP_KERNEL); - - if (!watermarks) - return NULL; - watermarks->kobj = (struct kobject){}; - watermarks->metric = metric; - watermarks->interval_us = interval_us; - watermarks->high = high; - watermarks->mid = mid; - watermarks->low = low; - return watermarks; -} - -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", -}; - -static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); -} - -static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; - - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, 
damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; - return count; - } - } - return -EINVAL; -} - -static ssize_t interval_us_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->interval_us); -} - -static ssize_t interval_us_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->interval_us); - - return err ? err : count; -} - -static ssize_t high_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->high); -} - -static ssize_t high_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->high); - - return err ? err : count; -} - -static ssize_t mid_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->mid); -} - -static ssize_t mid_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->mid); - - return err ? err : count; -} - -static ssize_t low_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->low); -} - -static ssize_t low_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->low); - - return err ? 
err : count; -} - -static void damon_sysfs_watermarks_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); -} - -static struct kobj_attribute damon_sysfs_watermarks_metric_attr = - __ATTR_RW_MODE(metric, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = - __ATTR_RW_MODE(interval_us, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_high_attr = - __ATTR_RW_MODE(high, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_mid_attr = - __ATTR_RW_MODE(mid, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_low_attr = - __ATTR_RW_MODE(low, 0600); - -static struct attribute *damon_sysfs_watermarks_attrs[] = { - &damon_sysfs_watermarks_metric_attr.attr, - &damon_sysfs_watermarks_interval_us_attr.attr, - &damon_sysfs_watermarks_high_attr.attr, - &damon_sysfs_watermarks_mid_attr.attr, - &damon_sysfs_watermarks_low_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_watermarks); - -static struct kobj_type damon_sysfs_watermarks_ktype = { - .release = damon_sysfs_watermarks_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_watermarks_groups, -}; - -/* - * scheme/weights directory - */ - -struct damon_sysfs_weights { - struct kobject kobj; - unsigned int sz; - unsigned int nr_accesses; - unsigned int age; -}; - -static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, - unsigned int nr_accesses, unsigned int age) -{ - struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), - GFP_KERNEL); - - if (!weights) - return NULL; - weights->kobj = (struct kobject){}; - weights->sz = sz; - weights->nr_accesses = nr_accesses; - weights->age = age; - return weights; -} - -static ssize_t sz_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->sz); -} - -static ssize_t sz_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->sz); - - return err ? err : count; -} - -static ssize_t nr_accesses_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->nr_accesses); -} - -static ssize_t nr_accesses_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->nr_accesses); - - return err ? err : count; -} - -static ssize_t age_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->age); -} - -static ssize_t age_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->age); - - return err ? 
err : count; -} - -static void damon_sysfs_weights_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); -} - -static struct kobj_attribute damon_sysfs_weights_sz_attr = - __ATTR_RW_MODE(sz_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = - __ATTR_RW_MODE(nr_accesses_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_age_attr = - __ATTR_RW_MODE(age_permil, 0600); - -static struct attribute *damon_sysfs_weights_attrs[] = { - &damon_sysfs_weights_sz_attr.attr, - &damon_sysfs_weights_nr_accesses_attr.attr, - &damon_sysfs_weights_age_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_weights); - -static struct kobj_type damon_sysfs_weights_ktype = { - .release = damon_sysfs_weights_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_weights_groups, -}; - -/* - * quotas directory - */ - -struct damon_sysfs_quotas { - struct kobject kobj; - struct damon_sysfs_weights *weights; - unsigned long ms; - unsigned long sz; - unsigned long reset_interval_ms; -}; - -static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); -} - -static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) -{ - struct damon_sysfs_weights *weights; - int err; - - weights = damon_sysfs_weights_alloc(0, 0, 0); - if (!weights) - return -ENOMEM; - - err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, - "as->kobj, "weights"); - if (err) - kobject_put(&weights->kobj); - else - quotas->weights = weights; - return err; -} - -static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) -{ - kobject_put("as->weights->kobj); -} - -static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->ms); -} - -static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->ms); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->sz); -} - -static ssize_t bytes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->sz); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t reset_interval_ms_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); -} - -static ssize_t reset_interval_ms_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->reset_interval_ms); - - if (err) - return -EINVAL; - return count; -} - -static void damon_sysfs_quotas_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); -} - 
-static struct kobj_attribute damon_sysfs_quotas_ms_attr = - __ATTR_RW_MODE(ms, 0600); - -static struct kobj_attribute damon_sysfs_quotas_sz_attr = - __ATTR_RW_MODE(bytes, 0600); - -static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = - __ATTR_RW_MODE(reset_interval_ms, 0600); - -static struct attribute *damon_sysfs_quotas_attrs[] = { - &damon_sysfs_quotas_ms_attr.attr, - &damon_sysfs_quotas_sz_attr.attr, - &damon_sysfs_quotas_reset_interval_ms_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_quotas); - -static struct kobj_type damon_sysfs_quotas_ktype = { - .release = damon_sysfs_quotas_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_quotas_groups, -}; - -/* - * access_pattern directory - */ - -struct damon_sysfs_access_pattern { - struct kobject kobj; - struct damon_sysfs_ul_range *sz; - struct damon_sysfs_ul_range *nr_accesses; - struct damon_sysfs_ul_range *age; -}; - -static -struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) -{ - struct damon_sysfs_access_pattern *access_pattern = - kmalloc(sizeof(*access_pattern), GFP_KERNEL); - - if (!access_pattern) - return NULL; - access_pattern->kobj = (struct kobject){}; - return access_pattern; -} - -static int damon_sysfs_access_pattern_add_range_dir( - struct damon_sysfs_access_pattern *access_pattern, - struct damon_sysfs_ul_range **range_dir_ptr, - char *name) -{ - struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); - int err; - - if (!range) - return -ENOMEM; - err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, - &access_pattern->kobj, name); - if (err) - kobject_put(&range->kobj); - else - *range_dir_ptr = range; - return err; -} - -static int damon_sysfs_access_pattern_add_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - int err; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->sz, "sz"); - if (err) - goto put_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->nr_accesses, "nr_accesses"); - if (err) - goto put_nr_accesses_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->age, "age"); - if (err) - goto put_age_nr_accesses_sz_out; - return 0; - -put_age_nr_accesses_sz_out: - kobject_put(&access_pattern->age->kobj); - access_pattern->age = NULL; -put_nr_accesses_sz_out: - kobject_put(&access_pattern->nr_accesses->kobj); - access_pattern->nr_accesses = NULL; -put_sz_out: - kobject_put(&access_pattern->sz->kobj); - access_pattern->sz = NULL; - return err; -} - -static void damon_sysfs_access_pattern_rm_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - kobject_put(&access_pattern->sz->kobj); - kobject_put(&access_pattern->nr_accesses->kobj); - kobject_put(&access_pattern->age->kobj); -} - -static void damon_sysfs_access_pattern_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); -} - -static struct attribute *damon_sysfs_access_pattern_attrs[] = { - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); - -static struct kobj_type damon_sysfs_access_pattern_ktype = { - .release = damon_sysfs_access_pattern_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_access_pattern_groups, -}; - -/* - * scheme directory - */ - -struct damon_sysfs_scheme { - struct kobject kobj; - enum damos_action action; - struct damon_sysfs_access_pattern *access_pattern; - struct damon_sysfs_quotas *quotas; - struct 
damon_sysfs_watermarks *watermarks; - struct damon_sysfs_stats *stats; -}; - -/* This should match with enum damos_action */ -static const char * const damon_sysfs_damos_action_strs[] = { - "willneed", - "cold", - "pageout", - "hugepage", - "nohugepage", - "lru_prio", - "lru_deprio", - "stat", -}; - -static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( - enum damos_action action) -{ - struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), - GFP_KERNEL); - - if (!scheme) - return NULL; - scheme->kobj = (struct kobject){}; - scheme->action = action; - return scheme; -} - -static int damon_sysfs_scheme_set_access_pattern( - struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_access_pattern *access_pattern; - int err; - - access_pattern = damon_sysfs_access_pattern_alloc(); - if (!access_pattern) - return -ENOMEM; - err = kobject_init_and_add(&access_pattern->kobj, - &damon_sysfs_access_pattern_ktype, &scheme->kobj, - "access_pattern"); - if (err) - goto out; - err = damon_sysfs_access_pattern_add_dirs(access_pattern); - if (err) - goto out; - scheme->access_pattern = access_pattern; - return 0; - -out: - kobject_put(&access_pattern->kobj); - return err; -} - -static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); - int err; - - if (!quotas) - return -ENOMEM; - err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, - &scheme->kobj, "quotas"); - if (err) - goto out; - err = damon_sysfs_quotas_add_dirs(quotas); - if (err) - goto out; - scheme->quotas = quotas; - return 0; - -out: - kobject_put("as->kobj); - return err; -} - -static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_watermarks *watermarks = - damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); - int err; - - if (!watermarks) - return -ENOMEM; - err = kobject_init_and_add(&watermarks->kobj, - &damon_sysfs_watermarks_ktype, &scheme->kobj, - "watermarks"); - if (err) - kobject_put(&watermarks->kobj); - else - scheme->watermarks = watermarks; - return err; -} - -static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); - int err; - - if (!stats) - return -ENOMEM; - err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, - &scheme->kobj, "stats"); - if (err) - kobject_put(&stats->kobj); - else - scheme->stats = stats; - return err; -} - -static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) -{ - int err; - - err = damon_sysfs_scheme_set_access_pattern(scheme); - if (err) - return err; - err = damon_sysfs_scheme_set_quotas(scheme); - if (err) - goto put_access_pattern_out; - err = damon_sysfs_scheme_set_watermarks(scheme); - if (err) - goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); - if (err) - goto put_watermarks_quotas_access_pattern_out; - return 0; - -put_watermarks_quotas_access_pattern_out: - kobject_put(&scheme->watermarks->kobj); - scheme->watermarks = NULL; -put_quotas_access_pattern_out: - kobject_put(&scheme->quotas->kobj); - scheme->quotas = NULL; -put_access_pattern_out: - kobject_put(&scheme->access_pattern->kobj); - scheme->access_pattern = NULL; - return err; -} - -static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) -{ - damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); - kobject_put(&scheme->access_pattern->kobj); - damon_sysfs_quotas_rm_dirs(scheme->quotas); - 
kobject_put(&scheme->quotas->kobj); - kobject_put(&scheme->watermarks->kobj); - kobject_put(&scheme->stats->kobj); -} - -static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); -} - -static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - enum damos_action action; - - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; - return count; - } - } - return -EINVAL; -} - -static void damon_sysfs_scheme_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); -} - -static struct kobj_attribute damon_sysfs_scheme_action_attr = - __ATTR_RW_MODE(action, 0600); - -static struct attribute *damon_sysfs_scheme_attrs[] = { - &damon_sysfs_scheme_action_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_scheme); - -static struct kobj_type damon_sysfs_scheme_ktype = { - .release = damon_sysfs_scheme_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_scheme_groups, -}; - -/* - * schemes directory - */ - -struct damon_sysfs_schemes { - struct kobject kobj; - struct damon_sysfs_scheme **schemes_arr; - int nr; -}; - -static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); -} - -static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) -{ - struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; - int i; - - for (i = 0; i < schemes->nr; i++) { - damon_sysfs_scheme_rm_dirs(schemes_arr[i]); - kobject_put(&schemes_arr[i]->kobj); - } - schemes->nr = 0; - kfree(schemes_arr); - schemes->schemes_arr = NULL; -} - -static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, - int nr_schemes) -{ - struct damon_sysfs_scheme **schemes_arr, *scheme; - int err, i; - - damon_sysfs_schemes_rm_dirs(schemes); - if (!nr_schemes) - return 0; - - schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), - GFP_KERNEL | __GFP_NOWARN); - if (!schemes_arr) - return -ENOMEM; - schemes->schemes_arr = schemes_arr; - - for (i = 0; i < nr_schemes; i++) { - scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); - if (!scheme) { - damon_sysfs_schemes_rm_dirs(schemes); - return -ENOMEM; - } - - err = kobject_init_and_add(&scheme->kobj, - &damon_sysfs_scheme_ktype, &schemes->kobj, - "%d", i); - if (err) - goto out; - err = damon_sysfs_scheme_add_dirs(scheme); - if (err) - goto out; - - schemes_arr[i] = scheme; - schemes->nr++; - } - return 0; - -out: - damon_sysfs_schemes_rm_dirs(schemes); - kobject_put(&scheme->kobj); - return err; -} - -static ssize_t nr_schemes_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); - - return sysfs_emit(buf, "%d\n", schemes->nr); -} - -static ssize_t nr_schemes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_schemes *schemes; - int nr, err = kstrtoint(buf, 0, &nr); - - if (err) - return err; - if (nr < 0) - return -EINVAL; - - schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); - - if 
(!mutex_trylock(&damon_sysfs_lock)) - return -EBUSY; - err = damon_sysfs_schemes_add_dirs(schemes, nr); - mutex_unlock(&damon_sysfs_lock); - if (err) - return err; - return count; -} - -static void damon_sysfs_schemes_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); -} - -static struct kobj_attribute damon_sysfs_schemes_nr_attr = - __ATTR_RW_MODE(nr_schemes, 0600); - -static struct attribute *damon_sysfs_schemes_attrs[] = { - &damon_sysfs_schemes_nr_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_schemes); - -static struct kobj_type damon_sysfs_schemes_ktype = { - .release = damon_sysfs_schemes_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_schemes_groups, -}; - /* * init region directory */ @@ -2133,104 +1190,6 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } -static struct damos *damon_sysfs_mk_scheme( - struct damon_sysfs_scheme *sysfs_scheme) -{ - struct damon_sysfs_access_pattern *access_pattern = - sysfs_scheme->access_pattern; - struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; - struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; - struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - - struct damos_access_pattern pattern = { - .min_sz_region = access_pattern->sz->min, - .max_sz_region = access_pattern->sz->max, - .min_nr_accesses = access_pattern->nr_accesses->min, - .max_nr_accesses = access_pattern->nr_accesses->max, - .min_age_region = access_pattern->age->min, - .max_age_region = access_pattern->age->max, - }; - struct damos_quota quota = { - .ms = sysfs_quotas->ms, - .sz = sysfs_quotas->sz, - .reset_interval = sysfs_quotas->reset_interval_ms, - .weight_sz = sysfs_weights->sz, - .weight_nr_accesses = sysfs_weights->nr_accesses, - .weight_age = sysfs_weights->age, - }; - struct damos_watermarks wmarks = { - .metric = sysfs_wmarks->metric, - .interval = sysfs_wmarks->interval_us, - .high = sysfs_wmarks->high, - .mid = sysfs_wmarks->mid, - .low = sysfs_wmarks->low, - }; - - return damon_new_scheme(&pattern, sysfs_scheme->action, "a, - &wmarks); -} - -static void damon_sysfs_update_scheme(struct damos *scheme, - struct damon_sysfs_scheme *sysfs_scheme) -{ - struct damon_sysfs_access_pattern *access_pattern = - sysfs_scheme->access_pattern; - struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; - struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; - struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - - scheme->pattern.min_sz_region = access_pattern->sz->min; - scheme->pattern.max_sz_region = access_pattern->sz->max; - scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; - scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; - scheme->pattern.min_age_region = access_pattern->age->min; - scheme->pattern.max_age_region = access_pattern->age->max; - - scheme->action = sysfs_scheme->action; - - scheme->quota.ms = sysfs_quotas->ms; - scheme->quota.sz = sysfs_quotas->sz; - scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; - scheme->quota.weight_sz = sysfs_weights->sz; - scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; - scheme->quota.weight_age = sysfs_weights->age; - - scheme->wmarks.metric = sysfs_wmarks->metric; - scheme->wmarks.interval = sysfs_wmarks->interval_us; - scheme->wmarks.high = sysfs_wmarks->high; - scheme->wmarks.mid = sysfs_wmarks->mid; - scheme->wmarks.low = sysfs_wmarks->low; -} - -static int 
damon_sysfs_set_schemes(struct damon_ctx *ctx, - struct damon_sysfs_schemes *sysfs_schemes) -{ - struct damos *scheme, *next; - int i = 0; - - damon_for_each_scheme_safe(scheme, next, ctx) { - if (i < sysfs_schemes->nr) - damon_sysfs_update_scheme(scheme, - sysfs_schemes->schemes_arr[i]); - else - damon_destroy_scheme(scheme); - i++; - } - - for (; i < sysfs_schemes->nr; i++) { - struct damos *scheme, *next; - - scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); - if (!scheme) { - damon_for_each_scheme_safe(scheme, next, ctx) - damon_destroy_scheme(scheme); - return -ENOMEM; - } - damon_add_scheme(ctx, scheme); - } - return 0; -} - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; @@ -2246,29 +1205,6 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } -static void damon_sysfs_schemes_update_stats( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - struct damos *scheme; - int schemes_idx = 0; - - damon_for_each_scheme(scheme, ctx) { - struct damon_sysfs_stats *sysfs_stats; - - /* user could have removed the scheme sysfs dir */ - if (schemes_idx >= sysfs_schemes->nr) - break; - - sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; - sysfs_stats->nr_tried = scheme->stat.nr_tried; - sysfs_stats->sz_tried = scheme->stat.sz_tried; - sysfs_stats->nr_applied = scheme->stat.nr_applied; - sysfs_stats->sz_applied = scheme->stat.sz_applied; - sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; - } -} - /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. * @kdamond: The kobject wrapper that associated to the kdamond thread. From 79cb0594f5c173fe4ecb97ac2dc5c76b8e2c8321 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:42 +0000 Subject: [PATCH 611/737] mm/damon/modules: deduplicate init steps for DAMON context setup DAMON_RECLAIM and DAMON_LRU_SORT has duplicated code for DAMON context and target initializations. Deduplicate the part by implementing a function for the initialization in 'modules-common.c' and using it. 
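As an illustration, after this change a DAMON module's init function boils down to roughly the following sketch (the module name and the two static variables below are placeholders for illustration only; just the damon_modules_new_paddr_ctx_target() helper comes from this patch):

	static struct damon_ctx *ctx;
	static struct damon_target *target;

	static int __init my_damon_module_init(void)
	{
		/* allocate a paddr-ops context with a single target, or fail */
		int err = damon_modules_new_paddr_ctx_target(&ctx, &target);

		if (err)
			return err;
		/* module specific setup (callbacks, schemes, ...) goes here */
		return 0;
	}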
Link: https://lkml.kernel.org/r/20221026225943.100429-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Makefile | 4 ++-- mm/damon/lru_sort.c | 17 +++------------- mm/damon/modules-common.c | 42 +++++++++++++++++++++++++++++++++++++++ mm/damon/modules-common.h | 3 +++ mm/damon/reclaim.c | 17 +++------------- 5 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 mm/damon/modules-common.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 1e86f5253d7ff..f7add3f4aa793 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o -obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o -obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o +obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index efbc2bda8b9cd..a1896c5acfe97 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -314,25 +314,14 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) static int __init damon_lru_sort_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_lru_sort_timer, 0); damon_lru_sort_initialized = true; diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 0000000000000..b2381a8466ecf --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. 
+ * @ctxp: Pointer to save the pointer to the newly created context + * @targetp: Pointer to save the pointer to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 5a4921851d326..f49cdb4170051 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -44,3 +44,6 @@ 0400); \ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 162c9b1ca00fd..3173f373435c2 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -256,25 +256,14 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) static int __init damon_reclaim_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_reclaim_timer, 0); damon_reclaim_initialized = true; From 711988450bacc4c93752522ad6bbd9c7b5a0eec5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:47 +0000 Subject: [PATCH 612/737] mm/damon/reclaim: enable and disable synchronously Patch series "mm/damon/reclaim,lru_sort: enable/disable synchronously". Writing a value to DAMON_RECLAIM and DAMON_LRU_SORT's 'enabled' parameters turns DAMON on or off in an asynchronous way. This means the parameter cannot be used to read the current status of them. The 'kdamond_pid' parameter should be used instead for that purpose. The documentation can easily be read as if the parameter works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty. There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document. The first and second patches change the behavior of the 'enabled' parameter for DAMON_RECLAIM and add a selftest for the changed behavior, respectively. The following two patches make the same changes for DAMON_LRU_SORT. This patch (of 4): Writing a value to DAMON_RECLAIM's 'enabled' parameter turns DAMON on or off in an asynchronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. The 'kdamond_pid' parameter should be used instead for that purpose. The documentation can easily be read as if the parameter works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty. There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.
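For context, writes to the 'enabled' parameter reach the store function above through a custom kernel_param_ops. The existing declarations in reclaim.c look roughly like the following paraphrased sketch (not changed by this patch; the exact permission bits and description text may differ):

	static const struct kernel_param_ops enabled_param_ops = {
		.set = damon_reclaim_enabled_store,
		.get = param_get_bool,
	};

	module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);

With this change, a write to /sys/module/damon_reclaim/parameters/enabled turns DAMON_RECLAIM on or off before the write returns, so the parameter value always reflects the current status.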
Link: https://lkml.kernel.org/r/20221025173650.90624-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221025173650.90624-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 53 ++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3173f373435c2..408b9f7688ded 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "modules-common.h" @@ -183,38 +182,31 @@ static int damon_reclaim_turn(bool on) return 0; } -static struct delayed_work damon_reclaim_timer; -static void damon_reclaim_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_reclaim_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); - -static bool damon_reclaim_initialized; - static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; - if (rc < 0) - return rc; + err = strtobool(val, &enable); + if (err) + return err; - /* system_wq might not initialized yet */ - if (!damon_reclaim_initialized) - return rc; + if (is_enabled == enable) + return 0; - schedule_delayed_work(&damon_reclaim_timer, 0); - return 0; + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; + + err = damon_reclaim_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -264,10 +256,11 @@ static int __init damon_reclaim_init(void) ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - schedule_delayed_work(&damon_reclaim_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_reclaim_turn(true); - damon_reclaim_initialized = true; - return 0; + return err; } module_init(damon_reclaim_init); From 1724a1857507409bbe31a99941f1dcc93806ce9d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:48 +0000 Subject: [PATCH 613/737] selftests/damon: add tests for DAMON_RECLAIM's enabled parameter Add simple test cases for DAMON_RECLAIM's 'enabled' parameter. Those tests are focusing on the synchronous behavior of DAMON_RECLAIM enabling and disabling. 
Link: https://lkml.kernel.org/r/20221025173650.90624-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + tools/testing/selftests/damon/reclaim.sh | 42 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/damon/reclaim.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index a1fa2eff8192f..dbbf18cb3e6b5 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -8,5 +8,6 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh +TEST_PROGS += reclaim.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/reclaim.sh b/tools/testing/selftests/damon/reclaim.sh new file mode 100644 index 0000000000000..78dbc2334cbe1 --- /dev/null +++ b/tools/testing/selftests/damon/reclaim.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_reclaim_enabled="/sys/module/damon_reclaim/parameters/enabled" +if [ ! -f "$damon_reclaim_enabled" ] +then + echo "No 'enabled' file. Maybe DAMON_RECLAIM not built" + exit $ksft_skip +fi + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "Another kdamond is running" + exit $ksft_skip +fi + +echo Y > "$damon_reclaim_enabled" + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 1 ] +then + echo "kdamond is not turned on" + exit 1 +fi + +echo N > "$damon_reclaim_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "kdamond is not turned off" + exit 1 +fi From 295994926b2b19ba1de53ea6a6252075088f8254 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:49 +0000 Subject: [PATCH 614/737] mm/damon/lru_sort: enable and disable synchronously Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an ansychronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. 'kdamond_pid' parameter should be used instead for the purpose. The documentation is easy to be read as it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty. There's no real reason to have the asynchronous behavior, though. Simply make the parameter works synchronously, rather than updating the document. 
Link: https://lkml.kernel.org/r/20221025173650.90624-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 51 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a1896c5acfe97..e39fef0135c0e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "modules-common.h" @@ -237,38 +236,31 @@ static int damon_lru_sort_turn(bool on) return 0; } -static struct delayed_work damon_lru_sort_timer; -static void damon_lru_sort_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_lru_sort_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); - -static bool damon_lru_sort_initialized; - static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; + + err = strtobool(val, &enable); + if (err) + return err; - if (rc < 0) - return rc; + if (is_enabled == enable) + return 0; - if (!damon_lru_sort_initialized) - return rc; + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; - schedule_delayed_work(&damon_lru_sort_timer, 0); + err = damon_lru_sort_turn(enable); + if (err) + return err; - return 0; +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -322,10 +314,11 @@ static int __init damon_lru_sort_init(void) ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - schedule_delayed_work(&damon_lru_sort_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_lru_sort_turn(true); - damon_lru_sort_initialized = true; - return 0; + return err; } module_init(damon_lru_sort_init); From af82f6cd620b0275cf3da01b426afc633179c153 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:50 +0000 Subject: [PATCH 615/737] selftests/damon: add tests for DAMON_LRU_SORT's enabled parameter Add simple test cases for DAMON_LRU_SORT's 'enabled' parameter. Those tests are focusing on the synchronous behavior of DAMON_RECLAIM enabling and disabling. 
Link: https://lkml.kernel.org/r/20221025173650.90624-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- tools/testing/selftests/damon/lru_sort.sh | 41 +++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/lru_sort.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index dbbf18cb3e6b5..af490acc53485 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -8,6 +8,6 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh -TEST_PROGS += reclaim.sh +TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/lru_sort.sh b/tools/testing/selftests/damon/lru_sort.sh new file mode 100644 index 0000000000000..61b80197c8966 --- /dev/null +++ b/tools/testing/selftests/damon/lru_sort.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_lru_sort_enabled="/sys/module/damon_lru_sort/parameters/enabled" +if [ ! -f "$damon_lru_sort_enabled" ] +then + echo "No 'enabled' file. Maybe DAMON_LRU_SORT not built" + exit $ksft_skip +fi + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "Another kdamond is running" + exit $ksft_skip +fi + +echo Y > "$damon_lru_sort_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 1 ] +then + echo "kdamond is not turned on" + exit 1 +fi + +echo N > "$damon_lru_sort_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "kdamond is not turned off" + exit 1 +fi From 2e95e25492c1be2700ebe81e7d5d3b29c3f6bcaa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 24 Oct 2022 17:46:18 +0000 Subject: [PATCH 616/737] Docs/admin-guide/mm/damon/usage: describe the rules of sysfs region directories Patch series "Docs/admin-buide/mm/damon/usage: minor fixes". DAMON usage document contains an unclear description and a wrong usage example. This patchset fixes the two minor problems. This patch (of 2): Target region directories of DAMON sysfs interface should contain no overlap and sorted by the address, but not clearly documented. Actually, a user had an issue[1] due to the poor documentation. Add clear description of it on the usage document. [1] https://lore.kernel.org/damon/CAEZ6=UNUcH2BvJj++OrT=XQLdkidU79wmCO=tantSOB36pPNTg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20221024174619.15600-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221024174619.15600-2-sj@kernel.org Signed-off-by: SeongJae Park Reported-by: Vinicius Petrucci Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 6e0402f84a5e9..b77cc49e2a02b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -235,6 +235,9 @@ In each region directory, you will find two files (``start`` and ``end``). 
You can set and get the start and end addresses of the initial monitoring target region by writing to and reading from the files, respectively. +Each region should not overlap with others. ``end`` of directory ``N`` should +be equal or smaller than ``start`` of directory ``N+1``. + contexts//schemes/ --------------------- From 379dc84242f10ef729a5961ba87922bdbe0c6576 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 24 Oct 2022 17:46:19 +0000 Subject: [PATCH 617/737] Docs/admin-guide/mm/damon/usage: fix wrong usage example of init_regions file DAMON debugfs interface assumes the users will write all inputs at once. However, redirecting a string of multiple lines sometimes end up writing line by line. Therefore, the example usage of 'init_regions' file, which writes input as a string of multiple lines can fail. Fix it to use a single line string instead. Also update the description of the usage to not assume users will write inputs in multiple lines. Link: https://lkml.kernel.org/r/20221024174619.15600-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Vinicius Petrucci Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index b77cc49e2a02b..ab480194f9bac 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -468,8 +468,9 @@ regions in case of physical memory monitoring. Therefore, users should set the monitoring target regions by themselves. In such cases, users can explicitly set the initial monitoring target regions -as they want, by writing proper values to the ``init_regions`` file. Each line -of the input should represent one region in below form.:: +as they want, by writing proper values to the ``init_regions`` file. The input +should be a sequence of three integers separated by white spaces that represent +one region in below form.:: @@ -484,9 +485,9 @@ ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one # cd /damon # cat target_ids 42 4242 - # echo "0 1 100 - 0 100 200 - 1 20 40 + # echo "0 1 100 \ + 0 100 200 \ + 1 20 40 \ 1 50 100" > init_regions Note that this sets the initial monitoring target regions only. In case of From 79c3b8f0e50c83a149f61f71486ca55aaafcf51f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:21 +0000 Subject: [PATCH 618/737] mm/damon/core: add a callback for scheme target regions check Patch series "efficiently expose damos action tried regions information". DAMON users can retrieve the monitoring results via 'after_aggregation' callbacks if the user is using the kernel API, or 'damon_aggregated' tracepoint if the user is in the user space. Those are useful if full monitoring results are necessary. However, if the user has interest in only a snapshot of the results for some regions having specific access pattern, the interfaces could be inefficient. For example, some users only want to know which memory regions are not accessed for more than a specific time at the moment. Also, some DAMOS users would want to know exactly to what memory regions the schemes' actions tried to be applied, for a debugging or a tuning. As DAMOS has its internal mechanism for quota and regions prioritization, the users would need to simulate DAMOS' mechanism against the monitoring results. That's unnecessarily complex. 
This patchset implements DAMON kernel API callbacks and a sysfs directory for efficient exposure of the information for the use cases. The new callback will be called for each region just before a DAMOS action is tried on it. The sysfs directory will be called 'tried_regions' and placed under each scheme sysfs directory. Users can write a special keyword, 'update_schemes_tried_regions', to the 'state' file of a kdamond sysfs directory. Then, the DAMON sysfs interface will fill the directory with the information of the regions that the corresponding scheme action was tried to be applied to during the next aggregation interval. Patches Sequence ---------------- The first one (patch 1) implements the callback for the kernel space users. The following two patches (patches 2 and 3) implement the sysfs directories for the information and their sub-directories. Two patches (patches 4 and 5) implementing the special keywords for filling the data into and cleaning up the directories follow. Patch 6 adds a selftest for the new sysfs directory. Finally, two patches (patches 7 and 8) document the new feature in the administrator guide and the ABI document. This patch (of 8): Getting DAMON monitoring results for only a specific access pattern (e.g., getting address ranges of memory that has not been accessed at all for two minutes) can be useful for efficient monitoring of the system. The information can also be helpful for deep level investigation of DAMON-based operation schemes. For that, users need to record (in case of the user space users) or iterate (in case of the kernel space users) the full monitoring results and filter them for the specific access pattern. In case of the DAMOS investigation, users will even need to simulate DAMOS' quota and prioritization mechanisms. It's inefficient and complex. Add a new DAMON callback that will be called before each scheme is applied to each region. DAMON kernel API users will be able to do query-like monitoring results collection, or DAMOS investigation, in an efficient and simple way using it. Commits for providing the capability to the user space users will follow. Link: https://lkml.kernel.org/r/20221101220328.95765-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b2..35630634d7904 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data.
* @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 80d5937fe3373..ceec75b88ef96 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -772,6 +772,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + int err = 0; if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -782,7 +783,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); From ec6b582bbda48c865e1a4df428a0def4f1755b9c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:22 +0000 Subject: [PATCH 619/737] mm/damon/sysfs-schemes: implement schemes/tried_regions directory For efficient and simple query-like DAMON monitoring results readings and deep level investigations of DAMOS, DAMON kernel API (include/linux/damon.h) users can use 'before_damos_apply' DAMON callback. However, DAMON sysfs interface users don't have such option. Add a directory, namely 'tried_regions', under each scheme directory to use it as the interface for the purpose. Note that this commit is implementing only the directory but the data filling. After the data filling change is made, users will be able to signal DAMON to fill the directory with the regions that corresponding scheme has tried to be applied. By setting the access pattern of the scheme, users could do the efficient query-like monitoring. 
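For orientation only (the keyword, directory layout and file names used here are the ones introduced by the later patches of this series), the intended query-like workflow from user space looks roughly like this:

    # assumes kdamonds/0 is already running ('on') with at least one scheme
    kdamond=/sys/kernel/mm/damon/admin/kdamonds/0
    # ask DAMON to fill tried_regions/ for one aggregation interval
    echo update_schemes_tried_regions > "$kdamond/state"
    for region_dir in "$kdamond"/contexts/0/schemes/0/tried_regions/*/
    do
            cat "$region_dir/start" "$region_dir/end" \
                    "$region_dir/nr_accesses" "$region_dir/age"
    done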
Link: https://lkml.kernel.org/r/20221101220328.95765-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9509d5c1e7fce..500759d8b20c7 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,6 +9,36 @@ #include "sysfs-common.h" +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + +static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + /* * schemes/stats directory */ @@ -635,6 +665,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; }; /* This should match with enum damos_action */ @@ -743,6 +774,25 @@ static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + &damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -759,8 +809,14 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_stats(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; return 0; +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -781,6 +837,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + kobject_put(&scheme->tried_regions->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, From 651995f474af6497b6e4c0e314a3d5febd3de27a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:23 +0000 Subject: [PATCH 620/737] mm/damon/sysfs-schemes: implement scheme region directory Implement region directories under 'tried_regions' directory of each scheme DAMON sysfs directory. This directory will provide the address range, the monitored access frequency ('nr_accesses'), and the age of each DAMON region that corresponding DAMON-based operation scheme has tried to be applied. 
Note that this commit doesn't implement the code for filling the data but only the sysfs directory. Link: https://lkml.kernel.org/r/20221101220328.95765-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 123 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 500759d8b20c7..f0b5ad7e721d6 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,18 +9,138 @@ #include "sysfs-common.h" +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + /* * scheme regions directory */ struct damon_sysfs_scheme_regions { struct kobject 
kobj; + struct list_head regions_list; + int nr_regions; }; static struct damon_sysfs_scheme_regions * damon_sysfs_scheme_regions_alloc(void) { - return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } } static void damon_sysfs_scheme_regions_release(struct kobject *kobj) @@ -837,6 +957,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); } From 0bcef185436ded15b96c254aa6e7db19e9544713 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:24 +0000 Subject: [PATCH 621/737] mm/damon/sysfs: implement DAMOS tried regions update command Implement the code for filling the data of 'tried_regions' DAMON sysfs directory. With this commit, DAMON sysfs interface users can write a special keyword, 'update_schemes_tried_regions' to the corresponding 'state' file of the kdamond. Then, DAMON sysfs interface will collect the tried regions information using the 'before_damos_apply()' callback for one aggregation interval and populate scheme region directories with the values. [sj@kernel.org: skip tried regions update if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-2-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 6 +++ mm/damon/sysfs-schemes.c | 80 ++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 57 +++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4626b27844047..634a6e7fca78a 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -44,3 +44,9 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx, void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0b5ad7e721d6..5f14f18bcc49c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1244,3 +1244,83 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. 
+ */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(®ion->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(®ion->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(®ion->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 284daf274b3ed..ffb5a84059d7c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. 
*/ @@ -1011,6 +1016,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_regions", }; /* @@ -1193,6 +1199,16 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; + struct damon_sysfs_kdamond *kdamond; + + /* damon_sysfs_schemes_update_regions_stop() might not yet called */ + kdamond = damon_sysfs_cmd_request.kdamond; + if (kdamond && damon_sysfs_cmd_request.cmd == + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && + ctx == kdamond->damon_ctx) { + damon_sysfs_schemes_update_regions_stop(ctx); + mutex_unlock(&damon_sysfs_lock); + } if (!damon_target_has_pid(ctx)) return; @@ -1225,6 +1241,27 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1277,10 +1314,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!mutex_trylock(&damon_sysfs_lock)) + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) @@ -1292,13 +1331,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; default: break; } /* Mark the request as invalid now. */ damon_sysfs_cmd_request.kdamond = NULL; out: - mutex_unlock(&damon_sysfs_lock); + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: return err; } From 265e1f6ce4d4fa3641d88e87aa084fbbdeece131 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:25 +0000 Subject: [PATCH 622/737] mm/damon/sysfs-schemes: implement DAMOS-tried regions clear command When there are huge number of DAMON regions that specific scheme actions are tried to be applied, directories and files under 'tried_regions' scheme directory could waste some memory. Add another special input keyword ('clear_schemes_tried_regions') for 'state' file of each kdamond sysfs directory that can be used for cleanup of the 'tried_regions' sub-directories. 
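As a small usage sketch (paths as in the earlier patches of this series, not part of this patch), a monitoring script would typically read the per-region files and then release the memory they consume with the new keyword:

    kdamond=/sys/kernel/mm/damon/admin/kdamonds/0
    echo update_schemes_tried_regions > "$kdamond/state"
    # ... read tried_regions/<N>/{start,end,nr_accesses,age} here ...
    echo clear_schemes_tried_regions > "$kdamond/state"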
[sj@kernel.org: skip regions clearing if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 4 ++++ mm/damon/sysfs-schemes.c | 14 +++++++++++++- mm/damon/sysfs.c | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 634a6e7fca78a..604a6cbc3edea 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -50,3 +50,7 @@ int damon_sysfs_schemes_update_regions_start( struct damon_ctx *ctx); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 5f14f18bcc49c..81fc4d27f4e45 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1292,7 +1292,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( +int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx) { @@ -1302,11 +1302,23 @@ int damon_sysfs_schemes_update_regions_start( damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); } + return 0; +} +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ffb5a84059d7c..aeb0beb1da913 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1004,6 +1004,11 @@ enum damon_sysfs_cmd { * regions */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. 
*/ @@ -1017,6 +1022,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "commit", "update_schemes_stats", "update_schemes_tried_regions", + "clear_schemes_tried_regions", }; /* @@ -1262,6 +1268,17 @@ static int damon_sysfs_upd_schemes_regions_stop( return damon_sysfs_schemes_update_regions_stop(ctx); } +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1343,6 +1360,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) damon_sysfs_schemes_regions_updating = false; } break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; default: break; } From 6b7a1386f95b0354d9a7c5a383b786559b8ea5c4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:26 +0000 Subject: [PATCH 623/737] tools/selftets/damon/sysfs: test tried_regions directory existence Add a simple test case for ensuring tried_regions directory existence. Link: https://lkml.kernel.org/r/20221101220328.95765-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 89592c64462f8..db4942383a50f 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -80,6 +80,12 @@ test_range() ensure_file "$range_dir/max" "exist" 600 } +test_tried_regions() +{ + tried_regions_dir=$1 + ensure_dir "$tried_regions_dir" "exist" +} + test_stats() { stats_dir=$1 @@ -138,6 +144,7 @@ test_scheme() test_quotas "$scheme_dir/quotas" test_watermarks "$scheme_dir/watermarks" test_stats "$scheme_dir/stats" + test_tried_regions "$scheme_dir/tried_regions" } test_schemes() From e958f09dca066e06031d6722b52d708a97eb9944 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:27 +0000 Subject: [PATCH 624/737] Docs/admin-guide/mm/damon/usage: document schemes//tried_regions sysfs directory Document 'tried_regions' directory in DAMON sysfs interface usage in the administrator guide. Link: https://lkml.kernel.org/r/20221101220328.95765-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 45 ++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index ab480194f9bac..86323d1eaab41 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -88,6 +88,9 @@ comma (","). :: │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds + │ │ │ │ │ │ │ tried_regions/ + │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age + │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... @@ -125,7 +128,14 @@ in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the user inputs in the sysfs files except ``state`` file again. 
Writing ``update_schemes_stats`` to ``state`` file updates the contents of stats files for each DAMON-based operation scheme of the kdamond. For details of the -stats, please refer to :ref:`stats section `. +stats, please refer to :ref:`stats section `. Writing +``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based +operation scheme action tried regions directory for each DAMON-based operation +scheme of the kdamond. Writing ``clear_schemes_tried_regions`` to ``state`` +file clears the DAMON-based operating scheme action tried regions directory for +each DAMON-based operation scheme of the kdamond. For details of the +DAMON-based operation scheme action tried regions directory, please refer to +:ref:tried_regions section `. If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. @@ -166,6 +176,8 @@ You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and reading from the ``operations`` file. +.. _sysfs_monitoring_attrs: + contexts//monitoring_attrs/ ------------------------------ @@ -255,8 +267,9 @@ to ``N-1``. Each directory represents each DAMON-based operation scheme. schemes// ------------ -In each scheme directory, four directories (``access_pattern``, ``quotas``, -``watermarks``, and ``stats``) and one file (``action``) exist. +In each scheme directory, five directories (``access_pattern``, ``quotas``, +``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``) +exist. The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords @@ -351,6 +364,32 @@ should ask DAMON sysfs interface to updte the content of the files for the stats by writing a special keyword, ``update_schemes_stats`` to the relevant ``kdamonds//state`` file. +.. _sysfs_schemes_tried_regions: + +schemes//tried_regions/ +-------------------------- + +When a special keyword, ``update_schemes_tried_regions``, is written to the +relevant ``kdamonds//state`` file, DAMON creates directories named integer +starting from ``0`` under this directory. Each directory contains files +exposing detailed information about each of the memory region that the +corresponding scheme's ``action`` has tried to be applied under this directory, +during next :ref:`aggregation interval `. The +information includes address range, ``nr_accesses``, , and ``age`` of the +region. + +The directories will be removed when another special keyword, +``clear_schemes_tried_regions``, is written to the relevant +``kdamonds//state`` file. + +tried_regions// +------------------ + +In each region directory, you will find four files (``start``, ``end``, +``nr_accesses``, and ``age``). Reading the files will show the start and end +addresses, ``nr_accesses``, and ``age`` of the region that corresponding +DAMON-based operation scheme ``action`` has tried to be applied. + Example ~~~~~~~ From 245ccf8aab4b4768516e785672f9eef8582dad5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:28 +0000 Subject: [PATCH 625/737] Docs/ABI/damon: document 'schemes//tried_regions' sysfs directory Update DAMON ABI document for the 'tried_regions' directory of DAMON sysfs interface. 
Link: https://lkml.kernel.org/r/20221101220328.95765-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 08b9df3235609..13397b8536926 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -27,6 +27,10 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or makes the kdamond reads the user inputs in the sysfs files except 'state' again. Writing 'update_schemes_stats' to the file updates contents of schemes stats files of the kdamond. + Writing 'update_schemes_tried_regions' to the file updates + contents of 'tried_regions' directory of every scheme directory + of this kdamond. Writing 'clear_schemes_tried_regions' to the + file removes contents of the 'tried_regions' directory. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 @@ -283,3 +287,31 @@ Date: Mar 2022 Contact: SeongJae Park Description: Reading this file returns the number of the exceed events of the scheme's quotas. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//start +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the start address of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//end +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the end address of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//nr_accesses +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the 'nr_accesses' of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//age +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the 'age' of a memory region that + corresponding DAMON-based Operation Scheme's action has tried + to be applied. From 3d822f419747cf6439e7b7eccdb7aeee4bfe4b4e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Nov 2022 16:50:01 +0000 Subject: [PATCH 626/737] selftests/damon: test non-context inputs to rm_contexts file There was a bug[1] that triggered by writing non-context DAMON debugfs file names to the 'rm_contexts' DAMON debugfs file. Add a selftest for the bug to avoid it happen again. 
[1] https://lore.kernel.org/damon/000000000000ede3ac05ec4abf8e@google.com/ Link: https://lkml.kernel.org/r/20221107165001.5717-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../damon/debugfs_rm_non_contexts.sh | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_rm_non_contexts.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index af490acc53485..838a8e49f77b9 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -7,6 +7,7 @@ TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh +TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh TEST_PROGS += reclaim.sh lru_sort.sh diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh new file mode 100644 index 0000000000000..48b7af6b022cb --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test putting non-ctx files/dirs to rm_contexts file +# =================================================== + +dmesg -C + +for file in "$DBGFS/"* +do + echo "$(basename "$f")" > "$DBGFS/rm_contexts" + if dmesg | grep -q BUG + then + dmesg + exit 1 + fi +done From 95df4ffb77d3f3215a1d5e4c5e082d6c7376c809 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sun, 13 Nov 2022 08:38:45 +0800 Subject: [PATCH 627/737] selftests/damon: fix unnecessary compilation warnings When testing overflow and overread, there is no need to keep unnecessary compilation warnings, we should simply ignore them. The motivation for this patch is to eliminate the compilation warning, maybe one day we will compile the kernel with "-Werror -Wall", at which point this compilation warning will turn into a compilation error, we should fix this error in advance. How to reproduce the problem (with gcc-11.3.1): $ make -C tools/testing/selftests/ ... warning: `write' reading 4294967295 bytes from a region of size 1 [-Wstringop-overread] warning: `read' writing 4294967295 bytes into a region of size 25 overflows the destination [-Wstringop-overflow=] "-Wno-stringop-overread" is supported at least in gcc-11.1.0. 
Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d14c547abd484d3540b692bb8048c4a6efe92c8b Link: https://lkml.kernel.org/r/tencent_51C4ACA8CB3895C2D7F35178440283602107@qq.com Signed-off-by: Rong Tao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/huge_count_read_write.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c index ad7a6b4cf3387..a6fe0689f88dc 100644 --- a/tools/testing/selftests/damon/huge_count_read_write.c +++ b/tools/testing/selftests/damon/huge_count_read_write.c @@ -8,6 +8,13 @@ #include #include +#pragma GCC diagnostic push +#if __GNUC__ >= 11 && __GNUC_MINOR__ >= 1 +/* Ignore read(2) overflow and write(2) overread compile warnings */ +#pragma GCC diagnostic ignored "-Wstringop-overread" +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif + void write_read_with_huge_count(char *file) { int filedesc = open(file, O_RDWR); @@ -27,6 +34,8 @@ void write_read_with_huge_count(char *file) close(filedesc); } +#pragma GCC diagnostic pop + int main(int argc, char *argv[]) { if (argc != 2) { From 448ee43c509d0649c950c84e5eb01bb760ef4050 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 1 Dec 2022 17:08:34 +0000 Subject: [PATCH 628/737] selftests/damon: test removed scheme sysfs dir access bug A DAMON sysfs user could start DAMON with a scheme, remove the sysfs directory for the scheme, and then ask stats or schemes tried regions update. The related logic were not aware of the already removed directory situation, so it was able to results in invalid memory accesses. The fix has made with commit 8468b486612c ("mm/damon/sysfs-schemes: skip stats update if the scheme directory is removed"), though. Add a selftest to prevent such kinds of bugs from being introduced again. Link: https://lkml.kernel.org/r/20221201170834.62823-1-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- .../damon/sysfs_update_removed_scheme_dir.sh | 58 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 838a8e49f77b9..b71247ba71969 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -8,7 +8,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh -TEST_PROGS += sysfs.sh +TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh new file mode 100644 index 0000000000000..ade35576e7487 --- /dev/null +++ b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_sysfs="/sys/kernel/mm/damon/admin" +if [ ! 
-d "$damon_sysfs" ] +then + echo "damon sysfs not found" + exit $ksft_skip +fi + +# clear log +dmesg -C + +# start DAMON with a scheme +echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" +echo "vaddr" > "$damon_sysfs/kdamonds/0/contexts/0/operations" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/targets/nr_targets" +echo $$ > "$damon_sysfs/kdamonds/0/contexts/0/targets/0/pid_target" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes" +scheme_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0" +echo 4096000 > "$scheme_dir/access_pattern/sz/max" +echo 20 > "$scheme_dir/access_pattern/nr_accesses/max" +echo 1024 > "$scheme_dir/access_pattern/age/max" +echo "on" > "$damon_sysfs/kdamonds/0/state" +sleep 0.3 + +# remove scheme sysfs dir +echo 0 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes" + +# try to update stat of already removed scheme sysfs dir +echo "update_schemes_stats" > "$damon_sysfs/kdamonds/0/state" +if dmesg | grep -q BUG +then + echo "update_schemes_stats triggers a kernel bug" + dmesg + exit 1 +fi + +# try to update tried regions of already removed scheme sysfs dir +echo "update_schemes_tried_regions" > "$damon_sysfs/kdamonds/0/state" +if dmesg | grep -q BUG +then + echo "update_schemes_tried_regions triggers a kernel bug" + dmesg + exit 1 +fi + +echo "off" > "$damon_sysfs/kdamonds/0/state" From faf8203a218a9c8a08f04abd298cf608f2018e46 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 10 Feb 2022 11:07:55 +0530 Subject: [PATCH 629/737] nvme: add vectored-io support for user-passthrough Add a new NVME_IOCTL_IO64_CMD_VEC ioctl that works like the existing NVME_IOCTL_IO64_CMD ioctl except that it takes and array of iovecs and thus supports vectored I/O. - cmd.addr is base address of user iovec array - cmd.vec_cnt is count of iovec array elements This patch does not include vectored-variant for admin-commands as most of them are light on buffers and likely to have low invocation frequency. 
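A rough user-space sketch of the new ioctl (illustrative only, not part of this patch; it assumes a namespace block device at /dev/nvme0n1, namespace id 1, and a 4KiB LBA format). cmd.addr carries the base address of the iovec array and cmd.vec_cnt its element count:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <sys/uio.h>
    #include <unistd.h>
    #include <linux/nvme_ioctl.h>

    int main(void)
    {
            struct nvme_passthru_cmd64 cmd = { 0 };
            struct iovec iov[2];
            int fd = open("/dev/nvme0n1", O_RDONLY);

            if (fd < 0)
                    return 1;
            iov[0].iov_base = aligned_alloc(4096, 4096);
            iov[0].iov_len = 4096;
            iov[1].iov_base = aligned_alloc(4096, 4096);
            iov[1].iov_len = 4096;

            cmd.opcode = 0x02;                /* NVMe I/O read */
            cmd.nsid = 1;                     /* assumed namespace id */
            cmd.addr = (__u64)(uintptr_t)iov; /* base of user iovec array */
            cmd.vec_cnt = 2;                  /* count of iovec elements */
            cmd.cdw10 = 0;                    /* starting LBA, lower 32 bits */
            cmd.cdw12 = 1;                    /* zero-based LBA count: 2 LBAs */

            if (ioctl(fd, NVME_IOCTL_IO64_CMD_VEC, &cmd))
                    perror("NVME_IOCTL_IO64_CMD_VEC");
            close(fd);
            return 0;
    }

As in the kernel side of the patch, up to UIO_FASTIOV iovecs are handled on the stack before import_iovec() falls back to an allocation.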
Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig [Resolved contextual conflict by moving changes from ioctl.c to core.c in v5.10] Signed-off-by: Shaoying Xu --- drivers/nvme/host/core.c | 36 ++++++++++++++++++++++++--------- include/uapi/linux/nvme_ioctl.h | 6 +++++- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07c41a149328a..1bb2bb840ffe3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1116,7 +1116,7 @@ EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned int timeout, bool vec) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -1135,8 +1135,22 @@ static int nvme_submit_user_cmd(struct request_queue *q, nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + if (!vec) + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); + else { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rq_data_dir(req), ubuffer, bufflen, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, + GFP_KERNEL); + kfree(iov); + } if (ret) goto out; bio = req->bio; @@ -1599,7 +1613,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return nvme_submit_user_cmd(ns->queue, &c, nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, + false); } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1637,7 +1652,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); + 0, &result, timeout, false); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -1648,7 +1663,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) + struct nvme_passthru_cmd64 __user *ucmd, bool vec) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -1681,7 +1696,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); + 0, &cmd.result, timeout, vec); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -1746,7 +1761,7 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, ret = nvme_user_cmd(ctrl, NULL, argp); break; case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); + ret = nvme_user_cmd64(ctrl, NULL, argp, false); break; default: ret = sed_ioctl(ctrl->opal_dev, cmd, argp); @@ -1788,7 +1803,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, ret = nvme_submit_io(ns, argp); break; case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); + ret = nvme_user_cmd64(ns->ctrl, ns, argp, false); + break; + case NVME_IOCTL_IO64_CMD_VEC: + ret = nvme_user_cmd64(ns->ctrl, ns, argp, true); break; default: if (ns->ndev) @@ -3336,7 +3354,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index d99b5a7726980..b2e43185e3b55 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -55,7 +55,10 @@ struct nvme_passthru_cmd64 { __u64 metadata; __u64 addr; __u32 metadata_len; - __u32 data_len; + union { + __u32 data_len; /* for non-vectored io */ + __u32 vec_cnt; /* for vectored io */ + }; __u32 cdw10; __u32 cdw11; __u32 cdw12; @@ -78,5 +81,6 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_RESCAN _IO('N', 0x46) #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ From c791c716864f49eb0ae867fa96d951b62d8c4a21 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 9 Feb 2023 09:22:24 -0600 Subject: [PATCH 630/737] x86/speculation: Identify processors vulnerable to SMT RSB predictions Certain AMD processors are vulnerable to a cross-thread return address predictions bug. When running in SMT mode and one of the sibling threads transitions out of C0 state, the other sibling thread could use return target predictions from the sibling thread that transitioned out of C0. The Spectre v2 mitigations cover the Linux kernel, as it fills the RSB when context switching to the idle thread. However, KVM allows a VMM to prevent exiting guest mode when transitioning out of C0. A guest could act maliciously in this situation, so create a new x86 BUG that can be used to detect if the processor is vulnerable. 
[ Hailmo: resolved conflicts when rebasing onto 5.10.190 which introduced GDS and SRSO ] Reviewed-by: Borislav Petkov (AMD) Signed-off-by: Tom Lendacky Message-Id: <91cec885656ca1fcd4f0185ce403a53dd9edecb7.1675956146.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini [Resolved contextual conflicts due to unavailable identification of X86_BUG_EIBRS_PBRSB in v5.10] Signed-off-by: Shaoying Xu --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5c9488cd662c6..bebeea9e771e3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -445,6 +445,7 @@ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ +#define X86_BUG_SMT_RSB X86_BUG(28) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ #define X86_BUG_GDS X86_BUG(29) /* CPU is affected by Gather Data Sampling */ /* BUG word 2 */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5cf7db6dc2f01..13e217b01cbca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1169,8 +1169,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SRSO), - VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), VULNBL_AMD(0x19, SRSO), {} }; @@ -1284,6 +1284,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); + /* * Check if CPU is vulnerable to GDS. If running in a virtual machine on * an affected processor, the VMM may have disabled the use of GATHER by From 6ab64233e41f943432e498f5acb8759214435fba Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 9 Feb 2023 09:22:25 -0600 Subject: [PATCH 631/737] KVM: x86: Mitigate the cross-thread return address predictions bug By default, KVM/SVM will intercept attempts by the guest to transition out of C0. However, the KVM_CAP_X86_DISABLE_EXITS capability can be used by a VMM to change this behavior. To mitigate the cross-thread return address predictions bug (X86_BUG_SMT_RSB), a VMM must not be allowed to override the default behavior to intercept C0 transitions. Use a module parameter to control the mitigation on processors that are vulnerable to X86_BUG_SMT_RSB. If the processor is vulnerable to the X86_BUG_SMT_RSB bug and the module parameter is set to mitigate the bug, KVM will not allow the disabling of the HLT, MWAIT and CSTATE exits. 
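For context, a hedged sketch of the VMM side of this interface (not taken from this patch): the capability check reports which exit-disable bits are permitted, so a well-behaved VMM requests only what is offered. On an affected processor with mitigate_smt_rsb set, that is only the PAUSE bit:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_DISABLE_EXITS };
            int kvm, vm, allowed;

            kvm = open("/dev/kvm", O_RDWR);
            if (kvm < 0)
                    return 1;
            vm = ioctl(kvm, KVM_CREATE_VM, 0);
            if (vm < 0)
                    return 1;

            /* Which exit-disable bits is this kernel willing to honor? */
            allowed = ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);
            printf("permitted exit-disable bits: 0x%x\n", allowed);

            /* Request only what was offered; HLT/MWAIT/CSTATE are not
             * offered when mitigate_smt_rsb is in effect. */
            cap.args[0] = allowed & (KVM_X86_DISABLE_EXITS_HLT |
                                     KVM_X86_DISABLE_EXITS_PAUSE);
            if (ioctl(vm, KVM_ENABLE_CAP, &cap))
                    perror("KVM_ENABLE_CAP");
            return 0;
    }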
Signed-off-by: Tom Lendacky Message-Id: <4019348b5e07148eb4d593380a5f6713b93c9a16.1675956146.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini [Resolved contextual conflict due to unavailable enable/disable PMU virtualization in v5.10] Signed-off-by: Shaoying Xu --- arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f15ae110f42a..f6149ac506af8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -163,6 +163,10 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +/* Enable/disable SMT_RSB bug mitigation */ +bool __read_mostly mitigate_smt_rsb; +module_param(mitigate_smt_rsb, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -3950,10 +3954,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE; - if(kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; + r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, @@ -5480,15 +5489,26 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) break; - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; + +#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ + "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." + + if (!mitigate_smt_rsb) { + if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && + (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + pr_warn_once(SMT_RSB_MSG); + + if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && + kvm_can_mwait_in_guest()) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; + } + r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -11707,6 +11727,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); static int __init kvm_x86_init(void) { kvm_mmu_x86_module_init(); + mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; } module_init(kvm_x86_init); From c542bf988b4f7690723e386b08da00d4ed7b7720 Mon Sep 17 00:00:00 2001 From: "andrew.yang" Date: Wed, 22 Feb 2023 14:42:20 +0800 Subject: [PATCH 632/737] mm/damon/paddr: fix missing folio_put() commit 3f98c9a62c338bbe06a215c9491e6166ea39bf82 upstream. damon_get_folio() would always increase folio _refcount and folio_isolate_lru() would increase folio _refcount if the folio's lru flag is set. If an unevictable folio isolated successfully, there will be two more _refcount. 
The one from folio_isolate_lru() will be decreased in folio_puback_lru(), but the other one from damon_get_folio() will be left behind. This causes a pin page. Whatever the case, the _refcount from damon_get_folio() should be decreased. Link: https://lkml.kernel.org/r/20230222064223.6735-1-andrew.yang@mediatek.com Fixes: 57223ac29584 ("mm/damon/paddr: support the pageout scheme") Signed-off-by: andrew.yang Reviewed-by: SeongJae Park Cc: [5.16.x] Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 2f0196bbf0b22..5945e1e379382 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -223,12 +223,11 @@ static unsigned long damon_pa_pageout(struct damon_region *r) put_page(page); continue; } - if (PageUnevictable(page)) { + if (PageUnevictable(page)) putback_lru_page(page); - } else { + else list_add(&page->lru, &page_list); - put_page(page); - } + put_page(page); } applied = reclaim_pages(&page_list); cond_resched(); From da98252f7a207b6e462acfa0acedc1ae390df7e7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 3 Mar 2023 07:51:43 +0000 Subject: [PATCH 633/737] udp: Fix memleaks of sk and zerocopy skbs with TX timestamp. syzkaller reported [0] memory leaks of an UDP socket and ZEROCOPY skbs. We can reproduce the problem with these sequences: sk = socket(AF_INET, SOCK_DGRAM, 0) sk.setsockopt(SOL_SOCKET, SO_TIMESTAMPING, SOF_TIMESTAMPING_TX_SOFTWARE) sk.setsockopt(SOL_SOCKET, SO_ZEROCOPY, 1) sk.sendto(b'', MSG_ZEROCOPY, ('127.0.0.1', 53)) sk.close() sendmsg() calls msg_zerocopy_alloc(), which allocates a skb, sets skb->cb->ubuf.refcnt to 1, and calls sock_hold(). Here, struct ubuf_info_msgzc indirectly holds a refcnt of the socket. When the skb is sent, __skb_tstamp_tx() clones it and puts the clone into the socket's error queue with the TX timestamp. When the original skb is received locally, skb_copy_ubufs() calls skb_unclone(), and pskb_expand_head() increments skb->cb->ubuf.refcnt. This additional count is decremented while freeing the skb, but struct ubuf_info_msgzc still has a refcnt, so __msg_zerocopy_callback() is not called. The last refcnt is not released unless we retrieve the TX timestamped skb by recvmsg(). When we close() the socket holding such skb, we never call sock_put() and leak the count. To avoid this problem, we must call skb_queue_purge() while we close() UDP sockets. Note that TCP does not have this problem because skb_queue_purge() is called by sk_stream_kill_queues() during close(). [0]: BUG: memory leak unreferenced object 0xffff88800c6d2d00 (size 1152): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 cd af e8 81 00 00 00 00 ................ 02 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ 
backtrace: [<0000000055636812>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:2024 [<0000000054d77b7a>] sk_alloc+0x3b/0x800 net/core/sock.c:2083 [<0000000066f3c7e0>] inet_create net/ipv4/af_inet.c:319 [inline] [<0000000066f3c7e0>] inet_create+0x31e/0xe40 net/ipv4/af_inet.c:245 [<000000009b83af97>] __sock_create+0x2ab/0x550 net/socket.c:1515 [<00000000b9b11231>] sock_create net/socket.c:1566 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1603 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1588 [inline] [<00000000b9b11231>] __sys_socket+0x138/0x250 net/socket.c:1636 [<000000004fb45142>] __do_sys_socket net/socket.c:1649 [inline] [<000000004fb45142>] __se_sys_socket net/socket.c:1647 [inline] [<000000004fb45142>] __x64_sys_socket+0x73/0xb0 net/socket.c:1647 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd BUG: memory leak unreferenced object 0xffff888017633a00 (size 240): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 2d 6d 0c 80 88 ff ff .........-m..... backtrace: [<000000002b1c4368>] __alloc_skb+0x229/0x320 net/core/skbuff.c:497 [<00000000143579a6>] alloc_skb include/linux/skbuff.h:1265 [inline] [<00000000143579a6>] sock_omalloc+0xaa/0x190 net/core/sock.c:2596 [<00000000be626478>] msg_zerocopy_alloc net/core/skbuff.c:1294 [inline] [<00000000be626478>] msg_zerocopy_realloc+0x1ce/0x7f0 net/core/skbuff.c:1370 [<00000000cbfc9870>] __ip_append_data+0x2adf/0x3b30 net/ipv4/ip_output.c:1037 [<0000000089869146>] ip_make_skb+0x26c/0x2e0 net/ipv4/ip_output.c:1652 [<00000000098015c2>] udp_sendmsg+0x1bac/0x2390 net/ipv4/udp.c:1253 [<0000000045e0e95e>] inet_sendmsg+0x10a/0x150 net/ipv4/af_inet.c:819 [<000000008d31bfde>] sock_sendmsg_nosec net/socket.c:714 [inline] [<000000008d31bfde>] sock_sendmsg+0x141/0x190 net/socket.c:734 [<0000000021e21aa4>] __sys_sendto+0x243/0x360 net/socket.c:2117 [<00000000ac0af00c>] __do_sys_sendto net/socket.c:2129 [inline] [<00000000ac0af00c>] __se_sys_sendto net/socket.c:2125 [inline] [<00000000ac0af00c>] __x64_sys_sendto+0xe1/0x1c0 net/socket.c:2125 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: b5947e5d1e71 ("udp: msg_zerocopy") Signed-off-by: Kuniyuki Iwashima --- include/net/udp.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index e2550a4547a70..dcc2230e30a28 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -207,6 +207,11 @@ void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { + /* A zerocopy skb has a refcnt of sk and may be + * put into sk_error_queue with TX timestamp + */ + skb_queue_purge(&sk->sk_error_queue); + sk_common_release(sk); } From 8c2e2637b5164e6f3ad94c084b72e6b13417be8e Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Sat, 25 Mar 2023 00:15:00 +0000 Subject: [PATCH 634/737] Update out-of-tree smartpqi driver to 2.1.20-035 This change was from Microchip HBA driver download page: https://storage.microsemi.com/en-us/downloads/linux_source/linux_source_code/\ productid=aha-1100-24i&dn=microsemi+adaptec+hba+1100-24i.php Signed-off-by: Shaoying Xu --- 
drivers/amazon/scsi/smartpqi/Makefile | 1 + drivers/amazon/scsi/smartpqi/smartpqi.h | 42 +- drivers/amazon/scsi/smartpqi/smartpqi_init.c | 1128 +++++++++++------ .../scsi/smartpqi/smartpqi_kernel_compat.c | 80 +- .../scsi/smartpqi/smartpqi_kernel_compat.h | 175 ++- drivers/amazon/scsi/smartpqi/smartpqi_sis.c | 31 +- drivers/amazon/scsi/smartpqi/smartpqi_sis.h | 3 + 7 files changed, 1011 insertions(+), 449 deletions(-) diff --git a/drivers/amazon/scsi/smartpqi/Makefile b/drivers/amazon/scsi/smartpqi/Makefile index 4b7ba538fb1fa..64a48e7248a99 100644 --- a/drivers/amazon/scsi/smartpqi/Makefile +++ b/drivers/amazon/scsi/smartpqi/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_AMAZON_SCSI_SMARTPQI) += smartpqi.o smartpqi-objs := smartpqi_init.o smartpqi_sis.o smartpqi_sas_transport.o smartpqi_kernel_compat.o EXTRA_CFLAGS += -DKCLASS5D +EXTRA_CFLAGS += -DKFEATURE_HAS_SCSI_CMD_PRIV -DKFEATURE_HAS_HOST_TAGSET_SUPPORT diff --git a/drivers/amazon/scsi/smartpqi/smartpqi.h b/drivers/amazon/scsi/smartpqi/smartpqi.h index 7582041c5dda2..e120fd2e1b0a8 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi.h +++ b/drivers/amazon/scsi/smartpqi/smartpqi.h @@ -301,7 +301,8 @@ struct pqi_raid_path_request { u8 additional_cdb_bytes_usage : 3; u8 reserved5 : 3; u8 cdb[16]; - u8 reserved6[12]; + u8 reserved6[11]; + u8 ml_device_lun_number; __le32 timeout; struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; }; @@ -475,7 +476,8 @@ struct pqi_task_management_request { struct pqi_iu_header header; __le16 request_id; __le16 nexus_id; - u8 reserved[2]; + u8 reserved; + u8 ml_device_lun_number; __le16 timeout; u8 lun_number[8]; __le16 protocol_specific; @@ -716,6 +718,7 @@ typedef u32 pqi_index_t; #define SOP_TMF_COMPLETE 0x0 #define SOP_TMF_REJECTED 0x4 #define SOP_TMF_FUNCTION_SUCCEEDED 0x8 +#define SOP_RC_INCORRECT_LOGICAL_UNIT 0x9 /* additional CDB bytes usage field codes */ #define SOP_ADDITIONAL_CDB_BYTES_0 0 /* 16-byte CDB */ @@ -875,7 +878,8 @@ struct pqi_config_table_firmware_features { #define PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN 16 #define PQI_FIRMWARE_FEATURE_FW_TRIAGE 17 #define PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5 18 -#define PQI_FIRMWARE_FEATURE_MAXIMUM 18 +#define PQI_FIRMWARE_FEATURE_MULTI_LUN_DEVICE_SUPPORT 21 +#define PQI_FIRMWARE_FEATURE_MAXIMUM 21 struct pqi_config_table_debug { struct pqi_config_table_section_header header; @@ -937,7 +941,8 @@ union pqi_reset_register { #define PQI_MAX_TRANSFER_SIZE_KDUMP (512 * 1024U) #endif -#define RAID_MAP_MAX_ENTRIES 1024 +#define RAID_MAP_MAX_ENTRIES 1024 +#define RAID_MAP_MAX_DATA_DISKS_PER_ROW 128 #define PQI_PHYSICAL_DEVICE_BUS 0 #define PQI_RAID_VOLUME_BUS 1 @@ -961,7 +966,6 @@ struct report_lun_header { #define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_4 0x4 #define CISS_REPORT_PHYS_FLAG_EXTENDED_FORMAT_MASK 0xf - struct report_log_lun { u8 lunid[8]; u8 volume_id[16]; @@ -1100,6 +1104,8 @@ struct pqi_stream_data { u32 last_accessed; }; +#define PQI_MAX_LUNS_PER_DEVICE 256 + struct pqi_scsi_dev { int devtype; /* as reported by INQUIRY commmand */ u8 device_type; /* as reported by */ @@ -1143,9 +1149,10 @@ struct pqi_scsi_dev { u8 phy_id; u8 ncq_prio_enable; u8 ncq_prio_support; + u8 lun_count; bool raid_bypass_configured; /* RAID bypass configured */ bool raid_bypass_enabled; /* RAID bypass enabled */ - u32 next_bypass_group; + u32 next_bypass_group[RAID_MAP_MAX_DATA_DISKS_PER_ROW]; struct raid_map *raid_map; /* RAID bypass map */ u32 max_transfer_encrypted; @@ -1158,9 +1165,8 @@ struct pqi_scsi_dev { struct list_head 
delete_list_entry; struct pqi_stream_data stream_data[NUM_STREAMS_PER_LUN]; - atomic_t scsi_cmds_outstanding; + atomic_t scsi_cmds_outstanding[PQI_MAX_LUNS_PER_DEVICE]; atomic_t raid_bypass_cnt; - u8 page_83_identifier[16]; }; /* VPD inquiry pages */ @@ -1282,6 +1288,12 @@ struct pqi_event { #define PQI_CTRL_PRODUCT_REVISION_A 0 #define PQI_CTRL_PRODUCT_REVISION_B 1 +enum pqi_ctrl_removal_state { + PQI_CTRL_PRESENT = 0, + PQI_CTRL_GRACEFUL_REMOVAL, + PQI_CTRL_SURPRISE_REMOVAL +}; + struct pqi_ctrl_info { unsigned int ctrl_id; struct pci_dev *pci_dev; @@ -1314,7 +1326,6 @@ struct pqi_ctrl_info { dma_addr_t error_buffer_dma_handle; size_t sg_chain_buffer_length; unsigned int num_queue_groups; - u16 max_hw_queue_index; u16 num_elements_per_iq; u16 num_elements_per_oq; u16 max_inbound_iu_length_per_firmware; @@ -1344,6 +1355,7 @@ struct pqi_ctrl_info { bool controller_online; bool block_requests; bool scan_blocked; + u8 logical_volume_rescan_needed : 1; u8 inbound_spanning_supported : 1; u8 outbound_spanning_supported : 1; u8 pqi_mode_enabled : 1; @@ -1351,15 +1363,15 @@ struct pqi_ctrl_info { u8 soft_reset_handshake_supported : 1; u8 raid_iu_timeout_supported : 1; u8 tmf_iu_timeout_supported : 1; - u8 unique_wwid_in_report_phys_lun_supported : 1; u8 firmware_triage_supported : 1; u8 rpl_extended_format_4_5_supported : 1; + u8 multi_lun_device_supported : 1; u8 enable_r1_writes : 1; u8 enable_r5_writes : 1; u8 enable_r6_writes : 1; u8 lv_drive_type_mix_valid : 1; u8 enable_stream_detection : 1; - + u8 disable_managed_interrupts : 1; u8 ciss_report_log_flags; u32 max_transfer_encrypted_sas_sata; u32 max_transfer_encrypted_nvme; @@ -1377,8 +1389,9 @@ struct pqi_ctrl_info { u64 sas_address; struct pqi_io_request *io_request_pool; - u16 next_io_request_slot; - +#if !defined(KFEATURE_HAS_HOST_TAGSET_SUPPORT) + u16 per_cpu_factor; +#endif struct pqi_event events[PQI_NUM_SUPPORTED_EVENTS]; struct work_struct event_work; @@ -1403,8 +1416,11 @@ struct pqi_ctrl_info { struct work_struct ofa_quiesce_work; u32 ofa_bytes_requested; u16 ofa_cancel_reason; + enum pqi_ctrl_removal_state ctrl_removal_state; +#if !defined(KFEATURE_HAS_HOST_TAGSET_SUPPORT) atomic_t total_scmds_outstanding; +#endif }; enum pqi_ctrl_mode { diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_init.c b/drivers/amazon/scsi/smartpqi/smartpqi_init.c index db6e6f50fd745..1358f722e6811 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_init.c +++ b/drivers/amazon/scsi/smartpqi/smartpqi_init.c @@ -42,11 +42,11 @@ #define BUILD_TIMESTAMP #endif -#define DRIVER_VERSION "2.1.14-030" +#define DRIVER_VERSION "2.1.20-035" #define DRIVER_MAJOR 2 #define DRIVER_MINOR 1 -#define DRIVER_RELEASE 14 -#define DRIVER_REVISION 24 +#define DRIVER_RELEASE 20 +#define DRIVER_REVISION 29 #define DRIVER_NAME "Microchip SmartPQI Driver (v" \ DRIVER_VERSION BUILD_TIMESTAMP ")" @@ -61,14 +61,15 @@ MODULE_AUTHOR("Microchip"); #if TORTUGA MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version " - DRIVER_VERSION " (d-108cd0d/s-8601640)" " (d147/s325)"); + DRIVER_VERSION " (d-eaf4713/s-2aee658) (d147/s325)"); #else MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version " - DRIVER_VERSION " (d-108cd0d/s-8601640)"); + DRIVER_VERSION " (d-eaf4713/s-2aee658)"); #endif MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL"); +static void pqi_verify_structures(void); static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info, enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason); static void pqi_ctrl_offline_worker(struct work_struct 
*work); @@ -99,7 +100,8 @@ static void pqi_ofa_setup_host_buffer(struct pqi_ctrl_info *ctrl_info); static void pqi_ofa_free_host_buffer(struct pqi_ctrl_info *ctrl_info); static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info); static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device, unsigned long timeout_msecs); + struct pqi_scsi_dev *device, u8 lun, unsigned long timeout_msecs); +static void pqi_fail_all_outstanding_requests(struct pqi_ctrl_info *ctrl_info); /* for flags argument to pqi_submit_raid_request_synchronous() */ #define PQI_SYNC_FLAGS_INTERRUPTABLE 0x1 @@ -184,6 +186,18 @@ module_param_named(limit_xfer_size_to_1MB, pqi_limit_xfer_to_1MB, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(limit_xfer_size_to_1MB, "Limit max transfer size to 1MB."); +static int pqi_disable_managed_interrupts; +module_param_named(disable_managed_interrupts, + pqi_disable_managed_interrupts, int, 0644); +MODULE_PARM_DESC(disable_managed_interrupts, + "Disable the kernel automatically assigning SMP affinity to IRQs."); + +static unsigned int pqi_ctrl_ready_timeout_secs; +module_param_named(ctrl_ready_timeout, + pqi_ctrl_ready_timeout_secs, uint, 0644); +MODULE_PARM_DESC(ctrl_ready_timeout, + "Timeout in seconds for driver to wait for controller ready."); + static char *raid_levels[] = { "RAID-0", "RAID-4", @@ -383,7 +397,7 @@ static void pqi_wait_if_ctrl_blocked(struct pqi_ctrl_info *ctrl_info) atomic_dec(&ctrl_info->num_blocked_threads); } -#define PQI_QUIESE_WARNING_TIMEOUT_SECS (10 * HZ) +#define PQI_QUIESCE_WARNING_TIMEOUT_SECS 10 static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info *ctrl_info) { @@ -393,7 +407,7 @@ static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info *ctrl_info) displayed_warning = false; start_jiffies = jiffies; - warning_timeout = PQI_QUIESE_WARNING_TIMEOUT_SECS + start_jiffies; + warning_timeout = (PQI_QUIESCE_WARNING_TIMEOUT_SECS * HZ) + start_jiffies; while (atomic_read(&ctrl_info->num_busy_threads) > atomic_read(&ctrl_info->num_blocked_threads)) { @@ -402,7 +416,7 @@ static inline void pqi_ctrl_wait_until_quiesced(struct pqi_ctrl_info *ctrl_info) "waiting %u seconds for driver activity to quiesce\n", jiffies_to_msecs(jiffies - start_jiffies) / 1000); displayed_warning = true; - warning_timeout = PQI_QUIESE_WARNING_TIMEOUT_SECS + jiffies; + warning_timeout = (PQI_QUIESCE_WARNING_TIMEOUT_SECS * HZ) + jiffies; } msleep(1); } @@ -491,11 +505,6 @@ static inline void pqi_cancel_rescan_worker(struct pqi_ctrl_info *ctrl_info) cancel_delayed_work_sync(&ctrl_info->rescan_work); } -static inline void pqi_cancel_event_worker(struct pqi_ctrl_info *ctrl_info) -{ - cancel_work_sync(&ctrl_info->event_work); -} - static inline u32 pqi_read_heartbeat_counter(struct pqi_ctrl_info *ctrl_info) { if (!ctrl_info->heartbeat_counter) @@ -678,22 +687,11 @@ static inline void pqi_reinit_io_request(struct pqi_io_request *io_request) io_request->raid_bypass = false; } -static struct pqi_io_request *pqi_alloc_io_request( - struct pqi_ctrl_info *ctrl_info) +static inline struct pqi_io_request *pqi_alloc_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) { struct pqi_io_request *io_request; - u16 i = ctrl_info->next_io_request_slot; /* benignly racy */ - - while (1) { - io_request = &ctrl_info->io_request_pool[i]; - if (atomic_inc_return(&io_request->refcount) == 1) - break; - atomic_dec(&io_request->refcount); - i = (i + 1) % ctrl_info->max_io_slots; - } - /* benignly racy */ - 
ctrl_info->next_io_request_slot = (i + 1) % ctrl_info->max_io_slots; + io_request = pqi_get_io_request(ctrl_info, scmd); pqi_reinit_io_request(io_request); @@ -718,8 +716,7 @@ static int pqi_send_scsi_raid_request(struct pqi_ctrl_info *ctrl_info, u8 cmd, if (rc) return rc; - rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, - error_info); + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, error_info); pqi_pci_unmap(ctrl_info->pci_dev, request.sg_descriptors, 1, dir); @@ -1205,8 +1202,8 @@ static inline int pqi_report_phys_luns(struct pqi_ctrl_info *ctrl_info, void **b for (i = 0; i < num_physicals; i++) { memcpy(&rpl_16byte_wwid_list->lun_entries[i].lunid, &rpl_8byte_wwid_list->lun_entries[i].lunid, sizeof(rpl_8byte_wwid_list->lun_entries[i].lunid)); - memset(&rpl_16byte_wwid_list->lun_entries[i].wwid, 0, 8); - memcpy(&rpl_16byte_wwid_list->lun_entries[i].wwid[8], &rpl_8byte_wwid_list->lun_entries[i].wwid, sizeof(rpl_8byte_wwid_list->lun_entries[i].wwid)); + memcpy(&rpl_16byte_wwid_list->lun_entries[i].wwid[0], &rpl_8byte_wwid_list->lun_entries[i].wwid, sizeof(rpl_8byte_wwid_list->lun_entries[i].wwid)); + memset(&rpl_16byte_wwid_list->lun_entries[i].wwid[8], 0, 8); rpl_16byte_wwid_list->lun_entries[i].device_type = rpl_8byte_wwid_list->lun_entries[i].device_type; rpl_16byte_wwid_list->lun_entries[i].device_flags = rpl_8byte_wwid_list->lun_entries[i].device_flags; rpl_16byte_wwid_list->lun_entries[i].lun_count = rpl_8byte_wwid_list->lun_entries[i].lun_count; @@ -1596,10 +1593,7 @@ static int pqi_get_physical_device_info(struct pqi_ctrl_info *ctrl_info, &id_phys->alternate_paths_phys_connector, sizeof(device->phys_connector)); device->bay = id_phys->phys_bay_in_box; - - memcpy(&device->page_83_identifier, &id_phys->page_83_identifier, - sizeof(device->page_83_identifier)); - + device->lun_count = id_phys->multi_lun_device_lun_count; if ((id_phys->even_more_flags & PQI_DEVICE_PHY_MAP_SUPPORTED) && id_phys->phy_count) device->phy_id = @@ -1734,7 +1728,7 @@ static bool pqi_keep_device_offline(struct pqi_ctrl_info *ctrl_info, return offline; } -static int pqi_get_device_info(struct pqi_ctrl_info *ctrl_info, +static int pqi_get_device_info_phys_logical(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device, struct bmic_identify_physical_device *id_phys) { @@ -1751,6 +1745,20 @@ static int pqi_get_device_info(struct pqi_ctrl_info *ctrl_info, return rc; } +static int pqi_get_device_info(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *device, + struct bmic_identify_physical_device *id_phys) +{ + int rc; + + rc = pqi_get_device_info_phys_logical(ctrl_info, device, id_phys); + + if (rc == 0 && device->lun_count == 0) + device->lun_count = 1; + + return rc; +} + static void pqi_show_volume_status(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) { @@ -1883,15 +1891,18 @@ static int pqi_add_device(struct pqi_ctrl_info *ctrl_info, static inline void pqi_remove_device(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) { int rc; + int lun; - rc = pqi_device_wait_for_pending_io(ctrl_info, device, - PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS); - if (rc) - dev_err(&ctrl_info->pci_dev->dev, - "scsi %d:%d:%d:%d removing device with %d outstanding command(s)\n", - ctrl_info->scsi_host->host_no, device->bus, - device->target, device->lun, - atomic_read(&device->scsi_cmds_outstanding)); + for (lun = 0; lun < device->lun_count; lun++) { + rc = pqi_device_wait_for_pending_io(ctrl_info, device, lun, + 
PQI_REMOVE_DEVICE_PENDING_IO_TIMEOUT_MSECS); + if (rc) + dev_err(&ctrl_info->pci_dev->dev, + "scsi %d:%d:%d:%d removing device with %d outstanding command(s)\n", + ctrl_info->scsi_host->host_no, device->bus, + device->target, lun, + atomic_read(&device->scsi_cmds_outstanding[lun])); + } if (pqi_is_logical_device(device)) scsi_remove_device(device->sdev); @@ -1991,7 +2002,7 @@ static void pqi_dev_info(struct pqi_ctrl_info *ctrl_info, else count += scnprintf(buffer + count, PQI_DEV_INFO_BUFFER_LENGTH - count, - " %016llx%016llx", + " %016llx%016llx", get_unaligned_be64(&device->wwid[0]), get_unaligned_be64(&device->wwid[8])); @@ -2023,10 +2034,27 @@ static void pqi_dev_info(struct pqi_ctrl_info *ctrl_info, dev_info(&ctrl_info->pci_dev->dev, "%s %s\n", action, buffer); } +static bool pqi_raid_maps_equal(struct raid_map *raid_map1, struct raid_map *raid_map2) +{ + u32 raid_map1_size; + u32 raid_map2_size; + + if (raid_map1 == NULL || raid_map2 == NULL) + return raid_map1 == raid_map2; + + raid_map1_size = get_unaligned_le32(&raid_map1->structure_size); + raid_map2_size = get_unaligned_le32(&raid_map2->structure_size); + + if (raid_map1_size != raid_map2_size) + return false; + + return memcmp(raid_map1, raid_map2, raid_map1_size) == 0; +} + /* Assumes the SCSI device list lock is held. */ -static void pqi_scsi_update_device(struct pqi_scsi_dev *existing_device, - struct pqi_scsi_dev *new_device) +static void pqi_scsi_update_device(struct pqi_ctrl_info *ctrl_info, + struct pqi_scsi_dev *existing_device, struct pqi_scsi_dev *new_device) { existing_device->device_type = new_device->device_type; existing_device->bus = new_device->bus; @@ -2036,50 +2064,48 @@ static void pqi_scsi_update_device(struct pqi_scsi_dev *existing_device, existing_device->target_lun_valid = true; } - if ((existing_device->volume_status == CISS_LV_QUEUED_FOR_EXPANSION || - existing_device->volume_status == CISS_LV_UNDERGOING_EXPANSION) && - new_device->volume_status == CISS_LV_OK) - existing_device->rescan = true; - /* By definition, the scsi3addr and wwid fields are already the same. 
*/ existing_device->is_physical_device = new_device->is_physical_device; - existing_device->is_external_raid_device = - new_device->is_external_raid_device; - existing_device->is_expander_smp_device = - new_device->is_expander_smp_device; - existing_device->aio_enabled = new_device->aio_enabled; - memcpy(existing_device->vendor, new_device->vendor, - sizeof(existing_device->vendor)); - memcpy(existing_device->model, new_device->model, - sizeof(existing_device->model)); + memcpy(existing_device->vendor, new_device->vendor, sizeof(existing_device->vendor)); + memcpy(existing_device->model, new_device->model, sizeof(existing_device->model)); existing_device->sas_address = new_device->sas_address; - existing_device->raid_level = new_device->raid_level; existing_device->queue_depth = new_device->queue_depth; - existing_device->aio_handle = new_device->aio_handle; - existing_device->volume_status = new_device->volume_status; - existing_device->active_path_index = new_device->active_path_index; - existing_device->phy_id = new_device->phy_id; - existing_device->path_map = new_device->path_map; - existing_device->bay = new_device->bay; - existing_device->box_index = new_device->box_index; - existing_device->phys_box_on_bus = new_device->phys_box_on_bus; - existing_device->phy_connected_dev_type = new_device->phy_connected_dev_type; - memcpy(existing_device->box, new_device->box, - sizeof(existing_device->box)); - memcpy(existing_device->phys_connector, new_device->phys_connector, - sizeof(existing_device->phys_connector)); - existing_device->next_bypass_group = 0; - kfree(existing_device->raid_map); - existing_device->raid_map = new_device->raid_map; - existing_device->raid_bypass_configured = - new_device->raid_bypass_configured; - existing_device->raid_bypass_enabled = - new_device->raid_bypass_enabled; existing_device->device_offline = false; - - /* To prevent this from being freed later. */ - new_device->raid_map = NULL; + existing_device->lun_count = new_device->lun_count; + + if (pqi_is_logical_device(existing_device)) { + existing_device->is_external_raid_device = new_device->is_external_raid_device; + + if (existing_device->devtype == TYPE_DISK) { + existing_device->raid_level = new_device->raid_level; + existing_device->volume_status = new_device->volume_status; + if (ctrl_info->logical_volume_rescan_needed) + existing_device->rescan = true; + memset(existing_device->next_bypass_group, 0, sizeof(existing_device->next_bypass_group)); + if (!pqi_raid_maps_equal(existing_device->raid_map, new_device->raid_map)) { + kfree(existing_device->raid_map); + existing_device->raid_map = new_device->raid_map; + /* To prevent this from being freed later. 
*/ + new_device->raid_map = NULL; + } + existing_device->raid_bypass_configured = new_device->raid_bypass_configured; + existing_device->raid_bypass_enabled = new_device->raid_bypass_enabled; + } + } else { + existing_device->aio_enabled = new_device->aio_enabled; + existing_device->aio_handle = new_device->aio_handle; + existing_device->is_expander_smp_device = new_device->is_expander_smp_device; + existing_device->active_path_index = new_device->active_path_index; + existing_device->phy_id = new_device->phy_id; + existing_device->path_map = new_device->path_map; + existing_device->bay = new_device->bay; + existing_device->box_index = new_device->box_index; + existing_device->phys_box_on_bus = new_device->phys_box_on_bus; + existing_device->phy_connected_dev_type = new_device->phy_connected_dev_type; + memcpy(existing_device->box, new_device->box, sizeof(existing_device->box)); + memcpy(existing_device->phys_connector, new_device->phys_connector, sizeof(existing_device->phys_connector)); + } } static inline void pqi_free_device(struct pqi_scsi_dev *device) @@ -2156,7 +2182,7 @@ static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, */ device->new_device = false; matching_device->device_gone = false; - pqi_scsi_update_device(matching_device, device); + pqi_scsi_update_device(ctrl_info, matching_device, device); break; case DEVICE_NOT_FOUND: /* @@ -2203,6 +2229,11 @@ static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + /* + * If OFA is in progress and there are devices that need to be deleted, + * allow any pending reset operations to continue and unblock any SCSI + * requests before removal. + */ if (pqi_ofa_in_progress(ctrl_info)) { list_for_each_entry_safe(device, next, &delete_list, delete_list_entry) if (pqi_is_device_added(device)) @@ -2226,8 +2257,8 @@ static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, } /* - * Notify the SCSI ML if the queue depth of any existing device has - * changed. + * Notify the SML of any existing device changes such as; + * queue depth, device size. 
*/ list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) { if (device->sdev && device->queue_depth != device->advertised_queue_depth) { @@ -2256,6 +2287,9 @@ static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, } } } + + ctrl_info->logical_volume_rescan_needed = false; + } static inline bool pqi_is_supported_device(struct pqi_scsi_dev *device) @@ -2287,18 +2321,6 @@ static inline void pqi_mask_device(u8 *scsi3addr) scsi3addr[3] |= 0xc0; } -static inline bool pqi_is_device_with_sas_address(struct pqi_scsi_dev *device) -{ - switch (device->device_type) { - case SA_DEVICE_TYPE_SAS: - case SA_DEVICE_TYPE_EXPANDER_SMP: - case SA_DEVICE_TYPE_SES: - return true; - } - - return false; -} - static inline bool pqi_is_multipath_device(struct pqi_scsi_dev *device) { if (pqi_is_logical_device(device)) @@ -2312,17 +2334,6 @@ static inline bool pqi_expose_device(struct pqi_scsi_dev *device) return !device->is_physical_device || !pqi_skip_device(device->scsi3addr); } -static inline void pqi_set_physical_device_wwid(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device, struct report_phys_lun_16byte_wwid *phys_lun) -{ - if (ctrl_info->unique_wwid_in_report_phys_lun_supported || - ctrl_info->rpl_extended_format_4_5_supported || - pqi_is_device_with_sas_address(device)) - memcpy(device->wwid, phys_lun->wwid, sizeof(device->wwid)); - else - memcpy(&device->wwid[8], device->page_83_identifier, 8); -} - static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) { int i; @@ -2489,12 +2500,12 @@ static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) pqi_assign_bus_target_lun(device); if (device->is_physical_device) { - pqi_set_physical_device_wwid(ctrl_info, device, phys_lun); + memcpy(device->wwid, phys_lun->wwid, sizeof(device->wwid)); if ((phys_lun->device_flags & CISS_REPORT_PHYS_DEV_FLAG_AIO_ENABLED) && phys_lun->aio_handle) { device->aio_enabled = true; - device->aio_handle = + device->aio_handle = phys_lun->aio_handle; } } else { @@ -2502,8 +2513,7 @@ static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) sizeof(device->volume_id)); } - if (pqi_is_device_with_sas_address(device)) - device->sas_address = get_unaligned_be64(&device->wwid[8]); + device->sas_address = get_unaligned_be64(&device->wwid[0]); new_device_list[num_valid_devices++] = device; } @@ -2527,25 +2537,6 @@ static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) return rc; } -static void pqi_remove_all_scsi_devices(struct pqi_ctrl_info *ctrl_info) -{ - unsigned long flags; - struct pqi_scsi_dev *device; - struct pqi_scsi_dev *next; - - spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); - - list_for_each_entry_safe(device, next, &ctrl_info->scsi_device_list, - scsi_device_list_entry) { - if (pqi_is_device_added(device)) - pqi_remove_device(ctrl_info, device); - list_del(&device->scsi_device_list_entry); - pqi_free_device(device); - } - - spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); -} - #if TORTUGA static int pqi_add_controller(struct pqi_ctrl_info *ctrl_info) @@ -2678,7 +2669,7 @@ static inline void pqi_set_encryption_info(struct pqi_encryption_info *encryptio */ static bool pqi_aio_raid_level_supported(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device, struct pqi_scsi_dev_raid_map_data *rmd) + struct pqi_scsi_dev_raid_map_data *rmd) { bool is_supported = true; @@ -2854,6 +2845,15 @@ static int pqi_calc_aio_r5_or_r6(struct pqi_scsi_dev_raid_map_data *rmd, if (rmd->is_write) { u32 index; + /* + * 
p_parity_it_nexus and q_parity_it_nexus are pointers to the + * parity entries inside the device's raid_map. + * + * A device's RAID map is bounded by: number of RAID disks squared. + * + * The device's RAID map size is checked during device + * initialization. + */ index = DIV_ROUND_UP(rmd->map_index + 1, rmd->total_disks_per_row); index *= rmd->total_disks_per_row; index -= get_unaligned_le16(&raid_map->metadata_disks_per_row); @@ -2931,7 +2931,7 @@ static int pqi_raid_bypass_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, rmd.raid_level = device->raid_level; - if (!pqi_aio_raid_level_supported(ctrl_info, device, &rmd)) + if (!pqi_aio_raid_level_supported(ctrl_info, &rmd)) return PQI_RAID_BYPASS_INELIGIBLE; if (unlikely(rmd.block_cnt == 0)) @@ -2948,11 +2948,11 @@ static int pqi_raid_bypass_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, if (rmd.is_write) { pqi_calc_aio_r1_nexus(raid_map, &rmd); } else { - group = device->next_bypass_group; + group = device->next_bypass_group[rmd.map_index]; next_bypass_group = group + 1; if (next_bypass_group >= rmd.layout_map_count) next_bypass_group = 0; - device->next_bypass_group = next_bypass_group; + device->next_bypass_group[rmd.map_index] = next_bypass_group; rmd.map_index += group * rmd.data_disks_per_row; } } else if ((device->raid_level == SA_RAID_5 || @@ -3021,7 +3021,7 @@ static int pqi_raid_bypass_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, #define PQI_DEVICE_STATE_ADMIN_QUEUE_PAIR_READY 0x3 #define PQI_DEVICE_STATE_ERROR 0x4 -#define PQI_MODE_READY_TIMEOUT_SECS (30 * HZ) +#define PQI_MODE_READY_TIMEOUT_SECS 30 #define PQI_MODE_READY_POLL_INTERVAL_MSECS 1 static int pqi_wait_for_pqi_mode_ready(struct pqi_ctrl_info *ctrl_info) @@ -3032,7 +3032,7 @@ static int pqi_wait_for_pqi_mode_ready(struct pqi_ctrl_info *ctrl_info) u8 status; pqi_registers = ctrl_info->pqi_registers; - timeout = PQI_MODE_READY_TIMEOUT_SECS + jiffies; + timeout = (PQI_MODE_READY_TIMEOUT_SECS * HZ) + jiffies; while (1) { signature = readq(&pqi_registers->signature); @@ -3277,8 +3277,7 @@ static void pqi_process_aio_io_error(struct pqi_io_request *io_request) } if (device_offline && sense_data_length == 0) - scsi_build_sense_buffer(0, scmd->sense_buffer, HARDWARE_ERROR, - 0x3e, 0x1); + scsi_build_sense_buffer(0, scmd->sense_buffer, HARDWARE_ERROR, 0x3e, 0x1); scmd->result = scsi_status; set_host_byte(scmd, host_byte); @@ -3310,6 +3309,9 @@ static int pqi_interpret_task_management_response(struct pqi_ctrl_info *ctrl_inf case SOP_TMF_REJECTED: rc = -EAGAIN; break; + case SOP_RC_INCORRECT_LOGICAL_UNIT: + rc = -ENODEV; + break; default: rc = -EIO; break; @@ -3503,7 +3505,7 @@ static void pqi_acknowledge_event(struct pqi_ctrl_info *ctrl_info, pqi_send_event_ack(ctrl_info, &request, sizeof(request)); } -#define PQI_SOFT_RESET_STATUS_TIMEOUT_SECS (30 * HZ) +#define PQI_SOFT_RESET_STATUS_TIMEOUT_SECS 30 #define PQI_SOFT_RESET_STATUS_POLL_INTERVAL_SECS 1 static enum pqi_soft_reset_status pqi_poll_for_soft_reset_status( @@ -3512,7 +3514,7 @@ static enum pqi_soft_reset_status pqi_poll_for_soft_reset_status( u8 status; unsigned long timeout; - timeout = PQI_SOFT_RESET_STATUS_TIMEOUT_SECS + jiffies; + timeout = (PQI_SOFT_RESET_STATUS_TIMEOUT_SECS * HZ) + jiffies; while (1) { status = pqi_read_soft_reset_status(ctrl_info); @@ -3549,43 +3551,44 @@ static void pqi_process_soft_reset(struct pqi_ctrl_info *ctrl_info) delay_secs = PQI_POST_RESET_DELAY_SECS; switch (reset_status) { - case RESET_TIMEDOUT: - delay_secs = PQI_POST_OFA_RESET_DELAY_UPON_TIMEOUT_SECS; - /* fall through */ - 
case RESET_INITIATE_DRIVER: - dev_info(&ctrl_info->pci_dev->dev, + case RESET_TIMEDOUT: + delay_secs = PQI_POST_OFA_RESET_DELAY_UPON_TIMEOUT_SECS; + /* fall through */ + case RESET_INITIATE_DRIVER: + dev_info(&ctrl_info->pci_dev->dev, "Online Firmware Activation: resetting controller\n"); - sis_soft_reset(ctrl_info); - /* fall through */ - case RESET_INITIATE_FIRMWARE: - ctrl_info->pqi_mode_enabled = false; - pqi_save_ctrl_mode(ctrl_info, SIS_MODE); - rc = pqi_ofa_ctrl_restart(ctrl_info, delay_secs); - pqi_ofa_free_host_buffer(ctrl_info); - pqi_ctrl_ofa_done(ctrl_info); - dev_info(&ctrl_info->pci_dev->dev, + sis_soft_reset(ctrl_info); + /* fall through */ + case RESET_INITIATE_FIRMWARE: + ctrl_info->pqi_mode_enabled = false; + pqi_save_ctrl_mode(ctrl_info, SIS_MODE); + rc = pqi_ofa_ctrl_restart(ctrl_info, delay_secs); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + dev_info(&ctrl_info->pci_dev->dev, "Online Firmware Activation: %s\n", rc == 0 ? "SUCCESS" : "FAILED"); - break; - case RESET_ABORT: - dev_info(&ctrl_info->pci_dev->dev, + break; + case RESET_ABORT: + dev_info(&ctrl_info->pci_dev->dev, "Online Firmware Activation ABORTED\n"); - if (ctrl_info->soft_reset_handshake_supported) - pqi_clear_soft_reset_status(ctrl_info); - pqi_ofa_free_host_buffer(ctrl_info); - pqi_ctrl_ofa_done(ctrl_info); - pqi_ofa_ctrl_unquiesce(ctrl_info); - break; - case RESET_NORESPONSE: - default: - dev_err(&ctrl_info->pci_dev->dev, - "unexpected Online Firmware Activation reset status: 0x%x\n", - reset_status); - pqi_ofa_free_host_buffer(ctrl_info); - pqi_ctrl_ofa_done(ctrl_info); - pqi_ofa_ctrl_unquiesce(ctrl_info); - pqi_take_ctrl_offline(ctrl_info, PQI_OFA_RESPONSE_TIMEOUT); - break; + if (ctrl_info->soft_reset_handshake_supported) + pqi_clear_soft_reset_status(ctrl_info); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + pqi_ofa_ctrl_unquiesce(ctrl_info); + break; + case RESET_NORESPONSE: + /* fall through */ + default: + dev_err(&ctrl_info->pci_dev->dev, + "unexpected Online Firmware Activation reset status: 0x%x\n", + reset_status); + pqi_ofa_free_host_buffer(ctrl_info); + pqi_ctrl_ofa_done(ctrl_info); + pqi_ofa_ctrl_unquiesce(ctrl_info); + pqi_take_ctrl_offline(ctrl_info, PQI_OFA_RESPONSE_TIMEOUT); + break; } } @@ -3650,6 +3653,20 @@ static bool pqi_ofa_process_event(struct pqi_ctrl_info *ctrl_info, return ack_event; } +static void pqi_disable_raid_bypass(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long flags; + struct pqi_scsi_dev *device; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) + if (device->raid_bypass_enabled) + device->raid_bypass_enabled = false; + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); +} + static void pqi_event_worker(struct work_struct *work) { unsigned int i; @@ -3675,6 +3692,10 @@ static void pqi_event_worker(struct work_struct *work) } else { ack_event = true; rescan_needed = true; + if (event->event_type == PQI_EVENT_TYPE_LOGICAL_DEVICE) + ctrl_info->logical_volume_rescan_needed = true; + else if (event->event_type == PQI_EVENT_TYPE_AIO_STATE_CHANGE) + pqi_disable_raid_bypass(ctrl_info); } if (ack_event) pqi_acknowledge_event(ctrl_info, event); @@ -3682,8 +3703,11 @@ static void pqi_event_worker(struct work_struct *work) event++; } +#define PQI_RESCAN_WORK_FOR_EVENT_DELAY (5 * HZ) + if (rescan_needed) - pqi_schedule_rescan_worker_delayed(ctrl_info); + pqi_schedule_rescan_worker_with_delay(ctrl_info, + 
PQI_RESCAN_WORK_FOR_EVENT_DELAY); out: pqi_ctrl_unbusy(ctrl_info); @@ -3695,9 +3719,7 @@ static void pqi_heartbeat_timer_handler(struct timer_list *t) { int num_interrupts; u32 heartbeat_count; - struct pqi_ctrl_info *ctrl_info; - - ctrl_info = from_timer(ctrl_info, t, heartbeat_timer); + struct pqi_ctrl_info *ctrl_info = from_timer(ctrl_info, t, heartbeat_timer); pqi_check_ctrl_health(ctrl_info); if (pqi_ctrl_offline(ctrl_info)) @@ -3714,8 +3736,9 @@ static void pqi_heartbeat_timer_handler(struct timer_list *t) pqi_take_ctrl_offline(ctrl_info, PQI_NO_HEARTBEAT); return; } - } else + } else { ctrl_info->previous_num_interrupts = num_interrupts; + } ctrl_info->previous_heartbeat_count = heartbeat_count; mod_timer(&ctrl_info->heartbeat_timer, @@ -3979,10 +4002,14 @@ static void pqi_free_irqs(struct pqi_ctrl_info *ctrl_info) static int pqi_enable_msix_interrupts(struct pqi_ctrl_info *ctrl_info) { int num_vectors_enabled; + unsigned int flags = PCI_IRQ_MSIX; + + if (!pqi_disable_managed_interrupts) + flags |= PCI_IRQ_AFFINITY; num_vectors_enabled = pqi_pci_alloc_irq_vectors(ctrl_info->pci_dev, PQI_MIN_MSIX_VECTORS, ctrl_info->num_queue_groups, - PCI_IRQ_MSIX | PCI_IRQ_AFFINITY); + flags); if (num_vectors_enabled < 0) { dev_err(&ctrl_info->pci_dev->dev, @@ -4321,7 +4348,7 @@ static void pqi_submit_admin_request(struct pqi_ctrl_info *ctrl_info, writel(iq_pi, admin_queues->iq_pi); } -#define PQI_ADMIN_REQUEST_TIMEOUT_SECS (60 * HZ) +#define PQI_ADMIN_REQUEST_TIMEOUT_SECS 60 static int pqi_poll_for_admin_response(struct pqi_ctrl_info *ctrl_info, struct pqi_general_admin_response *response) @@ -4334,7 +4361,7 @@ static int pqi_poll_for_admin_response(struct pqi_ctrl_info *ctrl_info, admin_queues = &ctrl_info->admin_queues; oq_ci = admin_queues->oq_ci_copy; - timeout = PQI_ADMIN_REQUEST_TIMEOUT_SECS + jiffies; + timeout = (PQI_ADMIN_REQUEST_TIMEOUT_SECS * HZ) + jiffies; while (1) { oq_pi = readl(admin_queues->oq_pi); @@ -4440,7 +4467,7 @@ static void pqi_start_io(struct pqi_ctrl_info *ctrl_info, spin_unlock_irqrestore(&queue_group->submit_lock[path], flags); } -#define PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS (10 * HZ) +#define PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS 10 static int pqi_wait_for_completion_io(struct pqi_ctrl_info *ctrl_info, struct completion *wait) @@ -4449,7 +4476,7 @@ static int pqi_wait_for_completion_io(struct pqi_ctrl_info *ctrl_info, while (1) { if (wait_for_completion_io_timeout(wait, - PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS)) { + PQI_WAIT_FOR_COMPLETION_IO_TIMEOUT_SECS * HZ)) { rc = 0; break; } @@ -4517,6 +4544,10 @@ static int pqi_submit_raid_request_synchronous(struct pqi_ctrl_info *ctrl_info, } pqi_ctrl_busy(ctrl_info); + /* + * Wait for other admin queue updates such as: + * config table changes, OFA memory updates, ... 
+ */ if (pqi_is_blockable_request(request)) pqi_wait_if_ctrl_blocked(ctrl_info); @@ -4525,7 +4556,7 @@ static int pqi_submit_raid_request_synchronous(struct pqi_ctrl_info *ctrl_info, goto out; } - io_request = pqi_alloc_io_request(ctrl_info); + io_request = pqi_alloc_io_request(ctrl_info, NULL); put_unaligned_le16(io_request->index, &(((struct pqi_raid_path_request *)request)->request_id)); @@ -4964,8 +4995,7 @@ static int pqi_configure_events(struct pqi_ctrl_info *ctrl_info, if (rc) goto out; - rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, - 0, NULL); + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); pqi_pci_unmap(ctrl_info->pci_dev, request.data.report_event_configuration.sg_descriptors, 1, @@ -5000,8 +5030,7 @@ static int pqi_configure_events(struct pqi_ctrl_info *ctrl_info, if (rc) goto out; - rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, - NULL); + rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); pqi_pci_unmap(ctrl_info->pci_dev, request.data.report_event_configuration.sg_descriptors, 1, @@ -5018,11 +5047,6 @@ static inline int pqi_enable_events(struct pqi_ctrl_info *ctrl_info) return pqi_configure_events(ctrl_info, true); } -static inline int pqi_disable_events(struct pqi_ctrl_info *ctrl_info) -{ - return pqi_configure_events(ctrl_info, false); -} - static void pqi_free_all_io_requests(struct pqi_ctrl_info *ctrl_info) { unsigned int i; @@ -5131,6 +5155,9 @@ static void pqi_calculate_io_resources(struct pqi_ctrl_info *ctrl_info) ctrl_info->scsi_ml_can_queue = ctrl_info->max_outstanding_requests - PQI_RESERVED_IO_SLOTS; ctrl_info->max_io_slots = ctrl_info->max_outstanding_requests; +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + ctrl_info->per_cpu_factor = ctrl_info->max_io_slots / num_online_cpus(); +#endif ctrl_info->error_buffer_length = ctrl_info->max_io_slots * PQI_ERROR_BUFFER_ELEMENT_LENGTH; @@ -5180,7 +5207,6 @@ static void pqi_calculate_queue_resources(struct pqi_ctrl_info *ctrl_info) } ctrl_info->num_queue_groups = num_queue_groups; - ctrl_info->max_hw_queue_index = num_queue_groups - 1; /* * Make sure that the max. 
inbound IU length is an even multiple @@ -5452,6 +5478,7 @@ static int pqi_raid_submit_scsi_cmd_with_io_request( put_unaligned_le16(io_request->index, &request->request_id); request->error_index = request->request_id; memcpy(request->lun_number, device->scsi3addr, sizeof(request->lun_number)); + request->ml_device_lun_number = (u8)scmd->device->lun; cdb_length = min_t(size_t, scmd->cmd_len, sizeof(request->cdb)); memcpy(request->cdb, scmd->cmnd, cdb_length); @@ -5483,10 +5510,10 @@ static int pqi_raid_submit_scsi_cmd_with_io_request( #endif switch (scmd->sc_data_direction) { - case DMA_TO_DEVICE: + case DMA_FROM_DEVICE: request->data_direction = SOP_READ_FLAG; break; - case DMA_FROM_DEVICE: + case DMA_TO_DEVICE: request->data_direction = SOP_WRITE_FLAG; break; case DMA_NONE: @@ -5520,7 +5547,9 @@ static inline int pqi_raid_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, { struct pqi_io_request *io_request; - io_request = pqi_alloc_io_request(ctrl_info); + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; return pqi_raid_submit_scsi_cmd_with_io_request(ctrl_info, io_request, device, scmd, queue_group); @@ -5561,7 +5590,7 @@ static void pqi_aio_io_complete(struct pqi_io_request *io_request, scsi_dma_unmap(scmd); if (io_request->status == -EAGAIN || pqi_raid_bypass_retry_needed(io_request)) { set_host_byte(scmd, DID_IMM_RETRY); - scmd->SCp.this_residual++; + PQI_SCSI_CMD_RESIDUAL(scmd)++; } pqi_free_io_request(io_request); @@ -5578,7 +5607,7 @@ static inline bool pqi_is_io_high_priority(struct pqi_ctrl_info *ctrl_info, if (device->ncq_prio_enable) { priority_class = - IOPRIO_PRIO_CLASS(req_get_ioprio(scmd->request)); + IOPRIO_PRIO_CLASS(req_get_ioprio(PQI_SCSI_REQUEST(scmd))); if (priority_class == IOPRIO_CLASS_RT) { /* Set NCQ priority for read/write commands. 
*/ switch (scmd->cmnd[0]) { @@ -5621,8 +5650,12 @@ static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, int rc; struct pqi_io_request *io_request; struct pqi_aio_path_request *request; + struct pqi_scsi_dev *device; - io_request = pqi_alloc_io_request(ctrl_info); + device = scmd->device->hostdata; + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; io_request->io_complete_callback = pqi_aio_io_complete; io_request->scmd = scmd; io_request->raid_bypass = raid_bypass; @@ -5637,6 +5670,8 @@ static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, request->command_priority = io_high_prio; put_unaligned_le16(io_request->index, &request->request_id); request->error_index = request->request_id; + if (!pqi_is_logical_device(device) && ctrl_info->multi_lun_device_supported) + put_unaligned_le64(((scmd->device->lun) << 8), &request->lun_number); if (cdb_length > sizeof(request->cdb)) cdb_length = sizeof(request->cdb); request->cdb_length = cdb_length; @@ -5693,7 +5728,10 @@ static int pqi_aio_submit_r1_write_io(struct pqi_ctrl_info *ctrl_info, struct pqi_io_request *io_request; struct pqi_aio_r1_path_request *r1_request; - io_request = pqi_alloc_io_request(ctrl_info); + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; + io_request->io_complete_callback = pqi_aio_io_complete; io_request->scmd = scmd; io_request->raid_bypass = true; @@ -5718,26 +5756,11 @@ static int pqi_aio_submit_r1_write_io(struct pqi_ctrl_info *ctrl_info, r1_request->cdb_length = rmd->cdb_length; memcpy(r1_request->cdb, rmd->cdb, rmd->cdb_length); - switch (scmd->sc_data_direction) { - case DMA_TO_DEVICE: - r1_request->data_direction = SOP_READ_FLAG; - break; - case DMA_FROM_DEVICE: - r1_request->data_direction = SOP_WRITE_FLAG; - break; - case DMA_NONE: - r1_request->data_direction = SOP_NO_DIRECTION_FLAG; - break; - case DMA_BIDIRECTIONAL: - r1_request->data_direction = SOP_BIDIRECTIONAL; - break; - default: - dev_err(&ctrl_info->pci_dev->dev, - "unknown data direction: %d\n", - scmd->sc_data_direction); - BUG(); - break; - } + /* + * The direction is always write. + * Note: a host write results in a controller read. + */ + r1_request->data_direction = SOP_READ_FLAG; if (encryption_info) { r1_request->encryption_enable = true; @@ -5769,7 +5792,9 @@ static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info, struct pqi_io_request *io_request; struct pqi_aio_r56_path_request *r56_request; - io_request = pqi_alloc_io_request(ctrl_info); + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return SCSI_MLQUEUE_HOST_BUSY; io_request->io_complete_callback = pqi_aio_io_complete; io_request->scmd = scmd; io_request->raid_bypass = true; @@ -5801,26 +5826,11 @@ static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info, r56_request->cdb_length = rmd->cdb_length; memcpy(r56_request->cdb, rmd->cdb, rmd->cdb_length); - switch (scmd->sc_data_direction) { - case DMA_TO_DEVICE: - r56_request->data_direction = SOP_READ_FLAG; - break; - case DMA_FROM_DEVICE: - r56_request->data_direction = SOP_WRITE_FLAG; - break; - case DMA_NONE: - r56_request->data_direction = SOP_NO_DIRECTION_FLAG; - break; - case DMA_BIDIRECTIONAL: - r56_request->data_direction = SOP_BIDIRECTIONAL; - break; - default: - dev_err(&ctrl_info->pci_dev->dev, - "unknown data direction: %d\n", - scmd->sc_data_direction); - BUG(); - break; - } + /* + * The direction is always write. 
+ * Note: a host write results in a controller read. + */ + r56_request->data_direction = SOP_READ_FLAG; if (encryption_info) { r56_request->encryption_enable = true; @@ -5845,10 +5855,10 @@ static int pqi_aio_submit_r56_write_io(struct pqi_ctrl_info *ctrl_info, static inline bool pqi_is_bypass_eligible_request(struct scsi_cmnd *scmd) { - if (blk_rq_is_passthrough(scmd->request)) + if (blk_rq_is_passthrough(PQI_SCSI_REQUEST(scmd))) return false; - return scmd->SCp.this_residual == 0; + return PQI_SCSI_CMD_RESIDUAL(scmd) == 0; } /* @@ -5876,8 +5886,10 @@ void pqi_prep_for_scsi_done(struct scsi_cmnd *scmd) shost = scmd->device->host; ctrl_info = shost_to_hba(shost); - atomic_dec(&device->scsi_cmds_outstanding); + atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT atomic_dec(&ctrl_info->total_scmds_outstanding); +#endif } static bool pqi_is_parity_write_stream(struct pqi_ctrl_info *ctrl_info, @@ -5973,12 +5985,14 @@ int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) return 0; } - atomic_inc(&device->scsi_cmds_outstanding); + atomic_inc(&device->scsi_cmds_outstanding[scmd->device->lun]); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT if (atomic_inc_return(&ctrl_info->total_scmds_outstanding) > ctrl_info->scsi_ml_can_queue) { rc = SCSI_MLQUEUE_HOST_BUSY; goto out; } +#endif if (pqi_ctrl_offline(ctrl_info) || pqi_device_in_remove(device)) { set_host_byte(scmd, DID_NO_CONNECT); @@ -6022,8 +6036,10 @@ int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) out: if (rc) { - atomic_dec(&device->scsi_cmds_outstanding); + atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT atomic_dec(&ctrl_info->total_scmds_outstanding); +#endif } return rc; @@ -6077,7 +6093,7 @@ static unsigned int pqi_nonempty_inbound_queue_count(struct pqi_ctrl_info *ctrl_ return nonempty_inbound_queue_count; } -#define PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS (10 * HZ) +#define PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS 10 static int pqi_wait_until_inbound_queues_empty(struct pqi_ctrl_info *ctrl_info) { @@ -6089,7 +6105,7 @@ static int pqi_wait_until_inbound_queues_empty(struct pqi_ctrl_info *ctrl_info) displayed_warning = false; start_jiffies = jiffies; - warning_timeout = PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS + start_jiffies; + warning_timeout = (PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS * HZ) + start_jiffies; while (1) { queued_io_count = pqi_queued_io_count(ctrl_info); @@ -6104,7 +6120,7 @@ static int pqi_wait_until_inbound_queues_empty(struct pqi_ctrl_info *ctrl_info) "waiting %u seconds for queued I/O to drain (queued I/O count: %u; non-empty inbound queue count: %u)\n", jiffies_to_msecs(jiffies - start_jiffies) / 1000, queued_io_count, nonempty_inbound_queue_count); displayed_warning = true; - warning_timeout = PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS + jiffies; + warning_timeout = (PQI_INBOUND_QUEUES_NONEMPTY_WARNING_TIMEOUT_SECS * HZ) + jiffies; } msleep(1); } @@ -6161,10 +6177,10 @@ static void pqi_fail_io_queued_for_device(struct pqi_ctrl_info *ctrl_info, } } -#define PQI_PENDING_IO_WARNING_TIMEOUT_SECS (10 * HZ) +#define PQI_PENDING_IO_WARNING_TIMEOUT_SECS 10 static int pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device, unsigned long timeout_msecs) + struct pqi_scsi_dev *device, u8 lun, unsigned long timeout_msecs) { int cmds_outstanding; unsigned long start_jiffies; @@ -6172,26 +6188,28 @@ static int 
pqi_device_wait_for_pending_io(struct pqi_ctrl_info *ctrl_info, unsigned long msecs_waiting; start_jiffies = jiffies; - warning_timeout = PQI_PENDING_IO_WARNING_TIMEOUT_SECS + start_jiffies; + warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * HZ) + start_jiffies; - while ((cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding)) > 0) { - pqi_check_ctrl_health(ctrl_info); - if (pqi_ctrl_offline(ctrl_info)) - return -ENXIO; + while ((cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun])) > 0) { + if (ctrl_info->ctrl_removal_state != PQI_CTRL_GRACEFUL_REMOVAL) { + pqi_check_ctrl_health(ctrl_info); + if (pqi_ctrl_offline(ctrl_info)) + return -ENXIO; + } msecs_waiting = jiffies_to_msecs(jiffies - start_jiffies); if (msecs_waiting >= timeout_msecs) { dev_err(&ctrl_info->pci_dev->dev, "scsi %d:%d:%d:%d: timed out after %lu seconds waiting for %d outstanding command(s)\n", ctrl_info->scsi_host->host_no, device->bus, device->target, - device->lun, msecs_waiting / 1000, cmds_outstanding); + lun, msecs_waiting / 1000, cmds_outstanding); return -ETIMEDOUT; } if (time_after(jiffies, warning_timeout)) { dev_warn(&ctrl_info->pci_dev->dev, "scsi %d:%d:%d:%d: waiting %lu seconds for %d outstanding command(s)\n", ctrl_info->scsi_host->host_no, device->bus, device->target, - device->lun, msecs_waiting / 1000, cmds_outstanding); - warning_timeout = PQI_PENDING_IO_WARNING_TIMEOUT_SECS + jiffies; + lun, msecs_waiting / 1000, cmds_outstanding); + warning_timeout = (PQI_PENDING_IO_WARNING_TIMEOUT_SECS * HZ) + jiffies; } msleep(1); } @@ -6210,7 +6228,7 @@ static void pqi_lun_reset_complete(struct pqi_io_request *io_request, #define PQI_LUN_RESET_POLL_COMPLETION_SECS 10 static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device, struct completion *wait) + struct pqi_scsi_dev *device, u8 lun, struct completion *wait) { int rc; unsigned int wait_secs; @@ -6232,10 +6250,10 @@ static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info, } wait_secs += PQI_LUN_RESET_POLL_COMPLETION_SECS; - cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding); + cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun]); dev_warn(&ctrl_info->pci_dev->dev, "scsi %d:%d:%d:%d: waiting %u seconds for LUN reset to complete (%d command(s) outstanding)\n", - ctrl_info->scsi_host->host_no, device->bus, device->target, device->lun, wait_secs, cmds_outstanding); + ctrl_info->scsi_host->host_no, device->bus, device->target, lun, wait_secs, cmds_outstanding); } return rc; @@ -6243,14 +6261,16 @@ static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info, #define PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS 30 -static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) { int rc; struct pqi_io_request *io_request; DECLARE_COMPLETION_ONSTACK(wait); struct pqi_task_management_request *request; + struct pqi_scsi_dev *device; - io_request = pqi_alloc_io_request(ctrl_info); + device = scmd->device->hostdata; + io_request = pqi_alloc_io_request(ctrl_info, NULL); io_request->io_complete_callback = pqi_lun_reset_complete; io_request->context = &wait; @@ -6263,6 +6283,8 @@ static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *d put_unaligned_le16(io_request->index, &request->request_id); memcpy(request->lun_number, device->scsi3addr, sizeof(request->lun_number)); + if (!pqi_is_logical_device(device) && 
ctrl_info->multi_lun_device_supported) + request->ml_device_lun_number = (u8)scmd->device->lun; request->task_management_function = SOP_TASK_MANAGEMENT_LUN_RESET; if (ctrl_info->tmf_iu_timeout_supported) put_unaligned_le16(PQI_LUN_RESET_FIRMWARE_TIMEOUT_SECS, &request->timeout); @@ -6270,7 +6292,7 @@ static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *d pqi_start_io(ctrl_info, &ctrl_info->queue_groups[PQI_DEFAULT_QUEUE_GROUP], RAID_PATH, io_request); - rc = pqi_wait_for_lun_reset_completion(ctrl_info, device, &wait); + rc = pqi_wait_for_lun_reset_completion(ctrl_info, device, (u8)scmd->device->lun, &wait); if (rc == 0) rc = io_request->status; @@ -6284,16 +6306,18 @@ static int pqi_lun_reset(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *d #define PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS (10 * 60 * 1000) #define PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS (2 * 60 * 1000) -static int pqi_lun_reset_with_retries(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device) +static int pqi_lun_reset_with_retries(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) { int reset_rc; int wait_rc; unsigned int retries; unsigned long timeout_msecs; + struct pqi_scsi_dev *device; + device = scmd->device->hostdata; for (retries = 0;;) { - reset_rc = pqi_lun_reset(ctrl_info, device); - if (reset_rc == 0 || ++retries > PQI_LUN_RESET_RETRIES) + reset_rc = pqi_lun_reset(ctrl_info, scmd); + if (reset_rc == 0 || reset_rc == -ENODEV || ++retries > PQI_LUN_RESET_RETRIES) break; msleep(PQI_LUN_RESET_RETRY_INTERVAL_MSECS); } @@ -6301,7 +6325,7 @@ static int pqi_lun_reset_with_retries(struct pqi_ctrl_info *ctrl_info, struct pq timeout_msecs = reset_rc ? PQI_LUN_RESET_FAILED_PENDING_IO_TIMEOUT_MSECS : PQI_LUN_RESET_PENDING_IO_TIMEOUT_MSECS; - wait_rc = pqi_device_wait_for_pending_io(ctrl_info, device, timeout_msecs); + wait_rc = pqi_device_wait_for_pending_io(ctrl_info, device, scmd->device->lun, timeout_msecs); if (wait_rc && reset_rc == 0) reset_rc = wait_rc; @@ -6309,10 +6333,12 @@ static int pqi_lun_reset_with_retries(struct pqi_ctrl_info *ctrl_info, struct pq } static int pqi_device_reset(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device) + struct scsi_cmnd *scmd) { int rc; + struct pqi_scsi_dev *device; + device = scmd->device->hostdata; pqi_ctrl_block_requests(ctrl_info); pqi_ctrl_wait_until_quiesced(ctrl_info); pqi_fail_io_queued_for_device(ctrl_info, device); @@ -6320,7 +6346,7 @@ static int pqi_device_reset(struct pqi_ctrl_info *ctrl_info, if (rc) rc = FAILED; else - rc = pqi_lun_reset_with_retries(ctrl_info, device); + rc = pqi_lun_reset_with_retries(ctrl_info, scmd); pqi_ctrl_unblock_requests(ctrl_info); return rc; @@ -6342,18 +6368,18 @@ static int pqi_eh_device_reset_handler(struct scsi_cmnd *scmd) dev_err(&ctrl_info->pci_dev->dev, "resetting scsi %d:%d:%d:%d due to cmd 0x%02x\n", shost->host_no, - device->bus, device->target, device->lun, + device->bus, device->target, (u32)scmd->device->lun, scmd->cmd_len > 0 ? scmd->cmnd[0] : 0xff); pqi_check_ctrl_health(ctrl_info); if (pqi_ctrl_offline(ctrl_info)) rc = FAILED; else - rc = pqi_device_reset(ctrl_info, device); + rc = pqi_device_reset(ctrl_info, scmd); dev_err(&ctrl_info->pci_dev->dev, "reset of scsi %d:%d:%d:%d: %s\n", - shost->host_no, device->bus, device->target, device->lun, + shost->host_no, device->bus, device->target, (u32)scmd->device->lun, rc == SUCCESS ? 
"SUCCESS" : "FAILED"); mutex_unlock(&ctrl_info->lun_reset_mutex); @@ -6437,6 +6463,41 @@ static int pqi_slave_configure(struct scsi_device *sdev) return rc; } +static void pqi_slave_destroy(struct scsi_device *sdev) +{ + struct pqi_ctrl_info *ctrl_info; + struct pqi_scsi_dev *device; + int mutex_acquired; + unsigned long flags; + + ctrl_info = shost_to_hba(sdev->host); + + mutex_acquired = mutex_trylock(&ctrl_info->scan_mutex); + if (!mutex_acquired) + return; + + device = sdev->hostdata; + if (!device) { + mutex_unlock(&ctrl_info->scan_mutex); + return; + } + + device->lun_count--; + if (device->lun_count > 0) { + mutex_unlock(&ctrl_info->scan_mutex); + return; + } + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + list_del(&device->scsi_device_list_entry); + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + + mutex_unlock(&ctrl_info->scan_mutex); + + pqi_dev_info(ctrl_info, "removed", device); + pqi_free_device(device); +} + static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info, void __user *arg) { struct pci_dev *pci_dev; @@ -6913,19 +6974,20 @@ static DEVICE_ATTR(enable_r5_writes, S_IWUSR | S_IRUGO, static DEVICE_ATTR(enable_r6_writes, S_IWUSR | S_IRUGO, pqi_host_enable_r6_writes_show, pqi_host_enable_r6_writes_store); -static struct device_attribute *pqi_shost_attrs[] = { - &dev_attr_driver_version, - &dev_attr_firmware_version, - &dev_attr_model, - &dev_attr_serial_number, - &dev_attr_vendor, - &dev_attr_rescan, - &dev_attr_lockup_action, - &dev_attr_enable_stream_detection, - &dev_attr_enable_r5_writes, - &dev_attr_enable_r6_writes, +static struct PQI_DEVICE_ATTRIBUTE *pqi_shost_attrs[] = { + PQI_ATTRIBUTE(&dev_attr_driver_version), + PQI_ATTRIBUTE(&dev_attr_firmware_version), + PQI_ATTRIBUTE(&dev_attr_model), + PQI_ATTRIBUTE(&dev_attr_serial_number), + PQI_ATTRIBUTE(&dev_attr_vendor), + PQI_ATTRIBUTE(&dev_attr_rescan), + PQI_ATTRIBUTE(&dev_attr_lockup_action), + PQI_ATTRIBUTE(&dev_attr_enable_stream_detection), + PQI_ATTRIBUTE(&dev_attr_enable_r5_writes), + PQI_ATTRIBUTE(&dev_attr_enable_r6_writes), NULL }; +PQI_ATTRIBUTE_GROUPS(pqi_shost); static ssize_t pqi_unique_id_show(struct device *dev, struct device_attribute *attr, char *buffer) @@ -6939,6 +7001,9 @@ static ssize_t pqi_unique_id_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -6975,6 +7040,9 @@ static ssize_t pqi_lunid_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -7010,6 +7078,9 @@ static ssize_t pqi_path_info_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -7087,10 +7158,13 @@ static ssize_t pqi_sas_address_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; - if (!device || !pqi_is_device_with_sas_address(device)) { + if (!device) { spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); return -ENODEV; } @@ -7113,6 +7187,9 @@ static 
ssize_t pqi_ssd_smart_path_enabled_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -7142,6 +7219,9 @@ static ssize_t pqi_raid_level_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -7150,7 +7230,7 @@ static ssize_t pqi_raid_level_show(struct device *dev, return -ENODEV; } - if (pqi_is_logical_device(device)) + if (pqi_is_logical_device(device) && device->devtype == TYPE_DISK) raid_level = pqi_raid_level_to_string(device->raid_level); else raid_level = "N/A"; @@ -7172,6 +7252,9 @@ static ssize_t pqi_raid_bypass_cnt_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -7199,6 +7282,9 @@ static ssize_t pqi_sas_ncq_prio_enable_show(struct device *dev, sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); + if (pqi_ctrl_offline(ctrl_info)) + return -ENODEV; + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; @@ -7227,7 +7313,6 @@ static ssize_t pqi_sas_ncq_prio_enable_store(struct device *dev, if (kstrtou8(buf, 0, &ncq_prio_enable)) return -EINVAL; - sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); @@ -7263,28 +7348,18 @@ static DEVICE_ATTR(raid_bypass_cnt, S_IRUGO, pqi_raid_bypass_cnt_show, NULL); static DEVICE_ATTR(sas_ncq_prio_enable, S_IWUSR | S_IRUGO, pqi_sas_ncq_prio_enable_show, pqi_sas_ncq_prio_enable_store); -static struct device_attribute *pqi_sdev_attrs[] = { - &dev_attr_lunid, - &dev_attr_unique_id, - &dev_attr_path_info, - &dev_attr_sas_address, - &dev_attr_ssd_smart_path_enabled, - &dev_attr_raid_level, - &dev_attr_raid_bypass_cnt, - NULL -}; - -struct device_attribute *pqi_ncq_prio_sdev_attrs[] = { - &dev_attr_lunid, - &dev_attr_unique_id, - &dev_attr_path_info, - &dev_attr_sas_address, - &dev_attr_ssd_smart_path_enabled, - &dev_attr_raid_level, - &dev_attr_raid_bypass_cnt, - &dev_attr_sas_ncq_prio_enable, +static struct PQI_DEVICE_ATTRIBUTE *pqi_sdev_attrs[] = { + PQI_ATTRIBUTE(&dev_attr_lunid), + PQI_ATTRIBUTE(&dev_attr_unique_id), + PQI_ATTRIBUTE(&dev_attr_path_info), + PQI_ATTRIBUTE(&dev_attr_sas_address), + PQI_ATTRIBUTE(&dev_attr_ssd_smart_path_enabled), + PQI_ATTRIBUTE(&dev_attr_raid_level), + PQI_ATTRIBUTE(&dev_attr_raid_bypass_cnt), + PQI_ATTRIBUTE(&dev_attr_sas_ncq_prio_enable), NULL }; +PQI_ATTRIBUTE_GROUPS(pqi_sdev); static struct scsi_host_template pqi_driver_template = { .module = THIS_MODULE, @@ -7298,8 +7373,10 @@ static struct scsi_host_template pqi_driver_template = { .ioctl = pqi_ioctl, .slave_alloc = pqi_slave_alloc, .slave_configure = pqi_slave_configure, - .sdev_attrs = pqi_sdev_attrs, - .shost_attrs = pqi_shost_attrs, + .slave_destroy = pqi_slave_destroy, + PQI_SDEV_ATTRS, + PQI_SHOST_ATTRS, + PQI_CMD_PRIV }; static int pqi_register_scsi(struct pqi_ctrl_info *ctrl_info) @@ -7320,7 +7397,7 @@ static int pqi_register_scsi(struct pqi_ctrl_info *ctrl_info) shost->this_id = -1; shost->max_channel = PQI_MAX_BUS; shost->max_cmd_len = MAX_COMMAND_SIZE; - shost->max_lun = ~0; + shost->max_lun = PQI_MAX_LUNS_PER_DEVICE; shost->max_id = ~0; shost->max_sectors 
= ctrl_info->max_sectors; shost->can_queue = ctrl_info->scsi_ml_can_queue; @@ -7330,6 +7407,8 @@ static int pqi_register_scsi(struct pqi_ctrl_info *ctrl_info) shost->irq = pqi_pci_irq_vector(ctrl_info->pci_dev, 0); shost->unique_id = shost->irq; shost->hostdata[0] = (unsigned long)ctrl_info; + PQI_SET_HOST_TAGSET(shost); + pqi_compat_init_scsi_host(shost, ctrl_info); rc = scsi_add_host(shost, &ctrl_info->pci_dev->dev); @@ -7387,8 +7466,7 @@ static int pqi_wait_for_pqi_reset_completion(struct pqi_ctrl_info *ctrl_info) reset_reg.all_bits = readl(&pqi_registers->device_reset); if (reset_reg.bits.reset_action == PQI_RESET_ACTION_COMPLETED) break; - pqi_check_ctrl_health(ctrl_info); - if (pqi_ctrl_offline(ctrl_info)) { + if (!sis_is_firmware_running(ctrl_info)) { rc = -ENXIO; break; } @@ -7491,6 +7569,9 @@ static int pqi_get_ctrl_product_details(struct pqi_ctrl_info *ctrl_info) sizeof(identify->vendor_id)); ctrl_info->vendor[sizeof(identify->vendor_id)] = '\0'; + dev_info(&ctrl_info->pci_dev->dev, + "Firmware version: %s\n", ctrl_info->firmware_version); + out: kfree(identify); @@ -7568,8 +7649,7 @@ static int pqi_config_table_update(struct pqi_ctrl_info *ctrl_info, put_unaligned_le16(last_section, &request.data.config_table_update.last_section); - return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, - 0, NULL); + return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); } static int pqi_enable_firmware_features(struct pqi_ctrl_info *ctrl_info, @@ -7653,10 +7733,6 @@ static void pqi_ctrl_update_feature_flags(struct pqi_ctrl_info *ctrl_info, case PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT: ctrl_info->tmf_iu_timeout_supported = firmware_feature->enabled; break; - case PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN: - ctrl_info->unique_wwid_in_report_phys_lun_supported = - firmware_feature->enabled; - break; case PQI_FIRMWARE_FEATURE_FW_TRIAGE: ctrl_info->firmware_triage_supported = firmware_feature->enabled; pqi_save_fw_triage_setting(ctrl_info, firmware_feature->enabled); @@ -7664,6 +7740,10 @@ static void pqi_ctrl_update_feature_flags(struct pqi_ctrl_info *ctrl_info, case PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5: ctrl_info->rpl_extended_format_4_5_supported = firmware_feature->enabled; break; + case PQI_FIRMWARE_FEATURE_MULTI_LUN_DEVICE_SUPPORT: + ctrl_info->multi_lun_device_supported = + firmware_feature->enabled; + break; } pqi_firmware_feature_status(ctrl_info, firmware_feature); @@ -7754,11 +7834,6 @@ static struct pqi_firmware_feature pqi_firmware_features[] = { .feature_bit = PQI_FIRMWARE_FEATURE_RAID_BYPASS_ON_ENCRYPTED_NVME, .feature_status = pqi_firmware_feature_status, }, - { - .feature_name = "Unique WWID in Report Physical LUN", - .feature_bit = PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN, - .feature_status = pqi_ctrl_update_feature_flags, - }, { .feature_name = "Firmware Triage", .feature_bit = PQI_FIRMWARE_FEATURE_FW_TRIAGE, @@ -7769,6 +7844,11 @@ static struct pqi_firmware_feature pqi_firmware_features[] = { .feature_bit = PQI_FIRMWARE_FEATURE_RPL_EXTENDED_FORMAT_4_5, .feature_status = pqi_ctrl_update_feature_flags, }, + { + .feature_name = "Multi-LUN Target", + .feature_bit = PQI_FIRMWARE_FEATURE_MULTI_LUN_DEVICE_SUPPORT, + .feature_status = pqi_ctrl_update_feature_flags, + }, }; static void pqi_process_firmware_features( @@ -7868,9 +7948,9 @@ static void pqi_ctrl_reset_config(struct pqi_ctrl_info *ctrl_info) ctrl_info->enable_r6_writes = false; ctrl_info->raid_iu_timeout_supported = false; ctrl_info->tmf_iu_timeout_supported = 
false; - ctrl_info->unique_wwid_in_report_phys_lun_supported = false; ctrl_info->firmware_triage_supported = false; ctrl_info->rpl_extended_format_4_5_supported = false; + ctrl_info->multi_lun_device_supported = false; } static int pqi_process_config_table(struct pqi_ctrl_info *ctrl_info) @@ -7882,7 +7962,7 @@ static int pqi_process_config_table(struct pqi_ctrl_info *ctrl_info) struct pqi_config_table *config_table; struct pqi_config_table_section_header *section; struct pqi_config_table_section_info section_info; - struct pqi_config_table_section_info feature_section_info; + struct pqi_config_table_section_info feature_section_info = {0}; table_length = ctrl_info->config_table_length; if (table_length == 0) @@ -7996,6 +8076,21 @@ static int pqi_force_sis_mode(struct pqi_ctrl_info *ctrl_info) return pqi_revert_to_sis_mode(ctrl_info); } +static void pqi_perform_lockup_action(void) +{ + switch (pqi_lockup_action) { + case PANIC: + panic("FATAL: Smart Family Controller lockup detected"); + break; + case REBOOT: + emergency_restart(); + break; + case NONE: + default: + break; + } +} + static int pqi_ctrl_init(struct pqi_ctrl_info *ctrl_info) { int rc; @@ -8020,8 +8115,15 @@ static int pqi_ctrl_init(struct pqi_ctrl_info *ctrl_info) * commands. */ rc = sis_wait_for_ctrl_ready(ctrl_info); - if (rc) + if (rc) { + if (reset_devices) { + dev_err(&ctrl_info->pci_dev->dev, + "kdump init failed with error %d\n", rc); + pqi_lockup_action = REBOOT; + pqi_perform_lockup_action(); + } return rc; + } /* * Get the controller properties. This allows us to determine @@ -8163,7 +8265,7 @@ static int pqi_ctrl_init(struct pqi_ctrl_info *ctrl_info) if (ctrl_info->enable_r5_writes || ctrl_info->enable_r6_writes) { rc = pqi_get_advanced_raid_bypass_config(ctrl_info); - if (rc) { + if (rc) { /* Supported features not returned correctly. 
*/ dev_err(&ctrl_info->pci_dev->dev, "error obtaining advanced RAID bypass configuration\n"); return rc; @@ -8478,8 +8580,9 @@ static struct pqi_ctrl_info *pqi_alloc_ctrl_info(int numa_node) INIT_WORK(&ctrl_info->event_work, pqi_event_worker); atomic_set(&ctrl_info->num_interrupts, 0); +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT atomic_set(&ctrl_info->total_scmds_outstanding, 0); - +#endif INIT_DELAYED_WORK(&ctrl_info->rescan_work, pqi_rescan_worker); INIT_DELAYED_WORK(&ctrl_info->update_time_work, pqi_update_time_worker); @@ -8505,6 +8608,7 @@ static struct pqi_ctrl_info *pqi_alloc_ctrl_info(int numa_node) ctrl_info->max_write_raid_5_6 = PQI_DEFAULT_MAX_WRITE_RAID_5_6; ctrl_info->max_write_raid_1_10_2drive = ~0; ctrl_info->max_write_raid_1_10_3drive = ~0; + ctrl_info->disable_managed_interrupts = pqi_disable_managed_interrupts; return ctrl_info; } @@ -8522,7 +8626,6 @@ static void pqi_free_interrupts(struct pqi_ctrl_info *ctrl_info) static void pqi_free_ctrl_resources(struct pqi_ctrl_info *ctrl_info) { - pqi_stop_heartbeat_timer(ctrl_info); pqi_free_interrupts(ctrl_info); if (ctrl_info->queue_memory_base) dma_free_coherent(&ctrl_info->pci_dev->dev, @@ -8547,9 +8650,15 @@ static void pqi_free_ctrl_resources(struct pqi_ctrl_info *ctrl_info) static void pqi_remove_ctrl(struct pqi_ctrl_info *ctrl_info) { + ctrl_info->controller_online = false; + pqi_stop_heartbeat_timer(ctrl_info); + pqi_ctrl_block_requests(ctrl_info); pqi_cancel_rescan_worker(ctrl_info); pqi_cancel_update_time_worker(ctrl_info); - pqi_remove_all_scsi_devices(ctrl_info); + if (ctrl_info->ctrl_removal_state == PQI_CTRL_SURPRISE_REMOVAL) { + pqi_fail_all_outstanding_requests(ctrl_info); + ctrl_info->pqi_mode_enabled = false; + } pqi_unregister_scsi(ctrl_info); if (ctrl_info->pqi_mode_enabled) pqi_revert_to_sis_mode(ctrl_info); @@ -8736,8 +8845,7 @@ static int pqi_ofa_host_memory_update(struct pqi_ctrl_info *ctrl_info) &request.data.ofa_memory_allocation.buffer_length); } - return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, - 0, NULL); + return pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, NULL); } static int pqi_ofa_ctrl_restart(struct pqi_ctrl_info *ctrl_info, unsigned int delay_secs) @@ -8747,21 +8855,6 @@ static int pqi_ofa_ctrl_restart(struct pqi_ctrl_info *ctrl_info, unsigned int de return pqi_ctrl_init_resume(ctrl_info); } -static void pqi_perform_lockup_action(void) -{ - switch (pqi_lockup_action) { - case PANIC: - panic("FATAL: Smart Family Controller lockup detected"); - break; - case REBOOT: - emergency_restart(); - break; - case NONE: - default: - break; - } -} - static struct pqi_raid_error_info pqi_ctrl_offline_raid_error_info = { .data_out_result = PQI_DATA_IN_OUT_HARDWARE_ERROR, .status = SAM_STAT_CHECK_CONDITION, @@ -8852,7 +8945,7 @@ static int pqi_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { int rc; - int node, cp_node; + int node; struct pqi_ctrl_info *ctrl_info; pqi_print_ctrl_info(pci_dev, id); @@ -8871,10 +8964,10 @@ static int pqi_pci_probe(struct pci_dev *pci_dev, node = dev_to_node(&pci_dev->dev); if (node == NUMA_NO_NODE) { - cp_node = cpu_to_node(0); - if (cp_node == NUMA_NO_NODE) - cp_node = 0; - set_dev_node(&pci_dev->dev, cp_node); + node = cpu_to_node(0); + if (node == NUMA_NO_NODE) + node = 0; + set_dev_node(&pci_dev->dev, node); } ctrl_info = pqi_alloc_ctrl_info(node); @@ -8905,11 +8998,26 @@ static int pqi_pci_probe(struct pci_dev *pci_dev, static void pqi_pci_remove(struct pci_dev *pci_dev) { struct pqi_ctrl_info *ctrl_info; + u16 
vendor_id; + int rc; ctrl_info = pci_get_drvdata(pci_dev); if (!ctrl_info) return; + pci_read_config_word(ctrl_info->pci_dev, PCI_SUBSYSTEM_VENDOR_ID, &vendor_id); + if (vendor_id == 0xffff) + ctrl_info->ctrl_removal_state = PQI_CTRL_SURPRISE_REMOVAL; + else + ctrl_info->ctrl_removal_state = PQI_CTRL_GRACEFUL_REMOVAL; + + if (ctrl_info->ctrl_removal_state == PQI_CTRL_GRACEFUL_REMOVAL) { + rc = pqi_flush_cache(ctrl_info, RESTART); + if (rc) + dev_err(&pci_dev->dev, + "unable to flush controller cache during remove\n"); + } + pqi_remove_ctrl(ctrl_info); } @@ -8937,7 +9045,7 @@ static void pqi_dump_request(struct pqi_ctrl_info *ctrl_info, scmd->cmnd[4], scmd->cmnd[5], scmd->cmnd[6], scmd->cmnd[7], scmd->cmnd[8], scmd->cmnd[9], scmd->cmnd[10], scmd->cmnd[11], scmd->cmnd[12], scmd->cmnd[13], scmd->cmnd[14], scmd->cmnd[15], - scmd, atomic_read(&device->scsi_cmds_outstanding)); + scmd, atomic_read(&device->scsi_cmds_outstanding[scmd->device->lun])); } else { struct pqi_iu_header request_h; size_t iu_length; @@ -9025,26 +9133,49 @@ static void pqi_process_lockup_action_param(void) DRIVER_NAME_SHORT, pqi_lockup_action_param); } +#define PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS 30 +#define PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS (30 * 60) + +static void pqi_process_ctrl_ready_timeout_param(void) +{ + if (pqi_ctrl_ready_timeout_secs == 0) + return; + + if (pqi_ctrl_ready_timeout_secs < PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS) { + pr_warn("%s: ctrl_ready_timeout parm of %u second(s) is less than minimum timeout of %d seconds - setting timeout to %d seconds\n", + DRIVER_NAME_SHORT, pqi_ctrl_ready_timeout_secs, PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS, PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS); + pqi_ctrl_ready_timeout_secs = PQI_CTRL_READY_TIMEOUT_PARAM_MIN_SECS; + } else if (pqi_ctrl_ready_timeout_secs > PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS) { + pr_warn("%s: ctrl_ready_timeout parm of %u seconds is greater than maximum timeout of %d seconds - setting timeout to %d seconds\n", + DRIVER_NAME_SHORT, pqi_ctrl_ready_timeout_secs, PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS, PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS); + pqi_ctrl_ready_timeout_secs = PQI_CTRL_READY_TIMEOUT_PARAM_MAX_SECS; + } + + sis_ctrl_ready_timeout_secs = pqi_ctrl_ready_timeout_secs; +} + static void pqi_process_module_params(void) { pqi_process_lockup_action_param(); + pqi_process_ctrl_ready_timeout_param(); } +#if defined(CONFIG_PM) + static inline enum bmic_flush_cache_shutdown_event pqi_get_flush_cache_shutdown_event(struct pci_dev *pci_dev) { if (pci_dev->subsystem_vendor == PCI_VENDOR_ID_ADAPTEC2 && pci_dev->subsystem_device == 0x1304) return RESTART; + return SUSPEND; } -#if defined(CONFIG_PM) - -static int pqi_suspend(struct pci_dev *pci_dev, pm_message_t state) +static int pqi_suspend_or_freeze(struct device *dev, bool suspend) { + struct pci_dev *pci_dev; struct pqi_ctrl_info *ctrl_info; - enum bmic_flush_cache_shutdown_event shutdown_event; - shutdown_event = pqi_get_flush_cache_shutdown_event(pci_dev); + pci_dev = to_pci_dev(dev); ctrl_info = pci_get_drvdata(pci_dev); pqi_wait_until_ofa_finished(ctrl_info); @@ -9054,16 +9185,17 @@ static int pqi_suspend(struct pci_dev *pci_dev, pm_message_t state) pqi_ctrl_block_device_reset(ctrl_info); pqi_ctrl_block_requests(ctrl_info); pqi_ctrl_wait_until_quiesced(ctrl_info); - pqi_flush_cache(ctrl_info, shutdown_event); - pqi_stop_heartbeat_timer(ctrl_info); - pqi_crash_if_pending_command(ctrl_info); + if (suspend) { + enum bmic_flush_cache_shutdown_event shutdown_event; - if (state.event == PM_EVENT_FREEZE) - 
return 0; + shutdown_event = pqi_get_flush_cache_shutdown_event(pci_dev); + pqi_flush_cache(ctrl_info, shutdown_event); + } - pci_save_state(pci_dev); - pci_set_power_state(pci_dev, pci_choose_state(pci_dev, state)); + pqi_stop_heartbeat_timer(ctrl_info); + pqi_crash_if_pending_command(ctrl_info); + pqi_free_irqs(ctrl_info); ctrl_info->controller_online = false; ctrl_info->pqi_mode_enabled = false; @@ -9071,44 +9203,87 @@ static int pqi_suspend(struct pci_dev *pci_dev, pm_message_t state) return 0; } -static int pqi_resume(struct pci_dev *pci_dev) +static int pqi_suspend(struct device *dev) +{ + return pqi_suspend_or_freeze(dev, true); +} + +static int pqi_resume_or_restore(struct device *dev) { int rc; + struct pci_dev *pci_dev; struct pqi_ctrl_info *ctrl_info; + pci_dev = to_pci_dev(dev); ctrl_info = pci_get_drvdata(pci_dev); - if (pci_dev->current_state != PCI_D0) { - ctrl_info->max_hw_queue_index = 0; - pqi_free_interrupts(ctrl_info); - pqi_change_irq_mode(ctrl_info, IRQ_MODE_INTX); - rc = request_irq(pqi_pci_irq_vector(pci_dev, 0), pqi_irq_handler, - IRQF_SHARED, DRIVER_NAME_SHORT, - pqi_get_irq_cookie(ctrl_info, 0)); - if (rc) { - dev_err(&ctrl_info->pci_dev->dev, - "irq %u init failed with error %d\n", - pci_dev->irq, rc); - return rc; - } - pqi_ctrl_unblock_device_reset(ctrl_info); - pqi_ctrl_unblock_requests(ctrl_info); - pqi_scsi_unblock_requests(ctrl_info); - pqi_ctrl_unblock_scan(ctrl_info); - return 0; - } - - pci_set_power_state(pci_dev, PCI_D0); - pci_restore_state(pci_dev); + rc = pqi_request_irqs(ctrl_info); + if (rc) + return rc; pqi_ctrl_unblock_device_reset(ctrl_info); pqi_ctrl_unblock_requests(ctrl_info); pqi_scsi_unblock_requests(ctrl_info); pqi_ctrl_unblock_scan(ctrl_info); + ssleep(PQI_POST_RESET_DELAY_SECS); + return pqi_ctrl_init_resume(ctrl_info); } +static int pqi_freeze(struct device *dev) +{ + return pqi_suspend_or_freeze(dev, false); +} + +static int pqi_thaw(struct device *dev) +{ + int rc; + struct pci_dev *pci_dev; + struct pqi_ctrl_info *ctrl_info; + + pci_dev = to_pci_dev(dev); + ctrl_info = pci_get_drvdata(pci_dev); + + rc = pqi_request_irqs(ctrl_info); + if (rc) + return rc; + + ctrl_info->controller_online = true; + ctrl_info->pqi_mode_enabled = true; + + pqi_ctrl_unblock_device_reset(ctrl_info); + pqi_ctrl_unblock_requests(ctrl_info); + pqi_scsi_unblock_requests(ctrl_info); + pqi_ctrl_unblock_scan(ctrl_info); + + return 0; +} + +static int pqi_poweroff(struct device *dev) +{ + struct pci_dev *pci_dev; + struct pqi_ctrl_info *ctrl_info; + enum bmic_flush_cache_shutdown_event shutdown_event; + + pci_dev = to_pci_dev(dev); + ctrl_info = pci_get_drvdata(pci_dev); + + shutdown_event = pqi_get_flush_cache_shutdown_event(pci_dev); + pqi_flush_cache(ctrl_info, shutdown_event); + + return 0; +} + +static const struct dev_pm_ops pqi_pm_ops = { + .suspend = pqi_suspend, + .resume = pqi_resume_or_restore, + .freeze = pqi_freeze, + .thaw = pqi_thaw, + .poweroff = pqi_poweroff, + .restore = pqi_resume_or_restore, +}; + #endif /* CONFIG_PM */ /* Define the PCI IDs for the controllers that we support. 
*/ @@ -9165,6 +9340,10 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_H3C, 0x1109) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_H3C, 0x110b) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_H3C, 0x8460) @@ -9237,6 +9416,50 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_INSPUR, 0x0054) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006b) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006c) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006d) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x006f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0070) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0071) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0072) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0086) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0087) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0088) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_INSPUR, 0x0089) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_HUAWEI, 0xd227) @@ -9269,6 +9492,10 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x0608) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x0659) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x0800) @@ -9457,6 +9684,10 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x1462) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1463) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x1470) @@ -9473,6 +9704,14 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x1473) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1474) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x1475) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x1480) @@ -9497,6 +9736,18 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x14a2) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a4) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a5) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14a6) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x14b0) @@ -9513,6 +9764,18 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x14c1) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c2) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c3) + }, + { + 
PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ADAPTEC2, 0x14c4) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ADAPTEC2, 0x14d0) @@ -9621,6 +9884,18 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_HPE, 0x036f) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0381) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0382) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_HPE, 0x0383) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_FIBERHOME, 0x0800) @@ -9645,6 +9920,10 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_NTCOM, 0x3161) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_NT, 0x3161) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ZTE, 0x5445) @@ -9693,6 +9972,58 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ZTE, 0x0b45) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_RAMAXEL, 0x0101) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_RAMAXEL, 0x0201) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0220) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0221) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0520) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0522) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0620) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0621) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0622) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_LENOVO, 0x0623) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1000) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1001) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1002) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_ANY_ID, PCI_ANY_ID) @@ -9709,8 +10040,9 @@ static struct pci_driver pqi_pci_driver = { .remove = pqi_pci_remove, .shutdown = pqi_shutdown, #if defined(CONFIG_PM) - .suspend = pqi_suspend, - .resume = pqi_resume, + .driver = { + .pm = &pqi_pm_ops + }, #endif }; @@ -9719,6 +10051,8 @@ static int __init pqi_init(void) int rc; pr_info(DRIVER_NAME "\n"); + pqi_verify_structures(); + sis_verify_structures(); pqi_sas_transport_template = sas_attach_transport(&pqi_sas_transport_functions); if (!pqi_sas_transport_template) @@ -9742,7 +10076,7 @@ static void __exit pqi_cleanup(void) module_init(pqi_init); module_exit(pqi_cleanup); -static void __attribute__((unused)) verify_structures(void) +static void pqi_verify_structures(void) { BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers, sis_host_to_ctrl_doorbell) != 0x20); diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c index c0c598f99c4c6..d4c280e8a3453 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c @@ -122,16 +122,26 @@ static int pqi_map_queues(struct Scsi_Host *shost) { struct pqi_ctrl_info *ctrl_info = 
shost_to_hba(shost); + if (!ctrl_info->disable_managed_interrupts) { #if KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 - return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev); + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev); #elif KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 - return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); #elif KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 - return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], - ctrl_info->pci_dev, 0); + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + ctrl_info->pci_dev, 0); #else #error "A version for KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES has not been defined." #endif + } else { +#if KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 + return blk_mq_map_queues(&shost->tag_set); +#elif KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 + return blk_mq_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT]); +#else + #error "A version for KFEATURE_HAS_BLK_MQ_MAP_QUEUES has not been defined." +#endif + } } #endif /* KFEATURE_ENABLE_SCSI_MAP_QUEUES */ @@ -150,9 +160,6 @@ void pqi_compat_init_scsi_host_template(struct scsi_host_template *hostt) #if KFEATURE_ENABLE_SCSI_MAP_QUEUES hostt->map_queues = pqi_map_queues; #endif -#if KFEATURE_HAS_NCQ_PRIO_SUPPORT - hostt->sdev_attrs = &pqi_ncq_prio_sdev_attrs; -#endif } void pqi_compat_init_scsi_host(struct Scsi_Host *shost, @@ -389,3 +396,62 @@ int pqi_pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, return num_vectors_enabled; #endif } + +#if KFEATURE_HAS_SCSI_CMD_PRIV +struct pqi_cmd_priv *pqi_cmd_priv(struct scsi_cmnd *cmd) +{ + return scsi_cmd_priv(cmd); +} +#endif + +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT + +struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + struct pqi_io_request *io_request; + u16 i = smp_processor_id() * ctrl_info->per_cpu_factor; + + while (1) { + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_inc_return(&io_request->refcount) == 1) + break; + atomic_dec(&io_request->refcount); + i = (i + 1) % ctrl_info->max_io_slots; + } + + return io_request; +} + +#else + +struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd) +{ + struct pqi_io_request *io_request; + u16 i; + + if (scmd) { + u32 blk_tag = blk_mq_unique_tag(PQI_SCSI_REQUEST(scmd)); + + i = blk_mq_unique_tag_to_tag(blk_tag); + io_request = &ctrl_info->io_request_pool[i]; + if (atomic_inc_return(&io_request->refcount) > 1) { + atomic_dec(&io_request->refcount); + return NULL; + } + } else { + /* + * benignly racy - may have to wait for an open slot. 
+ */ + i = 0; + while (1) { + io_request = &ctrl_info->io_request_pool[ctrl_info->scsi_ml_can_queue + i]; + if (atomic_inc_return(&io_request->refcount) == 1) + break; + atomic_dec(&io_request->refcount); + i = (i + 1) % PQI_RESERVED_IO_SLOTS; + } + } + + return io_request; +} +#endif diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h index 4ae705d86ba74..1533444644b80 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h @@ -17,6 +17,9 @@ * */ +/* needed for struct definitions */ +#include + #if !defined(_SMARTPQI_KERNEL_COMPAT_H) #define _SMARTPQI_KERNEL_COMPAT_H @@ -70,6 +73,13 @@ #define RHEL8 #endif +/* ----- RHEL9 variants --------- */ +#if \ + defined(RHEL9U0) || \ + defined(RHEL9U1) +#define RHEL9 +#endif + /* ----- SLES11 variants --------- */ #if \ defined(SLES11SP0) || \ @@ -96,11 +106,25 @@ defined(SLES15SP0) || \ defined(SLES15SP1) || \ defined(SLES15SP2) || \ - defined(SLES15SP3) || \ - defined(SLES15SP4) + defined(SLES15SP3) #define SLES15 #endif +/* ----- KCLASS5 variants --------- */ +#if \ + defined(KCLASS5A) || \ + defined(KCLASS5B) || \ + defined(KCLASS5C) || \ + defined(KCLASS5D) +#define KCLASS5 +#endif + +/* ----- KCLASS6 variants --------- */ +#if \ + defined(KCLASS6A) +#define KCLASS6 +#endif + #include #include #include @@ -157,12 +181,14 @@ #endif #if defined(RHEL7U4ARM) || defined(RHEL7U5ARM) #endif -#elif defined(RHEL8) +#elif defined(RHEL8) || defined(RHEL9) || defined(KCLASS5) || \ + defined(KCLASS6) || defined(OEULER2203) #define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 #define KFEATURE_HAS_MQ_SUPPORT 1 #define shost_use_blk_mq(x) 1 #define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 #elif defined(SLES11) #define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 #define KFEATURE_HAS_NO_WRITE_SAME 0 @@ -193,11 +219,28 @@ #define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 #if defined(SLES15SP0) #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 #elif defined(SLES15SP1) #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 #else #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 #endif +#elif defined(OEULER2003) +#define dma_zalloc_coherent dma_alloc_coherent +#define KFEATURE_HAS_KTIME_SECONDS 1 +#define KFEATURE_HAS_SCSI_REQUEST 1 +#define KFEATURE_HAS_KTIME64 1 +#define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 +#define KFEATURE_HAS_USE_CLUSTERING 0 +#define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define shost_use_blk_mq(x) 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 #elif defined(UBUNTU1404) || TORTUGA || defined(KCLASS3C) #define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 #define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 @@ -212,46 +255,57 @@ #if defined(KCLASS4B) || defined(KCLASS4C) || defined(SLES12SP4) || \ defined(SLES12SP5) || defined(RHEL8) || defined(KCLASS5A) || \ defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ - defined(SLES15SP2) || defined(SLES15SP3) || defined (CENTOS7ALTARM) + defined(SLES15SP2) || defined(SLES15SP3) || defined(SLES15SP4) || \ + defined(RHEL9) || defined(CENTOS7ALTARM) || defined(OEULER2203) || \ + defined(KCLASS6) #define 
KFEATURE_HAS_KTIME_SECONDS 1 #define KFEATURE_HAS_SCSI_REQUEST 1 #define KFEATURE_HAS_KTIME64 1 #endif #if defined(KCLASS4C) || defined(RHEL8) || defined(SLES15SP1) || \ - defined(SLES15SP2) || defined(SLES15SP3) || defined(KCLASS5A) || \ - defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ - defined(SLES12SP5) || defined (CENTOS7ALTARM) + defined(SLES15SP2) || defined(SLES15SP3) || defined(SLES15SP4) || \ + defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ + defined(KCLASS5D) || defined(SLES12SP5) || defined(CENTOS7ALTARM) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) #define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 #endif -#if defined(RHEL8U3) || defined(RHEL8U4) || defined(RHEL8U5) +#if defined(RHEL8U3) || defined(RHEL8U4) || defined(RHEL8U5) || \ + defined(RHEL8U6) || defined(RHEL8U7) #define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 #endif #if defined(KCLASS3D) #define KFEATURE_HAS_KTIME_SECONDS 1 #endif -#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ - defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) +#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ + defined(KCLASS5D) || defined(KCLASS4D) || defined(SLES15SP2) || \ + defined(SLES15SP3) || defined(SLES15SP4) || defined(RHEL9) || \ + defined(OEULER2203) || defined(KCLASS6) #define dma_zalloc_coherent dma_alloc_coherent #define shost_use_blk_mq(x) 1 #define KFEATURE_HAS_USE_CLUSTERING 0 #endif #if defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ - defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) + defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) || \ + defined(SLES15SP4) || defined(RHEL9) || defined(OEULER2003) || \ + defined(OEULER2203) || defined(KCLASS6) #define IOCTL_INT unsigned int #else #define IOCTL_INT int #endif -#if defined(KCLASS5C) || defined(KCLASS5D) +#if defined(KCLASS5C) || defined(KCLASS5D) || defined(SLES15SP4) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) #define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define ioremap_nocache ioremap #endif -#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ - defined(KCLASS4C) || defined(KCLASS4D) || defined(RHEL8) || defined(SLES15) +#if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ + defined(KCLASS5D) || defined(KCLASS4C) || defined(KCLASS4D) || \ + defined(RHEL8) || defined(SLES15) || defined(SLES15SP4) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) #define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 #endif @@ -335,9 +389,69 @@ #if !defined(KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3) #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 0 #endif +#if !defined(KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1) +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 0 +#endif +#if !defined(KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2) +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 0 +#endif #if !defined(KFEATURE_HAS_NCQ_PRIO_SUPPORT) #define KFEATURE_HAS_NCQ_PRIO_SUPPORT 0 #endif +#if !defined(KFEATURE_HAS_GLOBAL_SCSI_DONE) +#define KFEATURE_HAS_GLOBAL_SCSI_DONE 0 +#endif +#if !defined(KFEATURE_HAS_HOST_TAGSET_SUPPORT) +#define KFEATURE_HAS_HOST_TAGSET_SUPPORT 0 +#endif +/* Check for change in host device attributes are defined */ +#if !defined(KFEATURE_HAS_SDEV_GROUPS) +#define KFEATURE_HAS_SDEV_GROUPS 0 +# define PQI_DEVICE_ATTRIBUTE device_attribute +# define PQI_ATTR +# define PQI_ATTRIBUTE_GROUPS(x) +# define PQI_ATTRIBUTE(x) (x) +# define PQI_SDEV_ATTRS \ + 
.sdev_attrs = pqi_sdev_attrs +# define PQI_SHOST_ATTRS \ + .shost_attrs = pqi_shost_attrs +/* Newer device attribute groups defined */ +#else +# define PQI_DEVICE_ATTRIBUTE attribute +# define PQI_ATTRIBUTE_GROUPS(x) \ + ATTRIBUTE_GROUPS(x); +# define PQI_ATTRIBUTE(x) (x.attr) +# define PQI_SDEV_ATTRS \ + .sdev_groups = pqi_sdev_groups +# define PQI_SHOST_ATTRS \ + .shost_groups = pqi_shost_groups +#endif + +#if !defined(KFEATURE_HAS_SCSI_CMD_TO_RQ) +#define KFEATURE_HAS_SCSI_CMD_TO_RQ 0 +# define PQI_SCSI_REQUEST(x) \ + x->request +#else +# define PQI_SCSI_REQUEST(x) \ + scsi_cmd_to_rq(x) +#endif + +#if !defined(KFEATURE_HAS_SCSI_CMD_PRIV) +#define KFEATURE_HAS_SCSI_CMD_PRIV 0 +# define PQI_CMD_PRIV +# define PQI_SCSI_CMD_RESIDUAL(scmd) \ + (scmd->SCp.this_residual) +#else +# define PQI_CMD_PRIV \ + .cmd_size = sizeof(struct pqi_cmd_priv), + struct pqi_cmd_priv { + int this_residual; + }; + struct pqi_cmd_priv *pqi_cmd_priv(struct scsi_cmnd *cmd); +# define PQI_SCSI_CMD_RESIDUAL(scmd) \ + pqi_cmd_priv(scmd)->this_residual +#endif + #if !defined(list_next_entry) #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) @@ -434,10 +548,22 @@ static inline void pqi_disable_write_same(struct scsi_device *sdev) #define PCI_VENDOR_ID_NTCOM 0x1dfc #endif +#if !defined(PCI_VENDOR_ID_NT) +#define PCI_VENDOR_ID_NT 0x1f0c +#endif + #if !defined(PCI_VENDOR_ID_ZTE) #define PCI_VENDOR_ID_ZTE 0x1cf2 #endif +#if !defined(PCI_VENDOR_ID_RAMAXEL) +#define PCI_VENDOR_ID_RAMAXEL 0x1cc4 +#endif + +#if !defined(PCI_VENDOR_ID_LENOVO) +#define PCI_VENDOR_ID_LENOVO 0x1d49 +#endif + #if !defined(offsetofend) #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) @@ -470,8 +596,13 @@ static inline unsigned long wait_for_completion_io(struct completion *x) static inline void pqi_scsi_done(struct scsi_cmnd *scmd) { pqi_prep_for_scsi_done(scmd); +#if !KFEATURE_HAS_GLOBAL_SCSI_DONE if (scmd && scmd->scsi_done) scmd->scsi_done(scmd); +#else + if (scmd) + scsi_done(scmd); +#endif } #else @@ -541,14 +672,12 @@ static inline u16 pqi_get_hw_queue(struct pqi_ctrl_info *ctrl_info, #if KFEATURE_HAS_MQ_SUPPORT if (shost_use_blk_mq(scmd->device->host)) - hw_queue = blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(scmd->request)); + hw_queue = blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(PQI_SCSI_REQUEST(scmd))); else - hw_queue = smp_processor_id(); + hw_queue = smp_processor_id() % ctrl_info->num_queue_groups; #else - hw_queue = smp_processor_id(); + hw_queue = smp_processor_id() % ctrl_info->num_queue_groups; #endif - if (hw_queue > ctrl_info->max_hw_queue_index) - hw_queue = 0; return hw_queue; } @@ -661,6 +790,7 @@ int pqi_pci_irq_vector(struct pci_dev *dev, unsigned int nr); int pqi_pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags); void pqi_pci_free_irq_vectors(struct pci_dev *dev); +struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struct scsi_cmnd *scmd); static inline void *pqi_get_irq_cookie(struct pqi_ctrl_info *ctrl_info, unsigned int nr) { @@ -671,4 +801,11 @@ static inline void *pqi_get_irq_cookie(struct pqi_ctrl_info *ctrl_info, unsigned #endif } +#if !KFEATURE_HAS_HOST_TAGSET_SUPPORT +#define PQI_SET_HOST_TAGSET(s) +#else +#define PQI_SET_HOST_TAGSET(s) \ + s->host_tagset = 1; +#endif + #endif /* _SMARTPQI_KERNEL_COMPAT_H */ diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sis.c b/drivers/amazon/scsi/smartpqi/smartpqi_sis.c index 5a6369668c382..5381bfb39090e 100644 
--- a/drivers/amazon/scsi/smartpqi/smartpqi_sis.c +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sis.c @@ -44,7 +44,7 @@ #define SIS_CLEAR_CTRL_TO_HOST_DOORBELL 0x1000 #define SIS_CMD_STATUS_SUCCESS 0x1 -#define SIS_CMD_COMPLETE_TIMEOUT_SECS (30 * HZ) +#define SIS_CMD_COMPLETE_TIMEOUT_SECS 30 #define SIS_CMD_COMPLETE_POLL_INTERVAL_MSECS 10 /* used with SIS_CMD_GET_ADAPTER_PROPERTIES command */ @@ -63,11 +63,11 @@ #define SIS_CTRL_KERNEL_UP 0x80 #define SIS_CTRL_KERNEL_PANIC 0x100 #if TORTUGA -#define SIS_CTRL_READY_TIMEOUT_SECS (150 * HZ) +#define SIS_CTRL_READY_TIMEOUT_SECS 150 #else -#define SIS_CTRL_READY_TIMEOUT_SECS (180 * HZ) +#define SIS_CTRL_READY_TIMEOUT_SECS 180 #endif -#define SIS_CTRL_READY_RESUME_TIMEOUT_SECS (90 * HZ) +#define SIS_CTRL_READY_RESUME_TIMEOUT_SECS 90 #define SIS_CTRL_READY_POLL_INTERVAL_MSECS 10 enum sis_fw_triage_status { @@ -98,13 +98,15 @@ struct sis_base_struct { #pragma pack() +unsigned int sis_ctrl_ready_timeout_secs = SIS_CTRL_READY_TIMEOUT_SECS; + static int sis_wait_for_ctrl_ready_with_timeout(struct pqi_ctrl_info *ctrl_info, unsigned int timeout_secs) { unsigned long timeout; u32 status; - timeout = timeout_secs + jiffies; + timeout = (timeout_secs * HZ) + jiffies; while (1) { status = readl(&ctrl_info->registers->sis_firmware_status); @@ -134,7 +136,7 @@ static int sis_wait_for_ctrl_ready_with_timeout(struct pqi_ctrl_info *ctrl_info, int sis_wait_for_ctrl_ready(struct pqi_ctrl_info *ctrl_info) { return sis_wait_for_ctrl_ready_with_timeout(ctrl_info, - SIS_CTRL_READY_TIMEOUT_SECS); + sis_ctrl_ready_timeout_secs); } int sis_wait_for_ctrl_ready_resume(struct pqi_ctrl_info *ctrl_info) @@ -150,7 +152,7 @@ bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info) status = readl(&ctrl_info->registers->sis_firmware_status); - if (status & SIS_CTRL_KERNEL_PANIC) + if (status != ~0 && (status & SIS_CTRL_KERNEL_PANIC)) running = false; else running = true; @@ -206,6 +208,7 @@ static int sis_send_sync_cmd(struct pqi_ctrl_info *ctrl_info, /* Disable doorbell interrupts by masking all interrupts. */ writel(~0, ®isters->sis_interrupt_mask); + usleep_range(1000, 2000); /* * Force the completion of the interrupt mask register write before @@ -221,7 +224,7 @@ static int sis_send_sync_cmd(struct pqi_ctrl_info *ctrl_info, * the top of the loop in order to give the controller time to start * processing the command before we start polling. 
*/ - timeout = SIS_CMD_COMPLETE_TIMEOUT_SECS + jiffies; + timeout = (SIS_CMD_COMPLETE_TIMEOUT_SECS * HZ) + jiffies; while (1) { msleep(SIS_CMD_COMPLETE_POLL_INTERVAL_MSECS); doorbell = readl(®isters->sis_ctrl_to_host_doorbell); @@ -358,7 +361,7 @@ int sis_init_base_struct_addr(struct pqi_ctrl_info *ctrl_info) return rc; } -#define SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS (30 * HZ) +#define SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS 30 static int sis_wait_for_doorbell_bit_to_clear( struct pqi_ctrl_info *ctrl_info, u32 bit) @@ -367,7 +370,7 @@ static int sis_wait_for_doorbell_bit_to_clear( u32 doorbell_register; unsigned long timeout; - timeout = SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS + jiffies; + timeout = (SIS_DOORBELL_BIT_CLEAR_TIMEOUT_SECS * HZ) + jiffies; while (1) { doorbell_register = @@ -395,6 +398,7 @@ static int sis_wait_for_doorbell_bit_to_clear( static inline int sis_set_doorbell_bit(struct pqi_ctrl_info *ctrl_info, u32 bit) { writel(bit, &ctrl_info->registers->sis_host_to_ctrl_doorbell); + usleep_range(1000, 2000); return sis_wait_for_doorbell_bit_to_clear(ctrl_info, bit); } @@ -435,6 +439,7 @@ int sis_reenable_sis_mode(struct pqi_ctrl_info *ctrl_info) void sis_write_driver_scratch(struct pqi_ctrl_info *ctrl_info, u32 value) { writel(value, &ctrl_info->registers->sis_driver_scratch); + usleep_range(1000, 2000); } u32 sis_read_driver_scratch(struct pqi_ctrl_info *ctrl_info) @@ -455,7 +460,7 @@ void sis_soft_reset(struct pqi_ctrl_info *ctrl_info) &ctrl_info->registers->sis_host_to_ctrl_doorbell); } -#define SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS (300 * HZ) +#define SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS 300 #define SIS_FW_TRIAGE_STATUS_POLL_INTERVAL_SECS 1 int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info) @@ -464,7 +469,7 @@ int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info) enum sis_fw_triage_status status; unsigned long timeout; - timeout = SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS + jiffies; + timeout = (SIS_FW_TRIAGE_STATUS_TIMEOUT_SECS * HZ) + jiffies; while (1) { status = sis_read_firmware_triage_status(ctrl_info); if (status == FW_TRIAGE_COND_INVALID) { @@ -492,7 +497,7 @@ int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info) } -static void __attribute__((unused)) verify_structures(void) +void sis_verify_structures(void) { BUILD_BUG_ON(offsetof(struct sis_base_struct, revision) != 0x0); diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sis.h b/drivers/amazon/scsi/smartpqi/smartpqi_sis.h index 5a265d52e3585..ad570d4cc16d6 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_sis.h +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sis.h @@ -20,6 +20,7 @@ #if !defined(_SMARTPQI_SIS_H) #define _SMARTPQI_SIS_H +void sis_verify_structures(void); int sis_wait_for_ctrl_ready(struct pqi_ctrl_info *ctrl_info); int sis_wait_for_ctrl_ready_resume(struct pqi_ctrl_info *ctrl_info); bool sis_is_firmware_running(struct pqi_ctrl_info *ctrl_info); @@ -39,4 +40,6 @@ void sis_soft_reset(struct pqi_ctrl_info *ctrl_info); u32 sis_get_product_id(struct pqi_ctrl_info *ctrl_info); int sis_wait_for_fw_triage_completion(struct pqi_ctrl_info *ctrl_info); +extern unsigned int sis_ctrl_ready_timeout_secs; + #endif /* _SMARTPQI_SIS_H */ From e31a829a5f377455ca69cd85167dedbdfa81a960 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 17 Jan 2021 22:28:47 +0100 Subject: [PATCH 635/737] perf: Constify static struct attribute_group The only usage is to put their addresses in an array of pointers to const struct attribute group. 
Make them const to allow the compiler to put them in read-only memory. Conflicts: * arm_dmc620_pmu.c: doesn't exist in 5.10 * arm_smmuv3_pmu.c: code differed, applied change manually Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210117212847.21319-5-rikard.falkeborn@gmail.com Signed-off-by: Will Deacon (cherry picked from commit f0c140481d1b807217cacdcf11d24cfa407a7a53) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cci.c | 2 +- drivers/perf/arm-cmn.c | 2 +- drivers/perf/arm_pmu.c | 2 +- drivers/perf/arm_smmuv3_pmu.c | 6 +++--- drivers/perf/arm_spe_pmu.c | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 87c4be9dd4125..a75cf77c4de4c 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1376,7 +1376,7 @@ static struct attribute *pmu_attrs[] = { NULL, }; -static struct attribute_group pmu_attr_group = { +static const struct attribute_group pmu_attr_group = { .attrs = pmu_attrs, }; diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 36061aaf026c8..c8c784daef4ee 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -616,7 +616,7 @@ static struct attribute *arm_cmn_cpumask_attrs[] = { NULL, }; -static struct attribute_group arm_cmn_cpumask_attr_group = { +static const struct attribute_group arm_cmn_cpumask_attr_group = { .attrs = arm_cmn_cpumask_attrs, }; diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 7fd11ef5cb8a2..952264f4fd796 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -575,7 +575,7 @@ static struct attribute *armpmu_common_attrs[] = { NULL, }; -static struct attribute_group armpmu_common_attr_group = { +static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index f5a33dbe7acb9..da3d1d807c179 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -493,7 +493,7 @@ static struct attribute *smmu_pmu_cpumask_attrs[] = { NULL }; -static struct attribute_group smmu_pmu_cpumask_group = { +static const struct attribute_group smmu_pmu_cpumask_group = { .attrs = smmu_pmu_cpumask_attrs, }; @@ -548,7 +548,7 @@ static umode_t smmu_pmu_event_is_visible(struct kobject *kobj, return 0; } -static struct attribute_group smmu_pmu_events_group = { +static const struct attribute_group smmu_pmu_events_group = { .name = "events", .attrs = smmu_pmu_events, .is_visible = smmu_pmu_event_is_visible, @@ -568,7 +568,7 @@ static struct attribute *smmu_pmu_formats[] = { NULL }; -static struct attribute_group smmu_pmu_format_group = { +static const struct attribute_group smmu_pmu_format_group = { .name = "format", .attrs = smmu_pmu_formats, }; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 6fbfcab4918cf..03cb78764e7c7 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -164,7 +164,7 @@ static struct attribute *arm_spe_pmu_cap_attr[] = { NULL, }; -static struct attribute_group arm_spe_pmu_cap_group = { +static const struct attribute_group arm_spe_pmu_cap_group = { .name = "caps", .attrs = arm_spe_pmu_cap_attr, }; @@ -245,7 +245,7 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { NULL, }; -static struct attribute_group arm_spe_pmu_format_group = { +static const struct attribute_group arm_spe_pmu_format_group = { .name = "format", .attrs = arm_spe_pmu_formats_attr, }; @@ -265,7 +265,7 @@ static struct attribute *arm_spe_pmu_attrs[] = { 
NULL, }; -static struct attribute_group arm_spe_pmu_group = { +static const struct attribute_group arm_spe_pmu_group = { .attrs = arm_spe_pmu_attrs, }; From 002b7ace596678c014834d3c3ba002d4eb4ee9ed Mon Sep 17 00:00:00 2001 From: Zihao Tang Date: Fri, 19 Mar 2021 18:04:31 +0800 Subject: [PATCH 636/737] drivers/perf: convert sysfs snprintf family to sysfs_emit Fix the following coccicheck warning: ./drivers/perf/hisilicon/hisi_uncore_pmu.c:128:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/fsl_imx8_ddr_perf.c:173:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm_spe_pmu.c:129:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm_smmu_pmu.c:563:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm_dsu_pmu.c:149:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm_dsu_pmu.c:139:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-cmn.c:563:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-cmn.c:351:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-ccn.c:224:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-cci.c:708:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-cci.c:699:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-cci.c:528:8-16: WARNING: use scnprintf or sprintf. ./drivers/perf/arm-cci.c:309:8-16: WARNING: use scnprintf or sprintf. Conflicts: * drivers/perf/arm_smmuv3_pmu.c: doesn't have affected code * drivers/perf/hisilicon/hisi_uncore_pmu.c: doesn't have affected code Signed-off-by: Zihao Tang Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1616148273-16374-2-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon (cherry picked from commit 700a9cf0527ca2d7d3e4980fef2deb4883432ab6) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cci.c | 12 ++++++------ drivers/perf/arm-ccn.c | 4 ++-- drivers/perf/arm-cmn.c | 22 +++++++++++----------- drivers/perf/arm_dsu_pmu.c | 5 ++--- drivers/perf/arm_spe_pmu.c | 3 +-- drivers/perf/fsl_imx8_ddr_perf.c | 3 +-- 6 files changed, 23 insertions(+), 26 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index a75cf77c4de4c..8468f0e8b704e 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -306,7 +306,7 @@ static ssize_t cci400_pmu_cycle_event_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "config=0x%lx\n", (unsigned long)eattr->var); + return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var); } static int cci400_get_event_idx(struct cci_pmu *cci_pmu, @@ -525,8 +525,8 @@ static ssize_t cci5xx_pmu_global_event_show(struct device *dev, struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); /* Global events have single fixed source code */ - return snprintf(buf, PAGE_SIZE, "event=0x%lx,source=0x%x\n", - (unsigned long)eattr->var, CCI5xx_PORT_GLOBAL); + return sysfs_emit(buf, "event=0x%lx,source=0x%x\n", + (unsigned long)eattr->var, CCI5xx_PORT_GLOBAL); } /* @@ -696,7 +696,7 @@ static ssize_t cci_pmu_format_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "%s\n", (char *)eattr->var); + return sysfs_emit(buf, "%s\n", (char *)eattr->var); } static ssize_t cci_pmu_event_show(struct device *dev, @@ -705,8 +705,8 @@ static ssize_t cci_pmu_event_show(struct device *dev, struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); /* source parameter is 
mandatory for normal PMU events */ - return snprintf(buf, PAGE_SIZE, "source=?,event=0x%lx\n", - (unsigned long)eattr->var); + return sysfs_emit(buf, "source=?,event=0x%lx\n", + (unsigned long)eattr->var); } static int pmu_is_valid_counter(struct cci_pmu *cci_pmu, int idx) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index a0a71c1df042a..3a2ddc0cc6c39 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -221,7 +221,7 @@ static ssize_t arm_ccn_pmu_format_show(struct device *dev, struct dev_ext_attribute *ea = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "%s\n", (char *)ea->var); + return sysfs_emit(buf, "%s\n", (char *)ea->var); } #define CCN_FORMAT_ATTR(_name, _config) \ @@ -476,7 +476,7 @@ static ssize_t arm_ccn_pmu_cmp_mask_show(struct device *dev, struct arm_ccn *ccn = pmu_to_arm_ccn(dev_get_drvdata(dev)); u64 *mask = arm_ccn_pmu_get_cmp_mask(ccn, attr->attr.name); - return mask ? snprintf(buf, PAGE_SIZE, "0x%016llx\n", *mask) : -EINVAL; + return mask ? sysfs_emit(buf, "0x%016llx\n", *mask) : -EINVAL; } static ssize_t arm_ccn_pmu_cmp_mask_store(struct device *dev, diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index c8c784daef4ee..86ac9963c0d29 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -348,19 +348,19 @@ static ssize_t arm_cmn_event_show(struct device *dev, eattr = container_of(attr, typeof(*eattr), attr); if (eattr->type == CMN_TYPE_DTC) - return snprintf(buf, PAGE_SIZE, "type=0x%x\n", eattr->type); + return sysfs_emit(buf, "type=0x%x\n", eattr->type); if (eattr->type == CMN_TYPE_WP) - return snprintf(buf, PAGE_SIZE, - "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", - eattr->type, eattr->eventid); + return sysfs_emit(buf, + "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", + eattr->type, eattr->eventid); if (arm_cmn_is_occup_event(eattr->type, eattr->eventid)) - return snprintf(buf, PAGE_SIZE, "type=0x%x,eventid=0x%x,occupid=0x%x\n", - eattr->type, eattr->eventid, eattr->occupid); + return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n", + eattr->type, eattr->eventid, eattr->occupid); - return snprintf(buf, PAGE_SIZE, "type=0x%x,eventid=0x%x\n", - eattr->type, eattr->eventid); + return sysfs_emit(buf, "type=0x%x,eventid=0x%x\n", eattr->type, + eattr->eventid); } static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, @@ -560,12 +560,12 @@ static ssize_t arm_cmn_format_show(struct device *dev, int lo = __ffs(fmt->field), hi = __fls(fmt->field); if (lo == hi) - return snprintf(buf, PAGE_SIZE, "config:%d\n", lo); + return sysfs_emit(buf, "config:%d\n", lo); if (!fmt->config) - return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi); + return sysfs_emit(buf, "config:%d-%d\n", lo, hi); - return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo, hi); + return sysfs_emit(buf, "config%d:%d-%d\n", fmt->config, lo, hi); } #define _CMN_FORMAT_ATTR(_name, _cfg, _fld) \ diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 1db8eccc9735c..83bc031d79b79 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -136,8 +136,7 @@ static ssize_t dsu_pmu_sysfs_event_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "event=0x%lx\n", - (unsigned long)eattr->var); + return sysfs_emit(buf, "event=0x%lx\n", (unsigned long)eattr->var); } static ssize_t 
dsu_pmu_sysfs_format_show(struct device *dev, @@ -146,7 +145,7 @@ static ssize_t dsu_pmu_sysfs_format_show(struct device *dev, { struct dev_ext_attribute *eattr = container_of(attr, struct dev_ext_attribute, attr); - return snprintf(buf, PAGE_SIZE, "%s\n", (char *)eattr->var); + return sysfs_emit(buf, "%s\n", (char *)eattr->var); } static ssize_t dsu_pmu_cpumask_show(struct device *dev, diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 03cb78764e7c7..a9d001b2447f4 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -144,8 +144,7 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%u\n", - arm_spe_pmu_cap_get(spe_pmu, cap)); + return sysfs_emit(buf, "%u\n", arm_spe_pmu_cap_get(spe_pmu, cap)); } #define SPE_EXT_ATTR_ENTRY(_name, _func, _var) \ diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index e09bbf3890c49..b86f6be0f1cbe 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -116,8 +116,7 @@ static ssize_t ddr_perf_filter_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%u\n", - ddr_perf_filter_cap_get(pmu, cap)); + return sysfs_emit(buf, "%u\n", ddr_perf_filter_cap_get(pmu, cap)); } #define PERF_EXT_ATTR_ENTRY(_name, _func, _var) \ From 5c8f6acf57530eb446affebea9830706ba452da7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 May 2021 11:17:28 +0200 Subject: [PATCH 637/737] perf/arm-cmn: Use irq_set_affinity() The driver uses irq_set_affinity_hint() to set the affinity for the PMU interrupts, which relies on the undocumented side effect that this function actually sets the affinity under the hood. Setting an hint is clearly not a guarantee and for these PMU interrupts an affinity hint, which is supposed to guide userspace for setting affinity, is beyond pointless, because the affinity of these interrupts cannot be modified from user space. Aside of that the error checks are bogus because the only error which is returned from irq_set_affinity_hint() is when there is no irq descriptor for the interrupt number, but not when the affinity set fails. That's on purpose because the hint can point to an offline CPU. Replace the mindless abuse with irq_set_affinity(). 
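For illustration, the pattern being moved to boils down to programming the affinity directly and checking the real result; a minimal sketch (the helper name is a placeholder, not the driver's own code):

	#include <linux/interrupt.h>
	#include <linux/cpumask.h>

	/*
	 * Bind a PMU interrupt to the CPU that owns the PMU context.
	 * irq_set_affinity() applies the mask and reports genuine failures,
	 * whereas irq_set_affinity_hint() only published a suggestion for
	 * userspace and set the affinity as an undocumented side effect.
	 */
	static int pmu_bind_irq_to_cpu(unsigned int irq, unsigned int cpu)
	{
		return irq_set_affinity(irq, cpumask_of(cpu));
	}
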
Signed-off-by: Thomas Gleixner Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210518093118.277228577@linutronix.de Signed-off-by: Will Deacon (cherry picked from commit 8ec25d34012da3bf417a4d16c057a54064626058) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 86ac9963c0d29..b74620207cd66 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1162,7 +1162,7 @@ static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) perf_pmu_migrate_context(&cmn->pmu, cpu, target); for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity_hint(cmn->dtc[i].irq, cpumask_of(target)); + irq_set_affinity(cmn->dtc[i].irq, cpumask_of(target)); cmn->cpu = target; return 0; } @@ -1222,7 +1222,7 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) if (err) return err; - err = irq_set_affinity_hint(irq, cpumask_of(cmn->cpu)); + err = irq_set_affinity(irq, cpumask_of(cmn->cpu)); if (err) return err; next: @@ -1569,16 +1569,11 @@ static int arm_cmn_probe(struct platform_device *pdev) static int arm_cmn_remove(struct platform_device *pdev) { struct arm_cmn *cmn = platform_get_drvdata(pdev); - int i; writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL); perf_pmu_unregister(&cmn->pmu); cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); - - for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity_hint(cmn->dtc[i].irq, NULL); - return 0; } From dc3b43cec642922d7fd7ab1e7eb5c301d41b70aa Mon Sep 17 00:00:00 2001 From: Junhao He Date: Tue, 11 May 2021 20:27:33 +0800 Subject: [PATCH 638/737] drivers/perf: arm-cmn: Add space after ',' Fix a warning from checkpatch.pl. ERROR: space required after that ',' (ctx:VxV) Signed-off-by: Junhao He Signed-off-by: Jay Fang Link: https://lore.kernel.org/r/1620736054-58412-4-git-send-email-f.fangjian@huawei.com Signed-off-by: Will Deacon (cherry picked from commit a9f00c9760febb84215bcb489855b5b23e3ab4dc) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index b74620207cd66..e5dc74c3d8af0 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -31,7 +31,7 @@ #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) -#define CMN_CHILD_NODE_ADDR GENMASK(27,0) +#define CMN_CHILD_NODE_ADDR GENMASK(27, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) #define CMN_ADDR_NODE_PTR GENMASK(27, 14) From 729f9963450cbade1ed0672868cecc3d4efb5155 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:50 +0000 Subject: [PATCH 639/737] perf/arm-cmn: Fix CPU hotplug unregistration Attempting to migrate the PMU context after we've unregistered the PMU device, or especially if we never successfully registered it in the first place, is a woefully bad idea. It's also fundamentally pointless anyway. Make sure to unregister an instance from the hotplug handler *without* invoking the teardown callback. 
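Roughly, the distinction being relied on is the following (sketch only; the actual state and node handling is as in the diff below):

	#include <linux/cpuhotplug.h>

	/*
	 * cpuhp_state_remove_instance() invokes the state's teardown callback
	 * on every online CPU before unhooking the instance; the _nocalls()
	 * variant only unhooks it.  Once the PMU is unregistered (or never
	 * made it to registration), a teardown that migrates perf context
	 * would operate on a dead PMU, so the _nocalls() form is wanted here.
	 */
	static void drop_hp_instance(enum cpuhp_state state, struct hlist_node *node)
	{
		cpuhp_state_remove_instance_nocalls(state, node);
	}
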
Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/2c221d745544774e4b07583b65b5d4d94f7e0fe4.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 56c7c6eaf3eb8ac1ec40d56096c0f2b27250da5f) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index e5dc74c3d8af0..40945343c4cc1 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1562,7 +1562,8 @@ static int arm_cmn_probe(struct platform_device *pdev) err = perf_pmu_register(&cmn->pmu, name, -1); if (err) - cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + return err; } @@ -1573,7 +1574,7 @@ static int arm_cmn_remove(struct platform_device *pdev) writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL); perf_pmu_unregister(&cmn->pmu); - cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); return 0; } From cfebfc413207d76e24a980309a1753be96914207 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:51 +0000 Subject: [PATCH 640/737] perf/arm-cmn: Account for NUMA affinity On a system with multiple CMN meshes, ideally we'd want to access each PMU from within its own mesh, rather than with a long CML round-trip, wherever feasible. Since such a system is likely to be presented as multiple NUMA nodes, let's also hope a proximity domain is specified for each CMN programming interface, and use that to guide our choice of IRQ affinity to favour a node-local CPU where possible. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/32438b0d016e0649d882d47d30ac2000484287b9.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 6190741c294d1cad15198d5d2f912868434fa492) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 51 +++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 40945343c4cc1..ec4ea79fc5e34 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1147,23 +1147,47 @@ static int arm_cmn_commit_txn(struct pmu *pmu) return 0; } -static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +static void arm_cmn_migrate(struct arm_cmn *cmn, unsigned int cpu) +{ + unsigned int i; + + perf_pmu_migrate_context(&cmn->pmu, cmn->cpu, cpu); + for (i = 0; i < cmn->num_dtcs; i++) + irq_set_affinity(cmn->dtc[i].irq, cpumask_of(cpu)); + cmn->cpu = cpu; +} + +static int arm_cmn_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) { struct arm_cmn *cmn; - unsigned int i, target; + int node; - cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node); - if (cpu != cmn->cpu) - return 0; + cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); + node = dev_to_node(cmn->dev); + if (node != NUMA_NO_NODE && cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node) + arm_cmn_migrate(cmn, cpu); + return 0; +} + +static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct arm_cmn *cmn; + unsigned int target; + int node; + cpumask_t mask; - target = cpumask_any_but(cpu_online_mask, cpu); - if (target >= nr_cpu_ids) + cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); + if (cpu != cmn->cpu) 
return 0; - perf_pmu_migrate_context(&cmn->pmu, cpu, target); - for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity(cmn->dtc[i].irq, cpumask_of(target)); - cmn->cpu = target; + node = dev_to_node(cmn->dev); + if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) && + cpumask_andnot(&mask, &mask, cpumask_of(cpu))) + target = cpumask_any(&mask); + else + target = cpumask_any_but(cpu_online_mask, cpu); + if (target < nr_cpu_ids) + arm_cmn_migrate(cmn, target); return 0; } @@ -1533,7 +1557,7 @@ static int arm_cmn_probe(struct platform_device *pdev) if (err) return err; - cmn->cpu = raw_smp_processor_id(); + cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); cmn->pmu = (struct pmu) { .module = THIS_MODULE, .attr_groups = arm_cmn_attr_groups, @@ -1609,7 +1633,8 @@ static int __init arm_cmn_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "perf/arm/cmn:online", NULL, + "perf/arm/cmn:online", + arm_cmn_pmu_online_cpu, arm_cmn_pmu_offline_cpu); if (ret < 0) return ret; From 0aa3f5950b24a75c79b3678d6f304809fc0ee67e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:52 +0000 Subject: [PATCH 641/737] perf/arm-cmn: Drop compile-test restriction Although CMN is currently (and overwhelmingly likely to remain) deployed in arm64-only (modulo userspace) systems, the 64-bit "dependency" for compile-testing was just laziness due to heavy reliance on readq/writeq accessors. Since we only need one extra include for robustness in that regard, let's pull that in, widen the compile-test coverage, and fix up the smattering of type laziness that that brings to light. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/baee9ee0d0bdad8aaeb70f5a4b98d8fd4b1f5786.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 82d8ea4b450074e81748830929bbd94eebbaffea) Signed-off-by: Luiz Capitulino --- drivers/perf/Kconfig | 2 +- drivers/perf/arm-cmn.c | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 130327ff0b0ec..828a042d6a07b 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -43,7 +43,7 @@ config ARM_CCN config ARM_CMN tristate "Arm CMN-600 PMU support" - depends on ARM64 || (COMPILE_TEST && 64BIT) + depends on ARM64 || COMPILE_TEST help Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh Network interconnect. 
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index ec4ea79fc5e34..0cdef1e42d49d 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -122,11 +123,11 @@ /* Event attributes */ -#define CMN_CONFIG_TYPE GENMASK(15, 0) -#define CMN_CONFIG_EVENTID GENMASK(23, 16) -#define CMN_CONFIG_OCCUPID GENMASK(27, 24) -#define CMN_CONFIG_BYNODEID BIT(31) -#define CMN_CONFIG_NODEID GENMASK(47, 32) +#define CMN_CONFIG_TYPE GENMASK_ULL(15, 0) +#define CMN_CONFIG_EVENTID GENMASK_ULL(23, 16) +#define CMN_CONFIG_OCCUPID GENMASK_ULL(27, 24) +#define CMN_CONFIG_BYNODEID BIT_ULL(31) +#define CMN_CONFIG_NODEID GENMASK_ULL(47, 32) #define CMN_EVENT_TYPE(event) FIELD_GET(CMN_CONFIG_TYPE, (event)->attr.config) #define CMN_EVENT_EVENTID(event) FIELD_GET(CMN_CONFIG_EVENTID, (event)->attr.config) @@ -134,13 +135,13 @@ #define CMN_EVENT_BYNODEID(event) FIELD_GET(CMN_CONFIG_BYNODEID, (event)->attr.config) #define CMN_EVENT_NODEID(event) FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config) -#define CMN_CONFIG_WP_COMBINE GENMASK(27, 24) -#define CMN_CONFIG_WP_DEV_SEL BIT(48) -#define CMN_CONFIG_WP_CHN_SEL GENMASK(50, 49) -#define CMN_CONFIG_WP_GRP BIT(52) -#define CMN_CONFIG_WP_EXCLUSIVE BIT(53) -#define CMN_CONFIG1_WP_VAL GENMASK(63, 0) -#define CMN_CONFIG2_WP_MASK GENMASK(63, 0) +#define CMN_CONFIG_WP_COMBINE GENMASK_ULL(27, 24) +#define CMN_CONFIG_WP_DEV_SEL BIT_ULL(48) +#define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(50, 49) +#define CMN_CONFIG_WP_GRP BIT_ULL(52) +#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(53) +#define CMN_CONFIG1_WP_VAL GENMASK_ULL(63, 0) +#define CMN_CONFIG2_WP_MASK GENMASK_ULL(63, 0) #define CMN_EVENT_WP_COMBINE(event) FIELD_GET(CMN_CONFIG_WP_COMBINE, (event)->attr.config) #define CMN_EVENT_WP_DEV_SEL(event) FIELD_GET(CMN_CONFIG_WP_DEV_SEL, (event)->attr.config) From 685800a2ddacf73f13d9a40d4493596fcb790f77 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:53 +0000 Subject: [PATCH 642/737] perf/arm-cmn: Refactor node ID handling Add a bit more abstraction for the places where we decompose node IDs. This will help keep things nice and manageable when we come to add yet more variables which affect the node ID format. Also use the opportunity to move the rest of the low-level node management helpers back up to the logical place they were meant to be - how they ended up buried right in the middle of the event-related definitions is somewhat of a mystery... Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/a2242a8c3c96056c13a04ae87bf2047e5e64d2d9.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 5f167eab83f153c2c6f80cfe419e269d5f481b09) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 94 +++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 0cdef1e42d49d..cc695d0d1808f 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -255,6 +255,58 @@ struct arm_cmn { static int arm_cmn_hp_state; +struct arm_cmn_nodeid { + u8 x; + u8 y; + u8 port; + u8 dev; +}; + +static int arm_cmn_xyidbits(const struct arm_cmn *cmn) +{ + int dim = max(cmn->mesh_x, cmn->mesh_y); + + return dim > 4 ? 
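For a rough picture of what the new helper packs together: a node ID carries the owning crosspoint's mesh coordinates plus a port and device index. The bit positions below are an assumption based on CMN-600's 2-bit device / 1-bit port fields; the driver's CMN_NODEID_* macros remain the authority:

	struct nodeid_fields {
		unsigned int x, y, port, dev;
	};

	/*
	 * Decode a packed node ID given 'bits' bits per mesh coordinate
	 * (2 for meshes up to 4x4, 3 otherwise, as arm_cmn_xyidbits()
	 * computes in the diff below).  Illustrative only.
	 */
	static struct nodeid_fields decode_nodeid(unsigned int id, unsigned int bits)
	{
		struct nodeid_fields nid;

		nid.dev  = id & 0x3;                                /* device index */
		nid.port = (id >> 2) & 0x1;                         /* port select  */
		nid.y    = (id >> 3) & ((1u << bits) - 1);          /* Y coordinate */
		nid.x    = (id >> (3 + bits)) & ((1u << bits) - 1); /* X coordinate */
		return nid;
	}
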
3 : 2; +} + +static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) +{ + struct arm_cmn_nodeid nid; + int bits = arm_cmn_xyidbits(cmn); + + nid.x = CMN_NODEID_X(id, bits); + nid.y = CMN_NODEID_Y(id, bits); + nid.port = CMN_NODEID_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + + return nid; +} + +static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn, + struct arm_cmn_node *dn) +{ + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + int xp_idx = cmn->mesh_x * nid.y + nid.x; + + dn->to_xp = (cmn->xps + xp_idx) - dn; +} + +static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) +{ + return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp; +} + +static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, + enum cmn_node_type type) +{ + int i; + + for (i = 0; i < cmn->num_dns; i++) + if (cmn->dns[i].type == type) + return &cmn->dns[i]; + return NULL; +} + struct arm_cmn_hw_event { struct arm_cmn_node *dn; u64 dtm_idx[2]; @@ -295,38 +347,6 @@ struct arm_cmn_format_attr { int config; }; -static int arm_cmn_xyidbits(const struct arm_cmn *cmn) -{ - return cmn->mesh_x > 4 || cmn->mesh_y > 4 ? 3 : 2; -} - -static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn, - struct arm_cmn_node *dn) -{ - int bits = arm_cmn_xyidbits(cmn); - int x = CMN_NODEID_X(dn->id, bits); - int y = CMN_NODEID_Y(dn->id, bits); - int xp_idx = cmn->mesh_x * y + x; - - dn->to_xp = (cmn->xps + xp_idx) - dn; -} - -static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) -{ - return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp; -} - -static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, - enum cmn_node_type type) -{ - int i; - - for (i = 0; i < cmn->num_dns; i++) - if (cmn->dns[i].type == type) - return &cmn->dns[i]; - return NULL; -} - #define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid) \ (&((struct arm_cmn_event_attr[]) {{ \ .attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL), \ @@ -966,11 +986,10 @@ static int arm_cmn_event_init(struct perf_event *event) } if (!hw->num_dns) { - int bits = arm_cmn_xyidbits(cmn); + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, nodeid); dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n", - nodeid, CMN_NODEID_X(nodeid, bits), CMN_NODEID_Y(nodeid, bits), - CMN_NODEID_PID(nodeid), CMN_NODEID_DEVID(nodeid), type); + nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } /* @@ -1068,11 +1087,10 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) dn->wp_event[wp_idx] = dtc_idx; writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { - unsigned int port = CMN_NODEID_PID(dn->id); - unsigned int dev = CMN_NODEID_DEVID(dn->id); + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx + - (port << 4) + (dev << 2); + (nid.port << 4) + (nid.dev << 2); if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) { int occupid = CMN_EVENT_OCCUPID(event); From 28353f29ec5b778269145b729b4bffbda07fce76 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:54 +0000 Subject: [PATCH 643/737] perf/arm-cmn: Streamline node iteration Refactor the places where we scan through the set of nodes to switch from explicit array indexing to pointer-based iteration. This leads to slightly simpler object code, but also makes the source less dense and more pleasant for further development. 
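The shape of that iteration, shown in isolation with placeholder types (the terminator is the zero-initialised extra element noted in the "cheeky +1" comment):

	struct node {
		int type;	/* 0 only in the sentinel entry */
	};

	/*
	 * 'nodes' is allocated with one extra zero-initialised element, so
	 * walking until type == 0 needs no index or count to be carried.
	 */
	static struct node *find_first_of_type(struct node *nodes, int type)
	{
		struct node *n;

		for (n = nodes; n->type; n++)
			if (n->type == type)
				return n;
		return NULL;
	}
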
It also unearths an almost-bug in arm_cmn_event_init() where we've been depending on the "array index" of NULL relative to cmn->dns being a sufficiently large number, yuck. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/ee0c9eda9a643f46001ac43aadf3f0b1fd5660dd.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit da5f7d2c8019c9dd053e2d94fdc1b3e7c03c35a5) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index cc695d0d1808f..c349fdc0eaa0b 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -299,11 +299,11 @@ static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, enum cmn_node_type type) { - int i; + struct arm_cmn_node *dn; - for (i = 0; i < cmn->num_dns; i++) - if (cmn->dns[i].type == type) - return &cmn->dns[i]; + for (dn = cmn->dns; dn->type; dn++) + if (dn->type == type) + return dn; return NULL; } @@ -941,8 +941,8 @@ static int arm_cmn_event_init(struct perf_event *event) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; enum cmn_node_type type; - unsigned int i; bool bynodeid; u16 nodeid, eventid; @@ -974,10 +974,12 @@ static int arm_cmn_event_init(struct perf_event *event) nodeid = CMN_EVENT_NODEID(event); hw->dn = arm_cmn_node(cmn, type); - for (i = hw->dn - cmn->dns; i < cmn->num_dns && cmn->dns[i].type == type; i++) { + if (!hw->dn) + return -EINVAL; + for (dn = hw->dn; dn->type == type; dn++) { if (!bynodeid) { hw->num_dns++; - } else if (cmn->dns[i].id != nodeid) { + } else if (dn->id != nodeid) { hw->dn++; } else { hw->num_dns = 1; @@ -1333,7 +1335,7 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); - for (dn = cmn->dns; dn < cmn->dns + cmn->num_dns; dn++) { + for (dn = cmn->dns; dn->type; dn++) { if (dn->type != CMN_TYPE_XP) arm_cmn_init_node_to_xp(cmn, dn); else if (cmn->num_dtcs == 1) @@ -1383,6 +1385,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) u32 xp_offset[CMN_MAX_XPS]; u64 reg; int i, j; + size_t sz; cfg_region = cmn->base + rgn_offset; reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); @@ -1409,14 +1412,13 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg); } - /* Cheeky +1 to help terminate pointer-based iteration */ - cmn->dns = devm_kcalloc(cmn->dev, cmn->num_dns + 1, - sizeof(*cmn->dns), GFP_KERNEL); - if (!cmn->dns) + /* Cheeky +1 to help terminate pointer-based iteration later */ + dn = devm_kcalloc(cmn->dev, cmn->num_dns + 1, sizeof(*dn), GFP_KERNEL); + if (!dn) return -ENOMEM; /* Pass 2: now we can actually populate the nodes */ - dn = cmn->dns; + cmn->dns = dn; for (i = 0; i < cmn->num_xps; i++) { void __iomem *xp_region = cmn->base + xp_offset[i]; struct arm_cmn_node *xp = dn++; @@ -1485,6 +1487,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) /* Correct for any nodes we skipped */ cmn->num_dns = dn - cmn->dns; + sz = (void *)(dn + 1) - (void *)cmn->dns; + dn = devm_krealloc(cmn->dev, cmn->dns, sz, GFP_KERNEL); + if (dn) + cmn->dns = dn; + /* * If mesh_x wasn't set during discovery then we never saw * an XP at (0,1), thus we must have an Nx1 configuration. 
From 97a1669015f95bbfd60c94f6d21488a6afdc030c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:55 +0000 Subject: [PATCH 644/737] perf/arm-cmn: Refactor DTM handling Untangle DTMs from XPs into a dedicated abstraction. This helps make things a little more obvious and robust, but primarily paves the way for further development where new IPs can grow extra DTMs per XP. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/9cca18b1b98f482df7f1aaf3d3213e7f39500423.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 0947c80aba23972987a88e620812d17a7af27297) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 169 +++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 82 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index c349fdc0eaa0b..43d426f37d0dd 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -35,14 +35,9 @@ #define CMN_CHILD_NODE_ADDR GENMASK(27, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) -#define CMN_ADDR_NODE_PTR GENMASK(27, 14) - -#define CMN_NODE_PTR_DEVID(ptr) (((ptr) >> 2) & 3) -#define CMN_NODE_PTR_PID(ptr) ((ptr) & 1) -#define CMN_NODE_PTR_X(ptr, bits) ((ptr) >> (6 + (bits))) -#define CMN_NODE_PTR_Y(ptr, bits) (((ptr) >> 6) & ((1U << (bits)) - 1)) - -#define CMN_MAX_XPS (8 * 8) +#define CMN_MAX_DIMENSION 8 +#define CMN_MAX_XPS (CMN_MAX_DIMENSION * CMN_MAX_DIMENSION) +#define CMN_MAX_DTMS CMN_MAX_XPS /* The CFG node has one other useful purpose */ #define CMN_CFGM_PERIPH_ID_2 0x0010 @@ -190,32 +185,32 @@ struct arm_cmn_node { u16 id, logid; enum cmn_node_type type; + int dtm; union { - /* Device node */ + /* DN/HN-F/CXHA */ struct { - int to_xp; - /* DN/HN-F/CXHA */ - unsigned int occupid_val; - unsigned int occupid_count; + u8 occupid_val; + u8 occupid_count; }; /* XP */ - struct { - int dtc; - u32 pmu_config_low; - union { - u8 input_sel[4]; - __le32 pmu_config_high; - }; - s8 wp_event[4]; - }; + int dtc; }; - union { u8 event[4]; __le32 event_sel; }; }; +struct arm_cmn_dtm { + void __iomem *base; + u32 pmu_config_low; + union { + u8 input_sel[4]; + __le32 pmu_config_high; + }; + s8 wp_event[4]; +}; + struct arm_cmn_dtc { void __iomem *base; int irq; @@ -241,6 +236,7 @@ struct arm_cmn { struct arm_cmn_node *xps; struct arm_cmn_node *dns; + struct arm_cmn_dtm *dtms; struct arm_cmn_dtc *dtc; unsigned int num_dtcs; @@ -282,20 +278,14 @@ static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) return nid; } -static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn, - struct arm_cmn_node *dn) +static struct arm_cmn_node *arm_cmn_node_to_xp(const struct arm_cmn *cmn, + const struct arm_cmn_node *dn) { struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); int xp_idx = cmn->mesh_x * nid.y + nid.x; - dn->to_xp = (cmn->xps + xp_idx) - dn; -} - -static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) -{ - return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp; + return cmn->xps + xp_idx; } - static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, enum cmn_node_type type) { @@ -706,9 +696,9 @@ static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, offset = snapshot ? 
CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT; for_each_hw_dn(hw, dn, i) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); + struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm]; int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - u64 reg = readq_relaxed(xp->pmu_base + offset); + u64 reg = readq_relaxed(dtm->base + offset); u16 dtm_count = reg >> (dtm_idx * 16); count += dtm_count; @@ -835,9 +825,9 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) } struct arm_cmn_val { - u8 dtm_count[CMN_MAX_XPS]; - u8 occupid[CMN_MAX_XPS]; - u8 wp[CMN_MAX_XPS][4]; + u8 dtm_count[CMN_MAX_DTMS]; + u8 occupid[CMN_MAX_DTMS]; + u8 wp[CMN_MAX_DTMS][4]; int dtc_count; bool cycles; }; @@ -866,16 +856,16 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev occupid = 0; for_each_hw_dn(hw, dn, i) { - int wp_idx, xp = arm_cmn_node_to_xp(dn)->logid; + int wp_idx, dtm = dn->dtm; - val->dtm_count[xp]++; - val->occupid[xp] = occupid; + val->dtm_count[dtm]++; + val->occupid[dtm] = occupid; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - val->wp[xp][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; + val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; } } @@ -914,22 +904,22 @@ static int arm_cmn_validate_group(struct perf_event *event) occupid = 0; for_each_hw_dn(hw, dn, i) { - int wp_idx, wp_cmb, xp = arm_cmn_node_to_xp(dn)->logid; + int wp_idx, wp_cmb, dtm = dn->dtm; - if (val.dtm_count[xp] == CMN_DTM_NUM_COUNTERS) + if (val.dtm_count[dtm] == CMN_DTM_NUM_COUNTERS) return -EINVAL; - if (occupid && val.occupid[xp] && occupid != val.occupid[xp]) + if (occupid && val.occupid[dtm] && occupid != val.occupid[dtm]) return -EINVAL; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - if (val.wp[xp][wp_idx]) + if (val.wp[dtm][wp_idx]) return -EINVAL; - wp_cmb = val.wp[xp][wp_idx ^ 1]; + wp_cmb = val.wp[dtm][wp_idx ^ 1]; if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1) return -EINVAL; } @@ -1010,17 +1000,17 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, enum cmn_node_type type = CMN_EVENT_TYPE(event); while (i--) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(hw->dn + i); + struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm]; unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); if (type == CMN_TYPE_WP) - hw->dn[i].wp_event[arm_cmn_wp_idx(event)] = -1; + dtm->wp_event[arm_cmn_wp_idx(event)] = -1; if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) hw->dn[i].occupid_count--; - xp->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); - writel_relaxed(xp->pmu_config_low, xp->pmu_base + CMN_DTM_PMU_CONFIG); + dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); + writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG); } memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx)); @@ -1062,12 +1052,12 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) /* ...then the local counters to feed it. 
*/ for_each_hw_dn(hw, dn, i) { - struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); + struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm]; unsigned int dtm_idx, shift; u64 reg; dtm_idx = 0; - while (xp->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) + while (dtm->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) if (++dtm_idx == CMN_DTM_NUM_COUNTERS) goto free_dtms; @@ -1077,17 +1067,17 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) int tmp, wp_idx = arm_cmn_wp_idx(event); u32 cfg = arm_cmn_wp_config(event); - if (dn->wp_event[wp_idx] >= 0) + if (dtm->wp_event[wp_idx] >= 0) goto free_dtms; - tmp = dn->wp_event[wp_idx ^ 1]; + tmp = dtm->wp_event[wp_idx ^ 1]; if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) goto free_dtms; input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; - dn->wp_event[wp_idx] = dtc_idx; - writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx)); + dtm->wp_event[wp_idx] = dtc_idx; + writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); @@ -1095,7 +1085,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) (nid.port << 4) + (nid.dev << 2); if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) { - int occupid = CMN_EVENT_OCCUPID(event); + u8 occupid = CMN_EVENT_OCCUPID(event); if (dn->occupid_count == 0) { dn->occupid_val = occupid; @@ -1110,13 +1100,13 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) arm_cmn_set_index(hw->dtm_idx, i, dtm_idx); - xp->input_sel[dtm_idx] = input_sel; + dtm->input_sel[dtm_idx] = input_sel; shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); - xp->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); - xp->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; - xp->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); - reg = (u64)le32_to_cpu(xp->pmu_config_high) << 32 | xp->pmu_config_low; - writeq_relaxed(reg, xp->pmu_base + CMN_DTM_PMU_CONFIG); + dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); + dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; + dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); + reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low; + writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG); } /* Go go go! 
*/ @@ -1276,23 +1266,22 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) return 0; } -static void arm_cmn_init_dtm(struct arm_cmn_node *xp) +static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp) { int i; + dtm->base = xp->pmu_base; + dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; for (i = 0; i < 4; i++) { - xp->wp_event[i] = -1; - writeq_relaxed(0, xp->pmu_base + CMN_DTM_WPn_MASK(i)); - writeq_relaxed(~0ULL, xp->pmu_base + CMN_DTM_WPn_VAL(i)); + dtm->wp_event[i] = -1; + writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i)); + writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i)); } - xp->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; - xp->dtc = -1; } static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx) { struct arm_cmn_dtc *dtc = cmn->dtc + idx; - struct arm_cmn_node *xp; dtc->base = dn->pmu_base - CMN_PMU_OFFSET; dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx); @@ -1304,10 +1293,6 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id writeq_relaxed(0, dtc->base + CMN_DT_PMCCNTR); writel_relaxed(0x1ff, dtc->base + CMN_DT_PMOVSR_CLR); - /* We do at least know that a DTC's XP must be in that DTC's domain */ - xp = arm_cmn_node_to_xp(dn); - xp->dtc = idx; - return 0; } @@ -1324,7 +1309,7 @@ static int arm_cmn_node_cmp(const void *a, const void *b) static int arm_cmn_init_dtcs(struct arm_cmn *cmn) { - struct arm_cmn_node *dn; + struct arm_cmn_node *dn, *xp; int dtc_idx = 0; cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); @@ -1336,13 +1321,24 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); for (dn = cmn->dns; dn->type; dn++) { - if (dn->type != CMN_TYPE_XP) - arm_cmn_init_node_to_xp(cmn, dn); - else if (cmn->num_dtcs == 1) - dn->dtc = 0; + if (dn->type == CMN_TYPE_XP) { + if (dn->dtc < 0 && cmn->num_dtcs == 1) + dn->dtc = 0; + continue; + } - if (dn->type == CMN_TYPE_DTC) - arm_cmn_init_dtc(cmn, dn, dtc_idx++); + xp = arm_cmn_node_to_xp(cmn, dn); + dn->dtm = xp->dtm; + + if (dn->type == CMN_TYPE_DTC) { + int err; + /* We do at least know that a DTC's XP must be in that DTC's domain */ + if (xp->dtc < 0) + xp->dtc = dtc_idx; + err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + if (err) + return err; + } /* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */ if (dn->type == CMN_TYPE_RND) @@ -1381,6 +1377,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) { void __iomem *cfg_region; struct arm_cmn_node cfg, *dn; + struct arm_cmn_dtm *dtm; u16 child_count, child_poff; u32 xp_offset[CMN_MAX_XPS]; u64 reg; @@ -1417,14 +1414,18 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (!dn) return -ENOMEM; + dtm = devm_kcalloc(cmn->dev, cmn->num_xps, sizeof(*dtm), GFP_KERNEL); + if (!dtm) + return -ENOMEM; + /* Pass 2: now we can actually populate the nodes */ cmn->dns = dn; + cmn->dtms = dtm; for (i = 0; i < cmn->num_xps; i++) { void __iomem *xp_region = cmn->base + xp_offset[i]; struct arm_cmn_node *xp = dn++; arm_cmn_init_node_info(cmn, xp_offset[i], xp); - arm_cmn_init_dtm(xp); /* * Thanks to the order in which XP logical IDs seem to be * assigned, we can handily infer the mesh X dimension by @@ -1434,6 +1435,10 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (xp->id == (1 << 3)) cmn->mesh_x = xp->logid; + xp->dtc = -1; + xp->dtm = dtm - cmn->dtms; + arm_cmn_init_dtm(dtm++, xp); + reg = readq_relaxed(xp_region + 
CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); From 318f9392aba8943e66a880692a85e6245dd02af8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:56 +0000 Subject: [PATCH 645/737] perf/arm-cmn: Optimise DTM counter reads When multiple nodes of the same type are connected to the same XP (particularly in CAL configurations), it seems that they are likely to be consecutive in logical ID. Therefore, we're likely to gain a small benefit from an easy tweak to optimise out consecutive reads of the same set of DTM counters for an aggregated event. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/7777d77c2df17693cd3dabb6e268906e15238d82.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 847eef94e6327dd7690bfac0bd3a81a7ba6aa1ee) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 43d426f37d0dd..019d21eb2c848 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -690,18 +690,19 @@ static void arm_cmn_pmu_disable(struct pmu *pmu) static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, bool snapshot) { + struct arm_cmn_dtm *dtm = NULL; struct arm_cmn_node *dn; - unsigned int i, offset; - u64 count = 0; + unsigned int i, offset, dtm_idx; + u64 reg, count = 0; offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT; for_each_hw_dn(hw, dn, i) { - struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm]; - int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - u64 reg = readq_relaxed(dtm->base + offset); - u16 dtm_count = reg >> (dtm_idx * 16); - - count += dtm_count; + if (dtm != &cmn->dtms[dn->dtm]) { + dtm = &cmn->dtms[dn->dtm]; + reg = readq_relaxed(dtm->base + offset); + } + dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + count += (u16)(reg >> (dtm_idx * 16)); } return count; } From fcc67d104bd9509e2b9977a253c8681f057c4deb Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:57 +0000 Subject: [PATCH 646/737] perf/arm-cmn: Optimise DTC counter accesses In cases where we do know which DTC domain a node belongs to, we can skip initialising or reading the global count in DTCs where we know it won't change. The machinery to achieve that is mostly in place already, so finish hooking it up by converting the vestigial domain tracking to propagate suitable bitmaps all the way through to events. Note that this does not allow allocating such an unused counter to a different event on that DTC, because that is a flippin' nightmare. 
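In outline, each event ends up with a small bitmap of the DTC domains its nodes feed, and the global-counter paths then walk only the set bits rather than assuming every domain is involved; a generic sketch with placeholder names:

	#include <linux/bitops.h>

	/*
	 * Visit only the DTC domains an event actually counts in: bit i of
	 * 'dtcs_used' is set when DTC i can observe one of the event's nodes,
	 * so unused DTCs never have their global count initialised or read.
	 */
	static void visit_used_dtcs(unsigned long dtcs_used, unsigned int num_dtcs,
				    void (*visit)(unsigned int dtc_idx))
	{
		unsigned int i;

		for_each_set_bit(i, &dtcs_used, num_dtcs)
			visit(i);
	}
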
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/51d930fd945ef51c81f5889ccca055c302b0a1d0.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 4f2c3872dde55090bf39e1f12a8517a32b6cd048) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 019d21eb2c848..65f6037e10e5f 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -193,7 +193,7 @@ struct arm_cmn_node { u8 occupid_count; }; /* XP */ - int dtc; + u8 dtc; }; union { u8 event[4]; @@ -968,14 +968,14 @@ static int arm_cmn_event_init(struct perf_event *event) if (!hw->dn) return -EINVAL; for (dn = hw->dn; dn->type == type; dn++) { - if (!bynodeid) { - hw->num_dns++; - } else if (dn->id != nodeid) { + if (bynodeid && dn->id != nodeid) { hw->dn++; - } else { - hw->num_dns = 1; - break; + continue; } + hw->dtcs_used |= arm_cmn_node_to_xp(cmn, dn)->dtc; + hw->num_dns++; + if (bynodeid) + break; } if (!hw->num_dns) { @@ -985,11 +985,6 @@ static int arm_cmn_event_init(struct perf_event *event) nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } - /* - * By assuming events count in all DTC domains, we cunningly avoid - * needing to know anything about how XPs are assigned to domains. - */ - hw->dtcs_used = (1U << cmn->num_dtcs) - 1; return arm_cmn_validate_group(event); } @@ -1312,6 +1307,7 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) { struct arm_cmn_node *dn, *xp; int dtc_idx = 0; + u8 dtcs_present = (1 << cmn->num_dtcs) - 1; cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); if (!cmn->dtc) @@ -1323,8 +1319,7 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) for (dn = cmn->dns; dn->type; dn++) { if (dn->type == CMN_TYPE_XP) { - if (dn->dtc < 0 && cmn->num_dtcs == 1) - dn->dtc = 0; + dn->dtc &= dtcs_present; continue; } @@ -1334,8 +1329,8 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) if (dn->type == CMN_TYPE_DTC) { int err; /* We do at least know that a DTC's XP must be in that DTC's domain */ - if (xp->dtc < 0) - xp->dtc = dtc_idx; + if (xp->dtc == 0xf) + xp->dtc = 1 << dtc_idx; err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); if (err) return err; @@ -1436,7 +1431,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (xp->id == (1 << 3)) cmn->mesh_x = xp->logid; - xp->dtc = -1; + xp->dtc = 0xf; xp->dtm = dtm - cmn->dtms; arm_cmn_init_dtm(dtm++, xp); From 6615f957c1849f82ebfd7e950aa00e90105e2194 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:58 +0000 Subject: [PATCH 647/737] perf/arm-cmn: Move group validation data off-stack With the value of CMN_MAX_DTMS increasing significantly, our validation data structure is set to get quite big. Technically we could pack it at least twice as densely, since we only need around 19 bits of information per DTM, but that makes the code even more mind-bogglingly impenetrable, and even half of "quite big" may still be uncomfortably large for a stack frame (~1KB). Just move it to an off-stack allocation instead. 
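The change itself follows the usual heap-scratch-with-single-exit pattern; schematically (the struct contents and the check are placeholders):

	#include <linux/errno.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	struct big_scratch {
		u8 table[1024];	/* roughly the size that stopped fitting on the stack */
	};

	static int validate_with_heap_scratch(void)
	{
		struct big_scratch *val;
		int ret = -EINVAL;

		val = kzalloc(sizeof(*val), GFP_KERNEL);
		if (!val)
			return -ENOMEM;

		/* populate *val, bailing out through the single exit on any verdict */
		if (val->table[0])	/* placeholder check */
			goto done;

		ret = 0;
	done:
		kfree(val);
		return ret;
	}
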
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/0cabff2e5839ddc0979e757c55515966f65359e4.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 558a07807038017255005a4820f600da643d8a5f) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 43 ++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 65f6037e10e5f..efc3e17eea357 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -876,8 +876,8 @@ static int arm_cmn_validate_group(struct perf_event *event) struct arm_cmn_node *dn; struct perf_event *sibling, *leader = event->group_leader; enum cmn_node_type type; - struct arm_cmn_val val; - int i; + struct arm_cmn_val *val; + int i, ret = -EINVAL; u8 occupid; if (leader == event) @@ -886,18 +886,22 @@ static int arm_cmn_validate_group(struct perf_event *event) if (event->pmu != leader->pmu && !is_software_event(leader)) return -EINVAL; - memset(&val, 0, sizeof(val)); + val = kzalloc(sizeof(*val), GFP_KERNEL); + if (!val) + return -ENOMEM; - arm_cmn_val_add_event(&val, leader); + arm_cmn_val_add_event(val, leader); for_each_sibling_event(sibling, leader) - arm_cmn_val_add_event(&val, sibling); + arm_cmn_val_add_event(val, sibling); type = CMN_EVENT_TYPE(event); - if (type == CMN_TYPE_DTC) - return val.cycles ? -EINVAL : 0; + if (type == CMN_TYPE_DTC) { + ret = val->cycles ? -EINVAL : 0; + goto done; + } - if (val.dtc_count == CMN_DT_NUM_COUNTERS) - return -EINVAL; + if (val->dtc_count == CMN_DT_NUM_COUNTERS) + goto done; if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) occupid = CMN_EVENT_OCCUPID(event) + 1; @@ -907,25 +911,28 @@ static int arm_cmn_validate_group(struct perf_event *event) for_each_hw_dn(hw, dn, i) { int wp_idx, wp_cmb, dtm = dn->dtm; - if (val.dtm_count[dtm] == CMN_DTM_NUM_COUNTERS) - return -EINVAL; + if (val->dtm_count[dtm] == CMN_DTM_NUM_COUNTERS) + goto done; - if (occupid && val.occupid[dtm] && occupid != val.occupid[dtm]) - return -EINVAL; + if (occupid && val->occupid[dtm] && occupid != val->occupid[dtm]) + goto done; if (type != CMN_TYPE_WP) continue; wp_idx = arm_cmn_wp_idx(event); - if (val.wp[dtm][wp_idx]) - return -EINVAL; + if (val->wp[dtm][wp_idx]) + goto done; - wp_cmb = val.wp[dtm][wp_idx ^ 1]; + wp_cmb = val->wp[dtm][wp_idx ^ 1]; if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1) - return -EINVAL; + goto done; } - return 0; + ret = 0; +done: + kfree(val); + return ret; } static int arm_cmn_event_init(struct perf_event *event) From 407dfeb24d924450ae6437a48de909f47859efd9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:44:59 +0000 Subject: [PATCH 648/737] perf/arm-cmn: Demarcate CMN-600 specifics In preparation for supporting newer CMN products, let's introduce a means to differentiate the features and events which are specific to a particular IP from those which remain common to the whole family. The newer designs have also smoothed off some of the rough edges in terms of discoverability, so separate out the parts of the flow which have effectively now become CMN-600 quirks. 
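The gating mechanism, reduced to its essence: every event attribute carries a model mask (CMN_ANY being all bits set), and the sysfs is_visible() callback hides attributes whose mask doesn't intersect the probed part. A condensed sketch with made-up model names:

	#include <linux/types.h>

	enum model_mask {
		MODEL_ANY = -1,		/* matches everything */
		MODEL_A   = 1,
		MODEL_B   = 2,
	};

	/* An attribute stays visible iff its mask overlaps the hardware's model bit. */
	static bool event_attr_visible(int attr_models, int hw_model)
	{
		return attr_models & hw_model;
	}
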
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/9f6368cdca4c821d801138939508a5bba54ccabb.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 61ec1d875812046ff9d473183d53e19dcd6b2ada) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 313 +++++++++++++++++++++-------------------- 1 file changed, 162 insertions(+), 151 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index efc3e17eea357..7346e1e70f9b2 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -151,7 +151,12 @@ #define CMN_WP_DOWN 2 -/* r0px probably don't exist in silicon, thankfully */ +enum cmn_model { + CMN_ANY = -1, + CMN600 = 1, +}; + +/* CMN-600 r0px shouldn't exist in silicon, thankfully */ enum cmn_revision { CMN600_R1P0, CMN600_R1P1, @@ -159,6 +164,7 @@ enum cmn_revision { CMN600_R1P3, CMN600_R2P0, CMN600_R3P0, + CMN600_R3P1, }; enum cmn_node_type { @@ -229,6 +235,7 @@ struct arm_cmn { void __iomem *base; enum cmn_revision rev; + enum cmn_model model; u8 mesh_x; u8 mesh_y; u16 num_xps; @@ -326,6 +333,7 @@ static unsigned int arm_cmn_get_index(u64 x[], unsigned int pos) struct arm_cmn_event_attr { struct device_attribute attr; + enum cmn_model model; enum cmn_node_type type; u8 eventid; u8 occupid; @@ -337,9 +345,10 @@ struct arm_cmn_format_attr { int config; }; -#define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid) \ +#define CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid) \ (&((struct arm_cmn_event_attr[]) {{ \ .attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL), \ + .model = _model, \ .type = _type, \ .eventid = _eventid, \ .occupid = _occupid, \ @@ -386,12 +395,15 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, eattr = container_of(attr, typeof(*eattr), attr.attr); type = eattr->type; + if (!(eattr->model & cmn->model)) + return 0; + /* Watchpoints aren't nodes */ if (type == CMN_TYPE_WP) type = CMN_TYPE_XP; /* Revision-specific differences */ - if (cmn->rev < CMN600_R1P2) { + if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) { if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b) return 0; } @@ -402,25 +414,27 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, return attr->mode; } -#define _CMN_EVENT_DVM(_name, _event, _occup) \ - CMN_EVENT_ATTR(dn_##_name, CMN_TYPE_DVM, _event, _occup) +#define _CMN_EVENT_DVM(_model, _name, _event, _occup) \ + CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup) #define CMN_EVENT_DTC(_name) \ - CMN_EVENT_ATTR(dtc_##_name, CMN_TYPE_DTC, 0, 0) -#define _CMN_EVENT_HNF(_name, _event, _occup) \ - CMN_EVENT_ATTR(hnf_##_name, CMN_TYPE_HNF, _event, _occup) + CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0, 0) +#define _CMN_EVENT_HNF(_model, _name, _event, _occup) \ + CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup) #define CMN_EVENT_HNI(_name, _event) \ - CMN_EVENT_ATTR(hni_##_name, CMN_TYPE_HNI, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event, 0) #define __CMN_EVENT_XP(_name, _event) \ - CMN_EVENT_ATTR(mxp_##_name, CMN_TYPE_XP, _event, 0) -#define CMN_EVENT_SBSX(_name, _event) \ - CMN_EVENT_ATTR(sbsx_##_name, CMN_TYPE_SBSX, _event, 0) -#define CMN_EVENT_RNID(_name, _event) \ - CMN_EVENT_ATTR(rnid_##_name, CMN_TYPE_RNI, _event, 0) - -#define CMN_EVENT_DVM(_name, _event) \ - _CMN_EVENT_DVM(_name, _event, 0) -#define CMN_EVENT_HNF(_name, _event) \ - _CMN_EVENT_HNF(_name, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event, 0) +#define CMN_EVENT_SBSX(_model, _name, 
_event) \ + CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event, 0) +#define CMN_EVENT_RNID(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event, 0) +#define CMN_EVENT_MTSX(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event, 0) + +#define CMN_EVENT_DVM(_model, _name, _event) \ + _CMN_EVENT_DVM(_model, _name, _event, 0) +#define CMN_EVENT_HNF(_model, _name, _event) \ + _CMN_EVENT_HNF(_model, _name, _event, 0) #define _CMN_EVENT_XP(_name, _event) \ __CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \ __CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \ @@ -445,115 +459,115 @@ static struct attribute *arm_cmn_event_attrs[] = { * slot, but our lazy short-cut of using the DTM counter index for * the PMU index as well happens to avoid that by construction. */ - CMN_EVENT_DVM(rxreq_dvmop, 0x01), - CMN_EVENT_DVM(rxreq_dvmsync, 0x02), - CMN_EVENT_DVM(rxreq_dvmop_vmid_filtered, 0x03), - CMN_EVENT_DVM(rxreq_retried, 0x04), - _CMN_EVENT_DVM(rxreq_trk_occupancy_all, 0x05, 0), - _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmop, 0x05, 1), - _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmsync, 0x05, 2), - - CMN_EVENT_HNF(cache_miss, 0x01), - CMN_EVENT_HNF(slc_sf_cache_access, 0x02), - CMN_EVENT_HNF(cache_fill, 0x03), - CMN_EVENT_HNF(pocq_retry, 0x04), - CMN_EVENT_HNF(pocq_reqs_recvd, 0x05), - CMN_EVENT_HNF(sf_hit, 0x06), - CMN_EVENT_HNF(sf_evictions, 0x07), - CMN_EVENT_HNF(dir_snoops_sent, 0x08), - CMN_EVENT_HNF(brd_snoops_sent, 0x09), - CMN_EVENT_HNF(slc_eviction, 0x0a), - CMN_EVENT_HNF(slc_fill_invalid_way, 0x0b), - CMN_EVENT_HNF(mc_retries, 0x0c), - CMN_EVENT_HNF(mc_reqs, 0x0d), - CMN_EVENT_HNF(qos_hh_retry, 0x0e), - _CMN_EVENT_HNF(qos_pocq_occupancy_all, 0x0f, 0), - _CMN_EVENT_HNF(qos_pocq_occupancy_read, 0x0f, 1), - _CMN_EVENT_HNF(qos_pocq_occupancy_write, 0x0f, 2), - _CMN_EVENT_HNF(qos_pocq_occupancy_atomic, 0x0f, 3), - _CMN_EVENT_HNF(qos_pocq_occupancy_stash, 0x0f, 4), - CMN_EVENT_HNF(pocq_addrhaz, 0x10), - CMN_EVENT_HNF(pocq_atomic_addrhaz, 0x11), - CMN_EVENT_HNF(ld_st_swp_adq_full, 0x12), - CMN_EVENT_HNF(cmp_adq_full, 0x13), - CMN_EVENT_HNF(txdat_stall, 0x14), - CMN_EVENT_HNF(txrsp_stall, 0x15), - CMN_EVENT_HNF(seq_full, 0x16), - CMN_EVENT_HNF(seq_hit, 0x17), - CMN_EVENT_HNF(snp_sent, 0x18), - CMN_EVENT_HNF(sfbi_dir_snp_sent, 0x19), - CMN_EVENT_HNF(sfbi_brd_snp_sent, 0x1a), - CMN_EVENT_HNF(snp_sent_untrk, 0x1b), - CMN_EVENT_HNF(intv_dirty, 0x1c), - CMN_EVENT_HNF(stash_snp_sent, 0x1d), - CMN_EVENT_HNF(stash_data_pull, 0x1e), - CMN_EVENT_HNF(snp_fwded, 0x1f), - - CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), - CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), - CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), - CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), - CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), - CMN_EVENT_HNI(rrt_rd_alloc, 0x25), - CMN_EVENT_HNI(rrt_wr_alloc, 0x26), - CMN_EVENT_HNI(rdt_rd_alloc, 0x27), - CMN_EVENT_HNI(rdt_wr_alloc, 0x28), - CMN_EVENT_HNI(wdb_alloc, 0x29), - CMN_EVENT_HNI(txrsp_retryack, 0x2a), - CMN_EVENT_HNI(arvalid_no_arready, 0x2b), - CMN_EVENT_HNI(arready_no_arvalid, 0x2c), - CMN_EVENT_HNI(awvalid_no_awready, 0x2d), - CMN_EVENT_HNI(awready_no_awvalid, 0x2e), - CMN_EVENT_HNI(wvalid_no_wready, 0x2f), - CMN_EVENT_HNI(txdat_stall, 0x30), - CMN_EVENT_HNI(nonpcie_serialization, 0x31), - CMN_EVENT_HNI(pcie_serialization, 0x32), - - CMN_EVENT_XP(txflit_valid, 0x01), - CMN_EVENT_XP(txflit_stall, 0x02), - CMN_EVENT_XP(partial_dat_flit, 0x03), + CMN_EVENT_DVM(CMN600, rxreq_dvmop, 0x01), + CMN_EVENT_DVM(CMN600, rxreq_dvmsync, 0x02), + CMN_EVENT_DVM(CMN600, 
rxreq_dvmop_vmid_filtered, 0x03), + CMN_EVENT_DVM(CMN600, rxreq_retried, 0x04), + _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0), + _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1), + _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2), + + CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), + CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), + CMN_EVENT_HNF(CMN_ANY, cache_fill, 0x03), + CMN_EVENT_HNF(CMN_ANY, pocq_retry, 0x04), + CMN_EVENT_HNF(CMN_ANY, pocq_reqs_recvd, 0x05), + CMN_EVENT_HNF(CMN_ANY, sf_hit, 0x06), + CMN_EVENT_HNF(CMN_ANY, sf_evictions, 0x07), + CMN_EVENT_HNF(CMN_ANY, dir_snoops_sent, 0x08), + CMN_EVENT_HNF(CMN_ANY, brd_snoops_sent, 0x09), + CMN_EVENT_HNF(CMN_ANY, slc_eviction, 0x0a), + CMN_EVENT_HNF(CMN_ANY, slc_fill_invalid_way, 0x0b), + CMN_EVENT_HNF(CMN_ANY, mc_retries, 0x0c), + CMN_EVENT_HNF(CMN_ANY, mc_reqs, 0x0d), + CMN_EVENT_HNF(CMN_ANY, qos_hh_retry, 0x0e), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all, 0x0f, 0), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4), + CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz, 0x10), + CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz, 0x11), + CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full, 0x12), + CMN_EVENT_HNF(CMN_ANY, cmp_adq_full, 0x13), + CMN_EVENT_HNF(CMN_ANY, txdat_stall, 0x14), + CMN_EVENT_HNF(CMN_ANY, txrsp_stall, 0x15), + CMN_EVENT_HNF(CMN_ANY, seq_full, 0x16), + CMN_EVENT_HNF(CMN_ANY, seq_hit, 0x17), + CMN_EVENT_HNF(CMN_ANY, snp_sent, 0x18), + CMN_EVENT_HNF(CMN_ANY, sfbi_dir_snp_sent, 0x19), + CMN_EVENT_HNF(CMN_ANY, sfbi_brd_snp_sent, 0x1a), + CMN_EVENT_HNF(CMN_ANY, snp_sent_untrk, 0x1b), + CMN_EVENT_HNF(CMN_ANY, intv_dirty, 0x1c), + CMN_EVENT_HNF(CMN_ANY, stash_snp_sent, 0x1d), + CMN_EVENT_HNF(CMN_ANY, stash_data_pull, 0x1e), + CMN_EVENT_HNF(CMN_ANY, snp_fwded, 0x1f), + + CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), + CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), + CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), + CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), + CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), + CMN_EVENT_HNI(rrt_rd_alloc, 0x25), + CMN_EVENT_HNI(rrt_wr_alloc, 0x26), + CMN_EVENT_HNI(rdt_rd_alloc, 0x27), + CMN_EVENT_HNI(rdt_wr_alloc, 0x28), + CMN_EVENT_HNI(wdb_alloc, 0x29), + CMN_EVENT_HNI(txrsp_retryack, 0x2a), + CMN_EVENT_HNI(arvalid_no_arready, 0x2b), + CMN_EVENT_HNI(arready_no_arvalid, 0x2c), + CMN_EVENT_HNI(awvalid_no_awready, 0x2d), + CMN_EVENT_HNI(awready_no_awvalid, 0x2e), + CMN_EVENT_HNI(wvalid_no_wready, 0x2f), + CMN_EVENT_HNI(txdat_stall, 0x30), + CMN_EVENT_HNI(nonpcie_serialization, 0x31), + CMN_EVENT_HNI(pcie_serialization, 0x32), + + CMN_EVENT_XP(txflit_valid, 0x01), + CMN_EVENT_XP(txflit_stall, 0x02), + CMN_EVENT_XP(partial_dat_flit, 0x03), /* We treat watchpoints as a special made-up class of XP events */ - CMN_EVENT_ATTR(watchpoint_up, CMN_TYPE_WP, 0, 0), - CMN_EVENT_ATTR(watchpoint_down, CMN_TYPE_WP, 2, 0), - - CMN_EVENT_SBSX(rd_req, 0x01), - CMN_EVENT_SBSX(wr_req, 0x02), - CMN_EVENT_SBSX(cmo_req, 0x03), - CMN_EVENT_SBSX(txrsp_retryack, 0x04), - CMN_EVENT_SBSX(txdat_flitv, 0x05), - CMN_EVENT_SBSX(txrsp_flitv, 0x06), - CMN_EVENT_SBSX(rd_req_trkr_occ_cnt_ovfl, 0x11), - CMN_EVENT_SBSX(wr_req_trkr_occ_cnt_ovfl, 0x12), - CMN_EVENT_SBSX(cmo_req_trkr_occ_cnt_ovfl, 0x13), - CMN_EVENT_SBSX(wdb_occ_cnt_ovfl, 0x14), - CMN_EVENT_SBSX(rd_axi_trkr_occ_cnt_ovfl, 0x15), - CMN_EVENT_SBSX(cmo_axi_trkr_occ_cnt_ovfl, 0x16), - 
CMN_EVENT_SBSX(arvalid_no_arready, 0x21), - CMN_EVENT_SBSX(awvalid_no_awready, 0x22), - CMN_EVENT_SBSX(wvalid_no_wready, 0x23), - CMN_EVENT_SBSX(txdat_stall, 0x24), - CMN_EVENT_SBSX(txrsp_stall, 0x25), - - CMN_EVENT_RNID(s0_rdata_beats, 0x01), - CMN_EVENT_RNID(s1_rdata_beats, 0x02), - CMN_EVENT_RNID(s2_rdata_beats, 0x03), - CMN_EVENT_RNID(rxdat_flits, 0x04), - CMN_EVENT_RNID(txdat_flits, 0x05), - CMN_EVENT_RNID(txreq_flits_total, 0x06), - CMN_EVENT_RNID(txreq_flits_retried, 0x07), - CMN_EVENT_RNID(rrt_occ_ovfl, 0x08), - CMN_EVENT_RNID(wrt_occ_ovfl, 0x09), - CMN_EVENT_RNID(txreq_flits_replayed, 0x0a), - CMN_EVENT_RNID(wrcancel_sent, 0x0b), - CMN_EVENT_RNID(s0_wdata_beats, 0x0c), - CMN_EVENT_RNID(s1_wdata_beats, 0x0d), - CMN_EVENT_RNID(s2_wdata_beats, 0x0e), - CMN_EVENT_RNID(rrt_alloc, 0x0f), - CMN_EVENT_RNID(wrt_alloc, 0x10), - CMN_EVENT_RNID(rdb_unord, 0x11), - CMN_EVENT_RNID(rdb_replay, 0x12), - CMN_EVENT_RNID(rdb_hybrid, 0x13), - CMN_EVENT_RNID(rdb_ord, 0x14), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP, 0), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN, 0), + + CMN_EVENT_SBSX(CMN_ANY, rd_req, 0x01), + CMN_EVENT_SBSX(CMN_ANY, wr_req, 0x02), + CMN_EVENT_SBSX(CMN_ANY, cmo_req, 0x03), + CMN_EVENT_SBSX(CMN_ANY, txrsp_retryack, 0x04), + CMN_EVENT_SBSX(CMN_ANY, txdat_flitv, 0x05), + CMN_EVENT_SBSX(CMN_ANY, txrsp_flitv, 0x06), + CMN_EVENT_SBSX(CMN_ANY, rd_req_trkr_occ_cnt_ovfl, 0x11), + CMN_EVENT_SBSX(CMN_ANY, wr_req_trkr_occ_cnt_ovfl, 0x12), + CMN_EVENT_SBSX(CMN_ANY, cmo_req_trkr_occ_cnt_ovfl, 0x13), + CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl, 0x14), + CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15), + CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16), + CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready, 0x21), + CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready, 0x22), + CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready, 0x23), + CMN_EVENT_SBSX(CMN_ANY, txdat_stall, 0x24), + CMN_EVENT_SBSX(CMN_ANY, txrsp_stall, 0x25), + + CMN_EVENT_RNID(CMN_ANY, s0_rdata_beats, 0x01), + CMN_EVENT_RNID(CMN_ANY, s1_rdata_beats, 0x02), + CMN_EVENT_RNID(CMN_ANY, s2_rdata_beats, 0x03), + CMN_EVENT_RNID(CMN_ANY, rxdat_flits, 0x04), + CMN_EVENT_RNID(CMN_ANY, txdat_flits, 0x05), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_total, 0x06), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_retried, 0x07), + CMN_EVENT_RNID(CMN_ANY, rrt_occ_ovfl, 0x08), + CMN_EVENT_RNID(CMN_ANY, wrt_occ_ovfl, 0x09), + CMN_EVENT_RNID(CMN_ANY, txreq_flits_replayed, 0x0a), + CMN_EVENT_RNID(CMN_ANY, wrcancel_sent, 0x0b), + CMN_EVENT_RNID(CMN_ANY, s0_wdata_beats, 0x0c), + CMN_EVENT_RNID(CMN_ANY, s1_wdata_beats, 0x0d), + CMN_EVENT_RNID(CMN_ANY, s2_wdata_beats, 0x0e), + CMN_EVENT_RNID(CMN_ANY, rrt_alloc, 0x0f), + CMN_EVENT_RNID(CMN_ANY, wrt_alloc, 0x10), + CMN_EVENT_RNID(CMN600, rdb_unord, 0x11), + CMN_EVENT_RNID(CMN600, rdb_replay, 0x12), + CMN_EVENT_RNID(CMN600, rdb_hybrid, 0x13), + CMN_EVENT_RNID(CMN600, rdb_ord, 0x14), NULL }; @@ -1387,15 +1401,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) int i, j; size_t sz; - cfg_region = cmn->base + rgn_offset; - reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); - cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); - dev_dbg(cmn->dev, "periph_id_2 revision: %d\n", cmn->rev); - arm_cmn_init_node_info(cmn, rgn_offset, &cfg); if (cfg.type != CMN_TYPE_CFG) return -ENODEV; + cfg_region = cmn->base + rgn_offset; + reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); + cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); + reg = readq_relaxed(cfg_region + 
CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); @@ -1508,13 +1521,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->mesh_x = cmn->num_xps; cmn->mesh_y = cmn->num_xps / cmn->mesh_x; + dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev); dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n", cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn)); return 0; } -static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) +static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) { struct resource *cfg, *root; @@ -1541,21 +1555,11 @@ static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) return root->start - cfg->start; } -static int arm_cmn_of_probe(struct platform_device *pdev, struct arm_cmn *cmn) +static int arm_cmn600_of_probe(struct device_node *np) { - struct device_node *np = pdev->dev.of_node; u32 rootnode; - int ret; - cmn->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(cmn->base)) - return PTR_ERR(cmn->base); - - ret = of_property_read_u32(np, "arm,root-node", &rootnode); - if (ret) - return ret; - - return rootnode; + return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; } static int arm_cmn_probe(struct platform_device *pdev) @@ -1570,12 +1574,19 @@ static int arm_cmn_probe(struct platform_device *pdev) return -ENOMEM; cmn->dev = &pdev->dev; + cmn->model = (unsigned long)device_get_match_data(cmn->dev); platform_set_drvdata(pdev, cmn); - if (has_acpi_companion(cmn->dev)) - rootnode = arm_cmn_acpi_probe(pdev, cmn); - else - rootnode = arm_cmn_of_probe(pdev, cmn); + if (cmn->model == CMN600 && has_acpi_companion(cmn->dev)) { + rootnode = arm_cmn600_acpi_probe(pdev, cmn); + } else { + rootnode = 0; + cmn->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(cmn->base)) + return PTR_ERR(cmn->base); + if (cmn->model == CMN600) + rootnode = arm_cmn600_of_probe(pdev->dev.of_node); + } if (rootnode < 0) return rootnode; @@ -1638,7 +1649,7 @@ static int arm_cmn_remove(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arm_cmn_of_match[] = { - { .compatible = "arm,cmn-600", }, + { .compatible = "arm,cmn-600", .data = (void *)CMN600 }, {} }; MODULE_DEVICE_TABLE(of, arm_cmn_of_match); @@ -1646,7 +1657,7 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match); #ifdef CONFIG_ACPI static const struct acpi_device_id arm_cmn_acpi_match[] = { - { "ARMHC600", }, + { "ARMHC600", CMN600 }, {} }; MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match); From d443e31598e66d7e548cab0c9ce3e5fa4b89528a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:45:00 +0000 Subject: [PATCH 649/737] perf/arm-cmn: Support new IP features The second generation of CMN IPs add new node types and significantly expand the configuration space with options for extra device ports on edge XPs, either plumbed into the regular DTM or with extra dedicated DTMs to monitor them, plus larger (and smaller) mesh sizes. Add basic support for pulling this new information out of the hardware, piping it around as necessary, and handling (most of) the new choices. 
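
As a rough illustration of the resulting port-to-DTM mapping (a minimal sketch with made-up example_* names, not driver code): when the global config register advertises multiple DTMs, the extra DTMs sit at 0x200 strides in the XP's PMU space and each one serves a pair of device ports, so a node or watchpoint on port p is counted by DTM p / 2.

  /*
   * Sketch only: mirrors the "port / 2" mapping used for both nodes and
   * watchpoint device selects when multiple DTMs are present.
   */
  static unsigned int example_dtm_index(unsigned int multi_dtm, unsigned int port)
  {
          if (!multi_dtm)
                  return 0;        /* single-DTM layout: everything on the XP's DTM */
          return port / 2;         /* ports 0-1 -> DTM 0, 2-3 -> DTM 1, 4-5 -> DTM 2 */
  }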
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e58b495bcc7deec3882be4bac910ed0bf6979674.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 60d1504070c22c059a1e11bc3fd444953da988c1) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 218 ++++++++++++++++++++++++++++++++--------- 1 file changed, 171 insertions(+), 47 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 7346e1e70f9b2..1d21f39cb7279 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -24,7 +24,10 @@ #define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) #define CMN_NODEID_DEVID(reg) ((reg) & 3) +#define CMN_NODEID_EXT_DEVID(reg) ((reg) & 1) #define CMN_NODEID_PID(reg) (((reg) >> 2) & 1) +#define CMN_NODEID_EXT_PID(reg) (((reg) >> 1) & 3) +#define CMN_NODEID_1x1_PID(reg) (((reg) >> 2) & 7) #define CMN_NODEID_X(reg, bits) ((reg) >> (3 + (bits))) #define CMN_NODEID_Y(reg, bits) (((reg) >> 3) & ((1U << (bits)) - 1)) @@ -37,13 +40,26 @@ #define CMN_MAX_DIMENSION 8 #define CMN_MAX_XPS (CMN_MAX_DIMENSION * CMN_MAX_DIMENSION) -#define CMN_MAX_DTMS CMN_MAX_XPS +#define CMN_MAX_DTMS (CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4) -/* The CFG node has one other useful purpose */ +/* The CFG node has various info besides the discovery tree */ #define CMN_CFGM_PERIPH_ID_2 0x0010 #define CMN_CFGM_PID2_REVISION GENMASK(7, 4) -/* PMU registers occupy the 3rd 4KB page of each node's 16KB space */ +#define CMN_CFGM_INFO_GLOBAL 0x900 +#define CMN_INFO_MULTIPLE_DTM_EN BIT_ULL(63) +#define CMN_INFO_RSP_VC_NUM GENMASK_ULL(53, 52) +#define CMN_INFO_DAT_VC_NUM GENMASK_ULL(51, 50) + +/* XPs also have some local topology info which has uses too */ +#define CMN_MXP__CONNECT_INFO_P0 0x0008 +#define CMN_MXP__CONNECT_INFO_P1 0x0010 +#define CMN_MXP__CONNECT_INFO_P2 0x0028 +#define CMN_MXP__CONNECT_INFO_P3 0x0030 +#define CMN_MXP__CONNECT_INFO_P4 0x0038 +#define CMN_MXP__CONNECT_INFO_P5 0x0040 + +/* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 /* For most nodes, this is all there is */ @@ -53,6 +69,7 @@ /* DTMs live in the PMU space of XP registers */ #define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) #define CMN_DTM_WPn_CONFIG(n) (CMN_DTM_WPn(n) + 0x00) +#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2 GENMASK_ULL(18,17) #define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(6) #define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(5) #define CMN_DTM_WPn_CONFIG_WP_GRP BIT(4) @@ -77,7 +94,11 @@ #define CMN_DTM_PMEVCNTSR 0x240 +#define CMN_DTM_UNIT_INFO 0x0910 + #define CMN_DTM_NUM_COUNTERS 4 +/* Want more local counters? Why not replicate the whole DTM! Ugh... 
*/ +#define CMN_DTM_OFFSET(n) ((n) * 0x200) /* The DTC node is where the magic happens */ #define CMN_DT_DTC_CTL 0x0a00 @@ -131,10 +152,10 @@ #define CMN_EVENT_NODEID(event) FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config) #define CMN_CONFIG_WP_COMBINE GENMASK_ULL(27, 24) -#define CMN_CONFIG_WP_DEV_SEL BIT_ULL(48) -#define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(50, 49) -#define CMN_CONFIG_WP_GRP BIT_ULL(52) -#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(53) +#define CMN_CONFIG_WP_DEV_SEL GENMASK_ULL(50, 48) +#define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(55, 51) +#define CMN_CONFIG_WP_GRP BIT_ULL(56) +#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(57) #define CMN_CONFIG1_WP_VAL GENMASK_ULL(63, 0) #define CMN_CONFIG2_WP_MASK GENMASK_ULL(63, 0) @@ -176,9 +197,12 @@ enum cmn_node_type { CMN_TYPE_HNF, CMN_TYPE_XP, CMN_TYPE_SBSX, - CMN_TYPE_RNI = 0xa, + CMN_TYPE_MPAM_S, + CMN_TYPE_MPAM_NS, + CMN_TYPE_RNI, CMN_TYPE_RND = 0xd, CMN_TYPE_RNSAM = 0xf, + CMN_TYPE_MTSX, CMN_TYPE_CXRA = 0x100, CMN_TYPE_CXHA = 0x101, CMN_TYPE_CXLA = 0x102, @@ -233,6 +257,7 @@ struct arm_cmn_dtc { struct arm_cmn { struct device *dev; void __iomem *base; + unsigned int state; enum cmn_revision rev; enum cmn_model model; @@ -240,6 +265,13 @@ struct arm_cmn { u8 mesh_y; u16 num_xps; u16 num_dns; + bool multi_dtm; + u8 ports_used; + struct { + unsigned int rsp_vc_num : 2; + unsigned int dat_vc_num : 2; + }; + struct arm_cmn_node *xps; struct arm_cmn_node *dns; @@ -250,7 +282,6 @@ struct arm_cmn { int cpu; struct hlist_node cpuhp_node; - unsigned int state; struct pmu pmu; }; @@ -275,13 +306,25 @@ static int arm_cmn_xyidbits(const struct arm_cmn *cmn) static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) { struct arm_cmn_nodeid nid; - int bits = arm_cmn_xyidbits(cmn); - nid.x = CMN_NODEID_X(id, bits); - nid.y = CMN_NODEID_Y(id, bits); - nid.port = CMN_NODEID_PID(id); - nid.dev = CMN_NODEID_DEVID(id); + if (cmn->num_xps == 1) { + nid.x = 0; + nid.y = 0; + nid.port = CMN_NODEID_1x1_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + } else { + int bits = arm_cmn_xyidbits(cmn); + nid.x = CMN_NODEID_X(id, bits); + nid.y = CMN_NODEID_Y(id, bits); + if (cmn->ports_used & 0xc) { + nid.port = CMN_NODEID_EXT_PID(id); + nid.dev = CMN_NODEID_EXT_DEVID(id); + } else { + nid.port = CMN_NODEID_PID(id); + nid.dev = CMN_NODEID_DEVID(id); + } + } return nid; } @@ -310,6 +353,7 @@ struct arm_cmn_hw_event { unsigned int dtc_idx; u8 dtcs_used; u8 num_dns; + u8 dtm_offset; }; #define for_each_hw_dn(hw, dn, i) \ @@ -354,7 +398,8 @@ struct arm_cmn_format_attr { .occupid = _occupid, \ }})[0].attr.attr) -static bool arm_cmn_is_occup_event(enum cmn_node_type type, unsigned int id) +static bool arm_cmn_is_occup_event(enum cmn_model model, + enum cmn_node_type type, unsigned int id) { return (type == CMN_TYPE_DVM && id == 0x05) || (type == CMN_TYPE_HNF && id == 0x0f); @@ -375,7 +420,7 @@ static ssize_t arm_cmn_event_show(struct device *dev, "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", eattr->type, eattr->eventid); - if (arm_cmn_is_occup_event(eattr->type, eattr->eventid)) + if (arm_cmn_is_occup_event(eattr->model, eattr->type, eattr->eventid)) return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n", eattr->type, eattr->eventid, eattr->occupid); @@ -390,25 +435,36 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev)); struct arm_cmn_event_attr *eattr; - enum cmn_node_type type; eattr = container_of(attr, 
typeof(*eattr), attr.attr); - type = eattr->type; if (!(eattr->model & cmn->model)) return 0; - /* Watchpoints aren't nodes */ - if (type == CMN_TYPE_WP) - type = CMN_TYPE_XP; + /* Watchpoints aren't nodes, so avoid confusion */ + if (eattr->type == CMN_TYPE_WP) + return attr->mode; + + /* Hide XP events for unused interfaces/channels */ + if (eattr->type == CMN_TYPE_XP) { + unsigned int intf = (eattr->eventid >> 2) & 7; + unsigned int chan = eattr->eventid >> 5; + + if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3))) + return 0; + + if ((chan == 5 && cmn->rsp_vc_num < 2) || + (chan == 6 && cmn->dat_vc_num < 2)) + return 0; + } /* Revision-specific differences */ if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) { - if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b) + if (eattr->type == CMN_TYPE_HNF && eattr->eventid == 0x1b) return 0; } - if (!arm_cmn_node(cmn, type)) + if (!arm_cmn_node(cmn, eattr->type)) return 0; return attr->mode; @@ -669,7 +725,8 @@ static u32 arm_cmn_wp_config(struct perf_event *event) config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) | - FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc); + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc) | + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL2, dev >> 1); if (combine && !grp) config |= CMN_DTM_WPn_CONFIG_WP_COMBINE; @@ -712,7 +769,7 @@ static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT; for_each_hw_dn(hw, dn, i) { if (dtm != &cmn->dtms[dn->dtm]) { - dtm = &cmn->dtms[dn->dtm]; + dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; reg = readq_relaxed(dtm->base + offset); } dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); @@ -800,8 +857,10 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) u64 mask = CMN_EVENT_WP_MASK(event); for_each_hw_dn(hw, dn, i) { - writeq_relaxed(val, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); - writeq_relaxed(mask, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); + void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); + + writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx)); + writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx)); } } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); @@ -826,8 +885,10 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) int wp_idx = arm_cmn_wp_idx(event); for_each_hw_dn(hw, dn, i) { - writeq_relaxed(0, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); - writeq_relaxed(~0ULL, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); + void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset); + + writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx)); + writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx)); } } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); @@ -847,7 +908,8 @@ struct arm_cmn_val { bool cycles; }; -static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *event) +static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val, + struct perf_event *event) { struct arm_cmn_hw_event *hw = to_cmn_hw(event); struct arm_cmn_node *dn; @@ -865,7 +927,7 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev } val->dtc_count++; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) occupid = CMN_EVENT_OCCUPID(event) + 1; else occupid = 0; @@ -884,7 +946,7 @@ 
static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev } } -static int arm_cmn_validate_group(struct perf_event *event) +static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event) { struct arm_cmn_hw_event *hw = to_cmn_hw(event); struct arm_cmn_node *dn; @@ -904,9 +966,9 @@ static int arm_cmn_validate_group(struct perf_event *event) if (!val) return -ENOMEM; - arm_cmn_val_add_event(val, leader); + arm_cmn_val_add_event(cmn, val, leader); for_each_sibling_event(sibling, leader) - arm_cmn_val_add_event(val, sibling); + arm_cmn_val_add_event(cmn, val, sibling); type = CMN_EVENT_TYPE(event); if (type == CMN_TYPE_DTC) { @@ -917,7 +979,7 @@ static int arm_cmn_validate_group(struct perf_event *event) if (val->dtc_count == CMN_DT_NUM_COUNTERS) goto done; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) occupid = CMN_EVENT_OCCUPID(event) + 1; else occupid = 0; @@ -980,6 +1042,9 @@ static int arm_cmn_event_init(struct perf_event *event) eventid = CMN_EVENT_EVENTID(event); if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN) return -EINVAL; + /* ...but the DTM may depend on which port we're watching */ + if (cmn->multi_dtm) + hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2; } bynodeid = CMN_EVENT_BYNODEID(event); @@ -1007,7 +1072,7 @@ static int arm_cmn_event_init(struct perf_event *event) return -EINVAL; } - return arm_cmn_validate_group(event); + return arm_cmn_validate_group(cmn, event); } static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, @@ -1017,13 +1082,13 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, enum cmn_node_type type = CMN_EVENT_TYPE(event); while (i--) { - struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm]; + struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset; unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); if (type == CMN_TYPE_WP) dtm->wp_event[arm_cmn_wp_idx(event)] = -1; - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) hw->dn[i].occupid_count--; dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); @@ -1069,7 +1134,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) /* ...then the local counters to feed it. 
*/ for_each_hw_dn(hw, dn, i) { - struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm]; + struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; unsigned int dtm_idx, shift; u64 reg; @@ -1098,10 +1163,13 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) } else { struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + if (cmn->multi_dtm) + nid.port %= 2; + input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx + (nid.port << 4) + (nid.dev << 2); - if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) { + if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) { u8 occupid = CMN_EVENT_OCCUPID(event); if (dn->occupid_count == 0) { @@ -1283,11 +1351,11 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) return 0; } -static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp) +static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, int idx) { int i; - dtm->base = xp->pmu_base; + dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx); dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; for (i = 0; i < 4; i++) { dtm->wp_event[i] = -1; @@ -1346,6 +1414,8 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) xp = arm_cmn_node_to_xp(cmn, dn); dn->dtm = xp->dtm; + if (cmn->multi_dtm) + dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2; if (dn->type == CMN_TYPE_DTC) { int err; @@ -1409,6 +1479,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); + reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL); + cmn->multi_dtm = reg & CMN_INFO_MULTIPLE_DTM_EN; + cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg); + cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg); + reg = readq_relaxed(cfg_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); @@ -1430,7 +1505,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (!dn) return -ENOMEM; - dtm = devm_kcalloc(cmn->dev, cmn->num_xps, sizeof(*dtm), GFP_KERNEL); + /* Initial safe upper bound on DTMs for any possible mesh layout */ + i = cmn->num_xps; + if (cmn->multi_dtm) + i += cmn->num_xps + 1; + dtm = devm_kcalloc(cmn->dev, i, sizeof(*dtm), GFP_KERNEL); if (!dtm) return -ENOMEM; @@ -1440,6 +1519,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) for (i = 0; i < cmn->num_xps; i++) { void __iomem *xp_region = cmn->base + xp_offset[i]; struct arm_cmn_node *xp = dn++; + unsigned int xp_ports = 0; arm_cmn_init_node_info(cmn, xp_offset[i], xp); /* @@ -1451,9 +1531,39 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (xp->id == (1 << 3)) cmn->mesh_x = xp->logid; - xp->dtc = 0xf; + if (cmn->model == CMN600) + xp->dtc = 0xf; + else + xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO); + xp->dtm = dtm - cmn->dtms; - arm_cmn_init_dtm(dtm++, xp); + arm_cmn_init_dtm(dtm++, xp, 0); + /* + * Keeping track of connected ports will let us filter out + * unnecessary XP events easily. We can also reliably infer the + * "extra device ports" configuration for the node ID format + * from this, since in that case we will see at least one XP + * with port 2 connected, for the HN-D. 
+ */ + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P0)) + xp_ports |= BIT(0); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P1)) + xp_ports |= BIT(1); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P2)) + xp_ports |= BIT(2); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P3)) + xp_ports |= BIT(3); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P4)) + xp_ports |= BIT(4); + if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P5)) + xp_ports |= BIT(5); + + if (cmn->multi_dtm && (xp_ports & 0xc)) + arm_cmn_init_dtm(dtm++, xp, 1); + if (cmn->multi_dtm && (xp_ports & 0x30)) + arm_cmn_init_dtm(dtm++, xp, 2); + + cmn->ports_used |= xp_ports; reg = readq_relaxed(xp_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); @@ -1489,11 +1599,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_SBSX: case CMN_TYPE_RNI: case CMN_TYPE_RND: + case CMN_TYPE_MTSX: case CMN_TYPE_CXRA: case CMN_TYPE_CXHA: dn++; break; /* Nothing to see here */ + case CMN_TYPE_MPAM_S: + case CMN_TYPE_MPAM_NS: case CMN_TYPE_RNSAM: case CMN_TYPE_CXLA: break; @@ -1513,6 +1626,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) if (dn) cmn->dns = dn; + sz = (void *)dtm - (void *)cmn->dtms; + dtm = devm_krealloc(cmn->dev, cmn->dtms, sz, GFP_KERNEL); + if (dtm) + cmn->dtms = dtm; + /* * If mesh_x wasn't set during discovery then we never saw * an XP at (0,1), thus we must have an Nx1 configuration. @@ -1521,9 +1639,15 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->mesh_x = cmn->num_xps; cmn->mesh_y = cmn->num_xps / cmn->mesh_x; + /* 1x1 config plays havoc with XP event encodings */ + if (cmn->num_xps == 1) + dev_warn(cmn->dev, "1x1 config not fully supported, translate XP events manually\n"); + dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev); - dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n", - cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn)); + reg = cmn->ports_used; + dev_dbg(cmn->dev, "mesh %dx%d, ID width %d, ports %6pbl%s\n", + cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn), ®, + cmn->multi_dtm ? ", multi-DTM" : ""); return 0; } From 795b4e4c4d7a7b9f0449ed99528dc503e243df9c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:45:02 +0000 Subject: [PATCH 650/737] perf/arm-cmn: Add CI-700 Support Add the identifiers and events for the CI-700 coherent interconnect. 
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/28f566ab23a83733c6c9ef9414c010b760b4549c.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit b2fea780c9282dbaf77ef081e6d97a3f2c0dfc6a) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 57 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 1d21f39cb7279..3468df6050c16 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -175,6 +175,7 @@ enum cmn_model { CMN_ANY = -1, CMN600 = 1, + CI700 = 2, }; /* CMN-600 r0px shouldn't exist in silicon, thankfully */ @@ -186,6 +187,9 @@ enum cmn_revision { CMN600_R2P0, CMN600_R3P0, CMN600_R3P1, + CI700_R0P0 = 0, + CI700_R1P0, + CI700_R2P0, }; enum cmn_node_type { @@ -401,8 +405,10 @@ struct arm_cmn_format_attr { static bool arm_cmn_is_occup_event(enum cmn_model model, enum cmn_node_type type, unsigned int id) { - return (type == CMN_TYPE_DVM && id == 0x05) || - (type == CMN_TYPE_HNF && id == 0x0f); + if (type == CMN_TYPE_DVM) + return (model == CMN600 && id == 0x05) || + (model == CI700 && id == 0x0c); + return type == CMN_TYPE_HNF && id == 0x0f; } static ssize_t arm_cmn_event_show(struct device *dev, @@ -497,14 +503,19 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, __CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)), \ __CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)), \ __CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)), \ - __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)) + __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)), \ + __CMN_EVENT_XP(p2_##_name, (_event) | (6 << 2)), \ + __CMN_EVENT_XP(p3_##_name, (_event) | (7 << 2)) /* Good thing there are only 3 fundamental XP events... 
*/ #define CMN_EVENT_XP(_name, _event) \ _CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)), \ _CMN_EVENT_XP(rsp_##_name, (_event) | (1 << 5)), \ _CMN_EVENT_XP(snp_##_name, (_event) | (2 << 5)), \ - _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)) + _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)), \ + _CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)), \ + _CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)), \ + _CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5)) static struct attribute *arm_cmn_event_attrs[] = { @@ -522,6 +533,20 @@ static struct attribute *arm_cmn_event_attrs[] = { _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0), _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1), _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2), + CMN_EVENT_DVM(CI700, dvmop_tlbi, 0x01), + CMN_EVENT_DVM(CI700, dvmop_bpi, 0x02), + CMN_EVENT_DVM(CI700, dvmop_pici, 0x03), + CMN_EVENT_DVM(CI700, dvmop_vici, 0x04), + CMN_EVENT_DVM(CI700, dvmsync, 0x05), + CMN_EVENT_DVM(CI700, vmid_filtered, 0x06), + CMN_EVENT_DVM(CI700, rndop_filtered, 0x07), + CMN_EVENT_DVM(CI700, retry, 0x08), + CMN_EVENT_DVM(CI700, txsnp_flitv, 0x09), + CMN_EVENT_DVM(CI700, txsnp_stall, 0x0a), + CMN_EVENT_DVM(CI700, trkfull, 0x0b), + _CMN_EVENT_DVM(CI700, trk_occupancy_all, 0x0c, 0), + _CMN_EVENT_DVM(CI700, trk_occupancy_dvmop, 0x0c, 1), + _CMN_EVENT_DVM(CI700, trk_occupancy_dvmsync, 0x0c, 2), CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), @@ -558,6 +583,9 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_HNF(CMN_ANY, stash_snp_sent, 0x1d), CMN_EVENT_HNF(CMN_ANY, stash_data_pull, 0x1e), CMN_EVENT_HNF(CMN_ANY, snp_fwded, 0x1f), + CMN_EVENT_HNF(CI700, atomic_fwd, 0x20), + CMN_EVENT_HNF(CI700, mpam_hardlim, 0x21), + CMN_EVENT_HNF(CI700, mpam_softlim, 0x22), CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), @@ -598,6 +626,7 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl, 0x14), CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15), CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16), + CMN_EVENT_SBSX(CI700, rdb_occ_cnt_ovfl, 0x17), CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready, 0x21), CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready, 0x22), CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready, 0x23), @@ -624,6 +653,25 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_RNID(CMN600, rdb_replay, 0x12), CMN_EVENT_RNID(CMN600, rdb_hybrid, 0x13), CMN_EVENT_RNID(CMN600, rdb_ord, 0x14), + CMN_EVENT_RNID(CI700, padb_occ_ovfl, 0x11), + CMN_EVENT_RNID(CI700, rpdb_occ_ovfl, 0x12), + CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice1, 0x13), + CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice2, 0x14), + CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice3, 0x15), + CMN_EVENT_RNID(CI700, wrt_throttled, 0x16), + + CMN_EVENT_MTSX(tc_lookup, 0x01), + CMN_EVENT_MTSX(tc_fill, 0x02), + CMN_EVENT_MTSX(tc_miss, 0x03), + CMN_EVENT_MTSX(tdb_forward, 0x04), + CMN_EVENT_MTSX(tcq_hazard, 0x05), + CMN_EVENT_MTSX(tcq_rd_alloc, 0x06), + CMN_EVENT_MTSX(tcq_wr_alloc, 0x07), + CMN_EVENT_MTSX(tcq_cmo_alloc, 0x08), + CMN_EVENT_MTSX(axi_rd_req, 0x09), + CMN_EVENT_MTSX(axi_wr_req, 0x0a), + CMN_EVENT_MTSX(tcq_occ_cnt_ovfl, 0x0b), + CMN_EVENT_MTSX(tdb_occ_cnt_ovfl, 0x0c), NULL }; @@ -1774,6 +1822,7 @@ static int arm_cmn_remove(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arm_cmn_of_match[] = { { .compatible = "arm,cmn-600", .data = (void *)CMN600 }, + { .compatible = "arm,ci-700", .data = (void *)CI700 }, {} }; 
MODULE_DEVICE_TABLE(of, arm_cmn_of_match); From 8cc0bcbc5b4f38cd96e3ac77555ce2fcef70aa71 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 3 Dec 2021 11:45:03 +0000 Subject: [PATCH 651/737] perf/arm-cmn: Add debugfs topology info In general, detailed performance analysis will require knoweldge of the the SoC beyond the CMN itself - e.g. which actual CPUs/peripherals/etc. are connected to each node. However for certain development and bringup tasks it can be useful to have a quick overview of the CMN internal topology to hand too. Add a debugfs file to map this out. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/159fd4d7e19fb3c8801a8cb64ee73ec50f55903c.1638530442.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit a88fa6c28b867a387e3af202d6dbbb754d3aa2f1) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 151 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 3468df6050c16..5c3b22c235929 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -287,6 +288,7 @@ struct arm_cmn { struct hlist_node cpuhp_node; struct pmu pmu; + struct dentry *debug; }; #define to_cmn(p) container_of(p, struct arm_cmn, pmu) @@ -351,6 +353,140 @@ static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, return NULL; } +struct dentry *arm_cmn_debugfs; + +#ifdef CONFIG_DEBUG_FS +static const char *arm_cmn_device_type(u8 type) +{ + switch(type) { + case 0x01: return " RN-I |"; + case 0x02: return " RN-D |"; + case 0x04: return " RN-F_B |"; + case 0x05: return "RN-F_B_E|"; + case 0x06: return " RN-F_A |"; + case 0x07: return "RN-F_A_E|"; + case 0x08: return " HN-T |"; + case 0x09: return " HN-I |"; + case 0x0a: return " HN-D |"; + case 0x0c: return " SN-F |"; + case 0x0d: return " SBSX |"; + case 0x0e: return " HN-F |"; + case 0x0f: return " SN-F_E |"; + case 0x10: return " SN-F_D |"; + case 0x11: return " CXHA |"; + case 0x12: return " CXRA |"; + case 0x13: return " CXRH |"; + case 0x14: return " RN-F_D |"; + case 0x15: return "RN-F_D_E|"; + case 0x16: return " RN-F_C |"; + case 0x17: return "RN-F_C_E|"; + case 0x1c: return " MTSX |"; + default: return " |"; + } +} + +static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) +{ + struct arm_cmn *cmn = s->private; + struct arm_cmn_node *dn; + + for (dn = cmn->dns; dn->type; dn++) { + struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + + if (dn->type == CMN_TYPE_XP) + continue; + /* Ignore the extra components that will overlap on some ports */ + if (dn->type < CMN_TYPE_HNI) + continue; + + if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d) + continue; + + seq_printf(s, " #%-2d |", dn->logid); + return; + } + seq_puts(s, " |"); +} + +static int arm_cmn_map_show(struct seq_file *s, void *data) +{ + struct arm_cmn *cmn = s->private; + int x, y, p, pmax = fls(cmn->ports_used); + + seq_puts(s, " X"); + for (x = 0; x < cmn->mesh_x; x++) + seq_printf(s, " %d ", x); + seq_puts(s, "\nY P D+"); + y = cmn->mesh_y; + while (y--) { + int xp_base = cmn->mesh_x * y; + u8 port[6][CMN_MAX_DIMENSION]; + + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "--------+"); + + seq_printf(s, "\n%d |", y); + for (x = 0; x < cmn->mesh_x; x++) { + struct arm_cmn_node *xp = cmn->xps + xp_base + x; + void __iomem *base = xp->pmu_base - CMN_PMU_OFFSET; + + port[0][x] = readl_relaxed(base + 
CMN_MXP__CONNECT_INFO_P0); + port[1][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P1); + port[2][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P2); + port[3][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P3); + port[4][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P4); + port[5][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P5); + seq_printf(s, " XP #%-2d |", xp_base + x); + } + + seq_puts(s, "\n |"); + for (x = 0; x < cmn->mesh_x; x++) { + u8 dtc = cmn->xps[xp_base + x].dtc; + + if (dtc & (dtc - 1)) + seq_puts(s, " DTC ?? |"); + else + seq_printf(s, " DTC %ld |", __ffs(dtc)); + } + seq_puts(s, "\n |"); + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "........|"); + + for (p = 0; p < pmax; p++) { + seq_printf(s, "\n %d |", p); + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, arm_cmn_device_type(port[p][x])); + seq_puts(s, "\n 0|"); + for (x = 0; x < cmn->mesh_x; x++) + arm_cmn_show_logid(s, x, y, p, 0); + seq_puts(s, "\n 1|"); + for (x = 0; x < cmn->mesh_x; x++) + arm_cmn_show_logid(s, x, y, p, 1); + } + seq_puts(s, "\n-----+"); + } + for (x = 0; x < cmn->mesh_x; x++) + seq_puts(s, "--------+"); + seq_puts(s, "\n"); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(arm_cmn_map); + +static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) +{ + const char *name = "map"; + + if (id > 0) + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "map_%d", id); + if (!name) + return; + + cmn->debug = debugfs_create_file(name, 0444, arm_cmn_debugfs, cmn, &arm_cmn_map_fops); +} +#else +static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} +#endif + struct arm_cmn_hw_event { struct arm_cmn_node *dn; u64 dtm_idx[2]; @@ -1739,7 +1875,7 @@ static int arm_cmn_probe(struct platform_device *pdev) struct arm_cmn *cmn; const char *name; static atomic_t id; - int err, rootnode; + int err, rootnode, this_id; cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL); if (!cmn) @@ -1793,7 +1929,8 @@ static int arm_cmn_probe(struct platform_device *pdev) .cancel_txn = arm_cmn_end_txn, }; - name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", atomic_fetch_inc(&id)); + this_id = atomic_fetch_inc(&id); + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id); if (!name) return -ENOMEM; @@ -1804,6 +1941,8 @@ static int arm_cmn_probe(struct platform_device *pdev) err = perf_pmu_register(&cmn->pmu, name, -1); if (err) cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + else + arm_cmn_debugfs_init(cmn, this_id); return err; } @@ -1816,6 +1955,7 @@ static int arm_cmn_remove(struct platform_device *pdev) perf_pmu_unregister(&cmn->pmu); cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); + debugfs_remove(cmn->debug); return 0; } @@ -1858,9 +1998,13 @@ static int __init arm_cmn_init(void) return ret; arm_cmn_hp_state = ret; + arm_cmn_debugfs = debugfs_create_dir("arm-cmn", NULL); + ret = platform_driver_register(&arm_cmn_driver); - if (ret) + if (ret) { cpuhp_remove_multi_state(arm_cmn_hp_state); + debugfs_remove(arm_cmn_debugfs); + } return ret; } @@ -1868,6 +2012,7 @@ static void __exit arm_cmn_exit(void) { platform_driver_unregister(&arm_cmn_driver); cpuhp_remove_multi_state(arm_cmn_hp_state); + debugfs_remove(arm_cmn_debugfs); } module_init(arm_cmn_init); From da4d5263a1e052774192b21b9e678e7d9039cddc Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 3 Feb 2022 18:01:18 +0000 Subject: [PATCH 652/737] perf/arm-cmn: Make arm_cmn_debugfs static Indeed our debugfs directory is driver-internal so should be static. 
Link: https://lore.kernel.org/r/202202030812.II1K2ZXf-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Robin Murphy Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/ca9248caaae69b5134f69e085fe78905dfe74378.1643911278.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 6f75217b20a768c72fb8bb999e25a95673fe0174) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 5c3b22c235929..605881e271902 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -353,7 +353,7 @@ static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, return NULL; } -struct dentry *arm_cmn_debugfs; +static struct dentry *arm_cmn_debugfs; #ifdef CONFIG_DEBUG_FS static const char *arm_cmn_device_type(u8 type) From 5ecd6c184b49c7faae1f71e34ab5c84c3062a737 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 24 Feb 2022 18:41:21 +0000 Subject: [PATCH 653/737] perf/arm-cmn: Hide XP PUB events for CMN-600 CMN-600 doesn't have XP events for the PUB channel, but we missed the appropriate check to avoid exposing them. Fixes: 60d1504070c2 ("perf/arm-cmn: Support new IP features") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/4c108d39a0513def63acccf09ab52b328f242aeb.1645727871.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 205295c7e1abba9c1db1f9fe075f22f71351887f) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 605881e271902..bdce545980f5c 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -595,6 +595,9 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3))) return 0; + if (chan == 4 && cmn->model == CMN600) + return 0; + if ((chan == 5 && cmn->rsp_vc_num < 2) || (chan == 6 && cmn->dat_vc_num < 2)) return 0; From 994cb82191e1eff9eb7c98271d5d04c7f9d118d0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 24 Feb 2022 18:41:22 +0000 Subject: [PATCH 654/737] perf/arm-cmn: Update watchpoint format From CMN-650 onwards, some of the fields in the watchpoint config registers moved subtly enough to easily overlook. Watchpoint events are still only partially supported on newer IPs - which in itself deserves noting - but were not intended to become any *less* functional than on CMN-600. 
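
As a simplified sketch of the per-model selection (made-up example_* helper; the real arm_cmn_wp_config() also keeps the combine-without-group rule and the remaining watchpoint fields): the exclusive and combine controls sit at bits 5 and 6 on CMN-600 but at bits 8 and 9 on the later IPs.

  static unsigned long example_wp_flags(int is_cmn600, int exclusive, int combine)
  {
          unsigned long config = 0;

          /* Same controls, different bit positions per model */
          if (exclusive)
                  config |= is_cmn600 ? (1UL << 5) : (1UL << 8);
          if (combine)
                  config |= is_cmn600 ? (1UL << 6) : (1UL << 9);
          return config;
  }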
Fixes: 60d1504070c2 ("perf/arm-cmn: Support new IP features") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e1ce4c2f1e4f73ab1c60c3a85e4037cd62dd6352.1645727871.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 31fac565773981df43f018b2dbfbc7a3164f4b6c) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index bdce545980f5c..deabd4c564e62 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -71,9 +71,11 @@ #define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) #define CMN_DTM_WPn_CONFIG(n) (CMN_DTM_WPn(n) + 0x00) #define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2 GENMASK_ULL(18,17) -#define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(6) -#define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(5) -#define CMN_DTM_WPn_CONFIG_WP_GRP BIT(4) +#define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(9) +#define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(8) +#define CMN600_WPn_CONFIG_WP_COMBINE BIT(6) +#define CMN600_WPn_CONFIG_WP_EXCLUSIVE BIT(5) +#define CMN_DTM_WPn_CONFIG_WP_GRP GENMASK_ULL(5, 4) #define CMN_DTM_WPn_CONFIG_WP_CHN_SEL GENMASK_ULL(3, 1) #define CMN_DTM_WPn_CONFIG_WP_DEV_SEL BIT(0) #define CMN_DTM_WPn_VAL(n) (CMN_DTM_WPn(n) + 0x08) @@ -155,6 +157,7 @@ #define CMN_CONFIG_WP_COMBINE GENMASK_ULL(27, 24) #define CMN_CONFIG_WP_DEV_SEL GENMASK_ULL(50, 48) #define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(55, 51) +/* Note that we don't yet support the tertiary match group on newer IPs */ #define CMN_CONFIG_WP_GRP BIT_ULL(56) #define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(57) #define CMN_CONFIG1_WP_VAL GENMASK_ULL(63, 0) @@ -908,15 +911,18 @@ static u32 arm_cmn_wp_config(struct perf_event *event) u32 grp = CMN_EVENT_WP_GRP(event); u32 exc = CMN_EVENT_WP_EXCLUSIVE(event); u32 combine = CMN_EVENT_WP_COMBINE(event); + bool is_cmn600 = to_cmn(event->pmu)->model == CMN600; config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) | - FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc) | FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL2, dev >> 1); + if (exc) + config |= is_cmn600 ? CMN600_WPn_CONFIG_WP_EXCLUSIVE : + CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE; if (combine && !grp) - config |= CMN_DTM_WPn_CONFIG_WP_COMBINE; - + config |= is_cmn600 ? CMN600_WPn_CONFIG_WP_COMBINE : + CMN_DTM_WPn_CONFIG_WP_COMBINE; return config; } From f3343dae83552c7375ecfea385591a674b18e014 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 18 Apr 2022 23:57:39 +0100 Subject: [PATCH 655/737] perf/arm-cmn: Add CMN-650 support Add the identifiers and events for CMN-650, which slots into its evolutionary position between CMN-600 and the 700-series products. Imagine CMN-600 made bigger, and with most of the rough edges smoothed off, but that then balanced out by some bonkers PMU functionality for the new HN-P enhancement in CMN-650r2. Most of the CXG events are actually common to newer revisions of CMN-600 too, so they're arguably a little late; oh well. 
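
One mechanism worth calling out up front is the model-matching bitmap trick relied on below; here is a minimal illustration with made-up EX_* names standing in for the real enum cmn_model values. CMN_ANY is all-ones and so matches every product, while NOT_CMN600 is all-ones minus the CMN-600 bit, so events tagged that way show up on CMN-650 and CI-700 but stay hidden on CMN-600.

  enum example_model {
          EX_CMN600       = 1,
          EX_CMN650       = 2,
          EX_CI700        = 8,
          EX_CMN_ANY      = -1,    /* all bits set: matches any model */
          EX_NOT_CMN600   = -2,    /* all bits except CMN-600's */
  };

  static int example_event_visible(int event_models, int hw_model)
  {
          /* e.g. (EX_NOT_CMN600 & EX_CMN650) != 0, but (EX_NOT_CMN600 & EX_CMN600) == 0 */
          return (event_models & hw_model) != 0;
  }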
Signed-off-by: Robin Murphy Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/b0adc5824db53f71a2b561c293e2120390106536.1650320598.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 8e504d93acb647c0db31ba13ba11b510bbab4174) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 222 ++++++++++++++++++++++++++++++++--------- 1 file changed, 176 insertions(+), 46 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index deabd4c564e62..630345ac2cb1f 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -39,7 +39,7 @@ #define CMN_CHILD_NODE_ADDR GENMASK(27, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) -#define CMN_MAX_DIMENSION 8 +#define CMN_MAX_DIMENSION 12 #define CMN_MAX_XPS (CMN_MAX_DIMENSION * CMN_MAX_DIMENSION) #define CMN_MAX_DTMS (CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4) @@ -65,7 +65,9 @@ /* For most nodes, this is all there is */ #define CMN_PMU_EVENT_SEL 0x000 -#define CMN_PMU_EVENTn_ID_SHIFT(n) ((n) * 8) + +/* HN-Ps are weird... */ +#define CMN_HNP_PMU_EVENT_SEL 0x008 /* DTMs live in the PMU space of XP registers */ #define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) @@ -177,9 +179,12 @@ enum cmn_model { - CMN_ANY = -1, CMN600 = 1, - CI700 = 2, + CMN650 = 2, + CI700 = 8, + /* ...and then we can use bitmap tricks for commonality */ + CMN_ANY = -1, + NOT_CMN600 = -2, }; /* CMN-600 r0px shouldn't exist in silicon, thankfully */ @@ -191,6 +196,11 @@ enum cmn_revision { CMN600_R2P0, CMN600_R3P0, CMN600_R3P1, + CMN650_R0P0 = 0, + CMN650_R1P0, + CMN650_R1P1, + CMN650_R2P0, + CMN650_R1P2, CI700_R0P0 = 0, CI700_R1P0, CI700_R2P0, @@ -211,6 +221,7 @@ enum cmn_node_type { CMN_TYPE_RND = 0xd, CMN_TYPE_RNSAM = 0xf, CMN_TYPE_MTSX, + CMN_TYPE_HNP, CMN_TYPE_CXRA = 0x100, CMN_TYPE_CXHA = 0x101, CMN_TYPE_CXLA = 0x102, @@ -307,9 +318,7 @@ struct arm_cmn_nodeid { static int arm_cmn_xyidbits(const struct arm_cmn *cmn) { - int dim = max(cmn->mesh_x, cmn->mesh_y); - - return dim > 4 ? 3 : 2; + return fls((cmn->mesh_x - 1) | (cmn->mesh_y - 1) | 2); } static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id) @@ -362,6 +371,7 @@ static struct dentry *arm_cmn_debugfs; static const char *arm_cmn_device_type(u8 type) { switch(type) { + case 0x00: return " |"; case 0x01: return " RN-I |"; case 0x02: return " RN-D |"; case 0x04: return " RN-F_B |"; @@ -371,6 +381,7 @@ static const char *arm_cmn_device_type(u8 type) case 0x08: return " HN-T |"; case 0x09: return " HN-I |"; case 0x0a: return " HN-D |"; + case 0x0b: return " HN-P |"; case 0x0c: return " SN-F |"; case 0x0d: return " SBSX |"; case 0x0e: return " HN-F |"; @@ -383,8 +394,10 @@ static const char *arm_cmn_device_type(u8 type) case 0x15: return "RN-F_D_E|"; case 0x16: return " RN-F_C |"; case 0x17: return "RN-F_C_E|"; + case 0x18: return " RN-F_E |"; + case 0x19: return "RN-F_E_E|"; case 0x1c: return " MTSX |"; - default: return " |"; + default: return " ???? |"; } } @@ -492,7 +505,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} struct arm_cmn_hw_event { struct arm_cmn_node *dn; - u64 dtm_idx[2]; + u64 dtm_idx[4]; unsigned int dtc_idx; u8 dtcs_used; u8 num_dns; @@ -545,8 +558,7 @@ static bool arm_cmn_is_occup_event(enum cmn_model model, enum cmn_node_type type, unsigned int id) { if (type == CMN_TYPE_DVM) - return (model == CMN600 && id == 0x05) || - (model == CI700 && id == 0x0c); + return model == CMN600 ? 
id == 0x05 : id == 0x0c; return type == CMN_TYPE_HNF && id == 0x0f; } @@ -580,20 +592,25 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev)); struct arm_cmn_event_attr *eattr; + enum cmn_node_type type; + u16 eventid; eattr = container_of(attr, typeof(*eattr), attr.attr); if (!(eattr->model & cmn->model)) return 0; + type = eattr->type; + eventid = eattr->eventid; + /* Watchpoints aren't nodes, so avoid confusion */ - if (eattr->type == CMN_TYPE_WP) + if (type == CMN_TYPE_WP) return attr->mode; /* Hide XP events for unused interfaces/channels */ - if (eattr->type == CMN_TYPE_XP) { - unsigned int intf = (eattr->eventid >> 2) & 7; - unsigned int chan = eattr->eventid >> 5; + if (type == CMN_TYPE_XP) { + unsigned int intf = (eventid >> 2) & 7; + unsigned int chan = eventid >> 5; if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3))) return 0; @@ -607,12 +624,29 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, } /* Revision-specific differences */ - if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) { - if (eattr->type == CMN_TYPE_HNF && eattr->eventid == 0x1b) - return 0; + if (cmn->model == CMN600) { + if (cmn->rev < CMN600_R1P3) { + if (type == CMN_TYPE_CXRA && eventid > 0x10) + return 0; + } + if (cmn->rev < CMN600_R1P2) { + if (type == CMN_TYPE_HNF && eventid == 0x1b) + return 0; + if (type == CMN_TYPE_CXRA || type == CMN_TYPE_CXHA) + return 0; + } + } else if (cmn->model == CMN650) { + if (cmn->rev < CMN650_R2P0 || cmn->rev == CMN650_R1P2) { + if (type == CMN_TYPE_HNF && eventid > 0x22) + return 0; + if (type == CMN_TYPE_SBSX && eventid == 0x17) + return 0; + if (type == CMN_TYPE_RNI && eventid > 0x10) + return 0; + } } - if (!arm_cmn_node(cmn, eattr->type)) + if (!arm_cmn_node(cmn, type)) return 0; return attr->mode; @@ -626,6 +660,8 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup) #define CMN_EVENT_HNI(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event, 0) +#define CMN_EVENT_HNP(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, hnp_##_name, CMN_TYPE_HNP, _event, 0) #define __CMN_EVENT_XP(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event, 0) #define CMN_EVENT_SBSX(_model, _name, _event) \ @@ -634,6 +670,10 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event, 0) #define CMN_EVENT_MTSX(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event, 0) +#define CMN_EVENT_CXRA(_model, _name, _event) \ + CMN_EVENT_ATTR(_model, cxra_##_name, CMN_TYPE_CXRA, _event, 0) +#define CMN_EVENT_CXHA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, cxha_##_name, CMN_TYPE_CXHA, _event, 0) #define CMN_EVENT_DVM(_model, _name, _event) \ _CMN_EVENT_DVM(_model, _name, _event, 0) @@ -675,20 +715,20 @@ static struct attribute *arm_cmn_event_attrs[] = { _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0), _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1), _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2), - CMN_EVENT_DVM(CI700, dvmop_tlbi, 0x01), - CMN_EVENT_DVM(CI700, dvmop_bpi, 0x02), - CMN_EVENT_DVM(CI700, dvmop_pici, 0x03), - CMN_EVENT_DVM(CI700, dvmop_vici, 0x04), - CMN_EVENT_DVM(CI700, dvmsync, 0x05), - CMN_EVENT_DVM(CI700, vmid_filtered, 0x06), - CMN_EVENT_DVM(CI700, rndop_filtered, 0x07), - CMN_EVENT_DVM(CI700, retry, 0x08), - 
CMN_EVENT_DVM(CI700, txsnp_flitv, 0x09), - CMN_EVENT_DVM(CI700, txsnp_stall, 0x0a), - CMN_EVENT_DVM(CI700, trkfull, 0x0b), - _CMN_EVENT_DVM(CI700, trk_occupancy_all, 0x0c, 0), - _CMN_EVENT_DVM(CI700, trk_occupancy_dvmop, 0x0c, 1), - _CMN_EVENT_DVM(CI700, trk_occupancy_dvmsync, 0x0c, 2), + CMN_EVENT_DVM(NOT_CMN600, dvmop_tlbi, 0x01), + CMN_EVENT_DVM(NOT_CMN600, dvmop_bpi, 0x02), + CMN_EVENT_DVM(NOT_CMN600, dvmop_pici, 0x03), + CMN_EVENT_DVM(NOT_CMN600, dvmop_vici, 0x04), + CMN_EVENT_DVM(NOT_CMN600, dvmsync, 0x05), + CMN_EVENT_DVM(NOT_CMN600, vmid_filtered, 0x06), + CMN_EVENT_DVM(NOT_CMN600, rndop_filtered, 0x07), + CMN_EVENT_DVM(NOT_CMN600, retry, 0x08), + CMN_EVENT_DVM(NOT_CMN600, txsnp_flitv, 0x09), + CMN_EVENT_DVM(NOT_CMN600, txsnp_stall, 0x0a), + CMN_EVENT_DVM(NOT_CMN600, trkfull, 0x0b), + _CMN_EVENT_DVM(NOT_CMN600, trk_occupancy_all, 0x0c, 0), + _CMN_EVENT_DVM(NOT_CMN600, trk_occupancy_dvmop, 0x0c, 1), + _CMN_EVENT_DVM(NOT_CMN600, trk_occupancy_dvmsync, 0x0c, 2), CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), @@ -725,9 +765,12 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_HNF(CMN_ANY, stash_snp_sent, 0x1d), CMN_EVENT_HNF(CMN_ANY, stash_data_pull, 0x1e), CMN_EVENT_HNF(CMN_ANY, snp_fwded, 0x1f), - CMN_EVENT_HNF(CI700, atomic_fwd, 0x20), - CMN_EVENT_HNF(CI700, mpam_hardlim, 0x21), - CMN_EVENT_HNF(CI700, mpam_softlim, 0x22), + CMN_EVENT_HNF(NOT_CMN600, atomic_fwd, 0x20), + CMN_EVENT_HNF(NOT_CMN600, mpam_hardlim, 0x21), + CMN_EVENT_HNF(NOT_CMN600, mpam_softlim, 0x22), + CMN_EVENT_HNF(CMN650, snp_sent_cluster, 0x23), + CMN_EVENT_HNF(CMN650, sf_imprecise_evict, 0x24), + CMN_EVENT_HNF(CMN650, sf_evict_shared_line, 0x25), CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), @@ -749,6 +792,27 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_HNI(nonpcie_serialization, 0x31), CMN_EVENT_HNI(pcie_serialization, 0x32), + /* + * HN-P events squat on top of the HN-I similarly to DVM events, except + * for being crammed into the same physical node as well. And of course + * where would the fun be if the same events were in the same order... 
+ */ + CMN_EVENT_HNP(rrt_wr_occ_cnt_ovfl, 0x01), + CMN_EVENT_HNP(rdt_wr_occ_cnt_ovfl, 0x02), + CMN_EVENT_HNP(wdb_occ_cnt_ovfl, 0x03), + CMN_EVENT_HNP(rrt_wr_alloc, 0x04), + CMN_EVENT_HNP(rdt_wr_alloc, 0x05), + CMN_EVENT_HNP(wdb_alloc, 0x06), + CMN_EVENT_HNP(awvalid_no_awready, 0x07), + CMN_EVENT_HNP(awready_no_awvalid, 0x08), + CMN_EVENT_HNP(wvalid_no_wready, 0x09), + CMN_EVENT_HNP(rrt_rd_occ_cnt_ovfl, 0x11), + CMN_EVENT_HNP(rdt_rd_occ_cnt_ovfl, 0x12), + CMN_EVENT_HNP(rrt_rd_alloc, 0x13), + CMN_EVENT_HNP(rdt_rd_alloc, 0x14), + CMN_EVENT_HNP(arvalid_no_arready, 0x15), + CMN_EVENT_HNP(arready_no_arvalid, 0x16), + CMN_EVENT_XP(txflit_valid, 0x01), CMN_EVENT_XP(txflit_stall, 0x02), CMN_EVENT_XP(partial_dat_flit, 0x03), @@ -768,7 +832,7 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl, 0x14), CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15), CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16), - CMN_EVENT_SBSX(CI700, rdb_occ_cnt_ovfl, 0x17), + CMN_EVENT_SBSX(NOT_CMN600, rdb_occ_cnt_ovfl, 0x17), CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready, 0x21), CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready, 0x22), CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready, 0x23), @@ -795,12 +859,12 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_RNID(CMN600, rdb_replay, 0x12), CMN_EVENT_RNID(CMN600, rdb_hybrid, 0x13), CMN_EVENT_RNID(CMN600, rdb_ord, 0x14), - CMN_EVENT_RNID(CI700, padb_occ_ovfl, 0x11), - CMN_EVENT_RNID(CI700, rpdb_occ_ovfl, 0x12), - CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice1, 0x13), - CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice2, 0x14), - CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice3, 0x15), - CMN_EVENT_RNID(CI700, wrt_throttled, 0x16), + CMN_EVENT_RNID(NOT_CMN600, padb_occ_ovfl, 0x11), + CMN_EVENT_RNID(NOT_CMN600, rpdb_occ_ovfl, 0x12), + CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice1, 0x13), + CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice2, 0x14), + CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice3, 0x15), + CMN_EVENT_RNID(NOT_CMN600, wrt_throttled, 0x16), CMN_EVENT_MTSX(tc_lookup, 0x01), CMN_EVENT_MTSX(tc_fill, 0x02), @@ -815,6 +879,42 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_MTSX(tcq_occ_cnt_ovfl, 0x0b), CMN_EVENT_MTSX(tdb_occ_cnt_ovfl, 0x0c), + CMN_EVENT_CXRA(CMN_ANY, rht_occ, 0x01), + CMN_EVENT_CXRA(CMN_ANY, sht_occ, 0x02), + CMN_EVENT_CXRA(CMN_ANY, rdb_occ, 0x03), + CMN_EVENT_CXRA(CMN_ANY, wdb_occ, 0x04), + CMN_EVENT_CXRA(CMN_ANY, ssb_occ, 0x05), + CMN_EVENT_CXRA(CMN_ANY, snp_bcasts, 0x06), + CMN_EVENT_CXRA(CMN_ANY, req_chains, 0x07), + CMN_EVENT_CXRA(CMN_ANY, req_chain_avglen, 0x08), + CMN_EVENT_CXRA(CMN_ANY, chirsp_stalls, 0x09), + CMN_EVENT_CXRA(CMN_ANY, chidat_stalls, 0x0a), + CMN_EVENT_CXRA(CMN_ANY, cxreq_pcrd_stalls_link0, 0x0b), + CMN_EVENT_CXRA(CMN_ANY, cxreq_pcrd_stalls_link1, 0x0c), + CMN_EVENT_CXRA(CMN_ANY, cxreq_pcrd_stalls_link2, 0x0d), + CMN_EVENT_CXRA(CMN_ANY, cxdat_pcrd_stalls_link0, 0x0e), + CMN_EVENT_CXRA(CMN_ANY, cxdat_pcrd_stalls_link1, 0x0f), + CMN_EVENT_CXRA(CMN_ANY, cxdat_pcrd_stalls_link2, 0x10), + CMN_EVENT_CXRA(CMN_ANY, external_chirsp_stalls, 0x11), + CMN_EVENT_CXRA(CMN_ANY, external_chidat_stalls, 0x12), + CMN_EVENT_CXRA(NOT_CMN600, cxmisc_pcrd_stalls_link0, 0x13), + CMN_EVENT_CXRA(NOT_CMN600, cxmisc_pcrd_stalls_link1, 0x14), + CMN_EVENT_CXRA(NOT_CMN600, cxmisc_pcrd_stalls_link2, 0x15), + + CMN_EVENT_CXHA(rddatbyp, 0x21), + CMN_EVENT_CXHA(chirsp_up_stall, 0x22), + CMN_EVENT_CXHA(chidat_up_stall, 0x23), + CMN_EVENT_CXHA(snppcrd_link0_stall, 0x24), + CMN_EVENT_CXHA(snppcrd_link1_stall, 
0x25), + CMN_EVENT_CXHA(snppcrd_link2_stall, 0x26), + CMN_EVENT_CXHA(reqtrk_occ, 0x27), + CMN_EVENT_CXHA(rdb_occ, 0x28), + CMN_EVENT_CXHA(rdbyp_occ, 0x29), + CMN_EVENT_CXHA(wdb_occ, 0x2a), + CMN_EVENT_CXHA(snptrk_occ, 0x2b), + CMN_EVENT_CXHA(sdb_occ, 0x2c), + CMN_EVENT_CXHA(snphaz_occ, 0x2d), + NULL }; @@ -1653,6 +1753,16 @@ static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_c node->type, node->logid, offset); } +static enum cmn_node_type arm_cmn_subtype(enum cmn_node_type type) +{ + switch (type) { + case CMN_TYPE_HNP: + return CMN_TYPE_HNI; + default: + return CMN_TYPE_INVALID; + } +} + static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) { void __iomem *cfg_region; @@ -1693,8 +1803,13 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg); } - /* Cheeky +1 to help terminate pointer-based iteration later */ - dn = devm_kcalloc(cmn->dev, cmn->num_dns + 1, sizeof(*dn), GFP_KERNEL); + /* + * Some nodes effectively have two separate types, which we'll handle + * by creating one of each internally. For a (very) safe initial upper + * bound, account for double the number of non-XP nodes. + */ + dn = devm_kcalloc(cmn->dev, cmn->num_dns * 2 - cmn->num_xps, + sizeof(*dn), GFP_KERNEL); if (!dn) return -ENOMEM; @@ -1803,6 +1918,18 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_RNSAM: case CMN_TYPE_CXLA: break; + /* + * Split "optimised" combination nodes into separate + * types for the different event sets. Offsetting the + * base address lets us handle the second pmu_event_sel + * register via the normal mechanism later. + */ + case CMN_TYPE_HNP: + dn[1] = dn[0]; + dn[0].pmu_base += CMN_HNP_PMU_EVENT_SEL; + dn[1].type = arm_cmn_subtype(dn->type); + dn += 2; + break; /* Something has gone horribly wrong */ default: dev_err(cmn->dev, "invalid device node type: 0x%x\n", dn->type); @@ -1811,9 +1938,10 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) } } - /* Correct for any nodes we skipped */ + /* Correct for any nodes we added or skipped */ cmn->num_dns = dn - cmn->dns; + /* Cheeky +1 to help terminate pointer-based iteration later */ sz = (void *)(dn + 1) - (void *)cmn->dns; dn = devm_krealloc(cmn->dev, cmn->dns, sz, GFP_KERNEL); if (dn) @@ -1971,6 +2099,7 @@ static int arm_cmn_remove(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arm_cmn_of_match[] = { { .compatible = "arm,cmn-600", .data = (void *)CMN600 }, + { .compatible = "arm,cmn-650", .data = (void *)CMN650 }, { .compatible = "arm,ci-700", .data = (void *)CI700 }, {} }; @@ -1980,6 +2109,7 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match); #ifdef CONFIG_ACPI static const struct acpi_device_id arm_cmn_acpi_match[] = { { "ARMHC600", CMN600 }, + { "ARMHC650", CMN650 }, {} }; MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match); From d651ed532cddf97dfe6e8f72e1f0b2b06c43b988 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 18 Apr 2022 23:57:40 +0100 Subject: [PATCH 656/737] perf/arm-cmn: Refactor occupancy filter selector So far, DNs and HN-Fs have each had one event ralated to occupancy trackers which are filtered by a separate field. CMN-700 raises the stakes by introducing two more sets of HN-F events with corresponding additional filter fields. Prepare for this by refactoring our filter selection and tracking logic to account for multiple filter types coexisting on the same node. 
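
A minimal sketch of that tracking (illustrative example_* names only, leaving out the actual event_sel programming): each node keeps a value plus a reference count per filter selector, so events may share the node only while they agree on the value for every selector they touch.

  struct example_filter {
          unsigned char val : 4;
          unsigned char count : 4;
  };

  static int example_claim_filter(struct example_filter *f, unsigned char val)
  {
          if (f->count && f->val != val)
                  return -1;       /* conflicting filter value: can't co-schedule */
          f->val = val;
          f->count++;
          return 0;
  }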
This need not affect the uAPI, which can just continue to encode any per-event filter setting in the "occupid" config field, even if it's technically not the most accurate name for some of them. Signed-off-by: Robin Murphy Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/1aa47ba0455b144c416537f6b0e58dc93b467a00.1650320598.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 65adf71398f5af9a591dc1b7eccac123f992d97a) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 170 ++++++++++++++++++++++++----------------- 1 file changed, 98 insertions(+), 72 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 630345ac2cb1f..7e80ef84fae41 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -65,6 +65,8 @@ /* For most nodes, this is all there is */ #define CMN_PMU_EVENT_SEL 0x000 +/* Technically this is 4 bits wide on DNs, but we only use 2 there anyway */ +#define CMN__PMU_OCCUP1_ID GENMASK_ULL(34, 32) /* HN-Ps are weird... */ #define CMN_HNP_PMU_EVENT_SEL 0x008 @@ -229,6 +231,12 @@ enum cmn_node_type { CMN_TYPE_WP = 0x7770 }; +enum cmn_filter_select { + SEL_NONE = -1, + SEL_OCCUP1ID, + SEL_MAX +}; + struct arm_cmn_node { void __iomem *pmu_base; u16 id, logid; @@ -238,9 +246,9 @@ struct arm_cmn_node { union { /* DN/HN-F/CXHA */ struct { - u8 occupid_val; - u8 occupid_count; - }; + u8 val : 4; + u8 count : 4; + } occupid[SEL_MAX]; /* XP */ u8 dtc; }; @@ -510,6 +518,7 @@ struct arm_cmn_hw_event { u8 dtcs_used; u8 num_dns; u8 dtm_offset; + enum cmn_filter_select filter_sel; }; #define for_each_hw_dn(hw, dn, i) \ @@ -535,6 +544,7 @@ struct arm_cmn_event_attr { struct device_attribute attr; enum cmn_model model; enum cmn_node_type type; + enum cmn_filter_select fsel; u8 eventid; u8 occupid; }; @@ -545,22 +555,17 @@ struct arm_cmn_format_attr { int config; }; -#define CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid) \ +#define _CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid, _fsel)\ (&((struct arm_cmn_event_attr[]) {{ \ .attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL), \ .model = _model, \ .type = _type, \ .eventid = _eventid, \ .occupid = _occupid, \ + .fsel = _fsel, \ }})[0].attr.attr) - -static bool arm_cmn_is_occup_event(enum cmn_model model, - enum cmn_node_type type, unsigned int id) -{ - if (type == CMN_TYPE_DVM) - return model == CMN600 ? 
id == 0x05 : id == 0x0c; - return type == CMN_TYPE_HNF && id == 0x0f; -} +#define CMN_EVENT_ATTR(_model, _name, _type, _eventid) \ + _CMN_EVENT_ATTR(_model, _name, _type, _eventid, 0, SEL_NONE) static ssize_t arm_cmn_event_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -577,7 +582,7 @@ static ssize_t arm_cmn_event_show(struct device *dev, "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", eattr->type, eattr->eventid); - if (arm_cmn_is_occup_event(eattr->model, eattr->type, eattr->eventid)) + if (eattr->fsel > SEL_NONE) return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n", eattr->type, eattr->eventid, eattr->occupid); @@ -652,33 +657,37 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, return attr->mode; } -#define _CMN_EVENT_DVM(_model, _name, _event, _occup) \ - CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup) +#define _CMN_EVENT_DVM(_model, _name, _event, _occup, _fsel) \ + _CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup, _fsel) #define CMN_EVENT_DTC(_name) \ - CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0, 0) -#define _CMN_EVENT_HNF(_model, _name, _event, _occup) \ - CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup) + CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0) +#define _CMN_EVENT_HNF(_model, _name, _event, _occup, _fsel) \ + _CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup, _fsel) #define CMN_EVENT_HNI(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event) #define CMN_EVENT_HNP(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, hnp_##_name, CMN_TYPE_HNP, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, hnp_##_name, CMN_TYPE_HNP, _event) #define __CMN_EVENT_XP(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event) #define CMN_EVENT_SBSX(_model, _name, _event) \ - CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event, 0) + CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event) #define CMN_EVENT_RNID(_model, _name, _event) \ - CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event, 0) + CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event) #define CMN_EVENT_MTSX(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event) #define CMN_EVENT_CXRA(_model, _name, _event) \ - CMN_EVENT_ATTR(_model, cxra_##_name, CMN_TYPE_CXRA, _event, 0) + CMN_EVENT_ATTR(_model, cxra_##_name, CMN_TYPE_CXRA, _event) #define CMN_EVENT_CXHA(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, cxha_##_name, CMN_TYPE_CXHA, _event, 0) + CMN_EVENT_ATTR(CMN_ANY, cxha_##_name, CMN_TYPE_CXHA, _event) #define CMN_EVENT_DVM(_model, _name, _event) \ - _CMN_EVENT_DVM(_model, _name, _event, 0) + _CMN_EVENT_DVM(_model, _name, _event, 0, SEL_NONE) +#define CMN_EVENT_DVM_OCC(_model, _name, _event) \ + _CMN_EVENT_DVM(_model, _name##_all, _event, 0, SEL_OCCUP1ID), \ + _CMN_EVENT_DVM(_model, _name##_dvmop, _event, 1, SEL_OCCUP1ID), \ + _CMN_EVENT_DVM(_model, _name##_dvmsync, _event, 2, SEL_OCCUP1ID) #define CMN_EVENT_HNF(_model, _name, _event) \ - _CMN_EVENT_HNF(_model, _name, _event, 0) + _CMN_EVENT_HNF(_model, _name, _event, 0, SEL_NONE) #define _CMN_EVENT_XP(_name, _event) \ __CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \ __CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \ @@ -712,9 +721,7 @@ static struct attribute 
*arm_cmn_event_attrs[] = { CMN_EVENT_DVM(CMN600, rxreq_dvmsync, 0x02), CMN_EVENT_DVM(CMN600, rxreq_dvmop_vmid_filtered, 0x03), CMN_EVENT_DVM(CMN600, rxreq_retried, 0x04), - _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0), - _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1), - _CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2), + CMN_EVENT_DVM_OCC(CMN600, rxreq_trk_occupancy, 0x05), CMN_EVENT_DVM(NOT_CMN600, dvmop_tlbi, 0x01), CMN_EVENT_DVM(NOT_CMN600, dvmop_bpi, 0x02), CMN_EVENT_DVM(NOT_CMN600, dvmop_pici, 0x03), @@ -726,9 +733,7 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_DVM(NOT_CMN600, txsnp_flitv, 0x09), CMN_EVENT_DVM(NOT_CMN600, txsnp_stall, 0x0a), CMN_EVENT_DVM(NOT_CMN600, trkfull, 0x0b), - _CMN_EVENT_DVM(NOT_CMN600, trk_occupancy_all, 0x0c, 0), - _CMN_EVENT_DVM(NOT_CMN600, trk_occupancy_dvmop, 0x0c, 1), - _CMN_EVENT_DVM(NOT_CMN600, trk_occupancy_dvmsync, 0x0c, 2), + CMN_EVENT_DVM_OCC(NOT_CMN600, trk_occupancy, 0x0c), CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), @@ -744,11 +749,11 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_HNF(CMN_ANY, mc_retries, 0x0c), CMN_EVENT_HNF(CMN_ANY, mc_reqs, 0x0d), CMN_EVENT_HNF(CMN_ANY, qos_hh_retry, 0x0e), - _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all, 0x0f, 0), - _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1), - _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2), - _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3), - _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all, 0x0f, 0, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3, SEL_OCCUP1ID), + _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4, SEL_OCCUP1ID), CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz, 0x10), CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz, 0x11), CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full, 0x12), @@ -817,8 +822,8 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_XP(txflit_stall, 0x02), CMN_EVENT_XP(partial_dat_flit, 0x03), /* We treat watchpoints as a special made-up class of XP events */ - CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP, 0), - CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN, 0), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP), + CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN), CMN_EVENT_SBSX(CMN_ANY, rd_req, 0x01), CMN_EVENT_SBSX(CMN_ANY, wr_req, 0x02), @@ -1132,6 +1137,26 @@ static void arm_cmn_event_read(struct perf_event *event) local64_add(delta, &event->count); } +static int arm_cmn_set_event_sel_hi(struct arm_cmn_node *dn, + enum cmn_filter_select fsel, u8 occupid) +{ + u64 reg; + + if (fsel == SEL_NONE) + return 0; + + if (!dn->occupid[fsel].count) { + dn->occupid[fsel].val = occupid; + reg = FIELD_PREP(CMN__PMU_OCCUP1_ID, + dn->occupid[SEL_OCCUP1ID].val); + writel_relaxed(reg >> 32, dn->pmu_base + CMN_PMU_EVENT_SEL + 4); + } else if (dn->occupid[fsel].val != occupid) { + return -EBUSY; + } + dn->occupid[fsel].count++; + return 0; +} + static void arm_cmn_event_start(struct perf_event *event, int flags) { struct arm_cmn *cmn = to_cmn(event->pmu); @@ -1195,7 +1220,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) struct arm_cmn_val { u8 dtm_count[CMN_MAX_DTMS]; - u8 occupid[CMN_MAX_DTMS]; 
+ u8 occupid[CMN_MAX_DTMS][SEL_MAX]; u8 wp[CMN_MAX_DTMS][4]; int dtc_count; bool cycles; @@ -1208,7 +1233,6 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val, struct arm_cmn_node *dn; enum cmn_node_type type; int i; - u8 occupid; if (is_software_event(event)) return; @@ -1220,16 +1244,14 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val, } val->dtc_count++; - if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) - occupid = CMN_EVENT_OCCUPID(event) + 1; - else - occupid = 0; for_each_hw_dn(hw, dn, i) { - int wp_idx, dtm = dn->dtm; + int wp_idx, dtm = dn->dtm, sel = hw->filter_sel; val->dtm_count[dtm]++; - val->occupid[dtm] = occupid; + + if (sel > SEL_NONE) + val->occupid[dtm][sel] = CMN_EVENT_OCCUPID(event) + 1; if (type != CMN_TYPE_WP) continue; @@ -1247,7 +1269,6 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event) enum cmn_node_type type; struct arm_cmn_val *val; int i, ret = -EINVAL; - u8 occupid; if (leader == event) return 0; @@ -1272,18 +1293,14 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event) if (val->dtc_count == CMN_DT_NUM_COUNTERS) goto done; - if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) - occupid = CMN_EVENT_OCCUPID(event) + 1; - else - occupid = 0; - for_each_hw_dn(hw, dn, i) { - int wp_idx, wp_cmb, dtm = dn->dtm; + int wp_idx, wp_cmb, dtm = dn->dtm, sel = hw->filter_sel; if (val->dtm_count[dtm] == CMN_DTM_NUM_COUNTERS) goto done; - if (occupid && val->occupid[dtm] && occupid != val->occupid[dtm]) + if (sel > SEL_NONE && val->occupid[dtm][sel] && + val->occupid[dtm][sel] != CMN_EVENT_OCCUPID(event) + 1) goto done; if (type != CMN_TYPE_WP) @@ -1304,6 +1321,22 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event) return ret; } +static enum cmn_filter_select arm_cmn_filter_sel(enum cmn_model model, + enum cmn_node_type type, + unsigned int eventid) +{ + struct arm_cmn_event_attr *e; + int i; + + for (i = 0; i < ARRAY_SIZE(arm_cmn_event_attrs); i++) { + e = container_of(arm_cmn_event_attrs[i], typeof(*e), attr.attr); + if (e->model & model && e->type == type && e->eventid == eventid) + return e->fsel; + } + return SEL_NONE; +} + + static int arm_cmn_event_init(struct perf_event *event) { struct arm_cmn *cmn = to_cmn(event->pmu); @@ -1328,11 +1361,11 @@ static int arm_cmn_event_init(struct perf_event *event) if (type == CMN_TYPE_DTC) return 0; + eventid = CMN_EVENT_EVENTID(event); /* For watchpoints we need the actual XP node here */ if (type == CMN_TYPE_WP) { type = CMN_TYPE_XP; /* ...and we need a "real" direction */ - eventid = CMN_EVENT_EVENTID(event); if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN) return -EINVAL; /* ...but the DTM may depend on which port we're watching */ @@ -1340,6 +1373,9 @@ static int arm_cmn_event_init(struct perf_event *event) hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2; } + /* This is sufficiently annoying to recalculate, so cache it */ + hw->filter_sel = arm_cmn_filter_sel(cmn->model, type, eventid); + bynodeid = CMN_EVENT_BYNODEID(event); nodeid = CMN_EVENT_NODEID(event); @@ -1381,8 +1417,8 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, if (type == CMN_TYPE_WP) dtm->wp_event[arm_cmn_wp_idx(event)] = -1; - if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) - hw->dn[i].occupid_count--; + if (hw->filter_sel > SEL_NONE) + hw->dn[i].occupid[hw->filter_sel].count--; dtm->pmu_config_low &= 
~CMN__PMEVCNT_PAIRED(dtm_idx); writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG); @@ -1462,18 +1498,8 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx + (nid.port << 4) + (nid.dev << 2); - if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) { - u8 occupid = CMN_EVENT_OCCUPID(event); - - if (dn->occupid_count == 0) { - dn->occupid_val = occupid; - writel_relaxed(occupid, - dn->pmu_base + CMN_PMU_EVENT_SEL + 4); - } else if (dn->occupid_val != occupid) { - goto free_dtms; - } - dn->occupid_count++; - } + if (arm_cmn_set_event_sel_hi(dn, hw->filter_sel, CMN_EVENT_OCCUPID(event))) + goto free_dtms; } arm_cmn_set_index(hw->dtm_idx, i, dtm_idx); From a6e0ea3e8b28c7b647c71f26dbe3b7265d48500a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 18 Apr 2022 23:57:41 +0100 Subject: [PATCH 657/737] perf/arm-cmn: Add CMN-700 support Add the identifiers, events, and subtleties for CMN-700. Highlights include yet more options for doubling up CHI channels, which finally grows event IDs beyond 8 bits for XPs, and a new set of CML gateway nodes adding support for CXL as well as CCIX, where the Link Agent is now internal to the CMN mesh so we gain regular PMU events for that too. Signed-off-by: Robin Murphy Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/cf892baa0d0258ea6cd6544b15171be0069a083a.1650320598.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 23760a0144173ef398522fbcc1dbe79521b5caf9) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 236 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 220 insertions(+), 16 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 7e80ef84fae41..c8101e2053f15 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -52,6 +52,10 @@ #define CMN_INFO_RSP_VC_NUM GENMASK_ULL(53, 52) #define CMN_INFO_DAT_VC_NUM GENMASK_ULL(51, 50) +#define CMN_CFGM_INFO_GLOBAL_1 0x908 +#define CMN_INFO_SNP_VC_NUM GENMASK_ULL(3, 2) +#define CMN_INFO_REQ_VC_NUM GENMASK_ULL(1, 0) + /* XPs also have some local topology info which has uses too */ #define CMN_MXP__CONNECT_INFO_P0 0x0008 #define CMN_MXP__CONNECT_INFO_P1 0x0010 @@ -65,6 +69,8 @@ /* For most nodes, this is all there is */ #define CMN_PMU_EVENT_SEL 0x000 +#define CMN__PMU_CBUSY_SNTHROTTLE_SEL GENMASK_ULL(44, 42) +#define CMN__PMU_CLASS_OCCUP_ID GENMASK_ULL(36, 35) /* Technically this is 4 bits wide on DNs, but we only use 2 there anyway */ #define CMN__PMU_OCCUP1_ID GENMASK_ULL(34, 32) @@ -74,7 +80,8 @@ /* DTMs live in the PMU space of XP registers */ #define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) #define CMN_DTM_WPn_CONFIG(n) (CMN_DTM_WPn(n) + 0x00) -#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2 GENMASK_ULL(18,17) +#define CMN_DTM_WPn_CONFIG_WP_CHN_NUM GENMASK_ULL(20, 19) +#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2 GENMASK_ULL(18, 17) #define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(9) #define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(8) #define CMN600_WPn_CONFIG_WP_COMBINE BIT(6) @@ -147,8 +154,8 @@ /* Event attributes */ #define CMN_CONFIG_TYPE GENMASK_ULL(15, 0) -#define CMN_CONFIG_EVENTID GENMASK_ULL(23, 16) -#define CMN_CONFIG_OCCUPID GENMASK_ULL(27, 24) +#define CMN_CONFIG_EVENTID GENMASK_ULL(26, 16) +#define CMN_CONFIG_OCCUPID GENMASK_ULL(30, 27) #define CMN_CONFIG_BYNODEID BIT_ULL(31) #define CMN_CONFIG_NODEID GENMASK_ULL(47, 32) @@ -183,10 +190,12 @@ enum cmn_model { CMN600 = 1, CMN650 = 2, + CMN700 = 4, CI700 = 8, /* ...and then we can use bitmap 
tricks for commonality */ CMN_ANY = -1, NOT_CMN600 = -2, + CMN_650ON = CMN650 | CMN700, }; /* CMN-600 r0px shouldn't exist in silicon, thankfully */ @@ -203,6 +212,9 @@ enum cmn_revision { CMN650_R1P1, CMN650_R2P0, CMN650_R1P2, + CMN700_R0P0 = 0, + CMN700_R1P0, + CMN700_R2P0, CI700_R0P0 = 0, CI700_R1P0, CI700_R2P0, @@ -225,8 +237,12 @@ enum cmn_node_type { CMN_TYPE_MTSX, CMN_TYPE_HNP, CMN_TYPE_CXRA = 0x100, - CMN_TYPE_CXHA = 0x101, - CMN_TYPE_CXLA = 0x102, + CMN_TYPE_CXHA, + CMN_TYPE_CXLA, + CMN_TYPE_CCRA, + CMN_TYPE_CCHA, + CMN_TYPE_CCLA, + CMN_TYPE_CCLA_RNI, /* Not a real node type */ CMN_TYPE_WP = 0x7770 }; @@ -234,6 +250,8 @@ enum cmn_node_type { enum cmn_filter_select { SEL_NONE = -1, SEL_OCCUP1ID, + SEL_CLASS_OCCUP_ID, + SEL_CBUSY_SNTHROTTLE_SEL, SEL_MAX }; @@ -255,6 +273,8 @@ struct arm_cmn_node { union { u8 event[4]; __le32 event_sel; + u16 event_w[4]; + __le64 event_sel_w; }; }; @@ -297,6 +317,8 @@ struct arm_cmn { struct { unsigned int rsp_vc_num : 2; unsigned int dat_vc_num : 2; + unsigned int snp_vc_num : 2; + unsigned int req_vc_num : 2; }; struct arm_cmn_node *xps; @@ -405,6 +427,8 @@ static const char *arm_cmn_device_type(u8 type) case 0x18: return " RN-F_E |"; case 0x19: return "RN-F_E_E|"; case 0x1c: return " MTSX |"; + case 0x1d: return " HN-V |"; + case 0x1e: return " CCG |"; default: return " ???? |"; } } @@ -518,6 +542,7 @@ struct arm_cmn_hw_event { u8 dtcs_used; u8 num_dns; u8 dtm_offset; + bool wide_sel; enum cmn_filter_select filter_sel; }; @@ -545,7 +570,7 @@ struct arm_cmn_event_attr { enum cmn_model model; enum cmn_node_type type; enum cmn_filter_select fsel; - u8 eventid; + u16 eventid; u8 occupid; }; @@ -624,7 +649,9 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, return 0; if ((chan == 5 && cmn->rsp_vc_num < 2) || - (chan == 6 && cmn->dat_vc_num < 2)) + (chan == 6 && cmn->dat_vc_num < 2) || + (chan == 7 && cmn->snp_vc_num < 2) || + (chan == 8 && cmn->req_vc_num < 2)) return 0; } @@ -649,6 +676,19 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, if (type == CMN_TYPE_RNI && eventid > 0x10) return 0; } + } else if (cmn->model == CMN700) { + if (cmn->rev < CMN700_R2P0) { + if (type == CMN_TYPE_HNF && eventid > 0x2c) + return 0; + if (type == CMN_TYPE_CCHA && eventid > 0x74) + return 0; + if (type == CMN_TYPE_CCLA && eventid > 0x27) + return 0; + } + if (cmn->rev < CMN700_R1P0) { + if (type == CMN_TYPE_HNF && eventid > 0x2b) + return 0; + } } if (!arm_cmn_node(cmn, type)) @@ -679,6 +719,14 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, CMN_EVENT_ATTR(_model, cxra_##_name, CMN_TYPE_CXRA, _event) #define CMN_EVENT_CXHA(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, cxha_##_name, CMN_TYPE_CXHA, _event) +#define CMN_EVENT_CCRA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccra_##_name, CMN_TYPE_CCRA, _event) +#define CMN_EVENT_CCHA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccha_##_name, CMN_TYPE_CCHA, _event) +#define CMN_EVENT_CCLA(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccla_##_name, CMN_TYPE_CCLA, _event) +#define CMN_EVENT_CCLA_RNI(_name, _event) \ + CMN_EVENT_ATTR(CMN_ANY, ccla_rni_##_name, CMN_TYPE_CCLA_RNI, _event) #define CMN_EVENT_DVM(_model, _name, _event) \ _CMN_EVENT_DVM(_model, _name, _event, 0, SEL_NONE) @@ -688,6 +736,20 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, _CMN_EVENT_DVM(_model, _name##_dvmsync, _event, 2, SEL_OCCUP1ID) #define CMN_EVENT_HNF(_model, _name, _event) \ _CMN_EVENT_HNF(_model, _name, _event, 0, SEL_NONE) +#define CMN_EVENT_HNF_CLS(_model, _name, 
_event) \ + _CMN_EVENT_HNF(_model, _name##_class0, _event, 0, SEL_CLASS_OCCUP_ID), \ + _CMN_EVENT_HNF(_model, _name##_class1, _event, 1, SEL_CLASS_OCCUP_ID), \ + _CMN_EVENT_HNF(_model, _name##_class2, _event, 2, SEL_CLASS_OCCUP_ID), \ + _CMN_EVENT_HNF(_model, _name##_class3, _event, 3, SEL_CLASS_OCCUP_ID) +#define CMN_EVENT_HNF_SNT(_model, _name, _event) \ + _CMN_EVENT_HNF(_model, _name##_all, _event, 0, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group0_read, _event, 1, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group0_write, _event, 2, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group1_read, _event, 3, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_group1_write, _event, 4, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_read, _event, 5, SEL_CBUSY_SNTHROTTLE_SEL), \ + _CMN_EVENT_HNF(_model, _name##_write, _event, 6, SEL_CBUSY_SNTHROTTLE_SEL) + #define _CMN_EVENT_XP(_name, _event) \ __CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \ __CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \ @@ -706,7 +768,9 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)), \ _CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)), \ _CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)), \ - _CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5)) + _CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5)), \ + _CMN_EVENT_XP(snp2_##_name, (_event) | (7 << 5)), \ + _CMN_EVENT_XP(req2_##_name, (_event) | (8 << 5)) static struct attribute *arm_cmn_event_attrs[] = { @@ -734,6 +798,14 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_DVM(NOT_CMN600, txsnp_stall, 0x0a), CMN_EVENT_DVM(NOT_CMN600, trkfull, 0x0b), CMN_EVENT_DVM_OCC(NOT_CMN600, trk_occupancy, 0x0c), + CMN_EVENT_DVM_OCC(CMN700, trk_occupancy_cxha, 0x0d), + CMN_EVENT_DVM_OCC(CMN700, trk_occupancy_pdn, 0x0e), + CMN_EVENT_DVM(CMN700, trk_alloc, 0x0f), + CMN_EVENT_DVM(CMN700, trk_cxha_alloc, 0x10), + CMN_EVENT_DVM(CMN700, trk_pdn_alloc, 0x11), + CMN_EVENT_DVM(CMN700, txsnp_stall_limit, 0x12), + CMN_EVENT_DVM(CMN700, rxsnp_stall_starv, 0x13), + CMN_EVENT_DVM(CMN700, txsnp_sync_stall_op, 0x14), CMN_EVENT_HNF(CMN_ANY, cache_miss, 0x01), CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access, 0x02), @@ -773,9 +845,19 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_HNF(NOT_CMN600, atomic_fwd, 0x20), CMN_EVENT_HNF(NOT_CMN600, mpam_hardlim, 0x21), CMN_EVENT_HNF(NOT_CMN600, mpam_softlim, 0x22), - CMN_EVENT_HNF(CMN650, snp_sent_cluster, 0x23), - CMN_EVENT_HNF(CMN650, sf_imprecise_evict, 0x24), - CMN_EVENT_HNF(CMN650, sf_evict_shared_line, 0x25), + CMN_EVENT_HNF(CMN_650ON, snp_sent_cluster, 0x23), + CMN_EVENT_HNF(CMN_650ON, sf_imprecise_evict, 0x24), + CMN_EVENT_HNF(CMN_650ON, sf_evict_shared_line, 0x25), + CMN_EVENT_HNF_CLS(CMN700, pocq_class_occup, 0x26), + CMN_EVENT_HNF_CLS(CMN700, pocq_class_retry, 0x27), + CMN_EVENT_HNF_CLS(CMN700, class_mc_reqs, 0x28), + CMN_EVENT_HNF_CLS(CMN700, class_cgnt_cmin, 0x29), + CMN_EVENT_HNF_SNT(CMN700, sn_throttle, 0x2a), + CMN_EVENT_HNF_SNT(CMN700, sn_throttle_min, 0x2b), + CMN_EVENT_HNF(CMN700, sf_precise_to_imprecise, 0x2c), + CMN_EVENT_HNF(CMN700, snp_intv_cln, 0x2d), + CMN_EVENT_HNF(CMN700, nc_excl, 0x2e), + CMN_EVENT_HNF(CMN700, excl_mon_ovfl, 0x2f), CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), @@ -870,6 +952,19 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_RNID(NOT_CMN600, rrt_occup_ovfl_slice2, 0x14), CMN_EVENT_RNID(NOT_CMN600, 
rrt_occup_ovfl_slice3, 0x15), CMN_EVENT_RNID(NOT_CMN600, wrt_throttled, 0x16), + CMN_EVENT_RNID(CMN700, ldb_full, 0x17), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice0, 0x18), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice1, 0x19), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice2, 0x1a), + CMN_EVENT_RNID(CMN700, rrt_rd_req_occup_ovfl_slice3, 0x1b), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice0, 0x1c), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice1, 0x1d), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice2, 0x1e), + CMN_EVENT_RNID(CMN700, rrt_burst_occup_ovfl_slice3, 0x1f), + CMN_EVENT_RNID(CMN700, rrt_burst_alloc, 0x20), + CMN_EVENT_RNID(CMN700, awid_hash, 0x21), + CMN_EVENT_RNID(CMN700, atomic_alloc, 0x22), + CMN_EVENT_RNID(CMN700, atomic_occ_ovfl, 0x23), CMN_EVENT_MTSX(tc_lookup, 0x01), CMN_EVENT_MTSX(tc_fill, 0x02), @@ -920,6 +1015,82 @@ static struct attribute *arm_cmn_event_attrs[] = { CMN_EVENT_CXHA(sdb_occ, 0x2c), CMN_EVENT_CXHA(snphaz_occ, 0x2d), + CMN_EVENT_CCRA(rht_occ, 0x41), + CMN_EVENT_CCRA(sht_occ, 0x42), + CMN_EVENT_CCRA(rdb_occ, 0x43), + CMN_EVENT_CCRA(wdb_occ, 0x44), + CMN_EVENT_CCRA(ssb_occ, 0x45), + CMN_EVENT_CCRA(snp_bcasts, 0x46), + CMN_EVENT_CCRA(req_chains, 0x47), + CMN_EVENT_CCRA(req_chain_avglen, 0x48), + CMN_EVENT_CCRA(chirsp_stalls, 0x49), + CMN_EVENT_CCRA(chidat_stalls, 0x4a), + CMN_EVENT_CCRA(cxreq_pcrd_stalls_link0, 0x4b), + CMN_EVENT_CCRA(cxreq_pcrd_stalls_link1, 0x4c), + CMN_EVENT_CCRA(cxreq_pcrd_stalls_link2, 0x4d), + CMN_EVENT_CCRA(cxdat_pcrd_stalls_link0, 0x4e), + CMN_EVENT_CCRA(cxdat_pcrd_stalls_link1, 0x4f), + CMN_EVENT_CCRA(cxdat_pcrd_stalls_link2, 0x50), + CMN_EVENT_CCRA(external_chirsp_stalls, 0x51), + CMN_EVENT_CCRA(external_chidat_stalls, 0x52), + CMN_EVENT_CCRA(cxmisc_pcrd_stalls_link0, 0x53), + CMN_EVENT_CCRA(cxmisc_pcrd_stalls_link1, 0x54), + CMN_EVENT_CCRA(cxmisc_pcrd_stalls_link2, 0x55), + CMN_EVENT_CCRA(rht_alloc, 0x56), + CMN_EVENT_CCRA(sht_alloc, 0x57), + CMN_EVENT_CCRA(rdb_alloc, 0x58), + CMN_EVENT_CCRA(wdb_alloc, 0x59), + CMN_EVENT_CCRA(ssb_alloc, 0x5a), + + CMN_EVENT_CCHA(rddatbyp, 0x61), + CMN_EVENT_CCHA(chirsp_up_stall, 0x62), + CMN_EVENT_CCHA(chidat_up_stall, 0x63), + CMN_EVENT_CCHA(snppcrd_link0_stall, 0x64), + CMN_EVENT_CCHA(snppcrd_link1_stall, 0x65), + CMN_EVENT_CCHA(snppcrd_link2_stall, 0x66), + CMN_EVENT_CCHA(reqtrk_occ, 0x67), + CMN_EVENT_CCHA(rdb_occ, 0x68), + CMN_EVENT_CCHA(rdbyp_occ, 0x69), + CMN_EVENT_CCHA(wdb_occ, 0x6a), + CMN_EVENT_CCHA(snptrk_occ, 0x6b), + CMN_EVENT_CCHA(sdb_occ, 0x6c), + CMN_EVENT_CCHA(snphaz_occ, 0x6d), + CMN_EVENT_CCHA(reqtrk_alloc, 0x6e), + CMN_EVENT_CCHA(rdb_alloc, 0x6f), + CMN_EVENT_CCHA(rdbyp_alloc, 0x70), + CMN_EVENT_CCHA(wdb_alloc, 0x71), + CMN_EVENT_CCHA(snptrk_alloc, 0x72), + CMN_EVENT_CCHA(sdb_alloc, 0x73), + CMN_EVENT_CCHA(snphaz_alloc, 0x74), + CMN_EVENT_CCHA(pb_rhu_req_occ, 0x75), + CMN_EVENT_CCHA(pb_rhu_req_alloc, 0x76), + CMN_EVENT_CCHA(pb_rhu_pcie_req_occ, 0x77), + CMN_EVENT_CCHA(pb_rhu_pcie_req_alloc, 0x78), + CMN_EVENT_CCHA(pb_pcie_wr_req_occ, 0x79), + CMN_EVENT_CCHA(pb_pcie_wr_req_alloc, 0x7a), + CMN_EVENT_CCHA(pb_pcie_reg_req_occ, 0x7b), + CMN_EVENT_CCHA(pb_pcie_reg_req_alloc, 0x7c), + CMN_EVENT_CCHA(pb_pcie_rsvd_req_occ, 0x7d), + CMN_EVENT_CCHA(pb_pcie_rsvd_req_alloc, 0x7e), + CMN_EVENT_CCHA(pb_rhu_dat_occ, 0x7f), + CMN_EVENT_CCHA(pb_rhu_dat_alloc, 0x80), + CMN_EVENT_CCHA(pb_rhu_pcie_dat_occ, 0x81), + CMN_EVENT_CCHA(pb_rhu_pcie_dat_alloc, 0x82), + CMN_EVENT_CCHA(pb_pcie_wr_dat_occ, 0x83), + CMN_EVENT_CCHA(pb_pcie_wr_dat_alloc, 0x84), + + 
CMN_EVENT_CCLA(rx_cxs, 0x21), + CMN_EVENT_CCLA(tx_cxs, 0x22), + CMN_EVENT_CCLA(rx_cxs_avg_size, 0x23), + CMN_EVENT_CCLA(tx_cxs_avg_size, 0x24), + CMN_EVENT_CCLA(tx_cxs_lcrd_backpressure, 0x25), + CMN_EVENT_CCLA(link_crdbuf_occ, 0x26), + CMN_EVENT_CCLA(link_crdbuf_alloc, 0x27), + CMN_EVENT_CCLA(pfwd_rcvr_cxs, 0x28), + CMN_EVENT_CCLA(pfwd_sndr_num_flits, 0x29), + CMN_EVENT_CCLA(pfwd_sndr_stalls_static_crd, 0x2a), + CMN_EVENT_CCLA(pfwd_sndr_stalls_dynmaic_crd, 0x2b), + NULL }; @@ -1147,7 +1318,11 @@ static int arm_cmn_set_event_sel_hi(struct arm_cmn_node *dn, if (!dn->occupid[fsel].count) { dn->occupid[fsel].val = occupid; - reg = FIELD_PREP(CMN__PMU_OCCUP1_ID, + reg = FIELD_PREP(CMN__PMU_CBUSY_SNTHROTTLE_SEL, + dn->occupid[SEL_CBUSY_SNTHROTTLE_SEL].val) | + FIELD_PREP(CMN__PMU_CLASS_OCCUP_ID, + dn->occupid[SEL_CLASS_OCCUP_ID].val) | + FIELD_PREP(CMN__PMU_OCCUP1_ID, dn->occupid[SEL_OCCUP1ID].val); writel_relaxed(reg >> 32, dn->pmu_base + CMN_PMU_EVENT_SEL + 4); } else if (dn->occupid[fsel].val != occupid) { @@ -1157,6 +1332,18 @@ static int arm_cmn_set_event_sel_hi(struct arm_cmn_node *dn, return 0; } +static void arm_cmn_set_event_sel_lo(struct arm_cmn_node *dn, int dtm_idx, + int eventid, bool wide_sel) +{ + if (wide_sel) { + dn->event_w[dtm_idx] = eventid; + writeq_relaxed(le64_to_cpu(dn->event_sel_w), dn->pmu_base + CMN_PMU_EVENT_SEL); + } else { + dn->event[dtm_idx] = eventid; + writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + } +} + static void arm_cmn_event_start(struct perf_event *event, int flags) { struct arm_cmn *cmn = to_cmn(event->pmu); @@ -1183,8 +1370,8 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - dn->event[dtm_idx] = CMN_EVENT_EVENTID(event); - writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + arm_cmn_set_event_sel_lo(dn, dtm_idx, CMN_EVENT_EVENTID(event), + hw->wide_sel); } } @@ -1211,8 +1398,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) } else for_each_hw_dn(hw, dn, i) { int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); - dn->event[dtm_idx] = 0; - writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + arm_cmn_set_event_sel_lo(dn, dtm_idx, 0, hw->wide_sel); } arm_cmn_event_read(event); @@ -1371,6 +1557,8 @@ static int arm_cmn_event_init(struct perf_event *event) /* ...but the DTM may depend on which port we're watching */ if (cmn->multi_dtm) hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2; + } else if (type == CMN_TYPE_XP && cmn->model == CMN700) { + hw->wide_sel = true; } /* This is sufficiently annoying to recalculate, so cache it */ @@ -1749,6 +1937,10 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) /* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */ if (dn->type == CMN_TYPE_RND) dn->type = CMN_TYPE_RNI; + + /* We split the RN-I off already, so let the CCLA part match CCLA events */ + if (dn->type == CMN_TYPE_CCLA_RNI) + dn->type = CMN_TYPE_CCLA; } arm_cmn_set_state(cmn, CMN_STATE_DISABLED); @@ -1784,6 +1976,8 @@ static enum cmn_node_type arm_cmn_subtype(enum cmn_node_type type) switch (type) { case CMN_TYPE_HNP: return CMN_TYPE_HNI; + case CMN_TYPE_CCLA_RNI: + return CMN_TYPE_RNI; default: return CMN_TYPE_INVALID; } @@ -1813,6 +2007,10 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg); cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg); 
+ reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL_1); + cmn->snp_vc_num = FIELD_GET(CMN_INFO_SNP_VC_NUM, reg); + cmn->req_vc_num = FIELD_GET(CMN_INFO_REQ_VC_NUM, reg); + reg = readq_relaxed(cfg_region + CMN_CHILD_INFO); child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); @@ -1936,6 +2134,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) case CMN_TYPE_MTSX: case CMN_TYPE_CXRA: case CMN_TYPE_CXHA: + case CMN_TYPE_CCRA: + case CMN_TYPE_CCHA: + case CMN_TYPE_CCLA: dn++; break; /* Nothing to see here */ @@ -1951,6 +2152,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) * register via the normal mechanism later. */ case CMN_TYPE_HNP: + case CMN_TYPE_CCLA_RNI: dn[1] = dn[0]; dn[0].pmu_base += CMN_HNP_PMU_EVENT_SEL; dn[1].type = arm_cmn_subtype(dn->type); @@ -2126,6 +2328,7 @@ static int arm_cmn_remove(struct platform_device *pdev) static const struct of_device_id arm_cmn_of_match[] = { { .compatible = "arm,cmn-600", .data = (void *)CMN600 }, { .compatible = "arm,cmn-650", .data = (void *)CMN650 }, + { .compatible = "arm,cmn-700", .data = (void *)CMN700 }, { .compatible = "arm,ci-700", .data = (void *)CI700 }, {} }; @@ -2136,6 +2339,7 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match); static const struct acpi_device_id arm_cmn_acpi_match[] = { { "ARMHC600", CMN600 }, { "ARMHC650", CMN650 }, + { "ARMHC700", CMN700 }, {} }; MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match); From 1d38b3aa43434e3f4d25787ece7b7d57bcaad7ce Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 10 May 2022 22:23:08 +0100 Subject: [PATCH 658/737] perf/arm-cmn: Fix filter_sel lookup Carefully considering the bounds of an array is all well and good, until you forget that that array also contains a NULL sentinel at the end and dereference it. So close... Reported-by: Qian Cai Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/bebba768156aa3c0757140457bdd0fec10819388.1652217788.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 3630b2a86390b4be907d8685b2fddee0dd73a835) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index c8101e2053f15..6ba6395757321 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1514,7 +1514,7 @@ static enum cmn_filter_select arm_cmn_filter_sel(enum cmn_model model, struct arm_cmn_event_attr *e; int i; - for (i = 0; i < ARRAY_SIZE(arm_cmn_event_attrs); i++) { + for (i = 0; i < ARRAY_SIZE(arm_cmn_event_attrs) - 1; i++) { e = container_of(arm_cmn_event_attrs[i], typeof(*e), attr.attr); if (e->model & model && e->type == type && e->eventid == eventid) return e->fsel; From 6e12b027bb5cb7508711cb48cce05b36515a4c3f Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 May 2022 14:12:53 +0100 Subject: [PATCH 659/737] perf/arm-cmn: Decode CAL devices properly in debugfs The debugfs code is lazy, and since it only keeps the bottom byte of each connect_info register to save space, it also treats the whole thing as the device_type since the other bits were reserved anyway. Upon closer inspection, though, this is no longer true on newer IP versions, so let's be good and decode the exact field properly. This should help it not get confused when a Component Aggregation Layer is present (which is already implied if Node IDs are found for both device addresses represented by the next two lines of the table). 
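A rough sketch of the field extraction in question, with placeholder macro and function names rather than the driver's own (those appear in the diff below); the point is that only the [4:0] device-type bits are decoded instead of treating the whole connect_info byte as the type:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

/* Placeholder mask name; the driver defines its own for bits [4:0]. */
#define EX_CONNECT_INFO_DEVICE_TYPE	GENMASK_ULL(4, 0)

static u8 ex_connected_device_type(u8 connect_info)
{
	/*
	 * On newer IP revisions the bits above [4:0] are no longer reserved
	 * (they can, for instance, indicate a CAL), so decode only the
	 * device-type field rather than the whole byte.
	 */
	return FIELD_GET(EX_CONNECT_INFO_DEVICE_TYPE, connect_info);
}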
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/6a13a6128a28cfe2eec6d09cf372a167ec9c3b65.1652274773.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit c5781212985a76ae610d18429388f9ec6ee3f77b) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 6ba6395757321..314d256714fec 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -63,6 +63,7 @@ #define CMN_MXP__CONNECT_INFO_P3 0x0030 #define CMN_MXP__CONNECT_INFO_P4 0x0038 #define CMN_MXP__CONNECT_INFO_P5 0x0040 +#define CMN__CONNECT_INFO_DEVICE_TYPE GENMASK_ULL(4, 0) /* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 @@ -400,7 +401,7 @@ static struct dentry *arm_cmn_debugfs; #ifdef CONFIG_DEBUG_FS static const char *arm_cmn_device_type(u8 type) { - switch(type) { + switch(FIELD_GET(CMN__CONNECT_INFO_DEVICE_TYPE, type)) { case 0x00: return " |"; case 0x01: return " RN-I |"; case 0x02: return " RN-D |"; From 458d4fed24fbb0028c85a767264e9c53b74c2edb Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Mon, 8 Aug 2022 12:54:55 -0700 Subject: [PATCH 660/737] perf/arm-cmn: Add more bits to child node address offset field CMN-600 uses bits [27:0] for child node address offset while bits [30:28] are required to be zero. For CMN-650, the child node address offset field has been increased to include bits [29:0] while leaving only bit 30 set to zero. Let's include the missing two bits and assume older implementations comply with the spec and set bits [29:28] to 0. Signed-off-by: Ilkka Koskinen Fixes: 60d1504070c2 ("perf/arm-cmn: Support new IP features") Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20220808195455.79277-1-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon (cherry picked from commit 05d6f6d346fea2fa4580a0c2b6be207456bebb08) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 314d256714fec..1f9fbaf87309b 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -36,7 +36,7 @@ #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) -#define CMN_CHILD_NODE_ADDR GENMASK(27, 0) +#define CMN_CHILD_NODE_ADDR GENMASK(29, 0) #define CMN_CHILD_NODE_EXTERNAL BIT(31) #define CMN_MAX_DIMENSION 12 From b4d0a0c4db2e17c1f372d2a68756c9d1ba8682dc Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Dec 2022 19:46:13 +0000 Subject: [PATCH 661/737] perf/arm-cmn: Reset DTM_PMU_CONFIG at probe Although we treat the DTM counters as free-running such that we're not too concerned about the initial DTM state, it's possible for a previous user to have left DTM counters enabled and paired with DTC counters. Thus if the first events are scheduled using some, but not all, DTMs, the as-yet-unused ones could end up adding spurious increments to the event counts at the DTC. Make sure we sync our initial DTM_PMU_CONFIG state to all the DTMs at probe time to avoid that possibility. 
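As a loose illustration of the pattern (struct, names and register offset below are placeholders, not the driver's), the fix amounts to writing the cached DTM configuration out once at init so software and hardware agree from the start, which is what the one-line addition below does:

#include <linux/io.h>
#include <linux/types.h>

/* Placeholder offset; the driver uses its own CMN_DTM_PMU_CONFIG definition. */
#define EX_DTM_PMU_CONFIG	0x210

struct ex_dtm {
	void __iomem *base;
	u64 pmu_config_low;
};

static void ex_dtm_init(struct ex_dtm *dtm, void __iomem *xp_base)
{
	dtm->base = xp_base;
	dtm->pmu_config_low = 0;	/* nothing paired yet */
	/*
	 * Push the initial configuration to the hardware, so DTM counters
	 * left enabled by a previous user cannot keep feeding spurious
	 * increments into DTC counters allocated later.
	 */
	writeq_relaxed(dtm->pmu_config_low, dtm->base + EX_DTM_PMU_CONFIG);
}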
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/ba5f38b3dc733cd06bfb5e659b697e76d18c2183.1670269572.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit bb21ef19a3d8f586a99310116d40622fb5b79942) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 1f9fbaf87309b..9ffdbb7ca5fa5 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1865,6 +1865,7 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx); dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; + writeq_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG); for (i = 0; i < 4; i++) { dtm->wp_event[i] = -1; writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i)); From 4d267ada90e9fe8759d86276721e992320723c0c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 23 Jan 2023 18:30:38 +0000 Subject: [PATCH 662/737] Partially revert "perf/arm-cmn: Optimise DTC counter accesses" It turns out the optimisation implemented by commit 4f2c3872dde5 is totally broken, since all the places that consume hw->dtcs_used for events other than cycle count are still not expecting it to be sparsely populated, and fail to read all the relevant DTC counters correctly if so. If implemented correctly, the optimisation potentially saves up to 3 register reads per event update, which is reasonably significant for events targeting a single node, but still not worth a massive amount of additional code complexity overall. Getting it right within the current design looks a fair bit more involved than it was ever intended to be, so let's just make a functional revert which restores the old behaviour while still backporting easily. Fixes: 4f2c3872dde5 ("perf/arm-cmn: Optimise DTC counter accesses") Reported-by: Ilkka Koskinen Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b41bb4ed7283c3d8400ce5cf5e6ec94915e6750f.1674498637.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit a428eb4b99ab80454f06ad256b25e930fe8a4954) Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 9ffdbb7ca5fa5..5d70d4066a2d8 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1576,7 +1576,6 @@ static int arm_cmn_event_init(struct perf_event *event) hw->dn++; continue; } - hw->dtcs_used |= arm_cmn_node_to_xp(cmn, dn)->dtc; hw->num_dns++; if (bynodeid) break; @@ -1589,6 +1588,12 @@ static int arm_cmn_event_init(struct perf_event *event) nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } + /* + * Keep assuming non-cycles events count in all DTC domains; turns out + * it's hard to make a worthwhile optimisation around this, short of + * going all-in with domain-local counter allocation as well. + */ + hw->dtcs_used = (1U << cmn->num_dtcs) - 1; return arm_cmn_validate_group(cmn, event); } From d586b97ca15c77d731cf81d4194c788bafd098d6 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 7 Apr 2023 03:08:59 +0000 Subject: [PATCH 663/737] Revert "perf/arm-cmn: Use irq_set_affinity()" This reverts commit 1119e8497690cfbbcc71e4118783b68b7937ce2f. The 5.10 kernel doesn't have irq_set_affinity() exported and the PMU drivers still count on the old behavior of irq_set_affinit_hint(). 
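For reference, a minimal sketch of the hint-based API this tree falls back to; the helper names are made up, but the calls are the standard genirq ones restored by the diff below, including the NULL clearing required before the IRQ goes away:

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int ex_bind_pmu_irq(unsigned int irq, unsigned int cpu)
{
	/*
	 * irq_set_affinity() is not exported to modules on this 5.10-based
	 * tree, so use the hint variant instead.
	 */
	return irq_set_affinity_hint(irq, cpumask_of(cpu));
}

static void ex_unbind_pmu_irq(unsigned int irq)
{
	/* Clear the hint again before the IRQ is freed. */
	irq_set_affinity_hint(irq, NULL);
}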
Signed-off-by: Luiz Capitulino --- drivers/perf/arm-cmn.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 5d70d4066a2d8..9c7a5533622f5 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1762,7 +1762,7 @@ static void arm_cmn_migrate(struct arm_cmn *cmn, unsigned int cpu) perf_pmu_migrate_context(&cmn->pmu, cmn->cpu, cpu); for (i = 0; i < cmn->num_dtcs; i++) - irq_set_affinity(cmn->dtc[i].irq, cpumask_of(cpu)); + irq_set_affinity_hint(cmn->dtc[i].irq, cpumask_of(cpu)); cmn->cpu = cpu; } @@ -1855,7 +1855,7 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn) if (err) return err; - err = irq_set_affinity(irq, cpumask_of(cmn->cpu)); + err = irq_set_affinity_hint(irq, cpumask_of(cmn->cpu)); if (err) return err; next: @@ -2322,12 +2322,17 @@ static int arm_cmn_probe(struct platform_device *pdev) static int arm_cmn_remove(struct platform_device *pdev) { struct arm_cmn *cmn = platform_get_drvdata(pdev); + int i; writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL); perf_pmu_unregister(&cmn->pmu); cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); debugfs_remove(cmn->debug); + + for (i = 0; i < cmn->num_dtcs; i++) + irq_set_affinity_hint(cmn->dtc[i].irq, NULL); + return 0; } From ca7a7ef0b25bb7303dc5ffbb72f966348d02cbaa Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Apr 2023 23:15:12 +0000 Subject: [PATCH 664/737] bpf: Fix up bpf_attach_type for sk_reuseport/migrate. When backporting BPF_SK_REUSEPORT_SELECT_OR_MIGRATE series, BPF_SK_SKB_VERDICT is dropped from enum bpf_attach_type, leading wrong offsets for these attach types on AL2 5.10 kernel. - BPF_SK_REUSEPORT_SELECT - BPF_SK_REUSEPORT_SELECT_OR_MIGRATE On AL2, we use the upstream libbpf, so the correct attach_type is rejected as illegal against the wrong offset by kernel. However, SEC("sk_reuseport") is not affected by this bug because the type of BPF prog is compiled with attach_type set 0 for a historical reason. Before I introduce sk_reuseport/migrate section, sk_reuseport did not have any variant, thus attach_type is set to 0. When we introduce attach_type for sk_reuseport BPF prog, we have decided to fix it up only when it's 0 not to break compatibility. So, only SEC("sk_reuseport/migrate") BPF prog cannot be loaded on AL2 5.10 kernel. To fix the enum, let's sync it with the upstream libbpf without adding BPF_SK_SKB_VERDICT infra. We can backport it when customers need it. 
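A hypothetical enum (not the real bpf_attach_type values) may make the offset problem clearer: enumerators without explicit initialisers are numbered sequentially, so dropping one entry in a backport shifts every later value away from what an upstream-built libbpf passes in:

enum ex_attach_type {
	EX_XDP,					/* same value on both kernels */
	EX_SK_SKB_VERDICT,			/* present upstream, was missing here */
	EX_SK_REUSEPORT_SELECT,			/* upstream value is one higher than this tree had */
	EX_SK_REUSEPORT_SELECT_OR_MIGRATE,	/* likewise off by one */
};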
Signed-off-by: Kuniyuki Iwashima --- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 956003b905b38..75b2d5df95a1c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -241,6 +241,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a09181d1039f3..f47d79146b9cd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -241,6 +241,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE From 4677abf9a0cb008dd95d36fa357bdab55ca54b01 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Fri, 28 Apr 2023 18:48:56 +0000 Subject: [PATCH 665/737] Revert "module: Don't wait for GOING modules" This reverts commit 083b3dda86f81f9493d43e0210e6a9a2cc1bee7a for some bare metal host unreachable issue because its network intel ixgbe driver is not loaded successfully with this commit. Signed-off-by: Shaoying Xu --- kernel/module.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 33d1dc6d4cd6a..6a0fd245c0483 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3661,8 +3661,7 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE - || mod->state == MODULE_STATE_GOING; + ret = !mod || mod->state == MODULE_STATE_LIVE; mutex_unlock(&module_mutex); return ret; @@ -3828,35 +3827,20 @@ static int add_unformed_module(struct module *mod) mod->state = MODULE_STATE_UNFORMED; +again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { + if (old->state != MODULE_STATE_LIVE) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, finished_loading(mod->name)); if (err) goto out_unlocked; - - /* The module might have gone in the meantime. */ - mutex_lock(&module_mutex); - old = find_module_all(mod->name, strlen(mod->name), - true); + goto again; } - - /* - * We are here only when the same module was being loaded. Do - * not try to load it again right now. It prevents long delays - * caused by serialized module load failures. It might happen - * when more devices of the same type trigger load of - * a particular module. - */ - if (old && old->state == MODULE_STATE_LIVE) - err = -EEXIST; - else - err = -EBUSY; + err = -EEXIST; goto out; } mod_update_bounds(mod); From e0c0aeeb05b44952d57d5ca134de807ae014ea3e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:15:58 +0200 Subject: [PATCH 666/737] intel_idle: add SPR support Add Sapphire Rapids Xeon support. Up until very recently, the C1 and C1E C-states were independent, but this has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these chips the C1 and C1E states cannot be enabled at the same time. The "C1E promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit. 
Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR. 1. If C1E promotion bit is disabled. a. C1 requests end up with C1 C-state. b. C1E requests end up with C1E C-state. 2. If C1E promotion bit is enabled. a. C1 requests end up with C1E C-state. b. C1E requests end up with C1E C-state. Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon. 1. If C1E promotion bit is disabled. a. C1 requests end up with C1 C-state. b. C1E requests end up with C1 C-state. 2. If C1E promotion bit is enabled. a. C1 requests end up with C1E C-state. b. C1E requests end up with C1E C-state. Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be enabled at the same time. This patch adds both C1 and C1E states. However, C1E is marked as with the "CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by default. The C1E promotion bit will be cleared, which means that by default only C1 and C6 will be registered on SPR. The next patch will add an option for enabling C1E and disabling C1 on SPR. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b92b032fb6d13..7694d852b49d9 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -778,6 +778,46 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; +/* + * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice + * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in + * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 + * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then + * both C1 and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1 and disable C1E by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. 
+ */ +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \ + CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 290, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1088,6 +1128,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1143,6 +1189,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &idle_cpu_skl), X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), From 940af92e2c26ee6dd3bfddb4280ba2ff7a549246 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:15:59 +0200 Subject: [PATCH 667/737] intel_idle: add 'preferred_cstates' module argument On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually exclusive - only one of them can be enabled. By default, 'intel_idle' driver enables C1 and disables C1E. However, some users prefer to use C1E instead of C1, because it saves more energy. This patch adds a new module parameter ('preferred_cstates') for enabling C1E and disabling C1. Here is the idea behind it. 1. This option has effect only for "mutually exclusive" C-states like C1 and C1E on SPR. 2. It does not have any effect on independent C-states, which do not require other C-states to be disabled (most states on most platforms as of today). 3. For mutually exclusive C-states, the 'intel_idle' driver always has a reasonable default, such as enabling C1 on SPR by default. On other platforms, the default may be different. 4. Users can override the default using the 'preferred_cstates' parameter. 5. The parameter accepts the preferred C-states bit-mask, similarly to the existing 'states_off' parameter. 6. This parameter is not limited to C1/C1E, and leaves room for supporting other mutually exclusive C-states, if they come in the future. Today 'intel_idle' can only be compiled-in, which means that on SPR, in order to disable C1 and enable C1E, users should boot with the following kernel argument: intel_idle.preferred_cstates=4 Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. 
Wysocki --- drivers/idle/intel_idle.c | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 7694d852b49d9..6837a5fa0214a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -66,6 +66,7 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -1377,6 +1378,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +static void c1e_promotion_enable(void); + /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1517,6 +1520,26 @@ static void __init sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) +{ + /* Check if user prefers C1E over C1. */ + if (preferred_states_mask & BIT(2)) { + if (preferred_states_mask & BIT(1)) + /* Both can't be enabled, stick to the defaults. */ + return; + + spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; + spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; + + /* Enable C1E using the "C1E promotion" bit. */ + c1e_promotion_enable(); + disable_promotion_to_c1e = false; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1548,6 +1571,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE: sklh_idle_state_table_update(); break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -1625,6 +1651,15 @@ static void auto_demotion_disable(void) wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1794,3 +1829,14 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. 
+ */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); From e5f972b17b76ff2d7a1312d657468783888f41b0 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Mar 2022 10:16:00 +0200 Subject: [PATCH 668/737] intel_idle: add core C6 optimization for SPR Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to match core C6 values, instead of using the default package C6 values. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 6837a5fa0214a..8e4c41dab7ce0 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1525,6 +1525,8 @@ static void __init sklh_idle_state_table_update(void) */ static void __init spr_idle_state_table_update(void) { + unsigned long long msr; + /* Check if user prefers C1E over C1. */ if (preferred_states_mask & BIT(2)) { if (preferred_states_mask & BIT(1)) @@ -1538,6 +1540,19 @@ static void __init spr_idle_state_table_update(void) c1e_promotion_enable(); disable_promotion_to_c1e = false; } + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } } static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) From aedb7f63fe807fbb06c0d0ebfaebe17168a395f9 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 27 Apr 2022 09:08:52 +0300 Subject: [PATCH 669/737] intel_idle: Fix the 'preferred_cstates' module parameter Problem description. When user boots kernel up with the 'intel_idle.preferred_cstates=4' option, we enable C1E and disable C1 states on Sapphire Rapids Xeon (SPR). In order for C1E to work on SPR, we have to enable the C1E promotion bit on all CPUs. However, we enable it only on one CPU. Fix description. The 'intel_idle' driver already has the infrastructure for disabling C1E promotion on every CPU. This patch uses the same infrastructure for enabling C1E promotion on every CPU. It changes the boolean 'disable_promotion_to_c1e' variable to a tri-state 'c1e_promotion' variable. Tested on a 2-socket SPR system. I verified the following combinations: * C1E promotion enabled and disabled in BIOS. * Booted with and without the 'intel_idle.preferred_cstates=4' kernel argument. In all 4 cases C1E promotion was correctly set on all CPUs. Also tested on an old Broadwell system, just to make sure it does not cause a regression. C1E promotion was correctly disabled on that system, both C1 and C1E were exposed (as expected). Fixes: da0e58c038e6 ("intel_idle: add 'preferred_cstates' module argument") Reported-by: Jan Beulich Signed-off-by: Artem Bityutskiy [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/idle/intel_idle.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 8e4c41dab7ce0..f7da2031c994d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -71,7 +71,12 @@ static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; -static bool disable_promotion_to_c1e; + +static enum { + C1E_PROMOTION_PRESERVE, + C1E_PROMOTION_ENABLE, + C1E_PROMOTION_DISABLE +} c1e_promotion = C1E_PROMOTION_PRESERVE; struct idle_cpu { struct cpuidle_state *state_table; @@ -1378,8 +1383,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -static void c1e_promotion_enable(void); - /** * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * @@ -1537,8 +1540,7 @@ static void __init spr_idle_state_table_update(void) spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; /* Enable C1E using the "C1E promotion" bit. */ - c1e_promotion_enable(); - disable_promotion_to_c1e = false; + c1e_promotion = C1E_PROMOTION_ENABLE; } /* @@ -1706,7 +1708,9 @@ static int intel_idle_cpu_init(unsigned int cpu) if (auto_demotion_disable_flags) auto_demotion_disable(); - if (disable_promotion_to_c1e) + if (c1e_promotion == C1E_PROMOTION_ENABLE) + c1e_promotion_enable(); + else if (c1e_promotion == C1E_PROMOTION_DISABLE) c1e_promotion_disable(); return 0; @@ -1785,7 +1789,8 @@ static int __init intel_idle_init(void) if (icpu) { cpuidle_state_table = icpu->state_table; auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; - disable_promotion_to_c1e = icpu->disable_promotion_to_c1e; + if (icpu->disable_promotion_to_c1e) + c1e_promotion = C1E_PROMOTION_DISABLE; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { From 8d933f8fcf83a052738a0224f9f1eba6585282d3 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 27 Apr 2022 09:08:53 +0300 Subject: [PATCH 670/737] intel_idle: Fix SPR C6 optimization The Sapphire Rapids (SPR) C6 optimization was added to the end of the 'spr_idle_state_table_update()' function. However, the function has a 'return' which may happen before the optimization has a chance to run. And this may prevent the optimization from happening. This is an unlikely scenario, but possible if user boots with, say, the 'intel_idle.preferred_cstates=6' kernel boot option. This patch fixes the issue by eliminating the problematic 'return' statement. Fixes: 3a9cf77b60dc ("intel_idle: add core C6 optimization for SPR") Suggested-by: Jan Beulich Reported-by: Jan Beulich Signed-off-by: Artem Bityutskiy [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f7da2031c994d..fcd086916cfa8 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1531,11 +1531,9 @@ static void __init spr_idle_state_table_update(void) unsigned long long msr; /* Check if user prefers C1E over C1. */ - if (preferred_states_mask & BIT(2)) { - if (preferred_states_mask & BIT(1)) - /* Both can't be enabled, stick to the defaults. 
*/ - return; - + if ((preferred_states_mask & BIT(2)) && + !(preferred_states_mask & BIT(1))) { + /* Disable C1 and enable C1E. */ spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; From b10b88cd1f04a8f2106ed92e7653573a66a6e343 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 16 Jul 2022 09:26:55 +0300 Subject: [PATCH 671/737] intel_idle: make SPR C1 and C1E be independent This patch partially reverts the changes made by the following commit: da0e58c038e6 intel_idle: add 'preferred_cstates' module argument As that commit describes, on early Sapphire Rapids Xeon platforms the C1 and C1E states were mutually exclusive, so that users could only have either C1 and C6, or C1E and C6. However, Intel firmware engineers managed to remove this limitation and make C1 and C1E to be completely independent, just like on previous Xeon platforms. Therefore, this patch: * Removes commentary describing the old, and now non-existing SPR C1E limitation. * Marks SPR C1E as available by default. * Removes the 'preferred_cstates' parameter handling for SPR. Both C1 and C1E will be available regardless of 'preferred_cstates' value. We expect that all SPR systems are shipping with new firmware, which includes the C1/C1E improvement. Cc: v5.18+ # v5.18+ Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index fcd086916cfa8..8db9d2f7ee742 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -784,16 +784,6 @@ static struct cpuidle_state icx_cstates[] __initdata = { .enter = NULL } }; -/* - * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice - * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in - * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 - * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then - * both C1 and C1E requests end up with C1, so there is effectively no C1E. - * - * By default we enable C1 and disable C1E by marking it with - * 'CPUIDLE_FLAG_UNUSABLE'. - */ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C1", @@ -806,8 +796,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { { .name = "C1E", .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \ - CPUIDLE_FLAG_UNUSABLE, + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, .target_residency = 4, .enter = &intel_idle, @@ -1530,17 +1519,6 @@ static void __init spr_idle_state_table_update(void) { unsigned long long msr; - /* Check if user prefers C1E over C1. */ - if ((preferred_states_mask & BIT(2)) && - !(preferred_states_mask & BIT(1))) { - /* Disable C1 and enable C1E. */ - spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; - spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; - - /* Enable C1E using the "C1E promotion" bit. */ - c1e_promotion = C1E_PROMOTION_ENABLE; - } - /* * By default, the C6 state assumes the worst-case scenario of package * C6. 
However, if PC6 is disabled, we update the numbers to match

From d182e9757d6b6975ea968552962267b3351222bd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:20 +0000 Subject: [PATCH 672/737] mm/damon/core: implement damos filter

Patch series "implement DAMOS filtering for anon pages and/or specific memory cgroups"

DAMOS lets users do system operations in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is in many cases more accurate than what user space could know. However, in some situations, users could know something more than the kernel about the pattern, or have special requirements for some types of memory or processes. For example, some users would have slow swap devices and know their latency-critical processes, and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes.

For such restrictions, users could exclude the memory regions from the initial monitoring regions and use a monitoring operations set that does not dynamically update the monitoring regions, such as fvaddr and paddr. They could also adjust the DAMOS target access pattern. For dynamically changing memory layouts and access patterns, however, those would not be enough.

To help the case, add an interface, namely DAMOS filters, which can be used to avoid the DAMOS actions being applied to specific types of memory, to the DAMON kernel API (damon.h). At the moment, it supports filtering anonymous pages and/or specific memory cgroups in or out for each DAMOS scheme.

This patchset adds the support for all DAMOS actions that the 'paddr' monitoring operations set supports ('pageout', 'lru_prio', and 'lru_deprio'), and the functionality is exposed via the DAMON kernel API (damon.h), the DAMON sysfs interface (/sys/kernel/mm/damon/admins/), and DAMON_RECLAIM module parameters.

Patches Sequence ----------------

First patch implements the DAMOS filter interface in the DAMON kernel API. Second patch makes the physical address space monitoring operations set support the filters for all supporting DAMOS actions. Third patch adds anonymous pages filter support to DAMON_RECLAIM, and the fourth patch documents the DAMON_RECLAIM's new feature. Fifth to seventh patches implement DAMON sysfs files for support of the filters, and the eighth patch connects the files to the DAMOS filters feature. Ninth patch adds simple self test cases for the DAMOS filters of the sysfs interface. Finally, the following two patches (tenth and eleventh) document the new features and interfaces.

This patch (of 11):

DAMOS lets users do system operations in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is in many cases more accurate than what user space could know. However, in some situations, users could know something more than the kernel about the pattern, or have special requirements for some types of memory or processes. For example, some users would have slow swap devices and know their latency-critical processes, and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes.

For such restrictions, users could exclude the memory regions from the initial monitoring regions and use a monitoring operations set that does not dynamically update the monitoring regions, such as fvaddr and paddr. They could also adjust the DAMOS target access pattern. For dynamically changing memory layouts and access patterns, however, those would not be enough.
To help the case, add an interface, namely DAMOS filters, which can be used to avoid the DAMOS actions be applied to specific types of memory, to DAMON kernel API (damon.h). At the moment, it supports filtering anonymous pages and/or specific memory cgroups in or out for each DAMOS scheme. Note that this commit adds only the interface to the DAMON kernel API. The impelmentation should be made in the monitoring operations sets, and following commits will add that. Link: https://lkml.kernel.org/r/20221205230830.144349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 51 +++++++++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 35630634d7904..42cea3bf7319f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,6 +8,7 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include #include #include #include @@ -215,6 +216,39 @@ struct damos_stat { unsigned long qt_exceeds; }; +/** + * enum damos_filter_type - Type of memory for &struct damos_filter + * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @NR_DAMOS_FILTER_TYPES: Number of filter types. + */ +enum damos_filter_type { + DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_MEMCG, + NR_DAMOS_FILTER_TYPES, +}; + +/** + * struct damos_filter - DAMOS action target memory filter. + * @type: Type of the page. + * @matching: If the matching page should filtered out or in. + * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @list: List head for siblings. + * + * Before applying the &damos->action to a memory region, DAMOS checks if each + * page of the region matches to this and avoid applying the action if so. + * Note that the check support is up to &struct damon_operations + * implementation. + */ +struct damos_filter { + enum damos_filter_type type; + bool matching; + union { + unsigned short memcg_id; + }; + struct list_head list; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -239,6 +273,7 @@ struct damos_access_pattern { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -254,6 +289,10 @@ struct damos_access_pattern { * If all schemes that registered to a &struct damon_ctx are inactive, DAMON * stops monitoring and just repeatedly checks the watermarks. * + * Before applying the &action to a memory region, &struct damon_operations + * implementation could check pages of the region and skip &action to respect + * &filters + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. 
@@ -263,6 +302,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; + struct list_head filters; struct damos_stat stat; struct list_head list; }; @@ -516,6 +556,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->filters, list) + +#define damos_for_each_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -536,6 +582,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching); +void damos_add_filter(struct damos *s, struct damos_filter *f); +void damos_destroy_filter(struct damos_filter *f); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks); diff --git a/mm/damon/core.c b/mm/damon/core.c index ceec75b88ef96..1bf0654ae189d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,40 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching) +{ + struct damos_filter *filter; + + filter = kmalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return NULL; + filter->type = type; + filter->matching = matching; + return filter; +} + +void damos_add_filter(struct damos *s, struct damos_filter *f) +{ + list_add_tail(&f->list, &s->filters); +} + +static void damos_del_filter(struct damos_filter *f) +{ + list_del(&f->list); +} + +static void damos_free_filter(struct damos_filter *f) +{ + kfree(f); +} + +void damos_destroy_filter(struct damos_filter *f) +{ + damos_del_filter(f); + damos_free_filter(f); +} + /* initialize private fields of damos_quota and return the pointer */ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) { @@ -287,6 +321,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return NULL; scheme->pattern = *pattern; scheme->action = action; + INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -315,6 +350,10 @@ static void damon_free_scheme(struct damos *s) void damon_destroy_scheme(struct damos *s) { + struct damos_filter *f, *next; + + damos_for_each_filter_safe(f, next, s) + damos_destroy_filter(f); damon_del_scheme(s); damon_free_scheme(s); } From 46eb77e6815500fb2e52a2d7da876078bc76886f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:21 +0000 Subject: [PATCH 673/737] mm/damon/paddr: support DAMOS filters Implement support of the DAMOS filters in the physical address space monitoring operations set, for all DAMOS actions that it supports including 'pageout', 'lru_prio', and 'lru_deprio'. 
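As a rough illustration of the kernel-side usage these checks enable (not code from this patch; the function name is made up, and the scheme and memcg id are assumed to have been set up elsewhere, similarly to what the later DAMON_RECLAIM change in this series does):

#include <linux/damon.h>
#include <linux/errno.h>

/*
 * Hypothetical sketch: attach an "anonymous pages" filter and a memcg
 * filter to an already-constructed DAMOS scheme.  Pages that match a
 * filter with matching == true are skipped by the scheme's action.
 */
static int example_attach_filters(struct damos *scheme,
				  unsigned short example_memcg_id)
{
	struct damos_filter *anon_filter, *memcg_filter;

	/* Filter out anonymous pages. */
	anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
	if (!anon_filter)
		return -ENOMEM;
	damos_add_filter(scheme, anon_filter);

	/* Filter out pages of one specific memory cgroup. */
	memcg_filter = damos_new_filter(DAMOS_FILTER_TYPE_MEMCG, true);
	if (!memcg_filter)
		return -ENOMEM;
	memcg_filter->memcg_id = example_memcg_id;
	damos_add_filter(scheme, memcg_filter);

	return 0;
}

Filters attached this way take effect only in operations sets that implement the per-page checks, which is what this patch adds for 'paddr'.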
Link: https://lkml.kernel.org/r/20221205230830.144349-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 71 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5945e1e379382..f4bfa0634cc38 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -206,7 +206,47 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned long damon_pa_pageout(struct damon_region *r) +static bool __damos_pa_filter_out(struct damos_filter *filter, + struct page *page) +{ + bool matched = false; + struct mem_cgroup *memcg; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ANON: + matched = PageAnon(page); + break; + case DAMOS_FILTER_TYPE_MEMCG: + rcu_read_lock(); + memcg = page_memcg_rcu(page); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + default: + break; + } + + return matched == filter->matching; +} + +/* + * damos_pa_filter_out - Return true if the page should be filtered out. + */ +static bool damos_pa_filter_out(struct damos *scheme, struct page *page) +{ + struct damos_filter *filter; + + damos_for_each_filter(filter, scheme) { + if (__damos_pa_filter_out(filter, page)) + return true; + } + return false; +} + +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; LIST_HEAD(page_list); @@ -217,6 +257,11 @@ static unsigned long damon_pa_pageout(struct damon_region *r) if (!page) continue; + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + ClearPageReferenced(page); test_and_clear_page_young(page); if (isolate_lru_page(page)) { @@ -235,7 +280,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r) } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed) { unsigned long addr, applied = 0; @@ -244,6 +289,12 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (!page) continue; + + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + if (mark_accessed) mark_page_accessed(page); else @@ -254,14 +305,16 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r, + struct damos *s) { - return damon_pa_mark_accessed_or_deactivate(r, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true); } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_deactivate_pages(struct damon_region *r, + struct damos *s) { - return damon_pa_mark_accessed_or_deactivate(r, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, @@ -270,11 +323,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r); + return damon_pa_pageout(r, scheme); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r); + return damon_pa_mark_accessed(r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r); + return damon_pa_deactivate_pages(r, scheme); case DAMOS_STAT: break; default: 
From ac6a663f11d1cc345fc5e07f0f3b9405c170e6bb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:22 +0000 Subject: [PATCH 674/737] mm/damon/reclaim: add a parameter called skip_anon for avoiding anonymous pages reclamation In some cases, for example if users have confidence at anonymous pages management or the swap device is too slow, users would want to avoid DAMON_RECLAIM swapping the anonymous pages out. For such case, add yet another DAMON_RECLAIM parameter, namely 'skip_anon'. When it is set as 'Y', DAMON_RECLAIM will avoid reclaiming anonymous pages using a DAMOS filter. Link: https://lkml.kernel.org/r/20221205230830.144349-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 408b9f7688ded..8beeb2894f502 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -99,6 +99,15 @@ module_param(monitor_region_start, ulong, 0600); static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); +/* + * Skip anonymous pages reclamation. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous + * pages. By default, ``N``. + */ +static bool skip_anon __read_mostly; +module_param(skip_anon, bool, 0600); + /* * PID of the DAMON thread * @@ -143,6 +152,7 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; + struct damos_filter *filter; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -153,6 +163,15 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; + if (skip_anon) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + if (!filter) { + /* Will be freed by next 'damon_set_schemes()' below */ + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_filter(scheme, filter); + } damon_set_schemes(ctx, &scheme, 1); return damon_set_region_biggest_system_ram_default(target, From 501e52232b84920eaeefb64742081c64263f5d22 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:23 +0000 Subject: [PATCH 675/737] Docs/admin-guide/damon/reclaim: document 'skip_anon' parameter Document the newly added 'skip_anon' parameter of DAMON_RECLAIM, which can be used to avoid anonymous pages reclamation. Link: https://lkml.kernel.org/r/20221205230830.144349-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 6510baa911097..3fedcd0a794ec 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -205,6 +205,15 @@ The end physical address of memory region that DAMON_RECLAIM will do work against. That is, DAMON_RECLAIM will find cold memory regions in this region and reclaims. By default, biggest System RAM is used as the region. +skip_anon +--------- + +Skip anonymous pages reclamation. + +If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous +pages. By default, ``N``. 
+ + kdamond_pid ----------- From 3c143a46180774845627618b7e3bbc3b758d21e8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:24 +0000 Subject: [PATCH 676/737] mm/damon/sysfs-schemes: implement filters directory DAMOS filters are currently supported by only DAMON kernel API. To expose the feature to user space, implement a DAMON sysfs directory named 'filters' under each scheme directory. Please note that this is implementing only the directory. Following commits will implement more files and directories, and finally connect the DAMOS filters feature. Link: https://lkml.kernel.org/r/20221205230830.144349-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 85 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 81fc4d27f4e45..50c8148cb474c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -258,6 +258,63 @@ static struct kobj_type damon_sysfs_stats_ktype = { .default_groups = damon_sysfs_stats_groups, }; +/* + * filters directory + */ + +struct damon_sysfs_scheme_filters { + struct kobject kobj; + int nr; +}; + +static struct damon_sysfs_scheme_filters * +damon_sysfs_scheme_filters_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); +} + +static ssize_t nr_filters_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filters *filters = container_of(kobj, + struct damon_sysfs_scheme_filters, kobj); + + return sysfs_emit(buf, "%d\n", filters->nr); +} + +static ssize_t nr_filters_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + return count; +} + +static void damon_sysfs_scheme_filters_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr = + __ATTR_RW_MODE(nr_filters, 0600); + +static struct attribute *damon_sysfs_scheme_filters_attrs[] = { + &damon_sysfs_scheme_filters_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters); + +static struct kobj_type damon_sysfs_scheme_filters_ktype = { + .release = damon_sysfs_scheme_filters_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filters_groups, +}; + /* * watermarks directory */ @@ -784,6 +841,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_scheme_filters *filters; struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; }; @@ -878,6 +936,24 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_filters *filters = + damon_sysfs_scheme_filters_alloc(); + int err; + + if (!filters) + return -ENOMEM; + err = kobject_init_and_add(&filters->kobj, + &damon_sysfs_scheme_filters_ktype, &scheme->kobj, + "filters"); + if (err) + kobject_put(&filters->kobj); + else + scheme->filters = filters; + return err; +} + static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) { struct damon_sysfs_stats *stats = 
damon_sysfs_stats_alloc(); @@ -926,9 +1002,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); + err = damon_sysfs_scheme_set_filters(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_filters_watermarks_quotas_access_pattern_out; err = damon_sysfs_scheme_set_tried_regions(scheme); if (err) goto put_tried_regions_out; @@ -937,6 +1016,9 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) put_tried_regions_out: kobject_put(&scheme->tried_regions->kobj); scheme->tried_regions = NULL; +put_filters_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->filters->kobj); + scheme->filters = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -956,6 +1038,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); From 8b5eb516d2aa2228d9ad69208d3809d0dfa664b8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:25 +0000 Subject: [PATCH 677/737] mm/damon/sysfs-schemes: implement filter directory Implement DAMOS filter directory which will be located under the filters directory. The directory provides three files, namely type, matching, and memcg_path. 'type' and 'matching' will be directly connected to the fields of 'struct damos_filter' having same name. 'memcg_path' will receive the path of the memory cgroup of the interest and later converted to memcg id when it's committed. 
Link: https://lkml.kernel.org/r/20221205230830.144349-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 128 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50c8148cb474c..afbfc55a8e842 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -258,6 +258,134 @@ static struct kobj_type damon_sysfs_stats_ktype = { .default_groups = damon_sysfs_stats_groups, }; +/* + * filter directory + */ + +struct damon_sysfs_scheme_filter { + struct kobject kobj; + enum damos_filter_type type; + bool matching; + char *memcg_path; +}; + +/* Should match with enum damos_filter_type */ +static const char * const damon_sysfs_scheme_filter_type_strs[] = { + "anon", + "memcg", +}; + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_scheme_filter_type_strs[filter->type]); +} + +static ssize_t type_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + enum damos_filter_type type; + ssize_t ret = -EINVAL; + + for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { + if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ + type])) { + filter->type = type; + ret = count; + break; + } + } + return ret; +} + +static ssize_t matching_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N'); +} + +static ssize_t matching_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool matching; + int err = kstrtobool(buf, &matching); + + if (err) + return err; + + filter->matching = matching; + return count; +} + +static ssize_t memcg_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + filter->memcg_path ? 
filter->memcg_path : ""); +} + +static ssize_t memcg_path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + + if (!path) + return -ENOMEM; + + strncpy(path, buf, count); + path[count] = '\0'; + filter->memcg_path = path; + return count; +} + +static void damon_sysfs_scheme_filter_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + kfree(filter->memcg_path); + kfree(filter); +} + +static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = + __ATTR_RW_MODE(type, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = + __ATTR_RW_MODE(matching, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = + __ATTR_RW_MODE(memcg_path, 0600); + +static struct attribute *damon_sysfs_scheme_filter_attrs[] = { + &damon_sysfs_scheme_filter_type_attr.attr, + &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_memcg_path_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); + +static struct kobj_type damon_sysfs_scheme_filter_ktype = { + .release = damon_sysfs_scheme_filter_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filter_groups, +}; + /* * filters directory */ From 37afae597d39b582a4e2ec5b471f8869743dc301 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:26 +0000 Subject: [PATCH 678/737] mm/damon/sysfs-schemes: connect filter directory and filters directory Implement 'nr_filters' file under 'filters' directory, which will be used to populate specific number of 'filter' directory under the directory, similar to other 'nr_*' files in DAMON sysfs interface. 
Link: https://lkml.kernel.org/r/20221205230830.144349-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 68 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index afbfc55a8e842..e79c678a69d58 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -269,6 +269,11 @@ struct damon_sysfs_scheme_filter { char *memcg_path; }; +static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL); +} + /* Should match with enum damos_filter_type */ static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", @@ -392,6 +397,7 @@ static struct kobj_type damon_sysfs_scheme_filter_ktype = { struct damon_sysfs_scheme_filters { struct kobject kobj; + struct damon_sysfs_scheme_filter **filters_arr; int nr; }; @@ -401,6 +407,57 @@ damon_sysfs_scheme_filters_alloc(void) return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); } +static void damon_sysfs_scheme_filters_rm_dirs( + struct damon_sysfs_scheme_filters *filters) +{ + struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr; + int i; + + for (i = 0; i < filters->nr; i++) + kobject_put(&filters_arr[i]->kobj); + filters->nr = 0; + kfree(filters_arr); + filters->filters_arr = NULL; +} + +static int damon_sysfs_scheme_filters_add_dirs( + struct damon_sysfs_scheme_filters *filters, int nr_filters) +{ + struct damon_sysfs_scheme_filter **filters_arr, *filter; + int err, i; + + damon_sysfs_scheme_filters_rm_dirs(filters); + if (!nr_filters) + return 0; + + filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!filters_arr) + return -ENOMEM; + filters->filters_arr = filters_arr; + + for (i = 0; i < nr_filters; i++) { + filter = damon_sysfs_scheme_filter_alloc(); + if (!filter) { + damon_sysfs_scheme_filters_rm_dirs(filters); + return -ENOMEM; + } + + err = kobject_init_and_add(&filter->kobj, + &damon_sysfs_scheme_filter_ktype, + &filters->kobj, "%d", i); + if (err) { + kobject_put(&filter->kobj); + damon_sysfs_scheme_filters_rm_dirs(filters); + return err; + } + + filters_arr[i] = filter; + filters->nr++; + } + return 0; +} + static ssize_t nr_filters_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -413,6 +470,7 @@ static ssize_t nr_filters_show(struct kobject *kobj, static ssize_t nr_filters_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + struct damon_sysfs_scheme_filters *filters; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -420,6 +478,15 @@ static ssize_t nr_filters_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_scheme_filters_add_dirs(filters, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; } @@ -1166,6 +1233,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + damon_sysfs_scheme_filters_rm_dirs(scheme->filters); kobject_put(&scheme->filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); From 
e4c3bf39ea72ec309f633aaa79336315ef4cc615 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:27 +0000 Subject: [PATCH 679/737] mm/damon/sysfs-schemes: implement scheme filters Implement scheme filters functionality of DAMON sysfs interface by making the code reads the values of files under the filter directories and pass that to DAMON using DAMON kernel API. [sj@kernel.org: fix leaking a filter for wrong cgroup path] Link: https://lkml.kernel.org/r/20221219171807.55708-2-sj@kernel.org [sj@kernel.org: return an error for filter memcg path id lookup failure] Link: https://lkml.kernel.org/r/20221219171807.55708-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 93 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index e79c678a69d58..f0dabe3e2dc03 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1403,6 +1403,79 @@ struct kobj_type damon_sysfs_schemes_ktype = { .default_groups = damon_sysfs_schemes_groups, }; +static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, + char *memcg_path_buf, char *path) +{ +#ifdef CONFIG_MEMCG + cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); + if (sysfs_streq(memcg_path_buf, path)) + return true; +#endif /* CONFIG_MEMCG */ + return false; +} + +static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +{ + struct mem_cgroup *memcg; + char *path; + bool found = false; + + if (!memcg_path) + return -EINVAL; + + path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + + for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; + memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + /* skip removed memcg */ + if (!mem_cgroup_id(memcg)) + continue; + if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { + *id = mem_cgroup_id(memcg); + found = true; + break; + } + } + + kfree(path); + return found ? 
0 : -EINVAL; +} + +static int damon_sysfs_set_scheme_filters(struct damos *scheme, + struct damon_sysfs_scheme_filters *sysfs_filters) +{ + int i; + struct damos_filter *filter, *next; + + damos_for_each_filter_safe(filter, next, scheme) + damos_destroy_filter(filter); + + for (i = 0; i < sysfs_filters->nr; i++) { + struct damon_sysfs_scheme_filter *sysfs_filter = + sysfs_filters->filters_arr[i]; + struct damos_filter *filter = + damos_new_filter(sysfs_filter->type, + sysfs_filter->matching); + int err; + + if (!filter) + return -ENOMEM; + if (filter->type == DAMOS_FILTER_TYPE_MEMCG) { + err = damon_sysfs_memcg_path_to_id( + sysfs_filter->memcg_path, + &filter->memcg_id); + if (err) { + damos_destroy_filter(filter); + return err; + } + } + damos_add_filter(scheme, filter); + } + return 0; +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { @@ -1411,6 +1484,10 @@ static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + struct damon_sysfs_scheme_filters *sysfs_filters = + sysfs_scheme->filters; + struct damos *scheme; + int err; struct damos_access_pattern pattern = { .min_sz_region = access_pattern->sz->min, @@ -1436,8 +1513,17 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, "a, &wmarks); + if (!scheme) + return NULL; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + return scheme; } static void damon_sysfs_update_scheme(struct damos *scheme, @@ -1448,6 +1534,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + int err; scheme->pattern.min_sz_region = access_pattern->sz->min; scheme->pattern.max_sz_region = access_pattern->sz->max; @@ -1470,6 +1557,10 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->wmarks.high = sysfs_wmarks->high; scheme->wmarks.mid = sysfs_wmarks->mid; scheme->wmarks.low = sysfs_wmarks->low; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters); + if (err) + damon_destroy_scheme(scheme); } int damon_sysfs_set_schemes(struct damon_ctx *ctx, From a6ddac3f6040e32312ecd070cf73be22cfff8636 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:28 +0000 Subject: [PATCH 680/737] selftests/damon/sysfs: test filters directory Add simple test cases for scheme filters of DAMON sysfs interface. The test cases check if the files are populated as expected, receives some valid inputs, and refuses some invalid inputs. 
Link: https://lkml.kernel.org/r/20221205230830.144349-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index db4942383a50f..a00336ffdcad4 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -96,6 +96,34 @@ test_stats() done } +test_filter() +{ + filter_dir=$1 + ensure_file "$filter_dir/type" "exist" "600" + ensure_write_succ "$filter_dir/type" "anon" "valid input" + ensure_write_succ "$filter_dir/type" "memcg" "valid input" + ensure_write_fail "$filter_dir/type" "foo" "invalid input" + ensure_file "$filter_dir/matching" "exist" "600" + ensure_file "$filter_dir/memcg_path" "exist" "600" +} + +test_filters() +{ + filters_dir=$1 + ensure_dir "$filters_dir" "exist" + ensure_file "$filters_dir/nr_filters" "exist" "600" + ensure_write_succ "$filters_dir/nr_filters" "1" "valid input" + test_filter "$filters_dir/0" + + ensure_write_succ "$filters_dir/nr_filters" "2" "valid input" + test_filter "$filters_dir/0" + test_filter "$filters_dir/1" + + ensure_write_succ "$filters_dir/nr_filters" "0" "valid input" + ensure_dir "$filters_dir/0" "not_exist" + ensure_dir "$filters_dir/1" "not_exist" +} + test_watermarks() { watermarks_dir=$1 @@ -143,6 +171,7 @@ test_scheme() test_access_pattern "$scheme_dir/access_pattern" test_quotas "$scheme_dir/quotas" test_watermarks "$scheme_dir/watermarks" + test_filters "$scheme_dir/filters" test_stats "$scheme_dir/stats" test_tried_regions "$scheme_dir/tried_regions" } From efb6bb268f1f9e68b75d992b558d234f8bc6f8bf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:29 +0000 Subject: [PATCH 681/737] Docs/admin-guide/mm/damon/usage: document DAMOS filters of sysfs Document about the newly added files for DAMOS filters on the DAMON usage document. Link: https://lkml.kernel.org/r/20221205230830.144349-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 48 +++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 86323d1eaab41..8a6a963eff25b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -87,6 +87,8 @@ comma (","). :: │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low + │ │ │ │ │ │ │ filters/nr_filters + │ │ │ │ │ │ │ │ 0/type,matching,memcg_id │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds │ │ │ │ │ │ │ tried_regions/ │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age @@ -151,6 +153,8 @@ number (``N``) to the file creates the number of child directories named as moment, only one context per kdamond is supported, so only ``0`` or ``1`` can be written to the file. +.. _sysfs_contexts: + contexts// ------------- @@ -268,8 +272,8 @@ schemes// ------------ In each scheme directory, five directories (``access_pattern``, ``quotas``, -``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``) -exist. +``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file +(``action``) exist. 
The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords @@ -347,6 +351,46 @@ as below. The ``interval`` should written in microseconds unit. +schemes//filters/ +-------------------- + +Users could know something more than the kernel for specific types of memory. +In the case, users could do their own management for the memory and hence +doesn't want DAMOS bothers that. Users could limit DAMOS by setting the access +pattern of the scheme and/or the monitoring regions for the purpose, but that +can be inefficient in some cases. In such cases, users could set non-access +pattern driven filters using files in this directory. + +In the beginning, this directory has only one file, ``nr_filters``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each filter. The filters are evaluated +in the numeric order. + +Each filter directory contains three files, namely ``type``, ``matcing``, and +``memcg_path``. You can write one of two special keywords, ``anon`` for +anonymous pages, or ``memcg`` for specific memory cgroup filtering. In case of +the memory cgroup filtering, you can specify the memory cgroup of the interest +by writing the path of the memory cgroup from the cgroups mount point to +``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to +filter out pages that does or does not match to the type, respectively. Then, +the scheme's action will not be applied to the pages that specified to be +filtered out. + +For example, below restricts a DAMOS action to be applied to only non-anonymous +pages of all memory cgroups except ``/having_care_already``.:: + + # echo 2 > nr_filters + # # filter out anonymous pages + echo anon > 0/type + echo Y > 0/matching + # # further filter out all cgroups except one at '/having_care_already' + echo memcg > 1/type + echo /having_care_already > 1/memcg_path + echo N > 1/matching + +Note that filters could be ignored depend on the running DAMON operations set +`implementation `. + .. _sysfs_schemes_stats: schemes//stats/ From 87934ea47b804fb6b622856ba74cbf4af7122ef2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:30 +0000 Subject: [PATCH 682/737] Docs/ABI/damon: document scheme filters files Document newly added DAMON sysfs interface files for DAMOS filtering on the DAMON ABI document. Link: https://lkml.kernel.org/r/20221205230830.144349-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 13397b8536926..2744f21b5a6b3 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -258,6 +258,35 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the low watermark of the scheme in permil. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters/nr_filters +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting filters of the scheme named '0' to + 'N-1' under the filters/ directory. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//type +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the type of + the memory of the interest. 'anon' for anonymous pages, or + 'memcg' for specific memory cgroup can be written and read. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path +Date: Dec 2022 +Contact: SeongJae Park +Description: If 'memcg' is written to the 'type' file, writing to and + reading from this file sets and gets the path to the memory + cgroup of the interest. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing 'Y' or 'N' to this file sets whether to filter out + pages that do or do not match to the 'type' and 'memcg_path', + respectively. Filter out means the action of the scheme will + not be applied to. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried Date: Mar 2022 Contact: SeongJae Park From 102afbecc3c22b85ca20af25f962961a6b0b59aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:32 +0000 Subject: [PATCH 683/737] mm/damon/vaddr: record appropriate folio size when the access is not found DAMON virtual address spaces monitoring operations set doesn't set folio size of the access checked address if access is not found. It could result in unnecessary and inefficient repeated check. Appropriately set the size regardless of access check result. Link: https://lkml.kernel.org/r/20230109213335.62525-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 4c953e4701f05..87d7d314b4351 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -450,10 +450,9 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, goto huge_out; if (pmd_young(*pmd) || !page_is_idle(page) || mmu_notifier_test_young(walk->mm, - addr)) { - *priv->page_sz = HPAGE_PMD_SIZE; + addr)) priv->young = true; - } + *priv->page_sz = HPAGE_PMD_SIZE; put_page(page); huge_out: spin_unlock(ptl); @@ -472,10 +471,9 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, if (!page) goto out; if (pte_young(*pte) || !page_is_idle(page) || - mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = PAGE_SIZE; + mmu_notifier_test_young(walk->mm, addr)) priv->young = true; - } + *priv->page_sz = PAGE_SIZE; put_page(page); out: pte_unmap_unlock(pte, ptl); @@ -502,10 +500,9 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, get_page(page); if (pte_young(entry) || !page_is_idle(page) || - mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = huge_page_size(h); + mmu_notifier_test_young(walk->mm, addr)) priv->young = true; - } + *priv->page_sz = huge_page_size(h); put_page(page); From f7e9a9f5d808f7080469e11043ffeb7a5b734102 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Mon, 9 Jan 2023 19:46:55 +0800 Subject: [PATCH 684/737] mm/damon/sysfs-schemes: use strscpy() to instead of strncpy() The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL-terminated strings. 
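As a rough sketch (with made-up names, not code taken from this patch) of the pattern the conversion moves to:

#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/string.h>
#include <linux/types.h>

static void example_copy_name(char *dst, size_t dst_size, const char *src)
{
	ssize_t ret;

	/*
	 * Old pattern being replaced: strncpy() does not NUL-terminate the
	 * destination when the source fills the buffer, so callers had to
	 * terminate by hand, e.g.:
	 *	strncpy(dst, src, dst_size - 1);
	 *	dst[dst_size - 1] = '\0';
	 */

	/* New pattern: single call, destination always NUL-terminated. */
	ret = strscpy(dst, src, dst_size);
	if (ret == -E2BIG)
		pr_warn("example: '%s' was truncated\n", src);
}

strscpy() returns the number of characters copied (excluding the trailing NUL), or -E2BIG when the source did not fit, which is why no manual termination is needed.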
Link: https://lkml.kernel.org/r/202301091946553770006@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: Yang Yang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0dabe3e2dc03..86edca66aab1a 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -353,8 +353,7 @@ static ssize_t memcg_path_store(struct kobject *kobj, if (!path) return -ENOMEM; - strncpy(path, buf, count); - path[count] = '\0'; + strscpy(path, buf, count + 1); filter->memcg_path = path; return count; } From af7822dfaa46911257084d34b51f6ef17def98d8 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Sat, 28 Jan 2023 17:11:48 +0800 Subject: [PATCH 685/737] Doc/damon: fix the data path error %s/modules/module/ Signed-off-by: Hui Su Reviewed-by: Alex Shi Reviewed-by: SeongJae Park Link: https://lore.kernel.org/r/Y9Tm1FiKBPKA2Tcx@localhost.localdomain Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/mm/damon/lru_sort.rst | 4 ++-- Documentation/admin-guide/mm/damon/reclaim.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index c09cace806516..7b0775d281b48 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -54,7 +54,7 @@ that is built with ``CONFIG_DAMON_LRU_SORT=y``. To let sysadmins enable or disable it and tune for the given system, DAMON_LRU_SORT utilizes module parameters. That is, you can put ``damon_lru_sort.=`` on the kernel boot command line or write -proper values to ``/sys/modules/damon_lru_sort/parameters/`` files. +proper values to ``/sys/module/damon_lru_sort/parameters/`` files. Below are the description of each parameter. @@ -283,7 +283,7 @@ doesn't make progress and therefore the free memory rate becomes lower than 20%, it asks DAMON_LRU_SORT to do nothing again, so that we can fall back to the LRU-list based page granularity reclamation. :: - # cd /sys/modules/damon_lru_sort/parameters + # cd /sys/module/damon_lru_sort/parameters # echo 500 > hot_thres_access_freq # echo 120000000 > cold_min_age # echo 10 > quota_ms diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 3fedcd0a794ec..3394191db9851 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -46,7 +46,7 @@ that is built with ``CONFIG_DAMON_RECLAIM=y``. To let sysadmins enable or disable it and tune for the given system, DAMON_RECLAIM utilizes module parameters. That is, you can put ``damon_reclaim.=`` on the kernel boot command line or write -proper values to ``/sys/modules/damon_reclaim/parameters/`` files. +proper values to ``/sys/module/damon_reclaim/parameters/`` files. Below are the description of each parameter. @@ -260,7 +260,7 @@ therefore the free memory rate becomes lower than 20%, it asks DAMON_RECLAIM to do nothing again, so that we can fall back to the LRU-list based page granularity reclamation. 
:: - # cd /sys/modules/damon_reclaim/parameters + # cd /sys/module/damon_reclaim/parameters # echo 30000000 > min_age # echo $((1 * 1024 * 1024 * 1024)) > quota_sz # echo 1000 > quota_reset_interval_ms From 7a5629520ed475c2b63c398d73dc6a1c38b0a420 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:53 +0000 Subject: [PATCH 686/737] mm/damon/core: update kernel-doc comments for DAMOS action supports of each DAMON operations set Patch series "mm/damon: trivial fixups". This patchset contains patches for trivial fixups of DAMON's documentation, MAINTAINERS section, and selftests. This patch (of 8): Supports of each DAMOS action are up to DAMON operations set implementation in use, but not well mentioned on the kernel-doc comments. Add the comment. Link: https://lkml.kernel.org/r/20230110190400.119388-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230110190400.119388-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 42cea3bf7319f..97df587410556 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,6 +91,12 @@ struct damon_target { * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions + * + * The support of each action is up to running &struct damon_operations. + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except + * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR + * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum + * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. */ enum damos_action { DAMOS_WILLNEED, From 1242f441453cf7754b4ba34ac787cc11e177298b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:54 +0000 Subject: [PATCH 687/737] mm/damon/core: update kernel-doc comments for DAMOS filters supports of each DAMON operations set Supports of each DAMOS filter type are up to DAMON operations set implementation in use, but not well mentioned on the kernel-doc comments. Add the comment. Link: https://lkml.kernel.org/r/20230110190400.119388-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 97df587410556..f66fa40e23779 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -227,6 +227,11 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @NR_DAMOS_FILTER_TYPES: Number of filter types. + * + * The support of each filter type is up to running &struct damon_operations. + * &enum DAMON_OPS_PADDR is supporting all filter types, while + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any + * filter types. */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, From 1223ca5a5d7614060ad44a4f8db696857b2868b4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:55 +0000 Subject: [PATCH 688/737] Docs/mm/damon/index: mention DAMOS on the intro What DAMON aims to do is not only access monitoring but efficient and effective access-aware system operations. And DAMon-based Operation Schemes (DAMOS) is the important feature of DAMON for the goal. 
Make the intro of DAMON documentation to emphasize the goal and mention DAMOS. Link: https://lkml.kernel.org/r/20230110190400.119388-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/vm/damon/index.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst index 48c0bbff98b2f..2983699c12ea1 100644 --- a/Documentation/vm/damon/index.rst +++ b/Documentation/vm/damon/index.rst @@ -4,8 +4,9 @@ DAMON: Data Access MONitor ========================== -DAMON is a data access monitoring framework subsystem for the Linux kernel. -The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it +DAMON is a Linux kernel subsystem that provides a framework for data access +monitoring and the monitoring results based system operations. The core +monitoring mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *accurate* (the monitoring output is useful enough for DRAM level memory management; It might not appropriate for CPU Cache levels, though), @@ -14,12 +15,16 @@ The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *scalable* (the upper-bound of the overhead is in constant range regardless of the size of target workloads). -Using this framework, therefore, the kernel's memory management mechanisms can -make advanced decisions. Experimental memory management optimization works -that incurring high data accesses monitoring overhead could implemented again. -In user space, meanwhile, users who have some special workloads can write -personalized applications for better understanding and optimizations of their -workloads and systems. +Using this framework, therefore, the kernel can operate system in an +access-aware fashion. Because the features are also exposed to the user space, +users who have special information about their workloads can write personalized +applications for better understanding and optimizations of their workloads and +systems. + +For easier development of such systems, DAMON provides a feature called DAMOS +(DAMon-based Operation Schemes) in addition to the monitoring. Using the +feature, DAMON users in both kernel and user spaces can do access-aware system +operations with no code but simple configurations. .. toctree:: :maxdepth: 2 From ff6de4428c9b0ec1980d83deebe4ea86fb399633 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:56 +0000 Subject: [PATCH 689/737] Docs/admin-guide/mm/damon/usage: update DAMOS actions/filters supports of each DAMON operations set Supports of each DAMOS action and filters are up to DAMON operations set implementation, but it's not mentioned in detail on the documentation. Update the information on the usage document. 
Link: https://lkml.kernel.org/r/20230110190400.119388-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 41 +++++++++++++------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 8a6a963eff25b..18bb283b082ab 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -279,14 +279,25 @@ The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords that can be written to and read from the file and their meaning are as below. - - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD`` - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` +Note that support of each action depends on the running DAMON operations set +`implementation `. + + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. - ``lru_prio``: Prioritize the region on its LRU lists. + Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. - - ``stat``: Do nothing but count the statistics + Supported by ``paddr`` operations set. + - ``stat``: Do nothing but count the statistics. + Supported by all operations sets. schemes//access_pattern/ --------------------------- @@ -388,8 +399,8 @@ pages of all memory cgroups except ``/having_care_already``.:: echo /having_care_already > 1/memcg_path echo N > 1/matching -Note that filters could be ignored depend on the running DAMON operations set -`implementation `. +Note that filters are currently supported only when ``paddr`` +`implementation ` is being used. .. _sysfs_schemes_stats: @@ -618,11 +629,15 @@ The ```` is a predefined integer for memory management actions, which DAMON will apply to the regions having the target access pattern. The supported numbers and their meanings are as below. - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if + ``target`` is ``paddr``. + - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if + ``target`` is ``paddr``. + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. 
Ignored if + ``target`` is ``paddr``. + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if + ``target`` is ``paddr``. - 5: Do nothing but count the statistics Quota From 282280d6a8487c4ebbe71d23b8aee83f199be064 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:57 +0000 Subject: [PATCH 690/737] Docs/mm/damon: add a maintainer-profile for DAMON Document the basic policies and expectations for DAMON development. Link: https://lkml.kernel.org/r/20230110190400.119388-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/vm/damon/index.rst | 1 + Documentation/vm/damon/maintainer-profile.rst | 62 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 Documentation/vm/damon/maintainer-profile.rst diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst index 2983699c12ea1..5e0a505835005 100644 --- a/Documentation/vm/damon/index.rst +++ b/Documentation/vm/damon/index.rst @@ -32,3 +32,4 @@ operations with no code but simple configurations. faq design api + maintainer-profile diff --git a/Documentation/vm/damon/maintainer-profile.rst b/Documentation/vm/damon/maintainer-profile.rst new file mode 100644 index 0000000000000..24a202f03de82 --- /dev/null +++ b/Documentation/vm/damon/maintainer-profile.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +DAMON Maintainer Entry Profile +============================== + +The DAMON subsystem covers the files that listed in 'DATA ACCESS MONITOR' +section of 'MAINTAINERS' file. + +The mailing lists for the subsystem are damon@lists.linux.dev and +linux-mm@kvack.org. Patches should be made against the mm-unstable tree [1]_ +whenever possible and posted to the mailing lists. + +SCM Trees +--------- + +There are multiple Linux trees for DAMON development. Patches under +development or testing are queued in damon/next [2]_ by the DAMON maintainer. +Suffieicntly reviewed patches will be queued in mm-unstable [1]_ by the memory +management subsystem maintainer. After more sufficient tests, the patches will +be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the +memory management subsystem maintainer. + +Note again the patches for review should be made against the mm-unstable +tree[1] whenever possible. damon/next is only for preview of others' works in +progress. + +Submit checklist addendum +------------------------- + +When making DAMON changes, you should do below. + +- Build changes related outputs including kernel and documents. +- Ensure the builds introduce no new errors or warnings. +- Run and ensure no new failures for DAMON selftests [4]_ and kunittests [5]_ . + +Further doing below and putting the results will be helpful. + +- Run damon-tests/corr [6]_ for normal changes. +- Run damon-tests/perf [7]_ for performance changes. + +Key cycle dates +--------------- + +Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and +mm-stable[3] trees depend on the memory management subsystem maintainer. + +Review cadence +-------------- + +The DAMON maintainer does the work on the usual work hour (09:00 to 17:00, +Mon-Fri) in PST. The response to patches will occasionally be slow. Do not +hesitate to send a ping if you have not heard back within a week of sending a +patch. + + +.. [1] https://git.kernel.org/akpm/mm/h/mm-unstable +.. [2] https://git.kernel.org/sj/h/damon/next +.. [3] https://git.kernel.org/akpm/mm/h/mm-stable +.. 
[4] https://github.com/awslabs/damon-tests/blob/master/corr/run.sh#L49 +.. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh +.. [6] https://github.com/awslabs/damon-tests/tree/master/corr +.. [7] https://github.com/awslabs/damon-tests/tree/master/perf From a363c32046e4f7cccc77aed233a1640b0c995c3a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:59 +0000 Subject: [PATCH 691/737] selftests/damon/sysfs: hide expected write failures DAMON selftests for sysfs (sysfs.sh) tests if some writes to DAMON sysfs interface files fails as expected. It makes the test results noisy with the failure error message because it tests a number of such failures. Redirect the expected failure error messages to /dev/null to make the results clean. Link: https://lkml.kernel.org/r/20230110190400.119388-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index a00336ffdcad4..bcd4734ca0943 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -24,7 +24,7 @@ ensure_write_fail() content=$2 reason=$3 - if echo "$content" > "$file" + if (echo "$content" > "$file") 2> /dev/null then echo "writing $content to $file succeed ($fail_reason)" echo "expected failure because $reason" From c94457ea2c73657e50187d4bfb8d197a608eb631 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:04:00 +0000 Subject: [PATCH 692/737] selftests/damon/debugfs_rm_non_contexts: hide expected write error messages A selftest case for DAMON debugfs interface has a test for expected failure. To make the test output clean, hide the expected failure error message. Link: https://lkml.kernel.org/r/20230110190400.119388-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/debugfs_rm_non_contexts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh index 48b7af6b022cb..f3ffeb1343cf2 100644 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -10,7 +10,7 @@ dmesg -C for file in "$DBGFS/"* do - echo "$(basename "$f")" > "$DBGFS/rm_contexts" + (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null if dmesg | grep -q BUG then dmesg From 02331273383cc8f2df69f906faf2068ce4835fe0 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 16 Jan 2023 14:23:47 +0800 Subject: [PATCH 693/737] mm/damon/core: skip apply schemes if empty Sometimes there is no scheme in damon's context, for example just use damo record to monitor workload's data access pattern. If current damon context doesn't have any scheme in the list, kdamond has no need to iterate over list of all targets and regions but do nothing. So, skip apply schemes when ctx->schemes is empty. 
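To make the saving concrete, below is a minimal standalone sketch of the pattern this patch applies (the actual one-line kernel change follows in the diff). The struct and function names here are simplified stand-ins for damon_ctx, damon_target and damon_region, not the real implementation: the point is only that a cheap emptiness test on the scheme list avoids the full per-target, per-region walk.

/*
 * Standalone sketch (not kernel code) of guarding an O(targets x regions)
 * walk with a cheap "any scheme installed?" check, as this patch does with
 * list_empty(&ctx->schemes) before kdamond_apply_schemes().
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct region { struct region *next; };
struct target { struct target *next; struct region *regions; };
struct scheme { struct scheme *next; };
struct ctx    { struct target *targets; struct scheme *schemes; };

static bool schemes_empty(const struct ctx *ctx)
{
	return ctx->schemes == NULL;	/* analogous to list_empty(&ctx->schemes) */
}

static void apply_schemes(struct ctx *ctx)
{
	unsigned long visited = 0;

	/* mirrors the shape of kdamond_apply_schemes(): visit every region */
	for (struct target *t = ctx->targets; t; t = t->next)
		for (struct region *r = t->regions; r; r = r->next)
			visited++;	/* per-region scheme work would go here */
	printf("visited %lu regions\n", visited);
}

int main(void)
{
	struct region regions[3] = { { &regions[1] }, { &regions[2] }, { NULL } };
	struct target target = { NULL, regions };
	struct ctx ctx = { &target, NULL };	/* recording-only setup: no scheme */

	if (!schemes_empty(&ctx))	/* the guard this patch adds */
		apply_schemes(&ctx);
	else
		printf("no scheme installed: skip the region walk\n");

	return 0;
}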
Link: https://lkml.kernel.org/r/20230116062347.1148553-1-huaisheng.ye@intel.com Signed-off-by: Huaisheng Ye Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 1bf0654ae189d..2db8c53491ca8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1269,7 +1269,8 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - kdamond_apply_schemes(ctx); + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) From 6780f210f49e684b48f7e2c44c0efc6794f25764 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:29 +0000 Subject: [PATCH 694/737] mm/damon: update comments in damon.h for damon_attrs Patch series "mm/damon: misc fixes". This patchset contains three miscellaneous simple fixes for DAMON online tuning. This patch (of 3): Commit cbeaa77b0449 ("mm/damon/core: use a dedicated struct for monitoring attributes") moved monitoring intervals from damon_ctx to a new struct, damon_attrs, but a comment in the header file has not updated for the change. Update it. Link: https://lkml.kernel.org/r/20230119013831.1911-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230119013831.1911-2-sj@kernel.org Fixes: cbeaa77b0449 ("mm/damon/core: use a dedicated struct for monitoring attributes") Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f66fa40e23779..b5d5e9d6d8358 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -354,10 +354,10 @@ struct damon_ctx; * users should register the low level operations for their target address * space and usecase via the &damon_ctx.ops. Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.ops_update_interval, and + * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each - * &damon_ctx.aggr_interval. + * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after + * each &damon_attrs.aggr_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. From 3aab581af0d5a475873dff1bd7202d7eb2ee863f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:30 +0000 Subject: [PATCH 695/737] mm/damon/core: update monitoring results for new monitoring attributes region->nr_accesses is the number of sampling intervals in the last aggregation interval that access to the region has found, and region->age is the number of aggregation intervals that its access pattern has maintained. Hence, the real meaning of the two fields' values is depending on current sampling and aggregation intervals. This means the values need to be updated for every sampling and/or aggregation intervals updates. As DAMON core doesn't, it is a duty of in-kernel DAMON framework applications like DAMON sysfs interface, or the userspace users. 
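To make the interval-relative meaning above concrete, the arithmetic is the one this patch implements with damon_accesses_bp_to_nr_accesses(), damon_nr_accesses_to_accesses_bp() and damon_age_for_new_attrs(). The standalone program below is only an illustration with simplified names, not kernel code; the numbers match the core-test expectations added later in this series.

/*
 * Standalone sketch of the rescaling: nr_accesses is converted to an access
 * ratio in basis points of the old maximum (aggr_interval / sample_interval
 * samples per aggregation), then back to a count under the new maximum; age
 * is rescaled by the ratio of old to new aggregation intervals.
 */
#include <stdio.h>

struct attrs { unsigned int sample_interval, aggr_interval; };

static unsigned int conv_nr_accesses(unsigned int nr_accesses,
				     const struct attrs *old_a,
				     const struct attrs *new_a)
{
	unsigned int old_max = old_a->aggr_interval / old_a->sample_interval;
	unsigned int new_max = new_a->aggr_interval / new_a->sample_interval;
	unsigned int ratio_bp = nr_accesses * 10000 / old_max;	/* basis points */

	return ratio_bp * new_max / 10000;
}

static unsigned int conv_age(unsigned int age, const struct attrs *old_a,
			     const struct attrs *new_a)
{
	return age * old_a->aggr_interval / new_a->aggr_interval;
}

int main(void)
{
	struct attrs old_a = { .sample_interval = 10,  .aggr_interval = 1000 };
	struct attrs new_a = { .sample_interval = 100, .aggr_interval = 10000 };

	/* 15 hits out of 100 samples; pattern maintained for 20 aggregations */
	printf("nr_accesses: 15 -> %u\n", conv_nr_accesses(15, &old_a, &new_a));
	printf("age:         20 -> %u\n", conv_age(20, &old_a, &new_a));
	/* prints 15 (same 15 percent ratio under the new intervals) and 2 */
	return 0;
}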
Handling it in userspace or in-kernel DAMON application is complicated, inefficient, and repetitive compared to doing the update in DAMON core. Do the update in DAMON core. Link: https://lkml.kernel.org/r/20230119013831.1911-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2db8c53491ca8..d9ef62047bf5f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -465,6 +465,76 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static unsigned int damon_age_for_new_attrs(unsigned int age, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return age * old_attrs->aggr_interval / new_attrs->aggr_interval; +} + +/* convert access ratio in bp (per 10,000) to nr_accesses */ +static unsigned int damon_accesses_bp_to_nr_accesses( + unsigned int accesses_bp, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return accesses_bp * max_nr_accesses / 10000; +} + +/* convert nr_accesses to access ratio in bp (per 10,000) */ +static unsigned int damon_nr_accesses_to_accesses_bp( + unsigned int nr_accesses, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return nr_accesses * 10000 / max_nr_accesses; +} + +static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return damon_accesses_bp_to_nr_accesses( + damon_nr_accesses_to_accesses_bp( + nr_accesses, old_attrs), + new_attrs); +} + +static void damon_update_monitoring_result(struct damon_region *r, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, + old_attrs, new_attrs); + r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); +} + +/* + * region->nr_accesses is the number of sampling intervals in the last + * aggregation interval that access to the region has found, and region->age is + * the number of aggregation intervals that its access pattern has maintained. + * For the reason, the real meaning of the two fields depend on current + * sampling interval and aggregation interval. This function updates + * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. + */ +static void damon_update_monitoring_results(struct damon_ctx *ctx, + struct damon_attrs *new_attrs) +{ + struct damon_attrs *old_attrs = &ctx->attrs; + struct damon_target *t; + struct damon_region *r; + + /* if any interval is zero, simply forgive conversion */ + if (!old_attrs->sample_interval || !old_attrs->aggr_interval || + !new_attrs->sample_interval || + !new_attrs->aggr_interval) + return; + + damon_for_each_target(t, ctx) + damon_for_each_region(r, t) + damon_update_monitoring_result( + r, old_attrs, new_attrs); +} + /** * damon_set_attrs() - Set attributes for the monitoring. 
* @ctx: monitoring context @@ -482,6 +552,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; + damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; return 0; } From 747f941bd7d20fce6792e2442999c473024fab24 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:31 +0000 Subject: [PATCH 696/737] mm/damon/core-test: add a test for damon_update_monitoring_results() Add a simple unit test for damon_update_monitoring_results() function. Link: https://lkml.kernel.org/r/20230119013831.1911-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 3db9b73687562..fae64d32b9257 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -289,6 +289,35 @@ static void damon_test_set_regions(struct kunit *test) damon_destroy_target(t); } +static void damon_test_update_monitoring_result(struct kunit *test) +{ + struct damon_attrs old_attrs = { + .sample_interval = 10, .aggr_interval = 1000,}; + struct damon_attrs new_attrs; + struct damon_region *r = damon_new_region(3, 7); + + r->nr_accesses = 15; + r->age = 20; + + new_attrs = (struct damon_attrs){ + .sample_interval = 100, .aggr_interval = 10000,}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 1000}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 100}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 20); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -299,6 +328,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), KUNIT_CASE(damon_test_set_regions), + KUNIT_CASE(damon_test_update_monitoring_result), {}, }; From 809fa16531f2d679c8f568c9786865f28569915a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:07 +0000 Subject: [PATCH 697/737] Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice Patch series "mm/damon: deprecate DAMON debugfs interface". DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And v6.1.y has been announced to be an LTS[1]. Though the announcement was there for a while, some people might not have noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, keep the code and documents with warning messages and contacts to ask helps for the deprecation. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c This patch (of 3): DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. 
Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, note DAMON debugfs interface as deprecated, and contacts to ask helps on the document. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c Link: https://lkml.kernel.org/r/20230209192009.7885-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230209192009.7885-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 18bb283b082ab..1772770eedbe4 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -25,10 +25,12 @@ DAMON provides below interfaces for different users. interface provides only simple :ref:`statistics ` for the monitoring results. For detailed monitoring results, DAMON provides a :ref:`tracepoint `. -- *debugfs interface.* +- *debugfs interface. (DEPRECATED!)* :ref:`This ` is almost identical to :ref:`sysfs interface - `. This will be removed after next LTS kernel is released, - so users should move to the :ref:`sysfs interface `. + `. This is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -487,13 +489,17 @@ the files as above. Above is only for an example. .. _debugfs_interface: -debugfs Interface -================= +debugfs Interface (DEPRECATED!) +=============================== .. note:: - DAMON debugfs interface will be removed after next LTS kernel is released, so - users should move to the :ref:`sysfs interface `. + THIS IS DEPRECATED! + + DAMON debugfs interface is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and From 520149157cc1f4b4b662618b34ba3b5a7f8beacc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:08 +0000 Subject: [PATCH 698/737] mm/damon/Kconfig: add DAMON debugfs interface deprecation notice DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, note DAMON debugfs interface as deprecated, and contacts to ask helps on the Kconfig. 
[1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c Link: https://lkml.kernel.org/r/20230209192009.7885-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 7821fcb3f2586..436c6b4cb5ec5 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -60,7 +60,7 @@ config DAMON_SYSFS the interface for arbitrary data access monitoring. config DAMON_DBGFS - bool "DAMON debugfs interface" + bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help This builds the debugfs interface for DAMON. The user space admins @@ -68,8 +68,9 @@ config DAMON_DBGFS If unsure, say N. - This will be removed after >5.15.y LTS kernel is released, so users - should move to the sysfs interface (DAMON_SYSFS). + This is deprecated, so users should move to the sysfs interface + (DAMON_SYSFS). If you depend on this and cannot move, please report + your usecase to damon@lists.linux.dev and linux-mm@kvack.org. config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS From b9cc26d9607a020b9a5bf63ee16df2535973d072 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:09 +0000 Subject: [PATCH 699/737] mm/damon/dbgfs: print DAMON debugfs interface deprecation message DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, warn DAMON debugfs interface deprecation with contacts to ask helps when any DAMON debugfs interface file is opened. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c [sj@kernel.org: split DAMON debugfs file open warning message, per Randy] Link: https://lkml.kernel.org/r/20230209192009.7885-4-sj@kernel.org Link: https://lkml.kernel.org/r/20230210044838.63723-4-sj@kernel.org Link: https://lkml.kernel.org/r/20230209192009.7885-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index b3f454a5c6828..124f0f8c97b75 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -20,6 +20,14 @@ static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; static DEFINE_MUTEX(damon_dbgfs_lock); +static void damon_dbgfs_warn_deprecation(void) +{ + pr_warn_once("DAMON debugfs interface is deprecated, " + "so users should move to DAMON_SYSFS. If you cannot, " + "please report your usecase to damon@lists.linux.dev and " + "linux-mm@kvack.org.\n"); +} + /* * Returns non-empty string on success, negative error code otherwise. 
*/ @@ -711,6 +719,8 @@ static ssize_t dbgfs_kdamond_pid_read(struct file *file, static int damon_dbgfs_open(struct inode *inode, struct file *file) { + damon_dbgfs_warn_deprecation(); + file->private_data = inode->i_private; return nonseekable_open(inode, file); @@ -1039,15 +1049,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return ret; } +static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) +{ + damon_dbgfs_warn_deprecation(); + return nonseekable_open(inode, file); +} + static const struct file_operations mk_contexts_fops = { + .open = damon_dbgfs_static_file_open, .write = dbgfs_mk_context_write, }; static const struct file_operations rm_contexts_fops = { + .open = damon_dbgfs_static_file_open, .write = dbgfs_rm_context_write, }; static const struct file_operations monitor_on_fops = { + .open = damon_dbgfs_static_file_open, .read = dbgfs_monitor_on_read, .write = dbgfs_monitor_on_write, }; From 26a61af86bd334bdeaa9ea5039758bbc3c0f5c22 Mon Sep 17 00:00:00 2001 From: Rishabh Bhatnagar Date: Wed, 17 May 2023 23:34:54 +0000 Subject: [PATCH 700/737] Update smartpqi driver to latest version. Update out of tree smartpqi driver to v2.1.22-040. Signed-off-by: Rishabh Bhatnagar --- drivers/amazon/scsi/smartpqi/Makefile | 2 + drivers/amazon/scsi/smartpqi/smartpqi.h | 15 +- drivers/amazon/scsi/smartpqi/smartpqi_init.c | 466 +++++++++--------- .../scsi/smartpqi/smartpqi_kernel_compat.c | 3 + .../scsi/smartpqi/smartpqi_kernel_compat.h | 56 ++- .../scsi/smartpqi/smartpqi_sas_transport.c | 30 +- 6 files changed, 306 insertions(+), 266 deletions(-) diff --git a/drivers/amazon/scsi/smartpqi/Makefile b/drivers/amazon/scsi/smartpqi/Makefile index 64a48e7248a99..f4c5373e4513b 100644 --- a/drivers/amazon/scsi/smartpqi/Makefile +++ b/drivers/amazon/scsi/smartpqi/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 + obj-$(CONFIG_AMAZON_SCSI_SMARTPQI) += smartpqi.o smartpqi-objs := smartpqi_init.o smartpqi_sis.o smartpqi_sas_transport.o smartpqi_kernel_compat.o EXTRA_CFLAGS += -DKCLASS5D EXTRA_CFLAGS += -DKFEATURE_HAS_SCSI_CMD_PRIV -DKFEATURE_HAS_HOST_TAGSET_SUPPORT + diff --git a/drivers/amazon/scsi/smartpqi/smartpqi.h b/drivers/amazon/scsi/smartpqi/smartpqi.h index e120fd2e1b0a8..942682598a107 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi.h +++ b/drivers/amazon/scsi/smartpqi/smartpqi.h @@ -301,8 +301,8 @@ struct pqi_raid_path_request { u8 additional_cdb_bytes_usage : 3; u8 reserved5 : 3; u8 cdb[16]; - u8 reserved6[11]; - u8 ml_device_lun_number; + u8 reserved6[11]; + u8 ml_device_lun_number; __le32 timeout; struct pqi_sg_descriptor sg_descriptors[PQI_MAX_EMBEDDED_SG_DESCRIPTORS]; }; @@ -476,8 +476,8 @@ struct pqi_task_management_request { struct pqi_iu_header header; __le16 request_id; __le16 nexus_id; - u8 reserved; - u8 ml_device_lun_number; + u8 reserved; + u8 ml_device_lun_number; __le16 timeout; u8 lun_number[8]; __le16 protocol_specific; @@ -1127,6 +1127,7 @@ struct pqi_scsi_dev { u8 volume_offline : 1; u8 rescan : 1; u8 ignore_device : 1; + u8 erase_in_progress : 1; bool aio_enabled; /* only valid for physical disks */ bool in_remove; bool device_offline; @@ -1165,8 +1166,8 @@ struct pqi_scsi_dev { struct list_head delete_list_entry; struct pqi_stream_data stream_data[NUM_STREAMS_PER_LUN]; - atomic_t scsi_cmds_outstanding[PQI_MAX_LUNS_PER_DEVICE]; - atomic_t raid_bypass_cnt; + atomic_t scsi_cmds_outstanding[PQI_MAX_LUNS_PER_DEVICE]; + unsigned int raid_bypass_cnt; }; /* VPD inquiry pages */ @@ -1365,7 +1366,7 @@ struct pqi_ctrl_info { u8 
tmf_iu_timeout_supported : 1; u8 firmware_triage_supported : 1; u8 rpl_extended_format_4_5_supported : 1; - u8 multi_lun_device_supported : 1; + u8 multi_lun_device_supported : 1; u8 enable_r1_writes : 1; u8 enable_r5_writes : 1; u8 enable_r6_writes : 1; diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_init.c b/drivers/amazon/scsi/smartpqi/smartpqi_init.c index 1358f722e6811..cd17a128c5133 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_init.c +++ b/drivers/amazon/scsi/smartpqi/smartpqi_init.c @@ -42,11 +42,11 @@ #define BUILD_TIMESTAMP #endif -#define DRIVER_VERSION "2.1.20-035" +#define DRIVER_VERSION "2.1.22-040" #define DRIVER_MAJOR 2 #define DRIVER_MINOR 1 -#define DRIVER_RELEASE 20 -#define DRIVER_REVISION 29 +#define DRIVER_RELEASE 22 +#define DRIVER_REVISION 32 #define DRIVER_NAME "Microchip SmartPQI Driver (v" \ DRIVER_VERSION BUILD_TIMESTAMP ")" @@ -61,10 +61,10 @@ MODULE_AUTHOR("Microchip"); #if TORTUGA MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version " - DRIVER_VERSION " (d-eaf4713/s-2aee658) (d147/s325)"); + DRIVER_VERSION " (d-b7f1535/s-ed725ab)" " (d147/s325)"); #else MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version " - DRIVER_VERSION " (d-eaf4713/s-2aee658)"); + DRIVER_VERSION " (d-b7f1535/s-ed725ab)"); #endif MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL"); @@ -148,42 +148,42 @@ static unsigned int pqi_supported_event_types[] = { static int pqi_disable_device_id_wildcards; module_param_named(disable_device_id_wildcards, - pqi_disable_device_id_wildcards, int, S_IRUGO | S_IWUSR); + pqi_disable_device_id_wildcards, int, 0644); MODULE_PARM_DESC(disable_device_id_wildcards, "Disable device ID wildcards."); static int pqi_disable_heartbeat; module_param_named(disable_heartbeat, - pqi_disable_heartbeat, int, S_IRUGO | S_IWUSR); + pqi_disable_heartbeat, int, 0644); MODULE_PARM_DESC(disable_heartbeat, "Disable heartbeat."); static int pqi_disable_ctrl_shutdown; module_param_named(disable_ctrl_shutdown, - pqi_disable_ctrl_shutdown, int, S_IRUGO | S_IWUSR); + pqi_disable_ctrl_shutdown, int, 0644); MODULE_PARM_DESC(disable_ctrl_shutdown, "Disable controller shutdown when controller locked up."); static char *pqi_lockup_action_param; module_param_named(lockup_action, - pqi_lockup_action_param, charp, S_IRUGO | S_IWUSR); + pqi_lockup_action_param, charp, 0644); MODULE_PARM_DESC(lockup_action, "Action to take when controller locked up.\n" "\t\tSupported: none, reboot, panic\n" "\t\tDefault: none"); static int pqi_expose_ld_first; module_param_named(expose_ld_first, - pqi_expose_ld_first, int, S_IRUGO | S_IWUSR); + pqi_expose_ld_first, int, 0644); MODULE_PARM_DESC(expose_ld_first, "Expose logical drives before physical drives."); static int pqi_hide_vsep; module_param_named(hide_vsep, - pqi_hide_vsep, int, S_IRUGO | S_IWUSR); + pqi_hide_vsep, int, 0644); MODULE_PARM_DESC(hide_vsep, "Hide the virtual SEP for direct attached drives."); static int pqi_limit_xfer_to_1MB; module_param_named(limit_xfer_size_to_1MB, - pqi_limit_xfer_to_1MB, int, S_IRUGO | S_IWUSR); + pqi_limit_xfer_to_1MB, int, 0644); MODULE_PARM_DESC(limit_xfer_size_to_1MB, "Limit max transfer size to 1MB."); static int pqi_disable_managed_interrupts; @@ -527,6 +527,36 @@ static inline void pqi_clear_soft_reset_status(struct pqi_ctrl_info *ctrl_info) writeb(status, ctrl_info->soft_reset_status); } +static inline bool pqi_is_io_high_priority(struct pqi_scsi_dev *device, struct scsi_cmnd *scmd) +{ + bool io_high_prio; + int priority_class; + + io_high_prio = false; + + 
if (device->ncq_prio_enable) { + priority_class = + IOPRIO_PRIO_CLASS(req_get_ioprio(PQI_SCSI_REQUEST(scmd))); + if (priority_class == IOPRIO_CLASS_RT) { + /* Set NCQ priority for read/write commands. */ + switch (scmd->cmnd[0]) { + case WRITE_16: + case READ_16: + case WRITE_12: + case READ_12: + case WRITE_10: + case READ_10: + case WRITE_6: + case READ_6: + io_high_prio = true; + break; + } + } + } + + return io_high_prio; +} + static int pqi_map_single(struct pci_dev *pci_dev, struct pqi_sg_descriptor *sg_descriptor, void *buffer, size_t buffer_length, enum dma_data_direction data_direction) @@ -586,10 +616,6 @@ static int pqi_build_raid_path_request(struct pqi_ctrl_info *ctrl_info, cdb = request->cdb; switch (cmd) { - case TEST_UNIT_READY: - request->data_direction = SOP_READ_FLAG; - cdb[0] = TEST_UNIT_READY; - break; case INQUIRY: request->data_direction = SOP_READ_FLAG; cdb[0] = INQUIRY; @@ -693,7 +719,8 @@ static inline struct pqi_io_request *pqi_alloc_io_request(struct pqi_ctrl_info * io_request = pqi_get_io_request(ctrl_info, scmd); - pqi_reinit_io_request(io_request); + if (io_request) + pqi_reinit_io_request(io_request); return io_request; } @@ -1557,6 +1584,7 @@ static void pqi_get_volume_status(struct pqi_ctrl_info *ctrl_info, #define PQI_DEVICE_NCQ_PRIO_SUPPORTED 0x01 #define PQI_DEVICE_PHY_MAP_SUPPORTED 0x10 +#define PQI_DEVICE_ERASE_IN_PROGRESS 0x10 static int pqi_get_physical_device_info(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device, @@ -1605,6 +1633,8 @@ static int pqi_get_physical_device_info(struct pqi_ctrl_info *ctrl_info, ((get_unaligned_le32(&id_phys->misc_drive_flags) >> 16) & PQI_DEVICE_NCQ_PRIO_SUPPORTED); + device->erase_in_progress = !!(get_unaligned_le16(&id_phys->extra_physical_drive_flags) & PQI_DEVICE_ERASE_IN_PROGRESS); + return 0; } @@ -1650,7 +1680,7 @@ static int pqi_get_logical_device_info(struct pqi_ctrl_info *ctrl_info, /* * Prevent adding drive to OS for some corner cases such as a drive - * undergoing a sanitize operation. Some OSes will continue to poll + * undergoing a sanitize (erase) operation. Some OSes will continue to poll * the drive until the sanitize completes, which can take hours, * resulting in long bootup delays. Commands such as TUR, READ_CAP * are allowed, but READ/WRITE cause check condition. So the OS @@ -1658,74 +1688,9 @@ static int pqi_get_logical_device_info(struct pqi_ctrl_info *ctrl_info, * Note: devices that have completed sanitize must be re-enabled * using the management utility. */ -static bool pqi_keep_device_offline(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device) +static inline bool pqi_keep_device_offline(struct pqi_scsi_dev *device) { - u8 scsi_status; - int rc; - enum dma_data_direction dir; - char *buffer; - int buffer_length = 64; - size_t sense_data_length; - struct scsi_sense_hdr sshdr; - struct pqi_raid_path_request request; - struct pqi_raid_error_info error_info; - bool offline = false; /* Assume keep online */ - - /* Do not check controllers. */ - if (pqi_is_hba_lunid(device->scsi3addr)) - return false; - - /* Do not check LVs. 
*/ - if (pqi_is_logical_device(device)) - return false; - - buffer = kmalloc(buffer_length, GFP_KERNEL); - if (!buffer) - return false; /* Assume not offline */ - - /* Check for SANITIZE in progress using TUR */ - rc = pqi_build_raid_path_request(ctrl_info, &request, - TEST_UNIT_READY, RAID_CTLR_LUNID, buffer, - buffer_length, 0, &dir); - if (rc) - goto out; /* Assume not offline */ - - memcpy(request.lun_number, device->scsi3addr, sizeof(request.lun_number)); - - rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, &error_info); - - if (rc) - goto out; /* Assume not offline */ - - scsi_status = error_info.status; - sense_data_length = get_unaligned_le16(&error_info.sense_data_length); - if (sense_data_length == 0) - sense_data_length = - get_unaligned_le16(&error_info.response_data_length); - if (sense_data_length) { - if (sense_data_length > sizeof(error_info.data)) - sense_data_length = sizeof(error_info.data); - - /* - * Check for sanitize in progress: asc:0x04, ascq: 0x1b - */ - if (scsi_status == SAM_STAT_CHECK_CONDITION && - scsi_normalize_sense(error_info.data, - sense_data_length, &sshdr) && - sshdr.sense_key == NOT_READY && - sshdr.asc == 0x04 && - sshdr.ascq == 0x1b) { - - device->device_offline = true; - offline = true; - goto out; /* Keep device offline */ - } - } - -out: - kfree(buffer); - return offline; + return device->erase_in_progress; } static int pqi_get_device_info_phys_logical(struct pqi_ctrl_info *ctrl_info, @@ -1850,7 +1815,7 @@ static void pqi_show_volume_status(struct pqi_ctrl_info *ctrl_info, status = "Volume status not available"; break; default: - snprintf(unknown_state_buffer, sizeof(unknown_state_buffer), + scnprintf(unknown_state_buffer, sizeof(unknown_state_buffer), unknown_state_str, device->volume_status); status = unknown_state_buffer; break; @@ -2471,10 +2436,6 @@ static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) if (!pqi_is_supported_device(device)) continue; - /* Do not present disks that the OS cannot fully probe */ - if (pqi_keep_device_offline(ctrl_info, device)) - continue; - /* Gather information about the device. */ rc = pqi_get_device_info(ctrl_info, device, id_phys); if (rc == -ENOMEM) { @@ -2497,6 +2458,10 @@ static int pqi_update_scsi_devices(struct pqi_ctrl_info *ctrl_info) continue; } + /* Do not present disks that the OS cannot fully probe. */ + if (pqi_keep_device_offline(device)) + continue; + pqi_assign_bus_target_lun(device); if (device->is_physical_device) { @@ -2803,9 +2768,9 @@ static int pqi_calc_aio_r5_or_r6(struct pqi_scsi_dev_raid_map_data *rmd, /* Verify first and last block are in same RAID group. 
*/ rmd->stripesize = rmd->blocks_per_row * rmd->layout_map_count; rmd->first_group = (rmd->first_block % - rmd->stripesize) / rmd->blocks_per_row; + rmd->stripesize) / rmd->blocks_per_row; rmd->last_group = (rmd->last_block % - rmd->stripesize) / rmd->blocks_per_row; + rmd->stripesize) / rmd->blocks_per_row; if (rmd->first_group != rmd->last_group) return PQI_RAID_BYPASS_INELIGIBLE; @@ -3556,7 +3521,7 @@ static void pqi_process_soft_reset(struct pqi_ctrl_info *ctrl_info) /* fall through */ case RESET_INITIATE_DRIVER: dev_info(&ctrl_info->pci_dev->dev, - "Online Firmware Activation: resetting controller\n"); + "Online Firmware Activation: resetting controller\n"); sis_soft_reset(ctrl_info); /* fall through */ case RESET_INITIATE_FIRMWARE: @@ -3566,12 +3531,12 @@ static void pqi_process_soft_reset(struct pqi_ctrl_info *ctrl_info) pqi_ofa_free_host_buffer(ctrl_info); pqi_ctrl_ofa_done(ctrl_info); dev_info(&ctrl_info->pci_dev->dev, - "Online Firmware Activation: %s\n", - rc == 0 ? "SUCCESS" : "FAILED"); + "Online Firmware Activation: %s\n", + rc == 0 ? "SUCCESS" : "FAILED"); break; case RESET_ABORT: dev_info(&ctrl_info->pci_dev->dev, - "Online Firmware Activation ABORTED\n"); + "Online Firmware Activation ABORTED\n"); if (ctrl_info->soft_reset_handshake_supported) pqi_clear_soft_reset_status(ctrl_info); pqi_ofa_free_host_buffer(ctrl_info); @@ -3719,7 +3684,9 @@ static void pqi_heartbeat_timer_handler(struct timer_list *t) { int num_interrupts; u32 heartbeat_count; - struct pqi_ctrl_info *ctrl_info = from_timer(ctrl_info, t, heartbeat_timer); + struct pqi_ctrl_info *ctrl_info; + + ctrl_info = from_timer(ctrl_info, t, heartbeat_timer); pqi_check_ctrl_health(ctrl_info); if (pqi_ctrl_offline(ctrl_info)) @@ -3967,10 +3934,11 @@ static irqreturn_t pqi_irq_handler(int irq, void *data) static int pqi_request_irqs(struct pqi_ctrl_info *ctrl_info) { - struct pci_dev *pci_dev = ctrl_info->pci_dev; + struct pci_dev *pci_dev; int i; int rc; + pci_dev = ctrl_info->pci_dev; ctrl_info->event_irq = pqi_pci_irq_vector(pci_dev, 0); for (i = 0; i < ctrl_info->num_msix_vectors_enabled; i++) { @@ -4782,8 +4750,7 @@ static int pqi_create_event_queue(struct pqi_ctrl_info *ctrl_info) put_unaligned_le16(event_queue->int_msg_num, &request.data.create_operational_oq.int_msg_num); - rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, - &response); + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, &response); if (rc) return rc; @@ -4862,8 +4829,7 @@ static int pqi_create_queue_group(struct pqi_ctrl_info *ctrl_info, &request.data.create_operational_iq.element_length); request.data.create_operational_iq.queue_protocol = PQI_PROTOCOL_SOP; - rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, - &response); + rc = pqi_submit_admin_request_synchronous(ctrl_info, &request, &response); if (rc) { dev_err(&ctrl_info->pci_dev->dev, "error creating inbound AIO queue\n"); @@ -5457,15 +5423,19 @@ static void pqi_raid_io_complete(struct pqi_io_request *io_request, pqi_scsi_done(scmd); } -static int pqi_raid_submit_scsi_cmd_with_io_request( - struct pqi_ctrl_info *ctrl_info, struct pqi_io_request *io_request, +static int pqi_raid_submit_io(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, - struct pqi_queue_group *queue_group) + struct pqi_queue_group *queue_group, bool io_high_prio) { int rc; size_t cdb_length; + struct pqi_io_request *io_request; struct pqi_raid_path_request *request; + io_request = pqi_alloc_io_request(ctrl_info, scmd); + if (!io_request) + return 
SCSI_MLQUEUE_HOST_BUSY; + io_request->io_complete_callback = pqi_raid_io_complete; io_request->scmd = scmd; @@ -5475,6 +5445,7 @@ static int pqi_raid_submit_scsi_cmd_with_io_request( request->header.iu_type = PQI_REQUEST_IU_RAID_PATH_IO; put_unaligned_le32(scsi_bufflen(scmd), &request->buffer_length); request->task_attribute = SOP_TASK_ATTRIBUTE_SIMPLE; + request->command_priority = io_high_prio; put_unaligned_le16(io_request->index, &request->request_id); request->error_index = request->request_id; memcpy(request->lun_number, device->scsi3addr, sizeof(request->lun_number)); @@ -5545,14 +5516,11 @@ static inline int pqi_raid_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group) { - struct pqi_io_request *io_request; + bool io_high_prio; - io_request = pqi_alloc_io_request(ctrl_info, scmd); - if (!io_request) - return SCSI_MLQUEUE_HOST_BUSY; + io_high_prio = pqi_is_io_high_priority(device, scmd); - return pqi_raid_submit_scsi_cmd_with_io_request(ctrl_info, io_request, - device, scmd, queue_group); + return pqi_raid_submit_io(ctrl_info, device, scmd, queue_group, io_high_prio); } static bool pqi_raid_bypass_retry_needed(struct pqi_io_request *io_request) @@ -5597,44 +5565,13 @@ static void pqi_aio_io_complete(struct pqi_io_request *io_request, pqi_scsi_done(scmd); } -static inline bool pqi_is_io_high_priority(struct pqi_ctrl_info *ctrl_info, - struct pqi_scsi_dev *device, struct scsi_cmnd *scmd) -{ - bool io_high_prio; - int priority_class; - - io_high_prio = false; - - if (device->ncq_prio_enable) { - priority_class = - IOPRIO_PRIO_CLASS(req_get_ioprio(PQI_SCSI_REQUEST(scmd))); - if (priority_class == IOPRIO_CLASS_RT) { - /* Set NCQ priority for read/write commands. 
*/ - switch (scmd->cmnd[0]) { - case WRITE_16: - case READ_16: - case WRITE_12: - case READ_12: - case WRITE_10: - case READ_10: - case WRITE_6: - case READ_6: - io_high_prio = true; - break; - } - } - } - - return io_high_prio; -} - static inline int pqi_aio_submit_scsi_cmd(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *device, struct scsi_cmnd *scmd, struct pqi_queue_group *queue_group) { bool io_high_prio; - io_high_prio = pqi_is_io_high_priority(ctrl_info, device, scmd); + io_high_prio = pqi_is_io_high_priority(device, scmd); return pqi_aio_submit_io(ctrl_info, scmd, device->aio_handle, scmd->cmnd, scmd->cmd_len, queue_group, NULL, @@ -5652,10 +5589,10 @@ static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, struct pqi_aio_path_request *request; struct pqi_scsi_dev *device; - device = scmd->device->hostdata; io_request = pqi_alloc_io_request(ctrl_info, scmd); if (!io_request) return SCSI_MLQUEUE_HOST_BUSY; + io_request->io_complete_callback = pqi_aio_io_complete; io_request->scmd = scmd; io_request->raid_bypass = raid_bypass; @@ -5670,6 +5607,7 @@ static int pqi_aio_submit_io(struct pqi_ctrl_info *ctrl_info, request->command_priority = io_high_prio; put_unaligned_le16(io_request->index, &request->request_id); request->error_index = request->request_id; + device = scmd->device->hostdata; if (!pqi_is_logical_device(device) && ctrl_info->multi_lun_device_supported) put_unaligned_le64(((scmd->device->lun) << 8), &request->lun_number); if (cdb_length > sizeof(request->cdb)) @@ -5886,7 +5824,7 @@ void pqi_prep_for_scsi_done(struct scsi_cmnd *scmd) shost = scmd->device->host; ctrl_info = shost_to_hba(shost); - atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); + atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); #if !KFEATURE_HAS_HOST_TAGSET_SUPPORT atomic_dec(&ctrl_info->total_scmds_outstanding); #endif @@ -5985,7 +5923,7 @@ int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) return 0; } - atomic_inc(&device->scsi_cmds_outstanding[scmd->device->lun]); + atomic_inc(&device->scsi_cmds_outstanding[scmd->device->lun]); #if !KFEATURE_HAS_HOST_TAGSET_SUPPORT if (atomic_inc_return(&ctrl_info->total_scmds_outstanding) > ctrl_info->scsi_ml_can_queue) { @@ -6022,7 +5960,7 @@ int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) rc = pqi_raid_bypass_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); if (rc == 0 || rc == SCSI_MLQUEUE_HOST_BUSY) { raid_bypassed = true; - atomic_inc(&device->raid_bypass_cnt); + device->raid_bypass_cnt++; } } if (!raid_bypassed) @@ -6036,7 +5974,7 @@ int pqi_scsi_queue_command(struct Scsi_Host *shost, struct scsi_cmnd *scmd) out: if (rc) { - atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); + atomic_dec(&device->scsi_cmds_outstanding[scmd->device->lun]); #if !KFEATURE_HAS_HOST_TAGSET_SUPPORT atomic_dec(&ctrl_info->total_scmds_outstanding); #endif @@ -6250,7 +6188,7 @@ static int pqi_wait_for_lun_reset_completion(struct pqi_ctrl_info *ctrl_info, } wait_secs += PQI_LUN_RESET_POLL_COMPLETION_SECS; - cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun]); + cmds_outstanding = atomic_read(&device->scsi_cmds_outstanding[lun]); dev_warn(&ctrl_info->pci_dev->dev, "scsi %d:%d:%d:%d: waiting %u seconds for LUN reset to complete (%d command(s) outstanding)\n", ctrl_info->scsi_host->host_no, device->bus, device->target, lun, wait_secs, cmds_outstanding); @@ -6888,21 +6826,26 @@ static ssize_t pqi_lockup_action_store(struct device *dev, static ssize_t 
pqi_host_enable_stream_detection_show(struct device *dev, struct device_attribute *attr, char *buffer) { - struct Scsi_Host *shost = class_to_shost(dev); - struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; - return snprintf(buffer, 10, "%hhx\n", - ctrl_info->enable_stream_detection); + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, 10, "%hhx\n", ctrl_info->enable_stream_detection); } static ssize_t pqi_host_enable_stream_detection_store(struct device *dev, struct device_attribute *attr, const char *buffer, size_t count) { - struct Scsi_Host *shost = class_to_shost(dev); - struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - u8 set_stream_detection = 0; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + u8 set_stream_detection; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); - if (sscanf(buffer, "%hhx", &set_stream_detection) != 1) + if (kstrtou8(buffer, 0, &set_stream_detection)) return -EINVAL; ctrl_info->enable_stream_detection = set_stream_detection; @@ -6913,20 +6856,26 @@ static ssize_t pqi_host_enable_stream_detection_store(struct device *dev, static ssize_t pqi_host_enable_r5_writes_show(struct device *dev, struct device_attribute *attr, char *buffer) { - struct Scsi_Host *shost = class_to_shost(dev); - struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; - return snprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r5_writes); + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + return scnprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r5_writes); } static ssize_t pqi_host_enable_r5_writes_store(struct device *dev, struct device_attribute *attr, const char *buffer, size_t count) { - struct Scsi_Host *shost = class_to_shost(dev); - struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - u8 set_r5_writes = 0; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + u8 set_r5_writes; - if (sscanf(buffer, "%hhx", &set_r5_writes) != 1) + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + if (kstrtou8(buffer, 0, &set_r5_writes)) return -EINVAL; ctrl_info->enable_r5_writes = set_r5_writes; @@ -6937,20 +6886,26 @@ static ssize_t pqi_host_enable_r5_writes_store(struct device *dev, static ssize_t pqi_host_enable_r6_writes_show(struct device *dev, struct device_attribute *attr, char *buffer) { - struct Scsi_Host *shost = class_to_shost(dev); - struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); - return snprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r6_writes); + return scnprintf(buffer, 10, "%hhx\n", ctrl_info->enable_r6_writes); } static ssize_t pqi_host_enable_r6_writes_store(struct device *dev, struct device_attribute *attr, const char *buffer, size_t count) { - struct Scsi_Host *shost = class_to_shost(dev); - struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); - u8 set_r6_writes = 0; + struct Scsi_Host *shost; + struct pqi_ctrl_info *ctrl_info; + u8 set_r6_writes; - if (sscanf(buffer, "%hhx", &set_r6_writes) != 1) + shost = class_to_shost(dev); + ctrl_info = shost_to_hba(shost); + + if (kstrtou8(buffer, 0, &set_r6_writes)) return -EINVAL; ctrl_info->enable_r6_writes = set_r6_writes; @@ -6958,20 +6913,20 @@ static ssize_t pqi_host_enable_r6_writes_store(struct device *dev, return count; } -static 
DEVICE_ATTR(driver_version, S_IRUGO, pqi_driver_version_show, NULL); -static DEVICE_ATTR(firmware_version, S_IRUGO, pqi_firmware_version_show, NULL); -static DEVICE_ATTR(model, S_IRUGO, pqi_model_show, NULL); -static DEVICE_ATTR(serial_number, S_IRUGO, pqi_serial_number_show, NULL); -static DEVICE_ATTR(vendor, S_IRUGO, pqi_vendor_show, NULL); -static DEVICE_ATTR(rescan, S_IWUSR, NULL, pqi_host_rescan_store); -static DEVICE_ATTR(lockup_action, S_IWUSR | S_IRUGO, pqi_lockup_action_show, +static DEVICE_ATTR(driver_version, 0444, pqi_driver_version_show, NULL); +static DEVICE_ATTR(firmware_version, 0444, pqi_firmware_version_show, NULL); +static DEVICE_ATTR(model, 0444, pqi_model_show, NULL); +static DEVICE_ATTR(serial_number, 0444, pqi_serial_number_show, NULL); +static DEVICE_ATTR(vendor, 0444, pqi_vendor_show, NULL); +static DEVICE_ATTR(rescan, 0200, NULL, pqi_host_rescan_store); +static DEVICE_ATTR(lockup_action, 0644, pqi_lockup_action_show, pqi_lockup_action_store); -static DEVICE_ATTR(enable_stream_detection, S_IWUSR | S_IRUGO, +static DEVICE_ATTR(enable_stream_detection, 0644, pqi_host_enable_stream_detection_show, pqi_host_enable_stream_detection_store); -static DEVICE_ATTR(enable_r5_writes, S_IWUSR | S_IRUGO, +static DEVICE_ATTR(enable_r5_writes, 0644, pqi_host_enable_r5_writes_show, pqi_host_enable_r5_writes_store); -static DEVICE_ATTR(enable_r6_writes, S_IWUSR | S_IRUGO, +static DEVICE_ATTR(enable_r6_writes, 0644, pqi_host_enable_r6_writes_show, pqi_host_enable_r6_writes_store); static struct PQI_DEVICE_ATTRIBUTE *pqi_shost_attrs[] = { @@ -7061,7 +7016,7 @@ static ssize_t pqi_lunid_show(struct device *dev, #define MAX_PATHS 8 static ssize_t pqi_path_info_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buffer) { struct pqi_ctrl_info *ctrl_info; struct scsi_device *sdev; @@ -7099,7 +7054,7 @@ static ssize_t pqi_path_info_show(struct device *dev, else continue; - output_len += scnprintf(buf + output_len, + output_len += scnprintf(buffer + output_len, PAGE_SIZE - output_len, "[%d:%d:%d:%d] %20.20s ", ctrl_info->scsi_host->host_no, @@ -7118,25 +7073,25 @@ static ssize_t pqi_path_info_show(struct device *dev, if (phys_connector[1] < '0') phys_connector[1] = '0'; - output_len += scnprintf(buf + output_len, + output_len += scnprintf(buffer + output_len, PAGE_SIZE - output_len, "PORT: %.2s ", phys_connector); box = device->box[i]; if (box != 0 && box != 0xFF) - output_len += scnprintf(buf + output_len, + output_len += scnprintf(buffer + output_len, PAGE_SIZE - output_len, "BOX: %hhu ", box); if ((device->devtype == TYPE_DISK || device->devtype == TYPE_ZBC) && pqi_expose_device(device)) - output_len += scnprintf(buf + output_len, + output_len += scnprintf(buffer + output_len, PAGE_SIZE - output_len, "BAY: %hhu ", bay); end_buffer: - output_len += scnprintf(buf + output_len, + output_len += scnprintf(buffer + output_len, PAGE_SIZE - output_len, "%s\n", active); } @@ -7247,7 +7202,7 @@ static ssize_t pqi_raid_bypass_cnt_show(struct device *dev, struct scsi_device *sdev; struct pqi_scsi_dev *device; unsigned long flags; - int raid_bypass_cnt; + unsigned int raid_bypass_cnt; sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); @@ -7263,7 +7218,7 @@ static ssize_t pqi_raid_bypass_cnt_show(struct device *dev, return -ENODEV; } - raid_bypass_cnt = atomic_read(&device->raid_bypass_cnt); + raid_bypass_cnt = device->raid_bypass_cnt; spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); @@ -7271,13 +7226,13 @@ static 
ssize_t pqi_raid_bypass_cnt_show(struct device *dev, } static ssize_t pqi_sas_ncq_prio_enable_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buffer) { struct pqi_ctrl_info *ctrl_info; struct scsi_device *sdev; struct pqi_scsi_dev *device; unsigned long flags; - int output_len = 0; + bool ncq_prio_enable; sdev = to_scsi_device(dev); ctrl_info = shost_to_hba(sdev->host); @@ -7293,24 +7248,23 @@ static ssize_t pqi_sas_ncq_prio_enable_show(struct device *dev, return -ENODEV; } - output_len = snprintf(buf, PAGE_SIZE, "%d\n", - device->ncq_prio_enable); + ncq_prio_enable = device->ncq_prio_enable; + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); - return output_len; + return scnprintf(buffer, PAGE_SIZE, "%d\n", ncq_prio_enable); } static ssize_t pqi_sas_ncq_prio_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) + struct device_attribute *attr, const char *buffer, size_t count) { struct pqi_ctrl_info *ctrl_info; struct scsi_device *sdev; struct pqi_scsi_dev *device; unsigned long flags; - u8 ncq_prio_enable = 0; + u8 ncq_prio_enable; - if (kstrtou8(buf, 0, &ncq_prio_enable)) + if (kstrtou8(buffer, 0, &ncq_prio_enable)) return -EINVAL; sdev = to_scsi_device(dev); @@ -7319,14 +7273,12 @@ static ssize_t pqi_sas_ncq_prio_enable_store(struct device *dev, spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); device = sdev->hostdata; - if (!device) { spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); return -ENODEV; } - if (!device->ncq_prio_support || - !device->is_physical_device) { + if (!device->ncq_prio_support) { spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); return -EINVAL; } @@ -7335,18 +7287,18 @@ static ssize_t pqi_sas_ncq_prio_enable_store(struct device *dev, spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); - return strlen(buf); + return count; } -static DEVICE_ATTR(lunid, S_IRUGO, pqi_lunid_show, NULL); -static DEVICE_ATTR(unique_id, S_IRUGO, pqi_unique_id_show, NULL); -static DEVICE_ATTR(path_info, S_IRUGO, pqi_path_info_show, NULL); -static DEVICE_ATTR(sas_address, S_IRUGO, pqi_sas_address_show, NULL); -static DEVICE_ATTR(ssd_smart_path_enabled, S_IRUGO, pqi_ssd_smart_path_enabled_show, NULL); -static DEVICE_ATTR(raid_level, S_IRUGO, pqi_raid_level_show, NULL); -static DEVICE_ATTR(raid_bypass_cnt, S_IRUGO, pqi_raid_bypass_cnt_show, NULL); -static DEVICE_ATTR(sas_ncq_prio_enable, S_IWUSR | S_IRUGO, - pqi_sas_ncq_prio_enable_show, pqi_sas_ncq_prio_enable_store); +static DEVICE_ATTR(lunid, 0444, pqi_lunid_show, NULL); +static DEVICE_ATTR(unique_id, 0444, pqi_unique_id_show, NULL); +static DEVICE_ATTR(path_info, 0444, pqi_path_info_show, NULL); +static DEVICE_ATTR(sas_address, 0444, pqi_sas_address_show, NULL); +static DEVICE_ATTR(ssd_smart_path_enabled, 0444, pqi_ssd_smart_path_enabled_show, NULL); +static DEVICE_ATTR(raid_level, 0444, pqi_raid_level_show, NULL); +static DEVICE_ATTR(raid_bypass_cnt, 0444, pqi_raid_bypass_cnt_show, NULL); +static DEVICE_ATTR(sas_ncq_prio_enable, 0644, + pqi_sas_ncq_prio_enable_show, pqi_sas_ncq_prio_enable_store); static struct PQI_DEVICE_ATTRIBUTE *pqi_sdev_attrs[] = { PQI_ATTRIBUTE(&dev_attr_lunid), @@ -7554,7 +7506,7 @@ static int pqi_get_ctrl_product_details(struct pqi_ctrl_info *ctrl_info) sizeof(identify->firmware_version_short)); ctrl_info->firmware_version [sizeof(identify->firmware_version_short)] = '\0'; - snprintf(ctrl_info->firmware_version + + 
scnprintf(ctrl_info->firmware_version + strlen(ctrl_info->firmware_version), sizeof(ctrl_info->firmware_version), "-%u", @@ -7675,8 +7627,8 @@ static int pqi_enable_firmware_features(struct pqi_ctrl_info *ctrl_info, features_requested_iomem_addr + (le16_to_cpu(firmware_features->num_elements) * 2) + sizeof(__le16); - writew(PQI_FIRMWARE_FEATURE_MAXIMUM, - host_max_known_feature_iomem_addr); + writeb(PQI_FIRMWARE_FEATURE_MAXIMUM & 0xFF, host_max_known_feature_iomem_addr); + writeb((PQI_FIRMWARE_FEATURE_MAXIMUM & 0xFF00) >> 8, host_max_known_feature_iomem_addr + 1); } return pqi_config_table_update(ctrl_info, @@ -8516,7 +8468,7 @@ static int pqi_pci_init(struct pqi_ctrl_info *ctrl_info) ctrl_info->iomem_base = ioremap_nocache(pci_resource_start( ctrl_info->pci_dev, 0), - sizeof(struct pqi_ctrl_registers)); + pci_resource_len(ctrl_info->pci_dev, 0)); if (!ctrl_info->iomem_base) { dev_err(&ctrl_info->pci_dev->dev, "failed to map memory for controller registers\n"); @@ -8658,7 +8610,7 @@ static void pqi_remove_ctrl(struct pqi_ctrl_info *ctrl_info) if (ctrl_info->ctrl_removal_state == PQI_CTRL_SURPRISE_REMOVAL) { pqi_fail_all_outstanding_requests(ctrl_info); ctrl_info->pqi_mode_enabled = false; - } + } pqi_unregister_scsi(ctrl_info); if (ctrl_info->pqi_mode_enabled) pqi_revert_to_sis_mode(ctrl_info); @@ -9047,16 +8999,12 @@ static void pqi_dump_request(struct pqi_ctrl_info *ctrl_info, scmd->cmnd[12], scmd->cmnd[13], scmd->cmnd[14], scmd->cmnd[15], scmd, atomic_read(&device->scsi_cmds_outstanding[scmd->device->lun])); } else { - struct pqi_iu_header request_h; - size_t iu_length; - - memcpy(&request_h, io_request->iu, PQI_REQUEST_HEADER_LENGTH); - iu_length = get_unaligned_le16(&request_h.iu_length) + - PQI_REQUEST_HEADER_LENGTH; + struct pqi_iu_header *request; + request = io_request->iu; dev_warn(&ctrl_info->pci_dev->dev, "sync cmd IU type = 0x%02x len = %u\n", - request_h.iu_type, request_h.iu_length); + request->iu_type, get_unaligned_le16(&request->iu_length)); } } @@ -9924,6 +9872,18 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_NT, 0x3161) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0804) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0805) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x0806) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ZTE, 0x5445) @@ -9960,6 +9920,18 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ZTE, 0x544F) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x54DA) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x54DB) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_ZTE, 0x54DC) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_ZTE, 0x0b27) @@ -10012,6 +9984,10 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_VENDOR_ID_LENOVO, 0x0623) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + PCI_VENDOR_ID_IBM, 0x0718) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, 0x1e93, 0x1000) @@ -10024,6 +10000,50 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, 0x1e93, 0x1002) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1e93, 0x1005) + }, + { + 
PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1001) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1002) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1003) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1004) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1005) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1006) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1007) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1008) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1009) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x100A) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_ANY_ID, PCI_ANY_ID) diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c index d4c280e8a3453..da21e39d1e189 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.c @@ -433,6 +433,9 @@ struct pqi_io_request *pqi_get_io_request(struct pqi_ctrl_info *ctrl_info, struc u32 blk_tag = blk_mq_unique_tag(PQI_SCSI_REQUEST(scmd)); i = blk_mq_unique_tag_to_tag(blk_tag); + if (i < 0 || i >= ctrl_info->scsi_ml_can_queue) + return NULL; + io_request = &ctrl_info->io_request_pool[i]; if (atomic_inc_return(&io_request->refcount) > 1) { atomic_dec(&io_request->refcount); diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h index 1533444644b80..016cfeaf48237 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h +++ b/drivers/amazon/scsi/smartpqi/smartpqi_kernel_compat.h @@ -106,7 +106,9 @@ defined(SLES15SP0) || \ defined(SLES15SP1) || \ defined(SLES15SP2) || \ - defined(SLES15SP3) + defined(SLES15SP3) || \ + defined(SLES15SP4) || \ + defined(SLES15SP5) #define SLES15 #endif @@ -188,7 +190,7 @@ #define shost_use_blk_mq(x) 1 #define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 #elif defined(SLES11) #define KFEATURE_HAS_WAIT_FOR_COMPLETION_IO 0 #define KFEATURE_HAS_NO_WRITE_SAME 0 @@ -219,13 +221,13 @@ #define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 #if defined(SLES15SP0) #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V1 1 -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 #elif defined(SLES15SP1) #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 #else #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 1 -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 1 #endif #elif defined(OEULER2003) #define dma_zalloc_coherent dma_alloc_coherent @@ -235,12 +237,12 @@ #define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 #define KFEATURE_HAS_USE_CLUSTERING 0 #define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 -#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 -#define KFEATURE_HAS_MQ_SUPPORT 1 -#define shost_use_blk_mq(x) 1 -#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 +#define KFEATURE_ENABLE_PCI_ALLOC_IRQ_VECTORS 1 +#define KFEATURE_HAS_MQ_SUPPORT 1 +#define shost_use_blk_mq(x) 1 +#define KFEATURE_ENABLE_SCSI_MAP_QUEUES 1 #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V2 1 -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 1 #elif 
defined(UBUNTU1404) || TORTUGA || defined(KCLASS3C) #define KFEATURE_HAS_PCI_ENABLE_MSIX_RANGE 0 #define KFEATURE_HAS_ATOMIC_HOST_BUSY 0 @@ -256,17 +258,20 @@ defined(SLES12SP5) || defined(RHEL8) || defined(KCLASS5A) || \ defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ defined(SLES15SP2) || defined(SLES15SP3) || defined(SLES15SP4) || \ - defined(RHEL9) || defined(CENTOS7ALTARM) || defined(OEULER2203) || \ - defined(KCLASS6) + defined(SLES15SP5) || \ + defined(RHEL9) || defined (CENTOS7ALTARM) || defined(OEULER2203) || \ + defined(KCLASS6) || defined(K10SP2) #define KFEATURE_HAS_KTIME_SECONDS 1 #define KFEATURE_HAS_SCSI_REQUEST 1 #define KFEATURE_HAS_KTIME64 1 #endif #if defined(KCLASS4C) || defined(RHEL8) || defined(SLES15SP1) || \ defined(SLES15SP2) || defined(SLES15SP3) || defined(SLES15SP4) || \ + defined(SLES15SP5) || \ defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ - defined(KCLASS5D) || defined(SLES12SP5) || defined(CENTOS7ALTARM) || \ - defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) + defined(KCLASS5D) || defined(SLES12SP5) || defined (CENTOS7ALTARM) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) || \ + defined(K10SP2) #define KFEATURE_HAS_BSG_JOB_SMP_HANDLER 1 #endif #if defined(RHEL8U3) || defined(RHEL8U4) || defined(RHEL8U5) || \ @@ -279,8 +284,9 @@ #endif #if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ defined(KCLASS5D) || defined(KCLASS4D) || defined(SLES15SP2) || \ - defined(SLES15SP3) || defined(SLES15SP4) || defined(RHEL9) || \ - defined(OEULER2203) || defined(KCLASS6) + defined(SLES15SP3) || defined(SLES15SP4) || defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) || \ + defined(K10SP2) #define dma_zalloc_coherent dma_alloc_coherent #define shost_use_blk_mq(x) 1 #define KFEATURE_HAS_USE_CLUSTERING 0 @@ -288,14 +294,16 @@ #if defined(KCLASS5B) || defined(KCLASS5C) || defined(KCLASS5D) || \ defined(KCLASS4D) || defined(SLES15SP2) || defined(SLES15SP3) || \ - defined(SLES15SP4) || defined(RHEL9) || defined(OEULER2003) || \ - defined(OEULER2203) || defined(KCLASS6) + defined(SLES15SP4) || defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2003) || \ + defined(OEULER2203) || defined(KCLASS6) || defined(K10SP2) #define IOCTL_INT unsigned int #else #define IOCTL_INT int #endif #if defined(KCLASS5C) || defined(KCLASS5D) || defined(SLES15SP4) || \ + defined(SLES15SP5) || \ defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) #define KFEATURE_HAS_HOST_BUSY_FUNCTION 1 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) @@ -305,7 +313,9 @@ #if defined(KCLASS5A) || defined(KCLASS5B) || defined(KCLASS5C) || \ defined(KCLASS5D) || defined(KCLASS4C) || defined(KCLASS4D) || \ defined(RHEL8) || defined(SLES15) || defined(SLES15SP4) || \ - defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) + defined(SLES15SP5) || \ + defined(RHEL9) || defined(OEULER2203) || defined(KCLASS6) || \ + defined(K10SP2) #define KFEATURE_HAS_NCQ_PRIO_SUPPORT 1 #endif @@ -390,10 +400,10 @@ #define KFEATURE_HAS_BLK_MQ_PCI_MAP_QUEUES_V3 0 #endif #if !defined(KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1) -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 0 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V1 0 #endif #if !defined(KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2) -#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 0 +#define KFEATURE_HAS_BLK_MQ_MAP_QUEUES_V2 0 #endif #if !defined(KFEATURE_HAS_NCQ_PRIO_SUPPORT) #define KFEATURE_HAS_NCQ_PRIO_SUPPORT 0 @@ -564,6 +574,10 @@ static inline void pqi_disable_write_same(struct 
scsi_device *sdev) #define PCI_VENDOR_ID_LENOVO 0x1d49 #endif +#if !defined(PCI_VENDOR_ID_IBM) +#define PCI_VENDOR_ID_IBM 0x1014 +#endif + #if !defined(offsetofend) #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) diff --git a/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c b/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c index 17bee4f5ccdd7..54dd32170eb61 100644 --- a/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c +++ b/drivers/amazon/scsi/smartpqi/smartpqi_sas_transport.c @@ -106,20 +106,18 @@ static int pqi_sas_port_add_rphy(struct pqi_sas_port *pqi_sas_port, identify->initiator_port_protocols = SAS_PROTOCOL_ALL; identify->target_port_protocols = SAS_PROTOCOL_STP; - if (pqi_sas_port->device) { - switch (pqi_sas_port->device->device_type) { - case SA_DEVICE_TYPE_SAS: - case SA_DEVICE_TYPE_SES: - case SA_DEVICE_TYPE_NVME: - identify->target_port_protocols = SAS_PROTOCOL_SSP; - break; - case SA_DEVICE_TYPE_EXPANDER_SMP: - identify->target_port_protocols = SAS_PROTOCOL_SMP; - break; - case SA_DEVICE_TYPE_SATA: - default: - break; - } + switch (pqi_sas_port->device->device_type) { + case SA_DEVICE_TYPE_SAS: + case SA_DEVICE_TYPE_SES: + case SA_DEVICE_TYPE_NVME: + identify->target_port_protocols = SAS_PROTOCOL_SSP; + break; + case SA_DEVICE_TYPE_EXPANDER_SMP: + identify->target_port_protocols = SAS_PROTOCOL_SMP; + break; + case SA_DEVICE_TYPE_SATA: + default: + break; } return sas_rphy_add(rphy); @@ -304,10 +302,12 @@ int pqi_add_sas_device(struct pqi_sas_node *pqi_sas_node, rc = pqi_sas_port_add_rphy(pqi_sas_port, rphy); if (rc) - goto free_sas_port; + goto free_sas_rphy; return 0; +free_sas_rphy: + sas_rphy_free(rphy); free_sas_port: pqi_free_sas_port(pqi_sas_port); device->sas_port = NULL; From 3b2c8b0ee55f9bef56b21afb3c1327d4ddae7948 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Nov 2021 18:35:56 -0800 Subject: [PATCH 701/737] selftests/kselftest/runner/run_one(): allow running non-executable files commit 303f8e2d02002dbe331cab7813ee091aead3cd39 upstream. When running a test program, 'run_one()' checks if the program has the execution permission and fails if it doesn't. However, it's easy to mistakenly lose the permissions, as some common tools like 'diff' don't support the permission change well[1]. Compared to that, making mistakes in the test program's path would only rare, as those are explicitly listed in 'TEST_PROGS'. Therefore, it might make more sense to resolve the situation on our own and run the program. For this reason, this commit makes the test program runner function still print the warning message but to try parsing the interpreter of the program and to explicitly run it with the interpreter, in this case. [1] https://lore.kernel.org/mm-commits/YRJisBs9AunccCD4@kroah.com/ Link: https://lkml.kernel.org/r/20210810164534.25902-1-sj38.park@gmail.com Signed-off-by: SeongJae Park Suggested-by: Greg Kroah-Hartman Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/kselftest/runner.sh | 28 +++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index cc9c846585f05..a9ba782d8ca0f 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -33,9 +33,9 @@ tap_timeout() { # Make sure tests will time out if utility is available. 
if [ -x /usr/bin/timeout ] ; then - /usr/bin/timeout --foreground "$kselftest_timeout" "$1" + /usr/bin/timeout --foreground "$kselftest_timeout" $1 else - "$1" + $1 fi } @@ -65,17 +65,25 @@ run_one() TEST_HDR_MSG="selftests: $DIR: $BASENAME_TEST" echo "# $TEST_HDR_MSG" - if [ ! -x "$TEST" ]; then - echo -n "# Warning: file $TEST is " - if [ ! -e "$TEST" ]; then - echo "missing!" - else - echo "not executable, correct this." - fi + if [ ! -e "$TEST" ]; then + echo "# Warning: file $TEST is missing!" echo "not ok $test_num $TEST_HDR_MSG" else + cmd="./$BASENAME_TEST" + if [ ! -x "$TEST" ]; then + echo "# Warning: file $TEST is not executable" + + if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ] + then + interpreter=$(head -n 1 "$TEST" | cut -c 3-) + cmd="$interpreter ./$BASENAME_TEST" + else + echo "not ok $test_num $TEST_HDR_MSG" + return + fi + fi cd `dirname $TEST` > /dev/null - ((((( tap_timeout ./$BASENAME_TEST 2>&1; echo $? >&3) | + ((((( tap_timeout "$cmd" 2>&1; echo $? >&3) | tap_prefix >&4) 3>&1) | (read xs; exit $xs)) 4>>"$logfile" && echo "ok $test_num $TEST_HDR_MSG") || From e93d9aba16b6f85a0d0cb6f5d3bb0cf76245e997 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Wed, 17 May 2023 18:19:53 +0000 Subject: [PATCH 702/737] Al2 5.10 Update ena driver to 2.8.6g Signed-off-by: Arthur Kiyanovski --- drivers/amazon/net/ena/ena_netdev.c | 50 +++++----- drivers/amazon/net/ena/ena_netdev.h | 2 +- drivers/amazon/net/ena/kcompat.h | 136 ++++++++++++++++++++++------ 3 files changed, 136 insertions(+), 52 deletions(-) diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 5308f35e29f5a..072be72e14d56 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -693,16 +693,22 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring, return 0; } -static void ena_unmap_rx_buff(struct ena_ring *rx_ring, - struct ena_rx_buffer *rx_info) +static void ena_unmap_rx_buff_attrs(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info, + unsigned long attrs) { /* LPC pages are unmapped at cache destruction */ if (rx_info->is_lpc_page) return; - dma_unmap_page(rx_ring->dev, rx_info->dma_addr, - ENA_PAGE_SIZE, - DMA_BIDIRECTIONAL); + ena_dma_unmap_page_attrs(rx_ring->dev, rx_info->dma_addr, ENA_PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); +} + +static void ena_unmap_rx_buff(struct ena_ring *rx_ring, + struct ena_rx_buffer *rx_info) +{ + ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0); } static void ena_free_rx_page(struct ena_ring *rx_ring, @@ -880,7 +886,7 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) ena_unmap_tx_buff(tx_ring, tx_info); - napi_consume_skb(tx_info->skb, 0); + dev_kfree_skb_any(tx_info->skb); } netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, tx_ring->qid)); @@ -1012,7 +1018,7 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget) skb); tx_bytes += tx_info->total_tx_size; - napi_consume_skb(skb, budget); + dev_kfree_skb(skb); tx_pkts++; total_done += tx_info->tx_descs; @@ -1063,7 +1069,7 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, if (!first_frag) skb = napi_alloc_skb(rx_ring->napi, len); else - skb = ena_build_skb(first_frag, len); + skb = build_skb(first_frag, len); #else if (!first_frag) skb = napi_alloc_skb(rx_ring->napi, len); @@ -1185,13 +1191,14 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + 
dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); + if (!reuse_rx_buf_page) - ena_unmap_rx_buff(rx_ring, rx_info); - else - dma_sync_single_for_cpu(rx_ring->dev, - pre_reuse_paddr + pkt_offset, - len, - DMA_FROM_DEVICE); + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + skb = ena_alloc_skb(rx_ring, buf_addr, buf_len); if (unlikely(!skb)) @@ -1249,13 +1256,14 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, reuse_rx_buf_page = !is_xdp_loaded && ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset); + dma_sync_single_for_cpu(rx_ring->dev, + pre_reuse_paddr + pkt_offset, + len, + DMA_FROM_DEVICE); + if (!reuse_rx_buf_page) - ena_unmap_rx_buff(rx_ring, rx_info); - else - dma_sync_single_for_cpu(rx_ring->dev, - pre_reuse_paddr + pkt_offset, - len, - DMA_FROM_DEVICE); + ena_unmap_rx_buff_attrs(rx_ring, rx_info, ENA_DMA_ATTR_SKIP_CPU_SYNC); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, page_offset + buf_offset, len, buf_len); @@ -3038,7 +3046,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = NULL; error_drop_packet: - napi_consume_skb(skb, 0); + dev_kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 5098ac28966c5..92e03d79971f7 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 3 +#define DRV_MODULE_GEN_SUBMINOR 6 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index fd44a3ebe0414..1b3e7edf570b0 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -107,6 +107,33 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) #define ENA_BUSY_POLL_SUPPORT #endif + +/* Distribution kernel version comparison macros. + * Distribution kernel versioning format may be A.B.C-D.E.F and standard + * KERNEL_VERSION macro covers only the first 3 subversions. + * Using 20bit per subversion, as in some cases, subversion D may be a large + * number (6 digits). 
+ */ +#define ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) ((SV1 << 40) | (SV2 << 20) | (SV3)) +#define ENA_KERNEL_VERSION_MAJOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) +#define ENA_KERNEL_VERSION_MINOR(SV1, SV2, SV3) ENA_KERNEL_VERSION_16BIT(SV1, SV2, SV3) + +#define ENA_KERNEL_VERSION_GTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) > \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) >= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + +#define ENA_KERNEL_VERSION_LTE(SV1, SV2, SV3, SV4, SV5, SV6) \ + ((ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) < \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3))) || \ + (ENA_KERNEL_VERSION_MAJOR(ENA_KERNEL_SUBVERSION_1, ENA_KERNEL_SUBVERSION_2, ENA_KERNEL_SUBVERSION_3) == \ + ENA_KERNEL_VERSION_MAJOR((SV1), (SV2), (SV3)) && \ + ENA_KERNEL_VERSION_MINOR(ENA_KERNEL_SUBVERSION_4, ENA_KERNEL_SUBVERSION_5, ENA_KERNEL_SUBVERSION_6) <= \ + ENA_KERNEL_VERSION_MINOR((SV4), (SV5), (SV6)))) + /******************************************************************************/ /************************** Ubuntu macros *************************************/ /******************************************************************************/ @@ -177,7 +204,6 @@ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 #define SUSE_VERSION 0 #endif /* SUSE_VERSION */ - /******************************************************************************/ /**************************** RHEL macros *************************************/ /******************************************************************************/ @@ -822,17 +848,32 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) #if ((LINUX_VERSION_CODE < KERNEL_VERSION(4, 6 ,0)) && \ !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) -/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated page_ref_count - * function from kernel 4.6. To make things more difficult, Ubuntu didn't add - * these changes to its 4.4.* kernels +/* Linux versions 4.4.216 - 4.5 (non inclusive) back propagated + * page_ref_count() from kernel 4.6. + * Ubuntu didn't add these changes to its 4.4.* kernels. 
+ * UEK added this function in kernel 4.1.12-124.43.1 + * Here is a figure that shows all of the cases: + * Legend: + * -------- page_ref_count() is present in the kernel + * ******** page_ref_count() is missing in the kernel + * + * Distro\Kernel 4.1.12-124.43.1 4.4.216 4.5 4.6 + * | | | | + * Upstrem kernel ***********|**************|--------|******| + * | | | | + * Ubuntu ***********|**************|********|******| + * | | | | + * UEK ***********|--------------|--------|------| */ -#if !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) ||\ - defined(UBUNTU_VERSION_CODE) +#if (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) || \ + (defined(ubuntu)) || \ + (!defined(IS_UEK) && !defined(ubuntu) && \ + !(KERNEL_VERSION(4, 4, 216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) static inline int page_ref_count(struct page *page) { return atomic_read(&page->_count); } -#endif /* !(KERNEL_VERSION(4, 4 ,216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5 ,0)) */ +#endif /* (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) ... */ static inline void page_ref_inc(struct page *page) { @@ -881,32 +922,16 @@ static inline int numa_mem_id(void) #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) #define ENA_LINEAR_FRAG_SUPPORTED -static __always_inline struct sk_buff* -ena_build_skb(void *data, unsigned int frag_size) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0) - return napi_build_skb(data, frag_size); -#else - return build_skb(data, frag_size); -#endif -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0) && \ - !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 3)) && \ - !(defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4, 2, 0, 42)) -static __always_inline -void napi_consume_skb(struct sk_buff *skb, int budget) -{ - dev_kfree_skb_any(skb); -} #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) #define ENA_NETDEV_LOGS_WITHOUT_RV #endif -#if defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) +#if defined(ENA_XDP_SUPPORT) && \ + (LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 49))) static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { @@ -928,7 +953,7 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, xdp->data_meta = meta_valid ? data : data + 1; } -#endif /* defined(ENA_XDP_SUPPORT) && LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) */ +#endif /* defined(ENA_XDP_SUPPORT) && (LINUX_VERSION_CODE <= KERNEL_VERSION(5, 12, 0) && !SUSE_VERSION(...)) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) #define ethtool_sprintf(data, fmt, args...) 
\ @@ -944,7 +969,9 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ - !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ + !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ + ENA_KERNEL_VERSION_GTE(5, 3, 18, 150300, 59, 43)) static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { memcpy(dev->dev_addr, addr, ETH_ALEN); @@ -1065,7 +1092,8 @@ static inline void ena_netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) #ifndef NAPI_POLL_WEIGHT #define NAPI_POLL_WEIGHT 64 #endif @@ -1075,4 +1103,52 @@ static inline void ena_netif_napi_add(struct net_device *dev, #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ } +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 105, 0, 0)) +static inline void dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, false); +} +#endif /* RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4)) */ + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 0)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC (1 << DMA_ATTR_SKIP_CPU_SYNC) +#elif (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(6, 10))) +#define ENA_DMA_ATTR_SKIP_CPU_SYNC 0 +#else +#define ENA_DMA_ATTR_SKIP_CPU_SYNC DMA_ATTR_SKIP_CPU_SYNC +#endif + +static inline void ena_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 9)) && \ + (LINUX_VERSION_CODE != KERNEL_VERSION(4, 14, 0))) || \ + (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ + (defined(IS_UEK) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 13)) + struct dma_attrs dma_attrs; + + init_dma_attrs(&dma_attrs); + dma_attrs.flags[0] = attrs; + dma_unmap_page_attrs(dev, addr, size, dir, &dma_attrs); +#else + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +#endif +} + #endif /* _KCOMPAT_H_ */ From a457eb924f77e70be0fc6d44a3f32f1bf229ff2e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Apr 2021 11:10:37 -0700 Subject: [PATCH 703/737] tcp: reorder tcp_congestion_ops for better cache locality Group all the often used fields in the first cache line, to reduce cache line misses. 
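As a rough standalone illustration of the layout goal (the struct below is a simplified stand-in with made-up field names, not the kernel's tcp_congestion_ops), the hot callbacks are declared first and a static assert documents that they must stay within the first 64-byte cache line:

    #include <stddef.h>

    struct cc_ops_sketch {
            /* fast path: consulted on (almost) every incoming ACK */
            unsigned int (*ssthresh)(void *sk);
            void (*cong_avoid)(void *sk, unsigned int ack, unsigned int acked);
            void (*cong_control)(void *sk, const void *rate_sample);
            /* slow path: registration and diagnostics only */
            char name[16];
            void *owner;
    };

    /* fail the build if a fast-path member drifts past the first cache line */
    _Static_assert(offsetof(struct cc_ops_sketch, name) <= 64,
                   "fast-path members spill past the first cache line");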
Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller (cherry picked from commit 82506665179209e43d3c9d39ffa42f8c8ff968bd) Signed-off-by: Samuel Mendoza-Jonas --- include/net/tcp.h | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index dcca41f3a2240..c8acb88e2f9f2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1052,44 +1052,56 @@ struct rate_sample { }; struct tcp_congestion_ops { - struct list_head list; - u32 key; - u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); +/* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); + /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); + /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); + /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); - /* new value of cwnd after loss (required) */ - u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + + + /* new value of cwnd after loss (required) */ + u32 (*undo_cwnd)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); + +/* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); - char name[TCP_CA_NAME_MAX]; - struct module *owner; -}; + char name[TCP_CA_NAME_MAX]; + struct module *owner; + struct list_head list; + u32 key; + u32 flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); +} ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); From 9d2771ccf0dc3c0ef80ad162f0fff5b0c460c69f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 11 Jun 2019 12:26:55 -0400 Subject: [PATCH 704/737] net-tcp_bbr: broaden app-limited rate sample detection This commit is a bug fix for the Linux TCP app-limited (application-limited) logic that is used for collecting rate (bandwidth) samples. Previously the app-limited logic only looked for "bubbles" of silence in between application writes, by checking at the start of each sendmsg. But "bubbles" of silence can also happen before retransmits: e.g. bubbles can happen between an application write and a retransmit, or between two retransmits. Retransmits are triggered by ACKs or timers. So this commit checks for bubbles of app-limited silence upon ACKs or timers. 
Why does this commit check for app-limited state at the start of ACKs and timer handling? Because at that point we know whether inflight was fully using the cwnd. During processing the ACK or timer event we often change the cwnd; after changing the cwnd we can't know whether inflight was fully using the old cwnd. Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc Change-Id: I37221506f5166877c2b110753d39bb0757985e68 --- net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_timer.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6dfbb88dcf5b..8a6f176f75b7b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3787,6 +3787,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); + tcp_rate_check_app_limited(sk); /* ts_recent update must be made after we are sure that the packet * is in window. diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 715fdfa3e2ae9..35b7792669c40 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -611,6 +611,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + tcp_rate_check_app_limited(sk); tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; From f2dcb50e7ca4234f3e6dd61816148fbef7e97055 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 27 Mar 2018 18:33:29 -0700 Subject: [PATCH 705/737] net-tcp_rate: consolidate inflight tracking approaches in TCP In order to track CE marks per rate sample (one round trip), we'll need to snap the starting tcp delivered_ce acount in the packet meta header (tcp_skb_cb). But there's not enough space. Good news is that the "last_in_flight" in the header, used by NV congestion control, is almost equivalent as "delivered". In fact "delivered" is better by accounting out-of-order packets additionally. Therefore we can remove it to make room for the CE tracking. This would make delayed ACK detection slightly less accurate but the impact is negligible since it's not used for any critical control. Effort: net-tcp_rate Origin-9xx-SHA1: ddcd46ec85d5f1c4454258af0c54b3254c0d64a7 Change-Id: I1a184aad6d101c981ac7f2f275aa9417ff856910 --- include/net/tcp.h | 5 ++--- net/ipv4/tcp_input.c | 11 +++++------ net/ipv4/tcp_output.c | 2 -- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index c8acb88e2f9f2..6440dde8bde5d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -858,9 +858,8 @@ struct tcp_skb_cb { union { struct { /* There is space for up to 24 bytes */ - __u32 in_flight:30,/* Bytes in flight at transmit */ - is_app_limited:1, /* cwnd not fully used? */ - unused:1; + __u32 is_app_limited:1, /* cwnd not fully used? 
*/ + unused:31; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a6f176f75b7b..aa84e7c2b6241 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3219,7 +3219,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; - u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3255,7 +3254,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!first_ackt) first_ackt = last_ackt; - last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) @@ -3321,8 +3319,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - if (pkts_acked == 1 && last_in_flight < tp->mss_cache && - last_in_flight && !prior_sacked && fully_acked && + if (pkts_acked == 1 && fully_acked && !prior_sacked && + (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically @@ -3379,9 +3377,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = sack->rate->rtt_us, - .in_flight = last_in_flight }; + .rtt_us = sack->rate->rtt_us }; + sample.in_flight = tp->mss_cache * + (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 86e896351364e..4374fcf9d0f79 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1255,8 +1255,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { - TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { From 36d9aadf1624a00b3de3d1cc39cf421033fed0fa Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 27 Mar 2018 18:01:46 -0700 Subject: [PATCH 706/737] net-tcp_rate: account for CE marks in rate sample This patch counts number of packets delivered have CE mark in the rate sample, using similar approach of delivery accounting. Effort: net-tcp_rate Origin-9xx-SHA1: 710644db434c3da335a7c8b72207a671ccbb5cf8 Change-Id: I0968fb33fe19b5c774e8c3afd2685558a6ec8710 [5.10: Stable contains backport of b253a0680] --- include/net/tcp.h | 6 +++++- net/ipv4/tcp_rate.c | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6440dde8bde5d..522382eb2ca9e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -857,9 +857,11 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { +#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ __u32 is_app_limited:1, /* cwnd not fully used? 
*/ - unused:31; + delivered_ce:20, + unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -1036,7 +1038,9 @@ struct ack_sample { struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 6ab197928abbc..9a8e014d9b5b9 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -65,6 +65,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } @@ -90,6 +91,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, scb->end_seq, rs->last_end_seq)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; @@ -143,6 +145,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, } rs->delivered = tp->delivered - rs->prior_delivered; + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + /* Model sending data and receiving ACKs as separate pipeline phases * for a window. Usually the ACK phase is longer, but with ACK * compression the send phase can be longer. To be safe we use the From e90d034b47ab7d2aa140c4a200494534ada067a1 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 24 Jun 2018 21:55:59 -0400 Subject: [PATCH 707/737] net-tcp_bbr: v2: shrink delivered_mstamp, first_tx_mstamp to u32 to free up 8 bytes Free up some space for tracking inflight and losses for each bw sample, in upcoming commits. These timestamps are in microseconds, and are now stored in 32 bits. So they can only hold time intervals up to roughly 2^12 = 4096 seconds. But Linux TCP RTT and RTO tracking has the same 32-bit microsecond implementation approach and resulting deployment limitations. So this is not introducing a new limit. And these should not be a limitation for the foreseeable future. 
Effort: net-tcp_bbr Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55 Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c --- include/net/tcp.h | 9 +++++++-- net/ipv4/tcp_rate.c | 7 ++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 522382eb2ca9e..3e1fc41614ec8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -790,6 +790,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } +static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) +{ + return max_t(s32, t1 - t0, 0); +} + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); @@ -865,9 +870,9 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - u64 first_tx_mstamp; + u32 first_tx_mstamp; /* when we reached the "delivered" count */ - u64 delivered_mstamp; + u32 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 9a8e014d9b5b9..1a7140a36e76f 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp32_us_delta( + tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); } /* Mark off the skb delivered once it's sacked to avoid being @@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); From 79215caf5b5328d8723bcf86d307e49c29a76c23 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 5 Aug 2017 11:49:50 -0400 Subject: [PATCH 708/737] net-tcp_bbr: v2: snapshot packets in flight at transmit time and pass in rate_sample For understanding the relationship between inflight and losses or ECN signals, to try to find the highest inflight value that has acceptable levels of loss/ECN marking. 
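A standalone sketch of the bookkeeping described above (function and constant names here are illustrative, not the kernel's): snapshot the flight size when the skb is sent and saturate it to fit a 20-bit field, mirroring the TCPCB_IN_FLIGHT_MAX clamp in the diff below:

    #include <stdint.h>
    #include <stdio.h>

    #define IN_FLIGHT_BITS 20
    #define IN_FLIGHT_MAX  ((1u << IN_FLIGHT_BITS) - 1)   /* 1048575 packets */

    static uint32_t record_in_flight(uint32_t packets_in_flight, uint32_t skb_pcount)
    {
            /* flight size after this skb is sent, clamped to the bitfield */
            uint32_t in_flight = packets_in_flight + skb_pcount;

            return in_flight > IN_FLIGHT_MAX ? IN_FLIGHT_MAX : in_flight;
    }

    int main(void)
    {
            printf("%u %u\n", record_in_flight(1000, 45),      /* 1045 */
                   record_in_flight(2000000, 1));              /* clamped to 1048575 */
            return 0;
    }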
Effort: net-tcp_bbr Origin-9xx-SHA1: b3eb4f2d20efab4ca001f32c9294739036c493ea Change-Id: I7314047d0ff14dd261a04b1969a46dc658c8836a --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_rate.c | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3e1fc41614ec8..1011390889f8b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -873,6 +873,10 @@ struct tcp_skb_cb { u32 first_tx_mstamp; /* when we reached the "delivered" count */ u32 delivered_mstamp; +#define TCPCB_IN_FLIGHT_BITS 20 +#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1044,6 +1048,7 @@ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1a7140a36e76f..42b3b866ae696 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -40,6 +40,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + u32 in_flight; /* In general we need to start delivery rate samples from the * time we received the most recent ACK, to ensure we include @@ -67,6 +68,18 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered = tp->delivered; TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; + + /* Check, sanitize, and record packets in flight after skb was sent. */ + in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); + WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, + "insane in_flight %u cc %s mss %u " + "cwnd %u pif %u %u %u %u\n", + in_flight, inet_csk(sk)->icsk_ca_ops->name, + tp->mss_cache, tp->snd_cwnd, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out); + in_flight = min(in_flight, TCPCB_IN_FLIGHT_MAX); + TCP_SKB_CB(skb)->tx.in_flight = in_flight; } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -97,6 +110,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; rs->last_end_seq = scb->end_seq; + rs->tx_in_flight = scb->tx.in_flight; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; From 709ce1dd4d294060e1107f3c83d4a737e266cb7a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 12 Oct 2017 23:44:27 -0400 Subject: [PATCH 709/737] net-tcp_bbr: v2: count packets lost over TCP rate sampling interval For understanding the relationship between inflight and packet loss signals, to try to find the highest inflight value that has acceptable levels of packet losses. 
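The measurement itself is a delta of monotonically increasing counters taken at the start and end of the sampling interval. A minimal sketch of that pattern (types and names are illustrative only), in the spirit of rs->lost = tp->lost - rs->prior_lost in the diff below:

    #include <stdint.h>
    #include <stdio.h>

    struct sample_start {
            uint32_t prior_delivered;   /* delivered count when the skb was sent */
            uint32_t prior_lost;        /* lost count when the skb was sent */
    };

    static void finish_sample(const struct sample_start *s,
                              uint32_t delivered_now, uint32_t lost_now,
                              int32_t *delivered, int32_t *lost)
    {
            *delivered = (int32_t)(delivered_now - s->prior_delivered);
            *lost = (int32_t)(lost_now - s->prior_lost);
    }

    int main(void)
    {
            struct sample_start s = { .prior_delivered = 990, .prior_lost = 7 };
            int32_t d, l;

            finish_sample(&s, 1000, 9, &d, &l);
            printf("delivered %d lost %d over this interval\n", d, l);  /* 10 and 2 */
            return 0;
    }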
Effort: net-tcp_bbr Origin-9xx-SHA1: 4527e26b2bd7756a88b5b9ef1ada3da33dd609ab Change-Id: I594c2500868d9c530770e7ddd68ffc87c57f4fd5 --- include/net/tcp.h | 5 ++++- net/ipv4/tcp_rate.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1011390889f8b..9415683478750 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -877,6 +877,7 @@ struct tcp_skb_cb { #define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) u32 in_flight:20, /* packets in flight at transmit */ unused2:12; + u32 lost; /* packets lost so far upon tx of skb */ } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1046,11 +1047,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_lost; /* tp->lost at "prior_mstamp" */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ - s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + s32 delivered_ce; /* packets delivered w/ CE mark over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 42b3b866ae696..d54a1a972cf62 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -67,6 +67,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.lost = tp->lost; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; /* Check, sanitize, and record packets in flight after skb was sent. */ @@ -104,6 +105,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, scb->end_seq, rs->last_end_seq)) { + rs->prior_lost = scb->tx.lost; rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; @@ -159,6 +161,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, return; } rs->delivered = tp->delivered - rs->prior_delivered; + rs->lost = tp->lost - rs->prior_lost; rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; /* delivered_ce occupies less than 32 bits in the skb control block */ From 37b6f97d0afa6c4318081b169890eea101cae83e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Nov 2018 13:48:36 -0500 Subject: [PATCH 710/737] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece For understanding the relationship between inflight and ECN signals, to try to find the highest inflight value that has acceptable levels ECN marking. Effort: net-tcp_bbr Origin-9xx-SHA1: 3eba998f2898541406c2666781182200934965a8 Change-Id: I3a964e04cee83e11649a54507043d2dfe769a3b3 --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 9415683478750..a9da7bd96ab58 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1065,6 +1065,7 @@ struct rate_sample { bool is_app_limited; /* is sample from packet with bubble in pipe? 
*/ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aa84e7c2b6241..94ee79adc1bd0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3885,6 +3885,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); + rs.is_ece = !!(flag & FLAG_ECE); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); From 4f56eaf050dbe409821d9fc926a4818a5e79a311 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 7 Aug 2018 21:52:06 -0400 Subject: [PATCH 711/737] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC module callback API For connections experiencing reordering, RACK can mark packets lost long after we receive the SACKs/ACKs hinting that the packets were actually lost. This means that CC modules cannot easily learn the volume of inflight data at which packet loss happens by looking at the current inflight or even the packets in flight when the most recently SACKed packet was sent. To learn this, CC modules need to know how many packets were in flight at the time lost packets were sent. This new callback, combined with TCP_SKB_CB(skb)->tx.in_flight, allows them to learn this. This also provides a consistent callback that is invoked whether packets are marked lost upon ACK processing, using the RACK reordering timer, or at RTO time. Effort: net-tcp_bbr Origin-9xx-SHA1: afcbebe3374e4632ac6714d39e4dc8a8455956f4 Change-Id: I54826ab53df636be537e5d3c618a46145d12d51a --- include/net/tcp.h | 3 +++ net/ipv4/tcp_input.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index a9da7bd96ab58..abc7aee08662b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1092,6 +1092,9 @@ struct tcp_congestion_ops { /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 94ee79adc1bd0..fb7f23d641bb3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1062,7 +1062,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { + struct sock *sk = (struct sock *)tp; + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tp->lost += tcp_skb_pcount(skb); + if (ca_ops->skb_marked_lost) + ca_ops->skb_marked_lost(sk, skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) From 60c3092be1bd1d2c7cebe4e9bb87579812e7c14d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 7 May 2019 22:36:36 -0400 Subject: [PATCH 712/737] net-tcp_bbr: v2: factor out tx.in_flight setting into tcp_set_tx_in_flight() Factor out the code to set an skb's tx.in_flight field into its own function, so that this code can be used for the TCP_REPAIR "fake send" code path that inserts skbs into the rtx queue without sending them. 
This is in preparation for the following patch, which fixes an issue with TCP_REPAIR and tx.in_flight. Tested: See last patch in series for sponge link. Effort: net-tcp_bbr Origin-9xx-SHA1: e880fc907d06ea7354333f60f712748ebce9497b Change-Id: I4fbd4a6e18a51ab06d50ab1c9ad820ce5bea89af --- include/net/tcp.h | 1 + net/ipv4/tcp_rate.c | 32 +++++++++++++++++++------------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index abc7aee08662b..124f2325100bd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1181,6 +1181,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) } /* From tcp_rate.c */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index d54a1a972cf62..de9d4cc29722d 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,13 +34,30 @@ * ready to send in the write queue. */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 in_flight; + + /* Check, sanitize, and record packets in flight after skb was sent. */ + in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); + if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, + "insane in_flight %u cc %s mss %u " + "cwnd %u pif %u %u %u %u\n", + in_flight, inet_csk(sk)->icsk_ca_ops->name, + tp->mss_cache, tp->snd_cwnd, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out)) + in_flight = TCPCB_IN_FLIGHT_MAX; + TCP_SKB_CB(skb)->tx.in_flight = in_flight; +} + /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - u32 in_flight; /* In general we need to start delivery rate samples from the * time we received the most recent ACK, to ensure we include @@ -69,18 +86,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.lost = tp->lost; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; - - /* Check, sanitize, and record packets in flight after skb was sent. */ - in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); - WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, - "insane in_flight %u cc %s mss %u " - "cwnd %u pif %u %u %u %u\n", - in_flight, inet_csk(sk)->icsk_ca_ops->name, - tp->mss_cache, tp->snd_cwnd, - tp->packets_out, tp->retrans_out, - tp->sacked_out, tp->lost_out); - in_flight = min(in_flight, TCPCB_IN_FLIGHT_MAX); - TCP_SKB_CB(skb)->tx.in_flight = in_flight; + tcp_set_tx_in_flight(sk, skb); } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) From 5253380ff817120b82344cb7dabdee5c92e3c4a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 1 May 2019 20:16:33 -0400 Subject: [PATCH 713/737] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in tcp_shifted_skb() When tcp_shifted_skb() updates state as adjacent SACKed skbs are coalesced, previously the tx.in_flight was not adjusted, so we could get contradictory state where the skb's recorded pcount was bigger than the tx.in_flight (the number of segments that were in_flight after sending the skb). Normally have a SACKed skb with contradictory pcount/tx.in_flight would not matter. 
However, with SACK reneging, the SACKed bit is removed, and an skb once again becomes eligible for retransmitting, fragmenting, SACKing, etc. Packetdrill testing verified the following sequence is possible in a kernel that does not have this commit: - skb N is SACKed - skb N+1 is SACKed and combined with skb N using tcp_shifted_skb() - tcp_shifted_skb() will increase the pcount of prev, but leave tx.in_flight as-is - so prev skb can have pcount > tx.in_flight - RTO, tcp_timeout_mark_lost(), detect reneg, remove "SACKed" bit, mark skb N as lost - find pcount of skb N is greater than its tx.in_flight I suspect this issue is what caused the bbr2_inflight_hi_from_lost_skb(): WARN_ON_ONCE(inflight_prev < 0) to fire in production machines using bbr2. Tested: See last commit in series for sponge link. Effort: net-tcp_bbr Origin-9xx-SHA1: 1a3e997e613d2dcf32b947992882854ebe873715 Change-Id: I1b0b75c27519953430c7db51c6f358f104c7af55 --- net/ipv4/tcp_input.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb7f23d641bb3..5aca2a5dadc65 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1448,6 +1448,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); + /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ + if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, + "prev in_flight: %u skb in_flight: %u pcount: %u", + TCP_SKB_CB(prev)->tx.in_flight, + TCP_SKB_CB(skb)->tx.in_flight, + pcount)) + TCP_SKB_CB(skb)->tx.in_flight = 0; + else + TCP_SKB_CB(skb)->tx.in_flight -= pcount; + TCP_SKB_CB(prev)->tx.in_flight += pcount; + /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep From 6e38f2869578ae9bdd002b5672441337c68c73b7 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 1 May 2019 20:16:25 -0400 Subject: [PATCH 714/737] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in tcp_fragment() When we fragment an skb that has already been sent, we need to update the tx.in_flight for the first skb in the resulting pair ("buff"). Because we were not updating the tx.in_flight, the tx.in_flight value was inconsistent with the pcount of the "buff" skb (tx.in_flight would be too high). That meant that if the "buff" skb was lost, then bbr2_inflight_hi_from_lost_skb() would calculate an inflight_hi value that is too high. This could result in longer queues and higher packet loss. Packetdrill testing verified that without this commit, when the second half of an skb is SACKed and then later the first half of that skb is marked lost, the calculated inflight_hi was incorrect.
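As a rough standalone illustration (not part of this patch, and using made-up example numbers), the bookkeeping added to tcp_fragment() in the hunk below amounts to the following arithmetic: recover the in-flight level from just before the original skb was sent, then charge "buff" as if it were transmitted on its own.

#include <stdio.h>

/* Hypothetical example values: a 10-segment skb was sent when in_flight
 * reached 53 segments, and the split leaves "buff" with 4 of those segments.
 */
int main(void)
{
	int tx_in_flight = 53;	/* tx.in_flight recorded when the original skb was sent */
	int old_factor = 10;	/* pcount of the original skb at that time */
	int buff_pcount = 4;	/* pcount of "buff" after the split */
	int inflight_prev = tx_in_flight - old_factor;	/* 43: in flight before the send */

	if (inflight_prev < 0)	/* mirrors the WARN_ONCE() guard in the hunk below */
		inflight_prev = 0;
	printf("buff tx.in_flight = %d\n", inflight_prev + buff_pcount);	/* 47 */
	return 0;
}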
Effort: net-tcp_bbr Origin-9xx-SHA1: 385f1ddc610798fab2837f9f372857438b25f874 Change-Id: I617f8cab4e9be7a0b8e8d30b047bf8645393354d --- net/ipv4/tcp_output.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4374fcf9d0f79..143055564c31d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1531,7 +1531,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize, old_factor; + int nsize, old_factor, inflight_prev; long limit; int nlen; u8 flags; @@ -1609,6 +1609,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + + /* Set buff tx.in_flight as if buff were sent by itself. */ + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; + if (WARN_ONCE(inflight_prev < 0, + "inconsistent: tx.in_flight: %u old_factor: %d", + TCP_SKB_CB(skb)->tx.in_flight, old_factor)) + inflight_prev = 0; + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. */ From 74d5250ceb9b0a7435c1a9131c3aa2e16af6683e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 7 May 2019 22:37:19 -0400 Subject: [PATCH 715/737] net-tcp_bbr: v2: set tx.in_flight for skbs in repair write queue Syzkaller was able to use TCP_REPAIR to reproduce the new warning added in tcp_fragment(): WARNING: CPU: 0 PID: 118174 at net/ipv4/tcp_output.c:1487 tcp_fragment+0xdcc/0x10a0 net/ipv4/tcp_output.c:1487() inconsistent: tx.in_flight: 0 old_factor: 53 The warning happens because skbs inserted into the tcp_rtx_queue during the repair process go through a sort of "fake send" process, and that process was setting pcount but not tx.in_flight, and thus the warnings (where old_factor is the old pcount). The fix of setting tx.in_flight in the TCP_REPAIR code path seems simple enough, and indeed makes the repro code from syzkaller stop producing warnings. Running through kokonut tests, and will send out for review when all tests pass. Effort: net-tcp_bbr Origin-9xx-SHA1: 330f825a08a6fe92cef74d799cc468864c479f63 Change-Id: I0bc4a790f040fd4239620e1eedd5dc64666c6f05 --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 143055564c31d..74e616ce6d3c2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2636,6 +2636,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); goto repair; /* Skip network transmission */ } From b4e52d600776b65a99b339a1b4475ffd9d2c89af Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Wed, 23 May 2018 17:55:54 -0700 Subject: [PATCH 716/737] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS Add a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a congestion control module to receive CE events. Currently congestion control modules have to set the TCP_CONG_NEEDS_ECN bit in opts flag to receive CE events but this may incur changes in ECN behavior elsewhere. This patch adds a new bit TCP_CONG_WANTS_CE_EVENTS that allows congestion control modules to receive CE events independently of TCP_CONG_NEEDS_ECN.
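For illustration only (not part of this patch): a minimal, hypothetical congestion control module that opts in to CE events via the new flag might register itself along these lines, reusing the stock Reno handlers for everything except the CE notifications.

#include <linux/module.h>
#include <net/tcp.h>

/* Hypothetical module: reacts to CE/no-CE events without requesting the
 * full TCP_CONG_NEEDS_ECN behavior (ECT set on all packets).
 */
static void example_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	if (ev == CA_EVENT_ECN_IS_CE || ev == CA_EVENT_ECN_NO_CE) {
		/* feed the CE signal into this module's own model here */
	}
}

static struct tcp_congestion_ops example_ca __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
	.name		= "example_ca",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,
	.cwnd_event	= example_ca_cwnd_event,
};

static int __init example_ca_register(void)
{
	return tcp_register_congestion_control(&example_ca);
}

static void __exit example_ca_unregister(void)
{
	tcp_unregister_congestion_control(&example_ca);
}

module_init(example_ca_register);
module_exit(example_ca_unregister);
MODULE_LICENSE("GPL");

With the flag set, __tcp_ecn_check_ce() (modified in the hunks below) delivers CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE to the module, while the stack's wider ECN negotiation behavior is left unchanged.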
Effort: net-tcp Origin-9xx-SHA1: 9f7e14716cde760bc6c67ef8ef7e1ee48501d95b Change-Id: I2255506985242f376d910c6fd37daabaf4744f24 --- include/net/tcp.h | 14 +++++++++++++- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 124f2325100bd..cf2fe9c34aaed 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1027,7 +1027,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ +#define TCP_CONG_WANTS_CE_EVENTS 0x4 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ + TCP_CONG_NEEDS_ECN | \ + TCP_CONG_WANTS_CE_EVENTS) union tcp_cc_info; @@ -1156,6 +1160,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif +static inline bool tcp_ca_wants_ce_events(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | + TCP_CONG_WANTS_CE_EVENTS); +} + static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5aca2a5dadc65..83681bf44a38a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -348,7 +348,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { @@ -359,7 +359,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; From b3f4e605cd83adcee4cab5ab9d4c117ac6ee1fac Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 27 Sep 2019 17:10:26 -0400 Subject: [PATCH 717/737] net-tcp: re-generalize TSO sizing in TCP CC module API Reorganize the API for CC modules so that the CC module once again gets complete control of the TSO sizing decision. This is how the API was set up around 2016 and the initial BBRv1 upstreaming. Later Eric Dumazet simplified it. But with wider testing it now seems that to avoid CPU regressions BBR needs to have a different TSO sizing function. This is necessary to handle cases where there are many flows bottlenecked on the sender host's NIC, in which case BBR's pacing rate is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because BBR's pacing rate adapts to the low bandwidth share each flow sees. By contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very large cwnd, and thus large pacing rate and large TSO burst size. 
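As a back-of-the-envelope illustration (not from this patch; the numbers are made up), the per-skb burst budget that a rate-based tso_segs() callback works from is roughly pacing_rate >> sk_pacing_shift bytes, about 1 ms worth of data at the default shift of 10, floored at a minimum segment count:

#include <stdio.h>

/* Illustrative only: compute a TSO burst budget from a pacing rate the way
 * a rate-based tso_segs() callback might, using made-up example values.
 */
int main(void)
{
	unsigned long pacing_rate = 2500000;	/* bytes/sec, ~20 Mbit/s (example) */
	unsigned int pacing_shift = 10;		/* budget ~1/1024 sec of data */
	unsigned int mss = 1448;		/* typical MSS with TCP timestamps */
	unsigned int min_segs = 2;		/* floor, as in bbr_min_tso_segs() */
	unsigned long bytes = pacing_rate >> pacing_shift;	/* ~2441 bytes */
	unsigned int segs = bytes / mss;	/* 1 segment of budget... */

	if (segs < min_segs)
		segs = min_segs;		/* ...floored to 2 */
	printf("TSO burst budget: %u segs\n", segs);
	return 0;
}

Because the budget scales with the pacing rate, a flow whose pacing rate is low (as described above for BBR when many flows share a NIC) ends up with very small bursts and correspondingly higher per-packet CPU cost, which is what motivates handing this decision back to the CC module.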
Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea [5.10: Include the READ_ONCE from e0bb4ab9dfdd, handle ctx change from sysctl_tcp_min_tso_segs] --- include/net/tcp.h | 3 +++ net/ipv4/bpf_tcp_ca.c | 2 +- net/ipv4/tcp_bbr.c | 38 ++++++++++++++++++++++++++------------ net/ipv4/tcp_output.c | 11 +++++------ 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index cf2fe9c34aaed..0840f9d432f0f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1096,6 +1096,9 @@ struct tcp_congestion_ops { /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); + /* pick target number of segments per TSO/GSO skb (optional): */ + u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + /* react to a specific lost skb (optional) */ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764d..8ad93e1fe9dd3 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -16,7 +16,7 @@ static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, cwnd_event), offsetof(struct tcp_congestion_ops, in_ack_event), offsetof(struct tcp_congestion_ops, pkts_acked), - offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, tso_segs), offsetof(struct tcp_congestion_ops, sndbuf_expand), offsetof(struct tcp_congestion_ops, cong_control), }; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6274462b86b4b..c0d5a4211fc18 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + u32 segs; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 segs, bytes; - - /* Sort of tcp_tso_autosize() but ignoring - * driver provided sk_gso_max_size. 
- */ - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); - segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - return min(segs, 0x7FU); + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 74e616ce6d3c2..560abfad552ca 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1992,13 +1992,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 min_tso, tso_segs; + u32 tso_segs; - min_tso = ca_ops->min_tso_segs ? - ca_ops->min_tso_segs(sk) : - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); - - tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); + tso_segs = ca_ops->tso_segs ? + ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } From 43297058cb89ca077ff1de51e74f6e8ba2167cef Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 16 Nov 2019 13:16:25 -0500 Subject: [PATCH 718/737] net-tcp: add fast_ack_mode=1: skip rwin check in tcp_fast_ack_mode__tcp_ack_snd_check() Add logic for an experimental TCP connection behavior, enabled with tp->fast_ack_mode = 1, which disables checking the receive window before sending an ack in __tcp_ack_snd_check(). If this behavior is enabled, the data receiver sends an ACK if the amount of data is > RCV.MSS. Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4 --- include/linux/tcp.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 1 + net/ipv4/tcp_input.c | 5 +++-- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 11a98144bda0b..1472d1c147d5c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -225,7 +225,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? 
*/ + unused:3; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3dd9b76f40559..92229a9030a34 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2863,6 +2863,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; + tp->fast_ack_mode = 0; /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index db5831e6c136a..153ed9010c0c2 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; + tcp_sk(sk)->fast_ack_mode = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 83681bf44a38a..14c3dd92cd4ff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5476,13 +5476,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + (tp->fast_ack_mode == 1 || /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ - (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || - __tcp_select_window(sk) >= tp->rcv_wnd)) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd))) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ From 90d33471b84e1d47c0de1503d8bfadd4e5af071c Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 11 Jun 2019 12:54:22 -0400 Subject: [PATCH 719/737] net-tcp_bbr: v2: BBRv2 ("bbr2") congestion control for Linux TCP BBR v2 is an enhacement to the BBR v1 algorithm. It's designed to aim for lower queues, lower loss, and better Reno/CUBIC coexistence than BBR v1. BBR v2 maintains the core of BBR v1: an explicit model of the network path that is two-dimensional, adapting to estimate the (a) maximum available bandwidth and (b) maximum safe volume of data a flow can keep in-flight in the network. It maintains the estimated BDP as a core guide for estimating an appropriate level of in-flight data. BBR v2 makes several key enhancements: o Its bandwidth-probing time scale is adapted, within bounds, to allow improved coexistence with Reno and CUBIC. The bandwidth-probing time scale is (a) extended dynamically based on estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by an interactive wall-clock time-scale to be more scalable and responsive than Reno and CUBIC. o Rather than being largely agnostic to loss and ECN marks, it explicitly uses loss and (DCTCP-style) ECN signals to maintain its model. o It aims for lower losses than v1 by adjusting its model to attempt to stay within loss rate and ECN mark rate bounds (loss_thresh and ecn_thresh, respectively). o It adapts to loss/ECN signals even when the application is running out of data ("application-limited"), in case the "application-limited" flow is also "network-limited" (the bw and/or inflight available to this flow is lower than previously estimated when the flow ran out of data). 
o It has a three-part model: the model explicitly tracks three operating points, where an operating point is a tuple: (bandwidth, inflight). The three operating points are: o latest: the latest measurement from the current round trip o upper bound: robust, optimistic, long-term upper bound o lower bound: robust, conservative, short-term lower bound These are stored in the following state variables: o latest: bw_latest, inflight_latest o lo: bw_lo, inflight_lo o hi: bw_hi[2], inflight_hi To gain intuition about the meaning of the three operating points, it may help to consider the analogs in CUBIC, which has a somewhat analogous three-part model used by its probing state machine: BBR param CUBIC param ----------- ------------- latest ~ cwnd lo ~ ssthresh hi ~ last_max_cwnd The analogy is only a loose one, though, since the BBR operating points are calculated differently, and are 2-dimensional (bw,inflight) rather than CUBIC's one-dimensional notion of operating point (inflight). o It uses the three-part model to adapt the magnitude of its bandwidth probing to match the estimated space available in the buffer, rather than (as in BBR v1) assuming that it was always acceptable to place 0.25*BDP in the bottleneck buffer when probing (commodity datacenter switches commonly do not have that much buffer for WAN flows). When BBR v2 estimates it hit a buffer limit during probing, its bandwidth probing then starts gently in case little space is still available in the buffer, and then accelerates, slowly at first and then rapidly if it can grow inflight without seeing congestion signals. In such cases, probing is bounded by inflight_hi + inflight_probe, where inflight_probe grows as: [0, 1, 2, 4, 8, 16,...]. This allows BBR to keep losses low and bounded if a bottleneck remains congested, while rapidly/scalably utilizing free bandwidth when it becomes available. o It has a slightly revised state machine, to achieve the goals above. BBR_BW_PROBE_UP: pushes up inflight to probe for bw/vol BBR_BW_PROBE_DOWN: drain excess inflight from the queue BBR_BW_PROBE_CRUISE: use pipe, w/ headroom in queue/pipe BBR_BW_PROBE_REFILL: try to refill the pipe again to 100%, leaving queue empty o The estimated BDP: BBR v2 continues to maintain an estimate of the path's two-way propagation delay, by tracking a windowed min_rtt, and coordinating (on an as-needed basis) to try to expose the two-way propagation delay by draining the bottleneck queue. BBR v2 continues to use its min_rtt and (currently-applicable) bandwidth estimate to estimate the current bandwidth-delay product. The estimated BDP still provides one important guideline for bounding inflight data. However, because any min-filtered RTT and max-filtered bw inherently tend to both overestimate, the estimated BDP is often too high; in this case loss or ECN marks can ensue, in which case BBR v2 adjusts inflight_hi and inflight_lo to adapt its sending rate and inflight down to match the available capacity of the path. o Space: Note that ICSK_CA_PRIV_SIZE increased. This is because BBR v2 requires more space. Note that much of the space is due to support for per-socket parameterization and debugging in this release for research and debugging. With that state removed, the full "struct bbr" is 140 bytes, or 144 with padding. This is an increase of 40 bytes over the existing ca_priv space. o Code: BBR v2 reuses many pieces from BBR v1.
But it omits the following significant pieces: o "packet conservation" (bbr_set_cwnd_to_recover_or_restore(), bbr_can_grow_inflight()) o long-term bandwidth estimator ("policer mode") The code layout tries to keep BBR v2 code near the bottom of the file, so that v1-applicable code in the top does not accidentally refer to v2 code. o Docs: See the following docs for more details and diagrams decsribing the BBR v2 algorithm: https://datatracker.ietf.org/meeting/104/materials/slides-104-iccrg-an-update-on-bbr-00 https://datatracker.ietf.org/meeting/102/materials/slides-102-iccrg-an-update-on-bbr-work-at-google-00 o Internal notes: For this upstream rebase, Neal started from: git show fed518041ac6:net/ipv4/tcp_bbr.c > net/ipv4/tcp_bbr.c then removed dev instrumentation (dynamic get/set for parameters) and code that was only used by BBRv1 Effort: net-tcp_bbr Origin-9xx-SHA1: 2c84098e60bed6d67dde23cd7538c51dee273102 Change-Id: I125cf26ba2a7a686f2fa5e87f4c2afceb65f7a05 --- include/net/inet_connection_sock.h | 5 +- include/uapi/linux/inet_diag.h | 33 + net/ipv4/Kconfig | 22 + net/ipv4/Makefile | 1 + net/ipv4/tcp_bbr2.c | 2683 ++++++++++++++++++++++++++++ 5 files changed, 2742 insertions(+), 2 deletions(-) create mode 100644 net/ipv4/tcp_bbr2.c diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ff901aade442f..921b468969c69 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -134,8 +134,9 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; - u64 icsk_ca_priv[104 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) +/* XXX inflated by temporary internal debugging info */ +#define ICSK_CA_PRIV_SIZE (216) + u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 20ee93f0f8761..96d52dd9c48ac 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -231,9 +231,42 @@ struct tcp_bbr_info { __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ }; +/* Phase as reported in netlink/ss stats. 
*/ +enum tcp_bbr2_phase { + BBR2_PHASE_INVALID = 0, + BBR2_PHASE_STARTUP = 1, + BBR2_PHASE_DRAIN = 2, + BBR2_PHASE_PROBE_RTT = 3, + BBR2_PHASE_PROBE_BW_UP = 4, + BBR2_PHASE_PROBE_BW_DOWN = 5, + BBR2_PHASE_PROBE_BW_CRUISE = 6, + BBR2_PHASE_PROBE_BW_REFILL = 7 +}; + +struct tcp_bbr2_info { + /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lsb; /* lower 32 bits of bw */ + __u32 bbr_bw_msb; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ + __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ + __u8 bbr_version; /* MUST be at this offset in struct */ + __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ + __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; struct tcp_bbr_info bbr; + struct tcp_bbr2_info bbr2; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 23b06063e1a51..27f80112c7072 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -679,6 +679,24 @@ config TCP_CONG_BBR AQM schemes that do not provide a delay signal. It requires the fq ("Fair Queue") pacing packet scheduler. +config TCP_CONG_BBR2 + tristate "BBR2 TCP" + default n + help + + BBR2 TCP congestion control is a model-based congestion control + algorithm that aims to maximize network utilization, keep queues and + retransmit rates low, and to be able to coexist with Reno/CUBIC in + common scenarios. It builds an explicit model of the network path. It + tolerates a targeted degree of random packet loss and delay that are + unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, + or cable modem links, and can use DCTCP-L4S-style ECN signals. It can + coexist with flows that use loss-based congestion control, and can + operate with shallow buffers, deep buffers, bufferbloat, policers, or + AQM schemes that do not provide a delay signal. It requires pacing, + using either TCP internal pacing or the fq ("Fair Queue") pacing packet + scheduler. 
+ choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -716,6 +734,9 @@ choice config DEFAULT_BBR bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_BBR2 + bool "BBR2" if TCP_CONG_BBR2=y + config DEFAULT_RENO bool "Reno" endchoice @@ -740,6 +761,7 @@ config DEFAULT_TCP_CONG default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG default "bbr" if DEFAULT_BBR + default "bbr2" if DEFAULT_BBR2 default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b95..8c5779dba462a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o +obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c new file mode 100644 index 0000000000000..a6959b70e51d1 --- /dev/null +++ b/net/ipv4/tcp_bbr2.c @@ -0,0 +1,2683 @@ +/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 + * + * BBRv2 is a model-based congestion control algorithm that aims for low + * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model + * of the network path, it uses measurements of bandwidth and RTT, as well as + * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that + * although it can use ECN or loss signals explicitly, it does not require + * either; it can bound its in-flight data based on its estimate of the BDP. + * + * The model has both higher and lower bounds for the operating range: + * lo: bw_lo, inflight_lo: conservative short-term lower bound + * hi: bw_hi, inflight_hi: robust long-term upper bound + * The bandwidth-probing time scale is (a) extended dynamically based on + * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by + * an interactive wall-clock time-scale to be more scalable and responsive + * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * + * | + * V + * +---> STARTUP ----+ + * | | | + * | V | + * | DRAIN ----+ + * | | | + * | V | + * +---> PROBE_BW ----+ + * | ^ | | + * | | | | + * | +----+ | + * | | + * +---- PROBE_RTT <--+ + * + * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. + * When it estimates the pipe is full, it enters DRAIN to drain the queue. + * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. + * A long-lived BBR flow spends the vast majority of its time remaining + * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth + * in a fair manner, with a small, bounded queue. *If* a flow has been + * continuously sending for the entire min_rtt window, and hasn't seen an RTT + * sample that matches or decreases its min_rtt estimate for 10 seconds, then + * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if + * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; + * otherwise we enter STARTUP to try to fill the pipe. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 
+ * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, + * otherwise TCP stack falls back to an internal pacing using one high + * resolution timer per TCP socket and may use more resources. + */ +#include +#include +#include +#include +#include + +#include "tcp_dctcp.h" + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ +#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ +}; + +/* How does the incoming ACK stream relate to our bandwidth probing? */ +enum bbr_ack_phase { + BBR_ACKS_INIT, /* not probing; not getting probe feedback */ + BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ + BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ + BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ + BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ + u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ + u64 cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ + ce_state:1, /* If most recent data has CE bit set */ + bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ + try_fast_path:1, /* can we take fast path? */ + unused2:11, + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + has_seen_rtt:1; /* have we seen an RTT sample yet? */ + u32 pacing_gain:11, /* current gain for setting pacing rate */ + cwnd_gain:11, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? 
*/ + full_bw_cnt:2, /* number of rounds without large bw gains */ + init_cwnd:7; /* initial cwnd */ + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + /* BBR v2 state: */ + unused1:2, + startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ + loss_in_cycle:1, /* packet loss in this cycle? */ + ecn_in_cycle:1; /* ECN in this cycle? */ + u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ + u32 undo_bw_lo; /* bw_lo before latest losses */ + u32 undo_inflight_lo; /* inflight_lo before latest losses */ + u32 undo_inflight_hi; /* inflight_hi before latest losses */ + u32 bw_latest; /* max delivered bw in last round trip */ + u32 bw_lo; /* lower bound on sending bandwidth */ + u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ + u32 inflight_latest; /* max delivered data in last round trip */ + u32 inflight_lo; /* lower bound of inflight data range */ + u32 inflight_hi; /* upper bound of inflight data range */ + u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ + u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ + u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ + u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ + ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ + bw_probe_samples:1, /* rate samples reflect bw probing? */ + prev_probe_too_high:1, /* did last PROBE_UP go too high? */ + stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ + rounds_since_probe:8, /* packet-timed rounds since probed bw */ + loss_round_start:1, /* loss_round_delivered round trip? */ + loss_in_round:1, /* loss marked in this round trip? */ + ecn_in_round:1, /* ECN marked in this round trip? */ + ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ + loss_events_in_round:4,/* losses in STARTUP round */ + initialized:1; /* has bbr_init() been called? */ + u32 alpha_last_delivered; /* tp->delivered at alpha update */ + u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ + + /* Params configurable using setsockopt. Refer to correspoding + * module param for detailed description of params. 
+ */ + struct bbr_params { + u32 high_gain:11, /* max allowed value: 2047 */ + drain_gain:10, /* max allowed value: 1023 */ + cwnd_gain:11; /* max allowed value: 2047 */ + u32 cwnd_min_target:4, /* max allowed value: 15 */ + min_rtt_win_sec:5, /* max allowed value: 31 */ + probe_rtt_mode_ms:9, /* max allowed value: 511 */ + full_bw_cnt:3, /* max allowed value: 7 */ + bw_rtts:5, /* max allowed value: 31 */ + cwnd_tso_budget:1, /* allowed values: {0, 1} */ + unused3:1, + drain_to_target:1, /* boolean */ + precise_ece_ack:1, /* boolean */ + extra_acked_in_startup:1, /* allowed values: {0, 1} */ + fast_path:1; /* boolean */ + u32 full_bw_thresh:10, /* max allowed value: 1023 */ + startup_cwnd_gain:11, /* max allowed value: 2047 */ + bw_probe_pif_gain:9, /* max allowed value: 511 */ + usage_based_cwnd:1, /* boolean */ + unused2:1; + u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ + refill_add_inc:2; /* max allowed value: 3 */ + u16 extra_acked_gain:11, /* max allowed value: 2047 */ + extra_acked_win_rtts:5; /* max allowed value: 31*/ + u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ + /* Mostly BBR v2 parameters below here: */ + u32 ecn_alpha_gain:8, /* max allowed value: 255 */ + ecn_factor:8, /* max allowed value: 255 */ + ecn_thresh:8, /* max allowed value: 255 */ + beta:8; /* max allowed value: 255 */ + u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ + bw_probe_reno_gain:9, /* max allowed value: 511 */ + full_loss_cnt:4; /* max allowed value: 15 */ + u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ + inflight_headroom:8, /* max allowed value: 255 */ + loss_thresh:8, /* max allowed value: 255 */ + bw_probe_max_rounds:8; /* max allowed value: 255 */ + u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ + bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ + full_ecn_cnt:2; /* max allowed value: 3 */ + u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ + undo:1, /* boolean */ + tso_rtt_shift:4, /* max allowed value: 15 */ + unused5:1; + u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ + unused1:14, + ecn_alpha_init:9; /* max allowed value: 256 */ + } params; + + struct { + u32 snd_isn; /* Initial sequence number */ + u32 rs_bw; /* last valid rate sample bw */ + u32 target_cwnd; /* target cwnd, based on BDP */ + u8 undo:1, /* Undo even happened but not yet logged */ + unused:7; + char event; /* single-letter event debug codes */ + u16 unused2; + } debug; +}; + +struct bbr_context { + u32 sample_bw; + u32 target_cwnd; + u32 log:1; +}; + +/* Window length of bw filter (in rounds). Max allowed value is 31 (0x1F) */ +static int bbr_bw_rtts = CYCLE_LEN + 2; +/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ +static u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. + * Max allowed value is 511 (0x1FF). + */ +static u32 bbr_probe_rtt_mode_ms = 200; +/* Window length of probe_rtt_min_us filter (in ms), and consequently the + * typical interval between PROBE_RTT mode entries. + * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC + */ +static u32 bbr_probe_rtt_win_ms = 5000; +/* Skip TSO below the following bandwidth (bits/sec): */ +static int bbr_min_tso_rate = 1200000; + +/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. 
By default we cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. + */ +static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ + +/* Select cwnd TSO budget approach: + * 0: padding + * 1: flooring + */ +static uint bbr_cwnd_tso_budget = 1; + +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ +static const int bbr_pacing_margin_percent = 1; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). + */ +static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ +static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round. Max allowed value + * is 1023 (0x3FF). + */ +static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. + * Max allowed value is 2047 (0x7FF). + */ +static int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. + * Max allowed value for each element is 1023 (0x3FF). + */ +enum bbr_pacing_gain_phase { + BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ + BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ + BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ + BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ +}; +static int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; +/* Randomize the starting gain cycling phase over N phases: */ +static u32 bbr_cycle_rand = 7; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight. Max allowed value is 15 (0xF). + */ +static u32 bbr_cwnd_min_target = 4; + +/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. + * Use 0 to disable. Max allowed value is 255. + */ +static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available. + * Max allowed value is 1023 (0x3FF). + */ +static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full. + * Max allowed value is 7 (0x7). + */ +static u32 bbr_full_bw_cnt = 3; + +static u32 bbr_flags; /* Debugging related stuff */ + +/* Whether to debug using printk. + */ +static bool bbr_debug_with_printk; + +/* Whether to debug using ftrace event tcp:tcp_bbr_event. 
+ * Ignored when bbr_debug_with_printk is set. + */ +static bool bbr_debug_ftrace; + +/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. */ +static bool bbr_drain_to_target = true; /* default: enabled */ + +/* Experiment: Flags to control BBR with ECN behavior. + */ +static bool bbr_precise_ece_ack = true; /* default: enabled */ + +/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is + * (2^(16+14) B)/(1024 B/packet) = 1M packets. + */ +static u32 bbr_cwnd_warn_val = 1U << 20; + +static u16 bbr_debug_port_mask; + +/* BBR module parameters. These are module parameters only in Google prod. + * Upstream these are intentionally not module parameters. + */ +static int bbr_pacing_gain_size = CYCLE_LEN; + +/* Gain factor for adding extra_acked to target cwnd: */ +static int bbr_extra_acked_gain = 256; + +/* Window length of extra_acked window. Max allowed val is 31. */ +static u32 bbr_extra_acked_win_rtts = 5; + +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + +/* Time period for clamping cwnd increment due to ack aggregation */ +static u32 bbr_extra_acked_max_us = 100 * 1000; + +/* Use extra acked in startup ? + * 0: disabled + * 1: use latest extra_acked value from 1-2 rtt in startup + */ +static int bbr_extra_acked_in_startup = 1; /* default: enabled */ + +/* Experiment: don't grow cwnd beyond twice of what we just probed. */ +static bool bbr_usage_based_cwnd; /* default: disabled */ + +/* For lab testing, researchers can enable BBRv2 ECN support with this flag, + * when they know that any ECN marks that the connections experience will be + * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. + * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on + * negotiation or configuration that is outside the scope of the BBRv2 + * alpha release. 
+ */ +static bool bbr_ecn_enable = false; + +module_param_named(bw_rtts, bbr_bw_rtts, int, 0644); +module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); +module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); +module_param_named(high_gain, bbr_high_gain, int, 0644); +module_param_named(drain_gain, bbr_drain_gain, int, 0644); +module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); +module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); +module_param_array_named(pacing_gain, bbr_pacing_gain, int, + &bbr_pacing_gain_size, 0644); +module_param_named(cycle_rand, bbr_cycle_rand, uint, 0644); +module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); +module_param_named(probe_rtt_cwnd_gain, + bbr_probe_rtt_cwnd_gain, uint, 0664); +module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); +module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); +module_param_named(flags, bbr_flags, uint, 0644); +module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); +module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); +module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); +module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); +module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); +module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); +module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); +module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); +module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); +module_param_named(extra_acked_win_rtts, + bbr_extra_acked_win_rtts, uint, 0664); +module_param_named(extra_acked_max_us, + bbr_extra_acked_max_us, uint, 0664); +module_param_named(ack_epoch_acked_reset_thresh, + bbr_ack_epoch_acked_reset_thresh, uint, 0664); +module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); +module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); +module_param_named(extra_acked_in_startup, + bbr_extra_acked_in_startup, int, 0664); +module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); +module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); + +static void bbr2_exit_probe_rtt(struct sock *sk); +static void bbr2_reset_congestion_signals(struct sock *sk); + +static void bbr_check_probe_rtt_done(struct sock *sk); + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_reached; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->bw_hi[0], bbr->bw_hi[1]); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return min(bbr_max_bw(sk), bbr->bw_lo); +} + +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, + int margin) +{ + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC / 100 * (100 - margin); + rate >>= BW_SCALE; + rate = max(rate, 1ULL); + return rate; +} + +static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) +{ + return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); +} + +static u64 bbr_rate_kbps(struct sock *sk, u64 rate) +{ + rate = bbr_bw_bytes_per_sec(sk, rate); + rate *= 8; + do_div(rate, 1000); + return rate; +} + +static u32 bbr_tso_segs_goal(struct sock *sk); +static void bbr_debug(struct sock *sk, u32 acked, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + static const char ca_states[] = { + [TCP_CA_Open] = 'O', + [TCP_CA_Disorder] = 'D', + [TCP_CA_CWR] = 'C', + [TCP_CA_Recovery] = 'R', + [TCP_CA_Loss] = 'L', + }; + static const char mode[] = { + 'G', /* Growing - BBR_STARTUP */ + 'D', /* Drain - BBR_DRAIN */ + 'W', /* Window - BBR_PROBE_BW */ + 'M', /* Min RTT - BBR_PROBE_RTT */ + }; + static const char ack_phase[] = { /* bbr_ack_phase strings */ + 'I', /* BBR_ACKS_INIT - 'Init' */ + 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ + 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ + 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ + 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ + }; + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + const u32 una = tp->snd_una - bbr->debug.snd_isn; + const u32 fack = tcp_highest_sack_seq(tp); + const u16 dport = ntohs(inet_sk(sk)->inet_dport); + bool is_port_match = (bbr_debug_port_mask && + ((dport & bbr_debug_port_mask) == 0)); + char debugmsg[320]; + + if (sk->sk_state == TCP_SYN_SENT) + return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ + + if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { + char addr[INET6_ADDRSTRLEN + 10] = { 0 }; + + if (sk->sk_family == AF_INET) + snprintf(addr, sizeof(addr), "%pI4:%u", + &inet_sk(sk)->inet_daddr, dport); + else if (sk->sk_family == AF_INET6) + snprintf(addr, sizeof(addr), "%pI6:%u", + &sk->sk_v6_daddr, dport); + + WARN_ONCE(1, + "BBR %s cwnd alert: %u " + "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " + "bw: %u rtt: %u min_rtt: %u " + "acked: %u tso_segs: %u " + "bw: %d %ld %d pif: %u\n", + addr, tp->snd_cwnd, + una, inet_csk(sk)->icsk_ca_state, + bbr->pacing_gain, bbr->cwnd_gain, + bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, + acked, bbr_tso_segs_goal(sk), + rs->delivered, rs->interval_us, rs->is_retrans, + tcp_packets_in_flight(tp)); + } + + if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) + return; + + if (!sock_flag(sk, SOCK_DBG) && !is_port_match) + return; + + if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) + return; + + if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && + !(bbr_flags & FLAG_DEBUG_LOOPBACK)) + return; + + snprintf(debugmsg, sizeof(debugmsg) - 1, + "BBR %pI4:%-5u %5u,%03u:%-7u %c " + "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " + "bw %llu lb %llu ib %llu qb %llu " + "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " + "lr %d er %d ea %d bwl %lld il %d ih %d c %d " + "v %d %c %u %c %s\n", + &inet_sk(sk)->inet_daddr, dport, + una / 1000, una % 1000, fack - tp->snd_una, + ca_states[inet_csk(sk)->icsk_ca_state], + bbr->debug.undo ? 
'@' : mode[bbr->mode], + tp->snd_cwnd, + bbr_extra_acked(sk), /* br (legacy): extra_acked */ + rs->tx_in_flight, /* cr (legacy): tx_inflight */ + rs->rtt_us, + rs->delivered, + rs->interval_us, + bbr->min_rtt_us, + rs->is_app_limited ? '_' : 'l', + bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ + bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ + 0ULL, /* lb: [obsolete] */ + 0ULL, /* ib: [obsolete] */ + (u64)sk->sk_pacing_rate * 8 / 1000, + acked, + tcp_packets_in_flight(tp), + rs->is_ack_delayed ? 'd' : '.', + bbr->round_start ? '*' : '.', + tp->delivered, tp->lost, + tp->app_limited, + 0, /* #: [obsolete] */ + ctx->target_cwnd, + tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ + ca_states[bbr->prev_ca_state], + (rs->lost + rs->delivered) > 0 ? + (1000 * rs->lost / + (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ + (rs->delivered) > 0 ? + (1000 * rs->delivered_ce / + (rs->delivered)) : 0, /* er: ECN rate x1000 */ + 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ + bbr->bw_lo == ~0U ? + -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ + bbr->inflight_lo, /* il */ + bbr->inflight_hi, /* ih */ + bbr->bw_probe_up_cnt, /* c */ + 2, /* v: version */ + bbr->debug.event, + bbr->cycle_idx, + ack_phase[bbr->ack_phase], + bbr->bw_probe_samples ? "Y" : "N"); + debugmsg[sizeof(debugmsg) - 1] = 0; + + /* printk takes a higher precedence. */ + if (bbr_debug_with_printk) + printk(KERN_DEBUG "%s", debugmsg); + + if (unlikely(bbr->debug.undo)) + bbr->debug.undo = 0; +} + +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain, + bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? */ + rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); +} + +/* Pace using current bw estimate and a gain factor. */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); + + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +static u32 bbr_min_tso_segs(struct sock *sk) +{ + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; +} + +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + /* Budget a TSO/GSO burst size allowance based on min_rtt. For every + * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. 
+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) + */ + if (bbr->params.tso_rtt_shift) { + r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; + if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ + bytes += GSO_MAX_SIZE >> r; + } + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). + */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && + bbr_ecn_enable && + bbr->params.precise_ece_ack) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; + if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + } +} + +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: + * + * bdp = ceil(bw * min_rtt * gain) + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + */ +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bdp; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. 
+ */ + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; + + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + if (bbr->params.cwnd_tso_budget == 1) { + cwnd = max_t(u32, cwnd, tso_segs_goal); + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); + } else { + cwnd += tso_segs_goal; + cwnd = (cwnd + 1) & ~1U; + } + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +} + +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight); + + return inflight; +} + +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. 
+ */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr->params.extra_acked_gain && + (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->params.probe_rtt_cwnd_gain == 0) + return bbr->params.cwnd_min_target; + return max_t(u32, bbr->params.cwnd_min_target, + bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + bbr->debug.target_cwnd = target_cwnd; + + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ + cwnd += acked; + if (cwnd >= target_cwnd) { + cwnd = target_cwnd; + bbr->try_fast_path = 1; + } + } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { + cwnd += acked; + } else { + bbr->try_fast_path = 1; + } + + /* When growing cwnd, don't grow beyond twice what we just probed. 
*/ + if (bbr->params.usage_based_cwnd) { + max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); + cwnd = min(cwnd, max_probe); + } + + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); + + ctx->target_cwnd = target_cwnd; + ctx->log = (tp->snd_cwnd != prev_cwnd); +} + +/* See if we have reached next round trip */ +static void bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->round_start = 0; + + /* See if we've reached the next RTT */ + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->round_start = 1; + } +} + +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. + */ + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", + rs->delivered, rs->interval_us)) + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } + + ctx->sample_bw = bw; + bbr->debug.rs_bw = bw; +} + +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips for non-startup phase, and 1-2 round trips for startup. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; + + if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->params.extra_acked_in_startup && + !bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. 
of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + tcp_sk(sk)->snd_ssthresh = + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); + bbr2_reset_congestion_signals(sk); + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) + return true; /* exiting DRAIN now */ + return false; +} + +static void bbr_check_probe_rtt_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (!(bbr->probe_rtt_done_stamp && + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr2_exit_probe_rtt(sk); +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. 
BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool probe_rtt_expired, min_rtt_expired; + u32 expire; + + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ + expire = bbr->probe_rtt_min_stamp + + msecs_to_jiffies(bbr->params.probe_rtt_win_ms); + probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + + if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + + msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done) + bbr_check_probe_rtt_done(sk); + } + } + /* Restart after idle ends only once we process a new S/ACK for data */ + if (rs->delivered > 0) + bbr->idle_restart = 0; +} + +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr->params.high_gain; + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; + bbr->cwnd_gain = bbr->params.cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + int i; + + WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); + + bbr->initialized = 1; + bbr->params.high_gain = min(0x7FF, bbr_high_gain); + bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); + bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); + bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); + bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); + bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); + bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); + bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); + bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); + bbr->params.bw_rtts = min(0x1F, bbr_bw_rtts); + bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); + bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); + bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); + bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; + bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; + bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; + bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); + bbr->params.probe_rtt_win_ms = + min(0x3FFFU, + min_t(u32, bbr_probe_rtt_win_ms, + bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); + for (i = 0; i < CYCLE_LEN; i++) + bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); + bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; + bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); + + bbr->debug.snd_isn = tp->snd_una; + bbr->debug.target_cwnd = 0; + bbr->debug.undo = 0; + + bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); + bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->probe_rtt_min_us = tcp_min_rtt(tp); + bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; + bbr->mode = BBR_STARTUP; + bbr->debug.rs_bw = 0; + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + + bbr->ce_state = 0; + bbr->prior_rcv_nxt = tp->rcv_nxt; + bbr->try_fast_path = 0; + + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* __________________________________________________________________________ + * + * Functions new to BBR v2 ("bbr") congestion control are below here. + * __________________________________________________________________________ + */ + +/* Incorporate a new bw sample into the current window of our max filter. */ +static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); +} + +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ +static void bbr2_advance_bw_hi_filter(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} + +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ +static u32 bbr2_target_inflight(struct sock *sk) +{ + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + + return min(bdp, tcp_sk(sk)->snd_cwnd); +} + +static bool bbr2_is_probing_bandwidth(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return (bbr->mode == BBR_STARTUP) || + (bbr->mode == BBR_PROBE_BW && + (bbr->cycle_idx == BBR_BW_PROBE_REFILL || + bbr->cycle_idx == BBR_BW_PROBE_UP)); +} + +/* Has the given amount of time elapsed since we marked the phase start? */ +static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); + + return tcp_stamp_us_delta(tp->tcp_mstamp, + bbr->cycle_mstamp + interval_us) > 0; +} + +static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw_reached = 1; + bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
*/ +static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || + !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) + return; + + if (ce_ratio >= bbr->params.ecn_thresh) + bbr->startup_ecn_rounds++; + else + bbr->startup_ecn_rounds = 0; + + if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { + bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ + bbr2_handle_queue_too_high_in_startup(sk); + return; + } +} + +static void bbr2_update_ecn_alpha(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + s32 delivered, delivered_ce; + u64 alpha, ce_ratio; + u32 gain; + + if (bbr->params.ecn_factor == 0) + return; + + delivered = tp->delivered - bbr->alpha_last_delivered; + delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; + + if (delivered == 0 || /* avoid divide by zero */ + WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ + return; + + /* See if we should use ECN sender logic for this connection. */ + if (!bbr->ecn_eligible && bbr_ecn_enable && + (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || + !bbr->params.ecn_max_rtt_us)) + bbr->ecn_eligible = 1; + + ce_ratio = (u64)delivered_ce << BBR_SCALE; + do_div(ce_ratio, delivered); + gain = bbr->params.ecn_alpha_gain; + alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; + alpha += (gain * ce_ratio) >> BBR_SCALE; + bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); + + bbr->alpha_last_delivered = tp->delivered; + bbr->alpha_last_delivered_ce = tp->delivered_ce; + + bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); +} + +/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ +static void bbr2_raise_inflight_hi_slope(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 growth_this_round, cnt; + + /* Calculate "slope": packets S/Acked per inflight_hi increment. */ + growth_this_round = 1 << bbr->bw_probe_up_rounds; + bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); + cnt = tp->snd_cwnd / growth_this_round; + cnt = max(cnt, 1U); + bbr->bw_probe_up_cnt = cnt; + bbr->debug.event = 'G'; /* Grow inflight_hi slope */ +} + +/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ +static void bbr2_probe_inflight_hi_upward(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 delta; + + if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { + bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ + return; /* not fully using inflight_hi, so don't grow it */ + } + + /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ + bbr->bw_probe_up_acks += rs->acked_sacked; + if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { + delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; + bbr->inflight_hi += delta; + bbr->debug.event = 'I'; /* Increment inflight_hi */ + } + + if (bbr->round_start) + bbr2_raise_inflight_hi_slope(sk); +} + +/* Does loss/ECN rate for this sample say inflight is "too high"? + * This is used by both the bbr_check_loss_too_high_in_startup() function, + * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which + * uses it to notice when loss/ECN rates suggest inflight is too high. 
+ */ +static bool bbr2_is_inflight_too_high(const struct sock *sk, + const struct rate_sample *rs) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh, ecn_thresh; + + if (rs->lost > 0 && rs->tx_in_flight) { + loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> + BBR_SCALE; + if (rs->lost > loss_thresh) + return true; + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && + bbr->ecn_eligible && bbr->params.ecn_thresh) { + ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> + BBR_SCALE; + if (rs->delivered_ce >= ecn_thresh) + return true; + } + + return false; +} + +/* Calculate the tx_in_flight level that corresponded to excessive loss. + * We find "lost_prefix" segs of the skb where loss rate went too high, + * by solving for "lost_prefix" in the following equation: + * lost / inflight >= loss_thresh + * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh + * Then we take that equation, convert it to fixed point, and + * round up to the nearest packet. + */ +static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, + const struct rate_sample *rs, + const struct sk_buff *skb) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh = bbr->params.loss_thresh; + u32 pcount, divisor, inflight_hi; + s32 inflight_prev, lost_prev; + u64 loss_budget, lost_prefix; + + pcount = tcp_skb_pcount(skb); + + /* How much data was in flight before this skb? */ + inflight_prev = rs->tx_in_flight - pcount; + if (WARN_ONCE(inflight_prev < 0, + "tx_in_flight: %u pcount: %u reneg: %u", + rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) + return ~0U; + + /* How much inflight data was marked lost before this skb? */ + lost_prev = rs->lost - pcount; + if (WARN_ON_ONCE(lost_prev < 0)) + return ~0U; + + /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ + loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; + loss_budget >>= BBR_SCALE; + if (lost_prev >= loss_budget) { + lost_prefix = 0; /* previous losses crossed loss_thresh */ + } else { + lost_prefix = loss_budget - lost_prev; + lost_prefix <<= BBR_SCALE; + divisor = BBR_UNIT - loss_thresh; + if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ + return ~0U; + do_div(lost_prefix, divisor); + } + + inflight_hi = inflight_prev + lost_prefix; + return inflight_hi; +} + +/* If loss/ECN rates during probing indicated we may have overfilled a + * buffer, return an operating point that tries to leave unutilized headroom in + * the path for other flows, for fairness convergence and lower RTTs and loss. + */ +static u32 bbr2_inflight_with_headroom(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 headroom, headroom_fraction; + + if (bbr->inflight_hi == ~0U) + return ~0U; + + headroom_fraction = bbr->params.inflight_headroom; + headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; + headroom = max(headroom, 1U); + return max_t(s32, bbr->inflight_hi - headroom, + bbr->params.cwnd_min_target); +} + +/* Bound cwnd to a sensible level, based on our current probing state + * machine phase and model of a good inflight level (inflight_lo, inflight_hi). + */ +static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cap; + + /* tcp_rcv_synsent_state_process() currently calls tcp_ack() + * and thus cong_control() without first initializing us(!). 
+ */ + if (!bbr->initialized) + return; + + cap = ~0U; + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { + /* Probe to see if more packets fit in the path. */ + cap = bbr->inflight_hi; + } else { + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) + cap = bbr2_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + + cap = max_t(u32, cap, bbr->params.cwnd_min_target); + tp->snd_cwnd = min(cap, tp->snd_cwnd); +} + +/* Estimate a short-term lower bound on the capacity available now, based + * on measurements of the current delivery process and recent history. When we + * are seeing loss/ECN at times when we are not probing bw, then conservatively + * move toward flow balance by multiplicatively cutting our short-term + * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a + * multiplicative decrease in order to converge to a lower capacity in time + * logarithmic in the magnitude of the decrease. + * + * However, we do not cut our short-term estimates lower than the current rate + * and volume of delivered data from this round trip, since from the current + * delivery process we can estimate the measured capacity available now. + * + * Anything faster than that approach would knowingly risk high loss, which can + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ +static void bbr2_adapt_lower_bounds(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 ecn_cut, ecn_inflight_lo, beta; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. + */ + if (bbr2_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ + if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { + /* Reduce inflight to (1 - alpha*ecn_factor). */ + ecn_cut = (BBR_UNIT - + ((bbr->ecn_alpha * bbr->params.ecn_factor) >> + BBR_SCALE)); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; + } else { + ecn_inflight_lo = ~0U; + } + + /* Loss response. */ + if (bbr->loss_in_round) { + /* Reduce bw and inflight to (1 - beta). */ + if (bbr->bw_lo == ~0U) + bbr->bw_lo = bbr_max_bw(sk); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + beta = bbr->params.beta; + bbr->bw_lo = + max_t(u32, bbr->bw_latest, + (u64)bbr->bw_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + bbr->inflight_lo = + max_t(u32, bbr->inflight_latest, + (u64)bbr->inflight_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + } + + /* Adjust to the lower of the levels implied by loss or ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ +static void bbr2_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_lo = ~0U; + bbr->inflight_lo = ~0U; +} + +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. 
+ */ +static void bbr2_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->loss_in_cycle = 0; + bbr->ecn_in_cycle = 0; + bbr->bw_latest = 0; + bbr->inflight_latest = 0; +} + +/* Update (most of) our congestion signals: track the recent rate and volume of + * delivered data, presence of loss, and EWMA degree of ECN marking. + */ +static void bbr2_update_congestion_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) + bbr2_take_bw_hi_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + + /* Update rate and volume of delivered data from latest round trip: */ + bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); + bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); + + if (before(rs->prior_delivered, bbr->loss_round_delivered)) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ + bbr->loss_round_delivered = tp->delivered; /* mark round trip */ + bbr->loss_round_start = 1; + bbr2_adapt_lower_bounds(sk); + + /* Update windowed "latest" (single-round-trip) filters. */ + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->bw_latest = ctx->sample_bw; + bbr->inflight_latest = rs->delivered; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based + * congestion control we spread out our probing in a Reno-conscious way. Due to + * the shape of the Reno sawtooth, the time required between loss epochs for an + * idealized Reno flow is a number of round trips that is the BDP of that + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ +static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 inflight, rounds, reno_gain, reno_rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ + rounds = bbr->params.bw_probe_max_rounds; + + reno_gain = bbr->params.bw_probe_reno_gain; + if (reno_gain) { + inflight = bbr2_target_inflight(sk); + reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; + rounds = min(rounds, reno_rounds); + } + return bbr->rounds_since_probe >= rounds; +} + +/* How long do we want to wait before probing for bandwidth (and risking + * loss)? We randomize the wait, for better mixing and fairness convergence. + * + * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. + * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, + * (eg 4K video to a broadband user): + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + * + * We bound the BBR-native inter-bw-probe wall clock time to be: + * (a) higher than 2 sec: to try to avoid causing loss for a long enough time + * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must + * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs + * (b) lower than 3 sec: to ensure flows can start probing in a reasonable + * amount of time to discover unutilized bw on human-scale interactive + * time-scales (e.g. 
perhaps traffic from a web page download that we + * were competing with is now complete). + */ +static void bbr2_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = + prandom_u32_max(bbr->params.bw_probe_rand_rounds); + /* Decide the random wall clock bound for wait until probe: */ + bbr->probe_wait_us = bbr->params.bw_probe_base_us + + prandom_u32_max(bbr->params.bw_probe_rand_us); +} + +static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = cycle_idx; + /* New phase, so need to update cwnd and pacing rate. */ + bbr->try_fast_path = 0; +} + +/* Send at estimated bw to fill the pipe, but not queue. We need this phase + * before PROBE_UP, because as soon as we send faster than the available bw + * we will start building a queue, and if the buffer is shallow we can cause + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ +static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr->inflight_hi != ~0U) + bbr->inflight_hi += bbr->params.refill_add_inc; + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ +static void bbr2_start_bw_probe_up(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); + bbr2_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall + * clock time at which to probe beyond an inflight that we think to be + * safe. This will knowingly risk packet loss, so we want to do this rarely, to + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. + */ +static void bbr2_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ + bbr2_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. + */ +static void bbr2_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. 
+ */ +static void bbr2_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + const u32 beta = bbr->params.beta; + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ + bbr->debug.event = 'L'; /* Loss/ECN too high */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ + if (!rs->is_app_limited) + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, + (u64)bbr2_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt + * using the signals we see. If loss or ECN mark rate gets too high, then adapt + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ +static bool bbr2_adapt_upper_bounds(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Track when we'll see bw/loss samples resulting from our bw probes. */ + if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) + bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; + if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { + /* End of samples from bw probing phase. */ + bbr->bw_probe_samples = 0; + bbr->ack_phase = BBR_ACKS_INIT; + /* At this point in the cycle, our current bw sample is also + * our best recent chance at finding the highest available bw + * for this flow. So now is the best time to forget the bw + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) + bbr2_advance_bw_hi_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, + * then probe up again, this time letting inflight persist at + * inflight_hi for a round trip, then accelerating beyond. + */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { + bbr->debug.event = 'R'; /* reprobe */ + bbr2_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } + + if (bbr2_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ + bbr2_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ + if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ + return false; + + /* To be resilient to random loss, we must raise inflight_hi + * if we observe in any phase that a higher level is safe. + */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; + bbr->debug.event = 'U'; /* raise up inflight_hi */ + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. 
*/ +static bool bbr2_check_time_to_probe_bw(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe + * quickly (in case a burst of cross-traffic has ceased and freed up bw, + * or in case we are sharing with multiplicatively probing traffic). + */ + if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { + bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ + /* Calculate n so that when bbr2_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * + bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); + bbr2_start_bw_probe_refill(sk, n); + return true; + } + + if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || + bbr2_is_reno_coexistence_probe_time(sk)) { + bbr2_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ +static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_under_bdp, is_long_enough; + + /* Always need to pull inflight down to leave headroom in queue. */ + if (inflight > bbr2_inflight_with_headroom(sk)) + return false; + + is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); + if (bbr->params.drain_to_target) + return is_under_bdp; + + is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); + return is_under_bdp || is_long_enough; +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ +static void bbr2_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_risky = false, is_queuing = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ + if (bbr2_adapt_upper_bounds(sk, rs)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) + return; + + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); + bw = bbr_max_bw(sk); + + switch (bbr->cycle_idx) { + /* First we spend most of our time cruising with a pacing_gain of 1.0, + * which paces at the estimated bw, to try to fully use the pipe + * without building queue. If we encounter loss/ECN marks, we adapt + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + break; + + /* After cruising, when it's time to probe, we first "refill": we send + * at the estimated bw to fill the pipe, before probing higher and + * knowingly risking overflowing the bottleneck buffer (causing loss). + */ + case BBR_BW_PROBE_REFILL: + if (bbr->round_start) { + /* After one full round trip of sending in REFILL, we + * start to see bw samples reflecting our REFILL, which + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; + bbr2_start_bw_probe_up(sk); + } + break; + + /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to + * probe for bw. If we have not seen loss/ECN, we try to raise inflight + * to at least pacing_gain*BDP; note that this may take more than + * min_rtt if min_rtt is small (e.g. on a LAN). 
+ * + * We terminate PROBE_UP bandwidth probing upon any of the following: + * + * (1) We've pushed inflight up to hit the inflight_hi target set in the + * most recent previous bw probe phase. Thus we want to start + * draining the queue immediately because it's very likely the most + * recently sent packets will fill the queue and cause drops. + * (checked here) + * (2) We have probed for at least 1*min_rtt_us, and the + * estimated queue is high enough (inflight > 1.25 * estimated_bdp). + * (checked here) + * (3) Loss filter says loss rate is "too high". + * (checked in bbr_is_inflight_too_high()) + * (4) ECN filter says ECN mark rate is "too high". + * (checked in bbr_is_inflight_too_high()) + */ + case BBR_BW_PROBE_UP: + if (bbr->prev_probe_too_high && + inflight >= bbr->inflight_hi) { + bbr->stopped_risky_probe = 1; + is_risky = true; + bbr->debug.event = 'D'; /* D for danger */ + } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && + inflight >= + bbr_inflight(sk, bw, + bbr->params.bw_probe_pif_gain)) { + is_queuing = true; + bbr->debug.event = 'Q'; /* building Queue */ + } + if (is_risky || is_queuing) { + bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ + bbr2_start_bw_probe_down(sk); /* restart w/ down */ + } + break; + + /* After probing in PROBE_UP, we have usually accumulated some data in + * the bottleneck buffer (if bw probing didn't find more bw). We next + * enter PROBE_DOWN to try to drain any excess data from the queue. To + * do this, we use a pacing_gain < 1.0. We hold this pacing gain until + * our inflight is less then that target cruising point, which is the + * minimum of (a) the amount needed to leave headroom, and (b) the + * estimated BDP. Once inflight falls to match the target, we estimate + * the queue is drained; persisting would underutilize the pipe. + */ + case BBR_BW_PROBE_DOWN: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + if (bbr2_check_time_to_cruise(sk, inflight, bw)) + bbr2_start_bw_probe_cruise(sk); + break; + + default: + WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); + } +} + +/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ +static void bbr2_exit_probe_rtt(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr_full_bw_reached(sk)) { + bbr->mode = BBR_PROBE_BW; + /* Raising inflight after PROBE_RTT may cause loss, so reset + * the PROBE_BW clock and schedule the next bandwidth probe for + * a friendly and randomized future point in time. + */ + bbr2_start_bw_probe_down(sk); + /* Since we are exiting PROBE_RTT, we know inflight is + * below our estimated BDP, so it is reasonable to cruise. + */ + bbr2_start_bw_probe_cruise(sk); + } else { + bbr->mode = BBR_STARTUP; + } +} + +/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until + * the end of the round in recovery to get a good estimate of how many packets + * have been lost, and how many we need to drain with a low pacing rate. + */ +static void bbr2_check_loss_too_high_in_startup(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk)) + return; + + /* For STARTUP exit, check the loss rate at the end of each round trip + * of Recovery episodes in STARTUP. We check the loss rate at the end + * of the round trip to filter out noisy/low loss and have a better + * sense of inflight (extent of loss), so we can drain more accurately. 
+ */
+ if (rs->losses && bbr->loss_events_in_round < 0xf)
+ bbr->loss_events_in_round++; /* update saturating counter */
+ if (bbr->params.full_loss_cnt && bbr->loss_round_start &&
+ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery &&
+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt &&
+ bbr2_is_inflight_too_high(sk, rs)) {
+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */
+ bbr2_handle_queue_too_high_in_startup(sk);
+ return;
+ }
+ if (bbr->loss_round_start)
+ bbr->loss_events_in_round = 0;
+}
+
+/* If we are done draining, advance into steady state operation in PROBE_BW. */
+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs,
+ struct bbr_context *ctx)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr_check_drain(sk, rs, ctx)) {
+ bbr->mode = BBR_PROBE_BW;
+ bbr2_start_bw_probe_down(sk);
+ }
+}
+
+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs,
+ struct bbr_context *ctx)
+{
+ bbr2_update_congestion_signals(sk, rs, ctx);
+ bbr_update_ack_aggregation(sk, rs);
+ bbr2_check_loss_too_high_in_startup(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr2_check_drain(sk, rs, ctx);
+ bbr2_update_cycle_phase(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+/* Fast path for app-limited case.
+ *
+ * On each ack, we execute the bbr state machine, which primarily consists of:
+ * 1) update model based on new rate sample, and
+ * 2) update control based on updated model or state change.
+ *
+ * There are certain workloads/scenarios, e.g. the app-limited case, where
+ * either we can skip updating the model, or we can skip updating both the
+ * model and the control. This provides significant softirq cpu savings for
+ * processing incoming acks.
+ *
+ * In the app-limited case, if there is no congestion (loss/ecn) and
+ * the observed bw sample is less than the current estimated bw, then we can
+ * skip some of the computation in bbr state processing:
+ *
+ * - if there is no rtt/mode/phase change: in this case, since all the
+ * parameters of the network model are constant, we can skip the model
+ * update as well as the control update.
+ *
+ * - else we can skip the rest of the model update, but we still need to
+ * update the control to account for the new rtt/mode/phase.
+ *
+ * Returns whether we can take the fast path or not.
+ */
+static bool bbr2_fast_path(struct sock *sk, bool *update_model,
+ const struct rate_sample *rs, struct bbr_context *ctx)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 prev_min_rtt_us, prev_mode;
+
+ if (bbr->params.fast_path && bbr->try_fast_path &&
+ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) &&
+ !bbr->loss_in_round && !bbr->ecn_in_round) {
+ prev_mode = bbr->mode;
+ prev_min_rtt_us = bbr->min_rtt_us;
+ bbr2_check_drain(sk, rs, ctx);
+ bbr2_update_cycle_phase(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+
+ if (bbr->mode == prev_mode &&
+ bbr->min_rtt_us == prev_min_rtt_us &&
+ bbr->try_fast_path)
+ return true;
+
+ /* Skip model update, but control still needs to be updated */
+ *update_model = false;
+ }
+ return false;
+}
+
+static void bbr2_main(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ struct bbr_context ctx = { 0 };
+ bool update_model = true;
+ u32 bw;
+
+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */
+
+ bbr_update_round_start(sk, rs, &ctx);
+ if (bbr->round_start) {
+ bbr->rounds_since_probe =
+ min_t(s32, bbr->rounds_since_probe + 1, 0xFF);
+ bbr2_update_ecn_alpha(sk);
+ }
+
+ bbr->ecn_in_round |= rs->is_ece;
+ bbr_calculate_bw_sample(sk, rs, &ctx);
+
+ if (bbr2_fast_path(sk, &update_model, rs, &ctx))
+ goto out;
+
+ if (update_model)
+ bbr2_update_model(sk, rs, &ctx);
+
+ bbr_update_gains(sk);
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain,
+ tp->snd_cwnd, &ctx);
+ bbr2_bound_cwnd_for_inflight_model(sk);
+
+out:
+ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state;
+ bbr->loss_in_cycle |= rs->lost > 0;
+ bbr->ecn_in_cycle |= rs->delivered_ce > 0;
+
+ bbr_debug(sk, rs->acked_sacked, rs, &ctx);
+}
+
+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared
+ * down here, so that the algorithm functions that use the parameters must use
+ * the per-socket parameters; if they accidentally use the global version
+ * then there will be a compile error.
+ * TODO(ncardwell): move all per-socket parameters down to this section.
+ */
+
+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
+ * No loss response when 0. Max allowed value is 255.
+ */
+static u32 bbr_beta = BBR_UNIT * 30 / 100;
+
+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE.
+ * Max allowed value is 255.
+ */
+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */
+
+/* The initial value for the ecn_alpha state variable. Default and max
+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly
+ * to congestion if the bottleneck is congested when the flow starts up.
+ */
+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */
+
+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
+ * No ECN-based bounding when 0. Max allowed value is 255.
+ */
+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */
+
+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255.
+ */
+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */
+
+/* Max RTT (in usec) at which to use sender-side ECN logic.
+ * Disabled when 0 (ECN allowed at any RTT).
+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms.
+ */ +static u32 bbr_ecn_max_rtt_us = 5000; + +/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN + * clears then use a multiplicative increase to quickly reprobe bw by + * starting inflight probing at the given multiple of inflight_hi. + * Default for this experimental knob is 0 (disabled). + * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. + */ +static u32 bbr_ecn_reprobe_gain; + +/* Estimate bw probing has gone too far if loss rate exceeds this level. */ +static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ + +/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, + * and loss rate is higher than bbr_loss_thresh. + * Disabled if 0. Max allowed value is 15 (0xF). + */ +static u32 bbr_full_loss_cnt = 8; + +/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh + * meets this count. Max allowed value is 3. + */ +static u32 bbr_full_ecn_cnt = 2; + +/* Fraction of unutilized headroom to try to leave in path upon high loss. */ +static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; + +/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. + * Default is 1.25x, as in BBR v1. Max allowed is 511. + */ +static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; + +/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. + * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. + * Max allowed is 511. + */ +static u32 bbr_bw_probe_reno_gain = BBR_UNIT; + +/* Max number of packet-timed rounds to wait before probing for bandwidth. If + * we want to tolerate 1% random loss per round, and not have this cut our + * inflight too much, we must probe for bw periodically on roughly this scale. + * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. + * We aim to be fair with Reno/CUBIC up to a BDP of at least: + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + */ +static u32 bbr_bw_probe_max_rounds = 63; + +/* Max amount of randomness to inject in round counting for Reno-coexistence. + * Max value is 15. + */ +static u32 bbr_bw_probe_rand_rounds = 2; + +/* Use BBR-native probe time scale starting at this many usec. + * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: + * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs + */ +static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ + +/* Use BBR-native probes spread over this many usec: */ +static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ + +/* Undo the model changes made in loss recovery if recovery was spurious? */ +static bool bbr_undo = true; + +/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ +static bool bbr_fast_path = true; /* default: enabled */ + +/* Use fast ack mode ? */ +static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ + +/* How much to additively increase inflight_hi when entering REFILL? 
*/ +static u32 bbr_refill_add_inc; /* default: disabled */ + +module_param_named(beta, bbr_beta, uint, 0644); +module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); +module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); +module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); +module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); +module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); +module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); +module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); +module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); +module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); +module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); +module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); +module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); +module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); +module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); +module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); +module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); +module_param_named(undo, bbr_undo, bool, 0664); +module_param_named(fast_path, bbr_fast_path, bool, 0664); +module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); +module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); + +static void bbr2_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr_init(sk); /* run shared init code for v1 and v2 */ + + /* BBR v2 parameters: */ + bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); + bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); + bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); + bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); + bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); + bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); + bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); + bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); + bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); + bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); + bbr->params.inflight_headroom = + min_t(u32, 0xFFU, bbr_inflight_headroom); + bbr->params.bw_probe_pif_gain = + min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); + bbr->params.bw_probe_reno_gain = + min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); + bbr->params.bw_probe_max_rounds = + min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); + bbr->params.bw_probe_rand_rounds = + min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); + bbr->params.bw_probe_base_us = + min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); + bbr->params.bw_probe_rand_us = + min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); + bbr->params.undo = bbr_undo; + bbr->params.fast_path = bbr_fast_path ? 
1 : 0; + bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); + + /* BBR v2 state: */ + bbr->initialized = 1; + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; + bbr->undo_bw_lo = 0; + bbr->undo_inflight_lo = 0; + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; + bbr2_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; + bbr->probe_wait_us = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_INIT; + bbr->rounds_since_probe = 0; + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; + bbr->ecn_alpha = bbr->params.ecn_alpha_init; + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; + + tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); +} + +/* Core TCP stack informs us that the given skb was just marked lost. */ +static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct rate_sample rs; + + /* Capture "current" data over the full round trip of loss, + * to have a better chance to see the full capacity of the path. + */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ + if (unlikely(!scb->tx.delivered_mstamp)) + return; /* skb was SACKed, reneged, marked lost; ignore it */ + /* We are probing for bandwidth. Construct a rate sample that + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ + memset(&rs, 0, sizeof(rs)); + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.delivered_ce = tp->delivered_ce - scb->tx.delivered_ce; + rs.is_app_limited = scb->tx.is_app_limited; + if (bbr2_is_inflight_too_high(sk, &rs)) { + rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); + bbr2_handle_inflight_too_high(sk, &rs); + } +} + +/* Revert short-term model if current loss recovery event was spurious. */ +static u32 bbr2_undo_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->debug.undo = 1; + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr->loss_in_round = 0; + + if (!bbr->params.undo) + return tp->snd_cwnd; + + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. */ +static u32 bbr2_ssthresh(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. 
*/ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; +} + +static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: + return BBR2_PHASE_STARTUP; + case BBR_DRAIN: + return BBR2_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: + return BBR2_PHASE_PROBE_RTT; + default: + return BBR2_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: + return BBR2_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: + return BBR2_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: + return BBR2_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: + return BBR2_PHASE_PROBE_BW_REFILL; + default: + return BBR2_PHASE_INVALID; + } +} + +static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? + ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); + + memset(&info->bbr2, 0, sizeof(info->bbr2)); + info->bbr2.bbr_bw_lsb = (u32)bw; + info->bbr2.bbr_bw_msb = (u32)(bw >> 32); + info->bbr2.bbr_min_rtt = bbr->min_rtt_us; + info->bbr2.bbr_pacing_gain = bbr->pacing_gain; + info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; + info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; + info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); + info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; + info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); + info->bbr2.bbr_mode = bbr->mode; + info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); + info->bbr2.bbr_version = (__u8)2; + info->bbr2.bbr_inflight_lo = bbr->inflight_lo; + info->bbr2.bbr_inflight_hi = bbr->inflight_hi; + info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr2); + } + return 0; +} + +static void bbr2_set_state(struct sock *sk, u8 new_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + struct bbr_context ctx = { 0 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + WARN_ON_ONCE(bbr->prior_cwnd == 0); + WARN_ON_ONCE(bbr->prior_cwnd == ~0U); + bbr->inflight_lo = bbr->prior_cwnd; + } + bbr_debug(sk, 0, &rs, &ctx); + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { + WARN_ON_ONCE(bbr->prior_cwnd == 0); + WARN_ON_ONCE(bbr->prior_cwnd == ~0U); + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr->try_fast_path = 0; /* bound cwnd using latest model */ + } +} + +static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr2", + .owner = THIS_MODULE, + .init = bbr2_init, + .cong_control = bbr2_main, + .sndbuf_expand = bbr_sndbuf_expand, + .skb_marked_lost = bbr2_skb_marked_lost, + .undo_cwnd = bbr2_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr2_ssthresh, + .tso_segs = bbr_tso_segs, + .get_info = bbr2_get_info, + .set_state = bbr2_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return 
tcp_register_congestion_control(&tcp_bbr2_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); From 57d4f3170b5efb4830dacefa7b4e524d8cfb0314 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 21 Nov 2019 15:28:01 -0500 Subject: [PATCH 720/737] net-tcp_bbr: v2: remove unnecessary rs.delivered_ce logic upon loss There is no reason to compute rs.delivered_ce upon loss. In fact, we specifically do not want to compute rs.delivered_ce upon loss. Two issues: (1) This would be the wrong thing to do, in behavior terms. With RACK's dynamic reordering window, losses can be marked long after the sequence hole appears in the ACK/SACK stream. We want to to catch the ECN mark rate rising too high as quickly as possible, which means we want to check for high ECN mark rates at ACK time (as BBRv2 currently does) and not loss marking time. (2) This is dead code. The ECN mark rate cannot be detected as too high because the check needs rs->delivered to be > 0 as well: if (rs->delivered_ce > 0 && rs->delivered > 0 && Since we are not setting rs->delivered upon loss, this check cannot succeed, so setting delivered_ce is pointless. This dead and wrong line was discovered by Randall Stewart at Netflix as he was reading the BBRv2 code. Change-Id: I37f83f418a259ec31d8f82de986db071b364b76a --- net/ipv4/tcp_bbr2.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c index a6959b70e51d1..e00b47850dcef 100644 --- a/net/ipv4/tcp_bbr2.c +++ b/net/ipv4/tcp_bbr2.c @@ -2508,7 +2508,6 @@ static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) memset(&rs, 0, sizeof(rs)); rs.tx_in_flight = scb->tx.in_flight; rs.lost = tp->lost - scb->tx.lost; - rs.delivered_ce = tp->delivered_ce - scb->tx.delivered_ce; rs.is_app_limited = scb->tx.is_app_limited; if (bbr2_is_inflight_too_high(sk, &rs)) { rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); From 50fc64cdcb37e9ca3813301dae81feca932283a1 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 17 Aug 2020 19:08:41 -0400 Subject: [PATCH 721/737] net-tcp_bbr: v2: remove field bw_rtts that is unused in BBRv2 Change-Id: I58e3346c707748a6f316f3ed060d2da84c32a79b --- net/ipv4/tcp_bbr2.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c index e00b47850dcef..6121cd46f81a8 100644 --- a/net/ipv4/tcp_bbr2.c +++ b/net/ipv4/tcp_bbr2.c @@ -179,9 +179,8 @@ struct bbr { min_rtt_win_sec:5, /* max allowed value: 31 */ probe_rtt_mode_ms:9, /* max allowed value: 511 */ full_bw_cnt:3, /* max allowed value: 7 */ - bw_rtts:5, /* max allowed value: 31 */ cwnd_tso_budget:1, /* allowed values: {0, 1} */ - unused3:1, + unused3:6, drain_to_target:1, /* boolean */ precise_ece_ack:1, /* boolean */ extra_acked_in_startup:1, /* allowed values: {0, 1} */ @@ -237,8 +236,6 @@ struct bbr_context { u32 log:1; }; -/* Window length of bw filter (in rounds). Max allowed value is 31 (0x1F) */ -static int bbr_bw_rtts = CYCLE_LEN + 2; /* Window length of min_rtt filter (in sec). 
Max allowed value is 31 (0x1F) */ static u32 bbr_min_rtt_win_sec = 10; /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. @@ -390,7 +387,6 @@ static bool bbr_usage_based_cwnd; /* default: disabled */ */ static bool bbr_ecn_enable = false; -module_param_named(bw_rtts, bbr_bw_rtts, int, 0644); module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); module_param_named(high_gain, bbr_high_gain, int, 0644); @@ -1247,7 +1243,6 @@ static void bbr_init(struct sock *sk) bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); - bbr->params.bw_rtts = min(0x1F, bbr_bw_rtts); bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); From f2d4bc72289bd6adb04315a6cc93d4d1e82bae93 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 17 Aug 2020 19:10:21 -0400 Subject: [PATCH 722/737] net-tcp_bbr: v2: remove cycle_rand parameter that is unused in BBRv2 Change-Id: Iee1df7e41e42de199068d7c89131ed3d228327c0 --- net/ipv4/tcp_bbr2.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c index 6121cd46f81a8..57b4abebb275b 100644 --- a/net/ipv4/tcp_bbr2.c +++ b/net/ipv4/tcp_bbr2.c @@ -303,8 +303,6 @@ static int bbr_pacing_gain[] = { BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ }; -/* Randomize the starting gain cycling phase over N phases: */ -static u32 bbr_cycle_rand = 7; /* Try to keep at least this many packets in flight, if things go smoothly. For * smooth functioning, a sliding window protocol ACKing every other packet @@ -395,7 +393,6 @@ module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); module_param_array_named(pacing_gain, bbr_pacing_gain, int, &bbr_pacing_gain_size, 0644); -module_param_named(cycle_rand, bbr_cycle_rand, uint, 0644); module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); module_param_named(probe_rtt_cwnd_gain, bbr_probe_rtt_cwnd_gain, uint, 0664); From 96c78ac09a6ce8d450981faea060225e32ece977 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 28 Dec 2020 19:23:09 -0500 Subject: [PATCH 723/737] net-tcp_bbr: v2: don't assume prior_cwnd was set entering CA_Loss Fix WARN_ON_ONCE() warnings that were firing and pointing to a bbr->prior_cwnd of 0 when exiting CA_Loss and transitioning to CA_Open. The issue was that tcp_simple_retransmit() calls: tcp_set_ca_state(sk, TCP_CA_Loss); without first calling icsk_ca_ops->ssthresh(sk) (because tcp_simple_retransmit() is dealing with losses due to MTU issues and not congestion). The lack of this callback means that BBR did not get a chance to set bbr->prior_cwnd, and thus upon exiting CA_Loss in such cases the WARN_ON_ONCE() would fire due to a zero bbr->prior_cwnd. This commit removes that warning, since a bbr->prior_cwnd of 0 is a valid situation in this state transition. For setting inflight_lo upon entering CA_Loss, to avoid setting an inflight_lo of 0 in this case, this commit switches to taking the max of cwnd and prior_cwnd. We plan to remove that line of code when we switch to cautious (PRR-style) recovery, so that awkwardness will go away. 
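In sketch form, the resulting bound is just "take whichever of the two cwnd values is larger, and only while inflight_lo is still unset". A minimal standalone illustration of that logic (plain C with made-up values, not the kernel code itself):

#include <stdio.h>

typedef unsigned int u32;

#define INFLIGHT_UNSET (~0U)

/* Sketch of the bound applied when entering CA_Loss: only touch an
 * unset inflight_lo, and never let it collapse to 0 just because
 * prior_cwnd was never saved (tcp_simple_retransmit() enters CA_Loss
 * without invoking the ssthresh() callback).
 */
static u32 loss_inflight_lo(u32 inflight_lo, u32 snd_cwnd, u32 prior_cwnd)
{
	if (inflight_lo != INFLIGHT_UNSET)
		return inflight_lo;		/* already bounded */
	return snd_cwnd > prior_cwnd ? snd_cwnd : prior_cwnd;
}

int main(void)
{
	/* ssthresh() ran and prior_cwnd was saved: */
	printf("%u\n", loss_inflight_lo(INFLIGHT_UNSET, 10, 40));	/* 40 */
	/* MTU-issue loss path, prior_cwnd never set: */
	printf("%u\n", loss_inflight_lo(INFLIGHT_UNSET, 10, 0));	/* 10 */
	return 0;
}

With prior_cwnd saved the behavior is unchanged; when the ssthresh() callback was skipped, the bound falls back to the current cwnd instead of 0.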
Change-Id: I575dce871c2f20e91e3e9449e1706f42a07b8118 --- net/ipv4/tcp_bbr2.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c index 57b4abebb275b..5510adc92bbb4 100644 --- a/net/ipv4/tcp_bbr2.c +++ b/net/ipv4/tcp_bbr2.c @@ -2617,15 +2617,12 @@ static void bbr2_set_state(struct sock *sk, u8 new_state) /* bbr_adapt_lower_bounds() needs cwnd before * we suffered an RTO, to update inflight_lo: */ - WARN_ON_ONCE(bbr->prior_cwnd == 0); - WARN_ON_ONCE(bbr->prior_cwnd == ~0U); - bbr->inflight_lo = bbr->prior_cwnd; + bbr->inflight_lo = + max(tp->snd_cwnd, bbr->prior_cwnd); } bbr_debug(sk, 0, &rs, &ctx); } else if (bbr->prev_ca_state == TCP_CA_Loss && new_state != TCP_CA_Loss) { - WARN_ON_ONCE(bbr->prior_cwnd == 0); - WARN_ON_ONCE(bbr->prior_cwnd == ~0U); tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); bbr->try_fast_path = 0; /* bound cwnd using latest model */ } From 1f092750c3f4b43184c315a725a96d92b9aad86b Mon Sep 17 00:00:00 2001 From: Adithya Abraham Philip Date: Fri, 11 Jun 2021 21:56:10 +0000 Subject: [PATCH 724/737] net-tcp_bbr: v2: Fix missing ECT markings on retransmits for BBRv2 Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to indicate that retransmitted packets and pure ACKs must have the ECT bit set. This is a necessary fix for BBRv2, which when using ECN expects ECT to be set even on retransmitted packets and ACKs. Currently CCAs like BBRv2 which can use ECN but don't "need" it do not have a way to indicate that ECT should be set on retransmissions/ACKs. Signed-off-by: Adithya Abraham Philip Signed-off-by: Neal Cardwell --- include/net/tcp.h | 1 + net/ipv4/tcp_bbr2.c | 3 +++ net/ipv4/tcp_output.c | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 0840f9d432f0f..0cf7b271e9b76 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -357,6 +357,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 +#define TCP_ECN_ECT_PERMANENT 16 enum tcp_tw_status { TCP_TW_SUCCESS = 0, diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c index 5510adc92bbb4..fa49e17c47ca9 100644 --- a/net/ipv4/tcp_bbr2.c +++ b/net/ipv4/tcp_bbr2.c @@ -2471,6 +2471,9 @@ static void bbr2_init(struct sock *sk) bbr->alpha_last_delivered_ce = 0; tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); + + if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; } /* Core TCP stack informs us that the given skb was just marked lost. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 560abfad552ca..932d0f8c2f246 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else if (!tcp_ca_needs_ecn(sk)) { + } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && + !tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } From ae025410ec1d864edc1c83532b1330769b4bed56 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 20 Jul 2022 00:11:26 +0000 Subject: [PATCH 725/737] net-tcp_bbr: v2: add support for PLB in TCP and BBRv2 PLB (Protective Load Balancing) is a host based mechanism for load balancing across switch links. It leverages congestion signals(e.g. 
ECN) from transport layer to randomly change the path of the connection experiencing congestion. PLB changes the path of the connection by changing the outgoing IPv6 flow label for IPv6 connections (implemented in Linux by calling sk_rethink_txhash()). Because of this implementation mechanism, PLB can currently only work for IPv6 traffic. For more information, see the SIGCOMM 2022 paper: https://doi.org/10.1145/3544216.3544226 Congestion control algorithms track PLB state and cause the connection to trigger a path change when either of the 2 conditions is satisfied: - No packets are in flight and (# consecutive congested rounds >= sysctl_tcp_plb_idle_rehash_rounds) - (# consecutive congested rounds >= sysctl_tcp_plb_rehash_rounds) A round (RTT) is marked as congested when congestion signal (ECN ce_ratio) over an RTT is greater than sysctl_tcp_plb_cong_thresh. In the event of RTO, PLB (via tcp_write_timeout()) triggers a path change and disables congestion-triggered path changes for random time between (sysctl_tcp_plb_suspend_rto_sec, 2*sysctl_tcp_plb_suspend_rto_sec) to avoid hopping onto the "connectivity blackhole". RTO-triggered path changes can still happen during this cool-off period. Change-Id: I5d0fb3ab55b27b506b0cf32bc93df892b5336c2c [5.10: Context from extra defintions] --- Documentation/networking/ip-sysctl.rst | 58 ++++++++++++++ include/linux/tcp.h | 3 + include/net/inet_connection_sock.h | 2 +- include/net/netns/ipv4.h | 5 ++ include/net/tcp.h | 17 +++++ include/uapi/linux/snmp.h | 1 + net/ipv4/Makefile | 2 +- net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 43 +++++++++++ net/ipv4/tcp_bbr2.c | 28 +++++-- net/ipv4/tcp_ipv4.c | 7 ++ net/ipv4/tcp_plb.c | 100 +++++++++++++++++++++++++ 12 files changed, 260 insertions(+), 7 deletions(-) create mode 100644 net/ipv4/tcp_plb.c diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 252212998378e..8b3031e3ca174 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -949,6 +949,64 @@ tcp_rx_skb_cache - BOOLEAN Default: 0 (disabled) +tcp_plb_enabled - BOOLEAN + If set, TCP PLB (Protective Load Balancing) is enabled. PLB is + described in the following paper: + https://doi.org/10.1145/3544216.3544226. Based on PLB parameters, + upon sensing sustained congestion, TCP triggers a change in + flow label field for outgoing IPv6 packets. A change in flow label + field potentially changes the path of outgoing packets for switches + that use ECMP/WCMP for routing. + + Default: 0 + +tcp_plb_cong_thresh - INTEGER + Fraction of packets marked with congestion over a round (RTT) to + tag that round as congested. This is referred to as K in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + The 0-1 fraction range is mapped to 0-256 range to avoid floating + point operations. For example, 128 means that if at least 50% of + the packets in a round were marked as congested then the round + will be tagged as congested. + + Possible Values: 0 - 256 + + Default: 128 + +tcp_plb_idle_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a rehash can be performed, given there are no packets in flight. + This is referred to as M in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 3 + +tcp_plb_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a forced rehash can be performed. 
Be careful when setting this + parameter, as a small value increases the risk of retransmissions. + This is referred to as N in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 12 + +tcp_plb_suspend_rto_sec - INTEGER + Time, in seconds, to suspend PLB in event of an RTO. In order to avoid + having PLB repath onto a connectivity "black hole", after an RTO a TCP + connection suspends PLB repathing for a random duration between 1x and + 2x of this parameter. Randomness is added to avoid concurrent rehashing + of multiple TCP connections. This should be set corresponding to the + amount of time it takes to repair a failed link. + + Possible Values: 0 - 255 + + Default: 60 + UDP variables ============= diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1472d1c147d5c..f626afe60d8fd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -413,6 +413,9 @@ struct tcp_sock { */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; + +/* Rerouting information */ + u16 ecn_rehash; /* PLB triggered rehash attempts */ }; enum tsq_enum { diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 921b468969c69..2515ffe09e51c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -135,7 +135,7 @@ struct inet_connection_sock { u32 icsk_user_timeout; /* XXX inflated by temporary internal debugging info */ -#define ICSK_CA_PRIV_SIZE (216) +#define ICSK_CA_PRIV_SIZE (224) u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; }; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0bdfdb1ac8ac0..462b8e96dd9c3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -184,6 +184,11 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + u8 sysctl_tcp_plb_enabled; + int sysctl_tcp_plb_cong_thresh; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0cf7b271e9b76..b2abfa98ec1bd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2153,6 +2153,23 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +#define TCP_PLB_SCALE 8 /* scaling factor for fractions in PLB (e.g. ce_ratio) */ + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + enabled:1, /* Check if PLB is enabled */ + unused:2; + u32 pause_until; /* jiffies32 when PLB can resume repathing */ +}; + +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + /* At how many usecs into the future should the RTO fire? 
*/ static inline s64 tcp_rto_delta_us(const struct sock *sk) { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4a16fbe247596..232961aecdeb4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -291,6 +291,7 @@ enum LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ + LINUX_MIB_TCPECNREHASH, /* TCPECNRehash */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 8c5779dba462a..babfdbf6c15b9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index c1dbc41088a33..dfa9cbe280951 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,6 +296,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPECNRehash", LINUX_MIB_TCPECNREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2afa5434c0421..a7f8a94da69fc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -50,6 +50,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1309,6 +1311,47 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, { } }; diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c index fa49e17c47ca9..09319695da824 100644 --- a/net/ipv4/tcp_bbr2.c +++ b/net/ipv4/tcp_bbr2.c @@ -167,6 +167,7 @@ 
struct bbr { initialized:1; /* has bbr_init() been called? */ u32 alpha_last_delivered; /* tp->delivered at alpha update */ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ + struct tcp_plb_state plb; /* Params configurable using setsockopt. Refer to correspoding * module param for detailed description of params. @@ -733,7 +734,11 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - if (event == CA_EVENT_TX_START && tp->app_limited) { + if (event == CA_EVENT_TX_START) { + tcp_plb_check_rehash(sk, &bbr->plb); + + if (!tp->app_limited) + return; bbr->idle_restart = 1; bbr->ack_epoch_mstamp = tp->tcp_mstamp; bbr->ack_epoch_acked = 0; @@ -1389,7 +1394,7 @@ static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) } } -static void bbr2_update_ecn_alpha(struct sock *sk) +static int bbr2_update_ecn_alpha(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -1398,14 +1403,14 @@ static void bbr2_update_ecn_alpha(struct sock *sk) u32 gain; if (bbr->params.ecn_factor == 0) - return; + return -1; delivered = tp->delivered - bbr->alpha_last_delivered; delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; if (delivered == 0 || /* avoid divide by zero */ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ - return; + return -1; /* See if we should use ECN sender logic for this connection. */ if (!bbr->ecn_eligible && bbr_ecn_enable && @@ -1424,6 +1429,7 @@ static void bbr2_update_ecn_alpha(struct sock *sk) bbr->alpha_last_delivered_ce = tp->delivered_ce; bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); + return (int)ce_ratio; } /* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. 
*/ @@ -2238,6 +2244,7 @@ static void bbr2_main(struct sock *sk, const struct rate_sample *rs) struct bbr_context ctx = { 0 }; bool update_model = true; u32 bw; + int ce_ratio = -1; bbr->debug.event = '.'; /* init to default NOP (no event yet) */ @@ -2245,7 +2252,9 @@ static void bbr2_main(struct sock *sk, const struct rate_sample *rs) if (bbr->round_start) { bbr->rounds_since_probe = min_t(s32, bbr->rounds_since_probe + 1, 0xFF); - bbr2_update_ecn_alpha(sk); + ce_ratio = bbr2_update_ecn_alpha(sk); + tcp_plb_update_state(sk, &bbr->plb, ce_ratio); + tcp_plb_check_rehash(sk, &bbr->plb); } bbr->ecn_in_round |= rs->is_ece; @@ -2408,6 +2417,7 @@ static void bbr2_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); + const struct net *net = sock_net(sk); bbr_init(sk); /* run shared init code for v1 and v2 */ @@ -2470,6 +2480,13 @@ static void bbr2_init(struct sock *sk) bbr->alpha_last_delivered = 0; bbr->alpha_last_delivered_ce = 0; + bbr->plb.enabled = 0; + bbr->plb.consec_cong_rounds = 0; + bbr->plb.pause_until = 0; + if ((tp->ecn_flags & TCP_ECN_OK) && + net->ipv4.sysctl_tcp_plb_enabled) + bbr->plb.enabled = 1; + tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) @@ -2614,6 +2631,7 @@ static void bbr2_set_state(struct sock *sk, u8 new_state) struct rate_sample rs = { .losses = 1 }; struct bbr_context ctx = { 0 }; + tcp_plb_update_state_upon_rto(sk, &bbr->plb); bbr->prev_ca_state = TCP_CA_Loss; bbr->full_bw = 0; if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 33d89d89baeec..c3948ea605621 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2929,6 +2929,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_cong_thresh = 128; /* 50% congestion */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 0000000000000..71b02c0404cea --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,100 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall,Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam Netherlands. + * + */ + +#include + +/* Called once per round-trip to update PLB state for a connection. 
*/ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!plb->enabled) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < net->ipv4.sysctl_tcp_plb_cong_thresh) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + net->ipv4.sysctl_tcp_plb_rehash_rounds) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. + */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + bool can_idle_rehash, can_force_rehash; + + if (!plb->enabled) + return; + + /* Note that tcp_jiffies32 can wrap, so we clear pause_until + * to 0 to indicate there is no recent RTO event that constrains + * PLB rehashing. + */ + if (plb->pause_until && + !before(tcp_jiffies32, plb->pause_until)) + plb->pause_until = 0; + + can_idle_rehash = net->ipv4.sysctl_tcp_plb_idle_rehash_rounds && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds; + can_force_rehash = plb->consec_cong_rounds >= + net->ipv4.sysctl_tcp_plb_rehash_rounds; + + if (!plb->pause_until && (can_idle_rehash || can_force_rehash)) { + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; + tcp_sk(sk)->ecn_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPECNREHASH); + } +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!plb->enabled) + return; + + pause = net->ipv4.sysctl_tcp_plb_suspend_rto_sec * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. 
+ */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); From d2964f72224f819e2d5f64388734a330b7c18562 Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Tue, 14 Feb 2023 18:46:17 -0800 Subject: [PATCH 726/737] net/ipv6: Improve performance of inet6_ehashfn() Signed-off-by: Samuel Mendoza-Jonas --- net/ipv6/inet6_hashtables.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b4a5e01e12016..4a6aad4b46fca 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -35,8 +35,14 @@ u32 inet6_ehashfn(const struct net *net, net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; - fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + lhash = jhash_3words((__force u32)laddr->s6_addr32[3], + (((u32)lport) << 16) | (__force u32)fport, + (__force u32)faddr->s6_addr32[0], + ipv6_hash_secret); + fhash = jhash_3words((__force u32)faddr->s6_addr32[1], + (__force u32)faddr->s6_addr32[2], + (__force u32)faddr->s6_addr32[3], + ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); From 26bf604cd077b96f790c66fcdd3e0eed10515f57 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 May 2023 11:21:01 +0800 Subject: [PATCH 727/737] mm/damon/core: fix divide error in damon_nr_accesses_to_accesses_bp() commit 5ff6e2fff88ef9bf110c5e85a48e7b557bfc64c1 upstream. If 'aggr_interval' is smaller than 'sample_interval', max_nr_accesses in damon_nr_accesses_to_accesses_bp() becomes zero which leads to divide error, let's validate the values of them in damon_set_attrs() to fix it, which similar to others attrs check. Link: https://lkml.kernel.org/r/20230527032101.167788-1-wangkefeng.wang@huawei.com Fixes: 2f5bef5a590b ("mm/damon/core: update monitoring results for new monitoring attributes") Reported-by: syzbot+841a46899768ec7bec67@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=841a46899768ec7bec67 Link: https://lore.kernel.org/damon/00000000000055fc4e05fc975bc2@google.com/ Reviewed-by: SeongJae Park Signed-off-by: Kefeng Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/damon/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index d9ef62047bf5f..91cff7f2997ef 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -551,6 +551,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) return -EINVAL; if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; + if (attrs->sample_interval > attrs->aggr_interval) + return -EINVAL; damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; From 27d8e5b55010cfb68c3f21ec8d8e4720f3249725 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 5 Jul 2023 17:00:09 +0200 Subject: [PATCH 728/737] netfilter: nf_tables: drop map element references from preparation phase [ 628bd3e49cba1c066228e23d71a852c23e26da73 ] set .destroy callback releases the references to other objects in maps. This is very late and it results in spurious EBUSY errors. Drop refcount from the preparation phase instead, update set backend not to drop reference counter from set .destroy path. 
Exceptions: NFT_TRANS_PREPARE_ERROR does not require to drop the reference counter because the transaction abort path releases the map references for each element since the set is unbound. The abort path also deals with releasing reference counter for new elements added to unbound sets. Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 19653b8784bbc..bbdb1371d0c01 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3306,6 +3306,36 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; + } + + return 0; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); From eb4d4753729fb2d8ca55872ae23d7f37ed0bcdab Mon Sep 17 00:00:00 2001 From: David de Bruyn Date: Thu, 9 Feb 2023 09:19:09 +0000 Subject: [PATCH 729/737] Enable ptIOMMU for all supported platforms. Currently ptIOMMU functionality is only enabled for ARM64. This patch removes the architecture specific code to enable the ptIOMMU and replaces it with an achitecture-independent strategy. 
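The strategy is deliberately small: after the generic DMA configuration path has run, install the page-touching ops only when nothing else (IOMMU, Xen swiotlb, etc.) has claimed the device. A standalone sketch of that "fallback only if unset" pattern, using illustrative stand-in types rather than the real struct device / dma_map_ops:

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins for struct device / dma_map_ops. */
struct dma_ops { const char *name; };
struct device  { const struct dma_ops *dma_ops; };

static const struct dma_ops iommu_ops         = { "iommu" };
static const struct dma_ops page_touching_ops = { "page-touching" };

/* Fallback: only takes effect when no other ops were installed,
 * mirroring the "if (!dev->dma_ops)" guard added in this patch.
 */
static void setup_page_touching_fallback(struct device *dev)
{
	if (!dev->dma_ops)
		dev->dma_ops = &page_touching_ops;
}

int main(void)
{
	struct device with_iommu = { &iommu_ops };
	struct device plain      = { NULL };

	setup_page_touching_fallback(&with_iommu);	/* left untouched */
	setup_page_touching_fallback(&plain);		/* gets the fallback */

	printf("%s / %s\n", with_iommu.dma_ops->name, plain.dma_ops->name);
	return 0;
}

Hooking this guard at the ACPI and OF DMA-configuration entry points (rather than only in arm64's arch_setup_dma_ops()) means every platform that describes its devices through either firmware interface gets the same behavior.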
Signed-off-by: David de Bruyn --- arch/arm64/mm/dma-mapping.c | 6 ------ drivers/acpi/scan.c | 6 ++++++ drivers/of/device.c | 6 ++++++ include/linux/dma-map-ops.h | 5 +++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index e0f70ae4d34ed..93e87b2875567 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -57,9 +56,4 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, if (xen_initial_domain()) dev->dma_ops = &xen_swiotlb_dma_ops; #endif - -#ifdef CONFIG_DMA_PAGE_TOUCHING - if (!dev->dma_ops) - setup_dma_page_touching_ops(dev); -#endif } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 67a5ee2fedfd3..7dfb66ad1f907 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1512,6 +1513,11 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, arch_setup_dma_ops(dev, dma_addr, size, iommu, attr == DEV_DMA_COHERENT); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(acpi_dma_configure_id); diff --git a/drivers/of/device.c b/drivers/of/device.c index 3a547793135c3..d21653a74e57e 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -7,6 +7,7 @@ #include #include /* for bus_dma_region */ #include +#include #include #include #include @@ -186,6 +187,11 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif + return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df1..e7c4233ea044f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -7,6 +7,7 @@ #define _LINUX_DMA_MAP_OPS_H #include +#include #include struct cma; @@ -321,6 +322,10 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { +#ifdef CONFIG_DMA_PAGE_TOUCHING + if (!dev->dma_ops) + setup_dma_page_touching_ops(dev); +#endif } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ From e4f98cd7fe4f60fb2b3c5616f88f5d957f3268cb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Jul 2023 18:49:29 +0000 Subject: [PATCH 730/737] ip: Bump default ttl to 127. In 4.14 and 5.4, the default TTL was bumped up to 255, but we moved the change to sysctl-defaults package. However, sysctl config is not applied to netns. Let's bump it again but to 127 as some nodes could block packets with TTL 255. 
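One quick way to observe what a namespace actually hands out is to read IP_TTL back from an unconfigured IPv4 socket: on Linux this reports the per-netns net.ipv4.ip_default_ttl, whose compiled-in default is IPDEFTTL. A small userspace check (assumes a Linux host):

#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Print the TTL a fresh IPv4 socket would use; with no explicit
 * setsockopt(IP_TTL), the kernel reports the netns default.
 */
int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int ttl = 0;
	socklen_t len = sizeof(ttl);

	if (fd < 0 || getsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, &len) < 0) {
		perror("getsockopt(IP_TTL)");
		return 1;
	}
	printf("default ttl: %d\n", ttl);
	close(fd);
	return 0;
}

In a namespace that never had the sysctl-defaults override applied, this should now print 127 instead of 64.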
Signed-off-by: Kuniyuki Iwashima --- include/uapi/linux/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 860bbf6bf29cb..a9b626f014e78 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -67,7 +67,7 @@ #define IPVERSION 4 #define MAXTTL 255 -#define IPDEFTTL 64 +#define IPDEFTTL 127 #define IPOPT_OPTVAL 0 #define IPOPT_OLEN 1 From fb6e1b1f2f68c7b4caa73c2511e65cabd38f0efa Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 2 Jun 2023 10:29:47 +0100 Subject: [PATCH 731/737] mm/damon/ops-common: atomically test and clear young on ptes and pmds It is racy to non-atomically read a pte, then clear the young bit, then write it back as this could discard dirty information. Further, it is bad practice to directly set a pte entry within a table. Instead clearing young must go through the arch-provided helper, ptep_test_and_clear_young() to ensure it is modified atomically and to give the arch code visibility and allow it to check (and potentially modify) the operation. Link: https://lkml.kernel.org/r/20230602092949.545577-3-ryan.roberts@arm.com Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces"). Signed-off-by: Ryan Roberts Reviewed-by: Zi Yan Reviewed-by: SeongJae Park Reviewed-by: Mike Rapoport (IBM) Cc: Christoph Hellwig Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 16 ++++++---------- mm/damon/ops-common.h | 4 ++-- mm/damon/paddr.c | 4 ++-- mm/damon/vaddr.c | 4 ++-- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 75409601f9349..13b99975cbc2c 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -33,7 +33,7 @@ struct page *damon_get_page(unsigned long pfn) return page; } -void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { bool referenced = false; struct page *page = damon_get_page(pte_pfn(*pte)); @@ -41,13 +41,11 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) if (!page) return; - if (pte_young(*pte)) { + if (ptep_test_and_clear_young(vma, addr, pte)) referenced = true; - *pte = pte_mkold(*pte); - } #ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE)) referenced = true; #endif /* CONFIG_MMU_NOTIFIER */ @@ -58,7 +56,7 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) put_page(page); } -void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool referenced = false; @@ -67,13 +65,11 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) if (!page) return; - if (pmd_young(*pmd)) { + if (pmdp_test_and_clear_young(vma, addr, pmd)) referenced = true; - *pmd = pmd_mkold(*pmd); - } #ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE)) + if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE)) referenced = true; #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 8d82d37222042..e062a8874e411 100644 --- a/mm/damon/ops-common.h +++ 
b/mm/damon/ops-common.h @@ -9,8 +9,8 @@ struct page *damon_get_page(unsigned long pfn); -void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); -void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr); int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index f4bfa0634cc38..b6f5171dc2ccb 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -28,9 +28,9 @@ static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) - damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr); + damon_ptep_mkold(pvmw.pte, vma, addr); else - damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr); + damon_pmdp_mkold(pvmw.pmd, vma, addr); } return true; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 87d7d314b4351..c245310cfb6dd 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -310,7 +310,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, } if (pmd_trans_huge(*pmd)) { - damon_pmdp_mkold(pmd, walk->mm, addr); + damon_pmdp_mkold(pmd, walk->vma, addr); spin_unlock(ptl); return 0; } @@ -322,7 +322,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte_present(*pte)) goto out; - damon_ptep_mkold(pte, walk->mm, addr); + damon_ptep_mkold(pte, walk->vma, addr); out: pte_unmap_unlock(pte, ptl); return 0; From d62f961329d7fa6fd66e05e9ace5072184b795bb Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Tue, 8 Aug 2023 10:54:29 +0000 Subject: [PATCH 732/737] AL2 5.10 Update ena driver to 2.8.9g Singed-off-by: David Arinzon Signed-off-by: SeongJae Park --- drivers/amazon/net/ena/ena_admin_defs.h | 8 +++++++ drivers/amazon/net/ena/ena_devlink.c | 24 +++++++++---------- drivers/amazon/net/ena/ena_netdev.c | 14 ++++++++--- drivers/amazon/net/ena/ena_netdev.h | 3 ++- drivers/amazon/net/ena/ena_phc.c | 5 +++- drivers/amazon/net/ena/kcompat.h | 31 +++++++++++++++++++------ 6 files changed, 61 insertions(+), 24 deletions(-) diff --git a/drivers/amazon/net/ena/ena_admin_defs.h b/drivers/amazon/net/ena/ena_admin_defs.h index b3a9f1aec52b3..f34b44a6fa230 100644 --- a/drivers/amazon/net/ena/ena_admin_defs.h +++ b/drivers/amazon/net/ena/ena_admin_defs.h @@ -435,6 +435,10 @@ struct ena_admin_basic_stats { u32 tx_drops_low; u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; }; /* ENI Statistics Command. 
*/ @@ -1223,6 +1227,10 @@ struct ena_admin_aenq_keep_alive_desc { u32 tx_drops_low; u32 tx_drops_high; + + u32 rx_overruns_low; + + u32 rx_overruns_high; }; struct ena_admin_ena_mmio_req_read_less_resp { diff --git a/drivers/amazon/net/ena/ena_devlink.c b/drivers/amazon/net/ena/ena_devlink.c index f140d024ef166..43ce1ae2cebaa 100644 --- a/drivers/amazon/net/ena/ena_devlink.c +++ b/drivers/amazon/net/ena/ena_devlink.c @@ -108,9 +108,9 @@ void ena_devlink_params_get(struct devlink *devlink) if (!ena_is_devlink_params_registered(devlink)) return; #endif - err = devlink_param_driverinit_value_get(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - &val); + err = devl_param_driverinit_value_get(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + &val); if (err) { netdev_err(adapter->netdev, "Failed to query LLQ header size param\n"); return; @@ -119,7 +119,7 @@ void ena_devlink_params_get(struct devlink *devlink) adapter->large_llq_header_enabled = val.vbool; #ifdef ENA_PHC_SUPPORT - err = devlink_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); + err = devl_param_driverinit_value_get(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, &val); if (err) { netdev_err(adapter->netdev, "Failed to query PHC param\n"); return; @@ -140,9 +140,9 @@ void ena_devlink_disable_large_llq_header_param(struct devlink *devlink) #endif value.vbool = false; - devlink_param_driverinit_value_set(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - value); + devl_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); } #ifdef ENA_PHC_SUPPORT @@ -157,7 +157,7 @@ void ena_devlink_disable_phc_param(struct devlink *devlink) #endif value.vbool = false; - devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); } #endif /* ENA_PHC_SUPPORT */ @@ -277,13 +277,13 @@ static int ena_devlink_configure_params(struct devlink *devlink) } value.vbool = adapter->large_llq_header_enabled; - devlink_param_driverinit_value_set(devlink, - ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, - value); + devl_param_driverinit_value_set(devlink, + ENA_DEVLINK_PARAM_ID_LLQ_HEADER_SIZE, + value); #ifdef ENA_PHC_SUPPORT value.vbool = ena_phc_is_enabled(adapter); - devlink_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); + devl_param_driverinit_value_set(devlink, ENA_DEVLINK_PARAM_ID_PHC_ENABLE, value); #endif /* ENA_PHC_SUPPORT */ #ifdef ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED diff --git a/drivers/amazon/net/ena/ena_netdev.c b/drivers/amazon/net/ena/ena_netdev.c index 072be72e14d56..759926e8f8716 100644 --- a/drivers/amazon/net/ena/ena_netdev.c +++ b/drivers/amazon/net/ena/ena_netdev.c @@ -46,6 +46,8 @@ MODULE_VERSION(DRV_MODULE_GENERATION); #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | \ NETIF_MSG_IFDOWN | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) + +#define ENA_HIGH_LOW_TO_U64(high, low) ((((u64)(high)) << 32) | (low)) #ifndef ENA_LINEAR_FRAG_SUPPORTED #define ENA_SKB_PULL_MIN_LEN 64 @@ -3222,6 +3224,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, struct ena_ring *rx_ring, *tx_ring; u64 xdp_rx_drops = 0; unsigned int start; + u64 rx_overruns; u64 rx_drops; u64 tx_drops; int i; @@ -3268,6 +3271,7 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, start = ena_u64_stats_fetch_begin(&adapter->syncp); rx_drops = adapter->dev_stats.rx_drops; tx_drops = adapter->dev_stats.tx_drops; + 
rx_overruns = adapter->dev_stats.rx_overruns; } while (ena_u64_stats_fetch_retry(&adapter->syncp, start)); stats->rx_dropped = rx_drops + xdp_rx_drops; @@ -3282,8 +3286,9 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev, stats->rx_fifo_errors = 0; stats->rx_missed_errors = 0; stats->tx_window_errors = 0; + stats->rx_over_errors = rx_overruns; - stats->rx_errors = 0; + stats->rx_errors = stats->rx_over_errors; stats->tx_errors = 0; #ifndef NDO_GET_STATS_64_V2 return stats; @@ -4987,14 +4992,16 @@ static void ena_keep_alive_wd(void *adapter_data, { struct ena_adapter *adapter = (struct ena_adapter *)adapter_data; struct ena_admin_aenq_keep_alive_desc *desc; + u64 rx_overruns; u64 rx_drops; u64 tx_drops; desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e; adapter->last_keep_alive_jiffies = jiffies; - rx_drops = ((u64)desc->rx_drops_high << 32) | desc->rx_drops_low; - tx_drops = ((u64)desc->tx_drops_high << 32) | desc->tx_drops_low; + rx_drops = ENA_HIGH_LOW_TO_U64(desc->rx_drops_high, desc->rx_drops_low); + tx_drops = ENA_HIGH_LOW_TO_U64(desc->tx_drops_high, desc->tx_drops_low); + rx_overruns = ENA_HIGH_LOW_TO_U64(desc->rx_overruns_high, desc->rx_overruns_low); u64_stats_update_begin(&adapter->syncp); /* These stats are accumulated by the device, so the counters indicate @@ -5002,6 +5009,7 @@ static void ena_keep_alive_wd(void *adapter_data, */ adapter->dev_stats.rx_drops = rx_drops; adapter->dev_stats.tx_drops = tx_drops; + adapter->dev_stats.rx_overruns = rx_overruns; u64_stats_update_end(&adapter->syncp); } diff --git a/drivers/amazon/net/ena/ena_netdev.h b/drivers/amazon/net/ena/ena_netdev.h index 92e03d79971f7..97bdd08853400 100644 --- a/drivers/amazon/net/ena/ena_netdev.h +++ b/drivers/amazon/net/ena/ena_netdev.h @@ -29,7 +29,7 @@ #define DRV_MODULE_GEN_MAJOR 2 #define DRV_MODULE_GEN_MINOR 8 -#define DRV_MODULE_GEN_SUBMINOR 6 +#define DRV_MODULE_GEN_SUBMINOR 9 #define DRV_MODULE_NAME "ena" #ifndef DRV_MODULE_GENERATION @@ -361,6 +361,7 @@ struct ena_stats_dev { u64 admin_q_pause; u64 rx_drops; u64 tx_drops; + u64 rx_overruns; u64 reset_fail; }; diff --git a/drivers/amazon/net/ena/ena_phc.c b/drivers/amazon/net/ena/ena_phc.c index 8b89ae9efb4ec..5b637ef79bc04 100644 --- a/drivers/amazon/net/ena/ena_phc.c +++ b/drivers/amazon/net/ena/ena_phc.c @@ -7,11 +7,12 @@ #include "ena_phc.h" #ifdef ENA_PHC_SUPPORT - +#ifdef ENA_PHC_SUPPORT_ADJFREQ static int ena_phc_adjfreq(struct ptp_clock_info *clock_info, s32 ppb) { return -EOPNOTSUPP; } +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ static int ena_phc_adjtime(struct ptp_clock_info *clock_info, s64 delta) { @@ -109,7 +110,9 @@ static struct ptp_clock_info ena_ptp_clock_info = { .n_ext_ts = 0, .n_per_out = 0, .pps = 0, +#ifdef ENA_PHC_SUPPORT_ADJFREQ .adjfreq = ena_phc_adjfreq, +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ .adjtime = ena_phc_adjtime, #ifdef ENA_PHC_SUPPORT_GETTIME64 #ifdef ENA_PHC_SUPPORT_GETTIME64_EXTENDED diff --git a/drivers/amazon/net/ena/kcompat.h b/drivers/amazon/net/ena/kcompat.h index 1b3e7edf570b0..62ddd400e787f 100644 --- a/drivers/amazon/net/ena/kcompat.h +++ b/drivers/amazon/net/ena/kcompat.h @@ -790,9 +790,9 @@ do { \ #define ENA_DEVLINK_RECEIVES_DEVICE_ON_ALLOC #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) || \ +#if (KERNEL_VERSION(5, 16, 0) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0)) || \ (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) || \ - (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + 
(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0) && !(defined(FEDORA_RELEASE))) #define ENA_DEVLINK_RELOAD_SUPPORT_ADVERTISEMENT_NEEDED #endif @@ -866,8 +866,8 @@ static inline void netdev_rss_key_fill(void *buffer, size_t len) * UEK ***********|--------------|--------|------| */ #if (defined(IS_UEK) && !ENA_KERNEL_VERSION_GTE(4, 1, 12, 124, 43, 1)) || \ - (defined(ubuntu)) || \ - (!defined(IS_UEK) && !defined(ubuntu) && \ + (defined(UBUNTU_VERSION_CODE)) || \ + (!defined(IS_UEK) && !defined(UBUNTU_VERSION_CODE) && \ !(KERNEL_VERSION(4, 4, 216) <= LINUX_VERSION_CODE && LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) static inline int page_ref_count(struct page *page) { @@ -967,7 +967,11 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, #define ENA_XDP_XMIT_FREES_FAILED_DESCS_INTERNALLY #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 188) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0)) && \ + !(LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 251) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(5, 5, 0))) && \ !(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 6)) && \ !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 4)) && \ !(defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL == 3) && \ @@ -989,7 +993,8 @@ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) || \ (defined(RHEL_RELEASE_CODE) && \ RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) && \ - RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) + RHEL_RELEASE_CODE != RHEL_RELEASE_VERSION(9, 0)) || \ + (defined(SUSE_VERSION) && (SUSE_VERSION == 15 && SUSE_PATCHLEVEL >= 5)) #define ENA_ETHTOOL_RX_BUFF_SIZE_CHANGE #endif @@ -1062,6 +1067,10 @@ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) #define ENA_PHC_SUPPORT_GETTIME64_EXTENDED #endif /* ENA_PHC_SUPPORT_GETTIME64_EXTENDED */ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(6, 2, 0))) +#define ENA_PHC_SUPPORT_ADJFREQ +#endif /* ENA_PHC_SUPPORT_ADJFREQ */ + #if ((LINUX_VERSION_CODE < KERNEL_VERSION(3, 7, 0)) && \ !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6, 4)))) #define ptp_clock_register(info, parent) ptp_clock_register(info) @@ -1093,7 +1102,10 @@ static inline void ena_netif_napi_add(struct net_device *dev, int (*poll)(struct napi_struct *, int)) { #if (LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0)) && \ - !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) + !(RHEL_RELEASE_CODE && \ + ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 8)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 0))) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 2))) #ifndef NAPI_POLL_WEIGHT #define NAPI_POLL_WEIGHT 64 #endif @@ -1103,6 +1115,11 @@ static inline void ena_netif_napi_add(struct net_device *dev, #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) */ } +#if defined(ENA_DEVLINK_SUPPORT) && LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0) +#define devl_param_driverinit_value_get devlink_param_driverinit_value_get +#define devl_param_driverinit_value_set devlink_param_driverinit_value_set +#endif + #if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7, 4))) || \ (defined(UBUNTU_VERSION_CODE) && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4, 5, 0, 0)) || \ (defined(IS_UEK) && 
!ENA_KERNEL_VERSION_GTE(4, 1, 12, 105, 0, 0)) From d0790574bb629bd098c7831d8d144bcd0b1409a4 Mon Sep 17 00:00:00 2001 From: Hailey Mothershead Date: Tue, 15 Aug 2023 16:50:39 +0000 Subject: [PATCH 733/737] Revert "x86/xen: Fix secondary processors' FPU initialization" This reverts commit 583016037a092e4189c86bad7946c6d88669b4ca since it is the fix of reverted commit "x86/fpu: Move FPU initialization into arch_cpu_finalize_init()". --- arch/x86/xen/smp_pv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 755e939db3ed3..64873937cd1d7 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -64,7 +63,6 @@ static void cpu_bringup(void) cr4_init(); cpu_init(); - fpu__init_cpu(); touch_softlockup_watchdog(); preempt_disable(); From 1f4994e7a4a294085c826b0bc74bb0f4a916dadb Mon Sep 17 00:00:00 2001 From: Hailey Mothershead Date: Tue, 15 Aug 2023 16:51:25 +0000 Subject: [PATCH 734/737] Revert "x86/fpu: Move FPU initialization into arch_cpu_finalize_init()" This reverts commit 4ae1cbb730bd574d57d3996d4c20974972d47009. We found this commit would cause warnings during FPU initilization in all EC2 Nitro instances: [ 0.832860] ------------[ cut here ]------------ [ 0.832860] get of unsupported state [ 0.832860] WARNING: CPU: 0 PID: 0 at arch/x86/kernel/fpu/xstate.c:879 get_xsave_addr+0x81/0x90 [ 0.832860] Modules linked in: [ 0.832860] Hardware name: Amazon EC2 c5.large/, BIOS 1.0 10/16/2017 [ 0.832860] RIP: 0010:get_xsave_addr+0x81/0x90 [ 0.832860] Code: 5b c3 48 83 c4 08 31 c0 5b c3 80 3d 7c ea 78 01 00 75 c1 48 c7 c7 1c a6 03 82 89 4c 24 04 c6 05 68 ea 78 01 01 e8 4f 4c 05 00 <0f> 0b 48 63 4c 24 04 eb a1 31 c0 c3 0f 1f 00 0f 1f 44 00 00 41 54 [ 0.832860] RSP: 0000:ffffffff82603ed0 EFLAGS: 00010282 [ 0.832860] RAX: 0000000000000000 RBX: ffffffff827ebfc0 RCX: 0000000031a47119 [ 0.832860] RDX: 0000000000000018 RSI: ffffffff839eb960 RDI: ffffffff839e952c [ 0.832860] RBP: ffffffff827ebe80 R08: 7520666f20746567 R09: 74726f707075736e [ 0.832860] R10: 00000000000962fc R11: 6574617473206465 R12: ffffffff82d8bb60 [ 0.832860] R13: 0000000000000246 R14: 0000000000000000 R15: 0000000000000000 [ 0.832860] FS: 0000000000000000(0000) GS:ffff88812ca00000(0000) knlGS:0000000000000000 [ 0.832860] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.832860] CR2: ffff888130dff000 CR3: 000000000260a001 CR4: 00000000007200b0 [ 0.832860] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 0.832860] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 0.832860] Call Trace: [ 0.832860] ? __warn+0x85/0xcc [ 0.832860] ? get_xsave_addr+0x81/0x90 [ 0.832860] ? report_bug+0xb6/0x130 [ 0.832860] ? get_xsave_addr+0x81/0x90 [ 0.832860] ? fixup_bug.part.12+0x18/0x30 [ 0.832860] ? do_error_trap+0x95/0xb0 [ 0.832860] ? do_invalid_op+0x36/0x40 [ 0.832860] ? get_xsave_addr+0x81/0x90 [ 0.832860] ? invalid_op+0x1e/0x30 [ 0.832860] ? get_xsave_addr+0x81/0x90 [ 0.832860] identify_cpu+0x422/0x510 [ 0.832860] identify_boot_cpu+0xc/0x94 [ 0.832860] arch_cpu_finalize_init+0x5/0x4a [ 0.832860] start_kernel+0x461/0x50a [ 0.832860] secondary_startup_64+0xa4/0xb0 [ 0.832860] ---[ end trace 943bbb1c8b55b8c0 ]--- Given this commit is not a functional change, revert it for clean boot. 
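For reference, the initialization order this revert restores can be summarized with the stand-alone sketch below. This is an illustrative sketch only, not kernel code: the stub bodies, the puts() messages, and main() are placeholders I added for clarity, while the function names (early_identify_cpu, cpu_init, identify_cpu, arch_cpu_finalize_init, fpu__init_system, fpu__init_cpu) and their call sites are taken from the hunks that follow; the boot-time ordering shown is approximate.

/* Illustrative sketch only (not kernel code): stubs model where the FPU
 * init calls live after this revert, per the hunks below. */
#include <stdio.h>

static void fpu__init_system(void) { puts("fpu__init_system(): system-wide FPU/xstate setup"); }
static void fpu__init_cpu(void)    { puts("fpu__init_cpu(): per-CPU FPU enablement"); }

static void early_identify_cpu(void)
{
	/* after the revert: system-wide FPU setup happens here, early in boot */
	fpu__init_system();
}

static void cpu_init(void)
{
	/* after the revert: each CPU enables its FPU here, so cpu_init_secondary()
	 * no longer needs its own fpu__init_cpu() call */
	fpu__init_cpu();
}

static void identify_cpu(void)
{
	/* xstate accessors such as get_xsave_addr() now run against an
	 * already-initialized FPU, avoiding the warning shown above */
}

static void arch_cpu_finalize_init(void)
{
	/* no longer touches the FPU at all */
}

int main(void)
{
	/* rough boot-time order on the boot CPU */
	early_identify_cpu();
	cpu_init();
	identify_cpu();
	arch_cpu_finalize_init();
	return 0;
}

The point the sketch captures is that both FPU initialization calls again run before identify_cpu(), which is why the reverted ordering no longer trips the get_xsave_addr() warning in the trace above.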
--- arch/x86/kernel/cpu/common.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 13e217b01cbca..ad85ac6c83297 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1443,6 +1443,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_core_cap_bits(c); + fpu__init_system(); + #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -2139,6 +2141,8 @@ void cpu_init(void) doublefault_init_cpu_tss(); + fpu__init_cpu(); + if (is_uv_system()) uv_cpu_init(); @@ -2154,7 +2158,6 @@ void cpu_init_secondary(void) */ cpu_init_exception_handling(); cpu_init(); - fpu__init_cpu(); } #endif @@ -2249,13 +2252,6 @@ void __init arch_cpu_finalize_init(void) '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); } - /* - * Must be before alternatives because it might set or clear - * feature bits. - */ - fpu__init_system(); - fpu__init_cpu(); - alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { From 577b42b96822767e2cde046ebc0c6e08d2b79d7a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 29 Jul 2023 20:37:32 +0000 Subject: [PATCH 735/737] mm/damon/core: initialize damo_filter->list from damos_new_filter() commit 5f1fc67f2cb8d3035d3acd273b48b97835af8afd upstream. damos_new_filter() is not initializing the list field of newly allocated filter object. However, DAMON sysfs interface and DAMON_RECLAIM are not initializing it after calling damos_new_filter(). As a result, accessing uninitialized memory is possible. Actually, adding multiple DAMOS filters via DAMON sysfs interface caused NULL pointer dereferencing. Initialize the field just after the allocation from damos_new_filter(). Link: https://lkml.kernel.org/r/20230729203733.38949-2-sj@kernel.org Fixes: 98def236f63c ("mm/damon/core: implement damos filter") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/damon/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 91cff7f2997ef..eb9580942a5c3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -273,6 +273,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, return NULL; filter->type = type; filter->matching = matching; + INIT_LIST_HEAD(&filter->list); return filter; } From 4fa0da2d24ec853974b2d1b3fe0c763cc2f59f48 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 8 Aug 2023 07:03:15 +0000 Subject: [PATCH 736/737] fix soft lockup issue when reading empty file --- fs/btrfs/file.c | 3 ++- fs/btrfs/super.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 416a1b753ff62..d3da1748f93a7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3596,7 +3596,8 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } - return generic_file_buffered_read(iocb, to, ret); +// return generic_file_buffered_read(iocb, to, ret); + return generic_file_read_iter(iocb, to); } const struct file_operations btrfs_file_operations = { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b33505330e335..3b164d9663959 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2495,7 +2495,7 @@ static void __init btrfs_print_mod_info(void) ", ref-verify=on" #endif ; - pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + pr_info("Btrfs loaded[soft lockup fix 08 Aug], crc32c=%s%s\n", crc32c_impl(), options); } static int __init 
init_btrfs_fs(void) From baec88df5c3ab3c8076e96c2c9ab9465c321b18e Mon Sep 17 00:00:00 2001 From: vewe-richard Date: Sun, 20 Aug 2023 07:51:21 +0000 Subject: [PATCH 737/737] revert log message before creating a pull request --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3b164d9663959..b33505330e335 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2495,7 +2495,7 @@ static void __init btrfs_print_mod_info(void) ", ref-verify=on" #endif ; - pr_info("Btrfs loaded[soft lockup fix 08 Aug], crc32c=%s%s\n", crc32c_impl(), options); + pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); } static int __init init_btrfs_fs(void)